diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ff0e6a90e571e23ab899b057a1e7141c24547d9..15f95d2e580c94ec4592b6c9c6bfa5cf16de136c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,6 +187,11 @@ option(LLVM_INSTALL_UTILS "Include utility binaries in the 'install' target." OF
 
 option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF)
 
+# Unfortunatly Clang is too eager to search directories for module maps, which can cause the
+# installed version of the maps to be found when building LLVM from source. Therefore we turn off
+# the installation by default. See llvm.org/PR31905.
+option(LLVM_INSTALL_MODULEMAPS "Install the modulemap files in the 'install' target." OFF)
+
 option(LLVM_USE_FOLDERS "Enable solution folders in Visual Studio. Disable for Express versions." ON)
 if ( LLVM_USE_FOLDERS )
   set_property(GLOBAL PROPERTY USE_FOLDERS ON)
@@ -365,6 +370,10 @@ option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF)
 option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON)
 option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF)
 
+if (MSVC)
+   option(LLVM_ENABLE_INCREMENTAL_LINK "Link incrementally. Enabling it might produce slower executables." OFF)
+endif()
+
 option(LLVM_ENABLE_DUMP "Enable dump functions even when assertions are disabled" OFF)
 
 if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
@@ -399,8 +408,8 @@ option(LLVM_USE_OPROFILE
 option(LLVM_EXTERNALIZE_DEBUGINFO
   "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF)
 
-option(LLVM_CODESIGNING_IDENTITY
-  "Sign executables and dylibs with the given identity (Darwin Only)" OFF)
+set(LLVM_CODESIGNING_IDENTITY "" CACHE STRING
+  "Sign executables and dylibs with the given identity or skip if empty (Darwin Only)")
 
 # If enabled, verify we are on a platform that supports oprofile.
 if( LLVM_USE_OPROFILE )
@@ -973,6 +982,20 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
     PATTERN ".svn" EXCLUDE
     )
 
+  if (LLVM_INSTALL_MODULEMAPS)
+    install(DIRECTORY include/llvm include/llvm-c
+            DESTINATION include
+            COMPONENT llvm-headers
+            FILES_MATCHING
+            PATTERN "module.modulemap"
+            )
+    install(FILES include/llvm/module.install.modulemap
+            DESTINATION include/llvm
+            COMPONENT llvm-headers
+            RENAME "module.extern.modulemap"
+            )
+  endif(LLVM_INSTALL_MODULEMAPS)
+
   # Installing the headers needs to depend on generating any public
   # tablegen'd headers.
   add_custom_target(llvm-headers DEPENDS intrinsics_gen)
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index 6a0dd0cbadc330bc7ed2cd36a045ab065f616cc9..a6e10fc0e27039fe3147abd5b3655b751e2df5be 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -71,7 +71,7 @@ D: Loop Strength Reduction, Register allocators
 N: Andrea Di Biagio
 E: andrea.dibiagio@sony.com
 E: andrea.dibiagio@gmail.com
-D: llvm-mca
+D: MCA, llvm-mca
 
 N: Duncan P. N. Exon Smith
 E: dexonsmith@apple.com
@@ -222,3 +222,7 @@ D: C API, OCaml bindings
 N: Jake Ehrlich
 E: jakehehrlich@google.com
 D: llvm-objcopy (tools/llvm-objcopy)
+
+N: Martin Storsjö
+E: martin@martin.st
+D: MinGW
diff --git a/CREDITS.TXT b/CREDITS.TXT
index e279701f57d90bb182b24dfb48d88c4b64443843..0a8154c8842f0830901f1a54b78fb4604bc0f56a 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -510,3 +510,7 @@ D: PowerPC Backend Developer
 N: Zixuan Wu
 E: wuzish@cn.ibm.com
 D: PowerPC Backend Developer
+
+N: Kang Zhang
+E: shkzhang@cn.ibm.com
+D: PowerPC Backend Developer
diff --git a/RELEASE_TESTERS.TXT b/RELEASE_TESTERS.TXT
index 462941b35b0e48b088daffe04986a7a5a0c40340..2e5ac1b9078db259ee530a3442aa20a23617f6f1 100644
--- a/RELEASE_TESTERS.TXT
+++ b/RELEASE_TESTERS.TXT
@@ -13,8 +13,8 @@ O: Ubuntu
 
 N: Sylvestre Ledru
 E: sylvestre@debian.org
-T: x86
-O: Debian
+T: All supported archs Debian/Ubuntu
+O: Debian/Ubuntu packages
 
 N: Nikola Smiljanic
 E: popizdeh@gmail.com
@@ -41,7 +41,7 @@ E: hans@chromium.org
 T: x86
 O: Windows
 
-N: Diana Picus
-E: diana.picus@linaro.org
+N: Diana Picus, Yvan Roux
+E: diana.picus@linaro.org, yvan.roux@linaro.org
 T: ARM, AArch64
 O: Linux
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 18977d9950ff84ed01288fe92633e55efb71cad4..900c35ee4f0c9bb01c39097e8ba4b013b0c73505 100644
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -7,6 +7,7 @@ include(CheckIncludeFile)
 include(CheckLibraryExists)
 include(CheckSymbolExists)
 include(CheckFunctionExists)
+include(CheckStructHasMember)
 include(CheckCCompilerFlag)
 
 include(CheckCompilerVersion)
@@ -128,7 +129,7 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*")
     endif()
     if(LLVM_ENABLE_TERMINFO)
       set(HAVE_TERMINFO 0)
-      foreach(library tinfo terminfo curses ncurses ncursesw)
+      foreach(library terminfo tinfo curses ncurses ncursesw)
         string(TOUPPER ${library} library_suffix)
         check_library_exists(${library} setupterm "" HAVE_TERMINFO_${library_suffix})
         if(HAVE_TERMINFO_${library_suffix})
@@ -248,6 +249,11 @@ if( HAVE_DLFCN_H )
   endif()
 endif()
 
+CHECK_STRUCT_HAS_MEMBER("struct stat" st_mtimespec.tv_nsec
+    "sys/types.h;sys/stat.h" HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC)
+CHECK_STRUCT_HAS_MEMBER("struct stat" st_mtim.tv_nsec
+    "sys/types.h;sys/stat.h" HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC)
+
 check_symbol_exists(__GLIBC__ stdio.h LLVM_USING_GLIBC)
 if( LLVM_USING_GLIBC )
   add_definitions( -D_GNU_SOURCE )
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 189971655583a40532272d608d807212198b14be..c5aa961a292b4a2b8ba83c986aa51c78451ec5b3 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -73,7 +73,7 @@ function(add_llvm_symbol_exports target_name export_file)
       VERBATIM
       COMMENT "Creating export file for ${target_name}")
     set_property(TARGET ${target_name} APPEND_STRING PROPERTY
-                 LINK_FLAGS " -Wl,-exported_symbols_list,${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}")
+                 LINK_FLAGS " -Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}\"")
   elseif(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
     set_property(TARGET ${target_name} APPEND_STRING PROPERTY
                  LINK_FLAGS " -Wl,-bE:${export_file}")
@@ -93,10 +93,10 @@ function(add_llvm_symbol_exports target_name export_file)
       COMMENT "Creating export file for ${target_name}")
     if (${LLVM_LINKER_IS_SOLARISLD})
       set_property(TARGET ${target_name} APPEND_STRING PROPERTY
-                   LINK_FLAGS "  -Wl,-M,${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}")
+                   LINK_FLAGS "  -Wl,-M,\"${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}\"")
     else()
       set_property(TARGET ${target_name} APPEND_STRING PROPERTY
-                   LINK_FLAGS "  -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}")
+                   LINK_FLAGS "  -Wl,--version-script,\"${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}\"")
     endif()
   else()
     set(native_export_file "${target_name}.def")
@@ -372,12 +372,14 @@ endfunction(set_windows_version_resource_properties)
 #     May specify header files for IDE generators.
 #   SONAME
 #     Should set SONAME link flags and create symlinks
+#   NO_INSTALL_RPATH
+#     Suppress default RPATH settings in shared libraries.
 #   PLUGIN_TOOL
 #     The tool (i.e. cmake target) that this plugin will link against
 #   )
 function(llvm_add_library name)
   cmake_parse_arguments(ARG
-    "MODULE;SHARED;STATIC;OBJECT;DISABLE_LLVM_LINK_LLVM_DYLIB;SONAME"
+    "MODULE;SHARED;STATIC;OBJECT;DISABLE_LLVM_LINK_LLVM_DYLIB;SONAME;NO_INSTALL_RPATH"
     "OUTPUT_NAME;PLUGIN_TOOL"
     "ADDITIONAL_HEADERS;DEPENDS;LINK_COMPONENTS;LINK_LIBS;OBJLIBS"
     ${ARGN})
@@ -448,17 +450,19 @@ function(llvm_add_library name)
 
   if(ARG_MODULE)
     add_library(${name} MODULE ${ALL_FILES})
-    llvm_setup_rpath(${name})
   elseif(ARG_SHARED)
     add_windows_version_resource_file(ALL_FILES ${ALL_FILES})
     add_library(${name} SHARED ${ALL_FILES})
-
-    llvm_setup_rpath(${name})
-
   else()
     add_library(${name} STATIC ${ALL_FILES})
   endif()
 
+  if(NOT ARG_NO_INSTALL_RPATH)
+    if(ARG_MODULE OR ARG_SHARED)
+      llvm_setup_rpath(${name})
+    endif()
+  endif()
+
   setup_dependency_debugging(${name} ${LLVM_COMMON_DEPENDS})
 
   if(DEFINED windows_resource_file)
@@ -708,7 +712,12 @@ endmacro(add_llvm_loadable_module name)
 
 
 macro(add_llvm_executable name)
-  cmake_parse_arguments(ARG "DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO;NO_INSTALL_RPATH" "" "DEPENDS" ${ARGN})
+  cmake_parse_arguments(ARG
+    "DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO;NO_INSTALL_RPATH"
+    "ENTITLEMENTS"
+    "DEPENDS"
+    ${ARGN})
+
   llvm_process_sources( ALL_FILES ${ARG_UNPARSED_ARGUMENTS} )
 
   list(APPEND LLVM_COMMON_DEPENDS ${ARG_DEPENDS})
@@ -787,7 +796,7 @@ macro(add_llvm_executable name)
     target_link_libraries(${name} PRIVATE ${LLVM_PTHREAD_LIB})
   endif()
 
-  llvm_codesign(${name})
+  llvm_codesign(${name} ENTITLEMENTS ${ARG_ENTITLEMENTS})
 endmacro(add_llvm_executable name)
 
 function(export_executable_symbols target)
@@ -973,47 +982,50 @@ endfunction(canonicalize_tool_name)
 # Custom add_subdirectory wrapper
 # Takes in a project name (i.e. LLVM), the subdirectory name, and an optional
 # path if it differs from the name.
-macro(add_llvm_subdirectory project type name)
+function(add_llvm_subdirectory project type name)
   set(add_llvm_external_dir "${ARGN}")
   if("${add_llvm_external_dir}" STREQUAL "")
     set(add_llvm_external_dir ${name})
   endif()
   canonicalize_tool_name(${name} nameUPPER)
+  set(canonical_full_name ${project}_${type}_${nameUPPER})
+  get_property(already_processed GLOBAL PROPERTY ${canonical_full_name}_PROCESSED)
+  if(already_processed)
+    return()
+  endif()
+  set_property(GLOBAL PROPERTY ${canonical_full_name}_PROCESSED YES)
+
   if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}/CMakeLists.txt)
     # Treat it as in-tree subproject.
-    option(${project}_${type}_${nameUPPER}_BUILD
+    option(${canonical_full_name}_BUILD
            "Whether to build ${name} as part of ${project}" On)
     mark_as_advanced(${project}_${type}_${name}_BUILD)
-    if(${project}_${type}_${nameUPPER}_BUILD)
+    if(${canonical_full_name}_BUILD)
       add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir} ${add_llvm_external_dir})
-      # Don't process it in add_llvm_implicit_projects().
-      set(${project}_${type}_${nameUPPER}_BUILD OFF)
     endif()
   else()
     set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR
       "${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}"
       CACHE PATH "Path to ${name} source directory")
-    set(${project}_${type}_${nameUPPER}_BUILD_DEFAULT ON)
+    set(${canonical_full_name}_BUILD_DEFAULT ON)
     if(NOT LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR OR NOT EXISTS ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR})
-      set(${project}_${type}_${nameUPPER}_BUILD_DEFAULT OFF)
+      set(${canonical_full_name}_BUILD_DEFAULT OFF)
     endif()
     if("${LLVM_EXTERNAL_${nameUPPER}_BUILD}" STREQUAL "OFF")
-      set(${project}_${type}_${nameUPPER}_BUILD_DEFAULT OFF)
+      set(${canonical_full_name}_BUILD_DEFAULT OFF)
     endif()
-    option(${project}_${type}_${nameUPPER}_BUILD
+    option(${canonical_full_name}_BUILD
       "Whether to build ${name} as part of LLVM"
-      ${${project}_${type}_${nameUPPER}_BUILD_DEFAULT})
-    if (${project}_${type}_${nameUPPER}_BUILD)
+      ${${canonical_full_name}_BUILD_DEFAULT})
+    if (${canonical_full_name}_BUILD)
       if(EXISTS ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR})
         add_subdirectory(${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR} ${add_llvm_external_dir})
       elseif(NOT "${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}" STREQUAL "")
         message(WARNING "Nonexistent directory for ${name}: ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}")
       endif()
-      # FIXME: It'd be redundant.
-      set(${project}_${type}_${nameUPPER}_BUILD Off)
     endif()
   endif()
-endmacro()
+endfunction()
 
 # Add external project that may want to be built as part of llvm such as Clang,
 # lld, and Polly. This adds two options. One for the source directory of the
@@ -1369,16 +1381,6 @@ function(add_lit_target target comment)
     message(STATUS "${target} does nothing.")
   endif()
 
-  # Add lit test dependencies.
-  set(llvm_utils_deps
-    FileCheck count not
-  )
-  foreach(dep ${llvm_utils_deps})
-    if (TARGET ${dep})
-      add_dependencies(${target} ${dep})
-    endif()
-  endforeach()
-
   if (ARG_DEPENDS)
     add_dependencies(${target} ${ARG_DEPENDS})
   endif()
@@ -1602,6 +1604,13 @@ function(llvm_externalize_debuginfo name)
     endif()
   endif()
 
+  if(LLVM_EXTERNALIZE_DEBUGINFO_OUTPUT_DIR)
+    if(APPLE)
+      set(output_name "$<TARGET_FILE_NAME:${name}>.dSYM")
+      set(output_path "-o=${LLVM_EXTERNALIZE_DEBUGINFO_OUTPUT_DIR}/${output_name}")
+    endif()
+  endif()
+
   if(APPLE)
     if(CMAKE_CXX_FLAGS MATCHES "-flto"
       OR CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE} MATCHES "-flto")
@@ -1614,7 +1623,7 @@ function(llvm_externalize_debuginfo name)
       set(CMAKE_DSYMUTIL xcrun dsymutil)
     endif()
     add_custom_command(TARGET ${name} POST_BUILD
-      COMMAND ${CMAKE_DSYMUTIL} $<TARGET_FILE:${name}>
+      COMMAND ${CMAKE_DSYMUTIL} ${output_path} $<TARGET_FILE:${name}>
       ${strip_command}
       )
   else()
@@ -1626,7 +1635,10 @@ function(llvm_externalize_debuginfo name)
   endif()
 endfunction()
 
+# Usage: llvm_codesign(name [ENTITLEMENTS file])
 function(llvm_codesign name)
+  cmake_parse_arguments(ARG "" "ENTITLEMENTS" "" ${ARGN})
+
   if(NOT LLVM_CODESIGNING_IDENTITY)
     return()
   endif()
@@ -1642,12 +1654,21 @@ function(llvm_codesign name)
         OUTPUT_VARIABLE CMAKE_CODESIGN_ALLOCATE
       )
     endif()
+    if(DEFINED ARG_ENTITLEMENTS)
+      set(pass_entitlements --entitlements ${ARG_ENTITLEMENTS})
+    endif()
+    if(CMAKE_GENERATOR STREQUAL "Xcode")
+      # Avoid double-signing error: Since output overwrites input, Xcode runs
+      # the post-build rule even if the actual build-step was skipped.
+      set(pass_force --force)
+    endif()
+
     add_custom_command(
       TARGET ${name} POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E
               env CODESIGN_ALLOCATE=${CMAKE_CODESIGN_ALLOCATE}
               ${CMAKE_CODESIGN} -s ${LLVM_CODESIGNING_IDENTITY}
-              $<TARGET_FILE:${name}>
+              ${pass_entitlements} ${pass_force} $<TARGET_FILE:${name}>
     )
   endif()
 endfunction()
diff --git a/cmake/modules/ChooseMSVCCRT.cmake b/cmake/modules/ChooseMSVCCRT.cmake
index 0e6e1aa55254e51480ba489e54c545c7a6accc25..cdd7deec4c84ce1790efb5c80432b73ff327e91b 100644
--- a/cmake/modules/ChooseMSVCCRT.cmake
+++ b/cmake/modules/ChooseMSVCCRT.cmake
@@ -66,6 +66,15 @@ variables (LLVM_USE_CRT_DEBUG, etc) instead.")
       get_current_crt(LLVM_USE_CRT_${build}
         MSVC_CRT_REGEX
         CMAKE_CXX_FLAGS_${build})
+
+      # Make /MT the default in Release builds to make them faster
+      # and avoid the DLL function thunking.
+      if ((${build} STREQUAL "MINSIZEREL") OR
+          (${build} STREQUAL "RELEASE") OR
+          (${build} STREQUAL "RELWITHDEBINFO"))
+          set(LLVM_USE_CRT_${build} "MT")
+      endif()
+
       set(LLVM_USE_CRT_${build}
         "${LLVM_USE_CRT_${build}}"
         CACHE STRING "Specify VC++ CRT to use for ${build_type} configurations."
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index b590f768244540ffc696a12f73944dadc275fec8..e8b0e27e000d2414a19ca2f481dfb23cae0e02b4 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -138,7 +138,8 @@ endif()
 # build might work on ELF but fail on MachO/COFF.
 if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32 OR CYGWIN OR
         ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR
-        ${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD") AND
+	${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD" OR
+	${CMAKE_SYSTEM_NAME} MATCHES "DragonFly") AND
    NOT LLVM_USE_SANITIZER)
   set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,defs")
 endif()
@@ -380,6 +381,14 @@ if( MSVC )
   # "Enforce type conversion rules".
   append("/Zc:rvalueCast" CMAKE_CXX_FLAGS)
 
+  if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND NOT LLVM_ENABLE_INCREMENTAL_LINK)
+    foreach(CONFIG RELEASE RELWITHDEBINFO MINSIZEREL)
+      foreach(FLAG EXE MODULE SHARED STATIC)
+        string(REGEX REPLACE "[-/](INCREMENTAL:YES|INCREMENTAL:NO|INCREMENTAL)" "/INCREMENTAL:NO" CMAKE_${FLAG}_LINKER_FLAGS_${CONFIG} "${CMAKE_${FLAG}_LINKER_FLAGS_${CONFIG}}")
+      endforeach()
+    endforeach()
+  endif()
+
   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT LLVM_ENABLE_LTO)
     # clang-cl and cl by default produce non-deterministic binaries because
     # link.exe /incremental requires a timestamp in the .obj file.  clang-cl
@@ -512,6 +521,10 @@ if (MSVC)
           # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2.
       -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation)
       -wd4319 # Suppress ''operator' : zero extending 'type' to 'type' of greater size'
+          # C4709 is disabled because of a bug with Visual Studio 2017 as of
+          # v15.8.8. Re-evaluate the usefulness of this diagnostic when the bug
+          # is fixed.
+      -wd4709 # Suppress comma operator within array index expression
 
       # Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't
       # support the 'aligned' attribute in the way that clang sources requires (for
diff --git a/cmake/modules/LLVMExternalProjectUtils.cmake b/cmake/modules/LLVMExternalProjectUtils.cmake
index f736c3947843fd54479c1cc501064e05b294df04..4d26a30f97becc8c58ce80d5b2374f10b4bbcc40 100644
--- a/cmake/modules/LLVMExternalProjectUtils.cmake
+++ b/cmake/modules/LLVMExternalProjectUtils.cmake
@@ -162,8 +162,35 @@ function(llvm_ExternalProject_Add name source_dir)
                       -DCMAKE_OBJDUMP=${CMAKE_OBJDUMP}
                       -DCMAKE_STRIP=${CMAKE_STRIP})
     set(llvm_config_path ${LLVM_CONFIG_PATH})
+
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" CLANG_VERSION
+             ${PACKAGE_VERSION})
+      set(resource_dir "${LLVM_LIBRARY_DIR}/clang/${CLANG_VERSION}")
+      set(flag_types ASM C CXX MODULE_LINKER SHARED_LINKER EXE_LINKER)
+      foreach(type ${flag_types})
+        set(${type}_flag -DCMAKE_${type}_FLAGS=-resource-dir=${resource_dir})
+      endforeach()
+      string(REPLACE ";" "|" flag_string "${flag_types}")
+      foreach(arg ${ARG_CMAKE_ARGS})
+        if(arg MATCHES "^-DCMAKE_(${flag_string})_FLAGS")
+          foreach(type ${flag_types})
+            if(arg MATCHES "^-DCMAKE_${type}_FLAGS")
+              string(REGEX REPLACE "^-DCMAKE_${type}_FLAGS=(.*)$" "\\1" flag_value "${arg}")
+              set(${type}_flag "${${type}_flag} ${flag_value}")
+            endif()
+          endforeach()
+        else()
+          list(APPEND cmake_args ${arg})
+        endif()
+      endforeach()
+      foreach(type ${flag_types})
+        list(APPEND cmake_args ${${type}_flag})
+      endforeach()
+    endif()
   else()
     set(llvm_config_path "$<TARGET_FILE:llvm-config>")
+    set(cmake_args ${ARG_CMAKE_ARGS})
   endif()
 
   ExternalProject_Add(${name}
@@ -187,7 +214,7 @@ function(llvm_ExternalProject_Add name source_dir)
                -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
                -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
                -DCMAKE_EXPORT_COMPILE_COMMANDS=1
-               ${ARG_CMAKE_ARGS}
+               ${cmake_args}
                ${PASSTHROUGH_VARIABLES}
     INSTALL_COMMAND ""
     STEP_TARGETS configure build
diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake
index d1afcb42f9de7de87e421fb8b5265e2e25e39372..2d942435222ee472d75b4e535c147ae4510e255b 100644
--- a/cmake/modules/TableGen.cmake
+++ b/cmake/modules/TableGen.cmake
@@ -158,6 +158,12 @@ macro(add_tablegen target project)
       llvm_ExternalProject_BuildCmd(tblgen_build_cmd ${target}
                                     ${LLVM_NATIVE_BUILD}
                                     CONFIGURATION Release)
+      # Create an artificial dependency between tablegen projects, because they
+      # compile the same dependencies, thus using the same build folders.
+      # FIXME: A proper fix requires sequentially chaining tablegens.
+      if (NOT ${project} STREQUAL LLVM AND TARGET ${project}-tablegen-host)
+        add_dependencies(${project}-tablegen-host LLVM-tablegen-host)
+      endif()
       add_custom_command(OUTPUT ${${project}_TABLEGEN_EXE}
         COMMAND ${tblgen_build_cmd}
         DEPENDS CONFIGURE_LLVM_NATIVE ${target}
diff --git a/docs/AMDGPU/AMDGPUAsmGFX7.rst b/docs/AMDGPU/AMDGPUAsmGFX7.rst
new file mode 100644
index 0000000000000000000000000000000000000000..58c13b5556b436293f03db6b4c76a20e65ea5e4b
--- /dev/null
+++ b/docs/AMDGPU/AMDGPUAsmGFX7.rst
@@ -0,0 +1,1411 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+============================
+Syntax of GFX7 Instructions
+============================
+
+.. contents::
+  :local:
+
+Notation
+========
+
+Notation used in this document is explained :ref:`here<amdgpu_syn_instruction_notation>`.
+
+Introduction
+============
+
+An overview of generic syntax and other features of AMDGPU instructions may be found :ref:`in this document<amdgpu_syn_instructions>`.
+
+Instructions
+============
+
+
+DS
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**         **SRC0**      **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    ds_add_rtn_u32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_rtn_u64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_src2_u32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_src2_u64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_u32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_u64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_b32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_b64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_rtn_b32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_rtn_b64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_src2_b32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_src2_b64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_append                      :ref:`vdst<amdgpu_synid7_vdst32_0>`                                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_b32                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_b64                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_f32                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_f64                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_b32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_b64               :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_f64               :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_condxchg32_rtn_b64          :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_consume                     :ref:`vdst<amdgpu_synid7_vdst32_0>`                                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_rtn_u32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_rtn_u64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_src2_u32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_src2_u64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_u32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_u64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_barrier                             :ref:`vdata<amdgpu_synid7_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_init                                :ref:`vdata<amdgpu_synid7_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_br                             :ref:`vdata<amdgpu_synid7_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_p                                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_release_all                                                       :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_v                                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_rtn_u32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_rtn_u64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_src2_u32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_src2_u64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_u32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_u64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_f32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_f64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_i32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_i64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_f32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_f64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_i32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_i64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_u32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_u64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_f32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_f64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_i32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_i64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_u32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_u64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_u32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_u64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_f32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_f64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_i32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_i64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_f32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_f64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_i32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_i64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_u32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_u64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_f32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_f64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_i32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_i64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_u32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_u64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_u32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_u64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_b32                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_b64                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_rtn_b32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_rtn_b64               :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_nop
+    ds_or_b32                                  :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_b64                                  :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_rtn_b32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_rtn_b64                  :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_src2_b32                             :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_src2_b64                             :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_ordered_count               :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2_b32                   :ref:`vdst<amdgpu_synid7_vdst64_0>`::ref:`b32x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2_b64                   :ref:`vdst<amdgpu_synid7_vdst128_0>`::ref:`b64x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2st64_b32               :ref:`vdst<amdgpu_synid7_vdst64_0>`::ref:`b32x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2st64_b64               :ref:`vdst<amdgpu_synid7_vdst128_0>`::ref:`b64x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b128                   :ref:`vdst<amdgpu_synid7_vdst128_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b64                    :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b96                    :ref:`vdst<amdgpu_synid7_vdst96_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_i16                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_i8                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u16                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u8                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_rtn_u32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_rtn_u64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_src2_u32                           :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_src2_u64                           :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_u32                                :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_u64                                :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_rtn_u32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_rtn_u64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_src2_u32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_src2_u64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_u32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_u64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_swizzle_b32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`sw_offset16<amdgpu_synid_sw_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrap_rtn_b32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2_b32                              :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2_b64                              :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2st64_b32                          :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2st64_b64                          :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b128                              :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata128_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b16                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b32                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b64                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b8                                :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b96                               :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata96_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_src2_b32                          :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_src2_b64                          :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2_rtn_b32             :ref:`vdst<amdgpu_synid7_vdst64_0>`::ref:`b32x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2_rtn_b64             :ref:`vdst<amdgpu_synid7_vdst128_0>`::ref:`b64x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2st64_rtn_b32         :ref:`vdst<amdgpu_synid7_vdst64_0>`::ref:`b32x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata32_0>`,   :ref:`vdata1<amdgpu_synid7_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2st64_rtn_b64         :ref:`vdst<amdgpu_synid7_vdst128_0>`::ref:`b64x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata0<amdgpu_synid7_vdata64_0>`,   :ref:`vdata1<amdgpu_synid7_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg_rtn_b32              :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg_rtn_b64              :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_b32                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_b64                                 :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_rtn_b32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_rtn_b64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,       :ref:`vaddr<amdgpu_synid7_addr_ds>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_src2_b32                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_src2_b64                            :ref:`vaddr<amdgpu_synid7_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+
+EXP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**      **SRC2**      **SRC3**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    exp                            :ref:`tgt<amdgpu_synid7_tgt>`,      :ref:`vsrc0<amdgpu_synid7_src_exp>`,    :ref:`vsrc1<amdgpu_synid7_src_exp>`,    :ref:`vsrc2<amdgpu_synid7_src_exp>`,    :ref:`vsrc3<amdgpu_synid7_src_exp>`          :ref:`done<amdgpu_synid_done>` :ref:`compr<amdgpu_synid_compr>` :ref:`vm<amdgpu_synid_vm>`
+
+FLAT
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**           **SRC0**      **SRC1**             **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    flat_atomic_add                :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_add_x2             :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_and                :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_and_x2             :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_cmpswap            :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`b32x2<amdgpu_synid7_type_dev>`      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_cmpswap_x2         :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata128_0>`::ref:`b64x2<amdgpu_synid7_type_dev>`      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_dec                :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`u32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`::ref:`u32<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_dec_x2             :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`u64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`u64<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_fcmpswap           :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`f32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`f32x2<amdgpu_synid7_type_dev>`      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_fcmpswap_x2        :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`f64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata128_0>`::ref:`f64x2<amdgpu_synid7_type_dev>`      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_fmax               :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`f32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`::ref:`f32<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_fmax_x2            :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`f64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`f64<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_fmin               :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`f32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`::ref:`f32<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_fmin_x2            :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`f64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`f64<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_inc                :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`u32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`::ref:`u32<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_inc_x2             :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`u64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`u64<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_or                 :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_or_x2              :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smax               :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`s32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`::ref:`s32<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smax_x2            :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`s64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`s64<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smin               :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`s32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`::ref:`s32<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smin_x2            :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`s64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`s64<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_sub                :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_sub_x2             :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_swap               :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_swap_x2            :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umax               :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`u32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`::ref:`u32<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umax_x2            :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`u64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`u64<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umin               :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`::ref:`u32<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`::ref:`u32<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umin_x2            :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`::ref:`u64<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`::ref:`u64<amdgpu_synid7_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_xor                :ref:`vdst<amdgpu_synid7_dst_flat_atomic32>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_xor_x2             :ref:`vdst<amdgpu_synid7_dst_flat_atomic64>`::ref:`opt<amdgpu_synid7_opt>`,     :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dword                :ref:`vdst<amdgpu_synid7_vdst32_0>`,         :ref:`vaddr<amdgpu_synid7_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx2              :ref:`vdst<amdgpu_synid7_vdst64_0>`,         :ref:`vaddr<amdgpu_synid7_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx3              :ref:`vdst<amdgpu_synid7_vdst96_0>`,         :ref:`vaddr<amdgpu_synid7_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx4              :ref:`vdst<amdgpu_synid7_vdst128_0>`,         :ref:`vaddr<amdgpu_synid7_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_sbyte                :ref:`vdst<amdgpu_synid7_vdst32_0>`,         :ref:`vaddr<amdgpu_synid7_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_sshort               :ref:`vdst<amdgpu_synid7_vdst32_0>`,         :ref:`vaddr<amdgpu_synid7_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_ubyte                :ref:`vdst<amdgpu_synid7_vdst32_0>`,         :ref:`vaddr<amdgpu_synid7_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_ushort               :ref:`vdst<amdgpu_synid7_vdst32_0>`,         :ref:`vaddr<amdgpu_synid7_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_byte                              :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dword                             :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx2                           :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx3                           :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata96_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx4                           :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata128_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_short                             :ref:`vaddr<amdgpu_synid7_addr_flat>`,    :ref:`vdata<amdgpu_synid7_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+
+MIMG
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**       **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    image_atomic_add                         :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_and                         :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_cmpswap                     :ref:`vdata<amdgpu_synid7_data_mimg_atomic_cmp>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_dec                         :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_inc                         :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_or                          :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_smax                        :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_smin                        :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_sub                         :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_swap                        :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_umax                        :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_umin                        :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_xor                         :ref:`vdata<amdgpu_synid7_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid7_ret>`, :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4                  :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_b                :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_b_cl             :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_b_cl_o           :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_b_o              :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c                :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_b              :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_b_cl           :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_b_cl_o         :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_b_o            :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_cl             :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_cl_o           :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_l              :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_l_o            :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_lz             :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_lz_o           :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_c_o              :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_cl               :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_cl_o             :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_l                :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_l_o              :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_lz               :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_lz_o             :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4_o                :ref:`vdst<amdgpu_synid7_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_get_lod                  :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_get_resinfo              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load                     :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_mip                 :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_mip_pck             :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_mip_pck_sgn         :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_pck                 :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_pck_sgn             :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample                   :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_b                 :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_b_cl              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_b_cl_o            :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_b_o               :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c                 :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_b               :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_b_cl            :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_b_cl_o          :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_b_o             :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_cd              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_cd_cl           :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_cd_cl_o         :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_cd_o            :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_cl              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_cl_o            :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_d               :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_d_cl            :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_d_cl_o          :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_d_o             :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_l               :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_l_o             :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_lz              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_lz_o            :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_c_o               :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_cd                :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_cd_cl             :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_cd_cl_o           :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_cd_o              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_cl                :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_cl_o              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_d                 :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_d_cl              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_d_cl_o            :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_d_o               :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_l                 :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_l_o               :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_lz                :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_lz_o              :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample_o                 :ref:`vdst<amdgpu_synid7_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,     :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid7_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_store                              :ref:`vdata<amdgpu_synid7_data_mimg_store>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_store_mip                          :ref:`vdata<amdgpu_synid7_data_mimg_store>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_store_mip_pck                      :ref:`vdata<amdgpu_synid7_data_mimg_store>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_store_pck                          :ref:`vdata<amdgpu_synid7_data_mimg_store>`,     :ref:`vaddr<amdgpu_synid7_addr_mimg>`,    :ref:`srsrc<amdgpu_synid7_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+
+MUBUF
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**             **SRC1**      **SRC2**      **SRC3**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    buffer_atomic_add                        :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_add_x2                     :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_and                        :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_and_x2                     :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_cmpswap                    :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`::ref:`b32x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_cmpswap_x2                 :ref:`vdata<amdgpu_synid7_data_buf_atomic128>`::ref:`dst<amdgpu_synid7_ret>`::ref:`b64x2<amdgpu_synid7_type_dev>`, :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_dec                        :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_dec_x2                     :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`::ref:`u64<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_inc                        :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_inc_x2                     :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`::ref:`u64<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_or                         :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_or_x2                      :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smax                       :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`::ref:`s32<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smax_x2                    :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`::ref:`s64<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smin                       :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`::ref:`s32<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smin_x2                    :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`::ref:`s64<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_sub                        :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_sub_x2                     :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_swap                       :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_swap_x2                    :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umax                       :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umax_x2                    :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`::ref:`u64<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umin                       :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umin_x2                    :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`::ref:`u64<amdgpu_synid7_type_dev>`,   :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_xor                        :ref:`vdata<amdgpu_synid7_data_buf_atomic32>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_xor_x2                     :ref:`vdata<amdgpu_synid7_data_buf_atomic64>`::ref:`dst<amdgpu_synid7_ret>`,       :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dword              :ref:`vdst<amdgpu_synid7_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_dwordx2            :ref:`vdst<amdgpu_synid7_dst_buf_64>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dwordx3            :ref:`vdst<amdgpu_synid7_dst_buf_96>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dwordx4            :ref:`vdst<amdgpu_synid7_dst_buf_128>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_x           :ref:`vdst<amdgpu_synid7_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_format_xy          :ref:`vdst<amdgpu_synid7_dst_buf_64>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_xyz         :ref:`vdst<amdgpu_synid7_dst_buf_96>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_xyzw        :ref:`vdst<amdgpu_synid7_dst_buf_128>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_sbyte              :ref:`vdst<amdgpu_synid7_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_sshort             :ref:`vdst<amdgpu_synid7_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_ubyte              :ref:`vdst<amdgpu_synid7_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_ushort             :ref:`vdst<amdgpu_synid7_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid7_addr_buf>`,           :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_store_byte                        :ref:`vdata<amdgpu_synid7_vdata32_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dword                       :ref:`vdata<amdgpu_synid7_vdata32_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx2                     :ref:`vdata<amdgpu_synid7_vdata64_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx3                     :ref:`vdata<amdgpu_synid7_vdata96_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx4                     :ref:`vdata<amdgpu_synid7_vdata128_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_x                    :ref:`vdata<amdgpu_synid7_vdata32_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xy                   :ref:`vdata<amdgpu_synid7_vdata64_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xyz                  :ref:`vdata<amdgpu_synid7_vdata96_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xyzw                 :ref:`vdata<amdgpu_synid7_vdata128_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_short                       :ref:`vdata<amdgpu_synid7_vdata32_0>`,           :ref:`vaddr<amdgpu_synid7_addr_buf>`,    :ref:`srsrc<amdgpu_synid7_rsrc_buf>`,    :ref:`soffset<amdgpu_synid7_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_wbinvl1
+    buffer_wbinvl1_vol
+
+SMRD
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_buffer_load_dword            :ref:`sdst<amdgpu_synid7_sdst32_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_buf>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_buffer_load_dwordx16         :ref:`sdst<amdgpu_synid7_sdst512_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_buf>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_buffer_load_dwordx2          :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_buf>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_buffer_load_dwordx4          :ref:`sdst<amdgpu_synid7_sdst128_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_buf>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_buffer_load_dwordx8          :ref:`sdst<amdgpu_synid7_sdst256_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_buf>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_dcache_inv
+    s_dcache_inv_vol
+    s_load_dword                   :ref:`sdst<amdgpu_synid7_sdst32_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_addr>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_load_dwordx16                :ref:`sdst<amdgpu_synid7_sdst512_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_addr>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_load_dwordx2                 :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_addr>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_load_dwordx4                 :ref:`sdst<amdgpu_synid7_sdst128_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_addr>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_load_dwordx8                 :ref:`sdst<amdgpu_synid7_sdst256_0>`,     :ref:`sbase<amdgpu_synid7_base_smem_addr>`,    :ref:`soffset<amdgpu_synid7_offset_smem>`
+    s_memtime                      :ref:`sdst<amdgpu_synid7_sdst64_0>`
+
+SOP1
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_abs_i32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_and_saveexec_b64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_andn2_saveexec_b64           :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_bcnt0_i32_b32                :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_bcnt0_i32_b64                :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_bcnt1_i32_b32                :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_bcnt1_i32_b64                :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_bitset0_b32                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_bitset0_b64                  :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`::ref:`b32<amdgpu_synid7_type_dev>`
+    s_bitset1_b32                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_bitset1_b64                  :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`::ref:`b32<amdgpu_synid7_type_dev>`
+    s_brev_b32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_brev_b64                     :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_cbranch_join                           :ref:`ssrc<amdgpu_synid7_ssrc32_1>`
+    s_cmov_b32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_cmov_b64                     :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_ff0_i32_b32                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_ff0_i32_b64                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_ff1_i32_b32                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_ff1_i32_b64                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_flbit_i32                    :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_flbit_i32_b32                :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_flbit_i32_b64                :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_flbit_i32_i64                :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_getpc_b64                    :ref:`sdst<amdgpu_synid7_sdst64_1>`
+    s_mov_b32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_mov_b64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_mov_fed_b32                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_movreld_b32                  :ref:`sdst<amdgpu_synid7_sdst32_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_movreld_b64                  :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_movrels_b32                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_2>`
+    s_movrels_b64                  :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_1>`
+    s_nand_saveexec_b64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_nor_saveexec_b64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_not_b32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_not_b64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_or_saveexec_b64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_orn2_saveexec_b64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_quadmask_b32                 :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_quadmask_b64                 :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_rfe_b64                                :ref:`ssrc<amdgpu_synid7_ssrc64_1>`
+    s_setpc_b64                              :ref:`ssrc<amdgpu_synid7_ssrc64_1>`
+    s_sext_i32_i16                 :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_3>`
+    s_sext_i32_i8                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_3>`
+    s_swappc_b64                   :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_1>`
+    s_wqm_b32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc32_0>`
+    s_wqm_b64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_xnor_saveexec_b64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+    s_xor_saveexec_b64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`ssrc<amdgpu_synid7_ssrc64_0>`
+
+SOP2
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**       **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_absdiff_i32                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_add_i32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_add_u32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_addc_u32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_and_b32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_and_b64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+    s_andn2_b32                    :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_andn2_b64                    :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+    s_ashr_i32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_ashr_i64                     :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_bfe_i32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_bfe_i64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_bfe_u32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_bfe_u64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_bfm_b32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_bfm_b64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`::ref:`b32<amdgpu_synid7_type_dev>`, :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`b32<amdgpu_synid7_type_dev>`
+    s_cbranch_g_fork                         :ref:`ssrc0<amdgpu_synid7_ssrc64_2>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_2>`
+    s_cselect_b32                  :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cselect_b64                  :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+    s_lshl_b32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_lshl_b64                     :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_lshr_b32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_lshr_b64                     :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_max_i32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_max_u32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_min_i32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_min_u32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_mul_i32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_nand_b32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_nand_b64                     :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+    s_nor_b32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_nor_b64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+    s_or_b32                       :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_or_b64                       :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+    s_orn2_b32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_orn2_b64                     :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+    s_sub_i32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_sub_u32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_subb_u32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_xnor_b32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_xnor_b64                     :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+    s_xor_b32                      :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_xor_b64                      :ref:`sdst<amdgpu_synid7_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid7_ssrc64_0>`
+
+SOPC
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **SRC0**      **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_bitcmp0_b32                  :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_bitcmp0_b64                  :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_bitcmp1_b32                  :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_bitcmp1_b64                  :ref:`ssrc0<amdgpu_synid7_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    s_cmp_eq_i32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_eq_u32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_ge_i32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_ge_u32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_gt_i32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_gt_u32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_le_i32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_le_u32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_lg_i32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_lg_u32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_lt_i32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_cmp_lt_u32                   :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+    s_setvskip                     :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_0>`
+
+SOPK
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_addk_i32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_cbranch_i_fork                         :ref:`ssrc<amdgpu_synid7_ssrc64_3>`,     :ref:`label<amdgpu_synid7_label>`
+    s_cmovk_i32                    :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_cmpk_eq_i32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_cmpk_eq_u32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_uimm16>`
+    s_cmpk_ge_i32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_cmpk_ge_u32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_uimm16>`
+    s_cmpk_gt_i32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_cmpk_gt_u32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_uimm16>`
+    s_cmpk_le_i32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_cmpk_le_u32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_uimm16>`
+    s_cmpk_lg_i32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_cmpk_lg_u32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_uimm16>`
+    s_cmpk_lt_i32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_cmpk_lt_u32                            :ref:`ssrc<amdgpu_synid7_ssrc32_1>`,     :ref:`imm16<amdgpu_synid7_uimm16>`
+    s_getreg_b32                   :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`hwreg<amdgpu_synid7_hwreg>`
+    s_movk_i32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_mulk_i32                     :ref:`sdst<amdgpu_synid7_sdst32_1>`,     :ref:`imm16<amdgpu_synid7_simm16>`
+    s_setreg_b32                   :ref:`hwreg<amdgpu_synid7_hwreg>`,    :ref:`ssrc<amdgpu_synid7_ssrc32_1>`
+    s_setreg_imm32_b32             :ref:`hwreg<amdgpu_synid7_hwreg>`,    :ref:`imm32<amdgpu_synid7_bimm32>`
+
+SOPP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **SRC**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_barrier
+    s_branch                       :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_cdbgsys              :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_cdbgsys_and_user     :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_cdbgsys_or_user      :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_cdbguser             :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_execnz               :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_execz                :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_scc0                 :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_scc1                 :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_vccnz                :ref:`label<amdgpu_synid7_label>`
+    s_cbranch_vccz                 :ref:`label<amdgpu_synid7_label>`
+    s_decperflevel                 :ref:`imm16<amdgpu_synid7_bimm16>`
+    s_endpgm
+    s_icache_inv
+    s_incperflevel                 :ref:`imm16<amdgpu_synid7_bimm16>`
+    s_nop                          :ref:`imm16<amdgpu_synid7_bimm16>`
+    s_sendmsg                      :ref:`msg<amdgpu_synid7_msg>`
+    s_sendmsghalt                  :ref:`msg<amdgpu_synid7_msg>`
+    s_sethalt                      :ref:`imm16<amdgpu_synid7_bimm16>`
+    s_setkill                      :ref:`imm16<amdgpu_synid7_bimm16>`
+    s_setprio                      :ref:`imm16<amdgpu_synid7_bimm16>`
+    s_sleep                        :ref:`imm16<amdgpu_synid7_bimm16>`
+    s_trap                         :ref:`imm16<amdgpu_synid7_bimm16>`
+    s_ttracedata
+    s_waitcnt                      :ref:`waitcnt<amdgpu_synid7_waitcnt>`
+
+VINTRP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**       **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_interp_mov_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`param<amdgpu_synid7_param>`::ref:`b32<amdgpu_synid7_type_dev>`, :ref:`attr<amdgpu_synid7_attr>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_interp_p1_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vsrc<amdgpu_synid7_vsrc32_0>`,      :ref:`attr<amdgpu_synid7_attr>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_interp_p2_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vsrc<amdgpu_synid7_vsrc32_0>`,      :ref:`attr<amdgpu_synid7_attr>`::ref:`b32<amdgpu_synid7_type_dev>`
+
+VOP1
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_bfrev_b32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_ceil_f32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_ceil_f64                     :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_clrexcp
+    v_cos_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f16_f32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f32_f16                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_1>`
+    v_cvt_f32_f64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_cvt_f32_i32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f32_u32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f32_ubyte0               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f32_ubyte1               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f32_ubyte2               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f32_ubyte3               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f64_f32                  :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f64_i32                  :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_f64_u32                  :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_flr_i32_f32              :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_i32_f32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_i32_f64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_cvt_off_f32_i4               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_rpi_i32_f32              :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_u32_f32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_cvt_u32_f64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_exp_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_exp_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_ffbh_i32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_ffbh_u32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_ffbl_b32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_floor_f32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_floor_f64                    :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_fract_f32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_fract_f64                    :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_frexp_exp_i32_f32            :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_frexp_exp_i32_f64            :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_frexp_mant_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_frexp_mant_f64               :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_log_clamp_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_log_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_log_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_mov_b32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_mov_fed_b32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_movreld_b32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_movrels_b32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vsrc<amdgpu_synid7_vsrc32_0>`
+    v_movrelsd_b32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vsrc<amdgpu_synid7_vsrc32_0>`
+    v_nop
+    v_not_b32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_rcp_clamp_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_rcp_clamp_f64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_rcp_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_rcp_f64                      :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_rcp_iflag_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_rcp_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_readfirstlane_b32            :ref:`sdst<amdgpu_synid7_sdst32_2>`,     :ref:`vsrc<amdgpu_synid7_vsrc32_0>`
+    v_rndne_f32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_rndne_f64                    :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_rsq_clamp_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_rsq_clamp_f64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_rsq_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_rsq_f64                      :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_rsq_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_sin_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_sqrt_f32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_sqrt_f64                     :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+    v_trunc_f32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`src<amdgpu_synid7_src32_0>`
+    v_trunc_f64                    :ref:`vdst<amdgpu_synid7_vdst64_0>`,     :ref:`src<amdgpu_synid7_src64_0>`
+
+VOP2
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST0**      **DST1**      **SRC0**      **SRC1**      **SRC2**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_add_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_add_i32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_addc_u32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`,    :ref:`vcc<amdgpu_synid7_vcc_64>`
+    v_and_b32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_ashr_i32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_ashrrev_i32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`::ref:`u32<amdgpu_synid7_type_dev>`, :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_bcnt_u32_b32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_bfm_b32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cndmask_b32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`,    :ref:`vcc<amdgpu_synid7_vcc_64>`
+    v_cvt_pk_i16_i32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cvt_pk_u16_u32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cvt_pkaccum_u8_f32           :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_cvt_pknorm_i16_f32           :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cvt_pknorm_u16_f32           :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cvt_pkrtz_f16_f32            :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_ldexp_f32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`i32<amdgpu_synid7_type_dev>`
+    v_lshl_b32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_lshlrev_b32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`::ref:`u32<amdgpu_synid7_type_dev>`, :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_lshr_b32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_lshrrev_b32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`::ref:`u32<amdgpu_synid7_type_dev>`, :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mac_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mac_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_madak_f32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`,    :ref:`imm32<amdgpu_synid7_fimm32>`
+    v_madmk_f32                    :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`imm32<amdgpu_synid7_fimm32>`,    :ref:`vsrc2<amdgpu_synid7_vsrc32_0>`
+    v_max_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_max_i32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_max_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_max_u32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mbcnt_hi_u32_b32             :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mbcnt_lo_u32_b32             :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_min_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_min_i32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_min_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_min_u32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mul_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mul_hi_i32_i24               :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mul_hi_u32_u24               :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mul_i32_i24                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mul_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_mul_u32_u24                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_or_b32                       :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_readlane_b32                 :ref:`sdst<amdgpu_synid7_sdst32_2>`,               :ref:`vsrc0<amdgpu_synid7_vsrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_4>`
+    v_sub_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_sub_i32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_subb_u32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`,    :ref:`vcc<amdgpu_synid7_vcc_64>`
+    v_subbrev_u32                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`,    :ref:`vcc<amdgpu_synid7_vcc_64>`
+    v_subrev_f32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_subrev_i32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,     :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_writelane_b32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`ssrc0<amdgpu_synid7_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid7_ssrc32_4>`
+    v_xor_b32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,               :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+
+VOP3
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST0**       **DST1**      **SRC0**        **SRC1**        **SRC2**            **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_add_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_add_f64                      :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_add_i32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,      :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_addc_u32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,      :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`ssrc2<amdgpu_synid7_ssrc64_1>`
+    v_alignbit_b32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_alignbyte_b32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_and_b32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_ashr_i32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_ashr_i64                     :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_ashrrev_i32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`src1<amdgpu_synid7_src32_2>`
+    v_bcnt_u32_b32_e64             :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_bfe_i32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`src2<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_bfe_u32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_bfi_b32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_bfm_b32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_bfrev_b32_e64                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_ceil_f32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ceil_f64_e64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_clrexcp_e64
+    v_cmp_class_f32_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_cmp_class_f64_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_cmp_eq_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_eq_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_eq_i32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_eq_i64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_eq_u32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_eq_u64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_f_f32_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_f_f64_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_f_i32_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_f_i64_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_f_u32_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_f_u64_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_ge_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_ge_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_ge_i32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_ge_i64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_ge_u32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_ge_u64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_gt_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_gt_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_gt_i32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_gt_i64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_gt_u32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_gt_u64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_le_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_le_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_le_i32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_le_i64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_le_u32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_le_u64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_lg_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_lg_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_lt_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_lt_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_lt_i32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_lt_i64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_lt_u32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_lt_u64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_ne_i32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_ne_i64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_ne_u32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_ne_u64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_neq_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_neq_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_nge_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_nge_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_ngt_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_ngt_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_nle_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_nle_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_nlg_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_nlg_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_nlt_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_nlt_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_o_f32_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_o_f64_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_t_i32_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_t_i64_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_t_u32_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmp_t_u64_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmp_tru_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_tru_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_u_f32_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmp_u_f64_e64                :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_eq_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_eq_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_f_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_f_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_ge_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_ge_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_gt_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_gt_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_le_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_le_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_lg_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_lg_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_lt_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_lt_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_neq_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_neq_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_nge_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_nge_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_ngt_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_ngt_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_nle_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_nle_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_nlg_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_nlg_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_nlt_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_nlt_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_o_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_o_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_tru_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_tru_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_u_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmps_u_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_eq_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_eq_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_f_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_f_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_ge_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_ge_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_gt_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_gt_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_le_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_le_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_lg_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_lg_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_lt_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_lt_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_neq_f32_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_neq_f64_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_nge_f32_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_nge_f64_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_ngt_f32_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_ngt_f64_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_nle_f32_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_nle_f64_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_nlg_f32_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_nlg_f64_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_nlt_f32_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_nlt_f64_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_o_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_o_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_tru_f32_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_tru_f64_e64            :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_u_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpsx_u_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_class_f32_e64           :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_cmpx_class_f64_e64           :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_cmpx_eq_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_eq_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_eq_i32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_eq_i64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_eq_u32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_eq_u64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_f_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_f_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_f_i32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_f_i64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_f_u32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_f_u64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_ge_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_ge_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_ge_i32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_ge_i64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_ge_u32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_ge_u64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_gt_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_gt_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_gt_i32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_gt_i64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_gt_u32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_gt_u64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_le_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_le_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_le_i32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_le_i64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_le_u32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_le_u64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_lg_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_lg_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_lt_f32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_lt_f64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_lt_i32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_lt_i64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_lt_u32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_lt_u64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_ne_i32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_ne_i64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_ne_u32_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_ne_u64_e64              :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_neq_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_neq_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_nge_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_nge_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_ngt_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_ngt_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_nle_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_nle_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_nlg_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_nlg_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_nlt_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_nlt_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_o_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_o_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_t_i32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_t_i64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_t_u32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cmpx_t_u64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`
+    v_cmpx_tru_f32_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_tru_f64_e64             :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_u_f32_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cmpx_u_f64_e64               :ref:`sdst<amdgpu_synid7_sdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cndmask_b32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`ssrc2<amdgpu_synid7_ssrc64_1>`
+    v_cos_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubeid_f32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubema_f32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubesc_f32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubetc_f32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f16_f32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_f32_f16_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_3>`                                     :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_f64_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_i32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`                                     :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_u32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`                                     :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte0_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_cvt_f32_ubyte1_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_cvt_f32_ubyte2_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_cvt_f32_ubyte3_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_cvt_f64_f32_e64              :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f64_i32_e64              :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src32_2>`                                     :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f64_u32_e64              :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src32_2>`                                     :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_flr_i32_f32_e64          :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_i32_f32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_i32_f64_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_off_f32_i4_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`                                     :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_pk_i16_i32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cvt_pk_u16_u32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_cvt_pk_u8_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`src2<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_cvt_pkaccum_u8_f32_e64       :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_cvt_pknorm_i16_f32_e64       :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_pknorm_u16_f32_e64       :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_pkrtz_f16_f32_e64        :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_rpi_i32_f32_e64          :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_u32_f32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`
+    v_cvt_u32_f64_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_div_fixup_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fixup_f64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fmas_f32                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fmas_f64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_scale_f32                :ref:`vdst<amdgpu_synid7_vdst32_0>`,      :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_div_scale_f64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,      :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src64_1>`,       :ref:`src2<amdgpu_synid7_src64_1>`
+    v_exp_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_exp_legacy_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ffbh_i32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_ffbh_u32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_ffbl_b32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_floor_f32_e64                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_floor_f64_e64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fma_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fma_f64                      :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fract_f32_e64                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fract_f64_e64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_frexp_exp_i32_f32_e64        :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_frexp_exp_i32_f64_e64        :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`
+    v_frexp_mant_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_frexp_mant_f64_e64           :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ldexp_f32_e64                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`i32<amdgpu_synid7_type_dev>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ldexp_f64                    :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`i32<amdgpu_synid7_type_dev>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_lerp_u8                      :ref:`vdst<amdgpu_synid7_vdst32_0>`::ref:`u32<amdgpu_synid7_type_dev>`,            :ref:`src0<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`,   :ref:`src1<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`,   :ref:`src2<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_log_clamp_f32_e64            :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_log_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_log_legacy_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_lshl_b32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_lshl_b64                     :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_lshlrev_b32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`src1<amdgpu_synid7_src32_2>`
+    v_lshr_b32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_lshr_b64                     :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`,       :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_lshrrev_b32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`,   :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mac_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mac_legacy_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_f32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_i32_i24                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`::ref:`i32<amdgpu_synid7_type_dev>`
+    v_mad_i64_i32                  :ref:`vdst<amdgpu_synid7_vdst64_0>`,      :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src64_1>`::ref:`i64<amdgpu_synid7_type_dev>`
+    v_mad_legacy_f32               :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_u32_u24                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_mad_u64_u32                  :ref:`vdst<amdgpu_synid7_vdst64_0>`,      :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src64_1>`::ref:`u64<amdgpu_synid7_type_dev>`
+    v_max3_f32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max3_i32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_max3_u32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_max_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max_f64                      :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max_i32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_max_legacy_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max_u32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mbcnt_hi_u32_b32_e64         :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mbcnt_lo_u32_b32_e64         :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_med3_f32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_med3_i32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_med3_u32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_min3_f32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min3_i32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_min3_u32                     :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_min_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min_f64                      :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min_i32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_min_legacy_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min_u32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mov_b32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_mov_fed_b32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_movreld_b32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_movrels_b32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`vsrc<amdgpu_synid7_vsrc32_0>`
+    v_movrelsd_b32_e64             :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`vsrc<amdgpu_synid7_vsrc32_0>`
+    v_mqsad_pk_u16_u8              :ref:`vdst<amdgpu_synid7_vdst64_0>`::ref:`b64<amdgpu_synid7_type_dev>`,            :ref:`src0<amdgpu_synid7_src64_2>`::ref:`b64<amdgpu_synid7_type_dev>`,   :ref:`src1<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`,   :ref:`src2<amdgpu_synid7_src64_2>`::ref:`b64<amdgpu_synid7_type_dev>`
+    v_mqsad_u32_u8                 :ref:`vdst<amdgpu_synid7_vdst128_0>`::ref:`b128<amdgpu_synid7_type_dev>`,           :ref:`src0<amdgpu_synid7_src64_2>`::ref:`b64<amdgpu_synid7_type_dev>`,   :ref:`src1<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`,   :ref:`vsrc2<amdgpu_synid7_vsrc128_0>`::ref:`b128<amdgpu_synid7_type_dev>`
+    v_msad_u8                      :ref:`vdst<amdgpu_synid7_vdst32_0>`::ref:`u32<amdgpu_synid7_type_dev>`,            :ref:`src0<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`,   :ref:`src1<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`,   :ref:`src2<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_mul_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_f64                      :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_hi_i32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mul_hi_i32_i24_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mul_hi_u32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mul_hi_u32_u24_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mul_i32_i24_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mul_legacy_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_lo_i32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mul_lo_u32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mul_u32_u24_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_mullit_f32                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src2<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_nop_e64
+    v_not_b32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`
+    v_or_b32_e64                   :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_qsad_pk_u16_u8               :ref:`vdst<amdgpu_synid7_vdst64_0>`::ref:`b64<amdgpu_synid7_type_dev>`,            :ref:`src0<amdgpu_synid7_src64_2>`::ref:`b64<amdgpu_synid7_type_dev>`,   :ref:`src1<amdgpu_synid7_src32_1>`::ref:`b32<amdgpu_synid7_type_dev>`,   :ref:`src2<amdgpu_synid7_src64_2>`::ref:`b64<amdgpu_synid7_type_dev>`
+    v_rcp_clamp_f32_e64            :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_clamp_f64_e64            :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_f64_e64                  :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_iflag_f32_e64            :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_legacy_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rndne_f32_e64                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rndne_f64_e64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_clamp_f32_e64            :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_clamp_f64_e64            :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_f64_e64                  :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_legacy_f32_e64           :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sad_hi_u8                    :ref:`vdst<amdgpu_synid7_vdst32_0>`::ref:`u32<amdgpu_synid7_type_dev>`,            :ref:`src0<amdgpu_synid7_src32_1>`::ref:`u8x4<amdgpu_synid7_type_dev>`,  :ref:`src1<amdgpu_synid7_src32_1>`::ref:`u8x4<amdgpu_synid7_type_dev>`,  :ref:`src2<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_sad_u16                      :ref:`vdst<amdgpu_synid7_vdst32_0>`::ref:`u32<amdgpu_synid7_type_dev>`,            :ref:`src0<amdgpu_synid7_src32_1>`::ref:`u16x2<amdgpu_synid7_type_dev>`, :ref:`src1<amdgpu_synid7_src32_1>`::ref:`u16x2<amdgpu_synid7_type_dev>`, :ref:`src2<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_sad_u32                      :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`src2<amdgpu_synid7_src32_2>`
+    v_sad_u8                       :ref:`vdst<amdgpu_synid7_vdst32_0>`::ref:`u32<amdgpu_synid7_type_dev>`,            :ref:`src0<amdgpu_synid7_src32_1>`::ref:`u8x4<amdgpu_synid7_type_dev>`,  :ref:`src1<amdgpu_synid7_src32_1>`::ref:`u8x4<amdgpu_synid7_type_dev>`,  :ref:`src2<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`
+    v_sin_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sqrt_f32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sqrt_f64_e64                 :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sub_f32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sub_i32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,      :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_subb_u32_e64                 :ref:`vdst<amdgpu_synid7_vdst32_0>`,      :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`ssrc2<amdgpu_synid7_ssrc64_1>`
+    v_subbrev_u32_e64              :ref:`vdst<amdgpu_synid7_vdst32_0>`,      :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`,       :ref:`ssrc2<amdgpu_synid7_ssrc64_1>`
+    v_subrev_f32_e64               :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_subrev_i32_e64               :ref:`vdst<amdgpu_synid7_vdst32_0>`,      :ref:`sdst<amdgpu_synid7_sdst64_0>`,     :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+    v_trig_preop_f64               :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src0<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`,     :ref:`src1<amdgpu_synid7_src32_2>`::ref:`u32<amdgpu_synid7_type_dev>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_trunc_f32_e64                :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src<amdgpu_synid7_src32_2>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_trunc_f64_e64                :ref:`vdst<amdgpu_synid7_vdst64_0>`,                :ref:`src<amdgpu_synid7_src64_1>`::ref:`m<amdgpu_synid7_mod>`                                   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_xor_b32_e64                  :ref:`vdst<amdgpu_synid7_vdst32_0>`,                :ref:`src0<amdgpu_synid7_src32_2>`,       :ref:`src1<amdgpu_synid7_src32_2>`
+
+VOPC
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_cmp_class_f32                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_cmp_class_f64                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_cmp_eq_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_eq_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_eq_i32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_eq_i64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_eq_u32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_eq_u64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_f_f32                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_f_f64                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_f_i32                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_f_i64                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_f_u32                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_f_u64                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_ge_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_ge_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_ge_i32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_ge_i64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_ge_u32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_ge_u64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_gt_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_gt_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_gt_i32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_gt_i64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_gt_u32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_gt_u64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_le_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_le_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_le_i32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_le_i64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_le_u32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_le_u64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_lg_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_lg_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_lt_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_lt_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_lt_i32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_lt_i64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_lt_u32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_lt_u64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_ne_i32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_ne_i64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_ne_u32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_ne_u64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_neq_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_neq_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_nge_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_nge_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_ngt_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_ngt_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_nle_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_nle_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_nlg_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_nlg_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_nlt_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_nlt_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_o_f32                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_o_f64                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_t_i32                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_t_i64                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_t_u32                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_t_u64                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_tru_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_tru_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmp_u_f32                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmp_u_f64                    :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_eq_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_eq_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_f_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_f_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_ge_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_ge_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_gt_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_gt_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_le_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_le_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_lg_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_lg_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_lt_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_lt_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_neq_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_neq_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_nge_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_nge_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_ngt_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_ngt_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_nle_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_nle_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_nlg_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_nlg_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_nlt_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_nlt_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_o_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_o_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_tru_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_tru_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmps_u_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmps_u_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_eq_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_eq_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_f_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_f_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_ge_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_ge_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_gt_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_gt_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_le_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_le_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_lg_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_lg_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_lt_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_lt_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_neq_f32                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_neq_f64                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_nge_f32                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_nge_f64                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_ngt_f32                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_ngt_f64                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_nle_f32                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_nle_f64                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_nlg_f32                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_nlg_f64                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_nlt_f32                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_nlt_f64                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_o_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_o_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_tru_f32                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_tru_f64                :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpsx_u_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpsx_u_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_class_f32               :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_cmpx_class_f64               :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`::ref:`b32<amdgpu_synid7_type_dev>`
+    v_cmpx_eq_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_eq_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_eq_i32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_eq_i64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_eq_u32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_eq_u64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_f_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_f_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_f_i32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_f_i64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_f_u32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_f_u64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_ge_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_ge_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_ge_i32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_ge_i64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_ge_u32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_ge_u64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_gt_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_gt_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_gt_i32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_gt_i64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_gt_u32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_gt_u64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_le_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_le_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_le_i32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_le_i64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_le_u32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_le_u64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_lg_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_lg_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_lt_f32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_lt_f64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_lt_i32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_lt_i64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_lt_u32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_lt_u64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_ne_i32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_ne_i64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_ne_u32                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_ne_u64                  :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_neq_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_neq_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_nge_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_nge_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_ngt_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_ngt_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_nle_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_nle_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_nlg_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_nlg_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_nlt_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_nlt_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_o_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_o_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_t_i32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_t_i64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_t_u32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_t_u64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_tru_f32                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_tru_f64                 :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+    v_cmpx_u_f32                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src32_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc32_0>`
+    v_cmpx_u_f64                   :ref:`vcc<amdgpu_synid7_vcc_64>`,      :ref:`src0<amdgpu_synid7_src64_0>`,     :ref:`vsrc1<amdgpu_synid7_vsrc64_0>`
+
+.. |---| unicode:: U+02014 .. em dash
+
+
+.. toctree::
+    :hidden:
+
+    gfx7_attr
+    gfx7_bimm16
+    gfx7_bimm32
+    gfx7_fimm32
+    gfx7_hwreg
+    gfx7_label
+    gfx7_msg
+    gfx7_param
+    gfx7_simm16
+    gfx7_tgt
+    gfx7_uimm16
+    gfx7_waitcnt
+    gfx7_addr_buf
+    gfx7_addr_ds
+    gfx7_addr_flat
+    gfx7_addr_mimg
+    gfx7_base_smem_addr
+    gfx7_base_smem_buf
+    gfx7_data_buf_atomic128
+    gfx7_data_buf_atomic32
+    gfx7_data_buf_atomic64
+    gfx7_data_mimg_atomic_cmp
+    gfx7_data_mimg_atomic_reg
+    gfx7_data_mimg_store
+    gfx7_dst_buf_128
+    gfx7_dst_buf_64
+    gfx7_dst_buf_96
+    gfx7_dst_buf_lds
+    gfx7_dst_flat_atomic32
+    gfx7_dst_flat_atomic64
+    gfx7_dst_mimg_gather4
+    gfx7_dst_mimg_regular
+    gfx7_offset_buf
+    gfx7_offset_smem
+    gfx7_rsrc_buf
+    gfx7_rsrc_mimg
+    gfx7_samp_mimg
+    gfx7_sdst128_0
+    gfx7_sdst256_0
+    gfx7_sdst32_0
+    gfx7_sdst32_1
+    gfx7_sdst32_2
+    gfx7_sdst512_0
+    gfx7_sdst64_0
+    gfx7_sdst64_1
+    gfx7_src32_0
+    gfx7_src32_1
+    gfx7_src32_2
+    gfx7_src32_3
+    gfx7_src64_0
+    gfx7_src64_1
+    gfx7_src64_2
+    gfx7_src_exp
+    gfx7_ssrc32_0
+    gfx7_ssrc32_1
+    gfx7_ssrc32_2
+    gfx7_ssrc32_3
+    gfx7_ssrc32_4
+    gfx7_ssrc64_0
+    gfx7_ssrc64_1
+    gfx7_ssrc64_2
+    gfx7_ssrc64_3
+    gfx7_vcc_64
+    gfx7_vdata128_0
+    gfx7_vdata32_0
+    gfx7_vdata64_0
+    gfx7_vdata96_0
+    gfx7_vdst128_0
+    gfx7_vdst32_0
+    gfx7_vdst64_0
+    gfx7_vdst96_0
+    gfx7_vsrc128_0
+    gfx7_vsrc32_0
+    gfx7_vsrc64_0
+    gfx7_mod
+    gfx7_opt
+    gfx7_ret
+    gfx7_type_dev
diff --git a/docs/AMDGPU/AMDGPUAsmGFX8.rst b/docs/AMDGPU/AMDGPUAsmGFX8.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9dc78e10089125b7263a21ac0fee4660fac3bdf4
--- /dev/null
+++ b/docs/AMDGPU/AMDGPUAsmGFX8.rst
@@ -0,0 +1,1846 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+============================
+Syntax of GFX8 Instructions
+============================
+
+.. contents::
+  :local:
+
+Notation
+========
+
+Notation used in this document is explained :ref:`here<amdgpu_syn_instruction_notation>`.
+
+Introduction
+============
+
+An overview of generic syntax and other features of AMDGPU instructions may be found :ref:`in this document<amdgpu_syn_instructions>`.
+
+Instructions
+============
+
+
+DS
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**         **SRC0**      **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    ds_add_f32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_rtn_f32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_rtn_u32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_rtn_u64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_src2_f32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_src2_u32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_src2_u64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_u32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_u64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_b32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_b64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_rtn_b32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_rtn_b64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_src2_b32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_src2_b64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_append                      :ref:`vdst<amdgpu_synid8_vdst32_0>`                                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_bpermute_b32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>`
+    ds_cmpst_b32                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_b64                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_f32                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_f64                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_b32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_b64               :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_f32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_f64               :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_condxchg32_rtn_b64          :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_consume                     :ref:`vdst<amdgpu_synid8_vdst32_0>`                                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_rtn_u32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_rtn_u64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_src2_u32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_src2_u64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_u32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_u64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_barrier                             :ref:`vdata<amdgpu_synid8_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_init                                :ref:`vdata<amdgpu_synid8_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_br                             :ref:`vdata<amdgpu_synid8_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_p                                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_release_all                                                       :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_v                                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_rtn_u32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_rtn_u64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_src2_u32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_src2_u64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_u32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_u64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_f32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_f64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_i32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_i64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_f32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_f64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_i32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_i64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_u32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_u64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_f32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_f64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_i32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_i64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_u32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_u64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_u32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_u64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_f32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_f64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_i32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_i64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_f32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_f64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_i32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_i64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_u32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_u64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_f32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_f64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_i32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_i64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_u32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_u64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_u32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_u64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_b32                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_b64                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_rtn_b32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_rtn_b64               :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_nop
+    ds_or_b32                                  :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_b64                                  :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_rtn_b32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_rtn_b64                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_src2_b32                             :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_src2_b64                             :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_ordered_count               :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_permute_b32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>`
+    ds_read2_b32                   :ref:`vdst<amdgpu_synid8_vdst64_0>`::ref:`b32x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2_b64                   :ref:`vdst<amdgpu_synid8_vdst128_0>`::ref:`b64x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2st64_b32               :ref:`vdst<amdgpu_synid8_vdst64_0>`::ref:`b32x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2st64_b64               :ref:`vdst<amdgpu_synid8_vdst128_0>`::ref:`b64x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b128                   :ref:`vdst<amdgpu_synid8_vdst128_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b64                    :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b96                    :ref:`vdst<amdgpu_synid8_vdst96_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_i16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_i8                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u8                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_rtn_u32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_rtn_u64                :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_src2_u32                           :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_src2_u64                           :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_u32                                :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_u64                                :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_rtn_u32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_rtn_u64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_src2_u32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_src2_u64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_u32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_u64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_swizzle_b32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`sw_offset16<amdgpu_synid_sw_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrap_rtn_b32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2_b32                              :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2_b64                              :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2st64_b32                          :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2st64_b64                          :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b128                              :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata128_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b16                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b32                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b64                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b8                                :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b96                               :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata96_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_src2_b32                          :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_src2_b64                          :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2_rtn_b32             :ref:`vdst<amdgpu_synid8_vdst64_0>`::ref:`b32x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2_rtn_b64             :ref:`vdst<amdgpu_synid8_vdst128_0>`::ref:`b64x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2st64_rtn_b32         :ref:`vdst<amdgpu_synid8_vdst64_0>`::ref:`b32x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata32_0>`,   :ref:`vdata1<amdgpu_synid8_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2st64_rtn_b64         :ref:`vdst<amdgpu_synid8_vdst128_0>`::ref:`b64x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata0<amdgpu_synid8_vdata64_0>`,   :ref:`vdata1<amdgpu_synid8_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg_rtn_b32              :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg_rtn_b64              :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_b32                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_b64                                 :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_rtn_b32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_rtn_b64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,       :ref:`vaddr<amdgpu_synid8_addr_ds>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_src2_b32                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_src2_b64                            :ref:`vaddr<amdgpu_synid8_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+
+EXP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**      **SRC2**      **SRC3**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    exp                            :ref:`tgt<amdgpu_synid8_tgt>`,      :ref:`vsrc0<amdgpu_synid8_src_exp>`,    :ref:`vsrc1<amdgpu_synid8_src_exp>`,    :ref:`vsrc2<amdgpu_synid8_src_exp>`,    :ref:`vsrc3<amdgpu_synid8_src_exp>`          :ref:`done<amdgpu_synid_done>` :ref:`compr<amdgpu_synid_compr>` :ref:`vm<amdgpu_synid_vm>`
+
+FLAT
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**           **SRC0**      **SRC1**             **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    flat_atomic_add                :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_add_x2             :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_and                :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_and_x2             :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_cmpswap            :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`::ref:`b32x2<amdgpu_synid8_type_dev>`      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_cmpswap_x2         :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata128_0>`::ref:`b64x2<amdgpu_synid8_type_dev>`      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_dec                :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`::ref:`u32<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`::ref:`u32<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_dec_x2             :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`::ref:`u64<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`::ref:`u64<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_inc                :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`::ref:`u32<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`::ref:`u32<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_inc_x2             :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`::ref:`u64<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`::ref:`u64<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_or                 :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_or_x2              :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smax               :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`::ref:`s32<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`::ref:`s32<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smax_x2            :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`::ref:`s64<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`::ref:`s64<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smin               :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`::ref:`s32<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`::ref:`s32<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smin_x2            :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`::ref:`s64<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`::ref:`s64<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_sub                :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_sub_x2             :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_swap               :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_swap_x2            :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umax               :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`::ref:`u32<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`::ref:`u32<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umax_x2            :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`::ref:`u64<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`::ref:`u64<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umin               :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`::ref:`u32<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`::ref:`u32<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umin_x2            :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`::ref:`u64<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`::ref:`u64<amdgpu_synid8_type_dev>`        :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_xor                :ref:`vdst<amdgpu_synid8_dst_flat_atomic32>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_xor_x2             :ref:`vdst<amdgpu_synid8_dst_flat_atomic64>`::ref:`opt<amdgpu_synid8_opt>`,     :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dword                :ref:`vdst<amdgpu_synid8_vdst32_0>`,         :ref:`vaddr<amdgpu_synid8_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx2              :ref:`vdst<amdgpu_synid8_vdst64_0>`,         :ref:`vaddr<amdgpu_synid8_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx3              :ref:`vdst<amdgpu_synid8_vdst96_0>`,         :ref:`vaddr<amdgpu_synid8_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx4              :ref:`vdst<amdgpu_synid8_vdst128_0>`,         :ref:`vaddr<amdgpu_synid8_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_sbyte                :ref:`vdst<amdgpu_synid8_vdst32_0>`,         :ref:`vaddr<amdgpu_synid8_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_sshort               :ref:`vdst<amdgpu_synid8_vdst32_0>`,         :ref:`vaddr<amdgpu_synid8_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_ubyte                :ref:`vdst<amdgpu_synid8_vdst32_0>`,         :ref:`vaddr<amdgpu_synid8_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_ushort               :ref:`vdst<amdgpu_synid8_vdst32_0>`,         :ref:`vaddr<amdgpu_synid8_addr_flat>`                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_byte                              :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dword                             :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx2                           :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata64_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx3                           :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata96_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx4                           :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata128_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_short                             :ref:`vaddr<amdgpu_synid8_addr_flat>`,    :ref:`vdata<amdgpu_synid8_vdata32_0>`            :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+
+MIMG
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**       **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    image_atomic_add                         :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_and                         :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_cmpswap                     :ref:`vdata<amdgpu_synid8_data_mimg_atomic_cmp>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_dec                         :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_inc                         :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_or                          :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_smax                        :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_smin                        :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_sub                         :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_swap                        :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_umax                        :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_umin                        :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_xor                         :ref:`vdata<amdgpu_synid8_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid8_ret>`, :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4                  :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_b                :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_b_cl             :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_b_cl_o           :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_b_o              :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c                :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_b              :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_b_cl           :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_b_cl_o         :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_b_o            :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_cl             :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_cl_o           :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_l              :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_l_o            :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_lz             :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_lz_o           :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_o              :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_cl               :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_cl_o             :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_l                :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_l_o              :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_lz               :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_lz_o             :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_o                :ref:`vdst<amdgpu_synid8_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_get_lod                  :ref:`vdst<amdgpu_synid8_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_get_resinfo              :ref:`vdst<amdgpu_synid8_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load                     :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_load_mip                 :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_load_mip_pck             :ref:`vdst<amdgpu_synid8_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_mip_pck_sgn         :ref:`vdst<amdgpu_synid8_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_pck                 :ref:`vdst<amdgpu_synid8_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_pck_sgn             :ref:`vdst<amdgpu_synid8_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample                   :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_b                 :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_b_cl              :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_b_cl_o            :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_b_o               :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c                 :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_b               :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_b_cl            :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_b_cl_o          :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_b_o             :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cd              :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cd_cl           :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cd_cl_o         :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cd_o            :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cl              :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cl_o            :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_d               :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_d_cl            :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_d_cl_o          :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_d_o             :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_l               :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_l_o             :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_lz              :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_lz_o            :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_o               :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cd                :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cd_cl             :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cd_cl_o           :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cd_o              :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cl                :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cl_o              :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_d                 :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_d_cl              :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_d_cl_o            :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_d_o               :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_l                 :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_l_o               :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_lz                :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_lz_o              :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_o                 :ref:`vdst<amdgpu_synid8_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,     :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid8_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_store                              :ref:`vdata<amdgpu_synid8_data_mimg_store_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_store_mip                          :ref:`vdata<amdgpu_synid8_data_mimg_store_d16>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_store_mip_pck                      :ref:`vdata<amdgpu_synid8_data_mimg_store>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_store_pck                          :ref:`vdata<amdgpu_synid8_data_mimg_store>`,     :ref:`vaddr<amdgpu_synid8_addr_mimg>`,    :ref:`srsrc<amdgpu_synid8_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+
+MUBUF
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**             **SRC1**      **SRC2**      **SRC3**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    buffer_atomic_add                        :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_add_x2                     :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_and                        :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_and_x2                     :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_cmpswap                    :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`::ref:`b32x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_cmpswap_x2                 :ref:`vdata<amdgpu_synid8_data_buf_atomic128>`::ref:`dst<amdgpu_synid8_ret>`::ref:`b64x2<amdgpu_synid8_type_dev>`, :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_dec                        :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_dec_x2                     :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`::ref:`u64<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_inc                        :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_inc_x2                     :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`::ref:`u64<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_or                         :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_or_x2                      :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smax                       :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`::ref:`s32<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smax_x2                    :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`::ref:`s64<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smin                       :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`::ref:`s32<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smin_x2                    :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`::ref:`s64<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_sub                        :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_sub_x2                     :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_swap                       :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_swap_x2                    :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umax                       :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umax_x2                    :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`::ref:`u64<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umin                       :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umin_x2                    :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`::ref:`u64<amdgpu_synid8_type_dev>`,   :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_xor                        :ref:`vdata<amdgpu_synid8_data_buf_atomic32>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_xor_x2                     :ref:`vdata<amdgpu_synid8_data_buf_atomic64>`::ref:`dst<amdgpu_synid8_ret>`,       :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dword              :ref:`vdst<amdgpu_synid8_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_dwordx2            :ref:`vdst<amdgpu_synid8_dst_buf_64>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dwordx3            :ref:`vdst<amdgpu_synid8_dst_buf_96>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dwordx4            :ref:`vdst<amdgpu_synid8_dst_buf_128>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_x       :ref:`vdst<amdgpu_synid8_dst_buf_d16_32>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_xy      :ref:`vdst<amdgpu_synid8_dst_buf_d16_64>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_xyz     :ref:`vdst<amdgpu_synid8_dst_buf_d16_96>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_xyzw    :ref:`vdst<amdgpu_synid8_dst_buf_d16_128>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_x           :ref:`vdst<amdgpu_synid8_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_format_xy          :ref:`vdst<amdgpu_synid8_dst_buf_64>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_xyz         :ref:`vdst<amdgpu_synid8_dst_buf_96>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_xyzw        :ref:`vdst<amdgpu_synid8_dst_buf_128>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_sbyte              :ref:`vdst<amdgpu_synid8_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_sshort             :ref:`vdst<amdgpu_synid8_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_ubyte              :ref:`vdst<amdgpu_synid8_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_ushort             :ref:`vdst<amdgpu_synid8_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid8_addr_buf>`,           :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_store_byte                        :ref:`vdata<amdgpu_synid8_vdata32_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dword                       :ref:`vdata<amdgpu_synid8_vdata32_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx2                     :ref:`vdata<amdgpu_synid8_vdata64_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx3                     :ref:`vdata<amdgpu_synid8_vdata96_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx4                     :ref:`vdata<amdgpu_synid8_vdata128_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_x                :ref:`vdata<amdgpu_synid8_data_buf_d16_32>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_xy               :ref:`vdata<amdgpu_synid8_data_buf_d16_64>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_xyz              :ref:`vdata<amdgpu_synid8_data_buf_d16_96>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_xyzw             :ref:`vdata<amdgpu_synid8_data_buf_d16_128>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_x                    :ref:`vdata<amdgpu_synid8_vdata32_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xy                   :ref:`vdata<amdgpu_synid8_vdata64_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xyz                  :ref:`vdata<amdgpu_synid8_vdata96_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xyzw                 :ref:`vdata<amdgpu_synid8_vdata128_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_lds_dword                   :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,           :ref:`soffset<amdgpu_synid8_offset_buf>`                            :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`lds<amdgpu_synid_lds>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_short                       :ref:`vdata<amdgpu_synid8_vdata32_0>`,           :ref:`vaddr<amdgpu_synid8_addr_buf>`,    :ref:`srsrc<amdgpu_synid8_rsrc_buf>`,    :ref:`soffset<amdgpu_synid8_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_wbinvl1
+    buffer_wbinvl1_vol
+
+SMEM
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_atc_probe                              :ref:`imm3<amdgpu_synid8_perm_smem>`,     :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`
+    s_atc_probe_buffer                       :ref:`imm3<amdgpu_synid8_perm_smem>`,     :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`
+    s_buffer_load_dword            :ref:`sdst<amdgpu_synid8_sdst32_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dwordx16         :ref:`sdst<amdgpu_synid8_sdst512_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dwordx2          :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dwordx4          :ref:`sdst<amdgpu_synid8_sdst128_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dwordx8          :ref:`sdst<amdgpu_synid8_sdst256_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_store_dword                     :ref:`sdata<amdgpu_synid8_sdata32_0>`,    :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_store>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_store_dwordx2                   :ref:`sdata<amdgpu_synid8_sdata64_0>`,    :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_store>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_store_dwordx4                   :ref:`sdata<amdgpu_synid8_sdata128_0>`,    :ref:`sbase<amdgpu_synid8_base_smem_buf>`,    :ref:`soffset<amdgpu_synid8_offset_smem_store>`        :ref:`glc<amdgpu_synid_glc>`
+    s_dcache_inv
+    s_dcache_inv_vol
+    s_dcache_wb
+    s_dcache_wb_vol
+    s_load_dword                   :ref:`sdst<amdgpu_synid8_sdst32_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_load_dwordx16                :ref:`sdst<amdgpu_synid8_sdst512_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_load_dwordx2                 :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_load_dwordx4                 :ref:`sdst<amdgpu_synid8_sdst128_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_load_dwordx8                 :ref:`sdst<amdgpu_synid8_sdst256_0>`,     :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_load>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_memrealtime                  :ref:`sdst<amdgpu_synid8_sdst64_0>`
+    s_memtime                      :ref:`sdst<amdgpu_synid8_sdst64_0>`
+    s_store_dword                            :ref:`sdata<amdgpu_synid8_sdata32_0>`,    :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_store>`        :ref:`glc<amdgpu_synid_glc>`
+    s_store_dwordx2                          :ref:`sdata<amdgpu_synid8_sdata64_0>`,    :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_store>`        :ref:`glc<amdgpu_synid_glc>`
+    s_store_dwordx4                          :ref:`sdata<amdgpu_synid8_sdata128_0>`,    :ref:`sbase<amdgpu_synid8_base_smem_addr>`,    :ref:`soffset<amdgpu_synid8_offset_smem_store>`        :ref:`glc<amdgpu_synid_glc>`
+
+SOP1
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_abs_i32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_and_saveexec_b64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_andn2_saveexec_b64           :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_bcnt0_i32_b32                :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_bcnt0_i32_b64                :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_bcnt1_i32_b32                :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_bcnt1_i32_b64                :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_bitset0_b32                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_bitset0_b64                  :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    s_bitset1_b32                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_bitset1_b64                  :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    s_brev_b32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_brev_b64                     :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_cbranch_join                           :ref:`ssrc<amdgpu_synid8_ssrc32_1>`
+    s_cmov_b32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_cmov_b64                     :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_ff0_i32_b32                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_ff0_i32_b64                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_ff1_i32_b32                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_ff1_i32_b64                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_flbit_i32                    :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_flbit_i32_b32                :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_flbit_i32_b64                :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_flbit_i32_i64                :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_getpc_b64                    :ref:`sdst<amdgpu_synid8_sdst64_1>`
+    s_mov_b32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_mov_b64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_mov_fed_b32                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_movreld_b32                  :ref:`sdst<amdgpu_synid8_sdst32_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_movreld_b64                  :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_movrels_b32                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_1>`
+    s_movrels_b64                  :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_1>`
+    s_nand_saveexec_b64            :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_nor_saveexec_b64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_not_b32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_not_b64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_or_saveexec_b64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_orn2_saveexec_b64            :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_quadmask_b32                 :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_quadmask_b64                 :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_rfe_b64                                :ref:`ssrc<amdgpu_synid8_ssrc64_1>`
+    s_set_gpr_idx_idx                        :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_setpc_b64                              :ref:`ssrc<amdgpu_synid8_ssrc64_1>`
+    s_sext_i32_i16                 :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_sext_i32_i8                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_swappc_b64                   :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_1>`
+    s_wqm_b32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc32_0>`
+    s_wqm_b64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_xnor_saveexec_b64            :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+    s_xor_saveexec_b64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`ssrc<amdgpu_synid8_ssrc64_0>`
+
+SOP2
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**       **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_absdiff_i32                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_add_i32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_add_u32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_addc_u32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_and_b32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_and_b64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_andn2_b32                    :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_andn2_b64                    :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_ashr_i32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_ashr_i64                     :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_bfe_i32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_bfe_i64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_bfe_u32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_bfe_u64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_bfm_b32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_bfm_b64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`, :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    s_cbranch_g_fork                         :ref:`ssrc0<amdgpu_synid8_ssrc64_2>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_2>`
+    s_cselect_b32                  :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cselect_b64                  :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_lshl_b32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_lshl_b64                     :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_lshr_b32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_lshr_b64                     :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_max_i32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_max_u32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_min_i32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_min_u32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_mul_i32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_nand_b32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_nand_b64                     :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_nor_b32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_nor_b64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_or_b32                       :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_or_b64                       :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_orn2_b32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_orn2_b64                     :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_rfe_restore_b64                        :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    s_sub_i32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_sub_u32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_subb_u32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_xnor_b32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_xnor_b64                     :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_xor_b32                      :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_xor_b64                      :ref:`sdst<amdgpu_synid8_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,     :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+
+SOPC
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **SRC0**      **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_bitcmp0_b32                  :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_bitcmp0_b64                  :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_bitcmp1_b32                  :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_bitcmp1_b64                  :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`
+    s_cmp_eq_i32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_eq_u32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_eq_u64                   :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_cmp_ge_i32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_ge_u32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_gt_i32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_gt_u32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_le_i32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_le_u32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_lg_i32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_lg_u32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_lg_u64                   :ref:`ssrc0<amdgpu_synid8_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc64_0>`
+    s_cmp_lt_i32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_cmp_lt_u32                   :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+    s_set_gpr_idx_on               :ref:`ssrc<amdgpu_synid8_ssrc32_0>`,     :ref:`imm4<amdgpu_synid8_imm4>`
+    s_setvskip                     :ref:`ssrc0<amdgpu_synid8_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid8_ssrc32_0>`
+
+SOPK
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_addk_i32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_cbranch_i_fork                         :ref:`ssrc<amdgpu_synid8_ssrc64_3>`,     :ref:`label<amdgpu_synid8_label>`
+    s_cmovk_i32                    :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_cmpk_eq_i32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_cmpk_eq_u32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_uimm16>`
+    s_cmpk_ge_i32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_cmpk_ge_u32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_uimm16>`
+    s_cmpk_gt_i32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_cmpk_gt_u32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_uimm16>`
+    s_cmpk_le_i32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_cmpk_le_u32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_uimm16>`
+    s_cmpk_lg_i32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_cmpk_lg_u32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_uimm16>`
+    s_cmpk_lt_i32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_cmpk_lt_u32                            :ref:`ssrc<amdgpu_synid8_ssrc32_2>`,     :ref:`imm16<amdgpu_synid8_uimm16>`
+    s_getreg_b32                   :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`hwreg<amdgpu_synid8_hwreg>`
+    s_movk_i32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_mulk_i32                     :ref:`sdst<amdgpu_synid8_sdst32_1>`,     :ref:`imm16<amdgpu_synid8_simm16>`
+    s_setreg_b32                   :ref:`hwreg<amdgpu_synid8_hwreg>`,    :ref:`ssrc<amdgpu_synid8_ssrc32_2>`
+    s_setreg_imm32_b32             :ref:`hwreg<amdgpu_synid8_hwreg>`,    :ref:`imm32<amdgpu_synid8_bimm32>`
+
+SOPP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **SRC**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_barrier
+    s_branch                       :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_cdbgsys              :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_cdbgsys_and_user     :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_cdbgsys_or_user      :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_cdbguser             :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_execnz               :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_execz                :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_scc0                 :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_scc1                 :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_vccnz                :ref:`label<amdgpu_synid8_label>`
+    s_cbranch_vccz                 :ref:`label<amdgpu_synid8_label>`
+    s_decperflevel                 :ref:`imm16<amdgpu_synid8_bimm16>`
+    s_endpgm
+    s_endpgm_saved
+    s_icache_inv
+    s_incperflevel                 :ref:`imm16<amdgpu_synid8_bimm16>`
+    s_nop                          :ref:`imm16<amdgpu_synid8_bimm16>`
+    s_sendmsg                      :ref:`msg<amdgpu_synid8_msg>`
+    s_sendmsghalt                  :ref:`msg<amdgpu_synid8_msg>`
+    s_set_gpr_idx_mode             :ref:`imm4<amdgpu_synid8_imm4>`
+    s_set_gpr_idx_off
+    s_sethalt                      :ref:`imm16<amdgpu_synid8_bimm16>`
+    s_setkill                      :ref:`imm16<amdgpu_synid8_bimm16>`
+    s_setprio                      :ref:`imm16<amdgpu_synid8_bimm16>`
+    s_sleep                        :ref:`imm16<amdgpu_synid8_bimm16>`
+    s_trap                         :ref:`imm16<amdgpu_synid8_bimm16>`
+    s_ttracedata
+    s_waitcnt                      :ref:`waitcnt<amdgpu_synid8_waitcnt>`
+    s_wakeup
+
+VINTRP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**       **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_interp_mov_f32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`param<amdgpu_synid8_param>`::ref:`b32<amdgpu_synid8_type_dev>`, :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_interp_p1_f32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`,      :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_interp_p2_f32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`,      :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`
+
+VOP1
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC**            **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_bfrev_b32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_bfrev_b32_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_bfrev_b32_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ceil_f16                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_ceil_f16_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ceil_f16_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ceil_f32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_ceil_f32_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ceil_f32_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ceil_f64                     :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_clrexcp
+    v_cos_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cos_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cos_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cos_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cos_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cos_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f16_f32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f16_f32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f16_f32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f16_i16                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f16_i16_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f16_i16_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f16_u16                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f16_u16_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f16_u16_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_f16                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f32_f16_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_f16_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_f64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_cvt_f32_i32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f32_i32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_i32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_u32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f32_u32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_u32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_ubyte0               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f32_ubyte0_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_ubyte0_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_ubyte1               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f32_ubyte1_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_ubyte1_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_ubyte2               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f32_ubyte2_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_ubyte2_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_ubyte3               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f32_ubyte3_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_ubyte3_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f64_f32                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f64_i32                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_f64_u32                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_flr_i32_f32              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_flr_i32_f32_dpp          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_flr_i32_f32_sdwa         :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_i16_f16                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_i16_f16_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_i16_f16_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_i32_f32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_i32_f32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_i32_f32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_i32_f64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_cvt_off_f32_i4               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_off_f32_i4_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_off_f32_i4_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_rpi_i32_f32              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_rpi_i32_f32_dpp          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_rpi_i32_f32_sdwa         :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_u16_f16                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_u16_f16_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_u16_f16_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_u32_f32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_cvt_u32_f32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_u32_f32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_u32_f64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_exp_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_exp_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_exp_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_exp_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_exp_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_exp_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_exp_legacy_f32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_exp_legacy_f32_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_exp_legacy_f32_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ffbh_i32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_ffbh_i32_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ffbh_i32_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ffbh_u32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_ffbh_u32_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ffbh_u32_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ffbl_b32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_ffbl_b32_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ffbl_b32_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_floor_f16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_floor_f16_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_floor_f16_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_floor_f32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_floor_f32_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_floor_f32_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_floor_f64                    :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_fract_f16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_fract_f16_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_fract_f16_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_fract_f32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_fract_f32_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_fract_f32_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_fract_f64                    :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_frexp_exp_i16_f16            :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_frexp_exp_i16_f16_dpp        :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_frexp_exp_i16_f16_sdwa       :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_frexp_exp_i32_f32            :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_frexp_exp_i32_f32_dpp        :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_frexp_exp_i32_f32_sdwa       :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_frexp_exp_i32_f64            :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_frexp_mant_f16               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_frexp_mant_f16_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_frexp_mant_f16_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_frexp_mant_f32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_frexp_mant_f32_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_frexp_mant_f32_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_frexp_mant_f64               :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_log_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_log_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_log_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_log_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_log_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_log_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_log_legacy_f32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_log_legacy_f32_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_log_legacy_f32_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_mov_b32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_mov_b32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mov_b32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_mov_fed_b32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_mov_fed_b32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mov_fed_b32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_movreld_b32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_movrels_b32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`
+    v_movrelsd_b32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`
+    v_nop
+    v_not_b32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_not_b32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_not_b32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`         :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rcp_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_rcp_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rcp_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rcp_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_rcp_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rcp_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rcp_f64                      :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_rcp_iflag_f32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_rcp_iflag_f32_dpp            :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rcp_iflag_f32_sdwa           :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_readfirstlane_b32            :ref:`sdst<amdgpu_synid8_sdst32_2>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`
+    v_rndne_f16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_rndne_f16_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rndne_f16_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rndne_f32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_rndne_f32_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rndne_f32_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rndne_f64                    :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_rsq_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_rsq_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rsq_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rsq_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_rsq_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rsq_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rsq_f64                      :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_sin_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_sin_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sin_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sin_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_sin_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sin_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sqrt_f16                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_sqrt_f16_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sqrt_f16_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sqrt_f32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_sqrt_f32_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sqrt_f32_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sqrt_f64                     :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+    v_trunc_f16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_trunc_f16_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_trunc_f16_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_trunc_f32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`src<amdgpu_synid8_src32_0>`
+    v_trunc_f32_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_trunc_f32_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_trunc_f64                    :ref:`vdst<amdgpu_synid8_vdst64_0>`,     :ref:`src<amdgpu_synid8_src64_0>`
+
+VOP2
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST0**      **DST1**      **SRC0**         **SRC1**        **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_add_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_add_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_add_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_add_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_add_u16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_add_u16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_u16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_add_u32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_add_u32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_u32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_addc_u32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`
+    v_addc_u32_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`            :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_addc_u32_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,    :ref:`vcc<amdgpu_synid8_vcc_64>`            :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_and_b32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_and_b32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_and_b32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_ashrrev_i16                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`::ref:`u16<amdgpu_synid8_type_dev>`,    :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_ashrrev_i16_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`u16<amdgpu_synid8_type_dev>`,   :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ashrrev_i16_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`u16<amdgpu_synid8_type_dev>`, :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_ashrrev_i32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_ashrrev_i32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ashrrev_i32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`u32<amdgpu_synid8_type_dev>`, :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cndmask_b32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`
+    v_cndmask_b32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`            :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cndmask_b32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,    :ref:`vcc<amdgpu_synid8_vcc_64>`            :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_ldexp_f16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`i16<amdgpu_synid8_type_dev>`
+    v_ldexp_f16_dpp                :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`i16<amdgpu_synid8_type_dev>`                  :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ldexp_f16_sdwa               :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`i16<amdgpu_synid8_type_dev>`                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_lshlrev_b16                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`::ref:`u16<amdgpu_synid8_type_dev>`,    :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_lshlrev_b16_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`u16<amdgpu_synid8_type_dev>`,   :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_lshlrev_b16_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`u16<amdgpu_synid8_type_dev>`, :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_lshlrev_b32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_lshlrev_b32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_lshlrev_b32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`u32<amdgpu_synid8_type_dev>`, :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_lshrrev_b16                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`::ref:`u16<amdgpu_synid8_type_dev>`,    :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_lshrrev_b16_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`u16<amdgpu_synid8_type_dev>`,   :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_lshrrev_b16_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`u16<amdgpu_synid8_type_dev>`, :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_lshrrev_b32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_lshrrev_b32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_lshrrev_b32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`u32<amdgpu_synid8_type_dev>`, :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mac_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mac_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mac_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mac_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mac_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mac_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_madak_f16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`imm32<amdgpu_synid8_fimm16>`
+    v_madak_f32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`imm32<amdgpu_synid8_fimm32>`
+    v_madmk_f16                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`imm32<amdgpu_synid8_fimm16>`,      :ref:`vsrc2<amdgpu_synid8_vsrc32_0>`
+    v_madmk_f32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`imm32<amdgpu_synid8_fimm32>`,      :ref:`vsrc2<amdgpu_synid8_vsrc32_0>`
+    v_max_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_max_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_max_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_i16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_max_i16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_i16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_i32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_max_i32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_i32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_u16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_max_u16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_u16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_u32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_max_u32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_u32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_min_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_min_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_i16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_min_i16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_i16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_i32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_min_i32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_i32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_u16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_min_u16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_u16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_u32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_min_u32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_u32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mul_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mul_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_hi_i32_i24               :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mul_hi_i32_i24_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_hi_i32_i24_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_hi_u32_u24               :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mul_hi_u32_u24_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_hi_u32_u24_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_i32_i24                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mul_i32_i24_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_i32_i24_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_legacy_f32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mul_legacy_f32_dpp           :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_legacy_f32_sdwa          :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_lo_u16                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mul_lo_u16_dpp               :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_lo_u16_sdwa              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_u32_u24                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_mul_u32_u24_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_u32_u24_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_or_b32                       :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_or_b32_dpp                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_or_b32_sdwa                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_sub_f16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_f16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_sub_f32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_f32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_u16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_sub_u16_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_u16_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_u32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_sub_u32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_u32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subb_u32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`
+    v_subb_u32_dpp                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`            :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subb_u32_sdwa                :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,    :ref:`vcc<amdgpu_synid8_vcc_64>`            :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subbrev_u32                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`
+    v_subbrev_u32_dpp              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`            :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subbrev_u32_sdwa             :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,    :ref:`vcc<amdgpu_synid8_vcc_64>`            :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_f16                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_subrev_f16_dpp               :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_f16_sdwa              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_f32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_subrev_f32_dpp               :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_f32_sdwa              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_u16                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_subrev_u16_dpp               :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_u16_sdwa              :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_u32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_subrev_u32_dpp               :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_u32_sdwa              :ref:`vdst<amdgpu_synid8_vdst32_0>`,     :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_xor_b32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`src0<amdgpu_synid8_src32_0>`,        :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_xor_b32_dpp                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_xor_b32_sdwa                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+
+VOP3
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST0**       **DST1**      **SRC0**         **SRC1**        **SRC2**               **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_add_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_add_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_add_f64                      :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_add_u16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_add_u32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,      :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_addc_u32_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,      :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`ssrc2<amdgpu_synid8_ssrc64_1>`
+    v_alignbit_b32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_alignbyte_b32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_and_b32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_ashrrev_i16_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u16<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`
+    v_ashrrev_i32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`
+    v_ashrrev_i64                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src64_1>`
+    v_bcnt_u32_b32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_bfe_i32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`src2<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`
+    v_bfe_u32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_bfi_b32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_bfm_b32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_bfrev_b32_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`
+    v_ceil_f16_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_ceil_f32_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ceil_f64_e64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_clrexcp_e64
+    v_cmp_class_f16_e64            :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmp_class_f32_e64            :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmp_class_f64_e64            :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmp_eq_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_eq_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_eq_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_eq_i16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_eq_i32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_eq_i64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_eq_u16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_eq_u32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_eq_u64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_f_f16_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_f_f32_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_f_f64_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_f_i16_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_f_i32_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_f_i64_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_f_u16_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_f_u32_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_f_u64_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_ge_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ge_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ge_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ge_i16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_ge_i32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_ge_i64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_ge_u16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_ge_u32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_ge_u64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_gt_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_gt_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_gt_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_gt_i16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_gt_i32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_gt_i64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_gt_u16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_gt_u32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_gt_u64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_le_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_le_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_le_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_le_i16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_le_i32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_le_i64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_le_u16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_le_u32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_le_u64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_lg_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lg_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lg_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lt_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lt_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lt_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lt_i16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_lt_i32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_lt_i64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_lt_u16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_lt_u32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_lt_u64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_ne_i16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_ne_i32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_ne_i64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_ne_u16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_ne_u32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_ne_u64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_neq_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_neq_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_neq_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nge_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nge_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nge_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ngt_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ngt_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ngt_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nle_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nle_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nle_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlg_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlg_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlg_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlt_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlt_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlt_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_o_f16_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_o_f32_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_o_f64_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_t_i16_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_t_i32_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_t_i64_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_t_u16_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_t_u32_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmp_t_u64_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmp_tru_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_tru_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_tru_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_u_f16_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_u_f32_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_u_f64_e64                :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_class_f16_e64           :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmpx_class_f32_e64           :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmpx_class_f64_e64           :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmpx_eq_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_eq_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_eq_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_eq_i16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_eq_i32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_eq_i64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_eq_u16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_eq_u32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_eq_u64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_f_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_f_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_f_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_f_i16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_f_i32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_f_i64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_f_u16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_f_u32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_f_u64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_ge_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ge_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ge_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ge_i16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_ge_i32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_ge_i64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_ge_u16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_ge_u32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_ge_u64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_gt_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_gt_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_gt_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_gt_i16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_gt_i32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_gt_i64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_gt_u16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_gt_u32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_gt_u64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_le_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_le_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_le_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_le_i16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_le_i32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_le_i64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_le_u16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_le_u32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_le_u64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_lg_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lg_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lg_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lt_f16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lt_f32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lt_f64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lt_i16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_lt_i32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_lt_i64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_lt_u16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_lt_u32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_lt_u64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_ne_i16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_ne_i32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_ne_i64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_ne_u16_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_ne_u32_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_ne_u64_e64              :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_neq_f16_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_neq_f32_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_neq_f64_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nge_f16_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nge_f32_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nge_f64_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ngt_f16_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ngt_f32_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ngt_f64_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nle_f16_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nle_f32_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nle_f64_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlg_f16_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlg_f32_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlg_f64_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlt_f16_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlt_f32_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlt_f64_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_o_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_o_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_o_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_t_i16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_t_i32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_t_i64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_t_u16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_t_u32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cmpx_t_u64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`
+    v_cmpx_tru_f16_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_tru_f32_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_tru_f64_e64             :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_u_f16_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_u_f32_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_u_f64_e64               :ref:`sdst<amdgpu_synid8_sdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cndmask_b32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`ssrc2<amdgpu_synid8_ssrc64_1>`
+    v_cos_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_cos_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubeid_f32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubema_f32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubesc_f32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubetc_f32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f16_f32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f16_i16_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_f16_u16_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_f32_f16_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_f64_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_i32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_u32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte0_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte1_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte2_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte3_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f64_f32_e64              :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f64_i32_e64              :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f64_u32_e64              :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_flr_i32_f32_e64          :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_i16_f16_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_i32_f32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_i32_f64_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_off_f32_i4_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_pk_i16_i32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cvt_pk_u16_u32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_cvt_pk_u8_f32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`,   :ref:`src2<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`
+    v_cvt_pkaccum_u8_f32           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`
+    v_cvt_pknorm_i16_f32           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_pknorm_u16_f32           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_pkrtz_f16_f32            :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_rpi_i32_f32_e64          :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_u16_f16_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_u32_f32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_cvt_u32_f64_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_div_fixup_f16                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>`
+    v_div_fixup_f32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fixup_f64                :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fmas_f32                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fmas_f64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_scale_f32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_div_scale_f64                :ref:`vdst<amdgpu_synid8_vdst64_0>`,      :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_1>`,        :ref:`src1<amdgpu_synid8_src64_1>`,       :ref:`src2<amdgpu_synid8_src64_1>`
+    v_exp_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_exp_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_exp_legacy_f32_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ffbh_i32_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`
+    v_ffbh_u32_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`
+    v_ffbl_b32_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`
+    v_floor_f16_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_floor_f32_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_floor_f64_e64                :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fma_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>`
+    v_fma_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fma_f64                      :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fract_f16_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_fract_f32_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fract_f64_e64                :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_frexp_exp_i16_f16_e64        :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_frexp_exp_i32_f32_e64        :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_frexp_exp_i32_f64_e64        :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`
+    v_frexp_mant_f16_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_frexp_mant_f32_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_frexp_mant_f64_e64           :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_mov_f32_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`param<amdgpu_synid8_param>`::ref:`b32<amdgpu_synid8_type_dev>`,   :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p1_f32_e64            :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p1ll_f16              :ref:`vdst<amdgpu_synid8_vdst32_0>`::ref:`f32<amdgpu_synid8_type_dev>`,            :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid8_type_dev>`,  :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`                       :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p1lv_f16              :ref:`vdst<amdgpu_synid8_vdst32_0>`::ref:`f32<amdgpu_synid8_type_dev>`,            :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid8_type_dev>`, :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`,   :ref:`vsrc2<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`::ref:`f16x2<amdgpu_synid8_type_dev>`      :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p2_f16                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid8_type_dev>`, :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`,   :ref:`vsrc2<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid8_type_dev>`        :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_interp_p2_f32_e64            :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`vsrc<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`attr<amdgpu_synid8_attr>`::ref:`b32<amdgpu_synid8_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ldexp_f16_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`i16<amdgpu_synid8_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_ldexp_f32                    :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`i32<amdgpu_synid8_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ldexp_f64                    :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`i32<amdgpu_synid8_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_lerp_u8                      :ref:`vdst<amdgpu_synid8_vdst32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,            :ref:`src0<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`,   :ref:`src2<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_log_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_log_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_log_legacy_f32_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_lshlrev_b16_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u16<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`
+    v_lshlrev_b32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`
+    v_lshlrev_b64                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src64_1>`
+    v_lshrrev_b16_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u16<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`
+    v_lshrrev_b32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`
+    v_lshrrev_b64                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src64_1>`
+    v_mac_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_mac_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_f16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_f32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_i16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`               :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_i32_i24                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`::ref:`i32<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_i64_i32                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,      :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src64_1>`::ref:`i64<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_legacy_f32               :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_u16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`               :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_u32_u24                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_u64_u32                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,      :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src64_1>`::ref:`u64<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_max3_f32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max3_i32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_max3_u32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_max_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_max_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max_f64                      :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max_i16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_max_i32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_max_u16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_max_u32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mbcnt_hi_u32_b32             :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mbcnt_lo_u32_b32             :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_med3_f32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_med3_i32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_med3_u32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_min3_f32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min3_i32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_min3_u32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_min_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_min_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min_f64                      :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min_i16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_min_i32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_min_u16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_min_u32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mov_b32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`
+    v_mov_fed_b32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`
+    v_movreld_b32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`
+    v_movrels_b32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`vsrc<amdgpu_synid8_vsrc32_0>`
+    v_movrelsd_b32_e64             :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`vsrc<amdgpu_synid8_vsrc32_0>`
+    v_mqsad_pk_u16_u8              :ref:`vdst<amdgpu_synid8_vdst64_0>`::ref:`b64<amdgpu_synid8_type_dev>`,            :ref:`src0<amdgpu_synid8_src64_1>`::ref:`b64<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`,   :ref:`src2<amdgpu_synid8_src64_1>`::ref:`b64<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mqsad_u32_u8                 :ref:`vdst<amdgpu_synid8_vdst128_0>`::ref:`b128<amdgpu_synid8_type_dev>`,           :ref:`src0<amdgpu_synid8_src64_1>`::ref:`b64<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`,   :ref:`vsrc2<amdgpu_synid8_vsrc128_0>`::ref:`b128<amdgpu_synid8_type_dev>`         :ref:`clamp<amdgpu_synid_clamp>`
+    v_msad_u8                      :ref:`vdst<amdgpu_synid8_vdst32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,            :ref:`src0<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`,   :ref:`src2<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mul_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_mul_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_f64                      :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_hi_i32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mul_hi_i32_i24_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mul_hi_u32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mul_hi_u32_u24_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mul_i32_i24_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mul_legacy_f32_e64           :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_lo_u16_e64               :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mul_lo_u32                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_mul_u32_u24_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_nop_e64
+    v_not_b32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`
+    v_or_b32_e64                   :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_perm_b32                     :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`
+    v_qsad_pk_u16_u8               :ref:`vdst<amdgpu_synid8_vdst64_0>`::ref:`b64<amdgpu_synid8_type_dev>`,            :ref:`src0<amdgpu_synid8_src64_1>`::ref:`b64<amdgpu_synid8_type_dev>`,    :ref:`src1<amdgpu_synid8_src32_1>`::ref:`b32<amdgpu_synid8_type_dev>`,   :ref:`src2<amdgpu_synid8_src64_1>`::ref:`b64<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_rcp_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_rcp_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_f64_e64                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_iflag_f32_e64            :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_readlane_b32                 :ref:`sdst<amdgpu_synid8_sdst32_2>`,                :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`,       :ref:`ssrc1<amdgpu_synid8_ssrc32_3>`
+    v_rndne_f16_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_rndne_f32_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rndne_f64_e64                :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_rsq_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_f64_e64                  :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sad_hi_u8                    :ref:`vdst<amdgpu_synid8_vdst32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,            :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u8x4<amdgpu_synid8_type_dev>`,   :ref:`src1<amdgpu_synid8_src32_1>`::ref:`u8x4<amdgpu_synid8_type_dev>`,  :ref:`src2<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_sad_u16                      :ref:`vdst<amdgpu_synid8_vdst32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,            :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u16x2<amdgpu_synid8_type_dev>`,  :ref:`src1<amdgpu_synid8_src32_1>`::ref:`u16x2<amdgpu_synid8_type_dev>`, :ref:`src2<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_sad_u32                      :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`src2<amdgpu_synid8_src32_1>`               :ref:`clamp<amdgpu_synid_clamp>`
+    v_sad_u8                       :ref:`vdst<amdgpu_synid8_vdst32_0>`::ref:`u32<amdgpu_synid8_type_dev>`,            :ref:`src0<amdgpu_synid8_src32_1>`::ref:`u8x4<amdgpu_synid8_type_dev>`,   :ref:`src1<amdgpu_synid8_src32_1>`::ref:`u8x4<amdgpu_synid8_type_dev>`,  :ref:`src2<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_sin_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_sin_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sqrt_f16_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_sqrt_f32_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sqrt_f64_e64                 :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sub_f16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_sub_f32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sub_u16_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_sub_u32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,      :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_subb_u32_e64                 :ref:`vdst<amdgpu_synid8_vdst32_0>`,      :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`ssrc2<amdgpu_synid8_ssrc64_1>`
+    v_subbrev_u32_e64              :ref:`vdst<amdgpu_synid8_vdst32_0>`,      :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`,       :ref:`ssrc2<amdgpu_synid8_ssrc64_1>`
+    v_subrev_f16_e64               :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_subrev_f32_e64               :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_subrev_u16_e64               :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_subrev_u32_e64               :ref:`vdst<amdgpu_synid8_vdst32_0>`,      :ref:`sdst<amdgpu_synid8_sdst64_0>`,     :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+    v_trig_preop_f64               :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src0<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid8_src32_1>`::ref:`u32<amdgpu_synid8_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_trunc_f16_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_trunc_f32_e64                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src<amdgpu_synid8_src32_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_trunc_f64_e64                :ref:`vdst<amdgpu_synid8_vdst64_0>`,                :ref:`src<amdgpu_synid8_src64_1>`::ref:`m<amdgpu_synid8_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_writelane_b32                :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`ssrc0<amdgpu_synid8_ssrc32_4>`,       :ref:`ssrc1<amdgpu_synid8_ssrc32_3>`
+    v_xor_b32_e64                  :ref:`vdst<amdgpu_synid8_vdst32_0>`,                :ref:`src0<amdgpu_synid8_src32_1>`,        :ref:`src1<amdgpu_synid8_src32_1>`
+
+VOPC
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**             **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_cmp_class_f16                :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmp_class_f16_sdwa           :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`b32<amdgpu_synid8_type_dev>`      :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_class_f32                :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmp_class_f32_sdwa           :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`b32<amdgpu_synid8_type_dev>`      :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_class_f64                :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmp_eq_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_eq_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_eq_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_eq_i16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_eq_i16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_i32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_eq_i32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_i64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_eq_u16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_eq_u16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_u32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_eq_u32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_u64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_f_f16                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_f_f16_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_f32                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_f_f32_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_f64                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_f_i16                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_f_i16_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_i32                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_f_i32_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_i64                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_f_u16                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_f_u16_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_u32                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_f_u32_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_u64                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_ge_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ge_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ge_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_ge_i16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ge_i16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_i32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ge_i32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_i64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_ge_u16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ge_u16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_u32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ge_u32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_u64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_gt_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_gt_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_gt_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_gt_i16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_gt_i16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_i32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_gt_i32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_i64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_gt_u16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_gt_u16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_u32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_gt_u32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_u64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_le_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_le_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_le_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_le_i16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_le_i16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_i32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_le_i32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_i64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_le_u16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_le_u16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_u32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_le_u32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_u64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_lg_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_lg_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lg_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_lg_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lg_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_lt_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_lt_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_lt_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_lt_i16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_lt_i16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_i32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_lt_i32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_i64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_lt_u16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_lt_u16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_u32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_lt_u32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_u64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_ne_i16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ne_i16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ne_i32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ne_i32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ne_i64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_ne_u16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ne_u16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ne_u32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ne_u32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ne_u64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_neq_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_neq_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_neq_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_neq_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_neq_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_nge_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_nge_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nge_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_nge_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nge_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_ngt_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ngt_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ngt_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_ngt_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ngt_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_nle_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_nle_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nle_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_nle_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nle_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_nlg_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_nlg_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nlg_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_nlg_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nlg_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_nlt_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_nlt_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nlt_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_nlt_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nlt_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_o_f16                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_o_f16_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_o_f32                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_o_f32_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_o_f64                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_t_i16                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_t_i16_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_t_i32                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_t_i32_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_t_i64                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_t_u16                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_t_u16_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_t_u32                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_t_u32_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_t_u64                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_tru_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_tru_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_tru_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_tru_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_tru_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmp_u_f16                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_u_f16_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_u_f32                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmp_u_f32_sdwa               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_u_f64                    :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_class_f16               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmpx_class_f16_sdwa          :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`b32<amdgpu_synid8_type_dev>`      :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_class_f32               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmpx_class_f32_sdwa          :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`::ref:`b32<amdgpu_synid8_type_dev>`      :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_class_f64               :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`b32<amdgpu_synid8_type_dev>`
+    v_cmpx_eq_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_eq_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_eq_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_eq_i16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_eq_i16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_i32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_eq_i32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_i64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_eq_u16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_eq_u16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_u32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_eq_u32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_u64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_f_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_f_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_f_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_f_i16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_f_i16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_i32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_f_i32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_i64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_f_u16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_f_u16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_u32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_f_u32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_u64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_ge_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ge_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ge_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_ge_i16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ge_i16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_i32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ge_i32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_i64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_ge_u16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ge_u16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_u32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ge_u32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_u64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_gt_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_gt_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_gt_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_gt_i16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_gt_i16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_i32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_gt_i32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_i64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_gt_u16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_gt_u16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_u32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_gt_u32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_u64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_le_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_le_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_le_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_le_i16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_le_i16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_i32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_le_i32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_i64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_le_u16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_le_u16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_u32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_le_u32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_u64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_lg_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_lg_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lg_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_lg_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lg_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_lt_f16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_lt_f16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_f32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_lt_f32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_f64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_lt_i16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_lt_i16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_i32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_lt_i32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_i64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_lt_u16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_lt_u16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_u32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_lt_u32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_u64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_ne_i16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ne_i16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ne_i32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ne_i32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ne_i64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_ne_u16                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ne_u16_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ne_u32                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ne_u32_sdwa             :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ne_u64                  :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_neq_f16                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_neq_f16_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_neq_f32                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_neq_f32_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_neq_f64                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_nge_f16                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_nge_f16_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nge_f32                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_nge_f32_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nge_f64                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_ngt_f16                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ngt_f16_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ngt_f32                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_ngt_f32_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ngt_f64                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_nle_f16                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_nle_f16_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nle_f32                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_nle_f32_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nle_f64                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_nlg_f16                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_nlg_f16_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nlg_f32                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_nlg_f32_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nlg_f64                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_nlt_f16                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_nlt_f16_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nlt_f32                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_nlt_f32_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nlt_f64                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_o_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_o_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_o_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_o_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_o_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_t_i16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_t_i16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_t_i32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_t_i32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_t_i64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_t_u16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_t_u16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_t_u32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_t_u32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_t_u64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_tru_f16                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_tru_f16_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_tru_f32                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_tru_f32_sdwa            :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_tru_f64                 :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+    v_cmpx_u_f16                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_u_f16_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_u_f32                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src32_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`
+    v_cmpx_u_f32_sdwa              :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`vsrc0<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`,  :ref:`vsrc1<amdgpu_synid8_vsrc32_0>`::ref:`m<amdgpu_synid8_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_u_f64                   :ref:`vcc<amdgpu_synid8_vcc_64>`,      :ref:`src0<amdgpu_synid8_src64_0>`,     :ref:`vsrc1<amdgpu_synid8_vsrc64_0>`
+
+.. |---| unicode:: U+02014 .. em dash
+
+
+.. toctree::
+    :hidden:
+
+    gfx8_attr
+    gfx8_bimm16
+    gfx8_bimm32
+    gfx8_fimm16
+    gfx8_fimm32
+    gfx8_hwreg
+    gfx8_imm4
+    gfx8_label
+    gfx8_msg
+    gfx8_param
+    gfx8_perm_smem
+    gfx8_simm16
+    gfx8_tgt
+    gfx8_uimm16
+    gfx8_waitcnt
+    gfx8_addr_buf
+    gfx8_addr_ds
+    gfx8_addr_flat
+    gfx8_addr_mimg
+    gfx8_base_smem_addr
+    gfx8_base_smem_buf
+    gfx8_data_buf_atomic128
+    gfx8_data_buf_atomic32
+    gfx8_data_buf_atomic64
+    gfx8_data_buf_d16_128
+    gfx8_data_buf_d16_32
+    gfx8_data_buf_d16_64
+    gfx8_data_buf_d16_96
+    gfx8_data_mimg_atomic_cmp
+    gfx8_data_mimg_atomic_reg
+    gfx8_data_mimg_store
+    gfx8_data_mimg_store_d16
+    gfx8_dst_buf_128
+    gfx8_dst_buf_64
+    gfx8_dst_buf_96
+    gfx8_dst_buf_d16_128
+    gfx8_dst_buf_d16_32
+    gfx8_dst_buf_d16_64
+    gfx8_dst_buf_d16_96
+    gfx8_dst_buf_lds
+    gfx8_dst_flat_atomic32
+    gfx8_dst_flat_atomic64
+    gfx8_dst_mimg_gather4
+    gfx8_dst_mimg_regular
+    gfx8_dst_mimg_regular_d16
+    gfx8_offset_buf
+    gfx8_offset_smem_load
+    gfx8_offset_smem_store
+    gfx8_rsrc_buf
+    gfx8_rsrc_mimg
+    gfx8_samp_mimg
+    gfx8_sdata128_0
+    gfx8_sdata32_0
+    gfx8_sdata64_0
+    gfx8_sdst128_0
+    gfx8_sdst256_0
+    gfx8_sdst32_0
+    gfx8_sdst32_1
+    gfx8_sdst32_2
+    gfx8_sdst512_0
+    gfx8_sdst64_0
+    gfx8_sdst64_1
+    gfx8_src32_0
+    gfx8_src32_1
+    gfx8_src64_0
+    gfx8_src64_1
+    gfx8_src_exp
+    gfx8_ssrc32_0
+    gfx8_ssrc32_1
+    gfx8_ssrc32_2
+    gfx8_ssrc32_3
+    gfx8_ssrc32_4
+    gfx8_ssrc64_0
+    gfx8_ssrc64_1
+    gfx8_ssrc64_2
+    gfx8_ssrc64_3
+    gfx8_vcc_64
+    gfx8_vdata128_0
+    gfx8_vdata32_0
+    gfx8_vdata64_0
+    gfx8_vdata96_0
+    gfx8_vdst128_0
+    gfx8_vdst32_0
+    gfx8_vdst64_0
+    gfx8_vdst96_0
+    gfx8_vsrc128_0
+    gfx8_vsrc32_0
+    gfx8_vsrc64_0
+    gfx8_mod_dpp_sdwa_abs_neg
+    gfx8_mod_sdwa_sext
+    gfx8_mod_vop3_abs_neg
+    gfx8_opt
+    gfx8_ret
+    gfx8_type_dev
diff --git a/docs/AMDGPU/AMDGPUAsmGFX9.rst b/docs/AMDGPU/AMDGPUAsmGFX9.rst
new file mode 100644
index 0000000000000000000000000000000000000000..71052d0cf6e4ca1d96874f84ecc652165945e743
--- /dev/null
+++ b/docs/AMDGPU/AMDGPUAsmGFX9.rst
@@ -0,0 +1,2102 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+============================
+Syntax of GFX9 Instructions
+============================
+
+.. contents::
+  :local:
+
+Notation
+========
+
+Notation used in this document is explained :ref:`here<amdgpu_syn_instruction_notation>`.
+
+Introduction
+============
+
+An overview of generic syntax and other features of AMDGPU instructions may be found :ref:`in this document<amdgpu_syn_instructions>`.
+
+Instructions
+============
+
+
+DS
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**         **SRC0**      **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    ds_add_f32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_rtn_f32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_rtn_u32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_rtn_u64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_src2_f32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_src2_u32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_src2_u64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_u32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_add_u64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_b32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_b64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_rtn_b32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_rtn_b64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_src2_b32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_and_src2_b64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_append                      :ref:`vdst<amdgpu_synid9_vdst32_0>`                                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_bpermute_b32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>`
+    ds_cmpst_b32                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_b64                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_f32                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_f64                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_b32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_b64               :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_f32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_cmpst_rtn_f64               :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_condxchg32_rtn_b64          :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_consume                     :ref:`vdst<amdgpu_synid9_vdst32_0>`                                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_rtn_u32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_rtn_u64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_src2_u32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_src2_u64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_u32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_dec_u64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_barrier                             :ref:`vdata<amdgpu_synid9_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_init                                :ref:`vdata<amdgpu_synid9_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_br                             :ref:`vdata<amdgpu_synid9_vdata32_0>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_p                                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_release_all                                                       :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_gws_sema_v                                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_rtn_u32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_rtn_u64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_src2_u32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_src2_u64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_u32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_inc_u64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_f32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_f64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_i32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_i64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_f32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_f64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_i32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_i64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_u32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_rtn_u64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_f32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_f64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_i32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_i64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_u32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_src2_u64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_u32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_max_u64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_f32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_f64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_i32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_i64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_f32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_f64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_i32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_i64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_u32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_rtn_u64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_f32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_f64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_i32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_i64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_u32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_src2_u64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_u32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_min_u64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_b32                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_b64                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_rtn_b32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_mskor_rtn_b64               :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_nop
+    ds_or_b32                                  :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_b64                                  :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_rtn_b32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_rtn_b64                  :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_src2_b32                             :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_or_src2_b64                             :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_ordered_count               :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_permute_b32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>`
+    ds_read2_b32                   :ref:`vdst<amdgpu_synid9_vdst64_0>`::ref:`b32x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2_b64                   :ref:`vdst<amdgpu_synid9_vdst128_0>`::ref:`b64x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2st64_b32               :ref:`vdst<amdgpu_synid9_vdst64_0>`::ref:`b32x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read2st64_b64               :ref:`vdst<amdgpu_synid9_vdst128_0>`::ref:`b64x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b128                   :ref:`vdst<amdgpu_synid9_vdst128_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b32                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b64                    :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_b96                    :ref:`vdst<amdgpu_synid9_vdst96_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_i16                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_i8                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_i8_d16                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_i8_d16_hi              :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u16                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u16_d16                :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u16_d16_hi             :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u8                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u8_d16                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_read_u8_d16_hi              :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_rtn_u32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_rtn_u64                :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_src2_u32                           :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_src2_u64                           :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_u32                                :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_rsub_u64                                :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_rtn_u32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_rtn_u64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_src2_u32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_src2_u64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_u32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_sub_u64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_swizzle_b32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`sw_offset16<amdgpu_synid_sw_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrap_rtn_b32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2_b32                              :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2_b64                              :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2st64_b32                          :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write2st64_b64                          :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b128                              :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata128_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b16                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b16_d16_hi                        :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b32                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b64                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b8                                :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b8_d16_hi                         :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_b96                               :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata96_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_src2_b32                          :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_write_src2_b64                          :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2_rtn_b32             :ref:`vdst<amdgpu_synid9_vdst64_0>`::ref:`b32x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2_rtn_b64             :ref:`vdst<amdgpu_synid9_vdst128_0>`::ref:`b64x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2st64_rtn_b32         :ref:`vdst<amdgpu_synid9_vdst64_0>`::ref:`b32x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata32_0>`,   :ref:`vdata1<amdgpu_synid9_vdata32_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg2st64_rtn_b64         :ref:`vdst<amdgpu_synid9_vdst128_0>`::ref:`b64x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata0<amdgpu_synid9_vdata64_0>`,   :ref:`vdata1<amdgpu_synid9_vdata64_0>`         :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg_rtn_b32              :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_wrxchg_rtn_b64              :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_b32                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_b64                                 :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_rtn_b32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_rtn_b64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,       :ref:`vaddr<amdgpu_synid9_addr_ds>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                    :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_src2_b32                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+    ds_xor_src2_b64                            :ref:`vaddr<amdgpu_synid9_addr_ds>`                              :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
+
+EXP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**      **SRC2**      **SRC3**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    exp                            :ref:`tgt<amdgpu_synid9_tgt>`,      :ref:`vsrc0<amdgpu_synid9_src_exp>`,    :ref:`vsrc1<amdgpu_synid9_src_exp>`,    :ref:`vsrc2<amdgpu_synid9_src_exp>`,    :ref:`vsrc3<amdgpu_synid9_src_exp>`          :ref:`done<amdgpu_synid_done>` :ref:`compr<amdgpu_synid_compr>` :ref:`vm<amdgpu_synid_vm>`
+
+FLAT
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**           **SRC0**      **SRC1**         **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    flat_atomic_add                :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_add_x2             :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_and                :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_and_x2             :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_cmpswap            :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`b32x2<amdgpu_synid9_type_dev>`                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_cmpswap_x2         :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata128_0>`::ref:`b64x2<amdgpu_synid9_type_dev>`                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_dec                :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`u32<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_dec_x2             :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`u64<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_inc                :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`u32<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_inc_x2             :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`u64<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_or                 :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_or_x2              :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smax               :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`s32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`s32<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smax_x2            :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`s64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`s64<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smin               :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`s32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`s32<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_smin_x2            :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`s64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`s64<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_sub                :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_sub_x2             :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_swap               :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_swap_x2            :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umax               :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`u32<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umax_x2            :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`u64<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umin               :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`u32<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_umin_x2            :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`u64<amdgpu_synid9_type_dev>`                   :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_xor                :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_atomic_xor_x2             :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dword                :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx2              :ref:`vdst<amdgpu_synid9_vdst64_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx3              :ref:`vdst<amdgpu_synid9_vdst96_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_dwordx4              :ref:`vdst<amdgpu_synid9_vdst128_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_sbyte                :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_sbyte_d16            :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_sbyte_d16_hi         :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_short_d16            :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_short_d16_hi         :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_sshort               :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_ubyte                :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_ubyte_d16            :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_ubyte_d16_hi         :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_load_ushort               :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_addr_flat>`                                 :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_byte                              :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_byte_d16_hi                       :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dword                             :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx2                           :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx3                           :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata96_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_dwordx4                           :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata128_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_short                             :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    flat_store_short_d16_hi                      :ref:`vaddr<amdgpu_synid9_addr_flat>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`                       :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_add              :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_add_x2           :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_and              :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_and_x2           :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_cmpswap          :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`b32x2<amdgpu_synid9_type_dev>`, :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_cmpswap_x2       :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata128_0>`::ref:`b64x2<amdgpu_synid9_type_dev>`, :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_dec              :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_dec_x2           :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_inc              :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_inc_x2           :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_or               :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_or_x2            :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_smax             :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`s32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`s32<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_smax_x2          :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`s64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`s64<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_smin             :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`s32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`s32<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_smin_x2          :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`s64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`s64<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_sub              :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_sub_x2           :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_swap             :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_swap_x2          :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_umax             :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_umax_x2          :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_umin             :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_umin_x2          :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`::ref:`u64<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_xor              :ref:`vdst<amdgpu_synid9_dst_flat_atomic32>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_atomic_xor_x2           :ref:`vdst<amdgpu_synid9_dst_flat_atomic64>`::ref:`opt<amdgpu_synid9_opt>`,     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_dword              :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_dwordx2            :ref:`vdst<amdgpu_synid9_vdst64_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_dwordx3            :ref:`vdst<amdgpu_synid9_vdst96_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_dwordx4            :ref:`vdst<amdgpu_synid9_vdst128_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_sbyte              :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_sbyte_d16          :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_sbyte_d16_hi       :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_short_d16          :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_short_d16_hi       :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_sshort             :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_ubyte              :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_ubyte_d16          :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_ubyte_d16_hi       :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_load_ushort             :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_global>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_store_byte                            :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_store_byte_d16_hi                     :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_store_dword                           :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_store_dwordx2                         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_store_dwordx3                         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata96_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_store_dwordx4                         :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata128_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_store_short                           :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    global_store_short_d16_hi                    :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_global>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_dword             :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_dwordx2           :ref:`vdst<amdgpu_synid9_vdst64_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_dwordx3           :ref:`vdst<amdgpu_synid9_vdst96_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_dwordx4           :ref:`vdst<amdgpu_synid9_vdst128_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_sbyte             :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_sbyte_d16         :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_sbyte_d16_hi      :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_short_d16         :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_short_d16_hi      :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_sshort            :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_ubyte             :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_ubyte_d16         :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_ubyte_d16_hi      :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_load_ushort            :ref:`vdst<amdgpu_synid9_vdst32_0>`,         :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`                       :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_store_byte                           :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_store_byte_d16_hi                    :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_store_dword                          :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_store_dwordx2                        :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`vdata<amdgpu_synid9_vdata64_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_store_dwordx3                        :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`vdata<amdgpu_synid9_vdata96_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_store_dwordx4                        :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`vdata<amdgpu_synid9_vdata128_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_store_short                          :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    scratch_store_short_d16_hi                   :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>`,    :ref:`vdata<amdgpu_synid9_vdata32_0>`,       :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>`          :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+
+MIMG
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**       **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    image_atomic_add                         :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_and                         :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_cmpswap                     :ref:`vdata<amdgpu_synid9_data_mimg_atomic_cmp>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_dec                         :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_inc                         :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_or                          :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_smax                        :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_smin                        :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_sub                         :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_swap                        :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_umax                        :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_umin                        :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_atomic_xor                         :ref:`vdata<amdgpu_synid9_data_mimg_atomic_reg>`::ref:`dst<amdgpu_synid9_ret>`, :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_gather4                  :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_b                :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_b_cl             :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_b_cl_o           :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_b_o              :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c                :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_b              :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_b_cl           :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_b_cl_o         :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_b_o            :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_cl             :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_cl_o           :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_l              :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_l_o            :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_lz             :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_lz_o           :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_c_o              :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_cl               :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_cl_o             :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_l                :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_l_o              :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_lz               :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_lz_o             :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_gather4_o                :ref:`vdst<amdgpu_synid9_dst_mimg_gather4>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_get_lod                  :ref:`vdst<amdgpu_synid9_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_get_resinfo              :ref:`vdst<amdgpu_synid9_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load                     :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_load_mip                 :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_load_mip_pck             :ref:`vdst<amdgpu_synid9_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_mip_pck_sgn         :ref:`vdst<amdgpu_synid9_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_pck                 :ref:`vdst<amdgpu_synid9_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_load_pck_sgn             :ref:`vdst<amdgpu_synid9_dst_mimg_regular>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`                    :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_sample                   :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_b                 :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_b_cl              :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_b_cl_o            :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_b_o               :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c                 :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_b               :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_b_cl            :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_b_cl_o          :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_b_o             :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cd              :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cd_cl           :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cd_cl_o         :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cd_o            :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cl              :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_cl_o            :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_d               :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_d_cl            :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_d_cl_o          :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_d_o             :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_l               :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_l_o             :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_lz              :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_lz_o            :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_c_o               :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cd                :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cd_cl             :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cd_cl_o           :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cd_o              :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cl                :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_cl_o              :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_d                 :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_d_cl              :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_d_cl_o            :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_d_o               :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_l                 :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_l_o               :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_lz                :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_lz_o              :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_sample_o                 :ref:`vdst<amdgpu_synid9_dst_mimg_regular_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,     :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`,    :ref:`ssamp<amdgpu_synid9_samp_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_store                              :ref:`vdata<amdgpu_synid9_data_mimg_store_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_store_mip                          :ref:`vdata<amdgpu_synid9_data_mimg_store_d16>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
+    image_store_mip_pck                      :ref:`vdata<amdgpu_synid9_data_mimg_store>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+    image_store_pck                          :ref:`vdata<amdgpu_synid9_data_mimg_store>`,     :ref:`vaddr<amdgpu_synid9_addr_mimg>`,    :ref:`srsrc<amdgpu_synid9_rsrc_mimg>`          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`a16<amdgpu_synid_a16>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
+
+MUBUF
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**             **SRC1**      **SRC2**      **SRC3**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    buffer_atomic_add                        :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_add_x2                     :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_and                        :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_and_x2                     :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_cmpswap                    :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`b32x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_cmpswap_x2                 :ref:`vdata<amdgpu_synid9_data_buf_atomic128>`::ref:`dst<amdgpu_synid9_ret>`::ref:`b64x2<amdgpu_synid9_type_dev>`, :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_dec                        :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_dec_x2                     :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_inc                        :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_inc_x2                     :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_or                         :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_or_x2                      :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smax                       :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s32<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smax_x2                    :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s64<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smin                       :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s32<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_smin_x2                    :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s64<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_sub                        :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_sub_x2                     :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_swap                       :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_swap_x2                    :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umax                       :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umax_x2                    :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umin                       :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_umin_x2                    :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_xor                        :ref:`vdata<amdgpu_synid9_data_buf_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_atomic_xor_x2                     :ref:`vdata<amdgpu_synid9_data_buf_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dword              :ref:`vdst<amdgpu_synid9_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_dwordx2            :ref:`vdst<amdgpu_synid9_dst_buf_64>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dwordx3            :ref:`vdst<amdgpu_synid9_dst_buf_96>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_dwordx4            :ref:`vdst<amdgpu_synid9_dst_buf_128>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_hi_x    :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_x       :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_xy      :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_xyz     :ref:`vdst<amdgpu_synid9_dst_buf_64>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_d16_xyzw    :ref:`vdst<amdgpu_synid9_dst_buf_64>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_x           :ref:`vdst<amdgpu_synid9_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_format_xy          :ref:`vdst<amdgpu_synid9_dst_buf_64>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_xyz         :ref:`vdst<amdgpu_synid9_dst_buf_96>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_format_xyzw        :ref:`vdst<amdgpu_synid9_dst_buf_128>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_sbyte              :ref:`vdst<amdgpu_synid9_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_sbyte_d16          :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_sbyte_d16_hi       :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_short_d16          :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_short_d16_hi       :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_sshort             :ref:`vdst<amdgpu_synid9_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_ubyte              :ref:`vdst<amdgpu_synid9_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_load_ubyte_d16          :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_ubyte_d16_hi       :ref:`vdst<amdgpu_synid9_dst_buf_32>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_load_ushort             :ref:`vdst<amdgpu_synid9_dst_buf_lds>`,     :ref:`vaddr<amdgpu_synid9_addr_buf>`,           :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`                  :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_store_byte                        :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_byte_d16_hi                 :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dword                       :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx2                     :ref:`vdata<amdgpu_synid9_vdata64_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx3                     :ref:`vdata<amdgpu_synid9_vdata96_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_dwordx4                     :ref:`vdata<amdgpu_synid9_vdata128_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_hi_x             :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_x                :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_xy               :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_xyz              :ref:`vdata<amdgpu_synid9_vdata64_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_d16_xyzw             :ref:`vdata<amdgpu_synid9_vdata64_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_x                    :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xy                   :ref:`vdata<amdgpu_synid9_vdata64_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xyz                  :ref:`vdata<amdgpu_synid9_vdata96_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_format_xyzw                 :ref:`vdata<amdgpu_synid9_vdata128_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_lds_dword                   :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,           :ref:`soffset<amdgpu_synid9_offset_buf>`                            :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`lds<amdgpu_synid_lds>`
+    buffer_store_short                       :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_store_short_d16_hi                :ref:`vdata<amdgpu_synid9_vdata32_0>`,           :ref:`vaddr<amdgpu_synid9_addr_buf>`,    :ref:`srsrc<amdgpu_synid9_rsrc_buf>`,    :ref:`soffset<amdgpu_synid9_offset_buf>`        :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
+    buffer_wbinvl1
+    buffer_wbinvl1_vol
+
+SMEM
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**             **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_atc_probe                              :ref:`imm3<amdgpu_synid9_perm_smem>`,            :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`
+    s_atc_probe_buffer                       :ref:`imm3<amdgpu_synid9_perm_smem>`,            :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`
+    s_atomic_add                             :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_add_x2                          :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_and                             :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_and_x2                          :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_cmpswap                         :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`b32x2<amdgpu_synid9_type_dev>`, :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_cmpswap_x2                      :ref:`sdata<amdgpu_synid9_data_smem_atomic128>`::ref:`dst<amdgpu_synid9_ret>`::ref:`b64x2<amdgpu_synid9_type_dev>`, :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_dec                             :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_dec_x2                          :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_inc                             :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_inc_x2                          :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_or                              :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_or_x2                           :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_smax                            :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_smax_x2                         :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_smin                            :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_smin_x2                         :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_sub                             :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_sub_x2                          :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_swap                            :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_swap_x2                         :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_umax                            :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_umax_x2                         :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_umin                            :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_umin_x2                         :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_xor                             :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_atomic_xor_x2                          :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_add                      :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_add_x2                   :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_and                      :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_and_x2                   :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_cmpswap                  :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`b32x2<amdgpu_synid9_type_dev>`, :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_cmpswap_x2               :ref:`sdata<amdgpu_synid9_data_smem_atomic128>`::ref:`dst<amdgpu_synid9_ret>`::ref:`b64x2<amdgpu_synid9_type_dev>`, :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_dec                      :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_dec_x2                   :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_inc                      :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_inc_x2                   :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_or                       :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_or_x2                    :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_smax                     :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_smax_x2                  :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_smin                     :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_smin_x2                  :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`s64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_sub                      :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_sub_x2                   :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_swap                     :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_swap_x2                  :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_umax                     :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_umax_x2                  :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_umin                     :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_umin_x2                  :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`::ref:`u64<amdgpu_synid9_type_dev>`,   :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_xor                      :ref:`sdata<amdgpu_synid9_data_smem_atomic32>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_atomic_xor_x2                   :ref:`sdata<amdgpu_synid9_data_smem_atomic64>`::ref:`dst<amdgpu_synid9_ret>`,       :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dword            :ref:`sdst<amdgpu_synid9_sdst32_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_buf>`,           :ref:`soffset<amdgpu_synid9_offset_smem_buf>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dwordx16         :ref:`sdst<amdgpu_synid9_sdst512_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_buf>`,           :ref:`soffset<amdgpu_synid9_offset_smem_buf>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dwordx2          :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_buf>`,           :ref:`soffset<amdgpu_synid9_offset_smem_buf>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dwordx4          :ref:`sdst<amdgpu_synid9_sdst128_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_buf>`,           :ref:`soffset<amdgpu_synid9_offset_smem_buf>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_load_dwordx8          :ref:`sdst<amdgpu_synid9_sdst256_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_buf>`,           :ref:`soffset<amdgpu_synid9_offset_smem_buf>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_store_dword                     :ref:`sdata<amdgpu_synid9_sdata32_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_store_dwordx2                   :ref:`sdata<amdgpu_synid9_sdata64_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_buffer_store_dwordx4                   :ref:`sdata<amdgpu_synid9_sdata128_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_buf>`,    :ref:`soffset<amdgpu_synid9_offset_smem_buf>`        :ref:`glc<amdgpu_synid_glc>`
+    s_dcache_discard                         :ref:`sbase<amdgpu_synid9_base_smem_addr>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`
+    s_dcache_discard_x2                      :ref:`sbase<amdgpu_synid9_base_smem_addr>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`
+    s_dcache_inv
+    s_dcache_inv_vol
+    s_dcache_wb
+    s_dcache_wb_vol
+    s_load_dword                   :ref:`sdst<amdgpu_synid9_sdst32_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_addr>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_load_dwordx16                :ref:`sdst<amdgpu_synid9_sdst512_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_addr>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_load_dwordx2                 :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_addr>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_load_dwordx4                 :ref:`sdst<amdgpu_synid9_sdst128_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_addr>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_load_dwordx8                 :ref:`sdst<amdgpu_synid9_sdst256_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_addr>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_memrealtime                  :ref:`sdst<amdgpu_synid9_sdst64_0>`
+    s_memtime                      :ref:`sdst<amdgpu_synid9_sdst64_0>`
+    s_scratch_load_dword           :ref:`sdst<amdgpu_synid9_sdst32_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_scratch>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_scratch_load_dwordx2         :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_scratch>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_scratch_load_dwordx4         :ref:`sdst<amdgpu_synid9_sdst128_0>`,     :ref:`sbase<amdgpu_synid9_base_smem_scratch>`,           :ref:`soffset<amdgpu_synid9_offset_smem_plain>`                  :ref:`glc<amdgpu_synid_glc>`
+    s_scratch_store_dword                    :ref:`sdata<amdgpu_synid9_sdata32_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_scratch>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_scratch_store_dwordx2                  :ref:`sdata<amdgpu_synid9_sdata64_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_scratch>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_scratch_store_dwordx4                  :ref:`sdata<amdgpu_synid9_sdata128_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_scratch>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_store_dword                            :ref:`sdata<amdgpu_synid9_sdata32_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_store_dwordx2                          :ref:`sdata<amdgpu_synid9_sdata64_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+    s_store_dwordx4                          :ref:`sdata<amdgpu_synid9_sdata128_0>`,           :ref:`sbase<amdgpu_synid9_base_smem_addr>`,    :ref:`soffset<amdgpu_synid9_offset_smem_plain>`        :ref:`glc<amdgpu_synid_glc>`
+
+SOP1
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_abs_i32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_and_saveexec_b64             :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_andn1_saveexec_b64           :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_andn1_wrexec_b64             :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_andn2_saveexec_b64           :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_andn2_wrexec_b64             :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_bcnt0_i32_b32                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_bcnt0_i32_b64                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_bcnt1_i32_b32                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_bcnt1_i32_b64                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_bitreplicate_b64_b32         :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_bitset0_b32                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_bitset0_b64                  :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    s_bitset1_b32                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_bitset1_b64                  :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    s_brev_b32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_brev_b64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_cbranch_join                           :ref:`ssrc<amdgpu_synid9_ssrc32_1>`
+    s_cmov_b32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_cmov_b64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_ff0_i32_b32                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_ff0_i32_b64                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_ff1_i32_b32                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_ff1_i32_b64                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_flbit_i32                    :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_flbit_i32_b32                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_flbit_i32_b64                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_flbit_i32_i64                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_getpc_b64                    :ref:`sdst<amdgpu_synid9_sdst64_1>`
+    s_mov_b32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_mov_b64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_mov_fed_b32                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_movreld_b32                  :ref:`sdst<amdgpu_synid9_sdst32_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_movreld_b64                  :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_movrels_b32                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_1>`
+    s_movrels_b64                  :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_1>`
+    s_nand_saveexec_b64            :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_nor_saveexec_b64             :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_not_b32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_not_b64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_or_saveexec_b64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_orn1_saveexec_b64            :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_orn2_saveexec_b64            :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_quadmask_b32                 :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_quadmask_b64                 :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_rfe_b64                                :ref:`ssrc<amdgpu_synid9_ssrc64_1>`
+    s_set_gpr_idx_idx                        :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_setpc_b64                              :ref:`ssrc<amdgpu_synid9_ssrc64_1>`
+    s_sext_i32_i16                 :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_sext_i32_i8                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_swappc_b64                   :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_1>`
+    s_wqm_b32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc32_0>`
+    s_wqm_b64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_xnor_saveexec_b64            :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+    s_xor_saveexec_b64             :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`ssrc<amdgpu_synid9_ssrc64_0>`
+
+SOP2
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**         **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_absdiff_i32                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_add_i32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_add_u32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_addc_u32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_and_b32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_and_b64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_andn2_b32                    :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_andn2_b64                    :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_ashr_i32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_ashr_i64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_bfe_i32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_bfe_i64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_bfe_u32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_bfe_u64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_bfm_b32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_bfm_b64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    s_cbranch_g_fork                         :ref:`ssrc0<amdgpu_synid9_ssrc64_2>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_2>`
+    s_cselect_b32                  :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cselect_b64                  :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_lshl1_add_u32                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_lshl2_add_u32                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_lshl3_add_u32                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_lshl4_add_u32                :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_lshl_b32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_lshl_b64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_lshr_b32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_lshr_b64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_max_i32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_max_u32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_min_i32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_min_u32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_mul_hi_i32                   :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_mul_hi_u32                   :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_mul_i32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_nand_b32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_nand_b64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_nor_b32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_nor_b64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_or_b32                       :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_or_b64                       :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_orn2_b32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_orn2_b64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_pack_hh_b32_b16              :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`::ref:`b16x2<amdgpu_synid9_type_dev>`, :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`b16x2<amdgpu_synid9_type_dev>`
+    s_pack_lh_b32_b16              :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`b16x2<amdgpu_synid9_type_dev>`
+    s_pack_ll_b32_b16              :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_rfe_restore_b64                        :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    s_sub_i32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_sub_u32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_subb_u32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_xnor_b32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_xnor_b64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_xor_b32                      :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_xor_b64                      :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+
+SOPC
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **SRC0**      **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_bitcmp0_b32                  :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_bitcmp0_b64                  :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_bitcmp1_b32                  :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_bitcmp1_b64                  :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`
+    s_cmp_eq_i32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_eq_u32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_eq_u64                   :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_cmp_ge_i32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_ge_u32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_gt_i32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_gt_u32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_le_i32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_le_u32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_lg_i32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_lg_u32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_lg_u64                   :ref:`ssrc0<amdgpu_synid9_ssrc64_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc64_0>`
+    s_cmp_lt_i32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_cmp_lt_u32                   :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+    s_set_gpr_idx_on               :ref:`ssrc<amdgpu_synid9_ssrc32_0>`,     :ref:`imm4<amdgpu_synid9_imm4>`
+    s_setvskip                     :ref:`ssrc0<amdgpu_synid9_ssrc32_0>`,    :ref:`ssrc1<amdgpu_synid9_ssrc32_0>`
+
+SOPK
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_addk_i32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_call_b64                     :ref:`sdst<amdgpu_synid9_sdst64_1>`,     :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_i_fork                         :ref:`ssrc<amdgpu_synid9_ssrc64_3>`,     :ref:`label<amdgpu_synid9_label>`
+    s_cmovk_i32                    :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_cmpk_eq_i32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_cmpk_eq_u32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_uimm16>`
+    s_cmpk_ge_i32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_cmpk_ge_u32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_uimm16>`
+    s_cmpk_gt_i32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_cmpk_gt_u32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_uimm16>`
+    s_cmpk_le_i32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_cmpk_le_u32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_uimm16>`
+    s_cmpk_lg_i32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_cmpk_lg_u32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_uimm16>`
+    s_cmpk_lt_i32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_cmpk_lt_u32                            :ref:`ssrc<amdgpu_synid9_ssrc32_2>`,     :ref:`imm16<amdgpu_synid9_uimm16>`
+    s_getreg_b32                   :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`hwreg<amdgpu_synid9_hwreg>`
+    s_movk_i32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_mulk_i32                     :ref:`sdst<amdgpu_synid9_sdst32_1>`,     :ref:`imm16<amdgpu_synid9_simm16>`
+    s_setreg_b32                   :ref:`hwreg<amdgpu_synid9_hwreg>`,    :ref:`ssrc<amdgpu_synid9_ssrc32_2>`
+    s_setreg_imm32_b32             :ref:`hwreg<amdgpu_synid9_hwreg>`,    :ref:`imm32<amdgpu_synid9_bimm32>`
+
+SOPP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **SRC**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_barrier
+    s_branch                       :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_cdbgsys              :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_cdbgsys_and_user     :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_cdbgsys_or_user      :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_cdbguser             :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_execnz               :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_execz                :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_scc0                 :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_scc1                 :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_vccnz                :ref:`label<amdgpu_synid9_label>`
+    s_cbranch_vccz                 :ref:`label<amdgpu_synid9_label>`
+    s_decperflevel                 :ref:`imm16<amdgpu_synid9_bimm16>`
+    s_endpgm
+    s_endpgm_ordered_ps_done
+    s_endpgm_saved
+    s_icache_inv
+    s_incperflevel                 :ref:`imm16<amdgpu_synid9_bimm16>`
+    s_nop                          :ref:`imm16<amdgpu_synid9_bimm16>`
+    s_sendmsg                      :ref:`msg<amdgpu_synid9_msg>`
+    s_sendmsghalt                  :ref:`msg<amdgpu_synid9_msg>`
+    s_set_gpr_idx_mode             :ref:`imm4<amdgpu_synid9_imm4>`
+    s_set_gpr_idx_off
+    s_sethalt                      :ref:`imm16<amdgpu_synid9_bimm16>`
+    s_setkill                      :ref:`imm16<amdgpu_synid9_bimm16>`
+    s_setprio                      :ref:`imm16<amdgpu_synid9_bimm16>`
+    s_sleep                        :ref:`imm16<amdgpu_synid9_bimm16>`
+    s_trap                         :ref:`imm16<amdgpu_synid9_bimm16>`
+    s_ttracedata
+    s_waitcnt                      :ref:`waitcnt<amdgpu_synid9_waitcnt>`
+    s_wakeup
+
+VINTRP
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**       **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_interp_mov_f32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`param<amdgpu_synid9_param>`::ref:`b32<amdgpu_synid9_type_dev>`, :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_interp_p1_f32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`,      :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_interp_p2_f32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`,      :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`
+
+VOP1
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                     **DST**       **SRC**            **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_bfrev_b32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_bfrev_b32_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_bfrev_b32_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ceil_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_ceil_f16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ceil_f16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ceil_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_ceil_f32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ceil_f32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ceil_f64                      :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_clrexcp
+    v_cos_f16                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cos_f16_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cos_f16_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cos_f32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cos_f32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cos_f32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f16_f32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f16_f32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f16_f32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f16_i16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f16_i16_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f16_i16_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f16_u16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f16_u16_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f16_u16_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f32_f16_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_f16_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_f64                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_cvt_f32_i32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f32_i32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_i32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_u32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f32_u32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_u32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_ubyte0                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f32_ubyte0_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_ubyte0_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_ubyte1                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f32_ubyte1_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_ubyte1_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_ubyte2                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f32_ubyte2_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_ubyte2_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f32_ubyte3                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f32_ubyte3_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_f32_ubyte3_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_f64_f32                   :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f64_i32                   :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_f64_u32                   :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_flr_i32_f32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_flr_i32_f32_dpp           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_flr_i32_f32_sdwa          :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_i16_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_i16_f16_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_i16_f16_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_i32_f32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_i32_f32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_i32_f32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_i32_f64                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_cvt_norm_i16_f16              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_norm_i16_f16_dpp          :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_norm_i16_f16_sdwa         :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_norm_u16_f16              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_norm_u16_f16_dpp          :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_norm_u16_f16_sdwa         :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_off_f32_i4                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_off_f32_i4_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_off_f32_i4_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_rpi_i32_f32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_rpi_i32_f32_dpp           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_rpi_i32_f32_sdwa          :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_u16_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_u16_f16_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_u16_f16_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_u32_f32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_cvt_u32_f32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cvt_u32_f32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_cvt_u32_f64                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_exp_f16                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_exp_f16_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_exp_f16_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_exp_f32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_exp_f32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_exp_f32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_exp_legacy_f32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_exp_legacy_f32_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_exp_legacy_f32_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ffbh_i32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_ffbh_i32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ffbh_i32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ffbh_u32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_ffbh_u32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ffbh_u32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_ffbl_b32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_ffbl_b32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ffbl_b32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_floor_f16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_floor_f16_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_floor_f16_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_floor_f32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_floor_f32_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_floor_f32_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_floor_f64                     :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_fract_f16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_fract_f16_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_fract_f16_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_fract_f32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_fract_f32_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_fract_f32_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_fract_f64                     :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_frexp_exp_i16_f16             :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_frexp_exp_i16_f16_dpp         :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_frexp_exp_i16_f16_sdwa        :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_frexp_exp_i32_f32             :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_frexp_exp_i32_f32_dpp         :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_frexp_exp_i32_f32_sdwa        :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_frexp_exp_i32_f64             :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_frexp_mant_f16                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_frexp_mant_f16_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_frexp_mant_f16_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_frexp_mant_f32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_frexp_mant_f32_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_frexp_mant_f32_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_frexp_mant_f64                :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_log_f16                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_log_f16_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_log_f16_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_log_f32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_log_f32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_log_f32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_log_legacy_f32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_log_legacy_f32_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_log_legacy_f32_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_mov_b32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_mov_b32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mov_b32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_mov_fed_b32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_mov_fed_b32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mov_fed_b32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_nop
+    v_not_b32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_not_b32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_not_b32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rcp_f16                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_rcp_f16_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rcp_f16_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rcp_f32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_rcp_f32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rcp_f32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rcp_f64                       :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_rcp_iflag_f32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_rcp_iflag_f32_dpp             :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rcp_iflag_f32_sdwa            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_readfirstlane_b32             :ref:`sdst<amdgpu_synid9_sdst32_2>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`
+    v_rndne_f16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_rndne_f16_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rndne_f16_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rndne_f32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_rndne_f32_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rndne_f32_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rndne_f64                     :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_rsq_f16                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_rsq_f16_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rsq_f16_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rsq_f32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_rsq_f32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_rsq_f32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_rsq_f64                       :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_sat_pk_u8_i16                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_sat_pk_u8_i16_dpp             :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sat_pk_u8_i16_sdwa            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_screen_partition_4se_b32      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_screen_partition_4se_b32_dpp  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`           :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_screen_partition_4se_b32_sdwa :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sin_f16                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_sin_f16_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sin_f16_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sin_f32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_sin_f32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sin_f32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sqrt_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_sqrt_f16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sqrt_f16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sqrt_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_sqrt_f32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sqrt_f32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_sqrt_f64                      :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+    v_swap_b32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`
+    v_trunc_f16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_trunc_f16_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_trunc_f16_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_trunc_f32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`
+    v_trunc_f32_dpp                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_trunc_f32_sdwa                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
+    v_trunc_f64                     :ref:`vdst<amdgpu_synid9_vdst64_0>`,     :ref:`src<amdgpu_synid9_src64_0>`
+
+VOP2
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST0**      **DST1**      **SRC0**        **SRC1**        **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_add_co_u32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_add_co_u32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_co_u32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_add_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_add_f16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_f16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_add_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_add_f32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_f32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_add_u16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_add_u16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_u16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_add_u32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_add_u32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_add_u32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_addc_co_u32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`
+    v_addc_co_u32_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`            :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_addc_co_u32_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,    :ref:`vcc<amdgpu_synid9_vcc_64>`            :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_and_b32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_and_b32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_and_b32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_ashrrev_i16                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`u16<amdgpu_synid9_type_dev>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_ashrrev_i16_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`u16<amdgpu_synid9_type_dev>`,  :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ashrrev_i16_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`u16<amdgpu_synid9_type_dev>`, :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_ashrrev_i32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_ashrrev_i32_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,  :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ashrrev_i32_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cndmask_b32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`
+    v_cndmask_b32_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`            :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_cndmask_b32_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,    :ref:`vcc<amdgpu_synid9_vcc_64>`            :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_ldexp_f16                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`i16<amdgpu_synid9_type_dev>`
+    v_ldexp_f16_dpp                :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`i16<amdgpu_synid9_type_dev>`                  :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_ldexp_f16_sdwa               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`i16<amdgpu_synid9_type_dev>`                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_lshlrev_b16                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`u16<amdgpu_synid9_type_dev>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_lshlrev_b16_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`u16<amdgpu_synid9_type_dev>`,  :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_lshlrev_b16_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`u16<amdgpu_synid9_type_dev>`, :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_lshlrev_b32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_lshlrev_b32_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,  :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_lshlrev_b32_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_lshrrev_b16                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`u16<amdgpu_synid9_type_dev>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_lshrrev_b16_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`u16<amdgpu_synid9_type_dev>`,  :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_lshrrev_b16_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`u16<amdgpu_synid9_type_dev>`, :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_lshrrev_b32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_lshrrev_b32_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,  :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_lshrrev_b32_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`u32<amdgpu_synid9_type_dev>`, :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mac_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mac_f16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mac_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mac_f32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_madak_f16                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`imm32<amdgpu_synid9_fimm16>`
+    v_madak_f32                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`imm32<amdgpu_synid9_fimm32>`
+    v_madmk_f16                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`imm32<amdgpu_synid9_fimm16>`,      :ref:`vsrc2<amdgpu_synid9_vsrc32_0>`
+    v_madmk_f32                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`imm32<amdgpu_synid9_fimm32>`,      :ref:`vsrc2<amdgpu_synid9_vsrc32_0>`
+    v_max_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_max_f16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_f16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_max_f32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_f32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_i16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_max_i16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_i16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_i32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_max_i32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_i32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_u16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_max_u16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_u16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_max_u32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_max_u32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_max_u32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_min_f16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_f16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_min_f32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_f32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_i16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_min_i16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_i16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_i32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_min_i32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_i32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_u16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_min_u16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_u16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_min_u32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_min_u32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_min_u32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mul_f16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_f16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mul_f32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_f32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_hi_i32_i24               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mul_hi_i32_i24_dpp           :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_hi_i32_i24_sdwa          :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_hi_u32_u24               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mul_hi_u32_u24_dpp           :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_hi_u32_u24_sdwa          :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_i32_i24                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mul_i32_i24_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_i32_i24_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_legacy_f32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mul_legacy_f32_dpp           :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_legacy_f32_sdwa          :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_lo_u16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mul_lo_u16_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_lo_u16_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_mul_u32_u24                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_mul_u32_u24_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_mul_u32_u24_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_or_b32                       :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_or_b32_dpp                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_or_b32_sdwa                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_co_u32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_sub_co_u32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_co_u32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_sub_f16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_f16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_sub_f32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_f32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_u16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_sub_u16_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_u16_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_sub_u32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_sub_u32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_sub_u32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subb_co_u32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`
+    v_subb_co_u32_dpp              :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`            :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subb_co_u32_sdwa             :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,    :ref:`vcc<amdgpu_synid9_vcc_64>`            :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subbrev_co_u32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`
+    v_subbrev_co_u32_dpp           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`            :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subbrev_co_u32_sdwa          :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,    :ref:`vcc<amdgpu_synid9_vcc_64>`            :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_co_u32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_subrev_co_u32_dpp            :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_co_u32_sdwa           :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_subrev_f16_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_f16_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_f32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_subrev_f32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,    :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_f32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_u16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_subrev_u16_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_u16_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_subrev_u32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_subrev_u32_dpp               :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_subrev_u32_sdwa              :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_xor_b32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`,       :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_xor_b32_dpp                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,      :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
+    v_xor_b32_sdwa                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,               :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`                    :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+
+VOP3
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST0**       **DST1**      **SRC0**         **SRC1**        **SRC2**               **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_add3_u32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_add_co_u32_e64               :ref:`vdst<amdgpu_synid9_vdst32_0>`,      :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_add_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_add_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_add_f64                      :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_add_i16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`                           :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_add_i32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_add_lshl_u32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_add_u16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_add_u32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_addc_co_u32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,      :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`ssrc2<amdgpu_synid9_ssrc64_1>`
+    v_alignbit_b32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_alignbyte_b32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_and_b32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_and_or_b32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_ashrrev_i16_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u16<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`
+    v_ashrrev_i32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`
+    v_ashrrev_i64                  :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src64_1>`
+    v_bcnt_u32_b32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_bfe_i32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`src2<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`
+    v_bfe_u32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_bfi_b32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_bfm_b32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_bfrev_b32_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_ceil_f16_e64                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_ceil_f32_e64                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ceil_f64_e64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_clrexcp_e64
+    v_cmp_class_f16_e64            :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmp_class_f32_e64            :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmp_class_f64_e64            :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmp_eq_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_eq_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_eq_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_eq_i16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_eq_i32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_eq_i64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_eq_u16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_eq_u32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_eq_u64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_f_f16_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_f_f32_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_f_f64_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_f_i16_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_f_i32_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_f_i64_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_f_u16_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_f_u32_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_f_u64_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_ge_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ge_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ge_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ge_i16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_ge_i32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_ge_i64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_ge_u16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_ge_u32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_ge_u64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_gt_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_gt_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_gt_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_gt_i16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_gt_i32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_gt_i64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_gt_u16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_gt_u32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_gt_u64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_le_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_le_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_le_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_le_i16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_le_i32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_le_i64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_le_u16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_le_u32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_le_u64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_lg_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lg_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lg_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lt_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lt_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lt_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_lt_i16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_lt_i32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_lt_i64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_lt_u16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_lt_u32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_lt_u64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_ne_i16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_ne_i32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_ne_i64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_ne_u16_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_ne_u32_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_ne_u64_e64               :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_neq_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_neq_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_neq_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nge_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nge_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nge_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ngt_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ngt_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_ngt_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nle_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nle_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nle_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlg_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlg_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlg_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlt_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlt_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_nlt_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_o_f16_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_o_f32_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_o_f64_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_t_i16_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_t_i32_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_t_i64_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_t_u16_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_t_u32_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmp_t_u64_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmp_tru_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_tru_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_tru_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_u_f16_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_u_f32_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmp_u_f64_e64                :ref:`sdst<amdgpu_synid9_sdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_class_f16_e64           :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmpx_class_f32_e64           :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmpx_class_f64_e64           :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmpx_eq_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_eq_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_eq_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_eq_i16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_eq_i32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_eq_i64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_eq_u16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_eq_u32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_eq_u64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_f_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_f_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_f_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_f_i16_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_f_i32_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_f_i64_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_f_u16_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_f_u32_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_f_u64_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_ge_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ge_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ge_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ge_i16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_ge_i32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_ge_i64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_ge_u16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_ge_u32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_ge_u64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_gt_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_gt_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_gt_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_gt_i16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_gt_i32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_gt_i64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_gt_u16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_gt_u32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_gt_u64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_le_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_le_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_le_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_le_i16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_le_i32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_le_i64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_le_u16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_le_u32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_le_u64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_lg_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lg_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lg_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lt_f16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lt_f32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lt_f64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_lt_i16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_lt_i32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_lt_i64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_lt_u16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_lt_u32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_lt_u64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_ne_i16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_ne_i32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_ne_i64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_ne_u16_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_ne_u32_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_ne_u64_e64              :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_neq_f16_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_neq_f32_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_neq_f64_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nge_f16_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nge_f32_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nge_f64_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ngt_f16_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ngt_f32_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_ngt_f64_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nle_f16_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nle_f32_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nle_f64_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlg_f16_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlg_f32_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlg_f64_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlt_f16_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlt_f32_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_nlt_f64_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_o_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_o_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_o_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_t_i16_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_t_i32_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_t_i64_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_t_u16_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_t_u32_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cmpx_t_u64_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`
+    v_cmpx_tru_f16_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_tru_f32_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_tru_f64_e64             :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_u_f16_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_u_f32_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cmpx_u_f64_e64               :ref:`sdst<amdgpu_synid9_sdst64_1>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cndmask_b32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`ssrc2<amdgpu_synid9_ssrc64_1>`
+    v_cos_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_cos_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubeid_f32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubema_f32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubesc_f32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cubetc_f32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f16_f32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f16_i16_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_f16_u16_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_f32_f16_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_f64_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_i32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_u32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte0_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte1_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte2_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f32_ubyte3_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f64_f32_e64              :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f64_i32_e64              :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_f64_u32_e64              :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_flr_i32_f32_e64          :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_cvt_i16_f16_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_i32_f32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_i32_f64_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_norm_i16_f16_e64         :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_cvt_norm_u16_f16_e64         :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_cvt_off_f32_i4_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`                                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_cvt_pk_i16_i32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cvt_pk_u16_u32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_cvt_pk_u8_f32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`src2<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`
+    v_cvt_pkaccum_u8_f32           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`
+    v_cvt_pknorm_i16_f16           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_cvt_pknorm_i16_f32           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_cvt_pknorm_u16_f16           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_cvt_pknorm_u16_f32           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_cvt_pkrtz_f16_f32            :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_cvt_rpi_i32_f32_e64          :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_cvt_u16_f16_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_u32_f32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_cvt_u32_f64_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_div_fixup_f16                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_div_fixup_f32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fixup_f64                :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fixup_legacy_f16         :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>`
+    v_div_fmas_f32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_fmas_f64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_div_scale_f32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_div_scale_f64                :ref:`vdst<amdgpu_synid9_vdst64_0>`,      :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_1>`,        :ref:`src1<amdgpu_synid9_src64_1>`,       :ref:`src2<amdgpu_synid9_src64_1>`
+    v_exp_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_exp_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_exp_legacy_f32_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ffbh_i32_e64                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_ffbh_u32_e64                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_ffbl_b32_e64                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_floor_f16_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_floor_f32_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_floor_f64_e64                :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fma_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_fma_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fma_f64                      :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fma_legacy_f16               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>`
+    v_fract_f16_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_fract_f32_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_fract_f64_e64                :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_frexp_exp_i16_f16_e64        :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_frexp_exp_i32_f32_e64        :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_frexp_exp_i32_f64_e64        :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`
+    v_frexp_mant_f16_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_frexp_mant_f32_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_frexp_mant_f64_e64           :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_mov_f32_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`param<amdgpu_synid9_param>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p1_f32_e64            :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p1ll_f16              :ref:`vdst<amdgpu_synid9_vdst32_0>`::ref:`f32<amdgpu_synid9_type_dev>`,            :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid9_type_dev>`,  :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`                       :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p1lv_f16              :ref:`vdst<amdgpu_synid9_vdst32_0>`::ref:`f32<amdgpu_synid9_type_dev>`,            :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid9_type_dev>`, :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`vsrc2<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`::ref:`f16x2<amdgpu_synid9_type_dev>`      :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p2_f16                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid9_type_dev>`, :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`vsrc2<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid9_type_dev>`        :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_interp_p2_f32_e64            :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`vsrc<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_interp_p2_legacy_f16         :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid9_type_dev>`, :ref:`attr<amdgpu_synid9_attr>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`vsrc2<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`::ref:`f32<amdgpu_synid9_type_dev>`        :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_ldexp_f16_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`i16<amdgpu_synid9_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_ldexp_f32                    :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`i32<amdgpu_synid9_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_ldexp_f64                    :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`i32<amdgpu_synid9_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_lerp_u8                      :ref:`vdst<amdgpu_synid9_vdst32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,            :ref:`src0<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`src2<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_log_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_log_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_log_legacy_f32_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_lshl_add_u32                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_lshl_or_b32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,   :ref:`src2<amdgpu_synid9_src32_1>`
+    v_lshlrev_b16_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u16<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`
+    v_lshlrev_b32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`
+    v_lshlrev_b64                  :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src64_1>`
+    v_lshrrev_b16_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u16<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`
+    v_lshrrev_b32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`
+    v_lshrrev_b64                  :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src64_1>`
+    v_mac_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_mac_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_f16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_f32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_i16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_i32_i16                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`::ref:`i32<amdgpu_synid9_type_dev>`           :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_i32_i24                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`::ref:`i32<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_i64_i32                  :ref:`vdst<amdgpu_synid9_vdst64_0>`,      :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src64_1>`::ref:`i64<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_legacy_f16               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_legacy_f32               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mad_legacy_i16               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_legacy_u16               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_u16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_u32_u16                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`           :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_u32_u24                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_u64_u32                  :ref:`vdst<amdgpu_synid9_vdst64_0>`,      :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src64_1>`::ref:`u64<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_max3_f16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_max3_f32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max3_i16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_max3_i32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_max3_u16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_max3_u32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_max_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_max_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max_f64                      :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_max_i16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_max_i32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_max_u16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_max_u32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mbcnt_hi_u32_b32             :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mbcnt_lo_u32_b32             :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_med3_f16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_med3_f32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_med3_i16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_med3_i32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_med3_u16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_med3_u32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_min3_f16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_min3_f32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,     :ref:`src2<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`             :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min3_i16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_min3_i32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_min3_u16                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_min3_u32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_min_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_min_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min_f64                      :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_min_i16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_min_i32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_min_u16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_min_u32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mov_b32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_mov_fed_b32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_mqsad_pk_u16_u8              :ref:`vdst<amdgpu_synid9_vdst64_0>`::ref:`b64<amdgpu_synid9_type_dev>`,            :ref:`src0<amdgpu_synid9_src64_1>`::ref:`b64<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`src2<amdgpu_synid9_src64_1>`::ref:`b64<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mqsad_u32_u8                 :ref:`vdst<amdgpu_synid9_vdst128_0>`::ref:`b128<amdgpu_synid9_type_dev>`,           :ref:`src0<amdgpu_synid9_src64_1>`::ref:`b64<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`vsrc2<amdgpu_synid9_vsrc128_0>`::ref:`b128<amdgpu_synid9_type_dev>`         :ref:`clamp<amdgpu_synid_clamp>`
+    v_msad_u8                      :ref:`vdst<amdgpu_synid9_vdst32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,            :ref:`src0<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`src2<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_mul_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_mul_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_f64                      :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_hi_i32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mul_hi_i32_i24_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mul_hi_u32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mul_hi_u32_u24_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mul_i32_i24_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mul_legacy_f32_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_mul_lo_u16_e64               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mul_lo_u32                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_mul_u32_u24_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_nop_e64
+    v_not_b32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_or3_b32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_or_b32_e64                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_pack_b32_f16                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>`
+    v_perm_b32                     :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_qsad_pk_u16_u8               :ref:`vdst<amdgpu_synid9_vdst64_0>`::ref:`b64<amdgpu_synid9_type_dev>`,            :ref:`src0<amdgpu_synid9_src64_1>`::ref:`b64<amdgpu_synid9_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`::ref:`b32<amdgpu_synid9_type_dev>`,   :ref:`src2<amdgpu_synid9_src64_1>`::ref:`b64<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_rcp_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_rcp_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_f64_e64                  :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rcp_iflag_f32_e64            :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_readlane_b32                 :ref:`sdst<amdgpu_synid9_sdst32_2>`,                :ref:`vsrc0<amdgpu_synid9_vsrc32_0>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_3>`
+    v_rndne_f16_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_rndne_f32_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rndne_f64_e64                :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_rsq_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_rsq_f64_e64                  :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sad_hi_u8                    :ref:`vdst<amdgpu_synid9_vdst32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,            :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u8x4<amdgpu_synid9_type_dev>`,   :ref:`src1<amdgpu_synid9_src32_1>`::ref:`u8x4<amdgpu_synid9_type_dev>`,  :ref:`src2<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_sad_u16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,            :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u16x2<amdgpu_synid9_type_dev>`,  :ref:`src1<amdgpu_synid9_src32_1>`::ref:`u16x2<amdgpu_synid9_type_dev>`, :ref:`src2<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_sad_u32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`               :ref:`clamp<amdgpu_synid_clamp>`
+    v_sad_u8                       :ref:`vdst<amdgpu_synid9_vdst32_0>`::ref:`u32<amdgpu_synid9_type_dev>`,            :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u8x4<amdgpu_synid9_type_dev>`,   :ref:`src1<amdgpu_synid9_src32_1>`::ref:`u8x4<amdgpu_synid9_type_dev>`,  :ref:`src2<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`           :ref:`clamp<amdgpu_synid_clamp>`
+    v_sat_pk_u8_i16_e64            :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_screen_partition_4se_b32_e64 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`
+    v_sin_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_sin_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sqrt_f16_e64                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_sqrt_f32_e64                 :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sqrt_f64_e64                 :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sub_co_u32_e64               :ref:`vdst<amdgpu_synid9_vdst32_0>`,      :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_sub_f16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_sub_f32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_sub_i16                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`                           :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_sub_i32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_sub_u16_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_sub_u32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_subb_co_u32_e64              :ref:`vdst<amdgpu_synid9_vdst32_0>`,      :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`ssrc2<amdgpu_synid9_ssrc64_1>`
+    v_subbrev_co_u32_e64           :ref:`vdst<amdgpu_synid9_vdst32_0>`,      :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`ssrc2<amdgpu_synid9_ssrc64_1>`
+    v_subrev_co_u32_e64            :ref:`vdst<amdgpu_synid9_vdst32_0>`,      :ref:`sdst<amdgpu_synid9_sdst64_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_subrev_f16_e64               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>`
+    v_subrev_f32_e64               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_subrev_u16_e64               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_subrev_u32_e64               :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+    v_trig_preop_f64               :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src0<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`,      :ref:`src1<amdgpu_synid9_src32_1>`::ref:`u32<amdgpu_synid9_type_dev>`                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_trunc_f16_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>`
+    v_trunc_f32_e64                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src<amdgpu_synid9_src32_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_trunc_f64_e64                :ref:`vdst<amdgpu_synid9_vdst64_0>`,                :ref:`src<amdgpu_synid9_src64_1>`::ref:`m<amdgpu_synid9_mod_vop3_abs_neg>`                                       :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
+    v_writelane_b32                :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`ssrc0<amdgpu_synid9_ssrc32_4>`,       :ref:`ssrc1<amdgpu_synid9_ssrc32_3>`
+    v_xad_u32                      :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`,       :ref:`src2<amdgpu_synid9_src32_1>`
+    v_xor_b32_e64                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,                :ref:`src0<amdgpu_synid9_src32_1>`,        :ref:`src1<amdgpu_synid9_src32_1>`
+
+VOP3P
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**        **SRC1**      **SRC2**           **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_mad_mix_f32                  :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`,  :ref:`src2<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`        :ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>` :ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_mixhi_f16                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`,  :ref:`src2<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`        :ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>` :ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_mad_mixlo_f16                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`,    :ref:`src1<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`,  :ref:`src2<amdgpu_synid9_src32_1>`::ref:`fx<amdgpu_synid9_mad_type_dev>`        :ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>` :ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_add_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_add_i16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_add_u16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_ashrrev_i16               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u16x2<amdgpu_synid9_type_dev>`, :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
+    v_pk_fma_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`,     :ref:`src2<amdgpu_synid9_src32_1>`           :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_lshlrev_b16               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u16x2<amdgpu_synid9_type_dev>`, :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
+    v_pk_lshrrev_b16               :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`::ref:`u16x2<amdgpu_synid9_type_dev>`, :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
+    v_pk_mad_i16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`,     :ref:`src2<amdgpu_synid9_src32_1>`           :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_mad_u16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`,     :ref:`src2<amdgpu_synid9_src32_1>`           :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_max_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_max_i16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
+    v_pk_max_u16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
+    v_pk_min_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_min_i16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
+    v_pk_min_u16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
+    v_pk_mul_f16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_mul_lo_u16                :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
+    v_pk_sub_i16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+    v_pk_sub_u16                   :ref:`vdst<amdgpu_synid9_vdst32_0>`,     :ref:`src0<amdgpu_synid9_src32_1>`,       :ref:`src1<amdgpu_synid9_src32_1>`                     :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
+
+VOPC
+-----------------------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                    **DST**       **SRC0**      **SRC1**             **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_cmp_class_f16                :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmp_class_f16_sdwa           :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`b32<amdgpu_synid9_type_dev>`      :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_class_f32                :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmp_class_f32_sdwa           :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`b32<amdgpu_synid9_type_dev>`      :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_class_f64                :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmp_eq_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_eq_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_eq_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_eq_i16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_eq_i16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_i32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_eq_i32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_i64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_eq_u16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_eq_u16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_u32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_eq_u32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_eq_u64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_f_f16                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_f_f16_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_f32                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_f_f32_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_f64                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_f_i16                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_f_i16_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_i32                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_f_i32_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_i64                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_f_u16                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_f_u16_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_u32                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_f_u32_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_f_u64                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_ge_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ge_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ge_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_ge_i16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ge_i16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_i32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ge_i32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_i64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_ge_u16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ge_u16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_u32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ge_u32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ge_u64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_gt_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_gt_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_gt_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_gt_i16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_gt_i16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_i32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_gt_i32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_i64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_gt_u16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_gt_u16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_u32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_gt_u32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_gt_u64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_le_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_le_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_le_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_le_i16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_le_i16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_i32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_le_i32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_i64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_le_u16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_le_u16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_u32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_le_u32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_le_u64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_lg_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_lg_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lg_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_lg_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lg_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_lt_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_lt_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_lt_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_lt_i16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_lt_i16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_i32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_lt_i32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_i64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_lt_u16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_lt_u16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_u32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_lt_u32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_lt_u64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_ne_i16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ne_i16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ne_i32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ne_i32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ne_i64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_ne_u16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ne_u16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ne_u32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ne_u32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ne_u64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_neq_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_neq_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_neq_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_neq_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_neq_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_nge_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_nge_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nge_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_nge_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nge_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_ngt_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ngt_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ngt_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_ngt_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_ngt_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_nle_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_nle_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nle_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_nle_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nle_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_nlg_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_nlg_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nlg_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_nlg_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nlg_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_nlt_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_nlt_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nlt_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_nlt_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_nlt_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_o_f16                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_o_f16_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_o_f32                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_o_f32_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_o_f64                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_t_i16                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_t_i16_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_t_i32                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_t_i32_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_t_i64                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_t_u16                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_t_u16_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_t_u32                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_t_u32_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_t_u64                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_tru_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_tru_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_tru_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_tru_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_tru_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmp_u_f16                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_u_f16_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_u_f32                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmp_u_f32_sdwa               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmp_u_f64                    :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_class_f16               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmpx_class_f16_sdwa          :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`b32<amdgpu_synid9_type_dev>`      :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_class_f32               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmpx_class_f32_sdwa          :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`::ref:`b32<amdgpu_synid9_type_dev>`      :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_class_f64               :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`b32<amdgpu_synid9_type_dev>`
+    v_cmpx_eq_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_eq_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_eq_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_eq_i16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_eq_i16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_i32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_eq_i32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_i64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_eq_u16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_eq_u16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_u32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_eq_u32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_eq_u64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_f_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_f_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_f_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_f_i16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_f_i16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_i32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_f_i32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_i64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_f_u16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_f_u16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_u32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_f_u32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_f_u64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_ge_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ge_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ge_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_ge_i16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ge_i16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_i32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ge_i32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_i64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_ge_u16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ge_u16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_u32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ge_u32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ge_u64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_gt_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_gt_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_gt_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_gt_i16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_gt_i16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_i32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_gt_i32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_i64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_gt_u16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_gt_u16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_u32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_gt_u32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_gt_u64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_le_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_le_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_le_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_le_i16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_le_i16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_i32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_le_i32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_i64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_le_u16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_le_u16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_u32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_le_u32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_le_u64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_lg_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_lg_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lg_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_lg_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lg_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_lt_f16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_lt_f16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_f32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_lt_f32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_f64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_lt_i16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_lt_i16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_i32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_lt_i32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_i64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_lt_u16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_lt_u16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_u32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_lt_u32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_lt_u64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_ne_i16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ne_i16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ne_i32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ne_i32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ne_i64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_ne_u16                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ne_u16_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ne_u32                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ne_u32_sdwa             :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ne_u64                  :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_neq_f16                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_neq_f16_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_neq_f32                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_neq_f32_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_neq_f64                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_nge_f16                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_nge_f16_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nge_f32                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_nge_f32_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nge_f64                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_ngt_f16                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ngt_f16_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ngt_f32                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_ngt_f32_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_ngt_f64                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_nle_f16                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_nle_f16_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nle_f32                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_nle_f32_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nle_f64                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_nlg_f16                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_nlg_f16_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nlg_f32                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_nlg_f32_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nlg_f64                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_nlt_f16                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_nlt_f16_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nlt_f32                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_nlt_f32_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_nlt_f64                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_o_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_o_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_o_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_o_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_o_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_t_i16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_t_i16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_t_i32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_t_i32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_t_i64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_t_u16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_t_u16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_t_u32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_t_u32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_sdwa_sext>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_t_u64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_tru_f16                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_tru_f16_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_tru_f32                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_tru_f32_sdwa            :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_tru_f64                 :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+    v_cmpx_u_f16                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_u_f16_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_u_f32                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`
+    v_cmpx_u_f32_sdwa              :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`,   :ref:`vsrc1<amdgpu_synid9_vsrc32_0>`::ref:`m<amdgpu_synid9_mod_dpp_sdwa_abs_neg>`          :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
+    v_cmpx_u_f64                   :ref:`vcc<amdgpu_synid9_vcc_64>`,      :ref:`src0<amdgpu_synid9_src64_0>`,     :ref:`vsrc1<amdgpu_synid9_vsrc64_0>`
+
+.. |---| unicode:: U+02014 .. em dash
+
+
+.. toctree::
+    :hidden:
+
+    gfx9_attr
+    gfx9_bimm16
+    gfx9_bimm32
+    gfx9_fimm16
+    gfx9_fimm32
+    gfx9_hwreg
+    gfx9_imm4
+    gfx9_label
+    gfx9_msg
+    gfx9_param
+    gfx9_perm_smem
+    gfx9_simm16
+    gfx9_tgt
+    gfx9_uimm16
+    gfx9_waitcnt
+    gfx9_addr_buf
+    gfx9_addr_ds
+    gfx9_addr_flat
+    gfx9_addr_mimg
+    gfx9_base_smem_addr
+    gfx9_base_smem_buf
+    gfx9_base_smem_scratch
+    gfx9_data_buf_atomic128
+    gfx9_data_buf_atomic32
+    gfx9_data_buf_atomic64
+    gfx9_data_mimg_atomic_cmp
+    gfx9_data_mimg_atomic_reg
+    gfx9_data_mimg_store
+    gfx9_data_mimg_store_d16
+    gfx9_data_smem_atomic128
+    gfx9_data_smem_atomic32
+    gfx9_data_smem_atomic64
+    gfx9_dst_buf_128
+    gfx9_dst_buf_32
+    gfx9_dst_buf_64
+    gfx9_dst_buf_96
+    gfx9_dst_buf_lds
+    gfx9_dst_flat_atomic32
+    gfx9_dst_flat_atomic64
+    gfx9_dst_mimg_gather4
+    gfx9_dst_mimg_regular
+    gfx9_dst_mimg_regular_d16
+    gfx9_offset_buf
+    gfx9_offset_smem_buf
+    gfx9_offset_smem_plain
+    gfx9_rsrc_buf
+    gfx9_rsrc_mimg
+    gfx9_saddr_flat_global
+    gfx9_saddr_flat_scratch
+    gfx9_samp_mimg
+    gfx9_sdata128_0
+    gfx9_sdata32_0
+    gfx9_sdata64_0
+    gfx9_sdst128_0
+    gfx9_sdst256_0
+    gfx9_sdst32_0
+    gfx9_sdst32_1
+    gfx9_sdst32_2
+    gfx9_sdst512_0
+    gfx9_sdst64_0
+    gfx9_sdst64_1
+    gfx9_src32_0
+    gfx9_src32_1
+    gfx9_src64_0
+    gfx9_src64_1
+    gfx9_src_exp
+    gfx9_ssrc32_0
+    gfx9_ssrc32_1
+    gfx9_ssrc32_2
+    gfx9_ssrc32_3
+    gfx9_ssrc32_4
+    gfx9_ssrc64_0
+    gfx9_ssrc64_1
+    gfx9_ssrc64_2
+    gfx9_ssrc64_3
+    gfx9_vaddr_flat_global
+    gfx9_vaddr_flat_scratch
+    gfx9_vcc_64
+    gfx9_vdata128_0
+    gfx9_vdata32_0
+    gfx9_vdata64_0
+    gfx9_vdata96_0
+    gfx9_vdst128_0
+    gfx9_vdst32_0
+    gfx9_vdst64_0
+    gfx9_vdst96_0
+    gfx9_vsrc128_0
+    gfx9_vsrc32_0
+    gfx9_vsrc64_0
+    gfx9_mad_type_dev
+    gfx9_mod_dpp_sdwa_abs_neg
+    gfx9_mod_sdwa_sext
+    gfx9_mod_vop3_abs_neg
+    gfx9_opt
+    gfx9_ret
+    gfx9_type_dev
diff --git a/docs/AMDGPU/gfx7_addr_buf.rst b/docs/AMDGPU/gfx7_addr_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..22dc7d35338586c44dd3c7a6b632c459e835ea9a
--- /dev/null
+++ b/docs/AMDGPU/gfx7_addr_buf.rst
@@ -0,0 +1,24 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_addr_buf:
+
+vaddr
+===========================
+
+This is an optional operand which may specify a 64-bit address, offset and/or index.
+
+*Size:* 0, 1 or 2 dwords. Size is controlled by modifiers :ref:`addr64<amdgpu_synid_addr64>`, :ref:`offen<amdgpu_synid_offen>` and :ref:`idxen<amdgpu_synid_idxen>`:
+
+* If only :ref:`addr64<amdgpu_synid_addr64>` is specified, this operand supplies a 64-bit address. Size is 2 dwords.
+* If only :ref:`idxen<amdgpu_synid_idxen>` is specified, this operand supplies an index. Size is 1 dword.
+* If only :ref:`offen<amdgpu_synid_offen>` is specified, this operand supplies an offset. Size is 1 dword.
+* If both :ref:`idxen<amdgpu_synid_idxen>` and :ref:`offen<amdgpu_synid_offen>` are specified, index is in the first register and offset is in the second. Size is 2 dwords.
+* If none of these modifiers are specified, this operand must be set to :ref:`off<amdgpu_synid_off>`.
+* All other combinations of these modifiers are illegal.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx7_addr_ds.rst b/docs/AMDGPU/gfx7_addr_ds.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c9cab7d45ec8ab27a6f27e89373ccb7795c9d0c1
--- /dev/null
+++ b/docs/AMDGPU/gfx7_addr_ds.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_addr_ds:
+
+vaddr
+===========================
+
+An offset from the start of GDS/LDS memory.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_addr_flat.rst b/docs/AMDGPU/gfx7_addr_flat.rst
new file mode 100644
index 0000000000000000000000000000000000000000..93deea6d7a2fee503aa131ff53eaea07794f3a51
--- /dev/null
+++ b/docs/AMDGPU/gfx7_addr_flat.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_addr_flat:
+
+vaddr
+===========================
+
+A 64-bit flat address.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_addr_mimg.rst b/docs/AMDGPU/gfx7_addr_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..76eb4846f1ae137d170ed385e298f590f3d58e55
--- /dev/null
+++ b/docs/AMDGPU/gfx7_addr_mimg.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_addr_mimg:
+
+vaddr
+===========================
+
+Image address which includes from one to four dimensional coordinates and other data used to locate a position in the image.
+
+*Size:* 1, 2, 3, 4, 8 or 16 dwords. Actual size depends on opcode and specific image being handled.
+
+    Note 1. Image format and dimensions are encoded in the image resource constant but not in the instruction.
+
+    Note 2. Actually image address size may vary from 1 to 13 dwords, but assembler currently supports a limited range of register sequences.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_attr.rst b/docs/AMDGPU/gfx7_attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..219b77414ab1a9f189a047c45a59e7184246a112
--- /dev/null
+++ b/docs/AMDGPU/gfx7_attr.rst
@@ -0,0 +1,30 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_attr:
+
+attr
+===========================
+
+Interpolation attribute and channel:
+
+    ============== ===================================
+    Syntax         Description
+    ============== ===================================
+    attr{0..32}.x  Attribute 0..32 with *x* channel.
+    attr{0..32}.y  Attribute 0..32 with *y* channel.
+    attr{0..32}.z  Attribute 0..32 with *z* channel.
+    attr{0..32}.w  Attribute 0..32 with *w* channel.
+    ============== ===================================
+
+Examples:
+
+.. parsed-literal::
+
+    v_interp_p1_f32 v1, v0, attr0.x
+    v_interp_p1_f32 v1, v0, attr32.w
+
diff --git a/docs/AMDGPU/gfx7_base_smem_addr.rst b/docs/AMDGPU/gfx7_base_smem_addr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9cc3cc71fdf7eb7ca939caeefdae1faf8c9ccf3b
--- /dev/null
+++ b/docs/AMDGPU/gfx7_base_smem_addr.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_base_smem_addr:
+
+sbase
+===========================
+
+A 64-bit base address for scalar memory operations.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx7_base_smem_buf.rst b/docs/AMDGPU/gfx7_base_smem_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..416cac715c71b36f86eb6a705f2ddf83824e6d5d
--- /dev/null
+++ b/docs/AMDGPU/gfx7_base_smem_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_base_smem_buf:
+
+sbase
+===========================
+
+A 128-bit buffer resource constant for scalar memory operations which provides a base address, a size and a stride.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx7_bimm16.rst b/docs/AMDGPU/gfx7_bimm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..eb43f9b36a9d533137b3e828a50a122b8eef159d
--- /dev/null
+++ b/docs/AMDGPU/gfx7_bimm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_bimm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits.
+
diff --git a/docs/AMDGPU/gfx7_bimm32.rst b/docs/AMDGPU/gfx7_bimm32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4d8f89d7ae39aa2ae59f3cc3abe81bca485ec5e9
--- /dev/null
+++ b/docs/AMDGPU/gfx7_bimm32.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_bimm32:
+
+imm32
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 32 bits.
+
diff --git a/docs/AMDGPU/gfx7_data_buf_atomic128.rst b/docs/AMDGPU/gfx7_data_buf_atomic128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..33ff26c6c594de28b49b609c205a24f44d38bb35
--- /dev/null
+++ b/docs/AMDGPU/gfx7_data_buf_atomic128.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_data_buf_atomic128:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 4 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_data_buf_atomic32.rst b/docs/AMDGPU/gfx7_data_buf_atomic32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..df4a6e42ad457f94d7c5c0bbc92649f59474b4bb
--- /dev/null
+++ b/docs/AMDGPU/gfx7_data_buf_atomic32.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_data_buf_atomic32:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 1 dword by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_data_buf_atomic64.rst b/docs/AMDGPU/gfx7_data_buf_atomic64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4892e41631f4bfe07db0e4326e93ca86d6f24aea
--- /dev/null
+++ b/docs/AMDGPU/gfx7_data_buf_atomic64.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_data_buf_atomic64:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 2 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_data_mimg_atomic_cmp.rst b/docs/AMDGPU/gfx7_data_mimg_atomic_cmp.rst
new file mode 100644
index 0000000000000000000000000000000000000000..82c3337aeb2bde11d64c304dd4db24b4c5de6bb2
--- /dev/null
+++ b/docs/AMDGPU/gfx7_data_mimg_atomic_cmp.rst
@@ -0,0 +1,27 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_data_mimg_atomic_cmp:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify 2 data elements for 32-bit-per-pixel surfaces or 4 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+  Note. The surface data format is indicated in the image resource constant but not in the instruction.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_data_mimg_atomic_reg.rst b/docs/AMDGPU/gfx7_data_mimg_atomic_reg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..729548dcb87b5718980c61d407f1da3601315828
--- /dev/null
+++ b/docs/AMDGPU/gfx7_data_mimg_atomic_reg.rst
@@ -0,0 +1,26 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_data_mimg_atomic_reg:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify 1 data element for 32-bit-per-pixel surfaces or 2 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+  Note. The surface data format is indicated in the image resource constant but not in the instruction.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_data_mimg_store.rst b/docs/AMDGPU/gfx7_data_mimg_store.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1858547a7c0c81a0ff24acb5973c8347e3e1c8fe
--- /dev/null
+++ b/docs/AMDGPU/gfx7_data_mimg_store.rst
@@ -0,0 +1,18 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_data_mimg_store:
+
+vdata
+===========================
+
+Image data to store by an *image_store* instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` which may specify from 1 to 4 data elements. Each data element occupies 1 dword.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_dst_buf_128.rst b/docs/AMDGPU/gfx7_dst_buf_128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..701616ed3201edd9d5cd94724ce42acc7016ce30
--- /dev/null
+++ b/docs/AMDGPU/gfx7_dst_buf_128.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_dst_buf_128:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 4 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_dst_buf_64.rst b/docs/AMDGPU/gfx7_dst_buf_64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..de62a568bf5ff58c03727073e2f6358304840d91
--- /dev/null
+++ b/docs/AMDGPU/gfx7_dst_buf_64.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_dst_buf_64:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 2 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_dst_buf_96.rst b/docs/AMDGPU/gfx7_dst_buf_96.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1abfcc07aecf289ec3ff81db3e4a5a76eb194956
--- /dev/null
+++ b/docs/AMDGPU/gfx7_dst_buf_96.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_dst_buf_96:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 3 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_dst_buf_lds.rst b/docs/AMDGPU/gfx7_dst_buf_lds.rst
new file mode 100644
index 0000000000000000000000000000000000000000..435f6bb874b4cb85c205fe872f0f96d983436215
--- /dev/null
+++ b/docs/AMDGPU/gfx7_dst_buf_lds.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_dst_buf_lds:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+If :ref:`lds<amdgpu_synid_lds>` is specified, this operand is ignored by H/W and data are stored directly into LDS.
+
+*Size:* 1 dword by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+    Note that :ref:`tfe<amdgpu_synid_tfe>` and :ref:`lds<amdgpu_synid_lds>` cannot be used together.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_dst_flat_atomic32.rst b/docs/AMDGPU/gfx7_dst_flat_atomic32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4a85656ca86deb2e08e2cdbf6b17a23dc22b063d
--- /dev/null
+++ b/docs/AMDGPU/gfx7_dst_flat_atomic32.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_dst_flat_atomic32:
+
+vdst
+===========================
+
+Data returned by a 32-bit atomic flat instruction.
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_dst_flat_atomic64.rst b/docs/AMDGPU/gfx7_dst_flat_atomic64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cb1fddd931871cf7cacbcec3476e7ee1c72e5bd4
--- /dev/null
+++ b/docs/AMDGPU/gfx7_dst_flat_atomic64.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_dst_flat_atomic64:
+
+vdst
+===========================
+
+Data returned by a 64-bit atomic flat instruction.
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_dst_mimg_gather4.rst b/docs/AMDGPU/gfx7_dst_mimg_gather4.rst
new file mode 100644
index 0000000000000000000000000000000000000000..17fcff5246a363a55a383ddd0774af4a38f7167f
--- /dev/null
+++ b/docs/AMDGPU/gfx7_dst_mimg_gather4.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_dst_mimg_gather4:
+
+vdst
+===========================
+
+Image data to load by an *image_gather4* instruction.
+
+*Size:* 4 data elements by default. Each data element occupies 1 dword. :ref:`tfe<amdgpu_synid_tfe>` adds one more dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_dst_mimg_regular.rst b/docs/AMDGPU/gfx7_dst_mimg_regular.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aa165b81e62136fb002cf2005fd8eece5b4352f2
--- /dev/null
+++ b/docs/AMDGPU/gfx7_dst_mimg_regular.rst
@@ -0,0 +1,20 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_dst_mimg_regular:
+
+vdst
+===========================
+
+Image data to load by an image instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify from 1 to 4 data elements. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_fimm32.rst b/docs/AMDGPU/gfx7_fimm32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..70c81891e9784d81415f5712c8f23e55336027bc
--- /dev/null
+++ b/docs/AMDGPU/gfx7_fimm32.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_fimm32:
+
+imm32
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>` or a :ref:`floating-point_number<amdgpu_synid_floating-point_number>`. The value is converted to *f32* as described :ref:`here<amdgpu_synid_lit_conv>`.
+
diff --git a/docs/AMDGPU/gfx7_hwreg.rst b/docs/AMDGPU/gfx7_hwreg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1e2d96417c33985188f2bfd25e6af485390364cc
--- /dev/null
+++ b/docs/AMDGPU/gfx7_hwreg.rst
@@ -0,0 +1,60 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_hwreg:
+
+hwreg
+===========================
+
+Bits of a hardware register being accessed.
+
+The bits of this operand have the following meaning:
+
+    ============ ===================================
+    Bits         Description
+    ============ ===================================
+    5:0          Register *id*.
+    10:6         First bit *offset* (0..31).
+    15:11        *Size* in bits (1..32).
+    ============ ===================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>` or using the syntax described below.
+
+    ==================================== ============================================================================
+    Syntax                               Description
+    ==================================== ============================================================================
+    hwreg({0..63})                       All bits of a register indicated by its *id*.
+    hwreg(<*name*>)                      All bits of a register indicated by its *name*.
+    hwreg({0..63}, {0..31}, {1..32})     Register bits indicated by register *id*, first bit *offset* and *size*.
+    hwreg(<*name*>, {0..31}, {1..32})    Register bits indicated by register *name*, first bit *offset* and *size*.
+    ==================================== ============================================================================
+
+Register *id*, *offset* and *size* must be specified as positive :ref:`integer numbers<amdgpu_synid_integer_number>`.
+
+Defined register *names* include:
+
+    =================== ==========================================
+    Name                Description
+    =================== ==========================================
+    HW_REG_MODE         Shader writeable mode bits.
+    HW_REG_STATUS       Shader read-only status.
+    HW_REG_TRAPSTS      Trap status.
+    HW_REG_HW_ID        Id of wave, simd, compute unit, etc.
+    HW_REG_GPR_ALLOC    Per-wave SGPR and VGPR allocation.
+    HW_REG_LDS_ALLOC    Per-wave LDS allocation.
+    HW_REG_IB_STS       Counters of outstanding instructions.
+    =================== ==========================================
+
+Examples:
+
+.. parsed-literal::
+
+    s_getreg_b32 s2, 0x6
+    s_getreg_b32 s2, hwreg(15)
+    s_getreg_b32 s2, hwreg(51, 1, 31)
+    s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1)
+
diff --git a/docs/AMDGPU/gfx7_label.rst b/docs/AMDGPU/gfx7_label.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed2f3a41666771a1e80b7d57e4e7875ab5b9095a
--- /dev/null
+++ b/docs/AMDGPU/gfx7_label.rst
@@ -0,0 +1,30 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_label:
+
+label
+===========================
+
+A branch target which is a 16-bit signed integer treated as a PC-relative dword offset.
+
+This operand may be specified as:
+
+* An :ref:`integer_number<amdgpu_synid_integer_number>`. The number is truncated to 16 bits.
+* An :ref:`absolute_expression<amdgpu_synid_absolute_expression>` which must start with an :ref:`integer_number<amdgpu_synid_integer_number>`. The value of the expression is truncated to 16 bits.
+* A :ref:`symbol<amdgpu_synid_symbol>` (for example, a label). The value is handled as a 16-bit PC-relative dword offset to be resolved by a linker.
+
+Examples:
+
+.. parsed-literal::
+
+  offset = 30
+  s_branch loop_end
+  s_branch 2 + offset
+  s_branch 32
+  loop_end:
+
diff --git a/docs/AMDGPU/gfx7_mod.rst b/docs/AMDGPU/gfx7_mod.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fcaa6caf9247cf8624688df4b32242f8100ddddd
--- /dev/null
+++ b/docs/AMDGPU/gfx7_mod.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_mod:
+
+m
+===========================
+
+This operand may be used with floating point operand modifiers :ref:`abs<amdgpu_synid_abs>` and :ref:`neg<amdgpu_synid_neg>`.
+
diff --git a/docs/AMDGPU/gfx7_msg.rst b/docs/AMDGPU/gfx7_msg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5476053ccc1cdab76b2d1f7b5bad1e755eb7b6c7
--- /dev/null
+++ b/docs/AMDGPU/gfx7_msg.rst
@@ -0,0 +1,72 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_msg:
+
+msg
+===========================
+
+A 16-bit message code. The bits of this operand have the following meaning:
+
+    ============ ======================================================
+    Bits         Description
+    ============ ======================================================
+    3:0          Message *type*.
+    6:4          Optional *operation*.
+    9:7          Optional *parameters*.
+    15:10        Unused.
+    ============ ======================================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>` or using the syntax described below:
+
+    ======================================== ========================================================================
+    Syntax                                   Description
+    ======================================== ========================================================================
+    sendmsg(<*type*>)                        A message identified by its *type*.
+    sendmsg(<*type*>, <*op*>)                A message identified by its *type* and *operation*.
+    sendmsg(<*type*>, <*op*>, <*stream*>)    A message identified by its *type* and *operation* with a stream *id*.
+    ======================================== ========================================================================
+
+*Type* may be specified using message *name* or message *id*.
+
+*Op* may be specified using operation *name* or operation *id*.
+
+Stream *id* is an integer in the range 0..3.
+
+Message *id*, operation *id* and stream *id* must be specified as positive :ref:`integer numbers<amdgpu_synid_integer_number>`.
+
+Each message type supports specific operations:
+
+    ================= ========== ============================== ============ ==========
+    Message name      Message Id Supported Operations           Operation Id Stream Id
+    ================= ========== ============================== ============ ==========
+    MSG_INTERRUPT     1          \-                             \-           \-
+    MSG_GS            2          GS_OP_CUT                      1            Optional
+    \                            GS_OP_EMIT                     2            Optional
+    \                            GS_OP_EMIT_CUT                 3            Optional
+    MSG_GS_DONE       3          GS_OP_NOP                      0            \-
+    \                            GS_OP_CUT                      1            Optional
+    \                            GS_OP_EMIT                     2            Optional
+    \                            GS_OP_EMIT_CUT                 3            Optional
+    MSG_SYSMSG        15         SYSMSG_OP_ECC_ERR_INTERRUPT    1            \-
+    \                            SYSMSG_OP_REG_RD               2            \-
+    \                            SYSMSG_OP_HOST_TRAP_ACK        3            \-
+    \                            SYSMSG_OP_TTRACE_PC            4            \-
+    ================= ========== ============================== ============ ==========
+
+Examples:
+
+.. parsed-literal::
+
+    s_sendmsg 0x12
+    s_sendmsg sendmsg(MSG_INTERRUPT)
+    s_sendmsg sendmsg(2, GS_OP_CUT)
+    s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT)
+    s_sendmsg sendmsg(MSG_GS, 2)
+    s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_EMIT_CUT, 1)
+    s_sendmsg sendmsg(MSG_SYSMSG, SYSMSG_OP_TTRACE_PC)
+
diff --git a/docs/AMDGPU/gfx7_offset_buf.rst b/docs/AMDGPU/gfx7_offset_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c36df06a7bbfbe2a8f57092925e4315eca401220
--- /dev/null
+++ b/docs/AMDGPU/gfx7_offset_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_offset_buf:
+
+soffset
+===========================
+
+An unsigned byte offset.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx7_offset_smem.rst b/docs/AMDGPU/gfx7_offset_smem.rst
new file mode 100644
index 0000000000000000000000000000000000000000..85ed5f186737624ec5262e547f55b182acf05d9a
--- /dev/null
+++ b/docs/AMDGPU/gfx7_offset_smem.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_offset_smem:
+
+soffset
+===========================
+
+An unsigned offset added to the base address to get memory address.
+
+* If offset is specified as a register, it supplies an unsigned byte offset but 2 lsb's are ignored.
+* If offset is specified as an :ref:`uimm32<amdgpu_synid_uimm32>`, it supplies a 32-bit unsigned byte offset but 2 lsb's are ignored.
+* If offset is specified as an :ref:`uimm8<amdgpu_synid_uimm8>`, it supplies an 8-bit unsigned dword offset.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`uimm8<amdgpu_synid_uimm8>`, :ref:`uimm32<amdgpu_synid_uimm32>`
diff --git a/docs/AMDGPU/gfx7_opt.rst b/docs/AMDGPU/gfx7_opt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a48733dda976a0f0ae7bc7e942f04c658766949
--- /dev/null
+++ b/docs/AMDGPU/gfx7_opt.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_opt:
+
+opt
+===========================
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
diff --git a/docs/AMDGPU/gfx7_param.rst b/docs/AMDGPU/gfx7_param.rst
new file mode 100644
index 0000000000000000000000000000000000000000..13e533b3b288a4c18bcd67268f9c0d539c739eac
--- /dev/null
+++ b/docs/AMDGPU/gfx7_param.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_param:
+
+param
+===========================
+
+Interpolation parameter to read:
+
+    ============ ===================================
+    Syntax       Description
+    ============ ===================================
+    p0           Parameter *P0*.
+    p10          Parameter *P10*.
+    p20          Parameter *P20*.
+    ============ ===================================
+
diff --git a/docs/AMDGPU/gfx7_ret.rst b/docs/AMDGPU/gfx7_ret.rst
new file mode 100644
index 0000000000000000000000000000000000000000..25301c2cde8ffdeebbb98374b5a199d17194e2e9
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ret.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ret:
+
+dst
+===========================
+
+This is an input operand. It may optionally serve as a destination if :ref:`glc<amdgpu_synid_glc>` is specified.
+
diff --git a/docs/AMDGPU/gfx7_rsrc_buf.rst b/docs/AMDGPU/gfx7_rsrc_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7ebcebc68efeb1602b0f3619d668392bd4a1df76
--- /dev/null
+++ b/docs/AMDGPU/gfx7_rsrc_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_rsrc_buf:
+
+srsrc
+===========================
+
+Buffer resource constant which defines the address and characteristics of the buffer in memory.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx7_rsrc_mimg.rst b/docs/AMDGPU/gfx7_rsrc_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b0e40fe09cc61e033cda66e61fecfbdbe45c23c9
--- /dev/null
+++ b/docs/AMDGPU/gfx7_rsrc_mimg.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_rsrc_mimg:
+
+srsrc
+===========================
+
+Image resource constant which defines the location of the image buffer in memory, its dimensions, tiling, and data format.
+
+*Size:* 8 dwords by default, 4 dwords if :ref:`r128<amdgpu_synid_r128>` is specified.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx7_samp_mimg.rst b/docs/AMDGPU/gfx7_samp_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..738cad415fef6723f936de1ea361a23a8110b587
--- /dev/null
+++ b/docs/AMDGPU/gfx7_samp_mimg.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_samp_mimg:
+
+ssamp
+===========================
+
+Sampler constant used to specify filtering options applied to the image data after it is read.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx7_sdst128_0.rst b/docs/AMDGPU/gfx7_sdst128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..735a30cefd62173a69062256666e80bc4a3f3fb8
--- /dev/null
+++ b/docs/AMDGPU/gfx7_sdst128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_sdst128_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx7_sdst256_0.rst b/docs/AMDGPU/gfx7_sdst256_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c4e6f9fcf6627f34487021b4ddd6eb8f12eab2d9
--- /dev/null
+++ b/docs/AMDGPU/gfx7_sdst256_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_sdst256_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx7_sdst32_0.rst b/docs/AMDGPU/gfx7_sdst32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..183d89f9e906e307aa219ec72aabadc807e3243f
--- /dev/null
+++ b/docs/AMDGPU/gfx7_sdst32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_sdst32_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx7_sdst32_1.rst b/docs/AMDGPU/gfx7_sdst32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e49e20f9d5c9cbded58a479aaddcfafc623058a
--- /dev/null
+++ b/docs/AMDGPU/gfx7_sdst32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_sdst32_1:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx7_sdst32_2.rst b/docs/AMDGPU/gfx7_sdst32_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8212e5d7a6120360569d3a4c1ed26f3a146a6a31
--- /dev/null
+++ b/docs/AMDGPU/gfx7_sdst32_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_sdst32_2:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx7_sdst512_0.rst b/docs/AMDGPU/gfx7_sdst512_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d8c64b1e395c9d307c2971c71f2ed6be7de0911b
--- /dev/null
+++ b/docs/AMDGPU/gfx7_sdst512_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_sdst512_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 16 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`
diff --git a/docs/AMDGPU/gfx7_sdst64_0.rst b/docs/AMDGPU/gfx7_sdst64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..af608801b390130c27305246f0cf97fef913b035
--- /dev/null
+++ b/docs/AMDGPU/gfx7_sdst64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_sdst64_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx7_sdst64_1.rst b/docs/AMDGPU/gfx7_sdst64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..207df73932cae9d9157fef5c03a444486fec7ec9
--- /dev/null
+++ b/docs/AMDGPU/gfx7_sdst64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_sdst64_1:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx7_simm16.rst b/docs/AMDGPU/gfx7_simm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..66e560ecec8c368706814af1afc3c5e543148099
--- /dev/null
+++ b/docs/AMDGPU/gfx7_simm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_simm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits and then sign-extended to 32 bits.
+
diff --git a/docs/AMDGPU/gfx7_src32_0.rst b/docs/AMDGPU/gfx7_src32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..22ff73ddf47d9991073d2a3dccfbb74ac28ddaf5
--- /dev/null
+++ b/docs/AMDGPU/gfx7_src32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_src32_0:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx7_src32_1.rst b/docs/AMDGPU/gfx7_src32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..00594597ff17fcbe5f5b800f5cfe875544875008
--- /dev/null
+++ b/docs/AMDGPU/gfx7_src32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_src32_1:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`iconst<amdgpu_synid_iconst>`
diff --git a/docs/AMDGPU/gfx7_src32_2.rst b/docs/AMDGPU/gfx7_src32_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b939c45603489b58cc41f0a0d68324e82776f8fc
--- /dev/null
+++ b/docs/AMDGPU/gfx7_src32_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_src32_2:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx7_src32_3.rst b/docs/AMDGPU/gfx7_src32_3.rst
new file mode 100644
index 0000000000000000000000000000000000000000..83aa9ca7199261e8b68f3d786d6ee9b2a709b872
--- /dev/null
+++ b/docs/AMDGPU/gfx7_src32_3.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_src32_3:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx7_src64_0.rst b/docs/AMDGPU/gfx7_src64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a19b6ee8bd5aa94e23265431669b3b76600f39b7
--- /dev/null
+++ b/docs/AMDGPU/gfx7_src64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_src64_0:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx7_src64_1.rst b/docs/AMDGPU/gfx7_src64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c81864c9573fb4cef9e0a8f9c77761e8be5dd5dd
--- /dev/null
+++ b/docs/AMDGPU/gfx7_src64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_src64_1:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx7_src64_2.rst b/docs/AMDGPU/gfx7_src64_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..189245e548df362bfedf88eec48ababd25e797fc
--- /dev/null
+++ b/docs/AMDGPU/gfx7_src64_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_src64_2:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`iconst<amdgpu_synid_iconst>`
diff --git a/docs/AMDGPU/gfx7_src_exp.rst b/docs/AMDGPU/gfx7_src_exp.rst
new file mode 100644
index 0000000000000000000000000000000000000000..32f71a88b645329dbd2e1a2a3c1a2389773a4a83
--- /dev/null
+++ b/docs/AMDGPU/gfx7_src_exp.rst
@@ -0,0 +1,28 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_src_exp:
+
+vsrc
+===========================
+
+Data to copy to export buffers. This is an optional operand. Must be specified as :ref:`off<amdgpu_synid_off>` if not used.
+
+:ref:`compr<amdgpu_synid_compr>` modifier indicates use of compressed (16-bit) data. This limits number of source operands from 4 to 2:
+
+* src0 and src1 must specify the first register (or :ref:`off<amdgpu_synid_off>`).
+* src2 and src3 must specify the second register (or :ref:`off<amdgpu_synid_off>`).
+
+An example:
+
+.. parsed-literal::
+
+  exp mrtz v3, v3, off, off compr
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx7_ssrc32_0.rst b/docs/AMDGPU/gfx7_ssrc32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..843db249713caf0177c44d131fb031cf2635b5ad
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc32_0:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx7_ssrc32_1.rst b/docs/AMDGPU/gfx7_ssrc32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6e626d8367329ad3121a7087101bdb1c0b0aed78
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc32_1:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx7_ssrc32_2.rst b/docs/AMDGPU/gfx7_ssrc32_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c7ff032f880039dce64731464db3cdcd9bb46993
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc32_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc32_2:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx7_ssrc32_3.rst b/docs/AMDGPU/gfx7_ssrc32_3.rst
new file mode 100644
index 0000000000000000000000000000000000000000..68a24153b6887fb11244b8ca9414821822fa83cc
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc32_3.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc32_3:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`iconst<amdgpu_synid_iconst>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx7_ssrc32_4.rst b/docs/AMDGPU/gfx7_ssrc32_4.rst
new file mode 100644
index 0000000000000000000000000000000000000000..669ae4e7e2d03a0724b42ee71c0e178794464e51
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc32_4.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc32_4:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`iconst<amdgpu_synid_iconst>`
diff --git a/docs/AMDGPU/gfx7_ssrc64_0.rst b/docs/AMDGPU/gfx7_ssrc64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..283e7ea63dac08f2a29708efe9272d33f79f1af4
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc64_0:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx7_ssrc64_1.rst b/docs/AMDGPU/gfx7_ssrc64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..42dc8c5690611b60f8f020132f85c9a28b4ef2aa
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc64_1:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx7_ssrc64_2.rst b/docs/AMDGPU/gfx7_ssrc64_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..344147fdd279d47730f9668ee5421121954ba75e
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc64_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc64_2:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx7_ssrc64_3.rst b/docs/AMDGPU/gfx7_ssrc64_3.rst
new file mode 100644
index 0000000000000000000000000000000000000000..173f550d4ec9c0aea67bc33a6fc00ce78ed26928
--- /dev/null
+++ b/docs/AMDGPU/gfx7_ssrc64_3.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_ssrc64_3:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx7_tgt.rst b/docs/AMDGPU/gfx7_tgt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c407c0c56bbe6618c291b7c165f729469e2cd8b1
--- /dev/null
+++ b/docs/AMDGPU/gfx7_tgt.rst
@@ -0,0 +1,24 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_tgt:
+
+tgt
+===========================
+
+An export target:
+
+    ============== ===================================
+    Syntax         Description
+    ============== ===================================
+    pos{0..3}      Copy vertex position 0..3.
+    param{0..31}   Copy vertex parameter 0..31.
+    mrt{0..7}      Copy pixel color to the MRTs 0..7.
+    mrtz           Copy pixel depth (Z) data.
+    null           Copy nothing.
+    ============== ===================================
+
diff --git a/docs/AMDGPU/gfx7_type_dev.rst b/docs/AMDGPU/gfx7_type_dev.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6eab0e1b3964bdc30aca6ab2b9c2073bce739b83
--- /dev/null
+++ b/docs/AMDGPU/gfx7_type_dev.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_type_dev:
+
+Type deviation
+===========================
+
+*Type* of this operand differs from *type* :ref:`implied by the opcode<amdgpu_syn_instruction_type>`. This tag specifies actual operand *type*.
+
diff --git a/docs/AMDGPU/gfx7_uimm16.rst b/docs/AMDGPU/gfx7_uimm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bd0d4c2fb14421dc679b1e28d2542c440e5f3c91
--- /dev/null
+++ b/docs/AMDGPU/gfx7_uimm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_uimm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits and then zero-extended to 32 bits.
+
diff --git a/docs/AMDGPU/gfx7_vcc_64.rst b/docs/AMDGPU/gfx7_vcc_64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b1285e0e8b8d5e08cb7af465f4a0a37e0de3ccb0
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vcc_64.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vcc_64:
+
+vcc
+===========================
+
+Vector condition code.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`vcc<amdgpu_synid_vcc>`
diff --git a/docs/AMDGPU/gfx7_vdata128_0.rst b/docs/AMDGPU/gfx7_vdata128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5ed2b823bf7479c3500f69b62df58c825b11877f
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vdata128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vdata128_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vdata32_0.rst b/docs/AMDGPU/gfx7_vdata32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1615abb4f6c3e50a02b0f8b1dffed8e86d7beb80
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vdata32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vdata32_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vdata64_0.rst b/docs/AMDGPU/gfx7_vdata64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fceea9fd4b185b3fee3ed84bdbbcfa9001f07f2a
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vdata64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vdata64_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vdata96_0.rst b/docs/AMDGPU/gfx7_vdata96_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b9fe5996e53565de5ad8fb85a8b54454c441d1c6
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vdata96_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vdata96_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vdst128_0.rst b/docs/AMDGPU/gfx7_vdst128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c18652e88419be12651718543f973ddc3a284af9
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vdst128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vdst128_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vdst32_0.rst b/docs/AMDGPU/gfx7_vdst32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e93620355e6d77c865921d29006d1f300a6ae30e
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vdst32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vdst32_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vdst64_0.rst b/docs/AMDGPU/gfx7_vdst64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4cacaa0d63f10897f32ab4d620b15b363c0ae969
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vdst64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vdst64_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vdst96_0.rst b/docs/AMDGPU/gfx7_vdst96_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3c5bf880ce99143146fea552d3e54dc7028e436a
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vdst96_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vdst96_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vsrc128_0.rst b/docs/AMDGPU/gfx7_vsrc128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..975237916bb6c6c8cde84d2b1060851f3185c48d
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vsrc128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vsrc128_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vsrc32_0.rst b/docs/AMDGPU/gfx7_vsrc32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..93b12b00a1c74f1fbe198d87a668bf628a011354
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vsrc32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vsrc32_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_vsrc64_0.rst b/docs/AMDGPU/gfx7_vsrc64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d8c9d45ddb5fa43f73031e3a6b334c95e661a96a
--- /dev/null
+++ b/docs/AMDGPU/gfx7_vsrc64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_vsrc64_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx7_waitcnt.rst b/docs/AMDGPU/gfx7_waitcnt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3f5e07d164934ea3e92177d6c81d921e32c147d4
--- /dev/null
+++ b/docs/AMDGPU/gfx7_waitcnt.rst
@@ -0,0 +1,55 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid7_waitcnt:
+
+waitcnt
+===========================
+
+Counts of outstanding instructions to wait for.
+
+The bits of this operand have the following meaning:
+
+    ============ ======================================================
+    Bits         Description
+    ============ ======================================================
+    3:0          VM_CNT: vector memory operations count.
+    6:4          EXP_CNT: export count.
+    12:8         LGKM_CNT: LDS, GDS, Constant and Message count.
+    ============ ======================================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>`
+or as a combination of the following symbolic helpers:
+
+    ====================== ======================================================================
+    Syntax                 Description
+    ====================== ======================================================================
+    vmcnt(<*N*>)           VM_CNT value. *N* must not exceed the largest VM_CNT value.
+    expcnt(<*N*>)          EXP_CNT value. *N* must not exceed the largest EXP_CNT value.
+    lgkmcnt(<*N*>)         LGKM_CNT value. *N* must not exceed the largest LGKM_CNT value.
+    vmcnt_sat(<*N*>)       VM_CNT value computed as min(*N*, the largest VM_CNT value).
+    expcnt_sat(<*N*>)      EXP_CNT value computed as min(*N*, the largest EXP_CNT value).
+    lgkmcnt_sat(<*N*>)     LGKM_CNT value computed as min(*N*, the largest LGKM_CNT value).
+    ====================== ======================================================================
+
+These helpers may be specified in any order. Ampersands and commas may be used as optional separators.
+
+*N* is either an
+:ref:`integer number<amdgpu_synid_integer_number>` or an
+:ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+
+Examples:
+
+.. parsed-literal::
+
+    s_waitcnt 0
+    s_waitcnt vmcnt(1)
+    s_waitcnt expcnt(2) lgkmcnt(3)
+    s_waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+    s_waitcnt vmcnt(1), expcnt(2), lgkmcnt(3)
+    s_waitcnt vmcnt(1) & lgkmcnt_sat(100) & expcnt(2)
+
diff --git a/docs/AMDGPU/gfx8_addr_buf.rst b/docs/AMDGPU/gfx8_addr_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..74aa275c19973bcc19576524b00400897a2ad40e
--- /dev/null
+++ b/docs/AMDGPU/gfx8_addr_buf.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_addr_buf:
+
+vaddr
+===========================
+
+This is an optional operand which may specify offset and/or index.
+
+*Size:* 0, 1 or 2 dwords. Size is controlled by modifiers :ref:`offen<amdgpu_synid_offen>` and :ref:`idxen<amdgpu_synid_idxen>`:
+
+* If only :ref:`idxen<amdgpu_synid_idxen>` is specified, this operand supplies an index. Size is 1 dword.
+* If only :ref:`offen<amdgpu_synid_offen>` is specified, this operand supplies an offset. Size is 1 dword.
+* If both modifiers are specified, index is in the first register and offset is in the second. Size is 2 dwords.
+* If none of these modifiers are specified, this operand must be set to :ref:`off<amdgpu_synid_off>`.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx8_addr_ds.rst b/docs/AMDGPU/gfx8_addr_ds.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7115ff0cf04337c4284e64cf73b024b47ce1d0e2
--- /dev/null
+++ b/docs/AMDGPU/gfx8_addr_ds.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_addr_ds:
+
+vaddr
+===========================
+
+An offset from the start of GDS/LDS memory.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_addr_flat.rst b/docs/AMDGPU/gfx8_addr_flat.rst
new file mode 100644
index 0000000000000000000000000000000000000000..53dfcc3e895094150ac1a0f4664d33ee1c0e35be
--- /dev/null
+++ b/docs/AMDGPU/gfx8_addr_flat.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_addr_flat:
+
+vaddr
+===========================
+
+A 64-bit flat address.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_addr_mimg.rst b/docs/AMDGPU/gfx8_addr_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f1052badacdac84c13926d2e1a92ba2918b663a2
--- /dev/null
+++ b/docs/AMDGPU/gfx8_addr_mimg.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_addr_mimg:
+
+vaddr
+===========================
+
+Image address which includes from one to four dimensional coordinates and other data used to locate a position in the image.
+
+*Size:* 1, 2, 3, 4, 8 or 16 dwords. Actual size depends on opcode and specific image being handled.
+
+    Note 1. Image format and dimensions are encoded in the image resource constant but not in the instruction.
+
+    Note 2. Actually image address size may vary from 1 to 13 dwords, but assembler currently supports a limited range of register sequences.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_attr.rst b/docs/AMDGPU/gfx8_attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..12fa2cde842271f6cfddf088a9943a622120c89f
--- /dev/null
+++ b/docs/AMDGPU/gfx8_attr.rst
@@ -0,0 +1,30 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_attr:
+
+attr
+===========================
+
+Interpolation attribute and channel:
+
+    ============== ===================================
+    Syntax         Description
+    ============== ===================================
+    attr{0..32}.x  Attribute 0..32 with *x* channel.
+    attr{0..32}.y  Attribute 0..32 with *y* channel.
+    attr{0..32}.z  Attribute 0..32 with *z* channel.
+    attr{0..32}.w  Attribute 0..32 with *w* channel.
+    ============== ===================================
+
+Examples:
+
+.. parsed-literal::
+
+    v_interp_p1_f32 v1, v0, attr0.x
+    v_interp_p1_f32 v1, v0, attr32.w
+
diff --git a/docs/AMDGPU/gfx8_base_smem_addr.rst b/docs/AMDGPU/gfx8_base_smem_addr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..81ef25586d6e1f0a82ac741a3dc3f7d03a3286e8
--- /dev/null
+++ b/docs/AMDGPU/gfx8_base_smem_addr.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_base_smem_addr:
+
+sbase
+===========================
+
+A 64-bit base address for scalar memory operations.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx8_base_smem_buf.rst b/docs/AMDGPU/gfx8_base_smem_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fb243d0f262f8ef8b98844e1bbaa8a246a168214
--- /dev/null
+++ b/docs/AMDGPU/gfx8_base_smem_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_base_smem_buf:
+
+sbase
+===========================
+
+A 128-bit buffer resource constant for scalar memory operations which provides a base address, a size and a stride.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx8_bimm16.rst b/docs/AMDGPU/gfx8_bimm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed50e5582329ac480b5ea62849250a4812d4a351
--- /dev/null
+++ b/docs/AMDGPU/gfx8_bimm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_bimm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits.
+
diff --git a/docs/AMDGPU/gfx8_bimm32.rst b/docs/AMDGPU/gfx8_bimm32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d03c27b1732b8267c7931c67eca75f8e60f1d3a1
--- /dev/null
+++ b/docs/AMDGPU/gfx8_bimm32.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_bimm32:
+
+imm32
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 32 bits.
+
diff --git a/docs/AMDGPU/gfx8_data_buf_atomic128.rst b/docs/AMDGPU/gfx8_data_buf_atomic128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..40b6d3a236e86c09503048f4c418adfb524e9ef3
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_buf_atomic128.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_buf_atomic128:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 4 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_buf_atomic32.rst b/docs/AMDGPU/gfx8_data_buf_atomic32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..51121820d740423e762ade9731c14130cc421106
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_buf_atomic32.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_buf_atomic32:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 1 dword by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_buf_atomic64.rst b/docs/AMDGPU/gfx8_data_buf_atomic64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..107998e37d9317c0d0ea6f4b9bfba6eba49aa4b1
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_buf_atomic64.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_buf_atomic64:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 2 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_buf_d16_128.rst b/docs/AMDGPU/gfx8_data_buf_d16_128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3c98a360252e34b71715cdc1fbd77ea667eea404
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_buf_d16_128.rst
@@ -0,0 +1,20 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_buf_d16_128:
+
+vdata
+===========================
+
+16-bit data to store by a buffer instruction.
+
+*Size:* depends on GFX8 GPU revision:
+
+* 4 dwords for GFX8.0. This H/W supports no packing.
+* 2 dwords for GFX8.1+. This H/W supports data packing.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_buf_d16_32.rst b/docs/AMDGPU/gfx8_data_buf_d16_32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..64328756f84c10335aee0edcb23aa722a6b65205
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_buf_d16_32.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_buf_d16_32:
+
+vdata
+===========================
+
+16-bit data to store by a buffer instruction.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_buf_d16_64.rst b/docs/AMDGPU/gfx8_data_buf_d16_64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..932ce6920fca17a18b848407e8316b7982cf1dd8
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_buf_d16_64.rst
@@ -0,0 +1,20 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_buf_d16_64:
+
+vdata
+===========================
+
+16-bit data to store by a buffer instruction.
+
+*Size:* depends on GFX8 GPU revision:
+
+* 2 dwords for GFX8.0. This H/W supports no packing.
+* 1 dword for GFX8.1+. This H/W supports data packing.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_buf_d16_96.rst b/docs/AMDGPU/gfx8_data_buf_d16_96.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b9e6915229fdc404d3871ff7e2ecb2217121dab9
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_buf_d16_96.rst
@@ -0,0 +1,20 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_buf_d16_96:
+
+vdata
+===========================
+
+16-bit data to store by a buffer instruction.
+
+*Size:* depends on GFX8 GPU revision:
+
+* 3 dwords for GFX8.0. This H/W supports no packing.
+* 2 dwords for GFX8.1+. This H/W supports data packing.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_mimg_atomic_cmp.rst b/docs/AMDGPU/gfx8_data_mimg_atomic_cmp.rst
new file mode 100644
index 0000000000000000000000000000000000000000..80222ead81a83a6f5f84e2011205db89d88f13aa
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_mimg_atomic_cmp.rst
@@ -0,0 +1,27 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_mimg_atomic_cmp:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify 2 data elements for 32-bit-per-pixel surfaces or 4 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+  Note. The surface data format is indicated in the image resource constant but not in the instruction.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_mimg_atomic_reg.rst b/docs/AMDGPU/gfx8_data_mimg_atomic_reg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8baf9269b887b303abb58040dc218dc51fd8535e
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_mimg_atomic_reg.rst
@@ -0,0 +1,26 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_mimg_atomic_reg:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify 1 data element for 32-bit-per-pixel surfaces or 2 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+  Note. The surface data format is indicated in the image resource constant but not in the instruction.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_mimg_store.rst b/docs/AMDGPU/gfx8_data_mimg_store.rst
new file mode 100644
index 0000000000000000000000000000000000000000..65a1a49f67e5d7123bcd4f599f74cf0534c01574
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_mimg_store.rst
@@ -0,0 +1,18 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_mimg_store:
+
+vdata
+===========================
+
+Image data to store by an *image_store* instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` which may specify from 1 to 4 data elements. Each data element occupies 1 dword.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_data_mimg_store_d16.rst b/docs/AMDGPU/gfx8_data_mimg_store_d16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7524c884e7650c977fc0ca4cc59014a7d1186bad
--- /dev/null
+++ b/docs/AMDGPU/gfx8_data_mimg_store_d16.rst
@@ -0,0 +1,24 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_data_mimg_store_d16:
+
+vdata
+===========================
+
+Image data to store by an *image_store* instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`d16<amdgpu_synid_d16>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify from 1 to 4 data elements. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16<amdgpu_synid_d16>`.
+* :ref:`d16<amdgpu_synid_d16>` has different meaning for GFX8.0 and GFX8.1:
+
+  * For GFX8.0 this modifier does not affect size of data elements in registers. Data in registers are stored in low 16 bits, high 16 bits are unused. There is no packing.
+  * Starting from GFX8.1 this modifier specifies that data elements in registers are packed; each value occupies 16 bits.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_buf_128.rst b/docs/AMDGPU/gfx8_dst_buf_128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d076c7025f63d23ac256f22888975446e23ee0b7
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_buf_128.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_buf_128:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 4 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_buf_64.rst b/docs/AMDGPU/gfx8_dst_buf_64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f5da65bf57377de599b8e41eaf9317eb9cbe4f81
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_buf_64.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_buf_64:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 2 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_buf_96.rst b/docs/AMDGPU/gfx8_dst_buf_96.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0012c1a88d30d625f21ecfa0bfed9eeb63d3e1f3
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_buf_96.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_buf_96:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 3 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_buf_d16_128.rst b/docs/AMDGPU/gfx8_dst_buf_d16_128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0f5318d8d8ce863d5c3a961a9e9c31f1f3079525
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_buf_d16_128.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_buf_d16_128:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer and converted to a 16-bit format.
+
+*Size:* depends on GFX8 GPU revision and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* 4 dwords for GFX8.0. This H/W supports no packing.
+* 2 dwords for GFX8.1+. This H/W supports data packing.
+* :ref:`tfe<amdgpu_synid_tfe>` adds one dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_buf_d16_32.rst b/docs/AMDGPU/gfx8_dst_buf_d16_32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6288c2def70aafaa07e1a019381c549cd1c49cb1
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_buf_d16_32.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_buf_d16_32:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer and converted to a 16-bit format.
+
+*Size:* 1 dword by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_buf_d16_64.rst b/docs/AMDGPU/gfx8_dst_buf_d16_64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b46310fd5241ac3d0d520da8cbcb61214a24f8c8
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_buf_d16_64.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_buf_d16_64:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer and converted to a 16-bit format.
+
+*Size:* depends on GFX8 GPU revision and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* 2 dwords for GFX8.0. This H/W supports no packing.
+* 1 dword for GFX8.1+. This H/W supports data packing.
+* :ref:`tfe<amdgpu_synid_tfe>` adds one dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_buf_d16_96.rst b/docs/AMDGPU/gfx8_dst_buf_d16_96.rst
new file mode 100644
index 0000000000000000000000000000000000000000..15e7e8901970a43e3aedb114bf8db58be06876fc
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_buf_d16_96.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_buf_d16_96:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer and converted to a 16-bit format.
+
+*Size:* depends on GFX8 GPU revision and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* 3 dwords for GFX8.0. This H/W supports no packing.
+* 2 dwords for GFX8.1+. This H/W supports data packing.
+* :ref:`tfe<amdgpu_synid_tfe>` adds one dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_buf_lds.rst b/docs/AMDGPU/gfx8_dst_buf_lds.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b1cb1458dfac3a79ed62370cba80a8c9e47e6e64
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_buf_lds.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_buf_lds:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+If :ref:`lds<amdgpu_synid_lds>` is specified, this operand is ignored by H/W and data are stored directly into LDS.
+
+*Size:* 1 dword by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+    Note that :ref:`tfe<amdgpu_synid_tfe>` and :ref:`lds<amdgpu_synid_lds>` cannot be used together.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_flat_atomic32.rst b/docs/AMDGPU/gfx8_dst_flat_atomic32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a8ae4646ae98b350e64c791d4319fbc31b5580d7
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_flat_atomic32.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_flat_atomic32:
+
+vdst
+===========================
+
+Data returned by a 32-bit atomic flat instruction.
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_flat_atomic64.rst b/docs/AMDGPU/gfx8_dst_flat_atomic64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5b46e88f5610003073fef917c6880c40a52b50e6
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_flat_atomic64.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_flat_atomic64:
+
+vdst
+===========================
+
+Data returned by a 64-bit atomic flat instruction.
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_mimg_gather4.rst b/docs/AMDGPU/gfx8_dst_mimg_gather4.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6fc01926ed176ae8a3f4c8ad81dcdbc48a89d1a8
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_mimg_gather4.rst
@@ -0,0 +1,26 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_mimg_gather4:
+
+vdst
+===========================
+
+Image data to load by an *image_gather4* instruction.
+
+*Size:* 4 data elements by default. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16<amdgpu_synid_d16>`.
+
+:ref:`d16<amdgpu_synid_d16>` and :ref:`tfe<amdgpu_synid_tfe>` affect operand size as follows:
+
+* :ref:`d16<amdgpu_synid_d16>` has different meaning for GFX8.0 and GFX8.1:
+
+  * For GFX8.0 this modifier does not affect size of data elements in registers. Data in registers are stored in low 16 bits, high 16 bits are unused. There is no packing.
+  * Starting from GFX8.1 this modifier specifies that data elements in registers are packed; each value occupies 16 bits.
+
+* :ref:`tfe<amdgpu_synid_tfe>` adds one dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_mimg_regular.rst b/docs/AMDGPU/gfx8_dst_mimg_regular.rst
new file mode 100644
index 0000000000000000000000000000000000000000..be1037a5ce5ae19c0e2ef3f3e7f692291c0913c7
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_mimg_regular.rst
@@ -0,0 +1,20 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_mimg_regular:
+
+vdst
+===========================
+
+Image data to load by an image instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify from 1 to 4 data elements. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_dst_mimg_regular_d16.rst b/docs/AMDGPU/gfx8_dst_mimg_regular_d16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4eb7037386f10abb64912e05a4418d8e74f8f877
--- /dev/null
+++ b/docs/AMDGPU/gfx8_dst_mimg_regular_d16.rst
@@ -0,0 +1,26 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_dst_mimg_regular_d16:
+
+vdst
+===========================
+
+Image data to load by an image instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>`, :ref:`tfe<amdgpu_synid_tfe>` and :ref:`d16<amdgpu_synid_d16>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify from 1 to 4 data elements. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16<amdgpu_synid_d16>`.
+* :ref:`d16<amdgpu_synid_d16>` has different meaning for GFX8.0 and GFX8.1:
+
+  * For GFX8.0 this modifier does not affect size of data elements in registers. Data in registers are stored in low 16 bits, high 16 bits are unused. There is no packing.
+  * Starting from GFX8.1 this modifier specifies that data elements in registers are packed; each value occupies 16 bits.
+
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_fimm16.rst b/docs/AMDGPU/gfx8_fimm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5e387f5cb77464b33e77f38052ee8c6722fdb2a4
--- /dev/null
+++ b/docs/AMDGPU/gfx8_fimm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_fimm16:
+
+imm32
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>` or a :ref:`floating-point_number<amdgpu_synid_floating-point_number>`. The number is converted to *f16* as described :ref:`here<amdgpu_synid_lit_conv>`.
+
diff --git a/docs/AMDGPU/gfx8_fimm32.rst b/docs/AMDGPU/gfx8_fimm32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e29e7704b8aa5657aee16a42a45ea0651e3b5a88
--- /dev/null
+++ b/docs/AMDGPU/gfx8_fimm32.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_fimm32:
+
+imm32
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>` or a :ref:`floating-point_number<amdgpu_synid_floating-point_number>`. The value is converted to *f32* as described :ref:`here<amdgpu_synid_lit_conv>`.
+
diff --git a/docs/AMDGPU/gfx8_hwreg.rst b/docs/AMDGPU/gfx8_hwreg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ffa1ea5afde3dd946300976a44e6575fe281f45c
--- /dev/null
+++ b/docs/AMDGPU/gfx8_hwreg.rst
@@ -0,0 +1,60 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_hwreg:
+
+hwreg
+===========================
+
+Bits of a hardware register being accessed.
+
+The bits of this operand have the following meaning:
+
+    ============ ===================================
+    Bits         Description
+    ============ ===================================
+    5:0          Register *id*.
+    10:6         First bit *offset* (0..31).
+    15:11        *Size* in bits (1..32).
+    ============ ===================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>` or using the syntax described below.
+
+    ==================================== ============================================================================
+    Syntax                               Description
+    ==================================== ============================================================================
+    hwreg({0..63})                       All bits of a register indicated by its *id*.
+    hwreg(<*name*>)                      All bits of a register indicated by its *name*.
+    hwreg({0..63}, {0..31}, {1..32})     Register bits indicated by register *id*, first bit *offset* and *size*.
+    hwreg(<*name*>, {0..31}, {1..32})    Register bits indicated by register *name*, first bit *offset* and *size*.
+    ==================================== ============================================================================
+
+Register *id*, *offset* and *size* must be specified as positive :ref:`integer numbers<amdgpu_synid_integer_number>`.
+
+Defined register *names* include:
+
+    =================== ==========================================
+    Name                Description
+    =================== ==========================================
+    HW_REG_MODE         Shader writeable mode bits.
+    HW_REG_STATUS       Shader read-only status.
+    HW_REG_TRAPSTS      Trap status.
+    HW_REG_HW_ID        Id of wave, simd, compute unit, etc.
+    HW_REG_GPR_ALLOC    Per-wave SGPR and VGPR allocation.
+    HW_REG_LDS_ALLOC    Per-wave LDS allocation.
+    HW_REG_IB_STS       Counters of outstanding instructions.
+    =================== ==========================================
+
+Examples:
+
+.. parsed-literal::
+
+    s_getreg_b32 s2, 0x6
+    s_getreg_b32 s2, hwreg(15)
+    s_getreg_b32 s2, hwreg(51, 1, 31)
+    s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1)
+
diff --git a/docs/AMDGPU/gfx8_imm4.rst b/docs/AMDGPU/gfx8_imm4.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a03de76e86efbaa3daf0cd7d0a2fdda68e6d403a
--- /dev/null
+++ b/docs/AMDGPU/gfx8_imm4.rst
@@ -0,0 +1,25 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_imm4:
+
+imm4
+===========================
+
+A positive :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 4 bits.
+
+This operand is a mask which controls indexing mode for operands of subsequent instructions. Value 1 enables indexing and value 0 disables it.
+
+    ============ ========================================
+    Bit          Meaning
+    ============ ========================================
+    0            Enables or disables *src0* indexing.
+    1            Enables or disables *src1* indexing.
+    2            Enables or disables *src2* indexing.
+    3            Enables or disables *dst* indexing.
+    ============ ========================================
+
diff --git a/docs/AMDGPU/gfx8_label.rst b/docs/AMDGPU/gfx8_label.rst
new file mode 100644
index 0000000000000000000000000000000000000000..99e384ee392e86961045828cbeacf30550c138ad
--- /dev/null
+++ b/docs/AMDGPU/gfx8_label.rst
@@ -0,0 +1,30 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_label:
+
+label
+===========================
+
+A branch target which is a 16-bit signed integer treated as a PC-relative dword offset.
+
+This operand may be specified as:
+
+* An :ref:`integer_number<amdgpu_synid_integer_number>`. The number is truncated to 16 bits.
+* An :ref:`absolute_expression<amdgpu_synid_absolute_expression>` which must start with an :ref:`integer_number<amdgpu_synid_integer_number>`. The value of the expression is truncated to 16 bits.
+* A :ref:`symbol<amdgpu_synid_symbol>` (for example, a label). The value is handled as a 16-bit PC-relative dword offset to be resolved by a linker.
+
+Examples:
+
+.. parsed-literal::
+
+  offset = 30
+  s_branch loop_end
+  s_branch 2 + offset
+  s_branch 32
+  loop_end:
+
diff --git a/docs/AMDGPU/gfx8_mod_dpp_sdwa_abs_neg.rst b/docs/AMDGPU/gfx8_mod_dpp_sdwa_abs_neg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..be7b4b511b177754179154a2a63cf2ad0428e901
--- /dev/null
+++ b/docs/AMDGPU/gfx8_mod_dpp_sdwa_abs_neg.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_mod_dpp_sdwa_abs_neg:
+
+m
+===========================
+
+This operand may be used with floating point operand modifiers :ref:`abs<amdgpu_synid_abs>` and :ref:`neg<amdgpu_synid_neg>`.
+
diff --git a/docs/AMDGPU/gfx8_mod_sdwa_sext.rst b/docs/AMDGPU/gfx8_mod_sdwa_sext.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b48c521f3b394bfcf54a264d2af877176f01d53f
--- /dev/null
+++ b/docs/AMDGPU/gfx8_mod_sdwa_sext.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_mod_sdwa_sext:
+
+m
+===========================
+
+This operand may be used with integer operand modifier :ref:`sext<amdgpu_synid_sext>`.
+
diff --git a/docs/AMDGPU/gfx8_mod_vop3_abs_neg.rst b/docs/AMDGPU/gfx8_mod_vop3_abs_neg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..960e8b1f38f1309b111135d1628b3a4a5b1d9855
--- /dev/null
+++ b/docs/AMDGPU/gfx8_mod_vop3_abs_neg.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_mod_vop3_abs_neg:
+
+m
+===========================
+
+This operand may be used with floating point operand modifiers :ref:`abs<amdgpu_synid_abs>` and :ref:`neg<amdgpu_synid_neg>`.
+
diff --git a/docs/AMDGPU/gfx8_msg.rst b/docs/AMDGPU/gfx8_msg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..313d8e68b4b80a88793f515199105fff3a7e3e6a
--- /dev/null
+++ b/docs/AMDGPU/gfx8_msg.rst
@@ -0,0 +1,72 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_msg:
+
+msg
+===========================
+
+A 16-bit message code. The bits of this operand have the following meaning:
+
+    ============ ======================================================
+    Bits         Description
+    ============ ======================================================
+    3:0          Message *type*.
+    6:4          Optional *operation*.
+    9:7          Optional *parameters*.
+    15:10        Unused.
+    ============ ======================================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>` or using the syntax described below:
+
+    ======================================== ========================================================================
+    Syntax                                   Description
+    ======================================== ========================================================================
+    sendmsg(<*type*>)                        A message identified by its *type*.
+    sendmsg(<*type*>, <*op*>)                A message identified by its *type* and *operation*.
+    sendmsg(<*type*>, <*op*>, <*stream*>)    A message identified by its *type* and *operation* with a stream *id*.
+    ======================================== ========================================================================
+
+*Type* may be specified using message *name* or message *id*.
+
+*Op* may be specified using operation *name* or operation *id*.
+
+Stream *id* is an integer in the range 0..3.
+
+Message *id*, operation *id* and stream *id* must be specified as positive :ref:`integer numbers<amdgpu_synid_integer_number>`.
+
+Each message type supports specific operations:
+
+    ================= ========== ============================== ============ ==========
+    Message name      Message Id Supported Operations           Operation Id Stream Id
+    ================= ========== ============================== ============ ==========
+    MSG_INTERRUPT     1          \-                             \-           \-
+    MSG_GS            2          GS_OP_CUT                      1            Optional
+    \                            GS_OP_EMIT                     2            Optional
+    \                            GS_OP_EMIT_CUT                 3            Optional
+    MSG_GS_DONE       3          GS_OP_NOP                      0            \-
+    \                            GS_OP_CUT                      1            Optional
+    \                            GS_OP_EMIT                     2            Optional
+    \                            GS_OP_EMIT_CUT                 3            Optional
+    MSG_SYSMSG        15         SYSMSG_OP_ECC_ERR_INTERRUPT    1            \-
+    \                            SYSMSG_OP_REG_RD               2            \-
+    \                            SYSMSG_OP_HOST_TRAP_ACK        3            \-
+    \                            SYSMSG_OP_TTRACE_PC            4            \-
+    ================= ========== ============================== ============ ==========
+
+Examples:
+
+.. parsed-literal::
+
+    s_sendmsg 0x12
+    s_sendmsg sendmsg(MSG_INTERRUPT)
+    s_sendmsg sendmsg(2, GS_OP_CUT)
+    s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT)
+    s_sendmsg sendmsg(MSG_GS, 2)
+    s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_EMIT_CUT, 1)
+    s_sendmsg sendmsg(MSG_SYSMSG, SYSMSG_OP_TTRACE_PC)
+
diff --git a/docs/AMDGPU/gfx8_offset_buf.rst b/docs/AMDGPU/gfx8_offset_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..42c452416b5be4d3ac93ff3a3fb98a6b8f59351c
--- /dev/null
+++ b/docs/AMDGPU/gfx8_offset_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_offset_buf:
+
+soffset
+===========================
+
+An unsigned byte offset.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx8_offset_smem_load.rst b/docs/AMDGPU/gfx8_offset_smem_load.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5c30a87f569c07589edd5f51ea1b126d2e95746e
--- /dev/null
+++ b/docs/AMDGPU/gfx8_offset_smem_load.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_offset_smem_load:
+
+soffset
+===========================
+
+An unsigned byte offset added to the base address to get memory address.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`uimm20<amdgpu_synid_uimm20>`
diff --git a/docs/AMDGPU/gfx8_offset_smem_store.rst b/docs/AMDGPU/gfx8_offset_smem_store.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9ff90f98b35463a0ebf4caf3054e6004e8da7ca4
--- /dev/null
+++ b/docs/AMDGPU/gfx8_offset_smem_store.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_offset_smem_store:
+
+soffset
+===========================
+
+An unsigned byte offset added to the base address to get memory address.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`m0<amdgpu_synid_m0>`, :ref:`uimm20<amdgpu_synid_uimm20>`
diff --git a/docs/AMDGPU/gfx8_opt.rst b/docs/AMDGPU/gfx8_opt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..417d7fa1ff38bf9327ad16b68798de1d2aa96be0
--- /dev/null
+++ b/docs/AMDGPU/gfx8_opt.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_opt:
+
+opt
+===========================
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
diff --git a/docs/AMDGPU/gfx8_param.rst b/docs/AMDGPU/gfx8_param.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0bd88549f0923552f66aa9f97f7a36f69c0cd9d3
--- /dev/null
+++ b/docs/AMDGPU/gfx8_param.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_param:
+
+param
+===========================
+
+Interpolation parameter to read:
+
+    ============ ===================================
+    Syntax       Description
+    ============ ===================================
+    p0           Parameter *P0*.
+    p10          Parameter *P10*.
+    p20          Parameter *P20*.
+    ============ ===================================
+
diff --git a/docs/AMDGPU/gfx8_perm_smem.rst b/docs/AMDGPU/gfx8_perm_smem.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0035ac821a7248c6ff5777a02ec9656a7408be78
--- /dev/null
+++ b/docs/AMDGPU/gfx8_perm_smem.rst
@@ -0,0 +1,24 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_perm_smem:
+
+imm3
+===========================
+
+A bit mask which indicates request permissions.
+
+This operand must be specified as an :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 7 bits, but only 3 low bits are significant.
+
+    ============ ==============================
+    Bit Number   Description
+    ============ ==============================
+    0            Request *read* permission.
+    1            Request *write* permission.
+    2            Request *execute* permission.
+    ============ ==============================
+
diff --git a/docs/AMDGPU/gfx8_ret.rst b/docs/AMDGPU/gfx8_ret.rst
new file mode 100644
index 0000000000000000000000000000000000000000..91fdaf378a1d5ce47f609343454814929761887b
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ret.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ret:
+
+dst
+===========================
+
+This is an input operand. It may optionally serve as a destination if :ref:`glc<amdgpu_synid_glc>` is specified.
+
diff --git a/docs/AMDGPU/gfx8_rsrc_buf.rst b/docs/AMDGPU/gfx8_rsrc_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ecdb0a0d9dcf5063ea12b98528e739b5431d67d4
--- /dev/null
+++ b/docs/AMDGPU/gfx8_rsrc_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_rsrc_buf:
+
+srsrc
+===========================
+
+Buffer resource constant which defines the address and characteristics of the buffer in memory.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx8_rsrc_mimg.rst b/docs/AMDGPU/gfx8_rsrc_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3ca255928983f100f3aa57eaa5f23f970e9e5a0c
--- /dev/null
+++ b/docs/AMDGPU/gfx8_rsrc_mimg.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_rsrc_mimg:
+
+srsrc
+===========================
+
+Image resource constant which defines the location of the image buffer in memory, its dimensions, tiling, and data format.
+
+*Size:* 8 dwords by default, 4 dwords if :ref:`r128<amdgpu_synid_r128>` is specified.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx8_samp_mimg.rst b/docs/AMDGPU/gfx8_samp_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c4b27125768d8488cd57494b7d8b6e31b9341042
--- /dev/null
+++ b/docs/AMDGPU/gfx8_samp_mimg.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_samp_mimg:
+
+ssamp
+===========================
+
+Sampler constant used to specify filtering options applied to the image data after it is read.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx8_sdata128_0.rst b/docs/AMDGPU/gfx8_sdata128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a52703c3545ad1d69d5cf90f2d87e820e1abafd0
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdata128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdata128_0:
+
+sdata
+===========================
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx8_sdata32_0.rst b/docs/AMDGPU/gfx8_sdata32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9ccd7bd8c7136d7673cf227b8b60cc55e8d3faf9
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdata32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdata32_0:
+
+sdata
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx8_sdata64_0.rst b/docs/AMDGPU/gfx8_sdata64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8718449228c7df64fd4e4611123b5dcbafc845a3
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdata64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdata64_0:
+
+sdata
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx8_sdst128_0.rst b/docs/AMDGPU/gfx8_sdst128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..277e3db02c410f9a8e3ac5d6561f3dc39d357257
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdst128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdst128_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx8_sdst256_0.rst b/docs/AMDGPU/gfx8_sdst256_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e54b9b1462c8bfa3ec4148f76ee9b0f164ec6fe
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdst256_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdst256_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx8_sdst32_0.rst b/docs/AMDGPU/gfx8_sdst32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..44e6cdc8f7f5473727532d0af791205d841f5aae
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdst32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdst32_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx8_sdst32_1.rst b/docs/AMDGPU/gfx8_sdst32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7156225fa3c4556aad784b1ddbac69804092cf16
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdst32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdst32_1:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx8_sdst32_2.rst b/docs/AMDGPU/gfx8_sdst32_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..af446d3e54ce7868a0c09c8f7ca15492a2024e51
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdst32_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdst32_2:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx8_sdst512_0.rst b/docs/AMDGPU/gfx8_sdst512_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..95b82a7d024988fdbf5f4c7137768612f7521ffc
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdst512_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdst512_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 16 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`
diff --git a/docs/AMDGPU/gfx8_sdst64_0.rst b/docs/AMDGPU/gfx8_sdst64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9195778a5e82b3385dceaa54c9b3cc2c41a96e6a
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdst64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdst64_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx8_sdst64_1.rst b/docs/AMDGPU/gfx8_sdst64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..165e0c0175f93a42287140372f08a5caf7cebec2
--- /dev/null
+++ b/docs/AMDGPU/gfx8_sdst64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_sdst64_1:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx8_simm16.rst b/docs/AMDGPU/gfx8_simm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..730f239b6bea705047f01439d1ff3f08590574b9
--- /dev/null
+++ b/docs/AMDGPU/gfx8_simm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_simm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits and then sign-extended to 32 bits.
+
diff --git a/docs/AMDGPU/gfx8_src32_0.rst b/docs/AMDGPU/gfx8_src32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a9c11fea8e3b59338efd0d05151261fd7516f13a
--- /dev/null
+++ b/docs/AMDGPU/gfx8_src32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_src32_0:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx8_src32_1.rst b/docs/AMDGPU/gfx8_src32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..67dcdc8ed5e9bef0cbd6605cac1ee618d2fb1f4e
--- /dev/null
+++ b/docs/AMDGPU/gfx8_src32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_src32_1:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx8_src64_0.rst b/docs/AMDGPU/gfx8_src64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..573fd68b2a1176744fccd054bab8956ca82b6695
--- /dev/null
+++ b/docs/AMDGPU/gfx8_src64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_src64_0:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx8_src64_1.rst b/docs/AMDGPU/gfx8_src64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d2c78b754d7efb3e1223d86687244b888f4d55f7
--- /dev/null
+++ b/docs/AMDGPU/gfx8_src64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_src64_1:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx8_src_exp.rst b/docs/AMDGPU/gfx8_src_exp.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10449b4e36e845daec37ce6dd8366ac49797f9e7
--- /dev/null
+++ b/docs/AMDGPU/gfx8_src_exp.rst
@@ -0,0 +1,28 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_src_exp:
+
+vsrc
+===========================
+
+Data to copy to export buffers. This is an optional operand. Must be specified as :ref:`off<amdgpu_synid_off>` if not used.
+
+:ref:`compr<amdgpu_synid_compr>` modifier indicates use of compressed (16-bit) data. This limits number of source operands from 4 to 2:
+
+* src0 and src1 must specify the first register (or :ref:`off<amdgpu_synid_off>`).
+* src2 and src3 must specify the second register (or :ref:`off<amdgpu_synid_off>`).
+
+An example:
+
+.. parsed-literal::
+
+  exp mrtz v3, v3, off, off compr
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx8_ssrc32_0.rst b/docs/AMDGPU/gfx8_ssrc32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..82d18b1585375d2861a7d27bb8ba010a3bdb317e
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc32_0:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx8_ssrc32_1.rst b/docs/AMDGPU/gfx8_ssrc32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..203d9c5397fc80bb9439448d2959eb8681a7def7
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc32_1:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx8_ssrc32_2.rst b/docs/AMDGPU/gfx8_ssrc32_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9b893e962a0ac8434f239d82b26cf63aa63cbefb
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc32_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc32_2:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx8_ssrc32_3.rst b/docs/AMDGPU/gfx8_ssrc32_3.rst
new file mode 100644
index 0000000000000000000000000000000000000000..131765fee506c27063983973f9c781b23c95f44d
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc32_3.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc32_3:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`iconst<amdgpu_synid_iconst>`
diff --git a/docs/AMDGPU/gfx8_ssrc32_4.rst b/docs/AMDGPU/gfx8_ssrc32_4.rst
new file mode 100644
index 0000000000000000000000000000000000000000..02d90a4d1da43a84bad65670a32b654e1cdd304c
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc32_4.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc32_4:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx8_ssrc64_0.rst b/docs/AMDGPU/gfx8_ssrc64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b8389dcdbae14b822a4065ae89aa0c65f4d9e4c5
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc64_0:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx8_ssrc64_1.rst b/docs/AMDGPU/gfx8_ssrc64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c4fddf4ace2b77bd65385f030e7d1f3347c76cc0
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc64_1:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`
diff --git a/docs/AMDGPU/gfx8_ssrc64_2.rst b/docs/AMDGPU/gfx8_ssrc64_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..209dffc8da03b614e9895fbd63b00a16186afea8
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc64_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc64_2:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx8_ssrc64_3.rst b/docs/AMDGPU/gfx8_ssrc64_3.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9ab5436572d10e33d2d71d3250b572540f24d530
--- /dev/null
+++ b/docs/AMDGPU/gfx8_ssrc64_3.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_ssrc64_3:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`trap<amdgpu_synid_trap>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx8_tgt.rst b/docs/AMDGPU/gfx8_tgt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1be54a73269b15d72ddb7209675b6cb957a5fa41
--- /dev/null
+++ b/docs/AMDGPU/gfx8_tgt.rst
@@ -0,0 +1,24 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_tgt:
+
+tgt
+===========================
+
+An export target:
+
+    ============== ===================================
+    Syntax         Description
+    ============== ===================================
+    pos{0..3}      Copy vertex position 0..3.
+    param{0..31}   Copy vertex parameter 0..31.
+    mrt{0..7}      Copy pixel color to the MRTs 0..7.
+    mrtz           Copy pixel depth (Z) data.
+    null           Copy nothing.
+    ============== ===================================
+
diff --git a/docs/AMDGPU/gfx8_type_dev.rst b/docs/AMDGPU/gfx8_type_dev.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2f5b36f9f9b40cedfb0418f8feb31252e20df44b
--- /dev/null
+++ b/docs/AMDGPU/gfx8_type_dev.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_type_dev:
+
+Type deviation
+===========================
+
+*Type* of this operand differs from *type* :ref:`implied by the opcode<amdgpu_syn_instruction_type>`. This tag specifies actual operand *type*.
+
diff --git a/docs/AMDGPU/gfx8_uimm16.rst b/docs/AMDGPU/gfx8_uimm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a20abcc1344867d9d93c722518b16a75d8801d4c
--- /dev/null
+++ b/docs/AMDGPU/gfx8_uimm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_uimm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits and then zero-extended to 32 bits.
+
diff --git a/docs/AMDGPU/gfx8_vcc_64.rst b/docs/AMDGPU/gfx8_vcc_64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e31df0e51c2ee9c0e385cbf78c196f61f34167c3
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vcc_64.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vcc_64:
+
+vcc
+===========================
+
+Vector condition code.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`vcc<amdgpu_synid_vcc>`
diff --git a/docs/AMDGPU/gfx8_vdata128_0.rst b/docs/AMDGPU/gfx8_vdata128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bf7e3db732b1fa1c1666dc06ccdf9d32904dd4ac
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vdata128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vdata128_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vdata32_0.rst b/docs/AMDGPU/gfx8_vdata32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b89d65b1c7931668eee592e7c5b41799e906f1f7
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vdata32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vdata32_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vdata64_0.rst b/docs/AMDGPU/gfx8_vdata64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..438054456060285468b48c9b4d2cff68dcdeba11
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vdata64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vdata64_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vdata96_0.rst b/docs/AMDGPU/gfx8_vdata96_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b8ad22d1f24aa6a7ac9ea129e4dd65760defcefa
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vdata96_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vdata96_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vdst128_0.rst b/docs/AMDGPU/gfx8_vdst128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1eccc95230928e8e5fc9bc44924e052e41295770
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vdst128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vdst128_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vdst32_0.rst b/docs/AMDGPU/gfx8_vdst32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..781fcb6ec35b703c45fc964ed76018f41038afa7
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vdst32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vdst32_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vdst64_0.rst b/docs/AMDGPU/gfx8_vdst64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..af2dfe9bf38c883e4f14390f916749f8f01402e9
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vdst64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vdst64_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vdst96_0.rst b/docs/AMDGPU/gfx8_vdst96_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4895b657900294a360fa09ba01c74246da0c8347
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vdst96_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vdst96_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vsrc128_0.rst b/docs/AMDGPU/gfx8_vsrc128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..25b1794d85f317a6cc7726caa9936e074c635163
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vsrc128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vsrc128_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vsrc32_0.rst b/docs/AMDGPU/gfx8_vsrc32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..524f36a0a8932a995a8d8a9d41fd4f43e6c05afa
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vsrc32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vsrc32_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_vsrc64_0.rst b/docs/AMDGPU/gfx8_vsrc64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7c2c39ff753083255f95dfb05764f0e4b0823275
--- /dev/null
+++ b/docs/AMDGPU/gfx8_vsrc64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_vsrc64_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx8_waitcnt.rst b/docs/AMDGPU/gfx8_waitcnt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4bad594171164d6defd7ded0f4f6ded3a6fa4c5c
--- /dev/null
+++ b/docs/AMDGPU/gfx8_waitcnt.rst
@@ -0,0 +1,55 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid8_waitcnt:
+
+waitcnt
+===========================
+
+Counts of outstanding instructions to wait for.
+
+The bits of this operand have the following meaning:
+
+    ============ ======================================================
+    Bits         Description
+    ============ ======================================================
+    3:0          VM_CNT: vector memory operations count.
+    6:4          EXP_CNT: export count.
+    11:8         LGKM_CNT: LDS, GDS, Constant and Message count.
+    ============ ======================================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>`
+or as a combination of the following symbolic helpers:
+
+    ====================== ======================================================================
+    Syntax                 Description
+    ====================== ======================================================================
+    vmcnt(<*N*>)           VM_CNT value. *N* must not exceed the largest VM_CNT value.
+    expcnt(<*N*>)          EXP_CNT value. *N* must not exceed the largest EXP_CNT value.
+    lgkmcnt(<*N*>)         LGKM_CNT value. *N* must not exceed the largest LGKM_CNT value.
+    vmcnt_sat(<*N*>)       VM_CNT value computed as min(*N*, the largest VM_CNT value).
+    expcnt_sat(<*N*>)      EXP_CNT value computed as min(*N*, the largest EXP_CNT value).
+    lgkmcnt_sat(<*N*>)     LGKM_CNT value computed as min(*N*, the largest LGKM_CNT value).
+    ====================== ======================================================================
+
+These helpers may be specified in any order. Ampersands and commas may be used as optional separators.
+
+*N* is either an
+:ref:`integer number<amdgpu_synid_integer_number>` or an
+:ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+
+Examples:
+
+.. parsed-literal::
+
+    s_waitcnt 0
+    s_waitcnt vmcnt(1)
+    s_waitcnt expcnt(2) lgkmcnt(3)
+    s_waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+    s_waitcnt vmcnt(1), expcnt(2), lgkmcnt(3)
+    s_waitcnt vmcnt(1) & lgkmcnt_sat(100) & expcnt(2)
+
diff --git a/docs/AMDGPU/gfx9_addr_buf.rst b/docs/AMDGPU/gfx9_addr_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c253640fa5fb63a6b26c9feab6cb822a14f8d86e
--- /dev/null
+++ b/docs/AMDGPU/gfx9_addr_buf.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_addr_buf:
+
+vaddr
+===========================
+
+This is an optional operand which may specify offset and/or index.
+
+*Size:* 0, 1 or 2 dwords. Size is controlled by modifiers :ref:`offen<amdgpu_synid_offen>` and :ref:`idxen<amdgpu_synid_idxen>`:
+
+* If only :ref:`idxen<amdgpu_synid_idxen>` is specified, this operand supplies an index. Size is 1 dword.
+* If only :ref:`offen<amdgpu_synid_offen>` is specified, this operand supplies an offset. Size is 1 dword.
+* If both modifiers are specified, index is in the first register and offset is in the second. Size is 2 dwords.
+* If none of these modifiers are specified, this operand must be set to :ref:`off<amdgpu_synid_off>`.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx9_addr_ds.rst b/docs/AMDGPU/gfx9_addr_ds.rst
new file mode 100644
index 0000000000000000000000000000000000000000..11742462b9466940364f9d52efbd4bb104bef6f5
--- /dev/null
+++ b/docs/AMDGPU/gfx9_addr_ds.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_addr_ds:
+
+vaddr
+===========================
+
+An offset from the start of GDS/LDS memory.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_addr_flat.rst b/docs/AMDGPU/gfx9_addr_flat.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c748d07090eccccc40e3d904c8322aaae4349b79
--- /dev/null
+++ b/docs/AMDGPU/gfx9_addr_flat.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_addr_flat:
+
+vaddr
+===========================
+
+A 64-bit flat address.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_addr_mimg.rst b/docs/AMDGPU/gfx9_addr_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..eb6ca882b6afe2d51000572022703328049a8b57
--- /dev/null
+++ b/docs/AMDGPU/gfx9_addr_mimg.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_addr_mimg:
+
+vaddr
+===========================
+
+Image address which includes from one to four dimensional coordinates and other data used to locate a position in the image.
+
+*Size:* 1, 2, 3, 4, 8 or 16 dwords. Actual size depends on opcode, specific image being handled and :ref:`a16<amdgpu_synid_a16>`.
+
+  Note 1. Image format and dimensions are encoded in the image resource constant but not in the instruction.
+
+  Note 2. Actually image address size may vary from 1 to 13 dwords, but assembler currently supports a limited range of register sequences.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_attr.rst b/docs/AMDGPU/gfx9_attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..faffcc7ed18a32baf185d16d52292c252ff72c86
--- /dev/null
+++ b/docs/AMDGPU/gfx9_attr.rst
@@ -0,0 +1,30 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_attr:
+
+attr
+===========================
+
+Interpolation attribute and channel:
+
+    ============== ===================================
+    Syntax         Description
+    ============== ===================================
+    attr{0..32}.x  Attribute 0..32 with *x* channel.
+    attr{0..32}.y  Attribute 0..32 with *y* channel.
+    attr{0..32}.z  Attribute 0..32 with *z* channel.
+    attr{0..32}.w  Attribute 0..32 with *w* channel.
+    ============== ===================================
+
+Examples:
+
+.. parsed-literal::
+
+    v_interp_p1_f32 v1, v0, attr0.x
+    v_interp_p1_f32 v1, v0, attr32.w
+
diff --git a/docs/AMDGPU/gfx9_base_smem_addr.rst b/docs/AMDGPU/gfx9_base_smem_addr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..63c2cbd5fbd275713650270a1062b846424b358e
--- /dev/null
+++ b/docs/AMDGPU/gfx9_base_smem_addr.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_base_smem_addr:
+
+sbase
+===========================
+
+A 64-bit base address for scalar memory operations.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_base_smem_buf.rst b/docs/AMDGPU/gfx9_base_smem_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..191ecbaeabe486fe94c8ae563f80e849a61701ba
--- /dev/null
+++ b/docs/AMDGPU/gfx9_base_smem_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_base_smem_buf:
+
+sbase
+===========================
+
+A 128-bit buffer resource constant for scalar memory operations which provides a base address, a size and a stride.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_base_smem_scratch.rst b/docs/AMDGPU/gfx9_base_smem_scratch.rst
new file mode 100644
index 0000000000000000000000000000000000000000..83fd760d75af2049a4aef42ab436dfccd0b8400f
--- /dev/null
+++ b/docs/AMDGPU/gfx9_base_smem_scratch.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_base_smem_scratch:
+
+sbase
+===========================
+
+This operand is ignored by H/W and :ref:`flat_scratch<amdgpu_synid_flat_scratch>` is supplied instead.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_bimm16.rst b/docs/AMDGPU/gfx9_bimm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2c9dc5c52156183b62a7ef92bef61b26460109b0
--- /dev/null
+++ b/docs/AMDGPU/gfx9_bimm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_bimm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits.
+
diff --git a/docs/AMDGPU/gfx9_bimm32.rst b/docs/AMDGPU/gfx9_bimm32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e9b89674a2ef8c66c36e9211a5041a4261122327
--- /dev/null
+++ b/docs/AMDGPU/gfx9_bimm32.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_bimm32:
+
+imm32
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 32 bits.
+
diff --git a/docs/AMDGPU/gfx9_data_buf_atomic128.rst b/docs/AMDGPU/gfx9_data_buf_atomic128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..11c7c73479d20b74d35714331cafb5d379ec89db
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_buf_atomic128.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_buf_atomic128:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 4 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_data_buf_atomic32.rst b/docs/AMDGPU/gfx9_data_buf_atomic32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7b7d88b7a2d0f8227b6b0c60137c188f6b3e2a8c
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_buf_atomic32.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_buf_atomic32:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 1 dword by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_data_buf_atomic64.rst b/docs/AMDGPU/gfx9_data_buf_atomic64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..71b2b4be2caf405b9906c4f4c21666555365b400
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_buf_atomic64.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_buf_atomic64:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 2 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_data_mimg_atomic_cmp.rst b/docs/AMDGPU/gfx9_data_mimg_atomic_cmp.rst
new file mode 100644
index 0000000000000000000000000000000000000000..08fe297b4f39f9f8b6f4a343db98bc62515fc5af
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_mimg_atomic_cmp.rst
@@ -0,0 +1,27 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_mimg_atomic_cmp:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify 2 data elements for 32-bit-per-pixel surfaces or 4 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+  Note. The surface data format is indicated in the image resource constant but not in the instruction.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_data_mimg_atomic_reg.rst b/docs/AMDGPU/gfx9_data_mimg_atomic_reg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2037dfd5356b420992444713759ec9bf4b8f8f36
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_mimg_atomic_reg.rst
@@ -0,0 +1,26 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_mimg_atomic_reg:
+
+vdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify 1 data element for 32-bit-per-pixel surfaces or 2 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+  Note. The surface data format is indicated in the image resource constant but not in the instruction.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_data_mimg_store.rst b/docs/AMDGPU/gfx9_data_mimg_store.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5e2b8b6a103c31ff48f44cac38c2efe014851572
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_mimg_store.rst
@@ -0,0 +1,18 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_mimg_store:
+
+vdata
+===========================
+
+Image data to store by an *image_store* instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` which may specify from 1 to 4 data elements. Each data element occupies 1 dword.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_data_mimg_store_d16.rst b/docs/AMDGPU/gfx9_data_mimg_store_d16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5c521f80a49a05cabc79ac6d6c2b69f16bc1d23c
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_mimg_store_d16.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_mimg_store_d16:
+
+vdata
+===========================
+
+Image data to store by an *image_store* instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`d16<amdgpu_synid_d16>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify from 1 to 4 data elements. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16<amdgpu_synid_d16>`.
+* :ref:`d16<amdgpu_synid_d16>` specifies that data in registers are packed; each value occupies 16 bits.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_data_smem_atomic128.rst b/docs/AMDGPU/gfx9_data_smem_atomic128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..677361894818f7983d3d00504e3bc8b20b0751f4
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_smem_atomic128.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_smem_atomic128:
+
+sdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_data_smem_atomic32.rst b/docs/AMDGPU/gfx9_data_smem_atomic32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9ad25f3403de2fee20152e97cc54d397b18abb38
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_smem_atomic32.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_smem_atomic32:
+
+sdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_data_smem_atomic64.rst b/docs/AMDGPU/gfx9_data_smem_atomic64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6f67bffbd5486bc8a3ff0d7c5753f1b238f53404
--- /dev/null
+++ b/docs/AMDGPU/gfx9_data_smem_atomic64.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_data_smem_atomic64:
+
+sdata
+===========================
+
+Input data for an atomic instruction.
+
+Optionally may serve as an output data:
+
+* If :ref:`glc<amdgpu_synid_glc>` is specified, gets the memory value before the operation.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_dst_buf_128.rst b/docs/AMDGPU/gfx9_dst_buf_128.rst
new file mode 100644
index 0000000000000000000000000000000000000000..691be0f69fe2bcafe189987185cebb7aa91872c5
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_buf_128.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_buf_128:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 4 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_buf_32.rst b/docs/AMDGPU/gfx9_dst_buf_32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5ee1aa2c0c6ec55a21f6cd4ca6d746e4155686bb
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_buf_32.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_buf_32:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 1 dword by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_buf_64.rst b/docs/AMDGPU/gfx9_dst_buf_64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6e27264ffe35285888222084b42f06f34c54224a
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_buf_64.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_buf_64:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 2 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_buf_96.rst b/docs/AMDGPU/gfx9_dst_buf_96.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8011edc0cb1b8dc768efa8e42bddaf66d8335ef3
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_buf_96.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_buf_96:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+*Size:* 3 dwords by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_buf_lds.rst b/docs/AMDGPU/gfx9_dst_buf_lds.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0445619d640821305336cf9407dc96ffb1755598
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_buf_lds.rst
@@ -0,0 +1,21 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_buf_lds:
+
+vdst
+===========================
+
+Instruction output: data read from a memory buffer.
+
+If :ref:`lds<amdgpu_synid_lds>` is specified, this operand is ignored by H/W and data are stored directly into LDS.
+
+*Size:* 1 dword by default. :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+    Note that :ref:`tfe<amdgpu_synid_tfe>` and :ref:`lds<amdgpu_synid_lds>` cannot be used together.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_flat_atomic32.rst b/docs/AMDGPU/gfx9_dst_flat_atomic32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..94a7fdac9703d30c56408f77ffe61238c2adbaae
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_flat_atomic32.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_flat_atomic32:
+
+vdst
+===========================
+
+Data returned by a 32-bit atomic flat instruction.
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_flat_atomic64.rst b/docs/AMDGPU/gfx9_dst_flat_atomic64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7f684a7165e4b39957bc894ae091dd8d5e01321d
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_flat_atomic64.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_flat_atomic64:
+
+vdst
+===========================
+
+Data returned by a 64-bit atomic flat instruction.
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_mimg_gather4.rst b/docs/AMDGPU/gfx9_dst_mimg_gather4.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3bb6f30811f15c5703b0e41ea4c3913c3f8b72da
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_mimg_gather4.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_mimg_gather4:
+
+vdst
+===========================
+
+Image data to load by an *image_gather4* instruction.
+
+*Size:* 4 data elements by default. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16<amdgpu_synid_d16>`.
+
+:ref:`d16<amdgpu_synid_d16>` and :ref:`tfe<amdgpu_synid_tfe>` affect operand size as follows:
+
+* :ref:`d16<amdgpu_synid_d16>` specifies that data elements in registers are packed; each value occupies 16 bits.
+* :ref:`tfe<amdgpu_synid_tfe>` adds one dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_mimg_regular.rst b/docs/AMDGPU/gfx9_dst_mimg_regular.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a7b848c1ee9c58232c1c82283a26c65ad805654
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_mimg_regular.rst
@@ -0,0 +1,20 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_mimg_regular:
+
+vdst
+===========================
+
+Image data to load by an image instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>` and :ref:`tfe<amdgpu_synid_tfe>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify from 1 to 4 data elements. Each data element occupies 1 dword.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_dst_mimg_regular_d16.rst b/docs/AMDGPU/gfx9_dst_mimg_regular_d16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a155639bf3e61c52cecc22b39b5379cc66347659
--- /dev/null
+++ b/docs/AMDGPU/gfx9_dst_mimg_regular_d16.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_dst_mimg_regular_d16:
+
+vdst
+===========================
+
+Image data to load by an image instruction.
+
+*Size:* depends on :ref:`dmask<amdgpu_synid_dmask>`, :ref:`tfe<amdgpu_synid_tfe>` and :ref:`d16<amdgpu_synid_d16>`:
+
+* :ref:`dmask<amdgpu_synid_dmask>` may specify from 1 to 4 data elements. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16<amdgpu_synid_d16>`.
+* :ref:`d16<amdgpu_synid_d16>` specifies that data elements in registers are packed; each value occupies 16 bits.
+* :ref:`tfe<amdgpu_synid_tfe>` adds 1 dword if specified.
+
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_fimm16.rst b/docs/AMDGPU/gfx9_fimm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a438b452eecfa55e57056cf8be6010105db238e7
--- /dev/null
+++ b/docs/AMDGPU/gfx9_fimm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_fimm16:
+
+imm32
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>` or a :ref:`floating-point_number<amdgpu_synid_floating-point_number>`. The number is converted to *f16* as described :ref:`here<amdgpu_synid_lit_conv>`.
+
diff --git a/docs/AMDGPU/gfx9_fimm32.rst b/docs/AMDGPU/gfx9_fimm32.rst
new file mode 100644
index 0000000000000000000000000000000000000000..11103e70529e3b373d16679677317608d4db4846
--- /dev/null
+++ b/docs/AMDGPU/gfx9_fimm32.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_fimm32:
+
+imm32
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>` or a :ref:`floating-point_number<amdgpu_synid_floating-point_number>`. The value is converted to *f32* as described :ref:`here<amdgpu_synid_lit_conv>`.
+
diff --git a/docs/AMDGPU/gfx9_hwreg.rst b/docs/AMDGPU/gfx9_hwreg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7ebb38b42fe01ffa3bb41a738c87a5bf0a1feb0e
--- /dev/null
+++ b/docs/AMDGPU/gfx9_hwreg.rst
@@ -0,0 +1,61 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_hwreg:
+
+hwreg
+===========================
+
+Bits of a hardware register being accessed.
+
+The bits of this operand have the following meaning:
+
+    ============ ===================================
+    Bits         Description
+    ============ ===================================
+    5:0          Register *id*.
+    10:6         First bit *offset* (0..31).
+    15:11        *Size* in bits (1..32).
+    ============ ===================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>` or using the syntax described below.
+
+    ==================================== ============================================================================
+    Syntax                               Description
+    ==================================== ============================================================================
+    hwreg({0..63})                       All bits of a register indicated by its *id*.
+    hwreg(<*name*>)                      All bits of a register indicated by its *name*.
+    hwreg({0..63}, {0..31}, {1..32})     Register bits indicated by register *id*, first bit *offset* and *size*.
+    hwreg(<*name*>, {0..31}, {1..32})    Register bits indicated by register *name*, first bit *offset* and *size*.
+    ==================================== ============================================================================
+
+Register *id*, *offset* and *size* must be specified as positive :ref:`integer numbers<amdgpu_synid_integer_number>`.
+
+Defined register *names* include:
+
+    =================== ==========================================
+    Name                Description
+    =================== ==========================================
+    HW_REG_MODE         Shader writeable mode bits.
+    HW_REG_STATUS       Shader read-only status.
+    HW_REG_TRAPSTS      Trap status.
+    HW_REG_HW_ID        Id of wave, simd, compute unit, etc.
+    HW_REG_GPR_ALLOC    Per-wave SGPR and VGPR allocation.
+    HW_REG_LDS_ALLOC    Per-wave LDS allocation.
+    HW_REG_IB_STS       Counters of outstanding instructions.
+    HW_REG_SH_MEM_BASES Memory aperture.
+    =================== ==========================================
+
+Examples:
+
+.. parsed-literal::
+
+    s_getreg_b32 s2, 0x6
+    s_getreg_b32 s2, hwreg(15)
+    s_getreg_b32 s2, hwreg(51, 1, 31)
+    s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1)
+
diff --git a/docs/AMDGPU/gfx9_imm4.rst b/docs/AMDGPU/gfx9_imm4.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b1c97fb0b29d8c688afc5d58dd54b35df1cc7e88
--- /dev/null
+++ b/docs/AMDGPU/gfx9_imm4.rst
@@ -0,0 +1,25 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_imm4:
+
+imm4
+===========================
+
+A positive :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 4 bits.
+
+This operand is a mask which controls indexing mode for operands of subsequent instructions. Value 1 enables indexing and value 0 disables it.
+
+    ============ ========================================
+    Bit          Meaning
+    ============ ========================================
+    0            Enables or disables *src0* indexing.
+    1            Enables or disables *src1* indexing.
+    2            Enables or disables *src2* indexing.
+    3            Enables or disables *dst* indexing.
+    ============ ========================================
+
diff --git a/docs/AMDGPU/gfx9_label.rst b/docs/AMDGPU/gfx9_label.rst
new file mode 100644
index 0000000000000000000000000000000000000000..32771722f71de630e511e937daaed1b32211967c
--- /dev/null
+++ b/docs/AMDGPU/gfx9_label.rst
@@ -0,0 +1,30 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_label:
+
+label
+===========================
+
+A branch target which is a 16-bit signed integer treated as a PC-relative dword offset.
+
+This operand may be specified as:
+
+* An :ref:`integer_number<amdgpu_synid_integer_number>`. The number is truncated to 16 bits.
+* An :ref:`absolute_expression<amdgpu_synid_absolute_expression>` which must start with an :ref:`integer_number<amdgpu_synid_integer_number>`. The value of the expression is truncated to 16 bits.
+* A :ref:`symbol<amdgpu_synid_symbol>` (for example, a label). The value is handled as a 16-bit PC-relative dword offset to be resolved by a linker.
+
+Examples:
+
+.. parsed-literal::
+
+  offset = 30
+  s_branch loop_end
+  s_branch 2 + offset
+  s_branch 32
+  loop_end:
+
diff --git a/docs/AMDGPU/gfx9_mad_type_dev.rst b/docs/AMDGPU/gfx9_mad_type_dev.rst
new file mode 100644
index 0000000000000000000000000000000000000000..53c78e441128883e447d0b2ef2573c59a2d5861e
--- /dev/null
+++ b/docs/AMDGPU/gfx9_mad_type_dev.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_mad_type_dev:
+
+fx
+===========================
+
+This is an *f32* or *f16* operand depending on instruction modifiers:
+
+* Operand size is controlled by :ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>`.
+* Location of 16-bit operand is controlled by :ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>`.
+
diff --git a/docs/AMDGPU/gfx9_mod_dpp_sdwa_abs_neg.rst b/docs/AMDGPU/gfx9_mod_dpp_sdwa_abs_neg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ccbad8a5c87c5ac647085fc43838546f3a8b9f93
--- /dev/null
+++ b/docs/AMDGPU/gfx9_mod_dpp_sdwa_abs_neg.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_mod_dpp_sdwa_abs_neg:
+
+m
+===========================
+
+This operand may be used with floating point operand modifiers :ref:`abs<amdgpu_synid_abs>` and :ref:`neg<amdgpu_synid_neg>`.
+
diff --git a/docs/AMDGPU/gfx9_mod_sdwa_sext.rst b/docs/AMDGPU/gfx9_mod_sdwa_sext.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e832097f340e674d740bb0ecf75492bc2c9684bd
--- /dev/null
+++ b/docs/AMDGPU/gfx9_mod_sdwa_sext.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_mod_sdwa_sext:
+
+m
+===========================
+
+This operand may be used with integer operand modifier :ref:`sext<amdgpu_synid_sext>`.
+
diff --git a/docs/AMDGPU/gfx9_mod_vop3_abs_neg.rst b/docs/AMDGPU/gfx9_mod_vop3_abs_neg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2dac4b1bd7d3684f5cecfd7d534fe9c4f765dd0a
--- /dev/null
+++ b/docs/AMDGPU/gfx9_mod_vop3_abs_neg.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_mod_vop3_abs_neg:
+
+m
+===========================
+
+This operand may be used with floating point operand modifiers :ref:`abs<amdgpu_synid_abs>` and :ref:`neg<amdgpu_synid_neg>`.
+
diff --git a/docs/AMDGPU/gfx9_msg.rst b/docs/AMDGPU/gfx9_msg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f18cff46ecb24a8ac1c3c030dbed2ea7643bddfd
--- /dev/null
+++ b/docs/AMDGPU/gfx9_msg.rst
@@ -0,0 +1,72 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_msg:
+
+msg
+===========================
+
+A 16-bit message code. The bits of this operand have the following meaning:
+
+    ============ ======================================================
+    Bits         Description
+    ============ ======================================================
+    3:0          Message *type*.
+    6:4          Optional *operation*.
+    9:7          Optional *parameters*.
+    15:10        Unused.
+    ============ ======================================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>` or using the syntax described below:
+
+    ======================================== ========================================================================
+    Syntax                                   Description
+    ======================================== ========================================================================
+    sendmsg(<*type*>)                        A message identified by its *type*.
+    sendmsg(<*type*>, <*op*>)                A message identified by its *type* and *operation*.
+    sendmsg(<*type*>, <*op*>, <*stream*>)    A message identified by its *type* and *operation* with a stream *id*.
+    ======================================== ========================================================================
+
+*Type* may be specified using message *name* or message *id*.
+
+*Op* may be specified using operation *name* or operation *id*.
+
+Stream *id* is an integer in the range 0..3.
+
+Message *id*, operation *id* and stream *id* must be specified as positive :ref:`integer numbers<amdgpu_synid_integer_number>`.
+
+Each message type supports specific operations:
+
+    ================= ========== ============================== ============ ==========
+    Message name      Message Id Supported Operations           Operation Id Stream Id
+    ================= ========== ============================== ============ ==========
+    MSG_INTERRUPT     1          \-                             \-           \-
+    MSG_GS            2          GS_OP_CUT                      1            Optional
+    \                            GS_OP_EMIT                     2            Optional
+    \                            GS_OP_EMIT_CUT                 3            Optional
+    MSG_GS_DONE       3          GS_OP_NOP                      0            \-
+    \                            GS_OP_CUT                      1            Optional
+    \                            GS_OP_EMIT                     2            Optional
+    \                            GS_OP_EMIT_CUT                 3            Optional
+    MSG_SYSMSG        15         SYSMSG_OP_ECC_ERR_INTERRUPT    1            \-
+    \                            SYSMSG_OP_REG_RD               2            \-
+    \                            SYSMSG_OP_HOST_TRAP_ACK        3            \-
+    \                            SYSMSG_OP_TTRACE_PC            4            \-
+    ================= ========== ============================== ============ ==========
+
+Examples:
+
+.. parsed-literal::
+
+    s_sendmsg 0x12
+    s_sendmsg sendmsg(MSG_INTERRUPT)
+    s_sendmsg sendmsg(2, GS_OP_CUT)
+    s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT)
+    s_sendmsg sendmsg(MSG_GS, 2)
+    s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_EMIT_CUT, 1)
+    s_sendmsg sendmsg(MSG_SYSMSG, SYSMSG_OP_TTRACE_PC)
+
diff --git a/docs/AMDGPU/gfx9_offset_buf.rst b/docs/AMDGPU/gfx9_offset_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ec6cc330072e7e69dfad6d1dde03e5d0d1011189
--- /dev/null
+++ b/docs/AMDGPU/gfx9_offset_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_offset_buf:
+
+soffset
+===========================
+
+An unsigned byte offset.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx9_offset_smem_buf.rst b/docs/AMDGPU/gfx9_offset_smem_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fdc4b09ee8163a52e26b3735a86623bc83018c84
--- /dev/null
+++ b/docs/AMDGPU/gfx9_offset_smem_buf.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_offset_smem_buf:
+
+soffset
+===========================
+
+An unsigned byte offset added to the base address to get memory address.
+
+.. WARNING:: Assembler currently supports 20-bit offsets only. Use :ref:`uimm20<amdgpu_synid_uimm20>` instead of :ref:`uimm21<amdgpu_synid_uimm21>`.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`uimm21<amdgpu_synid_uimm21>`
diff --git a/docs/AMDGPU/gfx9_offset_smem_plain.rst b/docs/AMDGPU/gfx9_offset_smem_plain.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a58df5593730e5d4ba5143343196e7870dbf9f3c
--- /dev/null
+++ b/docs/AMDGPU/gfx9_offset_smem_plain.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_offset_smem_plain:
+
+soffset
+===========================
+
+An offset added to the base address to get memory address.
+
+* If offset is specified as a register, it supplies an unsigned byte offset.
+* If offset is specified as a 21-bit immediate, it supplies a signed byte offset.
+
+.. WARNING:: Assembler currently supports 20-bit unsigned offsets only. Use :ref:`uimm20<amdgpu_synid_uimm20>` instead of :ref:`simm21<amdgpu_synid_simm21>`.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`simm21<amdgpu_synid_simm21>`
diff --git a/docs/AMDGPU/gfx9_opt.rst b/docs/AMDGPU/gfx9_opt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..50d73f9fcde9faa15e3d8d9b6c79250800a8b4ac
--- /dev/null
+++ b/docs/AMDGPU/gfx9_opt.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_opt:
+
+opt
+===========================
+
+This is an optional operand. It must be used if and only if :ref:`glc<amdgpu_synid_glc>` is specified.
+
diff --git a/docs/AMDGPU/gfx9_param.rst b/docs/AMDGPU/gfx9_param.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2831a65820f81ce27361736dac7eb65c0c26a85d
--- /dev/null
+++ b/docs/AMDGPU/gfx9_param.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_param:
+
+param
+===========================
+
+Interpolation parameter to read:
+
+    ============ ===================================
+    Syntax       Description
+    ============ ===================================
+    p0           Parameter *P0*.
+    p10          Parameter *P10*.
+    p20          Parameter *P20*.
+    ============ ===================================
+
diff --git a/docs/AMDGPU/gfx9_perm_smem.rst b/docs/AMDGPU/gfx9_perm_smem.rst
new file mode 100644
index 0000000000000000000000000000000000000000..370fb0d67b33fed20979dc4de1efdbee4f9a94e9
--- /dev/null
+++ b/docs/AMDGPU/gfx9_perm_smem.rst
@@ -0,0 +1,24 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_perm_smem:
+
+imm3
+===========================
+
+A bit mask which indicates request permissions.
+
+This operand must be specified as an :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 7 bits, but only 3 low bits are significant.
+
+    ============ ==============================
+    Bit Number   Description
+    ============ ==============================
+    0            Request *read* permission.
+    1            Request *write* permission.
+    2            Request *execute* permission.
+    ============ ==============================
+
diff --git a/docs/AMDGPU/gfx9_ret.rst b/docs/AMDGPU/gfx9_ret.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7015be6298d70749724676218d529a1195f9829e
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ret.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ret:
+
+dst
+===========================
+
+This is an input operand. It may optionally serve as a destination if :ref:`glc<amdgpu_synid_glc>` is specified.
+
diff --git a/docs/AMDGPU/gfx9_rsrc_buf.rst b/docs/AMDGPU/gfx9_rsrc_buf.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3dc17753d61222dcde09e29f75e894f06f87e1d6
--- /dev/null
+++ b/docs/AMDGPU/gfx9_rsrc_buf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_rsrc_buf:
+
+srsrc
+===========================
+
+Buffer resource constant which defines the address and characteristics of the buffer in memory.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_rsrc_mimg.rst b/docs/AMDGPU/gfx9_rsrc_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..29c90a14b8a7a109588ddf8fb4e31e00b36d59b4
--- /dev/null
+++ b/docs/AMDGPU/gfx9_rsrc_mimg.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_rsrc_mimg:
+
+srsrc
+===========================
+
+Image resource constant which defines the location of the image buffer in memory, its dimensions, tiling, and data format.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_saddr_flat_global.rst b/docs/AMDGPU/gfx9_saddr_flat_global.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7396df0bc726b923e2bbebb0f5ccb6e9a431099a
--- /dev/null
+++ b/docs/AMDGPU/gfx9_saddr_flat_global.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_saddr_flat_global:
+
+saddr
+===========================
+
+An optional 64-bit flat global address. Must be specified as :ref:`off<amdgpu_synid_off>` if not used.
+
+See :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>` for description of available addressing modes.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx9_saddr_flat_scratch.rst b/docs/AMDGPU/gfx9_saddr_flat_scratch.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5bdbf394af1aeff480512fb6e6008d7575495fcb
--- /dev/null
+++ b/docs/AMDGPU/gfx9_saddr_flat_scratch.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_saddr_flat_scratch:
+
+saddr
+===========================
+
+An optional 32-bit flat scratch offset. Must be specified as :ref:`off<amdgpu_synid_off>` if not used.
+
+Either this operand or :ref:`vaddr<amdgpu_synid9_vaddr_flat_scratch>` must be set to :ref:`off<amdgpu_synid_off>`.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx9_samp_mimg.rst b/docs/AMDGPU/gfx9_samp_mimg.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f901142909bb97b35660dee7637018f28bf2f6a0
--- /dev/null
+++ b/docs/AMDGPU/gfx9_samp_mimg.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_samp_mimg:
+
+ssamp
+===========================
+
+Sampler constant used to specify filtering options applied to the image data after it is read.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdata128_0.rst b/docs/AMDGPU/gfx9_sdata128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d3609d497f6671e9aad4167f7d3f471def155e2c
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdata128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdata128_0:
+
+sdata
+===========================
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdata32_0.rst b/docs/AMDGPU/gfx9_sdata32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4de36350dc7a94637f53594b5260aba5c551cd58
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdata32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdata32_0:
+
+sdata
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdata64_0.rst b/docs/AMDGPU/gfx9_sdata64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7cde210d7b4f667a2e39c23360602fd6ca898d84
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdata64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdata64_0:
+
+sdata
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdst128_0.rst b/docs/AMDGPU/gfx9_sdst128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..974df548f465b9cda48f72d19e00a26c32c6c411
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdst128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdst128_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdst256_0.rst b/docs/AMDGPU/gfx9_sdst256_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b492921d47a5246afe460996509a2530a983bf5e
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdst256_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdst256_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdst32_0.rst b/docs/AMDGPU/gfx9_sdst32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..911c843e7f47abf4905d7a2a62da5c36016e1620
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdst32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdst32_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdst32_1.rst b/docs/AMDGPU/gfx9_sdst32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8f63024f18721a67599e263419176f53d5fc99f6
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdst32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdst32_1:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx9_sdst32_2.rst b/docs/AMDGPU/gfx9_sdst32_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..04f3cda2d08a63d559f7965f41d694b13562dc69
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdst32_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdst32_2:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdst512_0.rst b/docs/AMDGPU/gfx9_sdst512_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7f7dab60aa42a4153d568dc5dd1670804c38b648
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdst512_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdst512_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 16 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdst64_0.rst b/docs/AMDGPU/gfx9_sdst64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc5f4c72fb98258e353339723c456938068593ef
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdst64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdst64_0:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_sdst64_1.rst b/docs/AMDGPU/gfx9_sdst64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..208d68b41715f6293a55716e5b8e956d246eae76
--- /dev/null
+++ b/docs/AMDGPU/gfx9_sdst64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_sdst64_1:
+
+sdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx9_simm16.rst b/docs/AMDGPU/gfx9_simm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..47b200a72070c2377db3bc579d0144bf6596e367
--- /dev/null
+++ b/docs/AMDGPU/gfx9_simm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_simm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits and then sign-extended to 32 bits.
+
diff --git a/docs/AMDGPU/gfx9_src32_0.rst b/docs/AMDGPU/gfx9_src32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..288ccbb37d095ab6cae3231742585ff576a41bc4
--- /dev/null
+++ b/docs/AMDGPU/gfx9_src32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_src32_0:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx9_src32_1.rst b/docs/AMDGPU/gfx9_src32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a06764da8b221dd7fab7566cd5246b6a3a5c3074
--- /dev/null
+++ b/docs/AMDGPU/gfx9_src32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_src32_1:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx9_src64_0.rst b/docs/AMDGPU/gfx9_src64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f8ef842ef4f16edd256c675a8c0fba15870c8192
--- /dev/null
+++ b/docs/AMDGPU/gfx9_src64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_src64_0:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx9_src64_1.rst b/docs/AMDGPU/gfx9_src64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fe7b7fda1f5e689bbc63e6cc02b9e3bc6ea6e993
--- /dev/null
+++ b/docs/AMDGPU/gfx9_src64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_src64_1:
+
+src
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx9_src_exp.rst b/docs/AMDGPU/gfx9_src_exp.rst
new file mode 100644
index 0000000000000000000000000000000000000000..91a5d53b92ba5a3246af9a755863a58f973ee9e2
--- /dev/null
+++ b/docs/AMDGPU/gfx9_src_exp.rst
@@ -0,0 +1,28 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_src_exp:
+
+vsrc
+===========================
+
+Data to copy to export buffers. This is an optional operand. Must be specified as :ref:`off<amdgpu_synid_off>` if not used.
+
+:ref:`compr<amdgpu_synid_compr>` modifier indicates use of compressed (16-bit) data. This limits number of source operands from 4 to 2:
+
+* src0 and src1 must specify the first register (or :ref:`off<amdgpu_synid_off>`).
+* src2 and src3 must specify the second register (or :ref:`off<amdgpu_synid_off>`).
+
+An example:
+
+.. parsed-literal::
+
+  exp mrtz v3, v3, off, off compr
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx9_ssrc32_0.rst b/docs/AMDGPU/gfx9_ssrc32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..25d6b3744fd64cf5c5bbfffcebbb24eaeed72fc2
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc32_0:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx9_ssrc32_1.rst b/docs/AMDGPU/gfx9_ssrc32_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..caea7640661489c80b8b1eb29584d1279f16e895
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc32_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc32_1:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_ssrc32_2.rst b/docs/AMDGPU/gfx9_ssrc32_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..034f20e30ee915d658e645f599d558ccc3a331d5
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc32_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc32_2:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx9_ssrc32_3.rst b/docs/AMDGPU/gfx9_ssrc32_3.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1b08cad751d822fc99a83c774737bd0e3c44efe5
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc32_3.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc32_3:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`iconst<amdgpu_synid_iconst>`
diff --git a/docs/AMDGPU/gfx9_ssrc32_4.rst b/docs/AMDGPU/gfx9_ssrc32_4.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7e5542737bd88cf3ba664e0302a124c395d7b528
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc32_4.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc32_4:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx9_ssrc64_0.rst b/docs/AMDGPU/gfx9_ssrc64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b2f86e11cb0d4938d82a2f1583562e7bd2fdbcf2
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc64_0:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/docs/AMDGPU/gfx9_ssrc64_1.rst b/docs/AMDGPU/gfx9_ssrc64_1.rst
new file mode 100644
index 0000000000000000000000000000000000000000..02be6c537b72f76894918f4eef2f9456b5c80cc4
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc64_1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc64_1:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`
diff --git a/docs/AMDGPU/gfx9_ssrc64_2.rst b/docs/AMDGPU/gfx9_ssrc64_2.rst
new file mode 100644
index 0000000000000000000000000000000000000000..72d2c46d3a9b0dfff88cf5609b0134711e806cb6
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc64_2.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc64_2:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`exec<amdgpu_synid_exec>`, :ref:`constant<amdgpu_synid_constant>`
diff --git a/docs/AMDGPU/gfx9_ssrc64_3.rst b/docs/AMDGPU/gfx9_ssrc64_3.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3414802295ad07ad39beed38748fc97a1dbc3c28
--- /dev/null
+++ b/docs/AMDGPU/gfx9_ssrc64_3.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_ssrc64_3:
+
+ssrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`flat_scratch<amdgpu_synid_flat_scratch>`, :ref:`xnack<amdgpu_synid_xnack>`, :ref:`vcc<amdgpu_synid_vcc>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`exec<amdgpu_synid_exec>`
diff --git a/docs/AMDGPU/gfx9_tgt.rst b/docs/AMDGPU/gfx9_tgt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3ba8baebb3bc0fe2cb2af2cdc5ab2ac04df4e5fe
--- /dev/null
+++ b/docs/AMDGPU/gfx9_tgt.rst
@@ -0,0 +1,24 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_tgt:
+
+tgt
+===========================
+
+An export target:
+
+    ============== ===================================
+    Syntax         Description
+    ============== ===================================
+    pos{0..3}      Copy vertex position 0..3.
+    param{0..31}   Copy vertex parameter 0..31.
+    mrt{0..7}      Copy pixel color to the MRTs 0..7.
+    mrtz           Copy pixel depth (Z) data.
+    null           Copy nothing.
+    ============== ===================================
+
diff --git a/docs/AMDGPU/gfx9_type_dev.rst b/docs/AMDGPU/gfx9_type_dev.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ae2b0bf6c50c5ac7be0bdfdb4ecd83847e31b273
--- /dev/null
+++ b/docs/AMDGPU/gfx9_type_dev.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_type_dev:
+
+Type deviation
+===========================
+
+*Type* of this operand differs from *type* :ref:`implied by the opcode<amdgpu_syn_instruction_type>`. This tag specifies actual operand *type*.
+
diff --git a/docs/AMDGPU/gfx9_uimm16.rst b/docs/AMDGPU/gfx9_uimm16.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4d1fe1de3f20bfbc8239694e7265c527cd1262b4
--- /dev/null
+++ b/docs/AMDGPU/gfx9_uimm16.rst
@@ -0,0 +1,14 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_uimm16:
+
+imm16
+===========================
+
+An :ref:`integer_number<amdgpu_synid_integer_number>`. The value is truncated to 16 bits and then zero-extended to 32 bits.
+
diff --git a/docs/AMDGPU/gfx9_vaddr_flat_global.rst b/docs/AMDGPU/gfx9_vaddr_flat_global.rst
new file mode 100644
index 0000000000000000000000000000000000000000..38beb6c44321bcdc81c30e05635439326e91e191
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vaddr_flat_global.rst
@@ -0,0 +1,22 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vaddr_flat_global:
+
+vaddr
+===========================
+
+A 64-bit flat global address or a 32-bit offset depending on addressing mode:
+
+* Address = :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>` + :ref:`flat_offset13<amdgpu_synid_flat_offset13>`. :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>` is a 64-bit address. This mode is indicated by :ref:`saddr<amdgpu_synid9_saddr_flat_global>` set to :ref:`off<amdgpu_synid_off>`.
+* Address = :ref:`saddr<amdgpu_synid9_saddr_flat_global>` + :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>` + :ref:`flat_offset13<amdgpu_synid_flat_offset13>`. :ref:`vaddr<amdgpu_synid9_vaddr_flat_global>` is a 32-bit offset. This mode is used when :ref:`saddr<amdgpu_synid9_saddr_flat_global>` is not :ref:`off<amdgpu_synid_off>`.
+
+.. WARNING:: Assembler currently expects a 64-bit *vaddr* regardless of addressing mode. This have to be fixed.
+
+*Size:* 1 or 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vaddr_flat_scratch.rst b/docs/AMDGPU/gfx9_vaddr_flat_scratch.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a72cf0662f420be512464b528b164fe463ac18e2
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vaddr_flat_scratch.rst
@@ -0,0 +1,19 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vaddr_flat_scratch:
+
+vaddr
+===========================
+
+An optional 32-bit flat scratch offset. Must be specified as :ref:`off<amdgpu_synid_off>` if not used.
+
+Either this operand or :ref:`saddr<amdgpu_synid9_saddr_flat_scratch>` must be set to :ref:`off<amdgpu_synid_off>`.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`off<amdgpu_synid_off>`
diff --git a/docs/AMDGPU/gfx9_vcc_64.rst b/docs/AMDGPU/gfx9_vcc_64.rst
new file mode 100644
index 0000000000000000000000000000000000000000..788306b2a5e46874d3427b0806b7d6e9a5257b05
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vcc_64.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vcc_64:
+
+vcc
+===========================
+
+Vector condition code.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`vcc<amdgpu_synid_vcc>`
diff --git a/docs/AMDGPU/gfx9_vdata128_0.rst b/docs/AMDGPU/gfx9_vdata128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3d2b4905fa3f61e010dec1033acb7ff7ebeb993d
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vdata128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vdata128_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vdata32_0.rst b/docs/AMDGPU/gfx9_vdata32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e2857695acfb92e28bb4bc4ac10e86fbbd57bf4a
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vdata32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vdata32_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vdata64_0.rst b/docs/AMDGPU/gfx9_vdata64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fcd1d0f827087cbb6c27a0c7384d36a03e546e4d
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vdata64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vdata64_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vdata96_0.rst b/docs/AMDGPU/gfx9_vdata96_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8a8cb7647bce709709465f89581210c49cc7e6ba
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vdata96_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vdata96_0:
+
+vdata
+===========================
+
+Instruction input.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vdst128_0.rst b/docs/AMDGPU/gfx9_vdst128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b8bbf89c2729910b7b1bc464efab43ee7450537d
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vdst128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vdst128_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vdst32_0.rst b/docs/AMDGPU/gfx9_vdst32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ccee55f7c10615fcb10ba8c044d9d8f7b4ac8d65
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vdst32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vdst32_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vdst64_0.rst b/docs/AMDGPU/gfx9_vdst64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..60bf384f2c11227044bd614799f7ebbb97eee626
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vdst64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vdst64_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vdst96_0.rst b/docs/AMDGPU/gfx9_vdst96_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..834d32a4387c0f1759804abfd5f1ac4a4d3882a4
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vdst96_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vdst96_0:
+
+vdst
+===========================
+
+Instruction output.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vsrc128_0.rst b/docs/AMDGPU/gfx9_vsrc128_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3e8c6dedf5d5d49008f04c71a313928f31db5cd1
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vsrc128_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vsrc128_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vsrc32_0.rst b/docs/AMDGPU/gfx9_vsrc32_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a056f3a1dffb726a0f1f1d56779e8fc3ac5c11a7
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vsrc32_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vsrc32_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_vsrc64_0.rst b/docs/AMDGPU/gfx9_vsrc64_0.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b91b3405c1db44776e99faa1742ff5fe05cbbf49
--- /dev/null
+++ b/docs/AMDGPU/gfx9_vsrc64_0.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_vsrc64_0:
+
+vsrc
+===========================
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/docs/AMDGPU/gfx9_waitcnt.rst b/docs/AMDGPU/gfx9_waitcnt.rst
new file mode 100644
index 0000000000000000000000000000000000000000..015a51ae8c329485b6fcfff2b3747f0a7666ab45
--- /dev/null
+++ b/docs/AMDGPU/gfx9_waitcnt.rst
@@ -0,0 +1,56 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid9_waitcnt:
+
+waitcnt
+===========================
+
+Counts of outstanding instructions to wait for.
+
+The bits of this operand have the following meaning:
+
+    ============ ======================================================
+    Bits         Description
+    ============ ======================================================
+    3:0          VM_CNT: vector memory operations count, lower bits.
+    6:4          EXP_CNT: export count.
+    11:8         LGKM_CNT: LDS, GDS, Constant and Message count.
+    15:14        VM_CNT: vector memory operations count, upper bits.
+    ============ ======================================================
+
+This operand may be specified as a positive 16-bit :ref:`integer_number<amdgpu_synid_integer_number>`
+or as a combination of the following symbolic helpers:
+
+    ====================== ======================================================================
+    Syntax                 Description
+    ====================== ======================================================================
+    vmcnt(<*N*>)           VM_CNT value. *N* must not exceed the largest VM_CNT value.
+    expcnt(<*N*>)          EXP_CNT value. *N* must not exceed the largest EXP_CNT value.
+    lgkmcnt(<*N*>)         LGKM_CNT value. *N* must not exceed the largest LGKM_CNT value.
+    vmcnt_sat(<*N*>)       VM_CNT value computed as min(*N*, the largest VM_CNT value).
+    expcnt_sat(<*N*>)      EXP_CNT value computed as min(*N*, the largest EXP_CNT value).
+    lgkmcnt_sat(<*N*>)     LGKM_CNT value computed as min(*N*, the largest LGKM_CNT value).
+    ====================== ======================================================================
+
+These helpers may be specified in any order. Ampersands and commas may be used as optional separators.
+
+*N* is either an
+:ref:`integer number<amdgpu_synid_integer_number>` or an
+:ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+
+Examples:
+
+.. parsed-literal::
+
+    s_waitcnt 0
+    s_waitcnt vmcnt(1)
+    s_waitcnt expcnt(2) lgkmcnt(3)
+    s_waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+    s_waitcnt vmcnt(1), expcnt(2), lgkmcnt(3)
+    s_waitcnt vmcnt(1) & lgkmcnt_sat(100) & expcnt(2)
+
diff --git a/docs/AMDGPUAsmGFX7.rst b/docs/AMDGPUAsmGFX7.rst
deleted file mode 100644
index 8973c5035ad64005c0d63afa87b17b221386727f..0000000000000000000000000000000000000000
--- a/docs/AMDGPUAsmGFX7.rst
+++ /dev/null
@@ -1,1255 +0,0 @@
-..
-    **************************************************
-    *                                                *
-    *   Automatically generated file, do not edit!   *
-    *                                                *
-    **************************************************
-
-===========================
-Syntax of GFX7 Instructions
-===========================
-
-.. contents::
-  :local:
-
-
-DS
-===========================
-
-.. parsed-literal::
-
-    ds_add_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_b32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_b64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_rtn_b32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_rtn_b64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_src2_b32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_src2_b64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_append                      dst                            :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_b32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_b64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_f32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_f64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_b32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_b64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_f32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_f64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_condxchg32_rtn_b64          dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_consume                     dst                            :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_barrier                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_init                    src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_br                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_p                  src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_release_all        src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_v                  src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_f32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_f64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_i32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_i64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_f32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_f64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_i32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_i64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_f32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_f64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_i32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_i64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_f32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_f64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_i32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_i64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_f32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_f64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_i32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_i64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_f32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_f64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_i32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_i64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_b32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_b64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_rtn_b32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_rtn_b64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_nop                         src0
-    ds_or_b32                      src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_b64                      src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_rtn_b32                  dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_rtn_b64                  dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_src2_b32                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_src2_b64                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_ordered_count               dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2_b32                   dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2_b64                   dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2st64_b32               dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2st64_b64               dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b128                   dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b32                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b64                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b96                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_i16                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_i8                     dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u16                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u8                     dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_rtn_u32                dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_rtn_u64                dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_src2_u32               src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_src2_u64               src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_u32                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_u64                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_swizzle_b32                 dst, src0                      :ref:`sw_offset16<amdgpu_synid_sw_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrap_rtn_b32                dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2_b32                  src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2_b64                  src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2st64_b32              src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2st64_b64              src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b128                  src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b16                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b32                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b64                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b8                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b96                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_src2_b32              src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_src2_b64              src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2_rtn_b32             dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2_rtn_b64             dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2st64_rtn_b32         dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2st64_rtn_b64         dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg_rtn_b32              dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg_rtn_b64              dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_b32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_b64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_rtn_b32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_rtn_b64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_src2_b32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_src2_b64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-
-EXP
-===========================
-
-.. parsed-literal::
-
-    exp                            dst, src0, src1, src2, src3    :ref:`done<amdgpu_synid_done>` :ref:`compr<amdgpu_synid_compr>` :ref:`vm<amdgpu_synid_vm>`
-
-FLAT
-===========================
-
-.. parsed-literal::
-
-    flat_atomic_add                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_add_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_and                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_and_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_cmpswap            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_cmpswap_x2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_dec                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_dec_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_fcmpswap           dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_fcmpswap_x2        dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_fmax               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_fmax_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_fmin               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_fmin_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_inc                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_inc_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_or                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_or_x2              dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smax               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smax_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smin               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smin_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_sub                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_sub_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_swap               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_swap_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umax               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umax_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umin               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umin_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_xor                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_xor_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dword                dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx2              dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx3              dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx4              dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_sbyte                dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_sshort               dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_ubyte                dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_ushort               dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_byte                src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dword               src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx2             src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx3             src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx4             src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_short               src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-
-MIMG
-===========================
-
-.. parsed-literal::
-
-    image_atomic_add               src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_and               src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_cmpswap           src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_dec               src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_inc               src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_or                src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_smax              src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_smin              src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_sub               src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_swap              src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_umax              src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_umin              src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_xor               src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4                  dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_b                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_b_cl             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_b_cl_o           dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_b_o              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_b              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_b_cl           dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_b_cl_o         dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_b_o            dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_cl             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_cl_o           dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_l              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_l_o            dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_lz             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_lz_o           dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_c_o              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_cl               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_cl_o             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_l                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_l_o              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_lz               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_lz_o             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4_o                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_get_lod                  dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_get_resinfo              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load                     dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_mip                 dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_mip_pck             dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_mip_pck_sgn         dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_pck                 dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_pck_sgn             dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample                   dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_b                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_b_cl              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_c                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_c_b               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_c_b_cl            dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_c_cd              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_c_cl              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_c_d               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_c_l               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_c_lz              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_cl                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_l                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample_lz                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_store                    src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_store_mip                src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_store_mip_pck            src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_store_pck                src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-
-MUBUF
-===========================
-
-.. parsed-literal::
-
-    buffer_atomic_add              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_add_x2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_and              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_and_x2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_cmpswap          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_cmpswap_x2       src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_dec              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_dec_x2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_inc              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_inc_x2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_or               src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_or_x2            src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smax             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smax_x2          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smin             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smin_x2          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_sub              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_sub_x2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_swap             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_swap_x2          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umax             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umax_x2          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umin             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umin_x2          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_xor              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_xor_x2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dword              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_dwordx2            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dwordx3            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dwordx4            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_x           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_format_xy          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_xyz         dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_xyzw        dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_sbyte              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_sshort             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_ubyte              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_ushort             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_store_byte              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dword             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx3           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx4           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_x          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xy         src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xyz        src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xyzw       src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_short             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`addr64<amdgpu_synid_addr64>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_wbinvl1
-    buffer_wbinvl1_vol
-
-SMRD
-===========================
-
-.. parsed-literal::
-
-    s_buffer_load_dword            dst, src0, src1
-    s_buffer_load_dwordx16         dst, src0, src1
-    s_buffer_load_dwordx2          dst, src0, src1
-    s_buffer_load_dwordx4          dst, src0, src1
-    s_buffer_load_dwordx8          dst, src0, src1
-    s_dcache_inv
-    s_dcache_inv_vol
-    s_load_dword                   dst, src0, src1
-    s_load_dwordx16                dst, src0, src1
-    s_load_dwordx2                 dst, src0, src1
-    s_load_dwordx4                 dst, src0, src1
-    s_load_dwordx8                 dst, src0, src1
-    s_memtime                      dst
-
-SOP1
-===========================
-
-.. parsed-literal::
-
-    s_abs_i32                      dst, src0
-    s_and_saveexec_b64             dst, src0
-    s_andn2_saveexec_b64           dst, src0
-    s_bcnt0_i32_b32                dst, src0
-    s_bcnt0_i32_b64                dst, src0
-    s_bcnt1_i32_b32                dst, src0
-    s_bcnt1_i32_b64                dst, src0
-    s_bitset0_b32                  dst, src0
-    s_bitset0_b64                  dst, src0
-    s_bitset1_b32                  dst, src0
-    s_bitset1_b64                  dst, src0
-    s_brev_b32                     dst, src0
-    s_brev_b64                     dst, src0
-    s_cbranch_join                 src0
-    s_cmov_b32                     dst, src0
-    s_cmov_b64                     dst, src0
-    s_ff0_i32_b32                  dst, src0
-    s_ff0_i32_b64                  dst, src0
-    s_ff1_i32_b32                  dst, src0
-    s_ff1_i32_b64                  dst, src0
-    s_flbit_i32                    dst, src0
-    s_flbit_i32_b32                dst, src0
-    s_flbit_i32_b64                dst, src0
-    s_flbit_i32_i64                dst, src0
-    s_getpc_b64                    dst
-    s_mov_b32                      dst, src0
-    s_mov_b64                      dst, src0
-    s_mov_fed_b32                  dst, src0
-    s_movreld_b32                  dst, src0
-    s_movreld_b64                  dst, src0
-    s_movrels_b32                  dst, src0
-    s_movrels_b64                  dst, src0
-    s_nand_saveexec_b64            dst, src0
-    s_nor_saveexec_b64             dst, src0
-    s_not_b32                      dst, src0
-    s_not_b64                      dst, src0
-    s_or_saveexec_b64              dst, src0
-    s_orn2_saveexec_b64            dst, src0
-    s_quadmask_b32                 dst, src0
-    s_quadmask_b64                 dst, src0
-    s_rfe_b64                      src0
-    s_setpc_b64                    src0
-    s_sext_i32_i16                 dst, src0
-    s_sext_i32_i8                  dst, src0
-    s_swappc_b64                   dst, src0
-    s_wqm_b32                      dst, src0
-    s_wqm_b64                      dst, src0
-    s_xnor_saveexec_b64            dst, src0
-    s_xor_saveexec_b64             dst, src0
-
-SOP2
-===========================
-
-.. parsed-literal::
-
-    s_absdiff_i32                  dst, src0, src1
-    s_add_i32                      dst, src0, src1
-    s_add_u32                      dst, src0, src1
-    s_addc_u32                     dst, src0, src1
-    s_and_b32                      dst, src0, src1
-    s_and_b64                      dst, src0, src1
-    s_andn2_b32                    dst, src0, src1
-    s_andn2_b64                    dst, src0, src1
-    s_ashr_i32                     dst, src0, src1
-    s_ashr_i64                     dst, src0, src1
-    s_bfe_i32                      dst, src0, src1
-    s_bfe_i64                      dst, src0, src1
-    s_bfe_u32                      dst, src0, src1
-    s_bfe_u64                      dst, src0, src1
-    s_bfm_b32                      dst, src0, src1
-    s_bfm_b64                      dst, src0, src1
-    s_cbranch_g_fork               src0, src1
-    s_cselect_b32                  dst, src0, src1
-    s_cselect_b64                  dst, src0, src1
-    s_lshl_b32                     dst, src0, src1
-    s_lshl_b64                     dst, src0, src1
-    s_lshr_b32                     dst, src0, src1
-    s_lshr_b64                     dst, src0, src1
-    s_max_i32                      dst, src0, src1
-    s_max_u32                      dst, src0, src1
-    s_min_i32                      dst, src0, src1
-    s_min_u32                      dst, src0, src1
-    s_mul_i32                      dst, src0, src1
-    s_nand_b32                     dst, src0, src1
-    s_nand_b64                     dst, src0, src1
-    s_nor_b32                      dst, src0, src1
-    s_nor_b64                      dst, src0, src1
-    s_or_b32                       dst, src0, src1
-    s_or_b64                       dst, src0, src1
-    s_orn2_b32                     dst, src0, src1
-    s_orn2_b64                     dst, src0, src1
-    s_sub_i32                      dst, src0, src1
-    s_sub_u32                      dst, src0, src1
-    s_subb_u32                     dst, src0, src1
-    s_xnor_b32                     dst, src0, src1
-    s_xnor_b64                     dst, src0, src1
-    s_xor_b32                      dst, src0, src1
-    s_xor_b64                      dst, src0, src1
-
-SOPC
-===========================
-
-.. parsed-literal::
-
-    s_bitcmp0_b32                  src0, src1
-    s_bitcmp0_b64                  src0, src1
-    s_bitcmp1_b32                  src0, src1
-    s_bitcmp1_b64                  src0, src1
-    s_cmp_eq_i32                   src0, src1
-    s_cmp_eq_u32                   src0, src1
-    s_cmp_ge_i32                   src0, src1
-    s_cmp_ge_u32                   src0, src1
-    s_cmp_gt_i32                   src0, src1
-    s_cmp_gt_u32                   src0, src1
-    s_cmp_le_i32                   src0, src1
-    s_cmp_le_u32                   src0, src1
-    s_cmp_lg_i32                   src0, src1
-    s_cmp_lg_u32                   src0, src1
-    s_cmp_lt_i32                   src0, src1
-    s_cmp_lt_u32                   src0, src1
-    s_setvskip                     src0, src1
-
-SOPK
-===========================
-
-.. parsed-literal::
-
-    s_addk_i32                     dst, src0
-    s_cbranch_i_fork               src0, src1
-    s_cmovk_i32                    dst, src0
-    s_cmpk_eq_i32                  src0, src1
-    s_cmpk_eq_u32                  src0, src1
-    s_cmpk_ge_i32                  src0, src1
-    s_cmpk_ge_u32                  src0, src1
-    s_cmpk_gt_i32                  src0, src1
-    s_cmpk_gt_u32                  src0, src1
-    s_cmpk_le_i32                  src0, src1
-    s_cmpk_le_u32                  src0, src1
-    s_cmpk_lg_i32                  src0, src1
-    s_cmpk_lg_u32                  src0, src1
-    s_cmpk_lt_i32                  src0, src1
-    s_cmpk_lt_u32                  src0, src1
-    s_getreg_b32                   dst, src0
-    s_movk_i32                     dst, src0
-    s_mulk_i32                     dst, src0
-    s_setreg_b32                   dst, src0
-    s_setreg_imm32_b32             dst, src0
-
-SOPP
-===========================
-
-.. parsed-literal::
-
-    s_barrier
-    s_branch                       src0
-    s_cbranch_cdbgsys              src0
-    s_cbranch_cdbgsys_and_user     src0
-    s_cbranch_cdbgsys_or_user      src0
-    s_cbranch_cdbguser             src0
-    s_cbranch_execnz               src0
-    s_cbranch_execz                src0
-    s_cbranch_scc0                 src0
-    s_cbranch_scc1                 src0
-    s_cbranch_vccnz                src0
-    s_cbranch_vccz                 src0
-    s_decperflevel                 src0
-    s_endpgm
-    s_icache_inv
-    s_incperflevel                 src0
-    s_nop                          src0
-    s_sendmsg                      src0
-    s_sendmsghalt                  src0
-    s_sethalt                      src0
-    s_setkill                      src0
-    s_setprio                      src0
-    s_sleep                        src0
-    s_trap                         src0
-    s_ttracedata
-    s_waitcnt                      src0
-
-VINTRP
-===========================
-
-.. parsed-literal::
-
-    v_interp_mov_f32               dst, src0, src1
-    v_interp_p1_f32                dst, src0, src1
-    v_interp_p2_f32                dst, src0, src1
-
-VOP1
-===========================
-
-.. parsed-literal::
-
-    v_bfrev_b32                    dst, src0
-    v_ceil_f32                     dst, src0
-    v_ceil_f64                     dst, src0
-    v_clrexcp
-    v_cos_f32                      dst, src0
-    v_cvt_f16_f32                  dst, src0
-    v_cvt_f32_f16                  dst, src0
-    v_cvt_f32_f64                  dst, src0
-    v_cvt_f32_i32                  dst, src0
-    v_cvt_f32_u32                  dst, src0
-    v_cvt_f32_ubyte0               dst, src0
-    v_cvt_f32_ubyte1               dst, src0
-    v_cvt_f32_ubyte2               dst, src0
-    v_cvt_f32_ubyte3               dst, src0
-    v_cvt_f64_f32                  dst, src0
-    v_cvt_f64_i32                  dst, src0
-    v_cvt_f64_u32                  dst, src0
-    v_cvt_flr_i32_f32              dst, src0
-    v_cvt_i32_f32                  dst, src0
-    v_cvt_i32_f64                  dst, src0
-    v_cvt_off_f32_i4               dst, src0
-    v_cvt_rpi_i32_f32              dst, src0
-    v_cvt_u32_f32                  dst, src0
-    v_cvt_u32_f64                  dst, src0
-    v_exp_f32                      dst, src0
-    v_exp_legacy_f32               dst, src0
-    v_ffbh_i32                     dst, src0
-    v_ffbh_u32                     dst, src0
-    v_ffbl_b32                     dst, src0
-    v_floor_f32                    dst, src0
-    v_floor_f64                    dst, src0
-    v_fract_f32                    dst, src0
-    v_fract_f64                    dst, src0
-    v_frexp_exp_i32_f32            dst, src0
-    v_frexp_exp_i32_f64            dst, src0
-    v_frexp_mant_f32               dst, src0
-    v_frexp_mant_f64               dst, src0
-    v_log_clamp_f32                dst, src0
-    v_log_f32                      dst, src0
-    v_log_legacy_f32               dst, src0
-    v_mov_b32                      dst, src0
-    v_mov_fed_b32                  dst, src0
-    v_movreld_b32                  dst, src0
-    v_movrels_b32                  dst, src0
-    v_movrelsd_b32                 dst, src0
-    v_nop
-    v_not_b32                      dst, src0
-    v_rcp_clamp_f32                dst, src0
-    v_rcp_clamp_f64                dst, src0
-    v_rcp_f32                      dst, src0
-    v_rcp_f64                      dst, src0
-    v_rcp_iflag_f32                dst, src0
-    v_rcp_legacy_f32               dst, src0
-    v_readfirstlane_b32            dst, src0
-    v_rndne_f32                    dst, src0
-    v_rndne_f64                    dst, src0
-    v_rsq_clamp_f32                dst, src0
-    v_rsq_clamp_f64                dst, src0
-    v_rsq_f32                      dst, src0
-    v_rsq_f64                      dst, src0
-    v_rsq_legacy_f32               dst, src0
-    v_sin_f32                      dst, src0
-    v_sqrt_f32                     dst, src0
-    v_sqrt_f64                     dst, src0
-    v_trunc_f32                    dst, src0
-    v_trunc_f64                    dst, src0
-
-VOP2
-===========================
-
-.. parsed-literal::
-
-    v_add_f32                      dst, src0, src1
-    v_add_i32                      dst0, dst1, src0, src1
-    v_addc_u32                     dst0, dst1, src0, src1, src2
-    v_and_b32                      dst, src0, src1
-    v_ashr_i32                     dst, src0, src1
-    v_ashrrev_i32                  dst, src0, src1
-    v_bcnt_u32_b32                 dst, src0, src1
-    v_bfm_b32                      dst, src0, src1
-    v_cndmask_b32                  dst, src0, src1, src2
-    v_cvt_pk_i16_i32               dst, src0, src1
-    v_cvt_pk_u16_u32               dst, src0, src1
-    v_cvt_pkaccum_u8_f32           dst, src0, src1
-    v_cvt_pknorm_i16_f32           dst, src0, src1
-    v_cvt_pknorm_u16_f32           dst, src0, src1
-    v_cvt_pkrtz_f16_f32            dst, src0, src1
-    v_ldexp_f32                    dst, src0, src1
-    v_lshl_b32                     dst, src0, src1
-    v_lshlrev_b32                  dst, src0, src1
-    v_lshr_b32                     dst, src0, src1
-    v_lshrrev_b32                  dst, src0, src1
-    v_mac_f32                      dst, src0, src1
-    v_mac_legacy_f32               dst, src0, src1
-    v_madak_f32                    dst, src0, src1, src2
-    v_madmk_f32                    dst, src0, src1, src2
-    v_max_f32                      dst, src0, src1
-    v_max_i32                      dst, src0, src1
-    v_max_legacy_f32               dst, src0, src1
-    v_max_u32                      dst, src0, src1
-    v_mbcnt_hi_u32_b32             dst, src0, src1
-    v_mbcnt_lo_u32_b32             dst, src0, src1
-    v_min_f32                      dst, src0, src1
-    v_min_i32                      dst, src0, src1
-    v_min_legacy_f32               dst, src0, src1
-    v_min_u32                      dst, src0, src1
-    v_mul_f32                      dst, src0, src1
-    v_mul_hi_i32_i24               dst, src0, src1
-    v_mul_hi_u32_u24               dst, src0, src1
-    v_mul_i32_i24                  dst, src0, src1
-    v_mul_legacy_f32               dst, src0, src1
-    v_mul_u32_u24                  dst, src0, src1
-    v_or_b32                       dst, src0, src1
-    v_readlane_b32                 dst, src0, src1
-    v_sub_f32                      dst, src0, src1
-    v_sub_i32                      dst0, dst1, src0, src1
-    v_subb_u32                     dst0, dst1, src0, src1, src2
-    v_subbrev_u32                  dst0, dst1, src0, src1, src2
-    v_subrev_f32                   dst, src0, src1
-    v_subrev_i32                   dst0, dst1, src0, src1
-    v_writelane_b32                dst, src0, src1
-    v_xor_b32                      dst, src0, src1
-
-VOP3
-===========================
-
-.. parsed-literal::
-
-    v_add_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_i32_e64                  dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_addc_u32_e64                 dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_alignbit_b32                 dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_alignbyte_b32                dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_and_b32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ashr_i32_e64                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ashr_i64                     dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ashrrev_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bcnt_u32_b32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bfe_i32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfe_u32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfi_b32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfm_b32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bfrev_b32_e64                dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ceil_f32_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ceil_f64_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_clrexcp_e64                                                 :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_class_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_class_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_f32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_f64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_i32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_i64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_u32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_u64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lg_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lg_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_neq_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_neq_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nge_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nge_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ngt_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ngt_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nle_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nle_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlg_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlg_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlt_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlt_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_o_f32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_o_f64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_i32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_i64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_u32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_u64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_tru_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_tru_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_u_f32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_u_f64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_eq_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_eq_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_f_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_f_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_ge_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_ge_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_gt_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_gt_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_le_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_le_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_lg_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_lg_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_lt_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_lt_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_neq_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_neq_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_nge_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_nge_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_ngt_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_ngt_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_nle_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_nle_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_nlg_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_nlg_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_nlt_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_nlt_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_o_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_o_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_tru_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_tru_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_u_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmps_u_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_eq_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_eq_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_f_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_f_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_ge_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_ge_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_gt_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_gt_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_le_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_le_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_lg_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_lg_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_lt_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_lt_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_neq_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_neq_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_nge_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_nge_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_ngt_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_ngt_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_nle_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_nle_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_nlg_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_nlg_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_nlt_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_nlt_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_o_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_o_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_tru_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_tru_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_u_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpsx_u_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_class_f32_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_class_f64_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lg_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lg_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_f32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_f64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_neq_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_neq_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nge_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nge_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ngt_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ngt_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nle_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nle_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlg_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlg_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlt_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlt_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_o_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_o_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_tru_f32_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_tru_f64_e64             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_u_f32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_u_f64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cndmask_b32_e64              dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_cos_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubeid_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubema_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubesc_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubetc_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f16_f32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_f16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_f64_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_i32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_u32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte0_e64           dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte1_e64           dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte2_e64           dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte3_e64           dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_f32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_i32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_u32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_flr_i32_f32_e64          dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_i32_f32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_i32_f64_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_off_f32_i4_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_i16_i32_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_u16_u32_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_u8_f32                dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pkaccum_u8_f32_e64       dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pknorm_i16_f32_e64       dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pknorm_u16_f32_e64       dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pkrtz_f16_f32_e64        dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_rpi_i32_f32_e64          dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_u32_f32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_u32_f64_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_f32                dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_f64                dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fmas_f32                 dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fmas_f64                 dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_scale_f32                dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_div_scale_f64                dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_exp_f32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_exp_legacy_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ffbh_i32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ffbh_u32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ffbl_b32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_floor_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_floor_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_f32                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_f64                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fract_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fract_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_exp_i32_f32_e64        dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_exp_i32_f64_e64        dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_mant_f32_e64           dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_mant_f64_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ldexp_f32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ldexp_f64                    dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_lerp_u8                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_log_clamp_f32_e64            dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_log_f32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_log_legacy_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_lshl_b32_e64                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshl_b64                     dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshlrev_b32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshr_b32_e64                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshr_b64                     dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshrrev_b32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mac_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mac_legacy_f32_e64           dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_f32                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i32_i24                  dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i64_i32                  dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_mad_legacy_f32               dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u32_u24                  dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u64_u32                  dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_max3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_max3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_max_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_i32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_max_legacy_f32_e64           dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_u32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mbcnt_hi_u32_b32_e64         dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mbcnt_lo_u32_b32_e64         dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_med3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_med3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_med3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_i32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_min_legacy_f32_e64           dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_u32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mov_b32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_mov_fed_b32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_movreld_b32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_movrels_b32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_movrelsd_b32_e64             dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_mqsad_pk_u16_u8              dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_mqsad_u32_u8                 dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_msad_u8                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_mul_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_i32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_i32_i24_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_u32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_u32_u24_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_i32_i24_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_legacy_f32_e64           dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_lo_i32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_lo_u32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_u32_u24_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mullit_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_nop_e64                                                     :ref:`omod<amdgpu_synid_omod>`
-    v_not_b32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_or_b32_e64                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_qsad_pk_u16_u8               dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_clamp_f32_e64            dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_clamp_f64_e64            dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_f32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_f64_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_iflag_f32_e64            dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_legacy_f32_e64           dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_rndne_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rndne_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_clamp_f32_e64            dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_clamp_f64_e64            dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_f64_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_legacy_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_hi_u8                    dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u16                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u8                       dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_sin_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sqrt_f32_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sqrt_f64_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_i32_e64                  dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_subb_u32_e64                 dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_subbrev_u32_e64              dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_i32_e64               dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_trig_preop_f64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_trunc_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_trunc_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_xor_b32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-
-VOPC
-===========================
-
-.. parsed-literal::
-
-    v_cmp_class_f32                dst, src0, src1
-    v_cmp_class_f64                dst, src0, src1
-    v_cmp_eq_f32                   dst, src0, src1
-    v_cmp_eq_f64                   dst, src0, src1
-    v_cmp_eq_i32                   dst, src0, src1
-    v_cmp_eq_i64                   dst, src0, src1
-    v_cmp_eq_u32                   dst, src0, src1
-    v_cmp_eq_u64                   dst, src0, src1
-    v_cmp_f_f32                    dst, src0, src1
-    v_cmp_f_f64                    dst, src0, src1
-    v_cmp_f_i32                    dst, src0, src1
-    v_cmp_f_i64                    dst, src0, src1
-    v_cmp_f_u32                    dst, src0, src1
-    v_cmp_f_u64                    dst, src0, src1
-    v_cmp_ge_f32                   dst, src0, src1
-    v_cmp_ge_f64                   dst, src0, src1
-    v_cmp_ge_i32                   dst, src0, src1
-    v_cmp_ge_i64                   dst, src0, src1
-    v_cmp_ge_u32                   dst, src0, src1
-    v_cmp_ge_u64                   dst, src0, src1
-    v_cmp_gt_f32                   dst, src0, src1
-    v_cmp_gt_f64                   dst, src0, src1
-    v_cmp_gt_i32                   dst, src0, src1
-    v_cmp_gt_i64                   dst, src0, src1
-    v_cmp_gt_u32                   dst, src0, src1
-    v_cmp_gt_u64                   dst, src0, src1
-    v_cmp_le_f32                   dst, src0, src1
-    v_cmp_le_f64                   dst, src0, src1
-    v_cmp_le_i32                   dst, src0, src1
-    v_cmp_le_i64                   dst, src0, src1
-    v_cmp_le_u32                   dst, src0, src1
-    v_cmp_le_u64                   dst, src0, src1
-    v_cmp_lg_f32                   dst, src0, src1
-    v_cmp_lg_f64                   dst, src0, src1
-    v_cmp_lt_f32                   dst, src0, src1
-    v_cmp_lt_f64                   dst, src0, src1
-    v_cmp_lt_i32                   dst, src0, src1
-    v_cmp_lt_i64                   dst, src0, src1
-    v_cmp_lt_u32                   dst, src0, src1
-    v_cmp_lt_u64                   dst, src0, src1
-    v_cmp_ne_i32                   dst, src0, src1
-    v_cmp_ne_i64                   dst, src0, src1
-    v_cmp_ne_u32                   dst, src0, src1
-    v_cmp_ne_u64                   dst, src0, src1
-    v_cmp_neq_f32                  dst, src0, src1
-    v_cmp_neq_f64                  dst, src0, src1
-    v_cmp_nge_f32                  dst, src0, src1
-    v_cmp_nge_f64                  dst, src0, src1
-    v_cmp_ngt_f32                  dst, src0, src1
-    v_cmp_ngt_f64                  dst, src0, src1
-    v_cmp_nle_f32                  dst, src0, src1
-    v_cmp_nle_f64                  dst, src0, src1
-    v_cmp_nlg_f32                  dst, src0, src1
-    v_cmp_nlg_f64                  dst, src0, src1
-    v_cmp_nlt_f32                  dst, src0, src1
-    v_cmp_nlt_f64                  dst, src0, src1
-    v_cmp_o_f32                    dst, src0, src1
-    v_cmp_o_f64                    dst, src0, src1
-    v_cmp_t_i32                    dst, src0, src1
-    v_cmp_t_i64                    dst, src0, src1
-    v_cmp_t_u32                    dst, src0, src1
-    v_cmp_t_u64                    dst, src0, src1
-    v_cmp_tru_f32                  dst, src0, src1
-    v_cmp_tru_f64                  dst, src0, src1
-    v_cmp_u_f32                    dst, src0, src1
-    v_cmp_u_f64                    dst, src0, src1
-    v_cmps_eq_f32                  dst, src0, src1
-    v_cmps_eq_f64                  dst, src0, src1
-    v_cmps_f_f32                   dst, src0, src1
-    v_cmps_f_f64                   dst, src0, src1
-    v_cmps_ge_f32                  dst, src0, src1
-    v_cmps_ge_f64                  dst, src0, src1
-    v_cmps_gt_f32                  dst, src0, src1
-    v_cmps_gt_f64                  dst, src0, src1
-    v_cmps_le_f32                  dst, src0, src1
-    v_cmps_le_f64                  dst, src0, src1
-    v_cmps_lg_f32                  dst, src0, src1
-    v_cmps_lg_f64                  dst, src0, src1
-    v_cmps_lt_f32                  dst, src0, src1
-    v_cmps_lt_f64                  dst, src0, src1
-    v_cmps_neq_f32                 dst, src0, src1
-    v_cmps_neq_f64                 dst, src0, src1
-    v_cmps_nge_f32                 dst, src0, src1
-    v_cmps_nge_f64                 dst, src0, src1
-    v_cmps_ngt_f32                 dst, src0, src1
-    v_cmps_ngt_f64                 dst, src0, src1
-    v_cmps_nle_f32                 dst, src0, src1
-    v_cmps_nle_f64                 dst, src0, src1
-    v_cmps_nlg_f32                 dst, src0, src1
-    v_cmps_nlg_f64                 dst, src0, src1
-    v_cmps_nlt_f32                 dst, src0, src1
-    v_cmps_nlt_f64                 dst, src0, src1
-    v_cmps_o_f32                   dst, src0, src1
-    v_cmps_o_f64                   dst, src0, src1
-    v_cmps_tru_f32                 dst, src0, src1
-    v_cmps_tru_f64                 dst, src0, src1
-    v_cmps_u_f32                   dst, src0, src1
-    v_cmps_u_f64                   dst, src0, src1
-    v_cmpsx_eq_f32                 dst, src0, src1
-    v_cmpsx_eq_f64                 dst, src0, src1
-    v_cmpsx_f_f32                  dst, src0, src1
-    v_cmpsx_f_f64                  dst, src0, src1
-    v_cmpsx_ge_f32                 dst, src0, src1
-    v_cmpsx_ge_f64                 dst, src0, src1
-    v_cmpsx_gt_f32                 dst, src0, src1
-    v_cmpsx_gt_f64                 dst, src0, src1
-    v_cmpsx_le_f32                 dst, src0, src1
-    v_cmpsx_le_f64                 dst, src0, src1
-    v_cmpsx_lg_f32                 dst, src0, src1
-    v_cmpsx_lg_f64                 dst, src0, src1
-    v_cmpsx_lt_f32                 dst, src0, src1
-    v_cmpsx_lt_f64                 dst, src0, src1
-    v_cmpsx_neq_f32                dst, src0, src1
-    v_cmpsx_neq_f64                dst, src0, src1
-    v_cmpsx_nge_f32                dst, src0, src1
-    v_cmpsx_nge_f64                dst, src0, src1
-    v_cmpsx_ngt_f32                dst, src0, src1
-    v_cmpsx_ngt_f64                dst, src0, src1
-    v_cmpsx_nle_f32                dst, src0, src1
-    v_cmpsx_nle_f64                dst, src0, src1
-    v_cmpsx_nlg_f32                dst, src0, src1
-    v_cmpsx_nlg_f64                dst, src0, src1
-    v_cmpsx_nlt_f32                dst, src0, src1
-    v_cmpsx_nlt_f64                dst, src0, src1
-    v_cmpsx_o_f32                  dst, src0, src1
-    v_cmpsx_o_f64                  dst, src0, src1
-    v_cmpsx_tru_f32                dst, src0, src1
-    v_cmpsx_tru_f64                dst, src0, src1
-    v_cmpsx_u_f32                  dst, src0, src1
-    v_cmpsx_u_f64                  dst, src0, src1
-    v_cmpx_class_f32               dst, src0, src1
-    v_cmpx_class_f64               dst, src0, src1
-    v_cmpx_eq_f32                  dst, src0, src1
-    v_cmpx_eq_f64                  dst, src0, src1
-    v_cmpx_eq_i32                  dst, src0, src1
-    v_cmpx_eq_i64                  dst, src0, src1
-    v_cmpx_eq_u32                  dst, src0, src1
-    v_cmpx_eq_u64                  dst, src0, src1
-    v_cmpx_f_f32                   dst, src0, src1
-    v_cmpx_f_f64                   dst, src0, src1
-    v_cmpx_f_i32                   dst, src0, src1
-    v_cmpx_f_i64                   dst, src0, src1
-    v_cmpx_f_u32                   dst, src0, src1
-    v_cmpx_f_u64                   dst, src0, src1
-    v_cmpx_ge_f32                  dst, src0, src1
-    v_cmpx_ge_f64                  dst, src0, src1
-    v_cmpx_ge_i32                  dst, src0, src1
-    v_cmpx_ge_i64                  dst, src0, src1
-    v_cmpx_ge_u32                  dst, src0, src1
-    v_cmpx_ge_u64                  dst, src0, src1
-    v_cmpx_gt_f32                  dst, src0, src1
-    v_cmpx_gt_f64                  dst, src0, src1
-    v_cmpx_gt_i32                  dst, src0, src1
-    v_cmpx_gt_i64                  dst, src0, src1
-    v_cmpx_gt_u32                  dst, src0, src1
-    v_cmpx_gt_u64                  dst, src0, src1
-    v_cmpx_le_f32                  dst, src0, src1
-    v_cmpx_le_f64                  dst, src0, src1
-    v_cmpx_le_i32                  dst, src0, src1
-    v_cmpx_le_i64                  dst, src0, src1
-    v_cmpx_le_u32                  dst, src0, src1
-    v_cmpx_le_u64                  dst, src0, src1
-    v_cmpx_lg_f32                  dst, src0, src1
-    v_cmpx_lg_f64                  dst, src0, src1
-    v_cmpx_lt_f32                  dst, src0, src1
-    v_cmpx_lt_f64                  dst, src0, src1
-    v_cmpx_lt_i32                  dst, src0, src1
-    v_cmpx_lt_i64                  dst, src0, src1
-    v_cmpx_lt_u32                  dst, src0, src1
-    v_cmpx_lt_u64                  dst, src0, src1
-    v_cmpx_ne_i32                  dst, src0, src1
-    v_cmpx_ne_i64                  dst, src0, src1
-    v_cmpx_ne_u32                  dst, src0, src1
-    v_cmpx_ne_u64                  dst, src0, src1
-    v_cmpx_neq_f32                 dst, src0, src1
-    v_cmpx_neq_f64                 dst, src0, src1
-    v_cmpx_nge_f32                 dst, src0, src1
-    v_cmpx_nge_f64                 dst, src0, src1
-    v_cmpx_ngt_f32                 dst, src0, src1
-    v_cmpx_ngt_f64                 dst, src0, src1
-    v_cmpx_nle_f32                 dst, src0, src1
-    v_cmpx_nle_f64                 dst, src0, src1
-    v_cmpx_nlg_f32                 dst, src0, src1
-    v_cmpx_nlg_f64                 dst, src0, src1
-    v_cmpx_nlt_f32                 dst, src0, src1
-    v_cmpx_nlt_f64                 dst, src0, src1
-    v_cmpx_o_f32                   dst, src0, src1
-    v_cmpx_o_f64                   dst, src0, src1
-    v_cmpx_t_i32                   dst, src0, src1
-    v_cmpx_t_i64                   dst, src0, src1
-    v_cmpx_t_u32                   dst, src0, src1
-    v_cmpx_t_u64                   dst, src0, src1
-    v_cmpx_tru_f32                 dst, src0, src1
-    v_cmpx_tru_f64                 dst, src0, src1
-    v_cmpx_u_f32                   dst, src0, src1
-    v_cmpx_u_f64                   dst, src0, src1
diff --git a/docs/AMDGPUAsmGFX8.rst b/docs/AMDGPUAsmGFX8.rst
deleted file mode 100644
index 44c48432fda11e6d79474f6fa3e9676df0d8995a..0000000000000000000000000000000000000000
--- a/docs/AMDGPUAsmGFX8.rst
+++ /dev/null
@@ -1,1672 +0,0 @@
-..
-    **************************************************
-    *                                                *
-    *   Automatically generated file, do not edit!   *
-    *                                                *
-    **************************************************
-
-===========================
-Syntax of GFX8 Instructions
-===========================
-
-.. contents::
-  :local:
-
-
-DS
-===========================
-
-.. parsed-literal::
-
-    ds_add_f32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_rtn_f32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_src2_f32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_b32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_b64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_rtn_b32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_rtn_b64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_src2_b32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_src2_b64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_append                      dst                            :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_bpermute_b32                dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>`
-    ds_cmpst_b32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_b64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_f32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_f64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_b32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_b64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_f32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_f64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_condxchg32_rtn_b64          dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_consume                     dst                            :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_barrier                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_init                    src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_br                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_p                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_release_all                                       :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_v                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_f32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_f64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_i32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_i64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_f32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_f64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_i32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_i64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_f32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_f64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_i32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_i64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_f32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_f64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_i32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_i64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_f32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_f64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_i32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_i64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_f32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_f64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_i32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_i64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_b32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_b64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_rtn_b32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_rtn_b64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_nop
-    ds_or_b32                      src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_b64                      src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_rtn_b32                  dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_rtn_b64                  dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_src2_b32                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_src2_b64                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_ordered_count               dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_permute_b32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>`
-    ds_read2_b32                   dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2_b64                   dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2st64_b32               dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2st64_b64               dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b128                   dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b32                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b64                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b96                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_i16                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_i8                     dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u16                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u8                     dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_rtn_u32                dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_rtn_u64                dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_src2_u32               src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_src2_u64               src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_u32                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_u64                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_swizzle_b32                 dst, src0                      :ref:`sw_offset16<amdgpu_synid_sw_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrap_rtn_b32                dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2_b32                  src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2_b64                  src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2st64_b32              src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2st64_b64              src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b128                  src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b16                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b32                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b64                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b8                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b96                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_src2_b32              src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_src2_b64              src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2_rtn_b32             dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2_rtn_b64             dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2st64_rtn_b32         dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2st64_rtn_b64         dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg_rtn_b32              dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg_rtn_b64              dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_b32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_b64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_rtn_b32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_rtn_b64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_src2_b32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_src2_b64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-
-EXP
-===========================
-
-.. parsed-literal::
-
-    exp                            dst, src0, src1, src2, src3    :ref:`done<amdgpu_synid_done>` :ref:`compr<amdgpu_synid_compr>` :ref:`vm<amdgpu_synid_vm>`
-
-FLAT
-===========================
-
-.. parsed-literal::
-
-    flat_atomic_add                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_add_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_and                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_and_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_cmpswap            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_cmpswap_x2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_dec                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_dec_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_inc                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_inc_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_or                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_or_x2              dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smax               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smax_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smin               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smin_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_sub                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_sub_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_swap               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_swap_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umax               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umax_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umin               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umin_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_xor                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_xor_x2             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dword                dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx2              dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx3              dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx4              dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_sbyte                dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_sshort               dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_ubyte                dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_ushort               dst, src0                      :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_byte                src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dword               src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx2             src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx3             src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx4             src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_short               src0, src1                     :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-
-MIMG
-===========================
-
-.. parsed-literal::
-
-    image_atomic_add               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_and               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_cmpswap           dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_dec               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_inc               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_or                dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_smax              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_smin              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_sub               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_swap              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_umax              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_umin              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_xor               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4                  dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_b                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_b_cl             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_b_cl_o           dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_b_o              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_b              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_b_cl           dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_b_cl_o         dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_b_o            dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_cl             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_cl_o           dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_l              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_l_o            dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_lz             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_lz_o           dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_o              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_cl               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_cl_o             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_l                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_l_o              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_lz               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_lz_o             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_o                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_get_lod                  dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_get_resinfo              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load                     dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_load_mip                 dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_load_mip_pck             dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_mip_pck_sgn         dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_pck                 dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_pck_sgn             dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample                   dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_b                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_b_cl              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_c                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_c_b               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_c_b_cl            dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_c_cl              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_c_l               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_c_lz              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_cl                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_l                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_lz                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_store                    src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_store_mip                src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_store_mip_pck            src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_store_pck                src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-
-MUBUF
-===========================
-
-.. parsed-literal::
-
-    buffer_atomic_add              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_add_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_and              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_and_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_cmpswap          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_cmpswap_x2       dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_dec              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_dec_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_inc              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_inc_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_or               dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_or_x2            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smax             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smax_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smin             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smin_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_sub              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_sub_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_swap             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_swap_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umax             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umax_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umin             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umin_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_xor              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_xor_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dword              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_dwordx2            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dwordx3            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dwordx4            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_x       dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_xy      dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_xyz     dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_xyzw    dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_x           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_format_xy          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_xyz         dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_xyzw        dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_sbyte              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_sshort             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_ubyte              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_ushort             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_store_byte              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dword             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx3           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx4           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_x      src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_xy     src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_xyz    src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_xyzw   src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_x          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xy         src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xyz        src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xyzw       src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_lds_dword         src0, src1                     :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`lds<amdgpu_synid_lds>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_short             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_wbinvl1
-    buffer_wbinvl1_vol
-
-SMEM
-===========================
-
-.. parsed-literal::
-
-    s_atc_probe                    src0, src1, src2
-    s_atc_probe_buffer             src0, src1, src2
-    s_buffer_load_dword            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dwordx16         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dwordx2          dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dwordx4          dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dwordx8          dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_store_dword           src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_store_dwordx2         src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_store_dwordx4         src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_dcache_inv
-    s_dcache_inv_vol
-    s_dcache_wb
-    s_dcache_wb_vol
-    s_load_dword                   dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_load_dwordx16                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_load_dwordx2                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_load_dwordx4                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_load_dwordx8                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_memrealtime                  dst
-    s_memtime                      dst
-    s_store_dword                  src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_store_dwordx2                src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_store_dwordx4                src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-
-SOP1
-===========================
-
-.. parsed-literal::
-
-    s_abs_i32                      dst, src0
-    s_and_saveexec_b64             dst, src0
-    s_andn2_saveexec_b64           dst, src0
-    s_bcnt0_i32_b32                dst, src0
-    s_bcnt0_i32_b64                dst, src0
-    s_bcnt1_i32_b32                dst, src0
-    s_bcnt1_i32_b64                dst, src0
-    s_bitset0_b32                  dst, src0
-    s_bitset0_b64                  dst, src0
-    s_bitset1_b32                  dst, src0
-    s_bitset1_b64                  dst, src0
-    s_brev_b32                     dst, src0
-    s_brev_b64                     dst, src0
-    s_cbranch_join                 src0
-    s_cmov_b32                     dst, src0
-    s_cmov_b64                     dst, src0
-    s_ff0_i32_b32                  dst, src0
-    s_ff0_i32_b64                  dst, src0
-    s_ff1_i32_b32                  dst, src0
-    s_ff1_i32_b64                  dst, src0
-    s_flbit_i32                    dst, src0
-    s_flbit_i32_b32                dst, src0
-    s_flbit_i32_b64                dst, src0
-    s_flbit_i32_i64                dst, src0
-    s_getpc_b64                    dst
-    s_mov_b32                      dst, src0
-    s_mov_b64                      dst, src0
-    s_mov_fed_b32                  dst, src0
-    s_movreld_b32                  dst, src0
-    s_movreld_b64                  dst, src0
-    s_movrels_b32                  dst, src0
-    s_movrels_b64                  dst, src0
-    s_nand_saveexec_b64            dst, src0
-    s_nor_saveexec_b64             dst, src0
-    s_not_b32                      dst, src0
-    s_not_b64                      dst, src0
-    s_or_saveexec_b64              dst, src0
-    s_orn2_saveexec_b64            dst, src0
-    s_quadmask_b32                 dst, src0
-    s_quadmask_b64                 dst, src0
-    s_rfe_b64                      src0
-    s_set_gpr_idx_idx              src0
-    s_setpc_b64                    src0
-    s_sext_i32_i16                 dst, src0
-    s_sext_i32_i8                  dst, src0
-    s_swappc_b64                   dst, src0
-    s_wqm_b32                      dst, src0
-    s_wqm_b64                      dst, src0
-    s_xnor_saveexec_b64            dst, src0
-    s_xor_saveexec_b64             dst, src0
-
-SOP2
-===========================
-
-.. parsed-literal::
-
-    s_absdiff_i32                  dst, src0, src1
-    s_add_i32                      dst, src0, src1
-    s_add_u32                      dst, src0, src1
-    s_addc_u32                     dst, src0, src1
-    s_and_b32                      dst, src0, src1
-    s_and_b64                      dst, src0, src1
-    s_andn2_b32                    dst, src0, src1
-    s_andn2_b64                    dst, src0, src1
-    s_ashr_i32                     dst, src0, src1
-    s_ashr_i64                     dst, src0, src1
-    s_bfe_i32                      dst, src0, src1
-    s_bfe_i64                      dst, src0, src1
-    s_bfe_u32                      dst, src0, src1
-    s_bfe_u64                      dst, src0, src1
-    s_bfm_b32                      dst, src0, src1
-    s_bfm_b64                      dst, src0, src1
-    s_cbranch_g_fork               src0, src1
-    s_cselect_b32                  dst, src0, src1
-    s_cselect_b64                  dst, src0, src1
-    s_lshl_b32                     dst, src0, src1
-    s_lshl_b64                     dst, src0, src1
-    s_lshr_b32                     dst, src0, src1
-    s_lshr_b64                     dst, src0, src1
-    s_max_i32                      dst, src0, src1
-    s_max_u32                      dst, src0, src1
-    s_min_i32                      dst, src0, src1
-    s_min_u32                      dst, src0, src1
-    s_mul_i32                      dst, src0, src1
-    s_nand_b32                     dst, src0, src1
-    s_nand_b64                     dst, src0, src1
-    s_nor_b32                      dst, src0, src1
-    s_nor_b64                      dst, src0, src1
-    s_or_b32                       dst, src0, src1
-    s_or_b64                       dst, src0, src1
-    s_orn2_b32                     dst, src0, src1
-    s_orn2_b64                     dst, src0, src1
-    s_rfe_restore_b64              src0, src1
-    s_sub_i32                      dst, src0, src1
-    s_sub_u32                      dst, src0, src1
-    s_subb_u32                     dst, src0, src1
-    s_xnor_b32                     dst, src0, src1
-    s_xnor_b64                     dst, src0, src1
-    s_xor_b32                      dst, src0, src1
-    s_xor_b64                      dst, src0, src1
-
-SOPC
-===========================
-
-.. parsed-literal::
-
-    s_bitcmp0_b32                  src0, src1
-    s_bitcmp0_b64                  src0, src1
-    s_bitcmp1_b32                  src0, src1
-    s_bitcmp1_b64                  src0, src1
-    s_cmp_eq_i32                   src0, src1
-    s_cmp_eq_u32                   src0, src1
-    s_cmp_eq_u64                   src0, src1
-    s_cmp_ge_i32                   src0, src1
-    s_cmp_ge_u32                   src0, src1
-    s_cmp_gt_i32                   src0, src1
-    s_cmp_gt_u32                   src0, src1
-    s_cmp_le_i32                   src0, src1
-    s_cmp_le_u32                   src0, src1
-    s_cmp_lg_i32                   src0, src1
-    s_cmp_lg_u32                   src0, src1
-    s_cmp_lg_u64                   src0, src1
-    s_cmp_lt_i32                   src0, src1
-    s_cmp_lt_u32                   src0, src1
-    s_set_gpr_idx_on               src0, src1
-    s_setvskip                     src0, src1
-
-SOPK
-===========================
-
-.. parsed-literal::
-
-    s_addk_i32                     dst, src0
-    s_cbranch_i_fork               src0, src1
-    s_cmovk_i32                    dst, src0
-    s_cmpk_eq_i32                  src0, src1
-    s_cmpk_eq_u32                  src0, src1
-    s_cmpk_ge_i32                  src0, src1
-    s_cmpk_ge_u32                  src0, src1
-    s_cmpk_gt_i32                  src0, src1
-    s_cmpk_gt_u32                  src0, src1
-    s_cmpk_le_i32                  src0, src1
-    s_cmpk_le_u32                  src0, src1
-    s_cmpk_lg_i32                  src0, src1
-    s_cmpk_lg_u32                  src0, src1
-    s_cmpk_lt_i32                  src0, src1
-    s_cmpk_lt_u32                  src0, src1
-    s_getreg_b32                   dst, src0
-    s_movk_i32                     dst, src0
-    s_mulk_i32                     dst, src0
-    s_setreg_b32                   dst, src0
-    s_setreg_imm32_b32             dst, src0
-
-SOPP
-===========================
-
-.. parsed-literal::
-
-    s_barrier
-    s_branch                       src0
-    s_cbranch_cdbgsys              src0
-    s_cbranch_cdbgsys_and_user     src0
-    s_cbranch_cdbgsys_or_user      src0
-    s_cbranch_cdbguser             src0
-    s_cbranch_execnz               src0
-    s_cbranch_execz                src0
-    s_cbranch_scc0                 src0
-    s_cbranch_scc1                 src0
-    s_cbranch_vccnz                src0
-    s_cbranch_vccz                 src0
-    s_decperflevel                 src0
-    s_endpgm
-    s_endpgm_saved
-    s_icache_inv
-    s_incperflevel                 src0
-    s_nop                          src0
-    s_sendmsg                      src0
-    s_sendmsghalt                  src0
-    s_set_gpr_idx_mode             src0
-    s_set_gpr_idx_off
-    s_sethalt                      src0
-    s_setkill                      src0
-    s_setprio                      src0
-    s_sleep                        src0
-    s_trap                         src0
-    s_ttracedata
-    s_waitcnt                      src0
-    s_wakeup
-
-VINTRP
-===========================
-
-.. parsed-literal::
-
-    v_interp_mov_f32               dst, src0, src1
-    v_interp_p1_f32                dst, src0, src1
-    v_interp_p2_f32                dst, src0, src1
-
-VOP1
-===========================
-
-.. parsed-literal::
-
-    v_bfrev_b32                    dst, src0
-    v_bfrev_b32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_bfrev_b32_sdwa               dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ceil_f16                     dst, src0
-    v_ceil_f16_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ceil_f16_sdwa                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ceil_f32                     dst, src0
-    v_ceil_f32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ceil_f32_sdwa                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ceil_f64                     dst, src0
-    v_clrexcp
-    v_cos_f16                      dst, src0
-    v_cos_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cos_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cos_f32                      dst, src0
-    v_cos_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cos_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f16_f32                  dst, src0
-    v_cvt_f16_f32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f16_f32_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f16_i16                  dst, src0
-    v_cvt_f16_i16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f16_i16_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f16_u16                  dst, src0
-    v_cvt_f16_u16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f16_u16_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_f16                  dst, src0
-    v_cvt_f32_f16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_f16_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_f64                  dst, src0
-    v_cvt_f32_i32                  dst, src0
-    v_cvt_f32_i32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_i32_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_u32                  dst, src0
-    v_cvt_f32_u32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_u32_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_ubyte0               dst, src0
-    v_cvt_f32_ubyte0_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_ubyte0_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_ubyte1               dst, src0
-    v_cvt_f32_ubyte1_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_ubyte1_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_ubyte2               dst, src0
-    v_cvt_f32_ubyte2_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_ubyte2_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_ubyte3               dst, src0
-    v_cvt_f32_ubyte3_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_ubyte3_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f64_f32                  dst, src0
-    v_cvt_f64_i32                  dst, src0
-    v_cvt_f64_u32                  dst, src0
-    v_cvt_flr_i32_f32              dst, src0
-    v_cvt_flr_i32_f32_dpp          dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_flr_i32_f32_sdwa         dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_i16_f16                  dst, src0
-    v_cvt_i16_f16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_i16_f16_sdwa             dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_i32_f32                  dst, src0
-    v_cvt_i32_f32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_i32_f32_sdwa             dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_i32_f64                  dst, src0
-    v_cvt_off_f32_i4               dst, src0
-    v_cvt_off_f32_i4_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_off_f32_i4_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_rpi_i32_f32              dst, src0
-    v_cvt_rpi_i32_f32_dpp          dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_rpi_i32_f32_sdwa         dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_u16_f16                  dst, src0
-    v_cvt_u16_f16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_u16_f16_sdwa             dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_u32_f32                  dst, src0
-    v_cvt_u32_f32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_u32_f32_sdwa             dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_u32_f64                  dst, src0
-    v_exp_f16                      dst, src0
-    v_exp_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_exp_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_exp_f32                      dst, src0
-    v_exp_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_exp_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_exp_legacy_f32               dst, src0
-    v_exp_legacy_f32_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_exp_legacy_f32_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ffbh_i32                     dst, src0
-    v_ffbh_i32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ffbh_i32_sdwa                dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ffbh_u32                     dst, src0
-    v_ffbh_u32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ffbh_u32_sdwa                dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ffbl_b32                     dst, src0
-    v_ffbl_b32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ffbl_b32_sdwa                dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_floor_f16                    dst, src0
-    v_floor_f16_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_floor_f16_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_floor_f32                    dst, src0
-    v_floor_f32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_floor_f32_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_floor_f64                    dst, src0
-    v_fract_f16                    dst, src0
-    v_fract_f16_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_fract_f16_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_fract_f32                    dst, src0
-    v_fract_f32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_fract_f32_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_fract_f64                    dst, src0
-    v_frexp_exp_i16_f16            dst, src0
-    v_frexp_exp_i16_f16_dpp        dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_frexp_exp_i16_f16_sdwa       dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_frexp_exp_i32_f32            dst, src0
-    v_frexp_exp_i32_f32_dpp        dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_frexp_exp_i32_f32_sdwa       dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_frexp_exp_i32_f64            dst, src0
-    v_frexp_mant_f16               dst, src0
-    v_frexp_mant_f16_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_frexp_mant_f16_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_frexp_mant_f32               dst, src0
-    v_frexp_mant_f32_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_frexp_mant_f32_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_frexp_mant_f64               dst, src0
-    v_log_f16                      dst, src0
-    v_log_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_log_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_log_f32                      dst, src0
-    v_log_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_log_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_log_legacy_f32               dst, src0
-    v_log_legacy_f32_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_log_legacy_f32_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_mov_b32                      dst, src0
-    v_mov_b32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mov_b32_sdwa                 dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_mov_fed_b32                  dst, src0
-    v_mov_fed_b32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mov_fed_b32_sdwa             dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_movreld_b32                  dst, src0
-    v_movrels_b32                  dst, src0
-    v_movrelsd_b32                 dst, src0
-    v_nop
-    v_not_b32                      dst, src0
-    v_not_b32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_not_b32_sdwa                 dst, src0                      :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rcp_f16                      dst, src0
-    v_rcp_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rcp_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rcp_f32                      dst, src0
-    v_rcp_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rcp_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rcp_f64                      dst, src0
-    v_rcp_iflag_f32                dst, src0
-    v_rcp_iflag_f32_dpp            dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rcp_iflag_f32_sdwa           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_readfirstlane_b32            dst, src0
-    v_rndne_f16                    dst, src0
-    v_rndne_f16_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rndne_f16_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rndne_f32                    dst, src0
-    v_rndne_f32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rndne_f32_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rndne_f64                    dst, src0
-    v_rsq_f16                      dst, src0
-    v_rsq_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rsq_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rsq_f32                      dst, src0
-    v_rsq_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rsq_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rsq_f64                      dst, src0
-    v_sin_f16                      dst, src0
-    v_sin_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sin_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sin_f32                      dst, src0
-    v_sin_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sin_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sqrt_f16                     dst, src0
-    v_sqrt_f16_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sqrt_f16_sdwa                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sqrt_f32                     dst, src0
-    v_sqrt_f32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sqrt_f32_sdwa                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sqrt_f64                     dst, src0
-    v_trunc_f16                    dst, src0
-    v_trunc_f16_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_trunc_f16_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_trunc_f32                    dst, src0
-    v_trunc_f32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_trunc_f32_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_trunc_f64                    dst, src0
-
-VOP2
-===========================
-
-.. parsed-literal::
-
-    v_add_f16                      dst, src0, src1
-    v_add_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_add_f32                      dst, src0, src1
-    v_add_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_add_u16                      dst, src0, src1
-    v_add_u16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_u16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_add_u32                      dst0, dst1, src0, src1
-    v_add_u32_dpp                  dst0, dst1, src0, src1         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_u32_sdwa                 dst0, dst1, src0, src1         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_addc_u32                     dst0, dst1, src0, src1, src2
-    v_addc_u32_dpp                 dst0, dst1, src0, src1, src2   :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_addc_u32_sdwa                dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_and_b32                      dst, src0, src1
-    v_and_b32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_and_b32_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_ashrrev_i16                  dst, src0, src1
-    v_ashrrev_i16_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ashrrev_i16_sdwa             dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_ashrrev_i32                  dst, src0, src1
-    v_ashrrev_i32_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ashrrev_i32_sdwa             dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cndmask_b32                  dst, src0, src1, src2
-    v_cndmask_b32_dpp              dst, src0, src1, src2          :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cndmask_b32_sdwa             dst, src0, src1, src2          :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_ldexp_f16                    dst, src0, src1
-    v_ldexp_f16_dpp                dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ldexp_f16_sdwa               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_lshlrev_b16                  dst, src0, src1
-    v_lshlrev_b16_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_lshlrev_b16_sdwa             dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_lshlrev_b32                  dst, src0, src1
-    v_lshlrev_b32_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_lshlrev_b32_sdwa             dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_lshrrev_b16                  dst, src0, src1
-    v_lshrrev_b16_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_lshrrev_b16_sdwa             dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_lshrrev_b32                  dst, src0, src1
-    v_lshrrev_b32_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_lshrrev_b32_sdwa             dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mac_f16                      dst, src0, src1
-    v_mac_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mac_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mac_f32                      dst, src0, src1
-    v_mac_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mac_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_madak_f16                    dst, src0, src1, src2
-    v_madak_f32                    dst, src0, src1, src2
-    v_madmk_f16                    dst, src0, src1, src2
-    v_madmk_f32                    dst, src0, src1, src2
-    v_max_f16                      dst, src0, src1
-    v_max_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_f32                      dst, src0, src1
-    v_max_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_i16                      dst, src0, src1
-    v_max_i16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_i16_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_i32                      dst, src0, src1
-    v_max_i32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_i32_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_u16                      dst, src0, src1
-    v_max_u16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_u16_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_u32                      dst, src0, src1
-    v_max_u32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_u32_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_f16                      dst, src0, src1
-    v_min_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_f32                      dst, src0, src1
-    v_min_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_i16                      dst, src0, src1
-    v_min_i16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_i16_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_i32                      dst, src0, src1
-    v_min_i32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_i32_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_u16                      dst, src0, src1
-    v_min_u16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_u16_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_u32                      dst, src0, src1
-    v_min_u32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_u32_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_f16                      dst, src0, src1
-    v_mul_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_f32                      dst, src0, src1
-    v_mul_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_hi_i32_i24               dst, src0, src1
-    v_mul_hi_i32_i24_dpp           dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_hi_i32_i24_sdwa          dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_hi_u32_u24               dst, src0, src1
-    v_mul_hi_u32_u24_dpp           dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_hi_u32_u24_sdwa          dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_i32_i24                  dst, src0, src1
-    v_mul_i32_i24_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_i32_i24_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_legacy_f32               dst, src0, src1
-    v_mul_legacy_f32_dpp           dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_legacy_f32_sdwa          dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_lo_u16                   dst, src0, src1
-    v_mul_lo_u16_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_lo_u16_sdwa              dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_u32_u24                  dst, src0, src1
-    v_mul_u32_u24_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_u32_u24_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_or_b32                       dst, src0, src1
-    v_or_b32_dpp                   dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_or_b32_sdwa                  dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_f16                      dst, src0, src1
-    v_sub_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_f32                      dst, src0, src1
-    v_sub_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_u16                      dst, src0, src1
-    v_sub_u16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_u16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_u32                      dst0, dst1, src0, src1
-    v_sub_u32_dpp                  dst0, dst1, src0, src1         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_u32_sdwa                 dst0, dst1, src0, src1         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subb_u32                     dst0, dst1, src0, src1, src2
-    v_subb_u32_dpp                 dst0, dst1, src0, src1, src2   :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subb_u32_sdwa                dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subbrev_u32                  dst0, dst1, src0, src1, src2
-    v_subbrev_u32_dpp              dst0, dst1, src0, src1, src2   :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subbrev_u32_sdwa             dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_f16                   dst, src0, src1
-    v_subrev_f16_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_f32                   dst, src0, src1
-    v_subrev_f32_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_u16                   dst, src0, src1
-    v_subrev_u16_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_u16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_u32                   dst0, dst1, src0, src1
-    v_subrev_u32_dpp               dst0, dst1, src0, src1         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_u32_sdwa              dst0, dst1, src0, src1         :ref:`clamp<amdgpu_synid_clamp>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_xor_b32                      dst, src0, src1
-    v_xor_b32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_xor_b32_sdwa                 dst, src0, src1                :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-
-VOP3
-===========================
-
-.. parsed-literal::
-
-    v_add_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_u16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_add_u32_e64                  dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_addc_u32_e64                 dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_alignbit_b32                 dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_alignbyte_b32                dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_and_b32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ashrrev_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ashrrev_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ashrrev_i64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bcnt_u32_b32                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bfe_i32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfe_u32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfi_b32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfm_b32                      dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bfrev_b32_e64                dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ceil_f16_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ceil_f32_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ceil_f64_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_clrexcp_e64                                                 :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_class_f16_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_class_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_class_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_f16_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_f32_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_f64_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_i16_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_i32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_i64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_u16_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_u32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_u64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lg_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lg_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lg_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_neq_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_neq_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_neq_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nge_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nge_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nge_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ngt_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ngt_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ngt_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nle_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nle_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nle_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlg_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlg_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlg_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlt_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlt_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlt_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_o_f16_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_o_f32_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_o_f64_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_i16_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_i32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_i64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_u16_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_u32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_u64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_tru_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_tru_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_tru_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_u_f16_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_u_f32_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_u_f64_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_class_f16_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_class_f32_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_class_f64_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lg_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lg_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lg_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_neq_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_neq_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_neq_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nge_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nge_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nge_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ngt_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ngt_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ngt_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nle_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nle_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nle_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlg_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlg_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlg_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlt_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlt_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlt_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_o_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_o_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_o_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_tru_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_tru_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_tru_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_u_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_u_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_u_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cndmask_b32_e64              dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_cos_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cos_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubeid_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubema_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubesc_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubetc_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f16_f32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f16_i16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f16_u16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_f16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_f64_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_i32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_u32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte0_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte1_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte2_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte3_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_f32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_i32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_u32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_flr_i32_f32_e64          dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_i16_f16_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_i32_f32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_i32_f64_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_off_f32_i4_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_i16_i32               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_u16_u32               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_u8_f32                dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pkaccum_u8_f32           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pknorm_i16_f32           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pknorm_u16_f32           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pkrtz_f16_f32            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_rpi_i32_f32_e64          dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_u16_f16_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_u32_f32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_u32_f64_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_f16                dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_f32                dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_f64                dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fmas_f32                 dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fmas_f64                 dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_scale_f32                dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_div_scale_f64                dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_exp_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_exp_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_exp_legacy_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ffbh_i32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ffbh_u32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ffbl_b32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_floor_f16_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_floor_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_floor_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_f16                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_f32                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_f64                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fract_f16_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fract_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fract_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_exp_i16_f16_e64        dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_exp_i32_f32_e64        dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_exp_i32_f64_e64        dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_mant_f16_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_mant_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_mant_f64_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_mov_f32_e64           dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p1_f32_e64            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p1ll_f16              dst, src0, src1                :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p1lv_f16              dst, src0, src1, src2          :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p2_f16                dst, src0, src1, src2          :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p2_f32_e64            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ldexp_f16_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ldexp_f32                    dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ldexp_f64                    dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_lerp_u8                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_log_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_log_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_log_legacy_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_lshlrev_b16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshlrev_b32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshlrev_b64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshrrev_b16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshrrev_b32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshrrev_b64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mac_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mac_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_f16                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_f32                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i16                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i32_i24                  dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i64_i32                  dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_legacy_f32               dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u16                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u32_u24                  dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u64_u32                  dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_max3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_max_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_i16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_max_i32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_max_u16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_max_u32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mbcnt_hi_u32_b32             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mbcnt_lo_u32_b32             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_med3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_med3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_med3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_i16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_min_i32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_min_u16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_min_u32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mov_b32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_mov_fed_b32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_movreld_b32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_movrels_b32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_movrelsd_b32_e64             dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_mqsad_pk_u16_u8              dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mqsad_u32_u8                 dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_msad_u8                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_i32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_i32_i24_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_u32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_u32_u24_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_i32_i24_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_legacy_f32_e64           dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_lo_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_lo_u32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_u32_u24_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_nop_e64                                                     :ref:`omod<amdgpu_synid_omod>`
-    v_not_b32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_or_b32_e64                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_perm_b32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_qsad_pk_u16_u8               dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_f64_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_iflag_f32_e64            dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_readlane_b32                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_rndne_f16_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rndne_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rndne_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_f64_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_hi_u8                    dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u16                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u32                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u8                       dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sin_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sin_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sqrt_f16_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sqrt_f32_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sqrt_f64_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_u16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_sub_u32_e64                  dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_subb_u32_e64                 dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_subbrev_u32_e64              dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_u32_e64               dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_trig_preop_f64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_trunc_f16_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_trunc_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_trunc_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_writelane_b32                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_xor_b32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-
-VOPC
-===========================
-
-.. parsed-literal::
-
-    v_cmp_class_f16                dst, src0, src1
-    v_cmp_class_f16_sdwa           dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_class_f32                dst, src0, src1
-    v_cmp_class_f32_sdwa           dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_class_f64                dst, src0, src1
-    v_cmp_eq_f16                   dst, src0, src1
-    v_cmp_eq_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_f32                   dst, src0, src1
-    v_cmp_eq_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_f64                   dst, src0, src1
-    v_cmp_eq_i16                   dst, src0, src1
-    v_cmp_eq_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_i32                   dst, src0, src1
-    v_cmp_eq_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_i64                   dst, src0, src1
-    v_cmp_eq_u16                   dst, src0, src1
-    v_cmp_eq_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_u32                   dst, src0, src1
-    v_cmp_eq_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_u64                   dst, src0, src1
-    v_cmp_f_f16                    dst, src0, src1
-    v_cmp_f_f16_sdwa               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_f32                    dst, src0, src1
-    v_cmp_f_f32_sdwa               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_f64                    dst, src0, src1
-    v_cmp_f_i16                    dst, src0, src1
-    v_cmp_f_i16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_i32                    dst, src0, src1
-    v_cmp_f_i32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_i64                    dst, src0, src1
-    v_cmp_f_u16                    dst, src0, src1
-    v_cmp_f_u16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_u32                    dst, src0, src1
-    v_cmp_f_u32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_u64                    dst, src0, src1
-    v_cmp_ge_f16                   dst, src0, src1
-    v_cmp_ge_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_f32                   dst, src0, src1
-    v_cmp_ge_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_f64                   dst, src0, src1
-    v_cmp_ge_i16                   dst, src0, src1
-    v_cmp_ge_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_i32                   dst, src0, src1
-    v_cmp_ge_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_i64                   dst, src0, src1
-    v_cmp_ge_u16                   dst, src0, src1
-    v_cmp_ge_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_u32                   dst, src0, src1
-    v_cmp_ge_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_u64                   dst, src0, src1
-    v_cmp_gt_f16                   dst, src0, src1
-    v_cmp_gt_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_f32                   dst, src0, src1
-    v_cmp_gt_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_f64                   dst, src0, src1
-    v_cmp_gt_i16                   dst, src0, src1
-    v_cmp_gt_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_i32                   dst, src0, src1
-    v_cmp_gt_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_i64                   dst, src0, src1
-    v_cmp_gt_u16                   dst, src0, src1
-    v_cmp_gt_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_u32                   dst, src0, src1
-    v_cmp_gt_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_u64                   dst, src0, src1
-    v_cmp_le_f16                   dst, src0, src1
-    v_cmp_le_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_f32                   dst, src0, src1
-    v_cmp_le_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_f64                   dst, src0, src1
-    v_cmp_le_i16                   dst, src0, src1
-    v_cmp_le_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_i32                   dst, src0, src1
-    v_cmp_le_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_i64                   dst, src0, src1
-    v_cmp_le_u16                   dst, src0, src1
-    v_cmp_le_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_u32                   dst, src0, src1
-    v_cmp_le_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_u64                   dst, src0, src1
-    v_cmp_lg_f16                   dst, src0, src1
-    v_cmp_lg_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lg_f32                   dst, src0, src1
-    v_cmp_lg_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lg_f64                   dst, src0, src1
-    v_cmp_lt_f16                   dst, src0, src1
-    v_cmp_lt_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_f32                   dst, src0, src1
-    v_cmp_lt_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_f64                   dst, src0, src1
-    v_cmp_lt_i16                   dst, src0, src1
-    v_cmp_lt_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_i32                   dst, src0, src1
-    v_cmp_lt_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_i64                   dst, src0, src1
-    v_cmp_lt_u16                   dst, src0, src1
-    v_cmp_lt_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_u32                   dst, src0, src1
-    v_cmp_lt_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_u64                   dst, src0, src1
-    v_cmp_ne_i16                   dst, src0, src1
-    v_cmp_ne_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ne_i32                   dst, src0, src1
-    v_cmp_ne_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ne_i64                   dst, src0, src1
-    v_cmp_ne_u16                   dst, src0, src1
-    v_cmp_ne_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ne_u32                   dst, src0, src1
-    v_cmp_ne_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ne_u64                   dst, src0, src1
-    v_cmp_neq_f16                  dst, src0, src1
-    v_cmp_neq_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_neq_f32                  dst, src0, src1
-    v_cmp_neq_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_neq_f64                  dst, src0, src1
-    v_cmp_nge_f16                  dst, src0, src1
-    v_cmp_nge_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nge_f32                  dst, src0, src1
-    v_cmp_nge_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nge_f64                  dst, src0, src1
-    v_cmp_ngt_f16                  dst, src0, src1
-    v_cmp_ngt_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ngt_f32                  dst, src0, src1
-    v_cmp_ngt_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ngt_f64                  dst, src0, src1
-    v_cmp_nle_f16                  dst, src0, src1
-    v_cmp_nle_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nle_f32                  dst, src0, src1
-    v_cmp_nle_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nle_f64                  dst, src0, src1
-    v_cmp_nlg_f16                  dst, src0, src1
-    v_cmp_nlg_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nlg_f32                  dst, src0, src1
-    v_cmp_nlg_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nlg_f64                  dst, src0, src1
-    v_cmp_nlt_f16                  dst, src0, src1
-    v_cmp_nlt_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nlt_f32                  dst, src0, src1
-    v_cmp_nlt_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nlt_f64                  dst, src0, src1
-    v_cmp_o_f16                    dst, src0, src1
-    v_cmp_o_f16_sdwa               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_o_f32                    dst, src0, src1
-    v_cmp_o_f32_sdwa               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_o_f64                    dst, src0, src1
-    v_cmp_t_i16                    dst, src0, src1
-    v_cmp_t_i16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_t_i32                    dst, src0, src1
-    v_cmp_t_i32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_t_i64                    dst, src0, src1
-    v_cmp_t_u16                    dst, src0, src1
-    v_cmp_t_u16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_t_u32                    dst, src0, src1
-    v_cmp_t_u32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_t_u64                    dst, src0, src1
-    v_cmp_tru_f16                  dst, src0, src1
-    v_cmp_tru_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_tru_f32                  dst, src0, src1
-    v_cmp_tru_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_tru_f64                  dst, src0, src1
-    v_cmp_u_f16                    dst, src0, src1
-    v_cmp_u_f16_sdwa               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_u_f32                    dst, src0, src1
-    v_cmp_u_f32_sdwa               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_u_f64                    dst, src0, src1
-    v_cmpx_class_f16               dst, src0, src1
-    v_cmpx_class_f16_sdwa          dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_class_f32               dst, src0, src1
-    v_cmpx_class_f32_sdwa          dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_class_f64               dst, src0, src1
-    v_cmpx_eq_f16                  dst, src0, src1
-    v_cmpx_eq_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_f32                  dst, src0, src1
-    v_cmpx_eq_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_f64                  dst, src0, src1
-    v_cmpx_eq_i16                  dst, src0, src1
-    v_cmpx_eq_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_i32                  dst, src0, src1
-    v_cmpx_eq_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_i64                  dst, src0, src1
-    v_cmpx_eq_u16                  dst, src0, src1
-    v_cmpx_eq_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_u32                  dst, src0, src1
-    v_cmpx_eq_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_u64                  dst, src0, src1
-    v_cmpx_f_f16                   dst, src0, src1
-    v_cmpx_f_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_f32                   dst, src0, src1
-    v_cmpx_f_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_f64                   dst, src0, src1
-    v_cmpx_f_i16                   dst, src0, src1
-    v_cmpx_f_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_i32                   dst, src0, src1
-    v_cmpx_f_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_i64                   dst, src0, src1
-    v_cmpx_f_u16                   dst, src0, src1
-    v_cmpx_f_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_u32                   dst, src0, src1
-    v_cmpx_f_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_u64                   dst, src0, src1
-    v_cmpx_ge_f16                  dst, src0, src1
-    v_cmpx_ge_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_f32                  dst, src0, src1
-    v_cmpx_ge_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_f64                  dst, src0, src1
-    v_cmpx_ge_i16                  dst, src0, src1
-    v_cmpx_ge_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_i32                  dst, src0, src1
-    v_cmpx_ge_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_i64                  dst, src0, src1
-    v_cmpx_ge_u16                  dst, src0, src1
-    v_cmpx_ge_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_u32                  dst, src0, src1
-    v_cmpx_ge_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_u64                  dst, src0, src1
-    v_cmpx_gt_f16                  dst, src0, src1
-    v_cmpx_gt_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_f32                  dst, src0, src1
-    v_cmpx_gt_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_f64                  dst, src0, src1
-    v_cmpx_gt_i16                  dst, src0, src1
-    v_cmpx_gt_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_i32                  dst, src0, src1
-    v_cmpx_gt_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_i64                  dst, src0, src1
-    v_cmpx_gt_u16                  dst, src0, src1
-    v_cmpx_gt_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_u32                  dst, src0, src1
-    v_cmpx_gt_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_u64                  dst, src0, src1
-    v_cmpx_le_f16                  dst, src0, src1
-    v_cmpx_le_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_f32                  dst, src0, src1
-    v_cmpx_le_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_f64                  dst, src0, src1
-    v_cmpx_le_i16                  dst, src0, src1
-    v_cmpx_le_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_i32                  dst, src0, src1
-    v_cmpx_le_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_i64                  dst, src0, src1
-    v_cmpx_le_u16                  dst, src0, src1
-    v_cmpx_le_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_u32                  dst, src0, src1
-    v_cmpx_le_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_u64                  dst, src0, src1
-    v_cmpx_lg_f16                  dst, src0, src1
-    v_cmpx_lg_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lg_f32                  dst, src0, src1
-    v_cmpx_lg_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lg_f64                  dst, src0, src1
-    v_cmpx_lt_f16                  dst, src0, src1
-    v_cmpx_lt_f16_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_f32                  dst, src0, src1
-    v_cmpx_lt_f32_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_f64                  dst, src0, src1
-    v_cmpx_lt_i16                  dst, src0, src1
-    v_cmpx_lt_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_i32                  dst, src0, src1
-    v_cmpx_lt_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_i64                  dst, src0, src1
-    v_cmpx_lt_u16                  dst, src0, src1
-    v_cmpx_lt_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_u32                  dst, src0, src1
-    v_cmpx_lt_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_u64                  dst, src0, src1
-    v_cmpx_ne_i16                  dst, src0, src1
-    v_cmpx_ne_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ne_i32                  dst, src0, src1
-    v_cmpx_ne_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ne_i64                  dst, src0, src1
-    v_cmpx_ne_u16                  dst, src0, src1
-    v_cmpx_ne_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ne_u32                  dst, src0, src1
-    v_cmpx_ne_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ne_u64                  dst, src0, src1
-    v_cmpx_neq_f16                 dst, src0, src1
-    v_cmpx_neq_f16_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_neq_f32                 dst, src0, src1
-    v_cmpx_neq_f32_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_neq_f64                 dst, src0, src1
-    v_cmpx_nge_f16                 dst, src0, src1
-    v_cmpx_nge_f16_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nge_f32                 dst, src0, src1
-    v_cmpx_nge_f32_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nge_f64                 dst, src0, src1
-    v_cmpx_ngt_f16                 dst, src0, src1
-    v_cmpx_ngt_f16_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ngt_f32                 dst, src0, src1
-    v_cmpx_ngt_f32_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ngt_f64                 dst, src0, src1
-    v_cmpx_nle_f16                 dst, src0, src1
-    v_cmpx_nle_f16_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nle_f32                 dst, src0, src1
-    v_cmpx_nle_f32_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nle_f64                 dst, src0, src1
-    v_cmpx_nlg_f16                 dst, src0, src1
-    v_cmpx_nlg_f16_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nlg_f32                 dst, src0, src1
-    v_cmpx_nlg_f32_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nlg_f64                 dst, src0, src1
-    v_cmpx_nlt_f16                 dst, src0, src1
-    v_cmpx_nlt_f16_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nlt_f32                 dst, src0, src1
-    v_cmpx_nlt_f32_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nlt_f64                 dst, src0, src1
-    v_cmpx_o_f16                   dst, src0, src1
-    v_cmpx_o_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_o_f32                   dst, src0, src1
-    v_cmpx_o_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_o_f64                   dst, src0, src1
-    v_cmpx_t_i16                   dst, src0, src1
-    v_cmpx_t_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_t_i32                   dst, src0, src1
-    v_cmpx_t_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_t_i64                   dst, src0, src1
-    v_cmpx_t_u16                   dst, src0, src1
-    v_cmpx_t_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_t_u32                   dst, src0, src1
-    v_cmpx_t_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_t_u64                   dst, src0, src1
-    v_cmpx_tru_f16                 dst, src0, src1
-    v_cmpx_tru_f16_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_tru_f32                 dst, src0, src1
-    v_cmpx_tru_f32_sdwa            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_tru_f64                 dst, src0, src1
-    v_cmpx_u_f16                   dst, src0, src1
-    v_cmpx_u_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_u_f32                   dst, src0, src1
-    v_cmpx_u_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_u_f64                   dst, src0, src1
diff --git a/docs/AMDGPUAsmGFX9.rst b/docs/AMDGPUAsmGFX9.rst
deleted file mode 100644
index 97c13f2476bd4dec4776e3979b2a2d4f4b3ccc14..0000000000000000000000000000000000000000
--- a/docs/AMDGPUAsmGFX9.rst
+++ /dev/null
@@ -1,1906 +0,0 @@
-..
-    **************************************************
-    *                                                *
-    *   Automatically generated file, do not edit!   *
-    *                                                *
-    **************************************************
-
-===========================
-Syntax of GFX9 Instructions
-===========================
-
-.. contents::
-  :local:
-
-
-DS
-===========================
-
-.. parsed-literal::
-
-    ds_add_f32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_rtn_f32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_src2_f32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_add_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_b32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_b64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_rtn_b32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_rtn_b64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_src2_b32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_and_src2_b64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_append                      dst                            :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_bpermute_b32                dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>`
-    ds_cmpst_b32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_b64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_f32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_f64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_b32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_b64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_f32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_cmpst_rtn_f64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_condxchg32_rtn_b64          dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_consume                     dst                            :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_dec_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_barrier                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_init                    src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_br                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_p                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_release_all                                       :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_gws_sema_v                                                 :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_inc_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_f32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_f64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_i32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_i64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_f32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_f64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_i32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_i64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_f32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_f64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_i32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_i64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_max_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_f32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_f64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_i32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_i64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_f32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_f64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_i32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_i64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_f32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_f64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_i32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_i64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_min_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_b32                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_b64                   src0, src1, src2               :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_rtn_b32               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_mskor_rtn_b64               dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_nop
-    ds_or_b32                      src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_b64                      src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_rtn_b32                  dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_rtn_b64                  dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_src2_b32                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_or_src2_b64                 src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_ordered_count               dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_permute_b32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>`
-    ds_read2_b32                   dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2_b64                   dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2st64_b32               dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read2st64_b64               dst, src0                      :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b128                   dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b32                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b64                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_b96                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_i16                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_i8                     dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_i8_d16                 dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_i8_d16_hi              dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u16                    dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u16_d16                dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u16_d16_hi             dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u8                     dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u8_d16                 dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_read_u8_d16_hi              dst, src0                      :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_rtn_u32                dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_rtn_u64                dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_src2_u32               src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_src2_u64               src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_u32                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_rsub_u64                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_rtn_u32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_rtn_u64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_src2_u32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_src2_u64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_u32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_sub_u64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_swizzle_b32                 dst, src0                      :ref:`sw_offset16<amdgpu_synid_sw_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrap_rtn_b32                dst, src0, src1, src2          :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2_b32                  src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2_b64                  src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2st64_b32              src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write2st64_b64              src0, src1, src2               :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b128                  src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b16                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b16_d16_hi            src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b32                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b64                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b8                    src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b8_d16_hi             src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_b96                   src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_src2_b32              src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_write_src2_b64              src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2_rtn_b32             dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2_rtn_b64             dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2st64_rtn_b32         dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg2st64_rtn_b64         dst, src0, src1, src2          :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`ds_offset8<amdgpu_synid_ds_offset8>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg_rtn_b32              dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_wrxchg_rtn_b64              dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_b32                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_b64                     src0, src1                     :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_rtn_b32                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_rtn_b64                 dst, src0, src1                :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_src2_b32                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-    ds_xor_src2_b64                src0                           :ref:`ds_offset16<amdgpu_synid_ds_offset16>` :ref:`gds<amdgpu_synid_gds>`
-
-EXP
-===========================
-
-.. parsed-literal::
-
-    exp                            dst, src0, src1, src2, src3    :ref:`done<amdgpu_synid_done>` :ref:`compr<amdgpu_synid_compr>` :ref:`vm<amdgpu_synid_vm>`
-
-FLAT
-===========================
-
-.. parsed-literal::
-
-    flat_atomic_add                dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_add_x2             dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_and                dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_and_x2             dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_cmpswap            dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_cmpswap_x2         dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_dec                dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_dec_x2             dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_inc                dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_inc_x2             dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_or                 dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_or_x2              dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smax               dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smax_x2            dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smin               dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_smin_x2            dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_sub                dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_sub_x2             dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_swap               dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_swap_x2            dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umax               dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umax_x2            dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umin               dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_umin_x2            dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_xor                dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_atomic_xor_x2             dst, src0, src1                :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dword                dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx2              dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx3              dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_dwordx4              dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_sbyte                dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_sbyte_d16            dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_sbyte_d16_hi         dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_short_d16            dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_short_d16_hi         dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_sshort               dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_ubyte                dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_ubyte_d16            dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_ubyte_d16_hi         dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_load_ushort               dst, src0                      :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_byte                src0, src1                     :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_byte_d16_hi         src0, src1                     :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dword               src0, src1                     :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx2             src0, src1                     :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx3             src0, src1                     :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_dwordx4             src0, src1                     :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_short               src0, src1                     :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    flat_store_short_d16_hi        src0, src1                     :ref:`flat_offset12<amdgpu_synid_flat_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    global_atomic_add              dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_add_x2           dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_and              dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_and_x2           dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_cmpswap          dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_cmpswap_x2       dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_dec              dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_dec_x2           dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_inc              dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_inc_x2           dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_or               dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_or_x2            dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_smax             dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_smax_x2          dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_smin             dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_smin_x2          dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_sub              dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_sub_x2           dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_swap             dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_swap_x2          dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_umax             dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_umax_x2          dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_umin             dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_umin_x2          dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_xor              dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_atomic_xor_x2           dst, src0, src1, src2          :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_dword              dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_dwordx2            dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_dwordx3            dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_dwordx4            dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_sbyte              dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_sbyte_d16          dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_sbyte_d16_hi       dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_short_d16          dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_short_d16_hi       dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_sshort             dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_ubyte              dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_ubyte_d16          dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_ubyte_d16_hi       dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_load_ushort             dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_store_byte              src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_store_byte_d16_hi       src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_store_dword             src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_store_dwordx2           src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_store_dwordx3           src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_store_dwordx4           src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_store_short             src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    global_store_short_d16_hi      src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>`
-    scratch_load_dword             dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_dwordx2           dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_dwordx3           dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_dwordx4           dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_sbyte             dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_sbyte_d16         dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_sbyte_d16_hi      dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_short_d16         dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_short_d16_hi      dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_sshort            dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_ubyte             dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_ubyte_d16         dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_ubyte_d16_hi      dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_load_ushort            dst, src0, src1                :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_store_byte             src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_store_byte_d16_hi      src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_store_dword            src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_store_dwordx2          src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_store_dwordx3          src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_store_dwordx4          src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_store_short            src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    scratch_store_short_d16_hi     src0, src1, src2               :ref:`flat_offset13<amdgpu_synid_flat_offset13>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-
-MIMG
-===========================
-
-.. parsed-literal::
-
-    image_atomic_add               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_and               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_cmpswap           dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_dec               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_inc               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_or                dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_smax              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_smin              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_sub               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_swap              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_umax              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_umin              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_atomic_xor               dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_gather4                  dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_b                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_c_lz             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_cl               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_l                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_lz               dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_lz_o             dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_gather4_o                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_get_lod                  dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_get_resinfo              dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load                     dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_load_mip                 dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_load_mip_pck             dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_mip_pck_sgn         dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_pck                 dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_load_pck_sgn             dst, src0, src1                :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_sample                   dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_b                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_c                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_c_lz              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_cl                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_l                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_lz                dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_lz_o              dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_sample_o                 dst, src0, src1, src2          :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_store                    src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_store_mip                src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>` :ref:`d16<amdgpu_synid_d16>`
-    image_store_mip_pck            src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-    image_store_pck                src0, src1, src2               :ref:`dmask<amdgpu_synid_dmask>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`da<amdgpu_synid_da>`
-
-MUBUF
-===========================
-
-.. parsed-literal::
-
-    buffer_atomic_add              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_add_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_and              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_and_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_cmpswap          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_cmpswap_x2       dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_dec              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_dec_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_inc              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_inc_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_or               dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_or_x2            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smax             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smax_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smin             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_smin_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_sub              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_sub_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_swap             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_swap_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umax             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umax_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umin             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_umin_x2          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_xor              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_atomic_xor_x2           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dword              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_dwordx2            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dwordx3            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_dwordx4            dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_hi_x    dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_x       dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_xy      dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_xyz     dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_d16_xyzw    dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_x           dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_format_xy          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_xyz         dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_format_xyzw        dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_sbyte              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_sbyte_d16          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_sbyte_d16_hi       dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_short_d16          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_short_d16_hi       dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_sshort             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_ubyte              dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_load_ubyte_d16          dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_ubyte_d16_hi       dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_load_ushort             dst, src0, src1, src2          :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_store_byte              src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_byte_d16_hi       src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dword             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx2           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx3           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_dwordx4           src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_hi_x   src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_x      src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_xy     src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_xyz    src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_d16_xyzw   src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_x          src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xy         src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xyz        src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_format_xyzw       src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_lds_dword         src0, src1                     :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`lds<amdgpu_synid_lds>`
-    buffer_store_short             src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_store_short_d16_hi      src0, src1, src2, src3         :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`buf_offset12<amdgpu_synid_buf_offset12>` :ref:`glc<amdgpu_synid_glc>` :ref:`slc<amdgpu_synid_slc>`
-    buffer_wbinvl1
-    buffer_wbinvl1_vol
-
-SMEM
-===========================
-
-.. parsed-literal::
-
-    s_atc_probe                    src0, src1, src2
-    s_atc_probe_buffer             src0, src1, src2
-    s_atomic_add                   dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_add_x2                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_and                   dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_and_x2                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_cmpswap               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_cmpswap_x2            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_dec                   dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_dec_x2                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_inc                   dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_inc_x2                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_or                    dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_or_x2                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_smax                  dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_smax_x2               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_smin                  dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_smin_x2               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_sub                   dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_sub_x2                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_swap                  dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_swap_x2               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_umax                  dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_umax_x2               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_umin                  dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_umin_x2               dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_xor                   dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_atomic_xor_x2                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_add            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_add_x2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_and            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_and_x2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_cmpswap        dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_cmpswap_x2     dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_dec            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_dec_x2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_inc            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_inc_x2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_or             dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_or_x2          dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_smax           dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_smax_x2        dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_smin           dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_smin_x2        dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_sub            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_sub_x2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_swap           dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_swap_x2        dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_umax           dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_umax_x2        dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_umin           dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_umin_x2        dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_xor            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_atomic_xor_x2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dword            dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dwordx16         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dwordx2          dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dwordx4          dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_load_dwordx8          dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_store_dword           src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_store_dwordx2         src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_buffer_store_dwordx4         src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_dcache_discard               src0, src1
-    s_dcache_discard_x2            src0, src1
-    s_dcache_inv
-    s_dcache_inv_vol
-    s_dcache_wb
-    s_dcache_wb_vol
-    s_load_dword                   dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_load_dwordx16                dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_load_dwordx2                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_load_dwordx4                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_load_dwordx8                 dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_memrealtime                  dst
-    s_memtime                      dst
-    s_scratch_load_dword           dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_scratch_load_dwordx2         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_scratch_load_dwordx4         dst, src0, src1                :ref:`glc<amdgpu_synid_glc>`
-    s_scratch_store_dword          src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_scratch_store_dwordx2        src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_scratch_store_dwordx4        src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_store_dword                  src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_store_dwordx2                src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-    s_store_dwordx4                src0, src1, src2               :ref:`glc<amdgpu_synid_glc>`
-
-SOP1
-===========================
-
-.. parsed-literal::
-
-    s_abs_i32                      dst, src0
-    s_and_saveexec_b64             dst, src0
-    s_andn1_saveexec_b64           dst, src0
-    s_andn1_wrexec_b64             dst, src0
-    s_andn2_saveexec_b64           dst, src0
-    s_andn2_wrexec_b64             dst, src0
-    s_bcnt0_i32_b32                dst, src0
-    s_bcnt0_i32_b64                dst, src0
-    s_bcnt1_i32_b32                dst, src0
-    s_bcnt1_i32_b64                dst, src0
-    s_bitreplicate_b64_b32         dst, src0
-    s_bitset0_b32                  dst, src0
-    s_bitset0_b64                  dst, src0
-    s_bitset1_b32                  dst, src0
-    s_bitset1_b64                  dst, src0
-    s_brev_b32                     dst, src0
-    s_brev_b64                     dst, src0
-    s_cbranch_join                 src0
-    s_cmov_b32                     dst, src0
-    s_cmov_b64                     dst, src0
-    s_ff0_i32_b32                  dst, src0
-    s_ff0_i32_b64                  dst, src0
-    s_ff1_i32_b32                  dst, src0
-    s_ff1_i32_b64                  dst, src0
-    s_flbit_i32                    dst, src0
-    s_flbit_i32_b32                dst, src0
-    s_flbit_i32_b64                dst, src0
-    s_flbit_i32_i64                dst, src0
-    s_getpc_b64                    dst
-    s_mov_b32                      dst, src0
-    s_mov_b64                      dst, src0
-    s_mov_fed_b32                  dst, src0
-    s_movreld_b32                  dst, src0
-    s_movreld_b64                  dst, src0
-    s_movrels_b32                  dst, src0
-    s_movrels_b64                  dst, src0
-    s_nand_saveexec_b64            dst, src0
-    s_nor_saveexec_b64             dst, src0
-    s_not_b32                      dst, src0
-    s_not_b64                      dst, src0
-    s_or_saveexec_b64              dst, src0
-    s_orn1_saveexec_b64            dst, src0
-    s_orn2_saveexec_b64            dst, src0
-    s_quadmask_b32                 dst, src0
-    s_quadmask_b64                 dst, src0
-    s_rfe_b64                      src0
-    s_set_gpr_idx_idx              src0
-    s_setpc_b64                    src0
-    s_sext_i32_i16                 dst, src0
-    s_sext_i32_i8                  dst, src0
-    s_swappc_b64                   dst, src0
-    s_wqm_b32                      dst, src0
-    s_wqm_b64                      dst, src0
-    s_xnor_saveexec_b64            dst, src0
-    s_xor_saveexec_b64             dst, src0
-
-SOP2
-===========================
-
-.. parsed-literal::
-
-    s_absdiff_i32                  dst, src0, src1
-    s_add_i32                      dst, src0, src1
-    s_add_u32                      dst, src0, src1
-    s_addc_u32                     dst, src0, src1
-    s_and_b32                      dst, src0, src1
-    s_and_b64                      dst, src0, src1
-    s_andn2_b32                    dst, src0, src1
-    s_andn2_b64                    dst, src0, src1
-    s_ashr_i32                     dst, src0, src1
-    s_ashr_i64                     dst, src0, src1
-    s_bfe_i32                      dst, src0, src1
-    s_bfe_i64                      dst, src0, src1
-    s_bfe_u32                      dst, src0, src1
-    s_bfe_u64                      dst, src0, src1
-    s_bfm_b32                      dst, src0, src1
-    s_bfm_b64                      dst, src0, src1
-    s_cbranch_g_fork               src0, src1
-    s_cselect_b32                  dst, src0, src1
-    s_cselect_b64                  dst, src0, src1
-    s_lshl1_add_u32                dst, src0, src1
-    s_lshl2_add_u32                dst, src0, src1
-    s_lshl3_add_u32                dst, src0, src1
-    s_lshl4_add_u32                dst, src0, src1
-    s_lshl_b32                     dst, src0, src1
-    s_lshl_b64                     dst, src0, src1
-    s_lshr_b32                     dst, src0, src1
-    s_lshr_b64                     dst, src0, src1
-    s_max_i32                      dst, src0, src1
-    s_max_u32                      dst, src0, src1
-    s_min_i32                      dst, src0, src1
-    s_min_u32                      dst, src0, src1
-    s_mul_hi_i32                   dst, src0, src1
-    s_mul_hi_u32                   dst, src0, src1
-    s_mul_i32                      dst, src0, src1
-    s_nand_b32                     dst, src0, src1
-    s_nand_b64                     dst, src0, src1
-    s_nor_b32                      dst, src0, src1
-    s_nor_b64                      dst, src0, src1
-    s_or_b32                       dst, src0, src1
-    s_or_b64                       dst, src0, src1
-    s_orn2_b32                     dst, src0, src1
-    s_orn2_b64                     dst, src0, src1
-    s_pack_hh_b32_b16              dst, src0, src1
-    s_pack_lh_b32_b16              dst, src0, src1
-    s_pack_ll_b32_b16              dst, src0, src1
-    s_rfe_restore_b64              src0, src1
-    s_sub_i32                      dst, src0, src1
-    s_sub_u32                      dst, src0, src1
-    s_subb_u32                     dst, src0, src1
-    s_xnor_b32                     dst, src0, src1
-    s_xnor_b64                     dst, src0, src1
-    s_xor_b32                      dst, src0, src1
-    s_xor_b64                      dst, src0, src1
-
-SOPC
-===========================
-
-.. parsed-literal::
-
-    s_bitcmp0_b32                  src0, src1
-    s_bitcmp0_b64                  src0, src1
-    s_bitcmp1_b32                  src0, src1
-    s_bitcmp1_b64                  src0, src1
-    s_cmp_eq_i32                   src0, src1
-    s_cmp_eq_u32                   src0, src1
-    s_cmp_eq_u64                   src0, src1
-    s_cmp_ge_i32                   src0, src1
-    s_cmp_ge_u32                   src0, src1
-    s_cmp_gt_i32                   src0, src1
-    s_cmp_gt_u32                   src0, src1
-    s_cmp_le_i32                   src0, src1
-    s_cmp_le_u32                   src0, src1
-    s_cmp_lg_i32                   src0, src1
-    s_cmp_lg_u32                   src0, src1
-    s_cmp_lg_u64                   src0, src1
-    s_cmp_lt_i32                   src0, src1
-    s_cmp_lt_u32                   src0, src1
-    s_set_gpr_idx_on               src0, src1
-    s_setvskip                     src0, src1
-
-SOPK
-===========================
-
-.. parsed-literal::
-
-    s_addk_i32                     dst, src0
-    s_call_b64                     dst, src0
-    s_cbranch_i_fork               src0, src1
-    s_cmovk_i32                    dst, src0
-    s_cmpk_eq_i32                  src0, src1
-    s_cmpk_eq_u32                  src0, src1
-    s_cmpk_ge_i32                  src0, src1
-    s_cmpk_ge_u32                  src0, src1
-    s_cmpk_gt_i32                  src0, src1
-    s_cmpk_gt_u32                  src0, src1
-    s_cmpk_le_i32                  src0, src1
-    s_cmpk_le_u32                  src0, src1
-    s_cmpk_lg_i32                  src0, src1
-    s_cmpk_lg_u32                  src0, src1
-    s_cmpk_lt_i32                  src0, src1
-    s_cmpk_lt_u32                  src0, src1
-    s_getreg_b32                   dst, src0
-    s_movk_i32                     dst, src0
-    s_mulk_i32                     dst, src0
-    s_setreg_b32                   dst, src0
-    s_setreg_imm32_b32             dst, src0
-
-SOPP
-===========================
-
-.. parsed-literal::
-
-    s_barrier
-    s_branch                       src0
-    s_cbranch_cdbgsys              src0
-    s_cbranch_cdbgsys_and_user     src0
-    s_cbranch_cdbgsys_or_user      src0
-    s_cbranch_cdbguser             src0
-    s_cbranch_execnz               src0
-    s_cbranch_execz                src0
-    s_cbranch_scc0                 src0
-    s_cbranch_scc1                 src0
-    s_cbranch_vccnz                src0
-    s_cbranch_vccz                 src0
-    s_decperflevel                 src0
-    s_endpgm
-    s_endpgm_ordered_ps_done
-    s_endpgm_saved
-    s_icache_inv
-    s_incperflevel                 src0
-    s_nop                          src0
-    s_sendmsg                      src0
-    s_sendmsghalt                  src0
-    s_set_gpr_idx_mode             src0
-    s_set_gpr_idx_off
-    s_sethalt                      src0
-    s_setkill                      src0
-    s_setprio                      src0
-    s_sleep                        src0
-    s_trap                         src0
-    s_ttracedata
-    s_waitcnt                      src0
-    s_wakeup
-
-VINTRP
-===========================
-
-.. parsed-literal::
-
-    v_interp_mov_f32               dst, src0, src1
-    v_interp_p1_f32                dst, src0, src1
-    v_interp_p2_f32                dst, src0, src1
-
-VOP1
-===========================
-
-.. parsed-literal::
-
-    v_bfrev_b32                    dst, src0
-    v_bfrev_b32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_bfrev_b32_sdwa               dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ceil_f16                     dst, src0
-    v_ceil_f16_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ceil_f16_sdwa                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ceil_f32                     dst, src0
-    v_ceil_f32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ceil_f32_sdwa                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ceil_f64                     dst, src0
-    v_clrexcp
-    v_cos_f16                      dst, src0
-    v_cos_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cos_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cos_f32                      dst, src0
-    v_cos_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cos_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f16_f32                  dst, src0
-    v_cvt_f16_f32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f16_f32_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f16_i16                  dst, src0
-    v_cvt_f16_i16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f16_i16_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f16_u16                  dst, src0
-    v_cvt_f16_u16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f16_u16_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_f16                  dst, src0
-    v_cvt_f32_f16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_f16_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_f64                  dst, src0
-    v_cvt_f32_i32                  dst, src0
-    v_cvt_f32_i32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_i32_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_u32                  dst, src0
-    v_cvt_f32_u32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_u32_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_ubyte0               dst, src0
-    v_cvt_f32_ubyte0_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_ubyte0_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_ubyte1               dst, src0
-    v_cvt_f32_ubyte1_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_ubyte1_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_ubyte2               dst, src0
-    v_cvt_f32_ubyte2_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_ubyte2_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f32_ubyte3               dst, src0
-    v_cvt_f32_ubyte3_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_f32_ubyte3_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_f64_f32                  dst, src0
-    v_cvt_f64_i32                  dst, src0
-    v_cvt_f64_u32                  dst, src0
-    v_cvt_flr_i32_f32              dst, src0
-    v_cvt_flr_i32_f32_dpp          dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_flr_i32_f32_sdwa         dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_i16_f16                  dst, src0
-    v_cvt_i16_f16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_i16_f16_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_i32_f32                  dst, src0
-    v_cvt_i32_f32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_i32_f32_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_i32_f64                  dst, src0
-    v_cvt_norm_i16_f16             dst, src0
-    v_cvt_norm_i16_f16_dpp         dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_norm_i16_f16_sdwa        dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_norm_u16_f16             dst, src0
-    v_cvt_norm_u16_f16_dpp         dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_norm_u16_f16_sdwa        dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_off_f32_i4               dst, src0
-    v_cvt_off_f32_i4_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_off_f32_i4_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_rpi_i32_f32              dst, src0
-    v_cvt_rpi_i32_f32_dpp          dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_rpi_i32_f32_sdwa         dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_u16_f16                  dst, src0
-    v_cvt_u16_f16_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_u16_f16_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_u32_f32                  dst, src0
-    v_cvt_u32_f32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cvt_u32_f32_sdwa             dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_cvt_u32_f64                  dst, src0
-    v_exp_f16                      dst, src0
-    v_exp_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_exp_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_exp_f32                      dst, src0
-    v_exp_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_exp_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_exp_legacy_f32               dst, src0
-    v_exp_legacy_f32_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_exp_legacy_f32_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ffbh_i32                     dst, src0
-    v_ffbh_i32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ffbh_i32_sdwa                dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ffbh_u32                     dst, src0
-    v_ffbh_u32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ffbh_u32_sdwa                dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_ffbl_b32                     dst, src0
-    v_ffbl_b32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ffbl_b32_sdwa                dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_floor_f16                    dst, src0
-    v_floor_f16_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_floor_f16_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_floor_f32                    dst, src0
-    v_floor_f32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_floor_f32_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_floor_f64                    dst, src0
-    v_fract_f16                    dst, src0
-    v_fract_f16_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_fract_f16_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_fract_f32                    dst, src0
-    v_fract_f32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_fract_f32_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_fract_f64                    dst, src0
-    v_frexp_exp_i16_f16            dst, src0
-    v_frexp_exp_i16_f16_dpp        dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_frexp_exp_i16_f16_sdwa       dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_frexp_exp_i32_f32            dst, src0
-    v_frexp_exp_i32_f32_dpp        dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_frexp_exp_i32_f32_sdwa       dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_frexp_exp_i32_f64            dst, src0
-    v_frexp_mant_f16               dst, src0
-    v_frexp_mant_f16_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_frexp_mant_f16_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_frexp_mant_f32               dst, src0
-    v_frexp_mant_f32_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_frexp_mant_f32_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_frexp_mant_f64               dst, src0
-    v_log_f16                      dst, src0
-    v_log_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_log_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_log_f32                      dst, src0
-    v_log_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_log_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_log_legacy_f32               dst, src0
-    v_log_legacy_f32_dpp           dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_log_legacy_f32_sdwa          dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_mov_b32                      dst, src0
-    v_mov_b32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mov_b32_sdwa                 dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_mov_fed_b32                  dst, src0
-    v_mov_fed_b32_dpp              dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mov_fed_b32_sdwa             dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_nop
-    v_not_b32                      dst, src0
-    v_not_b32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_not_b32_sdwa                 dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rcp_f16                      dst, src0
-    v_rcp_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rcp_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rcp_f32                      dst, src0
-    v_rcp_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rcp_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rcp_f64                      dst, src0
-    v_rcp_iflag_f32                dst, src0
-    v_rcp_iflag_f32_dpp            dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rcp_iflag_f32_sdwa           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_readfirstlane_b32            dst, src0
-    v_rndne_f16                    dst, src0
-    v_rndne_f16_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rndne_f16_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rndne_f32                    dst, src0
-    v_rndne_f32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rndne_f32_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rndne_f64                    dst, src0
-    v_rsq_f16                      dst, src0
-    v_rsq_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rsq_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rsq_f32                      dst, src0
-    v_rsq_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_rsq_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_rsq_f64                      dst, src0
-    v_sat_pk_u8_i16                dst, src0
-    v_sat_pk_u8_i16_dpp            dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sat_pk_u8_i16_sdwa           dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_screen_partition_4se_b32     dst, src0
-    v_screen_partition_4se_b32_dpp dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_screen_partition_4se_b32_sdwa dst, src0                      :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sin_f16                      dst, src0
-    v_sin_f16_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sin_f16_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sin_f32                      dst, src0
-    v_sin_f32_dpp                  dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sin_f32_sdwa                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sqrt_f16                     dst, src0
-    v_sqrt_f16_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sqrt_f16_sdwa                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sqrt_f32                     dst, src0
-    v_sqrt_f32_dpp                 dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sqrt_f32_sdwa                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_sqrt_f64                     dst, src0
-    v_swap_b32                     dst, src0
-    v_trunc_f16                    dst, src0
-    v_trunc_f16_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_trunc_f16_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_trunc_f32                    dst, src0
-    v_trunc_f32_dpp                dst, src0                      :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_trunc_f32_sdwa               dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>`
-    v_trunc_f64                    dst, src0
-
-VOP2
-===========================
-
-.. parsed-literal::
-
-    v_add_co_u32                   dst0, dst1, src0, src1
-    v_add_co_u32_dpp               dst0, dst1, src0, src1         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_co_u32_sdwa              dst0, dst1, src0, src1         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_add_f16                      dst, src0, src1
-    v_add_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_add_f32                      dst, src0, src1
-    v_add_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_add_u16                      dst, src0, src1
-    v_add_u16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_u16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_add_u32                      dst, src0, src1
-    v_add_u32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_add_u32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_addc_co_u32                  dst0, dst1, src0, src1, src2
-    v_addc_co_u32_dpp              dst0, dst1, src0, src1, src2   :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_addc_co_u32_sdwa             dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_and_b32                      dst, src0, src1
-    v_and_b32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_and_b32_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_ashrrev_i16                  dst, src0, src1
-    v_ashrrev_i16_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ashrrev_i16_sdwa             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_ashrrev_i32                  dst, src0, src1
-    v_ashrrev_i32_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ashrrev_i32_sdwa             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cndmask_b32                  dst, src0, src1, src2
-    v_cndmask_b32_dpp              dst, src0, src1, src2          :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_cndmask_b32_sdwa             dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_ldexp_f16                    dst, src0, src1
-    v_ldexp_f16_dpp                dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_ldexp_f16_sdwa               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_lshlrev_b16                  dst, src0, src1
-    v_lshlrev_b16_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_lshlrev_b16_sdwa             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_lshlrev_b32                  dst, src0, src1
-    v_lshlrev_b32_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_lshlrev_b32_sdwa             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_lshrrev_b16                  dst, src0, src1
-    v_lshrrev_b16_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_lshrrev_b16_sdwa             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_lshrrev_b32                  dst, src0, src1
-    v_lshrrev_b32_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_lshrrev_b32_sdwa             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mac_f16                      dst, src0, src1
-    v_mac_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mac_f32                      dst, src0, src1
-    v_mac_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_madak_f16                    dst, src0, src1, src2
-    v_madak_f32                    dst, src0, src1, src2
-    v_madmk_f16                    dst, src0, src1, src2
-    v_madmk_f32                    dst, src0, src1, src2
-    v_max_f16                      dst, src0, src1
-    v_max_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_f32                      dst, src0, src1
-    v_max_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_i16                      dst, src0, src1
-    v_max_i16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_i16_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_i32                      dst, src0, src1
-    v_max_i32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_i32_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_u16                      dst, src0, src1
-    v_max_u16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_u16_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_max_u32                      dst, src0, src1
-    v_max_u32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_max_u32_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_f16                      dst, src0, src1
-    v_min_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_f32                      dst, src0, src1
-    v_min_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_i16                      dst, src0, src1
-    v_min_i16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_i16_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_i32                      dst, src0, src1
-    v_min_i32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_i32_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_u16                      dst, src0, src1
-    v_min_u16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_u16_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_min_u32                      dst, src0, src1
-    v_min_u32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_min_u32_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_f16                      dst, src0, src1
-    v_mul_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_f32                      dst, src0, src1
-    v_mul_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_hi_i32_i24               dst, src0, src1
-    v_mul_hi_i32_i24_dpp           dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_hi_i32_i24_sdwa          dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_hi_u32_u24               dst, src0, src1
-    v_mul_hi_u32_u24_dpp           dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_hi_u32_u24_sdwa          dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_i32_i24                  dst, src0, src1
-    v_mul_i32_i24_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_i32_i24_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_legacy_f32               dst, src0, src1
-    v_mul_legacy_f32_dpp           dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_legacy_f32_sdwa          dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_lo_u16                   dst, src0, src1
-    v_mul_lo_u16_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_lo_u16_sdwa              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_mul_u32_u24                  dst, src0, src1
-    v_mul_u32_u24_dpp              dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_mul_u32_u24_sdwa             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_or_b32                       dst, src0, src1
-    v_or_b32_dpp                   dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_or_b32_sdwa                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_co_u32                   dst0, dst1, src0, src1
-    v_sub_co_u32_dpp               dst0, dst1, src0, src1         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_co_u32_sdwa              dst0, dst1, src0, src1         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_f16                      dst, src0, src1
-    v_sub_f16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_f16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_f32                      dst, src0, src1
-    v_sub_f32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_f32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_u16                      dst, src0, src1
-    v_sub_u16_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_u16_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_sub_u32                      dst, src0, src1
-    v_sub_u32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_sub_u32_sdwa                 dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subb_co_u32                  dst0, dst1, src0, src1, src2
-    v_subb_co_u32_dpp              dst0, dst1, src0, src1, src2   :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subb_co_u32_sdwa             dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subbrev_co_u32               dst0, dst1, src0, src1, src2
-    v_subbrev_co_u32_dpp           dst0, dst1, src0, src1, src2   :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subbrev_co_u32_sdwa          dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_co_u32                dst0, dst1, src0, src1
-    v_subrev_co_u32_dpp            dst0, dst1, src0, src1         :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_co_u32_sdwa           dst0, dst1, src0, src1         :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_f16                   dst, src0, src1
-    v_subrev_f16_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_f16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_f32                   dst, src0, src1
-    v_subrev_f32_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_f32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_u16                   dst, src0, src1
-    v_subrev_u16_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_u16_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_subrev_u32                   dst, src0, src1
-    v_subrev_u32_dpp               dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_subrev_u32_sdwa              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_xor_b32                      dst, src0, src1
-    v_xor_b32_dpp                  dst, src0, src1                :ref:`dpp_ctrl<amdgpu_synid_dpp_ctrl>` :ref:`row_mask<amdgpu_synid_row_mask>` :ref:`bank_mask<amdgpu_synid_bank_mask>` :ref:`bound_ctrl<amdgpu_synid_bound_ctrl>`
-    v_xor_b32_sdwa                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>` :ref:`dst_sel<amdgpu_synid_dst_sel>` :ref:`dst_unused<amdgpu_synid_dst_unused>` :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-
-VOP3
-===========================
-
-.. parsed-literal::
-
-    v_add3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_add_co_u32_e64               dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_add_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_i16                      dst, src0, src1                :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_add_i32                      dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_add_lshl_u32                 dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_add_u16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_add_u32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_addc_co_u32_e64              dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_alignbit_b32                 dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_alignbyte_b32                dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_and_b32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_and_or_b32                   dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_ashrrev_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ashrrev_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_ashrrev_i64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bcnt_u32_b32                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bfe_i32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfe_u32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfi_b32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_bfm_b32                      dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_bfrev_b32_e64                dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ceil_f16_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ceil_f32_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ceil_f64_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_clrexcp_e64                                                 :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_class_f16_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_class_f32_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_class_f64_e64            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_eq_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_f16_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_f32_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_f64_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_i16_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_i32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_i64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_u16_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_u32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_f_u64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ge_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_gt_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_le_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lg_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lg_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lg_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_lt_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ne_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_neq_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_neq_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_neq_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nge_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nge_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nge_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ngt_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ngt_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_ngt_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nle_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nle_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nle_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlg_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlg_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlg_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlt_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlt_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_nlt_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_o_f16_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_o_f32_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_o_f64_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_i16_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_i32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_i64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_u16_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_u32_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_t_u64_e64                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_tru_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_tru_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_tru_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_u_f16_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_u_f32_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmp_u_f64_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_class_f16_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_class_f32_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_class_f64_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_eq_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_f_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ge_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_gt_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_le_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lg_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lg_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lg_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_f16_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_f32_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_f64_e64              dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_lt_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_i16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_i32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_i64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_u16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_u32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ne_u64_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_neq_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_neq_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_neq_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nge_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nge_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nge_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ngt_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ngt_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_ngt_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nle_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nle_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nle_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlg_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlg_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlg_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlt_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlt_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_nlt_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_o_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_o_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_o_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_i16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_i32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_i64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_t_u64_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_tru_f16_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_tru_f32_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_tru_f64_e64             dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_u_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_u_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cmpx_u_f64_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cndmask_b32_e64              dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_cos_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cos_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubeid_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubema_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubesc_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cubetc_f32                   dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f16_f32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f16_i16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f16_u16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_f16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_f64_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_i32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_u32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte0_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte1_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte2_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f32_ubyte3_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_f32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_i32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_f64_u32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_flr_i32_f32_e64          dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_i16_f16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_i32_f32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_i32_f64_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_norm_i16_f16_e64         dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_norm_u16_f16_e64         dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_off_f32_i4_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_i16_i32               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_u16_u32               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pk_u8_f32                dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pkaccum_u8_f32           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pknorm_i16_f16           dst, src0, src1                :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pknorm_i16_f32           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pknorm_u16_f16           dst, src0, src1                :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pknorm_u16_f32           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_pkrtz_f16_f32            dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_rpi_i32_f32_e64          dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_u16_f16_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_u32_f32_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_cvt_u32_f64_e64              dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_f16                dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_f32                dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_f64                dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fixup_legacy_f16         dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fmas_f32                 dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_fmas_f64                 dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_div_scale_f32                dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_div_scale_f64                dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_exp_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_exp_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_exp_legacy_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ffbh_i32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ffbh_u32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_ffbl_b32_e64                 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_floor_f16_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_floor_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_floor_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_f16                      dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_f32                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_f64                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fma_legacy_f16               dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fract_f16_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fract_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_fract_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_exp_i16_f16_e64        dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_exp_i32_f32_e64        dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_exp_i32_f64_e64        dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_mant_f16_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_mant_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_frexp_mant_f64_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_mov_f32_e64           dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p1_f32_e64            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p1ll_f16              dst, src0, src1                :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p1lv_f16              dst, src0, src1, src2          :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p2_f16                dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p2_f32_e64            dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_interp_p2_legacy_f16         dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`high<amdgpu_synid_high>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ldexp_f16_e64                dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ldexp_f32                    dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_ldexp_f64                    dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_lerp_u8                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_log_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_log_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_log_legacy_f32_e64           dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_lshl_add_u32                 dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_lshl_or_b32                  dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_lshlrev_b16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshlrev_b32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshlrev_b64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshrrev_b16_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshrrev_b32_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_lshrrev_b64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mac_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mac_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_f16                      dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_f32                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i16                      dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i32_i16                  dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i32_i24                  dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_i64_i32                  dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_legacy_f16               dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_legacy_f32               dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_legacy_i16               dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_legacy_u16               dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u16                      dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u32_u16                  dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u32_u24                  dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mad_u64_u32                  dst0, dst1, src0, src1, src2   :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max3_f16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max3_i16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_max3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_max3_u16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_max3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_max_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_max_i16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_max_i32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_max_u16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_max_u32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mbcnt_hi_u32_b32             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mbcnt_lo_u32_b32             dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_med3_f16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_med3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_med3_i16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_med3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_med3_u16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_med3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min3_f16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min3_f32                     dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min3_i16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_min3_i32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min3_u16                     dst, src0, src1, src2          :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_min3_u32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_min_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_min_i16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_min_i32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_min_u16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_min_u32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mov_b32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_mov_fed_b32_e64              dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_mqsad_pk_u16_u8              dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mqsad_u32_u8                 dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_msad_u8                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_f64                      dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_i32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_i32_i24_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_u32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_hi_u32_u24_e64           dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_i32_i24_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_legacy_f32_e64           dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_mul_lo_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_lo_u32                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_mul_u32_u24_e64              dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_nop_e64                                                     :ref:`omod<amdgpu_synid_omod>`
-    v_not_b32_e64                  dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_or3_b32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_or_b32_e64                   dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_pack_b32_f16                 dst, src0, src1                :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`omod<amdgpu_synid_omod>`
-    v_perm_b32                     dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_qsad_pk_u16_u8               dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_f64_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rcp_iflag_f32_e64            dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_readlane_b32                 dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_rndne_f16_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rndne_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rndne_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_rsq_f64_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_hi_u8                    dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u16                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u32                      dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sad_u8                       dst, src0, src1, src2          :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sat_pk_u8_i16_e64            dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_screen_partition_4se_b32_e64 dst, src0                      :ref:`omod<amdgpu_synid_omod>`
-    v_sin_f16_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sin_f32_e64                  dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sqrt_f16_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sqrt_f32_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sqrt_f64_e64                 dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_co_u32_e64               dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_sub_f16_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_f32_e64                  dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_i16                      dst, src0, src1                :ref:`vop3_op_sel<amdgpu_synid_vop3_op_sel>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_sub_i32                      dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_sub_u16_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_sub_u32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_subb_co_u32_e64              dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_subbrev_co_u32_e64           dst0, dst1, src0, src1, src2   :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_co_u32_e64            dst0, dst1, src0, src1         :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_f16_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_f32_e64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_u16_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_subrev_u32_e64               dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_trig_preop_f64               dst, src0, src1                :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_trunc_f16_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_trunc_f32_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_trunc_f64_e64                dst, src0                      :ref:`clamp<amdgpu_synid_clamp>` :ref:`omod<amdgpu_synid_omod>`
-    v_writelane_b32                dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-    v_xad_u32                      dst, src0, src1, src2          :ref:`omod<amdgpu_synid_omod>`
-    v_xor_b32_e64                  dst, src0, src1                :ref:`omod<amdgpu_synid_omod>`
-
-VOP3P
-===========================
-
-.. parsed-literal::
-
-    v_mad_mix_f32                  dst, src0, src1, src2          :ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>` :ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_mad_mixhi_f16                dst, src0, src1, src2          :ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>` :ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_mad_mixlo_f16                dst, src0, src1, src2          :ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>` :ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_add_f16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_add_i16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_add_u16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_ashrrev_i16               dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
-    v_pk_fma_f16                   dst, src0, src1, src2          :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_lshlrev_b16               dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
-    v_pk_lshrrev_b16               dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
-    v_pk_mad_i16                   dst, src0, src1, src2          :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_mad_u16                   dst, src0, src1, src2          :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_max_f16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_max_i16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
-    v_pk_max_u16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
-    v_pk_min_f16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_min_i16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
-    v_pk_min_u16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
-    v_pk_mul_f16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`neg_lo<amdgpu_synid_neg_lo>` :ref:`neg_hi<amdgpu_synid_neg_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_mul_lo_u16                dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`
-    v_pk_sub_i16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-    v_pk_sub_u16                   dst, src0, src1                :ref:`op_sel<amdgpu_synid_op_sel>` :ref:`op_sel_hi<amdgpu_synid_op_sel_hi>` :ref:`clamp<amdgpu_synid_clamp>`
-
-VOPC
-===========================
-
-.. parsed-literal::
-
-    v_cmp_class_f16                dst, src0, src1
-    v_cmp_class_f16_sdwa           dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_class_f32                dst, src0, src1
-    v_cmp_class_f32_sdwa           dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_class_f64                dst, src0, src1
-    v_cmp_eq_f16                   dst, src0, src1
-    v_cmp_eq_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_f32                   dst, src0, src1
-    v_cmp_eq_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_f64                   dst, src0, src1
-    v_cmp_eq_i16                   dst, src0, src1
-    v_cmp_eq_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_i32                   dst, src0, src1
-    v_cmp_eq_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_i64                   dst, src0, src1
-    v_cmp_eq_u16                   dst, src0, src1
-    v_cmp_eq_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_u32                   dst, src0, src1
-    v_cmp_eq_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_eq_u64                   dst, src0, src1
-    v_cmp_f_f16                    dst, src0, src1
-    v_cmp_f_f16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_f32                    dst, src0, src1
-    v_cmp_f_f32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_f64                    dst, src0, src1
-    v_cmp_f_i16                    dst, src0, src1
-    v_cmp_f_i16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_i32                    dst, src0, src1
-    v_cmp_f_i32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_i64                    dst, src0, src1
-    v_cmp_f_u16                    dst, src0, src1
-    v_cmp_f_u16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_u32                    dst, src0, src1
-    v_cmp_f_u32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_f_u64                    dst, src0, src1
-    v_cmp_ge_f16                   dst, src0, src1
-    v_cmp_ge_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_f32                   dst, src0, src1
-    v_cmp_ge_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_f64                   dst, src0, src1
-    v_cmp_ge_i16                   dst, src0, src1
-    v_cmp_ge_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_i32                   dst, src0, src1
-    v_cmp_ge_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_i64                   dst, src0, src1
-    v_cmp_ge_u16                   dst, src0, src1
-    v_cmp_ge_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_u32                   dst, src0, src1
-    v_cmp_ge_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ge_u64                   dst, src0, src1
-    v_cmp_gt_f16                   dst, src0, src1
-    v_cmp_gt_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_f32                   dst, src0, src1
-    v_cmp_gt_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_f64                   dst, src0, src1
-    v_cmp_gt_i16                   dst, src0, src1
-    v_cmp_gt_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_i32                   dst, src0, src1
-    v_cmp_gt_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_i64                   dst, src0, src1
-    v_cmp_gt_u16                   dst, src0, src1
-    v_cmp_gt_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_u32                   dst, src0, src1
-    v_cmp_gt_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_gt_u64                   dst, src0, src1
-    v_cmp_le_f16                   dst, src0, src1
-    v_cmp_le_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_f32                   dst, src0, src1
-    v_cmp_le_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_f64                   dst, src0, src1
-    v_cmp_le_i16                   dst, src0, src1
-    v_cmp_le_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_i32                   dst, src0, src1
-    v_cmp_le_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_i64                   dst, src0, src1
-    v_cmp_le_u16                   dst, src0, src1
-    v_cmp_le_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_u32                   dst, src0, src1
-    v_cmp_le_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_le_u64                   dst, src0, src1
-    v_cmp_lg_f16                   dst, src0, src1
-    v_cmp_lg_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lg_f32                   dst, src0, src1
-    v_cmp_lg_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lg_f64                   dst, src0, src1
-    v_cmp_lt_f16                   dst, src0, src1
-    v_cmp_lt_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_f32                   dst, src0, src1
-    v_cmp_lt_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_f64                   dst, src0, src1
-    v_cmp_lt_i16                   dst, src0, src1
-    v_cmp_lt_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_i32                   dst, src0, src1
-    v_cmp_lt_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_i64                   dst, src0, src1
-    v_cmp_lt_u16                   dst, src0, src1
-    v_cmp_lt_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_u32                   dst, src0, src1
-    v_cmp_lt_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_lt_u64                   dst, src0, src1
-    v_cmp_ne_i16                   dst, src0, src1
-    v_cmp_ne_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ne_i32                   dst, src0, src1
-    v_cmp_ne_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ne_i64                   dst, src0, src1
-    v_cmp_ne_u16                   dst, src0, src1
-    v_cmp_ne_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ne_u32                   dst, src0, src1
-    v_cmp_ne_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ne_u64                   dst, src0, src1
-    v_cmp_neq_f16                  dst, src0, src1
-    v_cmp_neq_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_neq_f32                  dst, src0, src1
-    v_cmp_neq_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_neq_f64                  dst, src0, src1
-    v_cmp_nge_f16                  dst, src0, src1
-    v_cmp_nge_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nge_f32                  dst, src0, src1
-    v_cmp_nge_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nge_f64                  dst, src0, src1
-    v_cmp_ngt_f16                  dst, src0, src1
-    v_cmp_ngt_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ngt_f32                  dst, src0, src1
-    v_cmp_ngt_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_ngt_f64                  dst, src0, src1
-    v_cmp_nle_f16                  dst, src0, src1
-    v_cmp_nle_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nle_f32                  dst, src0, src1
-    v_cmp_nle_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nle_f64                  dst, src0, src1
-    v_cmp_nlg_f16                  dst, src0, src1
-    v_cmp_nlg_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nlg_f32                  dst, src0, src1
-    v_cmp_nlg_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nlg_f64                  dst, src0, src1
-    v_cmp_nlt_f16                  dst, src0, src1
-    v_cmp_nlt_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nlt_f32                  dst, src0, src1
-    v_cmp_nlt_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_nlt_f64                  dst, src0, src1
-    v_cmp_o_f16                    dst, src0, src1
-    v_cmp_o_f16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_o_f32                    dst, src0, src1
-    v_cmp_o_f32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_o_f64                    dst, src0, src1
-    v_cmp_t_i16                    dst, src0, src1
-    v_cmp_t_i16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_t_i32                    dst, src0, src1
-    v_cmp_t_i32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_t_i64                    dst, src0, src1
-    v_cmp_t_u16                    dst, src0, src1
-    v_cmp_t_u16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_t_u32                    dst, src0, src1
-    v_cmp_t_u32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_t_u64                    dst, src0, src1
-    v_cmp_tru_f16                  dst, src0, src1
-    v_cmp_tru_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_tru_f32                  dst, src0, src1
-    v_cmp_tru_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_tru_f64                  dst, src0, src1
-    v_cmp_u_f16                    dst, src0, src1
-    v_cmp_u_f16_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_u_f32                    dst, src0, src1
-    v_cmp_u_f32_sdwa               dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmp_u_f64                    dst, src0, src1
-    v_cmpx_class_f16               dst, src0, src1
-    v_cmpx_class_f16_sdwa          dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_class_f32               dst, src0, src1
-    v_cmpx_class_f32_sdwa          dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_class_f64               dst, src0, src1
-    v_cmpx_eq_f16                  dst, src0, src1
-    v_cmpx_eq_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_f32                  dst, src0, src1
-    v_cmpx_eq_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_f64                  dst, src0, src1
-    v_cmpx_eq_i16                  dst, src0, src1
-    v_cmpx_eq_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_i32                  dst, src0, src1
-    v_cmpx_eq_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_i64                  dst, src0, src1
-    v_cmpx_eq_u16                  dst, src0, src1
-    v_cmpx_eq_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_u32                  dst, src0, src1
-    v_cmpx_eq_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_eq_u64                  dst, src0, src1
-    v_cmpx_f_f16                   dst, src0, src1
-    v_cmpx_f_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_f32                   dst, src0, src1
-    v_cmpx_f_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_f64                   dst, src0, src1
-    v_cmpx_f_i16                   dst, src0, src1
-    v_cmpx_f_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_i32                   dst, src0, src1
-    v_cmpx_f_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_i64                   dst, src0, src1
-    v_cmpx_f_u16                   dst, src0, src1
-    v_cmpx_f_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_u32                   dst, src0, src1
-    v_cmpx_f_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_f_u64                   dst, src0, src1
-    v_cmpx_ge_f16                  dst, src0, src1
-    v_cmpx_ge_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_f32                  dst, src0, src1
-    v_cmpx_ge_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_f64                  dst, src0, src1
-    v_cmpx_ge_i16                  dst, src0, src1
-    v_cmpx_ge_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_i32                  dst, src0, src1
-    v_cmpx_ge_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_i64                  dst, src0, src1
-    v_cmpx_ge_u16                  dst, src0, src1
-    v_cmpx_ge_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_u32                  dst, src0, src1
-    v_cmpx_ge_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ge_u64                  dst, src0, src1
-    v_cmpx_gt_f16                  dst, src0, src1
-    v_cmpx_gt_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_f32                  dst, src0, src1
-    v_cmpx_gt_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_f64                  dst, src0, src1
-    v_cmpx_gt_i16                  dst, src0, src1
-    v_cmpx_gt_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_i32                  dst, src0, src1
-    v_cmpx_gt_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_i64                  dst, src0, src1
-    v_cmpx_gt_u16                  dst, src0, src1
-    v_cmpx_gt_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_u32                  dst, src0, src1
-    v_cmpx_gt_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_gt_u64                  dst, src0, src1
-    v_cmpx_le_f16                  dst, src0, src1
-    v_cmpx_le_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_f32                  dst, src0, src1
-    v_cmpx_le_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_f64                  dst, src0, src1
-    v_cmpx_le_i16                  dst, src0, src1
-    v_cmpx_le_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_i32                  dst, src0, src1
-    v_cmpx_le_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_i64                  dst, src0, src1
-    v_cmpx_le_u16                  dst, src0, src1
-    v_cmpx_le_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_u32                  dst, src0, src1
-    v_cmpx_le_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_le_u64                  dst, src0, src1
-    v_cmpx_lg_f16                  dst, src0, src1
-    v_cmpx_lg_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lg_f32                  dst, src0, src1
-    v_cmpx_lg_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lg_f64                  dst, src0, src1
-    v_cmpx_lt_f16                  dst, src0, src1
-    v_cmpx_lt_f16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_f32                  dst, src0, src1
-    v_cmpx_lt_f32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_f64                  dst, src0, src1
-    v_cmpx_lt_i16                  dst, src0, src1
-    v_cmpx_lt_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_i32                  dst, src0, src1
-    v_cmpx_lt_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_i64                  dst, src0, src1
-    v_cmpx_lt_u16                  dst, src0, src1
-    v_cmpx_lt_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_u32                  dst, src0, src1
-    v_cmpx_lt_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_lt_u64                  dst, src0, src1
-    v_cmpx_ne_i16                  dst, src0, src1
-    v_cmpx_ne_i16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ne_i32                  dst, src0, src1
-    v_cmpx_ne_i32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ne_i64                  dst, src0, src1
-    v_cmpx_ne_u16                  dst, src0, src1
-    v_cmpx_ne_u16_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ne_u32                  dst, src0, src1
-    v_cmpx_ne_u32_sdwa             dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ne_u64                  dst, src0, src1
-    v_cmpx_neq_f16                 dst, src0, src1
-    v_cmpx_neq_f16_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_neq_f32                 dst, src0, src1
-    v_cmpx_neq_f32_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_neq_f64                 dst, src0, src1
-    v_cmpx_nge_f16                 dst, src0, src1
-    v_cmpx_nge_f16_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nge_f32                 dst, src0, src1
-    v_cmpx_nge_f32_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nge_f64                 dst, src0, src1
-    v_cmpx_ngt_f16                 dst, src0, src1
-    v_cmpx_ngt_f16_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ngt_f32                 dst, src0, src1
-    v_cmpx_ngt_f32_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_ngt_f64                 dst, src0, src1
-    v_cmpx_nle_f16                 dst, src0, src1
-    v_cmpx_nle_f16_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nle_f32                 dst, src0, src1
-    v_cmpx_nle_f32_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nle_f64                 dst, src0, src1
-    v_cmpx_nlg_f16                 dst, src0, src1
-    v_cmpx_nlg_f16_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nlg_f32                 dst, src0, src1
-    v_cmpx_nlg_f32_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nlg_f64                 dst, src0, src1
-    v_cmpx_nlt_f16                 dst, src0, src1
-    v_cmpx_nlt_f16_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nlt_f32                 dst, src0, src1
-    v_cmpx_nlt_f32_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_nlt_f64                 dst, src0, src1
-    v_cmpx_o_f16                   dst, src0, src1
-    v_cmpx_o_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_o_f32                   dst, src0, src1
-    v_cmpx_o_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_o_f64                   dst, src0, src1
-    v_cmpx_t_i16                   dst, src0, src1
-    v_cmpx_t_i16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_t_i32                   dst, src0, src1
-    v_cmpx_t_i32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_t_i64                   dst, src0, src1
-    v_cmpx_t_u16                   dst, src0, src1
-    v_cmpx_t_u16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_t_u32                   dst, src0, src1
-    v_cmpx_t_u32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_t_u64                   dst, src0, src1
-    v_cmpx_tru_f16                 dst, src0, src1
-    v_cmpx_tru_f16_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_tru_f32                 dst, src0, src1
-    v_cmpx_tru_f32_sdwa            dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_tru_f64                 dst, src0, src1
-    v_cmpx_u_f16                   dst, src0, src1
-    v_cmpx_u_f16_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_u_f32                   dst, src0, src1
-    v_cmpx_u_f32_sdwa              dst, src0, src1                :ref:`src0_sel<amdgpu_synid_src0_sel>` :ref:`src1_sel<amdgpu_synid_src1_sel>`
-    v_cmpx_u_f64                   dst, src0, src1
diff --git a/docs/AMDGPUInstructionNotation.rst b/docs/AMDGPUInstructionNotation.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2b41d5b81949f4a478ac6e625a5eb5951e2d0cbf
--- /dev/null
+++ b/docs/AMDGPUInstructionNotation.rst
@@ -0,0 +1,110 @@
+============================
+AMDGPU Instructions Notation
+============================
+
+.. contents::
+   :local:
+
+.. _amdgpu_syn_instruction_notation:
+
+Introduction
+============
+
+This is an overview of notation used to describe the syntax of AMDGPU assembler instructions.
+
+This notation mimics the :ref:`syntax of assembler instructions<amdgpu_syn_instructions>`
+except that instead of real operands and modifiers it provides references to their description.
+
+Instructions
+============
+
+Notation
+~~~~~~~~
+
+This is the notation used to describe AMDGPU instructions:
+
+    ``<``\ :ref:`opcode description<amdgpu_syn_opcode_notation>`\ ``>  <``\ :ref:`operands description<amdgpu_syn_instruction_operands_notation>`\ ``>  <``\ :ref:`modifiers description<amdgpu_syn_instruction_modifiers_notation>`\ ``>``
+
+.. _amdgpu_syn_opcode_notation:
+
+Opcode
+======
+
+Notation
+~~~~~~~~
+
+TBD
+
+.. _amdgpu_syn_instruction_operands_notation:
+
+Operands
+========
+
+An instruction may have zero or more *operands*. They are comma-separated in the description:
+
+    ``<``\ :ref:`description of operand 0<amdgpu_syn_instruction_operand_notation>`\ ``>, <``\ :ref:`description of operand 1<amdgpu_syn_instruction_operand_notation>`\ ``>, ...``
+
+The order of *operands* is fixed. *Operands* cannot be omitted
+except for special cases described below.
+
+.. _amdgpu_syn_instruction_operand_notation:
+
+Notation
+~~~~~~~~
+
+An operand is described using the following notation:
+
+    *<name><tag0><tag1>...*
+
+Where:
+
+* *name* is a link to a description of the operand.
+* *tags* are optional. They are used to indicate special operand properties:
+
+.. _amdgpu_syn_instruction_operand_tags:
+
+    ============== =================================================================================
+    Operand tag    Meaning
+    ============== =================================================================================
+    :opt           An optional operand.
+    :m             An operand which may be used with
+                   :ref:`VOP3 operand modifiers<amdgpu_synid_vop3_operand_modifiers>` or
+                   :ref:`SDWA operand modifiers<amdgpu_synid_sdwa_operand_modifiers>`.
+    :dst           An input operand which may also serve as a destination
+                   if :ref:`glc<amdgpu_synid_glc>` modifier is specified.
+    :fx            This is an *f32* or *f16* operand depending on
+                   :ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>` modifier.
+    :<type>        Operand *type* differs from *type*
+                   :ref:`implied by the opcode name<amdgpu_syn_instruction_type>`.
+                   This tag specifies actual operand *type*.
+    ============== =================================================================================
+
+Examples:
+
+.. parsed-literal::
+
+    src1:m             // src1 operand may be used with operand modifiers
+    vdata:dst          // vdata operand may be used as both source and destination
+    vdst:u32           // vdst operand has u32 type
+
+.. _amdgpu_syn_instruction_modifiers_notation:
+
+Modifiers
+=========
+
+An instruction may have zero or more optional *modifiers*. They are space-separated in the description:
+
+    ``<``\ :ref:`description of modifier 0<amdgpu_syn_instruction_modifier_notation>`\ ``> <``\ :ref:`description of modifier 1<amdgpu_syn_instruction_modifier_notation>`\ ``> ...``
+
+The order of *modifiers* is fixed.
+
+.. _amdgpu_syn_instruction_modifier_notation:
+
+Notation
+~~~~~~~~
+
+A *modifier* is described using the following notation:
+
+    *<name>*
+
+Where *name* is a link to a description of the *modifier*.
diff --git a/docs/AMDGPUInstructionSyntax.rst b/docs/AMDGPUInstructionSyntax.rst
new file mode 100644
index 0000000000000000000000000000000000000000..90ad54aade839479e026d730ef85805a8736c30c
--- /dev/null
+++ b/docs/AMDGPUInstructionSyntax.rst
@@ -0,0 +1,170 @@
+=========================
+AMDGPU Instruction Syntax
+=========================
+
+.. contents::
+   :local:
+
+.. _amdgpu_syn_instructions:
+
+Instructions
+============
+
+Syntax
+~~~~~~
+
+An instruction has the following syntax:
+
+    ``<``\ *opcode mnemonic*\ ``>    <``\ *operand0*\ ``>, <``\ *operand1*\ ``>,...    <``\ *modifier0*\ ``> <``\ *modifier1*\ ``>...``
+
+:doc:`Operands<AMDGPUOperandSyntax>` are normally comma-separated while
+:doc:`modifiers<AMDGPUModifierSyntax>` are space-separated.
+
+The order of *operands* and *modifiers* is fixed.
+Most *modifiers* are optional and may be omitted.
+
+.. _amdgpu_syn_instruction_mnemo:
+
+Opcode Mnemonic
+~~~~~~~~~~~~~~~
+
+Opcode mnemonic describes opcode semantics and may include one or more suffices in this order:
+
+* :ref:`Destination operand type suffix<amdgpu_syn_instruction_type>`.
+* :ref:`Source operand type suffix<amdgpu_syn_instruction_type>`.
+* :ref:`Encoding suffix<amdgpu_syn_instruction_enc>`.
+
+.. _amdgpu_syn_instruction_type:
+
+Type and Size Suffices
+~~~~~~~~~~~~~~~~~~~~~~
+
+Instructions which operate with data have an implied type of *data* operands.
+This data type is specified as a suffix of instruction mnemonic.
+
+There are instructions which have 2 type suffices:
+the first is the data type of the destination operand,
+the second is the data type of source *data* operand(s).
+
+Note that data type specified by an instruction does not apply
+to other kinds of operands such as *addresses*, *offsets* and so on.
+
+The following table enumerates the most frequently used type suffices.
+
+    ============================================ ======================= =================
+    Type Suffices                                Packed instruction?     Data Type
+    ============================================ ======================= =================
+    _b512, _b256, _b128, _b64, _b32, _b16, _b8   No                      Bits.
+    _u64, _u32, _u16, _u8                        No                      Unsigned integer.
+    _i64, _i32, _i16, _i8                        No                      Signed integer.
+    _f64, _f32, _f16                             No                      Floating-point.
+    _b16, _u16, _i16, _f16                       Yes                     Packed.
+    ============================================ ======================= =================
+
+Instructions which have no type suffices are assumed to operate with typeless data.
+The size of data is specified by size suffices:
+
+    ================= =================== =====================================
+    Size Suffix       Implied data type   Required register size in dwords
+    ================= =================== =====================================
+    \-                b32                 1
+    x2                b64                 2
+    x3                b96                 3
+    x4                b128                4
+    x8                b256                8
+    x16               b512                16
+    x                 b32                 1
+    xy                b64                 2
+    xyz               b96                 3
+    xyzw              b128                4
+    d16_x             b16                 1
+    d16_xy            b16x2               2 for GFX8.0, 1 for GFX8.1 and GFX9
+    d16_xyz           b16x3               3 for GFX8.0, 2 for GFX8.1 and GFX9
+    d16_xyzw          b16x4               4 for GFX8.0, 2 for GFX8.1 and GFX9
+    ================= =================== =====================================
+
+.. WARNING::
+    There are exceptions from rules described above.
+    Operands which have type different from type specified by the opcode are
+    :ref:`tagged<amdgpu_syn_instruction_operand_tags>` in the description.
+
+Examples of instructions with different types of source and destination operands:
+
+.. parsed-literal::
+
+    s_bcnt0_i32_b64
+    v_cvt_f32_u32
+
+Examples of instructions with one data type:
+
+.. parsed-literal::
+
+    v_max3_f32
+    v_max3_i16
+
+Examples of instructions which operate with packed data:
+
+.. parsed-literal::
+
+    v_pk_add_u16
+    v_pk_add_i16
+    v_pk_add_f16
+
+Examples of typeless instructions which operate on b128 data:
+
+.. parsed-literal::
+
+    buffer_store_dwordx4
+    flat_load_dwordx4
+
+.. _amdgpu_syn_instruction_enc:
+
+Encoding Suffices
+~~~~~~~~~~~~~~~~~
+
+Most *VOP1*, *VOP2* and *VOPC* instructions have several variants:
+they may also be encoded in *VOP3*, *DPP* and *SDWA* formats.
+
+The assembler will automatically use optimal encoding based on instruction operands.
+To force specific encoding, one can add a suffix to the opcode of the instruction:
+
+    =================================================== =================
+    Encoding                                            Encoding Suffix
+    =================================================== =================
+    Native 32-bit encoding (*VOP1*, *VOP2* or *VOPC*)   _e32
+    *VOP3* (64-bit) encoding                            _e64
+    *DPP* encoding                                      _dpp
+    *SDWA* encoding                                     _sdwa
+    =================================================== =================
+
+These suffices are used in this reference to indicate the assumed encoding.
+When no suffix is specified, a native encoding is implied.
+
+Operands
+========
+
+Syntax
+~~~~~~
+
+Syntax of most operands is described :doc:`in this document<AMDGPUOperandSyntax>`.
+
+For detailed information about operands follow *operand links* in GPU-specific documents:
+
+* :doc:`GFX7<AMDGPU/AMDGPUAsmGFX7>`
+* :doc:`GFX8<AMDGPU/AMDGPUAsmGFX8>`
+* :doc:`GFX9<AMDGPU/AMDGPUAsmGFX9>`
+
+Modifiers
+=========
+
+Syntax
+~~~~~~
+
+Syntax of modifiers is described :doc:`in this document<AMDGPUModifierSyntax>`.
+
+Information about modifiers supported for individual instructions may be found in GPU-specific documents:
+
+* :doc:`GFX7<AMDGPU/AMDGPUAsmGFX7>`
+* :doc:`GFX8<AMDGPU/AMDGPUAsmGFX8>`
+* :doc:`GFX9<AMDGPU/AMDGPUAsmGFX9>`
+
diff --git a/docs/AMDGPUModifierSyntax.rst b/docs/AMDGPUModifierSyntax.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e2b8bb3f952a7c6d05c97db09c142db0cfe67ad7
--- /dev/null
+++ b/docs/AMDGPUModifierSyntax.rst
@@ -0,0 +1,1248 @@
+======================================
+Syntax of AMDGPU Instruction Modifiers
+======================================
+
+.. contents::
+   :local:
+
+Conventions
+===========
+
+The following notation is used throughout this document:
+
+    =================== =============================================================
+    Notation            Description
+    =================== =============================================================
+    {0..N}              Any integer value in the range from 0 to N (inclusive).
+    <x>                 Syntax and meaning of *x* is explained elsewhere.
+    =================== =============================================================
+
+.. _amdgpu_syn_modifiers:
+
+Modifiers
+=========
+
+DS Modifiers
+------------
+
+.. _amdgpu_synid_ds_offset8:
+
+ds_offset8
+~~~~~~~~~~
+
+Specifies an immediate unsigned 8-bit offset, in bytes. The default value is 0.
+
+Used with DS instructions which have 2 addresses.
+
+    =================== =====================================================
+    Syntax              Description
+    =================== =====================================================
+    offset:{0..0xFF}    Specifies an unsigned 8-bit offset as a positive
+                        :ref:`integer number <amdgpu_synid_integer_number>`.
+    =================== =====================================================
+
+Examples:
+
+.. parsed-literal::
+
+  offset:255
+  offset:0xff
+
+.. _amdgpu_synid_ds_offset16:
+
+ds_offset16
+~~~~~~~~~~~
+
+Specifies an immediate unsigned 16-bit offset, in bytes. The default value is 0.
+
+Used with DS instructions which have 1 address.
+
+    ==================== ======================================================
+    Syntax               Description
+    ==================== ======================================================
+    offset:{0..0xFFFF}   Specifies an unsigned 16-bit offset as a positive
+                         :ref:`integer number <amdgpu_synid_integer_number>`.
+    ==================== ======================================================
+
+Examples:
+
+.. parsed-literal::
+
+  offset:65535
+  offset:0xffff
+
+.. _amdgpu_synid_sw_offset16:
+
+sw_offset16
+~~~~~~~~~~~
+
+This is a special modifier which may be used with *ds_swizzle_b32* instruction only.
+It specifies a swizzle pattern in numeric or symbolic form. The default value is 0.
+
+See AMD documentation for more information.
+
+    ======================================================= ===========================================================
+    Syntax                                                  Description
+    ======================================================= ===========================================================
+    offset:{0..0xFFFF}                                      Specifies a 16-bit swizzle pattern.
+    offset:swizzle(QUAD_PERM,{0..3},{0..3},{0..3},{0..3})   Specifies a quad permute mode pattern
+
+                                                            Each number is a lane *id*.
+    offset:swizzle(BITMASK_PERM, "<mask>")                  Specifies a bitmask permute mode pattern.
+
+                                                            The pattern converts a 5-bit lane *id* to another
+                                                            lane *id* with which the lane interacts.
+
+                                                            *mask* is a 5 character sequence which
+                                                            specifies how to transform the bits of the
+                                                            lane *id*. 
+
+                                                            The following characters are allowed:
+
+                                                            * "0" - set bit to 0.
+
+                                                            * "1" - set bit to 1.
+
+                                                            * "p" - preserve bit.
+
+                                                            * "i" - inverse bit.
+
+    offset:swizzle(BROADCAST,{2..32},{0..N})                Specifies a broadcast mode.
+
+                                                            Broadcasts the value of any particular lane to
+                                                            all lanes in its group.
+
+                                                            The first numeric parameter is a group
+                                                            size and must be equal to 2, 4, 8, 16 or 32.
+
+                                                            The second numeric parameter is an index of the
+                                                            lane being broadcasted. 
+
+                                                            The index must not exceed group size.
+    offset:swizzle(SWAP,{1..16})                            Specifies a swap mode.
+
+                                                            Swaps the neighboring groups of
+                                                            1, 2, 4, 8 or 16 lanes.
+    offset:swizzle(REVERSE,{2..32})                         Specifies a reverse mode.
+
+                                                            Reverses the lanes for groups of 2, 4, 8, 16 or 32 lanes.
+    ======================================================= ===========================================================
+
+Numeric parameters may be specified as either :ref:`integer numbers<amdgpu_synid_integer_number>` or
+:ref:`absolute expressions<amdgpu_synid_absolute_expression>`.
+
+Examples:
+
+.. parsed-literal::
+
+  offset:255
+  offset:0xffff
+  offset:swizzle(QUAD_PERM, 0, 1, 2 ,3)
+  offset:swizzle(BITMASK_PERM, "01pi0")
+  offset:swizzle(BROADCAST, 2, 0)
+  offset:swizzle(SWAP, 8)
+  offset:swizzle(REVERSE, 30 + 2)
+
+.. _amdgpu_synid_gds:
+
+gds
+~~~
+
+Specifies whether to use GDS or LDS memory (LDS is the default).
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    gds                                      Use GDS memory.
+    ======================================== ================================================
+
+
+EXP Modifiers
+-------------
+
+.. _amdgpu_synid_done:
+
+done
+~~~~
+
+Specifies if this is the last export from the shader to the target. By default, current
+instruction does not finish an export sequence.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    done                                     Indicates the last export operation.
+    ======================================== ================================================
+
+.. _amdgpu_synid_compr:
+
+compr
+~~~~~
+
+Indicates if the data are compressed (data are not compressed by default).
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    compr                                    Data are compressed.
+    ======================================== ================================================
+
+.. _amdgpu_synid_vm:
+
+vm
+~~
+
+Specifies valid mask flag state (off by default).
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    vm                                       Set valid mask flag.
+    ======================================== ================================================
+
+FLAT Modifiers
+--------------
+
+.. _amdgpu_synid_flat_offset12:
+
+flat_offset12
+~~~~~~~~~~~~~
+
+Specifies an immediate unsigned 12-bit offset, in bytes. The default value is 0.
+
+Cannot be used with *global/scratch* opcodes. GFX9 only.
+
+    ================= ======================================================
+    Syntax            Description
+    ================= ======================================================
+    offset:{0..4095}  Specifies a 12-bit unsigned offset as a positive
+                      :ref:`integer number <amdgpu_synid_integer_number>`.
+    ================= ======================================================
+
+Examples:
+
+.. parsed-literal::
+
+  offset:4095
+  offset:0xff
+
+.. _amdgpu_synid_flat_offset13:
+
+flat_offset13
+~~~~~~~~~~~~~
+
+Specifies an immediate signed 13-bit offset, in bytes. The default value is 0.
+
+Can be used with *global/scratch* opcodes only. GFX9 only.
+
+    ============================ =======================================================
+    Syntax                       Description
+    ============================ =======================================================
+    offset:{-4096..+4095}        Specifies a 13-bit signed offset as an
+                                 :ref:`integer number <amdgpu_synid_integer_number>`.
+    ============================ =======================================================
+
+Examples:
+
+.. parsed-literal::
+
+  offset:-4000
+  offset:0x10
+
+glc
+~~~
+
+See a description :ref:`here<amdgpu_synid_glc>`.
+
+slc
+~~~
+
+See a description :ref:`here<amdgpu_synid_slc>`.
+
+tfe
+~~~
+
+See a description :ref:`here<amdgpu_synid_tfe>`.
+
+nv
+~~
+
+See a description :ref:`here<amdgpu_synid_nv>`.
+
+MIMG Modifiers
+--------------
+
+.. _amdgpu_synid_dmask:
+
+dmask
+~~~~~
+
+Specifies which channels (image components) are used by the operation. By default, no channels
+are used.
+
+    =============== =====================================================
+    Syntax          Description
+    =============== =====================================================
+    dmask:{0..15}   Specifies image channels as a positive
+                    :ref:`integer number <amdgpu_synid_integer_number>`.
+
+                    Each bit corresponds to one of 4 image
+                    components (RGBA).
+
+                    If the specified bit value
+                    is 0, the component is not used, value 1 means
+                    that the component is used.
+    =============== =====================================================
+
+This modifier has some limitations depending on instruction kind:
+
+    =================================================== ========================
+    Instruction Kind                                    Valid dmask Values
+    =================================================== ========================
+    32-bit atomic *cmpswap*                             0x3
+    32-bit atomic instructions except for *cmpswap*     0x1
+    64-bit atomic *cmpswap*                             0xF
+    64-bit atomic instructions except for *cmpswap*     0x3
+    *gather4*                                           0x1, 0x2, 0x4, 0x8
+    Other instructions                                  any value
+    =================================================== ========================
+
+Examples:
+
+.. parsed-literal::
+
+  dmask:0xf
+  dmask:0b1111
+  dmask:3
+
+.. _amdgpu_synid_unorm:
+
+unorm
+~~~~~
+
+Specifies whether the address is normalized or not (the address is normalized by default).
+
+    ======================== ========================================
+    Syntax                   Description
+    ======================== ========================================
+    unorm                    Force the address to be unnormalized.
+    ======================== ========================================
+
+glc
+~~~
+
+See a description :ref:`here<amdgpu_synid_glc>`.
+
+slc
+~~~
+
+See a description :ref:`here<amdgpu_synid_slc>`.
+
+.. _amdgpu_synid_r128:
+
+r128
+~~~~
+
+Specifies texture resource size. The default size is 256 bits.
+
+GFX7 and GFX8 only.
+
+    =================== ================================================
+    Syntax              Description
+    =================== ================================================
+    r128                Specifies 128 bits texture resource size.
+    =================== ================================================
+
+.. WARNING:: Using this modifier should descrease *rsrc* register size from 8 to 4 dwords, but assembler does not currently support this feature.
+
+tfe
+~~~
+
+See a description :ref:`here<amdgpu_synid_tfe>`.
+
+.. _amdgpu_synid_lwe:
+
+lwe
+~~~
+
+Specifies LOD warning status (LOD warning is disabled by default).
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    lwe                                      Enables LOD warning.
+    ======================================== ================================================
+
+.. _amdgpu_synid_da:
+
+da
+~~
+
+Specifies if an array index must be sent to TA. By default, array index is not sent.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    da                                       Send an array-index to TA.
+    ======================================== ================================================
+
+.. _amdgpu_synid_d16:
+
+d16
+~~~
+
+Specifies data size: 16 or 32 bits (32 bits by default). Not supported by GFX7.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    d16                                      Enables 16-bits data mode.
+
+                                             On loads, convert data in memory to 16-bit
+                                             format before storing it in VGPRs.
+
+                                             For stores, convert 16-bit data in VGPRs to
+                                             32 bits before going to memory.
+
+                                             Note that GFX8.0 does not support data packing.
+                                             Each 16-bit data element occupies 1 VGPR.
+
+                                             GFX8.1 and GFX9 support data packing.
+                                             Each pair of 16-bit data elements 
+                                             occupies 1 VGPR.
+    ======================================== ================================================
+
+.. _amdgpu_synid_a16:
+
+a16
+~~~
+
+Specifies size of image address components: 16 or 32 bits (32 bits by default). GFX9 only.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    a16                                      Enables 16-bits image address components.
+    ======================================== ================================================
+
+Miscellaneous Modifiers
+-----------------------
+
+.. _amdgpu_synid_glc:
+
+glc
+~~~
+
+This modifier has different meaning for loads, stores, and atomic operations.
+The default value is off (0).
+
+See AMD documentation for details.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    glc                                      Set glc bit to 1.
+    ======================================== ================================================
+
+.. _amdgpu_synid_slc:
+
+slc
+~~~
+
+Specifies cache policy. The default value is off (0).
+
+See AMD documentation for details.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    slc                                      Set slc bit to 1.
+    ======================================== ================================================
+
+.. _amdgpu_synid_tfe:
+
+tfe
+~~~
+
+Controls access to partially resident textures. The default value is off (0).
+
+See AMD documentation for details.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    tfe                                      Set tfe bit to 1.
+    ======================================== ================================================
+
+.. _amdgpu_synid_nv:
+
+nv
+~~
+
+Specifies if instruction is operating on non-volatile memory. By default, memory is volatile.
+
+GFX9 only.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    nv                                       Indicates that instruction operates on
+                                             non-volatile memory.
+    ======================================== ================================================
+
+MUBUF/MTBUF Modifiers
+---------------------
+
+.. _amdgpu_synid_idxen:
+
+idxen
+~~~~~
+
+Specifies whether address components include an index. By default, no components are used.
+
+Can be used together with :ref:`offen<amdgpu_synid_offen>`.
+
+Cannot be used with :ref:`addr64<amdgpu_synid_addr64>`.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    idxen                                    Address components include an index.
+    ======================================== ================================================
+
+.. _amdgpu_synid_offen:
+
+offen
+~~~~~
+
+Specifies whether address components include an offset. By default, no components are used.
+
+Can be used together with :ref:`idxen<amdgpu_synid_idxen>`.
+
+Cannot be used with :ref:`addr64<amdgpu_synid_addr64>`.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    offen                                    Address components include an offset.
+    ======================================== ================================================
+
+.. _amdgpu_synid_addr64:
+
+addr64
+~~~~~~
+
+Specifies whether a 64-bit address is used. By default, no address is used.
+
+GFX7 only. Cannot be used with :ref:`offen<amdgpu_synid_offen>` and
+:ref:`idxen<amdgpu_synid_idxen>` modifiers.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    addr64                                   A 64-bit address is used.
+    ======================================== ================================================
+
+.. _amdgpu_synid_buf_offset12:
+
+buf_offset12
+~~~~~~~~~~~~
+
+Specifies an immediate unsigned 12-bit offset, in bytes. The default value is 0.
+
+    =============================== ======================================================
+    Syntax                          Description
+    =============================== ======================================================
+    offset:{0..0xFFF}               Specifies a 12-bit unsigned offset as a positive
+                                    :ref:`integer number <amdgpu_synid_integer_number>`.
+    =============================== ======================================================
+
+Examples:
+
+.. parsed-literal::
+
+  offset:0
+  offset:0x10
+
+glc
+~~~
+
+See a description :ref:`here<amdgpu_synid_glc>`.
+
+slc
+~~~
+
+See a description :ref:`here<amdgpu_synid_slc>`.
+
+.. _amdgpu_synid_lds:
+
+lds
+~~~
+
+Specifies where to store the result: VGPRs or LDS (VGPRs by default).
+
+    ======================================== ===========================
+    Syntax                                   Description
+    ======================================== ===========================
+    lds                                      Store result in LDS.
+    ======================================== ===========================
+
+tfe
+~~~
+
+See a description :ref:`here<amdgpu_synid_tfe>`.
+
+.. _amdgpu_synid_dfmt:
+
+dfmt
+~~~~
+
+TBD
+
+.. _amdgpu_synid_nfmt:
+
+nfmt
+~~~~
+
+TBD
+
+SMRD/SMEM Modifiers
+-------------------
+
+glc
+~~~
+
+See a description :ref:`here<amdgpu_synid_glc>`.
+
+nv
+~~
+
+See a description :ref:`here<amdgpu_synid_nv>`.
+
+VINTRP Modifiers
+----------------
+
+.. _amdgpu_synid_high:
+
+high
+~~~~
+
+Specifies which half of the LDS word to use. Low half of LDS word is used by default.
+GFX9 only.
+
+    ======================================== ================================
+    Syntax                                   Description
+    ======================================== ================================
+    high                                     Use high half of LDS word.
+    ======================================== ================================
+
+VOP1/VOP2 DPP Modifiers
+-----------------------
+
+GFX8 and GFX9 only.
+
+.. _amdgpu_synid_dpp_ctrl:
+
+dpp_ctrl
+~~~~~~~~
+
+Specifies how data are shared between threads. This is a mandatory modifier.
+There is no default value.
+
+Note. The lanes of a wavefront are organized in four banks and four rows.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    quad_perm:[{0..3},{0..3},{0..3},{0..3}]  Full permute of 4 threads.
+    row_mirror                               Mirror threads within row.
+    row_half_mirror                          Mirror threads within 1/2 row (8 threads).
+    row_bcast:15                             Broadcast 15th thread of each row to next row.
+    row_bcast:31                             Broadcast thread 31 to rows 2 and 3.
+    wave_shl:1                               Wavefront left shift by 1 thread.
+    wave_rol:1                               Wavefront left rotate by 1 thread.
+    wave_shr:1                               Wavefront right shift by 1 thread.
+    wave_ror:1                               Wavefront right rotate by 1 thread.
+    row_shl:{1..15}                          Row shift left by 1-15 threads.
+    row_shr:{1..15}                          Row shift right by 1-15 threads.
+    row_ror:{1..15}                          Row rotate right by 1-15 threads.
+    ======================================== ================================================
+
+Note: Numeric parameters may be specified as either
+:ref:`integer numbers<amdgpu_synid_integer_number>` or
+:ref:`absolute expressions<amdgpu_synid_absolute_expression>`.
+
+Examples:
+
+.. parsed-literal::
+
+  quad_perm:[0, 1, 2, 3]
+  row_shl:3
+
+.. _amdgpu_synid_row_mask:
+
+row_mask
+~~~~~~~~
+
+Controls which rows are enabled for data sharing. By default, all rows are enabled.
+
+Note. The lanes of a wavefront are organized in four banks and four rows.
+
+    ======================================== =====================================================
+    Syntax                                   Description
+    ======================================== =====================================================
+    row_mask:{0..15}                         Specifies a *row mask* as a positive
+                                             :ref:`integer number <amdgpu_synid_integer_number>`.
+
+                                             Each of 4 bits in the mask controls one
+                                             row (0 - disabled, 1 - enabled).
+    ======================================== =====================================================
+
+Examples:
+
+.. parsed-literal::
+
+  row_mask:0xf
+  row_mask:0b1010
+  row_mask:0b1111
+
+.. _amdgpu_synid_bank_mask:
+
+bank_mask
+~~~~~~~~~
+
+Controls which banks are enabled for data sharing. By default, all banks are enabled.
+
+Note. The lanes of a wavefront are organized in four banks and four rows.
+
+    ======================================== =======================================================
+    Syntax                                   Description
+    ======================================== =======================================================
+    bank_mask:{0..15}                        Specifies a *bank mask* as a positive
+                                             :ref:`integer number <amdgpu_synid_integer_number>`.
+
+                                             Each of 4 bits in the mask controls one
+                                             bank (0 - disabled, 1 - enabled).
+    ======================================== =======================================================
+
+Examples:
+
+.. parsed-literal::
+
+  bank_mask:0x3
+  bank_mask:0b0011
+  bank_mask:0b1111
+
+.. _amdgpu_synid_bound_ctrl:
+
+bound_ctrl
+~~~~~~~~~~
+
+Controls data sharing when accessing an invalid lane. By default, data sharing with
+invalid lanes is disabled.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    bound_ctrl:0                             Enables data sharing with invalid lanes.
+
+                                             Accessing data from an invalid lane will
+                                             return zero.
+    ======================================== ================================================
+
+VOP1/VOP2/VOPC SDWA Modifiers
+-----------------------------
+
+GFX8 and GFX9 only.
+
+clamp
+~~~~~
+
+See a description :ref:`here<amdgpu_synid_clamp>`.
+
+omod
+~~~~
+
+See a description :ref:`here<amdgpu_synid_omod>`.
+
+GFX9 only.
+
+.. _amdgpu_synid_dst_sel:
+
+dst_sel
+~~~~~~~
+
+Selects which bits in the destination are affected. By default, all bits are affected.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    dst_sel:DWORD                            Use bits 31:0.
+    dst_sel:BYTE_0                           Use bits 7:0.
+    dst_sel:BYTE_1                           Use bits 15:8.
+    dst_sel:BYTE_2                           Use bits 23:16.
+    dst_sel:BYTE_3                           Use bits 31:24.
+    dst_sel:WORD_0                           Use bits 15:0.
+    dst_sel:WORD_1                           Use bits 31:16.
+    ======================================== ================================================
+
+
+.. _amdgpu_synid_dst_unused:
+
+dst_unused
+~~~~~~~~~~
+
+Controls what to do with the bits in the destination which are not selected
+by :ref:`dst_sel<amdgpu_synid_dst_sel>`.
+By default, unused bits are preserved.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    dst_unused:UNUSED_PAD                    Pad with zeros.
+    dst_unused:UNUSED_SEXT                   Sign-extend upper bits, zero lower bits.
+    dst_unused:UNUSED_PRESERVE               Preserve bits.
+    ======================================== ================================================
+
+.. _amdgpu_synid_src0_sel:
+
+src0_sel
+~~~~~~~~
+
+Controls which bits in the src0 are used. By default, all bits are used.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    src0_sel:DWORD                           Use bits 31:0.
+    src0_sel:BYTE_0                          Use bits 7:0.
+    src0_sel:BYTE_1                          Use bits 15:8.
+    src0_sel:BYTE_2                          Use bits 23:16.
+    src0_sel:BYTE_3                          Use bits 31:24.
+    src0_sel:WORD_0                          Use bits 15:0.
+    src0_sel:WORD_1                          Use bits 31:16.
+    ======================================== ================================================
+
+.. _amdgpu_synid_src1_sel:
+
+src1_sel
+~~~~~~~~
+
+Controls which bits in the src1 are used. By default, all bits are used.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    src1_sel:DWORD                           Use bits 31:0.
+    src1_sel:BYTE_0                          Use bits 7:0.
+    src1_sel:BYTE_1                          Use bits 15:8.
+    src1_sel:BYTE_2                          Use bits 23:16.
+    src1_sel:BYTE_3                          Use bits 31:24.
+    src1_sel:WORD_0                          Use bits 15:0.
+    src1_sel:WORD_1                          Use bits 31:16.
+    ======================================== ================================================
+
+.. _amdgpu_synid_sdwa_operand_modifiers:
+
+VOP1/VOP2/VOPC SDWA Operand Modifiers
+-------------------------------------
+
+Operand modifiers are not used separately. They are applied to source operands.
+
+GFX8 and GFX9 only.
+
+abs
+~~~
+
+See a description :ref:`here<amdgpu_synid_abs>`.
+
+neg
+~~~
+
+See a description :ref:`here<amdgpu_synid_neg>`.
+
+.. _amdgpu_synid_sext:
+
+sext
+~~~~
+
+Sign-extends value of a (sub-dword) operand to fill all 32 bits.
+Has no effect for 32-bit operands.
+
+Valid for integer operands only.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    sext(<operand>)                          Sign-extend operand value.
+    ======================================== ================================================
+
+Examples:
+
+.. parsed-literal::
+
+  sext(v4)
+  sext(v255)
+
+VOP3 Modifiers
+--------------
+
+.. _amdgpu_synid_vop3_op_sel:
+
+vop3_op_sel
+~~~~~~~~~~~
+
+Selects the low [15:0] or high [31:16] operand bits for source and destination operands.
+By default, low bits are used for all operands.
+
+The number of values specified with the op_sel modifier must match the number of instruction
+operands (both source and destination). First value controls src0, second value controls src1
+and so on, except that the last value controls destination.
+The value 0 selects the low bits, while 1 selects the high bits.
+
+Note. op_sel modifier affects 16-bit operands only. For 32-bit operands the value specified
+by op_sel must be 0.
+
+GFX9 only.
+
+    ======================================== ============================================================
+    Syntax                                   Description
+    ======================================== ============================================================
+    op_sel:[{0..1},{0..1}]                   Select operand bits for instructions with 1 source operand.
+    op_sel:[{0..1},{0..1},{0..1}]            Select operand bits for instructions with 2 source operands.
+    op_sel:[{0..1},{0..1},{0..1},{0..1}]     Select operand bits for instructions with 3 source operands.
+    ======================================== ============================================================
+
+Examples:
+
+.. parsed-literal::
+
+  op_sel:[0,0]
+  op_sel:[0,1]
+
+.. _amdgpu_synid_clamp:
+
+clamp
+~~~~~
+
+Clamp meaning depends on instruction.
+
+For *v_cmp* instructions, clamp modifier indicates that the compare signals
+if a floating point exception occurs. By default, signaling is disabled.
+Not supported by GFX7.
+
+For integer operations, clamp modifier indicates that the result must be clamped
+to the largest and smallest representable value. By default, there is no clamping.
+Integer clamping is not supported by GFX7.
+
+For floating point operations, clamp modifier indicates that the result must be clamped
+to the range [0.0, 1.0]. By default, there is no clamping.
+
+Note. Clamp modifier is applied after :ref:`output modifiers<amdgpu_synid_omod>` (if any).
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    clamp                                    Enables clamping (or signaling).
+    ======================================== ================================================
+
+.. _amdgpu_synid_omod:
+
+omod
+~~~~
+
+Specifies if an output modifier must be applied to the result.
+By default, no output modifiers are applied.
+
+Note. Output modifiers are applied before :ref:`clamping<amdgpu_synid_clamp>` (if any).
+
+Output modifiers are valid for f32 and f64 floating point results only.
+They must not be used with f16.
+
+Note. *v_cvt_f16_f32* is an exception. This instruction produces f16 result
+but accepts output modifiers.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    mul:2                                    Multiply the result by 2.
+    mul:4                                    Multiply the result by 4.
+    div:2                                    Multiply the result by 0.5.
+    ======================================== ================================================
+
+.. _amdgpu_synid_vop3_operand_modifiers:
+
+VOP3 Operand Modifiers
+----------------------
+
+Operand modifiers are not used separately. They are applied to source operands.
+
+.. _amdgpu_synid_abs:
+
+abs
+~~~
+
+Computes absolute value of its operand. Applied before :ref:`neg<amdgpu_synid_neg>` (if any).
+Valid for floating point operands only.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    abs(<operand>)                           Get absolute value of operand.
+    \|<operand>|                             The same as above.
+    ======================================== ================================================
+
+Examples:
+
+.. parsed-literal::
+
+  abs(v36)
+  \|v36|
+
+.. _amdgpu_synid_neg:
+
+neg
+~~~
+
+Computes negative value of its operand. Applied after :ref:`abs<amdgpu_synid_abs>` (if any).
+Valid for floating point operands only.
+
+    ======================================== ================================================
+    Syntax                                   Description
+    ======================================== ================================================
+    neg(<operand>)                           Get negative value of operand.
+    -<operand>                               The same as above.
+    ======================================== ================================================
+
+Examples:
+
+.. parsed-literal::
+
+  neg(v[0])
+  -v4
+
+VOP3P Modifiers
+---------------
+
+This section describes modifiers of *regular* VOP3P instructions.
+
+*v_mad_mix_f32*, *v_mad_mixhi_f16* and *v_mad_mixlo_f16*
+instructions use these modifiers :ref:`in a special manner<amdgpu_synid_mad_mix>`.
+
+GFX9 only.
+
+.. _amdgpu_synid_op_sel:
+
+op_sel
+~~~~~~
+
+Selects the low [15:0] or high [31:16] operand bits as input to the operation
+which results in the lower-half of the destination.
+By default, low bits are used for all operands.
+
+The number of values specified by the *op_sel* modifier must match the number of source
+operands. First value controls src0, second value controls src1 and so on.
+
+The value 0 selects the low bits, while 1 selects the high bits.
+
+    ================================= =============================================================
+    Syntax                            Description
+    ================================= =============================================================
+    op_sel:[{0..1}]                   Select operand bits for instructions with 1 source operand.
+    op_sel:[{0..1},{0..1}]            Select operand bits for instructions with 2 source operands.
+    op_sel:[{0..1},{0..1},{0..1}]     Select operand bits for instructions with 3 source operands.
+    ================================= =============================================================
+
+Examples:
+
+.. parsed-literal::
+
+  op_sel:[0,0]
+  op_sel:[0,1,0]
+
+.. _amdgpu_synid_op_sel_hi:
+
+op_sel_hi
+~~~~~~~~~
+
+Selects the low [15:0] or high [31:16] operand bits as input to the operation
+which results in the upper-half of the destination.
+By default, high bits are used for all operands.
+
+The number of values specified by the *op_sel_hi* modifier must match the number of source
+operands. First value controls src0, second value controls src1 and so on.
+
+The value 0 selects the low bits, while 1 selects the high bits.
+
+    =================================== =============================================================
+    Syntax                              Description
+    =================================== =============================================================
+    op_sel_hi:[{0..1}]                  Select operand bits for instructions with 1 source operand.
+    op_sel_hi:[{0..1},{0..1}]           Select operand bits for instructions with 2 source operands.
+    op_sel_hi:[{0..1},{0..1},{0..1}]    Select operand bits for instructions with 3 source operands.
+    =================================== =============================================================
+
+Examples:
+
+.. parsed-literal::
+
+  op_sel_hi:[0,0]
+  op_sel_hi:[0,0,1]
+
+.. _amdgpu_synid_neg_lo:
+
+neg_lo
+~~~~~~
+
+Specifies whether to change sign of operand values selected by
+:ref:`op_sel<amdgpu_synid_op_sel>`. These values are then used
+as input to the operation which results in the upper-half of the destination.
+
+The number of values specified by this modifier must match the number of source
+operands. First value controls src0, second value controls src1 and so on.
+
+The value 0 indicates that the corresponding operand value is used unmodified,
+the value 1 indicates that negative value of the operand must be used.
+
+By default, operand values are used unmodified.
+
+This modifier is valid for floating point operands only.
+
+    ================================ ==================================================================
+    Syntax                           Description
+    ================================ ==================================================================
+    neg_lo:[{0..1}]                  Select affected operands for instructions with 1 source operand.
+    neg_lo:[{0..1},{0..1}]           Select affected operands for instructions with 2 source operands.
+    neg_lo:[{0..1},{0..1},{0..1}]    Select affected operands for instructions with 3 source operands.
+    ================================ ==================================================================
+
+Examples:
+
+.. parsed-literal::
+
+  neg_lo:[0]
+  neg_lo:[0,1]
+
+.. _amdgpu_synid_neg_hi:
+
+neg_hi
+~~~~~~
+
+Specifies whether to change sign of operand values selected by
+:ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`. These values are then used
+as input to the operation which results in the upper-half of the destination.
+
+The number of values specified by this modifier must match the number of source
+operands. First value controls src0, second value controls src1 and so on.
+
+The value 0 indicates that the corresponding operand value is used unmodified,
+the value 1 indicates that negative value of the operand must be used.
+
+By default, operand values are used unmodified.
+
+This modifier is valid for floating point operands only.
+
+    =============================== ==================================================================
+    Syntax                          Description
+    =============================== ==================================================================
+    neg_hi:[{0..1}]                 Select affected operands for instructions with 1 source operand.
+    neg_hi:[{0..1},{0..1}]          Select affected operands for instructions with 2 source operands.
+    neg_hi:[{0..1},{0..1},{0..1}]   Select affected operands for instructions with 3 source operands.
+    =============================== ==================================================================
+
+Examples:
+
+.. parsed-literal::
+
+  neg_hi:[1,0]
+  neg_hi:[0,1,1]
+
+clamp
+~~~~~
+
+See a description :ref:`here<amdgpu_synid_clamp>`.
+
+.. _amdgpu_synid_mad_mix:
+
+VOP3P V_MAD_MIX Modifiers
+-------------------------
+
+*v_mad_mix_f32*, *v_mad_mixhi_f16* and *v_mad_mixlo_f16* instructions
+use *op_sel* and *op_sel_hi* modifiers 
+in a manner different from *regular* VOP3P instructions.
+
+See a description below.
+
+GFX9 only.
+
+.. _amdgpu_synid_mad_mix_op_sel:
+
+mad_mix_op_sel
+~~~~~~~~~~~~~~
+
+This operand has meaning only for 16-bit source operands as indicated by
+:ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>`.
+It specifies to select either the low [15:0] or high [31:16] operand bits
+as input to the operation.
+
+The number of values specified by the *op_sel* modifier must match the number of source
+operands. First value controls src0, second value controls src1 and so on.
+
+The value 0 indicates the low bits, the value 1 indicates the high 16 bits.
+
+By default, low bits are used for all operands.
+
+    =============================== ================================================
+    Syntax                          Description
+    =============================== ================================================
+    op_sel:[{0..1},{0..1},{0..1}]   Select location of each 16-bit source operand.
+    =============================== ================================================
+
+Examples:
+
+.. parsed-literal::
+
+  op_sel:[0,1]
+
+.. _amdgpu_synid_mad_mix_op_sel_hi:
+
+mad_mix_op_sel_hi
+~~~~~~~~~~~~~~~~~
+
+Selects the size of source operands: either 32 bits or 16 bits.
+By default, 32 bits are used for all source operands.
+
+The number of values specified by the *op_sel_hi* modifier must match the number of source
+operands. First value controls src0, second value controls src1 and so on.
+
+The value 0 indicates 32 bits, the value 1 indicates 16 bits.
+
+The location of 16 bits in the operand may be specified by
+:ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>`.
+
+    ======================================== ====================================
+    Syntax                                   Description
+    ======================================== ====================================
+    op_sel_hi:[{0..1},{0..1},{0..1}]         Select size of each source operand.
+    ======================================== ====================================
+
+Examples:
+
+.. parsed-literal::
+
+  op_sel_hi:[1,1,1]
+
+abs
+~~~
+
+See a description :ref:`here<amdgpu_synid_abs>`.
+
+neg
+~~~
+
+See a description :ref:`here<amdgpu_synid_neg>`.
+
+clamp
+~~~~~
+
+See a description :ref:`here<amdgpu_synid_clamp>`.
diff --git a/docs/AMDGPUOperandSyntax.rst b/docs/AMDGPUOperandSyntax.rst
index 4f3536eed40d05b6475e7ba146fc8b4458b5c2ec..5f5d822526530f04cc34265caf50ae63d0589e2b 100644
--- a/docs/AMDGPUOperandSyntax.rst
+++ b/docs/AMDGPUOperandSyntax.rst
@@ -1,6 +1,6 @@
-=================================================
-Syntax of AMDGPU Assembler Operands and Modifiers
-=================================================
+=====================================
+Syntax of AMDGPU Instruction Operands
+=====================================
 
 .. contents::
    :local:
@@ -8,1048 +8,1056 @@ Syntax of AMDGPU Assembler Operands and Modifiers
 Conventions
 ===========
 
-The following conventions are used in syntax description:
+The following notation is used throughout this document:
 
-    =================== =============================================================
+    =================== =============================================================================
     Notation            Description
-    =================== =============================================================
+    =================== =============================================================================
     {0..N}              Any integer value in the range from 0 to N (inclusive).
-                        Unless stated otherwise, this value may be specified as
-                        either a literal or an llvm expression.
-    <x>                 Syntax and meaning of *<x>* is explained elsewhere.
-    =================== =============================================================
+    <x>                 Syntax and meaning of *x* is explained elsewhere.
+    =================== =============================================================================
 
 .. _amdgpu_syn_operands:
 
 Operands
 ========
 
-TBD
+.. _amdgpu_synid_v:
 
-.. _amdgpu_syn_modifiers:
+v
+-
 
-Modifiers
-=========
+Vector registers. There are 256 32-bit vector registers.
 
-DS Modifiers
-------------
-
-.. _amdgpu_synid_ds_offset8:
-
-ds_offset8
-~~~~~~~~~~
-
-Specifies an immediate unsigned 8-bit offset, in bytes. The default value is 0.
-
-Used with DS instructions which have 2 addresses.
-
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    offset:{0..0xFF}                         Specifies a 8-bit offset.
-    ======================================== ================================================
-
-.. _amdgpu_synid_ds_offset16:
-
-ds_offset16
-~~~~~~~~~~~
-
-Specifies an immediate unsigned 16-bit offset, in bytes. The default value is 0.
-
-Used with DS instructions which have 1 address.
-
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    offset:{0..0xFFFF}                       Specifies a 16-bit offset.
-    ======================================== ================================================
-
-.. _amdgpu_synid_sw_offset16:
-
-sw_offset16
-~~~~~~~~~~~
-
-This is a special modifier which may be used with *ds_swizzle_b32* instruction only.
-Specifies a sizzle pattern in numeric or symbolic form. The default value is 0.
-
-See AMD documentation for more information.
-
-    ======================================================= ===================================================
-    Syntax                                                  Description
-    ======================================================= ===================================================
-    offset:{0..0xFFFF}                                      Specifies a 16-bit swizzle pattern
-                                                            in a numeric form.
-    offset:swizzle(QUAD_PERM,{0..3},{0..3},{0..3},{0..3})   Specifies a quad permute mode pattern; each
-                                                            number is a lane id.
-    offset:swizzle(BITMASK_PERM, "<mask>")                  Specifies a bitmask permute mode pattern
-                                                            which converts a 5-bit lane id to another
-                                                            lane id with which the lane interacts.
-
-                                                            <mask> is a 5 character sequence which
-                                                            specifies how to transform the bits of the
-                                                            lane id. The following characters are allowed:
-
-                                                              * "0" - set bit to 0.
-
-                                                              * "1" - set bit to 1.
+A sequence of *vector* registers may be used to operate with more than 32 bits of data.
 
-                                                              * "p" - preserve bit.
+Assembler currently supports sequences of 1, 2, 3, 4, 8 and 16 *vector* registers.
 
-                                                              * "i" - inverse bit.
+    =================================================== ====================================================================
+    Syntax                                              Description
+    =================================================== ====================================================================
+    **v**\<N>                                           A single 32-bit *vector* register.
 
-    offset:swizzle(BROADCAST,{2..32},{0..N})                Specifies a broadcast mode.
-                                                            Broadcasts the value of any particular lane to
-                                                            all lanes in its group.
+                                                        *N* must be a decimal integer number.
+    **v[**\ <N>\ **]**                                  A single 32-bit *vector* register.
 
-                                                            The first numeric parameter is a group
-                                                            size and must be equal to 2, 4, 8, 16 or 32.
+                                                        *N* may be specified as an
+                                                        :ref:`integer number<amdgpu_synid_integer_number>`
+                                                        or an :ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+    **v[**\ <N>:<K>\ **]**                              A sequence of (\ *K-N+1*\ ) *vector* registers.
 
-                                                            The second numeric parameter is an index of the
-                                                            lane being broadcasted. The index must not exceed
-                                                            group size.
-    offset:swizzle(SWAP,{1..16})                            Specifies a swap mode.
-                                                            Swaps the neighboring groups of
-                                                            1, 2, 4, 8 or 16 lanes.
-    offset:swizzle(REVERSE,{2..32})                         Specifies a reverse mode. Reverses
-                                                            the lanes for groups of 2, 4, 8, 16 or 32 lanes.
-    ======================================================= ===================================================
+                                                        *N* and *K* may be specified as
+                                                        :ref:`integer numbers<amdgpu_synid_integer_number>`
+                                                        or :ref:`absolute expressions<amdgpu_synid_absolute_expression>`.
+    **[v**\ <N>, \ **v**\ <N+1>, ... **v**\ <K>\ **]**  A sequence of (\ *K-N+1*\ ) *vector* registers.
 
-.. _amdgpu_synid_gds:
+                                                        Register indices must be specified as decimal integer numbers.
+    =================================================== ====================================================================
 
-gds
-~~~
+Note. *N* and *K* must satisfy the following conditions:
 
-Specifies whether to use GDS or LDS memory (LDS is the default).
+* *N* <= *K*.
+* 0 <= *N* <= 255.
+* 0 <= *K* <= 255.
+* *K-N+1* must be equal to 1, 2, 3, 4, 8 or 16.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    gds                                      Use GDS memory.
-    ======================================== ================================================
+Examples:
 
+.. parsed-literal::
 
-EXP Modifiers
--------------
+  v255
+  v[0]
+  v[0:1]
+  v[1:1]
+  v[0:3]
+  v[2*2]
+  v[1-1:2-1]
+  [v252]
+  [v252,v253,v254,v255]
 
-.. _amdgpu_synid_done:
+.. _amdgpu_synid_s:
 
-done
-~~~~
+s
+-
 
-Specifies if this is the last export from the shader to the target. By default, current
-instruction does not finish an export sequence.
+Scalar 32-bit registers. The number of available *scalar* registers depends on GPU:
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    done                                     Indicates the last export operation.
-    ======================================== ================================================
+    ======= ============================
+    GPU     Number of *scalar* registers
+    ======= ============================
+    GFX7    104
+    GFX8    102
+    GFX9    102
+    ======= ============================
 
-.. _amdgpu_synid_compr:
+A sequence of *scalar* registers may be used to operate with more than 32 bits of data.
+Assembler currently supports sequences of 1, 2, 4, 8 and 16 *scalar* registers.
 
-compr
-~~~~~
+Pairs of *scalar* registers must be even-aligned (the first register must be even).
+Sequences of 4 and more *scalar* registers must be quad-aligned.
 
-Indicates if the data are compressed (not compressed by default).
+    ======================================================== ====================================================================
+    Syntax                                                   Description
+    ======================================================== ====================================================================
+    **s**\ <N>                                               A single 32-bit *scalar* register.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    compr                                    Data are compressed.
-    ======================================== ================================================
+                                                             *N* must be a decimal integer number.
+    **s[**\ <N>\ **]**                                       A single 32-bit *scalar* register.
 
-.. _amdgpu_synid_vm:
+                                                             *N* may be specified as an
+                                                             :ref:`integer number<amdgpu_synid_integer_number>`
+                                                             or an :ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+    **s[**\ <N>:<K>\ **]**                                   A sequence of (\ *K-N+1*\ ) *scalar* registers.
 
-vm
-~~
+                                                             *N* and *K* may be specified as
+                                                             :ref:`integer numbers<amdgpu_synid_integer_number>`
+                                                             or :ref:`absolute expressions<amdgpu_synid_absolute_expression>`.
+    **[s**\ <N>, \ **s**\ <N+1>, ... **s**\ <K>\ **]**       A sequence of (\ *K-N+1*\ ) *scalar* registers.
 
-Specifies valid mask flag state (off by default).
+                                                             Register indices must be specified as decimal integer numbers.
+    ======================================================== ====================================================================
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    vm                                       Set valid mask flag.
-    ======================================== ================================================
+Note. *N* and *K* must satisfy the following conditions:
 
-FLAT Modifiers
---------------
+* *N* must be properly aligned based on sequence size.
+* *N* <= *K*.
+* 0 <= *N* < *SMAX*\ , where *SMAX* is the number of available *scalar* registers.
+* 0 <= *K* < *SMAX*\ , where *SMAX* is the number of available *scalar* registers.
+* *K-N+1* must be equal to 1, 2, 4, 8 or 16.
 
-.. _amdgpu_synid_flat_offset12:
+Examples:
 
-flat_offset12
-~~~~~~~~~~~~~
+.. parsed-literal::
 
-Specifies an immediate unsigned 12-bit offset, in bytes. The default value is 0.
+  s0
+  s[0]
+  s[0:1]
+  s[1:1]
+  s[0:3]
+  s[2*2]
+  s[1-1:2-1]
+  [s4]
+  [s4,s5,s6,s7]
 
-Cannot be used with *global/scratch* opcodes. GFX9 only.
+Examples of *scalar* registers with an invalid alignment:
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    offset:{0..4095}                         Specifies a 12-bit unsigned offset.
-    ======================================== ================================================
+.. parsed-literal::
 
-.. _amdgpu_synid_flat_offset13:
+  s[1:2]
+  s[2:5]
 
-flat_offset13
-~~~~~~~~~~~~~
+.. _amdgpu_synid_trap:
 
-Specifies an immediate signed 13-bit offset, in bytes. The default value is 0.
+trap
+----
 
-Can be used with *global/scratch* opcodes only. GFX9 only.
+A set of trap handler registers:
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    offset:{-4096..+4095}                    Specifies a 13-bit signed offset.
-    ======================================== ================================================
+* :ref:`ttmp<amdgpu_synid_ttmp>`
+* :ref:`tba<amdgpu_synid_tba>`
+* :ref:`tma<amdgpu_synid_tma>`
 
-glc
-~~~
+.. _amdgpu_synid_ttmp:
 
-See a description :ref:`here<amdgpu_synid_glc>`.
+ttmp
+----
 
-slc
-~~~
+Trap handler temporary scalar registers, 32-bits wide.
+The number of available *ttmp* registers depends on GPU:
 
-See a description :ref:`here<amdgpu_synid_slc>`.
+    ======= ===========================
+    GPU     Number of *ttmp* registers
+    ======= ===========================
+    GFX7    12
+    GFX8    12
+    GFX9    16
+    ======= ===========================
 
-tfe
-~~~
+A sequence of *ttmp* registers may be used to operate with more than 32 bits of data.
+Assembler currently supports sequences of 1, 2, 4, 8 and 16 *ttmp* registers.
 
-See a description :ref:`here<amdgpu_synid_tfe>`.
+Pairs of *ttmp* registers must be even-aligned (the first register must be even).
+Sequences of 4 and more *ttmp* registers must be quad-aligned.
 
-nv
-~~
+    ============================================================= ====================================================================
+    Syntax                                                        Description
+    ============================================================= ====================================================================
+    **ttmp**\ <N>                                                 A single 32-bit *ttmp* register.
 
-See a description :ref:`here<amdgpu_synid_nv>`.
+                                                                  *N* must be a decimal integer number.
+    **ttmp[**\ <N>\ **]**                                         A single 32-bit *ttmp* register.
 
-MIMG Modifiers
---------------
+                                                                  *N* may be specified as an
+                                                                  :ref:`integer number<amdgpu_synid_integer_number>`
+                                                                  or an :ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+    **ttmp[**\ <N>:<K>\ **]**                                     A sequence of (\ *K-N+1*\ ) *ttmp* registers.
 
-.. _amdgpu_synid_dmask:
+                                                                  *N* and *K* may be specified as
+                                                                  :ref:`integer numbers<amdgpu_synid_integer_number>`
+                                                                  or :ref:`absolute expressions<amdgpu_synid_absolute_expression>`.
+    **[ttmp**\ <N>, \ **ttmp**\ <N+1>, ... **ttmp**\ <K>\ **]**   A sequence of (\ *K-N+1*\ ) *ttmp* registers.
 
-dmask
-~~~~~
+                                                                  Register indices must be specified as decimal integer numbers.
+    ============================================================= ====================================================================
 
-Specifies which channels (image components) are used by the operation. By default, no channels
-are used.
+Note. *N* and *K* must satisfy the following conditions:
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    dmask:{0..15}                            Each bit corresponds to one of 4 image
-                                             components (RGBA). If the specified bit value
-                                             is 0, the component is not used, value 1 means
-                                             that the component is used.
-    ======================================== ================================================
+* *N* must be properly aligned based on sequence size.
+* *N* <= *K*.
+* 0 <= *N* < *TMAX*, where *TMAX* is the number of available *ttmp* registers.
+* 0 <= *K* < *TMAX*, where *TMAX* is the number of available *ttmp* registers.
+* *K-N+1* must be equal to 1, 2, 4, 8 or 16.
 
-This modifier has some limitations depending on instruction kind:
+Examples:
 
-    ======================================== ================================================
-    Instruction Kind                         Valid dmask Values
-    ======================================== ================================================
-    32-bit atomic cmpswap                    0x3
-    other 32-bit atomic instructions         0x1
-    64-bit atomic cmpswap                    0xF
-    other 64-bit atomic instructions         0x3
-    GATHER4                                  0x1, 0x2, 0x4, 0x8
-    Other instructions                       any value
-    ======================================== ================================================
+.. parsed-literal::
 
-.. _amdgpu_synid_unorm:
+  ttmp0
+  ttmp[0]
+  ttmp[0:1]
+  ttmp[1:1]
+  ttmp[0:3]
+  ttmp[2*2]
+  ttmp[1-1:2-1]
+  [ttmp4]
+  [ttmp4,ttmp5,ttmp6,ttmp7]
 
-unorm
-~~~~~
+Examples of *ttmp* registers with an invalid alignment:
 
-Specifies whether address is normalized or not (normalized by default).
+.. parsed-literal::
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    unorm                                    Force address to be un-normalized.
-    ======================================== ================================================
+  ttmp[1:2]
+  ttmp[2:5]
 
-glc
-~~~
+.. _amdgpu_synid_tba:
 
-See a description :ref:`here<amdgpu_synid_glc>`.
+tba
+---
 
-slc
-~~~
+Trap base address, 64-bits wide. Holds the pointer to the current trap handler program.
 
-See a description :ref:`here<amdgpu_synid_slc>`.
+    ================== ======================================================================= =============
+    Syntax             Description                                                             Availability
+    ================== ======================================================================= =============
+    tba                64-bit *trap base address* register.                                    GFX7, GFX8
+    [tba]              64-bit *trap base address* register (an alternative syntax).            GFX7, GFX8
+    [tba_lo,tba_hi]    64-bit *trap base address* register (an alternative syntax).            GFX7, GFX8
+    ================== ======================================================================= =============
 
-.. _amdgpu_synid_r128:
+High and low 32 bits of *trap base address* may be accessed as separate registers:
 
-r128
-~~~~
+    ================== ======================================================================= =============
+    Syntax             Description                                                             Availability
+    ================== ======================================================================= =============
+    tba_lo             Low 32 bits of *trap base address* register.                            GFX7, GFX8
+    tba_hi             High 32 bits of *trap base address* register.                           GFX7, GFX8
+    [tba_lo]           Low 32 bits of *trap base address* register (an alternative syntax).    GFX7, GFX8
+    [tba_hi]           High 32 bits of *trap base address* register (an alternative syntax).   GFX7, GFX8
+    ================== ======================================================================= =============
 
-Specifies texture resource size. The default size is 256 bits.
+Note that *tba*, *tba_lo* and *tba_hi* are not accessible as assembler registers in GFX9,
+but *tba* is readable/writable with the help of *s_get_reg* and *s_set_reg* instructions.
 
-GFX7 and GFX8 only.
+.. _amdgpu_synid_tma:
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    r128                                     Specifies 128 bits texture resource size.
-    ======================================== ================================================
+tma
+---
 
-tfe
-~~~
+Trap memory address, 64-bits wide.
 
-See a description :ref:`here<amdgpu_synid_tfe>`.
+    ================= ======================================================================= ==================
+    Syntax            Description                                                             Availability
+    ================= ======================================================================= ==================
+    tma               64-bit *trap memory address* register.                                  GFX7, GFX8
+    [tma]             64-bit *trap memory address* register (an alternative syntax).          GFX7, GFX8
+    [tma_lo,tma_hi]   64-bit *trap memory address* register (an alternative syntax).          GFX7, GFX8
+    ================= ======================================================================= ==================
 
-.. _amdgpu_synid_lwe:
+High and low 32 bits of *trap memory address* may be accessed as separate registers:
 
-lwe
-~~~
+    ================= ======================================================================= ==================
+    Syntax            Description                                                             Availability
+    ================= ======================================================================= ==================
+    tma_lo            Low 32 bits of *trap memory address* register.                          GFX7, GFX8
+    tma_hi            High 32 bits of *trap memory address* register.                         GFX7, GFX8
+    [tma_lo]          Low 32 bits of *trap memory address* register (an alternative syntax).  GFX7, GFX8
+    [tma_hi]          High 32 bits of *trap memory address* register (an alternative syntax). GFX7, GFX8
+    ================= ======================================================================= ==================
 
-Specifies LOD warning status (LOD warning is disabled by default).
+Note that *tma*, *tma_lo* and *tma_hi* are not accessible as assembler registers in GFX9,
+but *tma* is readable/writable with the help of *s_get_reg* and *s_set_reg* instructions.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    lwe                                      Enables LOD warning.
-    ======================================== ================================================
+.. _amdgpu_synid_flat_scratch:
 
-.. _amdgpu_synid_da:
-
-da
-~~
-
-Specifies if an array index must be sent to TA. By default, array index is not sent.
-
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    da                                       Send an array-index to TA.
-    ======================================== ================================================
-
-.. _amdgpu_synid_d16:
+flat_scratch
+------------
 
-d16
-~~~
+Flat scratch address, 64-bits wide. Holds the base address of scratch memory.
 
-Specifies data size: 16 or 32 bits (32 bits by default). Not supported by GFX7.
+    ================================== ================================================================
+    Syntax                             Description
+    ================================== ================================================================
+    flat_scratch                       64-bit *flat scratch* address register.
+    [flat_scratch]                     64-bit *flat scratch* address register (an alternative syntax).
+    [flat_scratch_lo,flat_scratch_hi]  64-bit *flat scratch* address register (an alternative syntax).
+    ================================== ================================================================
+
+High and low 32 bits of *flat scratch* address may be accessed as separate registers:
+
+    ========================= =========================================================================
+    Syntax                    Description
+    ========================= =========================================================================
+    flat_scratch_lo           Low 32 bits of *flat scratch* address register.
+    flat_scratch_hi           High 32 bits of *flat scratch* address register.
+    [flat_scratch_lo]         Low 32 bits of *flat scratch* address register (an alternative syntax).
+    [flat_scratch_hi]         High 32 bits of *flat scratch* address register (an alternative syntax).
+    ========================= =========================================================================
+
+.. _amdgpu_synid_xnack:
+
+xnack
+-----
+
+Xnack mask, 64-bits wide. Holds a 64-bit mask of which threads
+received an *XNACK* due to a vector memory operation.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    d16                                      Enables 16-bits data mode.
+.. WARNING:: GFX7 does not support *xnack* feature. Not all GFX8 and GFX9 :ref:`processors<amdgpu-processors>` support *xnack* feature.
 
-                                             On loads, convert data in memory to 16-bit
-                                             format before storing it in VGPRs.
+\
 
-                                             For stores, convert 16-bit data in VGPRs to
-                                             32 bits before going to memory.
+    ============================== =====================================================
+    Syntax                         Description
+    ============================== =====================================================
+    xnack_mask                     64-bit *xnack mask* register.
+    [xnack_mask]                   64-bit *xnack mask* register (an alternative syntax).
+    [xnack_mask_lo,xnack_mask_hi]  64-bit *xnack mask* register (an alternative syntax).
+    ============================== =====================================================
 
-                                             Note that 16-bit data are stored in VGPRs
-                                             unpacked in GFX8.0. In GFX8.1 and GFX9 16-bit
-                                             data are packed.
-    ======================================== ================================================
+High and low 32 bits of *xnack mask* may be accessed as separate registers:
 
-.. _amdgpu_synid_a16:
+    ===================== ==============================================================
+    Syntax                Description
+    ===================== ==============================================================
+    xnack_mask_lo         Low 32 bits of *xnack mask* register.
+    xnack_mask_hi         High 32 bits of *xnack mask* register.
+    [xnack_mask_lo]       Low 32 bits of *xnack mask* register (an alternative syntax).
+    [xnack_mask_hi]       High 32 bits of *xnack mask* register (an alternative syntax).
+    ===================== ==============================================================
 
-a16
-~~~
+.. _amdgpu_synid_vcc:
 
-Specifies size of image address components: 16 or 32 bits (32 bits by default). GFX9 only.
+vcc
+---
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    a16                                      Enables 16-bits image address components.
-    ======================================== ================================================
+Vector condition code, 64-bits wide. A bit mask with one bit per thread;
+it holds the result of a vector compare operation.
 
-Miscellaneous Modifiers
------------------------
+    ================ =========================================================================
+    Syntax           Description
+    ================ =========================================================================
+    vcc              64-bit *vector condition code* register.
+    [vcc]            64-bit *vector condition code* register (an alternative syntax).
+    [vcc_lo,vcc_hi]  64-bit *vector condition code* register (an alternative syntax).
+    ================ =========================================================================
 
-.. _amdgpu_synid_glc:
+High and low 32 bits of *vector condition code* may be accessed as separate registers:
 
-glc
-~~~
+    ================ =========================================================================
+    Syntax           Description
+    ================ =========================================================================
+    vcc_lo           Low 32 bits of *vector condition code* register.
+    vcc_hi           High 32 bits of *vector condition code* register.
+    [vcc_lo]         Low 32 bits of *vector condition code* register (an alternative syntax).
+    [vcc_hi]         High 32 bits of *vector condition code* register (an alternative syntax).
+    ================ =========================================================================
 
-This modifier has different meaning for loads, stores, and atomic operations.
-The default value is off (0).
+.. _amdgpu_synid_m0:
 
-See AMD documentation for details.
+m0
+--
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    glc                                      Set glc bit to 1.
-    ======================================== ================================================
+A 32-bit memory register. It has various uses,
+including register indexing and bounds checking.
 
-.. _amdgpu_synid_slc:
+    =========== ===================================================
+    Syntax      Description
+    =========== ===================================================
+    m0          A 32-bit *memory* register.
+    [m0]        A 32-bit *memory* register (an alternative syntax).
+    =========== ===================================================
 
-slc
-~~~
+.. _amdgpu_synid_exec:
 
-Specifies cache policy. The default value is off (0).
+exec
+----
 
-See AMD documentation for details.
+Execute mask, 64-bits wide. A bit mask with one bit per thread,
+which is applied to vector instructions and controls which threads execute
+and which ignore the instruction.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    slc                                      Set slc bit to 1.
-    ======================================== ================================================
+    ===================== =================================================================
+    Syntax                Description
+    ===================== =================================================================
+    exec                  64-bit *execute mask* register.
+    [exec]                64-bit *execute mask* register (an alternative syntax).
+    [exec_lo,exec_hi]     64-bit *execute mask* register (an alternative syntax).
+    ===================== =================================================================
 
-.. _amdgpu_synid_tfe:
+High and low 32 bits of *execute mask* may be accessed as separate registers:
 
-tfe
-~~~
+    ===================== =================================================================
+    Syntax                Description
+    ===================== =================================================================
+    exec_lo               Low 32 bits of *execute mask* register.
+    exec_hi               High 32 bits of *execute mask* register.
+    [exec_lo]             Low 32 bits of *execute mask* register (an alternative syntax).
+    [exec_hi]             High 32 bits of *execute mask* register (an alternative syntax).
+    ===================== =================================================================
 
-Controls access to partially resident textures. The default value is off (0).
+.. _amdgpu_synid_vccz:
 
-See AMD documentation for details.
+vccz
+----
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    tfe                                      Set tfe bit to 1.
-    ======================================== ================================================
+A single bit-flag indicating that the :ref:`vcc<amdgpu_synid_vcc>` is all zeros.
 
-.. _amdgpu_synid_nv:
+.. WARNING:: This operand is not currently supported by AMDGPU assembler.
 
-nv
-~~
+.. _amdgpu_synid_execz:
 
-Specifies if instruction is operating on non-volatile memory. By default, memory is volatile.
+execz
+-----
 
-GFX9 only.
+A single bit flag indicating that the :ref:`exec<amdgpu_synid_exec>` is all zeros.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    nv                                       Indicates that instruction operates on
-                                             non-volatile memory.
-    ======================================== ================================================
+.. WARNING:: This operand is not currently supported by AMDGPU assembler.
 
-MUBUF/MTBUF Modifiers
----------------------
+.. _amdgpu_synid_scc:
 
-.. _amdgpu_synid_idxen:
+scc
+---
 
-idxen
-~~~~~
+A single bit flag indicating the result of a scalar compare operation.
 
-Specifies whether address components include an index. By default, no components are used.
+.. WARNING:: This operand is not currently supported by AMDGPU assembler.
 
-Can be used together with :ref:`offen<amdgpu_synid_offen>`.
+lds_direct
+----------
 
-Cannot be used with :ref:`addr64<amdgpu_synid_addr64>`.
+A special operand which supplies a 32-bit value
+fetched from *LDS* memory using :ref:`m0<amdgpu_synid_m0>` as an address.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    idxen                                    Address components include an index.
-    ======================================== ================================================
+.. WARNING:: This operand is not currently supported by AMDGPU assembler.
 
-.. _amdgpu_synid_offen:
+.. _amdgpu_synid_constant:
 
-offen
-~~~~~
+constant
+--------
 
-Specifies whether address components include an offset. By default, no components are used.
+A set of integer and floating-point *inline constants*:
 
-Can be used together with :ref:`idxen<amdgpu_synid_idxen>`.
+* :ref:`iconst<amdgpu_synid_iconst>`
+* :ref:`fconst<amdgpu_synid_fconst>`
 
-Cannot be used with :ref:`addr64<amdgpu_synid_addr64>`.
+These operands are encoded as a part of instruction.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    offen                                    Address components include an offset.
-    ======================================== ================================================
+If a number may be encoded as either
+a :ref:`literal<amdgpu_synid_literal>` or 
+an :ref:`inline constant<amdgpu_synid_constant>`,
+assembler selects the latter encoding as more efficient.
 
-.. _amdgpu_synid_addr64:
+.. _amdgpu_synid_iconst:
 
-addr64
-~~~~~~
+iconst
+------
 
-Specifies whether a 64-bit address is used. By default, no address is used.
+An :ref:`integer number<amdgpu_synid_integer_number>`
+encoded as an *inline constant*.
 
-GFX7 only. Cannot be used with :ref:`offen<amdgpu_synid_offen>` and
-:ref:`idxen<amdgpu_synid_idxen>` modifiers.
+Only a small fraction of integer numbers may be encoded as *inline constants*.
+They are enumerated in the table below.
+Other integer numbers have to be encoded as :ref:`literals<amdgpu_synid_literal>`.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    addr64                                   A 64-bit address is used.
-    ======================================== ================================================
+Integer *inline constants* are converted to
+:ref:`expected operand type<amdgpu_syn_instruction_type>`
+as described :ref:`here<amdgpu_synid_int_const_conv>`.
 
-.. _amdgpu_synid_buf_offset12:
+    ================================== ====================================
+    Value                              Note
+    ================================== ====================================
+    {0..64}                            Positive integer inline constants.
+    {-16..-1}                          Negative integer inline constants.
+    ================================== ====================================
 
-buf_offset12
-~~~~~~~~~~~~
+.. WARNING:: GFX7 does not support inline constants for *f16* operands.
 
-Specifies an immediate unsigned 12-bit offset, in bytes. The default value is 0.
+There are also symbolic inline constants which provide read-only access to H/W registers.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    offset:{0..0xFFF}                        Specifies a 12-bit unsigned offset.
-    ======================================== ================================================
+.. WARNING:: These inline constants are not currently supported by AMDGPU assembler.
 
-glc
-~~~
+\
 
-See a description :ref:`here<amdgpu_synid_glc>`.
+    ======================== ================================================ =============
+    Syntax                   Note                                             Availability
+    ======================== ================================================ =============
+    shared_base              Base address of shared memory region.            GFX9
+    shared_limit             Address of the end of shared memory region.      GFX9
+    private_base             Base address of private memory region.           GFX9
+    private_limit            Address of the end of private memory region.     GFX9
+    pops_exiting_wave_id     A dedicated counter for POPS.                    GFX9
+    ======================== ================================================ =============
 
-slc
-~~~
+.. _amdgpu_synid_fconst:
 
-See a description :ref:`here<amdgpu_synid_slc>`.
+fconst
+------
 
-.. _amdgpu_synid_lds:
+A :ref:`floating-point number<amdgpu_synid_floating-point_number>`
+encoded as an *inline constant*.
 
-lds
-~~~
+Only a small fraction of floating-point numbers may be encoded as *inline constants*.
+They are enumerated in the table below.
+Other floating-point numbers have to be encoded as :ref:`literals<amdgpu_synid_literal>`.
 
-Specifies where to store the result: VGPRs or LDS (VGPRs by default).
+Floating-point *inline constants* are converted to
+:ref:`expected operand type<amdgpu_syn_instruction_type>`
+as described :ref:`here<amdgpu_synid_fp_const_conv>`.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    lds                                      Store result in LDS.
-    ======================================== ================================================
+    ================================== ===================================================== ==================
+    Value                              Note                                                  Availability
+    ================================== ===================================================== ==================
+    0.0                                The same as integer constant 0.                       All GPUs
+    0.5                                Floating-point constant 0.5                           All GPUs
+    1.0                                Floating-point constant 1.0                           All GPUs
+    2.0                                Floating-point constant 2.0                           All GPUs
+    4.0                                Floating-point constant 4.0                           All GPUs
+    -0.5                               Floating-point constant -0.5                          All GPUs
+    -1.0                               Floating-point constant -1.0                          All GPUs
+    -2.0                               Floating-point constant -2.0                          All GPUs
+    -4.0                               Floating-point constant -4.0                          All GPUs
+    0.1592                             1.0/(2.0*pi). Use only for 16-bit operands.           GFX8, GFX9
+    0.15915494                         1.0/(2.0*pi). Use only for 16- and 32-bit operands.   GFX8, GFX9
+    0.159154943091895317852646485335   1.0/(2.0*pi).                                         GFX8, GFX9
+    ================================== ===================================================== ==================
 
-tfe
-~~~
+.. WARNING:: GFX7 does not support inline constants for *f16* operands.
 
-See a description :ref:`here<amdgpu_synid_tfe>`.
+.. _amdgpu_synid_literal:
 
-.. _amdgpu_synid_dfmt:
+literal
+-------
 
-dfmt
-~~~~
+A literal is a 64-bit value which is encoded as a separate 32-bit dword in the instruction stream.
 
-TBD
+If a number may be encoded as either
+a :ref:`literal<amdgpu_synid_literal>` or 
+an :ref:`inline constant<amdgpu_synid_constant>`,
+assembler selects the latter encoding as more efficient.
 
-.. _amdgpu_synid_nfmt:
+Literals may be specified as :ref:`integer numbers<amdgpu_synid_integer_number>`,
+:ref:`floating-point numbers<amdgpu_synid_floating-point_number>` or
+:ref:`expressions<amdgpu_synid_expression>`
+(expressions are currently supported for 32-bit operands only).
 
-nfmt
-~~~~
+A 64-bit literal value is converted by assembler
+to an :ref:`expected operand type<amdgpu_syn_instruction_type>`
+as described :ref:`here<amdgpu_synid_lit_conv>`.
 
-TBD
+An instruction may use only one literal but several operands may refer the same literal.
 
-SMRD/SMEM Modifiers
--------------------
+.. _amdgpu_synid_uimm8:
 
-glc
-~~~
+uimm8
+-----
 
-See a description :ref:`here<amdgpu_synid_glc>`.
+A 8-bit positive :ref:`integer number<amdgpu_synid_integer_number>`.
+The value is encoded as part of the opcode so it is free to use.
 
-nv
-~~
+.. _amdgpu_synid_uimm32:
 
-See a description :ref:`here<amdgpu_synid_nv>`.
+uimm32
+------
 
-VINTRP Modifiers
-----------------
+A 32-bit positive :ref:`integer number<amdgpu_synid_integer_number>`.
+The value is stored as a separate 32-bit dword in the instruction stream.
 
-.. _amdgpu_synid_high:
+.. _amdgpu_synid_uimm20:
 
-high
-~~~~
+uimm20
+------
 
-Specifies which half of the LDS word to use. Low half of LDS word is used by default.
-GFX9 only.
+A 20-bit positive :ref:`integer number<amdgpu_synid_integer_number>`.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    high                                     Use high half of LDS word.
-    ======================================== ================================================
+.. _amdgpu_synid_uimm21:
 
-VOP1/VOP2 DPP Modifiers
------------------------
+uimm21
+------
 
-GFX8 and GFX9 only.
+A 21-bit positive :ref:`integer number<amdgpu_synid_integer_number>`.
 
-.. _amdgpu_synid_dpp_ctrl:
+.. WARNING:: Assembler currently supports 20-bit offsets only. Use :ref:`uimm20<amdgpu_synid_uimm20>` as a replacement.
 
-dpp_ctrl
-~~~~~~~~
+.. _amdgpu_synid_simm21:
 
-Specifies how data are shared between threads. This is a mandatory modifier.
-There is no default value.
+simm21
+------
 
-Note. The lanes of a wavefront are organized in four banks and four rows.
+A 21-bit :ref:`integer number<amdgpu_synid_integer_number>`.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    quad_perm:[{0..3},{0..3},{0..3},{0..3}]  Full permute of 4 threads.
-    row_mirror                               Mirror threads within row.
-    row_half_mirror                          Mirror threads within 1/2 row (8 threads).
-    row_bcast:15                             Broadcast 15th thread of each row to next row.
-    row_bcast:31                             Broadcast thread 31 to rows 2 and 3.
-    wave_shl:1                               Wavefront left shift by 1 thread.
-    wave_rol:1                               Wavefront left rotate by 1 thread.
-    wave_shr:1                               Wavefront right shift by 1 thread.
-    wave_ror:1                               Wavefront right rotate by 1 thread.
-    row_shl:{1..15}                          Row shift left by 1-15 threads.
-    row_shr:{1..15}                          Row shift right by 1-15 threads.
-    row_ror:{1..15}                          Row rotate right by 1-15 threads.
-    ======================================== ================================================
+.. WARNING:: Assembler currently supports 20-bit unsigned offsets only .Use :ref:`uimm20<amdgpu_synid_uimm20>` as a replacement.
 
-.. _amdgpu_synid_row_mask:
+.. _amdgpu_synid_off:
 
-row_mask
-~~~~~~~~
+off
+---
 
-Controls which rows are enabled for data sharing. By default, all rows are enabled.
+A special entity which indicates that the value of this operand is not used.
 
-Note. The lanes of a wavefront are organized in four banks and four rows.
+    ================================== ===================================================
+    Syntax                             Description
+    ================================== ===================================================
+    off                                Indicates an unused operand.
+    ================================== ===================================================
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    row_mask:{0..15}                         Each of 4 bits in the mask controls one
-                                             row (0 - disabled, 1 - enabled).
-    ======================================== ================================================
 
-.. _amdgpu_synid_bank_mask:
+.. _amdgpu_synid_number:
 
-bank_mask
-~~~~~~~~~
+Numbers
+=======
 
-Controls which banks are enabled for data sharing. By default, all banks are enabled.
+.. _amdgpu_synid_integer_number:
 
-Note. The lanes of a wavefront are organized in four banks and four rows.
+Integer Numbers
+---------------
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    bank_mask:{0..15}                        Each of 4 bits in the mask controls one
-                                             bank (0 - disabled, 1 - enabled).
-    ======================================== ================================================
+Integer numbers are 64 bits wide.
+They may be specified in binary, octal, hexadecimal and decimal formats:
 
-.. _amdgpu_synid_bound_ctrl:
+    ============== ====================================
+    Format         Syntax
+    ============== ====================================
+    Decimal        [-]?[1-9][0-9]*
+    Binary         [-]?0b[01]+
+    Octal          [-]?0[0-7]+
+    Hexadecimal    [-]?0x[0-9a-fA-F]+
+    \              [-]?[0x]?[0-9][0-9a-fA-F]*[hH]
+    ============== ====================================
 
-bound_ctrl
-~~~~~~~~~~
+Examples:
 
-Controls data sharing when accessing an invalid lane. By default, data sharing with
-invalid lanes is disabled.
+.. parsed-literal::
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    bound_ctrl:0                             Enables data sharing with invalid lanes.
-                                             Accessing data from an invalid lane will
-                                             return zero.
-    ======================================== ================================================
+  -1234
+  0b1010
+  010
+  0xff
+  0ffh
 
-VOP1/VOP2/VOPC SDWA Modifiers
------------------------------
+.. _amdgpu_synid_floating-point_number:
 
-GFX8 and GFX9 only.
+Floating-Point Numbers
+----------------------
 
-clamp
-~~~~~
+All floating-point numbers are handled as double (64 bits wide).
 
-See a description :ref:`here<amdgpu_synid_clamp>`.
+Floating-point numbers may be specified in hexadecimal and decimal formats:
 
-omod
-~~~~
+    ============== ======================================================== ========================================================
+    Format         Syntax                                                   Note
+    ============== ======================================================== ========================================================
+    Decimal        [-]?[0-9]*[.][0-9]*([eE][+-]?[0-9]*)?                    Must include either a decimal separator or an exponent.
+    Hexadecimal    [-]0x[0-9a-fA-F]*(.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
+    ============== ======================================================== ========================================================
 
-See a description :ref:`here<amdgpu_synid_omod>`.
+Examples:
 
-GFX9 only.
+.. parsed-literal::
 
-.. _amdgpu_synid_dst_sel:
+ -1.234
+ 234e2
+ -0x1afp-10
+ 0x.1afp10
 
-dst_sel
-~~~~~~~
+.. _amdgpu_synid_expression:
 
-Selects which bits in the destination are affected. By default, all bits are affected.
+Expressions
+===========
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    dst_sel:DWORD                            Use bits 31:0.
-    dst_sel:BYTE_0                           Use bits 7:0.
-    dst_sel:BYTE_1                           Use bits 15:8.
-    dst_sel:BYTE_2                           Use bits 23:16.
-    dst_sel:BYTE_3                           Use bits 31:24.
-    dst_sel:WORD_0                           Use bits 15:0.
-    dst_sel:WORD_1                           Use bits 31:16.
-    ======================================== ================================================
+An expression specifies an address or a numeric value.
+There are two kinds of expressions:
 
+* :ref:`Absolute<amdgpu_synid_absolute_expression>`.
+* :ref:`Relocatable<amdgpu_synid_relocatable_expression>`.
 
-.. _amdgpu_synid_dst_unused:
+.. _amdgpu_synid_absolute_expression:
 
-dst_unused
-~~~~~~~~~~
+Absolute Expressions
+--------------------
 
-Controls what to do with the bits in the destination which are not selected
-by :ref:`dst_sel<amdgpu_synid_dst_sel>`.
-By default, unused bits are preserved.
+The value of an absolute expression remains the same after program relocation.
+Absolute expressions must not include unassigned and relocatable values
+such as labels.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    dst_unused:UNUSED_PAD                    Pad with zeros.
-    dst_unused:UNUSED_SEXT                   Sign-extend upper bits, zero lower bits.
-    dst_unused:UNUSED_PRESERVE               Preserve bits.
-    ======================================== ================================================
+Examples:
 
-.. _amdgpu_synid_src0_sel:
+.. parsed-literal::
 
-src0_sel
-~~~~~~~~
+    x = -1
+    y = x + 10
 
-Controls which bits in the src0 are used. By default, all bits are used.
+.. _amdgpu_synid_relocatable_expression:
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    src0_sel:DWORD                           Use bits 31:0.
-    src0_sel:BYTE_0                          Use bits 7:0.
-    src0_sel:BYTE_1                          Use bits 15:8.
-    src0_sel:BYTE_2                          Use bits 23:16.
-    src0_sel:BYTE_3                          Use bits 31:24.
-    src0_sel:WORD_0                          Use bits 15:0.
-    src0_sel:WORD_1                          Use bits 31:16.
-    ======================================== ================================================
+Relocatable Expressions
+-----------------------
 
-.. _amdgpu_synid_src1_sel:
+The value of a relocatable expression depends on program relocation.
 
-src1_sel
-~~~~~~~~
+Note that use of relocatable expressions is limited with branch targets
+and 32-bit :ref:`literals<amdgpu_synid_literal>`.
 
-Controls which bits in the src1 are used. By default, all bits are used.
+Addition information about relocation may be found :ref:`here<amdgpu-relocation-records>`.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    src1_sel:DWORD                           Use bits 31:0.
-    src1_sel:BYTE_0                          Use bits 7:0.
-    src1_sel:BYTE_1                          Use bits 15:8.
-    src1_sel:BYTE_2                          Use bits 23:16.
-    src1_sel:BYTE_3                          Use bits 31:24.
-    src1_sel:WORD_0                          Use bits 15:0.
-    src1_sel:WORD_1                          Use bits 31:16.
-    ======================================== ================================================
+Examples:
 
-VOP1/VOP2/VOPC SDWA Operand Modifiers
--------------------------------------
+.. parsed-literal::
 
-Operand modifiers are not used separately. They are applied to source operands.
+    y = x + 10 // x is not yet defined. Undefined symbols are assumed to be PC-relative.
+    z = .
 
-GFX8 and GFX9 only.
+Expression Data Type
+--------------------
 
-abs
-~~~
+Expressions and operands of expressions are interpreted as 64-bit integers.
 
-See a description :ref:`here<amdgpu_synid_abs>`.
+Expressions may include 64-bit :ref:`floating-point numbers<amdgpu_synid_floating-point_number>` (double).
+However these operands are also handled as 64-bit integers
+using binary representation of specified floating-point numbers.
+No conversion from floating-point to integer is performed.
 
-neg
-~~~
+Examples:
 
-See a description :ref:`here<amdgpu_synid_neg>`.
+.. parsed-literal::
 
-.. _amdgpu_synid_sext:
+    x = 0.1    // x is assigned an integer 4591870180066957722 which is a binary representation of 0.1.
+    y = x + x  // y is a sum of two integer values; it is not equal to 0.2!
 
-sext
-~~~~
+Syntax
+------
 
-Sign-extends value of a (sub-dword) operand to fill all 32 bits.
-Has no effect for 32-bit operands.
+Expressions are composed of
+:ref:`symbols<amdgpu_synid_symbol>`,
+:ref:`integer numbers<amdgpu_synid_integer_number>`,
+:ref:`floating-point numbers<amdgpu_synid_floating-point_number>`,
+:ref:`binary operators<amdgpu_synid_expression_bin_op>`,
+:ref:`unary operators<amdgpu_synid_expression_un_op>` and subexpressions.
 
-Valid for integer operands only.
+Expressions may also use "." which is a reference to the current PC (program counter).
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    sext(<operand>)                          Sign-extend operand value.
-    ======================================== ================================================
+The syntax of expressions is shown below::
 
-VOP3 Modifiers
---------------
+    expr ::= expr binop expr | primaryexpr ;
 
-.. _amdgpu_synid_vop3_op_sel:
+    primaryexpr ::= '(' expr ')' | symbol | number | '.' | unop primaryexpr ;
 
-vop3_op_sel
-~~~~~~~~~~~
+    binop ::= '&&'
+            | '||'
+            | '|'
+            | '^'
+            | '&'
+            | '!'
+            | '=='
+            | '!='
+            | '<>'
+            | '<'
+            | '<='
+            | '>'
+            | '>='
+            | '<<'
+            | '>>'
+            | '+'
+            | '-'
+            | '*'
+            | '/'
+            | '%' ;
 
-Selects the low [15:0] or high [31:16] operand bits for source and destination operands.
-By default, low bits are used for all operands.
+    unop ::= '~'
+           | '+'
+           | '-'
+           | '!' ;
 
-The number of values specified with the op_sel modifier must match the number of instruction
-operands (both source and destination). First value controls src0, second value controls src1
-and so on, except that the last value controls destination.
-The value 0 selects the low bits, while 1 selects the high bits.
+.. _amdgpu_synid_expression_bin_op:
 
-Note. op_sel modifier affects 16-bit operands only. For 32-bit operands the value specified
-by op_sel must be 0.
+Binary Operators
+----------------
 
-GFX9 only.
+Binary operators are described in the following table.
+They operate on and produce 64-bit integers.
+Operators with higher priority are performed first.
+
+    ========== ========= ===============================================
+    Operator   Priority  Meaning
+    ========== ========= ===============================================
+       \*         5      Integer multiplication.
+       /          5      Integer division.
+       %          5      Integer signed remainder.
+       \+         4      Integer addition.
+       \-         4      Integer subtraction.
+       <<         3      Integer shift left.
+       >>         3      Logical shift right.
+       ==         2      Equality comparison.
+       !=         2      Inequality comparison.
+       <>         2      Inequality comparison.
+       <          2      Signed less than comparison.
+       <=         2      Signed less than or equal comparison.
+       >          2      Signed greater than comparison.
+       >=         2      Signed greater than or equal comparison.
+      \|          1      Bitwise or.
+       ^          1      Bitwise xor.
+       &          1      Bitwise and.
+       &&         0      Logical and.
+       ||         0      Logical or.
+    ========== ========= ===============================================
+
+.. _amdgpu_synid_expression_un_op:
+
+Unary Operators
+---------------
 
-    ======================================== ============================================================
-    Syntax                                   Description
-    ======================================== ============================================================
-    op_sel:[{0..1},{0..1}]                   Select operand bits for instructions with 1 source operand.
-    op_sel:[{0..1},{0..1},{0..1}]            Select operand bits for instructions with 2 source operands.
-    op_sel:[{0..1},{0..1},{0..1},{0..1}]     Select operand bits for instructions with 3 source operands.
-    ======================================== ============================================================
+Unary operators are described in the following table.
+They operate on and produce 64-bit integers.
 
-.. _amdgpu_synid_clamp:
+    ========== ===============================================
+    Operator   Meaning
+    ========== ===============================================
+       !       Logical negation.
+       ~       Bitwise negation.
+       \+      Integer unary plus.
+       \-      Integer unary minus.
+    ========== ===============================================
 
-clamp
-~~~~~
+.. _amdgpu_synid_symbol:
 
-Clamp meaning depends on instruction.
+Symbols
+-------
 
-For *v_cmp* instructions, clamp modifier indicates that the compare signals
-if a floating point exception occurs. By default, signaling is disabled.
-Not supported by GFX7.
+A symbol is a named 64-bit value, representing a relocatable
+address or an absolute (non-relocatable) number.
 
-For integer operations, clamp modifier indicates that the result must be clamped
-to the largest and smallest representable value. By default, there is no clamping.
-Integer clamping is not supported by GFX7.
+Symbol names have the following syntax:
+    ``[a-zA-Z_.][a-zA-Z0-9_$.@]*``
 
-For floating point operations, clamp modifier indicates that the result must be clamped
-to the range [0.0, 1.0]. By default, there is no clamping.
+The table below provides several examples of syntax used for symbol definition.
 
-Note. Clamp modifier is applied after :ref:`output modifiers<amdgpu_synid_omod>` (if any).
+    ================ ==========================================================
+    Syntax           Meaning
+    ================ ==========================================================
+    .globl <S>       Declares a global symbol S without assigning it a value.
+    .set <S>, <E>    Assigns the value of an expression E to a symbol S.
+    <S> = <E>        Assigns the value of an expression E to a symbol S.
+    <S>:             Declares a label S and assigns it the current PC value.
+    ================ ==========================================================
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    clamp                                    Enables clamping (or signaling).
-    ======================================== ================================================
+A symbol may be used before it is declared or assigned;
+unassigned symbols are assumed to be PC-relative.
 
-.. _amdgpu_synid_omod:
+Addition information about symbols may be found :ref:`here<amdgpu-symbols>`.
 
-omod
-~~~~
+.. _amdgpu_synid_conv:
 
-Specifies if an output modifier must be applied to the result.
-By default, no output modifiers are applied.
+Conversions
+===========
 
-Note. Output modifiers are applied before :ref:`clamping<amdgpu_synid_clamp>` (if any).
+This section describes what happens when a 64-bit
+:ref:`integer number<amdgpu_synid_integer_number>`, a
+:ref:`floating-point numbers<amdgpu_synid_floating-point_number>` or a
+:ref:`symbol<amdgpu_synid_symbol>`
+is used for an operand which has a different type or size.
 
-Output modifiers are valid for f32 and f64 floating point results only.
-They must not be used with f16.
+Depending on operand kind, this conversion is performed by either assembler or AMDGPU H/W:
 
-Note. *v_cvt_f16_f32* is an exception. This instruction produces f16 result
-but accepts output modifiers.
+* Values encoded as :ref:`inline constants<amdgpu_synid_constant>` are handled by H/W.
+* Values encoded as :ref:`literals<amdgpu_synid_literal>` are converted by assembler.
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    mul:2                                    Multiply the result by 2.
-    mul:4                                    Multiply the result by 4.
-    div:2                                    Multiply the result by 0.5.
-    ======================================== ================================================
+.. _amdgpu_synid_const_conv:
 
-VOP3 Operand Modifiers
-----------------------
+Inline Constants
+----------------
 
-Operand modifiers are not used separately. They are applied to source operands.
+.. _amdgpu_synid_int_const_conv:
 
-.. _amdgpu_synid_abs:
+Integer Inline Constants
+~~~~~~~~~~~~~~~~~~~~~~~~
 
-abs
-~~~
+Integer :ref:`inline constants<amdgpu_synid_constant>`
+may be thought of as 64-bit
+:ref:`integer numbers<amdgpu_synid_integer_number>`;
+when used as operands they are truncated to the size of
+:ref:`expected operand type<amdgpu_syn_instruction_type>`.
+No data type conversions are performed.
 
-Computes absolute value of its operand. Applied before :ref:`neg<amdgpu_synid_neg>` (if any).
-Valid for floating point operands only.
+Examples:
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    abs(<operand>)                           Get absolute value of operand.
-    \|<operand>|                             The same as above.
-    ======================================== ================================================
+.. parsed-literal::
 
-.. _amdgpu_synid_neg:
+    // GFX9
 
-neg
-~~~
+    v_add_u16 v0, -1, 0    // v0 = 0xFFFF
+    v_add_f16 v0, -1, 0    // v0 = 0xFFFF (NaN)
 
-Computes negative value of its operand. Applied after :ref:`abs<amdgpu_synid_abs>` (if any).
-Valid for floating point operands only.
+    v_add_u32 v0, -1, 0    // v0 = 0xFFFFFFFF
+    v_add_f32 v0, -1, 0    // v0 = 0xFFFFFFFF (NaN)
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    neg(<operand>)                           Get negative value of operand.
-    -<operand>                               The same as above.
-    ======================================== ================================================
+.. _amdgpu_synid_fp_const_conv:
 
-VOP3P Modifiers
----------------
+Floating-Point Inline Constants
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-This section describes modifiers of regular VOP3P instructions.
-*v_mad_mix* modifiers are described :ref:`in a separate section<amdgpu_synid_mad_mix>`.
+Floating-point :ref:`inline constants<amdgpu_synid_constant>`
+may be thought of as 64-bit
+:ref:`floating-point numbers<amdgpu_synid_floating-point_number>`;
+when used as operands they are converted to a floating-point number of
+:ref:`expected operand size<amdgpu_syn_instruction_type>`.
 
-GFX9 only.
+Examples:
 
-.. _amdgpu_synid_op_sel:
+.. parsed-literal::
 
-op_sel
-~~~~~~
+    // GFX9
 
-Selects the low [15:0] or high [31:16] operand bits as input to the operation
-which results in the lower-half of the destination.
-By default, low bits are used for all operands.
+    v_add_f16 v0, 1.0, 0    // v0 = 0x3C00 (1.0)
+    v_add_u16 v0, 1.0, 0    // v0 = 0x3C00
 
-The number of values specified with the op_sel modifier must match the number of source
-operands. First value controls src0, second value controls src1 and so on.
-The value 0 selects the low bits, while 1 selects the high bits.
+    v_add_f32 v0, 1.0, 0    // v0 = 0x3F800000 (1.0)
+    v_add_u32 v0, 1.0, 0    // v0 = 0x3F800000
 
-    ======================================== =============================================================
-    Syntax                                   Description
-    ======================================== =============================================================
-    op_sel:[{0..1}]                          Select operand bits for instructions with 1 source operand.
-    op_sel:[{0..1},{0..1}]                   Select operand bits for instructions with 2 source operands.
-    op_sel:[{0..1},{0..1},{0..1}]            Select operand bits for instructions with 3 source operands.
-    ======================================== =============================================================
 
-.. _amdgpu_synid_op_sel_hi:
+.. _amdgpu_synid_lit_conv:
 
-op_sel_hi
-~~~~~~~~~
+Literals
+--------
 
-Selects the low [15:0] or high [31:16] operand bits as input to the operation
-which results in the upper-half of the destination.
-By default, high bits are used for all operands.
+.. _amdgpu_synid_int_lit_conv:
 
-The number of values specified with the op_sel_hi modifier must match the number of source
-operands. First value controls src0, second value controls src1 and so on.
-The value 0 selects the low bits, while 1 selects the high bits.
+Integer Literals
+~~~~~~~~~~~~~~~~
 
-    ======================================== =============================================================
-    Syntax                                   Description
-    ======================================== =============================================================
-    op_sel_hi:[{0..1}]                       Select operand bits for instructions with 1 source operand.
-    op_sel_hi:[{0..1},{0..1}]                Select operand bits for instructions with 2 source operands.
-    op_sel_hi:[{0..1},{0..1},{0..1}]         Select operand bits for instructions with 3 source operands.
-    ======================================== =============================================================
+Integer :ref:`literals<amdgpu_synid_literal>`
+are specified as 64-bit :ref:`integer numbers<amdgpu_synid_integer_number>`.
 
-.. _amdgpu_synid_neg_lo:
+When used as operands they are converted to
+:ref:`expected operand type<amdgpu_syn_instruction_type>` as described below.
 
-neg_lo
-~~~~~~
+    ============== ============== =============== ====================================================================
+    Expected type  Condition      Result          Note
+    ============== ============== =============== ====================================================================
+    i16, u16, b16  cond(num, 16)  num.u16         Truncate to 16 bits.
+    i32, u32, b32  cond(num, 32)  num.u32         Truncate to 32 bits.
+    i64            cond(num, 32)  {-1, num.i32}   Truncate to 32 bits and then sign-extend the result to 64 bits.
+    u64, b64       cond(num, 32)  { 0, num.u32}   Truncate to 32 bits and then zero-extend the result to 64 bits.
+    f16            cond(num, 16)  num.u16         Use low 16 bits as an f16 value.
+    f32            cond(num, 32)  num.u32         Use low 32 bits as an f32 value.
+    f64            cond(num, 32)  {num.u32, 0}    Use low 32 bits of the number as high 32 bits
+                                                  of the result; low 32 bits of the result are zeroed.
+    ============== ============== =============== ====================================================================
 
-Specifies whether to change sign of operand values selected by
-:ref:`op_sel<amdgpu_synid_op_sel>`. These values are then used
-as input to the operation which results in the upper-half of the destination.
+The condition *cond(X,S)* indicates if a 64-bit number *X*
+can be converted to a smaller size *S* by truncation of upper bits.
+There are two cases when the conversion is possible:
 
-The number of values specified with this modifier must match the number of source
-operands. First value controls src0, second value controls src1 and so on.
+* The truncated bits are all 0.
+* The truncated bits are all 1 and the value after truncation has its MSB bit set.
 
-The value 0 indicates that the corresponding operand value is used unmodified,
-the value 1 indicates that negative value of the operand must be used.
+Examples of valid literals:
 
-By default, operand values are used unmodified.
+.. parsed-literal::
 
-This modifier is valid for floating point operands only.
+    // GFX9
 
-    ======================================== ==================================================================
-    Syntax                                   Description
-    ======================================== ==================================================================
-    neg_lo:[{0..1}]                          Select affected operands for instructions with 1 source operand.
-    neg_lo:[{0..1},{0..1}]                   Select affected operands for instructions with 2 source operands.
-    neg_lo:[{0..1},{0..1},{0..1}]            Select affected operands for instructions with 3 source operands.
-    ======================================== ==================================================================
+    v_add_u16 v0, 0xff00, v0                     // value after conversion: 0xff00
+    v_add_u16 v0, 0xffffffffffffff00, v0         // value after conversion: 0xff00
+    v_add_u16 v0, -256, v0                       // value after conversion: 0xff00
 
-.. _amdgpu_synid_neg_hi:
+    s_bfe_i64 s[0:1], 0xffefffff, s3             // value after conversion: 0xffffffffffefffff
+    s_bfe_u64 s[0:1], 0xffefffff, s3             // value after conversion: 0x00000000ffefffff
+    v_ceil_f64_e32 v[0:1], 0xffefffff            // value after conversion: 0xffefffff00000000 (-1.7976922776554302e308)
 
-neg_hi
-~~~~~~
+Examples of invalid literals:
 
-Specifies whether to change sign of operand values selected by
-:ref:`op_sel_hi<amdgpu_synid_op_sel_hi>`. These values are then used
-as input to the operation which results in the upper-half of the destination.
+.. parsed-literal::
 
-The number of values specified with this modifier must match the number of source
-operands. First value controls src0, second value controls src1 and so on.
+    // GFX9
 
-The value 0 indicates that the corresponding operand value is used unmodified,
-the value 1 indicates that negative value of the operand must be used.
+    v_add_u16 v0, 0x1ff00, v0               // conversion is not possible as truncated bits are not all 0 or 1
+    v_add_u16 v0, 0xffffffffffff00ff, v0    // conversion is not possible as truncated bits do not match MSB of the result
 
-By default, operand values are used unmodified.
+.. _amdgpu_synid_fp_lit_conv:
 
-This modifier is valid for floating point operands only.
+Floating-Point Literals
+~~~~~~~~~~~~~~~~~~~~~~~
 
-    ======================================== ==================================================================
-    Syntax                                   Description
-    ======================================== ==================================================================
-    neg_hi:[{0..1}]                          Select affected operands for instructions with 1 source operand.
-    neg_hi:[{0..1},{0..1}]                   Select affected operands for instructions with 2 source operands.
-    neg_hi:[{0..1},{0..1},{0..1}]            Select affected operands for instructions with 3 source operands.
-    ======================================== ==================================================================
+Floating-point :ref:`literals<amdgpu_synid_literal>` are specified as 64-bit
+:ref:`floating-point numbers<amdgpu_synid_floating-point_number>`.
 
-clamp
-~~~~~
+When used as operands they are converted to
+:ref:`expected operand type<amdgpu_syn_instruction_type>` as described below.
 
-See a description :ref:`here<amdgpu_synid_clamp>`.
+    ============== ============== ================= =================================================================
+    Expected type  Condition      Result            Note
+    ============== ============== ================= =================================================================
+    i16, u16, b16  cond(num, 16)  f16(num)          Convert to f16 and use bits of the result as an integer value.
+    i32, u32, b32  cond(num, 32)  f32(num)          Convert to f32 and use bits of the result as an integer value.
+    i64, u64, b64  false          \-                Conversion disabled because of an unclear semantics.
+    f16            cond(num, 16)  f16(num)          Convert to f16.
+    f32            cond(num, 32)  f32(num)          Convert to f32.
+    f64            true           {num.u32.hi, 0}   Use high 32 bits of the number as high 32 bits of the result;
+                                                    zero-fill low 32 bits of the result.
 
-.. _amdgpu_synid_mad_mix:
+                                                    Note that the result may differ from the original number.
+    ============== ============== ================= =================================================================
 
-VOP3P V_MAD_MIX Modifiers
--------------------------
+The condition *cond(X,S)* indicates if an f64 number *X* can be converted
+to a smaller *S*-bit floating-point type without overflow or underflow.
+Precision lost is allowed.
 
-These instructions use VOP3P format but have different modifiers.
+Examples of valid literals:
 
-GFX9 only.
+.. parsed-literal::
 
-.. _amdgpu_synid_mad_mix_op_sel:
+    // GFX9
 
-mad_mix_op_sel
-~~~~~~~~~~~~~~
+    v_add_f16 v1, 65500.0, v2
+    v_add_f32 v1, 65600.0, v2
 
-This operand has meaning only for 16-bit source operands as indicated by
-:ref:`mad_mix_op_sel_hi<amdgpu_synid_mad_mix_op_sel_hi>`.
-It specifies to select either the low [15:0] or high [31:16] operand bits
-as input to the operation.
+                                                 // value before conversion: 0x7fefffffffffffff (1.7976931348623157e308)
+    v_ceil_f64 v[0:1], 1.7976931348623157e308    // value after conversion:  0x7fefffff00000000 (1.7976922776554302e308)
 
-The value 0 indicates the low bits, the value 1 indicates the high 16 bits.
-By default, low bits are used for all operands.
+Examples of invalid literals:
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    op_sel:[{0..1},{0..1},{0..1}]            Select location of each 16-bit source operand.
-    ======================================== ================================================
+.. parsed-literal::
 
-.. _amdgpu_synid_mad_mix_op_sel_hi:
+    // GFX9
 
-mad_mix_op_sel_hi
-~~~~~~~~~~~~~~~~~
+    v_add_f16 v1, 65600.0, v2                    // cannot be converted to f16 because of overflow
 
-Selects the size of source operands: either 32 bits or 16 bits.
-By default, 32 bits are used for all source operands.
+.. _amdgpu_synid_exp_conv:
 
-The value 0 indicates 32 bits, the value 1 indicates 16 bits.
-The location of 16 bits in the operand may be specified by
-:ref:`mad_mix_op_sel<amdgpu_synid_mad_mix_op_sel>`.
+Expressions
+~~~~~~~~~~~
 
-    ======================================== ================================================
-    Syntax                                   Description
-    ======================================== ================================================
-    op_sel_hi:[{0..1},{0..1},{0..1}]         Select size of each source operand.
-    ======================================== ================================================
+Expressions operate with and result in 64-bit integers.
 
-abs
-~~~
+When used as operands they are truncated to
+:ref:`expected operand size<amdgpu_syn_instruction_type>`.
+No data type conversions are performed.
 
-See a description :ref:`here<amdgpu_synid_abs>`.
+Examples:
 
-neg
-~~~
+.. parsed-literal::
 
-See a description :ref:`here<amdgpu_synid_neg>`.
+    // GFX9
 
-clamp
-~~~~~
+    x = 0.1
+    v_sqrt_f32 v0, x           // v0 = [low 32 bits of 0.1 (double)]
+    v_sqrt_f32 v0, (0.1 + 0)   // the same as above
+    v_sqrt_f32 v0, 0.1         // v0 = [0.1 (double) converted to float]
 
-See a description :ref:`here<amdgpu_synid_clamp>`.
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index 03685f9e352834d826da2f6196fbe0f0b7bc3f27..c60d60af5a82cf1204fb9854816f2762ff8c2ecc 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -679,17 +679,18 @@ aligned. In addition, minimal zero byte padding must be generated to ensure the
 ``desc`` field size is a multiple of 4 bytes. The ``sh_addralign`` field of the
 ``.note`` section must be at least 4 to indicate at least 8 byte alignment.
 
-The AMDGPU backend code object uses the following ELF note records in the
-``.note`` section. The *Description* column specifies the layout of the note
-record's ``desc`` field. All fields are consecutive bytes. Note records with
-variable size strings have a corresponding ``*_size`` field that specifies the
-number of bytes, including the terminating null character, in the string. The
-string(s) come immediately after the preceding fields.
+.. _amdgpu-note-records-v2:
+
+Code Object V2 Note Records (-mattr=-code-object-v3)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The AMDGPU backend code object uses the following ELF note record in the
+``.note`` section.
 
 Additional note records can be present.
 
-  .. table:: AMDGPU ELF Note Records
-     :name: amdgpu-elf-note-records-table
+  .. table:: AMDGPU Code Object V2 ELF Note Records
+     :name: amdgpu-elf-note-records-table-v2
 
      ===== ============================== ======================================
      Name  Type                           Description
@@ -699,8 +700,8 @@ Additional note records can be present.
 
 ..
 
-  .. table:: AMDGPU ELF Note Record Enumeration Values
-     :name: amdgpu-elf-note-record-enumeration-values-table
+  .. table:: AMDGPU Code Object V2 ELF Note Record Enumeration Values
+     :name: amdgpu-elf-note-record-enumeration-values-table-v2
 
      ============================== =====
      Name                           Value
@@ -714,9 +715,47 @@ Additional note records can be present.
   Specifies extensible metadata associated with the code objects executed on HSA
   [HSA]_ compatible runtimes such as AMD's ROCm [AMD-ROCm]_. It is required when
   the target triple OS is ``amdhsa`` (see :ref:`amdgpu-target-triples`). See
-  :ref:`amdgpu-amdhsa-code-object-metadata` for the syntax of the code
+  :ref:`amdgpu-amdhsa-code-object-metadata-v2` for the syntax of the code
   object metadata string.
 
+.. _amdgpu-note-records-v3:
+
+Code Object V3 Note Records (-mattr=+code-object-v3)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The AMDGPU backend code object uses the following ELF note record in the
+``.note`` section.
+
+Additional note records can be present.
+
+  .. table:: AMDGPU Code Object V3 ELF Note Records
+     :name: amdgpu-elf-note-records-table-v3
+
+     ======== ============================== ======================================
+     Name     Type                           Description
+     ======== ============================== ======================================
+     "AMDGPU" ``NT_AMDGPU_METADATA``         Metadata in Message Pack [MsgPack]_
+                                             binary format.
+     ======== ============================== ======================================
+
+..
+
+  .. table:: AMDGPU Code Object V3 ELF Note Record Enumeration Values
+     :name: amdgpu-elf-note-record-enumeration-values-table-v3
+
+     ============================== =====
+     Name                           Value
+     ============================== =====
+     *reserved*                     0-31
+     ``NT_AMDGPU_METADATA``         32
+     ============================== =====
+
+``NT_AMDGPU_METADATA``
+  Specifies extensible metadata associated with an AMDGPU code
+  object. It is encoded as a map in the Message Pack [MsgPack]_ binary
+  data format. See :ref:`amdgpu-amdhsa-code-object-metadata-v3` for the
+  map keys defined for the ``amdhsa`` OS.
+
 .. _amdgpu-symbols:
 
 Symbols
@@ -1009,13 +1048,21 @@ Code Object Metadata
 
 The code object metadata specifies extensible metadata associated with the code
 objects executed on HSA [HSA]_ compatible runtimes such as AMD's ROCm
-[AMD-ROCm]_. It is specified by the ``NT_AMD_AMDGPU_HSA_METADATA`` note record
-(see :ref:`amdgpu-note-records`) and is required when the target triple OS is
-``amdhsa`` (see :ref:`amdgpu-target-triples`). It must contain the minimum
-information necessary to support the ROCM kernel queries. For example, the
-segment sizes needed in a dispatch packet. In addition, a high level language
-runtime may require other information to be included. For example, the AMD
-OpenCL runtime records kernel argument information.
+[AMD-ROCm]_. It is specified in a note record (see :ref:`amdgpu-note-records`)
+and is required when the target triple OS is ``amdhsa`` (see
+:ref:`amdgpu-target-triples`). It must contain the minimum information
+necessary to support the ROCM kernel queries. For example, the segment sizes
+needed in a dispatch packet. In addition, a high level language runtime may
+require other information to be included. For example, the AMD OpenCL runtime
+records kernel argument information.
+
+.. _amdgpu-amdhsa-code-object-metadata-v2:
+
+Code Object V2 Metadata (-mattr=-code-object-v3)
+++++++++++++++++++++++++++++++++++++++++++++++++
+
+Code object V2 metadata is specified by the ``NT_AMD_AMDGPU_METADATA`` note
+record (see :ref:`amdgpu-note-records-v2`).
 
 The metadata is specified as a YAML formatted string (see [YAML]_ and
 :doc:`YamlIO`).
@@ -1025,7 +1072,7 @@ The metadata is specified as a YAML formatted string (see [YAML]_ and
    contain null characters, otherwise it should be.
 
 The metadata is represented as a single YAML document comprised of the mapping
-defined in table :ref:`amdgpu-amdhsa-code-object-metadata-mapping-table` and
+defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v2` and
 referenced tables.
 
 For boolean values, the string values of ``false`` and ``true`` are used for
@@ -1034,8 +1081,8 @@ false and true respectively.
 Additional information can be added to the mappings. To avoid conflicts, any
 non-AMD key names should be prefixed by "*vendor-name*.".
 
-  .. table:: AMDHSA Code Object Metadata Mapping
-     :name: amdgpu-amdhsa-code-object-metadata-mapping-table
+  .. table:: AMDHSA Code Object V2 Metadata Map
+     :name: amdgpu-amdhsa-code-object-metadata-map-table-v2
 
      ========== ============== ========= =======================================
      String Key Value Type     Required? Description
@@ -1072,14 +1119,14 @@ non-AMD key names should be prefixed by "*vendor-name*.".
                                            printf function call.
      "Kernels"  sequence of    Required  Sequence of the mappings for each
                 mapping                  kernel in the code object. See
-                                         :ref:`amdgpu-amdhsa-code-object-kernel-metadata-mapping-table`
+                                         :ref:`amdgpu-amdhsa-code-object-kernel-metadata-map-table-v2`
                                          for the definition of the mapping.
      ========== ============== ========= =======================================
 
 ..
 
-  .. table:: AMDHSA Code Object Kernel Metadata Mapping
-     :name: amdgpu-amdhsa-code-object-kernel-metadata-mapping-table
+  .. table:: AMDHSA Code Object V2 Kernel Metadata Map
+     :name: amdgpu-amdhsa-code-object-kernel-metadata-map-table-v2
 
      ================= ============== ========= ================================
      String Key        Value Type     Required? Description
@@ -1101,22 +1148,22 @@ non-AMD key names should be prefixed by "*vendor-name*.".
                                                   minor version.
      "Attrs"           mapping                  Mapping of kernel attributes.
                                                 See
-                                                :ref:`amdgpu-amdhsa-code-object-kernel-attribute-metadata-mapping-table`
+                                                :ref:`amdgpu-amdhsa-code-object-kernel-attribute-metadata-map-table-v2`
                                                 for the mapping definition.
      "Args"            sequence of              Sequence of mappings of the
                        mapping                  kernel arguments. See
-                                                :ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-mapping-table`
+                                                :ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v2`
                                                 for the definition of the mapping.
      "CodeProps"       mapping                  Mapping of properties related to
                                                 the kernel code. See
-                                                :ref:`amdgpu-amdhsa-code-object-kernel-code-properties-metadata-mapping-table`
+                                                :ref:`amdgpu-amdhsa-code-object-kernel-code-properties-metadata-map-table-v2`
                                                 for the mapping definition.
      ================= ============== ========= ================================
 
 ..
 
-  .. table:: AMDHSA Code Object Kernel Attribute Metadata Mapping
-     :name: amdgpu-amdhsa-code-object-kernel-attribute-metadata-mapping-table
+  .. table:: AMDHSA Code Object V2 Kernel Attribute Metadata Map
+     :name: amdgpu-amdhsa-code-object-kernel-attribute-metadata-map-table-v2
 
      =================== ============== ========= ==============================
      String Key          Value Type     Required? Description
@@ -1156,8 +1203,8 @@ non-AMD key names should be prefixed by "*vendor-name*.".
 
 ..
 
-  .. table:: AMDHSA Code Object Kernel Argument Metadata Mapping
-     :name: amdgpu-amdhsa-code-object-kernel-argument-metadata-mapping-table
+  .. table:: AMDHSA Code Object V2 Kernel Argument Metadata Map
+     :name: amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v2
 
      ================= ============== ========= ================================
      String Key        Value Type     Required? Description
@@ -1354,8 +1401,8 @@ non-AMD key names should be prefixed by "*vendor-name*.".
 
 ..
 
-  .. table:: AMDHSA Code Object Kernel Code Properties Metadata Mapping
-     :name: amdgpu-amdhsa-code-object-kernel-code-properties-metadata-mapping-table
+  .. table:: AMDHSA Code Object V2 Kernel Code Properties Metadata Map
+     :name: amdgpu-amdhsa-code-object-kernel-code-properties-metadata-map-table-v2
 
      ============================ ============== ========= =====================
      String Key                   Value Type     Required? Description
@@ -1433,6 +1480,414 @@ non-AMD key names should be prefixed by "*vendor-name*.".
                                                            location.
      ============================ ============== ========= =====================
 
+.. _amdgpu-amdhsa-code-object-metadata-v3:
+
+Code Object V3 Metadata (-mattr=+code-object-v3)
+++++++++++++++++++++++++++++++++++++++++++++++++
+
+Code object V3 metadata is specified by the ``NT_AMDGPU_METADATA`` note record
+(see :ref:`amdgpu-note-records-v3`).
+
+The metadata is represented as Message Pack formatted binary data (see
+[MsgPack]_). The top level is a Message Pack map that includes the
+keys defined in table
+:ref:`amdgpu-amdhsa-code-object-metadata-map-table-v3` and referenced
+tables.
+
+Additional information can be added to the maps. To avoid conflicts,
+any key names should be prefixed by "*vendor-name*." where
+``vendor-name`` can be the the name of the vendor and specific vendor
+tool that generates the information. The prefix is abbreviated to
+simply "." when it appears within a map that has been added by the
+same *vendor-name*.
+
+  .. table:: AMDHSA Code Object V3 Metadata Map
+     :name: amdgpu-amdhsa-code-object-metadata-map-table-v3
+
+     ================= ============== ========= =======================================
+     String Key        Value Type     Required? Description
+     ================= ============== ========= =======================================
+     "amdhsa.version"  sequence of    Required  - The first integer is the major
+                       2 integers                 version. Currently 1.
+                                                - The second integer is the minor
+                                                  version. Currently 0.
+     "amdhsa.printf"   sequence of              Each string is encoded information
+                       strings                  about a printf function call. The
+                                                encoded information is organized as
+                                                fields separated by colon (':'):
+
+                                                ``ID:N:S[0]:S[1]:...:S[N-1]:FormatString``
+
+                                                where:
+
+                                                ``ID``
+                                                  A 32 bit integer as a unique id for
+                                                  each printf function call
+
+                                                ``N``
+                                                  A 32 bit integer equal to the number
+                                                  of arguments of printf function call
+                                                  minus 1
+
+                                                ``S[i]`` (where i = 0, 1, ... , N-1)
+                                                  32 bit integers for the size in bytes
+                                                  of the i-th FormatString argument of
+                                                  the printf function call
+
+                                                FormatString
+                                                  The format string passed to the
+                                                  printf function call.
+     "amdhsa.kernels"  sequence of    Required  Sequence of the maps for each
+                       map                      kernel in the code object. See
+                                                :ref:`amdgpu-amdhsa-code-object-kernel-metadata-map-table-v3`
+                                                for the definition of the keys included
+                                                in that map.
+     ================= ============== ========= =======================================
+
+..
+
+  .. table:: AMDHSA Code Object V3 Kernel Metadata Map
+     :name: amdgpu-amdhsa-code-object-kernel-metadata-map-table-v3
+
+     =================================== ============== ========= ================================
+     String Key                          Value Type     Required? Description
+     =================================== ============== ========= ================================
+     ".name"                             string         Required  Source name of the kernel.
+     ".symbol"                           string         Required  Name of the kernel
+                                                                  descriptor ELF symbol.
+     ".language"                         string                   Source language of the kernel.
+                                                                  Values include:
+
+                                                                  - "OpenCL C"
+                                                                  - "OpenCL C++"
+                                                                  - "HCC"
+                                                                  - "HIP"
+                                                                  - "OpenMP"
+                                                                  - "Assembler"
+
+     ".language_version"                 sequence of              - The first integer is the major
+                                         2 integers                 version.
+                                                                  - The second integer is the
+                                                                    minor version.
+     ".args"                             sequence of              Sequence of maps of the
+                                         map                      kernel arguments. See
+                                                                  :ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v3`
+                                                                  for the definition of the keys
+                                                                  included in that map.
+     ".reqd_workgroup_size"              sequence of              If not 0, 0, 0 then all values
+                                         3 integers               must be >=1 and the dispatch
+                                                                  work-group size X, Y, Z must
+                                                                  correspond to the specified
+                                                                  values. Defaults to 0, 0, 0.
+
+                                                                  Corresponds to the OpenCL
+                                                                  ``reqd_work_group_size``
+                                                                  attribute.
+     ".workgroup_size_hint"              sequence of              The dispatch work-group size
+                                         3 integers               X, Y, Z is likely to be the
+                                                                  specified values.
+
+                                                                  Corresponds to the OpenCL
+                                                                  ``work_group_size_hint``
+                                                                  attribute.
+     ".vec_type_hint"                    string                   The name of a scalar or vector
+                                                                  type.
+
+                                                                  Corresponds to the OpenCL
+                                                                  ``vec_type_hint`` attribute.
+
+     ".device_enqueue_symbol"            string                   The external symbol name
+                                                                  associated with a kernel.
+                                                                  OpenCL runtime allocates a
+                                                                  global buffer for the symbol
+                                                                  and saves the kernel's address
+                                                                  to it, which is used for
+                                                                  device side enqueueing. Only
+                                                                  available for device side
+                                                                  enqueued kernels.
+     ".kernarg_segment_size"             integer        Required  The size in bytes of
+                                                                  the kernarg segment
+                                                                  that holds the values
+                                                                  of the arguments to
+                                                                  the kernel.
+     ".group_segment_fixed_size"         integer        Required  The amount of group
+                                                                  segment memory
+                                                                  required by a
+                                                                  work-group in
+                                                                  bytes. This does not
+                                                                  include any
+                                                                  dynamically allocated
+                                                                  group segment memory
+                                                                  that may be added
+                                                                  when the kernel is
+                                                                  dispatched.
+     ".private_segment_fixed_size"       integer        Required  The amount of fixed
+                                                                  private address space
+                                                                  memory required for a
+                                                                  work-item in
+                                                                  bytes. If the kernel
+                                                                  uses a dynamic call
+                                                                  stack then additional
+                                                                  space must be added
+                                                                  to this value for the
+                                                                  call stack.
+     ".kernarg_segment_align"            integer        Required  The maximum byte
+                                                                  alignment of
+                                                                  arguments in the
+                                                                  kernarg segment. Must
+                                                                  be a power of 2.
+     ".wavefront_size"                   integer        Required  Wavefront size. Must
+                                                                  be a power of 2.
+     ".sgpr_count"                       integer        Required  Number of scalar
+                                                                  registers required by a
+                                                                  wavefront for
+                                                                  GFX6-GFX9. A register
+                                                                  is required if it is
+                                                                  used explicitly, or
+                                                                  if a higher numbered
+                                                                  register is used
+                                                                  explicitly. This
+                                                                  includes the special
+                                                                  SGPRs for VCC, Flat
+                                                                  Scratch (GFX7-GFX9)
+                                                                  and XNACK (for
+                                                                  GFX8-GFX9). It does
+                                                                  not include the 16
+                                                                  SGPR added if a trap
+                                                                  handler is
+                                                                  enabled. It is not
+                                                                  rounded up to the
+                                                                  allocation
+                                                                  granularity.
+     ".vgpr_count"                       integer        Required  Number of vector
+                                                                  registers required by
+                                                                  each work-item for
+                                                                  GFX6-GFX9. A register
+                                                                  is required if it is
+                                                                  used explicitly, or
+                                                                  if a higher numbered
+                                                                  register is used
+                                                                  explicitly.
+     ".max_flat_workgroup_size"          integer        Required  Maximum flat
+                                                                  work-group size
+                                                                  supported by the
+                                                                  kernel in work-items.
+                                                                  Must be >=1 and
+                                                                  consistent with
+                                                                  ReqdWorkGroupSize if
+                                                                  not 0, 0, 0.
+     ".sgpr_spill_count"                 integer                  Number of stores from
+                                                                  a scalar register to
+                                                                  a register allocator
+                                                                  created spill
+                                                                  location.
+     ".vgpr_spill_count"                 integer                  Number of stores from
+                                                                  a vector register to
+                                                                  a register allocator
+                                                                  created spill
+                                                                  location.
+     =================================== ============== ========= ================================
+
+..
+
+  .. table:: AMDHSA Code Object V3 Kernel Argument Metadata Map
+     :name: amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v3
+
+     ====================== ============== ========= ================================
+     String Key             Value Type     Required? Description
+     ====================== ============== ========= ================================
+     ".name"                string                   Kernel argument name.
+     ".type_name"           string                   Kernel argument type name.
+     ".size"                integer        Required  Kernel argument size in bytes.
+     ".offset"              integer        Required  Kernel argument offset in
+                                                     bytes. The offset must be a
+                                                     multiple of the alignment
+                                                     required by the argument.
+     ".value_kind"          string         Required  Kernel argument kind that
+                                                     specifies how to set up the
+                                                     corresponding argument.
+                                                     Values include:
+
+                                                     "by_value"
+                                                       The argument is copied
+                                                       directly into the kernarg.
+
+                                                     "global_buffer"
+                                                       A global address space pointer
+                                                       to the buffer data is passed
+                                                       in the kernarg.
+
+                                                     "dynamic_shared_pointer"
+                                                       A group address space pointer
+                                                       to dynamically allocated LDS
+                                                       is passed in the kernarg.
+
+                                                     "sampler"
+                                                       A global address space
+                                                       pointer to a S# is passed in
+                                                       the kernarg.
+
+                                                     "image"
+                                                       A global address space
+                                                       pointer to a T# is passed in
+                                                       the kernarg.
+
+                                                     "pipe"
+                                                       A global address space pointer
+                                                       to an OpenCL pipe is passed in
+                                                       the kernarg.
+
+                                                     "queue"
+                                                       A global address space pointer
+                                                       to an OpenCL device enqueue
+                                                       queue is passed in the
+                                                       kernarg.
+
+                                                     "hidden_global_offset_x"
+                                                       The OpenCL grid dispatch
+                                                       global offset for the X
+                                                       dimension is passed in the
+                                                       kernarg.
+
+                                                     "hidden_global_offset_y"
+                                                       The OpenCL grid dispatch
+                                                       global offset for the Y
+                                                       dimension is passed in the
+                                                       kernarg.
+
+                                                     "hidden_global_offset_z"
+                                                       The OpenCL grid dispatch
+                                                       global offset for the Z
+                                                       dimension is passed in the
+                                                       kernarg.
+
+                                                     "hidden_none"
+                                                       An argument that is not used
+                                                       by the kernel. Space needs to
+                                                       be left for it, but it does
+                                                       not need to be set up.
+
+                                                     "hidden_printf_buffer"
+                                                       A global address space pointer
+                                                       to the runtime printf buffer
+                                                       is passed in kernarg.
+
+                                                     "hidden_default_queue"
+                                                       A global address space pointer
+                                                       to the OpenCL device enqueue
+                                                       queue that should be used by
+                                                       the kernel by default is
+                                                       passed in the kernarg.
+
+                                                     "hidden_completion_action"
+                                                       A global address space pointer
+                                                       to help link enqueued kernels into
+                                                       the ancestor tree for determining
+                                                       when the parent kernel has finished.
+
+     ".value_type"          string         Required  Kernel argument value type. Only
+                                                     present if ".value_kind" is
+                                                     "by_value". For vector data
+                                                     types, the value is for the
+                                                     element type. Values include:
+
+                                                     - "struct"
+                                                     - "i8"
+                                                     - "u8"
+                                                     - "i16"
+                                                     - "u16"
+                                                     - "f16"
+                                                     - "i32"
+                                                     - "u32"
+                                                     - "f32"
+                                                     - "i64"
+                                                     - "u64"
+                                                     - "f64"
+
+                                                     .. TODO
+                                                        How can it be determined if a
+                                                        vector type, and what size
+                                                        vector?
+     ".pointee_align"       integer                  Alignment in bytes of pointee
+                                                     type for pointer type kernel
+                                                     argument. Must be a power
+                                                     of 2. Only present if
+                                                     ".value_kind" is
+                                                     "dynamic_shared_pointer".
+     ".address_space"       string                   Kernel argument address space
+                                                     qualifier. Only present if
+                                                     ".value_kind" is "global_buffer" or
+                                                     "dynamic_shared_pointer". Values
+                                                     are:
+
+                                                     - "private"
+                                                     - "global"
+                                                     - "constant"
+                                                     - "local"
+                                                     - "generic"
+                                                     - "region"
+
+                                                     .. TODO
+                                                        Is "global_buffer" only "global"
+                                                        or "constant"? Is
+                                                        "dynamic_shared_pointer" always
+                                                        "local"? Can HCC allow "generic"?
+                                                        How can "private" or "region"
+                                                        ever happen?
+     ".access"              string                   Kernel argument access
+                                                     qualifier. Only present if
+                                                     ".value_kind" is "image" or
+                                                     "pipe". Values
+                                                     are:
+
+                                                     - "read_only"
+                                                     - "write_only"
+                                                     - "read_write"
+
+                                                     .. TODO
+                                                        Does this apply to
+                                                        "global_buffer"?
+     ".actual_access"       string                   The actual memory accesses
+                                                     performed by the kernel on the
+                                                     kernel argument. Only present if
+                                                     ".value_kind" is "global_buffer",
+                                                     "image", or "pipe". This may be
+                                                     more restrictive than indicated
+                                                     by ".access" to reflect what the
+                                                     kernel actual does. If not
+                                                     present then the runtime must
+                                                     assume what is implied by
+                                                     ".access" and ".is_const"      . Values
+                                                     are:
+
+                                                     - "read_only"
+                                                     - "write_only"
+                                                     - "read_write"
+
+     ".is_const"            boolean                  Indicates if the kernel argument
+                                                     is const qualified. Only present
+                                                     if ".value_kind" is
+                                                     "global_buffer".
+
+     ".is_restrict"         boolean                  Indicates if the kernel argument
+                                                     is restrict qualified. Only
+                                                     present if ".value_kind" is
+                                                     "global_buffer".
+
+     ".is_volatile"         boolean                  Indicates if the kernel argument
+                                                     is volatile qualified. Only
+                                                     present if ".value_kind" is
+                                                     "global_buffer".
+
+     ".is_pipe"             boolean                  Indicates if the kernel argument
+                                                     is pipe qualified. Only present
+                                                     if ".value_kind" is "pipe".
+
+                                                     .. TODO
+                                                        Can "global_buffer" be pipe
+                                                        qualified?
+     ====================== ============== ========= ================================
+
 ..
 
 Kernel Dispatch
@@ -4103,21 +4558,26 @@ Instructions
 .. toctree::
    :hidden:
 
-   AMDGPUAsmGFX7
-   AMDGPUAsmGFX8
-   AMDGPUAsmGFX9
+   AMDGPU/AMDGPUAsmGFX7
+   AMDGPU/AMDGPUAsmGFX8
+   AMDGPU/AMDGPUAsmGFX9
+   AMDGPUModifierSyntax
    AMDGPUOperandSyntax
+   AMDGPUInstructionSyntax
+   AMDGPUInstructionNotation
 
-An instruction has the following syntax:
+An instruction has the following :doc:`syntax<AMDGPUInstructionSyntax>`:
 
-    *<opcode> <operand0>, <operand1>,... <modifier0> <modifier1>...*
+    ``<``\ *opcode*\ ``>    <``\ *operand0*\ ``>, <``\ *operand1*\ ``>,...    <``\ *modifier0*\ ``> <``\ *modifier1*\ ``>...``
 
-Note that operands are normally comma-separated while modifiers are space-separated.
+:doc:`Operands<AMDGPUOperandSyntax>` are normally comma-separated while
+:doc:`modifiers<AMDGPUModifierSyntax>` are space-separated.
 
-The order of operands and modifiers is fixed. Most modifiers are optional and may be omitted.
+The order of *operands* and *modifiers* is fixed.
+Most *modifiers* are optional and may be omitted.
 
-See detailed instruction syntax description for :doc:`GFX7<AMDGPUAsmGFX7>`,
-:doc:`GFX8<AMDGPUAsmGFX8>` and :doc:`GFX9<AMDGPUAsmGFX9>`.
+See detailed instruction syntax description for :doc:`GFX7<AMDGPU/AMDGPUAsmGFX7>`,
+:doc:`GFX8<AMDGPU/AMDGPUAsmGFX8>` and :doc:`GFX9<AMDGPU/AMDGPUAsmGFX9>`.
 
 Note that features under development are not included in this description.
 
@@ -4128,22 +4588,12 @@ operands, refer to one of instruction set architecture manuals
 Operands
 ~~~~~~~~
 
-The following syntax for register operands is supported:
-
-* SGPR registers: s0, ... or s[0], ...
-* VGPR registers: v0, ... or v[0], ...
-* TTMP registers: ttmp0, ... or ttmp[0], ...
-* Special registers: exec (exec_lo, exec_hi), vcc (vcc_lo, vcc_hi), flat_scratch (flat_scratch_lo, flat_scratch_hi)
-* Special trap registers: tba (tba_lo, tba_hi), tma (tma_lo, tma_hi)
-* Register pairs, quads, etc: s[2:3], v[10:11], ttmp[5:6], s[4:7], v[12:15], ttmp[4:7], s[8:15], ...
-* Register lists: [s0, s1], [ttmp0, ttmp1, ttmp2, ttmp3]
-* Register index expressions: v[2*2], s[1-1:2-1]
-* 'off' indicates that an operand is not enabled
+Detailed description of operands may be found :doc:`here<AMDGPUOperandSyntax>`.
 
 Modifiers
 ~~~~~~~~~
 
-Detailed description of modifiers may be found :doc:`here<AMDGPUOperandSyntax>`.
+Detailed description of modifiers may be found :doc:`here<AMDGPUModifierSyntax>`.
 
 Instruction Examples
 ~~~~~~~~~~~~~~~~~~~~
@@ -4373,7 +4823,7 @@ used.  The default value for all keys is 0, with the following exceptions:
 - *wavefront_size* defaults to 6.
 - *kernarg_segment_alignment*, *group_segment_alignment*, and
   *private_segment_alignment* default to 4. Note that alignments are specified
-  as a power of two, so a value of **n** means an alignment of 2^ **n**.
+  as a power of 2, so a value of **n** means an alignment of 2^ **n**.
 
 The *.amd_kernel_code_t* directive must be placed immediately after the
 function label and before any instructions.
@@ -4586,6 +5036,17 @@ terminated by an ``.end_amdhsa_kernel`` directive.
                                                                                             :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx9-table`.
      ======================================================== ================ ============ ===================
 
+.amdgpu_metadata
+++++++++++++++++
+
+Optional directive which declares the contents of the ``NT_AMDGPU_METADATA``
+note record (see :ref:`amdgpu-elf-note-records-table-v3`).
+
+The contents must be in the [YAML]_ markup format, with the same structure and
+semantics described in :ref:`amdgpu-amdhsa-code-object-metadata-v3`.
+
+This directive is terminated by an ``.end_amdgpu_metadata`` directive.
+
 Example HSA Source Code (-mattr=+code-object-v3)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -4618,6 +5079,24 @@ Here is an example of a minimal assembly source file, defining one HSA kernel:
     .amdhsa_next_free_sgpr .amdgcn.next_free_sgpr
   .end_amdhsa_kernel
 
+  .amdgpu_metadata
+  ---
+  amdhsa.version:
+    - 1
+    - 0
+  amdhsa.kernels:
+    - .name: hello_world
+      .symbol: hello_world.kd
+      .kernarg_segment_size: 48
+      .group_segment_fixed_size: 0
+      .private_segment_fixed_size: 0
+      .kernarg_segment_align: 4
+      .wavefront_size: 64
+      .sgpr_count: 2
+      .vgpr_count: 3
+      .max_flat_workgroup_size: 256
+  ...
+  .end_amdgpu_metadata
 
 Additional Documentation
 ========================
@@ -4636,6 +5115,7 @@ Additional Documentation
 .. [ELF] `Executable and Linkable Format (ELF) <http://www.sco.com/developers/gabi/>`__
 .. [DWARF] `DWARF Debugging Information Format <http://dwarfstd.org/>`__
 .. [YAML] `YAML Ain't Markup Language (YAML™) Version 1.2 <http://www.yaml.org/spec/1.2/spec.html>`__
+.. [MsgPack] `Message Pack <http://www.msgpack.org/>`__
 .. [OpenCL] `The OpenCL Specification Version 2.0 <http://www.khronos.org/registry/cl/specs/opencl-2.0.pdf>`__
 .. [HRF] `Heterogeneous-race-free Memory Models <http://benedictgaster.org/wp-content/uploads/2014/01/asplos269-FINAL.pdf>`__
 .. [CLANG-ATTR] `Attributes in Clang <http://clang.llvm.org/docs/AttributeReference.html>`__
diff --git a/docs/AdvancedBuilds.rst b/docs/AdvancedBuilds.rst
index d2a2ef58b23e4c85a6bea3a2f16baa9a49e36afa..695dcfb62a1fd21fa8dc97e415f36c05ab155640 100644
--- a/docs/AdvancedBuilds.rst
+++ b/docs/AdvancedBuilds.rst
@@ -51,6 +51,15 @@ CMake option, each variable separated by a ";". As example:
   $ cmake -G Ninja -DCLANG_ENABLE_BOOTSTRAP=On -DCLANG_BOOTSTRAP_PASSTHROUGH="CMAKE_INSTALL_PREFIX;CMAKE_VERBOSE_MAKEFILE" <path to source>
   $ ninja stage2
 
+CMake options starting by ``BOOTSTRAP_`` will be passed only to the stage2 build.
+This gives the opportunity to use Clang specific build flags.
+For example, the following CMake call will enabled '-fno-addrsig' only during
+the stage2 build for C and C++.
+
+.. code-block:: console
+
+  $ cmake [..]  -DBOOTSTRAP_CMAKE_CXX_FLAGS='-fno-addrsig' -DBOOTSTRAP_CMAKE_C_FLAGS='-fno-addrsig' [..]
+
 The clang build system refers to builds as stages. A stage1 build is a standard
 build using the compiler installed on the host, and a stage2 build is built
 using the stage1 compiler. This nomenclature holds up to more stages too. In
diff --git a/docs/Atomics.rst b/docs/Atomics.rst
index 4961348d0c97d7915f13c3afa51511a97e1283c3..450b5b36f63abeaf2aaa3fb8864d5aa5c9a7636d 100644
--- a/docs/Atomics.rst
+++ b/docs/Atomics.rst
@@ -461,8 +461,24 @@ atomic constructs. Here are some lowerings it can do:
 * atomic rmw -> loop with cmpxchg or load-linked/store-conditional
   by overriding ``expandAtomicRMWInIR()``
 * expansion to __atomic_* libcalls for unsupported sizes.
-
-For an example of all of these, look at the ARM backend.
+* part-word atomicrmw/cmpxchg -> target-specific intrinsic by overriding
+  ``shouldExpandAtomicRMWInIR``, ``emitMaskedAtomicRMWIntrinsic``,
+  ``shouldExpandAtomicCmpXchgInIR``, and ``emitMaskedAtomicCmpXchgIntrinsic``.
+
+For an example of these look at the ARM (first five lowerings) or RISC-V (last
+lowering) backend.
+
+AtomicExpandPass supports two strategies for lowering atomicrmw/cmpxchg to
+load-linked/store-conditional (LL/SC) loops. The first expands the LL/SC loop
+in IR, calling target lowering hooks to emit intrinsics for the LL and SC
+operations. However, many architectures have strict requirements for LL/SC
+loops to ensure forward progress, such as restrictions on the number and type
+of instructions in the loop. It isn't possible to enforce these restrictions
+when the loop is expanded in LLVM IR, and so affected targets may prefer to
+expand to LL/SC loops at a very late stage (i.e. after register allocation).
+AtomicExpandPass can help support lowering of part-word atomicrmw or cmpxchg
+using this strategy by producing IR for any shifting and masking that can be
+performed outside of the LL/SC loop.
 
 Libcalls: __atomic_*
 ====================
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index 92cad91a08e44b7824fe2da9a419bca4a721fb3b..88d8ff804d981f333e57c64dff88dab3ca69c813 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -305,6 +305,21 @@ useful to use C style (``/* */``) comments however:
 #. When writing a source file that is used by a tool that only accepts C style
    comments.
 
+#. When documenting the significance of constants used as actual parameters in
+   a call. This is most helpful for ``bool`` parameters, or passing ``0`` or
+   ``nullptr``. Typically you add the formal parameter name, which ought to be
+   meaningful. For example, it's not clear what the parameter means in this call:
+
+   .. code-block:: c++
+
+     Object.emitName(nullptr);
+
+   An in-line C-style comment makes the intent obvious:
+
+   .. code-block:: c++
+
+     Object.emitName(/*Prefix=*/nullptr);
+
 Commenting out large blocks of code is discouraged, but if you really have to do
 this (for documentation purposes or as a suggestion for debug printing), use
 ``#if 0`` and ``#endif``. These nest properly and are better behaved in general
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 6581b33ba1c716ccc6e0f53c39b5b2fb3745907e..721d2c2e782e8c2949691b6074e4dc97d575c2c7 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -80,9 +80,16 @@ and from the command line.
   -verify``. With this option FileCheck will verify that input does not contain
   warnings not covered by any ``CHECK:`` patterns.
 
+.. option:: --dump-input <mode>
+
+  Dump input to stderr, adding annotations representing currently enabled
+  diagnostics.  Do this either 'always', on 'fail', or 'never'.  Specify 'help'
+  to explain the dump format and quit.
+
 .. option:: --dump-input-on-failure
 
-  When the check fails, dump all of the original input.
+  When the check fails, dump all of the original input.  This option is
+  deprecated in favor of `--dump-input=fail`.
 
 .. option:: --enable-var-scope
 
@@ -311,6 +318,29 @@ can be used:
    ; CHECK: ret i8
    }
 
+The "CHECK-COUNT:" directive
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to match multiple lines with the same pattern over and over again
+you can repeat a plain ``CHECK:`` as many times as needed. If that looks too
+boring you can instead use a counted check "``CHECK-COUNT-<num>:``", where
+``<num>`` is a positive decimal number. It will match the pattern exactly
+``<num>`` times, no more and no less. If you specified a custom check prefix,
+just use "``<PREFIX>-COUNT-<num>:``" for the same effect.
+Here is a simple example:
+
+.. code-block:: text
+
+   Loop at depth 1
+   Loop at depth 1
+   Loop at depth 1
+   Loop at depth 1
+     Loop at depth 2
+       Loop at depth 3
+
+   ; CHECK-COUNT-6: Loop at depth {{[0-9]+}}
+   ; CHECK-NOT:     Loop at depth {{[0-9]+}}
+
 The "CHECK-DAG:" directive
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/CommandGuide/llvm-cov.rst b/docs/CommandGuide/llvm-cov.rst
index 6f1b6e46c48ae869a0ef943b32dff41fdc2b8acd..71924e997d934a6f0725852a7cb3328f903da6ff 100644
--- a/docs/CommandGuide/llvm-cov.rst
+++ b/docs/CommandGuide/llvm-cov.rst
@@ -374,9 +374,15 @@ SYNOPSIS
 DESCRIPTION
 ^^^^^^^^^^^
 
-The :program:`llvm-cov export` command exports regions, functions, expansions,
-and summaries of the coverage of the binaries *BIN*,... using the profile data
-*PROFILE* as JSON. It can optionally be filtered to only export the coverage
+The :program:`llvm-cov export` command exports coverage data of the binaries
+*BIN*,... using the profile data *PROFILE* in either JSON or lcov trace file
+format.
+
+When exporting JSON, the regions, functions, expansions, and summaries of the
+coverage data will be exported. When exporting an lcov trace file, the
+line-based coverage and summaries will be exported.
+
+The exported data can optionally be filtered to only export the coverage
 for the files listed in *SOURCES*.
 
 For information on compiling programs for coverage and generating profile data,
@@ -392,12 +398,18 @@ OPTIONS
  universal binary or to use an architecture that does not match a
  non-universal binary.
 
+.. option:: -format=<FORMAT>
+
+ Use the specified output format. The supported formats are: "text" (JSON),
+ "lcov".
+
 .. option:: -summary-only
 
  Export only summary information for each file in the coverage data. This mode
  will not export coverage information for smaller units such as individual
- functions or regions. The result will be the same as produced by :program:
- `llvm-cov report` command, but presented in JSON format rather than text.
+ functions or regions. The result will contain the same information as produced
+ by the :program:`llvm-cov report` command, but presented in JSON or lcov
+ format rather than text.
 
 .. option:: -ignore-filename-regex=<PATTERN>
 
diff --git a/docs/CommandGuide/llvm-mca.rst b/docs/CommandGuide/llvm-mca.rst
index 100136e4d1734c9cd72cb49e16c58f9ccff22ef7..bc50794e0cbc613755610ac0918080a7cfbbee93 100644
--- a/docs/CommandGuide/llvm-mca.rst
+++ b/docs/CommandGuide/llvm-mca.rst
@@ -516,6 +516,10 @@ sections.
    1,           102  (16.7%)
    2,           399  (65.4%)
 
+  Total ROB Entries:                64
+  Max Used ROB Entries:             35  ( 54.7% )
+  Average Used ROB Entries per cy:  32  ( 50.0% )
+
 
   Register File statistics:
   Total number of mappings created:    900
diff --git a/docs/CompileCudaWithLLVM.rst b/docs/CompileCudaWithLLVM.rst
index 6ad8652cfc1d527b0ca3b2c8cacdbcbc4aeb712f..95d6b0d9b82a19fef66520d88d92ed5dda32ade9 100644
--- a/docs/CompileCudaWithLLVM.rst
+++ b/docs/CompileCudaWithLLVM.rst
@@ -22,21 +22,21 @@ Compiling CUDA Code
 Prerequisites
 -------------
 
-CUDA is supported in llvm 3.9, but it's still in active development, so we
-recommend you `compile clang/LLVM from HEAD
-<http://llvm.org/docs/GettingStarted.html>`_.
-
-Before you build CUDA code, you'll need to have installed the appropriate
-driver for your nvidia GPU and the CUDA SDK.  See `NVIDIA's CUDA installation
-guide <https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html>`_
-for details.  Note that clang `does not support
-<https://llvm.org/bugs/show_bug.cgi?id=26966>`_ the CUDA toolkit as installed
-by many Linux package managers; you probably need to install nvidia's package.
-
-You will need CUDA 7.0, 7.5, or 8.0 to compile with clang.
-
-CUDA compilation is supported on Linux, on MacOS as of 2016-11-18, and on
-Windows as of 2017-01-05.
+CUDA is supported since llvm 3.9. Current release of clang (7.0.0) supports CUDA
+7.0 through 9.2. If you need support for CUDA 10, you will need to use clang
+built from r342924 or newer.
+
+Before you build CUDA code, you'll need to have installed the appropriate driver
+for your nvidia GPU and the CUDA SDK.  See `NVIDIA's CUDA installation guide
+<https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html>`_ for
+details.  Note that clang `does not support
+<https://llvm.org/bugs/show_bug.cgi?id=26966>`_ the CUDA toolkit as installed by
+many Linux package managers; you probably need to install CUDA in a single
+directory from NVIDIA's package.
+
+CUDA compilation is supported on Linux. Compilation on MacOS and Windows may or
+may not work and currently have no maintainers. Compilation with CUDA-9.x is
+`currently broken on Windows <https://bugs.llvm.org/show_bug.cgi?id=38811>`_.
 
 Invoking clang
 --------------
@@ -73,7 +73,9 @@ run your program.
   Pass e.g. ``-L/usr/local/cuda/lib64`` if compiling in 64-bit mode; otherwise,
   pass e.g. ``-L/usr/local/cuda/lib``.  (In CUDA, the device code and host code
   always have the same pointer widths, so if you're compiling 64-bit code for
-  the host, you're also compiling 64-bit code for the device.)
+  the host, you're also compiling 64-bit code for the device.) Note that as of
+  v10.0 CUDA SDK `no longer supports compilation of 32-bit
+  applications <https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#deprecated-features>`_.
 
 * ``<GPU arch>`` -- the `compute capability
   <https://developer.nvidia.com/cuda-gpus>`_ of your GPU. For example, if you
@@ -89,8 +91,7 @@ run your program.
 
 The `-L` and `-l` flags only need to be passed when linking.  When compiling,
 you may also need to pass ``--cuda-path=/path/to/cuda`` if you didn't install
-the CUDA SDK into ``/usr/local/cuda``, ``/usr/local/cuda-7.0``, or
-``/usr/local/cuda-7.5``.
+the CUDA SDK into ``/usr/local/cuda`` or ``/usr/local/cuda-X.Y``.
 
 Flags that control numerical code
 ---------------------------------
@@ -548,9 +549,9 @@ The relevant tools are now just vanilla clang/LLVM.
 | Jingyue Wu, Artem Belevich, Eli Bendersky, Mark Heffernan, Chris Leary, Jacques Pienaar, Bjarke Roune, Rob Springer, Xuetian Weng, Robert Hundt
 | *Proceedings of the 2016 International Symposium on Code Generation and Optimization (CGO 2016)*
 |
-| `Slides from the CGO talk <http://wujingyue.com/docs/gpucc-talk.pdf>`_
+| `Slides from the CGO talk <http://wujingyue.github.io/docs/gpucc-talk.pdf>`_
 |
-| `Tutorial given at CGO <http://wujingyue.com/docs/gpucc-tutorial.pdf>`_
+| `Tutorial given at CGO <http://wujingyue.github.io/docs/gpucc-tutorial.pdf>`_
 
 Obtaining Help
 ==============
diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst
index e4f5802f887949058b6c3cd476ad42c0a2603122..5e671bc10b6988f4a7f3a398184fa7dea8af5049 100644
--- a/docs/GarbageCollection.rst
+++ b/docs/GarbageCollection.rst
@@ -835,45 +835,23 @@ for collector plugins which implement reference counting or a shadow stack.
 
 .. _init-roots:
 
-Initializing roots to null: ``InitRoots``
------------------------------------------
-
-.. code-block:: c++
-
-  MyGC::MyGC() {
-    InitRoots = true;
-  }
-
-When set, LLVM will automatically initialize each root to ``null`` upon entry to
-the function.  This prevents the GC's sweep phase from visiting uninitialized
-pointers, which will almost certainly cause it to crash.  This initialization
-occurs before custom lowering, so the two may be used together.
-
-Since LLVM does not yet compute liveness information, there is no means of
-distinguishing an uninitialized stack root from an initialized one.  Therefore,
-this feature should be used by all GC plugins.  It is enabled by default.
+Initializing roots to null
+---------------------------
 
-Custom lowering of intrinsics: ``CustomRoots``, ``CustomReadBarriers``, and ``CustomWriteBarriers``
----------------------------------------------------------------------------------------------------
+It is recommended that frontends initialize roots explicitly to avoid
+potentially confusing the optimizer.  This prevents the GC from visiting
+uninitialized pointers, which will almost certainly cause it to crash.
 
-For GCs which use barriers or unusual treatment of stack roots, these 
-flags allow the collector to perform arbitrary transformations of the
-LLVM IR:
+As a fallback, LLVM will automatically initialize each root to ``null``
+upon entry to the function.  Support for this mode in code generation is
+largely a legacy detail to keep old collector implementations working.
 
-.. code-block:: c++
-
-  class MyGC : public GCStrategy {
-  public:
-    MyGC() {
-      CustomRoots = true;
-      CustomReadBarriers = true;
-      CustomWriteBarriers = true;
-    }
-  };
+Custom lowering of intrinsics
+------------------------------
 
-If any of these flags are set, LLVM suppresses its default lowering for
-the corresponding intrinsics.  Instead, you must provide a custom Pass
-which lowers the intrinsics as desired.  If you have opted in to custom
+For GCs which use barriers or unusual treatment of stack roots, the
+implementor is responsibly for providing a custom pass to lower the
+intrinsics with the desired semantics.  If you have opted in to custom
 lowering of a particular intrinsic your pass **must** eliminate all 
 instances of the corresponding intrinsic in functions which opt in to
 your GC.  The best example of such a pass is the ShadowStackGC and it's 
@@ -884,62 +862,14 @@ without building a custom copy of LLVM.
 
 .. _safe-points:
 
-Generating safe points: ``NeededSafePoints``
---------------------------------------------
-
-LLVM can compute four kinds of safe points:
-
-.. code-block:: c++
-
-  namespace GC {
-    /// PointKind - The type of a collector-safe point.
-    ///
-    enum PointKind {
-      Loop,    //< Instr is a loop (backwards branch).
-      Return,  //< Instr is a return instruction.
-      PreCall, //< Instr is a call instruction.
-      PostCall //< Instr is the return address of a call.
-    };
-  }
-
-A collector can request any combination of the four by setting the
-``NeededSafePoints`` mask:
-
-.. code-block:: c++
-
-  MyGC::MyGC()  {
-    NeededSafePoints = 1 << GC::Loop
-                     | 1 << GC::Return
-                     | 1 << GC::PreCall
-                     | 1 << GC::PostCall;
-  }
-
-It can then use the following routines to access safe points.
-
-.. code-block:: c++
-
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    GCFunctionInfo *MD = *I;
-    size_t PointCount = MD->size();
-
-    for (GCFunctionInfo::iterator PI = MD->begin(),
-                                  PE = MD->end(); PI != PE; ++PI) {
-      GC::PointKind PointKind = PI->Kind;
-      unsigned PointNum = PI->Num;
-    }
-  }
-
-Almost every collector requires ``PostCall`` safe points, since these correspond
-to the moments when the function is suspended during a call to a subroutine.
-
-Threaded programs generally require ``Loop`` safe points to guarantee that the
-application will reach a safe point within a bounded amount of time, even if it
-is executing a long-running loop which contains no function calls.
+Generating safe points
+-----------------------
 
-Threaded collectors may also require ``Return`` and ``PreCall`` safe points to
-implement "stop the world" techniques using self-modifying code, where it is
-important that the program not exit the function without reaching a safe point
-(because only the topmost function has been patched).
+LLVM provides support for associating stackmaps with the return address of
+a call.  Any loop or return safepoints required by a given collector design
+can be modeled via calls to runtime routines, or potentially patchable call
+sequences.  Using gcroot, all call instructions are inferred to be possible
+safepoints and will thus have an associated stackmap.
 
 .. _assembly:
 
diff --git a/docs/HowToBuildWithPGO.rst b/docs/HowToBuildWithPGO.rst
index ba93bc64a294ab9b2cdd1763160620336bf6070a..77867dbc1d5dad0fb06790577d15eb6e3167f10c 100644
--- a/docs/HowToBuildWithPGO.rst
+++ b/docs/HowToBuildWithPGO.rst
@@ -125,12 +125,12 @@ In more detailed steps:
       It's recommended to build the ``all`` target with your instrumented Clang,
       since more coverage is often better.
 
-  b. You should now have a few ``*.profdata`` files in
+  b. You should now have a few ``*.profraw`` files in
      ``path/to/stage2/profiles/``. You need to merge these using
      ``llvm-profdata`` (even if you only have one! The profile merge transforms
      profraw into actual profile data, as well). This can be done with
-     ``/path/to/stage1/llvm-profdata -merge
-     -output=/path/to/output/profdata.prof path/to/stage2/profiles/*.profdata``.
+     ``/path/to/stage1/llvm-profdata merge
+     -output=/path/to/output/profdata.prof path/to/stage2/profiles/*.profraw``.
 
 4. Now, build your final, PGO-optimized Clang. To do this, you'll want to pass
    the following additional arguments to CMake.
diff --git a/docs/HowToCrossCompileBuiltinsOnArm.rst b/docs/HowToCrossCompileBuiltinsOnArm.rst
index 4b4d563a5a968dfd4317d785dc1cdb3c4355b143..6ad93c773b25758c428dcd25abc15d6892e66c5b 100644
--- a/docs/HowToCrossCompileBuiltinsOnArm.rst
+++ b/docs/HowToCrossCompileBuiltinsOnArm.rst
@@ -19,33 +19,46 @@ may need to adapt the instructions here to fit your own local situation.
 Prerequisites
 =============
 
-In this use case we'll be using CMake on a Debian-based Linux system,
+In this use case we'll be using cmake on a Debian-based Linux system,
 cross-compiling from an x86_64 host to a hard-float Armv7-A target. We'll be
 using as many of the LLVM tools as we can, but it is possible to use GNU
 equivalents.
 
  * ``A build of LLVM/clang for the llvm-tools and llvm-config``
+ * ``A clang executable with support for the ARM target``
+ * ``compiler-rt sources``
  * ``The qemu-arm user mode emulator``
  * ``An arm-linux-gnueabihf sysroot``
 
+In this example we will be using ninja.
+
 See https://compiler-rt.llvm.org/ for more information about the dependencies
 on clang and LLVM.
 
+See https://llvm.org/docs/GettingStarted.html for information about obtaining
+the source for LLVM and compiler-rt. Note that the getting started guide
+places compiler-rt in the projects subdirectory, but this is not essential and
+if you are using the BaremetalARM.cmake cache for v6-M, v7-M and v7-EM then
+compiler-rt must be placed in the runtimes directory.
+
 ``qemu-arm`` should be available as a package for your Linux distribution.
 
 The most complicated of the prequisites to satisfy is the arm-linux-gnueabihf
-sysroot. The :doc:`HowToCrossCompileLLVM` has information about how to use the
-Linux distributions multiarch support to fulfill the dependencies for building
-LLVM. Alternatively, as building and testing just the compiler-rt builtins
-requires fewer dependencies than LLVM, it is possible to use the Linaro
-arm-linux-gnueabihf gcc installation as our sysroot.
+sysroot. In theory it is possible to use the Linux distributions multiarch
+support to fulfill the dependencies for building but unfortunately due to
+/usr/local/include being added some host includes are selected. The easiest way
+to supply a sysroot is to download the arm-linux-gnueabihf toolchain. This can
+be found at:
+* https://developer.arm.com/open-source/gnu-toolchain/gnu-a/downloads for gcc 8 and above
+* https://releases.linaro.org/components/toolchain/binaries/ for gcc 4.9 to 7.3
 
 Building compiler-rt builtins for Arm
 =====================================
 We will be doing a standalone build of compiler-rt using the following cmake
 options.
 
-* ``path/to/llvm/projects/compiler-rt``
+* ``path/to/compiler-rt``
+* ``-G Ninja``
 * ``-DCOMPILER_RT_BUILD_BUILTINS=ON``
 * ``-DCOMPILER_RT_BUILD_SANITIZERS=OFF``
 * ``-DCOMPILER_RT_BUILD_XRAY=OFF``
@@ -57,22 +70,38 @@ options.
 * ``-DCMAKE_RANLIB=/path/to/llvm-ranlib``
 * ``-DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld"``
 * ``-DCMAKE_C_COMPILER_TARGET="arm-linux-gnueabihf"``
+* ``-DCMAKE_ASM_COMPILER_TARGET="arm-linux-gnueabihf"``
 * ``-DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON``
 * ``-DLLVM_CONFIG_PATH=/path/to/llvm-config``
 * ``-DCMAKE_C_FLAGS="build-c-flags"``
+* ``-DCMAKE_ASM_FLAGS="build-c-flags"``
 
-The build-c-flags need to be sufficient to pass the C-make compiler check and
-to compile compiler-rt. When using a GCC 7 Linaro arm-linux-gnueabihf
-installation the following flags are needed:
+The ``build-c-flags`` need to be sufficient to pass the C-make compiler check,
+compile compiler-rt, and if you are running the tests, compile and link the
+tests. When cross-compiling with clang we will need to pass sufficient
+information to generate code for the Arm architecture we are targeting. We will
+need to select the Arm target, select the Armv7-A architecture and choose
+between using Arm or Thumb.
+instructions. For example:
 
 * ``--target=arm-linux-gnueabihf``
-* ``--march=armv7a``
+* ``-march=armv7a``
+* ``-mthumb``
+
+When using a GCC arm-linux-gnueabihf toolchain the following flags are
+needed to pick up the includes and libraries:
+
 * ``--gcc-toolchain=/path/to/dir/toolchain``
 * ``--sysroot=/path/to/toolchain/arm-linux-gnueabihf/libc``
 
-Depending on how your sysroot is laid out, you may not need ``--gcc-toolchain``.
-For example if you have added armhf as an architecture using your Linux
-distributions multiarch support then you should be able to use ``--sysroot=/``.
+In this example we will be adding all of the command line options to both
+``CMAKE_C_FLAGS`` and ``CMAKE_ASM_FLAGS``. There are cmake flags to pass some of
+these options individually which can be used to simplify the ``build-c-flags``:
+
+* ``-DCMAKE_C_COMPILER_TARGET="arm-linux-gnueabihf"``
+* ``-DCMAKE_ASM_COMPILER_TARGET="arm-linux-gnueabihf"``
+* ``-DCMAKE_C_COMPILER_EXTERNAL_TOOLCHAIN=/path/to/dir/toolchain``
+* ``-DCMAKE_SYSROOT=/path/to/dir/toolchain/arm-linux-gnueabihf/libc``
 
 Once cmake has completed the builtins can be built with ``ninja builtins``
 
@@ -90,12 +119,72 @@ cmake that we wish to run the tests on ``qemu-arm``.
 The ``/path/to/armhf/sysroot`` should be the same as the one passed to
 ``--sysroot`` in the "build-c-flags".
 
-The "test-c-flags" can be the same as the "build-c-flags", with the addition
-of ``"-fuse-ld=lld`` if you wish to use lld to link the tests.
+The "test-c-flags" need to include the target, architecture, gcc-toolchain,
+sysroot and arm/thumb state. The additional cmake defines such as
+``CMAKE_C_COMPILER_EXTERNAL_TOOLCHAIN`` do not apply when building the tests. If
+you have put all of these in "build-c-flags" then these can be repeated. If you
+wish to use lld to link the tests then add ``"-fuse-ld=lld``.
 
 Once cmake has completed the tests can be built and run using
 ``ninja check-builtins``
 
+Troubleshooting
+===============
+
+The cmake try compile stage fails
+---------------------------------
+At an early stage cmake will attempt to compile and link a simple C program to
+test if the toolchain is working.
+
+This stage can often fail at link time if the ``--sysroot`` and
+``--gcc-toolchain`` options are not passed to the compiler. Check the
+``CMAKE_C_FLAGS`` and ``CMAKE_C_COMPILER_TARGET`` flags.
+
+It can be useful to build a simple example outside of cmake with your toolchain
+to make sure it is working. For example: ``clang --target=arm-linux-gnueabi -march=armv7a --gcc-toolchain=/path/to/gcc-toolchain --sysroot=/path/to/gcc-toolchain/arm-linux-gnueabihf/libc helloworld.c``
+
+Clang uses the host header files
+--------------------------------
+On debian based systems it is possible to install multiarch support for
+arm-linux-gnueabi and arm-linux-gnueabihf. In many cases clang can successfully
+use this multiarch support when -gcc-toolchain and --sysroot are not supplied.
+Unfortunately clang adds ``/usr/local/include`` before
+``/usr/include/arm-linux-gnueabihf`` leading to errors when compiling the hosts
+header files.
+
+The multiarch support is not sufficient to build the builtins you will need to
+use a separate arm-linux-gnueabihf toolchain.
+
+No target passed to clang
+-------------------------
+If clang is not given a target it will typically use the host target, this will
+not understand the Arm assembly language files resulting in error messages such
+as ``error: unknown directive .syntax unified``.
+
+You can check the clang invocation in the error message to see if there is no
+``--target`` or if it is set incorrectly. The cause is usually
+``CMAKE_ASM_FLAGS`` not containing ``--target`` or ``CMAKE_ASM_COMPILER_TARGET`` not being present.
+
+Arm architecture not given
+--------------------------
+The ``--target=arm-linux-gnueabihf`` will default to arm architecture v4t which
+cannot assemble the barrier instructions used in the synch_and_fetch source
+files.
+
+The cause is usually a missing ``-march=armv7a`` from the ``CMAKE_ASM_FLAGS``.
+
+Compiler-rt builds but the tests fail to build
+----------------------------------------------
+The flags used to build the tests are not the same as those used to build the
+builtins. The c flags are provided by ``COMPILER_RT_TEST_COMPILE_CFLAGS`` and
+the ``CMAKE_C_COMPILER_TARGET``, ``CMAKE_ASM_COMPILER_TARGET``,
+``CMAKE_C_COMPILER_EXTERNAL_TOOLCHAIN`` and ``CMAKE_SYSROOT`` flags are not
+applied.
+
+Make sure that ``COMPILER_RT_TEST_COMPILE_CFLAGS`` contains all the necessary
+information.
+
+
 Modifications for other Targets
 ===============================
 
@@ -112,6 +201,8 @@ may need extra c-flags such as ``-mfloat-abi=softfp`` for use of floating-point
 instructions, and ``-mfloat-abi=soft -mfpu=none`` for software floating-point
 emulation.
 
+You will need to use an arm-linux-gnueabi GNU toolchain for soft-float.
+
 AArch64 Target
 --------------
 The instructions for Arm can be used for AArch64 by substituting AArch64
@@ -125,39 +216,24 @@ The CMAKE_C_FLAGS and COMPILER_RT_TEST_COMPILER_CFLAGS may also need:
 
 Armv6-m, Armv7-m and Armv7E-M targets
 -------------------------------------
-If you wish to build, but not test compiler-rt for Armv6-M, Armv7-M or Armv7E-M
-then the easiest way is to use the BaremetalARM.cmake recipe in
-clang/cmake/caches.
-
-You will need a bare metal sysroot such as that provided by the GNU ARM
-Embedded toolchain.
-
-The libraries can be built with the cmake options:
-
-* ``-DBAREMETAL_ARMV6M_SYSROOT=/path/to/bare/metal/sysroot``
-* ``-DBAREMETAL_ARMV7M_SYSROOT=/path/to/bare/metal/sysroot``
-* ``-DBAREMETAL_ARMV7EM_SYSROOT=/path/to/bare/metal/sysroot``
-* ``-C /path/to/llvm/source/tools/clang/cmake/caches/BaremetalARM.cmake``
-
-**Note** that for the recipe to work the compiler-rt source must be checked out
-into the directory llvm/runtimes and not llvm/projects.
-
 To build and test the libraries using a similar method to Armv7-A is possible
 but more difficult. The main problems are:
 
 * There isn't a ``qemu-arm`` user-mode emulator for bare-metal systems. The ``qemu-system-arm`` can be used but this is significantly more difficult to setup.
-* The target to compile compiler-rt have the suffix -none-eabi. This uses the BareMetal driver in clang and by default won't find the libraries needed to pass the cmake compiler check.
+* The targets to compile compiler-rt have the suffix -none-eabi. This uses the BareMetal driver in clang and by default won't find the libraries needed to pass the cmake compiler check.
 
 As the Armv6-M, Armv7-M and Armv7E-M builds of compiler-rt only use instructions
 that are supported on Armv7-A we can still get most of the value of running the
 tests using the same ``qemu-arm`` that we used for Armv7-A by building and
 running the test cases for Armv7-A but using the builtins compiled for
-Armv6-M, Armv7-M or Armv7E-M. This will not catch instructions that are
-supported on Armv7-A but not Armv6-M, Armv7-M and Armv7E-M.
-
-To get the cmake compile test to pass the libraries needed to successfully link
-the test application will need to be manually added to ``CMAKE_CFLAGS``.
-Alternatively if you are using version 3.6 or above of cmake you can use
+Armv6-M, Armv7-M or Armv7E-M. This will test that the builtins can be linked
+into a binary and execute the tests correctly but it will not catch if the
+builtins use instructions that are supported on Armv7-A but not Armv6-M,
+Armv7-M and Armv7E-M.
+
+To get the cmake compile test to pass you will need to pass the libraries
+needed to successfully link the cmake test via ``CMAKE_CFLAGS``. It is
+strongly recommended that you use version 3.6 or above of cmake so you can use
 ``CMAKE_TRY_COMPILE_TARGET=STATIC_LIBRARY`` to skip the link step.
 
 * ``-DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY``
@@ -169,6 +245,7 @@ Alternatively if you are using version 3.6 or above of cmake you can use
 * ``-DCOMPILER_RT_BUILD_PROFILE=OFF``
 * ``-DCMAKE_C_COMPILER=${host_install_dir}/bin/clang``
 * ``-DCMAKE_C_COMPILER_TARGET="your *-none-eabi target"``
+* ``-DCMAKE_ASM_COMPILER_TARGET="your *-none-eabi target"``
 * ``-DCMAKE_AR=/path/to/llvm-ar``
 * ``-DCMAKE_NM=/path/to/llvm-nm``
 * ``-DCMAKE_RANLIB=/path/to/llvm-ranlib``
@@ -176,7 +253,7 @@ Alternatively if you are using version 3.6 or above of cmake you can use
 * ``-DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON``
 * ``-DLLVM_CONFIG_PATH=/path/to/llvm-config``
 * ``-DCMAKE_C_FLAGS="build-c-flags"``
-* ``-DCMAKE_ASM_FLAGS="${arm_cflags}"``
+* ``-DCMAKE_ASM_FLAGS="build-c-flags"``
 * ``-DCOMPILER_RT_EMULATOR="qemu-arm -L /path/to/armv7-A/sysroot"``
 * ``-DCOMPILER_RT_INCLUDE_TESTS=ON``
 * ``-DCOMPILER_RT_TEST_COMPILER="/path/to/clang"``
@@ -186,16 +263,28 @@ The Armv6-M builtins will use the soft-float ABI. When compiling the tests for
 Armv7-A we must include ``"-mthumb -mfloat-abi=soft -mfpu=none"`` in the
 test-c-flags. We must use an Armv7-A soft-float abi sysroot for ``qemu-arm``.
 
-Unfortunately at time of writing the Armv7-M and Armv7E-M builds of
-compiler-rt will always include assembler files including floating point
-instructions. This means that building for a cpu without a floating point unit
-requires something like removing the arm_Thumb1_VFPv2_SOURCES from the
-arm_Thumb1_SOURCES in builtins/CMakeLists.txt. The float-abi of the compiler-rt
-library must be matched by the float abi of the Armv7-A sysroot used by
-qemu-arm.
-
 Depending on the linker used for the test cases you may encounter BuildAttribute
 mismatches between the M-profile objects from compiler-rt and the A-profile
-objects from the test. The lld linker does not check the BuildAttributes so it
-can be used to link the tests by adding -fuse-ld=lld to the
+objects from the test. The lld linker does not check the profile
+BuildAttribute so it can be used to link the tests by adding -fuse-ld=lld to the
 ``COMPILER_RT_TEST_COMPILER_CFLAGS``.
+
+Alternative using a cmake cache
+-------------------------------
+If you wish to build, but not test compiler-rt for Armv6-M, Armv7-M or Armv7E-M
+the easiest way is to use the BaremetalARM.cmake recipe in clang/cmake/caches.
+
+You will need a bare metal sysroot such as that provided by the GNU ARM
+Embedded toolchain.
+
+The libraries can be built with the cmake options:
+
+* ``-DBAREMETAL_ARMV6M_SYSROOT=/path/to/bare/metal/toolchain/arm-none-eabi``
+* ``-DBAREMETAL_ARMV7M_SYSROOT=/path/to/bare/metal/toolchain/arm-none-eabi``
+* ``-DBAREMETAL_ARMV7EM_SYSROOT=/path/to/bare/metal/toolchain/arm-none-eabi``
+* ``-C /path/to/llvm/source/tools/clang/cmake/caches/BaremetalARM.cmake``
+* ``/path/to/llvm``
+
+**Note** that for the recipe to work the compiler-rt source must be checked out
+into the directory llvm/runtimes. You will also need clang and lld checked out.
+
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 06e092fb9fc5269b60fd6f5d69a315002d933be9..ea879a5db5b70f5ae78ef12b0ecf7b878833feea 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -1643,19 +1643,15 @@ example:
 ``speculative_load_hardening``
     This attribute indicates that
     `Speculative Load Hardening <https://llvm.org/docs/SpeculativeLoadHardening.html>`_
-    should be enabled for the function body. This is a best-effort attempt to
-    mitigate all known speculative execution information leak vulnerabilities
-    that are based on the fundamental principles of modern processors'
-    speculative execution. These vulnerabilities are classified as "Spectre
-    variant #1" vulnerabilities typically. Notably, this does not attempt to
-    mitigate any vulnerabilities where the speculative execution and/or
-    prediction devices of specific processors can be *completely* undermined
-    (such as "Branch Target Injection", a.k.a, "Spectre variant #2"). Instead,
-    this is a target-independent request to harden against the completely
-    generic risk posed by speculative execution to incorrectly load secret data,
-    making it available to some micro-architectural side-channel for information
-    leak. For a processor without any speculative execution or predictors, this
-    is expected to be a no-op.
+    should be enabled for the function body.
+
+    Speculative Load Hardening is a best-effort mitigation against
+    information leak attacks that make use of control flow
+    miss-speculation - specifically miss-speculation of whether a branch
+    is taken or not. Typically vulnerabilities enabling such attacks are
+    classified as "Spectre variant #1". Notably, this does not attempt to
+    mitigate against miss-speculation of branch target, classified as
+    "Spectre variant #2" vulnerabilities.
 
     When inlining, the attribute is sticky. Inlining a function that carries
     this attribute will cause the caller to gain the attribute. This is intended
@@ -5080,6 +5076,8 @@ optimizations related to compare and branch instructions. The metadata
 is treated as a boolean value; if it exists, it signals that the branch
 or switch that it is attached to is completely unpredictable.
 
+.. _llvm.loop:
+
 '``llvm.loop``'
 ^^^^^^^^^^^^^^^
 
@@ -5113,6 +5111,26 @@ suggests an unroll factor to the loop unroller:
     !0 = !{!0, !1}
     !1 = !{!"llvm.loop.unroll.count", i32 4}
 
+'``llvm.loop.disable_nonforced``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata disables all optional loop transformations unless
+explicitly instructed using other transformation metdata such as
+``llvm.loop.unroll.enable``. That is, no heuristic will try to determine
+whether a transformation is profitable. The purpose is to avoid that the
+loop is transformed to a different loop before an explicitly requested
+(forced) transformation is applied. For instance, loop fusion can make
+other transformations impossible. Mandatory loop canonicalizations such
+as loop rotation are still applied.
+
+It is recommended to use this metadata in addition to any llvm.loop.*
+transformation directive. Also, any loop should have at most one
+directive applied to it (and a sequence of transformations built using
+followup-attributes). Otherwise, which transformation will be applied
+depends on implementation details such as the pass pipeline order.
+
+See :ref:`transformation-metadata` for details.
+
 '``llvm.loop.vectorize``' and '``llvm.loop.interleave``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -5171,6 +5189,29 @@ vectorization of the loop. If ``llvm.loop.vectorize.width`` is set to
 0 or if the loop does not have this metadata the width will be
 determined automatically.
 
+'``llvm.loop.vectorize.followup_vectorized``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which loop attributes the vectorized loop will
+have. See :ref:`transformation-metadata` for details.
+
+'``llvm.loop.vectorize.followup_epilogue``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which loop attributes the epilogue will have. The
+epilogue is not vectorized and is executed when either the vectorized
+loop is not known to preserve semantics (because e.g., it processes two
+arrays that are found to alias by a runtime check) or for the last
+iterations that do not fill a complete set of vector lanes. See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
+'``llvm.loop.vectorize.followup_all``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Attributes in the metadata will be added to both the vectorized and
+epilogue loop.
+See :ref:`Transformation Metadata <transformation-metadata>` for details.
+
 '``llvm.loop.unroll``'
 ^^^^^^^^^^^^^^^^^^^^^^
 
@@ -5239,6 +5280,19 @@ For example:
 
    !0 = !{!"llvm.loop.unroll.full"}
 
+'``llvm.loop.unroll.followup``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which loop attributes the unrolled loop will have.
+See :ref:`Transformation Metadata <transformation-metadata>` for details.
+
+'``llvm.loop.unroll.followup_remainder``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which loop attributes the remainder loop after
+partial/runtime unrolling will have. See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
 '``llvm.loop.unroll_and_jam``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -5292,6 +5346,43 @@ string ``llvm.loop.unroll_and_jam.enable``.  For example:
 
    !0 = !{!"llvm.loop.unroll_and_jam.enable"}
 
+'``llvm.loop.unroll_and_jam.followup_outer``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which loop attributes the outer unrolled loop will
+have. See :ref:`Transformation Metadata <transformation-metadata>` for
+details.
+
+'``llvm.loop.unroll_and_jam.followup_inner``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which loop attributes the inner jammed loop will
+have. See :ref:`Transformation Metadata <transformation-metadata>` for
+details.
+
+'``llvm.loop.unroll_and_jam.followup_remainder_outer``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which attributes the epilogue of the outer loop
+will have. This loop is usually unrolled, meaning there is no such
+loop. This attribute will be ignored in this case. See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
+'``llvm.loop.unroll_and_jam.followup_remainder_inner``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which attributes the inner loop of the epilogue
+will have. The outer epilogue will usually be unrolled, meaning there
+can be multiple inner remainder loops. See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
+'``llvm.loop.unroll_and_jam.followup_all``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Attributes specified in the metadata is added to all
+``llvm.loop.unroll_and_jam.*`` loops. See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
 '``llvm.loop.licm_versioning.disable``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -5324,6 +5415,34 @@ enabled. A value of 0 disables distribution:
 This metadata should be used in conjunction with ``llvm.loop`` loop
 identification metadata.
 
+'``llvm.loop.distribute.followup_coincident``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which attributes extracted loops with no cyclic
+dependencies will have (i.e. can be vectorized). See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
+'``llvm.loop.distribute.followup_sequential``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata defines which attributes the isolated loops with unsafe
+memory dependencies will have. See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
+'``llvm.loop.distribute.followup_fallback``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If loop versioning is necessary, this metadata defined the attributes
+the non-distributed fallback version will have. See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
+'``llvm.loop.distribute.followup_all``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Thes attributes in this metdata is added to all followup loops of the
+loop distribution pass. See
+:ref:`Transformation Metadata <transformation-metadata>` for details.
+
 '``llvm.mem``'
 ^^^^^^^^^^^^^^^
 
@@ -6790,6 +6909,55 @@ Semantics:
 
 The '``unreachable``' instruction has no defined semantics.
 
+.. _unaryops:
+
+Unary Operations
+-----------------
+
+Unary operators require a single operand, execute an operation on
+it, and produce a single value. The operand might represent multiple
+data, as is the case with the :ref:`vector <t_vector>` data type. The
+result value has the same type as its operand.
+
+.. _i_fneg:
+
+'``fneg``' Instruction
+^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      <result> = fneg [fast-math flags]* <ty> <op1>   ; yields ty:result
+
+Overview:
+"""""""""
+
+The '``fneg``' instruction returns the negation of its operand.
+
+Arguments:
+""""""""""
+
+The argument to the '``fneg``' instruction must be a
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
+floating-point values. 
+
+Semantics:
+""""""""""
+
+The value produced is a copy of the operand with its sign bit flipped.
+This instruction can also take any number of :ref:`fast-math
+flags <fastmath>`, which are optimization hints to enable otherwise
+unsafe floating-point optimizations:
+
+Example:
+""""""""
+
+.. code-block:: text
+
+      <result> = fneg float %val          ; yields float:result = -%var
+
 .. _binaryops:
 
 Binary Operations
@@ -6965,9 +7133,6 @@ Overview:
 
 The '``fsub``' instruction returns the difference of its two operands.
 
-Note that the '``fsub``' instruction is used to represent the '``fneg``'
-instruction present in most other intermediate representations.
-
 Arguments:
 """"""""""
 
@@ -12530,6 +12695,276 @@ Examples:
       %obit = extractvalue {i32, i1} %res, 1
       br i1 %obit, label %overflow, label %normal
 
+Saturation Arithmetic Intrinsics
+---------------------------------
+
+Saturation arithmetic is a version of arithmetic in which operations are
+limited to a fixed range between a minimum and maximum value. If the result of
+an operation is greater than the maximum value, the result is set (or
+"clamped") to this maximum. If it is below the minimum, it is clamped to this
+minimum.
+
+
+'``llvm.sadd.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.sadd.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.sadd.sat.i16(i16 %a, i16 %b)
+      declare i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
+      declare i64 @llvm.sadd.sat.i64(i64 %a, i64 %b)
+      declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview
+"""""""""
+
+The '``llvm.sadd.sat``' family of intrinsic functions perform signed
+saturation addition on the 2 arguments.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo signed addition.
+
+Semantics:
+""""""""""
+
+The maximum value this operation can clamp to is the largest signed value
+representable by the bit width of the arguments. The minimum value is the
+smallest signed value representable by this bit width.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.sadd.sat.i4(i4 1, i4 2)  ; %res = 3
+      %res = call i4 @llvm.sadd.sat.i4(i4 5, i4 6)  ; %res = 7
+      %res = call i4 @llvm.sadd.sat.i4(i4 -4, i4 2)  ; %res = -2
+      %res = call i4 @llvm.sadd.sat.i4(i4 -4, i4 -5)  ; %res = -8
+
+
+'``llvm.uadd.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.uadd.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.uadd.sat.i16(i16 %a, i16 %b)
+      declare i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
+      declare i64 @llvm.uadd.sat.i64(i64 %a, i64 %b)
+      declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview
+"""""""""
+
+The '``llvm.uadd.sat``' family of intrinsic functions perform unsigned
+saturation addition on the 2 arguments.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo unsigned addition.
+
+Semantics:
+""""""""""
+
+The maximum value this operation can clamp to is the largest unsigned value
+representable by the bit width of the arguments. Because this is an unsigned
+operation, the result will never saturate towards zero.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.uadd.sat.i4(i4 1, i4 2)  ; %res = 3
+      %res = call i4 @llvm.uadd.sat.i4(i4 5, i4 6)  ; %res = 11
+      %res = call i4 @llvm.uadd.sat.i4(i4 8, i4 8)  ; %res = 15
+
+
+'``llvm.ssub.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.ssub.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.ssub.sat.i16(i16 %a, i16 %b)
+      declare i32 @llvm.ssub.sat.i32(i32 %a, i32 %b)
+      declare i64 @llvm.ssub.sat.i64(i64 %a, i64 %b)
+      declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview
+"""""""""
+
+The '``llvm.ssub.sat``' family of intrinsic functions perform signed
+saturation subtraction on the 2 arguments.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo signed subtraction.
+
+Semantics:
+""""""""""
+
+The maximum value this operation can clamp to is the largest signed value
+representable by the bit width of the arguments. The minimum value is the
+smallest signed value representable by this bit width.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.ssub.sat.i4(i4 2, i4 1)  ; %res = 1
+      %res = call i4 @llvm.ssub.sat.i4(i4 2, i4 6)  ; %res = -4
+      %res = call i4 @llvm.ssub.sat.i4(i4 -4, i4 5)  ; %res = -8
+      %res = call i4 @llvm.ssub.sat.i4(i4 4, i4 -5)  ; %res = 7
+
+
+'``llvm.usub.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.usub.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.usub.sat.i16(i16 %a, i16 %b)
+      declare i32 @llvm.usub.sat.i32(i32 %a, i32 %b)
+      declare i64 @llvm.usub.sat.i64(i64 %a, i64 %b)
+      declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview
+"""""""""
+
+The '``llvm.usub.sat``' family of intrinsic functions perform unsigned
+saturation subtraction on the 2 arguments.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo unsigned subtraction.
+
+Semantics:
+""""""""""
+
+The minimum value this operation can clamp to is 0, which is the smallest
+unsigned value representable by the bit width of the unsigned arguments.
+Because this is an unsigned operation, the result will never saturate towards
+the largest possible value representable by this bit width.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.usub.sat.i4(i4 2, i4 1)  ; %res = 1
+      %res = call i4 @llvm.usub.sat.i4(i4 2, i4 6)  ; %res = 0
+
+
+Fixed Point Arithmetic Intrinsics
+---------------------------------
+
+A fixed point number represents a real data type for a number that has a fixed
+number of digits after a radix point (equivalent to the decimal point '.').
+The number of digits after the radix point is referred as the ``scale``. These
+are useful for representing fractional values to a specific precision. The
+following intrinsics perform fixed point arithmetic operations on 2 operands
+of the same scale, specified as the third argument.
+
+
+'``llvm.smul.fix.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.smul.fix``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.smul.fix.i16(i16 %a, i16 %b, i32 %scale)
+      declare i32 @llvm.smul.fix.i32(i32 %a, i32 %b, i32 %scale)
+      declare i64 @llvm.smul.fix.i64(i64 %a, i64 %b, i32 %scale)
+      declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview
+"""""""""
+
+The '``llvm.smul.fix``' family of intrinsic functions perform signed
+fixed point multiplication on 2 arguments of the same scale.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo signed fixed point multiplication. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point multiplication on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+It is undefined behavior if the source value does not fit within the range of
+the fixed point type.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.smul.fix.i4(i4 3, i4 2, i32 0)  ; %res = 6 (2 x 3 = 6)
+      %res = call i4 @llvm.smul.fix.i4(i4 3, i4 2, i32 1)  ; %res = 3 (1.5 x 1 = 1.5)
+      %res = call i4 @llvm.smul.fix.i4(i4 3, i4 -2, i32 1)  ; %res = -3 (1.5 x -1 = -1.5)
+
+      ; The result in the following could be rounded up to -2 or down to -2.5
+      %res = call i4 @llvm.smul.fix.i4(i4 3, i4 -3, i32 1)  ; %res = -5 (or -4) (1.5 x -1.5 = -2.25)
+
+
 Specialised Arithmetic Intrinsics
 ---------------------------------
 
@@ -14868,7 +15303,7 @@ Syntax:
 
 ::
 
-      declare void @llvm.trap() noreturn nounwind
+      declare void @llvm.trap() cold noreturn nounwind
 
 Overview:
 """""""""
@@ -15362,6 +15797,145 @@ if"); and this allows for "check widening" type optimizations.
 ``@llvm.experimental.guard`` cannot be invoked.
 
 
+'``llvm.experimental.widenable.condition``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i1 @llvm.experimental.widenable.condition()
+
+Overview:
+"""""""""
+
+This intrinsic represents a "widenable condition" which is
+boolean expressions with the following property: whether this
+expression is `true` or `false`, the program is correct and
+well-defined.
+
+Together with :ref:`deoptimization operand bundles <deopt_opbundles>`,
+``@llvm.experimental.widenable.condition`` allows frontends to
+express guards or checks on optimistic assumptions made during
+compilation and represent them as branch instructions on special
+conditions.
+
+While this may appear similar in semantics to `undef`, it is very
+different in that an invocation produces a particular, singular
+value. It is also intended to be lowered late, and remain available
+for specific optimizations and transforms that can benefit from its
+special properties.
+
+Arguments:
+""""""""""
+
+None.
+
+Semantics:
+""""""""""
+
+The intrinsic ``@llvm.experimental.widenable.condition()``
+returns either `true` or `false`. For each evaluation of a call
+to this intrinsic, the program must be valid and correct both if
+it returns `true` and if it returns `false`. This allows
+transformation passes to replace evaluations of this intrinsic
+with either value whenever one is beneficial.
+
+When used in a branch condition, it allows us to choose between
+two alternative correct solutions for the same problem, like
+in example below:
+
+.. code-block:: text
+
+    %cond = call i1 @llvm.experimental.widenable.condition()
+    br i1 %cond, label %solution_1, label %solution_2
+
+  label %fast_path:
+    ; Apply memory-consuming but fast solution for a task.
+
+  label %slow_path:
+    ; Cheap in memory but slow solution.
+
+Whether the result of intrinsic's call is `true` or `false`,
+it should be correct to pick either solution. We can switch
+between them by replacing the result of
+``@llvm.experimental.widenable.condition`` with different
+`i1` expressions.
+
+This is how it can be used to represent guards as widenable branches:
+
+.. code-block:: text
+
+  block:
+    ; Unguarded instructions
+    call void @llvm.experimental.guard(i1 %cond, <args...>) ["deopt"(<deopt_args...>)]
+    ; Guarded instructions
+
+Can be expressed in an alternative equivalent form of explicit branch using
+``@llvm.experimental.widenable.condition``:
+
+.. code-block:: text
+
+  block:
+    ; Unguarded instructions
+    %widenable_condition = call i1 @llvm.experimental.widenable.condition()
+    %guard_condition = and i1 %cond, %widenable_condition
+    br i1 %guard_condition, label %guarded, label %deopt
+
+  guarded:
+    ; Guarded instructions
+
+  deopt:
+    call type @llvm.experimental.deoptimize(<args...>) [ "deopt"(<deopt_args...>) ]
+
+So the block `guarded` is only reachable when `%cond` is `true`,
+and it should be valid to go to the block `deopt` whenever `%cond`
+is `true` or `false`.
+
+``@llvm.experimental.widenable.condition`` will never throw, thus
+it cannot be invoked.
+
+Guard widening:
+"""""""""""""""
+
+When ``@llvm.experimental.widenable.condition()`` is used in
+condition of a guard represented as explicit branch, it is
+legal to widen the guard's condition with any additional
+conditions.
+
+Guard widening looks like replacement of
+
+.. code-block:: text
+
+  %widenable_cond = call i1 @llvm.experimental.widenable.condition()
+  %guard_cond = and i1 %cond, %widenable_cond
+  br i1 %guard_cond, label %guarded, label %deopt
+
+with
+
+.. code-block:: text
+
+  %widenable_cond = call i1 @llvm.experimental.widenable.condition()
+  %new_cond = and i1 %any_other_cond, %widenable_cond
+  %new_guard_cond = and i1 %cond, %new_cond
+  br i1 %new_guard_cond, label %guarded, label %deopt
+
+for this branch. Here `%any_other_cond` is an arbitrarily chosen
+well-defined `i1` value. By making guard widening, we may
+impose stricter conditions on `guarded` block and bail to the
+deopt when the new condition is not met.
+
+Lowering:
+"""""""""
+
+Default lowering strategy is replacing the result of
+call of ``@llvm.experimental.widenable.condition``  with
+constant `true`. However it is always correct to replace
+it with any other `i1` value. Any pass can
+freely do it if it can benefit from non-default lowering.
+
+
 '``llvm.load.relative``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -15697,3 +16271,263 @@ lowered to a call to the symbol ``__llvm_memset_element_unordered_atomic_*``. Wh
 is replaced with an actual element size.
 
 The optimizer is allowed to inline the memory assignment when it's profitable to do so.
+
+Objective-C ARC Runtime Intrinsics
+----------------------------------
+
+LLVM provides intrinsics that lower to Objective-C ARC runtime entry points.
+LLVM is aware of the semantics of these functions, and optimizes based on that
+knowledge. You can read more about the details of Objective-C ARC `here
+<https://clang.llvm.org/docs/AutomaticReferenceCounting.html>`_.
+
+'``llvm.objc.autorelease``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.autorelease(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_autorelease <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-autorelease>`_.
+
+'``llvm.objc.autoreleasePoolPop``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare void @llvm.objc.autoreleasePoolPop(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_autoreleasePoolPop <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#void-objc-autoreleasepoolpop-void-pool>`_.
+
+'``llvm.objc.autoreleasePoolPush``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.autoreleasePoolPush()
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_autoreleasePoolPush <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#void-objc-autoreleasepoolpush-void>`_.
+
+'``llvm.objc.autoreleaseReturnValue``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_autoreleaseReturnValue <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-autoreleasereturnvalue>`_.
+
+'``llvm.objc.copyWeak``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare void @llvm.objc.copyWeak(i8**, i8**)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_copyWeak <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#void-objc-copyweak-id-dest-id-src>`_.
+
+'``llvm.objc.destroyWeak``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare void @llvm.objc.destroyWeak(i8**)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_destroyWeak <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#void-objc-destroyweak-id-object>`_.
+
+'``llvm.objc.initWeak``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.initWeak(i8**, i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_initWeak <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-initweak>`_.
+
+'``llvm.objc.loadWeak``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.loadWeak(i8**)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_loadWeak <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-loadweak>`_.
+
+'``llvm.objc.loadWeakRetained``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.loadWeakRetained(i8**)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_loadWeakRetained <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-loadweakretained>`_.
+
+'``llvm.objc.moveWeak``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare void @llvm.objc.moveWeak(i8**, i8**)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_moveWeak <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#void-objc-moveweak-id-dest-id-src>`_.
+
+'``llvm.objc.release``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare void @llvm.objc.release(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_release <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#void-objc-release-id-value>`_.
+
+'``llvm.objc.retain``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.retain(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_retain <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-retain>`_.
+
+'``llvm.objc.retainAutorelease``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.retainAutorelease(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_retainAutorelease <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-retainautorelease>`_.
+
+'``llvm.objc.retainAutoreleaseReturnValue``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.retainAutoreleaseReturnValue(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_retainAutoreleaseReturnValue <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-retainautoreleasereturnvalue>`_.
+
+'``llvm.objc.retainAutoreleasedReturnValue``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_retainAutoreleasedReturnValue <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-retainautoreleasedreturnvalue>`_.
+
+'``llvm.objc.retainBlock``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.retainBlock(i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_retainBlock <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-retainblock>`_.
+
+'``llvm.objc.storeStrong``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare void @llvm.objc.storeStrong(i8**, i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_storeStrong <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#void-objc-storestrong-id-object-id-value>`_.
+
+'``llvm.objc.storeWeak``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+::
+
+      declare i8* @llvm.objc.storeWeak(i8**, i8*)
+
+Lowering:
+"""""""""
+
+Lowers to a call to `objc_storeWeak <https://clang.llvm.org/docs/AutomaticReferenceCounting.html#arc-runtime-objc-storeweak>`_.
diff --git a/docs/MIRLangRef.rst b/docs/MIRLangRef.rst
index 9a7e18c091c5ae9cd41ad8fa11d91e1cb9c3f735..ca6c0107fa66e6d1f89ea60de6514264c268ced4 100644
--- a/docs/MIRLangRef.rst
+++ b/docs/MIRLangRef.rst
@@ -60,6 +60,11 @@ runs just before the pass that we are trying to test:
 
    ``llc -stop-after=machine-cp bug-trigger.ll > test.mir``
 
+If the same pass is run multiple times, a run index can be included
+after the name with a comma.
+
+   ``llc -stop-after=dead-mi-elimination,1 bug-trigger.ll > test.mir``
+
 After generating the input MIR file, you'll have to add a run line that uses
 the ``-run-pass`` option to it. In order to test the post register allocation
 pseudo instruction expansion pass on X86-64, a run line like the one shown
diff --git a/docs/Passes.rst b/docs/Passes.rst
index 9ab214984d2d91a1dde3007d64b21976903c4e63..81bd8acfc2dad49d4299528ab8a29a4638da0084 100644
--- a/docs/Passes.rst
+++ b/docs/Passes.rst
@@ -355,6 +355,15 @@ between different iterations.
 ``ScalarEvolution`` has a more complete understanding of pointer arithmetic
 than ``BasicAliasAnalysis``' collection of ad-hoc analyses.
 
+``-stack-safety``: Stack Safety Analysis
+------------------------------------------------
+
+The ``StackSafety`` analysis can be used to determine if stack allocated
+variables can be considered safe from memory access bugs.
+
+This analysis' primary purpose is to be used by sanitizers to avoid unnecessary
+instrumentation of safe variables.
+
 ``-targetdata``: Target Data Layout
 -----------------------------------
 
@@ -1215,3 +1224,8 @@ Displays the post dominator tree using the GraphViz tool.
 Displays the post dominator tree using the GraphViz tool, but omitting function
 bodies.
 
+``-transform-warning``: Report missed forced transformations
+------------------------------------------------------------
+
+Emits warnings about not yet applied forced transformations (e.g. from
+``#pragma omp simd``).
diff --git a/docs/Proposals/TestSuite.rst b/docs/Proposals/TestSuite.rst
index 8c7531783d44bfe44a8d619d888701a7536e8e65..85f3642a2e922c63fb84ce49f93402bb3e130485 100644
--- a/docs/Proposals/TestSuite.rst
+++ b/docs/Proposals/TestSuite.rst
@@ -259,6 +259,10 @@ https://github.com/darktable-org/rawspeed
 Its test dataset is 756 MB in size, which is too large to be included
 into the test-suite repository.
 
+C++ Performance Benchmarks
+--------------------------
+https://gitlab.com/chriscox/CppPerformanceBenchmarks
+
 Generic Algorithms
 ==================
 
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 0da765a302776ea1915b4352f4f3fe341f93b0d4..a3500ba3f743f502929ab726c5c7b36792af1aeb 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -40,7 +40,8 @@ Non-comprehensive list of changes in this release
    functionality, or simply have a lot to talk about), see the `NOTE` below
    for adding a new subsection.
 
-* Note..
+* The **llvm-cov** tool can now export lcov trace files using the
+  `-format=lcov` option of the `export` command.
 
 .. NOTE
    If you would like to document a larger change, then you can add a
diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst
index 4cccfb40bd4d1eb69be1f7cdf07a455d0b5bddfc..dab300ac7b76078843ecf5fbe22c6b0821a9cfe8 100644
--- a/docs/SourceLevelDebugging.rst
+++ b/docs/SourceLevelDebugging.rst
@@ -1661,4 +1661,4 @@ are correctly pointing to the ``[[C]]`` and ``[[D]]`` variables.
    for a ``DILocation`` to have a specific line number, and someone later adds
    an instruction before the one we check the test will fail. In the cases this
    can't be avoided (say, if a test wouldn't be precise enough), moving the
-   test to it's own file is preferred.
+   test to its own file is preferred.
diff --git a/docs/StackMaps.rst b/docs/StackMaps.rst
index 99c5e5fbe4de13bf3df4c4bc02af60f84ef360f0..2ceb408097a718dee820ca6574d5d3fcb28bcfb5 100644
--- a/docs/StackMaps.rst
+++ b/docs/StackMaps.rst
@@ -427,8 +427,11 @@ this section, it invokes the callback and passes the section name. The
 JIT can record the in-memory address of the section at this time and
 later parse it to recover the stack map data.
 
-On Darwin, the stack map section name is "__llvm_stackmaps". The
-segment name is "__LLVM_STACKMAPS".
+For MachO (e.g. on Darwin), the stack map section name is
+"__llvm_stackmaps". The segment name is "__LLVM_STACKMAPS".
+
+For ELF (e.g. on Linux), the stack map section name is
+".llvm_stackmaps".  The segment name is "__LLVM_STACKMAPS".
 
 Stack Map Usage
 ===============
diff --git a/docs/StackSafetyAnalysis.rst b/docs/StackSafetyAnalysis.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8573b7a2bb97fc5178c97ac019e222a7595a36ce
--- /dev/null
+++ b/docs/StackSafetyAnalysis.rst
@@ -0,0 +1,56 @@
+==================================
+Stack Safety Analysis
+==================================
+
+
+Introduction
+============
+
+The Stack Safety Analysis determines if stack allocated variables can be
+considered 'safe' from memory access bugs.
+
+The primary purpose of the analysis is to be used by sanitizers to avoid
+unnecessary instrumentation of 'safe' variables. SafeStack is going to be the
+first user.
+
+'safe' variables can be defined as variables that can not be used out-of-scope
+(e.g. use-after-return) or accessed out of bounds. In the future it can be
+extended to track other variable properties. E.g. we plan to extend
+implementation with a check to make sure that variable is always initialized
+before every read to optimize use-of-uninitialized-memory checks.
+
+How it works
+============
+
+The analysis is implemented in two stages:
+
+The intra-procedural, or 'local', stage performs a depth-first search inside
+functions to collect all uses of each alloca, including loads/stores and uses as
+arguments functions. After this stage we know which parts of the alloca are used
+by functions itself but we don't know what happens after it is passed as
+an argument to another function.
+
+The inter-procedural, or 'global', stage, resolves what happens to allocas after
+they are passed as function arguments. This stage performs a depth-first search
+on function calls inside a single module and propagates allocas usage through
+functions calls.
+
+When used with ThinLTO, the global stage performs a whole program analysis over
+the Module Summary Index.
+
+Testing
+=======
+
+The analysis is covered with lit tests.
+
+We expect that users can tolerate false classification of variables as
+'unsafe' when in-fact it's 'safe'. This may lead to inefficient code. However, we
+can't accept false 'safe' classification which may cause sanitizers to miss actual
+bugs in instrumented code. To avoid that we want additional validation tool.
+
+AddressSanitizer may help with this validation. We can instrument all variables
+as usual but additionally store stack-safe information in the
+``ASanStackVariableDescription``. Then if AddressSanitizer detects a bug on
+a 'safe' variable we can produce an additional report to let the user know that
+probably Stack Safety Analysis failed and we should check for a bug in the
+compiler.
diff --git a/docs/Statepoints.rst b/docs/Statepoints.rst
index 6ed4e46b73bcb760a820b02c744dd640ca3f60c4..3832a43b287ef98eea1ac29aa5aace60517f8b77 100644
--- a/docs/Statepoints.rst
+++ b/docs/Statepoints.rst
@@ -26,8 +26,8 @@ historical interest at this point with one exception - its implementation of
 shadow stacks has been used successfully by a number of language frontends and
 is still supported.  
 
-Overview
-========
+Overview & Core Concepts
+========================
 
 To collect dead objects, garbage collectors must be able to identify
 any references to objects contained within executing code, and,
@@ -93,7 +93,10 @@ the collector must be able to:
 
 This document describes the mechanism by which an LLVM based compiler
 can provide this information to a language runtime/collector, and
-ensure that all pointers can be read and updated if desired.  
+ensure that all pointers can be read and updated if desired.
+
+Abstract Machine Model
+^^^^^^^^^^^^^^^^^^^^^^^
 
 At a high level, LLVM has been extended to support compiling to an abstract 
 machine which extends the actual target with a non-integral pointer type 
@@ -103,10 +106,16 @@ integer representation.  This semantic quirk allows the runtime to pick a
 integer mapping for each point in the program allowing relocations of objects 
 without visible effects.
 
-Warning: Non-Integral Pointer Types are a newly added concept in LLVM IR.  
-It's possible that we've missed disabling some of the optimizations which 
-assume an integral value for pointers.  If you find such a case, please 
-file a bug or share a patch.
+This high level abstract machine model is used for most of the optimizer.  As
+a result, transform passes do not need to be extended to look through explicit
+relocation sequence.  Before starting code generation, we switch
+representations to an explicit form.  The exact location chosen for lowering
+is an implementation detail.
+
+Note that most of the value of the abstract machine model comes for collectors
+which need to model potentially relocatable objects.  For a compiler which
+supports only a non-relocating collector, you may wish to consider starting
+with the fully explicit form.  
 
 Warning: There is one currently known semantic hole in the definition of 
 non-integral pointers which has not been addressed upstream.  To work around
@@ -116,10 +125,13 @@ not safe to speculate a load if doing causes a non-integral pointer value to
 be loaded as any other type or vice versa.  In practice, this restriction is 
 well isolated to isSafeToSpeculate in ValueTracking.cpp.
 
-This high level abstract machine model is used for most of the LLVM optimizer.
-Before starting code generation, we switch representations to an explicit form.
-In theory, a frontend could directly generate this low level explicit form, but 
-doing so is likely to inhibit optimization.  
+Explicit Representation
+^^^^^^^^^^^^^^^^^^^^^^^
+
+A frontend could directly generate this low level explicit form, but 
+doing so may inhibit optimization.  Instead, it is recommended that
+compilers with relocating collectors target the abstract machine model just
+described.  
 
 The heart of the explicit approach is to construct (or rewrite) the IR in a 
 manner where the possible updates performed by the garbage collector are
@@ -242,6 +254,56 @@ following command.
 
   opt -rewrite-statepoints-for-gc test/Transforms/RewriteStatepointsForGC/basics.ll -S | llc -debug-only=stackmaps
 
+Simplifications for Non-Relocating GCs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Some of the complexity in the previous example is unnecessary for a
+non-relocating collector.  While a non-relocating collector still needs the
+information about which location contain live references, it doesn't need to
+represent explicit relocations.  As such, the previously described explicit
+lowering can be simplified to remove all of the ``gc.relocate`` intrinsic
+calls and leave uses in terms of the original reference value.  
+
+Here's the explicit lowering for the previous example for a non-relocating
+collector:
+
+.. code-block:: llvm
+
+  define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) 
+         gc "statepoint-example" {
+    call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj)
+    ret i8 addrspace(1)* %obj
+  }
+
+Recording On Stack Regions
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In addition to the explicit relocation form previously described, the
+statepoint infrastructure also allows the listing of allocas within the gc
+pointer list.  Allocas can be listed with or without additional explicit gc
+pointer values and relocations.
+
+An alloca in the gc region of the statepoint operand list will cause the
+address of the stack region to be listed in the stackmap for the statepoint.
+
+This mechanism can be used to describe explicit spill slots if desired.  It
+then becomes the generator's responsibility to ensure that values are
+spill/filled to/from the alloca as needed on either side of the safepoint.
+Note that there is no way to indicate a corresponding base pointer for such
+an explicitly specified spill slot, so usage is restricted to values for
+which the associated collector can derive the object base from the pointer
+itself.
+
+This mechanism can be used to describe on stack objects containing
+references provided that the collector can map from the location on the
+stack to a heap map describing the internal layout of the references the
+collector needs to process.
+
+WARNING: At the moment, this alternate form is not well exercised.  It is
+recommended to use this with caution and expect to have to fix a few bugs.
+In particular, the RewriteStatepointsForGC utility pass does not do
+anything for allocas today.
+  
 Base & Derived Pointers
 ^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -590,8 +652,15 @@ Stack Map Format
 ================
 
 Locations for each pointer value which may need read and/or updated by
-the runtime or collector are provided via the :ref:`Stack Map format
-<stackmap-format>` specified in the PatchPoint documentation.
+the runtime or collector are provided in a separate section of the
+generated object file as specified in the PatchPoint documentation.
+This special section is encoded per the
+:ref:`Stack Map format <stackmap-format>`.
+
+The general expectation is that a JIT compiler will parse and discard this
+format; it is not particularly memory efficient.  If you need an alternate
+format (e.g. for an ahead of time compiler), see discussion under
+:ref: `open work items <OpenWork>` below.
 
 Each statepoint generates the following Locations:
 
@@ -831,35 +900,73 @@ Supported Architectures
 =======================
 
 Support for statepoint generation requires some code for each backend.
-Today, only X86_64 is supported.  
-
-Problem Areas and Active Work
-=============================
-
-#. Support for languages which allow unmanaged pointers to garbage collected
-   objects (i.e. pass a pointer to an object to a C routine) via pinning.
-
-#. Support for garbage collected objects allocated on the stack.  Specifically,
-   allocas are always assumed to be in address space 0 and we need a
-   cast/promotion operator to let rewriting identify them.
-
-#. The current statepoint lowering is known to be somewhat poor.  In the very
-   long term, we'd like to integrate statepoints with the register allocator;
-   in the near term this is unlikely to happen.  We've found the quality of
-   lowering to be relatively unimportant as hot-statepoints are almost always
-   inliner bugs.
-
-#. Concerns have been raised that the statepoint representation results in a
-   large amount of IR being produced for some examples and that this
-   contributes to higher than expected memory usage and compile times.  There's
-   no immediate plans to make changes due to this, but alternate models may be
-   explored in the future.
-
-#. Relocations along exceptional paths are currently broken in ToT.  In
-   particular, there is current no way to represent a rethrow on a path which
-   also has relocations.  See `this llvm-dev discussion
-   <https://groups.google.com/forum/#!topic/llvm-dev/AE417XjgxvI>`_ for more
-   detail.
+Today, only X86_64 is supported.
+
+.. _OpenWork:
+
+Limitations and Half Baked Ideas
+================================
+
+Mixing References and Raw Pointers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Support for languages which allow unmanaged pointers to garbage collected
+objects (i.e. pass a pointer to an object to a C routine) in the abstract
+machine model.  At the moment, the best idea on how to approach this
+involves an intrinsic or opaque function which hides the connection between
+the reference value and the raw pointer.  The problem is that having a
+ptrtoint or inttoptr cast (which is common for such use cases) breaks the
+rules used for inferring base pointers for arbitrary references when
+lowering out of the abstract model to the explicit physical model.  Note
+that a frontend which lowers directly to the physical model doesn't have
+any problems here.
+
+Objects on the Stack
+^^^^^^^^^^^^^^^^^^^^
+
+As noted above, the explicit lowering supports objects allocated on the
+stack provided the collector can find a heap map given the stack address.
+
+The missing pieces are a) integration with rewriting (RS4GC) from the
+abstract machine model and b) support for optionally decomposing on stack
+objects so as not to require heap maps for them.  The later is required
+for ease of integration with some collectors.  
+
+Lowering Quality and Representation Overhead
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The current statepoint lowering is known to be somewhat poor.  In the very
+long term, we'd like to integrate statepoints with the register allocator;
+in the near term this is unlikely to happen.  We've found the quality of
+lowering to be relatively unimportant as hot-statepoints are almost always
+inliner bugs.
+
+Concerns have been raised that the statepoint representation results in a
+large amount of IR being produced for some examples and that this
+contributes to higher than expected memory usage and compile times.  There's
+no immediate plans to make changes due to this, but alternate models may be
+explored in the future.
+
+Relocations Along Exceptional Edges
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Relocations along exceptional paths are currently broken in ToT.  In
+particular, there is current no way to represent a rethrow on a path which
+also has relocations.  See `this llvm-dev discussion
+<https://groups.google.com/forum/#!topic/llvm-dev/AE417XjgxvI>`_ for more
+detail.
+
+Support for alternate stackmap formats
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For some use cases, it is
+desirable to directly encode a final memory efficient stackmap format for
+use by the runtime.  This is particularly relevant for ahead of time
+compilers which wish to directly link object files without the need for
+post processing of each individual object file.  While not implemented
+today for statepoints, there is precedent for a GCStrategy to be able to
+select a customer GCMetataPrinter for this purpose.  Patches to enable
+this functionality upstream are welcome.   
 
 Bugs and Enhancements
 =====================
diff --git a/docs/TableGen/LangRef.rst b/docs/TableGen/LangRef.rst
index 439d646034ada188c4baa085e0204c1d935bb8db..2efee12ec9d5b991814cde6bafba0be76eaa96c5 100644
--- a/docs/TableGen/LangRef.rst
+++ b/docs/TableGen/LangRef.rst
@@ -33,7 +33,7 @@ Lexical Analysis
 ================
 
 TableGen supports BCPL (``// ...``) and nestable C-style (``/* ... */``)
-comments.
+comments.  TableGen also provides simple `Preprocessing Support`_.
 
 The following is a listing of the basic punctuation tokens::
 
@@ -448,3 +448,50 @@ applied at the end of parsing the base classes of a record.
    BaseMultiClassList: `MultiClassID` ("," `MultiClassID`)*
    MultiClassID: `TokIdentifier`
    MultiClassObject: `Def` | `Defm` | `Let` | `Foreach`
+
+Preprocessing Support
+=====================
+
+TableGen's embedded preprocessor is only intended for conditional compilation.
+It supports the following directives:
+
+.. productionlist::
+   LineBegin: ^
+   LineEnd: "\n" | "\r" | EOF
+   WhiteSpace: " " | "\t"
+   CStyleComment: "/*" (.* - "*/") "*/"
+   BCPLComment: "//" (.* - `LineEnd`) `LineEnd`
+   WhiteSpaceOrCStyleComment: `WhiteSpace` | `CStyleComment`
+   WhiteSpaceOrAnyComment: `WhiteSpace` | `CStyleComment` | `BCPLComment`
+   MacroName: `ualpha` (`ualpha` | "0"..."9")*
+   PrepDefine: `LineBegin` (`WhiteSpaceOrCStyleComment`)*
+             : "#define" (`WhiteSpace`)+ `MacroName`
+             : (`WhiteSpaceOrAnyComment`)* `LineEnd`
+   PrepIfdef: `LineBegin` (`WhiteSpaceOrCStyleComment`)*
+            : "#ifdef" (`WhiteSpace`)+ `MacroName`
+            : (`WhiteSpaceOrAnyComment`)* `LineEnd`
+   PrepElse: `LineBegin` (`WhiteSpaceOrCStyleComment`)*
+           : "#else" (`WhiteSpaceOrAnyComment`)* `LineEnd`
+   PrepEndif: `LineBegin` (`WhiteSpaceOrCStyleComment`)*
+            : "#endif" (`WhiteSpaceOrAnyComment`)* `LineEnd`
+   PrepRegContentException: `PredIfdef` | `PredElse` | `PredEndif` | EOF
+   PrepRegion: .* - `PrepRegContentException`
+             :| `PrepIfDef`
+             :  (`PrepRegion`)*
+             :  [`PrepElse`]
+             :  (`PrepRegion`)*
+             :  `PrepEndif`
+
+:token:`PrepRegion` may occur anywhere in a TD file, as long as it matches
+the grammar specification.
+
+:token:`PrepDefine` allows defining a :token:`MacroName` so that any following
+:token:`PrepIfdef` - :token:`PrepElse` preprocessing region part and
+:token:`PrepIfdef` - :token:`PrepEndif` preprocessing region
+are enabled for TableGen tokens parsing.
+
+A preprocessing region, starting (i.e. having its :token:`PrepIfdef`) in a file,
+must end (i.e. have its :token:`PrepEndif`) in the same file.
+
+A :token:`MacroName` may be defined externally by using ``{ -D<NAME> }``
+option of TableGen.
diff --git a/docs/TransformMetadata.rst b/docs/TransformMetadata.rst
new file mode 100644
index 0000000000000000000000000000000000000000..68649424b717c0bdd5f72ee71d2b7329d5ea8213
--- /dev/null
+++ b/docs/TransformMetadata.rst
@@ -0,0 +1,441 @@
+.. _transformation-metadata:
+
+============================
+Code Transformation Metadata
+============================
+
+.. contents::
+   :local:
+
+Overview
+========
+
+LLVM transformation passes can be controlled by attaching metadata to
+the code to transform. By default, transformation passes use heuristics
+to determine whether or not to perform transformations, and when doing
+so, other details of how the transformations are applied (e.g., which
+vectorization factor to select).
+Unless the optimizer is otherwise directed, transformations are applied
+conservatively. This conservatism generally allows the optimizer to
+avoid unprofitable transformations, but in practice, this results in the
+optimizer not applying transformations that would be highly profitable.
+
+Frontends can give additional hints to LLVM passes on which
+transformations they should apply. This can be additional knowledge that
+cannot be derived from the emitted IR, or directives passed from the
+user/programmer. OpenMP pragmas are an example of the latter.
+
+If any such metadata is dropped from the program, the code's semantics
+must not change.
+
+Metadata on Loops
+=================
+
+Attributes can be attached to loops as described in :ref:`llvm.loop`.
+Attributes can describe properties of the loop, disable transformations,
+force specific transformations and set transformation options.
+
+Because metadata nodes are immutable (with the exception of
+``MDNode::replaceOperandWith`` which is dangerous to use on uniqued
+metadata), in order to add or remove a loop attributes, a new ``MDNode``
+must be created and assigned as the new ``llvm.loop`` metadata. Any
+connection between the old ``MDNode`` and the loop is lost. The
+``llvm.loop`` node is also used as LoopID (``Loop::getLoopID()``), i.e.
+the loop effectively gets a new identifier. For instance,
+``llvm.mem.parallel_loop_access`` references the LoopID. Therefore, if
+the parallel access property is to be preserved after adding/removing
+loop attributes, any ``llvm.mem.parallel_loop_access`` reference must be
+updated to the new LoopID.
+
+Transformation Metadata Structure
+=================================
+
+Some attributes describe code transformations (unrolling, vectorizing,
+loop distribution, etc.). They can either be a hint to the optimizer
+that a transformation might be beneficial, instruction to use a specific
+option, , or convey a specific request from the user (such as
+``#pragma clang loop`` or ``#pragma omp simd``).
+
+If a transformation is forced but cannot be carried-out for any reason,
+an optimization-missed warning must be emitted. Semantic information
+such as a transformation being safe (e.g.
+``llvm.mem.parallel_loop_access``) can be unused by the optimizer
+without generating a warning.
+
+Unless explicitly disabled, any optimization pass may heuristically
+determine whether a transformation is beneficial and apply it. If
+metadata for another transformation was specified, applying a different
+transformation before it might be inadvertent due to being applied on a
+different loop or the loop not existing anymore. To avoid having to
+explicitly disable an unknown number of passes, the attribute
+``llvm.loop.disable_nonforced`` disables all optional, high-level,
+restructuring transformations.
+
+The following example avoids the loop being altered before being
+vectorized, for instance being unrolled.
+
+.. code-block:: llvm
+
+      br i1 %exitcond, label %for.exit, label %for.header, !llvm.loop !0
+    ...
+    !0 = distinct !{!0, !1, !2}
+    !1 = !{!"llvm.loop.vectorize.enable", i1 true}
+    !2 = !{!"llvm.loop.disable_nonforced"}
+
+After a transformation is applied, follow-up attributes are set on the
+transformed and/or new loop(s). This allows additional attributes
+including followup-transformations to be specified. Specifying multiple
+transformations in the same metadata node is possible for compatibility
+reasons, but their execution order is undefined. For instance, when
+``llvm.loop.vectorize.enable`` and ``llvm.loop.unroll.enable`` are
+specified at the same time, unrolling may occur either before or after
+vectorization.
+
+As an example, the following instructs a loop to be vectorized and only
+then unrolled.
+
+.. code-block:: llvm
+
+    !0 = distinct !{!0, !1, !2, !3}
+    !1 = !{!"llvm.loop.vectorize.enable", i1 true}
+    !2 = !{!"llvm.loop.disable_nonforced"}
+    !3 = !{!"llvm.loop.vectorize.followup_vectorized", !{"llvm.loop.unroll.enable"}}
+
+If, and only if, no followup is specified, the pass may add attributes itself.
+For instance, the vectorizer adds a ``llvm.loop.isvectorized`` attribute and
+all attributes from the original loop excluding its loop vectorizer
+attributes. To avoid this, an empty followup attribute can be used, e.g.
+
+.. code-block:: llvm
+
+    !3 = !{!"llvm.loop.vectorize.followup_vectorized"}
+
+The followup attributes of a transformation that cannot be applied will
+never be added to a loop and are therefore effectively ignored. This means
+that any followup-transformation in such attributes requires that its
+prior transformations are applied before the followup-transformation.
+The user should receive a warning about the first transformation in the
+transformation chain that could not be applied if it a forced
+transformation. All following transformations are skipped.
+
+Pass-Specific Transformation Metadata
+=====================================
+
+Transformation options are specific to each transformation. In the
+following, we present the model for each LLVM loop optimization pass and
+the metadata to influence them.
+
+Loop Vectorization and Interleaving
+-----------------------------------
+
+Loop vectorization and interleaving is interpreted as a single
+transformation. It is interpreted as forced if
+``!{"llvm.loop.vectorize.enable", i1 true}`` is set.
+
+Assuming the pre-vectorization loop is
+
+.. code-block:: c
+
+    for (int i = 0; i < n; i+=1) // original loop
+      Stmt(i);
+
+then the code after vectorization will be approximately (assuming an
+SIMD width of 4):
+
+.. code-block:: c
+
+    int i = 0;
+    if (rtc) {
+      for (; i + 3 < n; i+=4) // vectorized/interleaved loop
+        Stmt(i:i+3);
+    }
+    for (; i < n; i+=1) // epilogue loop
+      Stmt(i);
+
+where ``rtc`` is a generated runtime check.
+
+``llvm.loop.vectorize.followup_vectorized`` will set the attributes for
+the vectorized loop. If not specified, ``llvm.loop.isvectorized`` is
+combined with the original loop's attributes to avoid it being
+vectorized multiple times.
+
+``llvm.loop.vectorize.followup_epilogue`` will set the attributes for
+the remainder loop. If not specified, it will have the original loop's
+attributes combined with ``llvm.loop.isvectorized`` and
+``llvm.loop.unroll.runtime.disable`` (unless the original loop already
+has unroll metadata).
+
+The attributes specified by ``llvm.loop.vectorize.followup_all`` are
+added to both loops.
+
+When using a follow-up attribute, it replaces any automatically deduced
+attributes for the generated loop in question. Therefore it is
+recommended to add ``llvm.loop.isvectorized`` to
+``llvm.loop.vectorize.followup_all`` which avoids that the loop
+vectorizer tries to optimize the loops again.
+
+Loop Unrolling
+--------------
+
+Unrolling is interpreted as forced any ``!{!"llvm.loop.unroll.enable"}``
+metadata or option (``llvm.loop.unroll.count``, ``llvm.loop.unroll.full``)
+is present. Unrolling can be full unrolling, partial unrolling of a loop
+with constant trip count or runtime unrolling of a loop with a trip
+count unknown at compile-time.
+
+If the loop has been unrolled fully, there is no followup-loop. For
+partial/runtime unrolling, the original loop of
+
+.. code-block:: c
+
+    for (int i = 0; i < n; i+=1) // original loop
+      Stmt(i);
+
+is transformed into (using an unroll factor of 4):
+
+.. code-block:: c
+
+    int i = 0;
+    for (; i + 3 < n; i+=4) // unrolled loop
+      Stmt(i);
+      Stmt(i+1);
+      Stmt(i+2);
+      Stmt(i+3);
+    }
+    for (; i < n; i+=1) // remainder loop
+      Stmt(i);
+
+``llvm.loop.unroll.followup_unrolled`` will set the loop attributes of
+the unrolled loop. If not specified, the attributes of the original loop
+without the ``llvm.loop.unroll.*`` attributes are copied and
+``llvm.loop.unroll.disable`` added to it.
+
+``llvm.loop.unroll.followup_remainder`` defines the attributes of the
+remainder loop. If not specified the remainder loop will have no
+attributes. The remainder loop might not be present due to being fully
+unrolled in which case this attribute has no effect.
+
+Attributes defined in ``llvm.loop.unroll.followup_all`` are added to the
+unrolled and remainder loops.
+
+To avoid that the partially unrolled loop is unrolled again, it is
+recommended to add ``llvm.loop.unroll.disable`` to
+``llvm.loop.unroll.followup_all``. If no follow-up attribute specified
+for a generated loop, it is added automatically.
+
+Unroll-And-Jam
+--------------
+
+Unroll-and-jam uses the following transformation model (here with an
+unroll factor if 2). Currently, it does not support a fallback version
+when the transformation is unsafe.
+
+.. code-block:: c
+
+    for (int i = 0; i < n; i+=1) { // original outer loop
+      Fore(i);
+      for (int j = 0; j < m; j+=1) // original inner loop
+        SubLoop(i, j);
+      Aft(i);
+    }
+
+.. code-block:: c
+
+    int i = 0;
+    for (; i + 1 < n; i+=2) { // unrolled outer loop
+      Fore(i);
+      Fore(i+1);
+      for (int j = 0; j < m; j+=1) { // unrolled inner loop
+        SubLoop(i, j);
+        SubLoop(i+1, j);
+      }
+      Aft(i);
+      Aft(i+1);
+    }
+    for (; i < n; i+=1) { // remainder outer loop
+      Fore(i);
+      for (int j = 0; j < m; j+=1) // remainder inner loop
+        SubLoop(i, j);
+      Aft(i);
+    }
+
+``llvm.loop.unroll_and_jam.followup_outer`` will set the loop attributes
+of the unrolled outer loop. If not specified, the attributes of the
+original outer loop without the ``llvm.loop.unroll.*`` attributes are
+copied and ``llvm.loop.unroll.disable`` added to it.
+
+``llvm.loop.unroll_and_jam.followup_inner`` will set the loop attributes
+of the unrolled inner loop. If not specified, the attributes of the
+original inner loop are used unchanged.
+
+``llvm.loop.unroll_and_jam.followup_remainder_outer`` sets the loop
+attributes of the outer remainder loop. If not specified it will not
+have any attributes. The remainder loop might not be present due to
+being fully unrolled.
+
+``llvm.loop.unroll_and_jam.followup_remainder_inner`` sets the loop
+attributes of the inner remainder loop. If not specified it will have
+the attributes of the original inner loop. It the outer remainder loop
+is unrolled, the inner remainder loop might be present multiple times.
+
+Attributes defined in ``llvm.loop.unroll_and_jam.followup_all`` are
+added to all of the aforementioned output loops.
+
+To avoid that the unrolled loop is unrolled again, it is
+recommended to add ``llvm.loop.unroll.disable`` to
+``llvm.loop.unroll_and_jam.followup_all``. It suppresses unroll-and-jam
+as well as an additional inner loop unrolling. If no follow-up
+attribute specified for a generated loop, it is added automatically.
+
+Loop Distribution
+-----------------
+
+The LoopDistribution pass tries to separate vectorizable parts of a loop
+from the non-vectorizable part (which otherwise would make the entire
+loop non-vectorizable). Conceptually, it transforms a loop such as
+
+.. code-block:: c
+
+    for (int i = 1; i < n; i+=1) { // original loop
+      A[i] = i;
+      B[i] = 2 + B[i];
+      C[i] = 3 + C[i - 1];
+    }
+
+into the following code:
+
+.. code-block:: c
+
+    if (rtc) {
+      for (int i = 1; i < n; i+=1) // coincident loop
+        A[i] = i;
+      for (int i = 1; i < n; i+=1) // coincident loop
+        B[i] = 2 + B[i];
+      for (int i = 1; i < n; i+=1) // sequential loop
+        C[i] = 3 + C[i - 1];
+    } else {
+      for (int i = 1; i < n; i+=1) { // fallback loop
+        A[i] = i;
+        B[i] = 2 + B[i];
+        C[i] = 3 + C[i - 1];
+      }
+    }
+
+where ``rtc`` is a generated runtime check.
+
+``llvm.loop.distribute.followup_coincident`` sets the loop attributes of
+all loops without loop-carried dependencies (i.e. vectorizable loops).
+There might be more than one such loops. If not defined, the loops will
+inherit the original loop's attributes.
+
+``llvm.loop.distribute.followup_sequential`` sets the loop attributes of the
+loop with potentially unsafe dependencies. There should be at most one
+such loop. If not defined, the loop will inherit the original loop's
+attributes.
+
+``llvm.loop.distribute.followup_fallback`` defines the loop attributes
+for the fallback loop, which is a copy of the original loop for when
+loop versioning is required. If undefined, the fallback loop inherits
+all attributes from the original loop.
+
+Attributes defined in ``llvm.loop.distribute.followup_all`` are added to
+all of the aforementioned output loops.
+
+It is recommended to add ``llvm.loop.disable_nonforced`` to
+``llvm.loop.distribute.followup_fallback``. This avoids that the
+fallback version (which is likely never executed) is further optimzed
+which would increase the code size.
+
+Versioning LICM
+---------------
+
+The pass hoists code out of loops that are only loop-invariant when
+dynamic conditions apply. For instance, it transforms the loop
+
+.. code-block:: c
+
+    for (int i = 0; i < n; i+=1) // original loop
+      A[i] = B[0];
+
+into:
+
+.. code-block:: c
+
+    if (rtc) {
+      auto b = B[0];
+      for (int i = 0; i < n; i+=1) // versioned loop
+        A[i] = b;
+    } else {
+      for (int i = 0; i < n; i+=1) // unversioned loop
+        A[i] = B[0];
+    }
+
+The runtime condition (``rtc``) checks that the array ``A`` and the
+element `B[0]` do not alias.
+
+Currently, this transformation does not support followup-attributes.
+
+Loop Interchange
+----------------
+
+Currently, the ``LoopInterchange`` pass does not use any metadata.
+
+Ambiguous Transformation Order
+==============================
+
+If there multiple transformations defined, the order in which they are
+executed depends on the order in LLVM's pass pipeline, which is subject
+to change. The default optimization pipeline (anything higher than
+``-O0``) has the following order.
+
+When using the legacy pass manager:
+
+ - LoopInterchange (if enabled)
+ - SimpleLoopUnroll/LoopFullUnroll (only performs full unrolling)
+ - VersioningLICM (if enabled)
+ - LoopDistribute
+ - LoopVectorizer
+ - LoopUnrollAndJam (if enabled)
+ - LoopUnroll (partial and runtime unrolling)
+
+When using the legacy pass manager with LTO:
+
+ - LoopInterchange (if enabled)
+ - SimpleLoopUnroll/LoopFullUnroll (only performs full unrolling)
+ - LoopVectorizer
+ - LoopUnroll (partial and runtime unrolling)
+
+When using the new pass manager:
+
+ - SimpleLoopUnroll/LoopFullUnroll (only performs full unrolling)
+ - LoopDistribute
+ - LoopVectorizer
+ - LoopUnrollAndJam (if enabled)
+ - LoopUnroll (partial and runtime unrolling)
+
+Leftover Transformations
+========================
+
+Forced transformations that have not been applied after the last
+transformation pass should be reported to the user. The transformation
+passes themselves cannot be responsible for this reporting because they
+might not be in the pipeline, there might be multiple passes able to
+apply a transformation (e.g. ``LoopInterchange`` and Polly) or a
+transformation attribute may be 'hidden' inside another passes' followup
+attribute.
+
+The pass ``-transform-warning`` (``WarnMissedTransformationsPass``)
+emits such warnings. It should be placed after the last transformation
+pass.
+
+The current pass pipeline has a fixed order in which transformations
+passes are executed. A transformation can be in the followup of a pass
+that is executed later and thus leftover. For instance, a loop nest
+cannot be distributed and then interchanged with the current pass
+pipeline. The loop distribution will execute, but there is no loop
+interchange pass following such that any loop interchange metadata will
+be ignored. The ``-transform-warning`` should emit a warning in this
+case.
+
+Future versions of LLVM may fix this by executing transformations using
+a dynamic ordering.
diff --git a/docs/index.rst b/docs/index.rst
index df70de095bd94761c7d8c01ce6d18dfe609765b4..698e6764bfb3ab9f67022dd8b249265bd9a65407 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -292,6 +292,7 @@ For API clients and LLVM developers.
    Statepoints
    MergeFunctions
    TypeMetadata
+   TransformMetadata
    FaultMaps
    MIRLangRef
    Coroutines
@@ -302,6 +303,7 @@ For API clients and LLVM developers.
    PDB/index
    CFIVerify
    SpeculativeLoadHardening
+   StackSafetyAnalysis
 
 :doc:`WritingAnLLVMPass`
    Information on how to write LLVM transformations and analyses.
@@ -438,6 +440,10 @@ For API clients and LLVM developers.
 :doc:`SpeculativeLoadHardening`
   A description of the Speculative Load Hardening mitigation for Spectre v1.
 
+:doc:`StackSafetyAnalysis`
+  This document describes the design of the stack safety analysis of local
+  variables.
+
 Development Process Documentation
 =================================
 
diff --git a/docs/llvm-objdump.1 b/docs/llvm-objdump.1
new file mode 100644
index 0000000000000000000000000000000000000000..72d1b93f6bb37cb7038dbc356ffbc5e18765d44e
--- /dev/null
+++ b/docs/llvm-objdump.1
@@ -0,0 +1,197 @@
+.\" This file is distributed under the University of Illinois Open Source
+.\" License. See LICENSE.TXT for details.
+.\"
+.Dd December 19, 2018
+.Dt LLVM-OBJDUMP 1
+.Os
+.Sh NAME
+.Nm llvm-objdump
+.Nd LLVM object file dumper
+.Sh SYNOPSIS
+.Nm llvm-objdump
+.Op Ar options
+.Ar objfile ...
+.Sh DESCRIPTION
+.Nm
+prints the contents of object files and final linked images named on the
+command line.
+If no file name is specified,
+.Nm
+will attempt to read from
+.Pa a.out .
+If
+.Pa -
+is used as a file name,
+.Nm
+will process a file on its standard input stream.
+.Nm
+accepts many of the same command line arguments as GNU objdump.
+.Sh OPTIONS
+.Ss General Options
+.Bl -tag -width indent
+.It Fl -aarch64-neon-syntax Ns = Ns Ar value
+Choose style of NEON code to emit from AArch64 backend.
+.Ar value
+may be one of:
+.Bl -tag -width indent
+.It generic
+Generic NEON assembly
+.It apple
+Apple-style NEON assembly
+.El
+.It Fl -arch Ns = Ns Ar value
+Choose architecture(s) from a Mach-O file to dump
+.It Fl -arch-name Ns = Ns ar arch
+Target arch to disassemble for.
+See
+.Fl -version
+for available targets.
+.It Fl -bind
+Display mach-o binding info.
+.It Fl -color
+Use colored syntax highlighting.
+Default autodetect.
+.It Fl -disassemble
+Display assembler mnemonics for machine instructions.
+.It Fl -disassemble-all
+Display assembler mnemonics for the machine instruction in all sections.
+.It Fl -dsym Ns = Ns Ar file
+Use
+.Ar file
+for debug info.
+.It Fl -dwarf Ns = Ns Ar sections
+Dump of dwarf debug sections.
+.Bl -tag -width indent
+.It frames
+.Dv .debug_frame
+.El
+.It Fl -exports-trie
+Display mach-o exported symbols.
+.It Fl -fault-map-section
+Display contents of faultmap section.
+.It Fl -filter-print-funcs Ns = Ns Ar functions
+Only print IR for functions whose name match
+.Ar functions
+for all print-[before|after][-all] options.
+.It Fl -full-leading-addr
+Print full leading address.
+.It Fl g
+Print line information from debug info if available.
+.It Fl h , -headers , -section-headers
+Display summaries of the headers for each section.
+.It Fl -help
+Display available options.
+Use
+.Fl -help-hidden
+for more.
+.It Fl -lazy-bind
+Display mach-o lazy binding info.
+.It Fl -line-numbers
+Display source line numbers with disassembly.
+Implies disassemble object.
+.It Fl -macho
+Use MachO specific object file parser.
+.It Fl -mattr Ns = Ns Ar attribute ...
+Target specific attributes.
+.It Fl -mcpu Ns = Ns Ar CPU
+Target a specific cpu type.
+Use
+.Fl mcpu Ns = Ns help
+for details.
+.It Fl -no-leading-addr
+Print no leading address.
+.It Fl -no-leading-headers
+Print no leading headers.
+.It Fl -no-show-raw-insn
+When disassembling instructions, do not print the instruction bytes.
+.It Fl -print-imm-hex
+Use hex format for immediate values.
+.It Fl -private-header
+Display only the first format specific file header.
+.It Fl -private-headers
+Display format specific file headers.
+.It Fl r
+Display the relocation entries in the file.
+.It Fl -raw-clang-ast
+Dump the raw binary contents of the clang AST section.
+.It Fl -rebase
+Display mach-o rebasing info.
+.It Fl -reverse-iterate
+Reverse iterate.
+.It Fl s
+Display the content of each section.
+.It Fl -section Ns = Ns Ar section
+Operate on the specified sections only.
+With
+.Fl -macho
+dump segment,section.
+.It Fl -source
+Display source inline with disassembly.
+Implies disassmble object.
+.It Fl -start-address Ns = Ns Ar address
+Disassemble beginning at
+.Ar address .
+.It Fl -stop-address Ns = Ns Ar address
+Stop disassembly at
+.Ar address .
+.It Fl t
+Display the symbol table.
+.It Fl -triple Ns = Ns Ar triple
+Target triple to disassemble for.
+See
+.Fl -version
+for available targets.
+.It Fl -unwind-info
+Display unwind information.
+.It Fl -version
+Display the version of this program.
+.It Fl -weak-bind
+Display mach-o weak binding info.
+.It Fl -x86-asm-syntax Ns = Ns Ar syntax
+Choose style of code to emit from X86 backend.
+.Bl -tag -width indent
+.It att
+Emit AT&T-style assembly.
+.It intel
+Emit Intel-style assembly.
+.El
+.El
+.Ss Mach-O Options
+There are a number of options specific to the Mach-O format.
+These are used in combination with the
+.Fl -macho
+option.
+.Bl -tag -width indent
+.It Fl -archive-headers
+Print archive headers for Mach-O archives.
+.It Fl -archive-member-offsets
+Print the offset to each archive member for Mach-O archives.
+Requires
+.Fl -macho
+and
+.Fl -archive-headers .
+.It Fl -data-in-code
+Print the data in code table for Mach-O objects.
+.It Fl -dis-symname Ns = Ns Ar symbol
+Disassemble just
+.Ar symbol 's
+instructions.
+.It Fl -dylib-id
+Print the shared library's id for the dylib Mach-O file.
+.It Fl -dylibs-used
+Print the shared libraries used for linked Mach-O files.
+.It Fl -indirect-symbols
+Print indirect symbol table for Mach-O objects.
+.It Fl -info-plist
+Print the info plist section as strings for Mach-O objects.
+.It Fl -link-opt-hints
+Print the linker optimization hints for Mach-O objects.
+.It Fl -no-symbolic-operands
+do not symbolic operands when disassembling.
+.It Fl -non-verbose
+Print the info for Mach-O objects in non-verbose or numeric form.
+.It Fl -objc-meta-data
+Print the Objective-C runtime meta data for Mach-O files.
+.It Fl -universal-headers
+Print Mach-O universal headers.
+.El
diff --git a/docs/tutorial/BuildingAJIT2.rst b/docs/tutorial/BuildingAJIT2.rst
index 15c9c3586bc37c8f0c0e8e659b0add5b10b9816a..7d25ccaba0aa1ff24f2df272f8f854d864935ab5 100644
--- a/docs/tutorial/BuildingAJIT2.rst
+++ b/docs/tutorial/BuildingAJIT2.rst
@@ -12,10 +12,11 @@ we welcome any feedback.
 Chapter 2 Introduction
 ======================
 
-**Warning: This text is currently out of date due to ORC API updates.**
+**Warning: This tutorial is currently being updated to account for ORC API
+changes. Only Chapters 1 and 2 are up-to-date.**
 
-**The example code has been updated and can be used. The text will be updated
-once the API churn dies down.**
+**Example code from Chapters 3 to 5 will compile and run, but has not been
+updated**
 
 Welcome to Chapter 2 of the "Building an ORC-based JIT in LLVM" tutorial. In
 `Chapter 1 <BuildingAJIT1.html>`_ of this series we examined a basic JIT
@@ -42,67 +43,49 @@ added to it. In this Chapter we will make optimization a phase of our JIT
 instead. For now this will provide us a motivation to learn more about ORC
 layers, but in the long term making optimization part of our JIT will yield an
 important benefit: When we begin lazily compiling code (i.e. deferring
-compilation of each function until the first time it's run), having
+compilation of each function until the first time it's run) having
 optimization managed by our JIT will allow us to optimize lazily too, rather
 than having to do all our optimization up-front.
 
 To add optimization support to our JIT we will take the KaleidoscopeJIT from
 Chapter 1 and compose an ORC *IRTransformLayer* on top. We will look at how the
 IRTransformLayer works in more detail below, but the interface is simple: the
-constructor for this layer takes a reference to the layer below (as all layers
-do) plus an *IR optimization function* that it will apply to each Module that
-is added via addModule:
+constructor for this layer takes a reference to the execution session and the
+layer below (as all layers do) plus an *IR optimization function* that it will
+apply to each Module that is added via addModule:
 
 .. code-block:: c++
 
   class KaleidoscopeJIT {
   private:
-    std::unique_ptr<TargetMachine> TM;
-    const DataLayout DL;
-    RTDyldObjectLinkingLayer<> ObjectLayer;
-    IRCompileLayer<decltype(ObjectLayer)> CompileLayer;
+    ExecutionSession ES;
+    RTDyldObjectLinkingLayer ObjectLayer;
+    IRCompileLayer CompileLayer;
+    IRTransformLayer TransformLayer;
 
-    using OptimizeFunction =
-        std::function<std::shared_ptr<Module>(std::shared_ptr<Module>)>;
-
-    IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+    DataLayout DL;
+    MangleAndInterner Mangle;
+    ThreadSafeContext Ctx;
 
   public:
-    using ModuleHandle = decltype(OptimizeLayer)::ModuleHandleT;
-
-    KaleidoscopeJIT()
-        : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
-          ObjectLayer([]() { return std::make_shared<SectionMemoryManager>(); }),
-          CompileLayer(ObjectLayer, SimpleCompiler(*TM)),
-          OptimizeLayer(CompileLayer,
-                        [this](std::unique_ptr<Module> M) {
-                          return optimizeModule(std::move(M));
-                        }) {
-      llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
+
+    KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL)
+        : ObjectLayer(ES,
+                      []() { return llvm::make_unique<SectionMemoryManager>(); }),
+          CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))),
+          TransformLayer(ES, CompileLayer, optimizeModule),
+          DL(std::move(DL)), Mangle(ES, this->DL),
+          Ctx(llvm::make_unique<LLVMContext>()) {
+      ES.getMainJITDylib().setGenerator(
+          cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
     }
 
 Our extended KaleidoscopeJIT class starts out the same as it did in Chapter 1,
-but after the CompileLayer we introduce a typedef for our optimization function.
-In this case we use a std::function (a handy wrapper for "function-like" things)
-from a single unique_ptr<Module> input to a std::unique_ptr<Module> output. With
-our optimization function typedef in place we can declare our OptimizeLayer,
-which sits on top of our CompileLayer.
-
-To initialize our OptimizeLayer we pass it a reference to the CompileLayer
-below (standard practice for layers), and we initialize the OptimizeFunction
-using a lambda that calls out to an "optimizeModule" function that we will
-define below.
-
-.. code-block:: c++
-
-  // ...
-  auto Resolver = createLambdaResolver(
-      [&](const std::string &Name) {
-        if (auto Sym = OptimizeLayer.findSymbol(Name, false))
-          return Sym;
-        return JITSymbol(nullptr);
-      },
-  // ...
+but after the CompileLayer we introduce a new member, TransformLayer, which sits
+on top of our CompileLayer. We initialize our OptimizeLayer with a reference to
+the ExecutionSession and output layer (standard practice for layers), along with
+a *transform function*. For our transform function we supply our classes
+optimizeModule static method.
 
 .. code-block:: c++
 
@@ -111,26 +94,13 @@ define below.
                                           std::move(Resolver)));
   // ...
 
-.. code-block:: c++
-
-  // ...
-  return OptimizeLayer.findSymbol(MangledNameStream.str(), true);
-  // ...
-
-.. code-block:: c++
-
-  // ...
-  cantFail(OptimizeLayer.removeModule(H));
-  // ...
-
-Next we need to replace references to 'CompileLayer' with references to
-OptimizeLayer in our key methods: addModule, findSymbol, and removeModule. In
-addModule we need to be careful to replace both references: the findSymbol call
-inside our resolver, and the call through to addModule.
+Next we need to update our addModule method to replace the call to
+``CompileLayer::add`` with a call to ``OptimizeLayer::add`` instead.
 
 .. code-block:: c++
 
-  std::shared_ptr<Module> optimizeModule(std::shared_ptr<Module> M) {
+  static Expected<ThreadSafeModule>
+  optimizeModule(ThreadSafeModule M, const MaterializationResponsibility &R) {
     // Create a function pass manager.
     auto FPM = llvm::make_unique<legacy::FunctionPassManager>(M.get());
 
@@ -150,12 +120,18 @@ inside our resolver, and the call through to addModule.
   }
 
 At the bottom of our JIT we add a private method to do the actual optimization:
-*optimizeModule*. This function sets up a FunctionPassManager, adds some passes
-to it, runs it over every function in the module, and then returns the mutated
-module. The specific optimizations are the same ones used in
-`Chapter 4 <LangImpl04.html>`_ of the "Implementing a language with LLVM"
-tutorial series. Readers may visit that chapter for a more in-depth
-discussion of these, and of IR optimization in general.
+*optimizeModule*. This function takes the module to be transformed as input (as
+a ThreadSafeModule) along with a reference to a reference to a new class:
+``MaterializationResponsibility``. The MaterializationResponsibility argument
+can be used to query JIT state for the module being transformed, such as the set
+of definitions in the module that JIT'd code is actively trying to call/access.
+For now we will ignore this argument and use a standard optimization
+pipeline. To do this we set up a FunctionPassManager, add some passes to it, run
+it over every function in the module, and then return the mutated module. The
+specific optimizations are the same ones used in `Chapter 4 <LangImpl04.html>`_
+of the "Implementing a language with LLVM" tutorial series. Readers may visit
+that chapter for a more in-depth discussion of these, and of IR optimization in
+general.
 
 And that's it in terms of changes to KaleidoscopeJIT: When a module is added via
 addModule the OptimizeLayer will call our optimizeModule function before passing
@@ -163,148 +139,122 @@ the transformed module on to the CompileLayer below. Of course, we could have
 called optimizeModule directly in our addModule function and not gone to the
 bother of using the IRTransformLayer, but doing so gives us another opportunity
 to see how layers compose. It also provides a neat entry point to the *layer*
-concept itself, because IRTransformLayer turns out to be one of the simplest
-implementations of the layer concept that can be devised:
+concept itself, because IRTransformLayer is one of the simplest layers that
+can be implemented.
 
 .. code-block:: c++
 
-  template <typename BaseLayerT, typename TransformFtor>
-  class IRTransformLayer {
+  // From IRTransformLayer.h:
+  class IRTransformLayer : public IRLayer {
   public:
-    using ModuleHandleT = typename BaseLayerT::ModuleHandleT;
+    using TransformFunction = std::function<Expected<ThreadSafeModule>(
+        ThreadSafeModule, const MaterializationResponsibility &R)>;
 
-    IRTransformLayer(BaseLayerT &BaseLayer,
-                     TransformFtor Transform = TransformFtor())
-      : BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
+    IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer,
+                     TransformFunction Transform = identityTransform);
 
-    Expected<ModuleHandleT>
-    addModule(std::shared_ptr<Module> M,
-              std::shared_ptr<JITSymbolResolver> Resolver) {
-      return BaseLayer.addModule(Transform(std::move(M)), std::move(Resolver));
+    void setTransform(TransformFunction Transform) {
+      this->Transform = std::move(Transform);
     }
 
-    void removeModule(ModuleHandleT H) { BaseLayer.removeModule(H); }
-
-    JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) {
-      return BaseLayer.findSymbol(Name, ExportedSymbolsOnly);
+    static ThreadSafeModule
+    identityTransform(ThreadSafeModule TSM,
+                      const MaterializationResponsibility &R) {
+      return TSM;
     }
 
-    JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name,
-                           bool ExportedSymbolsOnly) {
-      return BaseLayer.findSymbolIn(H, Name, ExportedSymbolsOnly);
-    }
+    void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
 
-    void emitAndFinalize(ModuleHandleT H) {
-      BaseLayer.emitAndFinalize(H);
-    }
+  private:
+    IRLayer &BaseLayer;
+    TransformFunction Transform;
+  };
 
-    TransformFtor& getTransform() { return Transform; }
+  // From IRTransfomrLayer.cpp:
 
-    const TransformFtor& getTransform() const { return Transform; }
+  IRTransformLayer::IRTransformLayer(ExecutionSession &ES,
+                                     IRLayer &BaseLayer,
+                                     TransformFunction Transform)
+      : IRLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
-  private:
-    BaseLayerT &BaseLayer;
-    TransformFtor Transform;
-  };
+  void IRTransformLayer::emit(MaterializationResponsibility R,
+                              ThreadSafeModule TSM) {
+    assert(TSM.getModule() && "Module must not be null");
+
+    if (auto TransformedTSM = Transform(std::move(TSM), R))
+      BaseLayer.emit(std::move(R), std::move(*TransformedTSM));
+    else {
+      R.failMaterialization();
+      getExecutionSession().reportError(TransformedTSM.takeError());
+    }
+  }
 
 This is the whole definition of IRTransformLayer, from
-``llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h``, stripped of its
-comments. It is a template class with two template arguments: ``BaesLayerT`` and
-``TransformFtor`` that provide the type of the base layer and the type of the
-"transform functor" (in our case a std::function) respectively. This class is
-concerned with two very simple jobs: (1) Running every IR Module that is added
-with addModule through the transform functor, and (2) conforming to the ORC
-layer interface. The interface consists of one typedef and five methods:
-
-+------------------+-----------------------------------------------------------+
-|     Interface    |                         Description                       |
-+==================+===========================================================+
-|                  | Provides a handle that can be used to identify a module   |
-| ModuleHandleT    | set when calling findSymbolIn, removeModule, or           |
-|                  | emitAndFinalize.                                          |
-+------------------+-----------------------------------------------------------+
-|                  | Takes a given set of Modules and makes them "available    |
-|                  | for execution". This means that symbols in those modules  |
-|                  | should be searchable via findSymbol and findSymbolIn, and |
-|                  | the address of the symbols should be read/writable (for   |
-|                  | data symbols), or executable (for function symbols) after |
-|                  | JITSymbol::getAddress() is called. Note: This means that  |
-|   addModule      | addModule doesn't have to compile (or do any other        |
-|                  | work) up-front. It *can*, like IRCompileLayer, act        |
-|                  | eagerly, but it can also simply record the module and     |
-|                  | take no further action until somebody calls               |
-|                  | JITSymbol::getAddress(). In IRTransformLayer's case       |
-|                  | addModule eagerly applies the transform functor to        |
-|                  | each module in the set, then passes the resulting set     |
-|                  | of mutated modules down to the layer below.               |
-+------------------+-----------------------------------------------------------+
-|                  | Removes a set of modules from the JIT. Code or data       |
-|  removeModule    | defined in these modules will no longer be available, and |
-|                  | the memory holding the JIT'd definitions will be freed.   |
-+------------------+-----------------------------------------------------------+
-|                  | Searches for the named symbol in all modules that have    |
-|                  | previously been added via addModule (and not yet          |
-|    findSymbol    | removed by a call to removeModule). In                    |
-|                  | IRTransformLayer we just pass the query on to the layer   |
-|                  | below. In our REPL this is our default way to search for  |
-|                  | function definitions.                                     |
-+------------------+-----------------------------------------------------------+
-|                  | Searches for the named symbol in the module set indicated |
-|                  | by the given ModuleHandleT. This is just an optimized     |
-|                  | search, better for lookup-speed when you know exactly     |
-|                  | a symbol definition should be found. In IRTransformLayer  |
-|   findSymbolIn   | we just pass this query on to the layer below. In our     |
-|                  | REPL we use this method to search for functions           |
-|                  | representing top-level expressions, since we know exactly |
-|                  | where we'll find them: in the top-level expression module |
-|                  | we just added.                                            |
-+------------------+-----------------------------------------------------------+
-|                  | Forces all of the actions required to make the code and   |
-|                  | data in a module set (represented by a ModuleHandleT)     |
-|                  | accessible. Behaves as if some symbol in the set had been |
-|                  | searched for and JITSymbol::getSymbolAddress called. This |
-| emitAndFinalize  | is rarely needed, but can be useful when dealing with     |
-|                  | layers that usually behave lazily if the user wants to    |
-|                  | trigger early compilation (for example, to use idle CPU   |
-|                  | time to eagerly compile code in the background).          |
-+------------------+-----------------------------------------------------------+
-
-This interface attempts to capture the natural operations of a JIT (with some
-wrinkles like emitAndFinalize for performance), similar to the basic JIT API
-operations we identified in Chapter 1. Conforming to the layer concept allows
-classes to compose neatly by implementing their behaviors in terms of the these
-same operations, carried out on the layer below. For example, an eager layer
-(like IRTransformLayer) can implement addModule by running each module in the
-set through its transform up-front and immediately passing the result to the
-layer below. A lazy layer, by contrast, could implement addModule by
-squirreling away the modules doing no other up-front work, but applying the
-transform (and calling addModule on the layer below) when the client calls
-findSymbol instead. The JIT'd program behavior will be the same either way, but
-these choices will have different performance characteristics: Doing work
-eagerly means the JIT takes longer up-front, but proceeds smoothly once this is
-done. Deferring work allows the JIT to get up-and-running quickly, but will
-force the JIT to pause and wait whenever some code or data is needed that hasn't
-already been processed.
-
-Our current REPL is eager: Each function definition is optimized and compiled as
-soon as it's typed in. If we were to make the transform layer lazy (but not
-change things otherwise) we could defer optimization until the first time we
-reference a function in a top-level expression (see if you can figure out why,
-then check out the answer below [1]_). In the next chapter, however we'll
-introduce fully lazy compilation, in which function's aren't compiled until
-they're first called at run-time. At this point the trade-offs get much more
+``llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h`` and
+``llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp``.  This class is concerned
+with two very simple jobs: (1) Running every IR Module that is emitted via this
+layer through the transform function object, and (2) implementing the ORC
+``IRLayer`` interface (which itself conforms to the general ORC Layer concept,
+more on that below). Most of the class is straightforward: a typedef for the
+transform function, a constructor to initialize the members, a setter for the
+transform function value, and a default no-op transform. The most important
+method is ``emit`` as this is half of our IRLayer interface. The emit method
+applies our transform to each module that it is called on and, if the transform
+succeeds, passes the transformed module to the base layer. If the transform
+fails, our emit function calls
+``MaterializationResponsibility::failMaterialization`` (this JIT clients who
+may be waiting on other threads know that the code they were waiting for has
+failed to compile) and logs the error with the execution session before bailing
+out.
+
+The other half of the IRLayer interface we inherit unmodified from the IRLayer
+class:
+
+.. code-block:: c++
+
+  Error IRLayer::add(JITDylib &JD, ThreadSafeModule TSM, VModuleKey K) {
+    return JD.define(llvm::make_unique<BasicIRLayerMaterializationUnit>(
+        *this, std::move(K), std::move(TSM)));
+  }
+
+This code, from ``llvm/lib/ExecutionEngine/Orc/Layer.cpp``, adds a
+ThreadSafeModule to a given JITDylib by wrapping it up in a
+``MaterializationUnit`` (in this case a ``BasicIRLayerMaterializationUnit``).
+Most layers that derived from IRLayer can rely on this default implementation
+of the ``add`` method.
+
+These two operations, ``add`` and ``emit``, together constitute the layer
+concept: A layer is a way to wrap a portion of a compiler pipeline (in this case
+the "opt" phase of an LLVM compiler) whose API is is opaque to ORC in an
+interface that allows ORC to invoke it when needed. The add method takes an
+module in some input program representation (in this case an LLVM IR module) and
+stores it in the target JITDylib, arranging for it to be passed back to the
+Layer's emit method when any symbol defined by that module is requested. Layers
+can compose neatly by calling the 'emit' method of a base layer to complete
+their work. For example, in this tutorial our IRTransformLayer calls through to
+our IRCompileLayer to compile the transformed IR, and our IRCompileLayer in turn
+calls our ObjectLayer to link the object file produced by our compiler.
+
+
+So far we have learned how to optimize and compile our LLVM IR, but we have not
+focused on when compilation happens. Our current REPL is eager: Each function
+definition is optimized and compiled as soon as it is referenced by any other
+code, regardless of whether it is ever called at runtime. In the next chapter we
+will introduce fully lazy compilation, in which functions are not compiled until
+they are first called at run-time. At this point the trade-offs get much more
 interesting: the lazier we are, the quicker we can start executing the first
-function, but the more often we'll have to pause to compile newly encountered
-functions. If we only code-gen lazily, but optimize eagerly, we'll have a slow
-startup (which everything is optimized) but relatively short pauses as each
-function just passes through code-gen. If we both optimize and code-gen lazily
-we can start executing the first function more quickly, but we'll have longer
-pauses as each function has to be both optimized and code-gen'd when it's first
-executed. Things become even more interesting if we consider interproceedural
-optimizations like inlining, which must be performed eagerly. These are
-complex trade-offs, and there is no one-size-fits all solution to them, but by
-providing composable layers we leave the decisions to the person implementing
-the JIT, and make it easy for them to experiment with different configurations.
+function, but the more often we will have to pause to compile newly encountered
+functions. If we only code-gen lazily, but optimize eagerly, we will have a
+longer startup time (as everything is optimized) but relatively short pauses as
+each function just passes through code-gen. If we both optimize and code-gen
+lazily we can start executing the first function more quickly, but we will have
+longer pauses as each function has to be both optimized and code-gen'd when it
+is first executed. Things become even more interesting if we consider
+interproceedural optimizations like inlining, which must be performed eagerly.
+These are complex trade-offs, and there is no one-size-fits all solution to
+them, but by providing composable layers we leave the decisions to the person
+implementing the JIT, and make it easy for them to experiment with different
+configurations.
 
 `Next: Adding Per-function Lazy Compilation <BuildingAJIT3.html>`_
 
@@ -325,10 +275,3 @@ Here is the code:
 
 .. literalinclude:: ../../examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
    :language: c++
-
-.. [1] When we add our top-level expression to the JIT, any calls to functions
-       that we defined earlier will appear to the RTDyldObjectLinkingLayer as
-       external symbols. The RTDyldObjectLinkingLayer will call the SymbolResolver
-       that we defined in addModule, which in turn calls findSymbol on the
-       OptimizeLayer, at which point even a lazy transform layer will have to
-       do its work.
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
index 5a66b367c27368a8d383cb9ad8685e4b4cdccee8..8cbd5d6f2f7737fb2f1b109e973b31e6f7292129 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
@@ -679,9 +679,8 @@ static std::unique_ptr<FunctionAST> ParseDefinition() {
 static std::unique_ptr<FunctionAST> ParseTopLevelExpr(unsigned ExprCount) {
   if (auto E = ParseExpression()) {
     // Make an anonymous proto.
-    auto Proto = llvm::make_unique<PrototypeAST>(("__anon_expr" +
-                                                  Twine(ExprCount)).str(),
-                                                 std::vector<std::string>());
+    auto Proto = llvm::make_unique<PrototypeAST>
+        (("__anon_expr" + Twine(ExprCount)).str(), std::vector<std::string>());
     return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(E));
   }
   return nullptr;
@@ -1157,7 +1156,7 @@ static void HandleTopLevelExpression() {
 
       // Get the anonymous expression's JITSymbol.
       auto Sym =
-        ExitOnErr(TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str()));
+          ExitOnErr(TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str()));
 
       auto *FP = (double (*)())(intptr_t)Sym.getAddress();
       assert(FP && "Failed to codegen function");
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
index 7c803b138c0666e9406bccad50f6aeb653c07ef1..4dc08470dffabf70188014569572c051b59623e5 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
@@ -14,29 +14,23 @@
 #ifndef LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
 #define LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
 
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
-#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
-#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
-#include <algorithm>
 #include <memory>
-#include <string>
-#include <vector>
 
 namespace llvm {
 namespace orc {
@@ -44,69 +38,57 @@ namespace orc {
 class KaleidoscopeJIT {
 private:
   ExecutionSession ES;
-  std::shared_ptr<SymbolResolver> Resolver;
-  std::unique_ptr<TargetMachine> TM;
-  const DataLayout DL;
-  LegacyRTDyldObjectLinkingLayer ObjectLayer;
-  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  RTDyldObjectLinkingLayer ObjectLayer;
+  IRCompileLayer CompileLayer;
+  IRTransformLayer OptimizeLayer;
 
-  using OptimizeFunction =
-      std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
-
-  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  DataLayout DL;
+  MangleAndInterner Mangle;
+  ThreadSafeContext Ctx;
 
 public:
-  KaleidoscopeJIT()
-      : Resolver(createLegacyLookupResolver(
-            ES,
-            [this](const std::string &Name) -> JITSymbol {
-              if (auto Sym = OptimizeLayer.findSymbol(Name, false))
-                return Sym;
-              else if (auto Err = Sym.takeError())
-                return std::move(Err);
-              if (auto SymAddr =
-                      RTDyldMemoryManager::getSymbolAddressInProcess(Name))
-                return JITSymbol(SymAddr, JITSymbolFlags::Exported);
-              return nullptr;
-            },
-            [](Error Err) { cantFail(std::move(Err), "lookupFlags failed"); })),
-        TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
-        ObjectLayer(ES,
-                    [this](VModuleKey) {
-                      return LegacyRTDyldObjectLinkingLayer::Resources{
-                          std::make_shared<SectionMemoryManager>(), Resolver};
-                    }),
-        CompileLayer(ObjectLayer, SimpleCompiler(*TM)),
-        OptimizeLayer(CompileLayer, [this](std::unique_ptr<Module> M) {
-          return optimizeModule(std::move(M));
-        }) {
-    llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
+  KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL)
+      : ObjectLayer(ES,
+                    []() { return llvm::make_unique<SectionMemoryManager>(); }),
+        CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))),
+        OptimizeLayer(ES, CompileLayer, optimizeModule),
+        DL(std::move(DL)), Mangle(ES, this->DL),
+        Ctx(llvm::make_unique<LLVMContext>()) {
+    ES.getMainJITDylib().setGenerator(
+        cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
   }
 
-  TargetMachine &getTargetMachine() { return *TM; }
+  const DataLayout &getDataLayout() const { return DL; }
+
+  LLVMContext &getContext() { return *Ctx.getContext(); }
+
+  static Expected<std::unique_ptr<KaleidoscopeJIT>> Create() {
+    auto JTMB = JITTargetMachineBuilder::detectHost();
+
+    if (!JTMB)
+      return JTMB.takeError();
+
+    auto DL = JTMB->getDefaultDataLayoutForTarget();
+    if (!DL)
+      return DL.takeError();
 
-  VModuleKey addModule(std::unique_ptr<Module> M) {
-    // Add the module to the JIT with a new VModuleKey.
-    auto K = ES.allocateVModule();
-    cantFail(OptimizeLayer.addModule(K, std::move(M)));
-    return K;
+    return llvm::make_unique<KaleidoscopeJIT>(std::move(*JTMB), std::move(*DL));
   }
 
-  JITSymbol findSymbol(const std::string Name) {
-    std::string MangledName;
-    raw_string_ostream MangledNameStream(MangledName);
-    Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
-    return OptimizeLayer.findSymbol(MangledNameStream.str(), true);
+  Error addModule(std::unique_ptr<Module> M) {
+    return OptimizeLayer.add(ES.getMainJITDylib(),
+                             ThreadSafeModule(std::move(M), Ctx));
   }
 
-  void removeModule(VModuleKey K) {
-    cantFail(OptimizeLayer.removeModule(K));
+  Expected<JITEvaluatedSymbol> lookup(StringRef Name) {
+    return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name.str()));
   }
 
 private:
-  std::unique_ptr<Module> optimizeModule(std::unique_ptr<Module> M) {
+  static Expected<ThreadSafeModule>
+  optimizeModule(ThreadSafeModule TSM, const MaterializationResponsibility &R) {
     // Create a function pass manager.
-    auto FPM = llvm::make_unique<legacy::FunctionPassManager>(M.get());
+    auto FPM = llvm::make_unique<legacy::FunctionPassManager>(TSM.getModule());
 
     // Add some optimizations.
     FPM->add(createInstructionCombiningPass());
@@ -117,10 +99,10 @@ private:
 
     // Run the optimizations over all functions in the module being added to
     // the JIT.
-    for (auto &F : *M)
+    for (auto &F : *TSM.getModule())
       FPM->run(F);
 
-    return M;
+    return TSM;
   }
 };
 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp
index 2471344c6d65fdd3aaba7211e50a94dc892f94fa..e7c3624aa04d575ced97dfaa0233eba6c6d6301b 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp
@@ -676,11 +676,11 @@ static std::unique_ptr<FunctionAST> ParseDefinition() {
 }
 
 /// toplevelexpr ::= expression
-static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
+static std::unique_ptr<FunctionAST> ParseTopLevelExpr(unsigned ExprCount) {
   if (auto E = ParseExpression()) {
     // Make an anonymous proto.
-    auto Proto = llvm::make_unique<PrototypeAST>("__anon_expr",
-                                                 std::vector<std::string>());
+    auto Proto = llvm::make_unique<PrototypeAST>(
+        ("__anon_expr" + Twine(ExprCount)).str(), std::vector<std::string>());
     return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(E));
   }
   return nullptr;
@@ -696,12 +696,13 @@ static std::unique_ptr<PrototypeAST> ParseExtern() {
 // Code Generation
 //===----------------------------------------------------------------------===//
 
-static LLVMContext TheContext;
-static IRBuilder<> Builder(TheContext);
+static std::unique_ptr<KaleidoscopeJIT> TheJIT;
+static LLVMContext *TheContext;
+static std::unique_ptr<IRBuilder<>> Builder;
 static std::unique_ptr<Module> TheModule;
 static std::map<std::string, AllocaInst *> NamedValues;
-static std::unique_ptr<KaleidoscopeJIT> TheJIT;
 static std::map<std::string, std::unique_ptr<PrototypeAST>> FunctionProtos;
+static ExitOnError ExitOnErr;
 
 Value *LogErrorV(const char *Str) {
   LogError(Str);
@@ -729,11 +730,11 @@ static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
                                           const std::string &VarName) {
   IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
                    TheFunction->getEntryBlock().begin());
-  return TmpB.CreateAlloca(Type::getDoubleTy(TheContext), nullptr, VarName);
+  return TmpB.CreateAlloca(Type::getDoubleTy(*TheContext), nullptr, VarName);
 }
 
 Value *NumberExprAST::codegen() {
-  return ConstantFP::get(TheContext, APFloat(Val));
+  return ConstantFP::get(*TheContext, APFloat(Val));
 }
 
 Value *VariableExprAST::codegen() {
@@ -743,7 +744,7 @@ Value *VariableExprAST::codegen() {
     return LogErrorV("Unknown variable name");
 
   // Load the value.
-  return Builder.CreateLoad(V, Name.c_str());
+  return Builder->CreateLoad(V, Name.c_str());
 }
 
 Value *UnaryExprAST::codegen() {
@@ -755,7 +756,7 @@ Value *UnaryExprAST::codegen() {
   if (!F)
     return LogErrorV("Unknown unary operator");
 
-  return Builder.CreateCall(F, OperandV, "unop");
+  return Builder->CreateCall(F, OperandV, "unop");
 }
 
 Value *BinaryExprAST::codegen() {
@@ -778,7 +779,7 @@ Value *BinaryExprAST::codegen() {
     if (!Variable)
       return LogErrorV("Unknown variable name");
 
-    Builder.CreateStore(Val, Variable);
+    Builder->CreateStore(Val, Variable);
     return Val;
   }
 
@@ -789,15 +790,15 @@ Value *BinaryExprAST::codegen() {
 
   switch (Op) {
   case '+':
-    return Builder.CreateFAdd(L, R, "addtmp");
+    return Builder->CreateFAdd(L, R, "addtmp");
   case '-':
-    return Builder.CreateFSub(L, R, "subtmp");
+    return Builder->CreateFSub(L, R, "subtmp");
   case '*':
-    return Builder.CreateFMul(L, R, "multmp");
+    return Builder->CreateFMul(L, R, "multmp");
   case '<':
-    L = Builder.CreateFCmpULT(L, R, "cmptmp");
+    L = Builder->CreateFCmpULT(L, R, "cmptmp");
     // Convert bool 0/1 to double 0.0 or 1.0
-    return Builder.CreateUIToFP(L, Type::getDoubleTy(TheContext), "booltmp");
+    return Builder->CreateUIToFP(L, Type::getDoubleTy(*TheContext), "booltmp");
   default:
     break;
   }
@@ -808,7 +809,7 @@ Value *BinaryExprAST::codegen() {
   assert(F && "binary operator not found!");
 
   Value *Ops[] = {L, R};
-  return Builder.CreateCall(F, Ops, "binop");
+  return Builder->CreateCall(F, Ops, "binop");
 }
 
 Value *CallExprAST::codegen() {
@@ -828,7 +829,7 @@ Value *CallExprAST::codegen() {
       return nullptr;
   }
 
-  return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
+  return Builder->CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::codegen() {
@@ -837,46 +838,46 @@ Value *IfExprAST::codegen() {
     return nullptr;
 
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(
-      CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
+  CondV = Builder->CreateFCmpONE(
+      CondV, ConstantFP::get(*TheContext, APFloat(0.0)), "ifcond");
 
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
-  BasicBlock *ThenBB = BasicBlock::Create(TheContext, "then", TheFunction);
-  BasicBlock *ElseBB = BasicBlock::Create(TheContext, "else");
-  BasicBlock *MergeBB = BasicBlock::Create(TheContext, "ifcont");
+  BasicBlock *ThenBB = BasicBlock::Create(*TheContext, "then", TheFunction);
+  BasicBlock *ElseBB = BasicBlock::Create(*TheContext, "else");
+  BasicBlock *MergeBB = BasicBlock::Create(*TheContext, "ifcont");
 
-  Builder.CreateCondBr(CondV, ThenBB, ElseBB);
+  Builder->CreateCondBr(CondV, ThenBB, ElseBB);
 
   // Emit then value.
-  Builder.SetInsertPoint(ThenBB);
+  Builder->SetInsertPoint(ThenBB);
 
   Value *ThenV = Then->codegen();
   if (!ThenV)
     return nullptr;
 
-  Builder.CreateBr(MergeBB);
+  Builder->CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
-  ThenBB = Builder.GetInsertBlock();
+  ThenBB = Builder->GetInsertBlock();
 
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
-  Builder.SetInsertPoint(ElseBB);
+  Builder->SetInsertPoint(ElseBB);
 
   Value *ElseV = Else->codegen();
   if (!ElseV)
     return nullptr;
 
-  Builder.CreateBr(MergeBB);
+  Builder->CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
-  ElseBB = Builder.GetInsertBlock();
+  ElseBB = Builder->GetInsertBlock();
 
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
-  Builder.SetInsertPoint(MergeBB);
-  PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(TheContext), 2, "iftmp");
+  Builder->SetInsertPoint(MergeBB);
+  PHINode *PN = Builder->CreatePHI(Type::getDoubleTy(*TheContext), 2, "iftmp");
 
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
@@ -903,7 +904,7 @@ Value *IfExprAST::codegen() {
 //   br endcond, loop, endloop
 // outloop:
 Value *ForExprAST::codegen() {
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Create an alloca for the variable in the entry block.
   AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
@@ -914,17 +915,17 @@ Value *ForExprAST::codegen() {
     return nullptr;
 
   // Store the value into the alloca.
-  Builder.CreateStore(StartVal, Alloca);
+  Builder->CreateStore(StartVal, Alloca);
 
   // Make the new basic block for the loop header, inserting after current
   // block.
-  BasicBlock *LoopBB = BasicBlock::Create(TheContext, "loop", TheFunction);
+  BasicBlock *LoopBB = BasicBlock::Create(*TheContext, "loop", TheFunction);
 
   // Insert an explicit fall through from the current block to the LoopBB.
-  Builder.CreateBr(LoopBB);
+  Builder->CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
-  Builder.SetInsertPoint(LoopBB);
+  Builder->SetInsertPoint(LoopBB);
 
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
@@ -945,7 +946,7 @@ Value *ForExprAST::codegen() {
       return nullptr;
   } else {
     // If not specified, use 1.0.
-    StepVal = ConstantFP::get(TheContext, APFloat(1.0));
+    StepVal = ConstantFP::get(*TheContext, APFloat(1.0));
   }
 
   // Compute the end condition.
@@ -955,23 +956,23 @@ Value *ForExprAST::codegen() {
 
   // Reload, increment, and restore the alloca.  This handles the case where
   // the body of the loop mutates the variable.
-  Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str());
-  Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
-  Builder.CreateStore(NextVar, Alloca);
+  Value *CurVar = Builder->CreateLoad(Alloca, VarName.c_str());
+  Value *NextVar = Builder->CreateFAdd(CurVar, StepVal, "nextvar");
+  Builder->CreateStore(NextVar, Alloca);
 
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(
-      EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
+  EndCond = Builder->CreateFCmpONE(
+      EndCond, ConstantFP::get(*TheContext, APFloat(0.0)), "loopcond");
 
   // Create the "after loop" block and insert it.
   BasicBlock *AfterBB =
-      BasicBlock::Create(TheContext, "afterloop", TheFunction);
+      BasicBlock::Create(*TheContext, "afterloop", TheFunction);
 
   // Insert the conditional branch into the end of LoopEndBB.
-  Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
+  Builder->CreateCondBr(EndCond, LoopBB, AfterBB);
 
   // Any new code will be inserted in AfterBB.
-  Builder.SetInsertPoint(AfterBB);
+  Builder->SetInsertPoint(AfterBB);
 
   // Restore the unshadowed variable.
   if (OldVal)
@@ -980,13 +981,13 @@ Value *ForExprAST::codegen() {
     NamedValues.erase(VarName);
 
   // for expr always returns 0.0.
-  return Constant::getNullValue(Type::getDoubleTy(TheContext));
+  return Constant::getNullValue(Type::getDoubleTy(*TheContext));
 }
 
 Value *VarExprAST::codegen() {
   std::vector<AllocaInst *> OldBindings;
 
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Register all variables and emit their initializer.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i) {
@@ -1004,11 +1005,11 @@ Value *VarExprAST::codegen() {
       if (!InitVal)
         return nullptr;
     } else { // If not specified, use 0.0.
-      InitVal = ConstantFP::get(TheContext, APFloat(0.0));
+      InitVal = ConstantFP::get(*TheContext, APFloat(0.0));
     }
 
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
-    Builder.CreateStore(InitVal, Alloca);
+    Builder->CreateStore(InitVal, Alloca);
 
     // Remember the old variable binding so that we can restore the binding when
     // we unrecurse.
@@ -1033,9 +1034,9 @@ Value *VarExprAST::codegen() {
 
 Function *PrototypeAST::codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type *> Doubles(Args.size(), Type::getDoubleTy(TheContext));
+  std::vector<Type *> Doubles(Args.size(), Type::getDoubleTy(*TheContext));
   FunctionType *FT =
-      FunctionType::get(Type::getDoubleTy(TheContext), Doubles, false);
+      FunctionType::get(Type::getDoubleTy(*TheContext), Doubles, false);
 
   Function *F =
       Function::Create(FT, Function::ExternalLinkage, Name, TheModule.get());
@@ -1062,8 +1063,8 @@ Function *FunctionAST::codegen() {
     BinopPrecedence[P.getOperatorName()] = P.getBinaryPrecedence();
 
   // Create a new basic block to start insertion into.
-  BasicBlock *BB = BasicBlock::Create(TheContext, "entry", TheFunction);
-  Builder.SetInsertPoint(BB);
+  BasicBlock *BB = BasicBlock::Create(*TheContext, "entry", TheFunction);
+  Builder->SetInsertPoint(BB);
 
   // Record the function arguments in the NamedValues map.
   NamedValues.clear();
@@ -1072,7 +1073,7 @@ Function *FunctionAST::codegen() {
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, Arg.getName());
 
     // Store the initial value into the alloca.
-    Builder.CreateStore(&Arg, Alloca);
+    Builder->CreateStore(&Arg, Alloca);
 
     // Add arguments to variable symbol table.
     NamedValues[Arg.getName()] = Alloca;
@@ -1080,7 +1081,7 @@ Function *FunctionAST::codegen() {
 
   if (Value *RetVal = Body->codegen()) {
     // Finish off the function.
-    Builder.CreateRet(RetVal);
+    Builder->CreateRet(RetVal);
 
     // Validate the generated code, checking for consistency.
     verifyFunction(*TheFunction);
@@ -1102,8 +1103,11 @@ Function *FunctionAST::codegen() {
 
 static void InitializeModule() {
   // Open a new module.
-  TheModule = llvm::make_unique<Module>("my cool jit", TheContext);
-  TheModule->setDataLayout(TheJIT->getTargetMachine().createDataLayout());
+  TheModule = llvm::make_unique<Module>("my cool jit", *TheContext);
+  TheModule->setDataLayout(TheJIT->getDataLayout());
+
+  // Create a new builder for the module.
+  Builder = llvm::make_unique<IRBuilder<>>(*TheContext);
 }
 
 static void HandleDefinition() {
@@ -1112,7 +1116,7 @@ static void HandleDefinition() {
       fprintf(stderr, "Read function definition:");
       FnIR->print(errs());
       fprintf(stderr, "\n");
-      TheJIT->addModule(std::move(TheModule));
+      ExitOnErr(TheJIT->addModule(std::move(TheModule)));
       InitializeModule();
     }
   } else {
@@ -1136,25 +1140,27 @@ static void HandleExtern() {
 }
 
 static void HandleTopLevelExpression() {
+  static unsigned ExprCount = 0;
+
+  // Update ExprCount. This number will be added to anonymous expressions to
+  // prevent them from clashing.
+  ++ExprCount;
+
   // Evaluate a top-level expression into an anonymous function.
-  if (auto FnAST = ParseTopLevelExpr()) {
+  if (auto FnAST = ParseTopLevelExpr(ExprCount)) {
     if (FnAST->codegen()) {
       // JIT the module containing the anonymous expression, keeping a handle so
       // we can free it later.
-      auto H = TheJIT->addModule(std::move(TheModule));
+      ExitOnErr(TheJIT->addModule(std::move(TheModule)));
       InitializeModule();
 
-      // Search the JIT for the __anon_expr symbol.
-      auto ExprSymbol = TheJIT->findSymbol("__anon_expr");
-      assert(ExprSymbol && "Function not found");
+      // Get the anonymous expression's JITSymbol.
+      auto Sym =
+          ExitOnErr(TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str()));
 
-      // Get the symbol's address and cast it to the right type (takes no
-      // arguments, returns a double) so we can call it as a native function.
-      double (*FP)() = (double (*)())(intptr_t)cantFail(ExprSymbol.getAddress());
+      auto *FP = (double (*)())(intptr_t)Sym.getAddress();
+      assert(FP && "Failed to codegen function");
       fprintf(stderr, "Evaluated to %f\n", FP());
-
-      // Delete the anonymous expression module from the JIT.
-      TheJIT->removeModule(H);
     }
   } else {
     // Skip token for error recovery.
@@ -1222,7 +1228,8 @@ int main() {
   fprintf(stderr, "ready> ");
   getNextToken();
 
-  TheJIT = llvm::make_unique<KaleidoscopeJIT>();
+  TheJIT = ExitOnErr(KaleidoscopeJIT::Create());
+  TheContext = &TheJIT->getContext();
 
   InitializeModule();
 
diff --git a/examples/Kaleidoscope/Chapter9/toy.cpp b/examples/Kaleidoscope/Chapter9/toy.cpp
index 7a12b5a80cc848ca970b827fede7223875b8226c..19bac1a8bc7021c89241b983ead61ea467c7ac9d 100644
--- a/examples/Kaleidoscope/Chapter9/toy.cpp
+++ b/examples/Kaleidoscope/Chapter9/toy.cpp
@@ -1245,9 +1245,8 @@ Function *FunctionAST::codegen() {
   unsigned ScopeLine = LineNo;
   DISubprogram *SP = DBuilder->createFunction(
       FContext, P.getName(), StringRef(), Unit, LineNo,
-      CreateFunctionType(TheFunction->arg_size(), Unit),
-      false /* internal linkage */, true /* definition */, ScopeLine,
-      DINode::FlagPrototyped, false);
+      CreateFunctionType(TheFunction->arg_size(), Unit), ScopeLine,
+      DINode::FlagPrototyped, DISubprogram::SPFlagDefinition);
   TheFunction->setSubprogram(SP);
 
   // Push the current scope.
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index c093c0906ce3b32ddf9f19bb7126dcaaba2050ff..0784275ace8cb393283c4b3f515662787aa5575d 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -54,6 +54,8 @@ extern "C" {
  * @{
  */
 
+/// External users depend on the following values being stable. It is not safe
+/// to reorder them.
 typedef enum {
   /* Terminator Instructions */
   LLVMRet            = 1,
@@ -64,6 +66,9 @@ typedef enum {
   /* removed 6 due to API changes */
   LLVMUnreachable    = 7,
 
+  /* Standard Unary Operators */
+  LLVMFNeg           = 66,
+
   /* Standard Binary Operators */
   LLVMAdd            = 8,
   LLVMFAdd           = 9,
diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index 52ed183c78aed1fa2f12eaac1b355d25d13b52aa..c6fa5ad674f6898e79ab7aaf604ef6ddf3d8f416 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h
@@ -870,13 +870,13 @@ public:
   /// Factory for NaN values.
   ///
   /// \param Negative - True iff the NaN generated should be negative.
-  /// \param type - The unspecified fill bits for creating the NaN, 0 by
+  /// \param payload - The unspecified fill bits for creating the NaN, 0 by
   /// default.  The value is truncated as necessary.
   static APFloat getNaN(const fltSemantics &Sem, bool Negative = false,
-                        unsigned type = 0) {
-    if (type) {
-      APInt fill(64, type);
-      return getQNaN(Sem, Negative, &fill);
+                        uint64_t payload = 0) {
+    if (payload) {
+      APInt intPayload(64, payload);
+      return getQNaN(Sem, Negative, &intPayload);
     } else {
       return getQNaN(Sem, Negative, nullptr);
     }
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index 1befe7a0273a86399396d68474bc7626a05c6264..6e106ff8bf5dca7786acc1069e6b2739b18067e9 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -1105,6 +1105,12 @@ public:
   APInt sshl_ov(const APInt &Amt, bool &Overflow) const;
   APInt ushl_ov(const APInt &Amt, bool &Overflow) const;
 
+  // Operations that saturate
+  APInt sadd_sat(const APInt &RHS) const;
+  APInt uadd_sat(const APInt &RHS) const;
+  APInt ssub_sat(const APInt &RHS) const;
+  APInt usub_sat(const APInt &RHS) const;
+
   /// Array-indexing support.
   ///
   /// \returns the bit value at bitPosition
@@ -1395,7 +1401,7 @@ public:
   ///
   /// Set the given bit to 1 whose position is given as "bitPosition".
   void setBit(unsigned BitPosition) {
-    assert(BitPosition <= BitWidth && "BitPosition out of range");
+    assert(BitPosition < BitWidth && "BitPosition out of range");
     WordType Mask = maskBit(BitPosition);
     if (isSingleWord())
       U.VAL |= Mask;
@@ -1454,7 +1460,7 @@ public:
   ///
   /// Set the given bit to 0 whose position is given as "bitPosition".
   void clearBit(unsigned BitPosition) {
-    assert(BitPosition <= BitWidth && "BitPosition out of range");
+    assert(BitPosition < BitWidth && "BitPosition out of range");
     WordType Mask = ~maskBit(BitPosition);
     if (isSingleWord())
       U.VAL &= Mask;
diff --git a/include/llvm/ADT/Optional.h b/include/llvm/ADT/Optional.h
index 353e5d0ec9df20c7ddaffd821f10ffefa45b08c7..76937d632ae13a6f5325deb1c4473578a024d242 100644
--- a/include/llvm/ADT/Optional.h
+++ b/include/llvm/ADT/Optional.h
@@ -29,7 +29,7 @@ namespace llvm {
 
 namespace optional_detail {
 /// Storage for any type.
-template <typename T, bool IsPodLike> struct OptionalStorage {
+template <typename T, bool = isPodLike<T>::value> struct OptionalStorage {
   AlignedCharArrayUnion<T> storage;
   bool hasVal = false;
 
@@ -108,28 +108,10 @@ template <typename T, bool IsPodLike> struct OptionalStorage {
   }
 };
 
-#if !defined(__GNUC__) || defined(__clang__) // GCC up to GCC7 miscompiles this.
-/// Storage for trivially copyable types only.
-template <typename T> struct OptionalStorage<T, true> {
-  AlignedCharArrayUnion<T> storage;
-  bool hasVal = false;
-
-  OptionalStorage() = default;
-
-  OptionalStorage(const T &y) : hasVal(true) { new (storage.buffer) T(y); }
-  OptionalStorage &operator=(const T &y) {
-    *reinterpret_cast<T *>(storage.buffer) = y;
-    hasVal = true;
-    return *this;
-  }
-
-  void reset() { hasVal = false; }
-};
-#endif
 } // namespace optional_detail
 
 template <typename T> class Optional {
-  optional_detail::OptionalStorage<T, isPodLike<T>::value> Storage;
+  optional_detail::OptionalStorage<T> Storage;
 
 public:
   using value_type = T;
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 4a93ee55e76dcae13dd5b1c1aea7f27efc8fd0fe..8685f0e4aada7f1372afb56d54aed2e0da56cc41 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -71,6 +71,10 @@ template <typename B1, typename... Bn>
 struct conjunction<B1, Bn...>
     : std::conditional<bool(B1::value), conjunction<Bn...>, B1>::type {};
 
+template <typename T> struct make_const_ptr {
+  using type =
+      typename std::add_pointer<typename std::add_const<T>::type>::type;
+};
 //===----------------------------------------------------------------------===//
 //     Extra additions to <functional>
 //===----------------------------------------------------------------------===//
@@ -508,9 +512,11 @@ make_early_inc_range(RangeT &&Range) {
                     EarlyIncIteratorT(std::end(std::forward<RangeT>(Range))));
 }
 
-// forward declarations required by zip_shortest/zip_first
+// forward declarations required by zip_shortest/zip_first/zip_longest
 template <typename R, typename UnaryPredicate>
 bool all_of(R &&range, UnaryPredicate P);
+template <typename R, typename UnaryPredicate>
+bool any_of(R &&range, UnaryPredicate P);
 
 template <size_t... I> struct index_sequence;
 
@@ -661,6 +667,132 @@ detail::zippy<detail::zip_first, T, U, Args...> zip_first(T &&t, U &&u,
       std::forward<T>(t), std::forward<U>(u), std::forward<Args>(args)...);
 }
 
+namespace detail {
+template <typename Iter>
+static Iter next_or_end(const Iter &I, const Iter &End) {
+  if (I == End)
+    return End;
+  return std::next(I);
+}
+
+template <typename Iter>
+static auto deref_or_none(const Iter &I, const Iter &End)
+    -> llvm::Optional<typename std::remove_const<
+        typename std::remove_reference<decltype(*I)>::type>::type> {
+  if (I == End)
+    return None;
+  return *I;
+}
+
+template <typename Iter> struct ZipLongestItemType {
+  using type =
+      llvm::Optional<typename std::remove_const<typename std::remove_reference<
+          decltype(*std::declval<Iter>())>::type>::type>;
+};
+
+template <typename... Iters> struct ZipLongestTupleType {
+  using type = std::tuple<typename ZipLongestItemType<Iters>::type...>;
+};
+
+template <typename... Iters>
+class zip_longest_iterator
+    : public iterator_facade_base<
+          zip_longest_iterator<Iters...>,
+          typename std::common_type<
+              std::forward_iterator_tag,
+              typename std::iterator_traits<Iters>::iterator_category...>::type,
+          typename ZipLongestTupleType<Iters...>::type,
+          typename std::iterator_traits<typename std::tuple_element<
+              0, std::tuple<Iters...>>::type>::difference_type,
+          typename ZipLongestTupleType<Iters...>::type *,
+          typename ZipLongestTupleType<Iters...>::type> {
+public:
+  using value_type = typename ZipLongestTupleType<Iters...>::type;
+
+private:
+  std::tuple<Iters...> iterators;
+  std::tuple<Iters...> end_iterators;
+
+  template <size_t... Ns>
+  bool test(const zip_longest_iterator<Iters...> &other,
+            index_sequence<Ns...>) const {
+    return llvm::any_of(
+        std::initializer_list<bool>{std::get<Ns>(this->iterators) !=
+                                    std::get<Ns>(other.iterators)...},
+        identity<bool>{});
+  }
+
+  template <size_t... Ns> value_type deref(index_sequence<Ns...>) const {
+    return value_type(
+        deref_or_none(std::get<Ns>(iterators), std::get<Ns>(end_iterators))...);
+  }
+
+  template <size_t... Ns>
+  decltype(iterators) tup_inc(index_sequence<Ns...>) const {
+    return std::tuple<Iters...>(
+        next_or_end(std::get<Ns>(iterators), std::get<Ns>(end_iterators))...);
+  }
+
+public:
+  zip_longest_iterator(std::pair<Iters &&, Iters &&>... ts)
+      : iterators(std::forward<Iters>(ts.first)...),
+        end_iterators(std::forward<Iters>(ts.second)...) {}
+
+  value_type operator*() { return deref(index_sequence_for<Iters...>{}); }
+
+  value_type operator*() const { return deref(index_sequence_for<Iters...>{}); }
+
+  zip_longest_iterator<Iters...> &operator++() {
+    iterators = tup_inc(index_sequence_for<Iters...>{});
+    return *this;
+  }
+
+  bool operator==(const zip_longest_iterator<Iters...> &other) const {
+    return !test(other, index_sequence_for<Iters...>{});
+  }
+};
+
+template <typename... Args> class zip_longest_range {
+public:
+  using iterator =
+      zip_longest_iterator<decltype(adl_begin(std::declval<Args>()))...>;
+  using iterator_category = typename iterator::iterator_category;
+  using value_type = typename iterator::value_type;
+  using difference_type = typename iterator::difference_type;
+  using pointer = typename iterator::pointer;
+  using reference = typename iterator::reference;
+
+private:
+  std::tuple<Args...> ts;
+
+  template <size_t... Ns> iterator begin_impl(index_sequence<Ns...>) const {
+    return iterator(std::make_pair(adl_begin(std::get<Ns>(ts)),
+                                   adl_end(std::get<Ns>(ts)))...);
+  }
+
+  template <size_t... Ns> iterator end_impl(index_sequence<Ns...>) const {
+    return iterator(std::make_pair(adl_end(std::get<Ns>(ts)),
+                                   adl_end(std::get<Ns>(ts)))...);
+  }
+
+public:
+  zip_longest_range(Args &&... ts_) : ts(std::forward<Args>(ts_)...) {}
+
+  iterator begin() const { return begin_impl(index_sequence_for<Args...>{}); }
+  iterator end() const { return end_impl(index_sequence_for<Args...>{}); }
+};
+} // namespace detail
+
+/// Iterate over two or more iterators at the same time. Iteration continues
+/// until all iterators reach the end. The llvm::Optional only contains a value
+/// if the iterator has not reached the end.
+template <typename T, typename U, typename... Args>
+detail::zip_longest_range<T, U, Args...> zip_longest(T &&t, U &&u,
+                                                     Args &&... args) {
+  return detail::zip_longest_range<T, U, Args...>(
+      std::forward<T>(t), std::forward<U>(u), std::forward<Args>(args)...);
+}
+
 /// Iterator wrapper that concatenates sequences together.
 ///
 /// This can concatenate different iterators, even with different types, into
@@ -1405,6 +1537,40 @@ auto apply_tuple(F &&f, Tuple &&t) -> decltype(detail::apply_tuple_impl(
                                   Indices{});
 }
 
+/// Return true if the sequence [Begin, End) has exactly N items. Runs in O(N)
+/// time. Not meant for use with random-access iterators.
+template <typename IterTy>
+bool hasNItems(
+    IterTy &&Begin, IterTy &&End, unsigned N,
+    typename std::enable_if<
+        !std::is_same<
+            typename std::iterator_traits<typename std::remove_reference<
+                decltype(Begin)>::type>::iterator_category,
+            std::random_access_iterator_tag>::value,
+        void>::type * = nullptr) {
+  for (; N; --N, ++Begin)
+    if (Begin == End)
+      return false; // Too few.
+  return Begin == End;
+}
+
+/// Return true if the sequence [Begin, End) has N or more items. Runs in O(N)
+/// time. Not meant for use with random-access iterators.
+template <typename IterTy>
+bool hasNItemsOrMore(
+    IterTy &&Begin, IterTy &&End, unsigned N,
+    typename std::enable_if<
+        !std::is_same<
+            typename std::iterator_traits<typename std::remove_reference<
+                decltype(Begin)>::type>::iterator_category,
+            std::random_access_iterator_tag>::value,
+        void>::type * = nullptr) {
+  for (; N; --N, ++Begin)
+    if (Begin == End)
+      return false; // Too few.
+  return true;
+}
+
 } // end namespace llvm
 
 #endif // LLVM_ADT_STLEXTRAS_H
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index f86bebd14cc5bc0d9f952552a6d175a08601d224..0a73dbd6067182a66495d57233a62e700685e181 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -92,10 +92,6 @@ public:
   };
 
 private:
-  bool isSmall() const {
-    return X & uintptr_t(1);
-  }
-
   BitVector *getPointer() const {
     assert(!isSmall());
     return reinterpret_cast<BitVector *>(X);
@@ -186,6 +182,8 @@ public:
     return make_range(set_bits_begin(), set_bits_end());
   }
 
+  bool isSmall() const { return X & uintptr_t(1); }
+
   /// Tests whether there are no bits in this bitvector.
   bool empty() const {
     return isSmall() ? getSmallSize() == 0 : getPointer()->empty();
@@ -242,7 +240,7 @@ public:
       uintptr_t Bits = getSmallBits();
       if (Bits == 0)
         return -1;
-      return NumBaseBits - countLeadingZeros(Bits);
+      return NumBaseBits - countLeadingZeros(Bits) - 1;
     }
     return getPointer()->find_last();
   }
@@ -265,7 +263,9 @@ public:
         return -1;
 
       uintptr_t Bits = getSmallBits();
-      return NumBaseBits - countLeadingOnes(Bits);
+      // Set unused bits.
+      Bits |= ~uintptr_t(0) << getSmallSize();
+      return NumBaseBits - countLeadingOnes(Bits) - 1;
     }
     return getPointer()->find_last_unset();
   }
@@ -487,10 +487,17 @@ public:
   bool operator==(const SmallBitVector &RHS) const {
     if (size() != RHS.size())
       return false;
-    if (isSmall())
+    if (isSmall() && RHS.isSmall())
       return getSmallBits() == RHS.getSmallBits();
-    else
+    else if (!isSmall() && !RHS.isSmall())
       return *getPointer() == *RHS.getPointer();
+    else {
+      for (size_t i = 0, e = size(); i != e; ++i) {
+        if ((*this)[i] != RHS[i])
+          return false;
+      }
+      return true;
+    }
   }
 
   bool operator!=(const SmallBitVector &RHS) const {
@@ -498,16 +505,19 @@ public:
   }
 
   // Intersection, union, disjoint union.
+  // FIXME BitVector::operator&= does not resize the LHS but this does
   SmallBitVector &operator&=(const SmallBitVector &RHS) {
     resize(std::max(size(), RHS.size()));
-    if (isSmall())
+    if (isSmall() && RHS.isSmall())
       setSmallBits(getSmallBits() & RHS.getSmallBits());
-    else if (!RHS.isSmall())
+    else if (!isSmall() && !RHS.isSmall())
       getPointer()->operator&=(*RHS.getPointer());
     else {
-      SmallBitVector Copy = RHS;
-      Copy.resize(size());
-      getPointer()->operator&=(*Copy.getPointer());
+      size_t i, e;
+      for (i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
+        (*this)[i] = test(i) && RHS.test(i);
+      for (e = size(); i != e; ++i)
+        reset(i);
     }
     return *this;
   }
@@ -547,28 +557,26 @@ public:
 
   SmallBitVector &operator|=(const SmallBitVector &RHS) {
     resize(std::max(size(), RHS.size()));
-    if (isSmall())
+    if (isSmall() && RHS.isSmall())
       setSmallBits(getSmallBits() | RHS.getSmallBits());
-    else if (!RHS.isSmall())
+    else if (!isSmall() && !RHS.isSmall())
       getPointer()->operator|=(*RHS.getPointer());
     else {
-      SmallBitVector Copy = RHS;
-      Copy.resize(size());
-      getPointer()->operator|=(*Copy.getPointer());
+      for (size_t i = 0, e = RHS.size(); i != e; ++i)
+        (*this)[i] = test(i) || RHS.test(i);
     }
     return *this;
   }
 
   SmallBitVector &operator^=(const SmallBitVector &RHS) {
     resize(std::max(size(), RHS.size()));
-    if (isSmall())
+    if (isSmall() && RHS.isSmall())
       setSmallBits(getSmallBits() ^ RHS.getSmallBits());
-    else if (!RHS.isSmall())
+    else if (!isSmall() && !RHS.isSmall())
       getPointer()->operator^=(*RHS.getPointer());
     else {
-      SmallBitVector Copy = RHS;
-      Copy.resize(size());
-      getPointer()->operator^=(*Copy.getPointer());
+      for (size_t i = 0, e = RHS.size(); i != e; ++i)
+        (*this)[i] = test(i) != RHS.test(i);
     }
     return *this;
   }
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index e4ddd12afdf4c96906aad4ecbf9c11823b344c9e..0636abbb1fbfdc8d2158b0438b747e7cb8585bd9 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -182,7 +182,7 @@ public:
 
 /// SmallVectorTemplateBase<isPodLike = false> - This is where we put method
 /// implementations that are designed to work with non-POD-like T's.
-template <typename T, bool isPodLike>
+template <typename T, bool = isPodLike<T>::value>
 class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
 protected:
   SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
@@ -320,8 +320,8 @@ public:
 /// This class consists of common code factored out of the SmallVector class to
 /// reduce code duplication based on the SmallVector 'N' template parameter.
 template <typename T>
-class SmallVectorImpl : public SmallVectorTemplateBase<T, isPodLike<T>::value> {
-  using SuperClass = SmallVectorTemplateBase<T, isPodLike<T>::value>;
+class SmallVectorImpl : public SmallVectorTemplateBase<T> {
+  using SuperClass = SmallVectorTemplateBase<T>;
 
 public:
   using iterator = typename SuperClass::iterator;
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index fe78f78aaab2d70ff44527c982f758f2c38e762a..04b64e2fa60b00303ef7639a1a43dcd9eeeb0fc3 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -186,7 +186,8 @@ public:
     Contiki,
     AMDPAL,     // AMD PAL Runtime
     HermitCore, // HermitCore Unikernel/Multikernel
-    LastOSType = HermitCore
+    Hurd,       // GNU/Hurd
+    LastOSType = Hurd
   };
   enum EnvironmentType {
     UnknownEnvironment,
@@ -582,9 +583,15 @@ public:
     return getOS() == Triple::KFreeBSD;
   }
 
+  /// Tests whether the OS is Hurd.
+  bool isOSHurd() const {
+    return getOS() == Triple::Hurd;
+  }
+
   /// Tests whether the OS uses glibc.
   bool isOSGlibc() const {
-    return (getOS() == Triple::Linux || getOS() == Triple::KFreeBSD) &&
+    return (getOS() == Triple::Linux || getOS() == Triple::KFreeBSD ||
+            getOS() == Triple::Hurd) &&
            !isAndroid();
   }
 
diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h
index 7f7ed69a005447c19177a6a599c5f577743a4f75..40e490cf7864ea69abb23662c35d3e9946531b93 100644
--- a/include/llvm/ADT/iterator.h
+++ b/include/llvm/ADT/iterator.h
@@ -309,8 +309,10 @@ make_pointee_range(RangeT &&Range) {
 template <typename WrappedIteratorT,
           typename T = decltype(&*std::declval<WrappedIteratorT>())>
 class pointer_iterator
-    : public iterator_adaptor_base<pointer_iterator<WrappedIteratorT, T>,
-                                   WrappedIteratorT, T> {
+    : public iterator_adaptor_base<
+          pointer_iterator<WrappedIteratorT, T>, WrappedIteratorT,
+          typename std::iterator_traits<WrappedIteratorT>::iterator_category,
+          T> {
   mutable T Ptr;
 
 public:
diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h
index f150064a910a05e035ebd9848b026993c7e81c76..61b99f6c3e6b8cb5115f492dd93029d35e8734fe 100644
--- a/include/llvm/Analysis/CGSCCPassManager.h
+++ b/include/llvm/Analysis/CGSCCPassManager.h
@@ -441,7 +441,10 @@ public:
 
             PreservedAnalyses PassPA = Pass.run(*C, CGAM, CG, UR);
 
-            PI.runAfterPass<LazyCallGraph::SCC>(Pass, *C);
+            if (UR.InvalidatedSCCs.count(C))
+              PI.runAfterPassInvalidated<LazyCallGraph::SCC>(Pass);
+            else
+              PI.runAfterPass<LazyCallGraph::SCC>(Pass, *C);
 
             // Update the SCC and RefSCC if necessary.
             C = UR.UpdatedC ? UR.UpdatedC : C;
@@ -762,7 +765,10 @@ public:
 
       PreservedAnalyses PassPA = Pass.run(*C, AM, CG, UR);
 
-      PI.runAfterPass<LazyCallGraph::SCC>(Pass, *C);
+      if (UR.InvalidatedSCCs.count(C))
+        PI.runAfterPassInvalidated<LazyCallGraph::SCC>(Pass);
+      else
+        PI.runAfterPass<LazyCallGraph::SCC>(Pass, *C);
 
       // If the SCC structure has changed, bail immediately and let the outer
       // CGSCC layer handle any iteration to reflect the refined structure.
diff --git a/include/llvm/Analysis/CaptureTracking.h b/include/llvm/Analysis/CaptureTracking.h
index 7a869a51233a035bbb8d51f4d327d2e7d652a957..aaaaff9ae252c6ac9a93a3274c4cfc8ac6647f77 100644
--- a/include/llvm/Analysis/CaptureTracking.h
+++ b/include/llvm/Analysis/CaptureTracking.h
@@ -22,6 +22,14 @@ namespace llvm {
   class DominatorTree;
   class OrderedBasicBlock;
 
+  /// The default value for MaxUsesToExplore argument. It's relatively small to
+  /// keep the cost of analysis reasonable for clients like BasicAliasAnalysis,
+  /// where the results can't be cached.
+  /// TODO: we should probably introduce a caching CaptureTracking analysis and
+  /// use it where possible. The caching version can use much higher limit or
+  /// don't have this cap at all.
+  unsigned constexpr DefaultMaxUsesToExplore = 20;
+
   /// PointerMayBeCaptured - Return true if this pointer value may be captured
   /// by the enclosing function (which is required to exist).  This routine can
   /// be expensive, so consider caching the results.  The boolean ReturnCaptures
@@ -29,9 +37,12 @@ namespace llvm {
   /// counts as capturing it or not.  The boolean StoreCaptures specified
   /// whether storing the value (or part of it) into memory anywhere
   /// automatically counts as capturing it or not.
+  /// MaxUsesToExplore specifies how many uses should the analysis explore for
+  /// one value before giving up due too "too many uses".
   bool PointerMayBeCaptured(const Value *V,
                             bool ReturnCaptures,
-                            bool StoreCaptures);
+                            bool StoreCaptures,
+                            unsigned MaxUsesToExplore = DefaultMaxUsesToExplore);
 
   /// PointerMayBeCapturedBefore - Return true if this pointer value may be
   /// captured by the enclosing function (which is required to exist). If a
@@ -44,10 +55,13 @@ namespace llvm {
   /// or not. Captures by the provided instruction are considered if the
   /// final parameter is true. An ordered basic block in \p OBB could be used
   /// to speed up capture-tracker queries.
+  /// MaxUsesToExplore specifies how many uses should the analysis explore for
+  /// one value before giving up due too "too many uses".
   bool PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
                                   bool StoreCaptures, const Instruction *I,
                                   const DominatorTree *DT, bool IncludeI = false,
-                                  OrderedBasicBlock *OBB = nullptr);
+                                  OrderedBasicBlock *OBB = nullptr,
+                                  unsigned MaxUsesToExplore = DefaultMaxUsesToExplore);
 
   /// This callback is used in conjunction with PointerMayBeCaptured. In
   /// addition to the interface here, you'll need to provide your own getters
@@ -75,7 +89,10 @@ namespace llvm {
   /// PointerMayBeCaptured - Visit the value and the values derived from it and
   /// find values which appear to be capturing the pointer value. This feeds
   /// results into and is controlled by the CaptureTracker object.
-  void PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker);
+  /// MaxUsesToExplore specifies how many uses should the analysis explore for
+  /// one value before giving up due too "too many uses".
+  void PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker,
+                            unsigned MaxUsesToExplore = DefaultMaxUsesToExplore);
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/Analysis/CmpInstAnalysis.h b/include/llvm/Analysis/CmpInstAnalysis.h
index 3cc69d9fea29f33dd02d9c07dd6bd4f183b1d0f1..0e9c6a96b0f41095234f5877c6f4a6b6bf4b83e8 100644
--- a/include/llvm/Analysis/CmpInstAnalysis.h
+++ b/include/llvm/Analysis/CmpInstAnalysis.h
@@ -46,19 +46,18 @@ namespace llvm {
   ///
   unsigned getICmpCode(const ICmpInst *ICI, bool InvertPred = false);
 
-  /// This is the complement of getICmpCode, which turns an opcode and two
-  /// operands into either a constant true or false, or the predicate for a new
-  /// ICmp instruction. The sign is passed in to determine which kind of
-  /// predicate to use in the new icmp instruction.
+  /// This is the complement of getICmpCode. It turns a predicate code into
+  /// either a constant true or false or the predicate for a new ICmp.
+  /// The sign is passed in to determine which kind of predicate to use in the
+  /// new ICmp instruction.
   /// Non-NULL return value will be a true or false constant.
-  /// NULL return means a new ICmp is needed. The predicate for which is output
-  /// in NewICmpPred.
-  Value *getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
-                      CmpInst::Predicate &NewICmpPred);
+  /// NULL return means a new ICmp is needed. The predicate is output in Pred.
+  Constant *getPredForICmpCode(unsigned Code, bool Sign, Type *OpTy,
+                               CmpInst::Predicate &Pred);
 
   /// Return true if both predicates match sign or if at least one of them is an
   /// equality comparison (which is signless).
-  bool PredicatesFoldable(CmpInst::Predicate p1, CmpInst::Predicate p2);
+  bool predicatesFoldable(CmpInst::Predicate P1, CmpInst::Predicate P2);
 
   /// Decompose an icmp into the form ((X & Mask) pred 0) if possible. The
   /// returned predicate is either == or !=. Returns false if decomposition
diff --git a/include/llvm/Analysis/DemandedBits.h b/include/llvm/Analysis/DemandedBits.h
index d4384609762d4a7356800dcaadc4b40fe2882219..f751d2edcfffb24e21237eb40d33304a2243d9f7 100644
--- a/include/llvm/Analysis/DemandedBits.h
+++ b/include/llvm/Analysis/DemandedBits.h
@@ -44,6 +44,14 @@ public:
     F(F), AC(AC), DT(DT) {}
 
   /// Return the bits demanded from instruction I.
+  ///
+  /// For vector instructions individual vector elements are not distinguished:
+  /// A bit is demanded if it is demanded for any of the vector elements. The
+  /// size of the return value corresponds to the type size in bits of the
+  /// scalar type.
+  ///
+  /// Instructions that do not have integer or vector of integer type are
+  /// accepted, but will always produce a mask with all bits set.
   APInt getDemandedBits(Instruction *I);
 
   /// Return true if, during analysis, I could not be reached.
diff --git a/include/llvm/Analysis/DivergenceAnalysis.h b/include/llvm/Analysis/DivergenceAnalysis.h
index 9fadf52288bc87eb38a043a8b71a3abaebd9835d..d834862db09573261cfd49609ed94b6b7ddf5d59 100644
--- a/include/llvm/Analysis/DivergenceAnalysis.h
+++ b/include/llvm/Analysis/DivergenceAnalysis.h
@@ -173,6 +173,33 @@ private:
   std::vector<const Instruction *> Worklist;
 };
 
+/// \brief Divergence analysis frontend for GPU kernels.
+class GPUDivergenceAnalysis {
+  SyncDependenceAnalysis SDA;
+  DivergenceAnalysis DA;
+
+public:
+  /// Runs the divergence analysis on @F, a GPU kernel
+  GPUDivergenceAnalysis(Function &F, const DominatorTree &DT,
+                        const PostDominatorTree &PDT, const LoopInfo &LI,
+                        const TargetTransformInfo &TTI);
+
+  /// Whether any divergence was detected.
+  bool hasDivergence() const { return DA.hasDetectedDivergence(); }
+
+  /// The GPU kernel this analysis result is for
+  const Function &getFunction() const { return DA.getFunction(); }
+
+  /// Whether \p V is divergent.
+  bool isDivergent(const Value &V) const;
+
+  /// Whether \p V is uniform/non-divergent
+  bool isUniform(const Value &V) const { return !isDivergent(V); }
+
+  /// Print all divergent values in the kernel.
+  void print(raw_ostream &OS, const Module *) const;
+};
+
 } // namespace llvm
 
 #endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
diff --git a/include/llvm/Analysis/IVDescriptors.h b/include/llvm/Analysis/IVDescriptors.h
index d1d7e5ef0227e40a62fbb6c18b9e23189f385595..64b4ae23cc59de8e7d0790995f02105c0091cb45 100644
--- a/include/llvm/Analysis/IVDescriptors.h
+++ b/include/llvm/Analysis/IVDescriptors.h
@@ -140,7 +140,8 @@ public:
 
   /// Returns true if instruction I has multiple uses in Insts
   static bool hasMultipleUsesOf(Instruction *I,
-                                SmallPtrSetImpl<Instruction *> &Insts);
+                                SmallPtrSetImpl<Instruction *> &Insts,
+                                unsigned MaxNumUses);
 
   /// Returns true if all uses of the instruction I is within the Set.
   static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set);
@@ -150,6 +151,10 @@ public:
   /// or max(X, Y).
   static InstDesc isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev);
 
+  /// Returns a struct describing if the instruction is a
+  /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
+  static InstDesc isConditionalRdxPattern(RecurrenceKind Kind, Instruction *I);
+
   /// Returns identity corresponding to the RecurrenceKind.
   static Constant *getRecurrenceIdentity(RecurrenceKind K, Type *Tp);
 
diff --git a/include/llvm/Analysis/InstructionPrecedenceTracking.h b/include/llvm/Analysis/InstructionPrecedenceTracking.h
index 5178a33d2e84a324462002d66ccb1a40c860606e..b754557ecc41d36780319f743ebbbdf183f8d809 100644
--- a/include/llvm/Analysis/InstructionPrecedenceTracking.h
+++ b/include/llvm/Analysis/InstructionPrecedenceTracking.h
@@ -114,6 +114,31 @@ public:
   virtual bool isSpecialInstruction(const Instruction *Insn) const;
 };
 
+class MemoryWriteTracking : public InstructionPrecedenceTracking {
+public:
+  MemoryWriteTracking(DominatorTree *DT) : InstructionPrecedenceTracking(DT) {}
+
+  /// Returns the topmost instruction that may write memory from the given
+  /// basic block. Returns nullptr if there is no such instructions in the block.
+  const Instruction *getFirstMemoryWrite(const BasicBlock *BB) {
+    return getFirstSpecialInstruction(BB);
+  }
+
+  /// Returns true if at least one instruction from the given basic block may
+  /// write memory.
+  bool mayWriteToMemory(const BasicBlock *BB) {
+    return hasSpecialInstructions(BB);
+  }
+
+  /// Returns true if the first memory writing instruction of Insn's block
+  /// exists and dominates Insn.
+  bool isDominatedByMemoryWriteFromSameBlock(const Instruction *Insn) {
+    return isPreceededBySpecialInstruction(Insn);
+  }
+
+  virtual bool isSpecialInstruction(const Instruction *Insn) const;
+};
+
 } // llvm
 
 #endif // LLVM_ANALYSIS_INSTRUCTIONPRECEDENCETRACKING_H
diff --git a/include/llvm/Analysis/IteratedDominanceFrontier.h b/include/llvm/Analysis/IteratedDominanceFrontier.h
index fdebb1bc3963beb4ee6281a4cba5a58d481b753e..3083db75b81c1c1eb15a6f3b357d6e29dbb16c12 100644
--- a/include/llvm/Analysis/IteratedDominanceFrontier.h
+++ b/include/llvm/Analysis/IteratedDominanceFrontier.h
@@ -6,7 +6,7 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
+/// \file
 /// Compute iterated dominance frontiers using a linear time algorithm.
 ///
 /// The algorithm used here is based on:
diff --git a/include/llvm/Analysis/LegacyDivergenceAnalysis.h b/include/llvm/Analysis/LegacyDivergenceAnalysis.h
index 173ca28a7f4873329feea6458a1982de9b1c61b7..fc426ad7fb645f0088eb415b625e07bd521776a0 100644
--- a/include/llvm/Analysis/LegacyDivergenceAnalysis.h
+++ b/include/llvm/Analysis/LegacyDivergenceAnalysis.h
@@ -19,9 +19,11 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 
 namespace llvm {
 class Value;
+class GPUDivergenceAnalysis;
 class LegacyDivergenceAnalysis : public FunctionPass {
 public:
   static char ID;
@@ -41,7 +43,7 @@ public:
   //
   // Even if this function returns false, V may still be divergent when used
   // in a different basic block.
-  bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
+  bool isDivergent(const Value *V) const;
 
   // Returns true if V is uniform/non-divergent.
   //
@@ -53,6 +55,12 @@ public:
   void removeValue(const Value *V) { DivergentValues.erase(V); }
 
 private:
+  // Whether analysis should be performed by GPUDivergenceAnalysis.
+  bool shouldUseGPUDivergenceAnalysis(const Function &F) const;
+
+  // (optional) handle to new DivergenceAnalysis
+  std::unique_ptr<GPUDivergenceAnalysis> gpuDA;
+
   // Stores all divergent values.
   DenseSet<const Value *> DivergentValues;
 };
diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index c59c86c499404536c640efa718ac01f21fdb9400..b5a964de9eb0b61b6cf723c0941e57d74de169f6 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -97,6 +97,17 @@ public:
   /// Set of potential dependent memory accesses.
   typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
 
+  /// Type to keep track of the status of the dependence check. The order of
+  /// the elements is important and has to be from most permissive to least
+  /// permissive.
+  enum class VectorizationSafetyStatus {
+    // Can vectorize safely without RT checks. All dependences are known to be
+    // safe.
+    Safe,
+    // Cannot vectorize due to unsafe or unknown dependencies.
+    Unsafe,
+  };
+
   /// Dependece between memory access instructions.
   struct Dependence {
     /// The type of the dependence.
@@ -146,7 +157,7 @@ public:
     Instruction *getDestination(const LoopAccessInfo &LAI) const;
 
     /// Dependence types that don't prevent vectorization.
-    static bool isSafeForVectorization(DepType Type);
+    static VectorizationSafetyStatus isSafeForVectorization(DepType Type);
 
     /// Lexically forward dependence.
     bool isForward() const;
@@ -164,8 +175,8 @@ public:
 
   MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L)
       : PSE(PSE), InnermostLoop(L), AccessIdx(0), MaxSafeRegisterWidth(-1U),
-        ShouldRetryWithRuntimeCheck(false), SafeForVectorization(true),
-        RecordDependences(true) {}
+        ShouldRetryWithRuntimeCheck(false),
+        Status(VectorizationSafetyStatus::Safe), RecordDependences(true) {}
 
   /// Register the location (instructions are given increasing numbers)
   /// of a write access.
@@ -193,7 +204,9 @@ public:
 
   /// No memory dependence was encountered that would inhibit
   /// vectorization.
-  bool isSafeForVectorization() const { return SafeForVectorization; }
+  bool isSafeForVectorization() const {
+    return Status == VectorizationSafetyStatus::Safe;
+  }
 
   /// The maximum number of bytes of a vector register we can vectorize
   /// the accesses safely with.
@@ -269,9 +282,9 @@ private:
   /// vectorize this loop with runtime checks.
   bool ShouldRetryWithRuntimeCheck;
 
-  /// No memory dependence was encountered that would inhibit
-  /// vectorization.
-  bool SafeForVectorization;
+  /// Result of the dependence checks, indicating whether the checked
+  /// dependences are safe for vectorization or not.
+  VectorizationSafetyStatus Status;
 
   //// True if Dependences reflects the dependences in the
   //// loop.  If false we exceeded MaxDependences and
@@ -304,6 +317,10 @@ private:
   /// \return false if we shouldn't vectorize at all or avoid larger
   /// vectorization factors by limiting MaxSafeDepDistBytes.
   bool couldPreventStoreLoadForward(uint64_t Distance, uint64_t TypeByteSize);
+
+  /// Updates the current safety status with \p S. We can go from Safe to
+  /// to Unsafe.
+  void mergeInStatus(VectorizationSafetyStatus S);
 };
 
 /// Holds information about the memory runtime legality checks to verify
@@ -564,10 +581,10 @@ public:
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// If the loop has multiple stores to an invariant address, then
-  /// return true, else return false.
-  bool hasMultipleStoresToLoopInvariantAddress() const {
-    return HasMultipleStoresToLoopInvariantAddress;
+  /// If the loop has memory dependence involving an invariant address, i.e. two
+  /// stores or a store and a load, then return true, else return false.
+  bool hasDependenceInvolvingLoopInvariantAddress() const {
+    return HasDependenceInvolvingLoopInvariantAddress;
   }
 
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
@@ -620,8 +637,8 @@ private:
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
 
-  /// Indicator that there are multiple stores to a uniform address.
-  bool HasMultipleStoresToLoopInvariantAddress;
+  /// Indicator that there are non vectorizable stores to a uniform address.
+  bool HasDependenceInvolvingLoopInvariantAddress;
 
   /// The diagnostics report generated for the analysis.  E.g. why we
   /// couldn't analyze the loop.
diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index 05c28d139889af4b3a1e3a88a11c580b55343640..b973447ca8221fb3d6d86f6b42701995062a8414 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -126,6 +126,8 @@ class ICFLoopSafetyInfo: public LoopSafetyInfo {
                                // may throw.
   // Contains information about implicit control flow in this loop's blocks.
   mutable ImplicitControlFlowTracking ICF;
+  // Contains information about instruction that may possibly write memory.
+  mutable MemoryWriteTracking MW;
 
 public:
   virtual bool blockMayThrow(const BasicBlock *BB) const;
@@ -138,6 +140,16 @@ public:
                                      const DominatorTree *DT,
                                      const Loop *CurLoop) const;
 
+  /// Returns true if we could not execute a memory-modifying instruction before
+  /// we enter \p BB under assumption that \p CurLoop is entered.
+  bool doesNotWriteMemoryBefore(const BasicBlock *BB, const Loop *CurLoop)
+      const;
+
+  /// Returns true if we could not execute a memory-modifying instruction before
+  /// we execute \p I under assumption that \p CurLoop is entered.
+  bool doesNotWriteMemoryBefore(const Instruction &I, const Loop *CurLoop)
+      const;
+
   /// Inform the safety info that we are planning to insert a new instruction
   /// into the basic block \p BB. It will make all cache updates to keep it
   /// correct after this insertion.
@@ -148,7 +160,7 @@ public:
   /// this removal.
   void removeInstruction(const Instruction *Inst);
 
-  ICFLoopSafetyInfo(DominatorTree *DT) : LoopSafetyInfo(), ICF(DT) {};
+  ICFLoopSafetyInfo(DominatorTree *DT) : LoopSafetyInfo(), ICF(DT), MW(DT) {};
 
   virtual ~ICFLoopSafetyInfo() {};
 };
diff --git a/include/llvm/Analysis/ObjCARCAnalysisUtils.h b/include/llvm/Analysis/ObjCARCAnalysisUtils.h
index 07beb0bb60a32dfb1322c7aad090c41bcb8731ce..1f497fab35da8b8fe4bb9ded547d2dbcc93bc7f1 100644
--- a/include/llvm/Analysis/ObjCARCAnalysisUtils.h
+++ b/include/llvm/Analysis/ObjCARCAnalysisUtils.h
@@ -51,25 +51,25 @@ extern bool EnableARCOpts;
 /// on.
 inline bool ModuleHasARC(const Module &M) {
   return
-    M.getNamedValue("objc_retain") ||
-    M.getNamedValue("objc_release") ||
-    M.getNamedValue("objc_autorelease") ||
-    M.getNamedValue("objc_retainAutoreleasedReturnValue") ||
-    M.getNamedValue("objc_unsafeClaimAutoreleasedReturnValue") ||
-    M.getNamedValue("objc_retainBlock") ||
-    M.getNamedValue("objc_autoreleaseReturnValue") ||
-    M.getNamedValue("objc_autoreleasePoolPush") ||
-    M.getNamedValue("objc_loadWeakRetained") ||
-    M.getNamedValue("objc_loadWeak") ||
-    M.getNamedValue("objc_destroyWeak") ||
-    M.getNamedValue("objc_storeWeak") ||
-    M.getNamedValue("objc_initWeak") ||
-    M.getNamedValue("objc_moveWeak") ||
-    M.getNamedValue("objc_copyWeak") ||
-    M.getNamedValue("objc_retainedObject") ||
-    M.getNamedValue("objc_unretainedObject") ||
-    M.getNamedValue("objc_unretainedPointer") ||
-    M.getNamedValue("clang.arc.use");
+    M.getNamedValue("llvm.objc.retain") ||
+    M.getNamedValue("llvm.objc.release") ||
+    M.getNamedValue("llvm.objc.autorelease") ||
+    M.getNamedValue("llvm.objc.retainAutoreleasedReturnValue") ||
+    M.getNamedValue("llvm.objc.unsafeClaimAutoreleasedReturnValue") ||
+    M.getNamedValue("llvm.objc.retainBlock") ||
+    M.getNamedValue("llvm.objc.autoreleaseReturnValue") ||
+    M.getNamedValue("llvm.objc.autoreleasePoolPush") ||
+    M.getNamedValue("llvm.objc.loadWeakRetained") ||
+    M.getNamedValue("llvm.objc.loadWeak") ||
+    M.getNamedValue("llvm.objc.destroyWeak") ||
+    M.getNamedValue("llvm.objc.storeWeak") ||
+    M.getNamedValue("llvm.objc.initWeak") ||
+    M.getNamedValue("llvm.objc.moveWeak") ||
+    M.getNamedValue("llvm.objc.copyWeak") ||
+    M.getNamedValue("llvm.objc.retainedObject") ||
+    M.getNamedValue("llvm.objc.unretainedObject") ||
+    M.getNamedValue("llvm.objc.unretainedPointer") ||
+    M.getNamedValue("llvm.objc.clang.arc.use");
 }
 
 /// This is a wrapper around getUnderlyingObject which also knows how to
diff --git a/include/llvm/Analysis/ObjCARCInstKind.h b/include/llvm/Analysis/ObjCARCInstKind.h
index 0b92d8b4835601aaf798fbc4075d0174eb84a785..018ea1f851bebaf35c5ef36f6d61e00ce1a225aa 100644
--- a/include/llvm/Analysis/ObjCARCInstKind.h
+++ b/include/llvm/Analysis/ObjCARCInstKind.h
@@ -11,6 +11,7 @@
 #define LLVM_ANALYSIS_OBJCARCINSTKIND_H
 
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Instructions.h"
 
 namespace llvm {
@@ -48,7 +49,7 @@ enum class ARCInstKind {
   CopyWeak,                 ///< objc_copyWeak (derived)
   DestroyWeak,              ///< objc_destroyWeak (derived)
   StoreStrong,              ///< objc_storeStrong (derived)
-  IntrinsicUser,            ///< clang.arc.use
+  IntrinsicUser,            ///< llvm.objc.clang.arc.use
   CallOrUser,               ///< could call objc_release and/or "use" pointers
   Call,                     ///< could call objc_release
   User,                     ///< could "use" a pointer
diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h
index 58b67e74ba5130074615e02d903167d4cc59613b..3aef4be72d71f49e0d7d0b07abcd874dcfed2132 100644
--- a/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -98,14 +98,14 @@ public:
   bool isFunctionEntryCold(const Function *F);
   /// Returns true if \p F contains only cold code.
   bool isFunctionColdInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
-  /// Returns true if \p F is a hot function.
+  /// Returns true if count \p C is considered hot.
   bool isHotCount(uint64_t C);
   /// Returns true if count \p C is considered cold.
   bool isColdCount(uint64_t C);
-  /// Returns true if BasicBlock \p B is considered hot.
-  bool isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI);
-  /// Returns true if BasicBlock \p B is considered cold.
-  bool isColdBB(const BasicBlock *B, BlockFrequencyInfo *BFI);
+  /// Returns true if BasicBlock \p BB is considered hot.
+  bool isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI);
+  /// Returns true if BasicBlock \p BB is considered cold.
+  bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI);
   /// Returns true if CallSite \p CS is considered hot.
   bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI);
   /// Returns true if Callsite \p CS is considered cold.
@@ -134,9 +134,8 @@ public:
   static char ID;
   ProfileSummaryInfoWrapperPass();
 
-  ProfileSummaryInfo *getPSI() {
-    return &*PSI;
-  }
+  ProfileSummaryInfo &getPSI() { return *PSI; }
+  const ProfileSummaryInfo &getPSI() const { return *PSI; }
 
   bool doInitialization(Module &M) override;
   bool doFinalization(Module &M) override;
diff --git a/include/llvm/Analysis/StackSafetyAnalysis.h b/include/llvm/Analysis/StackSafetyAnalysis.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a151650a34c07e6922466aeab7154ac9054bed1
--- /dev/null
+++ b/include/llvm/Analysis/StackSafetyAnalysis.h
@@ -0,0 +1,120 @@
+//===- StackSafetyAnalysis.h - Stack memory safety analysis -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Stack Safety Analysis detects allocas and arguments with safe access.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_STACKSAFETYANALYSIS_H
+#define LLVM_ANALYSIS_STACKSAFETYANALYSIS_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+/// Interface to access stack safety analysis results for single function.
+class StackSafetyInfo {
+public:
+  struct FunctionInfo;
+
+private:
+  std::unique_ptr<FunctionInfo> Info;
+
+public:
+  StackSafetyInfo();
+  StackSafetyInfo(FunctionInfo &&Info);
+  StackSafetyInfo(StackSafetyInfo &&);
+  StackSafetyInfo &operator=(StackSafetyInfo &&);
+  ~StackSafetyInfo();
+
+  // TODO: Add useful for client methods.
+  void print(raw_ostream &O) const;
+};
+
+/// StackSafetyInfo wrapper for the new pass manager.
+class StackSafetyAnalysis : public AnalysisInfoMixin<StackSafetyAnalysis> {
+  friend AnalysisInfoMixin<StackSafetyAnalysis>;
+  static AnalysisKey Key;
+
+public:
+  using Result = StackSafetyInfo;
+  StackSafetyInfo run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// Printer pass for the \c StackSafetyAnalysis results.
+class StackSafetyPrinterPass : public PassInfoMixin<StackSafetyPrinterPass> {
+  raw_ostream &OS;
+
+public:
+  explicit StackSafetyPrinterPass(raw_ostream &OS) : OS(OS) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// StackSafetyInfo wrapper for the legacy pass manager
+class StackSafetyInfoWrapperPass : public FunctionPass {
+  StackSafetyInfo SSI;
+
+public:
+  static char ID;
+  StackSafetyInfoWrapperPass();
+
+  const StackSafetyInfo &getResult() const { return SSI; }
+
+  void print(raw_ostream &O, const Module *M) const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnFunction(Function &F) override;
+};
+
+using StackSafetyGlobalInfo = std::map<const GlobalValue *, StackSafetyInfo>;
+
+/// This pass performs the global (interprocedural) stack safety analysis (new
+/// pass manager).
+class StackSafetyGlobalAnalysis
+    : public AnalysisInfoMixin<StackSafetyGlobalAnalysis> {
+  friend AnalysisInfoMixin<StackSafetyGlobalAnalysis>;
+  static AnalysisKey Key;
+
+public:
+  using Result = StackSafetyGlobalInfo;
+  Result run(Module &M, ModuleAnalysisManager &AM);
+};
+
+/// Printer pass for the \c StackSafetyGlobalAnalysis results.
+class StackSafetyGlobalPrinterPass
+    : public PassInfoMixin<StackSafetyGlobalPrinterPass> {
+  raw_ostream &OS;
+
+public:
+  explicit StackSafetyGlobalPrinterPass(raw_ostream &OS) : OS(OS) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+/// This pass performs the global (interprocedural) stack safety analysis
+/// (legacy pass manager).
+class StackSafetyGlobalInfoWrapperPass : public ModulePass {
+  StackSafetyGlobalInfo SSI;
+
+public:
+  static char ID;
+
+  StackSafetyGlobalInfoWrapperPass();
+
+  const StackSafetyGlobalInfo &getResult() const { return SSI; }
+
+  void print(raw_ostream &O, const Module *M) const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnModule(Module &M) override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_ANALYSIS_STACKSAFETYANALYSIS_H
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index eb0e0270157fd05f3eaf4b36f3aeb6e86b3fcd9d..6ddea8bbdce09ffe142ab7cecea912aa3d499b90 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -744,9 +744,9 @@ public:
   /// and the number of execution units in the CPU.
   unsigned getMaxInterleaveFactor(unsigned VF) const;
 
-  /// Collect properties of V used in cost analyzis, e.g. OP_PowerOf2.
-  OperandValueKind getOperandInfo(Value *V,
-                                  OperandValueProperties &OpProps) const;
+  /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
+  static OperandValueKind getOperandInfo(Value *V,
+                                         OperandValueProperties &OpProps);
 
   /// This is an approximation of reciprocal throughput of a math/logic op.
   /// A higher cost indicates less expected throughput.
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index a92ba8e374f09282558cf7b59ca95f94ff2aaae4..7c2e9a65ac27dcdf653e2818b6455186604c76b6 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -610,6 +610,12 @@ class Value;
   Optional<bool> isImpliedCondition(const Value *LHS, const Value *RHS,
                                     const DataLayout &DL, bool LHSIsTrue = true,
                                     unsigned Depth = 0);
+
+  /// Return the boolean condition value in the context of the given instruction
+  /// if it is known based on dominating conditions.
+  Optional<bool> isImpliedByDomCondition(const Value *Cond,
+                                         const Instruction *ContextI,
+                                         const DataLayout &DL);
 } // end namespace llvm
 
 #endif // LLVM_ANALYSIS_VALUETRACKING_H
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 797260f439a04078787fad427f874067d3e96980..5d44a52d4c53c64e1ab57bb09b10b2b8200b9cdc 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -24,7 +24,7 @@ namespace llvm {
 template <typename T> class ArrayRef;
 class DemandedBits;
 class GetElementPtrInst;
-class InterleaveGroup; 
+template <typename InstTy> class InterleaveGroup;
 class Loop;
 class ScalarEvolution;
 class TargetTransformInfo;
@@ -138,7 +138,7 @@ Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 /// create[*]Mask() utilities which create a shuffle mask (mask that
 /// consists of indices).
 Constant *createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
-                               const InterleaveGroup &Group);
+                               const InterleaveGroup<Instruction> &Group);
 
 /// Create a mask with replicated elements.
 ///
@@ -233,9 +233,12 @@ Value *concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs);
 ///
 /// Note: the interleaved load group could have gaps (missing members), but
 /// the interleaved store group doesn't allow gaps.
-class InterleaveGroup {
+template <typename InstTy> class InterleaveGroup {
 public:
-  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
+  InterleaveGroup(unsigned Factor, bool Reverse, unsigned Align)
+      : Factor(Factor), Reverse(Reverse), Align(Align), InsertPos(nullptr) {}
+
+  InterleaveGroup(InstTy *Instr, int Stride, unsigned Align)
       : Align(Align), InsertPos(Instr) {
     assert(Align && "The alignment should be non-zero");
 
@@ -256,7 +259,7 @@ public:
   /// negative if it is the new leader.
   ///
   /// \returns false if the instruction doesn't belong to the group.
-  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
+  bool insertMember(InstTy *Instr, int Index, unsigned NewAlign) {
     assert(NewAlign && "The new member's alignment should be non-zero");
 
     int Key = Index + SmallestKey;
@@ -288,7 +291,7 @@ public:
   /// Get the member with the given index \p Index
   ///
   /// \returns nullptr if contains no such member.
-  Instruction *getMember(unsigned Index) const {
+  InstTy *getMember(unsigned Index) const {
     int Key = SmallestKey + Index;
     auto Member = Members.find(Key);
     if (Member == Members.end())
@@ -299,16 +302,17 @@ public:
 
   /// Get the index for the given member. Unlike the key in the member
   /// map, the index starts from 0.
-  unsigned getIndex(Instruction *Instr) const {
-    for (auto I : Members)
+  unsigned getIndex(const InstTy *Instr) const {
+    for (auto I : Members) {
       if (I.second == Instr)
         return I.first - SmallestKey;
+    }
 
     llvm_unreachable("InterleaveGroup contains no such member");
   }
 
-  Instruction *getInsertPos() const { return InsertPos; }
-  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+  InstTy *getInsertPos() const { return InsertPos; }
+  void setInsertPos(InstTy *Inst) { InsertPos = Inst; }
 
   /// Add metadata (e.g. alias info) from the instructions in this group to \p
   /// NewInst.
@@ -316,12 +320,7 @@ public:
   /// FIXME: this function currently does not add noalias metadata a'la
   /// addNewMedata.  To do that we need to compute the intersection of the
   /// noalias info from all members.
-  void addMetadata(Instruction *NewInst) const {
-    SmallVector<Value *, 4> VL;
-    std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
-                   [](std::pair<int, Instruction *> p) { return p.second; });
-    propagateMetadata(NewInst, VL);
-  }
+  void addMetadata(InstTy *NewInst) const;
 
   /// Returns true if this Group requires a scalar iteration to handle gaps.
   bool requiresScalarEpilogue() const {
@@ -344,7 +343,7 @@ private:
   unsigned Factor; // Interleave Factor.
   bool Reverse;
   unsigned Align;
-  DenseMap<int, Instruction *> Members;
+  DenseMap<int, InstTy *> Members;
   int SmallestKey = 0;
   int LargestKey = 0;
 
@@ -359,7 +358,7 @@ private:
   //      store i32 %even
   //      %odd = add i32               // Def of %odd
   //      store i32 %odd               // Insert Position
-  Instruction *InsertPos;
+  InstTy *InsertPos;
 };
 
 /// Drive the analysis of interleaved memory accesses in the loop.
@@ -390,7 +389,7 @@ public:
   /// formation for predicated accesses, we may be able to relax this limitation
   /// in the future once we handle more complicated blocks.
   void reset() {
-    SmallPtrSet<InterleaveGroup *, 4> DelSet;
+    SmallPtrSet<InterleaveGroup<Instruction> *, 4> DelSet;
     // Avoid releasing a pointer twice.
     for (auto &I : InterleaveGroupMap)
       DelSet.insert(I.second);
@@ -409,11 +408,16 @@ public:
   /// Get the interleave group that \p Instr belongs to.
   ///
   /// \returns nullptr if doesn't have such group.
-  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
-    auto Group = InterleaveGroupMap.find(Instr);
-    if (Group == InterleaveGroupMap.end())
-      return nullptr;
-    return Group->second;
+  InterleaveGroup<Instruction> *
+  getInterleaveGroup(const Instruction *Instr) const {
+    if (InterleaveGroupMap.count(Instr))
+      return InterleaveGroupMap.find(Instr)->second;
+    return nullptr;
+  }
+
+  iterator_range<SmallPtrSetIterator<llvm::InterleaveGroup<Instruction> *>>
+  getInterleaveGroups() {
+    return make_range(InterleaveGroups.begin(), InterleaveGroups.end());
   }
 
   /// Returns true if an interleaved group that may access memory
@@ -443,7 +447,9 @@ private:
   bool RequiresScalarEpilogue = false;
 
   /// Holds the relationships between the members and the interleave group.
-  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+  DenseMap<Instruction *, InterleaveGroup<Instruction> *> InterleaveGroupMap;
+
+  SmallPtrSet<InterleaveGroup<Instruction> *, 4> InterleaveGroups;
 
   /// Holds dependences among the memory accesses in the loop. It maps a source
   /// access to a set of dependent sink accesses.
@@ -476,19 +482,23 @@ private:
   /// stride \p Stride and alignment \p Align.
   ///
   /// \returns the newly created interleave group.
-  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
-                                         unsigned Align) {
-    assert(!isInterleaved(Instr) && "Already in an interleaved access group");
-    InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
+  InterleaveGroup<Instruction> *
+  createInterleaveGroup(Instruction *Instr, int Stride, unsigned Align) {
+    assert(!InterleaveGroupMap.count(Instr) &&
+           "Already in an interleaved access group");
+    InterleaveGroupMap[Instr] =
+        new InterleaveGroup<Instruction>(Instr, Stride, Align);
+    InterleaveGroups.insert(InterleaveGroupMap[Instr]);
     return InterleaveGroupMap[Instr];
   }
 
   /// Release the group and remove all the relationships.
-  void releaseGroup(InterleaveGroup *Group) {
+  void releaseGroup(InterleaveGroup<Instruction> *Group) {
     for (unsigned i = 0; i < Group->getFactor(); i++)
       if (Instruction *Member = Group->getMember(i))
         InterleaveGroupMap.erase(Member);
 
+    InterleaveGroups.erase(Group);
     delete Group;
   }
 
diff --git a/include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h b/include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..de44f41720edb3369350ac1d95fc1a4700971e5e
--- /dev/null
+++ b/include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h
@@ -0,0 +1,70 @@
+//===- AMDGPUMetadataVerifier.h - MsgPack Types -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This is a verifier for AMDGPU HSA metadata, which can verify both
+/// well-typed metadata and untyped metadata. When verifying in the non-strict
+/// mode, untyped metadata is coerced into the correct type if possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H
+#define LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H
+
+#include "llvm/BinaryFormat/MsgPackTypes.h"
+
+namespace llvm {
+namespace AMDGPU {
+namespace HSAMD {
+namespace V3 {
+
+/// Verifier for AMDGPU HSA metadata.
+///
+/// Operates in two modes:
+///
+/// In strict mode, metadata must already be well-typed.
+///
+/// In non-strict mode, metadata is coerced into expected types when possible.
+class MetadataVerifier {
+  bool Strict;
+
+  bool verifyScalar(msgpack::Node &Node, msgpack::ScalarNode::ScalarKind SKind,
+                    function_ref<bool(msgpack::ScalarNode &)> verifyValue = {});
+  bool verifyInteger(msgpack::Node &Node);
+  bool verifyArray(msgpack::Node &Node,
+                   function_ref<bool(msgpack::Node &)> verifyNode,
+                   Optional<size_t> Size = None);
+  bool verifyEntry(msgpack::MapNode &MapNode, StringRef Key, bool Required,
+                   function_ref<bool(msgpack::Node &)> verifyNode);
+  bool
+  verifyScalarEntry(msgpack::MapNode &MapNode, StringRef Key, bool Required,
+                    msgpack::ScalarNode::ScalarKind SKind,
+                    function_ref<bool(msgpack::ScalarNode &)> verifyValue = {});
+  bool verifyIntegerEntry(msgpack::MapNode &MapNode, StringRef Key,
+                          bool Required);
+  bool verifyKernelArgs(msgpack::Node &Node);
+  bool verifyKernel(msgpack::Node &Node);
+
+public:
+  /// Construct a MetadataVerifier, specifying whether it will operate in \p
+  /// Strict mode.
+  MetadataVerifier(bool Strict) : Strict(Strict) {}
+
+  /// Verify given HSA metadata.
+  ///
+  /// \returns True when successful, false when metadata is invalid.
+  bool verify(msgpack::Node &HSAMetadataRoot);
+};
+
+} // end namespace V3
+} // end namespace HSAMD
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H
diff --git a/include/llvm/BinaryFormat/Dwarf.def b/include/llvm/BinaryFormat/Dwarf.def
index 512cc64926db5a6e0692895a6af1660c0e9d6a8f..cb9f7f5145d1fbc48ea71945e5a083dc1f18b01c 100644
--- a/include/llvm/BinaryFormat/Dwarf.def
+++ b/include/llvm/BinaryFormat/Dwarf.def
@@ -18,7 +18,8 @@
     defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED ||             \
     defined HANDLE_DW_CC || defined HANDLE_DW_LNS || defined HANDLE_DW_LNE ||  \
     defined HANDLE_DW_LNCT || defined HANDLE_DW_MACRO ||                       \
-    defined HANDLE_DW_RLE || defined HANDLE_DW_CFA ||                          \
+    defined HANDLE_DW_RLE ||                                                   \
+    (defined HANDLE_DW_CFA && defined HANDLE_DW_CFA_PRED) ||                   \
     defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT ||                \
     defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX ||                   \
     defined HANDLE_DW_END)
@@ -85,6 +86,10 @@
 #define HANDLE_DW_CFA(ID, NAME)
 #endif
 
+#ifndef HANDLE_DW_CFA_PRED
+#define HANDLE_DW_CFA_PRED(ID, NAME, PRED)
+#endif
+
 #ifndef HANDLE_DW_APPLE_PROPERTY
 #define HANDLE_DW_APPLE_PROPERTY(ID, NAME)
 #endif
@@ -831,9 +836,10 @@ HANDLE_DW_CFA(0x14, val_offset)
 HANDLE_DW_CFA(0x15, val_offset_sf)
 HANDLE_DW_CFA(0x16, val_expression)
 // Vendor extensions:
-HANDLE_DW_CFA(0x1d, MIPS_advance_loc8)
-HANDLE_DW_CFA(0x2d, GNU_window_save)
-HANDLE_DW_CFA(0x2e, GNU_args_size)
+HANDLE_DW_CFA_PRED(0x1d, MIPS_advance_loc8, SELECT_MIPS64)
+HANDLE_DW_CFA_PRED(0x2d, GNU_window_save, SELECT_SPARC)
+HANDLE_DW_CFA_PRED(0x2d, AARCH64_negate_ra_state, SELECT_AARCH64)
+HANDLE_DW_CFA_PRED(0x2e, GNU_args_size, SELECT_X86)
 
 // Apple Objective-C Property Attributes.
 // Keep this list in sync with clang's DeclSpec.h ObjCPropertyAttributeKind!
@@ -916,6 +922,7 @@ HANDLE_DW_IDX(0x05, type_hash)
 #undef HANDLE_DW_MACRO
 #undef HANDLE_DW_RLE
 #undef HANDLE_DW_CFA
+#undef HANDLE_DW_CFA_PRED
 #undef HANDLE_DW_APPLE_PROPERTY
 #undef HANDLE_DW_UT
 #undef HANDLE_DWARF_SECTION
diff --git a/include/llvm/BinaryFormat/Dwarf.h b/include/llvm/BinaryFormat/Dwarf.h
index 330e31cffae678d3c7dea13a4729e6b9b3ae7edf..c25b6cd0966b13ef37982833fb3591b81c18ae7a 100644
--- a/include/llvm/BinaryFormat/Dwarf.h
+++ b/include/llvm/BinaryFormat/Dwarf.h
@@ -26,6 +26,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadicDetails.h"
+#include "llvm/ADT/Triple.h"
 
 namespace llvm {
 class StringRef;
@@ -272,6 +273,7 @@ enum RangeListEntries {
 /// Call frame instruction encodings.
 enum CallFrameInfo {
 #define HANDLE_DW_CFA(ID, NAME) DW_CFA_##NAME = ID,
+#define HANDLE_DW_CFA_PRED(ID, NAME, ARCH) DW_CFA_##NAME = ID,
 #include "llvm/BinaryFormat/Dwarf.def"
   DW_CFA_extended = 0x00,
 
@@ -430,7 +432,7 @@ StringRef LNStandardString(unsigned Standard);
 StringRef LNExtendedString(unsigned Encoding);
 StringRef MacinfoString(unsigned Encoding);
 StringRef RangeListEncodingString(unsigned Encoding);
-StringRef CallFrameString(unsigned Encoding);
+StringRef CallFrameString(unsigned Encoding, Triple::ArchType Arch);
 StringRef ApplePropertyString(unsigned);
 StringRef UnitTypeString(unsigned);
 StringRef AtomTypeString(unsigned Atom);
diff --git a/include/llvm/BinaryFormat/ELF.h b/include/llvm/BinaryFormat/ELF.h
index ebbf830a60e9db9318dcc2d8a580da094250e74e..ce35d127d43390e764795c22c98bed1ac073e464 100644
--- a/include/llvm/BinaryFormat/ELF.h
+++ b/include/llvm/BinaryFormat/ELF.h
@@ -582,6 +582,7 @@ enum {
   EF_HEXAGON_MACH_V60 = 0x00000060, // Hexagon V60
   EF_HEXAGON_MACH_V62 = 0x00000062, // Hexagon V62
   EF_HEXAGON_MACH_V65 = 0x00000065, // Hexagon V65
+  EF_HEXAGON_MACH_V66 = 0x00000066, // Hexagon V66
 
   // Highest ISA version flags
   EF_HEXAGON_ISA_MACH = 0x00000000, // Same as specified in bits[11:0]
@@ -594,6 +595,7 @@ enum {
   EF_HEXAGON_ISA_V60 = 0x00000060,  // Hexagon V60 ISA
   EF_HEXAGON_ISA_V62 = 0x00000062,  // Hexagon V62 ISA
   EF_HEXAGON_ISA_V65 = 0x00000065,  // Hexagon V65 ISA
+  EF_HEXAGON_ISA_V66 = 0x00000066,  // Hexagon V66 ISA
 };
 
 // Hexagon-specific section indexes for common small data
@@ -1359,7 +1361,7 @@ enum {
   GNU_PROPERTY_X86_FEATURE_1_SHSTK = 1 << 1
 };
 
-// AMDGPU specific notes.
+// AMD specific notes. (Code Object V2)
 enum {
   // Note types with values between 0 and 9 (inclusive) are reserved.
   NT_AMD_AMDGPU_HSA_METADATA = 10,
@@ -1367,6 +1369,12 @@ enum {
   NT_AMD_AMDGPU_PAL_METADATA = 12
 };
 
+// AMDGPU specific notes. (Code Object V3)
+enum {
+  // Note types with values between 0 and 31 (inclusive) are reserved.
+  NT_AMDGPU_METADATA = 32
+};
+
 enum {
   GNU_ABI_TAG_LINUX = 0,
   GNU_ABI_TAG_HURD = 1,
@@ -1377,6 +1385,8 @@ enum {
   GNU_ABI_TAG_NACL = 6,
 };
 
+constexpr const char *ELF_NOTE_GNU = "GNU";
+
 // Android packed relocation group flags.
 enum {
   RELOCATION_GROUPED_BY_INFO_FLAG = 1,
diff --git a/include/llvm/BinaryFormat/MsgPackTypes.h b/include/llvm/BinaryFormat/MsgPackTypes.h
new file mode 100644
index 0000000000000000000000000000000000000000..f96cd4c338fd9e55edccbe0d784deefdcc0755ff
--- /dev/null
+++ b/include/llvm/BinaryFormat/MsgPackTypes.h
@@ -0,0 +1,372 @@
+//===- MsgPackTypes.h - MsgPack Types ---------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This is a data structure for representing MessagePack "documents", with
+/// methods to go to and from MessagePack. The types also specialize YAMLIO
+/// traits in order to go to and from YAML.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/BinaryFormat/MsgPackReader.h"
+#include "llvm/BinaryFormat/MsgPackWriter.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <vector>
+
+#ifndef LLVM_BINARYFORMAT_MSGPACKTYPES_H
+#define LLVM_BINARYFORMAT_MSGPACKTYPES_H
+
+namespace llvm {
+namespace msgpack {
+
+class Node;
+
+/// Short-hand for a Node pointer.
+using NodePtr = std::shared_ptr<Node>;
+
+/// Short-hand for an Optional Node pointer.
+using OptNodePtr = Optional<NodePtr>;
+
+/// Abstract base-class which can be any MessagePack type.
+class Node {
+public:
+  enum NodeKind {
+    NK_Scalar,
+    NK_Array,
+    NK_Map,
+  };
+
+private:
+  virtual void anchor() = 0;
+  const NodeKind Kind;
+
+  static Expected<OptNodePtr> readArray(Reader &MPReader, size_t Length);
+  static Expected<OptNodePtr> readMap(Reader &MPReader, size_t Length);
+
+public:
+  NodeKind getKind() const { return Kind; }
+
+  /// Construct a Node. Used by derived classes to track kind information.
+  Node(NodeKind Kind) : Kind(Kind) {}
+
+  virtual ~Node() = default;
+
+  /// Read from a MessagePack reader \p MPReader, returning an error if one is
+  /// encountered, or None if \p MPReader is at the end of stream, or some Node
+  /// pointer if some type is read.
+  static Expected<OptNodePtr> read(Reader &MPReader);
+
+  /// Write to a MessagePack writer \p MPWriter.
+  virtual void write(Writer &MPWriter) = 0;
+};
+
+/// A MessagePack scalar.
+class ScalarNode : public Node {
+public:
+  enum ScalarKind {
+    SK_Int,
+    SK_UInt,
+    SK_Nil,
+    SK_Boolean,
+    SK_Float,
+    SK_String,
+    SK_Binary,
+  };
+
+private:
+  void anchor() override;
+
+  void destroy();
+
+  ScalarKind SKind;
+
+  union {
+    int64_t IntValue;
+    uint64_t UIntValue;
+    bool BoolValue;
+    double FloatValue;
+    std::string StringValue;
+  };
+
+public:
+  /// Construct an Int ScalarNode.
+  ScalarNode(int64_t IntValue);
+  /// Construct an Int ScalarNode.
+  ScalarNode(int32_t IntValue);
+  /// Construct an UInt ScalarNode.
+  ScalarNode(uint64_t UIntValue);
+  /// Construct an UInt ScalarNode.
+  ScalarNode(uint32_t UIntValue);
+  /// Construct a Nil ScalarNode.
+  ScalarNode();
+  /// Construct a Boolean ScalarNode.
+  ScalarNode(bool BoolValue);
+  /// Construct a Float ScalarNode.
+  ScalarNode(double FloatValue);
+  /// Construct a String ScalarNode.
+  ScalarNode(StringRef StringValue);
+  /// Construct a String ScalarNode.
+  ScalarNode(const char *StringValue);
+  /// Construct a String ScalarNode.
+  ScalarNode(std::string &&StringValue);
+  /// Construct a Binary ScalarNode.
+  ScalarNode(MemoryBufferRef BinaryValue);
+
+  ~ScalarNode();
+
+  ScalarNode &operator=(const ScalarNode &RHS) = delete;
+  /// A ScalarNode can only be move assigned.
+  ScalarNode &operator=(ScalarNode &&RHS);
+
+  /// Change the kind of this ScalarNode, zero initializing it to the new type.
+  void setScalarKind(ScalarKind SKind) {
+    switch (SKind) {
+    case SK_Int:
+      *this = int64_t(0);
+      break;
+    case SK_UInt:
+      *this = uint64_t(0);
+      break;
+    case SK_Boolean:
+      *this = false;
+      break;
+    case SK_Float:
+      *this = 0.0;
+      break;
+    case SK_String:
+      *this = StringRef();
+      break;
+    case SK_Binary:
+      *this = MemoryBufferRef("", "");
+      break;
+    case SK_Nil:
+      *this = ScalarNode();
+      break;
+    }
+  }
+
+  /// Get the current kind of ScalarNode.
+  ScalarKind getScalarKind() { return SKind; }
+
+  /// Get the value of an Int scalar.
+  ///
+  /// \warning Assumes getScalarKind() == SK_Int
+  int64_t getInt() {
+    assert(SKind == SK_Int);
+    return IntValue;
+  }
+
+  /// Get the value of a UInt scalar.
+  ///
+  /// \warning Assumes getScalarKind() == SK_UInt
+  uint64_t getUInt() {
+    assert(SKind == SK_UInt);
+    return UIntValue;
+  }
+
+  /// Get the value of an Boolean scalar.
+  ///
+  /// \warning Assumes getScalarKind() == SK_Boolean
+  bool getBool() {
+    assert(SKind == SK_Boolean);
+    return BoolValue;
+  }
+
+  /// Get the value of an Float scalar.
+  ///
+  /// \warning Assumes getScalarKind() == SK_Float
+  double getFloat() {
+    assert(SKind == SK_Float);
+    return FloatValue;
+  }
+
+  /// Get the value of a String scalar.
+  ///
+  /// \warning Assumes getScalarKind() == SK_String
+  StringRef getString() {
+    assert(SKind == SK_String);
+    return StringValue;
+  }
+
+  /// Get the value of a Binary scalar.
+  ///
+  /// \warning Assumes getScalarKind() == SK_Binary
+  StringRef getBinary() {
+    assert(SKind == SK_Binary);
+    return StringValue;
+  }
+
+  static bool classof(const Node *N) { return N->getKind() == NK_Scalar; }
+
+  void write(Writer &MPWriter) override;
+
+  /// Parse a YAML scalar of the current ScalarKind from \p ScalarStr.
+  ///
+  /// \returns An empty string on success, otherwise an error message.
+  StringRef inputYAML(StringRef ScalarStr);
+
+  /// Output a YAML scalar of the current ScalarKind into \p OS.
+  void outputYAML(raw_ostream &OS) const;
+
+  /// Determine which YAML quoting type the current value would need when
+  /// output.
+  yaml::QuotingType mustQuoteYAML(StringRef ScalarStr) const;
+
+  /// Get the YAML tag for the current ScalarKind.
+  StringRef getYAMLTag() const;
+
+  /// Flag which affects how the type handles YAML tags when reading and
+  /// writing.
+  ///
+  /// When false, tags are used when reading and writing. When reading, the tag
+  /// is used to decide the ScalarKind before parsing. When writing, the tag is
+  /// output along with the value.
+  ///
+  /// When true, tags are ignored when reading and writing. When reading, the
+  /// ScalarKind is always assumed to be String. When writing, the tag is not
+  /// output.
+  bool IgnoreTag = false;
+
+  static const char *IntTag;
+  static const char *NilTag;
+  static const char *BooleanTag;
+  static const char *FloatTag;
+  static const char *StringTag;
+  static const char *BinaryTag;
+};
+
+class ArrayNode : public Node, public std::vector<NodePtr> {
+  void anchor() override;
+
+public:
+  ArrayNode() : Node(NK_Array) {}
+  static bool classof(const Node *N) { return N->getKind() == NK_Array; }
+
+  void write(Writer &MPWriter) override {
+    MPWriter.writeArraySize(this->size());
+    for (auto &N : *this)
+      N->write(MPWriter);
+  }
+};
+
+class MapNode : public Node, public StringMap<NodePtr> {
+  void anchor() override;
+
+public:
+  MapNode() : Node(NK_Map) {}
+  static bool classof(const Node *N) { return N->getKind() == NK_Map; }
+
+  void write(Writer &MPWriter) override {
+    MPWriter.writeMapSize(this->size());
+    for (auto &N : *this) {
+      MPWriter.write(N.first());
+      N.second->write(MPWriter);
+    }
+  }
+};
+
+} // end namespace msgpack
+
+namespace yaml {
+
+template <> struct PolymorphicTraits<msgpack::NodePtr> {
+  static NodeKind getKind(const msgpack::NodePtr &N) {
+    if (isa<msgpack::ScalarNode>(*N))
+      return NodeKind::Scalar;
+    if (isa<msgpack::MapNode>(*N))
+      return NodeKind::Map;
+    if (isa<msgpack::ArrayNode>(*N))
+      return NodeKind::Sequence;
+    llvm_unreachable("NodeKind not supported");
+  }
+  static msgpack::ScalarNode &getAsScalar(msgpack::NodePtr &N) {
+    if (!N || !isa<msgpack::ScalarNode>(*N))
+      N.reset(new msgpack::ScalarNode());
+    return *cast<msgpack::ScalarNode>(N.get());
+  }
+  static msgpack::MapNode &getAsMap(msgpack::NodePtr &N) {
+    if (!N || !isa<msgpack::MapNode>(*N))
+      N.reset(new msgpack::MapNode());
+    return *cast<msgpack::MapNode>(N.get());
+  }
+  static msgpack::ArrayNode &getAsSequence(msgpack::NodePtr &N) {
+    if (!N || !isa<msgpack::ArrayNode>(*N))
+      N.reset(new msgpack::ArrayNode());
+    return *cast<msgpack::ArrayNode>(N.get());
+  }
+};
+
+template <> struct TaggedScalarTraits<msgpack::ScalarNode> {
+  static void output(const msgpack::ScalarNode &S, void *Ctxt,
+                     raw_ostream &ScalarOS, raw_ostream &TagOS) {
+    if (!S.IgnoreTag)
+      TagOS << S.getYAMLTag();
+    S.outputYAML(ScalarOS);
+  }
+
+  static StringRef input(StringRef ScalarStr, StringRef Tag, void *Ctxt,
+                         msgpack::ScalarNode &S) {
+    if (Tag == msgpack::ScalarNode::IntTag) {
+      S.setScalarKind(msgpack::ScalarNode::SK_UInt);
+      if (S.inputYAML(ScalarStr) == StringRef())
+        return StringRef();
+      S.setScalarKind(msgpack::ScalarNode::SK_Int);
+      return S.inputYAML(ScalarStr);
+    }
+
+    if (S.IgnoreTag || Tag == msgpack::ScalarNode::StringTag ||
+        Tag == "tag:yaml.org,2002:str")
+      S.setScalarKind(msgpack::ScalarNode::SK_String);
+    else if (Tag == msgpack::ScalarNode::NilTag)
+      S.setScalarKind(msgpack::ScalarNode::SK_Nil);
+    else if (Tag == msgpack::ScalarNode::BooleanTag)
+      S.setScalarKind(msgpack::ScalarNode::SK_Boolean);
+    else if (Tag == msgpack::ScalarNode::FloatTag)
+      S.setScalarKind(msgpack::ScalarNode::SK_Float);
+    else if (Tag == msgpack::ScalarNode::StringTag)
+      S.setScalarKind(msgpack::ScalarNode::SK_String);
+    else if (Tag == msgpack::ScalarNode::BinaryTag)
+      S.setScalarKind(msgpack::ScalarNode::SK_Binary);
+    else
+      return "Unsupported messagepack tag";
+
+    return S.inputYAML(ScalarStr);
+  }
+
+  static QuotingType mustQuote(const msgpack::ScalarNode &S, StringRef Str) {
+    return S.mustQuoteYAML(Str);
+  }
+};
+
+template <> struct CustomMappingTraits<msgpack::MapNode> {
+  static void inputOne(IO &IO, StringRef Key, msgpack::MapNode &M) {
+    IO.mapRequired(Key.str().c_str(), M[Key]);
+  }
+  static void output(IO &IO, msgpack::MapNode &M) {
+    for (auto &N : M)
+      IO.mapRequired(N.getKey().str().c_str(), N.getValue());
+  }
+};
+
+template <> struct SequenceTraits<msgpack::ArrayNode> {
+  static size_t size(IO &IO, msgpack::ArrayNode &A) { return A.size(); }
+  static msgpack::NodePtr &element(IO &IO, msgpack::ArrayNode &A,
+                                   size_t Index) {
+    if (Index >= A.size())
+      A.resize(Index + 1);
+    return A[Index];
+  }
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+#endif //  LLVM_BINARYFORMAT_MSGPACKTYPES_H
diff --git a/include/llvm/BinaryFormat/Wasm.h b/include/llvm/BinaryFormat/Wasm.h
index 3d25c9d15e4e7ff2fe73c716fd0425efa443bc06..ca752374e157087a22688c45d5e350bb99c52920 100644
--- a/include/llvm/BinaryFormat/Wasm.h
+++ b/include/llvm/BinaryFormat/Wasm.h
@@ -35,6 +35,14 @@ struct WasmObjectHeader {
   uint32_t Version;
 };
 
+struct WasmDylinkInfo {
+  uint32_t MemorySize; // Memory size in bytes
+  uint32_t MemoryAlignment;  // P2 alignment of memory
+  uint32_t TableSize;  // Table size in elements
+  uint32_t TableAlignment;  // P2 alignment of table
+  std::vector<StringRef> Needed; // Shared library depenedencies
+};
+
 struct WasmExport {
   StringRef Name;
   uint8_t Kind;
@@ -75,6 +83,18 @@ struct WasmGlobal {
   StringRef SymbolName; // from the "linking" section
 };
 
+struct WasmEventType {
+  // Kind of event. Currently only WASM_EVENT_ATTRIBUTE_EXCEPTION is possible.
+  uint32_t Attribute;
+  uint32_t SigIndex;
+};
+
+struct WasmEvent {
+  uint32_t Index;
+  WasmEventType Type;
+  StringRef SymbolName; // from the "linking" section
+};
+
 struct WasmImport {
   StringRef Module;
   StringRef Field;
@@ -84,6 +104,7 @@ struct WasmImport {
     WasmGlobalType Global;
     WasmTable Table;
     WasmLimits Memory;
+    WasmEventType Event;
   };
 };
 
@@ -167,18 +188,20 @@ struct WasmLinkingData {
 };
 
 enum : unsigned {
-  WASM_SEC_CUSTOM = 0,   // Custom / User-defined section
-  WASM_SEC_TYPE = 1,     // Function signature declarations
-  WASM_SEC_IMPORT = 2,   // Import declarations
-  WASM_SEC_FUNCTION = 3, // Function declarations
-  WASM_SEC_TABLE = 4,    // Indirect function table and other tables
-  WASM_SEC_MEMORY = 5,   // Memory attributes
-  WASM_SEC_GLOBAL = 6,   // Global declarations
-  WASM_SEC_EXPORT = 7,   // Exports
-  WASM_SEC_START = 8,    // Start function declaration
-  WASM_SEC_ELEM = 9,     // Elements section
-  WASM_SEC_CODE = 10,    // Function bodies (code)
-  WASM_SEC_DATA = 11     // Data segments
+  WASM_SEC_CUSTOM = 0,     // Custom / User-defined section
+  WASM_SEC_TYPE = 1,       // Function signature declarations
+  WASM_SEC_IMPORT = 2,     // Import declarations
+  WASM_SEC_FUNCTION = 3,   // Function declarations
+  WASM_SEC_TABLE = 4,      // Indirect function table and other tables
+  WASM_SEC_MEMORY = 5,     // Memory attributes
+  WASM_SEC_GLOBAL = 6,     // Global declarations
+  WASM_SEC_EXPORT = 7,     // Exports
+  WASM_SEC_START = 8,      // Start function declaration
+  WASM_SEC_ELEM = 9,       // Elements section
+  WASM_SEC_CODE = 10,      // Function bodies (code)
+  WASM_SEC_DATA = 11,      // Data segments
+  WASM_SEC_DATACOUNT = 12, // Data segment count
+  WASM_SEC_EVENT = 13      // Event declarations
 };
 
 // Type immediate encodings used in various contexts.
@@ -200,6 +223,7 @@ enum : unsigned {
   WASM_EXTERNAL_TABLE = 0x1,
   WASM_EXTERNAL_MEMORY = 0x2,
   WASM_EXTERNAL_GLOBAL = 0x3,
+  WASM_EXTERNAL_EVENT = 0x4,
 };
 
 // Opcodes used in initializer expressions.
@@ -243,6 +267,12 @@ enum WasmSymbolType : unsigned {
   WASM_SYMBOL_TYPE_DATA = 0x1,
   WASM_SYMBOL_TYPE_GLOBAL = 0x2,
   WASM_SYMBOL_TYPE_SECTION = 0x3,
+  WASM_SYMBOL_TYPE_EVENT = 0x4,
+};
+
+// Kinds of event attributes.
+enum WasmEventAttribute : unsigned {
+  WASM_EVENT_ATTRIBUTE_EXCEPTION = 0x0,
 };
 
 const unsigned WASM_SYMBOL_BINDING_MASK = 0x3;
diff --git a/include/llvm/BinaryFormat/WasmRelocs.def b/include/llvm/BinaryFormat/WasmRelocs.def
index 8ffd51e483f38bdf9cba9e289d7a0a16601f7987..b3a08e70c1d5d55b8333f4663c1bd50cc5655733 100644
--- a/include/llvm/BinaryFormat/WasmRelocs.def
+++ b/include/llvm/BinaryFormat/WasmRelocs.def
@@ -1,4 +1,3 @@
-
 #ifndef WASM_RELOC
 #error "WASM_RELOC must be defined"
 #endif
@@ -13,3 +12,4 @@ WASM_RELOC(R_WEBASSEMBLY_TYPE_INDEX_LEB,       6)
 WASM_RELOC(R_WEBASSEMBLY_GLOBAL_INDEX_LEB,     7)
 WASM_RELOC(R_WEBASSEMBLY_FUNCTION_OFFSET_I32,  8)
 WASM_RELOC(R_WEBASSEMBLY_SECTION_OFFSET_I32,   9)
+WASM_RELOC(R_WEBASSEMBLY_EVENT_INDEX_LEB,     10)
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index 5467ecc26262a7f67dc08458e8ea714d62f21d49..f0d11e9c16894e849e6d83f21374e5eb8e13cdab 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -342,6 +342,7 @@ enum ConstantsCodes {
   CST_CODE_INLINEASM = 23,       // INLINEASM:     [sideeffect|alignstack|
                                  //                 asmdialect,asmstr,conststr]
   CST_CODE_CE_GEP_WITH_INRANGE_INDEX = 24, //      [opty, flags, n x operands]
+  CST_CODE_CE_UNOP = 25,         // CE_UNOP:      [opcode, opval]
 };
 
 /// CastOpcodes - These are values used in the bitcode files to encode which
@@ -364,6 +365,14 @@ enum CastOpcodes {
   CAST_ADDRSPACECAST = 12
 };
 
+/// UnaryOpcodes - These are values used in the bitcode files to encode which
+/// unop a CST_CODE_CE_UNOP or a XXX refers to.  The values of these enums
+/// have no fixed relation to the LLVM IR enum values.  Changing these will
+/// break compatibility with old files.
+enum UnaryOpcodes {
+  UNOP_NEG = 0
+};
+
 /// BinaryOpcodes - These are values used in the bitcode files to encode which
 /// binop a CST_CODE_CE_BINOP or a XXX refers to.  The values of these enums
 /// have no fixed relation to the LLVM IR enum values.  Changing these will
@@ -524,6 +533,7 @@ enum FunctionCodes {
   // 53 is unused.
   // 54 is unused.
   FUNC_CODE_OPERAND_BUNDLE = 55, // OPERAND_BUNDLE: [tag#, value...]
+  FUNC_CODE_INST_UNOP = 56,      // UNOP:       [opcode, ty, opval]
 };
 
 enum UseListCodes {
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index 14bc08167825e3d7332d59f8457326b22f15446d..413901d218f961e1b70577f25925b3d77f52a1e2 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -71,6 +71,7 @@ class MCTargetOptions;
 class MDNode;
 class Module;
 class raw_ostream;
+class StackMaps;
 class TargetLoweringObjectFile;
 class TargetMachine;
 
@@ -137,6 +138,9 @@ private:
 
   static char ID;
 
+protected:
+  /// Protected struct HandlerInfo and Handlers permit target extended
+  /// AsmPrinter adds their own handlers.
   struct HandlerInfo {
     AsmPrinterHandler *Handler;
     const char *TimerName;
@@ -365,6 +369,9 @@ public:
   /// emit the proxies we previously omitted in EmitGlobalVariable.
   void emitGlobalGOTEquivs();
 
+  /// Emit the stack maps.
+  void emitStackMaps(StackMaps &SM);
+
   //===------------------------------------------------------------------===//
   // Overridable Hooks
   //===------------------------------------------------------------------===//
@@ -542,7 +549,7 @@ public:
   ///
   /// \p Value - The value to emit.
   /// \p Size - The size of the integer (in bytes) to emit.
-  virtual void EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const;
+  virtual void EmitDebugValue(const MCExpr *Value, unsigned Size) const;
 
   //===------------------------------------------------------------------===//
   // Dwarf Lowering Routines
@@ -652,6 +659,8 @@ private:
   void EmitLLVMUsedList(const ConstantArray *InitList);
   /// Emit llvm.ident metadata in an '.ident' directive.
   void EmitModuleIdents(Module &M);
+  /// Emit bytes for llvm.commandline metadata.
+  void EmitModuleCommandLines(Module &M);
   void EmitXXStructorList(const DataLayout &DL, const Constant *List,
                           bool isCtor);
 
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/include/llvm/CodeGen/AsmPrinterHandler.h
similarity index 92%
rename from lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
rename to include/llvm/CodeGen/AsmPrinterHandler.h
index f5ac95a20b10db2df8e6b84f0d2eecf47ca46a44..a8b13200dd4e33eba1301e9ec5e1ffb296ee59ad 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
+++ b/include/llvm/CodeGen/AsmPrinterHandler.h
@@ -1,4 +1,4 @@
-//===-- lib/CodeGen/AsmPrinter/AsmPrinterHandler.h -------------*- C++ -*--===//
+//===-- llvm/CodeGen/AsmPrinterHandler.h -----------------------*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H
-#define LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H
+#ifndef LLVM_CODEGEN_ASMPRINTERHANDLER_H
+#define LLVM_CODEGEN_ASMPRINTERHANDLER_H
 
 #include "llvm/Support/DataTypes.h"
 
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index 224a41bc2b7aba965904741c4567ffa942f01ea6..f105d887c397f3386eb1dfe9acd6019c17faafc4 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -118,6 +118,50 @@ private:
     return Cost;
   }
 
+  /// Estimate a cost of subvector extraction as a sequence of extract and
+  /// insert operations.
+  unsigned getExtractSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
+    assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
+           "Can only extract subvectors from vectors");
+    int NumSubElts = SubTy->getVectorNumElements();
+    assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
+           "SK_ExtractSubvector index out of range");
+
+    unsigned Cost = 0;
+    // Subvector extraction cost is equal to the cost of extracting element from
+    // the source type plus the cost of inserting them into the result vector
+    // type.
+    for (int i = 0; i != NumSubElts; ++i) {
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::ExtractElement, Ty, i + Index);
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, SubTy, i);
+    }
+    return Cost;
+  }
+
+  /// Estimate a cost of subvector insertion as a sequence of extract and
+  /// insert operations.
+  unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
+    assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
+           "Can only insert subvectors into vectors");
+    int NumSubElts = SubTy->getVectorNumElements();
+    assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
+           "SK_InsertSubvector index out of range");
+
+    unsigned Cost = 0;
+    // Subvector insertion cost is equal to the cost of extracting element from
+    // the source type plus the cost of inserting them into the result vector
+    // type.
+    for (int i = 0; i != NumSubElts; ++i) {
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::ExtractElement, SubTy, i);
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, Ty, i + Index);
+    }
+    return Cost;
+  }
+
   /// Local query method delegates up to T which *must* implement this!
   const TargetSubtargetInfo *getST() const {
     return static_cast<const T *>(this)->getST();
@@ -579,9 +623,12 @@ public:
     case TTI::SK_PermuteSingleSrc:
     case TTI::SK_PermuteTwoSrc:
       return getPermuteShuffleOverhead(Tp);
-    default:
-      return 1;
+    case TTI::SK_ExtractSubvector:
+      return getExtractSubvectorOverhead(Tp, Index, SubTp);
+    case TTI::SK_InsertSubvector:
+      return getInsertSubvectorOverhead(Tp, Index, SubTp);
     }
+    llvm_unreachable("Unknown TTI::ShuffleKind");
   }
 
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -961,6 +1008,7 @@ public:
                                  unsigned VF = 1) {
     unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
     assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+    auto *ConcreteTTI = static_cast<T *>(this);
 
     switch (IID) {
     default: {
@@ -986,29 +1034,24 @@ public:
         ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
       }
 
-      return static_cast<T *>(this)->
-        getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
+      return ConcreteTTI->getIntrinsicInstrCost(IID, RetTy, Types, FMF,
+                                                ScalarizationCost);
     }
     case Intrinsic::masked_scatter: {
       assert(VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
-      return
-        static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
-                                                       Args[0]->getType(),
-                                                       Args[1], VarMask,
-                                                       Alignment);
+      return ConcreteTTI->getGatherScatterOpCost(
+          Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
     }
     case Intrinsic::masked_gather: {
       assert(VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
-      return
-        static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
-                                                       RetTy, Args[0], VarMask,
-                                                       Alignment);
+      return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy,
+                                                 Args[0], VarMask, Alignment);
     }
     case Intrinsic::experimental_vector_reduce_add:
     case Intrinsic::experimental_vector_reduce_mul:
@@ -1024,6 +1067,45 @@ public:
     case Intrinsic::experimental_vector_reduce_umax:
     case Intrinsic::experimental_vector_reduce_umin:
       return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
+    case Intrinsic::fshl:
+    case Intrinsic::fshr: {
+      Value *X = Args[0];
+      Value *Y = Args[1];
+      Value *Z = Args[2];
+      TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
+      TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
+      TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
+      TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
+      TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
+      OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
+                                                              : TTI::OP_None;
+      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+      unsigned Cost = 0;
+      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Or, RetTy);
+      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Sub, RetTy);
+      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Shl, RetTy,
+                                                  OpKindX, OpKindZ, OpPropsX);
+      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
+                                                  OpKindY, OpKindZ, OpPropsY);
+      // Non-constant shift amounts requires a modulo.
+      if (OpKindZ != TTI::OK_UniformConstantValue &&
+          OpKindZ != TTI::OK_NonUniformConstantValue)
+        Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
+                                                    OpKindZ, OpKindBW, OpPropsZ,
+                                                    OpPropsBW);
+      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
+      if (X != Y) {
+        Type *CondTy = Type::getInt1Ty(RetTy->getContext());
+        if (RetVF > 1)
+          CondTy = VectorType::get(CondTy, RetVF);
+        Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
+                                                CondTy, nullptr);
+        Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+                                                CondTy, nullptr);
+      }
+      return Cost;
+    }
     }
   }
 
@@ -1349,20 +1431,31 @@ public:
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                  NumVecElts, SubTy);
-      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
+      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, SubTy);
       Ty = SubTy;
       ++LongVectorCount;
     }
+
+    NumReduxLevels -= LongVectorCount;
+
     // The minimal length of the vector is limited by the real length of vector
     // operations performed on the current platform. That's why several final
     // reduction operations are performed on the vectors with the same
     // architecture-dependent length.
-    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
+
+    // Non pairwise reductions need one shuffle per reduction level. Pairwise
+    // reductions need two shuffles on every level, but the last one. On that
+    // level one of the shuffles is <0, u, u, ...> which is identity.
+    unsigned NumShuffles = NumReduxLevels;
+    if (IsPairwise && NumReduxLevels >= 1)
+      NumShuffles += NumReduxLevels - 1;
+    ShuffleCost += NumShuffles *
                    ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                0, Ty);
-    ArithCost += (NumReduxLevels - LongVectorCount) *
+    ArithCost += NumReduxLevels *
                  ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
-    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+    return ShuffleCost + ArithCost +
+           ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
   /// Try to calculate op costs for min/max reduction operations.
@@ -1393,37 +1486,45 @@ public:
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
       Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
+      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
+
       // Assume the pairwise shuffles add a cost.
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                  NumVecElts, SubTy);
       MinMaxCost +=
-          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
-          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, nullptr) +
+          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
                                           nullptr);
       Ty = SubTy;
-      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
       ++LongVectorCount;
     }
+
+    NumReduxLevels -= LongVectorCount;
+
     // The minimal length of the vector is limited by the real length of vector
     // operations performed on the current platform. That's why several final
     // reduction opertions are perfomed on the vectors with the same
     // architecture-dependent length.
-    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
+
+    // Non pairwise reductions need one shuffle per reduction level. Pairwise
+    // reductions need two shuffles on every level, but the last one. On that
+    // level one of the shuffles is <0, u, u, ...> which is identity.
+    unsigned NumShuffles = NumReduxLevels;
+    if (IsPairwise && NumReduxLevels >= 1)
+      NumShuffles += NumReduxLevels - 1;
+    ShuffleCost += NumShuffles *
                    ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                0, Ty);
     MinMaxCost +=
-        (NumReduxLevels - LongVectorCount) *
+        NumReduxLevels *
         (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                          nullptr));
-    // Need 3 extractelement instructions for scalarization + an additional
-    // scalar select instruction.
+    // The last min/max should be in vector registers and we counted it above.
+    // So just need a single extractelement.
     return ShuffleCost + MinMaxCost +
-           3 * getScalarizationOverhead(Ty, /*Insert=*/false,
-                                        /*Extract=*/true) +
-           ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           ScalarCondTy, nullptr);
+           ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
   unsigned getVectorSplitCost() { return 1; }
diff --git a/include/llvm/CodeGen/GCs.h b/include/llvm/CodeGen/BuiltinGCs.h
similarity index 56%
rename from include/llvm/CodeGen/GCs.h
rename to include/llvm/CodeGen/BuiltinGCs.h
index 5207f801c84ea652f858ed3e70f3a4824346eb5d..1767922fb5ac0c7d77c7c5f3a8c4445cb5b99bb6 100644
--- a/include/llvm/CodeGen/GCs.h
+++ b/include/llvm/CodeGen/BuiltinGCs.h
@@ -1,4 +1,4 @@
-//===-- GCs.h - Garbage collector linkage hacks ---------------------------===//
+//===-- BuiltinGCs.h - Garbage collector linkage hacks --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains hack functions to force linking in the GC components.
+// This file contains hack functions to force linking in the builtin GC
+// components.
 //
 //===----------------------------------------------------------------------===//
 
@@ -15,32 +16,18 @@
 #define LLVM_CODEGEN_GCS_H
 
 namespace llvm {
-class GCStrategy;
-class GCMetadataPrinter;
 
 /// FIXME: Collector instances are not useful on their own. These no longer
 ///        serve any purpose except to link in the plugins.
 
-/// Creates a CoreCLR-compatible garbage collector.
-void linkCoreCLRGC();
-
-/// Creates an ocaml-compatible garbage collector.
-void linkOcamlGC();
+/// Ensure the definition of the builtin GCs gets linked in
+void linkAllBuiltinGCs();
 
 /// Creates an ocaml-compatible metadata printer.
 void linkOcamlGCPrinter();
 
-/// Creates an erlang-compatible garbage collector.
-void linkErlangGC();
-
 /// Creates an erlang-compatible metadata printer.
 void linkErlangGCPrinter();
-
-/// Creates a shadow stack garbage collector. This collector requires no code
-/// generator support.
-void linkShadowStackGC();
-
-void linkStatepointExampleGC();
 }
 
 #endif
diff --git a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.h b/include/llvm/CodeGen/DbgEntityHistoryCalculator.h
similarity index 92%
rename from lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.h
rename to include/llvm/CodeGen/DbgEntityHistoryCalculator.h
index 660d2ff551bea061433ecf6af7290ca04765ea7b..befc28f084e7f5d5e8ab382095f2382d354324a0 100644
--- a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.h
+++ b/include/llvm/CodeGen/DbgEntityHistoryCalculator.h
@@ -1,4 +1,4 @@
-//===- llvm/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.h -----*- C++ -*-===//
+//===- llvm/CodeGen/DbgEntityHistoryCalculator.h ----------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H
-#define LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H
+#ifndef LLVM_CODEGEN_DBGVALUEHISTORYCALCULATOR_H
+#define LLVM_CODEGEN_DBGVALUEHISTORYCALCULATOR_H
 
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
@@ -84,4 +84,4 @@ void calculateDbgEntityHistory(const MachineFunction *MF,
 
 } // end namespace llvm
 
-#endif // LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H
+#endif // LLVM_CODEGEN_DBGVALUEHISTORYCALCULATOR_H
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/include/llvm/CodeGen/DebugHandlerBase.h
similarity index 94%
rename from lib/CodeGen/AsmPrinter/DebugHandlerBase.h
rename to include/llvm/CodeGen/DebugHandlerBase.h
index cdf8dc72b0753180cf82741cdda098ece7e3efa0..4f0d14d317f2a11d87fab58e01b18f6119312a4b 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
+++ b/include/llvm/CodeGen/DebugHandlerBase.h
@@ -1,4 +1,4 @@
-//===-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h --------*- C++ -*--===//
+//===-- llvm/CodeGen/DebugHandlerBase.h -----------------------*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +12,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGHANDLERBASE_H
-#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGHANDLERBASE_H
+#ifndef LLVM_CODEGEN_DEBUGHANDLERBASE_H
+#define LLVM_CODEGEN_DEBUGHANDLERBASE_H
 
-#include "AsmPrinterHandler.h"
-#include "DbgEntityHistoryCalculator.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/AsmPrinterHandler.h"
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
 #include "llvm/CodeGen/LexicalScopes.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/DebugInfoMetadata.h"
diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h
index 5fe4f89f34b22c2a991fd96bece59b513ac80981..7c658515de0952ff84f72cd67f50a0ed610f7cf4 100644
--- a/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -246,6 +246,7 @@ public:
       return 0;
     unsigned &R = ValueMap[V];
     assert(R == 0 && "Already initialized this value register!");
+    assert(VirtReg2Value.empty());
     return R = CreateRegs(V->getType());
   }
 
diff --git a/include/llvm/CodeGen/GCMetadata.h b/include/llvm/CodeGen/GCMetadata.h
index ad2599fc120e54e064e5bfe21063a934459c98ca..7fb27202c1226c1d88362e4f7b4136c21616bd4c 100644
--- a/include/llvm/CodeGen/GCMetadata.h
+++ b/include/llvm/CodeGen/GCMetadata.h
@@ -55,12 +55,11 @@ class MCSymbol;
 /// GCPoint - Metadata for a collector-safe point in machine code.
 ///
 struct GCPoint {
-  GC::PointKind Kind; ///< The kind of the safe point.
   MCSymbol *Label;    ///< A label.
   DebugLoc Loc;
 
-  GCPoint(GC::PointKind K, MCSymbol *L, DebugLoc DL)
-      : Kind(K), Label(L), Loc(std::move(DL)) {}
+  GCPoint(MCSymbol *L, DebugLoc DL)
+      : Label(L), Loc(std::move(DL)) {}
 };
 
 /// GCRoot - Metadata for a pointer to an object managed by the garbage
@@ -124,8 +123,8 @@ public:
   /// addSafePoint - Notes the existence of a safe point. Num is the ID of the
   /// label just prior to the safe point (if the code generator is using
   /// MachineModuleInfo).
-  void addSafePoint(GC::PointKind Kind, MCSymbol *Label, const DebugLoc &DL) {
-    SafePoints.emplace_back(Kind, Label, DL);
+  void addSafePoint(MCSymbol *Label, const DebugLoc &DL) {
+    SafePoints.emplace_back(Label, DL);
   }
 
   /// getFrameSize/setFrameSize - Records the function's frame size.
diff --git a/include/llvm/CodeGen/GCMetadataPrinter.h b/include/llvm/CodeGen/GCMetadataPrinter.h
index 1cc69a7b71af5fd01c535d37922a57e2bcb0bb80..5f1efb2ce02c2d81dc10531a6912678668927386 100644
--- a/include/llvm/CodeGen/GCMetadataPrinter.h
+++ b/include/llvm/CodeGen/GCMetadataPrinter.h
@@ -29,6 +29,7 @@ class GCMetadataPrinter;
 class GCModuleInfo;
 class GCStrategy;
 class Module;
+class StackMaps;
 
 /// GCMetadataPrinterRegistry - The GC assembly printer registry uses all the
 /// defaults from Registry.
@@ -60,6 +61,11 @@ public:
   /// Called after the assembly for the module is generated by
   /// the AsmPrinter (but before target specific hooks)
   virtual void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) {}
+
+  /// Called when the stack maps are generated. Return true if
+  /// stack maps with a custom format are generated. Otherwise
+  /// returns false and the default format will be used.
+  virtual bool emitStackMaps(StackMaps &SM, AsmPrinter &AP) { return false; }
 };
 
 } // end namespace llvm
diff --git a/include/llvm/CodeGen/GCStrategy.h b/include/llvm/CodeGen/GCStrategy.h
index f835bacfb5481110ea31a9f5d634f8662b7bbc30..5a60cd7cb8236dd54e1d1bf1368370748f2ad3cf 100644
--- a/include/llvm/CodeGen/GCStrategy.h
+++ b/include/llvm/CodeGen/GCStrategy.h
@@ -59,19 +59,6 @@ namespace llvm {
 
 class Type;
 
-namespace GC {
-
-/// PointKind - Used to indicate whether the address of the call instruction
-/// or the address after the call instruction is listed in the stackmap.  For
-/// most runtimes, PostCall safepoints are appropriate.
-///
-enum PointKind {
-  PreCall, ///< Instr is a call instruction.
-  PostCall ///< Instr is the return address of a call.
-};
-
-} // end namespace GC
-
 /// GCStrategy describes a garbage collector algorithm's code generation
 /// requirements, and provides overridable hooks for those needs which cannot
 /// be abstractly described.  GCStrategy objects must be looked up through
@@ -88,11 +75,7 @@ protected:
                                /// if set, none of the other options can be
                                /// anything but their default values.
 
-  unsigned NeededSafePoints = 0;    ///< Bitmask of required safe points.
-  bool CustomReadBarriers = false;  ///< Default is to insert loads.
-  bool CustomWriteBarriers = false; ///< Default is to insert stores.
-  bool CustomRoots = false;      ///< Default is to pass through to backend.
-  bool InitRoots= true;          ///< If set, roots are nulled during lowering.
+  bool NeededSafePoints = false;    ///< if set, calls are inferred to be safepoints
   bool UsesMetadata = false;     ///< If set, backend must emit metadata tables.
 
 public:
@@ -103,16 +86,6 @@ public:
   /// name string specified on functions which use this strategy.
   const std::string &getName() const { return Name; }
 
-  /// By default, write barriers are replaced with simple store
-  /// instructions. If true, you must provide a custom pass to lower
-  /// calls to \@llvm.gcwrite.
-  bool customWriteBarrier() const { return CustomWriteBarriers; }
-
-  /// By default, read barriers are replaced with simple load
-  /// instructions. If true, you must provide a custom pass to lower
-  /// calls to \@llvm.gcread.
-  bool customReadBarrier() const { return CustomReadBarriers; }
-
   /// Returns true if this strategy is expecting the use of gc.statepoints,
   /// and false otherwise.
   bool useStatepoints() const { return UseStatepoints; }
@@ -135,25 +108,8 @@ public:
    */
   ///@{
 
-  /// True if safe points of any kind are required. By default, none are
-  /// recorded.
-  bool needsSafePoints() const { return NeededSafePoints != 0; }
-
-  /// True if the given kind of safe point is required. By default, none are
-  /// recorded.
-  bool needsSafePoint(GC::PointKind Kind) const {
-    return (NeededSafePoints & 1 << Kind) != 0;
-  }
-
-  /// By default, roots are left for the code generator so it can generate a
-  /// stack map. If true, you must provide a custom pass to lower
-  /// calls to \@llvm.gcroot.
-  bool customRoots() const { return CustomRoots; }
-
-  /// If set, gcroot intrinsics should initialize their allocas to null
-  /// before the first use. This is necessary for most GCs and is enabled by
-  /// default.
-  bool initializeRoots() const { return InitRoots; }
+  /// True if safe points need to be inferred on call sites
+  bool needsSafePoints() const { return NeededSafePoints; }
 
   /// If set, appropriate metadata tables must be emitted by the back-end
   /// (assembler, JIT, or otherwise). For statepoint, this method is
diff --git a/include/llvm/CodeGen/GlobalISel/Combiner.h b/include/llvm/CodeGen/GlobalISel/Combiner.h
index fa49e8c20d28bfac9b0f90caec7f7cc4ec9bbfdb..36a33deb4a642a6338fdee468191f1eca7f0737f 100644
--- a/include/llvm/CodeGen/GlobalISel/Combiner.h
+++ b/include/llvm/CodeGen/GlobalISel/Combiner.h
@@ -24,16 +24,6 @@ class CombinerInfo;
 class TargetPassConfig;
 class MachineFunction;
 
-class CombinerChangeObserver {
-public:
-  virtual ~CombinerChangeObserver() {}
-
-  /// An instruction was erased.
-  virtual void erasedInstr(MachineInstr &MI) = 0;
-  /// An instruction was created and inseerted into the function.
-  virtual void createdInstr(MachineInstr &MI) = 0;
-};
-
 class Combiner {
 public:
   Combiner(CombinerInfo &CombinerInfo, const TargetPassConfig *TPC);
diff --git a/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index ee67f90a39e508c4c7495aa24d599307ed7f0f98..6e9ac01c1ee291a1dd6cff05c0b12f903d425f69 100644
--- a/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -1,4 +1,4 @@
-//== llvm/CodeGen/GlobalISel/CombinerHelper.h -------------- -*- C++ -*-==//
+//===-- llvm/CodeGen/GlobalISel/CombinerHelper.h --------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -20,21 +20,27 @@
 
 namespace llvm {
 
-class CombinerChangeObserver;
+class GISelChangeObserver;
 class MachineIRBuilder;
 class MachineRegisterInfo;
 class MachineInstr;
+class MachineOperand;
 
 class CombinerHelper {
   MachineIRBuilder &Builder;
   MachineRegisterInfo &MRI;
-  CombinerChangeObserver &Observer;
-
-  void eraseInstr(MachineInstr &MI);
-  void scheduleForVisit(MachineInstr &MI);
+  GISelChangeObserver &Observer;
 
 public:
-  CombinerHelper(CombinerChangeObserver &Observer, MachineIRBuilder &B);
+  CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B);
+
+  /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes
+  void replaceRegWith(MachineRegisterInfo &MRI, unsigned FromReg, unsigned ToReg) const;
+
+  /// Replace a single register operand with a new register and inform the
+  /// observer of the changes.
+  void replaceRegOpWith(MachineRegisterInfo &MRI, MachineOperand &FromRegOp,
+                        unsigned ToReg) const;
 
   /// If \p MI is COPY, try to combine it.
   /// Returns true if MI changed.
diff --git a/include/llvm/CodeGen/GlobalISel/CombinerInfo.h b/include/llvm/CodeGen/GlobalISel/CombinerInfo.h
index 98c5b981462697343692c761d8f4e486bd5ff40c..d21aa3f725d92e77ad472fd91c9a1370749d1d2c 100644
--- a/include/llvm/CodeGen/GlobalISel/CombinerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/CombinerInfo.h
@@ -17,7 +17,7 @@
 #include <cassert>
 namespace llvm {
 
-class CombinerChangeObserver;
+class GISelChangeObserver;
 class LegalizerInfo;
 class MachineInstr;
 class MachineIRBuilder;
@@ -43,7 +43,18 @@ public:
   /// illegal ops that are created.
   bool LegalizeIllegalOps; // TODO: Make use of this.
   const LegalizerInfo *LInfo;
-  virtual bool combine(CombinerChangeObserver &Observer, MachineInstr &MI,
+
+  /// Attempt to combine instructions using MI as the root.
+  ///
+  /// Use Observer to report the creation, modification, and erasure of
+  /// instructions. GISelChangeObserver will automatically report certain
+  /// kinds of operations. These operations are:
+  /// * Instructions that are newly inserted into the MachineFunction
+  /// * Instructions that are erased from the MachineFunction.
+  ///
+  /// However, it is important to report instruction modification and this is
+  /// not automatic.
+  virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
                        MachineIRBuilder &B) const = 0;
 };
 } // namespace llvm
diff --git a/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h b/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h
index 8d61f9a68279829f6d07b651639e71abe4186398..118c65ed39632edb9170c73d67bf7988348d9086 100644
--- a/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h
+++ b/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h
@@ -68,38 +68,18 @@ static Optional<APInt> ConstantFoldBinOp(unsigned Opcode, const unsigned Op1,
 
 /// An MIRBuilder which does trivial constant folding of binary ops.
 /// Calls to buildInstr will also try to constant fold binary ops.
-class ConstantFoldingMIRBuilder
-    : public FoldableInstructionsBuilder<ConstantFoldingMIRBuilder> {
+class ConstantFoldingMIRBuilder : public MachineIRBuilder {
 public:
   // Pull in base class constructors.
-  using FoldableInstructionsBuilder<
-      ConstantFoldingMIRBuilder>::FoldableInstructionsBuilder;
-  // Unhide buildInstr
-  using FoldableInstructionsBuilder<ConstantFoldingMIRBuilder>::buildInstr;
+  using MachineIRBuilder::MachineIRBuilder;
 
-  // Implement buildBinaryOp required by FoldableInstructionsBuilder which
-  // tries to constant fold.
-  MachineInstrBuilder buildBinaryOp(unsigned Opcode, unsigned Dst,
-                                    unsigned Src0, unsigned Src1) {
-    validateBinaryOp(Dst, Src0, Src1);
-    auto MaybeCst = ConstantFoldBinOp(Opcode, Src0, Src1, getMF().getRegInfo());
-    if (MaybeCst)
-      return buildConstant(Dst, MaybeCst->getSExtValue());
-    return buildInstr(Opcode).addDef(Dst).addUse(Src0).addUse(Src1);
-  }
-
-  template <typename DstTy, typename UseArg1Ty, typename UseArg2Ty>
-  MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty, UseArg1Ty &&Arg1,
-                                 UseArg2Ty &&Arg2) {
-    unsigned Dst = getDestFromArg(Ty);
-    return buildInstr(Opc, Dst, getRegFromArg(std::forward<UseArg1Ty>(Arg1)),
-                      getRegFromArg(std::forward<UseArg2Ty>(Arg2)));
-  }
+  virtual ~ConstantFoldingMIRBuilder() = default;
 
   // Try to provide an overload for buildInstr for binary ops in order to
   // constant fold.
-  MachineInstrBuilder buildInstr(unsigned Opc, unsigned Dst, unsigned Src0,
-                                 unsigned Src1) {
+  MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef<DstOp> DstOps,
+                                 ArrayRef<SrcOp> SrcOps,
+                                 Optional<unsigned> Flags = None) override {
     switch (Opc) {
     default:
       break;
@@ -116,19 +96,18 @@ public:
     case TargetOpcode::G_SDIV:
     case TargetOpcode::G_UREM:
     case TargetOpcode::G_SREM: {
-      return buildBinaryOp(Opc, Dst, Src0, Src1);
+      assert(DstOps.size() == 1 && "Invalid dst ops");
+      assert(SrcOps.size() == 2 && "Invalid src ops");
+      const DstOp &Dst = DstOps[0];
+      const SrcOp &Src0 = SrcOps[0];
+      const SrcOp &Src1 = SrcOps[1];
+      if (auto MaybeCst =
+              ConstantFoldBinOp(Opc, Src0.getReg(), Src1.getReg(), *getMRI()))
+        return buildConstant(Dst, MaybeCst->getSExtValue());
+      break;
     }
     }
-    return buildInstr(Opc).addDef(Dst).addUse(Src0).addUse(Src1);
-  }
-
-  // Fallback implementation of buildInstr.
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty,
-                                 UseArgsTy &&... Args) {
-    auto MIB = buildInstr(Opc).addDef(getDestFromArg(Ty));
-    addUsesFromArgs(MIB, std::forward<UseArgsTy>(Args)...);
-    return MIB;
+    return MachineIRBuilder::buildInstr(Opc, DstOps, SrcOps);
   }
 };
 } // namespace llvm
diff --git a/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h b/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h
new file mode 100644
index 0000000000000000000000000000000000000000..d21c73309a5202f9ad3b4be9afa95e039bc4940c
--- /dev/null
+++ b/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h
@@ -0,0 +1,55 @@
+//===----- llvm/CodeGen/GlobalISel/GISelChangeObserver.h ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// This contains common code to allow clients to notify changes to machine
+/// instr.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CODEGEN_GLOBALISEL_GISELCHANGEOBSERVER_H
+#define LLVM_CODEGEN_GLOBALISEL_GISELCHANGEOBSERVER_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+
+namespace llvm {
+class MachineInstr;
+class MachineRegisterInfo;
+
+/// Abstract class that contains various methods for clients to notify about
+/// changes. This should be the preferred way for APIs to notify changes.
+/// Typically calling erasingInstr/createdInstr multiple times should not affect
+/// the result. The observer would likely need to check if it was already
+/// notified earlier (consider using GISelWorkList).
+class GISelChangeObserver {
+  SmallPtrSet<MachineInstr *, 4> ChangingAllUsesOfReg;
+
+public:
+  virtual ~GISelChangeObserver() {}
+
+  /// An instruction is about to be erased.
+  virtual void erasingInstr(const MachineInstr &MI) = 0;
+  /// An instruction was created and inserted into the function.
+  virtual void createdInstr(const MachineInstr &MI) = 0;
+  /// This instruction is about to be mutated in some way.
+  virtual void changingInstr(const MachineInstr &MI) = 0;
+  /// This instruction was mutated in some way.
+  virtual void changedInstr(const MachineInstr &MI) = 0;
+
+  /// All the instructions using the given register are being changed.
+  /// For convenience, finishedChangingAllUsesOfReg() will report the completion
+  /// of the changes. The use list may change between this call and
+  /// finishedChangingAllUsesOfReg().
+  void changingAllUsesOfReg(const MachineRegisterInfo &MRI, unsigned Reg);
+  /// All instructions reported as changing by changingAllUsesOfReg() have
+  /// finished being changed.
+  void finishedChangingAllUsesOfReg();
+
+};
+
+} // namespace llvm
+#endif
diff --git a/include/llvm/CodeGen/GlobalISel/GISelWorkList.h b/include/llvm/CodeGen/GlobalISel/GISelWorkList.h
index 167905dc9aa1a49efc9c23fe25e11902e3783111..b32c5af537ccd6578a7e3083ce90f5900ad674c6 100644
--- a/include/llvm/CodeGen/GlobalISel/GISelWorkList.h
+++ b/include/llvm/CodeGen/GlobalISel/GISelWorkList.h
@@ -12,38 +12,68 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/Support/Debug.h"
 
 namespace llvm {
 
-class MachineInstr;
+class MachineFunction;
 
-// Worklist which mostly works similar to InstCombineWorkList, but on MachineInstrs.
-// The main difference with something like a SetVector is that erasing an element doesn't
-// move all elements over one place - instead just nulls out the element of the vector.
-// FIXME: Does it make sense to factor out common code with the instcombinerWorkList?
+// Worklist which mostly works similar to InstCombineWorkList, but on
+// MachineInstrs. The main difference with something like a SetVector is that
+// erasing an element doesn't move all elements over one place - instead just
+// nulls out the element of the vector.
+//
+// This worklist operates on instructions within a particular function. This is
+// important for acquiring the rights to modify/replace instructions a
+// GISelChangeObserver reports as the observer doesn't have the right to make
+// changes to the instructions it sees so we use our access to the
+// MachineFunction to establish that it's ok to add a given instruction to the
+// worklist.
+//
+// FIXME: Does it make sense to factor out common code with the
+// instcombinerWorkList?
 template<unsigned N>
 class GISelWorkList {
-  SmallVector<MachineInstr*, N> Worklist;
-  DenseMap<MachineInstr*, unsigned> WorklistMap;
+  MachineFunction *MF;
+  SmallVector<MachineInstr *, N> Worklist;
+  DenseMap<MachineInstr *, unsigned> WorklistMap;
 
 public:
-  GISelWorkList() = default;
+  GISelWorkList(MachineFunction *MF) : MF(MF) {}
 
   bool empty() const { return WorklistMap.empty(); }
 
   unsigned size() const { return WorklistMap.size(); }
 
-  /// Add - Add the specified instruction to the worklist if it isn't already
-  /// in it.
+  /// Add the specified instruction to the worklist if it isn't already in it.
   void insert(MachineInstr *I) {
-    if (WorklistMap.try_emplace(I, Worklist.size()).second) {
-      Worklist.push_back(I);
+    // It would be safe to add this instruction to the worklist regardless but
+    // for consistency with the const version, check that the instruction we're
+    // adding would have been accepted if we were given a const pointer instead.
+    insert(const_cast<const MachineInstr *>(I));
+  }
+
+  void insert(const MachineInstr *I) {
+    // Confirm we'd be able to find the non-const pointer we want to schedule if
+    // we wanted to. We have the right to schedule work that may modify any
+    // instruction in MF.
+    assert(I->getParent() && "Expected parent BB");
+    assert(I->getParent()->getParent() && "Expected parent function");
+    assert((!MF || I->getParent()->getParent() == MF) &&
+           "Expected parent function to be current function or not given");
+
+    // But don't actually do the search since we can derive it from the const
+    // pointer.
+    MachineInstr *NonConstI = const_cast<MachineInstr *>(I);
+    if (WorklistMap.try_emplace(NonConstI, Worklist.size()).second) {
+      Worklist.push_back(NonConstI);
     }
   }
 
-  /// Remove - remove I from the worklist if it exists.
-  void remove(MachineInstr *I) {
+  /// Remove I from the worklist if it exists.
+  void remove(const MachineInstr *I) {
     auto It = WorklistMap.find(I);
     if (It == WorklistMap.end()) return; // Not in worklist.
 
diff --git a/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 2498ee9332104a0afc622a3ac56a999f442a2cd8..04629f6b320464f99bcd6be503e56dbc50e6ef59 100644
--- a/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -300,6 +300,8 @@ private:
 
   bool translateFSub(const User &U, MachineIRBuilder &MIRBuilder);
 
+  bool translateFNeg(const User &U, MachineIRBuilder &MIRBuilder);
+
   bool translateAdd(const User &U, MachineIRBuilder &MIRBuilder) {
     return translateBinaryOp(TargetOpcode::G_ADD, U, MIRBuilder);
   }
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index e1132ac59c829af869ec17919fcdc8940838e7ce..20bec76501796c68b9848bb2bfaddea727ad0d2b 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h --===========//
+//===-- llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h -----*- C++ -*-//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -51,6 +51,18 @@ public:
       markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
+
+    // aext([asz]ext x) -> [asz]ext x
+    unsigned ExtSrc;
+    MachineInstr *ExtMI;
+    if (mi_match(SrcReg, MRI,
+                 m_all_of(m_MInstr(ExtMI), m_any_of(m_GAnyExt(m_Reg(ExtSrc)),
+                                                    m_GSExt(m_Reg(ExtSrc)),
+                                                    m_GZExt(m_Reg(ExtSrc)))))) {
+      Builder.buildInstr(ExtMI->getOpcode(), {DstReg}, {ExtSrc});
+      markInstAndDefDead(MI, *ExtMI, DeadInsts);
+      return true;
+    }
     return tryFoldImplicitDef(MI, DeadInsts);
   }
 
@@ -106,9 +118,9 @@ public:
       unsigned ShAmt = DstTy.getSizeInBits() - SrcTy.getSizeInBits();
       auto MIBShAmt = Builder.buildConstant(DstTy, ShAmt);
       auto MIBShl = Builder.buildInstr(
-          TargetOpcode::G_SHL, DstTy,
-          Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt);
-      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, MIBShl, MIBShAmt);
+          TargetOpcode::G_SHL, {DstTy},
+          {Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt});
+      Builder.buildInstr(TargetOpcode::G_ASHR, {DstReg}, {MIBShl, MIBShAmt});
       markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
@@ -134,7 +146,7 @@ public:
         if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}}))
           return false;
         LLVM_DEBUG(dbgs() << ".. Combine G_ANYEXT(G_IMPLICIT_DEF): " << MI;);
-        Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg);
+        Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, {DstReg}, {});
       } else {
         // G_[SZ]EXT (G_IMPLICIT_DEF) -> G_CONSTANT 0 because the top
         // bits will be 0 for G_ZEXT and 0/1 for the G_SEXT.
@@ -157,8 +169,20 @@ public:
       return false;
 
     unsigned NumDefs = MI.getNumOperands() - 1;
-    MachineInstr *MergeI = getOpcodeDef(TargetOpcode::G_MERGE_VALUES,
-                                        MI.getOperand(NumDefs).getReg(), MRI);
+
+    unsigned MergingOpcode;
+    LLT OpTy = MRI.getType(MI.getOperand(NumDefs).getReg());
+    LLT DestTy = MRI.getType(MI.getOperand(0).getReg());
+    if (OpTy.isVector() && DestTy.isVector())
+      MergingOpcode = TargetOpcode::G_CONCAT_VECTORS;
+    else if (OpTy.isVector() && !DestTy.isVector())
+      MergingOpcode = TargetOpcode::G_BUILD_VECTOR;
+    else
+      MergingOpcode = TargetOpcode::G_MERGE_VALUES;
+
+    MachineInstr *MergeI =
+        getOpcodeDef(MergingOpcode, MI.getOperand(NumDefs).getReg(), MRI);
+
     if (!MergeI)
       return false;
 
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 4d804d0d0d25ab6172318ce73040dc8641083a58..4d3e4a3763865c86f1d64b485a8d3e7d72474726 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -32,6 +32,7 @@ namespace llvm {
 class LegalizerInfo;
 class Legalizer;
 class MachineRegisterInfo;
+class GISelChangeObserver;
 
 class LegalizerHelper {
 public:
@@ -48,8 +49,9 @@ public:
     UnableToLegalize,
   };
 
-  LegalizerHelper(MachineFunction &MF);
-  LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI);
+  LegalizerHelper(MachineFunction &MF, GISelChangeObserver &Observer);
+  LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
+                  GISelChangeObserver &Observer);
 
   /// Replace \p MI by a sequence of legal instructions that can implement the
   /// same operation. Note that this means \p MI may be deleted, so any iterator
@@ -117,6 +119,8 @@ private:
 
   MachineRegisterInfo &MRI;
   const LegalizerInfo &LI;
+  /// To keep track of changes made by the LegalizerHelper.
+  GISelChangeObserver &Observer;
 };
 
 /// Helper function that creates the given libcall.
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 755805de1b08e35bc4121cabb713053cfee44ab7..13776dd3e87d18dd76a2ab8eb5ee07b37d6d9834 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -39,6 +39,7 @@ class MachineInstr;
 class MachineIRBuilder;
 class MachineRegisterInfo;
 class MCInstrInfo;
+class GISelChangeObserver;
 
 namespace LegalizeActions {
 enum LegalizeAction : std::uint8_t {
@@ -651,6 +652,20 @@ public:
     return minScalar(TypeIdx, MinTy).maxScalar(TypeIdx, MaxTy);
   }
 
+  /// Widen the scalar to match the size of another.
+  LegalizeRuleSet &minScalarSameAs(unsigned TypeIdx, unsigned LargeTypeIdx) {
+    typeIdx(TypeIdx);
+    return widenScalarIf(
+        [=](const LegalityQuery &Query) {
+          return Query.Types[LargeTypeIdx].getScalarSizeInBits() >
+                 Query.Types[TypeIdx].getSizeInBits();
+        },
+        [=](const LegalityQuery &Query) {
+          return std::make_pair(TypeIdx,
+                                Query.Types[LargeTypeIdx].getElementType());
+        });
+  }
+
   /// Add more elements to the vector to reach the next power of two.
   /// No effect if the type is not a vector or the element count is a power of
   /// two.
@@ -949,9 +964,9 @@ public:
 
   bool isLegal(const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
 
-  virtual bool legalizeCustom(MachineInstr &MI,
-                              MachineRegisterInfo &MRI,
-                              MachineIRBuilder &MIRBuilder) const;
+  virtual bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+                              MachineIRBuilder &MIRBuilder,
+                              GISelChangeObserver &Observer) const;
 
 private:
   /// Determine what action should be taken to legalize the given generic
diff --git a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index a837cf05691ad4900a79e20e6bd7fa527602a390..1745b9702d8f9da587e082f177a76ff140e6ecd9 100644
--- a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -30,6 +30,7 @@ namespace llvm {
 class MachineFunction;
 class MachineInstr;
 class TargetInstrInfo;
+class GISelChangeObserver;
 
 /// Class which stores all the state required in a MachineIRBuilder.
 /// Since MachineIRBuilders will only store state in this object, it allows
@@ -50,57 +51,158 @@ struct MachineIRBuilderState {
   MachineBasicBlock::iterator II;
   /// @}
 
-  std::function<void(MachineInstr *)> InsertedInstr;
+  GISelChangeObserver *Observer;
 };
 
-/// Helper class to build MachineInstr.
-/// It keeps internally the insertion point and debug location for all
-/// the new instructions we want to create.
-/// This information can be modify via the related setters.
-class MachineIRBuilderBase {
-
-  MachineIRBuilderState State;
-  void validateTruncExt(unsigned Dst, unsigned Src, bool IsExtend);
+class DstOp {
+  union {
+    LLT LLTTy;
+    unsigned Reg;
+    const TargetRegisterClass *RC;
+  };
 
-protected:
-  unsigned getDestFromArg(unsigned Reg) { return Reg; }
-  unsigned getDestFromArg(LLT Ty) {
-    return getMF().getRegInfo().createGenericVirtualRegister(Ty);
+public:
+  enum class DstType { Ty_LLT, Ty_Reg, Ty_RC };
+  DstOp(unsigned R) : Reg(R), Ty(DstType::Ty_Reg) {}
+  DstOp(const LLT &T) : LLTTy(T), Ty(DstType::Ty_LLT) {}
+  DstOp(const TargetRegisterClass *TRC) : RC(TRC), Ty(DstType::Ty_RC) {}
+
+  void addDefToMIB(MachineRegisterInfo &MRI, MachineInstrBuilder &MIB) const {
+    switch (Ty) {
+    case DstType::Ty_Reg:
+      MIB.addDef(Reg);
+      break;
+    case DstType::Ty_LLT:
+      MIB.addDef(MRI.createGenericVirtualRegister(LLTTy));
+      break;
+    case DstType::Ty_RC:
+      MIB.addDef(MRI.createVirtualRegister(RC));
+      break;
+    }
   }
-  unsigned getDestFromArg(const TargetRegisterClass *RC) {
-    return getMF().getRegInfo().createVirtualRegister(RC);
+
+  DstType getType() const { return Ty; }
+
+  LLT getLLTTy(const MachineRegisterInfo &MRI) const {
+    switch (Ty) {
+    case DstType::Ty_RC:
+      return LLT{};
+    case DstType::Ty_LLT:
+      return LLTTy;
+    case DstType::Ty_Reg:
+      return MRI.getType(Reg);
+    }
+    llvm_unreachable("Unrecognised DstOp::DstType enum");
   }
 
-  void addUseFromArg(MachineInstrBuilder &MIB, unsigned Reg) {
-    MIB.addUse(Reg);
+  DstType getDstOpKind() const { return Ty; }
+
+private:
+  DstType Ty;
+};
+
+class SrcOp {
+  union {
+    MachineInstrBuilder SrcMIB;
+    unsigned Reg;
+    CmpInst::Predicate Pred;
+  };
+
+public:
+  enum class SrcType { Ty_Reg, Ty_MIB, Ty_Predicate };
+  SrcOp(unsigned R) : Reg(R), Ty(SrcType::Ty_Reg) {}
+  SrcOp(const MachineInstrBuilder &MIB) : SrcMIB(MIB), Ty(SrcType::Ty_MIB) {}
+  SrcOp(const CmpInst::Predicate P) : Pred(P), Ty(SrcType::Ty_Predicate) {}
+
+  void addSrcToMIB(MachineInstrBuilder &MIB) const {
+    switch (Ty) {
+    case SrcType::Ty_Predicate:
+      MIB.addPredicate(Pred);
+      break;
+    case SrcType::Ty_Reg:
+      MIB.addUse(Reg);
+      break;
+    case SrcType::Ty_MIB:
+      MIB.addUse(SrcMIB->getOperand(0).getReg());
+      break;
+    }
   }
 
-  void addUseFromArg(MachineInstrBuilder &MIB, const MachineInstrBuilder &UseMIB) {
-    MIB.addUse(UseMIB->getOperand(0).getReg());
+  LLT getLLTTy(const MachineRegisterInfo &MRI) const {
+    switch (Ty) {
+    case SrcType::Ty_Predicate:
+      llvm_unreachable("Not a register operand");
+    case SrcType::Ty_Reg:
+      return MRI.getType(Reg);
+    case SrcType::Ty_MIB:
+      return MRI.getType(SrcMIB->getOperand(0).getReg());
+    }
+    llvm_unreachable("Unrecognised SrcOp::SrcType enum");
   }
 
-  void addUsesFromArgs(MachineInstrBuilder &MIB) { }
-  template<typename UseArgTy, typename ... UseArgsTy>
-  void addUsesFromArgs(MachineInstrBuilder &MIB, UseArgTy &&Arg1, UseArgsTy &&... Args) {
-    addUseFromArg(MIB, Arg1);
-    addUsesFromArgs(MIB, std::forward<UseArgsTy>(Args)...);
+  unsigned getReg() const {
+    switch (Ty) {
+    case SrcType::Ty_Predicate:
+      llvm_unreachable("Not a register operand");
+    case SrcType::Ty_Reg:
+      return Reg;
+    case SrcType::Ty_MIB:
+      return SrcMIB->getOperand(0).getReg();
+    }
+    llvm_unreachable("Unrecognised SrcOp::SrcType enum");
   }
-  unsigned getRegFromArg(unsigned Reg) { return Reg; }
-  unsigned getRegFromArg(const MachineInstrBuilder &MIB) {
-    return MIB->getOperand(0).getReg();
+
+  CmpInst::Predicate getPredicate() const {
+    switch (Ty) {
+    case SrcType::Ty_Predicate:
+      return Pred;
+    default:
+      llvm_unreachable("Not a register operand");
+    }
   }
 
-  void validateBinaryOp(unsigned Res, unsigned Op0, unsigned Op1);
+  SrcType getSrcOpKind() const { return Ty; }
+
+private:
+  SrcType Ty;
+};
+
+class FlagsOp {
+  Optional<unsigned> Flags;
+
+public:
+  explicit FlagsOp(unsigned F) : Flags(F) {}
+  FlagsOp() : Flags(None) {}
+  Optional<unsigned> getFlags() const { return Flags; }
+};
+/// Helper class to build MachineInstr.
+/// It keeps internally the insertion point and debug location for all
+/// the new instructions we want to create.
+/// This information can be modify via the related setters.
+class MachineIRBuilder {
+
+  MachineIRBuilderState State;
+
+protected:
+  void validateTruncExt(const LLT &Dst, const LLT &Src, bool IsExtend);
+
+  void validateBinaryOp(const LLT &Res, const LLT &Op0, const LLT &Op1);
+
+  void validateSelectOp(const LLT &ResTy, const LLT &TstTy, const LLT &Op0Ty,
+                        const LLT &Op1Ty);
+  void recordInsertion(MachineInstr *MI) const;
 
 public:
   /// Some constructors for easy use.
-  MachineIRBuilderBase() = default;
-  MachineIRBuilderBase(MachineFunction &MF) { setMF(MF); }
-  MachineIRBuilderBase(MachineInstr &MI) : MachineIRBuilderBase(*MI.getMF()) {
+  MachineIRBuilder() = default;
+  MachineIRBuilder(MachineFunction &MF) { setMF(MF); }
+  MachineIRBuilder(MachineInstr &MI) : MachineIRBuilder(*MI.getMF()) {
     setInstr(MI);
   }
 
-  MachineIRBuilderBase(const MachineIRBuilderState &BState) : State(BState) {}
+  virtual ~MachineIRBuilder() = default;
+
+  MachineIRBuilder(const MachineIRBuilderState &BState) : State(BState) {}
 
   const TargetInstrInfo &getTII() {
     assert(State.TII && "TargetInstrInfo is not set");
@@ -151,12 +253,8 @@ public:
   void setInstr(MachineInstr &MI);
   /// @}
 
-  /// \name Control where instructions we create are recorded (typically for
-  /// visiting again later during legalization).
-  /// @{
-  void recordInsertion(MachineInstr *InsertedInstr) const;
-  void recordInsertions(std::function<void(MachineInstr *)> InsertedInstr);
-  void stopRecordingInsertions();
+  void setChangeObserver(GISelChangeObserver &Observer);
+  void stopObservingChanges();
   /// @}
 
   /// Set the debug location to \p DL for all the next build instructions.
@@ -300,9 +398,9 @@ public:
   ///      registers with the same scalar type (typically s1)
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildUAdde(unsigned Res, unsigned CarryOut, unsigned Op0,
-                                 unsigned Op1, unsigned CarryIn);
-
+  MachineInstrBuilder buildUAdde(const DstOp &Res, const DstOp &CarryOut,
+                                 const SrcOp &Op0, const SrcOp &Op1,
+                                 const SrcOp &CarryIn);
 
   /// Build and insert \p Res = G_ANYEXT \p Op0
   ///
@@ -318,11 +416,7 @@ public:
   ///
   /// \return The newly created instruction.
 
-  MachineInstrBuilder buildAnyExt(unsigned Res, unsigned Op);
-  template <typename DstType, typename ArgType>
-  MachineInstrBuilder buildAnyExt(DstType &&Res, ArgType &&Arg) {
-    return buildAnyExt(getDestFromArg(Res), getRegFromArg(Arg));
-  }
+  MachineInstrBuilder buildAnyExt(const DstOp &Res, const SrcOp &Op);
 
   /// Build and insert \p Res = G_SEXT \p Op
   ///
@@ -336,11 +430,7 @@ public:
   /// \pre \p Op must be smaller than \p Res
   ///
   /// \return The newly created instruction.
-  template <typename DstType, typename ArgType>
-  MachineInstrBuilder buildSExt(DstType &&Res, ArgType &&Arg) {
-    return buildSExt(getDestFromArg(Res), getRegFromArg(Arg));
-  }
-  MachineInstrBuilder buildSExt(unsigned Res, unsigned Op);
+  MachineInstrBuilder buildSExt(const DstOp &Res, const SrcOp &Op);
 
   /// Build and insert \p Res = G_ZEXT \p Op
   ///
@@ -354,11 +444,7 @@ public:
   /// \pre \p Op must be smaller than \p Res
   ///
   /// \return The newly created instruction.
-  template <typename DstType, typename ArgType>
-  MachineInstrBuilder buildZExt(DstType &&Res, ArgType &&Arg) {
-    return buildZExt(getDestFromArg(Res), getRegFromArg(Arg));
-  }
-  MachineInstrBuilder buildZExt(unsigned Res, unsigned Op);
+  MachineInstrBuilder buildZExt(const DstOp &Res, const SrcOp &Op);
 
   /// Build and insert \p Res = G_SEXT \p Op, \p Res = G_TRUNC \p Op, or
   /// \p Res = COPY \p Op depending on the differing sizes of \p Res and \p Op.
@@ -368,11 +454,7 @@ public:
   /// \pre \p Op must be a generic virtual register with scalar or vector type.
   ///
   /// \return The newly created instruction.
-  template <typename DstTy, typename UseArgTy>
-  MachineInstrBuilder buildSExtOrTrunc(DstTy &&Dst, UseArgTy &&Use) {
-    return buildSExtOrTrunc(getDestFromArg(Dst), getRegFromArg(Use));
-  }
-  MachineInstrBuilder buildSExtOrTrunc(unsigned Res, unsigned Op);
+  MachineInstrBuilder buildSExtOrTrunc(const DstOp &Res, const SrcOp &Op);
 
   /// Build and insert \p Res = G_ZEXT \p Op, \p Res = G_TRUNC \p Op, or
   /// \p Res = COPY \p Op depending on the differing sizes of \p Res and \p Op.
@@ -382,11 +464,7 @@ public:
   /// \pre \p Op must be a generic virtual register with scalar or vector type.
   ///
   /// \return The newly created instruction.
-  template <typename DstTy, typename UseArgTy>
-  MachineInstrBuilder buildZExtOrTrunc(DstTy &&Dst, UseArgTy &&Use) {
-    return buildZExtOrTrunc(getDestFromArg(Dst), getRegFromArg(Use));
-  }
-  MachineInstrBuilder buildZExtOrTrunc(unsigned Res, unsigned Op);
+  MachineInstrBuilder buildZExtOrTrunc(const DstOp &Res, const SrcOp &Op);
 
   // Build and insert \p Res = G_ANYEXT \p Op, \p Res = G_TRUNC \p Op, or
   /// \p Res = COPY \p Op depending on the differing sizes of \p Res and \p Op.
@@ -396,11 +474,7 @@ public:
   /// \pre \p Op must be a generic virtual register with scalar or vector type.
   ///
   /// \return The newly created instruction.
-  template <typename DstTy, typename UseArgTy>
-  MachineInstrBuilder buildAnyExtOrTrunc(DstTy &&Dst, UseArgTy &&Use) {
-    return buildAnyExtOrTrunc(getDestFromArg(Dst), getRegFromArg(Use));
-  }
-  MachineInstrBuilder buildAnyExtOrTrunc(unsigned Res, unsigned Op);
+  MachineInstrBuilder buildAnyExtOrTrunc(const DstOp &Res, const SrcOp &Op);
 
   /// Build and insert \p Res = \p ExtOpc, \p Res = G_TRUNC \p
   /// Op, or \p Res = COPY \p Op depending on the differing sizes of \p Res and
@@ -411,15 +485,11 @@ public:
   /// \pre \p Op must be a generic virtual register with scalar or vector type.
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildExtOrTrunc(unsigned ExtOpc, unsigned Res,
-                                      unsigned Op);
+  MachineInstrBuilder buildExtOrTrunc(unsigned ExtOpc, const DstOp &Res,
+                                      const SrcOp &Op);
 
   /// Build and insert an appropriate cast between two registers of equal size.
-  template <typename DstType, typename ArgType>
-  MachineInstrBuilder buildCast(DstType &&Res, ArgType &&Arg) {
-    return buildCast(getDestFromArg(Res), getRegFromArg(Arg));
-  }
-  MachineInstrBuilder buildCast(unsigned Dst, unsigned Src);
+  MachineInstrBuilder buildCast(const DstOp &Dst, const SrcOp &Src);
 
   /// Build and insert G_BR \p Dest
   ///
@@ -464,7 +534,7 @@ public:
   ///      type.
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildConstant(unsigned Res, const ConstantInt &Val);
+  MachineInstrBuilder buildConstant(const DstOp &Res, const ConstantInt &Val);
 
   /// Build and insert \p Res = G_CONSTANT \p Val
   ///
@@ -474,12 +544,8 @@ public:
   /// \pre \p Res must be a generic virtual register with scalar type.
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildConstant(unsigned Res, int64_t Val);
+  MachineInstrBuilder buildConstant(const DstOp &Res, int64_t Val);
 
-  template <typename DstType>
-  MachineInstrBuilder buildConstant(DstType &&Res, int64_t Val) {
-    return buildConstant(getDestFromArg(Res), Val);
-  }
   /// Build and insert \p Res = G_FCONSTANT \p Val
   ///
   /// G_FCONSTANT is a floating-point constant with the specified size and
@@ -489,17 +555,9 @@ public:
   /// \pre \p Res must be a generic virtual register with scalar type.
   ///
   /// \return The newly created instruction.
-  template <typename DstType>
-  MachineInstrBuilder buildFConstant(DstType &&Res, const ConstantFP &Val) {
-    return buildFConstant(getDestFromArg(Res), Val);
-  }
-  MachineInstrBuilder buildFConstant(unsigned Res, const ConstantFP &Val);
+  MachineInstrBuilder buildFConstant(const DstOp &Res, const ConstantFP &Val);
 
-  template <typename DstType>
-  MachineInstrBuilder buildFConstant(DstType &&Res, double Val) {
-    return buildFConstant(getDestFromArg(Res), Val);
-  }
-  MachineInstrBuilder buildFConstant(unsigned Res, double Val);
+  MachineInstrBuilder buildFConstant(const DstOp &Res, double Val);
 
   /// Build and insert \p Res = COPY Op
   ///
@@ -508,11 +566,7 @@ public:
   /// \pre setBasicBlock or setMI must have been called.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildCopy(unsigned Res, unsigned Op);
-  template <typename DstType, typename SrcType>
-  MachineInstrBuilder buildCopy(DstType &&Res, SrcType &&Src) {
-    return buildCopy(getDestFromArg(Res), getRegFromArg(Src));
-  }
+  MachineInstrBuilder buildCopy(const DstOp &Res, const SrcOp &Op);
 
   /// Build and insert `Res = G_LOAD Addr, MMO`.
   ///
@@ -559,10 +613,7 @@ public:
   MachineInstrBuilder buildExtract(unsigned Res, unsigned Src, uint64_t Index);
 
   /// Build and insert \p Res = IMPLICIT_DEF.
-  template <typename DstType> MachineInstrBuilder buildUndef(DstType &&Res) {
-    return buildUndef(getDestFromArg(Res));
-  }
-  MachineInstrBuilder buildUndef(unsigned Res);
+  MachineInstrBuilder buildUndef(const DstOp &Res);
 
   /// Build and insert instructions to put \p Ops together at the specified p
   /// Indices to form a larger register.
@@ -591,7 +642,7 @@ public:
   /// \pre The type of all \p Ops registers must be identical.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildMerge(unsigned Res, ArrayRef<unsigned> Ops);
+  MachineInstrBuilder buildMerge(const DstOp &Res, ArrayRef<unsigned> Ops);
 
   /// Build and insert \p Res0, ... = G_UNMERGE_VALUES \p Op
   ///
@@ -603,7 +654,50 @@ public:
   /// \pre The type of all \p Res registers must be identical.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildUnmerge(ArrayRef<unsigned> Res, unsigned Op);
+  MachineInstrBuilder buildUnmerge(ArrayRef<LLT> Res, const SrcOp &Op);
+  MachineInstrBuilder buildUnmerge(ArrayRef<unsigned> Res, const SrcOp &Op);
+
+  /// Build and insert \p Res = G_BUILD_VECTOR \p Op0, ...
+  ///
+  /// G_BUILD_VECTOR creates a vector value from multiple scalar registers.
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre The entire register \p Res (and no more) must be covered by the
+  ///      input scalar registers.
+  /// \pre The type of all \p Ops registers must be identical.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildBuildVector(const DstOp &Res,
+                                       ArrayRef<unsigned> Ops);
+
+  /// Build and insert \p Res = G_BUILD_VECTOR_TRUNC \p Op0, ...
+  ///
+  /// G_BUILD_VECTOR_TRUNC creates a vector value from multiple scalar registers
+  /// which have types larger than the destination vector element type, and
+  /// truncates the values to fit.
+  ///
+  /// If the operands given are already the same size as the vector elt type,
+  /// then this method will instead create a G_BUILD_VECTOR instruction.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre The type of all \p Ops registers must be identical.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildBuildVectorTrunc(const DstOp &Res,
+                                            ArrayRef<unsigned> Ops);
+
+  /// Build and insert \p Res = G_CONCAT_VECTORS \p Op0, ...
+  ///
+  /// G_CONCAT_VECTORS creates a vector from the concatenation of 2 or more
+  /// vectors.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre The entire register \p Res (and no more) must be covered by the input
+  ///      registers.
+  /// \pre The type of all source operands must be identical.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildConcatVectors(const DstOp &Res,
+                                         ArrayRef<unsigned> Ops);
 
   MachineInstrBuilder buildInsert(unsigned Res, unsigned Src,
                                   unsigned Op, unsigned Index);
@@ -631,11 +725,7 @@ public:
   /// \pre \p Res must be smaller than \p Op
   ///
   /// \return The newly created instruction.
-  template <typename DstType, typename SrcType>
-  MachineInstrBuilder buildFPTrunc(DstType &&Res, SrcType &&Src) {
-    return buildFPTrunc(getDestFromArg(Res), getRegFromArg(Src));
-  }
-  MachineInstrBuilder buildFPTrunc(unsigned Res, unsigned Op);
+  MachineInstrBuilder buildFPTrunc(const DstOp &Res, const SrcOp &Op);
 
   /// Build and insert \p Res = G_TRUNC \p Op
   ///
@@ -648,11 +738,7 @@ public:
   /// \pre \p Res must be smaller than \p Op
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildTrunc(unsigned Res, unsigned Op);
-  template <typename DstType, typename SrcType>
-  MachineInstrBuilder buildTrunc(DstType &&Res, SrcType &&Src) {
-    return buildTrunc(getDestFromArg(Res), getRegFromArg(Src));
-  }
+  MachineInstrBuilder buildTrunc(const DstOp &Res, const SrcOp &Op);
 
   /// Build and insert a \p Res = G_ICMP \p Pred, \p Op0, \p Op1
   ///
@@ -666,13 +752,8 @@ public:
   /// \pre \p Pred must be an integer predicate.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildICmp(CmpInst::Predicate Pred,
-                                unsigned Res, unsigned Op0, unsigned Op1);
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildICmp(CmpInst::Predicate Pred, DstTy &&Dst,
-                                UseArgsTy &&... UseArgs) {
-    return buildICmp(Pred, getDestFromArg(Dst), getRegFromArg(UseArgs)...);
-  }
+  MachineInstrBuilder buildICmp(CmpInst::Predicate Pred, const DstOp &Res,
+                                const SrcOp &Op0, const SrcOp &Op1);
 
   /// Build and insert a \p Res = G_FCMP \p Pred\p Op0, \p Op1
   ///
@@ -686,8 +767,8 @@ public:
   /// \pre \p Pred must be a floating-point predicate.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildFCmp(CmpInst::Predicate Pred,
-                                unsigned Res, unsigned Op0, unsigned Op1);
+  MachineInstrBuilder buildFCmp(CmpInst::Predicate Pred, const DstOp &Res,
+                                const SrcOp &Op0, const SrcOp &Op1);
 
   /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1
   ///
@@ -699,12 +780,8 @@ public:
   ///      elements as the other parameters.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildSelect(unsigned Res, unsigned Tst,
-                                  unsigned Op0, unsigned Op1);
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildSelect(DstTy &&Dst, UseArgsTy &&... UseArgs) {
-    return buildSelect(getDestFromArg(Dst), getRegFromArg(UseArgs)...);
-  }
+  MachineInstrBuilder buildSelect(const DstOp &Res, const SrcOp &Tst,
+                                  const SrcOp &Op0, const SrcOp &Op1);
 
   /// Build and insert \p Res = G_INSERT_VECTOR_ELT \p Val,
   /// \p Elt, \p Idx
@@ -716,8 +793,10 @@ public:
   ///      with scalar type.
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildInsertVectorElement(unsigned Res, unsigned Val,
-                                               unsigned Elt, unsigned Idx);
+  MachineInstrBuilder buildInsertVectorElement(const DstOp &Res,
+                                               const SrcOp &Val,
+                                               const SrcOp &Elt,
+                                               const SrcOp &Idx);
 
   /// Build and insert \p Res = G_EXTRACT_VECTOR_ELT \p Val, \p Idx
   ///
@@ -727,8 +806,9 @@ public:
   /// \pre \p Idx must be a generic virtual register with scalar type.
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildExtractVectorElement(unsigned Res, unsigned Val,
-                                                unsigned Idx);
+  MachineInstrBuilder buildExtractVectorElement(const DstOp &Res,
+                                                const SrcOp &Val,
+                                                const SrcOp &Idx);
 
   /// Build and insert `OldValRes<def>, SuccessRes<def> =
   /// G_ATOMIC_CMPXCHG_WITH_SUCCESS Addr, CmpVal, NewVal, MMO`.
@@ -965,19 +1045,7 @@ public:
   ///
   /// \return The newly created instruction.
   MachineInstrBuilder buildBlockAddress(unsigned Res, const BlockAddress *BA);
-};
-
-/// A CRTP class that contains methods for building instructions that can
-/// be constant folded. MachineIRBuilders that want to inherit from this will
-/// need to implement buildBinaryOp (for constant folding binary ops).
-/// Alternatively, they can implement buildInstr(Opc, Dst, Uses...) to perform
-/// additional folding for Opc.
-template <typename Base>
-class FoldableInstructionsBuilder : public MachineIRBuilderBase {
-  Base &base() { return static_cast<Base &>(*this); }
 
-public:
-  using MachineIRBuilderBase::MachineIRBuilderBase;
   /// Build and insert \p Res = G_ADD \p Op0, \p Op1
   ///
   /// G_ADD sets \p Res to the sum of integer parameters \p Op0 and \p Op1,
@@ -989,13 +1057,10 @@ public:
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
 
-  MachineInstrBuilder buildAdd(unsigned Dst, unsigned Src0, unsigned Src1) {
-    return base().buildBinaryOp(TargetOpcode::G_ADD, Dst, Src0, Src1);
-  }
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildAdd(DstTy &&Ty, UseArgsTy &&... UseArgs) {
-    unsigned Res = base().getDestFromArg(Ty);
-    return base().buildAdd(Res, (base().getRegFromArg(UseArgs))...);
+  MachineInstrBuilder buildAdd(const DstOp &Dst, const SrcOp &Src0,
+                               const SrcOp &Src1,
+                               Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_ADD, {Dst}, {Src0, Src1}, Flags);
   }
 
   /// Build and insert \p Res = G_SUB \p Op0, \p Op1
@@ -1009,13 +1074,10 @@ public:
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
 
-  MachineInstrBuilder buildSub(unsigned Dst, unsigned Src0, unsigned Src1) {
-    return base().buildBinaryOp(TargetOpcode::G_SUB, Dst, Src0, Src1);
-  }
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildSub(DstTy &&Ty, UseArgsTy &&... UseArgs) {
-    unsigned Res = base().getDestFromArg(Ty);
-    return base().buildSub(Res, (base().getRegFromArg(UseArgs))...);
+  MachineInstrBuilder buildSub(const DstOp &Dst, const SrcOp &Src0,
+                               const SrcOp &Src1,
+                               Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_SUB, {Dst}, {Src0, Src1}, Flags);
   }
 
   /// Build and insert \p Res = G_MUL \p Op0, \p Op1
@@ -1028,13 +1090,10 @@ public:
   ///      with the same (scalar or vector) type).
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildMul(unsigned Dst, unsigned Src0, unsigned Src1) {
-    return base().buildBinaryOp(TargetOpcode::G_MUL, Dst, Src0, Src1);
-  }
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildMul(DstTy &&Ty, UseArgsTy &&... UseArgs) {
-    unsigned Res = base().getDestFromArg(Ty);
-    return base().buildMul(Res, (base().getRegFromArg(UseArgs))...);
+  MachineInstrBuilder buildMul(const DstOp &Dst, const SrcOp &Src0,
+                               const SrcOp &Src1,
+                               Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_MUL, {Dst}, {Src0, Src1}, Flags);
   }
 
   /// Build and insert \p Res = G_AND \p Op0, \p Op1
@@ -1048,13 +1107,9 @@ public:
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
 
-  MachineInstrBuilder buildAnd(unsigned Dst, unsigned Src0, unsigned Src1) {
-    return base().buildBinaryOp(TargetOpcode::G_AND, Dst, Src0, Src1);
-  }
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildAnd(DstTy &&Ty, UseArgsTy &&... UseArgs) {
-    unsigned Res = base().getDestFromArg(Ty);
-    return base().buildAnd(Res, (base().getRegFromArg(UseArgs))...);
+  MachineInstrBuilder buildAnd(const DstOp &Dst, const SrcOp &Src0,
+                               const SrcOp &Src1) {
+    return buildInstr(TargetOpcode::G_AND, {Dst}, {Src0, Src1});
   }
 
   /// Build and insert \p Res = G_OR \p Op0, \p Op1
@@ -1067,39 +1122,14 @@ public:
   ///      with the same (scalar or vector) type).
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildOr(unsigned Dst, unsigned Src0, unsigned Src1) {
-    return base().buildBinaryOp(TargetOpcode::G_OR, Dst, Src0, Src1);
+  MachineInstrBuilder buildOr(const DstOp &Dst, const SrcOp &Src0,
+                              const SrcOp &Src1) {
+    return buildInstr(TargetOpcode::G_OR, {Dst}, {Src0, Src1});
   }
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildOr(DstTy &&Ty, UseArgsTy &&... UseArgs) {
-    unsigned Res = base().getDestFromArg(Ty);
-    return base().buildOr(Res, (base().getRegFromArg(UseArgs))...);
-  }
-};
 
-class MachineIRBuilder : public FoldableInstructionsBuilder<MachineIRBuilder> {
-public:
-  using FoldableInstructionsBuilder<
-      MachineIRBuilder>::FoldableInstructionsBuilder;
-  MachineInstrBuilder buildBinaryOp(unsigned Opcode, unsigned Dst,
-                                    unsigned Src0, unsigned Src1) {
-    validateBinaryOp(Dst, Src0, Src1);
-    return buildInstr(Opcode).addDef(Dst).addUse(Src0).addUse(Src1);
-  }
-  using FoldableInstructionsBuilder<MachineIRBuilder>::buildInstr;
-  /// DAG like Generic method for building arbitrary instructions as above.
-  /// \Opc opcode for the instruction.
-  /// \Ty Either LLT/TargetRegisterClass/unsigned types for Dst
-  /// \Args Variadic list of uses of types(unsigned/MachineInstrBuilder)
-  /// Uses of type MachineInstrBuilder will perform
-  /// getOperand(0).getReg() to convert to register.
-  template <typename DstTy, typename... UseArgsTy>
-  MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty,
-                                 UseArgsTy &&... Args) {
-    auto MIB = buildInstr(Opc).addDef(getDestFromArg(Ty));
-    addUsesFromArgs(MIB, std::forward<UseArgsTy>(Args)...);
-    return MIB;
-  }
+  virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef<DstOp> DstOps,
+                                         ArrayRef<SrcOp> SrcOps,
+                                         Optional<unsigned> Flags = None);
 };
 
 } // End namespace llvm.
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index ac620e4b69c501353ebf0986ede0751aa775d135..9c918ae1104f9dbcec2e9acd0efe198a0755abe1 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -272,6 +272,13 @@ namespace ISD {
     /// resulting value is this minimum value.
     SSUBSAT, USUBSAT,
 
+    /// RESULT = SMULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on
+    /// 2 integers with the same width and scale. SCALE represents the scale of
+    /// both operands as fixed point numbers. This SCALE parameter must be a
+    /// constant integer. A scale of zero is effectively performing
+    /// multiplication on 2 integers.
+    SMULFIX,
+
     /// Simple binary floating point operators.
     FADD, FSUB, FMUL, FDIV, FREM,
 
@@ -394,9 +401,13 @@ namespace ISD {
     /// When the 1st operand is a vector, the shift amount must be in the same
     /// type. (TLI.getShiftAmountTy() will return the same type when the input
     /// type is a vector.)
-    /// For rotates, the shift amount is treated as an unsigned amount modulo
-    /// the element size of the first operand.
-    SHL, SRA, SRL, ROTL, ROTR,
+    /// For rotates and funnel shifts, the shift amount is treated as an unsigned
+    /// amount modulo the element size of the first operand.
+    ///
+    /// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount.
+    /// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+    /// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+    SHL, SRA, SRL, ROTL, ROTR, FSHL, FSHR,
 
     /// Byte Swap and Counting operators.
     BSWAP, CTTZ, CTLZ, CTPOP, BITREVERSE,
@@ -478,31 +489,33 @@ namespace ISD {
     /// in-register any-extension of the low lanes of an integer vector. The
     /// result type must have fewer elements than the operand type, and those
     /// elements must be larger integer types such that the total size of the
-    /// operand type and the result type match. Each of the low operand
-    /// elements is any-extended into the corresponding, wider result
-    /// elements with the high bits becoming undef.
+    /// operand type is less than or equal to the size of the result type. Each
+    /// of the low operand elements is any-extended into the corresponding,
+    /// wider result elements with the high bits becoming undef.
+    /// NOTE: The type legalizer prefers to make the operand and result size
+    /// the same to allow expansion to shuffle vector during op legalization.
     ANY_EXTEND_VECTOR_INREG,
 
     /// SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an
     /// in-register sign-extension of the low lanes of an integer vector. The
     /// result type must have fewer elements than the operand type, and those
     /// elements must be larger integer types such that the total size of the
-    /// operand type and the result type match. Each of the low operand
-    /// elements is sign-extended into the corresponding, wider result
-    /// elements.
-    // FIXME: The SIGN_EXTEND_INREG node isn't specifically limited to
-    // scalars, but it also doesn't handle vectors well. Either it should be
-    // restricted to scalars or this node (and its handling) should be merged
-    // into it.
+    /// operand type is less than or equal to the size of the result type. Each
+    /// of the low operand elements is sign-extended into the corresponding,
+    /// wider result elements.
+    /// NOTE: The type legalizer prefers to make the operand and result size
+    /// the same to allow expansion to shuffle vector during op legalization.
     SIGN_EXTEND_VECTOR_INREG,
 
     /// ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an
     /// in-register zero-extension of the low lanes of an integer vector. The
     /// result type must have fewer elements than the operand type, and those
     /// elements must be larger integer types such that the total size of the
-    /// operand type and the result type match. Each of the low operand
-    /// elements is zero-extended into the corresponding, wider result
-    /// elements.
+    /// operand type is less than or equal to the size of the result type. Each
+    /// of the low operand elements is zero-extended into the corresponding,
+    /// wider result elements.
+    /// NOTE: The type legalizer prefers to make the operand and result size
+    /// the same to allow expansion to shuffle vector during op legalization.
     ZERO_EXTEND_VECTOR_INREG,
 
     /// FP_TO_[US]INT - Convert a floating point value to a signed or unsigned
diff --git a/include/llvm/CodeGen/LinkAllAsmWriterComponents.h b/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
index c3046da90b8db83e2e95a0d5adf10880583caf14..38fcb37b1e69e97c13cb56704ac373b50eb7cb5b 100644
--- a/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
+++ b/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
@@ -15,7 +15,7 @@
 #ifndef LLVM_CODEGEN_LINKALLASMWRITERCOMPONENTS_H
 #define LLVM_CODEGEN_LINKALLASMWRITERCOMPONENTS_H
 
-#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/BuiltinGCs.h"
 #include <cstdlib>
 
 namespace {
diff --git a/include/llvm/CodeGen/LinkAllCodegenComponents.h b/include/llvm/CodeGen/LinkAllCodegenComponents.h
index fee131e4a3c6b73211796a3ac98e7e1edf9b0e66..18c13ca8f59877f6e8f1b2747e43d601d214c852 100644
--- a/include/llvm/CodeGen/LinkAllCodegenComponents.h
+++ b/include/llvm/CodeGen/LinkAllCodegenComponents.h
@@ -15,7 +15,7 @@
 #ifndef LLVM_CODEGEN_LINKALLCODEGENCOMPONENTS_H
 #define LLVM_CODEGEN_LINKALLCODEGENCOMPONENTS_H
 
-#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/BuiltinGCs.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/Target/TargetMachine.h"
@@ -36,11 +36,7 @@ namespace {
       (void) llvm::createGreedyRegisterAllocator();
       (void) llvm::createDefaultPBQPRegisterAllocator();
 
-      llvm::linkCoreCLRGC();
-      llvm::linkOcamlGC();
-      llvm::linkErlangGC();
-      llvm::linkShadowStackGC();
-      llvm::linkStatepointExampleGC();
+      llvm::linkAllBuiltinGCs();
 
       (void) llvm::createBURRListDAGScheduler(nullptr,
                                               llvm::CodeGenOpt::Default);
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 02b2f1be648a726d929b4595ebe5673c95c7175b..c2706a21a1778f1f1ec8bbf1ecaafd126ab99a2c 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -28,9 +28,14 @@ class AllocaInst;
 
 /// The CalleeSavedInfo class tracks the information need to locate where a
 /// callee saved register is in the current frame.
+/// Callee saved reg can also be saved to a different register rather than
+/// on the stack by setting DstReg instead of FrameIdx.
 class CalleeSavedInfo {
   unsigned Reg;
-  int FrameIdx;
+  union {
+    int FrameIdx;
+    unsigned DstReg;
+  };
   /// Flag indicating whether the register is actually restored in the epilog.
   /// In most cases, if a register is saved, it is also restored. There are
   /// some situations, though, when this is not the case. For example, the
@@ -44,17 +49,29 @@ class CalleeSavedInfo {
   /// by implicit uses on the return instructions, however, the required
   /// changes in the ARM backend would be quite extensive.
   bool Restored;
+  /// Flag indicating whether the register is spilled to stack or another
+  /// register.
+  bool SpilledToReg;
 
 public:
   explicit CalleeSavedInfo(unsigned R, int FI = 0)
-  : Reg(R), FrameIdx(FI), Restored(true) {}
+  : Reg(R), FrameIdx(FI), Restored(true), SpilledToReg(false) {}
 
   // Accessors.
   unsigned getReg()                        const { return Reg; }
   int getFrameIdx()                        const { return FrameIdx; }
-  void setFrameIdx(int FI)                       { FrameIdx = FI; }
+  unsigned getDstReg()                     const { return DstReg; }
+  void setFrameIdx(int FI) {
+    FrameIdx = FI;
+    SpilledToReg = false;
+  }
+  void setDstReg(unsigned SpillReg) {
+    DstReg = SpillReg;
+    SpilledToReg = true;
+  }
   bool isRestored()                        const { return Restored; }
   void setRestored(bool R)                       { Restored = R; }
+  bool isSpilledToReg()                    const { return SpilledToReg; }
 };
 
 /// The MachineFrameInfo class represents an abstract stack frame until
@@ -271,9 +288,9 @@ private:
   unsigned CVBytesOfCalleeSavedRegisters = 0;
 
   /// The prolog/epilog code inserter fills in this vector with each
-  /// callee saved register saved in the frame.  Beyond its use by the prolog/
-  /// epilog code inserter, this data used for debug info and exception
-  /// handling.
+  /// callee saved register saved in either the frame or a different
+  /// register.  Beyond its use by the prolog/ epilog code inserter,
+  /// this data is used for debug info and exception handling.
   std::vector<CalleeSavedInfo> CSInfo;
 
   /// Has CSInfo been set yet?
diff --git a/include/llvm/CodeGen/MachineOutliner.h b/include/llvm/CodeGen/MachineOutliner.h
index eaa741353abb0c4cb27298ac95bfe5bdeedd8dc5..bfd1e994053a75a0c4a8ccce324abbe752431a3b 100644
--- a/include/llvm/CodeGen/MachineOutliner.h
+++ b/include/llvm/CodeGen/MachineOutliner.h
@@ -61,9 +61,6 @@ public:
   /// \p OutlinedFunctions.
   unsigned FunctionIdx;
 
-  /// Set to false if the candidate overlapped with another candidate.
-  bool InCandidateList = true;
-
   /// Identifier denoting the instructions to emit to call an outlined function
   /// from this point. Defined by the target.
   unsigned CallConstructionID;
@@ -82,6 +79,12 @@ public:
   /// been used across the sequence.
   LiveRegUnits UsedInSequence;
 
+  /// Target-specific flags for this Candidate's MBB.
+  unsigned Flags = 0x0;
+
+  /// True if initLRU has been called on this Candidate.
+  bool LRUWasSet = false;
+
   /// Return the number of instructions in this Candidate.
   unsigned getLength() const { return Len; }
 
@@ -99,9 +102,7 @@ public:
   }
 
   /// Returns the call overhead of this candidate if it is in the list.
-  unsigned getCallOverhead() const {
-    return InCandidateList ? CallOverhead : 0;
-  }
+  unsigned getCallOverhead() const { return CallOverhead; }
 
   MachineBasicBlock::iterator &front() { return FirstInst; }
   MachineBasicBlock::iterator &back() { return LastInst; }
@@ -120,9 +121,9 @@ public:
   Candidate(unsigned StartIdx, unsigned Len,
             MachineBasicBlock::iterator &FirstInst,
             MachineBasicBlock::iterator &LastInst, MachineBasicBlock *MBB,
-            unsigned FunctionIdx)
+            unsigned FunctionIdx, unsigned Flags)
       : StartIdx(StartIdx), Len(Len), FirstInst(FirstInst), LastInst(LastInst),
-        MBB(MBB), FunctionIdx(FunctionIdx) {}
+        MBB(MBB), FunctionIdx(FunctionIdx), Flags(Flags) {}
   Candidate() {}
 
   /// Used to ensure that \p Candidates are outlined in an order that
@@ -138,6 +139,10 @@ public:
   void initLRU(const TargetRegisterInfo &TRI) {
     assert(MBB->getParent()->getRegInfo().tracksLiveness() &&
            "Candidate's Machine Function must track liveness");
+    // Only initialize once.
+    if (LRUWasSet)
+      return;
+    LRUWasSet = true;
     LRU.init(TRI);
     LRU.addLiveOuts(*MBB);
 
@@ -158,21 +163,13 @@ public:
 /// class of candidate.
 struct OutlinedFunction {
 
-private:
-  /// The number of candidates for this \p OutlinedFunction.
-  unsigned OccurrenceCount = 0;
-
 public:
-  std::vector<std::shared_ptr<Candidate>> Candidates;
+  std::vector<Candidate> Candidates;
 
   /// The actual outlined function created.
   /// This is initialized after we go through and create the actual function.
   MachineFunction *MF = nullptr;
 
-  /// The sequence of integers corresponding to the instructions in this
-  /// function.
-  std::vector<unsigned> Sequence;
-
   /// Represents the size of a sequence in bytes. (Some instructions vary
   /// widely in size, so just counting the instructions isn't very useful.)
   unsigned SequenceSize;
@@ -184,49 +181,41 @@ public:
   unsigned FrameConstructionID;
 
   /// Return the number of candidates for this \p OutlinedFunction.
-  unsigned getOccurrenceCount() { return OccurrenceCount; }
-
-  /// Decrement the occurrence count of this OutlinedFunction and return the
-  /// new count.
-  unsigned decrement() {
-    assert(OccurrenceCount > 0 && "Can't decrement an empty function!");
-    OccurrenceCount--;
-    return getOccurrenceCount();
-  }
+  unsigned getOccurrenceCount() const { return Candidates.size(); }
 
   /// Return the number of bytes it would take to outline this
   /// function.
-  unsigned getOutliningCost() {
+  unsigned getOutliningCost() const {
     unsigned CallOverhead = 0;
-    for (std::shared_ptr<Candidate> &C : Candidates)
-      CallOverhead += C->getCallOverhead();
+    for (const Candidate &C : Candidates)
+      CallOverhead += C.getCallOverhead();
     return CallOverhead + SequenceSize + FrameOverhead;
   }
 
   /// Return the size in bytes of the unoutlined sequences.
-  unsigned getNotOutlinedCost() { return OccurrenceCount * SequenceSize; }
+  unsigned getNotOutlinedCost() const {
+    return getOccurrenceCount() * SequenceSize;
+  }
 
   /// Return the number of instructions that would be saved by outlining
   /// this function.
-  unsigned getBenefit() {
+  unsigned getBenefit() const {
     unsigned NotOutlinedCost = getNotOutlinedCost();
     unsigned OutlinedCost = getOutliningCost();
     return (NotOutlinedCost < OutlinedCost) ? 0
                                             : NotOutlinedCost - OutlinedCost;
   }
 
-  OutlinedFunction(std::vector<Candidate> &Cands,
-                   unsigned SequenceSize, unsigned FrameOverhead,
-                   unsigned FrameConstructionID)
-      : SequenceSize(SequenceSize), FrameOverhead(FrameOverhead),
-        FrameConstructionID(FrameConstructionID) {
-    OccurrenceCount = Cands.size();
-    for (Candidate &C : Cands)
-      Candidates.push_back(std::make_shared<outliner::Candidate>(C));
-
-    unsigned B = getBenefit();
-    for (std::shared_ptr<Candidate> &C : Candidates)
-      C->Benefit = B;
+  /// Return the number of instructions in this sequence.
+  unsigned getNumInstrs() const { return Candidates[0].getLength(); }
+
+  OutlinedFunction(std::vector<Candidate> &Candidates, unsigned SequenceSize,
+                   unsigned FrameOverhead, unsigned FrameConstructionID)
+      : Candidates(Candidates), SequenceSize(SequenceSize),
+        FrameOverhead(FrameOverhead), FrameConstructionID(FrameConstructionID) {
+    const unsigned B = getBenefit();
+    for (Candidate &C : Candidates)
+      C.Benefit = B;
   }
 
   OutlinedFunction() {}
diff --git a/include/llvm/CodeGen/MachinePassRegistry.h b/include/llvm/CodeGen/MachinePassRegistry.h
index 3aba0bba7d1a3c8dfbfbca05052bde12aef97da8..a031c92d914f408193d430aed0480ab042e26c36 100644
--- a/include/llvm/CodeGen/MachinePassRegistry.h
+++ b/include/llvm/CodeGen/MachinePassRegistry.h
@@ -24,22 +24,20 @@
 
 namespace llvm {
 
-using MachinePassCtor = void *(*)();
-
 //===----------------------------------------------------------------------===//
 ///
 /// MachinePassRegistryListener - Listener to adds and removals of nodes in
 /// registration list.
 ///
 //===----------------------------------------------------------------------===//
-class MachinePassRegistryListener {
-  virtual void anchor();
+template <class PassCtorTy> class MachinePassRegistryListener {
+  virtual void anchor() {}
 
 public:
   MachinePassRegistryListener() = default;
   virtual ~MachinePassRegistryListener() = default;
 
-  virtual void NotifyAdd(StringRef N, MachinePassCtor C, StringRef D) = 0;
+  virtual void NotifyAdd(StringRef N, PassCtorTy C, StringRef D) = 0;
   virtual void NotifyRemove(StringRef N) = 0;
 };
 
@@ -48,15 +46,15 @@ public:
 /// MachinePassRegistryNode - Machine pass node stored in registration list.
 ///
 //===----------------------------------------------------------------------===//
-class MachinePassRegistryNode {
+template <typename PassCtorTy> class MachinePassRegistryNode {
 private:
   MachinePassRegistryNode *Next = nullptr; // Next function pass in list.
   StringRef Name;                       // Name of function pass.
   StringRef Description;                // Description string.
-  MachinePassCtor Ctor;                 // Function pass creator.
+  PassCtorTy Ctor;                      // Pass creator.
 
 public:
-  MachinePassRegistryNode(const char *N, const char *D, MachinePassCtor C)
+  MachinePassRegistryNode(const char *N, const char *D, PassCtorTy C)
       : Name(N), Description(D), Ctor(C) {}
 
   // Accessors
@@ -64,7 +62,7 @@ public:
   MachinePassRegistryNode **getNextAddress()    { return &Next; }
   StringRef getName()                   const { return Name; }
   StringRef getDescription()            const { return Description; }
-  MachinePassCtor getCtor()               const { return Ctor; }
+  PassCtorTy getCtor() const { return Ctor; }
   void setNext(MachinePassRegistryNode *N)      { Next = N; }
 };
 
@@ -73,11 +71,12 @@ public:
 /// MachinePassRegistry - Track the registration of machine passes.
 ///
 //===----------------------------------------------------------------------===//
-class MachinePassRegistry {
+template <typename PassCtorTy> class MachinePassRegistry {
 private:
-  MachinePassRegistryNode *List;        // List of registry nodes.
-  MachinePassCtor Default;              // Default function pass creator.
-  MachinePassRegistryListener *Listener; // Listener for list adds are removes.
+  MachinePassRegistryNode<PassCtorTy> *List; // List of registry nodes.
+  PassCtorTy Default;                        // Default function pass creator.
+  MachinePassRegistryListener<PassCtorTy>
+      *Listener; // Listener for list adds are removes.
 
 public:
   // NO CONSTRUCTOR - we don't want static constructor ordering to mess
@@ -85,19 +84,47 @@ public:
 
   // Accessors.
   //
-  MachinePassRegistryNode *getList()                    { return List; }
-  MachinePassCtor getDefault()                          { return Default; }
-  void setDefault(MachinePassCtor C)                    { Default = C; }
-  void setDefault(StringRef Name);
-  void setListener(MachinePassRegistryListener *L)      { Listener = L; }
+  MachinePassRegistryNode<PassCtorTy> *getList() { return List; }
+  PassCtorTy getDefault() { return Default; }
+  void setDefault(PassCtorTy C) { Default = C; }
+  /// setDefault - Set the default constructor by name.
+  void setDefault(StringRef Name) {
+    PassCtorTy Ctor = nullptr;
+    for (MachinePassRegistryNode<PassCtorTy> *R = getList(); R;
+         R = R->getNext()) {
+      if (R->getName() == Name) {
+        Ctor = R->getCtor();
+        break;
+      }
+    }
+    assert(Ctor && "Unregistered pass name");
+    setDefault(Ctor);
+  }
+  void setListener(MachinePassRegistryListener<PassCtorTy> *L) { Listener = L; }
 
   /// Add - Adds a function pass to the registration list.
   ///
-  void Add(MachinePassRegistryNode *Node);
+  void Add(MachinePassRegistryNode<PassCtorTy> *Node) {
+    Node->setNext(List);
+    List = Node;
+    if (Listener)
+      Listener->NotifyAdd(Node->getName(), Node->getCtor(),
+                          Node->getDescription());
+  }
 
   /// Remove - Removes a function pass from the registration list.
   ///
-  void Remove(MachinePassRegistryNode *Node);
+  void Remove(MachinePassRegistryNode<PassCtorTy> *Node) {
+    for (MachinePassRegistryNode<PassCtorTy> **I = &List; *I;
+         I = (*I)->getNextAddress()) {
+      if (*I == Node) {
+        if (Listener)
+          Listener->NotifyRemove(Node->getName());
+        *I = (*I)->getNext();
+        break;
+      }
+    }
+  }
 };
 
 //===----------------------------------------------------------------------===//
@@ -105,9 +132,11 @@ public:
 /// RegisterPassParser class - Handle the addition of new machine passes.
 ///
 //===----------------------------------------------------------------------===//
-template<class RegistryClass>
-class RegisterPassParser : public MachinePassRegistryListener,
-                   public cl::parser<typename RegistryClass::FunctionPassCtor> {
+template <class RegistryClass>
+class RegisterPassParser
+    : public MachinePassRegistryListener<
+          typename RegistryClass::FunctionPassCtor>,
+      public cl::parser<typename RegistryClass::FunctionPassCtor> {
 public:
   RegisterPassParser(cl::Option &O)
       : cl::parser<typename RegistryClass::FunctionPassCtor>(O) {}
@@ -129,8 +158,9 @@ public:
   }
 
   // Implement the MachinePassRegistryListener callbacks.
-  void NotifyAdd(StringRef N, MachinePassCtor C, StringRef D) override {
-    this->addLiteralOption(N, (typename RegistryClass::FunctionPassCtor)C, D);
+  void NotifyAdd(StringRef N, typename RegistryClass::FunctionPassCtor C,
+                 StringRef D) override {
+    this->addLiteralOption(N, C, D);
   }
   void NotifyRemove(StringRef N) override {
     this->removeLiteralOption(N);
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 6003b1b6b82c334ccc81fbc4c282e98ea79985bd..4bc31ae7c61aca6eecb0a7d2349f2c21852d0dbc 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -132,17 +132,19 @@ struct MachineSchedContext {
 
 /// MachineSchedRegistry provides a selection of available machine instruction
 /// schedulers.
-class MachineSchedRegistry : public MachinePassRegistryNode {
+class MachineSchedRegistry
+    : public MachinePassRegistryNode<
+          ScheduleDAGInstrs *(*)(MachineSchedContext *)> {
 public:
   using ScheduleDAGCtor = ScheduleDAGInstrs *(*)(MachineSchedContext *);
 
   // RegisterPassParser requires a (misnamed) FunctionPassCtor type.
   using FunctionPassCtor = ScheduleDAGCtor;
 
-  static MachinePassRegistry Registry;
+  static MachinePassRegistry<ScheduleDAGCtor> Registry;
 
   MachineSchedRegistry(const char *N, const char *D, ScheduleDAGCtor C)
-    : MachinePassRegistryNode(N, D, (MachinePassCtor)C) {
+      : MachinePassRegistryNode(N, D, C) {
     Registry.Add(this);
   }
 
@@ -158,7 +160,7 @@ public:
     return (MachineSchedRegistry *)Registry.getList();
   }
 
-  static void setListener(MachinePassRegistryListener *L) {
+  static void setListener(MachinePassRegistryListener<FunctionPassCtor> *L) {
     Registry.setListener(L);
   }
 };
@@ -792,7 +794,7 @@ public:
   /// Represent the type of SchedCandidate found within a single queue.
   /// pickNodeBidirectional depends on these listed by decreasing priority.
   enum CandReason : uint8_t {
-    NoCand, Only1, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak,
+    NoCand, Only1, PhysReg, RegExcess, RegCritical, Stall, Cluster, Weak,
     RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
     TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
 
@@ -926,7 +928,7 @@ bool tryPressure(const PressureChange &TryP,
                  const TargetRegisterInfo *TRI,
                  const MachineFunction &MF);
 unsigned getWeakLeft(const SUnit *SU, bool isTop);
-int biasPhysRegCopy(const SUnit *SU, bool isTop);
+int biasPhysReg(const SUnit *SU, bool isTop);
 
 /// GenericScheduler shrinks the unscheduled zone using heuristics to balance
 /// the schedule.
@@ -1004,7 +1006,7 @@ protected:
                          const RegPressureTracker &RPTracker,
                          SchedCandidate &Candidate);
 
-  void reschedulePhysRegCopies(SUnit *SU, bool isTop);
+  void reschedulePhysReg(SUnit *SU, bool isTop);
 };
 
 /// PostGenericScheduler - Interface to the scheduling algorithm used by
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index cb12b14f4435ef152edadf7789d435f36fe44f70..acf1ebb5bc8396a203e0dd501a5473412ecdde4c 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -379,14 +379,20 @@ namespace llvm {
   ///
   FunctionPass *createInterleavedAccessPass();
 
+  /// InterleavedLoadCombines Pass - This pass identifies interleaved loads and
+  /// combines them into wide loads detectable by InterleavedAccessPass
+  ///
+  FunctionPass *createInterleavedLoadCombinePass();
+
   /// LowerEmuTLS - This pass generates __emutls_[vt].xyz variables for all
   /// TLS variables for the emulated TLS model.
   ///
   ModulePass *createLowerEmuTLSPass();
 
-  /// This pass lowers the \@llvm.load.relative intrinsic to instructions.
-  /// This is unsafe to do earlier because a pass may combine the constant
-  /// initializer into the load, which may result in an overflowing evaluation.
+  /// This pass lowers the \@llvm.load.relative and \@llvm.objc.* intrinsics to
+  /// instructions.  This is unsafe to do earlier because a pass may combine the
+  /// constant initializer into the load, which may result in an overflowing
+  /// evaluation.
   ModulePass *createPreISelIntrinsicLoweringPass();
 
   /// GlobalMerge - This pass merges internal (by default) globals into structs
diff --git a/include/llvm/CodeGen/PreISelIntrinsicLowering.h b/include/llvm/CodeGen/PreISelIntrinsicLowering.h
index 7a007eb8bceac52c9be510d757bc272ae8c59998..b7f83e515b7e95acc0c510609bd1d1fbe6ca0b40 100644
--- a/include/llvm/CodeGen/PreISelIntrinsicLowering.h
+++ b/include/llvm/CodeGen/PreISelIntrinsicLowering.h
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass implements IR lowering for the llvm.load.relative intrinsic.
+// This pass implements IR lowering for the llvm.load.relative and llvm.objc.*
+// intrinsics.
 //
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CODEGEN_PREISELINTRINSICLOWERING_H
diff --git a/include/llvm/CodeGen/RegAllocRegistry.h b/include/llvm/CodeGen/RegAllocRegistry.h
index 481747dc163eea6a997e9b6e82b62580fc429c33..b518fbb9c9da66f677b581d8291cac198f4a05b0 100644
--- a/include/llvm/CodeGen/RegAllocRegistry.h
+++ b/include/llvm/CodeGen/RegAllocRegistry.h
@@ -26,14 +26,14 @@ class FunctionPass;
 /// RegisterRegAlloc class - Track the registration of register allocators.
 ///
 //===----------------------------------------------------------------------===//
-class RegisterRegAlloc : public MachinePassRegistryNode {
+class RegisterRegAlloc : public MachinePassRegistryNode<FunctionPass *(*)()> {
 public:
   using FunctionPassCtor = FunctionPass *(*)();
 
-  static MachinePassRegistry Registry;
+  static MachinePassRegistry<FunctionPassCtor> Registry;
 
   RegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-      : MachinePassRegistryNode(N, D, (MachinePassCtor)C) {
+      : MachinePassRegistryNode(N, D, C) {
     Registry.Add(this);
   }
 
@@ -48,15 +48,11 @@ public:
     return (RegisterRegAlloc *)Registry.getList();
   }
 
-  static FunctionPassCtor getDefault() {
-    return (FunctionPassCtor)Registry.getDefault();
-  }
+  static FunctionPassCtor getDefault() { return Registry.getDefault(); }
 
-  static void setDefault(FunctionPassCtor C) {
-    Registry.setDefault((MachinePassCtor)C);
-  }
+  static void setDefault(FunctionPassCtor C) { Registry.setDefault(C); }
 
-  static void setListener(MachinePassRegistryListener *L) {
+  static void setListener(MachinePassRegistryListener<FunctionPassCtor> *L) {
     Registry.setListener(L);
   }
 };
diff --git a/include/llvm/CodeGen/SchedulerRegistry.h b/include/llvm/CodeGen/SchedulerRegistry.h
index badf927d0e95618b7f5c091649f01f24e0893045..fbe559f25556c86f09999b82601622c309228382 100644
--- a/include/llvm/CodeGen/SchedulerRegistry.h
+++ b/include/llvm/CodeGen/SchedulerRegistry.h
@@ -29,16 +29,19 @@ namespace llvm {
 class ScheduleDAGSDNodes;
 class SelectionDAGISel;
 
-class RegisterScheduler : public MachinePassRegistryNode {
+class RegisterScheduler
+    : public MachinePassRegistryNode<
+          ScheduleDAGSDNodes *(*)(SelectionDAGISel *, CodeGenOpt::Level)> {
 public:
   using FunctionPassCtor = ScheduleDAGSDNodes *(*)(SelectionDAGISel*,
                                                    CodeGenOpt::Level);
 
-  static MachinePassRegistry Registry;
+  static MachinePassRegistry<FunctionPassCtor> Registry;
 
   RegisterScheduler(const char *N, const char *D, FunctionPassCtor C)
-  : MachinePassRegistryNode(N, D, (MachinePassCtor)C)
-  { Registry.Add(this); }
+      : MachinePassRegistryNode(N, D, C) {
+    Registry.Add(this);
+  }
   ~RegisterScheduler() { Registry.Remove(this); }
 
 
@@ -51,7 +54,7 @@ public:
     return (RegisterScheduler *)Registry.getList();
   }
 
-  static void setListener(MachinePassRegistryListener *L) {
+  static void setListener(MachinePassRegistryListener<FunctionPassCtor> *L) {
     Registry.setListener(L);
   }
 };
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 3b144b92e2a6eb93499429e32741c2f93ec2d911..8093fdab5491eb10787fb091b0962022c09abdcc 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -929,41 +929,45 @@ public:
                           Type *SizeTy, unsigned ElemSz, bool isTailCall,
                           MachinePointerInfo DstPtrInfo);
 
-  /// Helper function to make it easier to build SetCC's if you just
-  /// have an ISD::CondCode instead of an SDValue.
-  ///
+  /// Helper function to make it easier to build SetCC's if you just have an
+  /// ISD::CondCode instead of an SDValue.
   SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS,
                    ISD::CondCode Cond) {
     assert(LHS.getValueType().isVector() == RHS.getValueType().isVector() &&
-      "Cannot compare scalars to vectors");
+           "Cannot compare scalars to vectors");
     assert(LHS.getValueType().isVector() == VT.isVector() &&
-      "Cannot compare scalars to vectors");
+           "Cannot compare scalars to vectors");
     assert(Cond != ISD::SETCC_INVALID &&
-        "Cannot create a setCC of an invalid node.");
+           "Cannot create a setCC of an invalid node.");
     return getNode(ISD::SETCC, DL, VT, LHS, RHS, getCondCode(Cond));
   }
 
-  /// Helper function to make it easier to build Select's if you just
-  /// have operands and don't want to check for vector.
+  /// Helper function to make it easier to build Select's if you just have
+  /// operands and don't want to check for vector.
   SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
                     SDValue RHS) {
     assert(LHS.getValueType() == RHS.getValueType() &&
            "Cannot use select on differing types");
     assert(VT.isVector() == LHS.getValueType().isVector() &&
            "Cannot mix vectors and scalars");
-    return getNode(Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
-                   Cond, LHS, RHS);
+    auto Opcode = Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT;
+    return getNode(Opcode, DL, VT, Cond, LHS, RHS);
   }
 
-  /// Helper function to make it easier to build SelectCC's if you
-  /// just have an ISD::CondCode instead of an SDValue.
-  ///
+  /// Helper function to make it easier to build SelectCC's if you just have an
+  /// ISD::CondCode instead of an SDValue.
   SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True,
                       SDValue False, ISD::CondCode Cond) {
-    return getNode(ISD::SELECT_CC, DL, True.getValueType(),
-                   LHS, RHS, True, False, getCondCode(Cond));
+    return getNode(ISD::SELECT_CC, DL, True.getValueType(), LHS, RHS, True,
+                   False, getCondCode(Cond));
   }
 
+  /// Try to simplify a select/vselect into 1 of its operands or a constant.
+  SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal);
+
+  /// Try to simplify a shift into 1 of its operands or a constant.
+  SDValue simplifyShift(SDValue X, SDValue Y);
+
   /// VAArg produces a result and token chain, and takes a pointer
   /// and a source value as input.
   SDValue getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
@@ -1511,6 +1515,18 @@ public:
   /// allow an 'add' to be transformed into an 'or'.
   bool haveNoCommonBitsSet(SDValue A, SDValue B) const;
 
+  /// Test whether \p V has a splatted value for all the demanded elements.
+  ///
+  /// On success \p UndefElts will indicate the elements that have UNDEF
+  /// values instead of the splat value, this is only guaranteed to be correct
+  /// for \p DemandedElts.
+  ///
+  /// NOTE: The function will return true for a demanded splat of UNDEF values.
+  bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts);
+
+  /// Test whether \p V has a splatted value.
+  bool isSplatValue(SDValue V, bool AllowUndefs = false);
+
   /// Match a binop + shuffle pyramid that represents a horizontal reduction
   /// over the elements of a vector starting from the EXTRACT_VECTOR_ELT node /p
   /// Extract. The reduction must use one of the opcodes listed in /p
diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
index 86df0af7303fe4f536cb86db045da26dc339504f..6758c55c696afa4b8030b177d3b0d82acb1ce849 100644
--- a/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/include/llvm/CodeGen/SelectionDAGISel.h
@@ -132,6 +132,7 @@ public:
     OPC_CheckChild2Same, OPC_CheckChild3Same,
     OPC_CheckPatternPredicate,
     OPC_CheckPredicate,
+    OPC_CheckPredicateWithOperands,
     OPC_CheckOpcode,
     OPC_SwitchOpcode,
     OPC_CheckType,
@@ -267,6 +268,17 @@ public:
     llvm_unreachable("Tblgen should generate the implementation of this!");
   }
 
+  /// CheckNodePredicateWithOperands - This function is generated by tblgen in
+  /// the target.
+  /// It runs node predicate number PredNo and returns true if it succeeds or
+  /// false if it fails.  The number is a private implementation detail to the
+  /// code tblgen produces.
+  virtual bool CheckNodePredicateWithOperands(
+      SDNode *N, unsigned PredNo,
+      const SmallVectorImpl<SDValue> &Operands) const {
+    llvm_unreachable("Tblgen should generate the implementation of this!");
+  }
+
   virtual bool CheckComplexPattern(SDNode *Root, SDNode *Parent, SDValue N,
                                    unsigned PatternNo,
                         SmallVectorImpl<std::pair<SDValue, SDNode*> > &Result) {
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index d125e888a5742d145f10ea43ec8046e917dcc078..2931717b72deb1dafe45c442a5e904a57ea84fc8 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1613,6 +1613,21 @@ ConstantSDNode *isConstOrConstSplat(SDValue N, bool AllowUndefs = false);
 /// Returns the SDNode if it is a constant splat BuildVector or constant float.
 ConstantFPSDNode *isConstOrConstSplatFP(SDValue N, bool AllowUndefs = false);
 
+/// Return true if the value is a constant 0 integer or a splatted vector of
+/// a constant 0 integer (with no undefs).
+/// Build vector implicit truncation is not an issue for null values.
+bool isNullOrNullSplat(SDValue V);
+
+/// Return true if the value is a constant 1 integer or a splatted vector of a
+/// constant 1 integer (with no undefs).
+/// Does not permit build vector implicit truncation.
+bool isOneOrOneSplat(SDValue V);
+
+/// Return true if the value is a constant -1 integer or a splatted vector of a
+/// constant -1 integer (with no undefs).
+/// Does not permit build vector implicit truncation.
+bool isAllOnesOrAllOnesSplat(SDValue V);
+
 class GlobalAddressSDNode : public SDNode {
   friend class SelectionDAG;
 
diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index e584a4136e4fb39fe9e02a51c7f2afb59fea1bc7..8be9ae378557eafe01c4321e0b0d6c4dd0e98146 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -236,25 +236,6 @@ public:
     FnInfos.clear();
   }
 
-  /// Generate a stackmap record for a stackmap instruction.
-  ///
-  /// MI must be a raw STACKMAP, not a PATCHPOINT.
-  void recordStackMap(const MachineInstr &MI);
-
-  /// Generate a stackmap record for a patchpoint instruction.
-  void recordPatchPoint(const MachineInstr &MI);
-
-  /// Generate a stackmap record for a statepoint instruction.
-  void recordStatepoint(const MachineInstr &MI);
-
-  /// If there is any stack map data, create a stack map section and serialize
-  /// the map info into it. This clears the stack map data structures
-  /// afterwards.
-  void serializeToStackMapSection();
-
-private:
-  static const char *WSMP;
-
   using LocationVec = SmallVector<Location, 8>;
   using LiveOutVec = SmallVector<LiveOutReg, 8>;
   using ConstantPool = MapVector<uint64_t, uint64_t>;
@@ -283,6 +264,31 @@ private:
   using FnInfoMap = MapVector<const MCSymbol *, FunctionInfo>;
   using CallsiteInfoList = std::vector<CallsiteInfo>;
 
+  /// Generate a stackmap record for a stackmap instruction.
+  ///
+  /// MI must be a raw STACKMAP, not a PATCHPOINT.
+  void recordStackMap(const MachineInstr &MI);
+
+  /// Generate a stackmap record for a patchpoint instruction.
+  void recordPatchPoint(const MachineInstr &MI);
+
+  /// Generate a stackmap record for a statepoint instruction.
+  void recordStatepoint(const MachineInstr &MI);
+
+  /// If there is any stack map data, create a stack map section and serialize
+  /// the map info into it. This clears the stack map data structures
+  /// afterwards.
+  void serializeToStackMapSection();
+
+  /// Get call site info.
+  CallsiteInfoList &getCSInfos() { return CSInfos; }
+
+  /// Get function info.
+  FnInfoMap &getFnInfos() { return FnInfos; }
+
+private:
+  static const char *WSMP;
+
   AsmPrinter &AP;
   CallsiteInfoList CSInfos;
   ConstantPool ConstPool;
diff --git a/include/llvm/CodeGen/TargetInstrInfo.h b/include/llvm/CodeGen/TargetInstrInfo.h
index 1fa3de421eacffa5b6eda502c1a192dea92302dc..961b90e9bc12d2e10f1537c53a38c786256f439c 100644
--- a/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1136,11 +1136,11 @@ public:
     return false;
   }
 
-  /// Get the base register and byte offset of an instruction that reads/writes
+  /// Get the base operand and byte offset of an instruction that reads/writes
   /// memory.
-  virtual bool getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
-                                     int64_t &Offset,
-                                     const TargetRegisterInfo *TRI) const {
+  virtual bool getMemOperandWithOffset(MachineInstr &MI,
+                                       MachineOperand *&BaseOp, int64_t &Offset,
+                                       const TargetRegisterInfo *TRI) const {
     return false;
   }
 
@@ -1164,8 +1164,8 @@ public:
   /// or
   ///   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   /// to TargetPassConfig::createMachineScheduler() to have an effect.
-  virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
-                                   MachineInstr &SecondLdSt, unsigned BaseReg2,
+  virtual bool shouldClusterMemOps(MachineOperand &BaseOp1,
+                                   MachineOperand &BaseOp2,
                                    unsigned NumLoads) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
@@ -1635,10 +1635,11 @@ public:
         "Target didn't implement TargetInstrInfo::getOutliningType!");
   }
 
-  /// Returns target-defined flags defining properties of the MBB for
-  /// the outliner.
-  virtual unsigned getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const {
-    return 0x0;
+  /// Optional target hook that returns true if \p MBB is safe to outline from,
+  /// and returns any target-specific information in \p Flags.
+  virtual bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
+                                      unsigned &Flags) const {
+    return true;
   }
 
   /// Insert a custom frame for outlined functions.
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 38e575b1360fb1e6d38eec123d348ef058d8468b..d4e6da80a4c8f8cc23ec2a7e888d3e7a0daadbc6 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -269,6 +269,14 @@ public:
     return true;
   }
 
+  /// Return true if it is profitable to convert a select of FP constants into
+  /// a constant pool load whose address depends on the select condition. The
+  /// parameter may be used to differentiate a select with FP compare from
+  /// integer compare.
+  virtual bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
+    return true;
+  }
+
   /// Return true if multiple condition registers are available.
   bool hasMultipleConditionRegisters() const {
     return HasMultipleConditionRegisters;
@@ -797,6 +805,38 @@ public:
     return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op];
   }
 
+  /// Custom method defined by each target to indicate if an operation which
+  /// may require a scale is supported natively by the target.
+  /// If not, the operation is illegal.
+  virtual bool isSupportedFixedPointOperation(unsigned Op, EVT VT,
+                                              unsigned Scale) const {
+    return false;
+  }
+
+  /// Some fixed point operations may be natively supported by the target but
+  /// only for specific scales. This method allows for checking
+  /// if the width is supported by the target for a given operation that may
+  /// depend on scale.
+  LegalizeAction getFixedPointOperationAction(unsigned Op, EVT VT,
+                                              unsigned Scale) const {
+    auto Action = getOperationAction(Op, VT);
+    if (Action != Legal)
+      return Action;
+
+    // This operation is supported in this type but may only work on specific
+    // scales.
+    bool Supported;
+    switch (Op) {
+    default:
+      llvm_unreachable("Unexpected fixed point operation.");
+    case ISD::SMULFIX:
+      Supported = isSupportedFixedPointOperation(Op, VT, Scale);
+      break;
+    }
+
+    return Supported ? Action : Expand;
+  }
+
   LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const {
     unsigned EqOpc;
     switch (Op) {
@@ -1213,13 +1253,15 @@ public:
   /// reduce runtime.
   virtual bool ShouldShrinkFPConstant(EVT) const { return true; }
 
-  // Return true if it is profitable to reduce the given load node to a smaller
-  // type.
-  //
-  // e.g. (i16 (trunc (i32 (load x))) -> i16 load x should be performed
-  virtual bool shouldReduceLoadWidth(SDNode *Load,
-                                     ISD::LoadExtType ExtTy,
+  /// Return true if it is profitable to reduce a load to a smaller type.
+  /// Example: (i16 (trunc (i32 (load x))) -> i16 load x
+  virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
                                      EVT NewVT) const {
+    // By default, assume that it is cheaper to extract a subvector from a wide
+    // vector load rather than creating multiple narrow vector loads.
+    if (NewVT.isVector() && !Load->hasOneUse())
+      return false;
+
     return true;
   }
 
@@ -1736,6 +1778,16 @@ public:
     return false;
   }
 
+  /// Return true if it is more correct/profitable to use strict FP_TO_INT
+  /// conversion operations - canonicalizing the FP source value instead of
+  /// converting all cases and then selecting based on value.
+  /// This may be true if the target throws exceptions for out of bounds
+  /// conversions or has fast FP CMOV.
+  virtual bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
+                                        bool IsSigned) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
@@ -2210,6 +2262,12 @@ public:
     return false;
   }
 
+  /// Return true if sign-extension from FromTy to ToTy is cheaper than
+  /// zero-extension.
+  virtual bool isSExtCheaperThanZExt(EVT FromTy, EVT ToTy) const {
+    return false;
+  }
+
   /// Return true if the target supplies and combines to a paired load
   /// two loaded values of type LoadedType next to each other in memory.
   /// RequiredAlignment gives the minimal alignment constraints that must be met
@@ -2841,22 +2899,28 @@ public:
   bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx, const APInt &Demanded,
                             DAGCombinerInfo &DCI, TargetLoweringOpt &TLO) const;
 
-  /// Look at Op.  At this point, we know that only the DemandedMask bits of the
+  /// Look at Op.  At this point, we know that only the DemandedBits bits of the
   /// result of Op are ever used downstream.  If we can use this information to
   /// simplify Op, create a new simplified DAG node and return true, returning
   /// the original and new nodes in Old and New.  Otherwise, analyze the
   /// expression and return a mask of KnownOne and KnownZero bits for the
   /// expression (used to simplify the caller).  The KnownZero/One bits may only
-  /// be accurate for those bits in the DemandedMask.
+  /// be accurate for those bits in the Demanded masks.
   /// \p AssumeSingleUse When this parameter is true, this function will
   ///    attempt to simplify \p Op even if there are multiple uses.
   ///    Callers are responsible for correctly updating the DAG based on the
   ///    results of this function, because simply replacing replacing TLO.Old
   ///    with TLO.New will be incorrect when this parameter is true and TLO.Old
   ///    has multiple uses.
-  bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
-                            KnownBits &Known,
-                            TargetLoweringOpt &TLO,
+  bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+                            const APInt &DemandedElts, KnownBits &Known,
+                            TargetLoweringOpt &TLO, unsigned Depth = 0,
+                            bool AssumeSingleUse = false) const;
+
+  /// Helper wrapper around SimplifyDemandedBits, demanding all elements.
+  /// Adds Op back to the worklist upon success.
+  bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+                            KnownBits &Known, TargetLoweringOpt &TLO,
                             unsigned Depth = 0,
                             bool AssumeSingleUse = false) const;
 
@@ -2927,13 +2991,14 @@ public:
       SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
       APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
 
-  /// Attempt to simplify any target nodes based on the demanded bits,
+  /// Attempt to simplify any target nodes based on the demanded bits/elts,
   /// returning true on success. Otherwise, analyze the
   /// expression and return a mask of KnownOne and KnownZero bits for the
   /// expression (used to simplify the caller).  The KnownZero/One bits may only
-  /// be accurate for those bits in the DemandedMask.
+  /// be accurate for those bits in the Demanded masks.
   virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op,
                                                  const APInt &DemandedBits,
+                                                 const APInt &DemandedElts,
                                                  KnownBits &Known,
                                                  TargetLoweringOpt &TLO,
                                                  unsigned Depth = 0) const;
@@ -3663,6 +3728,18 @@ public:
                  SDValue LL = SDValue(), SDValue LH = SDValue(),
                  SDValue RL = SDValue(), SDValue RH = SDValue()) const;
 
+  /// Expand funnel shift.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandFunnelShift(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
+  /// Expand rotations.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandROT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Expand float(f32) to SINT(i64) conversion
   /// \param N Node to expand
   /// \param Result output after conversion
@@ -3743,10 +3820,15 @@ public:
                                   SDValue Index) const;
 
   /// Method for building the DAG expansion of ISD::[US][ADD|SUB]SAT. This
-  /// method accepts integers or vectors of integers as its arguments.
+  /// method accepts integers as its arguments.
   SDValue getExpandedSaturationAdditionSubtraction(SDNode *Node,
                                                    SelectionDAG &DAG) const;
 
+  /// Method for building the DAG expansion of ISD::SMULFIX. This method accepts
+  /// integers as its arguments.
+  SDValue getExpandedFixedPointMultiplication(SDNode *Node,
+                                              SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index f5c7fc824ab47864ec11bbbaebfede5745be5b8f..052d1f8bc6863c4f65c82b90776cd6fc22d651f8 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -90,6 +90,8 @@ public:
   const MCExpr *lowerRelativeReference(const GlobalValue *LHS,
                                        const GlobalValue *RHS,
                                        const TargetMachine &TM) const override;
+
+  MCSection *getSectionForCommandLines() const override;
 };
 
 class TargetLoweringObjectFileMachO : public TargetLoweringObjectFile {
diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h
index 7fda8751d40ae9a296b0ca177c37f29f5ac854db..3288711a335d2a8ef31fe4ea307a8ba52565e62d 100644
--- a/include/llvm/CodeGen/TargetPassConfig.h
+++ b/include/llvm/CodeGen/TargetPassConfig.h
@@ -90,6 +90,19 @@ private:
   AnalysisID StartAfter = nullptr;
   AnalysisID StopBefore = nullptr;
   AnalysisID StopAfter = nullptr;
+
+  unsigned StartBeforeInstanceNum = 0;
+  unsigned StartBeforeCount = 0;
+
+  unsigned StartAfterInstanceNum = 0;
+  unsigned StartAfterCount = 0;
+
+  unsigned StopBeforeInstanceNum = 0;
+  unsigned StopBeforeCount = 0;
+
+  unsigned StopAfterInstanceNum = 0;
+  unsigned StopAfterCount = 0;
+
   bool Started = true;
   bool Stopped = false;
   bool AddingMachinePasses = false;
diff --git a/include/llvm/CodeGen/WasmEHFuncInfo.h b/include/llvm/CodeGen/WasmEHFuncInfo.h
index aa979349edfd01a493f81148f487267676d7b192..219fff988f6e0f974bfc7ad4e497d713ffc28ded 100644
--- a/include/llvm/CodeGen/WasmEHFuncInfo.h
+++ b/include/llvm/CodeGen/WasmEHFuncInfo.h
@@ -21,6 +21,8 @@
 
 namespace llvm {
 
+enum EventTag { CPP_EXCEPTION = 0, C_LONGJMP = 1 };
+
 using BBOrMBB = PointerUnion<const BasicBlock *, MachineBasicBlock *>;
 
 struct WasmEHFuncInfo {
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index aece421b77d28b032cc1b1f6d5bd9d6c9d189b22..03bbd74d6d325e1b42b095801eb9e2e3e7521c68 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -208,6 +208,12 @@
 /* Define to 1 if you have the <sys/time.h> header file. */
 #cmakedefine HAVE_SYS_TIME_H ${HAVE_SYS_TIME_H}
 
+/* Define to 1 if stat struct has st_mtimespec member .*/
+#cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC}
+
+/* Define to 1 if stat struct has st_mtim member. */
+#cmakedefine HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC}
+
 /* Define to 1 if you have the <sys/types.h> header file. */
 #cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H}
 
diff --git a/include/llvm/DebugInfo/CodeView/CVRecord.h b/include/llvm/DebugInfo/CodeView/CVRecord.h
index 9dbeb438f4ae24d83a441cedf33240bc2213a7af..11ca9ff108de731553c6f581435b1b5a6cdde63a 100644
--- a/include/llvm/DebugInfo/CodeView/CVRecord.h
+++ b/include/llvm/DebugInfo/CodeView/CVRecord.h
@@ -45,13 +45,8 @@ public:
     return RecordData.drop_front(sizeof(RecordPrefix));
   }
 
-  Optional<uint32_t> hash() const { return Hash; }
-
-  void setHash(uint32_t Value) { Hash = Value; }
-
   Kind Type;
   ArrayRef<uint8_t> RecordData;
-  Optional<uint32_t> Hash;
 };
 
 template <typename Kind> struct RemappedRecord {
diff --git a/include/llvm/DebugInfo/CodeView/CodeView.h b/include/llvm/DebugInfo/CodeView/CodeView.h
index 4b96bc14ac1739a819bb2080df6c5cb479547823..8e0d9f608e93758161885e541fc5c9cc2f8123c4 100644
--- a/include/llvm/DebugInfo/CodeView/CodeView.h
+++ b/include/llvm/DebugInfo/CodeView/CodeView.h
@@ -358,7 +358,9 @@ enum class PointerOptions : uint32_t {
   Const = 0x00000400,
   Unaligned = 0x00000800,
   Restrict = 0x00001000,
-  WinRTSmartPointer = 0x00080000
+  WinRTSmartPointer = 0x00080000,
+  LValueRefThisPointer = 0x00100000,
+  RValueRefThisPointer = 0x00200000
 };
 CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(PointerOptions)
 
diff --git a/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h b/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h
index f74120a53608650c216fd3d4e41b80994a85ae9f..847d93f0e98585a8a4a8eb543b023400fc78bfd1 100644
--- a/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h
+++ b/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h
@@ -13,6 +13,7 @@
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -31,10 +32,10 @@ public:
   FixedStreamArray<FrameData>::Iterator begin() const { return Frames.begin(); }
   FixedStreamArray<FrameData>::Iterator end() const { return Frames.end(); }
 
-  const uint32_t *getRelocPtr() const { return RelocPtr; }
+  const support::ulittle32_t *getRelocPtr() const { return RelocPtr; }
 
 private:
-  const uint32_t *RelocPtr = nullptr;
+  const support::ulittle32_t *RelocPtr = nullptr;
   FixedStreamArray<FrameData> Frames;
 };
 
diff --git a/include/llvm/DebugInfo/CodeView/RecordSerialization.h b/include/llvm/DebugInfo/CodeView/RecordSerialization.h
index 58449c2c7565b9f8905d880d26d282afdc0502ed..36237e1a4d9e426a8bb1f995340b2d6df0db0b98 100644
--- a/include/llvm/DebugInfo/CodeView/RecordSerialization.h
+++ b/include/llvm/DebugInfo/CodeView/RecordSerialization.h
@@ -180,26 +180,6 @@ template <typename T> serialize_numeric_impl<T> serialize_numeric(T &Item) {
   return serialize_numeric_impl<T>(Item);
 }
 
-// This field is only present in the byte record if the condition is true.  The
-// condition is evaluated lazily, so it can depend on items that were
-// deserialized
-// earlier.
-#define CV_CONDITIONAL_FIELD(I, C)                                             \
-  serialize_conditional(I, [&]() { return !!(C); })
-
-// This is an array of N items, where N is evaluated lazily, so it can refer
-// to a field deserialized earlier.
-#define CV_ARRAY_FIELD_N(I, N) serialize_array(I, [&]() { return N; })
-
-// This is an array that exhausts the remainder of the input buffer.
-#define CV_ARRAY_FIELD_TAIL(I) serialize_array_tail(I)
-
-// This is an array that consumes null terminated strings until a double null
-// is encountered.
-#define CV_STRING_ARRAY_NULL_TERM(I) serialize_null_term_string_array(I)
-
-#define CV_NUMERIC_FIELD(I) serialize_numeric(I)
-
 template <typename T, typename U>
 Error consume(BinaryStreamReader &Reader,
               const serialize_conditional_impl<T, U> &Item) {
@@ -242,9 +222,6 @@ Error consume(BinaryStreamReader &Reader, T &&X, U &&Y, Args &&... Rest) {
   return consume(Reader, Y, std::forward<Args>(Rest)...);
 }
 
-#define CV_DESERIALIZE(...)                                                    \
-  if (auto EC = consume(__VA_ARGS__))                                          \
-    return std::move(EC);
 }
 }
 
diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h
index c63fb9871909ec40c9772861fea3524f73254e61..b58825c4a7887a90b54a13edd15ce7d3e87e8998 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h
@@ -400,6 +400,7 @@ public:
   uint16_t Module;
   StringRef Name;
 
+  uint16_t modi() const { return Module - 1; }
   uint32_t RecordOffset;
 };
 
diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h b/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..3713fe118eaa9843e100ae60666d58dba4afad41
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h
@@ -0,0 +1,62 @@
+//===- SymbolRecordHelpers.h ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_SYMBOLRECORDHELPERS_H
+#define LLVM_DEBUGINFO_CODEVIEW_SYMBOLRECORDHELPERS_H
+
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+
+namespace llvm {
+namespace codeview {
+/// Return true if this symbol opens a scope. This implies that the symbol has
+/// "parent" and "end" fields, which contain the offset of the S_END or
+/// S_INLINESITE_END record.
+inline bool symbolOpensScope(SymbolKind Kind) {
+  switch (Kind) {
+  case SymbolKind::S_GPROC32:
+  case SymbolKind::S_LPROC32:
+  case SymbolKind::S_LPROC32_ID:
+  case SymbolKind::S_GPROC32_ID:
+  case SymbolKind::S_BLOCK32:
+  case SymbolKind::S_SEPCODE:
+  case SymbolKind::S_THUNK32:
+  case SymbolKind::S_INLINESITE:
+  case SymbolKind::S_INLINESITE2:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
+/// Return true if this ssymbol ends a scope.
+inline bool symbolEndsScope(SymbolKind Kind) {
+  switch (Kind) {
+  case SymbolKind::S_END:
+  case SymbolKind::S_PROC_ID_END:
+  case SymbolKind::S_INLINESITE_END:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
+/// Given a symbol P for which symbolOpensScope(P) == true, return the
+/// corresponding end offset.
+uint32_t getScopeEndOffset(const CVSymbol &Symbol);
+uint32_t getScopeParentOffset(const CVSymbol &Symbol);
+
+CVSymbolArray limitSymbolArrayToScope(const CVSymbolArray &Symbols,
+                                      uint32_t ScopeBegin);
+
+} // namespace codeview
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h
index 76f1f98ab660ae678ed5e4a50877eed37212aa24..e7b5293dc51a419dfe9fee3f59fb9d8faa8dc153 100644
--- a/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h
@@ -264,14 +264,18 @@ public:
 // LF_POINTER
 class PointerRecord : public TypeRecord {
 public:
+  // ---------------------------XXXXX
   static const uint32_t PointerKindShift = 0;
   static const uint32_t PointerKindMask = 0x1F;
 
+  // ------------------------XXX-----
   static const uint32_t PointerModeShift = 5;
   static const uint32_t PointerModeMask = 0x07;
 
-  static const uint32_t PointerOptionMask = 0xFF;
+  // ----------XXX------XXXXX--------
+  static const uint32_t PointerOptionMask = 0x381f00;
 
+  // -------------XXXXXX------------
   static const uint32_t PointerSizeShift = 13;
   static const uint32_t PointerSizeMask = 0xFF;
 
@@ -305,7 +309,7 @@ public:
   }
 
   PointerOptions getOptions() const {
-    return static_cast<PointerOptions>(Attrs);
+    return static_cast<PointerOptions>(Attrs & PointerOptionMask);
   }
 
   uint8_t getSize() const {
@@ -334,6 +338,14 @@ public:
     return !!(Attrs & uint32_t(PointerOptions::Restrict));
   }
 
+  bool isLValueReferenceThisPtr() const {
+    return !!(Attrs & uint32_t(PointerOptions::LValueRefThisPointer));
+  }
+
+  bool isRValueReferenceThisPtr() const {
+    return !!(Attrs & uint32_t(PointerOptions::RValueRefThisPointer));
+  }
+
   TypeIndex ReferentType;
   uint32_t Attrs;
   Optional<MemberPointerInfo> MemberInfo;
@@ -429,6 +441,10 @@ public:
     return (Options & ClassOptions::ForwardReference) != ClassOptions::None;
   }
 
+  bool containsNestedClass() const {
+    return (Options & ClassOptions::ContainsNestedClass) != ClassOptions::None;
+  }
+
   bool isScoped() const {
     return (Options & ClassOptions::Scoped) != ClassOptions::None;
   }
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 221f1f796980f343eb92ce666b54c679f8b6896c..dbb6be04544be7a1ab204f52f55dcd484536e7c2 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -360,6 +360,10 @@ public:
   /// Dump Error as warning message to stderr.
   static void dumpWarning(Error Warning);
 
+  Triple::ArchType getArch() const {
+    return getDWARFObj().getFile()->getArch();
+  }
+
 private:
   /// Return the compile unit which contains instruction with provided
   /// address.
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
index ff1c7fb383898731de832f9e9bf65478ab3413d0..7dc07d774aba9cf08eb829af248e608b575e4690 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
@@ -13,6 +13,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
 #include "llvm/DebugInfo/DWARF/DWARFExpression.h"
 #include "llvm/Support/Error.h"
@@ -59,9 +60,11 @@ public:
   unsigned size() const { return (unsigned)Instructions.size(); }
   bool empty() const { return Instructions.empty(); }
 
-  CFIProgram(uint64_t CodeAlignmentFactor, int64_t DataAlignmentFactor)
+  CFIProgram(uint64_t CodeAlignmentFactor, int64_t DataAlignmentFactor,
+             Triple::ArchType Arch)
       : CodeAlignmentFactor(CodeAlignmentFactor),
-        DataAlignmentFactor(DataAlignmentFactor) {}
+        DataAlignmentFactor(DataAlignmentFactor),
+        Arch(Arch) {}
 
   /// Parse and store a sequence of CFI instructions from Data,
   /// starting at *Offset and ending at EndOffset. *Offset is updated
@@ -76,6 +79,7 @@ private:
   std::vector<Instruction> Instructions;
   const uint64_t CodeAlignmentFactor;
   const int64_t DataAlignmentFactor;
+  Triple::ArchType Arch;
 
   /// Convenience method to add a new instruction with the given opcode.
   void addInstruction(uint8_t Opcode) {
@@ -130,8 +134,9 @@ public:
   enum FrameKind { FK_CIE, FK_FDE };
 
   FrameEntry(FrameKind K, uint64_t Offset, uint64_t Length, uint64_t CodeAlign,
-             int64_t DataAlign)
-      : Kind(K), Offset(Offset), Length(Length), CFIs(CodeAlign, DataAlign) {}
+             int64_t DataAlign, Triple::ArchType Arch)
+      : Kind(K), Offset(Offset), Length(Length),
+        CFIs(CodeAlign, DataAlign, Arch) {}
 
   virtual ~FrameEntry() {}
 
@@ -168,9 +173,9 @@ public:
       int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister,
       SmallString<8> AugmentationData, uint32_t FDEPointerEncoding,
       uint32_t LSDAPointerEncoding, Optional<uint64_t> Personality,
-      Optional<uint32_t> PersonalityEnc)
+      Optional<uint32_t> PersonalityEnc, Triple::ArchType Arch)
       : FrameEntry(FK_CIE, Offset, Length, CodeAlignmentFactor,
-                   DataAlignmentFactor),
+                   DataAlignmentFactor, Arch),
         Version(Version), Augmentation(std::move(Augmentation)),
         AddressSize(AddressSize), SegmentDescriptorSize(SegmentDescriptorSize),
         CodeAlignmentFactor(CodeAlignmentFactor),
@@ -224,10 +229,11 @@ public:
   // is obtained lazily once it's actually required.
   FDE(uint64_t Offset, uint64_t Length, int64_t LinkedCIEOffset,
       uint64_t InitialLocation, uint64_t AddressRange, CIE *Cie,
-      Optional<uint64_t> LSDAAddress)
+      Optional<uint64_t> LSDAAddress, Triple::ArchType Arch)
       : FrameEntry(FK_FDE, Offset, Length,
                    Cie ? Cie->getCodeAlignmentFactor() : 0,
-                   Cie ? Cie->getDataAlignmentFactor() : 0),
+                   Cie ? Cie->getDataAlignmentFactor() : 0,
+                   Arch),
         LinkedCIEOffset(LinkedCIEOffset), InitialLocation(InitialLocation),
         AddressRange(AddressRange), LinkedCIE(Cie), LSDAAddress(LSDAAddress) {}
 
@@ -256,6 +262,7 @@ private:
 
 /// A parsed .debug_frame or .eh_frame section
 class DWARFDebugFrame {
+  const Triple::ArchType Arch;
   // True if this is parsing an eh_frame section.
   const bool IsEH;
   // Not zero for sane pointer values coming out of eh_frame
@@ -272,7 +279,8 @@ public:
   // it is a .debug_frame section. EHFrameAddress should be different
   // than zero for correct parsing of .eh_frame addresses when they
   // use a PC-relative encoding.
-  DWARFDebugFrame(bool IsEH = false, uint64_t EHFrameAddress = 0);
+  DWARFDebugFrame(Triple::ArchType Arch,
+                  bool IsEH = false, uint64_t EHFrameAddress = 0);
   ~DWARFDebugFrame();
 
   /// Dump the section data into the given stream.
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h b/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
index cae4804e61d3b3c8dbd16a374ae29d7bc7c7e514..9e1656eb1615c4336671c5dde0512aa409a0b4cf 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
@@ -13,6 +13,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFObject.h"
 #include <cstdint>
 #include <vector>
 
@@ -67,7 +68,8 @@ private:
   bool GnuStyle;
 
 public:
-  DWARFDebugPubTable(StringRef Data, bool LittleEndian, bool GnuStyle);
+  DWARFDebugPubTable(const DWARFObject &Obj, const DWARFSection &Sec,
+                     bool LittleEndian, bool GnuStyle);
 
   void dump(raw_ostream &OS) const;
 
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h
index baa47c2bfa580494c7bb1996bd1eca24dabbd662..56d46cd739a27108b5bc9419d1fb850694f6e651 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -180,6 +180,7 @@ public:
   /// \returns a valid DWARFDie instance if the attribute exists, or an invalid
   /// DWARFDie object if it doesn't.
   DWARFDie getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const;
+  DWARFDie getAttributeValueAsReferencedDie(const DWARFFormValue &V) const;
 
   /// Extract the range base attribute from this DIE as absolute section offset.
   ///
diff --git a/include/llvm/DebugInfo/DWARF/DWARFObject.h b/include/llvm/DebugInfo/DWARF/DWARFObject.h
index 5a808b0ec6a97ba13486475b9b8ba602e5a4dffa..d611b5d075c8ea13ab3194ae073b0ed7c63e4857 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFObject.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFObject.h
@@ -49,10 +49,10 @@ public:
   virtual const DWARFSection &getRangeSection() const { return Dummy; }
   virtual const DWARFSection &getRnglistsSection() const { return Dummy; }
   virtual StringRef getMacinfoSection() const { return ""; }
-  virtual StringRef getPubNamesSection() const { return ""; }
-  virtual StringRef getPubTypesSection() const { return ""; }
-  virtual StringRef getGnuPubNamesSection() const { return ""; }
-  virtual StringRef getGnuPubTypesSection() const { return ""; }
+  virtual const DWARFSection &getPubNamesSection() const { return Dummy; }
+  virtual const DWARFSection &getPubTypesSection() const { return Dummy; }
+  virtual const DWARFSection &getGnuPubNamesSection() const { return Dummy; }
+  virtual const DWARFSection &getGnuPubTypesSection() const { return Dummy; }
   virtual const DWARFSection &getStringOffsetSection() const { return Dummy; }
   virtual void
   forEachInfoDWOSections(function_ref<void(const DWARFSection &)> F) const {}
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index 458278e4282fd777386ae0bc847d9976dcd51579..9909def1072355eb0959ba5e802f006c12b967d2 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -72,7 +72,8 @@ public:
   /// Parse a unit header from \p debug_info starting at \p offset_ptr.
   bool extract(DWARFContext &Context, const DWARFDataExtractor &debug_info,
                uint32_t *offset_ptr, DWARFSectionKind Kind = DW_SECT_INFO,
-               const DWARFUnitIndex *Index = nullptr);
+               const DWARFUnitIndex *Index = nullptr,
+               const DWARFUnitIndex::Entry *Entry = nullptr);
   uint32_t getOffset() const { return Offset; }
   const dwarf::FormParams &getFormParams() const { return FormParams; }
   uint16_t getVersion() const { return FormParams.Version; }
@@ -108,7 +109,8 @@ const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context,
 /// .debug_info and .debug_types, or from .debug_info.dwo and .debug_types.dwo.
 class DWARFUnitVector final : public SmallVector<std::unique_ptr<DWARFUnit>, 1> {
   std::function<std::unique_ptr<DWARFUnit>(uint32_t, DWARFSectionKind,
-                                           const DWARFSection *)>
+                                           const DWARFSection *,
+                                           const DWARFUnitIndex::Entry *)>
       Parser;
   int NumInfoUnits = -1;
 
diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h
index 7b5a85295963b78760b81d3dd8bd36d51da647fa..997f13f5f30ece7d53428948adff39c69460020e 100644
--- a/include/llvm/DebugInfo/PDB/GenericError.h
+++ b/include/llvm/DebugInfo/PDB/GenericError.h
@@ -21,6 +21,7 @@ enum class pdb_error_code {
   dia_sdk_not_present,
   dia_failed_loading,
   signature_out_of_date,
+  external_cmdline_ref,
   unspecified,
 };
 } // namespace pdb
diff --git a/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h b/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
index ce4d07917755c1d7eac87ad7c3e4dfdb0a23a7c7..ac7f741afefa7a5162bc067fc59c597592808850 100644
--- a/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
@@ -51,6 +51,7 @@ public:
   void setObjFileName(StringRef Name);
   void setFirstSectionContrib(const SectionContrib &SC);
   void addSymbol(codeview::CVSymbol Symbol);
+  void addSymbolsInBulk(ArrayRef<uint8_t> BulkSymbols);
 
   void
   addDebugSubsection(std::shared_ptr<codeview::DebugSubsection> Subsection);
@@ -91,7 +92,7 @@ private:
   std::string ModuleName;
   std::string ObjFileName;
   std::vector<std::string> SourceFiles;
-  std::vector<codeview::CVSymbol> Symbols;
+  std::vector<ArrayRef<uint8_t>> Symbols;
 
   std::vector<std::unique_ptr<codeview::DebugSubsectionRecordBuilder>>
       C13Builders;
diff --git a/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
index 1a4f89d607dffd9d55a66e1654a7e2f9eb17228d..4c39ca762b5b74f874068aa30457c39ec447f746 100644
--- a/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
@@ -61,7 +61,6 @@ public:
   void addGlobalSymbol(const codeview::ProcRefSym &Sym);
   void addGlobalSymbol(const codeview::DataSym &Sym);
   void addGlobalSymbol(const codeview::ConstantSym &Sym);
-  void addGlobalSymbol(const codeview::UDTSym &Sym);
   void addGlobalSymbol(const codeview::CVSymbol &Sym);
 
 private:
diff --git a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
index 19b0ebdfac841b35fdcaba9db94ed57226d2a7fd..8d590df288f378d5010210beb2ccbd9820f0b39c 100644
--- a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
@@ -44,6 +44,8 @@ public:
   symbols(bool *HadError) const;
 
   const codeview::CVSymbolArray &getSymbolArray() const { return SymbolArray; }
+  const codeview::CVSymbolArray
+  getSymbolArrayForScope(uint32_t ScopeBegin) const;
 
   BinarySubstreamRef getSymbolsSubstream() const;
   BinarySubstreamRef getC11LinesSubstream() const;
diff --git a/include/llvm/Demangle/Demangle.h b/include/llvm/Demangle/Demangle.h
index b9b4d158e2ae7a39cc6b0e93ed9cca9088b4bd92..4c9dc9569e18835621ff6f924867bc734138dd12 100644
--- a/include/llvm/Demangle/Demangle.h
+++ b/include/llvm/Demangle/Demangle.h
@@ -31,11 +31,6 @@ enum : int {
 char *itaniumDemangle(const char *mangled_name, char *buf, size_t *n,
                       int *status);
 
-/// Calls the callback \c Callback with \c Ctx as an argument whenever a type is
-/// encountered. Returns true if \c MangledName couldn't be parsed.
-bool itaniumFindTypesInMangledName(const char *MangledName, void *Ctx,
-                                   void (*Callback)(void *, const char *));
-
 
 enum MSDemangleFlags { MSDF_None = 0, MSDF_DumpBackrefs = 1 << 0 };
 char *microsoftDemangle(const char *mangled_name, char *buf, size_t *n,
diff --git a/include/llvm/Demangle/ItaniumDemangle.h b/include/llvm/Demangle/ItaniumDemangle.h
index c5619a15bbee9370355c1c9a48748d0ba72b28a3..0b9187f30a5a627d3c628fa59984d34820e25e1f 100644
--- a/include/llvm/Demangle/ItaniumDemangle.h
+++ b/include/llvm/Demangle/ItaniumDemangle.h
@@ -2157,9 +2157,6 @@ template <typename Derived, typename Alloc> struct AbstractManglingParser {
   // conversion operator's type, and are resolved in the enclosing <encoding>.
   PODSmallVector<ForwardTemplateReference *, 4> ForwardTemplateRefs;
 
-  void (*TypeCallback)(void *, const char *) = nullptr;
-  void *TypeCallbackContext = nullptr;
-
   bool TryToParseTemplateArgs = true;
   bool PermitForwardTemplateReferences = false;
   bool ParsingLambdaParams = false;
@@ -3453,9 +3450,6 @@ template <typename Derived, typename Alloc>
 Node *AbstractManglingParser<Derived, Alloc>::parseType() {
   Node *Result = nullptr;
 
-  if (TypeCallback != nullptr)
-    TypeCallback(TypeCallbackContext, First);
-
   switch (look()) {
   //             ::= <qualified-type>
   case 'r':
diff --git a/include/llvm/Demangle/MicrosoftDemangleNodes.h b/include/llvm/Demangle/MicrosoftDemangleNodes.h
index 1d0b66a7bf41c86b6a69fda67e0e9ce5eb574b4b..1eca6762475a0f502fcee27f39ffadd16648408a 100644
--- a/include/llvm/Demangle/MicrosoftDemangleNodes.h
+++ b/include/llvm/Demangle/MicrosoftDemangleNodes.h
@@ -235,6 +235,8 @@ struct Node {
 
   virtual void output(OutputStream &OS, OutputFlags Flags) const = 0;
 
+  std::string toString() const;
+
 private:
   NodeKind Kind;
 };
diff --git a/include/llvm/Demangle/Utility.h b/include/llvm/Demangle/Utility.h
index da4a0c050bc5c5b54f3d8052a806c6b8807552fb..1d1601c81635ab81969182fabab34644f0e8ae8a 100644
--- a/include/llvm/Demangle/Utility.h
+++ b/include/llvm/Demangle/Utility.h
@@ -175,13 +175,13 @@ inline bool initializeOutputStream(char *Buf, size_t *N, OutputStream &S,
   if (Buf == nullptr) {
     Buf = static_cast<char *>(std::malloc(InitSize));
     if (Buf == nullptr)
-      return true;
+      return false;
     BufferSize = InitSize;
   } else
     BufferSize = *N;
 
   S.reset(Buf, BufferSize);
-  return false;
+  return true;
 }
 
 #endif
diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h
index 589ca612f0466198400112b9d7f8f52f16af5001..1b08379b8c3b119a3aadc6ecc3bf7d70f709c220 100644
--- a/include/llvm/ExecutionEngine/JITEventListener.h
+++ b/include/llvm/ExecutionEngine/JITEventListener.h
@@ -42,23 +42,26 @@ class ObjectFile;
 /// The default implementation of each method does nothing.
 class JITEventListener {
 public:
+  using ObjectKey = uint64_t;
+
   JITEventListener() = default;
   virtual ~JITEventListener() = default;
 
-  /// NotifyObjectEmitted - Called after an object has been successfully
-  /// emitted to memory.  NotifyFunctionEmitted will not be called for
+  /// notifyObjectLoaded - Called after an object has had its sections allocated
+  /// and addresses assigned to all symbols. Note: Section memory will not have
+  /// been relocated yet. notifyFunctionLoaded will not be called for
   /// individual functions in the object.
   ///
   /// ELF-specific information
   /// The ObjectImage contains the generated object image
   /// with section headers updated to reflect the address at which sections
   /// were loaded and with relocations performed in-place on debug sections.
-  virtual void NotifyObjectEmitted(const object::ObjectFile &Obj,
-                                   const RuntimeDyld::LoadedObjectInfo &L) {}
+  virtual void notifyObjectLoaded(ObjectKey K, const object::ObjectFile &Obj,
+                                  const RuntimeDyld::LoadedObjectInfo &L) {}
 
-  /// NotifyFreeingObject - Called just before the memory associated with
+  /// notifyFreeingObject - Called just before the memory associated with
   /// a previously emitted object is released.
-  virtual void NotifyFreeingObject(const object::ObjectFile &Obj) {}
+  virtual void notifyFreeingObject(ObjectKey K) {}
 
   // Get a pointe to the GDB debugger registration listener.
   static JITEventListener *createGDBRegistrationListener();
diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index 5aaaaf3c396bdc812ba2040fb5bde7d01a57ccdb..9fc4614af0105bff3b45d1e9eb9f812d629cbcd7 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -230,29 +230,33 @@ public:
 
   /// Add an argument attribute. Returns a new set because attribute sets are
   /// immutable.
-  AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const;
+  LLVM_NODISCARD AttributeSet addAttribute(LLVMContext &C,
+                                           Attribute::AttrKind Kind) const;
 
   /// Add a target-dependent attribute. Returns a new set because attribute sets
   /// are immutable.
-  AttributeSet addAttribute(LLVMContext &C, StringRef Kind,
-                            StringRef Value = StringRef()) const;
+  LLVM_NODISCARD AttributeSet addAttribute(LLVMContext &C, StringRef Kind,
+                                           StringRef Value = StringRef()) const;
 
   /// Add attributes to the attribute set. Returns a new set because attribute
   /// sets are immutable.
-  AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const;
+  LLVM_NODISCARD AttributeSet addAttributes(LLVMContext &C,
+                                            AttributeSet AS) const;
 
   /// Remove the specified attribute from this set. Returns a new set because
   /// attribute sets are immutable.
-  AttributeSet removeAttribute(LLVMContext &C, Attribute::AttrKind Kind) const;
+  LLVM_NODISCARD AttributeSet removeAttribute(LLVMContext &C,
+                                              Attribute::AttrKind Kind) const;
 
   /// Remove the specified attribute from this set. Returns a new set because
   /// attribute sets are immutable.
-  AttributeSet removeAttribute(LLVMContext &C, StringRef Kind) const;
+  LLVM_NODISCARD AttributeSet removeAttribute(LLVMContext &C,
+                                              StringRef Kind) const;
 
   /// Remove the specified attributes from this set. Returns a new set because
   /// attribute sets are immutable.
-  AttributeSet removeAttributes(LLVMContext &C,
-                                const AttrBuilder &AttrsToRemove) const;
+  LLVM_NODISCARD AttributeSet
+  removeAttributes(LLVMContext &C, const AttrBuilder &AttrsToRemove) const;
 
   /// Return the number of attributes in this set.
   unsigned getNumAttributes() const;
@@ -375,133 +379,140 @@ public:
 
   /// Add an attribute to the attribute set at the given index.
   /// Returns a new list because attribute lists are immutable.
-  AttributeList addAttribute(LLVMContext &C, unsigned Index,
-                             Attribute::AttrKind Kind) const;
+  LLVM_NODISCARD AttributeList addAttribute(LLVMContext &C, unsigned Index,
+                                            Attribute::AttrKind Kind) const;
 
   /// Add an attribute to the attribute set at the given index.
   /// Returns a new list because attribute lists are immutable.
-  AttributeList addAttribute(LLVMContext &C, unsigned Index, StringRef Kind,
-                             StringRef Value = StringRef()) const;
+  LLVM_NODISCARD AttributeList
+  addAttribute(LLVMContext &C, unsigned Index, StringRef Kind,
+               StringRef Value = StringRef()) const;
 
   /// Add an attribute to the attribute set at the given index.
   /// Returns a new list because attribute lists are immutable.
-  AttributeList addAttribute(LLVMContext &C, unsigned Index, Attribute A) const;
+  LLVM_NODISCARD AttributeList addAttribute(LLVMContext &C, unsigned Index,
+                                            Attribute A) const;
 
   /// Add attributes to the attribute set at the given index.
   /// Returns a new list because attribute lists are immutable.
-  AttributeList addAttributes(LLVMContext &C, unsigned Index,
-                              const AttrBuilder &B) const;
+  LLVM_NODISCARD AttributeList addAttributes(LLVMContext &C, unsigned Index,
+                                             const AttrBuilder &B) const;
 
   /// Add an argument attribute to the list. Returns a new list because
   /// attribute lists are immutable.
-  AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo,
-                                  Attribute::AttrKind Kind) const {
+  LLVM_NODISCARD AttributeList addParamAttribute(
+      LLVMContext &C, unsigned ArgNo, Attribute::AttrKind Kind) const {
     return addAttribute(C, ArgNo + FirstArgIndex, Kind);
   }
 
   /// Add an argument attribute to the list. Returns a new list because
   /// attribute lists are immutable.
-  AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo,
-                                  StringRef Kind,
-                                  StringRef Value = StringRef()) const {
+  LLVM_NODISCARD AttributeList
+  addParamAttribute(LLVMContext &C, unsigned ArgNo, StringRef Kind,
+                    StringRef Value = StringRef()) const {
     return addAttribute(C, ArgNo + FirstArgIndex, Kind, Value);
   }
 
   /// Add an attribute to the attribute list at the given arg indices. Returns a
   /// new list because attribute lists are immutable.
-  AttributeList addParamAttribute(LLVMContext &C, ArrayRef<unsigned> ArgNos,
-                                  Attribute A) const;
+  LLVM_NODISCARD AttributeList addParamAttribute(LLVMContext &C,
+                                                 ArrayRef<unsigned> ArgNos,
+                                                 Attribute A) const;
 
   /// Add an argument attribute to the list. Returns a new list because
   /// attribute lists are immutable.
-  AttributeList addParamAttributes(LLVMContext &C, unsigned ArgNo,
-                                   const AttrBuilder &B) const {
+  LLVM_NODISCARD AttributeList addParamAttributes(LLVMContext &C,
+                                                  unsigned ArgNo,
+                                                  const AttrBuilder &B) const {
     return addAttributes(C, ArgNo + FirstArgIndex, B);
   }
 
   /// Remove the specified attribute at the specified index from this
   /// attribute list. Returns a new list because attribute lists are immutable.
-  AttributeList removeAttribute(LLVMContext &C, unsigned Index,
-                                Attribute::AttrKind Kind) const;
+  LLVM_NODISCARD AttributeList removeAttribute(LLVMContext &C, unsigned Index,
+                                               Attribute::AttrKind Kind) const;
 
   /// Remove the specified attribute at the specified index from this
   /// attribute list. Returns a new list because attribute lists are immutable.
-  AttributeList removeAttribute(LLVMContext &C, unsigned Index,
-                                StringRef Kind) const;
+  LLVM_NODISCARD AttributeList removeAttribute(LLVMContext &C, unsigned Index,
+                                               StringRef Kind) const;
 
   /// Remove the specified attributes at the specified index from this
   /// attribute list. Returns a new list because attribute lists are immutable.
-  AttributeList removeAttributes(LLVMContext &C, unsigned Index,
-                                 const AttrBuilder &AttrsToRemove) const;
+  LLVM_NODISCARD AttributeList removeAttributes(
+      LLVMContext &C, unsigned Index, const AttrBuilder &AttrsToRemove) const;
 
   /// Remove all attributes at the specified index from this
   /// attribute list. Returns a new list because attribute lists are immutable.
-  AttributeList removeAttributes(LLVMContext &C, unsigned Index) const;
+  LLVM_NODISCARD AttributeList removeAttributes(LLVMContext &C,
+                                                unsigned Index) const;
 
   /// Remove the specified attribute at the specified arg index from this
   /// attribute list. Returns a new list because attribute lists are immutable.
-  AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo,
-                                     Attribute::AttrKind Kind) const {
+  LLVM_NODISCARD AttributeList removeParamAttribute(
+      LLVMContext &C, unsigned ArgNo, Attribute::AttrKind Kind) const {
     return removeAttribute(C, ArgNo + FirstArgIndex, Kind);
   }
 
   /// Remove the specified attribute at the specified arg index from this
   /// attribute list. Returns a new list because attribute lists are immutable.
-  AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo,
-                                     StringRef Kind) const {
+  LLVM_NODISCARD AttributeList removeParamAttribute(LLVMContext &C,
+                                                    unsigned ArgNo,
+                                                    StringRef Kind) const {
     return removeAttribute(C, ArgNo + FirstArgIndex, Kind);
   }
 
   /// Remove the specified attribute at the specified arg index from this
   /// attribute list. Returns a new list because attribute lists are immutable.
-  AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo,
-                                      const AttrBuilder &AttrsToRemove) const {
+  LLVM_NODISCARD AttributeList removeParamAttributes(
+      LLVMContext &C, unsigned ArgNo, const AttrBuilder &AttrsToRemove) const {
     return removeAttributes(C, ArgNo + FirstArgIndex, AttrsToRemove);
   }
 
   /// Remove all attributes at the specified arg index from this
   /// attribute list. Returns a new list because attribute lists are immutable.
-  AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo) const {
+  LLVM_NODISCARD AttributeList removeParamAttributes(LLVMContext &C,
+                                                     unsigned ArgNo) const {
     return removeAttributes(C, ArgNo + FirstArgIndex);
   }
 
   /// \brief Add the dereferenceable attribute to the attribute set at the given
   /// index. Returns a new list because attribute lists are immutable.
-  AttributeList addDereferenceableAttr(LLVMContext &C, unsigned Index,
-                                       uint64_t Bytes) const;
+  LLVM_NODISCARD AttributeList addDereferenceableAttr(LLVMContext &C,
+                                                      unsigned Index,
+                                                      uint64_t Bytes) const;
 
   /// \brief Add the dereferenceable attribute to the attribute set at the given
   /// arg index. Returns a new list because attribute lists are immutable.
-  AttributeList addDereferenceableParamAttr(LLVMContext &C, unsigned ArgNo,
-                                            uint64_t Bytes) const {
+  LLVM_NODISCARD AttributeList addDereferenceableParamAttr(
+      LLVMContext &C, unsigned ArgNo, uint64_t Bytes) const {
     return addDereferenceableAttr(C, ArgNo + FirstArgIndex, Bytes);
   }
 
   /// Add the dereferenceable_or_null attribute to the attribute set at
   /// the given index. Returns a new list because attribute lists are immutable.
-  AttributeList addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index,
-                                             uint64_t Bytes) const;
+  LLVM_NODISCARD AttributeList addDereferenceableOrNullAttr(
+      LLVMContext &C, unsigned Index, uint64_t Bytes) const;
 
   /// Add the dereferenceable_or_null attribute to the attribute set at
   /// the given arg index. Returns a new list because attribute lists are
   /// immutable.
-  AttributeList addDereferenceableOrNullParamAttr(LLVMContext &C,
-                                                  unsigned ArgNo,
-                                                  uint64_t Bytes) const {
+  LLVM_NODISCARD AttributeList addDereferenceableOrNullParamAttr(
+      LLVMContext &C, unsigned ArgNo, uint64_t Bytes) const {
     return addDereferenceableOrNullAttr(C, ArgNo + FirstArgIndex, Bytes);
   }
 
   /// Add the allocsize attribute to the attribute set at the given index.
   /// Returns a new list because attribute lists are immutable.
-  AttributeList addAllocSizeAttr(LLVMContext &C, unsigned Index,
-                                 unsigned ElemSizeArg,
-                                 const Optional<unsigned> &NumElemsArg);
+  LLVM_NODISCARD AttributeList
+  addAllocSizeAttr(LLVMContext &C, unsigned Index, unsigned ElemSizeArg,
+                   const Optional<unsigned> &NumElemsArg);
 
   /// Add the allocsize attribute to the attribute set at the given arg index.
   /// Returns a new list because attribute lists are immutable.
-  AttributeList addAllocSizeParamAttr(LLVMContext &C, unsigned ArgNo,
-                                      unsigned ElemSizeArg,
-                                      const Optional<unsigned> &NumElemsArg) {
+  LLVM_NODISCARD AttributeList
+  addAllocSizeParamAttr(LLVMContext &C, unsigned ArgNo, unsigned ElemSizeArg,
+                        const Optional<unsigned> &NumElemsArg) {
     return addAllocSizeAttr(C, ArgNo + FirstArgIndex, ElemSizeArg, NumElemsArg);
   }
 
diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index 7244bba1ca59e36f29b03645a3fc05a9ff2bce2c..99eac33f742ec2a7c8d0c5a7d87bba65bbd09098 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h
@@ -237,6 +237,12 @@ public:
                  static_cast<const BasicBlock *>(this)->getUniquePredecessor());
   }
 
+  /// Return true if this block has exactly N predecessors.
+  bool hasNPredecessors(unsigned N) const;
+
+  /// Return true if this block has N predecessors or more.
+  bool hasNPredecessorsOrMore(unsigned N) const;
+
   /// Return the successor of this block if it has a single successor.
   /// Otherwise return a null pointer.
   ///
diff --git a/include/llvm/IR/CFG.h b/include/llvm/IR/CFG.h
index 4140c8a212e7eb1f300c1fc451e0eab1fd9f35ce..8385c4647e12c2bd65b40af07446d624c470ff5e 100644
--- a/include/llvm/IR/CFG.h
+++ b/include/llvm/IR/CFG.h
@@ -117,6 +117,8 @@ inline const_pred_iterator pred_end(const BasicBlock *BB) {
 inline bool pred_empty(const BasicBlock *BB) {
   return pred_begin(BB) == pred_end(BB);
 }
+/// Get the number of predecessors of \p BB. This is a linear time operation.
+/// Use \ref BasicBlock::hasNPredecessors() or hasNPredecessorsOrMore if able.
 inline unsigned pred_size(const BasicBlock *BB) {
   return std::distance(pred_begin(BB), pred_end(BB));
 }
diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h
index 2162ccb982b007c94362dbfc969ae28a3dce20f7..a3e78049f4bef2c4971dc72d58ba9f36a6a874ec 100644
--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h
@@ -656,10 +656,7 @@ public:
 
 private:
   IterTy getCallee() const {
-    if (isCall()) // Skip Callee
-      return cast<CallInst>(getInstruction())->op_end() - 1;
-    else // Skip BB, BB, Callee
-      return cast<InvokeInst>(getInstruction())->op_end() - 3;
+    return cast<CallBase>(getInstruction())->op_end() - 1;
   }
 };
 
diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h
index 5fdf0ea00f008d038698a883dbdf52c5692c69b2..98437f8eff1fa435109857fa5853942b3fe72135 100644
--- a/include/llvm/IR/Constant.h
+++ b/include/llvm/IR/Constant.h
@@ -114,7 +114,8 @@ public:
 
   /// For aggregates (struct/array/vector) return the constant that corresponds
   /// to the specified element if possible, or null if not. This can return null
-  /// if the element index is a ConstantExpr, or if 'this' is a constant expr.
+  /// if the element index is a ConstantExpr, if 'this' is a constant expr or
+  /// if the constant does not fit into an uint64_t.
   Constant *getAggregateElement(unsigned Elt) const;
   Constant *getAggregateElement(Constant *Elt) const;
 
diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h
index f9d5ebc560c70ac5f1f58e04a398b2dafaff8244..afc93cd61d474da770cb5948179bf45a6944f114 100644
--- a/include/llvm/IR/Constants.h
+++ b/include/llvm/IR/Constants.h
@@ -290,7 +290,11 @@ public:
 
   static Constant *get(Type* Ty, StringRef Str);
   static ConstantFP *get(LLVMContext &Context, const APFloat &V);
-  static Constant *getNaN(Type *Ty, bool Negative = false, unsigned type = 0);
+  static Constant *getNaN(Type *Ty, bool Negative = false, uint64_t Payload = 0);
+  static Constant *getQNaN(Type *Ty, bool Negative = false,
+                           APInt *Payload = nullptr);
+  static Constant *getSNaN(Type *Ty, bool Negative = false,
+                           APInt *Payload = nullptr);
   static Constant *getNegativeZero(Type *Ty);
   static Constant *getInfinity(Type *Ty, bool Negative = false);
 
@@ -1114,6 +1118,13 @@ public:
   static Constant *getSelect(Constant *C, Constant *V1, Constant *V2,
                              Type *OnlyIfReducedTy = nullptr);
 
+  /// get - Return a unary operator constant expression,
+  /// folding if possible.
+  ///
+  /// \param OnlyIfReducedTy see \a getWithOperands() docs.
+  static Constant *get(unsigned Opcode, Constant *C1, unsigned Flags = 0, 
+                       Type *OnlyIfReducedTy = nullptr);
+
   /// get - Return a binary or shift operator constant expression,
   /// folding if possible.
   ///
diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index 16b759709b745602ba6653c20e2027fc6b223346..1fe7b36db00b5ef19d37232f20540ec0e8c44a41 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h
@@ -145,7 +145,8 @@ namespace llvm {
                       uint64_t DWOId = 0, bool SplitDebugInlining = true,
                       bool DebugInfoForProfiling = false,
                       DICompileUnit::DebugNameTableKind NameTableKind =
-                          DICompileUnit::DebugNameTableKind::Default);
+                          DICompileUnit::DebugNameTableKind::Default,
+                      bool RangesBaseAddress = false);
 
     /// Create a file descriptor to hold debugging information for a file.
     /// \param Filename  File name.
@@ -652,29 +653,28 @@ namespace llvm {
     /// \param File          File where this variable is defined.
     /// \param LineNo        Line number.
     /// \param Ty            Function type.
-    /// \param isLocalToUnit True if this function is not externally visible.
-    /// \param isDefinition  True if this is a function definition.
     /// \param ScopeLine     Set to the beginning of the scope this starts
     /// \param Flags         e.g. is this function prototyped or not.
     ///                      These flags are used to emit dwarf attributes.
-    /// \param isOptimized   True if optimization is ON.
+    /// \param SPFlags       Additional flags specific to subprograms.
     /// \param TParams       Function template parameters.
     /// \param ThrownTypes   Exception types this function may throw.
-    DISubprogram *createFunction(
-        DIScope *Scope, StringRef Name, StringRef LinkageName, DIFile *File,
-        unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
-        bool isDefinition, unsigned ScopeLine,
-        DINode::DIFlags Flags = DINode::FlagZero, bool isOptimized = false,
-        DITemplateParameterArray TParams = nullptr,
-        DISubprogram *Decl = nullptr, DITypeArray ThrownTypes = nullptr);
+    DISubprogram *
+    createFunction(DIScope *Scope, StringRef Name, StringRef LinkageName,
+                   DIFile *File, unsigned LineNo, DISubroutineType *Ty,
+                   unsigned ScopeLine, DINode::DIFlags Flags = DINode::FlagZero,
+                   DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagZero,
+                   DITemplateParameterArray TParams = nullptr,
+                   DISubprogram *Decl = nullptr,
+                   DITypeArray ThrownTypes = nullptr);
 
     /// Identical to createFunction,
     /// except that the resulting DbgNode is meant to be RAUWed.
     DISubprogram *createTempFunctionFwdDecl(
         DIScope *Scope, StringRef Name, StringRef LinkageName, DIFile *File,
-        unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
-        bool isDefinition, unsigned ScopeLine,
-        DINode::DIFlags Flags = DINode::FlagZero, bool isOptimized = false,
+        unsigned LineNo, DISubroutineType *Ty, unsigned ScopeLine,
+        DINode::DIFlags Flags = DINode::FlagZero,
+        DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagZero,
         DITemplateParameterArray TParams = nullptr,
         DISubprogram *Decl = nullptr, DITypeArray ThrownTypes = nullptr);
 
@@ -686,10 +686,6 @@ namespace llvm {
     /// \param File          File where this variable is defined.
     /// \param LineNo        Line number.
     /// \param Ty            Function type.
-    /// \param isLocalToUnit True if this function is not externally visible..
-    /// \param isDefinition  True if this is a function definition.
-    /// \param Virtuality    Attributes describing virtualness. e.g. pure
-    ///                      virtual function.
     /// \param VTableIndex   Index no of this method in virtual table, or -1u if
     ///                      unrepresentable.
     /// \param ThisAdjustment
@@ -698,17 +694,18 @@ namespace llvm {
     /// \param VTableHolder  Type that holds vtable.
     /// \param Flags         e.g. is this function prototyped or not.
     ///                      This flags are used to emit dwarf attributes.
-    /// \param isOptimized   True if optimization is ON.
+    /// \param SPFlags       Additional flags specific to subprograms.
     /// \param TParams       Function template parameters.
     /// \param ThrownTypes   Exception types this function may throw.
-    DISubprogram *createMethod(
-        DIScope *Scope, StringRef Name, StringRef LinkageName, DIFile *File,
-        unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
-        bool isDefinition, unsigned Virtuality = 0, unsigned VTableIndex = 0,
-        int ThisAdjustment = 0, DIType *VTableHolder = nullptr,
-        DINode::DIFlags Flags = DINode::FlagZero, bool isOptimized = false,
-        DITemplateParameterArray TParams = nullptr,
-        DITypeArray ThrownTypes = nullptr);
+    DISubprogram *
+    createMethod(DIScope *Scope, StringRef Name, StringRef LinkageName,
+                 DIFile *File, unsigned LineNo, DISubroutineType *Ty,
+                 unsigned VTableIndex = 0, int ThisAdjustment = 0,
+                 DIType *VTableHolder = nullptr,
+                 DINode::DIFlags Flags = DINode::FlagZero,
+                 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagZero,
+                 DITemplateParameterArray TParams = nullptr,
+                 DITypeArray ThrownTypes = nullptr);
 
     /// This creates new descriptor for a namespace with the specified
     /// parent scope.
diff --git a/include/llvm/IR/DebugInfoFlags.def b/include/llvm/IR/DebugInfoFlags.def
index a3c3430768f40df9dfbc83f9245ded8550515917..a0a9c6bcacb8639a46fd0867f0bec11272236d07 100644
--- a/include/llvm/IR/DebugInfoFlags.def
+++ b/include/llvm/IR/DebugInfoFlags.def
@@ -11,11 +11,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO: Add other DW-based macros.
+#if !(defined HANDLE_DI_FLAG || defined HANDLE_DISP_FLAG)
+#error "Missing macro definition of HANDLE_DI*"
+#endif
+
 #ifndef HANDLE_DI_FLAG
-#error "Missing macro definition of HANDLE_DI_FLAG"
+#define HANDLE_DI_FLAG(ID, NAME)
 #endif
 
+#ifndef HANDLE_DISP_FLAG
+#define HANDLE_DISP_FLAG(ID, NAME)
+#endif
+
+// General flags kept in DINode.
+
 HANDLE_DI_FLAG(0, Zero) // Use it as zero value.
                         // For example: void foo(DIFlags Flags = FlagZero).
 HANDLE_DI_FLAG(1, Private)
@@ -64,4 +73,25 @@ HANDLE_DI_FLAG((1 << 29), Largest)
 #undef DI_FLAG_LARGEST_NEEDED
 #endif
 
+// Subprogram-specific flags kept in DISubprogram.
+
+// Use this as a zero/initialization value.
+// For example: void foo(DISPFlags Flags = SPFlagZero).
+HANDLE_DISP_FLAG(0, Zero)
+// Virtuality is a two-bit enum field in the LSB of the word.
+// Values should match DW_VIRTUALITY_*.
+HANDLE_DISP_FLAG(1u, Virtual)
+HANDLE_DISP_FLAG(2u, PureVirtual)
+HANDLE_DISP_FLAG((1u << 2), LocalToUnit)
+HANDLE_DISP_FLAG((1u << 3), Definition)
+HANDLE_DISP_FLAG((1u << 4), Optimized)
+
+#ifdef DISP_FLAG_LARGEST_NEEDED
+// Intended to be used with ADT/BitmaskEnum.h.
+// NOTE: Always must be equal to largest flag, check this when adding new flags.
+HANDLE_DISP_FLAG((1 << 4), Largest)
+#undef DISP_FLAG_LARGEST_NEEDED
+#endif
+
 #undef HANDLE_DI_FLAG
+#undef HANDLE_DISP_FLAG
diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h
index 2a9181e17c6bad57f2ec09bf64a17dc7a55e29dd..0830b5ccbeca6327f41da041d717c0ffcda9c53f 100644
--- a/include/llvm/IR/DebugInfoMetadata.h
+++ b/include/llvm/IR/DebugInfoMetadata.h
@@ -1192,18 +1192,19 @@ private:
   bool SplitDebugInlining;
   bool DebugInfoForProfiling;
   unsigned NameTableKind;
+  bool RangesBaseAddress;
 
   DICompileUnit(LLVMContext &C, StorageType Storage, unsigned SourceLanguage,
                 bool IsOptimized, unsigned RuntimeVersion,
                 unsigned EmissionKind, uint64_t DWOId, bool SplitDebugInlining,
                 bool DebugInfoForProfiling, unsigned NameTableKind,
-                ArrayRef<Metadata *> Ops)
+                bool RangesBaseAddress, ArrayRef<Metadata *> Ops)
       : DIScope(C, DICompileUnitKind, Storage, dwarf::DW_TAG_compile_unit, Ops),
         SourceLanguage(SourceLanguage), IsOptimized(IsOptimized),
         RuntimeVersion(RuntimeVersion), EmissionKind(EmissionKind),
         DWOId(DWOId), SplitDebugInlining(SplitDebugInlining),
         DebugInfoForProfiling(DebugInfoForProfiling),
-        NameTableKind(NameTableKind) {
+        NameTableKind(NameTableKind), RangesBaseAddress(RangesBaseAddress) {
     assert(Storage != Uniqued);
   }
   ~DICompileUnit() = default;
@@ -1217,15 +1218,16 @@ private:
           DIGlobalVariableExpressionArray GlobalVariables,
           DIImportedEntityArray ImportedEntities, DIMacroNodeArray Macros,
           uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling,
-          unsigned NameTableKind, StorageType Storage,
+          unsigned NameTableKind, bool RangesBaseAddress, StorageType Storage,
           bool ShouldCreate = true) {
-    return getImpl(
-        Context, SourceLanguage, File, getCanonicalMDString(Context, Producer),
-        IsOptimized, getCanonicalMDString(Context, Flags), RuntimeVersion,
-        getCanonicalMDString(Context, SplitDebugFilename), EmissionKind,
-        EnumTypes.get(), RetainedTypes.get(), GlobalVariables.get(),
-        ImportedEntities.get(), Macros.get(), DWOId, SplitDebugInlining,
-        DebugInfoForProfiling, NameTableKind, Storage, ShouldCreate);
+    return getImpl(Context, SourceLanguage, File,
+                   getCanonicalMDString(Context, Producer), IsOptimized,
+                   getCanonicalMDString(Context, Flags), RuntimeVersion,
+                   getCanonicalMDString(Context, SplitDebugFilename),
+                   EmissionKind, EnumTypes.get(), RetainedTypes.get(),
+                   GlobalVariables.get(), ImportedEntities.get(), Macros.get(),
+                   DWOId, SplitDebugInlining, DebugInfoForProfiling,
+                   NameTableKind, RangesBaseAddress, Storage, ShouldCreate);
   }
   static DICompileUnit *
   getImpl(LLVMContext &Context, unsigned SourceLanguage, Metadata *File,
@@ -1235,16 +1237,16 @@ private:
           Metadata *GlobalVariables, Metadata *ImportedEntities,
           Metadata *Macros, uint64_t DWOId, bool SplitDebugInlining,
           bool DebugInfoForProfiling, unsigned NameTableKind,
-          StorageType Storage, bool ShouldCreate = true);
+          bool RangesBaseAddress, StorageType Storage, bool ShouldCreate = true);
 
   TempDICompileUnit cloneImpl() const {
-    return getTemporary(getContext(), getSourceLanguage(), getFile(),
-                        getProducer(), isOptimized(), getFlags(),
-                        getRuntimeVersion(), getSplitDebugFilename(),
-                        getEmissionKind(), getEnumTypes(), getRetainedTypes(),
-                        getGlobalVariables(), getImportedEntities(),
-                        getMacros(), DWOId, getSplitDebugInlining(),
-                        getDebugInfoForProfiling(), getNameTableKind());
+    return getTemporary(
+        getContext(), getSourceLanguage(), getFile(), getProducer(),
+        isOptimized(), getFlags(), getRuntimeVersion(), getSplitDebugFilename(),
+        getEmissionKind(), getEnumTypes(), getRetainedTypes(),
+        getGlobalVariables(), getImportedEntities(), getMacros(), DWOId,
+        getSplitDebugInlining(), getDebugInfoForProfiling(), getNameTableKind(),
+        getRangesBaseAddress());
   }
 
 public:
@@ -1260,11 +1262,11 @@ public:
        DIGlobalVariableExpressionArray GlobalVariables,
        DIImportedEntityArray ImportedEntities, DIMacroNodeArray Macros,
        uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling,
-       DebugNameTableKind NameTableKind),
+       DebugNameTableKind NameTableKind, bool RangesBaseAddress),
       (SourceLanguage, File, Producer, IsOptimized, Flags, RuntimeVersion,
        SplitDebugFilename, EmissionKind, EnumTypes, RetainedTypes,
        GlobalVariables, ImportedEntities, Macros, DWOId, SplitDebugInlining,
-       DebugInfoForProfiling, (unsigned)NameTableKind))
+       DebugInfoForProfiling, (unsigned)NameTableKind, RangesBaseAddress))
   DEFINE_MDNODE_GET_DISTINCT_TEMPORARY(
       DICompileUnit,
       (unsigned SourceLanguage, Metadata *File, MDString *Producer,
@@ -1273,11 +1275,11 @@ public:
        Metadata *RetainedTypes, Metadata *GlobalVariables,
        Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId,
        bool SplitDebugInlining, bool DebugInfoForProfiling,
-       unsigned NameTableKind),
+       unsigned NameTableKind, bool RangesBaseAddress),
       (SourceLanguage, File, Producer, IsOptimized, Flags, RuntimeVersion,
        SplitDebugFilename, EmissionKind, EnumTypes, RetainedTypes,
        GlobalVariables, ImportedEntities, Macros, DWOId, SplitDebugInlining,
-       DebugInfoForProfiling, NameTableKind))
+       DebugInfoForProfiling, NameTableKind, RangesBaseAddress))
 
   TempDICompileUnit clone() const { return cloneImpl(); }
 
@@ -1294,9 +1296,14 @@ public:
   DebugNameTableKind getNameTableKind() const {
     return (DebugNameTableKind)NameTableKind;
   }
-  StringRef getProducer() const { return getStringOperand(1); }
-  StringRef getFlags() const { return getStringOperand(2); }
-  StringRef getSplitDebugFilename() const { return getStringOperand(3); }
+  bool getRangesBaseAddress() const {
+    return RangesBaseAddress; }
+  StringRef getProducer() const {
+    return getStringOperand(1); }
+  StringRef getFlags() const {
+    return getStringOperand(2); }
+  StringRef getSplitDebugFilename() const {
+    return getStringOperand(3); }
   DICompositeTypeArray getEnumTypes() const {
     return cast_or_null<MDTuple>(getRawEnumTypes());
   }
@@ -1607,102 +1614,118 @@ class DISubprogram : public DILocalScope {
   /// negative.
   int ThisAdjustment;
 
-  // Virtuality can only assume three values, so we can pack
-  // in 2 bits (none/pure/pure_virtual).
-  unsigned Virtuality : 2;
+public:
+  /// Debug info subprogram flags.
+  enum DISPFlags : uint32_t {
+#define HANDLE_DISP_FLAG(ID, NAME) SPFlag##NAME = ID,
+#define DISP_FLAG_LARGEST_NEEDED
+#include "llvm/IR/DebugInfoFlags.def"
+    SPFlagNonvirtual = SPFlagZero,
+    SPFlagVirtuality = SPFlagVirtual | SPFlagPureVirtual,
+    LLVM_MARK_AS_BITMASK_ENUM(SPFlagLargest)
+  };
 
-  // These are boolean flags so one bit is enough.
-  // MSVC starts a new container field every time the base
-  // type changes so we can't use 'bool' to ensure these bits
-  // are packed.
-  unsigned IsLocalToUnit : 1;
-  unsigned IsDefinition : 1;
-  unsigned IsOptimized : 1;
+  static DISPFlags getFlag(StringRef Flag);
+  static StringRef getFlagString(DISPFlags Flag);
 
-  unsigned Padding : 3;
+  /// Split up a flags bitfield for easier printing.
+  ///
+  /// Split \c Flags into \c SplitFlags, a vector of its components.  Returns
+  /// any remaining (unrecognized) bits.
+  static DISPFlags splitFlags(DISPFlags Flags,
+                              SmallVectorImpl<DISPFlags> &SplitFlags);
+
+  // Helper for converting old bitfields to new flags word.
+  static DISPFlags toSPFlags(bool IsLocalToUnit, bool IsDefinition,
+                             bool IsOptimized,
+                             unsigned Virtuality = SPFlagNonvirtual) {
+    // We're assuming virtuality is the low-order field.
+    static_assert(
+        int(SPFlagVirtual) == int(dwarf::DW_VIRTUALITY_virtual) &&
+            int(SPFlagPureVirtual) == int(dwarf::DW_VIRTUALITY_pure_virtual),
+        "Virtuality constant mismatch");
+    return static_cast<DISPFlags>(
+        (Virtuality & SPFlagVirtuality) |
+        (IsLocalToUnit ? SPFlagLocalToUnit : SPFlagZero) |
+        (IsDefinition ? SPFlagDefinition : SPFlagZero) |
+        (IsOptimized ? SPFlagOptimized : SPFlagZero));
+  }
 
+private:
   DIFlags Flags;
+  DISPFlags SPFlags;
 
   DISubprogram(LLVMContext &C, StorageType Storage, unsigned Line,
-               unsigned ScopeLine, unsigned Virtuality, unsigned VirtualIndex,
-               int ThisAdjustment, DIFlags Flags, bool IsLocalToUnit,
-               bool IsDefinition, bool IsOptimized, ArrayRef<Metadata *> Ops)
+               unsigned ScopeLine, unsigned VirtualIndex, int ThisAdjustment,
+               DIFlags Flags, DISPFlags SPFlags, ArrayRef<Metadata *> Ops)
       : DILocalScope(C, DISubprogramKind, Storage, dwarf::DW_TAG_subprogram,
                      Ops),
         Line(Line), ScopeLine(ScopeLine), VirtualIndex(VirtualIndex),
-        ThisAdjustment(ThisAdjustment), Virtuality(Virtuality),
-        IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition),
-        IsOptimized(IsOptimized), Flags(Flags) {
+        ThisAdjustment(ThisAdjustment), Flags(Flags), SPFlags(SPFlags) {
     static_assert(dwarf::DW_VIRTUALITY_max < 4, "Virtuality out of range");
-    assert(Virtuality < 4 && "Virtuality out of range");
   }
   ~DISubprogram() = default;
 
   static DISubprogram *
   getImpl(LLVMContext &Context, DIScopeRef Scope, StringRef Name,
           StringRef LinkageName, DIFile *File, unsigned Line,
-          DISubroutineType *Type, bool IsLocalToUnit, bool IsDefinition,
-          unsigned ScopeLine, DITypeRef ContainingType, unsigned Virtuality,
+          DISubroutineType *Type, unsigned ScopeLine, DITypeRef ContainingType,
           unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags,
-          bool IsOptimized, DICompileUnit *Unit,
+          DISPFlags SPFlags, DICompileUnit *Unit,
           DITemplateParameterArray TemplateParams, DISubprogram *Declaration,
           DINodeArray RetainedNodes, DITypeArray ThrownTypes,
           StorageType Storage, bool ShouldCreate = true) {
     return getImpl(Context, Scope, getCanonicalMDString(Context, Name),
                    getCanonicalMDString(Context, LinkageName), File, Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams.get(), Declaration, RetainedNodes.get(),
-                   ThrownTypes.get(), Storage, ShouldCreate);
+                   ScopeLine, ContainingType, VirtualIndex, ThisAdjustment,
+                   Flags, SPFlags, Unit, TemplateParams.get(), Declaration,
+                   RetainedNodes.get(), ThrownTypes.get(), Storage,
+                   ShouldCreate);
   }
-  static DISubprogram *
-  getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
-          MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type,
-          bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
-          Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
-          int ThisAdjustment, DIFlags Flags, bool IsOptimized, Metadata *Unit,
-          Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes,
-          Metadata *ThrownTypes, StorageType Storage, bool ShouldCreate = true);
+  static DISubprogram *getImpl(LLVMContext &Context, Metadata *Scope,
+                               MDString *Name, MDString *LinkageName,
+                               Metadata *File, unsigned Line, Metadata *Type,
+                               unsigned ScopeLine, Metadata *ContainingType,
+                               unsigned VirtualIndex, int ThisAdjustment,
+                               DIFlags Flags, DISPFlags SPFlags, Metadata *Unit,
+                               Metadata *TemplateParams, Metadata *Declaration,
+                               Metadata *RetainedNodes, Metadata *ThrownTypes,
+                               StorageType Storage, bool ShouldCreate = true);
 
   TempDISubprogram cloneImpl() const {
     return getTemporary(getContext(), getScope(), getName(), getLinkageName(),
-                        getFile(), getLine(), getType(), isLocalToUnit(),
-                        isDefinition(), getScopeLine(), getContainingType(),
-                        getVirtuality(), getVirtualIndex(), getThisAdjustment(),
-                        getFlags(), isOptimized(), getUnit(),
-                        getTemplateParams(), getDeclaration(), getRetainedNodes(),
-                        getThrownTypes());
+                        getFile(), getLine(), getType(), getScopeLine(),
+                        getContainingType(), getVirtualIndex(),
+                        getThisAdjustment(), getFlags(), getSPFlags(),
+                        getUnit(), getTemplateParams(), getDeclaration(),
+                        getRetainedNodes(), getThrownTypes());
   }
 
 public:
-  DEFINE_MDNODE_GET(DISubprogram,
-                    (DIScopeRef Scope, StringRef Name, StringRef LinkageName,
-                     DIFile *File, unsigned Line, DISubroutineType *Type,
-                     bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
-                     DITypeRef ContainingType, unsigned Virtuality,
-                     unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags,
-                     bool IsOptimized, DICompileUnit *Unit,
-                     DITemplateParameterArray TemplateParams = nullptr,
-                     DISubprogram *Declaration = nullptr,
-                     DINodeArray RetainedNodes = nullptr,
-                     DITypeArray ThrownTypes = nullptr),
-                    (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
-                     IsDefinition, ScopeLine, ContainingType, Virtuality,
-                     VirtualIndex, ThisAdjustment, Flags, IsOptimized, Unit,
-                     TemplateParams, Declaration, RetainedNodes, ThrownTypes))
+  DEFINE_MDNODE_GET(
+      DISubprogram,
+      (DIScopeRef Scope, StringRef Name, StringRef LinkageName, DIFile *File,
+       unsigned Line, DISubroutineType *Type, unsigned ScopeLine,
+       DITypeRef ContainingType, unsigned VirtualIndex, int ThisAdjustment,
+       DIFlags Flags, DISPFlags SPFlags, DICompileUnit *Unit,
+       DITemplateParameterArray TemplateParams = nullptr,
+       DISubprogram *Declaration = nullptr, DINodeArray RetainedNodes = nullptr,
+       DITypeArray ThrownTypes = nullptr),
+      (Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType,
+       VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams,
+       Declaration, RetainedNodes, ThrownTypes))
+
   DEFINE_MDNODE_GET(
       DISubprogram,
       (Metadata * Scope, MDString *Name, MDString *LinkageName, Metadata *File,
-       unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition,
-       unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality,
-       unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags,
-       bool IsOptimized, Metadata *Unit, Metadata *TemplateParams = nullptr,
-       Metadata *Declaration = nullptr, Metadata *RetainedNodes = nullptr,
-       Metadata *ThrownTypes = nullptr),
-      (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
-       ScopeLine, ContainingType, Virtuality, VirtualIndex, ThisAdjustment,
-       Flags, IsOptimized, Unit, TemplateParams, Declaration, RetainedNodes,
-       ThrownTypes))
+       unsigned Line, Metadata *Type, unsigned ScopeLine,
+       Metadata *ContainingType, unsigned VirtualIndex, int ThisAdjustment,
+       DIFlags Flags, DISPFlags SPFlags, Metadata *Unit,
+       Metadata *TemplateParams = nullptr, Metadata *Declaration = nullptr,
+       Metadata *RetainedNodes = nullptr, Metadata *ThrownTypes = nullptr),
+      (Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType,
+       VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams,
+       Declaration, RetainedNodes, ThrownTypes))
 
   TempDISubprogram clone() const { return cloneImpl(); }
 
@@ -1715,14 +1738,15 @@ public:
 
 public:
   unsigned getLine() const { return Line; }
-  unsigned getVirtuality() const { return Virtuality; }
+  unsigned getVirtuality() const { return getSPFlags() & SPFlagVirtuality; }
   unsigned getVirtualIndex() const { return VirtualIndex; }
   int getThisAdjustment() const { return ThisAdjustment; }
   unsigned getScopeLine() const { return ScopeLine; }
   DIFlags getFlags() const { return Flags; }
-  bool isLocalToUnit() const { return IsLocalToUnit; }
-  bool isDefinition() const { return IsDefinition; }
-  bool isOptimized() const { return IsOptimized; }
+  DISPFlags getSPFlags() const { return SPFlags; }
+  bool isLocalToUnit() const { return getSPFlags() & SPFlagLocalToUnit; }
+  bool isDefinition() const { return getSPFlags() & SPFlagDefinition; }
+  bool isOptimized() const { return getSPFlags() & SPFlagOptimized; }
 
   bool isArtificial() const { return getFlags() & FlagArtificial; }
   bool isPrivate() const {
diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h
index b8fdae2fcadbeb88b05f34dd6171deed3e2ab39c..b5ed2c7d5097812d4fc8e2640fcc96442f788624 100644
--- a/include/llvm/IR/DiagnosticInfo.h
+++ b/include/llvm/IR/DiagnosticInfo.h
@@ -340,7 +340,7 @@ private:
 };
 
 class DiagnosticLocation {
-  StringRef Filename;
+  DIFile *File = nullptr;
   unsigned Line = 0;
   unsigned Column = 0;
 
@@ -349,8 +349,11 @@ public:
   DiagnosticLocation(const DebugLoc &DL);
   DiagnosticLocation(const DISubprogram *SP);
 
-  bool isValid() const { return !Filename.empty(); }
-  StringRef getFilename() const { return Filename; }
+  bool isValid() const { return File; }
+  /// Return the full path to the file.
+  std::string getAbsolutePath() const;
+  /// Return the file name relative to the compilation directory.
+  StringRef getRelativePath() const;
   unsigned getLine() const { return Line; }
   unsigned getColumn() const { return Column; }
 };
@@ -375,9 +378,13 @@ public:
   const std::string getLocationStr() const;
 
   /// Return location information for this diagnostic in three parts:
-  /// the source file name, line number and column.
-  void getLocation(StringRef *Filename, unsigned *Line, unsigned *Column) const;
+  /// the relative source file path, line number and column.
+  void getLocation(StringRef &RelativePath, unsigned &Line,
+                   unsigned &Column) const;
 
+  /// Return the absolute path tot the file.
+  std::string getAbsolutePath() const;
+  
   const Function &getFunction() const { return Fn; }
   DiagnosticLocation getLocation() const { return Loc; }
 
diff --git a/include/llvm/IR/InstVisitor.h b/include/llvm/IR/InstVisitor.h
index 554417f984aa5cc1e1d0d5b666a854fb346fd866..5e9b18d90a0cade0c32f7649472cfa21c73c91d6 100644
--- a/include/llvm/IR/InstVisitor.h
+++ b/include/llvm/IR/InstVisitor.h
@@ -263,6 +263,7 @@ public:
   // of instructions...
   //
   RetTy visitCastInst(CastInst &I)                { DELEGATE(UnaryInstruction);}
+  RetTy visitUnaryOperator(UnaryOperator &I)      { DELEGATE(UnaryInstruction);}
   RetTy visitBinaryOperator(BinaryOperator &I)    { DELEGATE(Instruction);}
   RetTy visitCmpInst(CmpInst &I)                  { DELEGATE(Instruction);}
   RetTy visitUnaryInstruction(UnaryInstruction &I){ DELEGATE(Instruction);}
diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index f8d23c7f6140f2ef78278f94166cc17bb6e1ca2a..4611a61e0b034b5fc151c396a46f2a89880346c3 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -25,6 +25,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instruction.h"
@@ -904,76 +905,6 @@ struct OperandTraits<CmpInst> : public FixedNumOperandTraits<CmpInst, 2> {
 
 DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CmpInst, Value)
 
-//===----------------------------------------------------------------------===//
-//                           FuncletPadInst Class
-//===----------------------------------------------------------------------===//
-class FuncletPadInst : public Instruction {
-private:
-  FuncletPadInst(const FuncletPadInst &CPI);
-
-  explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
-                          ArrayRef<Value *> Args, unsigned Values,
-                          const Twine &NameStr, Instruction *InsertBefore);
-  explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
-                          ArrayRef<Value *> Args, unsigned Values,
-                          const Twine &NameStr, BasicBlock *InsertAtEnd);
-
-  void init(Value *ParentPad, ArrayRef<Value *> Args, const Twine &NameStr);
-
-protected:
-  // Note: Instruction needs to be a friend here to call cloneImpl.
-  friend class Instruction;
-  friend class CatchPadInst;
-  friend class CleanupPadInst;
-
-  FuncletPadInst *cloneImpl() const;
-
-public:
-  /// Provide fast operand accessors
-  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
-
-  /// getNumArgOperands - Return the number of funcletpad arguments.
-  ///
-  unsigned getNumArgOperands() const { return getNumOperands() - 1; }
-
-  /// Convenience accessors
-
-  /// Return the outer EH-pad this funclet is nested within.
-  ///
-  /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst
-  /// is a CatchPadInst.
-  Value *getParentPad() const { return Op<-1>(); }
-  void setParentPad(Value *ParentPad) {
-    assert(ParentPad);
-    Op<-1>() = ParentPad;
-  }
-
-  /// getArgOperand/setArgOperand - Return/set the i-th funcletpad argument.
-  ///
-  Value *getArgOperand(unsigned i) const { return getOperand(i); }
-  void setArgOperand(unsigned i, Value *v) { setOperand(i, v); }
-
-  /// arg_operands - iteration adapter for range-for loops.
-  op_range arg_operands() { return op_range(op_begin(), op_end() - 1); }
-
-  /// arg_operands - iteration adapter for range-for loops.
-  const_op_range arg_operands() const {
-    return const_op_range(op_begin(), op_end() - 1);
-  }
-
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
-  static bool classof(const Instruction *I) { return I->isFuncletPad(); }
-  static bool classof(const Value *V) {
-    return isa<Instruction>(V) && classof(cast<Instruction>(V));
-  }
-};
-
-template <>
-struct OperandTraits<FuncletPadInst>
-    : public VariadicOperandTraits<FuncletPadInst, /*MINARITY=*/1> {};
-
-DEFINE_TRANSPARENT_OPERAND_ACCESSORS(FuncletPadInst, Value)
-
 /// A lightweight accessor for an operand bundle meant to be passed
 /// around by value.
 struct OperandBundleUse {
@@ -1058,54 +989,468 @@ public:
 using OperandBundleDef = OperandBundleDefT<Value *>;
 using ConstOperandBundleDef = OperandBundleDefT<const Value *>;
 
-/// A mixin to add operand bundle functionality to llvm instruction
-/// classes.
-///
-/// OperandBundleUser uses the descriptor area co-allocated with the host User
-/// to store some meta information about which operands are "normal" operands,
-/// and which ones belong to some operand bundle.
-///
-/// The layout of an operand bundle user is
-///
-///          +-----------uint32_t End-------------------------------------+
-///          |                                                            |
-///          |  +--------uint32_t Begin--------------------+              |
-///          |  |                                          |              |
-///          ^  ^                                          v              v
-///  |------|------|----|----|----|----|----|---------|----|---------|----|-----
-///  | BOI0 | BOI1 | .. | DU | U0 | U1 | .. | BOI0_U0 | .. | BOI1_U0 | .. | Un
-///  |------|------|----|----|----|----|----|---------|----|---------|----|-----
-///   v  v                                  ^              ^
-///   |  |                                  |              |
-///   |  +--------uint32_t Begin------------+              |
-///   |                                                    |
-///   +-----------uint32_t End-----------------------------+
-///
-///
-/// BOI0, BOI1 ... are descriptions of operand bundles in this User's use list.
-/// These descriptions are installed and managed by this class, and they're all
-/// instances of OperandBundleUser<T>::BundleOpInfo.
-///
-/// DU is an additional descriptor installed by User's 'operator new' to keep
-/// track of the 'BOI0 ... BOIN' co-allocation.  OperandBundleUser does not
-/// access or modify DU in any way, it's an implementation detail private to
-/// User.
-///
-/// The regular Use& vector for the User starts at U0.  The operand bundle uses
-/// are part of the Use& vector, just like normal uses.  In the diagram above,
-/// the operand bundle uses start at BOI0_U0.  Each instance of BundleOpInfo has
-/// information about a contiguous set of uses constituting an operand bundle,
-/// and the total set of operand bundle uses themselves form a contiguous set of
-/// uses (i.e. there are no gaps between uses corresponding to individual
-/// operand bundles).
+//===----------------------------------------------------------------------===//
+//                               CallBase Class
+//===----------------------------------------------------------------------===//
+
+/// Base class for all callable instructions (InvokeInst and CallInst)
+/// Holds everything related to calling a function.
 ///
-/// This class does not know the location of the set of operand bundle uses
-/// within the use list -- that is decided by the User using this class via the
-/// BeginIdx argument in populateBundleOperandInfos.
+/// All call-like instructions are required to use a common operand layout:
+/// - Zero or more arguments to the call,
+/// - Zero or more operand bundles with zero or more operand inputs each
+///   bundle,
+/// - Zero or more subclass controlled operands
+/// - The called function.
 ///
-/// Currently operand bundle users with hung-off operands are not supported.
-template <typename InstrTy, typename OpIteratorTy> class OperandBundleUser {
+/// This allows this base class to easily access the called function and the
+/// start of the arguments without knowing how many other operands a particular
+/// subclass requires. Note that accessing the end of the argument list isn't
+/// as cheap as most other operations on the base class.
+class CallBase : public Instruction {
+protected:
+  /// The last operand is the called operand.
+  static constexpr int CalledOperandOpEndIdx = -1;
+
+  AttributeList Attrs; ///< parameter attributes for callable
+  FunctionType *FTy;
+
+  template <class... ArgsTy>
+  CallBase(AttributeList const &A, FunctionType *FT, ArgsTy &&... Args)
+      : Instruction(std::forward<ArgsTy>(Args)...), Attrs(A), FTy(FT) {}
+
+  using Instruction::Instruction;
+
+  bool hasDescriptor() const { return Value::HasDescriptor; }
+
+  unsigned getNumSubclassExtraOperands() const {
+    switch (getOpcode()) {
+    case Instruction::Call:
+      return 0;
+    case Instruction::Invoke:
+      return 2;
+    }
+    llvm_unreachable("Invalid opcode!");
+  }
+
 public:
+  using Instruction::getContext;
+
+  static bool classof(const Instruction *I) {
+    return I->getOpcode() == Instruction::Call ||
+           I->getOpcode() == Instruction::Invoke;
+  }
+
+  FunctionType *getFunctionType() const { return FTy; }
+
+  void mutateFunctionType(FunctionType *FTy) {
+    Value::mutateType(FTy->getReturnType());
+    this->FTy = FTy;
+  }
+
+  /// Return the iterator pointing to the beginning of the argument list.
+  User::op_iterator arg_begin() { return op_begin(); }
+  User::const_op_iterator arg_begin() const {
+    return const_cast<CallBase *>(this)->arg_begin();
+  }
+
+  /// Return the iterator pointing to the end of the argument list.
+  User::op_iterator arg_end() {
+    // Walk from the end of the operands over the called operand, the subclass
+    // operands, and any operands for bundles to find the end of the argument
+    // operands.
+    return op_end() - getNumTotalBundleOperands() -
+           getNumSubclassExtraOperands() - 1;
+  }
+  User::const_op_iterator arg_end() const {
+    return const_cast<CallBase *>(this)->arg_end();
+  }
+
+  /// Iteration adapter for range-for loops.
+  iterator_range<User::op_iterator> arg_operands() {
+    return make_range(arg_begin(), arg_end());
+  }
+  iterator_range<User::const_op_iterator> arg_operands() const {
+    return make_range(arg_begin(), arg_end());
+  }
+
+  unsigned getNumArgOperands() const { return arg_end() - arg_begin(); }
+
+  Value *getArgOperand(unsigned i) const {
+    assert(i < getNumArgOperands() && "Out of bounds!");
+    return getOperand(i);
+  }
+
+  void setArgOperand(unsigned i, Value *v) {
+    assert(i < getNumArgOperands() && "Out of bounds!");
+    setOperand(i, v);
+  }
+
+  /// Wrappers for getting the \c Use of a call argument.
+  const Use &getArgOperandUse(unsigned i) const {
+    assert(i < getNumArgOperands() && "Out of bounds!");
+    return User::getOperandUse(i);
+  }
+  Use &getArgOperandUse(unsigned i) {
+    assert(i < getNumArgOperands() && "Out of bounds!");
+    return User::getOperandUse(i);
+  }
+
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+  Value *getCalledOperand() const { return Op<CalledOperandOpEndIdx>(); }
+
+  // DEPRECATED: This routine will be removed in favor of `getCalledOperand` in
+  // the near future.
+  Value *getCalledValue() const { return getCalledOperand(); }
+
+  const Use &getCalledOperandUse() const { return Op<CalledOperandOpEndIdx>(); }
+  Use &getCalledOperandUse() { return Op<CalledOperandOpEndIdx>(); }
+
+  /// Returns the function called, or null if this is an
+  /// indirect function invocation.
+  Function *getCalledFunction() const {
+    return dyn_cast_or_null<Function>(getCalledOperand());
+  }
+
+  void setCalledOperand(Value *V) { Op<CalledOperandOpEndIdx>() = V; }
+
+  /// Sets the function called, including updating the function type.
+  void setCalledFunction(Value *Fn) {
+    setCalledFunction(
+        cast<FunctionType>(cast<PointerType>(Fn->getType())->getElementType()),
+        Fn);
+  }
+
+  /// Sets the function called, including updating to the specified function
+  /// type.
+  void setCalledFunction(FunctionType *FTy, Value *Fn) {
+    this->FTy = FTy;
+    assert(FTy == cast<FunctionType>(
+                      cast<PointerType>(Fn->getType())->getElementType()));
+    setCalledOperand(Fn);
+  }
+
+  CallingConv::ID getCallingConv() const {
+    return static_cast<CallingConv::ID>(getSubclassDataFromInstruction() >> 2);
+  }
+
+  void setCallingConv(CallingConv::ID CC) {
+    auto ID = static_cast<unsigned>(CC);
+    assert(!(ID & ~CallingConv::MaxID) && "Unsupported calling convention");
+    setInstructionSubclassData((getSubclassDataFromInstruction() & 3) |
+                               (ID << 2));
+  }
+
+  /// \name Attribute API
+  ///
+  /// These methods access and modify attributes on this call (including
+  /// looking through to the attributes on the called function when necessary).
+  ///@{
+
+  /// Return the parameter attributes for this call.
+  ///
+  AttributeList getAttributes() const { return Attrs; }
+
+  /// Set the parameter attributes for this call.
+  ///
+  void setAttributes(AttributeList A) { Attrs = A; }
+
+  /// Determine whether this call has the given attribute.
+  bool hasFnAttr(Attribute::AttrKind Kind) const {
+    assert(Kind != Attribute::NoBuiltin &&
+           "Use CallBase::isNoBuiltin() to check for Attribute::NoBuiltin");
+    return hasFnAttrImpl(Kind);
+  }
+
+  /// Determine whether this call has the given attribute.
+  bool hasFnAttr(StringRef Kind) const { return hasFnAttrImpl(Kind); }
+
+  /// adds the attribute to the list of attributes.
+  void addAttribute(unsigned i, Attribute::AttrKind Kind) {
+    AttributeList PAL = getAttributes();
+    PAL = PAL.addAttribute(getContext(), i, Kind);
+    setAttributes(PAL);
+  }
+
+  /// adds the attribute to the list of attributes.
+  void addAttribute(unsigned i, Attribute Attr) {
+    AttributeList PAL = getAttributes();
+    PAL = PAL.addAttribute(getContext(), i, Attr);
+    setAttributes(PAL);
+  }
+
+  /// Adds the attribute to the indicated argument
+  void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+    assert(ArgNo < getNumArgOperands() && "Out of bounds");
+    AttributeList PAL = getAttributes();
+    PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
+    setAttributes(PAL);
+  }
+
+  /// Adds the attribute to the indicated argument
+  void addParamAttr(unsigned ArgNo, Attribute Attr) {
+    assert(ArgNo < getNumArgOperands() && "Out of bounds");
+    AttributeList PAL = getAttributes();
+    PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr);
+    setAttributes(PAL);
+  }
+
+  /// removes the attribute from the list of attributes.
+  void removeAttribute(unsigned i, Attribute::AttrKind Kind) {
+    AttributeList PAL = getAttributes();
+    PAL = PAL.removeAttribute(getContext(), i, Kind);
+    setAttributes(PAL);
+  }
+
+  /// removes the attribute from the list of attributes.
+  void removeAttribute(unsigned i, StringRef Kind) {
+    AttributeList PAL = getAttributes();
+    PAL = PAL.removeAttribute(getContext(), i, Kind);
+    setAttributes(PAL);
+  }
+
+  /// Removes the attribute from the given argument
+  void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+    assert(ArgNo < getNumArgOperands() && "Out of bounds");
+    AttributeList PAL = getAttributes();
+    PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+    setAttributes(PAL);
+  }
+
+  /// Removes the attribute from the given argument
+  void removeParamAttr(unsigned ArgNo, StringRef Kind) {
+    assert(ArgNo < getNumArgOperands() && "Out of bounds");
+    AttributeList PAL = getAttributes();
+    PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+    setAttributes(PAL);
+  }
+
+  /// adds the dereferenceable attribute to the list of attributes.
+  void addDereferenceableAttr(unsigned i, uint64_t Bytes) {
+    AttributeList PAL = getAttributes();
+    PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
+    setAttributes(PAL);
+  }
+
+  /// adds the dereferenceable_or_null attribute to the list of
+  /// attributes.
+  void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
+    AttributeList PAL = getAttributes();
+    PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
+    setAttributes(PAL);
+  }
+
+  /// Determine whether the return value has the given attribute.
+  bool hasRetAttr(Attribute::AttrKind Kind) const;
+
+  /// Determine whether the argument or parameter has the given attribute.
+  bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const;
+
+  /// Get the attribute of a given kind at a position.
+  Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const {
+    return getAttributes().getAttribute(i, Kind);
+  }
+
+  /// Get the attribute of a given kind at a position.
+  Attribute getAttribute(unsigned i, StringRef Kind) const {
+    return getAttributes().getAttribute(i, Kind);
+  }
+
+  /// Get the attribute of a given kind from a given arg
+  Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
+    assert(ArgNo < getNumArgOperands() && "Out of bounds");
+    return getAttributes().getParamAttr(ArgNo, Kind);
+  }
+
+  /// Get the attribute of a given kind from a given arg
+  Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const {
+    assert(ArgNo < getNumArgOperands() && "Out of bounds");
+    return getAttributes().getParamAttr(ArgNo, Kind);
+  }
+
+  /// Return true if the data operand at index \p i has the attribute \p
+  /// A.
+  ///
+  /// Data operands include call arguments and values used in operand bundles,
+  /// but does not include the callee operand.  This routine dispatches to the
+  /// underlying AttributeList or the OperandBundleUser as appropriate.
+  ///
+  /// The index \p i is interpreted as
+  ///
+  ///  \p i == Attribute::ReturnIndex  -> the return value
+  ///  \p i in [1, arg_size + 1)  -> argument number (\p i - 1)
+  ///  \p i in [arg_size + 1, data_operand_size + 1) -> bundle operand at index
+  ///     (\p i - 1) in the operand list.
+  bool dataOperandHasImpliedAttr(unsigned i, Attribute::AttrKind Kind) const {
+    // Note that we have to add one because `i` isn't zero-indexed.
+    assert(i < (getNumArgOperands() + getNumTotalBundleOperands() + 1) &&
+           "Data operand index out of bounds!");
+
+    // The attribute A can either be directly specified, if the operand in
+    // question is a call argument; or be indirectly implied by the kind of its
+    // containing operand bundle, if the operand is a bundle operand.
+
+    if (i == AttributeList::ReturnIndex)
+      return hasRetAttr(Kind);
+
+    // FIXME: Avoid these i - 1 calculations and update the API to use
+    // zero-based indices.
+    if (i < (getNumArgOperands() + 1))
+      return paramHasAttr(i - 1, Kind);
+
+    assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
+           "Must be either a call argument or an operand bundle!");
+    return bundleOperandHasAttr(i - 1, Kind);
+  }
+
+  /// Extract the alignment of the return value.
+  unsigned getRetAlignment() const { return Attrs.getRetAlignment(); }
+
+  /// Extract the alignment for a call or parameter (0=unknown).
+  unsigned getParamAlignment(unsigned ArgNo) const {
+    return Attrs.getParamAlignment(ArgNo);
+  }
+
+  /// Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(unsigned i) const {
+    return Attrs.getDereferenceableBytes(i);
+  }
+
+  /// Extract the number of dereferenceable_or_null bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableOrNullBytes(unsigned i) const {
+    return Attrs.getDereferenceableOrNullBytes(i);
+  }
+
+  /// Determine if the return value is marked with NoAlias attribute.
+  bool returnDoesNotAlias() const {
+    return Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+  }
+
+  /// If one of the arguments has the 'returned' attribute, returns its
+  /// operand value. Otherwise, return nullptr.
+  Value *getReturnedArgOperand() const;
+
+  /// Return true if the call should not be treated as a call to a
+  /// builtin.
+  bool isNoBuiltin() const {
+    return hasFnAttrImpl(Attribute::NoBuiltin) &&
+           !hasFnAttrImpl(Attribute::Builtin);
+  }
+
+  /// Determine if the call requires strict floating point semantics.
+  bool isStrictFP() const { return hasFnAttr(Attribute::StrictFP); }
+
+  /// Return true if the call should not be inlined.
+  bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
+  void setIsNoInline() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
+  }
+  /// Determine if the call does not access memory.
+  bool doesNotAccessMemory() const { return hasFnAttr(Attribute::ReadNone); }
+  void setDoesNotAccessMemory() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+  }
+
+  /// Determine if the call does not access or only reads memory.
+  bool onlyReadsMemory() const {
+    return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
+  }
+  void setOnlyReadsMemory() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
+  }
+
+  /// Determine if the call does not access or only writes memory.
+  bool doesNotReadMemory() const {
+    return doesNotAccessMemory() || hasFnAttr(Attribute::WriteOnly);
+  }
+  void setDoesNotReadMemory() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly);
+  }
+
+  /// Determine if the call can access memmory only using pointers based
+  /// on its arguments.
+  bool onlyAccessesArgMemory() const {
+    return hasFnAttr(Attribute::ArgMemOnly);
+  }
+  void setOnlyAccessesArgMemory() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly);
+  }
+
+  /// Determine if the function may only access memory that is
+  /// inaccessible from the IR.
+  bool onlyAccessesInaccessibleMemory() const {
+    return hasFnAttr(Attribute::InaccessibleMemOnly);
+  }
+  void setOnlyAccessesInaccessibleMemory() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOnly);
+  }
+
+  /// Determine if the function may only access memory that is
+  /// either inaccessible from the IR or pointed to by its arguments.
+  bool onlyAccessesInaccessibleMemOrArgMem() const {
+    return hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
+  }
+  void setOnlyAccessesInaccessibleMemOrArgMem() {
+    addAttribute(AttributeList::FunctionIndex,
+                 Attribute::InaccessibleMemOrArgMemOnly);
+  }
+  /// Determine if the call cannot return.
+  bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
+  void setDoesNotReturn() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
+  }
+
+  /// Determine if the call should not perform indirect branch tracking.
+  bool doesNoCfCheck() const { return hasFnAttr(Attribute::NoCfCheck); }
+
+  /// Determine if the call cannot unwind.
+  bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
+  void setDoesNotThrow() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+  }
+
+  /// Determine if the invoke cannot be duplicated.
+  bool cannotDuplicate() const { return hasFnAttr(Attribute::NoDuplicate); }
+  void setCannotDuplicate() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate);
+  }
+
+  /// Determine if the invoke is convergent
+  bool isConvergent() const { return hasFnAttr(Attribute::Convergent); }
+  void setConvergent() {
+    addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
+  }
+  void setNotConvergent() {
+    removeAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
+  }
+
+  /// Determine if the call returns a structure through first
+  /// pointer argument.
+  bool hasStructRetAttr() const {
+    if (getNumArgOperands() == 0)
+      return false;
+
+    // Be friendly and also check the callee.
+    return paramHasAttr(0, Attribute::StructRet);
+  }
+
+  /// Determine if any call argument is an aggregate passed by value.
+  bool hasByValArgument() const {
+    return Attrs.hasAttrSomewhere(Attribute::ByVal);
+  }
+
+  ///@{
+  // End of attribute API.
+
+  /// \name Operand Bundle API
+  ///
+  /// This group of methods provides the API to access and manipulate operand
+  /// bundles on this call.
+  /// @{
+
   /// Return the number of operand bundles associated with this User.
   unsigned getNumOperandBundles() const {
     return std::distance(bundle_op_info_begin(), bundle_op_info_end());
@@ -1261,8 +1606,7 @@ public:
   /// Return true if \p Other has the same sequence of operand bundle
   /// tags with the same number of operands on each one of them as this
   /// OperandBundleUser.
-  bool hasIdenticalOperandBundleSchema(
-      const OperandBundleUser<InstrTy, OpIteratorTy> &Other) const {
+  bool hasIdenticalOperandBundleSchema(const CallBase &Other) const {
     if (getNumOperandBundles() != Other.getNumOperandBundles())
       return false;
 
@@ -1281,7 +1625,6 @@ public:
     return false;
   }
 
-protected:
   /// Is the function attribute S disallowed by some operand bundle on
   /// this operand bundle user?
   bool isFnAttrDisallowedByOpBundle(StringRef S) const {
@@ -1340,8 +1683,8 @@ protected:
   /// OperandBundleUse.
   OperandBundleUse
   operandBundleFromBundleOpInfo(const BundleOpInfo &BOI) const {
-    auto op_begin = static_cast<const InstrTy *>(this)->op_begin();
-    ArrayRef<Use> Inputs(op_begin + BOI.Begin, op_begin + BOI.End);
+    auto begin = op_begin();
+    ArrayRef<Use> Inputs(begin + BOI.Begin, begin + BOI.End);
     return OperandBundleUse(BOI.Tag, Inputs);
   }
 
@@ -1350,37 +1693,79 @@ protected:
 
   /// Return the start of the list of BundleOpInfo instances associated
   /// with this OperandBundleUser.
+  ///
+  /// OperandBundleUser uses the descriptor area co-allocated with the host User
+  /// to store some meta information about which operands are "normal" operands,
+  /// and which ones belong to some operand bundle.
+  ///
+  /// The layout of an operand bundle user is
+  ///
+  ///          +-----------uint32_t End-------------------------------------+
+  ///          |                                                            |
+  ///          |  +--------uint32_t Begin--------------------+              |
+  ///          |  |                                          |              |
+  ///          ^  ^                                          v              v
+  ///  |------|------|----|----|----|----|----|---------|----|---------|----|-----
+  ///  | BOI0 | BOI1 | .. | DU | U0 | U1 | .. | BOI0_U0 | .. | BOI1_U0 | .. | Un
+  ///  |------|------|----|----|----|----|----|---------|----|---------|----|-----
+  ///   v  v                                  ^              ^
+  ///   |  |                                  |              |
+  ///   |  +--------uint32_t Begin------------+              |
+  ///   |                                                    |
+  ///   +-----------uint32_t End-----------------------------+
+  ///
+  ///
+  /// BOI0, BOI1 ... are descriptions of operand bundles in this User's use
+  /// list. These descriptions are installed and managed by this class, and
+  /// they're all instances of OperandBundleUser<T>::BundleOpInfo.
+  ///
+  /// DU is an additional descriptor installed by User's 'operator new' to keep
+  /// track of the 'BOI0 ... BOIN' co-allocation.  OperandBundleUser does not
+  /// access or modify DU in any way, it's an implementation detail private to
+  /// User.
+  ///
+  /// The regular Use& vector for the User starts at U0.  The operand bundle
+  /// uses are part of the Use& vector, just like normal uses.  In the diagram
+  /// above, the operand bundle uses start at BOI0_U0.  Each instance of
+  /// BundleOpInfo has information about a contiguous set of uses constituting
+  /// an operand bundle, and the total set of operand bundle uses themselves
+  /// form a contiguous set of uses (i.e. there are no gaps between uses
+  /// corresponding to individual operand bundles).
+  ///
+  /// This class does not know the location of the set of operand bundle uses
+  /// within the use list -- that is decided by the User using this class via
+  /// the BeginIdx argument in populateBundleOperandInfos.
+  ///
+  /// Currently operand bundle users with hung-off operands are not supported.
   bundle_op_iterator bundle_op_info_begin() {
-    if (!static_cast<InstrTy *>(this)->hasDescriptor())
+    if (!hasDescriptor())
       return nullptr;
 
-    uint8_t *BytesBegin = static_cast<InstrTy *>(this)->getDescriptor().begin();
+    uint8_t *BytesBegin = getDescriptor().begin();
     return reinterpret_cast<bundle_op_iterator>(BytesBegin);
   }
 
   /// Return the start of the list of BundleOpInfo instances associated
   /// with this OperandBundleUser.
   const_bundle_op_iterator bundle_op_info_begin() const {
-    auto *NonConstThis =
-        const_cast<OperandBundleUser<InstrTy, OpIteratorTy> *>(this);
+    auto *NonConstThis = const_cast<CallBase *>(this);
     return NonConstThis->bundle_op_info_begin();
   }
 
   /// Return the end of the list of BundleOpInfo instances associated
   /// with this OperandBundleUser.
   bundle_op_iterator bundle_op_info_end() {
-    if (!static_cast<InstrTy *>(this)->hasDescriptor())
+    if (!hasDescriptor())
       return nullptr;
 
-    uint8_t *BytesEnd = static_cast<InstrTy *>(this)->getDescriptor().end();
+    uint8_t *BytesEnd = getDescriptor().end();
     return reinterpret_cast<bundle_op_iterator>(BytesEnd);
   }
 
   /// Return the end of the list of BundleOpInfo instances associated
   /// with this OperandBundleUser.
   const_bundle_op_iterator bundle_op_info_end() const {
-    auto *NonConstThis =
-        const_cast<OperandBundleUser<InstrTy, OpIteratorTy> *>(this);
+    auto *NonConstThis = const_cast<CallBase *>(this);
     return NonConstThis->bundle_op_info_end();
   }
 
@@ -1400,30 +1785,8 @@ protected:
   ///
   /// Each \p OperandBundleDef instance is tracked by a OperandBundleInfo
   /// instance allocated in this User's descriptor.
-  OpIteratorTy populateBundleOperandInfos(ArrayRef<OperandBundleDef> Bundles,
-                                          const unsigned BeginIndex) {
-    auto It = static_cast<InstrTy *>(this)->op_begin() + BeginIndex;
-    for (auto &B : Bundles)
-      It = std::copy(B.input_begin(), B.input_end(), It);
-
-    auto *ContextImpl = static_cast<InstrTy *>(this)->getContext().pImpl;
-    auto BI = Bundles.begin();
-    unsigned CurrentIndex = BeginIndex;
-
-    for (auto &BOI : bundle_op_infos()) {
-      assert(BI != Bundles.end() && "Incorrect allocation?");
-
-      BOI.Tag = ContextImpl->getOrInsertBundleTag(BI->getTag());
-      BOI.Begin = CurrentIndex;
-      BOI.End = CurrentIndex + BI->input_size();
-      CurrentIndex = BOI.End;
-      BI++;
-    }
-
-    assert(BI == Bundles.end() && "Incorrect allocation?");
-
-    return It;
-  }
+  op_iterator populateBundleOperandInfos(ArrayRef<OperandBundleDef> Bundles,
+                                         const unsigned BeginIndex);
 
   /// Return the BundleOpInfo for the operand at index OpIdx.
   ///
@@ -1437,6 +1800,7 @@ protected:
     llvm_unreachable("Did not find operand bundle for operand!");
   }
 
+protected:
   /// Return the total number of values used in \p Bundles.
   static unsigned CountBundleInputs(ArrayRef<OperandBundleDef> Bundles) {
     unsigned Total = 0;
@@ -1444,8 +1808,102 @@ protected:
       Total += B.input_size();
     return Total;
   }
+
+  /// @}
+  // End of operand bundle API.
+
+private:
+  bool hasFnAttrOnCalledFunction(Attribute::AttrKind Kind) const;
+  bool hasFnAttrOnCalledFunction(StringRef Kind) const;
+
+  template <typename AttrKind> bool hasFnAttrImpl(AttrKind Kind) const {
+    if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind))
+      return true;
+
+    // Operand bundles override attributes on the called function, but don't
+    // override attributes directly present on the call instruction.
+    if (isFnAttrDisallowedByOpBundle(Kind))
+      return false;
+
+    return hasFnAttrOnCalledFunction(Kind);
+  }
+};
+
+template <>
+struct OperandTraits<CallBase> : public VariadicOperandTraits<CallBase, 1> {};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallBase, Value)
+
+//===----------------------------------------------------------------------===//
+//                           FuncletPadInst Class
+//===----------------------------------------------------------------------===//
+class FuncletPadInst : public Instruction {
+private:
+  FuncletPadInst(const FuncletPadInst &CPI);
+
+  explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
+                          ArrayRef<Value *> Args, unsigned Values,
+                          const Twine &NameStr, Instruction *InsertBefore);
+  explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
+                          ArrayRef<Value *> Args, unsigned Values,
+                          const Twine &NameStr, BasicBlock *InsertAtEnd);
+
+  void init(Value *ParentPad, ArrayRef<Value *> Args, const Twine &NameStr);
+
+protected:
+  // Note: Instruction needs to be a friend here to call cloneImpl.
+  friend class Instruction;
+  friend class CatchPadInst;
+  friend class CleanupPadInst;
+
+  FuncletPadInst *cloneImpl() const;
+
+public:
+  /// Provide fast operand accessors
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+  /// getNumArgOperands - Return the number of funcletpad arguments.
+  ///
+  unsigned getNumArgOperands() const { return getNumOperands() - 1; }
+
+  /// Convenience accessors
+
+  /// Return the outer EH-pad this funclet is nested within.
+  ///
+  /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst
+  /// is a CatchPadInst.
+  Value *getParentPad() const { return Op<-1>(); }
+  void setParentPad(Value *ParentPad) {
+    assert(ParentPad);
+    Op<-1>() = ParentPad;
+  }
+
+  /// getArgOperand/setArgOperand - Return/set the i-th funcletpad argument.
+  ///
+  Value *getArgOperand(unsigned i) const { return getOperand(i); }
+  void setArgOperand(unsigned i, Value *v) { setOperand(i, v); }
+
+  /// arg_operands - iteration adapter for range-for loops.
+  op_range arg_operands() { return op_range(op_begin(), op_end() - 1); }
+
+  /// arg_operands - iteration adapter for range-for loops.
+  const_op_range arg_operands() const {
+    return const_op_range(op_begin(), op_end() - 1);
+  }
+
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const Instruction *I) { return I->isFuncletPad(); }
+  static bool classof(const Value *V) {
+    return isa<Instruction>(V) && classof(cast<Instruction>(V));
+  }
 };
 
+template <>
+struct OperandTraits<FuncletPadInst>
+    : public VariadicOperandTraits<FuncletPadInst, /*MINARITY=*/1> {};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(FuncletPadInst, Value)
+
 } // end namespace llvm
 
 #endif // LLVM_IR_INSTRTYPES_H
diff --git a/include/llvm/IR/Instruction.def b/include/llvm/IR/Instruction.def
index 86617299c44ac6a2713a4fc8a4be7e325eace047..58e4e2e1d6cc50b1564438e2d58c2642fcf073f9 100644
--- a/include/llvm/IR/Instruction.def
+++ b/include/llvm/IR/Instruction.def
@@ -32,6 +32,20 @@
 #define LAST_TERM_INST(num)
 #endif
 
+#ifndef FIRST_UNARY_INST
+#define FIRST_UNARY_INST(num)
+#endif
+#ifndef HANDLE_UNARY_INST
+#ifndef HANDLE_INST
+#define HANDLE_UNARY_INST(num, opcode, instclass)
+#else
+#define HANDLE_UNARY_INST(num, opcode, Class) HANDLE_INST(num, opcode, Class)
+#endif
+#endif
+#ifndef LAST_UNARY_INST
+#define LAST_UNARY_INST(num)
+#endif
+
 #ifndef FIRST_BINARY_INST
 #define FIRST_BINARY_INST(num)
 #endif
@@ -123,87 +137,96 @@ HANDLE_TERM_INST  ( 9, CatchRet      , CatchReturnInst)
 HANDLE_TERM_INST  (10, CatchSwitch   , CatchSwitchInst)
   LAST_TERM_INST  (10)
 
+// Standard unary operators...
+ FIRST_UNARY_INST(11)
+HANDLE_UNARY_INST(11, FNeg  , UnaryOperator)
+  LAST_UNARY_INST(11)
+
 // Standard binary operators...
- FIRST_BINARY_INST(11)
-HANDLE_BINARY_INST(11, Add  , BinaryOperator)
-HANDLE_BINARY_INST(12, FAdd , BinaryOperator)
-HANDLE_BINARY_INST(13, Sub  , BinaryOperator)
-HANDLE_BINARY_INST(14, FSub , BinaryOperator)
-HANDLE_BINARY_INST(15, Mul  , BinaryOperator)
-HANDLE_BINARY_INST(16, FMul , BinaryOperator)
-HANDLE_BINARY_INST(17, UDiv , BinaryOperator)
-HANDLE_BINARY_INST(18, SDiv , BinaryOperator)
-HANDLE_BINARY_INST(19, FDiv , BinaryOperator)
-HANDLE_BINARY_INST(20, URem , BinaryOperator)
-HANDLE_BINARY_INST(21, SRem , BinaryOperator)
-HANDLE_BINARY_INST(22, FRem , BinaryOperator)
+ FIRST_BINARY_INST(12)
+HANDLE_BINARY_INST(12, Add  , BinaryOperator)
+HANDLE_BINARY_INST(13, FAdd , BinaryOperator)
+HANDLE_BINARY_INST(14, Sub  , BinaryOperator)
+HANDLE_BINARY_INST(15, FSub , BinaryOperator)
+HANDLE_BINARY_INST(16, Mul  , BinaryOperator)
+HANDLE_BINARY_INST(17, FMul , BinaryOperator)
+HANDLE_BINARY_INST(18, UDiv , BinaryOperator)
+HANDLE_BINARY_INST(19, SDiv , BinaryOperator)
+HANDLE_BINARY_INST(20, FDiv , BinaryOperator)
+HANDLE_BINARY_INST(21, URem , BinaryOperator)
+HANDLE_BINARY_INST(22, SRem , BinaryOperator)
+HANDLE_BINARY_INST(23, FRem , BinaryOperator)
 
 // Logical operators (integer operands)
-HANDLE_BINARY_INST(23, Shl  , BinaryOperator) // Shift left  (logical)
-HANDLE_BINARY_INST(24, LShr , BinaryOperator) // Shift right (logical)
-HANDLE_BINARY_INST(25, AShr , BinaryOperator) // Shift right (arithmetic)
-HANDLE_BINARY_INST(26, And  , BinaryOperator)
-HANDLE_BINARY_INST(27, Or   , BinaryOperator)
-HANDLE_BINARY_INST(28, Xor  , BinaryOperator)
-  LAST_BINARY_INST(28)
+HANDLE_BINARY_INST(24, Shl  , BinaryOperator) // Shift left  (logical)
+HANDLE_BINARY_INST(25, LShr , BinaryOperator) // Shift right (logical)
+HANDLE_BINARY_INST(26, AShr , BinaryOperator) // Shift right (arithmetic)
+HANDLE_BINARY_INST(27, And  , BinaryOperator)
+HANDLE_BINARY_INST(28, Or   , BinaryOperator)
+HANDLE_BINARY_INST(29, Xor  , BinaryOperator)
+  LAST_BINARY_INST(29)
 
 // Memory operators...
- FIRST_MEMORY_INST(29)
-HANDLE_MEMORY_INST(29, Alloca, AllocaInst)  // Stack management
-HANDLE_MEMORY_INST(30, Load  , LoadInst  )  // Memory manipulation instrs
-HANDLE_MEMORY_INST(31, Store , StoreInst )
-HANDLE_MEMORY_INST(32, GetElementPtr, GetElementPtrInst)
-HANDLE_MEMORY_INST(33, Fence , FenceInst )
-HANDLE_MEMORY_INST(34, AtomicCmpXchg , AtomicCmpXchgInst )
-HANDLE_MEMORY_INST(35, AtomicRMW , AtomicRMWInst )
-  LAST_MEMORY_INST(35)
+ FIRST_MEMORY_INST(30)
+HANDLE_MEMORY_INST(30, Alloca, AllocaInst)  // Stack management
+HANDLE_MEMORY_INST(31, Load  , LoadInst  )  // Memory manipulation instrs
+HANDLE_MEMORY_INST(32, Store , StoreInst )
+HANDLE_MEMORY_INST(33, GetElementPtr, GetElementPtrInst)
+HANDLE_MEMORY_INST(34, Fence , FenceInst )
+HANDLE_MEMORY_INST(35, AtomicCmpXchg , AtomicCmpXchgInst )
+HANDLE_MEMORY_INST(36, AtomicRMW , AtomicRMWInst )
+  LAST_MEMORY_INST(36)
 
 // Cast operators ...
 // NOTE: The order matters here because CastInst::isEliminableCastPair
 // NOTE: (see Instructions.cpp) encodes a table based on this ordering.
- FIRST_CAST_INST(36)
-HANDLE_CAST_INST(36, Trunc   , TruncInst   )  // Truncate integers
-HANDLE_CAST_INST(37, ZExt    , ZExtInst    )  // Zero extend integers
-HANDLE_CAST_INST(38, SExt    , SExtInst    )  // Sign extend integers
-HANDLE_CAST_INST(39, FPToUI  , FPToUIInst  )  // floating point -> UInt
-HANDLE_CAST_INST(40, FPToSI  , FPToSIInst  )  // floating point -> SInt
-HANDLE_CAST_INST(41, UIToFP  , UIToFPInst  )  // UInt -> floating point
-HANDLE_CAST_INST(42, SIToFP  , SIToFPInst  )  // SInt -> floating point
-HANDLE_CAST_INST(43, FPTrunc , FPTruncInst )  // Truncate floating point
-HANDLE_CAST_INST(44, FPExt   , FPExtInst   )  // Extend floating point
-HANDLE_CAST_INST(45, PtrToInt, PtrToIntInst)  // Pointer -> Integer
-HANDLE_CAST_INST(46, IntToPtr, IntToPtrInst)  // Integer -> Pointer
-HANDLE_CAST_INST(47, BitCast , BitCastInst )  // Type cast
-HANDLE_CAST_INST(48, AddrSpaceCast, AddrSpaceCastInst)  // addrspace cast
-  LAST_CAST_INST(48)
-
- FIRST_FUNCLETPAD_INST(49)
-HANDLE_FUNCLETPAD_INST(49, CleanupPad, CleanupPadInst)
-HANDLE_FUNCLETPAD_INST(50, CatchPad  , CatchPadInst)
-  LAST_FUNCLETPAD_INST(50)
+ FIRST_CAST_INST(37)
+HANDLE_CAST_INST(37, Trunc   , TruncInst   )  // Truncate integers
+HANDLE_CAST_INST(38, ZExt    , ZExtInst    )  // Zero extend integers
+HANDLE_CAST_INST(39, SExt    , SExtInst    )  // Sign extend integers
+HANDLE_CAST_INST(40, FPToUI  , FPToUIInst  )  // floating point -> UInt
+HANDLE_CAST_INST(41, FPToSI  , FPToSIInst  )  // floating point -> SInt
+HANDLE_CAST_INST(42, UIToFP  , UIToFPInst  )  // UInt -> floating point
+HANDLE_CAST_INST(43, SIToFP  , SIToFPInst  )  // SInt -> floating point
+HANDLE_CAST_INST(44, FPTrunc , FPTruncInst )  // Truncate floating point
+HANDLE_CAST_INST(45, FPExt   , FPExtInst   )  // Extend floating point
+HANDLE_CAST_INST(46, PtrToInt, PtrToIntInst)  // Pointer -> Integer
+HANDLE_CAST_INST(47, IntToPtr, IntToPtrInst)  // Integer -> Pointer
+HANDLE_CAST_INST(48, BitCast , BitCastInst )  // Type cast
+HANDLE_CAST_INST(49, AddrSpaceCast, AddrSpaceCastInst)  // addrspace cast
+  LAST_CAST_INST(49)
+
+ FIRST_FUNCLETPAD_INST(50)
+HANDLE_FUNCLETPAD_INST(50, CleanupPad, CleanupPadInst)
+HANDLE_FUNCLETPAD_INST(51, CatchPad  , CatchPadInst)
+  LAST_FUNCLETPAD_INST(51)
 
 // Other operators...
- FIRST_OTHER_INST(51)
-HANDLE_OTHER_INST(51, ICmp   , ICmpInst   )  // Integer comparison instruction
-HANDLE_OTHER_INST(52, FCmp   , FCmpInst   )  // Floating point comparison instr.
-HANDLE_OTHER_INST(53, PHI    , PHINode    )  // PHI node instruction
-HANDLE_OTHER_INST(54, Call   , CallInst   )  // Call a function
-HANDLE_OTHER_INST(55, Select , SelectInst )  // select instruction
-HANDLE_USER_INST (56, UserOp1, Instruction)  // May be used internally in a pass
-HANDLE_USER_INST (57, UserOp2, Instruction)  // Internal to passes only
-HANDLE_OTHER_INST(58, VAArg  , VAArgInst  )  // vaarg instruction
-HANDLE_OTHER_INST(59, ExtractElement, ExtractElementInst)// extract from vector
-HANDLE_OTHER_INST(60, InsertElement, InsertElementInst)  // insert into vector
-HANDLE_OTHER_INST(61, ShuffleVector, ShuffleVectorInst)  // shuffle two vectors.
-HANDLE_OTHER_INST(62, ExtractValue, ExtractValueInst)// extract from aggregate
-HANDLE_OTHER_INST(63, InsertValue, InsertValueInst)  // insert into aggregate
-HANDLE_OTHER_INST(64, LandingPad, LandingPadInst)  // Landing pad instruction.
-  LAST_OTHER_INST(64)
+ FIRST_OTHER_INST(52)
+HANDLE_OTHER_INST(52, ICmp   , ICmpInst   )  // Integer comparison instruction
+HANDLE_OTHER_INST(53, FCmp   , FCmpInst   )  // Floating point comparison instr.
+HANDLE_OTHER_INST(54, PHI    , PHINode    )  // PHI node instruction
+HANDLE_OTHER_INST(55, Call   , CallInst   )  // Call a function
+HANDLE_OTHER_INST(56, Select , SelectInst )  // select instruction
+HANDLE_USER_INST (57, UserOp1, Instruction)  // May be used internally in a pass
+HANDLE_USER_INST (58, UserOp2, Instruction)  // Internal to passes only
+HANDLE_OTHER_INST(59, VAArg  , VAArgInst  )  // vaarg instruction
+HANDLE_OTHER_INST(60, ExtractElement, ExtractElementInst)// extract from vector
+HANDLE_OTHER_INST(61, InsertElement, InsertElementInst)  // insert into vector
+HANDLE_OTHER_INST(62, ShuffleVector, ShuffleVectorInst)  // shuffle two vectors.
+HANDLE_OTHER_INST(63, ExtractValue, ExtractValueInst)// extract from aggregate
+HANDLE_OTHER_INST(64, InsertValue, InsertValueInst)  // insert into aggregate
+HANDLE_OTHER_INST(65, LandingPad, LandingPadInst)  // Landing pad instruction.
+  LAST_OTHER_INST(65)
 
 #undef  FIRST_TERM_INST
 #undef HANDLE_TERM_INST
 #undef   LAST_TERM_INST
 
+#undef  FIRST_UNARY_INST
+#undef HANDLE_UNARY_INST
+#undef   LAST_UNARY_INST
+
 #undef  FIRST_BINARY_INST
 #undef HANDLE_BINARY_INST
 #undef   LAST_BINARY_INST
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index 2a75801c649ec513f7dc98980a14d3b30b6351f6..7a81f700ee4ecccd6ca45652776c421a3295b287 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -127,6 +127,7 @@ public:
 
   const char *getOpcodeName() const { return getOpcodeName(getOpcode()); }
   bool isTerminator() const { return isTerminator(getOpcode()); }
+  bool isUnaryOp() const { return isUnaryOp(getOpcode()); }
   bool isBinaryOp() const { return isBinaryOp(getOpcode()); }
   bool isIntDivRem() const { return isIntDivRem(getOpcode()); }
   bool isShift() { return isShift(getOpcode()); }
@@ -142,6 +143,9 @@ public:
     return OpCode >= TermOpsBegin && OpCode < TermOpsEnd;
   }
 
+  static inline bool isUnaryOp(unsigned Opcode) {
+    return Opcode >= UnaryOpsBegin && Opcode < UnaryOpsEnd;
+  }
   static inline bool isBinaryOp(unsigned Opcode) {
     return Opcode >= BinaryOpsBegin && Opcode < BinaryOpsEnd;
   }
@@ -586,6 +590,14 @@ public:
         static_cast<const Instruction *>(this)->getNextNonDebugInstruction());
   }
 
+  /// Return a pointer to the previous non-debug instruction in the same basic
+  /// block as 'this', or nullptr if no such instruction exists.
+  const Instruction *getPrevNonDebugInstruction() const;
+  Instruction *getPrevNonDebugInstruction() {
+    return const_cast<Instruction *>(
+        static_cast<const Instruction *>(this)->getPrevNonDebugInstruction());
+  }
+
   /// Create a copy of 'this' instruction that is identical in all ways except
   /// the following:
   ///   * The instruction has no parent
@@ -654,6 +666,13 @@ public:
 #include "llvm/IR/Instruction.def"
   };
 
+  enum UnaryOps {
+#define  FIRST_UNARY_INST(N)             UnaryOpsBegin = N,
+#define HANDLE_UNARY_INST(N, OPC, CLASS) OPC = N,
+#define   LAST_UNARY_INST(N)             UnaryOpsEnd = N+1
+#include "llvm/IR/Instruction.def"
+  };
+
   enum BinaryOps {
 #define  FIRST_BINARY_INST(N)             BinaryOpsBegin = N,
 #define HANDLE_BINARY_INST(N, OPC, CLASS) OPC = N,
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 7b2c13c5328239fbf03877f632fc946694376de3..7700e7caa9cd5572b8164582cf0f9ddcac30bc93 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -1103,6 +1103,71 @@ GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr,
 
 DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrInst, Value)
 
+//===----------------------------------------------------------------------===//
+//                                UnaryOperator Class
+//===----------------------------------------------------------------------===//
+
+/// a unary instruction 
+class UnaryOperator : public UnaryInstruction {
+  void AssertOK();
+
+protected:
+  UnaryOperator(UnaryOps iType, Value *S, Type *Ty,
+                const Twine &Name, Instruction *InsertBefore);
+  UnaryOperator(UnaryOps iType, Value *S, Type *Ty,
+                const Twine &Name, BasicBlock *InsertAtEnd);
+
+  // Note: Instruction needs to be a friend here to call cloneImpl.
+  friend class Instruction;
+
+  UnaryOperator *cloneImpl() const;
+
+public:
+
+  /// Construct a unary instruction, given the opcode and an operand.
+  /// Optionally (if InstBefore is specified) insert the instruction
+  /// into a BasicBlock right before the specified instruction.  The specified
+  /// Instruction is allowed to be a dereferenced end iterator.
+  ///
+  static UnaryOperator *Create(UnaryOps Op, Value *S,
+                               const Twine &Name = Twine(),
+                               Instruction *InsertBefore = nullptr);
+
+  /// Construct a unary instruction, given the opcode and an operand.
+  /// Also automatically insert this instruction to the end of the
+  /// BasicBlock specified.
+  ///
+  static UnaryOperator *Create(UnaryOps Op, Value *S,
+                               const Twine &Name,
+                               BasicBlock *InsertAtEnd);
+
+  /// These methods just forward to Create, and are useful when you
+  /// statically know what type of instruction you're going to create.  These
+  /// helpers just save some typing.
+#define HANDLE_UNARY_INST(N, OPC, CLASS) \
+  static UnaryInstruction *Create##OPC(Value *V, \
+                                       const Twine &Name = "") {\
+    return Create(Instruction::OPC, V, Name);\
+  }
+#include "llvm/IR/Instruction.def"
+#define HANDLE_UNARY_INST(N, OPC, CLASS) \
+  static UnaryInstruction *Create##OPC(Value *V, \
+                                       const Twine &Name, BasicBlock *BB) {\
+    return Create(Instruction::OPC, V, Name, BB);\
+  }
+#include "llvm/IR/Instruction.def"
+#define HANDLE_UNARY_INST(N, OPC, CLASS) \
+  static UnaryInstruction *Create##OPC(Value *V, \
+                                       const Twine &Name, Instruction *I) {\
+    return Create(Instruction::OPC, V, Name, I);\
+  }
+#include "llvm/IR/Instruction.def"
+
+  UnaryOps getOpcode() const {
+    return static_cast<UnaryOps>(Instruction::getOpcode());
+  }
+};
+
 //===----------------------------------------------------------------------===//
 //                               ICmpInst Class
 //===----------------------------------------------------------------------===//
@@ -1353,535 +1418,13 @@ public:
   }
 };
 
-class CallInst;
-class InvokeInst;
-
-template <class T> struct CallBaseParent { using type = Instruction; };
-
-//===----------------------------------------------------------------------===//
-/// Base class for all callable instructions (InvokeInst and CallInst)
-/// Holds everything related to calling a function, abstracting from the base
-/// type @p BaseInstTy and the concrete instruction @p InstTy
-///
-template <class InstTy>
-class CallBase : public CallBaseParent<InstTy>::type,
-                 public OperandBundleUser<InstTy, User::op_iterator> {
-protected:
-  AttributeList Attrs; ///< parameter attributes for callable
-  FunctionType *FTy;
-  using BaseInstTy = typename CallBaseParent<InstTy>::type;
-
-  template <class... ArgsTy>
-  CallBase(AttributeList const &A, FunctionType *FT, ArgsTy &&... Args)
-      : BaseInstTy(std::forward<ArgsTy>(Args)...), Attrs(A), FTy(FT) {}
-  bool hasDescriptor() const { return Value::HasDescriptor; }
-
-  using BaseInstTy::BaseInstTy;
-
-  using OperandBundleUser<InstTy,
-                          User::op_iterator>::isFnAttrDisallowedByOpBundle;
-  using OperandBundleUser<InstTy, User::op_iterator>::getNumTotalBundleOperands;
-  using OperandBundleUser<InstTy, User::op_iterator>::bundleOperandHasAttr;
-  using Instruction::getSubclassDataFromInstruction;
-  using Instruction::setInstructionSubclassData;
-
-public:
-  using Instruction::getContext;
-  using OperandBundleUser<InstTy, User::op_iterator>::hasOperandBundles;
-  using OperandBundleUser<InstTy,
-                          User::op_iterator>::getBundleOperandsStartIndex;
-
-  static bool classof(const Instruction *I) {
-    llvm_unreachable(
-        "CallBase is not meant to be used as part of the classof hierarchy");
-  }
-
-public:
-  /// Return the parameter attributes for this call.
-  ///
-  AttributeList getAttributes() const { return Attrs; }
-
-  /// Set the parameter attributes for this call.
-  ///
-  void setAttributes(AttributeList A) { Attrs = A; }
-
-  FunctionType *getFunctionType() const { return FTy; }
-
-  void mutateFunctionType(FunctionType *FTy) {
-    Value::mutateType(FTy->getReturnType());
-    this->FTy = FTy;
-  }
-
-  /// Return the number of call arguments.
-  ///
-  unsigned getNumArgOperands() const {
-    return getNumOperands() - getNumTotalBundleOperands() - InstTy::ArgOffset;
-  }
-
-  /// getArgOperand/setArgOperand - Return/set the i-th call argument.
-  ///
-  Value *getArgOperand(unsigned i) const {
-    assert(i < getNumArgOperands() && "Out of bounds!");
-    return getOperand(i);
-  }
-  void setArgOperand(unsigned i, Value *v) {
-    assert(i < getNumArgOperands() && "Out of bounds!");
-    setOperand(i, v);
-  }
-
-  /// Return the iterator pointing to the beginning of the argument list.
-  User::op_iterator arg_begin() { return op_begin(); }
-
-  /// Return the iterator pointing to the end of the argument list.
-  User::op_iterator arg_end() {
-    // [ call args ], [ operand bundles ], callee
-    return op_end() - getNumTotalBundleOperands() - InstTy::ArgOffset;
-  }
-
-  /// Iteration adapter for range-for loops.
-  iterator_range<User::op_iterator> arg_operands() {
-    return make_range(arg_begin(), arg_end());
-  }
-
-  /// Return the iterator pointing to the beginning of the argument list.
-  User::const_op_iterator arg_begin() const { return op_begin(); }
-
-  /// Return the iterator pointing to the end of the argument list.
-  User::const_op_iterator arg_end() const {
-    // [ call args ], [ operand bundles ], callee
-    return op_end() - getNumTotalBundleOperands() - InstTy::ArgOffset;
-  }
-
-  /// Iteration adapter for range-for loops.
-  iterator_range<User::const_op_iterator> arg_operands() const {
-    return make_range(arg_begin(), arg_end());
-  }
-
-  /// Wrappers for getting the \c Use of a call argument.
-  const Use &getArgOperandUse(unsigned i) const {
-    assert(i < getNumArgOperands() && "Out of bounds!");
-    return User::getOperandUse(i);
-  }
-  Use &getArgOperandUse(unsigned i) {
-    assert(i < getNumArgOperands() && "Out of bounds!");
-    return User::getOperandUse(i);
-  }
-
-  /// If one of the arguments has the 'returned' attribute, return its
-  /// operand value. Otherwise, return nullptr.
-  Value *getReturnedArgOperand() const {
-    unsigned Index;
-
-    if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
-      return getArgOperand(Index - AttributeList::FirstArgIndex);
-    if (const Function *F = getCalledFunction())
-      if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
-          Index)
-        return getArgOperand(Index - AttributeList::FirstArgIndex);
-
-    return nullptr;
-  }
-
-  User::op_iterator op_begin() {
-    return OperandTraits<CallBase>::op_begin(this);
-  }
-
-  User::const_op_iterator op_begin() const {
-    return OperandTraits<CallBase>::op_begin(const_cast<CallBase *>(this));
-  }
-
-  User::op_iterator op_end() { return OperandTraits<CallBase>::op_end(this); }
-
-  User::const_op_iterator op_end() const {
-    return OperandTraits<CallBase>::op_end(const_cast<CallBase *>(this));
-  }
-
-  Value *getOperand(unsigned i_nocapture) const {
-    assert(i_nocapture < OperandTraits<CallBase>::operands(this) &&
-           "getOperand() out of range!");
-    return cast_or_null<Value>(OperandTraits<CallBase>::op_begin(
-                                   const_cast<CallBase *>(this))[i_nocapture]
-                                   .get());
-  }
-
-  void setOperand(unsigned i_nocapture, Value *Val_nocapture) {
-    assert(i_nocapture < OperandTraits<CallBase>::operands(this) &&
-           "setOperand() out of range!");
-    OperandTraits<CallBase>::op_begin(this)[i_nocapture] = Val_nocapture;
-  }
-
-  unsigned getNumOperands() const {
-    return OperandTraits<CallBase>::operands(this);
-  }
-  template <int Idx_nocapture> Use &Op() {
-    return User::OpFrom<Idx_nocapture>(this);
-  }
-  template <int Idx_nocapture> const Use &Op() const {
-    return User::OpFrom<Idx_nocapture>(this);
-  }
-
-  /// Return the function called, or null if this is an
-  /// indirect function invocation.
-  ///
-  Function *getCalledFunction() const {
-    return dyn_cast_or_null<Function>(Op<-InstTy::ArgOffset>());
-  }
-
-  /// Determine whether this call has the given attribute.
-  bool hasFnAttr(Attribute::AttrKind Kind) const {
-    assert(Kind != Attribute::NoBuiltin &&
-           "Use CallBase::isNoBuiltin() to check for Attribute::NoBuiltin");
-    return hasFnAttrImpl(Kind);
-  }
-
-  /// Determine whether this call has the given attribute.
-  bool hasFnAttr(StringRef Kind) const { return hasFnAttrImpl(Kind); }
-
-  /// getCallingConv/setCallingConv - Get or set the calling convention of this
-  /// function call.
-  CallingConv::ID getCallingConv() const {
-    return static_cast<CallingConv::ID>(getSubclassDataFromInstruction() >> 2);
-  }
-  void setCallingConv(CallingConv::ID CC) {
-    auto ID = static_cast<unsigned>(CC);
-    assert(!(ID & ~CallingConv::MaxID) && "Unsupported calling convention");
-    setInstructionSubclassData((getSubclassDataFromInstruction() & 3) |
-                               (ID << 2));
-  }
-
-
-  /// adds the attribute to the list of attributes.
-  void addAttribute(unsigned i, Attribute::AttrKind Kind) {
-    AttributeList PAL = getAttributes();
-    PAL = PAL.addAttribute(getContext(), i, Kind);
-    setAttributes(PAL);
-  }
-
-  /// adds the attribute to the list of attributes.
-  void addAttribute(unsigned i, Attribute Attr) {
-    AttributeList PAL = getAttributes();
-    PAL = PAL.addAttribute(getContext(), i, Attr);
-    setAttributes(PAL);
-  }
-
-  /// Adds the attribute to the indicated argument
-  void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
-    assert(ArgNo < getNumArgOperands() && "Out of bounds");
-    AttributeList PAL = getAttributes();
-    PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
-    setAttributes(PAL);
-  }
-
-  /// Adds the attribute to the indicated argument
-  void addParamAttr(unsigned ArgNo, Attribute Attr) {
-    assert(ArgNo < getNumArgOperands() && "Out of bounds");
-    AttributeList PAL = getAttributes();
-    PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr);
-    setAttributes(PAL);
-  }
-
-  /// removes the attribute from the list of attributes.
-  void removeAttribute(unsigned i, Attribute::AttrKind Kind) {
-    AttributeList PAL = getAttributes();
-    PAL = PAL.removeAttribute(getContext(), i, Kind);
-    setAttributes(PAL);
-  }
-
-  /// removes the attribute from the list of attributes.
-  void removeAttribute(unsigned i, StringRef Kind) {
-    AttributeList PAL = getAttributes();
-    PAL = PAL.removeAttribute(getContext(), i, Kind);
-    setAttributes(PAL);
-  }
-
-  /// Removes the attribute from the given argument
-  void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
-    assert(ArgNo < getNumArgOperands() && "Out of bounds");
-    AttributeList PAL = getAttributes();
-    PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
-    setAttributes(PAL);
-  }
-
-  /// Removes the attribute from the given argument
-  void removeParamAttr(unsigned ArgNo, StringRef Kind) {
-    assert(ArgNo < getNumArgOperands() && "Out of bounds");
-    AttributeList PAL = getAttributes();
-    PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
-    setAttributes(PAL);
-  }
-
-  /// adds the dereferenceable attribute to the list of attributes.
-  void addDereferenceableAttr(unsigned i, uint64_t Bytes) {
-    AttributeList PAL = getAttributes();
-    PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
-    setAttributes(PAL);
-  }
-
-  /// adds the dereferenceable_or_null attribute to the list of
-  /// attributes.
-  void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
-    AttributeList PAL = getAttributes();
-    PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
-    setAttributes(PAL);
-  }
-
-  /// Determine whether the return value has the given attribute.
-  bool hasRetAttr(Attribute::AttrKind Kind) const {
-    if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
-      return true;
-
-    // Look at the callee, if available.
-    if (const Function *F = getCalledFunction())
-      return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
-    return false;
-  }
-
-  /// Determine whether the argument or parameter has the given attribute.
-  bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
-    assert(ArgNo < getNumArgOperands() && "Param index out of bounds!");
-
-    if (Attrs.hasParamAttribute(ArgNo, Kind))
-      return true;
-    if (const Function *F = getCalledFunction())
-      return F->getAttributes().hasParamAttribute(ArgNo, Kind);
-    return false;
-  }
-
-  /// Get the attribute of a given kind at a position.
-  Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const {
-    return getAttributes().getAttribute(i, Kind);
-  }
-
-  /// Get the attribute of a given kind at a position.
-  Attribute getAttribute(unsigned i, StringRef Kind) const {
-    return getAttributes().getAttribute(i, Kind);
-  }
-
-  /// Get the attribute of a given kind from a given arg
-  Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
-    assert(ArgNo < getNumArgOperands() && "Out of bounds");
-    return getAttributes().getParamAttr(ArgNo, Kind);
-  }
-
-  /// Get the attribute of a given kind from a given arg
-  Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const {
-    assert(ArgNo < getNumArgOperands() && "Out of bounds");
-    return getAttributes().getParamAttr(ArgNo, Kind);
-  }
-  /// Return true if the data operand at index \p i has the attribute \p
-  /// A.
-  ///
-  /// Data operands include call arguments and values used in operand bundles,
-  /// but does not include the callee operand.  This routine dispatches to the
-  /// underlying AttributeList or the OperandBundleUser as appropriate.
-  ///
-  /// The index \p i is interpreted as
-  ///
-  ///  \p i == Attribute::ReturnIndex  -> the return value
-  ///  \p i in [1, arg_size + 1)  -> argument number (\p i - 1)
-  ///  \p i in [arg_size + 1, data_operand_size + 1) -> bundle operand at index
-  ///     (\p i - 1) in the operand list.
-  bool dataOperandHasImpliedAttr(unsigned i, Attribute::AttrKind Kind) const {
-    // There are getNumOperands() - (InstTy::ArgOffset - 1) data operands.
-    // The last operand is the callee.
-    assert(i < (getNumOperands() - InstTy::ArgOffset + 1) &&
-           "Data operand index out of bounds!");
-
-    // The attribute A can either be directly specified, if the operand in
-    // question is a call argument; or be indirectly implied by the kind of its
-    // containing operand bundle, if the operand is a bundle operand.
-
-    if (i == AttributeList::ReturnIndex)
-      return hasRetAttr(Kind);
-
-    // FIXME: Avoid these i - 1 calculations and update the API to use
-    // zero-based indices.
-    if (i < (getNumArgOperands() + 1))
-      return paramHasAttr(i - 1, Kind);
-
-    assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
-           "Must be either a call argument or an operand bundle!");
-    return bundleOperandHasAttr(i - 1, Kind);
-  }
-
-  /// Extract the alignment of the return value.
-  unsigned getRetAlignment() const { return Attrs.getRetAlignment(); }
-
-  /// Extract the alignment for a call or parameter (0=unknown).
-  unsigned getParamAlignment(unsigned ArgNo) const {
-    return Attrs.getParamAlignment(ArgNo);
-  }
-
-  /// Extract the number of dereferenceable bytes for a call or
-  /// parameter (0=unknown).
-  uint64_t getDereferenceableBytes(unsigned i) const {
-    return Attrs.getDereferenceableBytes(i);
-  }
-
-  /// Extract the number of dereferenceable_or_null bytes for a call or
-  /// parameter (0=unknown).
-  uint64_t getDereferenceableOrNullBytes(unsigned i) const {
-    return Attrs.getDereferenceableOrNullBytes(i);
-  }
-
-  /// Determine if the return value is marked with NoAlias attribute.
-  bool returnDoesNotAlias() const {
-    return Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
-  }
-
-  /// Return true if the call should not be treated as a call to a
-  /// builtin.
-  bool isNoBuiltin() const {
-    return hasFnAttrImpl(Attribute::NoBuiltin) &&
-      !hasFnAttrImpl(Attribute::Builtin);
-  }
-
-  /// Determine if the call requires strict floating point semantics.
-  bool isStrictFP() const { return hasFnAttr(Attribute::StrictFP); }
-
-  /// Return true if the call should not be inlined.
-  bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
-  void setIsNoInline() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
-  }
-  /// Determine if the call does not access memory.
-  bool doesNotAccessMemory() const {
-    return hasFnAttr(Attribute::ReadNone);
-  }
-  void setDoesNotAccessMemory() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
-  }
-
-  /// Determine if the call does not access or only reads memory.
-  bool onlyReadsMemory() const {
-    return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
-  }
-  void setOnlyReadsMemory() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
-  }
-
-  /// Determine if the call does not access or only writes memory.
-  bool doesNotReadMemory() const {
-    return doesNotAccessMemory() || hasFnAttr(Attribute::WriteOnly);
-  }
-  void setDoesNotReadMemory() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly);
-  }
-
-  /// Determine if the call can access memmory only using pointers based
-  /// on its arguments.
-  bool onlyAccessesArgMemory() const {
-    return hasFnAttr(Attribute::ArgMemOnly);
-  }
-  void setOnlyAccessesArgMemory() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly);
-  }
-
-  /// Determine if the function may only access memory that is
-  /// inaccessible from the IR.
-  bool onlyAccessesInaccessibleMemory() const {
-    return hasFnAttr(Attribute::InaccessibleMemOnly);
-  }
-  void setOnlyAccessesInaccessibleMemory() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOnly);
-  }
-
-  /// Determine if the function may only access memory that is
-  /// either inaccessible from the IR or pointed to by its arguments.
-  bool onlyAccessesInaccessibleMemOrArgMem() const {
-    return hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
-  }
-  void setOnlyAccessesInaccessibleMemOrArgMem() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOrArgMemOnly);
-  }
-  /// Determine if the call cannot return.
-  bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
-  void setDoesNotReturn() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
-  }
-
-  /// Determine if the call should not perform indirect branch tracking.
-  bool doesNoCfCheck() const { return hasFnAttr(Attribute::NoCfCheck); }
-
-  /// Determine if the call cannot unwind.
-  bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
-  void setDoesNotThrow() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
-  }
-
-  /// Determine if the invoke cannot be duplicated.
-  bool cannotDuplicate() const {return hasFnAttr(Attribute::NoDuplicate); }
-  void setCannotDuplicate() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate);
-  }
-
-  /// Determine if the invoke is convergent
-  bool isConvergent() const { return hasFnAttr(Attribute::Convergent); }
-  void setConvergent() {
-    addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
-  }
-  void setNotConvergent() {
-    removeAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
-  }
-
-  /// Determine if the call returns a structure through first
-  /// pointer argument.
-  bool hasStructRetAttr() const {
-    if (getNumArgOperands() == 0)
-      return false;
-
-    // Be friendly and also check the callee.
-    return paramHasAttr(0, Attribute::StructRet);
-  }
-
-  /// Determine if any call argument is an aggregate passed by value.
-  bool hasByValArgument() const {
-    return Attrs.hasAttrSomewhere(Attribute::ByVal);
-  }
-  /// Get a pointer to the function that is invoked by this
-  /// instruction.
-  const Value *getCalledValue() const { return Op<-InstTy::ArgOffset>(); }
-  Value *getCalledValue() { return Op<-InstTy::ArgOffset>(); }
-
-  /// Set the function called.
-  void setCalledFunction(Value* Fn) {
-    setCalledFunction(
-        cast<FunctionType>(cast<PointerType>(Fn->getType())->getElementType()),
-        Fn);
-  }
-  void setCalledFunction(FunctionType *FTy, Value *Fn) {
-    this->FTy = FTy;
-    assert(FTy == cast<FunctionType>(
-                      cast<PointerType>(Fn->getType())->getElementType()));
-    Op<-InstTy::ArgOffset>() = Fn;
-  }
-
-protected:
-  template <typename AttrKind> bool hasFnAttrImpl(AttrKind Kind) const {
-    if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind))
-      return true;
-
-    // Operand bundles override attributes on the called function, but don't
-    // override attributes directly present on the call instruction.
-    if (isFnAttrDisallowedByOpBundle(Kind))
-      return false;
-
-    if (const Function *F = getCalledFunction())
-      return F->getAttributes().hasAttribute(AttributeList::FunctionIndex,
-                                             Kind);
-    return false;
-  }
-};
-
 //===----------------------------------------------------------------------===//
 /// This class represents a function call, abstracting a target
 /// machine's calling convention.  This class uses low bit of the SubClassData
 /// field to indicate whether or not this is a tail call.  The rest of the bits
 /// hold the calling convention of the call.
 ///
-class CallInst : public CallBase<CallInst> {
-  friend class OperandBundleUser<CallInst, User::op_iterator>;
-
+class CallInst : public CallBase {
   CallInst(const CallInst &CI);
 
   /// Construct a CallInst given a range of arguments.
@@ -1921,6 +1464,13 @@ class CallInst : public CallBase<CallInst> {
             ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
   void init(Value *Func, const Twine &NameStr);
 
+  /// Compute the number of operands to allocate.
+  static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) {
+    // We need one operand for the called function, plus the input operand
+    // counts provided.
+    return 1 + NumArgs + NumBundleInputs;
+  }
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
@@ -1928,8 +1478,6 @@ protected:
   CallInst *cloneImpl() const;
 
 public:
-  static constexpr int ArgOffset = 1;
-
   static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
                           ArrayRef<OperandBundleDef> Bundles = None,
                           const Twine &NameStr = "",
@@ -1950,7 +1498,7 @@ public:
   static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
                           const Twine &NameStr,
                           Instruction *InsertBefore = nullptr) {
-    return new (unsigned(Args.size() + 1))
+    return new (ComputeNumOperands(Args.size()))
         CallInst(Ty, Func, Args, None, NameStr, InsertBefore);
   }
 
@@ -1958,39 +1506,39 @@ public:
                           ArrayRef<OperandBundleDef> Bundles = None,
                           const Twine &NameStr = "",
                           Instruction *InsertBefore = nullptr) {
-    const unsigned TotalOps =
-        unsigned(Args.size()) + CountBundleInputs(Bundles) + 1;
+    const int NumOperands =
+        ComputeNumOperands(Args.size(), CountBundleInputs(Bundles));
     const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
 
-    return new (TotalOps, DescriptorBytes)
+    return new (NumOperands, DescriptorBytes)
         CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore);
   }
 
   static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
                           ArrayRef<OperandBundleDef> Bundles,
                           const Twine &NameStr, BasicBlock *InsertAtEnd) {
-    const unsigned TotalOps =
-        unsigned(Args.size()) + CountBundleInputs(Bundles) + 1;
+    const int NumOperands =
+        ComputeNumOperands(Args.size(), CountBundleInputs(Bundles));
     const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
 
-    return new (TotalOps, DescriptorBytes)
+    return new (NumOperands, DescriptorBytes)
         CallInst(Func, Args, Bundles, NameStr, InsertAtEnd);
   }
 
   static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
                           const Twine &NameStr, BasicBlock *InsertAtEnd) {
-    return new (unsigned(Args.size() + 1))
+    return new (ComputeNumOperands(Args.size()))
         CallInst(Func, Args, None, NameStr, InsertAtEnd);
   }
 
   static CallInst *Create(Value *F, const Twine &NameStr = "",
                           Instruction *InsertBefore = nullptr) {
-    return new (1) CallInst(F, NameStr, InsertBefore);
+    return new (ComputeNumOperands(0)) CallInst(F, NameStr, InsertBefore);
   }
 
   static CallInst *Create(Value *F, const Twine &NameStr,
                           BasicBlock *InsertAtEnd) {
-    return new (1) CallInst(F, NameStr, InsertAtEnd);
+    return new (ComputeNumOperands(0)) CallInst(F, NameStr, InsertAtEnd);
   }
 
   /// Create a clone of \p CI with a different set of operand bundles and
@@ -2081,7 +1629,7 @@ public:
   }
 
   /// Check if this call is an inline asm statement.
-  bool isInlineAsm() const { return isa<InlineAsm>(Op<-1>()); }
+  bool isInlineAsm() const { return isa<InlineAsm>(getCalledOperand()); }
 
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const Instruction *I) {
@@ -2099,32 +1647,28 @@ private:
   }
 };
 
-template <>
-struct OperandTraits<CallBase<CallInst>>
-    : public VariadicOperandTraits<CallBase<CallInst>, 1> {};
-
 CallInst::CallInst(Value *Func, ArrayRef<Value *> Args,
                    ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
                    BasicBlock *InsertAtEnd)
-    : CallBase<CallInst>(
-          cast<FunctionType>(
-              cast<PointerType>(Func->getType())->getElementType())
-              ->getReturnType(),
-          Instruction::Call,
-          OperandTraits<CallBase<CallInst>>::op_end(this) -
-              (Args.size() + CountBundleInputs(Bundles) + 1),
-          unsigned(Args.size() + CountBundleInputs(Bundles) + 1), InsertAtEnd) {
+    : CallBase(cast<FunctionType>(
+                   cast<PointerType>(Func->getType())->getElementType())
+                   ->getReturnType(),
+               Instruction::Call,
+               OperandTraits<CallBase>::op_end(this) -
+                   (Args.size() + CountBundleInputs(Bundles) + 1),
+               unsigned(Args.size() + CountBundleInputs(Bundles) + 1),
+               InsertAtEnd) {
   init(Func, Args, Bundles, NameStr);
 }
 
 CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
                    ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
                    Instruction *InsertBefore)
-    : CallBase<CallInst>(Ty->getReturnType(), Instruction::Call,
-                         OperandTraits<CallBase<CallInst>>::op_end(this) -
-                             (Args.size() + CountBundleInputs(Bundles) + 1),
-                         unsigned(Args.size() + CountBundleInputs(Bundles) + 1),
-                         InsertBefore) {
+    : CallBase(Ty->getReturnType(), Instruction::Call,
+               OperandTraits<CallBase>::op_end(this) -
+                   (Args.size() + CountBundleInputs(Bundles) + 1),
+               unsigned(Args.size() + CountBundleInputs(Bundles) + 1),
+               InsertBefore) {
   init(Ty, Func, Args, Bundles, NameStr);
 }
 
@@ -2648,6 +2192,25 @@ public:
     return !changesLength() && isTransposeMask(getMask());
   }
 
+  /// Return true if this shuffle mask is an extract subvector mask.
+  /// A valid extract subvector mask returns a smaller vector from a single
+  /// source operand. The base extraction index is returned as well.
+  static bool isExtractSubvectorMask(ArrayRef<int> Mask, int NumSrcElts,
+                                     int &Index);
+  static bool isExtractSubvectorMask(const Constant *Mask, int NumSrcElts,
+                                     int &Index) {
+    assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+    SmallVector<int, 16> MaskAsInts;
+    getShuffleMask(Mask, MaskAsInts);
+    return isExtractSubvectorMask(MaskAsInts, NumSrcElts, Index);
+  }
+
+  /// Return true if this shuffle mask is an extract subvector mask.
+  bool isExtractSubvectorMask(int &Index) const {
+    int NumSrcElts = Op<0>()->getType()->getVectorNumElements();
+    return isExtractSubvectorMask(getMask(), NumSrcElts, Index);
+  }
+
   /// Change values in a shuffle permute mask assuming the two vector operands
   /// of length InVecNumElts have swapped position.
   static void commuteShuffleMask(MutableArrayRef<int> Mask,
@@ -3982,8 +3545,16 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(IndirectBrInst, Value)
 /// Invoke instruction.  The SubclassData field is used to hold the
 /// calling convention of the call.
 ///
-class InvokeInst : public CallBase<InvokeInst> {
-  friend class OperandBundleUser<InvokeInst, User::op_iterator>;
+class InvokeInst : public CallBase {
+  /// The number of operands for this call beyond the called function,
+  /// arguments, and operand bundles.
+  static constexpr int NumExtraOperands = 2;
+
+  /// The index from the end of the operand array to the normal destination.
+  static constexpr int NormalDestOpEndIdx = -3;
+
+  /// The index from the end of the operand array to the unwind destination.
+  static constexpr int UnwindDestOpEndIdx = -2;
 
   InvokeInst(const InvokeInst &BI);
 
@@ -3992,26 +3563,25 @@ class InvokeInst : public CallBase<InvokeInst> {
   /// Construct an InvokeInst from a range of arguments
   inline InvokeInst(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException,
                     ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles,
-                    unsigned Values, const Twine &NameStr,
+                    int NumOperands, const Twine &NameStr,
                     Instruction *InsertBefore)
       : InvokeInst(cast<FunctionType>(
                        cast<PointerType>(Func->getType())->getElementType()),
-                   Func, IfNormal, IfException, Args, Bundles, Values, NameStr,
-                   InsertBefore) {}
+                   Func, IfNormal, IfException, Args, Bundles, NumOperands,
+                   NameStr, InsertBefore) {}
 
   inline InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
                     BasicBlock *IfException, ArrayRef<Value *> Args,
-                    ArrayRef<OperandBundleDef> Bundles, unsigned Values,
+                    ArrayRef<OperandBundleDef> Bundles, int NumOperands,
                     const Twine &NameStr, Instruction *InsertBefore);
   /// Construct an InvokeInst given a range of arguments.
   ///
   /// Construct an InvokeInst from a range of arguments
   inline InvokeInst(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException,
                     ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles,
-                    unsigned Values, const Twine &NameStr,
+                    int NumOperands, const Twine &NameStr,
                     BasicBlock *InsertAtEnd);
 
-
   void init(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException,
             ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles,
             const Twine &NameStr) {
@@ -4024,6 +3594,13 @@ class InvokeInst : public CallBase<InvokeInst> {
             BasicBlock *IfException, ArrayRef<Value *> Args,
             ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
 
+  /// Compute the number of operands to allocate.
+  static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) {
+    // We need one operand for the called function, plus our extra operands and
+    // the input operand counts provided.
+    return 1 + NumExtraOperands + NumArgs + NumBundleInputs;
+  }
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
@@ -4031,7 +3608,6 @@ protected:
   InvokeInst *cloneImpl() const;
 
 public:
-  static constexpr int ArgOffset = 3;
   static InvokeInst *Create(Value *Func, BasicBlock *IfNormal,
                             BasicBlock *IfException, ArrayRef<Value *> Args,
                             const Twine &NameStr,
@@ -4057,9 +3633,10 @@ public:
                             BasicBlock *IfException, ArrayRef<Value *> Args,
                             const Twine &NameStr,
                             Instruction *InsertBefore = nullptr) {
-    unsigned Values = unsigned(Args.size()) + 3;
-    return new (Values) InvokeInst(Ty, Func, IfNormal, IfException, Args, None,
-                                   Values, NameStr, InsertBefore);
+    int NumOperands = ComputeNumOperands(Args.size());
+    return new (NumOperands)
+        InvokeInst(Ty, Func, IfNormal, IfException, Args, None, NumOperands,
+                   NameStr, InsertBefore);
   }
 
   static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
@@ -4067,11 +3644,12 @@ public:
                             ArrayRef<OperandBundleDef> Bundles = None,
                             const Twine &NameStr = "",
                             Instruction *InsertBefore = nullptr) {
-    unsigned Values = unsigned(Args.size()) + CountBundleInputs(Bundles) + 3;
+    int NumOperands =
+        ComputeNumOperands(Args.size(), CountBundleInputs(Bundles));
     unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
 
-    return new (Values, DescriptorBytes)
-        InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, Values,
+    return new (NumOperands, DescriptorBytes)
+        InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, NumOperands,
                    NameStr, InsertBefore);
   }
 
@@ -4079,21 +3657,22 @@ public:
                             BasicBlock *IfNormal, BasicBlock *IfException,
                             ArrayRef<Value *> Args, const Twine &NameStr,
                             BasicBlock *InsertAtEnd) {
-    unsigned Values = unsigned(Args.size()) + 3;
-    return new (Values) InvokeInst(Func, IfNormal, IfException, Args, None,
-                                   Values, NameStr, InsertAtEnd);
+    int NumOperands = ComputeNumOperands(Args.size());
+    return new (NumOperands) InvokeInst(Func, IfNormal, IfException, Args, None,
+                                        NumOperands, NameStr, InsertAtEnd);
   }
 
   static InvokeInst *Create(Value *Func, BasicBlock *IfNormal,
                             BasicBlock *IfException, ArrayRef<Value *> Args,
                             ArrayRef<OperandBundleDef> Bundles,
                             const Twine &NameStr, BasicBlock *InsertAtEnd) {
-    unsigned Values = unsigned(Args.size()) + CountBundleInputs(Bundles) + 3;
+    int NumOperands =
+        ComputeNumOperands(Args.size(), CountBundleInputs(Bundles));
     unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
 
-    return new (Values, DescriptorBytes)
-        InvokeInst(Func, IfNormal, IfException, Args, Bundles, Values, NameStr,
-                   InsertAtEnd);
+    return new (NumOperands, DescriptorBytes)
+        InvokeInst(Func, IfNormal, IfException, Args, Bundles, NumOperands,
+                   NameStr, InsertAtEnd);
   }
 
   /// Create a clone of \p II with a different set of operand bundles and
@@ -4114,43 +3693,18 @@ public:
     addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
   }
 
-  /// Return the function called, or null if this is an
-  /// indirect function invocation.
-  ///
-  Function *getCalledFunction() const {
-    return dyn_cast<Function>(Op<-3>());
-  }
-
-  /// Get a pointer to the function that is invoked by this
-  /// instruction
-  const Value *getCalledValue() const { return Op<-3>(); }
-        Value *getCalledValue()       { return Op<-3>(); }
-
-  /// Set the function called.
-  void setCalledFunction(Value* Fn) {
-    setCalledFunction(
-        cast<FunctionType>(cast<PointerType>(Fn->getType())->getElementType()),
-        Fn);
-  }
-  void setCalledFunction(FunctionType *FTy, Value *Fn) {
-    this->FTy = FTy;
-    assert(FTy == cast<FunctionType>(
-                      cast<PointerType>(Fn->getType())->getElementType()));
-    Op<-3>() = Fn;
-  }
-
   // get*Dest - Return the destination basic blocks...
   BasicBlock *getNormalDest() const {
-    return cast<BasicBlock>(Op<-2>());
+    return cast<BasicBlock>(Op<NormalDestOpEndIdx>());
   }
   BasicBlock *getUnwindDest() const {
-    return cast<BasicBlock>(Op<-1>());
+    return cast<BasicBlock>(Op<UnwindDestOpEndIdx>());
   }
   void setNormalDest(BasicBlock *B) {
-    Op<-2>() = reinterpret_cast<Value*>(B);
+    Op<NormalDestOpEndIdx>() = reinterpret_cast<Value *>(B);
   }
   void setUnwindDest(BasicBlock *B) {
-    Op<-1>() = reinterpret_cast<Value*>(B);
+    Op<UnwindDestOpEndIdx>() = reinterpret_cast<Value *>(B);
   }
 
   /// Get the landingpad instruction from the landing pad
@@ -4162,9 +3716,12 @@ public:
     return i == 0 ? getNormalDest() : getUnwindDest();
   }
 
-  void setSuccessor(unsigned idx, BasicBlock *NewSucc) {
-    assert(idx < 2 && "Successor # out of range for invoke!");
-    *(&Op<-2>() + idx) = reinterpret_cast<Value*>(NewSucc);
+  void setSuccessor(unsigned i, BasicBlock *NewSucc) {
+    assert(i < 2 && "Successor # out of range for invoke!");
+    if (i == 0)
+      setNormalDest(NewSucc);
+    else
+      setUnwindDest(NewSucc);
   }
 
   unsigned getNumSuccessors() const { return 2; }
@@ -4186,36 +3743,29 @@ private:
   }
 };
 
-template <>
-struct OperandTraits<CallBase<InvokeInst>>
-    : public VariadicOperandTraits<CallBase<InvokeInst>, 3> {};
-
 InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
                        BasicBlock *IfException, ArrayRef<Value *> Args,
-                       ArrayRef<OperandBundleDef> Bundles, unsigned Values,
+                       ArrayRef<OperandBundleDef> Bundles, int NumOperands,
                        const Twine &NameStr, Instruction *InsertBefore)
-    : CallBase<InvokeInst>(Ty->getReturnType(), Instruction::Invoke,
-                           OperandTraits<CallBase<InvokeInst>>::op_end(this) -
-                               Values,
-                           Values, InsertBefore) {
+    : CallBase(Ty->getReturnType(), Instruction::Invoke,
+               OperandTraits<CallBase>::op_end(this) - NumOperands, NumOperands,
+               InsertBefore) {
   init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr);
 }
 
 InvokeInst::InvokeInst(Value *Func, BasicBlock *IfNormal,
                        BasicBlock *IfException, ArrayRef<Value *> Args,
-                       ArrayRef<OperandBundleDef> Bundles, unsigned Values,
+                       ArrayRef<OperandBundleDef> Bundles, int NumOperands,
                        const Twine &NameStr, BasicBlock *InsertAtEnd)
-    : CallBase<InvokeInst>(
-          cast<FunctionType>(
-              cast<PointerType>(Func->getType())->getElementType())
-              ->getReturnType(),
-          Instruction::Invoke,
-          OperandTraits<CallBase<InvokeInst>>::op_end(this) - Values, Values,
-          InsertAtEnd) {
+    : CallBase(cast<FunctionType>(
+                   cast<PointerType>(Func->getType())->getElementType())
+                   ->getReturnType(),
+               Instruction::Invoke,
+               OperandTraits<CallBase>::op_end(this) - NumOperands, NumOperands,
+               InsertAtEnd) {
   init(Func, IfNormal, IfException, Args, Bundles, NameStr);
 }
 
-
 //===----------------------------------------------------------------------===//
 //                              ResumeInst Class
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 04de1ca63a26e877df3ad76b1966e71f9a9e9519..f503d3ebbdfebd496b1d5ca63253075e3b383849 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -90,6 +90,10 @@ class ReadNone<int argNo> : IntrinsicProperty {
 
 def IntrNoReturn : IntrinsicProperty;
 
+// IntrCold - Calls to this intrinsic are cold.
+// Parallels the cold attribute on LLVM IR functions.
+def IntrCold : IntrinsicProperty;
+
 // IntrNoduplicate - Calls to this intrinsic cannot be duplicated.
 // Parallels the noduplicate attribute on LLVM IR functions.
 def IntrNoDuplicate : IntrinsicProperty;
@@ -315,6 +319,78 @@ def int_gcwrite : Intrinsic<[],
                             [llvm_ptr_ty, llvm_ptr_ty, llvm_ptrptr_ty],
                             [IntrArgMemOnly, NoCapture<1>, NoCapture<2>]>;
 
+//===------------------- ObjC ARC runtime Intrinsics --------------------===//
+//
+// Note these are to support the Objective-C ARC optimizer which wants to
+// eliminate retain and releases where possible.
+
+def int_objc_autorelease                    : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_autoreleasePoolPop             : Intrinsic<[], [llvm_ptr_ty]>;
+def int_objc_autoreleasePoolPush            : Intrinsic<[llvm_ptr_ty], []>;
+def int_objc_autoreleaseReturnValue         : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_copyWeak                       : Intrinsic<[],
+                                                        [llvm_ptrptr_ty,
+                                                         llvm_ptrptr_ty]>;
+def int_objc_destroyWeak                    : Intrinsic<[], [llvm_ptrptr_ty]>;
+def int_objc_initWeak                       : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptrptr_ty,
+                                                         llvm_ptr_ty]>;
+def int_objc_loadWeak                       : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptrptr_ty]>;
+def int_objc_loadWeakRetained               : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptrptr_ty]>;
+def int_objc_moveWeak                       : Intrinsic<[],
+                                                        [llvm_ptrptr_ty,
+                                                         llvm_ptrptr_ty]>;
+def int_objc_release                        : Intrinsic<[], [llvm_ptr_ty]>;
+def int_objc_retain                         : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_retainAutorelease              : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_retainAutoreleaseReturnValue   : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_retainAutoreleasedReturnValue  : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_retainBlock                    : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_storeStrong                    : Intrinsic<[],
+                                                        [llvm_ptrptr_ty,
+                                                         llvm_ptr_ty]>;
+def int_objc_storeWeak                      : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptrptr_ty,
+                                                         llvm_ptr_ty]>;
+def int_objc_clang_arc_use                  : Intrinsic<[],
+                                                        [llvm_vararg_ty]>;
+def int_objc_unsafeClaimAutoreleasedReturnValue : Intrinsic<[llvm_ptr_ty],
+                                                            [llvm_ptr_ty]>;
+def int_objc_retainedObject                 : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_unretainedObject               : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_unretainedPointer              : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_retain_autorelease             : Intrinsic<[llvm_ptr_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_sync_enter                     : Intrinsic<[llvm_i32_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_sync_exit                      : Intrinsic<[llvm_i32_ty],
+                                                        [llvm_ptr_ty]>;
+def int_objc_arc_annotation_topdown_bbstart : Intrinsic<[],
+                                                        [llvm_ptrptr_ty,
+                                                         llvm_ptrptr_ty]>;
+def int_objc_arc_annotation_topdown_bbend   : Intrinsic<[],
+                                                        [llvm_ptrptr_ty,
+                                                         llvm_ptrptr_ty]>;
+def int_objc_arc_annotation_bottomup_bbstart  : Intrinsic<[],
+                                                          [llvm_ptrptr_ty,
+                                                           llvm_ptrptr_ty]>;
+def int_objc_arc_annotation_bottomup_bbend  : Intrinsic<[],
+                                                        [llvm_ptrptr_ty,
+                                                         llvm_ptrptr_ty]>;
+
+
 //===--------------------- Code Generator Intrinsics ----------------------===//
 //
 def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
@@ -735,7 +811,7 @@ def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
 
-//===------------------------- Fixed Point Intrinsics ---------------------===//
+//===------------------------- Saturation Arithmetic Intrinsics ---------------------===//
 //
 def int_sadd_sat : Intrinsic<[llvm_anyint_ty],
                              [LLVMMatchType<0>, LLVMMatchType<0>],
@@ -750,6 +826,12 @@ def int_usub_sat : Intrinsic<[llvm_anyint_ty],
                              [LLVMMatchType<0>, LLVMMatchType<0>],
                              [IntrNoMem, IntrSpeculatable]>;
 
+//===------------------------- Fixed Point Arithmetic Intrinsics ---------------------===//
+//
+def int_smul_fix : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+                             [IntrNoMem, IntrSpeculatable, Commutative]>;
+
 //===------------------------- Memory Use Markers -------------------------===//
 //
 def int_lifetime_start  : Intrinsic<[],
@@ -867,7 +949,7 @@ def int_coro_subfn_addr : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty],
 //
 def int_flt_rounds : Intrinsic<[llvm_i32_ty]>,
                      GCCBuiltin<"__builtin_flt_rounds">;
-def int_trap : Intrinsic<[], [], [IntrNoReturn]>,
+def int_trap : Intrinsic<[], [], [IntrNoReturn, IntrCold]>,
                GCCBuiltin<"__builtin_trap">;
 def int_debugtrap : Intrinsic<[]>,
                     GCCBuiltin<"__builtin_debugtrap">;
@@ -880,6 +962,10 @@ def int_experimental_deoptimize : Intrinsic<[llvm_any_ty], [llvm_vararg_ty],
 def int_experimental_guard : Intrinsic<[], [llvm_i1_ty, llvm_vararg_ty],
                                        [Throws]>;
 
+// Supports widenable conditions for guards represented as explicit branches.
+def int_experimental_widenable_condition : Intrinsic<[llvm_i1_ty], [],
+                                           [IntrInaccessibleMemOnly]>;
+
 // NOP: calls/invokes to this intrinsic are removed by codegen
 def int_donothing : Intrinsic<[], [], [IntrNoMem]>;
 
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 67e7da7797a4b908e8c0143aa85e8d4d23474afa..3ea364c78457e70e507e8b0cc8860a12f779585c 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -803,7 +803,7 @@ def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
 
 def int_amdgcn_s_buffer_load : Intrinsic <
-  [llvm_anyint_ty],
+  [llvm_any_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // byte offset(SGPR/VGPR/imm)
    llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc)
@@ -835,7 +835,7 @@ class AMDGPURawBufferLoad : Intrinsic <
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc) 
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrReadMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
@@ -847,7 +847,7 @@ class AMDGPUStructBufferLoad : Intrinsic <
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc) 
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrReadMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
@@ -859,7 +859,7 @@ class AMDGPURawBufferStore : Intrinsic <
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc) 
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrWriteMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore;
@@ -872,7 +872,7 @@ class AMDGPUStructBufferStore : Intrinsic <
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc) 
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrWriteMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
@@ -884,7 +884,7 @@ class AMDGPURawBufferAtomic : Intrinsic <
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc) 
+   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
   [], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
@@ -904,7 +904,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc) 
+   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
   [], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
@@ -915,7 +915,7 @@ class AMDGPUStructBufferAtomic : Intrinsic <
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc) 
+   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
   [], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
@@ -936,7 +936,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc) 
+   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
   [], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
@@ -980,7 +980,7 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic <
      llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
      llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
      llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-     llvm_i32_ty],    // cachepolicy(imm; bit 0 = glc, bit 1 = slc) 
+     llvm_i32_ty],    // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
     [IntrReadMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 
@@ -991,7 +991,7 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic <
      llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
      llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
      llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-     llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc) 
+     llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
     [IntrWriteMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 
@@ -1002,7 +1002,7 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic <
      llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
      llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
      llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-     llvm_i32_ty],    // cachepolicy(imm; bit 0 = glc, bit 1 = slc) 
+     llvm_i32_ty],    // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
     [IntrReadMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 
@@ -1014,7 +1014,7 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic <
      llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
      llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
      llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-     llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc) 
+     llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
     [IntrWriteMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 
diff --git a/include/llvm/IR/IntrinsicsHexagon.td b/include/llvm/IR/IntrinsicsHexagon.td
index 25f4215d68a848c073d38a9e85e4cba18287551d..ecc69a679553ffe99cb35047021c3a41e587547a 100644
--- a/include/llvm/IR/IntrinsicsHexagon.td
+++ b/include/llvm/IR/IntrinsicsHexagon.td
@@ -15,7 +15,7 @@
 //
 // All Hexagon intrinsics start with "llvm.hexagon.".
 let TargetPrefix = "hexagon" in {
-  /// Hexagon_Intrinsic - Base class for all Hexagon intrinsics.
+  /// Hexagon_Intrinsic - Base class for the majority of Hexagon intrinsics.
   class Hexagon_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
                               list<LLVMType> param_types,
                               list<IntrinsicProperty> properties>
@@ -30,397 +30,6 @@ let TargetPrefix = "hexagon" in {
     : Intrinsic<ret_types, param_types, properties>;
 }
 
-//===----------------------------------------------------------------------===//
-//
-// DEF_FUNCTION_TYPE_1(QI_ftype_MEM,BT_BOOL,BT_PTR) ->
-// Hexagon_qi_mem_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_mem_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_ptr_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(HI_ftype_SI,BT_I16,BT_INT) ->
-// Hexagon_hi_si_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_hi_si_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i16_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(SI_ftype_SI,BT_INT,BT_INT) ->
-// Hexagon_si_si_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_si_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(DI_ftype_SI,BT_LONGLONG,BT_INT) ->
-// Hexagon_di_si_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_si_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(SI_ftype_DI,BT_INT,BT_LONGLONG) ->
-// Hexagon_si_di_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_di_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(DI_ftype_DI,BT_LONGLONG,BT_LONGLONG) ->
-// Hexagon_di_di_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_di_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(QI_ftype_QI,BT_BOOL,BT_BOOL) ->
-// Hexagon_qi_qi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_qi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(QI_ftype_SI,BT_BOOL,BT_INT) ->
-// Hexagon_qi_si_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_si_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(DI_ftype_QI,BT_LONGLONG,BT_BOOL) ->
-// Hexagon_di_qi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_qi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_1(SI_ftype_QI,BT_INT,BT_BOOL) ->
-// Hexagon_si_qi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_qi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(QI_ftype_SISI,BT_BOOL,BT_INT,BT_INT) ->
-// Hexagon_qi_sisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_sisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(void_ftype_SISI,BT_VOID,BT_INT,BT_INT) ->
-// Hexagon_void_sisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_void_sisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_void_ty], [llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(SI_ftype_SISI,BT_INT,BT_INT,BT_INT) ->
-// Hexagon_si_sisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_sisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(USI_ftype_SISI,BT_UINT,BT_INT,BT_INT) ->
-// Hexagon_usi_sisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_usi_sisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(DI_ftype_SISI,BT_LONGLONG,BT_INT,BT_INT) ->
-// Hexagon_di_sisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_sisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(UDI_ftype_SISI,BT_ULONGLONG,BT_INT,BT_INT) ->
-// Hexagon_udi_sisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_udi_sisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(DI_ftype_SIDI,BT_LONGLONG,BT_INT,BT_LONGLONG) ->
-// Hexagon_di_sidi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_sidi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(DI_ftype_DISI,BT_LONGLONG,BT_LONGLONG,BT_INT) ->
-// Hexagon_di_disi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_disi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(SI_ftype_SIDI,BT_INT,BT_INT,BT_LONGLONG) ->
-// Hexagon_si_sidi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_sidi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(SI_ftype_DIDI,BT_INT,BT_LONGLONG,BT_LONGLONG) ->
-// Hexagon_si_didi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_didi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(DI_ftype_DIDI,BT_LONGLONG,BT_LONGLONG,BT_LONGLONG) ->
-// Hexagon_di_didi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_didi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(UDI_ftype_DIDI,BT_ULONGLONG,BT_LONGLONG,BT_LONGLONG) ->
-// Hexagon_udi_didi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_udi_didi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(SI_ftype_DISI,BT_INT,BT_LONGLONG,BT_INT) ->
-// Hexagon_si_disi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_disi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(QI_ftype_DIDI,BT_BOOL,BT_LONGLONG,BT_LONGLONG) ->
-// Hexagon_qi_didi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_didi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i64_ty, llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(QI_ftype_SIDI,BT_BOOL,BT_INT,BT_LONGLONG) ->
-// Hexagon_qi_didi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_sidi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(QI_ftype_DISI,BT_BOOL,BT_LONGLONG,BT_INT) ->
-// Hexagon_qi_disi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_disi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i64_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(QI_ftype_QIQI,BT_BOOL,BT_BOOL,BT_BOOL) ->
-// Hexagon_qi_qiqi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_qiqi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(QI_ftype_QIQIQI,BT_BOOL,BT_BOOL,BT_BOOL) ->
-// Hexagon_qi_qiqiqi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_qi_qiqiqi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i1_ty, llvm_i1_ty, llvm_i1_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(SI_ftype_QIQI,BT_INT,BT_BOOL,BT_BOOL) ->
-// Hexagon_si_qiqi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_qiqi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_2(SI_ftype_QISI,BT_INT,BT_BOOL,BT_INT) ->
-// Hexagon_si_qisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_qisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i1_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(void_ftype_SISISI,BT_VOID,BT_INT,BT_INT,BT_INT) ->
-// Hexagon_void_sisisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_void_sisisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_void_ty], [llvm_i32_ty, llvm_i32_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(SI_ftype_SISISI,BT_INT,BT_INT,BT_INT,BT_INT) ->
-// Hexagon_si_sisisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_sisisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(DI_ftype_SISISI,BT_LONGLONG,BT_INT,BT_INT,BT_INT) ->
-// Hexagon_di_sisisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_sisisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(SI_ftype_DISISI,BT_INT,BT_LONGLONG,BT_INT,BT_INT) ->
-// Hexagon_si_disisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_disisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i64_ty, llvm_i32_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(DI_ftype_DISISI,BT_LONGLONG,BT_LONGLONG,BT_INT,BT_INT) ->
-// Hexagon_di_disisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_disisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(SI_ftype_SIDISI,BT_INT,BT_INT,BT_LONGLONG,BT_INT) ->
-// Hexagon_si_sidisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_sidisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(DI_ftype_DIDISI,BT_LONGLONG,BT_LONGLONG,
-//                     BT_LONGLONG,BT_INT) ->
-// Hexagon_di_didisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_didisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(SI_ftype_SIDIDI,BT_INT,BT_INT,BT_LONGLONG,BT_LONGLONG) ->
-// Hexagon_si_sididi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_sididi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty,
-                           llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(DI_ftype_DIDIDI,BT_LONGLONG,BT_LONGLONG,BT_LONGLONG,
-//                     BT_LONGLONG) ->
-// Hexagon_di_dididi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_dididi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty,
-                           llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(SI_ftype_SISIDI,BT_INT,BT_INT,BT_INT,BT_LONGLONG) ->
-// Hexagon_si_sisidi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_sisidi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
-                           llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(SI_ftype_QISISI,BT_INT,BT_BOOL,BT_INT,BT_INT) ->
-// Hexagon_si_qisisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_qisisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(DI_ftype_QISISI,BT_LONGLONG,BT_BOOL,BT_INT,BT_INT) ->
-// Hexagon_di_qisisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_qisisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i1_ty, llvm_i32_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(DI_ftype_QIDIDI,BT_LONGLONG,BT_BOOL,BT_LONGLONG,
-//                     BT_LONGLONG) ->
-// Hexagon_di_qididi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_qididi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty,
-                           llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_3(DI_ftype_DIDIQI,BT_LONGLONG,BT_LONGLONG,BT_LONGLONG,
-//                     BT_BOOL) ->
-// Hexagon_di_didiqi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_didiqi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_4(SI_ftype_SISISISI,BT_INT,BT_INT,BT_INT,BT_INT,BT_INT) ->
-// Hexagon_si_sisisisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_sisisisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
-                           llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// DEF_FUNCTION_TYPE_4(DI_ftype_DIDISISI,BT_LONGLONG,BT_LONGLONG,
-//                     BT_LONGLONG,BT_INT,BT_INT) ->
-// Hexagon_di_didisisi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_didisisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty,
-                           llvm_i32_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-
 class Hexagon_mem_memmemsi_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty,
@@ -457,10519 +66,6291 @@ class Hexagon_mem_memdisisi_Intrinsic<string GCCIntSuffix>
                            llvm_i32_ty, llvm_i32_ty],
                           [IntrWriteMem]>;
 
-class Hexagon_v256_v256v256_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
-                          [IntrArgMemOnly]>;
-
 //
-// Hexagon_sf_df_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_ldd,PTR_ftype_PTRPTRSISI,4)
 //
-class Hexagon_sf_si_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_float_ty], [llvm_i32_ty],
-                          [IntrNoMem, Throws]>;
+def int_hexagon_circ_ldd :
+Hexagon_mem_memmemsisi_Intrinsic<"circ_ldd">;
 //
-// Hexagon_sf_df_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_ldw,PTR_ftype_PTRPTRSISI,4)
 //
-class Hexagon_sf_df_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_float_ty], [llvm_double_ty],
-                          [IntrNoMem]>;
+def int_hexagon_circ_ldw :
+Hexagon_mem_memmemsisi_Intrinsic<"circ_ldw">;
 //
-// Hexagon_sf_di_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_ldh,PTR_ftype_PTRPTRSISI,4)
 //
-class Hexagon_sf_di_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_float_ty], [llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_circ_ldh :
+Hexagon_mem_memmemsisi_Intrinsic<"circ_ldh">;
 //
-// Hexagon_df_sf_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_lduh,PTR_ftype_PTRPTRSISI,4)
 //
-class Hexagon_df_sf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_double_ty], [llvm_float_ty],
-                          [IntrNoMem]>;
+def int_hexagon_circ_lduh :
+Hexagon_mem_memmemsisi_Intrinsic<"circ_lduh">;
 //
-// Hexagon_di_sf_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_ldb,PTR_ftype_PTRPTRSISI,4)
 //
-class Hexagon_di_sf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_float_ty],
-                          [IntrNoMem]>;
+def int_hexagon_circ_ldb :
+Hexagon_mem_memmemsisi_Intrinsic<"circ_ldb">;
 //
-// Hexagon_sf_sf_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_ldub,PTR_ftype_PTRPTRSISI,4)
 //
-class Hexagon_sf_sf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_float_ty], [llvm_float_ty],
-                          [IntrNoMem]>;
+def int_hexagon_circ_ldub :
+Hexagon_mem_memmemsisi_Intrinsic<"circ_ldub">;
+
 //
-// Hexagon_si_sf_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_std,PTR_ftype_PTRDISISI,4)
 //
-class Hexagon_si_sf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_float_ty],
-                          [IntrNoMem]>;
+def int_hexagon_circ_std :
+Hexagon_mem_memdisisi_Intrinsic<"circ_std">;
 //
-// Hexagon_si_df_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_stw,PTR_ftype_PTRSISISI,4)
 //
-class Hexagon_si_df_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_double_ty],
-                          [IntrNoMem]>;
+def int_hexagon_circ_stw :
+Hexagon_mem_memsisisi_Intrinsic<"circ_stw">;
 //
-// Hexagon_sf_sfsf_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_sth,PTR_ftype_PTRSISISI,4)
 //
-class Hexagon_sf_sfsf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
-                          [IntrNoMem, Throws]>;
+def int_hexagon_circ_sth :
+Hexagon_mem_memsisisi_Intrinsic<"circ_sth">;
 //
-// Hexagon_si_sfsf_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_sthhi,PTR_ftype_PTRSISISI,4)
 //
-class Hexagon_si_sfsf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_float_ty, llvm_float_ty],
-                          [IntrNoMem, Throws]>;
+def int_hexagon_circ_sthhi :
+Hexagon_mem_memsisisi_Intrinsic<"circ_sthhi">;
 //
-// Hexagon_si_sfsi_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO_NONCONST(circ_stb,PTR_ftype_PTRSISISI,4)
 //
-class Hexagon_si_sfsi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_float_ty, llvm_i32_ty],
-                          [IntrNoMem, Throws]>;
+def int_hexagon_circ_stb :
+Hexagon_mem_memsisisi_Intrinsic<"circ_stb">;
+
 //
-// Hexagon_qi_sfqi_Intrinsic<string GCCIntSuffix>
+// BUILTIN_INFO(HEXAGON.dcfetch_A,v_ftype_DI*,1)
 //
-class Hexagon_qi_sfqi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_float_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// Hexagon_sf_sfsfsf_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_sf_sfsfsf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_float_ty], [llvm_float_ty, llvm_float_ty,
-                                            llvm_float_ty],
-                          [IntrNoMem, Throws]>;
-//
-// Hexagon_sf_sfsfsfqi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_sf_sfsfsfqi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_float_ty], [llvm_float_ty, llvm_float_ty,
-                                            llvm_float_ty,
-                           llvm_i32_ty],
-                          [IntrNoMem, Throws]>;
-//
-// Hexagon_di_dididi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_dididisi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty,
-                           llvm_i64_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
-//
-// Hexagon_df_si_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_df_si_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_double_ty], [llvm_i32_ty],
-                          [IntrNoMem, Throws]>;
-//
-// Hexagon_df_di_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_df_di_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_double_ty], [llvm_i64_ty],
-                          [IntrNoMem]>;
-//
-// Hexagon_di_df_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_di_df_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_double_ty],
-                          [IntrNoMem]>;
-//
-// Hexagon_df_df_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_df_df_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_double_ty], [llvm_double_ty],
-                          [IntrNoMem]>;
-//
-// Hexagon_df_dfdf_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_df_dfdf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_double_ty], [llvm_double_ty, llvm_double_ty],
-                          [IntrNoMem, Throws]>;
-//
-// Hexagon_si_dfdf_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_dfdf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_double_ty, llvm_double_ty],
-                          [IntrNoMem, Throws]>;
-//
-// Hexagon_si_dfsi_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_si_dfsi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_double_ty, llvm_i32_ty],
-                          [IntrNoMem, Throws]>;
-//
-//
-// Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_double_ty], [llvm_double_ty, llvm_double_ty,
-                                             llvm_double_ty],
-                          [IntrNoMem, Throws]>;
-//
-// Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
-//
-class Hexagon_df_dfdfdfqi_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_double_ty], [llvm_double_ty, llvm_double_ty,
-                                             llvm_double_ty,
-                          llvm_i32_ty],
-                          [IntrNoMem, Throws]>;
+def int_hexagon_prefetch :
+Hexagon_Intrinsic<"HEXAGON_prefetch", [], [llvm_ptr_ty], []>;
+def int_hexagon_Y2_dccleana :
+Hexagon_Intrinsic<"HEXAGON_Y2_dccleana", [], [llvm_ptr_ty], []>;
+def int_hexagon_Y2_dccleaninva :
+Hexagon_Intrinsic<"HEXAGON_Y2_dccleaninva", [], [llvm_ptr_ty], []>;
+def int_hexagon_Y2_dcinva :
+Hexagon_Intrinsic<"HEXAGON_Y2_dcinva", [], [llvm_ptr_ty], []>;
+def int_hexagon_Y2_dczeroa :
+Hexagon_Intrinsic<"HEXAGON_Y2_dczeroa", [], [llvm_ptr_ty],
+      [IntrWriteMem, IntrArgMemOnly, IntrHasSideEffects]>;
+def int_hexagon_Y4_l2fetch :
+Hexagon_Intrinsic<"HEXAGON_Y4_l2fetch", [], [llvm_ptr_ty, llvm_i32_ty], []>;
+def int_hexagon_Y5_l2fetch :
+Hexagon_Intrinsic<"HEXAGON_Y5_l2fetch", [], [llvm_ptr_ty, llvm_i64_ty], []>;
 
+def llvm_ptr32_ty : LLVMPointerType<llvm_i32_ty>;
+def llvm_ptr64_ty : LLVMPointerType<llvm_i64_ty>;
 
-// This one below will not be auto-generated,
-// so make sure, you don't overwrite this one.
-//
-// BUILTIN_INFO_NONCONST(circ_ldd,PTR_ftype_PTRPTRSISI,4)
-//
-def int_hexagon_circ_ldd :
-Hexagon_mem_memmemsisi_Intrinsic<"circ_ldd">;
-//
-// BUILTIN_INFO_NONCONST(circ_ldw,PTR_ftype_PTRPTRSISI,4)
-//
-def int_hexagon_circ_ldw :
-Hexagon_mem_memmemsisi_Intrinsic<"circ_ldw">;
-//
-// BUILTIN_INFO_NONCONST(circ_ldh,PTR_ftype_PTRPTRSISI,4)
-//
-def int_hexagon_circ_ldh :
-Hexagon_mem_memmemsisi_Intrinsic<"circ_ldh">;
-//
-// BUILTIN_INFO_NONCONST(circ_lduh,PTR_ftype_PTRPTRSISI,4)
-//
-def int_hexagon_circ_lduh :
-Hexagon_mem_memmemsisi_Intrinsic<"circ_lduh">;
-//
-// BUILTIN_INFO_NONCONST(circ_ldb,PTR_ftype_PTRPTRSISI,4)
-//
-def int_hexagon_circ_ldb :
-Hexagon_mem_memmemsisi_Intrinsic<"circ_ldb">;
-//
-// BUILTIN_INFO_NONCONST(circ_ldub,PTR_ftype_PTRPTRSISI,4)
-//
-def int_hexagon_circ_ldub :
-Hexagon_mem_memmemsisi_Intrinsic<"circ_ldub">;
+// Mark locked loads as read/write to prevent any accidental reordering.
+def int_hexagon_L2_loadw_locked :
+Hexagon_Intrinsic<"HEXAGON_L2_loadw_locked", [llvm_i32_ty], [llvm_ptr32_ty],
+      [IntrArgMemOnly, NoCapture<0>]>;
+def int_hexagon_L4_loadd_locked :
+Hexagon_Intrinsic<"HEXAGON_L4_loadd_locked", [llvm_i64_ty], [llvm_ptr64_ty],
+      [IntrArgMemOnly, NoCapture<0>]>;
 
-//
-// BUILTIN_INFO_NONCONST(circ_std,PTR_ftype_PTRDISISI,4)
-//
-def int_hexagon_circ_std :
-Hexagon_mem_memdisisi_Intrinsic<"circ_std">;
-//
-// BUILTIN_INFO_NONCONST(circ_stw,PTR_ftype_PTRSISISI,4)
-//
-def int_hexagon_circ_stw :
-Hexagon_mem_memsisisi_Intrinsic<"circ_stw">;
-//
-// BUILTIN_INFO_NONCONST(circ_sth,PTR_ftype_PTRSISISI,4)
-//
-def int_hexagon_circ_sth :
-Hexagon_mem_memsisisi_Intrinsic<"circ_sth">;
-//
-// BUILTIN_INFO_NONCONST(circ_sthhi,PTR_ftype_PTRSISISI,4)
-//
-def int_hexagon_circ_sthhi :
-Hexagon_mem_memsisisi_Intrinsic<"circ_sthhi">;
-//
-// BUILTIN_INFO_NONCONST(circ_stb,PTR_ftype_PTRSISISI,4)
-//
-def int_hexagon_circ_stb :
-Hexagon_mem_memsisisi_Intrinsic<"circ_stb">;
+def int_hexagon_S2_storew_locked :
+Hexagon_Intrinsic<"HEXAGON_S2_storew_locked", [llvm_i32_ty],
+      [llvm_ptr32_ty, llvm_i32_ty], [IntrArgMemOnly, NoCapture<0>]>;
+def int_hexagon_S4_stored_locked :
+Hexagon_Intrinsic<"HEXAGON_S4_stored_locked", [llvm_i32_ty],
+      [llvm_ptr64_ty, llvm_i64_ty], [IntrArgMemOnly, NoCapture<0>]>;
 
+def int_hexagon_vmemcpy : Hexagon_Intrinsic<"hexagon_vmemcpy",
+    [], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
+    [IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>]>;
 
-def int_hexagon_mm256i_vaddw :
-Hexagon_v256_v256v256_Intrinsic<"_mm256i_vaddw">;
+def int_hexagon_vmemset : Hexagon_Intrinsic<"hexagon_vmemset",
+    [], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty],
+    [IntrArgMemOnly, NoCapture<0>, WriteOnly<0>]>;
 
+multiclass Hexagon_custom_circ_ld_Intrinsic<LLVMType ElTy> {
+  def NAME#_pci : Hexagon_NonGCC_Intrinsic<
+    [ElTy, llvm_ptr_ty],
+    [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty],
+    [IntrArgMemOnly, NoCapture<3>]>;
+  def NAME#_pcr : Hexagon_NonGCC_Intrinsic<
+    [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty],
+    [IntrArgMemOnly, NoCapture<2>]>;
+}
+
+defm int_hexagon_L2_loadrub : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadrb : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadruh : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadrh : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadri : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_L2_loadrd : Hexagon_custom_circ_ld_Intrinsic<llvm_i64_ty>;
+
+multiclass Hexagon_custom_circ_st_Intrinsic<LLVMType ElTy> {
+  def NAME#_pci : Hexagon_NonGCC_Intrinsic<
+    [llvm_ptr_ty],
+    [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ElTy, llvm_ptr_ty],
+    [IntrArgMemOnly, NoCapture<4>]>;
+  def NAME#_pcr : Hexagon_NonGCC_Intrinsic<
+    [llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, ElTy, llvm_ptr_ty],
+    [IntrArgMemOnly, NoCapture<3>]>;
+}
+
+defm int_hexagon_S2_storerb : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_S2_storerh : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_S2_storerf : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_S2_storeri : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
+defm int_hexagon_S2_storerd : Hexagon_custom_circ_st_Intrinsic<llvm_i64_ty>;
+
+// The front-end emits the intrinsic call with only two arguments. The third
+// argument from the builtin is already used by front-end to write to memory
+// by generating a store.
+class Hexagon_custom_brev_ld_Intrinsic<LLVMType ElTy>
+ : Hexagon_NonGCC_Intrinsic<
+    [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty],
+    [IntrReadMem]>;
+
+def int_hexagon_L2_loadrub_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadrb_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadruh_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadrh_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadri_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
+def int_hexagon_L2_loadrd_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i64_ty>;
+
+def int_hexagon_S2_storerb_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stb">;
+def int_hexagon_S2_storerh_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sth">;
+def int_hexagon_S2_storerf_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sthhi">;
+def int_hexagon_S2_storeri_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stw">;
+def int_hexagon_S2_storerd_pbr : Hexagon_mem_memdisi_Intrinsic<"brev_std">;
 
-// This one above will not be auto-generated,
-// so make sure, you don't overwrite this one.
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpeq,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpeq :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpeq">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpgt,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpgt :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgt">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpgtu,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpgtu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgtu">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpeqp,QI_ftype_DIDI,2)
-//
-def int_hexagon_C2_cmpeqp :
-Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpeqp">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpgtp,QI_ftype_DIDI,2)
-//
-def int_hexagon_C2_cmpgtp :
-Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpgtp">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpgtup,QI_ftype_DIDI,2)
-//
-def int_hexagon_C2_cmpgtup :
-Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpgtup">;
-//
-// BUILTIN_INFO(HEXAGON.A4_rcmpeqi,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_rcmpeqi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpeqi">;
-//
-// BUILTIN_INFO(HEXAGON.A4_rcmpneqi,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_rcmpneqi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpneqi">;
-//
-// BUILTIN_INFO(HEXAGON.A4_rcmpeq,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_rcmpeq :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpeq">;
-//
-// BUILTIN_INFO(HEXAGON.A4_rcmpneq,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_rcmpneq :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpneq">;
-//
-// BUILTIN_INFO(HEXAGON.C2_bitsset,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_bitsset :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsset">;
-//
-// BUILTIN_INFO(HEXAGON.C2_bitsclr,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_bitsclr :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsclr">;
-//
-// BUILTIN_INFO(HEXAGON.C4_nbitsset,QI_ftype_SISI,2)
-//
-def int_hexagon_C4_nbitsset :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsset">;
-//
-// BUILTIN_INFO(HEXAGON.C4_nbitsclr,QI_ftype_SISI,2)
-//
-def int_hexagon_C4_nbitsclr :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsclr">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpeqi,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpeqi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpeqi">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpgti,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpgti :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgti">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpgtui,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpgtui :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgtui">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpgei,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpgei :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgei">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpgeui,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpgeui :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgeui">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmplt,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmplt :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmplt">;
-//
-// BUILTIN_INFO(HEXAGON.C2_cmpltu,QI_ftype_SISI,2)
-//
-def int_hexagon_C2_cmpltu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpltu">;
-//
-// BUILTIN_INFO(HEXAGON.C2_bitsclri,QI_ftype_SISI,2)
 //
-def int_hexagon_C2_bitsclri :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsclri">;
+// Masked vector stores
 //
-// BUILTIN_INFO(HEXAGON.C4_nbitsclri,QI_ftype_SISI,2)
+
 //
-def int_hexagon_C4_nbitsclri :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsclri">;
+// Hexagon_vv64ivmemv512_Intrinsic<string GCCIntSuffix>
+// tag: V6_vS32b_qpred_ai
+class Hexagon_vv64ivmemv512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_v512i1_ty,llvm_ptr_ty,llvm_v16i32_ty],
+                          [IntrArgMemOnly]>;
+
 //
-// BUILTIN_INFO(HEXAGON.C4_cmpneqi,QI_ftype_SISI,2)
+// Hexagon_vv128ivmemv1024_Intrinsic<string GCCIntSuffix>
+// tag: V6_vS32b_qpred_ai_128B
+class Hexagon_vv128ivmemv1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_v1024i1_ty,llvm_ptr_ty,llvm_v32i32_ty],
+                          [IntrArgMemOnly]>;
+
+def int_hexagon_V6_vS32b_qpred_ai :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_qpred_ai">;
+
+def int_hexagon_V6_vS32b_nqpred_ai :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nqpred_ai">;
+
+def int_hexagon_V6_vS32b_nt_qpred_ai :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nt_qpred_ai">;
+
+def int_hexagon_V6_vS32b_nt_nqpred_ai :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nt_nqpred_ai">;
+
+def int_hexagon_V6_vS32b_qpred_ai_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_qpred_ai_128B">;
+
+def int_hexagon_V6_vS32b_nqpred_ai_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nqpred_ai_128B">;
+
+def int_hexagon_V6_vS32b_nt_qpred_ai_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nt_qpred_ai_128B">;
+
+def int_hexagon_V6_vS32b_nt_nqpred_ai_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nt_nqpred_ai_128B">;
+
+def int_hexagon_V6_vmaskedstoreq :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstoreq">;
+
+def int_hexagon_V6_vmaskedstorenq :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorenq">;
+
+def int_hexagon_V6_vmaskedstorentq :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentq">;
+
+def int_hexagon_V6_vmaskedstorentnq :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentnq">;
+
+def int_hexagon_V6_vmaskedstoreq_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstoreq_128B">;
+
+def int_hexagon_V6_vmaskedstorenq_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorenq_128B">;
+
+def int_hexagon_V6_vmaskedstorentq_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentq_128B">;
+
+def int_hexagon_V6_vmaskedstorentnq_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentnq_128B">;
+
+class Hexagon_V65_vvmemiiv512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_ptr_ty,llvm_i32_ty,llvm_i32_ty,
+                               llvm_v16i32_ty],
+                          [IntrArgMemOnly]>;
+
+class Hexagon_V65_vvmemiiv1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_ptr_ty,llvm_i32_ty,llvm_i32_ty,
+                               llvm_v32i32_ty],
+                          [IntrArgMemOnly]>;
+
+class Hexagon_V65_vvmemiiv2048_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_ptr_ty,llvm_i32_ty,llvm_i32_ty,
+                               llvm_v64i32_ty],
+                          [IntrArgMemOnly]>;
+
+class Hexagon_V65_vvmemv64iiiv512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_ptr_ty,llvm_v512i1_ty,llvm_i32_ty,
+                               llvm_i32_ty,llvm_v16i32_ty],
+                          [IntrArgMemOnly]>;
+
+class Hexagon_V65_vvmemv128iiiv1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_ptr_ty,llvm_v1024i1_ty,llvm_i32_ty,
+                               llvm_i32_ty,llvm_v32i32_ty],
+                          [IntrArgMemOnly]>;
+
+class Hexagon_V65_vvmemv64iiiv1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_ptr_ty,llvm_v512i1_ty,llvm_i32_ty,
+                               llvm_i32_ty,llvm_v32i32_ty],
+                          [IntrArgMemOnly]>;
+
+class Hexagon_V65_vvmemv128iiiv2048_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_ptr_ty,llvm_v1024i1_ty,llvm_i32_ty,
+                               llvm_i32_ty,llvm_v64i32_ty],
+                          [IntrArgMemOnly]>;
+
+def int_hexagon_V6_vgathermw :
+Hexagon_V65_vvmemiiv512_Intrinsic<"HEXAGON_V6_vgathermw">;
+
+def int_hexagon_V6_vgathermw_128B :
+Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermw_128B">;
+
+def int_hexagon_V6_vgathermh :
+Hexagon_V65_vvmemiiv512_Intrinsic<"HEXAGON_V6_vgathermh">;
+
+def int_hexagon_V6_vgathermh_128B :
+Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermh_128B">;
+
+def int_hexagon_V6_vgathermhw :
+Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermhw">;
+
+def int_hexagon_V6_vgathermhw_128B :
+Hexagon_V65_vvmemiiv2048_Intrinsic<"HEXAGON_V6_vgathermhw_128B">;
+
+def int_hexagon_V6_vgathermwq :
+Hexagon_V65_vvmemv64iiiv512_Intrinsic<"HEXAGON_V6_vgathermwq">;
+
+def int_hexagon_V6_vgathermwq_128B :
+Hexagon_V65_vvmemv128iiiv1024_Intrinsic<"HEXAGON_V6_vgathermwq_128B">;
+
+def int_hexagon_V6_vgathermhq :
+Hexagon_V65_vvmemv64iiiv512_Intrinsic<"HEXAGON_V6_vgathermhq">;
+
+def int_hexagon_V6_vgathermhq_128B :
+Hexagon_V65_vvmemv128iiiv1024_Intrinsic<"HEXAGON_V6_vgathermhq_128B">;
+
+def int_hexagon_V6_vgathermhwq :
+Hexagon_V65_vvmemv64iiiv1024_Intrinsic<"HEXAGON_V6_vgathermhwq">;
+
+def int_hexagon_V6_vgathermhwq_128B :
+Hexagon_V65_vvmemv128iiiv2048_Intrinsic<"HEXAGON_V6_vgathermhwq_128B">;
+
+class Hexagon_V65_viiv512v512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_i32_ty,llvm_i32_ty,
+                                           llvm_v16i32_ty,llvm_v16i32_ty],
+                          [IntrWriteMem]>;
+
+class Hexagon_V65_viiv1024v1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_i32_ty,llvm_i32_ty,
+                                           llvm_v32i32_ty,llvm_v32i32_ty],
+                          [IntrWriteMem]>;
+
+class Hexagon_V65_vv64iiiv512v512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_v512i1_ty,llvm_i32_ty,
+                                           llvm_i32_ty,llvm_v16i32_ty,
+                                           llvm_v16i32_ty],
+                          [IntrWriteMem]>;
+
+class Hexagon_V65_vv128iiiv1024v1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_v1024i1_ty,llvm_i32_ty,
+                                           llvm_i32_ty,llvm_v32i32_ty,
+                                           llvm_v32i32_ty],
+                          [IntrWriteMem]>;
+
+class Hexagon_V65_viiv1024v512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_i32_ty,llvm_i32_ty,
+                                           llvm_v32i32_ty,llvm_v16i32_ty],
+                          [IntrWriteMem]>;
+
+class Hexagon_V65_viiv2048v1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_i32_ty,llvm_i32_ty,
+                                           llvm_v64i32_ty,llvm_v32i32_ty],
+                          [IntrWriteMem]>;
+
+class Hexagon_V65_vv64iiiv1024v512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_v512i1_ty,llvm_i32_ty,
+                                           llvm_i32_ty,llvm_v32i32_ty,
+                                           llvm_v16i32_ty],
+                          [IntrWriteMem]>;
+
+class Hexagon_V65_vv128iiiv2048v1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [], [llvm_v1024i1_ty,llvm_i32_ty,
+                                           llvm_i32_ty,llvm_v64i32_ty,
+                                           llvm_v32i32_ty],
+                          [IntrWriteMem]>;
+
+class Hexagon_V65_v2048_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v64i32_ty], [],
+                          [IntrNoMem]>;
+
 //
-def int_hexagon_C4_cmpneqi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpneqi">;
+// BUILTIN_INFO(HEXAGON.V6_vscattermw,v_ftype_SISIVIVI,4)
+// tag : V6_vscattermw
+def int_hexagon_V6_vscattermw :
+Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermw">;
+
 //
-// BUILTIN_INFO(HEXAGON.C4_cmpltei,QI_ftype_SISI,2)
+// BUILTIN_INFO(HEXAGON.V6_vscattermw_128B,v_ftype_SISIVIVI,4)
+// tag : V6_vscattermw_128B
+def int_hexagon_V6_vscattermw_128B :
+Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermw_128B">;
+
 //
-def int_hexagon_C4_cmpltei :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpltei">;
+// BUILTIN_INFO(HEXAGON.V6_vscattermh,v_ftype_SISIVIVI,4)
+// tag : V6_vscattermh
+def int_hexagon_V6_vscattermh :
+Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermh">;
+
 //
-// BUILTIN_INFO(HEXAGON.C4_cmplteui,QI_ftype_SISI,2)
+// BUILTIN_INFO(HEXAGON.V6_vscattermh_128B,v_ftype_SISIVIVI,4)
+// tag : V6_vscattermh_128B
+def int_hexagon_V6_vscattermh_128B :
+Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermh_128B">;
+
 //
-def int_hexagon_C4_cmplteui :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplteui">;
+// BUILTIN_INFO(HEXAGON.V6_vscattermw_add,v_ftype_SISIVIVI,4)
+// tag : V6_vscattermw_add
+def int_hexagon_V6_vscattermw_add :
+Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermw_add">;
+
 //
-// BUILTIN_INFO(HEXAGON.C4_cmpneq,QI_ftype_SISI,2)
+// BUILTIN_INFO(HEXAGON.V6_vscattermw_add_128B,v_ftype_SISIVIVI,4)
+// tag : V6_vscattermw_add_128B
+def int_hexagon_V6_vscattermw_add_128B :
+Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermw_add_128B">;
+
 //
-def int_hexagon_C4_cmpneq :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpneq">;
+// BUILTIN_INFO(HEXAGON.V6_vscattermh_add,v_ftype_SISIVIVI,4)
+// tag : V6_vscattermh_add
+def int_hexagon_V6_vscattermh_add :
+Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermh_add">;
+
 //
-// BUILTIN_INFO(HEXAGON.C4_cmplte,QI_ftype_SISI,2)
+// BUILTIN_INFO(HEXAGON.V6_vscattermh_add_128B,v_ftype_SISIVIVI,4)
+// tag : V6_vscattermh_add_128B
+def int_hexagon_V6_vscattermh_add_128B :
+Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermh_add_128B">;
+
 //
-def int_hexagon_C4_cmplte :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplte">;
+// BUILTIN_INFO(HEXAGON.V6_vscattermwq,v_ftype_QVSISIVIVI,5)
+// tag : V6_vscattermwq
+def int_hexagon_V6_vscattermwq :
+Hexagon_V65_vv64iiiv512v512_Intrinsic<"HEXAGON_V6_vscattermwq">;
+
 //
-// BUILTIN_INFO(HEXAGON.C4_cmplteu,QI_ftype_SISI,2)
+// BUILTIN_INFO(HEXAGON.V6_vscattermwq_128B,v_ftype_QVSISIVIVI,5)
+// tag : V6_vscattermwq_128B
+def int_hexagon_V6_vscattermwq_128B :
+Hexagon_V65_vv128iiiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermwq_128B">;
+
 //
-def int_hexagon_C4_cmplteu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplteu">;
+// BUILTIN_INFO(HEXAGON.V6_vscattermhq,v_ftype_QVSISIVIVI,5)
+// tag : V6_vscattermhq
+def int_hexagon_V6_vscattermhq :
+Hexagon_V65_vv64iiiv512v512_Intrinsic<"HEXAGON_V6_vscattermhq">;
+
 //
-// BUILTIN_INFO(HEXAGON.C2_and,QI_ftype_QIQI,2)
+// BUILTIN_INFO(HEXAGON.V6_vscattermhq_128B,v_ftype_QVSISIVIVI,5)
+// tag : V6_vscattermhq_128B
+def int_hexagon_V6_vscattermhq_128B :
+Hexagon_V65_vv128iiiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermhq_128B">;
+
 //
-def int_hexagon_C2_and :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_and">;
-//
-// BUILTIN_INFO(HEXAGON.C2_or,QI_ftype_QIQI,2)
-//
-def int_hexagon_C2_or :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_or">;
-//
-// BUILTIN_INFO(HEXAGON.C2_xor,QI_ftype_QIQI,2)
-//
-def int_hexagon_C2_xor :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_xor">;
-//
-// BUILTIN_INFO(HEXAGON.C2_andn,QI_ftype_QIQI,2)
-//
-def int_hexagon_C2_andn :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_andn">;
-//
-// BUILTIN_INFO(HEXAGON.C2_not,QI_ftype_QI,1)
-//
-def int_hexagon_C2_not :
-Hexagon_si_si_Intrinsic<"HEXAGON_C2_not">;
-//
-// BUILTIN_INFO(HEXAGON.C2_orn,QI_ftype_QIQI,2)
-//
-def int_hexagon_C2_orn :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_orn">;
-//
-// BUILTIN_INFO(HEXAGON.C4_and_and,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_and_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_and">;
-//
-// BUILTIN_INFO(HEXAGON.C4_and_or,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_and_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_or">;
-//
-// BUILTIN_INFO(HEXAGON.C4_or_and,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_or_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_and">;
-//
-// BUILTIN_INFO(HEXAGON.C4_or_or,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_or_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_or">;
-//
-// BUILTIN_INFO(HEXAGON.C4_and_andn,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_and_andn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_andn">;
-//
-// BUILTIN_INFO(HEXAGON.C4_and_orn,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_and_orn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_orn">;
-//
-// BUILTIN_INFO(HEXAGON.C4_or_andn,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_or_andn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_andn">;
-//
-// BUILTIN_INFO(HEXAGON.C4_or_orn,QI_ftype_QIQIQI,3)
-//
-def int_hexagon_C4_or_orn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_orn">;
-//
-// BUILTIN_INFO(HEXAGON.C2_pxfer_map,QI_ftype_QI,1)
-//
-def int_hexagon_C2_pxfer_map :
-Hexagon_si_qi_Intrinsic<"HEXAGON_C2_pxfer_map">;
-//
-// BUILTIN_INFO(HEXAGON.C2_any8,QI_ftype_QI,1)
-//
-def int_hexagon_C2_any8 :
-Hexagon_si_qi_Intrinsic<"HEXAGON_C2_any8">;
-//
-// BUILTIN_INFO(HEXAGON.C2_all8,QI_ftype_QI,1)
-//
-def int_hexagon_C2_all8 :
-Hexagon_si_qi_Intrinsic<"HEXAGON_C2_all8">;
-//
-// BUILTIN_INFO(HEXAGON.C2_vitpack,SI_ftype_QIQI,2)
-//
-def int_hexagon_C2_vitpack :
-Hexagon_si_qiqi_Intrinsic<"HEXAGON_C2_vitpack">;
-//
-// BUILTIN_INFO(HEXAGON.C2_mux,SI_ftype_QISISI,3)
-//
-def int_hexagon_C2_mux :
-Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_mux">;
-//
-// BUILTIN_INFO(HEXAGON.C2_muxii,SI_ftype_QISISI,3)
-//
-def int_hexagon_C2_muxii :
-Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxii">;
-//
-// BUILTIN_INFO(HEXAGON.C2_muxir,SI_ftype_QISISI,3)
-//
-def int_hexagon_C2_muxir :
-Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxir">;
-//
-// BUILTIN_INFO(HEXAGON.C2_muxri,SI_ftype_QISISI,3)
-//
-def int_hexagon_C2_muxri :
-Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxri">;
-//
-// BUILTIN_INFO(HEXAGON.C2_vmux,DI_ftype_QIDIDI,3)
-//
-def int_hexagon_C2_vmux :
-Hexagon_di_qididi_Intrinsic<"HEXAGON_C2_vmux">;
-//
-// BUILTIN_INFO(HEXAGON.C2_mask,DI_ftype_QI,1)
-//
-def int_hexagon_C2_mask :
-Hexagon_di_qi_Intrinsic<"HEXAGON_C2_mask">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vcmpbeq,QI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vcmpbeq :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpbeq">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpbeqi,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmpbeqi :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbeqi">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpbeq_any,QI_ftype_DIDI,2)
-//
-def int_hexagon_A4_vcmpbeq_any :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A4_vcmpbeq_any">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vcmpbgtu,QI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vcmpbgtu :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpbgtu">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpbgtui,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmpbgtui :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbgtui">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpbgt,QI_ftype_DIDI,2)
-//
-def int_hexagon_A4_vcmpbgt :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A4_vcmpbgt">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpbgti,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmpbgti :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbgti">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmpbeq,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmpbeq :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbeq">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmpbeqi,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmpbeqi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbeqi">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmpbgtu,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmpbgtu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgtu">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmpbgtui,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmpbgtui :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgtui">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmpbgt,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmpbgt :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgt">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmpbgti,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmpbgti :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgti">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vcmpheq,QI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vcmpheq :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpheq">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vcmphgt,QI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vcmphgt :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmphgt">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vcmphgtu,QI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vcmphgtu :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmphgtu">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpheqi,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmpheqi :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpheqi">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmphgti,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmphgti :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmphgti">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmphgtui,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmphgtui :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmphgtui">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmpheq,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmpheq :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpheq">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmphgt,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmphgt :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgt">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmphgtu,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmphgtu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgtu">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmpheqi,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmpheqi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpheqi">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmphgti,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmphgti :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgti">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cmphgtui,QI_ftype_SISI,2)
-//
-def int_hexagon_A4_cmphgtui :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgtui">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vcmpweq,QI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vcmpweq :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpweq">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vcmpwgt,QI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vcmpwgt :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpwgt">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vcmpwgtu,QI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vcmpwgtu :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpwgtu">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpweqi,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmpweqi :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpweqi">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpwgti,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmpwgti :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpwgti">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vcmpwgtui,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_vcmpwgtui :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpwgtui">;
-//
-// BUILTIN_INFO(HEXAGON.A4_boundscheck,QI_ftype_SIDI,2)
-//
-def int_hexagon_A4_boundscheck :
-Hexagon_si_sidi_Intrinsic<"HEXAGON_A4_boundscheck">;
-//
-// BUILTIN_INFO(HEXAGON.A4_tlbmatch,QI_ftype_DISI,2)
-//
-def int_hexagon_A4_tlbmatch :
-Hexagon_si_disi_Intrinsic<"HEXAGON_A4_tlbmatch">;
-//
-// BUILTIN_INFO(HEXAGON.C2_tfrpr,SI_ftype_QI,1)
-//
-def int_hexagon_C2_tfrpr :
-Hexagon_si_qi_Intrinsic<"HEXAGON_C2_tfrpr">;
-//
-// BUILTIN_INFO(HEXAGON.C2_tfrrp,QI_ftype_SI,1)
-//
-def int_hexagon_C2_tfrrp :
-Hexagon_si_si_Intrinsic<"HEXAGON_C2_tfrrp">;
-//
-// BUILTIN_INFO(HEXAGON.C4_fastcorner9,QI_ftype_QIQI,2)
-//
-def int_hexagon_C4_fastcorner9 :
-Hexagon_si_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9">;
-//
-// BUILTIN_INFO(HEXAGON.C4_fastcorner9_not,QI_ftype_QIQI,2)
-//
-def int_hexagon_C4_fastcorner9_not :
-Hexagon_si_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9_not">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hl_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hl_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_lh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_lh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_ll_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_ll_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hl_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hl_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_lh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_lh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_ll_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_ll_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s1">;
+// BUILTIN_INFO(HEXAGON.V6_vscattermhw,v_ftype_SISIVDVI,4)
+// tag : V6_vscattermhw
+def int_hexagon_V6_vscattermhw :
+Hexagon_V65_viiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhw">;
+
 //
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hh_s0,SI_ftype_SISISI,3)
+// BUILTIN_INFO(HEXAGON.V6_vscattermhw_128B,v_ftype_SISIVDVI,4)
+// tag : V6_vscattermhw_128B
+def int_hexagon_V6_vscattermhw_128B :
+Hexagon_V65_viiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhw_128B">;
+
 //
-def int_hexagon_M2_mpy_acc_sat_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s0">;
+// BUILTIN_INFO(HEXAGON.V6_vscattermhwq,v_ftype_QVSISIVDVI,5)
+// tag : V6_vscattermhwq
+def int_hexagon_V6_vscattermhwq :
+Hexagon_V65_vv64iiiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhwq">;
+
 //
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hh_s1,SI_ftype_SISISI,3)
+// BUILTIN_INFO(HEXAGON.V6_vscattermhwq_128B,v_ftype_QVSISIVDVI,5)
+// tag : V6_vscattermhwq_128B
+def int_hexagon_V6_vscattermhwq_128B :
+Hexagon_V65_vv128iiiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhwq_128B">;
+
 //
-def int_hexagon_M2_mpy_acc_sat_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hl_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_sat_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hl_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_sat_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_lh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_sat_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_lh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_sat_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_ll_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_sat_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_ll_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_acc_sat_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_sat_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_sat_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hl_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_sat_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hl_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_sat_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_lh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_sat_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_lh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_sat_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_ll_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_sat_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_ll_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpy_nac_sat_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_hh_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_hh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_hl_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_hl_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_lh_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_lh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_ll_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_ll_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hh_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hl_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hl_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_lh_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_lh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_ll_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_ll_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hh_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_rnd_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_rnd_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hl_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_rnd_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hl_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_rnd_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_lh_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_rnd_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_lh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_rnd_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_ll_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_rnd_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_ll_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_rnd_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hh_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_rnd_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_rnd_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hl_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_rnd_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hl_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_rnd_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_lh_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_rnd_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_lh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_rnd_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_ll_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_rnd_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_ll_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_sat_rnd_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hh_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_acc_hh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hh_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_acc_hh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hl_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_acc_hl_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hl_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_acc_hl_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_lh_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_acc_lh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_lh_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_acc_lh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_ll_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_acc_ll_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_ll_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_acc_ll_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hh_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_nac_hh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hh_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_nac_hh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hl_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_nac_hl_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hl_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_nac_hl_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_lh_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_nac_lh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_lh_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_nac_lh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_ll_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_nac_ll_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_ll_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyd_nac_ll_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_hh_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_hh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_hh_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_hh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_hl_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_hl_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_hl_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_hl_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_lh_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_lh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_lh_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_lh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_ll_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_ll_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_ll_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_ll_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hh_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_rnd_hh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hh_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_rnd_hh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hl_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_rnd_hl_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hl_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_rnd_hl_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_lh_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_rnd_lh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_lh_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_rnd_lh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_ll_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_rnd_ll_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_ll_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyd_rnd_ll_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_acc_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_acc_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hl_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_acc_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hl_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_acc_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_lh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_acc_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_lh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_acc_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_ll_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_acc_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_ll_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_acc_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_nac_hh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_nac_hh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hl_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_nac_hl_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hl_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_nac_hl_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_lh_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_nac_lh_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_lh_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_nac_lh_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_ll_s0,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_nac_ll_s0 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_ll_s1,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_mpyu_nac_ll_s1 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_hh_s0,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_hh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_hh_s1,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_hh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_hl_s0,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_hl_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_hl_s1,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_hl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_lh_s0,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_lh_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_lh_s1,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_lh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_ll_s0,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_ll_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_ll_s1,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_ll_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hh_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_acc_hh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hh_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_acc_hh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hl_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_acc_hl_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hl_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_acc_hl_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_lh_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_acc_lh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_lh_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_acc_lh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_ll_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_acc_ll_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_ll_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_acc_ll_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hh_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_nac_hh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hh_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_nac_hh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hl_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_nac_hl_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hl_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_nac_hl_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_lh_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_nac_lh_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_lh_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_nac_lh_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_ll_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_nac_ll_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_ll_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_mpyud_nac_ll_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_hh_s0,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyud_hh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_hh_s1,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyud_hh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_hl_s0,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyud_hl_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_hl_s1,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyud_hl_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_lh_s0,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyud_lh_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_lh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_lh_s1,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyud_lh_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_lh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_ll_s0,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyud_ll_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_ll_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyud_ll_s1,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyud_ll_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_ll_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpysmi,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpysmi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpysmi">;
-//
-// BUILTIN_INFO(HEXAGON.M2_macsip,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_macsip :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_macsip">;
-//
-// BUILTIN_INFO(HEXAGON.M2_macsin,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_macsin :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_macsin">;
-//
-// BUILTIN_INFO(HEXAGON.M2_dpmpyss_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_dpmpyss_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_dpmpyss_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_dpmpyss_acc_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_dpmpyss_acc_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyss_acc_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_dpmpyss_nac_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_dpmpyss_nac_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyss_nac_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_s0,UDI_ftype_SISI,2)
-//
-def int_hexagon_M2_dpmpyuu_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_dpmpyuu_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_acc_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_dpmpyuu_acc_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyuu_acc_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_nac_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_dpmpyuu_nac_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyuu_nac_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_up,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_up :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_up_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_up_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpy_up_s1_sat,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpy_up_s1_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up_s1_sat">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyu_up,USI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyu_up :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_up">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpysu_up,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpysu_up :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpysu_up">;
-//
-// BUILTIN_INFO(HEXAGON.M2_dpmpyss_rnd_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_dpmpyss_rnd_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_dpmpyss_rnd_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M4_mac_up_s1_sat,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_mac_up_s1_sat :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mac_up_s1_sat">;
-//
-// BUILTIN_INFO(HEXAGON.M4_nac_up_s1_sat,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_nac_up_s1_sat :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_nac_up_s1_sat">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyi,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyi">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mpyui,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_mpyui :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyui">;
-//
-// BUILTIN_INFO(HEXAGON.M2_maci,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_maci :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_maci">;
-//
-// BUILTIN_INFO(HEXAGON.M2_acci,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_acci :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_acci">;
-//
-// BUILTIN_INFO(HEXAGON.M2_accii,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_accii :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_accii">;
-//
-// BUILTIN_INFO(HEXAGON.M2_nacci,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_nacci :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_nacci">;
-//
-// BUILTIN_INFO(HEXAGON.M2_naccii,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_naccii :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_naccii">;
-//
-// BUILTIN_INFO(HEXAGON.M2_subacc,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_subacc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_subacc">;
-//
-// BUILTIN_INFO(HEXAGON.M4_mpyrr_addr,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_mpyrr_addr :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyrr_addr">;
-//
-// BUILTIN_INFO(HEXAGON.M4_mpyri_addr_u2,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_mpyri_addr_u2 :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addr_u2">;
-//
-// BUILTIN_INFO(HEXAGON.M4_mpyri_addr,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_mpyri_addr :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addr">;
-//
-// BUILTIN_INFO(HEXAGON.M4_mpyri_addi,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_mpyri_addi :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addi">;
-//
-// BUILTIN_INFO(HEXAGON.M4_mpyrr_addi,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_mpyrr_addi :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyrr_addi">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_vmpy2s_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_vmpy2s_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmac2s_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_vmac2s_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2s_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmac2s_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_vmac2s_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2s_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmpy2su_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_vmpy2su_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2su_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmpy2su_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_vmpy2su_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2su_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmac2su_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_vmac2su_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2su_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmac2su_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_vmac2su_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2su_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s0pack,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_vmpy2s_s0pack :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s0pack">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s1pack,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_vmpy2s_s1pack :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s1pack">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmac2,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_vmac2 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmpy2es_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vmpy2es_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vmpy2es_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmpy2es_s1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vmpy2es_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vmpy2es_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmac2es_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vmac2es_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmac2es_s1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vmac2es_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vmac2es,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vmac2es :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrmac_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vrmac_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrmac_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrmpy_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vrmpy_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrmpy_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vdmpyrs_s0,SI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vdmpyrs_s0 :
-Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vdmpyrs_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vdmpyrs_s1,SI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vdmpyrs_s1 :
-Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vdmpyrs_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vrmpybuu,DI_ftype_DIDI,2)
-//
-def int_hexagon_M5_vrmpybuu :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vrmpybuu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vrmacbuu,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M5_vrmacbuu :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vrmacbuu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vrmpybsu,DI_ftype_DIDI,2)
-//
-def int_hexagon_M5_vrmpybsu :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vrmpybsu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vrmacbsu,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M5_vrmacbsu :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vrmacbsu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vmpybuu,DI_ftype_SISI,2)
-//
-def int_hexagon_M5_vmpybuu :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M5_vmpybuu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vmpybsu,DI_ftype_SISI,2)
-//
-def int_hexagon_M5_vmpybsu :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M5_vmpybsu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vmacbuu,DI_ftype_DISISI,3)
-//
-def int_hexagon_M5_vmacbuu :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M5_vmacbuu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vmacbsu,DI_ftype_DISISI,3)
-//
-def int_hexagon_M5_vmacbsu :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M5_vmacbsu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vdmpybsu,DI_ftype_DIDI,2)
-//
-def int_hexagon_M5_vdmpybsu :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vdmpybsu">;
-//
-// BUILTIN_INFO(HEXAGON.M5_vdmacbsu,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M5_vdmacbsu :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vdmacbsu">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vdmacs_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vdmacs_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vdmacs_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vdmacs_s1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vdmacs_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vdmacs_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vdmpys_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vdmpys_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vdmpys_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vdmpys_s1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vdmpys_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vdmpys_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpyrs_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpyrs_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrs_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpyrs_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpyrs_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrs_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpyrsc_s0,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpyrsc_s0 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrsc_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpyrsc_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpyrsc_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrsc_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmacs_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cmacs_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacs_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmacs_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cmacs_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacs_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmacsc_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cmacsc_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacsc_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmacsc_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cmacsc_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacsc_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpys_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpys_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpys_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpys_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpys_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpys_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpysc_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpysc_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpysc_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpysc_s1,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpysc_s1 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpysc_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cnacs_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cnacs_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacs_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cnacs_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cnacs_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacs_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cnacsc_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cnacsc_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacsc_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cnacsc_s1,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cnacsc_s1 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacsc_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmpys_s1,DI_ftype_DISI,2)
-//
-def int_hexagon_M2_vrcmpys_s1 :
-Hexagon_di_disi_Intrinsic<"HEXAGON_M2_vrcmpys_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmpys_acc_s1,DI_ftype_DIDISI,3)
-//
-def int_hexagon_M2_vrcmpys_acc_s1 :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_M2_vrcmpys_acc_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmpys_s1rp,SI_ftype_DISI,2)
-//
-def int_hexagon_M2_vrcmpys_s1rp :
-Hexagon_si_disi_Intrinsic<"HEXAGON_M2_vrcmpys_s1rp">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmacls_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmacls_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmacls_s1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmacls_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmachs_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmachs_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmachs_s1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmachs_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyl_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyl_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyl_s1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyl_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyh_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyh_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyh_s1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyh_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmacls_rs0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmacls_rs0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_rs0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmacls_rs1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmacls_rs1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmachs_rs0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmachs_rs0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_rs0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmachs_rs1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmachs_rs1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyl_rs0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyl_rs0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_rs0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyl_rs1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyl_rs1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyh_rs0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyh_rs0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_rs0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyh_rs1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyh_rs1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M4_vrmpyeh_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyeh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_s1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M4_vrmpyeh_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyeh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_acc_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M4_vrmpyeh_acc_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_acc_s1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M4_vrmpyeh_acc_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M4_vrmpyoh_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyoh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_s1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M4_vrmpyoh_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyoh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_acc_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M4_vrmpyoh_acc_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_acc_s1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M4_vrmpyoh_acc_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_hmmpyl_rs1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_hmmpyl_rs1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyl_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_hmmpyh_rs1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_hmmpyh_rs1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyh_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_hmmpyl_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_hmmpyl_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyl_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_hmmpyh_s1,SI_ftype_SISI,2)
-//
-def int_hexagon_M2_hmmpyh_s1 :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmaculs_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmaculs_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmaculs_s1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmaculs_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmacuhs_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmacuhs_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmacuhs_s1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmacuhs_s1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyul_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyul_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyul_s1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyul_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyuh_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyuh_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyuh_s1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyuh_s1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_s1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmaculs_rs0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmaculs_rs0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_rs0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmaculs_rs1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmaculs_rs1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmacuhs_rs0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmacuhs_rs0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_rs0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmacuhs_rs1,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_mmacuhs_rs1 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyul_rs0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyul_rs0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_rs0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyul_rs1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyul_rs1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyuh_rs0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyuh_rs0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_rs0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_mmpyuh_rs1,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_mmpyuh_rs1 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_rs1">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmaci_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vrcmaci_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmaci_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmacr_s0,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vrcmacr_s0 :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmacr_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmaci_s0c,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vrcmaci_s0c :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmaci_s0c">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmacr_s0c,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vrcmacr_s0c :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmacr_s0c">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmaci_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cmaci_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmaci_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmacr_s0,DI_ftype_DISISI,3)
-//
-def int_hexagon_M2_cmacr_s0 :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacr_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmpyi_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vrcmpyi_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyi_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmpyr_s0,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vrcmpyr_s0 :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyr_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmpyi_s0c,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vrcmpyi_s0c :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyi_s0c">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vrcmpyr_s0c,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vrcmpyr_s0c :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyr_s0c">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpyi_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpyi_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpyi_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M2_cmpyr_s0,DI_ftype_SISI,2)
-//
-def int_hexagon_M2_cmpyr_s0 :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpyr_s0">;
-//
-// BUILTIN_INFO(HEXAGON.M4_cmpyi_wh,SI_ftype_DISI,2)
-//
-def int_hexagon_M4_cmpyi_wh :
-Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyi_wh">;
-//
-// BUILTIN_INFO(HEXAGON.M4_cmpyr_wh,SI_ftype_DISI,2)
-//
-def int_hexagon_M4_cmpyr_wh :
-Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyr_wh">;
-//
-// BUILTIN_INFO(HEXAGON.M4_cmpyi_whc,SI_ftype_DISI,2)
+// BUILTIN_INFO(HEXAGON.V6_vscattermhw_add,v_ftype_SISIVDVI,4)
+// tag : V6_vscattermhw_add
+def int_hexagon_V6_vscattermhw_add :
+Hexagon_V65_viiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhw_add">;
+
 //
+// BUILTIN_INFO(HEXAGON.V6_vscattermhw_add_128B,v_ftype_SISIVDVI,4)
+// tag : V6_vscattermhw_add_128B
+def int_hexagon_V6_vscattermhw_add_128B :
+Hexagon_V65_viiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhw_add_128B">;
+
+// Auto-generated intrinsics
+
+// tag : S2_vsatwh
+class Hexagon_i32_i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrmpybusv
+class Hexagon_v16i32_v16i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrmpybusv
+class Hexagon_v32i32_v32i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vaslw_acc
+class Hexagon_v16i32_v16i32v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vaslw_acc
+class Hexagon_v32i32_v32i32v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vmux
+class Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_v16i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vmux
+class Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_v32i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : S2_tableidxd_goodsyntax
+class Hexagon_i32_i32i32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i32_ty,llvm_i32_ty,llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandnqrt_acc
+class Hexagon_v16i32_v16i32v512i1i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v512i1_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandnqrt_acc
+class Hexagon_v32i32_v32i32v1024i1i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v1024i1_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrmpybusi
+class Hexagon_v32i32_v32i32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrmpybusi
+class Hexagon_v64i32_v64i32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vsubb_dv
+class Hexagon_v64i32_v64i32v64i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty],
+       [IntrNoMem]>;
+
+// tag : M2_mpysu_up
+class Hexagon_i32_i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : M2_mpyud_acc_ll_s0
+class Hexagon_i64_i64i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i64_ty,llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : S2_lsr_i_r_nac
+class Hexagon_i32_i32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i32_ty,llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : M2_cmpysc_s0
+class Hexagon_i64_i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_lo
+class Hexagon_v16i32_v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_lo
+class Hexagon_v32i32_v64i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v64i32_ty],
+       [IntrNoMem]>;
+
+// tag : S2_shuffoh
+class Hexagon_i64_i64i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i64_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : F2_sfmax
+class Hexagon_float_floatfloat_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty], [llvm_float_ty,llvm_float_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : A2_vabswsat
+class Hexagon_i64_i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag :
+class Hexagon_v32i32_v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_ldnp0
+class Hexagon_v16i32_i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_ldnp0
+class Hexagon_v32i32_i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vdmpyhb
+class Hexagon_v16i32_v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vdmpyhb
+class Hexagon_v32i32_v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : A4_vcmphgti
+class Hexagon_i32_i64i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i64_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag :
+class Hexagon_v32i32_v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : S6_rol_i_p_or
+class Hexagon_i64_i64i64i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i64_ty,llvm_i64_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vgtuh_and
+class Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v512i1_ty], [llvm_v512i1_ty,llvm_v16i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vgtuh_and
+class Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v1024i1_ty], [llvm_v1024i1_ty,llvm_v32i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : A2_abssat
+class Hexagon_i32_i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : A2_vcmpwgtu
+class Hexagon_i32_i64i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i64_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vtmpybus_acc
+class Hexagon_v64i32_v64i32v64i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_conv_df2uw_chop
+class Hexagon_i32_double_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_double_ty],
+       [IntrNoMem]>;
+
+// tag : V6_pred_or
+class Hexagon_v512i1_v512i1v512i1_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v512i1_ty], [llvm_v512i1_ty,llvm_v512i1_ty],
+       [IntrNoMem]>;
+
+// tag : V6_pred_or
+class Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v1024i1_ty], [llvm_v1024i1_ty,llvm_v1024i1_ty],
+       [IntrNoMem]>;
+
+// tag : S2_asr_i_p_rnd_goodsyntax
+class Hexagon_i64_i64i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i64_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_conv_w2df
+class Hexagon_double_i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_double_ty], [llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vunpackuh
+class Hexagon_v32i32_v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vunpackuh
+class Hexagon_v64i32_v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vadduhw_acc
+class Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vadduhw_acc
+class Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : M2_vdmacs_s0
+class Hexagon_i64_i64i64i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i64_ty,llvm_i64_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrmpybub_rtt_acc
+class Hexagon_v32i32_v32i32v16i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrmpybub_rtt_acc
+class Hexagon_v64i32_v64i32v32i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_ldu0
+class Hexagon_v16i32_i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_ldu0
+class Hexagon_v32i32_i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : S4_extract_rp
+class Hexagon_i32_i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vdmpyhsuisat
+class Hexagon_v16i32_v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vdmpyhsuisat
+class Hexagon_v32i32_v64i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v64i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : A2_addsp
+class Hexagon_i64_i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_extractw
+class Hexagon_i32_v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_extractw
+class Hexagon_i32_v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vlutvwhi
+class Hexagon_v32i32_v16i32v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vlutvwhi
+class Hexagon_v64i32_v32i32v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vgtuh
+class Hexagon_v512i1_v16i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v512i1_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vgtuh
+class Hexagon_v1024i1_v32i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v1024i1_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_sffma_lib
+class Hexagon_float_floatfloatfloat_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty], [llvm_float_ty,llvm_float_ty,llvm_float_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : F2_conv_ud2df
+class Hexagon_double_i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_double_ty], [llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : S2_vzxthw
+class Hexagon_i64_i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vtmpyhb
+class Hexagon_v64i32_v64i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vshufoeh
+class Hexagon_v32i32_v16i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vshufoeh
+class Hexagon_v64i32_v32i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vlut4
+class Hexagon_v16i32_v16i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vlut4
+class Hexagon_v32i32_v32i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag :
+class Hexagon_v16i32_v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_conv_uw2sf
+class Hexagon_float_i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty], [llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vswap
+class Hexagon_v32i32_v512i1v16i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v512i1_ty,llvm_v16i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vswap
+class Hexagon_v64i32_v1024i1v32i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v1024i1_ty,llvm_v32i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandnqrt
+class Hexagon_v16i32_v512i1i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandnqrt
+class Hexagon_v32i32_v1024i1i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vmpyub
+class Hexagon_v64i32_v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : A5_ACS
+class Hexagon_i64i32_i64i64i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty,llvm_i32_ty], [llvm_i64_ty,llvm_i64_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vunpackob
+class Hexagon_v32i32_v32i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vunpackob
+class Hexagon_v64i32_v64i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vmpyhsat_acc
+class Hexagon_v32i32_v32i32v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vmpyhsat_acc
+class Hexagon_v64i32_v64i32v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vaddcarrysat
+class Hexagon_v16i32_v16i32v16i32v512i1_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v512i1_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vaddcarrysat
+class Hexagon_v32i32_v32i32v32i32v1024i1_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v1024i1_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vlutvvb_oracc
+class Hexagon_v16i32_v16i32v16i32v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vlutvvb_oracc
+class Hexagon_v32i32_v32i32v32i32v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrmpybub_rtt
+class Hexagon_v32i32_v16i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrmpybub_rtt
+class Hexagon_v64i32_v32i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : A4_addp_c
+class Hexagon_i64i32_i64i64i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty,llvm_i32_ty], [llvm_i64_ty,llvm_i64_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrsadubi_acc
+class Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vrsadubi_acc
+class Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty,llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_conv_df2sf
+class Hexagon_float_double_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty], [llvm_double_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandvqv
+class Hexagon_v16i32_v512i1v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandvqv
+class Hexagon_v32i32_v1024i1v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : C2_vmux
+class Hexagon_i64_i32i64i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i32_ty,llvm_i64_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : F2_sfcmpeq
+class Hexagon_i32_floatfloat_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_float_ty,llvm_float_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : V6_vmpahhsat
+class Hexagon_v16i32_v16i32v16i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vmpahhsat
+class Hexagon_v32i32_v32i32v32i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandvrt
+class Hexagon_v512i1_v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v512i1_ty], [llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandvrt
+class Hexagon_v1024i1_v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v1024i1_ty], [llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vsubcarry
+class Hexagon_custom_v16i32v512i1_v16i32v16i32v512i1_Intrinsic
+  : Hexagon_NonGCC_Intrinsic<
+       [llvm_v16i32_ty,llvm_v512i1_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v512i1_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vsubcarry
+class Hexagon_custom_v32i32v1024i1_v32i32v32i32v1024i1_Intrinsic_128B
+  : Hexagon_NonGCC_Intrinsic<
+       [llvm_v32i32_ty,llvm_v1024i1_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v1024i1_ty],
+       [IntrNoMem]>;
+
+// tag : F2_sffixupr
+class Hexagon_float_float_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty], [llvm_float_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : V6_vandvrt_acc
+class Hexagon_v512i1_v512i1v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v512i1_ty], [llvm_v512i1_ty,llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vandvrt_acc
+class Hexagon_v1024i1_v1024i1v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v1024i1_ty], [llvm_v1024i1_ty,llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_dfsub
+class Hexagon_double_doubledouble_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_double_ty], [llvm_double_ty,llvm_double_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : V6_vmpyowh_sacc
+class Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v16i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vmpyowh_sacc
+class Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v32i32_ty],
+       [IntrNoMem]>;
+
+// tag : S2_insertp
+class Hexagon_i64_i64i64i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_i64_ty,llvm_i64_ty,llvm_i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_sfinvsqrta
+class Hexagon_floati32_float_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty,llvm_i32_ty], [llvm_float_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : V6_vtran2x2_map
+class Hexagon_v16i32v16i32_v16i32v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty,llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vtran2x2_map
+class Hexagon_v32i32v32i32_v32i32v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty,llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vlutvwh_oracc
+class Hexagon_v32i32_v32i32v16i32v16i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vlutvwh_oracc
+class Hexagon_v64i32_v64i32v32i32v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_dfcmpge
+class Hexagon_i32_doubledouble_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_double_ty,llvm_double_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : F2_conv_df2d_chop
+class Hexagon_i64_double_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_double_ty],
+       [IntrNoMem]>;
+
+// tag : F2_conv_sf2w
+class Hexagon_i32_float_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_float_ty],
+       [IntrNoMem]>;
+
+// tag : F2_sfclass
+class Hexagon_i32_floati32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_float_ty,llvm_i32_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : F2_conv_sf2ud_chop
+class Hexagon_i64_float_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty], [llvm_float_ty],
+       [IntrNoMem]>;
+
+// tag : V6_pred_scalar2v2
+class Hexagon_v512i1_i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v512i1_ty], [llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_pred_scalar2v2
+class Hexagon_v1024i1_i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v1024i1_ty], [llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_sfrecipa
+class Hexagon_floati32_floatfloat_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty,llvm_i32_ty], [llvm_float_ty,llvm_float_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : V6_vprefixqh
+class Hexagon_v16i32_v512i1_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v512i1_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vprefixqh
+class Hexagon_v32i32_v1024i1_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v1024i1_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vdmpyhisat_acc
+class Hexagon_v16i32_v16i32v32i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vdmpyhisat_acc
+class Hexagon_v32i32_v32i32v64i32i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v64i32_ty,llvm_i32_ty],
+       [IntrNoMem]>;
+
+// tag : F2_conv_ud2sf
+class Hexagon_float_i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty], [llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : F2_conv_sf2df
+class Hexagon_double_float_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_double_ty], [llvm_float_ty],
+       [IntrNoMem]>;
+
+// tag : F2_sffma_sc
+class Hexagon_float_floatfloatfloati32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_float_ty], [llvm_float_ty,llvm_float_ty,llvm_float_ty,llvm_i32_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : F2_dfclass
+class Hexagon_i32_doublei32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_double_ty,llvm_i32_ty],
+       [IntrNoMem, Throws]>;
+
+// tag : V6_vd0
+class Hexagon_v16i32__Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v16i32_ty], [],
+       [IntrNoMem]>;
+
+// tag : V6_vd0
+class Hexagon_v32i32__Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v32i32_ty], [],
+       [IntrNoMem]>;
+
+// tag : V6_vdd0
+class Hexagon_v64i32__Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [],
+       [IntrNoMem]>;
+
+// tag : S2_insert_rp
+class Hexagon_i32_i32i32i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i32_ty], [llvm_i32_ty,llvm_i32_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_vassignp
+class Hexagon_v64i32_v64i32_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v64i32_ty], [llvm_v64i32_ty],
+       [IntrNoMem]>;
+
+// tag : A6_vminub_RdP
+class Hexagon_i64i32_i64i64_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_i64_ty,llvm_i32_ty], [llvm_i64_ty,llvm_i64_ty],
+       [IntrNoMem]>;
+
+// tag : V6_pred_not
+class Hexagon_v512i1_v512i1_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v512i1_ty], [llvm_v512i1_ty],
+       [IntrNoMem]>;
+
+// tag : V6_pred_not
+class Hexagon_v1024i1_v1024i1_Intrinsic<string GCCIntSuffix>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [llvm_v1024i1_ty], [llvm_v1024i1_ty],
+       [IntrNoMem]>;
+
+// V5 Scalar Instructions.
+
+def int_hexagon_S2_asr_r_p_or :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_or">;
+
+def int_hexagon_S2_vsatwh :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vsatwh">;
+
+def int_hexagon_S2_tableidxd_goodsyntax :
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxd_goodsyntax">;
+
+def int_hexagon_M2_mpysu_up :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysu_up">;
+
+def int_hexagon_M2_mpyud_acc_ll_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s0">;
+
+def int_hexagon_M2_mpyud_acc_ll_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s1">;
+
+def int_hexagon_M2_cmpysc_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpysc_s1">;
+
+def int_hexagon_M2_cmpysc_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpysc_s0">;
+
 def int_hexagon_M4_cmpyi_whc :
-Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyi_whc">;
-//
-// BUILTIN_INFO(HEXAGON.M4_cmpyr_whc,SI_ftype_DISI,2)
-//
-def int_hexagon_M4_cmpyr_whc :
-Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyr_whc">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vcmpy_s0_sat_i,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vcmpy_s0_sat_i :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_i">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vcmpy_s0_sat_r,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vcmpy_s0_sat_r :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_r">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vcmpy_s1_sat_i,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vcmpy_s1_sat_i :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_i">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vcmpy_s1_sat_r,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vcmpy_s1_sat_r :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_r">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vcmac_s0_sat_i,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vcmac_s0_sat_i :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_i">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vcmac_s0_sat_r,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M2_vcmac_s0_sat_r :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vcrotate,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_vcrotate :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_vcrotate">;
-//
-// BUILTIN_INFO(HEXAGON.S4_vrcrotate_acc,DI_ftype_DIDISISI,4)
-//
-def int_hexagon_S4_vrcrotate_acc :
-Hexagon_di_didisisi_Intrinsic<"HEXAGON_S4_vrcrotate_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S4_vrcrotate,DI_ftype_DISISI,3)
-//
-def int_hexagon_S4_vrcrotate :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_S4_vrcrotate">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vcnegh,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_vcnegh :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_vcnegh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vrcnegh,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_vrcnegh :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_vrcnegh">;
-//
-// BUILTIN_INFO(HEXAGON.M4_pmpyw,DI_ftype_SISI,2)
-//
-def int_hexagon_M4_pmpyw :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M4_pmpyw">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vpmpyh,DI_ftype_SISI,2)
-//
-def int_hexagon_M4_vpmpyh :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_M4_vpmpyh">;
-//
-// BUILTIN_INFO(HEXAGON.M4_pmpyw_acc,DI_ftype_DISISI,3)
-//
-def int_hexagon_M4_pmpyw_acc :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M4_pmpyw_acc">;
-//
-// BUILTIN_INFO(HEXAGON.M4_vpmpyh_acc,DI_ftype_DISISI,3)
-//
-def int_hexagon_M4_vpmpyh_acc :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_M4_vpmpyh_acc">;
-//
-// BUILTIN_INFO(HEXAGON.A2_add,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_add :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_add">;
-//
-// BUILTIN_INFO(HEXAGON.A2_sub,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_sub :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_sub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addsat,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addsat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addsat">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subsat,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subsat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subsat">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addi,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addi">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_l16_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_l16_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_l16_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_l16_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_l16_sat_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_l16_sat_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_sat_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_l16_sat_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_l16_sat_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_sat_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_l16_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_l16_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_l16_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_l16_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_l16_sat_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_l16_sat_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_sat_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_l16_sat_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_l16_sat_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_sat_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_h16_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_h16_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_h16_lh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_h16_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_lh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_h16_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_h16_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_h16_hh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_h16_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_hh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_h16_sat_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_lh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_h16_sat_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_lh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_h16_sat_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_hh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_addh_h16_sat_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_hh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_h16_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_h16_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_h16_lh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_h16_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_lh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_h16_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_h16_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_h16_hh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_h16_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_hh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_h16_sat_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_lh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_h16_sat_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_lh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_h16_sat_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_hh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subh_h16_sat_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_hh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_aslh,SI_ftype_SI,1)
-//
-def int_hexagon_A2_aslh :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_aslh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_asrh,SI_ftype_SI,1)
-//
-def int_hexagon_A2_asrh :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_asrh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_addp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_addp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addpsat,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_addpsat :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_addpsat">;
-//
-// BUILTIN_INFO(HEXAGON.A2_addsp,DI_ftype_SIDI,2)
-//
-def int_hexagon_A2_addsp :
-Hexagon_di_sidi_Intrinsic<"HEXAGON_A2_addsp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_subp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_subp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_neg,SI_ftype_SI,1)
-//
-def int_hexagon_A2_neg :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_neg">;
-//
-// BUILTIN_INFO(HEXAGON.A2_negsat,SI_ftype_SI,1)
-//
-def int_hexagon_A2_negsat :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_negsat">;
-//
-// BUILTIN_INFO(HEXAGON.A2_abs,SI_ftype_SI,1)
-//
-def int_hexagon_A2_abs :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_abs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_abssat,SI_ftype_SI,1)
-//
-def int_hexagon_A2_abssat :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_abssat">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vconj,DI_ftype_DI,1)
-//
-def int_hexagon_A2_vconj :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_vconj">;
-//
-// BUILTIN_INFO(HEXAGON.A2_negp,DI_ftype_DI,1)
-//
-def int_hexagon_A2_negp :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_negp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_absp,DI_ftype_DI,1)
-//
-def int_hexagon_A2_absp :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_absp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_max,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_max :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_max">;
-//
-// BUILTIN_INFO(HEXAGON.A2_maxu,USI_ftype_SISI,2)
-//
-def int_hexagon_A2_maxu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_maxu">;
-//
-// BUILTIN_INFO(HEXAGON.A2_min,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_min :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_min">;
-//
-// BUILTIN_INFO(HEXAGON.A2_minu,USI_ftype_SISI,2)
-//
-def int_hexagon_A2_minu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_minu">;
-//
-// BUILTIN_INFO(HEXAGON.A2_maxp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_maxp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_maxp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_maxup,UDI_ftype_DIDI,2)
-//
-def int_hexagon_A2_maxup :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_maxup">;
-//
-// BUILTIN_INFO(HEXAGON.A2_minp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_minp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_minp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_minup,UDI_ftype_DIDI,2)
-//
-def int_hexagon_A2_minup :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_minup">;
-//
-// BUILTIN_INFO(HEXAGON.A2_tfr,SI_ftype_SI,1)
-//
-def int_hexagon_A2_tfr :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_tfr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_tfrsi,SI_ftype_SI,1)
-//
-def int_hexagon_A2_tfrsi :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_tfrsi">;
-//
-// BUILTIN_INFO(HEXAGON.A2_tfrp,DI_ftype_DI,1)
-//
-def int_hexagon_A2_tfrp :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_tfrp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_tfrpi,DI_ftype_SI,1)
-//
-def int_hexagon_A2_tfrpi :
-Hexagon_di_si_Intrinsic<"HEXAGON_A2_tfrpi">;
-//
-// BUILTIN_INFO(HEXAGON.A2_zxtb,SI_ftype_SI,1)
-//
-def int_hexagon_A2_zxtb :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_zxtb">;
-//
-// BUILTIN_INFO(HEXAGON.A2_sxtb,SI_ftype_SI,1)
-//
-def int_hexagon_A2_sxtb :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_sxtb">;
-//
-// BUILTIN_INFO(HEXAGON.A2_zxth,SI_ftype_SI,1)
-//
-def int_hexagon_A2_zxth :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_zxth">;
-//
-// BUILTIN_INFO(HEXAGON.A2_sxth,SI_ftype_SI,1)
-//
-def int_hexagon_A2_sxth :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_sxth">;
-//
-// BUILTIN_INFO(HEXAGON.A2_combinew,DI_ftype_SISI,2)
-//
-def int_hexagon_A2_combinew :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_A2_combinew">;
-//
-// BUILTIN_INFO(HEXAGON.A4_combineri,DI_ftype_SISI,2)
-//
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M4_cmpyi_whc">;
+
+def int_hexagon_M2_mpy_sat_rnd_lh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s1">;
+
+def int_hexagon_M2_mpy_sat_rnd_lh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s0">;
+
+def int_hexagon_S2_tableidxb_goodsyntax :
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxb_goodsyntax">;
+
+def int_hexagon_S2_shuffoh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_shuffoh">;
+
+def int_hexagon_F2_sfmax :
+Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfmax">;
+
+def int_hexagon_A2_vabswsat :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vabswsat">;
+
+def int_hexagon_S2_asr_i_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r">;
+
+def int_hexagon_S2_asr_i_p :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p">;
+
 def int_hexagon_A4_combineri :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_combineri">;
-//
-// BUILTIN_INFO(HEXAGON.A4_combineir,DI_ftype_SISI,2)
-//
-def int_hexagon_A4_combineir :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_combineir">;
-//
-// BUILTIN_INFO(HEXAGON.A2_combineii,DI_ftype_SISI,2)
-//
-def int_hexagon_A2_combineii :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_A2_combineii">;
-//
-// BUILTIN_INFO(HEXAGON.A2_combine_hh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_combine_hh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_hh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_combine_hl,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_combine_hl :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_hl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_combine_lh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_combine_lh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_lh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_combine_ll,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_combine_ll :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_ll">;
-//
-// BUILTIN_INFO(HEXAGON.A2_tfril,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_tfril :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_tfril">;
-//
-// BUILTIN_INFO(HEXAGON.A2_tfrih,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_tfrih :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_tfrih">;
-//
-// BUILTIN_INFO(HEXAGON.A2_and,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_and :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_and">;
-//
-// BUILTIN_INFO(HEXAGON.A2_or,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_or :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_or">;
-//
-// BUILTIN_INFO(HEXAGON.A2_xor,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_xor :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_xor">;
-//
-// BUILTIN_INFO(HEXAGON.A2_not,SI_ftype_SI,1)
-//
-def int_hexagon_A2_not :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_not">;
-//
-// BUILTIN_INFO(HEXAGON.M2_xor_xacc,SI_ftype_SISISI,3)
-//
-def int_hexagon_M2_xor_xacc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_xor_xacc">;
-//
-// BUILTIN_INFO(HEXAGON.M4_xor_xacc,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_M4_xor_xacc :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_xor_xacc">;
-//
-// BUILTIN_INFO(HEXAGON.A4_andn,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_andn :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_andn">;
-//
-// BUILTIN_INFO(HEXAGON.A4_orn,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_orn :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_orn">;
-//
-// BUILTIN_INFO(HEXAGON.A4_andnp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A4_andnp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A4_andnp">;
-//
-// BUILTIN_INFO(HEXAGON.A4_ornp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A4_ornp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A4_ornp">;
-//
-// BUILTIN_INFO(HEXAGON.S4_addaddi,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_addaddi :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addaddi">;
-//
-// BUILTIN_INFO(HEXAGON.S4_subaddi,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_subaddi :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subaddi">;
-//
-// BUILTIN_INFO(HEXAGON.M4_and_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_and_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_and">;
-//
-// BUILTIN_INFO(HEXAGON.M4_and_andn,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_and_andn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_andn">;
-//
-// BUILTIN_INFO(HEXAGON.M4_and_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_and_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_or">;
-//
-// BUILTIN_INFO(HEXAGON.M4_and_xor,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_and_xor :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_xor">;
-//
-// BUILTIN_INFO(HEXAGON.M4_or_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_or_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_and">;
-//
-// BUILTIN_INFO(HEXAGON.M4_or_andn,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_or_andn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_andn">;
-//
-// BUILTIN_INFO(HEXAGON.M4_or_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_or_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_or">;
-//
-// BUILTIN_INFO(HEXAGON.M4_or_xor,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_or_xor :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_xor">;
-//
-// BUILTIN_INFO(HEXAGON.S4_or_andix,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_or_andix :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_andix">;
-//
-// BUILTIN_INFO(HEXAGON.S4_or_andi,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_or_andi :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_andi">;
-//
-// BUILTIN_INFO(HEXAGON.S4_or_ori,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_or_ori :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_ori">;
-//
-// BUILTIN_INFO(HEXAGON.M4_xor_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_xor_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_and">;
-//
-// BUILTIN_INFO(HEXAGON.M4_xor_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_xor_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_or">;
-//
-// BUILTIN_INFO(HEXAGON.M4_xor_andn,SI_ftype_SISISI,3)
-//
-def int_hexagon_M4_xor_andn :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_andn">;
-//
-// BUILTIN_INFO(HEXAGON.A2_subri,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_subri :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subri">;
-//
-// BUILTIN_INFO(HEXAGON.A2_andir,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_andir :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_andir">;
-//
-// BUILTIN_INFO(HEXAGON.A2_orir,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_orir :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_orir">;
-//
-// BUILTIN_INFO(HEXAGON.A2_andp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_andp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_andp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_orp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_orp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_orp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_xorp,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_xorp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_xorp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_notp,DI_ftype_DI,1)
-//
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineri">;
+
+def int_hexagon_M2_mpy_nac_sat_hl_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s1">;
+
+def int_hexagon_M4_vpmpyh_acc :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M4_vpmpyh_acc">;
+
+def int_hexagon_M2_vcmpy_s0_sat_i :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_i">;
+
 def int_hexagon_A2_notp :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_notp">;
-//
-// BUILTIN_INFO(HEXAGON.A2_sxtw,DI_ftype_SI,1)
-//
-def int_hexagon_A2_sxtw :
-Hexagon_di_si_Intrinsic<"HEXAGON_A2_sxtw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_sat,SI_ftype_DI,1)
-//
-def int_hexagon_A2_sat :
-Hexagon_si_di_Intrinsic<"HEXAGON_A2_sat">;
-//
-// BUILTIN_INFO(HEXAGON.A2_roundsat,SI_ftype_DI,1)
-//
-def int_hexagon_A2_roundsat :
-Hexagon_si_di_Intrinsic<"HEXAGON_A2_roundsat">;
-//
-// BUILTIN_INFO(HEXAGON.A2_sath,SI_ftype_SI,1)
-//
-def int_hexagon_A2_sath :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_sath">;
-//
-// BUILTIN_INFO(HEXAGON.A2_satuh,SI_ftype_SI,1)
-//
-def int_hexagon_A2_satuh :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_satuh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_satub,SI_ftype_SI,1)
-//
-def int_hexagon_A2_satub :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_satub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_satb,SI_ftype_SI,1)
-//
-def int_hexagon_A2_satb :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_satb">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vaddub,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vaddub :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vaddb_map,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vaddb_map :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddb_map">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vaddubs,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vaddubs :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddubs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vaddh,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vaddh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vaddhs,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vaddhs :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddhs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vadduhs,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vadduhs :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vadduhs">;
-//
-// BUILTIN_INFO(HEXAGON.A5_vaddhubs,SI_ftype_DIDI,2)
-//
-def int_hexagon_A5_vaddhubs :
-Hexagon_si_didi_Intrinsic<"HEXAGON_A5_vaddhubs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vaddw,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vaddw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vaddws,DI_ftype_DIDI,2)
-//
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_notp">;
+
+def int_hexagon_M2_mpy_hl_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_hl_s1">;
+
+def int_hexagon_M2_mpy_hl_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_hl_s0">;
+
+def int_hexagon_C4_or_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_or_and">;
+
+def int_hexagon_M2_vmac2s_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2s_s0">;
+
+def int_hexagon_M2_vmac2s_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2s_s1">;
+
+def int_hexagon_S2_brevp :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_brevp">;
+
+def int_hexagon_M4_pmpyw_acc :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M4_pmpyw_acc">;
+
+def int_hexagon_S2_cl1 :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_cl1">;
+
+def int_hexagon_C4_cmplte :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmplte">;
+
+def int_hexagon_M2_mmpyul_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyul_s0">;
+
 def int_hexagon_A2_vaddws :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddws">;
-//
-// BUILTIN_INFO(HEXAGON.S4_vxaddsubw,DI_ftype_DIDI,2)
-//
-def int_hexagon_S4_vxaddsubw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubw">;
-//
-// BUILTIN_INFO(HEXAGON.S4_vxsubaddw,DI_ftype_DIDI,2)
-//
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddws">;
+
+def int_hexagon_A2_maxup :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_maxup">;
+
+def int_hexagon_A4_vcmphgti :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmphgti">;
+
+def int_hexagon_S2_interleave :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_interleave">;
+
+def int_hexagon_M2_vrcmpyi_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrcmpyi_s0">;
+
+def int_hexagon_A2_abssat :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_abssat">;
+
+def int_hexagon_A2_vcmpwgtu :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpwgtu">;
+
+def int_hexagon_C2_cmpgtu :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgtu">;
+
+def int_hexagon_C2_cmpgtp :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_C2_cmpgtp">;
+
+def int_hexagon_A4_cmphgtui :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgtui">;
+
+def int_hexagon_C2_cmpgti :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgti">;
+
+def int_hexagon_M2_mpyi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyi">;
+
+def int_hexagon_F2_conv_df2uw_chop :
+Hexagon_i32_double_Intrinsic<"HEXAGON_F2_conv_df2uw_chop">;
+
+def int_hexagon_A4_cmpheq :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpheq">;
+
+def int_hexagon_M2_mpy_lh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_lh_s1">;
+
+def int_hexagon_M2_mpy_lh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_lh_s0">;
+
+def int_hexagon_S2_lsr_i_r_xacc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_xacc">;
+
+def int_hexagon_S2_vrcnegh :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vrcnegh">;
+
+def int_hexagon_S2_extractup :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S2_extractup">;
+
+def int_hexagon_S2_asr_i_p_rnd_goodsyntax :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_rnd_goodsyntax">;
+
+def int_hexagon_S4_ntstbit_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_ntstbit_r">;
+
+def int_hexagon_F2_conv_w2sf :
+Hexagon_float_i32_Intrinsic<"HEXAGON_F2_conv_w2sf">;
+
+def int_hexagon_C2_not :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_not">;
+
+def int_hexagon_C2_tfrpr :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_tfrpr">;
+
+def int_hexagon_M2_mpy_ll_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_ll_s1">;
+
+def int_hexagon_M2_mpy_ll_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_ll_s0">;
+
+def int_hexagon_A4_cmpbgt :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgt">;
+
+def int_hexagon_S2_asr_r_r_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_and">;
+
+def int_hexagon_A4_rcmpneqi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpneqi">;
+
+def int_hexagon_S2_asl_i_r_nac :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_nac">;
+
+def int_hexagon_M2_subacc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_subacc">;
+
+def int_hexagon_A2_orp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_orp">;
+
+def int_hexagon_M2_mpyu_up :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_up">;
+
+def int_hexagon_M2_mpy_acc_sat_lh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s1">;
+
+def int_hexagon_S2_asr_i_vh :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_vh">;
+
+def int_hexagon_S2_asr_i_vw :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_vw">;
+
+def int_hexagon_A4_cmpbgtu :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgtu">;
+
+def int_hexagon_A4_vcmpbeq_any :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A4_vcmpbeq_any">;
+
+def int_hexagon_A4_cmpbgti :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgti">;
+
+def int_hexagon_M2_mpyd_lh_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_lh_s1">;
+
+def int_hexagon_S2_asl_r_p_nac :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_nac">;
+
+def int_hexagon_S2_lsr_i_r_nac :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_nac">;
+
+def int_hexagon_A2_addsp :
+Hexagon_i64_i32i64_Intrinsic<"HEXAGON_A2_addsp">;
+
 def int_hexagon_S4_vxsubaddw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddw">;
-//
-// BUILTIN_INFO(HEXAGON.S4_vxaddsubh,DI_ftype_DIDI,2)
-//
-def int_hexagon_S4_vxaddsubh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubh">;
-//
-// BUILTIN_INFO(HEXAGON.S4_vxsubaddh,DI_ftype_DIDI,2)
-//
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxsubaddw">;
+
+def int_hexagon_A4_vcmpheqi :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpheqi">;
+
 def int_hexagon_S4_vxsubaddh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddh">;
-//
-// BUILTIN_INFO(HEXAGON.S4_vxaddsubhr,DI_ftype_DIDI,2)
-//
-def int_hexagon_S4_vxaddsubhr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubhr">;
-//
-// BUILTIN_INFO(HEXAGON.S4_vxsubaddhr,DI_ftype_DIDI,2)
-//
-def int_hexagon_S4_vxsubaddhr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddhr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svavgh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svavgh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svavgh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svavghs,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svavghs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svavghs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svnavgh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svnavgh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svnavgh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svaddh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svaddh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svaddh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svaddhs,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svaddhs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svaddhs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svadduhs,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svadduhs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svadduhs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svsubh,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svsubh :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svsubhs,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svsubhs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubhs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_svsubuhs,SI_ftype_SISI,2)
-//
-def int_hexagon_A2_svsubuhs :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubuhs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vraddub,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vraddub :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vraddub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vraddub_acc,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_A2_vraddub_acc :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_A2_vraddub_acc">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vraddh,SI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vraddh :
-Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vraddh">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vradduh,SI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vradduh :
-Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vradduh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vsubub,DI_ftype_DIDI,2)
-//
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxsubaddh">;
+
+def int_hexagon_M4_pmpyw :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M4_pmpyw">;
+
+def int_hexagon_S2_vsathb :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vsathb">;
+
+def int_hexagon_S2_asr_r_p_and :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_and">;
+
+def int_hexagon_M2_mpyu_acc_lh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s1">;
+
+def int_hexagon_M2_mpyu_acc_lh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s0">;
+
+def int_hexagon_S2_lsl_r_p_acc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_acc">;
+
+def int_hexagon_A2_pxorf :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_A2_pxorf">;
+
+def int_hexagon_C2_cmpgei :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgei">;
+
 def int_hexagon_A2_vsubub :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vsubb_map,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vsubb_map :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubb_map">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vsububs,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vsububs :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsububs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vsubh,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vsubh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vsubhs,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vsubhs :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubhs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vsubuhs,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vsubuhs :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubuhs">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vsubw,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vsubw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vsubws,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vsubws :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubws">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vabsh,DI_ftype_DI,1)
-//
-def int_hexagon_A2_vabsh :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabsh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vabshsat,DI_ftype_DI,1)
-//
-def int_hexagon_A2_vabshsat :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabshsat">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vabsw,DI_ftype_DI,1)
-//
-def int_hexagon_A2_vabsw :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabsw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vabswsat,DI_ftype_DI,1)
-//
-def int_hexagon_A2_vabswsat :
-Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabswsat">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vabsdiffw,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vabsdiffw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vabsdiffw">;
-//
-// BUILTIN_INFO(HEXAGON.M2_vabsdiffh,DI_ftype_DIDI,2)
-//
-def int_hexagon_M2_vabsdiffh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vabsdiffh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vrsadub,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vrsadub :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vrsadub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vrsadub_acc,DI_ftype_DIDIDI,3)
-//
-def int_hexagon_A2_vrsadub_acc :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_A2_vrsadub_acc">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavgub,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavgub :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavguh,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavguh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavgh,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavgh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vnavgh,DI_ftype_DIDI,2)
-//
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubub">;
+
+def int_hexagon_S2_asl_i_p :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_p">;
+
+def int_hexagon_S2_asl_i_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_i_r">;
+
+def int_hexagon_A4_vrminuw :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrminuw">;
+
+def int_hexagon_F2_sffma :
+Hexagon_float_floatfloatfloat_Intrinsic<"HEXAGON_F2_sffma">;
+
+def int_hexagon_A2_absp :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_absp">;
+
+def int_hexagon_C2_all8 :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_all8">;
+
+def int_hexagon_A4_vrminuh :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrminuh">;
+
+def int_hexagon_F2_sffma_lib :
+Hexagon_float_floatfloatfloat_Intrinsic<"HEXAGON_F2_sffma_lib">;
+
+def int_hexagon_M4_vrmpyoh_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M4_vrmpyoh_s0">;
+
+def int_hexagon_M4_vrmpyoh_s1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M4_vrmpyoh_s1">;
+
+def int_hexagon_C2_bitsset :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_bitsset">;
+
+def int_hexagon_M2_mpysip :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysip">;
+
+def int_hexagon_M2_mpysin :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysin">;
+
+def int_hexagon_A4_boundscheck :
+Hexagon_i32_i32i64_Intrinsic<"HEXAGON_A4_boundscheck">;
+
+def int_hexagon_M5_vrmpybuu :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M5_vrmpybuu">;
+
+def int_hexagon_C4_fastcorner9 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_fastcorner9">;
+
+def int_hexagon_M2_vrcmpys_s1rp :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M2_vrcmpys_s1rp">;
+
+def int_hexagon_A2_neg :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_neg">;
+
+def int_hexagon_A2_subsat :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subsat">;
+
+def int_hexagon_S2_asl_r_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_r_r">;
+
+def int_hexagon_S2_asl_r_p :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_r_p">;
+
 def int_hexagon_A2_vnavgh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavgw,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavgw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vnavgw,DI_ftype_DIDI,2)
-//
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavgh">;
+
+def int_hexagon_M2_mpy_nac_sat_hl_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s0">;
+
+def int_hexagon_F2_conv_ud2df :
+Hexagon_double_i64_Intrinsic<"HEXAGON_F2_conv_ud2df">;
+
 def int_hexagon_A2_vnavgw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavgwr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavgwr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgwr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vnavgwr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vnavgwr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgwr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavgwcr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavgwcr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgwcr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vnavgwcr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vnavgwcr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgwcr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavghcr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavghcr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavghcr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vnavghcr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vnavghcr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavghcr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavguw,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavguw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavguwr,DI_ftype_DIDI,2)
-//
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavgw">;
+
+def int_hexagon_S2_asl_i_r_acc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_acc">;
+
+def int_hexagon_S4_subi_lsr_ri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subi_lsr_ri">;
+
+def int_hexagon_S2_vzxthw :
+Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vzxthw">;
+
+def int_hexagon_F2_sfadd :
+Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfadd">;
+
+def int_hexagon_A2_sub :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_sub">;
+
+def int_hexagon_M2_vmac2su_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2su_s0">;
+
+def int_hexagon_M2_vmac2su_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2su_s1">;
+
+def int_hexagon_M2_dpmpyss_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_s0">;
+
+def int_hexagon_S2_insert :
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_insert">;
+
+def int_hexagon_S2_packhl :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_S2_packhl">;
+
+def int_hexagon_A4_vcmpwgti :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpwgti">;
+
 def int_hexagon_A2_vavguwr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguwr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavgubr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavgubr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgubr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavguhr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavguhr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguhr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vavghr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vavghr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavghr">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vnavghr,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vnavghr :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavghr">;
-//
-// BUILTIN_INFO(HEXAGON.A4_round_ri,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_round_ri :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_ri">;
-//
-// BUILTIN_INFO(HEXAGON.A4_round_rr,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_round_rr :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_rr">;
-//
-// BUILTIN_INFO(HEXAGON.A4_round_ri_sat,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_round_ri_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_ri_sat">;
-//
-// BUILTIN_INFO(HEXAGON.A4_round_rr_sat,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_round_rr_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_rr_sat">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cround_ri,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_cround_ri :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cround_ri">;
-//
-// BUILTIN_INFO(HEXAGON.A4_cround_rr,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_cround_rr :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cround_rr">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vrminh,DI_ftype_DIDISI,3)
-//
-def int_hexagon_A4_vrminh :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminh">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vrmaxh,DI_ftype_DIDISI,3)
-//
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavguwr">;
+
+def int_hexagon_S2_asl_r_r_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_and">;
+
+def int_hexagon_A2_svsubhs :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svsubhs">;
+
+def int_hexagon_A2_addh_l16_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_hl">;
+
+def int_hexagon_M4_and_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_and">;
+
+def int_hexagon_F2_conv_d2df :
+Hexagon_double_i64_Intrinsic<"HEXAGON_F2_conv_d2df">;
+
+def int_hexagon_C2_cmpgtui :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgtui">;
+
+def int_hexagon_A2_vconj :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vconj">;
+
+def int_hexagon_S2_lsr_r_vw :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_r_vw">;
+
+def int_hexagon_S2_lsr_r_vh :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_r_vh">;
+
+def int_hexagon_A2_subh_l16_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_l16_hl">;
+
+def int_hexagon_S4_vxsubaddhr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxsubaddhr">;
+
+def int_hexagon_S2_clbp :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_clbp">;
+
+def int_hexagon_S2_deinterleave :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_deinterleave">;
+
+def int_hexagon_C2_any8 :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_any8">;
+
+def int_hexagon_S2_togglebit_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_togglebit_r">;
+
+def int_hexagon_S2_togglebit_i :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_togglebit_i">;
+
+def int_hexagon_F2_conv_uw2sf :
+Hexagon_float_i32_Intrinsic<"HEXAGON_F2_conv_uw2sf">;
+
+def int_hexagon_S2_vsathb_nopack :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_vsathb_nopack">;
+
+def int_hexagon_M2_cmacs_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacs_s0">;
+
+def int_hexagon_M2_cmacs_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacs_s1">;
+
+def int_hexagon_M2_mpy_sat_hh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s0">;
+
+def int_hexagon_M2_mpy_sat_hh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s1">;
+
+def int_hexagon_M2_mmacuhs_s1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacuhs_s1">;
+
+def int_hexagon_M2_mmacuhs_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacuhs_s0">;
+
+def int_hexagon_S2_clrbit_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_clrbit_r">;
+
+def int_hexagon_C4_or_andn :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_or_andn">;
+
+def int_hexagon_S2_asl_r_r_nac :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_nac">;
+
+def int_hexagon_S2_asl_i_p_acc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_acc">;
+
+def int_hexagon_A4_vcmpwgtui :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpwgtui">;
+
+def int_hexagon_M4_vrmpyoh_acc_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s0">;
+
+def int_hexagon_M4_vrmpyoh_acc_s1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s1">;
+
 def int_hexagon_A4_vrmaxh :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxh">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vrminuh,DI_ftype_DIDISI,3)
-//
-def int_hexagon_A4_vrminuh :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminuh">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vrmaxuh,DI_ftype_DIDISI,3)
-//
-def int_hexagon_A4_vrmaxuh :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxuh">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vrminw,DI_ftype_DIDISI,3)
-//
-def int_hexagon_A4_vrminw :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminw">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vrmaxw,DI_ftype_DIDISI,3)
-//
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrmaxh">;
+
+def int_hexagon_A2_vcmpbeq :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpbeq">;
+
+def int_hexagon_A2_vcmphgt :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmphgt">;
+
+def int_hexagon_A2_vnavgwcr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavgwcr">;
+
+def int_hexagon_M2_vrcmacr_s0c :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrcmacr_s0c">;
+
+def int_hexagon_A2_vavgwcr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgwcr">;
+
+def int_hexagon_S2_asl_i_p_xacc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_xacc">;
+
 def int_hexagon_A4_vrmaxw :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxw">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vrminuw,DI_ftype_DIDISI,3)
-//
-def int_hexagon_A4_vrminuw :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminuw">;
-//
-// BUILTIN_INFO(HEXAGON.A4_vrmaxuw,DI_ftype_DIDISI,3)
-//
-def int_hexagon_A4_vrmaxuw :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxuw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vminb,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vminb :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminb">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vmaxb,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vmaxb :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxb">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vminub,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vminub :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vmaxub,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vmaxub :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxub">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vminh,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vminh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vmaxh,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vmaxh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vminuh,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vminuh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminuh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vmaxuh,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vmaxuh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxuh">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vminw,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vminw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vmaxw,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vmaxw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vminuw,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vminuw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminuw">;
-//
-// BUILTIN_INFO(HEXAGON.A2_vmaxuw,DI_ftype_DIDI,2)
-//
-def int_hexagon_A2_vmaxuw :
-Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxuw">;
-//
-// BUILTIN_INFO(HEXAGON.A4_modwrapu,SI_ftype_SISI,2)
-//
-def int_hexagon_A4_modwrapu :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_modwrapu">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfadd,SF_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfadd :
-Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfadd">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfsub,SF_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfsub :
-Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfsub">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfmpy,SF_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfmpy :
-Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmpy">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sffma,SF_ftype_SFSFSF,3)
-//
-def int_hexagon_F2_sffma :
-Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffma">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sffma_sc,SF_ftype_SFSFSFQI,4)
-//
-def int_hexagon_F2_sffma_sc :
-Hexagon_sf_sfsfsfqi_Intrinsic<"HEXAGON_F2_sffma_sc">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sffms,SF_ftype_SFSFSF,3)
-//
-def int_hexagon_F2_sffms :
-Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffms">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sffma_lib,SF_ftype_SFSFSF,3)
-//
-def int_hexagon_F2_sffma_lib :
-Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffma_lib">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sffms_lib,SF_ftype_SFSFSF,3)
-//
-def int_hexagon_F2_sffms_lib :
-Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffms_lib">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfcmpeq,QI_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfcmpeq :
-Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpeq">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfcmpgt,QI_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfcmpgt :
-Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpgt">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfcmpge,QI_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfcmpge :
-Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpge">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfcmpuo,QI_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfcmpuo :
-Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpuo">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfmax,SF_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfmax :
-Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmax">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfmin,SF_ftype_SFSF,2)
-//
-def int_hexagon_F2_sfmin :
-Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmin">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfclass,QI_ftype_SFSI,2)
-//
-def int_hexagon_F2_sfclass :
-Hexagon_si_sfsi_Intrinsic<"HEXAGON_F2_sfclass">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfimm_p,SF_ftype_SI,1)
-//
-def int_hexagon_F2_sfimm_p :
-Hexagon_sf_si_Intrinsic<"HEXAGON_F2_sfimm_p">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sfimm_n,SF_ftype_SI,1)
-//
-def int_hexagon_F2_sfimm_n :
-Hexagon_sf_si_Intrinsic<"HEXAGON_F2_sfimm_n">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sffixupn,SF_ftype_SFSF,2)
-//
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrmaxw">;
+
+def int_hexagon_A2_vnavghr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavghr">;
+
+def int_hexagon_M4_cmpyi_wh :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M4_cmpyi_wh">;
+
+def int_hexagon_A2_tfrsi :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfrsi">;
+
+def int_hexagon_S2_asr_i_r_acc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_acc">;
+
+def int_hexagon_A2_svnavgh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svnavgh">;
+
+def int_hexagon_S2_lsr_i_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r">;
+
+def int_hexagon_M2_vmac2 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2">;
+
+def int_hexagon_A4_vcmphgtui :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmphgtui">;
+
+def int_hexagon_A2_svavgh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svavgh">;
+
+def int_hexagon_M4_vrmpyeh_acc_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s0">;
+
+def int_hexagon_M4_vrmpyeh_acc_s1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s1">;
+
+def int_hexagon_S2_lsr_i_p :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p">;
+
+def int_hexagon_A2_combine_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_hl">;
+
+def int_hexagon_M2_mpy_up :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_up">;
+
+def int_hexagon_A2_combine_hh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_hh">;
+
+def int_hexagon_A2_negsat :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_negsat">;
+
+def int_hexagon_M2_mpyd_hl_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_hl_s0">;
+
+def int_hexagon_M2_mpyd_hl_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_hl_s1">;
+
+def int_hexagon_A4_bitsplit :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_bitsplit">;
+
+def int_hexagon_A2_vabshsat :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vabshsat">;
+
+def int_hexagon_M2_mpyui :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyui">;
+
+def int_hexagon_A2_addh_l16_sat_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_sat_ll">;
+
+def int_hexagon_S2_lsl_r_r_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r_and">;
+
+def int_hexagon_M2_mmpyul_rs0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyul_rs0">;
+
+def int_hexagon_S2_asr_i_r_rnd_goodsyntax :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_rnd_goodsyntax">;
+
+def int_hexagon_S2_lsr_r_p_nac :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_nac">;
+
+def int_hexagon_C2_cmplt :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmplt">;
+
+def int_hexagon_M2_cmacr_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacr_s0">;
+
+def int_hexagon_M4_or_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_and">;
+
+def int_hexagon_M4_mpyrr_addi :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyrr_addi">;
+
+def int_hexagon_S4_or_andi :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_andi">;
+
+def int_hexagon_M2_mpy_sat_hl_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s0">;
+
+def int_hexagon_M2_mpy_sat_hl_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s1">;
+
+def int_hexagon_M4_mpyrr_addr :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyrr_addr">;
+
+def int_hexagon_M2_mmachs_rs0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmachs_rs0">;
+
+def int_hexagon_M2_mmachs_rs1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmachs_rs1">;
+
+def int_hexagon_M2_vrcmpyr_s0c :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrcmpyr_s0c">;
+
+def int_hexagon_M2_mpy_acc_sat_hl_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s0">;
+
+def int_hexagon_M2_mpyd_acc_ll_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s1">;
+
 def int_hexagon_F2_sffixupn :
-Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sffixupn">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sffixupd,SF_ftype_SFSF,2)
-//
-def int_hexagon_F2_sffixupd :
-Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sffixupd">;
-//
-// BUILTIN_INFO(HEXAGON.F2_sffixupr,SF_ftype_SF,1)
-//
-def int_hexagon_F2_sffixupr :
-Hexagon_sf_sf_Intrinsic<"HEXAGON_F2_sffixupr">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfcmpeq,QI_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfcmpeq :
-Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpeq">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfcmpgt,QI_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfcmpgt :
-Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpgt">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfcmpge,QI_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfcmpge :
-Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpge">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfcmpuo,QI_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfcmpuo :
-Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpuo">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfclass,QI_ftype_DFSI,2)
-//
-def int_hexagon_F2_dfclass :
-Hexagon_si_dfsi_Intrinsic<"HEXAGON_F2_dfclass">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfimm_p,DF_ftype_SI,1)
-//
-def int_hexagon_F2_dfimm_p :
-Hexagon_df_si_Intrinsic<"HEXAGON_F2_dfimm_p">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfimm_n,DF_ftype_SI,1)
-//
-def int_hexagon_F2_dfimm_n :
-Hexagon_df_si_Intrinsic<"HEXAGON_F2_dfimm_n">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2df,DF_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2df :
-Hexagon_df_sf_Intrinsic<"HEXAGON_F2_conv_sf2df">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2sf,SF_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2sf :
-Hexagon_sf_df_Intrinsic<"HEXAGON_F2_conv_df2sf">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_uw2sf,SF_ftype_SI,1)
-//
-def int_hexagon_F2_conv_uw2sf :
-Hexagon_sf_si_Intrinsic<"HEXAGON_F2_conv_uw2sf">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_uw2df,DF_ftype_SI,1)
-//
-def int_hexagon_F2_conv_uw2df :
-Hexagon_df_si_Intrinsic<"HEXAGON_F2_conv_uw2df">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_w2sf,SF_ftype_SI,1)
-//
-def int_hexagon_F2_conv_w2sf :
-Hexagon_sf_si_Intrinsic<"HEXAGON_F2_conv_w2sf">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_w2df,DF_ftype_SI,1)
-//
-def int_hexagon_F2_conv_w2df :
-Hexagon_df_si_Intrinsic<"HEXAGON_F2_conv_w2df">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_ud2sf,SF_ftype_DI,1)
-//
-def int_hexagon_F2_conv_ud2sf :
-Hexagon_sf_di_Intrinsic<"HEXAGON_F2_conv_ud2sf">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_ud2df,DF_ftype_DI,1)
-//
-def int_hexagon_F2_conv_ud2df :
-Hexagon_df_di_Intrinsic<"HEXAGON_F2_conv_ud2df">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_d2sf,SF_ftype_DI,1)
-//
-def int_hexagon_F2_conv_d2sf :
-Hexagon_sf_di_Intrinsic<"HEXAGON_F2_conv_d2sf">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_d2df,DF_ftype_DI,1)
-//
-def int_hexagon_F2_conv_d2df :
-Hexagon_df_di_Intrinsic<"HEXAGON_F2_conv_d2df">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2uw,SI_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2uw :
-Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2uw">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2w,SI_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2w :
-Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2w">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2ud,DI_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2ud :
-Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2ud">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2d,DI_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2d :
-Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2d">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2uw,SI_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2uw :
-Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2uw">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2w,SI_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2w :
-Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2w">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2ud,DI_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2ud :
-Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2ud">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2d,DI_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2d :
-Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2d">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2uw_chop,SI_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2uw_chop :
-Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2uw_chop">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2w_chop,SI_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2w_chop :
-Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2w_chop">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2ud_chop,DI_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2ud_chop :
-Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2ud_chop">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_sf2d_chop,DI_ftype_SF,1)
-//
-def int_hexagon_F2_conv_sf2d_chop :
-Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2d_chop">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2uw_chop,SI_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2uw_chop :
-Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2uw_chop">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2w_chop,SI_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2w_chop :
-Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2w_chop">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2ud_chop,DI_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2ud_chop :
-Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2ud_chop">;
-//
-// BUILTIN_INFO(HEXAGON.F2_conv_df2d_chop,DI_ftype_DF,1)
-//
-def int_hexagon_F2_conv_df2d_chop :
-Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2d_chop">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asr_r_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_r_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asl_r_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_r_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_lsr_r_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsr_r_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_lsl_r_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsl_r_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_p,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_r_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_p,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asl_r_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_p,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_lsr_r_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_p,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_lsl_r_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_r_acc,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asr_r_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_r_acc,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asl_r_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_acc,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsr_r_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_acc,SI_ftype_SISISI,3)
-//
+Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sffixupn">;
+
+def int_hexagon_M2_mpyd_acc_lh_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s0">;
+
+def int_hexagon_M2_mpyd_acc_lh_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s1">;
+
+def int_hexagon_M2_mpy_rnd_hh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s0">;
+
+def int_hexagon_M2_mpy_rnd_hh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s1">;
+
+def int_hexagon_A2_vadduhs :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vadduhs">;
+
+def int_hexagon_A2_vsubuhs :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubuhs">;
+
+def int_hexagon_A2_subh_h16_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_hl">;
+
+def int_hexagon_A2_subh_h16_hh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_hh">;
+
+def int_hexagon_A2_xorp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_xorp">;
+
+def int_hexagon_A4_tfrpcp :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A4_tfrpcp">;
+
+def int_hexagon_A2_addh_h16_lh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_lh">;
+
+def int_hexagon_A2_addh_h16_sat_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_sat_hl">;
+
+def int_hexagon_A2_addh_h16_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_ll">;
+
+def int_hexagon_A2_addh_h16_sat_hh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_sat_hh">;
+
+def int_hexagon_A2_zxtb :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_zxtb">;
+
+def int_hexagon_A2_zxth :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_zxth">;
+
+def int_hexagon_A2_vnavgwr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavgwr">;
+
+def int_hexagon_M4_or_xor :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_xor">;
+
+def int_hexagon_M2_mpyud_acc_hh_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s0">;
+
+def int_hexagon_M2_mpyud_acc_hh_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s1">;
+
+def int_hexagon_M5_vmacbsu :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M5_vmacbsu">;
+
+def int_hexagon_M2_dpmpyuu_acc_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_dpmpyuu_acc_s0">;
+
+def int_hexagon_M2_mpy_rnd_hl_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s0">;
+
+def int_hexagon_M2_mpy_rnd_hl_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s1">;
+
+def int_hexagon_F2_sffms_lib :
+Hexagon_float_floatfloatfloat_Intrinsic<"HEXAGON_F2_sffms_lib">;
+
+def int_hexagon_C4_cmpneqi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpneqi">;
+
+def int_hexagon_M4_and_xor :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_xor">;
+
+def int_hexagon_A2_sat :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_A2_sat">;
+
+def int_hexagon_M2_mpyd_nac_lh_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s1">;
+
+def int_hexagon_M2_mpyd_nac_lh_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s0">;
+
+def int_hexagon_A2_addsat :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addsat">;
+
+def int_hexagon_A2_svavghs :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svavghs">;
+
+def int_hexagon_A2_vrsadub_acc :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_A2_vrsadub_acc">;
+
+def int_hexagon_C2_bitsclri :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_bitsclri">;
+
+def int_hexagon_A2_subh_h16_sat_hh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_sat_hh">;
+
+def int_hexagon_A2_subh_h16_sat_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_sat_hl">;
+
+def int_hexagon_M2_mmaculs_rs0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmaculs_rs0">;
+
+def int_hexagon_M2_mmaculs_rs1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmaculs_rs1">;
+
+def int_hexagon_M2_vradduh :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M2_vradduh">;
+
+def int_hexagon_A4_addp_c :
+Hexagon_i64i32_i64i64i32_Intrinsic<"HEXAGON_A4_addp_c">;
+
+def int_hexagon_C2_xor :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_xor">;
+
 def int_hexagon_S2_lsl_r_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_p_acc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_r_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_p_acc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_r_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_acc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_r_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_acc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsl_r_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_r_nac,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asr_r_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_r_nac,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asl_r_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_nac,SI_ftype_SISISI,3)
-//
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r_acc">;
+
+def int_hexagon_M2_mmpyh_rs1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyh_rs1">;
+
+def int_hexagon_M2_mmpyh_rs0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyh_rs0">;
+
+def int_hexagon_F2_conv_df2ud_chop :
+Hexagon_i64_double_Intrinsic<"HEXAGON_F2_conv_df2ud_chop">;
+
+def int_hexagon_C4_or_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_or_or">;
+
+def int_hexagon_S4_vxaddsubhr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxaddsubhr">;
+
+def int_hexagon_S2_vsathub :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vsathub">;
+
+def int_hexagon_F2_conv_df2sf :
+Hexagon_float_double_Intrinsic<"HEXAGON_F2_conv_df2sf">;
+
+def int_hexagon_M2_hmmpyh_rs1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_hmmpyh_rs1">;
+
+def int_hexagon_M2_hmmpyh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_hmmpyh_s1">;
+
+def int_hexagon_A2_vavgwr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgwr">;
+
+def int_hexagon_S2_tableidxh_goodsyntax :
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxh_goodsyntax">;
+
+def int_hexagon_A2_sxth :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_sxth">;
+
+def int_hexagon_A2_sxtb :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_sxtb">;
+
+def int_hexagon_C4_or_orn :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_or_orn">;
+
+def int_hexagon_M2_vrcmaci_s0c :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrcmaci_s0c">;
+
+def int_hexagon_A2_sxtw :
+Hexagon_i64_i32_Intrinsic<"HEXAGON_A2_sxtw">;
+
+def int_hexagon_M2_vabsdiffh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vabsdiffh">;
+
+def int_hexagon_M2_mpy_acc_lh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s1">;
+
+def int_hexagon_M2_mpy_acc_lh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s0">;
+
+def int_hexagon_M2_hmmpyl_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_hmmpyl_s1">;
+
+def int_hexagon_S2_cl1p :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_cl1p">;
+
+def int_hexagon_M2_vabsdiffw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vabsdiffw">;
+
+def int_hexagon_A4_andnp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A4_andnp">;
+
+def int_hexagon_C2_vmux :
+Hexagon_i64_i32i64i64_Intrinsic<"HEXAGON_C2_vmux">;
+
+def int_hexagon_S2_parityp :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_S2_parityp">;
+
+def int_hexagon_S2_lsr_i_p_and :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_and">;
+
+def int_hexagon_S2_asr_i_r_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_or">;
+
+def int_hexagon_M2_mpyu_nac_ll_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s0">;
+
+def int_hexagon_M2_mpyu_nac_ll_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s1">;
+
+def int_hexagon_F2_sfcmpeq :
+Hexagon_i32_floatfloat_Intrinsic<"HEXAGON_F2_sfcmpeq">;
+
+def int_hexagon_A2_vaddb_map :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddb_map">;
+
 def int_hexagon_S2_lsr_r_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_nac,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsl_r_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_p_nac,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_r_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_p_nac,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_r_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_nac,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_r_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_nac,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsl_r_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_r_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asr_r_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_r_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asl_r_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsr_r_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsl_r_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_r_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asr_r_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_r_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asl_r_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsr_r_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsl_r_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_p_and,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_r_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_p_and,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_r_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_and,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_r_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_and,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsl_r_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_p_or,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_r_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_p_or,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_r_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_or,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_r_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_or,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsl_r_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_p_xor,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_r_p_xor :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_xor">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_p_xor,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_r_p_xor :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_xor">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_xor,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_r_p_xor :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_xor">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_xor,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsl_r_p_xor :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_xor">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_r_sat,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asr_r_r_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_r_r_sat">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_r_sat,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asl_r_r_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_r_r_sat">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asr_i_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_lsr_i_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsr_i_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asl_i_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_i_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_p,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_i_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_p,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_lsr_i_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_p,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asl_i_p :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_r_acc,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asr_i_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_acc,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsr_i_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_r_acc,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asl_i_r_acc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_p_acc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_i_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_acc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_i_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_p_acc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_i_p_acc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_acc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_r_nac,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asr_i_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_nac,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsr_i_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_r_nac,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asl_i_r_nac :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_p_nac,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_i_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_nac,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_i_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_p_nac,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_i_p_nac :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_nac">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_xacc,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsr_i_r_xacc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_xacc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_r_xacc,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asl_i_r_xacc :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_xacc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_xacc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_i_p_xacc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_xacc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_p_xacc,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_i_p_xacc :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_xacc">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_r_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asr_i_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_and,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsr_i_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_r_and,SI_ftype_SISISI,3)
-//
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r_nac">;
+
+def int_hexagon_A2_vcmpheq :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpheq">;
+
+def int_hexagon_S2_clbnorm :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_clbnorm">;
+
+def int_hexagon_M2_cnacsc_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cnacsc_s1">;
+
+def int_hexagon_M2_cnacsc_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cnacsc_s0">;
+
+def int_hexagon_S4_subaddi :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subaddi">;
+
+def int_hexagon_M2_mpyud_nac_hl_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s1">;
+
+def int_hexagon_M2_mpyud_nac_hl_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s0">;
+
+def int_hexagon_S5_vasrhrnd_goodsyntax :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S5_vasrhrnd_goodsyntax">;
+
+def int_hexagon_S2_tstbit_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_tstbit_r">;
+
+def int_hexagon_S4_vrcrotate :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S4_vrcrotate">;
+
+def int_hexagon_M2_mmachs_s1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmachs_s1">;
+
+def int_hexagon_M2_mmachs_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmachs_s0">;
+
+def int_hexagon_S2_tstbit_i :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_tstbit_i">;
+
+def int_hexagon_M2_mpy_up_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_up_s1">;
+
+def int_hexagon_S2_extractu_rp :
+Hexagon_i32_i32i64_Intrinsic<"HEXAGON_S2_extractu_rp">;
+
+def int_hexagon_M2_mmpyuh_rs0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyuh_rs0">;
+
+def int_hexagon_S2_lsr_i_vw :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_vw">;
+
+def int_hexagon_M2_mpy_rnd_ll_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s0">;
+
+def int_hexagon_M2_mpy_rnd_ll_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s1">;
+
+def int_hexagon_M4_or_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_or">;
+
+def int_hexagon_M2_mpyu_hh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_hh_s1">;
+
+def int_hexagon_M2_mpyu_hh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_hh_s0">;
+
+def int_hexagon_S2_asl_r_p_acc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_acc">;
+
+def int_hexagon_M2_mpyu_nac_lh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s0">;
+
+def int_hexagon_M2_mpyu_nac_lh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s1">;
+
+def int_hexagon_M2_mpy_sat_ll_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s0">;
+
+def int_hexagon_M2_mpy_sat_ll_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s1">;
+
+def int_hexagon_F2_conv_w2df :
+Hexagon_double_i32_Intrinsic<"HEXAGON_F2_conv_w2df">;
+
+def int_hexagon_A2_subh_l16_sat_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_l16_sat_hl">;
+
+def int_hexagon_C2_cmpeqi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpeqi">;
+
 def int_hexagon_S2_asl_i_r_and :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_r_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asr_i_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_lsr_i_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_r_or,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_asl_i_r_or :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_p_and,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_i_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_and,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_i_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_p_and,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_i_p_and :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_and">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_p_or,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asr_i_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_or,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_lsr_i_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_p_or,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_asl_i_p_or :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_or">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_r_sat,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asl_i_r_sat :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_i_r_sat">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_r_rnd,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asr_i_r_rnd :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r_rnd">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_r_rnd_goodsyntax,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_asr_i_r_rnd_goodsyntax :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r_rnd_goodsyntax">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_p_rnd,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_i_p_rnd :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p_rnd">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_p_rnd_goodsyntax,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_i_p_rnd_goodsyntax :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p_rnd_goodsyntax">;
-//
-// BUILTIN_INFO(HEXAGON.S4_lsli,SI_ftype_SISI,2)
-//
-def int_hexagon_S4_lsli :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_lsli">;
-//
-// BUILTIN_INFO(HEXAGON.S2_addasl_rrri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_addasl_rrri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_addasl_rrri">;
-//
-// BUILTIN_INFO(HEXAGON.S4_andi_asl_ri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_andi_asl_ri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_andi_asl_ri">;
-//
-// BUILTIN_INFO(HEXAGON.S4_ori_asl_ri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_ori_asl_ri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_ori_asl_ri">;
-//
-// BUILTIN_INFO(HEXAGON.S4_addi_asl_ri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_addi_asl_ri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addi_asl_ri">;
-//
-// BUILTIN_INFO(HEXAGON.S4_subi_asl_ri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_subi_asl_ri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subi_asl_ri">;
-//
-// BUILTIN_INFO(HEXAGON.S4_andi_lsr_ri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_andi_lsr_ri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_andi_lsr_ri">;
-//
-// BUILTIN_INFO(HEXAGON.S4_ori_lsr_ri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_ori_lsr_ri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_ori_lsr_ri">;
-//
-// BUILTIN_INFO(HEXAGON.S4_addi_lsr_ri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_addi_lsr_ri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addi_lsr_ri">;
-//
-// BUILTIN_INFO(HEXAGON.S4_subi_lsr_ri,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_subi_lsr_ri :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subi_lsr_ri">;
-//
-// BUILTIN_INFO(HEXAGON.S2_valignib,DI_ftype_DIDISI,3)
-//
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_and">;
+
+def int_hexagon_S2_vcnegh :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_vcnegh">;
+
+def int_hexagon_A4_vcmpweqi :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpweqi">;
+
+def int_hexagon_M2_vdmpyrs_s0 :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M2_vdmpyrs_s0">;
+
+def int_hexagon_M2_vdmpyrs_s1 :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M2_vdmpyrs_s1">;
+
+def int_hexagon_M4_xor_xacc :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_xor_xacc">;
+
+def int_hexagon_M2_vdmpys_s1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vdmpys_s1">;
+
+def int_hexagon_M2_vdmpys_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vdmpys_s0">;
+
+def int_hexagon_A2_vavgubr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgubr">;
+
+def int_hexagon_M2_mpyu_hl_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_hl_s1">;
+
+def int_hexagon_M2_mpyu_hl_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_hl_s0">;
+
+def int_hexagon_S2_asl_r_r_acc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_acc">;
+
+def int_hexagon_S2_cl0p :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_cl0p">;
+
 def int_hexagon_S2_valignib :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_valignib">;
-//
-// BUILTIN_INFO(HEXAGON.S2_valignrb,DI_ftype_DIDIQI,3)
-//
-def int_hexagon_S2_valignrb :
-Hexagon_di_didiqi_Intrinsic<"HEXAGON_S2_valignrb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vspliceib,DI_ftype_DIDISI,3)
-//
-def int_hexagon_S2_vspliceib :
-Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_vspliceib">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsplicerb,DI_ftype_DIDIQI,3)
-//
-def int_hexagon_S2_vsplicerb :
-Hexagon_di_didiqi_Intrinsic<"HEXAGON_S2_vsplicerb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsplatrh,DI_ftype_SI,1)
-//
-def int_hexagon_S2_vsplatrh :
-Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsplatrh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsplatrb,SI_ftype_SI,1)
-//
-def int_hexagon_S2_vsplatrb :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_vsplatrb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_insert,SI_ftype_SISISISI,4)
-//
-def int_hexagon_S2_insert :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_insert">;
-//
-// BUILTIN_INFO(HEXAGON.S2_tableidxb_goodsyntax,SI_ftype_SISISISI,4)
-//
-def int_hexagon_S2_tableidxb_goodsyntax :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxb_goodsyntax">;
-//
-// BUILTIN_INFO(HEXAGON.S2_tableidxh_goodsyntax,SI_ftype_SISISISI,4)
-//
-def int_hexagon_S2_tableidxh_goodsyntax :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxh_goodsyntax">;
-//
-// BUILTIN_INFO(HEXAGON.S2_tableidxw_goodsyntax,SI_ftype_SISISISI,4)
-//
-def int_hexagon_S2_tableidxw_goodsyntax :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxw_goodsyntax">;
-//
-// BUILTIN_INFO(HEXAGON.S2_tableidxd_goodsyntax,SI_ftype_SISISISI,4)
-//
-def int_hexagon_S2_tableidxd_goodsyntax :
-Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxd_goodsyntax">;
-//
-// BUILTIN_INFO(HEXAGON.A4_bitspliti,DI_ftype_SISI,2)
-//
-def int_hexagon_A4_bitspliti :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_bitspliti">;
-//
-// BUILTIN_INFO(HEXAGON.A4_bitsplit,DI_ftype_SISI,2)
-//
-def int_hexagon_A4_bitsplit :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_bitsplit">;
-//
-// BUILTIN_INFO(HEXAGON.S4_extract,SI_ftype_SISISI,3)
-//
-def int_hexagon_S4_extract :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_extract">;
-//
-// BUILTIN_INFO(HEXAGON.S2_extractu,SI_ftype_SISISI,3)
-//
-def int_hexagon_S2_extractu :
-Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_extractu">;
-//
-// BUILTIN_INFO(HEXAGON.S2_insertp,DI_ftype_DIDISISI,4)
-//
-def int_hexagon_S2_insertp :
-Hexagon_di_didisisi_Intrinsic<"HEXAGON_S2_insertp">;
-//
-// BUILTIN_INFO(HEXAGON.S4_extractp,DI_ftype_DISISI,3)
-//
-def int_hexagon_S4_extractp :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_S4_extractp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_extractup,DI_ftype_DISISI,3)
-//
-def int_hexagon_S2_extractup :
-Hexagon_di_disisi_Intrinsic<"HEXAGON_S2_extractup">;
-//
-// BUILTIN_INFO(HEXAGON.S2_insert_rp,SI_ftype_SISIDI,3)
-//
-def int_hexagon_S2_insert_rp :
-Hexagon_si_sisidi_Intrinsic<"HEXAGON_S2_insert_rp">;
-//
-// BUILTIN_INFO(HEXAGON.S4_extract_rp,SI_ftype_SIDI,2)
-//
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_valignib">;
+
+def int_hexagon_F2_sffixupd :
+Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sffixupd">;
+
+def int_hexagon_M2_mpy_sat_rnd_hl_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s1">;
+
+def int_hexagon_M2_mpy_sat_rnd_hl_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s0">;
+
+def int_hexagon_M2_cmacsc_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacsc_s0">;
+
+def int_hexagon_M2_cmacsc_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacsc_s1">;
+
+def int_hexagon_S2_ct1 :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_ct1">;
+
+def int_hexagon_S2_ct0 :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_ct0">;
+
+def int_hexagon_M2_dpmpyuu_nac_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_dpmpyuu_nac_s0">;
+
+def int_hexagon_M2_mmpyul_rs1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyul_rs1">;
+
+def int_hexagon_S4_ntstbit_i :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_ntstbit_i">;
+
+def int_hexagon_F2_sffixupr :
+Hexagon_float_float_Intrinsic<"HEXAGON_F2_sffixupr">;
+
+def int_hexagon_S2_asr_r_p_xor :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_xor">;
+
+def int_hexagon_M2_mpyud_acc_hl_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s0">;
+
+def int_hexagon_M2_mpyud_acc_hl_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s1">;
+
+def int_hexagon_A2_vcmphgtu :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmphgtu">;
+
+def int_hexagon_C2_andn :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_andn">;
+
+def int_hexagon_M2_vmpy2s_s0pack :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s0pack">;
+
+def int_hexagon_S4_addaddi :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addaddi">;
+
+def int_hexagon_M2_mpyd_acc_ll_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s0">;
+
+def int_hexagon_M2_mpy_acc_sat_hl_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s1">;
+
+def int_hexagon_A4_rcmpeqi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpeqi">;
+
+def int_hexagon_M4_xor_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_xor_and">;
+
+def int_hexagon_S2_asl_i_p_and :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_and">;
+
+def int_hexagon_M2_mmpyuh_rs1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyuh_rs1">;
+
+def int_hexagon_S2_asr_r_r_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_or">;
+
+def int_hexagon_A4_round_ri :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_ri">;
+
+def int_hexagon_A2_max :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_max">;
+
+def int_hexagon_A4_round_rr :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_rr">;
+
+def int_hexagon_A4_combineii :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineii">;
+
+def int_hexagon_A4_combineir :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineir">;
+
+def int_hexagon_C4_and_orn :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_and_orn">;
+
+def int_hexagon_M5_vmacbuu :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M5_vmacbuu">;
+
+def int_hexagon_A4_rcmpeq :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpeq">;
+
+def int_hexagon_M4_cmpyr_whc :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M4_cmpyr_whc">;
+
+def int_hexagon_S2_lsr_i_r_acc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_acc">;
+
+def int_hexagon_S2_vzxtbh :
+Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vzxtbh">;
+
+def int_hexagon_M2_mmacuhs_rs1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacuhs_rs1">;
+
+def int_hexagon_S2_asr_r_r_sat :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_sat">;
+
+def int_hexagon_A2_combinew :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A2_combinew">;
+
+def int_hexagon_M2_mpy_acc_ll_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s1">;
+
+def int_hexagon_M2_mpy_acc_ll_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s0">;
+
+def int_hexagon_M2_cmpyi_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpyi_s0">;
+
+def int_hexagon_S2_asl_r_p_or :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_or">;
+
+def int_hexagon_S4_ori_asl_ri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_ori_asl_ri">;
+
+def int_hexagon_C4_nbitsset :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsset">;
+
+def int_hexagon_M2_mpyu_acc_hh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s1">;
+
+def int_hexagon_M2_mpyu_acc_hh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s0">;
+
+def int_hexagon_M2_mpyu_ll_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_ll_s1">;
+
+def int_hexagon_M2_mpyu_ll_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_ll_s0">;
+
+def int_hexagon_A2_addh_l16_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_ll">;
+
+def int_hexagon_S2_lsr_r_r_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r_and">;
+
+def int_hexagon_A4_modwrapu :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_modwrapu">;
+
+def int_hexagon_A4_rcmpneq :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpneq">;
+
+def int_hexagon_M2_mpyd_acc_hh_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s0">;
+
+def int_hexagon_M2_mpyd_acc_hh_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s1">;
+
+def int_hexagon_F2_sfimm_p :
+Hexagon_float_i32_Intrinsic<"HEXAGON_F2_sfimm_p">;
+
+def int_hexagon_F2_sfimm_n :
+Hexagon_float_i32_Intrinsic<"HEXAGON_F2_sfimm_n">;
+
+def int_hexagon_M4_cmpyr_wh :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M4_cmpyr_wh">;
+
+def int_hexagon_S2_lsl_r_p_and :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_and">;
+
+def int_hexagon_A2_vavgub :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgub">;
+
+def int_hexagon_F2_conv_d2sf :
+Hexagon_float_i64_Intrinsic<"HEXAGON_F2_conv_d2sf">;
+
+def int_hexagon_A2_vavguh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavguh">;
+
+def int_hexagon_A4_cmpbeqi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbeqi">;
+
+def int_hexagon_F2_sfcmpuo :
+Hexagon_i32_floatfloat_Intrinsic<"HEXAGON_F2_sfcmpuo">;
+
+def int_hexagon_A2_vavguw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavguw">;
+
+def int_hexagon_S2_asr_i_p_nac :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_nac">;
+
+def int_hexagon_S2_vsatwh_nopack :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_vsatwh_nopack">;
+
+def int_hexagon_M2_mpyd_hh_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_hh_s0">;
+
+def int_hexagon_M2_mpyd_hh_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_hh_s1">;
+
+def int_hexagon_S2_lsl_r_p_or :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_or">;
+
+def int_hexagon_A2_minu :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_minu">;
+
+def int_hexagon_M2_mpy_sat_lh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s1">;
+
+def int_hexagon_M4_or_andn :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_andn">;
+
+def int_hexagon_A2_minp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_minp">;
+
+def int_hexagon_S4_or_andix :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_andix">;
+
+def int_hexagon_M2_mpy_rnd_lh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s0">;
+
+def int_hexagon_M2_mpy_rnd_lh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s1">;
+
+def int_hexagon_M2_mmpyuh_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyuh_s0">;
+
+def int_hexagon_M2_mmpyuh_s1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyuh_s1">;
+
+def int_hexagon_M2_mpy_acc_sat_lh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s0">;
+
+def int_hexagon_F2_sfcmpge :
+Hexagon_i32_floatfloat_Intrinsic<"HEXAGON_F2_sfcmpge">;
+
+def int_hexagon_F2_sfmin :
+Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfmin">;
+
+def int_hexagon_F2_sfcmpgt :
+Hexagon_i32_floatfloat_Intrinsic<"HEXAGON_F2_sfcmpgt">;
+
+def int_hexagon_M4_vpmpyh :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M4_vpmpyh">;
+
+def int_hexagon_M2_mmacuhs_rs0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacuhs_rs0">;
+
+def int_hexagon_M2_mpyd_rnd_lh_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s1">;
+
+def int_hexagon_M2_mpyd_rnd_lh_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s0">;
+
+def int_hexagon_A2_roundsat :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_A2_roundsat">;
+
+def int_hexagon_S2_ct1p :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_ct1p">;
+
 def int_hexagon_S4_extract_rp :
-Hexagon_si_sidi_Intrinsic<"HEXAGON_S4_extract_rp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_extractu_rp,SI_ftype_SIDI,2)
-//
-def int_hexagon_S2_extractu_rp :
-Hexagon_si_sidi_Intrinsic<"HEXAGON_S2_extractu_rp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_insertp_rp,DI_ftype_DIDIDI,3)
-//
+Hexagon_i32_i32i64_Intrinsic<"HEXAGON_S4_extract_rp">;
+
+def int_hexagon_S2_lsl_r_r_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r_or">;
+
+def int_hexagon_C4_cmplteui :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmplteui">;
+
+def int_hexagon_S4_addi_lsr_ri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addi_lsr_ri">;
+
+def int_hexagon_A4_tfrcpp :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A4_tfrcpp">;
+
+def int_hexagon_S2_asr_i_svw_trun :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S2_asr_i_svw_trun">;
+
+def int_hexagon_A4_cmphgti :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgti">;
+
+def int_hexagon_A4_vrminh :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrminh">;
+
+def int_hexagon_A4_vrminw :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrminw">;
+
+def int_hexagon_A4_cmphgtu :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgtu">;
+
 def int_hexagon_S2_insertp_rp :
-Hexagon_di_dididi_Intrinsic<"HEXAGON_S2_insertp_rp">;
-//
-// BUILTIN_INFO(HEXAGON.S4_extractp_rp,DI_ftype_DIDI,2)
-//
-def int_hexagon_S4_extractp_rp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S4_extractp_rp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_extractup_rp,DI_ftype_DIDI,2)
-//
-def int_hexagon_S2_extractup_rp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S2_extractup_rp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_tstbit_i,QI_ftype_SISI,2)
-//
-def int_hexagon_S2_tstbit_i :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_tstbit_i">;
-//
-// BUILTIN_INFO(HEXAGON.S4_ntstbit_i,QI_ftype_SISI,2)
-//
-def int_hexagon_S4_ntstbit_i :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_ntstbit_i">;
-//
-// BUILTIN_INFO(HEXAGON.S2_setbit_i,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_setbit_i :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_setbit_i">;
-//
-// BUILTIN_INFO(HEXAGON.S2_togglebit_i,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_togglebit_i :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_togglebit_i">;
-//
-// BUILTIN_INFO(HEXAGON.S2_clrbit_i,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_clrbit_i :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_clrbit_i">;
-//
-// BUILTIN_INFO(HEXAGON.S2_tstbit_r,QI_ftype_SISI,2)
-//
-def int_hexagon_S2_tstbit_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_tstbit_r">;
-//
-// BUILTIN_INFO(HEXAGON.S4_ntstbit_r,QI_ftype_SISI,2)
-//
-def int_hexagon_S4_ntstbit_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_ntstbit_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_setbit_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_setbit_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_setbit_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_togglebit_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_togglebit_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_togglebit_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_clrbit_r,SI_ftype_SISI,2)
-//
-def int_hexagon_S2_clrbit_r :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_clrbit_r">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_vh,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_i_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_vh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_vh,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_lsr_i_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_vh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_vh,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asl_i_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_vh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_vh,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_r_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_vh">;
-//
-// BUILTIN_INFO(HEXAGON.S5_asrhub_rnd_sat_goodsyntax,SI_ftype_DISI,2)
-//
-def int_hexagon_S5_asrhub_rnd_sat_goodsyntax :
-Hexagon_si_disi_Intrinsic<"HEXAGON_S5_asrhub_rnd_sat_goodsyntax">;
-//
-// BUILTIN_INFO(HEXAGON.S5_asrhub_sat,SI_ftype_DISI,2)
-//
-def int_hexagon_S5_asrhub_sat :
-Hexagon_si_disi_Intrinsic<"HEXAGON_S5_asrhub_sat">;
-//
-// BUILTIN_INFO(HEXAGON.S5_vasrhrnd_goodsyntax,DI_ftype_DISI,2)
-//
-def int_hexagon_S5_vasrhrnd_goodsyntax :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S5_vasrhrnd_goodsyntax">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_vh,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asl_r_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_vh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_vh,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_lsr_r_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_vh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_vh,DI_ftype_DISI,2)
-//
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_S2_insertp_rp">;
+
+def int_hexagon_A2_vnavghcr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavghcr">;
+
+def int_hexagon_S4_subi_asl_ri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subi_asl_ri">;
+
 def int_hexagon_S2_lsl_r_vh :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_vh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_vw,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_i_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_vw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_i_svw_trun,SI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_i_svw_trun :
-Hexagon_si_disi_Intrinsic<"HEXAGON_S2_asr_i_svw_trun">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_svw_trun,SI_ftype_DISI,2)
-//
-def int_hexagon_S2_asr_r_svw_trun :
-Hexagon_si_disi_Intrinsic<"HEXAGON_S2_asr_r_svw_trun">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_i_vw,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_lsr_i_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_vw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_i_vw,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asl_i_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_vw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asr_r_vw,DI_ftype_DISI,2)
-//
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsl_r_vh">;
+
+def int_hexagon_M2_mpy_hh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_hh_s0">;
+
+def int_hexagon_A2_vsubws :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubws">;
+
+def int_hexagon_A2_sath :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_sath">;
+
+def int_hexagon_S2_asl_r_p_xor :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_xor">;
+
+def int_hexagon_A2_satb :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_satb">;
+
+def int_hexagon_C2_cmpltu :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpltu">;
+
+def int_hexagon_S2_insertp :
+Hexagon_i64_i64i64i32i32_Intrinsic<"HEXAGON_S2_insertp">;
+
+def int_hexagon_M2_mpyd_rnd_ll_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s1">;
+
+def int_hexagon_M2_mpyd_rnd_ll_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s0">;
+
+def int_hexagon_S2_lsr_i_p_nac :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_nac">;
+
+def int_hexagon_S2_extractup_rp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_extractup_rp">;
+
+def int_hexagon_S4_vxaddsubw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxaddsubw">;
+
+def int_hexagon_S4_vxaddsubh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxaddsubh">;
+
+def int_hexagon_A2_asrh :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_asrh">;
+
+def int_hexagon_S4_extractp_rp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_extractp_rp">;
+
+def int_hexagon_S2_lsr_r_r_acc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r_acc">;
+
+def int_hexagon_M2_mpyd_nac_ll_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s1">;
+
+def int_hexagon_M2_mpyd_nac_ll_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s0">;
+
+def int_hexagon_C2_or :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_or">;
+
+def int_hexagon_M2_mmpyul_s1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyul_s1">;
+
+def int_hexagon_M2_vrcmacr_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrcmacr_s0">;
+
+def int_hexagon_A2_xor :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_xor">;
+
+def int_hexagon_A2_add :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_add">;
+
+def int_hexagon_A2_vsububs :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsububs">;
+
+def int_hexagon_M2_vmpy2s_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s1">;
+
+def int_hexagon_M2_vmpy2s_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s0">;
+
+def int_hexagon_A2_vraddub_acc :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_A2_vraddub_acc">;
+
+def int_hexagon_F2_sfinvsqrta :
+Hexagon_floati32_float_Intrinsic<"HEXAGON_F2_sfinvsqrta">;
+
+def int_hexagon_S2_ct0p :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_ct0p">;
+
+def int_hexagon_A2_svaddh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svaddh">;
+
+def int_hexagon_S2_vcrotate :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_vcrotate">;
+
+def int_hexagon_A2_aslh :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_aslh">;
+
+def int_hexagon_A2_subh_h16_lh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_lh">;
+
+def int_hexagon_A2_subh_h16_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_ll">;
+
+def int_hexagon_M2_hmmpyl_rs1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_hmmpyl_rs1">;
+
+def int_hexagon_S2_asr_r_p :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_r_p">;
+
+def int_hexagon_S2_vsplatrh :
+Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vsplatrh">;
+
+def int_hexagon_S2_asr_r_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_r_r">;
+
+def int_hexagon_A2_addh_h16_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_hl">;
+
+def int_hexagon_S2_vsplatrb :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_vsplatrb">;
+
+def int_hexagon_A2_addh_h16_hh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_hh">;
+
+def int_hexagon_M2_cmpyr_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpyr_s0">;
+
+def int_hexagon_M2_dpmpyss_rnd_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_rnd_s0">;
+
+def int_hexagon_C2_muxri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxri">;
+
+def int_hexagon_M2_vmac2es_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vmac2es_s0">;
+
+def int_hexagon_M2_vmac2es_s1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vmac2es_s1">;
+
+def int_hexagon_C2_pxfer_map :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_pxfer_map">;
+
+def int_hexagon_M2_mpyu_lh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_lh_s1">;
+
+def int_hexagon_M2_mpyu_lh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_lh_s0">;
+
+def int_hexagon_S2_asl_i_r_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_or">;
+
+def int_hexagon_M2_mpyd_acc_hl_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s0">;
+
+def int_hexagon_M2_mpyd_acc_hl_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s1">;
+
+def int_hexagon_S2_asr_r_p_nac :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_nac">;
+
+def int_hexagon_A2_vaddw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddw">;
+
+def int_hexagon_S2_asr_i_r_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_and">;
+
+def int_hexagon_A2_vaddh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddh">;
+
+def int_hexagon_M2_mpy_nac_sat_lh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s1">;
+
+def int_hexagon_M2_mpy_nac_sat_lh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s0">;
+
+def int_hexagon_C2_cmpeqp :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_C2_cmpeqp">;
+
+def int_hexagon_M4_mpyri_addi :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addi">;
+
+def int_hexagon_A2_not :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_not">;
+
+def int_hexagon_S4_andi_lsr_ri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_andi_lsr_ri">;
+
+def int_hexagon_M2_macsip :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_macsip">;
+
+def int_hexagon_A2_tfrcrr :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfrcrr">;
+
+def int_hexagon_M2_macsin :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_macsin">;
+
+def int_hexagon_C2_orn :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_orn">;
+
+def int_hexagon_M4_and_andn :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_andn">;
+
+def int_hexagon_F2_sfmpy :
+Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfmpy">;
+
+def int_hexagon_M2_mpyud_nac_hh_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s1">;
+
+def int_hexagon_M2_mpyud_nac_hh_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s0">;
+
+def int_hexagon_S2_lsr_r_p_acc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_acc">;
+
 def int_hexagon_S2_asr_r_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_vw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_asl_r_vw,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_asl_r_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_vw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsr_r_vw,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_lsr_r_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_vw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lsl_r_vw,DI_ftype_DISI,2)
-//
-def int_hexagon_S2_lsl_r_vw :
-Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_vw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vrndpackwh,SI_ftype_DI,1)
-//
-def int_hexagon_S2_vrndpackwh :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_vrndpackwh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vrndpackwhs,SI_ftype_DI,1)
-//
-def int_hexagon_S2_vrndpackwhs :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_vrndpackwhs">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsxtbh,DI_ftype_SI,1)
-//
-def int_hexagon_S2_vsxtbh :
-Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsxtbh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vzxtbh,DI_ftype_SI,1)
-//
-def int_hexagon_S2_vzxtbh :
-Hexagon_di_si_Intrinsic<"HEXAGON_S2_vzxtbh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsathub,SI_ftype_DI,1)
-//
-def int_hexagon_S2_vsathub :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsathub">;
-//
-// BUILTIN_INFO(HEXAGON.S2_svsathub,SI_ftype_SI,1)
-//
-def int_hexagon_S2_svsathub :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_svsathub">;
-//
-// BUILTIN_INFO(HEXAGON.S2_svsathb,SI_ftype_SI,1)
-//
-def int_hexagon_S2_svsathb :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_svsathb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsathb,SI_ftype_DI,1)
-//
-def int_hexagon_S2_vsathb :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsathb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vtrunohb,SI_ftype_DI,1)
-//
-def int_hexagon_S2_vtrunohb :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_vtrunohb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vtrunewh,DI_ftype_DIDI,2)
-//
-def int_hexagon_S2_vtrunewh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S2_vtrunewh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vtrunowh,DI_ftype_DIDI,2)
-//
-def int_hexagon_S2_vtrunowh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S2_vtrunowh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vtrunehb,SI_ftype_DI,1)
-//
-def int_hexagon_S2_vtrunehb :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_vtrunehb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsxthw,DI_ftype_SI,1)
-//
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_r_vw">;
+
+def int_hexagon_M4_and_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_or">;
+
+def int_hexagon_S2_asr_r_vh :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_r_vh">;
+
+def int_hexagon_C2_mask :
+Hexagon_i64_i32_Intrinsic<"HEXAGON_C2_mask">;
+
+def int_hexagon_M2_mpy_nac_hh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s0">;
+
+def int_hexagon_M2_mpy_nac_hh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s1">;
+
+def int_hexagon_M2_mpy_up_s1_sat :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_up_s1_sat">;
+
+def int_hexagon_A4_vcmpbgt :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A4_vcmpbgt">;
+
+def int_hexagon_M5_vrmacbsu :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M5_vrmacbsu">;
+
+def int_hexagon_S2_tableidxw_goodsyntax :
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxw_goodsyntax">;
+
+def int_hexagon_A2_vrsadub :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vrsadub">;
+
+def int_hexagon_A2_tfrrcr :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfrrcr">;
+
+def int_hexagon_M2_vrcmpys_acc_s1 :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_M2_vrcmpys_acc_s1">;
+
+def int_hexagon_F2_dfcmpge :
+Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpge">;
+
+def int_hexagon_M2_accii :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_accii">;
+
+def int_hexagon_A5_vaddhubs :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A5_vaddhubs">;
+
+def int_hexagon_A2_vmaxw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxw">;
+
+def int_hexagon_A2_vmaxb :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxb">;
+
+def int_hexagon_A2_vmaxh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxh">;
+
 def int_hexagon_S2_vsxthw :
-Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsxthw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vzxthw,DI_ftype_SI,1)
-//
-def int_hexagon_S2_vzxthw :
-Hexagon_di_si_Intrinsic<"HEXAGON_S2_vzxthw">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsatwh,SI_ftype_DI,1)
-//
-def int_hexagon_S2_vsatwh :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsatwh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsatwuh,SI_ftype_DI,1)
-//
-def int_hexagon_S2_vsatwuh :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsatwuh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_packhl,DI_ftype_SISI,2)
-//
-def int_hexagon_S2_packhl :
-Hexagon_di_sisi_Intrinsic<"HEXAGON_S2_packhl">;
-//
-// BUILTIN_INFO(HEXAGON.A2_swiz,SI_ftype_SI,1)
-//
-def int_hexagon_A2_swiz :
-Hexagon_si_si_Intrinsic<"HEXAGON_A2_swiz">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsathub_nopack,DI_ftype_DI,1)
-//
-def int_hexagon_S2_vsathub_nopack :
-Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsathub_nopack">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsathb_nopack,DI_ftype_DI,1)
-//
-def int_hexagon_S2_vsathb_nopack :
-Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsathb_nopack">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsatwh_nopack,DI_ftype_DI,1)
-//
-def int_hexagon_S2_vsatwh_nopack :
-Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsatwh_nopack">;
-//
-// BUILTIN_INFO(HEXAGON.S2_vsatwuh_nopack,DI_ftype_DI,1)
-//
+Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vsxthw">;
+
+def int_hexagon_S4_andi_asl_ri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_andi_asl_ri">;
+
+def int_hexagon_S2_asl_i_p_nac :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_nac">;
+
+def int_hexagon_S2_lsl_r_p_xor :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_xor">;
+
+def int_hexagon_C2_cmpgt :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgt">;
+
+def int_hexagon_F2_conv_df2d_chop :
+Hexagon_i64_double_Intrinsic<"HEXAGON_F2_conv_df2d_chop">;
+
+def int_hexagon_M2_mpyu_nac_hl_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s0">;
+
+def int_hexagon_M2_mpyu_nac_hl_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s1">;
+
+def int_hexagon_F2_conv_sf2w :
+Hexagon_i32_float_Intrinsic<"HEXAGON_F2_conv_sf2w">;
+
+def int_hexagon_S2_lsr_r_p_or :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_or">;
+
+def int_hexagon_F2_sfclass :
+Hexagon_i32_floati32_Intrinsic<"HEXAGON_F2_sfclass">;
+
+def int_hexagon_M2_mpyud_acc_lh_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s0">;
+
+def int_hexagon_M4_xor_andn :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_xor_andn">;
+
+def int_hexagon_S2_addasl_rrri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_addasl_rrri">;
+
+def int_hexagon_M5_vdmpybsu :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M5_vdmpybsu">;
+
+def int_hexagon_M2_mpyu_nac_hh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s0">;
+
+def int_hexagon_M2_mpyu_nac_hh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s1">;
+
+def int_hexagon_A2_addi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addi">;
+
+def int_hexagon_A2_addp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_addp">;
+
+def int_hexagon_M2_vmpy2s_s1pack :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s1pack">;
+
+def int_hexagon_S4_clbpnorm :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S4_clbpnorm">;
+
+def int_hexagon_A4_round_rr_sat :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_rr_sat">;
+
+def int_hexagon_M2_nacci :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_nacci">;
+
+def int_hexagon_S2_shuffeh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_shuffeh">;
+
+def int_hexagon_S2_lsr_i_r_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_and">;
+
+def int_hexagon_M2_mpy_sat_rnd_hh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s1">;
+
+def int_hexagon_M2_mpy_sat_rnd_hh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s0">;
+
+def int_hexagon_F2_conv_sf2uw :
+Hexagon_i32_float_Intrinsic<"HEXAGON_F2_conv_sf2uw">;
+
+def int_hexagon_A2_vsubh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubh">;
+
+def int_hexagon_F2_conv_sf2ud :
+Hexagon_i64_float_Intrinsic<"HEXAGON_F2_conv_sf2ud">;
+
+def int_hexagon_A2_vsubw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubw">;
+
+def int_hexagon_A2_vcmpwgt :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpwgt">;
+
+def int_hexagon_M4_xor_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_xor_or">;
+
+def int_hexagon_F2_conv_sf2uw_chop :
+Hexagon_i32_float_Intrinsic<"HEXAGON_F2_conv_sf2uw_chop">;
+
+def int_hexagon_S2_asl_r_vw :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_r_vw">;
+
 def int_hexagon_S2_vsatwuh_nopack :
-Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsatwuh_nopack">;
-//
-// BUILTIN_INFO(HEXAGON.S2_shuffob,DI_ftype_DIDI,2)
-//
+Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_vsatwuh_nopack">;
+
+def int_hexagon_S2_asl_r_vh :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_r_vh">;
+
+def int_hexagon_A2_svsubuhs :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svsubuhs">;
+
+def int_hexagon_M5_vmpybsu :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M5_vmpybsu">;
+
+def int_hexagon_A2_subh_l16_sat_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_l16_sat_ll">;
+
+def int_hexagon_C4_and_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_and_and">;
+
+def int_hexagon_M2_mpyu_acc_hl_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s1">;
+
+def int_hexagon_M2_mpyu_acc_hl_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s0">;
+
+def int_hexagon_S2_lsr_r_p :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p">;
+
+def int_hexagon_S2_lsr_r_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r">;
+
+def int_hexagon_A4_subp_c :
+Hexagon_i64i32_i64i64i32_Intrinsic<"HEXAGON_A4_subp_c">;
+
+def int_hexagon_A2_vsubhs :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubhs">;
+
+def int_hexagon_C2_vitpack :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_vitpack">;
+
+def int_hexagon_A2_vavguhr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavguhr">;
+
+def int_hexagon_S2_vsplicerb :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vsplicerb">;
+
+def int_hexagon_C4_nbitsclr :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsclr">;
+
+def int_hexagon_A2_vcmpbgtu :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpbgtu">;
+
+def int_hexagon_M2_cmpys_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpys_s1">;
+
+def int_hexagon_M2_cmpys_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpys_s0">;
+
+def int_hexagon_F2_dfcmpuo :
+Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpuo">;
+
 def int_hexagon_S2_shuffob :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffob">;
-//
-// BUILTIN_INFO(HEXAGON.S2_shuffeb,DI_ftype_DIDI,2)
-//
-def int_hexagon_S2_shuffeb :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffeb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_shuffoh,DI_ftype_DIDI,2)
-//
-def int_hexagon_S2_shuffoh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffoh">;
-//
-// BUILTIN_INFO(HEXAGON.S2_shuffeh,DI_ftype_DIDI,2)
-//
-def int_hexagon_S2_shuffeh :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffeh">;
-//
-// BUILTIN_INFO(HEXAGON.S5_popcountp,SI_ftype_DI,1)
-//
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_shuffob">;
+
+def int_hexagon_C2_and :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_and">;
+
 def int_hexagon_S5_popcountp :
-Hexagon_si_di_Intrinsic<"HEXAGON_S5_popcountp">;
-//
-// BUILTIN_INFO(HEXAGON.S4_parity,SI_ftype_SISI,2)
-//
-def int_hexagon_S4_parity :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_parity">;
-//
-// BUILTIN_INFO(HEXAGON.S2_parityp,SI_ftype_DIDI,2)
-//
-def int_hexagon_S2_parityp :
-Hexagon_si_didi_Intrinsic<"HEXAGON_S2_parityp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_lfsp,DI_ftype_DIDI,2)
-//
-def int_hexagon_S2_lfsp :
-Hexagon_di_didi_Intrinsic<"HEXAGON_S2_lfsp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_clbnorm,SI_ftype_SI,1)
-//
-def int_hexagon_S2_clbnorm :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_clbnorm">;
-//
-// BUILTIN_INFO(HEXAGON.S4_clbaddi,SI_ftype_SISI,2)
-//
-def int_hexagon_S4_clbaddi :
-Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_clbaddi">;
-//
-// BUILTIN_INFO(HEXAGON.S4_clbpnorm,SI_ftype_DI,1)
-//
-def int_hexagon_S4_clbpnorm :
-Hexagon_si_di_Intrinsic<"HEXAGON_S4_clbpnorm">;
-//
-// BUILTIN_INFO(HEXAGON.S4_clbpaddi,SI_ftype_DISI,2)
-//
-def int_hexagon_S4_clbpaddi :
-Hexagon_si_disi_Intrinsic<"HEXAGON_S4_clbpaddi">;
-//
-// BUILTIN_INFO(HEXAGON.S2_clb,SI_ftype_SI,1)
-//
-def int_hexagon_S2_clb :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_clb">;
-//
-// BUILTIN_INFO(HEXAGON.S2_cl0,SI_ftype_SI,1)
-//
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S5_popcountp">;
+
+def int_hexagon_S4_extractp :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S4_extractp">;
+
 def int_hexagon_S2_cl0 :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_cl0">;
-//
-// BUILTIN_INFO(HEXAGON.S2_cl1,SI_ftype_SI,1)
-//
-def int_hexagon_S2_cl1 :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_cl1">;
-//
-// BUILTIN_INFO(HEXAGON.S2_clbp,SI_ftype_DI,1)
-//
-def int_hexagon_S2_clbp :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_clbp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_cl0p,SI_ftype_DI,1)
-//
-def int_hexagon_S2_cl0p :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_cl0p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_cl1p,SI_ftype_DI,1)
-//
-def int_hexagon_S2_cl1p :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_cl1p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_brev,SI_ftype_SI,1)
-//
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_cl0">;
+
+def int_hexagon_A4_vcmpbgti :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbgti">;
+
+def int_hexagon_M2_mmacls_s1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacls_s1">;
+
+def int_hexagon_M2_mmacls_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacls_s0">;
+
+def int_hexagon_C4_cmpneq :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpneq">;
+
+def int_hexagon_M2_vmac2es :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vmac2es">;
+
+def int_hexagon_M2_vdmacs_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vdmacs_s0">;
+
+def int_hexagon_M2_vdmacs_s1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vdmacs_s1">;
+
+def int_hexagon_M2_mpyud_ll_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_ll_s0">;
+
+def int_hexagon_M2_mpyud_ll_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_ll_s1">;
+
+def int_hexagon_S2_clb :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_clb">;
+
+def int_hexagon_M2_mpy_nac_ll_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s0">;
+
+def int_hexagon_M2_mpy_nac_ll_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s1">;
+
+def int_hexagon_M2_mpyd_nac_hl_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s1">;
+
+def int_hexagon_M2_mpyd_nac_hl_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s0">;
+
+def int_hexagon_M2_maci :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_maci">;
+
+def int_hexagon_A2_vmaxuh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxuh">;
+
+def int_hexagon_A4_bitspliti :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_bitspliti">;
+
+def int_hexagon_A2_vmaxub :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxub">;
+
+def int_hexagon_M2_mpyud_hh_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_hh_s0">;
+
+def int_hexagon_M2_mpyud_hh_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_hh_s1">;
+
+def int_hexagon_M2_vrmac_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrmac_s0">;
+
+def int_hexagon_M2_mpy_sat_lh_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s0">;
+
+def int_hexagon_S2_asl_r_r_sat :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_sat">;
+
+def int_hexagon_F2_conv_sf2d :
+Hexagon_i64_float_Intrinsic<"HEXAGON_F2_conv_sf2d">;
+
+def int_hexagon_S2_asr_r_r_nac :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_nac">;
+
+def int_hexagon_F2_dfimm_n :
+Hexagon_double_i32_Intrinsic<"HEXAGON_F2_dfimm_n">;
+
+def int_hexagon_A4_cmphgt :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgt">;
+
+def int_hexagon_F2_dfimm_p :
+Hexagon_double_i32_Intrinsic<"HEXAGON_F2_dfimm_p">;
+
+def int_hexagon_M2_mpyud_acc_lh_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s1">;
+
+def int_hexagon_M2_vcmpy_s1_sat_r :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_r">;
+
+def int_hexagon_M4_mpyri_addr_u2 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addr_u2">;
+
+def int_hexagon_M2_vcmpy_s1_sat_i :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_i">;
+
+def int_hexagon_S2_lsl_r_p_nac :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_nac">;
+
+def int_hexagon_M5_vrmacbuu :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M5_vrmacbuu">;
+
+def int_hexagon_S5_asrhub_rnd_sat_goodsyntax :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S5_asrhub_rnd_sat_goodsyntax">;
+
+def int_hexagon_S2_vspliceib :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vspliceib">;
+
+def int_hexagon_M2_dpmpyss_acc_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_acc_s0">;
+
+def int_hexagon_M2_cnacs_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cnacs_s1">;
+
+def int_hexagon_M2_cnacs_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cnacs_s0">;
+
+def int_hexagon_A2_maxu :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_maxu">;
+
+def int_hexagon_A2_maxp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_maxp">;
+
+def int_hexagon_A2_andir :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_andir">;
+
+def int_hexagon_F2_sfrecipa :
+Hexagon_floati32_floatfloat_Intrinsic<"HEXAGON_F2_sfrecipa">;
+
+def int_hexagon_A2_combineii :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A2_combineii">;
+
+def int_hexagon_A4_orn :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_orn">;
+
+def int_hexagon_A4_cmpbgtui :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgtui">;
+
+def int_hexagon_S2_lsr_r_r_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r_or">;
+
+def int_hexagon_A4_vcmpbeqi :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbeqi">;
+
+def int_hexagon_S2_lsl_r_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r">;
+
+def int_hexagon_S2_lsl_r_p :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p">;
+
+def int_hexagon_A2_or :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_or">;
+
+def int_hexagon_F2_dfcmpeq :
+Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpeq">;
+
+def int_hexagon_C2_cmpeq :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpeq">;
+
+def int_hexagon_A2_tfrp :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_tfrp">;
+
+def int_hexagon_C4_and_andn :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_and_andn">;
+
+def int_hexagon_S2_vsathub_nopack :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_vsathub_nopack">;
+
+def int_hexagon_A2_satuh :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_satuh">;
+
+def int_hexagon_A2_satub :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_satub">;
+
+def int_hexagon_M2_vrcmpys_s1 :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_M2_vrcmpys_s1">;
+
+def int_hexagon_S4_or_ori :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_ori">;
+
+def int_hexagon_C4_fastcorner9_not :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_fastcorner9_not">;
+
+def int_hexagon_A2_tfrih :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_tfrih">;
+
+def int_hexagon_A2_tfril :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_tfril">;
+
+def int_hexagon_M4_mpyri_addr :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addr">;
+
+def int_hexagon_S2_vtrunehb :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vtrunehb">;
+
+def int_hexagon_A2_vabsw :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vabsw">;
+
+def int_hexagon_A2_vabsh :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vabsh">;
+
+def int_hexagon_F2_sfsub :
+Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfsub">;
+
+def int_hexagon_C2_muxii :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxii">;
+
+def int_hexagon_C2_muxir :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxir">;
+
+def int_hexagon_A2_swiz :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_swiz">;
+
+def int_hexagon_S2_asr_i_p_and :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_and">;
+
+def int_hexagon_M2_cmpyrsc_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_cmpyrsc_s0">;
+
+def int_hexagon_M2_cmpyrsc_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_cmpyrsc_s1">;
+
+def int_hexagon_A2_vraddub :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vraddub">;
+
+def int_hexagon_A4_tlbmatch :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_tlbmatch">;
+
+def int_hexagon_F2_conv_df2w_chop :
+Hexagon_i32_double_Intrinsic<"HEXAGON_F2_conv_df2w_chop">;
+
+def int_hexagon_A2_and :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_and">;
+
+def int_hexagon_S2_lsr_r_p_and :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_and">;
+
+def int_hexagon_M2_mpy_nac_sat_ll_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s1">;
+
+def int_hexagon_M2_mpy_nac_sat_ll_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s0">;
+
+def int_hexagon_S4_extract :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_extract">;
+
+def int_hexagon_A2_vcmpweq :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpweq">;
+
+def int_hexagon_M2_acci :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_acci">;
+
+def int_hexagon_S2_lsr_i_p_acc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_acc">;
+
+def int_hexagon_S2_lsr_i_p_or :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_or">;
+
+def int_hexagon_F2_conv_ud2sf :
+Hexagon_float_i64_Intrinsic<"HEXAGON_F2_conv_ud2sf">;
+
+def int_hexagon_A2_tfr :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfr">;
+
+def int_hexagon_S2_asr_i_p_or :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_or">;
+
+def int_hexagon_A2_subri :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subri">;
+
+def int_hexagon_A4_vrmaxuw :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrmaxuw">;
+
+def int_hexagon_M5_vmpybuu :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M5_vmpybuu">;
+
+def int_hexagon_A4_vrmaxuh :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrmaxuh">;
+
+def int_hexagon_S2_asl_i_vw :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_vw">;
+
+def int_hexagon_A2_vavgw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgw">;
+
 def int_hexagon_S2_brev :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_brev">;
-//
-// BUILTIN_INFO(HEXAGON.S2_brevp,DI_ftype_DI,1)
-//
-def int_hexagon_S2_brevp :
-Hexagon_di_di_Intrinsic<"HEXAGON_S2_brevp">;
-//
-// BUILTIN_INFO(HEXAGON.S2_ct0,SI_ftype_SI,1)
-//
-def int_hexagon_S2_ct0 :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_ct0">;
-//
-// BUILTIN_INFO(HEXAGON.S2_ct1,SI_ftype_SI,1)
-//
-def int_hexagon_S2_ct1 :
-Hexagon_si_si_Intrinsic<"HEXAGON_S2_ct1">;
-//
-// BUILTIN_INFO(HEXAGON.S2_ct0p,SI_ftype_DI,1)
-//
-def int_hexagon_S2_ct0p :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_ct0p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_ct1p,SI_ftype_DI,1)
-//
-def int_hexagon_S2_ct1p :
-Hexagon_si_di_Intrinsic<"HEXAGON_S2_ct1p">;
-//
-// BUILTIN_INFO(HEXAGON.S2_interleave,DI_ftype_DI,1)
-//
-def int_hexagon_S2_interleave :
-Hexagon_di_di_Intrinsic<"HEXAGON_S2_interleave">;
-//
-// BUILTIN_INFO(HEXAGON.S2_deinterleave,DI_ftype_DI,1)
-//
-def int_hexagon_S2_deinterleave :
-Hexagon_di_di_Intrinsic<"HEXAGON_S2_deinterleave">;
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_brev">;
 
-//
-// BUILTIN_INFO(HEXAGON.dcfetch_A,v_ftype_DI*,1)
-//
-def int_hexagon_prefetch :
-Hexagon_Intrinsic<"HEXAGON_prefetch", [], [llvm_ptr_ty], []>;
-def int_hexagon_Y2_dccleana :
-Hexagon_Intrinsic<"HEXAGON_Y2_dccleana", [], [llvm_ptr_ty], []>;
-def int_hexagon_Y2_dccleaninva :
-Hexagon_Intrinsic<"HEXAGON_Y2_dccleaninva", [], [llvm_ptr_ty], []>;
-def int_hexagon_Y2_dcinva :
-Hexagon_Intrinsic<"HEXAGON_Y2_dcinva", [], [llvm_ptr_ty], []>;
-def int_hexagon_Y2_dczeroa :
-Hexagon_Intrinsic<"HEXAGON_Y2_dczeroa", [], [llvm_ptr_ty],
-      [IntrWriteMem, IntrArgMemOnly, IntrHasSideEffects]>;
-def int_hexagon_Y4_l2fetch :
-Hexagon_Intrinsic<"HEXAGON_Y4_l2fetch", [], [llvm_ptr_ty, llvm_i32_ty], []>;
-def int_hexagon_Y5_l2fetch :
-Hexagon_Intrinsic<"HEXAGON_Y5_l2fetch", [], [llvm_ptr_ty, llvm_i64_ty], []>;
+def int_hexagon_A2_vavgh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgh">;
 
-def llvm_ptr32_ty : LLVMPointerType<llvm_i32_ty>;
-def llvm_ptr64_ty : LLVMPointerType<llvm_i64_ty>;
+def int_hexagon_S2_clrbit_i :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_clrbit_i">;
 
-// Mark locked loads as read/write to prevent any accidental reordering.
-def int_hexagon_L2_loadw_locked :
-Hexagon_Intrinsic<"HEXAGON_L2_loadw_locked", [llvm_i32_ty], [llvm_ptr32_ty],
-      [IntrArgMemOnly, NoCapture<0>]>;
-def int_hexagon_L4_loadd_locked :
-Hexagon_Intrinsic<"HEXAGON_L4_loadd_locked", [llvm_i64_ty], [llvm_ptr64_ty],
-      [IntrArgMemOnly, NoCapture<0>]>;
+def int_hexagon_S2_asl_i_vh :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_vh">;
 
-def int_hexagon_S2_storew_locked :
-Hexagon_Intrinsic<"HEXAGON_S2_storew_locked", [llvm_i32_ty],
-      [llvm_ptr32_ty, llvm_i32_ty], [IntrArgMemOnly, NoCapture<0>]>;
-def int_hexagon_S4_stored_locked :
-Hexagon_Intrinsic<"HEXAGON_S4_stored_locked", [llvm_i32_ty],
-      [llvm_ptr64_ty, llvm_i64_ty], [IntrArgMemOnly, NoCapture<0>]>;
+def int_hexagon_S2_lsr_i_r_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_or">;
 
-// V60
+def int_hexagon_S2_lsl_r_r_nac :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r_nac">;
 
-class Hexagon_v2048v2048_Intrinsic_T<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mmpyl_rs1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyl_rs1">;
 
-// tag : V6_hi_W
-// tag : V6_lo_W
-class Hexagon_v512v1024_Intrinsic_T<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyud_hl_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_hl_s1">;
 
-// tag : V6_hi_W_128B
-// tag : V6_lo_W_128B
-class Hexagon_v1024v2048_Intrinsic_T<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v64i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mmpyl_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyl_s0">;
 
-class Hexagon_v1024v1024_Intrinsic_T<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mmpyl_s1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyl_s1">;
 
-// BUILTIN_INFO(HEXAGON.V6_hi_W,VI_ftype_VI,1)
-// tag : V6_hi
-def int_hexagon_V6_hi :
-Hexagon_v512v1024_Intrinsic_T<"HEXAGON_V6_hi">;
+def int_hexagon_M2_naccii :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_naccii">;
 
-// BUILTIN_INFO(HEXAGON.V6_lo_W,VI_ftype_VI,1)
-// tag : V6_lo
-def int_hexagon_V6_lo :
-Hexagon_v512v1024_Intrinsic_T<"HEXAGON_V6_lo">;
+def int_hexagon_S2_vrndpackwhs :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vrndpackwhs">;
 
-// BUILTIN_INFO(HEXAGON.V6_hi_W,VI_ftype_VI,1)
-// tag : V6_hi_128B
-def int_hexagon_V6_hi_128B :
-Hexagon_v1024v2048_Intrinsic_T<"HEXAGON_V6_hi_128B">;
+def int_hexagon_S2_vtrunewh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_vtrunewh">;
 
-// BUILTIN_INFO(HEXAGON.V6_lo_W,VI_ftype_VI,1)
-// tag : V6_lo_128B
-def int_hexagon_V6_lo_128B :
-Hexagon_v1024v2048_Intrinsic_T<"HEXAGON_V6_lo_128B">;
+def int_hexagon_M2_dpmpyss_nac_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_nac_s0">;
 
-// BUILTIN_INFO(HEXAGON.V6_vassignp,VI_ftype_VI,1)
-// tag : V6_vassignp
-def int_hexagon_V6_vassignp :
-Hexagon_v1024v1024_Intrinsic_T<"HEXAGON_V6_vassignp">;
+def int_hexagon_M2_mpyd_ll_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_ll_s0">;
 
-// BUILTIN_INFO(HEXAGON.V6_vassignp,VI_ftype_VI,1)
-// tag : V6_vassignp_128B
-def int_hexagon_V6_vassignp_128B :
-Hexagon_v2048v2048_Intrinsic_T<"HEXAGON_V6_vassignp_128B">;
+def int_hexagon_M2_mpyd_ll_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_ll_s1">;
 
+def int_hexagon_M4_mac_up_s1_sat :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mac_up_s1_sat">;
 
-//
-// Hexagon_iii_Intrinsic<string GCCIntSuffix>
-// tag : S6_rol_i_r
-class Hexagon_iii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S4_vrcrotate_acc :
+Hexagon_i64_i64i64i32i32_Intrinsic<"HEXAGON_S4_vrcrotate_acc">;
 
-//
-// Hexagon_LLiLLii_Intrinsic<string GCCIntSuffix>
-// tag : S6_rol_i_p
-class Hexagon_LLiLLii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_conv_uw2df :
+Hexagon_double_i32_Intrinsic<"HEXAGON_F2_conv_uw2df">;
 
-//
-// Hexagon_iiii_Intrinsic<string GCCIntSuffix>
-// tag : S6_rol_i_r_acc
-class Hexagon_iiii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i32_ty,llvm_i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_vaddubs :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddubs">;
 
-//
-// Hexagon_LLiLLiLLii_Intrinsic<string GCCIntSuffix>
-// tag : S6_rol_i_p_acc
-class Hexagon_LLiLLiLLii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty,llvm_i64_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_asr_r_r_acc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_acc">;
 
-//
-// Hexagon_v512v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_valignb
-class Hexagon_v512v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_orir :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_orir">;
 
-//
-// Hexagon_v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_valignb_128B
-class Hexagon_v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_andp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_andp">;
 
-//
-// Hexagon_v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vror
-class Hexagon_v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_lfsp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_lfsp">;
 
-//
-// Hexagon_v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vror_128B
-class Hexagon_v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_min :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_min">;
 
-//
-// Hexagon_v1024v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vunpackub
-class Hexagon_v1024v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpysmi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysmi">;
 
-//
-// Hexagon_v2048v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vunpackub_128B
-class Hexagon_v2048v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_vcmpy_s0_sat_r :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_r">;
 
-//
-// Hexagon_v1024v1024v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vunpackob
-class Hexagon_v1024v1024v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyu_acc_ll_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s1">;
 
-//
-// Hexagon_v2048v2048v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vunpackob_128B
-class Hexagon_v2048v2048v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyu_acc_ll_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s0">;
 
-//
-// Hexagon_v512v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vpackeb
-class Hexagon_v512v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_asr_r_svw_trun :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S2_asr_r_svw_trun">;
 
-//
-// Hexagon_v1024v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vpackeb_128B
-class Hexagon_v1024v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mmpyh_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyh_s0">;
 
-//
-// Hexagon_v2048v2048i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vdmpybus_dv_128B
-class Hexagon_v2048v2048i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mmpyh_s1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyh_s1">;
 
-//
-// Hexagon_v2048v2048v2048i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vdmpybus_dv_acc_128B
-class Hexagon_v2048v2048v2048i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_conv_sf2df :
+Hexagon_double_float_Intrinsic<"HEXAGON_F2_conv_sf2df">;
 
-//
-// Hexagon_v512v512v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vdmpyhvsat_acc
-class Hexagon_v512v512v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_vtrunohb :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vtrunohb">;
 
-//
-// Hexagon_v1024v1024v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vdmpyhvsat_acc_128B
-class Hexagon_v1024v1024v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_conv_sf2d_chop :
+Hexagon_i64_float_Intrinsic<"HEXAGON_F2_conv_sf2d_chop">;
 
-//
-// Hexagon_v512v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vdmpyhisat
-class Hexagon_v512v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyd_lh_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_lh_s0">;
 
-//
-// Hexagon_v1024v2048i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vdmpyhisat_128B
-class Hexagon_v1024v2048i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v64i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_conv_df2w :
+Hexagon_i32_double_Intrinsic<"HEXAGON_F2_conv_df2w">;
 
-//
-// Hexagon_v512v512v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vdmpyhisat_acc
-class Hexagon_v512v512v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S5_asrhub_sat :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S5_asrhub_sat">;
 
-//
-// Hexagon_v1024v1024v2048i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vdmpyhisat_acc_128B
-class Hexagon_v1024v1024v2048i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v64i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_asl_i_r_xacc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_xacc">;
 
-//
-// Hexagon_v1024v1024ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrmpyubi
-class Hexagon_v1024v1024ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_conv_df2d :
+Hexagon_i64_double_Intrinsic<"HEXAGON_F2_conv_df2d">;
 
-//
-// Hexagon_v2048v2048ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrmpyubi_128B
-class Hexagon_v2048v2048ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mmaculs_s1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmaculs_s1">;
 
-//
-// Hexagon_v1024v1024v1024ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrmpyubi_acc
-class Hexagon_v1024v1024v1024ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mmaculs_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmaculs_s0">;
 
-//
-// Hexagon_v2048v2048v2048ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrmpyubi_acc_128B
-class Hexagon_v2048v2048v2048ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty,llvm_i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_svadduhs :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svadduhs">;
 
-//
-// Hexagon_v2048v2048v2048_Intrinsic<string GCCIntSuffix>
-// tag : V6_vaddb_dv_128B
-class Hexagon_v2048v2048v2048_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_conv_sf2w_chop :
+Hexagon_i32_float_Intrinsic<"HEXAGON_F2_conv_sf2w_chop">;
 
-//
-// Hexagon_v1024v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vaddubh
-class Hexagon_v1024v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_svsathub :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_svsathub">;
 
-//
-// Hexagon_v2048v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vaddubh_128B
-class Hexagon_v2048v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyd_rnd_hl_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s1">;
 
-//
-// Hexagon_v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vd0
-class Hexagon_v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyd_rnd_hl_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s0">;
 
-//
-// Hexagon_v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vd0_128B
-class Hexagon_v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [],
-                          [IntrNoMem]>;
+def int_hexagon_S2_setbit_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_setbit_r">;
 
-//
-// Hexagon_v512v64iv512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vaddbq
-class Hexagon_v512v64iv512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_vavghr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavghr">;
 
-//
-// Hexagon_v1024v128iv1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vaddbq_128B
-class Hexagon_v1024v128iv1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_sffma_sc :
+Hexagon_float_floatfloatfloati32_Intrinsic<"HEXAGON_F2_sffma_sc">;
 
-//
-// Hexagon_v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vabsh
-class Hexagon_v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_dfclass :
+Hexagon_i32_doublei32_Intrinsic<"HEXAGON_F2_dfclass">;
 
-//
-// Hexagon_v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vabsh_128B
-class Hexagon_v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_conv_df2ud :
+Hexagon_i64_double_Intrinsic<"HEXAGON_F2_conv_df2ud">;
 
-//
-// Hexagon_v1024v1024v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpybv_acc
-class Hexagon_v1024v1024v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_F2_conv_df2uw :
+Hexagon_i32_double_Intrinsic<"HEXAGON_F2_conv_df2uw">;
 
-//
-// Hexagon_v2048v2048v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpybv_acc_128B
-class Hexagon_v2048v2048v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_cmpyrs_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_cmpyrs_s0">;
 
-//
-// Hexagon_v1024v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyub
-class Hexagon_v1024v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_cmpyrs_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_cmpyrs_s1">;
 
-//
-// Hexagon_v2048v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyub_128B
-class Hexagon_v2048v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_C4_cmpltei :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpltei">;
 
-//
-// Hexagon_v1024v1024v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyub_acc
-class Hexagon_v1024v1024v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_C4_cmplteu :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmplteu">;
 
-//
-// Hexagon_v2048v2048v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyub_acc_128B
-class Hexagon_v2048v2048v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_vsubb_map :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubb_map">;
 
-//
-// Hexagon_v512v64ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandqrt
-class Hexagon_v512v64ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_subh_l16_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_l16_ll">;
 
-//
-// Hexagon_v1024v128ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandqrt_128B
-class Hexagon_v1024v128ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_asr_i_r_rnd :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_rnd">;
 
-//
-// Hexagon_v512v512v64ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandqrt_acc
-class Hexagon_v512v512v64ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v512i1_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_vrmpy_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrmpy_s0">;
 
-//
-// Hexagon_v1024v1024v128ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandqrt_acc_128B
-class Hexagon_v1024v1024v128ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v1024i1_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyd_rnd_hh_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s1">;
 
-//
-// Hexagon_v64iv512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandvrt
-class Hexagon_v64iv512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyd_rnd_hh_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s0">;
 
-//
-// Hexagon_v128iv1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandvrt_128B
-class Hexagon_v128iv1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_minup :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_minup">;
 
-//
-// Hexagon_v64iv64iv512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandvrt_acc
-class Hexagon_v64iv64iv512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_v512i1_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_valignrb :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_valignrb">;
 
-//
-// Hexagon_v128iv128iv1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandvrt_acc_128B
-class Hexagon_v128iv128iv1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_v1024i1_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_asr_r_p_acc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_acc">;
 
-//
-// Hexagon_v64iv512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vgtw
-class Hexagon_v64iv512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mmpyl_rs0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyl_rs0">;
 
-//
-// Hexagon_v128iv1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vgtw_128B
-class Hexagon_v128iv1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_vrcmaci_s0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrcmaci_s0">;
 
-//
-// Hexagon_v64iv64iv512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vgtw_and
-class Hexagon_v64iv64iv512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_v512i1_ty,llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_vaddub :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddub">;
 
-//
-// Hexagon_v128iv128iv1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vgtw_and_128B
-class Hexagon_v128iv128iv1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_v1024i1_ty,llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_combine_lh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_lh">;
 
-//
-// Hexagon_v64iv64iv64i_Intrinsic<string GCCIntSuffix>
-// tag : V6_pred_or
-class Hexagon_v64iv64iv64i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_v512i1_ty,llvm_v512i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M5_vdmacbsu :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M5_vdmacbsu">;
 
-//
-// Hexagon_v128iv128iv128i_Intrinsic<string GCCIntSuffix>
-// tag : V6_pred_or_128B
-class Hexagon_v128iv128iv128i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_v1024i1_ty,llvm_v1024i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_combine_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_ll">;
 
-//
-// Hexagon_v64iv64i_Intrinsic<string GCCIntSuffix>
-// tag : V6_pred_not
-class Hexagon_v64iv64i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_v512i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyud_hl_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_hl_s0">;
 
-//
-// Hexagon_v128iv128i_Intrinsic<string GCCIntSuffix>
-// tag : V6_pred_not_128B
-class Hexagon_v128iv128i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_v1024i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_vrcmpyi_s0c :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrcmpyi_s0c">;
 
-//
-// Hexagon_v64ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_pred_scalar2
-class Hexagon_v64ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_asr_i_p_rnd :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_rnd">;
 
-//
-// Hexagon_v128ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_pred_scalar2_128B
-class Hexagon_v128ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_addpsat :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_addpsat">;
 
-//
-// Hexagon_v1024v64iv512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vswap
-class Hexagon_v1024v64iv512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v512i1_ty,llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_svaddhs :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svaddhs">;
 
-//
-// Hexagon_v2048v128iv1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vswap_128B
-class Hexagon_v2048v128iv1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v1024i1_ty,llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S4_ori_lsr_ri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_ori_lsr_ri">;
 
-//
-// Hexagon_v1024v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vshuffvdd
-class Hexagon_v1024v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpy_sat_rnd_ll_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s1">;
 
-//
-// Hexagon_v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vshuffvdd_128B
-class Hexagon_v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpy_sat_rnd_ll_s0 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s0">;
 
+def int_hexagon_A2_vminw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminw">;
 
-//
-// Hexagon_iv512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_extractw
-class Hexagon_iv512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_vminh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminh">;
 
-//
-// Hexagon_iv1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_extractw_128B
-class Hexagon_iv1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_vrcmpyr_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrcmpyr_s0">;
 
-//
-// Hexagon_v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_lvsplatw
-class Hexagon_v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_A2_vminb :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminb">;
 
-//
-// Hexagon_v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_lvsplatw_128B
-class Hexagon_v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_vcmac_s0_sat_i :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_i">;
 
-//
-// Hexagon_v512v512v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvvb_oracc
-class Hexagon_v512v512v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyud_lh_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_lh_s0">;
 
-//
-// Hexagon_v1024v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvvb_oracc_128B
-class Hexagon_v1024v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_M2_mpyud_lh_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_lh_s1">;
 
-//
-// Hexagon_v1024v1024v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvwh_oracc
-class Hexagon_v1024v1024v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S2_asl_r_r_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_or">;
 
-//
-// Hexagon_v2048v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvwh_oracc_128B
-class Hexagon_v2048v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_S4_lsli :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_lsli">;
 
-//
-// Hexagon_vv64ivmemv512_Intrinsic<string GCCIntSuffix>
-// tag: V6_vS32b_qpred_ai
-class Hexagon_vv64ivmemv512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_v512i1_ty,llvm_ptr_ty,llvm_v16i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_S2_lsl_r_vw :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsl_r_vw">;
 
-//
-// Hexagon_vv128ivmemv1024_Intrinsic<string GCCIntSuffix>
-// tag: V6_vS32b_qpred_ai_128B
-class Hexagon_vv128ivmemv1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_v1024i1_ty,llvm_ptr_ty,llvm_v32i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_M2_mpy_hh_s1 :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_hh_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_r,SI_ftype_SISI,2)
-// tag : S6_rol_i_r
-def int_hexagon_S6_rol_i_r :
-Hexagon_iii_Intrinsic<"HEXAGON_S6_rol_i_r">;
+def int_hexagon_M4_vrmpyeh_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M4_vrmpyeh_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_p,DI_ftype_DISI,2)
-// tag : S6_rol_i_p
-def int_hexagon_S6_rol_i_p :
-Hexagon_LLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p">;
+def int_hexagon_M4_vrmpyeh_s1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M4_vrmpyeh_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_r_acc,SI_ftype_SISISI,3)
-// tag : S6_rol_i_r_acc
-def int_hexagon_S6_rol_i_r_acc :
-Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_acc">;
+def int_hexagon_M2_mpy_nac_lh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_p_acc,DI_ftype_DIDISI,3)
-// tag : S6_rol_i_p_acc
-def int_hexagon_S6_rol_i_p_acc :
-Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_acc">;
+def int_hexagon_M2_mpy_nac_lh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_r_nac,SI_ftype_SISISI,3)
-// tag : S6_rol_i_r_nac
-def int_hexagon_S6_rol_i_r_nac :
-Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_nac">;
+def int_hexagon_M2_vraddh :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M2_vraddh">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_p_nac,DI_ftype_DIDISI,3)
-// tag : S6_rol_i_p_nac
-def int_hexagon_S6_rol_i_p_nac :
-Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_nac">;
+def int_hexagon_C2_tfrrp :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_tfrrp">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_r_xacc,SI_ftype_SISISI,3)
-// tag : S6_rol_i_r_xacc
-def int_hexagon_S6_rol_i_r_xacc :
-Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_xacc">;
+def int_hexagon_M2_mpy_acc_sat_ll_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_p_xacc,DI_ftype_DIDISI,3)
-// tag : S6_rol_i_p_xacc
-def int_hexagon_S6_rol_i_p_xacc :
-Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_xacc">;
+def int_hexagon_M2_mpy_acc_sat_ll_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_r_and,SI_ftype_SISISI,3)
-// tag : S6_rol_i_r_and
-def int_hexagon_S6_rol_i_r_and :
-Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_and">;
+def int_hexagon_S2_vtrunowh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_vtrunowh">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_r_or,SI_ftype_SISISI,3)
-// tag : S6_rol_i_r_or
-def int_hexagon_S6_rol_i_r_or :
-Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_or">;
+def int_hexagon_A2_abs :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_abs">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_p_and,DI_ftype_DIDISI,3)
-// tag : S6_rol_i_p_and
-def int_hexagon_S6_rol_i_p_and :
-Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_and">;
+def int_hexagon_A4_cmpbeq :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbeq">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_rol_i_p_or,DI_ftype_DIDISI,3)
-// tag : S6_rol_i_p_or
-def int_hexagon_S6_rol_i_p_or :
-Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_or">;
+def int_hexagon_A2_negp :
+Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_negp">;
 
-//
-// BUILTIN_INFO(HEXAGON.S2_cabacencbin,DI_ftype_DIDIQI,3)
-// tag : S2_cabacencbin
-def int_hexagon_S2_cabacencbin :
-Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S2_cabacencbin">;
+def int_hexagon_S2_asl_i_r_sat :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_valignb,VI_ftype_VIVISI,3)
-// tag : V6_valignb
-def int_hexagon_V6_valignb :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_valignb">;
+def int_hexagon_A2_addh_l16_sat_hl :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_sat_hl">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_valignb_128B,VI_ftype_VIVISI,3)
-// tag : V6_valignb_128B
-def int_hexagon_V6_valignb_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_valignb_128B">;
+def int_hexagon_S2_vsatwuh :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vsatwuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlalignb,VI_ftype_VIVISI,3)
-// tag : V6_vlalignb
-def int_hexagon_V6_vlalignb :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vlalignb">;
+def int_hexagon_F2_dfcmpgt :
+Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpgt">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlalignb_128B,VI_ftype_VIVISI,3)
-// tag : V6_vlalignb_128B
-def int_hexagon_V6_vlalignb_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlalignb_128B">;
+def int_hexagon_S2_svsathb :
+Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_svsathb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_valignbi,VI_ftype_VIVISI,3)
-// tag : V6_valignbi
-def int_hexagon_V6_valignbi :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_valignbi">;
+def int_hexagon_C2_cmpgtup :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_C2_cmpgtup">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_valignbi_128B,VI_ftype_VIVISI,3)
-// tag : V6_valignbi_128B
-def int_hexagon_V6_valignbi_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_valignbi_128B">;
+def int_hexagon_A4_cround_ri :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cround_ri">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlalignbi,VI_ftype_VIVISI,3)
-// tag : V6_vlalignbi
-def int_hexagon_V6_vlalignbi :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vlalignbi">;
+def int_hexagon_S4_clbpaddi :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S4_clbpaddi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlalignbi_128B,VI_ftype_VIVISI,3)
-// tag : V6_vlalignbi_128B
-def int_hexagon_V6_vlalignbi_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlalignbi_128B">;
+def int_hexagon_A4_cround_rr :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cround_rr">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vror,VI_ftype_VISI,2)
-// tag : V6_vror
-def int_hexagon_V6_vror :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vror">;
+def int_hexagon_C2_mux :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_mux">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vror_128B,VI_ftype_VISI,2)
-// tag : V6_vror_128B
-def int_hexagon_V6_vror_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vror_128B">;
+def int_hexagon_M2_dpmpyuu_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_dpmpyuu_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackub,VD_ftype_VI,1)
-// tag : V6_vunpackub
-def int_hexagon_V6_vunpackub :
-Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vunpackub">;
+def int_hexagon_S2_shuffeb :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_shuffeb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackub_128B,VD_ftype_VI,1)
-// tag : V6_vunpackub_128B
-def int_hexagon_V6_vunpackub_128B :
-Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vunpackub_128B">;
+def int_hexagon_A2_vminuw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminuw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackb,VD_ftype_VI,1)
-// tag : V6_vunpackb
-def int_hexagon_V6_vunpackb :
-Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vunpackb">;
+def int_hexagon_A2_vaddhs :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddhs">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackb_128B,VD_ftype_VI,1)
-// tag : V6_vunpackb_128B
-def int_hexagon_V6_vunpackb_128B :
-Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vunpackb_128B">;
+def int_hexagon_S2_insert_rp :
+Hexagon_i32_i32i32i64_Intrinsic<"HEXAGON_S2_insert_rp">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackuh,VD_ftype_VI,1)
-// tag : V6_vunpackuh
-def int_hexagon_V6_vunpackuh :
-Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vunpackuh">;
+def int_hexagon_A2_vminuh :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackuh_128B,VD_ftype_VI,1)
-// tag : V6_vunpackuh_128B
-def int_hexagon_V6_vunpackuh_128B :
-Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vunpackuh_128B">;
+def int_hexagon_A2_vminub :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackh,VD_ftype_VI,1)
-// tag : V6_vunpackh
-def int_hexagon_V6_vunpackh :
-Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vunpackh">;
+def int_hexagon_S2_extractu :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_extractu">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackh_128B,VD_ftype_VI,1)
-// tag : V6_vunpackh_128B
-def int_hexagon_V6_vunpackh_128B :
-Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vunpackh_128B">;
+def int_hexagon_A2_svsubh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svsubh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackob,VD_ftype_VDVI,2)
-// tag : V6_vunpackob
-def int_hexagon_V6_vunpackob :
-Hexagon_v1024v1024v512_Intrinsic<"HEXAGON_V6_vunpackob">;
+def int_hexagon_S4_clbaddi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_clbaddi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackob_128B,VD_ftype_VDVI,2)
-// tag : V6_vunpackob_128B
-def int_hexagon_V6_vunpackob_128B :
-Hexagon_v2048v2048v1024_Intrinsic<"HEXAGON_V6_vunpackob_128B">;
+def int_hexagon_F2_sffms :
+Hexagon_float_floatfloatfloat_Intrinsic<"HEXAGON_F2_sffms">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackoh,VD_ftype_VDVI,2)
-// tag : V6_vunpackoh
-def int_hexagon_V6_vunpackoh :
-Hexagon_v1024v1024v512_Intrinsic<"HEXAGON_V6_vunpackoh">;
+def int_hexagon_S2_vsxtbh :
+Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vsxtbh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vunpackoh_128B,VD_ftype_VDVI,2)
-// tag : V6_vunpackoh_128B
-def int_hexagon_V6_vunpackoh_128B :
-Hexagon_v2048v2048v1024_Intrinsic<"HEXAGON_V6_vunpackoh_128B">;
+def int_hexagon_M2_mpyud_nac_ll_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackeb,VI_ftype_VIVI,2)
-// tag : V6_vpackeb
-def int_hexagon_V6_vpackeb :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackeb">;
+def int_hexagon_M2_mpyud_nac_ll_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackeb_128B,VI_ftype_VIVI,2)
-// tag : V6_vpackeb_128B
-def int_hexagon_V6_vpackeb_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackeb_128B">;
+def int_hexagon_A2_subp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_subp">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackeh,VI_ftype_VIVI,2)
-// tag : V6_vpackeh
-def int_hexagon_V6_vpackeh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackeh">;
+def int_hexagon_M2_vmpy2es_s1 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vmpy2es_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackeh_128B,VI_ftype_VIVI,2)
-// tag : V6_vpackeh_128B
-def int_hexagon_V6_vpackeh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackeh_128B">;
+def int_hexagon_M2_vmpy2es_s0 :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vmpy2es_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackob,VI_ftype_VIVI,2)
-// tag : V6_vpackob
-def int_hexagon_V6_vpackob :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackob">;
+def int_hexagon_S4_parity :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_parity">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackob_128B,VI_ftype_VIVI,2)
-// tag : V6_vpackob_128B
-def int_hexagon_V6_vpackob_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackob_128B">;
+def int_hexagon_M2_mpy_acc_hh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackoh,VI_ftype_VIVI,2)
-// tag : V6_vpackoh
-def int_hexagon_V6_vpackoh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackoh">;
+def int_hexagon_M2_mpy_acc_hh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackoh_128B,VI_ftype_VIVI,2)
-// tag : V6_vpackoh_128B
-def int_hexagon_V6_vpackoh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackoh_128B">;
+def int_hexagon_S4_addi_asl_ri :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addi_asl_ri">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackhub_sat,VI_ftype_VIVI,2)
-// tag : V6_vpackhub_sat
-def int_hexagon_V6_vpackhub_sat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackhub_sat">;
+def int_hexagon_M2_mpyd_nac_hh_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackhub_sat_128B,VI_ftype_VIVI,2)
-// tag : V6_vpackhub_sat_128B
-def int_hexagon_V6_vpackhub_sat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackhub_sat_128B">;
+def int_hexagon_M2_mpyd_nac_hh_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackhb_sat,VI_ftype_VIVI,2)
-// tag : V6_vpackhb_sat
-def int_hexagon_V6_vpackhb_sat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackhb_sat">;
+def int_hexagon_S2_asr_i_r_nac :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_nac">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackhb_sat_128B,VI_ftype_VIVI,2)
-// tag : V6_vpackhb_sat_128B
-def int_hexagon_V6_vpackhb_sat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackhb_sat_128B">;
+def int_hexagon_A4_cmpheqi :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpheqi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackwuh_sat,VI_ftype_VIVI,2)
-// tag : V6_vpackwuh_sat
-def int_hexagon_V6_vpackwuh_sat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackwuh_sat">;
+def int_hexagon_S2_lsr_r_p_xor :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackwuh_sat_128B,VI_ftype_VIVI,2)
-// tag : V6_vpackwuh_sat_128B
-def int_hexagon_V6_vpackwuh_sat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackwuh_sat_128B">;
+def int_hexagon_M2_mpy_acc_hl_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackwh_sat,VI_ftype_VIVI,2)
-// tag : V6_vpackwh_sat
-def int_hexagon_V6_vpackwh_sat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackwh_sat">;
+def int_hexagon_M2_mpy_acc_hl_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpackwh_sat_128B,VI_ftype_VIVI,2)
-// tag : V6_vpackwh_sat_128B
-def int_hexagon_V6_vpackwh_sat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackwh_sat_128B">;
+def int_hexagon_F2_conv_sf2ud_chop :
+Hexagon_i64_float_Intrinsic<"HEXAGON_F2_conv_sf2ud_chop">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vzb,VD_ftype_VI,1)
-// tag : V6_vzb
-def int_hexagon_V6_vzb :
-Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vzb">;
+def int_hexagon_C2_cmpgeui :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgeui">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vzb_128B,VD_ftype_VI,1)
-// tag : V6_vzb_128B
-def int_hexagon_V6_vzb_128B :
-Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vzb_128B">;
+def int_hexagon_M2_mpy_acc_sat_hh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsb,VD_ftype_VI,1)
-// tag : V6_vsb
-def int_hexagon_V6_vsb :
-Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vsb">;
+def int_hexagon_M2_mpy_acc_sat_hh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsb_128B,VD_ftype_VI,1)
-// tag : V6_vsb_128B
-def int_hexagon_V6_vsb_128B :
-Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vsb_128B">;
+def int_hexagon_S2_asl_r_p_and :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vzh,VD_ftype_VI,1)
-// tag : V6_vzh
-def int_hexagon_V6_vzh :
-Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vzh">;
+def int_hexagon_A2_addh_h16_sat_lh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_sat_lh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vzh_128B,VD_ftype_VI,1)
-// tag : V6_vzh_128B
-def int_hexagon_V6_vzh_128B :
-Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vzh_128B">;
+def int_hexagon_A2_addh_h16_sat_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_sat_ll">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsh,VD_ftype_VI,1)
-// tag : V6_vsh
-def int_hexagon_V6_vsh :
-Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vsh">;
+def int_hexagon_M4_nac_up_s1_sat :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_nac_up_s1_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsh_128B,VD_ftype_VI,1)
-// tag : V6_vsh_128B
-def int_hexagon_V6_vsh_128B :
-Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vsh_128B">;
+def int_hexagon_M2_mpyud_nac_lh_s1 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpybus,VI_ftype_VISI,2)
-// tag : V6_vdmpybus
-def int_hexagon_V6_vdmpybus :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vdmpybus">;
+def int_hexagon_M2_mpyud_nac_lh_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpybus_128B,VI_ftype_VISI,2)
-// tag : V6_vdmpybus_128B
-def int_hexagon_V6_vdmpybus_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpybus_128B">;
+def int_hexagon_A4_round_ri_sat :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_ri_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpybus_acc,VI_ftype_VIVISI,3)
-// tag : V6_vdmpybus_acc
-def int_hexagon_V6_vdmpybus_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vdmpybus_acc">;
+def int_hexagon_M2_mpy_nac_hl_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpybus_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vdmpybus_acc_128B
-def int_hexagon_V6_vdmpybus_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpybus_acc_128B">;
+def int_hexagon_M2_mpy_nac_hl_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpybus_dv,VD_ftype_VDSI,2)
-// tag : V6_vdmpybus_dv
-def int_hexagon_V6_vdmpybus_dv :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpybus_dv">;
+def int_hexagon_A2_vavghcr :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavghcr">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpybus_dv_128B,VD_ftype_VDSI,2)
-// tag : V6_vdmpybus_dv_128B
-def int_hexagon_V6_vdmpybus_dv_128B :
-Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vdmpybus_dv_128B">;
+def int_hexagon_M2_mmacls_rs0 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacls_rs0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpybus_dv_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vdmpybus_dv_acc
-def int_hexagon_V6_vdmpybus_dv_acc :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpybus_dv_acc">;
+def int_hexagon_M2_mmacls_rs1 :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacls_rs1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpybus_dv_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vdmpybus_dv_acc_128B
-def int_hexagon_V6_vdmpybus_dv_acc_128B :
-Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vdmpybus_dv_acc_128B">;
+def int_hexagon_M2_cmaci_s0 :
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmaci_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhb,VI_ftype_VISI,2)
-// tag : V6_vdmpyhb
-def int_hexagon_V6_vdmpyhb :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhb">;
+def int_hexagon_S2_setbit_i :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_setbit_i">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_128B,VI_ftype_VISI,2)
-// tag : V6_vdmpyhb_128B
-def int_hexagon_V6_vdmpyhb_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhb_128B">;
+def int_hexagon_S2_asl_i_p_or :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_acc,VI_ftype_VIVISI,3)
-// tag : V6_vdmpyhb_acc
-def int_hexagon_V6_vdmpyhb_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhb_acc">;
+def int_hexagon_A4_andn :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_andn">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vdmpyhb_acc_128B
-def int_hexagon_V6_vdmpyhb_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhb_acc_128B">;
+def int_hexagon_M5_vrmpybsu :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M5_vrmpybsu">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_dv,VD_ftype_VDSI,2)
-// tag : V6_vdmpyhb_dv
-def int_hexagon_V6_vdmpyhb_dv :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhb_dv">;
+def int_hexagon_S2_vrndpackwh :
+Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vrndpackwh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_dv_128B,VD_ftype_VDSI,2)
-// tag : V6_vdmpyhb_dv_128B
-def int_hexagon_V6_vdmpyhb_dv_128B :
-Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_128B">;
+def int_hexagon_M2_vcmac_s0_sat_r :
+Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_r">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_dv_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vdmpyhb_dv_acc
-def int_hexagon_V6_vdmpyhb_dv_acc :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_acc">;
+def int_hexagon_A2_vmaxuw :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxuw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_dv_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vdmpyhb_dv_acc_128B
-def int_hexagon_V6_vdmpyhb_dv_acc_128B :
-Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_acc_128B">;
+def int_hexagon_C2_bitsclr :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_bitsclr">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhvsat,VI_ftype_VIVI,2)
-// tag : V6_vdmpyhvsat
-def int_hexagon_V6_vdmpyhvsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vdmpyhvsat">;
+def int_hexagon_M2_xor_xacc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_xor_xacc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhvsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vdmpyhvsat_128B
-def int_hexagon_V6_vdmpyhvsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vdmpyhvsat_128B">;
+def int_hexagon_A4_vcmpbgtui :
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbgtui">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhvsat_acc,VI_ftype_VIVIVI,3)
-// tag : V6_vdmpyhvsat_acc
-def int_hexagon_V6_vdmpyhvsat_acc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vdmpyhvsat_acc">;
+def int_hexagon_A4_ornp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A4_ornp">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhvsat_acc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vdmpyhvsat_acc_128B
-def int_hexagon_V6_vdmpyhvsat_acc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vdmpyhvsat_acc_128B">;
+def int_hexagon_A2_tfrpi :
+Hexagon_i64_i32_Intrinsic<"HEXAGON_A2_tfrpi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsat,VI_ftype_VISI,2)
-// tag : V6_vdmpyhsat
-def int_hexagon_V6_vdmpyhsat :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhsat">;
+def int_hexagon_C4_and_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_and_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsat_128B,VI_ftype_VISI,2)
-// tag : V6_vdmpyhsat_128B
-def int_hexagon_V6_vdmpyhsat_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsat_128B">;
+def int_hexagon_M2_mpy_nac_sat_hh_s1 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsat_acc,VI_ftype_VIVISI,3)
-// tag : V6_vdmpyhsat_acc
-def int_hexagon_V6_vdmpyhsat_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhsat_acc">;
+def int_hexagon_M2_mpy_nac_sat_hh_s0 :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsat_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vdmpyhsat_acc_128B
-def int_hexagon_V6_vdmpyhsat_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsat_acc_128B">;
+def int_hexagon_A2_subh_h16_sat_ll :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_sat_ll">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhisat,VI_ftype_VDSI,2)
-// tag : V6_vdmpyhisat
-def int_hexagon_V6_vdmpyhisat :
-Hexagon_v512v1024i_Intrinsic<"HEXAGON_V6_vdmpyhisat">;
+def int_hexagon_A2_subh_h16_sat_lh :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_sat_lh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhisat_128B,VI_ftype_VDSI,2)
-// tag : V6_vdmpyhisat_128B
-def int_hexagon_V6_vdmpyhisat_128B :
-Hexagon_v1024v2048i_Intrinsic<"HEXAGON_V6_vdmpyhisat_128B">;
+def int_hexagon_M2_vmpy2su_s1 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2su_s1">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhisat_acc,VI_ftype_VIVDSI,3)
-// tag : V6_vdmpyhisat_acc
-def int_hexagon_V6_vdmpyhisat_acc :
-Hexagon_v512v512v1024i_Intrinsic<"HEXAGON_V6_vdmpyhisat_acc">;
+def int_hexagon_M2_vmpy2su_s0 :
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2su_s0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhisat_acc_128B,VI_ftype_VIVDSI,3)
-// tag : V6_vdmpyhisat_acc_128B
-def int_hexagon_V6_vdmpyhisat_acc_128B :
-Hexagon_v1024v1024v2048i_Intrinsic<"HEXAGON_V6_vdmpyhisat_acc_128B">;
+def int_hexagon_S2_asr_i_p_acc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsusat,VI_ftype_VISI,2)
-// tag : V6_vdmpyhsusat
-def int_hexagon_V6_vdmpyhsusat :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhsusat">;
+def int_hexagon_C4_nbitsclri :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsclri">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsusat_128B,VI_ftype_VISI,2)
-// tag : V6_vdmpyhsusat_128B
-def int_hexagon_V6_vdmpyhsusat_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsusat_128B">;
+def int_hexagon_S2_lsr_i_vh :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_vh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsusat_acc,VI_ftype_VIVISI,3)
-// tag : V6_vdmpyhsusat_acc
-def int_hexagon_V6_vdmpyhsusat_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhsusat_acc">;
+def int_hexagon_S2_lsr_i_p_xacc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_xacc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsusat_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vdmpyhsusat_acc_128B
-def int_hexagon_V6_vdmpyhsusat_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsusat_acc_128B">;
+// V55 Scalar Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsuisat,VI_ftype_VDSI,2)
-// tag : V6_vdmpyhsuisat
-def int_hexagon_V6_vdmpyhsuisat :
-Hexagon_v512v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsuisat">;
+def int_hexagon_A5_ACS :
+Hexagon_i64i32_i64i64i64_Intrinsic<"HEXAGON_A5_ACS">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsuisat_128B,VI_ftype_VDSI,2)
-// tag : V6_vdmpyhsuisat_128B
-def int_hexagon_V6_vdmpyhsuisat_128B :
-Hexagon_v1024v2048i_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_128B">;
+// V60 Scalar Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsuisat_acc,VI_ftype_VIVDSI,3)
-// tag : V6_vdmpyhsuisat_acc
-def int_hexagon_V6_vdmpyhsuisat_acc :
-Hexagon_v512v512v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_acc">;
+def int_hexagon_S6_rol_i_p_and :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdmpyhsuisat_acc_128B,VI_ftype_VIVDSI,3)
-// tag : V6_vdmpyhsuisat_acc_128B
-def int_hexagon_V6_vdmpyhsuisat_acc_128B :
-Hexagon_v1024v1024v2048i_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_acc_128B">;
+def int_hexagon_S6_rol_i_r_xacc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_xacc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpyb,VD_ftype_VDSI,2)
-// tag : V6_vtmpyb
-def int_hexagon_V6_vtmpyb :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpyb">;
+def int_hexagon_S6_rol_i_r_and :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpyb_128B,VD_ftype_VDSI,2)
-// tag : V6_vtmpyb_128B
-def int_hexagon_V6_vtmpyb_128B :
-Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpyb_128B">;
+def int_hexagon_S6_rol_i_r_acc :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpyb_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vtmpyb_acc
-def int_hexagon_V6_vtmpyb_acc :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpyb_acc">;
+def int_hexagon_S6_rol_i_p_xacc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_xacc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpyb_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vtmpyb_acc_128B
-def int_hexagon_V6_vtmpyb_acc_128B :
-Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpyb_acc_128B">;
+def int_hexagon_S6_rol_i_p :
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S6_rol_i_p">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpybus,VD_ftype_VDSI,2)
-// tag : V6_vtmpybus
-def int_hexagon_V6_vtmpybus :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpybus">;
+def int_hexagon_S6_rol_i_p_nac :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_nac">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpybus_128B,VD_ftype_VDSI,2)
-// tag : V6_vtmpybus_128B
-def int_hexagon_V6_vtmpybus_128B :
-Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpybus_128B">;
+def int_hexagon_S6_rol_i_p_acc :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpybus_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vtmpybus_acc
-def int_hexagon_V6_vtmpybus_acc :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpybus_acc">;
+def int_hexagon_S6_rol_i_r_or :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpybus_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vtmpybus_acc_128B
-def int_hexagon_V6_vtmpybus_acc_128B :
-Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpybus_acc_128B">;
+def int_hexagon_S6_rol_i_r :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S6_rol_i_r">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpyhb,VD_ftype_VDSI,2)
-// tag : V6_vtmpyhb
-def int_hexagon_V6_vtmpyhb :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpyhb">;
+def int_hexagon_S6_rol_i_r_nac :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_nac">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpyhb_128B,VD_ftype_VDSI,2)
-// tag : V6_vtmpyhb_128B
-def int_hexagon_V6_vtmpyhb_128B :
-Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpyhb_128B">;
+def int_hexagon_S6_rol_i_p_or :
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpyhb_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vtmpyhb_acc
-def int_hexagon_V6_vtmpyhb_acc :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpyhb_acc">;
+// V62 Scalar Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vtmpyhb_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vtmpyhb_acc_128B
-def int_hexagon_V6_vtmpyhb_acc_128B :
-Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpyhb_acc_128B">;
+def int_hexagon_S6_vtrunehb_ppp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S6_vtrunehb_ppp">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyub,VI_ftype_VISI,2)
-// tag : V6_vrmpyub
-def int_hexagon_V6_vrmpyub :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vrmpyub">;
+def int_hexagon_V6_ldntnt0 :
+Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_ldntnt0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyub_128B,VI_ftype_VISI,2)
-// tag : V6_vrmpyub_128B
-def int_hexagon_V6_vrmpyub_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vrmpyub_128B">;
+def int_hexagon_M6_vabsdiffub :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M6_vabsdiffub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyub_acc,VI_ftype_VIVISI,3)
-// tag : V6_vrmpyub_acc
-def int_hexagon_V6_vrmpyub_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vrmpyub_acc">;
+def int_hexagon_S6_vtrunohb_ppp :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S6_vtrunohb_ppp">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyub_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vrmpyub_acc_128B
-def int_hexagon_V6_vrmpyub_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vrmpyub_acc_128B">;
+def int_hexagon_M6_vabsdiffb :
+Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M6_vabsdiffb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyubv,VI_ftype_VIVI,2)
-// tag : V6_vrmpyubv
-def int_hexagon_V6_vrmpyubv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vrmpyubv">;
+def int_hexagon_A6_vminub_RdP :
+Hexagon_i64i32_i64i64_Intrinsic<"HEXAGON_A6_vminub_RdP">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyubv_128B,VI_ftype_VIVI,2)
-// tag : V6_vrmpyubv_128B
-def int_hexagon_V6_vrmpyubv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpyubv_128B">;
+def int_hexagon_S6_vsplatrbp :
+Hexagon_i64_i32_Intrinsic<"HEXAGON_S6_vsplatrbp">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyubv_acc,VI_ftype_VIVIVI,3)
-// tag : V6_vrmpyubv_acc
-def int_hexagon_V6_vrmpyubv_acc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vrmpyubv_acc">;
+// V65 Scalar Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyubv_acc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vrmpyubv_acc_128B
-def int_hexagon_V6_vrmpyubv_acc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpyubv_acc_128B">;
+def int_hexagon_A6_vcmpbeq_notany :
+Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A6_vcmpbeq_notany">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybv,VI_ftype_VIVI,2)
-// tag : V6_vrmpybv
-def int_hexagon_V6_vrmpybv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vrmpybv">;
+// V66 Scalar Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybv_128B,VI_ftype_VIVI,2)
-// tag : V6_vrmpybv_128B
-def int_hexagon_V6_vrmpybv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpybv_128B">;
+def int_hexagon_F2_dfsub :
+Hexagon_double_doubledouble_Intrinsic<"HEXAGON_F2_dfsub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybv_acc,VI_ftype_VIVIVI,3)
-// tag : V6_vrmpybv_acc
-def int_hexagon_V6_vrmpybv_acc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vrmpybv_acc">;
+def int_hexagon_F2_dfadd :
+Hexagon_double_doubledouble_Intrinsic<"HEXAGON_F2_dfadd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybv_acc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vrmpybv_acc_128B
-def int_hexagon_V6_vrmpybv_acc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpybv_acc_128B">;
+def int_hexagon_M2_mnaci :
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mnaci">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyubi,VD_ftype_VDSISI,3)
-// tag : V6_vrmpyubi
-def int_hexagon_V6_vrmpyubi :
-Hexagon_v1024v1024ii_Intrinsic<"HEXAGON_V6_vrmpyubi">;
+def int_hexagon_S2_mask :
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_mask">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyubi_128B,VD_ftype_VDSISI,3)
-// tag : V6_vrmpyubi_128B
-def int_hexagon_V6_vrmpyubi_128B :
-Hexagon_v2048v2048ii_Intrinsic<"HEXAGON_V6_vrmpyubi_128B">;
+// V60 HVX Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyubi_acc,VD_ftype_VDVDSISI,4)
-// tag : V6_vrmpyubi_acc
-def int_hexagon_V6_vrmpyubi_acc :
-Hexagon_v1024v1024v1024ii_Intrinsic<"HEXAGON_V6_vrmpyubi_acc">;
+def int_hexagon_V6_veqb_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqb_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyubi_acc_128B,VD_ftype_VDVDSISI,4)
-// tag : V6_vrmpyubi_acc_128B
-def int_hexagon_V6_vrmpyubi_acc_128B :
-Hexagon_v2048v2048v2048ii_Intrinsic<"HEXAGON_V6_vrmpyubi_acc_128B">;
+def int_hexagon_V6_veqb_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqb_or_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybus,VI_ftype_VISI,2)
-// tag : V6_vrmpybus
-def int_hexagon_V6_vrmpybus :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vrmpybus">;
+def int_hexagon_V6_vminub :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybus_128B,VI_ftype_VISI,2)
-// tag : V6_vrmpybus_128B
-def int_hexagon_V6_vrmpybus_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vrmpybus_128B">;
+def int_hexagon_V6_vminub_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybus_acc,VI_ftype_VIVISI,3)
-// tag : V6_vrmpybus_acc
-def int_hexagon_V6_vrmpybus_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vrmpybus_acc">;
+def int_hexagon_V6_vaslw_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vaslw_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybus_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vrmpybus_acc_128B
-def int_hexagon_V6_vrmpybus_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vrmpybus_acc_128B">;
+def int_hexagon_V6_vaslw_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vaslw_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybusi,VD_ftype_VDSISI,3)
-// tag : V6_vrmpybusi
-def int_hexagon_V6_vrmpybusi :
-Hexagon_v1024v1024ii_Intrinsic<"HEXAGON_V6_vrmpybusi">;
+def int_hexagon_V6_vmpyhvsrs :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhvsrs">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybusi_128B,VD_ftype_VDSISI,3)
-// tag : V6_vrmpybusi_128B
-def int_hexagon_V6_vrmpybusi_128B :
-Hexagon_v2048v2048ii_Intrinsic<"HEXAGON_V6_vrmpybusi_128B">;
+def int_hexagon_V6_vmpyhvsrs_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhvsrs_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybusi_acc,VD_ftype_VDVDSISI,4)
-// tag : V6_vrmpybusi_acc
-def int_hexagon_V6_vrmpybusi_acc :
-Hexagon_v1024v1024v1024ii_Intrinsic<"HEXAGON_V6_vrmpybusi_acc">;
+def int_hexagon_V6_vsathub :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsathub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybusi_acc_128B,VD_ftype_VDVDSISI,4)
-// tag : V6_vrmpybusi_acc_128B
-def int_hexagon_V6_vrmpybusi_acc_128B :
-Hexagon_v2048v2048v2048ii_Intrinsic<"HEXAGON_V6_vrmpybusi_acc_128B">;
+def int_hexagon_V6_vsathub_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsathub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybusv,VI_ftype_VIVI,2)
-// tag : V6_vrmpybusv
-def int_hexagon_V6_vrmpybusv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vrmpybusv">;
+def int_hexagon_V6_vaddh_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddh_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybusv_128B,VI_ftype_VIVI,2)
-// tag : V6_vrmpybusv_128B
-def int_hexagon_V6_vrmpybusv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpybusv_128B">;
+def int_hexagon_V6_vaddh_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddh_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybusv_acc,VI_ftype_VIVIVI,3)
-// tag : V6_vrmpybusv_acc
-def int_hexagon_V6_vrmpybusv_acc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vrmpybusv_acc">;
+def int_hexagon_V6_vrmpybusi :
+Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybusv_acc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vrmpybusv_acc_128B
-def int_hexagon_V6_vrmpybusv_acc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpybusv_acc_128B">;
+def int_hexagon_V6_vrmpybusi_128B :
+Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdsaduh,VD_ftype_VDSI,2)
-// tag : V6_vdsaduh
-def int_hexagon_V6_vdsaduh :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdsaduh">;
+def int_hexagon_V6_vshufoh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshufoh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdsaduh_128B,VD_ftype_VDSI,2)
-// tag : V6_vdsaduh_128B
-def int_hexagon_V6_vdsaduh_128B :
-Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vdsaduh_128B">;
+def int_hexagon_V6_vshufoh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshufoh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdsaduh_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vdsaduh_acc
-def int_hexagon_V6_vdsaduh_acc :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdsaduh_acc">;
+def int_hexagon_V6_vasrwv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vasrwv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdsaduh_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vdsaduh_acc_128B
-def int_hexagon_V6_vdsaduh_acc_128B :
-Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vdsaduh_acc_128B">;
+def int_hexagon_V6_vasrwv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vasrwv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrsadubi,VD_ftype_VDSISI,3)
-// tag : V6_vrsadubi
-def int_hexagon_V6_vrsadubi :
-Hexagon_v1024v1024ii_Intrinsic<"HEXAGON_V6_vrsadubi">;
+def int_hexagon_V6_vdmpyhsuisat :
+Hexagon_v16i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsuisat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrsadubi_128B,VD_ftype_VDSISI,3)
-// tag : V6_vrsadubi_128B
-def int_hexagon_V6_vrsadubi_128B :
-Hexagon_v2048v2048ii_Intrinsic<"HEXAGON_V6_vrsadubi_128B">;
+def int_hexagon_V6_vdmpyhsuisat_128B :
+Hexagon_v32i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrsadubi_acc,VD_ftype_VDVDSISI,4)
-// tag : V6_vrsadubi_acc
 def int_hexagon_V6_vrsadubi_acc :
-Hexagon_v1024v1024v1024ii_Intrinsic<"HEXAGON_V6_vrsadubi_acc">;
+Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrsadubi_acc_128B,VD_ftype_VDVDSISI,4)
-// tag : V6_vrsadubi_acc_128B
 def int_hexagon_V6_vrsadubi_acc_128B :
-Hexagon_v2048v2048v2048ii_Intrinsic<"HEXAGON_V6_vrsadubi_acc_128B">;
+Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrw,VI_ftype_VISI,2)
-// tag : V6_vasrw
-def int_hexagon_V6_vasrw :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vasrw">;
+def int_hexagon_V6_vnavgw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vnavgw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrw_128B,VI_ftype_VISI,2)
-// tag : V6_vasrw_128B
-def int_hexagon_V6_vasrw_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vasrw_128B">;
+def int_hexagon_V6_vnavgw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vnavgw_128B">;
 
+def int_hexagon_V6_vnavgh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vnavgh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslw,VI_ftype_VISI,2)
-// tag : V6_vaslw
-def int_hexagon_V6_vaslw :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vaslw">;
+def int_hexagon_V6_vnavgh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vnavgh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslw_128B,VI_ftype_VISI,2)
-// tag : V6_vaslw_128B
-def int_hexagon_V6_vaslw_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vaslw_128B">;
+def int_hexagon_V6_vavgub :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrw,VI_ftype_VISI,2)
-// tag : V6_vlsrw
-def int_hexagon_V6_vlsrw :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vlsrw">;
+def int_hexagon_V6_vavgub_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrw_128B,VI_ftype_VISI,2)
-// tag : V6_vlsrw_128B
-def int_hexagon_V6_vlsrw_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vlsrw_128B">;
+def int_hexagon_V6_vsubb :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwv,VI_ftype_VIVI,2)
-// tag : V6_vasrwv
-def int_hexagon_V6_vasrwv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vasrwv">;
+def int_hexagon_V6_vsubb_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwv_128B,VI_ftype_VIVI,2)
-// tag : V6_vasrwv_128B
-def int_hexagon_V6_vasrwv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vasrwv_128B">;
+def int_hexagon_V6_vgtw_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtw_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslwv,VI_ftype_VIVI,2)
-// tag : V6_vaslwv
-def int_hexagon_V6_vaslwv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaslwv">;
+def int_hexagon_V6_vgtw_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtw_and_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslwv_128B,VI_ftype_VIVI,2)
-// tag : V6_vaslwv_128B
-def int_hexagon_V6_vaslwv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaslwv_128B">;
+def int_hexagon_V6_vavgubrnd :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgubrnd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrwv,VI_ftype_VIVI,2)
-// tag : V6_vlsrwv
-def int_hexagon_V6_vlsrwv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vlsrwv">;
+def int_hexagon_V6_vavgubrnd_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgubrnd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrwv_128B,VI_ftype_VIVI,2)
-// tag : V6_vlsrwv_128B
-def int_hexagon_V6_vlsrwv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vlsrwv_128B">;
+def int_hexagon_V6_vrmpybusv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybusv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrh,VI_ftype_VISI,2)
-// tag : V6_vasrh
-def int_hexagon_V6_vasrh :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vasrh">;
+def int_hexagon_V6_vrmpybusv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybusv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrh_128B,VI_ftype_VISI,2)
-// tag : V6_vasrh_128B
-def int_hexagon_V6_vasrh_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vasrh_128B">;
+def int_hexagon_V6_vsubbnq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubbnq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslh,VI_ftype_VISI,2)
-// tag : V6_vaslh
-def int_hexagon_V6_vaslh :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vaslh">;
+def int_hexagon_V6_vsubbnq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubbnq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslh_128B,VI_ftype_VISI,2)
-// tag : V6_vaslh_128B
-def int_hexagon_V6_vaslh_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vaslh_128B">;
+def int_hexagon_V6_vroundhb :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vroundhb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrh,VI_ftype_VISI,2)
-// tag : V6_vlsrh
-def int_hexagon_V6_vlsrh :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vlsrh">;
+def int_hexagon_V6_vroundhb_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vroundhb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrh_128B,VI_ftype_VISI,2)
-// tag : V6_vlsrh_128B
-def int_hexagon_V6_vlsrh_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vlsrh_128B">;
+def int_hexagon_V6_vadduhsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduhsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhv,VI_ftype_VIVI,2)
-// tag : V6_vasrhv
-def int_hexagon_V6_vasrhv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vasrhv">;
+def int_hexagon_V6_vadduhsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vadduhsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhv_128B,VI_ftype_VIVI,2)
-// tag : V6_vasrhv_128B
-def int_hexagon_V6_vasrhv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vasrhv_128B">;
+def int_hexagon_V6_vsububsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsububsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslhv,VI_ftype_VIVI,2)
-// tag : V6_vaslhv
-def int_hexagon_V6_vaslhv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaslhv">;
+def int_hexagon_V6_vsububsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsububsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslhv_128B,VI_ftype_VIVI,2)
-// tag : V6_vaslhv_128B
-def int_hexagon_V6_vaslhv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaslhv_128B">;
+def int_hexagon_V6_vmpabus_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpabus_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrhv,VI_ftype_VIVI,2)
-// tag : V6_vlsrhv
-def int_hexagon_V6_vlsrhv :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vlsrhv">;
+def int_hexagon_V6_vmpabus_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vmpabus_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrhv_128B,VI_ftype_VIVI,2)
-// tag : V6_vlsrhv_128B
-def int_hexagon_V6_vlsrhv_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vlsrhv_128B">;
+def int_hexagon_V6_vmux :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vmux">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwh,VI_ftype_VIVISI,3)
-// tag : V6_vasrwh
-def int_hexagon_V6_vasrwh :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwh">;
+def int_hexagon_V6_vmux_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vmux_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwh_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrwh_128B
-def int_hexagon_V6_vasrwh_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwh_128B">;
+def int_hexagon_V6_vmpyhus :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhus">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwhsat,VI_ftype_VIVISI,3)
-// tag : V6_vasrwhsat
-def int_hexagon_V6_vasrwhsat :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwhsat">;
+def int_hexagon_V6_vmpyhus_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhus_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwhsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrwhsat_128B
-def int_hexagon_V6_vasrwhsat_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwhsat_128B">;
+def int_hexagon_V6_vpackeb :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackeb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwhrndsat,VI_ftype_VIVISI,3)
-// tag : V6_vasrwhrndsat
-def int_hexagon_V6_vasrwhrndsat :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwhrndsat">;
+def int_hexagon_V6_vpackeb_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackeb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwhrndsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrwhrndsat_128B
-def int_hexagon_V6_vasrwhrndsat_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwhrndsat_128B">;
+def int_hexagon_V6_vsubhnq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubhnq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwuhsat,VI_ftype_VIVISI,3)
-// tag : V6_vasrwuhsat
-def int_hexagon_V6_vasrwuhsat :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwuhsat">;
+def int_hexagon_V6_vsubhnq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhnq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwuhsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrwuhsat_128B
-def int_hexagon_V6_vasrwuhsat_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwuhsat_128B">;
+def int_hexagon_V6_vavghrnd :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavghrnd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vroundwh,VI_ftype_VIVI,2)
-// tag : V6_vroundwh
-def int_hexagon_V6_vroundwh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vroundwh">;
+def int_hexagon_V6_vavghrnd_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavghrnd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vroundwh_128B,VI_ftype_VIVI,2)
-// tag : V6_vroundwh_128B
-def int_hexagon_V6_vroundwh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vroundwh_128B">;
+def int_hexagon_V6_vtran2x2_map :
+Hexagon_v16i32v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vtran2x2_map">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vroundwuh,VI_ftype_VIVI,2)
-// tag : V6_vroundwuh
-def int_hexagon_V6_vroundwuh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vroundwuh">;
+def int_hexagon_V6_vtran2x2_map_128B :
+Hexagon_v32i32v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vtran2x2_map_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vroundwuh_128B,VI_ftype_VIVI,2)
-// tag : V6_vroundwuh_128B
-def int_hexagon_V6_vroundwuh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vroundwuh_128B">;
+def int_hexagon_V6_vdelta :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vdelta">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhubsat,VI_ftype_VIVISI,3)
-// tag : V6_vasrhubsat
-def int_hexagon_V6_vasrhubsat :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhubsat">;
+def int_hexagon_V6_vdelta_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vdelta_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhubsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrhubsat_128B
-def int_hexagon_V6_vasrhubsat_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhubsat_128B">;
+def int_hexagon_V6_vgtuh_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuh_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhubrndsat,VI_ftype_VIVISI,3)
-// tag : V6_vasrhubrndsat
-def int_hexagon_V6_vasrhubrndsat :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhubrndsat">;
+def int_hexagon_V6_vgtuh_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuh_and_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhubrndsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrhubrndsat_128B
-def int_hexagon_V6_vasrhubrndsat_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhubrndsat_128B">;
+def int_hexagon_V6_vtmpyhb :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vtmpyhb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhbrndsat,VI_ftype_VIVISI,3)
-// tag : V6_vasrhbrndsat
-def int_hexagon_V6_vasrhbrndsat :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhbrndsat">;
+def int_hexagon_V6_vtmpyhb_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vtmpyhb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhbrndsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrhbrndsat_128B
-def int_hexagon_V6_vasrhbrndsat_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhbrndsat_128B">;
+def int_hexagon_V6_vpackob :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackob">;
+
+def int_hexagon_V6_vpackob_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackob_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vroundhb,VI_ftype_VIVI,2)
-// tag : V6_vroundhb
-def int_hexagon_V6_vroundhb :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vroundhb">;
+def int_hexagon_V6_vmaxh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vroundhb_128B,VI_ftype_VIVI,2)
-// tag : V6_vroundhb_128B
-def int_hexagon_V6_vroundhb_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vroundhb_128B">;
+def int_hexagon_V6_vmaxh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vroundhub,VI_ftype_VIVI,2)
-// tag : V6_vroundhub
-def int_hexagon_V6_vroundhub :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vroundhub">;
+def int_hexagon_V6_vtmpybus_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vtmpybus_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vroundhub_128B,VI_ftype_VIVI,2)
-// tag : V6_vroundhub_128B
-def int_hexagon_V6_vroundhub_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vroundhub_128B">;
+def int_hexagon_V6_vtmpybus_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vtmpybus_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslw_acc,VI_ftype_VIVISI,3)
-// tag : V6_vaslw_acc
-def int_hexagon_V6_vaslw_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vaslw_acc">;
+def int_hexagon_V6_vsubuhsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubuhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslw_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vaslw_acc_128B
-def int_hexagon_V6_vaslw_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vaslw_acc_128B">;
+def int_hexagon_V6_vsubuhsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrw_acc,VI_ftype_VIVISI,3)
-// tag : V6_vasrw_acc
 def int_hexagon_V6_vasrw_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrw_acc">;
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrw_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrw_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrw_acc_128B
 def int_hexagon_V6_vasrw_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrw_acc_128B">;
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrw_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddb,VI_ftype_VIVI,2)
-// tag : V6_vaddb
-def int_hexagon_V6_vaddb :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddb">;
+def int_hexagon_V6_pred_or :
+Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddb_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddb_128B
-def int_hexagon_V6_vaddb_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddb_128B">;
+def int_hexagon_V6_pred_or_128B :
+Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_or_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubb,VI_ftype_VIVI,2)
-// tag : V6_vsubb
-def int_hexagon_V6_vsubb :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubb">;
+def int_hexagon_V6_vrmpyub_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vrmpyub_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubb_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubb_128B
-def int_hexagon_V6_vsubb_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubb_128B">;
+def int_hexagon_V6_vrmpyub_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vrmpyub_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddb_dv,VD_ftype_VDVD,2)
-// tag : V6_vaddb_dv
-def int_hexagon_V6_vaddb_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddb_dv">;
+def int_hexagon_V6_lo :
+Hexagon_v16i32_v32i32_Intrinsic<"HEXAGON_V6_lo">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddb_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vaddb_dv_128B
-def int_hexagon_V6_vaddb_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddb_dv_128B">;
+def int_hexagon_V6_lo_128B :
+Hexagon_v32i32_v64i32_Intrinsic<"HEXAGON_V6_lo_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubb_dv,VD_ftype_VDVD,2)
-// tag : V6_vsubb_dv
 def int_hexagon_V6_vsubb_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubb_dv">;
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubb_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubb_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsubb_dv_128B
 def int_hexagon_V6_vsubb_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubb_dv_128B">;
-
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddh,VI_ftype_VIVI,2)
-// tag : V6_vaddh
-def int_hexagon_V6_vaddh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddh">;
-
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddh_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddh_128B
-def int_hexagon_V6_vaddh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddh_128B">;
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubb_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubh,VI_ftype_VIVI,2)
-// tag : V6_vsubh
-def int_hexagon_V6_vsubh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubh">;
+def int_hexagon_V6_vsubhsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubh_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubh_128B
-def int_hexagon_V6_vsubh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubh_128B">;
+def int_hexagon_V6_vsubhsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubhsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddh_dv,VD_ftype_VDVD,2)
-// tag : V6_vaddh_dv
-def int_hexagon_V6_vaddh_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddh_dv">;
+def int_hexagon_V6_vmpyiwh :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddh_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vaddh_dv_128B
-def int_hexagon_V6_vaddh_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddh_dv_128B">;
+def int_hexagon_V6_vmpyiwh_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubh_dv,VD_ftype_VDVD,2)
-// tag : V6_vsubh_dv
-def int_hexagon_V6_vsubh_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubh_dv">;
+def int_hexagon_V6_vmpyiwb :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubh_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsubh_dv_128B
-def int_hexagon_V6_vsubh_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubh_dv_128B">;
+def int_hexagon_V6_vmpyiwb_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddw,VI_ftype_VIVI,2)
-// tag : V6_vaddw
-def int_hexagon_V6_vaddw :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddw">;
+def int_hexagon_V6_ldu0 :
+Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_ldu0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddw_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddw_128B
-def int_hexagon_V6_vaddw_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddw_128B">;
+def int_hexagon_V6_ldu0_128B :
+Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_ldu0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubw,VI_ftype_VIVI,2)
-// tag : V6_vsubw
-def int_hexagon_V6_vsubw :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubw">;
+def int_hexagon_V6_vgtuh_xor :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuh_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubw_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubw_128B
-def int_hexagon_V6_vsubw_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubw_128B">;
+def int_hexagon_V6_vgtuh_xor_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuh_xor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddw_dv,VD_ftype_VDVD,2)
-// tag : V6_vaddw_dv
-def int_hexagon_V6_vaddw_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddw_dv">;
+def int_hexagon_V6_vgth_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgth_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddw_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vaddw_dv_128B
-def int_hexagon_V6_vaddw_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddw_dv_128B">;
+def int_hexagon_V6_vgth_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgth_or_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubw_dv,VD_ftype_VDVD,2)
-// tag : V6_vsubw_dv
-def int_hexagon_V6_vsubw_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubw_dv">;
+def int_hexagon_V6_vavgh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubw_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsubw_dv_128B
-def int_hexagon_V6_vsubw_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubw_dv_128B">;
+def int_hexagon_V6_vavgh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddubsat,VI_ftype_VIVI,2)
-// tag : V6_vaddubsat
-def int_hexagon_V6_vaddubsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddubsat">;
+def int_hexagon_V6_vlalignb :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlalignb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddubsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddubsat_128B
-def int_hexagon_V6_vaddubsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddubsat_128B">;
+def int_hexagon_V6_vlalignb_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlalignb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddubsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vaddubsat_dv
-def int_hexagon_V6_vaddubsat_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddubsat_dv">;
+def int_hexagon_V6_vsh :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vsh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddubsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vaddubsat_dv_128B
-def int_hexagon_V6_vaddubsat_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddubsat_dv_128B">;
+def int_hexagon_V6_vsh_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vsh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsububsat,VI_ftype_VIVI,2)
-// tag : V6_vsububsat
-def int_hexagon_V6_vsububsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsububsat">;
+def int_hexagon_V6_pred_and_n :
+Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_and_n">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsububsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vsububsat_128B
-def int_hexagon_V6_vsububsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsububsat_128B">;
+def int_hexagon_V6_pred_and_n_128B :
+Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_and_n_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsububsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vsububsat_dv
-def int_hexagon_V6_vsububsat_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsububsat_dv">;
+def int_hexagon_V6_vsb :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vsb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsububsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsububsat_dv_128B
-def int_hexagon_V6_vsububsat_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsububsat_dv_128B">;
+def int_hexagon_V6_vsb_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vsb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduhsat,VI_ftype_VIVI,2)
-// tag : V6_vadduhsat
-def int_hexagon_V6_vadduhsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vadduhsat">;
+def int_hexagon_V6_vroundwuh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vroundwuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduhsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vadduhsat_128B
-def int_hexagon_V6_vadduhsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduhsat_128B">;
+def int_hexagon_V6_vroundwuh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vroundwuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduhsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vadduhsat_dv
-def int_hexagon_V6_vadduhsat_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduhsat_dv">;
+def int_hexagon_V6_vasrhv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vasrhv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduhsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vadduhsat_dv_128B
-def int_hexagon_V6_vadduhsat_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vadduhsat_dv_128B">;
+def int_hexagon_V6_vasrhv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vasrhv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuhsat,VI_ftype_VIVI,2)
-// tag : V6_vsubuhsat
-def int_hexagon_V6_vsubuhsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubuhsat">;
+def int_hexagon_V6_vshuffh :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vshuffh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuhsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubuhsat_128B
-def int_hexagon_V6_vsubuhsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuhsat_128B">;
+def int_hexagon_V6_vshuffh_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vshuffh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuhsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vsubuhsat_dv
-def int_hexagon_V6_vsubuhsat_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuhsat_dv">;
+def int_hexagon_V6_vaddhsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuhsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsubuhsat_dv_128B
-def int_hexagon_V6_vsubuhsat_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubuhsat_dv_128B">;
+def int_hexagon_V6_vaddhsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddhsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhsat,VI_ftype_VIVI,2)
-// tag : V6_vaddhsat
-def int_hexagon_V6_vaddhsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddhsat">;
+def int_hexagon_V6_vnavgub :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vnavgub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddhsat_128B
-def int_hexagon_V6_vaddhsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddhsat_128B">;
+def int_hexagon_V6_vnavgub_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vnavgub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vaddhsat_dv
-def int_hexagon_V6_vaddhsat_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddhsat_dv">;
+def int_hexagon_V6_vrmpybv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vaddhsat_dv_128B
-def int_hexagon_V6_vaddhsat_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddhsat_dv_128B">;
+def int_hexagon_V6_vrmpybv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhsat,VI_ftype_VIVI,2)
-// tag : V6_vsubhsat
-def int_hexagon_V6_vsubhsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubhsat">;
+def int_hexagon_V6_vnormamth :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vnormamth">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubhsat_128B
-def int_hexagon_V6_vsubhsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubhsat_128B">;
+def int_hexagon_V6_vnormamth_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vnormamth_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vsubhsat_dv
-def int_hexagon_V6_vsubhsat_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubhsat_dv">;
+def int_hexagon_V6_vdmpyhb :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsubhsat_dv_128B
-def int_hexagon_V6_vsubhsat_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubhsat_dv_128B">;
+def int_hexagon_V6_vdmpyhb_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddwsat,VI_ftype_VIVI,2)
-// tag : V6_vaddwsat
-def int_hexagon_V6_vaddwsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddwsat">;
+def int_hexagon_V6_vavguh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavguh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddwsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddwsat_128B
-def int_hexagon_V6_vaddwsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddwsat_128B">;
+def int_hexagon_V6_vavguh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavguh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddwsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vaddwsat_dv
-def int_hexagon_V6_vaddwsat_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddwsat_dv">;
+def int_hexagon_V6_vlsrwv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vlsrwv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddwsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vaddwsat_dv_128B
-def int_hexagon_V6_vaddwsat_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddwsat_dv_128B">;
+def int_hexagon_V6_vlsrwv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vlsrwv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubwsat,VI_ftype_VIVI,2)
-// tag : V6_vsubwsat
-def int_hexagon_V6_vsubwsat :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubwsat">;
+def int_hexagon_V6_vlsrhv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vlsrhv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubwsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubwsat_128B
-def int_hexagon_V6_vsubwsat_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubwsat_128B">;
+def int_hexagon_V6_vlsrhv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vlsrhv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubwsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vsubwsat_dv
-def int_hexagon_V6_vsubwsat_dv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubwsat_dv">;
+def int_hexagon_V6_vdmpyhisat :
+Hexagon_v16i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhisat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubwsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsubwsat_dv_128B
-def int_hexagon_V6_vsubwsat_dv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubwsat_dv_128B">;
+def int_hexagon_V6_vdmpyhisat_128B :
+Hexagon_v32i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhisat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgub,VI_ftype_VIVI,2)
-// tag : V6_vavgub
-def int_hexagon_V6_vavgub :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgub">;
+def int_hexagon_V6_vdmpyhvsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vdmpyhvsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgub_128B,VI_ftype_VIVI,2)
-// tag : V6_vavgub_128B
-def int_hexagon_V6_vavgub_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgub_128B">;
+def int_hexagon_V6_vdmpyhvsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vdmpyhvsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgubrnd,VI_ftype_VIVI,2)
-// tag : V6_vavgubrnd
-def int_hexagon_V6_vavgubrnd :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgubrnd">;
+def int_hexagon_V6_vaddw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgubrnd_128B,VI_ftype_VIVI,2)
-// tag : V6_vavgubrnd_128B
-def int_hexagon_V6_vavgubrnd_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgubrnd_128B">;
+def int_hexagon_V6_vaddw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavguh,VI_ftype_VIVI,2)
-// tag : V6_vavguh
-def int_hexagon_V6_vavguh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavguh">;
+def int_hexagon_V6_vzh :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vzh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavguh_128B,VI_ftype_VIVI,2)
-// tag : V6_vavguh_128B
-def int_hexagon_V6_vavguh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavguh_128B">;
+def int_hexagon_V6_vzh_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vzh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavguhrnd,VI_ftype_VIVI,2)
-// tag : V6_vavguhrnd
-def int_hexagon_V6_vavguhrnd :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavguhrnd">;
+def int_hexagon_V6_vaddh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavguhrnd_128B,VI_ftype_VIVI,2)
-// tag : V6_vavguhrnd_128B
-def int_hexagon_V6_vavguhrnd_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavguhrnd_128B">;
+def int_hexagon_V6_vaddh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgh,VI_ftype_VIVI,2)
-// tag : V6_vavgh
-def int_hexagon_V6_vavgh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgh">;
+def int_hexagon_V6_vmaxub :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgh_128B,VI_ftype_VIVI,2)
-// tag : V6_vavgh_128B
-def int_hexagon_V6_vavgh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgh_128B">;
+def int_hexagon_V6_vmaxub_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavghrnd,VI_ftype_VIVI,2)
-// tag : V6_vavghrnd
-def int_hexagon_V6_vavghrnd :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavghrnd">;
+def int_hexagon_V6_vmpyhv_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavghrnd_128B,VI_ftype_VIVI,2)
-// tag : V6_vavghrnd_128B
-def int_hexagon_V6_vavghrnd_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavghrnd_128B">;
+def int_hexagon_V6_vmpyhv_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnavgh,VI_ftype_VIVI,2)
-// tag : V6_vnavgh
-def int_hexagon_V6_vnavgh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vnavgh">;
+def int_hexagon_V6_vadduhsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vadduhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnavgh_128B,VI_ftype_VIVI,2)
-// tag : V6_vnavgh_128B
-def int_hexagon_V6_vnavgh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vnavgh_128B">;
+def int_hexagon_V6_vadduhsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgw,VI_ftype_VIVI,2)
-// tag : V6_vavgw
-def int_hexagon_V6_vavgw :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgw">;
+def int_hexagon_V6_vshufoeh :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshufoeh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgw_128B,VI_ftype_VIVI,2)
-// tag : V6_vavgw_128B
-def int_hexagon_V6_vavgw_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgw_128B">;
+def int_hexagon_V6_vshufoeh_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshufoeh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgwrnd,VI_ftype_VIVI,2)
-// tag : V6_vavgwrnd
-def int_hexagon_V6_vavgwrnd :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgwrnd">;
+def int_hexagon_V6_vmpyuhv_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyuhv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgwrnd_128B,VI_ftype_VIVI,2)
-// tag : V6_vavgwrnd_128B
-def int_hexagon_V6_vavgwrnd_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgwrnd_128B">;
+def int_hexagon_V6_vmpyuhv_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyuhv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnavgw,VI_ftype_VIVI,2)
-// tag : V6_vnavgw
-def int_hexagon_V6_vnavgw :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vnavgw">;
+def int_hexagon_V6_veqh :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnavgw_128B,VI_ftype_VIVI,2)
-// tag : V6_vnavgw_128B
-def int_hexagon_V6_vnavgw_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vnavgw_128B">;
+def int_hexagon_V6_veqh_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsdiffub,VI_ftype_VIVI,2)
-// tag : V6_vabsdiffub
-def int_hexagon_V6_vabsdiffub :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vabsdiffub">;
+def int_hexagon_V6_vmpabuuv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpabuuv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsdiffub_128B,VI_ftype_VIVI,2)
-// tag : V6_vabsdiffub_128B
-def int_hexagon_V6_vabsdiffub_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vabsdiffub_128B">;
+def int_hexagon_V6_vmpabuuv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vmpabuuv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsdiffuh,VI_ftype_VIVI,2)
-// tag : V6_vabsdiffuh
-def int_hexagon_V6_vabsdiffuh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vabsdiffuh">;
+def int_hexagon_V6_vasrwhsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsdiffuh_128B,VI_ftype_VIVI,2)
-// tag : V6_vabsdiffuh_128B
-def int_hexagon_V6_vabsdiffuh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vabsdiffuh_128B">;
+def int_hexagon_V6_vasrwhsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsdiffh,VI_ftype_VIVI,2)
-// tag : V6_vabsdiffh
-def int_hexagon_V6_vabsdiffh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vabsdiffh">;
+def int_hexagon_V6_vminuh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsdiffh_128B,VI_ftype_VIVI,2)
-// tag : V6_vabsdiffh_128B
-def int_hexagon_V6_vabsdiffh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vabsdiffh_128B">;
+def int_hexagon_V6_vminuh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsdiffw,VI_ftype_VIVI,2)
-// tag : V6_vabsdiffw
-def int_hexagon_V6_vabsdiffw :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vabsdiffw">;
+def int_hexagon_V6_vror :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vror">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsdiffw_128B,VI_ftype_VIVI,2)
-// tag : V6_vabsdiffw_128B
-def int_hexagon_V6_vabsdiffw_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vabsdiffw_128B">;
+def int_hexagon_V6_vror_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vror_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnavgub,VI_ftype_VIVI,2)
-// tag : V6_vnavgub
-def int_hexagon_V6_vnavgub :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vnavgub">;
+def int_hexagon_V6_vmpyowh_rnd_sacc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_sacc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnavgub_128B,VI_ftype_VIVI,2)
-// tag : V6_vnavgub_128B
-def int_hexagon_V6_vnavgub_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vnavgub_128B">;
+def int_hexagon_V6_vmpyowh_rnd_sacc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_sacc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddubh,VD_ftype_VIVI,2)
-// tag : V6_vaddubh
-def int_hexagon_V6_vaddubh :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vaddubh">;
+def int_hexagon_V6_vmaxuh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddubh_128B,VD_ftype_VIVI,2)
-// tag : V6_vaddubh_128B
-def int_hexagon_V6_vaddubh_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddubh_128B">;
+def int_hexagon_V6_vmaxuh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsububh,VD_ftype_VIVI,2)
-// tag : V6_vsububh
-def int_hexagon_V6_vsububh :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vsububh">;
+def int_hexagon_V6_vabsh_sat :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsh_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsububh_128B,VD_ftype_VIVI,2)
-// tag : V6_vsububh_128B
-def int_hexagon_V6_vsububh_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vsububh_128B">;
+def int_hexagon_V6_vabsh_sat_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsh_sat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhw,VD_ftype_VIVI,2)
-// tag : V6_vaddhw
-def int_hexagon_V6_vaddhw :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vaddhw">;
+def int_hexagon_V6_pred_or_n :
+Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_or_n">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhw_128B,VD_ftype_VIVI,2)
-// tag : V6_vaddhw_128B
-def int_hexagon_V6_vaddhw_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddhw_128B">;
+def int_hexagon_V6_pred_or_n_128B :
+Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_or_n_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhw,VD_ftype_VIVI,2)
-// tag : V6_vsubhw
-def int_hexagon_V6_vsubhw :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vsubhw">;
+def int_hexagon_V6_vdealb :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vdealb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhw_128B,VD_ftype_VIVI,2)
-// tag : V6_vsubhw_128B
-def int_hexagon_V6_vsubhw_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vsubhw_128B">;
+def int_hexagon_V6_vdealb_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vdealb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduhw,VD_ftype_VIVI,2)
-// tag : V6_vadduhw
-def int_hexagon_V6_vadduhw :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vadduhw">;
+def int_hexagon_V6_vmpybusv :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpybusv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduhw_128B,VD_ftype_VIVI,2)
-// tag : V6_vadduhw_128B
-def int_hexagon_V6_vadduhw_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vadduhw_128B">;
+def int_hexagon_V6_vmpybusv_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpybusv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuhw,VD_ftype_VIVI,2)
-// tag : V6_vsubuhw
-def int_hexagon_V6_vsubuhw :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vsubuhw">;
+def int_hexagon_V6_vzb :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vzb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuhw_128B,VD_ftype_VIVI,2)
-// tag : V6_vsubuhw_128B
-def int_hexagon_V6_vsubuhw_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vsubuhw_128B">;
+def int_hexagon_V6_vzb_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vzb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vd0,VI_ftype_,0)
-// tag : V6_vd0
-def int_hexagon_V6_vd0 :
-Hexagon_v512_Intrinsic<"HEXAGON_V6_vd0">;
+def int_hexagon_V6_vdmpybus_dv :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vd0_128B,VI_ftype_,0)
-// tag : V6_vd0_128B
-def int_hexagon_V6_vd0_128B :
-Hexagon_v1024_Intrinsic<"HEXAGON_V6_vd0_128B">;
+def int_hexagon_V6_vdmpybus_dv_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddbq,VI_ftype_QVVIVI,3)
-// tag : V6_vaddbq
 def int_hexagon_V6_vaddbq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddbq">;
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddbq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddbq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vaddbq_128B
 def int_hexagon_V6_vaddbq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddbq_128B">;
-
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddbq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubbq,VI_ftype_QVVIVI,3)
-// tag : V6_vsubbq
-def int_hexagon_V6_vsubbq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubbq">;
+def int_hexagon_V6_vaddb :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubbq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vsubbq_128B
-def int_hexagon_V6_vsubbq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubbq_128B">;
+def int_hexagon_V6_vaddb_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddbnq,VI_ftype_QVVIVI,3)
-// tag : V6_vaddbnq
-def int_hexagon_V6_vaddbnq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddbnq">;
+def int_hexagon_V6_vaddwq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddwq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddbnq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vaddbnq_128B
-def int_hexagon_V6_vaddbnq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddbnq_128B">;
+def int_hexagon_V6_vaddwq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddwq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubbnq,VI_ftype_QVVIVI,3)
-// tag : V6_vsubbnq
-def int_hexagon_V6_vsubbnq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubbnq">;
+def int_hexagon_V6_vasrhubrndsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrhubrndsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubbnq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vsubbnq_128B
-def int_hexagon_V6_vsubbnq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubbnq_128B">;
+def int_hexagon_V6_vasrhubrndsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrhubrndsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhq,VI_ftype_QVVIVI,3)
-// tag : V6_vaddhq
-def int_hexagon_V6_vaddhq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddhq">;
+def int_hexagon_V6_vasrhubsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrhubsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vaddhq_128B
-def int_hexagon_V6_vaddhq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddhq_128B">;
+def int_hexagon_V6_vasrhubsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrhubsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhq,VI_ftype_QVVIVI,3)
-// tag : V6_vsubhq
-def int_hexagon_V6_vsubhq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubhq">;
+def int_hexagon_V6_vshufoeb :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshufoeb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vsubhq_128B
-def int_hexagon_V6_vsubhq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubhq_128B">;
+def int_hexagon_V6_vshufoeb_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshufoeb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhnq,VI_ftype_QVVIVI,3)
-// tag : V6_vaddhnq
-def int_hexagon_V6_vaddhnq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddhnq">;
+def int_hexagon_V6_vpackhub_sat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackhub_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhnq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vaddhnq_128B
-def int_hexagon_V6_vaddhnq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddhnq_128B">;
+def int_hexagon_V6_vpackhub_sat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackhub_sat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhnq,VI_ftype_QVVIVI,3)
-// tag : V6_vsubhnq
-def int_hexagon_V6_vsubhnq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubhnq">;
+def int_hexagon_V6_vmpyiwh_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwh_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubhnq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vsubhnq_128B
-def int_hexagon_V6_vsubhnq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubhnq_128B">;
+def int_hexagon_V6_vmpyiwh_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwh_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddwq,VI_ftype_QVVIVI,3)
-// tag : V6_vaddwq
-def int_hexagon_V6_vaddwq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddwq">;
+def int_hexagon_V6_vtmpyb :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vtmpyb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddwq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vaddwq_128B
-def int_hexagon_V6_vaddwq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddwq_128B">;
+def int_hexagon_V6_vtmpyb_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vtmpyb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubwq,VI_ftype_QVVIVI,3)
-// tag : V6_vsubwq
-def int_hexagon_V6_vsubwq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubwq">;
+def int_hexagon_V6_vmpabusv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpabusv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubwq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vsubwq_128B
-def int_hexagon_V6_vsubwq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubwq_128B">;
+def int_hexagon_V6_vmpabusv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vmpabusv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddwnq,VI_ftype_QVVIVI,3)
-// tag : V6_vaddwnq
-def int_hexagon_V6_vaddwnq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddwnq">;
+def int_hexagon_V6_pred_and :
+Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddwnq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vaddwnq_128B
-def int_hexagon_V6_vaddwnq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddwnq_128B">;
+def int_hexagon_V6_pred_and_128B :
+Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_and_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubwnq,VI_ftype_QVVIVI,3)
-// tag : V6_vsubwnq
 def int_hexagon_V6_vsubwnq :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubwnq">;
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubwnq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubwnq_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vsubwnq_128B
 def int_hexagon_V6_vsubwnq_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubwnq_128B">;
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubwnq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsh,VI_ftype_VI,1)
-// tag : V6_vabsh
-def int_hexagon_V6_vabsh :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vabsh">;
+def int_hexagon_V6_vpackwuh_sat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackwuh_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsh_128B,VI_ftype_VI,1)
-// tag : V6_vabsh_128B
-def int_hexagon_V6_vabsh_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vabsh_128B">;
+def int_hexagon_V6_vpackwuh_sat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackwuh_sat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsh_sat,VI_ftype_VI,1)
-// tag : V6_vabsh_sat
-def int_hexagon_V6_vabsh_sat :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vabsh_sat">;
+def int_hexagon_V6_vswap :
+Hexagon_v32i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vswap">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsh_sat_128B,VI_ftype_VI,1)
-// tag : V6_vabsh_sat_128B
-def int_hexagon_V6_vabsh_sat_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vabsh_sat_128B">;
+def int_hexagon_V6_vswap_128B :
+Hexagon_v64i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vswap_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsw,VI_ftype_VI,1)
-// tag : V6_vabsw
-def int_hexagon_V6_vabsw :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vabsw">;
+def int_hexagon_V6_vrmpyubv_acc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpyubv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsw_128B,VI_ftype_VI,1)
-// tag : V6_vabsw_128B
-def int_hexagon_V6_vabsw_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vabsw_128B">;
+def int_hexagon_V6_vrmpyubv_acc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpyubv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsw_sat,VI_ftype_VI,1)
-// tag : V6_vabsw_sat
-def int_hexagon_V6_vabsw_sat :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vabsw_sat">;
+def int_hexagon_V6_vgtb_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtb_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsw_sat_128B,VI_ftype_VI,1)
-// tag : V6_vabsw_sat_128B
-def int_hexagon_V6_vabsw_sat_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vabsw_sat_128B">;
+def int_hexagon_V6_vgtb_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtb_and_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybv,VD_ftype_VIVI,2)
-// tag : V6_vmpybv
-def int_hexagon_V6_vmpybv :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpybv">;
+def int_hexagon_V6_vaslw :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vaslw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybv_128B,VD_ftype_VIVI,2)
-// tag : V6_vmpybv_128B
-def int_hexagon_V6_vmpybv_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpybv_128B">;
+def int_hexagon_V6_vaslw_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vaslw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybv_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vmpybv_acc
-def int_hexagon_V6_vmpybv_acc :
-Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpybv_acc">;
+def int_hexagon_V6_vpackhb_sat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackhb_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybv_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vmpybv_acc_128B
-def int_hexagon_V6_vmpybv_acc_128B :
-Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpybv_acc_128B">;
+def int_hexagon_V6_vpackhb_sat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackhb_sat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyubv,VD_ftype_VIVI,2)
-// tag : V6_vmpyubv
-def int_hexagon_V6_vmpyubv :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyubv">;
+def int_hexagon_V6_vmpyih_acc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyih_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyubv_128B,VD_ftype_VIVI,2)
-// tag : V6_vmpyubv_128B
-def int_hexagon_V6_vmpyubv_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyubv_128B">;
+def int_hexagon_V6_vmpyih_acc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyih_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyubv_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyubv_acc
-def int_hexagon_V6_vmpyubv_acc :
-Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyubv_acc">;
+def int_hexagon_V6_vshuffvdd :
+Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vshuffvdd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyubv_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyubv_acc_128B
-def int_hexagon_V6_vmpyubv_acc_128B :
-Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyubv_acc_128B">;
+def int_hexagon_V6_vshuffvdd_128B :
+Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vshuffvdd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybusv,VD_ftype_VIVI,2)
-// tag : V6_vmpybusv
-def int_hexagon_V6_vmpybusv :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpybusv">;
+def int_hexagon_V6_vaddb_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddb_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybusv_128B,VD_ftype_VIVI,2)
-// tag : V6_vmpybusv_128B
-def int_hexagon_V6_vmpybusv_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpybusv_128B">;
+def int_hexagon_V6_vaddb_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddb_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybusv_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vmpybusv_acc
-def int_hexagon_V6_vmpybusv_acc :
-Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpybusv_acc">;
+def int_hexagon_V6_vunpackub :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vunpackub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybusv_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vmpybusv_acc_128B
-def int_hexagon_V6_vmpybusv_acc_128B :
-Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpybusv_acc_128B">;
+def int_hexagon_V6_vunpackub_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vunpackub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabusv,VD_ftype_VDVD,2)
-// tag : V6_vmpabusv
-def int_hexagon_V6_vmpabusv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpabusv">;
+def int_hexagon_V6_vgtuw :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabusv_128B,VD_ftype_VDVD,2)
-// tag : V6_vmpabusv_128B
-def int_hexagon_V6_vmpabusv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vmpabusv_128B">;
+def int_hexagon_V6_vgtuw_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabuuv,VD_ftype_VDVD,2)
-// tag : V6_vmpabuuv
-def int_hexagon_V6_vmpabuuv :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpabuuv">;
+def int_hexagon_V6_vlutvwh :
+Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabuuv_128B,VD_ftype_VDVD,2)
-// tag : V6_vmpabuuv_128B
-def int_hexagon_V6_vmpabuuv_128B :
-Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vmpabuuv_128B">;
+def int_hexagon_V6_vlutvwh_128B :
+Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhv,VD_ftype_VIVI,2)
-// tag : V6_vmpyhv
-def int_hexagon_V6_vmpyhv :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyhv">;
+def int_hexagon_V6_vgtub :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhv_128B,VD_ftype_VIVI,2)
-// tag : V6_vmpyhv_128B
-def int_hexagon_V6_vmpyhv_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhv_128B">;
+def int_hexagon_V6_vgtub_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhv_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyhv_acc
-def int_hexagon_V6_vmpyhv_acc :
-Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyhv_acc">;
+def int_hexagon_V6_vmpyowh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhv_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyhv_acc_128B
-def int_hexagon_V6_vmpyhv_acc_128B :
-Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhv_acc_128B">;
+def int_hexagon_V6_vmpyowh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuhv,VD_ftype_VIVI,2)
-// tag : V6_vmpyuhv
-def int_hexagon_V6_vmpyuhv :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyuhv">;
+def int_hexagon_V6_vmpyieoh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyieoh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuhv_128B,VD_ftype_VIVI,2)
-// tag : V6_vmpyuhv_128B
-def int_hexagon_V6_vmpyuhv_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyuhv_128B">;
+def int_hexagon_V6_vmpyieoh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyieoh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuhv_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyuhv_acc
-def int_hexagon_V6_vmpyuhv_acc :
-Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyuhv_acc">;
+def int_hexagon_V6_extractw :
+Hexagon_i32_v16i32i32_Intrinsic<"HEXAGON_V6_extractw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuhv_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyuhv_acc_128B
-def int_hexagon_V6_vmpyuhv_acc_128B :
-Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyuhv_acc_128B">;
+def int_hexagon_V6_extractw_128B :
+Hexagon_i32_v32i32i32_Intrinsic<"HEXAGON_V6_extractw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhvsrs,VI_ftype_VIVI,2)
-// tag : V6_vmpyhvsrs
-def int_hexagon_V6_vmpyhvsrs :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyhvsrs">;
+def int_hexagon_V6_vavgwrnd :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgwrnd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhvsrs_128B,VI_ftype_VIVI,2)
-// tag : V6_vmpyhvsrs_128B
-def int_hexagon_V6_vmpyhvsrs_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhvsrs_128B">;
+def int_hexagon_V6_vavgwrnd_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgwrnd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhus,VD_ftype_VIVI,2)
-// tag : V6_vmpyhus
-def int_hexagon_V6_vmpyhus :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyhus">;
+def int_hexagon_V6_vdmpyhsat_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsat_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhus_128B,VD_ftype_VIVI,2)
-// tag : V6_vmpyhus_128B
-def int_hexagon_V6_vmpyhus_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhus_128B">;
+def int_hexagon_V6_vdmpyhsat_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsat_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhus_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyhus_acc
-def int_hexagon_V6_vmpyhus_acc :
-Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyhus_acc">;
+def int_hexagon_V6_vgtub_xor :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtub_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhus_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyhus_acc_128B
-def int_hexagon_V6_vmpyhus_acc_128B :
-Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhus_acc_128B">;
+def int_hexagon_V6_vgtub_xor_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtub_xor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyih,VI_ftype_VIVI,2)
-// tag : V6_vmpyih
-def int_hexagon_V6_vmpyih :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyih">;
+def int_hexagon_V6_vmpyub :
+Hexagon_v32i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyih_128B,VI_ftype_VIVI,2)
-// tag : V6_vmpyih_128B
-def int_hexagon_V6_vmpyih_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyih_128B">;
+def int_hexagon_V6_vmpyub_128B :
+Hexagon_v64i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyub_128B">;
+
+def int_hexagon_V6_vmpyuh :
+Hexagon_v32i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyuh">;
+
+def int_hexagon_V6_vmpyuh_128B :
+Hexagon_v64i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyuh_128B">;
+
+def int_hexagon_V6_vunpackob :
+Hexagon_v32i32_v32i32v16i32_Intrinsic<"HEXAGON_V6_vunpackob">;
+
+def int_hexagon_V6_vunpackob_128B :
+Hexagon_v64i32_v64i32v32i32_Intrinsic<"HEXAGON_V6_vunpackob_128B">;
+
+def int_hexagon_V6_vmpahb :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpahb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyih_acc,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyih_acc
-def int_hexagon_V6_vmpyih_acc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyih_acc">;
+def int_hexagon_V6_vmpahb_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vmpahb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyih_acc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyih_acc_128B
-def int_hexagon_V6_vmpyih_acc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyih_acc_128B">;
+def int_hexagon_V6_veqw_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqw_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyewuh,VI_ftype_VIVI,2)
-// tag : V6_vmpyewuh
-def int_hexagon_V6_vmpyewuh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyewuh">;
+def int_hexagon_V6_veqw_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqw_or_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyewuh_128B,VI_ftype_VIVI,2)
-// tag : V6_vmpyewuh_128B
-def int_hexagon_V6_vmpyewuh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyewuh_128B">;
+def int_hexagon_V6_vandqrt :
+Hexagon_v16i32_v512i1i32_Intrinsic<"HEXAGON_V6_vandqrt">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh,VI_ftype_VIVI,2)
-// tag : V6_vmpyowh
-def int_hexagon_V6_vmpyowh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyowh">;
+def int_hexagon_V6_vandqrt_128B :
+Hexagon_v32i32_v1024i1i32_Intrinsic<"HEXAGON_V6_vandqrt_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_128B,VI_ftype_VIVI,2)
-// tag : V6_vmpyowh_128B
-def int_hexagon_V6_vmpyowh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_128B">;
+def int_hexagon_V6_vxor :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vxor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_rnd,VI_ftype_VIVI,2)
-// tag : V6_vmpyowh_rnd
-def int_hexagon_V6_vmpyowh_rnd :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_rnd">;
+def int_hexagon_V6_vxor_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vxor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_rnd_128B,VI_ftype_VIVI,2)
-// tag : V6_vmpyowh_rnd_128B
-def int_hexagon_V6_vmpyowh_rnd_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_128B">;
+def int_hexagon_V6_vasrwhrndsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwhrndsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_sacc,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyowh_sacc
-def int_hexagon_V6_vmpyowh_sacc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_sacc">;
+def int_hexagon_V6_vasrwhrndsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwhrndsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_sacc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyowh_sacc_128B
-def int_hexagon_V6_vmpyowh_sacc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_sacc_128B">;
+def int_hexagon_V6_vmpyhsat_acc :
+Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyhsat_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_rnd_sacc,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyowh_rnd_sacc
-def int_hexagon_V6_vmpyowh_rnd_sacc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_sacc">;
+def int_hexagon_V6_vmpyhsat_acc_128B :
+Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyhsat_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_rnd_sacc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyowh_rnd_sacc_128B
-def int_hexagon_V6_vmpyowh_rnd_sacc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_sacc_128B">;
+def int_hexagon_V6_vrmpybus_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vrmpybus_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyieoh,VI_ftype_VIVI,2)
-// tag : V6_vmpyieoh
-def int_hexagon_V6_vmpyieoh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyieoh">;
+def int_hexagon_V6_vrmpybus_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vrmpybus_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyieoh_128B,VI_ftype_VIVI,2)
-// tag : V6_vmpyieoh_128B
-def int_hexagon_V6_vmpyieoh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyieoh_128B">;
+def int_hexagon_V6_vsubhw :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubhw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiewuh,VI_ftype_VIVI,2)
-// tag : V6_vmpyiewuh
-def int_hexagon_V6_vmpyiewuh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyiewuh">;
+def int_hexagon_V6_vsubhw_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiewuh_128B,VI_ftype_VIVI,2)
-// tag : V6_vmpyiewuh_128B
-def int_hexagon_V6_vmpyiewuh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyiewuh_128B">;
+def int_hexagon_V6_vdealb4w :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vdealb4w">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiowh,VI_ftype_VIVI,2)
-// tag : V6_vmpyiowh
-def int_hexagon_V6_vmpyiowh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyiowh">;
+def int_hexagon_V6_vdealb4w_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vdealb4w_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiowh_128B,VI_ftype_VIVI,2)
-// tag : V6_vmpyiowh_128B
-def int_hexagon_V6_vmpyiowh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyiowh_128B">;
+def int_hexagon_V6_vmpyowh_sacc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh_sacc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiewh_acc,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyiewh_acc
-def int_hexagon_V6_vmpyiewh_acc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyiewh_acc">;
+def int_hexagon_V6_vmpyowh_sacc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_sacc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiewh_acc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyiewh_acc_128B
-def int_hexagon_V6_vmpyiewh_acc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyiewh_acc_128B">;
+def int_hexagon_V6_vmpybv :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpybv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiewuh_acc,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyiewuh_acc
-def int_hexagon_V6_vmpyiewuh_acc :
-Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyiewuh_acc">;
+def int_hexagon_V6_vmpybv_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpybv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiewuh_acc_128B,VI_ftype_VIVIVI,3)
-// tag : V6_vmpyiewuh_acc_128B
-def int_hexagon_V6_vmpyiewuh_acc_128B :
-Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyiewuh_acc_128B">;
+def int_hexagon_V6_vabsdiffh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vabsdiffh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyub,VD_ftype_VISI,2)
-// tag : V6_vmpyub
-def int_hexagon_V6_vmpyub :
-Hexagon_v1024v512i_Intrinsic<"HEXAGON_V6_vmpyub">;
+def int_hexagon_V6_vabsdiffh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vabsdiffh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyub_128B,VD_ftype_VISI,2)
-// tag : V6_vmpyub_128B
-def int_hexagon_V6_vmpyub_128B :
-Hexagon_v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyub_128B">;
+def int_hexagon_V6_vshuffob :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshuffob">;
+
+def int_hexagon_V6_vshuffob_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshuffob_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyub_acc,VD_ftype_VDVISI,3)
-// tag : V6_vmpyub_acc
 def int_hexagon_V6_vmpyub_acc :
-Hexagon_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpyub_acc">;
+Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyub_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyub_acc_128B,VD_ftype_VDVISI,3)
-// tag : V6_vmpyub_acc_128B
 def int_hexagon_V6_vmpyub_acc_128B :
-Hexagon_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyub_acc_128B">;
-
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybus,VD_ftype_VISI,2)
-// tag : V6_vmpybus
-def int_hexagon_V6_vmpybus :
-Hexagon_v1024v512i_Intrinsic<"HEXAGON_V6_vmpybus">;
-
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybus_128B,VD_ftype_VISI,2)
-// tag : V6_vmpybus_128B
-def int_hexagon_V6_vmpybus_128B :
-Hexagon_v2048v1024i_Intrinsic<"HEXAGON_V6_vmpybus_128B">;
+Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyub_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybus_acc,VD_ftype_VDVISI,3)
-// tag : V6_vmpybus_acc
-def int_hexagon_V6_vmpybus_acc :
-Hexagon_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpybus_acc">;
+def int_hexagon_V6_vnormamtw :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vnormamtw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpybus_acc_128B,VD_ftype_VDVISI,3)
-// tag : V6_vmpybus_acc_128B
-def int_hexagon_V6_vmpybus_acc_128B :
-Hexagon_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpybus_acc_128B">;
+def int_hexagon_V6_vnormamtw_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vnormamtw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabus,VD_ftype_VDSI,2)
-// tag : V6_vmpabus
-def int_hexagon_V6_vmpabus :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpabus">;
+def int_hexagon_V6_vunpackuh :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vunpackuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabus_128B,VD_ftype_VDSI,2)
-// tag : V6_vmpabus_128B
-def int_hexagon_V6_vmpabus_128B :
-Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpabus_128B">;
+def int_hexagon_V6_vunpackuh_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vunpackuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabus_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vmpabus_acc
-def int_hexagon_V6_vmpabus_acc :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpabus_acc">;
+def int_hexagon_V6_vgtuh_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuh_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabus_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vmpabus_acc_128B
-def int_hexagon_V6_vmpabus_acc_128B :
-Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpabus_acc_128B">;
+def int_hexagon_V6_vgtuh_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuh_or_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpahb,VD_ftype_VDSI,2)
-// tag : V6_vmpahb
-def int_hexagon_V6_vmpahb :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpahb">;
+def int_hexagon_V6_vmpyiewuh_acc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyiewuh_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpahb_128B,VD_ftype_VDSI,2)
-// tag : V6_vmpahb_128B
-def int_hexagon_V6_vmpahb_128B :
-Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpahb_128B">;
+def int_hexagon_V6_vmpyiewuh_acc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyiewuh_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpahb_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vmpahb_acc
-def int_hexagon_V6_vmpahb_acc :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpahb_acc">;
+def int_hexagon_V6_vunpackoh :
+Hexagon_v32i32_v32i32v16i32_Intrinsic<"HEXAGON_V6_vunpackoh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpahb_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vmpahb_acc_128B
-def int_hexagon_V6_vmpahb_acc_128B :
-Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpahb_acc_128B">;
+def int_hexagon_V6_vunpackoh_128B :
+Hexagon_v64i32_v64i32v32i32_Intrinsic<"HEXAGON_V6_vunpackoh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyh,VD_ftype_VISI,2)
-// tag : V6_vmpyh
-def int_hexagon_V6_vmpyh :
-Hexagon_v1024v512i_Intrinsic<"HEXAGON_V6_vmpyh">;
+def int_hexagon_V6_vdmpyhsat :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyh_128B,VD_ftype_VISI,2)
-// tag : V6_vmpyh_128B
-def int_hexagon_V6_vmpyh_128B :
-Hexagon_v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyh_128B">;
+def int_hexagon_V6_vdmpyhsat_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhsat_acc,VD_ftype_VDVISI,3)
-// tag : V6_vmpyhsat_acc
-def int_hexagon_V6_vmpyhsat_acc :
-Hexagon_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpyhsat_acc">;
+def int_hexagon_V6_vmpyubv :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyubv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhsat_acc_128B,VD_ftype_VDVISI,3)
-// tag : V6_vmpyhsat_acc_128B
-def int_hexagon_V6_vmpyhsat_acc_128B :
-Hexagon_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyhsat_acc_128B">;
+def int_hexagon_V6_vmpyubv_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyubv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhss,VI_ftype_VISI,2)
-// tag : V6_vmpyhss
 def int_hexagon_V6_vmpyhss :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyhss">;
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyhss">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhss_128B,VI_ftype_VISI,2)
-// tag : V6_vmpyhss_128B
 def int_hexagon_V6_vmpyhss_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyhss_128B">;
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyhss_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhsrs,VI_ftype_VISI,2)
-// tag : V6_vmpyhsrs
-def int_hexagon_V6_vmpyhsrs :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyhsrs">;
+def int_hexagon_V6_hi :
+Hexagon_v16i32_v32i32_Intrinsic<"HEXAGON_V6_hi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyhsrs_128B,VI_ftype_VISI,2)
-// tag : V6_vmpyhsrs_128B
-def int_hexagon_V6_vmpyhsrs_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyhsrs_128B">;
+def int_hexagon_V6_hi_128B :
+Hexagon_v32i32_v64i32_Intrinsic<"HEXAGON_V6_hi_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuh,VD_ftype_VISI,2)
-// tag : V6_vmpyuh
-def int_hexagon_V6_vmpyuh :
-Hexagon_v1024v512i_Intrinsic<"HEXAGON_V6_vmpyuh">;
+def int_hexagon_V6_vasrwuhsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwuhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuh_128B,VD_ftype_VISI,2)
-// tag : V6_vmpyuh_128B
-def int_hexagon_V6_vmpyuh_128B :
-Hexagon_v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyuh_128B">;
+def int_hexagon_V6_vasrwuhsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwuhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuh_acc,VD_ftype_VDVISI,3)
-// tag : V6_vmpyuh_acc
-def int_hexagon_V6_vmpyuh_acc :
-Hexagon_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpyuh_acc">;
+def int_hexagon_V6_veqw :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuh_acc_128B,VD_ftype_VDVISI,3)
-// tag : V6_vmpyuh_acc_128B
-def int_hexagon_V6_vmpyuh_acc_128B :
-Hexagon_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyuh_acc_128B">;
+def int_hexagon_V6_veqw_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyihb,VI_ftype_VISI,2)
-// tag : V6_vmpyihb
-def int_hexagon_V6_vmpyihb :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyihb">;
+def int_hexagon_V6_vdsaduh :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdsaduh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyihb_128B,VI_ftype_VISI,2)
-// tag : V6_vmpyihb_128B
-def int_hexagon_V6_vmpyihb_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyihb_128B">;
+def int_hexagon_V6_vdsaduh_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdsaduh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyihb_acc,VI_ftype_VIVISI,3)
-// tag : V6_vmpyihb_acc
-def int_hexagon_V6_vmpyihb_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyihb_acc">;
+def int_hexagon_V6_vsubw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyihb_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vmpyihb_acc_128B
-def int_hexagon_V6_vmpyihb_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyihb_acc_128B">;
+def int_hexagon_V6_vsubw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwb,VI_ftype_VISI,2)
-// tag : V6_vmpyiwb
-def int_hexagon_V6_vmpyiwb :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwb">;
+def int_hexagon_V6_vsubw_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubw_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwb_128B,VI_ftype_VISI,2)
-// tag : V6_vmpyiwb_128B
-def int_hexagon_V6_vmpyiwb_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwb_128B">;
+def int_hexagon_V6_vsubw_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubw_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwb_acc,VI_ftype_VIVISI,3)
-// tag : V6_vmpyiwb_acc
-def int_hexagon_V6_vmpyiwb_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwb_acc">;
+def int_hexagon_V6_veqb_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqb_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwb_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vmpyiwb_acc_128B
-def int_hexagon_V6_vmpyiwb_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwb_acc_128B">;
+def int_hexagon_V6_veqb_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqb_and_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwh,VI_ftype_VISI,2)
-// tag : V6_vmpyiwh
-def int_hexagon_V6_vmpyiwh :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwh">;
+def int_hexagon_V6_vmpyih :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyih">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwh_128B,VI_ftype_VISI,2)
-// tag : V6_vmpyiwh_128B
-def int_hexagon_V6_vmpyiwh_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwh_128B">;
+def int_hexagon_V6_vmpyih_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyih_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwh_acc,VI_ftype_VIVISI,3)
-// tag : V6_vmpyiwh_acc
-def int_hexagon_V6_vmpyiwh_acc :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwh_acc">;
+def int_hexagon_V6_vtmpyb_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vtmpyb_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwh_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vmpyiwh_acc_128B
-def int_hexagon_V6_vmpyiwh_acc_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwh_acc_128B">;
+def int_hexagon_V6_vtmpyb_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vtmpyb_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vand,VI_ftype_VIVI,2)
-// tag : V6_vand
-def int_hexagon_V6_vand :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vand">;
+def int_hexagon_V6_vrmpybus :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vrmpybus">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vand_128B,VI_ftype_VIVI,2)
-// tag : V6_vand_128B
-def int_hexagon_V6_vand_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vand_128B">;
+def int_hexagon_V6_vrmpybus_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vrmpybus_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vor,VI_ftype_VIVI,2)
-// tag : V6_vor
-def int_hexagon_V6_vor :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vor">;
+def int_hexagon_V6_vmpybus_acc :
+Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpybus_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vor_128B,VI_ftype_VIVI,2)
-// tag : V6_vor_128B
-def int_hexagon_V6_vor_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vor_128B">;
+def int_hexagon_V6_vmpybus_acc_128B :
+Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpybus_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vxor,VI_ftype_VIVI,2)
-// tag : V6_vxor
-def int_hexagon_V6_vxor :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vxor">;
+def int_hexagon_V6_vgth_xor :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgth_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vxor_128B,VI_ftype_VIVI,2)
-// tag : V6_vxor_128B
-def int_hexagon_V6_vxor_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vxor_128B">;
+def int_hexagon_V6_vgth_xor_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgth_xor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnot,VI_ftype_VI,1)
-// tag : V6_vnot
-def int_hexagon_V6_vnot :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vnot">;
+def int_hexagon_V6_vsubhsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnot_128B,VI_ftype_VI,1)
-// tag : V6_vnot_128B
-def int_hexagon_V6_vnot_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vnot_128B">;
+def int_hexagon_V6_vsubhsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandqrt,VI_ftype_QVSI,2)
-// tag : V6_vandqrt
-def int_hexagon_V6_vandqrt :
-Hexagon_v512v64ii_Intrinsic<"HEXAGON_V6_vandqrt">;
+def int_hexagon_V6_vrmpyubi_acc :
+Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandqrt_128B,VI_ftype_QVSI,2)
-// tag : V6_vandqrt_128B
-def int_hexagon_V6_vandqrt_128B :
-Hexagon_v1024v128ii_Intrinsic<"HEXAGON_V6_vandqrt_128B">;
+def int_hexagon_V6_vrmpyubi_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandqrt_acc,VI_ftype_VIQVSI,3)
-// tag : V6_vandqrt_acc
-def int_hexagon_V6_vandqrt_acc :
-Hexagon_v512v512v64ii_Intrinsic<"HEXAGON_V6_vandqrt_acc">;
+def int_hexagon_V6_vabsw :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandqrt_acc_128B,VI_ftype_VIQVSI,3)
-// tag : V6_vandqrt_acc_128B
-def int_hexagon_V6_vandqrt_acc_128B :
-Hexagon_v1024v1024v128ii_Intrinsic<"HEXAGON_V6_vandqrt_acc_128B">;
+def int_hexagon_V6_vabsw_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandvrt,QV_ftype_VISI,2)
-// tag : V6_vandvrt
-def int_hexagon_V6_vandvrt :
-Hexagon_v64iv512i_Intrinsic<"HEXAGON_V6_vandvrt">;
+def int_hexagon_V6_vaddwsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddwsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandvrt_128B,QV_ftype_VISI,2)
-// tag : V6_vandvrt_128B
-def int_hexagon_V6_vandvrt_128B :
-Hexagon_v128iv1024i_Intrinsic<"HEXAGON_V6_vandvrt_128B">;
+def int_hexagon_V6_vaddwsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddwsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandvrt_acc,QV_ftype_QVVISI,3)
-// tag : V6_vandvrt_acc
-def int_hexagon_V6_vandvrt_acc :
-Hexagon_v64iv64iv512i_Intrinsic<"HEXAGON_V6_vandvrt_acc">;
+def int_hexagon_V6_vlsrw :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vlsrw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandvrt_acc_128B,QV_ftype_QVVISI,3)
-// tag : V6_vandvrt_acc_128B
-def int_hexagon_V6_vandvrt_acc_128B :
-Hexagon_v128iv128iv1024i_Intrinsic<"HEXAGON_V6_vandvrt_acc_128B">;
+def int_hexagon_V6_vlsrw_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vlsrw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtw,QV_ftype_VIVI,2)
-// tag : V6_vgtw
-def int_hexagon_V6_vgtw :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtw">;
+def int_hexagon_V6_vabsh :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtw_128B,QV_ftype_VIVI,2)
-// tag : V6_vgtw_128B
-def int_hexagon_V6_vgtw_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtw_128B">;
+def int_hexagon_V6_vabsh_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtw_and,QV_ftype_QVVIVI,3)
-// tag : V6_vgtw_and
-def int_hexagon_V6_vgtw_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtw_and">;
+def int_hexagon_V6_vlsrh :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vlsrh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtw_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtw_and_128B
-def int_hexagon_V6_vgtw_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtw_and_128B">;
+def int_hexagon_V6_vlsrh_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vlsrh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtw_or,QV_ftype_QVVIVI,3)
-// tag : V6_vgtw_or
-def int_hexagon_V6_vgtw_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtw_or">;
+def int_hexagon_V6_valignb :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valignb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtw_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtw_or_128B
-def int_hexagon_V6_vgtw_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtw_or_128B">;
+def int_hexagon_V6_valignb_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valignb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtw_xor,QV_ftype_QVVIVI,3)
-// tag : V6_vgtw_xor
-def int_hexagon_V6_vgtw_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtw_xor">;
+def int_hexagon_V6_vsubhq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubhq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtw_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtw_xor_128B
-def int_hexagon_V6_vgtw_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtw_xor_128B">;
+def int_hexagon_V6_vsubhq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqw,QV_ftype_VIVI,2)
-// tag : V6_veqw
-def int_hexagon_V6_veqw :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_veqw">;
+def int_hexagon_V6_vpackoh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackoh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqw_128B,QV_ftype_VIVI,2)
-// tag : V6_veqw_128B
-def int_hexagon_V6_veqw_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_veqw_128B">;
+def int_hexagon_V6_vpackoh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackoh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqw_and,QV_ftype_QVVIVI,3)
-// tag : V6_veqw_and
-def int_hexagon_V6_veqw_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqw_and">;
+def int_hexagon_V6_vdmpybus_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqw_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqw_and_128B
-def int_hexagon_V6_veqw_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqw_and_128B">;
+def int_hexagon_V6_vdmpybus_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqw_or,QV_ftype_QVVIVI,3)
-// tag : V6_veqw_or
-def int_hexagon_V6_veqw_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqw_or">;
+def int_hexagon_V6_vdmpyhvsat_acc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vdmpyhvsat_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqw_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqw_or_128B
-def int_hexagon_V6_veqw_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqw_or_128B">;
+def int_hexagon_V6_vdmpyhvsat_acc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vdmpyhvsat_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqw_xor,QV_ftype_QVVIVI,3)
-// tag : V6_veqw_xor
-def int_hexagon_V6_veqw_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqw_xor">;
+def int_hexagon_V6_vrmpybv_acc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqw_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqw_xor_128B
-def int_hexagon_V6_veqw_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqw_xor_128B">;
+def int_hexagon_V6_vrmpybv_acc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgth,QV_ftype_VIVI,2)
-// tag : V6_vgth
-def int_hexagon_V6_vgth :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgth">;
+def int_hexagon_V6_vaddhsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgth_128B,QV_ftype_VIVI,2)
-// tag : V6_vgth_128B
-def int_hexagon_V6_vgth_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgth_128B">;
+def int_hexagon_V6_vaddhsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgth_and,QV_ftype_QVVIVI,3)
-// tag : V6_vgth_and
-def int_hexagon_V6_vgth_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgth_and">;
+def int_hexagon_V6_vcombine :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vcombine">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgth_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgth_and_128B
-def int_hexagon_V6_vgth_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgth_and_128B">;
+def int_hexagon_V6_vcombine_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vcombine_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgth_or,QV_ftype_QVVIVI,3)
-// tag : V6_vgth_or
-def int_hexagon_V6_vgth_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgth_or">;
+def int_hexagon_V6_vandqrt_acc :
+Hexagon_v16i32_v16i32v512i1i32_Intrinsic<"HEXAGON_V6_vandqrt_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgth_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgth_or_128B
-def int_hexagon_V6_vgth_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgth_or_128B">;
+def int_hexagon_V6_vandqrt_acc_128B :
+Hexagon_v32i32_v32i32v1024i1i32_Intrinsic<"HEXAGON_V6_vandqrt_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgth_xor,QV_ftype_QVVIVI,3)
-// tag : V6_vgth_xor
-def int_hexagon_V6_vgth_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgth_xor">;
+def int_hexagon_V6_vaslhv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaslhv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgth_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgth_xor_128B
-def int_hexagon_V6_vgth_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgth_xor_128B">;
+def int_hexagon_V6_vaslhv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaslhv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqh,QV_ftype_VIVI,2)
-// tag : V6_veqh
-def int_hexagon_V6_veqh :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_veqh">;
+def int_hexagon_V6_vinsertwr :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vinsertwr">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqh_128B,QV_ftype_VIVI,2)
-// tag : V6_veqh_128B
-def int_hexagon_V6_veqh_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_veqh_128B">;
+def int_hexagon_V6_vinsertwr_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vinsertwr_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqh_and,QV_ftype_QVVIVI,3)
-// tag : V6_veqh_and
-def int_hexagon_V6_veqh_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqh_and">;
+def int_hexagon_V6_vsubh_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubh_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqh_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqh_and_128B
-def int_hexagon_V6_veqh_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqh_and_128B">;
+def int_hexagon_V6_vsubh_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubh_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqh_or,QV_ftype_QVVIVI,3)
-// tag : V6_veqh_or
-def int_hexagon_V6_veqh_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqh_or">;
+def int_hexagon_V6_vshuffb :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vshuffb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqh_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqh_or_128B
-def int_hexagon_V6_veqh_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqh_or_128B">;
+def int_hexagon_V6_vshuffb_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vshuffb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqh_xor,QV_ftype_QVVIVI,3)
-// tag : V6_veqh_xor
-def int_hexagon_V6_veqh_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqh_xor">;
+def int_hexagon_V6_vand :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vand">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqh_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqh_xor_128B
-def int_hexagon_V6_veqh_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqh_xor_128B">;
+def int_hexagon_V6_vand_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vand_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtb,QV_ftype_VIVI,2)
-// tag : V6_vgtb
-def int_hexagon_V6_vgtb :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtb">;
+def int_hexagon_V6_vmpyhv :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtb_128B,QV_ftype_VIVI,2)
-// tag : V6_vgtb_128B
-def int_hexagon_V6_vgtb_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtb_128B">;
+def int_hexagon_V6_vmpyhv_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtb_and,QV_ftype_QVVIVI,3)
-// tag : V6_vgtb_and
-def int_hexagon_V6_vgtb_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtb_and">;
+def int_hexagon_V6_vdmpyhsuisat_acc :
+Hexagon_v16i32_v16i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtb_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtb_and_128B
-def int_hexagon_V6_vgtb_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtb_and_128B">;
+def int_hexagon_V6_vdmpyhsuisat_acc_128B :
+Hexagon_v32i32_v32i32v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtb_or,QV_ftype_QVVIVI,3)
-// tag : V6_vgtb_or
-def int_hexagon_V6_vgtb_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtb_or">;
+def int_hexagon_V6_vsububsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsububsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtb_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtb_or_128B
-def int_hexagon_V6_vgtb_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtb_or_128B">;
+def int_hexagon_V6_vsububsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsububsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtb_xor,QV_ftype_QVVIVI,3)
-// tag : V6_vgtb_xor
 def int_hexagon_V6_vgtb_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtb_xor">;
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtb_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtb_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtb_xor_128B
 def int_hexagon_V6_vgtb_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtb_xor_128B">;
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtb_xor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqb,QV_ftype_VIVI,2)
-// tag : V6_veqb
-def int_hexagon_V6_veqb :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_veqb">;
+def int_hexagon_V6_vdsaduh_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdsaduh_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqb_128B,QV_ftype_VIVI,2)
-// tag : V6_veqb_128B
-def int_hexagon_V6_veqb_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_veqb_128B">;
+def int_hexagon_V6_vdsaduh_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vdsaduh_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqb_and,QV_ftype_QVVIVI,3)
-// tag : V6_veqb_and
-def int_hexagon_V6_veqb_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqb_and">;
+def int_hexagon_V6_vrmpyub :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vrmpyub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqb_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqb_and_128B
-def int_hexagon_V6_veqb_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqb_and_128B">;
+def int_hexagon_V6_vrmpyub_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vrmpyub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqb_or,QV_ftype_QVVIVI,3)
-// tag : V6_veqb_or
-def int_hexagon_V6_veqb_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqb_or">;
+def int_hexagon_V6_vmpyuh_acc :
+Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyuh_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqb_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqb_or_128B
-def int_hexagon_V6_veqb_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqb_or_128B">;
+def int_hexagon_V6_vmpyuh_acc_128B :
+Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyuh_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqb_xor,QV_ftype_QVVIVI,3)
-// tag : V6_veqb_xor
-def int_hexagon_V6_veqb_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqb_xor">;
+def int_hexagon_V6_vcl0h :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vcl0h">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_veqb_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_veqb_xor_128B
-def int_hexagon_V6_veqb_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqb_xor_128B">;
+def int_hexagon_V6_vcl0h_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vcl0h_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuw,QV_ftype_VIVI,2)
-// tag : V6_vgtuw
-def int_hexagon_V6_vgtuw :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtuw">;
+def int_hexagon_V6_vmpyhus_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhus_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuw_128B,QV_ftype_VIVI,2)
-// tag : V6_vgtuw_128B
-def int_hexagon_V6_vgtuw_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuw_128B">;
+def int_hexagon_V6_vmpyhus_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhus_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuw_and,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuw_and
-def int_hexagon_V6_vgtuw_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuw_and">;
+def int_hexagon_V6_vmpybv_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpybv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuw_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuw_and_128B
-def int_hexagon_V6_vgtuw_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuw_and_128B">;
+def int_hexagon_V6_vmpybv_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpybv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuw_or,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuw_or
-def int_hexagon_V6_vgtuw_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuw_or">;
+def int_hexagon_V6_vrsadubi :
+Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuw_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuw_or_128B
-def int_hexagon_V6_vgtuw_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuw_or_128B">;
+def int_hexagon_V6_vrsadubi_128B :
+Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuw_xor,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuw_xor
-def int_hexagon_V6_vgtuw_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuw_xor">;
+def int_hexagon_V6_vdmpyhb_dv_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuw_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuw_xor_128B
-def int_hexagon_V6_vgtuw_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuw_xor_128B">;
+def int_hexagon_V6_vdmpyhb_dv_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuh,QV_ftype_VIVI,2)
-// tag : V6_vgtuh
-def int_hexagon_V6_vgtuh :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtuh">;
+def int_hexagon_V6_vshufeh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshufeh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuh_128B,QV_ftype_VIVI,2)
-// tag : V6_vgtuh_128B
-def int_hexagon_V6_vgtuh_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuh_128B">;
+def int_hexagon_V6_vshufeh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshufeh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuh_and,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuh_and
-def int_hexagon_V6_vgtuh_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuh_and">;
+def int_hexagon_V6_vmpyewuh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyewuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuh_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuh_and_128B
-def int_hexagon_V6_vgtuh_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuh_and_128B">;
+def int_hexagon_V6_vmpyewuh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyewuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuh_or,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuh_or
-def int_hexagon_V6_vgtuh_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuh_or">;
+def int_hexagon_V6_vmpyhsrs :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyhsrs">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuh_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuh_or_128B
-def int_hexagon_V6_vgtuh_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuh_or_128B">;
+def int_hexagon_V6_vmpyhsrs_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyhsrs_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuh_xor,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuh_xor
-def int_hexagon_V6_vgtuh_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuh_xor">;
+def int_hexagon_V6_vdmpybus_dv_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_dv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtuh_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtuh_xor_128B
-def int_hexagon_V6_vgtuh_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuh_xor_128B">;
+def int_hexagon_V6_vdmpybus_dv_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_dv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtub,QV_ftype_VIVI,2)
-// tag : V6_vgtub
-def int_hexagon_V6_vgtub :
-Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtub">;
+def int_hexagon_V6_vaddubh :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddubh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtub_128B,QV_ftype_VIVI,2)
-// tag : V6_vgtub_128B
-def int_hexagon_V6_vgtub_128B :
-Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtub_128B">;
+def int_hexagon_V6_vaddubh_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddubh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtub_and,QV_ftype_QVVIVI,3)
-// tag : V6_vgtub_and
-def int_hexagon_V6_vgtub_and :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtub_and">;
+def int_hexagon_V6_vasrwh :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtub_and_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtub_and_128B
-def int_hexagon_V6_vgtub_and_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtub_and_128B">;
+def int_hexagon_V6_vasrwh_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtub_or,QV_ftype_QVVIVI,3)
-// tag : V6_vgtub_or
-def int_hexagon_V6_vgtub_or :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtub_or">;
+def int_hexagon_V6_ld0 :
+Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_ld0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtub_or_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtub_or_128B
-def int_hexagon_V6_vgtub_or_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtub_or_128B">;
+def int_hexagon_V6_ld0_128B :
+Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_ld0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtub_xor,QV_ftype_QVVIVI,3)
-// tag : V6_vgtub_xor
-def int_hexagon_V6_vgtub_xor :
-Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtub_xor">;
+def int_hexagon_V6_vpopcounth :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vpopcounth">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vgtub_xor_128B,QV_ftype_QVVIVI,3)
-// tag : V6_vgtub_xor_128B
-def int_hexagon_V6_vgtub_xor_128B :
-Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtub_xor_128B">;
+def int_hexagon_V6_vpopcounth_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vpopcounth_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_or,QV_ftype_QVQV,2)
-// tag : V6_pred_or
-def int_hexagon_V6_pred_or :
-Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_or">;
+def int_hexagon_V6_ldnt0 :
+Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_ldnt0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_or_128B,QV_ftype_QVQV,2)
-// tag : V6_pred_or_128B
-def int_hexagon_V6_pred_or_128B :
-Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_or_128B">;
+def int_hexagon_V6_ldnt0_128B :
+Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_ldnt0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_and,QV_ftype_QVQV,2)
-// tag : V6_pred_and
-def int_hexagon_V6_pred_and :
-Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_and">;
+def int_hexagon_V6_vgth_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgth_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_and_128B,QV_ftype_QVQV,2)
-// tag : V6_pred_and_128B
-def int_hexagon_V6_pred_and_128B :
-Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_and_128B">;
+def int_hexagon_V6_vgth_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgth_and_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_not,QV_ftype_QV,1)
-// tag : V6_pred_not
-def int_hexagon_V6_pred_not :
-Hexagon_v64iv64i_Intrinsic<"HEXAGON_V6_pred_not">;
+def int_hexagon_V6_vaddubsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddubsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_not_128B,QV_ftype_QV,1)
-// tag : V6_pred_not_128B
-def int_hexagon_V6_pred_not_128B :
-Hexagon_v128iv128i_Intrinsic<"HEXAGON_V6_pred_not_128B">;
+def int_hexagon_V6_vaddubsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddubsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_xor,QV_ftype_QVQV,2)
-// tag : V6_pred_xor
-def int_hexagon_V6_pred_xor :
-Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_xor">;
+def int_hexagon_V6_vpackeh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackeh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_xor_128B,QV_ftype_QVQV,2)
-// tag : V6_pred_xor_128B
-def int_hexagon_V6_pred_xor_128B :
-Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_xor_128B">;
+def int_hexagon_V6_vpackeh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackeh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_and_n,QV_ftype_QVQV,2)
-// tag : V6_pred_and_n
-def int_hexagon_V6_pred_and_n :
-Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_and_n">;
+def int_hexagon_V6_vmpyh :
+Hexagon_v32i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_and_n_128B,QV_ftype_QVQV,2)
-// tag : V6_pred_and_n_128B
-def int_hexagon_V6_pred_and_n_128B :
-Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_and_n_128B">;
+def int_hexagon_V6_vmpyh_128B :
+Hexagon_v64i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_or_n,QV_ftype_QVQV,2)
-// tag : V6_pred_or_n
-def int_hexagon_V6_pred_or_n :
-Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_or_n">;
+def int_hexagon_V6_vminh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_or_n_128B,QV_ftype_QVQV,2)
-// tag : V6_pred_or_n_128B
-def int_hexagon_V6_pred_or_n_128B :
-Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_or_n_128B">;
+def int_hexagon_V6_vminh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_scalar2,QV_ftype_SI,1)
-// tag : V6_pred_scalar2
 def int_hexagon_V6_pred_scalar2 :
-Hexagon_v64ii_Intrinsic<"HEXAGON_V6_pred_scalar2">;
+Hexagon_v512i1_i32_Intrinsic<"HEXAGON_V6_pred_scalar2">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_scalar2_128B,QV_ftype_SI,1)
-// tag : V6_pred_scalar2_128B
 def int_hexagon_V6_pred_scalar2_128B :
-Hexagon_v128ii_Intrinsic<"HEXAGON_V6_pred_scalar2_128B">;
-
-//
-// BUILTIN_INFO(HEXAGON.V6_vmux,VI_ftype_QVVIVI,3)
-// tag : V6_vmux
-def int_hexagon_V6_vmux :
-Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vmux">;
+Hexagon_v1024i1_i32_Intrinsic<"HEXAGON_V6_pred_scalar2_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmux_128B,VI_ftype_QVVIVI,3)
-// tag : V6_vmux_128B
-def int_hexagon_V6_vmux_128B :
-Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vmux_128B">;
+def int_hexagon_V6_vdealh :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vdealh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vswap,VD_ftype_QVVIVI,3)
-// tag : V6_vswap
-def int_hexagon_V6_vswap :
-Hexagon_v1024v64iv512v512_Intrinsic<"HEXAGON_V6_vswap">;
+def int_hexagon_V6_vdealh_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vdealh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vswap_128B,VD_ftype_QVVIVI,3)
-// tag : V6_vswap_128B
-def int_hexagon_V6_vswap_128B :
-Hexagon_v2048v128iv1024v1024_Intrinsic<"HEXAGON_V6_vswap_128B">;
+def int_hexagon_V6_vpackwh_sat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackwh_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxub,VI_ftype_VIVI,2)
-// tag : V6_vmaxub
-def int_hexagon_V6_vmaxub :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxub">;
+def int_hexagon_V6_vpackwh_sat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackwh_sat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxub_128B,VI_ftype_VIVI,2)
-// tag : V6_vmaxub_128B
-def int_hexagon_V6_vmaxub_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxub_128B">;
+def int_hexagon_V6_vaslh :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vaslh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminub,VI_ftype_VIVI,2)
-// tag : V6_vminub
-def int_hexagon_V6_vminub :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vminub">;
+def int_hexagon_V6_vaslh_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vaslh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminub_128B,VI_ftype_VIVI,2)
-// tag : V6_vminub_128B
-def int_hexagon_V6_vminub_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminub_128B">;
+def int_hexagon_V6_vgtuw_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuw_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxuh,VI_ftype_VIVI,2)
-// tag : V6_vmaxuh
-def int_hexagon_V6_vmaxuh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxuh">;
+def int_hexagon_V6_vgtuw_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuw_and_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxuh_128B,VI_ftype_VIVI,2)
-// tag : V6_vmaxuh_128B
-def int_hexagon_V6_vmaxuh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxuh_128B">;
+def int_hexagon_V6_vor :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminuh,VI_ftype_VIVI,2)
-// tag : V6_vminuh
-def int_hexagon_V6_vminuh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vminuh">;
+def int_hexagon_V6_vor_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminuh_128B,VI_ftype_VIVI,2)
-// tag : V6_vminuh_128B
-def int_hexagon_V6_vminuh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminuh_128B">;
+def int_hexagon_V6_vlutvvb :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxh,VI_ftype_VIVI,2)
-// tag : V6_vmaxh
-def int_hexagon_V6_vmaxh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxh">;
+def int_hexagon_V6_vlutvvb_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxh_128B,VI_ftype_VIVI,2)
-// tag : V6_vmaxh_128B
-def int_hexagon_V6_vmaxh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxh_128B">;
+def int_hexagon_V6_vmpyiowh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyiowh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminh,VI_ftype_VIVI,2)
-// tag : V6_vminh
-def int_hexagon_V6_vminh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vminh">;
+def int_hexagon_V6_vmpyiowh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyiowh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminh_128B,VI_ftype_VIVI,2)
-// tag : V6_vminh_128B
-def int_hexagon_V6_vminh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminh_128B">;
+def int_hexagon_V6_vlutvvb_oracc :
+Hexagon_v16i32_v16i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxw,VI_ftype_VIVI,2)
-// tag : V6_vmaxw
-def int_hexagon_V6_vmaxw :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxw">;
+def int_hexagon_V6_vlutvvb_oracc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxw_128B,VI_ftype_VIVI,2)
-// tag : V6_vmaxw_128B
-def int_hexagon_V6_vmaxw_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxw_128B">;
+def int_hexagon_V6_vandvrt :
+Hexagon_v512i1_v16i32i32_Intrinsic<"HEXAGON_V6_vandvrt">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminw,VI_ftype_VIVI,2)
-// tag : V6_vminw
-def int_hexagon_V6_vminw :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vminw">;
+def int_hexagon_V6_vandvrt_128B :
+Hexagon_v1024i1_v32i32i32_Intrinsic<"HEXAGON_V6_vandvrt_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminw_128B,VI_ftype_VIVI,2)
-// tag : V6_vminw_128B
-def int_hexagon_V6_vminw_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminw_128B">;
+def int_hexagon_V6_veqh_xor :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqh_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsathub,VI_ftype_VIVI,2)
-// tag : V6_vsathub
-def int_hexagon_V6_vsathub :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsathub">;
+def int_hexagon_V6_veqh_xor_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqh_xor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsathub_128B,VI_ftype_VIVI,2)
-// tag : V6_vsathub_128B
-def int_hexagon_V6_vsathub_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsathub_128B">;
+def int_hexagon_V6_vadduhw :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vadduhw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsatwh,VI_ftype_VIVI,2)
-// tag : V6_vsatwh
-def int_hexagon_V6_vsatwh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsatwh">;
+def int_hexagon_V6_vadduhw_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduhw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsatwh_128B,VI_ftype_VIVI,2)
-// tag : V6_vsatwh_128B
-def int_hexagon_V6_vsatwh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsatwh_128B">;
+def int_hexagon_V6_vcl0w :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vcl0w">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffeb,VI_ftype_VIVI,2)
-// tag : V6_vshuffeb
-def int_hexagon_V6_vshuffeb :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vshuffeb">;
+def int_hexagon_V6_vcl0w_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vcl0w_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffeb_128B,VI_ftype_VIVI,2)
-// tag : V6_vshuffeb_128B
-def int_hexagon_V6_vshuffeb_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vshuffeb_128B">;
+def int_hexagon_V6_vmpyihb :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyihb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffob,VI_ftype_VIVI,2)
-// tag : V6_vshuffob
-def int_hexagon_V6_vshuffob :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vshuffob">;
+def int_hexagon_V6_vmpyihb_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyihb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffob_128B,VI_ftype_VIVI,2)
-// tag : V6_vshuffob_128B
-def int_hexagon_V6_vshuffob_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vshuffob_128B">;
+def int_hexagon_V6_vtmpybus :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vtmpybus">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshufeh,VI_ftype_VIVI,2)
-// tag : V6_vshufeh
-def int_hexagon_V6_vshufeh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vshufeh">;
+def int_hexagon_V6_vtmpybus_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vtmpybus_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshufeh_128B,VI_ftype_VIVI,2)
-// tag : V6_vshufeh_128B
-def int_hexagon_V6_vshufeh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vshufeh_128B">;
+def int_hexagon_V6_vd0 :
+Hexagon_v16i32__Intrinsic<"HEXAGON_V6_vd0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshufoh,VI_ftype_VIVI,2)
-// tag : V6_vshufoh
-def int_hexagon_V6_vshufoh :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vshufoh">;
+def int_hexagon_V6_vd0_128B :
+Hexagon_v32i32__Intrinsic<"HEXAGON_V6_vd0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshufoh_128B,VI_ftype_VIVI,2)
-// tag : V6_vshufoh_128B
-def int_hexagon_V6_vshufoh_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vshufoh_128B">;
+def int_hexagon_V6_veqh_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqh_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffvdd,VD_ftype_VIVISI,3)
-// tag : V6_vshuffvdd
-def int_hexagon_V6_vshuffvdd :
-Hexagon_v1024v512v512i_Intrinsic<"HEXAGON_V6_vshuffvdd">;
+def int_hexagon_V6_veqh_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqh_or_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffvdd_128B,VD_ftype_VIVISI,3)
-// tag : V6_vshuffvdd_128B
-def int_hexagon_V6_vshuffvdd_128B :
-Hexagon_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vshuffvdd_128B">;
+def int_hexagon_V6_vgtw_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtw_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdealvdd,VD_ftype_VIVISI,3)
-// tag : V6_vdealvdd
-def int_hexagon_V6_vdealvdd :
-Hexagon_v1024v512v512i_Intrinsic<"HEXAGON_V6_vdealvdd">;
+def int_hexagon_V6_vgtw_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtw_or_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdealvdd_128B,VD_ftype_VIVISI,3)
-// tag : V6_vdealvdd_128B
-def int_hexagon_V6_vdealvdd_128B :
-Hexagon_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vdealvdd_128B">;
+def int_hexagon_V6_vdmpybus :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vdmpybus">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshufoeh,VD_ftype_VIVI,2)
-// tag : V6_vshufoeh
-def int_hexagon_V6_vshufoeh :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vshufoeh">;
+def int_hexagon_V6_vdmpybus_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshufoeh_128B,VD_ftype_VIVI,2)
-// tag : V6_vshufoeh_128B
-def int_hexagon_V6_vshufoeh_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vshufoeh_128B">;
+def int_hexagon_V6_vgtub_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtub_or">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshufoeb,VD_ftype_VIVI,2)
-// tag : V6_vshufoeb
-def int_hexagon_V6_vshufoeb :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vshufoeb">;
+def int_hexagon_V6_vgtub_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtub_or_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshufoeb_128B,VD_ftype_VIVI,2)
-// tag : V6_vshufoeb_128B
-def int_hexagon_V6_vshufoeb_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vshufoeb_128B">;
+def int_hexagon_V6_vmpybus :
+Hexagon_v32i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpybus">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdealh,VI_ftype_VI,1)
-// tag : V6_vdealh
-def int_hexagon_V6_vdealh :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vdealh">;
+def int_hexagon_V6_vmpybus_128B :
+Hexagon_v64i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpybus_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdealh_128B,VI_ftype_VI,1)
-// tag : V6_vdealh_128B
-def int_hexagon_V6_vdealh_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vdealh_128B">;
+def int_hexagon_V6_vdmpyhb_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdealb,VI_ftype_VI,1)
-// tag : V6_vdealb
-def int_hexagon_V6_vdealb :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vdealb">;
+def int_hexagon_V6_vdmpyhb_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdealb_128B,VI_ftype_VI,1)
-// tag : V6_vdealb_128B
-def int_hexagon_V6_vdealb_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vdealb_128B">;
+def int_hexagon_V6_vandvrt_acc :
+Hexagon_v512i1_v512i1v16i32i32_Intrinsic<"HEXAGON_V6_vandvrt_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdealb4w,VI_ftype_VIVI,2)
-// tag : V6_vdealb4w
-def int_hexagon_V6_vdealb4w :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vdealb4w">;
+def int_hexagon_V6_vandvrt_acc_128B :
+Hexagon_v1024i1_v1024i1v32i32i32_Intrinsic<"HEXAGON_V6_vandvrt_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdealb4w_128B,VI_ftype_VIVI,2)
-// tag : V6_vdealb4w_128B
-def int_hexagon_V6_vdealb4w_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vdealb4w_128B">;
+def int_hexagon_V6_vassign :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vassign">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffh,VI_ftype_VI,1)
-// tag : V6_vshuffh
-def int_hexagon_V6_vshuffh :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vshuffh">;
+def int_hexagon_V6_vassign_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vassign_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffh_128B,VI_ftype_VI,1)
-// tag : V6_vshuffh_128B
-def int_hexagon_V6_vshuffh_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vshuffh_128B">;
+def int_hexagon_V6_vaddwnq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddwnq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffb,VI_ftype_VI,1)
-// tag : V6_vshuffb
-def int_hexagon_V6_vshuffb :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vshuffb">;
+def int_hexagon_V6_vaddwnq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddwnq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vshuffb_128B,VI_ftype_VI,1)
-// tag : V6_vshuffb_128B
-def int_hexagon_V6_vshuffb_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vshuffb_128B">;
+def int_hexagon_V6_vgtub_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtub_and">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_extractw,SI_ftype_VISI,2)
-// tag : V6_extractw
-def int_hexagon_V6_extractw :
-Hexagon_iv512i_Intrinsic<"HEXAGON_V6_extractw">;
+def int_hexagon_V6_vgtub_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtub_and_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_extractw_128B,SI_ftype_VISI,2)
-// tag : V6_extractw_128B
-def int_hexagon_V6_extractw_128B :
-Hexagon_iv1024i_Intrinsic<"HEXAGON_V6_extractw_128B">;
+def int_hexagon_V6_vdmpyhb_dv :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vinsertwr,VI_ftype_VISI,2)
-// tag : V6_vinsertwr
-def int_hexagon_V6_vinsertwr :
-Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vinsertwr">;
+def int_hexagon_V6_vdmpyhb_dv_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vinsertwr_128B,VI_ftype_VISI,2)
-// tag : V6_vinsertwr_128B
-def int_hexagon_V6_vinsertwr_128B :
-Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vinsertwr_128B">;
+def int_hexagon_V6_vunpackb :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vunpackb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_lvsplatw,VI_ftype_SI,1)
-// tag : V6_lvsplatw
-def int_hexagon_V6_lvsplatw :
-Hexagon_v512i_Intrinsic<"HEXAGON_V6_lvsplatw">;
+def int_hexagon_V6_vunpackb_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vunpackb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_lvsplatw_128B,VI_ftype_SI,1)
-// tag : V6_lvsplatw_128B
-def int_hexagon_V6_lvsplatw_128B :
-Hexagon_v1024i_Intrinsic<"HEXAGON_V6_lvsplatw_128B">;
+def int_hexagon_V6_vunpackh :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vunpackh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vassign,VI_ftype_VI,1)
-// tag : V6_vassign
-def int_hexagon_V6_vassign :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vassign">;
+def int_hexagon_V6_vunpackh_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vunpackh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vassign_128B,VI_ftype_VI,1)
-// tag : V6_vassign_128B
-def int_hexagon_V6_vassign_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vassign_128B">;
+def int_hexagon_V6_vmpahb_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpahb_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vcombine,VD_ftype_VIVI,2)
-// tag : V6_vcombine
-def int_hexagon_V6_vcombine :
-Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vcombine">;
+def int_hexagon_V6_vmpahb_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vmpahb_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vcombine_128B,VD_ftype_VIVI,2)
-// tag : V6_vcombine_128B
-def int_hexagon_V6_vcombine_128B :
-Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vcombine_128B">;
+def int_hexagon_V6_vaddbnq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddbnq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdelta,VI_ftype_VIVI,2)
-// tag : V6_vdelta
-def int_hexagon_V6_vdelta :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vdelta">;
+def int_hexagon_V6_vaddbnq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddbnq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdelta_128B,VI_ftype_VIVI,2)
-// tag : V6_vdelta_128B
-def int_hexagon_V6_vdelta_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vdelta_128B">;
+def int_hexagon_V6_vlalignbi :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlalignbi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrdelta,VI_ftype_VIVI,2)
-// tag : V6_vrdelta
-def int_hexagon_V6_vrdelta :
-Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vrdelta">;
+def int_hexagon_V6_vlalignbi_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlalignbi_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrdelta_128B,VI_ftype_VIVI,2)
-// tag : V6_vrdelta_128B
-def int_hexagon_V6_vrdelta_128B :
-Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrdelta_128B">;
+def int_hexagon_V6_vsatwh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsatwh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vcl0w,VI_ftype_VI,1)
-// tag : V6_vcl0w
-def int_hexagon_V6_vcl0w :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vcl0w">;
+def int_hexagon_V6_vsatwh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsatwh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vcl0w_128B,VI_ftype_VI,1)
-// tag : V6_vcl0w_128B
-def int_hexagon_V6_vcl0w_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vcl0w_128B">;
+def int_hexagon_V6_vgtuh :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vcl0h,VI_ftype_VI,1)
-// tag : V6_vcl0h
-def int_hexagon_V6_vcl0h :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vcl0h">;
+def int_hexagon_V6_vgtuh_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vcl0h_128B,VI_ftype_VI,1)
-// tag : V6_vcl0h_128B
-def int_hexagon_V6_vcl0h_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vcl0h_128B">;
+def int_hexagon_V6_vmpyihb_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyihb_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnormamtw,VI_ftype_VI,1)
-// tag : V6_vnormamtw
-def int_hexagon_V6_vnormamtw :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vnormamtw">;
+def int_hexagon_V6_vmpyihb_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyihb_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnormamtw_128B,VI_ftype_VI,1)
-// tag : V6_vnormamtw_128B
-def int_hexagon_V6_vnormamtw_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vnormamtw_128B">;
+def int_hexagon_V6_vrmpybusv_acc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybusv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnormamth,VI_ftype_VI,1)
-// tag : V6_vnormamth
-def int_hexagon_V6_vnormamth :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vnormamth">;
+def int_hexagon_V6_vrmpybusv_acc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybusv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnormamth_128B,VI_ftype_VI,1)
-// tag : V6_vnormamth_128B
-def int_hexagon_V6_vnormamth_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vnormamth_128B">;
+def int_hexagon_V6_vrdelta :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrdelta">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpopcounth,VI_ftype_VI,1)
-// tag : V6_vpopcounth
-def int_hexagon_V6_vpopcounth :
-Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vpopcounth">;
+def int_hexagon_V6_vrdelta_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrdelta_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vpopcounth_128B,VI_ftype_VI,1)
-// tag : V6_vpopcounth_128B
-def int_hexagon_V6_vpopcounth_128B :
-Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vpopcounth_128B">;
+def int_hexagon_V6_vroundwh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vroundwh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvb,VI_ftype_VIVISI,3)
-// tag : V6_vlutvvb
-def int_hexagon_V6_vlutvvb :
-Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb">;
+def int_hexagon_V6_vroundwh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vroundwh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvb_128B,VI_ftype_VIVISI,3)
-// tag : V6_vlutvvb_128B
-def int_hexagon_V6_vlutvvb_128B :
-Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_128B">;
+def int_hexagon_V6_vaddw_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddw_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracc,VI_ftype_VIVIVISI,4)
-// tag : V6_vlutvvb_oracc
-def int_hexagon_V6_vlutvvb_oracc :
-Hexagon_v512v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb_oracc">;
+def int_hexagon_V6_vaddw_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddw_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracc_128B,VI_ftype_VIVIVISI,4)
-// tag : V6_vlutvvb_oracc_128B
-def int_hexagon_V6_vlutvvb_oracc_128B :
-Hexagon_v1024v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_oracc_128B">;
+def int_hexagon_V6_vmpyiwb_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwb_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwh,VD_ftype_VIVISI,3)
-// tag : V6_vlutvwh
-def int_hexagon_V6_vlutvwh :
-Hexagon_v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh">;
+def int_hexagon_V6_vmpyiwb_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwb_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwh_128B,VD_ftype_VIVISI,3)
-// tag : V6_vlutvwh_128B
-def int_hexagon_V6_vlutvwh_128B :
-Hexagon_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_128B">;
+def int_hexagon_V6_vsubbq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubbq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracc,VD_ftype_VDVIVISI,4)
-// tag : V6_vlutvwh_oracc
-def int_hexagon_V6_vlutvwh_oracc :
-Hexagon_v1024v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_oracc">;
+def int_hexagon_V6_vsubbq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubbq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracc_128B,VD_ftype_VDVIVISI,4)
-// tag : V6_vlutvwh_oracc_128B
-def int_hexagon_V6_vlutvwh_oracc_128B :
-Hexagon_v2048v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_oracc_128B">;
+def int_hexagon_V6_veqh_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqh_and">;
 
-//
-// Masked vector stores
-//
-def int_hexagon_V6_vS32b_qpred_ai :
-Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_qpred_ai">;
+def int_hexagon_V6_veqh_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqh_and_128B">;
 
-def int_hexagon_V6_vS32b_nqpred_ai :
-Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nqpred_ai">;
+def int_hexagon_V6_valignbi :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valignbi">;
 
-def int_hexagon_V6_vS32b_nt_qpred_ai :
-Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nt_qpred_ai">;
+def int_hexagon_V6_valignbi_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valignbi_128B">;
 
-def int_hexagon_V6_vS32b_nt_nqpred_ai :
-Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nt_nqpred_ai">;
+def int_hexagon_V6_vaddwsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddwsat">;
 
-def int_hexagon_V6_vS32b_qpred_ai_128B :
-Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_qpred_ai_128B">;
+def int_hexagon_V6_vaddwsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddwsat_128B">;
 
-def int_hexagon_V6_vS32b_nqpred_ai_128B :
-Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nqpred_ai_128B">;
+def int_hexagon_V6_veqw_and :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqw_and">;
 
-def int_hexagon_V6_vS32b_nt_qpred_ai_128B :
-Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nt_qpred_ai_128B">;
+def int_hexagon_V6_veqw_and_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqw_and_128B">;
 
-def int_hexagon_V6_vS32b_nt_nqpred_ai_128B :
-Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nt_nqpred_ai_128B">;
+def int_hexagon_V6_vabsdiffub :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vabsdiffub">;
 
-def int_hexagon_V6_vmaskedstoreq :
-Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstoreq">;
+def int_hexagon_V6_vabsdiffub_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vabsdiffub_128B">;
 
-def int_hexagon_V6_vmaskedstorenq :
-Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorenq">;
+def int_hexagon_V6_vshuffeb :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshuffeb">;
 
-def int_hexagon_V6_vmaskedstorentq :
-Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentq">;
+def int_hexagon_V6_vshuffeb_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshuffeb_128B">;
 
-def int_hexagon_V6_vmaskedstorentnq :
-Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentnq">;
+def int_hexagon_V6_vabsdiffuh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vabsdiffuh">;
 
-def int_hexagon_V6_vmaskedstoreq_128B :
-Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstoreq_128B">;
+def int_hexagon_V6_vabsdiffuh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vabsdiffuh_128B">;
 
-def int_hexagon_V6_vmaskedstorenq_128B :
-Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorenq_128B">;
+def int_hexagon_V6_veqw_xor :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqw_xor">;
 
-def int_hexagon_V6_vmaskedstorentq_128B :
-Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentq_128B">;
+def int_hexagon_V6_veqw_xor_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqw_xor_128B">;
 
-def int_hexagon_V6_vmaskedstorentnq_128B :
-Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentnq_128B">;
+def int_hexagon_V6_vgth :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgth">;
 
-multiclass Hexagon_custom_circ_ld_Intrinsic<LLVMType ElTy> {
-  def NAME#_pci : Hexagon_NonGCC_Intrinsic<
-    [ElTy, llvm_ptr_ty],
-    [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<3>]>;
-  def NAME#_pcr : Hexagon_NonGCC_Intrinsic<
-    [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<2>]>;
-}
+def int_hexagon_V6_vgth_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgth_128B">;
 
-defm int_hexagon_L2_loadrub : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_L2_loadrb : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_L2_loadruh : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_L2_loadrh : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_L2_loadri : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_L2_loadrd : Hexagon_custom_circ_ld_Intrinsic<llvm_i64_ty>;
+def int_hexagon_V6_vgtuw_xor :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuw_xor">;
 
-multiclass Hexagon_custom_circ_st_Intrinsic<LLVMType ElTy> {
-  def NAME#_pci : Hexagon_NonGCC_Intrinsic<
-    [llvm_ptr_ty],
-    [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ElTy, llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<4>]>;
-  def NAME#_pcr : Hexagon_NonGCC_Intrinsic<
-    [llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, ElTy, llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<3>]>;
-}
+def int_hexagon_V6_vgtuw_xor_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuw_xor_128B">;
 
-defm int_hexagon_S2_storerb : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_S2_storerh : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_S2_storerf : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_S2_storeri : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
-defm int_hexagon_S2_storerd : Hexagon_custom_circ_st_Intrinsic<llvm_i64_ty>;
+def int_hexagon_V6_vgtb :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtb">;
 
-// The front-end emits the intrinsic call with only two arguments. The third
-// argument from the builtin is already used by front-end to write to memory
-// by generating a store.
-class Hexagon_custom_brev_ld_Intrinsic<LLVMType ElTy>
- : Hexagon_NonGCC_Intrinsic<
-    [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty],
-    [IntrReadMem]>;
+def int_hexagon_V6_vgtb_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtb_128B">;
 
-def int_hexagon_L2_loadrub_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
-def int_hexagon_L2_loadrb_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
-def int_hexagon_L2_loadruh_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
-def int_hexagon_L2_loadrh_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
-def int_hexagon_L2_loadri_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i32_ty>;
-def int_hexagon_L2_loadrd_pbr : Hexagon_custom_brev_ld_Intrinsic<llvm_i64_ty>;
+def int_hexagon_V6_vgtw :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtw">;
 
-def int_hexagon_S2_storerb_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stb">;
-def int_hexagon_S2_storerh_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sth">;
-def int_hexagon_S2_storerf_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sthhi">;
-def int_hexagon_S2_storeri_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stw">;
-def int_hexagon_S2_storerd_pbr : Hexagon_mem_memdisi_Intrinsic<"brev_std">;
+def int_hexagon_V6_vgtw_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtw_128B">;
 
+def int_hexagon_V6_vsubwq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubwq">;
 
-///
-/// HexagonV62 intrinsics
-///
+def int_hexagon_V6_vsubwq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubwq_128B">;
 
-//
-// Hexagon_LLiLLiLLi_Intrinsic<string GCCIntSuffix>
-// tag : M6_vabsdiffb
-class Hexagon_LLiLLiLLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vnot :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vnot">;
 
-//
-// Hexagon_LLii_Intrinsic<string GCCIntSuffix>
-// tag : S6_vsplatrbp
-class Hexagon_LLii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vnot_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vnot_128B">;
 
-//
-// Hexagon_V62_v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlsrb
-class Hexagon_V62_v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vgtb_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtb_or">;
 
-//
-// Hexagon_V62_v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlsrb_128B
-class Hexagon_V62_v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vgtb_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtb_or_128B">;
 
-//
-// Hexagon_V62_v512v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vasrwuhrndsat
-class Hexagon_V62_v512v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vgtuw_or :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuw_or">;
 
-//
-// Hexagon_V62_v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vasrwuhrndsat_128B
-class Hexagon_V62_v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vgtuw_or_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuw_or_128B">;
 
-//
-// Hexagon_V62_v512v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrounduwuh
-class Hexagon_V62_v512v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaddubsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddubsat">;
 
-//
-// Hexagon_V62_v1024v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrounduwuh_128B
-class Hexagon_V62_v1024v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaddubsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddubsat_128B">;
 
-//
-// Hexagon_V62_v2048v2048v2048_Intrinsic<string GCCIntSuffix>
-// tag : V6_vadduwsat_dv_128B
-class Hexagon_V62_v2048v2048v2048_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vmaxw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxw">;
 
-//
-// Hexagon_V62_v1024v1024v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vaddhw_acc
-class Hexagon_V62_v1024v1024v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vmaxw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxw_128B">;
 
-//
-// Hexagon_V62_v2048v2048v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vaddhw_acc_128B
-class Hexagon_V62_v2048v2048v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaslwv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaslwv">;
 
-//
-// Hexagon_V62_v1024v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyewuh_64
-class Hexagon_V62_v1024v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaslwv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaslwv_128B">;
 
-//
-// Hexagon_V62_v2048v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyewuh_64_128B
-class Hexagon_V62_v2048v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vabsw_sat :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsw_sat">;
 
-//
-// Hexagon_V62_v2048v2048i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpauhb_128B
-class Hexagon_V62_v2048v2048i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vabsw_sat_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsw_sat_128B">;
 
-//
-// Hexagon_V62_v2048v2048v2048i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpauhb_acc_128B
-class Hexagon_V62_v2048v2048v2048i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vsubwsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubwsat_dv">;
 
-//
-// Hexagon_V62_v512v64ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandnqrt
-class Hexagon_V62_v512v64ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vsubwsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubwsat_dv_128B">;
 
-//
-// Hexagon_V62_v1024v128ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandnqrt_128B
-class Hexagon_V62_v1024v128ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vroundhub :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vroundhub">;
 
-//
-// Hexagon_V62_v512v512v64ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandnqrt_acc
-class Hexagon_V62_v512v512v64ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v512i1_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vroundhub_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vroundhub_128B">;
 
-//
-// Hexagon_V62_v1024v1024v128ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandnqrt_acc_128B
-class Hexagon_V62_v1024v1024v128ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v1024i1_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vdmpyhisat_acc :
+Hexagon_v16i32_v16i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhisat_acc">;
 
-//
-// Hexagon_V62_v512v64iv512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandvqv
-class Hexagon_V62_v512v64iv512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vdmpyhisat_acc_128B :
+Hexagon_v32i32_v32i32v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhisat_acc_128B">;
 
-//
-// Hexagon_V62_v1024v128iv1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vandvqv_128B
-class Hexagon_V62_v1024v128iv1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vmpabus :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpabus">;
 
-//
-// Hexagon_V62_v64ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_pred_scalar2v2
-class Hexagon_V62_v64ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vmpabus_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vmpabus_128B">;
 
-//
-// Hexagon_V62_v128ii_Intrinsic<string GCCIntSuffix>
-// tag : V6_pred_scalar2v2_128B
-class Hexagon_V62_v128ii_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vassignp :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vassignp">;
 
-//
-// Hexagon_V62_v64iv64iv64i_Intrinsic<string GCCIntSuffix>
-// tag : V6_shuffeqw
-class Hexagon_V62_v64iv64iv64i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v512i1_ty], [llvm_v512i1_ty,llvm_v512i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vassignp_128B :
+Hexagon_v64i32_v64i32_Intrinsic<"HEXAGON_V6_vassignp_128B">;
 
-//
-// Hexagon_V62_v128iv128iv128i_Intrinsic<string GCCIntSuffix>
-// tag : V6_shuffeqw_128B
-class Hexagon_V62_v128iv128iv128i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v1024i1_ty], [llvm_v1024i1_ty,llvm_v1024i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_veqb :
+Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqb">;
 
-//
-// Hexagon_V62_v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_lvsplath
-class Hexagon_V62_v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_veqb_128B :
+Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqb_128B">;
 
-//
-// Hexagon_V62_v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_lvsplath_128B
-class Hexagon_V62_v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vsububh :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsububh">;
 
-//
-// Hexagon_V62_v512v512v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvvb_oracci
-class Hexagon_V62_v512v512v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vsububh_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsububh_128B">;
 
-//
-// Hexagon_V62_v1024v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvvb_oracci_128B
-class Hexagon_V62_v1024v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_lvsplatw :
+Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_lvsplatw">;
 
-//
-// Hexagon_V62_v1024v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvwhi
-class Hexagon_V62_v1024v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_lvsplatw_128B :
+Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_lvsplatw_128B">;
 
-//
-// Hexagon_V62_v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvwhi_128B
-class Hexagon_V62_v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaddhnq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhnq">;
 
-//
-// Hexagon_V62_v1024v1024v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvwh_oracci
-class Hexagon_V62_v1024v1024v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaddhnq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhnq_128B">;
 
-//
-// Hexagon_V62_v2048v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlutvwh_oracci_128B
-class Hexagon_V62_v2048v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vdmpyhsusat :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsusat">;
 
-// Hexagon_v512v64iv512v512v64i_Intrinsic<string GCCIntSuffix>
-// tag: V6_vaddcarry
-class Hexagon_v512v64iv512v512v64i_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty, llvm_v512i1_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v512i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vdmpyhsusat_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsusat_128B">;
 
-// Hexagon_v1024v128iv1024v1024v128i_Intrinsic<string GCCIntSuffix>
-// tag: V6_vaddcarry_128B
-class Hexagon_v1024v128iv1024v1024v128i_Intrinsic<string GCCIntSuffix>
-  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty, llvm_v1024i1_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v1024i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_pred_not :
+Hexagon_v512i1_v512i1_Intrinsic<"HEXAGON_V6_pred_not">;
 
+def int_hexagon_V6_pred_not_128B :
+Hexagon_v1024i1_v1024i1_Intrinsic<"HEXAGON_V6_pred_not_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.M6_vabsdiffb,DI_ftype_DIDI,2)
-// tag : M6_vabsdiffb
-def int_hexagon_M6_vabsdiffb :
-Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_M6_vabsdiffb">;
+def int_hexagon_V6_vlutvwh_oracc :
+Hexagon_v32i32_v32i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracc">;
 
-//
-// BUILTIN_INFO(HEXAGON.M6_vabsdiffub,DI_ftype_DIDI,2)
-// tag : M6_vabsdiffub
-def int_hexagon_M6_vabsdiffub :
-Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_M6_vabsdiffub">;
+def int_hexagon_V6_vlutvwh_oracc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_vtrunehb_ppp,DI_ftype_DIDI,2)
-// tag : S6_vtrunehb_ppp
-def int_hexagon_S6_vtrunehb_ppp :
-Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_S6_vtrunehb_ppp">;
+def int_hexagon_V6_vmpyiewh_acc :
+Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyiewh_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_vtrunohb_ppp,DI_ftype_DIDI,2)
-// tag : S6_vtrunohb_ppp
-def int_hexagon_S6_vtrunohb_ppp :
-Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_S6_vtrunohb_ppp">;
+def int_hexagon_V6_vmpyiewh_acc_128B :
+Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyiewh_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_vsplatrbp,DI_ftype_SI,1)
-// tag : S6_vsplatrbp
-def int_hexagon_S6_vsplatrbp :
-Hexagon_LLii_Intrinsic<"HEXAGON_S6_vsplatrbp">;
+def int_hexagon_V6_vdealvdd :
+Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdealvdd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrb,VI_ftype_VISI,2)
-// tag : V6_vlsrb
-def int_hexagon_V6_vlsrb :
-Hexagon_V62_v512v512i_Intrinsic<"HEXAGON_V6_vlsrb">;
+def int_hexagon_V6_vdealvdd_128B :
+Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdealvdd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlsrb_128B,VI_ftype_VISI,2)
-// tag : V6_vlsrb_128B
-def int_hexagon_V6_vlsrb_128B :
-Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vlsrb_128B">;
+def int_hexagon_V6_vavgw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwuhrndsat,VI_ftype_VIVISI,3)
-// tag : V6_vasrwuhrndsat
-def int_hexagon_V6_vasrwuhrndsat :
-Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwuhrndsat">;
+def int_hexagon_V6_vavgw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrwuhrndsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrwuhrndsat_128B
-def int_hexagon_V6_vasrwuhrndsat_128B :
-Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwuhrndsat_128B">;
+def int_hexagon_V6_vdmpyhsusat_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsusat_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasruwuhrndsat,VI_ftype_VIVISI,3)
-// tag : V6_vasruwuhrndsat
-def int_hexagon_V6_vasruwuhrndsat :
-Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruwuhrndsat">;
+def int_hexagon_V6_vdmpyhsusat_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsusat_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasruwuhrndsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasruwuhrndsat_128B
-def int_hexagon_V6_vasruwuhrndsat_128B :
-Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruwuhrndsat_128B">;
+def int_hexagon_V6_vgtw_xor :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtw_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhbsat,VI_ftype_VIVISI,3)
-// tag : V6_vasrhbsat
-def int_hexagon_V6_vasrhbsat :
-Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhbsat">;
+def int_hexagon_V6_vgtw_xor_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtw_xor_128B">;
+
+def int_hexagon_V6_vtmpyhb_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vtmpyhb_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrhbsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrhbsat_128B
-def int_hexagon_V6_vasrhbsat_128B :
-Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhbsat_128B">;
+def int_hexagon_V6_vtmpyhb_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vtmpyhb_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrounduwuh,VI_ftype_VIVI,2)
-// tag : V6_vrounduwuh
-def int_hexagon_V6_vrounduwuh :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vrounduwuh">;
+def int_hexagon_V6_vaddhw :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrounduwuh_128B,VI_ftype_VIVI,2)
-// tag : V6_vrounduwuh_128B
-def int_hexagon_V6_vrounduwuh_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrounduwuh_128B">;
+def int_hexagon_V6_vaddhw_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrounduhub,VI_ftype_VIVI,2)
-// tag : V6_vrounduhub
-def int_hexagon_V6_vrounduhub :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vrounduhub">;
+def int_hexagon_V6_vaddhq :
+Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhq">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrounduhub_128B,VI_ftype_VIVI,2)
-// tag : V6_vrounduhub_128B
-def int_hexagon_V6_vrounduhub_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrounduhub_128B">;
+def int_hexagon_V6_vaddhq_128B :
+Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhq_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduwsat,VI_ftype_VIVI,2)
-// tag : V6_vadduwsat
-def int_hexagon_V6_vadduwsat :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vadduwsat">;
+def int_hexagon_V6_vrmpyubv :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpyubv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduwsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vadduwsat_128B
-def int_hexagon_V6_vadduwsat_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduwsat_128B">;
+def int_hexagon_V6_vrmpyubv_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpyubv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduwsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vadduwsat_dv
-def int_hexagon_V6_vadduwsat_dv :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduwsat_dv">;
+def int_hexagon_V6_vsubh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduwsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vadduwsat_dv_128B
-def int_hexagon_V6_vadduwsat_dv_128B :
-Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vadduwsat_dv_128B">;
+def int_hexagon_V6_vsubh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuwsat,VI_ftype_VIVI,2)
-// tag : V6_vsubuwsat
-def int_hexagon_V6_vsubuwsat :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubuwsat">;
+def int_hexagon_V6_vrmpyubi :
+Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubuwsat_128B
-def int_hexagon_V6_vsubuwsat_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuwsat_128B">;
+def int_hexagon_V6_vrmpyubi_128B :
+Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vsubuwsat_dv
-def int_hexagon_V6_vsubuwsat_dv :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuwsat_dv">;
+def int_hexagon_V6_vminw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsubuwsat_dv_128B
-def int_hexagon_V6_vsubuwsat_dv_128B :
-Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubuwsat_dv_128B">;
+def int_hexagon_V6_vminw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddbsat,VI_ftype_VIVI,2)
-// tag : V6_vaddbsat
-def int_hexagon_V6_vaddbsat :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddbsat">;
+def int_hexagon_V6_vmpyubv_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyubv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddbsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddbsat_128B
-def int_hexagon_V6_vaddbsat_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddbsat_128B">;
+def int_hexagon_V6_vmpyubv_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyubv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddbsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vaddbsat_dv
-def int_hexagon_V6_vaddbsat_dv :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddbsat_dv">;
+def int_hexagon_V6_pred_xor :
+Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddbsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vaddbsat_dv_128B
-def int_hexagon_V6_vaddbsat_dv_128B :
-Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddbsat_dv_128B">;
+def int_hexagon_V6_pred_xor_128B :
+Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_xor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubbsat,VI_ftype_VIVI,2)
-// tag : V6_vsubbsat
-def int_hexagon_V6_vsubbsat :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubbsat">;
+def int_hexagon_V6_veqb_xor :
+Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqb_xor">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubbsat_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubbsat_128B
-def int_hexagon_V6_vsubbsat_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubbsat_128B">;
+def int_hexagon_V6_veqb_xor_128B :
+Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqb_xor_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubbsat_dv,VD_ftype_VDVD,2)
-// tag : V6_vsubbsat_dv
-def int_hexagon_V6_vsubbsat_dv :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubbsat_dv">;
+def int_hexagon_V6_vmpyiewuh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyiewuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubbsat_dv_128B,VD_ftype_VDVD,2)
-// tag : V6_vsubbsat_dv_128B
-def int_hexagon_V6_vsubbsat_dv_128B :
-Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubbsat_dv_128B">;
+def int_hexagon_V6_vmpyiewuh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyiewuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddububb_sat,VI_ftype_VIVI,2)
-// tag : V6_vaddububb_sat
-def int_hexagon_V6_vaddububb_sat :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddububb_sat">;
+def int_hexagon_V6_vmpybusv_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpybusv_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddububb_sat_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddububb_sat_128B
-def int_hexagon_V6_vaddububb_sat_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddububb_sat_128B">;
+def int_hexagon_V6_vmpybusv_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpybusv_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubububb_sat,VI_ftype_VIVI,2)
-// tag : V6_vsubububb_sat
-def int_hexagon_V6_vsubububb_sat :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubububb_sat">;
+def int_hexagon_V6_vavguhrnd :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavguhrnd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubububb_sat_128B,VI_ftype_VIVI,2)
-// tag : V6_vsubububb_sat_128B
-def int_hexagon_V6_vsubububb_sat_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubububb_sat_128B">;
+def int_hexagon_V6_vavguhrnd_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavguhrnd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhw_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vaddhw_acc
-def int_hexagon_V6_vaddhw_acc :
-Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vaddhw_acc">;
+def int_hexagon_V6_vmpyowh_rnd :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh_rnd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddhw_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vaddhw_acc_128B
-def int_hexagon_V6_vaddhw_acc_128B :
-Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddhw_acc_128B">;
+def int_hexagon_V6_vmpyowh_rnd_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduhw_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vadduhw_acc
-def int_hexagon_V6_vadduhw_acc :
-Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vadduhw_acc">;
+def int_hexagon_V6_vsubwsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubwsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vadduhw_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vadduhw_acc_128B
-def int_hexagon_V6_vadduhw_acc_128B :
-Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vadduhw_acc_128B">;
+def int_hexagon_V6_vsubwsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubwsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddubh_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vaddubh_acc
-def int_hexagon_V6_vaddubh_acc :
-Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vaddubh_acc">;
+def int_hexagon_V6_vsubuhw :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubuhw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddubh_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vaddubh_acc_128B
-def int_hexagon_V6_vaddubh_acc_128B :
-Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddubh_acc_128B">;
+def int_hexagon_V6_vsubuhw_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuhw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyewuh_64,VD_ftype_VIVI,2)
-// tag : V6_vmpyewuh_64
-def int_hexagon_V6_vmpyewuh_64 :
-Hexagon_V62_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyewuh_64">;
+def int_hexagon_V6_vrmpybusi_acc :
+Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyewuh_64_128B,VD_ftype_VIVI,2)
-// tag : V6_vmpyewuh_64_128B
-def int_hexagon_V6_vmpyewuh_64_128B :
-Hexagon_V62_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyewuh_64_128B">;
+def int_hexagon_V6_vrmpybusi_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_64_acc,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyowh_64_acc
-def int_hexagon_V6_vmpyowh_64_acc :
-Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc">;
+def int_hexagon_V6_vasrw :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vasrw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyowh_64_acc_128B,VD_ftype_VDVIVI,3)
-// tag : V6_vmpyowh_64_acc_128B
-def int_hexagon_V6_vmpyowh_64_acc_128B :
-Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc_128B">;
+def int_hexagon_V6_vasrw_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vasrw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpauhb,VD_ftype_VDSI,2)
-// tag : V6_vmpauhb
-def int_hexagon_V6_vmpauhb :
-Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpauhb">;
+def int_hexagon_V6_vasrh :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vasrh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpauhb_128B,VD_ftype_VDSI,2)
-// tag : V6_vmpauhb_128B
-def int_hexagon_V6_vmpauhb_128B :
-Hexagon_V62_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpauhb_128B">;
+def int_hexagon_V6_vasrh_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vasrh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpauhb_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vmpauhb_acc
-def int_hexagon_V6_vmpauhb_acc :
-Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpauhb_acc">;
+def int_hexagon_V6_vmpyuhv :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyuhv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpauhb_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vmpauhb_acc_128B
-def int_hexagon_V6_vmpauhb_acc_128B :
-Hexagon_V62_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpauhb_acc_128B">;
+def int_hexagon_V6_vmpyuhv_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyuhv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwub,VI_ftype_VISI,2)
-// tag : V6_vmpyiwub
-def int_hexagon_V6_vmpyiwub :
-Hexagon_V62_v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwub">;
+def int_hexagon_V6_vasrhbrndsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrhbrndsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_128B,VI_ftype_VISI,2)
-// tag : V6_vmpyiwub_128B
-def int_hexagon_V6_vmpyiwub_128B :
-Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwub_128B">;
+def int_hexagon_V6_vasrhbrndsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrhbrndsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_acc,VI_ftype_VIVISI,3)
-// tag : V6_vmpyiwub_acc
-def int_hexagon_V6_vmpyiwub_acc :
-Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwub_acc">;
+def int_hexagon_V6_vsubuhsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuhsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vmpyiwub_acc_128B
-def int_hexagon_V6_vmpyiwub_acc_128B :
-Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwub_acc_128B">;
+def int_hexagon_V6_vsubuhsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubuhsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandnqrt,VI_ftype_QVSI,2)
-// tag : V6_vandnqrt
-def int_hexagon_V6_vandnqrt :
-Hexagon_V62_v512v64ii_Intrinsic<"HEXAGON_V6_vandnqrt">;
+def int_hexagon_V6_vabsdiffw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vabsdiffw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandnqrt_128B,VI_ftype_QVSI,2)
-// tag : V6_vandnqrt_128B
-def int_hexagon_V6_vandnqrt_128B :
-Hexagon_V62_v1024v128ii_Intrinsic<"HEXAGON_V6_vandnqrt_128B">;
+def int_hexagon_V6_vabsdiffw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vabsdiffw_128B">;
+
+// V62 HVX Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandnqrt_acc,VI_ftype_VIQVSI,3)
-// tag : V6_vandnqrt_acc
 def int_hexagon_V6_vandnqrt_acc :
-Hexagon_V62_v512v512v64ii_Intrinsic<"HEXAGON_V6_vandnqrt_acc">;
+Hexagon_v16i32_v16i32v512i1i32_Intrinsic<"HEXAGON_V6_vandnqrt_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandnqrt_acc_128B,VI_ftype_VIQVSI,3)
-// tag : V6_vandnqrt_acc_128B
 def int_hexagon_V6_vandnqrt_acc_128B :
-Hexagon_V62_v1024v1024v128ii_Intrinsic<"HEXAGON_V6_vandnqrt_acc_128B">;
+Hexagon_v32i32_v32i32v1024i1i32_Intrinsic<"HEXAGON_V6_vandnqrt_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandvqv,VI_ftype_QVVI,2)
-// tag : V6_vandvqv
-def int_hexagon_V6_vandvqv :
-Hexagon_V62_v512v64iv512_Intrinsic<"HEXAGON_V6_vandvqv">;
+def int_hexagon_V6_vaddclbh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddclbh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandvqv_128B,VI_ftype_QVVI,2)
-// tag : V6_vandvqv_128B
-def int_hexagon_V6_vandvqv_128B :
-Hexagon_V62_v1024v128iv1024_Intrinsic<"HEXAGON_V6_vandvqv_128B">;
+def int_hexagon_V6_vaddclbh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddclbh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandvnqv,VI_ftype_QVVI,2)
-// tag : V6_vandvnqv
-def int_hexagon_V6_vandvnqv :
-Hexagon_V62_v512v64iv512_Intrinsic<"HEXAGON_V6_vandvnqv">;
+def int_hexagon_V6_vmpyowh_64_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vandvnqv_128B,VI_ftype_QVVI,2)
-// tag : V6_vandvnqv_128B
-def int_hexagon_V6_vandvnqv_128B :
-Hexagon_V62_v1024v128iv1024_Intrinsic<"HEXAGON_V6_vandvnqv_128B">;
+def int_hexagon_V6_vmpyowh_64_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_scalar2v2,QV_ftype_SI,1)
-// tag : V6_pred_scalar2v2
-def int_hexagon_V6_pred_scalar2v2 :
-Hexagon_V62_v64ii_Intrinsic<"HEXAGON_V6_pred_scalar2v2">;
+def int_hexagon_V6_vmpyewuh_64 :
+Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyewuh_64">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_pred_scalar2v2_128B,QV_ftype_SI,1)
-// tag : V6_pred_scalar2v2_128B
-def int_hexagon_V6_pred_scalar2v2_128B :
-Hexagon_V62_v128ii_Intrinsic<"HEXAGON_V6_pred_scalar2v2_128B">;
+def int_hexagon_V6_vmpyewuh_64_128B :
+Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyewuh_64_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_shuffeqw,QV_ftype_QVQV,2)
-// tag : V6_shuffeqw
-def int_hexagon_V6_shuffeqw :
-Hexagon_V62_v64iv64iv64i_Intrinsic<"HEXAGON_V6_shuffeqw">;
+def int_hexagon_V6_vsatuwuh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsatuwuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_shuffeqw_128B,QV_ftype_QVQV,2)
-// tag : V6_shuffeqw_128B
-def int_hexagon_V6_shuffeqw_128B :
-Hexagon_V62_v128iv128iv128i_Intrinsic<"HEXAGON_V6_shuffeqw_128B">;
+def int_hexagon_V6_vsatuwuh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsatuwuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_shuffeqh,QV_ftype_QVQV,2)
-// tag : V6_shuffeqh
 def int_hexagon_V6_shuffeqh :
-Hexagon_V62_v64iv64iv64i_Intrinsic<"HEXAGON_V6_shuffeqh">;
+Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_shuffeqh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_shuffeqh_128B,QV_ftype_QVQV,2)
-// tag : V6_shuffeqh_128B
 def int_hexagon_V6_shuffeqh_128B :
-Hexagon_V62_v128iv128iv128i_Intrinsic<"HEXAGON_V6_shuffeqh_128B">;
+Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_shuffeqh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxb,VI_ftype_VIVI,2)
-// tag : V6_vmaxb
-def int_hexagon_V6_vmaxb :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxb">;
+def int_hexagon_V6_shuffeqw :
+Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_shuffeqw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmaxb_128B,VI_ftype_VIVI,2)
-// tag : V6_vmaxb_128B
-def int_hexagon_V6_vmaxb_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxb_128B">;
+def int_hexagon_V6_shuffeqw_128B :
+Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_shuffeqw_128B">;
+
+def int_hexagon_V6_ldcnpnt0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldcnpnt0">;
+
+def int_hexagon_V6_ldcnpnt0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldcnpnt0_128B">;
+
+def int_hexagon_V6_vsubcarry :
+Hexagon_custom_v16i32v512i1_v16i32v16i32v512i1_Intrinsic;
+
+def int_hexagon_V6_vsubcarry_128B :
+Hexagon_custom_v32i32v1024i1_v32i32v32i32v1024i1_Intrinsic_128B;
+
+def int_hexagon_V6_vasrhbsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrhbsat">;
+
+def int_hexagon_V6_vasrhbsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrhbsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminb,VI_ftype_VIVI,2)
-// tag : V6_vminb
 def int_hexagon_V6_vminb :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vminb">;
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vminb_128B,VI_ftype_VIVI,2)
-// tag : V6_vminb_128B
 def int_hexagon_V6_vminb_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminb_128B">;
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsatuwuh,VI_ftype_VIVI,2)
-// tag : V6_vsatuwuh
-def int_hexagon_V6_vsatuwuh :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsatuwuh">;
+def int_hexagon_V6_vmpauhb_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpauhb_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsatuwuh_128B,VI_ftype_VIVI,2)
-// tag : V6_vsatuwuh_128B
-def int_hexagon_V6_vsatuwuh_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsatuwuh_128B">;
+def int_hexagon_V6_vmpauhb_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vmpauhb_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_lvsplath,VI_ftype_SI,1)
-// tag : V6_lvsplath
-def int_hexagon_V6_lvsplath :
-Hexagon_V62_v512i_Intrinsic<"HEXAGON_V6_lvsplath">;
+def int_hexagon_V6_vaddhw_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhw_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_lvsplath_128B,VI_ftype_SI,1)
-// tag : V6_lvsplath_128B
-def int_hexagon_V6_lvsplath_128B :
-Hexagon_V62_v1024i_Intrinsic<"HEXAGON_V6_lvsplath_128B">;
+def int_hexagon_V6_vaddhw_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhw_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_lvsplatb,VI_ftype_SI,1)
-// tag : V6_lvsplatb
-def int_hexagon_V6_lvsplatb :
-Hexagon_V62_v512i_Intrinsic<"HEXAGON_V6_lvsplatb">;
+def int_hexagon_V6_vlsrb :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vlsrb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_lvsplatb_128B,VI_ftype_SI,1)
-// tag : V6_lvsplatb_128B
-def int_hexagon_V6_lvsplatb_128B :
-Hexagon_V62_v1024i_Intrinsic<"HEXAGON_V6_lvsplatb_128B">;
+def int_hexagon_V6_vlsrb_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vlsrb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddclbw,VI_ftype_VIVI,2)
-// tag : V6_vaddclbw
-def int_hexagon_V6_vaddclbw :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddclbw">;
+def int_hexagon_V6_vlutvwhi :
+Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwhi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddclbw_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddclbw_128B
-def int_hexagon_V6_vaddclbw_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddclbw_128B">;
+def int_hexagon_V6_vlutvwhi_128B :
+Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwhi_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddclbh,VI_ftype_VIVI,2)
-// tag : V6_vaddclbh
-def int_hexagon_V6_vaddclbh :
-Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddclbh">;
+def int_hexagon_V6_vaddububb_sat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddububb_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddclbh_128B,VI_ftype_VIVI,2)
-// tag : V6_vaddclbh_128B
-def int_hexagon_V6_vaddclbh_128B :
-Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddclbh_128B">;
+def int_hexagon_V6_vaddububb_sat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddububb_sat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvbi,VI_ftype_VIVISI,3)
-// tag : V6_vlutvvbi
-def int_hexagon_V6_vlutvvbi :
-Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvbi">;
+def int_hexagon_V6_vsubbsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubbsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvbi_128B,VI_ftype_VIVISI,3)
-// tag : V6_vlutvvbi_128B
-def int_hexagon_V6_vlutvvbi_128B :
-Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvbi_128B">;
+def int_hexagon_V6_vsubbsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubbsat_dv_128B">;
+
+def int_hexagon_V6_ldtp0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldtp0">;
+
+def int_hexagon_V6_ldtp0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldtp0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracci,VI_ftype_VIVIVISI,4)
-// tag : V6_vlutvvb_oracci
 def int_hexagon_V6_vlutvvb_oracci :
-Hexagon_V62_v512v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb_oracci">;
+Hexagon_v16i32_v16i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracci">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracci_128B,VI_ftype_VIVIVISI,4)
-// tag : V6_vlutvvb_oracci_128B
 def int_hexagon_V6_vlutvvb_oracci_128B :
-Hexagon_V62_v1024v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_oracci_128B">;
+Hexagon_v32i32_v32i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracci_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwhi,VD_ftype_VIVISI,3)
-// tag : V6_vlutvwhi
-def int_hexagon_V6_vlutvwhi :
-Hexagon_V62_v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwhi">;
+def int_hexagon_V6_vsubuwsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuwsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwhi_128B,VD_ftype_VIVISI,3)
-// tag : V6_vlutvwhi_128B
-def int_hexagon_V6_vlutvwhi_128B :
-Hexagon_V62_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwhi_128B">;
+def int_hexagon_V6_vsubuwsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubuwsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracci,VD_ftype_VDVIVISI,4)
-// tag : V6_vlutvwh_oracci
-def int_hexagon_V6_vlutvwh_oracci :
-Hexagon_V62_v1024v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_oracci">;
+def int_hexagon_V6_ldpnt0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldpnt0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracci_128B,VD_ftype_VDVIVISI,4)
-// tag : V6_vlutvwh_oracci_128B
-def int_hexagon_V6_vlutvwh_oracci_128B :
-Hexagon_V62_v2048v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_oracci_128B">;
+def int_hexagon_V6_ldpnt0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldpnt0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvb_nm,VI_ftype_VIVISI,3)
-// tag : V6_vlutvvb_nm
-def int_hexagon_V6_vlutvvb_nm :
-Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb_nm">;
+def int_hexagon_V6_vandvnqv :
+Hexagon_v16i32_v512i1v16i32_Intrinsic<"HEXAGON_V6_vandvnqv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvvb_nm_128B,VI_ftype_VIVISI,3)
-// tag : V6_vlutvvb_nm_128B
-def int_hexagon_V6_vlutvvb_nm_128B :
-Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_nm_128B">;
+def int_hexagon_V6_vandvnqv_128B :
+Hexagon_v32i32_v1024i1v32i32_Intrinsic<"HEXAGON_V6_vandvnqv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwh_nm,VD_ftype_VIVISI,3)
-// tag : V6_vlutvwh_nm
-def int_hexagon_V6_vlutvwh_nm :
-Hexagon_V62_v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_nm">;
+def int_hexagon_V6_lvsplatb :
+Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_lvsplatb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlutvwh_nm_128B,VD_ftype_VIVISI,3)
-// tag : V6_vlutvwh_nm_128B
-def int_hexagon_V6_vlutvwh_nm_128B :
-Hexagon_V62_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_nm_128B">;
+def int_hexagon_V6_lvsplatb_128B :
+Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_lvsplatb_128B">;
+
+def int_hexagon_V6_lvsplath :
+Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_lvsplath">;
+
+def int_hexagon_V6_lvsplath_128B :
+Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_lvsplath_128B">;
+
+def int_hexagon_V6_ldtpnt0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldtpnt0">;
+
+def int_hexagon_V6_ldtpnt0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldtpnt0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddcarry,VI_ftype_VIVIQV,3)
-// tag: V6_vaddcarry
-def int_hexagon_V6_vaddcarry :
-Hexagon_v512v64iv512v512v64i_Intrinsic<"HEXAGON_v6_vaddcarry">;
+def int_hexagon_V6_vlutvwh_nm :
+Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_nm">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaddcarry_128B,VI_ftype_VIVIQV,3)
-// tag: V6_vaddcarry_128B
-def int_hexagon_V6_vaddcarry_128B :
-Hexagon_v1024v128iv1024v1024v128i_Intrinsic<"HEXAGON_v6_vaddcarry_128B">;
+def int_hexagon_V6_vlutvwh_nm_128B :
+Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_nm_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubcarry,VI_ftype_VIVIQV,3)
-// tag: V6_vsubcarry
-def int_hexagon_V6_vsubcarry :
-Hexagon_v512v64iv512v512v64i_Intrinsic<"HEXAGON_v6_vsubcarry">;
+def int_hexagon_V6_ldnpnt0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldnpnt0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vsubcarry_128B,VI_ftype_VIVIQV,3)
-// tag: V6_vsubcarry_128B
-def int_hexagon_V6_vsubcarry_128B :
-Hexagon_v1024v128iv1024v1024v128i_Intrinsic<"HEXAGON_v6_vsubcarry_128B">;
+def int_hexagon_V6_ldnpnt0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldnpnt0_128B">;
 
+def int_hexagon_V6_vmpauhb :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpauhb">;
 
-///
-/// HexagonV65 intrinsics
-///
+def int_hexagon_V6_vmpauhb_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vmpauhb_128B">;
 
-//
-// Hexagon_V65_iLLiLLi_Intrinsic<string GCCIntSuffix>
-// tag : A6_vcmpbeq_notany
-class Hexagon_V65_iLLiLLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i32_ty], [llvm_i64_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_ldtnp0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldtnp0">;
 
-//
-// Hexagon_V65_v1024v512LLi_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrmpyub_rtt
-class Hexagon_V65_v1024v512LLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_ldtnp0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldtnp0_128B">;
 
-//
-// Hexagon_V65_v2048v1024LLi_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrmpyub_rtt_128B
-class Hexagon_V65_v2048v1024LLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vrounduhub :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrounduhub">;
 
-//
-// Hexagon_V65_v1024v1024v512LLi_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrmpyub_rtt_acc
-class Hexagon_V65_v1024v1024v512LLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vrounduhub_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrounduhub_128B">;
 
-//
-// Hexagon_V65_v2048v2048v1024LLi_Intrinsic<string GCCIntSuffix>
-// tag : V6_vrmpyub_rtt_acc_128B
-class Hexagon_V65_v2048v2048v1024LLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vadduhw_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vadduhw_acc">;
 
-//
-// Hexagon_V65_v512v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vasruwuhsat
-class Hexagon_V65_v512v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vadduhw_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduhw_acc_128B">;
 
-//
-// Hexagon_V65_v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vasruwuhsat_128B
-class Hexagon_V65_v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_ldcp0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldcp0">;
 
-//
-// Hexagon_V65_v512v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vavguw
-class Hexagon_V65_v512v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_ldcp0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldcp0_128B">;
 
-//
-// Hexagon_V65_v1024v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vavguw_128B
-class Hexagon_V65_v1024v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vadduwsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vadduwsat">;
 
-//
-// Hexagon_V65_v512v512_Intrinsic<string GCCIntSuffix>
-// tag : V6_vabsb
-class Hexagon_V65_v512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vadduwsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduwsat_128B">;
 
-//
-// Hexagon_V65_v1024v1024_Intrinsic<string GCCIntSuffix>
-// tag : V6_vabsb_128B
-class Hexagon_V65_v1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_ldtnpnt0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldtnpnt0">;
 
-//
-// Hexagon_V65_v1024v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpabuu
-class Hexagon_V65_v1024v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_ldtnpnt0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldtnpnt0_128B">;
 
-//
-// Hexagon_V65_v2048v2048i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpabuu_128B
-class Hexagon_V65_v2048v2048i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaddbsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddbsat">;
 
-//
-// Hexagon_V65_v2048v2048v2048i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpabuu_acc_128B
-class Hexagon_V65_v2048v2048v2048i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaddbsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddbsat_128B">;
 
-//
-// Hexagon_V65_v1024v1024v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyh_acc
-class Hexagon_V65_v1024v1024v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vandnqrt :
+Hexagon_v16i32_v512i1i32_Intrinsic<"HEXAGON_V6_vandnqrt">;
 
-//
-// Hexagon_V65_v2048v2048v1024i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyh_acc_128B
-class Hexagon_V65_v2048v2048v1024i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vandnqrt_128B :
+Hexagon_v32i32_v1024i1i32_Intrinsic<"HEXAGON_V6_vandnqrt_128B">;
 
-//
-// Hexagon_V65_v512v512v512LLi_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpahhsat
-class Hexagon_V65_v512v512v512LLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vmpyiwub_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwub_acc">;
 
-//
-// Hexagon_V65_v1024v1024v1024LLi_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpahhsat_128B
-class Hexagon_V65_v1024v1024v1024LLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vmpyiwub_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwub_acc_128B">;
 
-//
-// Hexagon_V65_v512v512LLi_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlut4
-class Hexagon_V65_v512v512LLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vmaxb :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxb">;
 
-//
-// Hexagon_V65_v1024v1024LLi_Intrinsic<string GCCIntSuffix>
-// tag : V6_vlut4_128B
-class Hexagon_V65_v1024v1024LLi_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vmaxb_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxb_128B">;
 
-//
-// Hexagon_V65_v512v512i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vmpyuhe
-class Hexagon_V65_v512v512i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vandvqv :
+Hexagon_v16i32_v512i1v16i32_Intrinsic<"HEXAGON_V6_vandvqv">;
 
-//
-// Hexagon_V65_v512v64i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vprefixqb
-class Hexagon_V65_v512v64i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v16i32_ty], [llvm_v512i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vandvqv_128B :
+Hexagon_v32i32_v1024i1v32i32_Intrinsic<"HEXAGON_V6_vandvqv_128B">;
 
-//
-// Hexagon_V65_v1024v128i_Intrinsic<string GCCIntSuffix>
-// tag : V6_vprefixqb_128B
-class Hexagon_V65_v1024v128i_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v32i32_ty], [llvm_v1024i1_ty],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vaddcarry :
+Hexagon_custom_v16i32v512i1_v16i32v16i32v512i1_Intrinsic;
 
-//
-// BUILTIN_INFO(HEXAGON.A6_vcmpbeq_notany,QI_ftype_DIDI,2)
-// tag : A6_vcmpbeq_notany
-def int_hexagon_A6_vcmpbeq_notany :
-Hexagon_V65_iLLiLLi_Intrinsic<"HEXAGON_A6_vcmpbeq_notany">;
+def int_hexagon_V6_vaddcarry_128B :
+Hexagon_custom_v32i32v1024i1_v32i32v32i32v1024i1_Intrinsic_128B;
 
-//
-// BUILTIN_INFO(HEXAGON.A6_vcmpbeq_notany_128B,QI_ftype_DIDI,2)
-// tag : A6_vcmpbeq_notany_128B
-def int_hexagon_A6_vcmpbeq_notany_128B :
-Hexagon_V65_iLLiLLi_Intrinsic<"HEXAGON_A6_vcmpbeq_notany_128B">;
+def int_hexagon_V6_vasrwuhrndsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwuhrndsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyub_rtt,VD_ftype_VIDI,2)
-// tag : V6_vrmpyub_rtt
-def int_hexagon_V6_vrmpyub_rtt :
-Hexagon_V65_v1024v512LLi_Intrinsic<"HEXAGON_V6_vrmpyub_rtt">;
+def int_hexagon_V6_vasrwuhrndsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwuhrndsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyub_rtt_128B,VD_ftype_VIDI,2)
-// tag : V6_vrmpyub_rtt_128B
-def int_hexagon_V6_vrmpyub_rtt_128B :
-Hexagon_V65_v2048v1024LLi_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_128B">;
+def int_hexagon_V6_vlutvvbi :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvbi">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyub_rtt_acc,VD_ftype_VDVIDI,3)
-// tag : V6_vrmpyub_rtt_acc
-def int_hexagon_V6_vrmpyub_rtt_acc :
-Hexagon_V65_v1024v1024v512LLi_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_acc">;
+def int_hexagon_V6_vlutvvbi_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvbi_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpyub_rtt_acc_128B,VD_ftype_VDVIDI,3)
-// tag : V6_vrmpyub_rtt_acc_128B
-def int_hexagon_V6_vrmpyub_rtt_acc_128B :
-Hexagon_V65_v2048v2048v1024LLi_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_acc_128B">;
+def int_hexagon_V6_vsubuwsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubuwsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybub_rtt,VD_ftype_VIDI,2)
-// tag : V6_vrmpybub_rtt
-def int_hexagon_V6_vrmpybub_rtt :
-Hexagon_V65_v1024v512LLi_Intrinsic<"HEXAGON_V6_vrmpybub_rtt">;
+def int_hexagon_V6_vsubuwsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuwsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybub_rtt_128B,VD_ftype_VIDI,2)
-// tag : V6_vrmpybub_rtt_128B
-def int_hexagon_V6_vrmpybub_rtt_128B :
-Hexagon_V65_v2048v1024LLi_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_128B">;
+def int_hexagon_V6_vaddbsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddbsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybub_rtt_acc,VD_ftype_VDVIDI,3)
-// tag : V6_vrmpybub_rtt_acc
-def int_hexagon_V6_vrmpybub_rtt_acc :
-Hexagon_V65_v1024v1024v512LLi_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_acc">;
+def int_hexagon_V6_vaddbsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddbsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vrmpybub_rtt_acc_128B,VD_ftype_VDVIDI,3)
-// tag : V6_vrmpybub_rtt_acc_128B
-def int_hexagon_V6_vrmpybub_rtt_acc_128B :
-Hexagon_V65_v2048v2048v1024LLi_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_acc_128B">;
+def int_hexagon_V6_ldnp0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldnp0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasruwuhsat,VI_ftype_VIVISI,3)
-// tag : V6_vasruwuhsat
-def int_hexagon_V6_vasruwuhsat :
-Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruwuhsat">;
+def int_hexagon_V6_ldnp0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldnp0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasruwuhsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasruwuhsat_128B
-def int_hexagon_V6_vasruwuhsat_128B :
-Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruwuhsat_128B">;
+def int_hexagon_V6_vasruwuhrndsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasruwuhrndsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasruhubsat,VI_ftype_VIVISI,3)
-// tag : V6_vasruhubsat
-def int_hexagon_V6_vasruhubsat :
-Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruhubsat">;
+def int_hexagon_V6_vasruwuhrndsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasruwuhrndsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasruhubsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasruhubsat_128B
-def int_hexagon_V6_vasruhubsat_128B :
-Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruhubsat_128B">;
+def int_hexagon_V6_vrounduwuh :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrounduwuh">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasruhubrndsat,VI_ftype_VIVISI,3)
-// tag : V6_vasruhubrndsat
-def int_hexagon_V6_vasruhubrndsat :
-Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruhubrndsat">;
+def int_hexagon_V6_vrounduwuh_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrounduwuh_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasruhubrndsat_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasruhubrndsat_128B
-def int_hexagon_V6_vasruhubrndsat_128B :
-Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruhubrndsat_128B">;
+def int_hexagon_V6_vlutvvb_nm :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_nm">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslh_acc,VI_ftype_VIVISI,3)
-// tag : V6_vaslh_acc
-def int_hexagon_V6_vaslh_acc :
-Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vaslh_acc">;
+def int_hexagon_V6_vlutvvb_nm_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_nm_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vaslh_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vaslh_acc_128B
-def int_hexagon_V6_vaslh_acc_128B :
-Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vaslh_acc_128B">;
+def int_hexagon_V6_pred_scalar2v2 :
+Hexagon_v512i1_i32_Intrinsic<"HEXAGON_V6_pred_scalar2v2">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrh_acc,VI_ftype_VIVISI,3)
-// tag : V6_vasrh_acc
-def int_hexagon_V6_vasrh_acc :
-Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrh_acc">;
+def int_hexagon_V6_pred_scalar2v2_128B :
+Hexagon_v1024i1_i32_Intrinsic<"HEXAGON_V6_pred_scalar2v2_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vasrh_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vasrh_acc_128B
-def int_hexagon_V6_vasrh_acc_128B :
-Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrh_acc_128B">;
+def int_hexagon_V6_ldp0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldp0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavguw,VI_ftype_VIVI,2)
-// tag : V6_vavguw
-def int_hexagon_V6_vavguw :
-Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vavguw">;
+def int_hexagon_V6_ldp0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldp0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavguw_128B,VI_ftype_VIVI,2)
-// tag : V6_vavguw_128B
-def int_hexagon_V6_vavguw_128B :
-Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavguw_128B">;
+def int_hexagon_V6_vaddubh_acc :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddubh_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavguwrnd,VI_ftype_VIVI,2)
-// tag : V6_vavguwrnd
-def int_hexagon_V6_vavguwrnd :
-Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vavguwrnd">;
+def int_hexagon_V6_vaddubh_acc_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddubh_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavguwrnd_128B,VI_ftype_VIVI,2)
-// tag : V6_vavguwrnd_128B
-def int_hexagon_V6_vavguwrnd_128B :
-Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavguwrnd_128B">;
+def int_hexagon_V6_vaddclbw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddclbw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgb,VI_ftype_VIVI,2)
-// tag : V6_vavgb
-def int_hexagon_V6_vavgb :
-Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vavgb">;
+def int_hexagon_V6_vaddclbw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddclbw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgb_128B,VI_ftype_VIVI,2)
-// tag : V6_vavgb_128B
-def int_hexagon_V6_vavgb_128B :
-Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgb_128B">;
+def int_hexagon_V6_ldcpnt0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldcpnt0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgbrnd,VI_ftype_VIVI,2)
-// tag : V6_vavgbrnd
-def int_hexagon_V6_vavgbrnd :
-Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vavgbrnd">;
+def int_hexagon_V6_ldcpnt0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldcpnt0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vavgbrnd_128B,VI_ftype_VIVI,2)
-// tag : V6_vavgbrnd_128B
-def int_hexagon_V6_vavgbrnd_128B :
-Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgbrnd_128B">;
+def int_hexagon_V6_vadduwsat_dv :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduwsat_dv">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnavgb,VI_ftype_VIVI,2)
-// tag : V6_vnavgb
-def int_hexagon_V6_vnavgb :
-Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vnavgb">;
+def int_hexagon_V6_vadduwsat_dv_128B :
+Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vadduwsat_dv_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vnavgb_128B,VI_ftype_VIVI,2)
-// tag : V6_vnavgb_128B
-def int_hexagon_V6_vnavgb_128B :
-Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vnavgb_128B">;
+def int_hexagon_V6_vmpyiwub :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwub">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsb,VI_ftype_VI,1)
-// tag : V6_vabsb
-def int_hexagon_V6_vabsb :
-Hexagon_V65_v512v512_Intrinsic<"HEXAGON_V6_vabsb">;
+def int_hexagon_V6_vmpyiwub_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwub_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsb_128B,VI_ftype_VI,1)
-// tag : V6_vabsb_128B
-def int_hexagon_V6_vabsb_128B :
-Hexagon_V65_v1024v1024_Intrinsic<"HEXAGON_V6_vabsb_128B">;
+def int_hexagon_V6_vsubububb_sat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubububb_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsb_sat,VI_ftype_VI,1)
-// tag : V6_vabsb_sat
-def int_hexagon_V6_vabsb_sat :
-Hexagon_V65_v512v512_Intrinsic<"HEXAGON_V6_vabsb_sat">;
+def int_hexagon_V6_vsubububb_sat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubububb_sat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vabsb_sat_128B,VI_ftype_VI,1)
-// tag : V6_vabsb_sat_128B
-def int_hexagon_V6_vabsb_sat_128B :
-Hexagon_V65_v1024v1024_Intrinsic<"HEXAGON_V6_vabsb_sat_128B">;
+def int_hexagon_V6_ldcnp0 :
+Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldcnp0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabuu,VD_ftype_VDSI,2)
-// tag : V6_vmpabuu
-def int_hexagon_V6_vmpabuu :
-Hexagon_V65_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpabuu">;
+def int_hexagon_V6_ldcnp0_128B :
+Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldcnp0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabuu_128B,VD_ftype_VDSI,2)
-// tag : V6_vmpabuu_128B
-def int_hexagon_V6_vmpabuu_128B :
-Hexagon_V65_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpabuu_128B">;
+def int_hexagon_V6_vlutvwh_oracci :
+Hexagon_v32i32_v32i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracci">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabuu_acc,VD_ftype_VDVDSI,3)
-// tag : V6_vmpabuu_acc
-def int_hexagon_V6_vmpabuu_acc :
-Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpabuu_acc">;
+def int_hexagon_V6_vlutvwh_oracci_128B :
+Hexagon_v64i32_v64i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracci_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpabuu_acc_128B,VD_ftype_VDVDSI,3)
-// tag : V6_vmpabuu_acc_128B
-def int_hexagon_V6_vmpabuu_acc_128B :
-Hexagon_V65_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpabuu_acc_128B">;
+def int_hexagon_V6_vsubbsat :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubbsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyh_acc,VD_ftype_VDVISI,3)
-// tag : V6_vmpyh_acc
-def int_hexagon_V6_vmpyh_acc :
-Hexagon_V65_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpyh_acc">;
+def int_hexagon_V6_vsubbsat_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubbsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyh_acc_128B,VD_ftype_VDVISI,3)
-// tag : V6_vmpyh_acc_128B
-def int_hexagon_V6_vmpyh_acc_128B :
-Hexagon_V65_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyh_acc_128B">;
+// V65 HVX Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpahhsat,VI_ftype_VIVIDI,3)
-// tag : V6_vmpahhsat
-def int_hexagon_V6_vmpahhsat :
-Hexagon_V65_v512v512v512LLi_Intrinsic<"HEXAGON_V6_vmpahhsat">;
+def int_hexagon_V6_vasruhubrndsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasruhubrndsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpahhsat_128B,VI_ftype_VIVIDI,3)
-// tag : V6_vmpahhsat_128B
-def int_hexagon_V6_vmpahhsat_128B :
-Hexagon_V65_v1024v1024v1024LLi_Intrinsic<"HEXAGON_V6_vmpahhsat_128B">;
+def int_hexagon_V6_vasruhubrndsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasruhubrndsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpauhuhsat,VI_ftype_VIVIDI,3)
-// tag : V6_vmpauhuhsat
-def int_hexagon_V6_vmpauhuhsat :
-Hexagon_V65_v512v512v512LLi_Intrinsic<"HEXAGON_V6_vmpauhuhsat">;
+def int_hexagon_V6_vrmpybub_rtt :
+Hexagon_v32i32_v16i32i64_Intrinsic<"HEXAGON_V6_vrmpybub_rtt">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpauhuhsat_128B,VI_ftype_VIVIDI,3)
-// tag : V6_vmpauhuhsat_128B
-def int_hexagon_V6_vmpauhuhsat_128B :
-Hexagon_V65_v1024v1024v1024LLi_Intrinsic<"HEXAGON_V6_vmpauhuhsat_128B">;
+def int_hexagon_V6_vrmpybub_rtt_128B :
+Hexagon_v64i32_v32i32i64_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpsuhuhsat,VI_ftype_VIVIDI,3)
-// tag : V6_vmpsuhuhsat
-def int_hexagon_V6_vmpsuhuhsat :
-Hexagon_V65_v512v512v512LLi_Intrinsic<"HEXAGON_V6_vmpsuhuhsat">;
+def int_hexagon_V6_vmpahhsat :
+Hexagon_v16i32_v16i32v16i32i64_Intrinsic<"HEXAGON_V6_vmpahhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpsuhuhsat_128B,VI_ftype_VIVIDI,3)
-// tag : V6_vmpsuhuhsat_128B
-def int_hexagon_V6_vmpsuhuhsat_128B :
-Hexagon_V65_v1024v1024v1024LLi_Intrinsic<"HEXAGON_V6_vmpsuhuhsat_128B">;
+def int_hexagon_V6_vmpahhsat_128B :
+Hexagon_v32i32_v32i32v32i32i64_Intrinsic<"HEXAGON_V6_vmpahhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlut4,VI_ftype_VIDI,2)
-// tag : V6_vlut4
-def int_hexagon_V6_vlut4 :
-Hexagon_V65_v512v512LLi_Intrinsic<"HEXAGON_V6_vlut4">;
+def int_hexagon_V6_vavguwrnd :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavguwrnd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vlut4_128B,VI_ftype_VIDI,2)
-// tag : V6_vlut4_128B
-def int_hexagon_V6_vlut4_128B :
-Hexagon_V65_v1024v1024LLi_Intrinsic<"HEXAGON_V6_vlut4_128B">;
+def int_hexagon_V6_vavguwrnd_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavguwrnd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuhe,VI_ftype_VISI,2)
-// tag : V6_vmpyuhe
-def int_hexagon_V6_vmpyuhe :
-Hexagon_V65_v512v512i_Intrinsic<"HEXAGON_V6_vmpyuhe">;
+def int_hexagon_V6_vnavgb :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vnavgb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuhe_128B,VI_ftype_VISI,2)
-// tag : V6_vmpyuhe_128B
-def int_hexagon_V6_vmpyuhe_128B :
-Hexagon_V65_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyuhe_128B">;
+def int_hexagon_V6_vnavgb_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vnavgb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuhe_acc,VI_ftype_VIVISI,3)
-// tag : V6_vmpyuhe_acc
-def int_hexagon_V6_vmpyuhe_acc :
-Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyuhe_acc">;
+def int_hexagon_V6_vasrh_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrh_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vmpyuhe_acc_128B,VI_ftype_VIVISI,3)
-// tag : V6_vmpyuhe_acc_128B
-def int_hexagon_V6_vmpyuhe_acc_128B :
-Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyuhe_acc_128B">;
+def int_hexagon_V6_vasrh_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrh_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vprefixqb,VI_ftype_QV,1)
-// tag : V6_vprefixqb
-def int_hexagon_V6_vprefixqb :
-Hexagon_V65_v512v64i_Intrinsic<"HEXAGON_V6_vprefixqb">;
+def int_hexagon_V6_vmpauhuhsat :
+Hexagon_v16i32_v16i32v16i32i64_Intrinsic<"HEXAGON_V6_vmpauhuhsat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vprefixqb_128B,VI_ftype_QV,1)
-// tag : V6_vprefixqb_128B
-def int_hexagon_V6_vprefixqb_128B :
-Hexagon_V65_v1024v128i_Intrinsic<"HEXAGON_V6_vprefixqb_128B">;
+def int_hexagon_V6_vmpauhuhsat_128B :
+Hexagon_v32i32_v32i32v32i32i64_Intrinsic<"HEXAGON_V6_vmpauhuhsat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vprefixqh,VI_ftype_QV,1)
-// tag : V6_vprefixqh
-def int_hexagon_V6_vprefixqh :
-Hexagon_V65_v512v64i_Intrinsic<"HEXAGON_V6_vprefixqh">;
+def int_hexagon_V6_vmpyh_acc :
+Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyh_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vprefixqh_128B,VI_ftype_QV,1)
-// tag : V6_vprefixqh_128B
-def int_hexagon_V6_vprefixqh_128B :
-Hexagon_V65_v1024v128i_Intrinsic<"HEXAGON_V6_vprefixqh_128B">;
+def int_hexagon_V6_vmpyh_acc_128B :
+Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyh_acc_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vprefixqw,VI_ftype_QV,1)
-// tag : V6_vprefixqw
-def int_hexagon_V6_vprefixqw :
-Hexagon_V65_v512v64i_Intrinsic<"HEXAGON_V6_vprefixqw">;
+def int_hexagon_V6_vrmpybub_rtt_acc :
+Hexagon_v32i32_v32i32v16i32i64_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_acc">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vprefixqw_128B,VI_ftype_QV,1)
-// tag : V6_vprefixqw_128B
-def int_hexagon_V6_vprefixqw_128B :
-Hexagon_V65_v1024v128i_Intrinsic<"HEXAGON_V6_vprefixqw_128B">;
+def int_hexagon_V6_vrmpybub_rtt_acc_128B :
+Hexagon_v64i32_v64i32v32i32i64_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_acc_128B">;
 
+def int_hexagon_V6_vavgb :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgb">;
 
-// The scatter/gather ones below will not be generated from iset.py. Make sure
-// you don't overwrite these.
-class Hexagon_V65_vvmemiiv512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_ptr_ty,llvm_i32_ty,llvm_i32_ty,
-                               llvm_v16i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_V6_vavgb_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgb_128B">;
 
-class Hexagon_V65_vvmemiiv1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_ptr_ty,llvm_i32_ty,llvm_i32_ty,
-                               llvm_v32i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_V6_vaslh_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vaslh_acc">;
 
-class Hexagon_V65_vvmemiiv2048_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_ptr_ty,llvm_i32_ty,llvm_i32_ty,
-                               llvm_v64i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_V6_vaslh_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vaslh_acc_128B">;
 
-class Hexagon_V65_vvmemv64iiiv512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_ptr_ty,llvm_v512i1_ty,llvm_i32_ty,
-                               llvm_i32_ty,llvm_v16i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_V6_vavguw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavguw">;
 
-class Hexagon_V65_vvmemv128iiiv1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_ptr_ty,llvm_v1024i1_ty,llvm_i32_ty,
-                               llvm_i32_ty,llvm_v32i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_V6_vavguw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavguw_128B">;
 
-class Hexagon_V65_vvmemv64iiiv1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_ptr_ty,llvm_v512i1_ty,llvm_i32_ty,
-                               llvm_i32_ty,llvm_v32i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_V6_vlut4 :
+Hexagon_v16i32_v16i32i64_Intrinsic<"HEXAGON_V6_vlut4">;
 
-class Hexagon_V65_vvmemv128iiiv2048_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_ptr_ty,llvm_v1024i1_ty,llvm_i32_ty,
-                               llvm_i32_ty,llvm_v64i32_ty],
-                          [IntrArgMemOnly]>;
+def int_hexagon_V6_vlut4_128B :
+Hexagon_v32i32_v32i32i64_Intrinsic<"HEXAGON_V6_vlut4_128B">;
 
-def int_hexagon_V6_vgathermw :
-Hexagon_V65_vvmemiiv512_Intrinsic<"HEXAGON_V6_vgathermw">;
+def int_hexagon_V6_vmpyuhe_acc :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyuhe_acc">;
 
-def int_hexagon_V6_vgathermw_128B :
-Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermw_128B">;
+def int_hexagon_V6_vmpyuhe_acc_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyuhe_acc_128B">;
 
-def int_hexagon_V6_vgathermh :
-Hexagon_V65_vvmemiiv512_Intrinsic<"HEXAGON_V6_vgathermh">;
+def int_hexagon_V6_vrmpyub_rtt :
+Hexagon_v32i32_v16i32i64_Intrinsic<"HEXAGON_V6_vrmpyub_rtt">;
 
-def int_hexagon_V6_vgathermh_128B :
-Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermh_128B">;
+def int_hexagon_V6_vrmpyub_rtt_128B :
+Hexagon_v64i32_v32i32i64_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_128B">;
 
-def int_hexagon_V6_vgathermhw :
-Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermhw">;
+def int_hexagon_V6_vmpsuhuhsat :
+Hexagon_v16i32_v16i32v16i32i64_Intrinsic<"HEXAGON_V6_vmpsuhuhsat">;
 
-def int_hexagon_V6_vgathermhw_128B :
-Hexagon_V65_vvmemiiv2048_Intrinsic<"HEXAGON_V6_vgathermhw_128B">;
+def int_hexagon_V6_vmpsuhuhsat_128B :
+Hexagon_v32i32_v32i32v32i32i64_Intrinsic<"HEXAGON_V6_vmpsuhuhsat_128B">;
 
-def int_hexagon_V6_vgathermwq :
-Hexagon_V65_vvmemv64iiiv512_Intrinsic<"HEXAGON_V6_vgathermwq">;
+def int_hexagon_V6_vasruhubsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasruhubsat">;
 
-def int_hexagon_V6_vgathermwq_128B :
-Hexagon_V65_vvmemv128iiiv1024_Intrinsic<"HEXAGON_V6_vgathermwq_128B">;
+def int_hexagon_V6_vasruhubsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasruhubsat_128B">;
 
-def int_hexagon_V6_vgathermhq :
-Hexagon_V65_vvmemv64iiiv512_Intrinsic<"HEXAGON_V6_vgathermhq">;
+def int_hexagon_V6_vmpyuhe :
+Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyuhe">;
 
-def int_hexagon_V6_vgathermhq_128B :
-Hexagon_V65_vvmemv128iiiv1024_Intrinsic<"HEXAGON_V6_vgathermhq_128B">;
+def int_hexagon_V6_vmpyuhe_128B :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyuhe_128B">;
 
-def int_hexagon_V6_vgathermhwq :
-Hexagon_V65_vvmemv64iiiv1024_Intrinsic<"HEXAGON_V6_vgathermhwq">;
+def int_hexagon_V6_vrmpyub_rtt_acc :
+Hexagon_v32i32_v32i32v16i32i64_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_acc">;
 
-def int_hexagon_V6_vgathermhwq_128B :
-Hexagon_V65_vvmemv128iiiv2048_Intrinsic<"HEXAGON_V6_vgathermhwq_128B">;
+def int_hexagon_V6_vrmpyub_rtt_acc_128B :
+Hexagon_v64i32_v64i32v32i32i64_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_acc_128B">;
 
-class Hexagon_V65_viiv512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_i32_ty,llvm_i32_ty,
-                                           llvm_v16i32_ty,llvm_v16i32_ty],
-                          [IntrWriteMem]>;
+def int_hexagon_V6_vasruwuhsat :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasruwuhsat">;
 
-class Hexagon_V65_viiv1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_i32_ty,llvm_i32_ty,
-                                           llvm_v32i32_ty,llvm_v32i32_ty],
-                          [IntrWriteMem]>;
+def int_hexagon_V6_vasruwuhsat_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasruwuhsat_128B">;
 
-class Hexagon_V65_vv64iiiv512v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_v512i1_ty,llvm_i32_ty,
-                                           llvm_i32_ty,llvm_v16i32_ty,
-                                           llvm_v16i32_ty],
-                          [IntrWriteMem]>;
+def int_hexagon_V6_vmpabuu_acc :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpabuu_acc">;
 
-class Hexagon_V65_vv128iiiv1024v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_v1024i1_ty,llvm_i32_ty,
-                                           llvm_i32_ty,llvm_v32i32_ty,
-                                           llvm_v32i32_ty],
-                          [IntrWriteMem]>;
+def int_hexagon_V6_vmpabuu_acc_128B :
+Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vmpabuu_acc_128B">;
 
-class Hexagon_V65_viiv1024v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_i32_ty,llvm_i32_ty,
-                                           llvm_v32i32_ty,llvm_v16i32_ty],
-                          [IntrWriteMem]>;
+def int_hexagon_V6_vprefixqw :
+Hexagon_v16i32_v512i1_Intrinsic<"HEXAGON_V6_vprefixqw">;
 
-class Hexagon_V65_viiv2048v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_i32_ty,llvm_i32_ty,
-                                           llvm_v64i32_ty,llvm_v32i32_ty],
-                          [IntrWriteMem]>;
+def int_hexagon_V6_vprefixqw_128B :
+Hexagon_v32i32_v1024i1_Intrinsic<"HEXAGON_V6_vprefixqw_128B">;
 
-class Hexagon_V65_vv64iiiv1024v512_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_v512i1_ty,llvm_i32_ty,
-                                           llvm_i32_ty,llvm_v32i32_ty,
-                                           llvm_v16i32_ty],
-                          [IntrWriteMem]>;
+def int_hexagon_V6_vprefixqh :
+Hexagon_v16i32_v512i1_Intrinsic<"HEXAGON_V6_vprefixqh">;
 
-class Hexagon_V65_vv128iiiv2048v1024_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [], [llvm_v1024i1_ty,llvm_i32_ty,
-                                           llvm_i32_ty,llvm_v64i32_ty,
-                                           llvm_v32i32_ty],
-                          [IntrWriteMem]>;
+def int_hexagon_V6_vprefixqh_128B :
+Hexagon_v32i32_v1024i1_Intrinsic<"HEXAGON_V6_vprefixqh_128B">;
 
-class Hexagon_V65_v2048_Intrinsic<string GCCIntSuffix>
- : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_v64i32_ty], [],
-                          [IntrNoMem]>;
+def int_hexagon_V6_vprefixqb :
+Hexagon_v16i32_v512i1_Intrinsic<"HEXAGON_V6_vprefixqb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermw,v_ftype_SISIVIVI,4)
-// tag : V6_vscattermw
-def int_hexagon_V6_vscattermw :
-Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermw">;
+def int_hexagon_V6_vprefixqb_128B :
+Hexagon_v32i32_v1024i1_Intrinsic<"HEXAGON_V6_vprefixqb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermw_128B,v_ftype_SISIVIVI,4)
-// tag : V6_vscattermw_128B
-def int_hexagon_V6_vscattermw_128B :
-Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermw_128B">;
+def int_hexagon_V6_vabsb :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsb">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermh,v_ftype_SISIVIVI,4)
-// tag : V6_vscattermh
-def int_hexagon_V6_vscattermh :
-Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermh">;
+def int_hexagon_V6_vabsb_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsb_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermh_128B,v_ftype_SISIVIVI,4)
-// tag : V6_vscattermh_128B
-def int_hexagon_V6_vscattermh_128B :
-Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermh_128B">;
+def int_hexagon_V6_vavgbrnd :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgbrnd">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermw_add,v_ftype_SISIVIVI,4)
-// tag : V6_vscattermw_add
-def int_hexagon_V6_vscattermw_add :
-Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermw_add">;
+def int_hexagon_V6_vavgbrnd_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgbrnd_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermw_add_128B,v_ftype_SISIVIVI,4)
-// tag : V6_vscattermw_add_128B
-def int_hexagon_V6_vscattermw_add_128B :
-Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermw_add_128B">;
+def int_hexagon_V6_vdd0 :
+Hexagon_v32i32__Intrinsic<"HEXAGON_V6_vdd0">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermh_add,v_ftype_SISIVIVI,4)
-// tag : V6_vscattermh_add
-def int_hexagon_V6_vscattermh_add :
-Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermh_add">;
+def int_hexagon_V6_vdd0_128B :
+Hexagon_v64i32__Intrinsic<"HEXAGON_V6_vdd0_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermh_add_128B,v_ftype_SISIVIVI,4)
-// tag : V6_vscattermh_add_128B
-def int_hexagon_V6_vscattermh_add_128B :
-Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermh_add_128B">;
+def int_hexagon_V6_vmpabuu :
+Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpabuu">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermwq,v_ftype_QVSISIVIVI,5)
-// tag : V6_vscattermwq
-def int_hexagon_V6_vscattermwq :
-Hexagon_V65_vv64iiiv512v512_Intrinsic<"HEXAGON_V6_vscattermwq">;
+def int_hexagon_V6_vmpabuu_128B :
+Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vmpabuu_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermwq_128B,v_ftype_QVSISIVIVI,5)
-// tag : V6_vscattermwq_128B
-def int_hexagon_V6_vscattermwq_128B :
-Hexagon_V65_vv128iiiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermwq_128B">;
+def int_hexagon_V6_vabsb_sat :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsb_sat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermhq,v_ftype_QVSISIVIVI,5)
-// tag : V6_vscattermhq
-def int_hexagon_V6_vscattermhq :
-Hexagon_V65_vv64iiiv512v512_Intrinsic<"HEXAGON_V6_vscattermhq">;
+def int_hexagon_V6_vabsb_sat_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsb_sat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermhq_128B,v_ftype_QVSISIVIVI,5)
-// tag : V6_vscattermhq_128B
-def int_hexagon_V6_vscattermhq_128B :
-Hexagon_V65_vv128iiiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermhq_128B">;
+// V66 HVX Instructions.
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermhw,v_ftype_SISIVDVI,4)
-// tag : V6_vscattermhw
-def int_hexagon_V6_vscattermhw :
-Hexagon_V65_viiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhw">;
+def int_hexagon_V6_vaddcarrysat :
+Hexagon_v16i32_v16i32v16i32v512i1_Intrinsic<"HEXAGON_V6_vaddcarrysat">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermhw_128B,v_ftype_SISIVDVI,4)
-// tag : V6_vscattermhw_128B
-def int_hexagon_V6_vscattermhw_128B :
-Hexagon_V65_viiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhw_128B">;
+def int_hexagon_V6_vaddcarrysat_128B :
+Hexagon_v32i32_v32i32v32i32v1024i1_Intrinsic<"HEXAGON_V6_vaddcarrysat_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermhwq,v_ftype_QVSISIVDVI,5)
-// tag : V6_vscattermhwq
-def int_hexagon_V6_vscattermhwq :
-Hexagon_V65_vv64iiiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhwq">;
+def int_hexagon_V6_vasr_into :
+Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vasr_into">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermhwq_128B,v_ftype_QVSISIVDVI,5)
-// tag : V6_vscattermhwq_128B
-def int_hexagon_V6_vscattermhwq_128B :
-Hexagon_V65_vv128iiiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhwq_128B">;
+def int_hexagon_V6_vasr_into_128B :
+Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vasr_into_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermhw_add,v_ftype_SISIVDVI,4)
-// tag : V6_vscattermhw_add
-def int_hexagon_V6_vscattermhw_add :
-Hexagon_V65_viiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhw_add">;
+def int_hexagon_V6_vsatdw :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsatdw">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vscattermhw_add_128B,v_ftype_SISIVDVI,4)
-// tag : V6_vscattermhw_add_128B
-def int_hexagon_V6_vscattermhw_add_128B :
-Hexagon_V65_viiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhw_add_128B">;
+def int_hexagon_V6_vsatdw_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsatdw_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdd0,VD_ftype_,0)
-// tag : V6_vdd0
-def int_hexagon_V6_vdd0 :
-Hexagon_v1024_Intrinsic<"HEXAGON_V6_vdd0">;
+def int_hexagon_V6_vrotr :
+Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrotr">;
+
+def int_hexagon_V6_vrotr_128B :
+Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrotr_128B">;
 
-//
-// BUILTIN_INFO(HEXAGON.V6_vdd0_128B,VD_ftype_,0)
-// tag : V6_vdd0_128B
-def int_hexagon_V6_vdd0_128B :
-Hexagon_V65_v2048_Intrinsic<"HEXAGON_V6_vdd0_128B">;
diff --git a/include/llvm/IR/IntrinsicsRISCV.td b/include/llvm/IR/IntrinsicsRISCV.td
index b656622b3632d0f509b977ab1576d5b70af9c8f9..0ac7348b56dbf87d1aecaa33510732b544afcea8 100644
--- a/include/llvm/IR/IntrinsicsRISCV.td
+++ b/include/llvm/IR/IntrinsicsRISCV.td
@@ -36,4 +36,9 @@ def int_riscv_masked_atomicrmw_min_i32  : MaskedAtomicRMW32WithSextIntrinsic;
 def int_riscv_masked_atomicrmw_umax_i32 : MaskedAtomicRMW32Intrinsic;
 def int_riscv_masked_atomicrmw_umin_i32 : MaskedAtomicRMW32Intrinsic;
 
+def int_riscv_masked_cmpxchg_i32
+    : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty, llvm_i32_ty, llvm_i32_ty,
+                                llvm_i32_ty, llvm_i32_ty],
+                [IntrArgMemOnly, NoCapture<0>]>;
+
 } // TargetPrefix = "riscv"
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 53a1e214ae9625bccfe684ff4a0bb6ea84d4c2fb..a59dbe7ea4a42150b375e7d6e88ee8538e4cfc13 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -2726,22 +2726,16 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 // ADX
 
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_addcarryx_u32:
+  def int_x86_addcarry_32:
         Intrinsic<[llvm_i8_ty, llvm_i32_ty],
                   [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_addcarryx_u64:
+  def int_x86_addcarry_64:
         Intrinsic<[llvm_i8_ty, llvm_i64_ty],
                   [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_addcarry_u32:
+  def int_x86_subborrow_32:
         Intrinsic<[llvm_i8_ty, llvm_i32_ty],
                   [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_addcarry_u64:
-        Intrinsic<[llvm_i8_ty, llvm_i64_ty],
-                  [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_subborrow_u32:
-        Intrinsic<[llvm_i8_ty, llvm_i32_ty],
-                  [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_subborrow_u64:
+  def int_x86_subborrow_64:
         Intrinsic<[llvm_i8_ty, llvm_i64_ty],
                   [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
 }
diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h
index 91e44addbd606b72e6325ec3ddd74b89ae5b99b7..ef4a4a9840b2f5799930edd61322829f430d5ca1 100644
--- a/include/llvm/IR/Module.h
+++ b/include/llvm/IR/Module.h
@@ -49,6 +49,7 @@ class MemoryBuffer;
 class RandomNumberGenerator;
 template <class PtrType> class SmallPtrSetImpl;
 class StructType;
+class VersionTuple;
 
 /// A Module instance is used to store all the information related to an
 /// LLVM module. Modules are the top level container of all other LLVM
@@ -868,6 +869,17 @@ public:
   /// Set that PLT should be avoid for RTLib calls.
   void setRtLibUseGOT();
 
+  /// @name Utility functions for querying and setting the build SDK version
+  /// @{
+
+  /// Attach a build SDK version metadata to this module.
+  void setSDKVersion(const VersionTuple &V);
+
+  /// Get the build SDK version metadata.
+  ///
+  /// An empty version is returned if no such metadata is attached.
+  VersionTuple getSDKVersion() const;
+  /// @}
 
   /// Take ownership of the given memory buffer.
   void setOwnedMemoryBuffer(std::unique_ptr<MemoryBuffer> MB);
diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index 9a456acf96652e8cdd2e429fe602d87a6b0299fc..6653795d5034b84377a328525cde071012d1b38c 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -163,13 +163,13 @@ using GlobalValueSummaryMapTy =
 /// Struct that holds a reference to a particular GUID in a global value
 /// summary.
 struct ValueInfo {
-  PointerIntPair<const GlobalValueSummaryMapTy::value_type *, 1, bool>
-      RefAndFlag;
+  PointerIntPair<const GlobalValueSummaryMapTy::value_type *, 2, int>
+      RefAndFlags;
 
   ValueInfo() = default;
   ValueInfo(bool HaveGVs, const GlobalValueSummaryMapTy::value_type *R) {
-    RefAndFlag.setPointer(R);
-    RefAndFlag.setInt(HaveGVs);
+    RefAndFlags.setPointer(R);
+    RefAndFlags.setInt(HaveGVs);
   }
 
   operator bool() const { return getRef(); }
@@ -189,10 +189,12 @@ struct ValueInfo {
                      : getRef()->second.U.Name;
   }
 
-  bool haveGVs() const { return RefAndFlag.getInt(); }
+  bool haveGVs() const { return RefAndFlags.getInt() & 0x1; }
+  bool isReadOnly() const { return RefAndFlags.getInt() & 0x2; }
+  void setReadOnly() { RefAndFlags.setInt(RefAndFlags.getInt() | 0x2); }
 
   const GlobalValueSummaryMapTy::value_type *getRef() const {
-    return RefAndFlag.getPointer();
+    return RefAndFlags.getPointer();
   }
 
   bool isDSOLocal() const;
@@ -499,8 +501,9 @@ public:
         FunctionSummary::GVFlags(
             GlobalValue::LinkageTypes::AvailableExternallyLinkage,
             /*NotEligibleToImport=*/true, /*Live=*/true, /*IsLocal=*/false),
-        0, FunctionSummary::FFlags{}, std::vector<ValueInfo>(),
-        std::move(Edges), std::vector<GlobalValue::GUID>(),
+        /*InsCount=*/0, FunctionSummary::FFlags{}, /*EntryCount=*/0,
+        std::vector<ValueInfo>(), std::move(Edges),
+        std::vector<GlobalValue::GUID>(),
         std::vector<FunctionSummary::VFuncId>(),
         std::vector<FunctionSummary::VFuncId>(),
         std::vector<FunctionSummary::ConstVCall>(),
@@ -518,6 +521,11 @@ private:
   /// Function summary specific flags.
   FFlags FunFlags;
 
+  /// The synthesized entry count of the function.
+  /// This is only populated during ThinLink phase and remains unused while
+  /// generating per-module summaries.
+  uint64_t EntryCount = 0;
+
   /// List of <CalleeValueInfo, CalleeInfo> call edge pairs from this function.
   std::vector<EdgeTy> CallGraphEdgeList;
 
@@ -525,14 +533,15 @@ private:
 
 public:
   FunctionSummary(GVFlags Flags, unsigned NumInsts, FFlags FunFlags,
-                  std::vector<ValueInfo> Refs, std::vector<EdgeTy> CGEdges,
+                  uint64_t EntryCount, std::vector<ValueInfo> Refs,
+                  std::vector<EdgeTy> CGEdges,
                   std::vector<GlobalValue::GUID> TypeTests,
                   std::vector<VFuncId> TypeTestAssumeVCalls,
                   std::vector<VFuncId> TypeCheckedLoadVCalls,
                   std::vector<ConstVCall> TypeTestAssumeConstVCalls,
                   std::vector<ConstVCall> TypeCheckedLoadConstVCalls)
       : GlobalValueSummary(FunctionKind, Flags, std::move(Refs)),
-        InstCount(NumInsts), FunFlags(FunFlags),
+        InstCount(NumInsts), FunFlags(FunFlags), EntryCount(EntryCount),
         CallGraphEdgeList(std::move(CGEdges)) {
     if (!TypeTests.empty() || !TypeTestAssumeVCalls.empty() ||
         !TypeCheckedLoadVCalls.empty() || !TypeTestAssumeConstVCalls.empty() ||
@@ -543,6 +552,8 @@ public:
           std::move(TypeTestAssumeConstVCalls),
           std::move(TypeCheckedLoadConstVCalls)});
   }
+  // Gets the number of immutable refs in RefEdgeList
+  unsigned immutableRefCount() const;
 
   /// Check if this is a function summary.
   static bool classof(const GlobalValueSummary *GVS) {
@@ -555,6 +566,12 @@ public:
   /// Get the instruction count recorded for this function.
   unsigned instCount() const { return InstCount; }
 
+  /// Get the synthetic entry count for this function.
+  uint64_t entryCount() const { return EntryCount; }
+
+  /// Set the synthetic entry count for this function.
+  void setEntryCount(uint64_t EC) { EntryCount = EC; }
+
   /// Return the list of <CalleeValueInfo, CalleeInfo> pairs.
   ArrayRef<EdgeTy> calls() const { return CallGraphEdgeList; }
 
@@ -652,19 +669,30 @@ template <> struct DenseMapInfo<FunctionSummary::ConstVCall> {
 /// Global variable summary information to aid decisions and
 /// implementation of importing.
 ///
-/// Currently this doesn't add anything to the base \p GlobalValueSummary,
-/// but is a placeholder as additional info may be added to the summary
-/// for variables.
+/// Global variable summary has extra flag, telling if it is
+/// modified during the program run or not. This affects ThinLTO
+/// internalization
 class GlobalVarSummary : public GlobalValueSummary {
-
 public:
-  GlobalVarSummary(GVFlags Flags, std::vector<ValueInfo> Refs)
-      : GlobalValueSummary(GlobalVarKind, Flags, std::move(Refs)) {}
+  struct GVarFlags {
+    GVarFlags(bool ReadOnly = false) : ReadOnly(ReadOnly) {}
+
+    unsigned ReadOnly : 1;
+  } VarFlags;
+
+  GlobalVarSummary(GVFlags Flags, GVarFlags VarFlags,
+                   std::vector<ValueInfo> Refs)
+      : GlobalValueSummary(GlobalVarKind, Flags, std::move(Refs)),
+        VarFlags(VarFlags) {}
 
   /// Check if this is a global variable summary.
   static bool classof(const GlobalValueSummary *GVS) {
     return GVS->getSummaryKind() == GlobalVarKind;
   }
+
+  GVarFlags varflags() const { return VarFlags; }
+  void setReadOnly(bool RO) { VarFlags.ReadOnly = RO; }
+  bool isReadOnly() const { return VarFlags.ReadOnly; }
 };
 
 struct TypeTestResolution {
@@ -787,6 +815,9 @@ private:
   /// considered live.
   bool WithGlobalValueDeadStripping = false;
 
+  /// Indicates that summary-based synthetic entry count propagation has run
+  bool HasSyntheticEntryCounts = false;
+
   /// Indicates that distributed backend should skip compilation of the
   /// module. Flag is suppose to be set by distributed ThinLTO indexing
   /// when it detected that the module is not needed during the final
@@ -899,6 +930,9 @@ public:
     WithGlobalValueDeadStripping = true;
   }
 
+  bool hasSyntheticEntryCounts() const { return HasSyntheticEntryCounts; }
+  void setHasSyntheticEntryCounts() { HasSyntheticEntryCounts = true; }
+
   bool skipModuleByDistributedBackend() const {
     return SkipModuleByDistributedBackend;
   }
@@ -931,7 +965,7 @@ public:
   // Save a string in the Index. Use before passing Name to
   // getOrInsertValueInfo when the string isn't owned elsewhere (e.g. on the
   // module's Strtab).
-  StringRef saveString(std::string String) { return Saver.save(String); }
+  StringRef saveString(StringRef String) { return Saver.save(String); }
 
   /// Return a ValueInfo for \p GUID setting value \p Name.
   ValueInfo getOrInsertValueInfo(GlobalValue::GUID GUID, StringRef Name) {
@@ -1135,11 +1169,15 @@ public:
 
   /// Print out strongly connected components for debugging.
   void dumpSCCs(raw_ostream &OS);
+
+  /// Analyze index and detect unmodified globals
+  void propagateConstants(const DenseSet<GlobalValue::GUID> &PreservedSymbols);
 };
 
 /// GraphTraits definition to build SCC for the index
 template <> struct GraphTraits<ValueInfo> {
   typedef ValueInfo NodeRef;
+  using EdgeRef = FunctionSummary::EdgeTy &;
 
   static NodeRef valueInfoFromEdge(FunctionSummary::EdgeTy &P) {
     return P.first;
@@ -1148,6 +1186,8 @@ template <> struct GraphTraits<ValueInfo> {
       mapped_iterator<std::vector<FunctionSummary::EdgeTy>::iterator,
                       decltype(&valueInfoFromEdge)>;
 
+  using ChildEdgeIteratorType = std::vector<FunctionSummary::EdgeTy>::iterator;
+
   static NodeRef getEntryNode(ValueInfo V) { return V; }
 
   static ChildIteratorType child_begin(NodeRef N) {
@@ -1169,6 +1209,26 @@ template <> struct GraphTraits<ValueInfo> {
         cast<FunctionSummary>(N.getSummaryList().front()->getBaseObject());
     return ChildIteratorType(F->CallGraphEdgeList.end(), &valueInfoFromEdge);
   }
+
+  static ChildEdgeIteratorType child_edge_begin(NodeRef N) {
+    if (!N.getSummaryList().size()) // handle external function
+      return FunctionSummary::ExternalNode.CallGraphEdgeList.begin();
+
+    FunctionSummary *F =
+        cast<FunctionSummary>(N.getSummaryList().front()->getBaseObject());
+    return F->CallGraphEdgeList.begin();
+  }
+
+  static ChildEdgeIteratorType child_edge_end(NodeRef N) {
+    if (!N.getSummaryList().size()) // handle external function
+      return FunctionSummary::ExternalNode.CallGraphEdgeList.end();
+
+    FunctionSummary *F =
+        cast<FunctionSummary>(N.getSummaryList().front()->getBaseObject());
+    return F->CallGraphEdgeList.end();
+  }
+
+  static NodeRef edge_dest(EdgeRef E) { return E.first; }
 };
 
 template <>
@@ -1184,6 +1244,14 @@ struct GraphTraits<ModuleSummaryIndex *> : public GraphTraits<ValueInfo> {
   }
 };
 
+static inline bool canImportGlobalVar(GlobalValueSummary *S) {
+  assert(isa<GlobalVarSummary>(S->getBaseObject()));
+
+  // We don't import GV with references, because it can result
+  // in promotion of local variables in the source module.
+  return !GlobalValue::isInterposableLinkage(S->linkage()) &&
+         !S->notEligibleToImport() && S->refs().empty();
+}
 } // end namespace llvm
 
 #endif // LLVM_IR_MODULESUMMARYINDEX_H
diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h
index 56f56b4b8c2e613ccd9365adf280af7d8be12ba3..a88ee26b51c3534abc1b2b280cd9fcd89a911c09 100644
--- a/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -224,7 +224,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
           GlobalValueSummary::GVFlags(
               static_cast<GlobalValue::LinkageTypes>(FSum.Linkage),
               FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal),
-          0, FunctionSummary::FFlags{}, Refs,
+          /*NumInsts=*/0, FunctionSummary::FFlags{}, /*EntryCount=*/0, Refs,
           ArrayRef<FunctionSummary::EdgeTy>{}, std::move(FSum.TypeTests),
           std::move(FSum.TypeTestAssumeVCalls),
           std::move(FSum.TypeCheckedLoadVCalls),
diff --git a/include/llvm/IR/PassInstrumentation.h b/include/llvm/IR/PassInstrumentation.h
index 3718b53c45348f3fbcb00ab23e22ca8f5b384c1a..08dac1c4a2746599f86f46368e5eaa08585aa87f 100644
--- a/include/llvm/IR/PassInstrumentation.h
+++ b/include/llvm/IR/PassInstrumentation.h
@@ -68,10 +68,17 @@ class PreservedAnalyses;
 /// PassInstrumentation to pass control to the registered callbacks.
 class PassInstrumentationCallbacks {
 public:
-  // Before/After callbacks accept IRUnits, so they need to take them
-  // as pointers, wrapped with llvm::Any
+  // Before/After callbacks accept IRUnits whenever appropriate, so they need
+  // to take them as constant pointers, wrapped with llvm::Any.
+  // For the case when IRUnit has been invalidated there is a different
+  // callback to use - AfterPassInvalidated.
+  // TODO: currently AfterPassInvalidated does not accept IRUnit, since passing
+  // already invalidated IRUnit is unsafe. There are ways to handle invalidated IRUnits
+  // in a safe way, and we might pursue that as soon as there is a useful instrumentation
+  // that needs it.
   using BeforePassFunc = bool(StringRef, Any);
   using AfterPassFunc = void(StringRef, Any);
+  using AfterPassInvalidatedFunc = void(StringRef);
   using BeforeAnalysisFunc = void(StringRef, Any);
   using AfterAnalysisFunc = void(StringRef, Any);
 
@@ -90,6 +97,11 @@ public:
     AfterPassCallbacks.emplace_back(std::move(C));
   }
 
+  template <typename CallableT>
+  void registerAfterPassInvalidatedCallback(CallableT C) {
+    AfterPassInvalidatedCallbacks.emplace_back(std::move(C));
+  }
+
   template <typename CallableT>
   void registerBeforeAnalysisCallback(CallableT C) {
     BeforeAnalysisCallbacks.emplace_back(std::move(C));
@@ -105,6 +117,8 @@ private:
 
   SmallVector<llvm::unique_function<BeforePassFunc>, 4> BeforePassCallbacks;
   SmallVector<llvm::unique_function<AfterPassFunc>, 4> AfterPassCallbacks;
+  SmallVector<llvm::unique_function<AfterPassInvalidatedFunc>, 4>
+      AfterPassInvalidatedCallbacks;
   SmallVector<llvm::unique_function<BeforeAnalysisFunc>, 4>
       BeforeAnalysisCallbacks;
   SmallVector<llvm::unique_function<AfterAnalysisFunc>, 4>
@@ -139,7 +153,8 @@ public:
   }
 
   /// AfterPass instrumentation point - takes \p Pass instance that has
-  /// just been executed and constant reference to IR it operates on.
+  /// just been executed and constant reference to \p IR it operates on.
+  /// \p IR is guaranteed to be valid at this point.
   template <typename IRUnitT, typename PassT>
   void runAfterPass(const PassT &Pass, const IRUnitT &IR) const {
     if (Callbacks)
@@ -147,6 +162,16 @@ public:
         C(Pass.name(), llvm::Any(&IR));
   }
 
+  /// AfterPassInvalidated instrumentation point - takes \p Pass instance
+  /// that has just been executed. For use when IR has been invalidated
+  /// by \p Pass execution.
+  template <typename IRUnitT, typename PassT>
+  void runAfterPassInvalidated(const PassT &Pass) const {
+    if (Callbacks)
+      for (auto &C : Callbacks->AfterPassInvalidatedCallbacks)
+        C(Pass.name());
+  }
+
   /// BeforeAnalysis instrumentation point - takes \p Analysis instance
   /// to be executed and constant reference to IR it operates on.
   template <typename IRUnitT, typename PassT>
diff --git a/include/llvm/IR/PassTimingInfo.h b/include/llvm/IR/PassTimingInfo.h
index 2c6a0e08a368f2405dcbfd81e97b80d00300d75b..e9945f997f4395b65b4a82b9935e8432b8144b0b 100644
--- a/include/llvm/IR/PassTimingInfo.h
+++ b/include/llvm/IR/PassTimingInfo.h
@@ -99,8 +99,8 @@ private:
   void stopTimer(StringRef PassID);
 
   // Implementation of pass instrumentation callbacks.
-  bool runBeforePass(StringRef PassID, Any IR);
-  void runAfterPass(StringRef PassID, Any IR);
+  bool runBeforePass(StringRef PassID);
+  void runAfterPass(StringRef PassID);
 };
 
 } // namespace llvm
diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h
index dd30072ce57110a849e10dc7149cde2c7daa423f..e1e7c7268b5d690a7e08b77caf5b97310a15e21b 100644
--- a/include/llvm/IR/PatternMatch.h
+++ b/include/llvm/IR/PatternMatch.h
@@ -215,6 +215,7 @@ template <typename Predicate> struct cst_pred_ty : public Predicate {
         // Non-splat vector constant: check each element for a match.
         unsigned NumElts = V->getType()->getVectorNumElements();
         assert(NumElts != 0 && "Constant vector with no elements?");
+        bool HasNonUndefElements = false;
         for (unsigned i = 0; i != NumElts; ++i) {
           Constant *Elt = C->getAggregateElement(i);
           if (!Elt)
@@ -224,8 +225,9 @@ template <typename Predicate> struct cst_pred_ty : public Predicate {
           auto *CI = dyn_cast<ConstantInt>(Elt);
           if (!CI || !this->isValue(CI->getValue()))
             return false;
+          HasNonUndefElements = true;
         }
-        return true;
+        return HasNonUndefElements;
       }
     }
     return false;
@@ -272,6 +274,7 @@ template <typename Predicate> struct cstfp_pred_ty : public Predicate {
         // Non-splat vector constant: check each element for a match.
         unsigned NumElts = V->getType()->getVectorNumElements();
         assert(NumElts != 0 && "Constant vector with no elements?");
+        bool HasNonUndefElements = false;
         for (unsigned i = 0; i != NumElts; ++i) {
           Constant *Elt = C->getAggregateElement(i);
           if (!Elt)
@@ -281,8 +284,9 @@ template <typename Predicate> struct cstfp_pred_ty : public Predicate {
           auto *CF = dyn_cast<ConstantFP>(Elt);
           if (!CF || !this->isValue(CF->getValueAPF()))
             return false;
+          HasNonUndefElements = true;
         }
-        return true;
+        return HasNonUndefElements;
       }
     }
     return false;
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 1a9c6f82bfdfdecaff405f4003a40f345fb83a5b..206089cf3a8f4a95896573f8c4b51dcec9d2c23b 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -140,6 +140,7 @@ void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeExpandMemCmpPassPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExpandReductionsPass(PassRegistry&);
+void initializeMakeGuardsExplicitLegacyPassPass(PassRegistry&);
 void initializeExternalAAWrapperPassPass(PassRegistry&);
 void initializeFEntryInserterPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
@@ -182,6 +183,7 @@ void initializeInstrProfilingLegacyPassPass(PassRegistry&);
 void initializeInstructionCombiningPassPass(PassRegistry&);
 void initializeInstructionSelectPass(PassRegistry&);
 void initializeInterleavedAccessPass(PassRegistry&);
+void initializeInterleavedLoadCombinePass(PassRegistry &);
 void initializeInternalizeLegacyPassPass(PassRegistry&);
 void initializeIntervalPartitionPass(PassRegistry&);
 void initializeJumpThreadingPass(PassRegistry&);
@@ -205,7 +207,7 @@ void initializeLiveRangeShrinkPass(PassRegistry&);
 void initializeLiveRegMatrixPass(PassRegistry&);
 void initializeLiveStacksPass(PassRegistry&);
 void initializeLiveVariablesPass(PassRegistry&);
-void initializeLoadStoreVectorizerPass(PassRegistry&);
+void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry&);
 void initializeLoaderPassPass(PassRegistry&);
 void initializeLocalStackSlotPassPass(PassRegistry&);
 void initializeLocalizerPass(PassRegistry&);
@@ -354,7 +356,7 @@ void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&);
 void initializeSanitizerCoverageModulePass(PassRegistry&);
 void initializeScalarEvolutionWrapperPassPass(PassRegistry&);
 void initializeScalarizeMaskedMemIntrinPass(PassRegistry&);
-void initializeScalarizerPass(PassRegistry&);
+void initializeScalarizerLegacyPassPass(PassRegistry&);
 void initializeScavengerTestPass(PassRegistry&);
 void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&);
 void initializeSeparateConstOffsetFromGEPPass(PassRegistry&);
@@ -371,6 +373,8 @@ void initializeSpillPlacementPass(PassRegistry&);
 void initializeStackColoringPass(PassRegistry&);
 void initializeStackMapLivenessPass(PassRegistry&);
 void initializeStackProtectorPass(PassRegistry&);
+void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &);
+void initializeStackSafetyInfoWrapperPassPass(PassRegistry &);
 void initializeStackSlotColoringPass(PassRegistry&);
 void initializeStraightLineStrengthReducePass(PassRegistry&);
 void initializeStripDeadDebugInfoPass(PassRegistry&);
@@ -396,6 +400,7 @@ void initializeUnreachableMachineBlockElimPass(PassRegistry&);
 void initializeVerifierLegacyPassPass(PassRegistry&);
 void initializeVirtRegMapPass(PassRegistry&);
 void initializeVirtRegRewriterPass(PassRegistry&);
+void initializeWarnMissedTransformationsLegacyPass(PassRegistry &);
 void initializeWasmEHPreparePass(PassRegistry&);
 void initializeWholeProgramDevirtPass(PassRegistry&);
 void initializeWinEHPreparePass(PassRegistry&);
diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h
index c0ad32f485c3f0a126ec10204bfc1ebbb02225d5..7058602c3ee2ef01f1456c61ee7d410583c17bb7 100644
--- a/include/llvm/LTO/Config.h
+++ b/include/llvm/LTO/Config.h
@@ -49,6 +49,10 @@ struct Config {
   /// Use the new pass manager
   bool UseNewPM = false;
 
+  /// Flag to indicate that the optimizer should not assume builtins are present
+  /// on the target.
+  bool Freestanding = false;
+
   /// Disable entirely the optimizer, including importing for ThinLTO
   bool CodeGenOnly = false;
 
diff --git a/include/llvm/LTO/LTO.h b/include/llvm/LTO/LTO.h
index 7d6beab6b441a0dfe4aeeb3ec1546cab99162ee6..15390873056af5b56f921a3acff143ce7db6c886 100644
--- a/include/llvm/LTO/LTO.h
+++ b/include/llvm/LTO/LTO.h
@@ -40,13 +40,13 @@ class Module;
 class Target;
 class raw_pwrite_stream;
 
-/// Resolve Weak and LinkOnce values in the \p Index. Linkage changes recorded
-/// in the index and the ThinLTO backends must apply the changes to the Module
-/// via thinLTOResolveWeakForLinkerModule.
+/// Resolve linkage for prevailing symbols in the \p Index. Linkage changes
+/// recorded in the index and the ThinLTO backends must apply the changes to
+/// the module via thinLTOResolvePrevailingInModule.
 ///
 /// This is done for correctness (if value exported, ensure we always
 /// emit a copy), and compile-time optimization (allow drop of duplicates).
-void thinLTOResolveWeakForLinkerInIndex(
+void thinLTOResolvePrevailingInIndex(
     ModuleSummaryIndex &Index,
     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
         isPrevailing,
@@ -60,6 +60,19 @@ void thinLTOInternalizeAndPromoteInIndex(
     ModuleSummaryIndex &Index,
     function_ref<bool(StringRef, GlobalValue::GUID)> isExported);
 
+/// Computes a unique hash for the Module considering the current list of
+/// export/import and other global analysis results.
+/// The hash is produced in \p Key.
+void computeLTOCacheKey(
+    SmallString<40> &Key, const lto::Config &Conf,
+    const ModuleSummaryIndex &Index, StringRef ModuleID,
+    const FunctionImporter::ImportMapTy &ImportList,
+    const FunctionImporter::ExportSetTy &ExportList,
+    const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+    const GVSummaryMapTy &DefinedGlobals,
+    const std::set<GlobalValue::GUID> &CfiFunctionDefs = {},
+    const std::set<GlobalValue::GUID> &CfiFunctionDecls = {});
+
 namespace lto {
 
 /// Given the original \p Path to an output file, replace any path
diff --git a/include/llvm/LTO/SummaryBasedOptimizations.h b/include/llvm/LTO/SummaryBasedOptimizations.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad3a8e7dc77b4a1b003a6feeb82fbad6afeae932
--- /dev/null
+++ b/include/llvm/LTO/SummaryBasedOptimizations.h
@@ -0,0 +1,17 @@
+//=- llvm/LTO/SummaryBasedOptimizations.h -Link time optimizations-*- C++ -*-=//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LTO_SUMMARYBASEDOPTIMIZATIONS_H
+#define LLVM_LTO_SUMMARYBASEDOPTIMIZATIONS_H
+namespace llvm {
+class ModuleSummaryIndex;
+void computeSyntheticCounts(ModuleSummaryIndex &Index);
+
+} // namespace llvm
+#endif
diff --git a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
index 2ff317a322737119789eb9ca223dc9fa8d0222a4..d4c69a1ce260b7337d1ab49dad9bacfa864db806 100644
--- a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
+++ b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
@@ -273,8 +273,8 @@ public:
   /**
    * Compute and emit the imported files for module at \p ModulePath.
    */
-  static void emitImports(StringRef ModulePath, StringRef OutputName,
-                          ModuleSummaryIndex &Index);
+  void emitImports(Module &Module, StringRef OutputName,
+                   ModuleSummaryIndex &Index);
 
   /**
    * Perform cross-module importing for the module identified by
@@ -285,8 +285,8 @@ public:
   /**
    * Compute the list of summaries needed for importing into module.
    */
-  static void gatherImportedSummariesForModule(
-      StringRef ModulePath, ModuleSummaryIndex &Index,
+  void gatherImportedSummariesForModule(
+      Module &Module, ModuleSummaryIndex &Index,
       std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex);
 
   /**
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index e2378cea90ce187fc00c4bc44d8e1ac68eda3305..a31caeea2eedcf6dbbffb87521fe38ccf1dae0d6 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -50,6 +50,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
 #include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
@@ -219,6 +220,7 @@ namespace {
       (void) llvm::createFloat2IntPass();
       (void) llvm::createEliminateAvailableExternallyPass();
       (void) llvm::createScalarizeMaskedMemIntrinPass();
+      (void) llvm::createWarnMissedTransformationsPass();
 
       (void)new llvm::IntervalPartition();
       (void)new llvm::ScalarEvolutionWrapperPass();
diff --git a/include/llvm/MC/MCAsmMacro.h b/include/llvm/MC/MCAsmMacro.h
index 09b32c7ea3337f98673f35687a7494138a8cc620..135fa4f2e33d64a800b95ecf0d018eceaaa67f21 100644
--- a/include/llvm/MC/MCAsmMacro.h
+++ b/include/llvm/MC/MCAsmMacro.h
@@ -52,7 +52,7 @@ public:
     Pipe, PipePipe, Caret,
     Amp, AmpAmp, Exclaim, ExclaimEqual, Percent, Hash,
     Less, LessEqual, LessLess, LessGreater,
-    Greater, GreaterEqual, GreaterGreater, At,
+    Greater, GreaterEqual, GreaterGreater, At, MinusGreater,
 
     // MIPS unary expression operators such as %neg.
     PercentCall16, PercentCall_Hi, PercentCall_Lo, PercentDtprel_Hi,
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index 0f9499d705e4666a33b650be21c932e5a2e71fb7..986c6e17548fc3501c329a314c66d5a86975fb5c 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/VersionTuple.h"
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -94,6 +95,8 @@ public:
     unsigned Major;
     unsigned Minor;
     unsigned Update;
+    /// An optional version of the SDK that was used to build the source.
+    VersionTuple SDKVersion;
   };
 
 private:
@@ -255,20 +258,24 @@ public:
   /// MachO deployment target version information.
   const VersionInfoType &getVersionInfo() const { return VersionInfo; }
   void setVersionMin(MCVersionMinType Type, unsigned Major, unsigned Minor,
-                     unsigned Update) {
+                     unsigned Update,
+                     VersionTuple SDKVersion = VersionTuple()) {
     VersionInfo.EmitBuildVersion = false;
     VersionInfo.TypeOrPlatform.Type = Type;
     VersionInfo.Major = Major;
     VersionInfo.Minor = Minor;
     VersionInfo.Update = Update;
+    VersionInfo.SDKVersion = SDKVersion;
   }
   void setBuildVersion(MachO::PlatformType Platform, unsigned Major,
-                       unsigned Minor, unsigned Update) {
+                       unsigned Minor, unsigned Update,
+                       VersionTuple SDKVersion = VersionTuple()) {
     VersionInfo.EmitBuildVersion = true;
     VersionInfo.TypeOrPlatform.Platform = Platform;
     VersionInfo.Major = Major;
     VersionInfo.Minor = Minor;
     VersionInfo.Update = Update;
+    VersionInfo.SDKVersion = SDKVersion;
   }
 
   /// Reuse an assembler instance
diff --git a/include/llvm/MC/MCCodeView.h b/include/llvm/MC/MCCodeView.h
index 2678cf4388f8d3b56c89320398cd1306e713de7a..cef03a409f9506c903be3739ae138511a9c06228 100644
--- a/include/llvm/MC/MCCodeView.h
+++ b/include/llvm/MC/MCCodeView.h
@@ -194,7 +194,7 @@ public:
   void encodeInlineLineTable(MCAsmLayout &Layout,
                              MCCVInlineLineTableFragment &F);
 
-  void
+  MCFragment *
   emitDefRange(MCObjectStreamer &OS,
                ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
                StringRef FixedSizePortion);
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index 2bfaf19cf2c64ace28f67de449673d51b2ea05ff..aeaa4e55879b08983bef054277ac874379657fa0 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -430,6 +430,7 @@ public:
     OpUndefined,
     OpRegister,
     OpWindowSave,
+    OpNegateRAState,
     OpGnuArgsSize
   };
 
@@ -509,6 +510,11 @@ public:
     return MCCFIInstruction(OpWindowSave, L, 0, 0, "");
   }
 
+  /// .cfi_negate_ra_state AArch64 negate RA state.
+  static MCCFIInstruction createNegateRAState(MCSymbol *L) {
+    return MCCFIInstruction(OpNegateRAState, L, 0, 0, "");
+  }
+
   /// .cfi_restore says that the rule for Register is now the same as it
   /// was at the beginning of the function, after all initial instructions added
   /// by .cfi_startproc were executed.
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index 23dde10e785bb69e9a96ecd1f3726a558140b6f2..8cb6b86fd672eedd9d8e57a117fb6fb8146f80bc 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -288,6 +288,7 @@ public:
     VK_WebAssembly_FUNCTION, // Function table index, rather than virtual addr
     VK_WebAssembly_GLOBAL,   // Global object index
     VK_WebAssembly_TYPEINDEX,// Type table index
+    VK_WebAssembly_EVENT,    // Event index
 
     VK_AMDGPU_GOTPCREL32_LO, // symbol@gotpcrel32@lo
     VK_AMDGPU_GOTPCREL32_HI, // symbol@gotpcrel32@hi
diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
index 67bb11a703872731d4eeb9693bb321ab0f9558ed..d501b686bb2e79d83fc397db4531d46b43850483 100644
--- a/include/llvm/MC/MCInst.h
+++ b/include/llvm/MC/MCInst.h
@@ -208,6 +208,8 @@ public:
   /// string.
   void dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer = nullptr,
                    StringRef Separator = " ") const;
+  void dump_pretty(raw_ostream &OS, StringRef Name,
+                   StringRef Separator = " ") const;
 };
 
 inline raw_ostream& operator<<(raw_ostream &OS, const MCOperand &MO) {
diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h
index ba97aeb837bedefd7f6bb6d58237a875e95c975b..61e7d09afbcbc4f6507c89c5b936612396be049c 100644
--- a/include/llvm/MC/MCInstrDesc.h
+++ b/include/llvm/MC/MCInstrDesc.h
@@ -151,7 +151,8 @@ enum Flag {
   InsertSubreg,
   Convergent,
   Add,
-  Trap
+  Trap,
+  VariadicOpsAreDefs,
 };
 }
 
@@ -383,6 +384,11 @@ public:
   /// additional values.
   bool isConvergent() const { return Flags & (1ULL << MCID::Convergent); }
 
+  /// Return true if variadic operands of this instruction are definitions.
+  bool variadicOpsAreDefs() const {
+    return Flags & (1ULL << MCID::VariadicOpsAreDefs);
+  }
+
   //===--------------------------------------------------------------------===//
   // Side Effect Analysis
   //===--------------------------------------------------------------------===//
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 729aa23ef333a3117602f8f72d7de99d1066bbd6..f8142ccd8ac5b68ba95fc92731e4c87a2e0ede6c 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/VersionTuple.h"
 
 namespace llvm {
 class MCContext;
@@ -241,6 +242,9 @@ public:
   MCSection *getCompactUnwindSection() const { return CompactUnwindSection; }
   MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; }
   MCSection *getDwarfInfoSection() const { return DwarfInfoSection; }
+  MCSection *getDwarfInfoSection(uint64_t Hash) const {
+    return getDwarfComdatSection(".debug_info", Hash);
+  }
   MCSection *getDwarfLineSection() const { return DwarfLineSection; }
   MCSection *getDwarfLineStrSection() const { return DwarfLineStrSection; }
   MCSection *getDwarfFrameSection() const { return DwarfFrameSection; }
@@ -277,7 +281,9 @@ public:
     return DwarfAccelTypesSection;
   }
   MCSection *getDwarfInfoDWOSection() const { return DwarfInfoDWOSection; }
-  MCSection *getDwarfTypesSection(uint64_t Hash) const;
+  MCSection *getDwarfTypesSection(uint64_t Hash) const {
+    return getDwarfComdatSection(".debug_types", Hash);
+  }
   MCSection *getDwarfTypesDWOSection() const { return DwarfTypesDWOSection; }
   MCSection *getDwarfAbbrevDWOSection() const { return DwarfAbbrevDWOSection; }
   MCSection *getDwarfStrDWOSection() const { return DwarfStrDWOSection; }
@@ -385,14 +391,22 @@ private:
   bool PositionIndependent;
   MCContext *Ctx;
   Triple TT;
+  VersionTuple SDKVersion;
 
   void initMachOMCObjectFileInfo(const Triple &T);
   void initELFMCObjectFileInfo(const Triple &T, bool Large);
   void initCOFFMCObjectFileInfo(const Triple &T);
   void initWasmMCObjectFileInfo(const Triple &T);
+  MCSection *getDwarfComdatSection(const char *Name, uint64_t Hash) const;
 
 public:
   const Triple &getTargetTriple() const { return TT; }
+
+  void setSDKVersion(const VersionTuple &TheSDKVersion) {
+    SDKVersion = TheSDKVersion;
+  }
+
+  const VersionTuple &getSDKVersion() const { return SDKVersion; }
 };
 
 } // end namespace llvm
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index c9e577b7e295e4a06d77d716708230c157254399..892909656c150a070023cf22e0552ccec02c670f 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -39,12 +39,21 @@ class MCObjectStreamer : public MCStreamer {
   bool EmitEHFrame;
   bool EmitDebugFrame;
   SmallVector<MCSymbol *, 2> PendingLabels;
+  struct PendingMCFixup {
+    const MCSymbol *Sym;
+    MCFixup Fixup;
+    MCDataFragment *DF;
+    PendingMCFixup(const MCSymbol *McSym, MCDataFragment *F, MCFixup McFixup)
+        : Sym(McSym), Fixup(McFixup), DF(F) {}
+  };
+  SmallVector<PendingMCFixup, 2> PendingFixups;
 
   virtual void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo&) = 0;
   void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
   void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
   MCSymbol *EmitCFILabel() override;
   void EmitInstructionImpl(const MCInst &Inst, const MCSubtargetInfo &STI);
+  void resolvePendingFixups();
 
 protected:
   MCObjectStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 41305296b004f67b97c6b07c637f65d3473d828a..689ac73cbdd1c37f6aeb882b7ba9b1dfeb6eb001 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -183,6 +183,8 @@ struct MCExtraProcessorInfo {
   unsigned NumRegisterFiles;
   const MCRegisterCostEntry *RegisterCostTable;
   unsigned NumRegisterCostEntries;
+  unsigned LoadQueueID;
+  unsigned StoreQueueID;
 };
 
 /// Machine model for scheduling, bundling, and heuristics.
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index edf0a72d9c12d3848529a541cf46ff0f14729c28..849ea9a624e899455a1fbc11f95ed44d7b479b62 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -28,6 +28,7 @@
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/TargetParser.h"
+#include "llvm/Support/VersionTuple.h"
 #include <cassert>
 #include <cstdint>
 #include <memory>
@@ -452,14 +453,17 @@ public:
 
   /// Specify the Mach-O minimum deployment target version.
   virtual void EmitVersionMin(MCVersionMinType Type, unsigned Major,
-                              unsigned Minor, unsigned Update) {}
+                              unsigned Minor, unsigned Update,
+                              VersionTuple SDKVersion) {}
 
   /// Emit/Specify Mach-O build version command.
   /// \p Platform should be one of MachO::PlatformType.
   virtual void EmitBuildVersion(unsigned Platform, unsigned Major,
-                                unsigned Minor, unsigned Update) {}
+                                unsigned Minor, unsigned Update,
+                                VersionTuple SDKVersion) {}
 
-  void EmitVersionForTarget(const Triple &Target);
+  void EmitVersionForTarget(const Triple &Target,
+                            const VersionTuple &SDKVersion);
 
   /// Note in the output that the specified \p Func is a Thumb mode
   /// function (ARM target only).
@@ -896,6 +900,7 @@ public:
   virtual void EmitCFIUndefined(int64_t Register);
   virtual void EmitCFIRegister(int64_t Register1, int64_t Register2);
   virtual void EmitCFIWindowSave();
+  virtual void EmitCFINegateRAState();
 
   virtual void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc = SMLoc());
   virtual void EmitWinCFIEndProc(SMLoc Loc = SMLoc());
diff --git a/include/llvm/MC/MCSymbolWasm.h b/include/llvm/MC/MCSymbolWasm.h
index 281178aea61229158c8a45c629b460b1f7e0786f..8e66dc881d0fba45ca0cac9278fd8202b0fdb853 100644
--- a/include/llvm/MC/MCSymbolWasm.h
+++ b/include/llvm/MC/MCSymbolWasm.h
@@ -21,8 +21,8 @@ class MCSymbolWasm : public MCSymbol {
   bool IsComdat = false;
   std::string ModuleName;
   wasm::WasmSignature *Signature = nullptr;
-  wasm::WasmGlobalType GlobalType;
-  bool GlobalTypeSet = false;
+  Optional<wasm::WasmGlobalType> GlobalType;
+  Optional<wasm::WasmEventType> EventType;
 
   /// An expression describing how to calculate the size of a symbol. If a
   /// symbol has no size this field will be NULL.
@@ -42,6 +42,7 @@ public:
   bool isData() const { return Type == wasm::WASM_SYMBOL_TYPE_DATA; }
   bool isGlobal() const { return Type == wasm::WASM_SYMBOL_TYPE_GLOBAL; }
   bool isSection() const { return Type == wasm::WASM_SYMBOL_TYPE_SECTION; }
+  bool isEvent() const { return Type == wasm::WASM_SYMBOL_TYPE_EVENT; }
   wasm::WasmSymbolType getType() const { return Type; }
   void setType(wasm::WasmSymbolType type) { Type = type; }
 
@@ -61,14 +62,16 @@ public:
   void setSignature(wasm::WasmSignature *Sig) { Signature = Sig; }
 
   const wasm::WasmGlobalType &getGlobalType() const {
-    assert(GlobalTypeSet);
-    return GlobalType;
+    assert(GlobalType.hasValue());
+    return GlobalType.getValue();
   }
+  void setGlobalType(wasm::WasmGlobalType GT) { GlobalType = GT; }
 
-  void setGlobalType(wasm::WasmGlobalType GT) {
-    GlobalTypeSet = true;
-    GlobalType = GT;
+  const wasm::WasmEventType &getEventType() const {
+    assert(EventType.hasValue());
+    return EventType.getValue();
   }
+  void setEventType(wasm::WasmEventType ET) { EventType = ET; }
 };
 
 } // end namespace llvm
diff --git a/tools/llvm-mca/include/Context.h b/include/llvm/MCA/Context.h
similarity index 90%
rename from tools/llvm-mca/include/Context.h
rename to include/llvm/MCA/Context.h
index ebd1528e371f722e970785dc689f383ad1e813a5..6b2bee0fdc4210779189c4fe8e7b43dfe9426ed5 100644
--- a/tools/llvm-mca/include/Context.h
+++ b/include/llvm/MCA/Context.h
@@ -15,14 +15,15 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_CONTEXT_H
-#define LLVM_TOOLS_LLVM_MCA_CONTEXT_H
-#include "HardwareUnits/HardwareUnit.h"
-#include "InstrBuilder.h"
-#include "Pipeline.h"
-#include "SourceMgr.h"
+#ifndef LLVM_MCA_CONTEXT_H
+#define LLVM_MCA_CONTEXT_H
+
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
+#include "llvm/MCA/InstrBuilder.h"
+#include "llvm/MCA/Pipeline.h"
+#include "llvm/MCA/SourceMgr.h"
 #include <memory>
 
 namespace llvm {
@@ -65,4 +66,4 @@ public:
 
 } // namespace mca
 } // namespace llvm
-#endif // LLVM_TOOLS_LLVM_MCA_CONTEXT_H
+#endif // LLVM_MCA_CONTEXT_H
diff --git a/tools/llvm-mca/include/HWEventListener.h b/include/llvm/MCA/HWEventListener.h
similarity index 96%
rename from tools/llvm-mca/include/HWEventListener.h
rename to include/llvm/MCA/HWEventListener.h
index 0216fae7866b71ead66e3aa5635997664e1915c9..3b32b2cd657721d7cb2694e5f7d0c2c684dedfcb 100644
--- a/tools/llvm-mca/include/HWEventListener.h
+++ b/include/llvm/MCA/HWEventListener.h
@@ -12,12 +12,12 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
-#define LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+#ifndef LLVM_MCA_HWEVENTLISTENER_H
+#define LLVM_MCA_HWEVENTLISTENER_H
 
-#include "Instruction.h"
-#include "Support.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/MCA/Instruction.h"
+#include "llvm/MCA/Support.h"
 
 namespace llvm {
 namespace mca {
@@ -153,4 +153,4 @@ private:
 } // namespace mca
 } // namespace llvm
 
-#endif
+#endif // LLVM_MCA_HWEVENTLISTENER_H
diff --git a/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h b/include/llvm/MCA/HardwareUnits/HardwareUnit.h
similarity index 86%
rename from tools/llvm-mca/include/HardwareUnits/HardwareUnit.h
rename to include/llvm/MCA/HardwareUnits/HardwareUnit.h
index 5070418c11bffc3f97a2dfd8d21d36adb9eb22cd..104a2009f219d234986557de065cafd3ebe70aca 100644
--- a/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h
+++ b/include/llvm/MCA/HardwareUnits/HardwareUnit.h
@@ -13,8 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
-#define LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
+#ifndef LLVM_MCA_HARDWAREUNIT_H
+#define LLVM_MCA_HARDWAREUNIT_H
 
 namespace llvm {
 namespace mca {
@@ -30,4 +30,4 @@ public:
 
 } // namespace mca
 } // namespace llvm
-#endif // LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
+#endif // LLVM_MCA_HARDWAREUNIT_H
diff --git a/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/include/llvm/MCA/HardwareUnits/LSUnit.h
similarity index 65%
rename from tools/llvm-mca/include/HardwareUnits/LSUnit.h
rename to include/llvm/MCA/HardwareUnits/LSUnit.h
index 6b36282ca7253aa7499d9c32f91ab42558bdf49d..e217fc50f780c5d78cc4a4051dd322c204660e52 100644
--- a/tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ b/include/llvm/MCA/HardwareUnits/LSUnit.h
@@ -13,16 +13,18 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_LSUNIT_H
-#define LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+#ifndef LLVM_MCA_LSUNIT_H
+#define LLVM_MCA_LSUNIT_H
 
-#include "HardwareUnits/HardwareUnit.h"
-#include <set>
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
 
 namespace llvm {
 namespace mca {
 
 class InstRef;
+class Scheduler;
 
 /// A Load/Store Unit implementing a load and store queues.
 ///
@@ -99,8 +101,46 @@ class LSUnit : public HardwareUnit {
   // If true, loads will never alias with stores. This is the default.
   bool NoAlias;
 
-  std::set<unsigned> LoadQueue;
-  std::set<unsigned> StoreQueue;
+  // When a `MayLoad` instruction is dispatched to the schedulers for execution,
+  // the LSUnit reserves an entry in the `LoadQueue` for it.
+  //
+  // LoadQueue keeps track of all the loads that are in-flight. A load
+  // instruction is eventually removed from the LoadQueue when it reaches
+  // completion stage. That means, a load leaves the queue whe it is 'executed',
+  // and its value can be forwarded on the data path to outside units.
+  //
+  // This class doesn't know about the latency of a load instruction. So, it
+  // conservatively/pessimistically assumes that the latency of a load opcode
+  // matches the instruction latency.
+  //
+  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
+  // and load/store conflicts, the latency of a load is determined by the depth
+  // of the load pipeline. So, we could use field `LoadLatency` in the
+  // MCSchedModel to model that latency.
+  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
+  // L1D, and it usually already accounts for any extra latency due to data
+  // forwarding.
+  // When doing throughput analysis, `LoadLatency` is likely to
+  // be a better predictor of load latency than instruction latency. This is
+  // particularly true when simulating code with temporal/spatial locality of
+  // memory accesses.
+  // Using `LoadLatency` (instead of the instruction latency) is also expected
+  // to improve the load queue allocation for long latency instructions with
+  // folded memory operands (See PR39829).
+  //
+  // FIXME: On some processors, load/store operations are split into multiple
+  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
+  // not 256-bit data types. So, a 256-bit load is effectively split into two
+  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
+  // simplicity, this class optimistically assumes that a load instruction only
+  // consumes one entry in the LoadQueue.  Similarly, store instructions only
+  // consume a single entry in the StoreQueue.
+  // In future, we should reassess the quality of this design, and consider
+  // alternative approaches that let instructions specify the number of
+  // load/store queue entries which they consume at dispatch stage (See
+  // PR39830).
+  SmallSet<unsigned, 16> LoadQueue;
+  SmallSet<unsigned, 16> StoreQueue;
 
   void assignLQSlot(unsigned Index);
   void assignSQSlot(unsigned Index);
@@ -109,12 +149,12 @@ class LSUnit : public HardwareUnit {
   // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
   // conservatively treated as a store barrier. It forces older store to be
   // executed before newer stores are issued.
-  std::set<unsigned> StoreBarriers;
+  SmallSet<unsigned, 8> StoreBarriers;
 
   // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
   // conservatively treated as a load barrier. It forces older loads to execute
   // before newer loads are issued.
-  std::set<unsigned> LoadBarriers;
+  SmallSet<unsigned, 8> LoadBarriers;
 
   bool isSQEmpty() const { return StoreQueue.empty(); }
   bool isLQEmpty() const { return LoadQueue.empty(); }
@@ -122,8 +162,8 @@ class LSUnit : public HardwareUnit {
   bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
 
 public:
-  LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
-      : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+  LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0,
+         bool AssumeNoAlias = false);
 
 #ifndef NDEBUG
   void dump() const;
@@ -149,10 +189,19 @@ public:
   // 5. A load has to wait until an older load barrier is fully executed.
   // 6. A store has to wait until an older store barrier is fully executed.
   virtual bool isReady(const InstRef &IR) const;
+
+  // Load and store instructions are tracked by their corresponding queues from
+  // dispatch until the "instruction executed" event.
+  // Only when a load instruction reaches the 'Executed' stage, its value
+  // becomes available to the users. At that point, the load no longer needs to
+  // be tracked by the load queue.
+  // FIXME: For simplicity, we optimistically assume a similar behavior for
+  // store instructions. In practice, store operations don't tend to leave the
+  // store queue until they reach the 'Retired' stage (See PR39830).
   void onInstructionExecuted(const InstRef &IR);
 };
 
 } // namespace mca
 } // namespace llvm
 
-#endif
+#endif // LLVM_MCA_LSUNIT_H
diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/include/llvm/MCA/HardwareUnits/RegisterFile.h
similarity index 98%
rename from tools/llvm-mca/include/HardwareUnits/RegisterFile.h
rename to include/llvm/MCA/HardwareUnits/RegisterFile.h
index d9949bf4f6a13be5208240ba299cb4d7927895c1..c23ab038923481363f0d5718af4ca56f56dd983f 100644
--- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
+++ b/include/llvm/MCA/HardwareUnits/RegisterFile.h
@@ -14,14 +14,14 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
-#define LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
+#ifndef LLVM_MCA_REGISTER_FILE_H
+#define LLVM_MCA_REGISTER_FILE_H
 
-#include "HardwareUnits/HardwareUnit.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -236,4 +236,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
+#endif // LLVM_MCA_REGISTER_FILE_H
diff --git a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h b/include/llvm/MCA/HardwareUnits/ResourceManager.h
similarity index 97%
rename from tools/llvm-mca/include/HardwareUnits/ResourceManager.h
rename to include/llvm/MCA/HardwareUnits/ResourceManager.h
index 065ead8f1a8abd300ad06f4e0fae775b8cb061b8..5429f2b8cf0a5c928f40a32ab1d26c3ae137e921 100644
--- a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
+++ b/include/llvm/MCA/HardwareUnits/ResourceManager.h
@@ -13,15 +13,15 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCE_MANAGER_H
-#define LLVM_TOOLS_LLVM_MCA_RESOURCE_MANAGER_H
+#ifndef LLVM_MCA_RESOURCE_MANAGER_H
+#define LLVM_MCA_RESOURCE_MANAGER_H
 
-#include "Instruction.h"
-#include "Support.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/Instruction.h"
+#include "llvm/MCA/Support.h"
 
 namespace llvm {
 namespace mca {
@@ -119,8 +119,6 @@ class DefaultResourceStrategy final : public ResourceStrategy {
   /// on the overall performance of the tool.
   uint64_t RemovedFromNextInSequence;
 
-  void skipMask(uint64_t Mask);
-
 public:
   DefaultResourceStrategy(uint64_t UnitMask)
       : ResourceStrategy(), ResourceUnitMask(UnitMask),
@@ -183,6 +181,8 @@ class ResourceState {
   /// underlying units (i.e. pipelines) until the resource is released.
   bool Unavailable;
 
+  const bool IsAGroup;
+
   /// Checks for the availability of unit 'SubResMask' in the group.
   bool isSubResourceReady(uint64_t SubResMask) const {
     return ReadyMask & SubResMask;
@@ -210,7 +210,7 @@ public:
   /// `NumUnits` available units.
   bool isReady(unsigned NumUnits = 1) const;
 
-  bool isAResourceGroup() const { return countPopulation(ResourceMask) > 1; }
+  bool isAResourceGroup() const { return IsAGroup; }
 
   bool containsResource(uint64_t ID) const { return ResourceMask & ID; }
 
@@ -357,4 +357,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_RESOURCE_MANAGER_H
+#endif // LLVM_MCA_RESOURCE_MANAGER_H
diff --git a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h b/include/llvm/MCA/HardwareUnits/RetireControlUnit.h
similarity index 94%
rename from tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
rename to include/llvm/MCA/HardwareUnits/RetireControlUnit.h
index 12e0a1fba136ea926c1ce3f48d8e9b7fdcf7fa60..71360e984ade5d48914b375df9e628abbf58274c 100644
--- a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
+++ b/include/llvm/MCA/HardwareUnits/RetireControlUnit.h
@@ -12,12 +12,12 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H
-#define LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H
+#ifndef LLVM_MCA_RETIRE_CONTROL_UNIT_H
+#define LLVM_MCA_RETIRE_CONTROL_UNIT_H
 
-#include "HardwareUnits/HardwareUnit.h"
-#include "Instruction.h"
 #include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
+#include "llvm/MCA/Instruction.h"
 #include <vector>
 
 namespace llvm {
@@ -101,4 +101,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H
+#endif // LLVM_MCA_RETIRE_CONTROL_UNIT_H
diff --git a/tools/llvm-mca/include/HardwareUnits/Scheduler.h b/include/llvm/MCA/HardwareUnits/Scheduler.h
similarity index 92%
rename from tools/llvm-mca/include/HardwareUnits/Scheduler.h
rename to include/llvm/MCA/HardwareUnits/Scheduler.h
index 17332b430d2edff7f8c5d00ff4e31f75f5757242..351ea4827df9a112c3cb3e14607c97eda19146e2 100644
--- a/tools/llvm-mca/include/HardwareUnits/Scheduler.h
+++ b/include/llvm/MCA/HardwareUnits/Scheduler.h
@@ -12,15 +12,15 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
-#define LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+#ifndef LLVM_MCA_SCHEDULER_H
+#define LLVM_MCA_SCHEDULER_H
 
-#include "HardwareUnits/HardwareUnit.h"
-#include "HardwareUnits/LSUnit.h"
-#include "ResourceManager.h"
-#include "Support.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
+#include "llvm/MCA/HardwareUnits/LSUnit.h"
+#include "llvm/MCA/HardwareUnits/ResourceManager.h"
+#include "llvm/MCA/Support.h"
 
 namespace llvm {
 namespace mca {
@@ -85,7 +85,7 @@ public:
 /// transition (i.e. from state IS_READY, to state IS_EXECUTING). An Instruction
 /// leaves the IssuedSet when it reaches the write-back stage.
 class Scheduler : public HardwareUnit {
-  LSUnit *LSU;
+  LSUnit &LSU;
 
   // Instruction selection strategy for this Scheduler.
   std::unique_ptr<SchedulerStrategy> Strategy;
@@ -117,16 +117,15 @@ class Scheduler : public HardwareUnit {
   void promoteToReadySet(SmallVectorImpl<InstRef> &Ready);
 
 public:
-  Scheduler(const MCSchedModel &Model, LSUnit *Lsu)
-      : LSU(Lsu), Resources(make_unique<ResourceManager>(Model)) {
-    initializeStrategy(nullptr);
-  }
-  Scheduler(const MCSchedModel &Model, LSUnit *Lsu,
+  Scheduler(const MCSchedModel &Model, LSUnit &Lsu)
+      : Scheduler(Model, Lsu, nullptr) {}
+
+  Scheduler(const MCSchedModel &Model, LSUnit &Lsu,
             std::unique_ptr<SchedulerStrategy> SelectStrategy)
-      : LSU(Lsu), Resources(make_unique<ResourceManager>(Model)) {
-    initializeStrategy(std::move(SelectStrategy));
-  }
-  Scheduler(std::unique_ptr<ResourceManager> RM, LSUnit *Lsu,
+      : Scheduler(make_unique<ResourceManager>(Model), Lsu,
+                  std::move(SelectStrategy)) {}
+
+  Scheduler(std::unique_ptr<ResourceManager> RM, LSUnit &Lsu,
             std::unique_ptr<SchedulerStrategy> SelectStrategy)
       : LSU(Lsu), Resources(std::move(RM)) {
     initializeStrategy(std::move(SelectStrategy));
@@ -212,4 +211,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+#endif // LLVM_MCA_SCHEDULER_H
diff --git a/tools/llvm-mca/include/InstrBuilder.h b/include/llvm/MCA/InstrBuilder.h
similarity index 79%
rename from tools/llvm-mca/include/InstrBuilder.h
rename to include/llvm/MCA/InstrBuilder.h
index 67aa889cf7bf5ebc0751ffc523c3703f42516760..5f998db5e4ce0c232e7b110113608cc1125193e1 100644
--- a/tools/llvm-mca/include/InstrBuilder.h
+++ b/include/llvm/MCA/InstrBuilder.h
@@ -12,15 +12,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
-#define LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
+#ifndef LLVM_MCA_INSTRBUILDER_H
+#define LLVM_MCA_INSTRBUILDER_H
 
-#include "Instruction.h"
-#include "Support.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/Instruction.h"
+#include "llvm/MCA/Support.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -40,31 +40,38 @@ class InstrBuilder {
   const MCSubtargetInfo &STI;
   const MCInstrInfo &MCII;
   const MCRegisterInfo &MRI;
-  const MCInstrAnalysis &MCIA;
+  const MCInstrAnalysis *MCIA;
   SmallVector<uint64_t, 8> ProcResourceMasks;
 
   DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
   DenseMap<const MCInst *, std::unique_ptr<const InstrDesc>> VariantDescriptors;
 
+  bool FirstCallInst;
+  bool FirstReturnInst;
+
   Expected<const InstrDesc &> createInstrDescImpl(const MCInst &MCI);
   Expected<const InstrDesc &> getOrCreateInstrDesc(const MCInst &MCI);
 
   InstrBuilder(const InstrBuilder &) = delete;
   InstrBuilder &operator=(const InstrBuilder &) = delete;
 
-  Error populateWrites(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID);
-  Error populateReads(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID);
+  void populateWrites(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID);
+  void populateReads(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID);
   Error verifyInstrDesc(const InstrDesc &ID, const MCInst &MCI) const;
 
 public:
   InstrBuilder(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
-               const MCRegisterInfo &RI, const MCInstrAnalysis &IA);
+               const MCRegisterInfo &RI, const MCInstrAnalysis *IA);
 
-  void clear() { VariantDescriptors.shrink_and_clear(); }
+  void clear() {
+    VariantDescriptors.shrink_and_clear();
+    FirstCallInst = true;
+    FirstReturnInst = true;
+  }
 
   Expected<std::unique_ptr<Instruction>> createInstruction(const MCInst &MCI);
 };
 } // namespace mca
 } // namespace llvm
 
-#endif
+#endif // LLVM_MCA_INSTRBUILDER_H
diff --git a/tools/llvm-mca/include/Instruction.h b/include/llvm/MCA/Instruction.h
similarity index 94%
rename from tools/llvm-mca/include/Instruction.h
rename to include/llvm/MCA/Instruction.h
index 7407283bca2c3933aacd83ed3db269349975b774..148651b22dac04657c598516970db241180182df 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/include/llvm/MCA/Instruction.h
@@ -13,8 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
-#define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+#ifndef LLVM_MCA_INSTRUCTION_H
+#define LLVM_MCA_INSTRUCTION_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
@@ -26,8 +26,6 @@
 #endif
 
 #include <memory>
-#include <set>
-#include <vector>
 
 namespace llvm {
 namespace mca {
@@ -123,8 +121,10 @@ class WriteState {
   // that we don't break the WAW, and the two writes can be merged together.
   const WriteState *DependentWrite;
 
-  // Number of writes that are in a WAW dependency with this write.
-  unsigned NumWriteUsers;
+  // A partial write that is in a false dependency with this write.
+  WriteState *PartialWrite;
+
+  unsigned DependentWriteCyclesLeft;
 
   // A list of dependent reads. Users is a set of dependent
   // reads. A dependent read is added to the set only if CyclesLeft
@@ -132,14 +132,15 @@ class WriteState {
   // gets notified with the actual CyclesLeft.
 
   // The 'second' element of a pair is a "ReadAdvance" number of cycles.
-  std::set<std::pair<ReadState *, int>> Users;
+  SmallVector<std::pair<ReadState *, int>, 4> Users;
 
 public:
   WriteState(const WriteDescriptor &Desc, unsigned RegID,
              bool clearsSuperRegs = false, bool writesZero = false)
       : WD(&Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
         PRFID(0), ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
-        IsEliminated(false), DependentWrite(nullptr), NumWriteUsers(0U) {}
+        IsEliminated(false), DependentWrite(nullptr), PartialWrite(nullptr),
+        DependentWriteCyclesLeft(0) {}
 
   WriteState(const WriteState &Other) = default;
   WriteState &operator=(const WriteState &Other) = default;
@@ -151,8 +152,17 @@ public:
   unsigned getLatency() const { return WD->Latency; }
 
   void addUser(ReadState *Use, int ReadAdvance);
+  void addUser(WriteState *Use);
+
+  unsigned getDependentWriteCyclesLeft() const { return DependentWriteCyclesLeft; }
+
+  unsigned getNumUsers() const {
+    unsigned NumUsers = Users.size();
+    if (PartialWrite)
+      ++NumUsers;
+    return NumUsers;
+  }
 
-  unsigned getNumUsers() const { return Users.size() + NumWriteUsers; }
   bool clearsSuperRegisters() const { return ClearsSuperRegs; }
   bool isWriteZero() const { return WritesZero; }
   bool isEliminated() const { return IsEliminated; }
@@ -161,10 +171,12 @@ public:
   }
 
   const WriteState *getDependentWrite() const { return DependentWrite; }
-  void setDependentWrite(WriteState *Other) {
-    DependentWrite = Other;
-    ++Other->NumWriteUsers;
+  void setDependentWrite(WriteState *Other) { DependentWrite = Other; }
+  void writeStartEvent(unsigned Cycles) {
+    DependentWriteCyclesLeft = Cycles;
+    DependentWrite = nullptr;
   }
+
   void setWriteZero() { WritesZero = true; }
   void setEliminated() {
     assert(Users.empty() && "Write is in an inconsistent state.");
@@ -305,15 +317,16 @@ struct ResourceUsage {
 
 /// An instruction descriptor
 struct InstrDesc {
-  std::vector<WriteDescriptor> Writes; // Implicit writes are at the end.
-  std::vector<ReadDescriptor> Reads;   // Implicit reads are at the end.
+  SmallVector<WriteDescriptor, 4> Writes; // Implicit writes are at the end.
+  SmallVector<ReadDescriptor, 4> Reads;   // Implicit reads are at the end.
 
   // For every resource used by an instruction of this kind, this vector
   // reports the number of "consumed cycles".
-  std::vector<std::pair<uint64_t, ResourceUsage>> Resources;
+  SmallVector<std::pair<uint64_t, ResourceUsage>, 4> Resources;
 
   // A list of buffered resources consumed by this instruction.
-  std::vector<uint64_t> Buffers;
+  SmallVector<uint64_t, 4> Buffers;
+
   unsigned MaxLatency;
   // Number of MicroOps for this instruction.
   unsigned NumMicroOps;
@@ -321,6 +334,8 @@ struct InstrDesc {
   bool MayLoad;
   bool MayStore;
   bool HasSideEffects;
+  bool BeginGroup;
+  bool EndGroup;
 
   // A zero latency instruction doesn't consume any scheduler resources.
   bool isZeroLatency() const { return !MaxLatency && Resources.empty(); }
@@ -526,4 +541,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif
+#endif  // LLVM_MCA_INSTRUCTION_H
diff --git a/tools/llvm-mca/include/Pipeline.h b/include/llvm/MCA/Pipeline.h
similarity index 90%
rename from tools/llvm-mca/include/Pipeline.h
rename to include/llvm/MCA/Pipeline.h
index 47ff07b288288bb2fc667f158e6e88b51de919df..acd256060bdd77a6f73c142eb3e38e72595de490 100644
--- a/tools/llvm-mca/include/Pipeline.h
+++ b/include/llvm/MCA/Pipeline.h
@@ -13,12 +13,12 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_PIPELINE_H
-#define LLVM_TOOLS_LLVM_MCA_PIPELINE_H
+#ifndef LLVM_MCA_PIPELINE_H
+#define LLVM_MCA_PIPELINE_H
 
-#include "HardwareUnits/Scheduler.h"
-#include "Stages/Stage.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/HardwareUnits/Scheduler.h"
+#include "llvm/MCA/Stages/Stage.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -67,10 +67,13 @@ class Pipeline {
 public:
   Pipeline() : Cycles(0) {}
   void appendStage(std::unique_ptr<Stage> S);
-  Error run();
+
+  /// Returns the total number of simulated cycles.
+  Expected<unsigned> run();
+
   void addEventListener(HWEventListener *Listener);
 };
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_PIPELINE_H
+#endif // LLVM_MCA_PIPELINE_H
diff --git a/tools/llvm-mca/include/SourceMgr.h b/include/llvm/MCA/SourceMgr.h
similarity index 95%
rename from tools/llvm-mca/include/SourceMgr.h
rename to include/llvm/MCA/SourceMgr.h
index e5180107011b19a3feab4faa9a92ca4c00f278cb..5e0ca6419f5d4057185d8fb123edd53eed6bdb06 100644
--- a/tools/llvm-mca/include/SourceMgr.h
+++ b/include/llvm/MCA/SourceMgr.h
@@ -13,8 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
-#define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+#ifndef LLVM_MCA_SOURCEMGR_H
+#define LLVM_MCA_SOURCEMGR_H
 
 #include "llvm/ADT/ArrayRef.h"
 
@@ -54,4 +54,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif
+#endif // LLVM_MCA_SOURCEMGR_H
diff --git a/tools/llvm-mca/include/Stages/DispatchStage.h b/include/llvm/MCA/Stages/DispatchStage.h
similarity index 91%
rename from tools/llvm-mca/include/Stages/DispatchStage.h
rename to include/llvm/MCA/Stages/DispatchStage.h
index 29cace1022e08ddbb746e6b4fe16d7686ba511ce..f015cd7522eb54edf0e22e0cf412c4f352861ab2 100644
--- a/tools/llvm-mca/include/Stages/DispatchStage.h
+++ b/include/llvm/MCA/Stages/DispatchStage.h
@@ -16,16 +16,16 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H
-#define LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H
+#ifndef LLVM_MCA_DISPATCH_STAGE_H
+#define LLVM_MCA_DISPATCH_STAGE_H
 
-#include "HWEventListener.h"
-#include "HardwareUnits/RegisterFile.h"
-#include "HardwareUnits/RetireControlUnit.h"
-#include "Instruction.h"
-#include "Stages/Stage.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/HWEventListener.h"
+#include "llvm/MCA/HardwareUnits/RegisterFile.h"
+#include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
+#include "llvm/MCA/Instruction.h"
+#include "llvm/MCA/Stages/Stage.h"
 
 namespace llvm {
 namespace mca {
@@ -90,4 +90,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H
+#endif // LLVM_MCA_DISPATCH_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/FetchStage.h b/include/llvm/MCA/Stages/EntryStage.h
similarity index 52%
rename from tools/llvm-mca/include/Stages/FetchStage.h
rename to include/llvm/MCA/Stages/EntryStage.h
index 55bf2011b32ba5f312d52c650ed941f69170e71a..cd9a65b8cc2bcfaa7a0161bee4dc9d71b449f5ed 100644
--- a/tools/llvm-mca/include/Stages/FetchStage.h
+++ b/include/llvm/MCA/Stages/EntryStage.h
@@ -1,4 +1,4 @@
-//===---------------------- FetchStage.h ------------------------*- C++ -*-===//
+//===---------------------- EntryStage.h ------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,35 +8,36 @@
 //===----------------------------------------------------------------------===//
 /// \file
 ///
-/// This file defines the Fetch stage of an instruction pipeline.  Its sole
-/// purpose in life is to produce instructions for the rest of the pipeline.
+/// This file defines the Entry stage of an instruction pipeline.  Its sole
+/// purpose in life is to pick instructions in sequence and move them to the
+/// next pipeline stage.
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
-#define LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
+#ifndef LLVM_MCA_ENTRY_STAGE_H
+#define LLVM_MCA_ENTRY_STAGE_H
 
-#include "SourceMgr.h"
-#include "Stages/Stage.h"
-#include <map>
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/SourceMgr.h"
+#include "llvm/MCA/Stages/Stage.h"
 
 namespace llvm {
 namespace mca {
 
-class FetchStage final : public Stage {
+class EntryStage final : public Stage {
   InstRef CurrentInstruction;
-  using InstMap = std::map<unsigned, std::unique_ptr<Instruction>>;
-  InstMap Instructions;
+  SmallVector<std::unique_ptr<Instruction>, 16> Instructions;
   SourceMgr &SM;
+  unsigned NumRetired;
 
   // Updates the program counter, and sets 'CurrentInstruction'.
   void getNextInstruction();
 
-  FetchStage(const FetchStage &Other) = delete;
-  FetchStage &operator=(const FetchStage &Other) = delete;
+  EntryStage(const EntryStage &Other) = delete;
+  EntryStage &operator=(const EntryStage &Other) = delete;
 
 public:
-  FetchStage(SourceMgr &SM) : CurrentInstruction(), SM(SM) {}
+  EntryStage(SourceMgr &SM) : CurrentInstruction(), SM(SM), NumRetired(0) { }
 
   bool isAvailable(const InstRef &IR) const override;
   bool hasWorkToComplete() const override;
@@ -48,4 +49,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
+#endif // LLVM_MCA_FETCH_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/ExecuteStage.h b/include/llvm/MCA/Stages/ExecuteStage.h
similarity index 92%
rename from tools/llvm-mca/include/Stages/ExecuteStage.h
rename to include/llvm/MCA/Stages/ExecuteStage.h
index 91b24059c950df6dfdc762c063ce79a2b474922c..4f40c5c65d6bc951acf7b47edebeacb2d301dfa4 100644
--- a/tools/llvm-mca/include/Stages/ExecuteStage.h
+++ b/include/llvm/MCA/Stages/ExecuteStage.h
@@ -15,13 +15,13 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H
-#define LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H
+#ifndef LLVM_MCA_EXECUTE_STAGE_H
+#define LLVM_MCA_EXECUTE_STAGE_H
 
-#include "HardwareUnits/Scheduler.h"
-#include "Instruction.h"
-#include "Stages/Stage.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/MCA/HardwareUnits/Scheduler.h"
+#include "llvm/MCA/Instruction.h"
+#include "llvm/MCA/Stages/Stage.h"
 
 namespace llvm {
 namespace mca {
@@ -77,4 +77,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H
+#endif // LLVM_MCA_EXECUTE_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/InstructionTables.h b/include/llvm/MCA/Stages/InstructionTables.h
similarity index 83%
rename from tools/llvm-mca/include/Stages/InstructionTables.h
rename to include/llvm/MCA/Stages/InstructionTables.h
index e618d06b1b74740a23bb7eb48650390c03d6bc0c..50f84fc6fff94c9c455e52a26c6dbe7716e3ab70 100644
--- a/tools/llvm-mca/include/Stages/InstructionTables.h
+++ b/include/llvm/MCA/Stages/InstructionTables.h
@@ -14,14 +14,14 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONTABLES_H
-#define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONTABLES_H
+#ifndef LLVM_MCA_INSTRUCTIONTABLES_H
+#define LLVM_MCA_INSTRUCTIONTABLES_H
 
-#include "HardwareUnits/Scheduler.h"
-#include "Stages/Stage.h"
-#include "Support.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/HardwareUnits/Scheduler.h"
+#include "llvm/MCA/Stages/Stage.h"
+#include "llvm/MCA/Support.h"
 
 namespace llvm {
 namespace mca {
@@ -42,4 +42,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif
+#endif // LLVM_MCA_INSTRUCTIONTABLES_H
diff --git a/tools/llvm-mca/include/Stages/RetireStage.h b/include/llvm/MCA/Stages/RetireStage.h
similarity index 84%
rename from tools/llvm-mca/include/Stages/RetireStage.h
rename to include/llvm/MCA/Stages/RetireStage.h
index 28eda40984f36caf3bb19415a7c3545b565af8bf..2051ce5c86adf990ee87f4f3b1acffbf807c9a46 100644
--- a/tools/llvm-mca/include/Stages/RetireStage.h
+++ b/include/llvm/MCA/Stages/RetireStage.h
@@ -14,12 +14,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H
-#define LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H
+#ifndef LLVM_MCA_RETIRE_STAGE_H
+#define LLVM_MCA_RETIRE_STAGE_H
 
-#include "HardwareUnits/RegisterFile.h"
-#include "HardwareUnits/RetireControlUnit.h"
-#include "Stages/Stage.h"
+#include "llvm/MCA/HardwareUnits/RegisterFile.h"
+#include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
+#include "llvm/MCA/Stages/Stage.h"
 
 namespace llvm {
 namespace mca {
@@ -45,4 +45,4 @@ public:
 } // namespace mca
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H
+#endif // LLVM_MCA_RETIRE_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/Stage.h b/include/llvm/MCA/Stages/Stage.h
similarity index 94%
rename from tools/llvm-mca/include/Stages/Stage.h
rename to include/llvm/MCA/Stages/Stage.h
index 5665fc453bfcae42279b63ad8401a3cc88633614..fc7ab569bb0f73a30566c75309e8263e126df4c8 100644
--- a/tools/llvm-mca/include/Stages/Stage.h
+++ b/include/llvm/MCA/Stages/Stage.h
@@ -13,10 +13,10 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_STAGE_H
-#define LLVM_TOOLS_LLVM_MCA_STAGE_H
+#ifndef LLVM_MCA_STAGE_H
+#define LLVM_MCA_STAGE_H
 
-#include "HWEventListener.h"
+#include "llvm/MCA/HWEventListener.h"
 #include "llvm/Support/Error.h"
 #include <set>
 
@@ -85,4 +85,4 @@ public:
 
 } // namespace mca
 } // namespace llvm
-#endif // LLVM_TOOLS_LLVM_MCA_STAGE_H
+#endif // LLVM_MCA_STAGE_H
diff --git a/tools/llvm-mca/include/Support.h b/include/llvm/MCA/Support.h
similarity index 98%
rename from tools/llvm-mca/include/Support.h
rename to include/llvm/MCA/Support.h
index e7a4e33ed74ecdfc8ae4e5381e693b90d473c25c..5c5a35454330b70ee32fbf194d3956f9f127e4a1 100644
--- a/tools/llvm-mca/include/Support.h
+++ b/include/llvm/MCA/Support.h
@@ -12,8 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_MCA_SUPPORT_H
-#define LLVM_TOOLS_LLVM_MCA_SUPPORT_H
+#ifndef LLVM_MCA_SUPPORT_H
+#define LLVM_MCA_SUPPORT_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
@@ -116,4 +116,4 @@ double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
 } // namespace mca
 } // namespace llvm
 
-#endif
+#endif // LLVM_MCA_SUPPORT_H
diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h
index e39a9b2e31c686c3f59c06030a020c7cd32ad355..5fe475ee8f46efbd84c65ff527dc3f111bb5793d 100644
--- a/include/llvm/Object/COFF.h
+++ b/include/llvm/Object/COFF.h
@@ -971,6 +971,9 @@ public:
       return nullptr;
     return reinterpret_cast<const dos_header *>(base());
   }
+  std::error_code getCOFFHeader(const coff_file_header *&Res) const;
+  std::error_code
+  getCOFFBigObjHeader(const coff_bigobj_file_header *&Res) const;
   std::error_code getPE32Header(const pe32_header *&Res) const;
   std::error_code getPE32PlusHeader(const pe32plus_header *&Res) const;
   std::error_code getDataDirectory(uint32_t index,
@@ -1062,6 +1065,8 @@ public:
   bool isRelocatableObject() const override;
   bool is64() const { return PE32PlusHeader; }
 
+  StringRef mapDebugSectionName(StringRef Name) const override;
+
   static bool classof(const Binary *v) { return v->isCOFF(); }
 };
 
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index 752d468fd25b8e1b5ffe8692253eb015a9bbc061..bcdc190cc7dccceab916ec1e5f81745638b92ca3 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -32,7 +32,7 @@ namespace llvm {
 namespace object {
 
 StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type);
-uint32_t getELFRelrRelocationType(uint32_t Machine);
+uint32_t getELFRelativeRelocationType(uint32_t Machine);
 StringRef getELFSectionTypeName(uint32_t Machine, uint32_t Type);
 
 // Subclasses of ELFFile may need this for template instantiation
@@ -113,7 +113,7 @@ public:
   StringRef getRelocationTypeName(uint32_t Type) const;
   void getRelocationTypeName(uint32_t Type,
                              SmallVectorImpl<char> &Result) const;
-  uint32_t getRelrRelocationType() const;
+  uint32_t getRelativeRelocationType() const;
 
   const char *getDynamicTagAsString(unsigned Arch, uint64_t Type) const;
   const char *getDynamicTagAsString(uint64_t Type) const;
@@ -415,8 +415,8 @@ void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
 }
 
 template <class ELFT>
-uint32_t ELFFile<ELFT>::getRelrRelocationType() const {
-  return getELFRelrRelocationType(getHeader()->e_machine);
+uint32_t ELFFile<ELFT>::getRelativeRelocationType() const {
+  return getELFRelativeRelocationType(getHeader()->e_machine);
 }
 
 template <class ELFT>
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index dff086078391a2cada4166de03648a58c667c78e..0f620681cd99a421d34b6f496aa0fd84d07c7a98 100644
--- a/include/llvm/Object/ELFObjectFile.h
+++ b/include/llvm/Object/ELFObjectFile.h
@@ -260,6 +260,8 @@ protected:
   bool isSectionData(DataRefImpl Sec) const override;
   bool isSectionBSS(DataRefImpl Sec) const override;
   bool isSectionVirtual(DataRefImpl Sec) const override;
+  bool isBerkeleyText(DataRefImpl Sec) const override;
+  bool isBerkeleyData(DataRefImpl Sec) const override;
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
   std::vector<SectionRef> dynamic_relocation_sections() const override;
@@ -333,9 +335,10 @@ protected:
     // A symbol is exported if its binding is either GLOBAL or WEAK, and its
     // visibility is either DEFAULT or PROTECTED. All other symbols are not
     // exported.
-    return ((Binding == ELF::STB_GLOBAL || Binding == ELF::STB_WEAK) &&
-            (Visibility == ELF::STV_DEFAULT ||
-             Visibility == ELF::STV_PROTECTED));
+    return (
+        (Binding == ELF::STB_GLOBAL || Binding == ELF::STB_WEAK ||
+         Binding == ELF::STB_GNU_UNIQUE) &&
+        (Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_PROTECTED));
   }
 
   // This flag is used for classof, to distinguish ELFObjectFile from
@@ -758,6 +761,20 @@ bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
   return getSection(Sec)->sh_type == ELF::SHT_NOBITS;
 }
 
+template <class ELFT>
+bool ELFObjectFile<ELFT>::isBerkeleyText(DataRefImpl Sec) const {
+  return getSection(Sec)->sh_flags & ELF::SHF_ALLOC &&
+         (getSection(Sec)->sh_flags & ELF::SHF_EXECINSTR ||
+          !(getSection(Sec)->sh_flags & ELF::SHF_WRITE));
+}
+
+template <class ELFT>
+bool ELFObjectFile<ELFT>::isBerkeleyData(DataRefImpl Sec) const {
+  const Elf_Shdr *EShdr = getSection(Sec);
+  return !isBerkeleyText(Sec) && EShdr->sh_type != ELF::SHT_NOBITS &&
+         EShdr->sh_flags & ELF::SHF_ALLOC;
+}
+
 template <class ELFT>
 relocation_iterator
 ELFObjectFile<ELFT>::section_rel_begin(DataRefImpl Sec) const {
diff --git a/include/llvm/Object/ELFTypes.h b/include/llvm/Object/ELFTypes.h
index fb386120e34d5968c688dd9a63b5ddca0d8c4c43..ec3c8e7bae465ba4b788ad2b350ac60679ca5aa1 100644
--- a/include/llvm/Object/ELFTypes.h
+++ b/include/llvm/Object/ELFTypes.h
@@ -605,13 +605,12 @@ public:
   }
 
   /// Get the note's descriptor.
-  ArrayRef<Elf_Word> getDesc() const {
+  ArrayRef<uint8_t> getDesc() const {
     if (!Nhdr.n_descsz)
-      return ArrayRef<Elf_Word>();
-    return ArrayRef<Elf_Word>(
-        reinterpret_cast<const Elf_Word *>(
-            reinterpret_cast<const uint8_t *>(&Nhdr) + sizeof(Nhdr) +
-            alignTo<Elf_Nhdr_Impl<ELFT>::Align>(Nhdr.n_namesz)),
+      return ArrayRef<uint8_t>();
+    return ArrayRef<uint8_t>(
+        reinterpret_cast<const uint8_t *>(&Nhdr) + sizeof(Nhdr) +
+          alignTo<Elf_Nhdr_Impl<ELFT>::Align>(Nhdr.n_namesz),
         Nhdr.n_descsz);
   }
 
@@ -643,14 +642,19 @@ class Elf_Note_Iterator_Impl
   // container, either cleanly or with an overflow error.
   void advanceNhdr(const uint8_t *NhdrPos, size_t NoteSize) {
     RemainingSize -= NoteSize;
-    if (RemainingSize == 0u)
+    if (RemainingSize == 0u) {
+      // Ensure that if the iterator walks to the end, the error is checked
+      // afterwards.
+      *Err = Error::success();
       Nhdr = nullptr;
-    else if (sizeof(*Nhdr) > RemainingSize)
+    } else if (sizeof(*Nhdr) > RemainingSize)
       stopWithOverflowError();
     else {
       Nhdr = reinterpret_cast<const Elf_Nhdr_Impl<ELFT> *>(NhdrPos + NoteSize);
       if (Nhdr->getSize() > RemainingSize)
         stopWithOverflowError();
+      else
+        *Err = Error::success();
     }
   }
 
@@ -658,6 +662,7 @@ class Elf_Note_Iterator_Impl
   explicit Elf_Note_Iterator_Impl(Error &Err) : Err(&Err) {}
   Elf_Note_Iterator_Impl(const uint8_t *Start, size_t Size, Error &Err)
       : RemainingSize(Size), Err(&Err) {
+    consumeError(std::move(Err));
     assert(Start && "ELF note iterator starting at NULL");
     advanceNhdr(Start, 0u);
   }
@@ -671,6 +676,10 @@ public:
     return *this;
   }
   bool operator==(Elf_Note_Iterator_Impl Other) const {
+    if (!Nhdr && Other.Err)
+      (void)(bool)(*Other.Err);
+    if (!Other.Nhdr && Err)
+      (void)(bool)(*Err);
     return Nhdr == Other.Nhdr;
   }
   bool operator!=(Elf_Note_Iterator_Impl Other) const {
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 02d62e8e48798085668d34f9144b64f8ba8fccb2..036c99cb6baff0d5f967302b40f965dff391c38c 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -104,13 +104,25 @@ public:
   uint64_t getAlignment() const;
 
   bool isCompressed() const;
+  /// Whether this section contains instructions.
   bool isText() const;
+  /// Whether this section contains data, not instructions.
   bool isData() const;
+  /// Whether this section contains BSS uninitialized data.
   bool isBSS() const;
   bool isVirtual() const;
   bool isBitcode() const;
   bool isStripped() const;
 
+  /// Whether this section will be placed in the text segment, according to the
+  /// Berkeley size format. This is true if the section is allocatable, and
+  /// contains either code or readonly data.
+  bool isBerkeleyText() const;
+  /// Whether this section will be placed in the data segment, according to the
+  /// Berkeley size format. This is true if the section is allocatable and
+  /// contains data (e.g. PROGBITS), but is not text.
+  bool isBerkeleyData() const;
+
   bool containsSymbol(SymbolRef S) const;
 
   relocation_iterator relocation_begin() const;
@@ -238,6 +250,8 @@ protected:
   virtual bool isSectionVirtual(DataRefImpl Sec) const = 0;
   virtual bool isSectionBitcode(DataRefImpl Sec) const;
   virtual bool isSectionStripped(DataRefImpl Sec) const;
+  virtual bool isBerkeleyText(DataRefImpl Sec) const;
+  virtual bool isBerkeleyData(DataRefImpl Sec) const;
   virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0;
   virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0;
   virtual section_iterator getRelocatedSection(DataRefImpl Sec) const;
@@ -449,6 +463,14 @@ inline bool SectionRef::isStripped() const {
   return OwningObject->isSectionStripped(SectionPimpl);
 }
 
+inline bool SectionRef::isBerkeleyText() const {
+  return OwningObject->isBerkeleyText(SectionPimpl);
+}
+
+inline bool SectionRef::isBerkeleyData() const {
+  return OwningObject->isBerkeleyData(SectionPimpl);
+}
+
 inline relocation_iterator SectionRef::relocation_begin() const {
   return OwningObject->section_rel_begin(SectionPimpl);
 }
diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
index 008e109f6679e58c4e49c491b9f17652183f86e0..9a978de2e59968e354ce01e3b8f6828beb0afcc3 100644
--- a/include/llvm/Object/RelocVisitor.h
+++ b/include/llvm/Object/RelocVisitor.h
@@ -129,6 +129,8 @@ private:
     case ELF::R_X86_64_NONE:
       return 0;
     case ELF::R_X86_64_64:
+    case ELF::R_X86_64_DTPOFF32:
+    case ELF::R_X86_64_DTPOFF64:
       return Value + getELFAddend(R);
     case ELF::R_X86_64_PC32:
       return Value + getELFAddend(R) - R.getOffset();
@@ -333,6 +335,7 @@ private:
       case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
       case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32:
       case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32:
+      case wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB:
         // For wasm section, its offset at 0 -- ignoring Value
         return 0;
       }
diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h
index 25903fc994543803b86b3e697784b285891db1f3..ed857652a0486149658e157987586f7693e4e2b0 100644
--- a/include/llvm/Object/Wasm.h
+++ b/include/llvm/Object/Wasm.h
@@ -37,13 +37,16 @@ namespace object {
 class WasmSymbol {
 public:
   WasmSymbol(const wasm::WasmSymbolInfo &Info,
-             const wasm::WasmSignature *FunctionType,
-             const wasm::WasmGlobalType *GlobalType)
-      : Info(Info), FunctionType(FunctionType), GlobalType(GlobalType) {}
+             const wasm::WasmGlobalType *GlobalType,
+             const wasm::WasmEventType *EventType,
+             const wasm::WasmSignature *Signature)
+      : Info(Info), GlobalType(GlobalType), EventType(EventType),
+        Signature(Signature) {}
 
   const wasm::WasmSymbolInfo &Info;
-  const wasm::WasmSignature *FunctionType;
   const wasm::WasmGlobalType *GlobalType;
+  const wasm::WasmEventType *EventType;
+  const wasm::WasmSignature *Signature;
 
   bool isTypeFunction() const {
     return Info.Kind == wasm::WASM_SYMBOL_TYPE_FUNCTION;
@@ -59,6 +62,8 @@ public:
     return Info.Kind == wasm::WASM_SYMBOL_TYPE_SECTION;
   }
 
+  bool isTypeEvent() const { return Info.Kind == wasm::WASM_SYMBOL_TYPE_EVENT; }
+
   bool isDefined() const { return !isUndefined(); }
 
   bool isUndefined() const {
@@ -124,12 +129,14 @@ public:
 
   static bool classof(const Binary *v) { return v->isWasm(); }
 
+  const wasm::WasmDylinkInfo &dylinkInfo() const { return DylinkInfo; }
   ArrayRef<wasm::WasmSignature> types() const { return Signatures; }
   ArrayRef<uint32_t> functionTypes() const { return FunctionTypes; }
   ArrayRef<wasm::WasmImport> imports() const { return Imports; }
   ArrayRef<wasm::WasmTable> tables() const { return Tables; }
   ArrayRef<wasm::WasmLimits> memories() const { return Memories; }
   ArrayRef<wasm::WasmGlobal> globals() const { return Globals; }
+  ArrayRef<wasm::WasmEvent> events() const { return Events; }
   ArrayRef<wasm::WasmExport> exports() const { return Exports; }
   ArrayRef<WasmSymbol> syms() const { return Symbols; }
   const wasm::WasmLinkingData &linkingData() const { return LinkingData; }
@@ -141,6 +148,7 @@ public:
   uint32_t startFunction() const { return StartFunction; }
   uint32_t getNumImportedGlobals() const { return NumImportedGlobals; }
   uint32_t getNumImportedFunctions() const { return NumImportedFunctions; }
+  uint32_t getNumImportedEvents() const { return NumImportedEvents; }
 
   void moveSymbolNext(DataRefImpl &Symb) const override;
 
@@ -193,6 +201,7 @@ public:
   Triple::ArchType getArch() const override;
   SubtargetFeatures getFeatures() const override;
   bool isRelocatableObject() const override;
+  bool isSharedObject() const;
 
   struct ReadContext {
     const uint8_t *Start;
@@ -205,12 +214,16 @@ private:
   bool isDefinedFunctionIndex(uint32_t Index) const;
   bool isValidGlobalIndex(uint32_t Index) const;
   bool isDefinedGlobalIndex(uint32_t Index) const;
+  bool isValidEventIndex(uint32_t Index) const;
+  bool isDefinedEventIndex(uint32_t Index) const;
   bool isValidFunctionSymbol(uint32_t Index) const;
   bool isValidGlobalSymbol(uint32_t Index) const;
+  bool isValidEventSymbol(uint32_t Index) const;
   bool isValidDataSymbol(uint32_t Index) const;
   bool isValidSectionSymbol(uint32_t Index) const;
   wasm::WasmFunction &getDefinedFunction(uint32_t Index);
   wasm::WasmGlobal &getDefinedGlobal(uint32_t Index);
+  wasm::WasmEvent &getDefinedEvent(uint32_t Index);
 
   const WasmSection &getWasmSection(DataRefImpl Ref) const;
   const wasm::WasmRelocation &getWasmRelocation(DataRefImpl Ref) const;
@@ -226,6 +239,7 @@ private:
   Error parseTableSection(ReadContext &Ctx);
   Error parseMemorySection(ReadContext &Ctx);
   Error parseGlobalSection(ReadContext &Ctx);
+  Error parseEventSection(ReadContext &Ctx);
   Error parseExportSection(ReadContext &Ctx);
   Error parseStartSection(ReadContext &Ctx);
   Error parseElemSection(ReadContext &Ctx);
@@ -233,6 +247,7 @@ private:
   Error parseDataSection(ReadContext &Ctx);
 
   // Custom section types
+  Error parseDylinkSection(ReadContext &Ctx);
   Error parseNameSection(ReadContext &Ctx);
   Error parseLinkingSection(ReadContext &Ctx);
   Error parseLinkingSectionSymtab(ReadContext &Ctx);
@@ -241,11 +256,13 @@ private:
 
   wasm::WasmObjectHeader Header;
   std::vector<WasmSection> Sections;
+  wasm::WasmDylinkInfo DylinkInfo;
   std::vector<wasm::WasmSignature> Signatures;
   std::vector<uint32_t> FunctionTypes;
   std::vector<wasm::WasmTable> Tables;
   std::vector<wasm::WasmLimits> Memories;
   std::vector<wasm::WasmGlobal> Globals;
+  std::vector<wasm::WasmEvent> Events;
   std::vector<wasm::WasmImport> Imports;
   std::vector<wasm::WasmExport> Exports;
   std::vector<wasm::WasmElemSegment> ElemSegments;
@@ -255,12 +272,58 @@ private:
   std::vector<wasm::WasmFunctionName> DebugNames;
   uint32_t StartFunction = -1;
   bool HasLinkingSection = false;
+  bool HasDylinkSection = false;
   wasm::WasmLinkingData LinkingData;
   uint32_t NumImportedGlobals = 0;
   uint32_t NumImportedFunctions = 0;
+  uint32_t NumImportedEvents = 0;
   uint32_t CodeSection = 0;
   uint32_t DataSection = 0;
   uint32_t GlobalSection = 0;
+  uint32_t EventSection = 0;
+};
+
+class WasmSectionOrderChecker {
+public:
+  // We define orders for all core wasm sections and known custom sections.
+  enum : int {
+    // Core sections
+    // The order of standard sections is precisely given by the spec.
+    WASM_SEC_ORDER_TYPE = 1,
+    WASM_SEC_ORDER_IMPORT = 2,
+    WASM_SEC_ORDER_FUNCTION = 3,
+    WASM_SEC_ORDER_TABLE = 4,
+    WASM_SEC_ORDER_MEMORY = 5,
+    WASM_SEC_ORDER_GLOBAL = 6,
+    WASM_SEC_ORDER_EVENT = 7,
+    WASM_SEC_ORDER_EXPORT = 8,
+    WASM_SEC_ORDER_START = 9,
+    WASM_SEC_ORDER_ELEM = 10,
+    WASM_SEC_ORDER_DATACOUNT = 11,
+    WASM_SEC_ORDER_CODE = 12,
+    WASM_SEC_ORDER_DATA = 13,
+
+    // Custom sections
+    // "dylink" should be the very first section in the module
+    WASM_SEC_ORDER_DYLINK = 0,
+    // "linking" section requires DATA section in order to validate data symbols
+    WASM_SEC_ORDER_LINKING = 100,
+    // Must come after "linking" section in order to validate reloc indexes.
+    WASM_SEC_ORDER_RELOC = 101,
+    // "name" section must appear after DATA. Comes after "linking" to allow
+    // symbol table to set default function name.
+    WASM_SEC_ORDER_NAME = 102,
+    // "producers" section must appear after "name" section.
+    WASM_SEC_ORDER_PRODUCERS = 103
+  };
+
+  bool isValidSectionOrder(unsigned ID, StringRef CustomSectionName = "");
+
+private:
+  int LastOrder = -1; // Lastly seen known section's order
+
+  // Returns -1 for unknown sections.
+  int getSectionOrder(unsigned ID, StringRef CustomSectionName = "");
 };
 
 } // end namespace object
diff --git a/include/llvm/ObjectYAML/WasmYAML.h b/include/llvm/ObjectYAML/WasmYAML.h
index 8cd08e520560b50a6714e3a86b436331f84b5861..406dd7cb515f745fff27a26ef9d16d777c55ec86 100644
--- a/include/llvm/ObjectYAML/WasmYAML.h
+++ b/include/llvm/ObjectYAML/WasmYAML.h
@@ -74,6 +74,12 @@ struct Global {
   wasm::WasmInitExpr InitExpr;
 };
 
+struct Event {
+  uint32_t Index;
+  uint32_t Attribute;
+  uint32_t SigIndex;
+};
+
 struct Import {
   StringRef Module;
   StringRef Field;
@@ -83,6 +89,7 @@ struct Import {
     Global GlobalImport;
     Table TableImport;
     Limits Memory;
+    Event EventImport;
   };
 };
 
@@ -176,6 +183,21 @@ struct CustomSection : Section {
   yaml::BinaryRef Payload;
 };
 
+struct DylinkSection : CustomSection {
+  DylinkSection() : CustomSection("dylink") {}
+
+  static bool classof(const Section *S) {
+    auto C = dyn_cast<CustomSection>(S);
+    return C && C->Name == "dylink";
+  }
+
+  uint32_t MemorySize;
+  uint32_t MemoryAlignment;
+  uint32_t TableSize;
+  uint32_t TableAlignment;
+  std::vector<StringRef> Needed;
+};
+
 struct NameSection : CustomSection {
   NameSection() : CustomSection("name") {}
 
@@ -262,6 +284,16 @@ struct GlobalSection : Section {
   std::vector<Global> Globals;
 };
 
+struct EventSection : Section {
+  EventSection() : Section(wasm::WASM_SEC_EVENT) {}
+
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_EVENT;
+  }
+
+  std::vector<Event> Events;
+};
+
 struct ExportSection : Section {
   ExportSection() : Section(wasm::WASM_SEC_EXPORT) {}
 
@@ -339,6 +371,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::SymbolInfo)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::InitFunction)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::ComdatEntry)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Comdat)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Event)
 
 namespace llvm {
 namespace yaml {
@@ -471,6 +504,10 @@ template <> struct ScalarEnumerationTraits<WasmYAML::RelocType> {
   static void enumeration(IO &IO, WasmYAML::RelocType &Kind);
 };
 
+template <> struct MappingTraits<WasmYAML::Event> {
+  static void mapping(IO &IO, WasmYAML::Event &Event);
+};
+
 } // end namespace yaml
 } // end namespace llvm
 
diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h
index 22e5eb0caa0fa9ff5608a6b1439b17bbac0fd374..fa59345a02cf72126f7251f17baa51ddf0834389 100644
--- a/include/llvm/Passes/PassBuilder.h
+++ b/include/llvm/Passes/PassBuilder.h
@@ -501,6 +501,18 @@ public:
     PipelineStartEPCallbacks.push_back(C);
   }
 
+  /// Register a callback for a default optimizer pipeline extension point
+  ///
+  /// This extension point allows adding optimizations at the very end of the
+  /// function optimization pipeline. A key difference between this and the
+  /// legacy PassManager's OptimizerLast callback is that this extension point
+  /// is not triggered at O0. Extensions to the O0 pipeline should append their
+  /// passes to the end of the overall pipeline.
+  void registerOptimizerLastEPCallback(
+      const std::function<void(FunctionPassManager &, OptimizationLevel)> &C) {
+    OptimizerLastEPCallbacks.push_back(C);
+  }
+
   /// Register a callback for parsing an AliasAnalysis Name to populate
   /// the given AAManager \p AA
   void registerParseAACallback(
@@ -614,6 +626,8 @@ private:
       CGSCCOptimizerLateEPCallbacks;
   SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
       VectorizerStartEPCallbacks;
+  SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
+      OptimizerLastEPCallbacks;
   // Module callbacks
   SmallVector<std::function<void(ModulePassManager &)>, 2>
       PipelineStartEPCallbacks;
diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h
index 3c477cc347147ec6a62090604c2a9e37098e769f..5cc729e42cc88f218dd37496ea5f9f4cda1e7401 100644
--- a/include/llvm/ProfileData/SampleProfReader.h
+++ b/include/llvm/ProfileData/SampleProfReader.h
@@ -548,6 +548,9 @@ public:
       : SampleProfileReader(std::move(B), C, Underlying->getFormat()) {
     Profiles = std::move(Underlying->getProfiles());
     Summary = takeSummary(*Underlying);
+    // Keep the underlying reader alive; the profile data may contain
+    // StringRefs referencing names in its name table.
+    UnderlyingReader = std::move(Underlying);
   }
 
   /// Create a remapped sample profile from the given remapping file and
@@ -569,6 +572,7 @@ public:
 private:
   SymbolRemappingReader Remappings;
   DenseMap<SymbolRemappingReader::Key, FunctionSamples*> SampleMap;
+  std::unique_ptr<SampleProfileReader> UnderlyingReader;
 };
 
 } // end namespace sampleprof
diff --git a/include/llvm/Support/AArch64TargetParser.def b/include/llvm/Support/AArch64TargetParser.def
index 65bfee81bf45526989f6a47cd4d3dab68e3f08c4..6a7d16adefc6ad81a77228b831ccd7db9023abf4 100644
--- a/include/llvm/Support/AArch64TargetParser.def
+++ b/include/llvm/Support/AArch64TargetParser.def
@@ -72,6 +72,7 @@ AARCH64_ARCH_EXT_NAME("sve",      AArch64::AEK_SVE,      "+sve",  "-sve")
 AARCH64_ARCH_EXT_NAME("rcpc",     AArch64::AEK_RCPC,     "+rcpc", "-rcpc")
 AARCH64_ARCH_EXT_NAME("rng",      AArch64::AEK_RAND,     "+rand",  "-rand")
 AARCH64_ARCH_EXT_NAME("memtag",   AArch64::AEK_MTE,      "+mte",   "-mte")
+AARCH64_ARCH_EXT_NAME("ssbs",     AArch64::AEK_SSBS,     "+ssbs",  "-ssbs")
 #undef AARCH64_ARCH_EXT_NAME
 
 #ifndef AARCH64_CPU_NAME
@@ -117,6 +118,9 @@ AARCH64_CPU_NAME("thunderxt81", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
                 (AArch64::AEK_CRC | AArch64::AEK_PROFILE))
 AARCH64_CPU_NAME("thunderxt83", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
                 (AArch64::AEK_CRC | AArch64::AEK_PROFILE))
+AARCH64_CPU_NAME("tsv110", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
+                 (AArch64::AEK_PROFILE | AArch64::AEK_FP16 | AArch64::AEK_FP16FML |
+                  AArch64::AEK_DOTPROD))
 // Invalid CPU
 AARCH64_CPU_NAME("invalid", INVALID, FK_INVALID, true, AArch64::AEK_INVALID)
 #undef AARCH64_CPU_NAME
diff --git a/include/llvm/Support/AArch64TargetParser.h b/include/llvm/Support/AArch64TargetParser.h
new file mode 100644
index 0000000000000000000000000000000000000000..aea406217c46a09aa15f5bd58af5f25b62ef393e
--- /dev/null
+++ b/include/llvm/Support/AArch64TargetParser.h
@@ -0,0 +1,122 @@
+//===-- AArch64TargetParser - Parser for AArch64 features -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise AArch64 hardware features
+// such as FPU/CPU/ARCH and extension names.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_AARCH64TARGETPARSERCOMMON_H
+#define LLVM_SUPPORT_AARCH64TARGETPARSERCOMMON_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/ARMTargetParser.h"
+#include <vector>
+
+// FIXME:This should be made into class design,to avoid dupplication.
+namespace llvm {
+namespace AArch64 {
+
+// Arch extension modifiers for CPUs.
+enum ArchExtKind : unsigned {
+  AEK_INVALID =     0,
+  AEK_NONE =        1,
+  AEK_CRC =         1 << 1,
+  AEK_CRYPTO =      1 << 2,
+  AEK_FP =          1 << 3,
+  AEK_SIMD =        1 << 4,
+  AEK_FP16 =        1 << 5,
+  AEK_PROFILE =     1 << 6,
+  AEK_RAS =         1 << 7,
+  AEK_LSE =         1 << 8,
+  AEK_SVE =         1 << 9,
+  AEK_DOTPROD =     1 << 10,
+  AEK_RCPC =        1 << 11,
+  AEK_RDM =         1 << 12,
+  AEK_SM4 =         1 << 13,
+  AEK_SHA3 =        1 << 14,
+  AEK_SHA2 =        1 << 15,
+  AEK_AES =         1 << 16,
+  AEK_FP16FML =     1 << 17,
+  AEK_RAND =        1 << 18,
+  AEK_MTE =         1 << 19,
+  AEK_SSBS =        1 << 20,
+};
+
+enum class ArchKind {
+#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID,
+#include "AArch64TargetParser.def"
+};
+
+const ARM::ArchNames<ArchKind> AArch64ARCHNames[] = {
+#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU,        \
+                     ARCH_BASE_EXT)                                            \
+  {NAME,                                                                       \
+   sizeof(NAME) - 1,                                                           \
+   CPU_ATTR,                                                                   \
+   sizeof(CPU_ATTR) - 1,                                                       \
+   SUB_ARCH,                                                                   \
+   sizeof(SUB_ARCH) - 1,                                                       \
+   ARM::FPUKind::ARCH_FPU,                                                     \
+   ARCH_BASE_EXT,                                                              \
+   AArch64::ArchKind::ID,                                                      \
+   ARCH_ATTR},
+#include "AArch64TargetParser.def"
+};
+
+const ARM::ExtName AArch64ARCHExtNames[] = {
+#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE)                   \
+  {NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE},
+#include "AArch64TargetParser.def"
+};
+
+const ARM::CpuNames<ArchKind> AArch64CPUNames[] = {
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)       \
+  {NAME, sizeof(NAME) - 1, AArch64::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT},
+#include "AArch64TargetParser.def"
+};
+
+const ArchKind ArchKinds[] = {
+#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \
+    ArchKind::ID,
+#include "AArch64TargetParser.def"
+};
+
+// FIXME: These should be moved to TargetTuple once it exists
+bool getExtensionFeatures(unsigned Extensions,
+                          std::vector<StringRef> &Features);
+bool getArchFeatures(ArchKind AK, std::vector<StringRef> &Features);
+
+StringRef getArchName(ArchKind AK);
+unsigned getArchAttr(ArchKind AK);
+StringRef getCPUAttr(ArchKind AK);
+StringRef getSubArch(ArchKind AK);
+StringRef getArchExtName(unsigned ArchExtKind);
+StringRef getArchExtFeature(StringRef ArchExt);
+
+// Information by Name
+unsigned getDefaultFPU(StringRef CPU, ArchKind AK);
+unsigned getDefaultExtensions(StringRef CPU, ArchKind AK);
+StringRef getDefaultCPU(StringRef Arch);
+ArchKind getCPUArchKind(StringRef CPU);
+
+// Parser
+ArchKind parseArch(StringRef Arch);
+ArchExtKind parseArchExt(StringRef ArchExt);
+ArchKind parseCPUArch(StringRef CPU);
+// Used by target parser tests
+void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
+
+bool isX18ReservedByDefault(const Triple &TT);
+
+} // namespace AArch64
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/Support/AMDGPUMetadata.h b/include/llvm/Support/AMDGPUMetadata.h
index 667fb3f3da436c146eac3453293cc601060581c2..84851c07499d43f4078f0fa6e403980371f7ce0f 100644
--- a/include/llvm/Support/AMDGPUMetadata.h
+++ b/include/llvm/Support/AMDGPUMetadata.h
@@ -431,6 +431,21 @@ std::error_code fromString(std::string String, Metadata &HSAMetadata);
 /// Converts \p HSAMetadata to \p String.
 std::error_code toString(Metadata HSAMetadata, std::string &String);
 
+//===----------------------------------------------------------------------===//
+// HSA metadata for v3 code object.
+//===----------------------------------------------------------------------===//
+namespace V3 {
+/// HSA metadata major version.
+constexpr uint32_t VersionMajor = 1;
+/// HSA metadata minor version.
+constexpr uint32_t VersionMinor = 0;
+
+/// HSA metadata beginning assembler directive.
+constexpr char AssemblerDirectiveBegin[] = ".amdgpu_metadata";
+/// HSA metadata ending assembler directive.
+constexpr char AssemblerDirectiveEnd[] = ".end_amdgpu_metadata";
+} // end namespace V3
+
 } // end namespace HSAMD
 
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/Support/ARMTargetParser.h b/include/llvm/Support/ARMTargetParser.h
new file mode 100644
index 0000000000000000000000000000000000000000..e41675a1f6087194d3e071bf0a44ab0d9d1b11f5
--- /dev/null
+++ b/include/llvm/Support/ARMTargetParser.h
@@ -0,0 +1,263 @@
+//===-- ARMTargetParser - Parser for ARM target features --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise ARM hardware features
+// such as FPU/CPU/ARCH/extensions and specific support such as HWDIV.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ARMTARGETPARSER_H
+#define LLVM_SUPPORT_ARMTARGETPARSER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include <vector>
+
+namespace llvm {
+namespace ARM {
+
+// Arch extension modifiers for CPUs.
+// Note that this is not the same as the AArch64 list
+enum ArchExtKind : unsigned {
+  AEK_INVALID =     0,
+  AEK_NONE =        1,
+  AEK_CRC =         1 << 1,
+  AEK_CRYPTO =      1 << 2,
+  AEK_FP =          1 << 3,
+  AEK_HWDIVTHUMB =  1 << 4,
+  AEK_HWDIVARM =    1 << 5,
+  AEK_MP =          1 << 6,
+  AEK_SIMD =        1 << 7,
+  AEK_SEC =         1 << 8,
+  AEK_VIRT =        1 << 9,
+  AEK_DSP =         1 << 10,
+  AEK_FP16 =        1 << 11,
+  AEK_RAS =         1 << 12,
+  AEK_SVE =         1 << 13,
+  AEK_DOTPROD =     1 << 14,
+  AEK_SHA2    =     1 << 15,
+  AEK_AES     =     1 << 16,
+  AEK_FP16FML =     1 << 17,
+  // Unsupported extensions.
+  AEK_OS = 0x8000000,
+  AEK_IWMMXT = 0x10000000,
+  AEK_IWMMXT2 = 0x20000000,
+  AEK_MAVERICK = 0x40000000,
+  AEK_XSCALE = 0x80000000,
+};
+
+// List of Arch Extension names.
+// FIXME: TableGen this.
+struct ExtName {
+  const char *NameCStr;
+  size_t NameLength;
+  unsigned ID;
+  const char *Feature;
+  const char *NegFeature;
+
+  StringRef getName() const { return StringRef(NameCStr, NameLength); }
+};
+
+const ExtName ARCHExtNames[] = {
+#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE)                       \
+  {NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE},
+#include "ARMTargetParser.def"
+};
+
+// List of HWDiv names (use getHWDivSynonym) and which architectural
+// features they correspond to (use getHWDivFeatures).
+// FIXME: TableGen this.
+const struct {
+  const char *NameCStr;
+  size_t NameLength;
+  unsigned ID;
+
+  StringRef getName() const { return StringRef(NameCStr, NameLength); }
+} HWDivNames[] = {
+#define ARM_HW_DIV_NAME(NAME, ID) {NAME, sizeof(NAME) - 1, ID},
+#include "ARMTargetParser.def"
+};
+
+// Arch names.
+enum class ArchKind {
+#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID,
+#include "ARMTargetParser.def"
+};
+
+// List of CPU names and their arches.
+// The same CPU can have multiple arches and can be default on multiple arches.
+// When finding the Arch for a CPU, first-found prevails. Sort them accordingly.
+// When this becomes table-generated, we'd probably need two tables.
+// FIXME: TableGen this.
+template <typename T> struct CpuNames {
+  const char *NameCStr;
+  size_t NameLength;
+  T ArchID;
+  bool Default; // is $Name the default CPU for $ArchID ?
+  unsigned DefaultExtensions;
+
+  StringRef getName() const { return StringRef(NameCStr, NameLength); }
+};
+
+const CpuNames<ArchKind> CPUNames[] = {
+#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)           \
+  {NAME, sizeof(NAME) - 1, ARM::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT},
+#include "ARMTargetParser.def"
+};
+
+// FPU names.
+enum FPUKind {
+#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) KIND,
+#include "ARMTargetParser.def"
+  FK_LAST
+};
+
+// FPU Version
+enum class FPUVersion {
+  NONE,
+  VFPV2,
+  VFPV3,
+  VFPV3_FP16,
+  VFPV4,
+  VFPV5
+};
+
+// An FPU name restricts the FPU in one of three ways:
+enum class FPURestriction {
+  None = 0, ///< No restriction
+  D16,      ///< Only 16 D registers
+  SP_D16    ///< Only single-precision instructions, with 16 D registers
+};
+
+// An FPU name implies one of three levels of Neon support:
+enum class NeonSupportLevel {
+  None = 0, ///< No Neon
+  Neon,     ///< Neon
+  Crypto    ///< Neon with Crypto
+};
+
+// ISA kinds.
+enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 };
+
+// Endianness
+// FIXME: BE8 vs. BE32?
+enum class EndianKind { INVALID = 0, LITTLE, BIG };
+
+// v6/v7/v8 Profile
+enum class ProfileKind { INVALID = 0, A, R, M };
+
+// List of canonical FPU names (use getFPUSynonym) and which architectural
+// features they correspond to (use getFPUFeatures).
+// FIXME: TableGen this.
+// The entries must appear in the order listed in ARM::FPUKind for correct
+// indexing
+struct FPUName {
+  const char *NameCStr;
+  size_t NameLength;
+  FPUKind ID;
+  FPUVersion FPUVer;
+  NeonSupportLevel NeonSupport;
+  FPURestriction Restriction;
+
+  StringRef getName() const { return StringRef(NameCStr, NameLength); }
+};
+
+static const FPUName FPUNames[] = {
+#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION)                \
+  {NAME, sizeof(NAME) - 1, KIND, VERSION, NEON_SUPPORT, RESTRICTION},
+#include "llvm/Support/ARMTargetParser.def"
+};
+
+// List of canonical arch names (use getArchSynonym).
+// This table also provides the build attribute fields for CPU arch
+// and Arch ID, according to the Addenda to the ARM ABI, chapters
+// 2.4 and 2.3.5.2 respectively.
+// FIXME: SubArch values were simplified to fit into the expectations
+// of the triples and are not conforming with their official names.
+// Check to see if the expectation should be changed.
+// FIXME: TableGen this.
+template <typename T> struct ArchNames {
+  const char *NameCStr;
+  size_t NameLength;
+  const char *CPUAttrCStr;
+  size_t CPUAttrLength;
+  const char *SubArchCStr;
+  size_t SubArchLength;
+  unsigned DefaultFPU;
+  unsigned ArchBaseExtensions;
+  T ID;
+  ARMBuildAttrs::CPUArch ArchAttr; // Arch ID in build attributes.
+
+  StringRef getName() const { return StringRef(NameCStr, NameLength); }
+
+  // CPU class in build attributes.
+  StringRef getCPUAttr() const { return StringRef(CPUAttrCStr, CPUAttrLength); }
+
+  // Sub-Arch name.
+  StringRef getSubArch() const { return StringRef(SubArchCStr, SubArchLength); }
+};
+
+static const ArchNames<ArchKind> ARCHNames[] = {
+#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU,            \
+                 ARCH_BASE_EXT)                                                \
+  {NAME,         sizeof(NAME) - 1,                                             \
+   CPU_ATTR,     sizeof(CPU_ATTR) - 1,                                         \
+   SUB_ARCH,     sizeof(SUB_ARCH) - 1,                                         \
+   ARCH_FPU,     ARCH_BASE_EXT,                                                \
+   ArchKind::ID, ARCH_ATTR},
+#include "llvm/Support/ARMTargetParser.def"
+};
+
+// Information by ID
+StringRef getFPUName(unsigned FPUKind);
+FPUVersion getFPUVersion(unsigned FPUKind);
+NeonSupportLevel getFPUNeonSupportLevel(unsigned FPUKind);
+FPURestriction getFPURestriction(unsigned FPUKind);
+
+// FIXME: These should be moved to TargetTuple once it exists
+bool getFPUFeatures(unsigned FPUKind, std::vector<StringRef> &Features);
+bool getHWDivFeatures(unsigned HWDivKind, std::vector<StringRef> &Features);
+bool getExtensionFeatures(unsigned Extensions,
+                          std::vector<StringRef> &Features);
+
+StringRef getArchName(ArchKind AK);
+unsigned getArchAttr(ArchKind AK);
+StringRef getCPUAttr(ArchKind AK);
+StringRef getSubArch(ArchKind AK);
+StringRef getArchExtName(unsigned ArchExtKind);
+StringRef getArchExtFeature(StringRef ArchExt);
+StringRef getHWDivName(unsigned HWDivKind);
+
+// Information by Name
+unsigned getDefaultFPU(StringRef CPU, ArchKind AK);
+unsigned getDefaultExtensions(StringRef CPU, ArchKind AK);
+StringRef getDefaultCPU(StringRef Arch);
+StringRef getCanonicalArchName(StringRef Arch);
+StringRef getFPUSynonym(StringRef FPU);
+StringRef getArchSynonym(StringRef Arch);
+
+// Parser
+unsigned parseHWDiv(StringRef HWDiv);
+unsigned parseFPU(StringRef FPU);
+ArchKind parseArch(StringRef Arch);
+unsigned parseArchExt(StringRef ArchExt);
+ArchKind parseCPUArch(StringRef CPU);
+ISAKind parseArchISA(StringRef Arch);
+EndianKind parseArchEndian(StringRef Arch);
+ProfileKind parseArchProfile(StringRef Arch);
+unsigned parseArchVersion(StringRef Arch);
+
+void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
+StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
+
+} // namespace ARM
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h
index b1b351863b99a79bc0170b5ab90771f420d6ee67..42d08378a6774a1da39ca28782eb5dc17fb622c5 100644
--- a/include/llvm/Support/Allocator.h
+++ b/include/llvm/Support/Allocator.h
@@ -311,6 +311,33 @@ public:
     return None;
   }
 
+  /// A wrapper around identifyObject that additionally asserts that
+  /// the object is indeed within the allocator.
+  /// \return An index uniquely and reproducibly identifying
+  /// an input pointer \p Ptr in the given allocator.
+  int64_t identifyKnownObject(const void *Ptr) {
+    Optional<int64_t> Out = identifyObject(Ptr);
+    assert(Out && "Wrong allocator used");
+    return *Out;
+  }
+
+  /// A wrapper around identifyKnownObject. Accepts type information
+  /// about the object and produces a smaller identifier by relying on
+  /// the alignment information. Note that sub-classes may have different
+  /// alignment, so the most base class should be passed as template parameter
+  /// in order to obtain correct results. For that reason automatic template
+  /// parameter deduction is disabled.
+  /// \return An index uniquely and reproducibly identifying
+  /// an input pointer \p Ptr in the given allocator. This identifier is
+  /// different from the ones produced by identifyObject and
+  /// identifyAlignedObject.
+  template <typename T>
+  int64_t identifyKnownAlignedObject(const void *Ptr) {
+    int64_t Out = identifyKnownObject(Ptr);
+    assert(Out % alignof(T) == 0 && "Wrong alignment information");
+    return Out / alignof(T);
+  }
+
   size_t getTotalMemory() const {
     size_t TotalMemory = 0;
     for (auto I = Slabs.begin(), E = Slabs.end(); I != E; ++I)
diff --git a/include/llvm/Support/BinaryStreamArray.h b/include/llvm/Support/BinaryStreamArray.h
index 7b8a95b45735b3226724f64b83c2eeda5063bd16..7c110fcb6a4b1be8bdeadfeb05725f21ded8aacb 100644
--- a/include/llvm/Support/BinaryStreamArray.h
+++ b/include/llvm/Support/BinaryStreamArray.h
@@ -96,21 +96,32 @@ public:
 
   explicit VarStreamArray(const Extractor &E) : E(E) {}
 
-  explicit VarStreamArray(BinaryStreamRef Stream) : Stream(Stream) {}
+  explicit VarStreamArray(BinaryStreamRef Stream, uint32_t Skew = 0)
+      : Stream(Stream), Skew(Skew) {}
 
-  VarStreamArray(BinaryStreamRef Stream, const Extractor &E)
-      : Stream(Stream), E(E) {}
+  VarStreamArray(BinaryStreamRef Stream, const Extractor &E, uint32_t Skew = 0)
+      : Stream(Stream), E(E), Skew(Skew) {}
 
   Iterator begin(bool *HadError = nullptr) const {
-    return Iterator(*this, E, HadError);
+    return Iterator(*this, E, Skew, nullptr);
   }
 
   bool valid() const { return Stream.valid(); }
 
+  uint32_t skew() const { return Skew; }
   Iterator end() const { return Iterator(E); }
 
   bool empty() const { return Stream.getLength() == 0; }
 
+  VarStreamArray<ValueType, Extractor> substream(uint32_t Begin,
+                                                 uint32_t End) const {
+    assert(Begin >= Skew);
+    // We should never cut off the beginning of the stream since it might be
+    // skewed, meaning the initial bytes are important.
+    BinaryStreamRef NewStream = Stream.slice(0, End);
+    return {NewStream, E, Begin};
+  }
+
   /// given an offset into the array's underlying stream, return an
   /// iterator to the record at that offset.  This is considered unsafe
   /// since the behavior is undefined if \p Offset does not refer to the
@@ -123,13 +134,17 @@ public:
   Extractor &getExtractor() { return E; }
 
   BinaryStreamRef getUnderlyingStream() const { return Stream; }
-  void setUnderlyingStream(BinaryStreamRef S) { Stream = S; }
+  void setUnderlyingStream(BinaryStreamRef S, uint32_t Skew = 0) {
+    Stream = S;
+    this->Skew = Skew;
+  }
 
-  void drop_front() { Stream = Stream.drop_front(begin()->length()); }
+  void drop_front() { Skew += begin()->length(); }
 
 private:
   BinaryStreamRef Stream;
   Extractor E;
+  uint32_t Skew;
 };
 
 template <typename ValueType, typename Extractor>
@@ -140,10 +155,6 @@ class VarStreamArrayIterator
   typedef VarStreamArray<ValueType, Extractor> ArrayType;
 
 public:
-  VarStreamArrayIterator(const ArrayType &Array, const Extractor &E,
-                         bool *HadError)
-      : VarStreamArrayIterator(Array, E, 0, HadError) {}
-
   VarStreamArrayIterator(const ArrayType &Array, const Extractor &E,
                          uint32_t Offset, bool *HadError)
       : IterRef(Array.Stream.drop_front(Offset)), Extract(E),
diff --git a/include/llvm/Support/BinaryStreamReader.h b/include/llvm/Support/BinaryStreamReader.h
index fe77b550c4537184ce60325d29dce7514fe2a39d..392958de30d5656aff5c0aa037a3f8a455930755 100644
--- a/include/llvm/Support/BinaryStreamReader.h
+++ b/include/llvm/Support/BinaryStreamReader.h
@@ -203,11 +203,12 @@ public:
   /// \returns a success error code if the data was successfully read, otherwise
   /// returns an appropriate error code.
   template <typename T, typename U>
-  Error readArray(VarStreamArray<T, U> &Array, uint32_t Size) {
+  Error readArray(VarStreamArray<T, U> &Array, uint32_t Size,
+                  uint32_t Skew = 0) {
     BinaryStreamRef S;
     if (auto EC = readStreamRef(S, Size))
       return EC;
-    Array.setUnderlyingStream(S);
+    Array.setUnderlyingStream(S, Skew);
     return Error::success();
   }
 
diff --git a/include/llvm/Support/BuryPointer.h b/include/llvm/Support/BuryPointer.h
new file mode 100644
index 0000000000000000000000000000000000000000..53f1f395b922947713e4ab5f8a98a8d291f5a343
--- /dev/null
+++ b/include/llvm/Support/BuryPointer.h
@@ -0,0 +1,30 @@
+//===- llvm/Support/BuryPointer.h - Memory Manipulation/Leak ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BURYPOINTER_H
+#define LLVM_SUPPORT_BURYPOINTER_H
+
+#include <memory>
+
+namespace llvm {
+
+// In tools that will exit soon anyway, going through the process of explicitly
+// deallocating resources can be unnecessary - better to leak the resources and
+// let the OS clean them up when the process ends. Use this function to ensure
+// the memory is not misdiagnosed as an unintentional leak by leak detection
+// tools (this is achieved by preserving pointers to the object in a globally
+// visible array).
+void BuryPointer(const void *Ptr);
+template <typename T> void BuryPointer(std::unique_ptr<T> Ptr) {
+  BuryPointer(Ptr.release());
+}
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/Support/Chrono.h b/include/llvm/Support/Chrono.h
index 994068af3771b972e3f336dcceddc7f000d3fb0a..57677e8d5cf149326f6fe32f8a16cd4cb1a9917c 100644
--- a/include/llvm/Support/Chrono.h
+++ b/include/llvm/Support/Chrono.h
@@ -47,6 +47,14 @@ toTimePoint(std::time_t T) {
   return time_point_cast<seconds>(system_clock::from_time_t(T));
 }
 
+/// Convert a std::time_t + nanoseconds to a TimePoint
+LLVM_ATTRIBUTE_ALWAYS_INLINE inline TimePoint<>
+toTimePoint(std::time_t T, uint32_t nsec) {
+  using namespace std::chrono;
+  return time_point_cast<nanoseconds>(system_clock::from_time_t(T))
+    + nanoseconds(nsec);
+}
+
 } // namespace sys
 
 raw_ostream &operator<<(raw_ostream &OS, sys::TimePoint<> TP);
diff --git a/include/llvm/Support/Error.h b/include/llvm/Support/Error.h
index 4e1619019bb4aba8ba4d80478f9b985434304f7f..4962812e750e6007ce0024c1217c82506c7fff64 100644
--- a/include/llvm/Support/Error.h
+++ b/include/llvm/Support/Error.h
@@ -953,10 +953,14 @@ Expected<T> handleExpected(Expected<T> ValOrErr, RecoveryFtor &&RecoveryPath,
 /// will be printed before the first one is logged. A newline will be printed
 /// after each error.
 ///
+/// This function is compatible with the helpers from Support/WithColor.h. You
+/// can pass any of them as the OS. Please consider using them instead of
+/// including 'error: ' in the ErrorBanner.
+///
 /// This is useful in the base level of your program to allow clean termination
 /// (allowing clean deallocation of resources, etc.), while reporting error
 /// information to the user.
-void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner);
+void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner = {});
 
 /// Write all error messages (if any) in E to a string. The newline character
 /// is used to separate error messages.
diff --git a/include/llvm/Support/FileCheck.h b/include/llvm/Support/FileCheck.h
index de8d46bcb8f27b8d180ac895485367d40ae74f40..4061a26e22c50f04abdc801f600fb5d55ccc52bb 100644
--- a/include/llvm/Support/FileCheck.h
+++ b/include/llvm/Support/FileCheck.h
@@ -43,7 +43,8 @@ struct FileCheckRequest {
 //===----------------------------------------------------------------------===//
 
 namespace Check {
-enum FileCheckType {
+
+enum FileCheckKind {
   CheckNone = 0,
   CheckPlain,
   CheckNext,
@@ -58,10 +59,31 @@ enum FileCheckType {
   CheckEOF,
 
   /// Marks when parsing found a -NOT check combined with another CHECK suffix.
-  CheckBadNot
+  CheckBadNot,
+
+  /// Marks when parsing found a -COUNT directive with invalid count value.
+  CheckBadCount
+};
+
+class FileCheckType {
+  FileCheckKind Kind;
+  int Count; ///< optional Count for some checks
+
+public:
+  FileCheckType(FileCheckKind Kind = CheckNone) : Kind(Kind), Count(1) {}
+  FileCheckType(const FileCheckType &) = default;
+
+  operator FileCheckKind() const { return Kind; }
+
+  int getCount() const { return Count; }
+  FileCheckType &setCount(int C);
+
+  std::string getDescription(StringRef Prefix) const;
 };
 }
 
+struct FileCheckDiag;
+
 class FileCheckPattern {
   SMLoc PatternLoc;
 
@@ -105,7 +127,8 @@ public:
                          const StringMap<StringRef> &VariableTable,
                          SMRange MatchRange = None) const;
   void PrintFuzzyMatch(const SourceMgr &SM, StringRef Buffer,
-                       const StringMap<StringRef> &VariableTable) const;
+                       const StringMap<StringRef> &VariableTable,
+                       std::vector<FileCheckDiag> *Diags) const;
 
   bool hasVariable() const {
     return !(VariableUses.empty() && VariableDefs.empty());
@@ -113,6 +136,8 @@ public:
 
   Check::FileCheckType getCheckTy() const { return CheckTy; }
 
+  int getCount() const { return CheckTy.getCount(); }
+
 private:
   bool AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM);
   void AddBackrefToRegEx(unsigned BackrefNum);
@@ -123,6 +148,59 @@ private:
   size_t FindRegexVarEnd(StringRef Str, SourceMgr &SM);
 };
 
+//===----------------------------------------------------------------------===//
+/// Summary of a FileCheck diagnostic.
+//===----------------------------------------------------------------------===//
+
+struct FileCheckDiag {
+  /// What is the FileCheck directive for this diagnostic?
+  Check::FileCheckType CheckTy;
+  /// Where is the FileCheck directive for this diagnostic?
+  unsigned CheckLine, CheckCol;
+  /// What type of match result does this diagnostic describe?
+  ///
+  /// A directive's supplied pattern is said to be either expected or excluded
+  /// depending on whether the pattern must have or must not have a match in
+  /// order for the directive to succeed.  For example, a CHECK directive's
+  /// pattern is expected, and a CHECK-NOT directive's pattern is excluded.
+  /// All match result types whose names end with "Excluded" are for excluded
+  /// patterns, and all others are for expected patterns.
+  ///
+  /// There might be more than one match result for a single pattern.  For
+  /// example, there might be several discarded matches
+  /// (MatchFoundButDiscarded) before either a good match
+  /// (MatchFoundAndExpected) or a failure to match (MatchNoneButExpected),
+  /// and there might be a fuzzy match (MatchFuzzy) after the latter.
+  enum MatchType {
+    /// Indicates a good match for an expected pattern.
+    MatchFoundAndExpected,
+    /// Indicates a match for an excluded pattern.
+    MatchFoundButExcluded,
+    /// Indicates a match for an expected pattern, but the match is on the
+    /// wrong line.
+    MatchFoundButWrongLine,
+    /// Indicates a discarded match for an expected pattern.
+    MatchFoundButDiscarded,
+    /// Indicates no match for an excluded pattern.
+    MatchNoneAndExcluded,
+    /// Indicates no match for an expected pattern, but this might follow good
+    /// matches when multiple matches are expected for the pattern, or it might
+    /// follow discarded matches for the pattern.
+    MatchNoneButExpected,
+    /// Indicates a fuzzy match that serves as a suggestion for the next
+    /// intended match for an expected pattern with too few or no good matches.
+    MatchFuzzy,
+  } MatchTy;
+  /// The search range if MatchTy is MatchNoneAndExcluded or
+  /// MatchNoneButExpected, or the match range otherwise.
+  unsigned InputStartLine;
+  unsigned InputStartCol;
+  unsigned InputEndLine;
+  unsigned InputEndCol;
+  FileCheckDiag(const SourceMgr &SM, const Check::FileCheckType &CheckTy,
+                SMLoc CheckLoc, MatchType MatchTy, SMRange InputRange);
+};
+
 //===----------------------------------------------------------------------===//
 // Check Strings.
 //===----------------------------------------------------------------------===//
@@ -147,18 +225,20 @@ struct FileCheckString {
 
   size_t Check(const SourceMgr &SM, StringRef Buffer, bool IsLabelScanMode,
                size_t &MatchLen, StringMap<StringRef> &VariableTable,
-               FileCheckRequest &Req) const;
+               FileCheckRequest &Req, std::vector<FileCheckDiag> *Diags) const;
 
   bool CheckNext(const SourceMgr &SM, StringRef Buffer) const;
   bool CheckSame(const SourceMgr &SM, StringRef Buffer) const;
   bool CheckNot(const SourceMgr &SM, StringRef Buffer,
                 const std::vector<const FileCheckPattern *> &NotStrings,
                 StringMap<StringRef> &VariableTable,
-                const FileCheckRequest &Req) const;
+                const FileCheckRequest &Req,
+                std::vector<FileCheckDiag> *Diags) const;
   size_t CheckDag(const SourceMgr &SM, StringRef Buffer,
                   std::vector<const FileCheckPattern *> &NotStrings,
                   StringMap<StringRef> &VariableTable,
-                  const FileCheckRequest &Req) const;
+                  const FileCheckRequest &Req,
+                  std::vector<FileCheckDiag> *Diags) const;
 };
 
 /// FileCheck class takes the request and exposes various methods that
@@ -195,7 +275,8 @@ public:
   ///
   /// Returns false if the input fails to satisfy the checks.
   bool CheckInput(SourceMgr &SM, StringRef Buffer,
-                  ArrayRef<FileCheckString> CheckStrings);
+                  ArrayRef<FileCheckString> CheckStrings,
+                  std::vector<FileCheckDiag> *Diags = nullptr);
 };
 } // namespace llvm
 #endif
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index a76eab605a86029b28853ec83986def1a9236816..827e2e91eea218b2e684170e65b2978b71e974ee 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -160,6 +160,8 @@ protected:
   #if defined(LLVM_ON_UNIX)
   time_t fs_st_atime = 0;
   time_t fs_st_mtime = 0;
+  uint32_t fs_st_atime_nsec = 0;
+  uint32_t fs_st_mtime_nsec = 0;
   uid_t fs_st_uid = 0;
   gid_t fs_st_gid = 0;
   off_t fs_st_size = 0;
@@ -180,9 +182,12 @@ public:
   explicit basic_file_status(file_type Type) : Type(Type) {}
 
   #if defined(LLVM_ON_UNIX)
-  basic_file_status(file_type Type, perms Perms, time_t ATime, time_t MTime,
+  basic_file_status(file_type Type, perms Perms, time_t ATime,
+                    uint32_t ATimeNSec, time_t MTime, uint32_t MTimeNSec,
                     uid_t UID, gid_t GID, off_t Size)
-      : fs_st_atime(ATime), fs_st_mtime(MTime), fs_st_uid(UID), fs_st_gid(GID),
+      : fs_st_atime(ATime), fs_st_mtime(MTime),
+        fs_st_atime_nsec(ATimeNSec), fs_st_mtime_nsec(MTimeNSec),
+        fs_st_uid(UID), fs_st_gid(GID),
         fs_st_size(Size), Type(Type), Perms(Perms) {}
 #elif defined(_WIN32)
   basic_file_status(file_type Type, perms Perms, uint32_t LastAccessTimeHigh,
@@ -199,7 +204,20 @@ public:
   // getters
   file_type type() const { return Type; }
   perms permissions() const { return Perms; }
+
+  /// The file access time as reported from the underlying file system.
+  ///
+  /// Also see comments on \c getLastModificationTime() related to the precision
+  /// of the returned value.
   TimePoint<> getLastAccessedTime() const;
+
+  /// The file modification time as reported from the underlying file system.
+  ///
+  /// The returned value allows for nanosecond precision but the actual
+  /// resolution is an implementation detail of the underlying file system.
+  /// There is no guarantee for what kind of resolution you can expect, the
+  /// resolution can differ across platforms and even across mountpoints on the
+  /// same machine.
   TimePoint<> getLastModificationTime() const;
 
   #if defined(LLVM_ON_UNIX)
@@ -247,8 +265,11 @@ public:
 
   #if defined(LLVM_ON_UNIX)
   file_status(file_type Type, perms Perms, dev_t Dev, nlink_t Links, ino_t Ino,
-              time_t ATime, time_t MTime, uid_t UID, gid_t GID, off_t Size)
-      : basic_file_status(Type, Perms, ATime, MTime, UID, GID, Size),
+              time_t ATime, uint32_t ATimeNSec,
+              time_t MTime, uint32_t MTimeNSec,
+              uid_t UID, gid_t GID, off_t Size)
+      : basic_file_status(Type, Perms, ATime, ATimeNSec, MTime, MTimeNSec,
+                          UID, GID, Size),
         fs_st_dev(Dev), fs_st_nlinks(Links), fs_st_ino(Ino) {}
   #elif defined(_WIN32)
   file_status(file_type Type, perms Perms, uint32_t LinkCount,
@@ -349,6 +370,12 @@ std::error_code create_hard_link(const Twine &to, const Twine &from);
 std::error_code real_path(const Twine &path, SmallVectorImpl<char> &output,
                           bool expand_tilde = false);
 
+/// Expands ~ expressions to the user's home directory. On Unix ~user
+/// directories are resolved as well.
+///
+/// @param path The path to resolve.
+void expand_tilde(const Twine &path, SmallVectorImpl<char> &output);
+
 /// Get the current path.
 ///
 /// @param result Holds the current path on return.
diff --git a/include/llvm/Support/TargetOpcodes.def b/include/llvm/Support/TargetOpcodes.def
index a683f053e034519284907c498dddb41adabb6545..3245276bbf362bb1b345aa2a4dd7cf93aa04b830 100644
--- a/include/llvm/Support/TargetOpcodes.def
+++ b/include/llvm/Support/TargetOpcodes.def
@@ -258,6 +258,17 @@ HANDLE_TARGET_OPCODE(G_INSERT)
 /// larger register.
 HANDLE_TARGET_OPCODE(G_MERGE_VALUES)
 
+/// Generic instruction to create a vector value from a number of scalar
+/// components.
+HANDLE_TARGET_OPCODE(G_BUILD_VECTOR)
+
+/// Generic instruction to create a vector value from a number of scalar
+/// components, which have types larger than the result vector elt type.
+HANDLE_TARGET_OPCODE(G_BUILD_VECTOR_TRUNC)
+
+/// Generic instruction to create a vector by concatenating multiple vectors.
+HANDLE_TARGET_OPCODE(G_CONCAT_VECTORS)
+
 /// Generic pointer to int conversion.
 HANDLE_TARGET_OPCODE(G_PTRTOINT)
 
@@ -443,6 +454,9 @@ HANDLE_TARGET_OPCODE(G_FLOG)
 /// Floating point base-2 logarithm of a value.
 HANDLE_TARGET_OPCODE(G_FLOG2)
 
+/// Floating point base-10 logarithm of a value.
+HANDLE_TARGET_OPCODE(G_FLOG10)
+
 /// Generic FP negation.
 HANDLE_TARGET_OPCODE(G_FNEG)
 
diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h
index 63241b52e1fa6b19eba020e51b728b825fe116d2..ace11ed410a38fafe24b2ceab67e31028163eed7 100644
--- a/include/llvm/Support/TargetParser.h
+++ b/include/llvm/Support/TargetParser.h
@@ -18,215 +18,20 @@
 // FIXME: vector is used because that's what clang uses for subtarget feature
 // lists, but SmallVector would probably be better
 #include "llvm/ADT/Triple.h"
+#include "llvm/Support/ARMTargetParser.h"
+#include "llvm/Support/AArch64TargetParser.h"
 #include <vector>
 
 namespace llvm {
 class StringRef;
 
-// Target specific information into their own namespaces. These should be
-// generated from TableGen because the information is already there, and there
-// is where new information about targets will be added.
+// Target specific information in their own namespaces.
+// (ARM/AArch64 are declared in ARM/AArch64TargetParser.h)
+// These should be generated from TableGen because the information is already
+// there, and there is where new information about targets will be added.
 // FIXME: To TableGen this we need to make some table generated files available
 // even if the back-end is not compiled with LLVM, plus we need to create a new
 // back-end to TableGen to create these clean tables.
-namespace ARM {
-
-// FPU Version
-enum class FPUVersion {
-  NONE,
-  VFPV2,
-  VFPV3,
-  VFPV3_FP16,
-  VFPV4,
-  VFPV5
-};
-
-// An FPU name restricts the FPU in one of three ways:
-enum class FPURestriction {
-  None = 0, ///< No restriction
-  D16,      ///< Only 16 D registers
-  SP_D16    ///< Only single-precision instructions, with 16 D registers
-};
-
-// An FPU name implies one of three levels of Neon support:
-enum class NeonSupportLevel {
-  None = 0, ///< No Neon
-  Neon,     ///< Neon
-  Crypto    ///< Neon with Crypto
-};
-
-// FPU names.
-enum FPUKind {
-#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) KIND,
-#include "ARMTargetParser.def"
-  FK_LAST
-};
-
-// Arch names.
-enum class ArchKind {
-#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID,
-#include "ARMTargetParser.def"
-};
-
-// Arch extension modifiers for CPUs.
-enum ArchExtKind : unsigned {
-  AEK_INVALID =     0,
-  AEK_NONE =        1,
-  AEK_CRC =         1 << 1,
-  AEK_CRYPTO =      1 << 2,
-  AEK_FP =          1 << 3,
-  AEK_HWDIVTHUMB =  1 << 4,
-  AEK_HWDIVARM =    1 << 5,
-  AEK_MP =          1 << 6,
-  AEK_SIMD =        1 << 7,
-  AEK_SEC =         1 << 8,
-  AEK_VIRT =        1 << 9,
-  AEK_DSP =         1 << 10,
-  AEK_FP16 =        1 << 11,
-  AEK_RAS =         1 << 12,
-  AEK_SVE =         1 << 13,
-  AEK_DOTPROD =     1 << 14,
-  AEK_SHA2    =     1 << 15,
-  AEK_AES     =     1 << 16,
-  AEK_FP16FML =     1 << 17,
-  // Unsupported extensions.
-  AEK_OS = 0x8000000,
-  AEK_IWMMXT = 0x10000000,
-  AEK_IWMMXT2 = 0x20000000,
-  AEK_MAVERICK = 0x40000000,
-  AEK_XSCALE = 0x80000000,
-};
-
-// ISA kinds.
-enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 };
-
-// Endianness
-// FIXME: BE8 vs. BE32?
-enum class EndianKind { INVALID = 0, LITTLE, BIG };
-
-// v6/v7/v8 Profile
-enum class ProfileKind { INVALID = 0, A, R, M };
-
-StringRef getCanonicalArchName(StringRef Arch);
-
-// Information by ID
-StringRef getFPUName(unsigned FPUKind);
-FPUVersion getFPUVersion(unsigned FPUKind);
-NeonSupportLevel getFPUNeonSupportLevel(unsigned FPUKind);
-FPURestriction getFPURestriction(unsigned FPUKind);
-
-// FIXME: These should be moved to TargetTuple once it exists
-bool getFPUFeatures(unsigned FPUKind, std::vector<StringRef> &Features);
-bool getHWDivFeatures(unsigned HWDivKind, std::vector<StringRef> &Features);
-bool getExtensionFeatures(unsigned Extensions,
-                          std::vector<StringRef> &Features);
-
-StringRef getArchName(ArchKind AK);
-unsigned getArchAttr(ArchKind AK);
-StringRef getCPUAttr(ArchKind AK);
-StringRef getSubArch(ArchKind AK);
-StringRef getArchExtName(unsigned ArchExtKind);
-StringRef getArchExtFeature(StringRef ArchExt);
-StringRef getHWDivName(unsigned HWDivKind);
-
-// Information by Name
-unsigned  getDefaultFPU(StringRef CPU, ArchKind AK);
-unsigned  getDefaultExtensions(StringRef CPU, ArchKind AK);
-StringRef getDefaultCPU(StringRef Arch);
-
-// Parser
-unsigned parseHWDiv(StringRef HWDiv);
-unsigned parseFPU(StringRef FPU);
-ArchKind parseArch(StringRef Arch);
-unsigned parseArchExt(StringRef ArchExt);
-ArchKind parseCPUArch(StringRef CPU);
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
-ISAKind parseArchISA(StringRef Arch);
-EndianKind parseArchEndian(StringRef Arch);
-ProfileKind parseArchProfile(StringRef Arch);
-unsigned parseArchVersion(StringRef Arch);
-
-StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
-
-} // namespace ARM
-
-// FIXME:This should be made into class design,to avoid dupplication.
-namespace AArch64 {
-
-// Arch names.
-enum class ArchKind {
-#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID,
-#include "AArch64TargetParser.def"
-};
-
-// Arch extension modifiers for CPUs.
-enum ArchExtKind : unsigned {
-  AEK_INVALID =     0,
-  AEK_NONE =        1,
-  AEK_CRC =         1 << 1,
-  AEK_CRYPTO =      1 << 2,
-  AEK_FP =          1 << 3,
-  AEK_SIMD =        1 << 4,
-  AEK_FP16 =        1 << 5,
-  AEK_PROFILE =     1 << 6,
-  AEK_RAS =         1 << 7,
-  AEK_LSE =         1 << 8,
-  AEK_SVE =         1 << 9,
-  AEK_DOTPROD =     1 << 10,
-  AEK_RCPC =        1 << 11,
-  AEK_RDM =         1 << 12,
-  AEK_SM4 =         1 << 13,
-  AEK_SHA3 =        1 << 14,
-  AEK_SHA2 =        1 << 15,
-  AEK_AES =         1 << 16,
-  AEK_FP16FML =     1 << 17,
-  AEK_RAND =        1 << 18,
-  AEK_MTE =         1 << 19,
-};
-
-StringRef getCanonicalArchName(StringRef Arch);
-
-// Information by ID
-StringRef getFPUName(unsigned FPUKind);
-ARM::FPUVersion getFPUVersion(unsigned FPUKind);
-ARM::NeonSupportLevel getFPUNeonSupportLevel(unsigned FPUKind);
-ARM::FPURestriction getFPURestriction(unsigned FPUKind);
-
-// FIXME: These should be moved to TargetTuple once it exists
-bool getFPUFeatures(unsigned FPUKind, std::vector<StringRef> &Features);
-bool getExtensionFeatures(unsigned Extensions,
-                                   std::vector<StringRef> &Features);
-bool getArchFeatures(ArchKind AK, std::vector<StringRef> &Features);
-
-StringRef getArchName(ArchKind AK);
-unsigned getArchAttr(ArchKind AK);
-StringRef getCPUAttr(ArchKind AK);
-StringRef getSubArch(ArchKind AK);
-StringRef getArchExtName(unsigned ArchExtKind);
-StringRef getArchExtFeature(StringRef ArchExt);
-unsigned checkArchVersion(StringRef Arch);
-
-// Information by Name
-unsigned  getDefaultFPU(StringRef CPU, ArchKind AK);
-unsigned  getDefaultExtensions(StringRef CPU, ArchKind AK);
-StringRef getDefaultCPU(StringRef Arch);
-AArch64::ArchKind getCPUArchKind(StringRef CPU);
-
-// Parser
-unsigned parseFPU(StringRef FPU);
-AArch64::ArchKind parseArch(StringRef Arch);
-ArchExtKind parseArchExt(StringRef ArchExt);
-ArchKind parseCPUArch(StringRef CPU);
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
-ARM::ISAKind parseArchISA(StringRef Arch);
-ARM::EndianKind parseArchEndian(StringRef Arch);
-ARM::ProfileKind parseArchProfile(StringRef Arch);
-unsigned parseArchVersion(StringRef Arch);
-
-bool isX18ReservedByDefault(const Triple &TT);
-
-} // namespace AArch64
-
 namespace X86 {
 
 // This should be kept in sync with libcc/compiler-rt as its included by clang
diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h
index e8021f648b0d5625f1014cc1496d60d43b55a032..ba7ece5e72ba78428eddb003a0e296e908fe2657 100644
--- a/include/llvm/Support/Threading.h
+++ b/include/llvm/Support/Threading.h
@@ -27,7 +27,8 @@
 #define LLVM_THREADING_USE_STD_CALL_ONCE 1
 #elif defined(LLVM_ON_UNIX) &&                                                 \
     (defined(_LIBCPP_VERSION) ||                                               \
-     !(defined(__NetBSD__) || defined(__OpenBSD__) || defined(__ppc__)))
+     !(defined(__NetBSD__) || defined(__OpenBSD__) ||                          \
+       (defined(__ppc__) || defined(__PPC__))))
 // std::call_once from libc++ is used on all Unix platforms. Other
 // implementations like libstdc++ are known to have problems on NetBSD,
 // OpenBSD and PowerPC.
diff --git a/include/llvm/Support/VirtualFileSystem.h b/include/llvm/Support/VirtualFileSystem.h
index b3326bbbe486b33322c04d68d48362cc780368eb..3d4094eeb11106c13b27dadbdd7611028b124fe6 100644
--- a/include/llvm/Support/VirtualFileSystem.h
+++ b/include/llvm/Support/VirtualFileSystem.h
@@ -374,6 +374,9 @@ public:
                               SmallVectorImpl<char> &Output) const override {
     return FS->getRealPath(Path, Output);
   }
+  std::error_code isLocal(const Twine &Path, bool &Result) override {
+    return FS->isLocal(Path, Result);
+  }
 
 protected:
   FileSystem &getUnderlyingFS() { return *FS; }
diff --git a/include/llvm/Support/X86DisassemblerDecoderCommon.h b/include/llvm/Support/X86DisassemblerDecoderCommon.h
index 185b357efef5f7a74909f64c5199ecf546650f24..466dd309909ac39ad3709e8caabf07483789d339 100644
--- a/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -414,7 +414,7 @@ enum OperandEncoding {
   ENUM_ENTRY(TYPE_R16,        "2-byte")                                        \
   ENUM_ENTRY(TYPE_R32,        "4-byte")                                        \
   ENUM_ENTRY(TYPE_R64,        "8-byte")                                        \
-  ENUM_ENTRY(TYPE_IMM,        "immediate operand")                      \
+  ENUM_ENTRY(TYPE_IMM,        "immediate operand")                             \
   ENUM_ENTRY(TYPE_IMM3,       "1-byte immediate operand between 0 and 7")      \
   ENUM_ENTRY(TYPE_IMM5,       "1-byte immediate operand between 0 and 31")     \
   ENUM_ENTRY(TYPE_AVX512ICC,  "1-byte immediate operand for AVX512 icmp")      \
diff --git a/include/llvm/Support/X86TargetParser.def b/include/llvm/Support/X86TargetParser.def
index eb45ed6a76afb1365ac909ff10181d8345469fcd..e9bede545d3f71ced08b874d405a37f6132faf11 100644
--- a/include/llvm/Support/X86TargetParser.def
+++ b/include/llvm/Support/X86TargetParser.def
@@ -102,6 +102,7 @@ X86_CPU_SUBTYPE_COMPAT("icelake-server", INTEL_COREI7_ICELAKE_SERVER, "icelake-s
 // Entries below this are not in libgcc/compiler-rt.
 X86_CPU_SUBTYPE       ("core2",          INTEL_CORE2_65)
 X86_CPU_SUBTYPE       ("penryn",         INTEL_CORE2_45)
+X86_CPU_SUBTYPE       ("cascadelake",    INTEL_COREI7_CASCADELAKE)
 X86_CPU_SUBTYPE       ("k6",             AMDPENTIUM_K6)
 X86_CPU_SUBTYPE       ("k6-2",           AMDPENTIUM_K62)
 X86_CPU_SUBTYPE       ("k6-3",           AMDPENTIUM_K63)
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 6219755e83a2cf959c99338b7c1b9638f143904c..3d790e96fff76d18f986b1e6aeb53258305c6ee7 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -39,6 +39,12 @@
 namespace llvm {
 namespace yaml {
 
+enum class NodeKind : uint8_t {
+  Scalar,
+  Map,
+  Sequence,
+};
+
 struct EmptyContext {};
 
 /// This class should be specialized by any type that needs to be converted
@@ -145,14 +151,14 @@ struct ScalarTraits {
   // Must provide:
   //
   // Function to write the value as a string:
-  //static void output(const T &value, void *ctxt, llvm::raw_ostream &out);
+  // static void output(const T &value, void *ctxt, llvm::raw_ostream &out);
   //
   // Function to convert a string to a value.  Returns the empty
   // StringRef on success or an error string if string is malformed:
-  //static StringRef input(StringRef scalar, void *ctxt, T &value);
+  // static StringRef input(StringRef scalar, void *ctxt, T &value);
   //
   // Function to determine if the value should be quoted.
-  //static QuotingType mustQuote(StringRef);
+  // static QuotingType mustQuote(StringRef);
 };
 
 /// This class should be specialized by type that requires custom conversion
@@ -163,7 +169,7 @@ struct ScalarTraits {
 ///      static void output(const MyType &Value, void*, llvm::raw_ostream &Out)
 ///      {
 ///        // stream out custom formatting
-///        Out << Val;
+///        Out << Value;
 ///      }
 ///      static StringRef input(StringRef Scalar, void*, MyType &Value) {
 ///        // parse scalar and set `value`
@@ -181,6 +187,47 @@ struct BlockScalarTraits {
   // Function to convert a string to a value.  Returns the empty
   // StringRef on success or an error string if string is malformed:
   // static StringRef input(StringRef Scalar, void *ctxt, T &Value);
+  //
+  // Optional:
+  // static StringRef inputTag(T &Val, std::string Tag)
+  // static void outputTag(const T &Val, raw_ostream &Out)
+};
+
+/// This class should be specialized by type that requires custom conversion
+/// to/from a YAML scalar with optional tags. For example:
+///
+///    template <>
+///    struct TaggedScalarTraits<MyType> {
+///      static void output(const MyType &Value, void*, llvm::raw_ostream
+///      &ScalarOut, llvm::raw_ostream &TagOut)
+///      {
+///        // stream out custom formatting including optional Tag
+///        Out << Value;
+///      }
+///      static StringRef input(StringRef Scalar, StringRef Tag, void*, MyType
+///      &Value) {
+///        // parse scalar and set `value`
+///        // return empty string on success, or error string
+///        return StringRef();
+///      }
+///      static QuotingType mustQuote(const MyType &Value, StringRef) {
+///        return QuotingType::Single;
+///      }
+///    };
+template <typename T> struct TaggedScalarTraits {
+  // Must provide:
+  //
+  // Function to write the value and tag as strings:
+  // static void output(const T &Value, void *ctx, llvm::raw_ostream &ScalarOut,
+  // llvm::raw_ostream &TagOut);
+  //
+  // Function to convert a string to a value.  Returns the empty
+  // StringRef on success or an error string if string is malformed:
+  // static StringRef input(StringRef Scalar, StringRef Tag, void *ctxt, T
+  // &Value);
+  //
+  // Function to determine if the value should be quoted.
+  // static QuotingType mustQuote(const T &Value, StringRef Scalar);
 };
 
 /// This class should be specialized by any type that needs to be converted
@@ -234,6 +281,31 @@ struct CustomMappingTraits {
   // static void output(IO &io, T &elem);
 };
 
+/// This class should be specialized by any type that can be represented as
+/// a scalar, map, or sequence, decided dynamically. For example:
+///
+///    typedef std::unique_ptr<MyBase> MyPoly;
+///
+///    template<>
+///    struct PolymorphicTraits<MyPoly> {
+///      static NodeKind getKind(const MyPoly &poly) {
+///        return poly->getKind();
+///      }
+///      static MyScalar& getAsScalar(MyPoly &poly) {
+///        if (!poly || !isa<MyScalar>(poly))
+///          poly.reset(new MyScalar());
+///        return *cast<MyScalar>(poly.get());
+///      }
+///      // ...
+///    };
+template <typename T> struct PolymorphicTraits {
+  // Must provide:
+  // static NodeKind getKind(const T &poly);
+  // static scalar_type &getAsScalar(T &poly);
+  // static map_type &getAsMap(T &poly);
+  // static sequence_type &getAsSequence(T &poly);
+};
+
 // Only used for better diagnostics of missing traits
 template <typename T>
 struct MissingTrait;
@@ -307,6 +379,24 @@ struct has_BlockScalarTraits
       (sizeof(test<BlockScalarTraits<T>>(nullptr, nullptr)) == 1);
 };
 
+// Test if TaggedScalarTraits<T> is defined on type T.
+template <class T> struct has_TaggedScalarTraits {
+  using Signature_input = StringRef (*)(StringRef, StringRef, void *, T &);
+  using Signature_output = void (*)(const T &, void *, raw_ostream &,
+                                    raw_ostream &);
+  using Signature_mustQuote = QuotingType (*)(const T &, StringRef);
+
+  template <typename U>
+  static char test(SameType<Signature_input, &U::input> *,
+                   SameType<Signature_output, &U::output> *,
+                   SameType<Signature_mustQuote, &U::mustQuote> *);
+
+  template <typename U> static double test(...);
+
+  static bool const value =
+      (sizeof(test<TaggedScalarTraits<T>>(nullptr, nullptr, nullptr)) == 1);
+};
+
 // Test if MappingContextTraits<T> is defined on type T.
 template <class T, class Context> struct has_MappingTraits {
   using Signature_mapping = void (*)(class IO &, T &, Context &);
@@ -438,6 +528,17 @@ struct has_DocumentListTraits
   static bool const value = (sizeof(test<DocumentListTraits<T>>(nullptr))==1);
 };
 
+template <class T> struct has_PolymorphicTraits {
+  using Signature_getKind = NodeKind (*)(const T &);
+
+  template <typename U>
+  static char test(SameType<Signature_getKind, &U::getKind> *);
+
+  template <typename U> static double test(...);
+
+  static bool const value = (sizeof(test<PolymorphicTraits<T>>(nullptr)) == 1);
+};
+
 inline bool isNumeric(StringRef S) {
   const static auto skipDigits = [](StringRef Input) {
     return Input.drop_front(
@@ -626,10 +727,12 @@ struct missingTraits
                                         !has_ScalarBitSetTraits<T>::value &&
                                         !has_ScalarTraits<T>::value &&
                                         !has_BlockScalarTraits<T>::value &&
+                                        !has_TaggedScalarTraits<T>::value &&
                                         !has_MappingTraits<T, Context>::value &&
                                         !has_SequenceTraits<T>::value &&
                                         !has_CustomMappingTraits<T>::value &&
-                                        !has_DocumentListTraits<T>::value> {};
+                                        !has_DocumentListTraits<T>::value &&
+                                        !has_PolymorphicTraits<T>::value> {};
 
 template <typename T, typename Context>
 struct validatedMappingTraits
@@ -683,6 +786,9 @@ public:
 
   virtual void scalarString(StringRef &, QuotingType) = 0;
   virtual void blockScalarString(StringRef &) = 0;
+  virtual void scalarTag(std::string &) = 0;
+
+  virtual NodeKind getNodeKind() = 0;
 
   virtual void setError(const Twine &) = 0;
 
@@ -917,6 +1023,31 @@ yamlize(IO &YamlIO, T &Val, bool, EmptyContext &Ctx) {
   }
 }
 
+template <typename T>
+typename std::enable_if<has_TaggedScalarTraits<T>::value, void>::type
+yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) {
+  if (io.outputting()) {
+    std::string ScalarStorage, TagStorage;
+    raw_string_ostream ScalarBuffer(ScalarStorage), TagBuffer(TagStorage);
+    TaggedScalarTraits<T>::output(Val, io.getContext(), ScalarBuffer,
+                                  TagBuffer);
+    io.scalarTag(TagBuffer.str());
+    StringRef ScalarStr = ScalarBuffer.str();
+    io.scalarString(ScalarStr,
+                    TaggedScalarTraits<T>::mustQuote(Val, ScalarStr));
+  } else {
+    std::string Tag;
+    io.scalarTag(Tag);
+    StringRef Str;
+    io.scalarString(Str, QuotingType::None);
+    StringRef Result =
+        TaggedScalarTraits<T>::input(Str, Tag, io.getContext(), Val);
+    if (!Result.empty()) {
+      io.setError(Twine(Result));
+    }
+  }
+}
+
 template <typename T, typename Context>
 typename std::enable_if<validatedMappingTraits<T, Context>::value, void>::type
 yamlize(IO &io, T &Val, bool, Context &Ctx) {
@@ -972,6 +1103,20 @@ yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) {
   }
 }
 
+template <typename T>
+typename std::enable_if<has_PolymorphicTraits<T>::value, void>::type
+yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) {
+  switch (io.outputting() ? PolymorphicTraits<T>::getKind(Val)
+                          : io.getNodeKind()) {
+  case NodeKind::Scalar:
+    return yamlize(io, PolymorphicTraits<T>::getAsScalar(Val), true, Ctx);
+  case NodeKind::Map:
+    return yamlize(io, PolymorphicTraits<T>::getAsMap(Val), true, Ctx);
+  case NodeKind::Sequence:
+    return yamlize(io, PolymorphicTraits<T>::getAsSequence(Val), true, Ctx);
+  }
+}
+
 template <typename T>
 typename std::enable_if<missingTraits<T, EmptyContext>::value, void>::type
 yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) {
@@ -1250,6 +1395,8 @@ private:
   void endBitSetScalar() override;
   void scalarString(StringRef &, QuotingType) override;
   void blockScalarString(StringRef &) override;
+  void scalarTag(std::string &) override;
+  NodeKind getNodeKind() override;
   void setError(const Twine &message) override;
   bool canElideEmptySequence() override;
 
@@ -1395,6 +1542,8 @@ public:
   void endBitSetScalar() override;
   void scalarString(StringRef &, QuotingType) override;
   void blockScalarString(StringRef &) override;
+  void scalarTag(std::string &) override;
+  NodeKind getNodeKind() override;
   void setError(const Twine &message) override;
   bool canElideEmptySequence() override;
 
@@ -1414,14 +1563,21 @@ private:
   void flowKey(StringRef Key);
 
   enum InState {
-    inSeq,
-    inFlowSeq,
+    inSeqFirstElement,
+    inSeqOtherElement,
+    inFlowSeqFirstElement,
+    inFlowSeqOtherElement,
     inMapFirstKey,
     inMapOtherKey,
     inFlowMapFirstKey,
     inFlowMapOtherKey
   };
 
+  static bool inSeqAnyElement(InState State);
+  static bool inFlowSeqAnyElement(InState State);
+  static bool inMapAnyKey(InState State);
+  static bool inFlowMapAnyKey(InState State);
+
   raw_ostream &Out;
   int WrapColumn;
   SmallVector<InState, 8> StateStack;
@@ -1557,6 +1713,16 @@ operator>>(Input &In, T &Val) {
   return In;
 }
 
+// Define non-member operator>> so that Input can stream in a polymorphic type.
+template <typename T>
+inline typename std::enable_if<has_PolymorphicTraits<T>::value, Input &>::type
+operator>>(Input &In, T &Val) {
+  EmptyContext Ctx;
+  if (In.setCurrentDocument())
+    yamlize(In, Val, true, Ctx);
+  return In;
+}
+
 // Provide better error message about types missing a trait specialization
 template <typename T>
 inline typename std::enable_if<missingTraits<T, EmptyContext>::value,
@@ -1645,6 +1811,24 @@ operator<<(Output &Out, T &Val) {
   return Out;
 }
 
+// Define non-member operator<< so that Output can stream out a polymorphic
+// type.
+template <typename T>
+inline typename std::enable_if<has_PolymorphicTraits<T>::value, Output &>::type
+operator<<(Output &Out, T &Val) {
+  EmptyContext Ctx;
+  Out.beginDocuments();
+  if (Out.preflightDocument(0)) {
+    // FIXME: The parser does not support explicit documents terminated with a
+    // plain scalar; the end-marker is included as part of the scalar token.
+    assert(PolymorphicTraits<T>::getKind(Val) != NodeKind::Scalar && "plain scalar documents are not supported");
+    yamlize(Out, Val, true, Ctx);
+    Out.postflightDocument();
+  }
+  Out.endDocuments();
+  return Out;
+}
+
 // Provide better error message about types missing a trait specialization
 template <typename T>
 inline typename std::enable_if<missingTraits<T, EmptyContext>::value,
diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h
index 55d84f138f071ca35b47efded0a7c965880a233e..e7b8f2517b8aa975a26923eb07b534f48e77a737 100644
--- a/include/llvm/Support/type_traits.h
+++ b/include/llvm/Support/type_traits.h
@@ -30,9 +30,10 @@ namespace llvm {
 template <typename T>
 struct isPodLike {
   // std::is_trivially_copyable is available in libc++ with clang, libstdc++
-  // that comes with GCC 5.
+  // that comes with GCC 5.  MSVC 2015 and newer also have
+  // std::is_trivially_copyable.
 #if (__has_feature(is_trivially_copyable) && defined(_LIBCPP_VERSION)) ||      \
-    (defined(__GNUC__) && __GNUC__ >= 5)
+    (defined(__GNUC__) && __GNUC__ >= 5) || defined(_MSC_VER)
   // If the compiler supports the is_trivially_copyable trait use it, as it
   // matches the definition of isPodLike closely.
   static const bool value = std::is_trivially_copyable<T>::value;
diff --git a/include/llvm/Target/GenericOpcodes.td b/include/llvm/Target/GenericOpcodes.td
index af4fa8a1f04a76ee2329f78e9ab74252ee4d2b90..775221a9f9772c7236dcab85bf49c4b96171ed86 100644
--- a/include/llvm/Target/GenericOpcodes.td
+++ b/include/llvm/Target/GenericOpcodes.td
@@ -540,6 +540,13 @@ def G_FLOG2 : GenericInstruction {
   let hasSideEffects = 0;
 }
 
+// Floating point base-10 logarithm of a value.
+def G_FLOG10 : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = 0;
+}
+
 //------------------------------------------------------------------------------
 // Opcodes for LLVM Intrinsics
 //------------------------------------------------------------------------------
@@ -675,6 +682,28 @@ def G_MERGE_VALUES : GenericInstruction {
   let hasSideEffects = 0;
 }
 
+/// Create a vector from multiple scalar registers.
+def G_BUILD_VECTOR : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src0, variable_ops);
+  let hasSideEffects = 0;
+}
+
+/// Like G_BUILD_VECTOR, but truncates the larger operand types to fit the
+/// destination vector elt type.
+def G_BUILD_VECTOR_TRUNC : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src0, variable_ops);
+  let hasSideEffects = 0;
+}
+
+/// Create a vector by concatenating vectors together.
+def G_CONCAT_VECTORS : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src0, variable_ops);
+  let hasSideEffects = 0;
+}
+
 // Intrinsic without side effects.
 def G_INTRINSIC : GenericInstruction {
   let OutOperandList = (outs);
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index abb1bb431f66865210ae60089a7d8a3fc2ea0a7d..e4b827babb92c71a3e8f1a52317d414f7588ec94 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -479,6 +479,7 @@ class Instruction {
   bit isInsertSubreg = 0;   // Is this instruction a kind of insert subreg?
                             // If so, make sure to override
                             // TargetInstrInfo::getInsertSubregLikeInputs.
+  bit variadicOpsAreDefs = 0; // Are variadic operands definitions?
 
   // Does the instruction have side effects that are not captured by any
   // operands of the instruction or other flags?
diff --git a/include/llvm/Target/TargetInstrPredicate.td b/include/llvm/Target/TargetInstrPredicate.td
index e70da00979040ad76c03e2437130539db4cf8036..4b2c57b34c2e2035599cd09e41b3a726e72600e2 100644
--- a/include/llvm/Target/TargetInstrPredicate.td
+++ b/include/llvm/Target/TargetInstrPredicate.td
@@ -39,7 +39,7 @@
 // processor scheduling model.
 //
 // The `MCInstPredicateExample` definition above is equivalent (and therefore
-// could replace) the following definition from the ExynosM3 model (see
+// could replace) the following definition from a previous ExynosM3 model (see
 // AArch64SchedExynosM3.td):
 //
 // def M3BranchLinkFastPred  : SchedPredicate<[{
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index 40b77c3d94a01a65a4d65787de0b27675f1a20a9..e80f2bf82f26d6eca31fedd7361efcae509c1960 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -201,6 +201,12 @@ public:
   virtual void emitLinkerFlagsForUsed(raw_ostream &OS,
                                       const GlobalValue *GV) const {}
 
+  /// If supported, return the section to use for the llvm.commandline
+  /// metadata. Otherwise, return nullptr.
+  virtual MCSection *getSectionForCommandLines() const {
+    return nullptr;
+  }
+
 protected:
   virtual MCSection *SelectSectionForGlobal(const GlobalObject *GO,
                                             SectionKind Kind,
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index f968fa80d500f0c7c89d3706e4515396f5587bb3..3eafcc25583aceaacb8b242360646868c1e78dde 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -201,6 +201,9 @@ public:
   bool getO0WantsFastISel() { return O0WantsFastISel; }
   void setO0WantsFastISel(bool Enable) { O0WantsFastISel = Enable; }
   void setGlobalISel(bool Enable) { Options.EnableGlobalISel = Enable; }
+  void setGlobalISelAbort(GlobalISelAbortMode Mode) {
+    Options.GlobalISelAbort = Mode;
+  }
   void setMachineOutliner(bool Enable) {
     Options.EnableMachineOutliner = Enable;
   }
@@ -351,6 +354,23 @@ public:
   }
 };
 
+/// Helper method for getting the code model, returning Default if
+/// CM does not have a value. The tiny and kernel models will produce
+/// an error, so targets that support them or require more complex codemodel
+/// selection logic should implement and call their own getEffectiveCodeModel.
+inline CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
+                                              CodeModel::Model Default) {
+  if (CM) {
+    // By default, targets do not support the tiny and kernel models.
+    if (*CM == CodeModel::Tiny)
+      report_fatal_error("Target does not support the tiny CodeModel");
+    if (*CM == CodeModel::Kernel)
+      report_fatal_error("Target does not support the kernel CodeModel");
+    return *CM;
+  }
+  return Default;
+}
+
 } // end namespace llvm
 
 #endif // LLVM_TARGET_TARGETMACHINE_H
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index 07ed773de55e7874865a38149513a25b09904e9f..b18101d92833d590020cfa0ea232886137497a54 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -96,6 +96,14 @@ namespace llvm {
     SCE       // Tune debug info for SCE targets (e.g. PS4).
   };
 
+  /// Enable abort calls when global instruction selection fails to lower/select
+  /// an instruction.
+  enum class GlobalISelAbortMode {
+    Disable,        // Disable the abort.
+    Enable,         // Enable the abort.
+    DisableWithDiag // Disable the abort but emit a diagnostic on failure.
+  };
+
   class TargetOptions {
   public:
     TargetOptions()
@@ -192,6 +200,10 @@ namespace llvm {
     /// EnableGlobalISel - This flag enables global instruction selection.
     unsigned EnableGlobalISel : 1;
 
+    /// EnableGlobalISelAbort - Control abort behaviour when global instruction
+    /// selection fails to lower/select an instruction.
+    GlobalISelAbortMode GlobalISelAbort = GlobalISelAbortMode::Enable;
+
     /// UseInitArray - Use .init_array instead of .ctors for static
     /// constructors.
     unsigned UseInitArray : 1;
diff --git a/include/llvm/Target/TargetPfmCounters.td b/include/llvm/Target/TargetPfmCounters.td
index 0a55a558f30a52405607ddfc8f3adb5acf9519d4..dac150f03445a109461f6da55558ff9f59fb4087 100644
--- a/include/llvm/Target/TargetPfmCounters.td
+++ b/include/llvm/Target/TargetPfmCounters.td
@@ -44,3 +44,7 @@ class PfmCountersBinding<string cpu_name, ProcPfmCounters counters> {
   string CpuName = cpu_name;
   ProcPfmCounters Counters = counters;
 }
+
+// Declares the default binding for unbound CPUs for the target.
+class PfmCountersDefaultBinding<ProcPfmCounters counters>
+    : PfmCountersBinding<"", counters> {}
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 141e06693887cb9a9c869488b6a0ac9b6344e5ce..808e183f5a5f9b57aa79cf04dd045d903286e23c 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -373,6 +373,10 @@ class SchedPredicate<code pred> : SchedPredicateBase {
   SchedMachineModel SchedModel = ?;
   code Predicate = pred;
 }
+
+// Define a predicate to be typically used as the default case in a
+// SchedVariant.  It the SchedVariant does not use any other predicate based on
+// MCSchedPredicate, this is the default scheduling case used by llvm-mca.
 def NoSchedPred : MCSchedPredicate<TruePred>;
 
 // Associate a predicate with a list of SchedReadWrites. By default,
@@ -557,3 +561,13 @@ class RetireControlUnit<int bufferSize, int retirePerCycle> {
   int MaxRetirePerCycle = retirePerCycle;
   SchedMachineModel SchedModel = ?;
 }
+
+// Base class for Load/StoreQueue.  It is used to identify processor resources
+// which describe load/store queues in the LS unit.
+class MemoryQueue<ProcResource PR> {
+  ProcResource QueueDescriptor = PR;
+  SchedMachineModel SchedModel = ?;
+}
+
+class LoadQueue<ProcResource LDQueue> : MemoryQueue<LDQueue>;
+class StoreQueue<ProcResource STQueue> : MemoryQueue<STQueue>;
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 532e866be55426a23a2277bd72bf60205e1b92a5..eb5a14bd21b8a6c461d7c83e0f7bbdf7349e686b 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -116,12 +116,18 @@ def SDTIntBinOp : SDTypeProfile<1, 2, [     // add, and, or, xor, udiv, etc.
 def SDTIntShiftOp : SDTypeProfile<1, 2, [   // shl, sra, srl
   SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>
 ]>;
+def SDTIntShiftDOp: SDTypeProfile<1, 3, [   // fshl, fshr
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
+]>;
 def SDTIntSatNoShOp : SDTypeProfile<1, 2, [   // ssat with no shift
   SDTCisSameAs<0, 1>, SDTCisInt<2>
 ]>;
 def SDTIntBinHiLoOp : SDTypeProfile<2, 2, [ // mulhi, mullo, sdivrem, udivrem
   SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,SDTCisInt<0>
 ]>;
+def SDTIntScaledBinOp : SDTypeProfile<1, 3, [  // smulfix
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
+]>;
 
 def SDTFPBinOp : SDTypeProfile<1, 2, [      // fadd, fmul, etc.
   SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>
@@ -162,7 +168,7 @@ def SDTExtInreg : SDTypeProfile<1, 2, [     // sext_inreg
 ]>;
 def SDTExtInvec : SDTypeProfile<1, 1, [     // sext_invec
   SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>, SDTCisVec<1>,
-  SDTCisOpSmallerThanOp<1, 0>, SDTCisSameSizeAs<0,1>
+  SDTCisOpSmallerThanOp<1, 0>
 ]>;
 
 def SDTSetCC : SDTypeProfile<1, 3, [        // setcc
@@ -350,6 +356,8 @@ def sra        : SDNode<"ISD::SRA"       , SDTIntShiftOp>;
 def shl        : SDNode<"ISD::SHL"       , SDTIntShiftOp>;
 def rotl       : SDNode<"ISD::ROTL"      , SDTIntShiftOp>;
 def rotr       : SDNode<"ISD::ROTR"      , SDTIntShiftOp>;
+def fshl       : SDNode<"ISD::FSHL"      , SDTIntShiftDOp>;
+def fshr       : SDNode<"ISD::FSHR"      , SDTIntShiftDOp>;
 def and        : SDNode<"ISD::AND"       , SDTIntBinOp,
                         [SDNPCommutative, SDNPAssociative]>;
 def or         : SDNode<"ISD::OR"        , SDTIntBinOp,
@@ -377,6 +385,7 @@ def saddsat    : SDNode<"ISD::SADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
 def uaddsat    : SDNode<"ISD::UADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
 def ssubsat    : SDNode<"ISD::SSUBSAT"   , SDTIntBinOp>;
 def usubsat    : SDNode<"ISD::USUBSAT"   , SDTIntBinOp>;
+def smulfix    : SDNode<"ISD::SMULFIX"   , SDTIntScaledBinOp, [SDNPCommutative]>;
 
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
@@ -627,6 +636,15 @@ class PatFrags<dag ops, list<dag> frags, code pred = [{}],
   code ImmediateCode = [{}];
   SDNodeXForm OperandTransform = xform;
 
+  // When this is set, the PredicateCode may refer to a constant Operands
+  // vector which contains the captured nodes of the DAG, in the order listed
+  // by the Operands field above.
+  //
+  // This is useful when Fragments involves associative / commutative
+  // operators: a single piece of code can easily refer to all operands even
+  // when re-associated / commuted variants of the fragment are matched.
+  bit PredicateCodeUsesOperands = 0;
+
   // Define a few pre-packaged predicates. This helps GlobalISel import
   // existing rules from SelectionDAG for many common cases.
   // They will be tested prior to the code in pred and must not be used in
diff --git a/include/llvm/TextAPI/ELF/ELFStub.h b/include/llvm/TextAPI/ELF/ELFStub.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa54e6f8b711d7018d245df9e1d75e9b74df5782
--- /dev/null
+++ b/include/llvm/TextAPI/ELF/ELFStub.h
@@ -0,0 +1,69 @@
+//===- ELFStub.h ------------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===-----------------------------------------------------------------------===/
+///
+/// \file
+/// This file defines an internal representation of an ELF stub.
+///
+//===-----------------------------------------------------------------------===/
+
+#ifndef LLVM_TEXTAPI_ELF_ELFSTUB_H
+#define LLVM_TEXTAPI_ELF_ELFSTUB_H
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/Support/VersionTuple.h"
+#include <vector>
+#include <set>
+
+namespace llvm {
+namespace elfabi {
+
+typedef uint16_t ELFArch;
+
+enum class ELFSymbolType {
+  NoType = ELF::STT_NOTYPE,
+  Object = ELF::STT_OBJECT,
+  Func = ELF::STT_FUNC,
+  TLS = ELF::STT_TLS,
+
+  // Type information is 4 bits, so 16 is safely out of range.
+  Unknown = 16,
+};
+
+struct ELFSymbol {
+  ELFSymbol(std::string SymbolName) : Name(SymbolName) {}
+  std::string Name;
+  uint64_t Size;
+  ELFSymbolType Type;
+  bool Undefined;
+  bool Weak;
+  Optional<std::string> Warning;
+  bool operator<(const ELFSymbol &RHS) const {
+    return Name < RHS.Name;
+  }
+};
+
+// A cumulative representation of ELF stubs.
+// Both textual and binary stubs will read into and write from this object.
+class ELFStub {
+// TODO: Add support for symbol versioning.
+public:
+  VersionTuple TbeVersion;
+  Optional<std::string> SoName;
+  ELFArch Arch;
+  std::vector<std::string> NeededLibs;
+  std::set<ELFSymbol> Symbols;
+
+  ELFStub() {}
+  ELFStub(const ELFStub &Stub);
+  ELFStub(ELFStub &&Stub);
+};
+} // end namespace elfabi
+} // end namespace llvm
+
+#endif // LLVM_TEXTAPI_ELF_ELFSTUB_H
diff --git a/include/llvm/TextAPI/ELF/TBEHandler.h b/include/llvm/TextAPI/ELF/TBEHandler.h
new file mode 100644
index 0000000000000000000000000000000000000000..91521c656fa230c90c2af273d516bd02176ef079
--- /dev/null
+++ b/include/llvm/TextAPI/ELF/TBEHandler.h
@@ -0,0 +1,45 @@
+//===- TBEHandler.h ---------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===-----------------------------------------------------------------------===/
+///
+/// \file
+/// This file declares an interface for reading and writing .tbe (text-based
+/// ELF) files.
+///
+//===-----------------------------------------------------------------------===/
+
+#ifndef LLVM_TEXTAPI_ELF_TBEHANDLER_H
+#define LLVM_TEXTAPI_ELF_TBEHANDLER_H
+
+#include "llvm/Support/VersionTuple.h"
+#include "llvm/Support/Error.h"
+#include <memory>
+
+namespace llvm {
+
+class raw_ostream;
+class Error;
+class StringRef;
+class VersionTuple;
+
+namespace elfabi {
+
+class ELFStub;
+
+const VersionTuple TBEVersionCurrent(1, 0);
+
+/// Attempts to read an ELF interface file from a StringRef buffer.
+Expected<std::unique_ptr<ELFStub>> readTBEFromBuffer(StringRef Buf);
+
+/// Attempts to write an ELF interface file to a raw_ostream.
+Error writeTBEToOutputStream(raw_ostream &OS, const ELFStub &Stub);
+
+} // end namespace elfabi
+} // end namespace llvm
+
+#endif // LLVM_TEXTAPI_ELF_TBEHANDLER_H
diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h
index 5ad880574ef7109637f2f7386b5b69d71ac1e648..c2103b637266d877264a4361b9d7e9b266601b84 100644
--- a/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/include/llvm/Transforms/IPO/FunctionImport.h
@@ -176,6 +176,14 @@ void computeDeadSymbols(
     const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
     function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing);
 
+/// Compute dead symbols and run constant propagation in combined index
+/// after that.
+void computeDeadSymbolsWithConstProp(
+    ModuleSummaryIndex &Index,
+    const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+    function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
+    bool ImportEnabled);
+
 /// Converts value \p GV to declaration, or replaces with a declaration if
 /// it is an alias. Returns true if converted, false if replaced.
 bool convertToDeclaration(GlobalValue &GV);
@@ -201,10 +209,10 @@ std::error_code EmitImportsFiles(
     StringRef ModulePath, StringRef OutputFilename,
     const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex);
 
-/// Resolve WeakForLinker values in \p TheModule based on the information
+/// Resolve prevailing symbol linkages in \p TheModule based on the information
 /// recorded in the summaries during global summary-based analysis.
-void thinLTOResolveWeakForLinkerModule(Module &TheModule,
-                                       const GVSummaryMapTy &DefinedGlobals);
+void thinLTOResolvePrevailingInModule(Module &TheModule,
+                                      const GVSummaryMapTy &DefinedGlobals);
 
 /// Internalize \p TheModule based on the information recorded in the summaries
 /// during global summary-based analysis.
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index 81ed0a2237e97be93e548bc85dbace334a2b3e89..3624cd266af5e780239a8b06de84845470a52a42 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -77,6 +77,12 @@ struct GCOVOptions {
   // Emit the exit block immediately after the start block, rather than after
   // all of the function body's blocks.
   bool ExitBlockBeforeBody;
+
+  // Regexes separated by a semi-colon to filter the files to instrument.
+  std::string Filter;
+
+  // Regexes separated by a semi-colon to filter the files to not instrument.
+  std::string Exclude;
 };
 
 ModulePass *createGCOVProfilerPass(const GCOVOptions &Options =
@@ -143,7 +149,8 @@ FunctionPass *createAddressSanitizerFunctionPass(bool CompileKernel = false,
                                                  bool UseAfterScope = false);
 ModulePass *createAddressSanitizerModulePass(bool CompileKernel = false,
                                              bool Recover = false,
-                                             bool UseGlobalsGC = true);
+                                             bool UseGlobalsGC = true,
+                                             bool UseOdrIndicator = true);
 
 // Insert MemorySanitizer instrumentation (detection of uninitialized reads)
 FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0,
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index fe4ff621c6f0eb66858dbb61dba404e5488df55f..9019c3be4944d6a12c23f0ae5c24371dad8ef3b5 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -183,11 +183,12 @@ Pass *createLoopInstSimplifyPass();
 //
 // LoopUnroll - This pass is a simple loop unrolling pass.
 //
-Pass *createLoopUnrollPass(int OptLevel = 2, int Threshold = -1, int Count = -1,
+Pass *createLoopUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false,
+                           int Threshold = -1, int Count = -1,
                            int AllowPartial = -1, int Runtime = -1,
                            int UpperBound = -1, int AllowPeeling = -1);
 // Create an unrolling pass for full unrolling that uses exact trip count only.
-Pass *createSimpleLoopUnrollPass(int OptLevel = 2);
+Pass *createSimpleLoopUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false);
 
 //===----------------------------------------------------------------------===//
 //
@@ -391,12 +392,6 @@ FunctionPass *createLowerExpectIntrinsicPass();
 //
 FunctionPass *createPartiallyInlineLibCallsPass();
 
-//===----------------------------------------------------------------------===//
-//
-// ScalarizerPass - Converts vector operations into scalar operations
-//
-FunctionPass *createScalarizerPass();
-
 //===----------------------------------------------------------------------===//
 //
 // SeparateConstOffsetFromGEP - Split GEPs for better CSE
@@ -490,6 +485,13 @@ FunctionPass *createLibCallsShrinkWrapPass();
 // primarily to help other loop passes.
 //
 Pass *createLoopSimplifyCFGPass();
+
+//===----------------------------------------------------------------------===//
+//
+// WarnMissedTransformations - This pass emits warnings for leftover forced
+// transformations.
+//
+Pass *createWarnMissedTransformationsPass();
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Scalar/JumpThreading.h b/include/llvm/Transforms/Scalar/JumpThreading.h
index c8376dd80df34b4161e407cad3e3656c92e8be75..78518941ceda94b9789e044cc7de4036b42cbced 100644
--- a/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -89,22 +89,9 @@ class JumpThreadingPass : public PassInfoMixin<JumpThreadingPass> {
 #else
   SmallSet<AssertingVH<const BasicBlock>, 16> LoopHeaders;
 #endif
-  DenseSet<std::pair<Value *, BasicBlock *>> RecursionSet;
 
   unsigned BBDupThreshold;
 
-  // RAII helper for updating the recursion stack.
-  struct RecursionSetRemover {
-    DenseSet<std::pair<Value *, BasicBlock *>> &TheSet;
-    std::pair<Value *, BasicBlock *> ThePair;
-
-    RecursionSetRemover(DenseSet<std::pair<Value *, BasicBlock *>> &S,
-                        std::pair<Value *, BasicBlock *> P)
-        : TheSet(S), ThePair(P) {}
-
-    ~RecursionSetRemover() { TheSet.erase(ThePair); }
-  };
-
 public:
   JumpThreadingPass(int T = -1);
 
@@ -128,11 +115,21 @@ public:
   bool DuplicateCondBranchOnPHIIntoPred(
       BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs);
 
+  bool ComputeValueKnownInPredecessorsImpl(
+      Value *V, BasicBlock *BB, jumpthreading::PredValueInfo &Result,
+      jumpthreading::ConstantPreference Preference,
+      DenseSet<std::pair<Value *, BasicBlock *>> &RecursionSet,
+      Instruction *CxtI = nullptr);
   bool
   ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
                                   jumpthreading::PredValueInfo &Result,
                                   jumpthreading::ConstantPreference Preference,
-                                  Instruction *CxtI = nullptr);
+                                  Instruction *CxtI = nullptr) {
+    DenseSet<std::pair<Value *, BasicBlock *>> RecursionSet;
+    return ComputeValueKnownInPredecessorsImpl(V, BB, Result, Preference,
+                                               RecursionSet, CxtI);
+  }
+
   bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
                               jumpthreading::ConstantPreference Preference,
                               Instruction *CxtI = nullptr);
diff --git a/include/llvm/Transforms/Scalar/LoopPassManager.h b/include/llvm/Transforms/Scalar/LoopPassManager.h
index e54960d199c270f72290a1bfa5eecf094630b07c..46ebb74c413c5991bd9d0c0a6cd6f3a7b59e1956 100644
--- a/include/llvm/Transforms/Scalar/LoopPassManager.h
+++ b/include/llvm/Transforms/Scalar/LoopPassManager.h
@@ -352,7 +352,11 @@ public:
         continue;
       PreservedAnalyses PassPA = Pass.run(*L, LAM, LAR, Updater);
 
-      PI.runAfterPass<Loop>(Pass, *L);
+      // Do not pass deleted Loop into the instrumentation.
+      if (Updater.skipCurrentLoop())
+        PI.runAfterPassInvalidated<Loop>(Pass);
+      else
+        PI.runAfterPass<Loop>(Pass, *L);
 
       // FIXME: We should verify the set of analyses relevant to Loop passes
       // are preserved.
diff --git a/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
index 20c9a26b98cf090caea2e5b452776b3e66457f63..e38e983cc9eb69b3d3efc3f338f506ef13dc7a6a 100644
--- a/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -24,8 +24,14 @@ class LPMUpdater;
 class LoopFullUnrollPass : public PassInfoMixin<LoopFullUnrollPass> {
   const int OptLevel;
 
+  /// If false, use a cost model to determine whether unrolling of a loop is
+  /// profitable. If true, only loops that explicitly request unrolling via
+  /// metadata are considered. All other loops are skipped.
+  const bool OnlyWhenForced;
+
 public:
-  explicit LoopFullUnrollPass(int OptLevel = 2) : OptLevel(OptLevel) {}
+  explicit LoopFullUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false)
+      : OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced) {}
 
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
@@ -50,7 +56,13 @@ struct LoopUnrollOptions {
   Optional<bool> AllowUpperBound;
   int OptLevel;
 
-  LoopUnrollOptions(int OptLevel = 2) : OptLevel(OptLevel) {}
+  /// If false, use a cost model to determine whether unrolling of a loop is
+  /// profitable. If true, only loops that explicitly request unrolling via
+  /// metadata are considered. All other loops are skipped.
+  bool OnlyWhenForced;
+
+  LoopUnrollOptions(int OptLevel = 2, bool OnlyWhenForced = false)
+      : OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced) {}
 
   /// Enables or disables partial unrolling. When disabled only full unrolling
   /// is allowed.
diff --git a/include/llvm/Transforms/Scalar/MakeGuardsExplicit.h b/include/llvm/Transforms/Scalar/MakeGuardsExplicit.h
new file mode 100644
index 0000000000000000000000000000000000000000..41b4aada2baac97310b301167139e46611ec1b9a
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/MakeGuardsExplicit.h
@@ -0,0 +1,47 @@
+//===-- MakeGuardsExplicit.h - Turn guard intrinsics into guard branches --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the @llvm.experimental.guard intrinsic to the new form of
+// guard represented as widenable explicit branch to the deopt block. The
+// difference between this pass and LowerGuardIntrinsic is that after this pass
+// the guard represented as intrinsic:
+//
+//   call void(i1, ...) @llvm.experimental.guard(i1 %old_cond) [ "deopt"() ]
+//
+// transforms to a guard represented as widenable explicit branch:
+//
+//   %widenable_cond = call i1 @llvm.experimental.widenable.condition()
+//   br i1 (%old_cond & %widenable_cond), label %guarded, label %deopt
+//
+// Here:
+//   - The semantics of @llvm.experimental.widenable.condition allows to replace
+//     %widenable_cond with the construction (%widenable_cond & %any_other_cond)
+//     without loss of correctness;
+//   - %guarded is the lower part of old guard intrinsic's parent block split by
+//     the intrinsic call;
+//   - %deopt is a block containing a sole call to @llvm.experimental.deoptimize
+//     intrinsic.
+//
+// Therefore, this branch preserves the property of widenability.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_SCALAR_MAKEGUARDSEXPLICIT_H
+#define LLVM_TRANSFORMS_SCALAR_MAKEGUARDSEXPLICIT_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+struct MakeGuardsExplicitPass : public PassInfoMixin<MakeGuardsExplicitPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // namespace llvm
+
+#endif //LLVM_TRANSFORMS_SCALAR_MAKEGUARDSEXPLICIT_H
diff --git a/include/llvm/Transforms/Scalar/SCCP.h b/include/llvm/Transforms/Scalar/SCCP.h
index e434d41d034fd92f9eee1be5ebc9fcf867b9a3f5..0abbb32fde6a03cd78a5bc21e8826f21acb85bc6 100644
--- a/include/llvm/Transforms/Scalar/SCCP.h
+++ b/include/llvm/Transforms/Scalar/SCCP.h
@@ -31,7 +31,7 @@
 
 namespace llvm {
 
-class Function;
+class PostDominatorTree;
 
 /// This pass performs function-level constant propagation and merging.
 class SCCPPass : public PassInfoMixin<SCCPPass> {
@@ -39,9 +39,15 @@ public:
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
-bool runIPSCCP(
-    Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI,
-    function_ref<std::unique_ptr<PredicateInfo>(Function &)> getPredicateInfo);
+/// Helper struct for bundling up the analysis results per function for IPSCCP.
+struct AnalysisResultsForFn {
+  std::unique_ptr<PredicateInfo> PredInfo;
+  DominatorTree *DT;
+  PostDominatorTree *PDT;
+};
+
+bool runIPSCCP(Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI,
+               function_ref<AnalysisResultsForFn(Function &)> getAnalysis);
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_SCALAR_SCCP_H
diff --git a/include/llvm/Transforms/Scalar/Scalarizer.h b/include/llvm/Transforms/Scalar/Scalarizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a0b9a2b638c8a2b77ab3f3a300a468ddc3db1a9
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/Scalarizer.h
@@ -0,0 +1,35 @@
+//===- Scalarizer.h --- Scalarize vector operations -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass converts vector operations into scalar operations, in order
+/// to expose optimization opportunities on the individual scalar operations.
+/// It is mainly intended for targets that do not have vector units, but it
+/// may also be useful for revectorizing code to different vector widths.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_SCALARIZER_H
+#define LLVM_TRANSFORMS_SCALAR_SCALARIZER_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class ScalarizerPass : public PassInfoMixin<ScalarizerPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// Create a legacy pass manager instance of the Scalarizer pass
+FunctionPass *createScalarizerPass();
+
+}
+
+#endif /* LLVM_TRANSFORMS_SCALAR_SCALARIZER_H */
diff --git a/include/llvm/Transforms/Scalar/WarnMissedTransforms.h b/include/llvm/Transforms/Scalar/WarnMissedTransforms.h
new file mode 100644
index 0000000000000000000000000000000000000000..018b22a932e6cf1bd95b190d76f7c14f04967080
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/WarnMissedTransforms.h
@@ -0,0 +1,38 @@
+//===- WarnMissedTransforms.h -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Emit warnings if forced code transformations have not been performed.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_WARNMISSEDTRANSFORMS_H
+#define LLVM_TRANSFORMS_SCALAR_WARNMISSEDTRANSFORMS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Function;
+class Loop;
+class LPMUpdater;
+
+// New pass manager boilerplate.
+class WarnMissedTransformationsPass
+    : public PassInfoMixin<WarnMissedTransformationsPass> {
+public:
+  explicit WarnMissedTransformationsPass() {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+// Legacy pass manager boilerplate.
+Pass *createWarnMissedTransformationsPass();
+void initializeWarnMissedTransformationsLegacyPass(PassRegistry &);
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_WARNMISSEDTRANSFORMS_H
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index d7dce53fc76141caa29201a1fb0d056a55a40bdc..f5e997324fc8917d05321adb2dc0bab6165d9ddb 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -47,6 +47,7 @@ class LoopInfo;
 class Module;
 class ProfileSummaryInfo;
 class ReturnInst;
+class DomTreeUpdater;
 
 /// Return an exact copy of the specified module
 std::unique_ptr<Module> CloneModule(const Module &M);
@@ -262,11 +263,12 @@ void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
 /// we replace them with the uses of corresponding Phi inputs. ValueMapping
 /// is used to map the original instructions from BB to their newly-created
 /// copies. Returns the split block.
-BasicBlock *
-DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
-                                    Instruction *StopAt,
-                                    ValueToValueMapTy &ValueMapping,
-                                    DominatorTree *DT = nullptr);
+BasicBlock *DuplicateInstructionsInSplitBetween(BasicBlock *BB,
+                                                BasicBlock *PredBB,
+                                                Instruction *StopAt,
+                                                ValueToValueMapTy &ValueMapping,
+                                                DomTreeUpdater &DTU);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_CLONING_H
diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h
index 13bef8418057002230a375cdff52094265a0e223..498ff5a7883f3157b2760f7e8eb82cec8d74694b 100644
--- a/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include <limits>
 
 namespace llvm {
@@ -146,7 +147,8 @@ class Value;
     BasicBlock *findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock);
 
   private:
-    void severSplitPHINodes(BasicBlock *&Header);
+    void severSplitPHINodesOfEntry(BasicBlock *&Header);
+    void severSplitPHINodesOfExits(const SmallPtrSetImpl<BasicBlock *> &Exits);
     void splitReturnBlocks();
 
     Function *constructFunction(const ValueSet &inputs,
diff --git a/include/llvm/Transforms/Utils/FunctionImportUtils.h b/include/llvm/Transforms/Utils/FunctionImportUtils.h
index b9fbef04cdc3d6a7c0741a12b0cd41cef8c28d28..e24398b90012b355d0f409df813d3f758b1fb0c7 100644
--- a/include/llvm/Transforms/Utils/FunctionImportUtils.h
+++ b/include/llvm/Transforms/Utils/FunctionImportUtils.h
@@ -114,6 +114,9 @@ bool renameModuleForThinLTO(
     Module &M, const ModuleSummaryIndex &Index,
     SetVector<GlobalValue *> *GlobalsToImport = nullptr);
 
+/// Compute synthetic function entry counts.
+void computeSyntheticCounts(ModuleSummaryIndex &Index);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index 86a32bb63005984407cc3b0ffc2592565626901c..ec8b0eda36416a38c57207503c1c49d0fad3aefd 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -174,6 +174,12 @@ bool RecursivelyDeleteDeadPHINode(PHINode *PN,
 bool SimplifyInstructionsInBlock(BasicBlock *BB,
                                  const TargetLibraryInfo *TLI = nullptr);
 
+/// Replace all the uses of an SSA value in @llvm.dbg intrinsics with
+/// undef. This is useful for signaling that a variable, e.g. has been
+/// found dead and hence it's unavailable at a given program point.
+/// Returns true if the dbg values have been changed.
+bool replaceDbgUsesWithUndef(Instruction *I);
+
 //===----------------------------------------------------------------------===//
 //  Control Flow Graph Restructuring.
 //
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index f642852275c04fe066ab814019288045c905e361..faf9131e6aca44d3a005481676632791c854a496 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -171,6 +171,77 @@ SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L);
 Optional<const MDOperand *> findStringMetadataForLoop(Loop *TheLoop,
                                                       StringRef Name);
 
+/// Find named metadata for a loop with an integer value.
+llvm::Optional<int> getOptionalIntLoopAttribute(Loop *TheLoop, StringRef Name);
+
+/// Create a new loop identifier for a loop created from a loop transformation.
+///
+/// @param OrigLoopID The loop ID of the loop before the transformation.
+/// @param FollowupAttrs List of attribute names that contain attributes to be
+///                      added to the new loop ID.
+/// @param InheritOptionsAttrsPrefix Selects which attributes should be inherited
+///                                  from the original loop. The following values
+///                                  are considered:
+///        nullptr   : Inherit all attributes from @p OrigLoopID.
+///        ""        : Do not inherit any attribute from @p OrigLoopID; only use
+///                    those specified by a followup attribute.
+///        "<prefix>": Inherit all attributes except those which start with
+///                    <prefix>; commonly used to remove metadata for the
+///                    applied transformation.
+/// @param AlwaysNew If true, do not try to reuse OrigLoopID and never return
+///                  None.
+///
+/// @return The loop ID for the after-transformation loop. The following values
+///         can be returned:
+///         None         : No followup attribute was found; it is up to the
+///                        transformation to choose attributes that make sense.
+///         @p OrigLoopID: The original identifier can be reused.
+///         nullptr      : The new loop has no attributes.
+///         MDNode*      : A new unique loop identifier.
+Optional<MDNode *>
+makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef<StringRef> FollowupAttrs,
+                   const char *InheritOptionsAttrsPrefix = "",
+                   bool AlwaysNew = false);
+
+/// Look for the loop attribute that disables all transformation heuristic.
+bool hasDisableAllTransformsHint(const Loop *L);
+
+/// The mode sets how eager a transformation should be applied.
+enum TransformationMode {
+  /// The pass can use heuristics to determine whether a transformation should
+  /// be applied.
+  TM_Unspecified,
+
+  /// The transformation should be applied without considering a cost model.
+  TM_Enable,
+
+  /// The transformation should not be applied.
+  TM_Disable,
+
+  /// Force is a flag and should not be used alone.
+  TM_Force = 0x04,
+
+  /// The transformation was directed by the user, e.g. by a #pragma in
+  /// the source code. If the transformation could not be applied, a
+  /// warning should be emitted.
+  TM_ForcedByUser = TM_Enable | TM_Force,
+
+  /// The transformation must not be applied. For instance, `#pragma clang loop
+  /// unroll(disable)` explicitly forbids any unrolling to take place. Unlike
+  /// general loop metadata, it must not be dropped. Most passes should not
+  /// behave differently under TM_Disable and TM_SuppressedByUser.
+  TM_SuppressedByUser = TM_Disable | TM_Force
+};
+
+/// @{
+/// Get the mode for LLVM's supported loop transformations.
+TransformationMode hasUnrollTransformation(Loop *L);
+TransformationMode hasUnrollAndJamTransformation(Loop *L);
+TransformationMode hasVectorizeTransformation(Loop *L);
+TransformationMode hasDistributeTransformation(Loop *L);
+TransformationMode hasLICMVersioningTransformation(Loop *L);
+/// @}
+
 /// Set input string into loop metadata by keeping other values intact.
 void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
                              unsigned V = 0);
diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h
index a6b84af068a52c47fc8278f7de0e1c73c3aef5a6..70e936d75008a30ebcac45141013ba0f2eae8260 100644
--- a/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -35,6 +35,15 @@ class ScalarEvolution;
 
 using NewLoopsMap = SmallDenseMap<const Loop *, Loop *, 4>;
 
+/// @{
+/// Metadata attribute names
+const char *const LLVMLoopUnrollFollowupAll = "llvm.loop.unroll.followup_all";
+const char *const LLVMLoopUnrollFollowupUnrolled =
+    "llvm.loop.unroll.followup_unrolled";
+const char *const LLVMLoopUnrollFollowupRemainder =
+    "llvm.loop.unroll.followup_remainder";
+/// @}
+
 const Loop* addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
                                      BasicBlock *ClonedBB, LoopInfo *LI,
                                      NewLoopsMap &NewLoops);
@@ -61,15 +70,16 @@ LoopUnrollResult UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
                             unsigned PeelCount, bool UnrollRemainder,
                             LoopInfo *LI, ScalarEvolution *SE,
                             DominatorTree *DT, AssumptionCache *AC,
-                            OptimizationRemarkEmitter *ORE, bool PreserveLCSSA);
+                            OptimizationRemarkEmitter *ORE, bool PreserveLCSSA,
+                            Loop **RemainderLoop = nullptr);
 
 bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
                                 bool AllowExpensiveTripCount,
                                 bool UseEpilogRemainder, bool UnrollRemainder,
-                                LoopInfo *LI,
-                                ScalarEvolution *SE, DominatorTree *DT,
-                                AssumptionCache *AC,
-                                bool PreserveLCSSA);
+                                LoopInfo *LI, ScalarEvolution *SE,
+                                DominatorTree *DT, AssumptionCache *AC,
+                                bool PreserveLCSSA,
+                                Loop **ResultLoop = nullptr);
 
 void computePeelCount(Loop *L, unsigned LoopSize,
                       TargetTransformInfo::UnrollingPreferences &UP,
@@ -84,7 +94,8 @@ LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                                   unsigned TripMultiple, bool UnrollRemainder,
                                   LoopInfo *LI, ScalarEvolution *SE,
                                   DominatorTree *DT, AssumptionCache *AC,
-                                  OptimizationRemarkEmitter *ORE);
+                                  OptimizationRemarkEmitter *ORE,
+                                  Loop **EpilogueLoop = nullptr);
 
 bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                           DependenceInfo &DI);
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index 950af7ffe05f5ca074209f278b1628b58e1a4c1d..70f9a2e0741b3a1aebcbe5c65a30c7fc872f9b52 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -110,8 +110,8 @@ struct VectorizeConfig {
 //
 // LoopVectorize - Create a loop vectorization pass.
 //
-Pass *createLoopVectorizePass(bool NoUnrolling = false,
-                              bool AlwaysVectorize = true);
+Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced = false,
+                              bool VectorizeOnlyWhenForced = false);
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h b/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b37d7093c442c7c4c99c9e40ca631a660532638
--- /dev/null
+++ b/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h
@@ -0,0 +1,27 @@
+//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOADSTOREVECTORIZER_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOADSTOREVECTORIZER_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class LoadStoreVectorizerPass : public PassInfoMixin<LoadStoreVectorizerPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// Create a legacy pass manager instance of the LoadStoreVectorizer pass
+Pass *createLoadStoreVectorizerPass();
+
+}
+
+#endif /* LLVM_TRANSFORMS_VECTORIZE_LOADSTOREVECTORIZER_H */
diff --git a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index ceb660daa28cb5414e22a39bae2ac3b198be7510..5c7bba048607e5ef7e6e40c348956c8b0f527b24 100644
--- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -95,7 +95,7 @@ public:
     FK_Enabled = 1,    ///< Forcing enabled.
   };
 
-  LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
+  LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced,
                      OptimizationRemarkEmitter &ORE);
 
   /// Mark the loop L as already vectorized by setting the width to 1.
@@ -105,7 +105,8 @@ public:
     writeHintsToMetadata(Hints);
   }
 
-  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const;
+  bool allowVectorization(Function *F, Loop *L,
+                          bool VectorizeOnlyWhenForced) const;
 
   /// Dumps all the hint information.
   void emitRemarkWithHints() const;
@@ -113,7 +114,12 @@ public:
   unsigned getWidth() const { return Width.Value; }
   unsigned getInterleave() const { return Interleave.Value; }
   unsigned getIsVectorized() const { return IsVectorized.Value; }
-  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+  enum ForceKind getForce() const {
+    if ((ForceKind)Force.Value == FK_Undefined &&
+        hasDisableAllTransformsHint(TheLoop))
+      return FK_Disabled;
+    return (ForceKind)Force.Value;
+  }
 
   /// If hints are provided that force vectorization, use the AlwaysPrint
   /// pass name to force the frontend to print the diagnostic.
diff --git a/include/llvm/Transforms/Vectorize/LoopVectorize.h b/include/llvm/Transforms/Vectorize/LoopVectorize.h
index d79d8469180347cac44dbf50e068cbbb8a7f4c8a..d9c4f7b023c14fa689463aebf7b70cbecf9a3289 100644
--- a/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -78,12 +78,13 @@ class TargetTransformInfo;
 
 /// The LoopVectorize Pass.
 struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
-  bool DisableUnrolling = false;
+  /// If false, consider all loops for interleaving.
+  /// If true, only loops that explicitly request interleaving are considered.
+  bool InterleaveOnlyWhenForced = false;
 
-  /// If true, consider all loops for vectorization.
-  /// If false, only loops that explicitly request vectorization are
-  /// considered.
-  bool AlwaysVectorize = true;
+  /// If false, consider all loops for vectorization.
+  /// If true, only loops that explicitly request vectorization are considered.
+  bool VectorizeOnlyWhenForced = false;
 
   ScalarEvolution *SE;
   LoopInfo *LI;
diff --git a/include/llvm/XRay/FDRRecordProducer.h b/include/llvm/XRay/FDRRecordProducer.h
index e67e22873f8d91cc1beeabb6935877e4f2ab8eb1..efdba2a67b7bec66cede54e29c97ca06ba912346 100644
--- a/include/llvm/XRay/FDRRecordProducer.h
+++ b/include/llvm/XRay/FDRRecordProducer.h
@@ -29,6 +29,11 @@ class FileBasedRecordProducer : public RecordProducer {
   const XRayFileHeader &Header;
   DataExtractor &E;
   uint32_t &OffsetPtr;
+  uint32_t CurrentBufferBytes = 0;
+
+  // Helper function which gets the next record by speculatively reading through
+  // the log, finding a buffer extents record.
+  Expected<std::unique_ptr<Record>> findNextBufferExtent();
 
 public:
   FileBasedRecordProducer(const XRayFileHeader &FH, DataExtractor &DE,
diff --git a/include/llvm/XRay/FDRRecords.h b/include/llvm/XRay/FDRRecords.h
index 9d48332d5083d15e56b732937561c0e039486482..8a84f4d0c1fbc6e7427108005dab0a50de704fe6 100644
--- a/include/llvm/XRay/FDRRecords.h
+++ b/include/llvm/XRay/FDRRecords.h
@@ -14,10 +14,14 @@
 #ifndef LLVM_LIB_XRAY_FDRRECORDS_H_
 #define LLVM_LIB_XRAY_FDRRECORDS_H_
 
+#include <cstdint>
+#include <string>
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/XRayRecord.h"
-#include <cstdint>
 
 namespace llvm {
 namespace xray {
@@ -26,21 +30,37 @@ class RecordVisitor;
 class RecordInitializer;
 
 class Record {
-protected:
-  enum class Type {
-    Unknown,
-    Function,
-    Metadata,
+public:
+  enum class RecordKind {
+    RK_Metadata,
+    RK_Metadata_BufferExtents,
+    RK_Metadata_WallClockTime,
+    RK_Metadata_NewCPUId,
+    RK_Metadata_TSCWrap,
+    RK_Metadata_CustomEvent,
+    RK_Metadata_CustomEventV5,
+    RK_Metadata_CallArg,
+    RK_Metadata_PIDEntry,
+    RK_Metadata_NewBuffer,
+    RK_Metadata_EndOfBuffer,
+    RK_Metadata_TypedEvent,
+    RK_Metadata_LastMetadata,
+    RK_Function,
   };
 
+  static StringRef kindToString(RecordKind K);
+
+private:
+  const RecordKind T;
+
 public:
   Record(const Record &) = delete;
   Record(Record &&) = delete;
   Record &operator=(const Record &) = delete;
   Record &operator=(Record &&) = delete;
-  Record() = default;
+  explicit Record(RecordKind T) : T(T) {}
 
-  virtual Type type() const = 0;
+  RecordKind getRecordType() const { return T; }
 
   // Each Record should be able to apply an abstract visitor, and choose the
   // appropriate function in the visitor to invoke, given its own type.
@@ -50,10 +70,6 @@ public:
 };
 
 class MetadataRecord : public Record {
-protected:
-  static constexpr int kMetadataBodySize = 15;
-  friend class RecordInitializer;
-
 public:
   enum class MetadataType : unsigned {
     Unknown,
@@ -69,11 +85,22 @@ public:
     TypedEvent,
   };
 
-  Type type() const override { return Type::Metadata; }
+protected:
+  static constexpr int kMetadataBodySize = 15;
+  friend class RecordInitializer;
+
+private:
+  const MetadataType MT;
 
-  // All metadata records must know to provide the type of their open
-  // metadata record.
-  virtual MetadataType metadataType() const = 0;
+public:
+  explicit MetadataRecord(RecordKind T, MetadataType M) : Record(T), MT(M) {}
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() >= RecordKind::RK_Metadata &&
+           R->getRecordType() <= RecordKind::RK_Metadata_LastMetadata;
+  }
+
+  MetadataType metadataType() const { return MT; }
 
   virtual ~MetadataRecord() = default;
 };
@@ -86,16 +113,22 @@ class BufferExtents : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  BufferExtents() = default;
-  explicit BufferExtents(uint64_t S) : MetadataRecord(), Size(S) {}
+  BufferExtents()
+      : MetadataRecord(RecordKind::RK_Metadata_BufferExtents,
+                       MetadataType::BufferExtents) {}
 
-  MetadataType metadataType() const override {
-    return MetadataType::BufferExtents;
-  }
+  explicit BufferExtents(uint64_t S)
+      : MetadataRecord(RecordKind::RK_Metadata_BufferExtents,
+                       MetadataType::BufferExtents),
+        Size(S) {}
 
   uint64_t size() const { return Size; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_BufferExtents;
+  }
 };
 
 class WallclockRecord : public MetadataRecord {
@@ -104,18 +137,23 @@ class WallclockRecord : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  WallclockRecord() = default;
-  explicit WallclockRecord(uint64_t S, uint32_t N)
-      : MetadataRecord(), Seconds(S), Nanos(N) {}
+  WallclockRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_WallClockTime,
+                       MetadataType::WallClockTime) {}
 
-  MetadataType metadataType() const override {
-    return MetadataType::WallClockTime;
-  }
+  explicit WallclockRecord(uint64_t S, uint32_t N)
+      : MetadataRecord(RecordKind::RK_Metadata_WallClockTime,
+                       MetadataType::WallClockTime),
+        Seconds(S), Nanos(N) {}
 
   uint64_t seconds() const { return Seconds; }
   uint32_t nanos() const { return Nanos; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_WallClockTime;
+  }
 };
 
 class NewCPUIDRecord : public MetadataRecord {
@@ -124,16 +162,24 @@ class NewCPUIDRecord : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  NewCPUIDRecord() = default;
-  NewCPUIDRecord(uint16_t C, uint64_t T) : MetadataRecord(), CPUId(C), TSC(T) {}
+  NewCPUIDRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_NewCPUId,
+                       MetadataType::NewCPUId) {}
 
-  MetadataType metadataType() const override { return MetadataType::NewCPUId; }
+  NewCPUIDRecord(uint16_t C, uint64_t T)
+      : MetadataRecord(RecordKind::RK_Metadata_NewCPUId,
+                       MetadataType::NewCPUId),
+        CPUId(C), TSC(T) {}
 
   uint16_t cpuid() const { return CPUId; }
 
   uint64_t tsc() const { return TSC; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_NewCPUId;
+  }
 };
 
 class TSCWrapRecord : public MetadataRecord {
@@ -141,14 +187,21 @@ class TSCWrapRecord : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  TSCWrapRecord() = default;
-  explicit TSCWrapRecord(uint64_t B) : MetadataRecord(), BaseTSC(B) {}
+  TSCWrapRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_TSCWrap, MetadataType::TSCWrap) {
+  }
 
-  MetadataType metadataType() const override { return MetadataType::TSCWrap; }
+  explicit TSCWrapRecord(uint64_t B)
+      : MetadataRecord(RecordKind::RK_Metadata_TSCWrap, MetadataType::TSCWrap),
+        BaseTSC(B) {}
 
   uint64_t tsc() const { return BaseTSC; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_TSCWrap;
+  }
 };
 
 class CustomEventRecord : public MetadataRecord {
@@ -159,13 +212,14 @@ class CustomEventRecord : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  CustomEventRecord() = default;
-  explicit CustomEventRecord(uint64_t S, uint64_t T, uint16_t C, std::string D)
-      : MetadataRecord(), Size(S), TSC(T), CPU(C), Data(std::move(D)) {}
+  CustomEventRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_CustomEvent,
+                       MetadataType::CustomEvent) {}
 
-  MetadataType metadataType() const override {
-    return MetadataType::CustomEvent;
-  }
+  explicit CustomEventRecord(uint64_t S, uint64_t T, uint16_t C, std::string D)
+      : MetadataRecord(RecordKind::RK_Metadata_CustomEvent,
+                       MetadataType::CustomEvent),
+        Size(S), TSC(T), CPU(C), Data(std::move(D)) {}
 
   int32_t size() const { return Size; }
   uint64_t tsc() const { return TSC; }
@@ -173,6 +227,10 @@ public:
   StringRef data() const { return Data; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_CustomEvent;
+  }
 };
 
 class CustomEventRecordV5 : public MetadataRecord {
@@ -182,19 +240,24 @@ class CustomEventRecordV5 : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  CustomEventRecordV5() = default;
-  explicit CustomEventRecordV5(int32_t S, int32_t D, std::string P)
-      : MetadataRecord(), Size(S), Delta(D), Data(std::move(P)) {}
+  CustomEventRecordV5()
+      : MetadataRecord(RecordKind::RK_Metadata_CustomEventV5,
+                       MetadataType::CustomEvent) {}
 
-  MetadataType metadataType() const override {
-    return MetadataType::CustomEvent;
-  }
+  explicit CustomEventRecordV5(int32_t S, int32_t D, std::string P)
+      : MetadataRecord(RecordKind::RK_Metadata_CustomEventV5,
+                       MetadataType::CustomEvent),
+        Size(S), Delta(D), Data(std::move(P)) {}
 
   int32_t size() const { return Size; }
   int32_t delta() const { return Delta; }
   StringRef data() const { return Data; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_CustomEventV5;
+  }
 };
 
 class TypedEventRecord : public MetadataRecord {
@@ -205,13 +268,14 @@ class TypedEventRecord : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  TypedEventRecord() = default;
-  explicit TypedEventRecord(int32_t S, int32_t D, uint16_t E, std::string P)
-      : MetadataRecord(), Size(S), Delta(D), Data(std::move(P)) {}
+  TypedEventRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_TypedEvent,
+                       MetadataType::TypedEvent) {}
 
-  MetadataType metadataType() const override {
-    return MetadataType::TypedEvent;
-  }
+  explicit TypedEventRecord(int32_t S, int32_t D, uint16_t E, std::string P)
+      : MetadataRecord(RecordKind::RK_Metadata_TypedEvent,
+                       MetadataType::TypedEvent),
+        Size(S), Delta(D), Data(std::move(P)) {}
 
   int32_t size() const { return Size; }
   int32_t delta() const { return Delta; }
@@ -219,6 +283,10 @@ public:
   StringRef data() const { return Data; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_TypedEvent;
+  }
 };
 
 class CallArgRecord : public MetadataRecord {
@@ -226,14 +294,21 @@ class CallArgRecord : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  CallArgRecord() = default;
-  explicit CallArgRecord(uint64_t A) : MetadataRecord(), Arg(A) {}
+  CallArgRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_CallArg, MetadataType::CallArg) {
+  }
 
-  MetadataType metadataType() const override { return MetadataType::CallArg; }
+  explicit CallArgRecord(uint64_t A)
+      : MetadataRecord(RecordKind::RK_Metadata_CallArg, MetadataType::CallArg),
+        Arg(A) {}
 
   uint64_t arg() const { return Arg; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_CallArg;
+  }
 };
 
 class PIDRecord : public MetadataRecord {
@@ -241,14 +316,22 @@ class PIDRecord : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  PIDRecord() = default;
-  explicit PIDRecord(int32_t P) : MetadataRecord(), PID(P) {}
+  PIDRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_PIDEntry,
+                       MetadataType::PIDEntry) {}
 
-  MetadataType metadataType() const override { return MetadataType::PIDEntry; }
+  explicit PIDRecord(int32_t P)
+      : MetadataRecord(RecordKind::RK_Metadata_PIDEntry,
+                       MetadataType::PIDEntry),
+        PID(P) {}
 
   int32_t pid() const { return PID; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_PIDEntry;
+  }
 };
 
 class NewBufferRecord : public MetadataRecord {
@@ -256,25 +339,35 @@ class NewBufferRecord : public MetadataRecord {
   friend class RecordInitializer;
 
 public:
-  NewBufferRecord() = default;
-  explicit NewBufferRecord(int32_t T) : MetadataRecord(), TID(T) {}
+  NewBufferRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_NewBuffer,
+                       MetadataType::NewBuffer) {}
 
-  MetadataType metadataType() const override { return MetadataType::NewBuffer; }
+  explicit NewBufferRecord(int32_t T)
+      : MetadataRecord(RecordKind::RK_Metadata_NewBuffer,
+                       MetadataType::NewBuffer),
+        TID(T) {}
 
   int32_t tid() const { return TID; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_NewBuffer;
+  }
 };
 
 class EndBufferRecord : public MetadataRecord {
 public:
-  EndBufferRecord() = default;
-
-  MetadataType metadataType() const override {
-    return MetadataType::EndOfBuffer;
-  }
+  EndBufferRecord()
+      : MetadataRecord(RecordKind::RK_Metadata_EndOfBuffer,
+                       MetadataType::EndOfBuffer) {}
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Metadata_EndOfBuffer;
+  }
 };
 
 class FunctionRecord : public Record {
@@ -286,11 +379,10 @@ class FunctionRecord : public Record {
   static constexpr unsigned kFunctionRecordSize = 8;
 
 public:
-  FunctionRecord() = default;
-  explicit FunctionRecord(RecordTypes K, int32_t F, uint32_t D)
-      : Record(), Kind(K), FuncId(F), Delta(D) {}
+  FunctionRecord() : Record(RecordKind::RK_Function) {}
 
-  Type type() const override { return Type::Function; }
+  explicit FunctionRecord(RecordTypes K, int32_t F, uint32_t D)
+      : Record(RecordKind::RK_Function), Kind(K), FuncId(F), Delta(D) {}
 
   // A function record is a concrete record type which has a number of common
   // properties.
@@ -299,6 +391,10 @@ public:
   uint32_t delta() const { return Delta; }
 
   Error apply(RecordVisitor &V) override;
+
+  static bool classof(const Record *R) {
+    return R->getRecordType() == RecordKind::RK_Function;
+  }
 };
 
 class RecordVisitor {
diff --git a/include/llvm/module.extern.modulemap b/include/llvm/module.extern.modulemap
new file mode 100644
index 0000000000000000000000000000000000000000..8acda137e0440832905a7730b5965b83a6b2aee8
--- /dev/null
+++ b/include/llvm/module.extern.modulemap
@@ -0,0 +1,5 @@
+module LLVM_Extern_Config_Def {}
+module LLVM_Extern_IR_Attributes_Gen {}
+module LLVM_Extern_IR_Intrinsics_Gen {}
+module LLVM_Extern_IR_Intrinsics_Enum {}
+module LLVM_Extern_Utils_DataTypes {}
diff --git a/include/llvm/module.install.modulemap b/include/llvm/module.install.modulemap
new file mode 100644
index 0000000000000000000000000000000000000000..ac73a861232657ae0453d8e10fb65c1f014e03d0
--- /dev/null
+++ b/include/llvm/module.install.modulemap
@@ -0,0 +1,27 @@
+
+module LLVM_Extern_Config_Def {
+ textual header "Config/AsmParsers.def"
+ textual header "Config/AsmPrinters.def"
+ textual header "Config/Disassemblers.def"
+ textual header "Config/Targets.def"
+ export *
+}
+
+module LLVM_Extern_IR_Attributes_Gen {
+  textual header "IR/Attributes.gen"
+  textual header "IR/Attributes.inc"
+}
+
+module LLVM_Extern_IR_Intrinsics_Gen {
+  textual header "IR/Intrinsics.gen"
+  textual header "IR/Intrinsics.inc"
+}
+
+module LLVM_Extern_IR_Intrinsics_Enum {
+  textual header "IR/IntrinsicEnums.inc"
+}
+
+module LLVM_Extern_Utils_DataTypes {
+  header "Support/DataTypes.h"
+  export *
+}
diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index c918eff2b9766db41f6415d09a5e18a5251b0c9b..3b6e2a8a0ba42d51e4eaa3273bf6d9750abf5c39 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -36,6 +36,7 @@ module LLVM_Backend {
 
 module LLVM_Bitcode { requires cplusplus umbrella "Bitcode" module * { export * } }
 
+
 module LLVM_BinaryFormat {
     requires cplusplus
     umbrella "BinaryFormat" module * { export * }
@@ -63,7 +64,12 @@ module LLVM_BinaryFormat {
     textual header "BinaryFormat/MsgPack.def"
 }
 
-module LLVM_Config { requires cplusplus umbrella "Config" module * { export * } }
+module LLVM_Config {
+  requires cplusplus
+  umbrella "Config"
+  extern module LLVM_Extern_Config_Def "module.extern.modulemap"
+  module * { export * }
+}
 
 module LLVM_DebugInfo {
   requires cplusplus
@@ -181,7 +187,11 @@ module LLVM_intrinsic_gen {
 
   // Attributes.h
   module IR_Argument { header "IR/Argument.h" export * }
-  module IR_Attributes { header "IR/Attributes.h" export * }
+  module IR_Attributes {
+    header "IR/Attributes.h"
+    extern module LLVM_Extern_IR_Attributes_Gen "module.extern.modulemap"
+    export *
+  }
   module IR_CallSite { header "IR/CallSite.h" export * }
   module IR_ConstantFolder { header "IR/ConstantFolder.h" export * }
   module IR_GlobalVariable { header "IR/GlobalVariable.h" export * }
@@ -207,7 +217,12 @@ module LLVM_intrinsic_gen {
   module IR_Verifier { header "IR/Verifier.h" export * }
   module IR_InstIterator { header "IR/InstIterator.h" export * }
   module IR_InstVisitor { header "IR/InstVisitor.h" export * }
-  module IR_Intrinsics { header "IR/Intrinsics.h" export * }
+  module IR_Intrinsics {
+    header "IR/Intrinsics.h"
+    extern module LLVM_Extern_IR_Intricsics_Gen "module.extern.modulemap"
+    extern module LLVM_Extern_IR_Intrinsics_Enum "module.extern.modulemap"
+    export *
+  }
   module IR_IntrinsicInst { header "IR/IntrinsicInst.h" export * }
   module IR_PatternMatch { header "IR/PatternMatch.h" export * }
   module IR_Statepoint { header "IR/Statepoint.h" export * }
@@ -284,6 +299,8 @@ module LLVM_Transforms {
   module * { export * }
 }
 
+extern module LLVM_Extern_Utils_DataTypes "module.extern.modulemap"
+
 // A module covering ADT/ and Support/. These are intertwined and
 // codependent, and notionally form a single module.
 module LLVM_Utils {
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 49b1bdb2e779610abfaa4b99ea4ea161a93fcb35..bb8742123a0f08beafedbf39f46899304d963c0d 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -77,6 +77,8 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeRegionOnlyPrinterPass(Registry);
   initializeSCEVAAWrapperPassPass(Registry);
   initializeScalarEvolutionWrapperPassPass(Registry);
+  initializeStackSafetyGlobalInfoWrapperPassPass(Registry);
+  initializeStackSafetyInfoWrapperPassPass(Registry);
   initializeTargetTransformInfoWrapperPassPass(Registry);
   initializeTypeBasedAAWrapperPassPass(Registry);
   initializeScopedNoAliasAAWrapperPassPass(Registry);
diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp
index 6965235326df3fe2eb7c683475323919ea547d2a..fd2292ced017d4415ef7746fc094e75974d87763 100644
--- a/lib/Analysis/CGSCCPassManager.cpp
+++ b/lib/Analysis/CGSCCPassManager.cpp
@@ -79,7 +79,10 @@ PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,
 
     PreservedAnalyses PassPA = Pass->run(*C, AM, G, UR);
 
-    PI.runAfterPass(*Pass, *C);
+    if (UR.InvalidatedSCCs.count(C))
+      PI.runAfterPassInvalidated<LazyCallGraph::SCC>(*Pass);
+    else
+      PI.runAfterPass<LazyCallGraph::SCC>(*Pass, *C);
 
     // Update the SCC if necessary.
     C = UR.UpdatedC ? UR.UpdatedC : C;
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index c33e2a88127261af7347223329d7f85cd6a9ed67..c57d8ef69d69b7d45a931af2176d6c4a08de0c04 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_library(LLVMAnalysis
   ScalarEvolutionAliasAnalysis.cpp
   ScalarEvolutionExpander.cpp
   ScalarEvolutionNormalization.cpp
+  StackSafetyAnalysis.cpp
   SyncDependenceAnalysis.cpp
   SyntheticCountsUtils.cpp
   TargetLibraryInfo.cpp
diff --git a/lib/Analysis/CallGraphSCCPass.cpp b/lib/Analysis/CallGraphSCCPass.cpp
index 80d0427529a5dfb432b8dfeaff3508f18d5d7ac3..0aed57a39387cc89b2ea218c1fd5515868b14737 100644
--- a/lib/Analysis/CallGraphSCCPass.cpp
+++ b/lib/Analysis/CallGraphSCCPass.cpp
@@ -633,23 +633,40 @@ namespace {
 
     bool runOnSCC(CallGraphSCC &SCC) override {
       bool BannerPrinted = false;
-      auto PrintBannerOnce = [&] () {
+      auto PrintBannerOnce = [&]() {
         if (BannerPrinted)
           return;
         OS << Banner;
         BannerPrinted = true;
-        };
+      };
+
+      bool NeedModule = llvm::forcePrintModuleIR();
+      if (isFunctionInPrintList("*") && NeedModule) {
+        PrintBannerOnce();
+        OS << "\n";
+        SCC.getCallGraph().getModule().print(OS, nullptr);
+        return false;
+      }
+      bool FoundFunction = false;
       for (CallGraphNode *CGN : SCC) {
         if (Function *F = CGN->getFunction()) {
           if (!F->isDeclaration() && isFunctionInPrintList(F->getName())) {
-            PrintBannerOnce();
-            F->print(OS);
+            FoundFunction = true;
+            if (!NeedModule) {
+              PrintBannerOnce();
+              F->print(OS);
+            }
           }
         } else if (isFunctionInPrintList("*")) {
           PrintBannerOnce();
           OS << "\nPrinting <null> Function\n";
         }
       }
+      if (NeedModule && FoundFunction) {
+        PrintBannerOnce();
+        OS << "\n";
+        SCC.getCallGraph().getModule().print(OS, nullptr);
+      }
       return false;
     }
 
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index d4f73bdb4361deb0cf48444e91b362e61ed67aec..5cac5e5ca616a4f9a4706f1ea398d08bda8afbce 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -158,7 +158,8 @@ namespace {
 /// storing the value (or part of it) into memory anywhere automatically
 /// counts as capturing it or not.
 bool llvm::PointerMayBeCaptured(const Value *V,
-                                bool ReturnCaptures, bool StoreCaptures) {
+                                bool ReturnCaptures, bool StoreCaptures,
+                                unsigned MaxUsesToExplore) {
   assert(!isa<GlobalValue>(V) &&
          "It doesn't make sense to ask whether a global is captured.");
 
@@ -169,7 +170,7 @@ bool llvm::PointerMayBeCaptured(const Value *V,
   (void)StoreCaptures;
 
   SimpleCaptureTracker SCT(ReturnCaptures);
-  PointerMayBeCaptured(V, &SCT);
+  PointerMayBeCaptured(V, &SCT, MaxUsesToExplore);
   return SCT.Captured;
 }
 
@@ -186,13 +187,15 @@ bool llvm::PointerMayBeCaptured(const Value *V,
 bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
                                       bool StoreCaptures, const Instruction *I,
                                       const DominatorTree *DT, bool IncludeI,
-                                      OrderedBasicBlock *OBB) {
+                                      OrderedBasicBlock *OBB,
+                                      unsigned MaxUsesToExplore) {
   assert(!isa<GlobalValue>(V) &&
          "It doesn't make sense to ask whether a global is captured.");
   bool UseNewOBB = OBB == nullptr;
 
   if (!DT)
-    return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures);
+    return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures,
+                                MaxUsesToExplore);
   if (UseNewOBB)
     OBB = new OrderedBasicBlock(I->getParent());
 
@@ -200,29 +203,25 @@ bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
   // with StoreCaptures.
 
   CapturesBefore CB(ReturnCaptures, I, DT, IncludeI, OBB);
-  PointerMayBeCaptured(V, &CB);
+  PointerMayBeCaptured(V, &CB, MaxUsesToExplore);
 
   if (UseNewOBB)
     delete OBB;
   return CB.Captured;
 }
 
-/// TODO: Write a new FunctionPass AliasAnalysis so that it can keep
-/// a cache. Then we can move the code from BasicAliasAnalysis into
-/// that path, and remove this threshold.
-static int const Threshold = 20;
-
-void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
+void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker,
+                                unsigned MaxUsesToExplore) {
   assert(V->getType()->isPointerTy() && "Capture is for pointers only!");
-  SmallVector<const Use *, Threshold> Worklist;
-  SmallSet<const Use *, Threshold> Visited;
+  SmallVector<const Use *, DefaultMaxUsesToExplore> Worklist;
+  SmallSet<const Use *, DefaultMaxUsesToExplore> Visited;
 
   auto AddUses = [&](const Value *V) {
-    int Count = 0;
+    unsigned Count = 0;
     for (const Use &U : V->uses()) {
       // If there are lots of uses, conservatively say that the value
       // is captured to avoid taking too much compile time.
-      if (Count++ >= Threshold)
+      if (Count++ >= MaxUsesToExplore)
         return Tracker->tooManyUses();
       if (!Visited.insert(&U).second)
         continue;
diff --git a/lib/Analysis/CmpInstAnalysis.cpp b/lib/Analysis/CmpInstAnalysis.cpp
index 159c1a2d135ac9d594f3ab6a69c930f64f6becf6..27071babec5c85f0924a177cce0243868c958486 100644
--- a/lib/Analysis/CmpInstAnalysis.cpp
+++ b/lib/Analysis/CmpInstAnalysis.cpp
@@ -40,28 +40,28 @@ unsigned llvm::getICmpCode(const ICmpInst *ICI, bool InvertPred) {
   }
 }
 
-Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
-                          CmpInst::Predicate &NewICmpPred) {
+Constant *llvm::getPredForICmpCode(unsigned Code, bool Sign, Type *OpTy,
+                                   CmpInst::Predicate &Pred) {
   switch (Code) {
     default: llvm_unreachable("Illegal ICmp code!");
     case 0: // False.
-      return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
-    case 1: NewICmpPred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
-    case 2: NewICmpPred = ICmpInst::ICMP_EQ; break;
-    case 3: NewICmpPred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
-    case 4: NewICmpPred = Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
-    case 5: NewICmpPred = ICmpInst::ICMP_NE; break;
-    case 6: NewICmpPred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
+      return ConstantInt::get(CmpInst::makeCmpResultType(OpTy), 0);
+    case 1: Pred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
+    case 2: Pred = ICmpInst::ICMP_EQ; break;
+    case 3: Pred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
+    case 4: Pred = Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
+    case 5: Pred = ICmpInst::ICMP_NE; break;
+    case 6: Pred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
     case 7: // True.
-      return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
+      return ConstantInt::get(CmpInst::makeCmpResultType(OpTy), 1);
   }
   return nullptr;
 }
 
-bool llvm::PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) {
-  return (CmpInst::isSigned(p1) == CmpInst::isSigned(p2)) ||
-         (CmpInst::isSigned(p1) && ICmpInst::isEquality(p2)) ||
-         (CmpInst::isSigned(p2) && ICmpInst::isEquality(p1));
+bool llvm::predicatesFoldable(ICmpInst::Predicate P1, ICmpInst::Predicate P2) {
+  return (CmpInst::isSigned(P1) == CmpInst::isSigned(P2)) ||
+         (CmpInst::isSigned(P1) && ICmpInst::isEquality(P2)) ||
+         (CmpInst::isSigned(P2) && ICmpInst::isEquality(P1));
 }
 
 bool llvm::decomposeBitTestICmp(Value *LHS, Value *RHS,
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 92b0555913758ff32837ba385c612686ef96f1f2..6180886267470e441ff51e3c00344f18e48bc6b1 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -347,9 +347,20 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
 
     // We're simulating a load through a pointer that was bitcast to point to
     // a different type, so we can try to walk down through the initial
-    // elements of an aggregate to see if some part of th e aggregate is
+    // elements of an aggregate to see if some part of the aggregate is
     // castable to implement the "load" semantic model.
-    C = C->getAggregateElement(0u);
+    if (SrcTy->isStructTy()) {
+      // Struct types might have leading zero-length elements like [0 x i32],
+      // which are certainly not what we are looking for, so skip them.
+      unsigned Elem = 0;
+      Constant *ElemC;
+      do {
+        ElemC = C->getAggregateElement(Elem++);
+      } while (ElemC && DL.getTypeSizeInBits(ElemC->getType()) == 0);
+      C = ElemC;
+    } else {
+      C = C->getAggregateElement(0u);
+    }
   } while (C);
 
   return nullptr;
@@ -1399,6 +1410,10 @@ bool llvm::canConstantFoldCallTo(ImmutableCallSite CS, const Function *F) {
   case Intrinsic::usub_with_overflow:
   case Intrinsic::smul_with_overflow:
   case Intrinsic::umul_with_overflow:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat:
   case Intrinsic::convert_from_fp16:
   case Intrinsic::convert_to_fp16:
   case Intrinsic::bitreverse:
@@ -2019,6 +2034,14 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
           };
           return ConstantStruct::get(cast<StructType>(Ty), Ops);
         }
+        case Intrinsic::uadd_sat:
+          return ConstantInt::get(Ty, Op1->getValue().uadd_sat(Op2->getValue()));
+        case Intrinsic::sadd_sat:
+          return ConstantInt::get(Ty, Op1->getValue().sadd_sat(Op2->getValue()));
+        case Intrinsic::usub_sat:
+          return ConstantInt::get(Ty, Op1->getValue().usub_sat(Op2->getValue()));
+        case Intrinsic::ssub_sat:
+          return ConstantInt::get(Ty, Op1->getValue().ssub_sat(Op2->getValue()));
         case Intrinsic::cttz:
           if (Op2->isOne() && Op1->isZero()) // cttz(0, 1) is undef.
             return UndefValue::get(Ty);
diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp
index 35af4be485f821f3320e583b0ed262aef18480d7..0382787fbefe788e0209bdae08a5427ca01c9bee 100644
--- a/lib/Analysis/DemandedBits.cpp
+++ b/lib/Analysis/DemandedBits.cpp
@@ -39,6 +39,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/Pass.h"
@@ -50,6 +51,7 @@
 #include <cstdint>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "demanded-bits"
 
@@ -142,6 +144,28 @@ void DemandedBits::determineLiveOperandBits(
                  std::min(BitWidth, Known.countMaxTrailingZeros()+1));
         }
         break;
+      case Intrinsic::fshl:
+      case Intrinsic::fshr: {
+        const APInt *SA;
+        if (OperandNo == 2) {
+          // Shift amount is modulo the bitwidth. For powers of two we have
+          // SA % BW == SA & (BW - 1).
+          if (isPowerOf2_32(BitWidth))
+            AB = BitWidth - 1;
+        } else if (match(II->getOperand(2), m_APInt(SA))) {
+          // Normalize to funnel shift left. APInt shifts of BitWidth are well-
+          // defined, so no need to special-case zero shifts here.
+          uint64_t ShiftAmt = SA->urem(BitWidth);
+          if (II->getIntrinsicID() == Intrinsic::fshr)
+            ShiftAmt = BitWidth - ShiftAmt;
+
+          if (OperandNo == 0)
+            AB = AOut.lshr(ShiftAmt);
+          else if (OperandNo == 1)
+            AB = AOut.shl(BitWidth - ShiftAmt);
+        }
+        break;
+      }
       }
     break;
   case Instruction::Add:
@@ -153,8 +177,9 @@ void DemandedBits::determineLiveOperandBits(
     AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits());
     break;
   case Instruction::Shl:
-    if (OperandNo == 0)
-      if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+    if (OperandNo == 0) {
+      const APInt *ShiftAmtC;
+      if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) {
         uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
         AB = AOut.lshr(ShiftAmt);
 
@@ -166,10 +191,12 @@ void DemandedBits::determineLiveOperandBits(
         else if (S->hasNoUnsignedWrap())
           AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
       }
+    }
     break;
   case Instruction::LShr:
-    if (OperandNo == 0)
-      if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+    if (OperandNo == 0) {
+      const APInt *ShiftAmtC;
+      if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) {
         uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
         AB = AOut.shl(ShiftAmt);
 
@@ -178,10 +205,12 @@ void DemandedBits::determineLiveOperandBits(
         if (cast<LShrOperator>(UserI)->isExact())
           AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
       }
+    }
     break;
   case Instruction::AShr:
-    if (OperandNo == 0)
-      if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+    if (OperandNo == 0) {
+      const APInt *ShiftAmtC;
+      if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) {
         uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
         AB = AOut.shl(ShiftAmt);
         // Because the high input bit is replicated into the
@@ -196,6 +225,7 @@ void DemandedBits::determineLiveOperandBits(
         if (cast<AShrOperator>(UserI)->isExact())
           AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
       }
+    }
     break;
   case Instruction::And:
     AB = AOut;
@@ -253,6 +283,15 @@ void DemandedBits::determineLiveOperandBits(
     if (OperandNo != 0)
       AB = AOut;
     break;
+  case Instruction::ExtractElement:
+    if (OperandNo == 0)
+      AB = AOut;
+    break;
+  case Instruction::InsertElement:
+  case Instruction::ShuffleVector:
+    if (OperandNo == 0 || OperandNo == 1)
+      AB = AOut;
+    break;
   }
 }
 
@@ -288,8 +327,9 @@ void DemandedBits::performAnalysis() {
     // bits and add the instruction to the work list. For other instructions
     // add their operands to the work list (for integer values operands, mark
     // all bits as live).
-    if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
-      if (AliveBits.try_emplace(&I, IT->getBitWidth(), 0).second)
+    Type *T = I.getType();
+    if (T->isIntOrIntVectorTy()) {
+      if (AliveBits.try_emplace(&I, T->getScalarSizeInBits(), 0).second)
         Worklist.push_back(&I);
 
       continue;
@@ -298,8 +338,9 @@ void DemandedBits::performAnalysis() {
     // Non-integer-typed instructions...
     for (Use &OI : I.operands()) {
       if (Instruction *J = dyn_cast<Instruction>(OI)) {
-        if (IntegerType *IT = dyn_cast<IntegerType>(J->getType()))
-          AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth());
+        Type *T = J->getType();
+        if (T->isIntOrIntVectorTy())
+          AliveBits[J] = APInt::getAllOnesValue(T->getScalarSizeInBits());
         Worklist.push_back(J);
       }
     }
@@ -315,13 +356,13 @@ void DemandedBits::performAnalysis() {
 
     LLVM_DEBUG(dbgs() << "DemandedBits: Visiting: " << *UserI);
     APInt AOut;
-    if (UserI->getType()->isIntegerTy()) {
+    if (UserI->getType()->isIntOrIntVectorTy()) {
       AOut = AliveBits[UserI];
       LLVM_DEBUG(dbgs() << " Alive Out: " << AOut);
     }
     LLVM_DEBUG(dbgs() << "\n");
 
-    if (!UserI->getType()->isIntegerTy())
+    if (!UserI->getType()->isIntOrIntVectorTy())
       Visited.insert(UserI);
 
     KnownBits Known, Known2;
@@ -330,10 +371,11 @@ void DemandedBits::performAnalysis() {
     // operand is added to the work-list.
     for (Use &OI : UserI->operands()) {
       if (Instruction *I = dyn_cast<Instruction>(OI)) {
-        if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) {
-          unsigned BitWidth = IT->getBitWidth();
+        Type *T = I->getType();
+        if (T->isIntOrIntVectorTy()) {
+          unsigned BitWidth = T->getScalarSizeInBits();
           APInt AB = APInt::getAllOnesValue(BitWidth);
-          if (UserI->getType()->isIntegerTy() && !AOut &&
+          if (UserI->getType()->isIntOrIntVectorTy() && !AOut &&
               !isAlwaysLive(UserI)) {
             AB = APInt(BitWidth, 0);
           } else {
@@ -368,11 +410,13 @@ void DemandedBits::performAnalysis() {
 APInt DemandedBits::getDemandedBits(Instruction *I) {
   performAnalysis();
 
-  const DataLayout &DL = I->getModule()->getDataLayout();
   auto Found = AliveBits.find(I);
   if (Found != AliveBits.end())
     return Found->second;
-  return APInt::getAllOnesValue(DL.getTypeSizeInBits(I->getType()));
+
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  return APInt::getAllOnesValue(
+      DL.getTypeSizeInBits(I->getType()->getScalarType()));
 }
 
 bool DemandedBits::isInstructionDead(Instruction *I) {
diff --git a/lib/Analysis/DivergenceAnalysis.cpp b/lib/Analysis/DivergenceAnalysis.cpp
index de47445c5e01286c39f0c101e88a516bc201b109..7ba23854a3cc53efb3fdd68d7ca4ea78bf87a0fc 100644
--- a/lib/Analysis/DivergenceAnalysis.cpp
+++ b/lib/Analysis/DivergenceAnalysis.cpp
@@ -422,3 +422,36 @@ void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
       OS << "DIVERGENT:" << I << '\n';
   }
 }
+
+// class GPUDivergenceAnalysis
+GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
+                                             const DominatorTree &DT,
+                                             const PostDominatorTree &PDT,
+                                             const LoopInfo &LI,
+                                             const TargetTransformInfo &TTI)
+    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
+  for (auto &I : instructions(F)) {
+    if (TTI.isSourceOfDivergence(&I)) {
+      DA.markDivergent(I);
+    } else if (TTI.isAlwaysUniform(&I)) {
+      DA.addUniformOverride(I);
+    }
+  }
+  for (auto &Arg : F.args()) {
+    if (TTI.isSourceOfDivergence(&Arg)) {
+      DA.markDivergent(Arg);
+    }
+  }
+
+  DA.compute();
+}
+
+bool GPUDivergenceAnalysis::isDivergent(const Value &val) const {
+  return DA.isDivergent(val);
+}
+
+void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
+  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
+  DA.print(OS, mod);
+  OS << "}\n";
+}
diff --git a/lib/Analysis/IVDescriptors.cpp b/lib/Analysis/IVDescriptors.cpp
index 854a95573e902bb0643aa68bfa9970c1b911678e..aaebc4a481ec5db351a12432f313a44356774859 100644
--- a/lib/Analysis/IVDescriptors.cpp
+++ b/lib/Analysis/IVDescriptors.cpp
@@ -299,9 +299,17 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
         return false;
     }
 
+    bool IsASelect = isa<SelectInst>(Cur);
+
+    // A conditional reduction operation must only have 2 or less uses in
+    // VisitedInsts.
+    if (IsASelect && (Kind == RK_FloatAdd || Kind == RK_FloatMult) &&
+        hasMultipleUsesOf(Cur, VisitedInsts, 2))
+      return false;
+
     // A reduction operation must only have one use of the reduction value.
-    if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
-        hasMultipleUsesOf(Cur, VisitedInsts))
+    if (!IsAPhi && !IsASelect && Kind != RK_IntegerMinMax &&
+        Kind != RK_FloatMinMax && hasMultipleUsesOf(Cur, VisitedInsts, 1))
       return false;
 
     // All inputs to a PHI node must be a reduction value.
@@ -362,7 +370,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
       } else if (!isa<PHINode>(UI) &&
                  ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
                    !isa<SelectInst>(UI)) ||
-                  !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence()))
+                  (!isConditionalRdxPattern(Kind, UI).isRecurrence() &&
+                   !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence())))
         return false;
 
       // Remember that we completed the cycle.
@@ -491,6 +500,53 @@ RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev) {
   return InstDesc(false, I);
 }
 
+/// Returns true if the select instruction has users in the compare-and-add
+/// reduction pattern below. The select instruction argument is the last one
+/// in the sequence.
+///
+/// %sum.1 = phi ...
+/// ...
+/// %cmp = fcmp pred %0, %CFP
+/// %add = fadd %0, %sum.1
+/// %sum.2 = select %cmp, %add, %sum.1
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isConditionalRdxPattern(
+    RecurrenceKind Kind, Instruction *I) {
+  SelectInst *SI = dyn_cast<SelectInst>(I);
+  if (!SI)
+    return InstDesc(false, I);
+
+  CmpInst *CI = dyn_cast<CmpInst>(SI->getCondition());
+  // Only handle single use cases for now.
+  if (!CI || !CI->hasOneUse())
+    return InstDesc(false, I);
+
+  Value *TrueVal = SI->getTrueValue();
+  Value *FalseVal = SI->getFalseValue();
+  // Handle only when either of operands of select instruction is a PHI
+  // node for now.
+  if ((isa<PHINode>(*TrueVal) && isa<PHINode>(*FalseVal)) ||
+      (!isa<PHINode>(*TrueVal) && !isa<PHINode>(*FalseVal)))
+    return InstDesc(false, I);
+
+  Instruction *I1 =
+      isa<PHINode>(*TrueVal) ? dyn_cast<Instruction>(FalseVal)
+                             : dyn_cast<Instruction>(TrueVal);
+  if (!I1 || !I1->isBinaryOp())
+    return InstDesc(false, I);
+
+  Value *Op1, *Op2;
+  if ((m_FAdd(m_Value(Op1), m_Value(Op2)).match(I1)  ||
+       m_FSub(m_Value(Op1), m_Value(Op2)).match(I1)) &&
+      I1->isFast())
+    return InstDesc(Kind == RK_FloatAdd, SI);
+
+  if (m_FMul(m_Value(Op1), m_Value(Op2)).match(I1) && (I1->isFast()))
+    return InstDesc(Kind == RK_FloatMult, SI);
+
+  return InstDesc(false, I);
+}
+
 RecurrenceDescriptor::InstDesc
 RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
                                         InstDesc &Prev, bool HasFunNoNaNAttr) {
@@ -520,9 +576,12 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
   case Instruction::FSub:
   case Instruction::FAdd:
     return InstDesc(Kind == RK_FloatAdd, I, UAI);
+  case Instruction::Select:
+    if (Kind == RK_FloatAdd || Kind == RK_FloatMult)
+      return isConditionalRdxPattern(Kind, I);
+    LLVM_FALLTHROUGH;
   case Instruction::FCmp:
   case Instruction::ICmp:
-  case Instruction::Select:
     if (Kind != RK_IntegerMinMax &&
         (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
       return InstDesc(false, I);
@@ -531,13 +590,14 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
 }
 
 bool RecurrenceDescriptor::hasMultipleUsesOf(
-    Instruction *I, SmallPtrSetImpl<Instruction *> &Insts) {
+    Instruction *I, SmallPtrSetImpl<Instruction *> &Insts,
+    unsigned MaxNumUses) {
   unsigned NumUses = 0;
   for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E;
        ++Use) {
     if (Insts.count(dyn_cast<Instruction>(*Use)))
       ++NumUses;
-    if (NumUses > 1)
+    if (NumUses > MaxNumUses)
       return true;
   }
 
diff --git a/lib/Analysis/InstructionPrecedenceTracking.cpp b/lib/Analysis/InstructionPrecedenceTracking.cpp
index c667c28228183fcd3c58a828f40f8ce6cea70183..b98975b006a445bd1f0f255f1e560294b3e26fa1 100644
--- a/lib/Analysis/InstructionPrecedenceTracking.cpp
+++ b/lib/Analysis/InstructionPrecedenceTracking.cpp
@@ -142,3 +142,8 @@ bool ImplicitControlFlowTracking::isSpecialInstruction(
   }
   return true;
 }
+
+bool MemoryWriteTracking::isSpecialInstruction(
+    const Instruction *Insn) const {
+  return Insn->mayWriteToMemory();
+}
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index fd6f4ba476eedf329ab48c639309c2905e63c7c9..ccf907c144f0d1a7535e667a1f788a819b0122aa 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -2630,6 +2630,70 @@ static void setLimitsForBinOp(BinaryOperator &BO, APInt &Lower, APInt &Upper,
   }
 }
 
+/// Some intrinsics with a constant operand have an easy-to-compute range of
+/// outputs. This can be used to fold a comparison to always true or always
+/// false.
+static void setLimitsForIntrinsic(IntrinsicInst &II, APInt &Lower,
+                                  APInt &Upper) {
+  unsigned Width = Lower.getBitWidth();
+  const APInt *C;
+  switch (II.getIntrinsicID()) {
+  case Intrinsic::uadd_sat:
+    // uadd.sat(x, C) produces [C, UINT_MAX].
+    if (match(II.getOperand(0), m_APInt(C)) ||
+        match(II.getOperand(1), m_APInt(C)))
+      Lower = *C;
+    break;
+  case Intrinsic::sadd_sat:
+    if (match(II.getOperand(0), m_APInt(C)) ||
+        match(II.getOperand(1), m_APInt(C))) {
+      if (C->isNegative()) {
+        // sadd.sat(x, -C) produces [SINT_MIN, SINT_MAX + (-C)].
+        Lower = APInt::getSignedMinValue(Width);
+        Upper = APInt::getSignedMaxValue(Width) + *C + 1;
+      } else {
+        // sadd.sat(x, +C) produces [SINT_MIN + C, SINT_MAX].
+        Lower = APInt::getSignedMinValue(Width) + *C;
+        Upper = APInt::getSignedMaxValue(Width) + 1;
+      }
+    }
+    break;
+  case Intrinsic::usub_sat:
+    // usub.sat(C, x) produces [0, C].
+    if (match(II.getOperand(0), m_APInt(C)))
+      Upper = *C + 1;
+    // usub.sat(x, C) produces [0, UINT_MAX - C].
+    else if (match(II.getOperand(1), m_APInt(C)))
+      Upper = APInt::getMaxValue(Width) - *C + 1;
+    break;
+  case Intrinsic::ssub_sat:
+    if (match(II.getOperand(0), m_APInt(C))) {
+      if (C->isNegative()) {
+        // ssub.sat(-C, x) produces [SINT_MIN, -SINT_MIN + (-C)].
+        Lower = APInt::getSignedMinValue(Width);
+        Upper = *C - APInt::getSignedMinValue(Width) + 1;
+      } else {
+        // ssub.sat(+C, x) produces [-SINT_MAX + C, SINT_MAX].
+        Lower = *C - APInt::getSignedMaxValue(Width);
+        Upper = APInt::getSignedMaxValue(Width) + 1;
+      }
+    } else if (match(II.getOperand(1), m_APInt(C))) {
+      if (C->isNegative()) {
+        // ssub.sat(x, -C) produces [SINT_MIN - (-C), SINT_MAX]:
+        Lower = APInt::getSignedMinValue(Width) - *C;
+        Upper = APInt::getSignedMaxValue(Width) + 1;
+      } else {
+        // ssub.sat(x, +C) produces [SINT_MIN, SINT_MAX - C].
+        Lower = APInt::getSignedMinValue(Width);
+        Upper = APInt::getSignedMaxValue(Width) - *C + 1;
+      }
+    }
+    break;
+  default:
+    break;
+  }
+}
+
 static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
                                        Value *RHS, const InstrInfoQuery &IIQ) {
   Type *ITy = GetCompareTy(RHS); // The return type.
@@ -2663,6 +2727,8 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
   APInt Upper = APInt(Width, 0);
   if (auto *BO = dyn_cast<BinaryOperator>(LHS))
     setLimitsForBinOp(*BO, Lower, Upper, IIQ);
+  else if (auto *II = dyn_cast<IntrinsicInst>(LHS))
+    setLimitsForIntrinsic(*II, Lower, Upper);
 
   ConstantRange LHS_CR =
       Lower != Upper ? ConstantRange(Lower, Upper) : ConstantRange(Width, true);
@@ -3837,6 +3903,28 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
       if (Value *V = simplifySelectBitTest(TrueVal, FalseVal, X, Y,
                                            Pred == ICmpInst::ICMP_EQ))
         return V;
+
+    // Test for zero-shift-guard-ops around funnel shifts. These are used to
+    // avoid UB from oversized shifts in raw IR rotate patterns, but the
+    // intrinsics do not have that problem.
+    Value *ShAmt;
+    auto isFsh = m_CombineOr(m_Intrinsic<Intrinsic::fshl>(m_Value(X), m_Value(),
+                                                          m_Value(ShAmt)),
+                             m_Intrinsic<Intrinsic::fshr>(m_Value(), m_Value(X),
+                                                          m_Value(ShAmt)));
+    // (ShAmt != 0) ? fshl(X, *, ShAmt) : X --> fshl(X, *, ShAmt)
+    // (ShAmt != 0) ? fshr(*, X, ShAmt) : X --> fshr(*, X, ShAmt)
+    // (ShAmt == 0) ? fshl(X, *, ShAmt) : X --> X
+    // (ShAmt == 0) ? fshr(*, X, ShAmt) : X --> X
+    if (match(TrueVal, isFsh) && FalseVal == X && CmpLHS == ShAmt)
+      return Pred == ICmpInst::ICMP_NE ? TrueVal : X;
+
+    // (ShAmt == 0) ? X : fshl(X, *, ShAmt) --> fshl(X, *, ShAmt)
+    // (ShAmt == 0) ? X : fshr(*, X, ShAmt) --> fshr(*, X, ShAmt)
+    // (ShAmt != 0) ? X : fshl(X, *, ShAmt) --> X
+    // (ShAmt != 0) ? X : fshr(*, X, ShAmt) --> X
+    if (match(FalseVal, isFsh) && TrueVal == X && CmpLHS == ShAmt)
+      return Pred == ICmpInst::ICMP_EQ ? FalseVal : X;
   }
 
   // Check for other compares that behave like bit test.
@@ -3944,6 +4032,10 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
   if (Value *V = foldSelectWithBinaryOp(Cond, TrueVal, FalseVal))
     return V;
 
+  Optional<bool> Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL);
+  if (Imp)
+    return *Imp ? TrueVal : FalseVal;
+
   return nullptr;
 }
 
@@ -4890,6 +4982,40 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
     if (match(Op0, m_Undef()) || match(Op1, m_Undef()))
       return Constant::getNullValue(ReturnType);
     break;
+  case Intrinsic::uadd_sat:
+    // sat(MAX + X) -> MAX
+    // sat(X + MAX) -> MAX
+    if (match(Op0, m_AllOnes()) || match(Op1, m_AllOnes()))
+      return Constant::getAllOnesValue(ReturnType);
+    LLVM_FALLTHROUGH;
+  case Intrinsic::sadd_sat:
+    // sat(X + undef) -> -1
+    // sat(undef + X) -> -1
+    // For unsigned: Assume undef is MAX, thus we saturate to MAX (-1).
+    // For signed: Assume undef is ~X, in which case X + ~X = -1.
+    if (match(Op0, m_Undef()) || match(Op1, m_Undef()))
+      return Constant::getAllOnesValue(ReturnType);
+
+    // X + 0 -> X
+    if (match(Op1, m_Zero()))
+      return Op0;
+    // 0 + X -> X
+    if (match(Op0, m_Zero()))
+      return Op1;
+    break;
+  case Intrinsic::usub_sat:
+    // sat(0 - X) -> 0, sat(X - MAX) -> 0
+    if (match(Op0, m_Zero()) || match(Op1, m_AllOnes()))
+      return Constant::getNullValue(ReturnType);
+    LLVM_FALLTHROUGH;
+  case Intrinsic::ssub_sat:
+    // X - X -> 0, X - undef -> 0, undef - X -> 0
+    if (Op0 == Op1 || match(Op0, m_Undef()) || match(Op1, m_Undef()))
+      return Constant::getNullValue(ReturnType);
+    // X - 0 -> X
+    if (match(Op1, m_Zero()))
+      return Op0;
+    break;
   case Intrinsic::load_relative:
     if (auto *C0 = dyn_cast<Constant>(Op0))
       if (auto *C1 = dyn_cast<Constant>(Op1))
@@ -4986,7 +5112,16 @@ static Value *simplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
   }
   case Intrinsic::fshl:
   case Intrinsic::fshr: {
-    Value *ShAmtArg = ArgBegin[2];
+    Value *Op0 = ArgBegin[0], *Op1 = ArgBegin[1], *ShAmtArg = ArgBegin[2];
+
+    // If both operands are undef, the result is undef.
+    if (match(Op0, m_Undef()) && match(Op1, m_Undef()))
+      return UndefValue::get(F->getReturnType());
+
+    // If shift amount is undef, assume it is zero.
+    if (match(ShAmtArg, m_Undef()))
+      return ArgBegin[IID == Intrinsic::fshl ? 0 : 1];
+
     const APInt *ShAmtC;
     if (match(ShAmtArg, m_APInt(ShAmtC))) {
       // If there's effectively no shift, return the 1st arg or 2nd arg.
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index ee0148e0d7956213f4a2d201edb1a08220eb62a5..110c085d3f353224dac76da6543b969797a6b2f5 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
@@ -420,6 +421,8 @@ namespace {
                               BasicBlock *BB);
   bool solveBlockValueSelect(ValueLatticeElement &BBLV, SelectInst *S,
                              BasicBlock *BB);
+  Optional<ConstantRange> getRangeForOperand(unsigned Op, Instruction *I,
+                                             BasicBlock *BB);
   bool solveBlockValueBinaryOp(ValueLatticeElement &BBLV, BinaryOperator *BBI,
                                BasicBlock *BB);
   bool solveBlockValueCast(ValueLatticeElement &BBLV, CastInst *CI,
@@ -634,8 +637,7 @@ bool LazyValueInfoImpl::solveBlockValueImpl(ValueLatticeElement &Res,
     if (auto *CI = dyn_cast<CastInst>(BBI))
       return solveBlockValueCast(Res, CI, BB);
 
-    BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI);
-    if (BO && isa<ConstantInt>(BO->getOperand(1)))
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI))
       return solveBlockValueBinaryOp(Res, BO, BB);
   }
 
@@ -951,6 +953,25 @@ bool LazyValueInfoImpl::solveBlockValueSelect(ValueLatticeElement &BBLV,
   return true;
 }
 
+Optional<ConstantRange> LazyValueInfoImpl::getRangeForOperand(unsigned Op,
+                                                              Instruction *I,
+                                                              BasicBlock *BB) {
+  if (!hasBlockValue(I->getOperand(Op), BB))
+    if (pushBlockValue(std::make_pair(BB, I->getOperand(Op))))
+      return None;
+
+  const unsigned OperandBitWidth =
+    DL.getTypeSizeInBits(I->getOperand(Op)->getType());
+  ConstantRange Range = ConstantRange(OperandBitWidth);
+  if (hasBlockValue(I->getOperand(Op), BB)) {
+    ValueLatticeElement Val = getBlockValue(I->getOperand(Op), BB);
+    intersectAssumeOrGuardBlockValueConstantRange(I->getOperand(Op), Val, I);
+    if (Val.isConstantRange())
+      Range = Val.getConstantRange();
+  }
+  return Range;
+}
+
 bool LazyValueInfoImpl::solveBlockValueCast(ValueLatticeElement &BBLV,
                                             CastInst *CI,
                                             BasicBlock *BB) {
@@ -981,21 +1002,11 @@ bool LazyValueInfoImpl::solveBlockValueCast(ValueLatticeElement &BBLV,
   // Figure out the range of the LHS.  If that fails, we still apply the
   // transfer rule on the full set since we may be able to locally infer
   // interesting facts.
-  if (!hasBlockValue(CI->getOperand(0), BB))
-    if (pushBlockValue(std::make_pair(BB, CI->getOperand(0))))
-      // More work to do before applying this transfer rule.
-      return false;
-
-  const unsigned OperandBitWidth =
-    DL.getTypeSizeInBits(CI->getOperand(0)->getType());
-  ConstantRange LHSRange = ConstantRange(OperandBitWidth);
-  if (hasBlockValue(CI->getOperand(0), BB)) {
-    ValueLatticeElement LHSVal = getBlockValue(CI->getOperand(0), BB);
-    intersectAssumeOrGuardBlockValueConstantRange(CI->getOperand(0), LHSVal,
-                                                  CI);
-    if (LHSVal.isConstantRange())
-      LHSRange = LHSVal.getConstantRange();
-  }
+  Optional<ConstantRange> LHSRes = getRangeForOperand(0, CI, BB);
+  if (!LHSRes.hasValue())
+    // More work to do before applying this transfer rule.
+    return false;
+  ConstantRange LHSRange = LHSRes.getValue();
 
   const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth();
 
@@ -1037,27 +1048,19 @@ bool LazyValueInfoImpl::solveBlockValueBinaryOp(ValueLatticeElement &BBLV,
     return true;
   };
 
-  // Figure out the range of the LHS.  If that fails, use a conservative range,
-  // but apply the transfer rule anyways.  This lets us pick up facts from
-  // expressions like "and i32 (call i32 @foo()), 32"
-  if (!hasBlockValue(BO->getOperand(0), BB))
-    if (pushBlockValue(std::make_pair(BB, BO->getOperand(0))))
-      // More work to do before applying this transfer rule.
-      return false;
+  // Figure out the ranges of the operands.  If that fails, use a
+  // conservative range, but apply the transfer rule anyways.  This
+  // lets us pick up facts from expressions like "and i32 (call i32
+  // @foo()), 32"
+  Optional<ConstantRange> LHSRes = getRangeForOperand(0, BO, BB);
+  Optional<ConstantRange> RHSRes = getRangeForOperand(1, BO, BB);
 
-  const unsigned OperandBitWidth =
-    DL.getTypeSizeInBits(BO->getOperand(0)->getType());
-  ConstantRange LHSRange = ConstantRange(OperandBitWidth);
-  if (hasBlockValue(BO->getOperand(0), BB)) {
-    ValueLatticeElement LHSVal = getBlockValue(BO->getOperand(0), BB);
-    intersectAssumeOrGuardBlockValueConstantRange(BO->getOperand(0), LHSVal,
-                                                  BO);
-    if (LHSVal.isConstantRange())
-      LHSRange = LHSVal.getConstantRange();
-  }
+  if (!LHSRes.hasValue() || !RHSRes.hasValue())
+    // More work to do before applying this transfer rule.
+    return false;
 
-  ConstantInt *RHS = cast<ConstantInt>(BO->getOperand(1));
-  ConstantRange RHSRange = ConstantRange(RHS->getValue());
+  ConstantRange LHSRange = LHSRes.getValue();
+  ConstantRange RHSRange = RHSRes.getValue();
 
   // NOTE: We're currently limited by the set of operations that ConstantRange
   // can evaluate symbolically.  Enhancing that set will allows us to analyze
diff --git a/lib/Analysis/LegacyDivergenceAnalysis.cpp b/lib/Analysis/LegacyDivergenceAnalysis.cpp
index 2089d1c53d0de298274ece2ac3549bf465ced6a9..5540859ebddae4e6395aabde4458b58874d920fa 100644
--- a/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ b/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -1,4 +1,5 @@
-//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==//
+//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis
+//Implementation -==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -64,6 +65,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -79,6 +83,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "divergence"
 
+// transparently use the GPUDivergenceAnalysis
+static cl::opt<bool> UseGPUDA("use-gpu-divergence-analysis", cl::init(false),
+                              cl::Hidden,
+                              cl::desc("turn the LegacyDivergenceAnalysis into "
+                                       "a wrapper for GPUDivergenceAnalysis"));
+
 namespace {
 
 class DivergencePropagator {
@@ -262,16 +272,17 @@ void DivergencePropagator::propagate() {
   }
 }
 
-} /// end namespace anonymous
+} // namespace
 
 // Register this pass.
 char LegacyDivergenceAnalysis::ID = 0;
-INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis",
-                      false, true)
+INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence",
+                      "Legacy Divergence Analysis", false, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis",
-                    false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence",
+                    "Legacy Divergence Analysis", false, true)
 
 FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
   return new LegacyDivergenceAnalysis();
@@ -280,9 +291,24 @@ FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
 void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addRequired<PostDominatorTreeWrapperPass>();
+  if (UseGPUDA)
+    AU.addRequired<LoopInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
+bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis(
+    const Function &F) const {
+  if (!UseGPUDA)
+    return false;
+
+  // GPUDivergenceAnalysis requires a reducible CFG.
+  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  using RPOTraversal = ReversePostOrderTraversal<const Function *>;
+  RPOTraversal FuncRPOT(&F);
+  return !containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
+                                 const LoopInfo>(FuncRPOT, LI);
+}
+
 bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
   auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
   if (TTIWP == nullptr)
@@ -295,36 +321,61 @@ bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
     return false;
 
   DivergentValues.clear();
+  gpuDA = nullptr;
+
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-  DivergencePropagator DP(F, TTI,
-                          getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-                          PDT, DivergentValues);
-  DP.populateWithSourcesOfDivergence();
-  DP.propagate();
-  LLVM_DEBUG(
-    dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
-    print(dbgs(), F.getParent())
-  );
+
+  if (shouldUseGPUDivergenceAnalysis(F)) {
+    // run the new GPU divergence analysis
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    gpuDA = llvm::make_unique<GPUDivergenceAnalysis>(F, DT, PDT, LI, TTI);
+
+  } else {
+    // run LLVM's existing DivergenceAnalysis
+    DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues);
+    DP.populateWithSourcesOfDivergence();
+    DP.propagate();
+  }
+
+  LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName()
+                    << ":\n";
+             print(dbgs(), F.getParent()));
+
   return false;
 }
 
+bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const {
+  if (gpuDA) {
+    return gpuDA->isDivergent(*V);
+  }
+  return DivergentValues.count(V);
+}
+
 void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
-  if (DivergentValues.empty())
+  if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty())
     return;
-  const Value *FirstDivergentValue = *DivergentValues.begin();
-  const Function *F;
-  if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
-    F = Arg->getParent();
-  } else if (const Instruction *I =
-                 dyn_cast<Instruction>(FirstDivergentValue)) {
-    F = I->getParent()->getParent();
-  } else {
-    llvm_unreachable("Only arguments and instructions can be divergent");
+
+  const Function *F = nullptr;
+  if (!DivergentValues.empty()) {
+    const Value *FirstDivergentValue = *DivergentValues.begin();
+    if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
+      F = Arg->getParent();
+    } else if (const Instruction *I =
+                   dyn_cast<Instruction>(FirstDivergentValue)) {
+      F = I->getParent()->getParent();
+    } else {
+      llvm_unreachable("Only arguments and instructions can be divergent");
+    }
+  } else if (gpuDA) {
+    F = &gpuDA->getFunction();
   }
+  if (!F)
+    return;
 
   // Dumps all divergent values in F, arguments and then instructions.
   for (auto &Arg : F->args()) {
-    OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : "           ");
+    OS << (isDivergent(&Arg) ? "DIVERGENT: " : "           ");
     OS << Arg << "\n";
   }
   // Iterate instructions using instructions() to ensure a deterministic order.
@@ -332,7 +383,7 @@ void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
     auto &BB = *BI;
     OS << "\n           " << BB.getName() << ":\n";
     for (auto &I : BB.instructionsWithoutDebug()) {
-      OS << (DivergentValues.count(&I) ? "DIVERGENT:     " : "               ");
+      OS << (isDivergent(&I) ? "DIVERGENT:     " : "               ");
       OS << I << "\n";
     }
   }
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 4b8e8afdabbd437c64d058c08ac0ef8f4de5c62a..245f318e308a3658ae9ec6077a06a29b4be2c64e 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -420,7 +420,7 @@ void RuntimePointerChecking::groupChecks(
 
     // We've computed the grouped checks for this partition.
     // Save the results and continue with the next one.
-    std::copy(Groups.begin(), Groups.end(), std::back_inserter(CheckingGroups));
+    llvm::copy(Groups, std::back_inserter(CheckingGroups));
   }
 }
 
@@ -1221,18 +1221,19 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
   return X == PtrSCEVB;
 }
 
-bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
+MemoryDepChecker::VectorizationSafetyStatus
+MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
   switch (Type) {
   case NoDep:
   case Forward:
   case BackwardVectorizable:
-    return true;
+    return VectorizationSafetyStatus::Safe;
 
   case Unknown:
   case ForwardButPreventsForwarding:
   case Backward:
   case BackwardVectorizableButPreventsForwarding:
-    return false;
+    return VectorizationSafetyStatus::Unsafe;
   }
   llvm_unreachable("unexpected DepType!");
 }
@@ -1317,6 +1318,11 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
   return false;
 }
 
+void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
+  if (Status < S)
+    Status = S;
+}
+
 /// Given a non-constant (unknown) dependence-distance \p Dist between two
 /// memory accesses, that have the same stride whose absolute value is given
 /// in \p Stride, and that have the same type size \p TypeByteSize,
@@ -1652,7 +1658,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
 
             Dependence::DepType Type =
                 isDependent(*A.first, A.second, *B.first, B.second, Strides);
-            SafeForVectorization &= Dependence::isSafeForVectorization(Type);
+            mergeInStatus(Dependence::isSafeForVectorization(Type));
 
             // Gather dependences unless we accumulated MaxDependences
             // dependences.  In that case return as soon as we find the first
@@ -1669,7 +1675,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
                            << "Too many dependences, stopped recording\n");
               }
             }
-            if (!RecordDependences && !SafeForVectorization)
+            if (!RecordDependences && !isSafeForVectorization())
               return false;
           }
         ++OI;
@@ -1679,7 +1685,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
   }
 
   LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
-  return SafeForVectorization;
+  return isSafeForVectorization();
 }
 
 SmallVector<Instruction *, 4>
@@ -1870,7 +1876,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
     Value *Ptr = ST->getPointerOperand();
 
     if (isUniform(Ptr))
-      HasMultipleStoresToLoopInvariantAddress |=
+      HasDependenceInvolvingLoopInvariantAddress |=
           !UniformStores.insert(Ptr).second;
 
     // If we did *not* see this pointer before, insert it to  the read-write
@@ -1914,6 +1920,14 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
       IsReadOnlyPtr = true;
     }
 
+    // See if there is an unsafe dependency between a load to a uniform address and
+    // store to the same uniform address.
+    if (UniformStores.count(Ptr)) {
+      LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
+                           "load and uniform store to the same address!\n");
+      HasDependenceInvolvingLoopInvariantAddress = true;
+    }
+
     MemoryLocation Loc = MemoryLocation::get(LD);
     // The TBAA metadata could have a control dependency on the predication
     // condition, so we cannot rely on it when determining whether or not we
@@ -2272,7 +2286,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      HasMultipleStoresToLoopInvariantAddress(false) {
+      HasDependenceInvolvingLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
@@ -2304,8 +2318,8 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Multiple stores to invariant address were "
-                   << (HasMultipleStoresToLoopInvariantAddress ? "" : "not ")
+  OS.indent(Depth) << "Non vectorizable stores to invariant address were "
+                   << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
                    << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 4b174b66d1e12aa55122d17bfdae7286bcb1444b..6c779bf2cca03bb5b694ae8798a87915f10ef059 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -237,23 +237,19 @@ MDNode *Loop::getLoopID() const {
 }
 
 void Loop::setLoopID(MDNode *LoopID) const {
-  assert(LoopID && "Loop ID should not be null");
-  assert(LoopID->getNumOperands() > 0 && "Loop ID needs at least one operand");
-  assert(LoopID->getOperand(0) == LoopID && "Loop ID should refer to itself");
+  assert((!LoopID || LoopID->getNumOperands() > 0) &&
+         "Loop ID needs at least one operand");
+  assert((!LoopID || LoopID->getOperand(0) == LoopID) &&
+         "Loop ID should refer to itself");
 
-  if (BasicBlock *Latch = getLoopLatch()) {
-    Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
-    return;
-  }
-
-  assert(!getLoopLatch() &&
-         "The loop should have no single latch at this point");
   BasicBlock *H = getHeader();
   for (BasicBlock *BB : this->blocks()) {
     Instruction *TI = BB->getTerminator();
     for (BasicBlock *Successor : successors(TI)) {
-      if (Successor == H)
+      if (Successor == H) {
         TI->setMetadata(LLVMContext::MD_loop, LoopID);
+        break;
+      }
     }
   }
 }
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 4c1bd7ab08e675b5917b43c32e351f7bbc10b1db..a68f114b83a0dbebcc13f048c9c6248c2e1fbdb2 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -216,10 +216,12 @@ bool LPPassManager::runOnFunction(Function &F) {
 
       initializeAnalysisImpl(P);
 
+      bool LocalChanged = false;
       {
         PassManagerPrettyStackEntry X(P, *CurrentLoop->getHeader());
         TimeRegion PassTimer(getPassTimer(P));
-        Changed |= P->runOnLoop(CurrentLoop, *this);
+        LocalChanged = P->runOnLoop(CurrentLoop, *this);
+        Changed |= LocalChanged;
         if (EmitICRemark) {
           unsigned NewSize = F.getInstructionCount();
           // Update the size of the function, emit a remark, and update the
@@ -235,7 +237,7 @@ bool LPPassManager::runOnFunction(Function &F) {
         }
       }
 
-      if (Changed)
+      if (LocalChanged)
         dumpPassInfo(P, MODIFICATION_MSG, ON_LOOP_MSG,
                      CurrentLoopDeleted ? "<deleted loop>"
                                         : CurrentLoop->getName());
diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp
index 1e45cc4bdf8ef3a93d50321e9ff00f04bbfaea65..3d98fca823ad492a184bab5950adf21e3025e7ba 100644
--- a/lib/Analysis/MemorySSA.cpp
+++ b/lib/Analysis/MemorySSA.cpp
@@ -2199,13 +2199,14 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
     return StartingAccess->getOptimized();
 
   const Instruction *I = StartingAccess->getMemoryInst();
-  UpwardsMemoryQuery Q(I, StartingAccess);
   // We can't sanely do anything with a fence, since they conservatively clobber
   // all memory, and have no locations to get pointers from to try to
   // disambiguate.
-  if (!Q.IsCall && I->isFenceLike())
+  if (!ImmutableCallSite(I) && I->isFenceLike())
     return StartingAccess;
 
+  UpwardsMemoryQuery Q(I, StartingAccess);
+
   if (isUseTriviallyOptimizableToLiveOnEntry(*MSSA->AA, I)) {
     MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef();
     StartingAccess->setOptimized(LiveOnEntry);
diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp
index 880dc2f278522688bc4e2f310685933b9b514c0d..6c817d20368456b1196c47d9485845154d62c61a 100644
--- a/lib/Analysis/MemorySSAUpdater.cpp
+++ b/lib/Analysis/MemorySSAUpdater.cpp
@@ -93,7 +93,7 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(
         // FIXME: Figure out whether this is dead code and if so remove it.
         if (!std::equal(Phi->op_begin(), Phi->op_end(), PhiOps.begin())) {
           // These will have been filled in by the recursive read we did above.
-          std::copy(PhiOps.begin(), PhiOps.end(), Phi->op_begin());
+          llvm::copy(PhiOps, Phi->op_begin());
           std::copy(pred_begin(BB), pred_end(BB), Phi->block_begin());
         }
       } else {
@@ -1022,7 +1022,7 @@ void MemorySSAUpdater::wireOldPredecessorsToNewImmediatePredecessor(
   MemoryPhi *Phi = MSSA->getMemoryAccess(Old);
   if (!Phi)
     return;
-  if (pred_size(Old) == 1) {
+  if (Old->hasNPredecessors(1)) {
     assert(pred_size(New) == Preds.size() &&
            "Should have moved all predecessors.");
     MSSA->moveTo(Phi, New, MemorySSA::Beginning);
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index 29b96ac746b2007fabfbeee7824439a39b7d1e0d..6bda1d1b1a36016ed675d04bdbb745ab53abdbd4 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -220,10 +220,19 @@ static void addIntrinsicToSummary(
   }
 }
 
-static void computeFunctionSummary(
-    ModuleSummaryIndex &Index, const Module &M, const Function &F,
-    BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, DominatorTree &DT,
-    bool HasLocalsInUsedOrAsm, DenseSet<GlobalValue::GUID> &CantBePromoted) {
+static bool isNonVolatileLoad(const Instruction *I) {
+  if (const auto *LI = dyn_cast<LoadInst>(I))
+    return !LI->isVolatile();
+
+  return false;
+}
+
+static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
+                                   const Function &F, BlockFrequencyInfo *BFI,
+                                   ProfileSummaryInfo *PSI, DominatorTree &DT,
+                                   bool HasLocalsInUsedOrAsm,
+                                   DenseSet<GlobalValue::GUID> &CantBePromoted,
+                                   bool IsThinLTO) {
   // Summary not currently supported for anonymous functions, they should
   // have been named.
   assert(F.hasName());
@@ -244,6 +253,7 @@ static void computeFunctionSummary(
   // Add personality function, prefix data and prologue data to function's ref
   // list.
   findRefEdges(Index, &F, RefEdges, Visited);
+  std::vector<const Instruction *> NonVolatileLoads;
 
   bool HasInlineAsmMaybeReferencingInternal = false;
   for (const BasicBlock &BB : F)
@@ -251,6 +261,13 @@ static void computeFunctionSummary(
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       ++NumInsts;
+      if (isNonVolatileLoad(&I)) {
+        // Postpone processing of non-volatile load instructions
+        // See comments below
+        Visited.insert(&I);
+        NonVolatileLoads.push_back(&I);
+        continue;
+      }
       findRefEdges(Index, &I, RefEdges, Visited);
       auto CS = ImmutableCallSite(&I);
       if (!CS)
@@ -340,6 +357,24 @@ static void computeFunctionSummary(
       }
     }
 
+  // By now we processed all instructions in a function, except
+  // non-volatile loads. All new refs we add in a loop below
+  // are obviously constant. All constant refs are grouped in the
+  // end of RefEdges vector, so we can use a single integer value
+  // to identify them.
+  unsigned RefCnt = RefEdges.size();
+  for (const Instruction *I : NonVolatileLoads) {
+    Visited.erase(I);
+    findRefEdges(Index, I, RefEdges, Visited);
+  }
+  std::vector<ValueInfo> Refs = RefEdges.takeVector();
+  // Regular LTO module doesn't participate in ThinLTO import,
+  // so no reference from it can be readonly, since this would
+  // require importing variable as local copy
+  if (IsThinLTO)
+    for (; RefCnt < Refs.size(); ++RefCnt)
+      Refs[RefCnt].setReadOnly();
+
   // Explicit add hot edges to enforce importing for designated GUIDs for
   // sample PGO, to enable the same inlines as the profiled optimized binary.
   for (auto &I : F.getImportGUIDs())
@@ -357,13 +392,11 @@ static void computeFunctionSummary(
       F.hasFnAttribute(Attribute::ReadNone),
       F.hasFnAttribute(Attribute::ReadOnly),
       F.hasFnAttribute(Attribute::NoRecurse), F.returnDoesNotAlias(),
-      // Inliner doesn't handle variadic functions.
       // FIXME: refactor this to use the same code that inliner is using.
-      F.isVarArg() ||
-          // Don't try to import functions with noinline attribute.
-          F.getAttributes().hasFnAttribute(Attribute::NoInline)};
+      // Don't try to import functions with noinline attribute.
+      F.getAttributes().hasFnAttribute(Attribute::NoInline)};
   auto FuncSummary = llvm::make_unique<FunctionSummary>(
-      Flags, NumInsts, FunFlags, RefEdges.takeVector(),
+      Flags, NumInsts, FunFlags, /*EntryCount=*/0, std::move(Refs),
       CallGraphEdges.takeVector(), TypeTests.takeVector(),
       TypeTestAssumeVCalls.takeVector(), TypeCheckedLoadVCalls.takeVector(),
       TypeTestAssumeConstVCalls.takeVector(),
@@ -382,8 +415,13 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
   bool NonRenamableLocal = isNonRenamableLocal(V);
   GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal,
                                     /* Live = */ false, V.isDSOLocal());
-  auto GVarSummary =
-      llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
+
+  // Don't mark variables we won't be able to internalize as read-only.
+  GlobalVarSummary::GVarFlags VarFlags(
+      !V.hasComdat() && !V.hasAppendingLinkage() && !V.isInterposable() &&
+      !V.hasAvailableExternallyLinkage() && !V.hasDLLExportStorageClass());
+  auto GVarSummary = llvm::make_unique<GlobalVarSummary>(Flags, VarFlags,
+                                                         RefEdges.takeVector());
   if (NonRenamableLocal)
     CantBePromoted.insert(V.getGUID());
   if (HasBlockAddress)
@@ -471,14 +509,15 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
           if (Function *F = dyn_cast<Function>(GV)) {
             std::unique_ptr<FunctionSummary> Summary =
                 llvm::make_unique<FunctionSummary>(
-                    GVFlags, 0,
+                    GVFlags, /*InstCount=*/0,
                     FunctionSummary::FFlags{
                         F->hasFnAttribute(Attribute::ReadNone),
                         F->hasFnAttribute(Attribute::ReadOnly),
                         F->hasFnAttribute(Attribute::NoRecurse),
                         F->returnDoesNotAlias(),
                         /* NoInline = */ false},
-                    ArrayRef<ValueInfo>{}, ArrayRef<FunctionSummary::EdgeTy>{},
+                    /*EntryCount=*/0, ArrayRef<ValueInfo>{},
+                    ArrayRef<FunctionSummary::EdgeTy>{},
                     ArrayRef<GlobalValue::GUID>{},
                     ArrayRef<FunctionSummary::VFuncId>{},
                     ArrayRef<FunctionSummary::VFuncId>{},
@@ -487,13 +526,19 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
             Index.addGlobalValueSummary(*GV, std::move(Summary));
           } else {
             std::unique_ptr<GlobalVarSummary> Summary =
-                llvm::make_unique<GlobalVarSummary>(GVFlags,
-                                                    ArrayRef<ValueInfo>{});
+                llvm::make_unique<GlobalVarSummary>(
+                    GVFlags, GlobalVarSummary::GVarFlags(),
+                    ArrayRef<ValueInfo>{});
             Index.addGlobalValueSummary(*GV, std::move(Summary));
           }
         });
   }
 
+  bool IsThinLTO = true;
+  if (auto *MD =
+          mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("ThinLTO")))
+    IsThinLTO = MD->getZExtValue();
+
   // Compute summaries for all functions defined in module, and save in the
   // index.
   for (auto &F : M) {
@@ -514,7 +559,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
 
     computeFunctionSummary(Index, M, F, BFI, PSI, DT,
                            !LocalsUsed.empty() || HasLocalInlineAsmSymbol,
-                           CantBePromoted);
+                           CantBePromoted, IsThinLTO);
   }
 
   // Compute summaries for all variables defined in module, and save in the
@@ -545,11 +590,6 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
   setLiveRoot(Index, "llvm.global_dtors");
   setLiveRoot(Index, "llvm.global.annotations");
 
-  bool IsThinLTO = true;
-  if (auto *MD =
-          mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("ThinLTO")))
-    IsThinLTO = MD->getZExtValue();
-
   for (auto &GlobalList : Index) {
     // Ignore entries for references that are undefined in the current module.
     if (GlobalList.second.SummaryList.empty())
@@ -619,7 +659,7 @@ ModuleSummaryIndexWrapperPass::ModuleSummaryIndexWrapperPass()
 }
 
 bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) {
-  auto &PSI = *getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   Index.emplace(buildModuleSummaryIndex(
       M,
       [this](const Function &F) {
@@ -627,7 +667,7 @@ bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) {
                          *const_cast<Function *>(&F))
                      .getBFI());
       },
-      &PSI));
+      PSI));
   return false;
 }
 
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 23e012626e244df2957531a18510132acc6c4bce..281b8f5ab987e803f8bd615c86ab3436c0838793 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -72,6 +72,7 @@ bool ICFLoopSafetyInfo::anyBlockMayThrow() const {
 void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
   assert(CurLoop != nullptr && "CurLoop can't be null");
   ICF.clear();
+  MW.clear();
   MayThrow = false;
   // Figure out the fact that at least one block may throw.
   for (auto &BB : CurLoop->blocks())
@@ -84,12 +85,14 @@ void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
 
 void ICFLoopSafetyInfo::insertInstructionTo(const BasicBlock *BB) {
   ICF.invalidateBlock(BB);
+  MW.invalidateBlock(BB);
 }
 
 void ICFLoopSafetyInfo::removeInstruction(const Instruction *Inst) {
   // TODO: So far we just conservatively drop cache, but maybe we can not do it
   // when Inst is not an ICF instruction. Follow-up on that.
   ICF.invalidateBlock(Inst->getParent());
+  MW.invalidateBlock(Inst->getParent());
 }
 
 void LoopSafetyInfo::computeBlockColors(const Loop *CurLoop) {
@@ -256,6 +259,34 @@ bool ICFLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
          allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT);
 }
 
+bool ICFLoopSafetyInfo::doesNotWriteMemoryBefore(const BasicBlock *BB,
+                                                 const Loop *CurLoop) const {
+  assert(CurLoop->contains(BB) && "Should only be called for loop blocks!");
+
+  // Fast path: there are no instructions before header.
+  if (BB == CurLoop->getHeader())
+    return true;
+
+  // Collect all transitive predecessors of BB in the same loop. This set will
+  // be a subset of the blocks within the loop.
+  SmallPtrSet<const BasicBlock *, 4> Predecessors;
+  collectTransitivePredecessors(CurLoop, BB, Predecessors);
+  // Find if there any instruction in either predecessor that could write
+  // to memory.
+  for (auto *Pred : Predecessors)
+    if (MW.mayWriteToMemory(Pred))
+      return false;
+  return true;
+}
+
+bool ICFLoopSafetyInfo::doesNotWriteMemoryBefore(const Instruction &I,
+                                                 const Loop *CurLoop) const {
+  auto *BB = I.getParent();
+  assert(CurLoop->contains(BB) && "Should only be called for loop blocks!");
+  return !MW.isDominatedByMemoryWriteFromSameBlock(&I) &&
+         doesNotWriteMemoryBefore(BB, CurLoop);
+}
+
 namespace {
   struct MustExecutePrinter : public FunctionPass {
 
diff --git a/lib/Analysis/ObjCARCInstKind.cpp b/lib/Analysis/ObjCARCInstKind.cpp
index f268e2a9abdde9a5992c34ef946f3c4f539707e7..31c432711834a945d6481cc964085609519b4afa 100644
--- a/lib/Analysis/ObjCARCInstKind.cpp
+++ b/lib/Analysis/ObjCARCInstKind.cpp
@@ -85,97 +85,73 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
 }
 
 ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) {
-  Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
 
-  // No (mandatory) arguments.
-  if (AI == AE)
-    return StringSwitch<ARCInstKind>(F->getName())
-        .Case("objc_autoreleasePoolPush", ARCInstKind::AutoreleasepoolPush)
-        .Case("clang.arc.use", ARCInstKind::IntrinsicUser)
-        .Default(ARCInstKind::CallOrUser);
-
-  // One argument.
-  const Argument *A0 = &*AI++;
-  if (AI == AE) {
-    // Argument is a pointer.
-    PointerType *PTy = dyn_cast<PointerType>(A0->getType());
-    if (!PTy)
-      return ARCInstKind::CallOrUser;
-
-    Type *ETy = PTy->getElementType();
-    // Argument is i8*.
-    if (ETy->isIntegerTy(8))
-      return StringSwitch<ARCInstKind>(F->getName())
-          .Case("objc_retain", ARCInstKind::Retain)
-          .Case("objc_retainAutoreleasedReturnValue", ARCInstKind::RetainRV)
-          .Case("objc_unsafeClaimAutoreleasedReturnValue", ARCInstKind::ClaimRV)
-          .Case("objc_retainBlock", ARCInstKind::RetainBlock)
-          .Case("objc_release", ARCInstKind::Release)
-          .Case("objc_autorelease", ARCInstKind::Autorelease)
-          .Case("objc_autoreleaseReturnValue", ARCInstKind::AutoreleaseRV)
-          .Case("objc_autoreleasePoolPop", ARCInstKind::AutoreleasepoolPop)
-          .Case("objc_retainedObject", ARCInstKind::NoopCast)
-          .Case("objc_unretainedObject", ARCInstKind::NoopCast)
-          .Case("objc_unretainedPointer", ARCInstKind::NoopCast)
-          .Case("objc_retain_autorelease", ARCInstKind::FusedRetainAutorelease)
-          .Case("objc_retainAutorelease", ARCInstKind::FusedRetainAutorelease)
-          .Case("objc_retainAutoreleaseReturnValue",
-                ARCInstKind::FusedRetainAutoreleaseRV)
-          .Case("objc_sync_enter", ARCInstKind::User)
-          .Case("objc_sync_exit", ARCInstKind::User)
-          .Default(ARCInstKind::CallOrUser);
-
-    // Argument is i8**
-    if (PointerType *Pte = dyn_cast<PointerType>(ETy))
-      if (Pte->getElementType()->isIntegerTy(8))
-        return StringSwitch<ARCInstKind>(F->getName())
-            .Case("objc_loadWeakRetained", ARCInstKind::LoadWeakRetained)
-            .Case("objc_loadWeak", ARCInstKind::LoadWeak)
-            .Case("objc_destroyWeak", ARCInstKind::DestroyWeak)
-            .Default(ARCInstKind::CallOrUser);
-
-    // Anything else with one argument.
+  Intrinsic::ID ID = F->getIntrinsicID();
+  switch (ID) {
+  default:
     return ARCInstKind::CallOrUser;
+  case Intrinsic::objc_autorelease:
+    return ARCInstKind::Autorelease;
+  case Intrinsic::objc_autoreleasePoolPop:
+    return ARCInstKind::AutoreleasepoolPop;
+  case Intrinsic::objc_autoreleasePoolPush:
+    return ARCInstKind::AutoreleasepoolPush;
+  case Intrinsic::objc_autoreleaseReturnValue:
+    return ARCInstKind::AutoreleaseRV;
+  case Intrinsic::objc_copyWeak:
+    return ARCInstKind::CopyWeak;
+  case Intrinsic::objc_destroyWeak:
+    return ARCInstKind::DestroyWeak;
+  case Intrinsic::objc_initWeak:
+    return ARCInstKind::InitWeak;
+  case Intrinsic::objc_loadWeak:
+    return ARCInstKind::LoadWeak;
+  case Intrinsic::objc_loadWeakRetained:
+    return ARCInstKind::LoadWeakRetained;
+  case Intrinsic::objc_moveWeak:
+    return ARCInstKind::MoveWeak;
+  case Intrinsic::objc_release:
+    return ARCInstKind::Release;
+  case Intrinsic::objc_retain:
+    return ARCInstKind::Retain;
+  case Intrinsic::objc_retainAutorelease:
+    return ARCInstKind::FusedRetainAutorelease;
+  case Intrinsic::objc_retainAutoreleaseReturnValue:
+    return ARCInstKind::FusedRetainAutoreleaseRV;
+  case Intrinsic::objc_retainAutoreleasedReturnValue:
+    return ARCInstKind::RetainRV;
+  case Intrinsic::objc_retainBlock:
+    return ARCInstKind::RetainBlock;
+  case Intrinsic::objc_storeStrong:
+    return ARCInstKind::StoreStrong;
+  case Intrinsic::objc_storeWeak:
+    return ARCInstKind::StoreWeak;
+  case Intrinsic::objc_clang_arc_use:
+    return ARCInstKind::IntrinsicUser;
+  case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue:
+    return ARCInstKind::ClaimRV;
+  case Intrinsic::objc_retainedObject:
+    return ARCInstKind::NoopCast;
+  case Intrinsic::objc_unretainedObject:
+    return ARCInstKind::NoopCast;
+  case Intrinsic::objc_unretainedPointer:
+    return ARCInstKind::NoopCast;
+  case Intrinsic::objc_retain_autorelease:
+    return ARCInstKind::FusedRetainAutorelease;
+  case Intrinsic::objc_sync_enter:
+    return ARCInstKind::User;
+  case Intrinsic::objc_sync_exit:
+    return ARCInstKind::User;
+  case Intrinsic::objc_arc_annotation_topdown_bbstart:
+  case Intrinsic::objc_arc_annotation_topdown_bbend:
+  case Intrinsic::objc_arc_annotation_bottomup_bbstart:
+  case Intrinsic::objc_arc_annotation_bottomup_bbend:
+    // Ignore annotation calls. This is important to stop the
+    // optimizer from treating annotations as uses which would
+    // make the state of the pointers they are attempting to
+    // elucidate to be incorrect.
+    return ARCInstKind::None;
   }
-
-  // Two arguments, first is i8**.
-  const Argument *A1 = &*AI++;
-  if (AI == AE)
-    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
-      if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
-        if (Pte->getElementType()->isIntegerTy(8))
-          if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
-            Type *ETy1 = PTy1->getElementType();
-            // Second argument is i8*
-            if (ETy1->isIntegerTy(8))
-              return StringSwitch<ARCInstKind>(F->getName())
-                  .Case("objc_storeWeak", ARCInstKind::StoreWeak)
-                  .Case("objc_initWeak", ARCInstKind::InitWeak)
-                  .Case("objc_storeStrong", ARCInstKind::StoreStrong)
-                  .Default(ARCInstKind::CallOrUser);
-            // Second argument is i8**.
-            if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
-              if (Pte1->getElementType()->isIntegerTy(8))
-                return StringSwitch<ARCInstKind>(F->getName())
-                    .Case("objc_moveWeak", ARCInstKind::MoveWeak)
-                    .Case("objc_copyWeak", ARCInstKind::CopyWeak)
-                    // Ignore annotation calls. This is important to stop the
-                    // optimizer from treating annotations as uses which would
-                    // make the state of the pointers they are attempting to
-                    // elucidate to be incorrect.
-                    .Case("llvm.arc.annotation.topdown.bbstart",
-                          ARCInstKind::None)
-                    .Case("llvm.arc.annotation.topdown.bbend",
-                          ARCInstKind::None)
-                    .Case("llvm.arc.annotation.bottomup.bbstart",
-                          ARCInstKind::None)
-                    .Case("llvm.arc.annotation.bottomup.bbend",
-                          ARCInstKind::None)
-                    .Default(ARCInstKind::CallOrUser);
-          }
-
-  // Anything else.
-  return ARCInstKind::CallOrUser;
 }
 
 // A whitelist of intrinsics that we know do not use objc pointers or decrement
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index 7472b6201c2b3e36a08f087c64f95e30d0712392..1d70c75f2e1c9d532edf6531589e155721874fe1 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -39,11 +39,6 @@ static cl::opt<int> ProfileSummaryCutoffCold(
     cl::desc("A count is cold if it is below the minimum count"
              " to reach this percentile of total counts."));
 
-static cl::opt<bool> ProfileSampleAccurate(
-    "profile-sample-accurate", cl::Hidden, cl::init(false),
-    cl::desc("If the sample profile is accurate, we will mark all un-sampled "
-             "callsite as cold. Otherwise, treat un-sampled callsites as if "
-             "we have no profile."));
 static cl::opt<unsigned> ProfileSummaryHugeWorkingSetSizeThreshold(
     "profile-summary-huge-working-set-size-threshold", cl::Hidden,
     cl::init(15000), cl::ZeroOrMore,
@@ -151,7 +146,7 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
       return true;
   }
   for (const auto &BB : *F)
-    if (isHotBB(&BB, &BFI))
+    if (isHotBlock(&BB, &BFI))
       return true;
   return false;
 }
@@ -180,7 +175,7 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
       return false;
   }
   for (const auto &BB : *F)
-    if (!isColdBB(&BB, &BFI))
+    if (!isColdBlock(&BB, &BFI))
       return false;
   return true;
 }
@@ -253,14 +248,14 @@ uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() {
   return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
 }
 
-bool ProfileSummaryInfo::isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI) {
-  auto Count = BFI->getBlockProfileCount(B);
+bool ProfileSummaryInfo::isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) {
+  auto Count = BFI->getBlockProfileCount(BB);
   return Count && isHotCount(*Count);
 }
 
-bool ProfileSummaryInfo::isColdBB(const BasicBlock *B,
+bool ProfileSummaryInfo::isColdBlock(const BasicBlock *BB,
                                   BlockFrequencyInfo *BFI) {
-  auto Count = BFI->getBlockProfileCount(B);
+  auto Count = BFI->getBlockProfileCount(BB);
   return Count && isColdCount(*Count);
 }
 
@@ -278,11 +273,7 @@ bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS,
 
   // In SamplePGO, if the caller has been sampled, and there is no profile
   // annotated on the callsite, we consider the callsite as cold.
-  // If there is no profile for the caller, and we know the profile is
-  // accurate, we consider the callsite as cold.
-  return (hasSampleProfile() &&
-          (CS.getCaller()->hasProfileData() || ProfileSampleAccurate ||
-           CS.getCaller()->hasFnAttribute("profile-sample-accurate")));
+  return hasSampleProfile() && CS.getCaller()->hasProfileData();
 }
 
 INITIALIZE_PASS(ProfileSummaryInfoWrapperPass, "profile-summary-info",
diff --git a/lib/Analysis/StackSafetyAnalysis.cpp b/lib/Analysis/StackSafetyAnalysis.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..49d9b3f57c620c314f76d4d5c0d206ca5ad8473f
--- /dev/null
+++ b/lib/Analysis/StackSafetyAnalysis.cpp
@@ -0,0 +1,676 @@
+//===- StackSafetyAnalysis.cpp - Stack memory safety analysis -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stack-safety"
+
+static cl::opt<int> StackSafetyMaxIterations("stack-safety-max-iterations",
+                                             cl::init(20), cl::Hidden);
+
+namespace {
+
+/// Rewrite an SCEV expression for a memory access address to an expression that
+/// represents offset from the given alloca.
+class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> {
+  const Value *AllocaPtr;
+
+public:
+  AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr)
+      : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {}
+
+  const SCEV *visit(const SCEV *Expr) {
+    // Only re-write the expression if the alloca is used in an addition
+    // expression (it can be used in other types of expressions if it's cast to
+    // an int and passed as an argument.)
+    if (!isa<SCEVAddRecExpr>(Expr) && !isa<SCEVAddExpr>(Expr) &&
+        !isa<SCEVUnknown>(Expr))
+      return Expr;
+    return SCEVRewriteVisitor<AllocaOffsetRewriter>::visit(Expr);
+  }
+
+  const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+    // FIXME: look through one or several levels of definitions?
+    // This can be inttoptr(AllocaPtr) and SCEV would not unwrap
+    // it for us.
+    if (Expr->getValue() == AllocaPtr)
+      return SE.getZero(Expr->getType());
+    return Expr;
+  }
+};
+
+/// Describes use of address in as a function call argument.
+struct PassAsArgInfo {
+  /// Function being called.
+  const GlobalValue *Callee = nullptr;
+  /// Index of argument which pass address.
+  size_t ParamNo = 0;
+  // Offset range of address from base address (alloca or calling function
+  // argument).
+  // Range should never set to empty-set, that is an invalid access range
+  // that can cause empty-set to be propagated with ConstantRange::add
+  ConstantRange Offset;
+  PassAsArgInfo(const GlobalValue *Callee, size_t ParamNo, ConstantRange Offset)
+      : Callee(Callee), ParamNo(ParamNo), Offset(Offset) {}
+
+  StringRef getName() const { return Callee->getName(); }
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const PassAsArgInfo &P) {
+  return OS << "@" << P.getName() << "(arg" << P.ParamNo << ", " << P.Offset
+            << ")";
+}
+
+/// Describe uses of address (alloca or parameter) inside of the function.
+struct UseInfo {
+  // Access range if the address (alloca or parameters).
+  // It is allowed to be empty-set when there are no known accesses.
+  ConstantRange Range;
+
+  // List of calls which pass address as an argument.
+  SmallVector<PassAsArgInfo, 4> Calls;
+
+  explicit UseInfo(unsigned PointerSize) : Range{PointerSize, false} {}
+
+  void updateRange(ConstantRange R) { Range = Range.unionWith(R); }
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const UseInfo &U) {
+  OS << U.Range;
+  for (auto &Call : U.Calls)
+    OS << ", " << Call;
+  return OS;
+}
+
+struct AllocaInfo {
+  const AllocaInst *AI = nullptr;
+  uint64_t Size = 0;
+  UseInfo Use;
+
+  AllocaInfo(unsigned PointerSize, const AllocaInst *AI, uint64_t Size)
+      : AI(AI), Size(Size), Use(PointerSize) {}
+
+  StringRef getName() const { return AI->getName(); }
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const AllocaInfo &A) {
+  return OS << A.getName() << "[" << A.Size << "]: " << A.Use;
+}
+
+struct ParamInfo {
+  const Argument *Arg = nullptr;
+  UseInfo Use;
+
+  explicit ParamInfo(unsigned PointerSize, const Argument *Arg)
+      : Arg(Arg), Use(PointerSize) {}
+
+  StringRef getName() const { return Arg ? Arg->getName() : "<N/A>"; }
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const ParamInfo &P) {
+  return OS << P.getName() << "[]: " << P.Use;
+}
+
+/// Calculate the allocation size of a given alloca. Returns 0 if the
+/// size can not be statically determined.
+uint64_t getStaticAllocaAllocationSize(const AllocaInst *AI) {
+  const DataLayout &DL = AI->getModule()->getDataLayout();
+  uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
+  if (AI->isArrayAllocation()) {
+    auto C = dyn_cast<ConstantInt>(AI->getArraySize());
+    if (!C)
+      return 0;
+    Size *= C->getZExtValue();
+  }
+  return Size;
+}
+
+} // end anonymous namespace
+
+/// Describes uses of allocas and parameters inside of a single function.
+struct StackSafetyInfo::FunctionInfo {
+  // May be a Function or a GlobalAlias
+  const GlobalValue *GV = nullptr;
+  // Informations about allocas uses.
+  SmallVector<AllocaInfo, 4> Allocas;
+  // Informations about parameters uses.
+  SmallVector<ParamInfo, 4> Params;
+  // TODO: describe return value as depending on one or more of its arguments.
+
+  // StackSafetyDataFlowAnalysis counter stored here for faster access.
+  int UpdateCount = 0;
+
+  FunctionInfo(const StackSafetyInfo &SSI) : FunctionInfo(*SSI.Info) {}
+
+  explicit FunctionInfo(const Function *F) : GV(F){};
+  // Creates FunctionInfo that forwards all the parameters to the aliasee.
+  explicit FunctionInfo(const GlobalAlias *A);
+
+  FunctionInfo(FunctionInfo &&) = default;
+
+  bool IsDSOLocal() const { return GV->isDSOLocal(); };
+
+  bool IsInterposable() const { return GV->isInterposable(); };
+
+  StringRef getName() const { return GV->getName(); }
+
+  void print(raw_ostream &O) const {
+    // TODO: Consider different printout format after
+    // StackSafetyDataFlowAnalysis. Calls and parameters are irrelevant then.
+    O << "  @" << getName() << (IsDSOLocal() ? "" : " dso_preemptable")
+      << (IsInterposable() ? " interposable" : "") << "\n";
+    O << "    args uses:\n";
+    for (auto &P : Params)
+      O << "      " << P << "\n";
+    O << "    allocas uses:\n";
+    for (auto &AS : Allocas)
+      O << "      " << AS << "\n";
+  }
+
+private:
+  FunctionInfo(const FunctionInfo &) = default;
+};
+
+StackSafetyInfo::FunctionInfo::FunctionInfo(const GlobalAlias *A) : GV(A) {
+  unsigned PointerSize = A->getParent()->getDataLayout().getPointerSizeInBits();
+  const GlobalObject *Aliasee = A->getBaseObject();
+  const FunctionType *Type = cast<FunctionType>(Aliasee->getValueType());
+  // 'Forward' all parameters to this alias to the aliasee
+  for (unsigned ArgNo = 0; ArgNo < Type->getNumParams(); ArgNo++) {
+    Params.emplace_back(PointerSize, nullptr);
+    UseInfo &US = Params.back().Use;
+    US.Calls.emplace_back(Aliasee, ArgNo, ConstantRange(APInt(PointerSize, 0)));
+  }
+}
+
+namespace {
+
+class StackSafetyLocalAnalysis {
+  const Function &F;
+  const DataLayout &DL;
+  ScalarEvolution &SE;
+  unsigned PointerSize = 0;
+
+  const ConstantRange UnknownRange;
+
+  ConstantRange offsetFromAlloca(Value *Addr, const Value *AllocaPtr);
+  ConstantRange getAccessRange(Value *Addr, const Value *AllocaPtr,
+                               uint64_t AccessSize);
+  ConstantRange getMemIntrinsicAccessRange(const MemIntrinsic *MI, const Use &U,
+                                           const Value *AllocaPtr);
+
+  bool analyzeAllUses(const Value *Ptr, UseInfo &AS);
+
+  ConstantRange getRange(uint64_t Lower, uint64_t Upper) const {
+    return ConstantRange(APInt(PointerSize, Lower), APInt(PointerSize, Upper));
+  }
+
+public:
+  StackSafetyLocalAnalysis(const Function &F, ScalarEvolution &SE)
+      : F(F), DL(F.getParent()->getDataLayout()), SE(SE),
+        PointerSize(DL.getPointerSizeInBits()),
+        UnknownRange(PointerSize, true) {}
+
+  // Run the transformation on the associated function.
+  StackSafetyInfo run();
+};
+
+ConstantRange
+StackSafetyLocalAnalysis::offsetFromAlloca(Value *Addr,
+                                           const Value *AllocaPtr) {
+  if (!SE.isSCEVable(Addr->getType()))
+    return UnknownRange;
+
+  AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+  const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
+  ConstantRange Offset = SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize);
+  assert(!Offset.isEmptySet());
+  return Offset;
+}
+
+ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr,
+                                                       const Value *AllocaPtr,
+                                                       uint64_t AccessSize) {
+  if (!SE.isSCEVable(Addr->getType()))
+    return UnknownRange;
+
+  AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+  const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
+
+  ConstantRange AccessStartRange =
+      SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize);
+  ConstantRange SizeRange = getRange(0, AccessSize);
+  ConstantRange AccessRange = AccessStartRange.add(SizeRange);
+  assert(!AccessRange.isEmptySet());
+  return AccessRange;
+}
+
+ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
+    const MemIntrinsic *MI, const Use &U, const Value *AllocaPtr) {
+  if (auto MTI = dyn_cast<MemTransferInst>(MI)) {
+    if (MTI->getRawSource() != U && MTI->getRawDest() != U)
+      return getRange(0, 1);
+  } else {
+    if (MI->getRawDest() != U)
+      return getRange(0, 1);
+  }
+  const auto *Len = dyn_cast<ConstantInt>(MI->getLength());
+  // Non-constant size => unsafe. FIXME: try SCEV getRange.
+  if (!Len)
+    return UnknownRange;
+  ConstantRange AccessRange = getAccessRange(U, AllocaPtr, Len->getZExtValue());
+  return AccessRange;
+}
+
+/// The function analyzes all local uses of Ptr (alloca or argument) and
+/// calculates local access range and all function calls where it was used.
+bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) {
+  SmallPtrSet<const Value *, 16> Visited;
+  SmallVector<const Value *, 8> WorkList;
+  WorkList.push_back(Ptr);
+
+  // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc.
+  while (!WorkList.empty()) {
+    const Value *V = WorkList.pop_back_val();
+    for (const Use &UI : V->uses()) {
+      auto I = cast<const Instruction>(UI.getUser());
+      assert(V == UI.get());
+
+      switch (I->getOpcode()) {
+      case Instruction::Load: {
+        US.updateRange(
+            getAccessRange(UI, Ptr, DL.getTypeStoreSize(I->getType())));
+        break;
+      }
+
+      case Instruction::VAArg:
+        // "va-arg" from a pointer is safe.
+        break;
+      case Instruction::Store: {
+        if (V == I->getOperand(0)) {
+          // Stored the pointer - conservatively assume it may be unsafe.
+          US.updateRange(UnknownRange);
+          return false;
+        }
+        US.updateRange(getAccessRange(
+            UI, Ptr, DL.getTypeStoreSize(I->getOperand(0)->getType())));
+        break;
+      }
+
+      case Instruction::Ret:
+        // Information leak.
+        // FIXME: Process parameters correctly. This is a leak only if we return
+        // alloca.
+        US.updateRange(UnknownRange);
+        return false;
+
+      case Instruction::Call:
+      case Instruction::Invoke: {
+        ImmutableCallSite CS(I);
+
+        if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+          if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+              II->getIntrinsicID() == Intrinsic::lifetime_end)
+            break;
+        }
+
+        if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
+          US.updateRange(getMemIntrinsicAccessRange(MI, UI, Ptr));
+          break;
+        }
+
+        // FIXME: consult devirt?
+        // Do not follow aliases, otherwise we could inadvertently follow
+        // dso_preemptable aliases or aliases with interposable linkage.
+        const GlobalValue *Callee = dyn_cast<GlobalValue>(
+            CS.getCalledValue()->stripPointerCastsNoFollowAliases());
+        if (!Callee) {
+          US.updateRange(UnknownRange);
+          return false;
+        }
+
+        assert(isa<Function>(Callee) || isa<GlobalAlias>(Callee));
+
+        ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
+        for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) {
+          if (A->get() == V) {
+            ConstantRange Offset = offsetFromAlloca(UI, Ptr);
+            US.Calls.emplace_back(Callee, A - B, Offset);
+          }
+        }
+
+        break;
+      }
+
+      default:
+        if (Visited.insert(I).second)
+          WorkList.push_back(cast<const Instruction>(I));
+      }
+    }
+  }
+
+  return true;
+}
+
+StackSafetyInfo StackSafetyLocalAnalysis::run() {
+  StackSafetyInfo::FunctionInfo Info(&F);
+  assert(!F.isDeclaration() &&
+         "Can't run StackSafety on a function declaration");
+
+  LLVM_DEBUG(dbgs() << "[StackSafety] " << F.getName() << "\n");
+
+  for (auto &I : instructions(F)) {
+    if (auto AI = dyn_cast<AllocaInst>(&I)) {
+      Info.Allocas.emplace_back(PointerSize, AI,
+                                getStaticAllocaAllocationSize(AI));
+      AllocaInfo &AS = Info.Allocas.back();
+      analyzeAllUses(AI, AS.Use);
+    }
+  }
+
+  for (const Argument &A : make_range(F.arg_begin(), F.arg_end())) {
+    Info.Params.emplace_back(PointerSize, &A);
+    ParamInfo &PS = Info.Params.back();
+    analyzeAllUses(&A, PS.Use);
+  }
+
+  LLVM_DEBUG(dbgs() << "[StackSafety] done\n");
+  LLVM_DEBUG(Info.print(dbgs()));
+  return StackSafetyInfo(std::move(Info));
+}
+
+class StackSafetyDataFlowAnalysis {
+  using FunctionMap =
+      std::map<const GlobalValue *, StackSafetyInfo::FunctionInfo>;
+
+  FunctionMap Functions;
+  // Callee-to-Caller multimap.
+  DenseMap<const GlobalValue *, SmallVector<const GlobalValue *, 4>> Callers;
+  SetVector<const GlobalValue *> WorkList;
+
+  unsigned PointerSize = 0;
+  const ConstantRange UnknownRange;
+
+  ConstantRange getArgumentAccessRange(const GlobalValue *Callee,
+                                       unsigned ParamNo) const;
+  bool updateOneUse(UseInfo &US, bool UpdateToFullSet);
+  void updateOneNode(const GlobalValue *Callee,
+                     StackSafetyInfo::FunctionInfo &FS);
+  void updateOneNode(const GlobalValue *Callee) {
+    updateOneNode(Callee, Functions.find(Callee)->second);
+  }
+  void updateAllNodes() {
+    for (auto &F : Functions)
+      updateOneNode(F.first, F.second);
+  }
+  void runDataFlow();
+  void verifyFixedPoint();
+
+public:
+  StackSafetyDataFlowAnalysis(
+      Module &M, std::function<const StackSafetyInfo &(Function &)> FI);
+  StackSafetyGlobalInfo run();
+};
+
+StackSafetyDataFlowAnalysis::StackSafetyDataFlowAnalysis(
+    Module &M, std::function<const StackSafetyInfo &(Function &)> FI)
+    : PointerSize(M.getDataLayout().getPointerSizeInBits()),
+      UnknownRange(PointerSize, true) {
+  // Without ThinLTO, run the local analysis for every function in the TU and
+  // then run the DFA.
+  for (auto &F : M.functions())
+    if (!F.isDeclaration())
+      Functions.emplace(&F, FI(F));
+  for (auto &A : M.aliases())
+    if (isa<Function>(A.getBaseObject()))
+      Functions.emplace(&A, StackSafetyInfo::FunctionInfo(&A));
+}
+
+ConstantRange
+StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee,
+                                                    unsigned ParamNo) const {
+  auto IT = Functions.find(Callee);
+  // Unknown callee (outside of LTO domain or an indirect call).
+  if (IT == Functions.end())
+    return UnknownRange;
+  const StackSafetyInfo::FunctionInfo &FS = IT->second;
+  // The definition of this symbol may not be the definition in this linkage
+  // unit.
+  if (!FS.IsDSOLocal() || FS.IsInterposable())
+    return UnknownRange;
+  if (ParamNo >= FS.Params.size()) // possibly vararg
+    return UnknownRange;
+  return FS.Params[ParamNo].Use.Range;
+}
+
+bool StackSafetyDataFlowAnalysis::updateOneUse(UseInfo &US,
+                                               bool UpdateToFullSet) {
+  bool Changed = false;
+  for (auto &CS : US.Calls) {
+    assert(!CS.Offset.isEmptySet() &&
+           "Param range can't be empty-set, invalid offset range");
+
+    ConstantRange CalleeRange = getArgumentAccessRange(CS.Callee, CS.ParamNo);
+    CalleeRange = CalleeRange.add(CS.Offset);
+    if (!US.Range.contains(CalleeRange)) {
+      Changed = true;
+      if (UpdateToFullSet)
+        US.Range = UnknownRange;
+      else
+        US.Range = US.Range.unionWith(CalleeRange);
+    }
+  }
+  return Changed;
+}
+
+void StackSafetyDataFlowAnalysis::updateOneNode(
+    const GlobalValue *Callee, StackSafetyInfo::FunctionInfo &FS) {
+  bool UpdateToFullSet = FS.UpdateCount > StackSafetyMaxIterations;
+  bool Changed = false;
+  for (auto &AS : FS.Allocas)
+    Changed |= updateOneUse(AS.Use, UpdateToFullSet);
+  for (auto &PS : FS.Params)
+    Changed |= updateOneUse(PS.Use, UpdateToFullSet);
+
+  if (Changed) {
+    LLVM_DEBUG(dbgs() << "=== update [" << FS.UpdateCount
+                      << (UpdateToFullSet ? ", full-set" : "") << "] "
+                      << FS.getName() << "\n");
+    // Callers of this function may need updating.
+    for (auto &CallerID : Callers[Callee])
+      WorkList.insert(CallerID);
+
+    ++FS.UpdateCount;
+  }
+}
+
+void StackSafetyDataFlowAnalysis::runDataFlow() {
+  Callers.clear();
+  WorkList.clear();
+
+  SmallVector<const GlobalValue *, 16> Callees;
+  for (auto &F : Functions) {
+    Callees.clear();
+    StackSafetyInfo::FunctionInfo &FS = F.second;
+    for (auto &AS : FS.Allocas)
+      for (auto &CS : AS.Use.Calls)
+        Callees.push_back(CS.Callee);
+    for (auto &PS : FS.Params)
+      for (auto &CS : PS.Use.Calls)
+        Callees.push_back(CS.Callee);
+
+    llvm::sort(Callees);
+    Callees.erase(std::unique(Callees.begin(), Callees.end()), Callees.end());
+
+    for (auto &Callee : Callees)
+      Callers[Callee].push_back(F.first);
+  }
+
+  updateAllNodes();
+
+  while (!WorkList.empty()) {
+    const GlobalValue *Callee = WorkList.back();
+    WorkList.pop_back();
+    updateOneNode(Callee);
+  }
+}
+
+void StackSafetyDataFlowAnalysis::verifyFixedPoint() {
+  WorkList.clear();
+  updateAllNodes();
+  assert(WorkList.empty());
+}
+
+StackSafetyGlobalInfo StackSafetyDataFlowAnalysis::run() {
+  runDataFlow();
+  LLVM_DEBUG(verifyFixedPoint());
+
+  StackSafetyGlobalInfo SSI;
+  for (auto &F : Functions)
+    SSI.emplace(F.first, std::move(F.second));
+  return SSI;
+}
+
+void print(const StackSafetyGlobalInfo &SSI, raw_ostream &O, const Module &M) {
+  size_t Count = 0;
+  for (auto &F : M.functions())
+    if (!F.isDeclaration()) {
+      SSI.find(&F)->second.print(O);
+      O << "\n";
+      ++Count;
+    }
+  for (auto &A : M.aliases()) {
+    SSI.find(&A)->second.print(O);
+    O << "\n";
+    ++Count;
+  }
+  assert(Count == SSI.size() && "Unexpected functions in the result");
+}
+
+} // end anonymous namespace
+
+StackSafetyInfo::StackSafetyInfo() = default;
+StackSafetyInfo::StackSafetyInfo(StackSafetyInfo &&) = default;
+StackSafetyInfo &StackSafetyInfo::operator=(StackSafetyInfo &&) = default;
+
+StackSafetyInfo::StackSafetyInfo(FunctionInfo &&Info)
+    : Info(new FunctionInfo(std::move(Info))) {}
+
+StackSafetyInfo::~StackSafetyInfo() = default;
+
+void StackSafetyInfo::print(raw_ostream &O) const { Info->print(O); }
+
+AnalysisKey StackSafetyAnalysis::Key;
+
+StackSafetyInfo StackSafetyAnalysis::run(Function &F,
+                                         FunctionAnalysisManager &AM) {
+  StackSafetyLocalAnalysis SSLA(F, AM.getResult<ScalarEvolutionAnalysis>(F));
+  return SSLA.run();
+}
+
+PreservedAnalyses StackSafetyPrinterPass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  OS << "'Stack Safety Local Analysis' for function '" << F.getName() << "'\n";
+  AM.getResult<StackSafetyAnalysis>(F).print(OS);
+  return PreservedAnalyses::all();
+}
+
+char StackSafetyInfoWrapperPass::ID = 0;
+
+StackSafetyInfoWrapperPass::StackSafetyInfoWrapperPass() : FunctionPass(ID) {
+  initializeStackSafetyInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+void StackSafetyInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<ScalarEvolutionWrapperPass>();
+  AU.setPreservesAll();
+}
+
+void StackSafetyInfoWrapperPass::print(raw_ostream &O, const Module *M) const {
+  SSI.print(O);
+}
+
+bool StackSafetyInfoWrapperPass::runOnFunction(Function &F) {
+  StackSafetyLocalAnalysis SSLA(
+      F, getAnalysis<ScalarEvolutionWrapperPass>().getSE());
+  SSI = StackSafetyInfo(SSLA.run());
+  return false;
+}
+
+AnalysisKey StackSafetyGlobalAnalysis::Key;
+
+StackSafetyGlobalInfo
+StackSafetyGlobalAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  StackSafetyDataFlowAnalysis SSDFA(
+      M, [&FAM](Function &F) -> const StackSafetyInfo & {
+        return FAM.getResult<StackSafetyAnalysis>(F);
+      });
+  return SSDFA.run();
+}
+
+PreservedAnalyses StackSafetyGlobalPrinterPass::run(Module &M,
+                                                    ModuleAnalysisManager &AM) {
+  OS << "'Stack Safety Analysis' for module '" << M.getName() << "'\n";
+  print(AM.getResult<StackSafetyGlobalAnalysis>(M), OS, M);
+  return PreservedAnalyses::all();
+}
+
+char StackSafetyGlobalInfoWrapperPass::ID = 0;
+
+StackSafetyGlobalInfoWrapperPass::StackSafetyGlobalInfoWrapperPass()
+    : ModulePass(ID) {
+  initializeStackSafetyGlobalInfoWrapperPassPass(
+      *PassRegistry::getPassRegistry());
+}
+
+void StackSafetyGlobalInfoWrapperPass::print(raw_ostream &O,
+                                             const Module *M) const {
+  ::print(SSI, O, *M);
+}
+
+void StackSafetyGlobalInfoWrapperPass::getAnalysisUsage(
+    AnalysisUsage &AU) const {
+  AU.addRequired<StackSafetyInfoWrapperPass>();
+}
+
+bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) {
+  StackSafetyDataFlowAnalysis SSDFA(
+      M, [this](Function &F) -> const StackSafetyInfo & {
+        return getAnalysis<StackSafetyInfoWrapperPass>(F).getResult();
+      });
+  SSI = SSDFA.run();
+  return false;
+}
+
+static const char LocalPassArg[] = "stack-safety-local";
+static const char LocalPassName[] = "Stack Safety Local Analysis";
+INITIALIZE_PASS_BEGIN(StackSafetyInfoWrapperPass, LocalPassArg, LocalPassName,
+                      false, true)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(StackSafetyInfoWrapperPass, LocalPassArg, LocalPassName,
+                    false, true)
+
+static const char GlobalPassName[] = "Stack Safety Analysis";
+INITIALIZE_PASS_BEGIN(StackSafetyGlobalInfoWrapperPass, DEBUG_TYPE,
+                      GlobalPassName, false, false)
+INITIALIZE_PASS_DEPENDENCY(StackSafetyInfoWrapperPass)
+INITIALIZE_PASS_END(StackSafetyGlobalInfoWrapperPass, DEBUG_TYPE,
+                    GlobalPassName, false, false)
diff --git a/lib/Analysis/SyntheticCountsUtils.cpp b/lib/Analysis/SyntheticCountsUtils.cpp
index b085fa274d7fbad13adbc5642e69d1e37b1b8931..386396bcff364f3cf0222a3f7994fd4c46023686 100644
--- a/lib/Analysis/SyntheticCountsUtils.cpp
+++ b/lib/Analysis/SyntheticCountsUtils.cpp
@@ -14,12 +14,12 @@
 #include "llvm/Analysis/SyntheticCountsUtils.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 
 using namespace llvm;
 
@@ -29,7 +29,7 @@ void SyntheticCountsUtils<CallGraphType>::propagateFromSCC(
     const SccTy &SCC, GetRelBBFreqTy GetRelBBFreq, GetCountTy GetCount,
     AddCountTy AddCount) {
 
-  SmallPtrSet<NodeRef, 8> SCCNodes;
+  DenseSet<NodeRef> SCCNodes;
   SmallVector<std::pair<NodeRef, EdgeRef>, 8> SCCEdges, NonSCCEdges;
 
   for (auto &Node : SCC)
@@ -111,3 +111,4 @@ void SyntheticCountsUtils<CallGraphType>::propagate(const CallGraphType &CG,
 }
 
 template class llvm::SyntheticCountsUtils<const CallGraph *>;
+template class llvm::SyntheticCountsUtils<ModuleSummaryIndex *>;
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 6e4eb8ff0cdf2a47cddd20a38e6c9cf3a568b027..79fe6dc7d87cb26dc4fa2b5989fbf3a6493f434c 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -389,8 +389,7 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
 }
 
 TargetTransformInfo::OperandValueKind
-TargetTransformInfo::getOperandInfo(Value *V,
-                                    OperandValueProperties &OpProps) const {
+TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) {
   OperandValueKind OpInfo = OK_AnyValue;
   OpProps = OP_None;
 
@@ -400,6 +399,13 @@ TargetTransformInfo::getOperandInfo(Value *V,
     return OK_UniformConstantValue;
   }
 
+  // A broadcast shuffle creates a uniform value.
+  // TODO: Add support for non-zero index broadcasts.
+  // TODO: Add support for different source vector width.
+  if (auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V))
+    if (ShuffleInst->isZeroEltSplat())
+      OpInfo = OK_UniformValue;
+
   const Value *Splat = getSplatValue(V);
 
   // Check for a splat of a constant or for a non uniform vector of constants
@@ -1108,14 +1114,20 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
   }
   case Instruction::ShuffleVector: {
     const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
-    // TODO: Identify and add costs for insert/extract subvector, etc.
+    Type *Ty = Shuffle->getType();
+    Type *SrcTy = Shuffle->getOperand(0)->getType();
+
+    // TODO: Identify and add costs for insert subvector, etc.
+    int SubIndex;
+    if (Shuffle->isExtractSubvectorMask(SubIndex))
+      return TTIImpl->getShuffleCost(SK_ExtractSubvector, SrcTy, SubIndex, Ty);
+
     if (Shuffle->changesLength())
       return -1;
 
     if (Shuffle->isIdentity())
       return 0;
 
-    Type *Ty = Shuffle->getType();
     if (Shuffle->isReverse())
       return TTIImpl->getShuffleCost(SK_Reverse, Ty, 0, nullptr);
 
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index ed17441d1e404e2c70f5ea62b191987394ffcd3d..6f0196069f2d64ff9d819049b2f8376dcb8c24a6 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -1506,6 +1506,27 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
         // of bits which might be set provided by popcnt KnownOne2.
         break;
       }
+      case Intrinsic::fshr:
+      case Intrinsic::fshl: {
+        const APInt *SA;
+        if (!match(I->getOperand(2), m_APInt(SA)))
+          break;
+
+        // Normalize to funnel shift left.
+        uint64_t ShiftAmt = SA->urem(BitWidth);
+        if (II->getIntrinsicID() == Intrinsic::fshr)
+          ShiftAmt = BitWidth - ShiftAmt;
+
+        KnownBits Known3(Known);
+        computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+        computeKnownBits(I->getOperand(1), Known3, Depth + 1, Q);
+
+        Known.Zero =
+            Known2.Zero.shl(ShiftAmt) | Known3.Zero.lshr(BitWidth - ShiftAmt);
+        Known.One =
+            Known2.One.shl(ShiftAmt) | Known3.One.lshr(BitWidth - ShiftAmt);
+        break;
+      }
       case Intrinsic::x86_sse42_crc32_64_64:
         Known.Zero.setBitsFrom(32);
         break;
@@ -4001,13 +4022,11 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(
 
     if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
       // The sign bit is set in both cases: this MUST overflow.
-      // Create a simple add instruction, and insert it into the struct.
       return OverflowResult::AlwaysOverflows;
     }
 
     if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
       // The sign bit is clear in both cases: this CANNOT overflow.
-      // Create a simple add instruction, and insert it into the struct.
       return OverflowResult::NeverOverflows;
     }
   }
@@ -4124,11 +4143,18 @@ OverflowResult llvm::computeOverflowForUnsignedSub(const Value *LHS,
                                                    AssumptionCache *AC,
                                                    const Instruction *CxtI,
                                                    const DominatorTree *DT) {
-  // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
   KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
-  KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
-  if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
-    return OverflowResult::NeverOverflows;
+  if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
+    KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+
+    // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
+    if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
+      return OverflowResult::NeverOverflows;
+
+    // If the LHS is non-negative and the RHS negative, we always wrap.
+    if (LHSKnown.isNonNegative() && RHSKnown.isNegative())
+      return OverflowResult::AlwaysOverflows;
+  }
 
   return OverflowResult::MayOverflow;
 }
@@ -5373,3 +5399,35 @@ Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
   }
   return None;
 }
+
+Optional<bool> llvm::isImpliedByDomCondition(const Value *Cond,
+                                             const Instruction *ContextI,
+                                             const DataLayout &DL) {
+  assert(Cond->getType()->isIntOrIntVectorTy(1) && "Condition must be bool");
+  if (!ContextI || !ContextI->getParent())
+    return None;
+
+  // TODO: This is a poor/cheap way to determine dominance. Should we use a
+  // dominator tree (eg, from a SimplifyQuery) instead?
+  const BasicBlock *ContextBB = ContextI->getParent();
+  const BasicBlock *PredBB = ContextBB->getSinglePredecessor();
+  if (!PredBB)
+    return None;
+
+  // We need a conditional branch in the predecessor.
+  Value *PredCond;
+  BasicBlock *TrueBB, *FalseBB;
+  if (!match(PredBB->getTerminator(), m_Br(m_Value(PredCond), TrueBB, FalseBB)))
+    return None;
+
+  // The branch should get simplified. Don't bother simplifying this condition.
+  if (TrueBB == FalseBB)
+    return None;
+
+  assert((TrueBB == ContextBB || FalseBB == ContextBB) &&
+         "Predecessor block does not point to successor?");
+
+  // Is this condition implied by the predecessor condition?
+  bool CondIsTrue = TrueBB == ContextBB;
+  return isImpliedCondition(PredCond, Cond, DL, CondIsTrue);
+}
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 38dca50e82a523b9b503337690e3161c9e234810..9ebb01684c8a5d4ab1568031872abe122569b92e 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -37,13 +37,19 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor(
     cl::desc("Maximum factor for an interleaved access group (default = 8)"),
     cl::init(8));
 
-/// Identify if the intrinsic is trivially vectorizable.
-/// This method returns true if the intrinsic's argument types are all
-/// scalars for the scalar form of the intrinsic and all vectors for
-/// the vector form of the intrinsic.
+/// Return true if all of the intrinsic's arguments and return type are scalars
+/// for the scalar form of the intrinsic and vectors for the vector form of the
+/// intrinsic.
 bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   switch (ID) {
-  case Intrinsic::sqrt:
+  case Intrinsic::bswap: // Begin integer bit-manipulation.
+  case Intrinsic::bitreverse:
+  case Intrinsic::ctpop:
+  case Intrinsic::ctlz:
+  case Intrinsic::cttz:
+  case Intrinsic::fshl:
+  case Intrinsic::fshr:
+  case Intrinsic::sqrt: // Begin floating-point.
   case Intrinsic::sin:
   case Intrinsic::cos:
   case Intrinsic::exp:
@@ -63,14 +69,9 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::rint:
   case Intrinsic::nearbyint:
   case Intrinsic::round:
-  case Intrinsic::bswap:
-  case Intrinsic::bitreverse:
-  case Intrinsic::ctpop:
   case Intrinsic::pow:
   case Intrinsic::fma:
   case Intrinsic::fmuladd:
-  case Intrinsic::ctlz:
-  case Intrinsic::cttz:
   case Intrinsic::powi:
   case Intrinsic::canonicalize:
     return true;
@@ -504,8 +505,9 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return Inst;
 }
 
-Constant *llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
-                                           const InterleaveGroup &Group) {
+Constant *
+llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
+                           const InterleaveGroup<Instruction> &Group) {
   // All 1's means mask is not needed.
   if (Group.getNumMembers() == Group.getFactor())
     return nullptr;
@@ -719,9 +721,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
   collectDependences();
 
   // Holds all interleaved store groups temporarily.
-  SmallSetVector<InterleaveGroup *, 4> StoreGroups;
+  SmallSetVector<InterleaveGroup<Instruction> *, 4> StoreGroups;
   // Holds all interleaved load groups temporarily.
-  SmallSetVector<InterleaveGroup *, 4> LoadGroups;
+  SmallSetVector<InterleaveGroup<Instruction> *, 4> LoadGroups;
 
   // Search in bottom-up program order for pairs of accesses (A and B) that can
   // form interleaved load or store groups. In the algorithm below, access A
@@ -743,7 +745,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
     // Initialize a group for B if it has an allowable stride. Even if we don't
     // create a group for B, we continue with the bottom-up algorithm to ensure
     // we don't break any of B's dependences.
-    InterleaveGroup *Group = nullptr;
+    InterleaveGroup<Instruction> *Group = nullptr;
     if (isStrided(DesB.Stride) && 
         (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
       Group = getInterleaveGroup(B);
@@ -788,7 +790,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
         // illegal code motion. A will then be free to form another group with
         // instructions that precede it.
         if (isInterleaved(A)) {
-          InterleaveGroup *StoreGroup = getInterleaveGroup(A);
+          InterleaveGroup<Instruction> *StoreGroup = getInterleaveGroup(A);
           StoreGroups.remove(StoreGroup);
           releaseGroup(StoreGroup);
         }
@@ -867,7 +869,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
   }   // Iteration over B accesses.
 
   // Remove interleaved store groups with gaps.
-  for (InterleaveGroup *Group : StoreGroups)
+  for (auto *Group : StoreGroups)
     if (Group->getNumMembers() != Group->getFactor()) {
       LLVM_DEBUG(
           dbgs() << "LV: Invalidate candidate interleaved store group due "
@@ -888,7 +890,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
   // This means that we can forcefully peel the loop in order to only have to
   // check the first pointer for no-wrap. When we'll change to use Assume=true
   // we'll only need at most one runtime check per interleaved group.
-  for (InterleaveGroup *Group : LoadGroups) {
+  for (auto *Group : LoadGroups) {
     // Case 1: A full group. Can Skip the checks; For full groups, if the wide
     // load would wrap around the address space we would do a memory access at
     // nullptr even without the transformation.
@@ -946,9 +948,9 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
     return;
 
   // Avoid releasing a Group twice.
-  SmallPtrSet<InterleaveGroup *, 4> DelSet;
+  SmallPtrSet<InterleaveGroup<Instruction> *, 4> DelSet;
   for (auto &I : InterleaveGroupMap) {
-    InterleaveGroup *Group = I.second;
+    InterleaveGroup<Instruction> *Group = I.second;
     if (Group->requiresScalarEpilogue())
       DelSet.insert(Group);
   }
@@ -963,3 +965,18 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
 
   RequiresScalarEpilogue = false;
 }
+
+template <typename InstT>
+void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
+  llvm_unreachable("addMetadata can only be used for Instruction");
+}
+
+namespace llvm {
+template <>
+void InterleaveGroup<Instruction>::addMetadata(Instruction *NewInst) const {
+  SmallVector<Value *, 4> VL;
+  std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
+                 [](std::pair<int, Instruction *> p) { return p.second; });
+  propagateMetadata(NewInst, VL);
+}
+}
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index af4f43986ef0632c50c7284fa14f4ad9ee156925..eab7ec81953609cc050f7b7af47307ebac74d2ae 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -788,6 +788,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(info);
   KEYWORD(byte);
   KEYWORD(bit);
+  KEYWORD(varFlags);
 
 #undef KEYWORD
 
@@ -823,6 +824,8 @@ lltok::Kind LLLexer::LexIdentifier() {
     }                                                                          \
   } while (false)
 
+  INSTKEYWORD(fneg,  FNeg);
+
   INSTKEYWORD(add,   Add);  INSTKEYWORD(fadd,   FAdd);
   INSTKEYWORD(sub,   Sub);  INSTKEYWORD(fsub,   FSub);
   INSTKEYWORD(mul,   Mul);  INSTKEYWORD(fmul,   FMul);
@@ -902,6 +905,11 @@ lltok::Kind LLLexer::LexIdentifier() {
     return lltok::DIFlag;
   }
 
+  if (Keyword.startswith("DISPFlag")) {
+    StrVal.assign(Keyword.begin(), Keyword.end());
+    return lltok::DISPFlag;
+  }
+
   if (Keyword.startswith("CSK_")) {
     StrVal.assign(Keyword.begin(), Keyword.end());
     return lltok::ChecksumKind;
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 5fe1e125d48680db35067d3bf9e19275f65a991a..f887372060b5344a2ed11e442c788414573b2bee 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -3295,7 +3295,31 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     ID.Kind = ValID::t_Constant;
     return false;
   }
-
+ 
+  // Unary Operators.
+  case lltok::kw_fneg: {
+    unsigned Opc = Lex.getUIntVal();
+    Constant *Val;
+    Lex.Lex();
+    if (ParseToken(lltok::lparen, "expected '(' in unary constantexpr") ||
+        ParseGlobalTypeAndValue(Val) ||
+        ParseToken(lltok::rparen, "expected ')' in unary constantexpr"))
+      return true;
+    
+    // Check that the type is valid for the operator.
+    switch (Opc) {
+    case Instruction::FNeg:
+      if (!Val->getType()->isFPOrFPVectorTy())
+        return Error(ID.Loc, "constexpr requires fp operands");
+      break;
+    default: llvm_unreachable("Unknown unary operator!");
+    }
+    unsigned Flags = 0;
+    Constant *C = ConstantExpr::get(Opc, Val, Flags);
+    ID.ConstantVal = C;
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
   // Binary Operators.
   case lltok::kw_add:
   case lltok::kw_fadd:
@@ -3725,6 +3749,10 @@ struct DIFlagField : public MDFieldImpl<DINode::DIFlags> {
   DIFlagField() : MDFieldImpl(DINode::FlagZero) {}
 };
 
+struct DISPFlagField : public MDFieldImpl<DISubprogram::DISPFlags> {
+  DISPFlagField() : MDFieldImpl(DISubprogram::SPFlagZero) {}
+};
+
 struct MDSignedField : public MDFieldImpl<int64_t> {
   int64_t Min;
   int64_t Max;
@@ -4017,6 +4045,46 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DIFlagField &Result) {
   return false;
 }
 
+/// DISPFlagField
+///  ::= uint32
+///  ::= DISPFlagVector
+///  ::= DISPFlagVector '|' DISPFlag* '|' uint32
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DISPFlagField &Result) {
+
+  // Parser for a single flag.
+  auto parseFlag = [&](DISubprogram::DISPFlags &Val) {
+    if (Lex.getKind() == lltok::APSInt && !Lex.getAPSIntVal().isSigned()) {
+      uint32_t TempVal = static_cast<uint32_t>(Val);
+      bool Res = ParseUInt32(TempVal);
+      Val = static_cast<DISubprogram::DISPFlags>(TempVal);
+      return Res;
+    }
+
+    if (Lex.getKind() != lltok::DISPFlag)
+      return TokError("expected debug info flag");
+
+    Val = DISubprogram::getFlag(Lex.getStrVal());
+    if (!Val)
+      return TokError(Twine("invalid subprogram debug info flag '") +
+                      Lex.getStrVal() + "'");
+    Lex.Lex();
+    return false;
+  };
+
+  // Parse the flags and combine them together.
+  DISubprogram::DISPFlags Combined = DISubprogram::SPFlagZero;
+  do {
+    DISubprogram::DISPFlags Val;
+    if (parseFlag(Val))
+      return true;
+    Combined |= Val;
+  } while (EatIfPresent(lltok::bar));
+
+  Result.assign(Combined);
+  return false;
+}
+
 template <>
 bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
                             MDSignedField &Result) {
@@ -4473,7 +4541,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
   OPTIONAL(dwoId, MDUnsignedField, );                                          \
   OPTIONAL(splitDebugInlining, MDBoolField, = true);                           \
   OPTIONAL(debugInfoForProfiling, MDBoolField, = false);                       \
-  OPTIONAL(nameTableKind, NameTableKindField, );
+  OPTIONAL(nameTableKind, NameTableKindField, );                               \
+  OPTIONAL(debugBaseAddress, MDBoolField, = false);
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
@@ -4481,7 +4550,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
       Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val,
       runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val,
       retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val,
-      splitDebugInlining.Val, debugInfoForProfiling.Val, nameTableKind.Val);
+      splitDebugInlining.Val, debugInfoForProfiling.Val, nameTableKind.Val,
+      debugBaseAddress.Val);
   return false;
 }
 
@@ -4491,8 +4561,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
 ///                     isDefinition: true, scopeLine: 8, containingType: !3,
 ///                     virtuality: DW_VIRTUALTIY_pure_virtual,
 ///                     virtualIndex: 10, thisAdjustment: 4, flags: 11,
-///                     isOptimized: false, templateParams: !4, declaration: !5,
-///                     retainedNodes: !6, thrownTypes: !7)
+///                     spFlags: 10, isOptimized: false, templateParams: !4,
+///                     declaration: !5, retainedNodes: !6, thrownTypes: !7)
 bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
   auto Loc = Lex.getLoc();
 #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
@@ -4510,26 +4580,31 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
   OPTIONAL(virtualIndex, MDUnsignedField, (0, UINT32_MAX));                    \
   OPTIONAL(thisAdjustment, MDSignedField, (0, INT32_MIN, INT32_MAX));          \
   OPTIONAL(flags, DIFlagField, );                                              \
+  OPTIONAL(spFlags, DISPFlagField, );                                          \
   OPTIONAL(isOptimized, MDBoolField, );                                        \
   OPTIONAL(unit, MDField, );                                                   \
   OPTIONAL(templateParams, MDField, );                                         \
   OPTIONAL(declaration, MDField, );                                            \
-  OPTIONAL(retainedNodes, MDField, );                                              \
+  OPTIONAL(retainedNodes, MDField, );                                          \
   OPTIONAL(thrownTypes, MDField, );
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
-  if (isDefinition.Val && !IsDistinct)
+  // An explicit spFlags field takes precedence over individual fields in
+  // older IR versions.
+  DISubprogram::DISPFlags SPFlags =
+      spFlags.Seen ? spFlags.Val
+                   : DISubprogram::toSPFlags(isLocal.Val, isDefinition.Val,
+                                             isOptimized.Val, virtuality.Val);
+  if ((SPFlags & DISubprogram::SPFlagDefinition) && !IsDistinct)
     return Lex.Error(
         Loc,
-        "missing 'distinct', required for !DISubprogram when 'isDefinition'");
-
+        "missing 'distinct', required for !DISubprogram that is a Definition");
   Result = GET_OR_DISTINCT(
       DISubprogram,
       (Context, scope.Val, name.Val, linkageName.Val, file.Val, line.Val,
-       type.Val, isLocal.Val, isDefinition.Val, scopeLine.Val,
-       containingType.Val, virtuality.Val, virtualIndex.Val, thisAdjustment.Val,
-       flags.Val, isOptimized.Val, unit.Val, templateParams.Val,
+       type.Val, scopeLine.Val, containingType.Val, virtualIndex.Val,
+       thisAdjustment.Val, flags.Val, SPFlags, unit.Val, templateParams.Val,
        declaration.Val, retainedNodes.Val, thrownTypes.Val));
   return false;
 }
@@ -5492,6 +5567,16 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_catchswitch: return ParseCatchSwitch(Inst, PFS);
   case lltok::kw_catchpad:    return ParseCatchPad(Inst, PFS);
   case lltok::kw_cleanuppad:  return ParseCleanupPad(Inst, PFS);
+  // Unary Operators.
+  case lltok::kw_fneg: {
+    FastMathFlags FMF = EatFastMathFlagsIfPresent();
+    int Res = ParseUnaryOp(Inst, PFS, KeywordVal, 2);
+    if (Res != 0)
+      return Res;
+    if (FMF.any())
+      Inst->setFastMathFlags(FMF);
+    return false;
+  }
   // Binary Operators.
   case lltok::kw_add:
   case lltok::kw_sub:
@@ -6063,6 +6148,43 @@ bool LLParser::ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS) {
   return false;
 }
 
+//===----------------------------------------------------------------------===//
+// Unary Operators.
+//===----------------------------------------------------------------------===//
+
+/// ParseUnaryOp
+///  ::= UnaryOp TypeAndValue ',' Value
+///
+/// If OperandType is 0, then any FP or integer operand is allowed.  If it is 1,
+/// then any integer operand is allowed, if it is 2, any fp operand is allowed.
+bool LLParser::ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS,
+                            unsigned Opc, unsigned OperandType) {
+  LocTy Loc; Value *LHS;
+  if (ParseTypeAndValue(LHS, Loc, PFS))
+    return true;
+
+  bool Valid;
+  switch (OperandType) {
+  default: llvm_unreachable("Unknown operand type!");
+  case 0: // int or FP.
+    Valid = LHS->getType()->isIntOrIntVectorTy() ||
+            LHS->getType()->isFPOrFPVectorTy();
+    break;
+  case 1: 
+    Valid = LHS->getType()->isIntOrIntVectorTy(); 
+    break;
+  case 2: 
+    Valid = LHS->getType()->isFPOrFPVectorTy(); 
+    break;
+  }
+
+  if (!Valid)
+    return Error(Loc, "invalid operand type for instruction");
+
+  Inst = UnaryOperator::Create((Instruction::UnaryOps)Opc, LHS);
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Binary Operators.
 //===----------------------------------------------------------------------===//
@@ -7397,8 +7519,14 @@ bool LLParser::ParseArgs(std::vector<uint64_t> &Args) {
   return false;
 }
 
-static ValueInfo EmptyVI =
-    ValueInfo(false, (GlobalValueSummaryMapTy::value_type *)-8);
+auto FwdVIRef = (GlobalValueSummaryMapTy::value_type *)-8;
+
+static void resolveFwdRef(ValueInfo *Fwd, ValueInfo &Resolved) {
+  bool ReadOnly = Fwd->isReadOnly();
+  *Fwd = Resolved;
+  if (ReadOnly)
+    Fwd->setReadOnly();
+}
 
 /// Stores the given Name/GUID and associated summary into the Index.
 /// Also updates any forward references to the associated entry ID.
@@ -7434,9 +7562,9 @@ void LLParser::AddGlobalValueToIndex(
   auto FwdRefVIs = ForwardRefValueInfos.find(ID);
   if (FwdRefVIs != ForwardRefValueInfos.end()) {
     for (auto VIRef : FwdRefVIs->second) {
-      assert(*VIRef.first == EmptyVI &&
+      assert(VIRef.first->getRef() == FwdVIRef &&
              "Forward referenced ValueInfo expected to be empty");
-      *VIRef.first = VI;
+      resolveFwdRef(VIRef.first, VI);
     }
     ForwardRefValueInfos.erase(FwdRefVIs);
   }
@@ -7599,8 +7727,8 @@ bool LLParser::ParseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
     return true;
 
   auto FS = llvm::make_unique<FunctionSummary>(
-      GVFlags, InstCount, FFlags, std::move(Refs), std::move(Calls),
-      std::move(TypeIdInfo.TypeTests),
+      GVFlags, InstCount, FFlags, /*EntryCount=*/0, std::move(Refs),
+      std::move(Calls), std::move(TypeIdInfo.TypeTests),
       std::move(TypeIdInfo.TypeTestAssumeVCalls),
       std::move(TypeIdInfo.TypeCheckedLoadVCalls),
       std::move(TypeIdInfo.TypeTestAssumeConstVCalls),
@@ -7626,11 +7754,14 @@ bool LLParser::ParseVariableSummary(std::string Name, GlobalValue::GUID GUID,
   GlobalValueSummary::GVFlags GVFlags = GlobalValueSummary::GVFlags(
       /*Linkage=*/GlobalValue::ExternalLinkage, /*NotEligibleToImport=*/false,
       /*Live=*/false, /*IsLocal=*/false);
+  GlobalVarSummary::GVarFlags GVarFlags(/*ReadOnly*/ false);
   std::vector<ValueInfo> Refs;
   if (ParseToken(lltok::colon, "expected ':' here") ||
       ParseToken(lltok::lparen, "expected '(' here") ||
       ParseModuleReference(ModulePath) ||
-      ParseToken(lltok::comma, "expected ',' here") || ParseGVFlags(GVFlags))
+      ParseToken(lltok::comma, "expected ',' here") || ParseGVFlags(GVFlags) ||
+      ParseToken(lltok::comma, "expected ',' here") ||
+      ParseGVarFlags(GVarFlags))
     return true;
 
   // Parse optional refs field
@@ -7642,7 +7773,8 @@ bool LLParser::ParseVariableSummary(std::string Name, GlobalValue::GUID GUID,
   if (ParseToken(lltok::rparen, "expected ')' here"))
     return true;
 
-  auto GS = llvm::make_unique<GlobalVarSummary>(GVFlags, std::move(Refs));
+  auto GS =
+      llvm::make_unique<GlobalVarSummary>(GVFlags, GVarFlags, std::move(Refs));
 
   GS->setModulePath(ModulePath);
 
@@ -7687,7 +7819,7 @@ bool LLParser::ParseAliasSummary(std::string Name, GlobalValue::GUID GUID,
   AS->setModulePath(ModulePath);
 
   // Record forward reference if the aliasee is not parsed yet.
-  if (AliaseeVI == EmptyVI) {
+  if (AliaseeVI.getRef() == FwdVIRef) {
     auto FwdRef = ForwardRefAliasees.insert(
         std::make_pair(GVId, std::vector<std::pair<AliasSummary *, LocTy>>()));
     FwdRef.first->second.push_back(std::make_pair(AS.get(), Loc));
@@ -7809,7 +7941,7 @@ bool LLParser::ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls) {
     // Keep track of the Call array index needing a forward reference.
     // We will save the location of the ValueInfo needing an update, but
     // can only do so once the std::vector is finalized.
-    if (VI == EmptyVI)
+    if (VI.getRef() == FwdVIRef)
       IdToIndexMap[GVId].push_back(std::make_pair(Calls.size(), Loc));
     Calls.push_back(FunctionSummary::EdgeTy{VI, CalleeInfo(Hotness, RelBF)});
 
@@ -7821,7 +7953,7 @@ bool LLParser::ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls) {
   // of any forward GV references that need updating later.
   for (auto I : IdToIndexMap) {
     for (auto P : I.second) {
-      assert(Calls[P.first].first == EmptyVI &&
+      assert(Calls[P.first].first.getRef() == FwdVIRef &&
              "Forward referenced ValueInfo expected to be empty");
       auto FwdRef = ForwardRefValueInfos.insert(std::make_pair(
           I.first, std::vector<std::pair<ValueInfo *, LocTy>>()));
@@ -7872,28 +8004,42 @@ bool LLParser::ParseOptionalRefs(std::vector<ValueInfo> &Refs) {
       ParseToken(lltok::lparen, "expected '(' in refs"))
     return true;
 
-  IdToIndexMapType IdToIndexMap;
-  // Parse each ref edge
-  do {
+  struct ValueContext {
     ValueInfo VI;
-    LocTy Loc = Lex.getLoc();
     unsigned GVId;
-    if (ParseGVReference(VI, GVId))
+    LocTy Loc;
+  };
+  std::vector<ValueContext> VContexts;
+  // Parse each ref edge
+  do {
+    ValueContext VC;
+    VC.Loc = Lex.getLoc();
+    if (ParseGVReference(VC.VI, VC.GVId))
       return true;
+    VContexts.push_back(VC);
+  } while (EatIfPresent(lltok::comma));
+
+  // Sort value contexts so that ones with readonly ValueInfo are at the end
+  // of VContexts vector. This is needed to match immutableRefCount() behavior.
+  llvm::sort(VContexts, [](const ValueContext &VC1, const ValueContext &VC2) {
+    return VC1.VI.isReadOnly() < VC2.VI.isReadOnly();
+  });
 
+  IdToIndexMapType IdToIndexMap;
+  for (auto &VC : VContexts) {
     // Keep track of the Refs array index needing a forward reference.
     // We will save the location of the ValueInfo needing an update, but
     // can only do so once the std::vector is finalized.
-    if (VI == EmptyVI)
-      IdToIndexMap[GVId].push_back(std::make_pair(Refs.size(), Loc));
-    Refs.push_back(VI);
-  } while (EatIfPresent(lltok::comma));
+    if (VC.VI.getRef() == FwdVIRef)
+      IdToIndexMap[VC.GVId].push_back(std::make_pair(Refs.size(), VC.Loc));
+    Refs.push_back(VC.VI);
+  }
 
   // Now that the Refs vector is finalized, it is safe to save the locations
   // of any forward GV references that need updating later.
   for (auto I : IdToIndexMap) {
     for (auto P : I.second) {
-      assert(Refs[P.first] == EmptyVI &&
+      assert(Refs[P.first].getRef() == FwdVIRef &&
              "Forward referenced ValueInfo expected to be empty");
       auto FwdRef = ForwardRefValueInfos.insert(std::make_pair(
           I.first, std::vector<std::pair<ValueInfo *, LocTy>>()));
@@ -8179,6 +8325,27 @@ bool LLParser::ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags) {
   return false;
 }
 
+/// GVarFlags
+///   ::= 'varFlags' ':' '(' 'readonly' ':' Flag ')'
+bool LLParser::ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags) {
+  assert(Lex.getKind() == lltok::kw_varFlags);
+  Lex.Lex();
+
+  unsigned Flag;
+  if (ParseToken(lltok::colon, "expected ':' here") ||
+      ParseToken(lltok::lparen, "expected '(' here") ||
+      ParseToken(lltok::kw_readonly, "expected 'readonly' here") ||
+      ParseToken(lltok::colon, "expected ':' here"))
+    return true;
+
+  ParseFlag(Flag);
+  GVarFlags.ReadOnly = Flag;
+
+  if (ParseToken(lltok::rparen, "expected ')' here"))
+    return true;
+  return false;
+}
+
 /// ModuleReference
 ///   ::= 'module' ':' UInt
 bool LLParser::ParseModuleReference(StringRef &ModulePath) {
@@ -8199,18 +8366,20 @@ bool LLParser::ParseModuleReference(StringRef &ModulePath) {
 /// GVReference
 ///   ::= SummaryID
 bool LLParser::ParseGVReference(ValueInfo &VI, unsigned &GVId) {
+  bool ReadOnly = EatIfPresent(lltok::kw_readonly);
   if (ParseToken(lltok::SummaryID, "expected GV ID"))
     return true;
 
   GVId = Lex.getUIntVal();
-
   // Check if we already have a VI for this GV
   if (GVId < NumberedValueInfos.size()) {
-    assert(NumberedValueInfos[GVId] != EmptyVI);
+    assert(NumberedValueInfos[GVId].getRef() != FwdVIRef);
     VI = NumberedValueInfos[GVId];
   } else
     // We will create a forward reference to the stored location.
-    VI = EmptyVI;
+    VI = ValueInfo(false, FwdVIRef);
 
+  if (ReadOnly)
+    VI.setReadOnly();
   return false;
 }
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index cec1a8e8f7eb6a27df315fa564fa9896dca2c40b..5a0fc297265d4d1dc649bc31084de1f1dcbdd3f9 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -352,6 +352,7 @@ namespace llvm {
     bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID);
     bool ParseAliasSummary(std::string Name, GlobalValue::GUID, unsigned ID);
     bool ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags);
+    bool ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags);
     bool ParseOptionalFFlags(FunctionSummary::FFlags &FFlags);
     bool ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls);
     bool ParseHotness(CalleeInfo::HotnessType &Hotness);
@@ -571,6 +572,8 @@ namespace llvm {
     bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS);
     bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS);
 
+    bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc,
+                      unsigned OperandType);
     bool ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc,
                          unsigned OperandType);
     bool ParseLogical(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index f8f5955a16c89efd3281990f868f7e9ad847e41b..c2e2795a9467be6c4026aa5ea29091dd3c242163 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -270,6 +270,7 @@ enum Kind {
   kw_umin,
 
   // Instruction Opcodes (Opcode in UIntVal).
+  kw_fneg,
   kw_add,
   kw_fadd,
   kw_sub,
@@ -417,6 +418,7 @@ enum Kind {
   kw_info,
   kw_byte,
   kw_bit,
+  kw_varFlags,
 
   // Unsigned Valued tokens (UIntVal).
   GlobalID,   // @42
@@ -440,6 +442,7 @@ enum Kind {
   NameTableKind,    // GNU
   DwarfOp,          // DW_OP_foo
   DIFlag,           // DIFlagFoo
+  DISPFlag,         // DISPFlagFoo
   DwarfMacinfo,     // DW_MACINFO_foo
   ChecksumKind,     // CSK_foo
 
diff --git a/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b789f646b5f6f920880ff76c912d1f31c8730987
--- /dev/null
+++ b/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -0,0 +1,324 @@
+//===- AMDGPUMetadataVerifier.cpp - MsgPack Types ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Implements a verifier for AMDGPU HSA metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
+#include "llvm/Support/AMDGPUMetadata.h"
+
+namespace llvm {
+namespace AMDGPU {
+namespace HSAMD {
+namespace V3 {
+
+bool MetadataVerifier::verifyScalar(
+    msgpack::Node &Node, msgpack::ScalarNode::ScalarKind SKind,
+    function_ref<bool(msgpack::ScalarNode &)> verifyValue) {
+  auto ScalarPtr = dyn_cast<msgpack::ScalarNode>(&Node);
+  if (!ScalarPtr)
+    return false;
+  auto &Scalar = *ScalarPtr;
+  // Do not output extraneous tags for types we know from the spec.
+  Scalar.IgnoreTag = true;
+  if (Scalar.getScalarKind() != SKind) {
+    if (Strict)
+      return false;
+    // If we are not strict, we interpret string values as "implicitly typed"
+    // and attempt to coerce them to the expected type here.
+    if (Scalar.getScalarKind() != msgpack::ScalarNode::SK_String)
+      return false;
+    std::string StringValue = Scalar.getString();
+    Scalar.setScalarKind(SKind);
+    if (Scalar.inputYAML(StringValue) != StringRef())
+      return false;
+  }
+  if (verifyValue)
+    return verifyValue(Scalar);
+  return true;
+}
+
+bool MetadataVerifier::verifyInteger(msgpack::Node &Node) {
+  if (!verifyScalar(Node, msgpack::ScalarNode::SK_UInt))
+    if (!verifyScalar(Node, msgpack::ScalarNode::SK_Int))
+      return false;
+  return true;
+}
+
+bool MetadataVerifier::verifyArray(
+    msgpack::Node &Node, function_ref<bool(msgpack::Node &)> verifyNode,
+    Optional<size_t> Size) {
+  auto ArrayPtr = dyn_cast<msgpack::ArrayNode>(&Node);
+  if (!ArrayPtr)
+    return false;
+  auto &Array = *ArrayPtr;
+  if (Size && Array.size() != *Size)
+    return false;
+  for (auto &Item : Array)
+    if (!verifyNode(*Item.get()))
+      return false;
+
+  return true;
+}
+
+bool MetadataVerifier::verifyEntry(
+    msgpack::MapNode &MapNode, StringRef Key, bool Required,
+    function_ref<bool(msgpack::Node &)> verifyNode) {
+  auto Entry = MapNode.find(Key);
+  if (Entry == MapNode.end())
+    return !Required;
+  return verifyNode(*Entry->second.get());
+}
+
+bool MetadataVerifier::verifyScalarEntry(
+    msgpack::MapNode &MapNode, StringRef Key, bool Required,
+    msgpack::ScalarNode::ScalarKind SKind,
+    function_ref<bool(msgpack::ScalarNode &)> verifyValue) {
+  return verifyEntry(MapNode, Key, Required, [=](msgpack::Node &Node) {
+    return verifyScalar(Node, SKind, verifyValue);
+  });
+}
+
+bool MetadataVerifier::verifyIntegerEntry(msgpack::MapNode &MapNode,
+                                          StringRef Key, bool Required) {
+  return verifyEntry(MapNode, Key, Required, [this](msgpack::Node &Node) {
+    return verifyInteger(Node);
+  });
+}
+
+bool MetadataVerifier::verifyKernelArgs(msgpack::Node &Node) {
+  auto ArgsMapPtr = dyn_cast<msgpack::MapNode>(&Node);
+  if (!ArgsMapPtr)
+    return false;
+  auto &ArgsMap = *ArgsMapPtr;
+
+  if (!verifyScalarEntry(ArgsMap, ".name", false,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".type_name", false,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyIntegerEntry(ArgsMap, ".size", true))
+    return false;
+  if (!verifyIntegerEntry(ArgsMap, ".offset", true))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".value_kind", true,
+                         msgpack::ScalarNode::SK_String,
+                         [](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case("by_value", true)
+                               .Case("global_buffer", true)
+                               .Case("dynamic_shared_pointer", true)
+                               .Case("sampler", true)
+                               .Case("image", true)
+                               .Case("pipe", true)
+                               .Case("queue", true)
+                               .Case("hidden_global_offset_x", true)
+                               .Case("hidden_global_offset_y", true)
+                               .Case("hidden_global_offset_z", true)
+                               .Case("hidden_none", true)
+                               .Case("hidden_printf_buffer", true)
+                               .Case("hidden_default_queue", true)
+                               .Case("hidden_completion_action", true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".value_type", true,
+                         msgpack::ScalarNode::SK_String,
+                         [](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case("struct", true)
+                               .Case("i8", true)
+                               .Case("u8", true)
+                               .Case("i16", true)
+                               .Case("u16", true)
+                               .Case("f16", true)
+                               .Case("i32", true)
+                               .Case("u32", true)
+                               .Case("f32", true)
+                               .Case("i64", true)
+                               .Case("u64", true)
+                               .Case("f64", true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyIntegerEntry(ArgsMap, ".pointee_align", false))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".address_space", false,
+                         msgpack::ScalarNode::SK_String,
+                         [](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case("private", true)
+                               .Case("global", true)
+                               .Case("constant", true)
+                               .Case("local", true)
+                               .Case("generic", true)
+                               .Case("region", true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".access", false,
+                         msgpack::ScalarNode::SK_String,
+                         [](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case("read_only", true)
+                               .Case("write_only", true)
+                               .Case("read_write", true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".actual_access", false,
+                         msgpack::ScalarNode::SK_String,
+                         [](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case("read_only", true)
+                               .Case("write_only", true)
+                               .Case("read_write", true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".is_const", false,
+                         msgpack::ScalarNode::SK_Boolean))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".is_restrict", false,
+                         msgpack::ScalarNode::SK_Boolean))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".is_volatile", false,
+                         msgpack::ScalarNode::SK_Boolean))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, ".is_pipe", false,
+                         msgpack::ScalarNode::SK_Boolean))
+    return false;
+
+  return true;
+}
+
+bool MetadataVerifier::verifyKernel(msgpack::Node &Node) {
+  auto KernelMapPtr = dyn_cast<msgpack::MapNode>(&Node);
+  if (!KernelMapPtr)
+    return false;
+  auto &KernelMap = *KernelMapPtr;
+
+  if (!verifyScalarEntry(KernelMap, ".name", true,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyScalarEntry(KernelMap, ".symbol", true,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyScalarEntry(KernelMap, ".language", false,
+                         msgpack::ScalarNode::SK_String,
+                         [](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case("OpenCL C", true)
+                               .Case("OpenCL C++", true)
+                               .Case("HCC", true)
+                               .Case("HIP", true)
+                               .Case("OpenMP", true)
+                               .Case("Assembler", true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyEntry(
+          KernelMap, ".language_version", false, [this](msgpack::Node &Node) {
+            return verifyArray(
+                Node,
+                [this](msgpack::Node &Node) { return verifyInteger(Node); }, 2);
+          }))
+    return false;
+  if (!verifyEntry(KernelMap, ".args", false, [this](msgpack::Node &Node) {
+        return verifyArray(Node, [this](msgpack::Node &Node) {
+          return verifyKernelArgs(Node);
+        });
+      }))
+    return false;
+  if (!verifyEntry(KernelMap, ".reqd_workgroup_size", false,
+                   [this](msgpack::Node &Node) {
+                     return verifyArray(Node,
+                                        [this](msgpack::Node &Node) {
+                                          return verifyInteger(Node);
+                                        },
+                                        3);
+                   }))
+    return false;
+  if (!verifyEntry(KernelMap, ".workgroup_size_hint", false,
+                   [this](msgpack::Node &Node) {
+                     return verifyArray(Node,
+                                        [this](msgpack::Node &Node) {
+                                          return verifyInteger(Node);
+                                        },
+                                        3);
+                   }))
+    return false;
+  if (!verifyScalarEntry(KernelMap, ".vec_type_hint", false,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyScalarEntry(KernelMap, ".device_enqueue_symbol", false,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".kernarg_segment_size", true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".group_segment_fixed_size", true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".private_segment_fixed_size", true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".kernarg_segment_align", true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".wavefront_size", true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".sgpr_count", true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".vgpr_count", true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".max_flat_workgroup_size", true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".sgpr_spill_count", false))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, ".vgpr_spill_count", false))
+    return false;
+
+  return true;
+}
+
+bool MetadataVerifier::verify(msgpack::Node &HSAMetadataRoot) {
+  auto RootMapPtr = dyn_cast<msgpack::MapNode>(&HSAMetadataRoot);
+  if (!RootMapPtr)
+    return false;
+  auto &RootMap = *RootMapPtr;
+
+  if (!verifyEntry(
+          RootMap, "amdhsa.version", true, [this](msgpack::Node &Node) {
+            return verifyArray(
+                Node,
+                [this](msgpack::Node &Node) { return verifyInteger(Node); }, 2);
+          }))
+    return false;
+  if (!verifyEntry(
+          RootMap, "amdhsa.printf", false, [this](msgpack::Node &Node) {
+            return verifyArray(Node, [this](msgpack::Node &Node) {
+              return verifyScalar(Node, msgpack::ScalarNode::SK_String);
+            });
+          }))
+    return false;
+  if (!verifyEntry(RootMap, "amdhsa.kernels", true,
+                   [this](msgpack::Node &Node) {
+                     return verifyArray(Node, [this](msgpack::Node &Node) {
+                       return verifyKernel(Node);
+                     });
+                   }))
+    return false;
+
+  return true;
+}
+
+} // end namespace V3
+} // end namespace HSAMD
+} // end namespace AMDGPU
+} // end namespace llvm
diff --git a/lib/BinaryFormat/CMakeLists.txt b/lib/BinaryFormat/CMakeLists.txt
index 93bb8b07763f0541310b89585569df2e7738ac5d..d645279d0ac59e547acd7ea5f9a82be0502f997b 100644
--- a/lib/BinaryFormat/CMakeLists.txt
+++ b/lib/BinaryFormat/CMakeLists.txt
@@ -1,7 +1,9 @@
 add_llvm_library(LLVMBinaryFormat
+  AMDGPUMetadataVerifier.cpp
   Dwarf.cpp
   Magic.cpp
   MsgPackReader.cpp
+  MsgPackTypes.cpp
   MsgPackWriter.cpp
   Wasm.cpp
 
diff --git a/lib/BinaryFormat/Dwarf.cpp b/lib/BinaryFormat/Dwarf.cpp
index 5984de73ae633d94a8b1646db850c6c300ee1a3a..fe8ce2bd7aa36558d586415516f86abc7066517a 100644
--- a/lib/BinaryFormat/Dwarf.cpp
+++ b/lib/BinaryFormat/Dwarf.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
@@ -455,14 +456,32 @@ StringRef llvm::dwarf::RangeListEncodingString(unsigned Encoding) {
   }
 }
 
-StringRef llvm::dwarf::CallFrameString(unsigned Encoding) {
+StringRef llvm::dwarf::CallFrameString(unsigned Encoding,
+    Triple::ArchType Arch) {
+  assert(Arch != llvm::Triple::ArchType::UnknownArch);
+#define SELECT_AARCH64 (Arch == llvm::Triple::aarch64_be || Arch == llvm::Triple::aarch64)
+#define SELECT_MIPS64 Arch == llvm::Triple::mips64
+#define SELECT_SPARC (Arch == llvm::Triple::sparc || Arch == llvm::Triple::sparcv9)
+#define SELECT_X86 (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64)
+#define HANDLE_DW_CFA(ID, NAME)
+#define HANDLE_DW_CFA_PRED(ID, NAME, PRED) \
+  if (ID == Encoding && PRED) \
+    return "DW_CFA_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+
   switch (Encoding) {
   default:
     return StringRef();
+#define HANDLE_DW_CFA_PRED(ID, NAME, PRED)
 #define HANDLE_DW_CFA(ID, NAME)                                                \
   case DW_CFA_##NAME:                                                          \
     return "DW_CFA_" #NAME;
 #include "llvm/BinaryFormat/Dwarf.def"
+
+#undef SELECT_X86
+#undef SELECT_SPARC
+#undef SELECT_MIPS64
+#undef SELECT_AARCH64
   }
 }
 
diff --git a/lib/BinaryFormat/Magic.cpp b/lib/BinaryFormat/Magic.cpp
index 5a339583fca1a78f8c2dcd00db214ce607688b20..78efa6ec87be8c614d22a565c47eae296466e31b 100644
--- a/lib/BinaryFormat/Magic.cpp
+++ b/lib/BinaryFormat/Magic.cpp
@@ -206,7 +206,7 @@ file_magic llvm::identify_magic(StringRef Magic) {
 }
 
 std::error_code llvm::identify_magic(const Twine &Path, file_magic &Result) {
-  auto FileOrError = MemoryBuffer::getFile(Path);
+  auto FileOrError = MemoryBuffer::getFile(Path, -1LL, false);
   if (!FileOrError)
     return FileOrError.getError();
 
diff --git a/lib/BinaryFormat/MsgPackTypes.cpp b/lib/BinaryFormat/MsgPackTypes.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a8f70b10fb83e4c32cc01996fe1f2fc080dbf31
--- /dev/null
+++ b/lib/BinaryFormat/MsgPackTypes.cpp
@@ -0,0 +1,303 @@
+//===- MsgPackTypes.cpp - MsgPack Types -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Implementation of types representing MessagePack "documents".
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/MsgPackTypes.h"
+#include "llvm/Support/Error.h"
+
+using namespace llvm;
+using namespace msgpack;
+
+namespace llvm {
+namespace msgpack {
+void ScalarNode::anchor() {}
+void ArrayNode::anchor() {}
+void MapNode::anchor() {}
+}
+}
+
+Expected<OptNodePtr> Node::readArray(Reader &MPReader, size_t Length) {
+  auto A = std::make_shared<ArrayNode>();
+  for (size_t I = 0; I < Length; ++I) {
+    auto OptNodeOrErr = Node::read(MPReader);
+    if (auto Err = OptNodeOrErr.takeError())
+      return std::move(Err);
+    if (!*OptNodeOrErr)
+      return make_error<StringError>(
+          "Insufficient array elements",
+          std::make_error_code(std::errc::invalid_argument));
+    A->push_back(std::move(**OptNodeOrErr));
+  }
+  return OptNodePtr(std::move(A));
+}
+
+Expected<OptNodePtr> Node::readMap(Reader &MPReader, size_t Length) {
+  auto M = std::make_shared<MapNode>();
+  for (size_t I = 0; I < Length; ++I) {
+    auto OptKeyOrErr = Node::read(MPReader);
+    if (auto Err = OptKeyOrErr.takeError())
+      return std::move(Err);
+    if (!*OptKeyOrErr)
+      return make_error<StringError>(
+          "Insufficient map elements",
+          std::make_error_code(std::errc::invalid_argument));
+    auto OptValOrErr = Node::read(MPReader);
+    if (auto Err = OptValOrErr.takeError())
+      return std::move(Err);
+    if (!*OptValOrErr)
+      return make_error<StringError>(
+          "Insufficient map elements",
+          std::make_error_code(std::errc::invalid_argument));
+    auto *Key = dyn_cast<ScalarNode>((*OptKeyOrErr)->get());
+    if (!Key)
+      return make_error<StringError>(
+          "Only string map keys are supported",
+          std::make_error_code(std::errc::invalid_argument));
+    if (Key->getScalarKind() != ScalarNode::SK_String)
+      return make_error<StringError>(
+          "Only string map keys are supported",
+          std::make_error_code(std::errc::invalid_argument));
+    M->try_emplace(Key->getString(), std::move(**OptValOrErr));
+  }
+  return OptNodePtr(std::move(M));
+}
+
+Expected<OptNodePtr> Node::read(Reader &MPReader) {
+  Object Obj;
+
+  auto ContinueOrErr = MPReader.read(Obj);
+  if (auto Err = ContinueOrErr.takeError())
+    return std::move(Err);
+  if (!*ContinueOrErr)
+    return None;
+
+  switch (Obj.Kind) {
+  case Type::Int:
+    return OptNodePtr(std::make_shared<ScalarNode>(Obj.Int));
+  case Type::UInt:
+    return OptNodePtr(std::make_shared<ScalarNode>(Obj.UInt));
+  case Type::Nil:
+    return OptNodePtr(std::make_shared<ScalarNode>());
+  case Type::Boolean:
+    return OptNodePtr(std::make_shared<ScalarNode>(Obj.Bool));
+  case Type::Float:
+    return OptNodePtr(std::make_shared<ScalarNode>(Obj.Float));
+  case Type::String:
+    return OptNodePtr(std::make_shared<ScalarNode>(Obj.Raw));
+  case Type::Binary:
+    return OptNodePtr(std::make_shared<ScalarNode>(Obj.Raw));
+  case Type::Array:
+    return Node::readArray(MPReader, Obj.Length);
+  case Type::Map:
+    return Node::readMap(MPReader, Obj.Length);
+  case Type::Extension:
+    return make_error<StringError>(
+        "Extension types are not supported",
+        std::make_error_code(std::errc::invalid_argument));
+  }
+  llvm_unreachable("msgpack::Type not handled");
+}
+
+void ScalarNode::destroy() {
+  switch (SKind) {
+  case SK_String:
+  case SK_Binary:
+    StringValue.~basic_string();
+    break;
+  default:
+    // POD types do not require destruction
+    break;
+  }
+}
+
+ScalarNode::ScalarNode(int64_t IntValue)
+    : Node(NK_Scalar), SKind(SK_Int), IntValue(IntValue) {}
+
+ScalarNode::ScalarNode(int32_t IntValue)
+    : ScalarNode(static_cast<int64_t>(IntValue)) {}
+
+ScalarNode::ScalarNode(uint64_t UIntValue)
+    : Node(NK_Scalar), SKind(SK_UInt), UIntValue(UIntValue) {}
+
+ScalarNode::ScalarNode(uint32_t IntValue)
+    : ScalarNode(static_cast<uint64_t>(IntValue)) {}
+
+ScalarNode::ScalarNode() : Node(NK_Scalar), SKind(SK_Nil) {}
+
+ScalarNode::ScalarNode(bool BoolValue)
+    : Node(NK_Scalar), SKind(SK_Boolean), BoolValue(BoolValue) {}
+
+ScalarNode::ScalarNode(double FloatValue)
+    : Node(NK_Scalar), SKind(SK_Float), BoolValue(FloatValue) {}
+
+ScalarNode::ScalarNode(StringRef StringValue)
+    : Node(NK_Scalar), SKind(SK_String) {
+  new (&this->StringValue) std::string(StringValue);
+}
+
+ScalarNode::ScalarNode(const char *StringValue)
+    : ScalarNode(StringRef(StringValue)) {}
+
+ScalarNode::ScalarNode(std::string &&StringValue)
+    : Node(NK_Scalar), SKind(SK_String) {
+  new (&this->StringValue) std::string(StringValue);
+}
+
+ScalarNode::ScalarNode(MemoryBufferRef BinaryValue)
+    : Node(NK_Scalar), SKind(SK_Binary) {
+  new (&StringValue) std::string(BinaryValue.getBuffer());
+}
+
+ScalarNode::~ScalarNode() { destroy(); }
+
+ScalarNode &ScalarNode::operator=(ScalarNode &&RHS) {
+  destroy();
+  switch (SKind = RHS.SKind) {
+  case SK_Int:
+    IntValue = RHS.IntValue;
+    break;
+  case SK_UInt:
+    UIntValue = RHS.UIntValue;
+    break;
+  case SK_Boolean:
+    BoolValue = RHS.BoolValue;
+    break;
+  case SK_Float:
+    FloatValue = RHS.FloatValue;
+    break;
+  case SK_String:
+  case SK_Binary:
+    new (&StringValue) std::string(std::move(RHS.StringValue));
+    break;
+  case SK_Nil:
+    // pass
+    break;
+  }
+  return *this;
+}
+
+StringRef ScalarNode::inputYAML(StringRef ScalarStr) {
+  switch (SKind) {
+  case SK_Int:
+    return yaml::ScalarTraits<int64_t>::input(ScalarStr, nullptr, IntValue);
+  case SK_UInt:
+    return yaml::ScalarTraits<uint64_t>::input(ScalarStr, nullptr, UIntValue);
+  case SK_Nil:
+    return StringRef();
+  case SK_Boolean:
+    return yaml::ScalarTraits<bool>::input(ScalarStr, nullptr, BoolValue);
+  case SK_Float:
+    return yaml::ScalarTraits<double>::input(ScalarStr, nullptr, FloatValue);
+  case SK_Binary:
+  case SK_String:
+    return yaml::ScalarTraits<std::string>::input(ScalarStr, nullptr,
+                                                  StringValue);
+  }
+  llvm_unreachable("unrecognized ScalarKind");
+}
+
+void ScalarNode::outputYAML(raw_ostream &OS) const {
+  switch (SKind) {
+  case SK_Int:
+    yaml::ScalarTraits<int64_t>::output(IntValue, nullptr, OS);
+    break;
+  case SK_UInt:
+    yaml::ScalarTraits<uint64_t>::output(UIntValue, nullptr, OS);
+    break;
+  case SK_Nil:
+    yaml::ScalarTraits<StringRef>::output("", nullptr, OS);
+    break;
+  case SK_Boolean:
+    yaml::ScalarTraits<bool>::output(BoolValue, nullptr, OS);
+    break;
+  case SK_Float:
+    yaml::ScalarTraits<double>::output(FloatValue, nullptr, OS);
+    break;
+  case SK_Binary:
+  case SK_String:
+    yaml::ScalarTraits<std::string>::output(StringValue, nullptr, OS);
+    break;
+  }
+}
+
+yaml::QuotingType ScalarNode::mustQuoteYAML(StringRef ScalarStr) const {
+  switch (SKind) {
+  case SK_Int:
+    return yaml::ScalarTraits<int64_t>::mustQuote(ScalarStr);
+  case SK_UInt:
+    return yaml::ScalarTraits<uint64_t>::mustQuote(ScalarStr);
+  case SK_Nil:
+    return yaml::ScalarTraits<StringRef>::mustQuote(ScalarStr);
+  case SK_Boolean:
+    return yaml::ScalarTraits<bool>::mustQuote(ScalarStr);
+  case SK_Float:
+    return yaml::ScalarTraits<double>::mustQuote(ScalarStr);
+  case SK_Binary:
+  case SK_String:
+    return yaml::ScalarTraits<std::string>::mustQuote(ScalarStr);
+  }
+  llvm_unreachable("unrecognized ScalarKind");
+}
+
+const char *ScalarNode::IntTag = "!int";
+const char *ScalarNode::NilTag = "!nil";
+const char *ScalarNode::BooleanTag = "!bool";
+const char *ScalarNode::FloatTag = "!float";
+const char *ScalarNode::StringTag = "!str";
+const char *ScalarNode::BinaryTag = "!bin";
+
+StringRef ScalarNode::getYAMLTag() const {
+  switch (SKind) {
+  case SK_Int:
+    return IntTag;
+  case SK_UInt:
+    return IntTag;
+  case SK_Nil:
+    return NilTag;
+  case SK_Boolean:
+    return BooleanTag;
+  case SK_Float:
+    return FloatTag;
+  case SK_String:
+    return StringTag;
+  case SK_Binary:
+    return BinaryTag;
+  }
+  llvm_unreachable("unrecognized ScalarKind");
+}
+
+void ScalarNode::write(Writer &MPWriter) {
+  switch (SKind) {
+  case SK_Int:
+    MPWriter.write(IntValue);
+    break;
+  case SK_UInt:
+    MPWriter.write(UIntValue);
+    break;
+  case SK_Nil:
+    MPWriter.writeNil();
+    break;
+  case SK_Boolean:
+    MPWriter.write(BoolValue);
+    break;
+  case SK_Float:
+    MPWriter.write(FloatValue);
+    break;
+  case SK_String:
+    MPWriter.write(StringValue);
+    break;
+  case SK_Binary:
+    MPWriter.write(MemoryBufferRef(StringValue, ""));
+    break;
+  }
+}
diff --git a/lib/BinaryFormat/Wasm.cpp b/lib/BinaryFormat/Wasm.cpp
index ba8de34a6d22efef1c0ee838c0c362c99da9ad13..94d40bf02a390c0438d7ec2e1f72375c91964a6e 100644
--- a/lib/BinaryFormat/Wasm.cpp
+++ b/lib/BinaryFormat/Wasm.cpp
@@ -19,6 +19,8 @@ std::string llvm::wasm::toString(wasm::WasmSymbolType type) {
     return "WASM_SYMBOL_TYPE_DATA";
   case wasm::WASM_SYMBOL_TYPE_SECTION:
     return "WASM_SYMBOL_TYPE_SECTION";
+  case wasm::WASM_SYMBOL_TYPE_EVENT:
+    return "WASM_SYMBOL_TYPE_EVENT";
   }
   llvm_unreachable("unknown symbol type");
 }
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 56e05f8f085564153c86aa9c93dfb91eb5ca3ddb..846ce3a4f7aa72042cf5d80022f9b626c7ab074a 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -898,6 +898,11 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
   return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local);
 }
 
+// Decode the flags for GlobalVariable in the summary
+static GlobalVarSummary::GVarFlags getDecodedGVarFlags(uint64_t RawFlags) {
+  return GlobalVarSummary::GVarFlags((RawFlags & 0x1) ? true : false);
+}
+
 static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
   switch (Val) {
   default: // Map unknown visibilities to default.
@@ -964,6 +969,20 @@ static int getDecodedCastOpcode(unsigned Val) {
   }
 }
 
+static int getDecodedUnaryOpcode(unsigned Val, Type *Ty) {
+  bool IsFP = Ty->isFPOrFPVectorTy();
+  // UnOps are only valid for int/fp or vector of int/fp types
+  if (!IsFP && !Ty->isIntOrIntVectorTy())
+    return -1;
+
+  switch (Val) {
+  default:
+    return -1;
+  case bitc::UNOP_NEG:
+    return IsFP ? Instruction::FNeg : -1;
+  }
+}
+
 static int getDecodedBinaryOpcode(unsigned Val, Type *Ty) {
   bool IsFP = Ty->isFPOrFPVectorTy();
   // BinOps are only valid for int/fp or vector of int/fp types
@@ -2317,6 +2336,19 @@ Error BitcodeReader::parseConstants() {
       }
       break;
     }
+    case bitc::CST_CODE_CE_UNOP: {  // CE_UNOP: [opcode, opval]
+      if (Record.size() < 2)
+        return error("Invalid record");
+      int Opc = getDecodedUnaryOpcode(Record[0], CurTy);
+      if (Opc < 0) {
+        V = UndefValue::get(CurTy);  // Unknown unop.
+      } else {
+        Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy);
+        unsigned Flags = 0;
+        V = ConstantExpr::get(Opc, LHS, Flags);
+      }
+      break;
+    }
     case bitc::CST_CODE_CE_BINOP: {  // CE_BINOP: [opcode, opval, opval]
       if (Record.size() < 3)
         return error("Invalid record");
@@ -3521,12 +3553,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
 
       MDNode *Scope = nullptr, *IA = nullptr;
       if (ScopeID) {
-        Scope = MDLoader->getMDNodeFwdRefOrNull(ScopeID - 1);
+        Scope = dyn_cast_or_null<MDNode>(
+            MDLoader->getMetadataFwdRefOrLoad(ScopeID - 1));
         if (!Scope)
           return error("Invalid record");
       }
       if (IAID) {
-        IA = MDLoader->getMDNodeFwdRefOrNull(IAID - 1);
+        IA = dyn_cast_or_null<MDNode>(
+            MDLoader->getMetadataFwdRefOrLoad(IAID - 1));
         if (!IA)
           return error("Invalid record");
       }
@@ -3535,7 +3569,27 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       I = nullptr;
       continue;
     }
+    case bitc::FUNC_CODE_INST_UNOP: {    // UNOP: [opval, ty, opcode]
+      unsigned OpNum = 0;
+      Value *LHS;
+      if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
+          OpNum+1 > Record.size())
+        return error("Invalid record");
 
+      int Opc = getDecodedUnaryOpcode(Record[OpNum++], LHS->getType());
+      if (Opc == -1)
+        return error("Invalid record");
+      I = UnaryOperator::Create((Instruction::UnaryOps)Opc, LHS);
+      InstructionList.push_back(I);
+      if (OpNum < Record.size()) {
+        if (isa<FPMathOperator>(I)) {
+          FastMathFlags FMF = getDecodedFastMathFlags(Record[OpNum]);
+          if (FMF.any())
+            I->setFastMathFlags(FMF);
+        }
+      }
+      break;
+    }
     case bitc::FUNC_CODE_INST_BINOP: {    // BINOP: [opval, ty, opval, opcode]
       unsigned OpNum = 0;
       Value *LHS, *RHS;
@@ -4864,7 +4918,7 @@ void ModuleSummaryIndexBitcodeReader::setValueGUID(
   ValueIdToValueInfoMap[ValueID] = std::make_pair(
       TheIndex.getOrInsertValueInfo(
           ValueGUID,
-          UseStrtab ? ValueName : TheIndex.saveString(ValueName.str())),
+          UseStrtab ? ValueName : TheIndex.saveString(ValueName)),
       OriginalNameID);
 }
 
@@ -5170,6 +5224,12 @@ static void parseTypeIdSummaryRecord(ArrayRef<uint64_t> Record,
     parseWholeProgramDevirtResolution(Record, Strtab, Slot, TypeId);
 }
 
+static void setImmutableRefs(std::vector<ValueInfo> &Refs, unsigned Count) {
+  // Read-only refs are in the end of the refs list.
+  for (unsigned RefNo = Refs.size() - Count; RefNo < Refs.size(); ++RefNo)
+    Refs[RefNo].setReadOnly();
+}
+
 // Eagerly parse the entire summary block. This populates the GlobalValueSummary
 // objects in the index.
 Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
@@ -5187,9 +5247,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
   }
   const uint64_t Version = Record[0];
   const bool IsOldProfileFormat = Version == 1;
-  if (Version < 1 || Version > 4)
+  if (Version < 1 || Version > 6)
     return error("Invalid summary version " + Twine(Version) +
-                 ", 1, 2, 3 or 4 expected");
+                 ". Version should be in the range [1-6].");
   Record.clear();
 
   // Keep around the last seen summary to be used when we see an optional
@@ -5243,6 +5303,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       // 1 bit: SkipModuleByDistributedBackend flag.
       if (Flags & 0x2)
         TheIndex.setSkipModuleByDistributedBackend();
+      // 1 bit: HasSyntheticEntryCounts flag.
+      if (Flags & 0x4)
+        TheIndex.setHasSyntheticEntryCounts();
       break;
     }
     case bitc::FS_VALUE_GUID: { // [valueid, refguid]
@@ -5268,11 +5331,16 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       unsigned InstCount = Record[2];
       uint64_t RawFunFlags = 0;
       unsigned NumRefs = Record[3];
+      unsigned NumImmutableRefs = 0;
       int RefListStartIndex = 4;
       if (Version >= 4) {
         RawFunFlags = Record[3];
         NumRefs = Record[4];
         RefListStartIndex = 5;
+        if (Version >= 5) {
+          NumImmutableRefs = Record[5];
+          RefListStartIndex = 6;
+        }
       }
 
       auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
@@ -5291,9 +5359,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       std::vector<FunctionSummary::EdgeTy> Calls = makeCallList(
           ArrayRef<uint64_t>(Record).slice(CallGraphEdgeStartIndex),
           IsOldProfileFormat, HasProfile, HasRelBF);
+      setImmutableRefs(Refs, NumImmutableRefs);
       auto FS = llvm::make_unique<FunctionSummary>(
-          Flags, InstCount, getDecodedFFlags(RawFunFlags), std::move(Refs),
-          std::move(Calls), std::move(PendingTypeTests),
+          Flags, InstCount, getDecodedFFlags(RawFunFlags), /*EntryCount=*/0,
+          std::move(Refs), std::move(Calls), std::move(PendingTypeTests),
           std::move(PendingTypeTestAssumeVCalls),
           std::move(PendingTypeCheckedLoadVCalls),
           std::move(PendingTypeTestAssumeConstVCalls),
@@ -5339,14 +5408,21 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       TheIndex.addGlobalValueSummary(GUID.first, std::move(AS));
       break;
     }
-    // FS_PERMODULE_GLOBALVAR_INIT_REFS: [valueid, flags, n x valueid]
+    // FS_PERMODULE_GLOBALVAR_INIT_REFS: [valueid, flags, varflags, n x valueid]
     case bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS: {
       unsigned ValueID = Record[0];
       uint64_t RawFlags = Record[1];
+      unsigned RefArrayStart = 2;
+      GlobalVarSummary::GVarFlags GVF;
       auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
+      if (Version >= 5) {
+        GVF = getDecodedGVarFlags(Record[2]);
+        RefArrayStart = 3;
+      }
       std::vector<ValueInfo> Refs =
-          makeRefList(ArrayRef<uint64_t>(Record).slice(2));
-      auto FS = llvm::make_unique<GlobalVarSummary>(Flags, std::move(Refs));
+          makeRefList(ArrayRef<uint64_t>(Record).slice(RefArrayStart));
+      auto FS =
+          llvm::make_unique<GlobalVarSummary>(Flags, GVF, std::move(Refs));
       FS->setModulePath(getThisModule()->first());
       auto GUID = getValueInfoFromValueId(ValueID);
       FS->setOriginalName(GUID.second);
@@ -5364,13 +5440,25 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       uint64_t RawFlags = Record[2];
       unsigned InstCount = Record[3];
       uint64_t RawFunFlags = 0;
+      uint64_t EntryCount = 0;
       unsigned NumRefs = Record[4];
+      unsigned NumImmutableRefs = 0;
       int RefListStartIndex = 5;
 
       if (Version >= 4) {
         RawFunFlags = Record[4];
-        NumRefs = Record[5];
         RefListStartIndex = 6;
+        size_t NumRefsIndex = 5;
+        if (Version >= 5) {
+          RefListStartIndex = 7;
+          if (Version >= 6) {
+            NumRefsIndex = 6;
+            EntryCount = Record[5];
+            RefListStartIndex = 8;
+          }
+          NumImmutableRefs = Record[RefListStartIndex - 1];
+        }
+        NumRefs = Record[NumRefsIndex];
       }
 
       auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
@@ -5384,9 +5472,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
           ArrayRef<uint64_t>(Record).slice(CallGraphEdgeStartIndex),
           IsOldProfileFormat, HasProfile, false);
       ValueInfo VI = getValueInfoFromValueId(ValueID).first;
+      setImmutableRefs(Refs, NumImmutableRefs);
       auto FS = llvm::make_unique<FunctionSummary>(
-          Flags, InstCount, getDecodedFFlags(RawFunFlags), std::move(Refs),
-          std::move(Edges), std::move(PendingTypeTests),
+          Flags, InstCount, getDecodedFFlags(RawFunFlags), EntryCount,
+          std::move(Refs), std::move(Edges), std::move(PendingTypeTests),
           std::move(PendingTypeTestAssumeVCalls),
           std::move(PendingTypeCheckedLoadVCalls),
           std::move(PendingTypeTestAssumeConstVCalls),
@@ -5432,10 +5521,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       unsigned ValueID = Record[0];
       uint64_t ModuleId = Record[1];
       uint64_t RawFlags = Record[2];
+      unsigned RefArrayStart = 3;
+      GlobalVarSummary::GVarFlags GVF;
       auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
+      if (Version >= 5) {
+        GVF = getDecodedGVarFlags(Record[3]);
+        RefArrayStart = 4;
+      }
       std::vector<ValueInfo> Refs =
-          makeRefList(ArrayRef<uint64_t>(Record).slice(3));
-      auto FS = llvm::make_unique<GlobalVarSummary>(Flags, std::move(Refs));
+          makeRefList(ArrayRef<uint64_t>(Record).slice(RefArrayStart));
+      auto FS =
+          llvm::make_unique<GlobalVarSummary>(Flags, GVF, std::move(Refs));
       LastSeenSummary = FS.get();
       FS->setModulePath(ModuleIdMap[ModuleId]);
       ValueInfo VI = getValueInfoFromValueId(ValueID).first;
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index e77965be216a1aff6bd17c380b634092b9d852b2..3289aa0acddd9bc7ce9136df03496a7c74dec803 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -650,10 +650,6 @@ public:
     return MetadataList.getMetadataFwdRef(ID);
   }
 
-  MDNode *getMDNodeFwdRefOrNull(unsigned Idx) {
-    return MetadataList.getMDNodeFwdRefOrNull(Idx);
-  }
-
   DISubprogram *lookupSubprogramForFunction(Function *F) {
     return FunctionsWithSPs.lookup(F);
   }
@@ -772,7 +768,7 @@ MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
           // It is acknowledged by 'TODO: Inherit from Metadata' in the
           // NamedMDNode class definition.
           MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]);
-          assert(MD && "Invalid record");
+          assert(MD && "Invalid metadata: expect fwd ref to MDNode");
           NMD->addOperand(MD);
         }
         break;
@@ -1049,7 +1045,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     for (unsigned i = 0; i != Size; ++i) {
       MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]);
       if (!MD)
-        return error("Invalid record");
+        return error("Invalid named metadata: expect fwd ref to MDNode");
       NMD->addOperand(MD);
     }
     break;
@@ -1395,7 +1391,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         Record.size() <= 14 ? 0 : Record[14],
         Record.size() <= 16 ? true : Record[16],
         Record.size() <= 17 ? false : Record[17],
-        Record.size() <= 18 ? 0 : Record[18]);
+        Record.size() <= 18 ? 0 : Record[18],
+        Record.size() <= 19 ? 0 : Record[19]);
 
     MetadataList.assignValue(CU, NextMetadataNo);
     NextMetadataNo++;
@@ -1409,20 +1406,43 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     if (Record.size() < 18 || Record.size() > 21)
       return error("Invalid record");
 
-    IsDistinct =
-        (Record[0] & 1) || Record[8]; // All definitions should be distinct.
+    bool HasSPFlags = Record[0] & 4;
+    DISubprogram::DISPFlags SPFlags =
+        HasSPFlags
+            ? static_cast<DISubprogram::DISPFlags>(Record[9])
+            : DISubprogram::toSPFlags(
+                  /*IsLocalToUnit=*/Record[7], /*IsDefinition=*/Record[8],
+                  /*IsOptimized=*/Record[14], /*Virtuality=*/Record[11]);
+
+    // All definitions should be distinct.
+    IsDistinct = (Record[0] & 1) || (SPFlags & DISubprogram::SPFlagDefinition);
     // Version 1 has a Function as Record[15].
     // Version 2 has removed Record[15].
     // Version 3 has the Unit as Record[15].
     // Version 4 added thisAdjustment.
-    bool HasUnit = Record[0] >= 2;
-    if (HasUnit && Record.size() < 19)
+    // Version 5 repacked flags into DISPFlags, changing many element numbers.
+    bool HasUnit = Record[0] & 2;
+    if (!HasSPFlags && HasUnit && Record.size() < 19)
+      return error("Invalid record");
+    if (HasSPFlags && !HasUnit)
       return error("Invalid record");
-    Metadata *CUorFn = getMDOrNull(Record[15]);
-    unsigned Offset = Record.size() >= 19 ? 1 : 0;
-    bool HasFn = Offset && !HasUnit;
-    bool HasThisAdj = Record.size() >= 20;
-    bool HasThrownTypes = Record.size() >= 21;
+    // Accommodate older formats.
+    bool HasFn = false;
+    bool HasThisAdj = true;
+    bool HasThrownTypes = true;
+    unsigned OffsetA = 0;
+    unsigned OffsetB = 0;
+    if (!HasSPFlags) {
+      OffsetA = 2;
+      OffsetB = 2;
+      if (Record.size() >= 19) {
+        HasFn = !HasUnit;
+        OffsetB++;
+      }
+      HasThisAdj = Record.size() >= 20;
+      HasThrownTypes = Record.size() >= 21;
+    }
+    Metadata *CUorFn = getMDOrNull(Record[12 + OffsetB]);
     DISubprogram *SP = GET_OR_DISTINCT(
         DISubprogram,
         (Context,
@@ -1432,20 +1452,18 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
          getMDOrNull(Record[4]),                            // file
          Record[5],                                         // line
          getMDOrNull(Record[6]),                            // type
-         Record[7],                                         // isLocal
-         Record[8],                                         // isDefinition
-         Record[9],                                         // scopeLine
-         getDITypeRefOrNull(Record[10]),                    // containingType
-         Record[11],                                        // virtuality
-         Record[12],                                        // virtualIndex
-         HasThisAdj ? Record[19] : 0,                       // thisAdjustment
-         static_cast<DINode::DIFlags>(Record[13]),          // flags
-         Record[14],                                        // isOptimized
+         Record[7 + OffsetA],                               // scopeLine
+         getDITypeRefOrNull(Record[8 + OffsetA]),           // containingType
+         Record[10 + OffsetA],                              // virtualIndex
+         HasThisAdj ? Record[16 + OffsetB] : 0,             // thisAdjustment
+         static_cast<DINode::DIFlags>(Record[11 + OffsetA]),// flags
+         SPFlags,                                           // SPFlags
          HasUnit ? CUorFn : nullptr,                        // unit
-         getMDOrNull(Record[15 + Offset]),                  // templateParams
-         getMDOrNull(Record[16 + Offset]),                  // declaration
-         getMDOrNull(Record[17 + Offset]),                  // retainedNodes
-         HasThrownTypes ? getMDOrNull(Record[20]) : nullptr // thrownTypes
+         getMDOrNull(Record[13 + OffsetB]),                 // templateParams
+         getMDOrNull(Record[14 + OffsetB]),                 // declaration
+         getMDOrNull(Record[15 + OffsetB]),                 // retainedNodes
+         HasThrownTypes ? getMDOrNull(Record[17 + OffsetB])
+                        : nullptr                           // thrownTypes
          ));
     MetadataList.assignValue(SP, NextMetadataNo);
     NextMetadataNo++;
@@ -1833,7 +1851,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseGlobalObjectAttachment(
       return error("Invalid ID");
     MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[I + 1]);
     if (!MD)
-      return error("Invalid metadata attachment");
+      return error("Invalid metadata attachment: expect fwd ref to MDNode");
     GO.addMetadata(K->second, *MD);
   }
   return Error::success();
@@ -2003,10 +2021,6 @@ Metadata *MetadataLoader::getMetadataFwdRefOrLoad(unsigned Idx) {
   return Pimpl->getMetadataFwdRefOrLoad(Idx);
 }
 
-MDNode *MetadataLoader::getMDNodeFwdRefOrNull(unsigned Idx) {
-  return Pimpl->getMDNodeFwdRefOrNull(Idx);
-}
-
 DISubprogram *MetadataLoader::lookupSubprogramForFunction(Function *F) {
   return Pimpl->lookupSubprogramForFunction(F);
 }
diff --git a/lib/Bitcode/Reader/MetadataLoader.h b/lib/Bitcode/Reader/MetadataLoader.h
index 752eb35ebb43003c1a3df57442292415af9c0bcc..07a77a086f3240b1c85e1dfb5ca4200157fa7bb3 100644
--- a/lib/Bitcode/Reader/MetadataLoader.h
+++ b/lib/Bitcode/Reader/MetadataLoader.h
@@ -65,8 +65,6 @@ public:
   /// necessary.
   Metadata *getMetadataFwdRefOrLoad(unsigned Idx);
 
-  MDNode *getMDNodeFwdRefOrNull(unsigned Idx);
-
   /// Return the DISubprogram metadata for a Function if any, null otherwise.
   DISubprogram *lookupSubprogramForFunction(Function *F);
 
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index f4634c9d3f4e39b9a5f7392f497115f55ea59167..68d79edceafb5509659c20c9afa02ef6b9a4c56b 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -112,6 +112,8 @@ enum {
 
   // FUNCTION_BLOCK abbrev id's.
   FUNCTION_INST_LOAD_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+  FUNCTION_INST_UNOP_ABBREV,
+  FUNCTION_INST_UNOP_FLAGS_ABBREV,
   FUNCTION_INST_BINOP_ABBREV,
   FUNCTION_INST_BINOP_FLAGS_ABBREV,
   FUNCTION_INST_CAST_ABBREV,
@@ -513,6 +515,13 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) {
   }
 }
 
+static unsigned getEncodedUnaryOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unknown binary instruction!");
+  case Instruction::FNeg: return bitc::UNOP_NEG;
+  }
+}
+
 static unsigned getEncodedBinaryOpcode(unsigned Opcode) {
   switch (Opcode) {
   default: llvm_unreachable("Unknown binary instruction!");
@@ -991,6 +1000,11 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
   return RawFlags;
 }
 
+static uint64_t getEncodedGVarFlags(GlobalVarSummary::GVarFlags Flags) {
+  uint64_t RawFlags = Flags.ReadOnly;
+  return RawFlags;
+}
+
 static unsigned getEncodedVisibility(const GlobalValue &GV) {
   switch (GV.getVisibility()) {
   case GlobalValue::DefaultVisibility:   return 0;
@@ -1619,22 +1633,20 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
 void ModuleBitcodeWriter::writeDISubprogram(const DISubprogram *N,
                                             SmallVectorImpl<uint64_t> &Record,
                                             unsigned Abbrev) {
-  uint64_t HasUnitFlag = 1 << 1;
-  Record.push_back(N->isDistinct() | HasUnitFlag);
+  const uint64_t HasUnitFlag = 1 << 1;
+  const uint64_t HasSPFlagsFlag = 1 << 2;
+  Record.push_back(uint64_t(N->isDistinct()) | HasUnitFlag | HasSPFlagsFlag);
   Record.push_back(VE.getMetadataOrNullID(N->getScope()));
   Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
   Record.push_back(VE.getMetadataOrNullID(N->getRawLinkageName()));
   Record.push_back(VE.getMetadataOrNullID(N->getFile()));
   Record.push_back(N->getLine());
   Record.push_back(VE.getMetadataOrNullID(N->getType()));
-  Record.push_back(N->isLocalToUnit());
-  Record.push_back(N->isDefinition());
   Record.push_back(N->getScopeLine());
   Record.push_back(VE.getMetadataOrNullID(N->getContainingType()));
-  Record.push_back(N->getVirtuality());
+  Record.push_back(N->getSPFlags());
   Record.push_back(N->getVirtualIndex());
   Record.push_back(N->getFlags());
-  Record.push_back(N->isOptimized());
   Record.push_back(VE.getMetadataOrNullID(N->getRawUnit()));
   Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get()));
   Record.push_back(VE.getMetadataOrNullID(N->getDeclaration()));
@@ -2384,6 +2396,16 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
             Record.push_back(Flags);
         }
         break;
+      case Instruction::FNeg: {
+        assert(CE->getNumOperands() == 1 && "Unknown constant expr!");
+        Code = bitc::CST_CODE_CE_UNOP;
+        Record.push_back(getEncodedUnaryOpcode(CE->getOpcode()));
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        uint64_t Flags = getOptimizationFlags(CE);
+        if (Flags != 0)
+          Record.push_back(Flags);
+        break;
+      }
       case Instruction::GetElementPtr: {
         Code = bitc::CST_CODE_CE_GEP;
         const auto *GO = cast<GEPOperator>(C);
@@ -2556,7 +2578,19 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
       }
     }
     break;
-
+  case Instruction::FNeg: {
+    Code = bitc::FUNC_CODE_INST_UNOP;
+    if (!pushValueAndType(I.getOperand(0), InstID, Vals))
+      AbbrevToUse = FUNCTION_INST_UNOP_ABBREV;
+    Vals.push_back(getEncodedUnaryOpcode(I.getOpcode()));
+    uint64_t Flags = getOptimizationFlags(&I);
+    if (Flags != 0) {
+      if (AbbrevToUse == FUNCTION_INST_UNOP_ABBREV)
+        AbbrevToUse = FUNCTION_INST_UNOP_FLAGS_ABBREV;
+      Vals.push_back(Flags);
+    }
+    break;
+  }
   case Instruction::GetElementPtr: {
     Code = bitc::FUNC_CODE_INST_GEP;
     AbbrevToUse = FUNCTION_INST_GEP_ABBREV;
@@ -3217,6 +3251,25 @@ void ModuleBitcodeWriter::writeBlockInfo() {
         FUNCTION_INST_LOAD_ABBREV)
       llvm_unreachable("Unexpected abbrev ordering!");
   }
+  { // INST_UNOP abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNOP));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+        FUNCTION_INST_UNOP_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // INST_UNOP_FLAGS abbrev for FUNCTION_BLOCK.
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNOP));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+        FUNCTION_INST_UNOP_FLAGS_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
   { // INST_BINOP abbrev for FUNCTION_BLOCK.
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP));
@@ -3489,6 +3542,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
   NameVals.push_back(FS->instCount());
   NameVals.push_back(getEncodedFFlags(FS->fflags()));
   NameVals.push_back(FS->refs().size());
+  NameVals.push_back(FS->immutableRefCount());
 
   for (auto &RI : FS->refs())
     NameVals.push_back(VE.getValueID(RI.getValue()));
@@ -3530,6 +3584,7 @@ void ModuleBitcodeWriterBase::writeModuleLevelReferences(
   NameVals.push_back(VE.getValueID(&V));
   GlobalVarSummary *VS = cast<GlobalVarSummary>(Summary);
   NameVals.push_back(getEncodedGVSummaryFlags(VS->flags()));
+  NameVals.push_back(getEncodedGVarFlags(VS->varflags()));
 
   unsigned SizeBeforeRefs = NameVals.size();
   for (auto &RI : VS->refs())
@@ -3546,7 +3601,7 @@ void ModuleBitcodeWriterBase::writeModuleLevelReferences(
 // Current version for the summary.
 // This is bumped whenever we introduce changes in the way some record are
 // interpreted, like flags for instance.
-static const uint64_t INDEX_VERSION = 4;
+static const uint64_t INDEX_VERSION = 6;
 
 /// Emit the per-module summary section alongside the rest of
 /// the module's bitcode.
@@ -3581,6 +3636,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // instcount
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // fflags
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // numrefs
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // immutablerefcnt
   // numrefs x valueid, n x (valueid, hotness)
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
@@ -3597,6 +3653,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // instcount
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // fflags
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // numrefs
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // immutablerefcnt
   // numrefs x valueid, n x (valueid [, rel_block_freq])
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
@@ -3675,6 +3732,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     Flags |= 0x1;
   if (Index.skipModuleByDistributedBackend())
     Flags |= 0x2;
+  if (Index.hasSyntheticEntryCounts())
+    Flags |= 0x4;
   Stream.EmitRecord(bitc::FS_FLAGS, ArrayRef<uint64_t>{Flags});
 
   for (const auto &GVI : valueIds()) {
@@ -3690,7 +3749,9 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // flags
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // instcount
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // fflags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // entrycount
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // numrefs
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // immutablerefcnt
   // numrefs x valueid, n x (valueid)
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
@@ -3705,6 +3766,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // instcount
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // fflags
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // numrefs
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // immutablerefcnt
   // numrefs x valueid, n x (valueid, hotness)
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
@@ -3777,6 +3839,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
       NameVals.push_back(*ValueId);
       NameVals.push_back(Index.getModuleId(VS->modulePath()));
       NameVals.push_back(getEncodedGVSummaryFlags(VS->flags()));
+      NameVals.push_back(getEncodedGVarFlags(VS->varflags()));
       for (auto &RI : VS->refs()) {
         auto RefValueId = getValueId(RI.getGUID());
         if (!RefValueId)
@@ -3801,18 +3864,24 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
     NameVals.push_back(FS->instCount());
     NameVals.push_back(getEncodedFFlags(FS->fflags()));
+    NameVals.push_back(FS->entryCount());
+
     // Fill in below
-    NameVals.push_back(0);
+    NameVals.push_back(0); // numrefs
+    NameVals.push_back(0); // immutablerefcnt
 
-    unsigned Count = 0;
+    unsigned Count = 0, ImmutableRefCnt = 0;
     for (auto &RI : FS->refs()) {
       auto RefValueId = getValueId(RI.getGUID());
       if (!RefValueId)
         continue;
       NameVals.push_back(*RefValueId);
+      if (RI.isReadOnly())
+        ImmutableRefCnt++;
       Count++;
     }
-    NameVals[5] = Count;
+    NameVals[6] = Count;
+    NameVals[7] = ImmutableRefCnt;
 
     bool HasProfileData = false;
     for (auto &EI : FS->calls()) {
@@ -3960,7 +4029,7 @@ void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) {
 
     if (ModHash)
       // Save the written hash value.
-      std::copy(std::begin(Vals), std::end(Vals), std::begin(*ModHash));
+      llvm::copy(Vals, std::begin(*ModHash));
   }
 }
 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 1f54c611bad66a74b5726f384b9c04967ef89416..f25c7a098558574e40d97f8e55102df491b3865d 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -12,6 +12,7 @@ add_subdirectory(Linker)
 add_subdirectory(Analysis)
 add_subdirectory(LTO)
 add_subdirectory(MC)
+add_subdirectory(MCA)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
@@ -23,6 +24,7 @@ add_subdirectory(AsmParser)
 add_subdirectory(LineEditor)
 add_subdirectory(ProfileData)
 add_subdirectory(Passes)
+add_subdirectory(TextAPI)
 add_subdirectory(ToolDrivers)
 add_subdirectory(XRay)
 add_subdirectory(Testing)
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 526f7ce30831518d2156b7aee4a4c2bf82d3f0d7..7070451e333073a7be91b788450ce8c9569c8976 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/AsmPrinter.h"
-#include "AsmPrinterHandler.h"
 #include "CodeViewDebug.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
@@ -36,6 +35,7 @@
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/AsmPrinterHandler.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 #include "llvm/CodeGen/GCStrategy.h"
@@ -54,6 +54,7 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -262,7 +263,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   // use the directive, where it would need the same conditionalization
   // anyway.
   const Triple &Target = TM.getTargetTriple();
-  OutStreamer->EmitVersionForTarget(Target);
+  OutStreamer->EmitVersionForTarget(Target, M.getSDKVersion());
 
   // Allow the target to emit any magic that it wants at the start of the file.
   EmitStartOfAsmFile(M);
@@ -629,8 +630,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 ///
 /// \p Value - The value to emit.
 /// \p Size - The size of the integer (in bytes) to emit.
-void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
-                                      unsigned Size) const {
+void AsmPrinter::EmitDebugValue(const MCExpr *Value, unsigned Size) const {
   OutStreamer->EmitValue(Value, Size);
 }
 
@@ -1499,6 +1499,9 @@ bool AsmPrinter::doFinalization(Module &M) {
   // Emit llvm.ident metadata in an '.ident' directive.
   EmitModuleIdents(M);
 
+  // Emit bytes for llvm.commandline metadata.
+  EmitModuleCommandLines(M);
+
   // Emit __morestack address if needed for indirect calls.
   if (MMI->usesMorestackAddr()) {
     unsigned Align = 1;
@@ -2008,6 +2011,29 @@ void AsmPrinter::EmitModuleIdents(Module &M) {
   }
 }
 
+void AsmPrinter::EmitModuleCommandLines(Module &M) {
+  MCSection *CommandLine = getObjFileLowering().getSectionForCommandLines();
+  if (!CommandLine)
+    return;
+
+  const NamedMDNode *NMD = M.getNamedMetadata("llvm.commandline");
+  if (!NMD || !NMD->getNumOperands())
+    return;
+
+  OutStreamer->PushSection();
+  OutStreamer->SwitchSection(CommandLine);
+  OutStreamer->EmitZeros(1);
+  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+    const MDNode *N = NMD->getOperand(i);
+    assert(N->getNumOperands() == 1 &&
+           "llvm.commandline metadata entry can have only one operand");
+    const MDString *S = cast<MDString>(N->getOperand(0));
+    OutStreamer->EmitBytes(S->getString());
+    OutStreamer->EmitZeros(1);
+  }
+  OutStreamer->PopSection();
+}
+
 //===--------------------------------------------------------------------===//
 // Emission and print routines
 //
@@ -2977,11 +3003,6 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
   if (!S.usesMetadata())
     return nullptr;
 
-  assert(!S.useStatepoints() && "statepoints do not currently support custom"
-         " stackmap formats, please see the documentation for a description of"
-         " the default format.  If you really need a custom serialized format,"
-         " please file a bug");
-
   gcp_map_type &GCMap = getGCMap(GCMetadataPrinters);
   gcp_map_type::iterator GCPI = GCMap.find(&S);
   if (GCPI != GCMap.end())
@@ -3002,6 +3023,27 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
   report_fatal_error("no GCMetadataPrinter registered for GC: " + Twine(Name));
 }
 
+void AsmPrinter::emitStackMaps(StackMaps &SM) {
+  GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+  assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+  bool NeedsDefault = false;
+  if (MI->begin() == MI->end())
+    // No GC strategy, use the default format.
+    NeedsDefault = true;
+  else
+    for (auto &I : *MI) {
+      if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
+        if (MP->emitStackMaps(SM, *this))
+          continue;
+      // The strategy doesn't have printer or doesn't emit custom stack maps.
+      // Use the default format.
+      NeedsDefault = true;
+    }
+
+  if (NeedsDefault)
+    SM.serializeToStackMapSection();
+}
+
 /// Pin vtable to this file.
 AsmPrinterHandler::~AsmPrinterHandler() = default;
 
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 6055884706700ad94187c11742197ad49bbcd4a2..afce3ad3133b3376ae4445a639842cb4e5a15452 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -212,6 +212,9 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
   case MCCFIInstruction::OpWindowSave:
     OutStreamer->EmitCFIWindowSave();
     break;
+  case MCCFIInstruction::OpNegateRAState:
+    OutStreamer->EmitCFINegateRAState();
+    break;
   case MCCFIInstruction::OpSameValue:
     OutStreamer->EmitCFISameValue(Inst.getRegister());
     break;
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 01d018fdde37bad3c844ca5f11e758e37866bcb7..4132b1a30c42ab537812bf581792ae8b003e761b 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -44,6 +44,7 @@
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h"
 #include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/EnumTables.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
@@ -92,9 +93,6 @@
 using namespace llvm;
 using namespace llvm::codeview;
 
-static cl::opt<bool> EmitDebugGlobalHashes("emit-codeview-ghash-section",
-                                           cl::ReallyHidden, cl::init(false));
-
 static CPUType mapArchToCVCPUType(Triple::ArchType Type) {
   switch (Type) {
   case Triple::ArchType::x86:
@@ -125,6 +123,11 @@ CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
 
   TheCPU =
       mapArchToCVCPUType(Triple(MMI->getModule()->getTargetTriple()).getArch());
+
+  // Check if we should emit type record hashes.
+  ConstantInt *GH = mdconst::extract_or_null<ConstantInt>(
+      MMI->getModule()->getModuleFlag("CodeViewGHash"));
+  EmitDebugGlobalHashes = GH && !GH->isZero();
 }
 
 StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
@@ -729,14 +732,7 @@ static Version parseVersion(StringRef Name) {
 }
 
 void CodeViewDebug::emitCompilerInformation() {
-  MCContext &Context = MMI->getContext();
-  MCSymbol *CompilerBegin = Context.createTempSymbol(),
-           *CompilerEnd = Context.createTempSymbol();
-  OS.AddComment("Record length");
-  OS.emitAbsoluteSymbolDiff(CompilerEnd, CompilerBegin, 2);
-  OS.EmitLabel(CompilerBegin);
-  OS.AddComment("Record kind: S_COMPILE3");
-  OS.EmitIntValue(SymbolKind::S_COMPILE3, 2);
+  MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_COMPILE3);
   uint32_t Flags = 0;
 
   NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
@@ -775,7 +771,7 @@ void CodeViewDebug::emitCompilerInformation() {
   OS.AddComment("Null-terminated compiler version string");
   emitNullTerminatedSymbolName(OS, CompilerVersion);
 
-  OS.EmitLabel(CompilerEnd);
+  endSymbolRecord(CompilerEnd);
 }
 
 static TypeIndex getStringIdTypeIdx(GlobalTypeTableBuilder &TypeTable,
@@ -811,14 +807,12 @@ void CodeViewDebug::emitBuildInfo() {
 
   // Make a new .debug$S subsection for the S_BUILDINFO record, which points
   // from the module symbols into the type stream.
-  MCSymbol *BuildInfoEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
-  OS.AddComment("Record length");
-  OS.EmitIntValue(6, 2);
-  OS.AddComment("Record kind: S_BUILDINFO");
-  OS.EmitIntValue(unsigned(SymbolKind::S_BUILDINFO), 2);
+  MCSymbol *BISubsecEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
+  MCSymbol *BIEnd = beginSymbolRecord(SymbolKind::S_BUILDINFO);
   OS.AddComment("LF_BUILDINFO index");
   OS.EmitIntValue(BuildInfoIndex.getIndex(), 4);
-  endCVSubsection(BuildInfoEnd);
+  endSymbolRecord(BIEnd);
+  endCVSubsection(BISubsecEnd);
 }
 
 void CodeViewDebug::emitInlineeLinesSubsection() {
@@ -858,18 +852,11 @@ void CodeViewDebug::emitInlineeLinesSubsection() {
 void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI,
                                         const DILocation *InlinedAt,
                                         const InlineSite &Site) {
-  MCSymbol *InlineBegin = MMI->getContext().createTempSymbol(),
-           *InlineEnd = MMI->getContext().createTempSymbol();
-
   assert(TypeIndices.count({Site.Inlinee, nullptr}));
   TypeIndex InlineeIdx = TypeIndices[{Site.Inlinee, nullptr}];
 
   // SymbolRecord
-  OS.AddComment("Record length");
-  OS.emitAbsoluteSymbolDiff(InlineEnd, InlineBegin, 2);   // RecordLength
-  OS.EmitLabel(InlineBegin);
-  OS.AddComment("Record kind: S_INLINESITE");
-  OS.EmitIntValue(SymbolKind::S_INLINESITE, 2); // RecordKind
+  MCSymbol *InlineEnd = beginSymbolRecord(SymbolKind::S_INLINESITE);
 
   OS.AddComment("PtrParent");
   OS.EmitIntValue(0, 4);
@@ -884,7 +871,7 @@ void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI,
   OS.EmitCVInlineLinetableDirective(Site.SiteFuncId, FileId, StartLineNum,
                                     FI.Begin, FI.End);
 
-  OS.EmitLabel(InlineEnd);
+  endSymbolRecord(InlineEnd);
 
   emitLocalVariableList(FI, Site.InlinedLocals);
 
@@ -897,10 +884,7 @@ void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI,
   }
 
   // Close the scope.
-  OS.AddComment("Record length");
-  OS.EmitIntValue(2, 2);                                  // RecordLength
-  OS.AddComment("Record kind: S_INLINESITE_END");
-  OS.EmitIntValue(SymbolKind::S_INLINESITE_END, 2); // RecordKind
+  emitEndSymbolRecord(SymbolKind::S_INLINESITE_END);
 }
 
 void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
@@ -935,13 +919,7 @@ void CodeViewDebug::emitDebugInfoForThunk(const Function *GV,
   MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
 
   // Emit S_THUNK32
-  MCSymbol *ThunkRecordBegin = MMI->getContext().createTempSymbol(),
-           *ThunkRecordEnd   = MMI->getContext().createTempSymbol();
-  OS.AddComment("Record length");
-  OS.emitAbsoluteSymbolDiff(ThunkRecordEnd, ThunkRecordBegin, 2);
-  OS.EmitLabel(ThunkRecordBegin);
-  OS.AddComment("Record kind: S_THUNK32");
-  OS.EmitIntValue(unsigned(SymbolKind::S_THUNK32), 2);
+  MCSymbol *ThunkRecordEnd = beginSymbolRecord(SymbolKind::S_THUNK32);
   OS.AddComment("PtrParent");
   OS.EmitIntValue(0, 4);
   OS.AddComment("PtrEnd");
@@ -959,17 +937,13 @@ void CodeViewDebug::emitDebugInfoForThunk(const Function *GV,
   OS.AddComment("Function name");
   emitNullTerminatedSymbolName(OS, FuncName);
   // Additional fields specific to the thunk ordinal would go here.
-  OS.EmitLabel(ThunkRecordEnd);
+  endSymbolRecord(ThunkRecordEnd);
 
   // Local variables/inlined routines are purposely omitted here.  The point of
   // marking this as a thunk is so Visual Studio will NOT stop in this routine.
 
   // Emit S_PROC_ID_END
-  const unsigned RecordLengthForSymbolEnd = 2;
-  OS.AddComment("Record length");
-  OS.EmitIntValue(RecordLengthForSymbolEnd, 2);
-  OS.AddComment("Record kind: S_PROC_ID_END");
-  OS.EmitIntValue(unsigned(SymbolKind::S_PROC_ID_END), 2);
+  emitEndSymbolRecord(SymbolKind::S_PROC_ID_END);
 
   endCVSubsection(SymbolsEnd);
 }
@@ -1012,19 +986,9 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
   OS.AddComment("Symbol subsection for " + Twine(FuncName));
   MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
   {
-    MCSymbol *ProcRecordBegin = MMI->getContext().createTempSymbol(),
-             *ProcRecordEnd = MMI->getContext().createTempSymbol();
-    OS.AddComment("Record length");
-    OS.emitAbsoluteSymbolDiff(ProcRecordEnd, ProcRecordBegin, 2);
-    OS.EmitLabel(ProcRecordBegin);
-
-    if (GV->hasLocalLinkage()) {
-      OS.AddComment("Record kind: S_LPROC32_ID");
-      OS.EmitIntValue(unsigned(SymbolKind::S_LPROC32_ID), 2);
-    } else {
-      OS.AddComment("Record kind: S_GPROC32_ID");
-      OS.EmitIntValue(unsigned(SymbolKind::S_GPROC32_ID), 2);
-    }
+    SymbolKind ProcKind = GV->hasLocalLinkage() ? SymbolKind::S_LPROC32_ID
+                                                : SymbolKind::S_GPROC32_ID;
+    MCSymbol *ProcRecordEnd = beginSymbolRecord(ProcKind);
 
     // These fields are filled in by tools like CVPACK which run after the fact.
     OS.AddComment("PtrParent");
@@ -1053,15 +1017,9 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
     OS.AddComment("Function name");
     // Truncate the name so we won't overflow the record length field.
     emitNullTerminatedSymbolName(OS, FuncName);
-    OS.EmitLabel(ProcRecordEnd);
-
-    MCSymbol *FrameProcBegin = MMI->getContext().createTempSymbol(),
-             *FrameProcEnd = MMI->getContext().createTempSymbol();
-    OS.AddComment("Record length");
-    OS.emitAbsoluteSymbolDiff(FrameProcEnd, FrameProcBegin, 2);
-    OS.EmitLabel(FrameProcBegin);
-    OS.AddComment("Record kind: S_FRAMEPROC");
-    OS.EmitIntValue(unsigned(SymbolKind::S_FRAMEPROC), 2);
+    endSymbolRecord(ProcRecordEnd);
+
+    MCSymbol *FrameProcEnd = beginSymbolRecord(SymbolKind::S_FRAMEPROC);
     // Subtract out the CSR size since MSVC excludes that and we include it.
     OS.AddComment("FrameSize");
     OS.EmitIntValue(FI.FrameSize - FI.CSRSize, 4);
@@ -1077,7 +1035,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
     OS.EmitIntValue(0, 2);
     OS.AddComment("Flags (defines frame register)");
     OS.EmitIntValue(uint32_t(FI.FrameProcOpts), 4);
-    OS.EmitLabel(FrameProcEnd);
+    endSymbolRecord(FrameProcEnd);
 
     emitLocalVariableList(FI, FI.Locals);
     emitLexicalBlockList(FI.ChildBlocks, FI);
@@ -1095,13 +1053,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
     for (auto Annot : FI.Annotations) {
       MCSymbol *Label = Annot.first;
       MDTuple *Strs = cast<MDTuple>(Annot.second);
-      MCSymbol *AnnotBegin = MMI->getContext().createTempSymbol(),
-               *AnnotEnd = MMI->getContext().createTempSymbol();
-      OS.AddComment("Record length");
-      OS.emitAbsoluteSymbolDiff(AnnotEnd, AnnotBegin, 2);
-      OS.EmitLabel(AnnotBegin);
-      OS.AddComment("Record kind: S_ANNOTATION");
-      OS.EmitIntValue(SymbolKind::S_ANNOTATION, 2);
+      MCSymbol *AnnotEnd = beginSymbolRecord(SymbolKind::S_ANNOTATION);
       OS.EmitCOFFSecRel32(Label, /*Offset=*/0);
       // FIXME: Make sure we don't overflow the max record size.
       OS.EmitCOFFSectionIndex(Label);
@@ -1113,17 +1065,14 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
         assert(Str.data()[Str.size()] == '\0' && "non-nullterminated MDString");
         OS.EmitBytes(StringRef(Str.data(), Str.size() + 1));
       }
-      OS.EmitLabel(AnnotEnd);
+      endSymbolRecord(AnnotEnd);
     }
 
     if (SP != nullptr)
       emitDebugInfoForUDTs(LocalUDTs);
 
     // We're done with this function.
-    OS.AddComment("Record length");
-    OS.EmitIntValue(0x0002, 2);
-    OS.AddComment("Record kind: S_PROC_ID_END");
-    OS.EmitIntValue(unsigned(SymbolKind::S_PROC_ID_END), 2);
+    emitEndSymbolRecord(SymbolKind::S_PROC_ID_END);
   }
   endCVSubsection(SymbolsEnd);
 
@@ -1713,6 +1662,9 @@ TypeIndex CodeViewDebug::lowerTypePointer(const DIDerivedType *Ty,
     break;
   }
 
+  if (Ty->isObjectPointer())
+    PO |= PointerOptions::Const;
+
   PointerRecord PR(PointeeTI, PK, PM, PO, Ty->getSizeInBits() / 8);
   return TypeTable.writeLeafType(PR);
 }
@@ -1877,27 +1829,22 @@ TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty,
   // Lower the containing class type.
   TypeIndex ClassType = getTypeIndex(ClassTy);
 
-  SmallVector<TypeIndex, 8> ReturnAndArgTypeIndices;
-  for (DITypeRef ArgTypeRef : Ty->getTypeArray())
-    ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgTypeRef));
+  DITypeRefArray ReturnAndArgs = Ty->getTypeArray();
+
+  unsigned Index = 0;
+  SmallVector<TypeIndex, 8> ArgTypeIndices;
+  TypeIndex ReturnTypeIndex = getTypeIndex(ReturnAndArgs[Index++]);
 
-  // MSVC uses type none for variadic argument.
-  if (ReturnAndArgTypeIndices.size() > 1 &&
-      ReturnAndArgTypeIndices.back() == TypeIndex::Void()) {
-    ReturnAndArgTypeIndices.back() = TypeIndex::None();
-  }
-  TypeIndex ReturnTypeIndex = TypeIndex::Void();
-  ArrayRef<TypeIndex> ArgTypeIndices = None;
-  if (!ReturnAndArgTypeIndices.empty()) {
-    auto ReturnAndArgTypesRef = makeArrayRef(ReturnAndArgTypeIndices);
-    ReturnTypeIndex = ReturnAndArgTypesRef.front();
-    ArgTypeIndices = ReturnAndArgTypesRef.drop_front();
-  }
   TypeIndex ThisTypeIndex;
-  if (!IsStaticMethod && !ArgTypeIndices.empty()) {
-    ThisTypeIndex = ArgTypeIndices.front();
-    ArgTypeIndices = ArgTypeIndices.drop_front();
-  }
+  if (!IsStaticMethod && ReturnAndArgs.size() > 1)
+    ThisTypeIndex = getTypeIndexForThisPtr(ReturnAndArgs[Index++], Ty);
+
+  while (Index < ReturnAndArgs.size())
+    ArgTypeIndices.push_back(getTypeIndex(ReturnAndArgs[Index++]));
+
+  // MSVC uses type none for variadic argument.
+  if (!ArgTypeIndices.empty() && ArgTypeIndices.back() == TypeIndex::Void())
+    ArgTypeIndices.back() = TypeIndex::None();
 
   ArgListRecord ArgListRec(TypeRecordKind::ArgList, ArgTypeIndices);
   TypeIndex ArgListIndex = TypeTable.writeLeafType(ArgListRec);
@@ -1987,8 +1934,8 @@ static ClassOptions getCommonClassOptions(const DICompositeType *Ty) {
     CO |= ClassOptions::Nested;
 
   // Put the Scoped flag on function-local types. MSVC puts this flag for enum
-  // type only when it has an immediate function scope. Clang never puts enums 
-  // inside DILexicalBlock scopes. Enum types, as generated by clang, are 
+  // type only when it has an immediate function scope. Clang never puts enums
+  // inside DILexicalBlock scopes. Enum types, as generated by clang, are
   // always in function, class, or file scopes.
   if (Ty->getTag() == dwarf::DW_TAG_enumeration_type) {
     if (ImmediateScope && isa<DISubprogram>(ImmediateScope))
@@ -2444,6 +2391,31 @@ TypeIndex CodeViewDebug::getTypeIndex(DITypeRef TypeRef, DITypeRef ClassTyRef) {
   return recordTypeIndexForDINode(Ty, TI, ClassTy);
 }
 
+codeview::TypeIndex
+CodeViewDebug::getTypeIndexForThisPtr(DITypeRef TypeRef,
+                                      const DISubroutineType *SubroutineTy) {
+  const DIType *Ty = TypeRef.resolve();
+
+  PointerOptions Options = PointerOptions::None;
+  if (SubroutineTy->getFlags() & DINode::DIFlags::FlagLValueReference)
+    Options = PointerOptions::LValueRefThisPointer;
+  else if (SubroutineTy->getFlags() & DINode::DIFlags::FlagRValueReference)
+    Options = PointerOptions::RValueRefThisPointer;
+
+  // Check if we've already translated this type.  If there is no ref qualifier
+  // on the function then we look up this pointer type with no associated class
+  // so that the TypeIndex for the this pointer can be shared with the type
+  // index for other pointers to this class type.  If there is a ref qualifier
+  // then we lookup the pointer using the subroutine as the parent type.
+  auto I = TypeIndices.find({Ty, SubroutineTy});
+  if (I != TypeIndices.end())
+    return I->second;
+
+  TypeLoweringScope S(*this);
+  TypeIndex TI = lowerTypePointer(cast<DIDerivedType>(Ty), Options);
+  return recordTypeIndexForDINode(Ty, TI, SubroutineTy);
+}
+
 TypeIndex CodeViewDebug::getTypeIndexForReferenceTo(DITypeRef TypeRef) {
   DIType *Ty = TypeRef.resolve();
   PointerRecord PR(getTypeIndex(Ty),
@@ -2461,6 +2433,14 @@ TypeIndex CodeViewDebug::getCompleteTypeIndex(DITypeRef TypeRef) {
   if (!Ty)
     return TypeIndex::Void();
 
+  // Look through typedefs when getting the complete type index. Call
+  // getTypeIndex on the typdef to ensure that any UDTs are accumulated and are
+  // emitted only once.
+  if (Ty->getTag() == dwarf::DW_TAG_typedef)
+    (void)getTypeIndex(Ty);
+  while (Ty->getTag() == dwarf::DW_TAG_typedef)
+    Ty = cast<DIDerivedType>(Ty)->getBaseType().resolve();
+
   // If this is a non-record type, the complete type index is the same as the
   // normal type index. Just call getTypeIndex.
   switch (Ty->getTag()) {
@@ -2562,14 +2542,7 @@ static void copyBytesForDefRange(SmallString<20> &BytePrefix,
 void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
                                       const LocalVariable &Var) {
   // LocalSym record, see SymbolRecord.h for more info.
-  MCSymbol *LocalBegin = MMI->getContext().createTempSymbol(),
-           *LocalEnd = MMI->getContext().createTempSymbol();
-  OS.AddComment("Record length");
-  OS.emitAbsoluteSymbolDiff(LocalEnd, LocalBegin, 2);
-  OS.EmitLabel(LocalBegin);
-
-  OS.AddComment("Record kind: S_LOCAL");
-  OS.EmitIntValue(unsigned(SymbolKind::S_LOCAL), 2);
+  MCSymbol *LocalEnd = beginSymbolRecord(SymbolKind::S_LOCAL);
 
   LocalSymFlags Flags = LocalSymFlags::None;
   if (Var.DIVar->isParameter())
@@ -2586,7 +2559,7 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
   OS.EmitIntValue(static_cast<uint16_t>(Flags), 2);
   // Truncate the name so we won't overflow the record length field.
   emitNullTerminatedSymbolName(OS, Var.DIVar->getName());
-  OS.EmitLabel(LocalEnd);
+  endSymbolRecord(LocalEnd);
 
   // Calculate the on disk prefix of the appropriate def range record. The
   // records and on disk formats are described in SymbolRecords.h. BytePrefix
@@ -2658,15 +2631,7 @@ void CodeViewDebug::emitLexicalBlockList(ArrayRef<LexicalBlock *> Blocks,
 /// lexical block scope.
 void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block,
                                      const FunctionInfo& FI) {
-  MCSymbol *RecordBegin = MMI->getContext().createTempSymbol(),
-           *RecordEnd   = MMI->getContext().createTempSymbol();
-
-  // Lexical block symbol record.
-  OS.AddComment("Record length");
-  OS.emitAbsoluteSymbolDiff(RecordEnd, RecordBegin, 2);   // Record Length
-  OS.EmitLabel(RecordBegin);
-  OS.AddComment("Record kind: S_BLOCK32");
-  OS.EmitIntValue(SymbolKind::S_BLOCK32, 2);              // Record Kind
+  MCSymbol *RecordEnd = beginSymbolRecord(SymbolKind::S_BLOCK32);
   OS.AddComment("PtrParent");
   OS.EmitIntValue(0, 4);                                  // PtrParent
   OS.AddComment("PtrEnd");
@@ -2679,7 +2644,7 @@ void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block,
   OS.EmitCOFFSectionIndex(FI.Begin);                      // Func Symbol
   OS.AddComment("Lexical block name");
   emitNullTerminatedSymbolName(OS, Block.Name);           // Name
-  OS.EmitLabel(RecordEnd);
+  endSymbolRecord(RecordEnd);
 
   // Emit variables local to this lexical block.
   emitLocalVariableList(FI, Block.Locals);
@@ -2688,10 +2653,7 @@ void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block,
   emitLexicalBlockList(Block.Children, FI);
 
   // Close the lexical block scope.
-  OS.AddComment("Record length");
-  OS.EmitIntValue(2, 2);                                  // Record Length
-  OS.AddComment("Record kind: S_END");
-  OS.EmitIntValue(SymbolKind::S_END, 2);                  // Record Kind
+  emitEndSymbolRecord(SymbolKind::S_END);
 }
 
 /// Convenience routine for collecting lexical block information for a list
@@ -2849,26 +2811,53 @@ void CodeViewDebug::endCVSubsection(MCSymbol *EndLabel) {
   OS.EmitValueToAlignment(4);
 }
 
+static StringRef getSymbolName(SymbolKind SymKind) {
+  for (const EnumEntry<SymbolKind> &EE : getSymbolTypeNames())
+    if (EE.Value == SymKind)
+      return EE.Name;
+  return "";
+}
+
+MCSymbol *CodeViewDebug::beginSymbolRecord(SymbolKind SymKind) {
+  MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(),
+           *EndLabel = MMI->getContext().createTempSymbol();
+  OS.AddComment("Record length");
+  OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 2);
+  OS.EmitLabel(BeginLabel);
+  if (OS.isVerboseAsm())
+    OS.AddComment("Record kind: " + getSymbolName(SymKind));
+  OS.EmitIntValue(unsigned(SymKind), 2);
+  return EndLabel;
+}
+
+void CodeViewDebug::endSymbolRecord(MCSymbol *SymEnd) {
+  // MSVC does not pad out symbol records to four bytes, but LLVM does to avoid
+  // an extra copy of every symbol record in LLD. This increases object file
+  // size by less than 1% in the clang build, and is compatible with the Visual
+  // C++ linker.
+  OS.EmitValueToAlignment(4);
+  OS.EmitLabel(SymEnd);
+}
+
+void CodeViewDebug::emitEndSymbolRecord(SymbolKind EndKind) {
+  OS.AddComment("Record length");
+  OS.EmitIntValue(2, 2);
+  if (OS.isVerboseAsm())
+    OS.AddComment("Record kind: " + getSymbolName(EndKind));
+  OS.EmitIntValue(unsigned(EndKind), 2); // Record Kind
+}
+
 void CodeViewDebug::emitDebugInfoForUDTs(
     ArrayRef<std::pair<std::string, const DIType *>> UDTs) {
   for (const auto &UDT : UDTs) {
     const DIType *T = UDT.second;
     assert(shouldEmitUdt(T));
 
-    MCSymbol *UDTRecordBegin = MMI->getContext().createTempSymbol(),
-             *UDTRecordEnd = MMI->getContext().createTempSymbol();
-    OS.AddComment("Record length");
-    OS.emitAbsoluteSymbolDiff(UDTRecordEnd, UDTRecordBegin, 2);
-    OS.EmitLabel(UDTRecordBegin);
-
-    OS.AddComment("Record kind: S_UDT");
-    OS.EmitIntValue(unsigned(SymbolKind::S_UDT), 2);
-
+    MCSymbol *UDTRecordEnd = beginSymbolRecord(SymbolKind::S_UDT);
     OS.AddComment("Type");
     OS.EmitIntValue(getCompleteTypeIndex(T).getIndex(), 4);
-
     emitNullTerminatedSymbolName(OS, UDT.first);
-    OS.EmitLabel(UDTRecordEnd);
+    endSymbolRecord(UDTRecordEnd);
   }
 }
 
@@ -2939,31 +2928,14 @@ void CodeViewDebug::emitDebugInfoForRetainedTypes() {
 void CodeViewDebug::emitDebugInfoForGlobal(const DIGlobalVariable *DIGV,
                                            const GlobalVariable *GV,
                                            MCSymbol *GVSym) {
-  // DataSym record, see SymbolRecord.h for more info.
-  // FIXME: Thread local data, etc
-  MCSymbol *DataBegin = MMI->getContext().createTempSymbol(),
-           *DataEnd = MMI->getContext().createTempSymbol();
-  const unsigned FixedLengthOfThisRecord = 12;
-  OS.AddComment("Record length");
-  OS.emitAbsoluteSymbolDiff(DataEnd, DataBegin, 2);
-  OS.EmitLabel(DataBegin);
-  if (DIGV->isLocalToUnit()) {
-    if (GV->isThreadLocal()) {
-      OS.AddComment("Record kind: S_LTHREAD32");
-      OS.EmitIntValue(unsigned(SymbolKind::S_LTHREAD32), 2);
-    } else {
-      OS.AddComment("Record kind: S_LDATA32");
-      OS.EmitIntValue(unsigned(SymbolKind::S_LDATA32), 2);
-    }
-  } else {
-    if (GV->isThreadLocal()) {
-      OS.AddComment("Record kind: S_GTHREAD32");
-      OS.EmitIntValue(unsigned(SymbolKind::S_GTHREAD32), 2);
-    } else {
-      OS.AddComment("Record kind: S_GDATA32");
-      OS.EmitIntValue(unsigned(SymbolKind::S_GDATA32), 2);
-    }
-  }
+  // DataSym record, see SymbolRecord.h for more info. Thread local data
+  // happens to have the same format as global data.
+  SymbolKind DataSym = GV->isThreadLocal()
+                           ? (DIGV->isLocalToUnit() ? SymbolKind::S_LTHREAD32
+                                                    : SymbolKind::S_GTHREAD32)
+                           : (DIGV->isLocalToUnit() ? SymbolKind::S_LDATA32
+                                                    : SymbolKind::S_GDATA32);
+  MCSymbol *DataEnd = beginSymbolRecord(DataSym);
   OS.AddComment("Type");
   OS.EmitIntValue(getCompleteTypeIndex(DIGV->getType()).getIndex(), 4);
   OS.AddComment("DataOffset");
@@ -2971,6 +2943,7 @@ void CodeViewDebug::emitDebugInfoForGlobal(const DIGlobalVariable *DIGV,
   OS.AddComment("Segment");
   OS.EmitCOFFSectionIndex(GVSym);
   OS.AddComment("Name");
-  emitNullTerminatedSymbolName(OS, DIGV->getName(), FixedLengthOfThisRecord);
-  OS.EmitLabel(DataEnd);
+  const unsigned LengthOfDataRecord = 12;
+  emitNullTerminatedSymbolName(OS, DIGV->getName(), LengthOfDataRecord);
+  endSymbolRecord(DataEnd);
 }
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index ef0f0c3635e501b8b5e73df307ef0dd3bfc5eede..fcc7c3290eb2c4aa1388dacf638cded494bb67b9 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -14,14 +14,14 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
 
-#include "DbgEntityHistoryCalculator.h"
-#include "DebugHandlerBase.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
+#include "llvm/CodeGen/DebugHandlerBase.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
@@ -54,6 +54,9 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
   BumpPtrAllocator Allocator;
   codeview::GlobalTypeTableBuilder TypeTable;
 
+  /// Whether to emit type record hashes into .debug$H.
+  bool EmitDebugGlobalHashes = false;
+
   /// The codeview CPU type used by the translation unit.
   codeview::CPUType TheCPU;
 
@@ -299,9 +302,18 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
   /// Returns an end label for use with endCVSubsection when the subsection is
   /// finished.
   MCSymbol *beginCVSubsection(codeview::DebugSubsectionKind Kind);
-
   void endCVSubsection(MCSymbol *EndLabel);
 
+  /// Opens a symbol record of the given kind. Returns an end label for use with
+  /// endSymbolRecord.
+  MCSymbol *beginSymbolRecord(codeview::SymbolKind Kind);
+  void endSymbolRecord(MCSymbol *SymEnd);
+
+  /// Emits an S_END, S_INLINESITE_END, or S_PROC_ID_END record. These records
+  /// are empty, so we emit them with a simpler assembly sequence that doesn't
+  /// involve labels.
+  void emitEndSymbolRecord(codeview::SymbolKind EndKind);
+
   void emitInlinedCallSite(const FunctionInfo &FI, const DILocation *InlinedAt,
                            const InlineSite &Site);
 
@@ -343,6 +355,10 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
   codeview::TypeIndex getTypeIndex(DITypeRef TypeRef,
                                    DITypeRef ClassTyRef = DITypeRef());
 
+  codeview::TypeIndex
+  getTypeIndexForThisPtr(DITypeRef TypeRef,
+                         const DISubroutineType *SubroutineTy);
+
   codeview::TypeIndex getTypeIndexForReferenceTo(DITypeRef TypeRef);
 
   codeview::TypeIndex getMemberFunctionType(const DISubprogram *SP,
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 301fd9ef81b3cfabb952c1deb76e003b0c47ba69..e27659494f08125d0125ed860fed1435037c0d2f 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -465,7 +465,7 @@ void DIEInteger::print(raw_ostream &O) const {
 /// EmitValue - Emit expression value.
 ///
 void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
-  AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form));
+  AP->EmitDebugValue(Expr, SizeOf(AP, Form));
 }
 
 /// SizeOf - Determine size of expression value in bytes.
diff --git a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index bb2fa7d9c72069bace064c968fc10745a052604b..09867822c30ac9f1c281cdc0108911edc96a3377 100644
--- a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DbgEntityHistoryCalculator.h"
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index a362dd40e3b16121df0df7534fb5a74bb164e99b..551cd36d19843d6cf1c0a68a37a5365ef5341008 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DebugHandlerBase.h"
+#include "llvm/CodeGen/DebugHandlerBase.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index d93c7f6c8459ea038b083b5109c986b770b4989d..1dca3f0fce5baf647f26da923f5d14e9746812a0 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1125,3 +1125,12 @@ bool DwarfCompileUnit::includeMinimalInlineScopes() const {
   return getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly ||
          (DD->useSplitDwarf() && !Skeleton);
 }
+
+void DwarfCompileUnit::addAddrTableBase() {
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  MCSymbol *Label = DD->getAddressPool().getLabel();
+  addSectionLabel(getUnitDie(),
+                  getDwarfVersion() >= 5 ? dwarf::DW_AT_addr_base
+                                         : dwarf::DW_AT_GNU_addr_base,
+                  Label, TLOF.getDwarfAddrSection()->getBeginSymbol());
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 13679c37fe547c45ac7dd833a91e09160305bae4..9ec22f68c12f1a0dc4b69b4f5cca2de589c75143 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -14,7 +14,6 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
 
-#include "DbgEntityHistoryCalculator.h"
 #include "DwarfDebug.h"
 #include "DwarfUnit.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -23,6 +22,7 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
 #include "llvm/CodeGen/DIE.h"
 #include "llvm/CodeGen/LexicalScopes.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -243,6 +243,9 @@ public:
 
   void emitHeader(bool UseOffsets) override;
 
+  /// Add the DW_AT_addr_base attribute to the unit DIE.
+  void addAddrTableBase();
+
   MCSymbol *getLabelBegin() const {
     assert(getSection());
     return LabelBegin;
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 070b8fe4ec1c4a1b7b1eb6cb4f7539270ba4604d..2d8c9e497388155f705894fe502285fbc0c4dc4b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -568,28 +568,10 @@ void DwarfDebug::addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const {
   U.addFlag(D, dwarf::DW_AT_GNU_pubnames);
 }
 
-// Create new DwarfCompileUnit for the given metadata node with tag
-// DW_TAG_compile_unit.
-DwarfCompileUnit &
-DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
-  if (auto *CU = CUMap.lookup(DIUnit))
-    return *CU;
-  StringRef FN = DIUnit->getFilename();
-  CompilationDir = DIUnit->getDirectory();
-
-  auto OwnedUnit = llvm::make_unique<DwarfCompileUnit>(
-      InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder);
-  DwarfCompileUnit &NewCU = *OwnedUnit;
+void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit,
+                                      DwarfCompileUnit &NewCU) {
   DIE &Die = NewCU.getUnitDie();
-  InfoHolder.addUnit(std::move(OwnedUnit));
-  if (useSplitDwarf()) {
-    NewCU.setSkeleton(constructSkeletonCU(NewCU));
-    NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name,
-                  Asm->TM.Options.MCOptions.SplitDwarfFile);
-  }
-
-  for (auto *IE : DIUnit->getImportedEntities())
-    NewCU.addImportedEntity(IE);
+  StringRef FN = DIUnit->getFilename();
 
   // LTO with assembly output shares a single line table amongst multiple CUs.
   // To avoid the compilation directory being ambiguous, let the line table
@@ -640,11 +622,6 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
                     dwarf::DW_FORM_data1, RVer);
   }
 
-  if (useSplitDwarf())
-    NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoDWOSection());
-  else
-    NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection());
-
   if (DIUnit->getDWOId()) {
     // This CU is either a clang module DWO or a skeleton CU.
     NewCU.addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8,
@@ -654,9 +631,34 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
       NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name,
                       DIUnit->getSplitDebugFilename());
   }
+}
+// Create new DwarfCompileUnit for the given metadata node with tag
+// DW_TAG_compile_unit.
+DwarfCompileUnit &
+DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
+  if (auto *CU = CUMap.lookup(DIUnit))
+    return *CU;
+
+  CompilationDir = DIUnit->getDirectory();
+
+  auto OwnedUnit = llvm::make_unique<DwarfCompileUnit>(
+      InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder);
+  DwarfCompileUnit &NewCU = *OwnedUnit;
+  InfoHolder.addUnit(std::move(OwnedUnit));
+
+  for (auto *IE : DIUnit->getImportedEntities())
+    NewCU.addImportedEntity(IE);
+
+  if (useSplitDwarf()) {
+    NewCU.setSkeleton(constructSkeletonCU(NewCU));
+    NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoDWOSection());
+  } else {
+    finishUnitAttributes(DIUnit, NewCU);
+    NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection());
+  }
 
   CUMap.insert({DIUnit, &NewCU});
-  CUDieMap.insert({&Die, &NewCU});
+  CUDieMap.insert({&NewCU.getUnitDie(), &NewCU});
   return NewCU;
 }
 
@@ -851,7 +853,12 @@ void DwarfDebug::finalizeModuleInfo() {
     // If we're splitting the dwarf out now that we've got the entire
     // CU then add the dwo id to it.
     auto *SkCU = TheCU.getSkeleton();
-    if (useSplitDwarf()) {
+    if (useSplitDwarf() && !empty(TheCU.getUnitDie().children())) {
+      finishUnitAttributes(TheCU.getCUNode(), TheCU);
+      TheCU.addString(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_name,
+                      Asm->TM.Options.MCOptions.SplitDwarfFile);
+      SkCU->addString(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_name,
+                      Asm->TM.Options.MCOptions.SplitDwarfFile);
       // Emit a unique identifier for this CU.
       uint64_t ID =
           DIEHash(Asm).computeCUSignature(DWOName, TheCU.getUnitDie());
@@ -870,6 +877,8 @@ void DwarfDebug::finalizeModuleInfo() {
         SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base,
                               Sym, Sym);
       }
+    } else if (SkCU) {
+      finishUnitAttributes(SkCU->getCUNode(), *SkCU);
     }
 
     // If we have code split among multiple sections or non-contiguous
@@ -882,7 +891,9 @@ void DwarfDebug::finalizeModuleInfo() {
 
     // We don't keep track of which addresses are used in which CU so this
     // is a bit pessimistic under LTO.
-    if (!AddrPool.isEmpty())
+    if (!AddrPool.isEmpty() &&
+        (getDwarfVersion() >= 5 ||
+         (SkCU && !empty(TheCU.getUnitDie().children()))))
       U.addAddrTableBase();
 
     if (unsigned NumRanges = TheCU.getRanges().size()) {
@@ -1839,8 +1850,8 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name,
     if (GnuStyle) {
       dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheU, Entity);
       Asm->OutStreamer->AddComment(
-          Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
-          dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+          Twine("Attributes: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) +
+          ", " + dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
       Asm->emitInt8(Desc.toBits());
     }
 
@@ -2287,7 +2298,7 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
     // contributions.
     auto *Base = CUBase;
     if (!Base && (P.second.size() > 1 || DwarfVersion < 5) &&
-        (UseDwarfRangesBaseAddressSpecifier || DwarfVersion >= 5)) {
+        (CU.getCUNode()->getRangesBaseAddress() || DwarfVersion >= 5)) {
       BaseIsSet = true;
       // FIXME/use care: This may not be a useful base address if it's not
       // the lowest address/range in this object.
@@ -2483,8 +2494,6 @@ void DwarfDebug::emitDebugMacinfo() {
 
 void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
                                   std::unique_ptr<DwarfCompileUnit> NewU) {
-  NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name,
-                  Asm->TM.Options.MCOptions.SplitDwarfFile);
 
   if (!CompilationDir.empty())
     NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
@@ -2609,10 +2618,18 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
   NewTU.setTypeSignature(Signature);
   Ins.first->second = Signature;
 
-  if (useSplitDwarf())
-    NewTU.setSection(Asm->getObjFileLowering().getDwarfTypesDWOSection());
-  else {
-    NewTU.setSection(Asm->getObjFileLowering().getDwarfTypesSection(Signature));
+  if (useSplitDwarf()) {
+    MCSection *Section =
+        getDwarfVersion() <= 4
+            ? Asm->getObjFileLowering().getDwarfTypesDWOSection()
+            : Asm->getObjFileLowering().getDwarfInfoDWOSection();
+    NewTU.setSection(Section);
+  } else {
+    MCSection *Section =
+        getDwarfVersion() <= 4
+            ? Asm->getObjFileLowering().getDwarfTypesSection(Signature)
+            : Asm->getObjFileLowering().getDwarfInfoSection(Signature);
+    NewTU.setSection(Section);
     // Non-split type units reuse the compile unit's line table.
     CU.applyStmtList(UnitDie);
   }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index c73d442af2fd6ea9aced119ce57a04d73482ec68..8a31e989b289a4c3c8c87310be2cd9041f3557f3 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -15,8 +15,6 @@
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
 
 #include "AddressPool.h"
-#include "DbgEntityHistoryCalculator.h"
-#include "DebugHandlerBase.h"
 #include "DebugLocStream.h"
 #include "DwarfFile.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -31,6 +29,8 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/AccelTable.h"
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
+#include "llvm/CodeGen/DebugHandlerBase.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
@@ -540,6 +540,8 @@ class DwarfDebug : public DebugHandlerBase {
   /// Create new DwarfCompileUnit for the given metadata node with tag
   /// DW_TAG_compile_unit.
   DwarfCompileUnit &getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit);
+  void finishUnitAttributes(const DICompileUnit *DIUnit,
+                            DwarfCompileUnit &NewCU);
 
   /// Construct imported_module or imported_declaration DIE.
   void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 4e410bb49beab02e459d05c33eacf058fc4405fb..78ccad4814111b77bfff2801c6d786c19e70fe63 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -39,13 +39,17 @@ void DwarfFile::emitUnit(DwarfUnit *TheU, bool UseOffsets) {
   if (TheU->getCUNode()->isDebugDirectivesOnly())
     return;
 
-  DIE &Die = TheU->getUnitDie();
-  MCSection *USection = TheU->getSection();
-  Asm->OutStreamer->SwitchSection(USection);
+  MCSection *S = TheU->getSection();
 
+  if (!S)
+    return;
+
+  Asm->OutStreamer->SwitchSection(S);
   TheU->emitHeader(UseOffsets);
+  Asm->emitDwarfDIE(TheU->getUnitDie());
 
-  Asm->emitDwarfDIE(Die);
+  if (MCSymbol *EndLabel = TheU->getEndLabel())
+    Asm->OutStreamer->EmitLabel(EndLabel);
 }
 
 // Compute the size and offset for each DIE.
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 2053395808f1bcf1f16e5fb2d349a8eeb4d165ee..991dac7cf35b0e96f6d5476abfd2334d26e031e9 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1553,7 +1553,14 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
 void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
   // Emit size of content not including length itself
   Asm->OutStreamer->AddComment("Length of Unit");
-  Asm->emitInt32(getHeaderSize() + getUnitDie().getSize());
+  if (!DD->useSectionsAsReferences()) {
+    StringRef Prefix = isDwoUnit() ? "debug_info_dwo_" : "debug_info_";
+    MCSymbol *BeginLabel = Asm->createTempSymbol(Prefix + "start");
+    EndLabel = Asm->createTempSymbol(Prefix + "end");
+    Asm->EmitLabelDifference(EndLabel, BeginLabel, 4);
+    Asm->OutStreamer->EmitLabel(BeginLabel);
+  } else
+    Asm->emitInt32(getHeaderSize() + getUnitDie().getSize());
 
   Asm->OutStreamer->AddComment("DWARF version number");
   unsigned Version = DD->getDwarfVersion();
@@ -1664,12 +1671,3 @@ void DwarfUnit::addLoclistsBase() {
                   DU->getLoclistsTableBaseSym(),
                   TLOF.getDwarfLoclistsSection()->getBeginSymbol());
 }
-
-void DwarfUnit::addAddrTableBase() {
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-  MCSymbol *Label = DD->getAddressPool().getLabel();
-  addSectionLabel(getUnitDie(),
-                  getDwarfVersion() >= 5 ? dwarf::DW_AT_addr_base
-                                         : dwarf::DW_AT_GNU_addr_base,
-                  Label, TLOF.getDwarfAddrSection()->getBeginSymbol());
-}
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 860d165318430a9611bd09a569b41fbc2c1a9498..a59ebb7c1465c93c7cd27f0a0841408de4fe7722 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -49,6 +49,9 @@ protected:
   /// Target of Dwarf emission.
   AsmPrinter *Asm;
 
+  /// Emitted at the end of the CU and used to compute the CU Length field.
+  MCSymbol *EndLabel = nullptr;
+
   // Holders for some common dwarf information.
   DwarfDebug *DD;
   DwarfFile *DU;
@@ -82,6 +85,7 @@ protected:
 public:
   // Accessors.
   AsmPrinter* getAsmPrinter() const { return Asm; }
+  MCSymbol *getEndLabel() const { return EndLabel; }
   uint16_t getLanguage() const { return CUNode->getSourceLanguage(); }
   const DICompileUnit *getCUNode() const { return CUNode; }
 
@@ -275,9 +279,6 @@ public:
   /// Add the DW_AT_loclists_base attribute to the unit DIE.
   void addLoclistsBase();
 
-  /// Add the DW_AT_addr_base attribute to the unit DIE.
-  void addAddrTableBase();
-
   virtual DwarfCompileUnit &getCU() = 0;
 
   void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy);
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index e3a6f8e9d587268e821f716a0ef2871ec04c5745..ce912d032c6dac48d50709b223f3b89dc5a1125f 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
 
-#include "AsmPrinterHandler.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinterHandler.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index 49cc376fcc9837acce8e2239a54f9bfcf8ea8a0e..34677ecc9e6945212e313b03249ad1cc1e1d35e4 100644
--- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -15,10 +15,10 @@
 
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/BuiltinGCs.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 #include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/CodeGen/GCs.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index 59a57ed30d105104ec6c481494f476d1c4a0b079..3479a00def23b412b35f439e9e416c79d6b744bb 100644
--- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -15,9 +15,9 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/BuiltinGCs.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
-#include "llvm/CodeGen/GCs.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Mangler.h"
diff --git a/lib/CodeGen/AsmPrinter/WasmException.cpp b/lib/CodeGen/AsmPrinter/WasmException.cpp
index 46745d08c9f3b7103de89ed30ddf05833c8c4cfc..527e5ae50146a5d98bbbe6d4cd34fa8d9d043c54 100644
--- a/lib/CodeGen/AsmPrinter/WasmException.cpp
+++ b/lib/CodeGen/AsmPrinter/WasmException.cpp
@@ -13,9 +13,25 @@
 //===----------------------------------------------------------------------===//
 
 #include "WasmException.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 using namespace llvm;
 
+void WasmException::endModule() {
+  // This is the symbol used in 'throw' and 'if_except' instruction to denote
+  // this is a C++ exception. This symbol has to be emitted somewhere once in
+  // the module.  Check if the symbol has already been created, i.e., we have at
+  // least one 'throw' or 'if_except' instruction in the module, and emit the
+  // symbol only if so.
+  SmallString<60> NameStr;
+  Mangler::getNameWithPrefix(NameStr, "__cpp_exception", Asm->getDataLayout());
+  if (Asm->OutContext.lookupSymbol(NameStr)) {
+    MCSymbol *ExceptionSym = Asm->GetExternalSymbolSymbol("__cpp_exception");
+    Asm->OutStreamer->EmitLabel(ExceptionSym);
+  }
+}
+
 void WasmException::markFunctionEnd() {
   // Get rid of any dead landing pads.
   if (!Asm->MF->getLandingPads().empty()) {
diff --git a/lib/CodeGen/AsmPrinter/WasmException.h b/lib/CodeGen/AsmPrinter/WasmException.h
index 09a9a25ce8d0027564e8867f3b6e55801014df68..cbdb42457cf80eaf75eb19b303365a310223e4b6 100644
--- a/lib/CodeGen/AsmPrinter/WasmException.h
+++ b/lib/CodeGen/AsmPrinter/WasmException.h
@@ -24,7 +24,7 @@ class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer {
 public:
   WasmException(AsmPrinter *A) : EHStreamer(A) {}
 
-  void endModule() override {}
+  void endModule() override;
   void beginFunction(const MachineFunction *MF) override {}
   virtual void markFunctionEnd() override;
   void endFunction(const MachineFunction *MF) override;
diff --git a/lib/CodeGen/AsmPrinter/WinCFGuard.h b/lib/CodeGen/AsmPrinter/WinCFGuard.h
index 124e8f04bfad8506beb2a9420cfcdad4232f6af8..28f119e359661f4d2ca89eaeddb9c716d2164f0f 100644
--- a/lib/CodeGen/AsmPrinter/WinCFGuard.h
+++ b/lib/CodeGen/AsmPrinter/WinCFGuard.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WINCFGUARD_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_WINCFGUARD_H
 
-#include "AsmPrinterHandler.h"
+#include "llvm/CodeGen/AsmPrinterHandler.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index c1b9879401527c1b48b7e9ac95a036d2a96faf77..95581c09dd1c46d5487ae2bd60b21fe1f40292cf 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -91,6 +91,7 @@ namespace {
     AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
     void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
     void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
+    void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);
 
     AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
     static Value *insertRMWCmpXchgLoop(
@@ -944,6 +945,35 @@ void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
   AI->eraseFromParent();
 }
 
+void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) {
+  IRBuilder<> Builder(CI);
+
+  PartwordMaskValues PMV = createMaskInstrs(
+      Builder, CI, CI->getCompareOperand()->getType(), CI->getPointerOperand(),
+      TLI->getMinCmpXchgSizeInBits() / 8);
+
+  Value *CmpVal_Shifted = Builder.CreateShl(
+      Builder.CreateZExt(CI->getCompareOperand(), PMV.WordType), PMV.ShiftAmt,
+      "CmpVal_Shifted");
+  Value *NewVal_Shifted = Builder.CreateShl(
+      Builder.CreateZExt(CI->getNewValOperand(), PMV.WordType), PMV.ShiftAmt,
+      "NewVal_Shifted");
+  Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
+      Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask,
+      CI->getSuccessOrdering());
+  Value *FinalOldVal = Builder.CreateTrunc(
+      Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
+
+  Value *Res = UndefValue::get(CI->getType());
+  Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
+  Value *Success = Builder.CreateICmpEQ(
+      CmpVal_Shifted, Builder.CreateAnd(OldVal, PMV.Mask), "Success");
+  Res = Builder.CreateInsertValue(Res, Success, 1);
+
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+}
+
 Value *AtomicExpand::insertRMWLLSCLoop(
     IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
     AtomicOrdering MemOpOrder,
@@ -1366,8 +1396,8 @@ bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
     return expandAtomicCmpXchg(CI);
   }
   case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
-    llvm_unreachable(
-        "MaskedIntrinsic expansion of cmpxhg not yet implemented");
+    expandAtomicCmpXchgToMaskedIntrinsic(CI);
+    return true;
   }
 }
 
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index fa027eedd4d95979b6e88d21c6d3fc54178a8f6a..efbfd5f4ab2c582e9b06674810eb0111de99ef0f 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -298,7 +298,7 @@ static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
 
 ///  Whether MI should be counted as an instruction when calculating common tail.
 static bool countsAsInstruction(const MachineInstr &MI) {
-  return !(MI.isDebugValue() || MI.isCFIInstruction());
+  return !(MI.isDebugInstr() || MI.isCFIInstruction());
 }
 
 /// ComputeCommonTailLength - Given two machine basic blocks, compute the number
@@ -1363,9 +1363,9 @@ static void copyDebugInfoToPredecessor(const TargetInstrInfo *TII,
                                        MachineBasicBlock &PredMBB) {
   auto InsertBefore = PredMBB.getFirstTerminator();
   for (MachineInstr &MI : MBB.instrs())
-    if (MI.isDebugValue()) {
+    if (MI.isDebugInstr()) {
       TII->duplicate(PredMBB, InsertBefore, MI);
-      LLVM_DEBUG(dbgs() << "Copied debug value from empty block to pred: "
+      LLVM_DEBUG(dbgs() << "Copied debug entity from empty block to pred: "
                         << MI);
     }
 }
@@ -1375,9 +1375,9 @@ static void copyDebugInfoToSuccessor(const TargetInstrInfo *TII,
                                      MachineBasicBlock &SuccMBB) {
   auto InsertBefore = SuccMBB.SkipPHIsAndLabels(SuccMBB.begin());
   for (MachineInstr &MI : MBB.instrs())
-    if (MI.isDebugValue()) {
+    if (MI.isDebugInstr()) {
       TII->duplicate(SuccMBB, InsertBefore, MI);
-      LLVM_DEBUG(dbgs() << "Copied debug value from empty block to succ: "
+      LLVM_DEBUG(dbgs() << "Copied debug entity from empty block to succ: "
                         << MI);
     }
 }
diff --git a/lib/CodeGen/BuiltinGCs.cpp b/lib/CodeGen/BuiltinGCs.cpp
index 3a9b20aa661d64dcf0b59d63c86e4d196e01df5e..93939e573b7baa41052cf40c6be5c82cb8b6b753 100644
--- a/lib/CodeGen/BuiltinGCs.cpp
+++ b/lib/CodeGen/BuiltinGCs.cpp
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/BuiltinGCs.h"
 #include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/CodeGen/GCs.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/Support/Casting.h"
 
@@ -28,10 +28,8 @@ namespace {
 class ErlangGC : public GCStrategy {
 public:
   ErlangGC() {
-    InitRoots = false;
-    NeededSafePoints = 1 << GC::PostCall;
+    NeededSafePoints = true;
     UsesMetadata = true;
-    CustomRoots = false;
   }
 };
 
@@ -41,7 +39,7 @@ public:
 class OcamlGC : public GCStrategy {
 public:
   OcamlGC() {
-    NeededSafePoints = 1 << GC::PostCall;
+    NeededSafePoints = true;
     UsesMetadata = true;
   }
 };
@@ -56,10 +54,7 @@ public:
 /// while introducing only minor runtime overhead.
 class ShadowStackGC : public GCStrategy {
 public:
-  ShadowStackGC() {
-    InitRoots = true;
-    CustomRoots = true;
-  }
+  ShadowStackGC() {}
 };
 
 /// A GCStrategy which serves as an example for the usage of a statepoint based
@@ -74,10 +69,8 @@ public:
     UseStatepoints = true;
     // These options are all gc.root specific, we specify them so that the
     // gc.root lowering code doesn't run.
-    InitRoots = false;
-    NeededSafePoints = 0;
+    NeededSafePoints = false;
     UsesMetadata = false;
-    CustomRoots = false;
   }
 
   Optional<bool> isGCManagedPointer(const Type *Ty) const override {
@@ -108,10 +101,8 @@ public:
     UseStatepoints = true;
     // These options are all gc.root specific, we specify them so that the
     // gc.root lowering code doesn't run.
-    InitRoots = false;
-    NeededSafePoints = 0;
+    NeededSafePoints = false;
     UsesMetadata = false;
-    CustomRoots = false;
   }
 
   Optional<bool> isGCManagedPointer(const Type *Ty) const override {
@@ -136,9 +127,5 @@ static GCRegistry::Add<StatepointGC> D("statepoint-example",
                                        "an example strategy for statepoint");
 static GCRegistry::Add<CoreCLRGC> E("coreclr", "CoreCLR-compatible GC");
 
-// Provide hooks to ensure the containing library is fully loaded.
-void llvm::linkErlangGC() {}
-void llvm::linkOcamlGC() {}
-void llvm::linkShadowStackGC() {}
-void llvm::linkStatepointExampleGC() {}
-void llvm::linkCoreCLRGC() {}
+// Provide hook to ensure the containing library is fully loaded.
+void llvm::linkAllBuiltinGCs() {}
diff --git a/lib/CodeGen/CFIInstrInserter.cpp b/lib/CodeGen/CFIInstrInserter.cpp
index 4fd119430cff8d335b0f53091f90b99a803daa23..c4799855a2b30eed983974b647511ea9a692da4d 100644
--- a/lib/CodeGen/CFIInstrInserter.cpp
+++ b/lib/CodeGen/CFIInstrInserter.cpp
@@ -207,6 +207,7 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
       case MCCFIInstruction::OpUndefined:
       case MCCFIInstruction::OpRegister:
       case MCCFIInstruction::OpWindowSave:
+      case MCCFIInstruction::OpNegateRAState:
       case MCCFIInstruction::OpGnuArgsSize:
         break;
       }
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index fbdc511eea7fa8542d0ffb46b7b42169ec65f0c8..e76f9f8ed4e7c90202ff9e6f7685a1fec364a13c 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_library(LLVMCodeGen
   InlineSpiller.cpp
   InterferenceCache.cpp
   InterleavedAccessPass.cpp
+  InterleavedLoadCombinePass.cpp
   IntrinsicLowering.cpp
   LatencyPriorityQueue.cpp
   LazyMachineBlockFrequencyInfo.cpp
@@ -83,7 +84,6 @@ add_llvm_library(LLVMCodeGen
   MachineOperand.cpp
   MachineOptimizationRemarkEmitter.cpp
   MachineOutliner.cpp
-  MachinePassRegistry.cpp
   MachinePipeliner.cpp
   MachinePostDominators.cpp
   MachineRegionInfo.cpp
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 2f845354c57027832713d2f55339e7a29c95f2fb..66166482c78bcd7ca31566a3f7b339c8cf929a61 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -42,6 +42,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeIfConverterPass(Registry);
   initializeImplicitNullChecksPass(Registry);
   initializeIndirectBrExpandPassPass(Registry);
+  initializeInterleavedLoadCombinePass(Registry);
   initializeInterleavedAccessPass(Registry);
   initializeLiveDebugValuesPass(Registry);
   initializeLiveDebugVariablesPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 651873bb91182a1636f9f7278f6828c995c47bc1..392e8bc0db48021a640b87c392969a4343d67846 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -416,7 +416,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   OptSize = F.optForSize();
 
   ProfileSummaryInfo *PSI =
-      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   if (ProfileGuidedSectionPrefix) {
     if (PSI->isFunctionHotInCallGraph(&F, *BFI))
       F.setSectionPrefix(".hot");
@@ -2362,6 +2362,8 @@ class TypePromotionTransaction {
 
     /// Keep track of the original uses (pair Instruction, Index).
     SmallVector<InstructionAndIdx, 4> OriginalUses;
+    /// Keep track of the debug users.
+    SmallVector<DbgValueInst *, 1> DbgValues;
 
     using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;
 
@@ -2375,6 +2377,10 @@ class TypePromotionTransaction {
         Instruction *UserI = cast<Instruction>(U.getUser());
         OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
       }
+      // Record the debug uses separately. They are not in the instruction's
+      // use list, but they are replaced by RAUW.
+      findDbgValues(DbgValues, Inst);
+
       // Now, we can replace the uses.
       Inst->replaceAllUsesWith(New);
     }
@@ -2387,6 +2393,15 @@ class TypePromotionTransaction {
            UseIt != EndIt; ++UseIt) {
         UseIt->Inst->setOperand(UseIt->Idx, Inst);
       }
+      // RAUW has replaced all original uses with references to the new value,
+      // including the debug uses. Since we are undoing the replacements,
+      // the original debug uses must also be reinstated to maintain the
+      // correctness and utility of debug value instructions.
+      for (auto *DVI: DbgValues) {
+        LLVMContext &Ctx = Inst->getType()->getContext();
+        auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst));
+        DVI->setOperand(0, MV);
+      }
     }
   };
 
@@ -2657,15 +2672,159 @@ private:
                              Value *PromotedOperand) const;
 };
 
+class PhiNodeSet;
+
+/// An iterator for PhiNodeSet.
+class PhiNodeSetIterator {
+  PhiNodeSet * const Set;
+  size_t CurrentIndex = 0;
+
+public:
+  /// The constructor. Start should point to either a valid element, or be equal
+  /// to the size of the underlying SmallVector of the PhiNodeSet.
+  PhiNodeSetIterator(PhiNodeSet * const Set, size_t Start);
+  PHINode * operator*() const;
+  PhiNodeSetIterator& operator++();
+  bool operator==(const PhiNodeSetIterator &RHS) const;
+  bool operator!=(const PhiNodeSetIterator &RHS) const;
+};
+
+/// Keeps a set of PHINodes.
+///
+/// This is a minimal set implementation for a specific use case:
+/// It is very fast when there are very few elements, but also provides good
+/// performance when there are many. It is similar to SmallPtrSet, but also
+/// provides iteration by insertion order, which is deterministic and stable
+/// across runs. It is also similar to SmallSetVector, but provides removing
+/// elements in O(1) time. This is achieved by not actually removing the element
+/// from the underlying vector, so comes at the cost of using more memory, but
+/// that is fine, since PhiNodeSets are used as short lived objects.
+class PhiNodeSet {
+  friend class PhiNodeSetIterator;
+
+  using MapType = SmallDenseMap<PHINode *, size_t, 32>;
+  using iterator =  PhiNodeSetIterator;
+
+  /// Keeps the elements in the order of their insertion in the underlying
+  /// vector. To achieve constant time removal, it never deletes any element.
+  SmallVector<PHINode *, 32> NodeList;
+
+  /// Keeps the elements in the underlying set implementation. This (and not the
+  /// NodeList defined above) is the source of truth on whether an element
+  /// is actually in the collection.
+  MapType NodeMap;
+
+  /// Points to the first valid (not deleted) element when the set is not empty
+  /// and the value is not zero. Equals to the size of the underlying vector
+  /// when the set is empty. When the value is 0, as in the beginning, the
+  /// first element may or may not be valid.
+  size_t FirstValidElement = 0;
+
+public:
+  /// Inserts a new element to the collection.
+  /// \returns true if the element is actually added, i.e. was not in the
+  /// collection before the operation.
+  bool insert(PHINode *Ptr) {
+    if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) {
+      NodeList.push_back(Ptr);
+      return true;
+    }
+    return false;
+  }
+
+  /// Removes the element from the collection.
+  /// \returns whether the element is actually removed, i.e. was in the
+  /// collection before the operation.
+  bool erase(PHINode *Ptr) {
+    auto it = NodeMap.find(Ptr);
+    if (it != NodeMap.end()) {
+      NodeMap.erase(Ptr);
+      SkipRemovedElements(FirstValidElement);
+      return true;
+    }
+    return false;
+  }
+
+  /// Removes all elements and clears the collection.
+  void clear() {
+    NodeMap.clear();
+    NodeList.clear();
+    FirstValidElement = 0;
+  }
+
+  /// \returns an iterator that will iterate the elements in the order of
+  /// insertion.
+  iterator begin() {
+    if (FirstValidElement == 0)
+      SkipRemovedElements(FirstValidElement);
+    return PhiNodeSetIterator(this, FirstValidElement);
+  }
+
+  /// \returns an iterator that points to the end of the collection.
+  iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }
+
+  /// Returns the number of elements in the collection.
+  size_t size() const {
+    return NodeMap.size();
+  }
+
+  /// \returns 1 if the given element is in the collection, and 0 if otherwise.
+  size_t count(PHINode *Ptr) const {
+    return NodeMap.count(Ptr);
+  }
+
+private:
+  /// Updates the CurrentIndex so that it will point to a valid element.
+  ///
+  /// If the element of NodeList at CurrentIndex is valid, it does not
+  /// change it. If there are no more valid elements, it updates CurrentIndex
+  /// to point to the end of the NodeList.
+  void SkipRemovedElements(size_t &CurrentIndex) {
+    while (CurrentIndex < NodeList.size()) {
+      auto it = NodeMap.find(NodeList[CurrentIndex]);
+      // If the element has been deleted and added again later, NodeMap will
+      // point to a different index, so CurrentIndex will still be invalid.
+      if (it != NodeMap.end() && it->second == CurrentIndex)
+        break;
+      ++CurrentIndex;
+    }
+  }
+};
+
+PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)
+    : Set(Set), CurrentIndex(Start) {}
+
+PHINode * PhiNodeSetIterator::operator*() const {
+  assert(CurrentIndex < Set->NodeList.size() &&
+         "PhiNodeSet access out of range");
+  return Set->NodeList[CurrentIndex];
+}
+
+PhiNodeSetIterator& PhiNodeSetIterator::operator++() {
+  assert(CurrentIndex < Set->NodeList.size() &&
+         "PhiNodeSet access out of range");
+  ++CurrentIndex;
+  Set->SkipRemovedElements(CurrentIndex);
+  return *this;
+}
+
+bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {
+  return CurrentIndex == RHS.CurrentIndex;
+}
+
+bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {
+  return !((*this) == RHS);
+}
+
 /// Keep track of simplification of Phi nodes.
 /// Accept the set of all phi nodes and erase phi node from this set
 /// if it is simplified.
 class SimplificationTracker {
   DenseMap<Value *, Value *> Storage;
   const SimplifyQuery &SQ;
-  // Tracks newly created Phi nodes. We use a SetVector to get deterministic
-  // order when iterating over the set in MatchPhiSet.
-  SmallSetVector<PHINode *, 32> AllPhiNodes;
+  // Tracks newly created Phi nodes. The elements are iterated by insertion
+  // order.
+  PhiNodeSet AllPhiNodes;
   // Tracks newly created Select nodes.
   SmallPtrSet<SelectInst *, 32> AllSelectNodes;
 
@@ -2697,7 +2856,7 @@ public:
           Put(PI, V);
           PI->replaceAllUsesWith(V);
           if (auto *PHI = dyn_cast<PHINode>(PI))
-            AllPhiNodes.remove(PHI);
+            AllPhiNodes.erase(PHI);
           if (auto *Select = dyn_cast<SelectInst>(PI))
             AllSelectNodes.erase(Select);
           PI->eraseFromParent();
@@ -2720,11 +2879,11 @@ public:
     assert(Get(To) == To && "Replacement PHI node is already replaced.");
     Put(From, To);
     From->replaceAllUsesWith(To);
-    AllPhiNodes.remove(From);
+    AllPhiNodes.erase(From);
     From->eraseFromParent();
   }
 
-  SmallSetVector<PHINode *, 32>& newPhiNodes() { return AllPhiNodes; }
+  PhiNodeSet& newPhiNodes() { return AllPhiNodes; }
 
   void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }
 
@@ -2752,8 +2911,7 @@ public:
 
 /// A helper class for combining addressing modes.
 class AddressingModeCombiner {
-  typedef std::pair<Value *, BasicBlock *> ValueInBB;
-  typedef DenseMap<ValueInBB, Value *> FoldAddrToValueMapping;
+  typedef DenseMap<Value *, Value *> FoldAddrToValueMapping;
   typedef std::pair<PHINode *, PHINode *> PHIPair;
 
 private:
@@ -2773,10 +2931,10 @@ private:
   const SimplifyQuery &SQ;
 
   /// Original Address.
-  ValueInBB Original;
+  Value *Original;
 
 public:
-  AddressingModeCombiner(const SimplifyQuery &_SQ, ValueInBB OriginalValue)
+  AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue)
       : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {}
 
   /// Get the combined AddrMode
@@ -2872,46 +3030,40 @@ public:
   }
 
 private:
-  /// Initialize Map with anchor values. For address seen in some BB
+  /// Initialize Map with anchor values. For address seen
   /// we set the value of different field saw in this address.
-  /// If address is not an instruction than basic block is set to null.
   /// At the same time we find a common type for different field we will
   /// use to create new Phi/Select nodes. Keep it in CommonType field.
   /// Return false if there is no common type found.
   bool initializeMap(FoldAddrToValueMapping &Map) {
     // Keep track of keys where the value is null. We will need to replace it
     // with constant null when we know the common type.
-    SmallVector<ValueInBB, 2> NullValue;
+    SmallVector<Value *, 2> NullValue;
     Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
     for (auto &AM : AddrModes) {
-      BasicBlock *BB = nullptr;
-      if (Instruction *I = dyn_cast<Instruction>(AM.OriginalValue))
-        BB = I->getParent();
-
       Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);
       if (DV) {
         auto *Type = DV->getType();
         if (CommonType && CommonType != Type)
           return false;
         CommonType = Type;
-        Map[{ AM.OriginalValue, BB }] = DV;
+        Map[AM.OriginalValue] = DV;
       } else {
-        NullValue.push_back({ AM.OriginalValue, BB });
+        NullValue.push_back(AM.OriginalValue);
       }
     }
     assert(CommonType && "At least one non-null value must be!");
-    for (auto VIBB : NullValue)
-      Map[VIBB] = Constant::getNullValue(CommonType);
+    for (auto *V : NullValue)
+      Map[V] = Constant::getNullValue(CommonType);
     return true;
   }
 
-  /// We have mapping between value A and basic block where value A
-  /// seen to other value B where B was a field in addressing mode represented
-  /// by A. Also we have an original value C representing an address in some
-  /// basic block. Traversing from C through phi and selects we ended up with
-  /// A's in a map. This utility function tries to find a value V which is a
-  /// field in addressing mode C and traversing through phi nodes and selects
-  /// we will end up in corresponded values B in a map.
+  /// We have mapping between value A and other value B where B was a field in
+  /// addressing mode represented by A. Also we have an original value C
+  /// representing an address we start with. Traversing from C through phi and
+  /// selects we ended up with A's in a map. This utility function tries to find
+  /// a value V which is a field in addressing mode C and traversing through phi
+  /// nodes and selects we will end up in corresponded values B in a map.
   /// The utility will create a new Phi/Selects if needed.
   // The simple example looks as follows:
   // BB1:
@@ -2924,22 +3076,24 @@ private:
   //   p = phi [p1, BB1], [p2, BB2]
   //   v = load p
   // Map is
-  //   <p1, BB1> -> b1
-  //   <p2, BB2> -> b2
+  //   p1 -> b1
+  //   p2 -> b2
   // Request is
-  //   <p, BB3> -> ?
-  // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3
+  //   p -> ?
+  // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.
   Value *findCommon(FoldAddrToValueMapping &Map) {
     // Tracks the simplification of newly created phi nodes. The reason we use
     // this mapping is because we will add new created Phi nodes in AddrToBase.
     // Simplification of Phi nodes is recursive, so some Phi node may
-    // be simplified after we added it to AddrToBase.
+    // be simplified after we added it to AddrToBase. In reality this
+    // simplification is possible only if original phi/selects were not
+    // simplified yet.
     // Using this mapping we can find the current value in AddrToBase.
     SimplificationTracker ST(SQ);
 
     // First step, DFS to create PHI nodes for all intermediate blocks.
     // Also fill traverse order for the second step.
-    SmallVector<ValueInBB, 32> TraverseOrder;
+    SmallVector<Value *, 32> TraverseOrder;
     InsertPlaceholders(Map, TraverseOrder, ST);
 
     // Second Step, fill new nodes by merged values and simplify if possible.
@@ -2969,7 +3123,7 @@ private:
   /// Matcher tracks the matched Phi nodes.
   bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
                     SmallSetVector<PHIPair, 8> &Matcher,
-                    SmallSetVector<PHINode *, 32> &PhiNodesToMatch) {
+                    PhiNodeSet &PhiNodesToMatch) {
     SmallVector<PHIPair, 8> WorkList;
     Matcher.insert({ PHI, Candidate });
     WorkList.push_back({ PHI, Candidate });
@@ -3018,11 +3172,12 @@ private:
   /// Returns false if this matching fails and creation of new Phi is disabled.
   bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
                    unsigned &PhiNotMatchedCount) {
-    // Use a SetVector for Matched to make sure we do replacements (ReplacePhi)
-    // in a deterministic order below.
+    // Matched and PhiNodesToMatch iterate their elements in a deterministic
+    // order, so the replacements (ReplacePhi) are also done in a deterministic
+    // order.
     SmallSetVector<PHIPair, 8> Matched;
     SmallPtrSet<PHINode *, 8> WillNotMatch;
-    SmallSetVector<PHINode *, 32> &PhiNodesToMatch = ST.newPhiNodes();
+    PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
     while (PhiNodesToMatch.size()) {
       PHINode *PHI = *PhiNodesToMatch.begin();
 
@@ -3057,129 +3212,86 @@ private:
       // Just remove all seen values in matcher. They will not match anything.
       PhiNotMatchedCount += WillNotMatch.size();
       for (auto *P : WillNotMatch)
-        PhiNodesToMatch.remove(P);
+        PhiNodesToMatch.erase(P);
     }
     return true;
   }
-  /// Fill the placeholder with values from predecessors and simplify it.
+  /// Fill the placeholders with values from predecessors and simplify them.
   void FillPlaceholders(FoldAddrToValueMapping &Map,
-                        SmallVectorImpl<ValueInBB> &TraverseOrder,
+                        SmallVectorImpl<Value *> &TraverseOrder,
                         SimplificationTracker &ST) {
     while (!TraverseOrder.empty()) {
-      auto Current = TraverseOrder.pop_back_val();
+      Value *Current = TraverseOrder.pop_back_val();
       assert(Map.find(Current) != Map.end() && "No node to fill!!!");
-      Value *CurrentValue = Current.first;
-      BasicBlock *CurrentBlock = Current.second;
       Value *V = Map[Current];
 
       if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
         // CurrentValue also must be Select.
-        auto *CurrentSelect = cast<SelectInst>(CurrentValue);
+        auto *CurrentSelect = cast<SelectInst>(Current);
         auto *TrueValue = CurrentSelect->getTrueValue();
-        ValueInBB TrueItem = { TrueValue, isa<Instruction>(TrueValue)
-                                              ? CurrentBlock
-                                              : nullptr };
-        assert(Map.find(TrueItem) != Map.end() && "No True Value!");
-        Select->setTrueValue(ST.Get(Map[TrueItem]));
+        assert(Map.find(TrueValue) != Map.end() && "No True Value!");
+        Select->setTrueValue(ST.Get(Map[TrueValue]));
         auto *FalseValue = CurrentSelect->getFalseValue();
-        ValueInBB FalseItem = { FalseValue, isa<Instruction>(FalseValue)
-                                                ? CurrentBlock
-                                                : nullptr };
-        assert(Map.find(FalseItem) != Map.end() && "No False Value!");
-        Select->setFalseValue(ST.Get(Map[FalseItem]));
+        assert(Map.find(FalseValue) != Map.end() && "No False Value!");
+        Select->setFalseValue(ST.Get(Map[FalseValue]));
       } else {
         // Must be a Phi node then.
         PHINode *PHI = cast<PHINode>(V);
+        auto *CurrentPhi = dyn_cast<PHINode>(Current);
         // Fill the Phi node with values from predecessors.
-        bool IsDefinedInThisBB =
-            cast<Instruction>(CurrentValue)->getParent() == CurrentBlock;
-        auto *CurrentPhi = dyn_cast<PHINode>(CurrentValue);
-        for (auto B : predecessors(CurrentBlock)) {
-          Value *PV = IsDefinedInThisBB
-                          ? CurrentPhi->getIncomingValueForBlock(B)
-                          : CurrentValue;
-          ValueInBB item = { PV, isa<Instruction>(PV) ? B : nullptr };
-          assert(Map.find(item) != Map.end() && "No predecessor Value!");
-          PHI->addIncoming(ST.Get(Map[item]), B);
+        for (auto B : predecessors(PHI->getParent())) {
+          Value *PV = CurrentPhi->getIncomingValueForBlock(B);
+          assert(Map.find(PV) != Map.end() && "No predecessor Value!");
+          PHI->addIncoming(ST.Get(Map[PV]), B);
         }
       }
-      // Simplify if possible.
       Map[Current] = ST.Simplify(V);
     }
   }
 
-  /// Starting from value recursively iterates over predecessors up to known
-  /// ending values represented in a map. For each traversed block inserts
-  /// a placeholder Phi or Select.
+  /// Starting from original value recursively iterates over def-use chain up to
+  /// known ending values represented in a map. For each traversed phi/select
+  /// inserts a placeholder Phi or Select.
   /// Reports all new created Phi/Select nodes by adding them to set.
-  /// Also reports and order in what basic blocks have been traversed.
+  /// Also reports and order in what values have been traversed.
   void InsertPlaceholders(FoldAddrToValueMapping &Map,
-                          SmallVectorImpl<ValueInBB> &TraverseOrder,
+                          SmallVectorImpl<Value *> &TraverseOrder,
                           SimplificationTracker &ST) {
-    SmallVector<ValueInBB, 32> Worklist;
-    assert((isa<PHINode>(Original.first) || isa<SelectInst>(Original.first)) &&
+    SmallVector<Value *, 32> Worklist;
+    assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&
            "Address must be a Phi or Select node");
     auto *Dummy = UndefValue::get(CommonType);
     Worklist.push_back(Original);
     while (!Worklist.empty()) {
-      auto Current = Worklist.pop_back_val();
-      // If value is not an instruction it is something global, constant,
-      // parameter and we can say that this value is observable in any block.
-      // Set block to null to denote it.
-      // Also please take into account that it is how we build anchors.
-      if (!isa<Instruction>(Current.first))
-        Current.second = nullptr;
+      Value *Current = Worklist.pop_back_val();
       // if it is already visited or it is an ending value then skip it.
       if (Map.find(Current) != Map.end())
         continue;
       TraverseOrder.push_back(Current);
 
-      Value *CurrentValue = Current.first;
-      BasicBlock *CurrentBlock = Current.second;
       // CurrentValue must be a Phi node or select. All others must be covered
       // by anchors.
-      Instruction *CurrentI = cast<Instruction>(CurrentValue);
-      bool IsDefinedInThisBB = CurrentI->getParent() == CurrentBlock;
-
-      unsigned PredCount = pred_size(CurrentBlock);
-      // if Current Value is not defined in this basic block we are interested
-      // in values in predecessors.
-      if (!IsDefinedInThisBB) {
-        assert(PredCount && "Unreachable block?!");
-        PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
-                                       &CurrentBlock->front());
-        Map[Current] = PHI;
-        ST.insertNewPhi(PHI);
-        // Add all predecessors in work list.
-        for (auto B : predecessors(CurrentBlock))
-          Worklist.push_back({ CurrentValue, B });
-        continue;
-      }
-      // Value is defined in this basic block.
-      if (SelectInst *OrigSelect = dyn_cast<SelectInst>(CurrentI)) {
+      if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {
         // Is it OK to get metadata from OrigSelect?!
         // Create a Select placeholder with dummy value.
-        SelectInst *Select =
-            SelectInst::Create(OrigSelect->getCondition(), Dummy, Dummy,
-                               OrigSelect->getName(), OrigSelect, OrigSelect);
+        SelectInst *Select = SelectInst::Create(
+            CurrentSelect->getCondition(), Dummy, Dummy,
+            CurrentSelect->getName(), CurrentSelect, CurrentSelect);
         Map[Current] = Select;
         ST.insertNewSelect(Select);
-        // We are interested in True and False value in this basic block.
-        Worklist.push_back({ OrigSelect->getTrueValue(), CurrentBlock });
-        Worklist.push_back({ OrigSelect->getFalseValue(), CurrentBlock });
+        // We are interested in True and False values.
+        Worklist.push_back(CurrentSelect->getTrueValue());
+        Worklist.push_back(CurrentSelect->getFalseValue());
       } else {
         // It must be a Phi node then.
-        auto *CurrentPhi = cast<PHINode>(CurrentI);
-        // Create new Phi node for merge of bases.
-        assert(PredCount && "Unreachable block?!");
-        PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
-                                       &CurrentBlock->front());
+        PHINode *CurrentPhi = cast<PHINode>(Current);
+        unsigned PredCount = CurrentPhi->getNumIncomingValues();
+        PHINode *PHI =
+            PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi);
         Map[Current] = PHI;
         ST.insertNewPhi(PHI);
-
-        // Add all predecessors in work list.
-        for (auto B : predecessors(CurrentBlock))
-          Worklist.push_back({ CurrentPhi->getIncomingValueForBlock(B), B });
+        for (Value *P : CurrentPhi->incoming_values())
+          Worklist.push_back(P);
       }
     }
   }
@@ -4398,7 +4510,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   bool PhiOrSelectSeen = false;
   SmallVector<Instruction*, 16> AddrModeInsts;
   const SimplifyQuery SQ(*DL, TLInfo);
-  AddressingModeCombiner AddrModes(SQ, { Addr, MemoryInst->getParent() });
+  AddressingModeCombiner AddrModes(SQ, Addr);
   TypePromotionTransaction TPT(RemovedInsts);
   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
       TPT.getRestorationPoint();
diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp
index fe3d29657942612c7b684b2ce37824c1adf0d469..1c80556dfef57943142f6cfe8c59b3af1649a4c8 100644
--- a/lib/CodeGen/GCMetadata.cpp
+++ b/lib/CodeGen/GCMetadata.cpp
@@ -103,16 +103,6 @@ void Printer::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<GCModuleInfo>();
 }
 
-static const char *DescKind(GC::PointKind Kind) {
-  switch (Kind) {
-  case GC::PreCall:
-    return "pre-call";
-  case GC::PostCall:
-    return "post-call";
-  }
-  llvm_unreachable("Invalid point kind");
-}
-
 bool Printer::runOnFunction(Function &F) {
   if (F.hasGC())
     return false;
@@ -129,7 +119,7 @@ bool Printer::runOnFunction(Function &F) {
   for (GCFunctionInfo::iterator PI = FD->begin(), PE = FD->end(); PI != PE;
        ++PI) {
 
-    OS << "\t" << PI->Label->getName() << ": " << DescKind(PI->Kind)
+    OS << "\t" << PI->Label->getName() << ": " << "post-call"
        << ", live = {";
 
     for (GCFunctionInfo::live_iterator RI = FD->live_begin(PI),
diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp
index 31ddeadbd97a13e1576913d346e88f89bbb72ef0..e8ccd84b0b93e4ebe0c8798e49500c78d4916003 100644
--- a/lib/CodeGen/GCRootLowering.cpp
+++ b/lib/CodeGen/GCRootLowering.cpp
@@ -38,7 +38,7 @@ namespace {
 /// directed by the GCStrategy. It also performs automatic root initialization
 /// and custom intrinsic lowering.
 class LowerIntrinsics : public FunctionPass {
-  bool PerformDefaultLowering(Function &F, GCStrategy &S);
+  bool DoLowering(Function &F, GCStrategy &S);
 
 public:
   static char ID;
@@ -102,13 +102,6 @@ void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<DominatorTreeWrapperPass>();
 }
 
-static bool NeedsDefaultLoweringPass(const GCStrategy &C) {
-  // Default lowering is necessary only if read or write barriers have a default
-  // action. The default for roots is no action.
-  return !C.customWriteBarrier() || !C.customReadBarrier() ||
-         C.initializeRoots();
-}
-
 /// doInitialization - If this module uses the GC intrinsics, find them now.
 bool LowerIntrinsics::doInitialization(Module &M) {
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
@@ -148,8 +141,7 @@ static bool CouldBecomeSafePoint(Instruction *I) {
   return true;
 }
 
-static bool InsertRootInitializers(Function &F, AllocaInst **Roots,
-                                   unsigned Count) {
+static bool InsertRootInitializers(Function &F, ArrayRef<AllocaInst *> Roots) {
   // Scroll past alloca instructions.
   BasicBlock::iterator IP = F.getEntryBlock().begin();
   while (isa<AllocaInst>(IP))
@@ -166,12 +158,12 @@ static bool InsertRootInitializers(Function &F, AllocaInst **Roots,
   // Add root initializers.
   bool MadeChange = false;
 
-  for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
-    if (!InitedRoots.count(*I)) {
+  for (AllocaInst *Root : Roots)
+    if (!InitedRoots.count(Root)) {
       StoreInst *SI = new StoreInst(
-          ConstantPointerNull::get(cast<PointerType>((*I)->getAllocatedType())),
-          *I);
-      SI->insertAfter(*I);
+          ConstantPointerNull::get(cast<PointerType>(Root->getAllocatedType())),
+          Root);
+      SI->insertAfter(Root);
       MadeChange = true;
     }
 
@@ -188,64 +180,59 @@ bool LowerIntrinsics::runOnFunction(Function &F) {
   GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
   GCStrategy &S = FI.getStrategy();
 
-  bool MadeChange = false;
-
-  if (NeedsDefaultLoweringPass(S))
-    MadeChange |= PerformDefaultLowering(F, S);
-
-  return MadeChange;
+  return DoLowering(F, S);
 }
 
-bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) {
-  bool LowerWr = !S.customWriteBarrier();
-  bool LowerRd = !S.customReadBarrier();
-  bool InitRoots = S.initializeRoots();
-
+/// Lower barriers out of existance (if the associated GCStrategy hasn't
+/// already done so...), and insert initializing stores to roots as a defensive
+/// measure.  Given we're going to report all roots live at all safepoints, we
+/// need to be able to ensure each root has been initialized by the point the
+/// first safepoint is reached.  This really should have been done by the
+/// frontend, but the old API made this non-obvious, so we do a potentially
+/// redundant store just in case.  
+bool LowerIntrinsics::DoLowering(Function &F, GCStrategy &S) {
   SmallVector<AllocaInst *, 32> Roots;
 
   bool MadeChange = false;
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
-      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) {
-        Function *F = CI->getCalledFunction();
-        switch (F->getIntrinsicID()) {
-        case Intrinsic::gcwrite:
-          if (LowerWr) {
-            // Replace a write barrier with a simple store.
-            Value *St =
-                new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI);
-            CI->replaceAllUsesWith(St);
-            CI->eraseFromParent();
-          }
-          break;
-        case Intrinsic::gcread:
-          if (LowerRd) {
-            // Replace a read barrier with a simple load.
-            Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI);
-            Ld->takeName(CI);
-            CI->replaceAllUsesWith(Ld);
-            CI->eraseFromParent();
-          }
-          break;
-        case Intrinsic::gcroot:
-          if (InitRoots) {
-            // Initialize the GC root, but do not delete the intrinsic. The
-            // backend needs the intrinsic to flag the stack slot.
-            Roots.push_back(
-                cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
-          }
-          break;
-        default:
-          continue;
-        }
-
+  for (BasicBlock &BB : F) 
+    for (BasicBlock::iterator II = BB.begin(), E = BB.end(); II != E;) {
+      IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++);
+      if (!CI)
+        continue;
+
+      Function *F = CI->getCalledFunction();
+      switch (F->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::gcwrite: {
+        // Replace a write barrier with a simple store.
+        Value *St = new StoreInst(CI->getArgOperand(0),
+                                  CI->getArgOperand(2), CI);
+        CI->replaceAllUsesWith(St);
+        CI->eraseFromParent();
         MadeChange = true;
+        break;
+      }
+      case Intrinsic::gcread: {
+        // Replace a read barrier with a simple load.
+        Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI);
+        Ld->takeName(CI);
+        CI->replaceAllUsesWith(Ld);
+        CI->eraseFromParent();
+        MadeChange = true;
+        break;
+      }
+      case Intrinsic::gcroot: {
+        // Initialize the GC root, but do not delete the intrinsic. The
+        // backend needs the intrinsic to flag the stack slot.
+        Roots.push_back(
+            cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+        break;
+      }
       }
     }
-  }
 
   if (Roots.size())
-    MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size());
+    MadeChange |= InsertRootInitializers(F, Roots);
 
   return MadeChange;
 }
@@ -276,26 +263,18 @@ MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
 }
 
 void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
-  // Find the return address (next instruction), too, so as to bracket the call
-  // instruction.
+  // Find the return address (next instruction), since that's what will be on
+  // the stack when the call is suspended and we need to inspect the stack.
   MachineBasicBlock::iterator RAI = CI;
   ++RAI;
 
-  if (FI->getStrategy().needsSafePoint(GC::PreCall)) {
-    MCSymbol *Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc());
-    FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc());
-  }
-
-  if (FI->getStrategy().needsSafePoint(GC::PostCall)) {
-    MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
-    FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc());
-  }
+  MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
+  FI->addSafePoint(Label, CI->getDebugLoc());
 }
 
 void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
-  for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE;
-       ++BBI)
-    for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end();
+  for (MachineBasicBlock &MBB : MF)
+    for (MachineBasicBlock::iterator MI = MBB.begin(), ME = MBB.end();
          MI != ME; ++MI)
       if (MI->isCall()) {
         // Do not treat tail or sibling call sites as safe points.  This is
diff --git a/lib/CodeGen/GlobalISel/CMakeLists.txt b/lib/CodeGen/GlobalISel/CMakeLists.txt
index 4c1da3756b1839ff05821d90fd81c83d4200d627..5f13692bbee1f454a7a442eecf377bb29826191f 100644
--- a/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMGlobalISel
         GlobalISel.cpp
         Combiner.cpp
         CombinerHelper.cpp
+        GISelChangeObserver.cpp
         IRTranslator.cpp
         InstructionSelect.cpp
         InstructionSelector.cpp
diff --git a/lib/CodeGen/GlobalISel/Combiner.cpp b/lib/CodeGen/GlobalISel/Combiner.cpp
index f3f075af4869598c724f9e833d3d4556e9394d58..90fd54ec244ba34b5a45ac892f4a2182dda10e85 100644
--- a/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -1,4 +1,4 @@
-//===-- lib/CodeGen/GlobalISel/GICombiner.cpp -----------------------===//
+//===-- lib/CodeGen/GlobalISel/Combiner.cpp -------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
-#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/GISelWorkList.h"
-#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 
@@ -28,27 +29,62 @@ using namespace llvm;
 namespace {
 /// This class acts as the glue the joins the CombinerHelper to the overall
 /// Combine algorithm. The CombinerHelper is intended to report the
-/// modifications it makes to the MIR to the CombinerChangeObserver and the
+/// modifications it makes to the MIR to the GISelChangeObserver and the
 /// observer subclass will act on these events. In this case, instruction
 /// erasure will cancel any future visits to the erased instruction and
 /// instruction creation will schedule that instruction for a future visit.
 /// Other Combiner implementations may require more complex behaviour from
-/// their CombinerChangeObserver subclass.
-class WorkListMaintainer : public CombinerChangeObserver {
+/// their GISelChangeObserver subclass.
+class WorkListMaintainer : public GISelChangeObserver,
+                           public MachineFunction::Delegate {
   using WorkListTy = GISelWorkList<512>;
+  MachineFunction &MF;
   WorkListTy &WorkList;
+  /// The instructions that have been created but we want to report once they
+  /// have their operands. This is only maintained if debug output is requested.
+  SmallPtrSet<const MachineInstr *, 4> CreatedInstrs;
 
 public:
-  WorkListMaintainer(WorkListTy &WorkList) : WorkList(WorkList) {}
-  virtual ~WorkListMaintainer() {}
+  WorkListMaintainer(MachineFunction &MF, WorkListTy &WorkList)
+      : GISelChangeObserver(), MF(MF), WorkList(WorkList) {
+    MF.setDelegate(this);
+  }
+  virtual ~WorkListMaintainer() {
+    MF.resetDelegate(this);
+  }
 
-  void erasedInstr(MachineInstr &MI) override {
-    LLVM_DEBUG(dbgs() << "Erased: "; MI.print(dbgs()); dbgs() << "\n");
+  void erasingInstr(const MachineInstr &MI) override {
+    LLVM_DEBUG(dbgs() << "Erased: " << MI << "\n");
     WorkList.remove(&MI);
   }
-  void createdInstr(MachineInstr &MI) override {
-    LLVM_DEBUG(dbgs() << "Created: "; MI.print(dbgs()); dbgs() << "\n");
+  void createdInstr(const MachineInstr &MI) override {
+    LLVM_DEBUG(dbgs() << "Creating: " << MI << "\n");
     WorkList.insert(&MI);
+    LLVM_DEBUG(CreatedInstrs.insert(&MI));
+  }
+  void changingInstr(const MachineInstr &MI) override {
+    LLVM_DEBUG(dbgs() << "Changing: " << MI << "\n");
+    WorkList.insert(&MI);
+  }
+  void changedInstr(const MachineInstr &MI) override {
+    LLVM_DEBUG(dbgs() << "Changed: " << MI << "\n");
+    WorkList.insert(&MI);
+  }
+
+  void reportFullyCreatedInstrs() {
+    LLVM_DEBUG(for (const auto *MI
+                    : CreatedInstrs) {
+      dbgs() << "Created: ";
+      MI->print(dbgs());
+    });
+    LLVM_DEBUG(CreatedInstrs.clear());
+  }
+
+  void MF_HandleInsertion(const MachineInstr &MI) override {
+    createdInstr(MI);
+  }
+  void MF_HandleRemoval(const MachineInstr &MI) override {
+    erasingInstr(MI);
   }
 };
 }
@@ -80,8 +116,8 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF) {
     // insert with list bottom up, so while we pop_back_val, we'll traverse top
     // down RPOT.
     Changed = false;
-    GISelWorkList<512> WorkList;
-    WorkListMaintainer Observer(WorkList);
+    GISelWorkList<512> WorkList(&MF);
+    WorkListMaintainer Observer(MF, WorkList);
     for (MachineBasicBlock *MBB : post_order(&MF)) {
       if (MBB->empty())
         continue;
@@ -100,8 +136,9 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF) {
     // Main Loop. Process the instructions here.
     while (!WorkList.empty()) {
       MachineInstr *CurrInst = WorkList.pop_back_val();
-      LLVM_DEBUG(dbgs() << "Try combining " << *CurrInst << "\n";);
+      LLVM_DEBUG(dbgs() << "\nTry combining " << *CurrInst;);
       Changed |= CInfo.combine(Observer, *CurrInst, Builder);
+      Observer.reportFullyCreatedInstrs();
     }
     MFChanged |= Changed;
   } while (Changed);
diff --git a/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 557bc880fd6ce0421dfef8db76b601e16db334c7..b1c5670a6dec3f76398394b1785b180ec4fa2449 100644
--- a/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1,4 +1,4 @@
-//== ---lib/CodeGen/GlobalISel/GICombinerHelper.cpp --------------------- == //
+//===-- lib/CodeGen/GlobalISel/GICombinerHelper.cpp -----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,27 +6,44 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 
-#define DEBUG_TYPE "gi-combine"
+#define DEBUG_TYPE "gi-combiner"
 
 using namespace llvm;
 
-CombinerHelper::CombinerHelper(CombinerChangeObserver &Observer,
+CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
                                MachineIRBuilder &B)
     : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer) {}
 
-void CombinerHelper::eraseInstr(MachineInstr &MI) {
-  Observer.erasedInstr(MI);
+void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, unsigned FromReg,
+                                    unsigned ToReg) const {
+  Observer.changingAllUsesOfReg(MRI, FromReg);
+
+  if (MRI.constrainRegAttrs(ToReg, FromReg))
+    MRI.replaceRegWith(FromReg, ToReg);
+  else
+    Builder.buildCopy(ToReg, FromReg);
+
+  Observer.finishedChangingAllUsesOfReg();
 }
-void CombinerHelper::scheduleForVisit(MachineInstr &MI) {
-  Observer.createdInstr(MI);
+
+void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
+                                      MachineOperand &FromRegOp,
+                                      unsigned ToReg) const {
+  assert(FromRegOp.getParent() && "Expected an operand in an MI");
+  Observer.changingInstr(*FromRegOp.getParent());
+
+  FromRegOp.setReg(ToReg);
+
+  Observer.changedInstr(*FromRegOp.getParent());
 }
 
 bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
@@ -40,7 +57,7 @@ bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
   // a(sx) = COPY b(sx) -> Replace all uses of a with b.
   if (DstTy.isValid() && SrcTy.isValid() && DstTy == SrcTy) {
     MI.eraseFromParent();
-    MRI.replaceRegWith(DstReg, SrcReg);
+    replaceRegWith(MRI, DstReg, SrcReg);
     return true;
   }
   return false;
@@ -193,8 +210,11 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
   // type since by definition the result of an extend is larger.
   assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
 
+  LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
+
   // Rewrite the load to the chosen extending load.
   unsigned ChosenDstReg = Preferred.MI->getOperand(0).getReg();
+  Observer.changingInstr(MI);
   MI.setDesc(
       Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
                                ? TargetOpcode::G_SEXTLOAD
@@ -213,7 +233,7 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
     if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
         UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
       unsigned UseDstReg = UseMI->getOperand(0).getReg();
-      unsigned UseSrcReg = UseMI->getOperand(1).getReg();
+      MachineOperand &UseSrcMO = UseMI->getOperand(1);
       const LLT &UseDstTy = MRI.getType(UseDstReg);
       if (UseDstReg != ChosenDstReg) {
         if (Preferred.Ty == UseDstTy) {
@@ -226,7 +246,7 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
           // rewrites to:
           //    %2:_(s32) = G_SEXTLOAD ...
           //    ... = ... %2(s32)
-          MRI.replaceRegWith(UseDstReg, ChosenDstReg);
+          replaceRegWith(MRI, UseDstReg, ChosenDstReg);
           ScheduleForErase.push_back(UseMO.getParent());
         } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
           // If the preferred size is smaller, then keep the extend but extend
@@ -239,7 +259,7 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
           //    %2:_(s32) = G_SEXTLOAD ...
           //    %3:_(s64) = G_ANYEXT %2:_(s32)
           //    ... = ... %3(s64)
-          MRI.replaceRegWith(UseSrcReg, ChosenDstReg);
+          replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
         } else {
           // If the preferred size is large, then insert a truncate. For
           // example:
@@ -286,7 +306,9 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
 
     MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
     if (PreviouslyEmitted) {
+      Observer.changingInstr(*UseMO->getParent());
       UseMO->setReg(PreviouslyEmitted->getOperand(0).getReg());
+      Observer.changedInstr(*UseMO->getParent());
       continue;
     }
 
@@ -294,14 +316,14 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
     unsigned NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
     MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
     EmittedInsns[InsertIntoBB] = NewMI;
-    UseMO->setReg(NewDstReg);
-    Observer.createdInstr(*NewMI);
+    replaceRegOpWith(MRI, *UseMO, NewDstReg);
   }
   for (auto &EraseMI : ScheduleForErase) {
+    Observer.erasingInstr(*EraseMI);
     EraseMI->eraseFromParent();
-    Observer.erasedInstr(*EraseMI);
   }
   MI.getOperand(0).setReg(ChosenDstReg);
+  Observer.changedInstr(MI);
 
   return true;
 }
diff --git a/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp b/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..993a919826fa6a3eeb54c34ff515d874f56837ef
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp
@@ -0,0 +1,31 @@
+//===-- lib/CodeGen/GlobalISel/GISelChangeObserver.cpp --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file constains common code to combine machine functions at generic
+// level.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+void GISelChangeObserver::changingAllUsesOfReg(
+    const MachineRegisterInfo &MRI, unsigned Reg) {
+  for (auto &ChangingMI : MRI.use_instructions(Reg)) {
+    changingInstr(ChangingMI);
+    ChangingAllUsesOfReg.insert(&ChangingMI);
+  }
+}
+
+void GISelChangeObserver::finishedChangingAllUsesOfReg() {
+  for (auto *ChangedMI : ChangingAllUsesOfReg)
+    changedInstr(*ChangedMI);
+}
+
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index ef090777726c52908e1047cdb9e31d307a18b508..33313756a3899d6f3fbcef46a57b9b325a484437 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -215,7 +215,7 @@ ArrayRef<unsigned> IRTranslator::getOrCreateVRegs(const Value &Val) {
     unsigned Idx = 0;
     while (auto Elt = C.getAggregateElement(Idx++)) {
       auto EltRegs = getOrCreateVRegs(*Elt);
-      std::copy(EltRegs.begin(), EltRegs.end(), std::back_inserter(*VRegs));
+      llvm::copy(EltRegs, std::back_inserter(*VRegs));
     }
   } else {
     assert(SplitTys.size() == 1 && "unexpectedly split LLT");
@@ -330,6 +330,13 @@ bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) {
   return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder);
 }
 
+bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) {
+  MIRBuilder.buildInstr(TargetOpcode::G_FNEG)
+      .addDef(getOrCreateVReg(U))
+      .addUse(getOrCreateVReg(*U.getOperand(1)));
+  return true;
+}
+
 bool IRTranslator::translateCompare(const User &U,
                                     MachineIRBuilder &MIRBuilder) {
   const CmpInst *CI = dyn_cast<CmpInst>(&U);
@@ -347,8 +354,10 @@ bool IRTranslator::translateCompare(const User &U,
   else if (Pred == CmpInst::FCMP_TRUE)
     MIRBuilder.buildCopy(
         Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType())));
-  else
-    MIRBuilder.buildFCmp(Pred, Res, Op0, Op1);
+  else {
+    auto FCmp = MIRBuilder.buildFCmp(Pred, Res, Op0, Op1);
+    FCmp->copyIRFlags(*CI);
+  }
 
   return true;
 }
@@ -581,8 +590,15 @@ bool IRTranslator::translateSelect(const User &U,
   ArrayRef<unsigned> Op0Regs = getOrCreateVRegs(*U.getOperand(1));
   ArrayRef<unsigned> Op1Regs = getOrCreateVRegs(*U.getOperand(2));
 
-  for (unsigned i = 0; i < ResRegs.size(); ++i)
-    MIRBuilder.buildSelect(ResRegs[i], Tst, Op0Regs[i], Op1Regs[i]);
+  const SelectInst &SI = cast<SelectInst>(U);
+  const CmpInst *Cmp = dyn_cast<CmpInst>(SI.getCondition());
+  for (unsigned i = 0; i < ResRegs.size(); ++i) {
+    auto Select =
+        MIRBuilder.buildSelect(ResRegs[i], Tst, Op0Regs[i], Op1Regs[i]);
+    if (Cmp && isa<FPMathOperator>(Cmp)) {
+      Select->copyIRFlags(*Cmp);
+    }
+  }
 
   return true;
 }
@@ -862,37 +878,56 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);
   case Intrinsic::smul_with_overflow:
     return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder);
-  case Intrinsic::pow:
-    MIRBuilder.buildInstr(TargetOpcode::G_FPOW)
+  case Intrinsic::pow: {
+    auto Pow = MIRBuilder.buildInstr(TargetOpcode::G_FPOW)
         .addDef(getOrCreateVReg(CI))
         .addUse(getOrCreateVReg(*CI.getArgOperand(0)))
         .addUse(getOrCreateVReg(*CI.getArgOperand(1)));
+    Pow->copyIRFlags(CI);
+    return true;
+  }
+  case Intrinsic::exp: {
+    auto Exp = MIRBuilder.buildInstr(TargetOpcode::G_FEXP)
+        .addDef(getOrCreateVReg(CI))
+        .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+    Exp->copyIRFlags(CI);
     return true;
-  case Intrinsic::exp:
-    MIRBuilder.buildInstr(TargetOpcode::G_FEXP)
+  }
+  case Intrinsic::exp2: {
+    auto Exp2 = MIRBuilder.buildInstr(TargetOpcode::G_FEXP2)
         .addDef(getOrCreateVReg(CI))
         .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+    Exp2->copyIRFlags(CI);
     return true;
-  case Intrinsic::exp2:
-    MIRBuilder.buildInstr(TargetOpcode::G_FEXP2)
+  }
+  case Intrinsic::log: {
+    auto Log = MIRBuilder.buildInstr(TargetOpcode::G_FLOG)
         .addDef(getOrCreateVReg(CI))
         .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+    Log->copyIRFlags(CI);
     return true;
-  case Intrinsic::log:
-    MIRBuilder.buildInstr(TargetOpcode::G_FLOG)
+  }
+  case Intrinsic::log2: {
+    auto Log2 = MIRBuilder.buildInstr(TargetOpcode::G_FLOG2)
         .addDef(getOrCreateVReg(CI))
         .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+    Log2->copyIRFlags(CI);
     return true;
-  case Intrinsic::log2:
-    MIRBuilder.buildInstr(TargetOpcode::G_FLOG2)
+  }
+  case Intrinsic::log10: {
+    auto Log10 = MIRBuilder.buildInstr(TargetOpcode::G_FLOG10)
         .addDef(getOrCreateVReg(CI))
         .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+    Log10->copyIRFlags(CI);
     return true;
-  case Intrinsic::fabs:
-    MIRBuilder.buildInstr(TargetOpcode::G_FABS)
+  }
+  case Intrinsic::fabs: {
+    auto Fabs = MIRBuilder.buildInstr(TargetOpcode::G_FABS)
         .addDef(getOrCreateVReg(CI))
         .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+    Fabs->copyIRFlags(CI);
     return true;
+  }
   case Intrinsic::trunc:
     MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC)
         .addDef(getOrCreateVReg(CI))
@@ -903,13 +938,15 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
         .addDef(getOrCreateVReg(CI))
         .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
     return true;
-  case Intrinsic::fma:
-    MIRBuilder.buildInstr(TargetOpcode::G_FMA)
+  case Intrinsic::fma: {
+    auto FMA = MIRBuilder.buildInstr(TargetOpcode::G_FMA)
         .addDef(getOrCreateVReg(CI))
         .addUse(getOrCreateVReg(*CI.getArgOperand(0)))
         .addUse(getOrCreateVReg(*CI.getArgOperand(1)))
         .addUse(getOrCreateVReg(*CI.getArgOperand(2)));
+    FMA->copyIRFlags(CI);
     return true;
+  }
   case Intrinsic::fmuladd: {
     const TargetMachine &TM = MF->getTarget();
     const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
@@ -921,11 +958,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
         TLI.isFMAFasterThanFMulAndFAdd(TLI.getValueType(*DL, CI.getType()))) {
       // TODO: Revisit this to see if we should move this part of the
       // lowering to the combiner.
-      MIRBuilder.buildInstr(TargetOpcode::G_FMA, Dst, Op0, Op1, Op2);
+      auto FMA =  MIRBuilder.buildInstr(TargetOpcode::G_FMA, {Dst}, {Op0, Op1, Op2});
+      FMA->copyIRFlags(CI);
     } else {
       LLT Ty = getLLTForType(*CI.getType(), *DL);
-      auto FMul = MIRBuilder.buildInstr(TargetOpcode::G_FMUL, Ty, Op0, Op1);
-      MIRBuilder.buildInstr(TargetOpcode::G_FADD, Dst, FMul, Op2);
+      auto FMul = MIRBuilder.buildInstr(TargetOpcode::G_FMUL, {Ty}, {Op0, Op1});
+      FMul->copyIRFlags(CI);
+      auto FAdd =  MIRBuilder.buildInstr(TargetOpcode::G_FADD, {Dst}, {FMul, Op2});
+      FAdd->copyIRFlags(CI);
     }
     return true;
   }
@@ -961,13 +1001,15 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     getStackGuard(GuardVal, MIRBuilder);
 
     AllocaInst *Slot = cast<AllocaInst>(CI.getArgOperand(1));
+    int FI = getOrCreateFrameIndex(*Slot);
+    MF->getFrameInfo().setStackProtectorIndex(FI);
+
     MIRBuilder.buildStore(
         GuardVal, getOrCreateVReg(*Slot),
-        *MF->getMachineMemOperand(
-            MachinePointerInfo::getFixedStack(*MF,
-                                              getOrCreateFrameIndex(*Slot)),
-            MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
-            PtrTy.getSizeInBits() / 8, 8));
+        *MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
+                                  MachineMemOperand::MOStore |
+                                      MachineMemOperand::MOVolatile,
+                                  PtrTy.getSizeInBits() / 8, 8));
     return true;
   }
   case Intrinsic::cttz:
@@ -1400,7 +1442,7 @@ bool IRTranslator::translatePHI(const User &U, MachineIRBuilder &MIRBuilder) {
 
   SmallVector<MachineInstr *, 4> Insts;
   for (auto Reg : getOrCreateVRegs(PI)) {
-    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI, Reg);
+    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI, {Reg}, {});
     Insts.push_back(MIB.getInstr());
   }
 
@@ -1584,22 +1626,22 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {
     // Return the scalar if it is a <1 x Ty> vector.
     if (CAZ->getNumElements() == 1)
       return translate(*CAZ->getElementValue(0u), Reg);
-    std::vector<unsigned> Ops;
+    SmallVector<unsigned, 4> Ops;
     for (unsigned i = 0; i < CAZ->getNumElements(); ++i) {
       Constant &Elt = *CAZ->getElementValue(i);
       Ops.push_back(getOrCreateVReg(Elt));
     }
-    EntryBuilder.buildMerge(Reg, Ops);
+    EntryBuilder.buildBuildVector(Reg, Ops);
   } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) {
     // Return the scalar if it is a <1 x Ty> vector.
     if (CV->getNumElements() == 1)
       return translate(*CV->getElementAsConstant(0), Reg);
-    std::vector<unsigned> Ops;
+    SmallVector<unsigned, 4> Ops;
     for (unsigned i = 0; i < CV->getNumElements(); ++i) {
       Constant &Elt = *CV->getElementAsConstant(i);
       Ops.push_back(getOrCreateVReg(Elt));
     }
-    EntryBuilder.buildMerge(Reg, Ops);
+    EntryBuilder.buildBuildVector(Reg, Ops);
   } else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
     switch(CE->getOpcode()) {
 #define HANDLE_INST(NUM, OPCODE, CLASS)                         \
@@ -1615,7 +1657,7 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {
     for (unsigned i = 0; i < CV->getNumOperands(); ++i) {
       Ops.push_back(getOrCreateVReg(*CV->getOperand(i)));
     }
-    EntryBuilder.buildMerge(Reg, Ops);
+    EntryBuilder.buildBuildVector(Reg, Ops);
   } else if (auto *BA = dyn_cast<BlockAddress>(&C)) {
     EntryBuilder.buildBlockAddress(Reg, BA);
   } else
diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp
index 9a2aac998a84b42cde5e75df115d25241b5b94dd..df2dcace4212c9033519f385420085780f49f3a0 100644
--- a/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -16,6 +16,7 @@
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/GISelWorkList.h"
 #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
@@ -64,9 +65,52 @@ static bool isArtifact(const MachineInstr &MI) {
   case TargetOpcode::G_SEXT:
   case TargetOpcode::G_MERGE_VALUES:
   case TargetOpcode::G_UNMERGE_VALUES:
+  case TargetOpcode::G_CONCAT_VECTORS:
+  case TargetOpcode::G_BUILD_VECTOR:
     return true;
   }
 }
+using InstListTy = GISelWorkList<256>;
+using ArtifactListTy = GISelWorkList<128>;
+
+class LegalizerWorkListManager : public GISelChangeObserver {
+  InstListTy &InstList;
+  ArtifactListTy &ArtifactList;
+
+public:
+  LegalizerWorkListManager(InstListTy &Insts, ArtifactListTy &Arts)
+      : InstList(Insts), ArtifactList(Arts) {}
+
+  void createdInstr(const MachineInstr &MI) override {
+    // Only legalize pre-isel generic instructions.
+    // Legalization process could generate Target specific pseudo
+    // instructions with generic types. Don't record them
+    if (isPreISelGenericOpcode(MI.getOpcode())) {
+      if (isArtifact(MI))
+        ArtifactList.insert(&MI);
+      else
+        InstList.insert(&MI);
+    }
+    LLVM_DEBUG(dbgs() << ".. .. New MI: " << MI);
+  }
+
+  void erasingInstr(const MachineInstr &MI) override {
+    LLVM_DEBUG(dbgs() << ".. .. Erasing: " << MI);
+    InstList.remove(&MI);
+    ArtifactList.remove(&MI);
+  }
+
+  void changingInstr(const MachineInstr &MI) override {
+    LLVM_DEBUG(dbgs() << ".. .. Changing MI: " << MI);
+  }
+
+  void changedInstr(const MachineInstr &MI) override {
+    // When insts change, we want to revisit them to legalize them again.
+    // We'll consider them the same as created.
+    LLVM_DEBUG(dbgs() << ".. .. Changed MI: " << MI);
+    createdInstr(MI);
+  }
+};
 
 bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
   // If the ISel pipeline failed, do not bother running that pass.
@@ -77,14 +121,13 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
   init(MF);
   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
   MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
-  LegalizerHelper Helper(MF);
 
   const size_t NumBlocks = MF.size();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
   // Populate Insts
-  GISelWorkList<256> InstList;
-  GISelWorkList<128> ArtifactList;
+  InstListTy InstList(&MF);
+  ArtifactListTy ArtifactList(&MF);
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
   // Perform legalization bottom up so we can DCE as we legalize.
   // Traverse BB in RPOT and within each basic block, add insts top down,
@@ -103,24 +146,12 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
         InstList.insert(&MI);
     }
   }
-  Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) {
-    // Only legalize pre-isel generic instructions.
-    // Legalization process could generate Target specific pseudo
-    // instructions with generic types. Don't record them
-    if (isPreISelGenericOpcode(MI->getOpcode())) {
-      if (isArtifact(*MI))
-        ArtifactList.insert(MI);
-      else
-        InstList.insert(MI);
-    }
-    LLVM_DEBUG(dbgs() << ".. .. New MI: " << *MI;);
-  });
+  LegalizerWorkListManager WorkListObserver(InstList, ArtifactList);
+  LegalizerHelper Helper(MF, WorkListObserver);
   const LegalizerInfo &LInfo(Helper.getLegalizerInfo());
   LegalizationArtifactCombiner ArtCombiner(Helper.MIRBuilder, MF.getRegInfo(), LInfo);
-  auto RemoveDeadInstFromLists = [&InstList,
-                                  &ArtifactList](MachineInstr *DeadMI) {
-    InstList.remove(DeadMI);
-    ArtifactList.remove(DeadMI);
+  auto RemoveDeadInstFromLists = [&WorkListObserver](MachineInstr *DeadMI) {
+    WorkListObserver.erasingInstr(*DeadMI);
   };
   bool Changed = false;
   do {
@@ -138,7 +169,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
       // Error out if we couldn't legalize this instruction. We may want to
       // fall back to DAG ISel instead in the future.
       if (Res == LegalizerHelper::UnableToLegalize) {
-        Helper.MIRBuilder.stopRecordingInsertions();
+        Helper.MIRBuilder.stopObservingChanges();
         reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
                            "unable to legalize instruction", MI);
         return false;
@@ -149,7 +180,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
       MachineInstr &MI = *ArtifactList.pop_back_val();
       assert(isPreISelGenericOpcode(MI.getOpcode()) && "Expecting generic opcode");
       if (isTriviallyDead(MI, MRI)) {
-        LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n");
+        LLVM_DEBUG(dbgs() << MI << "Is dead\n");
         RemoveDeadInstFromLists(&MI);
         MI.eraseFromParentAndMarkDBGValuesForRemoval();
         continue;
@@ -157,7 +188,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
       SmallVector<MachineInstr *, 4> DeadInstructions;
       if (ArtCombiner.tryCombineInstruction(MI, DeadInstructions)) {
         for (auto *DeadMI : DeadInstructions) {
-          LLVM_DEBUG(dbgs() << ".. Erasing Dead Instruction " << *DeadMI);
+          LLVM_DEBUG(dbgs() << *DeadMI << "Is dead\n");
           RemoveDeadInstFromLists(DeadMI);
           DeadMI->eraseFromParentAndMarkDBGValuesForRemoval();
         }
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 516f5ce4343e7c34efb2279e7fdef00491cd0417..266409dd1fabceb3c2d329bc1337e1a4619af098 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -15,6 +15,7 @@
 
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -29,14 +30,19 @@
 using namespace llvm;
 using namespace LegalizeActions;
 
-LegalizerHelper::LegalizerHelper(MachineFunction &MF)
-    : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) {
+LegalizerHelper::LegalizerHelper(MachineFunction &MF,
+                                 GISelChangeObserver &Observer)
+    : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()),
+      Observer(Observer) {
   MIRBuilder.setMF(MF);
+  MIRBuilder.setChangeObserver(Observer);
 }
 
-LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI)
-    : MRI(MF.getRegInfo()), LI(LI) {
+LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
+                                 GISelChangeObserver &Observer)
+    : MRI(MF.getRegInfo()), LI(LI), Observer(Observer) {
   MIRBuilder.setMF(MF);
+  MIRBuilder.setChangeObserver(Observer);
 }
 LegalizerHelper::LegalizeResult
 LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
@@ -64,8 +70,8 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
     return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
   case Custom:
     LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
-    return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized
-                                                  : UnableToLegalize;
+    return LI.legalizeCustom(MI, MRI, MIRBuilder, Observer) ? Legalized
+                                                            : UnableToLegalize;
   default:
     LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
     return UnableToLegalize;
@@ -82,17 +88,20 @@ void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts,
 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
   switch (Opcode) {
   case TargetOpcode::G_SDIV:
-    assert(Size == 32 && "Unsupported size");
-    return RTLIB::SDIV_I32;
+    assert((Size == 32 || Size == 64) && "Unsupported size");
+    return Size == 64 ? RTLIB::SDIV_I64 : RTLIB::SDIV_I32;
   case TargetOpcode::G_UDIV:
-    assert(Size == 32 && "Unsupported size");
-    return RTLIB::UDIV_I32;
+    assert((Size == 32 || Size == 64) && "Unsupported size");
+    return Size == 64 ? RTLIB::UDIV_I64 : RTLIB::UDIV_I32;
   case TargetOpcode::G_SREM:
-    assert(Size == 32 && "Unsupported size");
-    return RTLIB::SREM_I32;
+    assert((Size == 32 || Size == 64) && "Unsupported size");
+    return Size == 64 ? RTLIB::SREM_I64 : RTLIB::SREM_I32;
   case TargetOpcode::G_UREM:
+    assert((Size == 32 || Size == 64) && "Unsupported size");
+    return Size == 64 ? RTLIB::UREM_I64 : RTLIB::UREM_I32;
+  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
     assert(Size == 32 && "Unsupported size");
-    return RTLIB::UREM_I32;
+    return RTLIB::CTLZ_I32;
   case TargetOpcode::G_FADD:
     assert((Size == 32 || Size == 64) && "Unsupported size");
     return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32;
@@ -189,8 +198,9 @@ LegalizerHelper::libcall(MachineInstr &MI) {
   case TargetOpcode::G_SDIV:
   case TargetOpcode::G_UDIV:
   case TargetOpcode::G_SREM:
-  case TargetOpcode::G_UREM: {
-    Type *HLTy = Type::getInt32Ty(Ctx);
+  case TargetOpcode::G_UREM:
+  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
+    Type *HLTy = IntegerType::get(Ctx, Size);
     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
     if (Status != Legalized)
       return Status;
@@ -294,7 +304,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     for (int i = 0; i < NumParts; ++i)
       DstRegs.push_back(
           MIRBuilder.buildUndef(NarrowTy)->getOperand(0).getReg());
-    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+
+    unsigned DstReg = MI.getOperand(0).getReg();
+    if(MRI.getType(DstReg).isVector())
+      MIRBuilder.buildBuildVector(DstReg, DstRegs);
+    else
+      MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
@@ -324,7 +339,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
       CarryIn = CarryOut;
     }
     unsigned DstReg = MI.getOperand(0).getReg();
-    MIRBuilder.buildMerge(DstReg, DstRegs);
+    if(MRI.getType(DstReg).isVector())
+      MIRBuilder.buildBuildVector(DstReg, DstRegs);
+    else
+      MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
@@ -380,7 +398,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
       DstRegs.push_back(SegReg);
     }
 
-    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    if(MRI.getType(DstReg).isVector())
+      MIRBuilder.buildBuildVector(DstReg, DstRegs);
+    else
+      MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
@@ -441,7 +463,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     }
 
     assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered");
-    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    if(MRI.getType(DstReg).isVector())
+      MIRBuilder.buildBuildVector(DstReg, DstRegs);
+    else
+      MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
@@ -482,7 +508,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
       DstRegs.push_back(DstReg);
     }
     unsigned DstReg = MI.getOperand(0).getReg();
-    MIRBuilder.buildMerge(DstReg, DstRegs);
+    if(MRI.getType(DstReg).isVector())
+      MIRBuilder.buildBuildVector(DstReg, DstRegs);
+    else
+      MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
@@ -542,11 +571,16 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
       DstRegs.push_back(DstReg);
     }
     unsigned DstReg = MI.getOperand(0).getReg();
-    MIRBuilder.buildMerge(DstReg, DstRegs);
+    if(MRI.getType(DstReg).isVector())
+      MIRBuilder.buildBuildVector(DstReg, DstRegs);
+    else
+      MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
-  case TargetOpcode::G_OR: {
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_XOR: {
     // Legalize bitwise operation:
     // A = BinOp<Ty> B, C
     // into:
@@ -585,11 +619,15 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
 
     // Do the operation on each small part.
     for (int i = 0; i < NumParts; ++i)
-      MIRBuilder.buildOr(DstRegs[i], SrcsReg1[i], SrcsReg2[i]);
+      MIRBuilder.buildInstr(MI.getOpcode(), {DstRegs[i]},
+                            {SrcsReg1[i], SrcsReg2[i]});
 
     // Gather the destination registers into the final destination.
     unsigned DstReg = MI.getOperand(0).getReg();
-    MIRBuilder.buildMerge(DstReg, DstRegs);
+    if(MRI.getType(DstReg).isVector())
+      MIRBuilder.buildBuildVector(DstReg, DstRegs);
+    else
+      MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
@@ -599,7 +637,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
                                      unsigned OpIdx, unsigned ExtOpcode) {
   MachineOperand &MO = MI.getOperand(OpIdx);
-  auto ExtB = MIRBuilder.buildInstr(ExtOpcode, WideTy, MO.getReg());
+  auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO.getReg()});
   MO.setReg(ExtB->getOperand(0).getReg());
 }
 
@@ -608,7 +646,7 @@ void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
   MachineOperand &MO = MI.getOperand(OpIdx);
   unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
-  MIRBuilder.buildInstr(TruncOpcode, MO.getReg(), DstExt);
+  MIRBuilder.buildInstr(TruncOpcode, {MO.getReg()}, {DstExt});
   MO.setReg(DstExt);
 }
 
@@ -623,20 +661,20 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
   case TargetOpcode::G_USUBO: {
     if (TypeIdx == 1)
       return UnableToLegalize; // TODO
-    auto LHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, WideTy,
-                                         MI.getOperand(2).getReg());
-    auto RHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, WideTy,
-                                         MI.getOperand(3).getReg());
+    auto LHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, {WideTy},
+                                         {MI.getOperand(2).getReg()});
+    auto RHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, {WideTy},
+                                         {MI.getOperand(3).getReg()});
     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_UADDO
                           ? TargetOpcode::G_ADD
                           : TargetOpcode::G_SUB;
     // Do the arithmetic in the larger type.
-    auto NewOp = MIRBuilder.buildInstr(Opcode, WideTy, LHSZext, RHSZext);
+    auto NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSZext, RHSZext});
     LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
     APInt Mask = APInt::getAllOnesValue(OrigTy.getSizeInBits());
     auto AndOp = MIRBuilder.buildInstr(
-        TargetOpcode::G_AND, WideTy, NewOp,
-        MIRBuilder.buildConstant(WideTy, Mask.getZExtValue()));
+        TargetOpcode::G_AND, {WideTy},
+        {NewOp, MIRBuilder.buildConstant(WideTy, Mask.getZExtValue())});
     // There is no overflow if the AndOp is the same as NewOp.
     MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1).getReg(), NewOp,
                          AndOp);
@@ -660,25 +698,26 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
       auto TopBit =
           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
       MIBSrc = MIRBuilder.buildInstr(
-          TargetOpcode::G_OR, WideTy, MIBSrc,
-          MIRBuilder.buildConstant(WideTy, TopBit.getSExtValue()));
+          TargetOpcode::G_OR, {WideTy},
+          {MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit.getSExtValue())});
     }
     // Perform the operation at the larger size.
-    auto MIBNewOp = MIRBuilder.buildInstr(MI.getOpcode(), WideTy, MIBSrc);
+    auto MIBNewOp = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy}, {MIBSrc});
     // This is already the correct result for CTPOP and CTTZs
     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
       // The correct result is NewOp - (Difference in widety and current ty).
       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
-      MIBNewOp =
-          MIRBuilder.buildInstr(TargetOpcode::G_SUB, WideTy, MIBNewOp,
-                                MIRBuilder.buildConstant(WideTy, SizeDiff));
+      MIBNewOp = MIRBuilder.buildInstr(
+          TargetOpcode::G_SUB, {WideTy},
+          {MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff)});
     }
     auto &TII = *MI.getMF()->getSubtarget().getInstrInfo();
-    // Make the original instruction a trunc now, and update it's source.
+    // Make the original instruction a trunc now, and update its source.
+    Observer.changingInstr(MI);
     MI.setDesc(TII.get(TargetOpcode::G_TRUNC));
     MI.getOperand(1).setReg(MIBNewOp->getOperand(0).getReg());
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
   }
 
@@ -691,45 +730,50 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     // Perform operation at larger width (any extension is fine here, high bits
     // don't affect the result) and then truncate the result back to the
     // original type.
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_SHL:
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
     // The "number of bits to shift" operand must preserve its value as an
     // unsigned integer:
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_SDIV:
   case TargetOpcode::G_SREM:
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_ASHR:
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
     // The "number of bits to shift" operand must preserve its value as an
     // unsigned integer:
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_UDIV:
   case TargetOpcode::G_UREM:
   case TargetOpcode::G_LSHR:
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_SELECT:
@@ -738,40 +782,45 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     // Perform operation at larger width (any extension is fine here, high bits
     // don't affect the result) and then truncate the result back to the
     // original type.
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_FPTOSI:
   case TargetOpcode::G_FPTOUI:
     if (TypeIdx != 0)
       return UnableToLegalize;
+    Observer.changingInstr(MI);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_SITOFP:
     if (TypeIdx != 1)
       return UnableToLegalize;
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_UITOFP:
     if (TypeIdx != 1)
       return UnableToLegalize;
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_INSERT:
     if (TypeIdx != 0)
       return UnableToLegalize;
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_LOAD:
@@ -784,8 +833,9 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     LLVM_FALLTHROUGH;
   case TargetOpcode::G_SEXTLOAD:
   case TargetOpcode::G_ZEXTLOAD:
+    Observer.changingInstr(MI);
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_STORE: {
@@ -793,18 +843,20 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
         WideTy != LLT::scalar(8))
       return UnableToLegalize;
 
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ZEXT);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
   }
   case TargetOpcode::G_CONSTANT: {
     MachineOperand &SrcMO = MI.getOperand(1);
     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
     const APInt &Val = SrcMO.getCImm()->getValue().sext(WideTy.getSizeInBits());
+    Observer.changingInstr(MI);
     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
 
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
   }
   case TargetOpcode::G_FCONSTANT: {
@@ -822,28 +874,32 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     default:
       llvm_unreachable("Unhandled fp widen type");
     }
+    Observer.changingInstr(MI);
     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
 
     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
   }
   case TargetOpcode::G_BRCOND:
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ANYEXT);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_FCMP:
+    Observer.changingInstr(MI);
     if (TypeIdx == 0)
       widenScalarDst(MI, WideTy);
     else {
       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
     }
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_ICMP:
+    Observer.changingInstr(MI);
     if (TypeIdx == 0)
       widenScalarDst(MI, WideTy);
     else {
@@ -854,18 +910,20 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
     }
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_GEP:
     assert(TypeIdx == 1 && "unable to legalize pointer of GEP");
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
 
   case TargetOpcode::G_PHI: {
     assert(TypeIdx == 0 && "Expecting only Idx 0");
 
+    Observer.changingInstr(MI);
     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
@@ -875,14 +933,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     MachineBasicBlock &MBB = *MI.getParent();
     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
     widenScalarDst(MI, WideTy);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
   }
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     if (TypeIdx != 2)
       return UnableToLegalize;
+    Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
   }
 }
@@ -1063,6 +1122,24 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
   case TargetOpcode::G_CTTZ:
   case TargetOpcode::G_CTPOP:
     return lowerBitCount(MI, TypeIdx, Ty);
+  case G_UADDE: {
+    unsigned Res = MI.getOperand(0).getReg();
+    unsigned CarryOut = MI.getOperand(1).getReg();
+    unsigned LHS = MI.getOperand(2).getReg();
+    unsigned RHS = MI.getOperand(3).getReg();
+    unsigned CarryIn = MI.getOperand(4).getReg();
+
+    unsigned TmpRes = MRI.createGenericVirtualRegister(Ty);
+    unsigned ZExtCarryIn = MRI.createGenericVirtualRegister(Ty);
+
+    MIRBuilder.buildAdd(TmpRes, LHS, RHS);
+    MIRBuilder.buildZExt(ZExtCarryIn, CarryIn);
+    MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
+    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
+
+    MI.eraseFromParent();
+    return Legalized;
+  }
   }
 }
 
@@ -1072,6 +1149,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   // FIXME: Don't know how to handle secondary types yet.
   if (TypeIdx != 0)
     return UnableToLegalize;
+
+  MIRBuilder.setInstr(MI);
   switch (MI.getOpcode()) {
   default:
     return UnableToLegalize;
@@ -1085,8 +1164,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
     if (Size % NarrowSize != 0)
       return UnableToLegalize;
 
-    MIRBuilder.setInstr(MI);
-
     SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
     extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs);
@@ -1097,7 +1174,49 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
       DstRegs.push_back(DstReg);
     }
 
-    MIRBuilder.buildMerge(DstReg, DstRegs);
+    MIRBuilder.buildConcatVectors(DstReg, DstRegs);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+  case TargetOpcode::G_LOAD:
+  case TargetOpcode::G_STORE: {
+    bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
+    unsigned ValReg = MI.getOperand(0).getReg();
+    unsigned AddrReg = MI.getOperand(1).getReg();
+    unsigned NarrowSize = NarrowTy.getSizeInBits();
+    unsigned Size = MRI.getType(ValReg).getSizeInBits();
+    unsigned NumParts = Size / NarrowSize;
+
+    SmallVector<unsigned, 8> NarrowRegs;
+    if (!IsLoad)
+      extractParts(ValReg, NarrowTy, NumParts, NarrowRegs);
+
+    const LLT OffsetTy =
+        LLT::scalar(MRI.getType(AddrReg).getScalarSizeInBits());
+    MachineFunction &MF = *MI.getMF();
+    MachineMemOperand *MMO = *MI.memoperands_begin();
+    for (unsigned Idx = 0; Idx < NumParts; ++Idx) {
+      unsigned Adjustment = Idx * NarrowTy.getSizeInBits() / 8;
+      unsigned Alignment = MinAlign(MMO->getAlignment(), Adjustment);
+      unsigned NewAddrReg = 0;
+      MIRBuilder.materializeGEP(NewAddrReg, AddrReg, OffsetTy, Adjustment);
+      MachineMemOperand &NewMMO = *MF.getMachineMemOperand(
+          MMO->getPointerInfo().getWithOffset(Adjustment), MMO->getFlags(),
+          NarrowTy.getSizeInBits() / 8, Alignment);
+      if (IsLoad) {
+        unsigned Dst = MRI.createGenericVirtualRegister(NarrowTy);
+        NarrowRegs.push_back(Dst);
+        MIRBuilder.buildLoad(Dst, NewAddrReg, NewMMO);
+      } else {
+        MIRBuilder.buildStore(NarrowRegs[Idx], NewAddrReg, NewMMO);
+      }
+    }
+    if (IsLoad) {
+      if (NarrowTy.isVector())
+        MIRBuilder.buildConcatVectors(ValReg, NarrowRegs);
+      else
+        MIRBuilder.buildBuildVector(ValReg, NarrowRegs);
+    }
     MI.eraseFromParent();
     return Legalized;
   }
@@ -1108,27 +1227,27 @@ LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
   unsigned Opc = MI.getOpcode();
   auto &TII = *MI.getMF()->getSubtarget().getInstrInfo();
-  auto isLegalOrCustom = [this](const LegalityQuery &Q) {
+  auto isSupported = [this](const LegalityQuery &Q) {
     auto QAction = LI.getAction(Q).Action;
-    return QAction == Legal || QAction == Custom;
+    return QAction == Legal || QAction == Libcall || QAction == Custom;
   };
   switch (Opc) {
   default:
     return UnableToLegalize;
   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
     // This trivially expands to CTLZ.
+    Observer.changingInstr(MI);
     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
   }
   case TargetOpcode::G_CTLZ: {
     unsigned SrcReg = MI.getOperand(1).getReg();
     unsigned Len = Ty.getSizeInBits();
-    if (isLegalOrCustom({TargetOpcode::G_CTLZ_ZERO_UNDEF, {Ty}})) {
-      // If CTLZ_ZERO_UNDEF is legal or custom, emit that and a select with
-      // zero.
-      auto MIBCtlzZU =
-          MIRBuilder.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, Ty, SrcReg);
+    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {Ty}})) {
+      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
+      auto MIBCtlzZU = MIRBuilder.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF,
+                                             {Ty}, {SrcReg});
       auto MIBZero = MIRBuilder.buildConstant(Ty, 0);
       auto MIBLen = MIRBuilder.buildConstant(Ty, Len);
       auto MIBICmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
@@ -1154,30 +1273,32 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
       auto MIBShiftAmt = MIRBuilder.buildConstant(Ty, 1ULL << i);
       auto MIBOp = MIRBuilder.buildInstr(
-          TargetOpcode::G_OR, Ty, Op,
-          MIRBuilder.buildInstr(TargetOpcode::G_LSHR, Ty, Op, MIBShiftAmt));
+          TargetOpcode::G_OR, {Ty},
+          {Op, MIRBuilder.buildInstr(TargetOpcode::G_LSHR, {Ty},
+                                     {Op, MIBShiftAmt})});
       Op = MIBOp->getOperand(0).getReg();
     }
-    auto MIBPop = MIRBuilder.buildInstr(TargetOpcode::G_CTPOP, Ty, Op);
-    MIRBuilder.buildInstr(TargetOpcode::G_SUB, MI.getOperand(0).getReg(),
-                          MIRBuilder.buildConstant(Ty, Len), MIBPop);
+    auto MIBPop = MIRBuilder.buildInstr(TargetOpcode::G_CTPOP, {Ty}, {Op});
+    MIRBuilder.buildInstr(TargetOpcode::G_SUB, {MI.getOperand(0).getReg()},
+                          {MIRBuilder.buildConstant(Ty, Len), MIBPop});
     MI.eraseFromParent();
     return Legalized;
   }
   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
     // This trivially expands to CTTZ.
+    Observer.changingInstr(MI);
     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
-    MIRBuilder.recordInsertion(&MI);
+    Observer.changedInstr(MI);
     return Legalized;
   }
   case TargetOpcode::G_CTTZ: {
     unsigned SrcReg = MI.getOperand(1).getReg();
     unsigned Len = Ty.getSizeInBits();
-    if (isLegalOrCustom({TargetOpcode::G_CTTZ_ZERO_UNDEF, {Ty}})) {
+    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {Ty}})) {
       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
       // zero.
-      auto MIBCttzZU =
-          MIRBuilder.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, Ty, SrcReg);
+      auto MIBCttzZU = MIRBuilder.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF,
+                                             {Ty}, {SrcReg});
       auto MIBZero = MIRBuilder.buildConstant(Ty, 0);
       auto MIBLen = MIRBuilder.buildConstant(Ty, Len);
       auto MIBICmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
@@ -1193,17 +1314,18 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
     // Ref: "Hacker's Delight" by Henry Warren
     auto MIBCstNeg1 = MIRBuilder.buildConstant(Ty, -1);
     auto MIBNot =
-        MIRBuilder.buildInstr(TargetOpcode::G_XOR, Ty, SrcReg, MIBCstNeg1);
+        MIRBuilder.buildInstr(TargetOpcode::G_XOR, {Ty}, {SrcReg, MIBCstNeg1});
     auto MIBTmp = MIRBuilder.buildInstr(
-        TargetOpcode::G_AND, Ty, MIBNot,
-        MIRBuilder.buildInstr(TargetOpcode::G_ADD, Ty, SrcReg, MIBCstNeg1));
-    if (!isLegalOrCustom({TargetOpcode::G_CTPOP, {Ty}}) &&
-        isLegalOrCustom({TargetOpcode::G_CTLZ, {Ty}})) {
+        TargetOpcode::G_AND, {Ty},
+        {MIBNot, MIRBuilder.buildInstr(TargetOpcode::G_ADD, {Ty},
+                                       {SrcReg, MIBCstNeg1})});
+    if (!isSupported({TargetOpcode::G_CTPOP, {Ty}}) &&
+        isSupported({TargetOpcode::G_CTLZ, {Ty}})) {
       auto MIBCstLen = MIRBuilder.buildConstant(Ty, Len);
       MIRBuilder.buildInstr(
-          TargetOpcode::G_SUB, MI.getOperand(0).getReg(),
-          MIBCstLen,
-          MIRBuilder.buildInstr(TargetOpcode::G_CTLZ, Ty, MIBTmp));
+          TargetOpcode::G_SUB, {MI.getOperand(0).getReg()},
+          {MIBCstLen,
+           MIRBuilder.buildInstr(TargetOpcode::G_CTLZ, {Ty}, {MIBTmp})});
       MI.eraseFromParent();
       return Legalized;
     }
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index ca776de0a0fe0d9e69ec90307fb580cfe60465e7..926af3fd76c6760c13f3483df2577a77c44456ed 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -19,6 +19,7 @@
 
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -375,7 +376,8 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
 }
 
 bool LegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
-                                   MachineIRBuilder &MIRBuilder) const {
+                                   MachineIRBuilder &MIRBuilder,
+                                   GISelChangeObserver &Observer) const {
   return false;
 }
 
diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index c7b98c6b85961c85cf7883d45385c2be658bf859..c1109e611777dfe7abd9f261406cce476052f6c7 100644
--- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -10,6 +10,7 @@
 /// This file implements the MachineIRBuidler class.
 //===----------------------------------------------------------------------===//
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -22,73 +23,70 @@
 
 using namespace llvm;
 
-void MachineIRBuilderBase::setMF(MachineFunction &MF) {
+void MachineIRBuilder::setMF(MachineFunction &MF) {
   State.MF = &MF;
   State.MBB = nullptr;
   State.MRI = &MF.getRegInfo();
   State.TII = MF.getSubtarget().getInstrInfo();
   State.DL = DebugLoc();
   State.II = MachineBasicBlock::iterator();
-  State.InsertedInstr = nullptr;
+  State.Observer = nullptr;
 }
 
-void MachineIRBuilderBase::setMBB(MachineBasicBlock &MBB) {
+void MachineIRBuilder::setMBB(MachineBasicBlock &MBB) {
   State.MBB = &MBB;
   State.II = MBB.end();
   assert(&getMF() == MBB.getParent() &&
          "Basic block is in a different function");
 }
 
-void MachineIRBuilderBase::setInstr(MachineInstr &MI) {
+void MachineIRBuilder::setInstr(MachineInstr &MI) {
   assert(MI.getParent() && "Instruction is not part of a basic block");
   setMBB(*MI.getParent());
   State.II = MI.getIterator();
 }
 
-void MachineIRBuilderBase::setInsertPt(MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator II) {
+void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator II) {
   assert(MBB.getParent() == &getMF() &&
          "Basic block is in a different function");
   State.MBB = &MBB;
   State.II = II;
 }
 
-void MachineIRBuilderBase::recordInsertion(MachineInstr *InsertedInstr) const {
-  if (State.InsertedInstr)
-    State.InsertedInstr(InsertedInstr);
+void MachineIRBuilder::recordInsertion(MachineInstr *InsertedInstr) const {
+  if (State.Observer)
+    State.Observer->createdInstr(*InsertedInstr);
 }
 
-void MachineIRBuilderBase::recordInsertions(
-    std::function<void(MachineInstr *)> Inserted) {
-  State.InsertedInstr = std::move(Inserted);
+void MachineIRBuilder::setChangeObserver(GISelChangeObserver &Observer) {
+  State.Observer = &Observer;
 }
 
-void MachineIRBuilderBase::stopRecordingInsertions() {
-  State.InsertedInstr = nullptr;
-}
+void MachineIRBuilder::stopObservingChanges() { State.Observer = nullptr; }
 
 //------------------------------------------------------------------------------
 // Build instruction variants.
 //------------------------------------------------------------------------------
 
-MachineInstrBuilder MachineIRBuilderBase::buildInstr(unsigned Opcode) {
+MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opcode) {
   return insertInstr(buildInstrNoInsert(Opcode));
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildInstrNoInsert(unsigned Opcode) {
+MachineInstrBuilder MachineIRBuilder::buildInstrNoInsert(unsigned Opcode) {
   MachineInstrBuilder MIB = BuildMI(getMF(), getDL(), getTII().get(Opcode));
   return MIB;
 }
 
-MachineInstrBuilder MachineIRBuilderBase::insertInstr(MachineInstrBuilder MIB) {
+MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) {
   getMBB().insert(getInsertPt(), MIB);
   recordInsertion(MIB);
   return MIB;
 }
 
 MachineInstrBuilder
-MachineIRBuilderBase::buildDirectDbgValue(unsigned Reg, const MDNode *Variable,
-                                          const MDNode *Expr) {
+MachineIRBuilder::buildDirectDbgValue(unsigned Reg, const MDNode *Variable,
+                                      const MDNode *Expr) {
   assert(isa<DILocalVariable>(Variable) && "not a variable");
   assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
   assert(
@@ -99,8 +97,9 @@ MachineIRBuilderBase::buildDirectDbgValue(unsigned Reg, const MDNode *Variable,
                              /*IsIndirect*/ false, Reg, Variable, Expr));
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildIndirectDbgValue(
-    unsigned Reg, const MDNode *Variable, const MDNode *Expr) {
+MachineInstrBuilder
+MachineIRBuilder::buildIndirectDbgValue(unsigned Reg, const MDNode *Variable,
+                                        const MDNode *Expr) {
   assert(isa<DILocalVariable>(Variable) && "not a variable");
   assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
   assert(
@@ -111,9 +110,9 @@ MachineInstrBuilder MachineIRBuilderBase::buildIndirectDbgValue(
                              /*IsIndirect*/ true, Reg, Variable, Expr));
 }
 
-MachineInstrBuilder
-MachineIRBuilderBase::buildFIDbgValue(int FI, const MDNode *Variable,
-                                      const MDNode *Expr) {
+MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
+                                                      const MDNode *Variable,
+                                                      const MDNode *Expr) {
   assert(isa<DILocalVariable>(Variable) && "not a variable");
   assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
   assert(
@@ -126,8 +125,9 @@ MachineIRBuilderBase::buildFIDbgValue(int FI, const MDNode *Variable,
       .addMetadata(Expr);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildConstDbgValue(
-    const Constant &C, const MDNode *Variable, const MDNode *Expr) {
+MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
+                                                         const MDNode *Variable,
+                                                         const MDNode *Expr) {
   assert(isa<DILocalVariable>(Variable) && "not a variable");
   assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
   assert(
@@ -149,7 +149,7 @@ MachineInstrBuilder MachineIRBuilderBase::buildConstDbgValue(
   return MIB.addImm(0).addMetadata(Variable).addMetadata(Expr);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildDbgLabel(const MDNode *Label) {
+MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) {
   assert(isa<DILabel>(Label) && "not a label");
   assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(State.DL) &&
          "Expected inlined-at fields to agree");
@@ -158,16 +158,15 @@ MachineInstrBuilder MachineIRBuilderBase::buildDbgLabel(const MDNode *Label) {
   return MIB.addMetadata(Label);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildFrameIndex(unsigned Res,
-                                                          int Idx) {
+MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) {
   assert(getMRI()->getType(Res).isPointer() && "invalid operand type");
   return buildInstr(TargetOpcode::G_FRAME_INDEX)
       .addDef(Res)
       .addFrameIndex(Idx);
 }
 
-MachineInstrBuilder
-MachineIRBuilderBase::buildGlobalValue(unsigned Res, const GlobalValue *GV) {
+MachineInstrBuilder MachineIRBuilder::buildGlobalValue(unsigned Res,
+                                                       const GlobalValue *GV) {
   assert(getMRI()->getType(Res).isPointer() && "invalid operand type");
   assert(getMRI()->getType(Res).getAddressSpace() ==
              GV->getType()->getAddressSpace() &&
@@ -178,17 +177,14 @@ MachineIRBuilderBase::buildGlobalValue(unsigned Res, const GlobalValue *GV) {
       .addGlobalAddress(GV);
 }
 
-void MachineIRBuilderBase::validateBinaryOp(unsigned Res, unsigned Op0,
-                                            unsigned Op1) {
-  assert((getMRI()->getType(Res).isScalar() ||
-          getMRI()->getType(Res).isVector()) &&
-         "invalid operand type");
-  assert(getMRI()->getType(Res) == getMRI()->getType(Op0) &&
-         getMRI()->getType(Res) == getMRI()->getType(Op1) && "type mismatch");
+void MachineIRBuilder::validateBinaryOp(const LLT &Res, const LLT &Op0,
+                                        const LLT &Op1) {
+  assert((Res.isScalar() || Res.isVector()) && "invalid operand type");
+  assert((Res == Op0 && Res == Op1) && "type mismatch");
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildGEP(unsigned Res, unsigned Op0,
-                                                   unsigned Op1) {
+MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,
+                                               unsigned Op1) {
   assert(getMRI()->getType(Res).isPointer() &&
          getMRI()->getType(Res) == getMRI()->getType(Op0) && "type mismatch");
   assert(getMRI()->getType(Op1).isScalar() && "invalid offset type");
@@ -200,8 +196,8 @@ MachineInstrBuilder MachineIRBuilderBase::buildGEP(unsigned Res, unsigned Op0,
 }
 
 Optional<MachineInstrBuilder>
-MachineIRBuilderBase::materializeGEP(unsigned &Res, unsigned Op0,
-                                     const LLT &ValueTy, uint64_t Value) {
+MachineIRBuilder::materializeGEP(unsigned &Res, unsigned Op0,
+                                 const LLT &ValueTy, uint64_t Value) {
   assert(Res == 0 && "Res is a result argument");
   assert(ValueTy.isScalar()  && "invalid offset type");
 
@@ -217,9 +213,8 @@ MachineIRBuilderBase::materializeGEP(unsigned &Res, unsigned Op0,
   return buildGEP(Res, Op0, TmpReg);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildPtrMask(unsigned Res,
-                                                       unsigned Op0,
-                                                       uint32_t NumBits) {
+MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0,
+                                                   uint32_t NumBits) {
   assert(getMRI()->getType(Res).isPointer() &&
          getMRI()->getType(Res) == getMRI()->getType(Op0) && "type mismatch");
 
@@ -229,24 +224,23 @@ MachineInstrBuilder MachineIRBuilderBase::buildPtrMask(unsigned Res,
       .addImm(NumBits);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildBr(MachineBasicBlock &Dest) {
+MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
   return buildInstr(TargetOpcode::G_BR).addMBB(&Dest);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildBrIndirect(unsigned Tgt) {
+MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) {
   assert(getMRI()->getType(Tgt).isPointer() && "invalid branch destination");
   return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildCopy(unsigned Res, unsigned Op) {
-  assert(getMRI()->getType(Res) == LLT() || getMRI()->getType(Op) == LLT() ||
-         getMRI()->getType(Res) == getMRI()->getType(Op));
-  return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op);
+MachineInstrBuilder MachineIRBuilder::buildCopy(const DstOp &Res,
+                                                const SrcOp &Op) {
+  return buildInstr(TargetOpcode::COPY, Res, Op);
 }
 
-MachineInstrBuilder
-MachineIRBuilderBase::buildConstant(unsigned Res, const ConstantInt &Val) {
-  LLT Ty = getMRI()->getType(Res);
+MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res,
+                                                    const ConstantInt &Val) {
+  LLT Ty = Res.getLLTTy(*getMRI());
 
   assert((Ty.isScalar() || Ty.isPointer()) && "invalid operand type");
 
@@ -255,48 +249,55 @@ MachineIRBuilderBase::buildConstant(unsigned Res, const ConstantInt &Val) {
     NewVal = ConstantInt::get(getMF().getFunction().getContext(),
                               Val.getValue().sextOrTrunc(Ty.getSizeInBits()));
 
-  return buildInstr(TargetOpcode::G_CONSTANT).addDef(Res).addCImm(NewVal);
+  auto MIB = buildInstr(TargetOpcode::G_CONSTANT);
+  Res.addDefToMIB(*getMRI(), MIB);
+  MIB.addCImm(NewVal);
+  return MIB;
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildConstant(unsigned Res,
-                                                        int64_t Val) {
+MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res,
+                                                    int64_t Val) {
   auto IntN = IntegerType::get(getMF().getFunction().getContext(),
-                               getMRI()->getType(Res).getSizeInBits());
+                               Res.getLLTTy(*getMRI()).getSizeInBits());
   ConstantInt *CI = ConstantInt::get(IntN, Val, true);
   return buildConstant(Res, *CI);
 }
 
-MachineInstrBuilder
-MachineIRBuilderBase::buildFConstant(unsigned Res, const ConstantFP &Val) {
-  assert(getMRI()->getType(Res).isScalar() && "invalid operand type");
+MachineInstrBuilder MachineIRBuilder::buildFConstant(const DstOp &Res,
+                                                     const ConstantFP &Val) {
+  assert(Res.getLLTTy(*getMRI()).isScalar() && "invalid operand type");
 
-  return buildInstr(TargetOpcode::G_FCONSTANT).addDef(Res).addFPImm(&Val);
+  auto MIB = buildInstr(TargetOpcode::G_FCONSTANT);
+  Res.addDefToMIB(*getMRI(), MIB);
+  MIB.addFPImm(&Val);
+  return MIB;
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildFConstant(unsigned Res,
-                                                         double Val) {
-  LLT DstTy = getMRI()->getType(Res);
+MachineInstrBuilder MachineIRBuilder::buildFConstant(const DstOp &Res,
+                                                     double Val) {
+  LLT DstTy = Res.getLLTTy(*getMRI());
   auto &Ctx = getMF().getFunction().getContext();
   auto *CFP =
       ConstantFP::get(Ctx, getAPFloatFromSize(Val, DstTy.getSizeInBits()));
   return buildFConstant(Res, *CFP);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildBrCond(unsigned Tst,
-                                                      MachineBasicBlock &Dest) {
+MachineInstrBuilder MachineIRBuilder::buildBrCond(unsigned Tst,
+                                                  MachineBasicBlock &Dest) {
   assert(getMRI()->getType(Tst).isScalar() && "invalid operand type");
 
   return buildInstr(TargetOpcode::G_BRCOND).addUse(Tst).addMBB(&Dest);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildLoad(unsigned Res, unsigned Addr,
-                                                    MachineMemOperand &MMO) {
+MachineInstrBuilder MachineIRBuilder::buildLoad(unsigned Res, unsigned Addr,
+                                                MachineMemOperand &MMO) {
   return buildLoadInstr(TargetOpcode::G_LOAD, Res, Addr, MMO);
 }
 
-MachineInstrBuilder
-MachineIRBuilderBase::buildLoadInstr(unsigned Opcode, unsigned Res,
-                                     unsigned Addr, MachineMemOperand &MMO) {
+MachineInstrBuilder MachineIRBuilder::buildLoadInstr(unsigned Opcode,
+                                                     unsigned Res,
+                                                     unsigned Addr,
+                                                     MachineMemOperand &MMO) {
   assert(getMRI()->getType(Res).isValid() && "invalid operand type");
   assert(getMRI()->getType(Addr).isPointer() && "invalid operand type");
 
@@ -306,9 +307,8 @@ MachineIRBuilderBase::buildLoadInstr(unsigned Opcode, unsigned Res,
       .addMemOperand(&MMO);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildStore(unsigned Val,
-                                                     unsigned Addr,
-                                                     MachineMemOperand &MMO) {
+MachineInstrBuilder MachineIRBuilder::buildStore(unsigned Val, unsigned Addr,
+                                                 MachineMemOperand &MMO) {
   assert(getMRI()->getType(Val).isValid() && "invalid operand type");
   assert(getMRI()->getType(Addr).isPointer() && "invalid operand type");
 
@@ -318,83 +318,73 @@ MachineInstrBuilder MachineIRBuilderBase::buildStore(unsigned Val,
       .addMemOperand(&MMO);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildUAdde(unsigned Res,
-                                                     unsigned CarryOut,
-                                                     unsigned Op0, unsigned Op1,
-                                                     unsigned CarryIn) {
-  assert(getMRI()->getType(Res).isScalar() && "invalid operand type");
-  assert(getMRI()->getType(Res) == getMRI()->getType(Op0) &&
-         getMRI()->getType(Res) == getMRI()->getType(Op1) && "type mismatch");
-  assert(getMRI()->getType(CarryOut).isScalar() && "invalid operand type");
-  assert(getMRI()->getType(CarryOut) == getMRI()->getType(CarryIn) &&
-         "type mismatch");
-
-  return buildInstr(TargetOpcode::G_UADDE)
-      .addDef(Res)
-      .addDef(CarryOut)
-      .addUse(Op0)
-      .addUse(Op1)
-      .addUse(CarryIn);
+MachineInstrBuilder MachineIRBuilder::buildUAdde(const DstOp &Res,
+                                                 const DstOp &CarryOut,
+                                                 const SrcOp &Op0,
+                                                 const SrcOp &Op1,
+                                                 const SrcOp &CarryIn) {
+  return buildInstr(TargetOpcode::G_UADDE, {Res, CarryOut},
+                    {Op0, Op1, CarryIn});
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildAnyExt(unsigned Res,
-                                                      unsigned Op) {
-  validateTruncExt(Res, Op, true);
-  return buildInstr(TargetOpcode::G_ANYEXT).addDef(Res).addUse(Op);
+MachineInstrBuilder MachineIRBuilder::buildAnyExt(const DstOp &Res,
+                                                  const SrcOp &Op) {
+  return buildInstr(TargetOpcode::G_ANYEXT, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildSExt(unsigned Res, unsigned Op) {
-  validateTruncExt(Res, Op, true);
-  return buildInstr(TargetOpcode::G_SEXT).addDef(Res).addUse(Op);
+MachineInstrBuilder MachineIRBuilder::buildSExt(const DstOp &Res,
+                                                const SrcOp &Op) {
+  return buildInstr(TargetOpcode::G_SEXT, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildZExt(unsigned Res, unsigned Op) {
-  validateTruncExt(Res, Op, true);
-  return buildInstr(TargetOpcode::G_ZEXT).addDef(Res).addUse(Op);
+MachineInstrBuilder MachineIRBuilder::buildZExt(const DstOp &Res,
+                                                const SrcOp &Op) {
+  return buildInstr(TargetOpcode::G_ZEXT, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildExtOrTrunc(unsigned ExtOpc,
-                                                          unsigned Res,
-                                                          unsigned Op) {
+MachineInstrBuilder MachineIRBuilder::buildExtOrTrunc(unsigned ExtOpc,
+                                                      const DstOp &Res,
+                                                      const SrcOp &Op) {
   assert((TargetOpcode::G_ANYEXT == ExtOpc || TargetOpcode::G_ZEXT == ExtOpc ||
           TargetOpcode::G_SEXT == ExtOpc) &&
          "Expecting Extending Opc");
-  assert(getMRI()->getType(Res).isScalar() ||
-         getMRI()->getType(Res).isVector());
-  assert(getMRI()->getType(Res).isScalar() == getMRI()->getType(Op).isScalar());
+  assert(Res.getLLTTy(*getMRI()).isScalar() ||
+         Res.getLLTTy(*getMRI()).isVector());
+  assert(Res.getLLTTy(*getMRI()).isScalar() ==
+         Op.getLLTTy(*getMRI()).isScalar());
 
   unsigned Opcode = TargetOpcode::COPY;
-  if (getMRI()->getType(Res).getSizeInBits() >
-      getMRI()->getType(Op).getSizeInBits())
+  if (Res.getLLTTy(*getMRI()).getSizeInBits() >
+      Op.getLLTTy(*getMRI()).getSizeInBits())
     Opcode = ExtOpc;
-  else if (getMRI()->getType(Res).getSizeInBits() <
-           getMRI()->getType(Op).getSizeInBits())
+  else if (Res.getLLTTy(*getMRI()).getSizeInBits() <
+           Op.getLLTTy(*getMRI()).getSizeInBits())
     Opcode = TargetOpcode::G_TRUNC;
   else
-    assert(getMRI()->getType(Res) == getMRI()->getType(Op));
+    assert(Res.getLLTTy(*getMRI()) == Op.getLLTTy(*getMRI()));
 
-  return buildInstr(Opcode).addDef(Res).addUse(Op);
+  return buildInstr(Opcode, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildSExtOrTrunc(unsigned Res,
-                                                           unsigned Op) {
+MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(const DstOp &Res,
+                                                       const SrcOp &Op) {
   return buildExtOrTrunc(TargetOpcode::G_SEXT, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildZExtOrTrunc(unsigned Res,
-                                                           unsigned Op) {
+MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(const DstOp &Res,
+                                                       const SrcOp &Op) {
   return buildExtOrTrunc(TargetOpcode::G_ZEXT, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildAnyExtOrTrunc(unsigned Res,
-                                                             unsigned Op) {
+MachineInstrBuilder MachineIRBuilder::buildAnyExtOrTrunc(const DstOp &Res,
+                                                         const SrcOp &Op) {
   return buildExtOrTrunc(TargetOpcode::G_ANYEXT, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildCast(unsigned Dst,
-                                                    unsigned Src) {
-  LLT SrcTy = getMRI()->getType(Src);
-  LLT DstTy = getMRI()->getType(Dst);
+MachineInstrBuilder MachineIRBuilder::buildCast(const DstOp &Dst,
+                                                const SrcOp &Src) {
+  LLT SrcTy = Src.getLLTTy(*getMRI());
+  LLT DstTy = Dst.getLLTTy(*getMRI());
   if (SrcTy == DstTy)
     return buildCopy(Dst, Src);
 
@@ -408,11 +398,11 @@ MachineInstrBuilder MachineIRBuilderBase::buildCast(unsigned Dst,
     Opcode = TargetOpcode::G_BITCAST;
   }
 
-  return buildInstr(Opcode).addDef(Dst).addUse(Src);
+  return buildInstr(Opcode, Dst, Src);
 }
 
-MachineInstrBuilder
-MachineIRBuilderBase::buildExtract(unsigned Res, unsigned Src, uint64_t Index) {
+MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src,
+                                                   uint64_t Index) {
 #ifndef NDEBUG
   assert(getMRI()->getType(Src).isValid() && "invalid operand type");
   assert(getMRI()->getType(Res).isValid() && "invalid operand type");
@@ -433,8 +423,8 @@ MachineIRBuilderBase::buildExtract(unsigned Res, unsigned Src, uint64_t Index) {
       .addImm(Index);
 }
 
-void MachineIRBuilderBase::buildSequence(unsigned Res, ArrayRef<unsigned> Ops,
-                                         ArrayRef<uint64_t> Indices) {
+void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef<unsigned> Ops,
+                                     ArrayRef<uint64_t> Indices) {
 #ifndef NDEBUG
   assert(Ops.size() == Indices.size() && "incompatible args");
   assert(!Ops.empty() && "invalid trivial sequence");
@@ -474,56 +464,67 @@ void MachineIRBuilderBase::buildSequence(unsigned Res, ArrayRef<unsigned> Ops,
   }
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildUndef(unsigned Res) {
-  return buildInstr(TargetOpcode::G_IMPLICIT_DEF).addDef(Res);
+MachineInstrBuilder MachineIRBuilder::buildUndef(const DstOp &Res) {
+  return buildInstr(TargetOpcode::G_IMPLICIT_DEF, {Res}, {});
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildMerge(unsigned Res,
-                                                     ArrayRef<unsigned> Ops) {
-
-#ifndef NDEBUG
-  assert(!Ops.empty() && "invalid trivial sequence");
-  LLT Ty = getMRI()->getType(Ops[0]);
-  for (auto Reg : Ops)
-    assert(getMRI()->getType(Reg) == Ty && "type mismatch in input list");
-  assert(Ops.size() * getMRI()->getType(Ops[0]).getSizeInBits() ==
-             getMRI()->getType(Res).getSizeInBits() &&
-         "input operands do not cover output register");
-#endif
+MachineInstrBuilder MachineIRBuilder::buildMerge(const DstOp &Res,
+                                                 ArrayRef<unsigned> Ops) {
+  // Unfortunately to convert from ArrayRef<LLT> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the DstOp objects. Here we use a
+  // sufficiently large SmallVector to not go through the heap.
+  SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+  return buildInstr(TargetOpcode::G_MERGE_VALUES, Res, TmpVec);
+}
 
-  if (Ops.size() == 1)
-    return buildCast(Res, Ops[0]);
+MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<LLT> Res,
+                                                   const SrcOp &Op) {
+  // Unfortunately to convert from ArrayRef<LLT> to ArrayRef<DstOp>,
+  // we need some temporary storage for the DstOp objects. Here we use a
+  // sufficiently large SmallVector to not go through the heap.
+  SmallVector<DstOp, 8> TmpVec(Res.begin(), Res.end());
+  return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
+}
 
-  MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES);
-  MIB.addDef(Res);
-  for (unsigned i = 0; i < Ops.size(); ++i)
-    MIB.addUse(Ops[i]);
-  return MIB;
+MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res,
+                                                   const SrcOp &Op) {
+  // Unfortunately to convert from ArrayRef<unsigned> to ArrayRef<DstOp>,
+  // we need some temporary storage for the DstOp objects. Here we use a
+  // sufficiently large SmallVector to not go through the heap.
+  SmallVector<DstOp, 8> TmpVec(Res.begin(), Res.end());
+  return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildUnmerge(ArrayRef<unsigned> Res,
-                                                       unsigned Op) {
+MachineInstrBuilder MachineIRBuilder::buildBuildVector(const DstOp &Res,
+                                                       ArrayRef<unsigned> Ops) {
+  // Unfortunately to convert from ArrayRef<unsigned> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the DstOp objects. Here we use a
+  // sufficiently large SmallVector to not go through the heap.
+  SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+  return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec);
+}
 
-#ifndef NDEBUG
-  assert(!Res.empty() && "invalid trivial sequence");
-  LLT Ty = getMRI()->getType(Res[0]);
-  for (auto Reg : Res)
-    assert(getMRI()->getType(Reg) == Ty && "type mismatch in input list");
-  assert(Res.size() * getMRI()->getType(Res[0]).getSizeInBits() ==
-             getMRI()->getType(Op).getSizeInBits() &&
-         "input operands do not cover output register");
-#endif
+MachineInstrBuilder
+MachineIRBuilder::buildBuildVectorTrunc(const DstOp &Res,
+                                        ArrayRef<unsigned> Ops) {
+  // Unfortunately to convert from ArrayRef<unsigned> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the DstOp objects. Here we use a
+  // sufficiently large SmallVector to not go through the heap.
+  SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+  return buildInstr(TargetOpcode::G_BUILD_VECTOR_TRUNC, Res, TmpVec);
+}
 
-  MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES);
-  for (unsigned i = 0; i < Res.size(); ++i)
-    MIB.addDef(Res[i]);
-  MIB.addUse(Op);
-  return MIB;
+MachineInstrBuilder
+MachineIRBuilder::buildConcatVectors(const DstOp &Res, ArrayRef<unsigned> Ops) {
+  // Unfortunately to convert from ArrayRef<unsigned> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the DstOp objects. Here we use a
+  // sufficiently large SmallVector to not go through the heap.
+  SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+  return buildInstr(TargetOpcode::G_CONCAT_VECTORS, Res, TmpVec);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildInsert(unsigned Res,
-                                                      unsigned Src, unsigned Op,
-                                                      unsigned Index) {
+MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src,
+                                                  unsigned Op, unsigned Index) {
   assert(Index + getMRI()->getType(Op).getSizeInBits() <=
              getMRI()->getType(Res).getSizeInBits() &&
          "insertion past the end of a register");
@@ -540,9 +541,9 @@ MachineInstrBuilder MachineIRBuilderBase::buildInsert(unsigned Res,
       .addImm(Index);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildIntrinsic(Intrinsic::ID ID,
-                                                         unsigned Res,
-                                                         bool HasSideEffects) {
+MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
+                                                     unsigned Res,
+                                                     bool HasSideEffects) {
   auto MIB =
       buildInstr(HasSideEffects ? TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS
                                 : TargetOpcode::G_INTRINSIC);
@@ -552,133 +553,52 @@ MachineInstrBuilder MachineIRBuilderBase::buildIntrinsic(Intrinsic::ID ID,
   return MIB;
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildTrunc(unsigned Res,
-                                                     unsigned Op) {
-  validateTruncExt(Res, Op, false);
-  return buildInstr(TargetOpcode::G_TRUNC).addDef(Res).addUse(Op);
+MachineInstrBuilder MachineIRBuilder::buildTrunc(const DstOp &Res,
+                                                 const SrcOp &Op) {
+  return buildInstr(TargetOpcode::G_TRUNC, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildFPTrunc(unsigned Res,
-                                                       unsigned Op) {
-  validateTruncExt(Res, Op, false);
-  return buildInstr(TargetOpcode::G_FPTRUNC).addDef(Res).addUse(Op);
+MachineInstrBuilder MachineIRBuilder::buildFPTrunc(const DstOp &Res,
+                                                   const SrcOp &Op) {
+  return buildInstr(TargetOpcode::G_FPTRUNC, Res, Op);
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildICmp(CmpInst::Predicate Pred,
-                                                    unsigned Res, unsigned Op0,
-                                                    unsigned Op1) {
-#ifndef NDEBUG
-  assert(getMRI()->getType(Op0) == getMRI()->getType(Op0) && "type mismatch");
-  assert(CmpInst::isIntPredicate(Pred) && "invalid predicate");
-  if (getMRI()->getType(Op0).isScalar() || getMRI()->getType(Op0).isPointer())
-    assert(getMRI()->getType(Res).isScalar() && "type mismatch");
-  else
-    assert(getMRI()->getType(Res).isVector() &&
-           getMRI()->getType(Res).getNumElements() ==
-               getMRI()->getType(Op0).getNumElements() &&
-           "type mismatch");
-#endif
-
-  return buildInstr(TargetOpcode::G_ICMP)
-      .addDef(Res)
-      .addPredicate(Pred)
-      .addUse(Op0)
-      .addUse(Op1);
+MachineInstrBuilder MachineIRBuilder::buildICmp(CmpInst::Predicate Pred,
+                                                const DstOp &Res,
+                                                const SrcOp &Op0,
+                                                const SrcOp &Op1) {
+  return buildInstr(TargetOpcode::G_ICMP, Res, {Pred, Op0, Op1});
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildFCmp(CmpInst::Predicate Pred,
-                                                    unsigned Res, unsigned Op0,
-                                                    unsigned Op1) {
-#ifndef NDEBUG
-  assert((getMRI()->getType(Op0).isScalar() ||
-          getMRI()->getType(Op0).isVector()) &&
-         "invalid operand type");
-  assert(getMRI()->getType(Op0) == getMRI()->getType(Op1) && "type mismatch");
-  assert(CmpInst::isFPPredicate(Pred) && "invalid predicate");
-  if (getMRI()->getType(Op0).isScalar())
-    assert(getMRI()->getType(Res).isScalar() && "type mismatch");
-  else
-    assert(getMRI()->getType(Res).isVector() &&
-           getMRI()->getType(Res).getNumElements() ==
-               getMRI()->getType(Op0).getNumElements() &&
-           "type mismatch");
-#endif
+MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred,
+                                                const DstOp &Res,
+                                                const SrcOp &Op0,
+                                                const SrcOp &Op1) {
 
-  return buildInstr(TargetOpcode::G_FCMP)
-      .addDef(Res)
-      .addPredicate(Pred)
-      .addUse(Op0)
-      .addUse(Op1);
+  return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1});
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildSelect(unsigned Res,
-                                                      unsigned Tst,
-                                                      unsigned Op0,
-                                                      unsigned Op1) {
-#ifndef NDEBUG
-  LLT ResTy = getMRI()->getType(Res);
-  assert((ResTy.isScalar() || ResTy.isVector() || ResTy.isPointer()) &&
-         "invalid operand type");
-  assert(ResTy == getMRI()->getType(Op0) && ResTy == getMRI()->getType(Op1) &&
-         "type mismatch");
-  if (ResTy.isScalar() || ResTy.isPointer())
-    assert(getMRI()->getType(Tst).isScalar() && "type mismatch");
-  else
-    assert((getMRI()->getType(Tst).isScalar() ||
-            (getMRI()->getType(Tst).isVector() &&
-             getMRI()->getType(Tst).getNumElements() ==
-                 getMRI()->getType(Op0).getNumElements())) &&
-           "type mismatch");
-#endif
+MachineInstrBuilder MachineIRBuilder::buildSelect(const DstOp &Res,
+                                                  const SrcOp &Tst,
+                                                  const SrcOp &Op0,
+                                                  const SrcOp &Op1) {
 
-  return buildInstr(TargetOpcode::G_SELECT)
-      .addDef(Res)
-      .addUse(Tst)
-      .addUse(Op0)
-      .addUse(Op1);
+  return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1});
 }
 
 MachineInstrBuilder
-MachineIRBuilderBase::buildInsertVectorElement(unsigned Res, unsigned Val,
-                                               unsigned Elt, unsigned Idx) {
-#ifndef NDEBUG
-  LLT ResTy = getMRI()->getType(Res);
-  LLT ValTy = getMRI()->getType(Val);
-  LLT EltTy = getMRI()->getType(Elt);
-  LLT IdxTy = getMRI()->getType(Idx);
-  assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type");
-  assert(IdxTy.isScalar() && "invalid operand type");
-  assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch");
-  assert(ResTy.getElementType() == EltTy && "type mismatch");
-#endif
-
-  return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT)
-      .addDef(Res)
-      .addUse(Val)
-      .addUse(Elt)
-      .addUse(Idx);
+MachineIRBuilder::buildInsertVectorElement(const DstOp &Res, const SrcOp &Val,
+                                           const SrcOp &Elt, const SrcOp &Idx) {
+  return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT, Res, {Val, Elt, Idx});
 }
 
 MachineInstrBuilder
-MachineIRBuilderBase::buildExtractVectorElement(unsigned Res, unsigned Val,
-                                                unsigned Idx) {
-#ifndef NDEBUG
-  LLT ResTy = getMRI()->getType(Res);
-  LLT ValTy = getMRI()->getType(Val);
-  LLT IdxTy = getMRI()->getType(Idx);
-  assert(ValTy.isVector() && "invalid operand type");
-  assert((ResTy.isScalar() || ResTy.isPointer()) && "invalid operand type");
-  assert(IdxTy.isScalar() && "invalid operand type");
-  assert(ValTy.getElementType() == ResTy && "type mismatch");
-#endif
-
-  return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT)
-      .addDef(Res)
-      .addUse(Val)
-      .addUse(Idx);
+MachineIRBuilder::buildExtractVectorElement(const DstOp &Res, const SrcOp &Val,
+                                            const SrcOp &Idx) {
+  return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT, Res, {Val, Idx});
 }
 
-MachineInstrBuilder MachineIRBuilderBase::buildAtomicCmpXchgWithSuccess(
+MachineInstrBuilder MachineIRBuilder::buildAtomicCmpXchgWithSuccess(
     unsigned OldValRes, unsigned SuccessRes, unsigned Addr, unsigned CmpVal,
     unsigned NewVal, MachineMemOperand &MMO) {
 #ifndef NDEBUG
@@ -706,9 +626,9 @@ MachineInstrBuilder MachineIRBuilderBase::buildAtomicCmpXchgWithSuccess(
 }
 
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr,
-                                         unsigned CmpVal, unsigned NewVal,
-                                         MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr,
+                                     unsigned CmpVal, unsigned NewVal,
+                                     MachineMemOperand &MMO) {
 #ifndef NDEBUG
   LLT OldValResTy = getMRI()->getType(OldValRes);
   LLT AddrTy = getMRI()->getType(Addr);
@@ -730,10 +650,11 @@ MachineIRBuilderBase::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr,
       .addMemOperand(&MMO);
 }
 
-MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMW(unsigned Opcode, unsigned OldValRes,
-                                     unsigned Addr, unsigned Val,
-                                     MachineMemOperand &MMO) {
+MachineInstrBuilder MachineIRBuilder::buildAtomicRMW(unsigned Opcode,
+                                                     unsigned OldValRes,
+                                                     unsigned Addr,
+                                                     unsigned Val,
+                                                     MachineMemOperand &MMO) {
 #ifndef NDEBUG
   LLT OldValResTy = getMRI()->getType(OldValRes);
   LLT AddrTy = getMRI()->getType(Addr);
@@ -752,74 +673,75 @@ MachineIRBuilderBase::buildAtomicRMW(unsigned Opcode, unsigned OldValRes,
 }
 
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWXchg(unsigned OldValRes, unsigned Addr,
-                                         unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWXchg(unsigned OldValRes, unsigned Addr,
+                                     unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_XCHG, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWAdd(unsigned OldValRes, unsigned Addr,
-                                        unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWAdd(unsigned OldValRes, unsigned Addr,
+                                    unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_ADD, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWSub(unsigned OldValRes, unsigned Addr,
-                                        unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWSub(unsigned OldValRes, unsigned Addr,
+                                    unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_SUB, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWAnd(unsigned OldValRes, unsigned Addr,
-                                        unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWAnd(unsigned OldValRes, unsigned Addr,
+                                    unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_AND, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWNand(unsigned OldValRes, unsigned Addr,
-                                         unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWNand(unsigned OldValRes, unsigned Addr,
+                                     unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_NAND, OldValRes, Addr, Val,
                         MMO);
 }
-MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWOr(unsigned OldValRes, unsigned Addr,
-                                       unsigned Val, MachineMemOperand &MMO) {
+MachineInstrBuilder MachineIRBuilder::buildAtomicRMWOr(unsigned OldValRes,
+                                                       unsigned Addr,
+                                                       unsigned Val,
+                                                       MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_OR, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWXor(unsigned OldValRes, unsigned Addr,
-                                        unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWXor(unsigned OldValRes, unsigned Addr,
+                                    unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_XOR, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWMax(unsigned OldValRes, unsigned Addr,
-                                        unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWMax(unsigned OldValRes, unsigned Addr,
+                                    unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_MAX, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWMin(unsigned OldValRes, unsigned Addr,
-                                        unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWMin(unsigned OldValRes, unsigned Addr,
+                                    unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_MIN, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWUmax(unsigned OldValRes, unsigned Addr,
-                                         unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWUmax(unsigned OldValRes, unsigned Addr,
+                                     unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_UMAX, OldValRes, Addr, Val,
                         MMO);
 }
 MachineInstrBuilder
-MachineIRBuilderBase::buildAtomicRMWUmin(unsigned OldValRes, unsigned Addr,
-                                         unsigned Val, MachineMemOperand &MMO) {
+MachineIRBuilder::buildAtomicRMWUmin(unsigned OldValRes, unsigned Addr,
+                                     unsigned Val, MachineMemOperand &MMO) {
   return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_UMIN, OldValRes, Addr, Val,
                         MMO);
 }
 
 MachineInstrBuilder
-MachineIRBuilderBase::buildBlockAddress(unsigned Res, const BlockAddress *BA) {
+MachineIRBuilder::buildBlockAddress(unsigned Res, const BlockAddress *BA) {
 #ifndef NDEBUG
   assert(getMRI()->getType(Res).isPointer() && "invalid res type");
 #endif
@@ -827,12 +749,9 @@ MachineIRBuilderBase::buildBlockAddress(unsigned Res, const BlockAddress *BA) {
   return buildInstr(TargetOpcode::G_BLOCK_ADDR).addDef(Res).addBlockAddress(BA);
 }
 
-void MachineIRBuilderBase::validateTruncExt(unsigned Dst, unsigned Src,
-                                            bool IsExtend) {
+void MachineIRBuilder::validateTruncExt(const LLT &DstTy, const LLT &SrcTy,
+                                        bool IsExtend) {
 #ifndef NDEBUG
-  LLT SrcTy = getMRI()->getType(Src);
-  LLT DstTy = getMRI()->getType(Dst);
-
   if (DstTy.isVector()) {
     assert(SrcTy.isVector() && "mismatched cast between vector and non-vector");
     assert(SrcTy.getNumElements() == DstTy.getNumElements() &&
@@ -848,3 +767,236 @@ void MachineIRBuilderBase::validateTruncExt(unsigned Dst, unsigned Src,
            "invalid widening trunc");
 #endif
 }
+
+void MachineIRBuilder::validateSelectOp(const LLT &ResTy, const LLT &TstTy,
+                                        const LLT &Op0Ty, const LLT &Op1Ty) {
+#ifndef NDEBUG
+  assert((ResTy.isScalar() || ResTy.isVector() || ResTy.isPointer()) &&
+         "invalid operand type");
+  assert((ResTy == Op0Ty && ResTy == Op1Ty) && "type mismatch");
+  if (ResTy.isScalar() || ResTy.isPointer())
+    assert(TstTy.isScalar() && "type mismatch");
+  else
+    assert((TstTy.isScalar() ||
+            (TstTy.isVector() &&
+             TstTy.getNumElements() == Op0Ty.getNumElements())) &&
+           "type mismatch");
+#endif
+}
+
+MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opc,
+                                                 ArrayRef<DstOp> DstOps,
+                                                 ArrayRef<SrcOp> SrcOps,
+                                                 Optional<unsigned> Flags) {
+  switch (Opc) {
+  default:
+    break;
+  case TargetOpcode::G_SELECT: {
+    assert(DstOps.size() == 1 && "Invalid select");
+    assert(SrcOps.size() == 3 && "Invalid select");
+    validateSelectOp(
+        DstOps[0].getLLTTy(*getMRI()), SrcOps[0].getLLTTy(*getMRI()),
+        SrcOps[1].getLLTTy(*getMRI()), SrcOps[2].getLLTTy(*getMRI()));
+    break;
+  }
+  case TargetOpcode::G_ADD:
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_ASHR:
+  case TargetOpcode::G_LSHR:
+  case TargetOpcode::G_MUL:
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_SHL:
+  case TargetOpcode::G_SUB:
+  case TargetOpcode::G_XOR:
+  case TargetOpcode::G_UDIV:
+  case TargetOpcode::G_SDIV:
+  case TargetOpcode::G_UREM:
+  case TargetOpcode::G_SREM: {
+    // All these are binary ops.
+    assert(DstOps.size() == 1 && "Invalid Dst");
+    assert(SrcOps.size() == 2 && "Invalid Srcs");
+    validateBinaryOp(DstOps[0].getLLTTy(*getMRI()),
+                     SrcOps[0].getLLTTy(*getMRI()),
+                     SrcOps[1].getLLTTy(*getMRI()));
+    break;
+  case TargetOpcode::G_SEXT:
+  case TargetOpcode::G_ZEXT:
+  case TargetOpcode::G_ANYEXT:
+    assert(DstOps.size() == 1 && "Invalid Dst");
+    assert(SrcOps.size() == 1 && "Invalid Srcs");
+    validateTruncExt(DstOps[0].getLLTTy(*getMRI()),
+                     SrcOps[0].getLLTTy(*getMRI()), true);
+    break;
+  case TargetOpcode::G_TRUNC:
+  case TargetOpcode::G_FPTRUNC:
+    assert(DstOps.size() == 1 && "Invalid Dst");
+    assert(SrcOps.size() == 1 && "Invalid Srcs");
+    validateTruncExt(DstOps[0].getLLTTy(*getMRI()),
+                     SrcOps[0].getLLTTy(*getMRI()), false);
+    break;
+  }
+  case TargetOpcode::COPY:
+    assert(DstOps.size() == 1 && "Invalid Dst");
+    assert(SrcOps.size() == 1 && "Invalid Srcs");
+    assert(DstOps[0].getLLTTy(*getMRI()) == LLT() ||
+           SrcOps[0].getLLTTy(*getMRI()) == LLT() ||
+           DstOps[0].getLLTTy(*getMRI()) == SrcOps[0].getLLTTy(*getMRI()));
+    break;
+  case TargetOpcode::G_FCMP:
+  case TargetOpcode::G_ICMP: {
+    assert(DstOps.size() == 1 && "Invalid Dst Operands");
+    assert(SrcOps.size() == 3 && "Invalid Src Operands");
+    // For F/ICMP, the first src operand is the predicate, followed by
+    // the two comparands.
+    assert(SrcOps[0].getSrcOpKind() == SrcOp::SrcType::Ty_Predicate &&
+           "Expecting predicate");
+    assert([&]() -> bool {
+      CmpInst::Predicate Pred = SrcOps[0].getPredicate();
+      return Opc == TargetOpcode::G_ICMP ? CmpInst::isIntPredicate(Pred)
+                                         : CmpInst::isFPPredicate(Pred);
+    }() && "Invalid predicate");
+    assert(SrcOps[1].getLLTTy(*getMRI()) == SrcOps[2].getLLTTy(*getMRI()) &&
+           "Type mismatch");
+    assert([&]() -> bool {
+      LLT Op0Ty = SrcOps[1].getLLTTy(*getMRI());
+      LLT DstTy = DstOps[0].getLLTTy(*getMRI());
+      if (Op0Ty.isScalar() || Op0Ty.isPointer())
+        return DstTy.isScalar();
+      else
+        return DstTy.isVector() &&
+               DstTy.getNumElements() == Op0Ty.getNumElements();
+    }() && "Type Mismatch");
+    break;
+  }
+  case TargetOpcode::G_UNMERGE_VALUES: {
+    assert(!DstOps.empty() && "Invalid trivial sequence");
+    assert(SrcOps.size() == 1 && "Invalid src for Unmerge");
+    assert(std::all_of(DstOps.begin(), DstOps.end(),
+                       [&, this](const DstOp &Op) {
+                         return Op.getLLTTy(*getMRI()) ==
+                                DstOps[0].getLLTTy(*getMRI());
+                       }) &&
+           "type mismatch in output list");
+    assert(DstOps.size() * DstOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+               SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() &&
+           "input operands do not cover output register");
+    break;
+  }
+  case TargetOpcode::G_MERGE_VALUES: {
+    assert(!SrcOps.empty() && "invalid trivial sequence");
+    assert(DstOps.size() == 1 && "Invalid Dst");
+    assert(std::all_of(SrcOps.begin(), SrcOps.end(),
+                       [&, this](const SrcOp &Op) {
+                         return Op.getLLTTy(*getMRI()) ==
+                                SrcOps[0].getLLTTy(*getMRI());
+                       }) &&
+           "type mismatch in input list");
+    assert(SrcOps.size() * SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+               DstOps[0].getLLTTy(*getMRI()).getSizeInBits() &&
+           "input operands do not cover output register");
+    if (SrcOps.size() == 1)
+      return buildCast(DstOps[0], SrcOps[0]);
+    if (DstOps[0].getLLTTy(*getMRI()).isVector())
+      return buildInstr(TargetOpcode::G_CONCAT_VECTORS, DstOps, SrcOps);
+    break;
+  }
+  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+    assert(DstOps.size() == 1 && "Invalid Dst size");
+    assert(SrcOps.size() == 2 && "Invalid Src size");
+    assert(SrcOps[0].getLLTTy(*getMRI()).isVector() && "Invalid operand type");
+    assert((DstOps[0].getLLTTy(*getMRI()).isScalar() ||
+            DstOps[0].getLLTTy(*getMRI()).isPointer()) &&
+           "Invalid operand type");
+    assert(SrcOps[1].getLLTTy(*getMRI()).isScalar() && "Invalid operand type");
+    assert(SrcOps[0].getLLTTy(*getMRI()).getElementType() ==
+               DstOps[0].getLLTTy(*getMRI()) &&
+           "Type mismatch");
+    break;
+  }
+  case TargetOpcode::G_INSERT_VECTOR_ELT: {
+    assert(DstOps.size() == 1 && "Invalid dst size");
+    assert(SrcOps.size() == 3 && "Invalid src size");
+    assert(DstOps[0].getLLTTy(*getMRI()).isVector() &&
+           SrcOps[0].getLLTTy(*getMRI()).isVector() && "Invalid operand type");
+    assert(DstOps[0].getLLTTy(*getMRI()).getElementType() ==
+               SrcOps[1].getLLTTy(*getMRI()) &&
+           "Type mismatch");
+    assert(SrcOps[2].getLLTTy(*getMRI()).isScalar() && "Invalid index");
+    assert(DstOps[0].getLLTTy(*getMRI()).getNumElements() ==
+               SrcOps[0].getLLTTy(*getMRI()).getNumElements() &&
+           "Type mismatch");
+    break;
+  }
+  case TargetOpcode::G_BUILD_VECTOR: {
+    assert((!SrcOps.empty() || SrcOps.size() < 2) &&
+           "Must have at least 2 operands");
+    assert(DstOps.size() == 1 && "Invalid DstOps");
+    assert(DstOps[0].getLLTTy(*getMRI()).isVector() &&
+           "Res type must be a vector");
+    assert(std::all_of(SrcOps.begin(), SrcOps.end(),
+                       [&, this](const SrcOp &Op) {
+                         return Op.getLLTTy(*getMRI()) ==
+                                SrcOps[0].getLLTTy(*getMRI());
+                       }) &&
+           "type mismatch in input list");
+    assert(SrcOps.size() * SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+               DstOps[0].getLLTTy(*getMRI()).getSizeInBits() &&
+           "input scalars do not exactly cover the outpur vector register");
+    break;
+  }
+  case TargetOpcode::G_BUILD_VECTOR_TRUNC: {
+    assert((!SrcOps.empty() || SrcOps.size() < 2) &&
+           "Must have at least 2 operands");
+    assert(DstOps.size() == 1 && "Invalid DstOps");
+    assert(DstOps[0].getLLTTy(*getMRI()).isVector() &&
+           "Res type must be a vector");
+    assert(std::all_of(SrcOps.begin(), SrcOps.end(),
+                       [&, this](const SrcOp &Op) {
+                         return Op.getLLTTy(*getMRI()) ==
+                                SrcOps[0].getLLTTy(*getMRI());
+                       }) &&
+           "type mismatch in input list");
+    if (SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+        DstOps[0].getLLTTy(*getMRI()).getElementType().getSizeInBits())
+      return buildInstr(TargetOpcode::G_BUILD_VECTOR, DstOps, SrcOps);
+    break;
+  }
+  case TargetOpcode::G_CONCAT_VECTORS: {
+    assert(DstOps.size() == 1 && "Invalid DstOps");
+    assert((!SrcOps.empty() || SrcOps.size() < 2) &&
+           "Must have at least 2 operands");
+    assert(std::all_of(SrcOps.begin(), SrcOps.end(),
+                       [&, this](const SrcOp &Op) {
+                         return (Op.getLLTTy(*getMRI()).isVector() &&
+                                 Op.getLLTTy(*getMRI()) ==
+                                     SrcOps[0].getLLTTy(*getMRI()));
+                       }) &&
+           "type mismatch in input list");
+    assert(SrcOps.size() * SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+               DstOps[0].getLLTTy(*getMRI()).getSizeInBits() &&
+           "input vectors do not exactly cover the outpur vector register");
+    break;
+  }
+  case TargetOpcode::G_UADDE: {
+    assert(DstOps.size() == 2 && "Invalid no of dst operands");
+    assert(SrcOps.size() == 3 && "Invalid no of src operands");
+    assert(DstOps[0].getLLTTy(*getMRI()).isScalar() && "Invalid operand");
+    assert((DstOps[0].getLLTTy(*getMRI()) == SrcOps[0].getLLTTy(*getMRI())) &&
+           (DstOps[0].getLLTTy(*getMRI()) == SrcOps[1].getLLTTy(*getMRI())) &&
+           "Invalid operand");
+    assert(DstOps[1].getLLTTy(*getMRI()).isScalar() && "Invalid operand");
+    assert(DstOps[1].getLLTTy(*getMRI()) == SrcOps[2].getLLTTy(*getMRI()) &&
+           "type mismatch");
+    break;
+  }
+  }
+
+  auto MIB = buildInstr(Opc);
+  for (const DstOp &Op : DstOps)
+    Op.addDefToMIB(*getMRI(), MIB);
+  for (const SrcOp &Op : SrcOps)
+    Op.addSrcToMIB(MIB);
+  if (Flags)
+    MIB->setFlags(*Flags);
+  return MIB;
+}
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 6bb48dc2e8aac8cef90628f54a512bd99bf7fdb1..98df7d34857e3bbb978a2484f62a36925b5c0226 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -116,7 +116,7 @@ bool RegBankSelect::assignmentMatch(
   OnlyAssign = false;
   // Each part of a break down needs to end up in a different register.
   // In other word, Reg assignement does not match.
-  if (ValMapping.NumBreakDowns > 1)
+  if (ValMapping.NumBreakDowns != 1)
     return false;
 
   const RegisterBank *CurRegBank = RBI->getRegBank(Reg, *MRI, *TRI);
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index deb49a1ea4826acaacf55a12f05c56003d787258..f411ee6745d0ed6f23326ea902a0d595de5fe6f4 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -360,10 +360,10 @@ ImplicitNullChecks::SuitabilityResult
 ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
                                        ArrayRef<MachineInstr *> PrevInsts) {
   int64_t Offset;
-  unsigned BaseReg;
+  MachineOperand *BaseOp;
 
-  if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) ||
-      BaseReg != PointerReg)
+  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) ||
+      !BaseOp->isReg() || BaseOp->getReg() != PointerReg)
     return SR_Unsuitable;
 
   // We want the mem access to be issued at a sane offset from PointerReg,
diff --git a/lib/CodeGen/InterleavedLoadCombinePass.cpp b/lib/CodeGen/InterleavedLoadCombinePass.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..989fa164ad2dd2ef3887e1353f8647019b909aac
--- /dev/null
+++ b/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -0,0 +1,1359 @@
+//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+//
+// This file defines the interleaved-load-combine pass. The pass searches for
+// ShuffleVectorInstruction that execute interleaving loads. If a matching
+// pattern is found, it adds a combined load and further instructions in a
+// pattern that is detectable by InterleavedAccesPass. The old instructions are
+// left dead to be removed later. The pass is specifically designed to be
+// executed just before InterleavedAccesPass to find any left-over instances
+// that are not detected within former passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <algorithm>
+#include <cassert>
+#include <list>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "interleaved-load-combine"
+
+namespace {
+
+/// Statistic counter
+STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
+
+/// Option to disable the pass
+static cl::opt<bool> DisableInterleavedLoadCombine(
+    "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
+    cl::desc("Disable combining of interleaved loads"));
+
+struct VectorInfo;
+
+struct InterleavedLoadCombineImpl {
+public:
+  InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
+                             TargetMachine &TM)
+      : F(F), DT(DT), MSSA(MSSA),
+        TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
+        TTI(TM.getTargetTransformInfo(F)) {}
+
+  /// Scan the function for interleaved load candidates and execute the
+  /// replacement if applicable.
+  bool run();
+
+private:
+  /// Function this pass is working on
+  Function &F;
+
+  /// Dominator Tree Analysis
+  DominatorTree &DT;
+
+  /// Memory Alias Analyses
+  MemorySSA &MSSA;
+
+  /// Target Lowering Information
+  const TargetLowering &TLI;
+
+  /// Target Transform Information
+  const TargetTransformInfo TTI;
+
+  /// Find the instruction in sets LIs that dominates all others, return nullptr
+  /// if there is none.
+  LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);
+
+  /// Replace interleaved load candidates. It does additional
+  /// analyses if this makes sense. Returns true on success and false
+  /// of nothing has been changed.
+  bool combine(std::list<VectorInfo> &InterleavedLoad,
+               OptimizationRemarkEmitter &ORE);
+
+  /// Given a set of VectorInfo containing candidates for a given interleave
+  /// factor, find a set that represents a 'factor' interleaved load.
+  bool findPattern(std::list<VectorInfo> &Candidates,
+                   std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
+                   const DataLayout &DL);
+}; // InterleavedLoadCombine
+
+/// First Order Polynomial on an n-Bit Integer Value
+///
+/// Polynomial(Value) = Value * B + A + E*2^(n-e)
+///
+/// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
+/// significant bits. It is introduced if an exact computation cannot be proven
+/// (e.q. division by 2).
+///
+/// As part of this optimization multiple loads will be combined. It necessary
+/// to prove that loads are within some relative offset to each other. This
+/// class is used to prove relative offsets of values loaded from memory.
+///
+/// Representing an integer in this form is sound since addition in two's
+/// complement is associative (trivial) and multiplication distributes over the
+/// addition (see Proof(1) in Polynomial::mul). Further, both operations
+/// commute.
+//
+// Example:
+// declare @fn(i64 %IDX, <4 x float>* %PTR) {
+//   %Pa1 = add i64 %IDX, 2
+//   %Pa2 = lshr i64 %Pa1, 1
+//   %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
+//   %Va = load <4 x float>, <4 x float>* %Pa3
+//
+//   %Pb1 = add i64 %IDX, 4
+//   %Pb2 = lshr i64 %Pb1, 1
+//   %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
+//   %Vb = load <4 x float>, <4 x float>* %Pb3
+// ... }
+//
+// The goal is to prove that two loads load consecutive addresses.
+//
+// In this case the polynomials are constructed by the following
+// steps.
+//
+// The number tag #e specifies the error bits.
+//
+// Pa_0 = %IDX              #0
+// Pa_1 = %IDX + 2          #0 | add 2
+// Pa_2 = %IDX/2 + 1        #1 | lshr 1
+// Pa_3 = %IDX/2 + 1        #1 | GEP, step signext to i64
+// Pa_4 = (%IDX/2)*16 + 16  #0 | GEP, multiply index by sizeof(4) for floats
+// Pa_5 = (%IDX/2)*16 + 16  #0 | GEP, add offset of leading components
+//
+// Pb_0 = %IDX              #0
+// Pb_1 = %IDX + 4          #0 | add 2
+// Pb_2 = %IDX/2 + 2        #1 | lshr 1
+// Pb_3 = %IDX/2 + 2        #1 | GEP, step signext to i64
+// Pb_4 = (%IDX/2)*16 + 32  #0 | GEP, multiply index by sizeof(4) for floats
+// Pb_5 = (%IDX/2)*16 + 16  #0 | GEP, add offset of leading components
+//
+// Pb_5 - Pa_5 = 16         #0 | subtract to get the offset
+//
+// Remark: %PTR is not maintained within this class. So in this instance the
+// offset of 16 can only be assumed if the pointers are equal.
+//
+class Polynomial {
+  /// Operations on B
+  enum BOps {
+    LShr,
+    Mul,
+    SExt,
+    Trunc,
+  };
+
+  /// Number of Error Bits e
+  unsigned ErrorMSBs;
+
+  /// Value
+  Value *V;
+
+  /// Coefficient B
+  SmallVector<std::pair<BOps, APInt>, 4> B;
+
+  /// Coefficient A
+  APInt A;
+
+public:
+  Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() {
+    IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
+    if (Ty) {
+      ErrorMSBs = 0;
+      this->V = V;
+      A = APInt(Ty->getBitWidth(), 0);
+    }
+  }
+
+  Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
+      : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {}
+
+  Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
+      : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {}
+
+  Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {}
+
+  /// Increment and clamp the number of undefined bits.
+  void incErrorMSBs(unsigned amt) {
+    if (ErrorMSBs == (unsigned)-1)
+      return;
+
+    ErrorMSBs += amt;
+    if (ErrorMSBs > A.getBitWidth())
+      ErrorMSBs = A.getBitWidth();
+  }
+
+  /// Decrement and clamp the number of undefined bits.
+  void decErrorMSBs(unsigned amt) {
+    if (ErrorMSBs == (unsigned)-1)
+      return;
+
+    if (ErrorMSBs > amt)
+      ErrorMSBs -= amt;
+    else
+      ErrorMSBs = 0;
+  }
+
+  /// Apply an add on the polynomial
+  Polynomial &add(const APInt &C) {
+    // Note: Addition is associative in two's complement even when in case of
+    // signed overflow.
+    //
+    // Error bits can only propagate into higher significant bits. As these are
+    // already regarded as undefined, there is no change.
+    //
+    // Theorem: Adding a constant to a polynomial does not change the error
+    // term.
+    //
+    // Proof:
+    //
+    //   Since the addition is associative and commutes:
+    //
+    //   (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e)
+    // [qed]
+
+    if (C.getBitWidth() != A.getBitWidth()) {
+      ErrorMSBs = (unsigned)-1;
+      return *this;
+    }
+
+    A += C;
+    return *this;
+  }
+
+  /// Apply a multiplication onto the polynomial.
+  Polynomial &mul(const APInt &C) {
+    // Note: Multiplication distributes over the addition
+    //
+    // Theorem: Multiplication distributes over the addition
+    //
+    // Proof(1):
+    //
+    //   (B+A)*C =-
+    //        = (B + A) + (B + A) + .. {C Times}
+    //         addition is associative and commutes, hence
+    //        = B + B + .. {C Times} .. + A + A + .. {C times}
+    //        = B*C + A*C
+    //   (see (function add) for signed values and overflows)
+    // [qed]
+    //
+    // Theorem: If C has c trailing zeros, errors bits in A or B are shifted out
+    // to the left.
+    //
+    // Proof(2):
+    //
+    //   Let B' and A' be the n-Bit inputs with some unknown errors EA,
+    //   EB at e leading bits. B' and A' can be written down as:
+    //
+    //     B' = B + 2^(n-e)*EB
+    //     A' = A + 2^(n-e)*EA
+    //
+    //   Let C' be an input with c trailing zero bits. C' can be written as
+    //
+    //     C' = C*2^c
+    //
+    //   Therefore we can compute the result by using distributivity and
+    //   commutativity.
+    //
+    //     (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' =
+    //                     = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
+    //                     = (B'+A') * C' =
+    //                     = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
+    //                     = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' =
+    //                     = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' =
+    //                     = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c =
+    //                     = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c =
+    //
+    //   Let EC be the final error with EC = C*(EB + EA)
+    //
+    //                     = (B + A)*C' + EC*2^(n-e)*2^c =
+    //                     = (B + A)*C' + EC*2^(n-(e-c))
+    //
+    //   Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
+    //   less error bits than the input. c bits are shifted out to the left.
+    // [qed]
+
+    if (C.getBitWidth() != A.getBitWidth()) {
+      ErrorMSBs = (unsigned)-1;
+      return *this;
+    }
+
+    // Multiplying by one is a no-op.
+    if (C.isOneValue()) {
+      return *this;
+    }
+
+    // Multiplying by zero removes the coefficient B and defines all bits.
+    if (C.isNullValue()) {
+      ErrorMSBs = 0;
+      deleteB();
+    }
+
+    // See Proof(2): Trailing zero bits indicate a left shift. This removes
+    // leading bits from the result even if they are undefined.
+    decErrorMSBs(C.countTrailingZeros());
+
+    A *= C;
+    pushBOperation(Mul, C);
+    return *this;
+  }
+
+  /// Apply a logical shift right on the polynomial
+  Polynomial &lshr(const APInt &C) {
+    // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e')
+    //          where
+    //             e' = e + 1,
+    //             E is a e-bit number,
+    //             E' is a e'-bit number,
+    //   holds under the following precondition:
+    //          pre(1): A % 2 = 0
+    //          pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
+    //   where >> expresses a logical shift to the right, with adding zeros.
+    //
+    //  We need to show that for every, E there is a E'
+    //
+    //  B = b_h * 2^(n-1) + b_m * 2 + b_l
+    //  A = a_h * 2^(n-1) + a_m * 2         (pre(1))
+    //
+    //  where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
+    //
+    //  Let X = (B + A + E*2^(n-e)) >> 1
+    //  Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
+    //
+    //    X = [B + A + E*2^(n-e)] >> 1 =
+    //      = [  b_h * 2^(n-1) + b_m * 2 + b_l +
+    //         + a_h * 2^(n-1) + a_m * 2 +
+    //         + E * 2^(n-e) ] >> 1 =
+    //
+    //    The sum is built by putting the overflow of [a_m + b+n] into the term
+    //    2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
+    //    this bit is discarded. This is expressed by % 2.
+    //
+    //    The bit in position 0 cannot overflow into the term (b_m + a_m).
+    //
+    //      = [  ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
+    //         + ((b_m + a_m) % 2^(n-2)) * 2 +
+    //         + b_l + E * 2^(n-e) ] >> 1 =
+    //
+    //    The shift is computed by dividing the terms by 2 and by cutting off
+    //    b_l.
+    //
+    //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+    //         + ((b_m + a_m) % 2^(n-2)) +
+    //         + E * 2^(n-(e+1)) =
+    //
+    //    by the definition in the Theorem e+1 = e'
+    //
+    //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+    //         + ((b_m + a_m) % 2^(n-2)) +
+    //         + E * 2^(n-e') =
+    //
+    //    Compute Y by applying distributivity first
+    //
+    //    Y =  (B >> 1) + (A >> 1) + E*2^(n-e') =
+    //      =    (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
+    //         + (a_h * 2^(n-1) + a_m * 2) >> 1 +
+    //         + E * 2^(n-e) >> 1 =
+    //
+    //    Again, the shift is computed by dividing the terms by 2 and by cutting
+    //    off b_l.
+    //
+    //      =     b_h * 2^(n-2) + b_m +
+    //         +  a_h * 2^(n-2) + a_m +
+    //         +  E * 2^(n-(e+1)) =
+    //
+    //    Again, the sum is built by putting the overflow of [a_m + b+n] into
+    //    the term 2^(n-1). But this time there is room for a second bit in the
+    //    term 2^(n-2) we add this bit to a new term and denote it o_h in a
+    //    second step.
+    //
+    //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
+    //         + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+    //         + ((b_m + a_m) % 2^(n-2)) +
+    //         + E * 2^(n-(e+1)) =
+    //
+    //    Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
+    //    Further replace e+1 by e'.
+    //
+    //      =    o_h * 2^(n-1) +
+    //         + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+    //         + ((b_m + a_m) % 2^(n-2)) +
+    //         + E * 2^(n-e') =
+    //
+    //    Move o_h into the error term and construct E'. To ensure that there is
+    //    no 2^x with negative x, this step requires pre(2) (e < n).
+    //
+    //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+    //         + ((b_m + a_m) % 2^(n-2)) +
+    //         + o_h * 2^(e'-1) * 2^(n-e') +               | pre(2), move 2^(e'-1)
+    //                                                     | out of the old exponent
+    //         + E * 2^(n-e') =
+    //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+    //         + ((b_m + a_m) % 2^(n-2)) +
+    //         + [o_h * 2^(e'-1) + E] * 2^(n-e') +         | move 2^(e'-1) out of
+    //                                                     | the old exponent
+    //
+    //    Let E' = o_h * 2^(e'-1) + E
+    //
+    //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+    //         + ((b_m + a_m) % 2^(n-2)) +
+    //         + E' * 2^(n-e')
+    //
+    //    Because X and Y are distinct only in there error terms and E' can be
+    //    constructed as shown the theorem holds.
+    // [qed]
+    //
+    // For completeness in case of the case e=n it is also required to show that
+    // distributivity can be applied.
+    //
+    // In this case Theorem(1) transforms to (the pre-condition on A can also be
+    // dropped)
+    //
+    // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
+    //          where
+    //             A, B, E, E' are two's complement numbers with the same bit
+    //             width
+    //
+    //   Let A + B + E = X
+    //   Let (B >> 1) + (A >> 1) = Y
+    //
+    //   Therefore we need to show that for every X and Y there is an E' which
+    //   makes the equation
+    //
+    //     X = Y + E'
+    //
+    //   hold. This is trivially the case for E' = X - Y.
+    //
+    // [qed]
+    //
+    // Remark: Distributing lshr with and arbitrary number n can be expressed as
+    //   ((((B + A) lshr 1) lshr 1) ... ) {n times}.
+    // This construction induces n additional error bits at the left.
+
+    if (C.getBitWidth() != A.getBitWidth()) {
+      ErrorMSBs = (unsigned)-1;
+      return *this;
+    }
+
+    if (C.isNullValue())
+      return *this;
+
+    // Test if the result will be zero
+    unsigned shiftAmt = C.getZExtValue();
+    if (shiftAmt >= C.getBitWidth())
+      return mul(APInt(C.getBitWidth(), 0));
+
+    // The proof that shiftAmt LSBs are zero for at least one summand is only
+    // possible for the constant number.
+    //
+    // If this can be proven add shiftAmt to the error counter
+    // `ErrorMSBs`. Otherwise set all bits as undefined.
+    if (A.countTrailingZeros() < shiftAmt)
+      ErrorMSBs = A.getBitWidth();
+    else
+      incErrorMSBs(shiftAmt);
+
+    // Apply the operation.
+    pushBOperation(LShr, C);
+    A = A.lshr(shiftAmt);
+
+    return *this;
+  }
+
+  /// Apply a sign-extend or truncate operation on the polynomial.
+  Polynomial &sextOrTrunc(unsigned n) {
+    if (n < A.getBitWidth()) {
+      // Truncate: Clearly undefined Bits on the MSB side are removed
+      // if there are any.
+      decErrorMSBs(A.getBitWidth() - n);
+      A = A.trunc(n);
+      pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
+    }
+    if (n > A.getBitWidth()) {
+      // Extend: Clearly extending first and adding later is different
+      // to adding first and extending later in all extended bits.
+      incErrorMSBs(n - A.getBitWidth());
+      A = A.sext(n);
+      pushBOperation(SExt, APInt(sizeof(n) * 8, n));
+    }
+
+    return *this;
+  }
+
+  /// Test if there is a coefficient B.
+  bool isFirstOrder() const { return V != nullptr; }
+
+  /// Test coefficient B of two Polynomials are equal.
+  bool isCompatibleTo(const Polynomial &o) const {
+    // The polynomial use different bit width.
+    if (A.getBitWidth() != o.A.getBitWidth())
+      return false;
+
+    // If neither Polynomial has the Coefficient B.
+    if (!isFirstOrder() && !o.isFirstOrder())
+      return true;
+
+    // The index variable is different.
+    if (V != o.V)
+      return false;
+
+    // Check the operations.
+    if (B.size() != o.B.size())
+      return false;
+
+    auto ob = o.B.begin();
+    for (auto &b : B) {
+      if (b != *ob)
+        return false;
+      ob++;
+    }
+
+    return true;
+  }
+
+  /// Subtract two polynomials, return an undefined polynomial if
+  /// subtraction is not possible.
+  Polynomial operator-(const Polynomial &o) const {
+    // Return an undefined polynomial if incompatible.
+    if (!isCompatibleTo(o))
+      return Polynomial();
+
+    // If the polynomials are compatible (meaning they have the same
+    // coefficient on B), B is eliminated. Thus a polynomial solely
+    // containing A is returned
+    return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
+  }
+
+  /// Subtract a constant from a polynomial,
+  Polynomial operator-(uint64_t C) const {
+    Polynomial Result(*this);
+    Result.A -= C;
+    return Result;
+  }
+
+  /// Add a constant to a polynomial,
+  Polynomial operator+(uint64_t C) const {
+    Polynomial Result(*this);
+    Result.A += C;
+    return Result;
+  }
+
+  /// Returns true if it can be proven that two Polynomials are equal.
+  bool isProvenEqualTo(const Polynomial &o) {
+    // Subtract both polynomials and test if it is fully defined and zero.
+    Polynomial r = *this - o;
+    return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isNullValue());
+  }
+
+  /// Print the polynomial into a stream.
+  void print(raw_ostream &OS) const {
+    OS << "[{#ErrBits:" << ErrorMSBs << "} ";
+
+    if (V) {
+      for (auto b : B)
+        OS << "(";
+      OS << "(" << *V << ") ";
+
+      for (auto b : B) {
+        switch (b.first) {
+        case LShr:
+          OS << "LShr ";
+          break;
+        case Mul:
+          OS << "Mul ";
+          break;
+        case SExt:
+          OS << "SExt ";
+          break;
+        case Trunc:
+          OS << "Trunc ";
+          break;
+        }
+
+        OS << b.second << ") ";
+      }
+    }
+
+    OS << "+ " << A << "]";
+  }
+
+private:
+  void deleteB() {
+    V = nullptr;
+    B.clear();
+  }
+
+  void pushBOperation(const BOps Op, const APInt &C) {
+    if (isFirstOrder()) {
+      B.push_back(std::make_pair(Op, C));
+      return;
+    }
+  }
+};
+
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
+  S.print(OS);
+  return OS;
+}
+#endif
+
+/// VectorInfo stores abstract the following information for each vector
+/// element:
+///
+/// 1) The the memory address loaded into the element as Polynomial
+/// 2) a set of load instruction necessary to construct the vector,
+/// 3) a set of all other instructions that are necessary to create the vector and
+/// 4) a pointer value that can be used as relative base for all elements.
+struct VectorInfo {
+private:
+  VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
+    llvm_unreachable(
+        "Copying VectorInfo is neither implemented nor necessary,");
+  }
+
+public:
+  /// Information of a Vector Element
+  struct ElementInfo {
+    /// Offset Polynomial.
+    Polynomial Ofs;
+
+    /// The Load Instruction used to Load the entry. LI is null if the pointer
+    /// of the load instruction does not point on to the entry
+    LoadInst *LI;
+
+    ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
+        : Ofs(Offset), LI(LI) {}
+  };
+
+  /// Basic-block the load instructions are within
+  BasicBlock *BB;
+
+  /// Pointer value of all participation load instructions
+  Value *PV;
+
+  /// Participating load instructions
+  std::set<LoadInst *> LIs;
+
+  /// Participating instructions
+  std::set<Instruction *> Is;
+
+  /// Final shuffle-vector instruction
+  ShuffleVectorInst *SVI;
+
+  /// Information of the offset for each vector element
+  ElementInfo *EI;
+
+  /// Vector Type
+  VectorType *const VTy;
+
+  VectorInfo(VectorType *VTy)
+      : BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) {
+    EI = new ElementInfo[VTy->getNumElements()];
+  }
+
+  virtual ~VectorInfo() { delete[] EI; }
+
+  unsigned getDimension() const { return VTy->getNumElements(); }
+
+  /// Test if the VectorInfo can be part of an interleaved load with the
+  /// specified factor.
+  ///
+  /// \param Factor of the interleave
+  /// \param DL Targets Datalayout
+  ///
+  /// \returns true if this is possible and false if not
+  bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
+    unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
+    for (unsigned i = 1; i < getDimension(); i++) {
+      if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Recursively computes the vector information stored in V.
+  ///
+  /// This function delegates the work to specialized implementations
+  ///
+  /// \param V Value to operate on
+  /// \param Result Result of the computation
+  ///
+  /// \returns false if no sensible information can be gathered.
+  static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
+    ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
+    if (SVI)
+      return computeFromSVI(SVI, Result, DL);
+    LoadInst *LI = dyn_cast<LoadInst>(V);
+    if (LI)
+      return computeFromLI(LI, Result, DL);
+    BitCastInst *BCI = dyn_cast<BitCastInst>(V);
+    if (BCI)
+      return computeFromBCI(BCI, Result, DL);
+    return false;
+  }
+
+  /// BitCastInst specialization to compute the vector information.
+  ///
+  /// \param BCI BitCastInst to operate on
+  /// \param Result Result of the computation
+  ///
+  /// \returns false if no sensible information can be gathered.
+  static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
+                             const DataLayout &DL) {
+    Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
+
+    if (!Op)
+      return false;
+
+    VectorType *VTy = dyn_cast<VectorType>(Op->getType());
+    if (!VTy)
+      return false;
+
+    // We can only cast from large to smaller vectors
+    if (Result.VTy->getNumElements() % VTy->getNumElements())
+      return false;
+
+    unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
+    unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
+    unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());
+
+    if (NewSize * Factor != OldSize)
+      return false;
+
+    VectorInfo Old(VTy);
+    if (!compute(Op, Old, DL))
+      return false;
+
+    for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
+      for (unsigned j = 0; j < Factor; j++) {
+        Result.EI[i + j] =
+            ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
+                        j == 0 ? Old.EI[i / Factor].LI : nullptr);
+      }
+    }
+
+    Result.BB = Old.BB;
+    Result.PV = Old.PV;
+    Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
+    Result.Is.insert(Old.Is.begin(), Old.Is.end());
+    Result.Is.insert(BCI);
+    Result.SVI = nullptr;
+
+    return true;
+  }
+
+  /// ShuffleVectorInst specialization to compute vector information.
+  ///
+  /// \param SVI ShuffleVectorInst to operate on
+  /// \param Result Result of the computation
+  ///
+  /// Compute the left and the right side vector information and merge them by
+  /// applying the shuffle operation. This function also ensures that the left
+  /// and right side have compatible loads. This means that all loads are with
+  /// in the same basic block and are based on the same pointer.
+  ///
+  /// \returns false if no sensible information can be gathered.
+  static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
+                             const DataLayout &DL) {
+    VectorType *ArgTy = dyn_cast<VectorType>(SVI->getOperand(0)->getType());
+    assert(ArgTy && "ShuffleVector Operand is not a VectorType");
+
+    // Compute the left hand vector information.
+    VectorInfo LHS(ArgTy);
+    if (!compute(SVI->getOperand(0), LHS, DL))
+      LHS.BB = nullptr;
+
+    // Compute the right hand vector information.
+    VectorInfo RHS(ArgTy);
+    if (!compute(SVI->getOperand(1), RHS, DL))
+      RHS.BB = nullptr;
+
+    // Neither operand produced sensible results?
+    if (!LHS.BB && !RHS.BB)
+      return false;
+    // Only RHS produced sensible results?
+    else if (!LHS.BB) {
+      Result.BB = RHS.BB;
+      Result.PV = RHS.PV;
+    }
+    // Only LHS produced sensible results?
+    else if (!RHS.BB) {
+      Result.BB = LHS.BB;
+      Result.PV = LHS.PV;
+    }
+    // Both operands produced sensible results?
+    else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
+      Result.BB = LHS.BB;
+      Result.PV = LHS.PV;
+    }
+    // Both operands produced sensible results but they are incompatible.
+    else {
+      return false;
+    }
+
+    // Merge and apply the operation on the offset information.
+    if (LHS.BB) {
+      Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
+      Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
+    }
+    if (RHS.BB) {
+      Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
+      Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
+    }
+    Result.Is.insert(SVI);
+    Result.SVI = SVI;
+
+    int j = 0;
+    for (int i : SVI->getShuffleMask()) {
+      assert((i < 2 * (signed)ArgTy->getNumElements()) &&
+             "Invalid ShuffleVectorInst (index out of bounds)");
+
+      if (i < 0)
+        Result.EI[j] = ElementInfo();
+      else if (i < (signed)ArgTy->getNumElements()) {
+        if (LHS.BB)
+          Result.EI[j] = LHS.EI[i];
+        else
+          Result.EI[j] = ElementInfo();
+      } else {
+        if (RHS.BB)
+          Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
+        else
+          Result.EI[j] = ElementInfo();
+      }
+      j++;
+    }
+
+    return true;
+  }
+
+  /// LoadInst specialization to compute vector information.
+  ///
+  /// This function also acts as abort condition to the recursion.
+  ///
+  /// \param LI LoadInst to operate on
+  /// \param Result Result of the computation
+  ///
+  /// \returns false if no sensible information can be gathered.
+  static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
+                            const DataLayout &DL) {
+    Value *BasePtr;
+    Polynomial Offset;
+
+    if (LI->isVolatile())
+      return false;
+
+    if (LI->isAtomic())
+      return false;
+
+    // Get the base polynomial
+    computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
+
+    Result.BB = LI->getParent();
+    Result.PV = BasePtr;
+    Result.LIs.insert(LI);
+    Result.Is.insert(LI);
+
+    for (unsigned i = 0; i < Result.getDimension(); i++) {
+      Value *Idx[2] = {
+          ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
+          ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
+      };
+      int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, makeArrayRef(Idx, 2));
+      Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
+    }
+
+    return true;
+  }
+
+  /// Recursively compute polynomial of a value.
+  ///
+  /// \param BO Input binary operation
+  /// \param Result Result polynomial
+  static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
+    Value *LHS = BO.getOperand(0);
+    Value *RHS = BO.getOperand(1);
+
+    // Find the RHS Constant if any
+    ConstantInt *C = dyn_cast<ConstantInt>(RHS);
+    if ((!C) && BO.isCommutative()) {
+      C = dyn_cast<ConstantInt>(LHS);
+      if (C)
+        std::swap(LHS, RHS);
+    }
+
+    switch (BO.getOpcode()) {
+    case Instruction::Add:
+      if (!C)
+        break;
+
+      computePolynomial(*LHS, Result);
+      Result.add(C->getValue());
+      return;
+
+    case Instruction::LShr:
+      if (!C)
+        break;
+
+      computePolynomial(*LHS, Result);
+      Result.lshr(C->getValue());
+      return;
+
+    default:
+      break;
+    }
+
+    Result = Polynomial(&BO);
+  }
+
+  /// Recursively compute polynomial of a value
+  ///
+  /// \param V input value
+  /// \param Result result polynomial
+  static void computePolynomial(Value &V, Polynomial &Result) {
+    if (isa<BinaryOperator>(&V))
+      computePolynomialBinOp(*dyn_cast<BinaryOperator>(&V), Result);
+    else
+      Result = Polynomial(&V);
+  }
+
+  /// Compute the Polynomial representation of a Pointer type.
+  ///
+  /// \param Ptr input pointer value
+  /// \param Result result polynomial
+  /// \param BasePtr pointer the polynomial is based on
+  /// \param DL Datalayout of the target machine
+  static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
+                                           Value *&BasePtr,
+                                           const DataLayout &DL) {
+    // Not a pointer type? Return an undefined polynomial
+    PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
+    if (!PtrTy) {
+      Result = Polynomial();
+      BasePtr = nullptr;
+    }
+    unsigned PointerBits =
+        DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
+
+    /// Skip pointer casts. Return Zero polynomial otherwise
+    if (isa<CastInst>(&Ptr)) {
+      CastInst &CI = *cast<CastInst>(&Ptr);
+      switch (CI.getOpcode()) {
+      case Instruction::BitCast:
+        computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
+        break;
+      default:
+        BasePtr = &Ptr;
+        Polynomial(PointerBits, 0);
+        break;
+      }
+    }
+    /// Resolve GetElementPtrInst.
+    else if (isa<GetElementPtrInst>(&Ptr)) {
+      GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
+
+      APInt BaseOffset(PointerBits, 0);
+
+      // Check if we can compute the Offset with accumulateConstantOffset
+      if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
+        Result = Polynomial(BaseOffset);
+        BasePtr = GEP.getPointerOperand();
+        return;
+      } else {
+        // Otherwise we allow that the last index operand of the GEP is
+        // non-constant.
+        unsigned idxOperand, e;
+        SmallVector<Value *, 4> Indices;
+        for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
+             idxOperand++) {
+          ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
+          if (!IDX)
+            break;
+          Indices.push_back(IDX);
+        }
+
+        // It must also be the last operand.
+        if (idxOperand + 1 != e) {
+          Result = Polynomial();
+          BasePtr = nullptr;
+          return;
+        }
+
+        // Compute the polynomial of the index operand.
+        computePolynomial(*GEP.getOperand(idxOperand), Result);
+
+        // Compute base offset from zero based index, excluding the last
+        // variable operand.
+        BaseOffset =
+            DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
+
+        // Apply the operations of GEP to the polynomial.
+        unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
+        Result.sextOrTrunc(PointerBits);
+        Result.mul(APInt(PointerBits, ResultSize));
+        Result.add(BaseOffset);
+        BasePtr = GEP.getPointerOperand();
+      }
+    }
+    // All other instructions are handled by using the value as base pointer and
+    // a zero polynomial.
+    else {
+      BasePtr = &Ptr;
+      Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
+    }
+  }
+
+#ifndef NDEBUG
+  void print(raw_ostream &OS) const {
+    if (PV)
+      OS << *PV;
+    else
+      OS << "(none)";
+    OS << " + ";
+    for (unsigned i = 0; i < getDimension(); i++)
+      OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
+    OS << "]";
+  }
+#endif
+};
+
+} // anonymous namespace
+
+bool InterleavedLoadCombineImpl::findPattern(
+    std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
+    unsigned Factor, const DataLayout &DL) {
+  for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
+    unsigned i;
+    // Try to find an interleaved load using the front of Worklist as first line
+    unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());
+
+    // List containing iterators pointing to the VectorInfos of the candidates
+    std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
+
+    for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
+      if (C->VTy != C0->VTy)
+        continue;
+      if (C->BB != C0->BB)
+        continue;
+      if (C->PV != C0->PV)
+        continue;
+
+      // Check the current value matches any of factor - 1 remaining lines
+      for (i = 1; i < Factor; i++) {
+        if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
+          Res[i] = C;
+        }
+      }
+
+      for (i = 1; i < Factor; i++) {
+        if (Res[i] == Candidates.end())
+          break;
+      }
+      if (i == Factor) {
+        Res[0] = C0;
+        break;
+      }
+    }
+
+    if (Res[0] != Candidates.end()) {
+      // Move the result into the output
+      for (unsigned i = 0; i < Factor; i++) {
+        InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
+      }
+
+      return true;
+    }
+  }
+  return false;
+}
+
+LoadInst *
+InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
+  assert(!LIs.empty() && "No load instructions given.");
+
+  // All LIs are within the same BB. Select the first for a reference.
+  BasicBlock *BB = (*LIs.begin())->getParent();
+  BasicBlock::iterator FLI =
+      std::find_if(BB->begin(), BB->end(), [&LIs](Instruction &I) -> bool {
+        return is_contained(LIs, &I);
+      });
+  assert(FLI != BB->end());
+
+  return cast<LoadInst>(FLI);
+}
+
+bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
+                                         OptimizationRemarkEmitter &ORE) {
+  LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
+
+  // The insertion point is the LoadInst which loads the first values. The
+  // following tests are used to proof that the combined load can be inserted
+  // just before InsertionPoint.
+  LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
+
+  // Test if the offset is computed
+  if (!InsertionPoint)
+    return false;
+
+  std::set<LoadInst *> LIs;
+  std::set<Instruction *> Is;
+  std::set<Instruction *> SVIs;
+
+  unsigned InterleavedCost;
+  unsigned InstructionCost = 0;
+
+  // Get the interleave factor
+  unsigned Factor = InterleavedLoad.size();
+
+  // Merge all input sets used in analysis
+  for (auto &VI : InterleavedLoad) {
+    // Generate a set of all load instructions to be combined
+    LIs.insert(VI.LIs.begin(), VI.LIs.end());
+
+    // Generate a set of all instructions taking part in load
+    // interleaved. This list excludes the instructions necessary for the
+    // polynomial construction.
+    Is.insert(VI.Is.begin(), VI.Is.end());
+
+    // Generate the set of the final ShuffleVectorInst.
+    SVIs.insert(VI.SVI);
+  }
+
+  // There is nothing to combine.
+  if (LIs.size() < 2)
+    return false;
+
+  // Test if all participating instruction will be dead after the
+  // transformation. If intermediate results are used, no performance gain can
+  // be expected. Also sum the cost of the Instructions beeing left dead.
+  for (auto &I : Is) {
+    // Compute the old cost
+    InstructionCost +=
+        TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+
+    // The final SVIs are allowed not to be dead, all uses will be replaced
+    if (SVIs.find(I) != SVIs.end())
+      continue;
+
+    // If there are users outside the set to be eliminated, we abort the
+    // transformation. No gain can be expected.
+    for (const auto &U : I->users()) {
+      if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
+        return false;
+    }
+  }
+
+  // We know that all LoadInst are within the same BB. This guarantees that
+  // either everything or nothing is loaded.
+  LoadInst *First = findFirstLoad(LIs);
+
+  // To be safe that the loads can be combined, iterate over all loads and test
+  // that the corresponding defining access dominates first LI. This guarantees
+  // that there are no aliasing stores in between the loads.
+  auto FMA = MSSA.getMemoryAccess(First);
+  for (auto LI : LIs) {
+    auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
+    if (!MSSA.dominates(MADef, FMA))
+      return false;
+  }
+  assert(!LIs.empty() && "There are no LoadInst to combine");
+
+  // It is necessary that insertion point dominates all final ShuffleVectorInst.
+  for (auto &VI : InterleavedLoad) {
+    if (!DT.dominates(InsertionPoint, VI.SVI))
+      return false;
+  }
+
+  // All checks are done. Add instructions detectable by InterleavedAccessPass
+  // The old instruction will are left dead.
+  IRBuilder<> Builder(InsertionPoint);
+  Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
+  unsigned ElementsPerSVI =
+      InterleavedLoad.front().SVI->getType()->getNumElements();
+  VectorType *ILTy = VectorType::get(ETy, Factor * ElementsPerSVI);
+
+  SmallVector<unsigned, 4> Indices;
+  for (unsigned i = 0; i < Factor; i++)
+    Indices.push_back(i);
+  InterleavedCost = TTI.getInterleavedMemoryOpCost(
+      Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(),
+      InsertionPoint->getPointerAddressSpace());
+
+  if (InterleavedCost >= InstructionCost) {
+    return false;
+  }
+
+  // Create a pointer cast for the wide load.
+  auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0),
+                                      ILTy->getPointerTo(),
+                                      "interleaved.wide.ptrcast");
+
+  // Create the wide load and update the MemorySSA.
+  auto LI = Builder.CreateAlignedLoad(CI, InsertionPoint->getAlignment(),
+                                      "interleaved.wide.load");
+  auto MSSAU = MemorySSAUpdater(&MSSA);
+  MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
+      LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
+  MSSAU.insertUse(MSSALoad);
+
+  // Create the final SVIs and replace all uses.
+  int i = 0;
+  for (auto &VI : InterleavedLoad) {
+    SmallVector<uint32_t, 4> Mask;
+    for (unsigned j = 0; j < ElementsPerSVI; j++)
+      Mask.push_back(i + j * Factor);
+
+    Builder.SetInsertPoint(VI.SVI);
+    auto SVI = Builder.CreateShuffleVector(LI, UndefValue::get(LI->getType()),
+                                           Mask, "interleaved.shuffle");
+    VI.SVI->replaceAllUsesWith(SVI);
+    i++;
+  }
+
+  NumInterleavedLoadCombine++;
+  ORE.emit([&]() {
+    return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
+           << "Load interleaved combined with factor "
+           << ore::NV("Factor", Factor);
+  });
+
+  return true;
+}
+
+bool InterleavedLoadCombineImpl::run() {
+  OptimizationRemarkEmitter ORE(&F);
+  bool changed = false;
+  unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
+
+  auto &DL = F.getParent()->getDataLayout();
+
+  // Start with the highest factor to avoid combining and recombining.
+  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
+    std::list<VectorInfo> Candidates;
+
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : BB) {
+        if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
+
+          Candidates.emplace_back(SVI->getType());
+
+          if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
+            Candidates.pop_back();
+            continue;
+          }
+
+          if (!Candidates.back().isInterleaved(Factor, DL)) {
+            Candidates.pop_back();
+          }
+        }
+      }
+    }
+
+    std::list<VectorInfo> InterleavedLoad;
+    while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
+      if (combine(InterleavedLoad, ORE)) {
+        changed = true;
+      } else {
+        // Remove the first element of the Interleaved Load but put the others
+        // back on the list and continue searching
+        Candidates.splice(Candidates.begin(), InterleavedLoad,
+                          std::next(InterleavedLoad.begin()),
+                          InterleavedLoad.end());
+      }
+      InterleavedLoad.clear();
+    }
+  }
+
+  return changed;
+}
+
+namespace {
+/// This pass combines interleaved loads into a pattern detectable by
+/// InterleavedAccessPass.
+struct InterleavedLoadCombine : public FunctionPass {
+  static char ID;
+
+  InterleavedLoadCombine() : FunctionPass(ID) {
+    initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "Interleaved Load Combine Pass";
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (DisableInterleavedLoadCombine)
+      return false;
+
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC)
+      return false;
+
+    LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
+                      << "\n");
+
+    return InterleavedLoadCombineImpl(
+               F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+               getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+               TPC->getTM<TargetMachine>())
+        .run();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MemorySSAWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+};
+} // anonymous namespace
+
+char InterleavedLoadCombine::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+    InterleavedLoadCombine, DEBUG_TYPE,
+    "Combine interleaved loads into wide loads and shufflevector instructions",
+    false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(
+    InterleavedLoadCombine, DEBUG_TYPE,
+    "Combine interleaved loads into wide loads and shufflevector instructions",
+    false, false)
+
+FunctionPass *
+llvm::createInterleavedLoadCombinePass() {
+  auto P = new InterleavedLoadCombine();
+  return P;
+}
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 01779b1eb9b015e59a73abd61d2f1b98006ff68e..d0d889782a358e9c6e4c37eddd313ffabb238559 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -132,10 +132,10 @@ private:
   unsigned WasIndirect : 1;
 };
 
-/// LocMap - Map of where a user value is live, and its location.
+/// Map of where a user value is live, and its location.
 using LocMap = IntervalMap<SlotIndex, DbgValueLocation, 4>;
 
-/// SpillOffsetMap - Map of stack slot offsets for spilled locations.
+/// Map of stack slot offsets for spilled locations.
 /// Non-spilled locations are not added to the map.
 using SpillOffsetMap = DenseMap<unsigned, unsigned>;
 
@@ -143,7 +143,7 @@ namespace {
 
 class LDVImpl;
 
-/// UserValue - A user value is a part of a debug info user variable.
+/// A user value is a part of a debug info user variable.
 ///
 /// A DBG_VALUE instruction notes that (a sub-register of) a virtual register
 /// holds part of a user variable. The part is identified by a byte offset.
@@ -170,26 +170,26 @@ class UserValue {
   /// lexical scope.
   SmallSet<SlotIndex, 2> trimmedDefs;
 
-  /// insertDebugValue - Insert a DBG_VALUE into MBB at Idx for LocNo.
+  /// Insert a DBG_VALUE into MBB at Idx for LocNo.
   void insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
                         SlotIndex StopIdx, DbgValueLocation Loc, bool Spilled,
                         unsigned SpillOffset, LiveIntervals &LIS,
                         const TargetInstrInfo &TII,
                         const TargetRegisterInfo &TRI);
 
-  /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs
+  /// Replace OldLocNo ranges with NewRegs ranges where NewRegs
   /// is live. Returns true if any changes were made.
   bool splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
                      LiveIntervals &LIS);
 
 public:
-  /// UserValue - Create a new UserValue.
+  /// Create a new UserValue.
   UserValue(const DILocalVariable *var, const DIExpression *expr, DebugLoc L,
             LocMap::Allocator &alloc)
       : Variable(var), Expression(expr), dl(std::move(L)), leader(this),
         locInts(alloc) {}
 
-  /// getLeader - Get the leader of this value's equivalence class.
+  /// Get the leader of this value's equivalence class.
   UserValue *getLeader() {
     UserValue *l = leader;
     while (l != l->leader)
@@ -197,10 +197,10 @@ public:
     return leader = l;
   }
 
-  /// getNext - Return the next UserValue in the equivalence class.
+  /// Return the next UserValue in the equivalence class.
   UserValue *getNext() const { return next; }
 
-  /// match - Does this UserValue match the parameters?
+  /// Does this UserValue match the parameters?
   bool match(const DILocalVariable *Var, const DIExpression *Expr,
              const DILocation *IA) const {
     // FIXME: The fragment should be part of the equivalence class, but not
@@ -208,7 +208,7 @@ public:
     return Var == Variable && Expr == Expression && dl->getInlinedAt() == IA;
   }
 
-  /// merge - Merge equivalence classes.
+  /// Merge equivalence classes.
   static UserValue *merge(UserValue *L1, UserValue *L2) {
     L2 = L2->getLeader();
     if (!L1)
@@ -260,10 +260,10 @@ public:
     return locations.size() - 1;
   }
 
-  /// mapVirtRegs - Ensure that all virtual register locations are mapped.
+  /// Ensure that all virtual register locations are mapped.
   void mapVirtRegs(LDVImpl *LDV);
 
-  /// addDef - Add a definition point to this value.
+  /// Add a definition point to this value.
   void addDef(SlotIndex Idx, const MachineOperand &LocMO, bool IsIndirect) {
     DbgValueLocation Loc(getLocationNo(LocMO), IsIndirect);
     // Add a singular (Idx,Idx) -> Loc mapping.
@@ -275,66 +275,71 @@ public:
       I.setValue(Loc);
   }
 
-  /// extendDef - Extend the current definition as far as possible down.
+  /// Extend the current definition as far as possible down.
+  ///
   /// Stop when meeting an existing def or when leaving the live
-  /// range of VNI.
-  /// End points where VNI is no longer live are added to Kills.
-  /// @param Idx   Starting point for the definition.
-  /// @param Loc   Location number to propagate.
-  /// @param LR    Restrict liveness to where LR has the value VNI. May be null.
-  /// @param VNI   When LR is not null, this is the value to restrict to.
-  /// @param Kills Append end points of VNI's live range to Kills.
-  /// @param LIS   Live intervals analysis.
+  /// range of VNI. End points where VNI is no longer live are added to Kills.
+  ///
+  /// We only propagate DBG_VALUES locally here. LiveDebugValues performs a
+  /// data-flow analysis to propagate them beyond basic block boundaries.
+  ///
+  /// \param Idx Starting point for the definition.
+  /// \param Loc Location number to propagate.
+  /// \param LR Restrict liveness to where LR has the value VNI. May be null.
+  /// \param VNI When LR is not null, this is the value to restrict to.
+  /// \param [out] Kills Append end points of VNI's live range to Kills.
+  /// \param LIS Live intervals analysis.
   void extendDef(SlotIndex Idx, DbgValueLocation Loc,
                  LiveRange *LR, const VNInfo *VNI,
                  SmallVectorImpl<SlotIndex> *Kills,
                  LiveIntervals &LIS);
 
-  /// addDefsFromCopies - The value in LI/LocNo may be copies to other
-  /// registers. Determine if any of the copies are available at the kill
-  /// points, and add defs if possible.
-  /// @param LI      Scan for copies of the value in LI->reg.
-  /// @param LocNo   Location number of LI->reg.
-  /// @param WasIndirect Indicates if the original use of LI->reg was indirect
-  /// @param Kills   Points where the range of LocNo could be extended.
-  /// @param NewDefs Append (Idx, LocNo) of inserted defs here.
+  /// The value in LI/LocNo may be copies to other registers. Determine if
+  /// any of the copies are available at the kill points, and add defs if
+  /// possible.
+  ///
+  /// \param LI Scan for copies of the value in LI->reg.
+  /// \param LocNo Location number of LI->reg.
+  /// \param WasIndirect Indicates if the original use of LI->reg was indirect
+  /// \param Kills Points where the range of LocNo could be extended.
+  /// \param [in,out] NewDefs Append (Idx, LocNo) of inserted defs here.
   void addDefsFromCopies(
       LiveInterval *LI, unsigned LocNo, bool WasIndirect,
       const SmallVectorImpl<SlotIndex> &Kills,
       SmallVectorImpl<std::pair<SlotIndex, DbgValueLocation>> &NewDefs,
       MachineRegisterInfo &MRI, LiveIntervals &LIS);
 
-  /// computeIntervals - Compute the live intervals of all locations after
-  /// collecting all their def points.
+  /// Compute the live intervals of all locations after collecting all their
+  /// def points.
   void computeIntervals(MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                         LiveIntervals &LIS, LexicalScopes &LS);
 
-  /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
+  /// Replace OldReg ranges with NewRegs ranges where NewRegs is
   /// live. Returns true if any changes were made.
   bool splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
                      LiveIntervals &LIS);
 
-  /// rewriteLocations - Rewrite virtual register locations according to the
-  /// provided virtual register map. Record the stack slot offsets for the
-  /// locations that were spilled.
+  /// Rewrite virtual register locations according to the provided virtual
+  /// register map. Record the stack slot offsets for the locations that
+  /// were spilled.
   void rewriteLocations(VirtRegMap &VRM, const MachineFunction &MF,
                         const TargetInstrInfo &TII,
                         const TargetRegisterInfo &TRI,
                         SpillOffsetMap &SpillOffsets);
 
-  /// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
+  /// Recreate DBG_VALUE instruction from data structures.
   void emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
                        const TargetInstrInfo &TII,
                        const TargetRegisterInfo &TRI,
                        const SpillOffsetMap &SpillOffsets);
 
-  /// getDebugLoc - Return DebugLoc of this UserValue.
+  /// Return DebugLoc of this UserValue.
   DebugLoc getDebugLoc() { return dl;}
 
   void print(raw_ostream &, const TargetRegisterInfo *);
 };
 
-/// LDVImpl - Implementation of the LiveDebugVariables pass.
+/// Implementation of the LiveDebugVariables pass.
 class LDVImpl {
   LiveDebugVariables &pass;
   LocMap::Allocator allocator;
@@ -348,7 +353,7 @@ class LDVImpl {
   /// Whether the machine function is modified during the pass.
   bool ModifiedMF = false;
 
-  /// userValues - All allocated UserValue instances.
+  /// All allocated UserValue instances.
   SmallVector<std::unique_ptr<UserValue>, 8> userValues;
 
   /// Map virtual register to eq class leader.
@@ -359,27 +364,31 @@ class LDVImpl {
   using UVMap = DenseMap<const DILocalVariable *, UserValue *>;
   UVMap userVarMap;
 
-  /// getUserValue - Find or create a UserValue.
+  /// Find or create a UserValue.
   UserValue *getUserValue(const DILocalVariable *Var, const DIExpression *Expr,
                           const DebugLoc &DL);
 
-  /// lookupVirtReg - Find the EC leader for VirtReg or null.
+  /// Find the EC leader for VirtReg or null.
   UserValue *lookupVirtReg(unsigned VirtReg);
 
-  /// handleDebugValue - Add DBG_VALUE instruction to our maps.
-  /// @param MI  DBG_VALUE instruction
-  /// @param Idx Last valid SLotIndex before instruction.
-  /// @return    True if the DBG_VALUE instruction should be deleted.
+  /// Add DBG_VALUE instruction to our maps.
+  ///
+  /// \param MI DBG_VALUE instruction
+  /// \param Idx Last valid SLotIndex before instruction.
+  ///
+  /// \returns True if the DBG_VALUE instruction should be deleted.
   bool handleDebugValue(MachineInstr &MI, SlotIndex Idx);
 
-  /// collectDebugValues - Collect and erase all DBG_VALUE instructions, adding
-  /// a UserValue def for each instruction.
-  /// @param mf MachineFunction to be scanned.
-  /// @return True if any debug values were found.
+  /// Collect and erase all DBG_VALUE instructions, adding a UserValue def
+  /// for each instruction.
+  ///
+  /// \param mf MachineFunction to be scanned.
+  ///
+  /// \returns True if any debug values were found.
   bool collectDebugValues(MachineFunction &mf);
 
-  /// computeIntervals - Compute the live intervals of all user values after
-  /// collecting all their def points.
+  /// Compute the live intervals of all user values after collecting all
+  /// their def points.
   void computeIntervals();
 
 public:
@@ -387,7 +396,7 @@ public:
 
   bool runOnMachineFunction(MachineFunction &mf);
 
-  /// clear - Release all memory.
+  /// Release all memory.
   void clear() {
     MF = nullptr;
     userValues.clear();
@@ -400,13 +409,13 @@ public:
     ModifiedMF = false;
   }
 
-  /// mapVirtReg - Map virtual register to an equivalence class.
+  /// Map virtual register to an equivalence class.
   void mapVirtReg(unsigned VirtReg, UserValue *EC);
 
-  /// splitRegister -  Replace all references to OldReg with NewRegs.
+  /// Replace all references to OldReg with NewRegs.
   void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs);
 
-  /// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
+  /// Recreate DBG_VALUE instruction from data structures.
   void emitDebugValues(VirtRegMap *VRM);
 
   void print(raw_ostream&);
@@ -612,8 +621,6 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) {
   return Changed;
 }
 
-/// We only propagate DBG_VALUES locally here. LiveDebugValues performs a
-/// data-flow analysis to propagate them beyond basic block boundaries.
 void UserValue::extendDef(SlotIndex Idx, DbgValueLocation Loc, LiveRange *LR,
                           const VNInfo *VNI, SmallVectorImpl<SlotIndex> *Kills,
                           LiveIntervals &LIS) {
diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp
index bbd1dcdb744bce844e6e1755c91a73ff2a49504b..f17c23619ed58b7c7a5184c4e0bcbf2e8abe8e29 100644
--- a/lib/CodeGen/MIRCanonicalizerPass.cpp
+++ b/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -677,8 +677,7 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB,
 
   std::vector<MachineInstr *> Candidates = populateCandidates(MBB);
   std::vector<MachineInstr *> VisitedMIs;
-  std::copy(Candidates.begin(), Candidates.end(),
-            std::back_inserter(VisitedMIs));
+  llvm::copy(Candidates, std::back_inserter(VisitedMIs));
 
   std::vector<TypedVReg> VRegs;
   for (auto candidate : Candidates) {
diff --git a/lib/CodeGen/MIRParser/MILexer.cpp b/lib/CodeGen/MIRParser/MILexer.cpp
index f7cc94e34a59e7e151b136a856ea9c931bd98300..265877c2f5b42a073e36498714bb0a0f8c31cdab 100644
--- a/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/lib/CodeGen/MIRParser/MILexer.cpp
@@ -220,6 +220,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
       .Case("undefined", MIToken::kw_cfi_undefined)
       .Case("register", MIToken::kw_cfi_register)
       .Case("window_save", MIToken::kw_cfi_window_save)
+      .Case("negate_ra_sign_state", MIToken::kw_cfi_aarch64_negate_ra_sign_state)
       .Case("blockaddress", MIToken::kw_blockaddress)
       .Case("intrinsic", MIToken::kw_intrinsic)
       .Case("target-index", MIToken::kw_target_index)
@@ -576,6 +577,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) {
       .Case("!noalias", MIToken::md_noalias)
       .Case("!range", MIToken::md_range)
       .Case("!DIExpression", MIToken::md_diexpr)
+      .Case("!DILocation", MIToken::md_dilocation)
       .Default(MIToken::Error);
 }
 
diff --git a/lib/CodeGen/MIRParser/MILexer.h b/lib/CodeGen/MIRParser/MILexer.h
index dffa4f745444e03f4b4ca8f86fae7a0222dd5979..ceff79087d814c1b7691054eefff22a30fdccddb 100644
--- a/lib/CodeGen/MIRParser/MILexer.h
+++ b/lib/CodeGen/MIRParser/MILexer.h
@@ -89,6 +89,7 @@ struct MIToken {
     kw_cfi_restore_state,
     kw_cfi_undefined,
     kw_cfi_window_save,
+    kw_cfi_aarch64_negate_ra_sign_state,
     kw_blockaddress,
     kw_intrinsic,
     kw_target_index,
@@ -126,6 +127,7 @@ struct MIToken {
     md_noalias,
     md_range,
     md_diexpr,
+    md_dilocation,
 
     // Identifier tokens
     Identifier,
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index 1a6174bf9ee23a316ed4bd3670df27b6d806fa5c..6f2d8bb53ac82b848809447b49c8ef7d270ff6e6 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -226,6 +226,7 @@ public:
   bool parseMCSymbolOperand(MachineOperand &Dest);
   bool parseMDNode(MDNode *&Node);
   bool parseDIExpression(MDNode *&Expr);
+  bool parseDILocation(MDNode *&Expr);
   bool parseMetadataOperand(MachineOperand &Dest);
   bool parseCFIOffset(int &Offset);
   bool parseCFIRegister(unsigned &Reg);
@@ -776,11 +777,15 @@ bool MIParser::parse(MachineInstr *&MI) {
   DebugLoc DebugLocation;
   if (Token.is(MIToken::kw_debug_location)) {
     lex();
-    if (Token.isNot(MIToken::exclaim))
-      return error("expected a metadata node after 'debug-location'");
     MDNode *Node = nullptr;
-    if (parseMDNode(Node))
-      return true;
+    if (Token.is(MIToken::exclaim)) {
+      if (parseMDNode(Node))
+        return true;
+    } else if (Token.is(MIToken::md_dilocation)) {
+      if (parseDILocation(Node))
+        return true;
+    } else
+      return error("expected a metadata node after 'debug-location'");
     if (!isa<DILocation>(Node))
       return error("referenced metadata is not a DILocation");
     DebugLocation = DebugLoc(Node);
@@ -898,6 +903,9 @@ bool MIParser::parseStandaloneMDNode(MDNode *&Node) {
   } else if (Token.is(MIToken::md_diexpr)) {
     if (parseDIExpression(Node))
       return true;
+  } else if (Token.is(MIToken::md_dilocation)) {
+    if (parseDILocation(Node))
+      return true;
   } else
     return error("expected a metadata node");
   if (Token.isNot(MIToken::Eof))
@@ -1684,6 +1692,109 @@ bool MIParser::parseDIExpression(MDNode *&Expr) {
   return false;
 }
 
+bool MIParser::parseDILocation(MDNode *&Loc) {
+  assert(Token.is(MIToken::md_dilocation));
+  lex();
+
+  bool HaveLine = false;
+  unsigned Line = 0;
+  unsigned Column = 0;
+  MDNode *Scope = nullptr;
+  MDNode *InlinedAt = nullptr;
+  bool ImplicitCode = false;
+
+  if (expectAndConsume(MIToken::lparen))
+    return true;
+
+  if (Token.isNot(MIToken::rparen)) {
+    do {
+      if (Token.is(MIToken::Identifier)) {
+        if (Token.stringValue() == "line") {
+          lex();
+          if (expectAndConsume(MIToken::colon))
+            return true;
+          if (Token.isNot(MIToken::IntegerLiteral) ||
+              Token.integerValue().isSigned())
+            return error("expected unsigned integer");
+          Line = Token.integerValue().getZExtValue();
+          HaveLine = true;
+          lex();
+          continue;
+        }
+        if (Token.stringValue() == "column") {
+          lex();
+          if (expectAndConsume(MIToken::colon))
+            return true;
+          if (Token.isNot(MIToken::IntegerLiteral) ||
+              Token.integerValue().isSigned())
+            return error("expected unsigned integer");
+          Column = Token.integerValue().getZExtValue();
+          lex();
+          continue;
+        }
+        if (Token.stringValue() == "scope") {
+          lex();
+          if (expectAndConsume(MIToken::colon))
+            return true;
+          if (parseMDNode(Scope))
+            return error("expected metadata node");
+          if (!isa<DIScope>(Scope))
+            return error("expected DIScope node");
+          continue;
+        }
+        if (Token.stringValue() == "inlinedAt") {
+          lex();
+          if (expectAndConsume(MIToken::colon))
+            return true;
+          if (Token.is(MIToken::exclaim)) {
+            if (parseMDNode(InlinedAt))
+              return true;
+          } else if (Token.is(MIToken::md_dilocation)) {
+            if (parseDILocation(InlinedAt))
+              return true;
+          } else
+            return error("expected metadata node");
+          if (!isa<DILocation>(InlinedAt))
+            return error("expected DILocation node");
+          continue;
+        }
+        if (Token.stringValue() == "isImplicitCode") {
+          lex();
+          if (expectAndConsume(MIToken::colon))
+            return true;
+          if (!Token.is(MIToken::Identifier))
+            return error("expected true/false");
+          // As far as I can see, we don't have any existing need for parsing
+          // true/false in MIR yet. Do it ad-hoc until there's something else
+          // that needs it.
+          if (Token.stringValue() == "true")
+            ImplicitCode = true;
+          else if (Token.stringValue() == "false")
+            ImplicitCode = false;
+          else
+            return error("expected true/false");
+          lex();
+          continue;
+        }
+      }
+      return error(Twine("invalid DILocation argument '") +
+                   Token.stringValue() + "'");
+    } while (consumeIfPresent(MIToken::comma));
+  }
+
+  if (expectAndConsume(MIToken::rparen))
+    return true;
+
+  if (!HaveLine)
+    return error("DILocation requires line number");
+  if (!Scope)
+    return error("DILocation requires a scope");
+
+  Loc = DILocation::get(MF.getFunction().getContext(), Line, Column, Scope,
+                        InlinedAt, ImplicitCode);
+  return false;
+}
+
 bool MIParser::parseMetadataOperand(MachineOperand &Dest) {
   MDNode *Node = nullptr;
   if (Token.is(MIToken::exclaim)) {
@@ -1820,6 +1931,9 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) {
   case MIToken::kw_cfi_window_save:
     CFIIndex = MF.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
     break;
+  case MIToken::kw_cfi_aarch64_negate_ra_sign_state:
+    CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+    break;
   case MIToken::kw_cfi_escape: {
     std::string Values;
     if (parseCFIEscapeValues(Values))
@@ -2112,6 +2226,7 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,
   case MIToken::kw_cfi_restore_state:
   case MIToken::kw_cfi_undefined:
   case MIToken::kw_cfi_window_save:
+  case MIToken::kw_cfi_aarch64_negate_ra_sign_state:
     return parseCFIOperand(Dest);
   case MIToken::kw_blockaddress:
     return parseBlockAddressOperand(Dest);
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 8012946371531d5446b0e3b95259b2042a0385c9..d9dcc428943f19cb1a93cce21fd3979cf59543fe 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -401,18 +401,20 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF,
   for (const auto &CSInfo : MFI.getCalleeSavedInfo()) {
     yaml::StringValue Reg;
     printRegMIR(CSInfo.getReg(), Reg, TRI);
-    auto StackObjectInfo = StackObjectOperandMapping.find(CSInfo.getFrameIdx());
-    assert(StackObjectInfo != StackObjectOperandMapping.end() &&
-           "Invalid stack object index");
-    const FrameIndexOperand &StackObject = StackObjectInfo->second;
-    if (StackObject.IsFixed) {
-      YMF.FixedStackObjects[StackObject.ID].CalleeSavedRegister = Reg;
-      YMF.FixedStackObjects[StackObject.ID].CalleeSavedRestored =
-        CSInfo.isRestored();
-    } else {
-      YMF.StackObjects[StackObject.ID].CalleeSavedRegister = Reg;
-      YMF.StackObjects[StackObject.ID].CalleeSavedRestored =
-        CSInfo.isRestored();
+    if (!CSInfo.isSpilledToReg()) {
+      auto StackObjectInfo = StackObjectOperandMapping.find(CSInfo.getFrameIdx());
+      assert(StackObjectInfo != StackObjectOperandMapping.end() &&
+             "Invalid stack object index");
+      const FrameIndexOperand &StackObject = StackObjectInfo->second;
+      if (StackObject.IsFixed) {
+        YMF.FixedStackObjects[StackObject.ID].CalleeSavedRegister = Reg;
+        YMF.FixedStackObjects[StackObject.ID].CalleeSavedRestored =
+          CSInfo.isRestored();
+      } else {
+        YMF.StackObjects[StackObject.ID].CalleeSavedRegister = Reg;
+        YMF.StackObjects[StackObject.ID].CalleeSavedRestored =
+          CSInfo.isRestored();
+      }
     }
   }
   for (unsigned I = 0, E = MFI.getLocalFrameObjectCount(); I < E; ++I) {
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 0b9ee462d1ae7af7283d07e281b60c262a8e39be..03771bc5dae10471e0154bd0a7b461b8e865e83b 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -1380,24 +1380,21 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
 
   // Try searching forwards from Before, looking for reads or defs.
   const_iterator I(Before);
-  // If this is the last insn in the block, don't search forwards.
-  if (I != end()) {
-    for (++I; I != end() && N > 0; ++I) {
-      if (I->isDebugInstr())
-        continue;
+  for (; I != end() && N > 0; ++I) {
+    if (I->isDebugInstr())
+      continue;
 
-      --N;
+    --N;
 
-      MachineOperandIteratorBase::PhysRegInfo Info =
-          ConstMIOperands(*I).analyzePhysReg(Reg, TRI);
+    MachineOperandIteratorBase::PhysRegInfo Info =
+        ConstMIOperands(*I).analyzePhysReg(Reg, TRI);
 
-      // Register is live when we read it here.
-      if (Info.Read)
-        return LQR_Live;
-      // Register is dead if we can fully overwrite or clobber it here.
-      if (Info.FullyDefined || Info.Clobbered)
-        return LQR_Dead;
-    }
+    // Register is live when we read it here.
+    if (Info.Read)
+      return LQR_Live;
+    // Register is dead if we can fully overwrite or clobber it here.
+    if (Info.FullyDefined || Info.Clobbered)
+      return LQR_Dead;
   }
 
   // If we reached the end, it is safe to clobber Reg at the end of a block of
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 488481cec37c4cd16b3b59b4daff2ad4b0c1b164..3ded00b70fd43f3ac0405ea47c5c18d01f9d345b 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -435,7 +435,7 @@ MachineFunction::createMIExtraInfo(ArrayRef<MachineMemOperand *> MMOs,
 
 const char *MachineFunction::createExternalSymbolName(StringRef Name) {
   char *Dest = Allocator.Allocate<char>(Name.size() + 1);
-  std::copy(Name.begin(), Name.end(), Dest);
+  llvm::copy(Name, Dest);
   Dest[Name.size()] = 0;
   return Dest;
 }
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index f30290109b795b326f3c21ccb8843896974f9200..764a84c7e1327ed25c0b77d1b5f9da8e1339e9d4 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -933,9 +933,7 @@ int MachineInstr::findRegisterUseOperandIdx(
     unsigned MOReg = MO.getReg();
     if (!MOReg)
       continue;
-    if (MOReg == Reg || (TRI && TargetRegisterInfo::isPhysicalRegister(MOReg) &&
-                         TargetRegisterInfo::isPhysicalRegister(Reg) &&
-                         TRI->isSubRegister(MOReg, Reg)))
+    if (MOReg == Reg || (TRI && Reg && MOReg && TRI->regsOverlap(MOReg, Reg)))
       if (!isKill || MO.isKill())
         return i;
   }
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 2c9b5963537891fee054a40fed626ed0de2aa6aa..58fd1f23842060aebc302c18bc22af7c923b3176 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -463,8 +463,12 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI,
     for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
       if (PhysRegDefs.test(*AS))
         PhysRegClobbers.set(*AS);
-      PhysRegDefs.set(*AS);
     }
+    // Need a second loop because MCRegAliasIterator can visit the same
+    // register twice.
+    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
+      PhysRegDefs.set(*AS);
+
     if (PhysRegClobbers.test(Reg))
       // MI defined register is seen defined by another instruction in
       // the loop, it cannot be a LICM candidate.
diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp
index 4fe51f6624816bd24443d4884de662f8d891b2ee..05e51e1873cf90a65f1ef6b5da9706340f51e4bc 100644
--- a/lib/CodeGen/MachineOperand.cpp
+++ b/lib/CodeGen/MachineOperand.cpp
@@ -697,6 +697,11 @@ static void printCFI(raw_ostream &OS, const MCCFIInstruction &CFI,
     if (MCSymbol *Label = CFI.getLabel())
       MachineOperand::printSymbol(OS, *Label);
     break;
+  case MCCFIInstruction::OpNegateRAState:
+    OS << "negate_ra_sign_state ";
+    if (MCSymbol *Label = CFI.getLabel())
+      MachineOperand::printSymbol(OS, *Label);
+    break;
   default:
     // TODO: Print the other CFI Operations.
     OS << "<unserializable cfi directive>";
diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index c69bfecd8dcd414db1115147eb630f52eef640a0..ad96c0e579e43665c2b301009b8c0b6aad305665 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -519,18 +519,19 @@ public:
       RS = RepeatedSubstring();
       N = nullptr;
 
+      // Each leaf node represents a repeat of a string.
+      std::vector<SuffixTreeNode *> LeafChildren;
+
       // Continue visiting nodes until we find one which repeats more than once.
       while (!ToVisit.empty()) {
         SuffixTreeNode *Curr = ToVisit.back();
         ToVisit.pop_back();
+        LeafChildren.clear();
 
         // Keep track of the length of the string associated with the node. If
         // it's too short, we'll quit.
         unsigned Length = Curr->ConcatLen;
 
-        // Each leaf node represents a repeat of a string.
-        std::vector<SuffixTreeNode *> LeafChildren;
-
         // Iterate over each child, saving internal nodes for visiting, and
         // leaf nodes in LeafChildren. Internal nodes represent individual
         // strings, which may repeat.
@@ -621,9 +622,8 @@ struct InstructionMapper {
   DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>
       InstructionIntegerMap;
 
-  /// Corresponcence from unsigned integers to \p MachineInstrs.
-  /// Inverse of \p InstructionIntegerMap.
-  DenseMap<unsigned, MachineInstr *> IntegerInstructionMap;
+  /// Correspondence between \p MachineBasicBlocks and target-defined flags.
+  DenseMap<MachineBasicBlock *, unsigned> MBBFlagsMap;
 
   /// The vector of unsigned integers that the module is mapped to.
   std::vector<unsigned> UnsignedVec;
@@ -641,8 +641,7 @@ struct InstructionMapper {
   /// Maps \p *It to a legal integer.
   ///
   /// Updates \p CanOutlineWithPrevInstr, \p HaveLegalRange, \p InstrListForMBB,
-  /// \p UnsignedVecForMBB, \p InstructionIntegerMap, \p IntegerInstructionMap,
-  /// and \p LegalInstrNumber.
+  /// \p UnsignedVecForMBB, \p InstructionIntegerMap, and \p LegalInstrNumber.
   ///
   /// \returns The integer that \p *It was mapped to.
   unsigned mapToLegalUnsigned(
@@ -675,10 +674,8 @@ struct InstructionMapper {
     unsigned MINumber = ResultIt->second;
 
     // There was an insertion.
-    if (WasInserted) {
+    if (WasInserted)
       LegalInstrNumber++;
-      IntegerInstructionMap.insert(std::make_pair(MINumber, &MI));
-    }
 
     UnsignedVecForMBB.push_back(MINumber);
 
@@ -742,7 +739,15 @@ struct InstructionMapper {
   /// \param TII \p TargetInstrInfo for the function.
   void convertToUnsignedVec(MachineBasicBlock &MBB,
                             const TargetInstrInfo &TII) {
-    unsigned Flags = TII.getMachineOutlinerMBBFlags(MBB);
+    unsigned Flags = 0;
+
+    // Don't even map in this case.
+    if (!TII.isMBBSafeToOutlineFrom(MBB, Flags))
+      return;
+
+    // Store info for the MBB for later outlining.
+    MBBFlagsMap[&MBB] = Flags;
+
     MachineBasicBlock::iterator It = MBB.begin();
 
     // The number of instructions in this block that will be considered for
@@ -863,7 +868,8 @@ struct MachineOutliner : public ModulePass {
   /// Remark output explaining that a function was outlined.
   void emitOutlinedFunctionRemark(OutlinedFunction &OF);
 
-  /// Find all repeated substrings that satisfy the outlining cost model.
+  /// Find all repeated substrings that satisfy the outlining cost model by
+  /// constructing a suffix tree.
   ///
   /// If a substring appears at least twice, then it must be represented by
   /// an internal node which appears in at least two suffixes. Each suffix
@@ -872,75 +878,26 @@ struct MachineOutliner : public ModulePass {
   /// internal node represents a beneficial substring, then we use each of
   /// its leaf children to find the locations of its substring.
   ///
-  /// \param ST A suffix tree to query.
   /// \param Mapper Contains outlining mapping information.
-  /// \param[out] CandidateList Filled with candidates representing each
-  /// beneficial substring.
   /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions
   /// each type of candidate.
-  ///
-  /// \returns The length of the longest candidate found.
-  unsigned
-  findCandidates(SuffixTree &ST,
-                 InstructionMapper &Mapper,
-                 std::vector<std::shared_ptr<Candidate>> &CandidateList,
-                 std::vector<OutlinedFunction> &FunctionList);
-
-  /// Replace the sequences of instructions represented by the
-  /// \p Candidates in \p CandidateList with calls to \p MachineFunctions
-  /// described in \p FunctionList.
+  void findCandidates(InstructionMapper &Mapper,
+                      std::vector<OutlinedFunction> &FunctionList);
+
+  /// Replace the sequences of instructions represented by \p OutlinedFunctions
+  /// with calls to functions.
   ///
   /// \param M The module we are outlining from.
-  /// \param CandidateList A list of candidates to be outlined.
   /// \param FunctionList A list of functions to be inserted into the module.
   /// \param Mapper Contains the instruction mappings for the module.
-  bool outline(Module &M,
-               const ArrayRef<std::shared_ptr<Candidate>> &CandidateList,
-               std::vector<OutlinedFunction> &FunctionList,
+  bool outline(Module &M, std::vector<OutlinedFunction> &FunctionList,
                InstructionMapper &Mapper);
 
   /// Creates a function for \p OF and inserts it into the module.
-  MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+  MachineFunction *createOutlinedFunction(Module &M, OutlinedFunction &OF,
                                           InstructionMapper &Mapper,
                                           unsigned Name);
 
-  /// Find potential outlining candidates and store them in \p CandidateList.
-  ///
-  /// For each type of potential candidate, also build an \p OutlinedFunction
-  /// struct containing the information to build the function for that
-  /// candidate.
-  ///
-  /// \param[out] CandidateList Filled with outlining candidates for the module.
-  /// \param[out] FunctionList Filled with functions corresponding to each type
-  /// of \p Candidate.
-  /// \param ST The suffix tree for the module.
-  ///
-  /// \returns The length of the longest candidate found. 0 if there are none.
-  unsigned
-  buildCandidateList(std::vector<std::shared_ptr<Candidate>> &CandidateList,
-                     std::vector<OutlinedFunction> &FunctionList,
-                     SuffixTree &ST, InstructionMapper &Mapper);
-
-  /// Helper function for pruneOverlaps.
-  /// Removes \p C from the candidate list, and updates its \p OutlinedFunction.
-  void prune(Candidate &C, std::vector<OutlinedFunction> &FunctionList);
-
-  /// Remove any overlapping candidates that weren't handled by the
-  /// suffix tree's pruning method.
-  ///
-  /// Pruning from the suffix tree doesn't necessarily remove all overlaps.
-  /// If a short candidate is chosen for outlining, then a longer candidate
-  /// which has that short candidate as a suffix is chosen, the tree's pruning
-  /// method will not find it. Thus, we need to prune before outlining as well.
-  ///
-  /// \param[in,out] CandidateList A list of outlining candidates.
-  /// \param[in,out] FunctionList A list of functions to be outlined.
-  /// \param Mapper Contains instruction mapping info for outlining.
-  /// \param MaxCandidateLen The length of the longest candidate.
-  void pruneOverlaps(std::vector<std::shared_ptr<Candidate>> &CandidateList,
-                     std::vector<OutlinedFunction> &FunctionList,
-                     InstructionMapper &Mapper, unsigned MaxCandidateLen);
-
   /// Construct a suffix tree on the instructions in \p M and outline repeated
   /// strings from that tree.
   bool runOnModule(Module &M) override;
@@ -949,8 +906,8 @@ struct MachineOutliner : public ModulePass {
   /// function for remark emission.
   DISubprogram *getSubprogramOrNull(const OutlinedFunction &OF) {
     DISubprogram *SP;
-    for (const std::shared_ptr<Candidate> &C : OF.Candidates)
-      if (C && C->getMF() && (SP = C->getMF()->getFunction().getSubprogram()))
+    for (const Candidate &C : OF.Candidates)
+      if (C.getMF() && (SP = C.getMF()->getFunction().getSubprogram()))
         return SP;
     return nullptr;
   }
@@ -1030,7 +987,7 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
   MachineOptimizationRemark R(DEBUG_TYPE, "OutlinedFunction",
                               MBB->findDebugLoc(MBB->begin()), MBB);
   R << "Saved " << NV("OutliningBenefit", OF.getBenefit()) << " bytes by "
-    << "outlining " << NV("Length", OF.Sequence.size()) << " instructions "
+    << "outlining " << NV("Length", OF.getNumInstrs()) << " instructions "
     << "from " << NV("NumOccurrences", OF.getOccurrenceCount())
     << " locations. "
     << "(Found at: ";
@@ -1038,12 +995,8 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
   // Tell the user the other places the candidate was found.
   for (size_t i = 0, e = OF.Candidates.size(); i < e; i++) {
 
-    // Skip over things that were pruned.
-    if (!OF.Candidates[i]->InCandidateList)
-      continue;
-
     R << NV((Twine("StartLoc") + Twine(i)).str(),
-            OF.Candidates[i]->front()->getDebugLoc());
+            OF.Candidates[i].front()->getDebugLoc());
     if (i != e - 1)
       R << ", ";
   }
@@ -1053,19 +1006,18 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
   MORE.emit(R);
 }
 
-unsigned MachineOutliner::findCandidates(
-    SuffixTree &ST, InstructionMapper &Mapper,
-    std::vector<std::shared_ptr<Candidate>> &CandidateList,
-    std::vector<OutlinedFunction> &FunctionList) {
-  CandidateList.clear();
+void
+MachineOutliner::findCandidates(InstructionMapper &Mapper,
+                                std::vector<OutlinedFunction> &FunctionList) {
   FunctionList.clear();
-  unsigned MaxLen = 0;
+  SuffixTree ST(Mapper.UnsignedVec);
 
   // First, find dall of the repeated substrings in the tree of minimum length
   // 2.
+  std::vector<Candidate> CandidatesForRepeatedSeq;
   for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
+    CandidatesForRepeatedSeq.clear();
     SuffixTree::RepeatedSubstring RS = *It;
-    std::vector<Candidate> CandidatesForRepeatedSeq;
     unsigned StringLen = RS.Length;
     for (const unsigned &StartIdx : RS.StartIndices) {
       unsigned EndIdx = StartIdx + StringLen - 1;
@@ -1101,17 +1053,18 @@ unsigned MachineOutliner::findCandidates(
 
         MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
         MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
+        MachineBasicBlock *MBB = StartIt->getParent();
 
         CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt,
-                                              EndIt, StartIt->getParent(),
-                                              FunctionList.size());
+                                              EndIt, MBB, FunctionList.size(),
+                                              Mapper.MBBFlagsMap[MBB]);
       }
     }
 
     // We've found something we might want to outline.
     // Create an OutlinedFunction to store it and check if it'd be beneficial
     // to outline.
-    if (CandidatesForRepeatedSeq.empty())
+    if (CandidatesForRepeatedSeq.size() < 2)
       continue;
 
     // Arbitrarily choose a TII from the first candidate.
@@ -1122,169 +1075,23 @@ unsigned MachineOutliner::findCandidates(
     OutlinedFunction OF =
         TII->getOutliningCandidateInfo(CandidatesForRepeatedSeq);
 
-    // If we deleted every candidate, then there's nothing to outline.
-    if (OF.Candidates.empty())
+    // If we deleted too many candidates, then there's nothing worth outlining.
+    // FIXME: This should take target-specified instruction sizes into account.
+    if (OF.Candidates.size() < 2)
       continue;
 
-    std::vector<unsigned> Seq;
-    unsigned StartIdx = RS.StartIndices[0]; // Grab any start index.
-    for (unsigned i = StartIdx; i < StartIdx + StringLen; i++)
-      Seq.push_back(ST.Str[i]);
-    OF.Sequence = Seq;
-
     // Is it better to outline this candidate than not?
     if (OF.getBenefit() < 1) {
       emitNotOutliningCheaperRemark(StringLen, CandidatesForRepeatedSeq, OF);
       continue;
     }
 
-    if (StringLen > MaxLen)
-      MaxLen = StringLen;
-
-    // The function is beneficial. Save its candidates to the candidate list
-    // for pruning.
-    for (std::shared_ptr<Candidate> &C : OF.Candidates)
-      CandidateList.push_back(C);
     FunctionList.push_back(OF);
   }
-
-  return MaxLen;
-}
-
-// Remove C from the candidate space, and update its OutlinedFunction.
-void MachineOutliner::prune(Candidate &C,
-                            std::vector<OutlinedFunction> &FunctionList) {
-  // Get the OutlinedFunction associated with this Candidate.
-  OutlinedFunction &F = FunctionList[C.FunctionIdx];
-
-  // Update C's associated function's occurrence count.
-  F.decrement();
-
-  // Remove C from the CandidateList.
-  C.InCandidateList = false;
-
-  LLVM_DEBUG(dbgs() << "- Removed a Candidate \n";
-             dbgs() << "--- Num fns left for candidate: "
-                    << F.getOccurrenceCount() << "\n";
-             dbgs() << "--- Candidate's functions's benefit: " << F.getBenefit()
-                    << "\n";);
-}
-
-void MachineOutliner::pruneOverlaps(
-    std::vector<std::shared_ptr<Candidate>> &CandidateList,
-    std::vector<OutlinedFunction> &FunctionList, InstructionMapper &Mapper,
-    unsigned MaxCandidateLen) {
-
-  // Return true if this candidate became unbeneficial for outlining in a
-  // previous step.
-  auto ShouldSkipCandidate = [&FunctionList, this](Candidate &C) {
-
-    // Check if the candidate was removed in a previous step.
-    if (!C.InCandidateList)
-      return true;
-
-    // C must be alive. Check if we should remove it.
-    if (FunctionList[C.FunctionIdx].getBenefit() < 1) {
-      prune(C, FunctionList);
-      return true;
-    }
-
-    // C is in the list, and F is still beneficial.
-    return false;
-  };
-
-  // TODO: Experiment with interval trees or other interval-checking structures
-  // to lower the time complexity of this function.
-  // TODO: Can we do better than the simple greedy choice?
-  // Check for overlaps in the range.
-  // This is O(MaxCandidateLen * CandidateList.size()).
-  for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et;
-       It++) {
-    Candidate &C1 = **It;
-
-    // If C1 was already pruned, or its function is no longer beneficial for
-    // outlining, move to the next candidate.
-    if (ShouldSkipCandidate(C1))
-      continue;
-
-    // The minimum start index of any candidate that could overlap with this
-    // one.
-    unsigned FarthestPossibleIdx = 0;
-
-    // Either the index is 0, or it's at most MaxCandidateLen indices away.
-    if (C1.getStartIdx() > MaxCandidateLen)
-      FarthestPossibleIdx = C1.getStartIdx() - MaxCandidateLen;
-
-    // Compare against the candidates in the list that start at most
-    // FarthestPossibleIdx indices away from C1. There are at most
-    // MaxCandidateLen of these.
-    for (auto Sit = It + 1; Sit != Et; Sit++) {
-      Candidate &C2 = **Sit;
-
-      // Is this candidate too far away to overlap?
-      if (C2.getStartIdx() < FarthestPossibleIdx)
-        break;
-
-      // If C2 was already pruned, or its function is no longer beneficial for
-      // outlining, move to the next candidate.
-      if (ShouldSkipCandidate(C2))
-        continue;
-
-      // Do C1 and C2 overlap?
-      //
-      // Not overlapping:
-      // High indices... [C1End ... C1Start][C2End ... C2Start] ...Low indices
-      //
-      // We sorted our candidate list so C2Start <= C1Start. We know that
-      // C2End > C2Start since each candidate has length >= 2. Therefore, all we
-      // have to check is C2End < C2Start to see if we overlap.
-      if (C2.getEndIdx() < C1.getStartIdx())
-        continue;
-
-      // C1 and C2 overlap.
-      // We need to choose the better of the two.
-      //
-      // Approximate this by picking the one which would have saved us the
-      // most instructions before any pruning.
-
-      // Is C2 a better candidate?
-      if (C2.Benefit > C1.Benefit) {
-        // Yes, so prune C1. Since C1 is dead, we don't have to compare it
-        // against anything anymore, so break.
-        prune(C1, FunctionList);
-        break;
-      }
-
-      // Prune C2 and move on to the next candidate.
-      prune(C2, FunctionList);
-    }
-  }
-}
-
-unsigned MachineOutliner::buildCandidateList(
-    std::vector<std::shared_ptr<Candidate>> &CandidateList,
-    std::vector<OutlinedFunction> &FunctionList, SuffixTree &ST,
-    InstructionMapper &Mapper) {
-
-  std::vector<unsigned> CandidateSequence; // Current outlining candidate.
-  unsigned MaxCandidateLen = 0;            // Length of the longest candidate.
-
-  MaxCandidateLen =
-      findCandidates(ST, Mapper, CandidateList, FunctionList);
-
-  // Sort the candidates in decending order. This will simplify the outlining
-  // process when we have to remove the candidates from the mapping by
-  // allowing us to cut them out without keeping track of an offset.
-  std::stable_sort(
-      CandidateList.begin(), CandidateList.end(),
-      [](const std::shared_ptr<Candidate> &LHS,
-         const std::shared_ptr<Candidate> &RHS) { return *LHS < *RHS; });
-
-  return MaxCandidateLen;
 }
 
 MachineFunction *
-MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+MachineOutliner::createOutlinedFunction(Module &M, OutlinedFunction &OF,
                                         InstructionMapper &Mapper,
                                         unsigned Name) {
 
@@ -1319,7 +1126,8 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
   // function. This makes sure the outlined function knows what kinds of
   // instructions are going into it. This is fine, since all parent functions
   // must necessarily support the instructions that are in the outlined region.
-  const Function &ParentFn = OF.Candidates.front()->getMF()->getFunction();
+  Candidate &FirstCand = OF.Candidates.front();
+  const Function &ParentFn = FirstCand.getMF()->getFunction();
   if (ParentFn.hasFnAttribute("target-features"))
     F->addFnAttr(ParentFn.getFnAttribute("target-features"));
 
@@ -1336,11 +1144,9 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
   // Insert the new function into the module.
   MF.insert(MF.begin(), &MBB);
 
-  // Copy over the instructions for the function using the integer mappings in
-  // its sequence.
-  for (unsigned Str : OF.Sequence) {
-    MachineInstr *NewMI =
-        MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second);
+  for (auto I = FirstCand.front(), E = std::next(FirstCand.back()); I != E;
+       ++I) {
+    MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
     NewMI->dropMemRefs(MF);
 
     // Don't keep debug information for outlined instructions.
@@ -1372,9 +1178,10 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
         Unit /* File */,
         0 /* Line 0 is reserved for compiler-generated code. */,
         DB.createSubroutineType(DB.getOrCreateTypeArray(None)), /* void type */
-        false, true, 0, /* Line 0 is reserved for compiler-generated code. */
+        0, /* Line 0 is reserved for compiler-generated code. */
         DINode::DIFlags::FlagArtificial /* Compiler-generated code. */,
-        true /* Outlined code is optimized code by definition. */);
+        /* Outlined code is optimized code by definition. */
+        DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
 
     // Don't add any new variables to the subprogram.
     DB.finalizeSubprogram(OutlinedSP);
@@ -1388,87 +1195,100 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
   return &MF;
 }
 
-bool MachineOutliner::outline(
-    Module &M, const ArrayRef<std::shared_ptr<Candidate>> &CandidateList,
-    std::vector<OutlinedFunction> &FunctionList, InstructionMapper &Mapper) {
+bool MachineOutliner::outline(Module &M,
+                              std::vector<OutlinedFunction> &FunctionList,
+                              InstructionMapper &Mapper) {
 
   bool OutlinedSomething = false;
 
   // Number to append to the current outlined function.
   unsigned OutlinedFunctionNum = 0;
 
-  // Replace the candidates with calls to their respective outlined functions.
-  for (const std::shared_ptr<Candidate> &Cptr : CandidateList) {
-    Candidate &C = *Cptr;
-    // Was the candidate removed during pruneOverlaps?
-    if (!C.InCandidateList)
-      continue;
-
-    // If not, then look at its OutlinedFunction.
-    OutlinedFunction &OF = FunctionList[C.FunctionIdx];
+  // Sort by benefit. The most beneficial functions should be outlined first.
+  std::stable_sort(
+      FunctionList.begin(), FunctionList.end(),
+      [](const OutlinedFunction &LHS, const OutlinedFunction &RHS) {
+        return LHS.getBenefit() > RHS.getBenefit();
+      });
+
+  // Walk over each function, outlining them as we go along. Functions are
+  // outlined greedily, based off the sort above.
+  for (OutlinedFunction &OF : FunctionList) {
+    // If we outlined something that overlapped with a candidate in a previous
+    // step, then we can't outline from it.
+    erase_if(OF.Candidates, [&Mapper](Candidate &C) {
+      return std::any_of(
+          Mapper.UnsignedVec.begin() + C.getStartIdx(),
+          Mapper.UnsignedVec.begin() + C.getEndIdx() + 1,
+          [](unsigned I) { return (I == static_cast<unsigned>(-1)); });
+    });
 
-    // Was its OutlinedFunction made unbeneficial during pruneOverlaps?
+    // If we made it unbeneficial to outline this function, skip it.
     if (OF.getBenefit() < 1)
       continue;
 
-    // Does this candidate have a function yet?
-    if (!OF.MF) {
-      OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum);
-      emitOutlinedFunctionRemark(OF);
-      FunctionsCreated++;
-      OutlinedFunctionNum++; // Created a function, move to the next name.
-    }
-
+    // It's beneficial. Create the function and outline its sequence's
+    // occurrences.
+    OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum);
+    emitOutlinedFunctionRemark(OF);
+    FunctionsCreated++;
+    OutlinedFunctionNum++; // Created a function, move to the next name.
     MachineFunction *MF = OF.MF;
-    MachineBasicBlock &MBB = *C.getMBB();
-    MachineBasicBlock::iterator StartIt = C.front();
-    MachineBasicBlock::iterator EndIt = C.back();
-    assert(StartIt != C.getMBB()->end() && "StartIt out of bounds!");
-    assert(EndIt != C.getMBB()->end() && "EndIt out of bounds!");
-
     const TargetSubtargetInfo &STI = MF->getSubtarget();
     const TargetInstrInfo &TII = *STI.getInstrInfo();
 
-    // Insert a call to the new function and erase the old sequence.
-    auto CallInst = TII.insertOutlinedCall(M, MBB, StartIt, *OF.MF, C);
-
-    // If the caller tracks liveness, then we need to make sure that anything
-    // we outline doesn't break liveness assumptions.
-    // The outlined functions themselves currently don't track liveness, but
-    // we should make sure that the ranges we yank things out of aren't
-    // wrong.
-    if (MBB.getParent()->getProperties().hasProperty(
-            MachineFunctionProperties::Property::TracksLiveness)) {
-      // Helper lambda for adding implicit def operands to the call instruction.
-      auto CopyDefs = [&CallInst](MachineInstr &MI) {
-        for (MachineOperand &MOP : MI.operands()) {
-          // Skip over anything that isn't a register.
-          if (!MOP.isReg())
-            continue;
-
-          // If it's a def, add it to the call instruction.
-          if (MOP.isDef())
-            CallInst->addOperand(
-                MachineOperand::CreateReg(MOP.getReg(), true, /* isDef = true */
-                                          true /* isImp = true */));
-        }
-      };
+    // Replace occurrences of the sequence with calls to the new function.
+    for (Candidate &C : OF.Candidates) {
+      MachineBasicBlock &MBB = *C.getMBB();
+      MachineBasicBlock::iterator StartIt = C.front();
+      MachineBasicBlock::iterator EndIt = C.back();
+
+      // Insert the call.
+      auto CallInst = TII.insertOutlinedCall(M, MBB, StartIt, *MF, C);
+
+      // If the caller tracks liveness, then we need to make sure that
+      // anything we outline doesn't break liveness assumptions. The outlined
+      // functions themselves currently don't track liveness, but we should
+      // make sure that the ranges we yank things out of aren't wrong.
+      if (MBB.getParent()->getProperties().hasProperty(
+              MachineFunctionProperties::Property::TracksLiveness)) {
+        // Helper lambda for adding implicit def operands to the call
+        // instruction.
+        auto CopyDefs = [&CallInst](MachineInstr &MI) {
+          for (MachineOperand &MOP : MI.operands()) {
+            // Skip over anything that isn't a register.
+            if (!MOP.isReg())
+              continue;
+
+            // If it's a def, add it to the call instruction.
+            if (MOP.isDef())
+              CallInst->addOperand(MachineOperand::CreateReg(
+                  MOP.getReg(), true, /* isDef = true */
+                  true /* isImp = true */));
+          }
+        };
+        // Copy over the defs in the outlined range.
+        // First inst in outlined range <-- Anything that's defined in this
+        // ...                           .. range has to be added as an
+        // implicit Last inst in outlined range  <-- def to the call
+        // instruction.
+        std::for_each(CallInst, std::next(EndIt), CopyDefs);
+      }
 
-      // Copy over the defs in the outlined range.
-      // First inst in outlined range <-- Anything that's defined in this
-      // ...                           .. range has to be added as an implicit
-      // Last inst in outlined range  <-- def to the call instruction.
-      std::for_each(CallInst, std::next(EndIt), CopyDefs);
-    }
+      // Erase from the point after where the call was inserted up to, and
+      // including, the final instruction in the sequence.
+      // Erase needs one past the end, so we need std::next there too.
+      MBB.erase(std::next(StartIt), std::next(EndIt));
 
-    // Erase from the point after where the call was inserted up to, and
-    // including, the final instruction in the sequence.
-    // Erase needs one past the end, so we need std::next there too.
-    MBB.erase(std::next(StartIt), std::next(EndIt));
-    OutlinedSomething = true;
+      // Keep track of what we removed by marking them all as -1.
+      std::for_each(Mapper.UnsignedVec.begin() + C.getStartIdx(),
+                    Mapper.UnsignedVec.begin() + C.getEndIdx() + 1,
+                    [](unsigned &I) { I = static_cast<unsigned>(-1); });
+      OutlinedSomething = true;
 
-    // Statistics.
-    NumOutlined++;
+      // Statistics.
+      NumOutlined++;
+    }
   }
 
   LLVM_DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";);
@@ -1628,18 +1448,10 @@ bool MachineOutliner::runOnModule(Module &M) {
 
   // Prepare instruction mappings for the suffix tree.
   populateMapper(Mapper, M, MMI);
-
-  // Construct a suffix tree, use it to find candidates, and then outline them.
-  SuffixTree ST(Mapper.UnsignedVec);
-  std::vector<std::shared_ptr<Candidate>> CandidateList;
   std::vector<OutlinedFunction> FunctionList;
 
   // Find all of the outlining candidates.
-  unsigned MaxCandidateLen =
-      buildCandidateList(CandidateList, FunctionList, ST, Mapper);
-
-  // Remove candidates that overlap with other candidates.
-  pruneOverlaps(CandidateList, FunctionList, Mapper, MaxCandidateLen);
+  findCandidates(Mapper, FunctionList);
 
   // If we've requested size remarks, then collect the MI counts of every
   // function before outlining, and the MI counts after outlining.
@@ -1656,7 +1468,7 @@ bool MachineOutliner::runOnModule(Module &M) {
     initSizeRemarkInfo(M, MMI, FunctionToInstrCount);
 
   // Outline each of the candidates and return true if something was outlined.
-  bool OutlinedSomething = outline(M, CandidateList, FunctionList, Mapper);
+  bool OutlinedSomething = outline(M, FunctionList, Mapper);
 
   // If we outlined something, we definitely changed the MI count of the
   // module. If we've asked for size remarks, then output them.
diff --git a/lib/CodeGen/MachinePassRegistry.cpp b/lib/CodeGen/MachinePassRegistry.cpp
deleted file mode 100644
index 3ee3e40b27e236d5d984c7dc3ed4618555f7834e..0000000000000000000000000000000000000000
--- a/lib/CodeGen/MachinePassRegistry.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- CodeGen/MachineInstr.cpp ------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the machine function pass registry for register allocators
-// and instruction schedulers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/MachinePassRegistry.h"
-
-using namespace llvm;
-
-void MachinePassRegistryListener::anchor() { }
-
-/// setDefault - Set the default constructor by name.
-void MachinePassRegistry::setDefault(StringRef Name) {
-  MachinePassCtor Ctor = nullptr;
-  for(MachinePassRegistryNode *R = getList(); R; R = R->getNext()) {
-    if (R->getName() == Name) {
-      Ctor = R->getCtor();
-      break;
-    }
-  }
-  assert(Ctor && "Unregistered pass name");
-  setDefault(Ctor);
-}
-
-/// Add - Adds a function pass to the registration list.
-///
-void MachinePassRegistry::Add(MachinePassRegistryNode *Node) {
-  Node->setNext(List);
-  List = Node;
-  if (Listener) Listener->NotifyAdd(Node->getName(),
-                                    Node->getCtor(),
-                                    Node->getDescription());
-}
-
-
-/// Remove - Removes a function pass from the registration list.
-///
-void MachinePassRegistry::Remove(MachinePassRegistryNode *Node) {
-  for (MachinePassRegistryNode **I = &List; *I; I = (*I)->getNextAddress()) {
-    if (*I == Node) {
-      if (Listener) Listener->NotifyRemove(Node->getName());
-      *I = (*I)->getNext();
-      break;
-    }
-  }
-}
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index bb5fc664c5f593e9469459bfcbb4346633901cf1..de8c55585d3fdfdd37593c3b9c0918e9f4e1982d 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -1121,11 +1121,12 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
           // First, perform the cheaper check that compares the base register.
           // If they are the same and the load offset is less than the store
           // offset, then mark the dependence as loop carried potentially.
-          unsigned BaseReg1, BaseReg2;
+          MachineOperand *BaseOp1, *BaseOp2;
           int64_t Offset1, Offset2;
-          if (TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) &&
-              TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
-            if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
+          if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) &&
+              TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) {
+            if (BaseOp1->isIdenticalTo(*BaseOp2) &&
+                (int)Offset1 < (int)Offset2) {
               assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
                      "What happened to the chain edge?");
               SDep Dep(Load, SDep::Barrier);
@@ -3246,11 +3247,16 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
 /// during each iteration. Set Delta to the amount of the change.
 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  unsigned BaseReg;
+  MachineOperand *BaseOp;
   int64_t Offset;
-  if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
+  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
     return false;
 
+  if (!BaseOp->isReg())
+    return false;
+
+  unsigned BaseReg = BaseOp->getReg();
+
   MachineRegisterInfo &MRI = MF.getRegInfo();
   // Check if there is a Phi. If so, get the definition in the loop.
   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
@@ -3653,19 +3659,19 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
     return true;
 
-  unsigned BaseRegS, BaseRegD;
+  MachineOperand *BaseOpS, *BaseOpD;
   int64_t OffsetS, OffsetD;
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) ||
-      !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI))
+  if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) ||
+      !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, TRI))
     return true;
 
-  if (BaseRegS != BaseRegD)
+  if (!BaseOpS->isIdenticalTo(*BaseOpD))
     return true;
 
   // Check that the base register is incremented by a constant value for each
   // iteration.
-  MachineInstr *Def = MRI.getVRegDef(BaseRegS);
+  MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg());
   if (!Def || !Def->isPHI())
     return true;
   unsigned InitVal = 0;
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 9ab405f38d4d239716a8ea371311cf2fc6bb22a4..ec11586b08ead6fb3326cbcbe2449e03afa07865 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -41,6 +41,7 @@
 #include "llvm/CodeGen/ScheduleDFS.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -240,7 +241,8 @@ void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-MachinePassRegistry MachineSchedRegistry::Registry;
+MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor>
+    MachineSchedRegistry::Registry;
 
 /// A dummy default scheduler factory indicates whether the scheduler
 /// is overridden on the command line.
@@ -1482,15 +1484,40 @@ namespace {
 class BaseMemOpClusterMutation : public ScheduleDAGMutation {
   struct MemOpInfo {
     SUnit *SU;
-    unsigned BaseReg;
+    MachineOperand *BaseOp;
     int64_t Offset;
 
-    MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)
-        : SU(su), BaseReg(reg), Offset(ofs) {}
+    MemOpInfo(SUnit *su, MachineOperand *Op, int64_t ofs)
+        : SU(su), BaseOp(Op), Offset(ofs) {}
+
+    bool operator<(const MemOpInfo &RHS) const {
+      if (BaseOp->getType() != RHS.BaseOp->getType())
+        return BaseOp->getType() < RHS.BaseOp->getType();
+
+      if (BaseOp->isReg())
+        return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) <
+               std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset,
+                               RHS.SU->NodeNum);
+      if (BaseOp->isFI()) {
+        const MachineFunction &MF =
+            *BaseOp->getParent()->getParent()->getParent();
+        const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+        bool StackGrowsDown = TFI.getStackGrowthDirection() ==
+                              TargetFrameLowering::StackGrowsDown;
+        // Can't use tuple comparison here since we might need to use a
+        // different order when the stack grows down.
+        if (BaseOp->getIndex() != RHS.BaseOp->getIndex())
+          return StackGrowsDown ? BaseOp->getIndex() > RHS.BaseOp->getIndex()
+                                : BaseOp->getIndex() < RHS.BaseOp->getIndex();
+
+        if (Offset != RHS.Offset)
+          return StackGrowsDown ? Offset > RHS.Offset : Offset < RHS.Offset;
+
+        return SU->NodeNum < RHS.SU->NodeNum;
+      }
 
-    bool operator<(const MemOpInfo&RHS) const {
-      return std::tie(BaseReg, Offset, SU->NodeNum) <
-             std::tie(RHS.BaseReg, RHS.Offset, RHS.SU->NodeNum);
+      llvm_unreachable("MemOpClusterMutation only supports register or frame "
+                       "index bases.");
     }
   };
 
@@ -1546,10 +1573,10 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
     ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) {
   SmallVector<MemOpInfo, 32> MemOpRecords;
   for (SUnit *SU : MemOps) {
-    unsigned BaseReg;
+    MachineOperand *BaseOp;
     int64_t Offset;
-    if (TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI))
-      MemOpRecords.push_back(MemOpInfo(SU, BaseReg, Offset));
+    if (TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, TRI))
+      MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset));
   }
   if (MemOpRecords.size() < 2)
     return;
@@ -1559,8 +1586,8 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     SUnit *SUa = MemOpRecords[Idx].SU;
     SUnit *SUb = MemOpRecords[Idx+1].SU;
-    if (TII->shouldClusterMemOps(*SUa->getInstr(), MemOpRecords[Idx].BaseReg,
-                                 *SUb->getInstr(), MemOpRecords[Idx+1].BaseReg,
+    if (TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp,
+                                 *MemOpRecords[Idx + 1].BaseOp,
                                  ClusterLength) &&
         DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
       LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
@@ -2515,7 +2542,7 @@ const char *GenericSchedulerBase::getReasonStr(
   switch (Reason) {
   case NoCand:         return "NOCAND    ";
   case Only1:          return "ONLY1     ";
-  case PhysRegCopy:    return "PREG-COPY ";
+  case PhysReg:        return "PHYS-REG  ";
   case RegExcess:      return "REG-EXCESS";
   case RegCritical:    return "REG-CRIT  ";
   case Stall:          return "STALL     ";
@@ -2851,24 +2878,41 @@ unsigned getWeakLeft(const SUnit *SU, bool isTop) {
 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
 /// with the operation that produces or consumes the physreg. We'll do this when
 /// regalloc has support for parallel copies.
-int biasPhysRegCopy(const SUnit *SU, bool isTop) {
+int biasPhysReg(const SUnit *SU, bool isTop) {
   const MachineInstr *MI = SU->getInstr();
-  if (!MI->isCopy())
-    return 0;
 
-  unsigned ScheduledOper = isTop ? 1 : 0;
-  unsigned UnscheduledOper = isTop ? 0 : 1;
-  // If we have already scheduled the physreg produce/consumer, immediately
-  // schedule the copy.
-  if (TargetRegisterInfo::isPhysicalRegister(
-        MI->getOperand(ScheduledOper).getReg()))
-    return 1;
-  // If the physreg is at the boundary, defer it. Otherwise schedule it
-  // immediately to free the dependent. We can hoist the copy later.
-  bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
-  if (TargetRegisterInfo::isPhysicalRegister(
-        MI->getOperand(UnscheduledOper).getReg()))
-    return AtBoundary ? -1 : 1;
+  if (MI->isCopy()) {
+    unsigned ScheduledOper = isTop ? 1 : 0;
+    unsigned UnscheduledOper = isTop ? 0 : 1;
+    // If we have already scheduled the physreg produce/consumer, immediately
+    // schedule the copy.
+    if (TargetRegisterInfo::isPhysicalRegister(
+            MI->getOperand(ScheduledOper).getReg()))
+      return 1;
+    // If the physreg is at the boundary, defer it. Otherwise schedule it
+    // immediately to free the dependent. We can hoist the copy later.
+    bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
+    if (TargetRegisterInfo::isPhysicalRegister(
+            MI->getOperand(UnscheduledOper).getReg()))
+      return AtBoundary ? -1 : 1;
+  }
+
+  if (MI->isMoveImmediate()) {
+    // If we have a move immediate and all successors have been assigned, bias
+    // towards scheduling this later. Make sure all register defs are to
+    // physical registers.
+    bool DoBias = true;
+    for (const MachineOperand &Op : MI->defs()) {
+      if (Op.isReg() && !TargetRegisterInfo::isPhysicalRegister(Op.getReg())) {
+        DoBias = false;
+        break;
+      }
+    }
+
+    if (DoBias)
+      return isTop ? -1 : 1;
+  }
+
   return 0;
 }
 } // end namespace llvm
@@ -2929,9 +2973,9 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
     return;
   }
 
-  if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop),
-                 biasPhysRegCopy(Cand.SU, Cand.AtTop),
-                 TryCand, Cand, PhysRegCopy))
+  // Bias PhysReg Defs and copies to their uses and defined respectively.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
     return;
 
   // Avoid exceeding the target's limit.
@@ -3178,7 +3222,7 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
   return SU;
 }
 
-void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
+void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
   MachineBasicBlock::iterator InsertPos = SU->getInstr();
   if (!isTop)
     ++InsertPos;
@@ -3193,7 +3237,7 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
     if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
       continue;
     MachineInstr *Copy = DepSU->getInstr();
-    if (!Copy->isCopy())
+    if (!Copy->isCopy() && !Copy->isMoveImmediate())
       continue;
     LLVM_DEBUG(dbgs() << "  Rescheduling physreg copy ";
                DAG->dumpNode(*Dep.getSUnit()));
@@ -3207,18 +3251,18 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
 /// does.
 ///
 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
-/// them here. See comments in biasPhysRegCopy.
+/// them here. See comments in biasPhysReg.
 void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
   if (IsTopNode) {
     SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
     Top.bumpNode(SU);
     if (SU->hasPhysRegUses)
-      reschedulePhysRegCopies(SU, true);
+      reschedulePhysReg(SU, true);
   } else {
     SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
     Bot.bumpNode(SU);
     if (SU->hasPhysRegDefs)
-      reschedulePhysRegCopies(SU, false);
+      reschedulePhysReg(SU, false);
   }
 }
 
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index d45855407f28f26dc75357657c2c2d9499a3ec1e..cdc597db640166ed5973436a9520e5240aef470b 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -716,9 +716,12 @@ static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
       !PredBB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit))
     return false;
 
-  unsigned BaseReg;
+  MachineOperand *BaseOp;
   int64_t Offset;
-  if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
+  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
+    return false;
+
+  if (!BaseOp->isReg())
     return false;
 
   if (!(MI.mayLoad() && !MI.isPredicable()))
@@ -731,7 +734,7 @@ static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
   return MBP.LHS.isReg() && MBP.RHS.isImm() && MBP.RHS.getImm() == 0 &&
          (MBP.Predicate == MachineBranchPredicate::PRED_NE ||
           MBP.Predicate == MachineBranchPredicate::PRED_EQ) &&
-         MBP.LHS.getReg() == BaseReg;
+         MBP.LHS.getReg() == BaseOp->getReg();
 }
 
 /// Sink an instruction and its associated debug instructions. If the debug
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index 79ca6adf95c4604b08feb6644bd5bca7558759e3..e62ed3094651e9113271b2ca10c661c00c54c515 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -218,8 +218,7 @@ computeHeightResources(const MachineBasicBlock *MBB) {
   // The trace tail is done.
   if (!TBI->Succ) {
     TBI->Tail = MBB->getNumber();
-    std::copy(PRCycles.begin(), PRCycles.end(),
-              ProcResourceHeights.begin() + PROffset);
+    llvm::copy(PRCycles, ProcResourceHeights.begin() + PROffset);
     return;
   }
 
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index b37c421596b757f8f78aa84aff8bd7c8cd6274c4..f4804e734f612e99de4920f21cb947e65467ed73 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -1055,6 +1055,89 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
     }
     break;
   }
+  case TargetOpcode::G_MERGE_VALUES: {
+    // G_MERGE_VALUES should only be used to merge scalars into a larger scalar,
+    // e.g. s2N = MERGE sN, sN
+    // Merging multiple scalars into a vector is not allowed, should use
+    // G_BUILD_VECTOR for that.
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+    if (DstTy.isVector() || SrcTy.isVector())
+      report("G_MERGE_VALUES cannot operate on vectors", MI);
+    break;
+  }
+  case TargetOpcode::G_UNMERGE_VALUES: {
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcTy = MRI->getType(MI->getOperand(MI->getNumOperands()-1).getReg());
+    // For now G_UNMERGE can split vectors.
+    for (unsigned i = 0; i < MI->getNumOperands()-1; ++i) {
+      if (MRI->getType(MI->getOperand(i).getReg()) != DstTy)
+        report("G_UNMERGE_VALUES destination types do not match", MI);
+    }
+    if (SrcTy.getSizeInBits() !=
+        (DstTy.getSizeInBits() * (MI->getNumOperands() - 1))) {
+      report("G_UNMERGE_VALUES source operand does not cover dest operands",
+             MI);
+    }
+    break;
+  }
+  case TargetOpcode::G_BUILD_VECTOR: {
+    // Source types must be scalars, dest type a vector. Total size of scalars
+    // must match the dest vector size.
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcEltTy = MRI->getType(MI->getOperand(1).getReg());
+    if (!DstTy.isVector() || SrcEltTy.isVector())
+      report("G_BUILD_VECTOR must produce a vector from scalar operands", MI);
+    for (unsigned i = 2; i < MI->getNumOperands(); ++i) {
+      if (MRI->getType(MI->getOperand(1).getReg()) !=
+          MRI->getType(MI->getOperand(i).getReg()))
+        report("G_BUILD_VECTOR source operand types are not homogeneous", MI);
+    }
+    if (DstTy.getSizeInBits() !=
+        SrcEltTy.getSizeInBits() * (MI->getNumOperands() - 1))
+      report("G_BUILD_VECTOR src operands total size don't match dest "
+             "size.",
+             MI);
+    break;
+  }
+  case TargetOpcode::G_BUILD_VECTOR_TRUNC: {
+    // Source types must be scalars, dest type a vector. Scalar types must be
+    // larger than the dest vector elt type, as this is a truncating operation.
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcEltTy = MRI->getType(MI->getOperand(1).getReg());
+    if (!DstTy.isVector() || SrcEltTy.isVector())
+      report("G_BUILD_VECTOR_TRUNC must produce a vector from scalar operands",
+             MI);
+    for (unsigned i = 2; i < MI->getNumOperands(); ++i) {
+      if (MRI->getType(MI->getOperand(1).getReg()) !=
+          MRI->getType(MI->getOperand(i).getReg()))
+        report("G_BUILD_VECTOR_TRUNC source operand types are not homogeneous",
+               MI);
+    }
+    if (SrcEltTy.getSizeInBits() <= DstTy.getElementType().getSizeInBits())
+      report("G_BUILD_VECTOR_TRUNC source operand types are not larger than "
+             "dest elt type",
+             MI);
+    break;
+  }
+  case TargetOpcode::G_CONCAT_VECTORS: {
+    // Source types should be vectors, and total size should match the dest
+    // vector size.
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+    if (!DstTy.isVector() || !SrcTy.isVector())
+      report("G_CONCAT_VECTOR requires vector source and destination operands",
+             MI);
+    for (unsigned i = 2; i < MI->getNumOperands(); ++i) {
+      if (MRI->getType(MI->getOperand(1).getReg()) !=
+          MRI->getType(MI->getOperand(i).getReg()))
+        report("G_CONCAT_VECTOR source operand types are not homogeneous", MI);
+    }
+    if (DstTy.getNumElements() !=
+        SrcTy.getNumElements() * (MI->getNumOperands() - 1))
+      report("G_CONCAT_VECTOR num dest and source elements should match", MI);
+    break;
+  }
   case TargetOpcode::COPY: {
     if (foundErrors)
       break;
diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp
index befa8422d3993ecebe571b0dc4010c6c0f45ea04..770f6c5b0403020590152ae11162e080a5bf3687 100644
--- a/lib/CodeGen/OptimizePHIs.cpp
+++ b/lib/CodeGen/OptimizePHIs.cpp
@@ -90,10 +90,10 @@ bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) {
 }
 
 /// IsSingleValuePHICycle - Check if MI is a PHI where all the source operands
-/// are copies of SingleValReg, possibly via copies through other PHIs.  If
+/// are copies of SingleValReg, possibly via copies through other PHIs. If
 /// SingleValReg is zero on entry, it is set to the register with the single
-/// non-copy value.  PHIsInCycle is a set used to keep track of the PHIs that
-/// have been scanned.
+/// non-copy value. PHIsInCycle is a set used to keep track of the PHIs that
+/// have been scanned. PHIs may be grouped by cycle, several cycles or chains.
 bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
                                          unsigned &SingleValReg,
                                          InstrSet &PHIsInCycle) {
@@ -119,8 +119,10 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
     if (SrcMI && SrcMI->isCopy() &&
         !SrcMI->getOperand(0).getSubReg() &&
         !SrcMI->getOperand(1).getSubReg() &&
-        TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg()))
-      SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
+        TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) {
+      SrcReg = SrcMI->getOperand(1).getReg();
+      SrcMI = MRI->getVRegDef(SrcReg);
+    }
     if (!SrcMI)
       return false;
 
@@ -129,7 +131,7 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
         return false;
     } else {
       // Fail if there is more than one non-phi/non-move register.
-      if (SingleValReg != 0)
+      if (SingleValReg != 0 && SingleValReg != SrcReg)
         return false;
       SingleValReg = SrcReg;
     }
@@ -180,6 +182,9 @@ bool OptimizePHIs::OptimizeBB(MachineBasicBlock &MBB) {
       if (!MRI->constrainRegClass(SingleValReg, MRI->getRegClass(OldReg)))
         continue;
 
+      // for the case SingleValReg taken from copy instr
+      MRI->clearKillFlags(SingleValReg);
+
       MRI->replaceRegWith(OldReg, SingleValReg);
       MI->eraseFromParent();
       ++NumPHICycles;
diff --git a/lib/CodeGen/PreISelIntrinsicLowering.cpp b/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 8f88ef78828aed09d88c85f828034c47a16f387f..b0e9ac03612d10c45cc6e51ff9e7f5cf7e206183 100644
--- a/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -7,13 +7,15 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass implements IR lowering for the llvm.load.relative intrinsic.
+// This pass implements IR lowering for the llvm.load.relative and llvm.objc.*
+// intrinsics.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -55,11 +57,129 @@ static bool lowerLoadRelative(Function &F) {
   return Changed;
 }
 
+static bool lowerObjCCall(Function &F, const char *NewFn,
+                          bool setNonLazyBind = false) {
+  if (F.use_empty())
+    return false;
+
+  // If we haven't already looked up this function, check to see if the
+  // program already contains a function with this name.
+  Module *M = F.getParent();
+  Constant* FCache = M->getOrInsertFunction(NewFn, F.getFunctionType());
+
+  if (Function* Fn = dyn_cast<Function>(FCache)) {
+    Fn->setLinkage(F.getLinkage());
+    if (setNonLazyBind && !Fn->isWeakForLinker()) {
+      // If we have Native ARC, set nonlazybind attribute for these APIs for
+      // performance.
+      Fn->addFnAttr(Attribute::NonLazyBind);
+    }
+  }
+
+  for (auto I = F.use_begin(), E = F.use_end(); I != E;) {
+    auto *CI = dyn_cast<CallInst>(I->getUser());
+    assert(CI->getCalledFunction() && "Cannot lower an indirect call!");
+    ++I;
+
+    IRBuilder<> Builder(CI->getParent(), CI->getIterator());
+    SmallVector<Value *, 8> Args(CI->arg_begin(), CI->arg_end());
+    CallInst *NewCI = Builder.CreateCall(FCache, Args);
+    NewCI->setName(CI->getName());
+    NewCI->setTailCallKind(CI->getTailCallKind());
+    if (!CI->use_empty())
+      CI->replaceAllUsesWith(NewCI);
+    CI->eraseFromParent();
+  }
+
+  return true;
+}
+
 static bool lowerIntrinsics(Module &M) {
   bool Changed = false;
   for (Function &F : M) {
-    if (F.getName().startswith("llvm.load.relative."))
+    if (F.getName().startswith("llvm.load.relative.")) {
       Changed |= lowerLoadRelative(F);
+      continue;
+    }
+    switch (F.getIntrinsicID()) {
+    default:
+      break;
+    case Intrinsic::objc_autorelease:
+      Changed |= lowerObjCCall(F, "objc_autorelease");
+      break;
+    case Intrinsic::objc_autoreleasePoolPop:
+      Changed |= lowerObjCCall(F, "objc_autoreleasePoolPop");
+      break;
+    case Intrinsic::objc_autoreleasePoolPush:
+      Changed |= lowerObjCCall(F, "objc_autoreleasePoolPush");
+      break;
+    case Intrinsic::objc_autoreleaseReturnValue:
+      Changed |= lowerObjCCall(F, "objc_autoreleaseReturnValue");
+      break;
+    case Intrinsic::objc_copyWeak:
+      Changed |= lowerObjCCall(F, "objc_copyWeak");
+      break;
+    case Intrinsic::objc_destroyWeak:
+      Changed |= lowerObjCCall(F, "objc_destroyWeak");
+      break;
+    case Intrinsic::objc_initWeak:
+      Changed |= lowerObjCCall(F, "objc_initWeak");
+      break;
+    case Intrinsic::objc_loadWeak:
+      Changed |= lowerObjCCall(F, "objc_loadWeak");
+      break;
+    case Intrinsic::objc_loadWeakRetained:
+      Changed |= lowerObjCCall(F, "objc_loadWeakRetained");
+      break;
+    case Intrinsic::objc_moveWeak:
+      Changed |= lowerObjCCall(F, "objc_moveWeak");
+      break;
+    case Intrinsic::objc_release:
+      Changed |= lowerObjCCall(F, "objc_release", true);
+      break;
+    case Intrinsic::objc_retain:
+      Changed |= lowerObjCCall(F, "objc_retain", true);
+      break;
+    case Intrinsic::objc_retainAutorelease:
+      Changed |= lowerObjCCall(F, "objc_retainAutorelease");
+      break;
+    case Intrinsic::objc_retainAutoreleaseReturnValue:
+      Changed |= lowerObjCCall(F, "objc_retainAutoreleaseReturnValue");
+      break;
+    case Intrinsic::objc_retainAutoreleasedReturnValue:
+      Changed |= lowerObjCCall(F, "objc_retainAutoreleasedReturnValue");
+      break;
+    case Intrinsic::objc_retainBlock:
+      Changed |= lowerObjCCall(F, "objc_retainBlock");
+      break;
+    case Intrinsic::objc_storeStrong:
+      Changed |= lowerObjCCall(F, "objc_storeStrong");
+      break;
+    case Intrinsic::objc_storeWeak:
+      Changed |= lowerObjCCall(F, "objc_storeWeak");
+      break;
+    case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue:
+      Changed |= lowerObjCCall(F, "objc_unsafeClaimAutoreleasedReturnValue");
+      break;
+    case Intrinsic::objc_retainedObject:
+      Changed |= lowerObjCCall(F, "objc_retainedObject");
+      break;
+    case Intrinsic::objc_unretainedObject:
+      Changed |= lowerObjCCall(F, "objc_unretainedObject");
+      break;
+    case Intrinsic::objc_unretainedPointer:
+      Changed |= lowerObjCCall(F, "objc_unretainedPointer");
+      break;
+    case Intrinsic::objc_retain_autorelease:
+      Changed |= lowerObjCCall(F, "objc_retain_autorelease");
+      break;
+    case Intrinsic::objc_sync_enter:
+      Changed |= lowerObjCCall(F, "objc_sync_enter");
+      break;
+    case Intrinsic::objc_sync_exit:
+      Changed |= lowerObjCCall(F, "objc_sync_exit");
+      break;
+    }
   }
   return Changed;
 }
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index fc62c8caf59edfb0a8c9d39301778f4f360012ac..23754e487a18112e00d87c0009759dce6a07865e 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -75,6 +75,10 @@ using namespace llvm;
 
 using MBBVector = SmallVector<MachineBasicBlock *, 4>;
 
+STATISTIC(NumLeafFuncWithSpills, "Number of leaf functions with CSRs");
+STATISTIC(NumFuncSeen, "Number of functions seen in PEI");
+
+
 namespace {
 
 class PEI : public MachineFunctionPass {
@@ -168,6 +172,7 @@ using StackObjSet = SmallSetVector<int, 8>;
 /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
 /// frame indexes with appropriate references.
 bool PEI::runOnMachineFunction(MachineFunction &MF) {
+  NumFuncSeen++;
   const Function &F = MF.getFunction();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
@@ -357,6 +362,11 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
     // Now that we know which registers need to be saved and restored, allocate
     // stack slots for them.
     for (auto &CS : CSI) {
+      // If the target has spilled this register to another register, we don't
+      // need to allocate a stack slot.
+      if (CS.isSpilledToReg())
+        continue;
+
       unsigned Reg = CS.getReg();
       const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
 
@@ -454,7 +464,22 @@ static void updateLiveness(MachineFunction &MF) {
       if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg))
         MBB->addLiveIn(Reg);
     }
+    // If callee-saved register is spilled to another register rather than
+    // spilling to stack, the destination register has to be marked as live for
+    // each MBB between the prologue and epilogue so that it is not clobbered
+    // before it is reloaded in the epilogue. The Visited set contains all
+    // blocks outside of the region delimited by prologue/epilogue.
+    if (CSI[i].isSpilledToReg()) {
+      for (MachineBasicBlock &MBB : MF) {
+        if (Visited.count(&MBB))
+          continue;
+        MCPhysReg DstReg = CSI[i].getDstReg();
+        if (!MBB.isLiveIn(DstReg))
+          MBB.addLiveIn(DstReg);
+      }
+    }
   }
+
 }
 
 /// Insert restore code for the callee-saved registers used in the function.
@@ -530,6 +555,9 @@ void PEI::spillCalleeSavedRegs(MachineFunction &MF) {
 
     std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
     if (!CSI.empty()) {
+      if (!MFI.hasCalls())
+        NumLeafFuncWithSpills++;
+
       for (MachineBasicBlock *SaveBlock : SaveBlocks) {
         insertCSRSaves(*SaveBlock, CSI);
         // Update the live-in information of all the blocks up to the save
@@ -1090,7 +1118,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
         MachineOperand &Offset = MI.getOperand(i + 1);
         int refOffset = TFI->getFrameIndexReferencePreferSP(
             MF, MI.getOperand(i).getIndex(), Reg, /*IgnoreSPUpdates*/ false);
-        Offset.setImm(Offset.getImm() + refOffset);
+        Offset.setImm(Offset.getImm() + refOffset + SPAdj);
         MI.getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
         continue;
       }
diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt
index 3318e109155b8eb1ca96e212a1bde9949b39276b..d8958715c6b45455dc00329d5bf2dbbdd5953fe5 100644
--- a/lib/CodeGen/README.txt
+++ b/lib/CodeGen/README.txt
@@ -156,8 +156,8 @@ doing the wrong thing.
 //===---------------------------------------------------------------------===//
 
 It would be really nice to be able to write patterns in .td files for copies,
-which would eliminate a bunch of explicit predicates on them (e.g. no side 
-effects).  Once this is in place, it would be even better to have tblgen 
+which would eliminate a bunch of explicit predicates on them (e.g. no side
+effects).  Once this is in place, it would be even better to have tblgen
 synthesize the various copy insertion/inspection methods in TargetInstrInfo.
 
 //===---------------------------------------------------------------------===//
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index ea7f247214de3ea98dc3d6821914598d1281a711..eb3a4e481f5d37d4ba9e8d8b22c79b1d1020ef43 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -177,6 +177,8 @@ namespace {
     bool runOnMachineFunction(MachineFunction &MF) override;
 
     void allocateBasicBlock(MachineBasicBlock &MBB);
+    void allocateInstruction(MachineInstr &MI);
+    void handleDebugValue(MachineInstr &MI);
     void handleThroughOperands(MachineInstr &MI,
                                SmallVectorImpl<unsigned> &VirtDead);
     bool isLastUseOfLocalReg(const MachineOperand &MO) const;
@@ -207,7 +209,7 @@ namespace {
     LiveReg &reloadVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg,
                            unsigned Hint);
     void spillAll(MachineBasicBlock::iterator MI);
-    bool setPhysReg(MachineInstr &MI, unsigned OpNum, MCPhysReg PhysReg);
+    bool setPhysReg(MachineInstr &MI, MachineOperand &MO, MCPhysReg PhysReg);
 
     int getStackSpaceFor(unsigned VirtReg);
     void spill(MachineBasicBlock::iterator Before, unsigned VirtReg,
@@ -373,7 +375,8 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) {
 
 /// Spill all dirty virtregs without killing them.
 void RegAllocFast::spillAll(MachineBasicBlock::iterator MI) {
-  if (LiveVirtRegs.empty()) return;
+  if (LiveVirtRegs.empty())
+    return;
   // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
   // of spilling here is deterministic, if arbitrary.
   for (LiveReg &LR : LiveVirtRegs) {
@@ -490,9 +493,9 @@ void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
   }
 }
 
-/// Return the cost of spilling clearing out PhysReg and aliases so it is
-/// free for allocation. Returns 0 when PhysReg is free or disabled with all
-/// aliases disabled - it can be allocated directly.
+/// Return the cost of spilling clearing out PhysReg and aliases so it is free
+/// for allocation. Returns 0 when PhysReg is free or disabled with all aliases
+/// disabled - it can be allocated directly.
 /// \returns spillImpossible when PhysReg or an alias can't be spilled.
 unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
   if (isRegUsedInInstr(PhysReg)) {
@@ -588,16 +591,13 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint) {
     }
   }
 
-  LLVM_DEBUG(dbgs() << "Allocating " << printReg(VirtReg) << " from "
-                    << TRI->getRegClassName(&RC) << '\n');
-
-  unsigned BestReg = 0;
+  MCPhysReg BestReg = 0;
   unsigned BestCost = spillImpossible;
   for (MCPhysReg PhysReg : AllocationOrder) {
     LLVM_DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << ' ');
     unsigned Cost = calcSpillCost(PhysReg);
     LLVM_DEBUG(dbgs() << "Cost: " << Cost << " BestCost: " << BestCost << '\n');
-    // Cost is 0 when all aliases are already disabled.
+    // Immediate take a register with cost 0.
     if (Cost == 0) {
       assignVirtToPhysReg(LR, PhysReg);
       return;
@@ -609,7 +609,8 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint) {
   }
 
   if (!BestReg) {
-    // Nothing we can do. Report an error and keep going with a bad allocation.
+    // Nothing we can do: Report an error and keep going with an invalid
+    // allocation.
     if (MI.isInlineAsm())
       MI.emitError("inline assembly requires more registers than available");
     else
@@ -704,9 +705,8 @@ RegAllocFast::LiveReg &RegAllocFast::reloadVirtReg(MachineInstr &MI,
 /// Changes operand OpNum in MI the refer the PhysReg, considering subregs. This
 /// may invalidate any operand pointers.  Return true if the operand kills its
 /// register.
-bool RegAllocFast::setPhysReg(MachineInstr &MI, unsigned OpNum,
+bool RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO,
                               MCPhysReg PhysReg) {
-  MachineOperand &MO = MI.getOperand(OpNum);
   bool Dead = MO.isDead();
   if (!MO.getSubReg()) {
     MO.setReg(PhysReg);
@@ -769,7 +769,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
   SmallVector<unsigned, 8> PartialDefs;
   LLVM_DEBUG(dbgs() << "Allocating tied uses.\n");
   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-    const MachineOperand &MO = MI.getOperand(I);
+    MachineOperand &MO = MI.getOperand(I);
     if (!MO.isReg()) continue;
     unsigned Reg = MO.getReg();
     if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
@@ -780,7 +780,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
                         << ".\n");
       LiveReg &LR = reloadVirtReg(MI, I, Reg, 0);
       MCPhysReg PhysReg = LR.PhysReg;
-      setPhysReg(MI, I, PhysReg);
+      setPhysReg(MI, MO, PhysReg);
       // Note: we don't update the def operand yet. That would cause the normal
       // def-scan to attempt spilling.
     } else if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) {
@@ -802,7 +802,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
       continue;
     // Note: defineVirtReg may invalidate MO.
     MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, 0);
-    if (setPhysReg(MI, I, PhysReg))
+    if (setPhysReg(MI, MI.getOperand(I), PhysReg))
       VirtDead.push_back(Reg);
   }
 
@@ -860,6 +860,199 @@ void RegAllocFast::dumpState() {
 }
 #endif
 
+void RegAllocFast::allocateInstruction(MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // If this is a copy, we may be able to coalesce.
+  unsigned CopySrcReg = 0;
+  unsigned CopyDstReg = 0;
+  unsigned CopySrcSub = 0;
+  unsigned CopyDstSub = 0;
+  if (MI.isCopy()) {
+    CopyDstReg = MI.getOperand(0).getReg();
+    CopySrcReg = MI.getOperand(1).getReg();
+    CopyDstSub = MI.getOperand(0).getSubReg();
+    CopySrcSub = MI.getOperand(1).getSubReg();
+  }
+
+  // Track registers used by instruction.
+  UsedInInstr.clear();
+
+  // First scan.
+  // Mark physreg uses and early clobbers as used.
+  // Find the end of the virtreg operands
+  unsigned VirtOpEnd = 0;
+  bool hasTiedOps = false;
+  bool hasEarlyClobbers = false;
+  bool hasPartialRedefs = false;
+  bool hasPhysDefs = false;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    // Make sure MRI knows about registers clobbered by regmasks.
+    if (MO.isRegMask()) {
+      MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
+      continue;
+    }
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg) continue;
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      VirtOpEnd = i+1;
+      if (MO.isUse()) {
+        hasTiedOps = hasTiedOps ||
+                            MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1;
+      } else {
+        if (MO.isEarlyClobber())
+          hasEarlyClobbers = true;
+        if (MO.getSubReg() && MI.readsVirtualRegister(Reg))
+          hasPartialRedefs = true;
+      }
+      continue;
+    }
+    if (!MRI->isAllocatable(Reg)) continue;
+    if (MO.isUse()) {
+      usePhysReg(MO);
+    } else if (MO.isEarlyClobber()) {
+      definePhysReg(MI, Reg,
+                    (MO.isImplicit() || MO.isDead()) ? regFree : regReserved);
+      hasEarlyClobbers = true;
+    } else
+      hasPhysDefs = true;
+  }
+
+  // The instruction may have virtual register operands that must be allocated
+  // the same register at use-time and def-time: early clobbers and tied
+  // operands. If there are also physical defs, these registers must avoid
+  // both physical defs and uses, making them more constrained than normal
+  // operands.
+  // Similarly, if there are multiple defs and tied operands, we must make
+  // sure the same register is allocated to uses and defs.
+  // We didn't detect inline asm tied operands above, so just make this extra
+  // pass for all inline asm.
+  if (MI.isInlineAsm() || hasEarlyClobbers || hasPartialRedefs ||
+      (hasTiedOps && (hasPhysDefs || MCID.getNumDefs() > 1))) {
+    handleThroughOperands(MI, VirtDead);
+    // Don't attempt coalescing when we have funny stuff going on.
+    CopyDstReg = 0;
+    // Pretend we have early clobbers so the use operands get marked below.
+    // This is not necessary for the common case of a single tied use.
+    hasEarlyClobbers = true;
+  }
+
+  // Second scan.
+  // Allocate virtreg uses.
+  for (unsigned I = 0; I != VirtOpEnd; ++I) {
+    MachineOperand &MO = MI.getOperand(I);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+    if (MO.isUse()) {
+      LiveReg &LR = reloadVirtReg(MI, I, Reg, CopyDstReg);
+      MCPhysReg PhysReg = LR.PhysReg;
+      CopySrcReg = (CopySrcReg == Reg || CopySrcReg == PhysReg) ? PhysReg : 0;
+      if (setPhysReg(MI, MO, PhysReg))
+        killVirtReg(LR);
+    }
+  }
+
+  // Track registers defined by instruction - early clobbers and tied uses at
+  // this point.
+  UsedInInstr.clear();
+  if (hasEarlyClobbers) {
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+      // Look for physreg defs and tied uses.
+      if (!MO.isDef() && !MO.isTied()) continue;
+      markRegUsedInInstr(Reg);
+    }
+  }
+
+  unsigned DefOpEnd = MI.getNumOperands();
+  if (MI.isCall()) {
+    // Spill all virtregs before a call. This serves one purpose: If an
+    // exception is thrown, the landing pad is going to expect to find
+    // registers in their spill slots.
+    // Note: although this is appealing to just consider all definitions
+    // as call-clobbered, this is not correct because some of those
+    // definitions may be used later on and we do not want to reuse
+    // those for virtual registers in between.
+    LLVM_DEBUG(dbgs() << "  Spilling remaining registers before call.\n");
+    spillAll(MI);
+  }
+
+  // Third scan.
+  // Allocate defs and collect dead defs.
+  for (unsigned I = 0; I != DefOpEnd; ++I) {
+    const MachineOperand &MO = MI.getOperand(I);
+    if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber())
+      continue;
+    unsigned Reg = MO.getReg();
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (!MRI->isAllocatable(Reg)) continue;
+      definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
+      continue;
+    }
+    MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, CopySrcReg);
+    if (setPhysReg(MI, MI.getOperand(I), PhysReg)) {
+      VirtDead.push_back(Reg);
+      CopyDstReg = 0; // cancel coalescing;
+    } else
+      CopyDstReg = (CopyDstReg == Reg || CopyDstReg == PhysReg) ? PhysReg : 0;
+  }
+
+  // Kill dead defs after the scan to ensure that multiple defs of the same
+  // register are allocated identically. We didn't need to do this for uses
+  // because we are crerating our own kill flags, and they are always at the
+  // last use.
+  for (unsigned VirtReg : VirtDead)
+    killVirtReg(VirtReg);
+  VirtDead.clear();
+
+  LLVM_DEBUG(dbgs() << "<< " << MI);
+  if (CopyDstReg && CopyDstReg == CopySrcReg && CopyDstSub == CopySrcSub) {
+    LLVM_DEBUG(dbgs() << "Mark identity copy for removal\n");
+    Coalesced.push_back(&MI);
+  }
+}
+
+void RegAllocFast::handleDebugValue(MachineInstr &MI) {
+  MachineOperand &MO = MI.getOperand(0);
+
+  // Ignore DBG_VALUEs that aren't based on virtual registers. These are
+  // mostly constants and frame indices.
+  if (!MO.isReg())
+    return;
+  unsigned Reg = MO.getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return;
+
+  // See if this virtual register has already been allocated to a physical
+  // register or spilled to a stack slot.
+  LiveRegMap::iterator LRI = findLiveVirtReg(Reg);
+  if (LRI != LiveVirtRegs.end() && LRI->PhysReg) {
+    setPhysReg(MI, MO, LRI->PhysReg);
+  } else {
+    int SS = StackSlotForVirtReg[Reg];
+    if (SS != -1) {
+      // Modify DBG_VALUE now that the value is in a spill slot.
+      updateDbgValueForSpill(MI, SS);
+      LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << MI);
+      return;
+    }
+
+    // We can't allocate a physreg for a DebugValue, sorry!
+    LLVM_DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE");
+    MO.setReg(0);
+  }
+
+  // If Reg hasn't been spilled, put this DBG_VALUE in LiveDbgValueMap so
+  // that future spills of Reg will have DBG_VALUEs.
+  LiveDbgValueMap[Reg].push_back(&MI);
+}
+
 void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
   this->MBB = &MBB;
   LLVM_DEBUG(dbgs() << "\nAllocating " << MBB);
@@ -879,205 +1072,19 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
 
   // Otherwise, sequentially allocate each instruction in the MBB.
   for (MachineInstr &MI : MBB) {
-    const MCInstrDesc &MCID = MI.getDesc();
-    LLVM_DEBUG(dbgs() << "\n>> " << MI << "Regs:"; dumpState());
+    LLVM_DEBUG(
+      dbgs() << "\n>> " << MI << "Regs:";
+      dumpState()
+    );
 
-    // Debug values are not allowed to change codegen in any way.
+    // Special handling for debug values. Note that they are not allowed to
+    // affect codegen of the other instructions in any way.
     if (MI.isDebugValue()) {
-      MachineInstr *DebugMI = &MI;
-      MachineOperand &MO = DebugMI->getOperand(0);
-
-      // Ignore DBG_VALUEs that aren't based on virtual registers. These are
-      // mostly constants and frame indices.
-      if (!MO.isReg())
-        continue;
-      unsigned Reg = MO.getReg();
-      if (!TargetRegisterInfo::isVirtualRegister(Reg))
-        continue;
-
-      // See if this virtual register has already been allocated to a physical
-      // register or spilled to a stack slot.
-      LiveRegMap::iterator LRI = findLiveVirtReg(Reg);
-      if (LRI != LiveVirtRegs.end() && LRI->PhysReg)
-        setPhysReg(*DebugMI, 0, LRI->PhysReg);
-      else {
-        int SS = StackSlotForVirtReg[Reg];
-        if (SS != -1) {
-          // Modify DBG_VALUE now that the value is in a spill slot.
-          updateDbgValueForSpill(*DebugMI, SS);
-          LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:"
-                            << "\t" << *DebugMI);
-          continue;
-        }
-
-        // We can't allocate a physreg for a DebugValue, sorry!
-        LLVM_DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE");
-        MO.setReg(0);
-      }
-
-      // If Reg hasn't been spilled, put this DBG_VALUE in LiveDbgValueMap so
-      // that future spills of Reg will have DBG_VALUEs.
-      LiveDbgValueMap[Reg].push_back(DebugMI);
-      continue;
-    }
-
-    if (MI.isDebugLabel())
+      handleDebugValue(MI);
       continue;
-
-    // If this is a copy, we may be able to coalesce.
-    unsigned CopySrcReg = 0;
-    unsigned CopyDstReg = 0;
-    unsigned CopySrcSub = 0;
-    unsigned CopyDstSub = 0;
-    if (MI.isCopy()) {
-      CopyDstReg = MI.getOperand(0).getReg();
-      CopySrcReg = MI.getOperand(1).getReg();
-      CopyDstSub = MI.getOperand(0).getSubReg();
-      CopySrcSub = MI.getOperand(1).getSubReg();
-    }
-
-    // Track registers used by instruction.
-    UsedInInstr.clear();
-
-    // First scan.
-    // Mark physreg uses and early clobbers as used.
-    // Find the end of the virtreg operands
-    unsigned VirtOpEnd = 0;
-    bool hasTiedOps = false;
-    bool hasEarlyClobbers = false;
-    bool hasPartialRedefs = false;
-    bool hasPhysDefs = false;
-    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = MI.getOperand(i);
-      // Make sure MRI knows about registers clobbered by regmasks.
-      if (MO.isRegMask()) {
-        MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
-        continue;
-      }
-      if (!MO.isReg()) continue;
-      unsigned Reg = MO.getReg();
-      if (!Reg) continue;
-      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-        VirtOpEnd = i+1;
-        if (MO.isUse()) {
-          hasTiedOps = hasTiedOps ||
-                              MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1;
-        } else {
-          if (MO.isEarlyClobber())
-            hasEarlyClobbers = true;
-          if (MO.getSubReg() && MI.readsVirtualRegister(Reg))
-            hasPartialRedefs = true;
-        }
-        continue;
-      }
-      if (!MRI->isAllocatable(Reg)) continue;
-      if (MO.isUse()) {
-        usePhysReg(MO);
-      } else if (MO.isEarlyClobber()) {
-        definePhysReg(MI, Reg,
-                      (MO.isImplicit() || MO.isDead()) ? regFree : regReserved);
-        hasEarlyClobbers = true;
-      } else
-        hasPhysDefs = true;
     }
 
-    // The instruction may have virtual register operands that must be allocated
-    // the same register at use-time and def-time: early clobbers and tied
-    // operands. If there are also physical defs, these registers must avoid
-    // both physical defs and uses, making them more constrained than normal
-    // operands.
-    // Similarly, if there are multiple defs and tied operands, we must make
-    // sure the same register is allocated to uses and defs.
-    // We didn't detect inline asm tied operands above, so just make this extra
-    // pass for all inline asm.
-    if (MI.isInlineAsm() || hasEarlyClobbers || hasPartialRedefs ||
-        (hasTiedOps && (hasPhysDefs || MCID.getNumDefs() > 1))) {
-      handleThroughOperands(MI, VirtDead);
-      // Don't attempt coalescing when we have funny stuff going on.
-      CopyDstReg = 0;
-      // Pretend we have early clobbers so the use operands get marked below.
-      // This is not necessary for the common case of a single tied use.
-      hasEarlyClobbers = true;
-    }
-
-    // Second scan.
-    // Allocate virtreg uses.
-    for (unsigned I = 0; I != VirtOpEnd; ++I) {
-      const MachineOperand &MO = MI.getOperand(I);
-      if (!MO.isReg()) continue;
-      unsigned Reg = MO.getReg();
-      if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
-      if (MO.isUse()) {
-        LiveReg &LR = reloadVirtReg(MI, I, Reg, CopyDstReg);
-        MCPhysReg PhysReg = LR.PhysReg;
-        CopySrcReg = (CopySrcReg == Reg || CopySrcReg == PhysReg) ? PhysReg : 0;
-        if (setPhysReg(MI, I, PhysReg))
-          killVirtReg(LR);
-      }
-    }
-
-    // Track registers defined by instruction - early clobbers and tied uses at
-    // this point.
-    UsedInInstr.clear();
-    if (hasEarlyClobbers) {
-      for (const MachineOperand &MO : MI.operands()) {
-        if (!MO.isReg()) continue;
-        unsigned Reg = MO.getReg();
-        if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
-        // Look for physreg defs and tied uses.
-        if (!MO.isDef() && !MO.isTied()) continue;
-        markRegUsedInInstr(Reg);
-      }
-    }
-
-    unsigned DefOpEnd = MI.getNumOperands();
-    if (MI.isCall()) {
-      // Spill all virtregs before a call. This serves one purpose: If an
-      // exception is thrown, the landing pad is going to expect to find
-      // registers in their spill slots.
-      // Note: although this is appealing to just consider all definitions
-      // as call-clobbered, this is not correct because some of those
-      // definitions may be used later on and we do not want to reuse
-      // those for virtual registers in between.
-      LLVM_DEBUG(dbgs() << "  Spilling remaining registers before call.\n");
-      spillAll(MI);
-    }
-
-    // Third scan.
-    // Allocate defs and collect dead defs.
-    for (unsigned I = 0; I != DefOpEnd; ++I) {
-      const MachineOperand &MO = MI.getOperand(I);
-      if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber())
-        continue;
-      unsigned Reg = MO.getReg();
-
-      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-        if (!MRI->isAllocatable(Reg)) continue;
-        definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
-        continue;
-      }
-      MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, CopySrcReg);
-      if (setPhysReg(MI, I, PhysReg)) {
-        VirtDead.push_back(Reg);
-        CopyDstReg = 0; // cancel coalescing;
-      } else
-        CopyDstReg = (CopyDstReg == Reg || CopyDstReg == PhysReg) ? PhysReg : 0;
-    }
-
-    // Kill dead defs after the scan to ensure that multiple defs of the same
-    // register are allocated identically. We didn't need to do this for uses
-    // because we are crerating our own kill flags, and they are always at the
-    // last use.
-    for (unsigned VirtReg : VirtDead)
-      killVirtReg(VirtReg);
-    VirtDead.clear();
-
-    if (CopyDstReg && CopyDstReg == CopySrcReg && CopyDstSub == CopySrcSub) {
-      LLVM_DEBUG(dbgs() << "-- coalescing: " << MI);
-      Coalesced.push_back(&MI);
-    } else {
-      LLVM_DEBUG(dbgs() << "<< " << MI);
-    }
+    allocateInstruction(MI);
   }
 
   // Spill all physical registers holding virtual registers now.
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 64da8485389f2bb896e02f20e3e08d757a60eb61..f244028534f23b2937264219d7e9ed5d244acced 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -1183,7 +1183,10 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
     BC.Number = BI.MBB->getNumber();
     Intf.moveToBlock(BC.Number);
     BC.Entry = BI.LiveIn ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
-    BC.Exit = BI.LiveOut ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
+    BC.Exit = (BI.LiveOut &&
+               !LIS->getInstructionFromIndex(BI.LastInstr)->isImplicitDef())
+                  ? SpillPlacement::PrefReg
+                  : SpillPlacement::DontCare;
     BC.ChangesValue = BI.FirstDef.isValid();
 
     if (!Intf.hasInterference())
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fc0e8efebdcbdc84f8c739500ee0255eb7553794..7a22caf9c8b1e1ec3e4b67bcc7e52eb5bc798da4 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
@@ -263,8 +264,9 @@ namespace {
     /// \param EltNo index of the vector element to load.
     /// \param OriginalLoad load that EVE came from to be replaced.
     /// \returns EVE on success SDValue() on failure.
-    SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
-        SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad);
+    SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
+                                         SDValue EltNo,
+                                         LoadSDNode *OriginalLoad);
     void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
     SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
     SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
@@ -324,6 +326,7 @@ namespace {
     SDValue visitSHL(SDNode *N);
     SDValue visitSRA(SDNode *N);
     SDValue visitSRL(SDNode *N);
+    SDValue visitFunnelShift(SDNode *N);
     SDValue visitRotate(SDNode *N);
     SDValue visitABS(SDNode *N);
     SDValue visitBSWAP(SDNode *N);
@@ -410,11 +413,14 @@ namespace {
     SDValue foldVSelectOfConstants(SDNode *N);
     SDValue foldBinOpIntoSelect(SDNode *BO);
     bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
-    SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
+    SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N);
     SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
     SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
                              SDValue N2, SDValue N3, ISD::CondCode CC,
                              bool NotExtCompare = false);
+    SDValue convertSelectOfFPConstantsToLoadOffset(
+        const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
+        ISD::CondCode CC);
     SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2, SDValue N3, ISD::CondCode CC);
     SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
@@ -490,6 +496,10 @@ namespace {
     /// returns false.
     bool findBetterNeighborChains(StoreSDNode *St);
 
+    // Helper for findBetterNeighborChains. Walk up store chain add additional
+    // chained stores that do not overlap and can be parallelized.
+    bool parallelizeChainedStores(StoreSDNode *St);
+
     /// Holds a pointer to an LSBaseSDNode as well as information on where it
     /// is located in a sequence of memory operations connected by a chain.
     struct MemOpLink {
@@ -523,7 +533,7 @@ namespace {
                            EVT &MemVT, unsigned ShAmt = 0);
 
     /// Used by BackwardsPropagateMask to find suitable loads.
-    bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads,
+    bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads,
                            SmallPtrSetImpl<SDNode*> &NodesWithConsts,
                            ConstantSDNode *Mask, SDNode *&NodeToMask);
     /// Attempt to propagate a given AND node back to load leaves so that they
@@ -903,39 +913,6 @@ static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
   return true;
 }
 
-// Determines if it is a constant null integer or a splatted vector of a
-// constant null integer (with no undefs).
-// Build vector implicit truncation is not an issue for null values.
-static bool isNullConstantOrNullSplatConstant(SDValue N) {
-  // TODO: may want to use peekThroughBitcast() here.
-  if (ConstantSDNode *Splat = isConstOrConstSplat(N))
-    return Splat->isNullValue();
-  return false;
-}
-
-// Determines if it is a constant integer of one or a splatted vector of a
-// constant integer of one (with no undefs).
-// Do not permit build vector implicit truncation.
-static bool isOneConstantOrOneSplatConstant(SDValue N) {
-  // TODO: may want to use peekThroughBitcast() here.
-  unsigned BitWidth = N.getScalarValueSizeInBits();
-  if (ConstantSDNode *Splat = isConstOrConstSplat(N))
-    return Splat->isOne() && Splat->getAPIntValue().getBitWidth() == BitWidth;
-  return false;
-}
-
-// Determines if it is a constant integer of all ones or a splatted vector of a
-// constant integer of all ones (with no undefs).
-// Do not permit build vector implicit truncation.
-static bool isAllOnesConstantOrAllOnesSplatConstant(SDValue N) {
-  N = peekThroughBitcasts(N);
-  unsigned BitWidth = N.getScalarValueSizeInBits();
-  if (ConstantSDNode *Splat = isConstOrConstSplat(N))
-    return Splat->isAllOnesValue() &&
-           Splat->getAPIntValue().getBitWidth() == BitWidth;
-  return false;
-}
-
 // Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with
 // undef's.
 static bool isAnyConstantBuildVector(const SDNode *N) {
@@ -1538,6 +1515,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::SRL:                return visitSRL(N);
   case ISD::ROTR:
   case ISD::ROTL:               return visitRotate(N);
+  case ISD::FSHL:
+  case ISD::FSHR:               return visitFunnelShift(N);
   case ISD::ABS:                return visitABS(N);
   case ISD::BSWAP:              return visitBSWAP(N);
   case ISD::BITREVERSE:         return visitBITREVERSE(N);
@@ -1913,11 +1892,10 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
   // and (select Cond, 0, -1), X --> select Cond, 0, X
   // or X, (select Cond, -1, 0) --> select Cond, -1, X
   auto BinOpcode = BO->getOpcode();
-  bool CanFoldNonConst = (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
-                         (isNullConstantOrNullSplatConstant(CT) ||
-                          isAllOnesConstantOrAllOnesSplatConstant(CT)) &&
-                         (isNullConstantOrNullSplatConstant(CF) ||
-                          isAllOnesConstantOrAllOnesSplatConstant(CF));
+  bool CanFoldNonConst =
+      (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
+      (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) &&
+      (isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF));
 
   SDValue CBO = BO->getOperand(SelOpNo ^ 1);
   if (!CanFoldNonConst &&
@@ -2086,7 +2064,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     //   add (zext i1 X), -1 -> sext (not i1 X)
     // because most (?) targets generate better code for the zext form.
     if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
-        isOneConstantOrOneSplatConstant(N1)) {
+        isOneOrOneSplat(N1)) {
       SDValue X = N0.getOperand(0);
       if ((!LegalOperations ||
            (TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
@@ -2115,13 +2093,11 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     return RADD;
 
   // fold ((0-A) + B) -> B-A
-  if (N0.getOpcode() == ISD::SUB &&
-      isNullConstantOrNullSplatConstant(N0.getOperand(0)))
+  if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
     return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
 
   // fold (A + (0-B)) -> A-B
-  if (N1.getOpcode() == ISD::SUB &&
-      isNullConstantOrNullSplatConstant(N1.getOperand(0)))
+  if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
     return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));
 
   // fold (A+(B-A)) -> B
@@ -2179,7 +2155,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     return DAG.getNode(ISD::OR, DL, VT, N0, N1);
 
   // fold (add (xor a, -1), 1) -> (sub 0, a)
-  if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1))
+  if (isBitwiseNot(N0) && isOneOrOneSplat(N1))
     return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                        N0.getOperand(0));
 
@@ -2236,7 +2212,7 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference)
 
   // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
   if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
-      isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0)))
+      isNullOrNullSplat(N1.getOperand(0).getOperand(0)))
     return DAG.getNode(ISD::SUB, DL, VT, N0,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    N1.getOperand(0).getOperand(1),
@@ -2249,8 +2225,7 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference)
 
     // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
     // and similar xforms where the inner op is either ~0 or 0.
-    if (NumSignBits == DestBits &&
-        isOneConstantOrOneSplatConstant(N1->getOperand(1)))
+    if (NumSignBits == DestBits && isOneOrOneSplat(N1->getOperand(1)))
       return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0);
   }
 
@@ -2381,7 +2356,7 @@ SDValue DAGCombiner::visitUADDO(SDNode *N) {
                      DAG.getConstant(0, DL, CarryVT));
 
   // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
-  if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1)) {
+  if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
     SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
                               DAG.getConstant(0, DL, VT),
                               N0.getOperand(0));
@@ -2586,7 +2561,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
                        DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
   }
 
-  if (isNullConstantOrNullSplatConstant(N0)) {
+  if (isNullOrNullSplat(N0)) {
     unsigned BitWidth = VT.getScalarSizeInBits();
     // Right-shifting everything out but the sign bit followed by negation is
     // the same as flipping arithmetic/logical shift type without the negation:
@@ -2617,12 +2592,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   }
 
   // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
-  if (isAllOnesConstantOrAllOnesSplatConstant(N0))
+  if (isAllOnesOrAllOnesSplat(N0))
     return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
 
   // fold (A - (0-B)) -> A+B
-  if (N1.getOpcode() == ISD::SUB &&
-      isNullConstantOrNullSplatConstant(N1.getOperand(0)))
+  if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
     return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));
 
   // fold A-(A-B) -> B
@@ -2676,14 +2650,14 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   // fold (X - (-Y * Z)) -> (X + (Y * Z))
   if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
     if (N1.getOperand(0).getOpcode() == ISD::SUB &&
-        isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) {
+        isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
       SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                 N1.getOperand(0).getOperand(1),
                                 N1.getOperand(1));
       return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
     }
     if (N1.getOperand(1).getOpcode() == ISD::SUB &&
-        isNullConstantOrNullSplatConstant(N1.getOperand(1).getOperand(0))) {
+        isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
       SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                 N1.getOperand(0),
                                 N1.getOperand(1).getOperand(1));
@@ -3560,18 +3534,16 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                                 unsigned HiOp) {
   // If the high half is not needed, just compute the low half.
   bool HiExists = N->hasAnyUseOfValue(1);
-  if (!HiExists &&
-      (!LegalOperations ||
-       TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
+  if (!HiExists && (!LegalOperations ||
+                    TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
     SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
     return CombineTo(N, Res, Res);
   }
 
   // If the low half is not needed, just compute the high half.
   bool LoExists = N->hasAnyUseOfValue(0);
-  if (!LoExists &&
-      (!LegalOperations ||
-       TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
+  if (!LoExists && (!LegalOperations ||
+                    TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
     SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
     return CombineTo(N, Res, Res);
   }
@@ -3587,7 +3559,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
     SDValue LoOpt = combine(Lo.getNode());
     if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
         (!LegalOperations ||
-         TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType())))
+         TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
       return CombineTo(N, LoOpt, LoOpt);
   }
 
@@ -3597,7 +3569,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
     SDValue HiOpt = combine(Hi.getNode());
     if (HiOpt.getNode() && HiOpt != Hi &&
         (!LegalOperations ||
-         TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType())))
+         TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
       return CombineTo(N, HiOpt, HiOpt);
   }
 
@@ -3729,59 +3701,95 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
   return SDValue();
 }
 
-/// If this is a binary operator with two operands of the same opcode, try to
-/// simplify it.
-SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
+/// If this is a bitwise logic instruction and both operands have the same
+/// opcode, try to sink the other opcode after the logic instruction.
+SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
-  assert(N0.getOpcode() == N1.getOpcode() && "Bad input!");
+  unsigned LogicOpcode = N->getOpcode();
+  unsigned HandOpcode = N0.getOpcode();
+  assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
+          LogicOpcode == ISD::XOR) && "Expected logic opcode");
+  assert(HandOpcode == N1.getOpcode() && "Bad input!");
 
   // Bail early if none of these transforms apply.
-  if (N0.getNumOperands() == 0) return SDValue();
-
-  // For each of OP in AND/OR/XOR:
-  // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
-  // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
-  // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
-  // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y))
-  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
-  //
-  // do not sink logical op inside of a vector extend, since it may combine
-  // into a vsetcc.
-  EVT Op0VT = N0.getOperand(0).getValueType();
-  if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
-       N0.getOpcode() == ISD::SIGN_EXTEND ||
-       N0.getOpcode() == ISD::BSWAP ||
-       // Avoid infinite looping with PromoteIntBinOp.
-       (N0.getOpcode() == ISD::ANY_EXTEND &&
-        (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
-       (N0.getOpcode() == ISD::TRUNCATE &&
-        (!TLI.isZExtFree(VT, Op0VT) ||
-         !TLI.isTruncateFree(Op0VT, VT)) &&
-        TLI.isTypeLegal(Op0VT))) &&
-      !VT.isVector() &&
-      Op0VT == N1.getOperand(0).getValueType() &&
-      (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
-    SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
-                                 N0.getOperand(0).getValueType(),
-                                 N0.getOperand(0), N1.getOperand(0));
-    AddToWorklist(ORNode.getNode());
-    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode);
-  }
-
-  // For each of OP in SHL/SRL/SRA/AND...
-  //   fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
-  //   fold (or  (OP x, z), (OP y, z)) -> (OP (or  x, y), z)
-  //   fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
-  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
-       N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
+  if (N0.getNumOperands() == 0)
+    return SDValue();
+
+  // FIXME: We should check number of uses of the operands to not increase
+  //        the instruction count for all transforms.
+
+  // Handle size-changing casts.
+  SDValue X = N0.getOperand(0);
+  SDValue Y = N1.getOperand(0);
+  EVT XVT = X.getValueType();
+  SDLoc DL(N);
+  if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
+      HandOpcode == ISD::SIGN_EXTEND) {
+    // If both operands have other uses, this transform would create extra
+    // instructions without eliminating anything.
+    if (!N0.hasOneUse() && !N1.hasOneUse())
+      return SDValue();
+    // We need matching integer source types.
+    // Do not hoist logic op inside of a vector extend, since it may combine
+    // into a vsetcc.
+    // TODO: Should the vector check apply to truncate though?
+    if (VT.isVector() || XVT != Y.getValueType())
+      return SDValue();
+    // Don't create an illegal op during or after legalization.
+    if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
+      return SDValue();
+    // Avoid infinite looping with PromoteIntBinOp.
+    // TODO: Should we apply desirable/legal constraints to all opcodes?
+    if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
+        !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
+      return SDValue();
+    // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
+    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+    return DAG.getNode(HandOpcode, DL, VT, Logic);
+  }
+
+  // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
+  if (HandOpcode == ISD::TRUNCATE) {
+    // If both operands have other uses, this transform would create extra
+    // instructions without eliminating anything.
+    if (!N0.hasOneUse() && !N1.hasOneUse())
+      return SDValue();
+    // We need matching source types.
+    if (XVT != Y.getValueType())
+      return SDValue();
+    // Don't create an illegal op during or after legalization.
+    if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
+      return SDValue();
+    // Be extra careful sinking truncate. If it's free, there's no benefit in
+    // widening a binop. Also, don't create a logic op on an illegal type.
+    if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
+      return SDValue();
+    if (!TLI.isTypeLegal(XVT))
+      return SDValue();
+    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+    return DAG.getNode(HandOpcode, DL, VT, Logic);
+  }
+
+  // For binops SHL/SRL/SRA/AND:
+  //   logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
+  if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
+       HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
       N0.getOperand(1) == N1.getOperand(1)) {
-    SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
-                                 N0.getOperand(0).getValueType(),
-                                 N0.getOperand(0), N1.getOperand(0));
-    AddToWorklist(ORNode.getNode());
-    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
-                       ORNode, N0.getOperand(1));
+    // If either operand has other uses, this transform is not an improvement.
+    if (!N0.hasOneUse() || !N1.hasOneUse())
+      return SDValue();
+    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+    return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
+  }
+
+  // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
+  if (HandOpcode == ISD::BSWAP) {
+    // If either operand has other uses, this transform is not an improvement.
+    if (!N0.hasOneUse() || !N1.hasOneUse())
+      return SDValue();
+    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+    return DAG.getNode(HandOpcode, DL, VT, Logic);
   }
 
   // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
@@ -3791,21 +3799,12 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   // we don't want to undo this promotion.
   // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
   // on scalars.
-  if ((N0.getOpcode() == ISD::BITCAST ||
-       N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
+  if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
        Level <= AfterLegalizeTypes) {
-    SDValue In0 = N0.getOperand(0);
-    SDValue In1 = N1.getOperand(0);
-    EVT In0Ty = In0.getValueType();
-    EVT In1Ty = In1.getValueType();
-    SDLoc DL(N);
-    // If both incoming values are integers, and the original types are the
-    // same.
-    if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
-      SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
-      SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
-      AddToWorklist(Op.getNode());
-      return BC;
+    // Input types must be integer and the same.
+    if (XVT.isInteger() && XVT == Y.getValueType()) {
+      SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+      return DAG.getNode(HandOpcode, DL, VT, Logic);
     }
   }
 
@@ -3821,55 +3820,44 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   // If both shuffles use the same mask, and both shuffles have the same first
   // or second operand, then it might still be profitable to move the shuffle
   // after the xor/and/or operation.
-  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
-    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
-    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);
-
-    assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
+  if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
+    auto *SVN0 = cast<ShuffleVectorSDNode>(N0);
+    auto *SVN1 = cast<ShuffleVectorSDNode>(N1);
+    assert(X.getValueType() == Y.getValueType() &&
            "Inputs to shuffles are not the same type");
 
     // Check that both shuffles use the same mask. The masks are known to be of
     // the same length because the result vector type is the same.
     // Check also that shuffles have only one use to avoid introducing extra
     // instructions.
-    if (SVN0->hasOneUse() && SVN1->hasOneUse() &&
-        SVN0->getMask().equals(SVN1->getMask())) {
-      SDValue ShOp = N0->getOperand(1);
-
-      // Don't try to fold this node if it requires introducing a
-      // build vector of all zeros that might be illegal at this stage.
-      if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
-        ShOp = tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
-      }
+    if (!SVN0->hasOneUse() || !SVN1->hasOneUse() ||
+        !SVN0->getMask().equals(SVN1->getMask()))
+      return SDValue();
 
-      // (AND (shuf (A, C), shuf (B, C))) -> shuf (AND (A, B), C)
-      // (OR  (shuf (A, C), shuf (B, C))) -> shuf (OR  (A, B), C)
-      // (XOR (shuf (A, C), shuf (B, C))) -> shuf (XOR (A, B), V_0)
-      if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
-        SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
-                                      N0->getOperand(0), N1->getOperand(0));
-        AddToWorklist(NewNode.getNode());
-        return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp,
-                                    SVN0->getMask());
-      }
+    // Don't try to fold this node if it requires introducing a
+    // build vector of all zeros that might be illegal at this stage.
+    SDValue ShOp = N0.getOperand(1);
+    if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
+      ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
 
-      // Don't try to fold this node if it requires introducing a
-      // build vector of all zeros that might be illegal at this stage.
-      ShOp = N0->getOperand(0);
-      if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
-        ShOp = tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
-      }
+    // (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C)
+    if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
+      SDValue Logic = DAG.getNode(LogicOpcode, DL, VT,
+                                  N0.getOperand(0), N1.getOperand(0));
+      return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask());
+    }
 
-      // (AND (shuf (C, A), shuf (C, B))) -> shuf (C, AND (A, B))
-      // (OR  (shuf (C, A), shuf (C, B))) -> shuf (C, OR  (A, B))
-      // (XOR (shuf (C, A), shuf (C, B))) -> shuf (V_0, XOR (A, B))
-      if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) {
-        SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
-                                      N0->getOperand(1), N1->getOperand(1));
-        AddToWorklist(NewNode.getNode());
-        return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode,
-                                    SVN0->getMask());
-      }
+    // Don't try to fold this node if it requires introducing a
+    // build vector of all zeros that might be illegal at this stage.
+    ShOp = N0.getOperand(0);
+    if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
+      ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
+
+    // (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B))
+    if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) {
+      SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1),
+                                  N1.getOperand(1));
+      return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask());
     }
   }
 
@@ -3905,8 +3893,8 @@ SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
   ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
   bool IsInteger = OpVT.isInteger();
   if (LR == RR && CC0 == CC1 && IsInteger) {
-    bool IsZero = isNullConstantOrNullSplatConstant(LR);
-    bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR);
+    bool IsZero = isNullOrNullSplat(LR);
+    bool IsNeg1 = isAllOnesOrAllOnesSplat(LR);
 
     // All bits clear?
     bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
@@ -4208,7 +4196,7 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
 }
 
 bool DAGCombiner::SearchForAndLoads(SDNode *N,
-                                    SmallPtrSetImpl<LoadSDNode*> &Loads,
+                                    SmallVectorImpl<LoadSDNode*> &Loads,
                                     SmallPtrSetImpl<SDNode*> &NodesWithConsts,
                                     ConstantSDNode *Mask,
                                     SDNode *&NodeToMask) {
@@ -4245,7 +4233,7 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
 
         // Use LE to convert equal sized loads to zext.
         if (ExtVT.bitsLE(Load->getMemoryVT()))
-          Loads.insert(Load);
+          Loads.push_back(Load);
 
         continue;
       }
@@ -4310,7 +4298,7 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
   if (isa<LoadSDNode>(N->getOperand(0)))
     return false;
 
-  SmallPtrSet<LoadSDNode*, 8> Loads;
+  SmallVector<LoadSDNode*, 8> Loads;
   SmallPtrSet<SDNode*, 2> NodesWithConsts;
   SDNode *FixupNode = nullptr;
   if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
@@ -4644,8 +4632,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
   // Simplify: (and (op x...), (op y...))  -> (op (and x, y))
   if (N0.getOpcode() == N1.getOpcode())
-    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
-      return Tmp;
+    if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
+      return V;
 
   // Masking the negated extension of a boolean is just the zero-extended
   // boolean:
@@ -4655,7 +4643,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   // Note: the SimplifyDemandedBits fold below can make an information-losing
   // transform, and then we have no way to find this better fold.
   if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
-    if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) {
+    if (isNullOrNullSplat(N0.getOperand(0))) {
       SDValue SubRHS = N0.getOperand(1);
       if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
           SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
@@ -5203,8 +5191,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
 
   // Simplify: (or (op x...), (op y...))  -> (op (or x, y))
   if (N0.getOpcode() == N1.getOpcode())
-    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
-      return Tmp;
+    if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
+      return V;
 
   // See if this is some rotate idiom.
   if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
@@ -5953,7 +5941,7 @@ SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
   assert(N->getOpcode() == ISD::XOR);
 
   // Don't touch 'not' (i.e. where y = -1).
-  if (isAllOnesConstantOrAllOnesSplatConstant(N->getOperand(1)))
+  if (isAllOnesOrAllOnesSplat(N->getOperand(1)))
     return SDValue();
 
   EVT VT = N->getValueType(0);
@@ -5970,7 +5958,7 @@ SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
     SDValue Xor0 = Xor.getOperand(0);
     SDValue Xor1 = Xor.getOperand(1);
     // Don't touch 'not' (i.e. where y = -1).
-    if (isAllOnesConstantOrAllOnesSplatConstant(Xor1))
+    if (isAllOnesOrAllOnesSplat(Xor1))
       return false;
     if (Other == Xor0)
       std::swap(Xor0, Xor1);
@@ -6036,8 +6024,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   }
 
   // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
+  SDLoc DL(N);
   if (N0.isUndef() && N1.isUndef())
-    return DAG.getConstant(0, SDLoc(N), VT);
+    return DAG.getConstant(0, DL, VT);
   // fold (xor x, undef) -> undef
   if (N0.isUndef())
     return N0;
@@ -6047,11 +6036,11 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
   if (N0C && N1C)
-    return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C);
+    return DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, N0C, N1C);
   // canonicalize constant to RHS
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
-    return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
+    return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
   // fold (xor x, 0) -> x
   if (isNullConstant(N1))
     return N0;
@@ -6060,19 +6049,18 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     return NewSel;
 
   // reassociate xor
-  if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue RXOR = ReassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
     return RXOR;
 
   // fold !(x cc y) -> (x !cc y)
+  unsigned N0Opcode = N0.getOpcode();
   SDValue LHS, RHS, CC;
   if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
-    bool isInt = LHS.getValueType().isInteger();
     ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
-                                               isInt);
-
+                                               LHS.getValueType().isInteger());
     if (!LegalOperations ||
         TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
-      switch (N0.getOpcode()) {
+      switch (N0Opcode) {
       default:
         llvm_unreachable("Unhandled SetCC Equivalent!");
       case ISD::SETCC:
@@ -6085,54 +6073,74 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   }
 
   // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
-  if (isOneConstant(N1) && N0.getOpcode() == ISD::ZERO_EXTEND &&
-      N0.getNode()->hasOneUse() &&
+  if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() &&
       isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
     SDValue V = N0.getOperand(0);
-    SDLoc DL(N0);
-    V = DAG.getNode(ISD::XOR, DL, V.getValueType(), V,
-                    DAG.getConstant(1, DL, V.getValueType()));
+    SDLoc DL0(N0);
+    V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V,
+                    DAG.getConstant(1, DL0, V.getValueType()));
     AddToWorklist(V.getNode());
-    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V);
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V);
   }
 
   // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
   if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
-      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+      (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
     SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
     if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
-      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+      unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
       LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
       RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
       AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
-      return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
+      return DAG.getNode(NewOpcode, DL, VT, LHS, RHS);
     }
   }
   // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
   if (isAllOnesConstant(N1) && N0.hasOneUse() &&
-      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+      (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
     SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
     if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
-      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+      unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
       LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
       RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
       AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
-      return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
+      return DAG.getNode(NewOpcode, DL, VT, LHS, RHS);
     }
   }
   // fold (xor (and x, y), y) -> (and (not x), y)
-  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
-      N0->getOperand(1) == N1) {
-    SDValue X = N0->getOperand(0);
+  if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) {
+    SDValue X = N0.getOperand(0);
     SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
     AddToWorklist(NotX.getNode());
-    return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);
+    return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
+  }
+
+  if ((N0Opcode == ISD::SRL || N0Opcode == ISD::SHL) && N0.hasOneUse()) {
+    ConstantSDNode *XorC = isConstOrConstSplat(N1);
+    ConstantSDNode *ShiftC = isConstOrConstSplat(N0.getOperand(1));
+    unsigned BitWidth = VT.getScalarSizeInBits();
+    if (XorC && ShiftC) {
+      // Don't crash on an oversized shift. We can not guarantee that a bogus
+      // shift has been simplified to undef.
+      uint64_t ShiftAmt = ShiftC->getLimitedValue();
+      if (ShiftAmt < BitWidth) {
+        APInt Ones = APInt::getAllOnesValue(BitWidth);
+        Ones = N0Opcode == ISD::SHL ? Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt);
+        if (XorC->getAPIntValue() == Ones) {
+          // If the xor constant is a shifted -1, do a 'not' before the shift:
+          // xor (X << ShiftC), XorC --> (not X) << ShiftC
+          // xor (X >> ShiftC), XorC --> (not X) >> ShiftC
+          SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
+          return DAG.getNode(N0Opcode, DL, VT, Not, N0.getOperand(1));
+        }
+      }
+    }
   }
 
   // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
   if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
-    SDValue A = N0.getOpcode() == ISD::ADD ? N0 : N1;
-    SDValue S = N0.getOpcode() == ISD::SRA ? N0 : N1;
+    SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
+    SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
     if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
       SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
       SDValue S0 = S.getOperand(0);
@@ -6140,14 +6148,14 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
         unsigned OpSizeInBits = VT.getScalarSizeInBits();
         if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
           if (C->getAPIntValue() == (OpSizeInBits - 1))
-            return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
+            return DAG.getNode(ISD::ABS, DL, VT, S0);
       }
     }
   }
 
   // fold (xor x, x) -> 0
   if (N0 == N1)
-    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
+    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
 
   // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
   // Here is a concrete example of this equivalence:
@@ -6167,17 +6175,16 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   //   consistent result.
   // - Pushing the zero left requires shifting one bits in from the right.
   // A rotate left of ~1 is a nice way of achieving the desired result.
-  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0.getOpcode() == ISD::SHL
-      && isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
-    SDLoc DL(N);
+  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL &&
+      isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
     return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
                        N0.getOperand(1));
   }
 
   // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
-  if (N0.getOpcode() == N1.getOpcode())
-    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
-      return Tmp;
+  if (N0Opcode == N1.getOpcode())
+    if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
+      return V;
 
   // Unfold  ((x ^ y) & m) ^ y  into  (x & m) | (y & ~m)  if profitable
   if (SDValue MM = unfoldMaskedMerge(N))
@@ -6193,6 +6200,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
 /// Handle transforms common to the three shifts, when the shift amount is a
 /// constant.
 SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
+  // Do not turn a 'not' into a regular xor.
+  if (isBitwiseNot(N->getOperand(0)))
+    return SDValue();
+
   SDNode *LHS = N->getOperand(0).getNode();
   if (!LHS->hasOneUse()) return SDValue();
 
@@ -6298,9 +6309,16 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
   unsigned Bitsize = VT.getScalarSizeInBits();
 
   // fold (rot x, 0) -> x
-  if (isNullConstantOrNullSplatConstant(N1))
+  if (isNullOrNullSplat(N1))
     return N0;
 
+  // fold (rot x, c) -> x iff (c % BitSize) == 0
+  if (isPowerOf2_32(Bitsize) && Bitsize > 1) {
+    APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1);
+    if (DAG.MaskedValueIsZero(N1, ModuloMask))
+      return N0;
+  }
+
   // fold (rot x, c) -> (rot x, c % BitSize)
   if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) {
     if (Cst->getAPIntValue().uge(Bitsize)) {
@@ -6343,6 +6361,9 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
 SDValue DAGCombiner::visitSHL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+  if (SDValue V = DAG.simplifyShift(N0, N1))
+    return V;
+
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
@@ -6377,22 +6398,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
-  // fold (shl 0, x) -> 0
-  if (isNullConstantOrNullSplatConstant(N0))
-    return N0;
-  // fold (shl x, c >= size(x)) -> undef
-  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
-  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
-    return Val->getAPIntValue().uge(OpSizeInBits);
-  };
-  if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
-    return DAG.getUNDEF(VT);
-  // fold (shl x, 0) -> x
-  if (N1C && N1C->isNullValue())
-    return N0;
-  // fold (shl undef, x) -> 0
-  if (N0.isUndef())
-    return DAG.getConstant(0, SDLoc(N), VT);
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
@@ -6582,6 +6587,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
 SDValue DAGCombiner::visitSRA(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+  if (SDValue V = DAG.simplifyShift(N0, N1))
+    return V;
+
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
@@ -6602,16 +6610,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
-  // fold (sra x, c >= size(x)) -> undef
-  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
-  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
-    return Val->getAPIntValue().uge(OpSizeInBits);
-  };
-  if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
-    return DAG.getUNDEF(VT);
-  // fold (sra x, 0) -> x
-  if (N1C && N1C->isNullValue())
-    return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
@@ -6748,6 +6746,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
 SDValue DAGCombiner::visitSRL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+  if (SDValue V = DAG.simplifyShift(N0, N1))
+    return V;
+
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
@@ -6762,19 +6763,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
-  // fold (srl 0, x) -> 0
-  if (isNullConstantOrNullSplatConstant(N0))
-    return N0;
-  // fold (srl x, c >= size(x)) -> undef
-  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
-  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
-    return Val->getAPIntValue().uge(OpSizeInBits);
-  };
-  if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
-    return DAG.getUNDEF(VT);
-  // fold (srl x, 0) -> x
-  if (N1C && N1C->isNullValue())
-    return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
@@ -6965,6 +6953,41 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  bool IsFSHL = N->getOpcode() == ISD::FSHL;
+  unsigned BitWidth = VT.getScalarSizeInBits();
+
+  // fold (fshl N0, N1, 0) -> N0
+  // fold (fshr N0, N1, 0) -> N1
+  if (isPowerOf2_32(BitWidth))
+    if (DAG.MaskedValueIsZero(
+            N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1)))
+      return IsFSHL ? N0 : N1;
+
+  // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
+  if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
+    if (Cst->getAPIntValue().uge(BitWidth)) {
+      uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
+      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
+                         DAG.getConstant(RotAmt, SDLoc(N), N2.getValueType()));
+    }
+  }
+
+  // fold (fshl N0, N0, N2) -> (rotl N0, N2)
+  // fold (fshr N0, N0, N2) -> (rotr N0, N2)
+  // TODO: Investigate flipping this rotate if only one is legal, if funnel shift
+  // is legal as well we might be better off avoiding non-constant (BW - N2).
+  unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
+  if (N0 == N1 && hasOperation(RotOpc, VT))
+    return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitABS(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -7231,15 +7254,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   EVT VT0 = N0.getValueType();
   SDLoc DL(N);
 
-  // fold (select C, X, X) -> X
-  if (N1 == N2)
-    return N1;
-
-  if (const ConstantSDNode *N0C = dyn_cast<const ConstantSDNode>(N0)) {
-    // fold (select true, X, Y) -> X
-    // fold (select false, X, Y) -> Y
-    return !N0C->isNullValue() ? N1 : N2;
-  }
+  if (SDValue V = DAG.simplifySelect(N0, N1, N2))
+    return V;
 
   // fold (select X, X, Y) -> (or X, Y)
   // fold (select X, 1, Y) -> (or C, Y)
@@ -7818,9 +7834,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   SDValue N2 = N->getOperand(2);
   SDLoc DL(N);
 
-  // fold (vselect C, X, X) -> X
-  if (N1 == N2)
-    return N1;
+  if (SDValue V = DAG.simplifySelect(N0, N1, N2))
+    return V;
 
   // Canonicalize integer abs.
   // vselect (setg[te] X,  0),  X, -X ->
@@ -7874,7 +7889,7 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
     // TODO: This should be extended to handle any constant.
     // TODO: This could be extended to handle non-loading patterns, but that
     //       requires thorough testing to avoid regressions.
-    if (isNullConstantOrNullSplatConstant(RHS)) {
+    if (isNullOrNullSplat(RHS)) {
       EVT NarrowVT = LHS.getValueType();
       EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
       EVT SetCCVT = getSetCCResultType(LHS.getValueType());
@@ -8017,9 +8032,8 @@ SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
 /// Vector extends are not folded if operations are legal; this is to
 /// avoid introducing illegal build_vector dag nodes.
-static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
-                                         SelectionDAG &DAG, bool LegalTypes,
-                                         bool LegalOperations) {
+static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
+                                         SelectionDAG &DAG, bool LegalTypes) {
   unsigned Opcode = N->getOpcode();
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -8033,16 +8047,15 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
   // fold (zext c1) -> c1
   // fold (aext c1) -> c1
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode();
+    return DAG.getNode(Opcode, SDLoc(N), VT, N0);
 
   // fold (sext (build_vector AllConstants) -> (build_vector AllConstants)
   // fold (zext (build_vector AllConstants) -> (build_vector AllConstants)
   // fold (aext (build_vector AllConstants) -> (build_vector AllConstants)
   EVT SVT = VT.getScalarType();
-  if (!(VT.isVector() &&
-      (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) &&
+  if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) &&
       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
-    return nullptr;
+    return SDValue();
 
   // We can fold this node into a build_vector.
   unsigned VTBits = SVT.getSizeInBits();
@@ -8068,7 +8081,7 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
       Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
   }
 
-  return DAG.getBuildVector(VT, DL, Elts).getNode();
+  return DAG.getBuildVector(VT, DL, Elts);
 }
 
 // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
@@ -8474,9 +8487,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
-  if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
-                                              LegalOperations))
-    return SDValue(Res, 0);
+  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+    return Res;
 
   // fold (sext (sext x)) -> (sext x)
   // fold (sext (aext x)) -> (sext x)
@@ -8613,21 +8625,24 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       // if this is the case.
       EVT SVT = getSetCCResultType(N00VT);
 
-      // We know that the # elements of the results is the same as the
-      // # elements of the compare (and the # elements of the compare result
-      // for that matter).  Check to see that they are the same size.  If so,
-      // we know that the element size of the sext'd result matches the
-      // element size of the compare operands.
-      if (VT.getSizeInBits() == SVT.getSizeInBits())
-        return DAG.getSetCC(DL, VT, N00, N01, CC);
-
-      // If the desired elements are smaller or larger than the source
-      // elements, we can use a matching integer vector type and then
-      // truncate/sign extend.
-      EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
-      if (SVT == MatchingVecType) {
-        SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
-        return DAG.getSExtOrTrunc(VsetCC, DL, VT);
+      // If we already have the desired type, don't change it.
+      if (SVT != N0.getValueType()) {
+        // We know that the # elements of the results is the same as the
+        // # elements of the compare (and the # elements of the compare result
+        // for that matter).  Check to see that they are the same size.  If so,
+        // we know that the element size of the sext'd result matches the
+        // element size of the compare operands.
+        if (VT.getSizeInBits() == SVT.getSizeInBits())
+          return DAG.getSetCC(DL, VT, N00, N01, CC);
+
+        // If the desired elements are smaller or larger than the source
+        // elements, we can use a matching integer vector type and then
+        // truncate/sign extend.
+        EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
+        if (SVT == MatchingVecType) {
+          SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
+          return DAG.getSExtOrTrunc(VsetCC, DL, VT);
+        }
       }
     }
 
@@ -8697,9 +8712,9 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
   SDValue Op1 = N->getOperand(1);
   assert(Op0.getValueType() == Op1.getValueType());
 
-  if (isNullConstantOrNullSplatConstant(Op0))
+  if (isNullOrNullSplat(Op0))
     Op = Op1;
-  else if (isNullConstantOrNullSplatConstant(Op1))
+  else if (isNullOrNullSplat(Op1))
     Op = Op0;
   else
     return false;
@@ -8713,9 +8728,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
-                                              LegalOperations))
-    return SDValue(Res, 0);
+  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+    return Res;
 
   // fold (zext (zext x)) -> (zext x)
   // fold (zext (aext x)) -> (zext x)
@@ -8963,9 +8977,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
-                                              LegalOperations))
-    return SDValue(Res, 0);
+  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+    return Res;
 
   // fold (aext (aext x)) -> (aext x)
   // fold (aext (zext x)) -> (zext x)
@@ -9136,6 +9149,26 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {
     return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
   }
 
+  // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
+  // than X. Just move the AssertZext in front of the truncate and drop the
+  // AssertSExt.
+  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+      N0.getOperand(0).getOpcode() == ISD::AssertSext &&
+      Opcode == ISD::AssertZext) {
+    SDValue BigA = N0.getOperand(0);
+    EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+    assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
+           "Asserting zero/sign-extended bits to a type larger than the "
+           "truncated destination does not provide information");
+
+    if (AssertVT.bitsLT(BigA_AssertVT)) {
+      SDLoc DL(N);
+      SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
+                                      BigA.getOperand(0), N1);
+      return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
+    }
+  }
+
   return SDValue();
 }
 
@@ -9490,9 +9523,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
   if (N0.isUndef())
     return DAG.getUNDEF(VT);
 
-  if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
-                                              LegalOperations))
-    return SDValue(Res, 0);
+  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+    return Res;
+
+  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+    return SDValue(N, 0);
 
   return SDValue();
 }
@@ -9504,9 +9539,11 @@ SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
   if (N0.isUndef())
     return DAG.getUNDEF(VT);
 
-  if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
-                                              LegalOperations))
-    return SDValue(Res, 0);
+  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+    return Res;
+
+  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+    return SDValue(N, 0);
 
   return SDValue();
 }
@@ -9781,6 +9818,33 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
+  // Narrow a suitable binary operation with a non-opaque constant operand by
+  // moving it ahead of the truncate. This is limited to pre-legalization
+  // because targets may prefer a wider type during later combines and invert
+  // this transform.
+  switch (N0.getOpcode()) {
+  // TODO: Add case for ADD - that will likely require a change in logic here
+  // or target-specific changes to avoid regressions.
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    if (!LegalOperations && N0.hasOneUse() &&
+        (isConstantOrConstantVector(N0.getOperand(0), true) ||
+         isConstantOrConstantVector(N0.getOperand(1), true))) {
+      // TODO: We already restricted this to pre-legalization, but for vectors
+      // we are extra cautious to not create an unsupported operation.
+      // Target-specific changes are likely needed to avoid regressions here.
+      if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
+        SDLoc DL(N);
+        SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
+        SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
+        return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+      }
+    }
+  }
+
   return SDValue();
 }
 
@@ -12861,6 +12925,20 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   bool STCoversLD =
       (Offset >= 0) &&
       (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+
+  auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
+    if (LD->isIndexed()) {
+      bool IsSub = (LD->getAddressingMode() == ISD::PRE_DEC ||
+                    LD->getAddressingMode() == ISD::POST_DEC);
+      unsigned Opc = IsSub ? ISD::SUB : ISD::ADD;
+      SDValue Idx = DAG.getNode(Opc, SDLoc(LD), LD->getOperand(1).getValueType(),
+                             LD->getOperand(1), LD->getOperand(2));
+      SDValue Ops[] = {Val, Idx, Chain};
+      return CombineTo(LD, Ops, 3);
+    }
+    return CombineTo(LD, Val, Chain);
+  };
+
   if (!STCoversLD)
     return SDValue();
 
@@ -12868,7 +12946,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
     if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
-      return CombineTo(LD, ST->getValue(), Chain);
+      return ReplaceLd(LD, ST->getValue(), Chain);
     // Can we model the truncate and extension with an and mask?
     if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
         !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
@@ -12878,7 +12956,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
                                                STMemType.getSizeInBits()),
                           SDLoc(ST), STType);
       auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
-      return CombineTo(LD, Val, Chain);
+      return ReplaceLd(LD, Val, Chain);
     }
   }
 
@@ -12903,7 +12981,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
     }
     if (!extendLoadedValueToExtension(LD, Val))
       continue;
-    return CombineTo(LD, Val, Chain);
+    return ReplaceLd(LD, Val, Chain);
   } while (false);
 
   // On failure, cleanup dead nodes we may have created.
@@ -15312,6 +15390,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     return InVec;
 
   EVT VT = InVec.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
 
   // Remove redundant insertions:
   // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
@@ -15324,7 +15403,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     // If this is variable insert to undef vector, it might be better to splat:
     // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
     if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
-      SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
+      SmallVector<SDValue, 8> Ops(NumElts, InVal);
       return DAG.getBuildVector(VT, DL, Ops);
     }
     return SDValue();
@@ -15369,11 +15448,11 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     Ops.append(InVec.getNode()->op_begin(),
                InVec.getNode()->op_end());
   } else if (InVec.isUndef()) {
-    unsigned NElts = VT.getVectorNumElements();
-    Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
+    Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
   } else {
     return SDValue();
   }
+  assert(Ops.size() == NumElts && "Unexpected vector size");
 
   // Insert the element
   if (Elt < Ops.size()) {
@@ -15387,8 +15466,9 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   return DAG.getBuildVector(VT, DL, Ops);
 }
 
-SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
-    SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) {
+SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
+                                                  SDValue EltNo,
+                                                  LoadSDNode *OriginalLoad) {
   assert(!OriginalLoad->isVolatile());
 
   EVT ResultVT = EVE->getValueType(0);
@@ -15471,43 +15551,52 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
 }
 
 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
-  SDValue InVec = N->getOperand(0);
-  EVT VT = InVec.getValueType();
-  EVT NVT = N->getValueType(0);
-  if (InVec.isUndef())
-    return DAG.getUNDEF(NVT);
+  SDValue VecOp = N->getOperand(0);
+  SDValue Index = N->getOperand(1);
+  EVT ScalarVT = N->getValueType(0);
+  EVT VecVT = VecOp.getValueType();
+  if (VecOp.isUndef())
+    return DAG.getUNDEF(ScalarVT);
+
+  // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
+  //
+  // This only really matters if the index is non-constant since other combines
+  // on the constant elements already work.
+  SDLoc DL(N);
+  if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT &&
+      Index == VecOp.getOperand(2)) {
+    SDValue Elt = VecOp.getOperand(1);
+    return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt;
+  }
 
   // (vextract (scalar_to_vector val, 0) -> val
-  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+  if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) {
     // Check if the result type doesn't match the inserted element type. A
     // SCALAR_TO_VECTOR may truncate the inserted element and the
     // EXTRACT_VECTOR_ELT may widen the extracted vector.
-    SDValue InOp = InVec.getOperand(0);
-    if (InOp.getValueType() != NVT) {
-      assert(InOp.getValueType().isInteger() && NVT.isInteger());
-      return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT);
+    SDValue InOp = VecOp.getOperand(0);
+    if (InOp.getValueType() != ScalarVT) {
+      assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
+      return DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
     }
     return InOp;
   }
 
-  SDValue EltNo = N->getOperand(1);
-  ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
-
   // extract_vector_elt of out-of-bounds element -> UNDEF
-  if (ConstEltNo && ConstEltNo->getAPIntValue().uge(VT.getVectorNumElements()))
-    return DAG.getUNDEF(NVT);
+  auto *IndexC = dyn_cast<ConstantSDNode>(Index);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  if (IndexC && IndexC->getAPIntValue().uge(NumElts))
+    return DAG.getUNDEF(ScalarVT);
 
   // extract_vector_elt (build_vector x, y), 1 -> y
-  if (ConstEltNo &&
-      InVec.getOpcode() == ISD::BUILD_VECTOR &&
-      TLI.isTypeLegal(VT) &&
-      (InVec.hasOneUse() ||
-       TLI.aggressivelyPreferBuildVectorSources(VT))) {
-    SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue());
+  if (IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR &&
+      TLI.isTypeLegal(VecVT) &&
+      (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
+    SDValue Elt = VecOp.getOperand(IndexC->getZExtValue());
     EVT InEltVT = Elt.getValueType();
 
     // Sometimes build_vector's scalar input types do not match result type.
-    if (NVT == InEltVT)
+    if (ScalarVT == InEltVT)
       return Elt;
 
     // TODO: It may be useful to truncate if free if the build_vector implicitly
@@ -15517,27 +15606,27 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   // TODO: These transforms should not require the 'hasOneUse' restriction, but
   // there are regressions on multiple targets without it. We can end up with a
   // mess of scalar and vector code if we reduce only part of the DAG to scalar.
-  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && VT.isInteger() &&
-      InVec.hasOneUse()) {
+  if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() &&
+      VecOp.hasOneUse()) {
     // The vector index of the LSBs of the source depend on the endian-ness.
     bool IsLE = DAG.getDataLayout().isLittleEndian();
-    unsigned ExtractIndex = ConstEltNo->getZExtValue();
+    unsigned ExtractIndex = IndexC->getZExtValue();
     // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
-    unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1;
-    SDValue BCSrc = InVec.getOperand(0);
+    unsigned BCTruncElt = IsLE ? 0 : NumElts - 1;
+    SDValue BCSrc = VecOp.getOperand(0);
     if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
-      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
+      return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc);
 
     if (LegalTypes && BCSrc.getValueType().isInteger() &&
         BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
       // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
       // trunc i64 X to i32
       SDValue X = BCSrc.getOperand(0);
-      assert(X.getValueType().isScalarInteger() && NVT.isScalarInteger() &&
+      assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
              "Extract element and scalar to vector can't change element type "
              "from FP to integer.");
       unsigned XBitWidth = X.getValueSizeInBits();
-      unsigned VecEltBitWidth = VT.getScalarSizeInBits();
+      unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
       BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
 
       // An extract element return value type can be wider than its vector
@@ -15546,51 +15635,40 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
       if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
         assert(XBitWidth % VecEltBitWidth == 0 &&
                "Scalar bitwidth must be a multiple of vector element bitwidth");
-        return DAG.getAnyExtOrTrunc(X, SDLoc(N), NVT);
+        return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
       }
     }
   }
 
-  // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
-  //
-  // This only really matters if the index is non-constant since other combines
-  // on the constant elements already work.
-  if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT &&
-      EltNo == InVec.getOperand(2)) {
-    SDValue Elt = InVec.getOperand(1);
-    return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt;
-  }
-
   // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
   // We only perform this optimization before the op legalization phase because
   // we may introduce new vector instructions which are not backed by TD
   // patterns. For example on AVX, extracting elements from a wide vector
   // without using extract_subvector. However, if we can find an underlying
   // scalar value, then we can always use that.
-  if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) {
-    int NumElem = VT.getVectorNumElements();
-    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec);
+  if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    auto *Shuf = cast<ShuffleVectorSDNode>(VecOp);
     // Find the new index to extract from.
-    int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue());
+    int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue());
 
     // Extracting an undef index is undef.
     if (OrigElt == -1)
-      return DAG.getUNDEF(NVT);
+      return DAG.getUNDEF(ScalarVT);
 
     // Select the right vector half to extract from.
     SDValue SVInVec;
-    if (OrigElt < NumElem) {
-      SVInVec = InVec->getOperand(0);
+    if (OrigElt < (int)NumElts) {
+      SVInVec = VecOp.getOperand(0);
     } else {
-      SVInVec = InVec->getOperand(1);
-      OrigElt -= NumElem;
+      SVInVec = VecOp.getOperand(1);
+      OrigElt -= NumElts;
     }
 
     if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
       SDValue InOp = SVInVec.getOperand(OrigElt);
-      if (InOp.getValueType() != NVT) {
-        assert(InOp.getValueType().isInteger() && NVT.isInteger());
-        InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT);
+      if (InOp.getValueType() != ScalarVT) {
+        assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
+        InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
       }
 
       return InOp;
@@ -15601,136 +15679,132 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
 
     if (!LegalOperations ||
         // FIXME: Should really be just isOperationLegalOrCustom.
-        TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) ||
-        TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) {
+        TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
+        TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
       EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,
-                         DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy));
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
+                         DAG.getConstant(OrigElt, DL, IndexTy));
     }
   }
 
   // If only EXTRACT_VECTOR_ELT nodes use the source vector we can
   // simplify it based on the (valid) extraction indices.
-  if (llvm::all_of(InVec->uses(), [&](SDNode *Use) {
+  if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) {
         return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-               Use->getOperand(0) == InVec &&
+               Use->getOperand(0) == VecOp &&
                isa<ConstantSDNode>(Use->getOperand(1));
       })) {
-    APInt DemandedElts = APInt::getNullValue(VT.getVectorNumElements());
-    for (SDNode *Use : InVec->uses()) {
+    APInt DemandedElts = APInt::getNullValue(NumElts);
+    for (SDNode *Use : VecOp->uses()) {
       auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
-      if (CstElt->getAPIntValue().ult(VT.getVectorNumElements()))
+      if (CstElt->getAPIntValue().ult(NumElts))
         DemandedElts.setBit(CstElt->getZExtValue());
     }
-    if (SimplifyDemandedVectorElts(InVec, DemandedElts, true))
+    if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) {
+      // We simplified the vector operand of this extract element. If this
+      // extract is not dead, visit it again so it is folded properly.
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        AddToWorklist(N);
       return SDValue(N, 0);
+    }
   }
 
-  bool BCNumEltsChanged = false;
-  EVT ExtVT = VT.getVectorElementType();
-  EVT LVT = ExtVT;
-
+  // Everything under here is trying to match an extract of a loaded value.
   // If the result of load has to be truncated, then it's not necessarily
   // profitable.
-  if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
+  bool BCNumEltsChanged = false;
+  EVT ExtVT = VecVT.getVectorElementType();
+  EVT LVT = ExtVT;
+  if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT))
     return SDValue();
 
-  if (InVec.getOpcode() == ISD::BITCAST) {
+  if (VecOp.getOpcode() == ISD::BITCAST) {
     // Don't duplicate a load with other uses.
-    if (!InVec.hasOneUse())
+    if (!VecOp.hasOneUse())
       return SDValue();
 
-    EVT BCVT = InVec.getOperand(0).getValueType();
+    EVT BCVT = VecOp.getOperand(0).getValueType();
     if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
       return SDValue();
-    if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
+    if (NumElts != BCVT.getVectorNumElements())
       BCNumEltsChanged = true;
-    InVec = InVec.getOperand(0);
+    VecOp = VecOp.getOperand(0);
     ExtVT = BCVT.getVectorElementType();
   }
 
   // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size)
-  if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() &&
-      ISD::isNormalLoad(InVec.getNode()) &&
-      !N->getOperand(1)->hasPredecessor(InVec.getNode())) {
+  if (!LegalOperations && !IndexC && VecOp.hasOneUse() &&
+      ISD::isNormalLoad(VecOp.getNode()) &&
+      !N->getOperand(1)->hasPredecessor(VecOp.getNode())) {
     SDValue Index = N->getOperand(1);
-    if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec)) {
-      if (!OrigLoad->isVolatile()) {
-        return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
-                                                             OrigLoad);
-      }
-    }
+    if (auto *OrigLoad = dyn_cast<LoadSDNode>(VecOp))
+      if (!OrigLoad->isVolatile())
+        return scalarizeExtractedVectorLoad(N, VecVT, Index, OrigLoad);
   }
 
   // Perform only after legalization to ensure build_vector / vector_shuffle
   // optimizations have already been done.
-  if (!LegalOperations) return SDValue();
+  if (!LegalOperations || !IndexC)
+    return SDValue();
 
   // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
   // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
   // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
+  int Elt = IndexC->getZExtValue();
+  LoadSDNode *LN0 = nullptr;
+  if (ISD::isNormalLoad(VecOp.getNode())) {
+    LN0 = cast<LoadSDNode>(VecOp);
+  } else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+             VecOp.getOperand(0).getValueType() == ExtVT &&
+             ISD::isNormalLoad(VecOp.getOperand(0).getNode())) {
+    // Don't duplicate a load with other uses.
+    if (!VecOp.hasOneUse())
+      return SDValue();
 
-  if (ConstEltNo) {
-    int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+    LN0 = cast<LoadSDNode>(VecOp.getOperand(0));
+  }
+  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) {
+    // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
+    // =>
+    // (load $addr+1*size)
 
-    LoadSDNode *LN0 = nullptr;
-    const ShuffleVectorSDNode *SVN = nullptr;
-    if (ISD::isNormalLoad(InVec.getNode())) {
-      LN0 = cast<LoadSDNode>(InVec);
-    } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-               InVec.getOperand(0).getValueType() == ExtVT &&
-               ISD::isNormalLoad(InVec.getOperand(0).getNode())) {
-      // Don't duplicate a load with other uses.
-      if (!InVec.hasOneUse())
-        return SDValue();
+    // Don't duplicate a load with other uses.
+    if (!VecOp.hasOneUse())
+      return SDValue();
 
-      LN0 = cast<LoadSDNode>(InVec.getOperand(0));
-    } else if ((SVN = dyn_cast<ShuffleVectorSDNode>(InVec))) {
-      // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
-      // =>
-      // (load $addr+1*size)
+    // If the bit convert changed the number of elements, it is unsafe
+    // to examine the mask.
+    if (BCNumEltsChanged)
+      return SDValue();
 
-      // Don't duplicate a load with other uses.
-      if (!InVec.hasOneUse())
-        return SDValue();
+    // Select the input vector, guarding against out of range extract vector.
+    int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt);
+    VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1);
 
-      // If the bit convert changed the number of elements, it is unsafe
-      // to examine the mask.
-      if (BCNumEltsChanged)
+    if (VecOp.getOpcode() == ISD::BITCAST) {
+      // Don't duplicate a load with other uses.
+      if (!VecOp.hasOneUse())
         return SDValue();
 
-      // Select the input vector, guarding against out of range extract vector.
-      unsigned NumElems = VT.getVectorNumElements();
-      int Idx = (Elt > (int)NumElems) ? -1 : SVN->getMaskElt(Elt);
-      InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1);
-
-      if (InVec.getOpcode() == ISD::BITCAST) {
-        // Don't duplicate a load with other uses.
-        if (!InVec.hasOneUse())
-          return SDValue();
-
-        InVec = InVec.getOperand(0);
-      }
-      if (ISD::isNormalLoad(InVec.getNode())) {
-        LN0 = cast<LoadSDNode>(InVec);
-        Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
-        EltNo = DAG.getConstant(Elt, SDLoc(EltNo), EltNo.getValueType());
-      }
+      VecOp = VecOp.getOperand(0);
     }
+    if (ISD::isNormalLoad(VecOp.getNode())) {
+      LN0 = cast<LoadSDNode>(VecOp);
+      Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts;
+      Index = DAG.getConstant(Elt, DL, Index.getValueType());
+    }
+  }
 
-    // Make sure we found a non-volatile load and the extractelement is
-    // the only use.
-    if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
-      return SDValue();
-
-    // If Idx was -1 above, Elt is going to be -1, so just return undef.
-    if (Elt == -1)
-      return DAG.getUNDEF(LVT);
+  // Make sure we found a non-volatile load and the extractelement is
+  // the only use.
+  if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
+    return SDValue();
 
-    return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0);
-  }
+  // If Idx was -1 above, Elt is going to be -1, so just return undef.
+  if (Elt == -1)
+    return DAG.getUNDEF(LVT);
 
-  return SDValue();
+  return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0);
 }
 
 // Simplify (build_vec (ext )) to (bitcast (build_vec ))
@@ -16454,11 +16528,19 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
     SDValue In = N->getOperand(0);
     assert(In.getValueType().isVector() && "Must concat vectors");
 
-    // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).
-    if (In->getOpcode() == ISD::BITCAST &&
-        !In->getOperand(0).getValueType().isVector()) {
-      SDValue Scalar = In->getOperand(0);
+    SDValue Scalar = peekThroughOneUseBitcasts(In);
 
+    // concat_vectors(scalar_to_vector(scalar), undef) ->
+    //     scalar_to_vector(scalar)
+    if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+         Scalar.hasOneUse()) {
+      EVT SVT = Scalar.getValueType().getVectorElementType();
+      if (SVT == Scalar.getOperand(0).getValueType())
+        Scalar = Scalar.getOperand(0);
+    }
+
+    // concat_vectors(scalar, undef) -> scalar_to_vector(scalar)
+    if (!Scalar.getValueType().isVector()) {
       // If the bitcast type isn't legal, it might be a trunc of a legal type;
       // look through the trunc so we can still do the transform:
       //   concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
@@ -16467,7 +16549,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
           TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
         Scalar = Scalar->getOperand(0);
 
-      EVT SclTy = Scalar->getValueType(0);
+      EVT SclTy = Scalar.getValueType();
 
       if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
         return SDValue();
@@ -16612,47 +16694,56 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   if (!ISD::isBinaryOp(BinOp.getNode()))
     return SDValue();
 
-  // The binop must be a vector type, so we can chop it in half.
+  // The binop must be a vector type, so we can extract some fraction of it.
   EVT WideBVT = BinOp.getValueType();
   if (!WideBVT.isVector())
     return SDValue();
 
   EVT VT = Extract->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
   unsigned ExtractIndex = ExtractIndexC->getZExtValue();
-  assert(ExtractIndex % NumElems == 0 &&
+  assert(ExtractIndex % VT.getVectorNumElements() == 0 &&
          "Extract index is not a multiple of the vector length.");
-  EVT SrcVT = Extract->getOperand(0).getValueType();
 
   // Bail out if this is not a proper multiple width extraction.
-  unsigned NumSrcElems = SrcVT.getVectorNumElements();
-  if (NumSrcElems % NumElems != 0)
+  unsigned WideWidth = WideBVT.getSizeInBits();
+  unsigned NarrowWidth = VT.getSizeInBits();
+  if (WideWidth % NarrowWidth != 0)
     return SDValue();
 
-  // Bail out if the target does not support a narrower version of the binop.
-  unsigned NarrowingRatio = NumSrcElems / NumElems;
-  unsigned BOpcode = BinOp.getOpcode();
+  // Bail out if we are extracting a fraction of a single operation. This can
+  // occur because we potentially looked through a bitcast of the binop.
+  unsigned NarrowingRatio = WideWidth / NarrowWidth;
   unsigned WideNumElts = WideBVT.getVectorNumElements();
+  if (WideNumElts % NarrowingRatio != 0)
+    return SDValue();
+
+  // Bail out if the target does not support a narrower version of the binop.
   EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
                                    WideNumElts / NarrowingRatio);
+  unsigned BOpcode = BinOp.getOpcode();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
     return SDValue();
 
   // If extraction is cheap, we don't need to look at the binop operands
   // for concat ops. The narrow binop alone makes this transform profitable.
-  // TODO: We're not dealing with the bitcasted pattern here. That limitation
-  // should be lifted.
-  if (Extract->getOperand(0) == BinOp && BinOp.hasOneUse() &&
-      TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtractIndex)) {
+  // We can't just reuse the original extract index operand because we may have
+  // bitcasted.
+  unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();
+  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
+  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
+  if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
+      BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
     // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
     SDLoc DL(Extract);
+    SDValue NewExtIndex = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT);
     SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
-                            BinOp.getOperand(0), Extract->getOperand(1));
+                            BinOp.getOperand(0), NewExtIndex);
     SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
-                            BinOp.getOperand(1), Extract->getOperand(1));
-    return DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
-                       BinOp.getNode()->getFlags());
+                            BinOp.getOperand(1), NewExtIndex);
+    SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
+                                      BinOp.getNode()->getFlags());
+    return DAG.getBitcast(VT, NarrowBinOp);
   }
 
   // Only handle the case where we are doubling and then halving. A larger ratio
@@ -16681,11 +16772,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
     return SDValue();
 
   // If one of the binop operands was not the result of a concat, we must
-  // extract a half-sized operand for our new narrow binop. We can't just reuse
-  // the original extract index operand because we may have bitcasted.
-  unsigned ConcatOpNum = ExtractIndex / NumElems;
-  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
-  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
+  // extract a half-sized operand for our new narrow binop.
   SDLoc DL(Extract);
 
   // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
@@ -16713,17 +16800,19 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   if (DAG.getDataLayout().isBigEndian())
     return SDValue();
 
-  // TODO: The one-use check is overly conservative. Check the cost of the
-  // extract instead or remove that condition entirely.
   auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
   auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
-  if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() ||
-      !ExtIdx)
+  if (!Ld || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx)
+    return SDValue();
+
+  // Allow targets to opt-out.
+  EVT VT = Extract->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
     return SDValue();
 
   // The narrow load will be offset from the base address of the old load if
   // we are extracting from something besides index 0 (little-endian).
-  EVT VT = Extract->getValueType(0);
   SDLoc DL(Extract);
   SDValue BaseAddr = Ld->getOperand(1);
   unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();
@@ -16928,12 +17017,15 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
   SDValue N0 = SVN->getOperand(0);
   SDValue N1 = SVN->getOperand(1);
 
-  if (!N0->hasOneUse() || !N1->hasOneUse())
+  if (!N0->hasOneUse())
     return SDValue();
 
   // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as
   // discussed above.
   if (!N1.isUndef()) {
+    if (!N1->hasOneUse())
+      return SDValue();
+
     bool N0AnyConst = isAnyConstantBuildVector(N0.getNode());
     bool N1AnyConst = isAnyConstantBuildVector(N1.getNode());
     if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
@@ -17366,7 +17458,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
 
   // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
   // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
-  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
+  if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
     if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
       return Res;
 
@@ -17630,12 +17722,6 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   if (N1.isUndef())
     return N0;
 
-  // For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow
-  // us to pull BITCASTs from input to output.
-  if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR)
-    if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode()))
-      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2);
-
   // If this is an insert of an extracted vector into an undef vector, we can
   // just use the input to the extract.
   if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
@@ -17718,6 +17804,10 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
     return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
   }
 
+  // Simplify source operands based on insertion.
+  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
@@ -18157,6 +18247,63 @@ SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
   return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
 }
 
+/// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
+/// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
+/// in it. This may be a win when the constant is not otherwise available
+/// because it replaces two constant pool loads with one.
+SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
+    const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
+    ISD::CondCode CC) {
+  if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType().isFloatingPoint()))
+    return SDValue();
+
+  // If we are before legalize types, we want the other legalization to happen
+  // first (for example, to avoid messing with soft float).
+  auto *TV = dyn_cast<ConstantFPSDNode>(N2);
+  auto *FV = dyn_cast<ConstantFPSDNode>(N3);
+  EVT VT = N2.getValueType();
+  if (!TV || !FV || !TLI.isTypeLegal(VT))
+    return SDValue();
+
+  // If a constant can be materialized without loads, this does not make sense.
+  if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal ||
+      TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) ||
+      TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0)))
+    return SDValue();
+
+  // If both constants have multiple uses, then we won't need to do an extra
+  // load. The values are likely around in registers for other users.
+  if (!TV->hasOneUse() && !FV->hasOneUse())
+    return SDValue();
+
+  Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()),
+                       const_cast<ConstantFP*>(TV->getConstantFPValue()) };
+  Type *FPTy = Elts[0]->getType();
+  const DataLayout &TD = DAG.getDataLayout();
+
+  // Create a ConstantArray of the two constants.
+  Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
+  SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
+                                      TD.getPrefTypeAlignment(FPTy));
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+
+  // Get offsets to the 0 and 1 elements of the array, so we can select between
+  // them.
+  SDValue Zero = DAG.getIntPtrConstant(0, DL);
+  unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
+  SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
+  SDValue Cond =
+      DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC);
+  AddToWorklist(Cond.getNode());
+  SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero);
+  AddToWorklist(CstOffset.getNode());
+  CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset);
+  AddToWorklist(CPIdx.getNode());
+  return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
+                     MachinePointerInfo::getConstantPool(
+                         DAG.getMachineFunction()), Alignment);
+}
+
 /// Simplify an expression of the form (N0 cond N1) ? N2 : N3
 /// where 'cond' is the comparison specified by CC.
 SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
@@ -18165,75 +18312,26 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
   // (x ? y : y) -> y.
   if (N2 == N3) return N2;
 
+  EVT CmpOpVT = N0.getValueType();
   EVT VT = N2.getValueType();
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
-  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+  auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+  auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
 
-  // Determine if the condition we're dealing with is constant
-  SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
-                              N0, N1, CC, DL, false);
+  // Determine if the condition we're dealing with is constant.
+  SDValue SCC = SimplifySetCC(getSetCCResultType(CmpOpVT), N0, N1, CC, DL,
+                              false);
   if (SCC.getNode()) AddToWorklist(SCC.getNode());
 
-  if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
+  if (auto *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
     // fold select_cc true, x, y -> x
     // fold select_cc false, x, y -> y
     return !SCCC->isNullValue() ? N2 : N3;
   }
 
-  // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
-  // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
-  // in it.  This is a win when the constant is not otherwise available because
-  // it replaces two constant pool loads with one.  We only do this if the FP
-  // type is known to be legal, because if it isn't, then we are before legalize
-  // types an we want the other legalization to happen first (e.g. to avoid
-  // messing with soft float) and if the ConstantFP is not legal, because if
-  // it is legal, we may not need to store the FP constant in a constant pool.
-  if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
-    if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
-      if (TLI.isTypeLegal(N2.getValueType()) &&
-          (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) !=
-               TargetLowering::Legal &&
-           !TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) &&
-           !TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) &&
-          // If both constants have multiple uses, then we won't need to do an
-          // extra load, they are likely around in registers for other users.
-          (TV->hasOneUse() || FV->hasOneUse())) {
-        Constant *Elts[] = {
-          const_cast<ConstantFP*>(FV->getConstantFPValue()),
-          const_cast<ConstantFP*>(TV->getConstantFPValue())
-        };
-        Type *FPTy = Elts[0]->getType();
-        const DataLayout &TD = DAG.getDataLayout();
-
-        // Create a ConstantArray of the two constants.
-        Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
-        SDValue CPIdx =
-            DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
-                                TD.getPrefTypeAlignment(FPTy));
-        unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-
-        // Get the offsets to the 0 and 1 element of the array so that we can
-        // select between them.
-        SDValue Zero = DAG.getIntPtrConstant(0, DL);
-        unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
-        SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
-
-        SDValue Cond = DAG.getSetCC(DL,
-                                    getSetCCResultType(N0.getValueType()),
-                                    N0, N1, CC);
-        AddToWorklist(Cond.getNode());
-        SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
-                                          Cond, One, Zero);
-        AddToWorklist(CstOffset.getNode());
-        CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
-                            CstOffset);
-        AddToWorklist(CPIdx.getNode());
-        return DAG.getLoad(
-            TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
-            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
-            Alignment);
-      }
-    }
+  if (SDValue V =
+          convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
+    return V;
 
   if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
     return V;
@@ -18247,7 +18345,7 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
   if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
       N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
     SDValue AndLHS = N0->getOperand(0);
-    ConstantSDNode *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
     if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
       // Shift the tested bit over the sign bit.
       const APInt &AndMask = ConstAndRHS->getAPIntValue();
@@ -18268,48 +18366,48 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
   }
 
   // fold select C, 16, 0 -> shl C, 4
-  if (N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2() &&
-      TLI.getBooleanContents(N0.getValueType()) ==
-          TargetLowering::ZeroOrOneBooleanContent) {
+  bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
+  bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();
+
+  if ((Fold || Swap) &&
+      TLI.getBooleanContents(CmpOpVT) ==
+          TargetLowering::ZeroOrOneBooleanContent &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {
+
+    if (Swap) {
+      CC = ISD::getSetCCInverse(CC, CmpOpVT.isInteger());
+      std::swap(N2C, N3C);
+    }
 
     // If the caller doesn't want us to simplify this into a zext of a compare,
     // don't do it.
     if (NotExtCompare && N2C->isOne())
       return SDValue();
 
-    // Get a SetCC of the condition
-    // NOTE: Don't create a SETCC if it's not legal on this target.
-    if (!LegalOperations ||
-        TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) {
-      SDValue Temp, SCC;
-      // cast from setcc result type to select result type
-      if (LegalTypes) {
-        SCC  = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()),
-                            N0, N1, CC);
-        if (N2.getValueType().bitsLT(SCC.getValueType()))
-          Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2),
-                                        N2.getValueType());
-        else
-          Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
-                             N2.getValueType(), SCC);
-      } else {
-        SCC  = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
-        Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
-                           N2.getValueType(), SCC);
-      }
+    SDValue Temp, SCC;
+    // zext (setcc n0, n1)
+    if (LegalTypes) {
+      SCC = DAG.getSetCC(DL, getSetCCResultType(CmpOpVT), N0, N1, CC);
+      if (VT.bitsLT(SCC.getValueType()))
+        Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT);
+      else
+        Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
+    } else {
+      SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
+      Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
+    }
 
-      AddToWorklist(SCC.getNode());
-      AddToWorklist(Temp.getNode());
+    AddToWorklist(SCC.getNode());
+    AddToWorklist(Temp.getNode());
 
-      if (N2C->isOne())
-        return Temp;
+    if (N2C->isOne())
+      return Temp;
 
-      // shl setcc result by log2 n2c
-      return DAG.getNode(
-          ISD::SHL, DL, N2.getValueType(), Temp,
-          DAG.getConstant(N2C->getAPIntValue().logBase2(), SDLoc(Temp),
-                          getShiftAmountTy(Temp.getValueType())));
-    }
+    // shl setcc result by log2 n2c
+    return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
+                       DAG.getConstant(N2C->getAPIntValue().logBase2(),
+                                       SDLoc(Temp),
+                                       getShiftAmountTy(Temp.getValueType())));
   }
 
   // Check to see if this is an integer abs.
@@ -18329,18 +18427,16 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
              N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1))
       SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0));
 
-    EVT XType = N0.getValueType();
-    if (SubC && SubC->isNullValue() && XType.isInteger()) {
+    if (SubC && SubC->isNullValue() && CmpOpVT.isInteger()) {
       SDLoc DL(N0);
-      SDValue Shift = DAG.getNode(ISD::SRA, DL, XType,
-                                  N0,
-                                  DAG.getConstant(XType.getSizeInBits() - 1, DL,
-                                         getShiftAmountTy(N0.getValueType())));
-      SDValue Add = DAG.getNode(ISD::ADD, DL,
-                                XType, N0, Shift);
+      SDValue Shift = DAG.getNode(ISD::SRA, DL, CmpOpVT, N0,
+                                  DAG.getConstant(CmpOpVT.getSizeInBits() - 1,
+                                                  DL,
+                                                  getShiftAmountTy(CmpOpVT)));
+      SDValue Add = DAG.getNode(ISD::ADD, DL, CmpOpVT, N0, Shift);
       AddToWorklist(Shift.getNode());
       AddToWorklist(Add.getNode());
-      return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
+      return DAG.getNode(ISD::XOR, DL, CmpOpVT, Add, Shift);
     }
   }
 
@@ -18905,6 +19001,11 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
   return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
 }
 
+// TODO: Replace with with std::monostate when we move to C++17.
+struct UnitT { } Unit;
+bool operator==(const UnitT &, const UnitT &) { return true; }
+bool operator!=(const UnitT &, const UnitT &) { return false; }
+
 // This function tries to collect a bunch of potentially interesting
 // nodes to improve the chains of, all at once. This might seem
 // redundant, as this function gets called when visiting every store
@@ -18917,13 +19018,22 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
 // the nodes that will eventually be candidates, and then not be able
 // to go from a partially-merged state to the desired final
 // fully-merged state.
-bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
-  if (OptLevel == CodeGenOpt::None)
-    return false;
+
+bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
+  SmallVector<StoreSDNode *, 8> ChainedStores;
+  StoreSDNode *STChain = St;
+  // Intervals records which offsets from BaseIndex have been covered. In
+  // the common case, every store writes to the immediately previous address
+  // space and thus merged with the previous interval at insertion time.
+
+  using IMap =
+      llvm::IntervalMap<int64_t, UnitT, 8, IntervalMapHalfOpenInfo<int64_t>>;
+  IMap::Allocator A;
+  IMap Intervals(A);
 
   // This holds the base pointer, index, and the offset in bytes from the base
   // pointer.
-  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
+  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
 
   // We must have a base and an offset.
   if (!BasePtr.getBase().getNode())
@@ -18933,76 +19043,114 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
   if (BasePtr.getBase().isUndef())
     return false;
 
-  SmallVector<StoreSDNode *, 8> ChainedStores;
-  ChainedStores.push_back(St);
+  // Add ST's interval.
+  Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);
 
-  // Walk up the chain and look for nodes with offsets from the same
-  // base pointer. Stop when reaching an instruction with a different kind
-  // or instruction which has a different base pointer.
-  StoreSDNode *Index = St;
-  while (Index) {
+  while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
     // If the chain has more than one use, then we can't reorder the mem ops.
-    if (Index != St && !SDValue(Index, 0)->hasOneUse())
+    if (!SDValue(Chain, 0)->hasOneUse())
       break;
-
-    if (Index->isVolatile() || Index->isIndexed())
+    if (Chain->isVolatile() || Chain->isIndexed())
       break;
 
     // Find the base pointer and offset for this memory node.
-    BaseIndexOffset Ptr = BaseIndexOffset::match(Index, DAG);
-
+    const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
     // Check that the base pointer is the same as the original one.
-    if (!BasePtr.equalBaseIndex(Ptr, DAG))
+    int64_t Offset;
+    if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
+      break;
+    int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
+    // Make sure we don't overlap with other intervals by checking the ones to
+    // the left or right before inserting.
+    auto I = Intervals.find(Offset);
+    // If there's a next interval, we should end before it.
+    if (I != Intervals.end() && I.start() < (Offset + Length))
+      break;
+    // If there's a previous interval, we should start after it.
+    if (I != Intervals.begin() && (--I).stop() <= Offset)
       break;
+    Intervals.insert(Offset, Offset + Length, Unit);
 
-    // Walk up the chain to find the next store node, ignoring any
-    // intermediate loads. Any other kind of node will halt the loop.
-    SDNode *NextInChain = Index->getChain().getNode();
-    while (true) {
-      if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
-        // We found a store node. Use it for the next iteration.
-        if (STn->isVolatile() || STn->isIndexed()) {
-          Index = nullptr;
-          break;
-        }
-        ChainedStores.push_back(STn);
-        Index = STn;
-        break;
-      } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
-        NextInChain = Ldn->getChain().getNode();
-        continue;
-      } else {
-        Index = nullptr;
-        break;
-      }
-    }// end while
+    ChainedStores.push_back(Chain);
+    STChain = Chain;
   }
 
-  // At this point, ChainedStores lists all of the Store nodes
-  // reachable by iterating up through chain nodes matching the above
-  // conditions.  For each such store identified, try to find an
-  // earlier chain to attach the store to which won't violate the
-  // required ordering.
-  bool MadeChangeToSt = false;
-  SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;
+  // If we didn't find a chained store, exit.
+  if (ChainedStores.size() == 0)
+    return false;
 
-  for (StoreSDNode *ChainedStore : ChainedStores) {
-    SDValue Chain = ChainedStore->getChain();
-    SDValue BetterChain = FindBetterChain(ChainedStore, Chain);
+  // Improve all chained stores (St and ChainedStores members) starting from
+  // where the store chain ended and return single TokenFactor.
+  SDValue NewChain = STChain->getChain();
+  SmallVector<SDValue, 8> TFOps;
+  for (unsigned I = ChainedStores.size(); I;) {
+    StoreSDNode *S = ChainedStores[--I];
+    SDValue BetterChain = FindBetterChain(S, NewChain);
+    S = cast<StoreSDNode>(DAG.UpdateNodeOperands(
+        S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3)));
+    TFOps.push_back(SDValue(S, 0));
+    ChainedStores[I] = S;
+  }
+
+  // Improve St's chain. Use a new node to avoid creating a loop from CombineTo.
+  SDValue BetterChain = FindBetterChain(St, NewChain);
+  SDValue NewST;
+  if (St->isTruncatingStore())
+    NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(),
+                              St->getBasePtr(), St->getMemoryVT(),
+                              St->getMemOperand());
+  else
+    NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(),
+                         St->getBasePtr(), St->getMemOperand());
 
-    if (Chain != BetterChain) {
-      if (ChainedStore == St)
-        MadeChangeToSt = true;
-      BetterChains.push_back(std::make_pair(ChainedStore, BetterChain));
-    }
-  }
+  TFOps.push_back(NewST);
 
-  // Do all replacements after finding the replacements to make to avoid making
-  // the chains more complicated by introducing new TokenFactors.
-  for (auto Replacement : BetterChains)
-    replaceStoreChain(Replacement.first, Replacement.second);
+  // If we improved every element of TFOps, then we've lost the dependence on
+  // NewChain to successors of St and we need to add it back to TFOps. Do so at
+  // the beginning to keep relative order consistent with FindBetterChains.
+  auto hasImprovedChain = [&](SDValue ST) -> bool {
+    return ST->getOperand(0) != NewChain;
+  };
+  bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain);
+  if (AddNewChain)
+    TFOps.insert(TFOps.begin(), NewChain);
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, SDLoc(STChain), MVT::Other, TFOps);
+  CombineTo(St, TF);
+
+  AddToWorklist(STChain);
+  // Add TF operands worklist in reverse order.
+  for (auto I = TF->getNumOperands(); I;)
+    AddToWorklist(TF->getOperand(--I).getNode());
+  AddToWorklist(TF.getNode());
+  return true;
+}
+
+bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
+  if (OptLevel == CodeGenOpt::None)
+    return false;
+
+  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
+
+  // We must have a base and an offset.
+  if (!BasePtr.getBase().getNode())
+    return false;
 
-  return MadeChangeToSt;
+  // Do not handle stores to undef base pointers.
+  if (BasePtr.getBase().isUndef())
+    return false;
+
+  // Directly improve a chain of disjoint stores starting at St.
+  if (parallelizeChainedStores(St))
+    return true;
+
+  // Improve St's Chain..
+  SDValue BetterChain = FindBetterChain(St, St->getChain());
+  if (St->getChain() != BetterChain) {
+    replaceStoreChain(St, BetterChain);
+    return true;
+  }
+  return false;
 }
 
 /// This is the entry point for the file.
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index d5f066c2423eb989420af4df1e8f6b8727520053..a9a3c44ea0c9ca6d62af1f5f84331bf8ba343761 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -547,6 +547,15 @@ void FastISel::removeDeadCode(MachineBasicBlock::iterator I,
   assert(I.isValid() && E.isValid() && std::distance(I, E) > 0 &&
          "Invalid iterator!");
   while (I != E) {
+    if (LastFlushPoint == I)
+      LastFlushPoint = E;
+    if (SavedInsertPt == I)
+      SavedInsertPt = E;
+    if (EmitStartPt == I)
+      EmitStartPt = E.isValid() ? &*E : nullptr;
+    if (LastLocalValue == I)
+      LastLocalValue = E.isValid() ? &*E : nullptr;
+
     MachineInstr *Dead = &*I;
     ++I;
     Dead->eraseFromParent();
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index d3c31911d67744b88add9dc672a5cf03469e61aa..fba728625b071bbc19e24655ee0b43dd4a34511e 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -579,9 +579,18 @@ FunctionLoweringInfo::getOrCreateSwiftErrorVRegUseAt(const Instruction *I, const
 const Value *
 FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) {
   if (VirtReg2Value.empty()) {
+    SmallVector<EVT, 4> ValueVTs;
     for (auto &P : ValueMap) {
-      VirtReg2Value[P.second] = P.first;
+      ValueVTs.clear();
+      ComputeValueVTs(*TLI, Fn->getParent()->getDataLayout(),
+                      P.first->getType(), ValueVTs);
+      unsigned Reg = P.second;
+      for (EVT VT : ValueVTs) {
+        unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT);
+        for (unsigned i = 0, e = NumRegisters; i != e; ++i)
+          VirtReg2Value[Reg++] = P.first;
+      }
     }
   }
-  return VirtReg2Value[Vreg];
+  return VirtReg2Value.lookup(Vreg);
 }
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index fc9c227e4dfacd3a1958a2404ad75221ce389c7b..6a6114677cc2b5ab22fff63661417d10792aca6f 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -652,6 +652,12 @@ void InstrEmitter::EmitRegSequence(SDNode *Node,
   const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE);
   MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II, NewVReg);
   unsigned NumOps = Node->getNumOperands();
+  // If the input pattern has a chain, then the root of the corresponding
+  // output pattern will get a chain as well. This can happen to be a
+  // REG_SEQUENCE (which is not "guarded" by countOperands/CountResults).
+  if (NumOps && Node->getOperand(NumOps-1).getValueType() == MVT::Other)
+    --NumOps; // Ignore chain if it exists.
+
   assert((NumOps & 1) == 1 &&
          "REG_SEQUENCE must have an odd number of operands!");
   for (unsigned i = 1; i != NumOps; ++i) {
@@ -694,6 +700,20 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
   assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
          "Expected inlined-at fields to agree");
 
+  SD->setIsEmitted();
+
+  if (SD->isInvalidated()) {
+    // An invalidated SDNode must generate an undef DBG_VALUE: although the
+    // original value is no longer computed, earlier DBG_VALUEs live ranges
+    // must not leak into later code.
+    auto MIB = BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE));
+    MIB.addReg(0U);
+    MIB.addReg(0U, RegState::Debug);
+    MIB.addMetadata(Var);
+    MIB.addMetadata(Expr);
+    return &*MIB;
+  }
+
   if (SD->getKind() == SDDbgValue::FRAMEIX) {
     // Stack address; this needs to be lowered in target-dependent fashion.
     // EmitTargetCodeForFrameDebugValue is responsible for allocation.
@@ -735,6 +755,9 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
         MIB.addImm(CI->getSExtValue());
     } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
       MIB.addFPImm(CF);
+    } else if (isa<ConstantPointerNull>(V)) {
+      // Note: This assumes that all nullptr constants are zero-valued.
+      MIB.addImm(0);
     } else {
       // Could be an Undef.  In any case insert an Undef so we can see what we
       // dropped.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index dcb479e4ce1fb85b136a1e3f8fd214ac73090e83..9d6a6939fbabafc0fcc1e3ed1b0eed83cab6610b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1128,6 +1128,12 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   }
+  case ISD::SMULFIX: {
+    unsigned Scale = Node->getConstantOperandVal(2);
+    Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
+                                              Node->getValueType(0), Scale);
+    break;
+  }
   case ISD::MSCATTER:
     Action = TLI.getOperationAction(Node->getOpcode(),
                     cast<MaskedScatterSDNode>(Node)->getValue().getValueType());
@@ -1170,6 +1176,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
       }
     }
     break;
+    case ISD::FSHL:
+    case ISD::FSHR:
     case ISD::SRL_PARTS:
     case ISD::SRA_PARTS:
     case ISD::SHL_PARTS: {
@@ -3262,6 +3270,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
     break;
   }
+  case ISD::FSHL:
+  case ISD::FSHR:
+    if (TLI.expandFunnelShift(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
+    break;
+  case ISD::ROTL:
+  case ISD::ROTR:
+    if (TLI.expandROT(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
+    break;
   case ISD::SADDSAT:
   case ISD::UADDSAT:
   case ISD::SSUBSAT:
@@ -3269,6 +3287,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(TLI.getExpandedSaturationAdditionSubtraction(Node, DAG));
     break;
   }
+  case ISD::SMULFIX: {
+    Results.push_back(TLI.getExpandedFixedPointMultiplication(Node, DAG));
+    break;
+  }
   case ISD::SADDO:
   case ISD::SSUBO: {
     SDValue LHS = Node->getOperand(0);
@@ -3708,46 +3730,6 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     ReplaceNode(SDValue(Node, 0), Result);
     break;
   }
-  case ISD::ROTL:
-  case ISD::ROTR: {
-    bool IsLeft = Node->getOpcode() == ISD::ROTL;
-    SDValue Op0 = Node->getOperand(0), Op1 = Node->getOperand(1);
-    EVT ResVT = Node->getValueType(0);
-    EVT OpVT = Op0.getValueType();
-    assert(OpVT == ResVT &&
-           "The result and the operand types of rotate should match");
-    EVT ShVT = Op1.getValueType();
-    SDValue Width = DAG.getConstant(OpVT.getScalarSizeInBits(), dl, ShVT);
-
-    // If a rotate in the other direction is legal, use it.
-    unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL;
-    if (TLI.isOperationLegal(RevRot, ResVT)) {
-      SDValue Sub = DAG.getNode(ISD::SUB, dl, ShVT, Width, Op1);
-      Results.push_back(DAG.getNode(RevRot, dl, ResVT, Op0, Sub));
-      break;
-    }
-
-    // Otherwise,
-    //   (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1)))
-    //   (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1)))
-    //
-    assert(isPowerOf2_32(OpVT.getScalarSizeInBits()) &&
-           "Expecting the type bitwidth to be a power of 2");
-    unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL;
-    unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL;
-    SDValue Width1 = DAG.getNode(ISD::SUB, dl, ShVT,
-                                 Width, DAG.getConstant(1, dl, ShVT));
-    SDValue NegOp1 = DAG.getNode(ISD::SUB, dl, ShVT, Width, Op1);
-    SDValue And0 = DAG.getNode(ISD::AND, dl, ShVT, Op1, Width1);
-    SDValue And1 = DAG.getNode(ISD::AND, dl, ShVT, NegOp1, Width1);
-
-    SDValue Or = DAG.getNode(ISD::OR, dl, ResVT,
-                             DAG.getNode(ShOpc, dl, ResVT, Op0, And0),
-                             DAG.getNode(HsOpc, dl, ResVT, Op0, And1));
-    Results.push_back(Or);
-    break;
-  }
-
   case ISD::GLOBAL_OFFSET_TABLE:
   case ISD::GlobalAddress:
   case ISD::GlobalTLSAddress:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 2b1df0165d336198eaad7ea1d208b83de62fea81..25fa2a0a4af1f6489f217db622178daa5c874b07 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -118,6 +118,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
 
   case ISD::FP_TO_FP16:  Res = PromoteIntRes_FP_TO_FP16(N); break;
 
+  case ISD::FLT_ROUNDS_: Res = PromoteIntRes_FLT_ROUNDS(N); break;
+
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
@@ -145,6 +147,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::UADDSAT:
   case ISD::SSUBSAT:
   case ISD::USUBSAT:     Res = PromoteIntRes_ADDSUBSAT(N); break;
+  case ISD::SMULFIX:     Res = PromoteIntRes_SMULFIX(N); break;
 
   case ISD::ATOMIC_LOAD:
     Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
@@ -475,6 +478,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) {
   return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_FLT_ROUNDS(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDLoc dl(N);
+
+  return DAG.getNode(N->getOpcode(), dl, NVT);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDLoc dl(N);
@@ -580,7 +590,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) {
   SDLoc dl(N);
   SDValue Op1 = N->getOperand(0);
   SDValue Op2 = N->getOperand(1);
-  unsigned OldBits = Op1.getValueSizeInBits();
+  unsigned OldBits = Op1.getScalarValueSizeInBits();
 
   unsigned Opcode = N->getOpcode();
   unsigned ShiftOp;
@@ -602,7 +612,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) {
   SDValue Op2Promoted = GetPromotedInteger(Op2);
 
   EVT PromotedType = Op1Promoted.getValueType();
-  unsigned NewBits = Op1Promoted.getValueSizeInBits();
+  unsigned NewBits = PromotedType.getScalarSizeInBits();
   unsigned SHLAmount = NewBits - OldBits;
   EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
   SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT);
@@ -616,6 +626,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) {
   return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_SMULFIX(SDNode *N) {
+  // Can just promote the operands then continue with operation.
+  SDLoc dl(N);
+  SDValue Op1Promoted = SExtPromotedInteger(N->getOperand(0));
+  SDValue Op2Promoted = SExtPromotedInteger(N->getOperand(1));
+  EVT PromotedType = Op1Promoted.getValueType();
+  return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted,
+                     N->getOperand(2));
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
   if (ResNo == 1)
     return PromoteIntRes_Overflow(N);
@@ -1042,6 +1062,13 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
 
   case ISD::ADDCARRY:
   case ISD::SUBCARRY: Res = PromoteIntOp_ADDSUBCARRY(N, OpNo); break;
+
+  case ISD::FRAMEADDR:
+  case ISD::RETURNADDR: Res = PromoteIntOp_FRAMERETURNADDR(N); break;
+
+  case ISD::PREFETCH: Res = PromoteIntOp_PREFETCH(N, OpNo); break;
+
+  case ISD::SMULFIX: Res = PromoteIntOp_SMULFIX(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1063,9 +1090,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
 /// shared among BR_CC, SELECT_CC, and SETCC handlers.
 void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
                                             ISD::CondCode CCCode) {
-  // We have to insert explicit sign or zero extends.  Note that we could
-  // insert sign extends for ALL conditions, but zero extend is cheaper on
-  // many machines (an AND instead of two shifts), so prefer it.
+  // We have to insert explicit sign or zero extends. Note that we could
+  // insert sign extends for ALL conditions. For those operations where either
+  // zero or sign extension would be valid, use SExtOrZExtPromotedInteger
+  // which will choose the cheapest for the target.
   switch (CCCode) {
   default: llvm_unreachable("Unknown integer comparison!");
   case ISD::SETEQ:
@@ -1086,8 +1114,8 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
       NewLHS = OpL;
       NewRHS = OpR;
     } else {
-      NewLHS = ZExtPromotedInteger(NewLHS);
-      NewRHS = ZExtPromotedInteger(NewRHS);
+      NewLHS = SExtOrZExtPromotedInteger(NewLHS);
+      NewRHS = SExtOrZExtPromotedInteger(NewRHS);
     }
     break;
   }
@@ -1095,11 +1123,8 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
   case ISD::SETUGT:
   case ISD::SETULE:
   case ISD::SETULT:
-    // ALL of these operations will work if we either sign or zero extend
-    // the operands (including the unsigned comparisons!).  Zero extend is
-    // usually a simpler/cheaper operation, so prefer it.
-    NewLHS = ZExtPromotedInteger(NewLHS);
-    NewRHS = ZExtPromotedInteger(NewRHS);
+    NewLHS = SExtOrZExtPromotedInteger(NewLHS);
+    NewRHS = SExtOrZExtPromotedInteger(NewRHS);
     break;
   case ISD::SETGE:
   case ISD::SETGT:
@@ -1403,6 +1428,30 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) {
   return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_SMULFIX(SDNode *N) {
+  SDValue Op2 = ZExtPromotedInteger(N->getOperand(2));
+  return SDValue(
+      DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), Op2), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_FRAMERETURNADDR(SDNode *N) {
+  // Promote the RETURNADDR/FRAMEADDR argument to a supported integer width.
+  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+  return SDValue(DAG.UpdateNodeOperands(N, Op), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo) {
+  assert(OpNo > 1 && "Don't know how to promote this operand!");
+  // Promote the rw, locality, and cache type arguments to a supported integer
+  // width.
+  SDValue Op2 = ZExtPromotedInteger(N->getOperand(2));
+  SDValue Op3 = ZExtPromotedInteger(N->getOperand(3));
+  SDValue Op4 = ZExtPromotedInteger(N->getOperand(4));
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1),
+                                        Op2, Op3, Op4),
+                 0);
+}
+
 //===----------------------------------------------------------------------===//
 //  Integer Result Expansion
 //===----------------------------------------------------------------------===//
@@ -1541,6 +1590,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::UADDSAT:
   case ISD::SSUBSAT:
   case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break;
+  case ISD::SMULFIX: ExpandIntRes_SMULFIX(N, Lo, Hi); break;
   }
 
   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -2509,6 +2559,95 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBSAT(SDNode *N, SDValue &Lo,
   SplitInteger(Result, Lo, Hi);
 }
 
+void DAGTypeLegalizer::ExpandIntRes_SMULFIX(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  uint64_t Scale = N->getConstantOperandVal(2);
+  if (!Scale) {
+    SDValue Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+    SplitInteger(Result, Lo, Hi);
+    return;
+  }
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue LL, LH, RL, RH;
+  GetExpandedInteger(LHS, LL, LH);
+  GetExpandedInteger(RHS, RL, RH);
+  SmallVector<SDValue, 4> Result;
+
+  if (!TLI.expandMUL_LOHI(ISD::SMUL_LOHI, VT, dl, LHS, RHS, Result, NVT, DAG,
+                          TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
+                          LL, LH, RL, RH)) {
+    report_fatal_error("Unable to expand SMUL_FIX using SMUL_LOHI.");
+    return;
+  }
+
+  unsigned VTSize = VT.getScalarSizeInBits();
+  unsigned NVTSize = NVT.getScalarSizeInBits();
+  EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
+
+  // Shift whole amount by scale.
+  SDValue ResultLL = Result[0];
+  SDValue ResultLH = Result[1];
+  SDValue ResultHL = Result[2];
+  SDValue ResultHH = Result[3];
+
+  // After getting the multplication result in 4 parts, we need to perform a
+  // shift right by the amount of the scale to get the result in that scale.
+  // Let's say we multiply 2 64 bit numbers. The resulting value can be held in
+  // 128 bits that are cut into 4 32-bit parts:
+  //
+  //      HH       HL       LH       LL
+  //  |---32---|---32---|---32---|---32---|
+  // 128      96       64       32        0
+  //
+  //                    |------VTSize-----|
+  //
+  //                             |NVTSize-|
+  //
+  // The resulting Lo and Hi will only need to be one of these 32-bit parts
+  // after shifting.
+  if (Scale < NVTSize) {
+    // If the scale is less than the size of the VT we expand to, the Hi and
+    // Lo of the result will be in the first 2 parts of the result after
+    // shifting right. This only requires shifting by the scale as far as the
+    // third part in the result (ResultHL).
+    SDValue SRLAmnt = DAG.getConstant(Scale, dl, ShiftTy);
+    SDValue SHLAmnt = DAG.getConstant(NVTSize - Scale, dl, ShiftTy);
+    Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLL, SRLAmnt);
+    Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
+                     DAG.getNode(ISD::SHL, dl, NVT, ResultLH, SHLAmnt));
+    Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
+    Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
+                     DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
+  } else if (Scale == NVTSize) {
+    // If the scales are equal, Lo and Hi are ResultLH and Result HL,
+    // respectively. Avoid shifting to prevent undefined behavior.
+    Lo = ResultLH;
+    Hi = ResultHL;
+  } else if (Scale < VTSize) {
+    // If the scale is instead less than the old VT size, but greater than or
+    // equal to the expanded VT size, the first part of the result (ResultLL) is
+    // no longer a part of Lo because it would be scaled out anyway. Instead we
+    // can start shifting right from the fourth part (ResultHH) to the second
+    // part (ResultLH), and Result LH will be the new Lo.
+    SDValue SRLAmnt = DAG.getConstant(Scale - NVTSize, dl, ShiftTy);
+    SDValue SHLAmnt = DAG.getConstant(VTSize - Scale, dl, ShiftTy);
+    Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
+    Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
+                     DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
+    Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt);
+    Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
+                     DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt));
+  } else {
+    llvm_unreachable(
+        "Expected the scale to be less than the width of the operands");
+  }
+}
+
 void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
                                              SDValue &Lo, SDValue &Hi) {
   SDValue LHS = Node->getOperand(0);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 8b7c57cbb3b54460f37ff745af157caa109dcf44..032000f6cb799bbb6bc87ddedb9296105faff342 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -281,6 +281,20 @@ private:
     return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType());
   }
 
+  // Get a promoted operand and sign or zero extend it to the final size
+  // (depending on TargetLoweringInfo::isSExtCheaperThanZExt). For a given
+  // subtarget and type, the choice of sign or zero-extension will be
+  // consistent.
+  SDValue SExtOrZExtPromotedInteger(SDValue Op) {
+    EVT OldVT = Op.getValueType();
+    SDLoc DL(Op);
+    Op = GetPromotedInteger(Op);
+    if (TLI.isSExtCheaperThanZExt(OldVT, Op.getValueType()))
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(), Op,
+                         DAG.getValueType(OldVT));
+    return DAG.getZeroExtendInReg(Op, DL, OldVT.getScalarType());
+  }
+
   // Integer Result Promotion.
   void PromoteIntegerResult(SDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
@@ -331,6 +345,8 @@ private:
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_ADDSUBSAT(SDNode *N);
+  SDValue PromoteIntRes_SMULFIX(SDNode *N);
+  SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -361,6 +377,9 @@ private:
   SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N);
+  SDValue PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_SMULFIX(SDNode *N);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -416,6 +435,7 @@ private:
   void ExpandIntRes_UADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_XMULO             (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_ADDSUBSAT         (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SMULFIX           (SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void ExpandIntRes_ATOMIC_LOAD       (SDNode *N, SDValue &Lo, SDValue &Hi);
 
@@ -671,6 +691,8 @@ private:
   SDValue ScalarizeVecRes_UNDEF(SDNode *N);
   SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N);
 
+  SDValue ScalarizeVecRes_SMULFIX(SDNode *N);
+
   // Vector Operand Scalarization: <1 x ty> -> ty.
   bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
   SDValue ScalarizeVecOp_BITCAST(SDNode *N);
@@ -706,6 +728,8 @@ private:
   void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
 
+  void SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, SDValue &Hi);
+
   void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 17f05c3ba9775daf35f686a834b0ba4c50de2089..4df02c6a6bc5f35e45bcfe1a772c86c194a5a5fd 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -132,6 +132,8 @@ class VectorLegalizer {
   SDValue ExpandCTPOP(SDValue Op);
   SDValue ExpandCTLZ(SDValue Op);
   SDValue ExpandCTTZ(SDValue Op);
+  SDValue ExpandFunnelShift(SDValue Op);
+  SDValue ExpandROT(SDValue Op);
   SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
   SDValue ExpandStrictFPOp(SDValue Op);
 
@@ -244,7 +246,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
         if (SDValue Lowered = TLI.LowerOperation(Result, DAG)) {
           assert(Lowered->getNumValues() == Op->getNumValues() &&
                  "Unexpected number of results");
-          Changed = Lowered != Result;
+          if (Lowered != Result) {
+            // Make sure the new code is also legal.
+            Lowered = LegalizeOp(Lowered);
+            Changed = true;
+          }
           return TranslateLegalizeResults(Op, Lowered);
         }
         LLVM_FALLTHROUGH;
@@ -266,7 +272,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
         return TranslateLegalizeResults(Op, Result);
       case TargetLowering::Custom: {
         SDValue Lowered = TLI.LowerOperation(Result, DAG);
-        Changed = Lowered != Result;
+        if (Lowered != Result) {
+          // Make sure the new code is also legal.
+          Lowered = LegalizeOp(Lowered);
+          Changed = true;
+        }
         return TranslateLegalizeResults(Op, Lowered);
       }
       case TargetLowering::Expand:
@@ -322,6 +332,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::SREM:
@@ -403,6 +415,12 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::USUBSAT:
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
+  case ISD::SMULFIX: {
+    unsigned Scale = Node->getConstantOperandVal(2);
+    Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
+                                              Node->getValueType(0), Scale);
+    break;
+  }
   case ISD::FP_ROUND_INREG:
     Action = TLI.getOperationAction(Node->getOpcode(),
                cast<VTSDNode>(Node->getOperand(1))->getVT());
@@ -739,6 +757,12 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
     return ExpandCTTZ(Op);
+  case ISD::FSHL:
+  case ISD::FSHR:
+    return ExpandFunnelShift(Op);
+  case ISD::ROTL:
+  case ISD::ROTR:
+    return ExpandROT(Op);
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
     return ExpandFMINNUM_FMAXNUM(Op);
@@ -1116,32 +1140,42 @@ SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
 }
 
 SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
-  // Attempt to expand using TargetLowering.
   SDValue Result;
   if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
     return Result;
 
-  // Otherwise go ahead and unroll.
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
 SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
-  // Attempt to expand using TargetLowering.
   SDValue Result;
   if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
     return Result;
 
-  // Otherwise go ahead and unroll.
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
 SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
-  // Attempt to expand using TargetLowering.
   SDValue Result;
   if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
     return Result;
 
-  // Otherwise go ahead and unroll.
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::ExpandFunnelShift(SDValue Op) {
+  SDValue Result;
+  if (TLI.expandFunnelShift(Op.getNode(), Result, DAG))
+    return Result;
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::ExpandROT(SDValue Op) {
+  SDValue Result;
+  if (TLI.expandROT(Op.getNode(), Result, DAG))
+    return Result;
+
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 88abd84366a9b2aa2041074f57ed286149ea1768..f367e9358576ab5e37d8e64f9f5d0b45ca75c513 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -172,6 +172,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FTRUNC:
     R = ScalarizeVecRes_StrictFPOp(N);
     break;
+  case ISD::SMULFIX:
+    R = ScalarizeVecRes_SMULFIX(N);
+    break;
   }
 
   // If R is null, the sub-method took care of registering the result.
@@ -194,6 +197,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
                      Op0.getValueType(), Op0, Op1, Op2);
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SMULFIX(SDNode *N) {
+  SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+  SDValue Op1 = GetScalarizedVector(N->getOperand(1));
+  SDValue Op2 = N->getOperand(2);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
+                     Op2);
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) {
   EVT VT = N->getValueType(0).getVectorElementType();
   unsigned NumOpers = N->getNumOperands();
@@ -848,6 +859,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FTRUNC:
     SplitVecRes_StrictFPOp(N, Lo, Hi);
     break;
+  case ISD::SMULFIX:
+    SplitVecRes_SMULFIX(N, Lo, Hi);
+    break;
   }
 
   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -885,6 +899,20 @@ void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
                    Op0Hi, Op1Hi, Op2Hi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue LHSLo, LHSHi;
+  GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+  SDValue RHSLo, RHSHi;
+  GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
+  SDLoc dl(N);
+  SDValue Op2 = N->getOperand(2);
+
+  unsigned Opcode = N->getOpcode();
+  Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2);
+  Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2);
+}
+
 void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   // We know the result is a vector.  The input may be either a vector or a
@@ -1694,13 +1722,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
     case ISD::VSELECT:
       Res = SplitVecOp_VSELECT(N, OpNo);
       break;
-    case ISD::FP_TO_SINT:
-    case ISD::FP_TO_UINT:
-      if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
-        Res = SplitVecOp_TruncateHelper(N);
-      else
-        Res = SplitVecOp_UnaryOp(N);
-      break;
     case ISD::SINT_TO_FP:
     case ISD::UINT_TO_FP:
       if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
@@ -1708,6 +1729,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
       else
         Res = SplitVecOp_UnaryOp(N);
       break;
+    case ISD::FP_TO_SINT:
+    case ISD::FP_TO_UINT:
     case ISD::CTTZ:
     case ISD::CTLZ:
     case ISD::CTPOP:
@@ -1934,6 +1957,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
 
   // Load back the required element.
   StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+
+  // FIXME: This is to handle i1 vectors with elements promoted to i8.
+  // i1 vector handling needs general improvement.
+  if (N->getValueType(0).bitsLT(EltVT)) {
+    SDValue Load = DAG.getLoad(EltVT, dl, Store, StackPtr,
+      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+    return DAG.getZExtOrTrunc(Load, dl, N->getValueType(0));
+  }
+
   return DAG.getExtLoad(
       ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
       MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
@@ -2238,16 +2270,31 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
   unsigned InElementSize = InVT.getScalarSizeInBits();
   unsigned OutElementSize = OutVT.getScalarSizeInBits();
 
+  // Determine the split output VT. If its legal we can just split dirctly.
+  EVT LoOutVT, HiOutVT;
+  std::tie(LoOutVT, HiOutVT) = DAG.GetSplitDestVTs(OutVT);
+  assert(LoOutVT == HiOutVT && "Unequal split?");
+
   // If the input elements are only 1/2 the width of the result elements,
   // just use the normal splitting. Our trick only work if there's room
   // to split more than once.
-  if (InElementSize <= OutElementSize * 2)
+  if (isTypeLegal(LoOutVT) ||
+      InElementSize <= OutElementSize * 2)
     return SplitVecOp_UnaryOp(N);
   SDLoc DL(N);
 
+  // Don't touch if this will be scalarized.
+  EVT FinalVT = InVT;
+  while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
+    FinalVT = FinalVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+  if (getTypeAction(FinalVT) == TargetLowering::TypeScalarizeVector)
+    return SplitVecOp_UnaryOp(N);
+
   // Get the split input vector.
   SDValue InLoVec, InHiVec;
   GetSplitVector(InVec, InLoVec, InHiVec);
+
   // Truncate them to 1/2 the element size.
   EVT HalfElementVT = IsFloat ?
     EVT::getFloatingPointVT(InElementSize/2) :
@@ -2378,6 +2425,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
+  case ISD::UADDSAT:
+  case ISD::SADDSAT:
+  case ISD::USUBSAT:
+  case ISD::SSUBSAT:
     Res = WidenVecRes_Binary(N);
     break;
 
@@ -2714,7 +2765,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) {
   // The Chain is the first operand.
   InOps.push_back(N->getOperand(0));
 
-  // Now process the remaining operands. 
+  // Now process the remaining operands.
   for (unsigned i = 1; i < NumOpers; ++i) {
     SDValue Oper = N->getOperand(i);
 
@@ -2822,6 +2873,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
       // If both input and result vector types are of same width, extend
       // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
       // accepts fewer elements in the result than in the input.
+      if (Opcode == ISD::ANY_EXTEND)
+        return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
       if (Opcode == ISD::SIGN_EXTEND)
         return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
       if (Opcode == ISD::ZERO_EXTEND)
diff --git a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
index 490105a9d581486ed39ae96c2ca9cfa8cab9ec6b..f7566b246f323d13aa606026d04e507d03069922 100644
--- a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
+++ b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -53,6 +53,7 @@ private:
   enum DbgValueKind kind;
   bool IsIndirect;
   bool Invalid = false;
+  bool Emitted = false;
 
 public:
   /// Constructor for non-constants.
@@ -126,6 +127,15 @@ public:
   void setIsInvalidated() { Invalid = true; }
   bool isInvalidated() const { return Invalid; }
 
+  /// setIsEmitted / isEmitted - Getter/Setter for flag indicating that this
+  /// SDDbgValue has been emitted to an MBB.
+  void setIsEmitted() { Emitted = true; }
+  bool isEmitted() const { return Emitted; }
+
+  /// clearIsEmitted - Reset Emitted flag, for certain special cases where
+  /// dbg.addr is emitted twice.
+  void clearIsEmitted() { Emitted = false; }
+
   LLVM_DUMP_METHOD void dump(raw_ostream &OS) const;
 };
 
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 4b06cfcc7f674ba4a645e31333e231f8da800fc9..90e109b022fd5fee6110d95f9217db9ae7f21ecf 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -776,11 +776,9 @@ ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
     if (N->getHasDebugValue()) {
       MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
       for (auto DV : DAG->GetDbgValues(N)) {
-        if (DV->isInvalidated())
-          continue;
-        if (auto *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap))
-          BB->insert(InsertPos, DbgMI);
-        DV->setIsInvalidated();
+        if (!DV->isEmitted())
+          if (auto *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap))
+            BB->insert(InsertPos, DbgMI);
       }
     }
   }
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 957b2e35494e098b562efa7b48b0c8f6c1096e5b..e258f0a218a538cfa0fde5a1cfe5888808abdb1e 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -722,7 +722,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
   MachineBasicBlock *BB = Emitter.getBlock();
   MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
   for (auto DV : DAG->GetDbgValues(N)) {
-    if (DV->isInvalidated())
+    if (DV->isEmitted())
       continue;
     unsigned DVOrder = DV->getOrder();
     if (!Order || DVOrder == Order) {
@@ -731,7 +731,6 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
         Orders.push_back({DVOrder, DbgMI});
         BB->insert(InsertPos, DbgMI);
       }
-      DV->setIsInvalidated();
     }
   }
 }
@@ -822,8 +821,12 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
     SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd();
     for (; PDI != PDE; ++PDI) {
       MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap);
-      if (DbgMI)
+      if (DbgMI) {
         BB->insert(InsertPos, DbgMI);
+        // We re-emit the dbg_value closer to its use, too, after instructions
+        // are emitted to the BB.
+        (*PDI)->clearIsEmitted();
+      }
     }
   }
 
@@ -889,7 +892,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
       for (; DI != DE; ++DI) {
         if ((*DI)->getOrder() < LastOrder || (*DI)->getOrder() >= Order)
           break;
-        if ((*DI)->isInvalidated())
+        if ((*DI)->isEmitted())
           continue;
 
         MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap);
@@ -911,7 +914,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
     // some of them before one or more conditional branches?
     SmallVector<MachineInstr*, 8> DbgMIs;
     for (; DI != DE; ++DI) {
-      if ((*DI)->isInvalidated())
+      if ((*DI)->isEmitted())
         continue;
       assert((*DI)->getOrder() >= LastOrder &&
              "emitting DBG_VALUE out of order");
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index fce14d53c22a9a3c84e0a2acc58f343fd04b64e2..d69e50d9007be7ae2d2a00aa773b764684a2b39b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1685,7 +1685,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
   // SDNode doesn't have access to it.  This memory will be "leaked" when
   // the node is deallocated, but recovered when the NodeAllocator is released.
   int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
-  std::copy(MaskVec.begin(), MaskVec.end(), MaskAlloc);
+  llvm::copy(MaskVec, MaskAlloc);
 
   auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
                                            dl.getDebugLoc(), MaskAlloc);
@@ -2121,6 +2121,102 @@ bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
   return Mask.isSubsetOf(computeKnownBits(Op, Depth).Zero);
 }
 
+/// isSplatValue - Return true if the vector V has the same value
+/// across all DemandedElts.
+bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
+                                APInt &UndefElts) {
+  if (!DemandedElts)
+    return false; // No demanded elts, better to assume we don't know anything.
+
+  EVT VT = V.getValueType();
+  assert(VT.isVector() && "Vector type expected");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch");
+  UndefElts = APInt::getNullValue(NumElts);
+
+  switch (V.getOpcode()) {
+  case ISD::BUILD_VECTOR: {
+    SDValue Scl;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      SDValue Op = V.getOperand(i);
+      if (Op.isUndef()) {
+        UndefElts.setBit(i);
+        continue;
+      }
+      if (!DemandedElts[i])
+        continue;
+      if (Scl && Scl != Op)
+        return false;
+      Scl = Op;
+    }
+    return true;
+  }
+  case ISD::VECTOR_SHUFFLE: {
+    // Check if this is a shuffle node doing a splat.
+    // TODO: Do we need to handle shuffle(splat, undef, mask)?
+    int SplatIndex = -1;
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(V)->getMask();
+    for (int i = 0; i != (int)NumElts; ++i) {
+      int M = Mask[i];
+      if (M < 0) {
+        UndefElts.setBit(i);
+        continue;
+      }
+      if (!DemandedElts[i])
+        continue;
+      if (0 <= SplatIndex && SplatIndex != M)
+        return false;
+      SplatIndex = M;
+    }
+    return true;
+  }
+  case ISD::EXTRACT_SUBVECTOR: {
+    SDValue Src = V.getOperand(0);
+    ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
+    unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+    if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+      // Offset the demanded elts by the subvector index.
+      uint64_t Idx = SubIdx->getZExtValue();
+      APInt UndefSrcElts;
+      APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+      if (isSplatValue(Src, DemandedSrc, UndefSrcElts)) {
+        UndefElts = UndefSrcElts.extractBits(NumElts, Idx);
+        return true;
+      }
+    }
+    break;
+  }
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::AND: {
+    APInt UndefLHS, UndefRHS;
+    SDValue LHS = V.getOperand(0);
+    SDValue RHS = V.getOperand(1);
+    if (isSplatValue(LHS, DemandedElts, UndefLHS) &&
+        isSplatValue(RHS, DemandedElts, UndefRHS)) {
+      UndefElts = UndefLHS | UndefRHS;
+      return true;
+    }
+    break;
+  }
+  }
+
+  return false;
+}
+
+/// Helper wrapper to main isSplatValue function.
+bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) {
+  EVT VT = V.getValueType();
+  assert(VT.isVector() && "Vector type expected");
+  unsigned NumElts = VT.getVectorNumElements();
+
+  APInt UndefElts;
+  APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+  return isSplatValue(V, DemandedElts, UndefElts) &&
+         (AllowUndefs || !UndefElts);
+}
+
 /// Helper function that checks to see if a node is a constant or a
 /// build vector of splat constants at least within the demanded elts.
 static ConstantSDNode *isConstOrDemandedConstSplat(SDValue N,
@@ -2195,6 +2291,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
 
   KnownBits Known2;
   unsigned NumElts = DemandedElts.getBitWidth();
+  assert((!Op.getValueType().isVector() ||
+          NumElts == Op.getValueType().getVectorNumElements()) &&
+         "Unexpected vector size");
 
   if (!DemandedElts)
     return Known;  // No demanded elts, better to assume we don't know anything.
@@ -2203,8 +2302,6 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
   switch (Opcode) {
   case ISD::BUILD_VECTOR:
     // Collect the known bits that are shared by every demanded vector element.
-    assert(NumElts == Op.getValueType().getVectorNumElements() &&
-           "Unexpected vector size");
     Known.Zero.setAllBits(); Known.One.setAllBits();
     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
       if (!DemandedElts[i])
@@ -2344,6 +2441,19 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     }
     break;
   }
+  case ISD::SCALAR_TO_VECTOR: {
+    // We know about scalar_to_vector as much as we know about it source,
+    // which becomes the first element of otherwise unknown vector.
+    if (DemandedElts != 1)
+      break;
+
+    SDValue N0 = Op.getOperand(0);
+    Known = computeKnownBits(N0, Depth + 1);
+    if (N0.getValueSizeInBits() != BitWidth)
+      Known = Known.trunc(BitWidth);
+
+    break;
+  }
   case ISD::BITCAST: {
     SDValue N0 = Op.getOperand(0);
     EVT SubVT = N0.getValueType();
@@ -2569,6 +2679,39 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
       Known.One.ashrInPlace(Shift);
     }
     break;
+  case ISD::FSHL:
+  case ISD::FSHR:
+    if (ConstantSDNode *C =
+            isConstOrDemandedConstSplat(Op.getOperand(2), DemandedElts)) {
+      unsigned Amt = C->getAPIntValue().urem(BitWidth);
+
+      // For fshl, 0-shift returns the 1st arg.
+      // For fshr, 0-shift returns the 2nd arg.
+      if (Amt == 0) {
+        Known = computeKnownBits(Op.getOperand(Opcode == ISD::FSHL ? 0 : 1),
+                                 DemandedElts, Depth + 1);
+        break;
+      }
+
+      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+      Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+      Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+      if (Opcode == ISD::FSHL) {
+        Known.One <<= Amt;
+        Known.Zero <<= Amt;
+        Known2.One.lshrInPlace(BitWidth - Amt);
+        Known2.Zero.lshrInPlace(BitWidth - Amt);
+      } else {
+        Known.One <<= BitWidth - Amt;
+        Known.Zero <<= BitWidth - Amt;
+        Known2.One.lshrInPlace(Amt);
+        Known2.Zero.lshrInPlace(Amt);
+      }
+      Known.One |= Known2.One;
+      Known.Zero |= Known2.Zero;
+    }
+    break;
   case ISD::SIGN_EXTEND_INREG: {
     EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
     unsigned EBits = EVT.getScalarSizeInBits();
@@ -4167,9 +4310,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::ZERO_EXTEND_VECTOR_INREG:
   case ISD::SIGN_EXTEND_VECTOR_INREG:
     assert(VT.isVector() && "This DAG node is restricted to vector types.");
-    assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
-           "The sizes of the input and result must match in order to perform the "
-           "extend in-register.");
+    assert(Operand.getValueType().bitsLE(VT) &&
+           "The input must be the same size or smaller than the result.");
     assert(VT.getVectorNumElements() <
              Operand.getValueType().getVectorNumElements() &&
            "The destination vector type must have fewer lanes than the input.");
@@ -4387,14 +4529,20 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
     if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst2))
       return FoldSymbolOffset(Opcode, VT, GA, Cst1);
 
-  // For vectors extract each constant element into Inputs so we can constant
-  // fold them individually.
-  BuildVectorSDNode *BV1 = dyn_cast<BuildVectorSDNode>(Cst1);
-  BuildVectorSDNode *BV2 = dyn_cast<BuildVectorSDNode>(Cst2);
-  if (!BV1 || !BV2)
+  // For vectors, extract each constant element and fold them individually.
+  // Either input may be an undef value.
+  auto *BV1 = dyn_cast<BuildVectorSDNode>(Cst1);
+  if (!BV1 && !Cst1->isUndef())
+    return SDValue();
+  auto *BV2 = dyn_cast<BuildVectorSDNode>(Cst2);
+  if (!BV2 && !Cst2->isUndef())
+    return SDValue();
+  // If both operands are undef, that's handled the same way as scalars.
+  if (!BV1 && !BV2)
     return SDValue();
 
-  assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!");
+  assert((!BV1 || !BV2 || BV1->getNumOperands() == BV2->getNumOperands()) &&
+         "Vector binop with different number of elements in operands?");
 
   EVT SVT = VT.getScalarType();
   EVT LegalSVT = SVT;
@@ -4404,10 +4552,10 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
       return SDValue();
   }
   SmallVector<SDValue, 4> Outputs;
-  for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) {
-    SDValue V1 = BV1->getOperand(I);
-    SDValue V2 = BV2->getOperand(I);
-
+  unsigned NumOps = BV1 ? BV1->getNumOperands() : BV2->getNumOperands();
+  for (unsigned I = 0; I != NumOps; ++I) {
+    SDValue V1 = BV1 ? BV1->getOperand(I) : getUNDEF(SVT);
+    SDValue V2 = BV2 ? BV2->getOperand(I) : getUNDEF(SVT);
     if (SVT.isInteger()) {
       if (V1->getValueType(0).bitsGT(SVT))
         V1 = getNode(ISD::TRUNCATE, DL, SVT, V1);
@@ -4635,6 +4783,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
+    if (SDValue V = simplifyShift(N1, N2))
+      return V;
+    LLVM_FALLTHROUGH;
   case ISD::ROTL:
   case ISD::ROTR:
     assert(VT == N1.getValueType() &&
@@ -4643,7 +4794,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
            "Shifts only work on integers");
     assert((!VT.isVector() || VT == N2.getValueType()) &&
            "Vector shift amounts must be in the same as their first arg");
-    // Verify that the shift amount VT is bit enough to hold valid shift
+    // Verify that the shift amount VT is big enough to hold valid shift
     // amounts.  This catches things like trying to shift an i1024 value by an
     // i8, which is easy to fall into in generic code that uses
     // TLI.getShiftAmount().
@@ -4691,8 +4842,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     assert(!EVT.isVector() &&
            "AssertSExt/AssertZExt type should be the vector element type "
            "rather than the vector type!");
-    assert(EVT.bitsLE(VT) && "Not extending!");
-    if (VT == EVT) return N1; // noop assertion.
+    assert(EVT.bitsLE(VT.getScalarType()) && "Not extending!");
+    if (VT.getScalarType() == EVT) return N1; // noop assertion.
     break;
   }
   case ISD::SIGN_EXTEND_INREG: {
@@ -4929,14 +5080,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     }
   }
 
-  // Any FP binop with an undef operand is folded to NaN. This matches the
-  // behavior of the IR optimizer.
   switch (Opcode) {
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:
   case ISD::FDIV:
   case ISD::FREM:
+    // If both operands are undef, the result is undef. If 1 operand is undef,
+    // the result is NaN. This should match the behavior of the IR optimizer.
+    if (N1.isUndef() && N2.isUndef())
+      return getUNDEF(VT);
     if (N1.isUndef() || N2.isUndef())
       return getConstantFP(APFloat::getNaN(EVTToAPFloatSemantics(VT)), DL, VT);
   }
@@ -4955,9 +5108,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
       case ISD::SDIV:
       case ISD::UREM:
       case ISD::SREM:
-      case ISD::SRA:
-      case ISD::SRL:
-      case ISD::SHL:
         return getConstant(0, DL, VT);    // fold op(undef, arg2) -> 0
       }
     }
@@ -4973,16 +5123,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
         return getConstant(0, DL, VT);
       LLVM_FALLTHROUGH;
     case ISD::ADD:
-    case ISD::ADDC:
-    case ISD::ADDE:
     case ISD::SUB:
     case ISD::UDIV:
     case ISD::SDIV:
     case ISD::UREM:
     case ISD::SREM:
-    case ISD::SRA:
-    case ISD::SRL:
-    case ISD::SHL:
       return getUNDEF(VT);       // fold op(arg1, undef) -> undef
     case ISD::MUL:
     case ISD::AND:
@@ -5078,13 +5223,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     break;
   }
   case ISD::SELECT:
-    if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
-     if (N1C->getZExtValue())
-       return N2;             // select true, X, Y -> X
-     return N3;             // select false, X, Y -> Y
-    }
-
-    if (N2 == N3) return N2;   // select C, X, X -> X
+  case ISD::VSELECT:
+    if (SDValue V = simplifySelect(N1, N2, N3))
+      return V;
     break;
   case ISD::VECTOR_SHUFFLE:
     llvm_unreachable("should use getVectorShuffle constructor!");
@@ -5383,12 +5524,10 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
 
       // If the new VT cannot cover all of the remaining bits, then consider
       // issuing a (or a pair of) unaligned and overlapping load / store.
-      // FIXME: Only does this for 64-bit or more since we don't have proper
-      // cost model for unaligned load / store.
       bool Fast;
-      if (NumMemOps && AllowOverlap &&
-          VTSize >= 8 && NewVTSize < Size &&
-          TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) && Fast)
+      if (NumMemOps && AllowOverlap && NewVTSize < Size &&
+          TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) &&
+          Fast)
         VTSize = Size;
       else {
         VT = NewVT;
@@ -6784,6 +6923,60 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
   return V;
 }
 
+SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) {
+  // select undef, T, F --> T (if T is a constant), otherwise F
+  // select, ?, undef, F --> F
+  // select, ?, T, undef --> T
+  if (Cond.isUndef())
+    return isConstantValueOfAnyType(T) ? T : F;
+  if (T.isUndef())
+    return F;
+  if (F.isUndef())
+    return T;
+
+  // select true, T, F --> T
+  // select false, T, F --> F
+  if (auto *CondC = dyn_cast<ConstantSDNode>(Cond))
+    return CondC->isNullValue() ? F : T;
+
+  // TODO: This should simplify VSELECT with constant condition using something
+  // like this (but check boolean contents to be complete?):
+  //  if (ISD::isBuildVectorAllOnes(Cond.getNode()))
+  //    return T;
+  //  if (ISD::isBuildVectorAllZeros(Cond.getNode()))
+  //    return F;
+
+  // select ?, T, T --> T
+  if (T == F)
+    return T;
+
+  return SDValue();
+}
+
+SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
+  // shift undef, Y --> 0 (can always assume that the undef value is 0)
+  if (X.isUndef())
+    return getConstant(0, SDLoc(X.getNode()), X.getValueType());
+  // shift X, undef --> undef (because it may shift by the bitwidth)
+  if (Y.isUndef())
+    return getUNDEF(X.getValueType());
+
+  // shift 0, Y --> 0
+  // shift X, 0 --> X
+  if (isNullOrNullSplat(X) || isNullOrNullSplat(Y))
+    return X;
+
+  // shift X, C >= bitwidth(X) --> undef
+  // All vector elements must be too big to avoid partial undefs.
+  auto isShiftTooBig = [X](ConstantSDNode *Val) {
+    return Val->getAPIntValue().uge(X.getScalarValueSizeInBits());
+  };
+  if (ISD::matchUnaryPredicate(Y, isShiftTooBig))
+    return getUNDEF(X.getValueType());
+
+  return SDValue();
+}
+
 SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
                                SDValue Ptr, SDValue SV, unsigned Align) {
   SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) };
@@ -7039,7 +7232,7 @@ SDVTList SelectionDAG::getVTList(ArrayRef<EVT> VTs) {
   SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
   if (!Result) {
     EVT *Array = Allocator.Allocate<EVT>(NumVTs);
-    std::copy(VTs.begin(), VTs.end(), Array);
+    llvm::copy(VTs, Array);
     Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs);
     VTListMap.InsertNode(Result, IP);
   }
@@ -7185,7 +7378,7 @@ void SelectionDAG::setNodeMemRefs(MachineSDNode *N,
 
   MachineMemOperand **MemRefsBuffer =
       Allocator.template Allocate<MachineMemOperand *>(NewMemRefs.size());
-  std::copy(NewMemRefs.begin(), NewMemRefs.end(), MemRefsBuffer);
+  llvm::copy(NewMemRefs, MemRefsBuffer);
   N->MemRefs = MemRefsBuffer;
   N->NumMemRefs = static_cast<int>(NewMemRefs.size());
 }
@@ -7674,8 +7867,11 @@ void SelectionDAG::transferDbgValues(SDValue From, SDValue To,
                     Dbg->getDebugLoc(), Dbg->getOrder());
     ClonedDVs.push_back(Clone);
 
-    if (InvalidateDbg)
+    if (InvalidateDbg) {
+      // Invalidate value and indicate the SDDbgValue should not be emitted.
       Dbg->setIsInvalidated();
+      Dbg->setIsEmitted();
+    }
   }
 
   for (SDDbgValue *Dbg : ClonedDVs)
@@ -7712,6 +7908,7 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) {
                         DV->isIndirect(), DV->getDebugLoc(), DV->getOrder());
         ClonedDVs.push_back(Clone);
         DV->setIsInvalidated();
+        DV->setIsEmitted();
         LLVM_DEBUG(dbgs() << "SALVAGE: Rewriting";
                    N0.getNode()->dumprFull(this);
                    dbgs() << " into " << *DIExpr << '\n');
@@ -8319,6 +8516,26 @@ ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N, bool AllowUndefs) {
   return nullptr;
 }
 
+bool llvm::isNullOrNullSplat(SDValue N) {
+  // TODO: may want to use peekThroughBitcast() here.
+  ConstantSDNode *C = isConstOrConstSplat(N);
+  return C && C->isNullValue();
+}
+
+bool llvm::isOneOrOneSplat(SDValue N) {
+  // TODO: may want to use peekThroughBitcast() here.
+  unsigned BitWidth = N.getScalarValueSizeInBits();
+  ConstantSDNode *C = isConstOrConstSplat(N);
+  return C && C->isOne() && C->getValueSizeInBits(0) == BitWidth;
+}
+
+bool llvm::isAllOnesOrAllOnesSplat(SDValue N) {
+  N = peekThroughBitcasts(N);
+  unsigned BitWidth = N.getScalarValueSizeInBits();
+  ConstantSDNode *C = isConstOrConstSplat(N);
+  return C && C->isAllOnesValue() && C->getValueSizeInBits(0) == BitWidth;
+}
+
 HandleSDNode::~HandleSDNode() {
   DropOperands();
 }
@@ -8943,8 +9160,11 @@ SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) {
 
 void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
   assert(!Node->OperandList && "Node already has operands");
+  assert(std::numeric_limits<decltype(SDNode::NumOperands)>::max() >
+             Vals.size() &&
+         "too many operands to fit into SDNode");
   SDUse *Ops = OperandRecycler.allocate(
-    ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
+      ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
 
   bool IsDivergent = false;
   for (unsigned I = 0; I != Vals.size(); ++I) {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index 8c57f18183e95777dd6210dd7440225559509117..488bac1a9a80013b17c4fd6eb9ef0a09129f2a3f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -107,14 +107,14 @@ BaseIndexOffset BaseIndexOffset::match(const LSBaseSDNode *N,
       if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1)))
         if (DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue())) {
           Offset += C->getSExtValue();
-          Base = Base->getOperand(0);
+          Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0));
           continue;
         }
       break;
     case ISD::ADD:
       if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
         Offset += C->getSExtValue();
-        Base = Base->getOperand(0);
+        Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0));
         continue;
       }
       break;
@@ -130,7 +130,7 @@ BaseIndexOffset BaseIndexOffset::match(const LSBaseSDNode *N,
             Offset -= Off;
           else
             Offset += Off;
-          Base = LSBase->getBasePtr();
+          Base = DAG.getTargetLoweringInfo().unwrapAddress(LSBase->getBasePtr());
           continue;
         }
       break;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index bf24d7f75622747e28589419c2c64f2ffe1c8734..c42d2491243553ff306423622fdf4f7a34b2ed2c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -823,7 +823,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
       // If the source register was virtual and if we know something about it,
       // add an assert node.
       if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) ||
-          !RegisterVT.isInteger() || RegisterVT.isVector())
+          !RegisterVT.isInteger())
         continue;
 
       const FunctionLoweringInfo::LiveOutInfo *LOI =
@@ -831,7 +831,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
       if (!LOI)
         continue;
 
-      unsigned RegSize = RegisterVT.getSizeInBits();
+      unsigned RegSize = RegisterVT.getScalarSizeInBits();
       unsigned NumSignBits = LOI->NumSignBits;
       unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();
 
@@ -1032,8 +1032,19 @@ SDValue SelectionDAGBuilder::getRoot() {
   }
 
   // Otherwise, we have to make a token factor node.
-  SDValue Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
-                             PendingLoads);
+  // If we have >= 2^16 loads then split across multiple token factors as
+  // there's a 64k limit on the number of SDNode operands.
+  SDValue Root;
+  size_t Limit = (1 << 16) - 1;
+  while (PendingLoads.size() > Limit) {
+    unsigned SliceIdx = PendingLoads.size() - Limit;
+    auto ExtractedTFs = ArrayRef<SDValue>(PendingLoads).slice(SliceIdx, Limit);
+    SDValue NewTF =
+        DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, ExtractedTFs);
+    PendingLoads.erase(PendingLoads.begin() + SliceIdx, PendingLoads.end());
+    PendingLoads.emplace_back(NewTF);
+  }
+  Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, PendingLoads);
   PendingLoads.clear();
   DAG.setRoot(Root);
   return Root;
@@ -2801,6 +2812,15 @@ static bool isVectorReductionOp(const User *I) {
   return ReduxExtracted;
 }
 
+void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) {
+  SDNodeFlags Flags;
+
+  SDValue Op = getValue(I.getOperand(0));
+  SDValue UnNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op.getValueType(),
+                                    Op, Flags);
+  setValue(&I, UnNodeValue);
+}
+
 void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
   SDNodeFlags Flags;
   if (auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
@@ -5292,7 +5312,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
       return nullptr;
 
     SDDbgValue *SDV;
-    if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V)) {
+    if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V) ||
+        isa<ConstantPointerNull>(V)) {
       SDV = DAG.getConstantDbgValue(Variable, Expression, V, dl, SDNodeOrder);
       DAG.AddDbgValue(SDV, nullptr, false);
       return nullptr;
@@ -5731,6 +5752,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     SDValue Zero = DAG.getConstant(0, sdl, VT);
     SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
 
+    auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
+    if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
+      setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
+      return nullptr;
+    }
+
     // When X == Y, this is rotate. If the data type has a power-of-2 size, we
     // avoid the select that is necessary in the general case to filter out
     // the 0-shift possibility that leads to UB.
@@ -5805,6 +5832,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2));
     return nullptr;
   }
+  case Intrinsic::smul_fix: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    setValue(&I,
+             DAG.getNode(ISD::SMULFIX, sdl, Op1.getValueType(), Op1, Op2, Op3));
+    return nullptr;
+  }
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
     Res = DAG.getNode(
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 4b5dda982f1b5c65252cfac4bf251b88f766d403..5f9cdb69daf72d0d017ace630f7f6f7bed163951 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -854,6 +854,9 @@ private:
   void visitInvoke(const InvokeInst &I);
   void visitResume(const ResumeInst &I);
 
+  void visitUnary(const User &I, unsigned Opcode);
+  void visitFNeg(const User &I) { visitUnary(I, ISD::FNEG); }
+
   void visitBinary(const User &I, unsigned Opcode);
   void visitShift(const User &I, unsigned Opcode);
   void visitAdd(const User &I)  { visitBinary(I, ISD::ADD); }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 02d45df5864d906aadf12959eff1ab9cfe64b3e9..43df2abb674bebb53530040b91156b783c121efb 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -237,6 +237,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::SRL:                        return "srl";
   case ISD::ROTL:                       return "rotl";
   case ISD::ROTR:                       return "rotr";
+  case ISD::FSHL:                       return "fshl";
+  case ISD::FSHR:                       return "fshr";
   case ISD::FADD:                       return "fadd";
   case ISD::STRICT_FADD:                return "strict_fadd";
   case ISD::FSUB:                       return "fsub";
@@ -295,6 +297,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::UADDSAT:                    return "uaddsat";
   case ISD::SSUBSAT:                    return "ssubsat";
   case ISD::USUBSAT:                    return "usubsat";
+  case ISD::SMULFIX:                    return "smulfix";
 
   // Conversion operators.
   case ISD::SIGN_EXTEND:                return "sign_extend";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index dca358032fb59fec6881a27b0a3ac9a136cc15fb..7d341dae210b1e532b6cfc3654dc50345e2ddc50 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -177,7 +177,8 @@ static const bool ViewDAGCombine1 = false,
 /// RegisterScheduler class - Track the registration of instruction schedulers.
 ///
 //===---------------------------------------------------------------------===//
-MachinePassRegistry RegisterScheduler::Registry;
+MachinePassRegistry<RegisterScheduler::FunctionPassCtor>
+    RegisterScheduler::Registry;
 
 //===---------------------------------------------------------------------===//
 ///
@@ -696,10 +697,10 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() {
     if (!TargetRegisterInfo::isVirtualRegister(DestReg))
       continue;
 
-    // Ignore non-scalar or non-integer values.
+    // Ignore non-integer values.
     SDValue Src = N->getOperand(2);
     EVT SrcVT = Src.getValueType();
-    if (!SrcVT.isInteger() || SrcVT.isVector())
+    if (!SrcVT.isInteger())
       continue;
 
     unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src);
@@ -3206,6 +3207,18 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
                                 N.getNode()))
         break;
       continue;
+    case OPC_CheckPredicateWithOperands: {
+      unsigned OpNum = MatcherTable[MatcherIndex++];
+      SmallVector<SDValue, 8> Operands;
+
+      for (unsigned i = 0; i < OpNum; ++i)
+        Operands.push_back(RecordedNodes[MatcherTable[MatcherIndex++]].first);
+
+      unsigned PredNo = MatcherTable[MatcherIndex++];
+      if (!CheckNodePredicateWithOperands(N.getNode(), PredNo, Operands))
+        break;
+      continue;
+    }
     case OPC_CheckComplexPat: {
       unsigned CPNum = MatcherTable[MatcherIndex++];
       unsigned RecNo = MatcherTable[MatcherIndex++];
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 54cbd6859f7031eb8682c5d72891f761af8d89f4..90a1b350fc942e1bbb426066c7c71c3ecd83d3b8 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -522,7 +522,16 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
   // The vm state arguments are lowered in an opaque manner.  We do not know
   // what type of values are contained within.
   for (const Value *V : SI.DeoptState) {
-    SDValue Incoming = Builder.getValue(V);
+    SDValue Incoming;
+    // If this is a function argument at a static frame index, generate it as
+    // the frame index.
+    if (const Argument *Arg = dyn_cast<Argument>(V)) {
+      int FI = Builder.FuncInfo.getArgumentFrameIndex(Arg);
+      if (FI != INT_MAX)
+        Incoming = Builder.DAG.getFrameIndex(FI, Builder.getFrameIndexTy());
+    }
+    if (!Incoming.getNode())
+      Incoming = Builder.getValue(V);
     const bool LiveInValue = LiveInDeopt && !isGCValue(V);
     lowerIncomingStatepointValue(Incoming, LiveInValue, Ops, Builder);
   }
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 2bc9090428b4a45a7e1c58091f5c29f7ff0a45cc..4c551d5b231ff8f56b0fd6fac193a2e9ee1a0ce9 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -496,23 +496,41 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
   return Simplified;
 }
 
+bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+                                          KnownBits &Known,
+                                          TargetLoweringOpt &TLO,
+                                          unsigned Depth,
+                                          bool AssumeSingleUse) const {
+  EVT VT = Op.getValueType();
+  APInt DemandedElts = VT.isVector()
+                           ? APInt::getAllOnesValue(VT.getVectorNumElements())
+                           : APInt(1, 1);
+  return SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, Depth,
+                              AssumeSingleUse);
+}
+
 /// Look at Op. At this point, we know that only the OriginalDemandedBits of the
 /// result of Op are ever used downstream. If we can use this information to
 /// simplify Op, create a new simplified DAG node and return true, returning the
 /// original and new nodes in Old and New. Otherwise, analyze the expression and
 /// return a mask of Known bits for the expression (used to simplify the
 /// caller).  The Known bits may only be accurate for those bits in the
-/// DemandedMask.
-bool TargetLowering::SimplifyDemandedBits(SDValue Op,
-                                          const APInt &OriginalDemandedBits,
-                                          KnownBits &Known,
-                                          TargetLoweringOpt &TLO,
-                                          unsigned Depth,
-                                          bool AssumeSingleUse) const {
+/// OriginalDemandedBits and OriginalDemandedElts.
+bool TargetLowering::SimplifyDemandedBits(
+    SDValue Op, const APInt &OriginalDemandedBits,
+    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+    unsigned Depth, bool AssumeSingleUse) const {
   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
   assert(Op.getScalarValueSizeInBits() == BitWidth &&
          "Mask size mismatches value type size!");
+
+  unsigned NumElts = OriginalDemandedElts.getBitWidth();
+  assert((!Op.getValueType().isVector() ||
+          NumElts == Op.getValueType().getVectorNumElements()) &&
+         "Unexpected vector size");
+
   APInt DemandedBits = OriginalDemandedBits;
+  APInt DemandedElts = OriginalDemandedElts;
   SDLoc dl(Op);
   auto &DL = TLO.DAG.getDataLayout();
 
@@ -532,18 +550,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     if (Depth != 0) {
       // If not at the root, Just compute the Known bits to
       // simplify things downstream.
-      TLO.DAG.computeKnownBits(Op, Known, Depth);
+      TLO.DAG.computeKnownBits(Op, Known, DemandedElts, Depth);
       return false;
     }
     // If this is the root being simplified, allow it to have multiple uses,
-    // just set the DemandedBits to all bits.
+    // just set the DemandedBits/Elts to all bits.
     DemandedBits = APInt::getAllOnesValue(BitWidth);
-  } else if (OriginalDemandedBits == 0) {
-    // Not demanding any bits from Op.
+    DemandedElts = APInt::getAllOnesValue(NumElts);
+  } else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) {
+    // Not demanding any bits/elts from Op.
     if (!Op.isUndef())
       return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
     return false;
-  } else if (Depth == 6) {        // Limit search depth.
+  } else if (Depth == 6) { // Limit search depth.
     return false;
   }
 
@@ -573,18 +592,71 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       Known.One &= Known2.One;
       Known.Zero &= Known2.Zero;
     }
-    return false;   // Don't fall through, will infinitely loop.
-  case ISD::CONCAT_VECTORS:
+    return false; // Don't fall through, will infinitely loop.
+  case ISD::CONCAT_VECTORS: {
     Known.Zero.setAllBits();
     Known.One.setAllBits();
-    for (SDValue SrcOp : Op->ops()) {
-      if (SimplifyDemandedBits(SrcOp, DemandedBits, Known2, TLO, Depth + 1))
+    EVT SubVT = Op.getOperand(0).getValueType();
+    unsigned NumSubVecs = Op.getNumOperands();
+    unsigned NumSubElts = SubVT.getVectorNumElements();
+    for (unsigned i = 0; i != NumSubVecs; ++i) {
+      APInt DemandedSubElts =
+          DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+      if (SimplifyDemandedBits(Op.getOperand(i), DemandedBits, DemandedSubElts,
+                               Known2, TLO, Depth + 1))
         return true;
-      // Known bits are the values that are shared by every subvector.
-      Known.One &= Known2.One;
-      Known.Zero &= Known2.Zero;
+      // Known bits are shared by every demanded subvector element.
+      if (!!DemandedSubElts) {
+        Known.One &= Known2.One;
+        Known.Zero &= Known2.Zero;
+      }
+    }
+    break;
+  }
+  case ISD::VECTOR_SHUFFLE: {
+    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+    // Collect demanded elements from shuffle operands..
+    APInt DemandedLHS(NumElts, 0);
+    APInt DemandedRHS(NumElts, 0);
+    for (unsigned i = 0; i != NumElts; ++i) {
+      if (!DemandedElts[i])
+        continue;
+      int M = ShuffleMask[i];
+      if (M < 0) {
+        // For UNDEF elements, we don't know anything about the common state of
+        // the shuffle result.
+        DemandedLHS.clearAllBits();
+        DemandedRHS.clearAllBits();
+        break;
+      }
+      assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
+      if (M < (int)NumElts)
+        DemandedLHS.setBit(M);
+      else
+        DemandedRHS.setBit(M - NumElts);
+    }
+
+    if (!!DemandedLHS || !!DemandedRHS) {
+      Known.Zero.setAllBits();
+      Known.One.setAllBits();
+      if (!!DemandedLHS) {
+        if (SimplifyDemandedBits(Op.getOperand(0), DemandedBits, DemandedLHS,
+                                 Known2, TLO, Depth + 1))
+          return true;
+        Known.One &= Known2.One;
+        Known.Zero &= Known2.Zero;
+      }
+      if (!!DemandedRHS) {
+        if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedRHS,
+                                 Known2, TLO, Depth + 1))
+          return true;
+        Known.One &= Known2.One;
+        Known.Zero &= Known2.Zero;
+      }
     }
     break;
+  }
   case ISD::AND: {
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
@@ -596,7 +668,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     if (ConstantSDNode *RHSC = isConstOrConstSplat(Op1)) {
       KnownBits LHSKnown;
       // Do not increment Depth here; that can cause an infinite loop.
-      TLO.DAG.computeKnownBits(Op0, LHSKnown, Depth);
+      TLO.DAG.computeKnownBits(Op0, LHSKnown, DemandedElts, Depth);
       // If the LHS already has zeros where RHSC does, this 'and' is dead.
       if ((LHSKnown.Zero & DemandedBits) ==
           (~RHSC->getAPIntValue() & DemandedBits))
@@ -619,10 +691,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       }
     }
 
-    if (SimplifyDemandedBits(Op1, DemandedBits, Known, TLO, Depth + 1))
+    if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, Depth + 1))
       return true;
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
-    if (SimplifyDemandedBits(Op0, ~Known.Zero & DemandedBits, Known2, TLO,
+    if (SimplifyDemandedBits(Op0, ~Known.Zero & DemandedBits, DemandedElts, Known2, TLO,
                              Depth + 1))
       return true;
     assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
@@ -653,10 +725,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
 
-    if (SimplifyDemandedBits(Op1, DemandedBits, Known, TLO, Depth + 1))
+    if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, Depth + 1))
       return true;
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
-    if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, Known2, TLO, Depth + 1))
+    if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, DemandedElts, Known2, TLO,
+                             Depth + 1))
       return true;
     assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
 
@@ -683,10 +756,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
 
-    if (SimplifyDemandedBits(Op1, DemandedBits, Known, TLO, Depth + 1))
+    if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, Depth + 1))
       return true;
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
-    if (SimplifyDemandedBits(Op0, DemandedBits, Known2, TLO, Depth + 1))
+    if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known2, TLO, Depth + 1))
       return true;
     assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
 
@@ -840,7 +913,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
         }
       }
 
-      if (SimplifyDemandedBits(Op0, DemandedBits.lshr(ShAmt), Known, TLO,
+      if (SimplifyDemandedBits(Op0, DemandedBits.lshr(ShAmt), DemandedElts, Known, TLO,
                                Depth + 1))
         return true;
 
@@ -935,7 +1008,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       }
 
       // Compute the new bits that are at the top now.
-      if (SimplifyDemandedBits(Op0, InDemandedMask, Known, TLO, Depth + 1))
+      if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1))
         return true;
       assert(!Known.hasConflict() && "Bits known to be one AND zero?");
       Known.Zero.lshrInPlace(ShAmt);
@@ -974,7 +1047,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       if (DemandedBits.countLeadingZeros() < ShAmt)
         InDemandedMask.setSignBit();
 
-      if (SimplifyDemandedBits(Op0, InDemandedMask, Known, TLO, Depth + 1))
+      if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1))
         return true;
       assert(!Known.hasConflict() && "Bits known to be one AND zero?");
       Known.Zero.lshrInPlace(ShAmt);
@@ -1084,19 +1157,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     break;
   }
   case ISD::ZERO_EXTEND: {
-    unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
+    SDValue Src = Op.getOperand(0);
+    unsigned InBits = Src.getScalarValueSizeInBits();
 
     // If none of the top bits are demanded, convert this into an any_extend.
-    if (DemandedBits.getActiveBits() <= OperandBitWidth)
-      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
-                                               Op.getOperand(0)));
+    if (DemandedBits.getActiveBits() <= InBits)
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, Src));
 
-    APInt InMask = DemandedBits.trunc(OperandBitWidth);
-    if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1))
+    APInt InDemandedBits = DemandedBits.trunc(InBits);
+    if (SimplifyDemandedBits(Src, InDemandedBits, Known, TLO, Depth+1))
       return true;
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
     Known = Known.zext(BitWidth);
-    Known.Zero.setBitsFrom(OperandBitWidth);
+    Known.Zero.setBitsFrom(InBits);
     break;
   }
   case ISD::SIGN_EXTEND: {
@@ -1143,9 +1216,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     break;
   }
   case ISD::ANY_EXTEND: {
-    unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
-    APInt InMask = DemandedBits.trunc(OperandBitWidth);
-    if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1))
+    SDValue Src = Op.getOperand(0);
+    unsigned InBits = Src.getScalarValueSizeInBits();
+    APInt InDemandedBits = DemandedBits.trunc(InBits);
+    if (SimplifyDemandedBits(Src, InDemandedBits, Known, TLO, Depth+1))
       return true;
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
     Known = Known.zext(BitWidth);
@@ -1219,6 +1293,33 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     Known.Zero |= ~InMask;
     break;
   }
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Src = Op.getOperand(0);
+    SDValue Idx = Op.getOperand(1);
+    unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+    unsigned EltBitWidth = Src.getScalarValueSizeInBits();
+
+    // Demand the bits from every vector element without a constant index.
+    APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+    if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx))
+      if (CIdx->getAPIntValue().ult(NumSrcElts))
+        DemandedSrcElts = APInt::getOneBitSet(NumSrcElts, CIdx->getZExtValue());
+
+    // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
+    // anything about the extended bits.
+    APInt DemandedSrcBits = DemandedBits;
+    if (BitWidth > EltBitWidth)
+      DemandedSrcBits = DemandedSrcBits.trunc(EltBitWidth);
+
+    if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, Known2, TLO,
+                             Depth + 1))
+      return true;
+
+    Known = Known2;
+    if (BitWidth > EltBitWidth)
+      Known = Known.zext(BitWidth);
+    break;
+  }
   case ISD::BITCAST: {
     SDValue Src = Op.getOperand(0);
     EVT SrcVT = Src.getValueType();
@@ -1247,8 +1348,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
                              TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
       }
     }
-    // If bitcast from a vector and the mask covers entire elements, see if we
-    // can use SimplifyDemandedVectorElts.
+    // If bitcast from a vector, see if we can use SimplifyDemandedVectorElts by
+    // demanding the element if any bits from it are demanded.
     // TODO - bigendian once we have test coverage.
     // TODO - bool vectors once SimplifyDemandedVectorElts has SETCC support.
     if (SrcVT.isVector() && NumSrcEltBits > 1 &&
@@ -1260,10 +1361,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
         for (unsigned i = 0; i != Scale; ++i) {
           unsigned Offset = i * NumSrcEltBits;
           APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset);
-          if (Sub.isAllOnesValue())
+          if (!Sub.isNullValue())
             DemandedSubElts.setBit(i);
-          else if (!Sub.isNullValue())
-            return false;
         }
         return true;
       };
@@ -1295,8 +1394,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
     unsigned DemandedBitsLZ = DemandedBits.countLeadingZeros();
     APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
-    if (SimplifyDemandedBits(Op0, LoMask, Known2, TLO, Depth + 1) ||
-        SimplifyDemandedBits(Op1, LoMask, Known2, TLO, Depth + 1) ||
+    if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, Known2, TLO, Depth + 1) ||
+        SimplifyDemandedBits(Op1, LoMask, DemandedElts, Known2, TLO, Depth + 1) ||
         // See if the operation should be performed at a smaller bit width.
         ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
       SDNodeFlags Flags = Op.getNode()->getFlags();
@@ -1336,14 +1435,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
   }
   default:
     if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
-      if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, Known, TLO,
-                                            Depth))
+      if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, DemandedElts,
+                                            Known, TLO, Depth))
         return true;
       break;
     }
 
     // Just use computeKnownBits to compute output bits.
-    TLO.DAG.computeKnownBits(Op, Known, Depth);
+    TLO.DAG.computeKnownBits(Op, Known, DemandedElts, Depth);
     break;
   }
 
@@ -1360,7 +1459,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
         if (C->isOpaque())
           return false;
     }
-    return TLO.CombineTo(Op, TLO.DAG.getConstant(Known.One, dl, VT));
+    // TODO: Handle float bits as well.
+    if (VT.isInteger())
+      return TLO.CombineTo(Op, TLO.DAG.getConstant(Known.One, dl, VT));
   }
 
   return false;
@@ -1459,6 +1560,23 @@ bool TargetLowering::SimplifyDemandedVectorElts(
                                      TLO, Depth + 1))
         return true;
 
+      // Try calling SimplifyDemandedBits, converting demanded elts to the bits
+      // of the large element.
+      // TODO - bigendian once we have test coverage.
+      if (TLO.DAG.getDataLayout().isLittleEndian()) {
+        unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits();
+        APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits);
+        for (unsigned i = 0; i != NumElts; ++i)
+          if (DemandedElts[i]) {
+            unsigned Ofs = (i % Scale) * EltSizeInBits;
+            SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits);
+          }
+
+        KnownBits Known;
+        if (SimplifyDemandedBits(Src, SrcDemandedBits, Known, TLO, Depth + 1))
+          return true;
+      }
+
       // If the src element is zero/undef then all the output elements will be -
       // only demanded elements are guaranteed to be correct.
       for (unsigned i = 0; i != NumSrcElts; ++i) {
@@ -1551,7 +1669,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     EVT SubVT = Sub.getValueType();
     unsigned NumSubElts = SubVT.getVectorNumElements();
     const APInt& Idx = cast<ConstantSDNode>(Op.getOperand(2))->getAPIntValue();
-    if (Idx.uge(NumElts - NumSubElts))
+    if (Idx.ugt(NumElts - NumSubElts))
       break;
     unsigned SubIdx = Idx.getZExtValue();
     APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
@@ -1596,9 +1714,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
       unsigned Idx = CIdx->getZExtValue();
       if (!DemandedElts[Idx])
         return TLO.CombineTo(Op, Vec);
-      DemandedElts.clearBit(Idx);
 
-      if (SimplifyDemandedVectorElts(Vec, DemandedElts, KnownUndef,
+      APInt DemandedVecElts(DemandedElts);
+      DemandedVecElts.clearBit(Idx);
+      if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
                                      KnownZero, TLO, Depth + 1))
         return true;
 
@@ -1718,6 +1837,21 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     }
     break;
   }
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG: {
+    APInt SrcUndef, SrcZero;
+    SDValue Src = Op.getOperand(0);
+    unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+    APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts);
+    if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef,
+                                   SrcZero, TLO, Depth + 1))
+      return true;
+    KnownZero = SrcZero.zextOrTrunc(NumElts);
+    KnownUndef = SrcUndef.zextOrTrunc(NumElts);
+    break;
+  }
+  case ISD::OR:
+  case ISD::XOR:
   case ISD::ADD:
   case ISD::SUB:
   case ISD::FADD:
@@ -1736,21 +1870,51 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     KnownUndef &= SrcUndef;
     break;
   }
+  case ISD::AND: {
+    APInt SrcUndef, SrcZero;
+    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
+                                   SrcZero, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
+                                   KnownZero, TLO, Depth + 1))
+      return true;
+
+    // If either side has a zero element, then the result element is zero, even
+    // if the other is an UNDEF.
+    KnownZero |= SrcZero;
+    KnownUndef &= SrcUndef;
+    KnownUndef &= ~KnownZero;
+    break;
+  }
   case ISD::TRUNCATE:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
                                    KnownZero, TLO, Depth + 1))
       return true;
     break;
   default: {
-    if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
+    if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
       if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
                                                   KnownZero, TLO, Depth))
         return true;
+    } else {
+      KnownBits Known;
+      APInt DemandedBits = APInt::getAllOnesValue(EltSizeInBits);
+      if (SimplifyDemandedBits(Op, DemandedBits, DemandedEltMask, Known, TLO,
+                               Depth, AssumeSingleUse))
+        return true;
+    }
     break;
   }
   }
-
   assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero");
+
+  // Constant fold all undef cases.
+  // TODO: Handle zero cases as well.
+  if (DemandedElts.isSubsetOf(KnownUndef))
+    return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+
   return false;
 }
 
@@ -1811,18 +1975,14 @@ bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
 }
 
 bool TargetLowering::SimplifyDemandedBitsForTargetNode(
-    SDValue Op, const APInt &DemandedBits, KnownBits &Known,
-    TargetLoweringOpt &TLO, unsigned Depth) const {
+    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+    KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
   assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
           Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
           Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
           Op.getOpcode() == ISD::INTRINSIC_VOID) &&
          "Should use SimplifyDemandedBits if you don't know whether Op"
          " is a target node!");
-  EVT VT = Op.getValueType();
-  APInt DemandedElts = VT.isVector()
-                           ? APInt::getAllOnesValue(VT.getVectorNumElements())
-                           : APInt(1, 1);
   computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
   return false;
 }
@@ -2963,8 +3123,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
 
 /// Returns true (and the GlobalValue and the offset) if the node is a
 /// GlobalAddress + offset.
-bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA,
+bool TargetLowering::isGAPlusOffset(SDNode *WN, const GlobalValue *&GA,
                                     int64_t &Offset) const {
+
+  SDNode *N = unwrapAddress(SDValue(WN, 0)).getNode();
+
   if (auto *GASD = dyn_cast<GlobalAddressSDNode>(N)) {
     GA = GASD->getGlobal();
     Offset += GASD->getOffset();
@@ -4028,8 +4191,17 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl,
   if (!MakeMUL_LOHI(LH, RL, Lo, Hi, false))
     return false;
 
-  Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next,
-                     Merge(Lo, Hi));
+  SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
+  EVT BoolType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+  bool UseGlue = (isOperationLegalOrCustom(ISD::ADDC, VT) &&
+                  isOperationLegalOrCustom(ISD::ADDE, VT));
+  if (UseGlue)
+    Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next,
+                       Merge(Lo, Hi));
+  else
+    Next = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(VT, BoolType), Next,
+                       Merge(Lo, Hi), DAG.getConstant(0, dl, BoolType));
 
   SDValue Carry = Next.getValue(1);
   Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
@@ -4038,9 +4210,13 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl,
   if (!MakeMUL_LOHI(LH, RH, Lo, Hi, Opcode == ISD::SMUL_LOHI))
     return false;
 
-  SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
-  Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero,
-                   Carry);
+  if (UseGlue)
+    Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero,
+                     Carry);
+  else
+    Hi = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(HiLoVT, BoolType), Hi,
+                     Zero, Carry);
+
   Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));
 
   if (Opcode == ISD::SMUL_LOHI) {
@@ -4075,6 +4251,99 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
   return Ok;
 }
 
+bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
+                                       SelectionDAG &DAG) const {
+  EVT VT = Node->getValueType(0);
+
+  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) ||
+                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
+    return false;
+
+  // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+  // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+  SDValue X = Node->getOperand(0);
+  SDValue Y = Node->getOperand(1);
+  SDValue Z = Node->getOperand(2);
+
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  bool IsFSHL = Node->getOpcode() == ISD::FSHL;
+  SDLoc DL(SDValue(Node, 0));
+
+  EVT ShVT = Z.getValueType();
+  SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
+  SDValue Zero = DAG.getConstant(0, DL, ShVT);
+
+  SDValue ShAmt;
+  if (isPowerOf2_32(EltSizeInBits)) {
+    SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
+    ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
+  } else {
+    ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
+  }
+
+  SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
+  SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
+  SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
+
+  // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
+  // and that is undefined. We must compare and select to avoid UB.
+  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);
+
+  // For fshl, 0-shift returns the 1st arg (X).
+  // For fshr, 0-shift returns the 2nd arg (Y).
+  SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
+  Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
+  return true;
+}
+
+// TODO: Merge with expandFunnelShift.
+bool TargetLowering::expandROT(SDNode *Node, SDValue &Result,
+                               SelectionDAG &DAG) const {
+  EVT VT = Node->getValueType(0);
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  bool IsLeft = Node->getOpcode() == ISD::ROTL;
+  SDValue Op0 = Node->getOperand(0);
+  SDValue Op1 = Node->getOperand(1);
+  SDLoc DL(SDValue(Node, 0));
+
+  EVT ShVT = Op1.getValueType();
+  SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
+
+  // If a rotate in the other direction is legal, use it.
+  unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL;
+  if (isOperationLegal(RevRot, VT)) {
+    SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, Op1);
+    Result = DAG.getNode(RevRot, DL, VT, Op0, Sub);
+    return true;
+  }
+
+  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) ||
+                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
+    return false;
+
+  // Otherwise,
+  //   (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1)))
+  //   (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1)))
+  //
+  assert(isPowerOf2_32(EltSizeInBits) && EltSizeInBits > 1 &&
+         "Expecting the type bitwidth to be a power of 2");
+  unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL;
+  unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL;
+  SDValue BitWidthMinusOneC = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
+  SDValue NegOp1 = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, Op1);
+  SDValue And0 = DAG.getNode(ISD::AND, DL, ShVT, Op1, BitWidthMinusOneC);
+  SDValue And1 = DAG.getNode(ISD::AND, DL, ShVT, NegOp1, BitWidthMinusOneC);
+  Result = DAG.getNode(ISD::OR, DL, VT, DAG.getNode(ShOpc, DL, VT, Op0, And0),
+                       DAG.getNode(HsOpc, DL, VT, Op0, And1));
+  return true;
+}
+
 bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
                                SelectionDAG &DAG) const {
   SDValue Src = Node->getOperand(0);
@@ -4153,24 +4422,51 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
                            !isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT)))
     return false;
 
-  // Expand based on maximum range of FP_TO_SINT:
-  // True = fp_to_sint(Src)
-  // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
-  // Result = select (Src < 0x8000000000000000), True, False
-  APFloat apf(DAG.EVTToAPFloatSemantics(SrcVT),
-              APInt::getNullValue(SrcVT.getScalarSizeInBits()));
-  APInt x = APInt::getSignMask(DstVT.getScalarSizeInBits());
-  (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
-
-  SDValue Tmp1 = DAG.getConstantFP(apf, dl, SrcVT);
-  SDValue Tmp2 = DAG.getSetCC(dl, SetCCVT, Src, Tmp1, ISD::SETLT);
-  SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
-  // TODO: Should any fast-math-flags be set for the FSUB?
-  SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
-                              DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Tmp1));
-  False =
-      DAG.getNode(ISD::XOR, dl, DstVT, False, DAG.getConstant(x, dl, DstVT));
-  Result = DAG.getSelect(dl, DstVT, Tmp2, True, False);
+  // If the maximum float value is smaller then the signed integer range,
+  // the destination signmask can't be represented by the float, so we can
+  // just use FP_TO_SINT directly.
+  const fltSemantics &APFSem = DAG.EVTToAPFloatSemantics(SrcVT);
+  APFloat APF(APFSem, APInt::getNullValue(SrcVT.getScalarSizeInBits()));
+  APInt SignMask = APInt::getSignMask(DstVT.getScalarSizeInBits());
+  if (APFloat::opOverflow &
+      APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven)) {
+    Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
+    return true;
+  }
+
+  SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
+  SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT);
+
+  bool Strict = shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false);
+  if (Strict) {
+    // Expand based on maximum range of FP_TO_SINT, if the value exceeds the
+    // signmask then offset (the result of which should be fully representable).
+    // Sel = Src < 0x8000000000000000
+    // Val = select Sel, Src, Src - 0x8000000000000000
+    // Ofs = select Sel, 0, 0x8000000000000000
+    // Result = fp_to_sint(Val) ^ Ofs
+
+    // TODO: Should any fast-math-flags be set for the FSUB?
+    SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src,
+                                DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
+    SDValue Ofs = DAG.getSelect(dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT),
+                                DAG.getConstant(SignMask, dl, DstVT));
+    Result = DAG.getNode(ISD::XOR, dl, DstVT,
+                         DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val), Ofs);
+  } else {
+    // Expand based on maximum range of FP_TO_SINT:
+    // True = fp_to_sint(Src)
+    // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
+    // Result = select (Src < 0x8000000000000000), True, False
+
+    SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
+    // TODO: Should any fast-math-flags be set for the FSUB?
+    SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
+                                DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
+    False = DAG.getNode(ISD::XOR, dl, DstVT, False,
+                        DAG.getConstant(SignMask, dl, DstVT));
+    Result = DAG.getSelect(dl, DstVT, Sel, True, False);
+  }
   return true;
 }
 
@@ -5062,3 +5358,55 @@ SDValue TargetLowering::getExpandedSaturationAdditionSubtraction(
     return DAG.getSelect(dl, ResultType, Overflow, Result, SumDiff);
   }
 }
+
+SDValue
+TargetLowering::getExpandedFixedPointMultiplication(SDNode *Node,
+                                                    SelectionDAG &DAG) const {
+  assert(Node->getOpcode() == ISD::SMULFIX && "Expected opcode to be SMULFIX.");
+  assert(Node->getNumOperands() == 3 &&
+         "Expected signed fixed point multiplication to have 3 operands.");
+
+  SDLoc dl(Node);
+  SDValue LHS = Node->getOperand(0);
+  SDValue RHS = Node->getOperand(1);
+  assert(LHS.getValueType().isScalarInteger() &&
+         "Expected operands to be integers. Vector of int arguments should "
+         "already be unrolled.");
+  assert(RHS.getValueType().isScalarInteger() &&
+         "Expected operands to be integers. Vector of int arguments should "
+         "already be unrolled.");
+  assert(LHS.getValueType() == RHS.getValueType() &&
+         "Expected both operands to be the same type");
+
+  unsigned Scale = Node->getConstantOperandVal(2);
+  EVT VT = LHS.getValueType();
+  assert(Scale < VT.getScalarSizeInBits() &&
+         "Expected scale to be less than the number of bits.");
+
+  if (!Scale)
+    return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+
+  // Get the upper and lower bits of the result.
+  SDValue Lo, Hi;
+  if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) {
+    SDValue Result =
+        DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), LHS, RHS);
+    Lo = Result.getValue(0);
+    Hi = Result.getValue(1);
+  } else if (isOperationLegalOrCustom(ISD::MULHS, VT)) {
+    Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+    Hi = DAG.getNode(ISD::MULHS, dl, VT, LHS, RHS);
+  } else {
+    report_fatal_error("Unable to expand signed fixed point multiplication.");
+  }
+
+  // The result will need to be shifted right by the scale since both operands
+  // are scaled. The result is given to us in 2 halves, so we only want part of
+  // both in the result.
+  EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout());
+  Lo = DAG.getNode(ISD::SRL, dl, VT, Lo, DAG.getConstant(Scale, dl, ShiftTy));
+  Hi = DAG.getNode(
+      ISD::SHL, dl, VT, Hi,
+      DAG.getConstant(VT.getScalarSizeInBits() - Scale, dl, ShiftTy));
+  return DAG.getNode(ISD::OR, dl, VT, Lo, Hi);
+}
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index cb12c7ce6e826f3a9735ea7773fde7d45f37cb87..dcf37ca76b204bf51c285194f470328e8244f4d0 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -199,6 +199,18 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
   return false;
 }
 
+/// Search for the first call to the llvm.stackprotector intrinsic and return it
+/// if present.
+static const CallInst *findStackProtectorIntrinsic(Function &F) {
+  for (const BasicBlock &BB : F)
+    for (const Instruction &I : BB)
+      if (const CallInst *CI = dyn_cast<CallInst>(&I))
+        if (CI->getCalledFunction() ==
+            Intrinsic::getDeclaration(F.getParent(), Intrinsic::stackprotector))
+          return CI;
+  return nullptr;
+}
+
 /// Check whether or not this function needs a stack protector based
 /// upon the stack protector level.
 ///
@@ -215,13 +227,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
 bool StackProtector::RequiresStackProtector() {
   bool Strong = false;
   bool NeedsProtector = false;
-  for (const BasicBlock &BB : *F)
-    for (const Instruction &I : BB)
-      if (const CallInst *CI = dyn_cast<CallInst>(&I))
-        if (CI->getCalledFunction() ==
-            Intrinsic::getDeclaration(F->getParent(),
-                                      Intrinsic::stackprotector))
-          HasPrologue = true;
+  HasPrologue = findStackProtectorIntrinsic(*F);
 
   if (F->hasFnAttribute(Attribute::SafeStack))
     return false;
@@ -379,7 +385,8 @@ bool StackProtector::InsertStackProtectors() {
   // protection in SDAG.
   bool SupportsSelectionDAGSP =
       TLI->useStackGuardXorFP() ||
-      (EnableSelectionDAGSP && !TM->Options.EnableFastISel);
+      (EnableSelectionDAGSP && !TM->Options.EnableFastISel &&
+       !TM->Options.EnableGlobalISel);
   AllocaInst *AI = nullptr;       // Place on stack that stores the stack guard.
 
   for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
@@ -399,6 +406,14 @@ bool StackProtector::InsertStackProtectors() {
     if (SupportsSelectionDAGSP)
       break;
 
+    // Find the stack guard slot if the prologue was not created by this pass
+    // itself via a previous call to CreatePrologue().
+    if (!AI) {
+      const CallInst *SPCall = findStackProtectorIntrinsic(*F);
+      assert(SPCall && "Call to llvm.stackprotector is missing");
+      AI = cast<AllocaInst>(SPCall->getArgOperand(1));
+    }
+
     // Set HasIRCheck to true, so that SelectionDAG will not generate its own
     // version. SelectionDAG called 'shouldEmitSDCheck' to check whether
     // instrumentation has already been generated.
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 166ff18e7759efc3ff3df0397b83d3d9431aca41..e8619037564245319d06ff5f430490875ee83c7c 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -610,10 +610,13 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::UMIN, VT, Expand);
     setOperationAction(ISD::UMAX, VT, Expand);
     setOperationAction(ISD::ABS, VT, Expand);
+    setOperationAction(ISD::FSHL, VT, Expand);
+    setOperationAction(ISD::FSHR, VT, Expand);
     setOperationAction(ISD::SADDSAT, VT, Expand);
     setOperationAction(ISD::UADDSAT, VT, Expand);
     setOperationAction(ISD::SSUBSAT, VT, Expand);
     setOperationAction(ISD::USUBSAT, VT, Expand);
+    setOperationAction(ISD::SMULFIX, VT, Expand);
 
     // Overflow operations default to expand
     setOperationAction(ISD::SADDO, VT, Expand);
@@ -1451,6 +1454,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
   case CatchPad:       return 0;
   case CatchSwitch:    return 0;
   case CleanupPad:     return 0;
+  case FNeg:           return ISD::FNEG;
   case Add:            return ISD::ADD;
   case FAdd:           return ISD::FADD;
   case Sub:            return ISD::SUB;
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 341ab927861039d114dad9516291094003b2b287..cb2fe691d7021a7ec07907eb6168cdefec40fee4 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -795,6 +795,14 @@ const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference(
       MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext());
 }
 
+MCSection *TargetLoweringObjectFileELF::getSectionForCommandLines() const {
+  // Use ".GCC.command.line" since this feature is to support clang's
+  // -frecord-gcc-switches which in turn attempts to mimic GCC's switch of the
+  // same name.
+  return getContext().getELFSection(".GCC.command.line", ELF::SHT_PROGBITS,
+                                    ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
+}
+
 void
 TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) {
   UseInitArray = UseInitArray_;
@@ -1100,6 +1108,22 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel(
   //       .indirect_symbol        _extfoo
   //       .long   0
   //
+  // The indirect symbol table (and sections of non_lazy_symbol_pointers type)
+  // may point to both local (same translation unit) and global (other
+  // translation units) symbols. Example:
+  //
+  // .section __DATA,__pointers,non_lazy_symbol_pointers
+  // L1:
+  //    .indirect_symbol _myGlobal
+  //    .long 0
+  // L2:
+  //    .indirect_symbol _myLocal
+  //    .long _myLocal
+  //
+  // If the symbol is local, instead of the symbol's index, the assembler
+  // places the constant INDIRECT_SYMBOL_LOCAL into the indirect symbol table.
+  // Then the linker will notice the constant in the table and will look at the
+  // content of the symbol.
   MachineModuleInfoMachO &MachOMMI =
     MMI->getObjFileInfo<MachineModuleInfoMachO>();
   MCContext &Ctx = getContext();
@@ -1119,9 +1143,12 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel(
   MCSymbol *Stub = Ctx.getOrCreateSymbol(Name);
 
   MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(Stub);
-  if (!StubSym.getPointer())
-    StubSym = MachineModuleInfoImpl::
-      StubValueTy(const_cast<MCSymbol *>(Sym), true /* access indirectly */);
+  if (!StubSym.getPointer()) {
+    bool IsIndirectLocal = Sym->isDefined() && !Sym->isExternal();
+    // With the assumption that IsIndirectLocal == GV->hasLocalLinkage().
+    StubSym = MachineModuleInfoImpl::StubValueTy(const_cast<MCSymbol *>(Sym),
+                                                 !IsIndirectLocal);
+  }
 
   const MCExpr *BSymExpr =
     MCSymbolRefExpr::create(BaseSym, MCSymbolRefExpr::VK_None, Ctx);
@@ -1317,10 +1344,11 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal(
       MCSymbol *Sym = TM.getSymbol(ComdatGV);
       StringRef COMDATSymName = Sym->getName();
 
-      // Append "$symbol" to the section name when targetting mingw. The ld.bfd
+      // Append "$symbol" to the section name *before* IR-level mangling is
+      // applied when targetting mingw. This is what GCC does, and the ld.bfd
       // COFF linker will not properly handle comdats otherwise.
       if (getTargetTriple().isWindowsGNUEnvironment())
-        raw_svector_ostream(Name) << '$' << COMDATSymName;
+        raw_svector_ostream(Name) << '$' << ComdatGV->getName();
 
       return getContext().getCOFFSection(Name, Characteristics, Kind,
                                          COMDATSymName, Selection, UniqueID);
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 9adacd2ed718a7170f7e9b0d822b6fe73732a682..defb165fe0f29adb628e8cae798a197ec16dc8a7 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -137,13 +137,15 @@ static cl::opt<std::string> PrintMachineInstrs(
     "print-machineinstrs", cl::ValueOptional, cl::desc("Print machine instrs"),
     cl::value_desc("pass-name"), cl::init("option-unspecified"), cl::Hidden);
 
-static cl::opt<int> EnableGlobalISelAbort(
+static cl::opt<GlobalISelAbortMode> EnableGlobalISelAbort(
     "global-isel-abort", cl::Hidden,
     cl::desc("Enable abort calls when \"global\" instruction selection "
-             "fails to lower/select an instruction: 0 disable the abort, "
-             "1 enable the abort, and "
-             "2 disable the abort but emit a diagnostic on failure"),
-    cl::init(1));
+             "fails to lower/select an instruction"),
+    cl::values(
+        clEnumValN(GlobalISelAbortMode::Disable, "0", "Disable the abort"),
+        clEnumValN(GlobalISelAbortMode::Enable, "1", "Enable the abort"),
+        clEnumValN(GlobalISelAbortMode::DisableWithDiag, "2",
+                   "Disable the abort but emit a diagnostic on failure")));
 
 // Temporary option to allow experimenting with MachineScheduler as a post-RA
 // scheduler. Targets can "properly" enable this with
@@ -343,11 +345,39 @@ static AnalysisID getPassIDFromName(StringRef PassName) {
   return PI ? PI->getTypeInfo() : nullptr;
 }
 
+static std::pair<StringRef, unsigned>
+getPassNameAndInstanceNum(StringRef PassName) {
+  StringRef Name, InstanceNumStr;
+  std::tie(Name, InstanceNumStr) = PassName.split(',');
+
+  unsigned InstanceNum = 0;
+  if (!InstanceNumStr.empty() && InstanceNumStr.getAsInteger(10, InstanceNum))
+    report_fatal_error("invalid pass instance specifier " + PassName);
+
+  return std::make_pair(Name, InstanceNum);
+}
+
 void TargetPassConfig::setStartStopPasses() {
-  StartBefore = getPassIDFromName(StartBeforeOpt);
-  StartAfter = getPassIDFromName(StartAfterOpt);
-  StopBefore = getPassIDFromName(StopBeforeOpt);
-  StopAfter = getPassIDFromName(StopAfterOpt);
+  StringRef StartBeforeName;
+  std::tie(StartBeforeName, StartBeforeInstanceNum) =
+    getPassNameAndInstanceNum(StartBeforeOpt);
+
+  StringRef StartAfterName;
+  std::tie(StartAfterName, StartAfterInstanceNum) =
+    getPassNameAndInstanceNum(StartAfterOpt);
+
+  StringRef StopBeforeName;
+  std::tie(StopBeforeName, StopBeforeInstanceNum)
+    = getPassNameAndInstanceNum(StopBeforeOpt);
+
+  StringRef StopAfterName;
+  std::tie(StopAfterName, StopAfterInstanceNum)
+    = getPassNameAndInstanceNum(StopAfterOpt);
+
+  StartBefore = getPassIDFromName(StartBeforeName);
+  StartAfter = getPassIDFromName(StartAfterName);
+  StopBefore = getPassIDFromName(StopBeforeName);
+  StopAfter = getPassIDFromName(StopAfterName);
   if (StartBefore && StartAfter)
     report_fatal_error(Twine(StartBeforeOptName) + Twine(" and ") +
                        Twine(StartAfterOptName) + Twine(" specified!"));
@@ -384,6 +414,9 @@ TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm)
   if (TM.Options.EnableIPRA)
     setRequiresCodeGenSCCOrder();
 
+  if (EnableGlobalISelAbort.getNumOccurrences())
+    TM.Options.GlobalISelAbort = EnableGlobalISelAbort;
+
   setStartStopPasses();
 }
 
@@ -488,9 +521,9 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
   // and shouldn't reference it.
   AnalysisID PassID = P->getPassID();
 
-  if (StartBefore == PassID)
+  if (StartBefore == PassID && StartBeforeCount++ == StartBeforeInstanceNum)
     Started = true;
-  if (StopBefore == PassID)
+  if (StopBefore == PassID && StopBeforeCount++ == StopBeforeInstanceNum)
     Stopped = true;
   if (Started && !Stopped) {
     std::string Banner;
@@ -513,9 +546,11 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
   } else {
     delete P;
   }
-  if (StopAfter == PassID)
+
+  if (StopAfter == PassID && StopAfterCount++ == StopAfterInstanceNum)
     Stopped = true;
-  if (StartAfter == PassID)
+
+  if (StartAfter == PassID && StartAfterCount++ == StartAfterInstanceNum)
     Started = true;
   if (Stopped && !Started)
     report_fatal_error("Cannot stop compilation after pass that is not run");
@@ -721,8 +756,11 @@ bool TargetPassConfig::addCoreISelPasses() {
   // Enable FastISel with -fast-isel, but allow that to be overridden.
   TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
   if (EnableFastISelOption == cl::BOU_TRUE ||
-      (TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel()))
+      (TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel() &&
+       !TM->Options.EnableGlobalISel)) {
     TM->setFastISel(true);
+    TM->setGlobalISel(false);
+  }
 
   // Ask the target for an instruction selector.
   // Explicitly enabling fast-isel should override implicitly enabled
@@ -730,6 +768,7 @@ bool TargetPassConfig::addCoreISelPasses() {
   if (EnableGlobalISelOption == cl::BOU_TRUE ||
       (EnableGlobalISelOption == cl::BOU_UNSET &&
        TM->Options.EnableGlobalISel && EnableFastISelOption != cl::BOU_TRUE)) {
+    TM->setGlobalISel(true);
     TM->setFastISel(false);
 
     SaveAndRestore<bool> SavedAddingMachinePasses(AddingMachinePasses, true);
@@ -990,7 +1029,8 @@ bool TargetPassConfig::getOptimizeRegAlloc() const {
 }
 
 /// RegisterRegAlloc's global Registry tracks allocator registration.
-MachinePassRegistry RegisterRegAlloc::Registry;
+MachinePassRegistry<RegisterRegAlloc::FunctionPassCtor>
+    RegisterRegAlloc::Registry;
 
 /// A dummy default pass factory indicates whether the register allocator is
 /// overridden on the command line.
@@ -1164,14 +1204,9 @@ void TargetPassConfig::addBlockPlacement() {
 /// GlobalISel Configuration
 //===---------------------------------------------------------------------===//
 bool TargetPassConfig::isGlobalISelAbortEnabled() const {
-  if (EnableGlobalISelAbort.getNumOccurrences() > 0)
-    return EnableGlobalISelAbort == 1;
-
-  // When no abort behaviour is specified, we don't abort if the target says
-  // that GISel is enabled.
-  return !TM->Options.EnableGlobalISel;
+  return TM->Options.GlobalISelAbort == GlobalISelAbortMode::Enable;
 }
 
 bool TargetPassConfig::reportDiagnosticWhenGlobalISelFallback() const {
-  return EnableGlobalISelAbort == 2;
+  return TM->Options.GlobalISelAbort == GlobalISelAbortMode::DisableWithDiag;
 }
diff --git a/lib/CodeGen/WasmEHPrepare.cpp b/lib/CodeGen/WasmEHPrepare.cpp
index 6f02a05f56157e33a7932e16a20b97ec1ac47158..e5002eb95346dbe6228ab2498bfd68e4cb6080a2 100644
--- a/lib/CodeGen/WasmEHPrepare.cpp
+++ b/lib/CodeGen/WasmEHPrepare.cpp
@@ -137,6 +137,7 @@ class WasmEHPrepare : public FunctionPass {
   Value *LSDAField = nullptr;      // lsda field
   Value *SelectorField = nullptr;  // selector
 
+  Function *ThrowF = nullptr;           // wasm.throw() intrinsic
   Function *CatchF = nullptr;           // wasm.catch.extract() intrinsic
   Function *LPadIndexF = nullptr;       // wasm.landingpad.index() intrinsic
   Function *LSDAF = nullptr;            // wasm.lsda() intrinsic
@@ -145,6 +146,9 @@ class WasmEHPrepare : public FunctionPass {
   Function *CallPersonalityF = nullptr; // _Unwind_CallPersonality() wrapper
   Function *ClangCallTermF = nullptr;   // __clang_call_terminate() function
 
+  bool prepareEHPads(Function &F);
+  bool prepareThrows(Function &F);
+
   void prepareEHPad(BasicBlock *BB, unsigned Index);
   void prepareTerminateCleanupPad(BasicBlock *BB);
 
@@ -177,7 +181,62 @@ bool WasmEHPrepare::doInitialization(Module &M) {
   return false;
 }
 
+// Erase the specified BBs if the BB does not have any remaining predecessors,
+// and also all its dead children.
+template <typename Container>
+static void eraseDeadBBsAndChildren(const Container &BBs) {
+  SmallVector<BasicBlock *, 8> WL(BBs.begin(), BBs.end());
+  while (!WL.empty()) {
+    auto *BB = WL.pop_back_val();
+    if (pred_begin(BB) != pred_end(BB))
+      continue;
+    WL.append(succ_begin(BB), succ_end(BB));
+    DeleteDeadBlock(BB);
+  }
+}
+
 bool WasmEHPrepare::runOnFunction(Function &F) {
+  bool Changed = false;
+  Changed |= prepareThrows(F);
+  Changed |= prepareEHPads(F);
+  return Changed;
+}
+
+bool WasmEHPrepare::prepareThrows(Function &F) {
+  Module &M = *F.getParent();
+  IRBuilder<> IRB(F.getContext());
+  bool Changed = false;
+
+  // wasm.throw() intinsic, which will be lowered to wasm 'throw' instruction.
+  ThrowF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_throw);
+
+  // Insert an unreachable instruction after a call to @llvm.wasm.throw and
+  // delete all following instructions within the BB, and delete all the dead
+  // children of the BB as well.
+  for (User *U : ThrowF->users()) {
+    // A call to @llvm.wasm.throw() is only generated from
+    // __builtin_wasm_throw() builtin call within libcxxabi, and cannot be an
+    // InvokeInst.
+    auto *ThrowI = cast<CallInst>(U);
+    if (ThrowI->getFunction() != &F)
+      continue;
+    Changed = true;
+    auto *BB = ThrowI->getParent();
+    SmallVector<BasicBlock *, 4> Succs(succ_begin(BB), succ_end(BB));
+    auto &InstList = BB->getInstList();
+    InstList.erase(std::next(BasicBlock::iterator(ThrowI)), InstList.end());
+    IRB.SetInsertPoint(BB);
+    IRB.CreateUnreachable();
+    eraseDeadBBsAndChildren(Succs);
+  }
+
+  return Changed;
+}
+
+bool WasmEHPrepare::prepareEHPads(Function &F) {
+  Module &M = *F.getParent();
+  IRBuilder<> IRB(F.getContext());
+
   SmallVector<BasicBlock *, 16> CatchPads;
   SmallVector<BasicBlock *, 16> CleanupPads;
   for (BasicBlock &BB : F) {
@@ -194,9 +253,6 @@ bool WasmEHPrepare::runOnFunction(Function &F) {
     return false;
   assert(F.hasPersonalityFn() && "Personality function not found");
 
-  Module &M = *F.getParent();
-  IRBuilder<> IRB(F.getContext());
-
   // __wasm_lpad_context global variable
   LPadContextGV = cast<GlobalVariable>(
       M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy));
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index e1bdffd30ee799f4ff3e72b6f36aa986627130a8..1610ca469575307549c6dfb3fe1a415d2590a356 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -27,8 +27,9 @@ add_llvm_library(LLVMDebugInfoCodeView
   RecordSerialization.cpp
   SimpleTypeSerializer.cpp
   StringsAndChecksums.cpp
-  SymbolRecordMapping.cpp
   SymbolDumper.cpp
+  SymbolRecordHelpers.cpp
+  SymbolRecordMapping.cpp
   SymbolSerializer.cpp
   TypeDumpVisitor.cpp
   TypeIndex.cpp
diff --git a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
index 44a67743169ec9c8eb4adb03d18cd5374f2ac3e9..cbcaa569282818a8e4b3bc5617792b53241d932d 100644
--- a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
@@ -75,7 +75,7 @@ Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols) {
 Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols,
                                          uint32_t InitialOffset) {
   for (auto I : Symbols) {
-    if (auto EC = visitSymbolRecord(I, InitialOffset))
+    if (auto EC = visitSymbolRecord(I, InitialOffset + Symbols.skew()))
       return EC;
     InitialOffset += I.length();
   }
diff --git a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
index ca8007411cadcd9061107eb338c1a709b43e9fa3..ddcad8c631d7484c1978824d455d8b9bd8f2618c 100644
--- a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
+++ b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
@@ -89,6 +89,8 @@ uint32_t LazyRandomTypeCollection::getOffsetOfType(TypeIndex Index) {
 }
 
 CVType LazyRandomTypeCollection::getType(TypeIndex Index) {
+  assert(!Index.isSimple());
+
   auto EC = ensureTypeExists(Index);
   error(std::move(EC));
   assert(contains(Index));
@@ -97,6 +99,9 @@ CVType LazyRandomTypeCollection::getType(TypeIndex Index) {
 }
 
 Optional<CVType> LazyRandomTypeCollection::tryGetType(TypeIndex Index) {
+  if (Index.isSimple())
+    return None;
+
   if (auto EC = ensureTypeExists(Index)) {
     consumeError(std::move(EC));
     return None;
@@ -151,6 +156,7 @@ Error LazyRandomTypeCollection::ensureTypeExists(TypeIndex TI) {
 }
 
 void LazyRandomTypeCollection::ensureCapacityFor(TypeIndex Index) {
+  assert(!Index.isSimple());
   uint32_t MinSize = Index.toArrayIndex() + 1;
 
   if (MinSize <= capacity())
@@ -163,6 +169,7 @@ void LazyRandomTypeCollection::ensureCapacityFor(TypeIndex Index) {
 }
 
 Error LazyRandomTypeCollection::visitRangeForType(TypeIndex TI) {
+  assert(!TI.isSimple());
   if (PartialOffsets.empty())
     return fullScanForType(TI);
 
@@ -217,6 +224,7 @@ Optional<TypeIndex> LazyRandomTypeCollection::getNext(TypeIndex Prev) {
 }
 
 Error LazyRandomTypeCollection::fullScanForType(TypeIndex TI) {
+  assert(!TI.isSimple());
   assert(PartialOffsets.empty());
 
   TypeIndex CurrentTI = TypeIndex::fromArrayIndex(0);
diff --git a/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp b/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..01746138ad1f2df923ff1b1e721f0b140b3352c2
--- /dev/null
+++ b/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp
@@ -0,0 +1,94 @@
+//===- SymbolRecordHelpers.cpp ----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/SymbolRecordHelpers.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+template <typename RecordT> RecordT createRecord(const CVSymbol &sym) {
+  RecordT record(static_cast<SymbolRecordKind>(sym.kind()));
+  cantFail(SymbolDeserializer::deserializeAs<RecordT>(sym, record));
+  return record;
+}
+
+uint32_t llvm::codeview::getScopeEndOffset(const CVSymbol &Sym) {
+  assert(symbolOpensScope(Sym.kind()));
+  switch (Sym.kind()) {
+  case SymbolKind::S_GPROC32:
+  case SymbolKind::S_LPROC32:
+  case SymbolKind::S_GPROC32_ID:
+  case SymbolKind::S_LPROC32_ID:
+  case SymbolKind::S_LPROC32_DPC:
+  case SymbolKind::S_LPROC32_DPC_ID: {
+    ProcSym Proc = createRecord<ProcSym>(Sym);
+    return Proc.End;
+  }
+  case SymbolKind::S_BLOCK32: {
+    BlockSym Block = createRecord<BlockSym>(Sym);
+    return Block.End;
+  }
+  case SymbolKind::S_THUNK32: {
+    Thunk32Sym Thunk = createRecord<Thunk32Sym>(Sym);
+    return Thunk.End;
+  }
+  case SymbolKind::S_INLINESITE: {
+    InlineSiteSym Site = createRecord<InlineSiteSym>(Sym);
+    return Site.End;
+  }
+  default:
+    assert(false && "Unknown record type");
+    return 0;
+  }
+}
+
+uint32_t
+llvm::codeview::getScopeParentOffset(const llvm::codeview::CVSymbol &Sym) {
+  assert(symbolOpensScope(Sym.kind()));
+  switch (Sym.kind()) {
+  case SymbolKind::S_GPROC32:
+  case SymbolKind::S_LPROC32:
+  case SymbolKind::S_GPROC32_ID:
+  case SymbolKind::S_LPROC32_ID:
+  case SymbolKind::S_LPROC32_DPC:
+  case SymbolKind::S_LPROC32_DPC_ID: {
+    ProcSym Proc = createRecord<ProcSym>(Sym);
+    return Proc.Parent;
+  }
+  case SymbolKind::S_BLOCK32: {
+    BlockSym Block = createRecord<BlockSym>(Sym);
+    return Block.Parent;
+  }
+  case SymbolKind::S_THUNK32: {
+    Thunk32Sym Thunk = createRecord<Thunk32Sym>(Sym);
+    return Thunk.Parent;
+  }
+  case SymbolKind::S_INLINESITE: {
+    InlineSiteSym Site = createRecord<InlineSiteSym>(Sym);
+    return Site.Parent;
+  }
+  default:
+    assert(false && "Unknown record type");
+    return 0;
+  }
+}
+
+CVSymbolArray
+llvm::codeview::limitSymbolArrayToScope(const CVSymbolArray &Symbols,
+                                        uint32_t ScopeBegin) {
+  CVSymbol Opener = *Symbols.at(ScopeBegin);
+  assert(symbolOpensScope(Opener.kind()));
+  uint32_t EndOffset = getScopeEndOffset(Opener);
+  CVSymbol Closer = *Symbols.at(EndOffset);
+  EndOffset += Closer.RecordData.size();
+  return Symbols.substream(ScopeBegin, EndOffset);
+}
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 7c68c9167c9820825ccb5ad05310b59fc7ff540c..f5d3bea43a14799e1ddc3c15e53914dd956667a1 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -361,7 +361,6 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) {
 
 Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
   printTypeIndex("PointeeType", Ptr.getReferentType());
-  W->printHex("PointerAttributes", uint32_t(Ptr.getOptions()));
   W->printEnum("PtrType", unsigned(Ptr.getPointerKind()),
                makeArrayRef(PtrKindNames));
   W->printEnum("PtrMode", unsigned(Ptr.getMode()), makeArrayRef(PtrModeNames));
@@ -371,6 +370,8 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
   W->printNumber("IsVolatile", Ptr.isVolatile());
   W->printNumber("IsUnaligned", Ptr.isUnaligned());
   W->printNumber("IsRestrict", Ptr.isRestrict());
+  W->printNumber("IsThisPtr&", Ptr.isLValueReferenceThisPtr());
+  W->printNumber("IsThisPtr&&", Ptr.isRValueReferenceThisPtr());
   W->printNumber("SizeOf", Ptr.getSize());
 
   if (Ptr.isPointerToMember()) {
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 7ab54de6bc49e3ad604dc2d5e575381acf1ba403..e6620ee3dd1d16a342b64ba0b2e525fdb6d3a0a4 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -317,7 +317,6 @@ void DWARFContext::dump(
     raw_ostream &OS, DIDumpOptions DumpOpts,
     std::array<Optional<uint64_t>, DIDT_ID_Count> DumpOffsets) {
 
-  Optional<uint64_t> DumpOffset;
   uint64_t DumpType = DumpOpts.DumpType;
 
   StringRef Extension = sys::path::extension(DObj->getFileName());
@@ -334,13 +333,13 @@ void DWARFContext::dump(
   bool Explicit = DumpType != DIDT_All && !IsDWO;
   bool ExplicitDWO = Explicit && IsDWO;
   auto shouldDump = [&](bool Explicit, const char *Name, unsigned ID,
-                        StringRef Section) {
-    DumpOffset = DumpOffsets[ID];
+                        StringRef Section) -> Optional<uint64_t> * {
     unsigned Mask = 1U << ID;
     bool Should = (DumpType & Mask) && (Explicit || !Section.empty());
-    if (Should)
-      OS << "\n" << Name << " contents:\n";
-    return Should;
+    if (!Should)
+      return nullptr;
+    OS << "\n" << Name << " contents:\n";
+    return &DumpOffsets[ID];
   };
 
   // Dump individual sections.
@@ -353,7 +352,7 @@ void DWARFContext::dump(
 
   auto dumpDebugInfo = [&](const char *Name, unit_iterator_range Units) {
     OS << '\n' << Name << " contents:\n";
-    if ((DumpOffset = DumpOffsets[DIDT_ID_DebugInfo]))
+    if (auto DumpOffset = DumpOffsets[DIDT_ID_DebugInfo])
       for (const auto &U : Units)
         U->getDIEForOffset(DumpOffset.getValue())
             .dump(OS, 0, DumpOpts.noImplicitRecursion());
@@ -370,9 +369,8 @@ void DWARFContext::dump(
 
   auto dumpDebugType = [&](const char *Name, unit_iterator_range Units) {
     OS << '\n' << Name << " contents:\n";
-    DumpOffset = DumpOffsets[DIDT_ID_DebugTypes];
     for (const auto &U : Units)
-      if (DumpOffset)
+      if (auto DumpOffset = DumpOffsets[DIDT_ID_DebugTypes])
         U->getDIEForOffset(*DumpOffset)
             .dump(OS, 0, DumpOpts.noImplicitRecursion());
       else
@@ -385,28 +383,30 @@ void DWARFContext::dump(
       dumpDebugType(".debug_types.dwo", dwo_types_section_units());
   }
 
-  if (shouldDump(Explicit, ".debug_loc", DIDT_ID_DebugLoc,
-                 DObj->getLocSection().Data)) {
-    getDebugLoc()->dump(OS, getRegisterInfo(), DumpOffset);
+  if (const auto *Off = shouldDump(Explicit, ".debug_loc", DIDT_ID_DebugLoc,
+                                   DObj->getLocSection().Data)) {
+    getDebugLoc()->dump(OS, getRegisterInfo(), *Off);
   }
-  if (shouldDump(Explicit, ".debug_loclists", DIDT_ID_DebugLoclists,
-                 DObj->getLoclistsSection().Data)) {
+  if (const auto *Off =
+          shouldDump(Explicit, ".debug_loclists", DIDT_ID_DebugLoclists,
+                     DObj->getLoclistsSection().Data)) {
     DWARFDataExtractor Data(*DObj, DObj->getLoclistsSection(), isLittleEndian(),
                             0);
-    dumpLoclistsSection(OS, DumpOpts, Data, getRegisterInfo(), DumpOffset);
+    dumpLoclistsSection(OS, DumpOpts, Data, getRegisterInfo(), *Off);
   }
-  if (shouldDump(ExplicitDWO, ".debug_loc.dwo", DIDT_ID_DebugLoc,
-                 DObj->getLocDWOSection().Data)) {
-    getDebugLocDWO()->dump(OS, 0, getRegisterInfo(), DumpOffset);
+  if (const auto *Off =
+          shouldDump(ExplicitDWO, ".debug_loc.dwo", DIDT_ID_DebugLoc,
+                     DObj->getLocDWOSection().Data)) {
+    getDebugLocDWO()->dump(OS, 0, getRegisterInfo(), *Off);
   }
 
-  if (shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame,
-                 DObj->getDebugFrameSection()))
-    getDebugFrame()->dump(OS, getRegisterInfo(), DumpOffset);
+  if (const auto *Off = shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame,
+                                   DObj->getDebugFrameSection()))
+    getDebugFrame()->dump(OS, getRegisterInfo(), *Off);
 
-  if (shouldDump(Explicit, ".eh_frame", DIDT_ID_DebugFrame,
-                 DObj->getEHFrameSection()))
-    getEHFrame()->dump(OS, getRegisterInfo(), DumpOffset);
+  if (const auto *Off = shouldDump(Explicit, ".eh_frame", DIDT_ID_DebugFrame,
+                                   DObj->getEHFrameSection()))
+    getEHFrame()->dump(OS, getRegisterInfo(), *Off);
 
   if (DumpType & DIDT_DebugMacro) {
     if (Explicit || !getDebugMacro()->empty()) {
@@ -425,7 +425,8 @@ void DWARFContext::dump(
   }
 
   auto DumpLineSection = [&](DWARFDebugLine::SectionParser Parser,
-                             DIDumpOptions DumpOpts) {
+                             DIDumpOptions DumpOpts,
+                             Optional<uint64_t> DumpOffset) {
     while (!Parser.done()) {
       if (DumpOffset && Parser.getOffset() != *DumpOffset) {
         Parser.skip(dumpWarning);
@@ -442,22 +443,23 @@ void DWARFContext::dump(
     }
   };
 
-  if (shouldDump(Explicit, ".debug_line", DIDT_ID_DebugLine,
-                 DObj->getLineSection().Data)) {
+  if (const auto *Off = shouldDump(Explicit, ".debug_line", DIDT_ID_DebugLine,
+                                   DObj->getLineSection().Data)) {
     DWARFDataExtractor LineData(*DObj, DObj->getLineSection(), isLittleEndian(),
                                 0);
     DWARFDebugLine::SectionParser Parser(LineData, *this, compile_units(),
                                          type_units());
-    DumpLineSection(Parser, DumpOpts);
+    DumpLineSection(Parser, DumpOpts, *Off);
   }
 
-  if (shouldDump(ExplicitDWO, ".debug_line.dwo", DIDT_ID_DebugLine,
-                 DObj->getLineDWOSection().Data)) {
+  if (const auto *Off =
+          shouldDump(ExplicitDWO, ".debug_line.dwo", DIDT_ID_DebugLine,
+                     DObj->getLineDWOSection().Data)) {
     DWARFDataExtractor LineData(*DObj, DObj->getLineDWOSection(),
                                 isLittleEndian(), 0);
     DWARFDebugLine::SectionParser Parser(LineData, *this, dwo_compile_units(),
                                          dwo_type_units());
-    DumpLineSection(Parser, DumpOpts);
+    DumpLineSection(Parser, DumpOpts, *Off);
   }
 
   if (shouldDump(Explicit, ".debug_cu_index", DIDT_ID_DebugCUIndex,
@@ -549,24 +551,24 @@ void DWARFContext::dump(
   }
 
   if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames,
-                 DObj->getPubNamesSection()))
-    DWARFDebugPubTable(DObj->getPubNamesSection(), isLittleEndian(), false)
+                 DObj->getPubNamesSection().Data))
+    DWARFDebugPubTable(*DObj, DObj->getPubNamesSection(), isLittleEndian(), false)
         .dump(OS);
 
   if (shouldDump(Explicit, ".debug_pubtypes", DIDT_ID_DebugPubtypes,
-                 DObj->getPubTypesSection()))
-    DWARFDebugPubTable(DObj->getPubTypesSection(), isLittleEndian(), false)
+                 DObj->getPubTypesSection().Data))
+    DWARFDebugPubTable(*DObj, DObj->getPubTypesSection(), isLittleEndian(), false)
         .dump(OS);
 
   if (shouldDump(Explicit, ".debug_gnu_pubnames", DIDT_ID_DebugGnuPubnames,
-                 DObj->getGnuPubNamesSection()))
-    DWARFDebugPubTable(DObj->getGnuPubNamesSection(), isLittleEndian(),
+                 DObj->getGnuPubNamesSection().Data))
+    DWARFDebugPubTable(*DObj, DObj->getGnuPubNamesSection(), isLittleEndian(),
                        true /* GnuStyle */)
         .dump(OS);
 
   if (shouldDump(Explicit, ".debug_gnu_pubtypes", DIDT_ID_DebugGnuPubtypes,
-                 DObj->getGnuPubTypesSection()))
-    DWARFDebugPubTable(DObj->getGnuPubTypesSection(), isLittleEndian(),
+                 DObj->getGnuPubTypesSection().Data))
+    DWARFDebugPubTable(*DObj, DObj->getGnuPubTypesSection(), isLittleEndian(),
                        true /* GnuStyle */)
         .dump(OS);
 
@@ -765,7 +767,7 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() {
   // http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html
   DWARFDataExtractor debugFrameData(DObj->getDebugFrameSection(),
                                     isLittleEndian(), DObj->getAddressSize());
-  DebugFrame.reset(new DWARFDebugFrame(false /* IsEH */));
+  DebugFrame.reset(new DWARFDebugFrame(getArch(), false /* IsEH */));
   DebugFrame->parse(debugFrameData);
   return DebugFrame.get();
 }
@@ -776,7 +778,7 @@ const DWARFDebugFrame *DWARFContext::getEHFrame() {
 
   DWARFDataExtractor debugFrameData(DObj->getEHFrameSection(), isLittleEndian(),
                                     DObj->getAddressSize());
-  DebugFrame.reset(new DWARFDebugFrame(true /* IsEH */));
+  DebugFrame.reset(new DWARFDebugFrame(getArch(), true /* IsEH */));
   DebugFrame->parse(debugFrameData);
   return DebugFrame.get();
 }
@@ -1265,6 +1267,10 @@ class DWARFObjInMemory final : public DWARFObject {
   DWARFSectionMap AppleNamespacesSection;
   DWARFSectionMap AppleObjCSection;
   DWARFSectionMap DebugNamesSection;
+  DWARFSectionMap PubNamesSection;
+  DWARFSectionMap PubTypesSection;
+  DWARFSectionMap GnuPubNamesSection;
+  DWARFSectionMap GnuPubTypesSection;
 
   DWARFSectionMap *mapNameToDWARFSection(StringRef Name) {
     return StringSwitch<DWARFSectionMap *>(Name)
@@ -1281,6 +1287,10 @@ class DWARFObjInMemory final : public DWARFObject {
         .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
         .Case("debug_addr", &AddrSection)
         .Case("apple_names", &AppleNamesSection)
+        .Case("debug_pubnames", &PubNamesSection)
+        .Case("debug_pubtypes", &PubTypesSection)
+        .Case("debug_gnu_pubnames", &GnuPubNamesSection)
+        .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
         .Case("apple_types", &AppleTypesSection)
         .Case("apple_namespaces", &AppleNamespacesSection)
         .Case("apple_namespac", &AppleNamespacesSection)
@@ -1294,12 +1304,8 @@ class DWARFObjInMemory final : public DWARFObject {
   StringRef EHFrameSection;
   StringRef StringSection;
   StringRef MacinfoSection;
-  StringRef PubNamesSection;
-  StringRef PubTypesSection;
-  StringRef GnuPubNamesSection;
   StringRef AbbrevDWOSection;
   StringRef StringDWOSection;
-  StringRef GnuPubTypesSection;
   StringRef CUIndexSection;
   StringRef GdbIndexSection;
   StringRef TUIndexSection;
@@ -1319,10 +1325,6 @@ class DWARFObjInMemory final : public DWARFObject {
         .Case("eh_frame", &EHFrameSection)
         .Case("debug_str", &StringSection)
         .Case("debug_macinfo", &MacinfoSection)
-        .Case("debug_pubnames", &PubNamesSection)
-        .Case("debug_pubtypes", &PubTypesSection)
-        .Case("debug_gnu_pubnames", &GnuPubNamesSection)
-        .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
         .Case("debug_abbrev.dwo", &AbbrevDWOSection)
         .Case("debug_str.dwo", &StringDWOSection)
         .Case("debug_cu_index", &CUIndexSection)
@@ -1598,12 +1600,12 @@ public:
     return RnglistsSection;
   }
   StringRef getMacinfoSection() const override { return MacinfoSection; }
-  StringRef getPubNamesSection() const override { return PubNamesSection; }
-  StringRef getPubTypesSection() const override { return PubTypesSection; }
-  StringRef getGnuPubNamesSection() const override {
+  const DWARFSection &getPubNamesSection() const override { return PubNamesSection; }
+  const DWARFSection &getPubTypesSection() const override { return PubTypesSection; }
+  const DWARFSection &getGnuPubNamesSection() const override {
     return GnuPubNamesSection;
   }
-  StringRef getGnuPubTypesSection() const override {
+  const DWARFSection &getGnuPubTypesSection() const override {
     return GnuPubTypesSection;
   }
   const DWARFSection &getAppleNamesSection() const override {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp b/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
index 7085ca067ba621d17cbe7ff9e4d14a6c48741d70..22759bfac26c859d3b3b5f329f4685de95bb5dc7 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
@@ -148,7 +148,7 @@ void DWARFDebugAddrTable::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
                HeaderData.Length, HeaderData.Version, HeaderData.AddrSize,
                HeaderData.SegSize);
 
-  static const char *Fmt32 = "0x%8.8" PRIx32;
+  static const char *Fmt32 = "0x%8.8" PRIx64;
   static const char *Fmt64 = "0x%16.16" PRIx64;
   std::string AddrFmt = "\n";
   std::string AddrFmtVerbose = " => ";
diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index d725255fe4c53e9bc40e2dd0e06425860be94af5..6349a6efb8c21ee43398d63dfdc9a1f998b3d455 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -225,7 +225,7 @@ void CFIProgram::printOperand(raw_ostream &OS, const MCRegisterInfo *MRI,
   switch (Type) {
   case OT_Unset: {
     OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to";
-    auto OpcodeName = CallFrameString(Opcode);
+    auto OpcodeName = CallFrameString(Opcode, Arch);
     if (!OpcodeName.empty())
       OS << " " << OpcodeName;
     else
@@ -279,7 +279,7 @@ void CFIProgram::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH,
     if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK)
       Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK;
     OS.indent(2 * IndentLevel);
-    OS << CallFrameString(Opcode) << ":";
+    OS << CallFrameString(Opcode, Arch) << ":";
     for (unsigned i = 0; i < Instr.Ops.size(); ++i)
       printOperand(OS, MRI, IsEH, Instr, i, Instr.Ops[i]);
     OS << '\n';
@@ -325,8 +325,9 @@ void FDE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
   OS << "\n";
 }
 
-DWARFDebugFrame::DWARFDebugFrame(bool IsEH, uint64_t EHFrameAddress)
-    : IsEH(IsEH), EHFrameAddress(EHFrameAddress) {}
+DWARFDebugFrame::DWARFDebugFrame(Triple::ArchType Arch,
+    bool IsEH, uint64_t EHFrameAddress)
+    : Arch(Arch), IsEH(IsEH), EHFrameAddress(EHFrameAddress) {}
 
 DWARFDebugFrame::~DWARFDebugFrame() = default;
 
@@ -396,7 +397,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
       uint8_t SegmentDescriptorSize = Version < 4 ? 0 : Data.getU8(&Offset);
       uint64_t CodeAlignmentFactor = Data.getULEB128(&Offset);
       int64_t DataAlignmentFactor = Data.getSLEB128(&Offset);
-      uint64_t ReturnAddressRegister = Data.getULEB128(&Offset);
+      uint64_t ReturnAddressRegister =
+          Version == 1 ? Data.getU8(&Offset) : Data.getULEB128(&Offset);
 
       // Parse the augmentation data for EH CIEs
       StringRef AugmentationData("");
@@ -460,7 +462,7 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
           StartOffset, Length, Version, AugmentationString, AddressSize,
           SegmentDescriptorSize, CodeAlignmentFactor, DataAlignmentFactor,
           ReturnAddressRegister, AugmentationData, FDEPointerEncoding,
-          LSDAPointerEncoding, Personality, PersonalityEncoding);
+          LSDAPointerEncoding, Personality, PersonalityEncoding, Arch);
       CIEs[StartOffset] = Cie.get();
       Entries.emplace_back(std::move(Cie));
     } else {
@@ -512,7 +514,7 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
 
       Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer,
                                    InitialLocation, AddressRange,
-                                   Cie, LSDAAddress));
+                                   Cie, LSDAAddress, Arch));
     }
 
     if (Error E =
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index 9146b457a5ddae1bfdae098e337b3053b094fa93..f8b5ff6ec8fb60344cc9d07142fdb361b9587e32 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -124,7 +124,7 @@ DWARFDebugLoc::parseOneLocationList(DWARFDataExtractor Data, unsigned *Offset) {
     StringRef str = Data.getData().substr(*Offset, Bytes);
     *Offset += Bytes;
     E.Loc.reserve(str.size());
-    std::copy(str.begin(), str.end(), std::back_inserter(E.Loc));
+    llvm::copy(str, std::back_inserter(E.Loc));
     LL.Entries.push_back(std::move(E));
   }
 }
@@ -189,7 +189,7 @@ DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset,
       StringRef str = Data.getData().substr(*Offset, Bytes);
       *Offset += Bytes;
       E.Loc.resize(str.size());
-      std::copy(str.begin(), str.end(), E.Loc.begin());
+      llvm::copy(str, E.Loc.begin());
     }
 
     LL.Entries.push_back(std::move(E));
@@ -235,14 +235,14 @@ void DWARFDebugLoclists::LocationList::dump(raw_ostream &OS, uint64_t BaseAddr,
     case dwarf::DW_LLE_start_length:
       OS << '\n';
       OS.indent(Indent);
-      OS << format("[0x%*.*" PRIx64 ", 0x%*.*x): ", AddressSize * 2,
+      OS << format("[0x%*.*" PRIx64 ", 0x%*.*" PRIx64 "): ", AddressSize * 2,
                    AddressSize * 2, E.Value0, AddressSize * 2, AddressSize * 2,
                    E.Value0 + E.Value1);
       break;
     case dwarf::DW_LLE_offset_pair:
       OS << '\n';
       OS.indent(Indent);
-      OS << format("[0x%*.*" PRIx64 ", 0x%*.*x): ", AddressSize * 2,
+      OS << format("[0x%*.*" PRIx64 ", 0x%*.*" PRIx64 "): ", AddressSize * 2,
                    AddressSize * 2, BaseAddr + E.Value0, AddressSize * 2,
                    AddressSize * 2, BaseAddr + E.Value1);
       break;
diff --git a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
index de8b6e543fab20fa150226c819ca433e4f0daf80..abd1ad59a9c18d4d1e9f22ae7f38fe275279d12c 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Support/DataExtractor.h"
@@ -18,10 +19,11 @@
 using namespace llvm;
 using namespace dwarf;
 
-DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian,
-                                       bool GnuStyle)
+DWARFDebugPubTable::DWARFDebugPubTable(const DWARFObject &Obj,
+                                       const DWARFSection &Sec,
+                                       bool LittleEndian, bool GnuStyle)
     : GnuStyle(GnuStyle) {
-  DataExtractor PubNames(Data, LittleEndian, 0);
+  DWARFDataExtractor PubNames(Obj, Sec, LittleEndian, 0);
   uint32_t Offset = 0;
   while (PubNames.isValidOffset(Offset)) {
     Sets.push_back({});
@@ -29,10 +31,10 @@ DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian,
 
     SetData.Length = PubNames.getU32(&Offset);
     SetData.Version = PubNames.getU16(&Offset);
-    SetData.Offset = PubNames.getU32(&Offset);
+    SetData.Offset = PubNames.getRelocatedValue(4, &Offset);
     SetData.Size = PubNames.getU32(&Offset);
 
-    while (Offset < Data.size()) {
+    while (Offset < Sec.Data.size()) {
       uint32_t DieRef = PubNames.getU32(&Offset);
       if (DieRef == 0)
         break;
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
index cb5fb0d49dabaea6de5b58f15ec970a7ed47feae..60c6eb30857fa303e1b62e382f6868d39b7c394f 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
@@ -123,7 +123,7 @@ DWARFDebugRnglist::getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr,
     if (RLE.EntryKind == dwarf::DW_RLE_base_addressx) {
       BaseAddr = U.getAddrOffsetSectionItem(RLE.Value0);
       if (!BaseAddr)
-        BaseAddr = {RLE.Value0, 0};
+        BaseAddr = {RLE.Value0, -1ULL};
       continue;
     }
     if (RLE.EntryKind == dwarf::DW_RLE_base_address) {
@@ -156,7 +156,7 @@ DWARFDebugRnglist::getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr,
     case dwarf::DW_RLE_startx_length: {
       auto Start = U.getAddrOffsetSectionItem(RLE.Value0);
       if (!Start)
-        Start = {0, 0};
+        Start = {0, -1ULL};
       E.SectionIndex = Start->SectionIndex;
       E.LowPC = Start->Address;
       E.HighPC = E.LowPC + RLE.Value1;
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 31c4cd5e472a5efefae5ccf4b8918eb52cdbfafc..551e292fb0323d35ecd11296a01bbe837edf048b 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -155,9 +155,7 @@ static void dumpTypeTagName(raw_ostream &OS, dwarf::Tag T) {
 }
 
 /// Recursively dump the DIE type name when applicable.
-static void dumpTypeName(raw_ostream &OS, const DWARFDie &Die) {
-  DWARFDie D = Die.getAttributeValueAsReferencedDie(DW_AT_type);
-
+static void dumpTypeName(raw_ostream &OS, const DWARFDie &D) {
   if (!D.isValid())
     return;
 
@@ -175,22 +173,63 @@ static void dumpTypeName(raw_ostream &OS, const DWARFDie &Die) {
   case DW_TAG_ptr_to_member_type:
   case DW_TAG_reference_type:
   case DW_TAG_rvalue_reference_type:
+  case DW_TAG_subroutine_type:
     break;
   default:
     dumpTypeTagName(OS, T);
   }
 
   // Follow the DW_AT_type if possible.
-  dumpTypeName(OS, D);
+  DWARFDie TypeDie = D.getAttributeValueAsReferencedDie(DW_AT_type);
+  dumpTypeName(OS, TypeDie);
 
   switch (T) {
-  case DW_TAG_array_type:
-    OS << "[]";
+  case DW_TAG_subroutine_type: {
+    if (!TypeDie)
+      OS << "void";
+    OS << '(';
+    bool First = true;
+    for (const DWARFDie &C : D.children()) {
+      if (C.getTag() == DW_TAG_formal_parameter) {
+        if (!First)
+          OS << ", ";
+        First = false;
+        dumpTypeName(OS, C.getAttributeValueAsReferencedDie(DW_AT_type));
+      }
+    }
+    OS << ')';
+    break;
+  }
+  case DW_TAG_array_type: {
+    Optional<uint64_t> Bound;
+    for (const DWARFDie &C : D.children())
+      if (C.getTag() == DW_TAG_subrange_type) {
+        OS << '[';
+        uint64_t LowerBound = 0;
+        if (Optional<DWARFFormValue> L = C.find(DW_AT_lower_bound))
+          if (Optional<uint64_t> LB = L->getAsUnsignedConstant()) {
+            LowerBound = *LB;
+            OS << LowerBound << '-';
+          }
+        if (Optional<DWARFFormValue> CountV = C.find(DW_AT_count)) {
+          if (Optional<uint64_t> C = CountV->getAsUnsignedConstant())
+            OS << (*C + LowerBound);
+        } else if (Optional<DWARFFormValue> UpperV = C.find(DW_AT_upper_bound))
+          if (Optional<uint64_t> U = UpperV->getAsUnsignedConstant())
+            OS << *U;
+        OS << ']';
+      }
     break;
+  }
   case DW_TAG_pointer_type:
     OS << '*';
     break;
   case DW_TAG_ptr_to_member_type:
+    if (DWARFDie Cont =
+            D.getAttributeValueAsReferencedDie(DW_AT_containing_type)) {
+      dumpTypeName(OS << ' ', Cont);
+      OS << "::";
+    }
     OS << '*';
     break;
   case DW_TAG_reference_type:
@@ -270,12 +309,13 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
   // having both the raw value and the pretty-printed value is
   // interesting. These attributes are handled below.
   if (Attr == DW_AT_specification || Attr == DW_AT_abstract_origin) {
-    if (const char *Name = Die.getAttributeValueAsReferencedDie(Attr).getName(
-            DINameKind::LinkageName))
+    if (const char *Name =
+            Die.getAttributeValueAsReferencedDie(formValue).getName(
+                DINameKind::LinkageName))
       OS << Space << "\"" << Name << '\"';
   } else if (Attr == DW_AT_type) {
     OS << Space << "\"";
-    dumpTypeName(OS, Die);
+    dumpTypeName(OS, Die.getAttributeValueAsReferencedDie(formValue));
     OS << '"';
   } else if (Attr == DW_AT_APPLE_property_attribute) {
     if (Optional<uint64_t> OptVal = formValue.getAsUnsignedConstant())
@@ -371,7 +411,14 @@ DWARFDie::findRecursively(ArrayRef<dwarf::Attribute> Attrs) const {
 
 DWARFDie
 DWARFDie::getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const {
-  if (auto SpecRef = toReference(find(Attr))) {
+  if (Optional<DWARFFormValue> F = find(Attr))
+    return getAttributeValueAsReferencedDie(*F);
+  return DWARFDie();
+}
+
+DWARFDie
+DWARFDie::getAttributeValueAsReferencedDie(const DWARFFormValue &V) const {
+  if (auto SpecRef = toReference(V)) {
     if (auto SpecUnit = U->getUnitVector().getUnitForOffset(*SpecRef))
       return SpecUnit->getDIEForOffset(*SpecRef);
   }
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 1caaa249bef94f7c4174b71f982dfc8942d8c006..48900e4b7a28947f85d42488029d0b97ad7b8abb 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -66,9 +66,11 @@ void DWARFUnitVector::addUnitsImpl(
   DWARFDataExtractor Data(Obj, Section, LE, 0);
   // Lazy initialization of Parser, now that we have all section info.
   if (!Parser) {
-    Parser = [=, &Context, &Obj, &Section, &SOS, &LS](
-                 uint32_t Offset, DWARFSectionKind SectionKind,
-                 const DWARFSection *CurSection) -> std::unique_ptr<DWARFUnit> {
+    Parser = [=, &Context, &Obj, &Section, &SOS,
+              &LS](uint32_t Offset, DWARFSectionKind SectionKind,
+                   const DWARFSection *CurSection,
+                   const DWARFUnitIndex::Entry *IndexEntry)
+        -> std::unique_ptr<DWARFUnit> {
       const DWARFSection &InfoSection = CurSection ? *CurSection : Section;
       DWARFDataExtractor Data(Obj, InfoSection, LE, 0);
       if (!Data.isValidOffset(Offset))
@@ -77,7 +79,8 @@ void DWARFUnitVector::addUnitsImpl(
       if (IsDWO)
         Index = &getDWARFUnitIndex(Context, SectionKind);
       DWARFUnitHeader Header;
-      if (!Header.extract(Context, Data, &Offset, SectionKind, Index))
+      if (!Header.extract(Context, Data, &Offset, SectionKind, Index,
+                          IndexEntry))
         return nullptr;
       std::unique_ptr<DWARFUnit> U;
       if (Header.isTypeUnit())
@@ -106,7 +109,7 @@ void DWARFUnitVector::addUnitsImpl(
       ++I;
       continue;
     }
-    auto U = Parser(Offset, SectionKind, &Section);
+    auto U = Parser(Offset, SectionKind, &Section, nullptr);
     // If parsing failed, we're done with this section.
     if (!U)
       break;
@@ -156,7 +159,7 @@ DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E) {
   if (!Parser)
     return nullptr;
 
-  auto U = Parser(Offset, DW_SECT_INFO, nullptr);
+  auto U = Parser(Offset, DW_SECT_INFO, nullptr, &E);
   if (!U)
     U = nullptr;
 
@@ -233,9 +236,12 @@ bool DWARFUnitHeader::extract(DWARFContext &Context,
                               const DWARFDataExtractor &debug_info,
                               uint32_t *offset_ptr,
                               DWARFSectionKind SectionKind,
-                              const DWARFUnitIndex *Index) {
+                              const DWARFUnitIndex *Index,
+                              const DWARFUnitIndex::Entry *Entry) {
   Offset = *offset_ptr;
-  IndexEntry = Index ? Index->getFromOffset(*offset_ptr) : nullptr;
+  IndexEntry = Entry;
+  if (!IndexEntry && Index)
+    IndexEntry = Index->getFromOffset(*offset_ptr);
   Length = debug_info.getU32(offset_ptr);
   // FIXME: Support DWARF64.
   unsigned SizeOfLength = 4;
diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp
index 5f5ff69fe3f829eb3d41b9b09c7ce23b395ffd2a..256952073e88a0fc6dee89a680f4c95ff5e3a2e5 100644
--- a/lib/DebugInfo/PDB/GenericError.cpp
+++ b/lib/DebugInfo/PDB/GenericError.cpp
@@ -34,6 +34,8 @@ public:
       return "The PDB file path is an invalid UTF8 sequence.";
     case pdb_error_code::signature_out_of_date:
       return "The signature does not match; the file(s) might be out of date.";
+    case pdb_error_code::external_cmdline_ref:
+      return "The path to this file must be provided on the command-line.";
     }
     llvm_unreachable("Unrecognized generic_error_code");
   }
diff --git a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
index b97f1e90bcf83ead6e6d58f1e43e5b78fb940e9d..ab93efc839a9a101ff68c0595228466a64601ac0 100644
--- a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
@@ -19,7 +19,6 @@
 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
-#include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 
 using namespace llvm;
@@ -66,12 +65,22 @@ void DbiModuleDescriptorBuilder::setFirstSectionContrib(
 }
 
 void DbiModuleDescriptorBuilder::addSymbol(CVSymbol Symbol) {
-  Symbols.push_back(Symbol);
-  // Symbols written to a PDB file are required to be 4 byte aligned.  The same
+  // Defer to the bulk API. It does the same thing.
+  addSymbolsInBulk(Symbol.data());
+}
+
+void DbiModuleDescriptorBuilder::addSymbolsInBulk(
+    ArrayRef<uint8_t> BulkSymbols) {
+  // Do nothing for empty runs of symbols.
+  if (BulkSymbols.empty())
+    return;
+
+  Symbols.push_back(BulkSymbols);
+  // Symbols written to a PDB file are required to be 4 byte aligned. The same
   // is not true of object files.
-  assert(Symbol.length() % alignOf(CodeViewContainer::Pdb) == 0 &&
+  assert(BulkSymbols.size() % alignOf(CodeViewContainer::Pdb) == 0 &&
          "Invalid Symbol alignment!");
-  SymbolByteSize += Symbol.length();
+  SymbolByteSize += BulkSymbols.size();
 }
 
 void DbiModuleDescriptorBuilder::addSourceFile(StringRef Path) {
@@ -145,16 +154,13 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter,
     if (auto EC =
             SymbolWriter.writeInteger<uint32_t>(COFF::DEBUG_SECTION_MAGIC))
       return EC;
-    BinaryItemStream<CVSymbol> Records(llvm::support::endianness::little);
-    Records.setItems(Symbols);
-    BinaryStreamRef RecordsRef(Records);
-    if (auto EC = SymbolWriter.writeStreamRef(RecordsRef))
-      return EC;
-    if (auto EC = SymbolWriter.padToAlignment(4))
-      return EC;
-    // TODO: Write C11 Line data
+    for (ArrayRef<uint8_t> Syms : Symbols) {
+      if (auto EC = SymbolWriter.writeBytes(Syms))
+        return EC;
+    }
     assert(SymbolWriter.getOffset() % alignOf(CodeViewContainer::Pdb) == 0 &&
            "Invalid debug section alignment!");
+    // TODO: Write C11 Line data
     for (const auto &Builder : C13Builders) {
       assert(Builder && "Empty C13 Fragment Builder!");
       if (auto EC = Builder->commit(SymbolWriter))
diff --git a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
index b40ae8e9007030c1cee02ed740a27738c1d70ffc..57da7003da2b82cb486bb000a22fca72575ec7fa 100644
--- a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/DebugInfo/CodeView/RecordName.h"
 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
@@ -20,6 +21,7 @@
 #include "llvm/DebugInfo/PDB/Native/Hash.h"
 #include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/xxhash.h"
 #include <algorithm>
 #include <vector>
 
@@ -29,8 +31,27 @@ using namespace llvm::pdb;
 using namespace llvm::codeview;
 
 struct llvm::pdb::GSIHashStreamBuilder {
+  struct UdtDenseMapInfo {
+    static inline CVSymbol getEmptyKey() {
+      static CVSymbol Empty;
+      return Empty;
+    }
+    static inline CVSymbol getTombstoneKey() {
+      static CVSymbol Tombstone(static_cast<SymbolKind>(-1),
+                                ArrayRef<uint8_t>());
+      return Tombstone;
+    }
+    static unsigned getHashValue(const CVSymbol &Val) {
+      return xxHash64(Val.RecordData);
+    }
+    static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
+      return LHS.RecordData == RHS.RecordData;
+    }
+  };
+
   std::vector<CVSymbol> Records;
   uint32_t StreamIndex;
+  llvm::DenseSet<CVSymbol, UdtDenseMapInfo> UdtHashes;
   std::vector<PSHashRecord> HashRecords;
   std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
   std::vector<support::ulittle32_t> HashBuckets;
@@ -42,10 +63,18 @@ struct llvm::pdb::GSIHashStreamBuilder {
 
   template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) {
     T Copy(Symbol);
-    Records.push_back(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
-                                                       CodeViewContainer::Pdb));
+    addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
+                                               CodeViewContainer::Pdb));
+  }
+  void addSymbol(const CVSymbol &Symbol) {
+    if (Symbol.kind() == S_UDT) {
+      auto Iter = UdtHashes.insert(Symbol);
+      if (!Iter.second)
+        return;
+    }
+
+    Records.push_back(Symbol);
   }
-  void addSymbol(const CVSymbol &Symbol) { Records.push_back(Symbol); }
 };
 
 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
@@ -272,10 +301,6 @@ void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
   GSH->addSymbol(Sym, Msf);
 }
 
-void GSIStreamBuilder::addGlobalSymbol(const UDTSym &Sym) {
-  GSH->addSymbol(Sym, Msf);
-}
-
 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) {
   GSH->addSymbol(Sym);
 }
diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
index 6464b85982eca60ab9f3de9a8f2356bca4dd551c..8c97f4a012f03d61fde19ae78fce6deab7006946 100644
--- a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
+++ b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
@@ -11,7 +11,9 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecordHelpers.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/Support/BinaryStreamReader.h"
@@ -47,7 +49,8 @@ Error ModuleDebugStreamRef::reload() {
 
   if (auto EC = Reader.readInteger(Signature))
     return EC;
-  if (auto EC = Reader.readSubstream(SymbolsSubstream, SymbolSize - 4))
+  Reader.setOffset(0);
+  if (auto EC = Reader.readSubstream(SymbolsSubstream, SymbolSize))
     return EC;
   if (auto EC = Reader.readSubstream(C11LinesSubstream, C11Size))
     return EC;
@@ -55,8 +58,8 @@ Error ModuleDebugStreamRef::reload() {
     return EC;
 
   BinaryStreamReader SymbolReader(SymbolsSubstream.StreamData);
-  if (auto EC =
-          SymbolReader.readArray(SymbolArray, SymbolReader.bytesRemaining()))
+  if (auto EC = SymbolReader.readArray(
+          SymbolArray, SymbolReader.bytesRemaining(), sizeof(uint32_t)))
     return EC;
 
   BinaryStreamReader SubsectionsReader(C13LinesSubstream.StreamData);
@@ -76,6 +79,11 @@ Error ModuleDebugStreamRef::reload() {
   return Error::success();
 }
 
+const codeview::CVSymbolArray
+ModuleDebugStreamRef::getSymbolArrayForScope(uint32_t ScopeBegin) const {
+  return limitSymbolArrayToScope(SymbolArray, ScopeBegin);
+}
+
 BinarySubstreamRef ModuleDebugStreamRef::getSymbolsSubstream() const {
   return SymbolsSubstream;
 }
@@ -98,9 +106,7 @@ ModuleDebugStreamRef::symbols(bool *HadError) const {
 }
 
 CVSymbol ModuleDebugStreamRef::readSymbolAtOffset(uint32_t Offset) const {
-  // Offsets include the size of the 4-byte magic at the beginning, but lookup
-  // doesn't take that into account, so subtract it here.
-  auto Iter = SymbolArray.at(Offset - 4);
+  auto Iter = SymbolArray.at(Offset);
   assert(Iter != SymbolArray.end());
   return *Iter;
 }
diff --git a/lib/DebugInfo/PDB/Native/SymbolCache.cpp b/lib/DebugInfo/PDB/Native/SymbolCache.cpp
index 7485341a2064962f2f7c479273c68ad70df28288..5cdd628312fe5c7313c7062e2f8ef2efff3f63b2 100644
--- a/lib/DebugInfo/PDB/Native/SymbolCache.cpp
+++ b/lib/DebugInfo/PDB/Native/SymbolCache.cpp
@@ -68,9 +68,6 @@ SymbolCache::SymbolCache(NativeSession &Session, DbiStream *Dbi)
 
   if (Dbi)
     Compilands.resize(Dbi->modules().getModuleCount());
-
-  auto &Tpi = cantFail(Session.getPDBFile().getPDBTpiStream());
-  Tpi.buildHashMap();
 }
 
 std::unique_ptr<IPDBEnumSymbols>
diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp
index 44781705bfaed221953a93df9652182e7a17ed18..f234d446e6a099246e159fb0590c007aaffe3ea2 100644
--- a/lib/DebugInfo/PDB/Native/TpiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -160,6 +160,9 @@ void TpiStream::buildHashMap() {
 }
 
 std::vector<TypeIndex> TpiStream::findRecordsByName(StringRef Name) const {
+  if (!supportsTypeLookup())
+    const_cast<TpiStream*>(this)->buildHashMap();
+
   uint32_t Bucket = hashStringV1(Name) % Header->NumHashBuckets;
   if (Bucket > HashMap.size())
     return {};
@@ -177,6 +180,9 @@ bool TpiStream::supportsTypeLookup() const { return !HashMap.empty(); }
 
 Expected<TypeIndex>
 TpiStream::findFullDeclForForwardRef(TypeIndex ForwardRefTI) const {
+  if (!supportsTypeLookup())
+    const_cast<TpiStream*>(this)->buildHashMap();
+
   CVType F = Types->getType(ForwardRefTI);
   if (!isUdtForwardRef(F))
     return ForwardRefTI;
@@ -215,6 +221,7 @@ TpiStream::findFullDeclForForwardRef(TypeIndex ForwardRefTI) const {
 }
 
 codeview::CVType TpiStream::getType(codeview::TypeIndex Index) {
+  assert(!Index.isSimple());
   return Types->getType(Index);
 }
 
diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp
index b6b11dbddf26b5d841331e6a6d9d750318da47ea..b2de0be2b70c5f2a5e418bd5502f0291d94070e3 100644
--- a/lib/Demangle/ItaniumDemangle.cpp
+++ b/lib/Demangle/ItaniumDemangle.cpp
@@ -340,7 +340,7 @@ char *llvm::itaniumDemangle(const char *MangledName, char *Buf,
 
   if (AST == nullptr)
     InternalStatus = demangle_invalid_mangled_name;
-  else if (initializeOutputStream(Buf, N, S, 1024))
+  else if (!initializeOutputStream(Buf, N, S, 1024))
     InternalStatus = demangle_memory_alloc_failure;
   else {
     assert(Parser.ForwardTemplateRefs.empty());
@@ -356,15 +356,6 @@ char *llvm::itaniumDemangle(const char *MangledName, char *Buf,
   return InternalStatus == demangle_success ? Buf : nullptr;
 }
 
-bool llvm::itaniumFindTypesInMangledName(const char *MangledName, void *Ctx,
-                                         void (*Callback)(void *,
-                                                          const char *)) {
-  Demangler Parser(MangledName, MangledName + std::strlen(MangledName));
-  Parser.TypeCallback = Callback;
-  Parser.TypeCallbackContext = Ctx;
-  return Parser.parse() == nullptr;
-}
-
 ItaniumPartialDemangler::ItaniumPartialDemangler()
     : RootNode(nullptr), Context(new Demangler{nullptr, nullptr}) {}
 
@@ -396,7 +387,7 @@ bool ItaniumPartialDemangler::partialDemangle(const char *MangledName) {
 
 static char *printNode(const Node *RootNode, char *Buf, size_t *N) {
   OutputStream S;
-  if (initializeOutputStream(Buf, N, S, 128))
+  if (!initializeOutputStream(Buf, N, S, 128))
     return nullptr;
   RootNode->print(S);
   S += '\0';
@@ -441,7 +432,7 @@ char *ItaniumPartialDemangler::getFunctionDeclContextName(char *Buf,
   const Node *Name = static_cast<const FunctionEncoding *>(RootNode)->getName();
 
   OutputStream S;
-  if (initializeOutputStream(Buf, N, S, 128))
+  if (!initializeOutputStream(Buf, N, S, 128))
     return nullptr;
 
  KeepGoingLocalFunction:
@@ -494,7 +485,7 @@ char *ItaniumPartialDemangler::getFunctionParameters(char *Buf,
   NodeArray Params = static_cast<FunctionEncoding *>(RootNode)->getParams();
 
   OutputStream S;
-  if (initializeOutputStream(Buf, N, S, 128))
+  if (!initializeOutputStream(Buf, N, S, 128))
     return nullptr;
 
   S += '(';
@@ -512,7 +503,7 @@ char *ItaniumPartialDemangler::getFunctionReturnType(
     return nullptr;
 
   OutputStream S;
-  if (initializeOutputStream(Buf, N, S, 128))
+  if (!initializeOutputStream(Buf, N, S, 128))
     return nullptr;
 
   if (const Node *Ret =
diff --git a/lib/Demangle/MicrosoftDemangle.cpp b/lib/Demangle/MicrosoftDemangle.cpp
index 882e4a578455dbcac5189167b0839d522d3ba43f..cca7bf95a7e4afdb9115fe52334b874461ac3d70 100644
--- a/lib/Demangle/MicrosoftDemangle.cpp
+++ b/lib/Demangle/MicrosoftDemangle.cpp
@@ -40,7 +40,8 @@ struct NodeList {
   NodeList *Next = nullptr;
 };
 
-static bool isMemberPointer(StringView MangledName) {
+static bool isMemberPointer(StringView MangledName, bool &Error) {
+  Error = false;
   switch (MangledName.popFront()) {
   case '$':
     // This is probably an rvalue reference (e.g. $$Q), and you cannot have an
@@ -58,7 +59,8 @@ static bool isMemberPointer(StringView MangledName) {
     // what.
     break;
   default:
-    assert(false && "Ty is not a pointer type!");
+    Error = true;
+    return false;
   }
 
   // If it starts with a number, then 6 indicates a non-member function
@@ -89,9 +91,9 @@ static bool isMemberPointer(StringView MangledName) {
   case 'T':
     return true;
   default:
-    assert(false);
+    Error = true;
+    return false;
   }
-  return false;
 }
 
 static SpecialIntrinsicKind
@@ -874,7 +876,7 @@ void Demangler::memorizeIdentifier(IdentifierNode *Identifier) {
   // Render this class template name into a string buffer so that we can
   // memorize it for the purpose of back-referencing.
   OutputStream OS;
-  if (initializeOutputStream(nullptr, nullptr, OS, 1024))
+  if (!initializeOutputStream(nullptr, nullptr, OS, 1024))
     // FIXME: Propagate out-of-memory as an error?
     std::terminate();
   Identifier->output(OS, OF_Default);
@@ -1207,7 +1209,7 @@ Demangler::demangleStringLiteral(StringView &MangledName) {
   if (MangledName.empty())
     goto StringLiteralError;
 
-  if (initializeOutputStream(nullptr, nullptr, OS, 1024))
+  if (!initializeOutputStream(nullptr, nullptr, OS, 1024))
     // FIXME: Propagate out-of-memory as an error?
     std::terminate();
   if (IsWcharT) {
@@ -1330,7 +1332,7 @@ Demangler::demangleLocallyScopedNamePiece(StringView &MangledName) {
 
   // Render the parent symbol's name into a buffer.
   OutputStream OS;
-  if (initializeOutputStream(nullptr, nullptr, OS, 1024))
+  if (!initializeOutputStream(nullptr, nullptr, OS, 1024))
     // FIXME: Propagate out-of-memory as an error?
     std::terminate();
   OS << '`';
@@ -1651,10 +1653,12 @@ TypeNode *Demangler::demangleType(StringView &MangledName,
   if (isTagType(MangledName))
     Ty = demangleClassType(MangledName);
   else if (isPointerType(MangledName)) {
-    if (isMemberPointer(MangledName))
+    if (isMemberPointer(MangledName, Error))
       Ty = demangleMemberPointerType(MangledName);
-    else
+    else if (!Error)
       Ty = demanglePointerType(MangledName);
+    else
+      return nullptr;
   } else if (isArrayType(MangledName))
     Ty = demangleArrayType(MangledName);
   else if (isFunctionType(MangledName)) {
@@ -1669,10 +1673,10 @@ TypeNode *Demangler::demangleType(StringView &MangledName,
     Ty = demangleCustomType(MangledName);
   } else {
     Ty = demanglePrimitiveType(MangledName);
-    if (!Ty || Error)
-      return Ty;
   }
 
+  if (!Ty || Error)
+    return Ty;
   Ty->Quals = Qualifiers(Ty->Quals | Quals);
   return Ty;
 }
@@ -1988,6 +1992,8 @@ Demangler::demangleFunctionParameterList(StringView &MangledName) {
 
     *Current = Arena.alloc<NodeList>();
     TypeNode *TN = demangleType(MangledName, QualifierMangleMode::Drop);
+    if (!TN || Error)
+      return nullptr;
 
     (*Current)->N = TN;
 
@@ -2156,7 +2162,7 @@ void Demangler::dumpBackReferences() {
 
   // Create an output stream so we can render each type.
   OutputStream OS;
-  if (initializeOutputStream(nullptr, nullptr, OS, 1024))
+  if (!initializeOutputStream(nullptr, nullptr, OS, 1024))
     std::terminate();
   for (size_t I = 0; I < Backrefs.FunctionParamCount; ++I) {
     OS.setCurrentPosition(0);
@@ -2194,7 +2200,7 @@ char *llvm::microsoftDemangle(const char *MangledName, char *Buf, size_t *N,
 
   if (D.Error)
     InternalStatus = demangle_invalid_mangled_name;
-  else if (initializeOutputStream(Buf, N, S, 1024))
+  else if (!initializeOutputStream(Buf, N, S, 1024))
     InternalStatus = demangle_memory_alloc_failure;
   else {
     AST->output(S, OF_Default);
diff --git a/lib/Demangle/MicrosoftDemangleNodes.cpp b/lib/Demangle/MicrosoftDemangleNodes.cpp
index af893b9b68e1ce1bb56e54d4b10b79eedd250d08..a588dca5b7ba338bdf5498965099833b47d886c0 100644
--- a/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Demangle/Compiler.h"
 #include "llvm/Demangle/Utility.h"
 #include <cctype>
+#include <string>
 
 using namespace llvm;
 using namespace ms_demangle;
@@ -113,6 +114,14 @@ static void outputCallingConvention(OutputStream &OS, CallingConv CC) {
   }
 }
 
+std::string Node::toString() const {
+  OutputStream OS;
+  initializeOutputStream(nullptr, nullptr, OS, 1024);
+  this->output(OS, llvm::ms_demangle::OF_Default);
+  OS << '\0';
+  return {OS.getBuffer()};
+}
+
 void TypeNode::outputQuals(bool SpaceBefore, bool SpaceAfter) const {}
 
 void PrimitiveTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
@@ -161,22 +170,21 @@ void EncodedStringLiteralNode::output(OutputStream &OS,
                                       OutputFlags Flags) const {
   switch (Char) {
   case CharKind::Wchar:
-    OS << "const wchar_t * {L\"";
+    OS << "L\"";
     break;
   case CharKind::Char:
-    OS << "const char * {\"";
+    OS << "\"";
     break;
   case CharKind::Char16:
-    OS << "const char16_t * {u\"";
+    OS << "u\"";
     break;
   case CharKind::Char32:
-    OS << "const char32_t * {U\"";
+    OS << "U\"";
     break;
   }
   OS << DecodedString << "\"";
   if (IsTruncated)
     OS << "...";
-  OS << "}";
 }
 
 void IntegerLiteralNode::output(OutputStream &OS, OutputFlags Flags) const {
@@ -369,16 +377,23 @@ void LiteralOperatorIdentifierNode::output(OutputStream &OS,
 
 void FunctionSignatureNode::outputPre(OutputStream &OS,
                                       OutputFlags Flags) const {
+  if (FunctionClass & FC_Public)
+    OS << "public: ";
+  if (FunctionClass & FC_Protected)
+    OS << "protected: ";
+  if (FunctionClass & FC_Private)
+    OS << "private: ";
+
   if (!(FunctionClass & FC_Global)) {
     if (FunctionClass & FC_Static)
       OS << "static ";
   }
-  if (FunctionClass & FC_ExternC)
-    OS << "extern \"C\" ";
-
   if (FunctionClass & FC_Virtual)
     OS << "virtual ";
 
+  if (FunctionClass & FC_ExternC)
+    OS << "extern \"C\" ";
+
   if (ReturnType) {
     ReturnType->outputPre(OS, Flags);
     OS << " ";
@@ -555,9 +570,13 @@ void FunctionSymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
 void VariableSymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
   switch (SC) {
   case StorageClass::PrivateStatic:
+    OS << "private: static ";
+    break;
   case StorageClass::PublicStatic:
+    OS << "public: static ";
+    break;
   case StorageClass::ProtectedStatic:
-    OS << "static ";
+    OS << "protected: static ";
     break;
   default:
     break;
diff --git a/lib/ExecutionEngine/GDBRegistrationListener.cpp b/lib/ExecutionEngine/GDBRegistrationListener.cpp
index fd4f0746f7f9e310178a56ea9b0460de48f81642..8204f5a90268bb313dc2c1500ebf0341e9db9511 100644
--- a/lib/ExecutionEngine/GDBRegistrationListener.cpp
+++ b/lib/ExecutionEngine/GDBRegistrationListener.cpp
@@ -76,8 +76,8 @@ struct RegisteredObjectInfo {
 };
 
 // Buffer for an in-memory object file in executable memory
-typedef llvm::DenseMap< const char*, RegisteredObjectInfo>
-  RegisteredObjectBufferMap;
+typedef llvm::DenseMap<JITEventListener::ObjectKey, RegisteredObjectInfo>
+    RegisteredObjectBufferMap;
 
 /// Global access point for the JIT debugging interface designed for use with a
 /// singleton toolbox. Handles thread-safe registration and deregistration of
@@ -99,13 +99,13 @@ public:
   /// Creates an entry in the JIT registry for the buffer @p Object,
   /// which must contain an object file in executable memory with any
   /// debug information for the debugger.
-  void NotifyObjectEmitted(const ObjectFile &Object,
-                           const RuntimeDyld::LoadedObjectInfo &L) override;
+  void notifyObjectLoaded(ObjectKey K, const ObjectFile &Obj,
+                          const RuntimeDyld::LoadedObjectInfo &L) override;
 
   /// Removes the internal registration of @p Object, and
   /// frees associated resources.
   /// Returns true if @p Object was found in ObjectBufferMap.
-  void NotifyFreeingObject(const ObjectFile &Object) override;
+  void notifyFreeingObject(ObjectKey K) override;
 
 private:
   /// Deregister the debug info for the given object file from the debugger
@@ -147,11 +147,11 @@ GDBJITRegistrationListener::~GDBJITRegistrationListener() {
   ObjectBufferMap.clear();
 }
 
-void GDBJITRegistrationListener::NotifyObjectEmitted(
-                                       const ObjectFile &Object,
-                                       const RuntimeDyld::LoadedObjectInfo &L) {
+void GDBJITRegistrationListener::notifyObjectLoaded(
+    ObjectKey K, const ObjectFile &Obj,
+    const RuntimeDyld::LoadedObjectInfo &L) {
 
-  OwningBinary<ObjectFile> DebugObj = L.getObjectForDebug(Object);
+  OwningBinary<ObjectFile> DebugObj = L.getObjectForDebug(Obj);
 
   // Bail out if debug objects aren't supported.
   if (!DebugObj.getBinary())
@@ -160,11 +160,8 @@ void GDBJITRegistrationListener::NotifyObjectEmitted(
   const char *Buffer = DebugObj.getBinary()->getMemoryBufferRef().getBufferStart();
   size_t      Size = DebugObj.getBinary()->getMemoryBufferRef().getBufferSize();
 
-  const char *Key = Object.getMemoryBufferRef().getBufferStart();
-
-  assert(Key && "Attempt to register a null object with a debugger.");
   llvm::MutexGuard locked(*JITDebugLock);
-  assert(ObjectBufferMap.find(Key) == ObjectBufferMap.end() &&
+  assert(ObjectBufferMap.find(K) == ObjectBufferMap.end() &&
          "Second attempt to perform debug registration.");
   jit_code_entry* JITCodeEntry = new jit_code_entry();
 
@@ -175,16 +172,15 @@ void GDBJITRegistrationListener::NotifyObjectEmitted(
     JITCodeEntry->symfile_addr = Buffer;
     JITCodeEntry->symfile_size = Size;
 
-    ObjectBufferMap[Key] = RegisteredObjectInfo(Size, JITCodeEntry,
-                                                std::move(DebugObj));
+    ObjectBufferMap[K] =
+        RegisteredObjectInfo(Size, JITCodeEntry, std::move(DebugObj));
     NotifyDebugger(JITCodeEntry);
   }
 }
 
-void GDBJITRegistrationListener::NotifyFreeingObject(const ObjectFile& Object) {
-  const char *Key = Object.getMemoryBufferRef().getBufferStart();
+void GDBJITRegistrationListener::notifyFreeingObject(ObjectKey K) {
   llvm::MutexGuard locked(*JITDebugLock);
-  RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(Key);
+  RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(K);
 
   if (I != ObjectBufferMap.end()) {
     deregisterObjectInternal(I);
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 211f5216811f2ceb8e39236d42d85c6ddce961f9..e9051c198506ed7d12125495036466ab0f2bbb0a 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -47,7 +47,7 @@ class IntelJITEventListener : public JITEventListener {
   typedef DenseMap<const void *, MethodAddressVector>  ObjectMap;
 
   ObjectMap  LoadedObjectMap;
-  std::map<const char*, OwningBinary<ObjectFile>> DebugObjects;
+  std::map<ObjectKey, OwningBinary<ObjectFile>> DebugObjects;
 
 public:
   IntelJITEventListener(IntelJITEventsWrapper* libraryWrapper) {
@@ -57,10 +57,10 @@ public:
   ~IntelJITEventListener() {
   }
 
-  void NotifyObjectEmitted(const ObjectFile &Obj,
-                           const RuntimeDyld::LoadedObjectInfo &L) override;
+  void notifyObjectLoaded(ObjectKey Key, const ObjectFile &Obj,
+                          const RuntimeDyld::LoadedObjectInfo &L) override;
 
-  void NotifyFreeingObject(const ObjectFile &Obj) override;
+  void notifyFreeingObject(ObjectKey Key) override;
 };
 
 static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress,
@@ -96,9 +96,9 @@ static iJIT_Method_Load FunctionDescToIntelJITFormat(
   return Result;
 }
 
-void IntelJITEventListener::NotifyObjectEmitted(
-                                       const ObjectFile &Obj,
-                                       const RuntimeDyld::LoadedObjectInfo &L) {
+void IntelJITEventListener::notifyObjectLoaded(
+    ObjectKey Key, const ObjectFile &Obj,
+    const RuntimeDyld::LoadedObjectInfo &L) {
 
   OwningBinary<ObjectFile> DebugObjOwner = L.getObjectForDebug(Obj);
   const ObjectFile *DebugObj = DebugObjOwner.getBinary();
@@ -188,17 +188,17 @@ void IntelJITEventListener::NotifyObjectEmitted(
   // registered function addresses for each loaded object.  We will
   // use the MethodIDs map to get the registered ID for each function.
   LoadedObjectMap[ObjData] = Functions;
-  DebugObjects[Obj.getData().data()] = std::move(DebugObjOwner);
+  DebugObjects[Key] = std::move(DebugObjOwner);
 }
 
-void IntelJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
+void IntelJITEventListener::notifyFreeingObject(ObjectKey Key) {
   // This object may not have been registered with the listener. If it wasn't,
   // bail out.
-  if (DebugObjects.find(Obj.getData().data()) == DebugObjects.end())
+  if (DebugObjects.find(Key) == DebugObjects.end())
     return;
 
   // Get the address of the object image for use as a unique identifier
-  const ObjectFile &DebugObj = *DebugObjects[Obj.getData().data()].getBinary();
+  const ObjectFile &DebugObj = *DebugObjects[Key].getBinary();
   const void* ObjData = DebugObj.getData().data();
 
   // Get the object's function list from LoadedObjectMap
@@ -223,7 +223,7 @@ void IntelJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
 
   // Erase the object from LoadedObjectMap
   LoadedObjectMap.erase(OI);
-  DebugObjects.erase(Obj.getData().data());
+  DebugObjects.erase(Key);
 }
 
 }  // anonymous namespace.
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index 64dca930722e61ee7eadb2600b3eafad04d7ed05..044d9b7f27a793ea235c91b163965f76e2d696cd 100644
--- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -227,7 +227,8 @@ static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals,
   Type *RetTy = FTy->getReturnType();
   ffi_type *rtype = ffiTypeFor(RetTy);
 
-  if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, &args[0]) == FFI_OK) {
+  if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, args.data()) ==
+      FFI_OK) {
     SmallVector<uint8_t, 128> ret;
     if (RetTy->getTypeID() != Type::VoidTyID)
       ret.resize(TD.getTypeStoreSize(RetTy));
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 25c0cb5d6eff6e1958834723a79c324d20481ae1..ffc6707e1488c517c5168b3d1a3f9a06c19af22a 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -95,7 +95,7 @@ MCJIT::~MCJIT() {
 
   for (auto &Obj : LoadedObjects)
     if (Obj)
-      NotifyFreeingObject(*Obj);
+      notifyFreeingObject(*Obj);
 
   Archives.clear();
 }
@@ -119,7 +119,7 @@ void MCJIT::addObjectFile(std::unique_ptr<object::ObjectFile> Obj) {
   if (Dyld.hasError())
     report_fatal_error(Dyld.getErrorString());
 
-  NotifyObjectEmitted(*Obj, *L);
+  notifyObjectLoaded(*Obj, *L);
 
   LoadedObjects.push_back(std::move(Obj));
 }
@@ -216,7 +216,7 @@ void MCJIT::generateCodeForModule(Module *M) {
   if (!LoadedObject) {
     std::string Buf;
     raw_string_ostream OS(Buf);
-    logAllUnhandledErrors(LoadedObject.takeError(), OS, "");
+    logAllUnhandledErrors(LoadedObject.takeError(), OS);
     OS.flush();
     report_fatal_error(Buf);
   }
@@ -226,7 +226,7 @@ void MCJIT::generateCodeForModule(Module *M) {
   if (Dyld.hasError())
     report_fatal_error(Dyld.getErrorString());
 
-  NotifyObjectEmitted(*LoadedObject.get(), *L);
+  notifyObjectLoaded(*LoadedObject.get(), *L);
 
   Buffers.push_back(std::move(ObjectToLoad));
   LoadedObjects.push_back(std::move(*LoadedObject));
@@ -648,19 +648,23 @@ void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
   }
 }
 
-void MCJIT::NotifyObjectEmitted(const object::ObjectFile& Obj,
-                                const RuntimeDyld::LoadedObjectInfo &L) {
+void MCJIT::notifyObjectLoaded(const object::ObjectFile &Obj,
+                               const RuntimeDyld::LoadedObjectInfo &L) {
+  uint64_t Key =
+      static_cast<uint64_t>(reinterpret_cast<uintptr_t>(Obj.getData().data()));
   MutexGuard locked(lock);
   MemMgr->notifyObjectLoaded(this, Obj);
   for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
-    EventListeners[I]->NotifyObjectEmitted(Obj, L);
+    EventListeners[I]->notifyObjectLoaded(Key, Obj, L);
   }
 }
 
-void MCJIT::NotifyFreeingObject(const object::ObjectFile& Obj) {
+void MCJIT::notifyFreeingObject(const object::ObjectFile &Obj) {
+  uint64_t Key =
+      static_cast<uint64_t>(reinterpret_cast<uintptr_t>(Obj.getData().data()));
   MutexGuard locked(lock);
   for (JITEventListener *L : EventListeners)
-    L->NotifyFreeingObject(Obj);
+    L->notifyFreeingObject(Key);
 }
 
 JITSymbol
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 943b14942a0f55b90d7ca314a4bd6b3bf1d86454..1119e138720f43138ee206649a452672445aeaee 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -331,9 +331,9 @@ protected:
   /// the future.
   std::unique_ptr<MemoryBuffer> emitObject(Module *M);
 
-  void NotifyObjectEmitted(const object::ObjectFile& Obj,
-                           const RuntimeDyld::LoadedObjectInfo &L);
-  void NotifyFreeingObject(const object::ObjectFile& Obj);
+  void notifyObjectLoaded(const object::ObjectFile &Obj,
+                          const RuntimeDyld::LoadedObjectInfo &L);
+  void notifyFreeingObject(const object::ObjectFile &Obj);
 
   JITSymbol findExistingSymbol(const std::string &Name);
   Module *findModuleForSymbol(const std::string &Name, bool CheckFunctionsOnly);
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index 6f0825fb38da891604d6a923aa6e3008388b15ef..21af6b585c41e7d55a2ebab94767815e7634291f 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -40,7 +40,7 @@ class OProfileJITEventListener : public JITEventListener {
   std::unique_ptr<OProfileWrapper> Wrapper;
 
   void initialize();
-  std::map<const char*, OwningBinary<ObjectFile>> DebugObjects;
+  std::map<ObjectKey, OwningBinary<ObjectFile>> DebugObjects;
 
 public:
   OProfileJITEventListener(std::unique_ptr<OProfileWrapper> LibraryWrapper)
@@ -50,10 +50,10 @@ public:
 
   ~OProfileJITEventListener();
 
-  void NotifyObjectEmitted(const ObjectFile &Obj,
-                           const RuntimeDyld::LoadedObjectInfo &L) override;
+  void notifyObjectLoaded(ObjectKey Key, const ObjectFile &Obj,
+                          const RuntimeDyld::LoadedObjectInfo &L) override;
 
-  void NotifyFreeingObject(const ObjectFile &Obj) override;
+  void notifyFreeingObject(ObjectKey Key) override;
 };
 
 void OProfileJITEventListener::initialize() {
@@ -78,9 +78,9 @@ OProfileJITEventListener::~OProfileJITEventListener() {
   }
 }
 
-void OProfileJITEventListener::NotifyObjectEmitted(
-                                       const ObjectFile &Obj,
-                                       const RuntimeDyld::LoadedObjectInfo &L) {
+void OProfileJITEventListener::notifyObjectLoaded(
+    ObjectKey Key, const ObjectFile &Obj,
+    const RuntimeDyld::LoadedObjectInfo &L) {
   if (!Wrapper->isAgentAvailable()) {
     return;
   }
@@ -137,18 +137,18 @@ void OProfileJITEventListener::NotifyObjectEmitted(
     }
   }
 
-  DebugObjects[Obj.getData().data()] = std::move(DebugObjOwner);
+  DebugObjects[Key] = std::move(DebugObjOwner);
 }
 
-void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
+void OProfileJITEventListener::notifyFreeingObject(ObjectKey Key) {
   if (Wrapper->isAgentAvailable()) {
 
     // If there was no agent registered when the original object was loaded then
     // we won't have created a debug object for it, so bail out.
-    if (DebugObjects.find(Obj.getData().data()) == DebugObjects.end())
+    if (DebugObjects.find(Key) == DebugObjects.end())
       return;
 
-    const ObjectFile &DebugObj = *DebugObjects[Obj.getData().data()].getBinary();
+    const ObjectFile &DebugObj = *DebugObjects[Key].getBinary();
 
     // Use symbol info to iterate functions in the object.
     for (symbol_iterator I = DebugObj.symbol_begin(),
@@ -171,7 +171,7 @@ void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
     }
   }
 
-  DebugObjects.erase(Obj.getData().data());
+  DebugObjects.erase(Key);
 }
 
 }  // anonymous namespace.
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index f99cbec6d3b60a0a50199498e11fa9f220215a19..73c0bcdf7d28dab061dfc670cea5574ca2cfef0b 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -1947,7 +1947,8 @@ ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder,
                          SymbolStringPtr Name) {
   SymbolNameSet Names({Name});
 
-  JITDylibSearchList FullSearchOrder(SearchOrder.size());
+  JITDylibSearchList FullSearchOrder;
+  FullSearchOrder.reserve(SearchOrder.size());
   for (auto *JD : SearchOrder)
     FullSearchOrder.push_back({JD, false});
 
diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index deddfcb10e12e4d05ee4873615de0f08fa29e160..817a4b89bfb0517e99a30c70589f94035124b56b 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -492,13 +492,17 @@ private:
   void notifyFinalized(orc::VModuleKey K,
 		       const object::ObjectFile &Obj,
 		       const RuntimeDyld::LoadedObjectInfo &LoadedObjInfo) {
+    uint64_t Key = static_cast<uint64_t>(
+        reinterpret_cast<uintptr_t>(Obj.getData().data()));
     for (auto &Listener : EventListeners)
-      Listener->NotifyObjectEmitted(Obj, LoadedObjInfo);
+      Listener->notifyObjectLoaded(Key, Obj, LoadedObjInfo);
   }
 
   void notifyFreed(orc::VModuleKey K, const object::ObjectFile &Obj) {
+    uint64_t Key = static_cast<uint64_t>(
+        reinterpret_cast<uintptr_t>(Obj.getData().data()));
     for (auto &Listener : EventListeners)
-      Listener->NotifyFreeingObject(Obj);
+      Listener->notifyFreeingObject(Key);
   }
 
   orc::ExecutionSession ES;
diff --git a/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp b/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp
index 7bf8120d23dfdda91970bff25a40f41b1065f2bd..f195d028299865aeea3850bab1cc0e38a7f2e5f6 100644
--- a/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp
+++ b/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp
@@ -66,9 +66,9 @@ public:
       CloseMarker();
   }
 
-  void NotifyObjectEmitted(const ObjectFile &Obj,
-                           const RuntimeDyld::LoadedObjectInfo &L) override;
-  void NotifyFreeingObject(const ObjectFile &Obj) override;
+  void notifyObjectLoaded(ObjectKey K, const ObjectFile &Obj,
+                          const RuntimeDyld::LoadedObjectInfo &L) override;
+  void notifyFreeingObject(ObjectKey K) override;
 
 private:
   bool InitDebuggingDir();
@@ -227,8 +227,9 @@ PerfJITEventListener::PerfJITEventListener() : Pid(::getpid()) {
     SuccessfullyInitialized = true;
 }
 
-void PerfJITEventListener::NotifyObjectEmitted(
-    const ObjectFile &Obj, const RuntimeDyld::LoadedObjectInfo &L) {
+void PerfJITEventListener::notifyObjectLoaded(
+    ObjectKey K, const ObjectFile &Obj,
+    const RuntimeDyld::LoadedObjectInfo &L) {
 
   if (!SuccessfullyInitialized)
     return;
@@ -280,7 +281,7 @@ void PerfJITEventListener::NotifyObjectEmitted(
   Dumpstream->flush();
 }
 
-void PerfJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
+void PerfJITEventListener::notifyFreeingObject(ObjectKey K) {
   // perf currently doesn't have an interface for unloading. But munmap()ing the
   // code section does, so that's ok.
 }
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
index 1c54ad6fb03f8d248515ef74dfcba00b4bab1e6b..340ddaab186da07273def89ffa7d17caa3bf5ed8 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
@@ -66,7 +66,7 @@ RuntimeDyldCOFF::loadObject(const object::ObjectFile &O) {
   } else {
     HasError = true;
     raw_string_ostream ErrStream(ErrorStr);
-    logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, "");
+    logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream);
     return nullptr;
   }
 }
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index f9a81c7bd1b00538323ac4fe24c75dc060e5b838..226ee715e18bc768a1d48e572265195c9f94e2b4 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -255,7 +255,7 @@ RuntimeDyldELF::loadObject(const object::ObjectFile &O) {
   else {
     HasError = true;
     raw_string_ostream ErrStream(ErrorStr);
-    logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, "");
+    logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream);
     return nullptr;
   }
 }
@@ -1130,7 +1130,7 @@ RuntimeDyldELF::processRelocationRef(
     if (!SymTypeOrErr) {
       std::string Buf;
       raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(SymTypeOrErr.takeError(), OS, "");
+      logAllUnhandledErrors(SymTypeOrErr.takeError(), OS);
       OS.flush();
       report_fatal_error(Buf);
     }
@@ -1151,7 +1151,7 @@ RuntimeDyldELF::processRelocationRef(
       if (!SectionOrErr) {
         std::string Buf;
         raw_string_ostream OS(Buf);
-        logAllUnhandledErrors(SectionOrErr.takeError(), OS, "");
+        logAllUnhandledErrors(SectionOrErr.takeError(), OS);
         OS.flush();
         report_fatal_error(Buf);
       }
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index c5a215c833315039b643bf685ba79af1429518e5..d47fcd45be8859fa67e164d4a46c491f83f9e2ad 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -370,7 +370,7 @@ RuntimeDyldMachO::loadObject(const object::ObjectFile &O) {
   else {
     HasError = true;
     raw_string_ostream ErrStream(ErrorStr);
-    logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, "");
+    logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream);
     return nullptr;
   }
 }
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
index 729ea1ec48a4e9d65b0a8d214811817b4463a29f..8723dd0fd0ea5140fb8ba80d48dcb9d3f915cadb 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -28,7 +28,7 @@ static bool isThumbFunc(symbol_iterator Symbol, const ObjectFile &Obj,
   if (!SymTypeOrErr) {
     std::string Buf;
     raw_string_ostream OS(Buf);
-    logAllUnhandledErrors(SymTypeOrErr.takeError(), OS, "");
+    logAllUnhandledErrors(SymTypeOrErr.takeError(), OS);
     OS.flush();
     report_fatal_error(Buf);
   }
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 39bdc4b69217adced55de954fda2ff46d2fb8eaa..aee5f6dc374693c86b35c93a6135804ca554c118 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -128,6 +128,13 @@ public:
       break;
     }
 
+    case COFF::IMAGE_REL_AMD64_SECREL: {
+      assert(static_cast<int64_t>(RE.Addend) <= INT32_MAX && "Relocation overflow");
+      assert(static_cast<int64_t>(RE.Addend) >= INT32_MIN && "Relocation underflow");
+      writeBytesUnaligned(RE.Addend, Target, 4);
+      break;
+    }
+
     default:
       llvm_unreachable("Relocation type not implemented yet!");
       break;
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 3b575739263dc1c716c7f41f73356a594429f715..36f4f3aa8764b9b350ea45fbdf7c9f6080b47f31 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -1607,6 +1607,7 @@ struct MDFieldPrinter {
   void printInt(StringRef Name, IntTy Int, bool ShouldSkipZero = true);
   void printBool(StringRef Name, bool Value, Optional<bool> Default = None);
   void printDIFlags(StringRef Name, DINode::DIFlags Flags);
+  void printDISPFlags(StringRef Name, DISubprogram::DISPFlags Flags);
   template <class IntTy, class Stringifier>
   void printDwarfEnum(StringRef Name, IntTy Value, Stringifier toString,
                       bool ShouldSkipZero = true);
@@ -1705,6 +1706,30 @@ void MDFieldPrinter::printDIFlags(StringRef Name, DINode::DIFlags Flags) {
     Out << FlagsFS << Extra;
 }
 
+void MDFieldPrinter::printDISPFlags(StringRef Name,
+                                    DISubprogram::DISPFlags Flags) {
+  // Always print this field, because no flags in the IR at all will be
+  // interpreted as old-style isDefinition: true.
+  Out << FS << Name << ": ";
+
+  if (!Flags) {
+    Out << 0;
+    return;
+  }
+
+  SmallVector<DISubprogram::DISPFlags, 8> SplitFlags;
+  auto Extra = DISubprogram::splitFlags(Flags, SplitFlags);
+
+  FieldSeparator FlagsFS(" | ");
+  for (auto F : SplitFlags) {
+    auto StringF = DISubprogram::getFlagString(F);
+    assert(!StringF.empty() && "Expected valid flag");
+    Out << FlagsFS << StringF;
+  }
+  if (Extra || SplitFlags.empty())
+    Out << FlagsFS << Extra;
+}
+
 void MDFieldPrinter::printEmissionKind(StringRef Name,
                                        DICompileUnit::DebugEmissionKind EK) {
   Out << FS << Name << ": " << DICompileUnit::emissionKindString(EK);
@@ -1910,6 +1935,7 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
   Printer.printBool("debugInfoForProfiling", N->getDebugInfoForProfiling(),
                     false);
   Printer.printNameTableKind("nameTableKind", N->getNameTableKind());
+  Printer.printBool("rangesBaseAddress", N->getRangesBaseAddress(), false);
   Out << ")";
 }
 
@@ -1924,18 +1950,14 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N,
   Printer.printMetadata("file", N->getRawFile());
   Printer.printInt("line", N->getLine());
   Printer.printMetadata("type", N->getRawType());
-  Printer.printBool("isLocal", N->isLocalToUnit());
-  Printer.printBool("isDefinition", N->isDefinition());
   Printer.printInt("scopeLine", N->getScopeLine());
   Printer.printMetadata("containingType", N->getRawContainingType());
-  Printer.printDwarfEnum("virtuality", N->getVirtuality(),
-                         dwarf::VirtualityString);
   if (N->getVirtuality() != dwarf::DW_VIRTUALITY_none ||
       N->getVirtualIndex() != 0)
     Printer.printInt("virtualIndex", N->getVirtualIndex(), false);
   Printer.printInt("thisAdjustment", N->getThisAdjustment());
   Printer.printDIFlags("flags", N->getFlags());
-  Printer.printBool("isOptimized", N->isOptimized());
+  Printer.printDISPFlags("spFlags", N->getSPFlags());
   Printer.printMetadata("unit", N->getRawUnit());
   Printer.printMetadata("templateParams", N->getRawTemplateParams());
   Printer.printMetadata("declaration", N->getRawDeclaration());
@@ -2272,11 +2294,15 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
       Machine = MachineStorage.get();
     }
     int Slot = Machine->getMetadataSlot(N);
-    if (Slot == -1)
+    if (Slot == -1) {
+      if (const DILocation *Loc = dyn_cast<DILocation>(N)) {
+        writeDILocation(Out, Loc, TypePrinter, Machine, Context);
+        return;
+      }
       // Give the pointer value instead of "badref", since this comes up all
       // the time when debugging.
       Out << "<" << N << ">";
-    else
+    } else
       Out << '!' << Slot;
     return;
   }
@@ -2820,7 +2846,7 @@ void AssemblyWriter::printAliasSummary(const AliasSummary *AS) {
 }
 
 void AssemblyWriter::printGlobalVarSummary(const GlobalVarSummary *GS) {
-  // Nothing for now
+  Out << ", varFlags: (readonly: " << GS->VarFlags.ReadOnly << ")";
 }
 
 static std::string getLinkageName(GlobalValue::LinkageTypes LT) {
@@ -3014,6 +3040,8 @@ void AssemblyWriter::printSummary(const GlobalValueSummary &Summary) {
     FieldSeparator FS;
     for (auto &Ref : RefList) {
       Out << FS;
+      if (Ref.isReadOnly())
+        Out << "readonly ";
       Out << "^" << Machine.getGUIDSlot(Ref.getGUID());
     }
     Out << ")";
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 63a0f6f22af302e0ac72eded5a1e928b0057374b..ff46debb7a9e574681eac51f06272733af1b4bcc 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -639,7 +639,7 @@ LLVM_DUMP_METHOD void AttributeSet::dump() const {
 AttributeSetNode::AttributeSetNode(ArrayRef<Attribute> Attrs)
     : AvailableAttrs(0), NumAttrs(Attrs.size()) {
   // There's memory after the node where we can store the entries in.
-  std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
+  llvm::copy(Attrs, getTrailingObjects<Attribute>());
 
   for (const auto I : *this) {
     if (!I.isStringAttribute()) {
@@ -809,7 +809,7 @@ AttributeListImpl::AttributeListImpl(LLVMContext &C,
   assert(!Sets.empty() && "pointless AttributeListImpl");
 
   // There's memory after the node where we can store the entries in.
-  std::copy(Sets.begin(), Sets.end(), getTrailingObjects<AttributeSet>());
+  llvm::copy(Sets, getTrailingObjects<AttributeSet>());
 
   // Initialize AvailableFunctionAttrs summary bitset.
   static_assert(Attribute::EndAttrKinds <=
@@ -1685,28 +1685,32 @@ adjustCallerStackProbeSize(Function &Caller, const Function &Callee) {
 }
 
 /// If the inlined function defines a min legal vector width, then ensure
-/// the calling function has the same or larger min legal vector width. This
-/// function is called after the inlining decision has been made so we have to
-/// merge the attribute this way. Heuristics that would use
+/// the calling function has the same or larger min legal vector width. If the
+/// caller has the attribute, but the callee doesn't, we need to remove the
+/// attribute from the caller since we can't make any guarantees about the
+/// caller's requirements.
+/// This function is called after the inlining decision has been made so we have
+/// to merge the attribute this way. Heuristics that would use
 /// min-legal-vector-width to determine inline compatibility would need to be
 /// handled as part of inline cost analysis.
 static void
 adjustMinLegalVectorWidth(Function &Caller, const Function &Callee) {
-  if (Callee.hasFnAttribute("min-legal-vector-width")) {
-    uint64_t CalleeVectorWidth;
-    Callee.getFnAttribute("min-legal-vector-width")
-          .getValueAsString()
-          .getAsInteger(0, CalleeVectorWidth);
-    if (Caller.hasFnAttribute("min-legal-vector-width")) {
+  if (Caller.hasFnAttribute("min-legal-vector-width")) {
+    if (Callee.hasFnAttribute("min-legal-vector-width")) {
       uint64_t CallerVectorWidth;
       Caller.getFnAttribute("min-legal-vector-width")
             .getValueAsString()
             .getAsInteger(0, CallerVectorWidth);
-      if (CallerVectorWidth < CalleeVectorWidth) {
+      uint64_t CalleeVectorWidth;
+      Callee.getFnAttribute("min-legal-vector-width")
+            .getValueAsString()
+            .getAsInteger(0, CalleeVectorWidth);
+      if (CallerVectorWidth < CalleeVectorWidth)
         Caller.addFnAttr(Callee.getFnAttribute("min-legal-vector-width"));
-      }
     } else {
-      Caller.addFnAttr(Callee.getFnAttribute("min-legal-vector-width"));
+      // If the callee doesn't have the attribute then we don't know anything
+      // and must drop the attribute from the caller.
+      Caller.removeFnAttr("min-legal-vector-width");
     }
   }
 }
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 10cb1ac698da9230cc21f3d770328186240d8618..39e29a2a093c3bec882b7503419ad24b2ac7c41b 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -65,24 +65,19 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
   return true;
 }
 
-static bool UpgradeADCSBBIntrinsic(Function *F, Intrinsic::ID IID,
-                                   Function *&NewFn) {
-  // If this intrinsic has 3 operands, it's the new version.
-  if (F->getFunctionType()->getNumParams() == 3)
-    return false;
-
-  rename(F);
-  NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
-  return true;
-}
-
 static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
   // All of the intrinsics matches below should be marked with which llvm
   // version started autoupgrading them. At some point in the future we would
   // like to use this information to remove upgrade code for some older
   // intrinsics. It is currently undecided how we will determine that future
   // point.
-  if (Name.startswith("sse2.paddus.") || // Added in 8.0
+  if (Name == "addcarryx.u32" || // Added in 8.0
+      Name == "addcarryx.u64" || // Added in 8.0
+      Name == "addcarry.u32" || // Added in 8.0
+      Name == "addcarry.u64" || // Added in 8.0
+      Name == "subborrow.u32" || // Added in 8.0
+      Name == "subborrow.u64" || // Added in 8.0
+      Name.startswith("sse2.paddus.") || // Added in 8.0
       Name.startswith("sse2.psubus.") || // Added in 8.0
       Name.startswith("avx2.paddus.") || // Added in 8.0
       Name.startswith("avx2.psubus.") || // Added in 8.0
@@ -382,20 +377,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name,
     return true;
   }
 
-  if (Name == "addcarryx.u32")
-    return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_addcarryx_u32, NewFn);
-  if (Name == "addcarryx.u64")
-    return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_addcarryx_u64, NewFn);
-  if (Name == "addcarry.u32")
-    return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_addcarry_u32, NewFn);
-  if (Name == "addcarry.u64")
-    return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_addcarry_u64, NewFn);
-  if (Name == "subborrow.u32")
-    return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_subborrow_u32, NewFn);
-  if (Name == "subborrow.u64")
-    return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_subborrow_u64, NewFn);
-
-  if (Name == "rdtscp") {
+  if (Name == "rdtscp") { // Added in 8.0
     // If this intrinsic has 0 operands, it's the new version.
     if (F->getFunctionType()->getNumParams() == 0)
       return false;
@@ -3277,6 +3259,39 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                  : CI->getArgOperand(0);
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
+    } else if (IsX86 && (Name == "addcarryx.u32" || Name == "addcarryx.u64" ||
+                         Name == "addcarry.u32" || Name == "addcarry.u64" ||
+                         Name == "subborrow.u32" || Name == "subborrow.u64")) {
+      Intrinsic::ID IID;
+      if (Name[0] == 'a' && Name.back() == '2')
+        IID = Intrinsic::x86_addcarry_32;
+      else if (Name[0] == 'a' && Name.back() == '4')
+        IID = Intrinsic::x86_addcarry_64;
+      else if (Name[0] == 's' && Name.back() == '2')
+        IID = Intrinsic::x86_subborrow_32;
+      else if (Name[0] == 's' && Name.back() == '4')
+        IID = Intrinsic::x86_subborrow_64;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+
+      // Make a call with 3 operands.
+      Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+                        CI->getArgOperand(2)};
+      Value *NewCall = Builder.CreateCall(
+                                Intrinsic::getDeclaration(CI->getModule(), IID),
+                                Args);
+
+      // Extract the second result and store it.
+      Value *Data = Builder.CreateExtractValue(NewCall, 1);
+      // Cast the pointer to the right type.
+      Value *Ptr = Builder.CreateBitCast(CI->getArgOperand(3),
+                                 llvm::PointerType::getUnqual(Data->getType()));
+      Builder.CreateAlignedStore(Data, Ptr, 1);
+      // Replace the original call result with the first result of the new call.
+      Value *CF = Builder.CreateExtractValue(NewCall, 0);
+
+      CI->replaceAllUsesWith(CF);
+      Rep = nullptr;
     } else if (IsX86 && Name.startswith("avx512.mask.") &&
                upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
       // Rep will be updated by the call in the condition.
@@ -3478,40 +3493,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     return;
   }
 
-  case Intrinsic::x86_addcarryx_u32:
-  case Intrinsic::x86_addcarryx_u64:
-  case Intrinsic::x86_addcarry_u32:
-  case Intrinsic::x86_addcarry_u64:
-  case Intrinsic::x86_subborrow_u32:
-  case Intrinsic::x86_subborrow_u64: {
-    // This used to take 4 arguments. If we only have 3 arguments its already
-    // upgraded.
-    if (CI->getNumOperands() == 3)
-      return;
-
-    // Make a call with 3 operands.
-    NewCall = Builder.CreateCall(NewFn, { CI->getArgOperand(0),
-                                          CI->getArgOperand(1),
-                                          CI->getArgOperand(2)});
-    // Extract the second result and store it.
-    Value *Data = Builder.CreateExtractValue(NewCall, 1);
-    // Cast the pointer to the right type.
-    Value *Ptr = Builder.CreateBitCast(CI->getArgOperand(3),
-                                 llvm::PointerType::getUnqual(Data->getType()));
-    Builder.CreateAlignedStore(Data, Ptr, 1);
-    // Replace the original call result with the first result of the new call.
-    Value *CF = Builder.CreateExtractValue(NewCall, 0);
-
-    std::string Name = CI->getName();
-    if (!Name.empty()) {
-      CI->setName(Name + ".old");
-      NewCall->setName(Name);
-    }
-    CI->replaceAllUsesWith(CF);
-    CI->eraseFromParent();
-    return;
-  }
-
   case Intrinsic::x86_sse41_insertps:
   case Intrinsic::x86_sse41_dppd:
   case Intrinsic::x86_sse41_dpps:
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 12ab2e2ace4da07ddbebe5948a37c3c09ba66889..e9eb686c1e139536819d9df21d6c7156cc4e19ea 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -260,6 +260,14 @@ const BasicBlock *BasicBlock::getUniquePredecessor() const {
   return PredBB;
 }
 
+bool BasicBlock::hasNPredecessors(unsigned N) const {
+  return hasNItems(pred_begin(this), pred_end(this), N);
+}
+
+bool BasicBlock::hasNPredecessorsOrMore(unsigned N) const {
+  return hasNItemsOrMore(pred_begin(this), pred_end(this), N);
+}
+
 const BasicBlock *BasicBlock::getSingleSuccessor() const {
   succ_const_iterator SI = succ_begin(this), E = succ_end(this);
   if (SI == E) return nullptr; // no successors
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 107975df1e774ff999a1bd6a8d0a9bdf7c60e9f1..826199ee054e38df2d3c060b2cef45ce7a594658 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -1077,10 +1078,10 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
             isa<GlobalValue>(CE1->getOperand(0))) {
           GlobalValue *GV = cast<GlobalValue>(CE1->getOperand(0));
 
-          // Functions are at least 4-byte aligned.
-          unsigned GVAlign = GV->getAlignment();
-          if (isa<Function>(GV))
-            GVAlign = std::max(GVAlign, 4U);
+          unsigned GVAlign =
+              GV->getParent()
+                  ? GV->getPointerAlignment(GV->getParent()->getDataLayout())
+                  : 0;
 
           if (GVAlign > 1) {
             unsigned DstWidth = CI2->getType()->getBitWidth();
@@ -2052,7 +2053,7 @@ static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
 static bool isIndexInRangeOfArrayType(uint64_t NumElements,
                                       const ConstantInt *CI) {
   // We cannot bounds check the index if it doesn't fit in an int64_t.
-  if (CI->getValue().getActiveBits() > 64)
+  if (CI->getValue().getMinSignedBits() > 64)
     return false;
 
   // A negative index or an index past the end of our sequential type is
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index 2351e7e4a38930f06bd2ef9148a8389560a5761f..df09d13d3ebaa8d6f2c5e205a42864e3f3ade446 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -184,18 +184,15 @@ bool Constant::isNotMinSignedValue() const {
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
     return !CFP->getValueAPF().bitcastToAPInt().isMinSignedValue();
 
-  // Check for constant vectors which are splats of INT_MIN values.
-  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
-    if (Constant *Splat = CV->getSplatValue())
-      return Splat->isNotMinSignedValue();
-
-  // Check for constant vectors which are splats of INT_MIN values.
-  if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this)) {
-    if (CV->isSplat()) {
-      if (CV->getElementType()->isFloatingPointTy())
-        return !CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue();
-      return !CV->getElementAsAPInt(0).isMinSignedValue();
+  // Check that vectors don't contain INT_MIN
+  if (this->getType()->isVectorTy()) {
+    unsigned NumElts = this->getType()->getVectorNumElements();
+    for (unsigned i = 0; i != NumElts; ++i) {
+      Constant *Elt = this->getAggregateElement(i);
+      if (!Elt || !Elt->isNotMinSignedValue())
+        return false;
     }
+    return true;
   }
 
   // It *may* contain INT_MIN, we can't tell.
@@ -353,8 +350,12 @@ Constant *Constant::getAggregateElement(unsigned Elt) const {
 
 Constant *Constant::getAggregateElement(Constant *Elt) const {
   assert(isa<IntegerType>(Elt->getType()) && "Index must be an integer");
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(Elt))
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Elt)) {
+    // Check if the constant fits into an uint64_t.
+    if (CI->getValue().getActiveBits() > 64)
+      return nullptr;
     return getAggregateElement(CI->getZExtValue());
+  }
   return nullptr;
 }
 
@@ -722,9 +723,9 @@ Constant *ConstantFP::get(Type *Ty, StringRef Str) {
   return C;
 }
 
-Constant *ConstantFP::getNaN(Type *Ty, bool Negative, unsigned Type) {
+Constant *ConstantFP::getNaN(Type *Ty, bool Negative, uint64_t Payload) {
   const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType());
-  APFloat NaN = APFloat::getNaN(Semantics, Negative, Type);
+  APFloat NaN = APFloat::getNaN(Semantics, Negative, Payload);
   Constant *C = get(Ty->getContext(), NaN);
 
   if (VectorType *VTy = dyn_cast<VectorType>(Ty))
@@ -733,6 +734,28 @@ Constant *ConstantFP::getNaN(Type *Ty, bool Negative, unsigned Type) {
   return C;
 }
 
+Constant *ConstantFP::getQNaN(Type *Ty, bool Negative, APInt *Payload) {
+  const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType());
+  APFloat NaN = APFloat::getQNaN(Semantics, Negative, Payload);
+  Constant *C = get(Ty->getContext(), NaN);
+  
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    return ConstantVector::getSplat(VTy->getNumElements(), C);
+  
+  return C;
+}
+
+Constant *ConstantFP::getSNaN(Type *Ty, bool Negative, APInt *Payload) {
+  const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType());
+  APFloat NaN = APFloat::getSNaN(Semantics, Negative, Payload);
+  Constant *C = get(Ty->getContext(), NaN);
+  
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    return ConstantVector::getSplat(VTy->getNumElements(), C);
+  
+  return C;
+}
+
 Constant *ConstantFP::getNegativeZero(Type *Ty) {
   const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType());
   APFloat NegZero = APFloat::getZero(Semantics, /*Negative=*/true);
@@ -940,7 +963,7 @@ ConstantAggregate::ConstantAggregate(CompositeType *T, ValueTy VT,
                                      ArrayRef<Constant *> V)
     : Constant(T, VT, OperandTraits<ConstantAggregate>::op_end(this) - V.size(),
                V.size()) {
-  std::copy(V.begin(), V.end(), op_begin());
+  llvm::copy(V, op_begin());
 
   // Check that types match, unless this is an opaque struct.
   if (auto *ST = dyn_cast<StructType>(T))
@@ -1780,6 +1803,36 @@ Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy,
   return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced);
 }
 
+Constant *ConstantExpr::get(unsigned Opcode, Constant *C, unsigned Flags, 
+                            Type *OnlyIfReducedTy) {
+  // Check the operands for consistency first.
+  assert(Instruction::isUnaryOp(Opcode) &&
+         "Invalid opcode in unary constant expression");
+
+#ifndef NDEBUG
+  switch (Opcode) {
+  case Instruction::FNeg:
+    assert(C->getType()->isFPOrFPVectorTy() &&
+           "Tried to create a floating-point operation on a "
+           "non-floating-point type!");
+    break;
+  default:
+    break;
+  }
+#endif
+
+  // TODO: Try to constant fold operation.
+
+  if (OnlyIfReducedTy == C->getType())
+    return nullptr;
+
+  Constant *ArgVec[] = { C };
+  ConstantExprKeyType Key(Opcode, ArgVec, 0, Flags);
+
+  LLVMContextImpl *pImpl = C->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(C->getType(), Key);
+}
+
 Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
                             unsigned Flags, Type *OnlyIfReducedTy) {
   // Check the operands for consistency first.
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index e9f31e4ded68dd8d9cb391b4fff3baa38cbde65c..eac17139708471dc357b1ef54d0e20eb04ac88ca 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -529,7 +529,9 @@ struct ConstantExprKeyType {
   ConstantExpr *create(TypeClass *Ty) const {
     switch (Opcode) {
     default:
-      if (Instruction::isCast(Opcode))
+      if (Instruction::isCast(Opcode) ||
+          (Opcode >= Instruction::UnaryOpsBegin &&
+           Opcode < Instruction::UnaryOpsEnd))
         return new UnaryConstantExpr(Opcode, Ops[0], Ty);
       if ((Opcode >= Instruction::BinaryOpsBegin &&
            Opcode < Instruction::BinaryOpsEnd))
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index df3578ea87f3d3520c8e780fe1be3fa72f811548..d29759f567918a8a5eca38e7a5df44aa3a4da6b4 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -140,7 +140,7 @@ DICompileUnit *DIBuilder::createCompileUnit(
     StringRef Flags, unsigned RunTimeVer, StringRef SplitName,
     DICompileUnit::DebugEmissionKind Kind, uint64_t DWOId,
     bool SplitDebugInlining, bool DebugInfoForProfiling,
-    DICompileUnit::DebugNameTableKind NameTableKind) {
+    DICompileUnit::DebugNameTableKind NameTableKind, bool RangesBaseAddress) {
 
   assert(((Lang <= dwarf::DW_LANG_Fortran08 && Lang >= dwarf::DW_LANG_C89) ||
           (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
@@ -150,7 +150,8 @@ DICompileUnit *DIBuilder::createCompileUnit(
   CUNode = DICompileUnit::getDistinct(
       VMContext, Lang, File, Producer, isOptimized, Flags, RunTimeVer,
       SplitName, Kind, nullptr, nullptr, nullptr, nullptr, nullptr, DWOId,
-      SplitDebugInlining, DebugInfoForProfiling, NameTableKind);
+      SplitDebugInlining, DebugInfoForProfiling, NameTableKind,
+      RangesBaseAddress);
 
   // Create a named metadata so that it is easier to find cu in a module.
   NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
@@ -750,18 +751,18 @@ static DISubprogram *getSubprogram(bool IsDistinct, Ts &&... Args) {
 
 DISubprogram *DIBuilder::createFunction(
     DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File,
-    unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
-    bool isDefinition, unsigned ScopeLine, DINode::DIFlags Flags,
-    bool isOptimized, DITemplateParameterArray TParams, DISubprogram *Decl,
+    unsigned LineNo, DISubroutineType *Ty, unsigned ScopeLine,
+    DINode::DIFlags Flags, DISubprogram::DISPFlags SPFlags,
+    DITemplateParameterArray TParams, DISubprogram *Decl,
     DITypeArray ThrownTypes) {
+  bool IsDefinition = SPFlags & DISubprogram::SPFlagDefinition;
   auto *Node = getSubprogram(
-      /* IsDistinct = */ isDefinition, VMContext,
-      getNonCompileUnitScope(Context), Name, LinkageName, File, LineNo, Ty,
-      isLocalToUnit, isDefinition, ScopeLine, nullptr, 0, 0, 0, Flags,
-      isOptimized, isDefinition ? CUNode : nullptr, TParams, Decl,
+      /*IsDistinct=*/IsDefinition, VMContext, getNonCompileUnitScope(Context),
+      Name, LinkageName, File, LineNo, Ty, ScopeLine, nullptr, 0, 0, Flags,
+      SPFlags, IsDefinition ? CUNode : nullptr, TParams, Decl,
       MDTuple::getTemporary(VMContext, None).release(), ThrownTypes);
 
-  if (isDefinition)
+  if (IsDefinition)
     AllSubprograms.push_back(Node);
   trackIfUnresolved(Node);
   return Node;
@@ -769,35 +770,37 @@ DISubprogram *DIBuilder::createFunction(
 
 DISubprogram *DIBuilder::createTempFunctionFwdDecl(
     DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File,
-    unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
-    bool isDefinition, unsigned ScopeLine, DINode::DIFlags Flags,
-    bool isOptimized, DITemplateParameterArray TParams, DISubprogram *Decl,
+    unsigned LineNo, DISubroutineType *Ty, unsigned ScopeLine,
+    DINode::DIFlags Flags, DISubprogram::DISPFlags SPFlags,
+    DITemplateParameterArray TParams, DISubprogram *Decl,
     DITypeArray ThrownTypes) {
-  return DISubprogram::getTemporary(
-             VMContext, getNonCompileUnitScope(Context), Name, LinkageName,
-             File, LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine, nullptr,
-             0, 0, 0, Flags, isOptimized, isDefinition ? CUNode : nullptr,
-             TParams, Decl, nullptr, ThrownTypes)
+  bool IsDefinition = SPFlags & DISubprogram::SPFlagDefinition;
+  return DISubprogram::getTemporary(VMContext, getNonCompileUnitScope(Context),
+                                    Name, LinkageName, File, LineNo, Ty,
+                                    ScopeLine, nullptr, 0, 0, Flags, SPFlags,
+                                    IsDefinition ? CUNode : nullptr, TParams,
+                                    Decl, nullptr, ThrownTypes)
       .release();
 }
 
 DISubprogram *DIBuilder::createMethod(
     DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F,
-    unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
-    bool isDefinition, unsigned VK, unsigned VIndex, int ThisAdjustment,
-    DIType *VTableHolder, DINode::DIFlags Flags, bool isOptimized,
-    DITemplateParameterArray TParams, DITypeArray ThrownTypes) {
+    unsigned LineNo, DISubroutineType *Ty, unsigned VIndex, int ThisAdjustment,
+    DIType *VTableHolder, DINode::DIFlags Flags,
+    DISubprogram::DISPFlags SPFlags, DITemplateParameterArray TParams,
+    DITypeArray ThrownTypes) {
   assert(getNonCompileUnitScope(Context) &&
          "Methods should have both a Context and a context that isn't "
          "the compile unit.");
   // FIXME: Do we want to use different scope/lines?
+  bool IsDefinition = SPFlags & DISubprogram::SPFlagDefinition;
   auto *SP = getSubprogram(
-      /* IsDistinct = */ isDefinition, VMContext, cast<DIScope>(Context), Name,
-      LinkageName, F, LineNo, Ty, isLocalToUnit, isDefinition, LineNo,
-      VTableHolder, VK, VIndex, ThisAdjustment, Flags, isOptimized,
-      isDefinition ? CUNode : nullptr, TParams, nullptr, nullptr, ThrownTypes);
+      /*IsDistinct=*/IsDefinition, VMContext, cast<DIScope>(Context), Name,
+      LinkageName, F, LineNo, Ty, LineNo, VTableHolder, VIndex, ThisAdjustment,
+      Flags, SPFlags, IsDefinition ? CUNode : nullptr, TParams, nullptr,
+      nullptr, ThrownTypes);
 
-  if (isDefinition)
+  if (IsDefinition)
     AllSubprograms.push_back(SP);
   trackIfUnresolved(SP);
   return SP;
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index 02b7953cb5bbed5bcd972443c7241b9cbefddf52..d1ff54571782d10d89b0129e1593fc7bc144cd0f 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -438,11 +438,10 @@ private:
     auto distinctMDSubprogram = [&]() {
       return DISubprogram::getDistinct(
           MDS->getContext(), FileAndScope, MDS->getName(), LinkageName,
-          FileAndScope, MDS->getLine(), Type, MDS->isLocalToUnit(),
-          MDS->isDefinition(), MDS->getScopeLine(), ContainingType,
-          MDS->getVirtuality(), MDS->getVirtualIndex(),
-          MDS->getThisAdjustment(), MDS->getFlags(), MDS->isOptimized(), Unit,
-          TemplateParams, Declaration, Variables);
+          FileAndScope, MDS->getLine(), Type, MDS->getScopeLine(),
+          ContainingType, MDS->getVirtualIndex(), MDS->getThisAdjustment(),
+          MDS->getFlags(), MDS->getSPFlags(), Unit, TemplateParams, Declaration,
+          Variables);
     };
 
     if (MDS->isDistinct())
@@ -450,11 +449,9 @@ private:
 
     auto *NewMDS = DISubprogram::get(
         MDS->getContext(), FileAndScope, MDS->getName(), LinkageName,
-        FileAndScope, MDS->getLine(), Type, MDS->isLocalToUnit(),
-        MDS->isDefinition(), MDS->getScopeLine(), ContainingType,
-        MDS->getVirtuality(), MDS->getVirtualIndex(), MDS->getThisAdjustment(),
-        MDS->getFlags(), MDS->isOptimized(), Unit, TemplateParams, Declaration,
-        Variables);
+        FileAndScope, MDS->getLine(), Type, MDS->getScopeLine(), ContainingType,
+        MDS->getVirtualIndex(), MDS->getThisAdjustment(), MDS->getFlags(),
+        MDS->getSPFlags(), Unit, TemplateParams, Declaration, Variables);
 
     StringRef OldLinkageName = MDS->getLinkageName();
 
@@ -491,7 +488,8 @@ private:
         CU->getSplitDebugFilename(), DICompileUnit::LineTablesOnly, EnumTypes,
         RetainedTypes, GlobalVariables, ImportedEntities, CU->getMacros(),
         CU->getDWOId(), CU->getSplitDebugInlining(),
-        CU->getDebugInfoForProfiling(), CU->getNameTableKind());
+        CU->getDebugInfoForProfiling(), CU->getNameTableKind(),
+        CU->getRangesBaseAddress());
   }
 
   DILocation *getReplacementMDLocation(DILocation *MLD) {
@@ -719,6 +717,11 @@ static LLVMDIFlags map_to_llvmDIFlags(DINode::DIFlags Flags) {
   return static_cast<LLVMDIFlags>(Flags);
 }
 
+static DISubprogram::DISPFlags
+pack_into_DISPFlags(bool IsLocalToUnit, bool IsDefinition, bool IsOptimized) {
+  return DISubprogram::toSPFlags(IsLocalToUnit, IsDefinition, IsOptimized);
+}
+
 unsigned LLVMDebugMetadataVersion() {
   return DEBUG_METADATA_VERSION;
 }
@@ -802,9 +805,10 @@ LLVMMetadataRef LLVMDIBuilderCreateFunction(
     unsigned ScopeLine, LLVMDIFlags Flags, LLVMBool IsOptimized) {
   return wrap(unwrap(Builder)->createFunction(
       unwrapDI<DIScope>(Scope), {Name, NameLen}, {LinkageName, LinkageNameLen},
-      unwrapDI<DIFile>(File), LineNo, unwrapDI<DISubroutineType>(Ty),
-      IsLocalToUnit, IsDefinition, ScopeLine, map_from_llvmDIFlags(Flags),
-      IsOptimized, nullptr, nullptr, nullptr));
+      unwrapDI<DIFile>(File), LineNo, unwrapDI<DISubroutineType>(Ty), ScopeLine,
+      map_from_llvmDIFlags(Flags),
+      pack_into_DISPFlags(IsLocalToUnit, IsDefinition, IsOptimized), nullptr,
+      nullptr, nullptr));
 }
 
 
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index bb49cb2109a5dee8733d04d52a1576247ccabd85..3b702ce4798b99ddf10e077565c30079480a9f9e 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -470,7 +470,8 @@ DICompileUnit *DICompileUnit::getImpl(
     unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
     Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros,
     uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling,
-    unsigned NameTableKind, StorageType Storage, bool ShouldCreate) {
+    unsigned NameTableKind, bool RangesBaseAddress, StorageType Storage,
+    bool ShouldCreate) {
   assert(Storage != Uniqued && "Cannot unique DICompileUnit");
   assert(isCanonical(Producer) && "Expected canonical MDString");
   assert(isCanonical(Flags) && "Expected canonical MDString");
@@ -483,7 +484,8 @@ DICompileUnit *DICompileUnit::getImpl(
   return storeImpl(new (array_lengthof(Ops)) DICompileUnit(
                        Context, Storage, SourceLanguage, IsOptimized,
                        RuntimeVersion, EmissionKind, DWOId, SplitDebugInlining,
-                       DebugInfoForProfiling, NameTableKind, Ops),
+                       DebugInfoForProfiling, NameTableKind, RangesBaseAddress,
+                       Ops),
                    Storage);
 }
 
@@ -540,21 +542,55 @@ DILocalScope *DILocalScope::getNonLexicalBlockFileScope() const {
   return const_cast<DILocalScope *>(this);
 }
 
+DISubprogram::DISPFlags DISubprogram::getFlag(StringRef Flag) {
+  return StringSwitch<DISPFlags>(Flag)
+#define HANDLE_DISP_FLAG(ID, NAME) .Case("DISPFlag" #NAME, SPFlag##NAME)
+#include "llvm/IR/DebugInfoFlags.def"
+      .Default(SPFlagZero);
+}
+
+StringRef DISubprogram::getFlagString(DISPFlags Flag) {
+  switch (Flag) {
+  // Appease a warning.
+  case SPFlagVirtuality:
+    return "";
+#define HANDLE_DISP_FLAG(ID, NAME)                                             \
+  case SPFlag##NAME:                                                           \
+    return "DISPFlag" #NAME;
+#include "llvm/IR/DebugInfoFlags.def"
+  }
+  return "";
+}
+
+DISubprogram::DISPFlags
+DISubprogram::splitFlags(DISPFlags Flags,
+                         SmallVectorImpl<DISPFlags> &SplitFlags) {
+  // Multi-bit fields can require special handling. In our case, however, the
+  // only multi-bit field is virtuality, and all its values happen to be
+  // single-bit values, so the right behavior just falls out.
+#define HANDLE_DISP_FLAG(ID, NAME)                                             \
+  if (DISPFlags Bit = Flags & SPFlag##NAME) {                                  \
+    SplitFlags.push_back(Bit);                                                 \
+    Flags &= ~Bit;                                                             \
+  }
+#include "llvm/IR/DebugInfoFlags.def"
+  return Flags;
+}
+
 DISubprogram *DISubprogram::getImpl(
     LLVMContext &Context, Metadata *Scope, MDString *Name,
     MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type,
-    bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
-    Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
-    int ThisAdjustment, DIFlags Flags, bool IsOptimized, Metadata *Unit,
+    unsigned ScopeLine, Metadata *ContainingType, unsigned VirtualIndex,
+    int ThisAdjustment, DIFlags Flags, DISPFlags SPFlags, Metadata *Unit,
     Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes,
     Metadata *ThrownTypes, StorageType Storage, bool ShouldCreate) {
   assert(isCanonical(Name) && "Expected canonical MDString");
   assert(isCanonical(LinkageName) && "Expected canonical MDString");
-  DEFINE_GETIMPL_LOOKUP(
-      DISubprogram, (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
-                     IsDefinition, ScopeLine, ContainingType, Virtuality,
-                     VirtualIndex, ThisAdjustment, Flags, IsOptimized, Unit,
-                     TemplateParams, Declaration, RetainedNodes, ThrownTypes));
+  DEFINE_GETIMPL_LOOKUP(DISubprogram,
+                        (Scope, Name, LinkageName, File, Line, Type, ScopeLine,
+                         ContainingType, VirtualIndex, ThisAdjustment, Flags,
+                         SPFlags, Unit, TemplateParams, Declaration,
+                         RetainedNodes, ThrownTypes));
   SmallVector<Metadata *, 11> Ops = {
       File,        Scope,         Name,           LinkageName,    Type,       Unit,
       Declaration, RetainedNodes, ContainingType, TemplateParams, ThrownTypes};
@@ -566,11 +602,10 @@ DISubprogram *DISubprogram::getImpl(
         Ops.pop_back();
     }
   }
-  DEFINE_GETIMPL_STORE_N(DISubprogram,
-                         (Line, ScopeLine, Virtuality, VirtualIndex,
-                          ThisAdjustment, Flags, IsLocalToUnit, IsDefinition,
-                          IsOptimized),
-                         Ops, Ops.size());
+  DEFINE_GETIMPL_STORE_N(
+      DISubprogram,
+      (Line, ScopeLine, VirtualIndex, ThisAdjustment, Flags, SPFlags), Ops,
+      Ops.size());
 }
 
 bool DISubprogram::describes(const Function *F) const {
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index 5ddb1196b07226005575a9294943f9d81e5d2528..3c6665bbcb7eac8e2c4689516ab706cc5a41a879 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -33,9 +33,10 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/Regex.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
 #include <atomic>
 #include <cassert>
 #include <memory>
@@ -106,7 +107,7 @@ void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const {
 DiagnosticLocation::DiagnosticLocation(const DebugLoc &DL) {
   if (!DL)
     return;
-  Filename = DL->getFilename();
+  File = DL->getFile();
   Line = DL->getLine();
   Column = DL->getColumn();
 }
@@ -114,17 +115,36 @@ DiagnosticLocation::DiagnosticLocation(const DebugLoc &DL) {
 DiagnosticLocation::DiagnosticLocation(const DISubprogram *SP) {
   if (!SP)
     return;
-  Filename = SP->getFilename();
+  
+  File = SP->getFile();
   Line = SP->getScopeLine();
   Column = 0;
 }
 
-void DiagnosticInfoWithLocationBase::getLocation(StringRef *Filename,
-                                                 unsigned *Line,
-                                                 unsigned *Column) const {
-  *Filename = Loc.getFilename();
-  *Line = Loc.getLine();
-  *Column = Loc.getColumn();
+StringRef DiagnosticLocation::getRelativePath() const {
+  return File->getFilename();
+}
+
+std::string DiagnosticLocation::getAbsolutePath() const {
+  StringRef Name = File->getFilename();
+  if (sys::path::is_absolute(Name))
+    return Name;
+
+  SmallString<128> Path;
+  sys::path::append(Path, File->getDirectory(), Name);
+  return sys::path::remove_leading_dotslash(Path).str();
+}
+
+std::string DiagnosticInfoWithLocationBase::getAbsolutePath() const {
+  return Loc.getAbsolutePath();
+}
+
+void DiagnosticInfoWithLocationBase::getLocation(StringRef &RelativePath,
+                                                 unsigned &Line,
+                                                 unsigned &Column) const {
+  RelativePath = Loc.getRelativePath();
+  Line = Loc.getLine();
+  Column = Loc.getColumn();
 }
 
 const std::string DiagnosticInfoWithLocationBase::getLocationStr() const {
@@ -132,7 +152,7 @@ const std::string DiagnosticInfoWithLocationBase::getLocationStr() const {
   unsigned Line = 0;
   unsigned Column = 0;
   if (isLocationAvailable())
-    getLocation(&Filename, &Line, &Column);
+    getLocation(Filename, Line, Column);
   return (Filename + ":" + Twine(Line) + ":" + Twine(Column)).str();
 }
 
@@ -399,7 +419,7 @@ template <> struct MappingTraits<DiagnosticLocation> {
   static void mapping(IO &io, DiagnosticLocation &DL) {
     assert(io.outputting() && "input not yet implemented");
 
-    StringRef File = DL.getFilename();
+    StringRef File = DL.getRelativePath();
     unsigned Line = DL.getLine();
     unsigned Col = DL.getColumn();
 
diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp
index befe1d9ffb1cda24d0e7f98f55932fdb3a639d88..43010220b9f3cac88bfbf46d4552a4d82eb97b44 100644
--- a/lib/IR/IRPrintingPasses.cpp
+++ b/lib/IR/IRPrintingPasses.cpp
@@ -27,7 +27,8 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner,
       ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {}
 
 PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &) {
-  OS << Banner;
+  if (!Banner.empty())
+    OS << Banner << "\n";
   if (llvm::isFunctionInPrintList("*"))
     M.print(OS, nullptr, ShouldPreserveUseListOrder);
   else {
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 75e7413a47d0ffca7785f7ac47807f54d74004f3..f077957969fa5019998401f986556c245ac89781 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -303,6 +303,9 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
   case CatchPad: return "catchpad";
   case CatchSwitch: return "catchswitch";
 
+  // Standard unary operators...
+  case FNeg: return "fneg";
+
   // Standard binary operators...
   case Add: return "add";
   case FAdd: return "fadd";
@@ -602,6 +605,13 @@ const Instruction *Instruction::getNextNonDebugInstruction() const {
   return nullptr;
 }
 
+const Instruction *Instruction::getPrevNonDebugInstruction() const {
+  for (const Instruction *I = getPrevNode(); I; I = I->getPrevNode())
+    if (!isa<DbgInfoIntrinsic>(I))
+      return I;
+  return nullptr;
+}
+
 bool Instruction::isAssociative() const {
   unsigned Opcode = getOpcode();
   if (isAssociative(Opcode))
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 7d4b6df18d93911840a0683e0fd51dd53a370c28..8405bbde3220d19540963f1678a477d10ed6bd0a 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -65,10 +65,7 @@ AllocaInst::getAllocationSizeInBits(const DataLayout &DL) const {
 //===----------------------------------------------------------------------===//
 
 User::op_iterator CallSite::getCallee() const {
-  Instruction *II(getInstruction());
-  return isCall()
-    ? cast<CallInst>(II)->op_end() - 1 // Skip Callee
-    : cast<InvokeInst>(II)->op_end() - 3; // Skip BB, BB, Callee
+  return cast<CallBase>(getInstruction())->op_end() - 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -253,6 +250,82 @@ void LandingPadInst::addClause(Constant *Val) {
   getOperandList()[OpNo] = Val;
 }
 
+//===----------------------------------------------------------------------===//
+//                        CallBase Implementation
+//===----------------------------------------------------------------------===//
+
+Value *CallBase::getReturnedArgOperand() const {
+  unsigned Index;
+
+  if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+    return getArgOperand(Index - AttributeList::FirstArgIndex);
+  if (const Function *F = getCalledFunction())
+    if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
+        Index)
+      return getArgOperand(Index - AttributeList::FirstArgIndex);
+
+  return nullptr;
+}
+
+bool CallBase::hasRetAttr(Attribute::AttrKind Kind) const {
+  if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
+    return true;
+
+  // Look at the callee, if available.
+  if (const Function *F = getCalledFunction())
+    return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
+  return false;
+}
+
+/// Determine whether the argument or parameter has the given attribute.
+bool CallBase::paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
+  assert(ArgNo < getNumArgOperands() && "Param index out of bounds!");
+
+  if (Attrs.hasParamAttribute(ArgNo, Kind))
+    return true;
+  if (const Function *F = getCalledFunction())
+    return F->getAttributes().hasParamAttribute(ArgNo, Kind);
+  return false;
+}
+
+bool CallBase::hasFnAttrOnCalledFunction(Attribute::AttrKind Kind) const {
+  if (const Function *F = getCalledFunction())
+    return F->getAttributes().hasAttribute(AttributeList::FunctionIndex, Kind);
+  return false;
+}
+
+bool CallBase::hasFnAttrOnCalledFunction(StringRef Kind) const {
+  if (const Function *F = getCalledFunction())
+    return F->getAttributes().hasAttribute(AttributeList::FunctionIndex, Kind);
+  return false;
+}
+
+CallBase::op_iterator
+CallBase::populateBundleOperandInfos(ArrayRef<OperandBundleDef> Bundles,
+                                     const unsigned BeginIndex) {
+  auto It = op_begin() + BeginIndex;
+  for (auto &B : Bundles)
+    It = std::copy(B.input_begin(), B.input_end(), It);
+
+  auto *ContextImpl = getContext().pImpl;
+  auto BI = Bundles.begin();
+  unsigned CurrentIndex = BeginIndex;
+
+  for (auto &BOI : bundle_op_infos()) {
+    assert(BI != Bundles.end() && "Incorrect allocation?");
+
+    BOI.Tag = ContextImpl->getOrInsertBundleTag(BI->getTag());
+    BOI.Begin = CurrentIndex;
+    BOI.End = CurrentIndex + BI->input_size();
+    CurrentIndex = BOI.End;
+    BI++;
+  }
+
+  assert(BI == Bundles.end() && "Incorrect allocation?");
+
+  return It;
+}
+
 //===----------------------------------------------------------------------===//
 //                        CallInst Implementation
 //===----------------------------------------------------------------------===//
@@ -262,7 +335,7 @@ void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
   this->FTy = FTy;
   assert(getNumOperands() == Args.size() + CountBundleInputs(Bundles) + 1 &&
          "NumOperands not set up?");
-  Op<-1>() = Func;
+  setCalledOperand(Func);
 
 #ifndef NDEBUG
   assert((Args.size() == FTy->getNumParams() ||
@@ -275,7 +348,7 @@ void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
            "Calling a function with a bad signature!");
 #endif
 
-  std::copy(Args.begin(), Args.end(), op_begin());
+  llvm::copy(Args, op_begin());
 
   auto It = populateBundleOperandInfos(Bundles, Args.size());
   (void)It;
@@ -288,7 +361,7 @@ void CallInst::init(Value *Func, const Twine &NameStr) {
   FTy =
       cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
   assert(getNumOperands() == 1 && "NumOperands not set up?");
-  Op<-1>() = Func;
+  setCalledOperand(Func);
 
   assert(FTy->getNumParams() == 0 && "Calling a function with bad signature");
 
@@ -296,31 +369,27 @@ void CallInst::init(Value *Func, const Twine &NameStr) {
 }
 
 CallInst::CallInst(Value *Func, const Twine &Name, Instruction *InsertBefore)
-    : CallBase<CallInst>(
-          cast<FunctionType>(
-              cast<PointerType>(Func->getType())->getElementType())
-              ->getReturnType(),
-          Instruction::Call,
-          OperandTraits<CallBase<CallInst>>::op_end(this) - 1, 1,
-          InsertBefore) {
+    : CallBase(cast<FunctionType>(
+                   cast<PointerType>(Func->getType())->getElementType())
+                   ->getReturnType(),
+               Instruction::Call, OperandTraits<CallBase>::op_end(this) - 1, 1,
+               InsertBefore) {
   init(Func, Name);
 }
 
 CallInst::CallInst(Value *Func, const Twine &Name, BasicBlock *InsertAtEnd)
-    : CallBase<CallInst>(
-          cast<FunctionType>(
-              cast<PointerType>(Func->getType())->getElementType())
-              ->getReturnType(),
-          Instruction::Call,
-          OperandTraits<CallBase<CallInst>>::op_end(this) - 1, 1, InsertAtEnd) {
+    : CallBase(cast<FunctionType>(
+                   cast<PointerType>(Func->getType())->getElementType())
+                   ->getReturnType(),
+               Instruction::Call, OperandTraits<CallBase>::op_end(this) - 1, 1,
+               InsertAtEnd) {
   init(Func, Name);
 }
 
 CallInst::CallInst(const CallInst &CI)
-    : CallBase<CallInst>(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call,
-                         OperandTraits<CallBase<CallInst>>::op_end(this) -
-                             CI.getNumOperands(),
-                         CI.getNumOperands()) {
+    : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call,
+               OperandTraits<CallBase>::op_end(this) - CI.getNumOperands(),
+               CI.getNumOperands()) {
   setTailCallKind(CI.getTailCallKind());
   setCallingConv(CI.getCallingConv());
 
@@ -560,11 +629,12 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal,
                       const Twine &NameStr) {
   this->FTy = FTy;
 
-  assert(getNumOperands() == 3 + Args.size() + CountBundleInputs(Bundles) &&
+  assert((int)getNumOperands() ==
+             ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)) &&
          "NumOperands not set up?");
-  Op<-3>() = Fn;
-  Op<-2>() = IfNormal;
-  Op<-1>() = IfException;
+  setNormalDest(IfNormal);
+  setUnwindDest(IfException);
+  setCalledOperand(Fn);
 
 #ifndef NDEBUG
   assert(((Args.size() == FTy->getNumParams()) ||
@@ -577,7 +647,7 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal,
            "Invoking a function with a bad signature!");
 #endif
 
-  std::copy(Args.begin(), Args.end(), op_begin());
+  llvm::copy(Args, op_begin());
 
   auto It = populateBundleOperandInfos(Bundles, Args.size());
   (void)It;
@@ -587,10 +657,9 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal,
 }
 
 InvokeInst::InvokeInst(const InvokeInst &II)
-    : CallBase<InvokeInst>(II.Attrs, II.FTy, II.getType(), Instruction::Invoke,
-                           OperandTraits<CallBase<InvokeInst>>::op_end(this) -
-                               II.getNumOperands(),
-                           II.getNumOperands()) {
+    : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke,
+               OperandTraits<CallBase>::op_end(this) - II.getNumOperands(),
+               II.getNumOperands()) {
   setCallingConv(II.getCallingConv());
   std::copy(II.op_begin(), II.op_end(), op_begin());
   std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(),
@@ -834,7 +903,7 @@ void CatchSwitchInst::removeHandler(handler_iterator HI) {
 void FuncletPadInst::init(Value *ParentPad, ArrayRef<Value *> Args,
                           const Twine &NameStr) {
   assert(getNumOperands() == 1 + Args.size() && "NumOperands not set up?");
-  std::copy(Args.begin(), Args.end(), op_begin());
+  llvm::copy(Args, op_begin());
   setParentPad(ParentPad);
   setName(NameStr);
 }
@@ -1390,7 +1459,7 @@ void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList,
   assert(getNumOperands() == 1 + IdxList.size() &&
          "NumOperands not initialized?");
   Op<0>() = Ptr;
-  std::copy(IdxList.begin(), IdxList.end(), op_begin() + 1);
+  llvm::copy(IdxList, op_begin() + 1);
   setName(Name);
 }
 
@@ -1798,6 +1867,35 @@ bool ShuffleVectorInst::isTransposeMask(ArrayRef<int> Mask) {
   return true;
 }
 
+bool ShuffleVectorInst::isExtractSubvectorMask(ArrayRef<int> Mask,
+                                               int NumSrcElts, int &Index) {
+  // Must extract from a single source.
+  if (!isSingleSourceMaskImpl(Mask, NumSrcElts))
+    return false;
+
+  // Must be smaller (else this is an Identity shuffle).
+  if (NumSrcElts <= (int)Mask.size())
+    return false;
+
+  // Find start of extraction, accounting that we may start with an UNDEF.
+  int SubIndex = -1;
+  for (int i = 0, e = Mask.size(); i != e; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+    int Offset = (M % NumSrcElts) - i;
+    if (0 <= SubIndex && SubIndex != Offset)
+      return false;
+    SubIndex = Offset;
+  }
+
+  if (0 <= SubIndex) {
+    Index = SubIndex;
+    return true;
+  }
+  return false;
+}
+
 bool ShuffleVectorInst::isIdentityWithPadding() const {
   int NumOpElts = Op<0>()->getType()->getVectorNumElements();
   int NumMaskElts = getType()->getVectorNumElements();
@@ -1927,6 +2025,59 @@ Type *ExtractValueInst::getIndexedType(Type *Agg,
   return const_cast<Type*>(Agg);
 }
 
+//===----------------------------------------------------------------------===//
+//                             UnaryOperator Class
+//===----------------------------------------------------------------------===//
+
+UnaryOperator::UnaryOperator(UnaryOps iType, Value *S,
+                             Type *Ty, const Twine &Name,
+                             Instruction *InsertBefore)
+  : UnaryInstruction(Ty, iType, S, InsertBefore) {
+  Op<0>() = S;
+  setName(Name);
+  AssertOK();
+}
+
+UnaryOperator::UnaryOperator(UnaryOps iType, Value *S,
+                             Type *Ty, const Twine &Name,
+                             BasicBlock *InsertAtEnd)
+  : UnaryInstruction(Ty, iType, S, InsertAtEnd) {
+  Op<0>() = S;
+  setName(Name);
+  AssertOK();
+}
+
+UnaryOperator *UnaryOperator::Create(UnaryOps Op, Value *S,
+                                     const Twine &Name,
+                                     Instruction *InsertBefore) {
+  return new UnaryOperator(Op, S, S->getType(), Name, InsertBefore);
+}
+
+UnaryOperator *UnaryOperator::Create(UnaryOps Op, Value *S,
+                                     const Twine &Name,
+                                     BasicBlock *InsertAtEnd) {
+  UnaryOperator *Res = Create(Op, S, Name);
+  InsertAtEnd->getInstList().push_back(Res);
+  return Res;
+}
+
+void UnaryOperator::AssertOK() {
+  Value *LHS = getOperand(0);
+  (void)LHS; // Silence warnings.
+#ifndef NDEBUG
+  switch (getOpcode()) {
+  case FNeg:
+    assert(getType() == LHS->getType() &&
+           "Unary operation should return same type as operand!");
+    assert(getType()->isFPOrFPVectorTy() &&
+           "Tried to create a floating-point operation on a "
+           "non-floating-point type!");
+    break;
+  default: llvm_unreachable("Invalid opcode provided");
+  }
+#endif
+}
+
 //===----------------------------------------------------------------------===//
 //                             BinaryOperator Class
 //===----------------------------------------------------------------------===//
@@ -3668,6 +3819,10 @@ GetElementPtrInst *GetElementPtrInst::cloneImpl() const {
   return new (getNumOperands()) GetElementPtrInst(*this);
 }
 
+UnaryOperator *UnaryOperator::cloneImpl() const {
+  return Create(getOpcode(), Op<0>());
+}
+
 BinaryOperator *BinaryOperator::cloneImpl() const {
   return Create(getOpcode(), Op<0>(), Op<1>());
 }
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index 4da61e1772319909dd127b9100d9e6e02f9c9949..2d120869860a0eb02500ab5f036d02262b909d93 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -612,15 +612,12 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
   Metadata *File;
   unsigned Line;
   Metadata *Type;
-  bool IsLocalToUnit;
-  bool IsDefinition;
   unsigned ScopeLine;
   Metadata *ContainingType;
-  unsigned Virtuality;
   unsigned VirtualIndex;
   int ThisAdjustment;
   unsigned Flags;
-  bool IsOptimized;
+  unsigned SPFlags;
   Metadata *Unit;
   Metadata *TemplateParams;
   Metadata *Declaration;
@@ -629,45 +626,39 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
 
   MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName,
                 Metadata *File, unsigned Line, Metadata *Type,
-                bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
-                Metadata *ContainingType, unsigned Virtuality,
+                unsigned ScopeLine, Metadata *ContainingType,
                 unsigned VirtualIndex, int ThisAdjustment, unsigned Flags,
-                bool IsOptimized, Metadata *Unit, Metadata *TemplateParams,
+                unsigned SPFlags, Metadata *Unit, Metadata *TemplateParams,
                 Metadata *Declaration, Metadata *RetainedNodes,
                 Metadata *ThrownTypes)
       : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File),
-        Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit),
-        IsDefinition(IsDefinition), ScopeLine(ScopeLine),
-        ContainingType(ContainingType), Virtuality(Virtuality),
-        VirtualIndex(VirtualIndex), ThisAdjustment(ThisAdjustment),
-        Flags(Flags), IsOptimized(IsOptimized), Unit(Unit),
-        TemplateParams(TemplateParams), Declaration(Declaration),
+        Line(Line), Type(Type), ScopeLine(ScopeLine),
+        ContainingType(ContainingType), VirtualIndex(VirtualIndex),
+        ThisAdjustment(ThisAdjustment), Flags(Flags), SPFlags(SPFlags),
+        Unit(Unit), TemplateParams(TemplateParams), Declaration(Declaration),
         RetainedNodes(RetainedNodes), ThrownTypes(ThrownTypes) {}
   MDNodeKeyImpl(const DISubprogram *N)
       : Scope(N->getRawScope()), Name(N->getRawName()),
         LinkageName(N->getRawLinkageName()), File(N->getRawFile()),
-        Line(N->getLine()), Type(N->getRawType()),
-        IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()),
-        ScopeLine(N->getScopeLine()), ContainingType(N->getRawContainingType()),
-        Virtuality(N->getVirtuality()), VirtualIndex(N->getVirtualIndex()),
+        Line(N->getLine()), Type(N->getRawType()), ScopeLine(N->getScopeLine()),
+        ContainingType(N->getRawContainingType()),
+        VirtualIndex(N->getVirtualIndex()),
         ThisAdjustment(N->getThisAdjustment()), Flags(N->getFlags()),
-        IsOptimized(N->isOptimized()), Unit(N->getRawUnit()),
+        SPFlags(N->getSPFlags()), Unit(N->getRawUnit()),
         TemplateParams(N->getRawTemplateParams()),
-        Declaration(N->getRawDeclaration()), RetainedNodes(N->getRawRetainedNodes()),
+        Declaration(N->getRawDeclaration()),
+        RetainedNodes(N->getRawRetainedNodes()),
         ThrownTypes(N->getRawThrownTypes()) {}
 
   bool isKeyOf(const DISubprogram *RHS) const {
     return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
            LinkageName == RHS->getRawLinkageName() &&
            File == RHS->getRawFile() && Line == RHS->getLine() &&
-           Type == RHS->getRawType() && IsLocalToUnit == RHS->isLocalToUnit() &&
-           IsDefinition == RHS->isDefinition() &&
-           ScopeLine == RHS->getScopeLine() &&
+           Type == RHS->getRawType() && ScopeLine == RHS->getScopeLine() &&
            ContainingType == RHS->getRawContainingType() &&
-           Virtuality == RHS->getVirtuality() &&
            VirtualIndex == RHS->getVirtualIndex() &&
            ThisAdjustment == RHS->getThisAdjustment() &&
-           Flags == RHS->getFlags() && IsOptimized == RHS->isOptimized() &&
+           Flags == RHS->getFlags() && SPFlags == RHS->getSPFlags() &&
            Unit == RHS->getUnit() &&
            TemplateParams == RHS->getRawTemplateParams() &&
            Declaration == RHS->getRawDeclaration() &&
@@ -675,11 +666,13 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
            ThrownTypes == RHS->getRawThrownTypes();
   }
 
+  bool isDefinition() const { return SPFlags & DISubprogram::SPFlagDefinition; }
+
   unsigned getHashValue() const {
     // If this is a declaration inside an ODR type, only hash the type and the
     // name.  Otherwise the hash will be stronger than
     // MDNodeSubsetEqualImpl::isDeclarationOfODRMember().
-    if (!IsDefinition && LinkageName)
+    if (!isDefinition() && LinkageName)
       if (auto *CT = dyn_cast_or_null<DICompositeType>(Scope))
         if (CT->getRawIdentifier())
           return hash_combine(LinkageName, Scope);
@@ -696,7 +689,7 @@ template <> struct MDNodeSubsetEqualImpl<DISubprogram> {
   using KeyTy = MDNodeKeyImpl<DISubprogram>;
 
   static bool isSubsetEqual(const KeyTy &LHS, const DISubprogram *RHS) {
-    return isDeclarationOfODRMember(LHS.IsDefinition, LHS.Scope,
+    return isDeclarationOfODRMember(LHS.isDefinition(), LHS.Scope,
                                     LHS.LinkageName, LHS.TemplateParams, RHS);
   }
 
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index 204fecde32c98320e7b5bac1159756519189371e..5536c2497f1e986859933cf9fd5a90f5367d93a8 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -1484,7 +1484,7 @@ void GlobalObject::copyMetadata(const GlobalObject *Other, unsigned Offset) {
       std::vector<uint64_t> Elements(OrigElements.size() + 2);
       Elements[0] = dwarf::DW_OP_plus_uconst;
       Elements[1] = Offset;
-      std::copy(OrigElements.begin(), OrigElements.end(), Elements.begin() + 2);
+      llvm::copy(OrigElements, Elements.begin() + 2);
       E = DIExpression::get(getContext(), Elements);
       Attachment = DIGlobalVariableExpression::get(getContext(), GV, E);
     }
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index 7d02a3956ff90768f382e812a79b1b3c9fd4836c..70a16cbbf24210b45a2c11e9fa4aaebdd173fd8f 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -46,6 +46,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/RandomNumberGenerator.h"
+#include "llvm/Support/VersionTuple.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -547,6 +548,45 @@ void Module::setRtLibUseGOT() {
   addModuleFlag(ModFlagBehavior::Max, "RtLibUseGOT", 1);
 }
 
+void Module::setSDKVersion(const VersionTuple &V) {
+  SmallVector<unsigned, 3> Entries;
+  Entries.push_back(V.getMajor());
+  if (auto Minor = V.getMinor()) {
+    Entries.push_back(*Minor);
+    if (auto Subminor = V.getSubminor())
+      Entries.push_back(*Subminor);
+    // Ignore the 'build' component as it can't be represented in the object
+    // file.
+  }
+  addModuleFlag(ModFlagBehavior::Warning, "SDK Version",
+                ConstantDataArray::get(Context, Entries));
+}
+
+VersionTuple Module::getSDKVersion() const {
+  auto *CM = dyn_cast_or_null<ConstantAsMetadata>(getModuleFlag("SDK Version"));
+  if (!CM)
+    return {};
+  auto *Arr = dyn_cast_or_null<ConstantDataArray>(CM->getValue());
+  if (!Arr)
+    return {};
+  auto getVersionComponent = [&](unsigned Index) -> Optional<unsigned> {
+    if (Index >= Arr->getNumElements())
+      return None;
+    return (unsigned)Arr->getElementAsInteger(Index);
+  };
+  auto Major = getVersionComponent(0);
+  if (!Major)
+    return {};
+  VersionTuple Result = VersionTuple(*Major);
+  if (auto Minor = getVersionComponent(1)) {
+    Result = VersionTuple(*Major, *Minor);
+    if (auto Subminor = getVersionComponent(2)) {
+      Result = VersionTuple(*Major, *Minor, *Subminor);
+    }
+  }
+  return Result;
+}
+
 GlobalVariable *llvm::collectUsedGlobalVariables(
     const Module &M, SmallPtrSetImpl<GlobalValue *> &Set, bool CompilerUsed) {
   const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used";
diff --git a/lib/IR/ModuleSummaryIndex.cpp b/lib/IR/ModuleSummaryIndex.cpp
index 8d85f7901b080476119d4c6d979858ac1ce41921..46b88cd31779a2097a8d6ddb846568b271ab9f8c 100644
--- a/lib/IR/ModuleSummaryIndex.cpp
+++ b/lib/IR/ModuleSummaryIndex.cpp
@@ -14,11 +14,17 @@
 
 #include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "module-summary-index"
+
+STATISTIC(ReadOnlyLiveGVars,
+          "Number of live global variables marked read only");
+
 FunctionSummary FunctionSummary::ExternalNode =
     FunctionSummary::makeDummyFunctionSummary({});
 bool ValueInfo::isDSOLocal() const {
@@ -30,6 +36,17 @@ bool ValueInfo::isDSOLocal() const {
                       });
 }
 
+// Gets the number of immutable refs in RefEdgeList
+unsigned FunctionSummary::immutableRefCount() const {
+  // Here we take advantage of having all readonly references
+  // located in the end of the RefEdgeList.
+  auto Refs = refs();
+  unsigned ImmutableRefCnt = 0;
+  for (int I = Refs.size() - 1; I >= 0 && Refs[I].isReadOnly(); --I)
+    ImmutableRefCnt++;
+  return ImmutableRefCnt;
+}
+
 // Collect for the given module the list of function it defines
 // (GUID -> Summary).
 void ModuleSummaryIndex::collectDefinedFunctionsForModule(
@@ -84,6 +101,80 @@ bool ModuleSummaryIndex::isGUIDLive(GlobalValue::GUID GUID) const {
   return false;
 }
 
+static void propagateConstantsToRefs(GlobalValueSummary *S) {
+  // If reference is not readonly then referenced summary is not
+  // readonly either. Note that:
+  // - All references from GlobalVarSummary are conservatively considered as
+  //   not readonly. Tracking them properly requires more complex analysis
+  //   then we have now.
+  //
+  // - AliasSummary objects have no refs at all so this function is a no-op
+  //   for them.
+  for (auto &VI : S->refs()) {
+    if (VI.isReadOnly()) {
+      // We only mark refs as readonly when computing function summaries on
+      // analysis phase.
+      assert(isa<FunctionSummary>(S));
+      continue;
+    }
+    for (auto &Ref : VI.getSummaryList())
+      // If references to alias is not readonly then aliasee is not readonly
+      if (auto *GVS = dyn_cast<GlobalVarSummary>(Ref->getBaseObject()))
+        GVS->setReadOnly(false);
+  }
+}
+
+// Do the constant propagation in combined index.
+// The goal of constant propagation is internalization of readonly
+// variables. To determine which variables are readonly and which
+// are not we take following steps:
+// - During analysis we speculatively assign readonly attribute to
+//   all variables which can be internalized. When computing function
+//   summary we also assign readonly attribute to a reference if
+//   function doesn't modify referenced variable.
+//
+// - After computing dead symbols in combined index we do the constant
+//   propagation. During this step we clear readonly attribute from
+//   all variables which:
+//   a. are preserved or can't be imported
+//   b. referenced by any global variable initializer
+//   c. referenced by a function and reference is not readonly
+//
+// Internalization itself happens in the backend after import is finished
+// See internalizeImmutableGVs.
+void ModuleSummaryIndex::propagateConstants(
+    const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+  for (auto &P : *this)
+    for (auto &S : P.second.SummaryList) {
+      if (!isGlobalValueLive(S.get()))
+        // We don't examine references from dead objects
+        continue;
+
+      // Global variable can't be marked read only if it is not eligible
+      // to import since we need to ensure that all external references
+      // get a local (imported) copy. It also can't be marked read only
+      // if it or any alias (since alias points to the same memory) are
+      // preserved or notEligibleToImport, since either of those means
+      // there could be writes that are not visible (because preserved
+      // means it could have external to DSO writes, and notEligibleToImport
+      // means it could have writes via inline assembly leading it to be
+      // in the @llvm.*used).
+      if (auto *GVS = dyn_cast<GlobalVarSummary>(S->getBaseObject()))
+        // Here we intentionally pass S.get() not GVS, because S could be
+        // an alias.
+        if (!canImportGlobalVar(S.get()) || GUIDPreservedSymbols.count(P.first))
+          GVS->setReadOnly(false);
+      propagateConstantsToRefs(S.get());
+    }
+  if (llvm::AreStatisticsEnabled())
+    for (auto &P : *this)
+      if (P.second.SummaryList.size())
+        if (auto *GVS = dyn_cast<GlobalVarSummary>(
+                P.second.SummaryList[0]->getBaseObject()))
+          if (isGlobalValueLive(GVS) && GVS->isReadOnly())
+            ReadOnlyLiveGVars++;
+}
+
 // TODO: write a graphviz dumper for SCCs (see ModuleSummaryIndex::exportToDot)
 // then delete this function and update its tests
 LLVM_DUMP_METHOD
@@ -108,6 +199,7 @@ namespace {
 struct Attributes {
   void add(const Twine &Name, const Twine &Value,
            const Twine &Comment = Twine());
+  void addComment(const Twine &Comment);
   std::string getAsString() const;
 
   std::vector<std::string> Attrs;
@@ -129,6 +221,10 @@ void Attributes::add(const Twine &Name, const Twine &Value,
   A += Value.str();
   A += "\"";
   Attrs.push_back(A);
+  addComment(Comment);
+}
+
+void Attributes::addComment(const Twine &Comment) {
   if (!Comment.isTriviallyEmpty()) {
     if (Comments.empty())
       Comments = " // ";
@@ -237,6 +333,12 @@ static void defineExternalNode(raw_ostream &OS, const char *Pfx,
   OS << "\"]; // defined externally\n";
 }
 
+static bool hasReadOnlyFlag(const GlobalValueSummary *S) {
+  if (auto *GVS = dyn_cast<GlobalVarSummary>(S))
+    return GVS->isReadOnly();
+  return false;
+}
+
 void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const {
   std::vector<Edge> CrossModuleEdges;
   DenseMap<GlobalValue::GUID, std::vector<uint64_t>> NodeMap;
@@ -252,13 +354,17 @@ void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const {
   };
 
   auto DrawEdge = [&](const char *Pfx, uint64_t SrcMod, GlobalValue::GUID SrcId,
-                      uint64_t DstMod, GlobalValue::GUID DstId, int TypeOrHotness) {
-    // 0 corresponds to alias edge, 1 to ref edge, 2 to call with unknown
-    // hotness, ...
-    TypeOrHotness += 2;
+                      uint64_t DstMod, GlobalValue::GUID DstId,
+                      int TypeOrHotness) {
+    // 0 - alias
+    // 1 - reference
+    // 2 - constant reference
+    // Other value: (hotness - 3).
+    TypeOrHotness += 3;
     static const char *EdgeAttrs[] = {
         " [style=dotted]; // alias",
         " [style=dashed]; // ref",
+        " [style=dashed,color=forestgreen]; // const-ref",
         " // call (hotness : Unknown)",
         " [color=blue]; // call (hotness : Cold)",
         " // call (hotness : None)",
@@ -301,6 +407,8 @@ void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const {
         A.add("shape", "box");
       } else {
         A.add("shape", "Mrecord", "variable");
+        if (Flags.Live && hasReadOnlyFlag(SummaryIt.second))
+          A.addComment("immutable");
       }
 
       auto VI = getValueInfo(SummaryIt.first);
@@ -318,7 +426,7 @@ void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const {
     for (auto &SummaryIt : GVSMap) {
       auto *GVS = SummaryIt.second;
       for (auto &R : GVS->refs())
-        Draw(SummaryIt.first, R.getGUID(), -1);
+        Draw(SummaryIt.first, R.getGUID(), R.isReadOnly() ? -1 : -2);
 
       if (auto *AS = dyn_cast_or_null<AliasSummary>(SummaryIt.second)) {
         GlobalValue::GUID AliaseeId;
@@ -331,7 +439,7 @@ void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const {
             AliaseeId = AliaseeOrigId;
         }
 
-        Draw(SummaryIt.first, AliaseeId, -2);
+        Draw(SummaryIt.first, AliaseeId, -3);
         continue;
       }
 
diff --git a/lib/IR/PassTimingInfo.cpp b/lib/IR/PassTimingInfo.cpp
index 8165704a9ec0c52e4d202798867c5ccffb77c172..40b3977ecbd9b92dbaea277803af952d4c591e2f 100644
--- a/lib/IR/PassTimingInfo.cpp
+++ b/lib/IR/PassTimingInfo.cpp
@@ -226,7 +226,7 @@ static bool matchPassManager(StringRef PassID) {
          Prefix.endswith("AnalysisManagerProxy");
 }
 
-bool TimePassesHandler::runBeforePass(StringRef PassID, Any IR) {
+bool TimePassesHandler::runBeforePass(StringRef PassID) {
   if (matchPassManager(PassID))
     return true;
 
@@ -239,7 +239,7 @@ bool TimePassesHandler::runBeforePass(StringRef PassID, Any IR) {
   return true;
 }
 
-void TimePassesHandler::runAfterPass(StringRef PassID, Any IR) {
+void TimePassesHandler::runAfterPass(StringRef PassID) {
   if (matchPassManager(PassID))
     return;
 
@@ -254,13 +254,15 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) {
     return;
 
   PIC.registerBeforePassCallback(
-      [this](StringRef P, Any IR) { return this->runBeforePass(P, IR); });
+      [this](StringRef P, Any) { return this->runBeforePass(P); });
   PIC.registerAfterPassCallback(
-      [this](StringRef P, Any IR) { this->runAfterPass(P, IR); });
+      [this](StringRef P, Any) { this->runAfterPass(P); });
+  PIC.registerAfterPassInvalidatedCallback(
+      [this](StringRef P) { this->runAfterPass(P); });
   PIC.registerBeforeAnalysisCallback(
-      [this](StringRef P, Any IR) { this->runBeforePass(P, IR); });
+      [this](StringRef P, Any) { this->runBeforePass(P); });
   PIC.registerAfterAnalysisCallback(
-      [this](StringRef P, Any IR) { this->runAfterPass(P, IR); });
+      [this](StringRef P, Any) { this->runAfterPass(P); });
 }
 
 } // namespace llvm
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 6dc48ad97cc64294575bed1fdec28fa2e211181d..dc8af6b68e7bf4deb2a9f945af79ba8ee5138dd5 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -130,20 +130,11 @@ void Value::destroyValueName() {
 }
 
 bool Value::hasNUses(unsigned N) const {
-  const_use_iterator UI = use_begin(), E = use_end();
-
-  for (; N; --N, ++UI)
-    if (UI == E) return false;  // Too few.
-  return UI == E;
+  return hasNItems(use_begin(), use_end(), N);
 }
 
 bool Value::hasNUsesOrMore(unsigned N) const {
-  const_use_iterator UI = use_begin(), E = use_end();
-
-  for (; N; --N, ++UI)
-    if (UI == E) return false;  // Too few.
-
-  return true;
+  return hasNItemsOrMore(use_begin(), use_end(), N);
 }
 
 bool Value::isUsedInBasicBlock(const BasicBlock *BB) const {
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 4d0135d8338e92825462a3c69b834eb022fe4bd1..7fd6df3e6e001f304205ef3db1d7ef774691e817 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -281,6 +281,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   /// Whether the current function has a DISubprogram attached to it.
   bool HasDebugInfo = false;
 
+  /// Whether source was present on the first DIFile encountered in each CU.
+  DenseMap<const DICompileUnit *, bool> HasSourceDebugInfo;
+
   /// Stores the count of how many objects were passed to llvm.localescape for a
   /// given function and the largest index passed to llvm.localrecover.
   DenseMap<Function *, std::pair<unsigned, unsigned>> FrameEscapeInfo;
@@ -383,6 +386,7 @@ public:
 
     visitModuleFlags(M);
     visitModuleIdents(M);
+    visitModuleCommandLines(M);
 
     verifyCompileUnits();
 
@@ -405,6 +409,7 @@ private:
   void visitValueAsMetadata(const ValueAsMetadata &MD, Function *F);
   void visitComdat(const Comdat &C);
   void visitModuleIdents(const Module &M);
+  void visitModuleCommandLines(const Module &M);
   void visitModuleFlags(const Module &M);
   void visitModuleFlag(const MDNode *Op,
                        DenseMap<const MDString *, const MDNode *> &SeenIDs,
@@ -443,6 +448,7 @@ private:
   void visitBitCastInst(BitCastInst &I);
   void visitAddrSpaceCastInst(AddrSpaceCastInst &I);
   void visitPHINode(PHINode &PN);
+  void visitUnaryOperator(UnaryOperator &U);
   void visitBinaryOperator(BinaryOperator &B);
   void visitICmpInst(ICmpInst &IC);
   void visitFCmpInst(FCmpInst &FC);
@@ -518,6 +524,9 @@ private:
   /// Module-level verification that all @llvm.experimental.deoptimize
   /// declarations share the same calling convention.
   void verifyDeoptimizeCallingConvs();
+
+  /// Verify all-or-nothing property of DIFile source attribute within a CU.
+  void verifySourceDebugInfo(const DICompileUnit &U, const DIFile &F);
 };
 
 } // end anonymous namespace
@@ -1031,6 +1040,8 @@ void Verifier::visitDICompileUnit(const DICompileUnit &N) {
   AssertDI(!N.getFile()->getFilename().empty(), "invalid filename", &N,
            N.getFile());
 
+  verifySourceDebugInfo(N, *N.getFile());
+
   AssertDI((N.getEmissionKind() <= DICompileUnit::LastEmissionKind),
            "invalid emission kind", &N);
 
@@ -1108,6 +1119,8 @@ void Verifier::visitDISubprogram(const DISubprogram &N) {
     AssertDI(N.isDistinct(), "subprogram definitions must be distinct", &N);
     AssertDI(Unit, "subprogram definitions must have a compile unit", &N);
     AssertDI(isa<DICompileUnit>(Unit), "invalid unit type", &N, Unit);
+    if (N.getFile())
+      verifySourceDebugInfo(*N.getUnit(), *N.getFile());
   } else {
     // Subprogram declarations (part of the type hierarchy).
     AssertDI(!Unit, "subprogram declarations must not have a compile unit", &N);
@@ -1302,6 +1315,24 @@ void Verifier::visitModuleIdents(const Module &M) {
   }
 }
 
+void Verifier::visitModuleCommandLines(const Module &M) {
+  const NamedMDNode *CommandLines = M.getNamedMetadata("llvm.commandline");
+  if (!CommandLines)
+    return;
+
+  // llvm.commandline takes a list of metadata entry. Each entry has only one
+  // string. Scan each llvm.commandline entry and make sure that this
+  // requirement is met.
+  for (const MDNode *N : CommandLines->operands()) {
+    Assert(N->getNumOperands() == 1,
+           "incorrect number of operands in llvm.commandline metadata", N);
+    Assert(dyn_cast_or_null<MDString>(N->getOperand(0)),
+           ("invalid value for llvm.commandline metadata entry operand"
+            "(the operand should be a string)"),
+           N->getOperand(0));
+  }
+}
+
 void Verifier::visitModuleFlags(const Module &M) {
   const NamedMDNode *Flags = M.getModuleFlagsMetadata();
   if (!Flags) return;
@@ -1929,6 +1960,7 @@ void Verifier::verifyStatepoint(ImmutableCallSite CS) {
 
   // Verify that the types of the call parameter arguments match
   // the type of the wrapped callee.
+  AttributeList Attrs = CS.getAttributes();
   for (int i = 0; i < NumParams; i++) {
     Type *ParamType = TargetFuncType->getParamType(i);
     Type *ArgType = CS.getArgument(5 + i)->getType();
@@ -1936,6 +1968,12 @@ void Verifier::verifyStatepoint(ImmutableCallSite CS) {
            "gc.statepoint call argument does not match wrapped "
            "function type",
            &CI);
+
+    if (TargetFuncType->isVarArg()) {
+      AttributeSet ArgAttrs = Attrs.getParamAttributes(5 + i);
+      Assert(!ArgAttrs.hasAttribute(Attribute::StructRet),
+             "Attribute 'sret' cannot be used for vararg call arguments!", &CI);
+    }
   }
 
   const int EndCallArgsInx = 4 + NumCallArgs;
@@ -2813,8 +2851,13 @@ void Verifier::verifyCallSite(CallSite CS) {
         SawReturned = true;
       }
 
-      Assert(!ArgAttrs.hasAttribute(Attribute::StructRet),
-             "Attribute 'sret' cannot be used for vararg call arguments!", I);
+      // Statepoint intrinsic is vararg but the wrapped function may be not.
+      // Allow sret here and check the wrapped function in verifyStatepoint.
+      if (CS.getCalledFunction() == nullptr ||
+          CS.getCalledFunction()->getIntrinsicID() !=
+            Intrinsic::experimental_gc_statepoint)
+        Assert(!ArgAttrs.hasAttribute(Attribute::StructRet),
+               "Attribute 'sret' cannot be used for vararg call arguments!", I);
 
       if (ArgAttrs.hasAttribute(Attribute::InAlloca))
         Assert(Idx == CS.arg_size() - 1, "inalloca isn't on the last argument!",
@@ -2990,6 +3033,28 @@ void Verifier::visitInvokeInst(InvokeInst &II) {
   visitTerminator(II);
 }
 
+/// visitUnaryOperator - Check the argument to the unary operator.
+///
+void Verifier::visitUnaryOperator(UnaryOperator &U) {
+  Assert(U.getType() == U.getOperand(0)->getType(), 
+         "Unary operators must have same type for"
+         "operands and result!",
+         &U);
+
+  switch (U.getOpcode()) {
+  // Check that floating-point arithmetic operators are only used with
+  // floating-point operands.
+  case Instruction::FNeg:
+    Assert(U.getType()->isFPOrFPVectorTy(),
+           "FNeg operator only works with float types!", &U);
+    break;
+  default:
+    llvm_unreachable("Unknown UnaryOperator opcode!");
+  }
+
+  visitInstruction(U);
+}
+
 /// visitBinaryOperator - Check that both arguments to the binary operator are
 /// of the same type!
 ///
@@ -3887,6 +3952,10 @@ void Verifier::visitInstruction(Instruction &I) {
     }
   }
 
+  // Get a pointer to the call base of the instruction if it is some form of
+  // call.
+  const CallBase *CBI = dyn_cast<CallBase>(&I);
+
   for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
     Assert(I.getOperand(i) != nullptr, "Instruction has null operand!", &I);
 
@@ -3899,10 +3968,9 @@ void Verifier::visitInstruction(Instruction &I) {
     if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
       // Check to make sure that the "address of" an intrinsic function is never
       // taken.
-      Assert(
-          !F->isIntrinsic() ||
-              i == (isa<CallInst>(I) ? e - 1 : isa<InvokeInst>(I) ? e - 3 : 0),
-          "Cannot take the address of an intrinsic!", &I);
+      Assert(!F->isIntrinsic() ||
+                 (CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i)),
+             "Cannot take the address of an intrinsic!", &I);
       Assert(
           !F->isIntrinsic() || isa<CallInst>(I) ||
               F->getIntrinsicID() == Intrinsic::donothing ||
@@ -3928,8 +3996,7 @@ void Verifier::visitInstruction(Instruction &I) {
     } else if (isa<Instruction>(I.getOperand(i))) {
       verifyDominatesUse(I, i);
     } else if (isa<InlineAsm>(I.getOperand(i))) {
-      Assert((i + 1 == e && isa<CallInst>(I)) ||
-                 (i + 3 == e && isa<InvokeInst>(I)),
+      Assert(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i),
              "Cannot take the address of an inline asm!", &I);
     } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(I.getOperand(i))) {
       if (CE->getType()->isPtrOrPtrVectorTy() ||
@@ -4494,6 +4561,24 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
            "of ints");
     break;
   }
+  case Intrinsic::smul_fix: {
+    Value *Op1 = CS.getArgOperand(0);
+    Value *Op2 = CS.getArgOperand(1);
+    Assert(Op1->getType()->isIntOrIntVectorTy(),
+           "first operand of smul_fix must be an int type or vector "
+           "of ints");
+    Assert(Op2->getType()->isIntOrIntVectorTy(),
+           "second operand of smul_fix must be an int type or vector "
+           "of ints");
+
+    auto *Op3 = dyn_cast<ConstantInt>(CS.getArgOperand(2));
+    Assert(Op3, "third argument of smul_fix must be a constant integer");
+    Assert(Op3->getType()->getBitWidth() <= 32,
+           "third argument of smul_fix must fit within 32 bits");
+    Assert(Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(),
+           "the scale of smul_fix must be less than the width of the operands");
+    break;
+  }
   };
 }
 
@@ -4707,6 +4792,14 @@ void Verifier::verifyDeoptimizeCallingConvs() {
   }
 }
 
+void Verifier::verifySourceDebugInfo(const DICompileUnit &U, const DIFile &F) {
+  bool HasSource = F.getSource().hasValue();
+  if (!HasSourceDebugInfo.count(&U))
+    HasSourceDebugInfo[&U] = HasSource;
+  AssertDI(HasSource == HasSourceDebugInfo[&U],
+           "inconsistent use of embedded source");
+}
+
 //===----------------------------------------------------------------------===//
 //  Implement the public interfaces to this file...
 //===----------------------------------------------------------------------===//
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 0eb4bba26760079001118a33f6d105412a0241cb..d87bf797161bea615d6f2402bda9f3d9849f1f05 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -31,6 +31,7 @@ subdirectories =
  IRReader
  LTO
  MC
+ MCA
  Object
  BinaryFormat
  ObjectYAML
@@ -40,6 +41,7 @@ subdirectories =
  ProfileData
  Support
  TableGen
+ TextAPI
  Target
  Testing
  ToolDrivers
diff --git a/lib/LTO/CMakeLists.txt b/lib/LTO/CMakeLists.txt
index 73b5662d4bc8ed10b1383f0c617db5d71d011028..1730df665d83b31a0e81edff4403d18a5eacfc2d 100644
--- a/lib/LTO/CMakeLists.txt
+++ b/lib/LTO/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMLTO
   LTOBackend.cpp
   LTOModule.cpp
   LTOCodeGenerator.cpp
+  SummaryBasedOptimizations.cpp
   UpdateCompilerUsed.cpp
   ThinLTOCodeGenerator.cpp
 
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 2726b6785eddfd174e62112b68fc64aca5f45600..08924fb92dd6a4cb4ee516d88a23b27da5f002f5 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/LTO/LTOBackend.h"
+#include "llvm/LTO/SummaryBasedOptimizations.h"
 #include "llvm/Linker/IRMover.h"
 #include "llvm/Object/IRObjectFile.h"
 #include "llvm/Support/Error.h"
@@ -42,6 +43,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
 #include "llvm/Transforms/Utils/SplitModule.h"
 
 #include <set>
@@ -61,10 +63,10 @@ cl::opt<bool> EnableLTOInternalization(
     "enable-lto-internalization", cl::init(true), cl::Hidden,
     cl::desc("Enable global value internalization in LTO"));
 
-// Returns a unique hash for the Module considering the current list of
+// Computes a unique hash for the Module considering the current list of
 // export/import and other global analysis results.
 // The hash is produced in \p Key.
-static void computeCacheKey(
+void llvm::computeLTOCacheKey(
     SmallString<40> &Key, const Config &Conf, const ModuleSummaryIndex &Index,
     StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList,
     const FunctionImporter::ExportSetTy &ExportList,
@@ -132,6 +134,7 @@ static void computeCacheKey(
   AddUnsigned(Conf.CGFileType);
   AddUnsigned(Conf.OptLevel);
   AddUnsigned(Conf.UseNewPM);
+  AddUnsigned(Conf.Freestanding);
   AddString(Conf.OptPipeline);
   AddString(Conf.AAPipeline);
   AddString(Conf.OverrideTriple);
@@ -187,6 +190,8 @@ static void computeCacheKey(
       AddUnsigned(VI.isDSOLocal());
       AddUsedCfiGlobal(VI.getGUID());
     }
+    if (auto *GVS = dyn_cast<GlobalVarSummary>(GS))
+      AddUnsigned(GVS->isReadOnly());
     if (auto *FS = dyn_cast<FunctionSummary>(GS)) {
       for (auto &TT : FS->type_tests())
         UsedTypeIds.insert(TT);
@@ -218,8 +223,14 @@ static void computeCacheKey(
   // Imported functions may introduce new uses of type identifier resolutions,
   // so we need to collect their used resolutions as well.
   for (auto &ImpM : ImportList)
-    for (auto &ImpF : ImpM.second)
-      AddUsedThings(Index.findSummaryInModule(ImpF, ImpM.first()));
+    for (auto &ImpF : ImpM.second) {
+      GlobalValueSummary *S = Index.findSummaryInModule(ImpF, ImpM.first());
+      AddUsedThings(S);
+      // If this is an alias, we also care about any types/etc. that the aliasee
+      // may reference.
+      if (auto *AS = dyn_cast_or_null<AliasSummary>(S))
+        AddUsedThings(AS->getBaseObject());
+    }
 
   auto AddTypeIdSummary = [&](StringRef TId, const TypeIdSummary &S) {
     AddString(TId);
@@ -282,7 +293,7 @@ static void computeCacheKey(
   Key = toHex(Hasher.result());
 }
 
-static void thinLTOResolveWeakForLinkerGUID(
+static void thinLTOResolvePrevailingGUID(
     GlobalValueSummaryList &GVSummaryList, GlobalValue::GUID GUID,
     DenseSet<GlobalValueSummary *> &GlobalInvolvedWithAlias,
     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
@@ -291,7 +302,10 @@ static void thinLTOResolveWeakForLinkerGUID(
         recordNewLinkage) {
   for (auto &S : GVSummaryList) {
     GlobalValue::LinkageTypes OriginalLinkage = S->linkage();
-    if (!GlobalValue::isWeakForLinker(OriginalLinkage))
+    // Ignore local and appending linkage values since the linker
+    // doesn't resolve them.
+    if (GlobalValue::isLocalLinkage(OriginalLinkage) ||
+        GlobalValue::isAppendingLinkage(S->linkage()))
       continue;
     // We need to emit only one of these. The prevailing module will keep it,
     // but turned into a weak, while the others will drop it when possible.
@@ -315,13 +329,13 @@ static void thinLTOResolveWeakForLinkerGUID(
   }
 }
 
-// Resolve Weak and LinkOnce values in the \p Index.
+/// Resolve linkage for prevailing symbols in the \p Index.
 //
 // We'd like to drop these functions if they are no longer referenced in the
 // current module. However there is a chance that another module is still
 // referencing them because of the import. We make sure we always emit at least
 // one copy.
-void llvm::thinLTOResolveWeakForLinkerInIndex(
+void llvm::thinLTOResolvePrevailingInIndex(
     ModuleSummaryIndex &Index,
     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
         isPrevailing,
@@ -337,9 +351,9 @@ void llvm::thinLTOResolveWeakForLinkerInIndex(
         GlobalInvolvedWithAlias.insert(&AS->getAliasee());
 
   for (auto &I : Index)
-    thinLTOResolveWeakForLinkerGUID(I.second.SummaryList, I.first,
-                                    GlobalInvolvedWithAlias, isPrevailing,
-                                    recordNewLinkage);
+    thinLTOResolvePrevailingGUID(I.second.SummaryList, I.first,
+                                 GlobalInvolvedWithAlias, isPrevailing,
+                                 recordNewLinkage);
 }
 
 static void thinLTOInternalizeAndPromoteGUID(
@@ -350,7 +364,13 @@ static void thinLTOInternalizeAndPromoteGUID(
       if (GlobalValue::isLocalLinkage(S->linkage()))
         S->setLinkage(GlobalValue::ExternalLinkage);
     } else if (EnableLTOInternalization &&
-               !GlobalValue::isLocalLinkage(S->linkage()))
+               // Ignore local and appending linkage values since the linker
+               // doesn't resolve them.
+               !GlobalValue::isLocalLinkage(S->linkage()) &&
+               S->linkage() != GlobalValue::AppendingLinkage &&
+               // We can't internalize available_externally globals because this
+               // can break function pointer equality.
+               S->linkage() != GlobalValue::AvailableExternallyLinkage)
       S->setLinkage(GlobalValue::InternalLinkage);
   }
 }
@@ -803,7 +823,8 @@ Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
       return PrevailingType::Unknown;
     return It->second;
   };
-  computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols, isPrevailing);
+  computeDeadSymbolsWithConstProp(ThinLTO.CombinedIndex, GUIDPreservedSymbols,
+                                  isPrevailing, Conf.OptLevel > 0);
 
   // Setup output file to emit statistics.
   std::unique_ptr<ToolOutputFile> StatsFile = nullptr;
@@ -974,9 +995,9 @@ public:
 
     SmallString<40> Key;
     // The module may be cached, this helps handling it.
-    computeCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList, ExportList,
-                    ResolvedODR, DefinedGlobals, CfiFunctionDefs,
-                    CfiFunctionDecls);
+    computeLTOCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList,
+                       ExportList, ResolvedODR, DefinedGlobals, CfiFunctionDefs,
+                       CfiFunctionDecls);
     if (AddStreamFn CacheAddStream = Cache(Task, Key))
       return RunThinBackend(CacheAddStream);
 
@@ -1151,6 +1172,9 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache) {
     if (!ModuleToDefinedGVSummaries.count(Mod.first))
       ModuleToDefinedGVSummaries.try_emplace(Mod.first);
 
+  // Synthesize entry counts for functions in the CombinedIndex.
+  computeSyntheticCounts(ThinLTO.CombinedIndex);
+
   StringMap<FunctionImporter::ImportMapTy> ImportLists(
       ThinLTO.ModuleMap.size());
   StringMap<FunctionImporter::ExportSetTy> ExportLists(
@@ -1205,8 +1229,8 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache) {
                               GlobalValue::LinkageTypes NewLinkage) {
     ResolvedODR[ModuleIdentifier][GUID] = NewLinkage;
   };
-  thinLTOResolveWeakForLinkerInIndex(ThinLTO.CombinedIndex, isPrevailing,
-                                     recordNewLinkage);
+  thinLTOResolvePrevailingInIndex(ThinLTO.CombinedIndex, isPrevailing,
+                                  recordNewLinkage);
 
   std::unique_ptr<ThinBackendProc> BackendProc =
       ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index 1f9d60a5bdff520c1eb7e5baa7476cfdacfd09cf..926c419e34a84c35a3b9470e7ed29758c420e39e 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -490,7 +490,7 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
 
   dropDeadSymbols(Mod, DefinedGlobals, CombinedIndex);
 
-  thinLTOResolveWeakForLinkerModule(Mod, DefinedGlobals);
+  thinLTOResolvePrevailingInModule(Mod, DefinedGlobals);
 
   if (Conf.PostPromoteModuleHook && !Conf.PostPromoteModuleHook(Task, Mod))
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index 20fc0943539f1bd50349bad9664a493c3ffc45ab..0d40d49dbe394e3281164a1ef999d269c6933162 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp
@@ -73,7 +73,7 @@ bool LTOModule::isBitcodeFile(StringRef Path) {
 bool LTOModule::isThinLTO() {
   Expected<BitcodeLTOInfo> Result = getBitcodeLTOInfo(MBRef);
   if (!Result) {
-    logAllUnhandledErrors(Result.takeError(), errs(), "");
+    logAllUnhandledErrors(Result.takeError(), errs());
     return false;
   }
   return Result->IsThinLTO;
diff --git a/lib/LTO/SummaryBasedOptimizations.cpp b/lib/LTO/SummaryBasedOptimizations.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b1abb78462b1ab9044bae7e72c8f31a98f4cd67
--- /dev/null
+++ b/lib/LTO/SummaryBasedOptimizations.cpp
@@ -0,0 +1,80 @@
+//==-SummaryBasedOptimizations.cpp - Optimizations based on ThinLTO summary-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements optimizations that are based on the module summaries.
+// These optimizations are performed during the thinlink phase of the
+// compilation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/SummaryBasedOptimizations.h"
+#include "llvm/Analysis/SyntheticCountsUtils.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+
+using namespace llvm;
+
+cl::opt<bool> ThinLTOSynthesizeEntryCounts(
+    "thinlto-synthesize-entry-counts", cl::init(false), cl::Hidden,
+    cl::desc("Synthesize entry counts based on the summary"));
+
+extern cl::opt<int> InitialSyntheticCount;
+
+static void initializeCounts(ModuleSummaryIndex &Index) {
+  auto Root = Index.calculateCallGraphRoot();
+  // Root is a fake node. All its successors are the actual roots of the
+  // callgraph.
+  // FIXME: This initializes the entry counts of only the root nodes. This makes
+  // sense when compiling a binary with ThinLTO, but for libraries any of the
+  // non-root nodes could be called from outside.
+  for (auto &C : Root.calls()) {
+    auto &V = C.first;
+    for (auto &GVS : V.getSummaryList()) {
+      auto S = GVS.get()->getBaseObject();
+      auto *F = cast<FunctionSummary>(S);
+      F->setEntryCount(InitialSyntheticCount);
+    }
+  }
+}
+
+void llvm::computeSyntheticCounts(ModuleSummaryIndex &Index) {
+  if (!ThinLTOSynthesizeEntryCounts)
+    return;
+
+  using Scaled64 = ScaledNumber<uint64_t>;
+  initializeCounts(Index);
+  auto GetCallSiteRelFreq = [](FunctionSummary::EdgeTy &Edge) {
+    return Scaled64(Edge.second.RelBlockFreq, -CalleeInfo::ScaleShift);
+  };
+  auto GetEntryCount = [](ValueInfo V) {
+    if (V.getSummaryList().size()) {
+      auto S = V.getSummaryList().front().get()->getBaseObject();
+      auto *F = cast<FunctionSummary>(S);
+      return F->entryCount();
+    } else {
+      return UINT64_C(0);
+    }
+  };
+  auto AddToEntryCount = [](ValueInfo V, uint64_t New) {
+    if (!V.getSummaryList().size())
+      return;
+    for (auto &GVS : V.getSummaryList()) {
+      auto S = GVS.get()->getBaseObject();
+      auto *F = cast<FunctionSummary>(S);
+      F->setEntryCount(SaturatingAdd(F->entryCount(), New));
+    }
+  };
+
+  // After initializing the counts in initializeCounts above, the counts have to
+  // be propagated across the combined callgraph.
+  // SyntheticCountsUtils::propagate takes care of this propagation on any
+  // callgraph that specialized GraphTraits.
+  SyntheticCountsUtils<ModuleSummaryIndex *>::propagate(
+      &Index, GetCallSiteRelFreq, GetEntryCount, AddToEntryCount);
+  Index.setHasSyntheticEntryCounts();
+}
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index 9500b2ded70ea8de4d05c969ede91d1e1800ff65..d9ec68fe3eb50a3234d16e2e40d9715df5e9730f 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -33,6 +33,7 @@
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/LTO/LTO.h"
+#include "llvm/LTO/SummaryBasedOptimizations.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Object/IRObjectFile.h"
 #include "llvm/Support/CachePruning.h"
@@ -298,8 +299,7 @@ public:
       const FunctionImporter::ImportMapTy &ImportList,
       const FunctionImporter::ExportSetTy &ExportList,
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
-      const GVSummaryMapTy &DefinedFunctions,
-      const DenseSet<GlobalValue::GUID> &PreservedSymbols, unsigned OptLevel,
+      const GVSummaryMapTy &DefinedGVSummaries, unsigned OptLevel,
       bool Freestanding, const TargetMachineBuilder &TMBuilder) {
     if (CachePath.empty())
       return;
@@ -308,87 +308,26 @@ public:
       // The module does not have an entry, it can't have a hash at all
       return;
 
-    // Compute the unique hash for this entry
-    // This is based on the current compiler version, the module itself, the
-    // export list, the hash for every single module in the import list, the
-    // list of ResolvedODR for the module, and the list of preserved symbols.
-
-    // Include the hash for the current module
-    auto ModHash = Index.getModuleHash(ModuleID);
-
-    if (all_of(ModHash, [](uint32_t V) { return V == 0; }))
+    if (all_of(Index.getModuleHash(ModuleID),
+               [](uint32_t V) { return V == 0; }))
       // No hash entry, no caching!
       return;
 
-    SHA1 Hasher;
-
-    // Include the parts of the LTO configuration that affect code generation.
-    auto AddString = [&](StringRef Str) {
-      Hasher.update(Str);
-      Hasher.update(ArrayRef<uint8_t>{0});
-    };
-    auto AddUnsigned = [&](unsigned I) {
-      uint8_t Data[4];
-      Data[0] = I;
-      Data[1] = I >> 8;
-      Data[2] = I >> 16;
-      Data[3] = I >> 24;
-      Hasher.update(ArrayRef<uint8_t>{Data, 4});
-    };
-
-    // Start with the compiler revision
-    Hasher.update(LLVM_VERSION_STRING);
-#ifdef LLVM_REVISION
-    Hasher.update(LLVM_REVISION);
-#endif
-
-    // Hash the optimization level and the target machine settings.
-    AddString(TMBuilder.MCpu);
-    // FIXME: Hash more of Options. For now all clients initialize Options from
-    // command-line flags (which is unsupported in production), but may set
-    // RelaxELFRelocations. The clang driver can also pass FunctionSections,
-    // DataSections and DebuggerTuning via command line flags.
-    AddUnsigned(TMBuilder.Options.RelaxELFRelocations);
-    AddUnsigned(TMBuilder.Options.FunctionSections);
-    AddUnsigned(TMBuilder.Options.DataSections);
-    AddUnsigned((unsigned)TMBuilder.Options.DebuggerTuning);
-    AddString(TMBuilder.MAttr);
-    if (TMBuilder.RelocModel)
-      AddUnsigned(*TMBuilder.RelocModel);
-    AddUnsigned(TMBuilder.CGOptLevel);
-    AddUnsigned(OptLevel);
-    AddUnsigned(Freestanding);
-
-    Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
-    for (auto F : ExportList)
-      // The export list can impact the internalization, be conservative here
-      Hasher.update(ArrayRef<uint8_t>((uint8_t *)&F, sizeof(F)));
-
-    // Include the hash for every module we import functions from
-    for (auto &Entry : ImportList) {
-      auto ModHash = Index.getModuleHash(Entry.first());
-      Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
-    }
-
-    // Include the hash for the resolved ODR.
-    for (auto &Entry : ResolvedODR) {
-      Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&Entry.first,
-                                      sizeof(GlobalValue::GUID)));
-      Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&Entry.second,
-                                      sizeof(GlobalValue::LinkageTypes)));
-    }
-
-    // Include the hash for the preserved symbols.
-    for (auto &Entry : PreservedSymbols) {
-      if (DefinedFunctions.count(Entry))
-        Hasher.update(
-            ArrayRef<uint8_t>((const uint8_t *)&Entry, sizeof(GlobalValue::GUID)));
-    }
+    llvm::lto::Config Conf;
+    Conf.OptLevel = OptLevel;
+    Conf.Options = TMBuilder.Options;
+    Conf.CPU = TMBuilder.MCpu;
+    Conf.MAttrs.push_back(TMBuilder.MAttr);
+    Conf.RelocModel = TMBuilder.RelocModel;
+    Conf.CGOptLevel = TMBuilder.CGOptLevel;
+    Conf.Freestanding = Freestanding;
+    SmallString<40> Key;
+    computeLTOCacheKey(Key, Conf, Index, ModuleID, ImportList, ExportList,
+                       ResolvedODR, DefinedGVSummaries);
 
     // This choice of file name allows the cache to be pruned (see pruneCache()
     // in include/llvm/Support/CachePruning.h).
-    sys::path::append(EntryPath, CachePath,
-                      "llvmcache-" + toHex(Hasher.result()));
+    sys::path::append(EntryPath, CachePath, "llvmcache-" + Key);
   }
 
   // Access the path to this entry in the cache.
@@ -457,8 +396,8 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
   if (!SingleModule) {
     promoteModule(TheModule, Index);
 
-    // Apply summary-based LinkOnce/Weak resolution decisions.
-    thinLTOResolveWeakForLinkerModule(TheModule, DefinedGlobals);
+    // Apply summary-based prevailing-symbol resolution decisions.
+    thinLTOResolvePrevailingInModule(TheModule, DefinedGlobals);
 
     // Save temps: after promotion.
     saveTempBitcode(TheModule, SaveTempsDir, count, ".1.promoted.bc");
@@ -500,12 +439,12 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
   return codegenModule(TheModule, TM);
 }
 
-/// Resolve LinkOnce/Weak symbols. Record resolutions in the \p ResolvedODR map
+/// Resolve prevailing symbols. Record resolutions in the \p ResolvedODR map
 /// for caching, and in the \p Index for application during the ThinLTO
 /// backends. This is needed for correctness for exported symbols (ensure
 /// at least one copy kept) and a compile-time optimization (to drop duplicate
 /// copies when possible).
-static void resolveWeakForLinkerInIndex(
+static void resolvePrevailingInIndex(
     ModuleSummaryIndex &Index,
     StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>>
         &ResolvedODR) {
@@ -527,7 +466,7 @@ static void resolveWeakForLinkerInIndex(
     ResolvedODR[ModuleIdentifier][GUID] = NewLinkage;
   };
 
-  thinLTOResolveWeakForLinkerInIndex(Index, isPrevailing, recordNewLinkage);
+  thinLTOResolvePrevailingInIndex(Index, isPrevailing, recordNewLinkage);
 }
 
 // Initialize the TargetMachine builder for a given Triple
@@ -646,7 +585,8 @@ static void computeDeadSymbolsInIndex(
   auto isPrevailing = [&](GlobalValue::GUID G) {
     return PrevailingType::Unknown;
   };
-  computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing);
+  computeDeadSymbolsWithConstProp(Index, GUIDPreservedSymbols, isPrevailing,
+                                  /* ImportEnabled = */ true);
 }
 
 /**
@@ -675,11 +615,11 @@ void ThinLTOCodeGenerator::promote(Module &TheModule,
   ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
                            ExportLists);
 
-  // Resolve LinkOnce/Weak symbols.
+  // Resolve prevailing symbols
   StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
-  resolveWeakForLinkerInIndex(Index, ResolvedODR);
+  resolvePrevailingInIndex(Index, ResolvedODR);
 
-  thinLTOResolveWeakForLinkerModule(
+  thinLTOResolvePrevailingInModule(
       TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]);
 
   // Promote the exported values in the index, so that they are promoted
@@ -722,37 +662,52 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
  * Compute the list of summaries needed for importing into module.
  */
 void ThinLTOCodeGenerator::gatherImportedSummariesForModule(
-    StringRef ModulePath, ModuleSummaryIndex &Index,
+    Module &TheModule, ModuleSummaryIndex &Index,
     std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
   auto ModuleCount = Index.modulePaths().size();
+  auto ModuleIdentifier = TheModule.getModuleIdentifier();
 
   // Collect for each module the list of function it defines (GUID -> Summary).
   StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount);
   Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
 
+  // Convert the preserved symbols set from string to GUID
+  auto GUIDPreservedSymbols = computeGUIDPreservedSymbols(
+      PreservedSymbols, Triple(TheModule.getTargetTriple()));
+
+  // Compute "dead" symbols, we don't want to import/export these!
+  computeDeadSymbolsInIndex(Index, GUIDPreservedSymbols);
+
   // Generate import/export list
   StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
   StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
   ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
                            ExportLists);
 
-  llvm::gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
-                                         ImportLists[ModulePath],
-                                         ModuleToSummariesForIndex);
+  llvm::gatherImportedSummariesForModule(
+      ModuleIdentifier, ModuleToDefinedGVSummaries,
+      ImportLists[ModuleIdentifier], ModuleToSummariesForIndex);
 }
 
 /**
  * Emit the list of files needed for importing into module.
  */
-void ThinLTOCodeGenerator::emitImports(StringRef ModulePath,
-                                       StringRef OutputName,
+void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName,
                                        ModuleSummaryIndex &Index) {
   auto ModuleCount = Index.modulePaths().size();
+  auto ModuleIdentifier = TheModule.getModuleIdentifier();
 
   // Collect for each module the list of function it defines (GUID -> Summary).
   StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount);
   Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
 
+  // Convert the preserved symbols set from string to GUID
+  auto GUIDPreservedSymbols = computeGUIDPreservedSymbols(
+      PreservedSymbols, Triple(TheModule.getTargetTriple()));
+
+  // Compute "dead" symbols, we don't want to import/export these!
+  computeDeadSymbolsInIndex(Index, GUIDPreservedSymbols);
+
   // Generate import/export list
   StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
   StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
@@ -760,13 +715,13 @@ void ThinLTOCodeGenerator::emitImports(StringRef ModulePath,
                            ExportLists);
 
   std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
-  llvm::gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
-                                         ImportLists[ModulePath],
-                                         ModuleToSummariesForIndex);
+  llvm::gatherImportedSummariesForModule(
+      ModuleIdentifier, ModuleToDefinedGVSummaries,
+      ImportLists[ModuleIdentifier], ModuleToSummariesForIndex);
 
   std::error_code EC;
-  if ((EC =
-           EmitImportsFiles(ModulePath, OutputName, ModuleToSummariesForIndex)))
+  if ((EC = EmitImportsFiles(ModuleIdentifier, OutputName,
+                             ModuleToSummariesForIndex)))
     report_fatal_error(Twine("Failed to open ") + OutputName +
                        " to save imports lists\n");
 }
@@ -929,6 +884,9 @@ void ThinLTOCodeGenerator::run() {
   // Compute "dead" symbols, we don't want to import/export these!
   computeDeadSymbolsInIndex(*Index, GUIDPreservedSymbols);
 
+  // Synthesize entry counts for functions in the combined index.
+  computeSyntheticCounts(*Index);
+
   // Collect the import/export lists for all modules from the call-graph in the
   // combined index.
   StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
@@ -942,9 +900,9 @@ void ThinLTOCodeGenerator::run() {
   // on the index, and nuke this map.
   StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
 
-  // Resolve LinkOnce/Weak symbols, this has to be computed early because it
+  // Resolve prevailing symbols, this has to be computed early because it
   // impacts the caching.
-  resolveWeakForLinkerInIndex(*Index, ResolvedODR);
+  resolvePrevailingInIndex(*Index, ResolvedODR);
 
   // Use global summary-based analysis to identify symbols that can be
   // internalized (because they aren't exported or preserved as per callback).
@@ -983,14 +941,14 @@ void ThinLTOCodeGenerator::run() {
         auto ModuleIdentifier = ModuleBuffer.getBufferIdentifier();
         auto &ExportList = ExportLists[ModuleIdentifier];
 
-        auto &DefinedFunctions = ModuleToDefinedGVSummaries[ModuleIdentifier];
+        auto &DefinedGVSummaries = ModuleToDefinedGVSummaries[ModuleIdentifier];
 
         // The module may be cached, this helps handling it.
         ModuleCacheEntry CacheEntry(CacheOptions.Path, *Index, ModuleIdentifier,
                                     ImportLists[ModuleIdentifier], ExportList,
                                     ResolvedODR[ModuleIdentifier],
-                                    DefinedFunctions, GUIDPreservedSymbols,
-                                    OptLevel, Freestanding, TMBuilder);
+                                    DefinedGVSummaries, OptLevel, Freestanding,
+                                    TMBuilder);
         auto CacheEntryPath = CacheEntry.getEntryPath();
 
         {
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index b304bfc401aa0497e608178ab124a23b153539a7..afbc57abfcc08da06c0afe775e550353af5ce4cb 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -1062,10 +1062,15 @@ void IRLinker::prepareCompileUnitsForImport() {
     ValueMap.MD()[CU->getRawEnumTypes()].reset(nullptr);
     ValueMap.MD()[CU->getRawMacros()].reset(nullptr);
     ValueMap.MD()[CU->getRawRetainedTypes()].reset(nullptr);
-    // We import global variables only temporarily in order for instcombine
-    // and globalopt to perform constant folding and static constructor
-    // evaluation. After that elim-avail-extern will covert imported globals
-    // back to declarations, so we don't need debug info for them.
+    // The original definition (or at least its debug info - if the variable is
+    // internalized an optimized away) will remain in the source module, so
+    // there's no need to import them.
+    // If LLVM ever does more advanced optimizations on global variables
+    // (removing/localizing write operations, for instance) that can track
+    // through debug info, this decision may need to be revisited - but do so
+    // with care when it comes to debug info size. Emitting small CUs containing
+    // only a few imported entities into every destination module may be very
+    // size inefficient.
     ValueMap.MD()[CU->getRawGlobalVariables()].reset(nullptr);
 
     // Imported entities only need to be mapped in if they have local
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 5b8b013ad0875a4bdc3020764950db96f6c59605..89f3b30cddd6f44087340e3ab40d84c4e51ee1a1 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -669,6 +669,20 @@ void ELFWriter::computeSymbolTable(
     } else {
       const MCSectionELF &Section =
           static_cast<const MCSectionELF &>(Symbol.getSection());
+
+      // We may end up with a situation when section symbol is technically
+      // defined, but should not be. That happens because we explicitly
+      // pre-create few .debug_* sections to have accessors.
+      // And if these sections were not really defined in the code, but were
+      // referenced, we simply error out.
+      if (!Section.isRegistered()) {
+        assert(static_cast<const MCSymbolELF &>(Symbol).getType() ==
+               ELF::STT_SECTION);
+        Ctx.reportError(SMLoc(),
+                        "Undefined section reference: " + Symbol.getName());
+        continue;
+      }
+
       if (Mode == NonDwoOnly && isDwoSection(Section))
         continue;
       MSD.SectionIndex = SectionIndexMap.lookup(&Section);
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 463e9066616d81ba35897522889dde5299cc7045..fd10c6901b732899bf4c266f66e430f6df74ab25 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -147,9 +147,9 @@ public:
   void EmitLinkerOptions(ArrayRef<std::string> Options) override;
   void EmitDataRegion(MCDataRegionType Kind) override;
   void EmitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor,
-                      unsigned Update) override;
+                      unsigned Update, VersionTuple SDKVersion) override;
   void EmitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor,
-                        unsigned Update) override;
+                        unsigned Update, VersionTuple SDKVersion) override;
   void EmitThumbFunc(MCSymbol *Func) override;
 
   void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
@@ -285,6 +285,7 @@ public:
   void EmitCFIUndefined(int64_t Register) override;
   void EmitCFIRegister(int64_t Register1, int64_t Register2) override;
   void EmitCFIWindowSave() override;
+  void EmitCFINegateRAState() override;
   void EmitCFIReturnColumn(int64_t Register) override;
 
   void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) override;
@@ -514,11 +515,26 @@ static const char *getVersionMinDirective(MCVersionMinType Type) {
   llvm_unreachable("Invalid MC version min type");
 }
 
+static void EmitSDKVersionSuffix(raw_ostream &OS,
+                                 const VersionTuple &SDKVersion) {
+  if (SDKVersion.empty())
+    return;
+  OS << '\t' << "sdk_version " << SDKVersion.getMajor();
+  if (auto Minor = SDKVersion.getMinor()) {
+    OS << ", " << *Minor;
+    if (auto Subminor = SDKVersion.getSubminor()) {
+      OS << ", " << *Subminor;
+    }
+  }
+}
+
 void MCAsmStreamer::EmitVersionMin(MCVersionMinType Type, unsigned Major,
-                                   unsigned Minor, unsigned Update) {
+                                   unsigned Minor, unsigned Update,
+                                   VersionTuple SDKVersion) {
   OS << '\t' << getVersionMinDirective(Type) << ' ' << Major << ", " << Minor;
   if (Update)
     OS << ", " << Update;
+  EmitSDKVersionSuffix(OS, SDKVersion);
   EmitEOL();
 }
 
@@ -534,11 +550,13 @@ static const char *getPlatformName(MachO::PlatformType Type) {
 }
 
 void MCAsmStreamer::EmitBuildVersion(unsigned Platform, unsigned Major,
-                                     unsigned Minor, unsigned Update) {
+                                     unsigned Minor, unsigned Update,
+                                     VersionTuple SDKVersion) {
   const char *PlatformName = getPlatformName((MachO::PlatformType)Platform);
   OS << "\t.build_version " << PlatformName << ", " << Major << ", " << Minor;
   if (Update)
     OS << ", " << Update;
+  EmitSDKVersionSuffix(OS, SDKVersion);
   EmitEOL();
 }
 
@@ -1569,6 +1587,12 @@ void MCAsmStreamer::EmitCFIWindowSave() {
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitCFINegateRAState() {
+  MCStreamer::EmitCFINegateRAState();
+  OS << "\t.cfi_negate_ra_state";
+  EmitEOL();
+}
+
 void MCAsmStreamer::EmitCFIReturnColumn(int64_t Register) {
   MCStreamer::EmitCFIReturnColumn(Register);
   OS << "\t.cfi_return_column " << Register;
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 1e23b6d816e8a083d946cc0a351515a2316746a8..cde6a93a164729636b8671cd48634bbc028219aa 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -111,6 +111,7 @@ void MCAssembler::reset() {
   ELFHeaderEFlags = 0;
   LOHContainer.reset();
   VersionInfo.Major = 0;
+  VersionInfo.SDKVersion = VersionTuple();
 
   // reset objects owned by us
   if (getBackendPtr())
diff --git a/lib/MC/MCCodeView.cpp b/lib/MC/MCCodeView.cpp
index 234b43e3d4962b31db7f4988ed448a0f23608b4f..978ac789c31e118afca58925c40b0b9115b04d13 100644
--- a/lib/MC/MCCodeView.cpp
+++ b/lib/MC/MCCodeView.cpp
@@ -432,13 +432,13 @@ void CodeViewContext::emitInlineLineTableForFunction(MCObjectStreamer &OS,
                                   OS.getCurrentSectionOnly());
 }
 
-void CodeViewContext::emitDefRange(
+MCFragment *CodeViewContext::emitDefRange(
     MCObjectStreamer &OS,
     ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
     StringRef FixedSizePortion) {
   // Create and insert a fragment into the current section that will be encoded
   // later.
-  new MCCVDefRangeFragment(Ranges, FixedSizePortion,
+  return new MCCVDefRangeFragment(Ranges, FixedSizePortion,
                            OS.getCurrentSectionOnly());
 }
 
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 7093446815f3787d93e86f4153587b41217c66b3..9791fb5724b1365219dbce1f2c3abaf419cf9558 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -1332,6 +1332,10 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
     Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1);
     return;
 
+  case MCCFIInstruction::OpNegateRAState:
+    Streamer.EmitIntValue(dwarf::DW_CFA_AARCH64_negate_ra_state, 1);
+    return;
+
   case MCCFIInstruction::OpUndefined: {
     unsigned Reg = Instr.getRegister();
     Streamer.EmitIntValue(dwarf::DW_CFA_undefined, 1);
@@ -1418,7 +1422,12 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
     unsigned Reg = Instr.getRegister();
     if (!IsEH)
       Reg = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg);
-    Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
+    if (Reg < 64) {
+      Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
+    } else {
+      Streamer.EmitIntValue(dwarf::DW_CFA_restore_extended, 1);
+      Streamer.EmitULEB128IntValue(Reg);
+    }
     return;
   }
   case MCCFIInstruction::OpGnuArgsSize:
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 38f311be7c66b49fac14355c6716b6bbfaa30abc..3c022199145fbafe128f4ce4368cc546c969486f 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -306,6 +306,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
   case VK_WebAssembly_FUNCTION: return "FUNCTION";
   case VK_WebAssembly_GLOBAL: return "GLOBAL";
   case VK_WebAssembly_TYPEINDEX: return "TYPEINDEX";
+  case VK_WebAssembly_EVENT: return "EVENT";
   case VK_AMDGPU_GOTPCREL32_LO: return "gotpcrel32@lo";
   case VK_AMDGPU_GOTPCREL32_HI: return "gotpcrel32@hi";
   case VK_AMDGPU_REL32_LO: return "rel32@lo";
@@ -421,6 +422,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
     .Case("function", VK_WebAssembly_FUNCTION)
     .Case("global", VK_WebAssembly_GLOBAL)
     .Case("typeindex", VK_WebAssembly_TYPEINDEX)
+    .Case("event", VK_WebAssembly_EVENT)
     .Case("gotpcrel32@lo", VK_AMDGPU_GOTPCREL32_LO)
     .Case("gotpcrel32@hi", VK_AMDGPU_GOTPCREL32_HI)
     .Case("rel32@lo", VK_AMDGPU_REL32_LO)
diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp
index 0ebcf21a422e3f2fae407a38357b54e270279333..d22b117972bf230c38d62ab70f110de947673203 100644
--- a/lib/MC/MCFragment.cpp
+++ b/lib/MC/MCFragment.cpp
@@ -237,8 +237,8 @@ MCFragment::~MCFragment() = default;
 
 MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
                        MCSection *Parent)
-    : Kind(Kind), HasInstructions(HasInstructions), Parent(Parent),
-      Atom(nullptr), Offset(~UINT64_C(0)) {
+    : Kind(Kind), HasInstructions(HasInstructions), LayoutOrder(0),
+      Parent(Parent), Atom(nullptr), Offset(~UINT64_C(0)) {
   if (Parent && !isDummy())
     Parent->getFragmentList().push_back(this);
 }
diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp
index f9b71caaf91c997ba0dd4566e96b3d77c05240ef..64f111fc7114847f4a39c5a60b16b7349185d8a2 100644
--- a/lib/MC/MCInst.cpp
+++ b/lib/MC/MCInst.cpp
@@ -72,11 +72,17 @@ void MCInst::print(raw_ostream &OS) const {
 
 void MCInst::dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer,
                          StringRef Separator) const {
+  StringRef InstName = Printer ? Printer->getOpcodeName(getOpcode()) : "";
+  dump_pretty(OS, InstName, Separator);
+}
+
+void MCInst::dump_pretty(raw_ostream &OS, StringRef Name,
+                         StringRef Separator) const {
   OS << "<MCInst #" << getOpcode();
 
-  // Show the instruction opcode name if we have access to a printer.
-  if (Printer)
-    OS << ' ' << Printer->getOpcodeName(getOpcode());
+  // Show the instruction opcode name if we have it.
+  if (!Name.empty())
+    OS << ' ' << Name;
 
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     OS << Separator;
diff --git a/lib/MC/MCInstrDesc.cpp b/lib/MC/MCInstrDesc.cpp
index ee55f3eff3ac739d81532714e8aa644d4b7e8749..53cba864a85d7b893b6d6406dd115e7119fc70a4 100644
--- a/lib/MC/MCInstrDesc.cpp
+++ b/lib/MC/MCInstrDesc.cpp
@@ -39,15 +39,6 @@ bool MCInstrDesc::mayAffectControlFlow(const MCInst &MI,
     return false;
   if (hasDefOfPhysReg(MI, PC, RI))
     return true;
-  // A variadic instruction may define PC in the variable operand list.
-  // There's currently no indication of which entries in a variable
-  // list are defs and which are uses. While that's the case, this function
-  // needs to assume they're defs in order to be conservatively correct.
-  for (int i = NumOperands, e = MI.getNumOperands(); i != e; ++i) {
-    if (MI.getOperand(i).isReg() &&
-        RI.isSubRegisterEq(PC, MI.getOperand(i).getReg()))
-      return true;
-  }
   return false;
 }
 
@@ -66,5 +57,10 @@ bool MCInstrDesc::hasDefOfPhysReg(const MCInst &MI, unsigned Reg,
     if (MI.getOperand(i).isReg() &&
         RI.isSubRegisterEq(Reg, MI.getOperand(i).getReg()))
       return true;
+  if (variadicOpsAreDefs())
+    for (int i = NumOperands - 1, e = MI.getNumOperands(); i != e; ++i)
+      if (MI.getOperand(i).isReg() &&
+          RI.isSubRegisterEq(Reg, MI.getOperand(i).getReg()))
+        return true;
   return hasImplicitDefOfPhysReg(Reg, &RI);
 }
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 43e69605787cd3394d00c6b4302a1cf4c4b75b42..b30317e74672bb3198a1f6aa8efad4e61411fcf9 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -89,10 +89,10 @@ public:
   void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
   void EmitLinkerOptions(ArrayRef<std::string> Options) override;
   void EmitDataRegion(MCDataRegionType Kind) override;
-  void EmitVersionMin(MCVersionMinType Kind, unsigned Major,
-                      unsigned Minor, unsigned Update) override;
-  void EmitBuildVersion(unsigned Platform, unsigned Major,
-                        unsigned Minor, unsigned Update) override;
+  void EmitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor,
+                      unsigned Update, VersionTuple SDKVersion) override;
+  void EmitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor,
+                        unsigned Update, VersionTuple SDKVersion) override;
   void EmitThumbFunc(MCSymbol *Func) override;
   bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
   void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
@@ -270,14 +270,16 @@ void MCMachOStreamer::EmitDataRegion(MCDataRegionType Kind) {
 }
 
 void MCMachOStreamer::EmitVersionMin(MCVersionMinType Kind, unsigned Major,
-                                     unsigned Minor, unsigned Update) {
-  getAssembler().setVersionMin(Kind, Major, Minor, Update);
+                                     unsigned Minor, unsigned Update,
+                                     VersionTuple SDKVersion) {
+  getAssembler().setVersionMin(Kind, Major, Minor, Update, SDKVersion);
 }
 
 void MCMachOStreamer::EmitBuildVersion(unsigned Platform, unsigned Major,
-                                       unsigned Minor, unsigned Update) {
+                                       unsigned Minor, unsigned Update,
+                                       VersionTuple SDKVersion) {
   getAssembler().setBuildVersion((MachO::PlatformType)Platform, Major, Minor,
-                                 Update);
+                                 Update, SDKVersion);
 }
 
 void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) {
@@ -507,7 +509,7 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context,
       new MCMachOStreamer(Context, std::move(MAB), std::move(OW), std::move(CE),
                           DWARFMustBeAtTheEnd, LabelSections);
   const Triple &Target = Context.getObjectFileInfo()->getTargetTriple();
-  S->EmitVersionForTarget(Target);
+  S->EmitVersionForTarget(Target, Context.getObjectFileInfo()->getSDKVersion());
   if (RelaxAll)
     S->getAssembler().setRelaxAll(true);
   return S;
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index ab8e0f31db9e7a8faa482fbea124cdef8f8738c8..9e35355d06e0a50e4825fd1f927a6e13b266012c 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -479,10 +479,10 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
 }
 
 void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
-  EHFrameSection = Ctx->getCOFFSection(
-      ".eh_frame", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
-                       COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE,
-      SectionKind::getData());
+  EHFrameSection =
+      Ctx->getCOFFSection(".eh_frame", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                           COFF::IMAGE_SCN_MEM_READ,
+                          SectionKind::getData());
 
   // Set the `IMAGE_SCN_MEM_16BIT` flag when compiling for thumb mode.  This is
   // used to indicate to the linker that the text segment contains thumb instructions
@@ -510,11 +510,7 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
       ".rdata", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ,
       SectionKind::getReadOnly());
 
-  // FIXME: We're emitting LSDA info into a readonly section on COFF, even
-  // though it contains relocatable pointers.  In PIC mode, this is probably a
-  // big runtime hit for C++ apps.  Either the contents of the LSDA need to be
-  // adjusted or this should be a data section.
-  if (T.getArch() == Triple::x86_64) {
+  if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::aarch64) {
     // On Windows 64 with SEH, the LSDA is emitted into the .xdata section
     LSDASection = nullptr;
   } else {
@@ -812,16 +808,17 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
   }
 }
 
-MCSection *MCObjectFileInfo::getDwarfTypesSection(uint64_t Hash) const {
+MCSection *MCObjectFileInfo::getDwarfComdatSection(const char *Name,
+                                                   uint64_t Hash) const {
   switch (TT.getObjectFormat()) {
   case Triple::ELF:
-    return Ctx->getELFSection(".debug_types", ELF::SHT_PROGBITS, ELF::SHF_GROUP,
-                              0, utostr(Hash));
+    return Ctx->getELFSection(Name, ELF::SHT_PROGBITS, ELF::SHF_GROUP, 0,
+                              utostr(Hash));
   case Triple::MachO:
   case Triple::COFF:
   case Triple::Wasm:
   case Triple::UnknownObjectFormat:
-    report_fatal_error("Cannot get DWARF types section for this object file "
+    report_fatal_error("Cannot get DWARF comdat section for this object file "
                        "format: not implemented.");
     break;
   }
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 8c88db009bd480c7f8031213bd56581ea027b63e..6ec705bdddb73451aa0169e6df9847a7881b14ef 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -59,6 +59,27 @@ void MCObjectStreamer::flushPendingLabels(MCFragment *F, uint64_t FOffset) {
   PendingLabels.clear();
 }
 
+// When fixup's offset is a forward declared label, e.g.:
+//
+//   .reloc 1f, R_MIPS_JALR, foo
+// 1: nop
+//
+// postpone adding it to Fixups vector until the label is defined and its offset
+// is known.
+void MCObjectStreamer::resolvePendingFixups() {
+  for (PendingMCFixup &PendingFixup : PendingFixups) {
+    if (!PendingFixup.Sym || PendingFixup.Sym->isUndefined ()) {
+      getContext().reportError(PendingFixup.Fixup.getLoc(),
+                               "unresolved relocation offset");
+      continue;
+    }
+    flushPendingLabels(PendingFixup.DF, PendingFixup.DF->getContents().size());
+    PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset());
+    PendingFixup.DF->getFixups().push_back(PendingFixup.Fixup);
+  }
+  PendingFixups.clear();
+}
+
 // As a compile-time optimization, avoid allocating and evaluating an MCExpr
 // tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment.
 static Optional<uint64_t>
@@ -476,7 +497,11 @@ void MCObjectStreamer::EmitCVInlineLinetableDirective(
 void MCObjectStreamer::EmitCVDefRangeDirective(
     ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
     StringRef FixedSizePortion) {
-  getContext().getCVContext().emitDefRange(*this, Ranges, FixedSizePortion);
+  MCFragment *Frag =
+      getContext().getCVContext().emitDefRange(*this, Ranges, FixedSizePortion);
+  // Attach labels that were pending before we created the defrange fragment to
+  // the beginning of the new fragment.
+  flushPendingLabels(Frag, 0);
   this->MCStreamer::EmitCVDefRangeDirective(Ranges, FixedSizePortion);
 }
 
@@ -603,16 +628,6 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
 bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
                                           const MCExpr *Expr, SMLoc Loc,
                                           const MCSubtargetInfo &STI) {
-  int64_t OffsetValue;
-  if (!Offset.evaluateAsAbsolute(OffsetValue))
-    llvm_unreachable("Offset is not absolute");
-
-  if (OffsetValue < 0)
-    llvm_unreachable("Offset is negative");
-
-  MCDataFragment *DF = getOrCreateDataFragment(&STI);
-  flushPendingLabels(DF, DF->getContents().size());
-
   Optional<MCFixupKind> MaybeKind = Assembler->getBackend().getFixupKind(Name);
   if (!MaybeKind.hasValue())
     return true;
@@ -622,7 +637,30 @@ bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
   if (Expr == nullptr)
     Expr =
         MCSymbolRefExpr::create(getContext().createTempSymbol(), getContext());
-  DF->getFixups().push_back(MCFixup::create(OffsetValue, Expr, Kind, Loc));
+
+  MCDataFragment *DF = getOrCreateDataFragment(&STI);
+  flushPendingLabels(DF, DF->getContents().size());
+
+  int64_t OffsetValue;
+  if (Offset.evaluateAsAbsolute(OffsetValue)) {
+    if (OffsetValue < 0)
+      llvm_unreachable(".reloc offset is negative");
+    DF->getFixups().push_back(MCFixup::create(OffsetValue, Expr, Kind, Loc));
+    return false;
+  }
+
+  if (Offset.getKind() != llvm::MCExpr::SymbolRef)
+    llvm_unreachable(".reloc offset is not absolute nor a label");
+
+  const MCSymbolRefExpr &SRE = cast<MCSymbolRefExpr>(Offset);
+  if (SRE.getSymbol().isDefined()) {
+    DF->getFixups().push_back(MCFixup::create(SRE.getSymbol().getOffset(),
+                                              Expr, Kind, Loc));
+    return false;
+  }
+
+  PendingFixups.emplace_back(&SRE.getSymbol(), DF,
+                                         MCFixup::create(-1, Expr, Kind, Loc));
   return false;
 }
 
@@ -689,5 +727,6 @@ void MCObjectStreamer::FinishImpl() {
   MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams());
 
   flushPendingLabels();
+  resolvePendingFixups();
   getAssembler().Finish();
 }
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index c8d48f033f65f6a958568a9e956a018d5fde4341..2b0d20f9b8e2efc10507323dc1cd80edcb7ce613 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -627,7 +627,6 @@ AsmToken AsmLexer::LexToken() {
     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
   case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
   case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
-  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
   case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
   case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
   case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
@@ -646,6 +645,12 @@ AsmToken AsmLexer::LexToken() {
       return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
     }
     return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
+  case '-':
+    if (*CurPtr == '>') {
+      ++CurPtr;
+      return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
+    }
+    return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
   case '|':
     if (*CurPtr == '|') {
       ++CurPtr;
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 3f7b507791eca9ec957e2b7fde49db974cbd6307..aa07eee44916483c4e6b5fbb27268d1d010241c6 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -675,6 +675,7 @@ namespace llvm {
 extern MCAsmParserExtension *createDarwinAsmParser();
 extern MCAsmParserExtension *createELFAsmParser();
 extern MCAsmParserExtension *createCOFFAsmParser();
+extern MCAsmParserExtension *createWasmAsmParser();
 
 } // end namespace llvm
 
@@ -705,10 +706,7 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
     PlatformParser.reset(createELFAsmParser());
     break;
   case MCObjectFileInfo::IsWasm:
-    // TODO: WASM will need its own MCAsmParserExtension implementation, but
-    // for now we can re-use the ELF one, since the directives can be the
-    // same for now.
-    PlatformParser.reset(createELFAsmParser());
+    PlatformParser.reset(createWasmAsmParser());
     break;
   }
 
@@ -2940,20 +2938,20 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
 bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
   const MCExpr *Offset;
   const MCExpr *Expr = nullptr;
-
-  SMLoc OffsetLoc = Lexer.getTok().getLoc();
   int64_t OffsetValue;
-  // We can only deal with constant expressions at the moment.
+  SMLoc OffsetLoc = Lexer.getTok().getLoc();
 
   if (parseExpression(Offset))
     return true;
 
-  if (check(!Offset->evaluateAsAbsolute(OffsetValue,
-                                        getStreamer().getAssemblerPtr()),
-            OffsetLoc, "expression is not a constant value") ||
-      check(OffsetValue < 0, OffsetLoc, "expression is negative") ||
-      parseToken(AsmToken::Comma, "expected comma") ||
-      check(getTok().isNot(AsmToken::Identifier), "expected relocation name"))
+  if ((Offset->evaluateAsAbsolute(OffsetValue,
+                                  getStreamer().getAssemblerPtr()) &&
+       check(OffsetValue < 0, OffsetLoc, "expression is negative")) ||
+      (check(Offset->getKind() != llvm::MCExpr::Constant &&
+             Offset->getKind() != llvm::MCExpr::SymbolRef,
+             OffsetLoc, "expected non-negative number or a label")) ||
+      (parseToken(AsmToken::Comma, "expected comma") ||
+       check(getTok().isNot(AsmToken::Identifier), "expected relocation name")))
     return true;
 
   SMLoc NameLoc = Lexer.getTok().getLoc();
@@ -3921,7 +3919,7 @@ bool AsmParser::parseDirectiveCFIStartProc() {
         parseToken(AsmToken::EndOfStatement))
       return addErrorSuffix(" in '.cfi_startproc' directive");
   }
-  
+
   // TODO(kristina): Deal with a corner case of incorrect diagnostic context
   // being produced if this directive is emitted as part of preprocessor macro
   // expansion which can *ONLY* happen if Clang's cc1as is the API consumer.
diff --git a/lib/MC/MCParser/CMakeLists.txt b/lib/MC/MCParser/CMakeLists.txt
index 99fdd0167993780f81c43be5503cf260ba2f240f..0c54e8e901931897ffe4e9fa118ef07a4a9d8701 100644
--- a/lib/MC/MCParser/CMakeLists.txt
+++ b/lib/MC/MCParser/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(LLVMMCParser
   MCAsmParser.cpp
   MCAsmParserExtension.cpp
   MCTargetAsmParser.cpp
+  WasmAsmParser.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/MC/MCParser
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index e6fc1fac81ba07f9a769200b65ee9832c59d6239..07926d6c1c9ab7840f17c93f8e7fcce75fed1e44 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -459,7 +459,12 @@ public:
 
   bool parseBuildVersion(StringRef Directive, SMLoc Loc);
   bool parseVersionMin(StringRef Directive, SMLoc Loc, MCVersionMinType Type);
+  bool parseMajorMinorVersionComponent(unsigned *Major, unsigned *Minor,
+                                       const char *VersionName);
+  bool parseOptionalTrailingVersionComponent(unsigned *Component,
+                                             const char *ComponentName);
   bool parseVersion(unsigned *Major, unsigned *Minor, unsigned *Update);
+  bool parseSDKVersion(VersionTuple &SDKVersion);
   void checkVersion(StringRef Directive, StringRef Arg, SMLoc Loc,
                     Triple::OSType ExpectedOS);
 };
@@ -1000,43 +1005,89 @@ bool DarwinAsmParser::parseDirectiveDataRegionEnd(StringRef, SMLoc) {
   return false;
 }
 
-/// parseVersion ::= major, minor [, update]
-bool DarwinAsmParser::parseVersion(unsigned *Major, unsigned *Minor,
-                                   unsigned *Update) {
+static bool isSDKVersionToken(const AsmToken &Tok) {
+  return Tok.is(AsmToken::Identifier) && Tok.getIdentifier() == "sdk_version";
+}
+
+/// parseMajorMinorVersionComponent ::= major, minor
+bool DarwinAsmParser::parseMajorMinorVersionComponent(unsigned *Major,
+                                                      unsigned *Minor,
+                                                      const char *VersionName) {
   // Get the major version number.
   if (getLexer().isNot(AsmToken::Integer))
-    return TokError("invalid OS major version number, integer expected");
+    return TokError(Twine("invalid ") + VersionName +
+                    " major version number, integer expected");
   int64_t MajorVal = getLexer().getTok().getIntVal();
   if (MajorVal > 65535 || MajorVal <= 0)
-    return TokError("invalid OS major version number");
+    return TokError(Twine("invalid ") + VersionName + " major version number");
   *Major = (unsigned)MajorVal;
   Lex();
   if (getLexer().isNot(AsmToken::Comma))
-    return TokError("OS minor version number required, comma expected");
+    return TokError(Twine(VersionName) +
+                    " minor version number required, comma expected");
   Lex();
   // Get the minor version number.
   if (getLexer().isNot(AsmToken::Integer))
-    return TokError("invalid OS minor version number, integer expected");
+    return TokError(Twine("invalid ") + VersionName +
+                    " minor version number, integer expected");
   int64_t MinorVal = getLexer().getTok().getIntVal();
   if (MinorVal > 255 || MinorVal < 0)
-    return TokError("invalid OS minor version number");
+    return TokError(Twine("invalid ") + VersionName + " minor version number");
   *Minor = MinorVal;
   Lex();
+  return false;
+}
+
+/// parseOptionalTrailingVersionComponent ::= , version_number
+bool DarwinAsmParser::parseOptionalTrailingVersionComponent(
+    unsigned *Component, const char *ComponentName) {
+  assert(getLexer().is(AsmToken::Comma) && "comma expected");
+  Lex();
+  if (getLexer().isNot(AsmToken::Integer))
+    return TokError(Twine("invalid ") + ComponentName +
+                    " version number, integer expected");
+  int64_t Val = getLexer().getTok().getIntVal();
+  if (Val > 255 || Val < 0)
+    return TokError(Twine("invalid ") + ComponentName + " version number");
+  *Component = Val;
+  Lex();
+  return false;
+}
+
+/// parseVersion ::= parseMajorMinorVersionComponent
+///                      parseOptionalTrailingVersionComponent
+bool DarwinAsmParser::parseVersion(unsigned *Major, unsigned *Minor,
+                                   unsigned *Update) {
+  if (parseMajorMinorVersionComponent(Major, Minor, "OS"))
+    return true;
 
   // Get the update level, if specified
   *Update = 0;
-  if (getLexer().is(AsmToken::EndOfStatement))
+  if (getLexer().is(AsmToken::EndOfStatement) ||
+      isSDKVersionToken(getLexer().getTok()))
     return false;
   if (getLexer().isNot(AsmToken::Comma))
     return TokError("invalid OS update specifier, comma expected");
+  if (parseOptionalTrailingVersionComponent(Update, "OS update"))
+    return true;
+  return false;
+}
+
+bool DarwinAsmParser::parseSDKVersion(VersionTuple &SDKVersion) {
+  assert(isSDKVersionToken(getLexer().getTok()) && "expected sdk_version");
   Lex();
-  if (getLexer().isNot(AsmToken::Integer))
-    return TokError("invalid OS update version number, integer expected");
-  int64_t UpdateVal = getLexer().getTok().getIntVal();
-  if (UpdateVal > 255 || UpdateVal < 0)
-    return TokError("invalid OS update version number");
-  *Update = UpdateVal;
-  Lex();
+  unsigned Major, Minor;
+  if (parseMajorMinorVersionComponent(&Major, &Minor, "SDK"))
+    return true;
+  SDKVersion = VersionTuple(Major, Minor);
+
+  // Get the subminor version, if specified.
+  if (getLexer().is(AsmToken::Comma)) {
+    unsigned Subminor;
+    if (parseOptionalTrailingVersionComponent(&Subminor, "SDK subminor"))
+      return true;
+    SDKVersion = VersionTuple(Major, Minor, Subminor);
+  }
   return false;
 }
 
@@ -1066,10 +1117,10 @@ static Triple::OSType getOSTypeFromMCVM(MCVersionMinType Type) {
 }
 
 /// parseVersionMin
-///   ::= .ios_version_min parseVersion
-///   |   .macosx_version_min parseVersion
-///   |   .tvos_version_min parseVersion
-///   |   .watchos_version_min parseVersion
+///   ::= .ios_version_min parseVersion parseSDKVersion
+///   |   .macosx_version_min parseVersion parseSDKVersion
+///   |   .tvos_version_min parseVersion parseSDKVersion
+///   |   .watchos_version_min parseVersion parseSDKVersion
 bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc,
                                       MCVersionMinType Type) {
   unsigned Major;
@@ -1078,13 +1129,16 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc,
   if (parseVersion(&Major, &Minor, &Update))
     return true;
 
+  VersionTuple SDKVersion;
+  if (isSDKVersionToken(getLexer().getTok()) && parseSDKVersion(SDKVersion))
+    return true;
+
   if (parseToken(AsmToken::EndOfStatement))
     return addErrorSuffix(Twine(" in '") + Directive + "' directive");
 
   Triple::OSType ExpectedOS = getOSTypeFromMCVM(Type);
   checkVersion(Directive, StringRef(), Loc, ExpectedOS);
-
-  getStreamer().EmitVersionMin(Type, Major, Minor, Update);
+  getStreamer().EmitVersionMin(Type, Major, Minor, Update, SDKVersion);
   return false;
 }
 
@@ -1100,7 +1154,7 @@ static Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) {
 }
 
 /// parseBuildVersion
-///   ::= .build_version (macos|ios|tvos|watchos), parseVersion
+///   ::= .build_version (macos|ios|tvos|watchos), parseVersion parseSDKVersion
 bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) {
   StringRef PlatformName;
   SMLoc PlatformLoc = getTok().getLoc();
@@ -1126,14 +1180,17 @@ bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) {
   if (parseVersion(&Major, &Minor, &Update))
     return true;
 
+  VersionTuple SDKVersion;
+  if (isSDKVersionToken(getLexer().getTok()) && parseSDKVersion(SDKVersion))
+    return true;
+
   if (parseToken(AsmToken::EndOfStatement))
     return addErrorSuffix(" in '.build_version' directive");
 
   Triple::OSType ExpectedOS
     = getOSTypeFromPlatform((MachO::PlatformType)Platform);
   checkVersion(Directive, PlatformName, Loc, ExpectedOS);
-
-  getStreamer().EmitBuildVersion(Platform, Major, Minor, Update);
+  getStreamer().EmitBuildVersion(Platform, Major, Minor, Update, SDKVersion);
   return false;
 }
 
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 3d9590e1f9f5f5e7503bd9223eb8c6395360ec6d..d568f7a71eeb5dca8f4e9293e2c0678be636d706 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -321,6 +321,9 @@ static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
     case 'y':
       flags |= ELF::SHF_ARM_PURECODE;
       break;
+    case 's':
+      flags |= ELF::SHF_HEX_GPREL;
+      break;
     case 'G':
       flags |= ELF::SHF_GROUP;
       break;
diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp
index 1ed056800d4a6cd7b8a24897fde417b9d19e01b7..10960fc696334870f353a07c9a4b6e990f41c305 100644
--- a/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/lib/MC/MCParser/MCAsmLexer.cpp
@@ -85,6 +85,7 @@ void AsmToken::dump(raw_ostream &OS) const {
   case AsmToken::LessGreater:        OS << "LessGreater"; break;
   case AsmToken::LessLess:           OS << "LessLess"; break;
   case AsmToken::Minus:              OS << "Minus"; break;
+  case AsmToken::MinusGreater:       OS << "MinusGreater"; break;
   case AsmToken::Percent:            OS << "Percent"; break;
   case AsmToken::Pipe:               OS << "Pipe"; break;
   case AsmToken::PipePipe:           OS << "PipePipe"; break;
diff --git a/lib/MC/MCParser/WasmAsmParser.cpp b/lib/MC/MCParser/WasmAsmParser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..93bb0cb3c72e476360da73d8e054f8bb8bcc0af2
--- /dev/null
+++ b/lib/MC/MCParser/WasmAsmParser.cpp
@@ -0,0 +1,145 @@
+//===- WasmAsmParser.cpp - Wasm Assembly Parser -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// --
+//
+// Note, this is for wasm, the binary format (analogous to ELF), not wasm,
+// the instruction set (analogous to x86), for which parsing code lives in
+// WebAssemblyAsmParser.
+//
+// This file contains processing for generic directives implemented using
+// MCTargetStreamer, the ones that depend on WebAssemblyTargetStreamer are in
+// WebAssemblyAsmParser.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/Support/MachineValueType.h"
+
+using namespace llvm;
+
+namespace {
+
+class WasmAsmParser : public MCAsmParserExtension {
+  MCAsmParser *Parser;
+  MCAsmLexer *Lexer;
+
+  template<bool (WasmAsmParser::*HandlerMethod)(StringRef, SMLoc)>
+  void addDirectiveHandler(StringRef Directive) {
+    MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair(
+        this, HandleDirective<WasmAsmParser, HandlerMethod>);
+
+    getParser().addDirectiveHandler(Directive, Handler);
+  }
+
+public:
+  WasmAsmParser() : Parser(nullptr), Lexer(nullptr) {
+    BracketExpressionsSupported = true;
+  }
+
+  void Initialize(MCAsmParser &P) override {
+    Parser = &P;
+    Lexer = &Parser->getLexer();
+    // Call the base implementation.
+    this->MCAsmParserExtension::Initialize(*Parser);
+
+    addDirectiveHandler<&WasmAsmParser::parseSectionDirectiveText>(".text");
+    addDirectiveHandler<&WasmAsmParser::parseSectionDirective>(".section");
+    addDirectiveHandler<&WasmAsmParser::parseDirectiveSize>(".size");
+    addDirectiveHandler<&WasmAsmParser::parseDirectiveType>(".type");
+  }
+
+  bool Error(const StringRef &msg, const AsmToken &tok) {
+    return Parser->Error(tok.getLoc(), msg + tok.getString());
+  }
+
+  bool IsNext(AsmToken::TokenKind Kind) {
+    auto ok = Lexer->is(Kind);
+    if (ok) Lex();
+    return ok;
+  }
+
+  bool Expect(AsmToken::TokenKind Kind, const char *KindName) {
+    if (!IsNext(Kind))
+      return Error(std::string("Expected ") + KindName + ", instead got: ",
+                   Lexer->getTok());
+    return false;
+  }
+
+  bool parseSectionDirectiveText(StringRef, SMLoc) {
+    // FIXME: .text currently no-op.
+    return false;
+  }
+
+  bool parseSectionDirective(StringRef, SMLoc) {
+    // FIXME: .section currently no-op.
+    while (Lexer->isNot(AsmToken::EndOfStatement)) Parser->Lex();
+    return false;
+  }
+
+  // TODO: This function is almost the same as ELFAsmParser::ParseDirectiveSize
+  // so maybe could be shared somehow.
+  bool parseDirectiveSize(StringRef, SMLoc) {
+    StringRef Name;
+    if (Parser->parseIdentifier(Name))
+      return TokError("expected identifier in directive");
+    auto Sym = getContext().getOrCreateSymbol(Name);
+    if (Lexer->isNot(AsmToken::Comma))
+      return TokError("unexpected token in directive");
+    Lex();
+    const MCExpr *Expr;
+    if (Parser->parseExpression(Expr))
+      return true;
+    if (Lexer->isNot(AsmToken::EndOfStatement))
+      return TokError("unexpected token in directive");
+    Lex();
+    // MCWasmStreamer implements this.
+    getStreamer().emitELFSize(Sym, Expr);
+    return false;
+  }
+
+  bool parseDirectiveType(StringRef, SMLoc) {
+    // This could be the start of a function, check if followed by
+    // "label,@function"
+    if (!Lexer->is(AsmToken::Identifier))
+      return Error("Expected label after .type directive, got: ",
+                   Lexer->getTok());
+    auto WasmSym = cast<MCSymbolWasm>(
+                     getStreamer().getContext().getOrCreateSymbol(
+                       Lexer->getTok().getString()));
+    Lex();
+    if (!(IsNext(AsmToken::Comma) && IsNext(AsmToken::At) &&
+          Lexer->is(AsmToken::Identifier)))
+      return Error("Expected label,@type declaration, got: ", Lexer->getTok());
+    auto TypeName = Lexer->getTok().getString();
+    if (TypeName == "function")
+      WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+    else if (TypeName == "global")
+      WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+    else
+      return Error("Unknown WASM symbol type: ", Lexer->getTok());
+    Lex();
+    return Expect(AsmToken::EndOfStatement, "EOL");
+  }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+MCAsmParserExtension *createWasmAsmParser() {
+  return new WasmAsmParser;
+}
+
+} // end namespace llvm
diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index 4d77d05cc5053c06fe72f9e641c60da999673e8f..7ee1694ebbf71dd8f09be6386683a6110f91fea2 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp
@@ -116,6 +116,9 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
   } else if (T.isARM() || T.isThumb()) {
     if (Flags & ELF::SHF_ARM_PURECODE)
       OS << 'y';
+  } else if (Arch == Triple::hexagon) {
+    if (Flags & ELF::SHF_HEX_GPREL)
+      OS << 's';
   }
 
   OS << '"';
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 3722c0ad3c818292e55c6a5fd48d75055a771835..85e69f6f663e0283a47c3d2e01a2b452d34daa7a 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -577,6 +577,15 @@ void MCStreamer::EmitCFIWindowSave() {
   CurFrame->Instructions.push_back(Instruction);
 }
 
+void MCStreamer::EmitCFINegateRAState() {
+  MCSymbol *Label = EmitCFILabel();
+  MCCFIInstruction Instruction = MCCFIInstruction::createNegateRAState(Label);
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
+  if (!CurFrame)
+    return;
+  CurFrame->Instructions.push_back(Instruction);
+}
+
 void MCStreamer::EmitCFIReturnColumn(int64_t Register) {
   MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   if (!CurFrame)
@@ -1045,7 +1054,8 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) {
   return Sym;
 }
 
-void MCStreamer::EmitVersionForTarget(const Triple &Target) {
+void MCStreamer::EmitVersionForTarget(const Triple &Target,
+                                      const VersionTuple &SDKVersion) {
   if (!Target.isOSBinFormatMachO() || !Target.isOSDarwin())
     return;
   // Do we even know the version?
@@ -1071,5 +1081,5 @@ void MCStreamer::EmitVersionForTarget(const Triple &Target) {
     Target.getiOSVersion(Major, Minor, Update);
   }
   if (Major != 0)
-    EmitVersionMin(VersionType, Major, Minor, Update);
+    EmitVersionMin(VersionType, Major, Minor, Update, SDKVersion);
 }
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index 0c8d58e597271a97366d3467417c192659760209..0724b109e1a1da6ab89a43e7c4d3561e6470bc7a 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -468,10 +468,10 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
   info->Symbol = Label;
 
   uint32_t FuncLength = 0x0;
-  FuncLength = (uint32_t)GetAbsDifference(streamer, info->FuncletOrFuncEnd,
-                                          info->Begin);
-  if (FuncLength)
-    FuncLength /= 4;
+  if (info->FuncletOrFuncEnd)
+    FuncLength = (uint32_t)GetAbsDifference(streamer, info->FuncletOrFuncEnd,
+                                            info->Begin);
+  FuncLength /= 4;
   uint32_t PrologCodeBytes = ARM64CountOfUnwindCodes(info->Instructions);
   uint32_t TotalCodeBytes = PrologCodeBytes;
 
@@ -487,8 +487,8 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
 
   // Code Words, Epilog count, E, X, Vers, Function Length
   uint32_t row1 = 0x0;
-  uint8_t CodeWords = TotalCodeBytes / 4;
-  uint8_t CodeWordsMod = TotalCodeBytes % 4;
+  uint32_t CodeWords = TotalCodeBytes / 4;
+  uint32_t CodeWordsMod = TotalCodeBytes % 4;
   if (CodeWordsMod)
     CodeWords++;
   uint32_t EpilogCount = info->EpilogMap.size();
@@ -505,6 +505,11 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
 
   // Extended Code Words, Extended Epilog Count
   if (ExtensionWord) {
+    // FIXME: We should be able to split unwind info into multiple sections.
+    // FIXME: We should share epilog codes across epilogs, where possible,
+    // which would make this issue show up less frequently.
+    if (CodeWords > 0xFF || EpilogCount > 0xFFFF)
+      report_fatal_error("SEH unwind data splitting not yet implemented");
     uint32_t row2 = 0x0;
     row2 |= (CodeWords & 0xFF) << 16;
     row2 |= (EpilogCount & 0xFFFF);
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index c2e8dc1ef6584b149d962283178989403899b41a..2fa65658ccfacfec0db3928d16586ee18f3be9e8 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -846,18 +846,27 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm,
 
   // Write out the deployment target information, if it's available.
   if (VersionInfo.Major != 0) {
-    assert(VersionInfo.Update < 256 && "unencodable update target version");
-    assert(VersionInfo.Minor < 256 && "unencodable minor target version");
-    assert(VersionInfo.Major < 65536 && "unencodable major target version");
-    uint32_t EncodedVersion = VersionInfo.Update | (VersionInfo.Minor << 8) |
-      (VersionInfo.Major << 16);
+    auto EncodeVersion = [](VersionTuple V) -> uint32_t {
+      assert(!V.empty() && "empty version");
+      unsigned Update = V.getSubminor() ? *V.getSubminor() : 0;
+      unsigned Minor = V.getMinor() ? *V.getMinor() : 0;
+      assert(Update < 256 && "unencodable update target version");
+      assert(Minor < 256 && "unencodable minor target version");
+      assert(V.getMajor() < 65536 && "unencodable major target version");
+      return Update | (Minor << 8) | (V.getMajor() << 16);
+    };
+    uint32_t EncodedVersion = EncodeVersion(
+        VersionTuple(VersionInfo.Major, VersionInfo.Minor, VersionInfo.Update));
+    uint32_t SDKVersion = !VersionInfo.SDKVersion.empty()
+                              ? EncodeVersion(VersionInfo.SDKVersion)
+                              : 0;
     if (VersionInfo.EmitBuildVersion) {
       // FIXME: Currently empty tools. Add clang version in the future.
       W.write<uint32_t>(MachO::LC_BUILD_VERSION);
       W.write<uint32_t>(sizeof(MachO::build_version_command));
       W.write<uint32_t>(VersionInfo.TypeOrPlatform.Platform);
       W.write<uint32_t>(EncodedVersion);
-      W.write<uint32_t>(0);         // SDK version.
+      W.write<uint32_t>(SDKVersion);
       W.write<uint32_t>(0);         // Empty tools list.
     } else {
       MachO::LoadCommandType LCType
@@ -865,7 +874,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm,
       W.write<uint32_t>(LCType);
       W.write<uint32_t>(sizeof(MachO::version_min_command));
       W.write<uint32_t>(EncodedVersion);
-      W.write<uint32_t>(0);         // reserved.
+      W.write<uint32_t>(SDKVersion);
     }
   }
 
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index c1e0b7aa7ab01a526f61d4e24f9fcde4f6a8e435..8a369204cd6c85aed7061960acf04f2d4d247fe7 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -56,10 +56,10 @@ struct SectionBookkeeping {
   uint32_t Index;
 };
 
-// The signature of a wasm function, in a struct capable of being used as a
-// DenseMap key.
-// TODO: Consider using WasmSignature directly instead.
-struct WasmFunctionType {
+// The signature of a wasm function or event, in a struct capable of being used
+// as a DenseMap key.
+// TODO: Consider using wasm::WasmSignature directly instead.
+struct WasmSignature {
   // Support empty and tombstone instances, needed by DenseMap.
   enum { Plain, Empty, Tombstone } State;
 
@@ -69,36 +69,35 @@ struct WasmFunctionType {
   // The parameter types of the function.
   SmallVector<wasm::ValType, 4> Params;
 
-  WasmFunctionType() : State(Plain) {}
+  WasmSignature() : State(Plain) {}
 
-  bool operator==(const WasmFunctionType &Other) const {
+  bool operator==(const WasmSignature &Other) const {
     return State == Other.State && Returns == Other.Returns &&
            Params == Other.Params;
   }
 };
 
-// Traits for using WasmFunctionType in a DenseMap.
-struct WasmFunctionTypeDenseMapInfo {
-  static WasmFunctionType getEmptyKey() {
-    WasmFunctionType FuncTy;
-    FuncTy.State = WasmFunctionType::Empty;
-    return FuncTy;
+// Traits for using WasmSignature in a DenseMap.
+struct WasmSignatureDenseMapInfo {
+  static WasmSignature getEmptyKey() {
+    WasmSignature Sig;
+    Sig.State = WasmSignature::Empty;
+    return Sig;
   }
-  static WasmFunctionType getTombstoneKey() {
-    WasmFunctionType FuncTy;
-    FuncTy.State = WasmFunctionType::Tombstone;
-    return FuncTy;
+  static WasmSignature getTombstoneKey() {
+    WasmSignature Sig;
+    Sig.State = WasmSignature::Tombstone;
+    return Sig;
   }
-  static unsigned getHashValue(const WasmFunctionType &FuncTy) {
-    uintptr_t Value = FuncTy.State;
-    for (wasm::ValType Ret : FuncTy.Returns)
+  static unsigned getHashValue(const WasmSignature &Sig) {
+    uintptr_t Value = Sig.State;
+    for (wasm::ValType Ret : Sig.Returns)
       Value += DenseMapInfo<uint32_t>::getHashValue(uint32_t(Ret));
-    for (wasm::ValType Param : FuncTy.Params)
+    for (wasm::ValType Param : Sig.Params)
       Value += DenseMapInfo<uint32_t>::getHashValue(uint32_t(Param));
     return Value;
   }
-  static bool isEqual(const WasmFunctionType &LHS,
-                      const WasmFunctionType &RHS) {
+  static bool isEqual(const WasmSignature &LHS, const WasmSignature &RHS) {
     return LHS == RHS;
   }
 };
@@ -118,7 +117,7 @@ struct WasmDataSegment {
 
 // A wasm function to be written into the function section.
 struct WasmFunction {
-  uint32_t Type;
+  uint32_t SigIndex;
   const MCSymbolWasm *Sym;
 };
 
@@ -216,7 +215,8 @@ class WasmObjectWriter : public MCObjectWriter {
   // Maps function symbols to the table element index space. Used
   // for TABLE_INDEX relocation types (i.e. address taken functions).
   DenseMap<const MCSymbolWasm *, uint32_t> TableIndices;
-  // Maps function/global symbols to the function/global/section index space.
+  // Maps function/global symbols to the function/global/event/section index
+  // space.
   DenseMap<const MCSymbolWasm *, uint32_t> WasmIndices;
   // Maps data symbols to the Wasm segment and offset/size with the segment.
   DenseMap<const MCSymbolWasm *, wasm::WasmDataReference> DataLocations;
@@ -231,13 +231,13 @@ class WasmObjectWriter : public MCObjectWriter {
   // Map from section to defining function symbol.
   DenseMap<const MCSection *, const MCSymbol *> SectionFunctions;
 
-  DenseMap<WasmFunctionType, uint32_t, WasmFunctionTypeDenseMapInfo>
-      FunctionTypeIndices;
-  SmallVector<WasmFunctionType, 4> FunctionTypes;
+  DenseMap<WasmSignature, uint32_t, WasmSignatureDenseMapInfo> SignatureIndices;
+  SmallVector<WasmSignature, 4> Signatures;
   SmallVector<WasmGlobal, 4> Globals;
   SmallVector<WasmDataSegment, 4> DataSegments;
   unsigned NumFunctionImports = 0;
   unsigned NumGlobalImports = 0;
+  unsigned NumEventImports = 0;
   uint32_t SectionCount = 0;
 
   // TargetObjectWriter wrappers.
@@ -266,8 +266,8 @@ private:
     TableIndices.clear();
     DataLocations.clear();
     CustomSectionsRelocations.clear();
-    FunctionTypeIndices.clear();
-    FunctionTypes.clear();
+    SignatureIndices.clear();
+    Signatures.clear();
     Globals.clear();
     DataSegments.clear();
     SectionFunctions.clear();
@@ -294,7 +294,7 @@ private:
 
   void writeValueType(wasm::ValType Ty) { W.OS << static_cast<char>(Ty); }
 
-  void writeTypeSection(ArrayRef<WasmFunctionType> FunctionTypes);
+  void writeTypeSection(ArrayRef<WasmSignature> Signatures);
   void writeImportSection(ArrayRef<wasm::WasmImport> Imports, uint32_t DataSize,
                           uint32_t NumElements);
   void writeFunctionSection(ArrayRef<WasmFunction> Functions);
@@ -304,6 +304,7 @@ private:
   void writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout,
                         ArrayRef<WasmFunction> Functions);
   void writeDataSection();
+  void writeEventSection(ArrayRef<wasm::WasmEventType> Events);
   void writeRelocSection(uint32_t SectionIndex, StringRef Name,
                          std::vector<WasmRelocationEntry> &Relocations);
   void writeLinkingMetaDataSection(
@@ -322,7 +323,9 @@ private:
 
   uint32_t getRelocationIndexValue(const WasmRelocationEntry &RelEntry);
   uint32_t getFunctionType(const MCSymbolWasm &Symbol);
-  uint32_t registerFunctionType(const MCSymbolWasm &Symbol);
+  uint32_t getEventType(const MCSymbolWasm &Symbol);
+  void registerFunctionType(const MCSymbolWasm &Symbol);
+  void registerEventType(const MCSymbolWasm &Symbol);
 };
 
 } // end anonymous namespace
@@ -581,7 +584,8 @@ WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry) {
     return getRelocationIndexValue(RelEntry);
   case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
   case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
-    // Provisional value is function/global Wasm index
+  case wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB:
+    // Provisional value is function/global/event Wasm index
     if (!WasmIndices.count(RelEntry.Symbol))
       report_fatal_error("symbol not found in wasm index space: " +
                          RelEntry.Symbol->getName());
@@ -678,6 +682,7 @@ void WasmObjectWriter::applyRelocations(
     case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
     case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
     case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
+    case wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB:
       WritePatchableLEB(Stream, Value, Offset);
       break;
     case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
@@ -696,23 +701,22 @@ void WasmObjectWriter::applyRelocations(
   }
 }
 
-void WasmObjectWriter::writeTypeSection(
-    ArrayRef<WasmFunctionType> FunctionTypes) {
-  if (FunctionTypes.empty())
+void WasmObjectWriter::writeTypeSection(ArrayRef<WasmSignature> Signatures) {
+  if (Signatures.empty())
     return;
 
   SectionBookkeeping Section;
   startSection(Section, wasm::WASM_SEC_TYPE);
 
-  encodeULEB128(FunctionTypes.size(), W.OS);
+  encodeULEB128(Signatures.size(), W.OS);
 
-  for (const WasmFunctionType &FuncTy : FunctionTypes) {
+  for (const WasmSignature &Sig : Signatures) {
     W.OS << char(wasm::WASM_TYPE_FUNC);
-    encodeULEB128(FuncTy.Params.size(), W.OS);
-    for (wasm::ValType Ty : FuncTy.Params)
+    encodeULEB128(Sig.Params.size(), W.OS);
+    for (wasm::ValType Ty : Sig.Params)
       writeValueType(Ty);
-    encodeULEB128(FuncTy.Returns.size(), W.OS);
-    for (wasm::ValType Ty : FuncTy.Returns)
+    encodeULEB128(Sig.Returns.size(), W.OS);
+    for (wasm::ValType Ty : Sig.Returns)
       writeValueType(Ty);
   }
 
@@ -753,6 +757,10 @@ void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports,
       encodeULEB128(0, W.OS);           // flags
       encodeULEB128(NumElements, W.OS); // initial
       break;
+    case wasm::WASM_EXTERNAL_EVENT:
+      encodeULEB128(Import.Event.Attribute, W.OS);
+      encodeULEB128(Import.Event.SigIndex, W.OS);
+      break;
     default:
       llvm_unreachable("unsupported import kind");
     }
@@ -770,7 +778,7 @@ void WasmObjectWriter::writeFunctionSection(ArrayRef<WasmFunction> Functions) {
 
   encodeULEB128(Functions.size(), W.OS);
   for (const WasmFunction &Func : Functions)
-    encodeULEB128(Func.Type, W.OS);
+    encodeULEB128(Func.SigIndex, W.OS);
 
   endSection(Section);
 }
@@ -795,6 +803,22 @@ void WasmObjectWriter::writeGlobalSection() {
   endSection(Section);
 }
 
+void WasmObjectWriter::writeEventSection(ArrayRef<wasm::WasmEventType> Events) {
+  if (Events.empty())
+    return;
+
+  SectionBookkeeping Section;
+  startSection(Section, wasm::WASM_SEC_EVENT);
+
+  encodeULEB128(Events.size(), W.OS);
+  for (const wasm::WasmEventType &Event : Events) {
+    encodeULEB128(Event.Attribute, W.OS);
+    encodeULEB128(Event.SigIndex, W.OS);
+  }
+
+  endSection(Section);
+}
+
 void WasmObjectWriter::writeExportSection(ArrayRef<wasm::WasmExport> Exports) {
   if (Exports.empty())
     return;
@@ -956,6 +980,7 @@ void WasmObjectWriter::writeLinkingMetaDataSection(
       switch (Sym.Kind) {
       case wasm::WASM_SYMBOL_TYPE_FUNCTION:
       case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+      case wasm::WASM_SYMBOL_TYPE_EVENT:
         encodeULEB128(Sym.ElementIndex, W.OS);
         if ((Sym.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0)
           writeString(Sym.Name);
@@ -1047,26 +1072,51 @@ uint32_t WasmObjectWriter::getFunctionType(const MCSymbolWasm &Symbol) {
   return TypeIndices[&Symbol];
 }
 
-uint32_t WasmObjectWriter::registerFunctionType(const MCSymbolWasm &Symbol) {
+uint32_t WasmObjectWriter::getEventType(const MCSymbolWasm &Symbol) {
+  assert(Symbol.isEvent());
+  assert(TypeIndices.count(&Symbol));
+  return TypeIndices[&Symbol];
+}
+
+void WasmObjectWriter::registerFunctionType(const MCSymbolWasm &Symbol) {
   assert(Symbol.isFunction());
 
-  WasmFunctionType F;
+  WasmSignature S;
   const MCSymbolWasm *ResolvedSym = ResolveSymbol(Symbol);
   if (auto *Sig = ResolvedSym->getSignature()) {
-    F.Returns = Sig->Returns;
-    F.Params = Sig->Params;
+    S.Returns = Sig->Returns;
+    S.Params = Sig->Params;
   }
 
-  auto Pair =
-      FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size()));
+  auto Pair = SignatureIndices.insert(std::make_pair(S, Signatures.size()));
   if (Pair.second)
-    FunctionTypes.push_back(F);
+    Signatures.push_back(S);
   TypeIndices[&Symbol] = Pair.first->second;
 
   LLVM_DEBUG(dbgs() << "registerFunctionType: " << Symbol
                     << " new:" << Pair.second << "\n");
   LLVM_DEBUG(dbgs() << "  -> type index: " << Pair.first->second << "\n");
-  return Pair.first->second;
+}
+
+void WasmObjectWriter::registerEventType(const MCSymbolWasm &Symbol) {
+  assert(Symbol.isEvent());
+
+  // TODO Currently we don't generate imported exceptions, but if we do, we
+  // should have a way of infering types of imported exceptions.
+  WasmSignature S;
+  if (auto *Sig = Symbol.getSignature()) {
+    S.Returns = Sig->Returns;
+    S.Params = Sig->Params;
+  }
+
+  auto Pair = SignatureIndices.insert(std::make_pair(S, Signatures.size()));
+  if (Pair.second)
+    Signatures.push_back(S);
+  TypeIndices[&Symbol] = Pair.first->second;
+
+  LLVM_DEBUG(dbgs() << "registerEventType: " << Symbol << " new:" << Pair.second
+                    << "\n");
+  LLVM_DEBUG(dbgs() << "  -> type index: " << Pair.first->second << "\n");
 }
 
 static bool isInSymtab(const MCSymbolWasm &Sym) {
@@ -1100,6 +1150,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
   SmallVector<uint32_t, 4> TableElems;
   SmallVector<wasm::WasmImport, 4> Imports;
   SmallVector<wasm::WasmExport, 4> Exports;
+  SmallVector<wasm::WasmEventType, 1> Events;
   SmallVector<wasm::WasmSymbolInfo, 4> SymbolInfos;
   SmallVector<std::pair<uint16_t, uint32_t>, 2> InitFuncs;
   std::map<StringRef, std::vector<WasmComdatEntry>> Comdats;
@@ -1128,7 +1179,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
   TableImport.Table.ElemType = wasm::WASM_TYPE_ANYFUNC;
   Imports.push_back(TableImport);
 
-  // Populate FunctionTypeIndices, and Imports and WasmIndices for undefined
+  // Populate SignatureIndices, and Imports and WasmIndices for undefined
   // symbols.  This must be done before populating WasmIndices for defined
   // symbols.
   for (const MCSymbol &S : Asm.symbols()) {
@@ -1139,6 +1190,9 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
     if (WS.isFunction())
       registerFunctionType(WS);
 
+    if (WS.isEvent())
+      registerEventType(WS);
+
     if (WS.isTemporary())
       continue;
 
@@ -1163,6 +1217,18 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
         Import.Global = WS.getGlobalType();
         Imports.push_back(Import);
         WasmIndices[&WS] = NumGlobalImports++;
+      } else if (WS.isEvent()) {
+        if (WS.isWeak())
+          report_fatal_error("undefined event symbol cannot be weak");
+
+        wasm::WasmImport Import;
+        Import.Module = WS.getModuleName();
+        Import.Field = WS.getName();
+        Import.Kind = wasm::WASM_EXTERNAL_EVENT;
+        Import.Event.Attribute = wasm::WASM_EVENT_ATTRIBUTE_EXCEPTION;
+        Import.Event.SigIndex = getEventType(WS);
+        Imports.push_back(Import);
+        WasmIndices[&WS] = NumEventImports++;
       }
     }
   }
@@ -1254,7 +1320,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
         // A definition. Write out the function body.
         Index = NumFunctionImports + Functions.size();
         WasmFunction Func;
-        Func.Type = getFunctionType(WS);
+        Func.SigIndex = getFunctionType(WS);
         Func.Sym = &WS;
         WasmIndices[&WS] = Index;
         Functions.push_back(Func);
@@ -1270,6 +1336,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
       }
 
       LLVM_DEBUG(dbgs() << "  -> function index: " << Index << "\n");
+
     } else if (WS.isData()) {
       if (WS.isTemporary() && !WS.getSize())
         continue;
@@ -1299,6 +1366,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
           static_cast<uint32_t>(Size)};
       DataLocations[&WS] = Ref;
       LLVM_DEBUG(dbgs() << "  -> segment index: " << Ref.Segment << "\n");
+
     } else if (WS.isGlobal()) {
       // A "true" Wasm global (currently just __stack_pointer)
       if (WS.isDefined())
@@ -1307,6 +1375,24 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
       // An import; the index was assigned above
       LLVM_DEBUG(dbgs() << "  -> global index: "
                         << WasmIndices.find(&WS)->second << "\n");
+
+    } else if (WS.isEvent()) {
+      // C++ exception symbol (__cpp_exception)
+      unsigned Index;
+      if (WS.isDefined()) {
+        Index = NumEventImports + Events.size();
+        wasm::WasmEventType Event;
+        Event.SigIndex = getEventType(WS);
+        Event.Attribute = wasm::WASM_EVENT_ATTRIBUTE_EXCEPTION;
+        WasmIndices[&WS] = Index;
+        Events.push_back(Event);
+      } else {
+        // An import; the index was assigned above.
+        Index = WasmIndices.find(&WS)->second;
+      }
+      LLVM_DEBUG(dbgs() << "  -> event index: " << WasmIndices.find(&WS)->second
+                        << "\n");
+
     } else {
       assert(WS.isSection());
     }
@@ -1340,7 +1426,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
       DataLocations[&WS] = Ref;
       LLVM_DEBUG(dbgs() << "  -> index:" << Ref.Segment << "\n");
     } else {
-      report_fatal_error("don't yet support global aliases");
+      report_fatal_error("don't yet support global/event aliases");
     }
   }
 
@@ -1473,12 +1559,13 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
   // Write out the Wasm header.
   writeHeader(Asm);
 
-  writeTypeSection(FunctionTypes);
+  writeTypeSection(Signatures);
   writeImportSection(Imports, DataSize, TableElems.size());
   writeFunctionSection(Functions);
   // Skip the "table" section; we import the table instead.
   // Skip the "memory" section; we import the memory instead.
   writeGlobalSection();
+  writeEventSection(Events);
   writeExportSection(Exports);
   writeElemSection(TableElems);
   writeCodeSection(Asm, Layout, Functions);
diff --git a/tools/llvm-mca/lib/CMakeLists.txt b/lib/MCA/CMakeLists.txt
similarity index 59%
rename from tools/llvm-mca/lib/CMakeLists.txt
rename to lib/MCA/CMakeLists.txt
index 359a7d49d49d43cb6d55d7b0ef9cb1ff6c3f5568..bfd0782d1f7d5012b8f6798c8109eab278e38ba3 100644
--- a/tools/llvm-mca/lib/CMakeLists.txt
+++ b/lib/MCA/CMakeLists.txt
@@ -1,7 +1,4 @@
-include_directories(${LLVM_MCA_SOURCE_DIR}/include)
-
-add_library(LLVMMCA
-  STATIC
+add_llvm_library(LLVMMCA
   Context.cpp
   HWEventListener.cpp
   HardwareUnits/HardwareUnit.cpp
@@ -14,19 +11,13 @@ add_library(LLVMMCA
   Instruction.cpp
   Pipeline.cpp
   Stages/DispatchStage.cpp
+  Stages/EntryStage.cpp
   Stages/ExecuteStage.cpp
-  Stages/FetchStage.cpp
   Stages/InstructionTables.cpp
   Stages/RetireStage.cpp
   Stages/Stage.cpp
   Support.cpp
-  )
 
-llvm_update_compile_flags(LLVMMCA)
-llvm_map_components_to_libnames(libs
-  MC
-  Support
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/MCA
   )
-
-target_link_libraries(LLVMMCA ${libs})
-set_target_properties(LLVMMCA PROPERTIES FOLDER "Libraries")
diff --git a/tools/llvm-mca/lib/Context.cpp b/lib/MCA/Context.cpp
similarity index 76%
rename from tools/llvm-mca/lib/Context.cpp
rename to lib/MCA/Context.cpp
index 5b6f52478ddbb60b84ef0b79f3b2b718092e5339..c1b197dfe2e6744fc53e2c9b05283e7da9406083 100644
--- a/tools/llvm-mca/lib/Context.cpp
+++ b/lib/MCA/Context.cpp
@@ -15,14 +15,14 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Context.h"
-#include "HardwareUnits/RegisterFile.h"
-#include "HardwareUnits/RetireControlUnit.h"
-#include "HardwareUnits/Scheduler.h"
-#include "Stages/DispatchStage.h"
-#include "Stages/ExecuteStage.h"
-#include "Stages/FetchStage.h"
-#include "Stages/RetireStage.h"
+#include "llvm/MCA/Context.h"
+#include "llvm/MCA/HardwareUnits/RegisterFile.h"
+#include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
+#include "llvm/MCA/HardwareUnits/Scheduler.h"
+#include "llvm/MCA/Stages/DispatchStage.h"
+#include "llvm/MCA/Stages/EntryStage.h"
+#include "llvm/MCA/Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/RetireStage.h"
 
 namespace llvm {
 namespace mca {
@@ -35,12 +35,12 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
   // Create the hardware units defining the backend.
   auto RCU = llvm::make_unique<RetireControlUnit>(SM);
   auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
-  auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize,
-                                       Opts.AssumeNoAlias);
-  auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
+  auto LSU = llvm::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
+                                       Opts.StoreQueueSize, Opts.AssumeNoAlias);
+  auto HWS = llvm::make_unique<Scheduler>(SM, *LSU);
 
   // Create the pipeline stages.
-  auto Fetch = llvm::make_unique<FetchStage>(SrcMgr);
+  auto Fetch = llvm::make_unique<EntryStage>(SrcMgr);
   auto Dispatch = llvm::make_unique<DispatchStage>(STI, MRI, Opts.DispatchWidth,
                                                    *RCU, *PRF);
   auto Execute = llvm::make_unique<ExecuteStage>(*HWS);
diff --git a/tools/llvm-mca/lib/HWEventListener.cpp b/lib/MCA/HWEventListener.cpp
similarity index 94%
rename from tools/llvm-mca/lib/HWEventListener.cpp
rename to lib/MCA/HWEventListener.cpp
index 3930e2555a9ba33df5c9b42496fdc1b16e711c58..4a0e5b1754dd593761557726cf1fcfaab97af9f6 100644
--- a/tools/llvm-mca/lib/HWEventListener.cpp
+++ b/lib/MCA/HWEventListener.cpp
@@ -12,7 +12,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "HWEventListener.h"
+#include "llvm/MCA/HWEventListener.h"
 
 namespace llvm {
 namespace mca {
diff --git a/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp b/lib/MCA/HardwareUnits/HardwareUnit.cpp
similarity index 93%
rename from tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
rename to lib/MCA/HardwareUnits/HardwareUnit.cpp
index 4e46ffacbd4044799c8a86a8916dd8455c882e3a..edd32b9c0c1ab9bbbb3d9e49e18145ee7058274b 100644
--- a/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
+++ b/lib/MCA/HardwareUnits/HardwareUnit.cpp
@@ -13,7 +13,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "HardwareUnits/HardwareUnit.h"
+#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
 
 namespace llvm {
 namespace mca {
diff --git a/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/lib/MCA/HardwareUnits/LSUnit.cpp
similarity index 71%
rename from tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
rename to lib/MCA/HardwareUnits/LSUnit.cpp
index 6923c6e0dc8ff532af346f2c9fb70f9013b038a4..8895eb392b60ae92dd17aa377f4b9f03efb077f7 100644
--- a/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
+++ b/lib/MCA/HardwareUnits/LSUnit.cpp
@@ -12,8 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "HardwareUnits/LSUnit.h"
-#include "Instruction.h"
+#include "llvm/MCA/HardwareUnits/LSUnit.h"
+#include "llvm/MCA/Instruction.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -22,6 +22,23 @@
 namespace llvm {
 namespace mca {
 
+LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ,
+               bool AssumeNoAlias)
+    : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {
+  if (SM.hasExtraProcessorInfo()) {
+    const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+    if (!LQ_Size && EPI.LoadQueueID) {
+      const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID);
+      LQ_Size = LdQDesc.BufferSize;
+    }
+
+    if (!SQ_Size && EPI.StoreQueueID) {
+      const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID);
+      SQ_Size = StQDesc.BufferSize;
+    }
+  }
+}
+
 #ifndef NDEBUG
 void LSUnit::dump() const {
   dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
@@ -136,31 +153,38 @@ bool LSUnit::isReady(const InstRef &IR) const {
 }
 
 void LSUnit::onInstructionExecuted(const InstRef &IR) {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
   const unsigned Index = IR.getSourceIndex();
-  std::set<unsigned>::iterator it = LoadQueue.find(Index);
-  if (it != LoadQueue.end()) {
-    LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
-                      << " has been removed from the load queue.\n");
-    LoadQueue.erase(it);
-  }
+  bool IsALoad = Desc.MayLoad;
+  bool IsAStore = Desc.MayStore;
 
-  it = StoreQueue.find(Index);
-  if (it != StoreQueue.end()) {
-    LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
-                      << " has been removed from the store queue.\n");
-    StoreQueue.erase(it);
+  if (IsALoad) {
+    if (LoadQueue.erase(Index)) {
+      LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                        << " has been removed from the load queue.\n");
+    }
+    if (!LoadBarriers.empty() && Index == *LoadBarriers.begin()) {
+      LLVM_DEBUG(
+          dbgs() << "[LSUnit]: Instruction idx=" << Index
+                 << " has been removed from the set of load barriers.\n");
+      LoadBarriers.erase(Index);
+    }
   }
 
-  if (!StoreBarriers.empty() && Index == *StoreBarriers.begin()) {
-    LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
-                      << " has been removed from the set of store barriers.\n");
-    StoreBarriers.erase(StoreBarriers.begin());
-  }
-  if (!LoadBarriers.empty() && Index == *LoadBarriers.begin()) {
-    LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
-                      << " has been removed from the set of load barriers.\n");
-    LoadBarriers.erase(LoadBarriers.begin());
+  if (IsAStore) {
+    if (StoreQueue.erase(Index)) {
+      LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                        << " has been removed from the store queue.\n");
+    }
+
+    if (!StoreBarriers.empty() && Index == *StoreBarriers.begin()) {
+      LLVM_DEBUG(
+          dbgs() << "[LSUnit]: Instruction idx=" << Index
+                 << " has been removed from the set of store barriers.\n");
+      StoreBarriers.erase(Index);
+    }
   }
 }
+
 } // namespace mca
 } // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/lib/MCA/HardwareUnits/RegisterFile.cpp
similarity index 98%
rename from tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
rename to lib/MCA/HardwareUnits/RegisterFile.cpp
index 6bc63a0db5000787beaf91458aa84ce75d5bccfe..22977e5ded65b054ce6be3c4207ac05d1e98a6b2 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
+++ b/lib/MCA/HardwareUnits/RegisterFile.cpp
@@ -14,8 +14,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "HardwareUnits/RegisterFile.h"
-#include "Instruction.h"
+#include "llvm/MCA/HardwareUnits/RegisterFile.h"
+#include "llvm/MCA/Instruction.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "llvm-mca"
@@ -185,11 +185,11 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
       // register is allocated.
       ShouldAllocatePhysRegs = false;
 
-      if (OtherWrite.getWriteState() &&
-          (OtherWrite.getSourceIndex() != Write.getSourceIndex())) {
+      WriteState *OtherWS = OtherWrite.getWriteState();
+      if (OtherWS && (OtherWrite.getSourceIndex() != Write.getSourceIndex())) {
         // This partial write has a false dependency on RenameAs.
         assert(!IsEliminated && "Unexpected partial update!");
-        WS.setDependentWrite(OtherWrite.getWriteState());
+        OtherWS->addUser(&WS);
       }
     }
   }
diff --git a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp b/lib/MCA/HardwareUnits/ResourceManager.cpp
similarity index 86%
rename from tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
rename to lib/MCA/HardwareUnits/ResourceManager.cpp
index e371f50ed4892aa9a88bf84af0a15b2e4e75ad6b..e76543c53e14560dc71a9f6ce8cb5db594d7bdf0 100644
--- a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
+++ b/lib/MCA/HardwareUnits/ResourceManager.cpp
@@ -13,8 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "HardwareUnits/ResourceManager.h"
-#include "Support.h"
+#include "llvm/MCA/HardwareUnits/ResourceManager.h"
+#include "llvm/MCA/Support.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -24,21 +24,28 @@ namespace mca {
 #define DEBUG_TYPE "llvm-mca"
 ResourceStrategy::~ResourceStrategy() = default;
 
-void DefaultResourceStrategy::skipMask(uint64_t Mask) {
-  NextInSequenceMask &= (~Mask);
-  if (!NextInSequenceMask) {
-    NextInSequenceMask = ResourceUnitMask ^ RemovedFromNextInSequence;
-    RemovedFromNextInSequence = 0;
-  }
-}
-
 uint64_t DefaultResourceStrategy::select(uint64_t ReadyMask) {
   // This method assumes that ReadyMask cannot be zero.
-  uint64_t CandidateMask = PowerOf2Floor(NextInSequenceMask);
-  while (!(ReadyMask & CandidateMask)) {
-    skipMask(CandidateMask);
-    CandidateMask = PowerOf2Floor(NextInSequenceMask);
+  uint64_t CandidateMask = ReadyMask & NextInSequenceMask;
+  if (CandidateMask) {
+    CandidateMask = PowerOf2Floor(CandidateMask);
+    NextInSequenceMask &= (CandidateMask | (CandidateMask - 1));
+    return CandidateMask;
+  }
+
+  NextInSequenceMask = ResourceUnitMask ^ RemovedFromNextInSequence;
+  RemovedFromNextInSequence = 0;
+  CandidateMask = ReadyMask & NextInSequenceMask;
+
+  if (CandidateMask) {
+    CandidateMask = PowerOf2Floor(CandidateMask);
+    NextInSequenceMask &= (CandidateMask | (CandidateMask - 1));
+    return CandidateMask;
   }
+
+  NextInSequenceMask = ResourceUnitMask;
+  CandidateMask = PowerOf2Floor(ReadyMask & NextInSequenceMask);
+  NextInSequenceMask &= (CandidateMask | (CandidateMask - 1));
   return CandidateMask;
 }
 
@@ -47,14 +54,20 @@ void DefaultResourceStrategy::used(uint64_t Mask) {
     RemovedFromNextInSequence |= Mask;
     return;
   }
-  skipMask(Mask);
+
+  NextInSequenceMask &= (~Mask);
+  if (NextInSequenceMask)
+    return;
+
+  NextInSequenceMask = ResourceUnitMask ^ RemovedFromNextInSequence;
+  RemovedFromNextInSequence = 0;
 }
 
 ResourceState::ResourceState(const MCProcResourceDesc &Desc, unsigned Index,
                              uint64_t Mask)
     : ProcResourceDescIndex(Index), ResourceMask(Mask),
-      BufferSize(Desc.BufferSize) {
-  if (countPopulation(ResourceMask) > 1)
+      BufferSize(Desc.BufferSize), IsAGroup(countPopulation(ResourceMask)>1) {
+  if (IsAGroup)
     ResourceSizeMask = ResourceMask ^ PowerOf2Floor(ResourceMask);
   else
     ResourceSizeMask = (1ULL << Desc.NumUnits) - 1;
@@ -78,8 +91,10 @@ ResourceStateEvent ResourceState::isBufferAvailable() const {
 
 #ifndef NDEBUG
 void ResourceState::dump() const {
-  dbgs() << "MASK: " << ResourceMask << ", SIZE_MASK: " << ResourceSizeMask
-         << ", RDYMASK: " << ReadyMask << ", BufferSize=" << BufferSize
+  dbgs() << "MASK=" << format_hex(ResourceMask, 8)
+         << ", SZMASK=" << format_hex(ResourceSizeMask, 8)
+         << ", RDYMASK=" << format_hex(ReadyMask, 8)
+         << ", BufferSize=" << BufferSize
          << ", AvailableSlots=" << AvailableSlots
          << ", Reserved=" << Unavailable << '\n';
 }
@@ -147,8 +162,14 @@ ResourceRef ResourceManager::selectPipe(uint64_t ResourceID) {
 
 void ResourceManager::use(const ResourceRef &RR) {
   // Mark the sub-resource referenced by RR as used.
-  ResourceState &RS = *Resources[getResourceStateIndex(RR.first)];
+  unsigned RSID = getResourceStateIndex(RR.first);
+  ResourceState &RS = *Resources[RSID];
   RS.markSubResourceAsUsed(RR.second);
+  // Remember to update the resource strategy for non-group resources with
+  // multiple units.
+  if (RS.getNumUnits() > 1)
+    Strategies[RSID]->used(RR.second);
+
   // If there are still available units in RR.first,
   // then we are done.
   if (RS.isReady())
diff --git a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp b/lib/MCA/HardwareUnits/RetireControlUnit.cpp
similarity index 94%
rename from tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
rename to lib/MCA/HardwareUnits/RetireControlUnit.cpp
index 0456e1d7a5bf9fb8fded20f42323e906ab9d93c6..de9f24552c38be4190bd364fc3f7b17868ae53c8 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
+++ b/lib/MCA/HardwareUnits/RetireControlUnit.cpp
@@ -12,7 +12,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "HardwareUnits/RetireControlUnit.h"
+#include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "llvm-mca"
@@ -60,9 +60,10 @@ const RetireControlUnit::RUToken &RetireControlUnit::peekCurrentToken() const {
 }
 
 void RetireControlUnit::consumeCurrentToken() {
-  const RetireControlUnit::RUToken &Current = peekCurrentToken();
+  RetireControlUnit::RUToken &Current = Queue[CurrentInstructionSlotIdx];
   assert(Current.NumSlots && "Reserved zero slots?");
   assert(Current.IR && "Invalid RUToken in the RCU queue.");
+  Current.IR.getInstruction()->retire();
 
   // Update the slot index to be the next item in the circular queue.
   CurrentInstructionSlotIdx += Current.NumSlots;
diff --git a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp b/lib/MCA/HardwareUnits/Scheduler.cpp
similarity index 97%
rename from tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
rename to lib/MCA/HardwareUnits/Scheduler.cpp
index b1ac8d99b865b83b3af292c5382b60a7e96baf8d..3924ac599105aa3e803f7361568613c9c0d3f81a 100644
--- a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
+++ b/lib/MCA/HardwareUnits/Scheduler.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "HardwareUnits/Scheduler.h"
+#include "llvm/MCA/HardwareUnits/Scheduler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -51,7 +51,7 @@ Scheduler::Status Scheduler::isAvailable(const InstRef &IR) const {
   }
 
   // Give lower priority to LSUnit stall events.
-  switch (LSU->isAvailable(IR)) {
+  switch (LSU.isAvailable(IR)) {
   case LSUnit::LSU_LQUEUE_FULL:
     return Scheduler::SC_LOAD_QUEUE_FULL;
   case LSUnit::LSU_SQUEUE_FULL:
@@ -80,7 +80,7 @@ void Scheduler::issueInstructionImpl(
   if (IS->isExecuting())
     IssuedSet.emplace_back(IR);
   else if (IS->isExecuted())
-    LSU->onInstructionExecuted(IR);
+    LSU.onInstructionExecuted(IR);
 }
 
 // Release the buffered resources and issue the instruction.
@@ -170,7 +170,7 @@ void Scheduler::updateIssuedSet(SmallVectorImpl<InstRef> &Executed) {
     }
 
     // Instruction IR has completed execution.
-    LSU->onInstructionExecuted(IR);
+    LSU.onInstructionExecuted(IR);
     Executed.emplace_back(IR);
     ++RemovedElements;
     IR.invalidate();
@@ -213,7 +213,7 @@ void Scheduler::dispatch(const InstRef &IR) {
   // If necessary, reserve queue entries in the load-store unit (LSU).
   bool IsMemOp = Desc.MayLoad || Desc.MayStore;
   if (IsMemOp)
-    LSU->dispatch(IR);
+    LSU.dispatch(IR);
 
   if (!isReady(IR)) {
     LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR << " to the WaitSet\n");
@@ -238,7 +238,7 @@ void Scheduler::dispatch(const InstRef &IR) {
 bool Scheduler::isReady(const InstRef &IR) const {
   const InstrDesc &Desc = IR.getInstruction()->getDesc();
   bool IsMemOp = Desc.MayLoad || Desc.MayStore;
-  return IR.getInstruction()->isReady() && (!IsMemOp || LSU->isReady(IR));
+  return IR.getInstruction()->isReady() && (!IsMemOp || LSU.isReady(IR));
 }
 
 } // namespace mca
diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/lib/MCA/InstrBuilder.cpp
similarity index 65%
rename from tools/llvm-mca/lib/InstrBuilder.cpp
rename to lib/MCA/InstrBuilder.cpp
index 535ad4d57feea8d15f91533fb9e0128c6c78e6fe..8d501dc6b15abb422234e5d4e4f7c718d0c14e7f 100644
--- a/tools/llvm-mca/lib/InstrBuilder.cpp
+++ b/lib/MCA/InstrBuilder.cpp
@@ -12,7 +12,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "InstrBuilder.h"
+#include "llvm/MCA/InstrBuilder.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/MC/MCInst.h"
@@ -28,8 +28,9 @@ namespace mca {
 InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti,
                            const llvm::MCInstrInfo &mcii,
                            const llvm::MCRegisterInfo &mri,
-                           const llvm::MCInstrAnalysis &mcia)
-    : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia) {
+                           const llvm::MCInstrAnalysis *mcia)
+    : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), FirstCallInst(true),
+      FirstReturnInst(true) {
   computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
 }
 
@@ -55,12 +56,15 @@ static void initializeUsedResources(InstrDesc &ID,
   // part of a "Super" resource. The key value is the "Super" resource mask ID.
   DenseMap<uint64_t, unsigned> SuperResources;
 
+  unsigned NumProcResources = SM.getNumProcResourceKinds();
+  APInt Buffers(NumProcResources, 0);
+
   for (unsigned I = 0, E = SCDesc.NumWriteProcResEntries; I < E; ++I) {
     const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + I;
     const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx);
     uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx];
     if (PR.BufferSize != -1)
-      ID.Buffers.push_back(Mask);
+      Buffers.setBit(PRE->ProcResourceIdx);
     CycleSegment RCy(0, PRE->Cycles, false);
     Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy)));
     if (PR.SuperIdx) {
@@ -138,6 +142,30 @@ static void initializeUsedResources(InstrDesc &ID,
     }
   }
 
+  // Identify extra buffers that are consumed through super resources.
+  for (const std::pair<uint64_t, unsigned> &SR : SuperResources) {
+    for (unsigned I = 1, E = NumProcResources; I < E; ++I) {
+      const MCProcResourceDesc &PR = *SM.getProcResource(I);
+      if (PR.BufferSize == -1)
+        continue;
+
+      uint64_t Mask = ProcResourceMasks[I];
+      if (Mask != SR.first && ((Mask & SR.first) == SR.first))
+        Buffers.setBit(I);
+    }
+  }
+
+  // Now set the buffers.
+  if (unsigned NumBuffers = Buffers.countPopulation()) {
+    ID.Buffers.resize(NumBuffers);
+    for (unsigned I = 0, E = NumProcResources; I < E && NumBuffers; ++I) {
+      if (Buffers[I]) {
+        --NumBuffers;
+        ID.Buffers[NumBuffers] = ProcResourceMasks[I];
+      }
+    }
+  }
+
   LLVM_DEBUG({
     for (const std::pair<uint64_t, ResourceUsage> &R : ID.Resources)
       dbgs() << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n';
@@ -161,33 +189,92 @@ static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
   ID.MaxLatency = Latency < 0 ? 100U : static_cast<unsigned>(Latency);
 }
 
-Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
-                                   unsigned SchedClassID) {
+static Error verifyOperands(const MCInstrDesc &MCDesc, const MCInst &MCI) {
+  // Count register definitions, and skip non register operands in the process.
+  unsigned I, E;
+  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+  for (I = 0, E = MCI.getNumOperands(); NumExplicitDefs && I < E; ++I) {
+    const MCOperand &Op = MCI.getOperand(I);
+    if (Op.isReg())
+      --NumExplicitDefs;
+  }
+
+  if (NumExplicitDefs) {
+    return make_error<InstructionError<MCInst>>(
+        "Expected more register operand definitions.", MCI);
+  }
+
+  if (MCDesc.hasOptionalDef()) {
+    // Always assume that the optional definition is the last operand.
+    const MCOperand &Op = MCI.getOperand(MCDesc.getNumOperands() - 1);
+    if (I == MCI.getNumOperands() || !Op.isReg()) {
+      std::string Message =
+          "expected a register operand for an optional definition. Instruction "
+          "has not been correctly analyzed.";
+      return make_error<InstructionError<MCInst>>(Message, MCI);
+    }
+  }
+
+  return ErrorSuccess();
+}
+
+void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
+                                  unsigned SchedClassID) {
   const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
   const MCSchedModel &SM = STI.getSchedModel();
   const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
 
-  // These are for now the (strong) assumptions made by this algorithm:
-  //  * The number of explicit and implicit register definitions in a MCInst
-  //    matches the number of explicit and implicit definitions according to
-  //    the opcode descriptor (MCInstrDesc).
-  //  * Register definitions take precedence over register uses in the operands
-  //    list.
-  //  * If an opcode specifies an optional definition, then the optional
-  //    definition is always the last operand in the sequence, and it can be
-  //    set to zero (i.e. "no register").
+  // Assumptions made by this algorithm:
+  //  1. The number of explicit and implicit register definitions in a MCInst
+  //     matches the number of explicit and implicit definitions according to
+  //     the opcode descriptor (MCInstrDesc).
+  //  2. Uses start at index #(MCDesc.getNumDefs()).
+  //  3. There can only be a single optional register definition, an it is
+  //     always the last operand of the sequence (excluding extra operands
+  //     contributed by variadic opcodes).
   //
   // These assumptions work quite well for most out-of-order in-tree targets
   // like x86. This is mainly because the vast majority of instructions is
   // expanded to MCInst using a straightforward lowering logic that preserves
   // the ordering of the operands.
+  //
+  // About assumption 1.
+  // The algorithm allows non-register operands between register operand
+  // definitions. This helps to handle some special ARM instructions with
+  // implicit operand increment (-mtriple=armv7):
+  //
+  // vld1.32  {d18, d19}, [r1]!  @ <MCInst #1463 VLD1q32wb_fixed
+  //                             @  <MCOperand Reg:59>
+  //                             @  <MCOperand Imm:0>     (!!)
+  //                             @  <MCOperand Reg:67>
+  //                             @  <MCOperand Imm:0>
+  //                             @  <MCOperand Imm:14>
+  //                             @  <MCOperand Reg:0>>
+  //
+  // MCDesc reports:
+  //  6 explicit operands.
+  //  1 optional definition
+  //  2 explicit definitions (!!)
+  //
+  // The presence of an 'Imm' operand between the two register definitions
+  // breaks the assumption that "register definitions are always at the
+  // beginning of the operand sequence".
+  //
+  // To workaround this issue, this algorithm ignores (i.e. skips) any
+  // non-register operands between register definitions.  The optional
+  // definition is still at index #(NumOperands-1).
+  //
+  // According to assumption 2. register reads start at #(NumExplicitDefs-1).
+  // That means, register R1 from the example is both read and written.
   unsigned NumExplicitDefs = MCDesc.getNumDefs();
   unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs();
   unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
   unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs;
   if (MCDesc.hasOptionalDef())
     TotalDefs++;
-  ID.Writes.resize(TotalDefs);
+
+  unsigned NumVariadicOps = MCI.getNumOperands() - MCDesc.getNumOperands();
+  ID.Writes.resize(TotalDefs + NumVariadicOps);
   // Iterate over the operands list, and skip non-register operands.
   // The first NumExplictDefs register operands are expected to be register
   // definitions.
@@ -214,19 +301,15 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
     }
     Write.IsOptionalDef = false;
     LLVM_DEBUG({
-      dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
+      dbgs() << "\t\t[Def]    OpIdx=" << Write.OpIndex
              << ", Latency=" << Write.Latency
              << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
     });
     CurrentDef++;
   }
 
-  if (CurrentDef != NumExplicitDefs) {
-    return make_error<InstructionError<MCInst>>(
-        "Expected more register operand definitions.", MCI);
-  }
-
-  CurrentDef = 0;
+  assert(CurrentDef == NumExplicitDefs &&
+         "Expected more register operand definitions.");
   for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) {
     unsigned Index = NumExplicitDefs + CurrentDef;
     WriteDescriptor &Write = ID.Writes[Index];
@@ -248,7 +331,7 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
     Write.IsOptionalDef = false;
     assert(Write.RegisterID != 0 && "Expected a valid phys register!");
     LLVM_DEBUG({
-      dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
+      dbgs() << "\t\t[Def][I] OpIdx=" << ~Write.OpIndex
              << ", PhysReg=" << MRI.getName(Write.RegisterID)
              << ", Latency=" << Write.Latency
              << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
@@ -256,75 +339,120 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
   }
 
   if (MCDesc.hasOptionalDef()) {
-    // Always assume that the optional definition is the last operand of the
-    // MCInst sequence.
-    const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1);
-    if (i == MCI.getNumOperands() || !Op.isReg()) {
-      std::string Message =
-          "expected a register operand for an optional definition. Instruction "
-          "has not been correctly analyzed.";
-      return make_error<InstructionError<MCInst>>(Message, MCI);
-    }
-
-    WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
-    Write.OpIndex = MCI.getNumOperands() - 1;
+    WriteDescriptor &Write = ID.Writes[NumExplicitDefs + NumImplicitDefs];
+    Write.OpIndex = MCDesc.getNumOperands() - 1;
     // Assign a default latency for this write.
     Write.Latency = ID.MaxLatency;
     Write.SClassOrWriteResourceID = 0;
     Write.IsOptionalDef = true;
+    LLVM_DEBUG({
+      dbgs() << "\t\t[Def][O] OpIdx=" << Write.OpIndex
+             << ", Latency=" << Write.Latency
+             << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
+    });
   }
 
-  return ErrorSuccess();
-}
+  if (!NumVariadicOps)
+    return;
 
-Error InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
-                                  unsigned SchedClassID) {
-  const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
-  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+  // FIXME: if an instruction opcode is flagged 'mayStore', and it has no
+  // "unmodeledSideEffects', then this logic optimistically assumes that any
+  // extra register operands in the variadic sequence is not a register
+  // definition.
+  //
+  // Otherwise, we conservatively assume that any register operand from the
+  // variadic sequence is both a register read and a register write.
+  bool AssumeUsesOnly = MCDesc.mayStore() && !MCDesc.mayLoad() &&
+                        !MCDesc.hasUnmodeledSideEffects();
+  CurrentDef = NumExplicitDefs + NumImplicitDefs + MCDesc.hasOptionalDef();
+  for (unsigned I = 0, OpIndex = MCDesc.getNumOperands();
+       I < NumVariadicOps && !AssumeUsesOnly; ++I, ++OpIndex) {
+    const MCOperand &Op = MCI.getOperand(OpIndex);
+    if (!Op.isReg())
+      continue;
 
-  // Skip explicit definitions.
-  unsigned i = 0;
-  for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) {
-    const MCOperand &Op = MCI.getOperand(i);
-    if (Op.isReg())
-      NumExplicitDefs--;
+    WriteDescriptor &Write = ID.Writes[CurrentDef];
+    Write.OpIndex = OpIndex;
+    // Assign a default latency for this write.
+    Write.Latency = ID.MaxLatency;
+    Write.SClassOrWriteResourceID = 0;
+    Write.IsOptionalDef = false;
+    ++CurrentDef;
+    LLVM_DEBUG({
+      dbgs() << "\t\t[Def][V] OpIdx=" << Write.OpIndex
+             << ", Latency=" << Write.Latency
+             << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
+    });
   }
 
-  if (NumExplicitDefs) {
-    return make_error<InstructionError<MCInst>>(
-        "Expected more register operand definitions.", MCI);
-  }
+  ID.Writes.resize(CurrentDef);
+}
 
-  unsigned NumExplicitUses = MCI.getNumOperands() - i;
+void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
+                                 unsigned SchedClassID) {
+  const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
+  unsigned NumExplicitUses = MCDesc.getNumOperands() - MCDesc.getNumDefs();
   unsigned NumImplicitUses = MCDesc.getNumImplicitUses();
-  if (MCDesc.hasOptionalDef()) {
-    assert(NumExplicitUses);
-    NumExplicitUses--;
-  }
-  unsigned TotalUses = NumExplicitUses + NumImplicitUses;
-  if (!TotalUses)
-    return ErrorSuccess();
-
+  // Remove the optional definition.
+  if (MCDesc.hasOptionalDef())
+    --NumExplicitUses;
+  unsigned NumVariadicOps = MCI.getNumOperands() - MCDesc.getNumOperands();
+  unsigned TotalUses = NumExplicitUses + NumImplicitUses + NumVariadicOps;
   ID.Reads.resize(TotalUses);
-  for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) {
+  unsigned CurrentUse = 0;
+  for (unsigned I = 0, OpIndex = MCDesc.getNumDefs(); I < NumExplicitUses;
+       ++I, ++OpIndex) {
+    const MCOperand &Op = MCI.getOperand(OpIndex);
+    if (!Op.isReg())
+      continue;
+
     ReadDescriptor &Read = ID.Reads[CurrentUse];
-    Read.OpIndex = i + CurrentUse;
-    Read.UseIndex = CurrentUse;
+    Read.OpIndex = OpIndex;
+    Read.UseIndex = I;
     Read.SchedClassID = SchedClassID;
-    LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex
+    ++CurrentUse;
+    LLVM_DEBUG(dbgs() << "\t\t[Use]    OpIdx=" << Read.OpIndex
                       << ", UseIndex=" << Read.UseIndex << '\n');
   }
 
-  for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) {
-    ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse];
-    Read.OpIndex = ~CurrentUse;
-    Read.UseIndex = NumExplicitUses + CurrentUse;
-    Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse];
+  // For the purpose of ReadAdvance, implicit uses come directly after explicit
+  // uses. The "UseIndex" must be updated according to that implicit layout.
+  for (unsigned I = 0; I < NumImplicitUses; ++I) {
+    ReadDescriptor &Read = ID.Reads[CurrentUse + I];
+    Read.OpIndex = ~I;
+    Read.UseIndex = NumExplicitUses + I;
+    Read.RegisterID = MCDesc.getImplicitUses()[I];
     Read.SchedClassID = SchedClassID;
-    LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex << ", RegisterID="
+    LLVM_DEBUG(dbgs() << "\t\t[Use][I] OpIdx=" << ~Read.OpIndex
+                      << ", UseIndex=" << Read.UseIndex << ", RegisterID="
                       << MRI.getName(Read.RegisterID) << '\n');
   }
-  return ErrorSuccess();
+
+  CurrentUse += NumImplicitUses;
+
+  // FIXME: If an instruction opcode is marked as 'mayLoad', and it has no
+  // "unmodeledSideEffects", then this logic optimistically assumes that any
+  // extra register operands in the variadic sequence are not register
+  // definition.
+
+  bool AssumeDefsOnly = !MCDesc.mayStore() && MCDesc.mayLoad() &&
+                        !MCDesc.hasUnmodeledSideEffects();
+  for (unsigned I = 0, OpIndex = MCDesc.getNumOperands();
+       I < NumVariadicOps && !AssumeDefsOnly; ++I, ++OpIndex) {
+    const MCOperand &Op = MCI.getOperand(OpIndex);
+    if (!Op.isReg())
+      continue;
+
+    ReadDescriptor &Read = ID.Reads[CurrentUse];
+    Read.OpIndex = OpIndex;
+    Read.UseIndex = NumExplicitUses + NumImplicitUses + I;
+    Read.SchedClassID = SchedClassID;
+    ++CurrentUse;
+    LLVM_DEBUG(dbgs() << "\t\t[Use][V] OpIdx=" << Read.OpIndex
+                      << ", UseIndex=" << Read.UseIndex << '\n');
+  }
+
+  ID.Reads.resize(CurrentUse);
 }
 
 Error InstrBuilder::verifyInstrDesc(const InstrDesc &ID,
@@ -364,10 +492,11 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
 
   // Then obtain the scheduling class information from the instruction.
   unsigned SchedClassID = MCDesc.getSchedClass();
-  unsigned CPUID = SM.getProcessorID();
+  bool IsVariant = SM.getSchedClassDesc(SchedClassID)->isVariant();
 
   // Try to solve variant scheduling classes.
-  if (SchedClassID) {
+  if (IsVariant) {
+    unsigned CPUID = SM.getProcessorID();
     while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
       SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);
 
@@ -389,30 +518,36 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
   std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
   ID->NumMicroOps = SCDesc.NumMicroOps;
 
-  if (MCDesc.isCall()) {
+  if (MCDesc.isCall() && FirstCallInst) {
     // We don't correctly model calls.
     WithColor::warning() << "found a call in the input assembly sequence.\n";
     WithColor::note() << "call instructions are not correctly modeled. "
                       << "Assume a latency of 100cy.\n";
+    FirstCallInst = false;
   }
 
-  if (MCDesc.isReturn()) {
+  if (MCDesc.isReturn() && FirstReturnInst) {
     WithColor::warning() << "found a return instruction in the input"
                          << " assembly sequence.\n";
     WithColor::note() << "program counter updates are ignored.\n";
+    FirstReturnInst = false;
   }
 
   ID->MayLoad = MCDesc.mayLoad();
   ID->MayStore = MCDesc.mayStore();
   ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();
+  ID->BeginGroup = SCDesc.BeginGroup;
+  ID->EndGroup = SCDesc.EndGroup;
 
   initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
   computeMaxLatency(*ID, MCDesc, SCDesc, STI);
-  if (auto Err = populateWrites(*ID, MCI, SchedClassID))
-    return std::move(Err);
-  if (auto Err = populateReads(*ID, MCI, SchedClassID))
+
+  if (Error Err = verifyOperands(MCDesc, MCI))
     return std::move(Err);
 
+  populateWrites(*ID, MCI, SchedClassID);
+  populateReads(*ID, MCI, SchedClassID);
+
   LLVM_DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n');
   LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');
 
@@ -422,7 +557,8 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
 
   // Now add the new descriptor.
   SchedClassID = MCDesc.getSchedClass();
-  if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) {
+  bool IsVariadic = MCDesc.isVariadic();
+  if (!IsVariadic && !IsVariant) {
     Descriptors[MCI.getOpcode()] = std::move(ID);
     return *Descriptors[MCI.getOpcode()];
   }
@@ -453,12 +589,16 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
   // Check if this is a dependency breaking instruction.
   APInt Mask;
 
-  unsigned ProcID = STI.getSchedModel().getProcessorID();
-  bool IsZeroIdiom = MCIA.isZeroIdiom(MCI, Mask, ProcID);
-  bool IsDepBreaking =
-      IsZeroIdiom || MCIA.isDependencyBreaking(MCI, Mask, ProcID);
-  if (MCIA.isOptimizableRegisterMove(MCI, ProcID))
-    NewIS->setOptimizableMove();
+  bool IsZeroIdiom = false;
+  bool IsDepBreaking = false;
+  if (MCIA) {
+    unsigned ProcID = STI.getSchedModel().getProcessorID();
+    IsZeroIdiom = MCIA->isZeroIdiom(MCI, Mask, ProcID);
+    IsDepBreaking =
+        IsZeroIdiom || MCIA->isDependencyBreaking(MCI, Mask, ProcID);
+    if (MCIA->isOptimizableRegisterMove(MCI, ProcID))
+      NewIS->setOptimizableMove();
+  }
 
   // Initialize Reads first.
   for (const ReadDescriptor &RD : D.Reads) {
@@ -515,7 +655,8 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
 
   // Now query the MCInstrAnalysis object to obtain information about which
   // register writes implicitly clear the upper portion of a super-register.
-  MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
+  if (MCIA)
+    MCIA->clearsSuperRegisters(MRI, MCI, WriteMask);
 
   // Initialize writes.
   unsigned WriteIndex = 0;
@@ -529,9 +670,9 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
     }
 
     assert(RegID && "Expected a valid register ID!");
-    NewIS->getDefs().emplace_back(
-        WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex],
-        /* WritesZero */ IsZeroIdiom);
+    NewIS->getDefs().emplace_back(WD, RegID,
+                                  /* ClearsSuperRegs */ WriteMask[WriteIndex],
+                                  /* WritesZero */ IsZeroIdiom);
     ++WriteIndex;
   }
 
diff --git a/tools/llvm-mca/lib/Instruction.cpp b/lib/MCA/Instruction.cpp
similarity index 83%
rename from tools/llvm-mca/lib/Instruction.cpp
rename to lib/MCA/Instruction.cpp
index 832a6199f00ca363891cd3e5c780c2bb921f9b96..057e95ca9990fdf967cda1c741c5e8e2ce453db7 100644
--- a/tools/llvm-mca/lib/Instruction.cpp
+++ b/lib/MCA/Instruction.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Instruction.h"
+#include "llvm/MCA/Instruction.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -49,6 +49,10 @@ void WriteState::onInstructionIssued() {
     unsigned ReadCycles = std::max(0, CyclesLeft - User.second);
     RS->writeStartEvent(ReadCycles);
   }
+
+  // Notify any writes that are in a false dependency with this write.
+  if (PartialWrite)
+    PartialWrite->writeStartEvent(CyclesLeft);
 }
 
 void WriteState::addUser(ReadState *User, int ReadAdvance) {
@@ -61,8 +65,22 @@ void WriteState::addUser(ReadState *User, int ReadAdvance) {
     return;
   }
 
-  std::pair<ReadState *, int> NewPair(User, ReadAdvance);
-  Users.insert(NewPair);
+  if (llvm::find_if(Users, [&User](const std::pair<ReadState *, int> &Use) {
+        return Use.first == User;
+      }) == Users.end()) {
+    Users.emplace_back(User, ReadAdvance);
+  }
+}
+
+void WriteState::addUser(WriteState *User) {
+  if (CyclesLeft != UNKNOWN_CYCLES) {
+    User->writeStartEvent(std::max(0, CyclesLeft));
+    return;
+  }
+
+  assert(!PartialWrite && "PartialWrite already set!");
+  PartialWrite = User;
+  User->setDependentWrite(this);
 }
 
 void WriteState::cycleEvent() {
@@ -71,6 +89,9 @@ void WriteState::cycleEvent() {
   // specify a negative ReadAdvance.
   if (CyclesLeft != UNKNOWN_CYCLES)
     CyclesLeft--;
+
+  if (DependentWriteCyclesLeft)
+    DependentWriteCyclesLeft--;
 }
 
 void ReadState::cycleEvent() {
@@ -143,13 +164,11 @@ void Instruction::update() {
 
   // A partial register write cannot complete before a dependent write.
   auto IsDefReady = [&](const WriteState &Def) {
-    if (const WriteState *Write = Def.getDependentWrite()) {
-      int WriteLatency = Write->getCyclesLeft();
-      if (WriteLatency == UNKNOWN_CYCLES)
-        return false;
-      return static_cast<unsigned>(WriteLatency) < getLatency();
+    if (!Def.getDependentWrite()) {
+      unsigned CyclesLeft = Def.getDependentWriteCyclesLeft();
+      return !CyclesLeft || CyclesLeft < getLatency();
     }
-    return true;
+    return false;
   };
 
   if (all_of(getDefs(), IsDefReady))
@@ -164,6 +183,9 @@ void Instruction::cycleEvent() {
     for (ReadState &Use : getUses())
       Use.cycleEvent();
 
+    for (WriteState &Def : getDefs())
+      Def.cycleEvent();
+
     update();
     return;
   }
diff --git a/tools/llvm-mca/lib/LLVMBuild.txt b/lib/MCA/LLVMBuild.txt
similarity index 100%
rename from tools/llvm-mca/lib/LLVMBuild.txt
rename to lib/MCA/LLVMBuild.txt
diff --git a/tools/llvm-mca/lib/Pipeline.cpp b/lib/MCA/Pipeline.cpp
similarity index 94%
rename from tools/llvm-mca/lib/Pipeline.cpp
rename to lib/MCA/Pipeline.cpp
index 309f415913d746b53639293e354266cc503bca9d..fd97ea624b839c4be09a0357710017973b987e84 100644
--- a/tools/llvm-mca/lib/Pipeline.cpp
+++ b/lib/MCA/Pipeline.cpp
@@ -13,8 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Pipeline.h"
-#include "HWEventListener.h"
+#include "llvm/MCA/Pipeline.h"
+#include "llvm/MCA/HWEventListener.h"
 #include "llvm/Support/Debug.h"
 
 namespace llvm {
@@ -35,18 +35,18 @@ bool Pipeline::hasWorkToProcess() {
   });
 }
 
-Error Pipeline::run() {
+Expected<unsigned> Pipeline::run() {
   assert(!Stages.empty() && "Unexpected empty pipeline found!");
 
   do {
     notifyCycleBegin();
     if (Error Err = runCycle())
-      return Err;
+      return std::move(Err);
     notifyCycleEnd();
     ++Cycles;
   } while (hasWorkToProcess());
 
-  return ErrorSuccess();
+  return Cycles;
 }
 
 Error Pipeline::runCycle() {
diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/lib/MCA/Stages/DispatchStage.cpp
similarity index 95%
rename from tools/llvm-mca/lib/Stages/DispatchStage.cpp
rename to lib/MCA/Stages/DispatchStage.cpp
index 838dbad22e3ad5c03ffb5d1c8fda39374bb914d0..7fb4eb6a1c0ef85b3b1a6a03157a2df1197f3cfe 100644
--- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp
+++ b/lib/MCA/Stages/DispatchStage.cpp
@@ -16,9 +16,9 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Stages/DispatchStage.h"
-#include "HWEventListener.h"
-#include "HardwareUnits/Scheduler.h"
+#include "llvm/MCA/Stages/DispatchStage.h"
+#include "llvm/MCA/HWEventListener.h"
+#include "llvm/MCA/HardwareUnits/Scheduler.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "llvm-mca"
@@ -99,6 +99,10 @@ Error DispatchStage::dispatch(InstRef IR) {
     AvailableEntries -= NumMicroOps;
   }
 
+  // Check if this instructions ends the dispatch group.
+  if (Desc.EndGroup)
+    AvailableEntries = 0;
+
   // Check if this is an optimizable reg-reg move.
   bool IsEliminated = false;
   if (IS.isOptimizableMove()) {
@@ -164,6 +168,10 @@ bool DispatchStage::isAvailable(const InstRef &IR) const {
   unsigned Required = std::min(Desc.NumMicroOps, DispatchWidth);
   if (Required > AvailableEntries)
     return false;
+
+  if (Desc.BeginGroup && AvailableEntries != DispatchWidth)
+    return false;
+
   // The dispatch logic doesn't internally buffer instructions.  It only accepts
   // instructions that can be successfully moved to the next stage during this
   // same cycle.
diff --git a/tools/llvm-mca/lib/Stages/FetchStage.cpp b/lib/MCA/Stages/EntryStage.cpp
similarity index 66%
rename from tools/llvm-mca/lib/Stages/FetchStage.cpp
rename to lib/MCA/Stages/EntryStage.cpp
index 6e91dd6121d35fb18f00fdd6583cdeb22029fef9..3325bb36f5afe91e9bcf9f04064f2029fe8975c0 100644
--- a/tools/llvm-mca/lib/Stages/FetchStage.cpp
+++ b/lib/MCA/Stages/EntryStage.cpp
@@ -1,4 +1,4 @@
-//===---------------------- FetchStage.cpp ----------------------*- C++ -*-===//
+//===---------------------- EntryStage.cpp ----------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,32 +13,32 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Stages/FetchStage.h"
-#include "Instruction.h"
+#include "llvm/MCA/Stages/EntryStage.h"
+#include "llvm/MCA/Instruction.h"
 
 namespace llvm {
 namespace mca {
 
-bool FetchStage::hasWorkToComplete() const { return CurrentInstruction; }
+bool EntryStage::hasWorkToComplete() const { return CurrentInstruction; }
 
-bool FetchStage::isAvailable(const InstRef & /* unused */) const {
+bool EntryStage::isAvailable(const InstRef & /* unused */) const {
   if (CurrentInstruction)
     return checkNextStage(CurrentInstruction);
   return false;
 }
 
-void FetchStage::getNextInstruction() {
+void EntryStage::getNextInstruction() {
   assert(!CurrentInstruction && "There is already an instruction to process!");
   if (!SM.hasNext())
     return;
   SourceRef SR = SM.peekNext();
   std::unique_ptr<Instruction> Inst = llvm::make_unique<Instruction>(SR.second);
   CurrentInstruction = InstRef(SR.first, Inst.get());
-  Instructions[SR.first] = std::move(Inst);
+  Instructions.emplace_back(std::move(Inst));
   SM.updateNext();
 }
 
-llvm::Error FetchStage::execute(InstRef & /*unused */) {
+llvm::Error EntryStage::execute(InstRef & /*unused */) {
   assert(CurrentInstruction && "There is no instruction to process!");
   if (llvm::Error Val = moveToTheNextStage(CurrentInstruction))
     return Val;
@@ -49,22 +49,25 @@ llvm::Error FetchStage::execute(InstRef & /*unused */) {
   return llvm::ErrorSuccess();
 }
 
-llvm::Error FetchStage::cycleStart() {
+llvm::Error EntryStage::cycleStart() {
   if (!CurrentInstruction)
     getNextInstruction();
   return llvm::ErrorSuccess();
 }
 
-llvm::Error FetchStage::cycleEnd() {
+llvm::Error EntryStage::cycleEnd() {
   // Find the first instruction which hasn't been retired.
-  const InstMap::iterator It =
-      llvm::find_if(Instructions, [](const InstMap::value_type &KeyValuePair) {
-        return !KeyValuePair.second->isRetired();
-      });
+  auto Range = make_range(&Instructions[NumRetired], Instructions.end());
+  auto It = find_if(Range, [](const std::unique_ptr<Instruction> &I) {
+    return !I->isRetired();
+  });
 
+  NumRetired = std::distance(Instructions.begin(), It);
   // Erase instructions up to the first that hasn't been retired.
-  if (It != Instructions.begin())
+  if ((NumRetired * 2) >= Instructions.size()) {
     Instructions.erase(Instructions.begin(), It);
+    NumRetired = 0;
+  }
 
   return llvm::ErrorSuccess();
 }
diff --git a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp b/lib/MCA/Stages/ExecuteStage.cpp
similarity index 99%
rename from tools/llvm-mca/lib/Stages/ExecuteStage.cpp
rename to lib/MCA/Stages/ExecuteStage.cpp
index 298f08a2887f951117d3a79d629ee705a8e1d66e..17f7ff7259a3e963bc5e003c011f417ab130c4e6 100644
--- a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
+++ b/lib/MCA/Stages/ExecuteStage.cpp
@@ -15,7 +15,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/ExecuteStage.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
 
diff --git a/tools/llvm-mca/lib/Stages/InstructionTables.cpp b/lib/MCA/Stages/InstructionTables.cpp
similarity index 98%
rename from tools/llvm-mca/lib/Stages/InstructionTables.cpp
rename to lib/MCA/Stages/InstructionTables.cpp
index 33c30e7f95c0f7142b0732df0a20d858f0550cbf..f918c183aa5a0486452a4cd5dac3f80deb99f1f3 100644
--- a/tools/llvm-mca/lib/Stages/InstructionTables.cpp
+++ b/lib/MCA/Stages/InstructionTables.cpp
@@ -15,7 +15,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Stages/InstructionTables.h"
+#include "llvm/MCA/Stages/InstructionTables.h"
 
 namespace llvm {
 namespace mca {
diff --git a/tools/llvm-mca/lib/Stages/RetireStage.cpp b/lib/MCA/Stages/RetireStage.cpp
similarity index 95%
rename from tools/llvm-mca/lib/Stages/RetireStage.cpp
rename to lib/MCA/Stages/RetireStage.cpp
index 47eed5f2c9c651beefae47f364ee0f9d764dda85..d6bcc518662f5323e87c2a21c93a237e328b667f 100644
--- a/tools/llvm-mca/lib/Stages/RetireStage.cpp
+++ b/lib/MCA/Stages/RetireStage.cpp
@@ -14,8 +14,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Stages/RetireStage.h"
-#include "HWEventListener.h"
+#include "llvm/MCA/Stages/RetireStage.h"
+#include "llvm/MCA/HWEventListener.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "llvm-mca"
diff --git a/tools/llvm-mca/lib/Stages/Stage.cpp b/lib/MCA/Stages/Stage.cpp
similarity index 95%
rename from tools/llvm-mca/lib/Stages/Stage.cpp
rename to lib/MCA/Stages/Stage.cpp
index c3cfe47d24e19feba002cec357a148355685da91..38191645e736dab45a0543e5cf7800c74b9aaaed 100644
--- a/tools/llvm-mca/lib/Stages/Stage.cpp
+++ b/lib/MCA/Stages/Stage.cpp
@@ -13,7 +13,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Stages/Stage.h"
+#include "llvm/MCA/Stages/Stage.h"
 
 namespace llvm {
 namespace mca {
diff --git a/tools/llvm-mca/lib/Support.cpp b/lib/MCA/Support.cpp
similarity index 98%
rename from tools/llvm-mca/lib/Support.cpp
rename to lib/MCA/Support.cpp
index a6ff26dafb5baa49e52d1fd3291c3bd22596c712..3271bc6bf5b4f103fcd046cd5c48547dccc96f5e 100644
--- a/tools/llvm-mca/lib/Support.cpp
+++ b/lib/MCA/Support.cpp
@@ -13,7 +13,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Support.h"
+#include "llvm/MCA/Support.h"
 #include "llvm/MC/MCSchedule.h"
 
 namespace llvm {
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index f09401307b40f6b2f6e1d0981d933c1c0eb27666..e17f0e2783e1e838c728cbb2d33a9283ca7e9c78 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -938,6 +938,18 @@ iterator_range<base_reloc_iterator> COFFObjectFile::base_relocs() const {
   return make_range(base_reloc_begin(), base_reloc_end());
 }
 
+std::error_code
+COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const {
+  Res = COFFHeader;
+  return std::error_code();
+}
+
+std::error_code
+COFFObjectFile::getCOFFBigObjHeader(const coff_bigobj_file_header *&Res) const {
+  Res = COFFBigObjHeader;
+  return std::error_code();
+}
+
 std::error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const {
   Res = PE32Header;
   return std::error_code();
@@ -1284,6 +1296,12 @@ bool COFFObjectFile::isRelocatableObject() const {
   return !DataDirectory;
 }
 
+StringRef COFFObjectFile::mapDebugSectionName(StringRef Name) const {
+  return StringSwitch<StringRef>(Name)
+      .Case("eh_fram", "eh_frame")
+      .Default(Name);
+}
+
 bool ImportDirectoryEntryRef::
 operator==(const ImportDirectoryEntryRef &Other) const {
   return ImportTable == Other.ImportTable && Index == Other.Index;
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index 2edab0b13735eca0432f1d6a6080212286008ff0..cf8313f88f9394ed4a68b74295f9662714ed9da2 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -154,7 +154,7 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
 
 #undef ELF_RELOC
 
-uint32_t llvm::object::getELFRelrRelocationType(uint32_t Machine) {
+uint32_t llvm::object::getELFRelativeRelocationType(uint32_t Machine) {
   switch (Machine) {
   case ELF::EM_X86_64:
     return ELF::R_X86_64_RELATIVE;
@@ -300,7 +300,7 @@ ELFFile<ELFT>::decode_relrs(Elf_Relr_Range relrs) const {
   Elf_Rela Rela;
   Rela.r_info = 0;
   Rela.r_addend = 0;
-  Rela.setType(getRelrRelocationType(), false);
+  Rela.setType(getRelativeRelocationType(), false);
   std::vector<Elf_Rela> Relocs;
 
   // Word type: uint32_t for Elf32, and uint64_t for Elf64.
diff --git a/lib/Object/ModuleSymbolTable.cpp b/lib/Object/ModuleSymbolTable.cpp
index b353ef3c835bca6396f7b7a165f6f5ed561f421b..33ce7d8109fb8a74083dd6a85205df2603d45346 100644
--- a/lib/Object/ModuleSymbolTable.cpp
+++ b/lib/Object/ModuleSymbolTable.cpp
@@ -100,6 +100,7 @@ initializeRecordStreamer(const Module &M,
   MCObjectFileInfo MOFI;
   MCContext MCCtx(MAI.get(), MRI.get(), &MOFI);
   MOFI.InitMCObjectFileInfo(TT, /*PIC*/ false, MCCtx);
+  MOFI.setSDKVersion(M.getSDKVersion());
   RecordStreamer Streamer(MCCtx, M);
   T->createNullTargetStreamer(Streamer);
 
diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp
index 5fd823e0117ed6b942625dcb3b9eb0c074691a34..f5de2e1d5ce23b8cae7d7ff4d956ad589d61b22f 100644
--- a/lib/Object/Object.cpp
+++ b/lib/Object/Object.cpp
@@ -105,7 +105,7 @@ void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect,
   if (!SecOrErr) {
    std::string Buf;
    raw_string_ostream OS(Buf);
-   logAllUnhandledErrors(SecOrErr.takeError(), OS, "");
+   logAllUnhandledErrors(SecOrErr.takeError(), OS);
    OS.flush();
    report_fatal_error(Buf);
   }
@@ -187,7 +187,7 @@ const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI) {
   if (!Ret) {
     std::string Buf;
     raw_string_ostream OS(Buf);
-    logAllUnhandledErrors(Ret.takeError(), OS, "");
+    logAllUnhandledErrors(Ret.takeError(), OS);
     OS.flush();
     report_fatal_error(Buf);
   }
@@ -199,7 +199,7 @@ uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) {
   if (!Ret) {
     std::string Buf;
     raw_string_ostream OS(Buf);
-    logAllUnhandledErrors(Ret.takeError(), OS, "");
+    logAllUnhandledErrors(Ret.takeError(), OS);
     OS.flush();
     report_fatal_error(Buf);
   }
@@ -229,7 +229,7 @@ const char *LLVMGetRelocationTypeName(LLVMRelocationIteratorRef RI) {
   SmallVector<char, 0> ret;
   (*unwrap(RI))->getTypeName(ret);
   char *str = static_cast<char*>(safe_malloc(ret.size()));
-  std::copy(ret.begin(), ret.end(), str);
+  llvm::copy(ret, str);
   return str;
 }
 
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index db0ff220c4d89947658b8931c7135632ba582393..cf63b89adc122a516d69a8adc37f368b8b027404 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -77,6 +77,14 @@ bool ObjectFile::isSectionBitcode(DataRefImpl Sec) const {
 
 bool ObjectFile::isSectionStripped(DataRefImpl Sec) const { return false; }
 
+bool ObjectFile::isBerkeleyText(DataRefImpl Sec) const {
+  return isSectionText(Sec);
+}
+
+bool ObjectFile::isBerkeleyData(DataRefImpl Sec) const {
+  return isSectionData(Sec);
+}
+
 section_iterator ObjectFile::getRelocatedSection(DataRefImpl Sec) const {
   return section_iterator(SectionRef(Sec, this));
 }
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index 3bd66f9375fe670eab34e7b8f41839c454a8505a..34e4d192e590a5f4b42287ac7b1ee16b3c6b89f7 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
+#include "llvm/Support/ScopedPrinter.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -207,8 +208,8 @@ static wasm::WasmTable readTable(WasmObjectFile::ReadContext &Ctx) {
   return Table;
 }
 
-static Error readSection(WasmSection &Section,
-                         WasmObjectFile::ReadContext &Ctx) {
+static Error readSection(WasmSection &Section, WasmObjectFile::ReadContext &Ctx,
+                         WasmSectionOrderChecker &Checker) {
   Section.Offset = Ctx.Ptr - Ctx.Start;
   Section.Type = readUint8(Ctx);
   LLVM_DEBUG(dbgs() << "readSection type=" << Section.Type << "\n");
@@ -231,6 +232,13 @@ static Error readSection(WasmSection &Section,
     Ctx.Ptr += SectionNameSize;
     Size -= SectionNameSize;
   }
+
+  if (!Checker.isValidSectionOrder(Section.Type, Section.Name)) {
+    return make_error<StringError>("Out of order section type: " +
+                                       llvm::to_string(Section.Type),
+                                   object_error::parse_failed);
+  }
+
   Section.Content = ArrayRef<uint8_t>(Ctx.Ptr, Size);
   Ctx.Ptr += Size;
   return Error::success();
@@ -265,8 +273,9 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
   }
 
   WasmSection Sec;
+  WasmSectionOrderChecker Checker;
   while (Ctx.Ptr < Ctx.End) {
-    if ((Err = readSection(Sec, Ctx)))
+    if ((Err = readSection(Sec, Ctx, Checker)))
       return;
     if ((Err = parseSection(Sec)))
       return;
@@ -295,6 +304,8 @@ Error WasmObjectFile::parseSection(WasmSection &Sec) {
     return parseMemorySection(Ctx);
   case wasm::WASM_SEC_GLOBAL:
     return parseGlobalSection(Ctx);
+  case wasm::WASM_SEC_EVENT:
+    return parseEventSection(Ctx);
   case wasm::WASM_SEC_EXPORT:
     return parseExportSection(Ctx);
   case wasm::WASM_SEC_START:
@@ -311,6 +322,22 @@ Error WasmObjectFile::parseSection(WasmSection &Sec) {
   }
 }
 
+Error WasmObjectFile::parseDylinkSection(ReadContext &Ctx) {
+  // See https://github.com/WebAssembly/tool-conventions/blob/master/DynamicLinking.md
+  DylinkInfo.MemorySize = readVaruint32(Ctx);
+  DylinkInfo.MemoryAlignment = readVaruint32(Ctx);
+  DylinkInfo.TableSize = readVaruint32(Ctx);
+  DylinkInfo.TableAlignment = readVaruint32(Ctx);
+  uint32_t Count = readVaruint32(Ctx);
+  while (Count--) {
+    DylinkInfo.Needed.push_back(readString(Ctx));
+  }
+  if (Ctx.Ptr != Ctx.End)
+    return make_error<GenericBinaryError>("dylink section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
 Error WasmObjectFile::parseNameSection(ReadContext &Ctx) {
   llvm::DenseSet<uint64_t> Seen;
   if (Functions.size() != FunctionTypes.size()) {
@@ -439,19 +466,24 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
 
   std::vector<wasm::WasmImport *> ImportedGlobals;
   std::vector<wasm::WasmImport *> ImportedFunctions;
+  std::vector<wasm::WasmImport *> ImportedEvents;
   ImportedGlobals.reserve(Imports.size());
   ImportedFunctions.reserve(Imports.size());
+  ImportedEvents.reserve(Imports.size());
   for (auto &I : Imports) {
     if (I.Kind == wasm::WASM_EXTERNAL_FUNCTION)
       ImportedFunctions.emplace_back(&I);
     else if (I.Kind == wasm::WASM_EXTERNAL_GLOBAL)
       ImportedGlobals.emplace_back(&I);
+    else if (I.Kind == wasm::WASM_EXTERNAL_EVENT)
+      ImportedEvents.emplace_back(&I);
   }
 
   while (Count--) {
     wasm::WasmSymbolInfo Info;
-    const wasm::WasmSignature *FunctionType = nullptr;
+    const wasm::WasmSignature *Signature = nullptr;
     const wasm::WasmGlobalType *GlobalType = nullptr;
+    const wasm::WasmEventType *EventType = nullptr;
 
     Info.Kind = readUint8(Ctx);
     Info.Flags = readVaruint32(Ctx);
@@ -467,13 +499,13 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
       if (IsDefined) {
         Info.Name = readString(Ctx);
         unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions;
-        FunctionType = &Signatures[FunctionTypes[FuncIndex]];
+        Signature = &Signatures[FunctionTypes[FuncIndex]];
         wasm::WasmFunction &Function = Functions[FuncIndex];
         if (Function.SymbolName.empty())
           Function.SymbolName = Info.Name;
       } else {
         wasm::WasmImport &Import = *ImportedFunctions[Info.ElementIndex];
-        FunctionType = &Signatures[Import.SigIndex];
+        Signature = &Signatures[Import.SigIndex];
         Info.Name = Import.Field;
         Info.Module = Import.Module;
       }
@@ -532,6 +564,34 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
       break;
     }
 
+    case wasm::WASM_SYMBOL_TYPE_EVENT: {
+      Info.ElementIndex = readVaruint32(Ctx);
+      if (!isValidEventIndex(Info.ElementIndex) ||
+          IsDefined != isDefinedEventIndex(Info.ElementIndex))
+        return make_error<GenericBinaryError>("invalid event symbol index",
+                                              object_error::parse_failed);
+      if (!IsDefined && (Info.Flags & wasm::WASM_SYMBOL_BINDING_MASK) ==
+                            wasm::WASM_SYMBOL_BINDING_WEAK)
+        return make_error<GenericBinaryError>("undefined weak global symbol",
+                                              object_error::parse_failed);
+      if (IsDefined) {
+        Info.Name = readString(Ctx);
+        unsigned EventIndex = Info.ElementIndex - NumImportedEvents;
+        wasm::WasmEvent &Event = Events[EventIndex];
+        Signature = &Signatures[Event.Type.SigIndex];
+        EventType = &Event.Type;
+        if (Event.SymbolName.empty())
+          Event.SymbolName = Info.Name;
+
+      } else {
+        wasm::WasmImport &Import = *ImportedEvents[Info.ElementIndex];
+        EventType = &Import.Event;
+        Signature = &Signatures[EventType->SigIndex];
+        Info.Name = Import.Field;
+      }
+      break;
+    }
+
     default:
       return make_error<GenericBinaryError>("Invalid symbol type",
                                             object_error::parse_failed);
@@ -544,8 +604,8 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
                                                 Twine(Info.Name),
                                             object_error::parse_failed);
     LinkingData.SymbolTable.emplace_back(Info);
-    Symbols.emplace_back(LinkingData.SymbolTable.back(), FunctionType,
-                         GlobalType);
+    Symbols.emplace_back(LinkingData.SymbolTable.back(), GlobalType, EventType,
+                         Signature);
     LLVM_DEBUG(dbgs() << "Adding symbol: " << Symbols.back() << "\n");
   }
 
@@ -635,6 +695,11 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
         return make_error<GenericBinaryError>("Bad relocation global index",
                                               object_error::parse_failed);
       break;
+    case wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB:
+      if (!isValidEventSymbol(Reloc.Index))
+        return make_error<GenericBinaryError>("Bad relocation event index",
+                                              object_error::parse_failed);
+      break;
     case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB:
     case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
     case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
@@ -683,7 +748,10 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
 }
 
 Error WasmObjectFile::parseCustomSection(WasmSection &Sec, ReadContext &Ctx) {
-  if (Sec.Name == "name") {
+  if (Sec.Name == "dylink") {
+    if (Error Err = parseDylinkSection(Ctx))
+      return Err;
+  } else if (Sec.Name == "name") {
     if (Error Err = parseNameSection(Ctx))
       return Err;
   } else if (Sec.Name == "linking") {
@@ -755,6 +823,11 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) {
         return make_error<GenericBinaryError>("Invalid table element type",
                                               object_error::parse_failed);
       break;
+    case wasm::WASM_EXTERNAL_EVENT:
+      NumImportedEvents++;
+      Im.Event.Attribute = readVarint32(Ctx);
+      Im.Event.SigIndex = readVarint32(Ctx);
+      break;
     default:
       return make_error<GenericBinaryError>("Unexpected import kind",
                                             object_error::parse_failed);
@@ -831,6 +904,24 @@ Error WasmObjectFile::parseGlobalSection(ReadContext &Ctx) {
   return Error::success();
 }
 
+Error WasmObjectFile::parseEventSection(ReadContext &Ctx) {
+  EventSection = Sections.size();
+  uint32_t Count = readVarint32(Ctx);
+  Events.reserve(Count);
+  while (Count--) {
+    wasm::WasmEvent Event;
+    Event.Index = NumImportedEvents + Events.size();
+    Event.Type.Attribute = readVaruint32(Ctx);
+    Event.Type.SigIndex = readVarint32(Ctx);
+    Events.push_back(Event);
+  }
+
+  if (Ctx.Ptr != Ctx.End)
+    return make_error<GenericBinaryError>("Event section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
 Error WasmObjectFile::parseExportSection(ReadContext &Ctx) {
   uint32_t Count = readVaruint32(Ctx);
   Exports.reserve(Count);
@@ -850,6 +941,11 @@ Error WasmObjectFile::parseExportSection(ReadContext &Ctx) {
         return make_error<GenericBinaryError>("Invalid global export",
                                               object_error::parse_failed);
       break;
+    case wasm::WASM_EXTERNAL_EVENT:
+      if (!isValidEventIndex(Ex.Index))
+        return make_error<GenericBinaryError>("Invalid event export",
+                                              object_error::parse_failed);
+      break;
     case wasm::WASM_EXTERNAL_MEMORY:
     case wasm::WASM_EXTERNAL_TABLE:
       break;
@@ -881,6 +977,14 @@ bool WasmObjectFile::isDefinedGlobalIndex(uint32_t Index) const {
   return Index >= NumImportedGlobals && isValidGlobalIndex(Index);
 }
 
+bool WasmObjectFile::isValidEventIndex(uint32_t Index) const {
+  return Index < NumImportedEvents + Events.size();
+}
+
+bool WasmObjectFile::isDefinedEventIndex(uint32_t Index) const {
+  return Index >= NumImportedEvents && isValidEventIndex(Index);
+}
+
 bool WasmObjectFile::isValidFunctionSymbol(uint32_t Index) const {
   return Index < Symbols.size() && Symbols[Index].isTypeFunction();
 }
@@ -889,6 +993,10 @@ bool WasmObjectFile::isValidGlobalSymbol(uint32_t Index) const {
   return Index < Symbols.size() && Symbols[Index].isTypeGlobal();
 }
 
+bool WasmObjectFile::isValidEventSymbol(uint32_t Index) const {
+  return Index < Symbols.size() && Symbols[Index].isTypeEvent();
+}
+
 bool WasmObjectFile::isValidDataSymbol(uint32_t Index) const {
   return Index < Symbols.size() && Symbols[Index].isTypeData();
 }
@@ -907,6 +1015,11 @@ wasm::WasmGlobal &WasmObjectFile::getDefinedGlobal(uint32_t Index) {
   return Globals[Index - NumImportedGlobals];
 }
 
+wasm::WasmEvent &WasmObjectFile::getDefinedEvent(uint32_t Index) {
+  assert(isDefinedEventIndex(Index));
+  return Events[Index - NumImportedEvents];
+}
+
 Error WasmObjectFile::parseStartSection(ReadContext &Ctx) {
   StartFunction = readVaruint32(Ctx);
   if (!isValidFunctionIndex(StartFunction))
@@ -1070,6 +1183,7 @@ uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol &Sym) const {
   switch (Sym.Info.Kind) {
   case wasm::WASM_SYMBOL_TYPE_FUNCTION:
   case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+  case wasm::WASM_SYMBOL_TYPE_EVENT:
     return Sym.Info.ElementIndex;
   case wasm::WASM_SYMBOL_TYPE_DATA: {
     // The value of a data symbol is the segment offset, plus the symbol
@@ -1112,6 +1226,8 @@ WasmObjectFile::getSymbolType(DataRefImpl Symb) const {
     return SymbolRef::ST_Data;
   case wasm::WASM_SYMBOL_TYPE_SECTION:
     return SymbolRef::ST_Debug;
+  case wasm::WASM_SYMBOL_TYPE_EVENT:
+    return SymbolRef::ST_Other;
   }
 
   llvm_unreachable("Unknown WasmSymbol::SymbolType");
@@ -1135,10 +1251,12 @@ WasmObjectFile::getSymbolSection(DataRefImpl Symb) const {
   case wasm::WASM_SYMBOL_TYPE_DATA:
     Ref.d.a = DataSection;
     break;
-  case wasm::WASM_SYMBOL_TYPE_SECTION: {
+  case wasm::WASM_SYMBOL_TYPE_SECTION:
     Ref.d.a = Sym.Info.ElementIndex;
     break;
-  }
+  case wasm::WASM_SYMBOL_TYPE_EVENT:
+    Ref.d.a = EventSection;
+    break;
   default:
     llvm_unreachable("Unknown WasmSymbol::SymbolType");
   }
@@ -1161,6 +1279,7 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
     ECase(TABLE);
     ECase(MEMORY);
     ECase(GLOBAL);
+    ECase(EVENT);
     ECase(EXPORT);
     ECase(START);
     ECase(ELEM);
@@ -1299,6 +1418,8 @@ SubtargetFeatures WasmObjectFile::getFeatures() const {
 
 bool WasmObjectFile::isRelocatableObject() const { return HasLinkingSection; }
 
+bool WasmObjectFile::isSharedObject() const { return HasDylinkSection; }
+
 const WasmSection &WasmObjectFile::getWasmSection(DataRefImpl Ref) const {
   assert(Ref.d.a < Sections.size());
   return Sections[Ref.d.a];
@@ -1321,3 +1442,58 @@ WasmObjectFile::getWasmRelocation(DataRefImpl Ref) const {
   assert(Ref.d.b < Sec.Relocations.size());
   return Sec.Relocations[Ref.d.b];
 }
+
+int WasmSectionOrderChecker::getSectionOrder(unsigned ID,
+                                             StringRef CustomSectionName) {
+  switch (ID) {
+  case wasm::WASM_SEC_CUSTOM:
+    return StringSwitch<unsigned>(CustomSectionName)
+        .Case("dylink", WASM_SEC_ORDER_DYLINK)
+        .Case("linking", WASM_SEC_ORDER_LINKING)
+        .StartsWith("reloc.", WASM_SEC_ORDER_RELOC)
+        .Case("name", WASM_SEC_ORDER_NAME)
+        .Case("producers", WASM_SEC_ORDER_PRODUCERS)
+        .Default(-1);
+  case wasm::WASM_SEC_TYPE:
+    return WASM_SEC_ORDER_TYPE;
+  case wasm::WASM_SEC_IMPORT:
+    return WASM_SEC_ORDER_IMPORT;
+  case wasm::WASM_SEC_FUNCTION:
+    return WASM_SEC_ORDER_FUNCTION;
+  case wasm::WASM_SEC_TABLE:
+    return WASM_SEC_ORDER_TABLE;
+  case wasm::WASM_SEC_MEMORY:
+    return WASM_SEC_ORDER_MEMORY;
+  case wasm::WASM_SEC_GLOBAL:
+    return WASM_SEC_ORDER_GLOBAL;
+  case wasm::WASM_SEC_EXPORT:
+    return WASM_SEC_ORDER_EXPORT;
+  case wasm::WASM_SEC_START:
+    return WASM_SEC_ORDER_START;
+  case wasm::WASM_SEC_ELEM:
+    return WASM_SEC_ORDER_ELEM;
+  case wasm::WASM_SEC_CODE:
+    return WASM_SEC_ORDER_CODE;
+  case wasm::WASM_SEC_DATA:
+    return WASM_SEC_ORDER_DATA;
+  case wasm::WASM_SEC_DATACOUNT:
+    return WASM_SEC_ORDER_DATACOUNT;
+  case wasm::WASM_SEC_EVENT:
+    return WASM_SEC_ORDER_EVENT;
+  default:
+    llvm_unreachable("invalid section");
+  }
+}
+
+bool WasmSectionOrderChecker::isValidSectionOrder(unsigned ID,
+                                                  StringRef CustomSectionName) {
+  int Order = getSectionOrder(ID, CustomSectionName);
+  if (Order == -1) // Skip unknown sections
+    return true;
+  // There can be multiple "reloc." sections. Otherwise there shouldn't be any
+  // duplicate section orders.
+  bool IsValid = (LastOrder == Order && Order == WASM_SEC_ORDER_RELOC) ||
+                 LastOrder < Order;
+  LastOrder = Order;
+  return IsValid;
+}
diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp
index 1b7282f13db0a7cdc66310b5510ea5ff0a19b3be..65413dd8bea178170721258532099723448cb4ab 100644
--- a/lib/Object/WindowsResource.cpp
+++ b/lib/Object/WindowsResource.cpp
@@ -259,7 +259,7 @@ WindowsResourceParser::TreeNode::addChild(ArrayRef<UTF16> NameRef,
   std::vector<UTF16> EndianCorrectedName;
   if (sys::IsBigEndianHost) {
     EndianCorrectedName.resize(NameRef.size() + 1);
-    std::copy(NameRef.begin(), NameRef.end(), EndianCorrectedName.begin() + 1);
+    llvm::copy(NameRef, EndianCorrectedName.begin() + 1);
     EndianCorrectedName[0] = UNI_UTF16_BYTE_ORDER_MARK_SWAPPED;
     CorrectedName = makeArrayRef(EndianCorrectedName);
   } else
@@ -501,8 +501,7 @@ void WindowsResourceCOFFWriter::writeFirstSection() {
 void WindowsResourceCOFFWriter::writeSecondSection() {
   // Now write the .rsrc$02 section.
   for (auto const &RawDataEntry : Data) {
-    std::copy(RawDataEntry.begin(), RawDataEntry.end(),
-              BufferStart + CurrentOffset);
+    llvm::copy(RawDataEntry, BufferStart + CurrentOffset);
     CurrentOffset += alignTo(RawDataEntry.size(), sizeof(uint64_t));
   }
 
@@ -672,7 +671,7 @@ void WindowsResourceCOFFWriter::writeDirectoryStringTable() {
     support::endian::write16le(BufferStart + CurrentOffset, Length);
     CurrentOffset += sizeof(uint16_t);
     auto *Start = reinterpret_cast<UTF16 *>(BufferStart + CurrentOffset);
-    std::copy(String.begin(), String.end(), Start);
+    llvm::copy(String, Start);
     CurrentOffset += Length * sizeof(UTF16);
     TotalStringTableSize += Length * sizeof(UTF16) + sizeof(uint16_t);
   }
diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp
index 189d71782bd85d7f840a760f6c689d288b5f9975..b9f52432fc802b3accf8ed864c6dd17908e9e31b 100644
--- a/lib/ObjectYAML/ELFYAML.cpp
+++ b/lib/ObjectYAML/ELFYAML.cpp
@@ -337,10 +337,18 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCase(EF_HEXAGON_MACH_V3);
     BCase(EF_HEXAGON_MACH_V4);
     BCase(EF_HEXAGON_MACH_V5);
+    BCase(EF_HEXAGON_MACH_V55);
+    BCase(EF_HEXAGON_MACH_V60);
+    BCase(EF_HEXAGON_MACH_V62);
+    BCase(EF_HEXAGON_MACH_V65);
     BCase(EF_HEXAGON_ISA_V2);
     BCase(EF_HEXAGON_ISA_V3);
     BCase(EF_HEXAGON_ISA_V4);
     BCase(EF_HEXAGON_ISA_V5);
+    BCase(EF_HEXAGON_ISA_V55);
+    BCase(EF_HEXAGON_ISA_V60);
+    BCase(EF_HEXAGON_ISA_V62);
+    BCase(EF_HEXAGON_ISA_V65);
     break;
   case ELF::EM_AVR:
     BCase(EF_AVR_ARCH_AVR1);
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
index dba950af5892941ef6e79cd3bbb7fa542f040ebd..b97803340c0d9e4cf07fbb353f7629304db1ad13 100644
--- a/lib/ObjectYAML/WasmYAML.cpp
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -48,6 +48,16 @@ static void commonSectionMapping(IO &IO, WasmYAML::Section &Section) {
   IO.mapOptional("Relocations", Section.Relocations);
 }
 
+static void sectionMapping(IO &IO, WasmYAML::DylinkSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapRequired("Name", Section.Name);
+  IO.mapRequired("MemorySize", Section.MemorySize);
+  IO.mapRequired("MemoryAlignment", Section.MemoryAlignment);
+  IO.mapRequired("TableSize", Section.TableSize);
+  IO.mapRequired("TableAlignment", Section.TableAlignment);
+  IO.mapRequired("Needed", Section.Needed);
+}
+
 static void sectionMapping(IO &IO, WasmYAML::NameSection &Section) {
   commonSectionMapping(IO, Section);
   IO.mapRequired("Name", Section.Name);
@@ -100,6 +110,11 @@ static void sectionMapping(IO &IO, WasmYAML::GlobalSection &Section) {
   IO.mapOptional("Globals", Section.Globals);
 }
 
+static void sectionMapping(IO &IO, WasmYAML::EventSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("Events", Section.Events);
+}
+
 static void sectionMapping(IO &IO, WasmYAML::ExportSection &Section) {
   commonSectionMapping(IO, Section);
   IO.mapOptional("Exports", Section.Exports);
@@ -142,7 +157,11 @@ void MappingTraits<std::unique_ptr<WasmYAML::Section>>::mapping(
     } else {
       IO.mapRequired("Name", SectionName);
     }
-    if (SectionName == "linking") {
+    if (SectionName == "dylink") {
+      if (!IO.outputting())
+        Section.reset(new WasmYAML::DylinkSection());
+      sectionMapping(IO, *cast<WasmYAML::DylinkSection>(Section.get()));
+    } else if (SectionName == "linking") {
       if (!IO.outputting())
         Section.reset(new WasmYAML::LinkingSection());
       sectionMapping(IO, *cast<WasmYAML::LinkingSection>(Section.get()));
@@ -187,6 +206,11 @@ void MappingTraits<std::unique_ptr<WasmYAML::Section>>::mapping(
       Section.reset(new WasmYAML::GlobalSection());
     sectionMapping(IO, *cast<WasmYAML::GlobalSection>(Section.get()));
     break;
+  case wasm::WASM_SEC_EVENT:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::EventSection());
+    sectionMapping(IO, *cast<WasmYAML::EventSection>(Section.get()));
+    break;
   case wasm::WASM_SEC_EXPORT:
     if (!IO.outputting())
       Section.reset(new WasmYAML::ExportSection());
@@ -227,6 +251,7 @@ void ScalarEnumerationTraits<WasmYAML::SectionType>::enumeration(
   ECase(TABLE);
   ECase(MEMORY);
   ECase(GLOBAL);
+  ECase(EVENT);
   ECase(EXPORT);
   ECase(START);
   ECase(ELEM);
@@ -307,6 +332,9 @@ void MappingTraits<WasmYAML::Import>::mapping(IO &IO,
   } else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
     IO.mapRequired("GlobalType", Import.GlobalImport.Type);
     IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable);
+  } else if (Import.Kind == wasm::WASM_EXTERNAL_EVENT) {
+    IO.mapRequired("EventAttribute", Import.EventImport.Attribute);
+    IO.mapRequired("EventSigIndex", Import.EventImport.SigIndex);
   } else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) {
     IO.mapRequired("Table", Import.TableImport);
   } else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY) {
@@ -399,6 +427,8 @@ void MappingTraits<WasmYAML::SymbolInfo>::mapping(IO &IO,
     IO.mapRequired("Function", Info.ElementIndex);
   } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_GLOBAL) {
     IO.mapRequired("Global", Info.ElementIndex);
+  } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_EVENT) {
+    IO.mapRequired("Event", Info.ElementIndex);
   } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_DATA) {
     if ((Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0) {
       IO.mapRequired("Segment", Info.DataRef.Segment);
@@ -412,6 +442,12 @@ void MappingTraits<WasmYAML::SymbolInfo>::mapping(IO &IO,
   }
 }
 
+void MappingTraits<WasmYAML::Event>::mapping(IO &IO, WasmYAML::Event &Event) {
+  IO.mapRequired("Index", Event.Index);
+  IO.mapRequired("Attribute", Event.Attribute);
+  IO.mapRequired("SigIndex", Event.SigIndex);
+}
+
 void ScalarBitSetTraits<WasmYAML::LimitFlags>::bitset(
     IO &IO, WasmYAML::LimitFlags &Value) {
 #define BCase(X) IO.bitSetCase(Value, #X, wasm::WASM_LIMITS_FLAG_##X)
@@ -443,6 +479,7 @@ void ScalarEnumerationTraits<WasmYAML::SymbolKind>::enumeration(
   ECase(DATA);
   ECase(GLOBAL);
   ECase(SECTION);
+  ECase(EVENT);
 #undef ECase
 }
 
@@ -467,6 +504,7 @@ void ScalarEnumerationTraits<WasmYAML::ExportKind>::enumeration(
   ECase(TABLE);
   ECase(MEMORY);
   ECase(GLOBAL);
+  ECase(EVENT);
 #undef ECase
 }
 
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 0c6dfff06f1887b1306ffaab762fd2a2fff1ed63..4da8f549d9bf9c5d6c7fa8fbaee4606255e6bcd2 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/StackSafetyAnalysis.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
@@ -62,7 +63,6 @@
 #include "llvm/Support/Regex.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
-#include "llvm/Transforms/Instrumentation/CGProfile.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
 #include "llvm/Transforms/IPO/CalledValuePropagation.h"
@@ -89,6 +89,7 @@
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Instrumentation/BoundsChecking.h"
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
@@ -130,6 +131,7 @@
 #include "llvm/Transforms/Scalar/LowerAtomic.h"
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
+#include "llvm/Transforms/Scalar/MakeGuardsExplicit.h"
 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
 #include "llvm/Transforms/Scalar/NaryReassociate.h"
@@ -139,12 +141,14 @@
 #include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
 #include "llvm/Transforms/Scalar/SCCP.h"
 #include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include "llvm/Transforms/Scalar/Sink.h"
 #include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
 #include "llvm/Transforms/Scalar/SpeculativeExecution.h"
 #include "llvm/Transforms/Scalar/TailRecursionElimination.h"
+#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
 #include "llvm/Transforms/Utils/AddDiscriminators.h"
 #include "llvm/Transforms/Utils/BreakCriticalEdges.h"
 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
@@ -155,6 +159,7 @@
 #include "llvm/Transforms/Utils/Mem2Reg.h"
 #include "llvm/Transforms/Utils/NameAnonGlobals.h"
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 
@@ -831,6 +836,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
         createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
   }
   OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(Level)));
+  OptimizePM.addPass(WarnMissedTransformationsPass());
   OptimizePM.addPass(InstCombinePass());
   OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
   OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass(), DebugLogging));
@@ -862,6 +868,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   // inserting redudnancies into the progrem. This even includes SimplifyCFG.
   OptimizePM.addPass(SpeculateAroundPHIsPass());
 
+  for (auto &C : OptimizerLastEPCallbacks)
+    C(OptimizePM, Level);
+
   // Add the core optimizing pipeline.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
 
@@ -1001,6 +1010,13 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
   assert(Level != O0 && "Must request optimizations for the default pipeline!");
   ModulePassManager MPM(DebugLogging);
 
+  if (PGOOpt && !PGOOpt->SampleProfileFile.empty()) {
+    // Load sample profile before running the LTO optimization pipeline.
+    MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile,
+                                        PGOOpt->ProfileRemappingFile,
+                                        false /* ThinLTOPhase::PreLink */));
+  }
+
   // Remove unused virtual tables to improve the quality of code generated by
   // whole-program devirtualization and bitset lowering.
   MPM.addPass(GlobalDCEPass());
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 99df2ad2719cf78505d0a0ec76ed6ceea7e6b944..97f0d577b30d2ba45e4733ed94cb6e2ac889cbec 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -24,6 +24,7 @@ MODULE_ANALYSIS("lcg", LazyCallGraphAnalysis())
 MODULE_ANALYSIS("module-summary", ModuleSummaryIndexAnalysis())
 MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis())
 MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis())
+MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis())
 MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
 MODULE_ANALYSIS("verify", VerifierAnalysis())
 MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
@@ -71,6 +72,7 @@ MODULE_PASS("print-callgraph", CallGraphPrinterPass(dbgs()))
 MODULE_PASS("print", PrintModulePass(dbgs()))
 MODULE_PASS("print-lcg", LazyCallGraphPrinterPass(dbgs()))
 MODULE_PASS("print-lcg-dot", LazyCallGraphDOTPrinterPass(dbgs()))
+MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(dbgs()))
 MODULE_PASS("rewrite-statepoints-for-gc", RewriteStatepointsForGC())
 MODULE_PASS("rewrite-symbols", RewriteSymbolPass())
 MODULE_PASS("rpo-functionattrs", ReversePostOrderFunctionAttrsPass())
@@ -120,6 +122,7 @@ FUNCTION_ANALYSIS("regions", RegionInfoAnalysis())
 FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis())
 FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis())
 FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis())
+FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis())
 FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
 FUNCTION_ANALYSIS("targetir",
                   TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis())
@@ -162,6 +165,7 @@ FUNCTION_PASS("dot-cfg-only", CFGOnlyPrinterPass())
 FUNCTION_PASS("early-cse", EarlyCSEPass(/*UseMemorySSA=*/false))
 FUNCTION_PASS("early-cse-memssa", EarlyCSEPass(/*UseMemorySSA=*/true))
 FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/false))
+FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass())
 FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/true))
 FUNCTION_PASS("gvn-hoist", GVNHoistPass())
 FUNCTION_PASS("instcombine", InstCombinePass())
@@ -175,6 +179,7 @@ FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass())
 FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass())
 FUNCTION_PASS("guard-widening", GuardWideningPass())
 FUNCTION_PASS("gvn", GVN())
+FUNCTION_PASS("load-store-vectorizer", LoadStoreVectorizerPass())
 FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
 FUNCTION_PASS("loop-sink", LoopSinkPass())
 FUNCTION_PASS("lowerinvoke", LowerInvokePass())
@@ -204,7 +209,9 @@ FUNCTION_PASS("print<memoryssa>", MemorySSAPrinterPass(dbgs()))
 FUNCTION_PASS("print<phi-values>", PhiValuesPrinterPass(dbgs()))
 FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(dbgs()))
 FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(dbgs()))
+FUNCTION_PASS("print<stack-safety-local>", StackSafetyPrinterPass(dbgs()))
 FUNCTION_PASS("reassociate", ReassociatePass())
+FUNCTION_PASS("scalarizer", ScalarizerPass())
 FUNCTION_PASS("sccp", SCCPPass())
 FUNCTION_PASS("simplify-cfg", SimplifyCFGPass())
 FUNCTION_PASS("sink", SinkingPass())
@@ -223,6 +230,7 @@ FUNCTION_PASS("verify<memoryssa>", MemorySSAVerifierPass())
 FUNCTION_PASS("verify<regions>", RegionInfoVerifierPass())
 FUNCTION_PASS("view-cfg", CFGViewerPass())
 FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass())
+FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
 #undef FUNCTION_PASS
 
 #ifndef LOOP_ANALYSIS
diff --git a/lib/Passes/StandardInstrumentations.cpp b/lib/Passes/StandardInstrumentations.cpp
index 48d36e5a01e58ad9f9335dca6be6fb3bbe5875c8..765ffe626620108a47adcc1ab77788d66b6876a1 100644
--- a/lib/Passes/StandardInstrumentations.cpp
+++ b/lib/Passes/StandardInstrumentations.cpp
@@ -53,7 +53,6 @@ void unwrapAndPrint(StringRef Banner, Any IR) {
     Extra = formatv(" (function: {0})\n", F->getName());
   } else if (any_isa<const LazyCallGraph::SCC *>(IR)) {
     const LazyCallGraph::SCC *C = any_cast<const LazyCallGraph::SCC *>(IR);
-    assert(C);
     if (!llvm::forcePrintModuleIR()) {
       Extra = formatv(" (scc: {0})\n", C->getName());
       bool BannerPrinted = false;
diff --git a/lib/Support/AArch64TargetParser.cpp b/lib/Support/AArch64TargetParser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e897137df680fcfc99f344223b3ecf31c152790f
--- /dev/null
+++ b/lib/Support/AArch64TargetParser.cpp
@@ -0,0 +1,206 @@
+//===-- AArch64TargetParser - Parser for AArch64 features -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise AArch64 hardware features
+// such as FPU/CPU/ARCH and extension names.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/AArch64TargetParser.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include <cctype>
+
+using namespace llvm;
+
+static unsigned checkArchVersion(llvm::StringRef Arch) {
+  if (Arch.size() >= 2 && Arch[0] == 'v' && std::isdigit(Arch[1]))
+    return (Arch[1] - 48);
+  return 0;
+}
+
+unsigned AArch64::getDefaultFPU(StringRef CPU, AArch64::ArchKind AK) {
+  if (CPU == "generic")
+    return AArch64ARCHNames[static_cast<unsigned>(AK)].DefaultFPU;
+
+  return StringSwitch<unsigned>(CPU)
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)       \
+  .Case(NAME, ARM::DEFAULT_FPU)
+#include "../../include/llvm/Support/AArch64TargetParser.def"
+  .Default(ARM::FK_INVALID);
+}
+
+unsigned AArch64::getDefaultExtensions(StringRef CPU, AArch64::ArchKind AK) {
+  if (CPU == "generic")
+    return AArch64ARCHNames[static_cast<unsigned>(AK)].ArchBaseExtensions;
+
+  return StringSwitch<unsigned>(CPU)
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)       \
+  .Case(NAME, AArch64ARCHNames[static_cast<unsigned>(ArchKind::ID)]            \
+                      .ArchBaseExtensions |                                    \
+                  DEFAULT_EXT)
+#include "../../include/llvm/Support/AArch64TargetParser.def"
+  .Default(AArch64::AEK_INVALID);
+}
+
+AArch64::ArchKind AArch64::getCPUArchKind(StringRef CPU) {
+  if (CPU == "generic")
+    return ArchKind::ARMV8A;
+
+  return StringSwitch<AArch64::ArchKind>(CPU)
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)       \
+  .Case(NAME, ArchKind::ID)
+#include "../../include/llvm/Support/AArch64TargetParser.def"
+  .Default(ArchKind::INVALID);
+}
+
+bool AArch64::getExtensionFeatures(unsigned Extensions,
+                                   std::vector<StringRef> &Features) {
+  if (Extensions == AArch64::AEK_INVALID)
+    return false;
+
+  if (Extensions & AEK_FP)
+    Features.push_back("+fp-armv8");
+  if (Extensions & AEK_SIMD)
+    Features.push_back("+neon");
+  if (Extensions & AEK_CRC)
+    Features.push_back("+crc");
+  if (Extensions & AEK_CRYPTO)
+    Features.push_back("+crypto");
+  if (Extensions & AEK_DOTPROD)
+    Features.push_back("+dotprod");
+  if (Extensions & AEK_FP16FML)
+    Features.push_back("+fp16fml");
+  if (Extensions & AEK_FP16)
+    Features.push_back("+fullfp16");
+  if (Extensions & AEK_PROFILE)
+    Features.push_back("+spe");
+  if (Extensions & AEK_RAS)
+    Features.push_back("+ras");
+  if (Extensions & AEK_LSE)
+    Features.push_back("+lse");
+  if (Extensions & AEK_RDM)
+    Features.push_back("+rdm");
+  if (Extensions & AEK_SVE)
+    Features.push_back("+sve");
+  if (Extensions & AEK_RCPC)
+    Features.push_back("+rcpc");
+
+  return true;
+}
+
+bool AArch64::getArchFeatures(AArch64::ArchKind AK,
+                              std::vector<StringRef> &Features) {
+  if (AK == ArchKind::ARMV8_1A)
+    Features.push_back("+v8.1a");
+  if (AK == ArchKind::ARMV8_2A)
+    Features.push_back("+v8.2a");
+  if (AK == ArchKind::ARMV8_3A)
+    Features.push_back("+v8.3a");
+  if (AK == ArchKind::ARMV8_4A)
+    Features.push_back("+v8.4a");
+  if (AK == ArchKind::ARMV8_5A)
+    Features.push_back("+v8.5a");
+
+  return AK != ArchKind::INVALID;
+}
+
+StringRef AArch64::getArchName(AArch64::ArchKind AK) {
+  return AArch64ARCHNames[static_cast<unsigned>(AK)].getName();
+}
+
+StringRef AArch64::getCPUAttr(AArch64::ArchKind AK) {
+  return AArch64ARCHNames[static_cast<unsigned>(AK)].getCPUAttr();
+}
+
+StringRef AArch64::getSubArch(AArch64::ArchKind AK) {
+  return AArch64ARCHNames[static_cast<unsigned>(AK)].getSubArch();
+}
+
+unsigned AArch64::getArchAttr(AArch64::ArchKind AK) {
+  return AArch64ARCHNames[static_cast<unsigned>(AK)].ArchAttr;
+}
+
+StringRef AArch64::getArchExtName(unsigned ArchExtKind) {
+  for (const auto &AE : AArch64ARCHExtNames)
+    if (ArchExtKind == AE.ID)
+      return AE.getName();
+  return StringRef();
+}
+
+StringRef AArch64::getArchExtFeature(StringRef ArchExt) {
+  if (ArchExt.startswith("no")) {
+    StringRef ArchExtBase(ArchExt.substr(2));
+    for (const auto &AE : AArch64ARCHExtNames) {
+      if (AE.NegFeature && ArchExtBase == AE.getName())
+        return StringRef(AE.NegFeature);
+    }
+  }
+
+  for (const auto &AE : AArch64ARCHExtNames)
+    if (AE.Feature && ArchExt == AE.getName())
+      return StringRef(AE.Feature);
+  return StringRef();
+}
+
+StringRef AArch64::getDefaultCPU(StringRef Arch) {
+  ArchKind AK = parseArch(Arch);
+  if (AK == ArchKind::INVALID)
+    return StringRef();
+
+  // Look for multiple AKs to find the default for pair AK+Name.
+  for (const auto &CPU : AArch64CPUNames)
+    if (CPU.ArchID == AK && CPU.Default)
+      return CPU.getName();
+
+  // If we can't find a default then target the architecture instead
+  return "generic";
+}
+
+void AArch64::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) {
+  for (const auto &Arch : AArch64CPUNames) {
+    if (Arch.ArchID != ArchKind::INVALID)
+      Values.push_back(Arch.getName());
+  }
+}
+
+bool AArch64::isX18ReservedByDefault(const Triple &TT) {
+  return TT.isAndroid() || TT.isOSDarwin() || TT.isOSFuchsia() ||
+         TT.isOSWindows();
+}
+
+// Allows partial match, ex. "v8a" matches "armv8a".
+AArch64::ArchKind AArch64::parseArch(StringRef Arch) {
+  Arch = ARM::getCanonicalArchName(Arch);
+  if (checkArchVersion(Arch) < 8)
+    return ArchKind::INVALID;
+
+  StringRef Syn = ARM::getArchSynonym(Arch);
+  for (const auto A : AArch64ARCHNames) {
+    if (A.getName().endswith(Syn))
+      return A.ID;
+  }
+  return ArchKind::INVALID;
+}
+
+AArch64::ArchExtKind AArch64::parseArchExt(StringRef ArchExt) {
+  for (const auto A : AArch64ARCHExtNames) {
+    if (ArchExt == A.getName())
+      return static_cast<ArchExtKind>(A.ID);
+  }
+  return AArch64::AEK_INVALID;
+}
+
+AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) {
+  for (const auto C : AArch64CPUNames) {
+    if (CPU == C.getName())
+      return C.ArchID;
+  }
+  return ArchKind::INVALID;
+}
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index f2f5cca946f3706314a97f8c005aa26c90ce5b44..a5f4f98c489ae4f44822a99d17ea1a682c18f351 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -1947,7 +1947,43 @@ APInt APInt::ushl_ov(const APInt &ShAmt, bool &Overflow) const {
   return *this << ShAmt;
 }
 
+APInt APInt::sadd_sat(const APInt &RHS) const {
+  bool Overflow;
+  APInt Res = sadd_ov(RHS, Overflow);
+  if (!Overflow)
+    return Res;
 
+  return isNegative() ? APInt::getSignedMinValue(BitWidth)
+                      : APInt::getSignedMaxValue(BitWidth);
+}
+
+APInt APInt::uadd_sat(const APInt &RHS) const {
+  bool Overflow;
+  APInt Res = uadd_ov(RHS, Overflow);
+  if (!Overflow)
+    return Res;
+
+  return APInt::getMaxValue(BitWidth);
+}
+
+APInt APInt::ssub_sat(const APInt &RHS) const {
+  bool Overflow;
+  APInt Res = ssub_ov(RHS, Overflow);
+  if (!Overflow)
+    return Res;
+
+  return isNegative() ? APInt::getSignedMinValue(BitWidth)
+                      : APInt::getSignedMaxValue(BitWidth);
+}
+
+APInt APInt::usub_sat(const APInt &RHS) const {
+  bool Overflow;
+  APInt Res = usub_ov(RHS, Overflow);
+  if (!Overflow)
+    return Res;
+
+  return APInt(BitWidth, 0);
+}
 
 
 void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
diff --git a/lib/Support/ARMTargetParser.cpp b/lib/Support/ARMTargetParser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07294b0c09a3031bc138072eb56d46c122994c66
--- /dev/null
+++ b/lib/Support/ARMTargetParser.cpp
@@ -0,0 +1,577 @@
+//===-- ARMTargetParser - Parser for ARM target features --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise ARM hardware features
+// such as FPU/CPU/ARCH/extensions and specific support such as HWDIV.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ARMTargetParser.h"
+#include "llvm/ADT/StringSwitch.h"
+#include <cctype>
+
+using namespace llvm;
+
+static StringRef getHWDivSynonym(StringRef HWDiv) {
+  return StringSwitch<StringRef>(HWDiv)
+      .Case("thumb,arm", "arm,thumb")
+      .Default(HWDiv);
+}
+
+// Allows partial match, ex. "v7a" matches "armv7a".
+ARM::ArchKind ARM::parseArch(StringRef Arch) {
+  Arch = getCanonicalArchName(Arch);
+  StringRef Syn = getArchSynonym(Arch);
+  for (const auto A : ARCHNames) {
+    if (A.getName().endswith(Syn))
+      return A.ID;
+  }
+  return ArchKind::INVALID;
+}
+
+// Version number (ex. v7 = 7).
+unsigned ARM::parseArchVersion(StringRef Arch) {
+  Arch = getCanonicalArchName(Arch);
+  switch (parseArch(Arch)) {
+  case ArchKind::ARMV2:
+  case ArchKind::ARMV2A:
+    return 2;
+  case ArchKind::ARMV3:
+  case ArchKind::ARMV3M:
+    return 3;
+  case ArchKind::ARMV4:
+  case ArchKind::ARMV4T:
+    return 4;
+  case ArchKind::ARMV5T:
+  case ArchKind::ARMV5TE:
+  case ArchKind::IWMMXT:
+  case ArchKind::IWMMXT2:
+  case ArchKind::XSCALE:
+  case ArchKind::ARMV5TEJ:
+    return 5;
+  case ArchKind::ARMV6:
+  case ArchKind::ARMV6K:
+  case ArchKind::ARMV6T2:
+  case ArchKind::ARMV6KZ:
+  case ArchKind::ARMV6M:
+    return 6;
+  case ArchKind::ARMV7A:
+  case ArchKind::ARMV7VE:
+  case ArchKind::ARMV7R:
+  case ArchKind::ARMV7M:
+  case ArchKind::ARMV7S:
+  case ArchKind::ARMV7EM:
+  case ArchKind::ARMV7K:
+    return 7;
+  case ArchKind::ARMV8A:
+  case ArchKind::ARMV8_1A:
+  case ArchKind::ARMV8_2A:
+  case ArchKind::ARMV8_3A:
+  case ArchKind::ARMV8_4A:
+  case ArchKind::ARMV8_5A:
+  case ArchKind::ARMV8R:
+  case ArchKind::ARMV8MBaseline:
+  case ArchKind::ARMV8MMainline:
+    return 8;
+  case ArchKind::INVALID:
+    return 0;
+  }
+  llvm_unreachable("Unhandled architecture");
+}
+
+// Profile A/R/M
+ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
+  Arch = getCanonicalArchName(Arch);
+  switch (parseArch(Arch)) {
+  case ArchKind::ARMV6M:
+  case ArchKind::ARMV7M:
+  case ArchKind::ARMV7EM:
+  case ArchKind::ARMV8MMainline:
+  case ArchKind::ARMV8MBaseline:
+    return ProfileKind::M;
+  case ArchKind::ARMV7R:
+  case ArchKind::ARMV8R:
+    return ProfileKind::R;
+  case ArchKind::ARMV7A:
+  case ArchKind::ARMV7VE:
+  case ArchKind::ARMV7K:
+  case ArchKind::ARMV8A:
+  case ArchKind::ARMV8_1A:
+  case ArchKind::ARMV8_2A:
+  case ArchKind::ARMV8_3A:
+  case ArchKind::ARMV8_4A:
+  case ArchKind::ARMV8_5A:
+    return ProfileKind::A;
+  case ArchKind::ARMV2:
+  case ArchKind::ARMV2A:
+  case ArchKind::ARMV3:
+  case ArchKind::ARMV3M:
+  case ArchKind::ARMV4:
+  case ArchKind::ARMV4T:
+  case ArchKind::ARMV5T:
+  case ArchKind::ARMV5TE:
+  case ArchKind::ARMV5TEJ:
+  case ArchKind::ARMV6:
+  case ArchKind::ARMV6K:
+  case ArchKind::ARMV6T2:
+  case ArchKind::ARMV6KZ:
+  case ArchKind::ARMV7S:
+  case ArchKind::IWMMXT:
+  case ArchKind::IWMMXT2:
+  case ArchKind::XSCALE:
+  case ArchKind::INVALID:
+    return ProfileKind::INVALID;
+  }
+  llvm_unreachable("Unhandled architecture");
+}
+
+StringRef ARM::getArchSynonym(StringRef Arch) {
+  return StringSwitch<StringRef>(Arch)
+      .Case("v5", "v5t")
+      .Case("v5e", "v5te")
+      .Case("v6j", "v6")
+      .Case("v6hl", "v6k")
+      .Cases("v6m", "v6sm", "v6s-m", "v6-m")
+      .Cases("v6z", "v6zk", "v6kz")
+      .Cases("v7", "v7a", "v7hl", "v7l", "v7-a")
+      .Case("v7r", "v7-r")
+      .Case("v7m", "v7-m")
+      .Case("v7em", "v7e-m")
+      .Cases("v8", "v8a", "v8l", "aarch64", "arm64", "v8-a")
+      .Case("v8.1a", "v8.1-a")
+      .Case("v8.2a", "v8.2-a")
+      .Case("v8.3a", "v8.3-a")
+      .Case("v8.4a", "v8.4-a")
+      .Case("v8.5a", "v8.5-a")
+      .Case("v8r", "v8-r")
+      .Case("v8m.base", "v8-m.base")
+      .Case("v8m.main", "v8-m.main")
+      .Default(Arch);
+}
+
+bool ARM::getFPUFeatures(unsigned FPUKind, std::vector<StringRef> &Features) {
+
+  if (FPUKind >= FK_LAST || FPUKind == FK_INVALID)
+    return false;
+
+  // fp-only-sp and d16 subtarget features are independent of each other, so we
+  // must enable/disable both.
+  switch (FPUNames[FPUKind].Restriction) {
+  case FPURestriction::SP_D16:
+    Features.push_back("+fp-only-sp");
+    Features.push_back("+d16");
+    break;
+  case FPURestriction::D16:
+    Features.push_back("-fp-only-sp");
+    Features.push_back("+d16");
+    break;
+  case FPURestriction::None:
+    Features.push_back("-fp-only-sp");
+    Features.push_back("-d16");
+    break;
+  }
+
+  // FPU version subtarget features are inclusive of lower-numbered ones, so
+  // enable the one corresponding to this version and disable all that are
+  // higher. We also have to make sure to disable fp16 when vfp4 is disabled,
+  // as +vfp4 implies +fp16 but -vfp4 does not imply -fp16.
+  switch (FPUNames[FPUKind].FPUVer) {
+  case FPUVersion::VFPV5:
+    Features.push_back("+fp-armv8");
+    break;
+  case FPUVersion::VFPV4:
+    Features.push_back("+vfp4");
+    Features.push_back("-fp-armv8");
+    break;
+  case FPUVersion::VFPV3_FP16:
+    Features.push_back("+vfp3");
+    Features.push_back("+fp16");
+    Features.push_back("-vfp4");
+    Features.push_back("-fp-armv8");
+    break;
+  case FPUVersion::VFPV3:
+    Features.push_back("+vfp3");
+    Features.push_back("-fp16");
+    Features.push_back("-vfp4");
+    Features.push_back("-fp-armv8");
+    break;
+  case FPUVersion::VFPV2:
+    Features.push_back("+vfp2");
+    Features.push_back("-vfp3");
+    Features.push_back("-fp16");
+    Features.push_back("-vfp4");
+    Features.push_back("-fp-armv8");
+    break;
+  case FPUVersion::NONE:
+    Features.push_back("-vfp2");
+    Features.push_back("-vfp3");
+    Features.push_back("-fp16");
+    Features.push_back("-vfp4");
+    Features.push_back("-fp-armv8");
+    break;
+  }
+
+  // crypto includes neon, so we handle this similarly to FPU version.
+  switch (FPUNames[FPUKind].NeonSupport) {
+  case NeonSupportLevel::Crypto:
+    Features.push_back("+neon");
+    Features.push_back("+crypto");
+    break;
+  case NeonSupportLevel::Neon:
+    Features.push_back("+neon");
+    Features.push_back("-crypto");
+    break;
+  case NeonSupportLevel::None:
+    Features.push_back("-neon");
+    Features.push_back("-crypto");
+    break;
+  }
+
+  return true;
+}
+
+// Little/Big endian
+ARM::EndianKind ARM::parseArchEndian(StringRef Arch) {
+  if (Arch.startswith("armeb") || Arch.startswith("thumbeb") ||
+      Arch.startswith("aarch64_be"))
+    return EndianKind::BIG;
+
+  if (Arch.startswith("arm") || Arch.startswith("thumb")) {
+    if (Arch.endswith("eb"))
+      return EndianKind::BIG;
+    else
+      return EndianKind::LITTLE;
+  }
+
+  if (Arch.startswith("aarch64"))
+    return EndianKind::LITTLE;
+
+  return EndianKind::INVALID;
+}
+
+// ARM, Thumb, AArch64
+ARM::ISAKind ARM::parseArchISA(StringRef Arch) {
+  return StringSwitch<ISAKind>(Arch)
+      .StartsWith("aarch64", ISAKind::AARCH64)
+      .StartsWith("arm64", ISAKind::AARCH64)
+      .StartsWith("thumb", ISAKind::THUMB)
+      .StartsWith("arm", ISAKind::ARM)
+      .Default(ISAKind::INVALID);
+}
+
+unsigned ARM::parseFPU(StringRef FPU) {
+  StringRef Syn = getFPUSynonym(FPU);
+  for (const auto F : FPUNames) {
+    if (Syn == F.getName())
+      return F.ID;
+  }
+  return FK_INVALID;
+}
+
+ARM::NeonSupportLevel ARM::getFPUNeonSupportLevel(unsigned FPUKind) {
+  if (FPUKind >= FK_LAST)
+    return NeonSupportLevel::None;
+  return FPUNames[FPUKind].NeonSupport;
+}
+
+// MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but
+// (iwmmxt|xscale)(eb)? is also permitted. If the former, return
+// "v.+", if the latter, return unmodified string, minus 'eb'.
+// If invalid, return empty string.
+StringRef ARM::getCanonicalArchName(StringRef Arch) {
+  size_t offset = StringRef::npos;
+  StringRef A = Arch;
+  StringRef Error = "";
+
+  // Begins with "arm" / "thumb", move past it.
+  if (A.startswith("arm64"))
+    offset = 5;
+  else if (A.startswith("arm"))
+    offset = 3;
+  else if (A.startswith("thumb"))
+    offset = 5;
+  else if (A.startswith("aarch64")) {
+    offset = 7;
+    // AArch64 uses "_be", not "eb" suffix.
+    if (A.find("eb") != StringRef::npos)
+      return Error;
+    if (A.substr(offset, 3) == "_be")
+      offset += 3;
+  }
+
+  // Ex. "armebv7", move past the "eb".
+  if (offset != StringRef::npos && A.substr(offset, 2) == "eb")
+    offset += 2;
+  // Or, if it ends with eb ("armv7eb"), chop it off.
+  else if (A.endswith("eb"))
+    A = A.substr(0, A.size() - 2);
+  // Trim the head
+  if (offset != StringRef::npos)
+    A = A.substr(offset);
+
+  // Empty string means offset reached the end, which means it's valid.
+  if (A.empty())
+    return Arch;
+
+  // Only match non-marketing names
+  if (offset != StringRef::npos) {
+    // Must start with 'vN'.
+    if (A.size() >= 2 && (A[0] != 'v' || !std::isdigit(A[1])))
+      return Error;
+    // Can't have an extra 'eb'.
+    if (A.find("eb") != StringRef::npos)
+      return Error;
+  }
+
+  // Arch will either be a 'v' name (v7a) or a marketing name (xscale).
+  return A;
+}
+
+StringRef ARM::getFPUSynonym(StringRef FPU) {
+  return StringSwitch<StringRef>(FPU)
+      .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported
+      .Case("vfp2", "vfpv2")
+      .Case("vfp3", "vfpv3")
+      .Case("vfp4", "vfpv4")
+      .Case("vfp3-d16", "vfpv3-d16")
+      .Case("vfp4-d16", "vfpv4-d16")
+      .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16")
+      .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16")
+      .Case("fp5-sp-d16", "fpv5-sp-d16")
+      .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16")
+      // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3.
+      .Case("neon-vfpv3", "neon")
+      .Default(FPU);
+}
+
+StringRef ARM::getFPUName(unsigned FPUKind) {
+  if (FPUKind >= FK_LAST)
+    return StringRef();
+  return FPUNames[FPUKind].getName();
+}
+
+ARM::FPUVersion ARM::getFPUVersion(unsigned FPUKind) {
+  if (FPUKind >= FK_LAST)
+    return FPUVersion::NONE;
+  return FPUNames[FPUKind].FPUVer;
+}
+
+ARM::FPURestriction ARM::getFPURestriction(unsigned FPUKind) {
+  if (FPUKind >= FK_LAST)
+    return FPURestriction::None;
+  return FPUNames[FPUKind].Restriction;
+}
+
+unsigned ARM::getDefaultFPU(StringRef CPU, ARM::ArchKind AK) {
+  if (CPU == "generic")
+    return ARM::ARCHNames[static_cast<unsigned>(AK)].DefaultFPU;
+
+  return StringSwitch<unsigned>(CPU)
+#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)           \
+  .Case(NAME, DEFAULT_FPU)
+#include "llvm/Support/ARMTargetParser.def"
+   .Default(ARM::FK_INVALID);
+}
+
+unsigned ARM::getDefaultExtensions(StringRef CPU, ARM::ArchKind AK) {
+  if (CPU == "generic")
+    return ARM::ARCHNames[static_cast<unsigned>(AK)].ArchBaseExtensions;
+
+  return StringSwitch<unsigned>(CPU)
+#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)           \
+  .Case(NAME,                                                                  \
+        ARCHNames[static_cast<unsigned>(ArchKind::ID)].ArchBaseExtensions |    \
+            DEFAULT_EXT)
+#include "llvm/Support/ARMTargetParser.def"
+  .Default(ARM::AEK_INVALID);
+}
+
+bool ARM::getHWDivFeatures(unsigned HWDivKind,
+                           std::vector<StringRef> &Features) {
+
+  if (HWDivKind == AEK_INVALID)
+    return false;
+
+  if (HWDivKind & AEK_HWDIVARM)
+    Features.push_back("+hwdiv-arm");
+  else
+    Features.push_back("-hwdiv-arm");
+
+  if (HWDivKind & AEK_HWDIVTHUMB)
+    Features.push_back("+hwdiv");
+  else
+    Features.push_back("-hwdiv");
+
+  return true;
+}
+
+bool ARM::getExtensionFeatures(unsigned Extensions,
+                               std::vector<StringRef> &Features) {
+
+  if (Extensions == AEK_INVALID)
+    return false;
+
+  if (Extensions & AEK_CRC)
+    Features.push_back("+crc");
+  else
+    Features.push_back("-crc");
+
+  if (Extensions & AEK_DSP)
+    Features.push_back("+dsp");
+  else
+    Features.push_back("-dsp");
+
+  if (Extensions & AEK_FP16FML)
+    Features.push_back("+fp16fml");
+  else
+    Features.push_back("-fp16fml");
+
+  if (Extensions & AEK_RAS)
+    Features.push_back("+ras");
+  else
+    Features.push_back("-ras");
+
+  if (Extensions & AEK_DOTPROD)
+    Features.push_back("+dotprod");
+  else
+    Features.push_back("-dotprod");
+
+  return getHWDivFeatures(Extensions, Features);
+}
+
+StringRef ARM::getArchName(ARM::ArchKind AK) {
+  return ARCHNames[static_cast<unsigned>(AK)].getName();
+}
+
+StringRef ARM::getCPUAttr(ARM::ArchKind AK) {
+  return ARCHNames[static_cast<unsigned>(AK)].getCPUAttr();
+}
+
+StringRef ARM::getSubArch(ARM::ArchKind AK) {
+  return ARCHNames[static_cast<unsigned>(AK)].getSubArch();
+}
+
+unsigned ARM::getArchAttr(ARM::ArchKind AK) {
+  return ARCHNames[static_cast<unsigned>(AK)].ArchAttr;
+}
+
+StringRef ARM::getArchExtName(unsigned ArchExtKind) {
+  for (const auto AE : ARCHExtNames) {
+    if (ArchExtKind == AE.ID)
+      return AE.getName();
+  }
+  return StringRef();
+}
+
+StringRef ARM::getArchExtFeature(StringRef ArchExt) {
+  if (ArchExt.startswith("no")) {
+    StringRef ArchExtBase(ArchExt.substr(2));
+    for (const auto AE : ARCHExtNames) {
+      if (AE.NegFeature && ArchExtBase == AE.getName())
+        return StringRef(AE.NegFeature);
+    }
+  }
+  for (const auto AE : ARCHExtNames) {
+    if (AE.Feature && ArchExt == AE.getName())
+      return StringRef(AE.Feature);
+  }
+
+  return StringRef();
+}
+
+StringRef ARM::getHWDivName(unsigned HWDivKind) {
+  for (const auto D : HWDivNames) {
+    if (HWDivKind == D.ID)
+      return D.getName();
+  }
+  return StringRef();
+}
+
+StringRef ARM::getDefaultCPU(StringRef Arch) {
+  ArchKind AK = parseArch(Arch);
+  if (AK == ArchKind::INVALID)
+    return StringRef();
+
+  // Look for multiple AKs to find the default for pair AK+Name.
+  for (const auto CPU : CPUNames) {
+    if (CPU.ArchID == AK && CPU.Default)
+      return CPU.getName();
+  }
+
+  // If we can't find a default then target the architecture instead
+  return "generic";
+}
+
+unsigned ARM::parseHWDiv(StringRef HWDiv) {
+  StringRef Syn = getHWDivSynonym(HWDiv);
+  for (const auto D : HWDivNames) {
+    if (Syn == D.getName())
+      return D.ID;
+  }
+  return AEK_INVALID;
+}
+
+unsigned ARM::parseArchExt(StringRef ArchExt) {
+  for (const auto A : ARCHExtNames) {
+    if (ArchExt == A.getName())
+      return A.ID;
+  }
+  return AEK_INVALID;
+}
+
+ARM::ArchKind ARM::parseCPUArch(StringRef CPU) {
+  for (const auto C : CPUNames) {
+    if (CPU == C.getName())
+      return C.ArchID;
+  }
+  return ArchKind::INVALID;
+}
+
+void ARM::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) {
+  for (const CpuNames<ArchKind> &Arch : CPUNames) {
+    if (Arch.ArchID != ArchKind::INVALID)
+      Values.push_back(Arch.getName());
+  }
+}
+
+StringRef ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) {
+  StringRef ArchName =
+      CPU.empty() ? TT.getArchName() : getArchName(parseCPUArch(CPU));
+
+  if (TT.isOSBinFormatMachO()) {
+    if (TT.getEnvironment() == Triple::EABI ||
+        TT.getOS() == Triple::UnknownOS ||
+        parseArchProfile(ArchName) == ProfileKind::M)
+      return "aapcs";
+    if (TT.isWatchABI())
+      return "aapcs16";
+    return "apcs-gnu";
+  } else if (TT.isOSWindows())
+    // FIXME: this is invalid for WindowsCE.
+    return "aapcs";
+
+  // Select the default based on the platform.
+  switch (TT.getEnvironment()) {
+  case Triple::Android:
+  case Triple::GNUEABI:
+  case Triple::GNUEABIHF:
+  case Triple::MuslEABI:
+  case Triple::MuslEABIHF:
+    return "aapcs-linux";
+  case Triple::EABIHF:
+  case Triple::EABI:
+    return "aapcs";
+  default:
+    if (TT.isOSNetBSD())
+      return "apcs-gnu";
+    if (TT.isOSOpenBSD())
+      return "aapcs-linux";
+    return "aapcs";
+  }
+}
diff --git a/lib/Support/BuryPointer.cpp b/lib/Support/BuryPointer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c988b4a0ab263d79e76e5ba397647859cfe7e1c
--- /dev/null
+++ b/lib/Support/BuryPointer.cpp
@@ -0,0 +1,31 @@
+//===- BuryPointer.cpp - Memory Manipulation/Leak ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BuryPointer.h"
+#include "llvm/Support/Compiler.h"
+#include <atomic>
+
+namespace llvm {
+
+void BuryPointer(const void *Ptr) {
+  // This function may be called only a small fixed amount of times per each
+  // invocation, otherwise we do actually have a leak which we want to report.
+  // If this function is called more than kGraveYardMaxSize times, the pointers
+  // will not be properly buried and a leak detector will report a leak, which
+  // is what we want in such case.
+  static const size_t kGraveYardMaxSize = 16;
+  LLVM_ATTRIBUTE_UNUSED static const void *GraveYard[kGraveYardMaxSize];
+  static std::atomic<unsigned> GraveYardSize;
+  unsigned Idx = GraveYardSize++;
+  if (Idx >= kGraveYardMaxSize)
+    return;
+  GraveYard[Idx] = Ptr;
+}
+
+}
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 1446359f0cfef57e235fd765d57845f128d09939..2a6810672b1da92a5146df21f5b761197ba1f29f 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -45,6 +45,8 @@ if (MSVC)
 endif()
 
 add_llvm_library(LLVMSupport
+  AArch64TargetParser.cpp
+  ARMTargetParser.cpp
   AMDGPUMetadata.cpp
   APFloat.cpp
   APInt.cpp
@@ -59,6 +61,7 @@ add_llvm_library(LLVMSupport
   BinaryStreamWriter.cpp
   BlockFrequency.cpp
   BranchProbability.cpp
+  BuryPointer.cpp
   CachePruning.cpp
   circular_raw_ostream.cpp
   Chrono.cpp
diff --git a/lib/Support/Error.cpp b/lib/Support/Error.cpp
index ad2443db9af596d304976f38f39356745388e781..0b0be8d48fa7a72b9088293b08b598ef6de4c929 100644
--- a/lib/Support/Error.cpp
+++ b/lib/Support/Error.cpp
@@ -141,7 +141,7 @@ void report_fatal_error(Error Err, bool GenCrashDiag) {
   std::string ErrMsg;
   {
     raw_string_ostream ErrStream(ErrMsg);
-    logAllUnhandledErrors(std::move(Err), ErrStream, "");
+    logAllUnhandledErrors(std::move(Err), ErrStream);
   }
   report_fatal_error(ErrMsg);
 }
diff --git a/lib/Support/FileCheck.cpp b/lib/Support/FileCheck.cpp
index 386fa80b8b9d3adc5aaf6707d584e48916f3fa04..37986c96c0817f600ef34b16aecd5b93643f28ea 100644
--- a/lib/Support/FileCheck.cpp
+++ b/lib/Support/FileCheck.cpp
@@ -16,8 +16,12 @@
 
 #include "llvm/Support/FileCheck.h"
 #include "llvm/ADT/StringSet.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <cstdint>
 #include <list>
 #include <map>
+#include <tuple>
+#include <utility>
 
 using namespace llvm;
 
@@ -408,9 +412,28 @@ void FileCheckPattern::PrintVariableUses(const SourceMgr &SM, StringRef Buffer,
   }
 }
 
+static SMRange ProcessMatchResult(FileCheckDiag::MatchType MatchTy,
+                                  const SourceMgr &SM, SMLoc Loc,
+                                  Check::FileCheckType CheckTy,
+                                  StringRef Buffer, size_t Pos, size_t Len,
+                                  std::vector<FileCheckDiag> *Diags,
+                                  bool AdjustPrevDiag = false) {
+  SMLoc Start = SMLoc::getFromPointer(Buffer.data() + Pos);
+  SMLoc End = SMLoc::getFromPointer(Buffer.data() + Pos + Len);
+  SMRange Range(Start, End);
+  if (Diags) {
+    if (AdjustPrevDiag)
+      Diags->rbegin()->MatchTy = MatchTy;
+    else
+      Diags->emplace_back(SM, CheckTy, Loc, MatchTy, Range);
+  }
+  return Range;
+}
+
 void FileCheckPattern::PrintFuzzyMatch(
     const SourceMgr &SM, StringRef Buffer,
-    const StringMap<StringRef> &VariableTable) const {
+    const StringMap<StringRef> &VariableTable,
+    std::vector<FileCheckDiag> *Diags) const {
   // Attempt to find the closest/best fuzzy match.  Usually an error happens
   // because some string in the output didn't exactly match. In these cases, we
   // would like to show the user a best guess at what "should have" matched, to
@@ -444,8 +467,11 @@ void FileCheckPattern::PrintFuzzyMatch(
   // reasonable and not equal to what we showed in the "scanning from here"
   // line.
   if (Best && Best != StringRef::npos && BestQuality < 50) {
-    SM.PrintMessage(SMLoc::getFromPointer(Buffer.data() + Best),
-                    SourceMgr::DK_Note, "possible intended match here");
+    SMRange MatchRange =
+        ProcessMatchResult(FileCheckDiag::MatchFuzzy, SM, getLoc(),
+                           getCheckTy(), Buffer, Best, 0, Diags);
+    SM.PrintMessage(MatchRange.Start, SourceMgr::DK_Note,
+                    "possible intended match here");
 
     // FIXME: If we wanted to be really friendly we would show why the match
     // failed, as it can be hard to spot simple one character differences.
@@ -527,51 +553,42 @@ llvm::FileCheck::CanonicalizeFile(MemoryBuffer &MB,
   return StringRef(OutputBuffer.data(), OutputBuffer.size() - 1);
 }
 
+FileCheckDiag::FileCheckDiag(const SourceMgr &SM,
+                             const Check::FileCheckType &CheckTy,
+                             SMLoc CheckLoc, MatchType MatchTy,
+                             SMRange InputRange)
+    : CheckTy(CheckTy), MatchTy(MatchTy) {
+  auto Start = SM.getLineAndColumn(InputRange.Start);
+  auto End = SM.getLineAndColumn(InputRange.End);
+  InputStartLine = Start.first;
+  InputStartCol = Start.second;
+  InputEndLine = End.first;
+  InputEndCol = End.second;
+  Start = SM.getLineAndColumn(CheckLoc);
+  CheckLine = Start.first;
+  CheckCol = Start.second;
+}
+
 static bool IsPartOfWord(char c) {
   return (isalnum(c) || c == '-' || c == '_');
 }
 
-// Get the size of the prefix extension.
-static size_t CheckTypeSize(Check::FileCheckType Ty) {
-  switch (Ty) {
-  case Check::CheckNone:
-  case Check::CheckBadNot:
-    return 0;
-
-  case Check::CheckPlain:
-    return sizeof(":") - 1;
-
-  case Check::CheckNext:
-    return sizeof("-NEXT:") - 1;
-
-  case Check::CheckSame:
-    return sizeof("-SAME:") - 1;
-
-  case Check::CheckNot:
-    return sizeof("-NOT:") - 1;
-
-  case Check::CheckDAG:
-    return sizeof("-DAG:") - 1;
-
-  case Check::CheckLabel:
-    return sizeof("-LABEL:") - 1;
-
-  case Check::CheckEmpty:
-    return sizeof("-EMPTY:") - 1;
-
-  case Check::CheckEOF:
-    llvm_unreachable("Should not be using EOF size");
-  }
-
-  llvm_unreachable("Bad check type");
+Check::FileCheckType &Check::FileCheckType::setCount(int C) {
+  assert(Count > 0 && "zero and negative counts are not supported");
+  assert((C == 1 || Kind == CheckPlain) &&
+         "count supported only for plain CHECK directives");
+  Count = C;
+  return *this;
 }
 
 // Get a description of the type.
-static std::string CheckTypeName(StringRef Prefix, Check::FileCheckType Ty) {
-  switch (Ty) {
+std::string Check::FileCheckType::getDescription(StringRef Prefix) const {
+  switch (Kind) {
   case Check::CheckNone:
     return "invalid";
   case Check::CheckPlain:
+    if (Count > 1)
+      return Prefix.str() + "-COUNT";
     return Prefix;
   case Check::CheckNext:
     return Prefix.str() + "-NEXT";
@@ -589,50 +606,65 @@ static std::string CheckTypeName(StringRef Prefix, Check::FileCheckType Ty) {
     return "implicit EOF";
   case Check::CheckBadNot:
     return "bad NOT";
+  case Check::CheckBadCount:
+    return "bad COUNT";
   }
   llvm_unreachable("unknown FileCheckType");
 }
 
-static Check::FileCheckType FindCheckType(StringRef Buffer, StringRef Prefix) {
+static std::pair<Check::FileCheckType, StringRef>
+FindCheckType(StringRef Buffer, StringRef Prefix) {
   if (Buffer.size() <= Prefix.size())
-    return Check::CheckNone;
+    return {Check::CheckNone, StringRef()};
 
   char NextChar = Buffer[Prefix.size()];
 
+  StringRef Rest = Buffer.drop_front(Prefix.size() + 1);
   // Verify that the : is present after the prefix.
   if (NextChar == ':')
-    return Check::CheckPlain;
+    return {Check::CheckPlain, Rest};
 
   if (NextChar != '-')
-    return Check::CheckNone;
+    return {Check::CheckNone, StringRef()};
+
+  if (Rest.consume_front("COUNT-")) {
+    int64_t Count;
+    if (Rest.consumeInteger(10, Count))
+      // Error happened in parsing integer.
+      return {Check::CheckBadCount, Rest};
+    if (Count <= 0 || Count > INT32_MAX)
+      return {Check::CheckBadCount, Rest};
+    if (!Rest.consume_front(":"))
+      return {Check::CheckBadCount, Rest};
+    return {Check::FileCheckType(Check::CheckPlain).setCount(Count), Rest};
+  }
 
-  StringRef Rest = Buffer.drop_front(Prefix.size() + 1);
-  if (Rest.startswith("NEXT:"))
-    return Check::CheckNext;
+  if (Rest.consume_front("NEXT:"))
+    return {Check::CheckNext, Rest};
 
-  if (Rest.startswith("SAME:"))
-    return Check::CheckSame;
+  if (Rest.consume_front("SAME:"))
+    return {Check::CheckSame, Rest};
 
-  if (Rest.startswith("NOT:"))
-    return Check::CheckNot;
+  if (Rest.consume_front("NOT:"))
+    return {Check::CheckNot, Rest};
 
-  if (Rest.startswith("DAG:"))
-    return Check::CheckDAG;
+  if (Rest.consume_front("DAG:"))
+    return {Check::CheckDAG, Rest};
 
-  if (Rest.startswith("LABEL:"))
-    return Check::CheckLabel;
+  if (Rest.consume_front("LABEL:"))
+    return {Check::CheckLabel, Rest};
 
-  if (Rest.startswith("EMPTY:"))
-    return Check::CheckEmpty;
+  if (Rest.consume_front("EMPTY:"))
+    return {Check::CheckEmpty, Rest};
 
   // You can't combine -NOT with another suffix.
   if (Rest.startswith("DAG-NOT:") || Rest.startswith("NOT-DAG:") ||
       Rest.startswith("NEXT-NOT:") || Rest.startswith("NOT-NEXT:") ||
       Rest.startswith("SAME-NOT:") || Rest.startswith("NOT-SAME:") ||
       Rest.startswith("EMPTY-NOT:") || Rest.startswith("NOT-EMPTY:"))
-    return Check::CheckBadNot;
+    return {Check::CheckBadNot, Rest};
 
-  return Check::CheckNone;
+  return {Check::CheckNone, Rest};
 }
 
 // From the given position, find the next character after the word.
@@ -651,8 +683,12 @@ static size_t SkipWord(StringRef Str, size_t Loc) {
 /// 2) The found prefix must be followed by a valid check type suffix using \c
 ///    FindCheckType above.
 ///
-/// The first match of the regular expression to satisfy these two is returned,
-/// otherwise an empty StringRef is returned to indicate failure.
+/// Returns a pair of StringRefs into the Buffer, which combines:
+///   - the first match of the regular expression to satisfy these two is
+///   returned,
+///     otherwise an empty StringRef is returned to indicate failure.
+///   - buffer rewound to the location right after parsed suffix, for parsing
+///     to continue from
 ///
 /// If this routine returns a valid prefix, it will also shrink \p Buffer to
 /// start at the beginning of the returned prefix, increment \p LineNumber for
@@ -661,16 +697,16 @@ static size_t SkipWord(StringRef Str, size_t Loc) {
 ///
 /// If no valid prefix is found, the state of Buffer, LineNumber, and CheckTy
 /// is unspecified.
-static StringRef FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer,
-                                         unsigned &LineNumber,
-                                         Check::FileCheckType &CheckTy) {
+static std::pair<StringRef, StringRef>
+FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer,
+                        unsigned &LineNumber, Check::FileCheckType &CheckTy) {
   SmallVector<StringRef, 2> Matches;
 
   while (!Buffer.empty()) {
     // Find the first (longest) match using the RE.
     if (!PrefixRE.match(Buffer, &Matches))
       // No match at all, bail.
-      return StringRef();
+      return {StringRef(), StringRef()};
 
     StringRef Prefix = Matches[0];
     Matches.clear();
@@ -690,11 +726,12 @@ static StringRef FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer,
     // intentional and unintentional uses of this feature.
     if (Skipped.empty() || !IsPartOfWord(Skipped.back())) {
       // Now extract the type.
-      CheckTy = FindCheckType(Buffer, Prefix);
+      StringRef AfterSuffix;
+      std::tie(CheckTy, AfterSuffix) = FindCheckType(Buffer, Prefix);
 
       // If we've found a valid check type for this prefix, we're done.
       if (CheckTy != Check::CheckNone)
-        return Prefix;
+        return {Prefix, AfterSuffix};
     }
 
     // If we didn't successfully find a prefix, we need to skip this invalid
@@ -704,7 +741,7 @@ static StringRef FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer,
   }
 
   // We ran out of buffer while skipping partial matches so give up.
-  return StringRef();
+  return {StringRef(), StringRef()};
 }
 
 /// Read the check file, which specifies the sequence of expected strings.
@@ -742,19 +779,26 @@ bool llvm::FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer,
     Check::FileCheckType CheckTy;
 
     // See if a prefix occurs in the memory buffer.
-    StringRef UsedPrefix = FindFirstMatchingPrefix(PrefixRE, Buffer, LineNumber,
-                                                   CheckTy);
+    StringRef UsedPrefix;
+    StringRef AfterSuffix;
+    std::tie(UsedPrefix, AfterSuffix) =
+        FindFirstMatchingPrefix(PrefixRE, Buffer, LineNumber, CheckTy);
     if (UsedPrefix.empty())
       break;
     assert(UsedPrefix.data() == Buffer.data() &&
            "Failed to move Buffer's start forward, or pointed prefix outside "
            "of the buffer!");
+    assert(AfterSuffix.data() >= Buffer.data() &&
+           AfterSuffix.data() < Buffer.data() + Buffer.size() &&
+           "Parsing after suffix doesn't start inside of buffer!");
 
     // Location to use for error messages.
     const char *UsedPrefixStart = UsedPrefix.data();
 
-    // Skip the buffer to the end.
-    Buffer = Buffer.drop_front(UsedPrefix.size() + CheckTypeSize(CheckTy));
+    // Skip the buffer to the end of parsed suffix (or just prefix, if no good
+    // suffix was processed).
+    Buffer = AfterSuffix.empty() ? Buffer.drop_front(UsedPrefix.size())
+                                 : AfterSuffix;
 
     // Complain about useful-looking but unsupported suffixes.
     if (CheckTy == Check::CheckBadNot) {
@@ -763,6 +807,14 @@ bool llvm::FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer,
       return true;
     }
 
+    // Complain about invalid count specification.
+    if (CheckTy == Check::CheckBadCount) {
+      SM.PrintMessage(SMLoc::getFromPointer(Buffer.data()), SourceMgr::DK_Error,
+                      "invalid count in -COUNT specification on prefix '" +
+                          UsedPrefix + "'");
+      return true;
+    }
+
     // Okay, we found the prefix, yay. Remember the rest of the line, but ignore
     // leading whitespace.
     if (!(Req.NoCanonicalizeWhiteSpace && Req.MatchFullLines))
@@ -845,69 +897,87 @@ bool llvm::FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer,
 
 static void PrintMatch(bool ExpectedMatch, const SourceMgr &SM,
                        StringRef Prefix, SMLoc Loc, const FileCheckPattern &Pat,
-                       StringRef Buffer, StringMap<StringRef> &VariableTable,
-                       size_t MatchPos, size_t MatchLen,
-                       const FileCheckRequest &Req) {
+                       int MatchedCount, StringRef Buffer,
+                       StringMap<StringRef> &VariableTable, size_t MatchPos,
+                       size_t MatchLen, const FileCheckRequest &Req,
+                       std::vector<FileCheckDiag> *Diags) {
   if (ExpectedMatch) {
     if (!Req.Verbose)
       return;
     if (!Req.VerboseVerbose && Pat.getCheckTy() == Check::CheckEOF)
       return;
   }
-  SMLoc MatchStart = SMLoc::getFromPointer(Buffer.data() + MatchPos);
-  SMLoc MatchEnd = SMLoc::getFromPointer(Buffer.data() + MatchPos + MatchLen);
-  SMRange MatchRange(MatchStart, MatchEnd);
+  SMRange MatchRange = ProcessMatchResult(
+      ExpectedMatch ? FileCheckDiag::MatchFoundAndExpected
+                    : FileCheckDiag::MatchFoundButExcluded,
+      SM, Loc, Pat.getCheckTy(), Buffer, MatchPos, MatchLen, Diags);
+  std::string Message = formatv("{0}: {1} string found in input",
+                                Pat.getCheckTy().getDescription(Prefix),
+                                (ExpectedMatch ? "expected" : "excluded"))
+                            .str();
+  if (Pat.getCount() > 1)
+    Message += formatv(" ({0} out of {1})", MatchedCount, Pat.getCount()).str();
+
   SM.PrintMessage(
-      Loc, ExpectedMatch ? SourceMgr::DK_Remark : SourceMgr::DK_Error,
-      CheckTypeName(Prefix, Pat.getCheckTy()) + ": " +
-          (ExpectedMatch ? "expected" : "excluded") +
-          " string found in input");
-  SM.PrintMessage(MatchStart, SourceMgr::DK_Note, "found here", {MatchRange});
+      Loc, ExpectedMatch ? SourceMgr::DK_Remark : SourceMgr::DK_Error, Message);
+  SM.PrintMessage(MatchRange.Start, SourceMgr::DK_Note, "found here",
+                  {MatchRange});
   Pat.PrintVariableUses(SM, Buffer, VariableTable, MatchRange);
 }
 
 static void PrintMatch(bool ExpectedMatch, const SourceMgr &SM,
-                       const FileCheckString &CheckStr, StringRef Buffer,
-                       StringMap<StringRef> &VariableTable, size_t MatchPos,
-                       size_t MatchLen, FileCheckRequest &Req) {
+                       const FileCheckString &CheckStr, int MatchedCount,
+                       StringRef Buffer, StringMap<StringRef> &VariableTable,
+                       size_t MatchPos, size_t MatchLen, FileCheckRequest &Req,
+                       std::vector<FileCheckDiag> *Diags) {
   PrintMatch(ExpectedMatch, SM, CheckStr.Prefix, CheckStr.Loc, CheckStr.Pat,
-             Buffer, VariableTable, MatchPos, MatchLen, Req);
+             MatchedCount, Buffer, VariableTable, MatchPos, MatchLen, Req,
+             Diags);
 }
 
 static void PrintNoMatch(bool ExpectedMatch, const SourceMgr &SM,
-                         StringRef Prefix, SMLoc Loc, const FileCheckPattern &Pat,
-                         StringRef Buffer,
-                         StringMap<StringRef> &VariableTable,
-                         bool VerboseVerbose) {
+                         StringRef Prefix, SMLoc Loc,
+                         const FileCheckPattern &Pat, int MatchedCount,
+                         StringRef Buffer, StringMap<StringRef> &VariableTable,
+                         bool VerboseVerbose,
+                         std::vector<FileCheckDiag> *Diags) {
   if (!ExpectedMatch && !VerboseVerbose)
     return;
 
   // Otherwise, we have an error, emit an error message.
-  SM.PrintMessage(Loc,
-                  ExpectedMatch ? SourceMgr::DK_Error : SourceMgr::DK_Remark,
-                  CheckTypeName(Prefix, Pat.getCheckTy()) + ": " +
-                      (ExpectedMatch ? "expected" : "excluded") +
-                      " string not found in input");
+  std::string Message = formatv("{0}: {1} string not found in input",
+                                Pat.getCheckTy().getDescription(Prefix),
+                                (ExpectedMatch ? "expected" : "excluded"))
+                            .str();
+  if (Pat.getCount() > 1)
+    Message += formatv(" ({0} out of {1})", MatchedCount, Pat.getCount()).str();
+
+  SM.PrintMessage(
+      Loc, ExpectedMatch ? SourceMgr::DK_Error : SourceMgr::DK_Remark, Message);
 
   // Print the "scanning from here" line.  If the current position is at the
   // end of a line, advance to the start of the next line.
   Buffer = Buffer.substr(Buffer.find_first_not_of(" \t\n\r"));
-
-  SM.PrintMessage(SMLoc::getFromPointer(Buffer.data()), SourceMgr::DK_Note,
-                  "scanning from here");
+  SMRange SearchRange = ProcessMatchResult(
+      ExpectedMatch ? FileCheckDiag::MatchNoneButExpected
+                    : FileCheckDiag::MatchNoneAndExcluded,
+      SM, Loc, Pat.getCheckTy(), Buffer, 0, Buffer.size(), Diags);
+  SM.PrintMessage(SearchRange.Start, SourceMgr::DK_Note, "scanning from here");
 
   // Allow the pattern to print additional information if desired.
   Pat.PrintVariableUses(SM, Buffer, VariableTable);
+
   if (ExpectedMatch)
-    Pat.PrintFuzzyMatch(SM, Buffer, VariableTable);
+    Pat.PrintFuzzyMatch(SM, Buffer, VariableTable, Diags);
 }
 
 static void PrintNoMatch(bool ExpectedMatch, const SourceMgr &SM,
-                         const FileCheckString &CheckStr, StringRef Buffer,
-                         StringMap<StringRef> &VariableTable,
-                         bool VerboseVerbose) {
+                         const FileCheckString &CheckStr, int MatchedCount,
+                         StringRef Buffer, StringMap<StringRef> &VariableTable,
+                         bool VerboseVerbose,
+                         std::vector<FileCheckDiag> *Diags) {
   PrintNoMatch(ExpectedMatch, SM, CheckStr.Prefix, CheckStr.Loc, CheckStr.Pat,
-               Buffer, VariableTable, VerboseVerbose);
+               MatchedCount, Buffer, VariableTable, VerboseVerbose, Diags);
 }
 
 /// Count the number of newlines in the specified range.
@@ -935,9 +1005,10 @@ static unsigned CountNumNewlinesBetween(StringRef Range,
 
 /// Match check string and its "not strings" and/or "dag strings".
 size_t FileCheckString::Check(const SourceMgr &SM, StringRef Buffer,
-                          bool IsLabelScanMode, size_t &MatchLen,
-                          StringMap<StringRef> &VariableTable,
-                          FileCheckRequest &Req) const {
+                              bool IsLabelScanMode, size_t &MatchLen,
+                              StringMap<StringRef> &VariableTable,
+                              FileCheckRequest &Req,
+                              std::vector<FileCheckDiag> *Diags) const {
   size_t LastPos = 0;
   std::vector<const FileCheckPattern *> NotStrings;
 
@@ -947,42 +1018,72 @@ size_t FileCheckString::Check(const SourceMgr &SM, StringRef Buffer,
   // over the block again (including the last CHECK-LABEL) in normal mode.
   if (!IsLabelScanMode) {
     // Match "dag strings" (with mixed "not strings" if any).
-    LastPos = CheckDag(SM, Buffer, NotStrings, VariableTable, Req);
+    LastPos = CheckDag(SM, Buffer, NotStrings, VariableTable, Req, Diags);
     if (LastPos == StringRef::npos)
       return StringRef::npos;
   }
 
   // Match itself from the last position after matching CHECK-DAG.
-  StringRef MatchBuffer = Buffer.substr(LastPos);
-  size_t MatchPos = Pat.Match(MatchBuffer, MatchLen, VariableTable);
-  if (MatchPos == StringRef::npos) {
-    PrintNoMatch(true, SM, *this, MatchBuffer, VariableTable, Req.VerboseVerbose);
-    return StringRef::npos;
+  size_t LastMatchEnd = LastPos;
+  size_t FirstMatchPos = 0;
+  // Go match the pattern Count times. Majority of patterns only match with
+  // count 1 though.
+  assert(Pat.getCount() != 0 && "pattern count can not be zero");
+  for (int i = 1; i <= Pat.getCount(); i++) {
+    StringRef MatchBuffer = Buffer.substr(LastMatchEnd);
+    size_t CurrentMatchLen;
+    // get a match at current start point
+    size_t MatchPos = Pat.Match(MatchBuffer, CurrentMatchLen, VariableTable);
+    if (i == 1)
+      FirstMatchPos = LastPos + MatchPos;
+
+    // report
+    if (MatchPos == StringRef::npos) {
+      PrintNoMatch(true, SM, *this, i, MatchBuffer, VariableTable,
+                   Req.VerboseVerbose, Diags);
+      return StringRef::npos;
+    }
+    PrintMatch(true, SM, *this, i, MatchBuffer, VariableTable, MatchPos,
+               CurrentMatchLen, Req, Diags);
+
+    // move start point after the match
+    LastMatchEnd += MatchPos + CurrentMatchLen;
   }
-  PrintMatch(true, SM, *this, MatchBuffer, VariableTable, MatchPos, MatchLen, Req);
+  // Full match len counts from first match pos.
+  MatchLen = LastMatchEnd - FirstMatchPos;
 
   // Similar to the above, in "label-scan mode" we can't yet handle CHECK-NEXT
   // or CHECK-NOT
   if (!IsLabelScanMode) {
+    size_t MatchPos = FirstMatchPos - LastPos;
+    StringRef MatchBuffer = Buffer.substr(LastPos);
     StringRef SkippedRegion = Buffer.substr(LastPos, MatchPos);
 
     // If this check is a "CHECK-NEXT", verify that the previous match was on
     // the previous line (i.e. that there is one newline between them).
-    if (CheckNext(SM, SkippedRegion))
+    if (CheckNext(SM, SkippedRegion)) {
+      ProcessMatchResult(FileCheckDiag::MatchFoundButWrongLine, SM, Loc,
+                         Pat.getCheckTy(), MatchBuffer, MatchPos, MatchLen,
+                         Diags, Req.Verbose);
       return StringRef::npos;
+    }
 
     // If this check is a "CHECK-SAME", verify that the previous match was on
     // the same line (i.e. that there is no newline between them).
-    if (CheckSame(SM, SkippedRegion))
+    if (CheckSame(SM, SkippedRegion)) {
+      ProcessMatchResult(FileCheckDiag::MatchFoundButWrongLine, SM, Loc,
+                         Pat.getCheckTy(), MatchBuffer, MatchPos, MatchLen,
+                         Diags, Req.Verbose);
       return StringRef::npos;
+    }
 
     // If this match had "not strings", verify that they don't exist in the
     // skipped region.
-    if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable, Req))
+    if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable, Req, Diags))
       return StringRef::npos;
   }
 
-  return LastPos + MatchPos;
+  return FirstMatchPos;
 }
 
 /// Verify there is a single line in the given buffer.
@@ -1061,10 +1162,11 @@ bool FileCheckString::CheckSame(const SourceMgr &SM, StringRef Buffer) const {
 }
 
 /// Verify there's no "not strings" in the given buffer.
-bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer,
-                           const std::vector<const FileCheckPattern *> &NotStrings,
-                           StringMap<StringRef> &VariableTable,
-                           const FileCheckRequest &Req) const {
+bool FileCheckString::CheckNot(
+    const SourceMgr &SM, StringRef Buffer,
+    const std::vector<const FileCheckPattern *> &NotStrings,
+    StringMap<StringRef> &VariableTable, const FileCheckRequest &Req,
+    std::vector<FileCheckDiag> *Diags) const {
   for (const FileCheckPattern *Pat : NotStrings) {
     assert((Pat->getCheckTy() == Check::CheckNot) && "Expect CHECK-NOT!");
 
@@ -1072,13 +1174,13 @@ bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer,
     size_t Pos = Pat->Match(Buffer, MatchLen, VariableTable);
 
     if (Pos == StringRef::npos) {
-      PrintNoMatch(false, SM, Prefix, Pat->getLoc(), *Pat, Buffer,
-                   VariableTable, Req.VerboseVerbose);
+      PrintNoMatch(false, SM, Prefix, Pat->getLoc(), *Pat, 1, Buffer,
+                   VariableTable, Req.VerboseVerbose, Diags);
       continue;
     }
 
-    PrintMatch(false, SM, Prefix, Pat->getLoc(), *Pat, Buffer, VariableTable,
-               Pos, MatchLen, Req);
+    PrintMatch(false, SM, Prefix, Pat->getLoc(), *Pat, 1, Buffer, VariableTable,
+               Pos, MatchLen, Req, Diags);
 
     return true;
   }
@@ -1087,10 +1189,12 @@ bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer,
 }
 
 /// Match "dag strings" and their mixed "not strings".
-size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
-                             std::vector<const FileCheckPattern *> &NotStrings,
-                             StringMap<StringRef> &VariableTable,
-                             const FileCheckRequest &Req) const {
+size_t
+FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
+                          std::vector<const FileCheckPattern *> &NotStrings,
+                          StringMap<StringRef> &VariableTable,
+                          const FileCheckRequest &Req,
+                          std::vector<FileCheckDiag> *Diags) const {
   if (DagNotStrings.empty())
     return 0;
 
@@ -1133,15 +1237,15 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
       // With a group of CHECK-DAGs, a single mismatching means the match on
       // that group of CHECK-DAGs fails immediately.
       if (MatchPosBuf == StringRef::npos) {
-        PrintNoMatch(true, SM, Prefix, Pat.getLoc(), Pat, MatchBuffer,
-                     VariableTable, Req.VerboseVerbose);
+        PrintNoMatch(true, SM, Prefix, Pat.getLoc(), Pat, 1, MatchBuffer,
+                     VariableTable, Req.VerboseVerbose, Diags);
         return StringRef::npos;
       }
       // Re-calc it as the offset relative to the start of the original string.
       MatchPos += MatchPosBuf;
       if (Req.VerboseVerbose)
-        PrintMatch(true, SM, Prefix, Pat.getLoc(), Pat, Buffer, VariableTable,
-                   MatchPos, MatchLen, Req);
+        PrintMatch(true, SM, Prefix, Pat.getLoc(), Pat, 1, Buffer,
+                   VariableTable, MatchPos, MatchLen, Req, Diags);
       MatchRange M{MatchPos, MatchPos + MatchLen};
       if (Req.AllowDeprecatedDagOverlap) {
         // We don't need to track all matches in this mode, so we just maintain
@@ -1178,12 +1282,14 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
         SM.PrintMessage(OldStart, SourceMgr::DK_Note,
                         "match discarded, overlaps earlier DAG match here",
                         {OldRange});
+        if (Diags)
+          Diags->rbegin()->MatchTy = FileCheckDiag::MatchFoundButDiscarded;
       }
       MatchPos = MI->End;
     }
     if (!Req.VerboseVerbose)
-      PrintMatch(true, SM, Prefix, Pat.getLoc(), Pat, Buffer, VariableTable,
-                 MatchPos, MatchLen, Req);
+      PrintMatch(true, SM, Prefix, Pat.getLoc(), Pat, 1, Buffer, VariableTable,
+                 MatchPos, MatchLen, Req, Diags);
 
     // Handle the end of a CHECK-DAG group.
     if (std::next(PatItr) == PatEnd ||
@@ -1194,7 +1300,7 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
         // region.
         StringRef SkippedRegion =
             Buffer.slice(StartPos, MatchRanges.begin()->Pos);
-        if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable, Req))
+        if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable, Req, Diags))
           return StringRef::npos;
         // Clear "not strings".
         NotStrings.clear();
@@ -1275,7 +1381,8 @@ static void ClearLocalVars(StringMap<StringRef> &VariableTable) {
 ///
 /// Returns false if the input fails to satisfy the checks.
 bool llvm::FileCheck::CheckInput(SourceMgr &SM, StringRef Buffer,
-                ArrayRef<FileCheckString> CheckStrings) {
+                                 ArrayRef<FileCheckString> CheckStrings,
+                                 std::vector<FileCheckDiag> *Diags) {
   bool ChecksFailed = false;
 
   /// VariableTable - This holds all the current filecheck variables.
@@ -1298,9 +1405,8 @@ bool llvm::FileCheck::CheckInput(SourceMgr &SM, StringRef Buffer,
 
       // Scan to next CHECK-LABEL match, ignoring CHECK-NOT and CHECK-DAG
       size_t MatchLabelLen = 0;
-      size_t MatchLabelPos =
-          CheckLabelStr.Check(SM, Buffer, true, MatchLabelLen, VariableTable,
-                              Req);
+      size_t MatchLabelPos = CheckLabelStr.Check(
+          SM, Buffer, true, MatchLabelLen, VariableTable, Req, Diags);
       if (MatchLabelPos == StringRef::npos)
         // Immediately bail of CHECK-LABEL fails, nothing else we can do.
         return false;
@@ -1319,8 +1425,8 @@ bool llvm::FileCheck::CheckInput(SourceMgr &SM, StringRef Buffer,
       // Check each string within the scanned region, including a second check
       // of any final CHECK-LABEL (to verify CHECK-NOT and CHECK-DAG)
       size_t MatchLen = 0;
-      size_t MatchPos =
-          CheckStr.Check(SM, CheckRegion, false, MatchLen, VariableTable, Req);
+      size_t MatchPos = CheckStr.Check(SM, CheckRegion, false, MatchLen,
+                                       VariableTable, Req, Diags);
 
       if (MatchPos == StringRef::npos) {
         ChecksFailed = true;
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 91e98a33b371e4db035ba21bf7e5ebe7be83e9aa..d5a688c7fb9b0b1ca5aed00d54122117c564c1af 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -211,6 +211,17 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
     }
   }
 
+  if (Implementer == "0x48") // HiSilicon Technologies, Inc.
+    // Look for the CPU part line.
+    for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+      if (Lines[I].startswith("CPU part"))
+        // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+        // values correspond to the "Part number" in the CP15/c0 register. The
+        // contents are specified in the various processor manuals.
+        return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+          .Case("0xd01", "tsv110")
+          .Default("generic");
+
   if (Implementer == "0x51") // Qualcomm Technologies, Inc.
     // Look for the CPU part line.
     for (unsigned I = 0, E = Lines.size(); I != E; ++I)
@@ -679,12 +690,24 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
       break;
 
     default: // Unknown family 6 CPU, try to guess.
+      if (Features & (1 << X86::FEATURE_AVX512VBMI2)) {
+        *Type = X86::INTEL_COREI7;
+        *Subtype = X86::INTEL_COREI7_ICELAKE_CLIENT;
+        break;
+      }
+
       if (Features & (1 << X86::FEATURE_AVX512VBMI)) {
         *Type = X86::INTEL_COREI7;
         *Subtype = X86::INTEL_COREI7_CANNONLAKE;
         break;
       }
 
+      if (Features2 & (1 << (X86::FEATURE_AVX512VNNI - 32))) {
+        *Type = X86::INTEL_COREI7;
+        *Subtype = X86::INTEL_COREI7_CASCADELAKE;
+        break;
+      }
+
       if (Features & (1 << X86::FEATURE_AVX512VL)) {
         *Type = X86::INTEL_COREI7;
         *Subtype = X86::INTEL_COREI7_SKYLAKE_AVX512;
@@ -886,11 +909,11 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
 
   auto setFeature = [&](unsigned F) {
     if (F < 32)
-      Features |= 1 << F;
+      Features |= 1U << (F & 0x1f);
     else if (F < 64)
-      Features2 |= 1 << (F - 32);
+      Features2 |= 1U << ((F - 32) & 0x1f);
     else if (F < 96)
-      Features3 |= 1 << (F - 64);
+      Features3 |= 1U << ((F - 64) & 0x1f);
     else
       llvm_unreachable("Unexpected FeatureBit");
   };
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index 7eba5962774d0e3e01f0f8153c5c9e16b0a90ec4..a3e6941bd62277d69c41ae9807d8ffc2fea3739d 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -533,7 +533,7 @@ void replace_path_prefix(SmallVectorImpl<char> &Path,
 
   // If prefixes have the same size we can simply copy the new one over.
   if (OldPrefix.size() == NewPrefix.size()) {
-    std::copy(NewPrefix.begin(), NewPrefix.end(), Path.begin());
+    llvm::copy(NewPrefix, Path.begin());
     return;
   }
 
diff --git a/lib/Support/RandomNumberGenerator.cpp b/lib/Support/RandomNumberGenerator.cpp
index f1f22af82a81579e991a81e8f8c09a91c5305695..df0d87fab021b558cf58ac095a0e985bcf096b69 100644
--- a/lib/Support/RandomNumberGenerator.cpp
+++ b/lib/Support/RandomNumberGenerator.cpp
@@ -49,7 +49,7 @@ RandomNumberGenerator::RandomNumberGenerator(StringRef Salt) {
   Data[0] = Seed;
   Data[1] = Seed >> 32;
 
-  std::copy(Salt.begin(), Salt.end(), Data.begin() + 2);
+  llvm::copy(Salt, Data.begin() + 2);
 
   std::seed_seq SeedSeq(Data.begin(), Data.end());
   Generator.seed(SeedSeq);
diff --git a/lib/Support/Signals.cpp b/lib/Support/Signals.cpp
index 6534ff69b84cb3b2352091b837d14b7f7ac8fcaf..333f492d4589489840567abc58db7699c18b6994 100644
--- a/lib/Support/Signals.cpp
+++ b/lib/Support/Signals.cpp
@@ -20,6 +20,8 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/FormatAdapters.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Mutex.h"
@@ -155,7 +157,7 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
   }
 
   Optional<StringRef> Redirects[] = {StringRef(InputFile),
-                                     StringRef(OutputFile), llvm::None};
+                                     StringRef(OutputFile), StringRef("")};
   StringRef Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining",
 #ifdef _WIN32
                       // Pass --relative-address on Windows so that we don't
@@ -180,8 +182,14 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
   auto CurLine = Lines.begin();
   int frame_no = 0;
   for (int i = 0; i < Depth; i++) {
+    auto PrintLineHeader = [&]() {
+      OS << right_justify(formatv("#{0}", frame_no++).str(),
+                          std::log10(Depth) + 2)
+         << ' ' << format_ptr(StackTrace[i]) << ' ';
+    };
     if (!Modules[i]) {
-      OS << '#' << frame_no++ << ' ' << format_ptr(StackTrace[i]) << '\n';
+      PrintLineHeader();
+      OS << '\n';
       continue;
     }
     // Read pairs of lines (function name and file/line info) until we
@@ -192,7 +200,7 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
       StringRef FunctionName = *CurLine++;
       if (FunctionName.empty())
         break;
-      OS << '#' << frame_no++ << ' ' << format_ptr(StackTrace[i]) << ' ';
+      PrintLineHeader();
       if (!FunctionName.startswith("??"))
         OS << FunctionName << ' ';
       if (CurLine == Lines.end())
diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp
index 968b559c08dfa5a268347faa4d6e6f2c1c97987e..bdc0dc52c5e2334b4d4fb6d8e68618eb38457013 100644
--- a/lib/Support/TargetParser.cpp
+++ b/lib/Support/TargetParser.cpp
@@ -17,944 +17,12 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
-#include <cctype>
 
 using namespace llvm;
-using namespace ARM;
-using namespace AArch64;
 using namespace AMDGPU;
 
 namespace {
 
-// List of canonical FPU names (use getFPUSynonym) and which architectural
-// features they correspond to (use getFPUFeatures).
-// FIXME: TableGen this.
-// The entries must appear in the order listed in ARM::FPUKind for correct indexing
-static const struct {
-  const char *NameCStr;
-  size_t NameLength;
-  ARM::FPUKind ID;
-  ARM::FPUVersion FPUVersion;
-  ARM::NeonSupportLevel NeonSupport;
-  ARM::FPURestriction Restriction;
-
-  StringRef getName() const { return StringRef(NameCStr, NameLength); }
-} FPUNames[] = {
-#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) \
-  { NAME, sizeof(NAME) - 1, KIND, VERSION, NEON_SUPPORT, RESTRICTION },
-#include "llvm/Support/ARMTargetParser.def"
-};
-
-// List of canonical arch names (use getArchSynonym).
-// This table also provides the build attribute fields for CPU arch
-// and Arch ID, according to the Addenda to the ARM ABI, chapters
-// 2.4 and 2.3.5.2 respectively.
-// FIXME: SubArch values were simplified to fit into the expectations
-// of the triples and are not conforming with their official names.
-// Check to see if the expectation should be changed.
-// FIXME: TableGen this.
-template <typename T> struct ArchNames {
-  const char *NameCStr;
-  size_t NameLength;
-  const char *CPUAttrCStr;
-  size_t CPUAttrLength;
-  const char *SubArchCStr;
-  size_t SubArchLength;
-  unsigned DefaultFPU;
-  unsigned ArchBaseExtensions;
-  T ID;
-  ARMBuildAttrs::CPUArch ArchAttr; // Arch ID in build attributes.
-
-  StringRef getName() const { return StringRef(NameCStr, NameLength); }
-
-  // CPU class in build attributes.
-  StringRef getCPUAttr() const { return StringRef(CPUAttrCStr, CPUAttrLength); }
-
-  // Sub-Arch name.
-  StringRef getSubArch() const { return StringRef(SubArchCStr, SubArchLength); }
-};
-ArchNames<ARM::ArchKind> ARCHNames[] = {
-#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT)       \
-  {NAME, sizeof(NAME) - 1, CPU_ATTR, sizeof(CPU_ATTR) - 1, SUB_ARCH,       \
-   sizeof(SUB_ARCH) - 1, ARCH_FPU, ARCH_BASE_EXT, ARM::ArchKind::ID, ARCH_ATTR},
-#include "llvm/Support/ARMTargetParser.def"
-};
-
-ArchNames<AArch64::ArchKind> AArch64ARCHNames[] = {
- #define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT)       \
-   {NAME, sizeof(NAME) - 1, CPU_ATTR, sizeof(CPU_ATTR) - 1, SUB_ARCH,       \
-    sizeof(SUB_ARCH) - 1, ARCH_FPU, ARCH_BASE_EXT, AArch64::ArchKind::ID, ARCH_ATTR},
- #include "llvm/Support/AArch64TargetParser.def"
- };
-
-
-// List of Arch Extension names.
-// FIXME: TableGen this.
-static const struct {
-  const char *NameCStr;
-  size_t NameLength;
-  unsigned ID;
-  const char *Feature;
-  const char *NegFeature;
-
-  StringRef getName() const { return StringRef(NameCStr, NameLength); }
-} ARCHExtNames[] = {
-#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \
-  { NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE },
-#include "llvm/Support/ARMTargetParser.def"
-},AArch64ARCHExtNames[] = {
-#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \
-  { NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE },
-#include "llvm/Support/AArch64TargetParser.def"
-};
-
-// List of HWDiv names (use getHWDivSynonym) and which architectural
-// features they correspond to (use getHWDivFeatures).
-// FIXME: TableGen this.
-static const struct {
-  const char *NameCStr;
-  size_t NameLength;
-  unsigned ID;
-
-  StringRef getName() const { return StringRef(NameCStr, NameLength); }
-} HWDivNames[] = {
-#define ARM_HW_DIV_NAME(NAME, ID) { NAME, sizeof(NAME) - 1, ID },
-#include "llvm/Support/ARMTargetParser.def"
-};
-
-// List of CPU names and their arches.
-// The same CPU can have multiple arches and can be default on multiple arches.
-// When finding the Arch for a CPU, first-found prevails. Sort them accordingly.
-// When this becomes table-generated, we'd probably need two tables.
-// FIXME: TableGen this.
-template <typename T> struct CpuNames {
-  const char *NameCStr;
-  size_t NameLength;
-  T ArchID;
-  bool Default; // is $Name the default CPU for $ArchID ?
-  unsigned DefaultExtensions;
-
-  StringRef getName() const { return StringRef(NameCStr, NameLength); }
-};
-CpuNames<ARM::ArchKind> CPUNames[] = {
-#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
-  { NAME, sizeof(NAME) - 1, ARM::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT },
-#include "llvm/Support/ARMTargetParser.def"
-};
-
-CpuNames<AArch64::ArchKind> AArch64CPUNames[] = {
- #define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
-   { NAME, sizeof(NAME) - 1, AArch64::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT },
- #include "llvm/Support/AArch64TargetParser.def"
- };
-
-} // namespace
-
-// ======================================================= //
-// Information by ID
-// ======================================================= //
-
-StringRef ARM::getFPUName(unsigned FPUKind) {
-  if (FPUKind >= ARM::FK_LAST)
-    return StringRef();
-  return FPUNames[FPUKind].getName();
-}
-
-FPUVersion ARM::getFPUVersion(unsigned FPUKind) {
-  if (FPUKind >= ARM::FK_LAST)
-    return FPUVersion::NONE;
-  return FPUNames[FPUKind].FPUVersion;
-}
-
-ARM::NeonSupportLevel ARM::getFPUNeonSupportLevel(unsigned FPUKind) {
-  if (FPUKind >= ARM::FK_LAST)
-    return ARM::NeonSupportLevel::None;
-  return FPUNames[FPUKind].NeonSupport;
-}
-
-ARM::FPURestriction ARM::getFPURestriction(unsigned FPUKind) {
-  if (FPUKind >= ARM::FK_LAST)
-    return ARM::FPURestriction::None;
-  return FPUNames[FPUKind].Restriction;
-}
-
-unsigned llvm::ARM::getDefaultFPU(StringRef CPU, ArchKind AK) {
-  if (CPU == "generic")
-    return ARCHNames[static_cast<unsigned>(AK)].DefaultFPU;
-
-  return StringSwitch<unsigned>(CPU)
-#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
-    .Case(NAME, DEFAULT_FPU)
-#include "llvm/Support/ARMTargetParser.def"
-    .Default(ARM::FK_INVALID);
-}
-
-unsigned llvm::ARM::getDefaultExtensions(StringRef CPU, ArchKind AK) {
-  if (CPU == "generic")
-    return ARCHNames[static_cast<unsigned>(AK)].ArchBaseExtensions;
-
-  return StringSwitch<unsigned>(CPU)
-#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
-    .Case(NAME, ARCHNames[static_cast<unsigned>(ARM::ArchKind::ID)]\
-            .ArchBaseExtensions | DEFAULT_EXT)
-#include "llvm/Support/ARMTargetParser.def"
-    .Default(ARM::AEK_INVALID);
-}
-
-bool llvm::ARM::getHWDivFeatures(unsigned HWDivKind,
-                                 std::vector<StringRef> &Features) {
-
-  if (HWDivKind == ARM::AEK_INVALID)
-    return false;
-
-  if (HWDivKind & ARM::AEK_HWDIVARM)
-    Features.push_back("+hwdiv-arm");
-  else
-    Features.push_back("-hwdiv-arm");
-
-  if (HWDivKind & ARM::AEK_HWDIVTHUMB)
-    Features.push_back("+hwdiv");
-  else
-    Features.push_back("-hwdiv");
-
-  return true;
-}
-
-bool llvm::ARM::getExtensionFeatures(unsigned Extensions,
-                                     std::vector<StringRef> &Features) {
-
-  if (Extensions == ARM::AEK_INVALID)
-    return false;
-
-  if (Extensions & ARM::AEK_CRC)
-    Features.push_back("+crc");
-  else
-    Features.push_back("-crc");
-
-  if (Extensions & ARM::AEK_DSP)
-    Features.push_back("+dsp");
-  else
-    Features.push_back("-dsp");
-
-  if (Extensions & ARM::AEK_FP16FML)
-    Features.push_back("+fp16fml");
-  else
-    Features.push_back("-fp16fml");
-
-  if (Extensions & ARM::AEK_RAS)
-    Features.push_back("+ras");
-  else
-    Features.push_back("-ras");
-
-  if (Extensions & ARM::AEK_DOTPROD)
-    Features.push_back("+dotprod");
-  else
-    Features.push_back("-dotprod");
-
-  return getHWDivFeatures(Extensions, Features);
-}
-
-bool llvm::ARM::getFPUFeatures(unsigned FPUKind,
-                               std::vector<StringRef> &Features) {
-
-  if (FPUKind >= ARM::FK_LAST || FPUKind == ARM::FK_INVALID)
-    return false;
-
-  // fp-only-sp and d16 subtarget features are independent of each other, so we
-  // must enable/disable both.
-  switch (FPUNames[FPUKind].Restriction) {
-  case ARM::FPURestriction::SP_D16:
-    Features.push_back("+fp-only-sp");
-    Features.push_back("+d16");
-    break;
-  case ARM::FPURestriction::D16:
-    Features.push_back("-fp-only-sp");
-    Features.push_back("+d16");
-    break;
-  case ARM::FPURestriction::None:
-    Features.push_back("-fp-only-sp");
-    Features.push_back("-d16");
-    break;
-  }
-
-  // FPU version subtarget features are inclusive of lower-numbered ones, so
-  // enable the one corresponding to this version and disable all that are
-  // higher. We also have to make sure to disable fp16 when vfp4 is disabled,
-  // as +vfp4 implies +fp16 but -vfp4 does not imply -fp16.
-  switch (FPUNames[FPUKind].FPUVersion) {
-  case ARM::FPUVersion::VFPV5:
-    Features.push_back("+fp-armv8");
-    break;
-  case ARM::FPUVersion::VFPV4:
-    Features.push_back("+vfp4");
-    Features.push_back("-fp-armv8");
-    break;
-  case ARM::FPUVersion::VFPV3_FP16:
-    Features.push_back("+vfp3");
-    Features.push_back("+fp16");
-    Features.push_back("-vfp4");
-    Features.push_back("-fp-armv8");
-    break;
-  case ARM::FPUVersion::VFPV3:
-    Features.push_back("+vfp3");
-    Features.push_back("-fp16");
-    Features.push_back("-vfp4");
-    Features.push_back("-fp-armv8");
-    break;
-  case ARM::FPUVersion::VFPV2:
-    Features.push_back("+vfp2");
-    Features.push_back("-vfp3");
-    Features.push_back("-fp16");
-    Features.push_back("-vfp4");
-    Features.push_back("-fp-armv8");
-    break;
-  case ARM::FPUVersion::NONE:
-    Features.push_back("-vfp2");
-    Features.push_back("-vfp3");
-    Features.push_back("-fp16");
-    Features.push_back("-vfp4");
-    Features.push_back("-fp-armv8");
-    break;
-  }
-
-  // crypto includes neon, so we handle this similarly to FPU version.
-  switch (FPUNames[FPUKind].NeonSupport) {
-  case ARM::NeonSupportLevel::Crypto:
-    Features.push_back("+neon");
-    Features.push_back("+crypto");
-    break;
-  case ARM::NeonSupportLevel::Neon:
-    Features.push_back("+neon");
-    Features.push_back("-crypto");
-    break;
-  case ARM::NeonSupportLevel::None:
-    Features.push_back("-neon");
-    Features.push_back("-crypto");
-    break;
-  }
-
-  return true;
-}
-
-StringRef llvm::ARM::getArchName(ArchKind AK) {
-  return ARCHNames[static_cast<unsigned>(AK)].getName();
-}
-
-StringRef llvm::ARM::getCPUAttr(ArchKind AK) {
-  return ARCHNames[static_cast<unsigned>(AK)].getCPUAttr();
-}
-
-StringRef llvm::ARM::getSubArch(ArchKind AK) {
-  return ARCHNames[static_cast<unsigned>(AK)].getSubArch();
-}
-
-unsigned llvm::ARM::getArchAttr(ArchKind AK) {
-  return ARCHNames[static_cast<unsigned>(AK)].ArchAttr;
-}
-
-StringRef llvm::ARM::getArchExtName(unsigned ArchExtKind) {
-  for (const auto AE : ARCHExtNames) {
-    if (ArchExtKind == AE.ID)
-      return AE.getName();
-  }
-  return StringRef();
-}
-
-StringRef llvm::ARM::getArchExtFeature(StringRef ArchExt) {
-  if (ArchExt.startswith("no")) {
-    StringRef ArchExtBase(ArchExt.substr(2));
-    for (const auto AE : ARCHExtNames) {
-      if (AE.NegFeature && ArchExtBase == AE.getName())
-        return StringRef(AE.NegFeature);
-    }
-  }
-  for (const auto AE : ARCHExtNames) {
-    if (AE.Feature && ArchExt == AE.getName())
-      return StringRef(AE.Feature);
-  }
-
-  return StringRef();
-}
-
-StringRef llvm::ARM::getHWDivName(unsigned HWDivKind) {
-  for (const auto D : HWDivNames) {
-    if (HWDivKind == D.ID)
-      return D.getName();
-  }
-  return StringRef();
-}
-
-StringRef llvm::ARM::getDefaultCPU(StringRef Arch) {
-  ArchKind AK = parseArch(Arch);
-  if (AK == ARM::ArchKind::INVALID)
-    return StringRef();
-
-  // Look for multiple AKs to find the default for pair AK+Name.
-  for (const auto CPU : CPUNames) {
-    if (CPU.ArchID == AK && CPU.Default)
-      return CPU.getName();
-  }
-
-  // If we can't find a default then target the architecture instead
-  return "generic";
-}
-
-StringRef llvm::AArch64::getFPUName(unsigned FPUKind) {
-  return ARM::getFPUName(FPUKind);
-}
-
-ARM::FPUVersion AArch64::getFPUVersion(unsigned FPUKind) {
-  return ARM::getFPUVersion(FPUKind);
-}
-
-ARM::NeonSupportLevel AArch64::getFPUNeonSupportLevel(unsigned FPUKind) {
-  return ARM::getFPUNeonSupportLevel( FPUKind);
-}
-
-ARM::FPURestriction AArch64::getFPURestriction(unsigned FPUKind) {
-  return ARM::getFPURestriction(FPUKind);
-}
-
-unsigned llvm::AArch64::getDefaultFPU(StringRef CPU, ArchKind AK) {
-  if (CPU == "generic")
-    return AArch64ARCHNames[static_cast<unsigned>(AK)].DefaultFPU;
-
-  return StringSwitch<unsigned>(CPU)
-#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
-    .Case(NAME, DEFAULT_FPU)
-#include "llvm/Support/AArch64TargetParser.def"
-    .Default(ARM::FK_INVALID);
-}
-
-unsigned llvm::AArch64::getDefaultExtensions(StringRef CPU, ArchKind AK) {
-  if (CPU == "generic")
-    return AArch64ARCHNames[static_cast<unsigned>(AK)].ArchBaseExtensions;
-
-  return StringSwitch<unsigned>(CPU)
-#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)       \
-  .Case(NAME,                                                                  \
-        AArch64ARCHNames[static_cast<unsigned>(AArch64::ArchKind::ID)] \
-            .ArchBaseExtensions | \
-            DEFAULT_EXT)
-#include "llvm/Support/AArch64TargetParser.def"
-    .Default(AArch64::AEK_INVALID);
-}
-
-AArch64::ArchKind llvm::AArch64::getCPUArchKind(StringRef CPU) {
-  if (CPU == "generic")
-    return AArch64::ArchKind::ARMV8A;
-
-  return StringSwitch<AArch64::ArchKind>(CPU)
-#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
-  .Case(NAME, AArch64::ArchKind:: ID)
-#include "llvm/Support/AArch64TargetParser.def"
-    .Default(AArch64::ArchKind::INVALID);
-}
-
-bool llvm::AArch64::getExtensionFeatures(unsigned Extensions,
-                                     std::vector<StringRef> &Features) {
-
-  if (Extensions == AArch64::AEK_INVALID)
-    return false;
-
-  if (Extensions & AArch64::AEK_FP)
-    Features.push_back("+fp-armv8");
-  if (Extensions & AArch64::AEK_SIMD)
-    Features.push_back("+neon");
-  if (Extensions & AArch64::AEK_CRC)
-    Features.push_back("+crc");
-  if (Extensions & AArch64::AEK_CRYPTO)
-    Features.push_back("+crypto");
-  if (Extensions & AArch64::AEK_DOTPROD)
-    Features.push_back("+dotprod");
-  if (Extensions & AArch64::AEK_FP16FML)
-    Features.push_back("+fp16fml");
-  if (Extensions & AArch64::AEK_FP16)
-    Features.push_back("+fullfp16");
-  if (Extensions & AArch64::AEK_PROFILE)
-    Features.push_back("+spe");
-  if (Extensions & AArch64::AEK_RAS)
-    Features.push_back("+ras");
-  if (Extensions & AArch64::AEK_LSE)
-    Features.push_back("+lse");
-  if (Extensions & AArch64::AEK_RDM)
-    Features.push_back("+rdm");
-  if (Extensions & AArch64::AEK_SVE)
-    Features.push_back("+sve");
-  if (Extensions & AArch64::AEK_RCPC)
-    Features.push_back("+rcpc");
-
-  return true;
-}
-
-bool llvm::AArch64::getFPUFeatures(unsigned FPUKind,
-                               std::vector<StringRef> &Features) {
-  return ARM::getFPUFeatures(FPUKind, Features);
-}
-
-bool llvm::AArch64::getArchFeatures(AArch64::ArchKind AK,
-                                    std::vector<StringRef> &Features) {
-  if (AK == AArch64::ArchKind::ARMV8_1A)
-    Features.push_back("+v8.1a");
-  if (AK == AArch64::ArchKind::ARMV8_2A)
-    Features.push_back("+v8.2a");
-  if (AK == AArch64::ArchKind::ARMV8_3A)
-    Features.push_back("+v8.3a");
-  if (AK == AArch64::ArchKind::ARMV8_4A)
-    Features.push_back("+v8.4a");
-  if (AK == AArch64::ArchKind::ARMV8_5A)
-    Features.push_back("+v8.5a");
-
-  return AK != AArch64::ArchKind::INVALID;
-}
-
-StringRef llvm::AArch64::getArchName(ArchKind AK) {
-  return AArch64ARCHNames[static_cast<unsigned>(AK)].getName();
-}
-
-StringRef llvm::AArch64::getCPUAttr(ArchKind AK) {
-  return AArch64ARCHNames[static_cast<unsigned>(AK)].getCPUAttr();
-}
-
-StringRef llvm::AArch64::getSubArch(ArchKind AK) {
-  return AArch64ARCHNames[static_cast<unsigned>(AK)].getSubArch();
-}
-
-unsigned llvm::AArch64::getArchAttr(ArchKind AK) {
-  return AArch64ARCHNames[static_cast<unsigned>(AK)].ArchAttr;
-}
-
-StringRef llvm::AArch64::getArchExtName(unsigned ArchExtKind) {
-  for (const auto &AE : AArch64ARCHExtNames)
-    if (ArchExtKind == AE.ID)
-      return AE.getName();
-  return StringRef();
-}
-
-StringRef llvm::AArch64::getArchExtFeature(StringRef ArchExt) {
-  if (ArchExt.startswith("no")) {
-    StringRef ArchExtBase(ArchExt.substr(2));
-    for (const auto &AE : AArch64ARCHExtNames) {
-      if (AE.NegFeature && ArchExtBase == AE.getName())
-        return StringRef(AE.NegFeature);
-    }
-  }
-
-  for (const auto &AE : AArch64ARCHExtNames)
-    if (AE.Feature && ArchExt == AE.getName())
-      return StringRef(AE.Feature);
-  return StringRef();
-}
-
-StringRef llvm::AArch64::getDefaultCPU(StringRef Arch) {
-  AArch64::ArchKind AK = parseArch(Arch);
-  if (AK == ArchKind::INVALID)
-    return StringRef();
-
-  // Look for multiple AKs to find the default for pair AK+Name.
-  for (const auto &CPU : AArch64CPUNames)
-    if (CPU.ArchID == AK && CPU.Default)
-      return CPU.getName();
-
-  // If we can't find a default then target the architecture instead
-  return "generic";
-}
-
-unsigned llvm::AArch64::checkArchVersion(StringRef Arch) {
-  if (Arch.size() >= 2 && Arch[0] == 'v' && std::isdigit(Arch[1]))
-    return (Arch[1] - 48);
-  return 0;
-}
-
-// ======================================================= //
-// Parsers
-// ======================================================= //
-
-static StringRef getHWDivSynonym(StringRef HWDiv) {
-  return StringSwitch<StringRef>(HWDiv)
-      .Case("thumb,arm", "arm,thumb")
-      .Default(HWDiv);
-}
-
-static StringRef getFPUSynonym(StringRef FPU) {
-  return StringSwitch<StringRef>(FPU)
-      .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported
-      .Case("vfp2", "vfpv2")
-      .Case("vfp3", "vfpv3")
-      .Case("vfp4", "vfpv4")
-      .Case("vfp3-d16", "vfpv3-d16")
-      .Case("vfp4-d16", "vfpv4-d16")
-      .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16")
-      .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16")
-      .Case("fp5-sp-d16", "fpv5-sp-d16")
-      .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16")
-      // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3.
-      .Case("neon-vfpv3", "neon")
-      .Default(FPU);
-}
-
-static StringRef getArchSynonym(StringRef Arch) {
-  return StringSwitch<StringRef>(Arch)
-      .Case("v5", "v5t")
-      .Case("v5e", "v5te")
-      .Case("v6j", "v6")
-      .Case("v6hl", "v6k")
-      .Cases("v6m", "v6sm", "v6s-m", "v6-m")
-      .Cases("v6z", "v6zk", "v6kz")
-      .Cases("v7", "v7a", "v7hl", "v7l", "v7-a")
-      .Case("v7r", "v7-r")
-      .Case("v7m", "v7-m")
-      .Case("v7em", "v7e-m")
-      .Cases("v8", "v8a", "v8l", "aarch64", "arm64", "v8-a")
-      .Case("v8.1a", "v8.1-a")
-      .Case("v8.2a", "v8.2-a")
-      .Case("v8.3a", "v8.3-a")
-      .Case("v8.4a", "v8.4-a")
-      .Case("v8.5a", "v8.5-a")
-      .Case("v8r", "v8-r")
-      .Case("v8m.base", "v8-m.base")
-      .Case("v8m.main", "v8-m.main")
-      .Default(Arch);
-}
-
-// MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but
-// (iwmmxt|xscale)(eb)? is also permitted. If the former, return
-// "v.+", if the latter, return unmodified string, minus 'eb'.
-// If invalid, return empty string.
-StringRef llvm::ARM::getCanonicalArchName(StringRef Arch) {
-  size_t offset = StringRef::npos;
-  StringRef A = Arch;
-  StringRef Error = "";
-
-  // Begins with "arm" / "thumb", move past it.
-  if (A.startswith("arm64"))
-    offset = 5;
-  else if (A.startswith("arm"))
-    offset = 3;
-  else if (A.startswith("thumb"))
-    offset = 5;
-  else if (A.startswith("aarch64")) {
-    offset = 7;
-    // AArch64 uses "_be", not "eb" suffix.
-    if (A.find("eb") != StringRef::npos)
-      return Error;
-    if (A.substr(offset, 3) == "_be")
-      offset += 3;
-  }
-
-  // Ex. "armebv7", move past the "eb".
-  if (offset != StringRef::npos && A.substr(offset, 2) == "eb")
-    offset += 2;
-  // Or, if it ends with eb ("armv7eb"), chop it off.
-  else if (A.endswith("eb"))
-    A = A.substr(0, A.size() - 2);
-  // Trim the head
-  if (offset != StringRef::npos)
-    A = A.substr(offset);
-
-  // Empty string means offset reached the end, which means it's valid.
-  if (A.empty())
-    return Arch;
-
-  // Only match non-marketing names
-  if (offset != StringRef::npos) {
-    // Must start with 'vN'.
-    if (A.size() >= 2 && (A[0] != 'v' || !std::isdigit(A[1])))
-      return Error;
-    // Can't have an extra 'eb'.
-    if (A.find("eb") != StringRef::npos)
-      return Error;
-  }
-
-  // Arch will either be a 'v' name (v7a) or a marketing name (xscale).
-  return A;
-}
-
-unsigned llvm::ARM::parseHWDiv(StringRef HWDiv) {
-  StringRef Syn = getHWDivSynonym(HWDiv);
-  for (const auto D : HWDivNames) {
-    if (Syn == D.getName())
-      return D.ID;
-  }
-  return ARM::AEK_INVALID;
-}
-
-unsigned llvm::ARM::parseFPU(StringRef FPU) {
-  StringRef Syn = getFPUSynonym(FPU);
-  for (const auto F : FPUNames) {
-    if (Syn == F.getName())
-      return F.ID;
-  }
-  return ARM::FK_INVALID;
-}
-
-// Allows partial match, ex. "v7a" matches "armv7a".
-ARM::ArchKind ARM::parseArch(StringRef Arch) {
-  Arch = getCanonicalArchName(Arch);
-  StringRef Syn = getArchSynonym(Arch);
-  for (const auto A : ARCHNames) {
-    if (A.getName().endswith(Syn))
-      return A.ID;
-  }
-  return ARM::ArchKind::INVALID;
-}
-
-unsigned llvm::ARM::parseArchExt(StringRef ArchExt) {
-  for (const auto A : ARCHExtNames) {
-    if (ArchExt == A.getName())
-      return A.ID;
-  }
-  return ARM::AEK_INVALID;
-}
-
-ARM::ArchKind llvm::ARM::parseCPUArch(StringRef CPU) {
-  for (const auto C : CPUNames) {
-    if (CPU == C.getName())
-      return C.ArchID;
-  }
-  return ARM::ArchKind::INVALID;
-}
-
-void llvm::ARM::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) {
-  for (const CpuNames<ARM::ArchKind> &Arch : CPUNames) {
-    if (Arch.ArchID != ARM::ArchKind::INVALID)
-      Values.push_back(Arch.getName());
-  }
-}
-
-void llvm::AArch64::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) {
-  for (const CpuNames<AArch64::ArchKind> &Arch : AArch64CPUNames) {
-    if (Arch.ArchID != AArch64::ArchKind::INVALID)
-      Values.push_back(Arch.getName());
-  }
-}
-
-// ARM, Thumb, AArch64
-ARM::ISAKind ARM::parseArchISA(StringRef Arch) {
-  return StringSwitch<ARM::ISAKind>(Arch)
-      .StartsWith("aarch64", ARM::ISAKind::AARCH64)
-      .StartsWith("arm64", ARM::ISAKind::AARCH64)
-      .StartsWith("thumb", ARM::ISAKind::THUMB)
-      .StartsWith("arm", ARM::ISAKind::ARM)
-      .Default(ARM::ISAKind::INVALID);
-}
-
-// Little/Big endian
-ARM::EndianKind ARM::parseArchEndian(StringRef Arch) {
-  if (Arch.startswith("armeb") || Arch.startswith("thumbeb") ||
-      Arch.startswith("aarch64_be"))
-    return ARM::EndianKind::BIG;
-
-  if (Arch.startswith("arm") || Arch.startswith("thumb")) {
-    if (Arch.endswith("eb"))
-      return ARM::EndianKind::BIG;
-    else
-      return ARM::EndianKind::LITTLE;
-  }
-
-  if (Arch.startswith("aarch64"))
-    return ARM::EndianKind::LITTLE;
-
-  return ARM::EndianKind::INVALID;
-}
-
-// Profile A/R/M
-ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
-  Arch = getCanonicalArchName(Arch);
-  switch (parseArch(Arch)) {
-  case ARM::ArchKind::ARMV6M:
-  case ARM::ArchKind::ARMV7M:
-  case ARM::ArchKind::ARMV7EM:
-  case ARM::ArchKind::ARMV8MMainline:
-  case ARM::ArchKind::ARMV8MBaseline:
-    return ARM::ProfileKind::M;
-  case ARM::ArchKind::ARMV7R:
-  case ARM::ArchKind::ARMV8R:
-    return ARM::ProfileKind::R;
-  case ARM::ArchKind::ARMV7A:
-  case ARM::ArchKind::ARMV7VE:
-  case ARM::ArchKind::ARMV7K:
-  case ARM::ArchKind::ARMV8A:
-  case ARM::ArchKind::ARMV8_1A:
-  case ARM::ArchKind::ARMV8_2A:
-  case ARM::ArchKind::ARMV8_3A:
-  case ARM::ArchKind::ARMV8_4A:
-  case ARM::ArchKind::ARMV8_5A:
-    return ARM::ProfileKind::A;
-  case ARM::ArchKind::ARMV2:
-  case ARM::ArchKind::ARMV2A:
-  case ARM::ArchKind::ARMV3:
-  case ARM::ArchKind::ARMV3M:
-  case ARM::ArchKind::ARMV4:
-  case ARM::ArchKind::ARMV4T:
-  case ARM::ArchKind::ARMV5T:
-  case ARM::ArchKind::ARMV5TE:
-  case ARM::ArchKind::ARMV5TEJ:
-  case ARM::ArchKind::ARMV6:
-  case ARM::ArchKind::ARMV6K:
-  case ARM::ArchKind::ARMV6T2:
-  case ARM::ArchKind::ARMV6KZ:
-  case ARM::ArchKind::ARMV7S:
-  case ARM::ArchKind::IWMMXT:
-  case ARM::ArchKind::IWMMXT2:
-  case ARM::ArchKind::XSCALE:
-  case ARM::ArchKind::INVALID:
-    return ARM::ProfileKind::INVALID;
-  }
-  llvm_unreachable("Unhandled architecture");
-}
-
-// Version number (ex. v7 = 7).
-unsigned llvm::ARM::parseArchVersion(StringRef Arch) {
-  Arch = getCanonicalArchName(Arch);
-  switch (parseArch(Arch)) {
-  case ARM::ArchKind::ARMV2:
-  case ARM::ArchKind::ARMV2A:
-    return 2;
-  case ARM::ArchKind::ARMV3:
-  case ARM::ArchKind::ARMV3M:
-    return 3;
-  case ARM::ArchKind::ARMV4:
-  case ARM::ArchKind::ARMV4T:
-    return 4;
-  case ARM::ArchKind::ARMV5T:
-  case ARM::ArchKind::ARMV5TE:
-  case ARM::ArchKind::IWMMXT:
-  case ARM::ArchKind::IWMMXT2:
-  case ARM::ArchKind::XSCALE:
-  case ARM::ArchKind::ARMV5TEJ:
-    return 5;
-  case ARM::ArchKind::ARMV6:
-  case ARM::ArchKind::ARMV6K:
-  case ARM::ArchKind::ARMV6T2:
-  case ARM::ArchKind::ARMV6KZ:
-  case ARM::ArchKind::ARMV6M:
-    return 6;
-  case ARM::ArchKind::ARMV7A:
-  case ARM::ArchKind::ARMV7VE:
-  case ARM::ArchKind::ARMV7R:
-  case ARM::ArchKind::ARMV7M:
-  case ARM::ArchKind::ARMV7S:
-  case ARM::ArchKind::ARMV7EM:
-  case ARM::ArchKind::ARMV7K:
-    return 7;
-  case ARM::ArchKind::ARMV8A:
-  case ARM::ArchKind::ARMV8_1A:
-  case ARM::ArchKind::ARMV8_2A:
-  case ARM::ArchKind::ARMV8_3A:
-  case ARM::ArchKind::ARMV8_4A:
-  case ARM::ArchKind::ARMV8_5A:
-  case ARM::ArchKind::ARMV8R:
-  case ARM::ArchKind::ARMV8MBaseline:
-  case ARM::ArchKind::ARMV8MMainline:
-    return 8;
-  case ARM::ArchKind::INVALID:
-    return 0;
-  }
-  llvm_unreachable("Unhandled architecture");
-}
-
-StringRef llvm::ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) {
-  StringRef ArchName =
-      CPU.empty() ? TT.getArchName() : ARM::getArchName(ARM::parseCPUArch(CPU));
-
-  if (TT.isOSBinFormatMachO()) {
-    if (TT.getEnvironment() == Triple::EABI ||
-        TT.getOS() == Triple::UnknownOS ||
-        llvm::ARM::parseArchProfile(ArchName) == ARM::ProfileKind::M)
-      return "aapcs";
-    if (TT.isWatchABI())
-      return "aapcs16";
-    return "apcs-gnu";
-  } else if (TT.isOSWindows())
-    // FIXME: this is invalid for WindowsCE.
-    return "aapcs";
-
-  // Select the default based on the platform.
-  switch (TT.getEnvironment()) {
-  case Triple::Android:
-  case Triple::GNUEABI:
-  case Triple::GNUEABIHF:
-  case Triple::MuslEABI:
-  case Triple::MuslEABIHF:
-    return "aapcs-linux";
-  case Triple::EABIHF:
-  case Triple::EABI:
-    return "aapcs";
-  default:
-    if (TT.isOSNetBSD())
-      return "apcs-gnu";
-    if (TT.isOSOpenBSD())
-      return "aapcs-linux";
-    return "aapcs";
-  }
-}
-
-StringRef llvm::AArch64::getCanonicalArchName(StringRef Arch) {
-  return ARM::getCanonicalArchName(Arch);
-}
-
-unsigned llvm::AArch64::parseFPU(StringRef FPU) {
-  return ARM::parseFPU(FPU);
-}
-
-// Allows partial match, ex. "v8a" matches "armv8a".
-AArch64::ArchKind AArch64::parseArch(StringRef Arch) {
-  Arch = getCanonicalArchName(Arch);
-  if (checkArchVersion(Arch) < 8)
-    return ArchKind::INVALID;
-
-  StringRef Syn = getArchSynonym(Arch);
-  for (const auto A : AArch64ARCHNames) {
-    if (A.getName().endswith(Syn))
-      return A.ID;
-  }
-  return ArchKind::INVALID;
-}
-
-AArch64::ArchExtKind llvm::AArch64::parseArchExt(StringRef ArchExt) {
-  for (const auto A : AArch64ARCHExtNames) {
-    if (ArchExt == A.getName())
-      return static_cast<ArchExtKind>(A.ID);
-  }
-  return AArch64::AEK_INVALID;
-}
-
-AArch64::ArchKind llvm::AArch64::parseCPUArch(StringRef CPU) {
-  for (const auto C : AArch64CPUNames) {
-    if (CPU == C.getName())
-      return C.ArchID;
-  }
-  return ArchKind::INVALID;
-}
-
-// ARM, Thumb, AArch64
-ARM::ISAKind AArch64::parseArchISA(StringRef Arch) {
-  return ARM::parseArchISA(Arch);
-}
-
-// Little/Big endian
-ARM::EndianKind AArch64::parseArchEndian(StringRef Arch) {
-  return ARM::parseArchEndian(Arch);
-}
-
-// Profile A/R/M
-ARM::ProfileKind AArch64::parseArchProfile(StringRef Arch) {
-  return ARM::parseArchProfile(Arch);
-}
-
-// Version number (ex. v8 = 8).
-unsigned llvm::AArch64::parseArchVersion(StringRef Arch) {
-  return ARM::parseArchVersion(Arch);
-}
-
-bool llvm::AArch64::isX18ReservedByDefault(const Triple &TT) {
-  return TT.isAndroid() || TT.isOSDarwin() || TT.isOSFuchsia() ||
-         TT.isOSWindows();
-}
-
-namespace {
-
 struct GPUInfo {
   StringLiteral Name;
   StringLiteral CanonicalName;
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index c53c878e9df1217b6758ff323facc3fc56eba816..4471fd05181eaf58bd4390d5c110169cd68acada 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -210,6 +210,7 @@ StringRef Triple::getOSTypeName(OSType Kind) {
   case Contiki: return "contiki";
   case AMDPAL: return "amdpal";
   case HermitCore: return "hermit";
+  case Hurd: return "hurd";
   }
 
   llvm_unreachable("Invalid OSType");
@@ -508,6 +509,7 @@ static Triple::OSType parseOS(StringRef OSName) {
     .StartsWith("contiki", Triple::Contiki)
     .StartsWith("amdpal", Triple::AMDPAL)
     .StartsWith("hermit", Triple::HermitCore)
+    .StartsWith("hurd", Triple::Hurd)
     .Default(Triple::UnknownOS);
 }
 
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 02b7c2579c9c5bf13fd55ac2933ca7f75471b09b..d7cc0d627d09e894f6fb380bf18946576c526bd9 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -38,6 +38,8 @@
 #ifdef __APPLE__
 #include <mach-o/dyld.h>
 #include <sys/attr.h>
+#elif defined(__DragonFly__)
+#include <sys/mount.h>
 #endif
 
 // Both stdio.h and cstdio are included via different paths and
@@ -54,7 +56,7 @@
 
 #include <sys/types.h>
 #if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__FreeBSD__) &&   \
-    !defined(__linux__)
+    !defined(__linux__) && !defined(__FreeBSD_kernel__)
 #include <sys/statvfs.h>
 #define STATVFS statvfs
 #define FSTATVFS fstatvfs
@@ -83,7 +85,7 @@
 #define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
 #endif
 
-#if defined(__NetBSD__) || defined(__GNU__)
+#if defined(__NetBSD__) || defined(__DragonFly__) || defined(__GNU__)
 #define STATVFS_F_FLAG(vfs) (vfs).f_flag
 #else
 #define STATVFS_F_FLAG(vfs) (vfs).f_flags
@@ -227,11 +229,11 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
 }
 
 TimePoint<> basic_file_status::getLastAccessedTime() const {
-  return toTimePoint(fs_st_atime);
+  return toTimePoint(fs_st_atime, fs_st_atime_nsec);
 }
 
 TimePoint<> basic_file_status::getLastModificationTime() const {
-  return toTimePoint(fs_st_mtime);
+  return toTimePoint(fs_st_mtime, fs_st_mtime_nsec);
 }
 
 UniqueID file_status::getUniqueID() const {
@@ -548,6 +550,18 @@ static void expandTildeExpr(SmallVectorImpl<char> &Path) {
   llvm::sys::path::append(Path, Storage);
 }
 
+
+void expand_tilde(const Twine &path, SmallVectorImpl<char> &dest) {
+  dest.clear();
+  if (path.isTriviallyEmpty())
+    return;
+
+  path.toVector(dest);
+  expandTildeExpr(dest);
+
+  return;
+}
+
 static file_type typeForMode(mode_t Mode) {
   if (S_ISDIR(Mode))
     return file_type::directory_file;
@@ -577,11 +591,22 @@ static std::error_code fillStatus(int StatRet, const struct stat &Status,
     return EC;
   }
 
+  uint32_t atime_nsec, mtime_nsec;
+#if defined(HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC)
+  atime_nsec = Status.st_atimespec.tv_nsec;
+  mtime_nsec = Status.st_mtimespec.tv_nsec;
+#elif defined(HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC)
+  atime_nsec = Status.st_atim.tv_nsec;
+  mtime_nsec = Status.st_mtim.tv_nsec;
+#else
+  atime_nsec = mtime_nsec = 0;
+#endif
+
   perms Perms = static_cast<perms>(Status.st_mode) & all_perms;
   Result = file_status(typeForMode(Status.st_mode), Perms, Status.st_dev,
-                       Status.st_nlink, Status.st_ino, Status.st_atime,
-                       Status.st_mtime, Status.st_uid, Status.st_gid,
-                       Status.st_size);
+                       Status.st_nlink, Status.st_ino,
+                       Status.st_atime, atime_nsec, Status.st_mtime, mtime_nsec,
+                       Status.st_uid, Status.st_gid, Status.st_size);
 
   return std::error_code();
 }
diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp
index e8b0435b9cdf12fe5d43603ecea5adf23b8b3a24..24581944fa81ef82a22f25b14f42658fc8150b44 100644
--- a/lib/Support/VirtualFileSystem.cpp
+++ b/lib/Support/VirtualFileSystem.cpp
@@ -1164,14 +1164,14 @@ private:
   /// Looks up the path <tt>[Start, End)</tt> in \p From, possibly
   /// recursing into the contents of \p From if it is a directory.
   ErrorOr<Entry *> lookupPath(sys::path::const_iterator Start,
-                              sys::path::const_iterator End, Entry *From);
+                              sys::path::const_iterator End, Entry *From) const;
 
   /// Get the status of a given an \c Entry.
   ErrorOr<Status> status(const Twine &Path, Entry *E);
 
 public:
   /// Looks up \p Path in \c Roots.
-  ErrorOr<Entry *> lookupPath(const Twine &Path);
+  ErrorOr<Entry *> lookupPath(const Twine &Path) const;
 
   /// Parses \p Buffer, which is expected to be in YAML format and
   /// returns a virtual file system representing its contents.
@@ -1183,6 +1183,9 @@ public:
   ErrorOr<Status> status(const Twine &Path) override;
   ErrorOr<std::unique_ptr<File>> openFileForRead(const Twine &Path) override;
 
+  std::error_code getRealPath(const Twine &Path,
+                              SmallVectorImpl<char> &Output) const override;
+
   llvm::ErrorOr<std::string> getCurrentWorkingDirectory() const override {
     return ExternalFS->getCurrentWorkingDirectory();
   }
@@ -1726,7 +1729,7 @@ RedirectingFileSystem::create(std::unique_ptr<MemoryBuffer> Buffer,
   return FS.release();
 }
 
-ErrorOr<Entry *> RedirectingFileSystem::lookupPath(const Twine &Path_) {
+ErrorOr<Entry *> RedirectingFileSystem::lookupPath(const Twine &Path_) const {
   SmallString<256> Path;
   Path_.toVector(Path);
 
@@ -1757,7 +1760,8 @@ ErrorOr<Entry *> RedirectingFileSystem::lookupPath(const Twine &Path_) {
 
 ErrorOr<Entry *>
 RedirectingFileSystem::lookupPath(sys::path::const_iterator Start,
-                                  sys::path::const_iterator End, Entry *From) {
+                                  sys::path::const_iterator End,
+                                  Entry *From) const {
 #ifndef _WIN32
   assert(!isTraversalComponent(*Start) &&
          !isTraversalComponent(From->getName()) &&
@@ -1890,6 +1894,27 @@ RedirectingFileSystem::openFileForRead(const Twine &Path) {
       llvm::make_unique<FileWithFixedStatus>(std::move(*Result), S));
 }
 
+std::error_code
+RedirectingFileSystem::getRealPath(const Twine &Path,
+                                   SmallVectorImpl<char> &Output) const {
+  ErrorOr<Entry *> Result = lookupPath(Path);
+  if (!Result) {
+    if (IsFallthrough &&
+        Result.getError() == llvm::errc::no_such_file_or_directory) {
+      return ExternalFS->getRealPath(Path, Output);
+    }
+    return Result.getError();
+  }
+
+  if (auto *F = dyn_cast<RedirectingFileEntry>(*Result)) {
+    return ExternalFS->getRealPath(F->getExternalContentsPath(), Output);
+  }
+  // Even if there is a directory entry, fall back to ExternalFS if allowed,
+  // because directories don't have a single external contents path.
+  return IsFallthrough ? ExternalFS->getRealPath(Path, Output)
+                       : llvm::errc::invalid_argument;
+}
+
 IntrusiveRefCntPtr<FileSystem>
 vfs::getVFSFromYAML(std::unique_ptr<MemoryBuffer> Buffer,
                     SourceMgr::DiagHandlerTy DiagHandler,
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index 45d73ae3dfed29a98ec00444c05492e1354a5eef..d34aa763124c9aaced0f36459df1564979b44ba7 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -416,7 +416,7 @@ static std::error_code rename_internal(HANDLE FromHandle, const Twine &To,
       *reinterpret_cast<FILE_RENAME_INFO *>(RenameInfoBuf.data());
   RenameInfo.ReplaceIfExists = ReplaceIfExists;
   RenameInfo.RootDirectory = 0;
-  RenameInfo.FileNameLength = ToWide.size();
+  RenameInfo.FileNameLength = ToWide.size() * sizeof(wchar_t);
   std::copy(ToWide.begin(), ToWide.end(), &RenameInfo.FileName[0]);
 
   SetLastError(ERROR_SUCCESS);
@@ -1253,6 +1253,17 @@ static void expandTildeExpr(SmallVectorImpl<char> &Path) {
   Path.insert(Path.begin() + 1, HomeDir.begin() + 1, HomeDir.end());
 }
 
+void expand_tilde(const Twine &path, SmallVectorImpl<char> &dest) {
+  dest.clear();
+  if (path.isTriviallyEmpty())
+    return;
+
+  path.toVector(dest);
+  expandTildeExpr(dest);
+
+  return;
+}
+
 std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
                           bool expand_tilde) {
   dest.clear();
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index f8492c96bab69e681d4fb163c60dd07de09bda5d..b9bbee7883c61b5c9f29c72f0e55aefd6e1c71f6 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -341,11 +341,25 @@ void Input::scalarString(StringRef &S, QuotingType) {
 
 void Input::blockScalarString(StringRef &S) { scalarString(S, QuotingType::None); }
 
+void Input::scalarTag(std::string &Tag) {
+  Tag = CurrentNode->_node->getVerbatimTag();
+}
+
 void Input::setError(HNode *hnode, const Twine &message) {
   assert(hnode && "HNode must not be NULL");
   setError(hnode->_node, message);
 }
 
+NodeKind Input::getNodeKind() {
+  if (isa<ScalarHNode>(CurrentNode))
+    return NodeKind::Scalar;
+  else if (isa<MapHNode>(CurrentNode))
+    return NodeKind::Map;
+  else if (isa<SequenceHNode>(CurrentNode))
+    return NodeKind::Sequence;
+  llvm_unreachable("Unsupported node kind");
+}
+
 void Input::setError(Node *node, const Twine &message) {
   Strm->printError(node, message);
   EC = make_error_code(errc::invalid_argument);
@@ -436,9 +450,11 @@ bool Output::mapTag(StringRef Tag, bool Use) {
     // If this tag is being written inside a sequence we should write the start
     // of the sequence before writing the tag, otherwise the tag won't be
     // attached to the element in the sequence, but rather the sequence itself.
-    bool SequenceElement =
-        StateStack.size() > 1 && (StateStack[StateStack.size() - 2] == inSeq ||
-          StateStack[StateStack.size() - 2] == inFlowSeq);
+    bool SequenceElement = false;
+    if (StateStack.size() > 1) {
+      auto &E = StateStack[StateStack.size() - 2];
+      SequenceElement = inSeqAnyElement(E) || inFlowSeqAnyElement(E);
+    }
     if (SequenceElement && StateStack.back() == inMapFirstKey) {
       newLineCheck();
     } else {
@@ -461,6 +477,9 @@ bool Output::mapTag(StringRef Tag, bool Use) {
 }
 
 void Output::endMapping() {
+  // If we did not map anything, we should explicitly emit an empty map
+  if (StateStack.back() == inMapFirstKey)
+    output("{}");
   StateStack.pop_back();
 }
 
@@ -524,12 +543,15 @@ void Output::endDocuments() {
 }
 
 unsigned Output::beginSequence() {
-  StateStack.push_back(inSeq);
+  StateStack.push_back(inSeqFirstElement);
   NeedsNewLine = true;
   return 0;
 }
 
 void Output::endSequence() {
+  // If we did not emit anything, we should explicitly emit an empty sequence
+  if (StateStack.back() == inSeqFirstElement)
+    output("[]");
   StateStack.pop_back();
 }
 
@@ -538,10 +560,17 @@ bool Output::preflightElement(unsigned, void *&) {
 }
 
 void Output::postflightElement(void *) {
+  if (StateStack.back() == inSeqFirstElement) {
+    StateStack.pop_back();
+    StateStack.push_back(inSeqOtherElement);
+  } else if (StateStack.back() == inFlowSeqFirstElement) {
+    StateStack.pop_back();
+    StateStack.push_back(inFlowSeqOtherElement);
+  }
 }
 
 unsigned Output::beginFlowSequence() {
-  StateStack.push_back(inFlowSeq);
+  StateStack.push_back(inFlowSeqFirstElement);
   newLineCheck();
   ColumnAtFlowStart = Column;
   output("[ ");
@@ -680,6 +709,14 @@ void Output::blockScalarString(StringRef &S) {
   }
 }
 
+void Output::scalarTag(std::string &Tag) {
+  if (Tag.empty())
+    return;
+  newLineCheck();
+  output(Tag);
+  output(" ");
+}
+
 void Output::setError(const Twine &message) {
 }
 
@@ -693,7 +730,7 @@ bool Output::canElideEmptySequence() {
     return true;
   if (StateStack.back() != inMapFirstKey)
     return true;
-  return (StateStack[StateStack.size()-2] != inSeq);
+  return !inSeqAnyElement(StateStack[StateStack.size() - 2]);
 }
 
 void Output::output(StringRef s) {
@@ -703,9 +740,8 @@ void Output::output(StringRef s) {
 
 void Output::outputUpToEndOfLine(StringRef s) {
   output(s);
-  if (StateStack.empty() || (StateStack.back() != inFlowSeq &&
-                             StateStack.back() != inFlowMapFirstKey &&
-                             StateStack.back() != inFlowMapOtherKey))
+  if (StateStack.empty() || (!inFlowSeqAnyElement(StateStack.back()) &&
+                             !inFlowMapAnyKey(StateStack.back())))
     NeedsNewLine = true;
 }
 
@@ -725,16 +761,20 @@ void Output::newLineCheck() {
 
   outputNewLine();
 
-  assert(StateStack.size() > 0);
+  if (StateStack.size() == 0)
+    return;
+
   unsigned Indent = StateStack.size() - 1;
   bool OutputDash = false;
 
-  if (StateStack.back() == inSeq) {
+  if (StateStack.back() == inSeqFirstElement ||
+      StateStack.back() == inSeqOtherElement) {
     OutputDash = true;
-  } else if ((StateStack.size() > 1) && ((StateStack.back() == inMapFirstKey) ||
-             (StateStack.back() == inFlowSeq) ||
-             (StateStack.back() == inFlowMapFirstKey)) &&
-             (StateStack[StateStack.size() - 2] == inSeq)) {
+  } else if ((StateStack.size() > 1) &&
+             ((StateStack.back() == inMapFirstKey) ||
+              inFlowSeqAnyElement(StateStack.back()) ||
+              (StateStack.back() == inFlowMapFirstKey)) &&
+             inSeqAnyElement(StateStack[StateStack.size() - 2])) {
     --Indent;
     OutputDash = true;
   }
@@ -772,6 +812,24 @@ void Output::flowKey(StringRef Key) {
   output(": ");
 }
 
+NodeKind Output::getNodeKind() { report_fatal_error("invalid call"); }
+
+bool Output::inSeqAnyElement(InState State) {
+  return State == inSeqFirstElement || State == inSeqOtherElement;
+}
+
+bool Output::inFlowSeqAnyElement(InState State) {
+  return State == inFlowSeqFirstElement || State == inFlowSeqOtherElement;
+}
+
+bool Output::inMapAnyKey(InState State) {
+  return State == inMapFirstKey || State == inMapOtherKey;
+}
+
+bool Output::inFlowMapAnyKey(InState State) {
+  return State == inFlowMapFirstKey || State == inFlowMapOtherKey;
+}
+
 //===----------------------------------------------------------------------===//
 //  traits for built-in types
 //===----------------------------------------------------------------------===//
diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
index 3a070162608971d2521790833386e2be5e4328d1..df4088ad2f867ec9787a9a20a39e17812a2f4464 100644
--- a/lib/TableGen/Main.cpp
+++ b/lib/TableGen/Main.cpp
@@ -46,6 +46,10 @@ static cl::list<std::string>
 IncludeDirs("I", cl::desc("Directory of include files"),
             cl::value_desc("directory"), cl::Prefix);
 
+static cl::list<std::string>
+MacroNames("D", cl::desc("Name of the macro to be defined"),
+            cl::value_desc("macro name"), cl::Prefix);
+
 static int reportError(const char *ProgName, Twine Msg) {
   errs() << ProgName << ": " << Msg;
   errs().flush();
@@ -91,7 +95,7 @@ int llvm::TableGenMain(char *argv0, TableGenMainFn *MainFn) {
   // it later.
   SrcMgr.setIncludeDirs(IncludeDirs);
 
-  TGParser Parser(SrcMgr, Records);
+  TGParser Parser(SrcMgr, MacroNames, Records);
 
   if (Parser.ParseFile())
     return 1;
diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp
index 652be6e8dbbf3bb392e0b4c8ca5ceb4b3c7cf4bf..16aeee561075c54a2f103d8fc185305bdc1a55e6 100644
--- a/lib/TableGen/TGLexer.cpp
+++ b/lib/TableGen/TGLexer.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/TableGen/Error.h"
+#include <algorithm>
 #include <cctype>
 #include <cerrno>
 #include <cstdint>
@@ -28,11 +29,35 @@
 
 using namespace llvm;
 
-TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
+namespace {
+// A list of supported preprocessing directives with their
+// internal token kinds and names.
+struct {
+  tgtok::TokKind Kind;
+  const char *Word;
+} PreprocessorDirs[] = {
+  { tgtok::Ifdef, "ifdef" },
+  { tgtok::Else, "else" },
+  { tgtok::Endif, "endif" },
+  { tgtok::Define, "define" }
+};
+} // end anonymous namespace
+
+TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
   CurBuffer = SrcMgr.getMainFileID();
   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
   CurPtr = CurBuf.begin();
   TokStart = nullptr;
+
+  // Pretend that we enter the "top-level" include file.
+  PrepIncludeStack.push_back(
+      make_unique<std::vector<PreprocessorControlDesc>>());
+
+  // Put all macros defined in the command line into the DefinedMacros set.
+  std::for_each(Macros.begin(), Macros.end(),
+                [this](const std::string &MacroName) {
+                  DefinedMacros.insert(MacroName);
+                });
 }
 
 SMLoc TGLexer::getLoc() const {
@@ -41,11 +66,42 @@ SMLoc TGLexer::getLoc() const {
 
 /// ReturnError - Set the error to the specified string at the specified
 /// location.  This is defined to always return tgtok::Error.
-tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
+tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
   PrintError(Loc, Msg);
   return tgtok::Error;
 }
 
+tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
+  return ReturnError(SMLoc::getFromPointer(Loc), Msg);
+}
+
+bool TGLexer::processEOF() {
+  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
+  if (ParentIncludeLoc != SMLoc()) {
+    // If prepExitInclude() detects a problem with the preprocessing
+    // control stack, it will return false.  Pretend that we reached
+    // the final EOF and stop lexing more tokens by returning false
+    // to LexToken().
+    if (!prepExitInclude(false))
+      return false;
+
+    CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
+    CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
+    CurPtr = ParentIncludeLoc.getPointer();
+    // Make sure TokStart points into the parent file's buffer.
+    // LexToken() assigns to it before calling getNextChar(),
+    // so it is pointing into the included file now.
+    TokStart = CurPtr;
+    return true;
+  }
+
+  // Pretend that we exit the "top-level" include file.
+  // Note that in case of an error (e.g. control stack imbalance)
+  // the routine will issue a fatal error.
+  prepExitInclude(true);
+  return false;
+}
+
 int TGLexer::getNextChar() {
   char CurChar = *CurPtr++;
   switch (CurChar) {
@@ -57,16 +113,6 @@ int TGLexer::getNextChar() {
     if (CurPtr-1 != CurBuf.end())
       return 0;  // Just whitespace.
 
-    // If this is the end of an included file, pop the parent file off the
-    // include stack.
-    SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
-    if (ParentIncludeLoc != SMLoc()) {
-      CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
-      CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
-      CurPtr = ParentIncludeLoc.getPointer();
-      return getNextChar();
-    }
-
     // Otherwise, return end of file.
     --CurPtr;  // Another call to lex will return EOF again.
     return EOF;
@@ -83,11 +129,11 @@ int TGLexer::getNextChar() {
   }
 }
 
-int TGLexer::peekNextChar(int Index) {
+int TGLexer::peekNextChar(int Index) const {
   return *(CurPtr + Index);
 }
 
-tgtok::TokKind TGLexer::LexToken() {
+tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
   TokStart = CurPtr;
   // This always consumes at least one character.
   int CurChar = getNextChar();
@@ -100,7 +146,18 @@ tgtok::TokKind TGLexer::LexToken() {
 
     // Unknown character, emit an error.
     return ReturnError(TokStart, "Unexpected character");
-  case EOF: return tgtok::Eof;
+  case EOF:
+    // Lex next token, if we just left an include file.
+    // Note that leaving an include file means that the next
+    // symbol is located at the end of 'include "..."'
+    // construct, so LexToken() is called with default
+    // false parameter.
+    if (processEOF())
+      return LexToken();
+
+    // Return EOF denoting the end of lexing.
+    return tgtok::Eof;
+
   case ':': return tgtok::colon;
   case ';': return tgtok::semi;
   case '.': return tgtok::period;
@@ -114,15 +171,27 @@ tgtok::TokKind TGLexer::LexToken() {
   case ')': return tgtok::r_paren;
   case '=': return tgtok::equal;
   case '?': return tgtok::question;
-  case '#': return tgtok::paste;
+  case '#':
+    if (FileOrLineStart) {
+      tgtok::TokKind Kind = prepIsDirective();
+      if (Kind != tgtok::Error)
+        return lexPreprocessor(Kind);
+    }
+
+    return tgtok::paste;
+
+  case '\r':
+    PrintFatalError("getNextChar() must never return '\r'");
+    return tgtok::Error;
 
   case 0:
   case ' ':
   case '\t':
-  case '\n':
-  case '\r':
     // Ignore whitespace.
-    return LexToken();
+    return LexToken(FileOrLineStart);
+  case '\n':
+    // Ignore whitespace, and identify the new line.
+    return LexToken(true);
   case '/':
     // If this is the start of a // comment, skip until the end of the line or
     // the end of the buffer.
@@ -133,7 +202,7 @@ tgtok::TokKind TGLexer::LexToken() {
         return tgtok::Error;
     } else // Otherwise, this is an error.
       return ReturnError(TokStart, "Unexpected character");
-    return LexToken();
+    return LexToken(FileOrLineStart);
   case '-': case '+':
   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
   case '7': case '8': case '9': {
@@ -249,10 +318,10 @@ tgtok::TokKind TGLexer::LexVarName() {
 }
 
 tgtok::TokKind TGLexer::LexIdentifier() {
-  // The first letter is [a-zA-Z_#].
+  // The first letter is [a-zA-Z_].
   const char *IdentStart = TokStart;
 
-  // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
+  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
     ++CurPtr;
 
@@ -322,6 +391,9 @@ bool TGLexer::LexInclude() {
   // Save the line number and lex buffer of the includer.
   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
   CurPtr = CurBuf.begin();
+
+  PrepIncludeStack.push_back(
+      make_unique<std::vector<PreprocessorControlDesc>>());
   return false;
 }
 
@@ -496,3 +568,444 @@ tgtok::TokKind TGLexer::LexExclaim() {
 
   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
 }
+
+bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
+  // Report an error, if preprocessor control stack for the current
+  // file is not empty.
+  if (!PrepIncludeStack.back()->empty()) {
+    prepReportPreprocessorStackError();
+
+    return false;
+  }
+
+  // Pop the preprocessing controls from the include stack.
+  if (PrepIncludeStack.empty()) {
+    PrintFatalError("Preprocessor include stack is empty");
+  }
+
+  PrepIncludeStack.pop_back();
+
+  if (IncludeStackMustBeEmpty) {
+    if (!PrepIncludeStack.empty())
+      PrintFatalError("Preprocessor include stack is not empty");
+  } else {
+    if (PrepIncludeStack.empty())
+      PrintFatalError("Preprocessor include stack is empty");
+  }
+
+  return true;
+}
+
+tgtok::TokKind TGLexer::prepIsDirective() const {
+  for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID) {
+    int NextChar = *CurPtr;
+    bool Match = true;
+    unsigned I = 0;
+    for (; I < strlen(PreprocessorDirs[ID].Word); ++I) {
+      if (NextChar != PreprocessorDirs[ID].Word[I]) {
+        Match = false;
+        break;
+      }
+
+      NextChar = peekNextChar(I + 1);
+    }
+
+    // Check for whitespace after the directive.  If there is no whitespace,
+    // then we do not recognize it as a preprocessing directive.
+    if (Match) {
+      tgtok::TokKind Kind = PreprocessorDirs[ID].Kind;
+
+      // New line and EOF may follow only #else/#endif.  It will be reported
+      // as an error for #ifdef/#define after the call to prepLexMacroName().
+      if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
+          NextChar == '\n' ||
+          // It looks like TableGen does not support '\r' as the actual
+          // carriage return, e.g. getNextChar() treats a single '\r'
+          // as '\n'.  So we do the same here.
+          NextChar == '\r')
+        return Kind;
+
+      // Allow comments after some directives, e.g.:
+      //     #else// OR #else/**/
+      //     #endif// OR #endif/**/
+      //
+      // Note that we do allow comments after #ifdef/#define here, e.g.
+      //     #ifdef/**/ AND #ifdef//
+      //     #define/**/ AND #define//
+      //
+      // These cases will be reported as incorrect after calling
+      // prepLexMacroName().  We could have supported C-style comments
+      // after #ifdef/#define, but this would complicate the code
+      // for little benefit.
+      if (NextChar == '/') {
+        NextChar = peekNextChar(I + 1);
+
+        if (NextChar == '*' || NextChar == '/')
+          return Kind;
+
+        // Pretend that we do not recognize the directive.
+      }
+    }
+  }
+
+  return tgtok::Error;
+}
+
+bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
+  TokStart = CurPtr;
+
+  for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID)
+    if (PreprocessorDirs[ID].Kind == Kind) {
+      // Advance CurPtr to the end of the preprocessing word.
+      CurPtr += strlen(PreprocessorDirs[ID].Word);
+      return true;
+    }
+
+  PrintFatalError("Unsupported preprocessing token in "
+                  "prepEatPreprocessorDirective()");
+  return false;
+}
+
+tgtok::TokKind TGLexer::lexPreprocessor(
+    tgtok::TokKind Kind, bool ReturnNextLiveToken) {
+
+  // We must be looking at a preprocessing directive.  Eat it!
+  if (!prepEatPreprocessorDirective(Kind))
+    PrintFatalError("lexPreprocessor() called for unknown "
+                    "preprocessor directive");
+
+  if (Kind == tgtok::Ifdef) {
+    StringRef MacroName = prepLexMacroName();
+    if (MacroName.empty())
+      return ReturnError(TokStart, "Expected macro name after #ifdef");
+
+    bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;
+
+    // Regardless of whether we are processing tokens or not,
+    // we put the #ifdef control on stack.
+    PrepIncludeStack.back()->push_back(
+        {Kind, MacroIsDefined, SMLoc::getFromPointer(TokStart)});
+
+    if (!prepSkipDirectiveEnd())
+      return ReturnError(CurPtr,
+                         "Only comments are supported after #ifdef NAME");
+
+    // If we were not processing tokens before this #ifdef,
+    // then just return back to the lines skipping code.
+    if (!ReturnNextLiveToken)
+      return Kind;
+
+    // If we were processing tokens before this #ifdef,
+    // and the macro is defined, then just return the next token.
+    if (MacroIsDefined)
+      return LexToken();
+
+    // We were processing tokens before this #ifdef, and the macro
+    // is not defined, so we have to start skipping the lines.
+    // If the skipping is successful, it will return the token following
+    // either #else or #endif corresponding to this #ifdef.
+    if (prepSkipRegion(ReturnNextLiveToken))
+      return LexToken();
+
+    return tgtok::Error;
+  } else if (Kind == tgtok::Else) {
+    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
+    // which will move CurPtr away from the beginning of #else.
+    if (PrepIncludeStack.back()->empty())
+      return ReturnError(TokStart, "#else without #ifdef");
+
+    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back();
+
+    if (IfdefEntry.Kind != tgtok::Ifdef) {
+      PrintError(TokStart, "double #else");
+      return ReturnError(IfdefEntry.SrcPos, "Previous #else is here");
+    }
+
+    // Replace the corresponding #ifdef's control with its negation
+    // on the control stack.
+    PrepIncludeStack.back()->pop_back();
+    PrepIncludeStack.back()->push_back(
+        {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)});
+
+    if (!prepSkipDirectiveEnd())
+      return ReturnError(CurPtr, "Only comments are supported after #else");
+
+    // If we were processing tokens before this #else,
+    // we have to start skipping lines until the matching #endif.
+    if (ReturnNextLiveToken) {
+      if (prepSkipRegion(ReturnNextLiveToken))
+        return LexToken();
+
+      return tgtok::Error;
+    }
+
+    // Return to the lines skipping code.
+    return Kind;
+  } else if (Kind == tgtok::Endif) {
+    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
+    // which will move CurPtr away from the beginning of #endif.
+    if (PrepIncludeStack.back()->empty())
+      return ReturnError(TokStart, "#endif without #ifdef");
+
+    auto &IfdefOrElseEntry = PrepIncludeStack.back()->back();
+
+    if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
+        IfdefOrElseEntry.Kind != tgtok::Else) {
+      PrintFatalError("Invalid preprocessor control on the stack");
+      return tgtok::Error;
+    }
+
+    if (!prepSkipDirectiveEnd())
+      return ReturnError(CurPtr, "Only comments are supported after #endif");
+
+    PrepIncludeStack.back()->pop_back();
+
+    // If we were processing tokens before this #endif, then
+    // we should continue it.
+    if (ReturnNextLiveToken) {
+      return LexToken();
+    }
+
+    // Return to the lines skipping code.
+    return Kind;
+  } else if (Kind == tgtok::Define) {
+    StringRef MacroName = prepLexMacroName();
+    if (MacroName.empty())
+      return ReturnError(TokStart, "Expected macro name after #define");
+
+    if (!DefinedMacros.insert(MacroName).second)
+      PrintWarning(getLoc(),
+                   "Duplicate definition of macro: " + Twine(MacroName));
+
+    if (!prepSkipDirectiveEnd())
+      return ReturnError(CurPtr,
+                         "Only comments are supported after #define NAME");
+
+    if (!ReturnNextLiveToken) {
+      PrintFatalError("#define must be ignored during the lines skipping");
+      return tgtok::Error;
+    }
+
+    return LexToken();
+  }
+
+  PrintFatalError("Preprocessing directive is not supported");
+  return tgtok::Error;
+}
+
+bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
+  if (!MustNeverBeFalse)
+    PrintFatalError("Invalid recursion.");
+
+  do {
+    // Skip all symbols to the line end.
+    prepSkipToLineEnd();
+
+    // Find the first non-whitespace symbol in the next line(s).
+    if (!prepSkipLineBegin())
+      return false;
+
+    // If the first non-blank/comment symbol on the line is '#',
+    // it may be a start of preprocessing directive.
+    //
+    // If it is not '#' just go to the next line.
+    if (*CurPtr == '#')
+      ++CurPtr;
+    else
+      continue;
+
+    tgtok::TokKind Kind = prepIsDirective();
+
+    // If we did not find a preprocessing directive or it is #define,
+    // then just skip to the next line.  We do not have to do anything
+    // for #define in the line-skipping mode.
+    if (Kind == tgtok::Error || Kind == tgtok::Define)
+      continue;
+
+    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);
+
+    // If lexPreprocessor() encountered an error during lexing this
+    // preprocessor idiom, then return false to the calling lexPreprocessor().
+    // This will force tgtok::Error to be returned to the tokens processing.
+    if (ProcessedKind == tgtok::Error)
+      return false;
+
+    if (Kind != ProcessedKind)
+      PrintFatalError("prepIsDirective() and lexPreprocessor() "
+                      "returned different token kinds");
+
+    // If this preprocessing directive enables tokens processing,
+    // then return to the lexPreprocessor() and get to the next token.
+    // We can move from line-skipping mode to processing tokens only
+    // due to #else or #endif.
+    if (prepIsProcessingEnabled()) {
+      if (Kind != tgtok::Else && Kind != tgtok::Endif) {
+        PrintFatalError("Tokens processing was enabled by an unexpected "
+                        "preprocessing directive");
+        return false;
+      }
+
+      return true;
+    }
+  } while (CurPtr != CurBuf.end());
+
+  // We have reached the end of the file, but never left the lines-skipping
+  // mode.  This means there is no matching #endif.
+  prepReportPreprocessorStackError();
+  return false;
+}
+
+StringRef TGLexer::prepLexMacroName() {
+  // Skip whitespaces between the preprocessing directive and the macro name.
+  while (*CurPtr == ' ' || *CurPtr == '\t')
+    ++CurPtr;
+
+  TokStart = CurPtr;
+  // Macro names start with [a-zA-Z_].
+  if (*CurPtr != '_' && !isalpha(*CurPtr))
+    return "";
+
+  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
+  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
+    ++CurPtr;
+
+  return StringRef(TokStart, CurPtr - TokStart);
+}
+
+bool TGLexer::prepSkipLineBegin() {
+  while (CurPtr != CurBuf.end()) {
+    switch (*CurPtr) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\r':
+      break;
+
+    case '/': {
+      int NextChar = peekNextChar(1);
+      if (NextChar == '*') {
+        // Skip C-style comment.
+        // Note that we do not care about skipping the C++-style comments.
+        // If the line contains "//", it may not contain any processable
+        // preprocessing directive.  Just return CurPtr pointing to
+        // the first '/' in this case.  We also do not care about
+        // incorrect symbols after the first '/' - we are in lines-skipping
+        // mode, so incorrect code is allowed to some extent.
+
+        // Set TokStart to the beginning of the comment to enable proper
+        // diagnostic printing in case of error in SkipCComment().
+        TokStart = CurPtr;
+
+        // CurPtr must point to '*' before call to SkipCComment().
+        ++CurPtr;
+        if (SkipCComment())
+          return false;
+      } else {
+        // CurPtr points to the non-whitespace '/'.
+        return true;
+      }
+
+      // We must not increment CurPtr after the comment was lexed.
+      continue;
+    }
+
+    default:
+      return true;
+    }
+
+    ++CurPtr;
+  }
+
+  // We have reached the end of the file.  Return to the lines skipping
+  // code, and allow it to handle the EOF as needed.
+  return true;
+}
+
+bool TGLexer::prepSkipDirectiveEnd() {
+  while (CurPtr != CurBuf.end()) {
+    switch (*CurPtr) {
+    case ' ':
+    case '\t':
+      break;
+
+    case '\n':
+    case '\r':
+      return true;
+
+    case '/': {
+      int NextChar = peekNextChar(1);
+      if (NextChar == '/') {
+        // Skip C++-style comment.
+        // We may just return true now, but let's skip to the line/buffer end
+        // to simplify the method specification.
+        ++CurPtr;
+        SkipBCPLComment();
+      } else if (NextChar == '*') {
+        // When we are skipping C-style comment at the end of a preprocessing
+        // directive, we can skip several lines.  If any meaningful TD token
+        // follows the end of the C-style comment on the same line, it will
+        // be considered as an invalid usage of TD token.
+        // For example, we want to forbid usages like this one:
+        //     #define MACRO class Class {}
+        // But with C-style comments we also disallow the following:
+        //     #define MACRO /* This macro is used
+        //                      to ... */ class Class {}
+        // One can argue that this should be allowed, but it does not seem
+        // to be worth of the complication.  Moreover, this matches
+        // the C preprocessor behavior.
+
+        // Set TokStart to the beginning of the comment to enable proper
+        // diagnostic printer in case of error in SkipCComment().
+        TokStart = CurPtr;
+        ++CurPtr;
+        if (SkipCComment())
+          return false;
+      } else {
+        TokStart = CurPtr;
+        PrintError(CurPtr, "Unexpected character");
+        return false;
+      }
+
+      // We must not increment CurPtr after the comment was lexed.
+      continue;
+    }
+
+    default:
+      // Do not allow any non-whitespaces after the directive.
+      TokStart = CurPtr;
+      return false;
+    }
+
+    ++CurPtr;
+  }
+
+  return true;
+}
+
+void TGLexer::prepSkipToLineEnd() {
+  while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end())
+    ++CurPtr;
+}
+
+bool TGLexer::prepIsProcessingEnabled() {
+  for (auto I = PrepIncludeStack.back()->rbegin(),
+            E = PrepIncludeStack.back()->rend();
+       I != E; ++I) {
+    if (!I->IsDefined)
+      return false;
+  }
+
+  return true;
+}
+
+void TGLexer::prepReportPreprocessorStackError() {
+  if (PrepIncludeStack.back()->empty())
+    PrintFatalError("prepReportPreprocessorStackError() called with "
+                    "empty control stack");
+
+  auto &PrepControl = PrepIncludeStack.back()->back();
+  PrintError(CurBuf.end(), "Reached EOF without matching #endif");
+  PrintError(PrepControl.SrcPos, "The latest preprocessor control is here");
+
+  TokStart = CurPtr;
+}
diff --git a/lib/TableGen/TGLexer.h b/lib/TableGen/TGLexer.h
index 2c80743e3a6840068a6636ecb6e9f09ea520ef44..e9980b36b97bb0bfd29a643a6b9289bcbb4aff1f 100644
--- a/lib/TableGen/TGLexer.h
+++ b/lib/TableGen/TGLexer.h
@@ -14,11 +14,14 @@
 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
 #define LLVM_LIB_TABLEGEN_TGLEXER_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/SMLoc.h"
 #include <cassert>
 #include <map>
+#include <memory>
 #include <string>
 
 namespace llvm {
@@ -59,7 +62,11 @@ namespace tgtok {
     BinaryIntVal,
 
     // String valued tokens.
-    Id, StrVal, VarName, CodeFragment
+    Id, StrVal, VarName, CodeFragment,
+
+    // Preprocessing tokens for internal usage by the lexer.
+    // They are never returned as a result of Lex().
+    Ifdef, Else, Endif, Define
   };
 }
 
@@ -87,10 +94,10 @@ private:
   DependenciesMapTy Dependencies;
 
 public:
-  TGLexer(SourceMgr &SrcMgr);
+  TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
 
   tgtok::TokKind Lex() {
-    return CurCode = LexToken();
+    return CurCode = LexToken(CurPtr == CurBuf.begin());
   }
 
   const DependenciesMapTy &getDependencies() const {
@@ -119,12 +126,13 @@ public:
 
 private:
   /// LexToken - Read the next token and return its code.
-  tgtok::TokKind LexToken();
+  tgtok::TokKind LexToken(bool FileOrLineStart = false);
 
+  tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
   tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
 
   int getNextChar();
-  int peekNextChar(int Index);
+  int peekNextChar(int Index) const;
   void SkipBCPLComment();
   bool SkipCComment();
   tgtok::TokKind LexIdentifier();
@@ -134,6 +142,231 @@ private:
   tgtok::TokKind LexNumber();
   tgtok::TokKind LexBracket();
   tgtok::TokKind LexExclaim();
+
+  // Process EOF encountered in LexToken().
+  // If EOF is met in an include file, then the method will update
+  // CurPtr, CurBuf and preprocessing include stack, and return true.
+  // If EOF is met in the top-level file, then the method will
+  // update and check the preprocessing include stack, and return false.
+  bool processEOF();
+
+  // *** Structures and methods for preprocessing support ***
+
+  // A set of macro names that are defined either via command line or
+  // by using:
+  //     #define NAME
+  StringSet<> DefinedMacros;
+
+  // Each of #ifdef and #else directives has a descriptor associated
+  // with it.
+  //
+  // An ordered list of preprocessing controls defined by #ifdef/#else
+  // directives that are in effect currently is called preprocessing
+  // control stack.  It is represented as a vector of PreprocessorControlDesc's.
+  //
+  // The control stack is updated according to the following rules:
+  //
+  // For each #ifdef we add an element to the control stack.
+  // For each #else we replace the top element with a descriptor
+  // with an inverted IsDefined value.
+  // For each #endif we pop the top element from the control stack.
+  //
+  // When CurPtr reaches the current buffer's end, the control stack
+  // must be empty, i.e. #ifdef and the corresponding #endif
+  // must be located in the same file.
+  struct PreprocessorControlDesc {
+    // Either tgtok::Ifdef or tgtok::Else.
+    tgtok::TokKind Kind;
+
+    // True, if the condition for this directive is true, false - otherwise.
+    // Examples:
+    //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
+    //     ...
+    //     #else             : false, if NAME is defined, true - otherwise.
+    bool IsDefined;
+
+    // Pointer into CurBuf to the beginning of the preprocessing directive
+    // word, e.g.:
+    //     #ifdef NAME
+    //      ^ - SrcPos
+    SMLoc SrcPos;
+  };
+
+  // We want to disallow code like this:
+  //     file1.td:
+  //         #define NAME
+  //         #ifdef NAME
+  //         include "file2.td"
+  //     EOF
+  //     file2.td:
+  //         #endif
+  //     EOF
+  //
+  // To do this, we clear the preprocessing control stack on entry
+  // to each of the included file.  PrepIncludeStack is used to store
+  // preprocessing control stacks for the current file and all its
+  // parent files.  The back() element is the preprocessing control
+  // stack for the current file.
+  std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
+      PrepIncludeStack;
+
+  // Validate that the current preprocessing control stack is empty,
+  // since we are about to exit a file, and pop the include stack.
+  //
+  // If IncludeStackMustBeEmpty is true, the include stack must be empty
+  // after the popping, otherwise, the include stack must not be empty
+  // after the popping.  Basically, the include stack must be empty
+  // only if we exit the "top-level" file (i.e. finish lexing).
+  //
+  // The method returns false, if the current preprocessing control stack
+  // is not empty (e.g. there is an unterminated #ifdef/#else),
+  // true - otherwise.
+  bool prepExitInclude(bool IncludeStackMustBeEmpty);
+
+  // Look ahead for a preprocessing directive starting from CurPtr.  The caller
+  // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
+  // a preprocessing directive word followed by a whitespace, then it returns
+  // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
+  //
+  // CurPtr is not adjusted by this method.
+  tgtok::TokKind prepIsDirective() const;
+
+  // Given a preprocessing token kind, adjusts CurPtr to the end
+  // of the preprocessing directive word.  Returns true, unless
+  // an unsupported token kind is passed in.
+  //
+  // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
+  // to avoid adjusting CurPtr before we are sure that '#' is followed
+  // by a preprocessing directive.  If it is not, then we fall back to
+  // tgtok::paste interpretation of '#'.
+  bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
+
+  // The main "exit" point from the token parsing to preprocessor.
+  //
+  // The method is called for CurPtr, when prepIsDirective() returns
+  // true.  The first parameter matches the result of prepIsDirective(),
+  // denoting the actual preprocessor directive to be processed.
+  //
+  // If the preprocessing directive disables the tokens processing, e.g.:
+  //     #ifdef NAME // NAME is undefined
+  // then lexPreprocessor() enters the lines-skipping mode.
+  // In this mode, it does not parse any tokens, because the code under
+  // the #ifdef may not even be a correct tablegen code.  The preprocessor
+  // looks for lines containing other preprocessing directives, which
+  // may be prepended with whitespaces and C-style comments.  If the line
+  // does not contain a preprocessing directive, it is skipped completely.
+  // Otherwise, the preprocessing directive is processed by recursively
+  // calling lexPreprocessor().  The processing of the encountered
+  // preprocessing directives includes updating preprocessing control stack
+  // and adding new macros into DefinedMacros set.
+  //
+  // The second parameter controls whether lexPreprocessor() is called from
+  // LexToken() (true) or recursively from lexPreprocessor() (false).
+  //
+  // If ReturnNextLiveToken is true, the method returns the next
+  // LEX token following the current directive or following the end
+  // of the disabled preprocessing region corresponding to this directive.
+  // If ReturnNextLiveToken is false, the method returns the first parameter,
+  // unless there were errors encountered in the disabled preprocessing
+  // region - in this case, it returns tgtok::Error.
+  tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
+                                 bool ReturnNextLiveToken = true);
+
+  // Worker method for lexPreprocessor() to skip lines after some
+  // preprocessing directive up to the buffer end or to the directive
+  // that re-enables token processing.  The method returns true
+  // upon processing the next directive that re-enables tokens
+  // processing.  False is returned if an error was encountered.
+  //
+  // Note that prepSkipRegion() calls lexPreprocessor() to process
+  // encountered preprocessing directives.  In this case, the second
+  // parameter to lexPreprocessor() is set to false.  Being passed
+  // false ReturnNextLiveToken, lexPreprocessor() must never call
+  // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
+  // to prepSkipRegion() and checking that it is never set to false.
+  bool prepSkipRegion(bool MustNeverBeFalse);
+
+  // Lex name of the macro after either #ifdef or #define.  We could have used
+  // LexIdentifier(), but it has special handling of "include" word, which
+  // could result in awkward diagnostic errors.  Consider:
+  // ----
+  // #ifdef include
+  // class ...
+  // ----
+  // LexIdentifier() will engage LexInclude(), which will complain about
+  // missing file with name "class".  Instead, prepLexMacroName() will treat
+  // "include" as a normal macro name.
+  //
+  // On entry, CurPtr points to the end of a preprocessing directive word.
+  // The method allows for whitespaces between the preprocessing directive
+  // and the macro name.  The allowed whitespaces are ' ' and '\t'.
+  //
+  // If the first non-whitespace symbol after the preprocessing directive
+  // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
+  // the method updates TokStart to the position of the first non-whitespace
+  // symbol, sets CurPtr to the position of the macro name's last symbol,
+  // and returns a string reference to the macro name.  Otherwise,
+  // TokStart is set to the first non-whitespace symbol after the preprocessing
+  // directive, and the method returns an empty string reference.
+  //
+  // In all cases, TokStart may be used to point to the word following
+  // the preprocessing directive.
+  StringRef prepLexMacroName();
+
+  // Skip any whitespaces starting from CurPtr.  The method is used
+  // only in the lines-skipping mode to find the first non-whitespace
+  // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
+  // and '\r'.  The method skips C-style comments as well, because
+  // it is used to find the beginning of the preprocessing directive.
+  // If we do not handle C-style comments the following code would
+  // result in incorrect detection of a preprocessing directive:
+  //     /*
+  //     #ifdef NAME
+  //     */
+  // As long as we skip C-style comments, the following code is correctly
+  // recognized as a preprocessing directive:
+  //     /* first line comment
+  //        second line comment */ #ifdef NAME
+  //
+  // The method returns true upon reaching the first non-whitespace symbol
+  // or EOF, CurPtr is set to point to this symbol.  The method returns false,
+  // if an error occured during skipping of a C-style comment.
+  bool prepSkipLineBegin();
+
+  // Skip any whitespaces or comments after a preprocessing directive.
+  // The method returns true upon reaching either end of the line
+  // or end of the file.  If there is a multiline C-style comment
+  // after the preprocessing directive, the method skips
+  // the comment, so the final CurPtr may point to one of the next lines.
+  // The method returns false, if an error occured during skipping
+  // C- or C++-style comment, or a non-whitespace symbol appears
+  // after the preprocessing directive.
+  //
+  // The method maybe called both during lines-skipping and tokens
+  // processing.  It actually verifies that only whitespaces or/and
+  // comments follow a preprocessing directive.
+  //
+  // After the execution of this mehod, CurPtr points either to new line
+  // symbol, buffer end or non-whitespace symbol following the preprocesing
+  // directive.
+  bool prepSkipDirectiveEnd();
+
+  // Skip all symbols to the end of the line/file.
+  // The method adjusts CurPtr, so that it points to either new line
+  // symbol in the current line or the buffer end.
+  void prepSkipToLineEnd();
+
+  // Return true, if the current preprocessor control stack is such that
+  // we should allow lexer to process the next token, false - otherwise.
+  //
+  // In particular, the method returns true, if all the #ifdef/#else
+  // controls on the stack have their IsDefined member set to true.
+  bool prepIsProcessingEnabled();
+
+  // Report an error, if we reach EOF with non-empty preprocessing control
+  // stack.  This means there is no matching #endif for the previous
+  // #ifdef/#else.
+  void prepReportPreprocessorStackError();
 };
 
 } // end namespace llvm
diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
index 0a28b3a03aa19cc90b5e4d82a3750eecbc59d468..e3849043513b58106ecb71d8d4768d1121a2e20d 100644
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h
@@ -115,8 +115,9 @@ class TGParser {
   };
 
 public:
-  TGParser(SourceMgr &SrcMgr, RecordKeeper &records)
-      : Lex(SrcMgr), CurMultiClass(nullptr), Records(records) {}
+  TGParser(SourceMgr &SrcMgr, ArrayRef<std::string> Macros,
+           RecordKeeper &records)
+    : Lex(SrcMgr, Macros), CurMultiClass(nullptr), Records(records) {}
 
   /// ParseFile - Main entrypoint for parsing a tblgen file.  These parser
   /// routines return true on error, or false on success.
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 2f0d0bf346d6db04127fc2857bf25c2c7d3f72a3..c36d9354f3ba6f234fab670da883f96ec374a37d 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -39,6 +39,7 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
                                  CodeGenOpt::Level OptLevel);
 FunctionPass *createAArch64StorePairSuppressPass();
 FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64SpeculationHardeningPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
 FunctionPass *createAArch64SIMDInstrOptPass();
 ModulePass *createAArch64PromoteConstantPass();
@@ -68,6 +69,7 @@ void initializeAArch64ConditionalComparesPass(PassRegistry&);
 void initializeAArch64ConditionOptimizerPass(PassRegistry&);
 void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
 void initializeAArch64ExpandPseudoPass(PassRegistry&);
+void initializeAArch64SpeculationHardeningPass(PassRegistry&);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
 void initializeAArch64SIMDInstrOptPass(PassRegistry&);
 void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 9d596a1821c8de72fb6b0c54ff67c8c74a3aaed7..a13f6c848816f79c29ac3b5301d29cd57b464dda 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -65,6 +65,18 @@ def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
 def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
   "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;
 
+def FeaturePAN : SubtargetFeature<
+    "pan", "HasPAN", "true",
+    "Enables ARM v8.1 Privileged Access-Never extension">;
+
+def FeatureLOR : SubtargetFeature<
+    "lor", "HasLOR", "true",
+    "Enables ARM v8.1 Limited Ordering Regions extension">;
+
+def FeatureVH : SubtargetFeature<
+    "vh", "HasVH", "true",
+    "Enables ARM v8.1 Virtual Host extension">;
+
 def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
   "Enable ARMv8 PMUv3 Performance Monitors extension">;
 
@@ -77,6 +89,18 @@ def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true",
 def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
   "Enable Statistical Profiling extension">;
 
+def FeaturePAN_RWV : SubtargetFeature<
+    "pan-rwv", "HasPAN_RWV", "true",
+    "Enable v8.2 PAN s1e1R and s1e1W Variants",
+    [FeaturePAN]>;
+
+// UAO PState
+def FeaturePsUAO : SubtargetFeature< "uaops", "HasPsUAO", "true",
+    "Enable v8.2 UAO PState">;
+
+def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
+    "true", "Enable v8.2 data Cache Clean to Point of Persistence" >;
+
 def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
   "Enable Scalable Vector Extension (SVE) instructions">;
 
@@ -125,11 +149,11 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature<
 
 def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
     "CustomAsCheapAsMove", "true",
-    "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+    "Use custom handling of cheap instructions">;
 
 def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
     "ExynosAsCheapAsMove", "true",
-    "Use Exynos specific code in TargetInstrInfo::isAsCheapAsAMove()",
+    "Use Exynos specific handling of cheap instructions",
     [FeatureCustomCheapAsMoveHandling]>;
 
 def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
@@ -195,6 +219,66 @@ def FeatureDotProd : SubtargetFeature<
     "dotprod", "HasDotProd", "true",
     "Enable dot product support">;
 
+def FeaturePA : SubtargetFeature<
+    "pa", "HasPA", "true",
+    "Enable v8.3-A Pointer Authentication enchancement">;
+
+def FeatureJS : SubtargetFeature<
+    "jsconv", "HasJS", "true",
+    "Enable v8.3-A JavaScript FP conversion enchancement",
+    [FeatureFPARMv8]>;
+
+def FeatureCCIDX : SubtargetFeature<
+    "ccidx", "HasCCIDX", "true",
+    "Enable v8.3-A Extend of the CCSIDR number of sets">;
+
+def FeatureComplxNum : SubtargetFeature<
+    "complxnum", "HasComplxNum", "true",
+    "Enable v8.3-A Floating-point complex number support",
+    [FeatureNEON]>;
+
+def FeatureNV : SubtargetFeature<
+    "nv", "HasNV", "true",
+    "Enable v8.4-A Nested Virtualization Enchancement">;
+
+def FeatureRASv8_4 : SubtargetFeature<
+    "rasv8_4", "HasRASv8_4", "true",
+    "Enable v8.4-A Reliability, Availability and Serviceability extension",
+    [FeatureRAS]>;
+
+def FeatureMPAM : SubtargetFeature<
+    "mpam", "HasMPAM", "true",
+    "Enable v8.4-A Memory system Partitioning and Monitoring extension">;
+
+def FeatureDIT : SubtargetFeature<
+    "dit", "HasDIT", "true",
+    "Enable v8.4-A Data Independent Timing instructions">;
+
+def FeatureTRACEV8_4 : SubtargetFeature<
+    "tracev8.4", "HasTRACEV8_4", "true",
+    "Enable v8.4-A Trace extension">;
+
+def FeatureAM : SubtargetFeature<
+    "am", "HasAM", "true",
+    "Enable v8.4-A Activity Monitors extension">;
+
+def FeatureSEL2 : SubtargetFeature<
+    "sel2", "HasSEL2", "true",
+    "Enable v8.4-A Secure Exception Level 2 extension">;
+
+def FeatureTLB_RMI : SubtargetFeature<
+    "tlb-rmi", "HasTLB_RMI", "true",
+    "Enable v8.4-A TLB Range and Maintenance Instructions">;
+
+def FeatureFMI : SubtargetFeature<
+    "fmi", "HasFMI", "true",
+    "Enable v8.4-A Flag Manipulation Instructions">;
+
+// 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset
+def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true",
+    "Enable v8.4-A RCPC instructions with Immediate Offsets",
+    [FeatureRCPC]>;
+
 def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
                                         "NegativeImmediates", "false",
                                         "Convert immediates and instructions "
@@ -222,6 +306,9 @@ def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true",
 def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict",
   "true", "Enable architectural speculation restriction" >;
 
+def FeatureSSBS : SubtargetFeature<"ssbs", "HasSSBS",
+  "true", "Enable Speculative Store Bypass Safe bit" >;
+
 def FeatureSpecCtrl : SubtargetFeature<"specctrl", "HasSpecCtrl", "true",
   "Enable speculation control barrier" >;
 
@@ -229,7 +316,7 @@ def FeaturePredCtrl : SubtargetFeature<"predctrl", "HasPredCtrl", "true",
   "Enable execution and data prediction invalidation instructions" >;
 
 def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "HasCCDP",
-    "true", "Enable Cache Clean to Point of Deep Persistence" >;
+    "true", "Enable v8.5 Cache Clean to Point of Deep Persistence" >;
 
 def FeatureBranchTargetId : SubtargetFeature<"bti", "HasBTI",
     "true", "Enable Branch Target Identification" >;
@@ -245,21 +332,27 @@ def FeatureMTE : SubtargetFeature<"mte", "HasMTE",
 //
 
 def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
-  "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>;
+  "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM,
+  FeaturePAN, FeatureLOR, FeatureVH]>;
 
 def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
-  "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
+  "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO, 
+  FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>;
 
 def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
-  "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC]>;
+  "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePA,
+  FeatureJS, FeatureCCIDX, FeatureComplxNum]>;
 
 def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
-  "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd]>;
+  "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
+  FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT,
+  FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
+  FeatureFMI, FeatureRCPC_IMMO]>;
 
 def HasV8_5aOps : SubtargetFeature<
   "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
   [HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict,
-   FeatureSpecCtrl, FeaturePredCtrl, FeatureCacheDeepPersist,
+   FeatureSSBS, FeatureSpecCtrl, FeaturePredCtrl, FeatureCacheDeepPersist,
    FeatureBranchTargetId]
 >;
 
@@ -277,6 +370,8 @@ include "AArch64CallingConvention.td"
 
 include "AArch64Schedule.td"
 include "AArch64InstrInfo.td"
+include "AArch64SchedPredicates.td"
+include "AArch64SchedPredExynos.td"
 
 def AArch64InstrInfo : InstrInfo;
 
@@ -546,6 +641,21 @@ def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
                                        FeaturePredictableSelectIsExpensive,
                                        FeatureNEON]>;
 
+def ProcTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
+                                  "HiSilicon TS-V110 processors", [
+                                  HasV8_2aOps,
+                                  FeatureCrypto,
+                                  FeatureCustomCheapAsMoveHandling,
+                                  FeatureFPARMv8,
+                                  FeatureFuseAES,
+                                  FeatureNEON,
+                                  FeaturePerfMon,
+                                  FeaturePostRAScheduler,
+                                  FeatureSPE,
+                                  FeatureFullFP16,
+                                  FeatureFP16FML,
+                                  FeatureDotProd]>;
+
 def : ProcessorModel<"generic", NoSchedModel, [
                      FeatureFPARMv8,
                      FeatureFuseAES,
@@ -578,6 +688,8 @@ def : ProcessorModel<"thunderxt81", ThunderXT8XModel,  [ProcThunderXT81]>;
 def : ProcessorModel<"thunderxt83", ThunderXT8XModel,  [ProcThunderXT83]>;
 // Cavium ThunderX2T9X  Processors. Formerly Broadcom Vulcan.
 def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
+// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
+def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;
 
 //===----------------------------------------------------------------------===//
 // Assembly parser
@@ -626,3 +738,9 @@ def AArch64 : Target {
   let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
   let AllowRegisterRenaming = 1;
 }
+
+//===----------------------------------------------------------------------===//
+// Pfm Counters
+//===----------------------------------------------------------------------===//
+
+include "AArch64PfmCounters.td"
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 1ff0392c0f2692c599dcb7aee580228c95907ff8..828fbb2de7ee7600e9349bfbacd7fe8a6207ac84 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -28,6 +28,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -109,12 +110,33 @@ public:
     AU.setPreservesAll();
   }
 
-  bool runOnMachineFunction(MachineFunction &F) override {
-    AArch64FI = F.getInfo<AArch64FunctionInfo>();
-    STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget());
-    bool Result = AsmPrinter::runOnMachineFunction(F);
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    AArch64FI = MF.getInfo<AArch64FunctionInfo>();
+    STI = static_cast<const AArch64Subtarget*>(&MF.getSubtarget());
+
+    SetupMachineFunction(MF);
+
+    if (STI->isTargetCOFF()) {
+      bool Internal = MF.getFunction().hasInternalLinkage();
+      COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC
+                                              : COFF::IMAGE_SYM_CLASS_EXTERNAL;
+      int Type =
+        COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
+
+      OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
+      OutStreamer->EmitCOFFSymbolStorageClass(Scl);
+      OutStreamer->EmitCOFFSymbolType(Type);
+      OutStreamer->EndCOFFSymbolDef();
+    }
+
+    // Emit the rest of the function body.
+    EmitFunctionBody();
+
+    // Emit the XRay table for this function.
     emitXRayTable();
-    return Result;
+
+    // We didn't modify anything.
+    return false;
   }
 
 private:
@@ -217,7 +239,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
     // linker can safely perform dead code stripping.  Since LLVM never
     // generates code that does this, it is always safe to set.
     OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
-    SM.serializeToStackMapSection();
+    emitStackMaps(SM);
   }
 }
 
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 2f6cb4c8670a2883673a1ada454613af69348127..5db941e9dac72b651ded275d04ec4b91bf31b01b 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -123,7 +123,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
 
 // Vararg functions on windows pass floats in integer registers
 def CC_AArch64_Win64_VarArg : CallingConv<[
-  CCIfType<[f16, f32],    CCPromoteToType<f64>>,
+  CCIfType<[f16, f32], CCPromoteToType<f64>>,
   CCIfType<[f64], CCBitConvertToType<i64>>,
   CCDelegateTo<CC_AArch64_AAPCS>
 ]>;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index dfc08a12f51343b4524f6978fa111362d7050015..47550cabb9f0dac8a899c02d2af7f42715e0a2b8 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -2016,8 +2016,9 @@ bool AArch64FastISel::selectLoad(const Instruction *I) {
       if (RetVT == MVT::i64 && VT <= MVT::i32) {
         if (WantZExt) {
           // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG).
-          std::prev(FuncInfo.InsertPt)->eraseFromParent();
-          ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg();
+          MachineBasicBlock::iterator I(std::prev(FuncInfo.InsertPt));
+          ResultReg = std::prev(I)->getOperand(0).getReg();
+          removeDeadCode(I, std::next(I));
         } else
           ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg,
                                                  /*IsKill=*/true,
@@ -2038,7 +2039,8 @@ bool AArch64FastISel::selectLoad(const Instruction *I) {
           break;
         }
       }
-      MI->eraseFromParent();
+      MachineBasicBlock::iterator I(MI);
+      removeDeadCode(I, std::next(I));
       MI = nullptr;
       if (Reg)
         MI = MRI.getUniqueVRegDef(Reg);
@@ -2256,6 +2258,13 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
 
 /// Try to emit a combined compare-and-branch instruction.
 bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
+  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
+  // will not be produced, as they are conditional branch instructions that do
+  // not set flags.
+  if (FuncInfo.MF->getFunction().hasFnAttribute(
+          Attribute::SpeculativeLoadHardening))
+    return false;
+
   assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");
   const CmpInst *CI = cast<CmpInst>(BI->getCondition());
   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
@@ -4508,7 +4517,8 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
             MI->getOperand(1).getSubReg() == AArch64::sub_32) &&
            "Expected copy instruction");
     Reg = MI->getOperand(1).getReg();
-    MI->eraseFromParent();
+    MachineBasicBlock::iterator I(MI);
+    removeDeadCode(I, std::next(I));
   }
   updateValueMap(I, Reg);
   return true;
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 9c85001481d18598b86c60e28d5a883b38d871e4..800dd449b49f2d5fb84077e6c93549eb19290584 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -204,6 +204,11 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
 bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  // Win64 EH requires a frame pointer if funclets are present, as the locals
+  // are accessed off the frame pointer in both the parent function and the
+  // funclets.
+  if (MF.hasEHFunclets())
+    return true;
   // Retain behavior of always omitting the FP for leaf functions when possible.
   if (MFI.hasCalls() && MF.getTarget().Options.DisableFramePointerElim(MF))
     return true;
@@ -585,10 +590,12 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
     bool NeedsWinCFI, bool InProlog = true) {
   // Ignore instructions that do not operate on SP, i.e. shadow call stack
-  // instructions.
+  // instructions and associated CFI instruction.
   while (MBBI->getOpcode() == AArch64::STRXpost ||
-         MBBI->getOpcode() == AArch64::LDRXpre) {
-    assert(MBBI->getOperand(0).getReg() != AArch64::SP);
+         MBBI->getOpcode() == AArch64::LDRXpre ||
+         MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
+    if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
+      assert(MBBI->getOperand(0).getReg() != AArch64::SP);
     ++MBBI;
   }
   unsigned NewOpc;
@@ -685,9 +692,11 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
   unsigned Opc = MI.getOpcode();
 
   // Ignore instructions that do not operate on SP, i.e. shadow call stack
-  // instructions.
-  if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre) {
-    assert(MI.getOperand(0).getReg() != AArch64::SP);
+  // instructions and associated CFI instruction.
+  if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
+      Opc == AArch64::CFI_INSTRUCTION) {
+    if (Opc != AArch64::CFI_INSTRUCTION)
+      assert(MI.getOperand(0).getReg() != AArch64::SP);
     return;
   }
 
@@ -774,6 +783,12 @@ static bool ShouldSignWithAKey(MachineFunction &MF) {
   return Key.equals_lower("a_key");
 }
 
+static bool needsWinCFI(const MachineFunction &MF) {
+  const Function &F = MF.getFunction();
+  return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+         F.needsUnwindTableEntry();
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -787,9 +802,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   bool needsFrameMoves = (MMI.hasDebugInfo() || F.needsUnwindTableEntry()) &&
                          !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool HasFP = hasFP(MF);
-  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
-                     F.needsUnwindTableEntry();
+  bool NeedsWinCFI = needsWinCFI(MF);
   MF.setHasWinCFI(NeedsWinCFI);
+  bool IsFunclet = MBB.isEHFuncletEntry();
+
   // At this point, we're going to decide whether or not the function uses a
   // redzone. In most cases, the function doesn't have a redzone so let's
   // assume that's false and set it to true in the case that there's a redzone.
@@ -804,6 +820,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
         MBB, MBBI, DL,
         TII->get(ShouldSignWithAKey(MF) ? AArch64::PACIASP : AArch64::PACIBSP))
         .setMIFlag(MachineInstr::FrameSetup);
+
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
   }
 
   // All calls are tail calls in GHC calling conv, and functions have no
@@ -811,7 +833,14 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
 
-  int NumBytes = (int)MFI.getStackSize();
+  // getStackSize() includes all the locals in its size calculation. We don't
+  // include these locals when computing the stack size of a funclet, as they
+  // are allocated in the parent's stack frame and accessed via the frame
+  // pointer from the funclet.  We only save the callee saved registers in the
+  // funclet, which are really the callee saved registers of the parent
+  // function, including the funclet.
+  int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF)
+                           : (int)MFI.getStackSize();
   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
     // All of the stack allocation is for locals.
@@ -847,7 +876,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
 
   bool IsWin64 =
       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
-  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+  // Var args are accounted for in the containing function, so don't
+  // include them for funclets.
+  unsigned FixedObject = (IsWin64 && !IsFunclet) ?
+                         alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
 
   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
   // All of the remaining stack allocations are for locals.
@@ -875,6 +907,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     ++MBBI;
   }
 
+  // The code below is not applicable to funclets. We have emitted all the SEH
+  // opcodes that we needed to emit.  The FP and BP belong to the containing
+  // function.
+  if (IsFunclet) {
+    if (NeedsWinCFI)
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+          .setMIFlag(MachineInstr::FrameSetup);
+    return;
+  }
+
   if (HasFP) {
     // Only set up FP if we actually need to. Frame pointer is fp =
     // sp - fixedobject - 16.
@@ -1165,6 +1207,16 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
   }
 }
 
+static bool isFuncletReturnInstr(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::CATCHRET:
+  case AArch64::CLEANUPRET:
+    return true;
+  }
+}
+
 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
@@ -1173,8 +1225,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL;
   bool IsTailCallReturn = false;
-  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
-                     MF.getFunction().needsUnwindTableEntry();
+  bool NeedsWinCFI = needsWinCFI(MF);
+  bool IsFunclet = false;
 
   if (MBB.end() != MBBI) {
     DL = MBBI->getDebugLoc();
@@ -1182,9 +1234,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
                        RetOpcode == AArch64::TCRETURNri ||
                        RetOpcode == AArch64::TCRETURNriBTI;
+    IsFunclet = isFuncletReturnInstr(*MBBI);
   }
 
-  int NumBytes = MFI.getStackSize();
+  int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF)
+                           : MFI.getStackSize();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
 
   // All calls are tail calls in GHC calling conv, and functions have no
@@ -1241,10 +1295,19 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
 
   bool IsWin64 =
       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
-  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+  // Var args are accounted for in the containing function, so don't
+  // include them for funclets.
+  unsigned FixedObject =
+      (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
 
   uint64_t AfterCSRPopSize = ArgumentPopSize;
   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
+  // We cannot rely on the local stack size set in emitPrologue if the function
+  // has funclets, as funclets have different local stack size requirements, and
+  // the current value set in emitPrologue may be that of the containing
+  // function.
+  if (MF.hasEHFunclets())
+    AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
   // Assume we can't combine the last pop with the sp restore.
 
@@ -1340,7 +1403,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // FIXME: Rather than doing the math here, we should instead just use
   // non-post-indexed loads for the restores if we aren't actually going to
   // be able to save any instructions.
-  if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
+  if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned()))
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
                     -AFI->getCalleeSavedStackSize() + 16, TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
@@ -1445,6 +1508,14 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
         // being in range for direct access. If the FPOffset is positive,
         // that'll always be best, as the SP will be even further away.
         UseFP = true;
+      } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
+        // Funclets access the locals contained in the parent's stack frame
+        // via the frame pointer, so we have to use the FP in the parent
+        // function.
+        assert(
+            Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
+            "Funclets should only be present on Win64");
+        UseFP = true;
       } else {
         // We have the choice between FP and (SP or BP).
         if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
@@ -1538,8 +1609,7 @@ static void computeCalleeSaveRegisterPairs(
   if (CSI.empty())
     return;
 
-  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
-                     MF.getFunction().needsUnwindTableEntry();
+  bool NeedsWinCFI = needsWinCFI(MF);
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -1652,8 +1722,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
-                     MF.getFunction().needsUnwindTableEntry();
+  bool NeedsWinCFI = needsWinCFI(MF);
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
 
@@ -1675,6 +1744,23 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
           .setMIFlag(MachineInstr::FrameSetup);
 
+    if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
+      // Emit a CFI instruction that causes 8 to be subtracted from the value of
+      // x18 when unwinding past this frame.
+      static const char CFIInst[] = {
+          dwarf::DW_CFA_val_expression,
+          18, // register
+          2,  // length
+          static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
+          static_cast<char>(-8) & 0x7f, // addend (sleb128)
+      };
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createEscape(nullptr, CFIInst));
+      BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
     // This instruction also makes x18 live-in to the entry block.
     MBB.addLiveIn(AArch64::X18);
   }
@@ -1765,8 +1851,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
-  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
-                     MF.getFunction().needsUnwindTableEntry();
+  bool NeedsWinCFI = needsWinCFI(MF);
 
   if (MI != MBB.end())
     DL = MI->getDebugLoc();
@@ -1998,3 +2083,69 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   return AFI->hasCalleeSaveStackFreeSpace();
 }
+
+void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
+    MachineFunction &MF, RegScavenger *RS) const {
+  // If this function isn't doing Win64-style C++ EH, we don't need to do
+  // anything.
+  if (!MF.hasEHFunclets())
+    return;
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
+
+  MachineBasicBlock &MBB = MF.front();
+  auto MBBI = MBB.begin();
+  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+    ++MBBI;
+
+  if (MBBI->isTerminator())
+    return;
+
+  // Create an UnwindHelp object.
+  int UnwindHelpFI =
+      MFI.CreateStackObject(/*size*/8, /*alignment*/16, false);
+  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
+  // We need to store -2 into the UnwindHelp object at the start of the
+  // function.
+  DebugLoc DL;
+  RS->enterBasicBlock(MBB);
+  unsigned DstReg = RS->scavengeRegister(&AArch64::GPR64RegClass, MBBI, 0);
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
+      .addReg(DstReg, getKillRegState(true))
+      .addFrameIndex(UnwindHelpFI)
+      .addImm(0);
+}
+
+/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
+/// the update.  This is easily retrieved as it is exactly the offset that is set
+/// in processFunctionBeforeFrameFinalized.
+int AArch64FrameLowering::getFrameIndexReferencePreferSP(
+    const MachineFunction &MF, int FI, unsigned &FrameReg,
+    bool IgnoreSPUpdates) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
+                    << MFI.getObjectOffset(FI) << "\n");
+  FrameReg = AArch64::SP;
+  return MFI.getObjectOffset(FI);
+}
+
+/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
+/// the parent's frame pointer
+unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
+    const MachineFunction &MF) const {
+  return 0;
+}
+
+/// Funclets only need to account for space for the callee saved registers,
+/// as the locals are accounted for in the parent's stack frame.
+unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
+    const MachineFunction &MF) const {
+  // This is the size of the pushed CSRs.
+  unsigned CSSize =
+      MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
+  // This is the amount of stack a funclet needs to allocate.
+  return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
+                 getStackAlignment());
+}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 104e52b5f1f367ce8cda652aa81ef6f4f1349340..0d0385acf46e6155e27acb7793a007bfdc46994f 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -69,6 +69,17 @@ public:
 
   bool enableStackSlotScavenging(const MachineFunction &MF) const override;
 
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                             RegScavenger *RS) const override;
+
+  unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
+
+  unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
+
+  int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
+                                     unsigned &FrameReg,
+                                     bool IgnoreSPUpdates) const override;
+
 private:
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       unsigned StackBumpBytes) const;
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index d5d6d5ca23e1de93a117e76ecc773272983b2f80..cc10c9688e14f38ee3d99790334c245b2be4850a 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -714,8 +714,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 
       if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
-        setOperationAction(ISD::MULHS, VT, Custom);
-        setOperationAction(ISD::MULHU, VT, Custom);
+        setOperationAction(ISD::MULHS, VT, Legal);
+        setOperationAction(ISD::MULHU, VT, Legal);
       } else {
         setOperationAction(ISD::MULHS, VT, Expand);
         setOperationAction(ISD::MULHU, VT, Expand);
@@ -1273,6 +1273,20 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
   return EndBB;
 }
 
+MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
+       MachineInstr &MI, MachineBasicBlock *BB) const {
+  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
+             BB->getParent()->getFunction().getPersonalityFn())) &&
+         "SEH does not use catchret!");
+  return BB;
+}
+
+MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
+     MachineInstr &MI, MachineBasicBlock *BB) const {
+  MI.eraseFromParent();
+  return BB;
+}
+
 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB) const {
   switch (MI.getOpcode()) {
@@ -1288,6 +1302,11 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
   case TargetOpcode::STACKMAP:
   case TargetOpcode::PATCHPOINT:
     return emitPatchPoint(MI, BB);
+
+  case AArch64::CATCHRET:
+    return EmitLoweredCatchRet(MI, BB);
+  case AArch64::CATCHPAD:
+    return EmitLoweredCatchPad(MI, BB);
   }
 }
 
@@ -1501,6 +1520,11 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     RHS = RHS.getOperand(1);
+  } else if (isCMN(LHS, CC)) {
+    // As we are looking for EQ/NE compares, the operands can be commuted ; can
+    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
+    Opcode = AArch64ISD::ADDS;
+    LHS = LHS.getOperand(1);
   } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
              !isUnsignedIntSetCC(CC)) {
     // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
@@ -1527,33 +1551,44 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 ///   ccmp B, inv(CB), CA
 ///   check for CB flags
 ///
-/// In general we can create code for arbitrary "... (and (and A B) C)"
-/// sequences. We can also implement some "or" expressions, because "(or A B)"
-/// is equivalent to "not (and (not A) (not B))" and we can implement some
-/// negation operations:
-/// We can negate the results of a single comparison by inverting the flags
-/// used when the predicate fails and inverting the flags tested in the next
-/// instruction; We can also negate the results of the whole previous
-/// conditional compare sequence by inverting the flags tested in the next
-/// instruction. However there is no way to negate the result of a partial
-/// sequence.
+/// This naturally lets us implement chains of AND operations with SETCC
+/// operands. And we can even implement some other situations by transforming
+/// them:
+///   - We can implement (NEG SETCC) i.e. negating a single comparison by
+///     negating the flags used in a CCMP/FCCMP operations.
+///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
+///     by negating the flags we test for afterwards. i.e.
+///     NEG (CMP CCMP CCCMP ...) can be implemented.
+///   - Note that we can only ever negate all previously processed results.
+///     What we can not implement by flipping the flags to test is a negation
+///     of two sub-trees (because the negation affects all sub-trees emitted so
+///     far, so the 2nd sub-tree we emit would also affect the first).
+/// With those tools we can implement some OR operations:
+///   - (OR (SETCC A) (SETCC B)) can be implemented via:
+///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
+///   - After transforming OR to NEG/AND combinations we may be able to use NEG
+///     elimination rules from earlier to implement the whole thing as a
+///     CCMP/FCCMP chain.
 ///
-/// Therefore on encountering an "or" expression we can negate the subtree on
-/// one side and have to be able to push the negate to the leafs of the subtree
-/// on the other side (see also the comments in code). As complete example:
-/// "or (or (setCA (cmp A)) (setCB (cmp B)))
-///     (and (setCC (cmp C)) (setCD (cmp D)))"
-/// is transformed to
-/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
-///           (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
-/// and implemented as:
+/// As complete example:
+///     or (or (setCA (cmp A)) (setCB (cmp B)))
+///        (and (setCC (cmp C)) (setCD (cmp D)))"
+/// can be reassociated to:
+///     or (and (setCC (cmp C)) setCD (cmp D))
+//         (or (setCA (cmp A)) (setCB (cmp B)))
+/// can be transformed to:
+///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
+///              (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
+/// which can be implemented as:
 ///   cmp C
 ///   ccmp D, inv(CD), CC
 ///   ccmp A, CA, inv(CD)
 ///   ccmp B, CB, inv(CA)
 ///   check for CB flags
-/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
-/// by conditional compare sequences.
+///
+/// A counterexample is "or (and A B) (and C D)" which translates to
+/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
+/// can only implement 1 of the inner (not) operations, but not both!
 /// @{
 
 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
@@ -1593,9 +1628,20 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
 
 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
 /// expressed as a conjunction. See \ref AArch64CCMP.
-/// \param CanNegate        Set to true if we can also emit the negation of the
-///                         tree as a conjunction.
+/// \param CanNegate    Set to true if we can negate the whole sub-tree just by
+///                     changing the conditions on the SETCC tests.
+///                     (this means we can call emitConjunctionRec() with
+///                      Negate==true on this sub-tree)
+/// \param MustBeFirst  Set to true if this subtree needs to be negated and we
+///                     cannot do the negation naturally. We are required to
+///                     emit the subtree first in this case.
+/// \param WillNegate   Is true if are called when the result of this
+///                     subexpression must be negated. This happens when the
+///                     outer expression is an OR. We can use this fact to know
+///                     that we have a double negation (or (or ...) ...) that
+///                     can be implemented for free.
 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
+                               bool &MustBeFirst, bool WillNegate,
                                unsigned Depth = 0) {
   if (!Val.hasOneUse())
     return false;
@@ -1604,42 +1650,44 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
     if (Val->getOperand(0).getValueType() == MVT::f128)
       return false;
     CanNegate = true;
+    MustBeFirst = false;
     return true;
   }
   // Protect against exponential runtime and stack overflow.
   if (Depth > 6)
     return false;
   if (Opcode == ISD::AND || Opcode == ISD::OR) {
+    bool IsOR = Opcode == ISD::OR;
     SDValue O0 = Val->getOperand(0);
     SDValue O1 = Val->getOperand(1);
     bool CanNegateL;
-    if (!canEmitConjunction(O0, CanNegateL, Depth+1))
+    bool MustBeFirstL;
+    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
       return false;
     bool CanNegateR;
-    if (!canEmitConjunction(O1, CanNegateR, Depth+1))
+    bool MustBeFirstR;
+    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
       return false;
 
-    if (Opcode == ISD::OR) {
-      // For an OR expression we need to be able to negate at least one side or
-      // we cannot do the transformation at all.
+    if (MustBeFirstL && MustBeFirstR)
+      return false;
+
+    if (IsOR) {
+      // For an OR expression we need to be able to naturally negate at least
+      // one side or we cannot do the transformation at all.
       if (!CanNegateL && !CanNegateR)
         return false;
-      // However if we can negate x and y, then we can change
-      // (not (or x y))
-      // into
-      // (and (not x) (not y))
-      // to eliminate the outer negation.
-      CanNegate = CanNegateL && CanNegateR;
+      // If we the result of the OR will be negated and we can naturally negate
+      // the leafs, then this sub-tree as a whole negates naturally.
+      CanNegate = WillNegate && CanNegateL && CanNegateR;
+      // If we cannot naturally negate the whole sub-tree, then this must be
+      // emitted first.
+      MustBeFirst = !CanNegate;
     } else {
-      // If the operands are OR expressions then we finally need to negate their
-      // outputs, we can only do that for the operand with emitted last by
-      // negating OutCC, not for both operands.
-      bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
-      bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
-      if (NeedsNegOutL && NeedsNegOutR)
-        return false;
-      // We cannot negate an AND operation.
+      assert(Opcode == ISD::AND && "Must be OR or AND");
+      // We cannot naturally negate an AND operation.
       CanNegate = false;
+      MustBeFirst = MustBeFirstL || MustBeFirstR;
     }
     return true;
   }
@@ -1652,10 +1700,8 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
 /// and conditional compare operations. @returns an NZCV flags producing node
 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
 /// transformation was not possible.
-/// On recursive invocations @p PushNegate may be set to true to have negation
-/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
-/// for the comparisons in the current subtree; @p Depth limits the search
-/// depth to avoid stack overflow.
+/// \p Negate is true if we want this sub-tree being negated just by changing
+/// SETCC conditions.
 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
     AArch64CC::CondCode Predicate) {
@@ -1697,61 +1743,69 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
     return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                      DAG);
   }
-  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
-         "Valid conjunction/disjunction tree");
-
-  // Check if both sides can be transformed.
-  SDValue LHS = Val->getOperand(0);
-  SDValue RHS = Val->getOperand(1);
+  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
 
-  // In case of an OR we need to negate our operands and the result.
-  // (A v B) <=> not(not(A) ^ not(B))
-  bool NegateOpsAndResult = Opcode == ISD::OR;
-  // We can negate the results of all previous operations by inverting the
-  // predicate flags giving us a free negation for one side. The other side
-  // must be negatable by itself.
-  if (NegateOpsAndResult) {
-    // See which side we can negate.
-    bool CanNegateL;
-    bool isValidL = canEmitConjunction(LHS, CanNegateL);
-    assert(isValidL && "Valid conjunction/disjunction tree");
-    (void)isValidL;
+  bool IsOR = Opcode == ISD::OR;
 
-#ifndef NDEBUG
-    bool CanNegateR;
-    bool isValidR = canEmitConjunction(RHS, CanNegateR);
-    assert(isValidR && "Valid conjunction/disjunction tree");
-    assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
-#endif
+  SDValue LHS = Val->getOperand(0);
+  bool CanNegateL;
+  bool MustBeFirstL;
+  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
+  assert(ValidL && "Valid conjunction/disjunction tree");
+  (void)ValidL;
 
-    // Order the side which we cannot negate to RHS so we can emit it first.
-    if (!CanNegateL)
+  SDValue RHS = Val->getOperand(1);
+  bool CanNegateR;
+  bool MustBeFirstR;
+  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
+  assert(ValidR && "Valid conjunction/disjunction tree");
+  (void)ValidR;
+
+  // Swap sub-tree that must come first to the right side.
+  if (MustBeFirstL) {
+    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
+    std::swap(LHS, RHS);
+    std::swap(CanNegateL, CanNegateR);
+    std::swap(MustBeFirstL, MustBeFirstR);
+  }
+
+  bool NegateR;
+  bool NegateAfterR;
+  bool NegateL;
+  bool NegateAfterAll;
+  if (Opcode == ISD::OR) {
+    // Swap the sub-tree that we can negate naturally to the left.
+    if (!CanNegateL) {
+      assert(CanNegateR && "at least one side must be negatable");
+      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
+      assert(!Negate);
       std::swap(LHS, RHS);
+      NegateR = false;
+      NegateAfterR = true;
+    } else {
+      // Negate the left sub-tree if possible, otherwise negate the result.
+      NegateR = CanNegateR;
+      NegateAfterR = !CanNegateR;
+    }
+    NegateL = true;
+    NegateAfterAll = !Negate;
   } else {
-    bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
-    assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
-           "Valid conjunction/disjunction tree");
-    // Order the side where we need to negate the output flags to RHS so it
-    // gets emitted first.
-    if (NeedsNegOutL)
-      std::swap(LHS, RHS);
+    assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
+    assert(!Negate && "Valid conjunction/disjunction tree");
+
+    NegateL = false;
+    NegateR = false;
+    NegateAfterR = false;
+    NegateAfterAll = false;
   }
 
-  // Emit RHS. If we want to negate the tree we only need to push a negate
-  // through if we are already in a PushNegate case, otherwise we can negate
-  // the "flags to test" afterwards.
+  // Emit sub-trees.
   AArch64CC::CondCode RHSCC;
-  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, Negate,
-                                                   CCOp, Predicate);
-  if (NegateOpsAndResult && !Negate)
+  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
+  if (NegateAfterR)
     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
-  // Emit LHS. We may need to negate it.
-  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC,
-                                                   NegateOpsAndResult, CmpR,
-                                                   RHSCC);
-  // If we transformed an OR to and AND then we have to negate the result
-  // (or absorb the Negate parameter).
-  if (NegateOpsAndResult && !Negate)
+  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
+  if (NegateAfterAll)
     OutCC = AArch64CC::getInvertedCondCode(OutCC);
   return CmpL;
 }
@@ -1763,7 +1817,8 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
                                AArch64CC::CondCode &OutCC) {
   bool DummyCanNegate;
-  if (!canEmitConjunction(Val, DummyCanNegate))
+  bool DummyMustBeFirst;
+  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
     return SDValue();
 
   return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
@@ -2651,66 +2706,6 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
 }
 
-// Lower vector multiply high (ISD::MULHS and ISD::MULHU).
-static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) {
-  // Multiplications are only custom-lowered for 128-bit vectors so that
-  // {S,U}MULL{2} can be detected.  Otherwise v2i64 multiplications are not
-  // legal.
-  EVT VT = Op.getValueType();
-  assert(VT.is128BitVector() && VT.isInteger() &&
-         "unexpected type for custom-lowering ISD::MULH{U,S}");
-
-  SDValue V0 = Op.getOperand(0);
-  SDValue V1 = Op.getOperand(1);
-
-  SDLoc DL(Op);
-
-  EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
-
-  // We turn (V0 mulhs/mulhu V1) to:
-  //
-  // (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)),
-  //              (extract_subvector (ExtractVT V128:V1, (i64 0))))),
-  //       (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)),
-  //              (extract_subvector (ExtractVT V128:V2, (i64 VMull2Idx))))))
-  //
-  // Where ExtractVT is a subvector with half number of elements, and
-  // VMullIdx2 is the index of the middle element (the high part).
-  //
-  // The vector hight part extract and multiply will be matched against
-  // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64} which in turn will
-  // issue a {s}mull2 instruction.
-  //
-  // This basically multiply the lower subvector with '{s,u}mull', the high
-  // subvector with '{s,u}mull2', and shuffle both results high part in
-  // resulting vector.
-  unsigned Mull2VectorIdx = VT.getVectorNumElements () / 2;
-  SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64);
-  SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64);
-
-  SDValue VMullV0 =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx);
-  SDValue VMullV1 =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx);
-
-  SDValue VMull2V0 =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx);
-  SDValue VMull2V1 =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx);
-
-  unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL
-                                                  : AArch64ISD::UMULL;
-
-  EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext());
-  SDValue Mull  = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1);
-  SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1);
-
-  Mull  = DAG.getNode(ISD::BITCAST, DL, VT, Mull);
-  Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2);
-
-  return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2);
-}
-
 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                      SelectionDAG &DAG) const {
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2913,9 +2908,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:
     return LowerMUL(Op, DAG);
-  case ISD::MULHS:
-  case ISD::MULHU:
-    return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::STORE:
@@ -4351,6 +4343,13 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Dest = Op.getOperand(4);
   SDLoc dl(Op);
 
+  MachineFunction &MF = DAG.getMachineFunction();
+  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
+  // will not be produced, as they are conditional branch instructions that do
+  // not set flags.
+  bool ProduceNonFlagSettingCondBr =
+      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
+
   // Handle f128 first, since lowering it will result in comparing the return
   // value of a libcall against zero, which is just what the rest of LowerBR_CC
   // is expecting to deal with.
@@ -4393,7 +4392,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
     // If the RHS of the comparison is zero, we can potentially fold this
     // to a specialized branch.
     const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
-    if (RHSC && RHSC->getZExtValue() == 0) {
+    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
       if (CC == ISD::SETEQ) {
         // See if we can use a TBZ to fold in an AND as well.
         // TBZ has a smaller branch displacement than CBZ.  If the offset is
@@ -4436,7 +4435,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
       }
     }
     if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
-        LHS.getOpcode() != ISD::AND) {
+        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
       // Don't combine AND since emitComparison converts the AND to an ANDS
       // (a.k.a. TST) and the test in the test bit and branch instruction
       // becomes redundant.  This would also increase register pressure.
@@ -7241,7 +7240,10 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
     return DAG.getUNDEF(VT);
   }
 
-  if (isOnlyLowElement) {
+  // Convert BUILD_VECTOR where all elements but the lowest are undef into
+  // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
+  // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
+  if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
     LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
                          "SCALAR_TO_VECTOR node\n");
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
@@ -7823,7 +7825,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
   Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
 
   if (ShouldInvert)
-    return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
+    Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
 
   return Cmp;
 }
@@ -8084,6 +8086,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                   ISD::LoadExtType ExtTy,
                                                   EVT NewVT) const {
+  // TODO: This may be worth removing. Check regression tests for diffs.
+  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
+    return false;
+
   // If we're reducing the load width in order to avoid having to use an extra
   // instruction to do extension then it's probably a good idea.
   if (ExtTy != ISD::NON_EXTLOAD)
@@ -10808,6 +10814,13 @@ SDValue performCONDCombine(SDNode *N,
 static SDValue performBRCONDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
+  // will not be produced, as they are conditional branch instructions that do
+  // not set flags.
+  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
+    return SDValue();
+
   if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
     N = NV.getNode();
   SDValue Chain = N->getOperand(0);
@@ -11668,6 +11681,39 @@ Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
   return TargetLowering::getIRStackGuard(IRB);
 }
 
+void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
+  // MSVC CRT provides functionalities for stack protection.
+  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
+    // MSVC CRT has a global variable holding security cookie.
+    M.getOrInsertGlobal("__security_cookie",
+                        Type::getInt8PtrTy(M.getContext()));
+
+    // MSVC CRT has a function to validate security cookie.
+    auto *SecurityCheckCookie = cast<Function>(
+        M.getOrInsertFunction("__security_check_cookie",
+                              Type::getVoidTy(M.getContext()),
+                              Type::getInt8PtrTy(M.getContext())));
+    SecurityCheckCookie->setCallingConv(CallingConv::Win64);
+    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+    return;
+  }
+  TargetLowering::insertSSPDeclarations(M);
+}
+
+Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
+  // MSVC CRT has a global variable holding security cookie.
+  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+    return M.getGlobalVariable("__security_cookie");
+  return TargetLowering::getSDagStackGuard(M);
+}
+
+Value *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+  // MSVC CRT has a function to validate security cookie.
+  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+    return M.getFunction("__security_check_cookie");
+  return TargetLowering::getSSPStackGuardCheck(M);
+}
+
 Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   // Android provides a fixed TLS slot for the SafeStack pointer. See the
   // definition of TLS_SLOT_SAFESTACK in
@@ -11772,3 +11818,8 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+// Unlike X86, we let frame lowering assign offsets to all catch objects.
+bool AArch64TargetLowering::needsFixedCatchObjects() const {
+  return false;
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 7ee3b82a4ac12a7c3f8133122917028b844672b5..ffc4cc3ef53474064faf487c305a37f442005574 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -302,6 +302,12 @@ public:
   MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
+                                           MachineBasicBlock *BB) const;
+
+  MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
+                                         MachineBasicBlock *BB) const;
+
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *MBB) const override;
@@ -401,6 +407,10 @@ public:
   /// returns the address of that location. Otherwise, returns nullptr.
   Value *getIRStackGuard(IRBuilder<> &IRB) const override;
 
+  void insertSSPDeclarations(Module &M) const override;
+  Value *getSDagStackGuard(const Module &M) const override;
+  Value *getSSPStackGuardCheck(const Module &M) const override;
+
   /// If the target has a standard location for the unsafe stack pointer,
   /// returns the address of that location. Otherwise, returns nullptr.
   Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
@@ -517,6 +527,8 @@ public:
   bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
                                                  CallingConv::ID CallConv,
                                                  bool isVarArg) const override;
+  /// Used for exception handling on Win64.
+  bool needsFixedCatchObjects() const override;
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index ab90ea3f74ad63032a3fe95ed3f7597241c04b2b..9061ed4f9f54a6f275685ba66656cdd6072fe629 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -9989,9 +9989,10 @@ class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode,
   let Inst{4-0}   = Rd;
 }
 
+//8.3 CompNum - Floating-point complex number support
 multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
                                           string asm, SDPatternOperator OpNode>{
-  let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in {
+  let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
   def v4f16 : BaseSIMDThreeSameVectorComplex<0, U, 0b01, opcode, V64, rottype,
               asm, ".4h",
               [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
@@ -10007,7 +10008,7 @@ multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
                                                (rottype i32:$rot)))]>;
   }
 
-  let Predicates = [HasV8_3a, HasNEON] in {
+  let Predicates = [HasComplxNum, HasNEON] in {
   def v2f32 : BaseSIMDThreeSameVectorComplex<0, U, 0b10, opcode, V64, rottype,
               asm, ".2s",
               [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
@@ -10063,7 +10064,7 @@ class BaseSIMDThreeSameVectorTiedComplex<bit Q, bit U, bits<2> size,
 multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
                                              Operand rottype, string asm,
                                              SDPatternOperator OpNode> {
-  let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in {
+  let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
   def v4f16 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b01, opcode, V64,
               rottype, asm, ".4h",
               [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
@@ -10079,7 +10080,7 @@ multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
                                                (rottype i32:$rot)))]>;
   }
 
-  let Predicates = [HasV8_3a, HasNEON] in {
+  let Predicates = [HasComplxNum, HasNEON] in {
   def v2f32 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b10, opcode, V64,
               rottype, asm, ".2s",
               [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
@@ -10145,7 +10146,7 @@ class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size,
 // classes.
 multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype,
                                      string asm, SDPatternOperator OpNode> {
-  let Predicates = [HasV8_3a,HasNEON,HasFullFP16] in {
+  let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
   def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64,
                       V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h",
                       ".4h", ".h", []> {
@@ -10161,9 +10162,9 @@ multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype,
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
   }
-  } // Predicates = [HasV8_3a,HasNEON,HasFullFP16]
+  } // Predicates = HasComplxNum, HasNEON, HasFullFP16]
 
-  let Predicates = [HasV8_3a,HasNEON] in {
+  let Predicates = [HasComplxNum, HasNEON] in {
   def v4f32_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b10, opc1, opc2,
                       V128, V128, V128, VectorIndexD, rottype, asm, ".4s",
                       ".4s", ".4s", ".s", []> {
@@ -10171,7 +10172,7 @@ multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype,
     let Inst{11} = idx{0};
     let Inst{21} = 0;
   }
-  } // Predicates = [HasV8_3a,HasNEON]
+  } // Predicates = [HasComplxNum, HasNEON]
 }
 
 //----------------------------------------------------------------------------
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 7b4e05128058621a7acf8d144da5a86285fb4811..10464ea57bb9915f45248e99736565cb41329934 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -66,7 +66,8 @@ static cl::opt<unsigned>
                         cl::desc("Restrict range of Bcc instructions (DEBUG)"));
 
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
-    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
+                          AArch64::CATCHRET),
       RI(STI.getTargetTriple()), Subtarget(STI) {}
 
 /// GetInstSize - Return the number of bytes of code the specified
@@ -704,10 +705,10 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   // Secondly, check cases specific to sub-targets.
 
   if (Subtarget.hasExynosCheapAsMoveHandling()) {
-    if (isExynosResetFast(MI) || isExynosShiftExtFast(MI))
+    if (isExynosCheapAsMove(MI))
       return true;
-    else
-      return MI.isAsCheapAsAMove();
+
+    return MI.isAsCheapAsAMove();
   }
 
   // Finally, check generic cases.
@@ -758,211 +759,6 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   llvm_unreachable("Unknown opcode to check as cheap as a move!");
 }
 
-bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) {
-  unsigned Reg, Imm, Shift;
-
-  switch (MI.getOpcode()) {
-  default:
-    return false;
-
-  // MOV Rd, SP
-  case AArch64::ADDWri:
-  case AArch64::ADDXri:
-    if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm())
-      return false;
-
-    Reg = MI.getOperand(1).getReg();
-    Imm = MI.getOperand(2).getImm();
-    return ((Reg == AArch64::WSP || Reg == AArch64::SP) && Imm == 0);
-
-  // Literal
-  case AArch64::ADR:
-  case AArch64::ADRP:
-    return true;
-
-  // MOVI Vd, #0
-  case AArch64::MOVID:
-  case AArch64::MOVIv8b_ns:
-  case AArch64::MOVIv2d_ns:
-  case AArch64::MOVIv16b_ns:
-    Imm = MI.getOperand(1).getImm();
-    return (Imm == 0);
-
-  // MOVI Vd, #0
-  case AArch64::MOVIv2i32:
-  case AArch64::MOVIv4i16:
-  case AArch64::MOVIv4i32:
-  case AArch64::MOVIv8i16:
-    Imm = MI.getOperand(1).getImm();
-    Shift = MI.getOperand(2).getImm();
-    return (Imm == 0 && Shift == 0);
-
-  // MOV Rd, Imm
-  case AArch64::MOVNWi:
-  case AArch64::MOVNXi:
-
-  // MOV Rd, Imm
-  case AArch64::MOVZWi:
-  case AArch64::MOVZXi:
-    return true;
-
-  // MOV Rd, Imm
-  case AArch64::ORRWri:
-  case AArch64::ORRXri:
-    if (!MI.getOperand(1).isReg())
-      return false;
-
-    Reg = MI.getOperand(1).getReg();
-    Imm = MI.getOperand(2).getImm();
-    return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Imm == 0);
-
-  // MOV Rd, Rm
-  case AArch64::ORRWrs:
-  case AArch64::ORRXrs:
-    if (!MI.getOperand(1).isReg())
-      return false;
-
-    Reg = MI.getOperand(1).getReg();
-    Imm = MI.getOperand(3).getImm();
-    Shift = AArch64_AM::getShiftValue(Imm);
-    return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Shift == 0);
-  }
-}
-
-bool AArch64InstrInfo::isExynosLdStExtFast(const MachineInstr &MI) {
-  unsigned Imm;
-  AArch64_AM::ShiftExtendType Ext;
-
-  switch (MI.getOpcode()) {
-  default:
-    return false;
-
-  // WriteLD
-  case AArch64::PRFMroW:
-  case AArch64::PRFMroX:
-
-  // WriteLDIdx
-  case AArch64::LDRBBroW:
-  case AArch64::LDRBBroX:
-  case AArch64::LDRHHroW:
-  case AArch64::LDRHHroX:
-  case AArch64::LDRSBWroW:
-  case AArch64::LDRSBWroX:
-  case AArch64::LDRSBXroW:
-  case AArch64::LDRSBXroX:
-  case AArch64::LDRSHWroW:
-  case AArch64::LDRSHWroX:
-  case AArch64::LDRSHXroW:
-  case AArch64::LDRSHXroX:
-  case AArch64::LDRSWroW:
-  case AArch64::LDRSWroX:
-  case AArch64::LDRWroW:
-  case AArch64::LDRWroX:
-  case AArch64::LDRXroW:
-  case AArch64::LDRXroX:
-
-  case AArch64::LDRBroW:
-  case AArch64::LDRBroX:
-  case AArch64::LDRDroW:
-  case AArch64::LDRDroX:
-  case AArch64::LDRHroW:
-  case AArch64::LDRHroX:
-  case AArch64::LDRSroW:
-  case AArch64::LDRSroX:
-
-  // WriteSTIdx
-  case AArch64::STRBBroW:
-  case AArch64::STRBBroX:
-  case AArch64::STRHHroW:
-  case AArch64::STRHHroX:
-  case AArch64::STRWroW:
-  case AArch64::STRWroX:
-  case AArch64::STRXroW:
-  case AArch64::STRXroX:
-
-  case AArch64::STRBroW:
-  case AArch64::STRBroX:
-  case AArch64::STRDroW:
-  case AArch64::STRDroX:
-  case AArch64::STRHroW:
-  case AArch64::STRHroX:
-  case AArch64::STRSroW:
-  case AArch64::STRSroX:
-    Imm = MI.getOperand(3).getImm();
-    Ext = AArch64_AM::getMemExtendType(Imm);
-    return (Ext == AArch64_AM::SXTX || Ext == AArch64_AM::UXTX);
-  }
-}
-
-bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) {
-  unsigned Imm, Shift;
-  AArch64_AM::ShiftExtendType Ext;
-
-  switch (MI.getOpcode()) {
-  default:
-    return false;
-
-  // WriteI
-  case AArch64::ADDSWri:
-  case AArch64::ADDSXri:
-  case AArch64::ADDWri:
-  case AArch64::ADDXri:
-  case AArch64::SUBSWri:
-  case AArch64::SUBSXri:
-  case AArch64::SUBWri:
-  case AArch64::SUBXri:
-    return true;
-
-  // WriteISReg
-  case AArch64::ADDSWrs:
-  case AArch64::ADDSXrs:
-  case AArch64::ADDWrs:
-  case AArch64::ADDXrs:
-  case AArch64::ANDSWrs:
-  case AArch64::ANDSXrs:
-  case AArch64::ANDWrs:
-  case AArch64::ANDXrs:
-  case AArch64::BICSWrs:
-  case AArch64::BICSXrs:
-  case AArch64::BICWrs:
-  case AArch64::BICXrs:
-  case AArch64::EONWrs:
-  case AArch64::EONXrs:
-  case AArch64::EORWrs:
-  case AArch64::EORXrs:
-  case AArch64::ORNWrs:
-  case AArch64::ORNXrs:
-  case AArch64::ORRWrs:
-  case AArch64::ORRXrs:
-  case AArch64::SUBSWrs:
-  case AArch64::SUBSXrs:
-  case AArch64::SUBWrs:
-  case AArch64::SUBXrs:
-    Imm = MI.getOperand(3).getImm();
-    Shift = AArch64_AM::getShiftValue(Imm);
-    Ext = AArch64_AM::getShiftType(Imm);
-    return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::LSL));
-
-  // WriteIEReg
-  case AArch64::ADDSWrx:
-  case AArch64::ADDSXrx:
-  case AArch64::ADDSXrx64:
-  case AArch64::ADDWrx:
-  case AArch64::ADDXrx:
-  case AArch64::ADDXrx64:
-  case AArch64::SUBSWrx:
-  case AArch64::SUBSXrx:
-  case AArch64::SUBSXrx64:
-  case AArch64::SUBWrx:
-  case AArch64::SUBXrx:
-  case AArch64::SUBXrx64:
-    Imm = MI.getOperand(3).getImm();
-    Shift = AArch64_AM::getArithShiftValue(Imm);
-    Ext = AArch64_AM::getArithExtendType(Imm);
-    return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::UXTX));
-  }
-}
-
 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
@@ -1134,7 +930,7 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
     MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
   const TargetRegisterInfo *TRI = &getRegisterInfo();
-  unsigned BaseRegA = 0, BaseRegB = 0;
+  MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
   int64_t OffsetA = 0, OffsetB = 0;
   unsigned WidthA = 0, WidthB = 0;
 
@@ -1145,14 +941,14 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
     return false;
 
-  // Retrieve the base register, offset from the base register and width. Width
+  // Retrieve the base, offset from the base and width. Width
   // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).  If
-  // base registers are identical, and the offset of a lower memory access +
+  // base are identical, and the offset of a lower memory access +
   // the width doesn't overlap the offset of a higher memory access,
   // then the memory accesses are different.
-  if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
-      getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
-    if (BaseRegA == BaseRegB) {
+  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
+      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
+    if (BaseOpA->isIdenticalTo(*BaseOpB)) {
       int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
       int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
       int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
@@ -1168,6 +964,13 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                             const MachineFunction &MF) const {
   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
     return true;
+  switch (MI.getOpcode()) {
+  case AArch64::DSB:
+  case AArch64::ISB:
+    // DSB and ISB also are scheduling barriers.
+    return true;
+  default:;
+  }
   return isSEHInstruction(MI);
 }
 
@@ -1657,11 +1460,36 @@ bool AArch64InstrInfo::substituteCmpToZero(
 }
 
 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
-  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
+      MI.getOpcode() != AArch64::CATCHRET)
     return false;
 
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
+
+  if (MI.getOpcode() == AArch64::CATCHRET) {
+    // Skip to the first instruction before the epilog.
+    const TargetInstrInfo *TII =
+      MBB.getParent()->getSubtarget().getInstrInfo();
+    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
+    auto MBBI = MachineBasicBlock::iterator(MI);
+    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
+    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
+           FirstEpilogSEH != MBB.begin())
+      FirstEpilogSEH = std::prev(FirstEpilogSEH);
+    if (FirstEpilogSEH != MBB.begin())
+      FirstEpilogSEH = std::next(FirstEpilogSEH);
+    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
+        .addReg(AArch64::X0, RegState::Define)
+        .addMBB(TargetMBB);
+    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
+        .addReg(AArch64::X0, RegState::Define)
+        .addReg(AArch64::X0)
+        .addMBB(TargetMBB)
+        .addImm(0);
+    return true;
+  }
+
   unsigned Reg = MI.getOperand(0).getReg();
   const GlobalValue *GV =
       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
@@ -1714,71 +1542,6 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   return true;
 }
 
-/// Return true if this is this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    break;
-  case AArch64::ADDSWrs:
-  case AArch64::ADDSXrs:
-  case AArch64::ADDWrs:
-  case AArch64::ADDXrs:
-  case AArch64::ANDSWrs:
-  case AArch64::ANDSXrs:
-  case AArch64::ANDWrs:
-  case AArch64::ANDXrs:
-  case AArch64::BICSWrs:
-  case AArch64::BICSXrs:
-  case AArch64::BICWrs:
-  case AArch64::BICXrs:
-  case AArch64::EONWrs:
-  case AArch64::EONXrs:
-  case AArch64::EORWrs:
-  case AArch64::EORXrs:
-  case AArch64::ORNWrs:
-  case AArch64::ORNXrs:
-  case AArch64::ORRWrs:
-  case AArch64::ORRXrs:
-  case AArch64::SUBSWrs:
-  case AArch64::SUBSXrs:
-  case AArch64::SUBWrs:
-  case AArch64::SUBXrs:
-    if (MI.getOperand(3).isImm()) {
-      unsigned val = MI.getOperand(3).getImm();
-      return (val != 0);
-    }
-    break;
-  }
-  return false;
-}
-
-/// Return true if this is this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    break;
-  case AArch64::ADDSWrx:
-  case AArch64::ADDSXrx:
-  case AArch64::ADDSXrx64:
-  case AArch64::ADDWrx:
-  case AArch64::ADDXrx:
-  case AArch64::ADDXrx64:
-  case AArch64::SUBSWrx:
-  case AArch64::SUBSXrx:
-  case AArch64::SUBSXrx64:
-  case AArch64::SUBWrx:
-  case AArch64::SUBXrx:
-  case AArch64::SUBXrx64:
-    if (MI.getOperand(3).isImm()) {
-      unsigned val = MI.getOperand(3).getImm();
-      return (val != 0);
-    }
-    break;
-  }
-
-  return false;
-}
-
 // Return true if this instruction simply sets its single destination register
 // to zero. This is equivalent to a register rename of the zero-register.
 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
@@ -1901,67 +1664,6 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
   return 0;
 }
 
-/// Return true if this is load/store scales or extends its register offset.
-/// This refers to scaling a dynamic index as opposed to scaled immediates.
-/// MI should be a memory op that allows scaled addressing.
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    break;
-  case AArch64::LDRBBroW:
-  case AArch64::LDRBroW:
-  case AArch64::LDRDroW:
-  case AArch64::LDRHHroW:
-  case AArch64::LDRHroW:
-  case AArch64::LDRQroW:
-  case AArch64::LDRSBWroW:
-  case AArch64::LDRSBXroW:
-  case AArch64::LDRSHWroW:
-  case AArch64::LDRSHXroW:
-  case AArch64::LDRSWroW:
-  case AArch64::LDRSroW:
-  case AArch64::LDRWroW:
-  case AArch64::LDRXroW:
-  case AArch64::STRBBroW:
-  case AArch64::STRBroW:
-  case AArch64::STRDroW:
-  case AArch64::STRHHroW:
-  case AArch64::STRHroW:
-  case AArch64::STRQroW:
-  case AArch64::STRSroW:
-  case AArch64::STRWroW:
-  case AArch64::STRXroW:
-  case AArch64::LDRBBroX:
-  case AArch64::LDRBroX:
-  case AArch64::LDRDroX:
-  case AArch64::LDRHHroX:
-  case AArch64::LDRHroX:
-  case AArch64::LDRQroX:
-  case AArch64::LDRSBWroX:
-  case AArch64::LDRSBXroX:
-  case AArch64::LDRSHWroX:
-  case AArch64::LDRSHXroX:
-  case AArch64::LDRSWroX:
-  case AArch64::LDRSroX:
-  case AArch64::LDRWroX:
-  case AArch64::LDRXroX:
-  case AArch64::STRBBroX:
-  case AArch64::STRBroX:
-  case AArch64::STRDroX:
-  case AArch64::STRHHroX:
-  case AArch64::STRHroX:
-  case AArch64::STRQroX:
-  case AArch64::STRSroX:
-  case AArch64::STRWroX:
-  case AArch64::STRXroX:
-
-    unsigned Val = MI.getOperand(3).getImm();
-    AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val);
-    return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val);
-  }
-  return false;
-}
-
 /// Check all MachineMemOperands for a hint to suppress pairing.
 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
@@ -2135,17 +1837,21 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
   if (MI.hasOrderedMemoryRef())
     return false;
 
-  // Make sure this is a reg+imm (as opposed to an address reloc).
-  assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
+  // Make sure this is a reg/fi+imm (as opposed to an address reloc).
+  assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
+         "Expected a reg or frame index operand.");
   if (!MI.getOperand(2).isImm())
     return false;
 
   // Can't merge/pair if the instruction modifies the base register.
   // e.g., ldr x0, [x0]
-  unsigned BaseReg = MI.getOperand(1).getReg();
-  const TargetRegisterInfo *TRI = &getRegisterInfo();
-  if (MI.modifiesRegister(BaseReg, TRI))
-    return false;
+  // This case will never occur with an FI base.
+  if (MI.getOperand(1).isReg()) {
+    unsigned BaseReg = MI.getOperand(1).getReg();
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    if (MI.modifiesRegister(BaseReg, TRI))
+      return false;
+  }
 
   // Check if this load/store has a hint to avoid pair formation.
   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
@@ -2168,25 +1874,28 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
   return true;
 }
 
-bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
-    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
-    const TargetRegisterInfo *TRI) const {
+bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
+                                          MachineOperand *&BaseOp,
+                                          int64_t &Offset,
+                                          const TargetRegisterInfo *TRI) const {
   unsigned Width;
-  return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
+  return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
 }
 
-bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
-    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
-    const TargetRegisterInfo *TRI) const {
+bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
+    MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
+    unsigned &Width, const TargetRegisterInfo *TRI) const {
   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
   // Handle only loads/stores with base register followed by immediate offset.
   if (LdSt.getNumExplicitOperands() == 3) {
     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
-    if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm())
+    if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
+        !LdSt.getOperand(2).isImm())
       return false;
   } else if (LdSt.getNumExplicitOperands() == 4) {
     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
-    if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() ||
+    if (!LdSt.getOperand(1).isReg() ||
+        (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
         !LdSt.getOperand(3).isImm())
       return false;
   } else
@@ -2205,13 +1914,18 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   // multiplied by the scaling factor. Unscaled instructions have scaling factor
   // set to 1.
   if (LdSt.getNumExplicitOperands() == 3) {
-    BaseReg = LdSt.getOperand(1).getReg();
+    BaseOp = &LdSt.getOperand(1);
     Offset = LdSt.getOperand(2).getImm() * Scale;
   } else {
     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
-    BaseReg = LdSt.getOperand(2).getReg();
+    BaseOp = &LdSt.getOperand(2);
     Offset = LdSt.getOperand(3).getImm() * Scale;
   }
+
+  assert((BaseOp->isReg() || BaseOp->isFI()) &&
+         "getMemOperandWithOffset only supports base "
+         "operands of type register or frame index.");
+
   return true;
 }
 
@@ -2366,31 +2080,33 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
   return true;
 }
 
-// Scale the unscaled offsets.  Returns false if the unscaled offset can't be
-// scaled.
-static bool scaleOffset(unsigned Opc, int64_t &Offset) {
-  unsigned OffsetStride = 1;
+static unsigned getOffsetStride(unsigned Opc) {
   switch (Opc) {
   default:
-    return false;
+    return 0;
   case AArch64::LDURQi:
   case AArch64::STURQi:
-    OffsetStride = 16;
-    break;
+    return 16;
   case AArch64::LDURXi:
   case AArch64::LDURDi:
   case AArch64::STURXi:
   case AArch64::STURDi:
-    OffsetStride = 8;
-    break;
+    return 8;
   case AArch64::LDURWi:
   case AArch64::LDURSi:
   case AArch64::LDURSWi:
   case AArch64::STURWi:
   case AArch64::STURSi:
-    OffsetStride = 4;
-    break;
+    return 4;
   }
+}
+
+// Scale the unscaled offsets.  Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+  unsigned OffsetStride = getOffsetStride(Opc);
+  if (OffsetStride == 0)
+    return false;
   // If the byte-offset isn't a multiple of the stride, we can't scale this
   // offset.
   if (Offset % OffsetStride != 0)
@@ -2402,6 +2118,19 @@ static bool scaleOffset(unsigned Opc, int64_t &Offset) {
   return true;
 }
 
+// Unscale the scaled offsets. Returns false if the scaled offset can't be
+// unscaled.
+static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
+  unsigned OffsetStride = getOffsetStride(Opc);
+  if (OffsetStride == 0)
+    return false;
+
+  // Convert the "element" offset used by scaled pair load/store instructions
+  // into the byte-offset used by unscaled.
+  Offset *= OffsetStride;
+  return true;
+}
+
 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
   if (FirstOpc == SecondOpc)
     return true;
@@ -2420,15 +2149,46 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
   return false;
 }
 
+static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
+                            int64_t Offset1, unsigned Opcode1, int FI2,
+                            int64_t Offset2, unsigned Opcode2) {
+  // Accesses through fixed stack object frame indices may access a different
+  // fixed stack slot. Check that the object offsets + offsets match.
+  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
+    int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
+    int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
+    assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
+    // Get the byte-offset from the object offset.
+    if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
+      return false;
+    ObjectOffset1 += Offset1;
+    ObjectOffset2 += Offset2;
+    // Get the "element" index in the object.
+    if (!scaleOffset(Opcode1, ObjectOffset1) ||
+        !scaleOffset(Opcode2, ObjectOffset2))
+      return false;
+    return ObjectOffset1 + 1 == ObjectOffset2;
+  }
+
+  return FI1 == FI2;
+}
+
 /// Detect opportunities for ldp/stp formation.
 ///
-/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
-bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
-                                           unsigned BaseReg1,
-                                           MachineInstr &SecondLdSt,
-                                           unsigned BaseReg2,
+/// Only called for LdSt for which getMemOperandWithOffset returns true.
+bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
+                                           MachineOperand &BaseOp2,
                                            unsigned NumLoads) const {
-  if (BaseReg1 != BaseReg2)
+  MachineInstr &FirstLdSt = *BaseOp1.getParent();
+  MachineInstr &SecondLdSt = *BaseOp2.getParent();
+  if (BaseOp1.getType() != BaseOp2.getType())
+    return false;
+
+  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
+         "Only base registers and frame indices are supported.");
+
+  // Check for both base regs and base FI.
+  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
     return false;
 
   // Only cluster up to a single pair.
@@ -2464,7 +2224,20 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
     return false;
 
   // The caller should already have ordered First/SecondLdSt by offset.
-  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+  // Note: except for non-equal frame index bases
+  if (BaseOp1.isFI()) {
+    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
+           "Caller should have ordered offsets.");
+
+    const MachineFrameInfo &MFI =
+        FirstLdSt.getParent()->getParent()->getFrameInfo();
+    return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
+                           BaseOp2.getIndex(), Offset2, SecondOpc);
+  }
+
+  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
+         "Caller should have ordered offsets.");
+
   return Offset1 + 1 == Offset2;
 }
 
@@ -5075,11 +4848,13 @@ enum MachineOutlinerClass {
 
 enum MachineOutlinerMBBFlags {
   LRUnavailableSomewhere = 0x2,
-  HasCalls = 0x4
+  HasCalls = 0x4,
+  UnsafeRegsDead = 0x8
 };
 
 unsigned
 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
+  assert(C.LRUWasSet && "LRU wasn't set?");
   MachineFunction *MF = C.getMF();
   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
       MF->getSubtarget().getRegisterInfo());
@@ -5102,17 +4877,22 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
 outliner::OutlinedFunction
 AArch64InstrInfo::getOutliningCandidateInfo(
     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
-  unsigned SequenceSize = std::accumulate(
-      RepeatedSequenceLocs[0].front(),
-      std::next(RepeatedSequenceLocs[0].back()),
-      0, [this](unsigned Sum, const MachineInstr &MI) {
-        return Sum + getInstSizeInBytes(MI);
-      });
-
-  // Compute liveness information for each candidate.
+  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
+  unsigned SequenceSize =
+      std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
+                      [this](unsigned Sum, const MachineInstr &MI) {
+                        return Sum + getInstSizeInBytes(MI);
+                      });
+
+  // Properties about candidate MBBs that hold for all of them.
+  unsigned FlagsSetInAll = 0xF;
+
+  // Compute liveness information for each candidate, and set FlagsSetInAll.
   const TargetRegisterInfo &TRI = getRegisterInfo();
   std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
-                [&TRI](outliner::Candidate &C) { C.initLRU(TRI); });
+                [&FlagsSetInAll](outliner::Candidate &C) {
+                  FlagsSetInAll &= C.Flags;
+                });
 
   // According to the AArch64 Procedure Call Standard, the following are
   // undefined on entry/exit from a function call:
@@ -5125,23 +4905,31 @@ AArch64InstrInfo::getOutliningCandidateInfo(
   // of these registers is live into/across it. Thus, we need to delete
   // those
   // candidates.
-  auto CantGuaranteeValueAcrossCall = [](outliner::Candidate &C) {
+  auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
+    // If the unsafe registers in this block are all dead, then we don't need
+    // to compute liveness here.
+    if (C.Flags & UnsafeRegsDead)
+      return false;
+    C.initLRU(TRI);
     LiveRegUnits LRU = C.LRU;
     return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
             !LRU.available(AArch64::NZCV));
   };
 
-  // Erase every candidate that violates the restrictions above. (It could be
-  // true that we have viable candidates, so it's not worth bailing out in
-  // the case that, say, 1 out of 20 candidates violate the restructions.)
-  RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
-                                            RepeatedSequenceLocs.end(),
-                                            CantGuaranteeValueAcrossCall),
-                             RepeatedSequenceLocs.end());
+  // Are there any candidates where those registers are live?
+  if (!(FlagsSetInAll & UnsafeRegsDead)) {
+    // Erase every candidate that violates the restrictions above. (It could be
+    // true that we have viable candidates, so it's not worth bailing out in
+    // the case that, say, 1 out of 20 candidates violate the restructions.)
+    RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
+                                              RepeatedSequenceLocs.end(),
+                                              CantGuaranteeValueAcrossCall),
+                               RepeatedSequenceLocs.end());
 
-  // If the sequence is empty, we're done.
-  if (RepeatedSequenceLocs.empty())
-    return outliner::OutlinedFunction();
+    // If the sequence doesn't have enough candidates left, then we're done.
+    if (RepeatedSequenceLocs.size() < 2)
+      return outliner::OutlinedFunction();
+  }
 
   // At this point, we have only "safe" candidates to outline. Figure out
   // frame + call instruction information.
@@ -5162,6 +4950,60 @@ AArch64InstrInfo::getOutliningCandidateInfo(
     return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
   });
 
+  // Returns true if an instructions is safe to fix up, false otherwise.
+  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
+    if (MI.isCall())
+      return true;
+
+    if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
+        !MI.readsRegister(AArch64::SP, &TRI))
+      return true;
+
+    // Any modification of SP will break our code to save/restore LR.
+    // FIXME: We could handle some instructions which add a constant
+    // offset to SP, with a bit more work.
+    if (MI.modifiesRegister(AArch64::SP, &TRI))
+      return false;
+
+    // At this point, we have a stack instruction that we might need to
+    // fix up. We'll handle it if it's a load or store.
+    if (MI.mayLoadOrStore()) {
+      MachineOperand *Base; // Filled with the base operand of MI.
+      int64_t Offset;       // Filled with the offset of MI.
+
+      // Does it allow us to offset the base operand and is the base the
+      // register SP?
+      if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
+          Base->getReg() != AArch64::SP)
+        return false;
+
+      // Find the minimum/maximum offset for this instruction and check
+      // if fixing it up would be in range.
+      int64_t MinOffset,
+          MaxOffset;  // Unscaled offsets for the instruction.
+      unsigned Scale; // The scale to multiply the offsets by.
+      unsigned DummyWidth;
+      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
+
+      Offset += 16; // Update the offset to what it would be if we outlined.
+      if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
+        return false;
+
+      // It's in range, so we can outline it.
+      return true;
+    }
+
+    // FIXME: Add handling for instructions like "add x0, sp, #8".
+
+    // We can't fix it up, so don't outline it.
+    return false;
+  };
+
+  // True if it's possible to fix up each stack instruction in this sequence.
+  // Important for frames/call variants that modify the stack.
+  bool AllStackInstrsSafe = std::all_of(
+      FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
+
   // If the last instruction in any candidate is a terminator, then we should
   // tail call all of the candidates.
   if (RepeatedSequenceLocs[0].back()->isTerminator()) {
@@ -5178,57 +5020,94 @@ AArch64InstrInfo::getOutliningCandidateInfo(
     SetCandidateCallInfo(MachineOutlinerThunk, 4);
   }
 
-  // Make sure that LR isn't live on entry to this candidate. The only
-  // instructions that use LR that could possibly appear in a repeated sequence
-  // are calls. Therefore, we only have to check and see if LR is dead on entry
-  // to (or exit from) some candidate.
-  else if (std::all_of(RepeatedSequenceLocs.begin(),
-                       RepeatedSequenceLocs.end(),
-                       [](outliner::Candidate &C) {
-                         return C.LRU.available(AArch64::LR);
-                         })) {
-    FrameID = MachineOutlinerNoLRSave;
-    NumBytesToCreateFrame = 4;
-    SetCandidateCallInfo(MachineOutlinerNoLRSave, 4);
-  }
-
-  // LR is live, so we need to save it. Decide whether it should be saved to
-  // the stack, or if it can be saved to a register.
   else {
-    if (all_of(RepeatedSequenceLocs, [this](outliner::Candidate &C) {
-          return findRegisterToSaveLRTo(C);
-        })) {
-      // Every candidate has an available callee-saved register for the save.
-      // We can save LR to a register.
-      FrameID = MachineOutlinerRegSave;
-      NumBytesToCreateFrame = 4;
-      SetCandidateCallInfo(MachineOutlinerRegSave, 12);
+    // We need to decide how to emit calls + frames. We can always emit the same
+    // frame if we don't need to save to the stack. If we have to save to the
+    // stack, then we need a different frame.
+    unsigned NumBytesNoStackCalls = 0;
+    std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
+
+    for (outliner::Candidate &C : RepeatedSequenceLocs) {
+      C.initLRU(TRI);
+
+      // Is LR available? If so, we don't need a save.
+      if (C.LRU.available(AArch64::LR)) {
+        NumBytesNoStackCalls += 4;
+        C.setCallInfo(MachineOutlinerNoLRSave, 4);
+        CandidatesWithoutStackFixups.push_back(C);
+      }
+
+      // Is an unused register available? If so, we won't modify the stack, so
+      // we can outline with the same frame type as those that don't save LR.
+      else if (findRegisterToSaveLRTo(C)) {
+        NumBytesNoStackCalls += 12;
+        C.setCallInfo(MachineOutlinerRegSave, 12);
+        CandidatesWithoutStackFixups.push_back(C);
+      }
+
+      // Is SP used in the sequence at all? If not, we don't have to modify
+      // the stack, so we are guaranteed to get the same frame.
+      else if (C.UsedInSequence.available(AArch64::SP)) {
+        NumBytesNoStackCalls += 12;
+        C.setCallInfo(MachineOutlinerDefault, 12);
+        CandidatesWithoutStackFixups.push_back(C);
+      }
+
+      // If we outline this, we need to modify the stack. Pretend we don't
+      // outline this by saving all of its bytes.
+      else {
+        NumBytesNoStackCalls += SequenceSize;
+      }
     }
 
-    else {
-      // At least one candidate does not have an available callee-saved
-      // register. We must save LR to the stack.
-      FrameID = MachineOutlinerDefault;
-      NumBytesToCreateFrame = 4;
+    // If there are no places where we have to save LR, then note that we
+    // don't have to update the stack. Otherwise, give every candidate the
+    // default call type, as long as it's safe to do so.
+    if (!AllStackInstrsSafe ||
+        NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
+      RepeatedSequenceLocs = CandidatesWithoutStackFixups;
+      FrameID = MachineOutlinerNoLRSave;
+    } else {
       SetCandidateCallInfo(MachineOutlinerDefault, 12);
     }
+
+    // If we dropped all of the candidates, bail out here.
+    if (RepeatedSequenceLocs.size() < 2) {
+      RepeatedSequenceLocs.clear();
+      return outliner::OutlinedFunction();
+    }
   }
 
-  // Check if the range contains a call. These require a save + restore of the
-  // link register.
-  if (std::any_of(RepeatedSequenceLocs[0].front(),
-                  RepeatedSequenceLocs[0].back(),
-                  [](const MachineInstr &MI) { return MI.isCall(); }))
-    NumBytesToCreateFrame += 8; // Save + restore the link register.
+  // Does every candidate's MBB contain a call? If so, then we might have a call
+  // in the range.
+  if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
+    // Check if the range contains a call. These require a save + restore of the
+    // link register.
+    bool ModStackToSaveLR = false;
+    if (std::any_of(FirstCand.front(), FirstCand.back(),
+                    [](const MachineInstr &MI) { return MI.isCall(); }))
+      ModStackToSaveLR = true;
+
+    // Handle the last instruction separately. If this is a tail call, then the
+    // last instruction is a call. We don't want to save + restore in this case.
+    // However, it could be possible that the last instruction is a call without
+    // it being valid to tail call this sequence. We should consider this as
+    // well.
+    else if (FrameID != MachineOutlinerThunk &&
+             FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
+      ModStackToSaveLR = true;
+
+    if (ModStackToSaveLR) {
+      // We can't fix up the stack. Bail out.
+      if (!AllStackInstrsSafe) {
+        RepeatedSequenceLocs.clear();
+        return outliner::OutlinedFunction();
+      }
 
-  // Handle the last instruction separately. If this is a tail call, then the
-  // last instruction is a call. We don't want to save + restore in this case.
-  // However, it could be possible that the last instruction is a call without
-  // it being valid to tail call this sequence. We should consider this as well.
-  else if (FrameID != MachineOutlinerThunk &&
-           FrameID != MachineOutlinerTailCall &&
-           RepeatedSequenceLocs[0].back()->isCall())
-    NumBytesToCreateFrame += 8;
+      // Save + restore LR.
+      NumBytesToCreateFrame += 8;
+    }
+  }
 
   return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
                                     NumBytesToCreateFrame, FrameID);
@@ -5260,29 +5139,70 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
   return true;
 }
 
-unsigned
-AArch64InstrInfo::getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const {
-  unsigned Flags = 0x0;
-  // Check if there's a call inside this MachineBasicBlock. If there is, then
-  // set a flag.
-  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
-    Flags |= MachineOutlinerMBBFlags::HasCalls;
-
+bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
+                                              unsigned &Flags) const {
   // Check if LR is available through all of the MBB. If it's not, then set
   // a flag.
   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
          "Suitable Machine Function for outlining must track liveness");
   LiveRegUnits LRU(getRegisterInfo());
-  LRU.addLiveOuts(MBB);
 
-  std::for_each(MBB.rbegin(),
-                MBB.rend(),
+  std::for_each(MBB.rbegin(), MBB.rend(),
                 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
 
-  if (!LRU.available(AArch64::LR))
-      Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
+  // Check if each of the unsafe registers are available...
+  bool W16AvailableInBlock = LRU.available(AArch64::W16);
+  bool W17AvailableInBlock = LRU.available(AArch64::W17);
+  bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
+
+  // If all of these are dead (and not live out), we know we don't have to check
+  // them later.
+  if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
+    Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
+
+  // Now, add the live outs to the set.
+  LRU.addLiveOuts(MBB);
+
+  // If any of these registers is available in the MBB, but also a live out of
+  // the block, then we know outlining is unsafe.
+  if (W16AvailableInBlock && !LRU.available(AArch64::W16))
+    return false;
+  if (W17AvailableInBlock && !LRU.available(AArch64::W17))
+    return false;
+  if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
+    return false;
+
+  // Check if there's a call inside this MachineBasicBlock. If there is, then
+  // set a flag.
+  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
+    Flags |= MachineOutlinerMBBFlags::HasCalls;
+
+  MachineFunction *MF = MBB.getParent();
+
+  // In the event that we outline, we may have to save LR. If there is an
+  // available register in the MBB, then we'll always save LR there. Check if
+  // this is true.
+  bool CanSaveLR = false;
+  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
+
+  // Check if there is an available register across the sequence that we can
+  // use.
+  for (unsigned Reg : AArch64::GPR64RegClass) {
+    if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
+        Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
+      CanSaveLR = true;
+      break;
+    }
+  }
 
-  return Flags;
+  // Check if we have a register we can save LR to, and if LR was used
+  // somewhere. If both of those things are true, then we need to evaluate the
+  // safety of outlining stack instructions later.
+  if (!CanSaveLR && !LRU.available(AArch64::LR))
+    Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
+
+  return true;
 }
 
 outliner::InstrType
@@ -5405,108 +5325,19 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
     return outliner::InstrType::Illegal;
 
-  // Does this use the stack?
-  if (MI.modifiesRegister(AArch64::SP, &RI) ||
-      MI.readsRegister(AArch64::SP, &RI)) {
-    // True if there is no chance that any outlined candidate from this range
-    // could require stack fixups. That is, both
-    // * LR is available in the range (No save/restore around call)
-    // * The range doesn't include calls (No save/restore in outlined frame)
-    // are true.
-    // FIXME: This is very restrictive; the flags check the whole block,
-    // not just the bit we will try to outline.
-    bool MightNeedStackFixUp =
-        (Flags & (MachineOutlinerMBBFlags::LRUnavailableSomewhere |
-                  MachineOutlinerMBBFlags::HasCalls));
-
-    // If this instruction is in a range where it *never* needs to be fixed
-    // up, then we can *always* outline it. This is true even if it's not
-    // possible to fix that instruction up.
-    //
-    // Why? Consider two equivalent instructions I1, I2 where both I1 and I2
-    // use SP. Suppose that I1 sits within a range that definitely doesn't
-    // need stack fixups, while I2 sits in a range that does.
-    //
-    // First, I1 can be outlined as long as we *never* fix up the stack in
-    // any sequence containing it. I1 is already a safe instruction in the
-    // original program, so as long as we don't modify it we're good to go.
-    // So this leaves us with showing that outlining I2 won't break our
-    // program.
-    //
-    // Suppose I1 and I2 belong to equivalent candidate sequences. When we
-    // look at I2, we need to see if it can be fixed up. Suppose I2, (and
-    // thus I1) cannot be fixed up. Then I2 will be assigned an unique
-    // integer label; thus, I2 cannot belong to any candidate sequence (a
-    // contradiction). Suppose I2 can be fixed up. Then I1 can be fixed up
-    // as well, so we're good. Thus, I1 is always safe to outline.
-    //
-    // This gives us two things: first off, it buys us some more instructions
-    // for our search space by deeming stack instructions illegal only when
-    // they can't be fixed up AND we might have to fix them up. Second off,
-    // This allows us to catch tricky instructions like, say,
-    // %xi = ADDXri %sp, n, 0. We can't safely outline these since they might
-    // be paired with later SUBXris, which might *not* end up being outlined.
-    // If we mess with the stack to save something, then an ADDXri messes with
-    // it *after*, then we aren't going to restore the right something from
-    // the stack if we don't outline the corresponding SUBXri first. ADDXris and
-    // SUBXris are extremely common in prologue/epilogue code, so supporting
-    // them in the outliner can be a pretty big win!
-    if (!MightNeedStackFixUp)
-      return outliner::InstrType::Legal;
-
-    // Any modification of SP will break our code to save/restore LR.
-    // FIXME: We could handle some instructions which add a constant offset to
-    // SP, with a bit more work.
-    if (MI.modifiesRegister(AArch64::SP, &RI))
-      return outliner::InstrType::Illegal;
-
-    // At this point, we have a stack instruction that we might need to fix
-    // up. We'll handle it if it's a load or store.
-    if (MI.mayLoadOrStore()) {
-      unsigned Base;  // Filled with the base regiser of MI.
-      int64_t Offset; // Filled with the offset of MI.
-      unsigned DummyWidth;
-
-      // Does it allow us to offset the base register and is the base SP?
-      if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) ||
-          Base != AArch64::SP)
-        return outliner::InstrType::Illegal;
-
-      // Find the minimum/maximum offset for this instruction and check if
-      // fixing it up would be in range.
-      int64_t MinOffset, MaxOffset; // Unscaled offsets for the instruction.
-      unsigned Scale;               // The scale to multiply the offsets by.
-      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
-
-      // TODO: We should really test what happens if an instruction overflows.
-      // This is tricky to test with IR tests, but when the outliner is moved
-      // to a MIR test, it really ought to be checked.
-      Offset += 16; // Update the offset to what it would be if we outlined.
-      if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
-        return outliner::InstrType::Illegal;
-
-      // It's in range, so we can outline it.
-      return outliner::InstrType::Legal;
-    }
-
-    // FIXME: Add handling for instructions like "add x0, sp, #8".
-
-    // We can't fix it up, so don't outline it.
-    return outliner::InstrType::Illegal;
-  }
-
   return outliner::InstrType::Legal;
 }
 
 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
   for (MachineInstr &MI : MBB) {
-    unsigned Base, Width;
+    MachineOperand *Base;
+    unsigned Width;
     int64_t Offset;
 
     // Is this a load or store with an immediate offset with SP as the base?
     if (!MI.mayLoadOrStore() ||
-        !getMemOpBaseRegImmOfsWidth(MI, Base, Offset, Width, &RI) ||
-        Base != AArch64::SP)
+        !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
+        (Base->isReg() && Base->getReg() != AArch64::SP))
       continue;
 
     // It is, so we have to fix it up.
@@ -5699,3 +5530,6 @@ bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
   MachineFunction &MF) const {
   return MF.getFunction().optForMinSize();
 }
+
+#define GET_INSTRINFO_HELPERS
+#include "AArch64GenInstrInfo.inc"
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 43011dd4c3e57de43d692ee6fa164bd34ff337ea..9954669d567508d6ce83cb9d17d00dee077c1a91 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -62,14 +62,6 @@ public:
   unsigned isStoreToStackSlot(const MachineInstr &MI,
                               int &FrameIndex) const override;
 
-  /// Returns true if there is a shiftable register and that the shift value
-  /// is non-zero.
-  static bool hasShiftedReg(const MachineInstr &MI);
-
-  /// Returns true if there is an extendable register and that the extending
-  /// value is non-zero.
-  static bool hasExtendedReg(const MachineInstr &MI);
-
   /// Does this instruction set its full destination register to zero?
   static bool isGPRZero(const MachineInstr &MI);
 
@@ -79,11 +71,6 @@ public:
   /// Does this instruction rename an FPR without modifying bits?
   static bool isFPRCopy(const MachineInstr &MI);
 
-  /// Return true if this is load/store scales or extends its register offset.
-  /// This refers to scaling a dynamic index as opposed to scaled immediates.
-  /// MI should be a memory op that allows scaled addressing.
-  static bool isScaledAddr(const MachineInstr &MI);
-
   /// Return true if pairing the given load or store is hinted to be
   /// unprofitable.
   static bool isLdStPairSuppressed(const MachineInstr &MI);
@@ -110,13 +97,13 @@ public:
   /// Hint that pairing the given load or store is unprofitable.
   static void suppressLdStPair(MachineInstr &MI);
 
-  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
-                             int64_t &Offset,
-                             const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandWithOffset(MachineInstr &MI, MachineOperand *&BaseOp,
+                               int64_t &Offset,
+                               const TargetRegisterInfo *TRI) const override;
 
-  bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
-                                  int64_t &Offset, unsigned &Width,
-                                  const TargetRegisterInfo *TRI) const;
+  bool getMemOperandWithOffsetWidth(MachineInstr &MI, MachineOperand *&BaseOp,
+                                    int64_t &Offset, unsigned &Width,
+                                    const TargetRegisterInfo *TRI) const;
 
   /// Return the immediate offset of the base register in a load/store \p LdSt.
   MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const;
@@ -128,8 +115,7 @@ public:
   bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
                     int64_t &MinOffset, int64_t &MaxOffset) const;
 
-  bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
-                           MachineInstr &SecondLdSt, unsigned BaseReg2,
+  bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2,
                            unsigned NumLoads) const override;
 
   void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -246,7 +232,8 @@ public:
       std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
   outliner::InstrType
   getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override;
-  unsigned getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const override;
+  bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
+                              unsigned &Flags) const override;
   void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
                           const outliner::OutlinedFunction &OF) const override;
   MachineBasicBlock::iterator
@@ -254,15 +241,6 @@ public:
                      MachineBasicBlock::iterator &It, MachineFunction &MF,
                      const outliner::Candidate &C) const override;
   bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
-  /// Returns true if the instruction sets a constant value that can be
-  /// executed more efficiently.
-  static bool isExynosResetFast(const MachineInstr &MI);
-  /// Returns true if the load or store has an extension that can be executed
-  /// more efficiently.
-  static bool isExynosLdStExtFast(const MachineInstr &MI);
-  /// Returns true if the instruction has a constant shift left or extension
-  /// that can be executed more efficiently.
-  static bool isExynosShiftExtFast(const MachineInstr &MI);
   /// Returns true if the instruction has a shift by immediate that can be
   /// executed in one cycle less.
   static bool isFalkorShiftExtFast(const MachineInstr &MI);
@@ -270,6 +248,9 @@ public:
   /// on Windows.
   static bool isSEHInstruction(const MachineInstr &MI);
 
+#define GET_INSTRINFO_HELPER_DECLS
+#include "AArch64GenInstrInfo.inc"
+
 private:
   /// Sets the offsets on outlined instructions in \p MBB which use SP
   /// so that they will be valid post-outlining.
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 37d3967df44efcaa6affde19fe754d684f307775..ee43b55aed93508ae319f977e53c1195b8f96303 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -24,6 +24,54 @@ def HasV8_4a         : Predicate<"Subtarget->hasV8_4aOps()">,
                                  AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
 def HasV8_5a         : Predicate<"Subtarget->hasV8_5aOps()">,
                                  AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
+def HasVH            : Predicate<"Subtarget->hasVH()">,
+                       AssemblerPredicate<"FeatureVH", "vh">;
+
+def HasLOR           : Predicate<"Subtarget->hasLOR()">,
+                       AssemblerPredicate<"FeatureLOR", "lor">;
+
+def HasPA            : Predicate<"Subtarget->hasPA()">,
+                       AssemblerPredicate<"FeaturePA", "pa">;
+
+def HasJS            : Predicate<"Subtarget->hasJS()">,
+                       AssemblerPredicate<"FeatureJS", "jsconv">;
+
+def HasCCIDX         : Predicate<"Subtarget->hasCCIDX()">,
+                       AssemblerPredicate<"FeatureCCIDX", "ccidx">;
+
+def HasComplxNum      : Predicate<"Subtarget->hasComplxNum()">,
+                       AssemblerPredicate<"FeatureComplxNum", "complxnum">;
+
+def HasNV            : Predicate<"Subtarget->hasNV()">,
+                       AssemblerPredicate<"FeatureNV", "nv">;
+
+def HasRASv8_4       : Predicate<"Subtarget->hasRASv8_4()">,
+                       AssemblerPredicate<"FeatureRASv8_4", "rasv8_4">;
+
+def HasMPAM          : Predicate<"Subtarget->hasMPAM()">,
+                       AssemblerPredicate<"FeatureMPAM", "mpam">;
+
+def HasDIT           : Predicate<"Subtarget->hasDIT()">,
+                       AssemblerPredicate<"FeatureDIT", "dit">;
+
+def HasTRACEV8_4         : Predicate<"Subtarget->hasTRACEV8_4()">,
+                       AssemblerPredicate<"FeatureTRACEV8_4", "tracev8.4">;
+
+def HasAM            : Predicate<"Subtarget->hasAM()">,
+                       AssemblerPredicate<"FeatureAM", "am">;
+
+def HasSEL2          : Predicate<"Subtarget->hasSEL2()">,
+                       AssemblerPredicate<"FeatureSEL2", "sel2">;
+
+def HasTLB_RMI          : Predicate<"Subtarget->hasTLB_RMI()">,
+                       AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;
+
+def HasFMI           : Predicate<"Subtarget->hasFMI()">,
+                       AssemblerPredicate<"FeatureFMI", "fmi">;
+
+def HasRCPC_IMMO      : Predicate<"Subtarget->hasRCPCImm()">,
+                       AssemblerPredicate<"FeatureRCPC_IMMO", "rcpc-immo">;
+
 def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
                                AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
 def HasNEON          : Predicate<"Subtarget->hasNEON()">,
@@ -510,7 +558,7 @@ def ISB   : CRmSystemI<barrier_op, 0b110, "isb",
 def TSB   : CRmSystemI<barrier_op, 0b010, "tsb", []> {
   let CRm        = 0b0010;
   let Inst{12}   = 0;
-  let Predicates = [HasV8_4a];
+  let Predicates = [HasTRACEV8_4];
 }
 }
 
@@ -602,7 +650,7 @@ let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
 }
 
 // These pointer authentication isntructions require armv8.3a
-let Predicates = [HasV8_3a] in {
+let Predicates = [HasPA] in {
   multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> {
     def IA   : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>;
     def IB   : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>;
@@ -642,17 +690,17 @@ let Predicates = [HasV8_3a] in {
   defm LDRAA  : AuthLoad<0, "ldraa", simm10Scaled>;
   defm LDRAB  : AuthLoad<1, "ldrab", simm10Scaled>;
 
-  // v8.3a floating point conversion for javascript
-  let Predicates = [HasV8_3a, HasFPARMv8] in
-  def FJCVTZS  : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
-                                        "fjcvtzs", []> {
-    let Inst{31} = 0;
-  }
+}
 
-} // HasV8_3a
+// v8.3a floating point conversion for javascript
+let Predicates = [HasJS, HasFPARMv8] in
+def FJCVTZS  : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
+                                      "fjcvtzs", []> {
+  let Inst{31} = 0;
+} // HasJS, HasFPARMv8
 
 // v8.4 Flag manipulation instructions
-let Predicates = [HasV8_4a] in {
+let Predicates = [HasFMI] in {
 def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> {
   let Inst{20-5} = 0b0000001000000000;
 }
@@ -660,7 +708,7 @@ def SETF8  : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">;
 def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">;
 def RMIF   : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif",
                         "{\t$Rn, $imm, $mask}">;
-} // HasV8_4a
+} // HasFMI
 
 // v8.5 flag manipulation instructions
 let Predicates = [HasAltNZCV], Uses = [NZCV], Defs = [NZCV] in {
@@ -2629,8 +2677,9 @@ defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32z, "sturb",
                          [(truncstorei8 GPR32z:$Rt,
                                   (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
 
-// Armv8.4 LDAPR & STLR with Immediate Offset instruction
-let Predicates = [HasV8_4a] in {
+// Armv8.4 Weaker Release Consistency enhancements
+//         LDAPR & STLR with Immediate Offset instructions
+let Predicates = [HasRCPC_IMMO] in {
 defm STLURB     : BaseStoreUnscaleV84<"stlurb",  0b00, 0b00, GPR32>;
 defm STLURH     : BaseStoreUnscaleV84<"stlurh",  0b01, 0b00, GPR32>;
 defm STLURW     : BaseStoreUnscaleV84<"stlur",   0b10, 0b00, GPR32>;
@@ -2915,7 +2964,7 @@ def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
 def STXPW  : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
 def STXPX  : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
 
-let Predicates = [HasV8_1a] in {
+let Predicates = [HasLOR] in {
   // v8.1a "Limited Order Region" extension load-acquire instructions
   def LDLARW  : LoadAcquire   <0b10, 1, 1, 0, 0, GPR32, "ldlar">;
   def LDLARX  : LoadAcquire   <0b11, 1, 1, 0, 0, GPR64, "ldlar">;
@@ -3175,6 +3224,20 @@ let isPseudo = 1 in {
   def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
 }
 
+// Pseudo instructions for Windows EH
+//===----------------------------------------------------------------------===//
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+    isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1, isPseudo = 1 in {
+   def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret)]>, Sched<[]>;
+   let usesCustomInserter = 1 in
+     def CATCHRET : Pseudo<(outs), (ins am_brcond:$dst, am_brcond:$src), [(catchret bb:$dst, bb:$src)]>,
+                    Sched<[]>;
+}
+
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in
+def CATCHPAD : Pseudo<(outs), (ins), [(catchpad)]>, Sched<[]>;
+
 //===----------------------------------------------------------------------===//
 // Floating point immediate move.
 //===----------------------------------------------------------------------===//
@@ -4094,25 +4157,6 @@ defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
 defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
   UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
 
-// Patterns for smull2/umull2.
-multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
-  Instruction INST8B, Instruction INST4H, Instruction INST2S> {
-  def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
-                           (extract_high_v16i8 V128:$Rm))),
-             (INST8B V128:$Rn, V128:$Rm)>;
-  def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
-                           (extract_high_v8i16 V128:$Rm))),
-             (INST4H V128:$Rn, V128:$Rm)>;
-  def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
-                           (extract_high_v4i32 V128:$Rm))),
-             (INST2S V128:$Rn, V128:$Rm)>;
-}
-
-defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
-  SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
-defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
-  UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
-
 // Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
 multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
   Instruction INST8B, Instruction INST4H, Instruction INST2S> {
@@ -4335,6 +4379,12 @@ def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
 def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
 def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
 
+// DUP from a 64-bit register to a 64-bit register is just a copy
+def : Pat<(v1i64 (AArch64dup (i64 GPR64:$Rn))),
+          (COPY_TO_REGCLASS GPR64:$Rn, FPR64)>;
+def : Pat<(v1f64 (AArch64dup (f64 FPR64:$Rn))),
+          (COPY_TO_REGCLASS FPR64:$Rn, FPR64)>;
+
 def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
           (v2f32 (DUPv2i32lane
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
@@ -5970,6 +6020,41 @@ def : Pat<(i32 (trunc GPR64sp:$src)),
 // __builtin_trap() uses the BRK instruction on AArch64.
 def : Pat<(trap), (BRK 1)>;
 
+// Multiply high patterns which multiply the lower subvector using smull/umull
+// and the upper subvector with smull2/umull2. Then shuffle the high the high
+// part of both results together.
+def : Pat<(v16i8 (mulhs V128:$Rn, V128:$Rm)),
+          (UZP2v16i8
+           (SMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
+                            (EXTRACT_SUBREG V128:$Rm, dsub)),
+           (SMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
+def : Pat<(v8i16 (mulhs V128:$Rn, V128:$Rm)),
+          (UZP2v8i16
+           (SMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
+                             (EXTRACT_SUBREG V128:$Rm, dsub)),
+           (SMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
+def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)),
+          (UZP2v4i32
+           (SMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
+                             (EXTRACT_SUBREG V128:$Rm, dsub)),
+           (SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+
+def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)),
+          (UZP2v16i8
+           (UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
+                            (EXTRACT_SUBREG V128:$Rm, dsub)),
+           (UMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
+def : Pat<(v8i16 (mulhu V128:$Rn, V128:$Rm)),
+          (UZP2v8i16
+           (UMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
+                             (EXTRACT_SUBREG V128:$Rm, dsub)),
+           (UMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
+def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
+          (UZP2v4i32
+           (UMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
+                             (EXTRACT_SUBREG V128:$Rm, dsub)),
+           (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+
 // Conversions within AdvSIMD types in the same register size are free.
 // But because we need a consistent lane ordering, in big endian many
 // conversions require one or more REV instructions.
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 46b8b4dc6486eb4ae89b6ba8b9ba15657658ae30..6cbfb6ab161b25e4ce8526fd7007fcc5e6eb78ed 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -65,6 +65,15 @@ private:
   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                            MachineRegisterInfo &MRI) const;
 
+  // Helper to generate an equivalent of scalar_to_vector into a new register,
+  // returned via 'Dst'.
+  bool emitScalarToVector(unsigned &Dst, const LLT DstTy,
+                          const TargetRegisterClass *DstRC, unsigned Scalar,
+                          MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          MachineRegisterInfo &MRI) const;
+  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
+
   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
 
   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
@@ -667,7 +676,7 @@ void AArch64InstructionSelector::materializeLargeCMVal(
   MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineIRBuilder MIB(I);
 
-  auto MovZ = MIB.buildInstr(AArch64::MOVZXi, &AArch64::GPR64RegClass);
+  auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
   MovZ->addOperand(MF, I.getOperand(1));
   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
                                      AArch64II::MO_NC);
@@ -779,16 +788,36 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
     const unsigned CondReg = I.getOperand(0).getReg();
     MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
 
-    if (selectCompareBranch(I, MF, MRI))
+    // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
+    // instructions will not be produced, as they are conditional branch
+    // instructions that do not set flags.
+    bool ProduceNonFlagSettingCondBr =
+        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
+    if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI))
       return true;
 
-    auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
-                   .addUse(CondReg)
-                   .addImm(/*bit offset=*/0)
-                   .addMBB(DestMBB);
+    if (ProduceNonFlagSettingCondBr) {
+      auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
+                     .addUse(CondReg)
+                     .addImm(/*bit offset=*/0)
+                     .addMBB(DestMBB);
 
-    I.eraseFromParent();
-    return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
+      I.eraseFromParent();
+      return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
+    } else {
+      auto CMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
+                     .addDef(AArch64::WZR)
+                     .addUse(CondReg)
+                     .addImm(1);
+      constrainSelectedInstRegOperands(*CMP.getInstr(), TII, TRI, RBI);
+      auto Bcc =
+          BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::Bcc))
+              .addImm(AArch64CC::EQ)
+              .addMBB(DestMBB);
+
+      I.eraseFromParent();
+      return constrainSelectedInstRegOperands(*Bcc.getInstr(), TII, TRI, RBI);
+    }
   }
 
   case TargetOpcode::G_BRINDIRECT: {
@@ -1013,12 +1042,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
     }
     unsigned MemSizeInBits = MemOp.getSize() * 8;
 
-    // FIXME: PR36018: Volatile loads in some cases are incorrectly selected by
-    // folding with an extend. Until we have a G_SEXTLOAD solution bail out if
-    // we hit one.
-    if (Opcode == TargetOpcode::G_LOAD && MemOp.isVolatile())
-      return false;
-
     const unsigned PtrReg = I.getOperand(1).getReg();
 #ifndef NDEBUG
     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
@@ -1528,11 +1551,130 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
     }
   }
+  case TargetOpcode::G_BUILD_VECTOR:
+    return selectBuildVector(I, MRI);
   }
 
   return false;
 }
 
+bool AArch64InstructionSelector::emitScalarToVector(
+    unsigned &Dst, const LLT DstTy, const TargetRegisterClass *DstRC,
+    unsigned Scalar, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, MachineRegisterInfo &MRI) const {
+  Dst = MRI.createVirtualRegister(DstRC);
+
+  unsigned UndefVec = MRI.createVirtualRegister(DstRC);
+  MachineInstr &UndefMI = *BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                   TII.get(TargetOpcode::IMPLICIT_DEF))
+                               .addDef(UndefVec);
+
+  auto BuildFn = [&](unsigned SubregIndex) {
+    MachineInstr &InsMI = *BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                   TII.get(TargetOpcode::INSERT_SUBREG))
+                               .addDef(Dst)
+                               .addUse(UndefVec)
+                               .addUse(Scalar)
+                               .addImm(SubregIndex);
+    constrainSelectedInstRegOperands(UndefMI, TII, TRI, RBI);
+    return constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
+  };
+
+  switch (DstTy.getElementType().getSizeInBits()) {
+  case 32:
+    return BuildFn(AArch64::ssub);
+  case 64:
+    return BuildFn(AArch64::dsub);
+  default:
+    return false;
+  }
+}
+
+bool AArch64InstructionSelector::selectBuildVector(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+  // Until we port more of the optimized selections, for now just use a vector
+  // insert sequence.
+  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+  const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
+  unsigned EltSize = EltTy.getSizeInBits();
+  if (EltSize < 32 || EltSize > 64)
+    return false; // Don't support all element types yet.
+  const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
+  unsigned Opc;
+  unsigned SubregIdx;
+  if (RB.getID() == AArch64::GPRRegBankID) {
+    if (EltSize == 32) {
+      Opc = AArch64::INSvi32gpr;
+      SubregIdx = AArch64::ssub;
+    } else {
+      Opc = AArch64::INSvi64gpr;
+      SubregIdx = AArch64::dsub;
+    }
+  } else {
+    if (EltSize == 32) {
+      Opc = AArch64::INSvi32lane;
+      SubregIdx = AArch64::ssub;
+    } else {
+      Opc = AArch64::INSvi64lane;
+      SubregIdx = AArch64::dsub;
+    }
+  }
+
+  if (EltSize * DstTy.getNumElements() != 128)
+    return false; // Don't handle unpacked vectors yet.
+
+  unsigned DstVec = 0;
+  const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(
+      DstTy, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
+  emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(),
+                     *I.getParent(), I.getIterator(), MRI);
+  for (unsigned i = 2, e = DstTy.getSizeInBits() / EltSize + 1; i < e; ++i) {
+    unsigned InsDef;
+    // For the last insert re-use the dst reg of the G_BUILD_VECTOR.
+    if (i + 1 < e)
+      InsDef = MRI.createVirtualRegister(DstRC);
+    else
+      InsDef = I.getOperand(0).getReg();
+    unsigned LaneIdx = i - 1;
+    if (RB.getID() == AArch64::FPRRegBankID) {
+      unsigned ImpDef = MRI.createVirtualRegister(DstRC);
+      MachineInstr &ImpDefMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                                        TII.get(TargetOpcode::IMPLICIT_DEF))
+                                    .addDef(ImpDef);
+      unsigned InsSubDef = MRI.createVirtualRegister(DstRC);
+      MachineInstr &InsSubMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                                        TII.get(TargetOpcode::INSERT_SUBREG))
+                                    .addDef(InsSubDef)
+                                    .addUse(ImpDef)
+                                    .addUse(I.getOperand(i).getReg())
+                                    .addImm(SubregIdx);
+      MachineInstr &InsEltMI =
+          *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc))
+               .addDef(InsDef)
+               .addUse(DstVec)
+               .addImm(LaneIdx)
+               .addUse(InsSubDef)
+               .addImm(0);
+      constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
+      constrainSelectedInstRegOperands(InsSubMI, TII, TRI, RBI);
+      constrainSelectedInstRegOperands(InsEltMI, TII, TRI, RBI);
+      DstVec = InsDef;
+    } else {
+      MachineInstr &InsMI =
+          *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc))
+               .addDef(InsDef)
+               .addUse(DstVec)
+               .addImm(LaneIdx)
+               .addUse(I.getOperand(i).getReg());
+      constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
+      DstVec = InsDef;
+    }
+  }
+  I.eraseFromParent();
+  return true;
+}
+
 /// SelectArithImmed - Select an immediate value that can be represented as
 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
 /// Val set to the 12-bit value and Shift set to the shifter operand.
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 4b5e10ac4ec503d025f2f58e58c882c2c1581ecc..5d63f0c244ea32f2f57b5adabc4f0353b3f39258 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -48,7 +48,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
   const LLT v2s64 = LLT::vector(2, 64);
 
   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
-      .legalFor({p0, s1, s8, s16, s32, s64})
+      .legalFor({p0, s1, s8, s16, s32, s64, v2s64})
       .clampScalar(0, s1, s64)
       .widenScalarToNextPow2(0, 8);
 
@@ -398,13 +398,26 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
         return VecTy == v4s32 || VecTy == v2s64;
       });
 
+  getActionDefinitionsBuilder(G_BUILD_VECTOR)
+      .legalFor({{v4s32, s32}, {v2s64, s64}})
+      .clampNumElements(0, v4s32, v4s32)
+      .clampNumElements(0, v2s64, v2s64)
+
+      // Deal with larger scalar types, which will be implicitly truncated.
+      .legalIf([=](const LegalityQuery &Query) {
+        return Query.Types[0].getScalarSizeInBits() <
+               Query.Types[1].getSizeInBits();
+      })
+      .minScalarSameAs(1, 0);
+
   computeTables();
   verify(*ST.getInstrInfo());
 }
 
 bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
-                                          MachineIRBuilder &MIRBuilder) const {
+                                          MachineIRBuilder &MIRBuilder,
+                                          GISelChangeObserver &Observer) const {
   switch (MI.getOpcode()) {
   default:
     // No idea what to do.
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h
index a745b0edbc6d943e0af989863005bc11d20bcec6..77e8bdc7623c9ba2a1617631cfa9eeb9eca60d9e 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H
 #define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H
 
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 
 namespace llvm {
@@ -28,7 +29,8 @@ public:
   AArch64LegalizerInfo(const AArch64Subtarget &ST);
 
   bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
-                      MachineIRBuilder &MIRBuilder) const override;
+                      MachineIRBuilder &MIRBuilder,
+                      GISelChangeObserver &Observer) const override;
 
 private:
   bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 6e1824f632eea335ff1cc7c05fd2cb9fc45f1489..07295989ec8e29d888aa3f902cee13fa665fc02b 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -269,4 +269,17 @@ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
     if (lowerOperand(MO, MCOp))
       OutMI.addOperand(MCOp);
   }
+
+  switch (OutMI.getOpcode()) {
+  case AArch64::CATCHRET:
+    OutMI = MCInst();
+    OutMI.setOpcode(AArch64::RET);
+    OutMI.addOperand(MCOperand::createReg(AArch64::LR));
+    break;
+  case AArch64::CLEANUPRET:
+    OutMI = MCInst();
+    OutMI.setOpcode(AArch64::RET);
+    OutMI.addOperand(MCOperand::createReg(AArch64::LR));
+    break;
+  }
 }
diff --git a/lib/Target/AArch64/AArch64PfmCounters.td b/lib/Target/AArch64/AArch64PfmCounters.td
new file mode 100644
index 0000000000000000000000000000000000000000..16ba3e4282a0d0fd9adb232a634b95b274b77da6
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PfmCounters.td
@@ -0,0 +1,19 @@
+//===-- AArch64PfmCounters.td - AArch64 Hardware Counters --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the available hardware counters for AArch64.
+//
+//===----------------------------------------------------------------------===//
+
+def CpuCyclesPfmCounter : PfmCounter<"CPU_CYCLES">;
+
+def DefaultPfmCounters : ProcPfmCounters {
+  let CycleCounter = CpuCyclesPfmCounter;
+}
+def : PfmCountersDefaultBinding<DefaultPfmCounters>;
diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
index 32487b9ccc330998972efcf99e6f99e6c795eec1..5022b6221ab99029caa1e54089da7928480ce2ff 100644
--- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
+++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
@@ -32,11 +32,11 @@ public:
   AArch64PreLegalizerCombinerInfo()
       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                      /*LegalizerInfo*/ nullptr) {}
-  virtual bool combine(CombinerChangeObserver &Observer, MachineInstr &MI,
+  virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
                        MachineIRBuilder &B) const override;
 };
 
-bool AArch64PreLegalizerCombinerInfo::combine(CombinerChangeObserver &Observer,
+bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                               MachineInstr &MI,
                                               MachineIRBuilder &B) const {
   CombinerHelper Helper(Observer, B);
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 0bab5c05ba6696dcb50e74a226fd257d7221cfa0..96ae45ae3d0dcd7d074fc4294ee823c3ed8a7d21 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -161,6 +161,10 @@ void AArch64RegisterInfo::UpdateCustomCallPreservedMask(MachineFunction &MF,
   *Mask = UpdatedMask;
 }
 
+const uint32_t *AArch64RegisterInfo::getNoPreservedMask() const {
+  return CSR_AArch64_NoRegs_RegMask;
+}
+
 const uint32_t *
 AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
                                                 CallingConv::ID CC) const {
@@ -199,6 +203,10 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   if (hasBasePointer(MF))
     markSuperRegs(Reserved, AArch64::W19);
 
+  // SLH uses register W16/X16 as the taint register.
+  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
+    markSuperRegs(Reserved, AArch64::W16);
+
   assert(checkAllSuperRegsMarked(Reserved));
   return Reserved;
 }
@@ -251,14 +259,15 @@ unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
 bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  // In the presence of variable sized objects, if the fixed stack size is
-  // large enough that referencing from the FP won't result in things being
-  // in range relatively often, we can use a base pointer to allow access
+  // In the presence of variable sized objects or funclets, if the fixed stack
+  // size is large enough that referencing from the FP won't result in things
+  // being in range relatively often, we can use a base pointer to allow access
   // from the other direction like the SP normally works.
+  //
   // Furthermore, if both variable sized objects are present, and the
   // stack needs to be dynamically re-aligned, the base pointer is the only
   // reliable way to reference the locals.
-  if (MFI.hasVarSizedObjects()) {
+  if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) {
     if (needsStackRealignment(MF))
       return true;
     // Conservatively estimate whether the negative offset from the frame
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 4653c7af59d0ec1b7c1f5c8b3ed4dbe09da34014..c4153228a7c0467fd3ede037bfd0208c1543d0f3 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -65,6 +65,9 @@ public:
   // normal calls, so they need a different mask to represent this.
   const uint32_t *getTLSCallPreservedMask() const;
 
+  // Funclets on ARM64 Windows don't preserve any registers.
+  const uint32_t *getNoPreservedMask() const override;
+
   /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
   /// case that 'returned' is on an i64 first argument if the calling convention
   /// is one that can (partially) model this attribute with a preserved mask
diff --git a/lib/Target/AArch64/AArch64SchedExynosM1.td b/lib/Target/AArch64/AArch64SchedExynosM1.td
index d566a13dc67dc1193889bac1179c2284f7c85106..7f1c2d4c7648281cc6bfb308608c68ce15476540 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM1.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM1.td
@@ -61,14 +61,6 @@ def M1UnitALU  : ProcResGroup<[M1UnitA,
 def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
                                M1UnitNAL1]>; // All simple vector
 
-//===----------------------------------------------------------------------===//
-// Predicates.
-
-def M1BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
-                                        MI->getOperand(0).getReg() != AArch64::LR}]>;
-def M1LdStExtPred    : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>;
-def M1ShiftExtPred   : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>;
-
 //===----------------------------------------------------------------------===//
 // Coarse scheduling model.
 
@@ -86,14 +78,16 @@ def M1WriteAC : SchedWriteRes<[M1UnitALU,
 def M1WriteAD : SchedWriteRes<[M1UnitALU,
                                M1UnitC]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
-def M1WriteAX : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteA1]>,
-                                   SchedVar<NoSchedPred,    [M1WriteAA]>]>;
+def M1WriteAX : SchedWriteVariant<[SchedVar<ExynosExtPred, [M1WriteA1]>,
+                                   SchedVar<NoSchedPred,   [M1WriteAA]>]>;
+def M1WriteAY : SchedWriteVariant<[SchedVar<ExynosShiftPred, [M1WriteA1]>,
+                                   SchedVar<NoSchedPred,     [M1WriteAA]>]>;
 def M1WriteC1 : SchedWriteRes<[M1UnitC]>   { let Latency = 1; }
 def M1WriteC2 : SchedWriteRes<[M1UnitC]>   { let Latency = 2; }
 
 def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
-def M1WriteBX : SchedWriteVariant<[SchedVar<M1BranchLinkPred, [M1WriteAB]>,
-                                   SchedVar<NoSchedPred,      [M1WriteAC]>]>;
+def M1WriteBX : SchedWriteVariant<[SchedVar<ExynosBranchLinkLRPred, [M1WriteAC]>,
+                                   SchedVar<NoSchedPred,            [M1WriteAB]>]>;
 
 def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; }
 def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; }
@@ -111,40 +105,27 @@ def M1WriteLD : SchedWriteRes<[M1UnitL,
                                            let ResourceCycles = [2, 1]; }
 def M1WriteLH : SchedWriteRes<[]>        { let Latency = 5;
                                            let NumMicroOps = 0; }
-def M1WriteLX : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteL5]>,
-                                   SchedVar<NoSchedPred,   [M1WriteLC]>]>;
-def M1WriteLY : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteL5]>,
-                                   SchedVar<NoSchedPred,   [M1WriteLD]>]>;
+def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteLC]>,
+                                   SchedVar<NoSchedPred,   [M1WriteL5]>]>;
 
 def M1WriteS1 : SchedWriteRes<[M1UnitS]>   { let Latency = 1; }
 def M1WriteS3 : SchedWriteRes<[M1UnitS]>   { let Latency = 3; }
 def M1WriteS4 : SchedWriteRes<[M1UnitS]>   { let Latency = 4; }
 def M1WriteSA : SchedWriteRes<[M1UnitS,
-                               M1UnitFST,
-                               M1UnitS,
-                               M1UnitFST]> { let Latency = 1;
-                                             let NumMicroOps = 2; }
-def M1WriteSB : SchedWriteRes<[M1UnitS,
                                M1UnitFST,
                                M1UnitA]>   { let Latency = 3;
                                              let NumMicroOps = 2; }
-def M1WriteSC : SchedWriteRes<[M1UnitS,
+def M1WriteSB : SchedWriteRes<[M1UnitS,
                                M1UnitFST,
                                M1UnitS,
                                M1UnitFST,
                                M1UnitA]>   { let Latency = 3;
                                              let NumMicroOps = 3; }
-def M1WriteSD : SchedWriteRes<[M1UnitS,
-                               M1UnitFST,
-                               M1UnitA]>   { let Latency = 1;
-                                             let NumMicroOps = 2; }
-def M1WriteSE : SchedWriteRes<[M1UnitS,
+def M1WriteSC : SchedWriteRes<[M1UnitS,
                                M1UnitA]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
-def M1WriteSX : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteS1]>,
-                                   SchedVar<NoSchedPred,   [M1WriteSE]>]>;
-def M1WriteSY : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteS1]>,
-                                   SchedVar<NoSchedPred,   [M1WriteSB]>]>;
+def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteSC]>,
+                                   SchedVar<NoSchedPred,   [M1WriteS1]>]>;
 
 def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
                                       SchedVar<NoSchedPred,   [ReadDefault]>]>;
@@ -415,9 +396,9 @@ def M1WriteVSTH    : SchedWriteRes<[M1UnitNALU,
                                     M1UnitS,
                                     M1UnitFST,
                                     M1UnitFST,
-                                    M1UnitFST]>      { let Latency = 14;
-                                                       let NumMicroOps = 4;
-                                                       let ResourceCycles = [1, 7, 1, 7, 1]; }
+                                    M1UnitFST]>     { let Latency = 14;
+                                                      let NumMicroOps = 4;
+                                                      let ResourceCycles = [1, 7, 1, 7, 1]; }
 def M1WriteVSTI    : SchedWriteRes<[M1UnitNALU,
                                     M1UnitS,
                                     M1UnitFST,
@@ -428,9 +409,17 @@ def M1WriteVSTI    : SchedWriteRes<[M1UnitNALU,
                                     M1UnitS,
                                     M1UnitFST,
                                     M1UnitFST,
-                                    M1UnitFST]>      { let Latency = 17;
-                                                       let NumMicroOps = 7;
-                                                       let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; }
+                                    M1UnitFST]>     { let Latency = 17;
+                                                      let NumMicroOps = 7;
+                                                      let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; }
+
+// Special cases.
+def M1WriteAES  : SchedWriteRes<[M1UnitNCRYPT]>     { let Latency = 1; }
+def M1WriteCOPY : SchedWriteVariant<[SchedVar<ExynosFPPred, [M1WriteNALU1]>,
+                                     SchedVar<NoSchedPred,  [M1WriteA1]>]>;
+
+// Fast forwarding.
+def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
 
 // Branch instructions
 def : InstRW<[M1WriteB1], (instrs Bcc)>;
@@ -440,8 +429,11 @@ def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>;
 def : InstRW<[M1WriteAD], (instregex "^TBN?Z[WX]")>;
 
 // Arithmetic and logical integer instructions.
-def : InstRW<[M1WriteA1], (instrs COPY)>;
-def : InstRW<[M1WriteAX], (instregex ".+r[sx](64)?$")>;
+def : InstRW<[M1WriteAX], (instregex ".+rx(64)?$")>;
+def : InstRW<[M1WriteAY], (instregex ".+rs$")>;
+
+// Move instructions.
+def : InstRW<[M1WriteCOPY], (instrs COPY)>;
 
 // Divide and multiply instructions.
 
@@ -451,10 +443,20 @@ def : InstRW<[M1WriteAX], (instregex ".+r[sx](64)?$")>;
 def : InstRW<[M1WriteLB,
               WriteLDHi,
               WriteAdr],    (instregex "^LDP(SW|W|X)(post|pre)")>;
-def : InstRW<[M1WriteLX,
-              ReadAdrBase], (instregex "^PRFMro[WX]")>;
+def : InstRW<[M1WriteLC,
+              ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>;
+def : InstRW<[M1WriteL5,
+              ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>;
+def : InstRW<[M1WriteLC,
+              ReadAdrBase], (instrs PRFMroW)>;
+def : InstRW<[M1WriteL5,
+              ReadAdrBase], (instrs PRFMroX)>;
 
 // Store instructions.
+def : InstRW<[M1WriteSC,
+              ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>;
+def : InstRW<[WriteST,
+              ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>;
 
 // FP data instructions.
 def : InstRW<[M1WriteNALU1],  (instregex "^F(ABS|NEG)[DS]r")>;
@@ -488,8 +490,10 @@ def : InstRW<[WriteVLD],    (instregex "^LDUR[BDHSQ]i")>;
 def : InstRW<[WriteVLD,
               WriteAdr],    (instregex "^LDR[BDHSQ](post|pre)")>;
 def : InstRW<[WriteVLD],    (instregex "^LDR[BDHSQ]ui")>;
-def : InstRW<[M1WriteLY,
-              ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>;
+def : InstRW<[M1WriteLD,
+              ReadAdrBase], (instregex "^LDR[BDHS]roW")>;
+def : InstRW<[WriteVLD,
+              ReadAdrBase], (instregex "^LDR[BDHS]roX")>;
 def : InstRW<[M1WriteLD,
               ReadAdrBase], (instregex "^LDRQro[WX]")>;
 def : InstRW<[WriteVLD,
@@ -508,14 +512,16 @@ def : InstRW<[WriteVST],    (instregex "^STUR[BDHSQ]i")>;
 def : InstRW<[WriteVST,
               WriteAdr],    (instregex "^STR[BDHSQ](post|pre)")>;
 def : InstRW<[WriteVST],    (instregex "^STR[BDHSQ]ui")>;
-def : InstRW<[M1WriteSY,
-              ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>;
-def : InstRW<[M1WriteSB,
+def : InstRW<[M1WriteSA,
+              ReadAdrBase], (instregex "^STR[BDHS]roW")>;
+def : InstRW<[WriteVST,
+              ReadAdrBase], (instregex "^STR[BDHS]roX")>;
+def : InstRW<[M1WriteSA,
               ReadAdrBase], (instregex "^STRQro[WX]")>;
 def : InstRW<[WriteVST],    (instregex "^STN?P[DSQ]i")>;
 def : InstRW<[WriteVST,
               WriteAdr],    (instregex "^STP[DS](post|pre)")>;
-def : InstRW<[M1WriteSC,
+def : InstRW<[M1WriteSB,
               WriteAdr],    (instregex "^STPQ(post|pre)")>;
 
 // ASIMD instructions.
@@ -609,21 +615,21 @@ def : InstRW<[M1WriteVLDE],   (instregex "LD1i(64)$")>;
 def : InstRW<[M1WriteVLDE,
               WriteAdr],      (instregex "LD1i(64)_POST$")>;
 
-def : InstRW<[M1WriteL5],     (instregex "LD1Rv(8b|4h|2s)$")>;
-def : InstRW<[M1WriteL5,
+def : InstRW<[WriteVLD],      (instregex "LD1Rv(8b|4h|2s)$")>;
+def : InstRW<[WriteVLD,
               WriteAdr],      (instregex "LD1Rv(8b|4h|2s)_POST$")>;
-def : InstRW<[M1WriteL5],     (instregex "LD1Rv(1d)$")>;
-def : InstRW<[M1WriteL5,
+def : InstRW<[WriteVLD],      (instregex "LD1Rv(1d)$")>;
+def : InstRW<[WriteVLD,
               WriteAdr],      (instregex "LD1Rv(1d)_POST$")>;
-def : InstRW<[M1WriteL5],     (instregex "LD1Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[M1WriteL5,
+def : InstRW<[WriteVLD],      (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD,
               WriteAdr],      (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
 
-def : InstRW<[M1WriteL5],     (instregex "LD1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[M1WriteL5,
+def : InstRW<[WriteVLD],      (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD,
               WriteAdr],      (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[M1WriteL5],     (instregex "LD1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[M1WriteL5,
+def : InstRW<[WriteVLD],      (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD,
               WriteAdr],      (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
 def : InstRW<[M1WriteVLDA],   (instregex "LD1Twov(8b|4h|2s|1d)$")>;
 def : InstRW<[M1WriteVLDA,
@@ -831,8 +837,6 @@ def : InstRW<[M1WriteVSTI,
               WriteAdr],      (instregex "ST4Fourv(2d)_POST$")>;
 
 // Cryptography instructions.
-def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
-def M1ReadAES  : SchedReadAdvance<1, [M1WriteAES]>;
 def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>;
 def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>;
 
diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td
index e61fb611ab245bc2a2b5b52d5f9969f24dce20c9..fd19ff84f1e010d40b16fdc6917d003bc3fb985b 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -103,20 +103,6 @@ def M3UnitNSHF : ProcResGroup<[M3UnitNSHF0,
                                M3UnitNSHF1,
                                M3UnitNSHF2]>;
 
-//===----------------------------------------------------------------------===//
-// Predicates.
-
-def M3BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
-                                        MI->getOperand(0).isReg() &&
-                                        MI->getOperand(0).getReg() != AArch64::LR}]>;
-def M3ResetPred      : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>;
-def M3RotatePred     : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri ||
-                                         MI->getOpcode() == AArch64::EXTRXrri) &&
-                                        MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
-                                        MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>;
-def M3LdStExtPred    : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>;
-def M3ShiftExtPred   : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>;
-
 //===----------------------------------------------------------------------===//
 // Coarse scheduling model.
 
@@ -138,15 +124,23 @@ def M3WriteAD : SchedWriteRes<[M3UnitALU,
                                              let NumMicroOps = 2; }
 def M3WriteC1 : SchedWriteRes<[M3UnitC]>   { let Latency = 1; }
 def M3WriteC2 : SchedWriteRes<[M3UnitC]>   { let Latency = 2; }
-def M3WriteAX : SchedWriteVariant<[SchedVar<M3ResetPred,    [M3WriteZ0]>,
-                                   SchedVar<M3ShiftExtPred, [M3WriteA1]>,
-                                   SchedVar<NoSchedPred,    [M3WriteAA]>]>;
-def M3WriteAY : SchedWriteVariant<[SchedVar<M3RotatePred, [M3WriteA1]>,
-                                   SchedVar<NoSchedPred,  [M3WriteAA]>]>;
+def M3WriteAU : SchedWriteVariant<[SchedVar<IsCopyIdiomPred, [M3WriteZ0]>,
+                                   SchedVar<ExynosShiftPred, [M3WriteA1]>,
+                                   SchedVar<NoSchedPred,     [M3WriteAA]>]>;
+def M3WriteAV : SchedWriteVariant<[SchedVar<IsCopyIdiomPred, [M3WriteZ0]>,
+                                   SchedVar<NoSchedPred,     [M3WriteAA]>]>;
+def M3WriteAW : SchedWriteVariant<[SchedVar<IsZeroIdiomPred, [M3WriteZ0]>,
+                                   SchedVar<NoSchedPred,     [M3WriteAA]>]>;
+def M3WriteAX : SchedWriteVariant<[SchedVar<ExynosExtPred, [M3WriteA1]>,
+                                   SchedVar<NoSchedPred,   [M3WriteAA]>]>;
+def M3WriteAY : SchedWriteVariant<[SchedVar<ExynosRotateRightImmPred, [M3WriteA1]>,
+                                   SchedVar<NoSchedPred,              [M3WriteAA]>]>;
+def M3WriteAZ : SchedWriteVariant<[SchedVar<ExynosShiftPred, [M3WriteA1]>,
+                                   SchedVar<NoSchedPred,     [M3WriteAA]>]>;
 
 def M3WriteB1 : SchedWriteRes<[M3UnitB]> { let Latency = 1; }
-def M3WriteBX : SchedWriteVariant<[SchedVar<M3BranchLinkPred, [M3WriteAB]>,
-                                   SchedVar<NoSchedPred,      [M3WriteAC]>]>;
+def M3WriteBX : SchedWriteVariant<[SchedVar<ExynosBranchLinkLRPred, [M3WriteAC]>,
+                                   SchedVar<NoSchedPred,            [M3WriteAB]>]>;
 
 def M3WriteL4 : SchedWriteRes<[M3UnitL]> { let Latency = 4; }
 def M3WriteL5 : SchedWriteRes<[M3UnitL]> { let Latency = 5; }
@@ -163,29 +157,23 @@ def M3WriteLC : SchedWriteRes<[M3UnitA,
 def M3WriteLD : SchedWriteRes<[M3UnitA,
                                M3UnitL]> { let Latency = 4;
                                            let NumMicroOps = 2; }
+def M3WriteLE : SchedWriteRes<[M3UnitA,
+                               M3UnitL]> { let Latency = 6;
+                                           let NumMicroOps = 2; }
 def M3WriteLH : SchedWriteRes<[]>        { let Latency = 5;
                                            let NumMicroOps = 0; }
-
-def M3WriteLX : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteL5]>,
-                                   SchedVar<NoSchedPred,   [M3WriteLB]>]>;
+def M3WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M3WriteL5]>,
+                                   SchedVar<NoSchedPred,   [M3WriteL4]>]>;
 
 def M3WriteS1 : SchedWriteRes<[M3UnitS]>   { let Latency = 1; }
 def M3WriteSA : SchedWriteRes<[M3UnitA,
                                M3UnitS,
-                               M3UnitFST]> { let Latency = 2;
+                               M3UnitFST]> { let Latency = 3;
                                              let NumMicroOps = 2; }
 def M3WriteSB : SchedWriteRes<[M3UnitA,
-                               M3UnitS]>   { let Latency = 1;
-                                             let NumMicroOps = 2; }
-def M3WriteSC : SchedWriteRes<[M3UnitA,
                                M3UnitS]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
 
-def M3WriteSX : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteS1]>,
-                                   SchedVar<NoSchedPred,   [M3WriteSB]>]>;
-def M3WriteSY : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteS1]>,
-                                   SchedVar<NoSchedPred,   [M3WriteSC]>]>;
-
 def M3ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
                                       SchedVar<NoSchedPred,   [ReadDefault]>]>;
 
@@ -214,9 +202,7 @@ def : WriteRes<WriteIM64, [M3UnitC]>  { let Latency = 4;
                                         let ResourceCycles = [2]; }
 
 // Miscellaneous instructions.
-def : WriteRes<WriteExtr, [M3UnitALU,
-                           M3UnitALU]> { let Latency = 1;
-                                         let NumMicroOps = 2; }
+def : SchedAlias<WriteExtr, M3WriteAY>;
 
 // Addressing modes.
 def : WriteRes<WriteAdr, []> { let Latency = 1;
@@ -227,13 +213,13 @@ def : SchedAlias<ReadAdrBase, M3ReadAdrBase>;
 def : SchedAlias<WriteLD, M3WriteL4>;
 def : WriteRes<WriteLDHi, []> { let Latency = 4;
                                 let NumMicroOps = 0; }
-def : SchedAlias<WriteLDIdx, M3WriteLX>;
+def : SchedAlias<WriteLDIdx, M3WriteLB>;
 
 // Store instructions.
 def : SchedAlias<WriteST,    M3WriteS1>;
 def : SchedAlias<WriteSTP,   M3WriteS1>;
 def : SchedAlias<WriteSTX,   M3WriteS1>;
-def : SchedAlias<WriteSTIdx, M3WriteSX>;
+def : SchedAlias<WriteSTIdx, M3WriteSB>;
 
 // FP data instructions.
 def : WriteRes<WriteF,    [M3UnitFADD]>  { let Latency = 2; }
@@ -243,7 +229,6 @@ def : WriteRes<WriteFDiv, [M3UnitFDIV]>  { let Latency = 12;
 def : WriteRes<WriteFMul, [M3UnitFMAC]>  { let Latency = 4; }
 
 // FP miscellaneous instructions.
-// TODO: Conversion between register files is much different.
 def : WriteRes<WriteFCvt,  [M3UnitFCVT]> { let Latency = 3; }
 def : WriteRes<WriteFImm,  [M3UnitNALU]> { let Latency = 1; }
 def : WriteRes<WriteFCopy, [M3UnitNALU]> { let Latency = 1; }
@@ -479,11 +464,15 @@ def M3WriteVSTI    : SchedWriteRes<[M3UnitNALU,
 
 // Special cases.
 def M3WriteAES     : SchedWriteRes<[M3UnitNCRY]>  { let Latency = 1; }
+def M3WriteCOPY    : SchedWriteVariant<[SchedVar<ExynosFPPred, [M3WriteNALU1]>,
+                                        SchedVar<NoSchedPred,  [M3WriteZ0]>]>;
+def M3WriteMOVI    : SchedWriteVariant<[SchedVar<IsZeroFPIdiomPred, [M3WriteZ0]>,
+                                        SchedVar<NoSchedPred,       [M3WriteNALU1]>]>;
+
+// Fast forwarding.
 def M3ReadAES      : SchedReadAdvance<1, [M3WriteAES]>;
 def M3ReadFMAC     : SchedReadAdvance<1, [M3WriteFMAC4,
                                           M3WriteFMAC5]>;
-def M3WriteMOVI    : SchedWriteVariant<[SchedVar<M3ResetPred, [M3WriteZ0]>,
-                                        SchedVar<NoSchedPred, [M3WriteNALU1]>]>;
 def M3ReadNMUL     : SchedReadAdvance<1, [M3WriteNMUL3]>;
 
 // Branch instructions
@@ -494,29 +483,40 @@ def : InstRW<[M3WriteC1], (instregex "^CBN?Z[WX]")>;
 def : InstRW<[M3WriteAD], (instregex "^TBN?Z[WX]")>;
 
 // Arithmetic and logical integer instructions.
-def : InstRW<[M3WriteA1], (instrs COPY)>;
-def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?Xrx64")>;
-def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]$")>;
-def : InstRW<[M3WriteAX], (instregex "^(ADD|BIC|SUB)S[WX]r[sx]$")>;
-def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|EOR|ORR|SUB)[WX]ri")>;
+def : InstRW<[M3WriteAZ], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)[WX]rs$")>;
+def : InstRW<[M3WriteAU], (instrs ORRWrs, ORRXrs)>;
+def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?[WX]rx(64)?$")>;
+def : InstRW<[M3WriteAZ], (instregex "^(ADD|AND|BIC|SUB)S[WX]rs$")>;
+def : InstRW<[M3WriteAV], (instrs ADDWri, ADDXri)>;
+def : InstRW<[M3WriteAW], (instrs ORRWri, ORRXri)>;
 
 // Move instructions.
-def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>;
-def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>;
+def : InstRW<[M3WriteCOPY], (instrs COPY)>;
+def : InstRW<[M3WriteZ0],   (instrs ADR, ADRP)>;
+def : InstRW<[M3WriteZ0],   (instregex "^MOV[NZ][WX]i")>;
 
 // Divide and multiply instructions.
 
 // Miscellaneous instructions.
-def : InstRW<[M3WriteAY], (instrs EXTRWrri, EXTRXrri)>;
 
 // Load instructions.
 def : InstRW<[M3WriteLD,
               WriteLDHi,
               WriteAdr],    (instregex "^LDP(SW|W|X)(post|pre)")>;
+def : InstRW<[M3WriteLB,
+              ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>;
 def : InstRW<[M3WriteLX,
-              ReadAdrBase], (instregex "^PRFMro[WX]")>;
+              ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>;
+def : InstRW<[M3WriteLB,
+              ReadAdrBase], (instrs PRFMroW)>;
+def : InstRW<[M3WriteLX,
+              ReadAdrBase], (instrs PRFMroX)>;
 
 // Store instructions.
+def : InstRW<[M3WriteSB,
+              ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>;
+def : InstRW<[WriteST,
+              ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>;
 
 // FP data instructions.
 def : InstRW<[M3WriteNSHF1],  (instregex "^FABS[DS]r")>;
@@ -553,9 +553,11 @@ def : InstRW<[WriteVLD],    (instregex "^LDUR[BDHSQ]i")>;
 def : InstRW<[WriteVLD,
               WriteAdr],    (instregex "^LDR[BDHSQ](post|pre)")>;
 def : InstRW<[WriteVLD],    (instregex "^LDR[BDHSQ]ui")>;
-def : InstRW<[M3WriteLX,
-              ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>;
-def : InstRW<[M3WriteLB,
+def : InstRW<[M3WriteLE,
+              ReadAdrBase], (instregex "^LDR[BDHS]roW")>;
+def : InstRW<[WriteVLD,
+              ReadAdrBase], (instregex "^LDR[BDHS]roX")>;
+def : InstRW<[M3WriteLE,
               ReadAdrBase], (instregex "^LDRQro[WX]")>;
 def : InstRW<[WriteVLD,
               M3WriteLH],   (instregex "^LDN?P[DS]i")>;
@@ -573,8 +575,10 @@ def : InstRW<[WriteVST],    (instregex "^STUR[BDHSQ]i")>;
 def : InstRW<[WriteVST,
               WriteAdr],    (instregex "^STR[BDHSQ](post|pre)")>;
 def : InstRW<[WriteVST],    (instregex "^STR[BDHSQ]ui")>;
-def : InstRW<[M3WriteSY,
-              ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>;
+def : InstRW<[M3WriteSA,
+              ReadAdrBase], (instregex "^STR[BDHS]roW")>;
+def : InstRW<[WriteVST,
+              ReadAdrBase], (instregex "^STR[BDHS]roX")>;
 def : InstRW<[M3WriteSA,
               ReadAdrBase], (instregex "^STRQro[WX]")>;
 def : InstRW<[WriteVST],    (instregex "^STN?P[DSQ]i")>;
diff --git a/lib/Target/AArch64/AArch64SchedPredExynos.td b/lib/Target/AArch64/AArch64SchedPredExynos.td
new file mode 100644
index 0000000000000000000000000000000000000000..f8533d18022af3108248f201451a0301b66f34b0
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedPredExynos.td
@@ -0,0 +1,139 @@
+//===- AArch64SchedPredExynos.td - AArch64 Sched Preds -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are used by the
+// AArch64 Exynos processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Auxiliary predicates.
+
+// Check the shift in arithmetic and logic instructions.
+def ExynosCheckShift : CheckAny<[CheckShiftBy0,
+                                 CheckAll<
+                                   [CheckShiftLSL,
+                                    CheckAny<
+                                      [CheckShiftBy1,
+                                       CheckShiftBy2,
+                                       CheckShiftBy3]>]>]>;
+
+// Exynos predicates.
+
+// Identify BLR specifying the LR register as the indirect target register.
+def ExynosBranchLinkLRPred : MCSchedPredicate<
+                               CheckAll<[CheckOpcode<[BLR]>,
+                                         CheckRegOperand<0, LR>]>>;
+
+// Identify arithmetic and logic instructions without or with limited extension.
+def ExynosExtFn   : TIIPredicate<
+                      "isExynosExtFast",
+                      MCOpcodeSwitchStatement<
+                        [MCOpcodeSwitchCase<
+                           IsArithExtOp.ValidOpcodes,
+                           MCReturnStatement<
+                             CheckAny<[CheckExtBy0,
+                                       CheckAll<
+                                         [CheckAny<
+                                           [CheckExtUXTW,
+                                            CheckExtUXTX]>,
+                                          CheckAny<
+                                            [CheckExtBy1,
+                                             CheckExtBy2,
+                                             CheckExtBy3]>]>]>>>],
+                        MCReturnStatement<FalsePred>>>;
+def ExynosExtPred : MCSchedPredicate<ExynosExtFn>;
+
+// Identify a load or store using the register offset addressing mode
+// with a scaled non-extended register.
+def ExynosScaledIdxFn   : TIIPredicate<"isExynosScaledAddr",
+                                       MCOpcodeSwitchStatement<
+                                         [MCOpcodeSwitchCase<
+                                            IsLoadStoreRegOffsetOp.ValidOpcodes,
+                                            MCReturnStatement<
+                                              CheckAny<
+                                                [CheckMemExtSXTW,
+                                                 CheckMemExtUXTW,
+                                                 CheckMemScaled]>>>],
+                                         MCReturnStatement<FalsePred>>>;
+def ExynosScaledIdxPred : MCSchedPredicate<ExynosScaledIdxFn>;
+
+// Identify FP instructions.
+def ExynosFPPred : MCSchedPredicate<CheckAny<[CheckDForm, CheckQForm]>>;
+
+// Identify whether an instruction whose result is a long vector
+// operates on the upper half of the input registers.
+def ExynosLongVectorUpperFn   : TIIPredicate<
+                                  "isExynosLongVectorUpper",
+                                  MCOpcodeSwitchStatement<
+                                  [MCOpcodeSwitchCase<
+                                    IsLongVectorUpperOp.ValidOpcodes,
+                                    MCReturnStatement<TruePred>>],
+                                  MCReturnStatement<FalsePred>>>;
+def ExynosLongVectorUpperPred : MCSchedPredicate<ExynosLongVectorUpperFn>;
+
+// Identify 128-bit NEON instructions.
+def ExynosQFormPred : MCSchedPredicate<CheckQForm>;
+
+// Identify instructions that reset a register efficiently.
+def ExynosResetFn   : TIIPredicate<
+                        "isExynosResetFast",
+                        MCOpcodeSwitchStatement<
+                          [MCOpcodeSwitchCase<
+                             [ADR, ADRP,
+                              MOVNWi, MOVNXi,
+                              MOVZWi, MOVZXi],
+                             MCReturnStatement<TruePred>>],
+                          MCReturnStatement<
+                            CheckAny<
+                              [IsCopyIdiomFn,
+                               IsZeroFPIdiomFn,
+                               IsZeroIdiomFn]>>>>;
+def ExynosResetPred : MCSchedPredicate<ExynosResetFn>;
+
+// Identify EXTR as the alias for ROR (immediate).
+def ExynosRotateRightImmPred : MCSchedPredicate<
+                                 CheckAll<[CheckOpcode<[EXTRWrri, EXTRXrri]>,
+                                           CheckSameRegOperand<1, 2>]>>;
+
+// Identify arithmetic and logic instructions with limited shift.
+def ExynosShiftFn   : TIIPredicate<
+                        "isExynosShiftFast",
+                        MCOpcodeSwitchStatement<
+                          [MCOpcodeSwitchCase<
+                             IsArithLogicShiftOp.ValidOpcodes,
+                             MCReturnStatement<ExynosCheckShift>>],
+                          MCReturnStatement<FalsePred>>>;
+def ExynosShiftPred : MCSchedPredicate<ExynosShiftFn>;
+
+// Identify more arithmetic and logic instructions with limited shift.
+def ExynosShiftExFn   : TIIPredicate<
+                          "isExynosShiftExFast",
+                          MCOpcodeSwitchStatement<
+                            [MCOpcodeSwitchCase<
+                               IsArithLogicShiftOp.ValidOpcodes,
+                               MCReturnStatement<
+                                 CheckAny<
+                                   [CheckAll<
+                                     [CheckShiftLSL,
+                                      CheckShiftBy8]>,
+                                    ExynosCheckShift]>>>],
+                            MCReturnStatement<FalsePred>>>;
+def ExynosShiftExPred : MCSchedPredicate<ExynosShiftExFn>;
+
+
+// Identify arithmetic and logic immediate instructions.
+def ExynosCheapFn : TIIPredicate<
+                      "isExynosCheapAsMove",
+                      MCOpcodeSwitchStatement<
+                        [MCOpcodeSwitchCase<
+                           IsArithLogicImmOp.ValidOpcodes,
+                           MCReturnStatement<TruePred>>],
+                        MCReturnStatement<
+                          CheckAny<
+                            [ExynosExtFn, ExynosResetFn, ExynosShiftFn]>>>>;
diff --git a/lib/Target/AArch64/AArch64SchedPredicates.td b/lib/Target/AArch64/AArch64SchedPredicates.td
new file mode 100644
index 0000000000000000000000000000000000000000..a48f1dcffaffadd70758855a6735d01ee177af32
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedPredicates.td
@@ -0,0 +1,370 @@
+//===- AArch64SchedPredicates.td - AArch64 Sched Preds -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are used by the
+// AArch64 subtargets.
+//
+//===----------------------------------------------------------------------===//
+
+// Function mappers.
+
+// Check the extension type in arithmetic instructions.
+let FunctionMapper = "AArch64_AM::getArithExtendType" in {
+  def CheckExtUXTB                      : CheckImmOperand_s<3, "AArch64_AM::UXTB">;
+  def CheckExtUXTH                      : CheckImmOperand_s<3, "AArch64_AM::UXTH">;
+  def CheckExtUXTW                      : CheckImmOperand_s<3, "AArch64_AM::UXTW">;
+  def CheckExtUXTX                      : CheckImmOperand_s<3, "AArch64_AM::UXTX">;
+  def CheckExtSXTB                      : CheckImmOperand_s<3, "AArch64_AM::SXTB">;
+  def CheckExtSXTH                      : CheckImmOperand_s<3, "AArch64_AM::SXTH">;
+  def CheckExtSXTW                      : CheckImmOperand_s<3, "AArch64_AM::SXTW">;
+  def CheckExtSXTX                      : CheckImmOperand_s<3, "AArch64_AM::SXTX">;
+}
+
+// Check for shifting in extended arithmetic instructions.
+foreach I = {0-3} in {
+  let FunctionMapper = "AArch64_AM::getArithShiftValue" in
+  def CheckExtBy#I                      : CheckImmOperand<3, I>;
+}
+
+// Check the extension type in the register offset addressing mode.
+let FunctionMapper = "AArch64_AM::getMemExtendType" in {
+  def CheckMemExtUXTW                   : CheckImmOperand_s<3, "AArch64_AM::UXTW">;
+  def CheckMemExtLSL                    : CheckImmOperand_s<3, "AArch64_AM::UXTX">;
+  def CheckMemExtSXTW                   : CheckImmOperand_s<3, "AArch64_AM::SXTW">;
+  def CheckMemExtSXTX                   : CheckImmOperand_s<3, "AArch64_AM::SXTX">;
+}
+
+// Check for scaling in the register offset addressing mode.
+let FunctionMapper = "AArch64_AM::getMemDoShift" in
+def CheckMemScaled                      : CheckImmOperandSimple<3>;
+
+// Check the shifting type in arithmetic and logic instructions.
+let FunctionMapper = "AArch64_AM::getShiftType" in {
+  def CheckShiftLSL                : CheckImmOperand_s<3, "AArch64_AM::LSL">;
+  def CheckShiftLSR                : CheckImmOperand_s<3, "AArch64_AM::LSR">;
+  def CheckShiftASR                : CheckImmOperand_s<3, "AArch64_AM::ASR">;
+  def CheckShiftROR                : CheckImmOperand_s<3, "AArch64_AM::ROR">;
+  def CheckShiftMSL                : CheckImmOperand_s<3, "AArch64_AM::MSL">;
+}
+
+// Check for shifting in arithmetic and logic instructions.
+foreach I = {0-3, 8} in {
+  let FunctionMapper = "AArch64_AM::getShiftValue" in
+  def CheckShiftBy#I        : CheckImmOperand<3, I>;
+}
+
+// Generic predicates.
+
+// Identify whether an instruction is the 64-bit NEON form based on its result.
+def CheckDForm             : CheckAll<[CheckIsRegOperand<0>,
+                                       CheckAny<[CheckRegOperand<0, D0>,
+                                                 CheckRegOperand<0, D1>,
+                                                 CheckRegOperand<0, D2>,
+                                                 CheckRegOperand<0, D3>,
+                                                 CheckRegOperand<0, D4>,
+                                                 CheckRegOperand<0, D5>,
+                                                 CheckRegOperand<0, D6>,
+                                                 CheckRegOperand<0, D7>,
+                                                 CheckRegOperand<0, D8>,
+                                                 CheckRegOperand<0, D9>,
+                                                 CheckRegOperand<0, D10>,
+                                                 CheckRegOperand<0, D11>,
+                                                 CheckRegOperand<0, D12>,
+                                                 CheckRegOperand<0, D13>,
+                                                 CheckRegOperand<0, D14>,
+                                                 CheckRegOperand<0, D15>,
+                                                 CheckRegOperand<0, D16>,
+                                                 CheckRegOperand<0, D17>,
+                                                 CheckRegOperand<0, D18>,
+                                                 CheckRegOperand<0, D19>,
+                                                 CheckRegOperand<0, D20>,
+                                                 CheckRegOperand<0, D21>,
+                                                 CheckRegOperand<0, D22>,
+                                                 CheckRegOperand<0, D23>,
+                                                 CheckRegOperand<0, D24>,
+                                                 CheckRegOperand<0, D25>,
+                                                 CheckRegOperand<0, D26>,
+                                                 CheckRegOperand<0, D27>,
+                                                 CheckRegOperand<0, D28>,
+                                                 CheckRegOperand<0, D29>,
+                                                 CheckRegOperand<0, D30>,
+                                                 CheckRegOperand<0, D31>]>]>;
+
+// Identify whether an instruction is the 128-bit NEON form based on its result.
+def CheckQForm             : CheckAll<[CheckIsRegOperand<0>,
+                                       CheckAny<[CheckRegOperand<0, Q0>,
+                                                 CheckRegOperand<0, Q1>,
+                                                 CheckRegOperand<0, Q2>,
+                                                 CheckRegOperand<0, Q3>,
+                                                 CheckRegOperand<0, Q4>,
+                                                 CheckRegOperand<0, Q5>,
+                                                 CheckRegOperand<0, Q6>,
+                                                 CheckRegOperand<0, Q7>,
+                                                 CheckRegOperand<0, Q8>,
+                                                 CheckRegOperand<0, Q9>,
+                                                 CheckRegOperand<0, Q10>,
+                                                 CheckRegOperand<0, Q11>,
+                                                 CheckRegOperand<0, Q12>,
+                                                 CheckRegOperand<0, Q13>,
+                                                 CheckRegOperand<0, Q14>,
+                                                 CheckRegOperand<0, Q15>,
+                                                 CheckRegOperand<0, Q16>,
+                                                 CheckRegOperand<0, Q17>,
+                                                 CheckRegOperand<0, Q18>,
+                                                 CheckRegOperand<0, Q19>,
+                                                 CheckRegOperand<0, Q20>,
+                                                 CheckRegOperand<0, Q21>,
+                                                 CheckRegOperand<0, Q22>,
+                                                 CheckRegOperand<0, Q23>,
+                                                 CheckRegOperand<0, Q24>,
+                                                 CheckRegOperand<0, Q25>,
+                                                 CheckRegOperand<0, Q26>,
+                                                 CheckRegOperand<0, Q27>,
+                                                 CheckRegOperand<0, Q28>,
+                                                 CheckRegOperand<0, Q29>,
+                                                 CheckRegOperand<0, Q30>,
+                                                 CheckRegOperand<0, Q31>]>]>;
+
+// Identify arithmetic instructions with extend.
+def IsArithExtOp           : CheckOpcode<[ADDWrx, ADDXrx, ADDSWrx, ADDSXrx,
+                                          SUBWrx, SUBXrx, SUBSWrx, SUBSXrx,
+                                          ADDXrx64, ADDSXrx64,
+                                          SUBXrx64, SUBSXrx64]>;
+
+// Identify arithmetic immediate instructions.
+def IsArithImmOp           : CheckOpcode<[ADDWri, ADDXri, ADDSWri, ADDSXri,
+                                          SUBWri, SUBXri, SUBSWri, SUBSXri]>;
+
+// Identify arithmetic instructions with shift.
+def IsArithShiftOp         : CheckOpcode<[ADDWrs, ADDXrs, ADDSWrs, ADDSXrs,
+                                          SUBWrs, SUBXrs, SUBSWrs, SUBSXrs]>;
+
+// Identify arithmetic instructions without shift.
+def IsArithUnshiftOp       : CheckOpcode<[ADDWrr, ADDXrr, ADDSWrr, ADDSXrr,
+                                          SUBWrr, SUBXrr, SUBSWrr, SUBSXrr]>;
+
+// Identify logic immediate instructions.
+def IsLogicImmOp           : CheckOpcode<[ANDWri, ANDXri,
+                                          EORWri, EORXri,
+                                          ORRWri, ORRXri]>;
+
+// Identify logic instructions with shift.
+def IsLogicShiftOp         : CheckOpcode<[ANDWrs, ANDXrs, ANDSWrs, ANDSXrs,
+                                          BICWrs, BICXrs, BICSWrs, BICSXrs,
+                                          EONWrs, EONXrs,
+                                          EORWrs, EORXrs,
+                                          ORNWrs, ORNXrs,
+                                          ORRWrs, ORRXrs]>;
+
+// Identify logic instructions without shift.
+def IsLogicUnshiftOp       : CheckOpcode<[ANDWrr, ANDXrr, ANDSWrr, ANDSXrr,
+                                          BICWrr, BICXrr, BICSWrr, BICSXrr,
+                                          EONWrr, EONXrr,
+                                          EORWrr, EORXrr,
+                                          ORNWrr, ORNXrr,
+                                          ORRWrr, ORRXrr]>;
+
+// Identify arithmetic and logic immediate instructions.
+def IsArithLogicImmOp      : CheckOpcode<!listconcat(IsArithImmOp.ValidOpcodes,
+                                                     IsLogicImmOp.ValidOpcodes)>;
+
+// Identify arithmetic and logic instructions with shift.
+def IsArithLogicShiftOp    : CheckOpcode<!listconcat(IsArithShiftOp.ValidOpcodes,
+                                                     IsLogicShiftOp.ValidOpcodes)>;
+
+// Identify arithmetic and logic instructions without shift.
+def IsArithLogicUnshiftOp  : CheckOpcode<!listconcat(IsArithUnshiftOp.ValidOpcodes,
+                                                     IsLogicUnshiftOp.ValidOpcodes)>;
+
+// Identify whether an instruction whose result is a long vector
+// operates on the upper half of the input registers.
+def IsLongVectorUpperOp    : CheckOpcode<[FCVTLv8i16, FCVTLv4i32,
+                                          FCVTNv8i16, FCVTNv4i32,
+                                          FCVTXNv4f32,
+                                          PMULLv16i8, PMULLv2i64,
+                                          RADDHNv8i16_v16i8, RADDHNv4i32_v8i16, RADDHNv2i64_v4i32,
+                                          RSHRNv16i8_shift, RSHRNv8i16_shift, RSHRNv4i32_shift,
+                                          RSUBHNv8i16_v16i8, RSUBHNv4i32_v8i16, RSUBHNv2i64_v4i32,
+                                          SABALv16i8_v8i16, SABALv8i16_v4i32, SABALv4i32_v2i64,
+                                          SABDLv16i8_v8i16, SABDLv8i16_v4i32, SABDLv4i32_v2i64,
+                                          SADDLv16i8_v8i16, SADDLv8i16_v4i32, SADDLv4i32_v2i64,
+                                          SADDWv16i8_v8i16, SADDWv8i16_v4i32, SADDWv4i32_v2i64,
+                                          SHLLv16i8, SHLLv8i16, SHLLv4i32,
+                                          SHRNv16i8_shift, SHRNv8i16_shift, SHRNv4i32_shift,
+                                          SMLALv16i8_v8i16, SMLALv8i16_v4i32, SMLALv4i32_v2i64,
+                                          SMLALv8i16_indexed, SMLALv4i32_indexed,
+                                          SMLSLv16i8_v8i16, SMLSLv8i16_v4i32, SMLSLv4i32_v2i64,
+                                          SMLSLv8i16_indexed, SMLSLv4i32_indexed,
+                                          SMULLv16i8_v8i16, SMULLv8i16_v4i32, SMULLv4i32_v2i64,
+                                          SMULLv8i16_indexed, SMULLv4i32_indexed,
+                                          SQDMLALv8i16_v4i32, SQDMLALv4i32_v2i64,
+                                          SQDMLALv8i16_indexed, SQDMLALv4i32_indexed,
+                                          SQDMLSLv8i16_v4i32, SQDMLSLv4i32_v2i64,
+                                          SQDMLSLv8i16_indexed, SQDMLSLv4i32_indexed,
+                                          SQDMULLv8i16_v4i32, SQDMULLv4i32_v2i64,
+                                          SQDMULLv8i16_indexed, SQDMULLv4i32_indexed,
+                                          SQRSHRNv16i8_shift, SQRSHRNv8i16_shift, SQRSHRNv4i32_shift,
+                                          SQRSHRUNv16i8_shift, SQRSHRUNv8i16_shift, SQRSHRUNv4i32_shift,
+                                          SQSHRNv16i8_shift, SQSHRNv8i16_shift, SQSHRNv4i32_shift,
+                                          SQSHRUNv16i8_shift, SQSHRUNv8i16_shift, SQSHRUNv4i32_shift,
+                                          SQXTNv16i8, SQXTNv8i16, SQXTNv4i32,
+                                          SQXTUNv16i8, SQXTUNv8i16, SQXTUNv4i32,
+                                          SSHLLv16i8_shift, SSHLLv8i16_shift, SSHLLv4i32_shift,
+                                          SSUBLv16i8_v8i16, SSUBLv8i16_v4i32, SSUBLv4i32_v2i64,
+                                          SSUBWv16i8_v8i16, SSUBWv8i16_v4i32, SSUBWv4i32_v2i64,
+                                          UABALv16i8_v8i16, UABALv8i16_v4i32, UABALv4i32_v2i64,
+                                          UABDLv16i8_v8i16, UABDLv8i16_v4i32, UABDLv4i32_v2i64,
+                                          UADDLv16i8_v8i16, UADDLv8i16_v4i32, UADDLv4i32_v2i64,
+                                          UADDWv16i8_v8i16, UADDWv8i16_v4i32, UADDWv4i32_v2i64,
+                                          UMLALv16i8_v8i16, UMLALv8i16_v4i32, UMLALv4i32_v2i64,
+                                          UMLALv8i16_indexed, UMLALv4i32_indexed,
+                                          UMLSLv16i8_v8i16, UMLSLv8i16_v4i32, UMLSLv4i32_v2i64,
+                                          UMLSLv8i16_indexed, UMLSLv4i32_indexed,
+                                          UMULLv16i8_v8i16, UMULLv8i16_v4i32, UMULLv4i32_v2i64,
+                                          UMULLv8i16_indexed, UMULLv4i32_indexed,
+                                          UQSHRNv16i8_shift, UQSHRNv8i16_shift, UQSHRNv4i32_shift,
+                                          UQXTNv16i8, UQXTNv8i16, UQXTNv4i32,
+                                          USHLLv16i8_shift, USHLLv8i16_shift, USHLLv4i32_shift,
+                                          USUBLv16i8_v8i16, USUBLv8i16_v4i32, USUBLv4i32_v2i64,
+                                          USUBWv16i8_v8i16, USUBWv8i16_v4i32, USUBWv4i32_v2i64,
+                                          XTNv16i8, XTNv8i16, XTNv4i32]>;
+
+// Identify whether an instruction is a load
+// using the register offset addressing mode.
+def IsLoadRegOffsetOp      : CheckOpcode<[PRFMroW, PRFMroX,
+                                          LDRBBroW, LDRBBroX,
+                                          LDRSBWroW, LDRSBWroX, LDRSBXroW, LDRSBXroX,
+                                          LDRHHroW, LDRHHroX,
+                                          LDRSHWroW, LDRSHWroX, LDRSHXroW, LDRSHXroX,
+                                          LDRWroW, LDRWroX,
+                                          LDRSWroW, LDRSWroX,
+                                          LDRXroW, LDRXroX,
+                                          LDRBroW, LDRBroX,
+                                          LDRHroW, LDRHroX,
+                                          LDRSroW, LDRSroX,
+                                          LDRDroW, LDRDroX]>;
+
+// Identify whether an instruction is a load
+// using the register offset addressing mode.
+def IsStoreRegOffsetOp     : CheckOpcode<[STRBBroW, STRBBroX,
+                                          STRHHroW, STRHHroX,
+                                          STRWroW, STRWroX,
+                                          STRXroW, STRXroX,
+                                          STRBroW, STRBroX,
+                                          STRHroW, STRHroX,
+                                          STRSroW, STRSroX,
+                                          STRDroW, STRDroX]>;
+
+// Identify whether an instruction is a load or
+// store using the register offset addressing mode.
+def IsLoadStoreRegOffsetOp : CheckOpcode<!listconcat(IsLoadRegOffsetOp.ValidOpcodes,
+                                                     IsStoreRegOffsetOp.ValidOpcodes)>;
+
+// Target predicates.
+
+// Identify an instruction that effectively transfers a register to another.
+def IsCopyIdiomFn     : TIIPredicate<"isCopyIdiom",
+                                     MCOpcodeSwitchStatement<
+                                       [// MOV {Rd, SP}, {SP, Rn} =>
+                                        // ADD {Rd, SP}, {SP, Rn}, #0
+                                        MCOpcodeSwitchCase<
+                                          [ADDWri, ADDXri],
+                                          MCReturnStatement<
+                                            CheckAll<
+                                              [CheckIsRegOperand<0>,
+                                               CheckIsRegOperand<1>,
+                                               CheckAny<
+                                                 [CheckRegOperand<0, WSP>,
+                                                  CheckRegOperand<0, SP>,
+                                                  CheckRegOperand<1, WSP>,
+                                                  CheckRegOperand<1, SP>]>,
+                                               CheckZeroOperand<2>]>>>,
+                                        // MOV Rd, Rm =>
+                                        // ORR Rd, ZR, Rm, LSL #0
+                                        MCOpcodeSwitchCase<
+                                          [ORRWrs, ORRXrs],
+                                          MCReturnStatement<
+                                            CheckAll<
+                                              [CheckIsRegOperand<1>,
+                                               CheckIsRegOperand<2>,
+                                               CheckAny<
+                                                 [CheckRegOperand<1, WZR>,
+                                                  CheckRegOperand<1, XZR>,
+                                                  CheckRegOperand<2, WZR>,
+                                                  CheckRegOperand<2, XZR>]>,
+                                               CheckShiftBy0]>>>],
+                                       MCReturnStatement<FalsePred>>>;
+def IsCopyIdiomPred   : MCSchedPredicate<IsCopyIdiomFn>;
+
+// Identify arithmetic instructions with an extended register.
+def RegExtendedFn     : TIIPredicate<"hasExtendedReg",
+                                     MCOpcodeSwitchStatement<
+                                       [MCOpcodeSwitchCase<
+                                         IsArithExtOp.ValidOpcodes,
+                                         MCReturnStatement<
+                                           CheckNot<CheckZeroOperand<3>>>>],
+                                       MCReturnStatement<FalsePred>>>;
+def RegExtendedPred   : MCSchedPredicate<RegExtendedFn>;
+
+// Identify arithmetic and logic instructions with a shifted register.
+def RegShiftedFn      : TIIPredicate<"hasShiftedReg",
+                                     MCOpcodeSwitchStatement<
+                                       [MCOpcodeSwitchCase<
+                                          IsArithLogicShiftOp.ValidOpcodes,
+                                          MCReturnStatement<
+                                            CheckNot<CheckZeroOperand<3>>>>],
+                                       MCReturnStatement<FalsePred>>>;
+def RegShiftedPred    : MCSchedPredicate<RegShiftedFn>;
+
+// Identify a load or store using the register offset addressing mode
+// with an extended or scaled register.
+def ScaledIdxFn       : TIIPredicate<"isScaledAddr",
+                                     MCOpcodeSwitchStatement<
+                                       [MCOpcodeSwitchCase<
+                                          IsLoadStoreRegOffsetOp.ValidOpcodes,
+                                          MCReturnStatement<
+                                            CheckAny<[CheckNot<CheckMemExtLSL>,
+                                                      CheckMemScaled]>>>],
+                                       MCReturnStatement<FalsePred>>>;
+def ScaledIdxPred     : MCSchedPredicate<ScaledIdxFn>;
+
+// Identify an instruction that effectively resets a FP register to zero.
+def IsZeroFPIdiomFn   : TIIPredicate<"isZeroFPIdiom",
+                                     MCOpcodeSwitchStatement<
+                                       [// MOVI Vd, #0
+                                        MCOpcodeSwitchCase<
+                                          [MOVIv8b_ns, MOVIv16b_ns,
+                                           MOVID, MOVIv2d_ns],
+                                          MCReturnStatement<CheckZeroOperand<1>>>,
+                                        // MOVI Vd, #0, LSL #0
+                                        MCOpcodeSwitchCase<
+                                          [MOVIv4i16, MOVIv8i16,
+                                           MOVIv2i32, MOVIv4i32],
+                                          MCReturnStatement<
+                                            CheckAll<
+                                              [CheckZeroOperand<1>,
+                                               CheckZeroOperand<2>]>>>],
+                                       MCReturnStatement<FalsePred>>>;
+def IsZeroFPIdiomPred : MCSchedPredicate<IsZeroFPIdiomFn>;
+
+// Identify an instruction that effectively resets a GP register to zero.
+def IsZeroIdiomFn     : TIIPredicate<"isZeroIdiom",
+                                    MCOpcodeSwitchStatement<
+                                      [// ORR Rd, ZR, #0
+                                       MCOpcodeSwitchCase<
+                                         [ORRWri, ORRXri],
+                                         MCReturnStatement<
+                                           CheckAll<
+                                             [CheckIsRegOperand<1>,
+                                              CheckAny<
+                                                [CheckRegOperand<1, WZR>,
+                                                 CheckRegOperand<1, XZR>]>,
+                                              CheckZeroOperand<2>]>>>],
+                                      MCReturnStatement<FalsePred>>>;
+def IsZeroIdiomPred   : MCSchedPredicate<IsZeroIdiomFn>;
diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td
index ce81f48acf71282a63d6eebf5cbea23b72419b9d..f55ba4d42fce18b95b7149a7e44115ee667cd5ee 100644
--- a/lib/Target/AArch64/AArch64Schedule.td
+++ b/lib/Target/AArch64/AArch64Schedule.td
@@ -50,17 +50,6 @@ def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
 def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
 def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.
 
-// Predicate for determining when a shiftable register is shifted.
-def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>;
-
-// Predicate for determining when a extendedable register is extended.
-def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>;
-
-// ScaledIdxPred is true if a WriteLDIdx operand will be
-// scaled. Subtargets can use this to dynamically select resources and
-// latency for WriteLDIdx and ReadAdrBase.
-def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>;
-
 // Serialized two-level address load.
 // EXAMPLE: LOADGot
 def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
diff --git a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f8ef5ee6eae58a1454e05f5cf2e3e309154fa7b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp
@@ -0,0 +1,368 @@
+//===- AArch64SpeculationHardening.cpp - Harden Against Missspeculation  --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass to insert code to mitigate against side channel
+// vulnerabilities that may happen under control flow miss-speculation.
+//
+// The pass implements tracking of control flow miss-speculation into a "taint"
+// register. That taint register can then be used to mask off registers with
+// sensitive data when executing under miss-speculation, a.k.a. "transient
+// execution".
+// This pass is aimed at mitigating against SpectreV1-style vulnarabilities.
+//
+// At the moment, it implements the tracking of miss-speculation of control
+// flow into a taint register, but doesn't implement a mechanism yet to then
+// use that taint register to mask of vulnerable data in registers (something
+// for a follow-on improvement). Possible strategies to mask out vulnerable
+// data that can be implemented on top of this are:
+// - speculative load hardening to automatically mask of data loaded
+//   in registers.
+// - using intrinsics to mask of data in registers as indicated by the
+//   programmer (see https://lwn.net/Articles/759423/).
+//
+// For AArch64, the following implementation choices are made below.
+// Some of these are different than the implementation choices made in
+// the similar pass implemented in X86SpeculativeLoadHardening.cpp, as
+// the instruction set characteristics result in different trade-offs.
+// - The speculation hardening is done after register allocation. With a
+//   relative abundance of registers, one register is reserved (X16) to be
+//   the taint register. X16 is expected to not clash with other register
+//   reservation mechanisms with very high probability because:
+//   . The AArch64 ABI doesn't guarantee X16 to be retained across any call.
+//   . The only way to request X16 to be used as a programmer is through
+//     inline assembly. In the rare case a function explicitly demands to
+//     use X16/W16, this pass falls back to hardening against speculation
+//     by inserting a DSB SYS/ISB barrier pair which will prevent control
+//     flow speculation.
+// - It is easy to insert mask operations at this late stage as we have
+//   mask operations available that don't set flags.
+// - The taint variable contains all-ones when no miss-speculation is detected,
+//   and contains all-zeros when miss-speculation is detected. Therefore, when
+//   masking, an AND instruction (which only changes the register to be masked,
+//   no other side effects) can easily be inserted anywhere that's needed.
+// - The tracking of miss-speculation is done by using a data-flow conditional
+//   select instruction (CSEL) to evaluate the flags that were also used to
+//   make conditional branch direction decisions. Speculation of the CSEL
+//   instruction can be limited with a CSDB instruction - so the combination of
+//   CSEL + a later CSDB gives the guarantee that the flags as used in the CSEL
+//   aren't speculated. When conditional branch direction gets miss-speculated,
+//   the semantics of the inserted CSEL instruction is such that the taint
+//   register will contain all zero bits.
+//   One key requirement for this to work is that the conditional branch is
+//   followed by an execution of the CSEL instruction, where the CSEL
+//   instruction needs to use the same flags status as the conditional branch.
+//   This means that the conditional branches must not be implemented as one
+//   of the AArch64 conditional branches that do not use the flags as input
+//   (CB(N)Z and TB(N)Z). This is implemented by ensuring in the instruction
+//   selectors to not produce these instructions when speculation hardening
+//   is enabled. This pass will assert if it does encounter such an instruction.
+// - On function call boundaries, the miss-speculation state is transferred from
+//   the taint register X16 to be encoded in the SP register as value 0.
+//
+// Future extensions/improvements could be:
+// - Implement this functionality using full speculation barriers, akin to the
+//   x86-slh-lfence option. This may be more useful for the intrinsics-based
+//   approach than for the SLH approach to masking.
+//   Note that this pass already inserts the full speculation barriers if the
+//   function for some niche reason makes use of X16/W16.
+// - no indirect branch misprediction gets protected/instrumented; but this
+//   could be done for some indirect branches, such as switch jump tables.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-speculation-hardening"
+
+#define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass"
+
+namespace {
+
+class AArch64SpeculationHardening : public MachineFunctionPass {
+public:
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+
+  static char ID;
+
+  AArch64SpeculationHardening() : MachineFunctionPass(ID) {
+    initializeAArch64SpeculationHardeningPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override {
+    return AARCH64_SPECULATION_HARDENING_NAME;
+  }
+
+private:
+  unsigned MisspeculatingTaintReg;
+  bool UseControlFlowSpeculationBarrier;
+
+  bool functionUsesHardeningRegister(MachineFunction &MF) const;
+  bool instrumentControlFlow(MachineBasicBlock &MBB);
+  bool endsWithCondControlFlow(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                               MachineBasicBlock *&FBB,
+                               AArch64CC::CondCode &CondCode) const;
+  void insertTrackingCode(MachineBasicBlock &SplitEdgeBB,
+                          AArch64CC::CondCode &CondCode, DebugLoc DL) const;
+  void insertSPToRegTaintPropagation(MachineBasicBlock *MBB,
+                                     MachineBasicBlock::iterator MBBI) const;
+  void insertRegToSPTaintPropagation(MachineBasicBlock *MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     unsigned TmpReg) const;
+};
+
+} // end anonymous namespace
+
+char AArch64SpeculationHardening::ID = 0;
+
+INITIALIZE_PASS(AArch64SpeculationHardening, "aarch64-speculation-hardening",
+                AARCH64_SPECULATION_HARDENING_NAME, false, false)
+
+bool AArch64SpeculationHardening::endsWithCondControlFlow(
+    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+    AArch64CC::CondCode &CondCode) const {
+  SmallVector<MachineOperand, 1> analyzeBranchCondCode;
+  if (TII->analyzeBranch(MBB, TBB, FBB, analyzeBranchCondCode, false))
+    return false;
+
+  // Ignore if the BB ends in an unconditional branch/fall-through.
+  if (analyzeBranchCondCode.empty())
+    return false;
+
+  // If the BB ends with a single conditional branch, FBB will be set to
+  // nullptr (see API docs for TII->analyzeBranch). For the rest of the
+  // analysis we want the FBB block to be set always.
+  assert(TBB != nullptr);
+  if (FBB == nullptr)
+    FBB = MBB.getFallThrough();
+
+  // If both the true and the false condition jump to the same basic block,
+  // there isn't need for any protection - whether the branch is speculated
+  // correctly or not, we end up executing the architecturally correct code.
+  if (TBB == FBB)
+    return false;
+
+  assert(MBB.succ_size() == 2);
+  // translate analyzeBranchCondCode to CondCode.
+  assert(analyzeBranchCondCode.size() == 1 && "unknown Cond array format");
+  CondCode = AArch64CC::CondCode(analyzeBranchCondCode[0].getImm());
+  return true;
+}
+
+void AArch64SpeculationHardening::insertTrackingCode(
+    MachineBasicBlock &SplitEdgeBB, AArch64CC::CondCode &CondCode,
+    DebugLoc DL) const {
+  if (UseControlFlowSpeculationBarrier) {
+    // insert full control flow speculation barrier (DSB SYS + ISB)
+    BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::ISB))
+        .addImm(0xf);
+    BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::DSB))
+        .addImm(0xf);
+  } else {
+    BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::CSELXr))
+        .addDef(MisspeculatingTaintReg)
+        .addUse(MisspeculatingTaintReg)
+        .addUse(AArch64::XZR)
+        .addImm(CondCode);
+    SplitEdgeBB.addLiveIn(AArch64::NZCV);
+  }
+}
+
+bool AArch64SpeculationHardening::instrumentControlFlow(
+    MachineBasicBlock &MBB) {
+  LLVM_DEBUG(dbgs() << "Instrument control flow tracking on MBB: " << MBB);
+
+  bool Modified = false;
+  MachineBasicBlock *TBB = nullptr;
+  MachineBasicBlock *FBB = nullptr;
+  AArch64CC::CondCode CondCode;
+
+  if (!endsWithCondControlFlow(MBB, TBB, FBB, CondCode)) {
+    LLVM_DEBUG(dbgs() << "... doesn't end with CondControlFlow\n");
+  } else {
+    // Now insert:
+    // "CSEL MisSpeculatingR, MisSpeculatingR, XZR, cond" on the True edge and
+    // "CSEL MisSpeculatingR, MisSpeculatingR, XZR, Invertcond" on the False
+    // edge.
+    AArch64CC::CondCode InvCondCode = AArch64CC::getInvertedCondCode(CondCode);
+
+    MachineBasicBlock *SplitEdgeTBB = MBB.SplitCriticalEdge(TBB, *this);
+    MachineBasicBlock *SplitEdgeFBB = MBB.SplitCriticalEdge(FBB, *this);
+
+    assert(SplitEdgeTBB != nullptr);
+    assert(SplitEdgeFBB != nullptr);
+
+    DebugLoc DL;
+    if (MBB.instr_end() != MBB.instr_begin())
+      DL = (--MBB.instr_end())->getDebugLoc();
+
+    insertTrackingCode(*SplitEdgeTBB, CondCode, DL);
+    insertTrackingCode(*SplitEdgeFBB, InvCondCode, DL);
+
+    LLVM_DEBUG(dbgs() << "SplitEdgeTBB: " << *SplitEdgeTBB << "\n");
+    LLVM_DEBUG(dbgs() << "SplitEdgeFBB: " << *SplitEdgeFBB << "\n");
+    Modified = true;
+  }
+
+  // Perform correct code generation around function calls and before returns.
+  {
+    SmallVector<MachineInstr *, 4> ReturnInstructions;
+    SmallVector<MachineInstr *, 4> CallInstructions;
+
+    for (MachineInstr &MI : MBB) {
+      if (MI.isReturn())
+        ReturnInstructions.push_back(&MI);
+      else if (MI.isCall())
+        CallInstructions.push_back(&MI);
+    }
+
+    Modified |=
+        (ReturnInstructions.size() > 0) || (CallInstructions.size() > 0);
+
+    for (MachineInstr *Return : ReturnInstructions)
+      insertRegToSPTaintPropagation(Return->getParent(), Return, AArch64::X17);
+    for (MachineInstr *Call : CallInstructions) {
+      // Just after the call:
+      MachineBasicBlock::iterator i = Call;
+      i++;
+      insertSPToRegTaintPropagation(Call->getParent(), i);
+      // Just before the call:
+      insertRegToSPTaintPropagation(Call->getParent(), Call, AArch64::X17);
+    }
+  }
+
+  return Modified;
+}
+
+void AArch64SpeculationHardening::insertSPToRegTaintPropagation(
+    MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) const {
+  // If full control flow speculation barriers are used, emit a control flow
+  // barrier to block potential miss-speculation in flight coming in to this
+  // function.
+  if (UseControlFlowSpeculationBarrier) {
+    // insert full control flow speculation barrier (DSB SYS + ISB)
+    BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::DSB)).addImm(0xf);
+    BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ISB)).addImm(0xf);
+    return;
+  }
+
+  // CMP   SP, #0   === SUBS   xzr, SP, #0
+  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::SUBSXri))
+      .addDef(AArch64::XZR)
+      .addUse(AArch64::SP)
+      .addImm(0)
+      .addImm(0); // no shift
+  // CSETM x16, NE  === CSINV  x16, xzr, xzr, EQ
+  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::CSINVXr))
+      .addDef(MisspeculatingTaintReg)
+      .addUse(AArch64::XZR)
+      .addUse(AArch64::XZR)
+      .addImm(AArch64CC::EQ);
+}
+
+void AArch64SpeculationHardening::insertRegToSPTaintPropagation(
+    MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI,
+    unsigned TmpReg) const {
+  // If full control flow speculation barriers are used, there will not be
+  // miss-speculation when returning from this function, and therefore, also
+  // no need to encode potential miss-speculation into the stack pointer.
+  if (UseControlFlowSpeculationBarrier)
+    return;
+
+  // mov   Xtmp, SP  === ADD  Xtmp, SP, #0
+  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri))
+      .addDef(TmpReg)
+      .addUse(AArch64::SP)
+      .addImm(0)
+      .addImm(0); // no shift
+  // and   Xtmp, Xtmp, TaintReg === AND Xtmp, Xtmp, TaintReg, #0
+  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ANDXrs))
+      .addDef(TmpReg, RegState::Renamable)
+      .addUse(TmpReg, RegState::Kill | RegState::Renamable)
+      .addUse(MisspeculatingTaintReg, RegState::Kill)
+      .addImm(0);
+  // mov   SP, Xtmp === ADD SP, Xtmp, #0
+  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri))
+      .addDef(AArch64::SP)
+      .addUse(TmpReg, RegState::Kill)
+      .addImm(0)
+      .addImm(0); // no shift
+}
+
+bool AArch64SpeculationHardening::functionUsesHardeningRegister(
+    MachineFunction &MF) const {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      // treat function calls specially, as the hardening register does not
+      // need to remain live across function calls.
+      if (MI.isCall())
+        continue;
+      if (MI.readsRegister(MisspeculatingTaintReg, TRI) ||
+          MI.modifiesRegister(MisspeculatingTaintReg, TRI))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) {
+  if (!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
+    return false;
+
+  MisspeculatingTaintReg = AArch64::X16;
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+  bool Modified = false;
+
+  UseControlFlowSpeculationBarrier = functionUsesHardeningRegister(MF);
+
+  // Instrument control flow speculation tracking, if requested.
+  LLVM_DEBUG(
+      dbgs()
+      << "***** AArch64SpeculationHardening - track control flow *****\n");
+
+  // 1. Add instrumentation code to function entry and exits.
+  SmallVector<MachineBasicBlock *, 2> EntryBlocks;
+  EntryBlocks.push_back(&MF.front());
+  for (const LandingPadInfo &LPI : MF.getLandingPads())
+    EntryBlocks.push_back(LPI.LandingPadBlock);
+  for (auto Entry : EntryBlocks)
+    insertSPToRegTaintPropagation(
+        Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin()));
+
+  // 2. Add instrumentation code to every basic block.
+  for (auto &MBB : MF)
+    Modified |= instrumentControlFlow(MBB);
+
+  return Modified;
+}
+
+/// \brief Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createAArch64SpeculationHardeningPass() {
+  return new AArch64SpeculationHardening();
+}
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index fc7b5984fe3e867da3138505d961b74525c37dff..d5643d384283340e0674c344a0e7aad498b568c7 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -148,9 +148,11 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
     for (auto &MI : MBB) {
       if (!isNarrowFPStore(MI))
         continue;
-      unsigned BaseReg;
+      MachineOperand *BaseOp;
       int64_t Offset;
-      if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) {
+      if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) &&
+          BaseOp->isReg()) {
+        unsigned BaseReg = BaseOp->getReg();
         if (PrevBaseReg == BaseReg) {
           // If this block can take STPs, skip ahead to the next block.
           if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 49d737bea6a6dc2c544b40106c9c4776a6beb77e..dd30d25b2b505de77c9beb0b727c62f6ed8d01e2 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -14,13 +14,13 @@
 #include "AArch64Subtarget.h"
 
 #include "AArch64.h"
-#include "AArch64InstrInfo.h"
-#include "AArch64PBQPRegAlloc.h"
-#include "AArch64TargetMachine.h"
-
 #include "AArch64CallLowering.h"
+#include "AArch64InstrInfo.h"
 #include "AArch64LegalizerInfo.h"
+#include "AArch64PBQPRegAlloc.h"
 #include "AArch64RegisterBankInfo.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/IR/GlobalValue.h"
@@ -148,6 +148,11 @@ void AArch64Subtarget::initializeProperties() {
     // FIXME: remove this to enable 64-bit SLP if performance looks good.
     MinVectorRegisterBitWidth = 128;
     break;
+  case TSV110:
+    CacheLineSize = 64;
+    PrefFunctionAlignment = 4;
+    PrefLoopAlignment = 2;
+    break;
   }
 }
 
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 8bf7c1654081e10c69f64daced9852d6cea57422..9b27e0a0e8fdf8428ac8783d9bf98f79c8ae1c17 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -56,7 +56,8 @@ public:
     ThunderX,
     ThunderXT81,
     ThunderXT83,
-    ThunderXT88
+    ThunderXT88,
+    TSV110
   };
 
 protected:
@@ -82,6 +83,33 @@ protected:
   bool HasFP16FML = false;
   bool HasSPE = false;
 
+  // ARMv8.1 extensions
+  bool HasVH = false;
+  bool HasPAN = false;
+  bool HasLOR = false;
+
+  // ARMv8.2 extensions
+  bool HasPsUAO = false;
+  bool HasPAN_RWV = false;
+  bool HasCCPP = false;
+
+  // ARMv8.3 extensions
+  bool HasPA = false;
+  bool HasJS = false;
+  bool HasCCIDX = false;
+  bool HasComplxNum = false;
+
+  // ARMv8.4 extensions
+  bool HasNV = false;
+  bool HasRASv8_4 = false;
+  bool HasMPAM = false;
+  bool HasDIT = false;
+  bool HasTRACEV8_4 = false;
+  bool HasAM = false;
+  bool HasSEL2 = false;
+  bool HasTLB_RMI = false;
+  bool HasFMI = false;
+  bool HasRCPC_IMMO = false;
   // ARMv8.4 Crypto extensions
   bool HasSM4 = true;
   bool HasSHA3 = true;
@@ -99,6 +127,7 @@ protected:
   bool HasFRInt3264 = false;
   bool HasSpecRestrict = false;
   bool HasSpecCtrl = false;
+  bool HasSSBS = false;
   bool HasPredCtrl = false;
   bool HasCCDP = false;
   bool HasBTI = false;
@@ -327,6 +356,7 @@ public:
   bool hasFRInt3264() const { return HasFRInt3264; }
   bool hasSpecRestrict() const { return HasSpecRestrict; }
   bool hasSpecCtrl() const { return HasSpecCtrl; }
+  bool hasSSBS() const { return HasSSBS; }
   bool hasPredCtrl() const { return HasPredCtrl; }
   bool hasCCDP() const { return HasCCDP; }
   bool hasBTI() const { return HasBTI; }
@@ -348,6 +378,30 @@ public:
 
   bool useAA() const override { return UseAA; }
 
+  bool hasVH() const { return HasVH; }
+  bool hasPAN() const { return HasPAN; }
+  bool hasLOR() const { return HasLOR; }
+
+  bool hasPsUAO() const { return HasPsUAO; }
+  bool hasPAN_RWV() const { return HasPAN_RWV; }
+  bool hasCCPP() const { return HasCCPP; }
+
+  bool hasPA() const { return HasPA; }
+  bool hasJS() const { return HasJS; }
+  bool hasCCIDX() const { return HasCCIDX; }
+  bool hasComplxNum() const { return HasComplxNum; }
+
+  bool hasNV() const { return HasNV; }
+  bool hasRASv8_4() const { return HasRASv8_4; }
+  bool hasMPAM() const { return HasMPAM; }
+  bool hasDIT() const { return HasDIT; }
+  bool hasTRACEV8_4() const { return HasTRACEV8_4; }
+  bool hasAM() const { return HasAM; }
+  bool hasSEL2() const { return HasSEL2; }
+  bool hasTLB_RMI() const { return HasTLB_RMI; }
+  bool hasFMI() const { return HasFMI; }
+  bool hasRCPC_IMMO() const { return HasRCPC_IMMO; }
+
   bool useSmallAddressing() const {
     switch (TLInfo.getTargetMachine().getCodeModel()) {
       case CodeModel::Kernel:
@@ -382,6 +436,8 @@ public:
   bool isCallingConvWin64(CallingConv::ID CC) const {
     switch (CC) {
     case CallingConv::C:
+    case CallingConv::Fast:
+    case CallingConv::Swift:
       return isTargetWindows();
     case CallingConv::Win64:
       return true;
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index 9faa465fb9ba15a23ce3d3564a09a253a97a9beb..60d48e4d99d750e41b2d04efe20459e82ef9c91b 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -14,6 +14,25 @@
 
 include "llvm/TableGen/SearchableTable.td"
 
+//===----------------------------------------------------------------------===//
+// Features that, for the compiler, only enable system operands and PStates
+//===----------------------------------------------------------------------===//
+
+def HasCCPP    : Predicate<"Subtarget->hasCCPP()">,
+                 AssemblerPredicate<"FeatureCCPP", "ccpp">;
+
+def HasPAN     : Predicate<"Subtarget->hasPAN()">,
+                 AssemblerPredicate<"FeaturePAN",
+                 "ARM v8.1  Privileged Access-Never extension">;
+
+def HasPsUAO   : Predicate<"Subtarget->hasPsUAO()">,
+                 AssemblerPredicate<"FeaturePsUAO",
+                 "ARM v8.2 UAO PState extension (psuao)">;
+
+def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">,
+                 AssemblerPredicate<"FeaturePAN_RWV",
+                 "ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">;
+
 //===----------------------------------------------------------------------===//
 // AT (address translate) instruction options.
 //===----------------------------------------------------------------------===//
@@ -45,7 +64,7 @@ def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>;
 def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>;
 def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>;
 
-let Requires = [{ {AArch64::HasV8_2aOps} }] in {
+let Requires = [{ {AArch64::FeaturePAN_RWV} }] in {
 def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>;
 def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>;
 }
@@ -102,7 +121,7 @@ def : DC<"CVAU",  0b011, 0b0111, 0b1011, 0b001>;
 def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>;
 def : DC<"CISW",  0b000, 0b0111, 0b1110, 0b010>;
 
-let Requires = [{ {AArch64::HasV8_2aOps} }] in
+let Requires = [{ {AArch64::FeatureCCPP} }] in
 def : DC<"CVAP",  0b011, 0b0111, 0b1100, 0b001>;
 
 let Requires = [{ {AArch64::FeatureCacheDeepPersist} }] in
@@ -178,7 +197,7 @@ class TSB<string name, bits<4> encoding> : SearchableTable{
   bits<4> Encoding;
   let Encoding = encoding;
 
-  code Requires = [{ {AArch64::HasV8_4aOps} }];
+  code Requires = [{ {AArch64::FeatureTRACEV8_4} }];
 }
 
 def : TSB<"csync", 0>;
@@ -314,16 +333,17 @@ def : PState<"SPSel",   0b00101>;
 def : PState<"DAIFSet", 0b11110>;
 def : PState<"DAIFClr", 0b11111>;
 // v8.1a "Privileged Access Never" extension-specific PStates
-let Requires = [{ {AArch64::HasV8_1aOps} }] in
+let Requires = [{ {AArch64::FeaturePAN} }] in
 def : PState<"PAN",     0b00100>;
+
 // v8.2a "User Access Override" extension-specific PStates
-let Requires = [{ {AArch64::HasV8_2aOps} }] in
+let Requires = [{ {AArch64::FeaturePsUAO} }] in
 def : PState<"UAO",     0b00011>;
 // v8.4a timining insensitivity of data processing instructions
-let Requires = [{ {AArch64::HasV8_4aOps} }] in
+let Requires = [{ {AArch64::FeatureDIT} }] in
 def : PState<"DIT",     0b11010>;
 // v8.5a Spectre Mitigation
-let Requires = [{ {AArch64::FeatureSpecRestrict} }] in
+let Requires = [{ {AArch64::FeatureSSBS} }] in
 def : PState<"SSBS",    0b11001>;
 // v8.5a Memory Tagging Extension
 let Requires = [{ {AArch64::FeatureMTE} }] in
@@ -413,8 +433,9 @@ def : TLBI<"VALE3",        0b110, 0b1000, 0b0111, 0b101>;
 def : TLBI<"VMALLS12E1",   0b100, 0b1000, 0b0111, 0b110, 0>;
 def : TLBI<"VAALE1",       0b000, 0b1000, 0b0111, 0b111>;
 
+// Armv8.4-A Translation Lookaside Buffer Instructions (TLBI)
+let Requires = [{ {AArch64::FeatureTLB_RMI} }] in {
 // Armv8.4-A Outer Sharable TLB Maintenance instructions:
-let Requires = [{ {AArch64::HasV8_4aOps} }] in {
 //                         op1    CRn     CRm     op2
 def : TLBI<"VMALLE1OS",    0b000, 0b1000, 0b0001, 0b000, 0>;
 def : TLBI<"VAE1OS",       0b000, 0b1000, 0b0001, 0b001>;
@@ -465,7 +486,7 @@ def : TLBI<"RVAE3IS",      0b110, 0b1000, 0b0010, 0b001>;
 def : TLBI<"RVALE3IS",     0b110, 0b1000, 0b0010, 0b101>;
 def : TLBI<"RVAE3OS",      0b110, 0b1000, 0b0101, 0b001>;
 def : TLBI<"RVALE3OS",     0b110, 0b1000, 0b0101, 0b101>;
-}
+} //FeatureTLB_RMI
 
 // Armv8.5-A Prediction Restriction by Context instruction options:
 class PRCTX<string name, bits<4> crm> : SearchableTable {
@@ -540,8 +561,10 @@ def : ROSysReg<"PMCEID0_EL0",        0b11, 0b011, 0b1001, 0b1100, 0b110>;
 def : ROSysReg<"PMCEID1_EL0",        0b11, 0b011, 0b1001, 0b1100, 0b111>;
 def : ROSysReg<"MIDR_EL1",           0b11, 0b000, 0b0000, 0b0000, 0b000>;
 def : ROSysReg<"CCSIDR_EL1",         0b11, 0b001, 0b0000, 0b0000, 0b000>;
+
+//v8.3 CCIDX - extending the CCsIDr number of sets
 def : ROSysReg<"CCSIDR2_EL1",        0b11, 0b001, 0b0000, 0b0000, 0b010> {
-  let Requires = [{ {AArch64::HasV8_3aOps} }];
+  let Requires = [{ {AArch64::FeatureCCIDX} }];
 }
 def : ROSysReg<"CLIDR_EL1",          0b11, 0b001, 0b0000, 0b0000, 0b001>;
 def : ROSysReg<"CTR_EL0",            0b11, 0b011, 0b0000, 0b0000, 0b001>;
@@ -579,9 +602,7 @@ def : ROSysReg<"ID_AA64ISAR0_EL1",    0b11, 0b000, 0b0000, 0b0110, 0b000>;
 def : ROSysReg<"ID_AA64ISAR1_EL1",    0b11, 0b000, 0b0000, 0b0110, 0b001>;
 def : ROSysReg<"ID_AA64MMFR0_EL1",    0b11, 0b000, 0b0000, 0b0111, 0b000>;
 def : ROSysReg<"ID_AA64MMFR1_EL1",    0b11, 0b000, 0b0000, 0b0111, 0b001>;
-def : ROSysReg<"ID_AA64MMFR2_EL1",    0b11, 0b000, 0b0000, 0b0111, 0b010> {
-  let Requires = [{ {AArch64::HasV8_2aOps} }];
-}
+def : ROSysReg<"ID_AA64MMFR2_EL1",    0b11, 0b000, 0b0000, 0b0111, 0b010>;
 def : ROSysReg<"MVFR0_EL1",          0b11, 0b000, 0b0000, 0b0011, 0b000>;
 def : ROSysReg<"MVFR1_EL1",          0b11, 0b000, 0b0000, 0b0011, 0b001>;
 def : ROSysReg<"MVFR2_EL1",          0b11, 0b000, 0b0000, 0b0011, 0b010>;
@@ -651,7 +672,7 @@ def : ROSysReg<"ID_AA64ZFR0_EL1",    0b11, 0b000, 0b0000, 0b0100, 0b100>;
 
 // v8.1a "Limited Ordering Regions" extension-specific system register
 //                         Op0    Op1     CRn     CRm    Op2
-let Requires = [{ {AArch64::HasV8_1aOps} }] in
+let Requires = [{ {AArch64::FeatureLOR} }] in
 def : ROSysReg<"LORID_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b111>;
 
 // v8.2a "RAS extension" registers
@@ -1185,21 +1206,21 @@ def : RWSysReg<"ICH_LR14_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b110>;
 def : RWSysReg<"ICH_LR15_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b111>;
 
 // v8.1a "Privileged Access Never" extension-specific system registers
-let Requires = [{ {AArch64::HasV8_1aOps} }] in
+let Requires = [{ {AArch64::FeaturePAN} }] in
 def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>;
 
 // v8.1a "Limited Ordering Regions" extension-specific system registers
 //                         Op0    Op1     CRn     CRm    Op2
-let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+let Requires = [{ {AArch64::FeatureLOR} }] in {
 def : RWSysReg<"LORSA_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b000>;
 def : RWSysReg<"LOREA_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b001>;
 def : RWSysReg<"LORN_EL1",   0b11, 0b000, 0b1010, 0b0100, 0b010>;
 def : RWSysReg<"LORC_EL1",   0b11, 0b000, 0b1010, 0b0100, 0b011>;
 }
 
-// v8.1a "Virtualization hos extensions" system registers
+// v8.1a "Virtualization Host extensions" system registers
 //                              Op0    Op1     CRn     CRm    Op2
-let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+let Requires = [{ {AArch64::FeatureVH} }] in {
 def : RWSysReg<"TTBR1_EL2",       0b11, 0b100, 0b0010, 0b0000, 0b001>;
 def : RWSysReg<"CONTEXTIDR_EL2",  0b11, 0b100, 0b1101, 0b0000, 0b001>;
 def : RWSysReg<"CNTHV_TVAL_EL2",  0b11, 0b100, 0b1110, 0b0011, 0b000>;
@@ -1230,7 +1251,7 @@ def : RWSysReg<"ELR_EL12",        0b11, 0b101, 0b0100, 0b0000, 0b001>;
 }
 // v8.2a registers
 //                  Op0    Op1     CRn     CRm    Op2
-let Requires = [{ {AArch64::HasV8_2aOps} }] in
+let Requires = [{ {AArch64::FeaturePsUAO} }] in
 def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>;
 
 // v8.2a "Statistical Profiling extension" registers
@@ -1267,7 +1288,7 @@ def : RWSysReg<"VSESR_EL2",     0b11, 0b100, 0b0101, 0b0010, 0b011>;
 
 // v8.3a "Pointer authentication extension" registers
 //                              Op0    Op1     CRn     CRm    Op2
-let Requires = [{ {AArch64::HasV8_3aOps} }] in {
+let Requires = [{ {AArch64::FeaturePA} }] in {
 def : RWSysReg<"APIAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b000>;
 def : RWSysReg<"APIAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b001>;
 def : RWSysReg<"APIBKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b010>;
@@ -1280,8 +1301,8 @@ def : RWSysReg<"APGAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b000>;
 def : RWSysReg<"APGAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b001>;
 }
 
-let Requires = [{ {AArch64::HasV8_4aOps} }] in {
-
+// v8.4 "Secure Exception Level 2 extension"
+let Requires = [{ {AArch64::FeatureSEL2} }] in {
 // v8.4a "Virtualization secure second stage translation" registers
 //                           Op0   Op1    CRn     CRm     Op2
 def : RWSysReg<"VSTCR_EL2" , 0b11, 0b100, 0b0010, 0b0110, 0b010>;
@@ -1299,18 +1320,22 @@ def : RWSysReg<"CNTHPS_CTL_EL2",  0b11, 0b100, 0b1110, 0b0101, 0b001>;
 // v8.4a "Virtualization debug state" registers
 //                           Op0   Op1    CRn     CRm     Op2
 def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>;
+} // FeatureSEL2
 
 // v8.4a RAS registers
-//                              Op0   Op1    CRn     CRm    Op2
+//                              Op0   Op1    CRn     CRm     Op2
+let Requires = [{ {AArch64::FeatureRASv8_4} }] in {
 def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>;
 def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>;
 def : RWSysReg<"ERXTS_EL1",     0b11, 0b000, 0b0101, 0b0101, 0b111>;
 def : RWSysReg<"ERXMISC2_EL1",  0b11, 0b000, 0b0101, 0b0101, 0b010>;
 def : RWSysReg<"ERXMISC3_EL1",  0b11, 0b000, 0b0101, 0b0101, 0b011>;
 def : ROSysReg<"ERXPFGF_EL1",   0b11, 0b000, 0b0101, 0b0100, 0b100>;
+} // FeatureRASv8_4
 
 // v8.4a MPAM registers
 //                             Op0   Op1    CRn     CRm     Op2
+let Requires = [{ {AArch64::FeatureMPAM} }] in {
 def : RWSysReg<"MPAM0_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b001>;
 def : RWSysReg<"MPAM1_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b000>;
 def : RWSysReg<"MPAM2_EL2",    0b11, 0b100, 0b1010, 0b0101, 0b000>;
@@ -1327,9 +1352,11 @@ def : RWSysReg<"MPAMVPM5_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b101>;
 def : RWSysReg<"MPAMVPM6_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b110>;
 def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>;
 def : ROSysReg<"MPAMIDR_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b100>;
+} //FeatureMPAM
 
-// v8.4a Activitiy monitor registers
+// v8.4a Activitiy Monitor registers
 //                                 Op0   Op1    CRn     CRm     Op2
+let Requires = [{ {AArch64::FeatureAM} }] in {
 def : RWSysReg<"AMCR_EL0",         0b11, 0b011, 0b1101, 0b0010, 0b000>;
 def : ROSysReg<"AMCFGR_EL0",       0b11, 0b011, 0b1101, 0b0010, 0b001>;
 def : ROSysReg<"AMCGCR_EL0",       0b11, 0b011, 0b1101, 0b0010, 0b010>;
@@ -1378,6 +1405,7 @@ def : RWSysReg<"AMEVTYPER112_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b100>;
 def : RWSysReg<"AMEVTYPER113_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b101>;
 def : RWSysReg<"AMEVTYPER114_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b110>;
 def : RWSysReg<"AMEVTYPER115_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b111>;
+} //FeatureAM
 
 // v8.4a Trace Extension registers
 //
@@ -1386,19 +1414,24 @@ def : RWSysReg<"AMEVTYPER115_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b111>;
 // but they are already defined above.
 //
 //                                 Op0   Op1    CRn     CRm     Op2
+let Requires = [{ {AArch64::FeatureTRACEV8_4} }] in {
 def : RWSysReg<"TRFCR_EL1",        0b11, 0b000, 0b0001, 0b0010, 0b001>;
 def : RWSysReg<"TRFCR_EL2",        0b11, 0b100, 0b0001, 0b0010, 0b001>;
 def : RWSysReg<"TRFCR_EL12",       0b11, 0b101, 0b0001, 0b0010, 0b001>;
+} //FeatureTRACEV8_4
 
 // v8.4a Timining insensitivity of data processing instructions
+// DIT: Data Independent Timing instructions
 //                                 Op0   Op1    CRn     CRm     Op2
+let Requires = [{ {AArch64::FeatureDIT} }] in {
 def : RWSysReg<"DIT",              0b11, 0b011, 0b0100, 0b0010, 0b101>;
+} //FeatureDIT
 
 // v8.4a Enhanced Support for Nested Virtualization
 //                                 Op0   Op1    CRn     CRm     Op2
+let Requires = [{ {AArch64::FeatureNV} }] in {
 def : RWSysReg<"VNCR_EL2",         0b11, 0b100, 0b0010, 0b0010, 0b000>;
-
-} // HasV8_4aOps
+} //FeatureNV
 
 // SVE control registers
 //                                 Op0   Op1    CRn     CRm     Op2
@@ -1411,7 +1444,7 @@ def : RWSysReg<"ZCR_EL12",         0b11, 0b101, 0b0001, 0b0010, 0b000>;
 
 // V8.5a Spectre mitigation SSBS register
 //                     Op0   Op1    CRn     CRm     Op2
-let Requires = [{ {AArch64::FeatureSpecRestrict} }] in
+let Requires = [{ {AArch64::FeatureSSBS} }] in
 def : RWSysReg<"SSBS", 0b11, 0b011, 0b0100, 0b0010, 0b110>;
 
 // v8.5a Memory Tagging Extension
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 2f3f87d02b787206cffe5026a871eedc648abdf7..4e016525f7e4e6bc1fea1dad2b68c2b58904541b 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -177,6 +177,7 @@ extern "C" void LLVMInitializeAArch64Target() {
   initializeFalkorHWPFFixPass(*PR);
   initializeFalkorMarkStridedAccessesLegacyPass(*PR);
   initializeLDTLSCleanupPass(*PR);
+  initializeAArch64SpeculationHardeningPass(*PR);
 }
 
 //===----------------------------------------------------------------------===//
@@ -219,9 +220,9 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(const Triple &TT,
-                                              Optional<CodeModel::Model> CM,
-                                              bool JIT) {
+static CodeModel::Model
+getEffectiveAArch64CodeModel(const Triple &TT, Optional<CodeModel::Model> CM,
+                             bool JIT) {
   if (CM) {
     if (*CM != CodeModel::Small && *CM != CodeModel::Tiny &&
         *CM != CodeModel::Large) {
@@ -255,7 +256,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(T,
                         computeDataLayout(TT, Options.MCOptions, LittleEndian),
                         TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM),
-                        getEffectiveCodeModel(TT, CM, JIT), OL),
+                        getEffectiveAArch64CodeModel(TT, CM, JIT), OL),
       TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) {
   initAsmInfo();
 
@@ -275,8 +276,10 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
   }
 
   // Enable GlobalISel at or below EnableGlobalISelAt0.
-  if (getOptLevel() <= EnableGlobalISelAtO)
+  if (getOptLevel() <= EnableGlobalISelAtO) {
     setGlobalISel(true);
+    setGlobalISelAbort(GlobalISelAbortMode::Disable);
+  }
 
   // AArch64 supports the MachineOutliner.
   setMachineOutliner(true);
@@ -419,8 +422,10 @@ void AArch64PassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
 
   // Match interleaved memory accesses to ldN/stN intrinsics.
-  if (TM->getOptLevel() != CodeGenOpt::None)
+  if (TM->getOptLevel() != CodeGenOpt::None) {
+    addPass(createInterleavedLoadCombinePass());
     addPass(createInterleavedAccessPass());
+  }
 
   if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
     // Call SeparateConstOffsetFromGEP pass to extract constants within indices
@@ -546,12 +551,28 @@ void AArch64PassConfig::addPreSched2() {
   if (TM->getOptLevel() != CodeGenOpt::None) {
     if (EnableLoadStoreOpt)
       addPass(createAArch64LoadStoreOptimizationPass());
+  }
+
+  // The AArch64SpeculationHardeningPass destroys dominator tree and natural
+  // loop info, which is needed for the FalkorHWPFFixPass and also later on.
+  // Therefore, run the AArch64SpeculationHardeningPass before the
+  // FalkorHWPFFixPass to avoid recomputing dominator tree and natural loop
+  // info.
+  addPass(createAArch64SpeculationHardeningPass());
+
+  if (TM->getOptLevel() != CodeGenOpt::None) {
     if (EnableFalkorHWPFFix)
       addPass(createFalkorHWPFFixPass());
   }
 }
 
 void AArch64PassConfig::addPreEmitPass() {
+  // Machine Block Placement might have created new opportunities when run
+  // at O3, where the Tail Duplication Threshold is set to 4 instructions.
+  // Run the load/store optimizer once more.
+  if (TM->getOptLevel() >= CodeGenOpt::Aggressive && EnableLoadStoreOpt)
+    addPass(createAArch64LoadStoreOptimizationPass());
+
   if (EnableA53Fix835769)
     addPass(createAArch64A53Fix835769());
   // Relax conditional branch instructions if they're otherwise out of
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 0f291cb83824202b9b36d534ab11f447f8d8659c..edf361bc399cce8321b1b0d985f1f1563c44c082 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -175,6 +175,7 @@ private:
 
   bool parseDirectiveReq(StringRef Name, SMLoc L);
   bool parseDirectiveUnreq(SMLoc L);
+  bool parseDirectiveCFINegateRAState();
 
   bool validateInstruction(MCInst &Inst, SMLoc &IDLoc,
                            SmallVectorImpl<SMLoc> &Loc);
@@ -2813,28 +2814,29 @@ static const struct Extension {
   const char *Name;
   const FeatureBitset Features;
 } ExtensionMap[] = {
-  { "crc",  {AArch64::FeatureCRC} },
-  { "sm4",  {AArch64::FeatureSM4} },
-  { "sha3", {AArch64::FeatureSHA3} },
-  { "sha2", {AArch64::FeatureSHA2} },
-  { "aes",  {AArch64::FeatureAES} },
-  { "crypto", {AArch64::FeatureCrypto} },
-  { "fp", {AArch64::FeatureFPARMv8} },
-  { "simd", {AArch64::FeatureNEON} },
-  { "ras", {AArch64::FeatureRAS} },
-  { "lse", {AArch64::FeatureLSE} },
-  { "predctrl", {AArch64::FeaturePredCtrl} },
-  { "ccdp", {AArch64::FeatureCacheDeepPersist} },
-  { "mte", {AArch64::FeatureMTE} },
-
-  // FIXME: Unsupported extensions
-  { "pan", {} },
-  { "lor", {} },
-  { "rdma", {} },
-  { "profile", {} },
+    {"crc", {AArch64::FeatureCRC}},
+    {"sm4", {AArch64::FeatureSM4}},
+    {"sha3", {AArch64::FeatureSHA3}},
+    {"sha2", {AArch64::FeatureSHA2}},
+    {"aes", {AArch64::FeatureAES}},
+    {"crypto", {AArch64::FeatureCrypto}},
+    {"fp", {AArch64::FeatureFPARMv8}},
+    {"simd", {AArch64::FeatureNEON}},
+    {"ras", {AArch64::FeatureRAS}},
+    {"lse", {AArch64::FeatureLSE}},
+    {"predctrl", {AArch64::FeaturePredCtrl}},
+    {"ccdp", {AArch64::FeatureCacheDeepPersist}},
+    {"mte", {AArch64::FeatureMTE}},
+    {"tlb-rmi", {AArch64::FeatureTLB_RMI}},
+    {"pan-rwv", {AArch64::FeaturePAN_RWV}},
+    {"ccpp", {AArch64::FeatureCCPP}},
+    // FIXME: Unsupported extensions
+    {"pan", {}},
+    {"lor", {}},
+    {"rdma", {}},
+    {"profile", {}},
 };
 
-
 static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
   if (FBS[AArch64::HasV8_1aOps])
     Str += "ARMv8.1a";
@@ -5026,6 +5028,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
     parseDirectiveUnreq(Loc);
   else if (IDVal == ".inst")
     parseDirectiveInst(Loc);
+  else if (IDVal == ".cfi_negate_ra_state")
+    parseDirectiveCFINegateRAState();
   else if (IsMachO) {
     if (IDVal == MCLOHDirectiveName())
       parseDirectiveLOH(IDVal, Loc);
@@ -5399,6 +5403,13 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
   return false;
 }
 
+bool AArch64AsmParser::parseDirectiveCFINegateRAState() {
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+    return true;
+  getStreamer().EmitCFINegateRAState();
+  return false;
+}
+
 bool
 AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
                                     AArch64MCExpr::VariantKind &ELFRefKind,
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 58190686c7947c2b60b350c35d202fbb2ce34121..7778882d4915f3b338f6918e3e3ee97c9e8888ce 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -15,6 +15,7 @@ tablegen(LLVM AArch64GenRegisterBank.inc -gen-register-bank)
 tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
 tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables)
+tablegen(LLVM AArch64GenExegesis.inc -gen-exegesis)
 
 add_public_tablegen_target(AArch64CommonTableGen)
 
@@ -51,6 +52,7 @@ add_llvm_target(AArch64CodeGen
   AArch64RegisterBankInfo.cpp
   AArch64RegisterInfo.cpp
   AArch64SelectionDAGInfo.cpp
+  AArch64SpeculationHardening.cpp
   AArch64StorePairSuppress.cpp
   AArch64Subtarget.cpp
   AArch64TargetMachine.cpp
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 0e486b9392316a78688c597bcbed616f34b94d54..58e4a9c9a9e9aa7bdc5a0a6b070ed57750acd63f 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -132,4 +132,7 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
 
   CommentString = "//";
   ExceptionsType = ExceptionHandling::DwarfCFI;
+  // The default is dwarf, but WinEH can be enabled optionally, which requires
+  // WinEHEncodingType to be set.
+  WinEHEncodingType = WinEH::EncodingType::Itanium;
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 744bb0805d2515041c3539fb6efb24e8685081ba..0f8198ba4e9b679ae5f1105c193ecf099f902656 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -16,6 +16,7 @@
 #include "AArch64MCAsmInfo.h"
 #include "AArch64WinCOFFStreamer.h"
 #include "InstPrinter/AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCInstrAnalysis.h"
@@ -31,6 +32,7 @@
 using namespace llvm;
 
 #define GET_INSTRINFO_MC_DESC
+#define GET_INSTRINFO_MC_HELPERS
 #include "AArch64GenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_MC_DESC
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 63f50778ccdb675983d5a4021edf8deb0fcaa540..0f22f69bd5b0a9a954183a5cc0dea29a7fa6f9b5 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -84,6 +84,7 @@ void initLLVMToCVRegMapping(MCRegisterInfo *MRI);
 // Defines symbolic names for the AArch64 instructions.
 //
 #define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
 #include "AArch64GenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_ENUM
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 07e5d97dff90b148fdbe23905ae9f190e075dce2..d26397a4271a0709c91925d70c636b289ec9b827 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -37,10 +37,12 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
 FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
 
 // SI Passes
+FunctionPass *createGCNDPPCombinePass();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIFixupVectorISelPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -57,6 +59,7 @@ FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
 
 void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
 
@@ -92,6 +95,9 @@ extern char &AMDGPULowerKernelAttributesID;
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
+void initializeGCNDPPCombinePass(PassRegistry &);
+extern char &GCNDPPCombineID;
+
 void initializeR600ClauseMergePassPass(PassRegistry &);
 extern char &R600ClauseMergePassID;
 
@@ -122,6 +128,9 @@ extern char &SIFixSGPRCopiesID;
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
+void initializeSIFixupVectorISelPass(PassRegistry &);
+extern char &SIFixupVectorISelID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
@@ -187,6 +196,9 @@ extern char &SIMemoryLegalizerID;
 void initializeSIDebuggerInsertNopsPass(PassRegistry&);
 extern char &SIDebuggerInsertNopsID;
 
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 96a8029773d0b5ce53900fd4c4625d76de127346..0aacedf24e32b84b07c189882b9abd19e6a7d028 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td"
 include "llvm/Target/Target.td"
 include "AMDGPUFeatures.td"
 
+class BoolToList<bit Value> {
+  list<int> ret = !if(Value, [1]<int>, []<int>);
+}
+
 //===------------------------------------------------------------===//
 // Subtarget Features (device properties)
 //===------------------------------------------------------------===//
@@ -465,34 +469,41 @@ def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
   [FeatureSouthernIslands,
    FeatureFastFMAF32,
    HalfRate64Ops,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
   [FeatureSouthernIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
   [FeatureSeaIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
   [FeatureSeaIslands,
    HalfRate64Ops,
    FeatureLDSBankCount32,
-   FeatureFastFMAF32]>;
+   FeatureFastFMAF32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
   [FeatureSeaIslands,
    FeatureLDSBankCount16,
-   FeatureFastFMAF32]>;
+   FeatureFastFMAF32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
   [FeatureSeaIslands,
-   FeatureLDSBankCount16]>;
+   FeatureLDSBankCount16,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
   [FeatureSeaIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
   [FeatureVolcanicIslands,
@@ -500,39 +511,46 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
    HalfRate64Ops,
    FeatureLDSBankCount32,
    FeatureXNACK,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount32,
    FeatureSGPRInitBug,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount32,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount16,
-   FeatureXNACK]>;
+   FeatureXNACK,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
   [FeatureGFX9,
    FeatureMadMixInsts,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
-   FeatureXNACK]>;
+   FeatureXNACK,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
   [FeatureGFX9,
    FeatureLDSBankCount32,
-   FeatureFmaMixInsts]>;
+   FeatureFmaMixInsts,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
   [FeatureGFX9,
@@ -540,13 +558,15 @@ def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
    FeatureDLInsts,
-   FeatureSRAMECC]>;
+   FeatureSRAMECC,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
-   FeatureXNACK]>;
+   FeatureXNACK,
+   FeatureCodeObjectV3]>;
 
 //===----------------------------------------------------------------------===//
 // Debugger related subtarget features.
@@ -744,7 +764,6 @@ def EnableLateCFGStructurize : Predicate<
 include "SISchedule.td"
 include "GCNProcessors.td"
 include "AMDGPUInstrInfo.td"
-include "AMDGPUIntrinsics.td"
 include "SIIntrinsics.td"
 include "AMDGPURegisterInfo.td"
 include "AMDGPURegisterBanks.td"
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d07c0516c27265e14e6c0dfe84a2ef72f71fff64..2ded7cdb648994026a4990431e5479dc25130b16 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -46,6 +46,7 @@
 
 using namespace llvm;
 using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::HSAMD;
 
 // TODO: This should get the default rounding mode from the kernel. We just set
 // the default here, but this could change if the OpenCL rounding mode pragmas
@@ -99,6 +100,10 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                    std::unique_ptr<MCStreamer> Streamer)
   : AsmPrinter(TM, std::move(Streamer)) {
+    if (IsaInfo::hasCodeObjectV3(getSTI()))
+      HSAMetadataStream.reset(new MetadataStreamerV3());
+    else
+      HSAMetadataStream.reset(new MetadataStreamerV2());
 }
 
 StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -122,9 +127,6 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
     IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
 
     getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
-
-    if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
-      return;
   }
 
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
@@ -132,11 +134,14 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
     return;
 
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    HSAMetadataStream.begin(M);
+    HSAMetadataStream->begin(M);
 
   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
     readPALMetadata(M);
 
+  if (IsaInfo::hasCodeObjectV3(getSTI()))
+    return;
+
   // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
     getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
@@ -148,37 +153,38 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
 }
 
 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
-  // TODO: Add metadata to code object v3.
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
-
   // Following code requires TargetStreamer to be present.
   if (!getTargetStreamer())
     return;
 
-  // Emit ISA Version (NT_AMD_AMDGPU_ISA).
-  std::string ISAVersionString;
-  raw_string_ostream ISAVersionStream(ISAVersionString);
-  IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
-  getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+  if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+    // Emit ISA Version (NT_AMD_AMDGPU_ISA).
+    std::string ISAVersionString;
+    raw_string_ostream ISAVersionStream(ISAVersionString);
+    IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
+    getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+  }
 
   // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
-    HSAMetadataStream.end();
-    getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
+    HSAMetadataStream->end();
+    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
+    (void)Success;
+    assert(Success && "Malformed HSA Metadata");
   }
 
-  // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
-  if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
-    // Copy the PAL metadata from the map where we collected it into a vector,
-    // then write it as a .note.
-    PALMD::Metadata PALMetadataVector;
-    for (auto i : PALMetadataMap) {
-      PALMetadataVector.push_back(i.first);
-      PALMetadataVector.push_back(i.second);
+  if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+    // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
+    if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
+      // Copy the PAL metadata from the map where we collected it into a vector,
+      // then write it as a .note.
+      PALMD::Metadata PALMetadataVector;
+      for (auto i : PALMetadataMap) {
+        PALMetadataVector.push_back(i.first);
+        PALMetadataVector.push_back(i.second);
+      }
+      getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
     }
-    getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
   }
 }
 
@@ -211,11 +217,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
     getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
   }
 
-  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
-    return;
-
-  if (!STM.hasCodeObjectV3() && STM.isAmdHsaOS())
-    HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
+  if (STM.isAmdHsaOS())
+    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
 }
 
 void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
@@ -911,9 +914,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   }
 
   ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
-      getSTI(), ProgInfo.NumSGPRsForWavesPerEU);
+      &STM, ProgInfo.NumSGPRsForWavesPerEU);
   ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
-      getSTI(), ProgInfo.NumVGPRsForWavesPerEU);
+      &STM, ProgInfo.NumVGPRsForWavesPerEU);
 
   // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
   // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 462b5feca6a3f35c761ca139d788f88e948e35e6..167ac4b21e1e20feb3dbcc39d168e2853d21d545 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -56,7 +56,7 @@ private:
   SIProgramInfo CurrentProgramInfo;
   DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
 
-  AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream;
+  std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
   std::map<uint32_t, uint32_t> PALMetadataMap;
 
   uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index fadc833e0142287424eb223c7b0043778cd1f0a7..c38b0e61558b3d2a2464c776b6dfb43dcb1f3aee 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPUHSAMetadataStreamer.h"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIProgramInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -36,11 +37,14 @@ static cl::opt<bool> VerifyHSAMetadata(
 namespace AMDGPU {
 namespace HSAMD {
 
-void MetadataStreamer::dump(StringRef HSAMetadataString) const {
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV2
+//===----------------------------------------------------------------------===//
+void MetadataStreamerV2::dump(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
 }
 
-void MetadataStreamer::verify(StringRef HSAMetadataString) const {
+void MetadataStreamerV2::verify(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata Parser Test: ";
 
   HSAMD::Metadata FromHSAMetadataString;
@@ -63,7 +67,8 @@ void MetadataStreamer::verify(StringRef HSAMetadataString) const {
   }
 }
 
-AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
+AccessQualifier
+MetadataStreamerV2::getAccessQualifier(StringRef AccQual) const {
   if (AccQual.empty())
     return AccessQualifier::Unknown;
 
@@ -74,7 +79,8 @@ AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
              .Default(AccessQualifier::Default);
 }
 
-AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer(
+AddressSpaceQualifier
+MetadataStreamerV2::getAddressSpaceQualifier(
     unsigned AddressSpace) const {
   switch (AddressSpace) {
   case AMDGPUAS::PRIVATE_ADDRESS:
@@ -94,8 +100,8 @@ AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer(
   }
 }
 
-ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
-                                         StringRef BaseTypeName) const {
+ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual,
+                                           StringRef BaseTypeName) const {
   if (TypeQual.find("pipe") != StringRef::npos)
     return ValueKind::Pipe;
 
@@ -122,7 +128,7 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
                       ValueKind::ByValue);
 }
 
-ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
+ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const {
   switch (Ty->getTypeID()) {
   case Type::IntegerTyID: {
     auto Signed = !TypeName.startswith("u");
@@ -154,7 +160,7 @@ ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
   }
 }
 
-std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
+std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const {
   switch (Ty->getTypeID()) {
   case Type::IntegerTyID: {
     if (!Signed)
@@ -191,8 +197,8 @@ std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
   }
 }
 
-std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
-    MDNode *Node) const {
+std::vector<uint32_t>
+MetadataStreamerV2::getWorkGroupDimensions(MDNode *Node) const {
   std::vector<uint32_t> Dims;
   if (Node->getNumOperands() != 3)
     return Dims;
@@ -202,9 +208,9 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
   return Dims;
 }
 
-Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
-    const MachineFunction &MF,
-    const SIProgramInfo &ProgramInfo) const {
+Kernel::CodeProps::Metadata
+MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
+                                    const SIProgramInfo &ProgramInfo) const {
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
@@ -231,9 +237,9 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
   return HSACodeProps;
 }
 
-Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
-    const MachineFunction &MF,
-    const SIProgramInfo &ProgramInfo) const {
+Kernel::DebugProps::Metadata
+MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF,
+                                     const SIProgramInfo &ProgramInfo) const {
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
 
@@ -253,14 +259,14 @@ Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
   return HSADebugProps;
 }
 
-void MetadataStreamer::emitVersion() {
+void MetadataStreamerV2::emitVersion() {
   auto &Version = HSAMetadata.mVersion;
 
   Version.push_back(VersionMajor);
   Version.push_back(VersionMinor);
 }
 
-void MetadataStreamer::emitPrintf(const Module &Mod) {
+void MetadataStreamerV2::emitPrintf(const Module &Mod) {
   auto &Printf = HSAMetadata.mPrintf;
 
   auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
@@ -272,7 +278,7 @@ void MetadataStreamer::emitPrintf(const Module &Mod) {
       Printf.push_back(cast<MDString>(Op->getOperand(0))->getString());
 }
 
-void MetadataStreamer::emitKernelLanguage(const Function &Func) {
+void MetadataStreamerV2::emitKernelLanguage(const Function &Func) {
   auto &Kernel = HSAMetadata.mKernels.back();
 
   // TODO: What about other languages?
@@ -290,7 +296,7 @@ void MetadataStreamer::emitKernelLanguage(const Function &Func) {
       mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue());
 }
 
-void MetadataStreamer::emitKernelAttrs(const Function &Func) {
+void MetadataStreamerV2::emitKernelAttrs(const Function &Func) {
   auto &Attrs = HSAMetadata.mKernels.back().mAttrs;
 
   if (auto Node = Func.getMetadata("reqd_work_group_size"))
@@ -308,14 +314,14 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
   }
 }
 
-void MetadataStreamer::emitKernelArgs(const Function &Func) {
+void MetadataStreamerV2::emitKernelArgs(const Function &Func) {
   for (auto &Arg : Func.args())
     emitKernelArg(Arg);
 
   emitHiddenKernelArgs(Func);
 }
 
-void MetadataStreamer::emitKernelArg(const Argument &Arg) {
+void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
   auto Func = Arg.getParent();
   auto ArgNo = Arg.getArgNo();
   const MDNode *Node;
@@ -368,12 +374,12 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
                 PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
 }
 
-void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
-                                     ValueKind ValueKind,
-                                     unsigned PointeeAlign,
-                                     StringRef Name,
-                                     StringRef TypeName, StringRef BaseTypeName,
-                                     StringRef AccQual, StringRef TypeQual) {
+void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
+                                       ValueKind ValueKind,
+                                       unsigned PointeeAlign, StringRef Name,
+                                       StringRef TypeName,
+                                       StringRef BaseTypeName,
+                                       StringRef AccQual, StringRef TypeQual) {
   HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
   auto &Arg = HSAMetadata.mKernels.back().mArgs.back();
 
@@ -386,7 +392,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
   Arg.mPointeeAlign = PointeeAlign;
 
   if (auto PtrTy = dyn_cast<PointerType>(Ty))
-    Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
+    Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace());
 
   Arg.mAccQual = getAccessQualifier(AccQual);
 
@@ -406,7 +412,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
   }
 }
 
-void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
+void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
   int HiddenArgNumBytes =
       getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
 
@@ -448,12 +454,16 @@ void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
   }
 }
 
-void MetadataStreamer::begin(const Module &Mod) {
+bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+  return TargetStreamer.EmitHSAMetadata(getHSAMetadata());
+}
+
+void MetadataStreamerV2::begin(const Module &Mod) {
   emitVersion();
   emitPrintf(Mod);
 }
 
-void MetadataStreamer::end() {
+void MetadataStreamerV2::end() {
   std::string HSAMetadataString;
   if (toString(HSAMetadata, HSAMetadataString))
     return;
@@ -464,7 +474,8 @@ void MetadataStreamer::end() {
     verify(HSAMetadataString);
 }
 
-void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) {
+void MetadataStreamerV2::emitKernel(const MachineFunction &MF,
+                                    const SIProgramInfo &ProgramInfo) {
   auto &Func = MF.getFunction();
   if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
     return;
@@ -484,6 +495,505 @@ void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo
   HSAMetadata.mKernels.back().mDebugProps = DebugProps;
 }
 
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV3
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerV3::dump(StringRef HSAMetadataString) const {
+  errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
+}
+
+void MetadataStreamerV3::verify(StringRef HSAMetadataString) const {
+  errs() << "AMDGPU HSA Metadata Parser Test: ";
+
+  std::shared_ptr<msgpack::Node> FromHSAMetadataString =
+      std::make_shared<msgpack::MapNode>();
+
+  yaml::Input YIn(HSAMetadataString);
+  YIn >> FromHSAMetadataString;
+  if (YIn.error()) {
+    errs() << "FAIL\n";
+    return;
+  }
+
+  std::string ToHSAMetadataString;
+  raw_string_ostream StrOS(ToHSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << FromHSAMetadataString;
+
+  errs() << (HSAMetadataString == StrOS.str() ? "PASS" : "FAIL") << '\n';
+  if (HSAMetadataString != ToHSAMetadataString) {
+    errs() << "Original input: " << HSAMetadataString << '\n'
+           << "Produced output: " << StrOS.str() << '\n';
+  }
+}
+
+Optional<StringRef>
+MetadataStreamerV3::getAccessQualifier(StringRef AccQual) const {
+  return StringSwitch<Optional<StringRef>>(AccQual)
+      .Case("read_only", StringRef("read_only"))
+      .Case("write_only", StringRef("write_only"))
+      .Case("read_write", StringRef("read_write"))
+      .Default(None);
+}
+
+Optional<StringRef>
+MetadataStreamerV3::getAddressSpaceQualifier(unsigned AddressSpace) const {
+  switch (AddressSpace) {
+  case AMDGPUAS::PRIVATE_ADDRESS:
+    return StringRef("private");
+  case AMDGPUAS::GLOBAL_ADDRESS:
+    return StringRef("global");
+  case AMDGPUAS::CONSTANT_ADDRESS:
+    return StringRef("constant");
+  case AMDGPUAS::LOCAL_ADDRESS:
+    return StringRef("local");
+  case AMDGPUAS::FLAT_ADDRESS:
+    return StringRef("generic");
+  case AMDGPUAS::REGION_ADDRESS:
+    return StringRef("region");
+  default:
+    return None;
+  }
+}
+
+StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual,
+                                           StringRef BaseTypeName) const {
+  if (TypeQual.find("pipe") != StringRef::npos)
+    return "pipe";
+
+  return StringSwitch<StringRef>(BaseTypeName)
+      .Case("image1d_t", "image")
+      .Case("image1d_array_t", "image")
+      .Case("image1d_buffer_t", "image")
+      .Case("image2d_t", "image")
+      .Case("image2d_array_t", "image")
+      .Case("image2d_array_depth_t", "image")
+      .Case("image2d_array_msaa_t", "image")
+      .Case("image2d_array_msaa_depth_t", "image")
+      .Case("image2d_depth_t", "image")
+      .Case("image2d_msaa_t", "image")
+      .Case("image2d_msaa_depth_t", "image")
+      .Case("image3d_t", "image")
+      .Case("sampler_t", "sampler")
+      .Case("queue_t", "queue")
+      .Default(isa<PointerType>(Ty)
+                   ? (Ty->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
+                          ? "dynamic_shared_pointer"
+                          : "global_buffer")
+                   : "by_value");
+}
+
+StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    auto Signed = !TypeName.startswith("u");
+    switch (Ty->getIntegerBitWidth()) {
+    case 8:
+      return Signed ? "i8" : "u8";
+    case 16:
+      return Signed ? "i16" : "u16";
+    case 32:
+      return Signed ? "i32" : "u32";
+    case 64:
+      return Signed ? "i64" : "u64";
+    default:
+      return "struct";
+    }
+  }
+  case Type::HalfTyID:
+    return "f16";
+  case Type::FloatTyID:
+    return "f32";
+  case Type::DoubleTyID:
+    return "f64";
+  case Type::PointerTyID:
+    return getValueType(Ty->getPointerElementType(), TypeName);
+  case Type::VectorTyID:
+    return getValueType(Ty->getVectorElementType(), TypeName);
+  default:
+    return "struct";
+  }
+}
+
+std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    if (!Signed)
+      return (Twine('u') + getTypeName(Ty, true)).str();
+
+    auto BitWidth = Ty->getIntegerBitWidth();
+    switch (BitWidth) {
+    case 8:
+      return "char";
+    case 16:
+      return "short";
+    case 32:
+      return "int";
+    case 64:
+      return "long";
+    default:
+      return (Twine('i') + Twine(BitWidth)).str();
+    }
+  }
+  case Type::HalfTyID:
+    return "half";
+  case Type::FloatTyID:
+    return "float";
+  case Type::DoubleTyID:
+    return "double";
+  case Type::VectorTyID: {
+    auto VecTy = cast<VectorType>(Ty);
+    auto ElTy = VecTy->getElementType();
+    auto NumElements = VecTy->getVectorNumElements();
+    return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
+  }
+  default:
+    return "unknown";
+  }
+}
+
+std::shared_ptr<msgpack::ArrayNode>
+MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const {
+  auto Dims = std::make_shared<msgpack::ArrayNode>();
+  if (Node->getNumOperands() != 3)
+    return Dims;
+
+  for (auto &Op : Node->operands())
+    Dims->push_back(std::make_shared<msgpack::ScalarNode>(
+        mdconst::extract<ConstantInt>(Op)->getZExtValue()));
+  return Dims;
+}
+
+void MetadataStreamerV3::emitVersion() {
+  auto Version = std::make_shared<msgpack::ArrayNode>();
+  Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMajor));
+  Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMinor));
+  getRootMetadata("amdhsa.version") = std::move(Version);
+}
+
+void MetadataStreamerV3::emitPrintf(const Module &Mod) {
+  auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
+  if (!Node)
+    return;
+
+  auto Printf = std::make_shared<msgpack::ArrayNode>();
+  for (auto Op : Node->operands())
+    if (Op->getNumOperands())
+      Printf->push_back(std::make_shared<msgpack::ScalarNode>(
+          cast<MDString>(Op->getOperand(0))->getString()));
+  getRootMetadata("amdhsa.printf") = std::move(Printf);
+}
+
+void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
+                                            msgpack::MapNode &Kern) {
+  // TODO: What about other languages?
+  auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
+  if (!Node || !Node->getNumOperands())
+    return;
+  auto Op0 = Node->getOperand(0);
+  if (Op0->getNumOperands() <= 1)
+    return;
+
+  Kern[".language"] = std::make_shared<msgpack::ScalarNode>("OpenCL C");
+  auto LanguageVersion = std::make_shared<msgpack::ArrayNode>();
+  LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+      mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));
+  LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+      mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));
+  Kern[".language_version"] = std::move(LanguageVersion);
+}
+
+void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
+                                         msgpack::MapNode &Kern) {
+
+  if (auto Node = Func.getMetadata("reqd_work_group_size"))
+    Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node);
+  if (auto Node = Func.getMetadata("work_group_size_hint"))
+    Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node);
+  if (auto Node = Func.getMetadata("vec_type_hint")) {
+    Kern[".vec_type_hint"] = std::make_shared<msgpack::ScalarNode>(getTypeName(
+        cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+        mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()));
+  }
+  if (Func.hasFnAttribute("runtime-handle")) {
+    Kern[".device_enqueue_symbol"] = std::make_shared<msgpack::ScalarNode>(
+        Func.getFnAttribute("runtime-handle").getValueAsString().str());
+  }
+}
+
+void MetadataStreamerV3::emitKernelArgs(const Function &Func,
+                                        msgpack::MapNode &Kern) {
+  unsigned Offset = 0;
+  auto Args = std::make_shared<msgpack::ArrayNode>();
+  for (auto &Arg : Func.args())
+    emitKernelArg(Arg, Offset, *Args);
+
+  emitHiddenKernelArgs(Func, Offset, *Args);
+
+  // TODO: What about other languages?
+  if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) {
+    auto &DL = Func.getParent()->getDataLayout();
+    auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, *Args);
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, *Args);
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, *Args);
+
+    auto Int8PtrTy =
+        Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+
+    // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+    // "none" argument.
+    if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+      emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, *Args);
+    else
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+
+    // Emit "default queue" and "completion action" arguments if enqueue kernel
+    // is used, otherwise emit dummy "none" arguments.
+    if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+      emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, *Args);
+      emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, *Args);
+    } else {
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+    }
+  }
+
+  Kern[".args"] = std::move(Args);
+}
+
+void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
+                                       msgpack::ArrayNode &Args) {
+  auto Func = Arg.getParent();
+  auto ArgNo = Arg.getArgNo();
+  const MDNode *Node;
+
+  StringRef Name;
+  Node = Func->getMetadata("kernel_arg_name");
+  if (Node && ArgNo < Node->getNumOperands())
+    Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
+  else if (Arg.hasName())
+    Name = Arg.getName();
+
+  StringRef TypeName;
+  Node = Func->getMetadata("kernel_arg_type");
+  if (Node && ArgNo < Node->getNumOperands())
+    TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef BaseTypeName;
+  Node = Func->getMetadata("kernel_arg_base_type");
+  if (Node && ArgNo < Node->getNumOperands())
+    BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef AccQual;
+  if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() &&
+      Arg.hasNoAliasAttr()) {
+    AccQual = "read_only";
+  } else {
+    Node = Func->getMetadata("kernel_arg_access_qual");
+    if (Node && ArgNo < Node->getNumOperands())
+      AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+  }
+
+  StringRef TypeQual;
+  Node = Func->getMetadata("kernel_arg_type_qual");
+  if (Node && ArgNo < Node->getNumOperands())
+    TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  Type *Ty = Arg.getType();
+  const DataLayout &DL = Func->getParent()->getDataLayout();
+
+  unsigned PointeeAlign = 0;
+  if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+    if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      PointeeAlign = Arg.getParamAlignment();
+      if (PointeeAlign == 0)
+        PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+    }
+  }
+
+  emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
+                getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset,
+                Args, PointeeAlign, Name, TypeName, BaseTypeName, AccQual,
+                TypeQual);
+}
+
+void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
+                                       StringRef ValueKind, unsigned &Offset,
+                                       msgpack::ArrayNode &Args,
+                                       unsigned PointeeAlign, StringRef Name,
+                                       StringRef TypeName,
+                                       StringRef BaseTypeName,
+                                       StringRef AccQual, StringRef TypeQual) {
+  auto ArgPtr = std::make_shared<msgpack::MapNode>();
+  auto &Arg = *ArgPtr;
+
+  if (!Name.empty())
+    Arg[".name"] = std::make_shared<msgpack::ScalarNode>(Name);
+  if (!TypeName.empty())
+    Arg[".type_name"] = std::make_shared<msgpack::ScalarNode>(TypeName);
+  auto Size = DL.getTypeAllocSize(Ty);
+  auto Align = DL.getABITypeAlignment(Ty);
+  Arg[".size"] = std::make_shared<msgpack::ScalarNode>(Size);
+  Offset = alignTo(Offset, Align);
+  Arg[".offset"] = std::make_shared<msgpack::ScalarNode>(Offset);
+  Offset += Size;
+  Arg[".value_kind"] = std::make_shared<msgpack::ScalarNode>(ValueKind);
+  Arg[".value_type"] =
+      std::make_shared<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName));
+  if (PointeeAlign)
+    Arg[".pointee_align"] = std::make_shared<msgpack::ScalarNode>(PointeeAlign);
+
+  if (auto PtrTy = dyn_cast<PointerType>(Ty))
+    if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace()))
+      Arg[".address_space"] = std::make_shared<msgpack::ScalarNode>(*Qualifier);
+
+  if (auto AQ = getAccessQualifier(AccQual))
+    Arg[".access"] = std::make_shared<msgpack::ScalarNode>(*AQ);
+
+  // TODO: Emit Arg[".actual_access"].
+
+  SmallVector<StringRef, 1> SplitTypeQuals;
+  TypeQual.split(SplitTypeQuals, " ", -1, false);
+  for (StringRef Key : SplitTypeQuals) {
+    if (Key == "const")
+      Arg[".is_const"] = std::make_shared<msgpack::ScalarNode>(true);
+    else if (Key == "restrict")
+      Arg[".is_restrict"] = std::make_shared<msgpack::ScalarNode>(true);
+    else if (Key == "volatile")
+      Arg[".is_volatile"] = std::make_shared<msgpack::ScalarNode>(true);
+    else if (Key == "pipe")
+      Arg[".is_pipe"] = std::make_shared<msgpack::ScalarNode>(true);
+  }
+
+  Args.push_back(std::move(ArgPtr));
+}
+
+void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
+                                              unsigned &Offset,
+                                              msgpack::ArrayNode &Args) {
+  int HiddenArgNumBytes =
+      getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
+
+  if (!HiddenArgNumBytes)
+    return;
+
+  auto &DL = Func.getParent()->getDataLayout();
+  auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+  if (HiddenArgNumBytes >= 8)
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, Args);
+  if (HiddenArgNumBytes >= 16)
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, Args);
+  if (HiddenArgNumBytes >= 24)
+    emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, Args);
+
+  auto Int8PtrTy =
+      Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+
+  // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+  // "none" argument.
+  if (HiddenArgNumBytes >= 32) {
+    if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+      emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args);
+    else
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+  }
+
+  // Emit "default queue" and "completion action" arguments if enqueue kernel is
+  // used, otherwise emit dummy "none" arguments.
+  if (HiddenArgNumBytes >= 48) {
+    if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+      emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, Args);
+    } else {
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+    }
+  }
+}
+
+std::shared_ptr<msgpack::MapNode>
+MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
+                                      const SIProgramInfo &ProgramInfo) const {
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  const Function &F = MF.getFunction();
+
+  auto HSAKernelProps = std::make_shared<msgpack::MapNode>();
+  auto &Kern = *HSAKernelProps;
+
+  unsigned MaxKernArgAlign;
+  Kern[".kernarg_segment_size"] = std::make_shared<msgpack::ScalarNode>(
+      STM.getKernArgSegmentSize(F, MaxKernArgAlign));
+  Kern[".group_segment_fixed_size"] =
+      std::make_shared<msgpack::ScalarNode>(ProgramInfo.LDSSize);
+  Kern[".private_segment_fixed_size"] =
+      std::make_shared<msgpack::ScalarNode>(ProgramInfo.ScratchSize);
+  Kern[".kernarg_segment_align"] =
+      std::make_shared<msgpack::ScalarNode>(std::max(uint32_t(4), MaxKernArgAlign));
+  Kern[".wavefront_size"] =
+      std::make_shared<msgpack::ScalarNode>(STM.getWavefrontSize());
+  Kern[".sgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumSGPR);
+  Kern[".vgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumVGPR);
+  Kern[".max_flat_workgroup_size"] =
+      std::make_shared<msgpack::ScalarNode>(MFI.getMaxFlatWorkGroupSize());
+  Kern[".sgpr_spill_count"] =
+      std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledSGPRs());
+  Kern[".vgpr_spill_count"] =
+      std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledVGPRs());
+
+  return HSAKernelProps;
+}
+
+bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+  return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), true);
+}
+
+void MetadataStreamerV3::begin(const Module &Mod) {
+  emitVersion();
+  emitPrintf(Mod);
+  getRootMetadata("amdhsa.kernels").reset(new msgpack::ArrayNode());
+}
+
+void MetadataStreamerV3::end() {
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << HSAMetadataRoot;
+
+  if (DumpHSAMetadata)
+    dump(StrOS.str());
+  if (VerifyHSAMetadata)
+    verify(StrOS.str());
+}
+
+void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
+                                    const SIProgramInfo &ProgramInfo) {
+  auto &Func = MF.getFunction();
+  auto KernelProps = getHSAKernelProps(MF, ProgramInfo);
+
+  assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+         Func.getCallingConv() == CallingConv::SPIR_KERNEL);
+
+  auto &KernelsNode = getRootMetadata("amdhsa.kernels");
+  auto Kernels = cast<msgpack::ArrayNode>(KernelsNode.get());
+
+  {
+    auto &Kern = *KernelProps;
+    Kern[".name"] = std::make_shared<msgpack::ScalarNode>(Func.getName());
+    Kern[".symbol"] = std::make_shared<msgpack::ScalarNode>(
+        (Twine(Func.getName()) + Twine(".kd")).str());
+    emitKernelLanguage(Func, Kern);
+    emitKernelAttrs(Func, Kern);
+    emitKernelArgs(Func, Kern);
+  }
+
+  Kernels->push_back(std::move(KernelProps));
+}
+
 } // end namespace HSAMD
 } // end namespace AMDGPU
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index a1e08235a5ec4234012080b4a8253ccfd586d56d..afc09baf952d6e7c869cd9435600da4a14bf3ed6 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -19,10 +19,12 @@
 #include "AMDGPU.h"
 #include "AMDKernelCodeT.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/Support/AMDGPUMetadata.h"
 
 namespace llvm {
 
+class AMDGPUTargetStreamer;
 class Argument;
 class DataLayout;
 class Function;
@@ -34,7 +36,92 @@ class Type;
 namespace AMDGPU {
 namespace HSAMD {
 
-class MetadataStreamer final {
+class MetadataStreamer {
+public:
+  virtual ~MetadataStreamer(){};
+
+  virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0;
+
+  virtual void begin(const Module &Mod) = 0;
+
+  virtual void end() = 0;
+
+  virtual void emitKernel(const MachineFunction &MF,
+                          const SIProgramInfo &ProgramInfo) = 0;
+};
+
+class MetadataStreamerV3 final : public MetadataStreamer {
+private:
+  std::shared_ptr<msgpack::Node> HSAMetadataRoot =
+      std::make_shared<msgpack::MapNode>();
+
+  void dump(StringRef HSAMetadataString) const;
+
+  void verify(StringRef HSAMetadataString) const;
+
+  Optional<StringRef> getAccessQualifier(StringRef AccQual) const;
+
+  Optional<StringRef> getAddressSpaceQualifier(unsigned AddressSpace) const;
+
+  StringRef getValueKind(Type *Ty, StringRef TypeQual,
+                         StringRef BaseTypeName) const;
+
+  StringRef getValueType(Type *Ty, StringRef TypeName) const;
+
+  std::string getTypeName(Type *Ty, bool Signed) const;
+
+  std::shared_ptr<msgpack::ArrayNode>
+  getWorkGroupDimensions(MDNode *Node) const;
+
+  std::shared_ptr<msgpack::MapNode>
+  getHSAKernelProps(const MachineFunction &MF,
+                    const SIProgramInfo &ProgramInfo) const;
+
+  void emitVersion();
+
+  void emitPrintf(const Module &Mod);
+
+  void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern);
+
+  void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern);
+
+  void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern);
+
+  void emitKernelArg(const Argument &Arg, unsigned &Offset,
+                     msgpack::ArrayNode &Args);
+
+  void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
+                     unsigned &Offset, msgpack::ArrayNode &Args,
+                     unsigned PointeeAlign = 0, StringRef Name = "",
+                     StringRef TypeName = "", StringRef BaseTypeName = "",
+                     StringRef AccQual = "", StringRef TypeQual = "");
+
+  void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
+                            msgpack::ArrayNode &Args);
+
+  std::shared_ptr<msgpack::Node> &getRootMetadata(StringRef Key) {
+    return (*cast<msgpack::MapNode>(HSAMetadataRoot.get()))[Key];
+  }
+
+  std::shared_ptr<msgpack::Node> &getHSAMetadataRoot() {
+    return HSAMetadataRoot;
+  }
+
+public:
+  MetadataStreamerV3() = default;
+  ~MetadataStreamerV3() = default;
+
+  bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
+
+  void begin(const Module &Mod) override;
+
+  void end() override;
+
+  void emitKernel(const MachineFunction &MF,
+                  const SIProgramInfo &ProgramInfo) override;
+};
+
+class MetadataStreamerV2 final : public MetadataStreamer {
 private:
   Metadata HSAMetadata;
 
@@ -44,7 +131,7 @@ private:
 
   AccessQualifier getAccessQualifier(StringRef AccQual) const;
 
-  AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const;
+  AddressSpaceQualifier getAddressSpaceQualifier(unsigned AddressSpace) const;
 
   ValueKind getValueKind(Type *Ty, StringRef TypeQual,
                          StringRef BaseTypeName) const;
@@ -82,19 +169,22 @@ private:
 
   void emitHiddenKernelArgs(const Function &Func);
 
-public:
-  MetadataStreamer() = default;
-  ~MetadataStreamer() = default;
-
   const Metadata &getHSAMetadata() const {
     return HSAMetadata;
   }
 
-  void begin(const Module &Mod);
+public:
+  MetadataStreamerV2() = default;
+  ~MetadataStreamerV2() = default;
+
+  bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
+
+  void begin(const Module &Mod) override;
 
-  void end();
+  void end() override;
 
-  void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo);
+  void emitKernel(const MachineFunction &MF,
+                  const SIProgramInfo &ProgramInfo) override;
 };
 
 } // end namespace HSAMD
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ad0a9e388af894f4ca8d2f6f7e49b8b8950726e0..f5894c9022a6e5e3c142785cb4e640f99fcaec72 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -655,8 +655,11 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
 }
 
 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
-                                                 ISD::LoadExtType,
+                                                 ISD::LoadExtType ExtTy,
                                                  EVT NewVT) const {
+  // TODO: This may be worth removing. Check regression tests for diffs.
+  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
+    return false;
 
   unsigned NewSize = NewVT.getStoreSizeInBits();
 
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index b7d1575ca898fc55254532018543dddc26ec4340..282d1c118333629717d6eecbd8f4c0791b5e5193 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -135,6 +135,12 @@ def brtarget   : Operand<OtherVT>;
 // Misc. PatFrags
 //===----------------------------------------------------------------------===//
 
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0),
+  (op $src0),
+  [{ return N->hasOneUse(); }]
+>;
+
 class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
   (ops node:$src0, node:$src1),
   (op $src0, $src1),
@@ -165,6 +171,8 @@ def or_oneuse : HasOneUseBinOp<or>;
 def xor_oneuse : HasOneUseBinOp<xor>;
 } // Properties = [SDNPCommutative, SDNPAssociative]
 
+def not_oneuse : HasOneUseUnaryOp<not>;
+
 def add_oneuse : HasOneUseBinOp<add>;
 def sub_oneuse : HasOneUseBinOp<sub>;
 
@@ -796,18 +804,30 @@ class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
   (BIT_ALIGN $src0, $src0, $src1)
 >;
 
-// This matches 16 permutations of
-// max(min(x, y), min(max(x, y), z))
-class IntMed3Pat<Instruction med3Inst,
+multiclass IntMed3Pat<Instruction med3Inst,
+                 SDPatternOperator min,
                  SDPatternOperator max,
-                 SDPatternOperator max_oneuse,
                  SDPatternOperator min_oneuse,
-                 ValueType vt = i32> : AMDGPUPat<
+                 SDPatternOperator max_oneuse,
+                 ValueType vt = i32> {
+
+  // This matches 16 permutations of 
+  // min(max(a, b), max(min(a, b), c))
+  def : AMDGPUPat <
+  (min (max_oneuse vt:$src0, vt:$src1),
+       (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)),
+  (med3Inst vt:$src0, vt:$src1, vt:$src2)
+>;
+
+  // This matches 16 permutations of 
+  // max(min(x, y), min(max(x, y), z))
+  def : AMDGPUPat <
   (max (min_oneuse vt:$src0, vt:$src1),
        (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
   (med3Inst $src0, $src1, $src2)
 >;
-
+}
+  
 // Special conversion patterns
 
 def cvt_rpi_i32_f32 : PatFrag <
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 896e2055cf620615788f7bb369a8289027d22423..02108ca3ddd783cbe3d355139faec0b2cad4c878 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -40,7 +40,7 @@ StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
   if (IntrID < Intrinsic::num_intrinsics)
     return StringRef();
 
-  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
+  assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics &&
          "Invalid intrinsic ID");
 
   return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
@@ -91,7 +91,7 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
     = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
 
   AttributeList AS =
-      getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID));
+      getAttributes(M->getContext(), static_cast<SIIntrinsic::ID>(IntrID));
   F->setAttributes(AS);
   return F;
 }
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index ef42f9a319af69ca3541040e0f15ef09d6e7790a..a1a094dded23d007a277f3a68152a56dbbbb0e60 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -20,7 +20,7 @@
 namespace llvm {
 class TargetMachine;
 
-namespace AMDGPUIntrinsic {
+namespace SIIntrinsic {
 enum ID {
   last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
 #define GET_INTRINSIC_ENUM_VALUES
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 877251a3ac40384ab84fc64b0e0f311355a6e194..62fe0f3a8b91aa38c3e651307940f2667f7fdbf3 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -45,6 +45,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
 
+  const LLT CodePtr = FlatPtr;
+
   const LLT AddrSpaces[] = {
     GlobalPtr,
     ConstantPtr,
@@ -88,17 +90,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
   // between these two scenarios.
   setAction({G_CONSTANT, S1}, Legal);
 
-  setAction({G_FADD, S32}, Legal);
+  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
+
+  getActionDefinitionsBuilder(
+    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA})
+    .legalFor({S32, S64});
+
+  // Use actual fsub instruction
+  setAction({G_FSUB, S32}, Legal);
+
+  // Must use fadd + fneg
+  setAction({G_FSUB, S64}, Lower);
 
   setAction({G_FCMP, S1}, Legal);
   setAction({G_FCMP, 1, S32}, Legal);
   setAction({G_FCMP, 1, S64}, Legal);
 
-  setAction({G_FMUL, S32}, Legal);
-
   setAction({G_ZEXT, S64}, Legal);
   setAction({G_ZEXT, 1, S32}, Legal);
 
+  setAction({G_SEXT, S64}, Legal);
+  setAction({G_SEXT, 1, S32}, Legal);
+
+  setAction({G_ANYEXT, S64}, Legal);
+  setAction({G_ANYEXT, 1, S32}, Legal);
+
   setAction({G_FPTOSI, S32}, Legal);
   setAction({G_FPTOSI, 1, S32}, Legal);
 
@@ -114,14 +130,28 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
     setAction({G_GEP, 1, IdxTy}, Legal);
   }
 
+  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
+
   setAction({G_ICMP, S1}, Legal);
   setAction({G_ICMP, 1, S32}, Legal);
 
+  setAction({G_CTLZ, S32}, Legal);
+  setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal);
+  setAction({G_CTTZ, S32}, Legal);
+  setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal);
+  setAction({G_BSWAP, S32}, Legal);
+  setAction({G_CTPOP, S32}, Legal);
+
   getActionDefinitionsBuilder(G_INTTOPTR)
     .legalIf([](const LegalityQuery &Query) {
       return true;
     });
 
+  getActionDefinitionsBuilder(G_PTRTOINT)
+    .legalIf([](const LegalityQuery &Query) {
+      return true;
+    });
+
   getActionDefinitionsBuilder({G_LOAD, G_STORE})
     .legalIf([=, &ST](const LegalityQuery &Query) {
         const LLT &Ty0 = Query.Types[0];
@@ -182,6 +212,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
                (Ty1.getSizeInBits() % 32 == 0);
       });
 
+  getActionDefinitionsBuilder(G_BUILD_VECTOR)
+      .legalIf([=](const LegalityQuery &Query) {
+        const LLT &VecTy = Query.Types[0];
+        const LLT &ScalarTy = Query.Types[1];
+        return VecTy.getSizeInBits() % 32 == 0 &&
+               ScalarTy.getSizeInBits() % 32 == 0 &&
+               VecTy.getSizeInBits() <= 512;
+      });
   // Merge/Unmerge
   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index f37795e961e806c3ae17178e1cfb43adfb9634b0..4fc3fe0f105b0823a4a5d926ce9576950f87a74e 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -995,8 +995,10 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
   } else {
     AttributeList Attr;
     LLVMContext &Ctx = M->getContext();
-    Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly);
-    Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind);
+    Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
+                             Attribute::ReadOnly);
+    Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
+                             Attribute::NoUnwind);
     C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
   }
 
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index fae1da9233770c32625df0147f59b3dd3848377d..743dc7a0d00b97fb3b119cb6502687fe35aaafa8 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -122,14 +122,17 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
 
     VectorType *VT = dyn_cast<VectorType>(ArgTy);
     bool IsV3 = VT && VT->getNumElements() == 3;
+    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
+
     VectorType *V4Ty = nullptr;
 
     int64_t AlignDownOffset = alignDown(EltOffset, 4);
     int64_t OffsetDiff = EltOffset - AlignDownOffset;
-    unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+    unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
+                                      KernArgBaseAlign);
 
     Value *ArgPtr;
-    if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
+    if (DoShiftOpt) { // FIXME: Handle aggregate types
       // Since we don't have sub-dword scalar loads, avoid doing an extload by
       // loading earlier than the argument address, and extracting the relevant
       // bits.
@@ -147,7 +150,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
     } else {
       ArgPtr = Builder.CreateConstInBoundsGEP1_64(
         KernArgSegment,
-        AlignDownOffset,
+        EltOffset,
         Arg.getName() + ".kernarg.offset");
       ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
                                      ArgPtr->getName() + ".cast");
@@ -198,7 +201,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
 
     // TODO: Convert noalias arg to !noalias
 
-    if (Size < 32 && !ArgTy->isAggregateType()) {
+    if (DoShiftOpt) {
       Value *ExtractBits = OffsetDiff == 0 ?
         Load : Builder.CreateLShr(Load, OffsetDiff * 8);
 
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 995d9ae3907fc5c2d1126cf2cb311b9a9b0d812b..5e0b7d4290220ad0ac854b1bb71d21407628add1 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -42,9 +42,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
     if (!FirstMI)
       return true;
 
+    const MachineBasicBlock &MBB = *FirstMI->getParent();
+    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
     const MachineOperand *Src2 = TII.getNamedOperand(SecondMI,
                                                      AMDGPU::OpName::src2);
-    return FirstMI->definesRegister(Src2->getReg());
+    return FirstMI->definesRegister(Src2->getReg(), TRI);
   }
   default:
     return false;
diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h
index b50a2eb8e9e714aea0fec14d1a9386764707dc78..2feff14d34a15864df2e444cc71ec2ade690101a 100644
--- a/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -23,7 +23,8 @@ namespace ElfNote {
 
 const char SectionName[] = ".note";
 
-const char NoteName[] = "AMD";
+const char NoteNameV2[] = "AMD";
+const char NoteNameV3[] = "AMDGPU";
 
 // TODO: Remove this file once we drop code object v2.
 enum NoteType{
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 05b714f924b938f630c72b9b42fd2690610bb959..e53c02202df5bb5c101f29502ff941a0bf7db503 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -35,7 +35,7 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
     : AMDGPUGenRegisterBankInfo(),
       TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
 
-  // HACK: Until this is fully tablegen'd
+  // HACK: Until this is fully tablegen'd.
   static bool AlreadyInit = false;
   if (AlreadyInit)
     return;
@@ -354,9 +354,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     LLVM_FALLTHROUGH;
 
   case AMDGPU::G_FADD:
+  case AMDGPU::G_FSUB:
   case AMDGPU::G_FPTOSI:
   case AMDGPU::G_FPTOUI:
   case AMDGPU::G_FMUL:
+  case AMDGPU::G_FMA:
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_IMPLICIT_DEF: {
     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -364,7 +366,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     break;
   }
   case AMDGPU::G_FCONSTANT:
-  case AMDGPU::G_CONSTANT: {
+  case AMDGPU::G_CONSTANT:
+  case AMDGPU::G_FRAME_INDEX:
+  case AMDGPU::G_BLOCK_ADDR: {
     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     break;
@@ -402,8 +406,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
     break;
   }
+  case AMDGPU::G_BITCAST:
   case AMDGPU::G_INTTOPTR:
-  case AMDGPU::G_BITCAST: {
+  case AMDGPU::G_PTRTOINT:
+  case AMDGPU::G_CTLZ:
+  case AMDGPU::G_CTLZ_ZERO_UNDEF:
+  case AMDGPU::G_CTTZ:
+  case AMDGPU::G_CTTZ_ZERO_UNDEF:
+  case AMDGPU::G_CTPOP:
+  case AMDGPU::G_BSWAP:
+  case AMDGPU::G_FABS:
+  case AMDGPU::G_FNEG: {
     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
@@ -419,7 +432,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
     break;
   }
-  case AMDGPU::G_ZEXT: {
+  case AMDGPU::G_ZEXT:
+  case AMDGPU::G_SEXT:
+  case AMDGPU::G_ANYEXT: {
     unsigned Dst = MI.getOperand(0).getReg();
     unsigned Src = MI.getOperand(1).getReg();
     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 8b1cb23c6722af19e6fe9472e3aeae84d4ff6971..886aca42b6cb1eb3a8a1bbe9747b423b61f69906 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -581,7 +581,8 @@ public:
   }
 
   bool hasCodeObjectV3() const {
-    return CodeObjectV3;
+    // FIXME: Need to add code object v3 support for mesa and pal.
+    return isAmdHsaOS() ? CodeObjectV3 : false;
   }
 
   bool hasUnalignedBufferAccess() const {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 403dace533a335181d7e0509a86ff821cd1145b9..70d365f4ad75d51734bd338cf3882868785e1251 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -106,6 +106,11 @@ static cl::opt<bool> EnableSDWAPeephole(
   cl::desc("Enable SDWA peepholer"),
   cl::init(true));
 
+static cl::opt<bool> EnableDPPCombine(
+  "amdgpu-dpp-combine",
+  cl::desc("Enable DPP combiner"),
+  cl::init(false));
+
 // Enable address space based alias analysis
 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
   cl::desc("Enable AMDGPU Alias Analysis"),
@@ -145,6 +150,13 @@ static cl::opt<bool> EnableAtomicOptimizations(
   cl::init(false),
   cl::Hidden);
 
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+  "amdgpu-mode-register",
+  cl::desc("Enable mode register pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -158,9 +170,11 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeR600VectorRegMergerPass(*PR);
   initializeGlobalISel(*PR);
   initializeAMDGPUDAGToDAGISelPass(*PR);
+  initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
+  initializeSIFixupVectorISelPass(*PR);
   initializeSIFoldOperandsPass(*PR);
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
@@ -182,6 +196,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
+  initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
@@ -303,12 +318,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return Reloc::PIC_;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                          StringRef CPU, StringRef FS,
                                          TargetOptions Options,
@@ -317,7 +326,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                          CodeGenOpt::Level OptLevel)
     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                         FS, Options, getEffectiveRelocModel(RM),
-                        getEffectiveCodeModel(CM), OptLevel),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
       TLOF(createTLOF(getTargetTriple())) {
   initAsmInfo();
 }
@@ -789,6 +798,8 @@ void GCNPassConfig::addMachineSSAOptimization() {
   //
   // XXX - Can we get away without running DeadMachineInstructionElim again?
   addPass(&SIFoldOperandsID);
+  if (EnableDPPCombine)
+    addPass(&GCNDPPCombineID);
   addPass(&DeadMachineInstructionElimID);
   addPass(&SILoadStoreOptimizerID);
   if (EnableSDWAPeephole) {
@@ -813,6 +824,7 @@ bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
   addPass(&SIFixSGPRCopiesID);
   addPass(createSILowerI1CopiesPass());
+  addPass(createSIFixupVectorISelPass());
   return false;
 }
 
@@ -878,7 +890,8 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
 
 void GCNPassConfig::addPostRegAlloc() {
   addPass(&SIFixVGPRCopiesID);
-  addPass(&SIOptimizeExecMaskingID);
+  if (getOptLevel() > CodeGenOpt::None)
+    addPass(&SIOptimizeExecMaskingID);
   TargetPassConfig::addPostRegAlloc();
 }
 
@@ -889,6 +902,7 @@ void GCNPassConfig::addPreEmitPass() {
   addPass(createSIMemoryLegalizerPass());
   addPass(createSIInsertWaitcntsPass());
   addPass(createSIShrinkInstructionsPass());
+  addPass(createSIModeRegisterPass());
 
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index e2f718bd3c34d5058c04efa6c191ff8ac940219f..7a720717cc800857a67e01da6e8423ad1e2fd2f4 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -29,3 +29,13 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
 
   return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
 }
+
+MCSection *AMDGPUTargetObjectFile::getExplicitSectionGlobal(
+    const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const {
+  // Set metadata access for the explicit section
+  StringRef SectionName = GO->getSection();
+  if (SectionName.startswith(".AMDGPU.metadata."))
+    SK = SectionKind::getMetadata();
+
+  return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index dd9dc1a88fc2b68148bc083b9da0560452ccc3bb..a4ae1a2c18c26bdbdace155b7923ca9be4aa83e0 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -26,6 +26,8 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
   public:
     MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
                                       const TargetMachine &TM) const override;
+    MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+                                        const TargetMachine &TM) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index bd94193087207ab8b5f4215ebdc767b77e41f5e2..3f9af27a2e5e1d2fec59755ae1896ff42b2abf50 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -3065,9 +3065,18 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() {
 }
 
 bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
+  const char *AssemblerDirectiveBegin;
+  const char *AssemblerDirectiveEnd;
+  std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
+      AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())
+          ? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
+                            HSAMD::V3::AssemblerDirectiveEnd)
+          : std::make_tuple(HSAMD::AssemblerDirectiveBegin,
+                            HSAMD::AssemblerDirectiveEnd);
+
   if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) {
     return Error(getParser().getTok().getLoc(),
-                 (Twine(HSAMD::AssemblerDirectiveBegin) + Twine(" directive is "
+                 (Twine(AssemblerDirectiveBegin) + Twine(" directive is "
                  "not available on non-amdhsa OSes")).str());
   }
 
@@ -3085,7 +3094,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
 
     if (getLexer().is(AsmToken::Identifier)) {
       StringRef ID = getLexer().getTok().getIdentifier();
-      if (ID == AMDGPU::HSAMD::AssemblerDirectiveEnd) {
+      if (ID == AssemblerDirectiveEnd) {
         Lex();
         FoundEnd = true;
         break;
@@ -3107,8 +3116,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
 
   YamlStream.flush();
 
-  if (!getTargetStreamer().EmitHSAMetadata(HSAMetadataString))
-    return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  if (IsaInfo::hasCodeObjectV3(&getSTI())) {
+    if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
+      return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  } else {
+    if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
+      return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  }
 
   return false;
 }
@@ -3145,6 +3159,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
 
     if (IDVal == ".amdhsa_kernel")
       return ParseDirectiveAMDHSAKernel();
+
+    // TODO: Restructure/combine with PAL metadata directive.
+    if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin)
+      return ParseDirectiveHSAMetadata();
   } else {
     if (IDVal == ".hsa_code_object_version")
       return ParseDirectiveHSACodeObjectVersion();
@@ -3160,10 +3178,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
 
     if (IDVal == ".amd_amdgpu_isa")
       return ParseDirectiveISAVersion();
-  }
 
-  if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
-    return ParseDirectiveHSAMetadata();
+    if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
+      return ParseDirectiveHSAMetadata();
+  }
 
   if (IDVal == PALMD::AssemblerDirective)
     return ParseDirectivePALMetadata();
@@ -5275,12 +5293,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
     ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
   }
 
-  // All DPP instructions with at least one source operand have a fake "old"
-  // source at the beginning that's tied to the dst operand. Handle it here.
-  if (Desc.getNumOperands() >= 2)
-    Inst.addOperand(Inst.getOperand(0));
-
   for (unsigned E = Operands.size(); I != E; ++I) {
+    auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
+                                            MCOI::TIED_TO);
+    if (TiedTo != -1) {
+      assert((unsigned)TiedTo < Inst.getNumOperands());
+      // handle tied old or src2 for MAC instructions
+      Inst.addOperand(Inst.getOperand(TiedTo));
+    }
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
     // Add the register arguments
     if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index e48b73b0f1e793d95d11dc9e2b687b859f828650..51c2abeac2ffb07f0b495a81a48cbbf7113e7bd2 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -286,6 +286,12 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
 // MUBUF classes
 //===----------------------------------------------------------------------===//
 
+class MUBUFGetBaseOpcode<string Op> {
+  string ret = !subst("DWORDX2", "DWORD",
+    !subst("DWORDX3", "DWORD",
+    !subst("DWORDX4", "DWORD", Op)));
+}
+
 class MUBUF_Pseudo <string opName, dag outs, dag ins,
                     string asmOps, list<dag> pattern=[]> :
   InstSI<outs, ins, "", pattern>,
@@ -299,6 +305,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
   string Mnemonic = opName;
   string AsmOperands = asmOps;
 
+  Instruction Opcode = !cast<Instruction>(NAME);
+  Instruction BaseOpcode = !cast<Instruction>(MUBUFGetBaseOpcode<NAME>.ret);
+
   let VM_CNT = 1;
   let EXP_CNT = 1;
   let MUBUF = 1;
@@ -321,6 +330,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
   bits<1> has_offset  = 1;
   bits<1> has_slc     = 1;
   bits<1> has_tfe     = 1;
+  bits<4> dwords      = 0;
 }
 
 class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
@@ -394,6 +404,16 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
              );
 }
 
+class getMUBUFDwords<RegisterClass regClass> {
+  string regClassAsInt = !cast<string>(regClass);
+  int ret =
+    !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1,
+    !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2,
+    !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3,
+    !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4,
+    0))));
+}
+
 class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
   dag ret =
     !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret,
@@ -454,6 +474,7 @@ class MUBUF_Load_Pseudo <string opName,
   let Uses = !if(isLds, [EXEC, M0], [EXEC]);
   let has_tfe = !if(isLds, 0, 1);
   let lds = isLds;
+  let dwords = getMUBUFDwords<vdataClass>.ret;
 }
 
 // FIXME: tfe can't be an operand because it requires a separate
@@ -517,6 +538,7 @@ class MUBUF_Store_Pseudo <string opName,
   let mayLoad = 0;
   let mayStore = 1;
   let maybeAtomic = 1;
+  let dwords = getMUBUFDwords<vdataClass>.ret;
 }
 
 multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
@@ -1422,54 +1444,6 @@ defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D
 defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
 defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
 }
-
-// BUFFER_LOAD_DWORD*, addr64=0
-multiclass MUBUF_Load_Dword <ValueType vt,
-                             MUBUF_Pseudo offset,
-                             MUBUF_Pseudo offen,
-                             MUBUF_Pseudo idxen,
-                             MUBUF_Pseudo bothen> {
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
-                                  imm:$offset, 0, 0, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
-            (as_i1imm $slc), (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 1, 0, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
-           (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 0, 1, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
-           (as_i1imm $slc), (as_i1imm $tfe))
-  >;
-
-  def : GCNPat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
-                                  imm:$offset, 1, 1, imm:$glc, imm:$slc,
-                                  imm:$tfe)),
-    (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
-            (as_i1imm $tfe))
-  >;
-}
-
-defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
-                         BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
-defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
-                         BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
-defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
-                         BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
-
 multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
                                       ValueType vt, PatFrag atomic_st> {
   // Store follows atomic op convention so address is forst
@@ -2116,3 +2090,22 @@ let SubtargetPredicate = HasPackedD16VMem in {
   defm TBUFFER_STORE_FORMAT_D16_XYZ  : MTBUF_Real_AllAddr_vi <0x0e>;
   defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>;
 } // End HasUnpackedD16VMem.
+
+def MUBUFInfoTable : GenericTable {
+  let FilterClass = "MUBUF_Pseudo";
+  let CppTypeName = "MUBUFInfo";
+  let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getMUBUFOpcodeHelper";
+}
+
+def getMUBUFInfoFromOpcode : SearchIndex {
+  let Table = MUBUFInfoTable;
+  let Key = ["Opcode"];
+}
+
+def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex {
+  let Table = MUBUFInfoTable;
+  let Key = ["BaseOpcode", "dwords"];
+}
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 3c87dc188270364f66e0986062de9f6d6f8e706f..7d1219914823d558b2d4dd28ffcd74cc8fee1ec3 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -96,6 +96,7 @@ add_llvm_target(AMDGPUCodeGen
   SIAnnotateControlFlow.cpp
   SIDebuggerInsertNops.cpp
   SIFixSGPRCopies.cpp
+  SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
   SIFixWWMLiveness.cpp
   SIFoldOperands.cpp
@@ -118,6 +119,8 @@ add_llvm_target(AMDGPUCodeGen
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  GCNDPPCombine.cpp
+  SIModeRegister.cpp
   )
 
 add_subdirectory(AsmParser)
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 18e8b8a1c2d1a6e6fd29bc5cda3f4bdcb5f51f5f..44040d352e6a86ef5e6fa979cb6d815c4a5e8872 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -121,6 +121,11 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
   let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
 }
 
+class GlobalSaddrTable <bit is_saddr, string Name = ""> {
+  bit IsSaddr = is_saddr;
+  string SaddrOp = Name;
+}
+
 // TODO: Is exec allowed for saddr? The disabled value 0x7f is the
 // same encoding value as exec_hi, so it isn't possible to use that if
 // saddr is 32-bit (which isn't handled here yet).
@@ -171,15 +176,19 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
 
 multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>;
-    def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>;
+    def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+      GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>,
+      GlobalSaddrTable<1, opName>;
   }
 }
 
 multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>;
-    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>;
+    def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+      GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>,
+      GlobalSaddrTable<1, opName>;
   }
 }
 
@@ -262,6 +271,7 @@ multiclass FLAT_Atomic_Pseudo<
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
     " $vaddr, $vdata$offset$slc">,
+    GlobalSaddrTable<0, opName>,
     AtomicNoRet <opName, 0> {
     let PseudoInstr = NAME;
   }
@@ -272,6 +282,7 @@ multiclass FLAT_Atomic_Pseudo<
     " $vdst, $vaddr, $vdata$offset glc$slc",
     [(set vt:$vdst,
       (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+       GlobalSaddrTable<0, opName#"_rtn">,
        AtomicNoRet <opName, 1>;
 }
 
@@ -287,6 +298,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, off$offset$slc">,
+    GlobalSaddrTable<0, opName>,
     AtomicNoRet <opName, 0> {
     let has_saddr = 1;
     let PseudoInstr = NAME;
@@ -296,6 +308,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, $saddr$offset$slc">,
+    GlobalSaddrTable<1, opName>,
     AtomicNoRet <opName#"_saddr", 0> {
     let has_saddr = 1;
     let enabled_saddr = 1;
@@ -317,6 +330,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
     " $vdst, $vaddr, $vdata, off$offset glc$slc",
     [(set vt:$vdst,
       (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+      GlobalSaddrTable<0, opName#"_rtn">,
       AtomicNoRet <opName, 1> {
     let has_saddr = 1;
   }
@@ -325,6 +339,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
     (outs vdst_rc:$vdst),
       (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+    GlobalSaddrTable<1, opName#"_rtn">,
     AtomicNoRet <opName#"_saddr", 1> {
      let has_saddr = 1;
      let enabled_saddr = 1;
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..56071d0d23744dfd0e887e8b6f13cff377cc5c9f
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -0,0 +1,446 @@
+//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
+// operand.If any of the use instruction cannot be combined with the mov the
+// whole sequence is reverted.
+//
+// $old = ...
+// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
+//                            dpp_controls..., $bound_ctrl
+// $res = VALU $dpp_value, ...
+//
+// to
+//
+// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
+//                 dpp_controls..., $folded_bound_ctrl
+//
+// Combining rules :
+//
+// $bound_ctrl is DPP_BOUND_ZERO, $old is any
+// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+//
+// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
+// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+//
+// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
+// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+//
+// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-dpp-combine"
+
+STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
+
+namespace {
+
+class GCNDPPCombine : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const SIInstrInfo *TII;
+
+  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
+  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
+
+  RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
+                            RegSubRegPair OldOpndVGPR,
+                            MachineOperand &OldOpndValue) const;
+
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              MachineOperand *OldOpnd,
+                              bool BoundCtrlZero) const;
+
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              bool BoundCtrlZero) const;
+
+  bool hasNoImmOrEqual(MachineInstr &MI,
+                       unsigned OpndName,
+                       int64_t Value,
+                       int64_t Mask = -1) const;
+
+  bool combineDPPMov(MachineInstr &MI) const;
+
+public:
+  static char ID;
+
+  GCNDPPCombine() : MachineFunctionPass(ID) {
+    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "GCN DPP Combine"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
+
+char GCNDPPCombine::ID = 0;
+
+char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
+
+FunctionPass *llvm::createGCNDPPCombinePass() {
+  return new GCNDPPCombine();
+}
+
+static int getDPPOp(unsigned Op) {
+  auto DPP32 = AMDGPU::getDPPOp32(Op);
+  if (DPP32 != -1)
+    return DPP32;
+
+  auto E32 = AMDGPU::getVOPe32(Op);
+  return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
+}
+
+// tracks the register operand definition and returns:
+//   1. immediate operand used to initialize the register if found
+//   2. nullptr if the register operand is undef
+//   3. the operand itself otherwise
+MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
+  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
+  if (!Def)
+    return nullptr;
+
+  switch(Def->getOpcode()) {
+  default: break;
+  case AMDGPU::IMPLICIT_DEF:
+    return nullptr;
+  case AMDGPU::COPY:
+  case AMDGPU::V_MOV_B32_e32: {
+    auto &Op1 = Def->getOperand(1);
+    if (Op1.isImm())
+      return &Op1;
+    break;
+  }
+  }
+  return &OldOpnd;
+}
+
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           bool BoundCtrlZero) const {
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
+         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
+
+  auto OrigOp = OrigMI.getOpcode();
+  auto DPPOp = getDPPOp(OrigOp);
+  if (DPPOp == -1) {
+    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
+    return nullptr;
+  }
+
+  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
+                         OrigMI.getDebugLoc(), TII->get(DPPOp));
+  bool Fail = false;
+  do {
+    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
+    assert(Dst);
+    DPPInst.add(*Dst);
+    int NumOperands = 1;
+
+    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
+    if (OldIdx != -1) {
+      assert(OldIdx == NumOperands);
+      assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+      DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
+      ++NumOperands;
+    }
+
+    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
+                                          AMDGPU::OpName::src0_modifiers)) {
+      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+                                          AMDGPU::OpName::src0_modifiers));
+      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+      DPPInst.addImm(Mod0->getImm());
+      ++NumOperands;
+    }
+    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
+    assert(Src0);
+    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
+      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
+      Fail = true;
+      break;
+    }
+    DPPInst.add(*Src0);
+    ++NumOperands;
+
+    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
+                                          AMDGPU::OpName::src1_modifiers)) {
+      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+                                          AMDGPU::OpName::src1_modifiers));
+      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+      DPPInst.addImm(Mod1->getImm());
+      ++NumOperands;
+    }
+    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
+        Fail = true;
+        break;
+      }
+      DPPInst.add(*Src1);
+      ++NumOperands;
+    }
+
+    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
+        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
+        Fail = true;
+        break;
+      }
+      DPPInst.add(*Src2);
+    }
+
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
+    DPPInst.addImm(BoundCtrlZero ? 1 : 0);
+  } while (false);
+
+  if (Fail) {
+    DPPInst.getInstr()->eraseFromParent();
+    return nullptr;
+  }
+  LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
+  return DPPInst.getInstr();
+}
+
+GCNDPPCombine::RegSubRegPair
+GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
+                           RegSubRegPair OldOpndVGPR,
+                           MachineOperand &OldOpndValue) const {
+  assert(OldOpndValue.isImm());
+  switch (OrigMI.getOpcode()) {
+  default: break;
+  case AMDGPU::V_MAX_U32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MAX_I32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MIN_I32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
+      return OldOpndVGPR;
+    break;
+
+  case AMDGPU::V_MUL_I32_I24_e32:
+  case AMDGPU::V_MUL_U32_U24_e32:
+    if (OldOpndValue.getImm() == 1) {
+      auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+      assert(Src1 && Src1->isReg());
+      return getRegSubRegPair(*Src1);
+    }
+    break;
+  }
+  return RegSubRegPair();
+}
+
+// Cases to combine:
+//  $bound_ctrl is DPP_BOUND_ZERO, $old is any
+//  $bound_ctrl is DPP_BOUND_OFF, $old is 0
+//  -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
+
+//  $bound_ctrl is DPP_BOUND_OFF, $old is undef
+//  -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
+
+//  $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+//  -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
+
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           MachineOperand *OldOpndValue,
+                                           bool BoundCtrlZero) const {
+  assert(OldOpndVGPR.Reg);
+  if (!BoundCtrlZero && OldOpndValue) {
+    assert(OldOpndValue->isImm());
+    OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
+    if (!OldOpndVGPR.Reg) {
+      LLVM_DEBUG(dbgs() << "  failed: old immediate cannot be folded\n");
+      return nullptr;
+    }
+  }
+  return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
+}
+
+// returns true if MI doesn't have OpndName immediate operand or the
+// operand has Value
+bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
+                                    int64_t Value, int64_t Mask) const {
+  auto *Imm = TII->getNamedOperand(MI, OpndName);
+  if (!Imm)
+    return true;
+
+  assert(Imm->isImm());
+  return (Imm->getImm() & Mask) == Value;
+}
+
+bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
+  assert(BCZOpnd && BCZOpnd->isImm());
+  bool BoundCtrlZero = 0 != BCZOpnd->getImm();
+
+  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+
+  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
+  assert(OldOpnd && OldOpnd->isReg());
+  auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
+  auto *OldOpndValue = getOldOpndValue(*OldOpnd);
+  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
+  if (OldOpndValue) {
+    if (BoundCtrlZero) {
+      OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
+      OldOpndValue = nullptr;
+    } else {
+      if (!OldOpndValue->isImm()) {
+        LLVM_DEBUG(dbgs() << "  failed: old operand isn't an imm or undef\n");
+        return false;
+      }
+      if (OldOpndValue->getImm() == 0) {
+        OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
+        OldOpndValue = nullptr;
+        BoundCtrlZero = true;
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  old=";
+    if (!OldOpndValue)
+      dbgs() << "undef";
+    else
+      dbgs() << OldOpndValue->getImm();
+    dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
+
+  std::vector<MachineInstr*> OrigMIs, DPPMIs;
+  if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
+    OldOpndVGPR = RegSubRegPair(
+      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
+    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
+                             TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
+    DPPMIs.push_back(UndefInst.getInstr());
+  }
+
+  OrigMIs.push_back(&MovMI);
+  bool Rollback = true;
+  for (auto &Use : MRI->use_nodbg_operands(
+       TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
+    Rollback = true;
+
+    auto &OrigMI = *Use.getParent();
+    auto OrigOp = OrigMI.getOpcode();
+    if (TII->isVOP3(OrigOp)) {
+      if (!TII->hasVALU32BitEncoding(OrigOp)) {
+        LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
+        break;
+      }
+      // check if other than abs|neg modifiers are set (opsel for example)
+      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
+      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
+        LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
+        break;
+      }
+    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
+      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
+      break;
+    }
+
+    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
+    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
+                                        OldOpndValue, BoundCtrlZero)) {
+        DPPMIs.push_back(DPPInst);
+        Rollback = false;
+      }
+    } else if (OrigMI.isCommutable() &&
+               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+      auto *BB = OrigMI.getParent();
+      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
+      BB->insert(OrigMI, NewMI);
+      if (TII->commuteInstruction(*NewMI)) {
+        LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
+        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
+                                          OldOpndValue, BoundCtrlZero)) {
+          DPPMIs.push_back(DPPInst);
+          Rollback = false;
+        }
+      } else
+        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
+      NewMI->eraseFromParent();
+    } else
+      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
+    if (Rollback)
+      break;
+    OrigMIs.push_back(&OrigMI);
+  }
+
+  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
+    MI->eraseFromParent();
+
+  return !Rollback;
+}
+
+bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
+  auto &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasDPP() || skipFunction(MF.getFunction()))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = ST.getInstrInfo();
+
+  assert(MRI->isSSA() && "Must be run on SSA");
+
+  bool Changed = false;
+  for (auto &MBB : MF) {
+    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
+      auto &MI = *I++;
+      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
+        Changed = true;
+        ++NumDPPMovsCombined;
+      }
+    }
+  }
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt
index c54a13c4b4d885f9505dae2e08c4e91ed6fb9e81..e591d756a545e25d914a76efefec29d0f4626232 100644
--- a/lib/Target/AMDGPU/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/LLVMBuild.txt
@@ -30,5 +30,5 @@ has_disassembler = 1
 type = Library
 name = AMDGPUCodeGen
 parent = AMDGPU
-required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel BinaryFormat
 add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 225bf5b7816bf8ab0712f32cf8fcf4999dd9b295..c17fe126546ce97b3cd28cac59fd6f97114f3eca 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -17,7 +17,9 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDKernelCodeTUtils.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Metadata.h"
@@ -35,12 +37,13 @@ namespace llvm {
 
 using namespace llvm;
 using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::HSAMD;
 
 //===----------------------------------------------------------------------===//
 // AMDGPUTargetStreamer
 //===----------------------------------------------------------------------===//
 
-bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
+bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
   HSAMD::Metadata HSAMetadata;
   if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
     return false;
@@ -48,6 +51,15 @@ bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
   return EmitHSAMetadata(HSAMetadata);
 }
 
+bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) {
+  std::shared_ptr<msgpack::Node> HSAMetadataRoot;
+  yaml::Input YIn(HSAMetadataString);
+  YIn >> HSAMetadataRoot;
+  if (YIn.error())
+    return false;
+  return EmitHSAMetadata(HSAMetadataRoot, false);
+}
+
 StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
   AMDGPU::GPUKind AK;
 
@@ -195,9 +207,26 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
   if (HSAMD::toString(HSAMetadata, HSAMetadataString))
     return false;
 
-  OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n';
+  OS << '\t' << AssemblerDirectiveBegin << '\n';
   OS << HSAMetadataString << '\n';
-  OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n';
+  OS << '\t' << AssemblerDirectiveEnd << '\n';
+  return true;
+}
+
+bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
+    std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+  V3::MetadataVerifier Verifier(Strict);
+  if (!Verifier.verify(*HSAMetadataRoot))
+    return false;
+
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << HSAMetadataRoot;
+
+  OS << '\t' << V3::AssemblerDirectiveBegin << '\n';
+  OS << StrOS.str() << '\n';
+  OS << '\t' << V3::AssemblerDirectiveEnd << '\n';
   return true;
 }
 
@@ -358,13 +387,13 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
   return static_cast<MCELFStreamer &>(Streamer);
 }
 
-void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
-    const MCExpr *DescSZ, unsigned NoteType,
+void AMDGPUTargetELFStreamer::EmitNote(
+    StringRef Name, const MCExpr *DescSZ, unsigned NoteType,
     function_ref<void(MCELFStreamer &)> EmitDesc) {
   auto &S = getStreamer();
   auto &Context = S.getContext();
 
-  auto NameSZ = sizeof(ElfNote::NoteName);
+  auto NameSZ = Name.size() + 1;
 
   S.PushSection();
   S.SwitchSection(Context.getELFSection(
@@ -372,7 +401,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
   S.EmitIntValue(NameSZ, 4);                                  // namesz
   S.EmitValue(DescSZ, 4);                                     // descz
   S.EmitIntValue(NoteType, 4);                                // type
-  S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ));          // name
+  S.EmitBytes(Name);                                          // name
   S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
   EmitDesc(S);                                                // desc
   S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
@@ -384,14 +413,11 @@ void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
 void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
     uint32_t Major, uint32_t Minor) {
 
-  EmitAMDGPUNote(
-    MCConstantExpr::create(8, getContext()),
-    ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
-    [&](MCELFStreamer &OS){
-      OS.EmitIntValue(Major, 4);
-      OS.EmitIntValue(Minor, 4);
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
+           ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
+             OS.EmitIntValue(Major, 4);
+             OS.EmitIntValue(Minor, 4);
+           });
 }
 
 void
@@ -407,21 +433,18 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
     sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
     VendorNameSize + ArchNameSize;
 
-  EmitAMDGPUNote(
-    MCConstantExpr::create(DescSZ, getContext()),
-    ElfNote::NT_AMDGPU_HSA_ISA,
-    [&](MCELFStreamer &OS) {
-      OS.EmitIntValue(VendorNameSize, 2);
-      OS.EmitIntValue(ArchNameSize, 2);
-      OS.EmitIntValue(Major, 4);
-      OS.EmitIntValue(Minor, 4);
-      OS.EmitIntValue(Stepping, 4);
-      OS.EmitBytes(VendorName);
-      OS.EmitIntValue(0, 1); // NULL terminate VendorName
-      OS.EmitBytes(ArchName);
-      OS.EmitIntValue(0, 1); // NULL terminte ArchName
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
+           ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) {
+             OS.EmitIntValue(VendorNameSize, 2);
+             OS.EmitIntValue(ArchNameSize, 2);
+             OS.EmitIntValue(Major, 4);
+             OS.EmitIntValue(Minor, 4);
+             OS.EmitIntValue(Stepping, 4);
+             OS.EmitBytes(VendorName);
+             OS.EmitIntValue(0, 1); // NULL terminate VendorName
+             OS.EmitBytes(ArchName);
+             OS.EmitIntValue(0, 1); // NULL terminte ArchName
+           });
 }
 
 void
@@ -450,15 +473,41 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
     MCSymbolRefExpr::create(DescEnd, Context),
     MCSymbolRefExpr::create(DescBegin, Context), Context);
 
-  EmitAMDGPUNote(
-    DescSZ,
-    ELF::NT_AMD_AMDGPU_ISA,
-    [&](MCELFStreamer &OS) {
-      OS.EmitLabel(DescBegin);
-      OS.EmitBytes(IsaVersionString);
-      OS.EmitLabel(DescEnd);
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA,
+           [&](MCELFStreamer &OS) {
+             OS.EmitLabel(DescBegin);
+             OS.EmitBytes(IsaVersionString);
+             OS.EmitLabel(DescEnd);
+           });
+  return true;
+}
+
+bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
+    std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+  V3::MetadataVerifier Verifier(Strict);
+  if (!Verifier.verify(*HSAMetadataRoot))
+    return false;
+
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  msgpack::Writer MPWriter(StrOS);
+  HSAMetadataRoot->write(MPWriter);
+
+  // Create two labels to mark the beginning and end of the desc field
+  // and a MCExpr to calculate the size of the desc field.
+  auto &Context = getContext();
+  auto *DescBegin = Context.createTempSymbol();
+  auto *DescEnd = Context.createTempSymbol();
+  auto *DescSZ = MCBinaryExpr::createSub(
+      MCSymbolRefExpr::create(DescEnd, Context),
+      MCSymbolRefExpr::create(DescBegin, Context), Context);
+
+  EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA,
+           [&](MCELFStreamer &OS) {
+             OS.EmitLabel(DescBegin);
+             OS.EmitBytes(StrOS.str());
+             OS.EmitLabel(DescEnd);
+           });
   return true;
 }
 
@@ -477,28 +526,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
     MCSymbolRefExpr::create(DescEnd, Context),
     MCSymbolRefExpr::create(DescBegin, Context), Context);
 
-  EmitAMDGPUNote(
-    DescSZ,
-    ELF::NT_AMD_AMDGPU_HSA_METADATA,
-    [&](MCELFStreamer &OS) {
-      OS.EmitLabel(DescBegin);
-      OS.EmitBytes(HSAMetadataString);
-      OS.EmitLabel(DescEnd);
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA,
+           [&](MCELFStreamer &OS) {
+             OS.EmitLabel(DescBegin);
+             OS.EmitBytes(HSAMetadataString);
+             OS.EmitLabel(DescEnd);
+           });
   return true;
 }
 
 bool AMDGPUTargetELFStreamer::EmitPALMetadata(
     const PALMD::Metadata &PALMetadata) {
-  EmitAMDGPUNote(
-    MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), getContext()),
-    ELF::NT_AMD_AMDGPU_PAL_METADATA,
-    [&](MCELFStreamer &OS){
-      for (auto I : PALMetadata)
-        OS.EmitIntValue(I, sizeof(uint32_t));
-    }
-  );
+  EmitNote(ElfNote::NoteNameV2,
+           MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t),
+                                  getContext()),
+           ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) {
+             for (auto I : PALMetadata)
+               OS.EmitIntValue(I, sizeof(uint32_t));
+           });
   return true;
 }
 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 2b885592f32be5e2e1607c4696d384397b20cb02..9a807c804f9ff1cddc1a5593900146aef938b411 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -11,6 +11,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
 
 #include "AMDKernelCodeT.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/AMDGPUMetadata.h"
@@ -52,7 +53,20 @@ public:
   virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
 
   /// \returns True on success, false on failure.
-  virtual bool EmitHSAMetadata(StringRef HSAMetadataString);
+  virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString);
+
+  /// \returns True on success, false on failure.
+  virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString);
+
+  /// Emit HSA Metadata
+  ///
+  /// When \p Strict is true, known metadata elements must already be
+  /// well-typed. When \p Strict is false, known types are inferred and
+  /// the \p HSAMetadata structure is updated with the correct types.
+  ///
+  /// \returns True on success, false on failure.
+  virtual bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+                               bool Strict) = 0;
 
   /// \returns True on success, false on failure.
   virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
@@ -91,6 +105,10 @@ public:
   /// \returns True on success, false on failure.
   bool EmitISAVersion(StringRef IsaVersionString) override;
 
+  /// \returns True on success, false on failure.
+  bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+                       bool Strict) override;
+
   /// \returns True on success, false on failure.
   bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
 
@@ -107,8 +125,8 @@ public:
 class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
   MCStreamer &Streamer;
 
-  void EmitAMDGPUNote(const MCExpr *DescSize, unsigned NoteType,
-                      function_ref<void(MCELFStreamer &)> EmitDesc);
+  void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
+                function_ref<void(MCELFStreamer &)> EmitDesc);
 
 public:
   AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
@@ -131,6 +149,10 @@ public:
   /// \returns True on success, false on failure.
   bool EmitISAVersion(StringRef IsaVersionString) override;
 
+  /// \returns True on success, false on failure.
+  bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+                       bool Strict) override;
+
   /// \returns True on success, false on failure.
   bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
index 773ee7c0a4ba752933147dea790c3750d0558528..bc910a470d72a412ed799b1d0b7bd8a4027b3773 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = AMDGPUDesc
 parent = AMDGPU
-required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support
+required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support BinaryFormat
 add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 1683fe6c9a571a5dcd286eeaa8edeb6b00f5ad0e..679cf18d2c20b970f1828686bc164919126c004c 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -226,11 +226,11 @@ private:
         // occur in the same basic block as its definition, because
         // it is illegal for the scheduler to schedule them in
         // different blocks.
-        if (UseI->readsRegister(MOI->getReg()))
+        if (UseI->readsRegister(MOI->getReg(), &TRI))
           LastUseCount = AluInstCount;
 
         // Exit early if the current use kills the register
-        if (UseI != Def && UseI->killsRegister(MOI->getReg()))
+        if (UseI != Def && UseI->killsRegister(MOI->getReg(), &TRI))
           break;
       }
       if (LastUseCount)
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 97228965e68c3658d948c60869b7683d5760047d..9cc3e5f3c314c929b50f9a6defef686d7af14cbf 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -229,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
 }
 
 bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
-  return MI.findRegisterUseOperandIdx(R600::AR_X) != -1;
+  return MI.findRegisterUseOperandIdx(R600::AR_X, false, &RI) != -1;
 }
 
 bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
-  return MI.findRegisterDefOperandIdx(R600::AR_X) != -1;
+  return MI.findRegisterDefOperandIdx(R600::AR_X, false, false, &RI) != -1;
 }
 
 bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a6d28d6999e5f6e4c15f14ca1e12780e55065d45..7f6abc34cff3a2f7685fb10f65a8c31be47be4f5 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@ enum : uint64_t {
   IsPacked = UINT64_C(1) << 49,
 
   // Is a D16 buffer instruction.
-  D16Buf = UINT64_C(1) << 50
+  D16Buf = UINT64_C(1) << 50,
+
+  // Uses floating point double precision rounding mode
+  FPDPRounding = UINT64_C(1) << 51
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ee39eb04d8316c6af5eb8006114cf935170b9298
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -0,0 +1,231 @@
+//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// SIFixupVectorISel pass cleans up post ISEL Vector issues.
+/// Currently this will convert GLOBAL_{LOAD|STORE}_*
+/// and GLOBAL_Atomic_* instructions into their _SADDR variants,
+/// feeding the sreg into the saddr field of the new instruction.
+/// We currently handle a REG_SEQUENCE feeding the vaddr
+/// and decompose it into a base and index.
+///
+/// Transform:
+/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32
+/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
+///                                    %24:vgpr_32, %19:sreg_64_xexec
+/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
+/// %11:vreg_64 = COPY %16:vreg_64
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
+/// Into:
+/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
+/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-fixup-vector-isel"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableGlobalSGPRAddr(
+  "amdgpu-enable-global-sgpr-addr",
+  cl::desc("Enable use of SGPR regs for GLOBAL LOAD/STORE instructions"),
+  cl::init(false));
+
+STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
+STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
+
+namespace {
+
+class SIFixupVectorISel : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixupVectorISel() : MachineFunctionPass(ID) {
+    initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
+                "SI Fixup Vector ISel", false, false)
+
+char SIFixupVectorISel::ID = 0;
+
+char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
+
+FunctionPass *llvm::createSIFixupVectorISelPass() {
+  return new SIFixupVectorISel();
+}
+
+static bool findSRegBaseAndIndex(MachineOperand *Op,
+                                 unsigned &BaseReg,
+                                 unsigned &IndexReg,
+                                 MachineRegisterInfo &MRI,
+                                 const SIRegisterInfo *TRI) {
+  SmallVector<MachineOperand *, 8> Worklist;
+  Worklist.push_back(Op);
+  while (!Worklist.empty()) {
+    MachineOperand *WOp = Worklist.pop_back_val();
+    if (!WOp->isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+      continue;
+    MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
+    switch (DefInst->getOpcode()) {
+    default:
+      continue;
+    case AMDGPU::COPY:
+      Worklist.push_back(&DefInst->getOperand(1));
+      break;
+    case AMDGPU::REG_SEQUENCE:
+      if (DefInst->getNumOperands() != 5)
+        continue;
+      Worklist.push_back(&DefInst->getOperand(1));
+      Worklist.push_back(&DefInst->getOperand(3));
+      break;
+    case AMDGPU::V_ADD_I32_e64:
+      // The V_ADD_* and its analogous V_ADDCV_* are generated by
+      // a previous pass which lowered from an ADD_64_PSEUDO,
+      // which generates subregs to break up the 64 bit args.
+      if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      BaseReg = DefInst->getOperand(2).getReg();
+      if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      IndexReg = DefInst->getOperand(3).getReg();
+      // Chase the IndexReg.
+      MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      // Make sure the reg class is 64 bit for Index.
+      // If the Index register is a subreg, we want it to reference
+      // a 64 bit register which we will use as the Index reg.
+      const TargetRegisterClass *IdxRC, *BaseRC;
+      IdxRC = MRI.getRegClass(MI->getOperand(1).getReg());
+      if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64)
+        continue;
+      IndexReg = MI->getOperand(1).getReg();
+      // Chase the BaseReg.
+      MI = MRI.getUniqueVRegDef(BaseReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      // Make sure the register class is 64 bit for Base.
+      BaseReg = MI->getOperand(1).getReg();
+      BaseRC = MRI.getRegClass(BaseReg);
+      if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64)
+        continue;
+      // Make sure Base is SReg and Index is VReg.
+      if (!TRI->isSGPRReg(MRI, BaseReg))
+        return false;
+      if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
+        return false;
+      // clear any killed flags on Index and Base regs, used later.
+      MRI.clearKillFlags(IndexReg);
+      MRI.clearKillFlags(BaseReg);
+      return true;
+    }
+  }
+  return false;
+}
+
+// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR.
+static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
+                             MachineFunction &MF,
+                             MachineRegisterInfo &MRI,
+                             const GCNSubtarget &ST,
+                             const SIInstrInfo *TII,
+                             const SIRegisterInfo *TRI) {
+  if (!EnableGlobalSGPRAddr)
+    return false;
+  bool FuncModified = false;
+  MachineBasicBlock::iterator I, Next;
+  for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MI = *I;
+    int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
+    if (NewOpcd < 0)
+      continue;
+    // Update our statistics on opportunities seen.
+    ++NumSGPRGlobalOccurs;
+    LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
+    // Need a Base and Index or we cant transform to _SADDR.
+    unsigned BaseReg = 0;
+    unsigned IndexReg = 0;
+    MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+    if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
+      continue;
+    ++NumSGPRGlobalSaddrs;
+    FuncModified = true;
+    // Create the new _SADDR Memory instruction.
+    bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
+    MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+    MachineInstr *NewGlob = nullptr;
+    NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
+    if (HasVdst)
+      NewGlob->addOperand(MF, MI.getOperand(0));
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
+    if (VData)
+      NewGlob->addOperand(MF, *VData);
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
+
+    MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
+    // Atomics dont have a GLC, so omit the field if not there.
+    if (Glc)
+      NewGlob->addOperand(MF, *Glc);
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
+    // _D16 have an vdst_in operand, copy it in.
+    MachineOperand *VDstInOp = TII->getNamedOperand(MI,
+                                      AMDGPU::OpName::vdst_in);
+    if (VDstInOp)
+      NewGlob->addOperand(MF, *VDstInOp);
+    NewGlob->copyImplicitOps(MF, MI);
+    NewGlob->cloneMemRefs(MF, MI);
+    // Remove the old Global Memop instruction.
+    MI.eraseFromParent();
+    LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
+  }
+  return FuncModified;
+}
+
+bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  bool FuncModified = false;
+  for (MachineBasicBlock &MBB : MF) {
+    // Cleanup missed Saddr opportunites from ISel.
+    FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
+  }
+  return FuncModified;
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 254f1362f1f0f3cabed56208b33b3243ddd6e5c7..12d0bc528ca570c13034dd724b2487065bfc9c79 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -679,6 +679,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -4847,6 +4848,71 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   return SDValue(NewNode, 0);
 }
 
+SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
+                                       SDValue Offset, SDValue GLC,
+                                       SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      VT.getStoreSize(), VT.getStoreSize());
+
+  if (!Offset->isDivergent()) {
+    SDValue Ops[] = {
+        Rsrc,
+        Offset, // Offset
+        GLC     // glc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                   DAG.getVTList(VT), Ops, VT, MMO);
+  }
+
+  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
+  // assume that the buffer is unswizzled.
+  SmallVector<SDValue, 4> Loads;
+  unsigned NumLoads = 1;
+  MVT LoadVT = VT.getSimpleVT();
+  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
+  assert((LoadVT.getScalarType() == MVT::i32 ||
+          LoadVT.getScalarType() == MVT::f32) &&
+         isPowerOf2_32(NumElts));
+
+  if (NumElts == 8 || NumElts == 16) {
+    NumLoads = NumElts == 16 ? 4 : 2;
+    LoadVT = MVT::v4i32;
+  }
+
+  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
+  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
+  SDValue Ops[] = {
+      DAG.getEntryNode(),                         // Chain
+      Rsrc,                                       // rsrc
+      DAG.getConstant(0, DL, MVT::i32),           // vindex
+      {},                                         // voffset
+      {},                                         // soffset
+      {},                                         // offset
+      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1),            // idxen
+  };
+
+  // Use the alignment to ensure that the required offsets will fit into the
+  // immediate offsets.
+  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+
+  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+                                            Ops, LoadVT, MMO));
+  }
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32)
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+
+  return Loads[0];
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -5000,39 +5066,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDZ);
-  case AMDGPUIntrinsic::SI_load_const: {
-    SDValue Ops[] = {
-      Op.getOperand(1),   // Ptr
-      Op.getOperand(2),   // Offset
-      DAG.getTargetConstant(0, DL, MVT::i1) // glc
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), 4);
-    SDVTList VTList = DAG.getVTList(MVT::i32);
-    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                           VTList, Ops, MVT::i32, MMO);
-
+  case SIIntrinsic::SI_load_const: {
+    SDValue Load =
+        lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
+                     DAG.getTargetConstant(0, DL, MVT::i1), DAG);
     return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
   }
   case Intrinsic::amdgcn_s_buffer_load: {
     unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-    SDValue Ops[] = {
-      Op.getOperand(1), // Ptr
-      Op.getOperand(2), // Offset
-      DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), VT.getStoreSize());
-    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
+                        DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
@@ -5766,19 +5809,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
                        Op.getOperand(2), Op.getOperand(3));
   }
-  case AMDGPUIntrinsic::AMDGPU_kill: {
-    SDValue Src = Op.getOperand(2);
-    if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
-      if (!K->isNegative())
-        return Chain;
-
-      SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
-      return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
-    }
-
-    SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
-    return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
-  }
   case Intrinsic::amdgcn_s_barrier: {
     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
       const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -5789,55 +5819,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     }
     return SDValue();
   };
-  case AMDGPUIntrinsic::SI_tbuffer_store: {
-
-    // Extract vindex and voffset from vaddr as appropriate
-    const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
-    const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
-    SDValue VAddr = Op.getOperand(5);
-
-    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
-
-    assert(!(OffEn->isOne() && IdxEn->isOne()) &&
-           "Legacy intrinsic doesn't support both offset and index - use new version");
-
-    SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
-    SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
-
-    // Deal with the vec-3 case
-    const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
-    auto Opcode = NumChannels->getZExtValue() == 3 ?
-      AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
-
-    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
-    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
-    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(12))->getZExtValue();
-    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(13))->getZExtValue();
-    SDValue Ops[] = {
-     Chain,
-     Op.getOperand(3),  // vdata
-     Op.getOperand(2),  // rsrc
-     VIndex,
-     VOffset,
-     Op.getOperand(6),  // soffset
-     Op.getOperand(7),  // inst_offset
-     DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
-     DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
-     DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen
-    };
-
-    assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
-           "Value of tfe other than zero is unsupported");
-
-    EVT VT = Op.getOperand(3).getValueType();
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOStore,
-      VT.getStoreSize(), 4);
-    return DAG.getMemIntrinsicNode(Opcode, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
-  }
-
   case Intrinsic::amdgcn_tbuffer_store: {
     SDValue VData = Op.getOperand(2);
     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
@@ -6067,13 +6048,13 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                        SelectionDAG &DAG,
-                                        SDValue *Offsets) const {
+                                        SelectionDAG &DAG, SDValue *Offsets,
+                                        unsigned Align) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
@@ -6085,8 +6066,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
     SDValue N1 = CombinedOffset.getOperand(1);
     uint32_t SOffset, ImmOffset;
     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
-    if (Offset >= 0
-        && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
+    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+                                                Subtarget, Align)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
@@ -8025,7 +8006,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
 
     switch(Opc) {
     default:
-      return SDValue();
+      break;
       // TODO: Support other binary operations.
     case ISD::FADD:
     case ISD::FSUB:
@@ -8051,12 +8032,34 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
     }
   }
 
-  if (!DCI.isBeforeLegalize())
-    return SDValue();
-
   unsigned VecSize = VecVT.getSizeInBits();
   unsigned EltSize = EltVT.getSizeInBits();
 
+  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
+  // This elminates non-constant index and subsequent movrel or scratch access.
+  // Sub-dword vectors of size 2 dword or less have better implementation.
+  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
+  // instructions.
+  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
+      !isa<ConstantSDNode>(N->getOperand(1))) {
+    SDLoc SL(N);
+    SDValue Idx = N->getOperand(1);
+    EVT IdxVT = Idx.getValueType();
+    SDValue V;
+    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+      SDValue IC = DAG.getConstant(I, SL, IdxVT);
+      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+      if (I == 0)
+        V = Elt;
+      else
+        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
+    }
+    return V;
+  }
+
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
   // elements. This exposes more load reduction opportunities by replacing
   // multiple small extract_vector_elements with a single 32-bit extract.
@@ -8092,6 +8095,42 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   return SDValue();
 }
 
+SDValue
+SITargetLowering::performInsertVectorEltCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  // INSERT_VECTOR_ELT (<n x e>, var-idx)
+  // => BUILD_VECTOR n x select (e, const-idx)
+  // This elminates non-constant index and subsequent movrel or scratch access.
+  // Sub-dword vectors of size 2 dword or less have better implementation.
+  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
+  // instructions.
+  if (isa<ConstantSDNode>(Idx) ||
+      VecSize > 256 || (VecSize <= 64 && EltSize < 32))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  SDValue Ins = N->getOperand(1);
+  EVT IdxVT = Idx.getValueType();
+
+  SmallVector<SDValue, 16> Ops;
+  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+    SDValue IC = DAG.getConstant(I, SL, IdxVT);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
+    Ops.push_back(V);
+  }
+
+  return DAG.getBuildVector(VecVT, SL, Ops);
+}
+
 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                           const SDNode *N0,
                                           const SDNode *N1) const {
@@ -8578,6 +8617,9 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
 
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
+  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+    return SDValue();
+
   switch (N->getOpcode()) {
   default:
     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -8603,12 +8645,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UMAX:
   case ISD::UMIN:
   case AMDGPUISD::FMIN_LEGACY:
-  case AMDGPUISD::FMAX_LEGACY: {
-    if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
-        getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return performMinMaxCombine(N, DCI);
-    break;
-  }
+  case AMDGPUISD::FMAX_LEGACY:
+    return performMinMaxCombine(N, DCI);
   case ISD::FMA:
     return performFMACombine(N, DCI);
   case ISD::LOAD: {
@@ -8700,6 +8738,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI);
+  case ISD::INSERT_VECTOR_ELT:
+    return performInsertVectorEltCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
@@ -9243,42 +9283,49 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
   Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
 }
 
+LLVM_ATTRIBUTE_UNUSED
+static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
+  assert(N->getOpcode() == ISD::CopyFromReg);
+  do {
+    // Follow the chain until we find an INLINEASM node.
+    N = N->getOperand(0).getNode();
+    if (N->getOpcode() == ISD::INLINEASM)
+      return true;
+  } while (N->getOpcode() == ISD::CopyFromReg);
+  return false;
+}
+
 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
   FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
 {
   switch (N->getOpcode()) {
-    case ISD::Register:
     case ISD::CopyFromReg:
     {
-      const RegisterSDNode *R = nullptr;
-      if (N->getOpcode() == ISD::Register) {
-        R = dyn_cast<RegisterSDNode>(N);
-      }
-      else {
-        R = dyn_cast<RegisterSDNode>(N->getOperand(1));
-      }
-      if (R)
-      {
-        const MachineFunction * MF = FLI->MF;
-        const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-        const MachineRegisterInfo &MRI = MF->getRegInfo();
-        const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
-        unsigned Reg = R->getReg();
-        if (TRI.isPhysicalRegister(Reg))
-          return TRI.isVGPR(MRI, Reg);
-
-        if (MRI.isLiveIn(Reg)) {
-          // workitem.id.x workitem.id.y workitem.id.z
-          // Any VGPR formal argument is also considered divergent
-          if (TRI.isVGPR(MRI, Reg))
-              return true;
-          // Formal arguments of non-entry functions
-          // are conservatively considered divergent
-          else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
-            return true;
-        }
-        return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+      const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
+      const MachineFunction * MF = FLI->MF;
+      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+      const MachineRegisterInfo &MRI = MF->getRegInfo();
+      const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+      unsigned Reg = R->getReg();
+      if (TRI.isPhysicalRegister(Reg))
+        return !TRI.isSGPRReg(MRI, Reg);
+
+      if (MRI.isLiveIn(Reg)) {
+        // workitem.id.x workitem.id.y workitem.id.z
+        // Any VGPR formal argument is also considered divergent
+        if (!TRI.isSGPRReg(MRI, Reg))
+          return true;
+        // Formal arguments of non-entry functions
+        // are conservatively considered divergent
+        else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+          return true;
+        return false;
       }
+      const Value *V = FLI->getValueFromVirtualReg(Reg);
+      if (V)
+        return KDA->isDivergent(V);
+      assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
+      return !TRI.isSGPRReg(MRI, Reg);
     }
     break;
     case ISD::LOAD: {
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 73fa05ea58f51e063b40ba419ad823ad7345d7ef..bcef519ee6635ddb6b1d2f12096cce80a7ef57ef 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,6 +60,8 @@ private:
                                  MVT VT, unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG) const;
+  SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
+                       SDValue GLC, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -152,6 +154,7 @@ private:
   SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
@@ -167,7 +170,6 @@ private:
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
-  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
   unsigned isCFIntrinsic(const SDNode *Intr) const;
@@ -190,7 +192,7 @@ private:
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
   // pointed to by Offsets.
   void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
-                        SDValue *Offsets) const;
+                        SDValue *Offsets, unsigned Align = 4) const;
 
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
@@ -209,6 +211,7 @@ public:
                             SmallVectorImpl<Value*> &/*Ops*/,
                             Type *&/*AccessTy*/) const override;
 
+  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS,
                              Instruction *I = nullptr) const override;
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index dc9397cf7b85e900ad6b308ee35788d7607d2915..ba21a5ce1293a0d139ddf2089ba53e9fc1b06e66 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -66,6 +66,8 @@ private:
 
   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
 
+  bool optimizeVccBranch(MachineInstr &MI) const;
+
 public:
   static char ID;
 
@@ -320,6 +322,96 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
   return true;
 }
 
+bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
+  // Match:
+  // sreg = -1
+  // vcc = S_AND_B64 exec, sreg
+  // S_CBRANCH_VCC[N]Z
+  // =>
+  // S_CBRANCH_EXEC[N]Z
+  bool Changed = false;
+  MachineBasicBlock &MBB = *MI.getParent();
+  const unsigned CondReg = AMDGPU::VCC;
+  const unsigned ExecReg = AMDGPU::EXEC;
+  const unsigned And = AMDGPU::S_AND_B64;
+
+  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+                                      E = MBB.rend();
+  bool ReadsCond = false;
+  unsigned Threshold = 5;
+  for (++A ; A != E ; ++A) {
+    if (!--Threshold)
+      return false;
+    if (A->modifiesRegister(ExecReg, TRI))
+      return false;
+    if (A->modifiesRegister(CondReg, TRI)) {
+      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
+        return false;
+      break;
+    }
+    ReadsCond |= A->readsRegister(CondReg, TRI);
+  }
+  if (A == E)
+    return false;
+
+  MachineOperand &Op1 = A->getOperand(1);
+  MachineOperand &Op2 = A->getOperand(2);
+  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+    TII->commuteInstruction(*A);
+    Changed = true;
+  }
+  if (Op1.getReg() != ExecReg)
+    return Changed;
+  if (Op2.isImm() && Op2.getImm() != -1)
+    return Changed;
+
+  unsigned SReg = AMDGPU::NoRegister;
+  if (Op2.isReg()) {
+    SReg = Op2.getReg();
+    auto M = std::next(A);
+    bool ReadsSreg = false;
+    for ( ; M != E ; ++M) {
+      if (M->definesRegister(SReg, TRI))
+        break;
+      if (M->modifiesRegister(SReg, TRI))
+        return Changed;
+      ReadsSreg |= M->readsRegister(SReg, TRI);
+    }
+    if (M == E ||
+        !M->isMoveImmediate() ||
+        !M->getOperand(1).isImm() ||
+        M->getOperand(1).getImm() != -1)
+      return Changed;
+    // First if sreg is only used in and instruction fold the immediate
+    // into that and.
+    if (!ReadsSreg && Op2.isKill()) {
+      A->getOperand(2).ChangeToImmediate(-1);
+      M->eraseFromParent();
+    }
+  }
+
+  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+      MI.killsRegister(CondReg, TRI))
+    A->eraseFromParent();
+
+  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+  if (SReg == ExecReg) {
+    if (IsVCCZ) {
+      MI.eraseFromParent();
+      return true;
+    }
+    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+  } else {
+    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
+                               : AMDGPU::S_CBRANCH_EXECNZ));
+  }
+
+  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+  MI.addImplicitDefUseOperands(*MBB.getParent());
+
+  return true;
+}
+
 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -384,7 +476,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         kill(MI);
 
         if (ExecBranchStack.empty()) {
-          if (skipIfDead(MI, *NextBB)) {
+          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
             HaveSkipBlock = true;
             NextBB = std::next(BI);
             BE = MF.end();
@@ -417,6 +509,11 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         }
         break;
 
+      case AMDGPU::S_CBRANCH_VCCZ:
+      case AMDGPU::S_CBRANCH_VCCNZ:
+        MadeChange |= optimizeVccBranch(MI);
+        break;
+
       default:
         break;
       }
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index eb39984f795910e38acf036c3856bd5c968d7456..79674046fce1bcf9dfba54254655664670bf5e55 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -13,6 +13,14 @@
 /// Memory reads and writes are issued asynchronously, so we need to insert
 /// S_WAITCNT instructions when we want to access any of their results or
 /// overwrite any register that's used asynchronously.
+///
+/// TODO: This pass currently keeps one timeline per hardware counter. A more
+/// finely-grained approach that keeps one timeline per event type could
+/// sometimes get away with generating weaker s_waitcnt instructions. For
+/// example, when both SMEM and LDS are in flight and we need to wait for
+/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
+/// but the pass will currently generate a conservative lgkmcnt(0) because
+/// multiple event types are in flight.
 //
 //===----------------------------------------------------------------------===//
 
@@ -33,7 +41,6 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -69,6 +76,25 @@ static cl::opt<unsigned> ForceEmitZeroFlag(
 
 namespace {
 
+template <typename EnumT>
+class enum_iterator
+    : public iterator_facade_base<enum_iterator<EnumT>,
+                                  std::forward_iterator_tag, const EnumT> {
+  EnumT Value;
+public:
+  enum_iterator() = default;
+  enum_iterator(EnumT Value) : Value(Value) {}
+
+  enum_iterator &operator++() {
+    Value = static_cast<EnumT>(Value + 1);
+    return *this;
+  }
+
+  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
+
+  EnumT operator*() const { return Value; }
+};
+
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
 // s_waitcnt instruction needs to be emited.
@@ -77,12 +103,17 @@ namespace {
 
 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
 
+iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
+  return make_range(enum_iterator<InstCounterType>(VM_CNT),
+                    enum_iterator<InstCounterType>(NUM_INST_CNTS));
+}
+
 using RegInterval = std::pair<signed, signed>;
 
 struct {
-  int32_t VmcntMax;
-  int32_t ExpcntMax;
-  int32_t LgkmcntMax;
+  uint32_t VmcntMax;
+  uint32_t ExpcntMax;
+  uint32_t LgkmcntMax;
   int32_t NumVGPRsMax;
   int32_t NumSGPRsMax;
 } HardwareLimits;
@@ -108,6 +139,14 @@ enum WaitEventType {
   NUM_WAIT_EVENTS,
 };
 
+static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
+  (1 << VMEM_ACCESS),
+  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+      (1 << SQ_MESSAGE),
+  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+      (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+};
+
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
@@ -122,30 +161,38 @@ enum RegisterMapping {
   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
 };
 
-#define ForAllWaitEventType(w)                                                 \
-  for (enum WaitEventType w = (enum WaitEventType)0;                           \
-       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
-       (w) = (enum WaitEventType)((w) + 1))
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+  switch (T) {
+  case VM_CNT:
+    Wait.VmCnt = std::min(Wait.VmCnt, Count);
+    break;
+  case EXP_CNT:
+    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
+    break;
+  case LGKM_CNT:
+    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
+    break;
+  default:
+    llvm_unreachable("bad InstCounterType");
+  }
+}
 
-// This is a per-basic-block object that maintains current score brackets
-// of each wait counter, and a per-register scoreboard for each wait counter.
+// This objects maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
 // We also maintain the latest score for every event type that can change the
 // waitcnt in order to know if there are multiple types of events within
 // the brackets. When multiple types of event happen in the bracket,
 // wait count may get decreased out of order, therefore we need to put in
 // "s_waitcnt 0" before use.
-class BlockWaitcntBrackets {
+class WaitcntBrackets {
 public:
-  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
+  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
+    for (auto T : inst_counter_types())
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
-    }
   }
 
-  ~BlockWaitcntBrackets() = default;
-
-  static int32_t getWaitCountMax(InstCounterType T) {
+  static uint32_t getWaitCountMax(InstCounterType T) {
     switch (T) {
     case VM_CNT:
       return HardwareLimits.VmcntMax;
@@ -159,33 +206,14 @@ public:
     return 0;
   }
 
-  void setScoreLB(InstCounterType T, int32_t Val) {
-    assert(T < NUM_INST_CNTS);
-    if (T >= NUM_INST_CNTS)
-      return;
-    ScoreLBs[T] = Val;
-  }
-
-  void setScoreUB(InstCounterType T, int32_t Val) {
-    assert(T < NUM_INST_CNTS);
-    if (T >= NUM_INST_CNTS)
-      return;
-    ScoreUBs[T] = Val;
-    if (T == EXP_CNT) {
-      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
-      if (ScoreLBs[T] < UB)
-        ScoreLBs[T] = UB;
-    }
-  }
-
-  int32_t getScoreLB(InstCounterType T) {
+  uint32_t getScoreLB(InstCounterType T) const {
     assert(T < NUM_INST_CNTS);
     if (T >= NUM_INST_CNTS)
       return 0;
     return ScoreLBs[T];
   }
 
-  int32_t getScoreUB(InstCounterType T) {
+  uint32_t getScoreUB(InstCounterType T) const {
     assert(T < NUM_INST_CNTS);
     if (T >= NUM_INST_CNTS)
       return 0;
@@ -194,89 +222,56 @@ public:
 
   // Mapping from event to counter.
   InstCounterType eventCounter(WaitEventType E) {
-    switch (E) {
-    case VMEM_ACCESS:
+    if (E == VMEM_ACCESS)
       return VM_CNT;
-    case LDS_ACCESS:
-    case GDS_ACCESS:
-    case SQ_MESSAGE:
-    case SMEM_ACCESS:
+    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
       return LGKM_CNT;
-    case EXP_GPR_LOCK:
-    case GDS_GPR_LOCK:
-    case VMW_GPR_LOCK:
-    case EXP_POS_ACCESS:
-    case EXP_PARAM_ACCESS:
-      return EXP_CNT;
-    default:
-      llvm_unreachable("unhandled event type");
-    }
-    return NUM_INST_CNTS;
-  }
-
-  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
-    if (GprNo < NUM_ALL_VGPRS) {
-      if (GprNo > VgprUB) {
-        VgprUB = GprNo;
-      }
-      VgprScores[T][GprNo] = Val;
-    } else {
-      assert(T == LGKM_CNT);
-      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
-        SgprUB = GprNo - NUM_ALL_VGPRS;
-      }
-      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
-    }
+    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
+    return EXP_CNT;
   }
 
-  int32_t getRegScore(int GprNo, InstCounterType T) {
+  uint32_t getRegScore(int GprNo, InstCounterType T) {
     if (GprNo < NUM_ALL_VGPRS) {
       return VgprScores[T][GprNo];
     }
+    assert(T == LGKM_CNT);
     return SgprScores[GprNo - NUM_ALL_VGPRS];
   }
 
   void clear() {
     memset(ScoreLBs, 0, sizeof(ScoreLBs));
     memset(ScoreUBs, 0, sizeof(ScoreUBs));
-    memset(EventUBs, 0, sizeof(EventUBs));
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
+    PendingEvents = 0;
+    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
+    for (auto T : inst_counter_types())
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
-    }
     memset(SgprScores, 0, sizeof(SgprScores));
   }
 
+  bool merge(const WaitcntBrackets &Other);
+
   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                              const MachineRegisterInfo *MRI,
                              const SIRegisterInfo *TRI, unsigned OpNo,
                              bool Def) const;
 
-  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
-                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
-                   unsigned OpNo, int32_t Val);
-
-  void setWaitAtBeginning() { WaitAtBeginning = true; }
-  void clearWaitAtBeginning() { WaitAtBeginning = false; }
-  bool getWaitAtBeginning() const { return WaitAtBeginning; }
-  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
   int32_t getMaxVGPR() const { return VgprUB; }
   int32_t getMaxSGPR() const { return SgprUB; }
 
-  int32_t getEventUB(enum WaitEventType W) const {
-    assert(W < NUM_WAIT_EVENTS);
-    return EventUBs[W];
-  }
-
-  bool counterOutOfOrder(InstCounterType T);
-  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
+  bool counterOutOfOrder(InstCounterType T) const;
+  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+  void determineWait(InstCounterType T, uint32_t ScoreToWait,
+                     AMDGPU::Waitcnt &Wait) const;
+  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+  void applyWaitcnt(InstCounterType T, unsigned Count);
   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                      const MachineRegisterInfo *MRI, WaitEventType E,
                      MachineInstr &MI);
 
-  bool hasPendingSMEM() const {
-    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
-            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+  bool hasPending() const { return PendingEvents != 0; }
+  bool hasPendingEvent(WaitEventType E) const {
+    return PendingEvents & (1 << E);
   }
 
   bool hasPendingFlat() const {
@@ -291,75 +286,71 @@ public:
     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
   }
 
-  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
-
-  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
-
-  bool getRevisitLoop() const { return RevisitLoop; }
-  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
+  void print(raw_ostream &);
+  void dump() { print(dbgs()); }
 
-  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
-  int32_t getPostOrder() const { return PostOrder; }
+private:
+  struct MergeInfo {
+    uint32_t OldLB;
+    uint32_t OtherLB;
+    uint32_t MyShift;
+    uint32_t OtherShift;
+  };
+  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
+                         uint32_t OtherScore);
+
+  void setScoreLB(InstCounterType T, uint32_t Val) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return;
+    ScoreLBs[T] = Val;
+  }
 
-  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
-  void clearWaitcnt() { Waitcnt = nullptr; }
-  MachineInstr *getWaitcnt() const { return Waitcnt; }
+  void setScoreUB(InstCounterType T, uint32_t Val) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return;
+    ScoreUBs[T] = Val;
+    if (T == EXP_CNT) {
+      uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
+      if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
+        ScoreLBs[T] = UB;
+    }
+  }
 
-  bool mixedExpTypes() const { return MixedExpTypes; }
-  void setMixedExpTypes(bool MixedExpTypesIn) {
-    MixedExpTypes = MixedExpTypesIn;
+  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
+    if (GprNo < NUM_ALL_VGPRS) {
+      if (GprNo > VgprUB) {
+        VgprUB = GprNo;
+      }
+      VgprScores[T][GprNo] = Val;
+    } else {
+      assert(T == LGKM_CNT);
+      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
+        SgprUB = GprNo - NUM_ALL_VGPRS;
+      }
+      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+    }
   }
 
-  void print(raw_ostream &);
-  void dump() { print(dbgs()); }
+  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+                   unsigned OpNo, uint32_t Val);
 
-private:
   const GCNSubtarget *ST = nullptr;
-  bool WaitAtBeginning = false;
-  bool RevisitLoop = false;
-  bool MixedExpTypes = false;
-  int32_t PostOrder = 0;
-  MachineInstr *Waitcnt = nullptr;
-  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
-  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
-  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
+  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
+  uint32_t PendingEvents = 0;
+  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
   // Remember the last flat memory operation.
-  int32_t LastFlat[NUM_INST_CNTS] = {0};
+  uint32_t LastFlat[NUM_INST_CNTS] = {0};
   // wait_cnt scores for every vgpr.
   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
   int32_t VgprUB = 0;
   int32_t SgprUB = 0;
-  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
-  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
-};
-
-// This is a per-loop-region object that records waitcnt status at the end of
-// loop footer from the previous iteration. We also maintain an iteration
-// count to track the number of times the loop has been visited. When it
-// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
-// at the end of the loop footer.
-class LoopWaitcntData {
-public:
-  LoopWaitcntData() = default;
-  ~LoopWaitcntData() = default;
-
-  void incIterCnt() { IterCnt++; }
-  void resetIterCnt() { IterCnt = 0; }
-  unsigned getIterCnt() { return IterCnt; }
-
-  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
-  MachineInstr *getWaitcnt() const { return LfWaitcnt; }
-
-  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }
-
-private:
-  // s_waitcnt added at the end of loop footer to stablize wait scores
-  // at the end of the loop footer.
-  MachineInstr *LfWaitcnt = nullptr;
-  // Number of iterations the loop has been visited, not including the initial
-  // walk over.
-  int32_t IterCnt = 0;
+  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
 };
 
 class SIInsertWaitcnts : public MachineFunctionPass {
@@ -368,19 +359,21 @@ private:
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
-  const MachineLoopInfo *MLI = nullptr;
   AMDGPU::IsaVersion IV;
 
-  DenseSet<MachineBasicBlock *> BlockVisitedSet;
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseSet<MachineInstr *> VCCZBugHandledSet;
 
-  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
-      BlockWaitcntBracketsMap;
+  struct BlockInfo {
+    MachineBasicBlock *MBB;
+    std::unique_ptr<WaitcntBrackets> Incoming;
+    bool Dirty = true;
 
-  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
+    explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
+  };
 
-  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
+  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
+  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
 
   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
   // because of amdgpu-waitcnt-forcezero flag
@@ -404,13 +397,11 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineLoopInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
   bool isForceEmitWaitcnt() const {
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1))
+    for (auto T : inst_counter_types())
       if (ForceEmitWaitcnt[T])
         return true;
     return false;
@@ -444,27 +435,22 @@ public:
   }
 
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
-  void generateWaitcntInstBefore(MachineInstr &MI,
-                                  BlockWaitcntBrackets *ScoreBrackets);
+  bool generateWaitcntInstBefore(MachineInstr &MI,
+                                 WaitcntBrackets &ScoreBrackets,
+                                 MachineInstr *OldWaitcntInstr);
   void updateEventWaitcntAfter(MachineInstr &Inst,
-                               BlockWaitcntBrackets *ScoreBrackets);
-  void mergeInputScoreBrackets(MachineBasicBlock &Block);
-  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
-  unsigned countNumBottomBlocks(const MachineLoop *Loop);
-  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
-  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
-  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
-  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
+                               WaitcntBrackets *ScoreBrackets);
+  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
+                            WaitcntBrackets &ScoreBrackets);
 };
 
 } // end anonymous namespace
 
-RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
-                                                 const SIInstrInfo *TII,
-                                                 const MachineRegisterInfo *MRI,
-                                                 const SIRegisterInfo *TRI,
-                                                 unsigned OpNo,
-                                                 bool Def) const {
+RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
+                                            const SIInstrInfo *TII,
+                                            const MachineRegisterInfo *MRI,
+                                            const SIRegisterInfo *TRI,
+                                            unsigned OpNo, bool Def) const {
   const MachineOperand &Op = MI->getOperand(OpNo);
   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
       (Def && !Op.isDef()))
@@ -502,11 +488,11 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
   return Result;
 }
 
-void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
-                                       const SIInstrInfo *TII,
-                                       const SIRegisterInfo *TRI,
-                                       const MachineRegisterInfo *MRI,
-                                       unsigned OpNo, int32_t Val) {
+void WaitcntBrackets::setExpScore(const MachineInstr *MI,
+                                  const SIInstrInfo *TII,
+                                  const SIRegisterInfo *TRI,
+                                  const MachineRegisterInfo *MRI, unsigned OpNo,
+                                  uint32_t Val) {
   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
   LLVM_DEBUG({
     const MachineOperand &Opnd = MI->getOperand(OpNo);
@@ -517,26 +503,26 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
   }
 }
 
-void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
-                                         const SIRegisterInfo *TRI,
-                                         const MachineRegisterInfo *MRI,
-                                         WaitEventType E, MachineInstr &Inst) {
+void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+                                    const SIRegisterInfo *TRI,
+                                    const MachineRegisterInfo *MRI,
+                                    WaitEventType E, MachineInstr &Inst) {
   const MachineRegisterInfo &MRIA = *MRI;
   InstCounterType T = eventCounter(E);
-  int32_t CurrScore = getScoreUB(T) + 1;
-  // EventUB and ScoreUB need to be update regardless if this event changes
-  // the score of a register or not.
+  uint32_t CurrScore = getScoreUB(T) + 1;
+  if (CurrScore == 0)
+    report_fatal_error("InsertWaitcnt score wraparound");
+  // PendingEvents and ScoreUB need to be update regardless if this event
+  // changes the score of a register or not.
   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
-  EventUBs[E] = CurrScore;
+  if (!hasPendingEvent(E)) {
+    if (PendingEvents & WaitEventMaskForInst[T])
+      MixedPendingEvents[T] = true;
+    PendingEvents |= 1 << E;
+  }
   setScoreUB(T, CurrScore);
 
   if (T == EXP_CNT) {
-    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
-    // is required.
-    if (!MixedExpTypes) {
-      MixedExpTypes = counterOutOfOrder(EXP_CNT);
-    }
-
     // Put score on the source vgprs. If this is a store, just use those
     // specific register(s).
     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
@@ -661,12 +647,11 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
   }
 }
 
-void BlockWaitcntBrackets::print(raw_ostream &OS) {
+void WaitcntBrackets::print(raw_ostream &OS) {
   OS << '\n';
-  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-       T = (enum InstCounterType)(T + 1)) {
-    int LB = getScoreLB(T);
-    int UB = getScoreUB(T);
+  for (auto T : inst_counter_types()) {
+    uint32_t LB = getScoreLB(T);
+    uint32_t UB = getScoreUB(T);
 
     switch (T) {
     case VM_CNT:
@@ -686,10 +671,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
     if (LB < UB) {
       // Print vgpr scores.
       for (int J = 0; J <= getMaxVGPR(); J++) {
-        int RegScore = getRegScore(J, T);
+        uint32_t RegScore = getRegScore(J, T);
         if (RegScore <= LB)
           continue;
-        int RelScore = RegScore - LB - 1;
+        uint32_t RelScore = RegScore - LB - 1;
         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
           OS << RelScore << ":v" << J << " ";
         } else {
@@ -699,10 +684,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
       // Also need to print sgpr scores for lgkm_cnt.
       if (T == LGKM_CNT) {
         for (int J = 0; J <= getMaxSGPR(); J++) {
-          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+          uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
           if (RegScore <= LB)
             continue;
-          int RelScore = RegScore - LB - 1;
+          uint32_t RelScore = RegScore - LB - 1;
           OS << RelScore << ":s" << J << " ";
         }
       }
@@ -712,23 +697,31 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
   OS << '\n';
 }
 
-unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
-                                                int ScoreToWait) {
-  unsigned int NeedWait = 0;
-  if (ScoreToWait == -1) {
-    // The score to wait is unknown. This implies that it was not encountered
-    // during the path of the CFG walk done during the current traversal but
-    // may be seen on a different path. Emit an s_wait counter with a
-    // conservative value of 0 for the counter.
-    NeedWait = CNT_MASK(T);
-    setScoreLB(T, getScoreUB(T));
-    return NeedWait;
-  }
+/// Simplify the waitcnt, in the sense of removing redundant counts, and return
+/// whether a waitcnt instruction is needed at all.
+bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
+         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
+         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+}
+
+bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+                                      unsigned &Count) const {
+  const uint32_t LB = getScoreLB(T);
+  const uint32_t UB = getScoreUB(T);
+  if (Count < UB && UB - Count > LB)
+    return true;
 
+  Count = ~0u;
+  return false;
+}
+
+void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
+                                    AMDGPU::Waitcnt &Wait) const {
   // If the score of src_operand falls within the bracket, we need an
   // s_waitcnt instruction.
-  const int32_t LB = getScoreLB(T);
-  const int32_t UB = getScoreUB(T);
+  const uint32_t LB = getScoreLB(T);
+  const uint32_t UB = getScoreUB(T);
   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
     if ((T == VM_CNT || T == LGKM_CNT) &&
         hasPendingFlat() &&
@@ -736,90 +729,46 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
       // If there is a pending FLAT operation, and this is a VMem or LGKM
       // waitcnt and the target can report early completion, then we need
       // to force a waitcnt 0.
-      NeedWait = CNT_MASK(T);
-      setScoreLB(T, getScoreUB(T));
+      addWait(Wait, T, 0);
     } else if (counterOutOfOrder(T)) {
       // Counter can get decremented out-of-order when there
       // are multiple types event in the bracket. Also emit an s_wait counter
       // with a conservative value of 0 for the counter.
-      NeedWait = CNT_MASK(T);
-      setScoreLB(T, getScoreUB(T));
+      addWait(Wait, T, 0);
     } else {
-      NeedWait = CNT_MASK(T);
-      setScoreLB(T, ScoreToWait);
+      addWait(Wait, T, UB - ScoreToWait);
     }
   }
+}
 
-  return NeedWait;
+void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
+  applyWaitcnt(VM_CNT, Wait.VmCnt);
+  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
 }
 
-// Where there are multiple types of event in the bracket of a counter,
-// the decrement may go out of order.
-bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
-  switch (T) {
-  case VM_CNT:
-    return false;
-  case LGKM_CNT: {
-    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      // Scalar memory read always can go out of order.
-      return true;
-    }
-    int NumEventTypes = 0;
-    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (NumEventTypes <= 1) {
-      return false;
-    }
-    break;
+void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
+  const uint32_t UB = getScoreUB(T);
+  if (Count >= UB)
+    return;
+  if (Count != 0) {
+    if (counterOutOfOrder(T))
+      return;
+    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
+  } else {
+    setScoreLB(T, UB);
+    MixedPendingEvents[T] = false;
+    PendingEvents &= ~WaitEventMaskForInst[T];
   }
-  case EXP_CNT: {
-    // If there has been a mixture of export types, then a waitcnt exp(0) is
-    // required.
-    if (MixedExpTypes)
-      return true;
-    int NumEventTypes = 0;
-    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-
-    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
+}
 
-    if (NumEventTypes <= 1) {
-      return false;
-    }
-    break;
-  }
-  default:
-    break;
-  }
-  return true;
+// Where there are multiple types of event in the bracket of a counter,
+// the decrement may go out of order.
+bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
+  // Scalar memory read always can go out of order.
+  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
+    return true;
+  return MixedPendingEvents[T];
 }
 
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
@@ -841,29 +790,6 @@ static bool readsVCCZ(const MachineInstr &MI) {
          !MI.getOperand(1).isUndef();
 }
 
-/// Given wait count encodings checks if LHS is stronger than RHS.
-bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
-  if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
-    return false;
-  if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
-    return false;
-  if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
-    return false;
-  return true;
-}
-
-/// Given wait count encodings create a new encoding which is stronger
-/// or equal to both.
-unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
-  unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
-                            AMDGPU::decodeVmcnt(IV, RHS));
-  unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
-                              AMDGPU::decodeLgkmcnt(IV, RHS));
-  unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
-                             AMDGPU::decodeExpcnt(IV, RHS));
-  return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
-}
-
 ///  Generate s_waitcnt instruction to be placed before cur_Inst.
 ///  Instructions of a given type are returned in order,
 ///  but instructions of different types can complete out of order.
@@ -874,41 +800,23 @@ unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
 ///  and if so what the value of each counter is.
 ///  The "score bracket" is bound by the lower bound and upper bound
 ///  scores (*_score_LB and *_score_ub respectively).
-void SIInsertWaitcnts::generateWaitcntInstBefore(
-    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
-  // To emit, or not to emit - that's the question!
-  // Start with an assumption that there is no need to emit.
-  unsigned int EmitWaitcnt = 0;
-
-  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
-  bool ForceEmitZeroWaitcnt = false;
-
+bool SIInsertWaitcnts::generateWaitcntInstBefore(
+    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
+    MachineInstr *OldWaitcntInstr) {
   setForceEmitWaitcnt();
   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
 
   if (MI.isDebugInstr())
-    return;
+    return false;
 
-  // See if an s_waitcnt is forced at block entry, or is needed at
-  // program end.
-  if (ScoreBrackets->getWaitAtBeginning()) {
-    // Note that we have already cleared the state, so we don't need to update
-    // it.
-    ScoreBrackets->clearWaitAtBeginning();
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
-      EmitWaitcnt |= CNT_MASK(T);
-      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-    }
-  }
+  AMDGPU::Waitcnt Wait;
 
   // See if this instruction has a forced S_WAITCNT VM.
   // TODO: Handle other cases of NeedsWaitcntVmBefore()
-  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
-           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
-           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
-    EmitWaitcnt |=
-        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
+      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
+      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+    Wait.VmCnt = 0;
   }
 
   // All waits must be resolved at call return.
@@ -916,23 +824,14 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
   //   with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
-      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
-        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-        EmitWaitcnt |= CNT_MASK(T);
-      }
-    }
+    Wait = AMDGPU::Waitcnt::allZero();
   }
   // Resolve vm waits before gs-done.
   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
             AMDGPU::SendMsg::ID_GS_DONE)) {
-    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
-      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-      EmitWaitcnt |= CNT_MASK(VM_CNT);
-    }
+    Wait.VmCnt = 0;
   }
 #if 0 // TODO: the following blocks of logic when we have fence.
   else if (MI.getOpcode() == SC_FENCE) {
@@ -996,14 +895,12 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
       // Export and GDS are tracked individually, either may trigger a waitcnt
       // for EXEC.
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
+      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
+          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
+          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
+          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
+        Wait.ExpCnt = 0;
+      }
     }
 
 #if 0 // TODO: the following code to handle CALL.
@@ -1035,23 +932,23 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
         continue;
       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
       // VM_CNT is only relevant to vgpr or LDS.
-      EmitWaitcnt |= ScoreBrackets->updateByWait(
-          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+      ScoreBrackets.determineWait(
+          VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
     }
 
     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
       const MachineOperand &Op = MI.getOperand(I);
       const MachineRegisterInfo &MRIA = *MRI;
       RegInterval Interval =
-          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
+          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
         if (TRI->isVGPR(MRIA, Op.getReg())) {
           // VM_CNT is only relevant to vgpr or LDS.
-          EmitWaitcnt |= ScoreBrackets->updateByWait(
-              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+          ScoreBrackets.determineWait(
+              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
         }
-        EmitWaitcnt |= ScoreBrackets->updateByWait(
-            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+        ScoreBrackets.determineWait(
+            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
     }
     // End of for loop that looks at all source operands to decide vm_wait_cnt
@@ -1069,26 +966,26 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
         if (AS != AMDGPUAS::LOCAL_ADDRESS)
           continue;
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-        EmitWaitcnt |= ScoreBrackets->updateByWait(
-            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
-        EmitWaitcnt |= ScoreBrackets->updateByWait(
-            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+        ScoreBrackets.determineWait(
+            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+        ScoreBrackets.determineWait(
+            EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
       }
     }
     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
       MachineOperand &Def = MI.getOperand(I);
       const MachineRegisterInfo &MRIA = *MRI;
       RegInterval Interval =
-          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
+          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
         if (TRI->isVGPR(MRIA, Def.getReg())) {
-          EmitWaitcnt |= ScoreBrackets->updateByWait(
-              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
-          EmitWaitcnt |= ScoreBrackets->updateByWait(
-              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+          ScoreBrackets.determineWait(
+              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+          ScoreBrackets.determineWait(
+              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
         }
-        EmitWaitcnt |= ScoreBrackets->updateByWait(
-            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+        ScoreBrackets.determineWait(
+            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
     } // End of for loop that looks at all dest operands.
   }
@@ -1099,172 +996,79 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
   // requiring a WAITCNT beforehand.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
       !ST->hasAutoWaitcntBeforeBarrier()) {
-    EmitWaitcnt |=
-        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-    EmitWaitcnt |= ScoreBrackets->updateByWait(
-        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-    EmitWaitcnt |= ScoreBrackets->updateByWait(
-        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
+    Wait = AMDGPU::Waitcnt::allZero();
   }
 
   // TODO: Remove this work-around, enable the assert for Bug 457939
   //       after fixing the scheduler. Also, the Shader Compiler code is
   //       independent of target.
   if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
-    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
-            ScoreBrackets->getScoreUB(LGKM_CNT) &&
-        ScoreBrackets->hasPendingSMEM()) {
-      // Wait on everything, not just LGKM.  vccz reads usually come from
-      // terminators, and we always wait on everything at the end of the
-      // block, so if we only wait on LGKM here, we might end up with
-      // another s_waitcnt inserted right after this if there are non-LGKM
-      // instructions still outstanding.
-      // FIXME: this is too conservative / the comment is wrong.
-      // We don't wait on everything at the end of the block and we combine
-      // waitcnts so we should never have back-to-back waitcnts.
-      ForceEmitZeroWaitcnt = true;
-      EmitWaitcnt = true;
+    if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+            ScoreBrackets.getScoreUB(LGKM_CNT) &&
+        ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+      Wait.LgkmCnt = 0;
     }
   }
 
-  // Does this operand processing indicate s_wait counter update?
-  if (EmitWaitcnt || IsForceEmitWaitcnt) {
-    int CntVal[NUM_INST_CNTS];
-
-    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
-      // Force all waitcnts to 0.
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-      }
-      CntVal[VM_CNT] = 0;
-      CntVal[EXP_CNT] = 0;
-      CntVal[LGKM_CNT] = 0;
-    } else {
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        if (EmitWaitcnt & CNT_MASK(T)) {
-          int Delta =
-              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
-          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
-          if (Delta >= MaxDelta) {
-            Delta = -1;
-            if (T != EXP_CNT) {
-              ScoreBrackets->setScoreLB(
-                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
-            }
-            EmitWaitcnt &= ~CNT_MASK(T);
-          }
-          CntVal[T] = Delta;
-        } else {
-          // If we are not waiting for a particular counter then encode
-          // it as -1 which means "don't care."
-          CntVal[T] = -1;
-        }
+  // Early-out if no wait is indicated.
+  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
+    bool Modified = false;
+    if (OldWaitcntInstr) {
+      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
+        TrackedWaitcntSet.erase(OldWaitcntInstr);
+        OldWaitcntInstr->eraseFromParent();
+        Modified = true;
+      } else {
+        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
+        ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
       }
+      Modified = true;
     }
+    return Modified;
+  }
 
-    MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
-    int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
-    if (!OldWaitcnt ||
-        (AMDGPU::decodeVmcnt(IV, Imm) !=
-         (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
-        (AMDGPU::decodeExpcnt(IV, Imm) !=
-         (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
-        (AMDGPU::decodeLgkmcnt(IV, Imm) !=
-         (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
-      MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
-      if (ContainingLoop) {
-        MachineBasicBlock *TBB = ContainingLoop->getHeader();
-        BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
-        if (!ScoreBracket) {
-          assert(!BlockVisitedSet.count(TBB));
-          BlockWaitcntBracketsMap[TBB] =
-              llvm::make_unique<BlockWaitcntBrackets>(ST);
-          ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
-        }
-        ScoreBracket->setRevisitLoop(true);
-        LLVM_DEBUG(dbgs() << "set-revisit2: Block"
-                          << ContainingLoop->getHeader()->getNumber() << '\n';);
-      }
-    }
+  if (ForceEmitZeroWaitcnts)
+    Wait = AMDGPU::Waitcnt::allZero();
 
-    // Update an existing waitcount, or make a new one.
-    unsigned Enc = AMDGPU::encodeWaitcnt(IV,
-                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
-                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
-                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
-    // We don't remove waitcnts that existed prior to the waitcnt
-    // pass. Check if the waitcnt to-be-inserted can be avoided
-    // or if the prev waitcnt can be updated.
-    bool insertSWaitInst = true;
-    for (MachineBasicBlock::iterator I = MI.getIterator(),
-                                     B = MI.getParent()->begin();
-         insertSWaitInst && I != B; --I) {
-      if (I == MI.getIterator())
-        continue;
+  if (ForceEmitWaitcnt[VM_CNT])
+    Wait.VmCnt = 0;
+  if (ForceEmitWaitcnt[EXP_CNT])
+    Wait.ExpCnt = 0;
+  if (ForceEmitWaitcnt[LGKM_CNT])
+    Wait.LgkmCnt = 0;
 
-      switch (I->getOpcode()) {
-      case AMDGPU::S_WAITCNT:
-        if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
-          insertSWaitInst = false;
-        else if (!OldWaitcnt) {
-          OldWaitcnt = &*I;
-          Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
-        }
-        break;
-        // TODO: skip over instructions which never require wait.
-      }
-      break;
-    }
-    if (insertSWaitInst) {
-      if (OldWaitcnt) {
-        assert(OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT);
-        if (ForceEmitZeroWaitcnts)
-          LLVM_DEBUG(dbgs()
-                     << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
-        if (IsForceEmitWaitcnt)
-          LLVM_DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");
-
-        OldWaitcnt->getOperand(0).setImm(Enc);
-        if (!OldWaitcnt->getParent())
-          MI.getParent()->insert(MI, OldWaitcnt);
-
-        LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
-                          << "Old Instr: " << MI << '\n'
-                          << "New Instr: " << *OldWaitcnt << '\n');
-      } else {
-        auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
-                                 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-                             .addImm(Enc);
-        TrackedWaitcntSet.insert(SWaitInst);
-
-        LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                          << "Old Instr: " << MI << '\n'
-                          << "New Instr: " << *SWaitInst << '\n');
-      }
-    }
+  ScoreBrackets.applyWaitcnt(Wait);
 
-    if (CntVal[EXP_CNT] == 0) {
-      ScoreBrackets->setMixedExpTypes(false);
-    }
+  AMDGPU::Waitcnt OldWait;
+  if (OldWaitcntInstr) {
+    OldWait =
+        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
   }
-}
+  if (OldWait.dominates(Wait))
+    return false;
 
-void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
-                                             MachineInstr *Waitcnt) {
-  if (MBB.empty()) {
-    MBB.push_back(Waitcnt);
-    return;
-  }
+  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
+    Wait = Wait.combined(OldWait);
 
-  MachineBasicBlock::iterator It = MBB.end();
-  MachineInstr *MI = &*(--It);
-  if (MI->isBranch()) {
-    MBB.insert(It, Waitcnt);
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  if (OldWaitcntInstr) {
+    OldWaitcntInstr->getOperand(0).setImm(Enc);
+
+    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+                      << "Old Instr: " << MI << '\n'
+                      << "New Instr: " << *OldWaitcntInstr << '\n');
   } else {
-    MBB.push_back(Waitcnt);
+    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                         .addImm(Enc);
+    TrackedWaitcntSet.insert(SWaitInst);
+
+    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                      << "Old Instr: " << MI << '\n'
+                      << "New Instr: " << *SWaitInst << '\n');
   }
+
+  return true;
 }
 
 // This is a flat memory operation. Check to see if it has memory
@@ -1282,8 +1086,8 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
   return false;
 }
 
-void SIInsertWaitcnts::updateEventWaitcntAfter(
-    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
+void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
+                                               WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
   // bracket and the destination operand scores.
@@ -1349,251 +1153,121 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(
   }
 }
 
-// Merge the score brackets of the Block's predecessors;
-// this merged score bracket is used when adding waitcnts to the Block
-void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
-  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
-  int32_t MaxPending[NUM_INST_CNTS] = {0};
-  int32_t MaxFlat[NUM_INST_CNTS] = {0};
-  bool MixedExpTypes = false;
-
-  // For single basic block loops, we need to retain the Block's
-  // score bracket to have accurate Pred info. So, make a copy of Block's
-  // score bracket, clear() it (which retains several important bits of info),
-  // populate, and then replace en masse. For non-single basic block loops,
-  // just clear Block's current score bracket and repopulate in-place.
-  bool IsSelfPred;
-  std::unique_ptr<BlockWaitcntBrackets> S;
-
-  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
-    != Block.pred_end();
-  if (IsSelfPred) {
-    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
-    ScoreBrackets = S.get();
-  }
-
-  ScoreBrackets->clear();
-
-  // See if there are any uninitialized predecessors. If so, emit an
-  // s_waitcnt 0 at the beginning of the block.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.count(Pred);
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
-      continue;
-    }
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
-      int span =
-          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
-      MaxPending[T] = std::max(MaxPending[T], span);
-      span =
-          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
-      MaxFlat[T] = std::max(MaxFlat[T], span);
-    }
-
-    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
-  }
-
-  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.count(Pred);
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
-      continue;
-    }
-
-    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
-                  PredScoreBrackets->getScoreLB(EXP_CNT);
-    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
-    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
-                  PredScoreBrackets->getScoreLB(EXP_CNT);
-    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
-  }
-
-#if 0
-  // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
-  // TODO: how does LC distinguish between function entry and main entry?
-  // If this is the entry to a function, force a wait.
-  MachineBasicBlock &Entry = Block.getParent()->front();
-  if (Entry.getNumber() == Block.getNumber()) {
-    ScoreBrackets->setWaitAtBeginning();
-    return;
-  }
-#endif
-
-  // Now set the current Block's brackets to the largest ending bracket.
-  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-       T = (enum InstCounterType)(T + 1)) {
-    ScoreBrackets->setScoreUB(T, MaxPending[T]);
-    ScoreBrackets->setScoreLB(T, 0);
-    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
-  }
-
-  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
-
-  // Set the register scoreboard.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (!BlockVisitedSet.count(Pred)) {
-      continue;
-    }
-
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-
-    // Now merge the gpr_reg_score information
-    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-         T = (enum InstCounterType)(T + 1)) {
-      int PredLB = PredScoreBrackets->getScoreLB(T);
-      int PredUB = PredScoreBrackets->getScoreUB(T);
-      if (PredLB < PredUB) {
-        int PredScale = MaxPending[T] - PredUB;
-        // Merge vgpr scores.
-        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
-          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
-          if (PredRegScore <= PredLB)
-            continue;
-          int NewRegScore = PredScale + PredRegScore;
-          ScoreBrackets->setRegScore(
-              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
-        }
-        // Also need to merge sgpr scores for lgkm_cnt.
-        if (T == LGKM_CNT) {
-          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
-            int PredRegScore =
-                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
-            if (PredRegScore <= PredLB)
-              continue;
-            int NewRegScore = PredScale + PredRegScore;
-            ScoreBrackets->setRegScore(
-                J + NUM_ALL_VGPRS, LGKM_CNT,
-                std::max(
-                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
-                    NewRegScore));
-          }
-        }
-      }
-    }
-
-    // Also merge the WaitEvent information.
-    ForAllWaitEventType(W) {
-      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
-      int PredEventUB = PredScoreBrackets->getEventUB(W);
-      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
-        int NewEventUB =
-            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
-        if (NewEventUB > 0) {
-          ScoreBrackets->setEventUB(
-              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
-        }
-      }
-    }
-  }
-
-  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
-  // sequencing predecessors, because changes to EXEC require waitcnts due to
-  // the delayed nature of these operations.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (!BlockVisitedSet.count(Pred)) {
-      continue;
-    }
+bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
+                                 uint32_t OtherScore) {
+  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
+  uint32_t OtherShifted =
+      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
+  Score = std::max(MyShifted, OtherShifted);
+  return OtherShifted > MyShifted;
+}
 
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-
-    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
-    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
-      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
-                       PredScoreBrackets->getScoreUB(EXP_CNT);
-      if (new_gds_ub > 0) {
-        ScoreBrackets->setEventUB(
-            GDS_GPR_LOCK,
-            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
-      }
-    }
-    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
-    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
-      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
-                       PredScoreBrackets->getScoreUB(EXP_CNT);
-      if (new_exp_ub > 0) {
-        ScoreBrackets->setEventUB(
-            EXP_GPR_LOCK,
-            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
+/// Merge the pending events and associater score brackets of \p Other into
+/// this brackets status.
+///
+/// Returns whether the merge resulted in a change that requires tighter waits
+/// (i.e. the merged brackets strictly dominate the original brackets).
+bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
+  bool StrictDom = false;
+
+  for (auto T : inst_counter_types()) {
+    // Merge event flags for this counter
+    const bool OldOutOfOrder = counterOutOfOrder(T);
+    const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
+    const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
+    if (OtherEvents & ~OldEvents)
+      StrictDom = true;
+    if (Other.MixedPendingEvents[T] ||
+        (OldEvents && OtherEvents && OldEvents != OtherEvents))
+      MixedPendingEvents[T] = true;
+    PendingEvents |= OtherEvents;
+
+    // Merge scores for this counter
+    const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
+    const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+    MergeInfo M;
+    M.OldLB = ScoreLBs[T];
+    M.OtherLB = Other.ScoreLBs[T];
+    M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
+    M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
+
+    const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
+    if (NewUB < ScoreUBs[T])
+      report_fatal_error("waitcnt score overflow");
+    ScoreUBs[T] = NewUB;
+    ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
+
+    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
+
+    bool RegStrictDom = false;
+    for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
+         J++) {
+      RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
+    }
+
+    if (T == LGKM_CNT) {
+      for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
+           J != E; J++) {
+        RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
       }
     }
-  }
 
-  // if a single block loop, update the score brackets. Not needed for other
-  // blocks, as we did this in-place
-  if (IsSelfPred) {
-    BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+    if (RegStrictDom && !OldOutOfOrder)
+      StrictDom = true;
   }
-}
 
-/// Return true if the given basic block is a "bottom" block of a loop.
-/// This works even if the loop is discontiguous. This also handles
-/// multiple back-edges for the same "header" block of a loop.
-bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
-                                    const MachineBasicBlock *Block) {
-  for (MachineBasicBlock *MBB : Loop->blocks()) {
-    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
-      return true;
-    }
-  }
-  return false;
-}
-
-/// Count the number of "bottom" basic blocks of a loop.
-unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
-  unsigned Count = 0;
-  for (MachineBasicBlock *MBB : Loop->blocks()) {
-    if (MBB->isSuccessor(Loop->getHeader())) {
-      Count++;
-    }
-  }
-  return Count;
+  return StrictDom;
 }
 
 // Generate s_waitcnt instructions where needed.
-void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
-                                            MachineBasicBlock &Block) {
-  // Initialize the state information.
-  mergeInputScoreBrackets(Block);
-
-  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
+                                            MachineBasicBlock &Block,
+                                            WaitcntBrackets &ScoreBrackets) {
+  bool Modified = false;
 
   LLVM_DEBUG({
     dbgs() << "*** Block" << Block.getNumber() << " ***";
-    ScoreBrackets->dump();
+    ScoreBrackets.dump();
   });
 
   // Walk over the instructions.
+  MachineInstr *OldWaitcntInstr = nullptr;
+
   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
        Iter != E;) {
     MachineInstr &Inst = *Iter;
+
     // Remove any previously existing waitcnts.
     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
-      // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
-      // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
-      // as needed.
-      if (!TrackedWaitcntSet.count(&Inst))
-        ++Iter;
-      else {
-        ++Iter;
-        Inst.removeFromParent();
+      if (OldWaitcntInstr) {
+        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
+          TrackedWaitcntSet.erase(OldWaitcntInstr);
+          OldWaitcntInstr->eraseFromParent();
+          OldWaitcntInstr = nullptr;
+        } else if (!TrackedWaitcntSet.count(&Inst)) {
+          // Two successive s_waitcnt's, both of which are pre-existing and
+          // are therefore preserved.
+          int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
+          ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+        } else {
+          ++Iter;
+          Inst.eraseFromParent();
+          Modified = true;
+          continue;
+        }
       }
-      ScoreBrackets->setWaitcnt(&Inst);
+
+      OldWaitcntInstr = &Inst;
+      ++Iter;
       continue;
     }
 
     bool VCCZBugWorkAround = false;
     if (readsVCCZ(Inst) &&
         (!VCCZBugHandledSet.count(&Inst))) {
-      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
-              ScoreBrackets->getScoreUB(LGKM_CNT) &&
-          ScoreBrackets->hasPendingSMEM()) {
+      if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+              ScoreBrackets.getScoreUB(LGKM_CNT) &&
+          ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
         if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
           VCCZBugWorkAround = true;
       }
@@ -1601,9 +1275,10 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
-    generateWaitcntInstBefore(Inst, ScoreBrackets);
+    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+    OldWaitcntInstr = nullptr;
 
-    updateEventWaitcntAfter(Inst, ScoreBrackets);
+    updateEventWaitcntAfter(Inst, &ScoreBrackets);
 
 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
     // If this instruction generates a S_SETVSKIP because it is an
@@ -1616,11 +1291,9 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     }
 #endif
 
-    ScoreBrackets->clearWaitcnt();
-
     LLVM_DEBUG({
       Inst.print(dbgs());
-      ScoreBrackets->dump();
+      ScoreBrackets.dump();
     });
 
     // Check to see if this is a GWS instruction. If so, and if this is CI or
@@ -1632,10 +1305,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
         Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
       // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
-      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-      ScoreBrackets->updateByWait(LGKM_CNT,
-                                  ScoreBrackets->getScoreUB(LGKM_CNT));
+      ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero());
     }
 
     // TODO: Remove this work-around after fixing the scheduler and enable the
@@ -1648,71 +1318,13 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
               AMDGPU::VCC)
           .addReg(AMDGPU::VCC);
       VCCZBugHandledSet.insert(&Inst);
+      Modified = true;
     }
 
     ++Iter;
   }
 
-  // Check if we need to force convergence at loop footer.
-  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
-  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
-    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
-    WaitcntData->print();
-    LLVM_DEBUG(dbgs() << '\n';);
-
-    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
-    // placement, but doesn't guarantee convergence for a loop. Each
-    // loop should take at most (n+1) iterations for it to converge naturally,
-    // where n is the number of bottom blocks. If this threshold is reached and
-    // the result hasn't converged, then we force convergence by inserting
-    // a s_waitcnt at the end of loop footer.
-    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
-      // To ensure convergence, need to make wait events at loop footer be no
-      // more than those from the previous iteration.
-      // As a simplification, instead of tracking individual scores and
-      // generating the precise wait count, just wait on 0.
-      bool HasPending = false;
-      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
-          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-          HasPending = true;
-          break;
-        }
-      }
-
-      if (HasPending) {
-        if (!SWaitInst) {
-          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
-                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-                              .addImm(0);
-          TrackedWaitcntSet.insert(SWaitInst);
-#if 0 // TODO: Format the debug output
-          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
-          OutputTransformAdd(SWaitInst, context);
-#endif
-        }
-#if 0 // TODO: ??
-        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
-#endif
-      }
-
-      if (SWaitInst) {
-        LLVM_DEBUG({
-          SWaitInst->print(dbgs());
-          dbgs() << "\nAdjusted score board:";
-          ScoreBrackets->dump();
-        });
-
-        // Add this waitcnt to the block. It is either newly created or
-        // created in previous iterations and added back since block traversal
-        // always removes waitcnts.
-        insertWaitcntBeforeCF(Block, SWaitInst);
-        WaitcntData->setWaitcnt(SWaitInst);
-      }
-    }
-  }
+  return Modified;
 }
 
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
@@ -1720,13 +1332,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
-  MLI = &getAnalysis<MachineLoopInfo>();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
-  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-       T = (enum InstCounterType)(T + 1))
+  for (auto T : inst_counter_types())
     ForceEmitWaitcnt[T] = false;
 
   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
@@ -1746,93 +1356,70 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
 
   TrackedWaitcntSet.clear();
-  BlockVisitedSet.clear();
   VCCZBugHandledSet.clear();
-  LoopWaitcntDataMap.clear();
-  BlockWaitcntProcessedSet.clear();
+  RpotIdxMap.clear();
+  BlockInfos.clear();
+
+  // Keep iterating over the blocks in reverse post order, inserting and
+  // updating s_waitcnt where needed, until a fix point is reached.
+  for (MachineBasicBlock *MBB :
+       ReversePostOrderTraversal<MachineFunction *>(&MF)) {
+    RpotIdxMap[MBB] = BlockInfos.size();
+    BlockInfos.emplace_back(MBB);
+  }
 
-  // Walk over the blocks in reverse post order, inserting
-  // s_waitcnt where needed.
-  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  std::unique_ptr<WaitcntBrackets> Brackets;
   bool Modified = false;
-  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
-           I = RPOT.begin(),
-           E = RPOT.end(), J = RPOT.begin();
-       I != E;) {
-    MachineBasicBlock &MBB = **I;
-
-    BlockVisitedSet.insert(&MBB);
-
-    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
-    if (!ScoreBrackets) {
-      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
-      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
-    }
-    ScoreBrackets->setPostOrder(MBB.getNumber());
-    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
-    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
-      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
-
-    // If we are walking into the block from before the loop, then guarantee
-    // at least 1 re-walk over the loop to propagate the information, even if
-    // no S_WAITCNT instructions were generated.
-    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
-      unsigned Count = countNumBottomBlocks(ContainingLoop);
-
-      // If the loop has multiple back-edges, and so more than one "bottom"
-      // basic block, we have to guarantee a re-walk over every blocks.
-      if ((std::count(BlockWaitcntProcessedSet.begin(),
-                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
-        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
-                          << ContainingLoop->getHeader()->getNumber() << '\n';);
+  bool Repeat;
+  do {
+    Repeat = false;
+
+    for (BlockInfo &BI : BlockInfos) {
+      if (!BI.Dirty)
+        continue;
+
+      unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
+
+      if (BI.Incoming) {
+        if (!Brackets)
+          Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
+        else
+          *Brackets = *BI.Incoming;
+      } else {
+        if (!Brackets)
+          Brackets = llvm::make_unique<WaitcntBrackets>(ST);
+        else
+          Brackets->clear();
       }
-    }
 
-    // Walk over the instructions.
-    insertWaitcntInBlock(MF, MBB);
-
-    // Record that waitcnts have been processed at least once for this block.
-    BlockWaitcntProcessedSet.push_back(&MBB);
-
-    // See if we want to revisit the loop. If a loop has multiple back-edges,
-    // we shouldn't revisit the same "bottom" basic block.
-    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
-        std::count(BlockWaitcntProcessedSet.begin(),
-                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
-      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
-      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
-      if (EntrySB && EntrySB->getRevisitLoop()) {
-        EntrySB->setRevisitLoop(false);
-        J = I;
-        int32_t PostOrder = EntrySB->getPostOrder();
-        // TODO: Avoid this loop. Find another way to set I.
-        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
-                 X = RPOT.begin(),
-                 Y = RPOT.end();
-             X != Y; ++X) {
-          MachineBasicBlock &MBBX = **X;
-          if (MBBX.getNumber() == PostOrder) {
-            I = X;
-            break;
+      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
+      BI.Dirty = false;
+
+      if (Brackets->hasPending()) {
+        BlockInfo *MoveBracketsToSucc = nullptr;
+        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
+          unsigned SuccIdx = RpotIdxMap[Succ];
+          BlockInfo &SuccBI = BlockInfos[SuccIdx];
+          if (!SuccBI.Incoming) {
+            SuccBI.Dirty = true;
+            if (SuccIdx <= Idx)
+              Repeat = true;
+            if (!MoveBracketsToSucc) {
+              MoveBracketsToSucc = &SuccBI;
+            } else {
+              SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
+            }
+          } else if (SuccBI.Incoming->merge(*Brackets)) {
+            SuccBI.Dirty = true;
+            if (SuccIdx <= Idx)
+              Repeat = true;
           }
         }
-        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
-        WaitcntData->incIterCnt();
-        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
-        continue;
-      } else {
-        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
-        // Loop converged, reset iteration count. If this loop gets revisited,
-        // it must be from an outer loop, the counter will restart, this will
-        // ensure we don't force convergence on such revisits.
-        WaitcntData->resetIterCnt();
+        if (MoveBracketsToSucc)
+          MoveBracketsToSucc->Incoming = std::move(Brackets);
       }
     }
-
-    J = I;
-    ++I;
-  }
+  } while (Repeat);
 
   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
 
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index b73d30940fc38edb933098b7d2b44b6f1268e40f..65ffc27b8b6084068c8b5f0da64ed079f98a5b91 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@ class InstSI <dag outs, dag ins, string asm = "",
   // This bit indicates that this is a D16 buffer instruction.
   field bit D16Buf = 0;
 
+  // This bit indicates that this uses the floating point double precision
+  // rounding mode flags
+  field bit FPDPRounding = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
@@ -178,6 +182,8 @@ class InstSI <dag outs, dag ins, string asm = "",
 
   let TSFlags{50} = D16Buf;
 
+  let TSFlags{51} = FPDPRounding;
+
   let SchedRW = [Write32Bit];
 
   field bits<1> DisableSIDecoder = 0;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 562428ef37c011a2badf1df50d485186b0ed46e8..b7c4eed621124654b9a6de824f9a9e6eb0fe44f0 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -265,9 +265,10 @@ static bool isStride64(unsigned Opc) {
   }
 }
 
-bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
-                                        int64_t &Offset,
-                                        const TargetRegisterInfo *TRI) const {
+bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
+                                          MachineOperand *&BaseOp,
+                                          int64_t &Offset,
+                                          const TargetRegisterInfo *TRI) const {
   unsigned Opc = LdSt.getOpcode();
 
   if (isDS(LdSt)) {
@@ -275,11 +276,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     if (OffsetImm) {
       // Normal, single offset LDS instruction.
-      const MachineOperand *AddrReg =
-          getNamedOperand(LdSt, AMDGPU::OpName::addr);
-
-      BaseReg = AddrReg->getReg();
+      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
       Offset = OffsetImm->getImm();
+      assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                                "operands of type register.");
       return true;
     }
 
@@ -310,10 +310,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
       if (isStride64(Opc))
         EltSize *= 64;
 
-      const MachineOperand *AddrReg =
-          getNamedOperand(LdSt, AMDGPU::OpName::addr);
-      BaseReg = AddrReg->getReg();
+      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
       Offset = EltSize * Offset0;
+      assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                                "operands of type register.");
       return true;
     }
 
@@ -325,19 +325,20 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
     if (SOffset && SOffset->isReg())
       return false;
 
-    const MachineOperand *AddrReg =
-        getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+    MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     if (!AddrReg)
       return false;
 
     const MachineOperand *OffsetImm =
         getNamedOperand(LdSt, AMDGPU::OpName::offset);
-    BaseReg = AddrReg->getReg();
+    BaseOp = AddrReg;
     Offset = OffsetImm->getImm();
 
     if (SOffset) // soffset can be an inline immediate.
       Offset += SOffset->getImm();
 
+    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                              "operands of type register.");
     return true;
   }
 
@@ -347,36 +348,46 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
     if (!OffsetImm)
       return false;
 
-    const MachineOperand *SBaseReg =
-        getNamedOperand(LdSt, AMDGPU::OpName::sbase);
-    BaseReg = SBaseReg->getReg();
+    MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+    BaseOp = SBaseReg;
     Offset = OffsetImm->getImm();
+    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                              "operands of type register.");
     return true;
   }
 
   if (isFLAT(LdSt)) {
-    const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+    MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     if (VAddr) {
       // Can't analyze 2 offsets.
       if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
         return false;
 
-      BaseReg = VAddr->getReg();
+      BaseOp = VAddr;
     } else {
       // scratch instructions have either vaddr or saddr.
-      BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
+      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
     }
 
     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
+    assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                              "operands of type register.");
     return true;
   }
 
   return false;
 }
 
-static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
-                                  const MachineInstr &MI2, unsigned BaseReg2) {
-  if (BaseReg1 == BaseReg2)
+static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
+                                  const MachineOperand &BaseOp1,
+                                  const MachineInstr &MI2,
+                                  const MachineOperand &BaseOp2) {
+  // Support only base operands with base registers.
+  // Note: this could be extended to support FI operands.
+  if (!BaseOp1.isReg() || !BaseOp2.isReg())
+    return false;
+
+  if (BaseOp1.isIdenticalTo(BaseOp2))
     return true;
 
   if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
@@ -402,12 +413,13 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
   return Base1 == Base2;
 }
 
-bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
-                                      unsigned BaseReg1,
-                                      MachineInstr &SecondLdSt,
-                                      unsigned BaseReg2,
+bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
+                                      MachineOperand &BaseOp2,
                                       unsigned NumLoads) const {
-  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
+  MachineInstr &FirstLdSt = *BaseOp1.getParent();
+  MachineInstr &SecondLdSt = *BaseOp2.getParent();
+
+  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
     return false;
 
   const MachineOperand *FirstDst = nullptr;
@@ -864,7 +876,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  DebugLoc DL = MBB.findDebugLoc(MI);
+  const DebugLoc &DL = MBB.findDebugLoc(MI);
 
   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
@@ -963,9 +975,9 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
-  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  DebugLoc DL = MBB.findDebugLoc(MI);
+  const DebugLoc &DL = MBB.findDebugLoc(MI);
   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
   unsigned SpillSize = TRI->getSpillSize(*RC);
@@ -977,6 +989,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
 
   if (RI.isSGPRClass(RC)) {
+    MFI->setHasSpilledSGPRs();
+
     // FIXME: Maybe this should not include a memoperand because it will be
     // lowered to non-memory instructions.
     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
@@ -1018,7 +1032,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-  DebugLoc DL = MBB.findDebugLoc(MI);
+  const DebugLoc &DL = MBB.findDebugLoc(MI);
   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
   unsigned WavefrontSize = ST.getWavefrontSize();
 
@@ -1026,7 +1040,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
   if (!MFI->hasCalculatedTID()) {
     MachineBasicBlock &Entry = MBB.getParent()->front();
     MachineBasicBlock::iterator Insert = Entry.front();
-    DebugLoc DL = Insert->getDebugLoc();
+    const DebugLoc &DL = Insert->getDebugLoc();
 
     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                    *MF);
@@ -1632,7 +1646,34 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-  if (I == MBB.end())
+  auto E = MBB.end();
+  if (I == E)
+    return false;
+
+  // Skip over the instructions that are artificially terminators for special
+  // exec management.
+  while (I != E && !I->isBranch() && !I->isReturn() &&
+         I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
+    switch (I->getOpcode()) {
+    case AMDGPU::SI_MASK_BRANCH:
+    case AMDGPU::S_MOV_B64_term:
+    case AMDGPU::S_XOR_B64_term:
+    case AMDGPU::S_ANDN2_B64_term:
+      break;
+    case AMDGPU::SI_IF:
+    case AMDGPU::SI_ELSE:
+    case AMDGPU::SI_KILL_I1_TERMINATOR:
+    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+      // FIXME: It's messy that these need to be considered here at all.
+      return true;
+    default:
+      llvm_unreachable("unexpected non-branch terminator inst");
+    }
+
+    ++I;
+  }
+
+  if (I == E)
     return false;
 
   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
@@ -2133,11 +2174,13 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
 
 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
                                                MachineInstr &MIb) const {
-  unsigned BaseReg0, BaseReg1;
+  MachineOperand *BaseOp0, *BaseOp1;
   int64_t Offset0, Offset1;
 
-  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
-      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
+  if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
+      getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
+    if (!BaseOp0->isIdenticalTo(*BaseOp1))
+      return false;
 
     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
       // FIXME: Handle ds_read2 / ds_write2.
@@ -2145,8 +2188,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
     }
     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
-    if (BaseReg0 == BaseReg1 &&
-        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
+    if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
       return true;
     }
   }
@@ -2584,6 +2626,10 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
   if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
     return false;
 
+  // Can it be shrunk to a valid 32 bit opcode?
+  if (!hasVALU32BitEncoding(MI.getOpcode()))
+    return false;
+
   // Check output modifiers
   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
          !hasModifiersSet(MI, AMDGPU::OpName::clamp);
@@ -3121,6 +3167,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+  case AMDGPU::S_XNOR_B32:
+    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
@@ -3558,8 +3606,13 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
   // pointer value is uniform.
   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
-      SBase->setReg(SGPR);
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
+  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
+    SOff->setReg(SGPR);
   }
 }
 
@@ -4088,22 +4141,50 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       // Default handling
       break;
     case AMDGPU::S_AND_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_OR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_XOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NAND_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_XNOR_B64:
+      if (ST.hasDLInsts())
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+      else
+        splitScalar64BitXnor(Worklist, Inst, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ANDN2_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ORN2_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_NOT_B64:
-      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
       Inst.eraseFromParent();
       continue;
 
@@ -4184,120 +4265,26 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       Inst.eraseFromParent();
       continue;
 
-    case AMDGPU::S_XNOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+    case AMDGPU::S_NAND_B32:
+      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
       Inst.eraseFromParent();
       continue;
 
-    case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
-      unsigned VDst;
-      unsigned NewOpcode;
-
-      switch(Opcode) {
-      case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-      case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
-        splitScalarBuffer(Worklist, Inst);
-        Inst.eraseFromParent();
-        continue;
-      }
-
-      const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
-      auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
-      unsigned Offset = 0;
-
-      // FIXME: This isn't safe because the addressing mode doesn't work
-      // correctly if vaddr is negative.
-      //
-      // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
-      //
-      // See if we can extract an immediate offset by recognizing one of these:
-      //   V_ADD_I32_e32 dst, imm, src1
-      //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
-      // V_ADD will be removed by "Remove dead machine instructions".
-      if (Add &&
-          (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
-           Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
-           Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
-        static const unsigned SrcNames[2] = {
-          AMDGPU::OpName::src0,
-          AMDGPU::OpName::src1,
-        };
-
-        // Find a literal offset in one of source operands.
-        for (int i = 0; i < 2; i++) {
-          const MachineOperand *Src =
-            getNamedOperand(*Add, SrcNames[i]);
-
-          if (Src->isReg()) {
-            MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
-            if (Def) {
-              if (Def->isMoveImmediate())
-                Src = &Def->getOperand(1);
-              else if (Def->isCopy()) {
-                auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
-                if (Mov && Mov->isMoveImmediate()) {
-                  Src = &Mov->getOperand(1);
-                }
-              }
-            }
-          }
-
-          if (Src) {
-            if (Src->isImm())
-              Offset = Src->getImm();
-            else if (Src->isCImm())
-              Offset = Src->getCImm()->getZExtValue();
-          }
-
-          if (Offset && isLegalMUBUFImmOffset(Offset)) {
-            VAddr = getNamedOperand(*Add, SrcNames[!i]);
-            break;
-          }
-
-          Offset = 0;
-        }
-      }
+    case AMDGPU::S_NOR_B32:
+      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst.eraseFromParent();
+      continue;
 
-      MachineInstr *NewInstr =
-          BuildMI(*MBB, Inst, Inst.getDebugLoc(),
-                  get(NewOpcode), VDst)
-              .add(*VAddr)                                        // vaddr
-              .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
-              .addImm(0)                                          // soffset
-              .addImm(Offset)                                     // offset
-              .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
-              .addImm(0) // slc
-              .addImm(0) // tfe
-              .cloneMemRefs(Inst)
-              .getInstr();
-
-      MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
-                         VDst);
-      addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+    case AMDGPU::S_ANDN2_B32:
+      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
       Inst.eraseFromParent();
+      continue;
 
-      // Legalize all operands other than the offset. Notably, convert the srsrc
-      // into SGPRs using v_readfirstlane if needed.
-      legalizeOperands(*NewInstr, MDT);
+    case AMDGPU::S_ORN2_B32:
+      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst.eraseFromParent();
       continue;
     }
-    }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
       // We cannot move this instruction to the VALU, so we should try to
@@ -4471,23 +4458,116 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
   MachineOperand &Src0 = Inst.getOperand(1);
   MachineOperand &Src1 = Inst.getOperand(2);
 
-  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
-  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
-
-  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   if (ST.hasDLInsts()) {
+    unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
+    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
+
     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
       .add(Src0)
       .add(Src1);
+
+    MRI.replaceRegWith(Dest.getReg(), NewDest);
+    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   } else {
-    unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
-      .add(Src0)
+    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
+    // invert either source and then perform the XOR. If either source is a
+    // scalar register, then we can leave the inversion on the scalar unit to
+    // acheive a better distrubution of scalar and vector instructions.
+    bool Src0IsSGPR = Src0.isReg() &&
+                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
+    bool Src1IsSGPR = Src1.isReg() &&
+                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
+    MachineInstr *Not = nullptr;
+    MachineInstr *Xor = nullptr;
+    unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+    // Build a pair of scalar instructions and add them to the work list.
+    // The next iteration over the work list will lower these to the vector
+    // unit as necessary.
+    if (Src0IsSGPR) {
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+        .add(Src0);
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+      .addReg(Temp)
       .add(Src1);
+    } else if (Src1IsSGPR) {
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+        .add(Src1);
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+      .add(Src0)
+      .addReg(Temp);
+    } else {
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
+        .add(Src0)
+        .add(Src1);
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+        .addReg(Temp);
+      Worklist.insert(Not);
+    }
+
+    MRI.replaceRegWith(Dest.getReg(), NewDest);
+
+    Worklist.insert(Xor);
 
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
-      .addReg(Xor);
+    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   }
+}
+
+void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+                                      MachineInstr &Inst,
+                                      unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
+    .add(Src0)
+    .add(Src1);
+
+  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+    .addReg(Interm);
+
+  Worklist.insert(&Op);
+  Worklist.insert(&Not);
+
+  MRI.replaceRegWith(Dest.getReg(), NewDest);
+  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+                                     MachineInstr &Inst,
+                                     unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
+    .add(Src1);
+
+  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
+    .add(Src0)
+    .addReg(Interm);
+
+  Worklist.insert(&Not);
+  Worklist.insert(&Op);
 
   MRI.replaceRegWith(Dest.getReg(), NewDest);
   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
@@ -4520,13 +4600,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
 
   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
+  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
 
   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                        AMDGPU::sub1, Src0SubRC);
 
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
 
   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -4537,6 +4617,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
+  Worklist.insert(&LoHalf);
+  Worklist.insert(&HiHalf);
+
   // We don't need to legalizeOperands here because for a single operand, src0
   // will support any kind of input.
 
@@ -4642,6 +4725,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                                        AMDGPU::sub0, Src0SubRC);
   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                        AMDGPU::sub0, Src1SubRC);
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub1, Src1SubRC);
 
   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
@@ -4652,11 +4739,6 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                               .add(SrcReg0Sub0)
                               .add(SrcReg1Sub0);
 
-  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
-                                                       AMDGPU::sub1, Src0SubRC);
-  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
-                                                       AMDGPU::sub1, Src1SubRC);
-
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                               .add(SrcReg0Sub1)
@@ -4671,22 +4753,62 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
-  // Try to legalize the operands in case we need to swap the order to keep it
-  // valid.
-  legalizeOperands(LoHalf, MDT);
-  legalizeOperands(HiHalf, MDT);
+  Worklist.insert(&LoHalf);
+  Worklist.insert(&HiHalf);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
+void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
+                                       MachineInstr &Inst,
+                                       MachineDominatorTree *MDT) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineBasicBlock::iterator MII = Inst;
+
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  MachineOperand* Op0;
+  MachineOperand* Op1;
+
+  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
+    Op0 = &Src0;
+    Op1 = &Src1;
+  } else {
+    Op0 = &Src1;
+    Op1 = &Src0;
+  }
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
+    .add(*Op0);
+
+  unsigned NewDest = MRI.createVirtualRegister(DestRC);
+
+  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
+    .addReg(Interm)
+    .add(*Op1);
+
+  MRI.replaceRegWith(Dest.getReg(), NewDest);
+
+  Worklist.insert(&Xor);
+}
+
 void SIInstrInfo::splitScalar64BitBCNT(
     SetVectorType &Worklist, MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
   MachineBasicBlock::iterator MII = Inst;
-  DebugLoc DL = Inst.getDebugLoc();
+  const DebugLoc &DL = Inst.getDebugLoc();
 
   MachineOperand &Dest = Inst.getOperand(0);
   MachineOperand &Src = Inst.getOperand(1);
@@ -4722,7 +4844,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   MachineBasicBlock::iterator MII = Inst;
-  DebugLoc DL = Inst.getDebugLoc();
+  const DebugLoc &DL = Inst.getDebugLoc();
 
   MachineOperand &Dest = Inst.getOperand(0);
   uint32_t Imm = Inst.getOperand(2).getImm();
@@ -4778,73 +4900,6 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
-                                    MachineInstr &Inst) const {
-  MachineBasicBlock &MBB = *Inst.getParent();
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
-  MachineBasicBlock::iterator MII = Inst;
-  auto &DL = Inst.getDebugLoc();
-
-  MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
-  MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
-  MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
-  MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
-
-  unsigned Opcode = Inst.getOpcode();
-  unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
-  unsigned Count = 0;
-  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
-  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
-
-  switch(Opcode) {
-  default:
-    return;
-  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-    Count = 2;
-    break;
-  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
-    Count = 4;
-    break;
-  }
-
-  // FIXME: Should also attempt to build VAddr and Offset like the non-split
-  // case (see call site for this function)
-
-  // Create a vector of result registers
-  SmallVector<unsigned, 8> ResultRegs;
-  for (unsigned i = 0; i < Count ; ++i) {
-    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
-    MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
-      .addReg(Offset.getReg())  // offset
-      .addReg(Rsrc.getReg())    // rsrc
-      .addImm(0)                // soffset
-      .addImm(i << 4)           // inst_offset
-      .addImm(Glc.getImm())     // glc
-      .addImm(0)                // slc
-      .addImm(0)                // tfe
-      .addMemOperand(*Inst.memoperands_begin());
-    // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
-    auto &NewDestOp = NewMI.getOperand(0);
-    for (unsigned i = 0 ; i < 4 ; i++)
-      ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
-                                              RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
-  }
-  // Create a new combined result to replace original with
-  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
-  MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
-                                  get(TargetOpcode::REG_SEQUENCE), FullDestReg);
-
-  for (unsigned i = 0 ; i < Count * 4 ; ++i) {
-    CombinedResBuilder
-      .addReg(ResultRegs[i])
-      .addImm(RI.getSubRegFromChannel(i));
-  }
-
-  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
-  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
-}
-
 void SIInstrInfo::addUsersToMoveToVALUWorklist(
   unsigned DstReg,
   MachineRegisterInfo &MRI,
@@ -4934,10 +4989,10 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(
        make_range(MachineBasicBlock::iterator(SCCDefInst),
                       SCCDefInst.getParent()->end())) {
     // Exit if we find another SCC def.
-    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
+    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
       return;
 
-    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
+    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
       Worklist.insert(&MI);
   }
 }
@@ -5455,3 +5510,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
 
   return MCOp;
 }
+
+static
+TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
+  assert(RegOpnd.isReg());
+  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
+                             getRegSubRegPair(RegOpnd);
+}
+
+TargetInstrInfo::RegSubRegPair
+llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
+  assert(MI.isRegSequence());
+  for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
+    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
+      auto &RegOp = MI.getOperand(1 + 2 * I);
+      return getRegOrUndef(RegOp);
+    }
+  return TargetInstrInfo::RegSubRegPair();
+}
+
+// Try to find the definition of reg:subreg in subreg-manipulation pseudos
+// Following a subreg of reg:subreg isn't supported
+static bool followSubRegDef(MachineInstr &MI,
+                            TargetInstrInfo::RegSubRegPair &RSR) {
+  if (!RSR.SubReg)
+    return false;
+  switch (MI.getOpcode()) {
+  default: break;
+  case AMDGPU::REG_SEQUENCE:
+    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
+    return true;
+  // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg
+  case AMDGPU::INSERT_SUBREG:
+    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
+      // inserted the subreg we're looking for
+      RSR = getRegOrUndef(MI.getOperand(2));
+    else { // the subreg in the rest of the reg
+      auto R1 = getRegOrUndef(MI.getOperand(1));
+      if (R1.SubReg) // subreg of subreg isn't supported
+        return false;
+      RSR.Reg = R1.Reg;
+    }
+    return true;
+  }
+  return false;
+}
+
+MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+                                     MachineRegisterInfo &MRI) {
+  assert(MRI.isSSA());
+  if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
+    return nullptr;
+
+  auto RSR = P;
+  auto *DefInst = MRI.getVRegDef(RSR.Reg);
+  while (auto *MI = DefInst) {
+    DefInst = nullptr;
+    switch (MI->getOpcode()) {
+    case AMDGPU::COPY:
+    case AMDGPU::V_MOV_B32_e32: {
+      auto &Op1 = MI->getOperand(1);
+      if (Op1.isReg() &&
+        TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
+        if (Op1.isUndef())
+          return nullptr;
+        RSR = getRegSubRegPair(Op1);
+        DefInst = MRI.getVRegDef(RSR.Reg);
+      }
+      break;
+    }
+    default:
+      if (followSubRegDef(*MI, RSR)) {
+        if (!RSR.Reg)
+          return nullptr;
+        DefInst = MRI.getVRegDef(RSR.Reg);
+      }
+    }
+    if (!DefInst)
+      return MI;
+  }
+  return nullptr;
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 2f51b19995035e4f063fc1325396949ff6d370eb..5b1a05f3785ec722553790ae753668bee63b8f31 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -89,6 +89,14 @@ private:
   void lowerScalarXnor(SetVectorType &Worklist,
                        MachineInstr &Inst) const;
 
+  void splitScalarNotBinop(SetVectorType &Worklist,
+                           MachineInstr &Inst,
+                           unsigned Opcode) const;
+
+  void splitScalarBinOpN2(SetVectorType &Worklist,
+                          MachineInstr &Inst,
+                          unsigned Opcode) const;
+
   void splitScalar64BitUnaryOp(SetVectorType &Worklist,
                                MachineInstr &Inst, unsigned Opcode) const;
 
@@ -99,12 +107,13 @@ private:
                                 unsigned Opcode,
                                 MachineDominatorTree *MDT = nullptr) const;
 
+  void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst,
+                                MachineDominatorTree *MDT = nullptr) const;
+
   void splitScalar64BitBCNT(SetVectorType &Worklist,
                             MachineInstr &Inst) const;
   void splitScalar64BitBFE(SetVectorType &Worklist,
                            MachineInstr &Inst) const;
-  void splitScalarBuffer(SetVectorType &Worklist,
-                         MachineInstr &Inst) const;
   void movePackToVALU(SetVectorType &Worklist,
                       MachineRegisterInfo &MRI,
                       MachineInstr &Inst) const;
@@ -164,12 +173,11 @@ public:
                                int64_t &Offset1,
                                int64_t &Offset2) const override;
 
-  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
-                             int64_t &Offset,
-                             const TargetRegisterInfo *TRI) const final;
+  bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+                               int64_t &Offset,
+                               const TargetRegisterInfo *TRI) const final;
 
-  bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
-                           MachineInstr &SecondLdSt, unsigned BaseReg2,
+  bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2,
                            unsigned NumLoads) const override;
 
   bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
@@ -596,6 +604,14 @@ public:
       return MI.getDesc().TSFlags & ClampFlags;
   }
 
+  static bool usesFPDPRounding(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
+  bool usesFPDPRounding(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
@@ -910,9 +926,36 @@ public:
   /// Return -1 if the target-specific opcode for the pseudo instruction does
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;
-
 };
 
+/// \brief Returns true if a reg:subreg pair P has a TRC class
+inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P,
+                         const TargetRegisterClass &TRC,
+                         MachineRegisterInfo &MRI) {
+  auto *RC = MRI.getRegClass(P.Reg);
+  if (!P.SubReg)
+    return RC == &TRC;
+  auto *TRI = MRI.getTargetRegisterInfo();
+  return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg);
+}
+
+/// \brief Create RegSubRegPair from a register MachineOperand
+inline
+TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) {
+  assert(O.isReg());
+  return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg());
+}
+
+/// \brief Return the SubReg component from REG_SEQUENCE
+TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
+                                                    unsigned SubReg);
+
+/// \brief Return the defining instruction for a given reg:subreg pair
+/// skipping copy like instructions and subreg-manipulation pseudos.
+/// Following another subreg of a reg:subreg isn't supported.
+MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+                               MachineRegisterInfo &MRI);
+
 namespace AMDGPU {
 
   LLVM_READONLY
@@ -924,6 +967,9 @@ namespace AMDGPU {
   LLVM_READONLY
   int getSDWAOp(uint16_t Opcode);
 
+  LLVM_READONLY
+  int getDPPOp32(uint16_t Opcode);
+
   LLVM_READONLY
   int getBasicFromSDWAOp(uint16_t Opcode);
 
@@ -954,6 +1000,9 @@ namespace AMDGPU {
   LLVM_READONLY
   int getSOPKOp(uint16_t Opcode);
 
+  LLVM_READONLY
+  int getGlobalSaddrOp(uint16_t Opcode);
+
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 0859989b03963018ff2f20c4a4beda01cc94d910..13afa4d4974bf18f1b592f51e0e61a452c63ce1a 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1622,7 +1622,7 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
                     0, // 64-bit dst - No DPP or SDWA for 64-bit operands
                     !if(!eq(Src0VT.Size, 64),
                         0, // 64-bit src0
-                        !if(!eq(Src0VT.Size, 64),
+                        !if(!eq(Src1VT.Size, 64),
                             0, // 64-bit src2
                             1
                         )
@@ -1631,6 +1631,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
             );
 }
 
+class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+                 ValueType Src1VT = i32> {
+  bit ret = !if(!eq(NumSrcArgs, 0), 0,
+                getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
 class BitOr<bit a, bit b> {
   bit ret = !if(a, 1, !if(b, 1, 0));
 }
@@ -1710,7 +1716,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
   field bit HasSDWAOMod = isFloatType<DstVT>.ret;
 
   field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
-  field bit HasExtDPP = HasExt;
+  field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
   field bit HasExtSDWA = HasExt;
   field bit HasExtSDWA9 = HasExt;
   field int NeedPatGen = PatGenMode.NoPattern;
@@ -1741,8 +1747,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                            getOpSelMod<Src0VT>.ret,
                                            getOpSelMod<Src1VT>.ret,
                                            getOpSelMod<Src2VT>.ret>.ret;
-  field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
-                               HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
+  field dag InsDPP = !if(HasExtDPP,
+                         getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
+                                   HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
+                         (ins));
   field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
                                  HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
                                  DstVT>.ret;
@@ -1756,7 +1764,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                               HasSrc0FloatMods,
                                               HasSrc1FloatMods,
                                               HasSrc2FloatMods>.ret;
-  field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string AsmDPP = !if(HasExtDPP,
+                            getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
   field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
   field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
 }
@@ -1931,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping {
   let ValueCols = [["Default"]];
 }
 
+// Maps ordinary instructions to their DPP counterparts
+def getDPPOp32 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["DPP"]];
+}
+
 // Maps an commuted opcode to its original version
 def getCommuteOrig : InstrMapping {
   let FilterClass = "Commutable_REV";
@@ -2017,6 +2035,15 @@ def getAtomicNoRetOp : InstrMapping {
   let ValueCols = [["0"]];
 }
 
+// Maps a GLOBAL to its SADDR form.
+def getGlobalSaddrOp : InstrMapping {
+  let FilterClass = "GlobalSaddrTable";
+  let RowFields = ["SaddrOp"];
+  let ColFields = ["IsSaddr"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
 include "SIInstructions.td"
 
 include "DSInstructions.td"
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 9714203d3d7053d7d0b3aa838d02e1e1cef47c13..5f78a10727503525fd54b3f06fb0c6a1486da9b1 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -247,7 +247,7 @@ def SI_LOOP : CFPseudoInstSI <
   (outs), (ins SReg_64:$saved, brtarget:$target),
   [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
   let Size = 8;
-  let isBranch = 0;
+  let isBranch = 1;
   let hasSideEffects = 1;
 }
 
@@ -307,6 +307,7 @@ def SI_ILLEGAL_COPY : SPseudoInstSI <
 def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
   let isTerminator = 1;
   let usesCustomInserter = 1;
+  let isBranch = 1;
 }
 
 def SI_PS_LIVE : PseudoInstSI <
@@ -579,7 +580,8 @@ def : Pat <
   (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))),
   (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
-// TODO: we could add more variants for other types of conditionals
+
+  // TODO: we could add more variants for other types of conditionals
 
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
@@ -1535,15 +1537,15 @@ def : GCNPat <
 } // End SubtargetPredicate = HasVOP3PInsts
 
 
-// def : GCNPat <
-//   (v2f16 (scalar_to_vector f16:$src0)),
-//   (COPY $src0)
-// >;
+def : GCNPat <
+  (v2f16 (scalar_to_vector f16:$src0)),
+  (COPY $src0)
+>;
 
-// def : GCNPat <
-//   (v2i16 (scalar_to_vector i16:$src0)),
-//   (COPY $src0)
-// >;
+def : GCNPat <
+  (v2i16 (scalar_to_vector i16:$src0)),
+  (COPY $src0)
+>;
 
 def : GCNPat <
   (v4i16 (scalar_to_vector i16:$src0)),
@@ -1621,8 +1623,8 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
 defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
 defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
 
-def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
-def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
+defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
 
 }
 
@@ -1649,20 +1651,33 @@ class FP16Med3Pat<ValueType vt,
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
 >;
 
-class Int16Med3Pat<Instruction med3Inst,
+multiclass Int16Med3Pat<Instruction med3Inst,
+                   SDPatternOperator min,
                    SDPatternOperator max,
                    SDPatternOperator max_oneuse,
                    SDPatternOperator min_oneuse,
-                   ValueType vt = i32> : GCNPat<
+                   ValueType vt = i16> {
+  // This matches 16 permutations of
+  // max(min(x, y), min(max(x, y), z))
+  def : GCNPat <
   (max (min_oneuse vt:$src0, vt:$src1),
        (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
   (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
 >;
 
+  // This matches 16 permutations of
+  // min(max(a, b), max(min(a, b), c))
+  def : GCNPat <
+  (min (max_oneuse vt:$src0, vt:$src1),
+      (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)),
+  (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
+>;
+}
+
 def : FPMed3Pat<f32, V_MED3_F32>;
 
 let OtherPredicates = [isGFX9] in {
 def : FP16Med3Pat<f16, V_MED3_F16>;
-def : Int16Med3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
-def : Int16Med3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
+defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
 } // End Predicates = [isGFX9]
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index 7b7cf1635050bdf18f7129593d8c52b684b93132..e51ff4b4bc50efd42aa9eb5a9e99a4b69a7d39ee 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -16,36 +16,4 @@
 let TargetPrefix = "SI", isTarget = 1 in {
   def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
-  def int_SI_tbuffer_store : Intrinsic <
-    [],
-    [llvm_anyint_ty, // rsrc(SGPR)
-     llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
-     llvm_i32_ty,    // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
-     llvm_i32_ty,    // vaddr(VGPR)
-     llvm_i32_ty,    // soffset(SGPR)
-     llvm_i32_ty,    // inst_offset(imm)
-     llvm_i32_ty,    // dfmt(imm)
-     llvm_i32_ty,    // nfmt(imm)
-     llvm_i32_ty,    // offen(imm)
-     llvm_i32_ty,    // idxen(imm)
-     llvm_i32_ty,    // glc(imm)
-     llvm_i32_ty,    // slc(imm)
-     llvm_i32_ty],   // tfe(imm)
-    []>;
-
-  // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed
-  def int_SI_buffer_load_dword : Intrinsic <
-    [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
-    [llvm_anyint_ty,  // rsrc(SGPR)
-     llvm_anyint_ty,  // vaddr(VGPR)
-     llvm_i32_ty,     // soffset(SGPR)
-     llvm_i32_ty,     // inst_offset(imm)
-     llvm_i32_ty,     // offen(imm)
-     llvm_i32_ty,     // idxen(imm)
-     llvm_i32_ty,     // glc(imm)
-     llvm_i32_ty,     // slc(imm)
-     llvm_i32_ty],    // tfe(imm)
-    [IntrReadMem, IntrArgMemOnly]>;
-
 } // End TargetPrefix = "SI", isTarget = 1
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index e379e98623a2000b36c2b1ed88280942cd253a2c..52bbe5c03450b0137d8d8fab666f0abc3b13bcaf 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -20,6 +20,26 @@
 // ==>
 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
 //
+// This pass also tries to promote constant offset to the immediate by
+// adjusting the base. It tries to use a base from the nearby instructions that
+// allows it to have a 13bit constant offset and then promotes the 13bit offset
+// to the immediate.
+// E.g.
+//  s_movk_i32 s0, 0x1800
+//  v_add_co_u32_e32 v0, vcc, s0, v2
+//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+//
+//  s_movk_i32 s0, 0x1000
+//  v_add_co_u32_e32 v5, vcc, s0, v2
+//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//  global_load_dwordx2 v[5:6], v[5:6], off
+//  global_load_dwordx2 v[0:1], v[0:1], off
+// =>
+//  s_movk_i32 s0, 0x1000
+//  v_add_co_u32_e32 v5, vcc, s0, v2
+//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//  global_load_dwordx2 v[5:6], v[5:6], off
+//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
 //
 // Future improvements:
 //
@@ -43,9 +63,9 @@
 
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
@@ -74,23 +94,38 @@ using namespace llvm;
 #define DEBUG_TYPE "si-load-store-opt"
 
 namespace {
+enum InstClassEnum {
+  UNKNOWN,
+  DS_READ,
+  DS_WRITE,
+  S_BUFFER_LOAD_IMM,
+  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
+  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
+  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
+  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
+  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
+  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
+};
 
-class SILoadStoreOptimizer : public MachineFunctionPass {
-  enum InstClassEnum {
-    DS_READ_WRITE,
-    S_BUFFER_LOAD_IMM,
-    BUFFER_LOAD_OFFEN,
-    BUFFER_LOAD_OFFSET,
-    BUFFER_STORE_OFFEN,
-    BUFFER_STORE_OFFSET,
-  };
+enum RegisterEnum {
+  SBASE = 0x1,
+  SRSRC = 0x2,
+  SOFFSET = 0x4,
+  VADDR = 0x8,
+  ADDR = 0x10,
+};
 
+class SILoadStoreOptimizer : public MachineFunctionPass {
   struct CombineInfo {
     MachineBasicBlock::iterator I;
     MachineBasicBlock::iterator Paired;
     unsigned EltSize;
     unsigned Offset0;
     unsigned Offset1;
+    unsigned Width0;
+    unsigned Width1;
     unsigned BaseOff;
     InstClassEnum InstClass;
     bool GLC0;
@@ -98,9 +133,23 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     bool SLC0;
     bool SLC1;
     bool UseST64;
-    bool IsX2;
-    SmallVector<MachineInstr*, 8> InstsToMove;
-   };
+    SmallVector<MachineInstr *, 8> InstsToMove;
+  };
+
+  struct BaseRegisters {
+    unsigned LoReg = 0;
+    unsigned HiReg = 0;
+
+    unsigned LoSubReg = 0;
+    unsigned HiSubReg = 0;
+  };
+
+  struct MemAddress {
+    BaseRegisters Base;
+    int64_t Offset = 0;
+  };
+
+  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
 
 private:
   const GCNSubtarget *STM = nullptr;
@@ -108,9 +157,16 @@ private:
   const SIRegisterInfo *TRI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   AliasAnalysis *AA = nullptr;
-  unsigned CreatedX2;
+  bool OptimizeAgain;
 
   static bool offsetsCanBeCombined(CombineInfo &CI);
+  static bool widthsFit(const CombineInfo &CI);
+  static unsigned getNewOpcode(const CombineInfo &CI);
+  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
+  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
+  unsigned getOpcodeWidth(const MachineInstr &MI);
+  InstClassEnum getInstClass(unsigned Opc);
+  unsigned getRegs(unsigned Opc);
 
   bool findMatchingInst(CombineInfo &CI);
 
@@ -123,10 +179,21 @@ private:
   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
-  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
-                                    bool &IsOffen) const;
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 
+  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
+                           int32_t NewOffset);
+  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
+  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
+  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
+  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+  /// Promotes constant offset to the immediate by adjusting the base. It
+  /// tries to use a base from the nearby instructions that allows it to have
+  /// a 13bit constant offset which gets promoted to the immediate.
+  bool promoteConstantOffsetToImm(MachineInstr &CI,
+                                  MemInfoMap &Visited,
+                                  SmallPtrSet<MachineInstr *, 4> &Promoted);
+
 public:
   static char ID;
 
@@ -153,8 +220,8 @@ public:
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                       "SI Load Store Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
-                    "SI Load Store Optimizer", false, false)
+INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
+                    false, false)
 
 char SILoadStoreOptimizer::ID = 0;
 
@@ -165,7 +232,7 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() {
 }
 
 static void moveInstsAfter(MachineBasicBlock::iterator I,
-                           ArrayRef<MachineInstr*> InstsToMove) {
+                           ArrayRef<MachineInstr *> InstsToMove) {
   MachineBasicBlock *MBB = I->getParent();
   ++I;
   for (MachineInstr *MI : InstsToMove) {
@@ -191,21 +258,19 @@ static void addDefsUsesToList(const MachineInstr &MI,
 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                       MachineBasicBlock::iterator B,
                                       const SIInstrInfo *TII,
-                                      AliasAnalysis * AA) {
+                                      AliasAnalysis *AA) {
   // RAW or WAR - cannot reorder
   // WAW - cannot reorder
   // RAR - safe to reorder
   return !(A->mayStore() || B->mayStore()) ||
-    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
 }
 
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
-static bool
-addToListsIfDependent(MachineInstr &MI,
-                      DenseSet<unsigned> &RegDefs,
-                      DenseSet<unsigned> &PhysRegUses,
-                      SmallVectorImpl<MachineInstr*> &Insts) {
+static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
+                                  DenseSet<unsigned> &PhysRegUses,
+                                  SmallVectorImpl<MachineInstr *> &Insts) {
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
@@ -228,18 +293,16 @@ addToListsIfDependent(MachineInstr &MI,
   return false;
 }
 
-static bool
-canMoveInstsAcrossMemOp(MachineInstr &MemOp,
-                        ArrayRef<MachineInstr*> InstsToMove,
-                        const SIInstrInfo *TII,
-                        AliasAnalysis *AA) {
+static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+                                    ArrayRef<MachineInstr *> InstsToMove,
+                                    const SIInstrInfo *TII, AliasAnalysis *AA) {
   assert(MemOp.mayLoadOrStore());
 
   for (MachineInstr *InstToMove : InstsToMove) {
     if (!InstToMove->mayLoadOrStore())
       continue;
     if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
-        return false;
+      return false;
   }
   return true;
 }
@@ -260,10 +323,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   CI.BaseOff = 0;
 
   // Handle SMEM and VMEM instructions.
-  if (CI.InstClass != DS_READ_WRITE) {
-    unsigned Diff = CI.IsX2 ? 2 : 1;
-    return (EltOffset0 + Diff == EltOffset1 ||
-            EltOffset1 + Diff == EltOffset0) &&
+  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
+    return (EltOffset0 + CI.Width0 == EltOffset1 ||
+            EltOffset1 + CI.Width1 == EltOffset0) &&
            CI.GLC0 == CI.GLC1 &&
            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
   }
@@ -305,42 +367,175 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   return false;
 }
 
+bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
+  const unsigned Width = (CI.Width0 + CI.Width1);
+  switch (CI.InstClass) {
+  default:
+    return Width <= 4;
+  case S_BUFFER_LOAD_IMM:
+    switch (Width) {
+    default:
+      return false;
+    case 2:
+    case 4:
+      return true;
+    }
+  }
+}
+
+unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+
+  if (TII->isMUBUF(MI)) {
+    return AMDGPU::getMUBUFDwords(Opc);
+  }
+
+  switch (Opc) {
+  default:
+    return 0;
+  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+    return 1;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+    return 2;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+    return 4;
+  }
+}
+
+InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
+  if (TII->isMUBUF(Opc)) {
+    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
+
+    // If we couldn't identify the opcode, bail out.
+    if (baseOpcode == -1) {
+      return UNKNOWN;
+    }
+
+    switch (baseOpcode) {
+    default:
+      return UNKNOWN;
+    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+      return BUFFER_LOAD_OFFEN;
+    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
+      return BUFFER_LOAD_OFFSET;
+    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+      return BUFFER_STORE_OFFEN;
+    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+      return BUFFER_STORE_OFFSET;
+    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+      return BUFFER_LOAD_OFFEN_exact;
+    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+      return BUFFER_LOAD_OFFSET_exact;
+    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+      return BUFFER_STORE_OFFEN_exact;
+    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+      return BUFFER_STORE_OFFSET_exact;
+    }
+  }
+
+  switch (Opc) {
+  default:
+    return UNKNOWN;
+  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+    return S_BUFFER_LOAD_IMM;
+  case AMDGPU::DS_READ_B32:
+  case AMDGPU::DS_READ_B64:
+  case AMDGPU::DS_READ_B32_gfx9:
+  case AMDGPU::DS_READ_B64_gfx9:
+    return DS_READ;
+  case AMDGPU::DS_WRITE_B32:
+  case AMDGPU::DS_WRITE_B64:
+  case AMDGPU::DS_WRITE_B32_gfx9:
+  case AMDGPU::DS_WRITE_B64_gfx9:
+    return DS_WRITE;
+  }
+}
+
+unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
+  if (TII->isMUBUF(Opc)) {
+    unsigned result = 0;
+
+    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
+      result |= VADDR;
+    }
+
+    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
+      result |= SRSRC;
+    }
+
+    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
+      result |= SOFFSET;
+    }
+
+    return result;
+  }
+
+  switch (Opc) {
+  default:
+    return 0;
+  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+    return SBASE;
+  case AMDGPU::DS_READ_B32:
+  case AMDGPU::DS_READ_B64:
+  case AMDGPU::DS_READ_B32_gfx9:
+  case AMDGPU::DS_READ_B64_gfx9:
+  case AMDGPU::DS_WRITE_B32:
+  case AMDGPU::DS_WRITE_B64:
+  case AMDGPU::DS_WRITE_B32_gfx9:
+  case AMDGPU::DS_WRITE_B64_gfx9:
+    return ADDR;
+  }
+}
+
 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   MachineBasicBlock::iterator E = MBB->end();
   MachineBasicBlock::iterator MBBI = CI.I;
 
-  unsigned AddrOpName[3] = {0};
-  int AddrIdx[3];
-  const MachineOperand *AddrReg[3];
+  const unsigned Opc = CI.I->getOpcode();
+  const InstClassEnum InstClass = getInstClass(Opc);
+
+  if (InstClass == UNKNOWN) {
+    return false;
+  }
+
+  const unsigned Regs = getRegs(Opc);
+
+  unsigned AddrOpName[5] = {0};
+  int AddrIdx[5];
+  const MachineOperand *AddrReg[5];
   unsigned NumAddresses = 0;
 
-  switch (CI.InstClass) {
-  case DS_READ_WRITE:
+  if (Regs & ADDR) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
-    break;
-  case S_BUFFER_LOAD_IMM:
+  }
+
+  if (Regs & SBASE) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
-    break;
-  case BUFFER_LOAD_OFFEN:
-  case BUFFER_STORE_OFFEN:
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
-    break;
-  case BUFFER_LOAD_OFFSET:
-  case BUFFER_STORE_OFFSET:
+  }
+
+  if (Regs & SRSRC) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+  }
+
+  if (Regs & SOFFSET) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
-    break;
+  }
+
+  if (Regs & VADDR) {
+    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
   }
 
   for (unsigned i = 0; i < NumAddresses; i++) {
     AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
     AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
 
-    // We only ever merge operations with the same base address register, so don't
-    // bother scanning forward if there are no other uses.
+    // We only ever merge operations with the same base address register, so
+    // don't bother scanning forward if there are no other uses.
     if (AddrReg[i]->isReg() &&
         (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
          MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
@@ -353,8 +548,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   DenseSet<unsigned> PhysRegUsesToMove;
   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
-  for ( ; MBBI != E; ++MBBI) {
-    if (MBBI->getOpcode() != CI.I->getOpcode()) {
+  for (; MBBI != E; ++MBBI) {
+    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
+
+    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
+        (IsDS && (MBBI->getOpcode() != Opc))) {
       // This is not a matching DS instruction, but we can keep looking as
       // long as one of these conditions are met:
       // 1. It is safe to move I down past MBBI.
@@ -368,8 +566,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       }
 
       if (MBBI->mayLoadOrStore() &&
-        (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
         // We fail condition #1, but we may still be able to satisfy condition
         // #2.  Add this instruction to the move list and then we will check
         // if condition #2 holds once we have selected the matching instruction.
@@ -413,8 +611,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
         continue;
       }
 
-      // Check same base pointer. Be careful of subregisters, which can occur with
-      // vectors of pointers.
+      // Check same base pointer. Be careful of subregisters, which can occur
+      // with vectors of pointers.
       if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
           AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
         Match = false;
@@ -423,13 +621,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     }
 
     if (Match) {
-      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
-                                                 AMDGPU::OpName::offset);
+      int OffsetIdx =
+          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+      CI.Width0 = getOpcodeWidth(*CI.I);
       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
+      CI.Width1 = getOpcodeWidth(*MBBI);
       CI.Paired = MBBI;
 
-      if (CI.InstClass == DS_READ_WRITE) {
+      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
         CI.Offset0 &= 0xffff;
         CI.Offset1 &= 0xffff;
       } else {
@@ -445,7 +645,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       // We also need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
       // instruction.
-      if (offsetsCanBeCombined(CI))
+      if (widthsFit(CI) && offsetsCanBeCombined(CI))
         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
           return true;
     }
@@ -472,12 +672,12 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
   if (STM->ldsRequiresM0Init())
     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
 
-  return (EltSize == 4) ?
-    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
+  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
+                        : AMDGPU::DS_READ2ST64_B64_gfx9;
 }
 
-MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
-  CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -489,8 +689,8 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
 
   unsigned NewOffset0 = CI.Offset0;
   unsigned NewOffset1 = CI.Offset1;
-  unsigned Opc = CI.UseST64 ?
-    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
+  unsigned Opc =
+      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
 
   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
@@ -502,13 +702,12 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
   }
 
   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
-         (NewOffset0 != NewOffset1) &&
-         "Computed offset doesn't fit");
+         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
 
   const MCInstrDesc &Read2Desc = TII->get(Opc);
 
-  const TargetRegisterClass *SuperRC
-    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+  const TargetRegisterClass *SuperRC =
+      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
 
   DebugLoc DL = CI.I->getDebugLoc();
@@ -519,23 +718,24 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
   if (CI.BaseOff) {
     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
-      .addImm(CI.BaseOff);
+        .addImm(CI.BaseOff);
 
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
-      .addReg(ImmReg)
-      .addReg(AddrReg->getReg(), 0, BaseSubReg);
+        .addReg(ImmReg)
+        .addReg(AddrReg->getReg(), 0, BaseSubReg);
     BaseSubReg = 0;
   }
 
-  MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
-                        .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
-                        .addImm(NewOffset0)                        // offset0
-                        .addImm(NewOffset1)                        // offset1
-                        .addImm(0)                                 // gds
-                        .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+  MachineInstrBuilder Read2 =
+      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
+          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
+          .addImm(NewOffset0)                        // offset0
+          .addImm(NewOffset1)                        // offset1
+          .addImm(0)                                 // gds
+          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
   (void)Read2;
 
@@ -562,32 +762,36 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
   if (STM->ldsRequiresM0Init())
     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
-  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
+  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
+                        : AMDGPU::DS_WRITE2_B64_gfx9;
 }
 
 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
   if (STM->ldsRequiresM0Init())
-    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
+                          : AMDGPU::DS_WRITE2ST64_B64;
 
-  return (EltSize == 4) ?
-    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
+  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
+                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
-  CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   // sure we preserve the subregister index and any register flags set on them.
-  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
-  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
-  const MachineOperand *Data1
-    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
+  const MachineOperand *AddrReg =
+      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+  const MachineOperand *Data0 =
+      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+  const MachineOperand *Data1 =
+      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
 
   unsigned NewOffset0 = CI.Offset0;
   unsigned NewOffset1 = CI.Offset1;
-  unsigned Opc = CI.UseST64 ?
-    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+  unsigned Opc =
+      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
 
   if (NewOffset0 > NewOffset1) {
     // Canonicalize the merged instruction so the smaller offset comes first.
@@ -596,8 +800,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   }
 
   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
-         (NewOffset0 != NewOffset1) &&
-         "Computed offset doesn't fit");
+         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
 
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = CI.I->getDebugLoc();
@@ -608,25 +811,26 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   if (CI.BaseOff) {
     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
-      .addImm(CI.BaseOff);
+        .addImm(CI.BaseOff);
 
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
-      .addReg(ImmReg)
-      .addReg(AddrReg->getReg(), 0, BaseSubReg);
+        .addReg(ImmReg)
+        .addReg(AddrReg->getReg(), 0, BaseSubReg);
     BaseSubReg = 0;
   }
 
-  MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
-                        .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
-                        .add(*Data0)                               // data0
-                        .add(*Data1)                               // data1
-                        .addImm(NewOffset0)                        // offset0
-                        .addImm(NewOffset1)                        // offset1
-                        .addImm(0)                                 // gds
-                        .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+  MachineInstrBuilder Write2 =
+      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
+          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
+          .add(*Data0)                               // data0
+          .add(*Data1)                               // data1
+          .addImm(NewOffset0)                        // offset0
+          .addImm(NewOffset1)                        // offset1
+          .addImm(0)                                 // gds
+          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
   moveInstsAfter(Write2, CI.InstsToMove);
 
@@ -638,15 +842,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   return Next;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
-  CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
-                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+  const unsigned Opcode = getNewOpcode(CI);
+
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
 
-  const TargetRegisterClass *SuperRC =
-    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
 
@@ -656,12 +859,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
       .addImm(CI.GLC0)      // glc
       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -683,29 +883,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
   return Next;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
-  CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  unsigned Opcode;
 
-  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
-    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
-                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
-  } else {
-    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
-                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
-  }
+  const unsigned Opcode = getNewOpcode(CI);
 
-  const TargetRegisterClass *SuperRC =
-    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+  // Copy to the new source register.
   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
 
   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
 
-  if (CI.InstClass == BUFFER_LOAD_OFFEN)
-      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+  const unsigned Regs = getRegs(Opcode);
+
+  if (Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -715,12 +911,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
       .addImm(0)            // tfe
       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
 
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -742,57 +935,137 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
   return Next;
 }
 
-unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
-  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
-  IsX2 = false;
-  IsOffen = false;
-
-  switch (I.getOpcode()) {
-  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
-    IsX2 = true;
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
-    IsX2 = true;
-    IsOffen = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
-  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
-    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
-    IsX2 = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
-  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
-    IsX2 = true;
-    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
-  }
-  return 0;
-}
-
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
-  CombineInfo &CI) {
+unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
+  const unsigned Width = CI.Width0 + CI.Width1;
+
+  switch (CI.InstClass) {
+  default:
+    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
+  case UNKNOWN:
+    llvm_unreachable("Unknown instruction class");
+  case S_BUFFER_LOAD_IMM:
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+    case 4:
+      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+    }
+  }
+}
+
+std::pair<unsigned, unsigned>
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
+  if (CI.Offset0 > CI.Offset1) {
+    switch (CI.Width0) {
+    default:
+      return std::make_pair(0, 0);
+    case 1:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
+      case 2:
+        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
+      case 3:
+        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
+      }
+    case 2:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
+      case 2:
+        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
+      }
+    case 3:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
+      }
+    }
+  } else {
+    switch (CI.Width0) {
+    default:
+      return std::make_pair(0, 0);
+    case 1:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
+      case 2:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
+      case 3:
+        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
+      }
+    case 2:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
+      case 2:
+        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
+      }
+    case 3:
+      switch (CI.Width1) {
+      default:
+        return std::make_pair(0, 0);
+      case 1:
+        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
+      }
+    }
+  }
+}
+
+const TargetRegisterClass *
+SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
+  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
+    switch (CI.Width0 + CI.Width1) {
+    default:
+      return nullptr;
+    case 2:
+      return &AMDGPU::SReg_64_XEXECRegClass;
+    case 4:
+      return &AMDGPU::SReg_128RegClass;
+    case 8:
+      return &AMDGPU::SReg_256RegClass;
+    case 16:
+      return &AMDGPU::SReg_512RegClass;
+    }
+  } else {
+    switch (CI.Width0 + CI.Width1) {
+    default:
+      return nullptr;
+    case 2:
+      return &AMDGPU::VReg_64RegClass;
+    case 3:
+      return &AMDGPU::VReg_96RegClass;
+    case 4:
+      return &AMDGPU::VReg_128RegClass;
+    }
+  }
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  bool Unused1, Unused2;
-  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
 
-  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+  const unsigned Opcode = getNewOpcode(CI);
 
-  // Handle descending offsets
-  if (CI.Offset0 > CI.Offset1)
-    std::swap(SubRegIdx0, SubRegIdx1);
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
 
   // Copy to the new source register.
-  const TargetRegisterClass *SuperRC =
-    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
 
   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
@@ -805,10 +1078,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
       .addImm(SubRegIdx1);
 
   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
-      .addReg(SrcReg, RegState::Kill);
+                 .addReg(SrcReg, RegState::Kill);
+
+  const unsigned Regs = getRegs(Opcode);
 
-  if (CI.InstClass == BUFFER_STORE_OFFEN)
-      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+  if (Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -826,105 +1101,399 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
   return Next;
 }
 
+MachineOperand
+SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
+  APInt V(32, Val, true);
+  if (TII->isInlineConstant(V))
+    return MachineOperand::CreateImm(Val);
+
+  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  MachineInstr *Mov =
+  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+          TII->get(AMDGPU::S_MOV_B32), Reg)
+    .addImm(Val);
+  (void)Mov;
+  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
+  return MachineOperand::CreateReg(Reg, false);
+}
+
+// Compute base address using Addr and return the final register.
+unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
+                                           const MemAddress &Addr) {
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
+          Addr.Base.LoSubReg) &&
+         "Expected 32-bit Base-Register-Low!!");
+
+  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
+          Addr.Base.HiSubReg) &&
+         "Expected 32-bit Base-Register-Hi!!");
+
+  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
+  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
+  MachineOperand OffsetHi =
+    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
+  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned DeadCarryReg =
+    MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *LoHalf =
+    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
+      .addReg(CarryReg, RegState::Define)
+      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
+    .add(OffsetLo);
+  (void)LoHalf;
+  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
+
+  MachineInstr *HiHalf =
+  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
+    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+    .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
+    .add(OffsetHi)
+    .addReg(CarryReg, RegState::Kill);
+  (void)HiHalf;
+  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
+
+  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  MachineInstr *FullBase =
+    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+  (void)FullBase;
+  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
+
+  return FullDestReg;
+}
+
+// Update base and offset with the NewBase and NewOffset in MI.
+void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
+                                               unsigned NewBase,
+                                               int32_t NewOffset) {
+  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
+  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
+}
+
+Optional<int32_t>
+SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
+  if (Op.isImm())
+    return Op.getImm();
+
+  if (!Op.isReg())
+    return None;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
+      !Def->getOperand(1).isImm())
+    return None;
+
+  return Def->getOperand(1).getImm();
+}
+
+// Analyze Base and extracts:
+//  - 32bit base registers, subregisters
+//  - 64bit constant offset
+// Expecting base computation as:
+//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
+//   %LO:vgpr_32, %c:sreg_64_xexec =
+//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
+//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
+//   %Base:vreg_64 =
+//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
+void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
+                                                      MemAddress &Addr) {
+  if (!Base.isReg())
+    return;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
+  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
+      || Def->getNumOperands() != 5)
+    return;
+
+  MachineOperand BaseLo = Def->getOperand(1);
+  MachineOperand BaseHi = Def->getOperand(3);
+  if (!BaseLo.isReg() || !BaseHi.isReg())
+    return;
+
+  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
+  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
+
+  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
+      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
+    return;
+
+  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
+  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
+
+  auto Offset0P = extractConstOffset(*Src0);
+  if (Offset0P)
+    BaseLo = *Src1;
+  else {
+    if (!(Offset0P = extractConstOffset(*Src1)))
+      return;
+    BaseLo = *Src0;
+  }
+
+  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
+  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
+
+  if (Src0->isImm())
+    std::swap(Src0, Src1);
+
+  if (!Src1->isImm())
+    return;
+
+  uint64_t Offset1 = Src1->getImm();
+  BaseHi = *Src0;
+
+  Addr.Base.LoReg = BaseLo.getReg();
+  Addr.Base.HiReg = BaseHi.getReg();
+  Addr.Base.LoSubReg = BaseLo.getSubReg();
+  Addr.Base.HiSubReg = BaseHi.getSubReg();
+  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
+}
+
+bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
+    MachineInstr &MI,
+    MemInfoMap &Visited,
+    SmallPtrSet<MachineInstr *, 4> &AnchorList) {
+
+  // TODO: Support flat and scratch.
+  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
+      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+    return false;
+
+  // TODO: Support Store.
+  if (!MI.mayLoad())
+    return false;
+
+  if (AnchorList.count(&MI))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
+
+  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
+    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
+    return false;
+  }
+
+  // Step1: Find the base-registers and a 64bit constant offset.
+  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  MemAddress MAddr;
+  if (Visited.find(&MI) == Visited.end()) {
+    processBaseWithConstOffset(Base, MAddr);
+    Visited[&MI] = MAddr;
+  } else
+    MAddr = Visited[&MI];
+
+  if (MAddr.Offset == 0) {
+    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
+                         " constant offsets that can be promoted.\n";);
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
+             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
+
+  // Step2: Traverse through MI's basic block and find an anchor(that has the
+  // same base-registers) with the highest 13bit distance from MI's offset.
+  // E.g. (64bit loads)
+  // bb:
+  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
+  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
+  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
+  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
+  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
+  //
+  // Starting from the first load, the optimization will try to find a new base
+  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
+  // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
+  // as the new-base(anchor) because of the maximum distance which can
+  // accomodate more intermediate bases presumeably.
+  //
+  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
+  // (&a + 8192) for load1, load2, load4.
+  //   addr = &a + 8192
+  //   load1 = load(addr,       -4096)
+  //   load2 = load(addr,       -2048)
+  //   load3 = load(addr,       0)
+  //   load4 = load(addr,       2048)
+  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
+  //
+  MachineInstr *AnchorInst = nullptr;
+  MemAddress AnchorAddr;
+  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
+  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
+
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator E = MBB->end();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  ++MBBI;
+  const SITargetLowering *TLI =
+    static_cast<const SITargetLowering *>(STM->getTargetLowering());
+
+  for ( ; MBBI != E; ++MBBI) {
+    MachineInstr &MINext = *MBBI;
+    // TODO: Support finding an anchor(with same base) from store addresses or
+    // any other load addresses where the opcodes are different.
+    if (MINext.getOpcode() != MI.getOpcode() ||
+        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
+      continue;
+
+    const MachineOperand &BaseNext =
+      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
+    MemAddress MAddrNext;
+    if (Visited.find(&MINext) == Visited.end()) {
+      processBaseWithConstOffset(BaseNext, MAddrNext);
+      Visited[&MINext] = MAddrNext;
+    } else
+      MAddrNext = Visited[&MINext];
+
+    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
+        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
+        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
+        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
+      continue;
+
+    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
+
+    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
+    TargetLoweringBase::AddrMode AM;
+    AM.HasBaseReg = true;
+    AM.BaseOffs = Dist;
+    if (TLI->isLegalGlobalAddressingMode(AM) &&
+        (uint32_t)std::abs(Dist) > MaxDist) {
+      MaxDist = std::abs(Dist);
+
+      AnchorAddr = MAddrNext;
+      AnchorInst = &MINext;
+    }
+  }
+
+  if (AnchorInst) {
+    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
+               AnchorInst->dump());
+    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
+               <<  AnchorAddr.Offset << "\n\n");
+
+    // Instead of moving up, just re-compute anchor-instruction's base address.
+    unsigned Base = computeBase(MI, AnchorAddr);
+
+    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
+    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
+
+    for (auto P : InstsWCommonBase) {
+      TargetLoweringBase::AddrMode AM;
+      AM.HasBaseReg = true;
+      AM.BaseOffs = P.second - AnchorAddr.Offset;
+
+      if (TLI->isLegalGlobalAddressingMode(AM)) {
+        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
+                   dbgs() << ")"; P.first->dump());
+        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
+        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
+      }
+    }
+    AnchorList.insert(AnchorInst);
+    return true;
+  }
+
+  return false;
+}
+
 // Scan through looking for adjacent LDS operations with constant offsets from
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
   bool Modified = false;
 
+  // Contain the list
+  MemInfoMap Visited;
+  // Contains the list of instructions for which constant offsets are being
+  // promoted to the IMM.
+  SmallPtrSet<MachineInstr *, 4> AnchorList;
+
   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     MachineInstr &MI = *I;
 
+    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
+      Modified = true;
+
     // Don't combine if volatile.
     if (MI.hasOrderedMemoryRef()) {
       ++I;
       continue;
     }
 
+    const unsigned Opc = MI.getOpcode();
+
     CombineInfo CI;
     CI.I = I;
-    unsigned Opc = MI.getOpcode();
-    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
-        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+    CI.InstClass = getInstClass(Opc);
 
-      CI.InstClass = DS_READ_WRITE;
+    switch (CI.InstClass) {
+    default:
+      break;
+    case DS_READ:
       CI.EltSize =
-        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
-
+          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
+                                                                          : 4;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeRead2Pair(CI);
       } else {
         ++I;
       }
-
       continue;
-    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
-               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
-               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
-      CI.InstClass = DS_READ_WRITE;
-      CI.EltSize
-        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
-
+    case DS_WRITE:
+      CI.EltSize =
+          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
+                                                                            : 4;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeWrite2Pair(CI);
       } else {
         ++I;
       }
-
       continue;
-    }
-    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
-        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
-      // EltSize is in units of the offset encoding.
-      CI.InstClass = S_BUFFER_LOAD_IMM;
+    case S_BUFFER_LOAD_IMM:
       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
-      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeSBufferLoadImmPair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
       } else {
         ++I;
       }
       continue;
-    }
-    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
-        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
-      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
-          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
-        CI.InstClass = BUFFER_LOAD_OFFEN;
-      else
-        CI.InstClass = BUFFER_LOAD_OFFSET;
-
+    case BUFFER_LOAD_OFFEN:
+    case BUFFER_LOAD_OFFSET:
+    case BUFFER_LOAD_OFFEN_exact:
+    case BUFFER_LOAD_OFFSET_exact:
       CI.EltSize = 4;
-      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
-                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeBufferLoadPair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
       } else {
         ++I;
       }
       continue;
-    }
-
-    bool StoreIsX2, IsOffen;
-    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
-      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+    case BUFFER_STORE_OFFEN:
+    case BUFFER_STORE_OFFSET:
+    case BUFFER_STORE_OFFEN_exact:
+    case BUFFER_STORE_OFFSET_exact:
       CI.EltSize = 4;
-      CI.IsX2 = StoreIsX2;
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeBufferStorePair(CI);
-        if (!CI.IsX2)
-          CreatedX2++;
+        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
       } else {
         ++I;
       }
@@ -958,12 +1527,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   bool Modified = false;
 
   for (MachineBasicBlock &MBB : MF) {
-    CreatedX2 = 0;
-    Modified |= optimizeBlock(MBB);
-
-    // Run again to convert x2 to x4.
-    if (CreatedX2 >= 1)
+    do {
+      OptimizeAgain = false;
       Modified |= optimizeBlock(MBB);
+    } while (OptimizeAgain);
   }
 
   return Modified;
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 6670def7d09457d319133a6fdfc04c52314e96c8..fb7e670068fe61201caf3dae4acb8258ac5646bc 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -1955,12 +1955,12 @@ void SIScheduleDAGMI::schedule()
 
   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
     SUnit *SU = &SUnits[i];
-    unsigned BaseLatReg;
+    MachineOperand *BaseLatOp;
     int64_t OffLatReg;
     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
       IsLowLatencySU[i] = 1;
-      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
-                                       TRI))
+      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
+                                         TRI))
         LowLatencyOffset[i] = OffLatReg;
     } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
       IsHighLatencySU[i] = 1;
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6fa52f57601be4844849fa52ff54c738dc4cc5e1..b4a4e9e33133dfb5373300e5b6c2395acf6e40f1 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -812,6 +812,12 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
+  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
+
+  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
+                             ? AMDGPU::BUFFER_WBINVL1
+                             : AMDGPU::BUFFER_WBINVL1_VOL;
+
   if (Pos == Position::AFTER)
     ++MI;
 
@@ -819,7 +825,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
     switch (Scope) {
     case SIAtomicScope::SYSTEM:
     case SIAtomicScope::AGENT:
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
+      BuildMI(MBB, MI, DL, TII->get(Flush));
       Changed = true;
       break;
     case SIAtomicScope::WORKGROUP:
diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..883fd308f2f4bd64c622fb69729117e336bb5dde
--- /dev/null
+++ b/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,406 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass inserts changes to the Mode register settings as required.
+/// Note that currently it only deals with the Double Precision Floating Point
+/// rounding mode setting, but is intended to be generic enough to be easily
+/// expanded.
+///
+//===----------------------------------------------------------------------===//
+//
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <queue>
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted, "Number of setreg of mode register inserted.");
+
+using namespace llvm;
+
+struct Status {
+  // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
+  // known value
+  unsigned Mask;
+  unsigned Mode;
+
+  Status() : Mask(0), Mode(0){};
+
+  Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) {
+    Mode &= Mask;
+  };
+
+  // merge two status values such that only values that don't conflict are
+  // preserved
+  Status merge(const Status &S) const {
+    return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask)));
+  }
+
+  // merge an unknown value by using the unknown value's mask to remove bits
+  // from the result
+  Status mergeUnknown(unsigned newMask) {
+    return Status(Mask & ~newMask, Mode & ~newMask);
+  }
+
+  // intersect two Status values to produce a mode and mask that is a subset
+  // of both values
+  Status intersect(const Status &S) const {
+    unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode);
+    unsigned NewMode = (Mode & NewMask);
+    return Status(NewMask, NewMode);
+  }
+
+  // produce the delta required to change the Mode to the required Mode
+  Status delta(const Status &S) const {
+    return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode);
+  }
+
+  bool operator==(const Status &S) const {
+    return (Mask == S.Mask) && (Mode == S.Mode);
+  }
+
+  bool operator!=(const Status &S) const { return !(*this == S); }
+
+  bool isCompatible(Status &S) {
+    return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
+  }
+
+  bool isCombinable(Status &S) {
+    return !(Mask & S.Mask) || isCompatible(S);
+  }
+};
+
+class BlockData {
+public:
+  // The Status that represents the mode register settings required by the
+  // FirstInsertionPoint (if any) in this block. Calculated in Phase 1.
+  Status Require;
+
+  // The Status that represents the net changes to the Mode register made by
+  // this block, Calculated in Phase 1.
+  Status Change;
+
+  // The Status that represents the mode register settings on exit from this
+  // block. Calculated in Phase 2.
+  Status Exit;
+
+  // The Status that represents the intersection of exit Mode register settings
+  // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3.
+  Status Pred;
+
+  // In Phase 1 we record the first instruction that has a mode requirement,
+  // which is used in Phase 3 if we need to insert a mode change.
+  MachineInstr *FirstInsertionPoint;
+
+  BlockData() : FirstInsertionPoint(nullptr) {};
+};
+
+namespace {
+
+class SIModeRegister : public MachineFunctionPass {
+public:
+  static char ID;
+
+  std::vector<std::unique_ptr<BlockData>> BlockInfo;
+  std::queue<MachineBasicBlock *> Phase2List;
+
+  // The default mode register setting currently only caters for the floating
+  // point double precision rounding mode.
+  // We currently assume the default rounding mode is Round to Nearest
+  // NOTE: this should come from a per function rounding mode setting once such
+  // a setting exists.
+  unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST;
+  Status DefaultStatus =
+      Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+
+public:
+  SIModeRegister() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII);
+
+  void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I,
+                    const SIInstrInfo *TII, Status InstrMode);
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+                "Insert required mode register values", false, false)
+
+char SIModeRegister::ID = 0;
+
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
+
+// Determine the Mode register setting required for this instruction.
+// Instructions which don't use the Mode register return a null Status.
+// Note this currently only deals with instructions that use the floating point
+// double precision setting.
+Status SIModeRegister::getInstructionMode(MachineInstr &MI,
+                                          const SIInstrInfo *TII) {
+  if (TII->usesFPDPRounding(MI)) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_INTERP_P1LL_F16:
+    case AMDGPU::V_INTERP_P1LV_F16:
+    case AMDGPU::V_INTERP_P2_F16:
+      // f16 interpolation instructions need double precision round to zero
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    default:
+      return DefaultStatus;
+    }
+  }
+  return Status();
+}
+
+// Insert a setreg instruction to update the Mode register.
+// It is possible (though unlikely) for an instruction to require a change to
+// the value of disjoint parts of the Mode register when we don't know the
+// value of the intervening bits. In that case we need to use more than one
+// setreg instruction.
+void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const SIInstrInfo *TII, Status InstrMode) {
+  while (InstrMode.Mask) {
+    unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask);
+    unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset);
+    unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+    BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+        .addImm(Value)
+        .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
+                (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+    ++NumSetregInserted;
+    InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
+  }
+}
+
+// In Phase 1 we iterate through the instructions of the block and for each
+// instruction we get its mode usage. If the instruction uses the Mode register
+// we:
+// - update the Change status, which tracks the changes to the Mode register
+//   made by this block
+// - if this instruction's requirements are compatible with the current setting
+//   of the Mode register we merge the modes
+// - if it isn't compatible and an InsertionPoint isn't set, then we set the
+//   InsertionPoint to the current instruction, and we remember the current
+//   mode
+// - if it isn't compatible and InsertionPoint is set we insert a seteg before
+//   that instruction (unless this instruction forms part of the block's
+//   entry requirements in which case the insertion is deferred until Phase 3
+//   when predecessor exit values are known), and move the insertion point to
+//   this instruction
+// - if this is a setreg instruction we treat it as an incompatible instruction.
+//   This is sub-optimal but avoids some nasty corner cases, and is expected to
+//   occur very rarely.
+// - on exit we have set the Require, Change, and initial Exit modes.
+void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  auto NewInfo = llvm::make_unique<BlockData>();
+  MachineInstr *InsertionPoint = nullptr;
+  // RequirePending is used to indicate whether we are collecting the initial
+  // requirements for the block, and need to defer the first InsertionPoint to
+  // Phase 3. It is set to false once we have set FirstInsertionPoint, or when
+  // we discover an explict setreg that means this block doesn't have any
+  // initial requirements.
+  bool RequirePending = true;
+  Status IPChange;
+  for (MachineInstr &MI : MBB) {
+    Status InstrMode = getInstructionMode(MI, TII);
+    if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
+        (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+      // We preserve any explicit mode register setreg instruction we encounter,
+      // as we assume it has been inserted by a higher authority (this is
+      // likely to be a very rare occurrence).
+      unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+      if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
+          AMDGPU::Hwreg::ID_MODE)
+        continue;
+
+      unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
+                        AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
+                       1;
+      unsigned Offset =
+          (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
+      unsigned Mask = ((1 << Width) - 1) << Offset;
+
+      // If an InsertionPoint is set we will insert a setreg there.
+      if (InsertionPoint) {
+        insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+        InsertionPoint = nullptr;
+      }
+      // If this is an immediate then we know the value being set, but if it is
+      // not an immediate then we treat the modified bits of the mode register
+      // as unknown.
+      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+        unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm();
+        unsigned Mode = (Val << Offset) & Mask;
+        Status Setreg = Status(Mask, Mode);
+        // If we haven't already set the initial requirements for the block we
+        // don't need to as the requirements start from this explicit setreg.
+        RequirePending = false;
+        NewInfo->Change = NewInfo->Change.merge(Setreg);
+      } else {
+        NewInfo->Change = NewInfo->Change.mergeUnknown(Mask);
+      }
+    } else if (!NewInfo->Change.isCompatible(InstrMode)) {
+      // This instruction uses the Mode register and its requirements aren't
+      // compatible with the current mode.
+      if (InsertionPoint) {
+        // If the required mode change cannot be included in the current
+        // InsertionPoint changes, we need a setreg and start a new
+        // InsertionPoint.
+        if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) {
+          if (RequirePending) {
+            // This is the first insertionPoint in the block so we will defer
+            // the insertion of the setreg to Phase 3 where we know whether or
+            // not it is actually needed.
+            NewInfo->FirstInsertionPoint = InsertionPoint;
+            NewInfo->Require = NewInfo->Change;
+            RequirePending = false;
+          } else {
+            insertSetreg(MBB, InsertionPoint, TII,
+                         IPChange.delta(NewInfo->Change));
+            IPChange = NewInfo->Change;
+          }
+          // Set the new InsertionPoint
+          InsertionPoint = &MI;
+        }
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      } else {
+        // No InsertionPoint is currently set - this is either the first in
+        // the block or we have previously seen an explicit setreg.
+        InsertionPoint = &MI;
+        IPChange = NewInfo->Change;
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      }
+    }
+  }
+  if (RequirePending) {
+    // If we haven't yet set the initial requirements for the block we set them
+    // now.
+    NewInfo->FirstInsertionPoint = InsertionPoint;
+    NewInfo->Require = NewInfo->Change;
+  } else if (InsertionPoint) {
+    // We need to insert a setreg at the InsertionPoint
+    insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+  }
+  NewInfo->Exit = NewInfo->Change;
+  BlockInfo[MBB.getNumber()] = std::move(NewInfo);
+}
+
+// In Phase 2 we revisit each block and calculate the common Mode register
+// value provided by all predecessor blocks. If the Exit value for the block
+// is changed, then we add the successor blocks to the worklist so that the
+// exit value is propagated.
+void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+//  BlockData *BI = BlockInfo[MBB.getNumber()];
+  unsigned ThisBlock = MBB.getNumber();
+  if (MBB.pred_empty()) {
+    // There are no predecessors, so use the default starting status.
+    BlockInfo[ThisBlock]->Pred = DefaultStatus;
+  } else {
+    // Build a status that is common to all the predecessors by intersecting
+    // all the predecessor exit status values.
+    MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end();
+    MachineBasicBlock &PB = *(*P);
+    BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit;
+
+    for (P = std::next(P); P != E; P = std::next(P)) {
+      MachineBasicBlock *Pred = *P;
+      BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[Pred->getNumber()]->Exit);
+    }
+  }
+  Status TmpStatus = BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
+  if (BlockInfo[ThisBlock]->Exit != TmpStatus) {
+    BlockInfo[ThisBlock]->Exit = TmpStatus;
+    // Add the successors to the work list so we can propagate the changed exit
+    // status.
+    for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+                                          E = MBB.succ_end();
+         S != E; S = std::next(S)) {
+      MachineBasicBlock &B = *(*S);
+      Phase2List.push(&B);
+    }
+  }
+}
+
+// In Phase 3 we revisit each block and if it has an insertion point defined we
+// check whether the predecessor mode meets the block's entry requirements. If
+// not we insert an appropriate setreg instruction to modify the Mode register.
+void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+//  BlockData *BI = BlockInfo[MBB.getNumber()];
+  unsigned ThisBlock = MBB.getNumber();
+  if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) {
+    Status Delta = BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
+    if (BlockInfo[ThisBlock]->FirstInsertionPoint)
+      insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta);
+    else
+      insertSetreg(MBB, &MBB.instr_front(), TII, Delta);
+  }
+}
+
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  BlockInfo.resize(MF.getNumBlockIDs());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Processing is performed in a number of phases
+
+  // Phase 1 - determine the initial mode required by each block, and add setreg
+  // instructions for intra block requirements.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase1(BB, TII);
+
+  // Phase 2 - determine the exit mode from each block. We add all blocks to the
+  // list here, but will also add any that need to be revisited during Phase 2
+  // processing.
+  for (MachineBasicBlock &BB : MF)
+    Phase2List.push(&BB);
+  while (!Phase2List.empty()) {
+    processBlockPhase2(*Phase2List.front(), TII);
+    Phase2List.pop();
+  }
+
+  // Phase 3 - add an initial setreg to each block where the required entry mode
+  // is not satisfied by the exit mode of all its predecessors.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase3(BB, TII);
+
+  BlockInfo.clear();
+
+  return NumSetregInserted > 0;
+}
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index e53d876f232b31f639e713625a39a6435a0a9fe3..c671fed34bdf14bfcc27dc34183f4889f238ca56 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -103,6 +103,122 @@ static MachineInstr* getOrExecSource(const MachineInstr &MI,
   return SaveExecInst;
 }
 
+// Optimize sequence
+//    %sel = V_CNDMASK_B32_e64 0, 1, %cc
+//    %cmp = V_CMP_NE_U32 1, %1
+//    $vcc = S_AND_B64 $exec, %cmp
+//    S_CBRANCH_VCC[N]Z
+// =>
+//    $vcc = S_ANDN2_B64 $exec, %cc
+//    S_CBRANCH_VCC[N]Z
+//
+// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
+// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
+// only 3 first instructions are really needed. S_AND_B64 with exec is a
+// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
+// lanes.
+//
+// Returns %cc register on success.
+static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
+                                     const GCNSubtarget &ST,
+                                     MachineRegisterInfo &MRI,
+                                     LiveIntervals *LIS) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const unsigned AndOpc = AMDGPU::S_AND_B64;
+  const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64;
+  const unsigned CondReg = AMDGPU::VCC;
+  const unsigned ExecReg = AMDGPU::EXEC;
+
+  auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
+                           unsigned Opc = MI.getOpcode();
+                           return Opc == AMDGPU::S_CBRANCH_VCCZ ||
+                                  Opc == AMDGPU::S_CBRANCH_VCCNZ; });
+  if (I == MBB.terminators().end())
+    return AMDGPU::NoRegister;
+
+  auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister,
+                                   *I, MRI, LIS);
+  if (!And || And->getOpcode() != AndOpc ||
+      !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
+    return AMDGPU::NoRegister;
+
+  MachineOperand *AndCC = &And->getOperand(1);
+  unsigned CmpReg = AndCC->getReg();
+  unsigned CmpSubReg = AndCC->getSubReg();
+  if (CmpReg == ExecReg) {
+    AndCC = &And->getOperand(2);
+    CmpReg = AndCC->getReg();
+    CmpSubReg = AndCC->getSubReg();
+  } else if (And->getOperand(2).getReg() != ExecReg) {
+    return AMDGPU::NoRegister;
+  }
+
+  auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS);
+  if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
+                Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
+      Cmp->getParent() != And->getParent())
+    return AMDGPU::NoRegister;
+
+  MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
+  MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
+  if (Op1->isImm() && Op2->isReg())
+    std::swap(Op1, Op2);
+  if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
+    return AMDGPU::NoRegister;
+
+  unsigned SelReg = Op1->getReg();
+  auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
+  if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+    return AMDGPU::NoRegister;
+
+  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
+  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
+  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
+  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
+      Op1->getImm() != 0 || Op2->getImm() != 1)
+    return AMDGPU::NoRegister;
+
+  LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
+                    << *Cmp << '\t' << *And);
+
+  unsigned CCReg = CC->getReg();
+  LIS->RemoveMachineInstrFromMaps(*And);
+  MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
+                                TII->get(Andn2Opc), And->getOperand(0).getReg())
+                            .addReg(ExecReg)
+                            .addReg(CCReg, CC->getSubReg());
+  And->eraseFromParent();
+  LIS->InsertMachineInstrInMaps(*Andn2);
+
+  LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
+
+  // Try to remove compare. Cmp value should not used in between of cmp
+  // and s_and_b64 if VCC or just unused if any other register.
+  if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
+       MRI.use_nodbg_empty(CmpReg)) ||
+      (CmpReg == CondReg &&
+       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
+                    [&](const MachineInstr &MI) {
+                      return MI.readsRegister(CondReg, TRI); }))) {
+    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
+
+    LIS->RemoveMachineInstrFromMaps(*Cmp);
+    Cmp->eraseFromParent();
+
+    // Try to remove v_cndmask_b32.
+    if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
+        MRI.use_nodbg_empty(SelReg)) {
+      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+
+      LIS->RemoveMachineInstrFromMaps(*Sel);
+      Sel->eraseFromParent();
+    }
+  }
+
+  return CCReg;
+}
+
 bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -117,6 +233,14 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
 
   for (MachineBasicBlock &MBB : MF) {
 
+    if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) {
+      RecalcRegs.insert(Reg);
+      RecalcRegs.insert(AMDGPU::VCC_LO);
+      RecalcRegs.insert(AMDGPU::VCC_HI);
+      RecalcRegs.insert(AMDGPU::SCC);
+      Changed = true;
+    }
+
     // Try to remove unneeded instructions before s_endpgm.
     if (MBB.succ_empty()) {
       if (MBB.empty())
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 0e000b72962eb0ea37df3363e5f30a2bd8e683f3..2d43d5d05ef6437a0f49038141978d1218646b5f 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -90,7 +90,9 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;
   void matchSDWAOperands(MachineBasicBlock &MBB);
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
-  bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
+  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
+  void pseudoOpConvertToVOP2(MachineInstr &MI,
+                             const GCNSubtarget &ST) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
@@ -854,7 +856,82 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
   }
 }
 
-bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
+// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
+// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
+//
+// We are transforming from a VOP3 into a VOP2 form of the instruction.
+//   %19:vgpr_32 = V_AND_B32_e32 255,
+//       killed %16:vgpr_32, implicit $exec
+//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
+//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
+//
+// becomes
+//   %47:vgpr_32 = V_ADD_I32_sdwa
+//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
+//       implicit-def $vcc, implicit $exec
+//  %48:vgpr_32 = V_ADDC_U32_e32
+//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
+                                           const GCNSubtarget &ST) const {
+  int Opc = MI.getOpcode();
+  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
+         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
+
+  // Can the candidate MI be shrunk?
+  if (!TII->canShrink(MI, *MRI))
+    return;
+  Opc = AMDGPU::getVOPe32(Opc);
+  // Find the related ADD instruction.
+  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+  if (!Sdst)
+    return;
+  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
+  if (!NextOp)
+    return;
+  MachineInstr &MISucc = *NextOp->getParent();
+  // Can the successor be shrunk?
+  if (!TII->canShrink(MISucc, *MRI))
+    return;
+  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
+  // Make sure the carry in/out are subsequently unused.
+  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
+  if (!CarryIn)
+    return;
+  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
+  if (!CarryOut)
+    return;
+  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
+    return;
+  // Make sure VCC or its subregs are dead before MI.
+  MachineBasicBlock &MBB = *MI.getParent();
+  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
+  if (Liveness != MachineBasicBlock::LQR_Dead)
+    return;
+  // Check if VCC is referenced in range of (MI,MISucc].
+  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
+       I != E; ++I) {
+    if (I->modifiesRegister(AMDGPU::VCC, TRI))
+      return;
+  }
+  // Make the two new e32 instruction variants.
+  // Replace MI with V_{SUB|ADD}_I32_e32
+  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+  MI.eraseFromParent();
+  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
+  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
+  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+  MISucc.eraseFromParent();
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                          const GCNSubtarget &ST) const {
   // Check if this is already an SDWA instruction
   unsigned Opc = MI.getOpcode();
@@ -1127,6 +1204,22 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF) {
     bool Changed = false;
     do {
+      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
+      // Look for a possible ADD or SUB that resulted from a previously lowered
+      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
+      // lowers the pair of instructions into e32 form.
+      matchSDWAOperands(MBB);
+      for (const auto &OperandPair : SDWAOperands) {
+        const auto &Operand = OperandPair.second;
+        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+        if (PotentialMI &&
+           (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
+            PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
+          pseudoOpConvertToVOP2(*PotentialMI, ST);
+      }
+      SDWAOperands.clear();
+
+      // Generate potential match list.
       matchSDWAOperands(MBB);
 
       for (const auto &OperandPair : SDWAOperands) {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 669a60fadd4f891e28b123d62f06b4ba2efef6b6..97cfde2b2354128982007949da015f1d03000fe0 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -18,9 +18,12 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 
@@ -901,7 +904,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
         .addImm(0)                        // glc
         .addMemOperand(MMO);
 
-      if (NumSubRegs > 1)
+      if (NumSubRegs > 1 && i == 0)
         MIB.addReg(SuperReg, RegState::ImplicitDefine);
 
       continue;
@@ -915,7 +918,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
         .addReg(Spill.VGPR)
         .addImm(Spill.Lane);
 
-      if (NumSubRegs > 1)
+      if (NumSubRegs > 1 && i == 0)
         MIB.addReg(SuperReg, RegState::ImplicitDefine);
     } else {
       if (OnlyToVGPR)
@@ -1599,3 +1602,57 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
     llvm_unreachable("not implemented");
   }
 }
+
+// Find reaching register definition
+MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
+                                              MachineInstr &Use,
+                                              MachineRegisterInfo &MRI,
+                                              LiveIntervals *LIS) const {
+  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
+  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
+  SlotIndex DefIdx;
+
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (!LIS->hasInterval(Reg))
+      return nullptr;
+    LiveInterval &LI = LIS->getInterval(Reg);
+    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
+                                  : MRI.getMaxLaneMaskForVReg(Reg);
+    VNInfo *V = nullptr;
+    if (LI.hasSubRanges()) {
+      for (auto &S : LI.subranges()) {
+        if ((S.LaneMask & SubLanes) == SubLanes) {
+          V = S.getVNInfoAt(UseIdx);
+          break;
+        }
+      }
+    } else {
+      V = LI.getVNInfoAt(UseIdx);
+    }
+    if (!V)
+      return nullptr;
+    DefIdx = V->def;
+  } else {
+    // Find last def.
+    for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
+      LiveRange &LR = LIS->getRegUnit(*Units);
+      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
+        if (!DefIdx.isValid() ||
+            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
+                          LIS->getInstructionFromIndex(V->def)))
+          DefIdx = V->def;
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
+
+  if (!Def || !MDT.dominates(Def, &Use))
+    return nullptr;
+
+  assert(Def->modifiesRegister(Reg, this));
+
+  return Def;
+}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 5a51b67ca719c30c349582e1b201b9e14d91e6f8..b82fefde47e1344d7b21638328793fca2ef2b8b6 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -228,6 +228,12 @@ public:
   getConstrainedRegClassForOperand(const MachineOperand &MO,
                                  const MachineRegisterInfo &MRI) const override;
 
+  // Find reaching register definition
+  MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg,
+                                MachineInstr &Use,
+                                MachineRegisterInfo &MRI,
+                                LiveIntervals *LIS) const;
+
 private:
   void buildSpillLoadStore(MachineBasicBlock::iterator MI,
                            unsigned LoadStoreOp,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index f87a0763b353b999943cc42edc0f00af59ca29b2..80bc180ba6eea5a2dbafeb60858a96c8bfbb9594 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -435,7 +435,7 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
   let AllocationPriority = 7;
 }
 
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
 }
@@ -444,13 +444,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add
   let isAllocatable = 0;
 }
 
-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
   (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
 }
 
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
   (add SReg_64_XEXEC, EXEC)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
@@ -459,15 +459,15 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
 // Requires 2 s_mov_b64 to copy
 let CopyCost = 2 in {
 
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
   let AllocationPriority = 10;
 }
 
-def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
   let isAllocatable = 0;
 }
 
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32,
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v16i8, v2i64, v2f64], 32,
   (add SGPR_128, TTMP_128)> {
   let AllocationPriority = 10;
 }
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 015773b110420505693912faa7d137630926ca3f..6ad7dd0e3a7ce9cb764b8bfc61f1fd3b45887ecc 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -212,6 +212,82 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
   }
 }
 
+/// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals.
+/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
+/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
+/// XNOR (as a ^ b == ~(a ^ ~b)).
+/// \returns true if the caller should continue the machine function iterator
+static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
+                                MachineRegisterInfo &MRI,
+                                const SIInstrInfo *TII,
+                                MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  const MachineOperand *Dest = &MI.getOperand(0);
+  MachineOperand *Src0 = &MI.getOperand(1);
+  MachineOperand *Src1 = &MI.getOperand(2);
+  MachineOperand *SrcReg = Src0;
+  MachineOperand *SrcImm = Src1;
+
+  if (SrcImm->isImm() &&
+      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
+    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+    uint32_t NewImm = 0;
+
+    if (Opc == AMDGPU::S_AND_B32) {
+      if (isPowerOf2_32(~Imm)) {
+        NewImm = countTrailingOnes(Imm);
+        Opc = AMDGPU::S_BITSET0_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ANDN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_OR_B32) {
+      if (isPowerOf2_32(Imm)) {
+        NewImm = countTrailingZeros(Imm);
+        Opc = AMDGPU::S_BITSET1_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ORN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_XOR_B32) {
+      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_XNOR_B32;
+      }
+    } else {
+      llvm_unreachable("unexpected opcode");
+    }
+
+    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+        SrcImm == Src0) {
+      if (!TII->commuteInstruction(MI, false, 1, 2))
+        NewImm = 0;
+    }
+
+    if (NewImm != 0) {
+      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+        SrcReg->isReg()) {
+        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+        return true;
+      }
+
+      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+        MI.setDesc(TII->get(Opc));
+        if (Opc == AMDGPU::S_BITSET0_B32 ||
+            Opc == AMDGPU::S_BITSET1_B32) {
+          Src0->ChangeToImmediate(NewImm);
+          MI.RemoveOperand(2);
+        } else {
+          SrcImm->setImm(NewImm);
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 // This is the same as MachineInstr::readsRegister/modifiesRegister except
 // it takes subregs into account.
 static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
@@ -512,6 +588,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
+      // Shrink scalar logic operations.
+      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+          MI.getOpcode() == AMDGPU::S_OR_B32 ||
+          MI.getOpcode() == AMDGPU::S_XOR_B32) {
+        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+          continue;
+      }
+
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;
 
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 8bd7de7269b7ed650f9cae0e5ec0c8470e09aedf..8a063e1a4867328022854a2f8fb4d83add936fae 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -751,6 +751,12 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2",   v2i32>;
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4",   v4i32>;
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8",   v8i32>;
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16",  v16i32>;
+
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD",     f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2",   v2f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4",   v4f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8",   v8f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16",  v16f32>;
 } // End let AddedComplexity = 100
 
 let OtherPredicates = [isSICI] in {
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 5841dcb2b9c68fdb37c92ab40265b25bd492be83..ca5e981ac5c25c1f1f6c77a10d5c4e9d8a206795 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -336,6 +336,12 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
   "$sdst, $src0, $src1", pattern
 >;
 
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return !N->isDivergent(); }]
+>;
+
 class UniformBinFrag<SDPatternOperator Op> : PatFrag <
   (ops node:$src0, node:$src1),
   (Op $src0, $src1),
@@ -421,16 +427,39 @@ def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
 def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
   [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
 >;
+
+def S_NAND_B32 : SOP2_32 <"s_nand_b32",
+  [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NAND_B64 : SOP2_64 <"s_nand_b64",
+  [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))]
+>;
+
+def S_NOR_B32 : SOP2_32 <"s_nor_b32",
+  [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NOR_B64 : SOP2_64 <"s_nor_b64",
+  [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))]
+>;
 } // End isCommutable = 1
 
-def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
-def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
-def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
-def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
-def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
-def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
-def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
-def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
+  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
+  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
+
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
+  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
+  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
 } // End Defs = [SCC]
 
 // Use added complexity so these patterns are preferred to the VALU patterns.
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 634ec8fcc3d11ad657327d32445e3a89f4aef55f..54c866bdc63ce060b036a09d86f226cbc02a0bf6 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -128,6 +128,49 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
   return NewInfo ? NewInfo->Opcode : -1;
 }
 
+struct MUBUFInfo {
+  uint16_t Opcode;
+  uint16_t BaseOpcode;
+  uint8_t dwords;
+  bool has_vaddr;
+  bool has_srsrc;
+  bool has_soffset;
+};
+
+#define GET_MUBUFInfoTable_DECL
+#define GET_MUBUFInfoTable_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+
+int getMUBUFBaseOpcode(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc);
+  return Info ? Info->BaseOpcode : -1;
+}
+
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) {
+  const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords);
+  return Info ? Info->Opcode : -1;
+}
+
+int getMUBUFDwords(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->dwords : 0;
+}
+
+bool getMUBUFHasVAddr(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->has_vaddr : false;
+}
+
+bool getMUBUFHasSrsrc(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->has_srsrc : false;
+}
+
+bool getMUBUFHasSoffset(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->has_soffset : false;
+}
+
 // Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
 // header files, so we need to wrap it in a function that takes unsigned
 // instead.
@@ -159,7 +202,8 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
 }
 
 bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
-  return STI->getFeatureBits().test(FeatureCodeObjectV3);
+  return STI->getTargetTriple().getOS() == Triple::AMDHSA &&
+             STI->getFeatureBits().test(FeatureCodeObjectV3);
 }
 
 unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
@@ -521,6 +565,14 @@ void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
   Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
 }
 
+Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) {
+  Waitcnt Decoded;
+  Decoded.VmCnt = decodeVmcnt(Version, Encoded);
+  Decoded.ExpCnt = decodeExpcnt(Version, Encoded);
+  Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded);
+  return Decoded;
+}
+
 unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt,
                      unsigned Vmcnt) {
   Waitcnt =
@@ -551,6 +603,10 @@ unsigned encodeWaitcnt(const IsaVersion &Version,
   return Waitcnt;
 }
 
+unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
+  return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt);
+}
+
 unsigned getInitialPSInputAddr(const Function &F) {
   return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
@@ -754,6 +810,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::VS_64RegClassID:
   case AMDGPU::SReg_64RegClassID:
   case AMDGPU::VReg_64RegClassID:
+  case AMDGPU::SReg_64_XEXECRegClassID:
     return 64;
   case AMDGPU::VReg_96RegClassID:
     return 96;
@@ -894,9 +951,12 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
 // Given Imm, split it into the values to put into the SOffset and ImmOffset
 // fields in an MUBUF instruction. Return false if it is not possible (due to a
 // hardware bug needing a workaround).
+//
+// The required alignment ensures that individual address components remain
+// aligned if they are aligned to begin with. It also ensures that additional
+// offsets within the given alignment can be added to the resulting ImmOffset.
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget) {
-  const uint32_t Align = 4;
+                      const GCNSubtarget *Subtarget, uint32_t Align) {
   const uint32_t MaxImm = alignDown(4095, Align);
   uint32_t Overflow = 0;
 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d45f42498692ede8e565d1c8c63fc540493ba473..1ef7da328d31a85658b0a4ee3df9f613c7a34106 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -220,6 +220,24 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
 LLVM_READONLY
 int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
 
+LLVM_READONLY
+int getMUBUFBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords);
+
+LLVM_READONLY
+int getMUBUFDwords(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasVAddr(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasSrsrc(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasSoffset(unsigned Opc);
+
 LLVM_READONLY
 int getMCOpcode(uint16_t Opcode, unsigned Gen);
 
@@ -258,6 +276,32 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
                                             std::pair<int, int> Default,
                                             bool OnlyFirstRequired = false);
 
+/// Represents the counter values to wait for in an s_waitcnt instruction.
+///
+/// Large values (including the maximum possible integer) can be used to
+/// represent "don't care" waits.
+struct Waitcnt {
+  unsigned VmCnt = ~0u;
+  unsigned ExpCnt = ~0u;
+  unsigned LgkmCnt = ~0u;
+
+  Waitcnt() {}
+  Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt)
+      : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {}
+
+  static Waitcnt allZero() { return Waitcnt(0, 0, 0); }
+
+  bool dominates(const Waitcnt &Other) const {
+    return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
+           LgkmCnt <= Other.LgkmCnt;
+  }
+
+  Waitcnt combined(const Waitcnt &Other) const {
+    return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
+                   std::min(LgkmCnt, Other.LgkmCnt));
+  }
+};
+
 /// \returns Vmcnt bit mask for given isa \p Version.
 unsigned getVmcntBitMask(const IsaVersion &Version);
 
@@ -291,6 +335,8 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt);
 void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
                    unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
 
+Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded);
+
 /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
 unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt,
                      unsigned Vmcnt);
@@ -318,6 +364,8 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
 unsigned encodeWaitcnt(const IsaVersion &Version,
                        unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
 
+unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
+
 unsigned getInitialPSInputAddr(const Function &F);
 
 LLVM_READNONE
@@ -441,11 +489,8 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 /// not the encoded offset.
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
-// Given Imm, split it into the values to put into the SOffset and ImmOffset
-// fields in an MUBUF instruction. Return false if it is not possible (due to a
-// hardware bug needing a workaround).
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget);
+                      const GCNSubtarget *Subtarget, uint32_t Align = 4);
 
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 9f4673cf7ab55e88c3e76f606ff3457c573ff37a..68446ab79720af70c856845bcefaffd1f19ca642 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP1";
 }
 
+class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
 class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret =
     !if(P.HasModifiers,
@@ -103,6 +107,8 @@ multiclass VOP1Inst <string opName, VOPProfile P,
   def _e32 : VOP1_Pseudo <opName, P>;
   def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
   def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+  foreach _ = BoolToList<P.HasExtDPP>.ret in
+    def _dpp : VOP1_DPP_Pseudo <opName, P>;
 }
 
 // Special profile for instructions which have clamp
@@ -173,7 +179,9 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
 defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
 defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
 defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+let FPDPRounding = 1 in {
 defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+} // End FPDPRounding = 1
 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
 defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
@@ -226,7 +234,9 @@ defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
 let SchedRW = [WriteDoubleAdd] in {
 defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
 defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+let FPDPRounding = 1 in {
 defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End FPDPRounding = 1
 } // End SchedRW = [WriteDoubleAdd]
 
 defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
@@ -333,8 +343,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
 
 let SubtargetPredicate = Has16BitInsts in {
 
+let FPDPRounding = 1 in {
 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
+} // End FPDPRounding = 1
 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
 defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
 let SchedRW = [WriteQuarterRate32] in {
@@ -352,7 +364,9 @@ defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
 defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
 defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
 defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
+let FPDPRounding = 1 in {
 defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
+} // End FPDPRounding = 1
 
 }
 
@@ -500,13 +514,8 @@ defm V_EXP_LEGACY_F32    : VOP1_Real_ci <0x46>;
 // VI
 //===----------------------------------------------------------------------===//
 
-class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
-  VOP_DPP <ps.OpName, P> {
-  let Defs = ps.Defs;
-  let Uses = ps.Uses;
-  let SchedRW = ps.SchedRW;
-  let hasSideEffects = ps.hasSideEffects;
-
+class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPPe <P> {
   bits<8> vdst;
   let Inst{8-0}   = 0xfa; // dpp
   let Inst{16-9}  = op;
@@ -544,9 +553,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
     VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 defm V_NOP               : VOP1_Real_vi <0x0>;
@@ -717,9 +727,11 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
     VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+
 }
 
 defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index db031be7e558ed9d3b46ae5a7f9be9aaf5ed213a..e3fd7b5f9fadd9f645fabeab78b30a3d749f9e6d 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -105,6 +105,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP2";
 }
 
+class VOP2_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
+
 class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret = !if(P.HasModifiers,
     [(set P.DstVT:$vdst,
@@ -155,7 +160,12 @@ multiclass VOP2Inst<string opName,
                     bit GFX9Renamed = 0> :
     VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
     VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
-    VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed>;
+    VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> {
+  let renamedInGFX9 = GFX9Renamed in {
+    foreach _ = BoolToList<P.HasExtDPP>.ret in
+      def _dpp  : VOP2_DPP_Pseudo <opName, P>;
+  }
+}
 
 multiclass VOP2bInst <string opName,
                       VOPProfile P,
@@ -172,6 +182,8 @@ multiclass VOP2bInst <string opName,
         def _sdwa  : VOP2_SDWA_Pseudo <opName, P> {
           let AsmMatchConverter = "cvtSdwaVOP2b";
         }
+        foreach _ = BoolToList<P.HasExtDPP>.ret in
+          def _dpp  : VOP2_DPP_Pseudo <opName, P>;
       }
 
       def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -194,6 +206,9 @@ multiclass VOP2eInst <string opName,
       def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
         let AsmMatchConverter = "cvtSdwaVOP2b";
       }
+
+      foreach _ = BoolToList<P.HasExtDPP>.ret in
+        def _dpp  : VOP2_DPP_Pseudo <opName, P>;
     }
 
     def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -233,9 +248,9 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
   let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
                        0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
-  let InsDPP = (ins DstRCDPP:$old,
-                    Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+  let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+                    VGPR_32:$src2, // stub argument
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
 
@@ -540,18 +555,23 @@ def :  divergent_i64_BinOp <xor, V_XOR_B32_e32>;
 
 let SubtargetPredicate = Has16BitInsts in {
 
+let FPDPRounding = 1 in {
 def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
+defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+} // End FPDPRounding = 1
+
 defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
 defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
 defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
-defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
 
 let isCommutable = 1 in {
+let FPDPRounding = 1 in {
 defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
 defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
 defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
 defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
 def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
+} // End FPDPRounding = 1
 defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
 defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
 defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
@@ -778,13 +798,8 @@ defm V_CVT_PK_I16_I32     : VOP2_Real_e32e64_si <0x31>;
 // VI
 //===----------------------------------------------------------------------===//
 
-class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
-  VOP_DPP <OpName, P> {
-  let Defs = ps.Defs;
-  let Uses = ps.Uses;
-  let SchedRW = ps.SchedRW;
-  let hasSideEffects = ps.hasSideEffects;
-
+class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPPe <P> {
   bits<8> vdst;
   bits<8> src1;
   let Inst{8-0}   = 0xfa; //dpp
@@ -865,8 +880,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
       VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
       let AsmString = AsmName # ps.AsmOperands;
     }
-  def _dpp :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+        VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+        let AsmString = AsmName # ps.AsmOperands;
+      }
 }
 }
 
@@ -893,10 +913,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
       VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
       let AsmString = AsmName # ps.AsmOperands;
     }
-  def _dpp_gfx9 :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
-      let DecoderNamespace = "SDWA9";
-    }
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+        VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+        let AsmString = AsmName # ps.AsmOperands;
+        let DecoderNamespace = "SDWA9";
+      }
 }
 
 multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
@@ -914,19 +938,23 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
     VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
     }
-  def _dpp_gfx9 :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
-      let DecoderNamespace = "SDWA9";
-    }
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+        let DecoderNamespace = "SDWA9";
+      }
 }
 
 } // AssemblerPredicates = [isGFX9]
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
   Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 defm V_CNDMASK_B32        : VOP2_Real_e32e64_vi <0x0>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 51bee3efeb2c4b923d87cd324aece67117c47672..4b8c1f208a0eda54003b5fb238f2c55e92279994 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -220,7 +220,8 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
 // VOP3 INTERP
 //===----------------------------------------------------------------------===//
 
-class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> {
+class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
+                 VOP3_Pseudo<OpName, P, pattern> {
   let AsmMatchConverter = "cvtVOP3Interp";
 }
 
@@ -292,9 +293,11 @@ def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
 def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
 
 let SchedRW = [WriteDoubleAdd] in {
+let FPDPRounding = 1 in {
 def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
 def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
 def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
+} // End FPDPRounding = 1
 def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
 def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
 } // End SchedRW = [WriteDoubleAdd]
@@ -324,6 +327,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
 def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
   getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
   let SchedRW = [WriteDouble];
+  let FPDPRounding = 1;
 }
 } // End Uses = [VCC, EXEC]
 
@@ -354,10 +358,10 @@ def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CL
 def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
 def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
 
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
 def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
 def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
   let SchedRW = [WriteFloatFMA, WriteSALU];
@@ -368,6 +372,7 @@ def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32,
 def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
   let SchedRW = [WriteDouble, WriteSALU];
   let AsmMatchConverter = "";
+  let FPDPRounding = 1;
 }
 
 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -431,33 +436,51 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
 
 def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
   let Predicates = [Has16BitInsts, isVIOnly];
+  let FPDPRounding = 1;
 }
 def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
                                       VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
   let renamedInGFX9 = 1;
   let Predicates = [Has16BitInsts, isGFX9];
+  let FPDPRounding = 1;
+}
+
+def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
+  let Predicates = [Has16BitInsts, isVIOnly];
+  let FPDPRounding = 1;
+}
+def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
+  let renamedInGFX9 = 1;
+  let Predicates = [Has16BitInsts, isGFX9];
+  let FPDPRounding = 1;
 }
 
 let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
 
 let renamedInGFX9 = 1 in {
-def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
 def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
-def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
+let FPDPRounding = 1 in {
+def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+let Uses = [M0, EXEC] in {
 def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-}
+} // End Uses = [M0, EXEC]
+} // End FPDPRounding = 1
+} // End renamedInGFX9 = 1
 
 let SubtargetPredicate = isGFX9 in {
-def V_MAD_F16_gfx9   : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_MAD_F16_gfx9   : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> {
+  let FPDPRounding = 1;
+}
 def V_MAD_U16_gfx9   : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
 def V_MAD_I16_gfx9   : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
-def V_FMA_F16_gfx9   : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
 def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
 } // End SubtargetPredicate = isGFX9
 
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
 def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
@@ -485,6 +508,37 @@ defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
 
 } // End Predicates = [Has16BitInsts]
 
+class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
+  (ops node:$x, node:$y, node:$z),
+  // When the inner operation is used multiple times, selecting 3-op
+  // instructions may still be beneficial -- if the other users can be
+  // combined similarly. Let's be conservative for now.
+  (op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z),
+  [{
+    // Only use VALU ops when the result is divergent.
+    if (!N->isDivergent())
+      return false;
+
+    // Check constant bus limitations.
+    //
+    // Note: Use !isDivergent as a conservative proxy for whether the value
+    //       is in an SGPR (uniform values can end up in VGPRs as well).
+    unsigned ConstantBusUses = 0;
+    for (unsigned i = 0; i < 3; ++i) {
+      if (!Operands[i]->isDivergent() &&
+          !isInlineImmediate(Operands[i].getNode())) {
+        ConstantBusUses++;
+        if (ConstantBusUses >= 2)
+          return false;
+      }
+    }
+
+    return true;
+  }]
+> {
+  let PredicateCodeUsesOperands = 1;
+}
+
 let SubtargetPredicate = isGFX9 in {
 def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
 def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -519,6 +573,22 @@ def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B3
 
 def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
 def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
+
+
+class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
+  // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
+  (ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
+  (inst i32:$src0, i32:$src1, i32:$src2)
+>;
+
+def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32>;
+def : ThreeOp_i32_Pats<add, shl, V_ADD_LSHL_U32>;
+def : ThreeOp_i32_Pats<add, add, V_ADD3_U32>;
+def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32>;
+def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>;
+def : ThreeOp_i32_Pats<or, or, V_OR3_B32>;
+def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>;
+
 } // End SubtargetPredicate = isGFX9
 
 //===----------------------------------------------------------------------===//
@@ -792,12 +862,15 @@ defm V_FMA_F16          : VOP3_F16_Real_vi <0x1ee>;
 defm V_DIV_FIXUP_F16    : VOP3_F16_Real_vi <0x1ef>;
 defm V_INTERP_P2_F16    : VOP3Interp_F16_Real_vi <0x276>;
 
+let FPDPRounding = 1 in {
 defm V_MAD_LEGACY_F16       : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16",       "v_mad_legacy_f16">;
-defm V_MAD_LEGACY_U16       : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16",       "v_mad_legacy_u16">;
-defm V_MAD_LEGACY_I16       : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16",       "v_mad_legacy_i16">;
 defm V_FMA_LEGACY_F16       : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16",       "v_fma_legacy_f16">;
 defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">;
 defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">;
+} // End FPDPRounding = 1
+
+defm V_MAD_LEGACY_U16       : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16",       "v_mad_legacy_u16">;
+defm V_MAD_LEGACY_I16       : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16",       "v_mad_legacy_i16">;
 
 defm V_MAD_F16_gfx9         : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
 defm V_MAD_U16_gfx9         : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 2efd28b9cd8bc1b0c76ca85253a4adff3abaa79f..0d25a86da3266480619df72c85e6d6f0610ca7da 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -42,12 +42,14 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
 }
 
 let isCommutable = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
 def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
 def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
 
+let FPDPRounding = 1 in {
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+} // End FPDPRounding = 1
 def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
 def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
 
@@ -137,12 +139,14 @@ let SubtargetPredicate = HasMadMixInsts in {
 let isCommutable = 1 in {
 def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
 
+let FPDPRounding = 1 in {
 // Clamp modifier is applied after conversion to f16.
 def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 
 let ClampLo = 0, ClampHi = 1 in {
 def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 }
+} // End FPDPRounding = 1
 }
 
 defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
@@ -154,12 +158,14 @@ let SubtargetPredicate = HasFmaMixInsts in {
 let isCommutable = 1 in {
 def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
 
+let FPDPRounding = 1 in {
 // Clamp modifier is applied after conversion to f16.
 def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 
 let ClampLo = 0, ClampHi = 1 in {
 def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 }
+} // End FPDPRounding = 1
 }
 
 defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index e177b2fd081f5a7a37e8101528a34b179243b7da..7de7d90d27b3add0bb70891242e856098492d38a 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -505,9 +505,14 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
   let Inst{63-60} = row_mask;
 }
 
-class VOP_DPP <string OpName, VOPProfile P> :
-  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
-  VOP_DPPe<P> {
+class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
+  VOP <OpName>,
+  SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
+  MnemonicAlias <OpName#"_dpp", OpName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
 
   let mayLoad = 0;
   let mayStore = 0;
@@ -517,6 +522,11 @@ class VOP_DPP <string OpName, VOPProfile P> :
   let VALU = 1;
   let DPP = 1;
   let Size = 8;
+  let Uses = [EXEC];
+  let isConvergent = 1;
+
+  string Mnemonic = OpName;
+  string AsmOperands = P.AsmDPP;
 
   let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
   let SubtargetPredicate = HasDPP;
@@ -526,6 +536,36 @@ class VOP_DPP <string OpName, VOPProfile P> :
   let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
   let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
   let DecoderNamespace = "DPP";
+
+  VOPProfile Pfl = P;
+}
+
+class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // Copy relevant pseudo op flags
+  let isConvergent         = ps.isConvergent;
+  let SubtargetPredicate   = ps.SubtargetPredicate;
+  let AssemblerPredicate   = ps.AssemblerPredicate;
+  let AsmMatchConverter    = ps.AsmMatchConverter;
+  let AsmVariantName       = ps.AsmVariantName;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let DecoderNamespace     = ps.DecoderNamespace;
+  let Constraints          = ps.Constraints;
+  let DisableEncoding      = ps.DisableEncoding;
+  let TSFlags              = ps.TSFlags;
 }
 
 class getNumNodeArgs<SDPatternOperator Op> {
diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp
index 1acae3a88870f6f3a15406a57a28d550ac891292..6f5bbd3b4ef3690584a115112dbcf80821d73e1f 100644
--- a/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/lib/Target/ARC/ARCTargetMachine.cpp
@@ -26,12 +26,6 @@ static Reloc::Model getRelocModel(Optional<Reloc::Model> RM) {
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 /// ARCTargetMachine ctor - Create an ILP32 architecture model
 ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT,
                                    StringRef CPU, StringRef FS,
@@ -43,7 +37,7 @@ ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT,
                         "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-"
                         "f32:32:32-i64:32-f64:32-a:0:32-n32",
                         TT, CPU, FS, Options, getRelocModel(RM),
-                        getEffectiveCodeModel(CM), OL),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index deab039772c0bff4fe7d36b0173de47651fddbc4..8e80c32bcf89183425809fe85ac320f4255535d3 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -429,7 +429,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   auto &TLI = *getTLI<ARMTargetLowering>();
   auto Subtarget = TLI.getSubtarget();
 
-  if (Subtarget->isThumb())
+  if (Subtarget->isThumb1Only())
     return false;
 
   // Quick exit if there aren't any args
@@ -500,6 +500,22 @@ struct CallReturnHandler : public IncomingValueHandler {
   MachineInstrBuilder MIB;
 };
 
+// FIXME: This should move to the ARMSubtarget when it supports all the opcodes.
+unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) {
+  if (isDirect)
+    return STI.isThumb() ? ARM::tBL : ARM::BL;
+
+  if (STI.isThumb())
+    return ARM::tBLXr;
+
+  if (STI.hasV5TOps())
+    return ARM::BLX;
+
+  if (STI.hasV4TOps())
+    return ARM::BX_CALL;
+
+  return ARM::BMOVPCRX_CALL;
+}
 } // end anonymous namespace
 
 bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
@@ -517,27 +533,34 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   if (STI.genLongCalls())
     return false;
 
+  if (STI.isThumb1Only())
+    return false;
+
   auto CallSeqStart = MIRBuilder.buildInstr(ARM::ADJCALLSTACKDOWN);
 
   // Create the call instruction so we can add the implicit uses of arg
   // registers, but don't insert it yet.
   bool isDirect = !Callee.isReg();
-  auto CallOpcode =
-      isDirect ? ARM::BL
-               : STI.hasV5TOps()
-                     ? ARM::BLX
-                     : STI.hasV4TOps() ? ARM::BX_CALL : ARM::BMOVPCRX_CALL;
-  auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode)
-                 .add(Callee)
-                 .addRegMask(TRI->getCallPreservedMask(MF, CallConv));
-  if (Callee.isReg()) {
+  auto CallOpcode = getCallOpcode(STI, isDirect);
+  auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode);
+
+  bool isThumb = STI.isThumb();
+  if (isThumb)
+    MIB.add(predOps(ARMCC::AL));
+
+  MIB.add(Callee);
+  if (!isDirect) {
     auto CalleeReg = Callee.getReg();
-    if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg))
-      MIB->getOperand(0).setReg(constrainOperandRegClass(
+    if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) {
+      unsigned CalleeIdx = isThumb ? 2 : 0;
+      MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
           MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(),
-          *MIB.getInstr(), MIB->getDesc(), Callee, 0));
+          *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx));
+    }
   }
 
+  MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv));
+
   SmallVector<ArgInfo, 8> ArgInfos;
   for (auto Arg : OrigArgs) {
     if (!isSupportedType(DL, TLI, Arg.Ty))
diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index 0bd1f9ca63918d253fe44f11c67198da294ad357..b631c2bc687b05cd3e63f3a94dbf31489242598d 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -109,24 +109,25 @@ EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false),
 namespace {
 class IRPromoter {
   SmallPtrSet<Value*, 8> NewInsts;
-  SmallVector<Instruction*, 4> InstsToRemove;
-  DenseMap<Value*, Type*> TruncTysMap;
+  SmallPtrSet<Instruction*, 4> InstsToRemove;
+  DenseMap<Value*, SmallVector<Type*, 4>> TruncTysMap;
   SmallPtrSet<Value*, 8> Promoted;
   Module *M = nullptr;
   LLVMContext &Ctx;
   IntegerType *ExtTy = nullptr;
   IntegerType *OrigTy = nullptr;
-
-  void PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
-                         SmallPtrSetImpl<Instruction*> &SafeToPromote);
-  void ExtendSources(SmallPtrSetImpl<Value*> &Sources);
-  void PromoteTree(SmallPtrSetImpl<Value*> &Visited,
-                   SmallPtrSetImpl<Value*> &Sources,
-                   SmallPtrSetImpl<Instruction*> &Sinks,
-                   SmallPtrSetImpl<Instruction*> &SafeToPromote);
-  void TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
-                     SmallPtrSetImpl<Instruction*> &Sinks);
-  void Cleanup(SmallPtrSetImpl<Instruction*> &Sinks);
+  SmallPtrSetImpl<Value*> *Visited;
+  SmallPtrSetImpl<Value*> *Sources;
+  SmallPtrSetImpl<Instruction*> *Sinks;
+  SmallPtrSetImpl<Instruction*> *SafeToPromote;
+
+  void ReplaceAllUsersOfWith(Value *From, Value *To);
+  void PrepareConstants(void);
+  void ExtendSources(void);
+  void ConvertTruncs(void);
+  void PromoteTree(void);
+  void TruncateSinks(void);
+  void Cleanup(void);
 
 public:
   IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
@@ -180,6 +181,22 @@ static bool generateSignBits(Value *V) {
          Opc == Instruction::SRem;
 }
 
+static bool EqualTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
+}
+
+static bool LessOrEqualTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize;
+}
+
+static bool GreaterThanTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize;
+}
+
+static bool LessThanTypeSize(Value *V) {
+  return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize;
+}
+
 /// Some instructions can use 8- and 16-bit operands, and we don't need to
 /// promote anything larger. We disallow booleans to make life easier when
 /// dealing with icmps but allow any other integer that is <= 16 bits. Void
@@ -194,15 +211,15 @@ static bool isSupportedType(Value *V) {
   if (auto *Ld = dyn_cast<LoadInst>(V))
     Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();
 
-  const IntegerType *IntTy = dyn_cast<IntegerType>(Ty);
-  if (!IntTy)
+  if (!isa<IntegerType>(Ty) ||
+      cast<IntegerType>(V->getType())->getBitWidth() == 1)
     return false;
 
-  return IntTy->getBitWidth() == ARMCodeGenPrepare::TypeSize;
+  return LessOrEqualTypeSize(V);
 }
 
 /// Return true if the given value is a source in the use-def chain, producing
-/// a narrow (i8, i16) value. These values will be zext to start the promotion
+/// a narrow 'TypeSize' value. These values will be zext to start the promotion
 /// of the tree to i32. We guarantee that these won't populate the upper bits
 /// of the register. ZExt on the loads will be free, and the same for call
 /// return values because we only accept ones that guarantee a zeroext ret val.
@@ -211,6 +228,7 @@ static bool isSupportedType(Value *V) {
 static bool isSource(Value *V) {
   if (!isa<IntegerType>(V->getType()))
     return false;
+
   // TODO Allow zext to be sources.
   if (isa<Argument>(V))
     return true;
@@ -221,7 +239,7 @@ static bool isSource(Value *V) {
   else if (auto *Call = dyn_cast<CallInst>(V))
     return Call->hasRetAttr(Attribute::AttrKind::ZExt);
   else if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return isSupportedType(Trunc);
+    return EqualTypeSize(Trunc);
   return false;
 }
 
@@ -232,20 +250,23 @@ static bool isSink(Value *V) {
   // TODO The truncate also isn't actually necessary because we would already
   // proved that the data value is kept within the range of the original data
   // type.
-  auto UsesNarrowValue = [](Value *V) {
-    return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
-  };
 
+  // Sinks are:
+  // - points where the value in the register is being observed, such as an
+  //   icmp, switch or store.
+  // - points where value types have to match, such as calls and returns.
+  // - zext are included to ease the transformation and are generally removed
+  //   later on.
   if (auto *Store = dyn_cast<StoreInst>(V))
-    return UsesNarrowValue(Store->getValueOperand());
+    return LessOrEqualTypeSize(Store->getValueOperand());
   if (auto *Return = dyn_cast<ReturnInst>(V))
-    return UsesNarrowValue(Return->getReturnValue());
-  if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return UsesNarrowValue(Trunc->getOperand(0));
+    return LessOrEqualTypeSize(Return->getReturnValue());
   if (auto *ZExt = dyn_cast<ZExtInst>(V))
-    return UsesNarrowValue(ZExt->getOperand(0));
+    return GreaterThanTypeSize(ZExt);
+  if (auto *Switch = dyn_cast<SwitchInst>(V))
+    return LessThanTypeSize(Switch->getCondition());
   if (auto *ICmp = dyn_cast<ICmpInst>(V))
-    return ICmp->isSigned();
+    return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0));
 
   return isa<CallInst>(V);
 }
@@ -416,23 +437,32 @@ static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
   llvm_unreachable("unhandled opcode for narrow intrinsic");
 }
 
-static void ReplaceAllUsersOfWith(Value *From, Value *To) {
+void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
   SmallVector<Instruction*, 4> Users;
   Instruction *InstTo = dyn_cast<Instruction>(To);
+  bool ReplacedAll = true;
+
+  LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To
+             << "\n");
+
   for (Use &U : From->uses()) {
     auto *User = cast<Instruction>(U.getUser());
-    if (InstTo && User->isIdenticalTo(InstTo))
+    if (InstTo && User->isIdenticalTo(InstTo)) {
+      ReplacedAll = false;
       continue;
+    }
     Users.push_back(User);
   }
 
   for (auto *U : Users)
     U->replaceUsesOfWith(From, To);
+
+  if (ReplacedAll)
+    if (auto *I = dyn_cast<Instruction>(From))
+      InstsToRemove.insert(I);
 }
 
-void
-IRPromoter::PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
-                             SmallPtrSetImpl<Instruction*> &SafeToPromote) {
+void IRPromoter::PrepareConstants() {
   IRBuilder<> Builder{Ctx};
   // First step is to prepare the instructions for mutation. Most constants
   // just need to be zero extended into their new type, but complications arise
@@ -453,12 +483,12 @@ IRPromoter::PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
   // immediate as operand 1, we create an equivalent instruction using a
   // positive immediate. That positive immediate can then be zext along with
   // all the other immediates later.
-  for (auto *V : Visited) {
+  for (auto *V : *Visited) {
     if (!isa<Instruction>(V))
       continue;
 
     auto *I = cast<Instruction>(V);
-    if (SafeToPromote.count(I)) {
+    if (SafeToPromote->count(I)) {
 
       if (!isa<OverflowingBinaryOperator>(I))
         continue;
@@ -483,16 +513,16 @@ IRPromoter::PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
           NewInst->copyIRFlags(I);
           NewInsts.insert(NewInst);
         }
-        InstsToRemove.push_back(I);
+        InstsToRemove.insert(I);
         I->replaceAllUsesWith(NewVal);
       }
     }
   }
   for (auto *I : NewInsts)
-    Visited.insert(I);
+    Visited->insert(I);
 }
 
-void IRPromoter::ExtendSources(SmallPtrSetImpl<Value*> &Sources) {
+void IRPromoter::ExtendSources() {
   IRBuilder<> Builder{Ctx};
 
   auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
@@ -510,13 +540,13 @@ void IRPromoter::ExtendSources(SmallPtrSetImpl<Value*> &Sources) {
         I->moveAfter(InsertPt);
       NewInsts.insert(I);
     }
+
     ReplaceAllUsersOfWith(V, ZExt);
-    TruncTysMap[ZExt] = TruncTysMap[V];
   };
 
   // Now, insert extending instructions between the sources and their users.
   LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
-  for (auto V : Sources) {
+  for (auto V : *Sources) {
     LLVM_DEBUG(dbgs() << " - " << *V << "\n");
     if (auto *I = dyn_cast<Instruction>(V))
       InsertZExt(I, I);
@@ -530,22 +560,19 @@ void IRPromoter::ExtendSources(SmallPtrSetImpl<Value*> &Sources) {
   }
 }
 
-void IRPromoter::PromoteTree(SmallPtrSetImpl<Value*> &Visited,
-                             SmallPtrSetImpl<Value*> &Sources,
-                             SmallPtrSetImpl<Instruction*> &Sinks,
-                             SmallPtrSetImpl<Instruction*> &SafeToPromote) {
+void IRPromoter::PromoteTree() {
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
 
   IRBuilder<> Builder{Ctx};
 
   // Mutate the types of the instructions within the tree. Here we handle
   // constant operands.
-  for (auto *V : Visited) {
-    if (Sources.count(V))
+  for (auto *V : *Visited) {
+    if (Sources->count(V))
       continue;
 
     auto *I = cast<Instruction>(V);
-    if (Sinks.count(I))
+    if (Sinks->count(I))
       continue;
 
     for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
@@ -568,15 +595,15 @@ void IRPromoter::PromoteTree(SmallPtrSetImpl<Value*> &Visited,
 
   // Finally, any instructions that should be promoted but haven't yet been,
   // need to be handled using intrinsics.
-  for (auto *V : Visited) {
+  for (auto *V : *Visited) {
     auto *I = dyn_cast<Instruction>(V);
     if (!I)
       continue;
 
-    if (Sources.count(I) || Sinks.count(I))
+    if (Sources->count(I) || Sinks->count(I))
       continue;
 
-    if (!shouldPromote(I) || SafeToPromote.count(I) || NewInsts.count(I))
+    if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I))
       continue;
   
     assert(EnableDSP && "DSP intrinisc insertion not enabled!");
@@ -590,29 +617,21 @@ void IRPromoter::PromoteTree(SmallPtrSetImpl<Value*> &Visited,
     Builder.SetCurrentDebugLocation(I->getDebugLoc());
     Value *Args[] = { I->getOperand(0), I->getOperand(1) };
     CallInst *Call = Builder.CreateCall(DSPInst, Args);
-    ReplaceAllUsersOfWith(I, Call);
-    InstsToRemove.push_back(I);
     NewInsts.insert(Call);
-    TruncTysMap[Call] = OrigTy;
+    ReplaceAllUsersOfWith(I, Call);
   }
 }
 
-void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
-                               SmallPtrSetImpl<Instruction*> &Sinks) {
+void IRPromoter::TruncateSinks() {
   LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
 
   IRBuilder<> Builder{Ctx};
 
-  auto InsertTrunc = [&](Value *V) -> Instruction* {
+  auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* {
     if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
       return nullptr;
 
-    if ((!Promoted.count(V) && !NewInsts.count(V)) || !TruncTysMap.count(V) ||
-        Sources.count(V))
-      return nullptr;
-
-    Type *TruncTy = TruncTysMap[V];
-    if (TruncTy == ExtTy)
+    if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V))
       return nullptr;
 
     LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for "
@@ -626,14 +645,15 @@ void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
 
   // Fix up any stores or returns that use the results of the promoted
   // chain.
-  for (auto I : Sinks) {
-    LLVM_DEBUG(dbgs() << " - " << *I << "\n");
+  for (auto I : *Sinks) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n");
 
     // Handle calls separately as we need to iterate over arg operands.
     if (auto *Call = dyn_cast<CallInst>(I)) {
       for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
         Value *Arg = Call->getArgOperand(i);
-        if (Instruction *Trunc = InsertTrunc(Arg)) {
+        Type *Ty = TruncTysMap[Call][i];
+        if (Instruction *Trunc = InsertTrunc(Arg, Ty)) {
           Trunc->moveBefore(Call);
           Call->setArgOperand(i, Trunc);
         }
@@ -641,44 +661,53 @@ void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
       continue;
     }
 
+    // Special case switches because we need to truncate the condition.
+    if (auto *Switch = dyn_cast<SwitchInst>(I)) {
+      Type *Ty = TruncTysMap[Switch][0];
+      if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) {
+        Trunc->moveBefore(Switch);
+        Switch->setCondition(Trunc);
+      }
+      continue;
+    }
+
     // Now handle the others.
     for (unsigned i = 0; i < I->getNumOperands(); ++i) {
-      if (Instruction *Trunc = InsertTrunc(I->getOperand(i))) {
+      Type *Ty = TruncTysMap[I][i];
+      if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) {
         Trunc->moveBefore(I);
         I->setOperand(i, Trunc);
       }
     }
   }
-
 }
 
-void IRPromoter::Cleanup(SmallPtrSetImpl<Instruction*> &Sinks) {
-  // Some zext sinks will now have become redundant, along with their trunc
-  // operands, so remove them.
-  for (auto I : Sinks) {
-    if (auto *ZExt = dyn_cast<ZExtInst>(I)) {
-      if (ZExt->getDestTy() != ExtTy)
-        continue;
+void IRPromoter::Cleanup() {
+  // Some zexts will now have become redundant, along with their trunc
+  // operands, so remove them
+  for (auto V : *Visited) {
+    if (!isa<CastInst>(V))
+      continue;
 
-      Value *Src = ZExt->getOperand(0);
-      if (ZExt->getSrcTy() == ZExt->getDestTy()) {
-        LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary zext\n");
-        ReplaceAllUsersOfWith(ZExt, Src);
-        InstsToRemove.push_back(ZExt);
-        continue;
-      }
+    auto ZExt = cast<CastInst>(V);
+    if (ZExt->getDestTy() != ExtTy)
+      continue;
 
-      // For any truncs that we insert to handle zexts, we can replace the
-      // result of the zext with the input to the trunc.
-      if (NewInsts.count(Src) && isa<TruncInst>(Src)) {
-        auto *Trunc = cast<TruncInst>(Src);
-        assert(Trunc->getOperand(0)->getType() == ExtTy &&
-               "expected inserted trunc to be operating on i32");
-        LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: "
-                   << *Trunc->getOperand(0));
-        ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
-        InstsToRemove.push_back(ZExt);
-      }
+    Value *Src = ZExt->getOperand(0);
+    if (ZExt->getSrcTy() == ZExt->getDestTy()) {
+      LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt
+                 << "\n");
+      ReplaceAllUsersOfWith(ZExt, Src);
+      continue;
+    }
+
+    // For any truncs that we insert to handle zexts, we can replace the
+    // result of the zext with the input to the trunc.
+    if (NewInsts.count(Src) && isa<ZExtInst>(V) && isa<TruncInst>(Src)) {
+      auto *Trunc = cast<TruncInst>(Src);
+      assert(Trunc->getOperand(0)->getType() == ExtTy &&
+             "expected inserted trunc to be operating on i32");
+      ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
     }
   }
 
@@ -694,6 +723,29 @@ void IRPromoter::Cleanup(SmallPtrSetImpl<Instruction*> &Sinks) {
   Promoted.clear();
 }
 
+void IRPromoter::ConvertTruncs() {
+  IRBuilder<> Builder{Ctx};
+
+  for (auto *V : *Visited) {
+    if (!isa<TruncInst>(V) || Sources->count(V))
+      continue;
+
+    auto *Trunc = cast<TruncInst>(V);
+    assert(LessThanTypeSize(Trunc) && "expected narrow trunc");
+
+    Builder.SetInsertPoint(Trunc);
+    unsigned NumBits =
+      cast<IntegerType>(Trunc->getType())->getScalarSizeInBits();
+    ConstantInt *Mask = ConstantInt::get(Ctx, APInt::getMaxValue(NumBits));
+    Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask);
+
+    if (auto *I = dyn_cast<Instruction>(Masked))
+      NewInsts.insert(I);
+
+    ReplaceAllUsersOfWith(Trunc, Masked);
+  }
+}
+
 void IRPromoter::Mutate(Type *OrigTy,
                         SmallPtrSetImpl<Value*> &Visited,
                         SmallPtrSetImpl<Value*> &Sources,
@@ -707,39 +759,49 @@ void IRPromoter::Mutate(Type *OrigTy,
   assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() &&
          "original type not smaller than extended type");
 
-  // Cache original types.
-  for (auto *V : Visited)
-    TruncTysMap[V] = V->getType();
+  this->Visited = &Visited;
+  this->Sources = &Sources;
+  this->Sinks = &Sinks;
+  this->SafeToPromote = &SafeToPromote;
+
+  // Cache original types of the values that will likely need truncating
+  for (auto *I : Sinks) {
+    if (auto *Call = dyn_cast<CallInst>(I)) {
+      for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
+        Value *Arg = Call->getArgOperand(i);
+        TruncTysMap[Call].push_back(Arg->getType());
+      }
+    } else if (auto *Switch = dyn_cast<SwitchInst>(I))
+      TruncTysMap[I].push_back(Switch->getCondition()->getType());
+    else {
+      for (unsigned i = 0; i < I->getNumOperands(); ++i)
+        TruncTysMap[I].push_back(I->getOperand(i)->getType());
+    }
+  }
 
   // Convert adds and subs using negative immediates to equivalent instructions
   // that use positive constants.
-  PrepareConstants(Visited, SafeToPromote);
+  PrepareConstants();
 
   // Insert zext instructions between sources and their users.
-  ExtendSources(Sources);
+  ExtendSources();
+
+  // Convert any truncs, that aren't sources, into AND masks.
+  ConvertTruncs();
 
   // Promote visited instructions, mutating their types in place. Also insert
   // DSP intrinsics, if enabled, for adds and subs which would be unsafe to
   // promote.
-  PromoteTree(Visited, Sources, Sinks, SafeToPromote);
+  PromoteTree();
 
   // Insert trunc instructions for use by calls, stores etc...
-  TruncateSinks(Sources, Sinks);
+  TruncateSinks();
 
   // Finally, remove unecessary zexts and truncs, delete old instructions and
   // clear the data structures.
-  Cleanup(Sinks);
+  Cleanup();
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
-  LLVM_DEBUG(dbgs();
-             for (auto *V : Sources)
-               V->dump();
-             for (auto *I : NewInsts)
-               I->dump();
-             for (auto *V : Visited) {
-               if (!Sources.count(V))
-                 V->dump();
-              });
+  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n");
 }
 
 /// We accept most instructions, as well as Arguments and ConstantInsts. We
@@ -747,8 +809,15 @@ void IRPromoter::Mutate(Type *OrigTy,
 /// return value is zeroext. We don't allow opcodes that can introduce sign
 /// bits.
 bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
-  if (isa<ICmpInst>(V))
-    return true;
+  if (auto *I = dyn_cast<ICmpInst>(V)) {
+    // Now that we allow small types than TypeSize, only allow icmp of
+    // TypeSize because they will require a trunc to be legalised.
+    // TODO: Allow icmp of smaller types, and calculate at the end
+    // whether the transform would be beneficial.
+    if (isa<PointerType>(I->getOperand(0)->getType()))
+      return true;
+    return EqualTypeSize(I->getOperand(0));
+  }
 
   // Memory instructions
   if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
@@ -766,12 +835,11 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
       isa<LoadInst>(V))
     return isSupportedType(V);
 
-  // Truncs can be either sources or sinks.
-  if (auto *Trunc = dyn_cast<TruncInst>(V))
-    return isSupportedType(Trunc) || isSupportedType(Trunc->getOperand(0));
+  if (isa<SExtInst>(V))
+    return false;
 
-  if (isa<CastInst>(V) && !isa<SExtInst>(V))
-    return isSupportedType(cast<CastInst>(V)->getOperand(0));
+  if (auto *Cast = dyn_cast<CastInst>(V))
+    return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));
 
   // Special cases for calls as we need to check for zeroext
   // TODO We should accept calls even if they don't have zeroext, as they can
@@ -901,13 +969,17 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
     // Calls can be both sources and sinks.
     if (isSink(V))
       Sinks.insert(cast<Instruction>(V));
+
     if (isSource(V))
       Sources.insert(V);
-    else if (auto *I = dyn_cast<Instruction>(V)) {
-      // Visit operands of any instruction visited.
-      for (auto &U : I->operands()) {
-        if (!AddLegalInst(U))
-          return false;
+
+    if (!isSink(V) && !isSource(V)) {
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        // Visit operands of any instruction visited.
+        for (auto &U : I->operands()) {
+          if (!AddLegalInst(U))
+            return false;
+        }
       }
     }
 
@@ -973,6 +1045,8 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) {
         if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
           continue;
 
+        LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
+
         for (auto &Op : CI.operands()) {
           if (auto *I = dyn_cast<Instruction>(Op))
             MadeChange |= TryToPromote(I);
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index a66cd7053c0aa2be3ee31b0844fec7b455b73cb7..a50abfdbee446188465f9c215fdf1725a218a9b6 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -2951,7 +2951,8 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
   unsigned ResultReg = MI->getOperand(0).getReg();
   if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
     return false;
-  MI->eraseFromParent();
+  MachineBasicBlock::iterator I(MI);
+  removeDeadCode(I, std::next(I));
   return true;
 }
 
@@ -2970,12 +2971,16 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
   unsigned ConstAlign =
       MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
   unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign);
+  MachineMemOperand *CPMMO =
+      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+                               MachineMemOperand::MOLoad, 4, 4);
 
   unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
   unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
   MachineInstrBuilder MIB =
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg)
-          .addConstantPoolIndex(Idx);
+          .addConstantPoolIndex(Idx)
+          .addMemOperand(CPMMO);
   if (Opc == ARM::LDRcp)
     MIB.addImm(0);
   MIB.add(predOps(ARMCC::AL));
@@ -2988,6 +2993,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
   MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
             .addReg(TempReg)
             .addImm(ARMPCLabelIndex);
+
   if (!Subtarget->isThumb())
     MIB.add(predOps(ARMCC::AL));
 
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 3e057e953d86a2a94b762b8ca3cd7f1485fc63fe..2417166ca962149fc429675f4934228763cac98f 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -911,6 +911,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
       assert(RegInfo->hasBasePointer(MF) &&
              "VLAs and dynamic stack alignment, but missing base pointer!");
       FrameReg = RegInfo->getBaseRegister();
+      Offset -= SPAdj;
     }
     return Offset;
   }
@@ -2157,9 +2158,15 @@ void ARMFrameLowering::adjustForSegmentedStacks(
 
   // Do not generate a prologue for leaf functions with a stack of size zero.
   // For non-leaf functions we have to allow for the possibility that the
-  // call is to a non-split function, as in PR37807.
-  if (StackSize == 0 && !MFI.hasTailCall())
+  // callis to a non-split function, as in PR37807. This function could also
+  // take the address of a non-split function. When the linker tries to adjust
+  // its non-existent prologue, it would fail with an error. Mark the object
+  // file so that such failures are not errors. See this Go language bug-report
+  // https://go-review.googlesource.com/c/go/+/148819/
+  if (StackSize == 0 && !MFI.hasTailCall()) {
+    MF.getMMI().setHasNosplitStack(true);
     return;
+  }
 
   // Use R4 and R5 as scratch registers.
   // We save R4 and R5 before use and restore them before leaving the function.
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 77622567d4b9b8ec357f1d4b06d592036676ce18..8e0e82388251ae6499998fd6c32cc2d8c875cfa3 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1763,12 +1763,14 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   default: llvm_unreachable("unhandled vld type");
     // Double-register operations:
   case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4f16:
   case MVT::v4i16: OpcodeIndex = 1; break;
   case MVT::v2f32:
   case MVT::v2i32: OpcodeIndex = 2; break;
   case MVT::v1i64: OpcodeIndex = 3; break;
     // Quad-register operations:
   case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8f16:
   case MVT::v8i16: OpcodeIndex = 1; break;
   case MVT::v4f32:
   case MVT::v4i32: OpcodeIndex = 2; break;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 56d2e510cb719761d1b0ed3200179a491148902a..8d5cf0a2c1c2180bc9172f705791628724464eac 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -984,7 +984,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
 
     // On v8, we have particularly efficient implementations of atomic fences
     // if they can be combined with nearby atomic loads and stores.
-    if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
+    if (!Subtarget->hasAcquireRelease() ||
+        getTargetMachine().getOptLevel() == 0) {
       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
       InsertFencesForAtomic = true;
     }
@@ -1282,6 +1283,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
 
   case ARMISD::CMOV:          return "ARMISD::CMOV";
+  case ARMISD::SUBS:          return "ARMISD::SUBS";
 
   case ARMISD::SSAT:          return "ARMISD::SSAT";
   case ARMISD::USAT:          return "ARMISD::USAT";
@@ -7794,6 +7796,50 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
   return LowerCallTo(CLI).first;
 }
 
+// This is a code size optimisation: return the original SDIV node to
+// DAGCombiner when we don't want to expand SDIV into a sequence of
+// instructions, and an empty node otherwise which will cause the
+// SDIV to be expanded in DAGCombine.
+SDValue
+ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                 SelectionDAG &DAG,
+                                 SmallVectorImpl<SDNode *> &Created) const {
+  // TODO: Support SREM
+  if (N->getOpcode() != ISD::SDIV)
+    return SDValue();
+
+  const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
+  const auto &MF = DAG.getMachineFunction();
+  const bool MinSize = MF.getFunction().optForMinSize();
+  const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
+                                      : ST.hasDivideInARMMode();
+
+  // Don't touch vector types; rewriting this may lead to scalarizing
+  // the int divs.
+  if (N->getOperand(0).getValueType().isVector())
+    return SDValue();
+
+  // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
+  // hwdiv support for this to be really profitable.
+  if (!(MinSize && HasDivide))
+    return SDValue();
+
+  // ARM mode is a bit simpler than Thumb: we can handle large power
+  // of 2 immediates with 1 mov instruction; no further checks required,
+  // just return the sdiv node.
+  if (!ST.isThumb())
+    return SDValue(N, 0);
+
+  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
+  // and thus lose the code size benefits of a MOVS that requires only 2.
+  // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
+  // but as it's doing exactly this, it's not worth the trouble to get TTI.
+  if (Divisor.sgt(128))
+    return SDValue();
+
+  return SDValue(N, 0);
+}
+
 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                             bool Signed) const {
   assert(Op.getValueType() == MVT::i32 &&
@@ -12663,30 +12709,38 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
                         DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
         Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
       }
-    } else if (CC == ARMCC::NE && LHS != RHS &&
+    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
                (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
       // This seems pointless but will allow us to combine it further below.
-      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
-      SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
+      SDValue Sub =
+          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
+      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
+                                          Sub.getValue(1), SDValue());
       Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
-                        N->getOperand(3), Cmp);
+                        N->getOperand(3), CPSRGlue.getValue(1));
+      FalseVal = Sub;
     }
   } else if (isNullConstant(TrueVal)) {
-    if (CC == ARMCC::EQ && LHS != RHS &&
+    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
         (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
       // This seems pointless but will allow us to combine it further below
       // Note that we change == for != as this is the dual for the case above.
-      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
-      SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
+      SDValue Sub =
+          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
+      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
+                                          Sub.getValue(1), SDValue());
       Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                         DAG.getConstant(ARMCC::NE, dl, MVT::i32),
-                        N->getOperand(3), Cmp);
+                        N->getOperand(3), CPSRGlue.getValue(1));
+      FalseVal = Sub;
     }
   }
 
   // On Thumb1, the DAG above may be further combined if z is a power of 2
   // (z == 2 ^ K).
-  // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
+  // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
   //       merge t3, t4
   // where t1 = (SUBCARRY (SUB x, y), z, 0)
   //       t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
@@ -12694,8 +12748,8 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
   //       t4 = (SUB 1, t2:1)   [ we want a carry, not a borrow ]
   const APInt *TrueConst;
   if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
-      (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
-      (FalseVal.getOperand(1) == RHS) &&
+      (FalseVal.getOpcode() == ARMISD::SUBS) &&
+      (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) &&
       (TrueConst = isPowerOf2Constant(TrueVal))) {
     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
     unsigned ShiftAmount = TrueConst->logBase2();
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index be53987a2af93fe939e920c583542e826dc53847..ccee7fa258ab4978bc7b3f8f4b4a5f3d2bd848b4 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -85,6 +85,7 @@ class VectorType;
       FMSTAT,       // ARM fmstat instruction.
 
       CMOV,         // ARM conditional move instructions.
+      SUBS,         // Flag-setting subtraction.
 
       SSAT,         // Signed saturation
       USAT,         // Unsigned saturation
@@ -694,6 +695,9 @@ class VectorType;
     unsigned getRegisterByName(const char* RegName, EVT VT,
                                SelectionDAG &DAG) const override;
 
+    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                          SmallVectorImpl<SDNode *> &Created) const override;
+
     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
     /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
     /// expanded to FMAs when this method returns true, otherwise fmuladd is
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 76f8414e8f0b87ef2bbded872d1f7952ed7f5f43..488af7bf12c76a62724244eafb96784fde21aa81 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -144,6 +144,7 @@ def ARMintretflag    : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
                               [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
                               [SDNPInGlue]>;
+def ARMsubs          : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>;
 
 def ARMssatnoshift   : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
 
@@ -3334,7 +3335,7 @@ multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
 
 let hasSideEffects = 0 in {
 
-let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
 defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
                          IIC_iLoad_mu>, ComplexDeprecationPredicate<"ARMLoad">;
 
@@ -3641,6 +3642,14 @@ let isAdd = 1 in
 defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>;
 defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
 
+def : ARMPat<(ARMsubs GPR:$Rn, mod_imm:$imm), (SUBSri $Rn, mod_imm:$imm)>;
+def : ARMPat<(ARMsubs GPR:$Rn, GPR:$Rm), (SUBSrr $Rn, $Rm)>;
+def : ARMPat<(ARMsubs GPR:$Rn, so_reg_imm:$shift),
+             (SUBSrsi $Rn, so_reg_imm:$shift)>;
+def : ARMPat<(ARMsubs GPR:$Rn, so_reg_reg:$shift),
+             (SUBSrsr $Rn, so_reg_reg:$shift)>;
+
+
 let isAdd = 1 in
 defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
 defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 3c153625b01684be5ff8fd55963738c71d47af3f..b20b34eaa6a90b28ca3ca7fc2641f656b96c05c7 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -781,7 +781,7 @@ defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
 // These require base address to be written back or one of the loaded regs.
 let hasSideEffects = 0 in {
 
-let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
 def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
         IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
   bits<3> Rn;
@@ -826,7 +826,8 @@ def : InstAlias<"ldm${p} $Rn!, $regs",
                 (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>,
         Requires<[IsThumb, IsThumb1Only]>;
 
-let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1,
+    variadicOpsAreDefs = 1 in
 def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
                IIC_iPop,
                "pop${p}\t$regs", []>,
@@ -1351,6 +1352,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
                 Sched<[WriteALU]>;
 }
 
+
+def : T1Pat<(ARMsubs tGPR:$Rn, tGPR:$Rm), (tSUBSrr $Rn, $Rm)>;
+def : T1Pat<(ARMsubs tGPR:$Rn, imm0_7:$imm3), (tSUBSi3 $Rn, imm0_7:$imm3)>;
+def : T1Pat<(ARMsubs tGPR:$Rn, imm0_255:$imm8), (tSUBSi8 $Rn, imm0_255:$imm8)>;
+
+
 // Sign-extend byte
 def tSXTB :                     // A8.6.222
   T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 0c5720e0267f584d7e7bd36b047cc041de5c1e06..4130dbd884572d341b80177bc0c67058b835b6a2 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -1775,7 +1775,7 @@ multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
 
 let hasSideEffects = 0 in {
 
-let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
 defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
 
 multiclass thumb2_st_mult<string asm, InstrItinClass itin,
@@ -2094,6 +2094,12 @@ defm t2SUB  : T2I_bin_ii12rs<0b101, "sub", sub>;
 defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>;
 defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>;
 
+def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_imm:$imm),
+            (t2SUBSri $Rn, t2_so_imm:$imm)>;
+def : T2Pat<(ARMsubs GPRnopc:$Rn, rGPR:$Rm), (t2SUBSrr $Rn, $Rm)>;
+def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
+            (t2SUBSrs $Rn, t2_so_reg:$ShiftedRm)>;
+
 let hasPostISelHook = 1 in {
 defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
 defm t2SBC  : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
@@ -4445,13 +4451,13 @@ def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
 def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
             (t2STRs     GPR:$val, t2addrmode_so_reg:$addr)>;
 
-let AddedComplexity = 8 in {
-  def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr),  (t2LDAB addr_offset_none:$addr)>;
-  def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
-  def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA  addr_offset_none:$addr)>;
-  def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (t2STLB GPR:$val, addr_offset_none:$addr)>;
-  def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
-  def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL  GPR:$val, addr_offset_none:$addr)>;
+let AddedComplexity = 8, Predicates = [IsThumb, HasAcquireRelease, HasV7Clrex] in {
+  def : Pat<(atomic_load_acquire_8 addr_offset_none:$addr),  (t2LDAB addr_offset_none:$addr)>;
+  def : Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
+  def : Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA  addr_offset_none:$addr)>;
+  def : Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (t2STLB GPR:$val, addr_offset_none:$addr)>;
+  def : Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
+  def : Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL  GPR:$val, addr_offset_none:$addr)>;
 }
 
 
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 6692a4d4142052b2d2e4cf754b432a0c39ccbc87..293e734c97cd6a7c600077151d46816515d9cf9d 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -76,6 +76,42 @@ private:
   const ARMRegisterBankInfo &RBI;
   const ARMSubtarget &STI;
 
+  // Store the opcodes that we might need, so we don't have to check what kind
+  // of subtarget (ARM vs Thumb) we have all the time.
+  struct OpcodeCache {
+    unsigned ZEXT16;
+    unsigned SEXT16;
+
+    unsigned ZEXT8;
+    unsigned SEXT8;
+
+    // Used for implementing ZEXT/SEXT from i1
+    unsigned AND;
+    unsigned RSB;
+
+    unsigned STORE32;
+    unsigned LOAD32;
+
+    unsigned STORE16;
+    unsigned LOAD16;
+
+    unsigned STORE8;
+    unsigned LOAD8;
+
+    OpcodeCache(const ARMSubtarget &STI);
+  } const Opcodes;
+
+  // Select the opcode for simple extensions (that translate to a single SXT/UXT
+  // instruction). Extension operations more complicated than that should not
+  // invoke this. Returns the original opcode if it doesn't know how to select a
+  // better one.
+  unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) const;
+
+  // Select the opcode for simple loads and stores. Returns the original opcode
+  // if it doesn't know how to select a better one.
+  unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
+                                 unsigned Size) const;
+
 #define GET_GLOBALISEL_PREDICATES_DECL
 #include "ARMGenGlobalISel.inc"
 #undef GET_GLOBALISEL_PREDICATES_DECL
@@ -107,7 +143,7 @@ ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM,
                                                const ARMSubtarget &STI,
                                                const ARMRegisterBankInfo &RBI)
     : InstructionSelector(), TII(*STI.getInstrInfo()),
-      TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI),
+      TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI), Opcodes(STI),
 #define GET_GLOBALISEL_PREDICATES_INIT
 #include "ARMGenGlobalISel.inc"
 #undef GET_GLOBALISEL_PREDICATES_INIT
@@ -225,41 +261,63 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB,
   return true;
 }
 
-/// Select the opcode for simple extensions (that translate to a single SXT/UXT
-/// instruction). Extension operations more complicated than that should not
-/// invoke this. Returns the original opcode if it doesn't know how to select a
-/// better one.
-static unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) {
+ARMInstructionSelector::OpcodeCache::OpcodeCache(const ARMSubtarget &STI) {
+  bool isThumb = STI.isThumb();
+
+  using namespace TargetOpcode;
+
+#define STORE_OPCODE(VAR, OPC) VAR = isThumb ? ARM::t2##OPC : ARM::OPC
+  STORE_OPCODE(SEXT16, SXTH);
+  STORE_OPCODE(ZEXT16, UXTH);
+
+  STORE_OPCODE(SEXT8, SXTB);
+  STORE_OPCODE(ZEXT8, UXTB);
+
+  STORE_OPCODE(AND, ANDri);
+  STORE_OPCODE(RSB, RSBri);
+
+  STORE_OPCODE(STORE32, STRi12);
+  STORE_OPCODE(LOAD32, LDRi12);
+
+  // LDRH/STRH are special...
+  STORE16 = isThumb ? ARM::t2STRHi12 : ARM::STRH;
+  LOAD16 = isThumb ? ARM::t2LDRHi12 : ARM::LDRH;
+
+  STORE_OPCODE(STORE8, STRBi12);
+  STORE_OPCODE(LOAD8, LDRBi12);
+#undef MAP_OPCODE
+}
+
+unsigned ARMInstructionSelector::selectSimpleExtOpc(unsigned Opc,
+                                                    unsigned Size) const {
   using namespace TargetOpcode;
 
   if (Size != 8 && Size != 16)
     return Opc;
 
   if (Opc == G_SEXT)
-    return Size == 8 ? ARM::SXTB : ARM::SXTH;
+    return Size == 8 ? Opcodes.SEXT8 : Opcodes.SEXT16;
 
   if (Opc == G_ZEXT)
-    return Size == 8 ? ARM::UXTB : ARM::UXTH;
+    return Size == 8 ? Opcodes.ZEXT8 : Opcodes.ZEXT16;
 
   return Opc;
 }
 
-/// Select the opcode for simple loads and stores. For types smaller than 32
-/// bits, the value will be zero extended. Returns the original opcode if it
-/// doesn't know how to select a better one.
-static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
-                                      unsigned Size) {
+unsigned ARMInstructionSelector::selectLoadStoreOpCode(unsigned Opc,
+                                                       unsigned RegBank,
+                                                       unsigned Size) const {
   bool isStore = Opc == TargetOpcode::G_STORE;
 
   if (RegBank == ARM::GPRRegBankID) {
     switch (Size) {
     case 1:
     case 8:
-      return isStore ? ARM::STRBi12 : ARM::LDRBi12;
+      return isStore ? Opcodes.STORE8 : Opcodes.LOAD8;
     case 16:
-      return isStore ? ARM::STRH : ARM::LDRH;
+      return isStore ? Opcodes.STORE16 : Opcodes.LOAD16;
     case 32:
-      return isStore ? ARM::STRi12 : ARM::LDRi12;
+      return isStore ? Opcodes.STORE32 : Opcodes.LOAD32;
     default:
       return Opc;
     }
@@ -702,7 +760,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
     switch (SrcSize) {
     case 1: {
       // ZExt boils down to & 0x1; for SExt we also subtract that from 0
-      I.setDesc(TII.get(ARM::ANDri));
+      I.setDesc(TII.get(Opcodes.AND));
       MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp());
 
       if (isSExt) {
@@ -714,7 +772,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
 
         auto InsertBefore = std::next(I.getIterator());
         auto SubI =
-            BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::RSBri))
+            BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(Opcodes.RSB))
                 .addDef(SExtResult)
                 .addUse(AndResult)
                 .addImm(0)
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 891418306903c5f19cb970bfee98329e6395024d..4a0c24d58474c73e2478a52744ef0a36831a6ad0 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -75,13 +75,48 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
   const LLT s32 = LLT::scalar(32);
   const LLT s64 = LLT::scalar(64);
 
-  getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
-  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+  if (ST.isThumb1Only()) {
+    // Thumb1 is not supported yet.
+    computeTables();
+    verify(*ST.getInstrInfo());
+    return;
+  }
+
+  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
+      .legalForCartesianProduct({s32}, {s1, s8, s16});
 
   getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
       .legalFor({s32})
       .minScalar(0, s32);
 
+  getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
+  getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}});
+
+  getActionDefinitionsBuilder(G_CONSTANT)
+      .legalFor({s32, p0})
+      .clampScalar(0, s32, s32);
+
+  // We're keeping these builders around because we'll want to add support for
+  // floating point to them.
+  auto &LoadStoreBuilder =
+      getActionDefinitionsBuilder({G_LOAD, G_STORE})
+          .legalForTypesWithMemSize({
+              {s1, p0, 8},
+              {s8, p0, 8},
+              {s16, p0, 16},
+              {s32, p0, 32},
+              {p0, p0, 32}});
+
+  if (ST.isThumb()) {
+    // FIXME: merge with the code for non-Thumb.
+    computeTables();
+    verify(*ST.getInstrInfo());
+    return;
+  }
+
+  getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
+  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+
   if (ST.hasDivideInARMMode())
     getActionDefinitionsBuilder({G_SDIV, G_UDIV})
         .legalFor({s32})
@@ -101,14 +136,24 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
       setAction({Op, s32}, Libcall);
   }
 
-  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
-      .legalForCartesianProduct({s32}, {s1, s8, s16});
-
-  getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
-  getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}});
-
   getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}).legalFor({s32});
 
+  if (ST.hasV5TOps()) {
+    getActionDefinitionsBuilder(G_CTLZ)
+        .legalFor({s32})
+        .clampScalar(0, s32, s32);
+    getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+        .lowerFor({s32})
+        .clampScalar(0, s32, s32);
+  } else {
+    getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+        .libcallFor({s32})
+        .clampScalar(0, s32, s32);
+    getActionDefinitionsBuilder(G_CTLZ)
+        .lowerFor({s32})
+        .clampScalar(0, s32, s32);
+  }
+
   getActionDefinitionsBuilder(G_GEP).legalFor({{p0, s32}});
 
   getActionDefinitionsBuilder(G_SELECT).legalForCartesianProduct({s32, p0},
@@ -116,20 +161,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
 
   getActionDefinitionsBuilder(G_BRCOND).legalFor({s1});
 
-  getActionDefinitionsBuilder(G_CONSTANT)
-      .legalFor({s32, p0})
-      .clampScalar(0, s32, s32);
-
   getActionDefinitionsBuilder(G_ICMP)
       .legalForCartesianProduct({s1}, {s32, p0})
       .minScalar(1, s32);
 
   // We're keeping these builders around because we'll want to add support for
   // floating point to them.
-  auto &LoadStoreBuilder =
-      getActionDefinitionsBuilder({G_LOAD, G_STORE})
-          .legalForCartesianProduct({s1, s8, s16, s32, p0}, {p0});
-
   auto &PhiBuilder =
       getActionDefinitionsBuilder(G_PHI).legalFor({s32, p0}).minScalar(0, s32);
 
@@ -302,7 +339,8 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate,
 
 bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
-                                      MachineIRBuilder &MIRBuilder) const {
+                                      MachineIRBuilder &MIRBuilder,
+                                      GISelChangeObserver &Observer) const {
   using namespace TargetOpcode;
 
   MIRBuilder.setInstr(MI);
diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h
index 78ab9412c04bae3eaafdb670350694964c247f99..527bf87f1093043191f1e36951b48ab7e3118c5e 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/lib/Target/ARM/ARMLegalizerInfo.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H
 
 #include "llvm/ADT/IndexedMap.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/IR/Instructions.h"
@@ -29,7 +30,8 @@ public:
   ARMLegalizerInfo(const ARMSubtarget &ST);
 
   bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
-                      MachineIRBuilder &MIRBuilder) const override;
+                      MachineIRBuilder &MIRBuilder,
+                      GISelChangeObserver &Observer) const override;
 
 private:
   void setFCmpLibcallsGNU();
diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
index 3ab9298c1108ca00b60b5752b3b6619080e62b6d..fc3258914f92b4069f2a90db48e67f1e733a1ae0 100644
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -99,6 +99,8 @@ namespace {
         for (auto *V : RHS)
           AllValues.push_back(V);
       }
+
+    bool AreSymmetrical(BinOpChain *Other);
   };
 
   struct Reduction {
@@ -106,9 +108,9 @@ namespace {
                                       // pattern matching.
     Instruction     *AccIntAdd;       // The accumulating integer add statement,
                                       // i.e, the reduction statement.
-
     OpChainList     MACCandidates;    // The MAC candidates associated with
                                       // this reduction statement.
+    PMACPairList    PMACPairs;
     Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
   };
 
@@ -121,10 +123,13 @@ namespace {
     Loop              *L;
     const DataLayout  *DL;
     Module            *M;
+    std::map<LoadInst*, LoadInst*> LoadPairs;
+    std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
 
-    bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
+    bool RecordSequentialLoads(BasicBlock *Header);
+    bool InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
-    PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
+    void CreateParallelMACPairs(Reduction &R);
     Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                  Instruction *Acc, bool Exchange,
                                  Instruction *InsertAfter);
@@ -202,6 +207,12 @@ namespace {
 
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
+
+      if (!RecordSequentialLoads(Header)) {
+        LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
+        return false;
+      }
+
       Changes = MatchSMLAD(F);
       return Changes;
     }
@@ -254,58 +265,14 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) {
   return false;
 }
 
-// Element-by-element comparison of Value lists returning true if they are
-// instructions with the same opcode or constants with the same value.
-static bool AreSymmetrical(const ValueList &VL0,
-                           const ValueList &VL1) {
-  if (VL0.size() != VL1.size()) {
-    LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
-                      << VL0.size() << " != " << VL1.size() << "\n");
-    return false;
-  }
-
-  const unsigned Pairs = VL0.size();
-  LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
-
-  for (unsigned i = 0; i < Pairs; ++i) {
-    const Value *V0 = VL0[i];
-    const Value *V1 = VL1[i];
-    const auto *Inst0 = dyn_cast<Instruction>(V0);
-    const auto *Inst1 = dyn_cast<Instruction>(V1);
-
-    LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
-               dbgs() << "mul1: "; V0->dump();
-               dbgs() << "mul2: "; V1->dump());
-
-    if (!Inst0 || !Inst1)
-      return false;
-
-    if (Inst0->isSameOperationAs(Inst1)) {
-      LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
-      continue;
-    }
-
-    const APInt *C0, *C1;
-    if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
-      return false;
-  }
-
-  LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
-  return true;
-}
-
 template<typename MemInst>
 static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
-                                  MemInstList &VecMem, const DataLayout &DL,
-                                  ScalarEvolution &SE) {
+                                  const DataLayout &DL, ScalarEvolution &SE) {
   if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
     LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
     return false;
   }
   if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
-    VecMem.clear();
-    VecMem.push_back(MemOp0);
-    VecMem.push_back(MemOp1);
     LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
     return true;
   }
@@ -328,16 +295,106 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
     return false;
   }
 
-  return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
+  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
+    return false;
+
+  VecMem.clear();
+  VecMem.push_back(Ld0);
+  VecMem.push_back(Ld1);
+  return true;
 }
 
-PMACPairList
-ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
+/// Iterate through the block and record base, offset pairs of loads as well as
+/// maximal sequences of sequential loads.
+bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
+  SmallVector<LoadInst*, 8> Loads;
+  for (auto &I : *Header) {
+    auto *Ld = dyn_cast<LoadInst>(&I);
+    if (!Ld)
+      continue;
+    Loads.push_back(Ld);
+  }
+
+  std::map<LoadInst*, LoadInst*> BaseLoads;
+
+  for (auto *Ld0 : Loads) {
+    for (auto *Ld1 : Loads) {
+      if (Ld0 == Ld1)
+        continue;
+
+      if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
+        LoadPairs[Ld0] = Ld1;
+        if (BaseLoads.count(Ld0)) {
+          LoadInst *Base = BaseLoads[Ld0];
+          BaseLoads[Ld1] = Base;
+          SequentialLoads[Base].push_back(Ld1);
+        } else {
+          BaseLoads[Ld1] = Ld0;
+          SequentialLoads[Ld0].push_back(Ld1);
+        }
+      }
+    }
+  }
+  return LoadPairs.size() > 1;
+}
+
+void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
+  OpChainList &Candidates = R.MACCandidates;
+  PMACPairList &PMACPairs = R.PMACPairs;
   const unsigned Elems = Candidates.size();
-  PMACPairList PMACPairs;
 
   if (Elems < 2)
-    return PMACPairs;
+    return;
+
+  auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) {
+    if (!PMul0->AreSymmetrical(PMul1))
+      return false;
+
+    // The first elements of each vector should be loads with sexts. If we
+    // find that its two pairs of consecutive loads, then these can be
+    // transformed into two wider loads and the users can be replaced with
+    // DSP intrinsics.
+    for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
+      auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
+      auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
+      auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
+      auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
+
+      if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
+        return false;
+
+      LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
+                 << "\t Ld0: " << *Ld0 << "\n"
+                 << "\t Ld1: " << *Ld1 << "\n"
+                 << "and operands " << x + 2 << ":\n"
+                 << "\t Ld2: " << *Ld2 << "\n"
+                 << "\t Ld3: " << *Ld3 << "\n");
+
+      if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
+        if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+          LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+          PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+          return true;
+        } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
+          LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+          LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
+          PMul1->Exchange = true;
+          PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+          return true;
+        }
+      } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
+                 AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+        LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
+        LLVM_DEBUG(dbgs() << "    and swapping muls\n");
+        PMul0->Exchange = true;
+        // Only the second operand can be exchanged, so swap the muls.
+        PMACPairs.push_back(std::make_pair(PMul1, PMul0));
+        return true;
+      }
+    }
+    return false;
+  };
 
   SmallPtrSet<const Instruction*, 4> Paired;
   for (unsigned i = 0; i < Elems; ++i) {
@@ -364,77 +421,21 @@ ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
                  dbgs() << "- "; Mul0->dump();
                  dbgs() << "- "; Mul1->dump());
 
-      const ValueList &Mul0_LHS = PMul0->LHS;
-      const ValueList &Mul0_RHS = PMul0->RHS;
-      const ValueList &Mul1_LHS = PMul1->LHS;
-      const ValueList &Mul1_RHS = PMul1->RHS;
-
-      if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
-          !AreSymmetrical(Mul0_RHS, Mul1_RHS))
-        continue;
-
       LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
-      // The first elements of each vector should be loads with sexts. If we
-      // find that its two pairs of consecutive loads, then these can be
-      // transformed into two wider loads and the users can be replaced with
-      // DSP intrinsics.
-      bool Found = false;
-      for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
-        auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
-        auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
-        auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
-        auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
-
-        if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
-          continue;
-
-        LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
-                   << "\t Ld0: " << *Ld0 << "\n"
-                   << "\t Ld1: " << *Ld1 << "\n"
-                   << "and operands " << x + 2 << ":\n"
-                   << "\t Ld2: " << *Ld2 << "\n"
-                   << "\t Ld3: " << *Ld3 << "\n");
-
-        if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
-          if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
-            LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
-            PMACPairs.push_back(std::make_pair(PMul0, PMul1));
-            Found = true;
-          } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
-            LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
-            LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
-            PMul1->Exchange = true;
-            PMACPairs.push_back(std::make_pair(PMul0, PMul1));
-            Found = true;
-          }
-        } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd)) {
-          if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
-            LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
-            LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
-            LLVM_DEBUG(dbgs() << "    and swapping muls\n");
-            PMul0->Exchange = true;
-            // Only the second operand can be exchanged, so swap the muls.
-            PMACPairs.push_back(std::make_pair(PMul1, PMul0));
-            Found = true;
-          }
-        }
-      }
-      if (Found) {
+      if (CanPair(PMul0, PMul1)) {
         Paired.insert(Mul0);
         Paired.insert(Mul1);
         break;
       }
     }
   }
-  return PMACPairs;
 }
 
-bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
-                                        PMACPairList &PMACPairs) {
+bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
   Instruction *Acc = Reduction.Phi;
   Instruction *InsertAfter = Reduction.AccIntAdd;
 
-  for (auto &Pair : PMACPairs) {
+  for (auto &Pair : Reduction.PMACPairs) {
     BinOpChain *PMul0 = Pair.first;
     BinOpChain *PMul1 = Pair.second;
     LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
@@ -524,26 +525,20 @@ static void MatchParallelMACSequences(Reduction &R,
     if (!I)
       return false;
 
-    Value *MulOp0, *MulOp1;
-
     switch (I->getOpcode()) {
     case Instruction::Add:
       if (Match(I->getOperand(0)) || (Match(I->getOperand(1))))
         return true;
       break;
-    case Instruction::Mul:
-      if (match (I, (m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
+    case Instruction::Mul: {
+      Value *MulOp0 = I->getOperand(0);
+      Value *MulOp1 = I->getOperand(1);
+      if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1))
         AddMACCandidate(Candidates, I, MulOp0, MulOp1);
-        return false;
-      }
-      break;
+      return false;
+    }
     case Instruction::SExt:
-      if (match (I, (m_SExt(m_Mul(m_Value(MulOp0), m_Value(MulOp1)))))) {
-        Instruction *Mul = cast<Instruction>(I->getOperand(0));
-        AddMACCandidate(Candidates, Mul, MulOp0, MulOp1);
-        return false;
-      }
-      break;
+      return Match(I->getOperand(0));
     }
     return false;
   };
@@ -685,8 +680,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
   for (auto &R : Reductions) {
     if (AreAliased(AA, Reads, Writes, R.MACCandidates))
       return false;
-    PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
-    Changed |= InsertParallelMACs(R, PMACPairs);
+    CreateParallelMACPairs(R);
+    Changed |= InsertParallelMACs(R);
   }
 
   LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
@@ -733,6 +728,52 @@ Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
   return Call;
 }
 
+// Compare the value lists in Other to this chain.
+bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
+  // Element-by-element comparison of Value lists returning true if they are
+  // instructions with the same opcode or constants with the same value.
+  auto CompareValueList = [](const ValueList &VL0,
+                             const ValueList &VL1) {
+    if (VL0.size() != VL1.size()) {
+      LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
+                        << VL0.size() << " != " << VL1.size() << "\n");
+      return false;
+    }
+
+    const unsigned Pairs = VL0.size();
+    LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
+
+    for (unsigned i = 0; i < Pairs; ++i) {
+      const Value *V0 = VL0[i];
+      const Value *V1 = VL1[i];
+      const auto *Inst0 = dyn_cast<Instruction>(V0);
+      const auto *Inst1 = dyn_cast<Instruction>(V1);
+
+      LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
+                dbgs() << "mul1: "; V0->dump();
+                dbgs() << "mul2: "; V1->dump());
+
+      if (!Inst0 || !Inst1)
+        return false;
+
+      if (Inst0->isSameOperationAs(Inst1)) {
+        LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+        continue;
+      }
+
+      const APInt *C0, *C1;
+      if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
+        return false;
+    }
+
+    LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
+    return true;
+  };
+
+  return CompareValueList(LHS, Other->LHS) &&
+         CompareValueList(RHS, Other->RHS);
+}
+
 Pass *llvm::createARMParallelDSPPass() {
   return new ARMParallelDSP();
 }
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 0e16d6bcfe2b27fc3df8ccde6bca7cd30e4c9249..4f28f2dafc70ab4e31c978443a1d28191fe250cf 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -234,6 +234,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case G_GEP:
   case G_INTTOPTR:
   case G_PTRTOINT:
+  case G_CTLZ:
     // FIXME: We're abusing the fact that everything lives in a GPR for now; in
     // the real world we would use different mappings.
     OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 519f789fc2156607f14e73500a59b2ebde81b99a..ec02c840d5e17e75b5e2962cc38b1a9f996258dd 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -194,12 +194,6 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 /// Create an ARM architecture model.
 ///
 ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
@@ -210,7 +204,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
                                            CodeGenOpt::Level OL, bool isLittle)
     : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
                         CPU, FS, Options, getEffectiveRelocModel(TT, RM),
-                        getEffectiveCodeModel(CM), OL),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TargetABI(computeTargetABI(TT, CPU, Options)),
       TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) {
 
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index f837db56264c5970d94950a0824656a3d4ac7a50..3832b0112b87b627a41abbad7821dbb43f883b59 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMFeatures.h"
+#include "InstPrinter/ARMInstPrinter.h"
 #include "Utils/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
@@ -3205,17 +3206,26 @@ public:
 } // end anonymous namespace.
 
 void ARMOperand::print(raw_ostream &OS) const {
+  auto RegName = [](unsigned Reg) {
+    if (Reg)
+      return ARMInstPrinter::getRegisterName(Reg);
+    else
+      return "noreg";
+  };
+
   switch (Kind) {
   case k_CondCode:
     OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
     break;
   case k_CCOut:
-    OS << "<ccout " << getReg() << ">";
+    OS << "<ccout " << RegName(getReg()) << ">";
     break;
   case k_ITCondMask: {
     static const char *const MaskStr[] = {
-      "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)",
-      "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)"
+      "(invalid)", "(teee)", "(tee)", "(teet)",
+      "(te)",      "(tete)", "(tet)", "(tett)",
+      "(t)",       "(ttee)", "(tte)", "(ttet)",
+      "(tt)",      "(ttte)", "(ttt)", "(tttt)"
     };
     assert((ITMask.Mask & 0xf) == ITMask.Mask);
     OS << "<it-mask " << MaskStr[ITMask.Mask] << ">";
@@ -3249,13 +3259,25 @@ void ARMOperand::print(raw_ostream &OS) const {
     OS << "<ARM_TSB::" << TraceSyncBOptToString(getTraceSyncBarrierOpt()) << ">";
     break;
   case k_Memory:
-    OS << "<memory "
-       << " base:" << Memory.BaseRegNum;
+    OS << "<memory";
+    if (Memory.BaseRegNum)
+      OS << " base:" << RegName(Memory.BaseRegNum);
+    if (Memory.OffsetImm)
+      OS << " offset-imm:" << *Memory.OffsetImm;
+    if (Memory.OffsetRegNum)
+      OS << " offset-reg:" << (Memory.isNegative ? "-" : "")
+         << RegName(Memory.OffsetRegNum);
+    if (Memory.ShiftType != ARM_AM::no_shift) {
+      OS << " shift-type:" << ARM_AM::getShiftOpcStr(Memory.ShiftType);
+      OS << " shift-imm:" << Memory.ShiftImm;
+    }
+    if (Memory.Alignment)
+      OS << " alignment:" << Memory.Alignment;
     OS << ">";
     break;
   case k_PostIndexRegister:
     OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-")
-       << PostIdxReg.RegNum;
+       << RegName(PostIdxReg.RegNum);
     if (PostIdxReg.ShiftTy != ARM_AM::no_shift)
       OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " "
          << PostIdxReg.ShiftImm;
@@ -3271,23 +3293,21 @@ void ARMOperand::print(raw_ostream &OS) const {
     break;
   }
   case k_Register:
-    OS << "<register " << getReg() << ">";
+    OS << "<register " << RegName(getReg()) << ">";
     break;
   case k_ShifterImmediate:
     OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl")
        << " #" << ShifterImm.Imm << ">";
     break;
   case k_ShiftedRegister:
-    OS << "<so_reg_reg "
-       << RegShiftedReg.SrcReg << " "
-       << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy)
-       << " " << RegShiftedReg.ShiftReg << ">";
+    OS << "<so_reg_reg " << RegName(RegShiftedReg.SrcReg) << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy) << " "
+       << RegName(RegShiftedReg.ShiftReg) << ">";
     break;
   case k_ShiftedImmediate:
-    OS << "<so_reg_imm "
-       << RegShiftedImm.SrcReg << " "
-       << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy)
-       << " #" << RegShiftedImm.ShiftImm << ">";
+    OS << "<so_reg_imm " << RegName(RegShiftedImm.SrcReg) << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy) << " #"
+       << RegShiftedImm.ShiftImm << ">";
     break;
   case k_RotateImmediate:
     OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
@@ -3311,7 +3331,7 @@ void ARMOperand::print(raw_ostream &OS) const {
     const SmallVectorImpl<unsigned> &RegList = getRegList();
     for (SmallVectorImpl<unsigned>::const_iterator
            I = RegList.begin(), E = RegList.end(); I != E; ) {
-      OS << *I;
+      OS << RegName(*I);
       if (++I < E) OS << ", ";
     }
 
@@ -3320,15 +3340,15 @@ void ARMOperand::print(raw_ostream &OS) const {
   }
   case k_VectorList:
     OS << "<vector_list " << VectorList.Count << " * "
-       << VectorList.RegNum << ">";
+       << RegName(VectorList.RegNum) << ">";
     break;
   case k_VectorListAllLanes:
     OS << "<vector_list(all lanes) " << VectorList.Count << " * "
-       << VectorList.RegNum << ">";
+       << RegName(VectorList.RegNum) << ">";
     break;
   case k_VectorListIndexed:
     OS << "<vector_list(lane " << VectorList.LaneIndex << ") "
-       << VectorList.Count << " * " << VectorList.RegNum << ">";
+       << VectorList.Count << " * " << RegName(VectorList.RegNum) << ">";
     break;
   case k_Token:
     OS << "'" << getToken() << "'";
@@ -9157,33 +9177,9 @@ bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const {
 
   // Any arithmetic instruction which writes to the PC also terminates the IT
   // block.
-  for (unsigned OpIdx = 0; OpIdx < MCID.getNumDefs(); ++OpIdx) {
-    MCOperand &Op = Inst.getOperand(OpIdx);
-    if (Op.isReg() && Op.getReg() == ARM::PC)
-      return true;
-  }
-
-  if (MCID.hasImplicitDefOfPhysReg(ARM::PC, MRI))
+  if (MCID.hasDefOfPhysReg(Inst, ARM::PC, *MRI))
     return true;
 
-  // Instructions with variable operand lists, which write to the variable
-  // operands. We only care about Thumb instructions here, as ARM instructions
-  // obviously can't be in an IT block.
-  switch (Inst.getOpcode()) {
-  case ARM::tLDMIA:
-  case ARM::t2LDMIA:
-  case ARM::t2LDMIA_UPD:
-  case ARM::t2LDMDB:
-  case ARM::t2LDMDB_UPD:
-    if (listContainsReg(Inst, 3, ARM::PC))
-      return true;
-    break;
-  case ARM::tPOP:
-    if (listContainsReg(Inst, 2, ARM::PC))
-      return true;
-    break;
-  }
-
   return false;
 }
 
@@ -9290,6 +9286,10 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 
   switch (MatchResult) {
   case Match_Success:
+    LLVM_DEBUG(dbgs() << "Parsed as: ";
+               Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
+               dbgs() << "\n");
+
     // Context sensitive operand constraints aren't handled by the matcher,
     // so check them here.
     if (validateInstruction(Inst, Operands)) {
@@ -9307,7 +9307,9 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       // individual transformations can chain off each other. E.g.,
       // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
       while (processInstruction(Inst, Operands, Out))
-        ;
+        LLVM_DEBUG(dbgs() << "Changed to: ";
+                   Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
+                   dbgs() << "\n");
 
       // Only after the instruction is fully processed, we can validate it
       if (wasInITBlock && hasV8Ops() && isThumb() &&
diff --git a/lib/Target/ARM/AsmParser/LLVMBuild.txt b/lib/Target/ARM/AsmParser/LLVMBuild.txt
index 061885ad4fac836d2e9f8a681f8fb7b3e5778d04..19058aeadb956c886dbfb857ded878c07c55daae 100644
--- a/lib/Target/ARM/AsmParser/LLVMBuild.txt
+++ b/lib/Target/ARM/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = ARMAsmParser
 parent = ARM
-required_libraries = ARMDesc ARMInfo MC MCParser Support ARMUtils
+required_libraries = ARMDesc ARMInfo ARMAsmPrinter MC MCParser Support ARMUtils
 add_to_library_groups = ARM
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 4b4956e914f267bf8e8f131d76f2ae16df6b14f5..0ced8195790d6c5ba78cd0fde12f08e9648b31c4 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -22,6 +22,8 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ScopedPrinter.h"
+
 using namespace llvm;
 
 namespace {
@@ -144,6 +146,15 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
                                  MCValue Target,
                                  uint64_t &FixedValue) {
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+
+  if (FixupOffset & 0xff000000) {
+    Asm.getContext().reportError(Fixup.getLoc(),
+                                 "can not encode offset '0x" +
+                                     to_hexString(FixupOffset) +
+                                     "' in resulting scattered relocation.");
+    return;
+  }
+
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
   unsigned Type = MachO::ARM_RELOC_HALF;
 
@@ -250,6 +261,15 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
                                                     unsigned Log2Size,
                                                     uint64_t &FixedValue) {
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+
+  if (FixupOffset & 0xff000000) {
+    Asm.getContext().reportError(Fixup.getLoc(),
+                                 "can not encode offset '0x" +
+                                     to_hexString(FixupOffset) +
+                                     "' in resulting scattered relocation.");
+    return;
+  }
+
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
 
   // See <reloc.h>.
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 1a91a7030657918e16bc30d91609b30756b8dfc8..d567d333904922f285b765ba6581decab970d920 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -146,9 +146,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
       MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
       MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
 
-  if (RC == &ARM::GPRRegClass   || RC == &ARM::tGPRRegClass ||
-      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
-      RC == &ARM::GPRnopcRegClass) {
+  if (ARM::GPRRegClass.hasSubClassEq(RC)) {
     BuildMI(MBB, I, DL, get(ARM::t2STRi12))
         .addReg(SrcReg, getKillRegState(isKill))
         .addFrameIndex(FI)
@@ -190,9 +188,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
 
-  if (RC == &ARM::GPRRegClass   || RC == &ARM::tGPRRegClass ||
-      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
-      RC == &ARM::GPRnopcRegClass) {
+  if (ARM::GPRRegClass.hasSubClassEq(RC)) {
     BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
         .addFrameIndex(FI)
         .addImm(0)
diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp
index 74300d9a451c372dce86787652dc94e5e1c9311a..9828cdab68c313465e7be17f073892b3000adfb8 100644
--- a/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/lib/Target/AVR/AVRTargetMachine.cpp
@@ -40,12 +40,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return RM.hasValue() ? *RM : Reloc::Static;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT,
                                    StringRef CPU, StringRef FS,
                                    const TargetOptions &Options,
@@ -53,8 +47,8 @@ AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT,
                                    Optional<CodeModel::Model> CM,
                                    CodeGenOpt::Level OL, bool JIT)
     : LLVMTargetMachine(T, AVRDataLayout, TT, getCPU(CPU), FS, Options,
-                        getEffectiveRelocModel(RM), getEffectiveCodeModel(CM),
-                        OL),
+                        getEffectiveRelocModel(RM),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       SubTarget(TT, getCPU(CPU), FS, *this) {
   this->TLOF = make_unique<AVRTargetObjectFile>();
   initAsmInfo();
diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index d57cc098497fce7fe80f28058954e7e2e460c5aa..f2bb59265271b4625115e9a051cfbd47453e5c71 100644
--- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -34,8 +34,9 @@
 
 #define DEBUG_TYPE "avr-asm-parser"
 
-namespace llvm {
+using namespace llvm;
 
+namespace {
 /// Parses AVR assembly from a stream.
 class AVRAsmParser : public MCTargetAsmParser {
   const MCSubtargetInfo &STI;
@@ -245,6 +246,8 @@ public:
   }
 };
 
+} // end anonymous namespace.
+
 // Auto-generated Match Functions
 
 /// Maps from the set of all register names to a register number.
@@ -510,6 +513,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands) {
     case AsmToken::Real:
       if (!tryParseExpression(Operands))
         return false;
+      break;
     default:
       break;
     }
@@ -708,5 +712,3 @@ unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
   }
   return Match_InvalidOperand;
 }
-
-} // end of namespace llvm
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
index 52b1bb00316a931eee9d142bcbba848ac08498c1..350465b118ed227a88f2637fcbe4b0d19c6cafc1 100644
--- a/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -51,12 +51,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
                                    StringRef CPU, StringRef FS,
                                    const TargetOptions &Options,
@@ -64,8 +58,8 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
                                    Optional<CodeModel::Model> CM,
                                    CodeGenOpt::Level OL, bool JIT)
     : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
-                        getEffectiveRelocModel(RM), getEffectiveCodeModel(CM),
-                        OL),
+                        getEffectiveRelocModel(RM),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index dcab9bfb019a6898a64c5190050a655527e728bd..2eb1f0fc8bd94d6ce6b2e7849fde5e6c9278e85c 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -313,8 +313,6 @@ public:
   bool iss30_2Imm() const { return true; }
   bool iss29_3Imm() const { return true; }
   bool iss27_2Imm() const { return CheckImmRange(27, 2, true, true, false); }
-  bool iss10_0Imm() const { return CheckImmRange(10, 0, true, false, false); }
-  bool iss10_6Imm() const { return CheckImmRange(10, 6, true, false, false); }
   bool iss9_0Imm() const { return CheckImmRange(9, 0, true, false, false); }
   bool iss8_0Imm() const { return CheckImmRange(8, 0, true, false, false); }
   bool iss8_0Imm64() const { return CheckImmRange(8, 0, true, true, false); }
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index a409fff0d342266524e44046a69a97d1a0d3a2d6..428b42eba30da1a4e8e730645e1fb60f5445d67d 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -117,6 +117,10 @@ DecodeGeneralDoubleLow8RegsRegisterClass(MCInst &Inst, unsigned RegNo,
 static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo,
                                              uint64_t Address,
                                              const void *Decoder);
+static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
 static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                                 uint64_t Address,
                                                 const void *Decoder);
@@ -145,62 +149,7 @@ static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
                                     uint64_t /*Address*/, const void *Decoder);
 static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
                                     const void *Decoder);
-
-static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                   const void *Decoder) {
-  signedDecoder<4>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                    const void *Decoder) {
-  signedDecoder<14>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                   const void *Decoder) {
-  signedDecoder<8>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                   const void *Decoder) {
-  signedDecoder<7>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                    const void *Decoder) {
-  signedDecoder<12>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                   const void *Decoder) {
-  signedDecoder<3>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                    const void *Decoder) {
-  signedDecoder<13>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                   const void *Decoder) {
-  signedDecoder<6>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                   const void *Decoder) {
-  signedDecoder<9>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                   const void *Decoder) {
-  signedDecoder<5>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t,
-                                   const void *Decoder) {
-  signedDecoder<6>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
+#include "HexagonDepDecoders.h"
 #include "HexagonGenDisassemblerTables.inc"
 
 static MCDisassembler *createHexagonDisassembler(const Target &T,
@@ -663,6 +612,18 @@ static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo,
   return (DecodeRegisterClass(Inst, RegNo >> 1, HvxWRDecoderTable));
 }
 
+LLVM_ATTRIBUTE_UNUSED  // Suppress warning temporarily.
+static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t /*Address*/,
+                                              const void *Decoder) {
+  static const MCPhysReg HvxVQRDecoderTable[] = {
+      Hexagon::VQ0,  Hexagon::VQ1,  Hexagon::VQ2,  Hexagon::VQ3,
+      Hexagon::VQ4,  Hexagon::VQ5,  Hexagon::VQ6,  Hexagon::VQ7};
+
+  return DecodeRegisterClass(Inst, RegNo >> 2, HvxVQRDecoderTable);
+}
+
 static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                                 uint64_t /*Address*/,
                                                 const void *Decoder) {
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 8853dd6d550943db24d4bac73132eaa4eeb362b7..868353e188329acdcfcebec441cc6c0b2e72065a 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -25,6 +25,9 @@ include "llvm/Target/Target.td"
 include "HexagonDepArch.td"
 
 // Hexagon ISA Extensions
+def ExtensionZReg: SubtargetFeature<"zreg", "UseZRegOps", "true",
+      "Hexagon ZReg extension instructions">;
+
 def ExtensionHVX: SubtargetFeature<"hvx", "HexagonHVXVersion",
       "Hexagon::ArchEnum::V60", "Hexagon HVX instructions">;
 def ExtensionHVXV60: SubtargetFeature<"hvxv60", "HexagonHVXVersion",
@@ -32,10 +35,14 @@ def ExtensionHVXV60: SubtargetFeature<"hvxv60", "HexagonHVXVersion",
       [ExtensionHVX]>;
 def ExtensionHVXV62: SubtargetFeature<"hvxv62", "HexagonHVXVersion",
       "Hexagon::ArchEnum::V62", "Hexagon HVX instructions",
-      [ExtensionHVX,ExtensionHVXV60]>;
+      [ExtensionHVX, ExtensionHVXV60]>;
 def ExtensionHVXV65: SubtargetFeature<"hvxv65", "HexagonHVXVersion",
       "Hexagon::ArchEnum::V65", "Hexagon HVX instructions",
-      [ExtensionHVX,ExtensionHVXV60, ExtensionHVXV62]>;
+      [ExtensionHVX, ExtensionHVXV60, ExtensionHVXV62]>;
+def ExtensionHVXV66: SubtargetFeature<"hvxv66", "HexagonHVXVersion",
+      "Hexagon::ArchEnum::V66", "Hexagon HVX instructions",
+      [ExtensionHVX, ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65,
+       ExtensionZReg]>;
 
 def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",
       "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;
@@ -60,6 +67,9 @@ def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true",
       "Enable generation of duplex instruction">;
 def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19",
       "true", "Reserve register R19">;
+def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim",
+      "NoreturnStackElim", "true",
+      "Eliminate stack allocation in a noreturn function when possible">;
 
 //===----------------------------------------------------------------------===//
 // Hexagon Instruction Predicate Definitions.
@@ -78,6 +88,10 @@ def UseHVXV62          : Predicate<"HST->useHVXOps()">,
                          AssemblerPredicate<"ExtensionHVXV62">;
 def UseHVXV65          : Predicate<"HST->useHVXOps()">,
                          AssemblerPredicate<"ExtensionHVXV65">;
+def UseHVXV66          : Predicate<"HST->useHVXOps()">,
+                         AssemblerPredicate<"ExtensionHVXV66">;
+def UseZReg            : Predicate<"HST->useZRegOps()">,
+                         AssemblerPredicate<"ExtensionZReg">;
 
 def Hvx64:  HwMode<"+hvx-length64b">;
 def Hvx128: HwMode<"+hvx-length128b">;
@@ -309,8 +323,6 @@ include "HexagonPatternsHVX.td"
 include "HexagonPatternsV65.td"
 include "HexagonDepMappings.td"
 include "HexagonIntrinsics.td"
-include "HexagonMapAsm2IntrinV62.gen.td"
-include "HexagonMapAsm2IntrinV65.gen.td"
 
 def HexagonInstrInfo : InstrInfo;
 
@@ -346,6 +358,10 @@ def : Proc<"hexagonv65", HexagonModelV65,
            [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65,
             FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ,
             FeatureNVS, FeaturePackets, FeatureSmallData]>;
+def : Proc<"hexagonv66", HexagonModelV66,
+           [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66,
+            FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ,
+            FeatureNVS, FeaturePackets, FeatureSmallData]>;
 
 //===----------------------------------------------------------------------===//
 // Declare the target which we are implementing
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index 94aacbed6af697bd19fa93931eedd58d96fc0b0a..92b6da871a4c7826a72d84f9e74912ef9c3c2719 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -93,11 +93,12 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
   const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
   unsigned ID = RC.getID();
   uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub));
-  auto &HRI = static_cast<const HexagonRegisterInfo&>(TRI);
+  const auto &HRI = static_cast<const HexagonRegisterInfo&>(TRI);
   bool IsSubLo = (Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo));
   switch (ID) {
     case Hexagon::DoubleRegsRegClassID:
     case Hexagon::HvxWRRegClassID:
+    case Hexagon::HvxVQRRegClassID:
       return IsSubLo ? BT::BitMask(0, RW-1)
                      : BT::BitMask(RW, 2*RW-1);
     default:
@@ -114,9 +115,13 @@ uint16_t HexagonEvaluator::getPhysRegBitWidth(unsigned Reg) const {
   assert(TargetRegisterInfo::isPhysicalRegister(Reg));
 
   using namespace Hexagon;
-  for (auto &RC : {HvxVRRegClass, HvxWRRegClass, HvxQRRegClass})
-    if (RC.contains(Reg))
-      return TRI.getRegSizeInBits(RC);
+  const auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  if (HST.useHVXOps()) {
+    for (auto &RC : {HvxVRRegClass, HvxWRRegClass, HvxQRRegClass,
+                     HvxVQRRegClass})
+      if (RC.contains(Reg))
+        return TRI.getRegSizeInBits(RC);
+  }
   // Default treatment for other physical registers.
   if (const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg))
     return TRI.getRegSizeInBits(*RC);
@@ -142,6 +147,8 @@ const TargetRegisterClass &HexagonEvaluator::composeWithSubRegIndex(
       return Hexagon::IntRegsRegClass;
     case Hexagon::HvxWRRegClassID:
       return Hexagon::HvxVRRegClass;
+    case Hexagon::HvxVQRRegClassID:
+      return Hexagon::HvxWRRegClass;
     default:
       break;
   }
diff --git a/lib/Target/Hexagon/HexagonDepArch.h b/lib/Target/Hexagon/HexagonDepArch.h
index 1bcf40220619f0e2ece0da2e36122c01139937c5..dff2b2f471d0f1bb83ac53517d0960980a3bb869 100644
--- a/lib/Target/Hexagon/HexagonDepArch.h
+++ b/lib/Target/Hexagon/HexagonDepArch.h
@@ -1,4 +1,4 @@
-//===- HexagonDepArch.h ---------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,12 +10,11 @@
 //===----------------------------------------------------------------------===//
 
 
-
 #ifndef HEXAGON_DEP_ARCH_H
 #define HEXAGON_DEP_ARCH_H
 namespace llvm {
 namespace Hexagon {
-enum class ArchEnum { NoArch,Generic,V5,V55,V60,V62,V65 };
+enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66 };
 } // namespace Hexagon
 } // namespace llvm;
 #endif // HEXAGON_DEP_ARCH_H
diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td
index ce7956926101b92fe369ddf0115e99bfb4e4e6b4..f1aadae555c8604fed185b243e4d089cf1979d90 100644
--- a/lib/Target/Hexagon/HexagonDepArch.td
+++ b/lib/Target/Hexagon/HexagonDepArch.td
@@ -1,4 +1,4 @@
-//===- HexagonDepArch.td --------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,7 +9,8 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
+def ArchV66: SubtargetFeature<"v66", "HexagonArchVersion", "Hexagon::ArchEnum::V66", "Enable Hexagon V66 architecture">;
+def HasV66 : Predicate<"HST->hasV66Ops()">, AssemblerPredicate<"ArchV66">;
 def ArchV65: SubtargetFeature<"v65", "HexagonArchVersion", "Hexagon::ArchEnum::V65", "Enable Hexagon V65 architecture">;
 def HasV65 : Predicate<"HST->hasV65Ops()">, AssemblerPredicate<"ArchV65">;
 def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "Hexagon::ArchEnum::V62", "Enable Hexagon V62 architecture">;
@@ -19,3 +20,4 @@ def HasV60 : Predicate<"HST->hasV60Ops()">, AssemblerPredicate<"ArchV60">;
 def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "Hexagon::ArchEnum::V55", "Enable Hexagon V55 architecture">;
 def HasV55 : Predicate<"HST->hasV55Ops()">, AssemblerPredicate<"ArchV55">;
 def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "Hexagon::ArchEnum::V5", "Enable Hexagon V5 architecture">;
+def HasV5 : Predicate<"HST->hasV5Ops()">, AssemblerPredicate<"ArchV5">;
diff --git a/lib/Target/Hexagon/HexagonDepDecoders.h b/lib/Target/Hexagon/HexagonDepDecoders.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f78412f45d24f4b656cdc67ca4633bb453b013d
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepDecoders.h
@@ -0,0 +1,79 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Automatically generated file, please consult code owner before editing.
+//===----------------------------------------------------------------------===//
+
+// clang-format off
+
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-function"
+#endif
+
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<4>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<14>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<8>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<7>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<12>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<3>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<13>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<6>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<9>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<5>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<6>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+// clang-format on
diff --git a/lib/Target/Hexagon/HexagonDepIICHVX.td b/lib/Target/Hexagon/HexagonDepIICHVX.td
index b27cdae81a283a52b0f284d9f868dda8674bf532..9e3dea9f3e9b00c7e4318a084fc8e9f91444f6bb 100644
--- a/lib/Target/Hexagon/HexagonDepIICHVX.td
+++ b/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -1,4 +1,4 @@
-//===- HexagonDepIICHVX.td ------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,1849 +9,2549 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
-def tc_0317c6ca : InstrItinClass;
-def tc_1b93bdc6 : InstrItinClass;
-def tc_2171ebae : InstrItinClass;
-def tc_28978789 : InstrItinClass;
-def tc_29841470 : InstrItinClass;
-def tc_316c637c : InstrItinClass;
-def tc_354299ad : InstrItinClass;
-def tc_35e92f8e : InstrItinClass;
-def tc_38208312 : InstrItinClass;
-def tc_4105d6b5 : InstrItinClass;
-def tc_41f4b64e : InstrItinClass;
-def tc_41f99e1c : InstrItinClass;
-def tc_45453b98 : InstrItinClass;
-def tc_4e2a5159 : InstrItinClass;
-def tc_4f190ba3 : InstrItinClass;
-def tc_4fd8566e : InstrItinClass;
-def tc_51cd3aab : InstrItinClass;
-def tc_5a9fc4ec : InstrItinClass;
-def tc_5c03dc63 : InstrItinClass;
-def tc_5c120602 : InstrItinClass;
-def tc_5cbf490b : InstrItinClass;
-def tc_63e3d94c : InstrItinClass;
-def tc_644584f8 : InstrItinClass;
-def tc_66bb62ea : InstrItinClass;
-def tc_69b6dd20 : InstrItinClass;
-def tc_6b78cf13 : InstrItinClass;
-def tc_6fd9ad30 : InstrItinClass;
-def tc_71337255 : InstrItinClass;
-def tc_72ad7b54 : InstrItinClass;
-def tc_7474003e : InstrItinClass;
-def tc_77a4c701 : InstrItinClass;
-def tc_7c3f55c4 : InstrItinClass;
-def tc_7e9f581b : InstrItinClass;
-def tc_7fa82b08 : InstrItinClass;
-def tc_7fa8b40f : InstrItinClass;
-def tc_85d237e3 : InstrItinClass;
-def tc_8a6eb39a : InstrItinClass;
-def tc_8b6a873f : InstrItinClass;
-def tc_908a4c8c : InstrItinClass;
-def tc_9311da3f : InstrItinClass;
-def tc_94f43c04 : InstrItinClass;
-def tc_9777e6bf : InstrItinClass;
-def tc_97c165b9 : InstrItinClass;
-def tc_98733e9d : InstrItinClass;
-def tc_99093773 : InstrItinClass;
-def tc_9b9642a1 : InstrItinClass;
-def tc_9c267309 : InstrItinClass;
-def tc_a3127e12 : InstrItinClass;
-def tc_a4c9df3b : InstrItinClass;
-def tc_a807365d : InstrItinClass;
-def tc_aedb9f9e : InstrItinClass;
-def tc_b06ab583 : InstrItinClass;
-def tc_b712833a : InstrItinClass;
-def tc_b77635b4 : InstrItinClass;
-def tc_bbaf280e : InstrItinClass;
-def tc_bf142ae2 : InstrItinClass;
-def tc_bfe309d5 : InstrItinClass;
-def tc_c00bf9c9 : InstrItinClass;
-def tc_c4b515c5 : InstrItinClass;
-def tc_cbf6d1dc : InstrItinClass;
-def tc_cedf314b : InstrItinClass;
-def tc_d2cb81ea : InstrItinClass;
-def tc_d5090f3e : InstrItinClass;
-def tc_d642eff3 : InstrItinClass;
-def tc_d725e5b0 : InstrItinClass;
-def tc_d7bea0ec : InstrItinClass;
-def tc_d98f4d63 : InstrItinClass;
-def tc_da979fb3 : InstrItinClass;
-def tc_db5b9e2f : InstrItinClass;
-def tc_df54ad52 : InstrItinClass;
-def tc_e172d86a : InstrItinClass;
-def tc_e231aa4f : InstrItinClass;
-def tc_e3748cdf : InstrItinClass;
-def tc_e5053c8f : InstrItinClass;
-def tc_e6299d16 : InstrItinClass;
-def tc_eb669007 : InstrItinClass;
-def tc_ec58f88a : InstrItinClass;
-def tc_eda67dcd : InstrItinClass;
-def tc_ee927c0e : InstrItinClass;
-def tc_f3fc3f83 : InstrItinClass;
-def tc_fa99dc24 : InstrItinClass;
+def tc_04da405a : InstrItinClass;
+def tc_05058f6f : InstrItinClass;
+def tc_05ac6f98 : InstrItinClass;
+def tc_05ca8cfd : InstrItinClass;
+def tc_08a4f1b6 : InstrItinClass;
+def tc_0b04c6c7 : InstrItinClass;
+def tc_0ec46cf9 : InstrItinClass;
+def tc_131f1c81 : InstrItinClass;
+def tc_1381a97c : InstrItinClass;
+def tc_15fdf750 : InstrItinClass;
+def tc_16ff9ef8 : InstrItinClass;
+def tc_191381c1 : InstrItinClass;
+def tc_1ad8a370 : InstrItinClass;
+def tc_1ba8a0cd : InstrItinClass;
+def tc_20a4bbec : InstrItinClass;
+def tc_257f6f7c : InstrItinClass;
+def tc_26a377fe : InstrItinClass;
+def tc_2c745bb8 : InstrItinClass;
+def tc_2d4051cd : InstrItinClass;
+def tc_2e8f5f6e : InstrItinClass;
+def tc_309dbb4f : InstrItinClass;
+def tc_3904b926 : InstrItinClass;
+def tc_3aacf4a8 : InstrItinClass;
+def tc_3ad719fb : InstrItinClass;
+def tc_3c56e5ce : InstrItinClass;
+def tc_3ce09744 : InstrItinClass;
+def tc_3e2aaafc : InstrItinClass;
+def tc_447d9895 : InstrItinClass;
+def tc_453fe68d : InstrItinClass;
+def tc_46d6c3e0 : InstrItinClass;
+def tc_51d0ecc3 : InstrItinClass;
+def tc_52447ecc : InstrItinClass;
+def tc_540c3da3 : InstrItinClass;
+def tc_54a0dc47 : InstrItinClass;
+def tc_561aaa58 : InstrItinClass;
+def tc_56c4f9fe : InstrItinClass;
+def tc_56e64202 : InstrItinClass;
+def tc_58d21193 : InstrItinClass;
+def tc_5bf8afbb : InstrItinClass;
+def tc_61bf7c03 : InstrItinClass;
+def tc_649072c2 : InstrItinClass;
+def tc_660769f1 : InstrItinClass;
+def tc_663c80a7 : InstrItinClass;
+def tc_6942b6e0 : InstrItinClass;
+def tc_6e7fa133 : InstrItinClass;
+def tc_71646d06 : InstrItinClass;
+def tc_7177e272 : InstrItinClass;
+def tc_718b5c53 : InstrItinClass;
+def tc_7273323b : InstrItinClass;
+def tc_7417e785 : InstrItinClass;
+def tc_767c4e9d : InstrItinClass;
+def tc_7e6a3e89 : InstrItinClass;
+def tc_8772086c : InstrItinClass;
+def tc_87adc037 : InstrItinClass;
+def tc_8e420e4d : InstrItinClass;
+def tc_90bcc1db : InstrItinClass;
+def tc_933f2b39 : InstrItinClass;
+def tc_946013d8 : InstrItinClass;
+def tc_9d1dc972 : InstrItinClass;
+def tc_9f363d21 : InstrItinClass;
+def tc_a02a10a8 : InstrItinClass;
+def tc_a0dbea28 : InstrItinClass;
+def tc_a7e6707d : InstrItinClass;
+def tc_ab23f776 : InstrItinClass;
+def tc_abe8c3b2 : InstrItinClass;
+def tc_ac4046bc : InstrItinClass;
+def tc_af25efd9 : InstrItinClass;
+def tc_b091f1c6 : InstrItinClass;
+def tc_b28e51aa : InstrItinClass;
+def tc_b4416217 : InstrItinClass;
+def tc_b9db8205 : InstrItinClass;
+def tc_c0749f3c : InstrItinClass;
+def tc_c127de3a : InstrItinClass;
+def tc_c4edf264 : InstrItinClass;
+def tc_c5dba46e : InstrItinClass;
+def tc_c7039829 : InstrItinClass;
+def tc_cd94bfe0 : InstrItinClass;
+def tc_d8287c14 : InstrItinClass;
+def tc_db5555f3 : InstrItinClass;
+def tc_dd5b0695 : InstrItinClass;
+def tc_df80eeb0 : InstrItinClass;
+def tc_e2d2e9e5 : InstrItinClass;
+def tc_e35c1e93 : InstrItinClass;
+def tc_e3f68a46 : InstrItinClass;
+def tc_e675c45a : InstrItinClass;
+def tc_e699ae41 : InstrItinClass;
+def tc_e8797b98 : InstrItinClass;
+def tc_e99d4c2e : InstrItinClass;
+def tc_f1de44ef : InstrItinClass;
+def tc_f21e8abb : InstrItinClass;
+def tc_fd7610da : InstrItinClass;
 
 class DepHVXItinV55 {
   list<InstrItinData> DepHVXItinV55_list = [
-    InstrItinData <tc_0317c6ca, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_1b93bdc6, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [1, 2, 5],
+    InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
       [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_2171ebae, /*SLOT0123,VA_DV*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_28978789, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [3, 2],
-      [HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_29841470, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_316c637c, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_354299ad, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_35e92f8e, /*SLOT0,NOSLOT1,LOAD,VP*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_38208312, /*SLOT01,LOAD*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_4105d6b5, /*SLOT0123,VP*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 2],
-      [HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_41f4b64e, /*SLOT0123,VS*/
+    InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ALL]>], [],
+      []>,
 
-    InstrItinData <tc_41f99e1c, /*SLOT23,VX_DV*/
+    InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_45453b98, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4e2a5159, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4f190ba3, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_4fd8566e, /*SLOT0,NOSLOT1,LOAD,VP*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_51cd3aab, /*SLOT01,LOAD*/
+    InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5a9fc4ec, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_5c03dc63, /*SLOT0,STORE*/
+    InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST]>], [3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5c120602, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5cbf490b, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_63e3d94c, /*SLOT1,LOAD,VA*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_644584f8, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
-      [HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_66bb62ea, /*SLOT1,LOAD,VA*/
-      [InstrStage<1, [SLOT1], 0>,
+    InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_69b6dd20, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6b78cf13, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
-      [HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_6fd9ad30, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_71337255, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
-      [HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_72ad7b54, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_7474003e, /*SLOT2,VX_DV*/
-      [InstrStage<1, [SLOT2], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_77a4c701, /*SLOT01,LOAD*/
+    InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD]>], [9, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7c3f55c4, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_7e9f581b, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_7fa82b08, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_540c3da3, /*SLOT0,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_7fa8b40f, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+      [Hex_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_85d237e3, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_8a6eb39a, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
-      [HVX_FWD]>,
-
-    InstrItinData <tc_8b6a873f, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_908a4c8c, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+    InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_9311da3f, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+    InstrItinData <tc_56e64202, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_94f43c04, /*SLOT0,STORE,VA_DV*/
+    InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_9777e6bf, /*SLOT0,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
-      [Hex_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_97c165b9, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [9, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_98733e9d, /*SLOT1,LOAD,VA_DV*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_99093773, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
-      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_649072c2, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9b9642a1, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9c267309, /*SLOT01,LOAD*/
+    InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a3127e12, /*SLOT0123,VA*/
+    InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
       [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_a4c9df3b, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_7177e272, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_a807365d, /*SLOT23,VS_VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
-       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+      [HVX_FWD]>,
 
-    InstrItinData <tc_aedb9f9e, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_b06ab583, /*SLOT0123,VA*/
+    InstrItinData <tc_7417e785, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
-      [HVX_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b712833a, /*SLOT01,LOAD,VA*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [3, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b77635b4, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [2],
-      [Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_bbaf280e, /*SLOT0123,VA*/
+    InstrItinData <tc_8772086c, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_bf142ae2, /*SLOT0123,VP*/
+    InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bfe309d5, /*SLOT1,LOAD,VA_DV*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_c00bf9c9, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+    InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c4b515c5, /*SLOT0123,VP*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+    InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cbf6d1dc, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_cedf314b, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_946013d8, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [3],
-      [HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d2cb81ea, /*SLOT0123,VS*/
+    InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d5090f3e, /*SLOT0,STORE*/
+    InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d642eff3, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d725e5b0, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_d7bea0ec, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_d98f4d63, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_da979fb3, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_db5b9e2f, /*SLOT0,STORE*/
+    InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_df54ad52, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ST]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_e172d86a, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e231aa4f, /*SLOT23,VX*/
+    InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
       [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e3748cdf, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e5053c8f, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [],
-      []>,
+       InstrStage<1, [CVI_ALL]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_e6299d16, /*SLOT0123,VP*/
+    InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5],
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
       [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_eb669007, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ec58f88a, /*SLOT0,STORE,VA_DV*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_eda67dcd, /*SLOT23,VX_DV*/
+    InstrItinData <tc_c127de3a, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_ee927c0e, /*SLOT23,VS_VX*/
+    InstrItinData <tc_c4edf264, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
-       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_f3fc3f83, /*SLOT0123,VP*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_fa99dc24, /*SLOT2,VX_DV*/
-      [InstrStage<1, [SLOT2], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>
-  ];
-}
+    InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-class DepHVXItinV60 {
-  list<InstrItinData> DepHVXItinV60_list = [
-    InstrItinData <tc_0317c6ca, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+       InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_1b93bdc6, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [1, 2, 5],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2171ebae, /*SLOT0123,VA_DV*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_28978789, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [3, 2],
-      [HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_29841470, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_316c637c, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
       [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_354299ad, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_35e92f8e, /*SLOT0,NOSLOT1,LOAD,VP*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_38208312, /*SLOT01,LOAD*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_4105d6b5, /*SLOT0123,VP*/
+    InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 2],
-      [HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_41f4b64e, /*SLOT0123,VS*/
+    InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ALL]>], [3],
+      [HVX_FWD]>,
 
-    InstrItinData <tc_41f99e1c, /*SLOT23,VX_DV*/
+    InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_45453b98, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4e2a5159, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_4f190ba3, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4fd8566e, /*SLOT0,NOSLOT1,LOAD,VP*/
+    InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_51cd3aab, /*SLOT01,LOAD*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_5a9fc4ec, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_5c03dc63, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+  ];
+}
 
-    InstrItinData <tc_5c120602, /*SLOT0123,VP_VS*/
+class DepHVXItinV60 {
+  list<InstrItinData> DepHVXItinV60_list = [
+    InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_5cbf490b, /*SLOT01,LOAD,VA*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
+    InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_63e3d94c, /*SLOT1,LOAD,VA*/
+    InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
       [InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_644584f8, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
-      [HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_66bb62ea, /*SLOT1,LOAD,VA*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_69b6dd20, /*SLOT23,VX*/
+    InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
       [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6b78cf13, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
-      [HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_6fd9ad30, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_71337255, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
-      [HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_72ad7b54, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ALL]>], [],
+      []>,
 
-    InstrItinData <tc_7474003e, /*SLOT2,VX_DV*/
-      [InstrStage<1, [SLOT2], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+    InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_77a4c701, /*SLOT01,LOAD*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7c3f55c4, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_7e9f581b, /*SLOT23,VX_DV*/
+    InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
       [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7fa82b08, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ST]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7fa8b40f, /*SLOT0123,VS*/
+    InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_85d237e3, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8a6eb39a, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
-      [HVX_FWD]>,
-
-    InstrItinData <tc_8b6a873f, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_908a4c8c, /*SLOT23,VX*/
+    InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9311da3f, /*SLOT23,VX*/
+    InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_94f43c04, /*SLOT0,STORE,VA_DV*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_9777e6bf, /*SLOT0,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
-      [Hex_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_97c165b9, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_98733e9d, /*SLOT1,LOAD,VA_DV*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_99093773, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
-      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9b9642a1, /*SLOT0123,VS*/
+    InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_9c267309, /*SLOT01,LOAD*/
+    InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+       InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a3127e12, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a4c9df3b, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_a807365d, /*SLOT23,VS_VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
-       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_aedb9f9e, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_b06ab583, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
-      [HVX_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_b712833a, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_b77635b4, /*SLOT0123,4SLOT*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [2],
-      [Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bbaf280e, /*SLOT0123,VA*/
+    InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_bf142ae2, /*SLOT0123,VP*/
+    InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_bfe309d5, /*SLOT1,LOAD,VA_DV*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c00bf9c9, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_540c3da3, /*SLOT0,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+      [Hex_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c4b515c5, /*SLOT0123,VP*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_cbf6d1dc, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cedf314b, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [3],
-      [HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d2cb81ea, /*SLOT0123,VS*/
+    InstrItinData <tc_56e64202, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_d5090f3e, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d642eff3, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d725e5b0, /*SLOT23,VX*/
+    InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d7bea0ec, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_649072c2, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d98f4d63, /*SLOT23,VX_DV*/
+    InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_da979fb3, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_db5b9e2f, /*SLOT0,STORE*/
+    InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_df54ad52, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e172d86a, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+    InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
       [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_e231aa4f, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_e3748cdf, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_7177e272, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_e5053c8f, /*SLOT0123,4SLOT*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [],
-      []>,
+       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_e6299d16, /*SLOT0123,VP*/
+    InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_eb669007, /*SLOT01,LOAD,VA*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+      [HVX_FWD]>,
 
-    InstrItinData <tc_ec58f88a, /*SLOT0,STORE,VA_DV*/
+    InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
       [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_eda67dcd, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_7417e785, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ee927c0e, /*SLOT23,VS_VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
-       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [3, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f3fc3f83, /*SLOT0123,VP*/
+    InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_8772086c, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_fa99dc24, /*SLOT2,VX_DV*/
-      [InstrStage<1, [SLOT2], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>
-  ];
-}
+    InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-class DepHVXItinV62 {
-  list<InstrItinData> DepHVXItinV62_list = [
-    InstrItinData <tc_0317c6ca, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_1b93bdc6, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [1, 2, 5],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2171ebae, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_946013d8, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_28978789, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [3, 2],
-      [HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_29841470, /*SLOT0,STORE*/
+    InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_316c637c, /*SLOT0123,VA_DV*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_354299ad, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_35e92f8e, /*SLOT0,NOSLOT1,LOAD,VP*/
+    InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_38208312, /*SLOT01,LOAD*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_4105d6b5, /*SLOT0123,VP*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 2],
-      [HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_41f4b64e, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_41f99e1c, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_45453b98, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_4e2a5159, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_4f190ba3, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ST]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_4fd8566e, /*SLOT0,NOSLOT1,LOAD,VP*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
+    InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_51cd3aab, /*SLOT01,LOAD*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5a9fc4ec, /*SLOT0123,VA*/
+    InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_5c03dc63, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5c120602, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_5cbf490b, /*SLOT01,LOAD,VA*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_63e3d94c, /*SLOT1,LOAD,VA*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ALL]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_644584f8, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
       [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_66bb62ea, /*SLOT1,LOAD,VA*/
-      [InstrStage<1, [SLOT1], 0>,
+    InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_69b6dd20, /*SLOT23,VX*/
+    InstrItinData <tc_c127de3a, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_6b78cf13, /*SLOT23,VX*/
+    InstrItinData <tc_c4edf264, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
       [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6fd9ad30, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_71337255, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
-      [HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_72ad7b54, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+    InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_7474003e, /*SLOT2,VX_DV*/
-      [InstrStage<1, [SLOT2], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_77a4c701, /*SLOT01,LOAD*/
+    InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_7c3f55c4, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7e9f581b, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_7fa82b08, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_7fa8b40f, /*SLOT0123,VS*/
+    InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_85d237e3, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_8a6eb39a, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+       InstrStage<1, [CVI_ALL]>], [3],
       [HVX_FWD]>,
 
-    InstrItinData <tc_8b6a873f, /*SLOT0,STORE*/
+    InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_908a4c8c, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_9311da3f, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_94f43c04, /*SLOT0,STORE,VA_DV*/
+    InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_9777e6bf, /*SLOT0,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
-      [Hex_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+  ];
+}
 
-    InstrItinData <tc_97c165b9, /*SLOT0123,VA_DV*/
+class DepHVXItinV62 {
+  list<InstrItinData> DepHVXItinV62_list = [
+    InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_98733e9d, /*SLOT1,LOAD,VA_DV*/
+    InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
       [InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_99093773, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
-      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_9b9642a1, /*SLOT0123,VA*/
+    InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_9c267309, /*SLOT01,LOAD*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a3127e12, /*SLOT0123,VA*/
+    InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_a4c9df3b, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_a807365d, /*SLOT23,VS_VX*/
+    InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [],
+      []>,
+
+    InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
-       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_aedb9f9e, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_b06ab583, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
-      [HVX_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b712833a, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b77635b4, /*SLOT0123,4SLOT*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bbaf280e, /*SLOT0123,VA*/
+    InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bf142ae2, /*SLOT0123,VP*/
+    InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_bfe309d5, /*SLOT1,LOAD,VA_DV*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c00bf9c9, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+    InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c4b515c5, /*SLOT0123,VP*/
+    InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cbf6d1dc, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cedf314b, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [3],
-      [HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d2cb81ea, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d5090f3e, /*SLOT0,STORE*/
+    InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d642eff3, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d725e5b0, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_d7bea0ec, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_d98f4d63, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_da979fb3, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_db5b9e2f, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_df54ad52, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_e172d86a, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e231aa4f, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_540c3da3, /*SLOT0,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+      [Hex_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e3748cdf, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_e5053c8f, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [],
-      []>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e6299d16, /*SLOT0123,VP*/
+    InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_eb669007, /*SLOT01,LOAD,VA*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_56e64202, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ec58f88a, /*SLOT0,STORE,VA_DV*/
+    InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_eda67dcd, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ee927c0e, /*SLOT23,VS_VX*/
+    InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
-       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f3fc3f83, /*SLOT0123,VP*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_649072c2, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_fa99dc24, /*SLOT2,VX_DV*/
-      [InstrStage<1, [SLOT2], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>
-  ];
-}
+    InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-class DepHVXItinV65 {
-  list<InstrItinData> DepHVXItinV65_list = [
-    InstrItinData <tc_0317c6ca, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1b93bdc6, /*SLOT0,STORE*/
+    InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [1, 2, 5],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_2171ebae, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_28978789, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [3, 2],
-      [HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_29841470, /*SLOT0,STORE*/
+    InstrItinData <tc_7177e272, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_316c637c, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+      [HVX_FWD]>,
 
-    InstrItinData <tc_354299ad, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_35e92f8e, /*SLOT0,NOSLOT1,LOAD,VP*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_38208312, /*SLOT01,LOAD*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_7417e785, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4105d6b5, /*SLOT0123,VP*/
+    InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 2],
+       InstrStage<1, [CVI_ALL]>], [3, 2],
       [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_41f4b64e, /*SLOT0123,VS*/
+    InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_41f99e1c, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_45453b98, /*SLOT0123,VS*/
+    InstrItinData <tc_8772086c, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_4e2a5159, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4f190ba3, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_946013d8, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
       [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_4fd8566e, /*SLOT0,NOSLOT1,LOAD,VP*/
+    InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_51cd3aab, /*SLOT01,LOAD*/
+    InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_5a9fc4ec, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5c03dc63, /*SLOT0,STORE*/
+    InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5c120602, /*SLOT0123,VP_VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_5cbf490b, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_63e3d94c, /*SLOT1,LOAD,VA*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_644584f8, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
-      [HVX_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_66bb62ea, /*SLOT1,LOAD,VA*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_69b6dd20, /*SLOT23,VX*/
+    InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6b78cf13, /*SLOT23,VX*/
+    InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c127de3a, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_c4edf264, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
       [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6fd9ad30, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_71337255, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
-      [HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_72ad7b54, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_7474003e, /*SLOT2,VX_DV*/
-      [InstrStage<1, [SLOT2], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [3],
+      [HVX_FWD]>,
+
+    InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_77a4c701, /*SLOT01,LOAD*/
+    InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ZW]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+  ];
+}
+
+class DepHVXItinV65 {
+  list<InstrItinData> DepHVXItinV65_list = [
+    InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_7c3f55c4, /*SLOT23,VX_DV*/
+    InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
       [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7e9f581b, /*SLOT23,VX_DV*/
+    InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [],
+      []>,
+
+    InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
       [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7fa82b08, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_7fa8b40f, /*SLOT0123,VS*/
+    InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_540c3da3, /*SLOT0,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+      [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_56e64202, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 2],
+      [HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_649072c2, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_85d237e3, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_7177e272, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+      [HVX_FWD]>,
+
+    InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_7417e785, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [3, 2],
+      [HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_8772086c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_946013d8, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c127de3a, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_c4edf264, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+      [HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [3],
+      [HVX_FWD]>,
+
+    InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+  ];
+}
+
+class DepHVXItinV66 {
+  list<InstrItinData> DepHVXItinV66_list = [
+    InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [],
+      []>,
+
+    InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_540c3da3, /*SLOT0,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+      [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_8a6eb39a, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
-      [HVX_FWD]>,
-
-    InstrItinData <tc_8b6a873f, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_908a4c8c, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+    InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_9311da3f, /*SLOT23,VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+    InstrItinData <tc_56e64202, /*SLOT0123,VP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_94f43c04, /*SLOT0,STORE,VA_DV*/
+    InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_9777e6bf, /*SLOT0,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
-      [Hex_FWD, HVX_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_97c165b9, /*SLOT0123,VA_DV*/
+    InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [9, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_98733e9d, /*SLOT1,LOAD,VA_DV*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_99093773, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
-      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_649072c2, /*SLOT23,VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9b9642a1, /*SLOT0123,VA*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9c267309, /*SLOT01,LOAD*/
+    InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
        InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a3127e12, /*SLOT0123,VA*/
+    InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
       [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_a4c9df3b, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_7177e272, /*SLOT0,STORE*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_a807365d, /*SLOT23,VS_VX*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
-       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+    InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+      [HVX_FWD]>,
 
-    InstrItinData <tc_aedb9f9e, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_b06ab583, /*SLOT0123,VA*/
+    InstrItinData <tc_7417e785, /*SLOT0123,VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
-      [HVX_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b712833a, /*SLOT01,LOAD,VA*/
-      [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [3, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b77635b4, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [2],
-      [Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_bbaf280e, /*SLOT0123,VA*/
+    InstrItinData <tc_8772086c, /*SLOT0123,VA*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
       [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_bf142ae2, /*SLOT0123,VP*/
+    InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bfe309d5, /*SLOT1,LOAD,VA_DV*/
-      [InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+    InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_c00bf9c9, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+    InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+      [InstrStage<1, [SLOT2], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c4b515c5, /*SLOT0123,VP*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+    InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cbf6d1dc, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_946013d8, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_cedf314b, /*SLOT0123,4SLOT*/
+    InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [3],
-      [HVX_FWD]>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d2cb81ea, /*SLOT0123,VS*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_SHIFT]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d5090f3e, /*SLOT0,STORE*/
+    InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
       [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d642eff3, /*SLOT0,NOSLOT1,STORE,VP*/
+    InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [SLOT1], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d725e5b0, /*SLOT23,VX*/
+    InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d7bea0ec, /*SLOT0123,VP_VS*/
+    InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLSHF]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_d98f4d63, /*SLOT23,VX_DV*/
+    InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
       [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_da979fb3, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+      [HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
       [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_db5b9e2f, /*SLOT0,STORE*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
-      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
-
-    InstrItinData <tc_df54ad52, /*SLOT0,STORE,VA*/
-      [InstrStage<1, [SLOT0], 0>,
-       InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
-      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+      [HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e172d86a, /*SLOT23,VX_DV*/
+    InstrItinData <tc_c127de3a, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_e231aa4f, /*SLOT23,VX*/
+    InstrItinData <tc_c4edf264, /*SLOT23,VX*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+      [HVX_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e3748cdf, /*SLOT0,STORE,VA*/
+    InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
       [InstrStage<1, [SLOT0], 0>,
        InstrStage<1, [CVI_ST], 0>,
        InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
       [Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_e5053c8f, /*SLOT0123,4SLOT*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_ALL]>], [],
-      []>,
+    InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_e6299d16, /*SLOT0123,VP*/
+    InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+      [InstrStage<1, [SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5],
-      [HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_eb669007, /*SLOT01,LOAD,VA*/
+    InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
       [InstrStage<1, [SLOT0, SLOT1], 0>,
-       InstrStage<1, [CVI_LD], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
-      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_ec58f88a, /*SLOT0,STORE,VA_DV*/
+    InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
       [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
        InstrStage<1, [CVI_ST], 0>,
-       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
-      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_eda67dcd, /*SLOT23,VX_DV*/
-      [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_ALL]>], [3],
+      [HVX_FWD]>,
 
-    InstrItinData <tc_ee927c0e, /*SLOT23,VS_VX*/
+    InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
-       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
-      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f3fc3f83, /*SLOT0123,VP*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
-       InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
-      [HVX_FWD, HVX_FWD, HVX_FWD]>,
+    InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
+      [InstrStage<1, [SLOT0, SLOT1], 0>,
+       InstrStage<1, [CVI_ZW]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
 
-    InstrItinData <tc_fa99dc24, /*SLOT2,VX_DV*/
+    InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
       [InstrStage<1, [SLOT2], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
-      [HVX_FWD, HVX_FWD, Hex_FWD]>
+      [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
+      [InstrStage<1, [SLOT0], 0>,
+       InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_ST], 0>,
+       InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+      [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+    InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
+      [InstrStage<1, [SLOT1], 0>,
+       InstrStage<1, [CVI_LD], 0>,
+       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
   ];
 }
diff --git a/lib/Target/Hexagon/HexagonDepIICScalar.td b/lib/Target/Hexagon/HexagonDepIICScalar.td
index 931504b56ccbf2b3b302bd3ba4075c5733cc2d9f..9da25952fb1c96e0312b71ab21139a5c96b19b7d 100644
--- a/lib/Target/Hexagon/HexagonDepIICScalar.td
+++ b/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -1,4 +1,4 @@
-//===- HexagonDepIICScalar.td ---------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,3087 +9,3789 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
-def tc_00afc57e : InstrItinClass;
-def tc_00e7c26e : InstrItinClass;
-def tc_03220ffa : InstrItinClass;
-def tc_038a1342 : InstrItinClass;
-def tc_04c9decc : InstrItinClass;
-def tc_05b6c987 : InstrItinClass;
-def tc_0cd51c76 : InstrItinClass;
-def tc_0dc560de : InstrItinClass;
-def tc_0fc1ae07 : InstrItinClass;
-def tc_10b97e27 : InstrItinClass;
-def tc_1372bca1 : InstrItinClass;
-def tc_14cd4cfa : InstrItinClass;
-def tc_15411484 : InstrItinClass;
-def tc_16d0d8d5 : InstrItinClass;
-def tc_181af5d0 : InstrItinClass;
-def tc_1853ea6d : InstrItinClass;
-def tc_1b82a277 : InstrItinClass;
-def tc_1b9c9ee5 : InstrItinClass;
-def tc_1d5a38a8 : InstrItinClass;
-def tc_1e856f58 : InstrItinClass;
-def tc_234a11a5 : InstrItinClass;
-def tc_238d91d2 : InstrItinClass;
-def tc_29175780 : InstrItinClass;
-def tc_2a160009 : InstrItinClass;
-def tc_2b2f4060 : InstrItinClass;
-def tc_2b6f77c6 : InstrItinClass;
-def tc_2f185f5c : InstrItinClass;
-def tc_2fc0c436 : InstrItinClass;
-def tc_351fed2d : InstrItinClass;
-def tc_3669266a : InstrItinClass;
-def tc_367f7f3d : InstrItinClass;
-def tc_36c68ad1 : InstrItinClass;
-def tc_395dc00f : InstrItinClass;
-def tc_3bc2c5d3 : InstrItinClass;
-def tc_3cb8ea06 : InstrItinClass;
-def tc_3d04548d : InstrItinClass;
-def tc_3da80ba5 : InstrItinClass;
-def tc_3e07fb90 : InstrItinClass;
-def tc_41d5298e : InstrItinClass;
-def tc_4403ca65 : InstrItinClass;
-def tc_44126683 : InstrItinClass;
-def tc_452f85af : InstrItinClass;
-def tc_481e5e5c : InstrItinClass;
-def tc_49eb22c8 : InstrItinClass;
-def tc_4ca572d4 : InstrItinClass;
-def tc_4d9914c9 : InstrItinClass;
-def tc_4d99bca9 : InstrItinClass;
-def tc_4f7cd700 : InstrItinClass;
-def tc_513bef45 : InstrItinClass;
-def tc_51b866be : InstrItinClass;
-def tc_523fcf30 : InstrItinClass;
-def tc_5274e61a : InstrItinClass;
-def tc_52d7bbea : InstrItinClass;
-def tc_53bc8a6a : InstrItinClass;
-def tc_53bdb2f6 : InstrItinClass;
-def tc_540fdfbc : InstrItinClass;
-def tc_55050d58 : InstrItinClass;
-def tc_57288781 : InstrItinClass;
-def tc_594ab548 : InstrItinClass;
-def tc_59a01ead : InstrItinClass;
-def tc_5acef64a : InstrItinClass;
-def tc_5ba5997d : InstrItinClass;
-def tc_5eb851fc : InstrItinClass;
-def tc_5f6847a1 : InstrItinClass;
-def tc_60571023 : InstrItinClass;
-def tc_609d2efe : InstrItinClass;
-def tc_63fe3df7 : InstrItinClass;
-def tc_66888ded : InstrItinClass;
-def tc_6792d5ff : InstrItinClass;
-def tc_681a2300 : InstrItinClass;
-def tc_68cb12ce : InstrItinClass;
-def tc_6aa5711a : InstrItinClass;
-def tc_6ac37025 : InstrItinClass;
-def tc_6ebb4a12 : InstrItinClass;
-def tc_6efc556e : InstrItinClass;
-def tc_6fa4db47 : InstrItinClass;
-def tc_73043bf4 : InstrItinClass;
-def tc_746baa8e : InstrItinClass;
-def tc_74e47fd9 : InstrItinClass;
-def tc_7934b9df : InstrItinClass;
-def tc_7a830544 : InstrItinClass;
-def tc_7f881c76 : InstrItinClass;
-def tc_84df2cd3 : InstrItinClass;
-def tc_855b0b61 : InstrItinClass;
-def tc_87735c3b : InstrItinClass;
-def tc_897d1a9d : InstrItinClass;
-def tc_8b15472a : InstrItinClass;
-def tc_8fd5f294 : InstrItinClass;
-def tc_8fe6b782 : InstrItinClass;
-def tc_90f3e30c : InstrItinClass;
-def tc_976ddc4f : InstrItinClass;
-def tc_97743097 : InstrItinClass;
-def tc_994333cd : InstrItinClass;
-def tc_999d32db : InstrItinClass;
-def tc_99be14ca : InstrItinClass;
-def tc_9c00ce8d : InstrItinClass;
-def tc_9c98e8af : InstrItinClass;
-def tc_9d5941c7 : InstrItinClass;
-def tc_9ef61e5c : InstrItinClass;
-def tc_9faf76ae : InstrItinClass;
-def tc_9fdb5406 : InstrItinClass;
-def tc_a21dc435 : InstrItinClass;
-def tc_a27582fa : InstrItinClass;
-def tc_a46f0df5 : InstrItinClass;
-def tc_a788683e : InstrItinClass;
-def tc_a8acdac0 : InstrItinClass;
-def tc_a904d137 : InstrItinClass;
-def tc_adb14c66 : InstrItinClass;
-def tc_b13761ae : InstrItinClass;
-def tc_b166348b : InstrItinClass;
-def tc_b44c6e2a : InstrItinClass;
-def tc_b77c481f : InstrItinClass;
-def tc_b7dd427e : InstrItinClass;
-def tc_b9488031 : InstrItinClass;
-def tc_b9c0b731 : InstrItinClass;
-def tc_b9c4623f : InstrItinClass;
-def tc_bad2bcaf : InstrItinClass;
-def tc_bcc96cee : InstrItinClass;
-def tc_bde7aaf4 : InstrItinClass;
-def tc_be706f30 : InstrItinClass;
-def tc_c2f7d806 : InstrItinClass;
-def tc_c5e2426d : InstrItinClass;
-def tc_c6aa82f7 : InstrItinClass;
-def tc_c6ce9b3f : InstrItinClass;
-def tc_c6ebf8dd : InstrItinClass;
-def tc_c74f796f : InstrItinClass;
-def tc_c82dc1ff : InstrItinClass;
-def tc_caaebcba : InstrItinClass;
-def tc_cd7374a0 : InstrItinClass;
-def tc_cde8b071 : InstrItinClass;
-def tc_cf47a43f : InstrItinClass;
-def tc_cf59f215 : InstrItinClass;
-def tc_d088982c : InstrItinClass;
-def tc_d1090e34 : InstrItinClass;
-def tc_d24b2d85 : InstrItinClass;
-def tc_d580173f : InstrItinClass;
-def tc_d6bf0472 : InstrItinClass;
-def tc_d9709180 : InstrItinClass;
-def tc_d9f95eef : InstrItinClass;
-def tc_daa058fa : InstrItinClass;
-def tc_dbdffe3d : InstrItinClass;
-def tc_e0739b8c : InstrItinClass;
-def tc_e1e99bfa : InstrItinClass;
-def tc_e216a5db : InstrItinClass;
-def tc_e421e012 : InstrItinClass;
-def tc_e7624c08 : InstrItinClass;
-def tc_e7d02c66 : InstrItinClass;
-def tc_e913dc32 : InstrItinClass;
-def tc_e9c822f7 : InstrItinClass;
-def tc_e9fae2d6 : InstrItinClass;
-def tc_ef52ed71 : InstrItinClass;
-def tc_ef84f62f : InstrItinClass;
-def tc_f2704b9a : InstrItinClass;
-def tc_f3eaa14b : InstrItinClass;
-def tc_f47d212f : InstrItinClass;
-def tc_f49e76f4 : InstrItinClass;
-def tc_f7dd9c9f : InstrItinClass;
-def tc_f86c328a : InstrItinClass;
-def tc_f8eeed7a : InstrItinClass;
-def tc_fcab4871 : InstrItinClass;
-def tc_ff9ee76e : InstrItinClass;
-
-class DepScalarItinV4 {
-  list<InstrItinData> DepScalarItinV4_list = [
-    InstrItinData <tc_00afc57e, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_00e7c26e, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_03220ffa, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_038a1342, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_04c9decc, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_05b6c987, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_0cd51c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_0dc560de, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_0fc1ae07, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_10b97e27, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_1372bca1, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_14cd4cfa, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_15411484, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_16d0d8d5, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_181af5d0, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_1853ea6d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_1b82a277, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_1b9c9ee5, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_1d5a38a8, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_1e856f58, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_234a11a5, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_238d91d2, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_29175780, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_2a160009, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_2b2f4060, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_2b6f77c6, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_2f185f5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_2fc0c436, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_351fed2d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_3669266a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_367f7f3d, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_36c68ad1, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_395dc00f, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_3bc2c5d3, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_3cb8ea06, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_3d04548d, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_3da80ba5, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_3e07fb90, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_41d5298e, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_4403ca65, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_44126683, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_452f85af, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_481e5e5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_49eb22c8, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_4ca572d4, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_4d9914c9, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_4d99bca9, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_4f7cd700, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_513bef45, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_51b866be, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_523fcf30, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_5274e61a, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_52d7bbea, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_53bc8a6a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_53bdb2f6, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_540fdfbc, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_55050d58, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_57288781, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_594ab548, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_59a01ead, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_5acef64a, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_5ba5997d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_5eb851fc, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_5f6847a1, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_60571023, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_609d2efe, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_63fe3df7, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_66888ded, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_6792d5ff, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_681a2300, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_68cb12ce, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_6aa5711a, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_6ac37025, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_6ebb4a12, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_6efc556e, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_6fa4db47, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_73043bf4, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_746baa8e, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_74e47fd9, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_7934b9df, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_7a830544, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_7f881c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_84df2cd3, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_855b0b61, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_87735c3b, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_897d1a9d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_8b15472a, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_8fd5f294, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_8fe6b782, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_90f3e30c, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_976ddc4f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_97743097, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_994333cd, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_999d32db, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_99be14ca, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_9c00ce8d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_9c98e8af, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_9d5941c7, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_9ef61e5c, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_9faf76ae, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_9fdb5406, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_a21dc435, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_a27582fa, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_a46f0df5, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_a788683e, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_a8acdac0, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_a904d137, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_adb14c66, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_b13761ae, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_b166348b, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_b44c6e2a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_b77c481f, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_b7dd427e, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_b9488031, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_b9c0b731, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_b9c4623f, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_bad2bcaf, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_bcc96cee, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_bde7aaf4, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_be706f30, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c2f7d806, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c5e2426d, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_c6aa82f7, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c6ce9b3f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c6ebf8dd, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c74f796f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c82dc1ff, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_caaebcba, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_cd7374a0, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_cde8b071, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_cf47a43f, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_cf59f215, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_d088982c, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_d1090e34, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_d24b2d85, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_d580173f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_d6bf0472, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_d9709180, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_d9f95eef, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_daa058fa, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_dbdffe3d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_e0739b8c, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_e1e99bfa, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_e216a5db, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_e421e012, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_e7624c08, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_e7d02c66, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_e913dc32, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_e9c822f7, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_e9fae2d6, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_ef52ed71, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_ef84f62f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_f2704b9a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_f3eaa14b, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_f47d212f, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_f49e76f4, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_f7dd9c9f, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_f86c328a, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_f8eeed7a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_fcab4871, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_ff9ee76e, [InstrStage<1, [SLOT0]>]>  ];
-}
+def tc_002cb246 : InstrItinClass;
+def tc_0371abea : InstrItinClass;
+def tc_05c070ec : InstrItinClass;
+def tc_05d3a09b : InstrItinClass;
+def tc_0663f615 : InstrItinClass;
+def tc_096199d3 : InstrItinClass;
+def tc_0a705168 : InstrItinClass;
+def tc_0ae0825c : InstrItinClass;
+def tc_0b2be201 : InstrItinClass;
+def tc_0d8f5752 : InstrItinClass;
+def tc_13bfbcf9 : InstrItinClass;
+def tc_14b272fa : InstrItinClass;
+def tc_14b5c689 : InstrItinClass;
+def tc_15aa71c5 : InstrItinClass;
+def tc_174516e8 : InstrItinClass;
+def tc_17e0d2cd : InstrItinClass;
+def tc_1a2fd869 : InstrItinClass;
+def tc_1ad90acd : InstrItinClass;
+def tc_1ae57e39 : InstrItinClass;
+def tc_1b6f7cec : InstrItinClass;
+def tc_1c4528a2 : InstrItinClass;
+def tc_1c80410a : InstrItinClass;
+def tc_1d81e60e : InstrItinClass;
+def tc_1fc97744 : InstrItinClass;
+def tc_20cdee80 : InstrItinClass;
+def tc_2332b92e : InstrItinClass;
+def tc_24b66c99 : InstrItinClass;
+def tc_25a78932 : InstrItinClass;
+def tc_2b8da4c2 : InstrItinClass;
+def tc_2eabeebe : InstrItinClass;
+def tc_2f7c551d : InstrItinClass;
+def tc_2ff964b4 : InstrItinClass;
+def tc_30b9bb4a : InstrItinClass;
+def tc_32779c6f : InstrItinClass;
+def tc_36153880 : InstrItinClass;
+def tc_362c6592 : InstrItinClass;
+def tc_3962fa26 : InstrItinClass;
+def tc_39dfefe8 : InstrItinClass;
+def tc_3a867367 : InstrItinClass;
+def tc_3b470976 : InstrItinClass;
+def tc_3b5b7ef9 : InstrItinClass;
+def tc_3bd75825 : InstrItinClass;
+def tc_3c76b0ff : InstrItinClass;
+def tc_3d495a39 : InstrItinClass;
+def tc_40116ca8 : InstrItinClass;
+def tc_434c8e1e : InstrItinClass;
+def tc_4414d8b1 : InstrItinClass;
+def tc_44d3da28 : InstrItinClass;
+def tc_4560740b : InstrItinClass;
+def tc_4837eefb : InstrItinClass;
+def tc_49a8207d : InstrItinClass;
+def tc_4ae7b58b : InstrItinClass;
+def tc_4b68bce4 : InstrItinClass;
+def tc_4c5ba658 : InstrItinClass;
+def tc_4d5fa3a1 : InstrItinClass;
+def tc_53559e35 : InstrItinClass;
+def tc_56336eb0 : InstrItinClass;
+def tc_56f114f4 : InstrItinClass;
+def tc_57890846 : InstrItinClass;
+def tc_5a2711e5 : InstrItinClass;
+def tc_5abb5e3f : InstrItinClass;
+def tc_5aee39f7 : InstrItinClass;
+def tc_5b54b33f : InstrItinClass;
+def tc_5b7c0967 : InstrItinClass;
+def tc_5bf126a6 : InstrItinClass;
+def tc_5d7f5414 : InstrItinClass;
+def tc_5ef37dc4 : InstrItinClass;
+def tc_6132ba3d : InstrItinClass;
+def tc_61830035 : InstrItinClass;
+def tc_640086b5 : InstrItinClass;
+def tc_643b4717 : InstrItinClass;
+def tc_67435e81 : InstrItinClass;
+def tc_675e4897 : InstrItinClass;
+def tc_679309b8 : InstrItinClass;
+def tc_6b25e783 : InstrItinClass;
+def tc_703e822c : InstrItinClass;
+def tc_7186d325 : InstrItinClass;
+def tc_7646c131 : InstrItinClass;
+def tc_76851da1 : InstrItinClass;
+def tc_779080bf : InstrItinClass;
+def tc_784490da : InstrItinClass;
+def tc_785f65a7 : InstrItinClass;
+def tc_7a91e76a : InstrItinClass;
+def tc_838b34ea : InstrItinClass;
+def tc_85c9c08f : InstrItinClass;
+def tc_85d5d03f : InstrItinClass;
+def tc_862b3e70 : InstrItinClass;
+def tc_88b4f13d : InstrItinClass;
+def tc_89e94ad3 : InstrItinClass;
+def tc_8b121f4a : InstrItinClass;
+def tc_8b3e402a : InstrItinClass;
+def tc_8c945be0 : InstrItinClass;
+def tc_8c99de45 : InstrItinClass;
+def tc_8d9d0154 : InstrItinClass;
+def tc_8fb7ab1b : InstrItinClass;
+def tc_9461ff31 : InstrItinClass;
+def tc_946df596 : InstrItinClass;
+def tc_9ad9998f : InstrItinClass;
+def tc_9bfd761f : InstrItinClass;
+def tc_9c3ecd83 : InstrItinClass;
+def tc_9ca930f7 : InstrItinClass;
+def tc_9da59d12 : InstrItinClass;
+def tc_9debc299 : InstrItinClass;
+def tc_9e313203 : InstrItinClass;
+def tc_9fc3dae0 : InstrItinClass;
+def tc_a1123dda : InstrItinClass;
+def tc_a1c00888 : InstrItinClass;
+def tc_a58fd5cc : InstrItinClass;
+def tc_a5d4aeec : InstrItinClass;
+def tc_a6b1eca9 : InstrItinClass;
+def tc_a813cf9a : InstrItinClass;
+def tc_a9d88b22 : InstrItinClass;
+def tc_ae53734a : InstrItinClass;
+def tc_b31c2e97 : InstrItinClass;
+def tc_b343892a : InstrItinClass;
+def tc_b43e7930 : InstrItinClass;
+def tc_b4407292 : InstrItinClass;
+def tc_b44ecf75 : InstrItinClass;
+def tc_b4b5c03a : InstrItinClass;
+def tc_b51dc29a : InstrItinClass;
+def tc_b83e6d73 : InstrItinClass;
+def tc_b857bf4e : InstrItinClass;
+def tc_b8bffe55 : InstrItinClass;
+def tc_b90a29b1 : InstrItinClass;
+def tc_b9272d6c : InstrItinClass;
+def tc_b9e09e03 : InstrItinClass;
+def tc_bab0eed9 : InstrItinClass;
+def tc_bafaade3 : InstrItinClass;
+def tc_bcf98408 : InstrItinClass;
+def tc_bd8382d1 : InstrItinClass;
+def tc_bdceeac1 : InstrItinClass;
+def tc_be9602ff : InstrItinClass;
+def tc_bf061958 : InstrItinClass;
+def tc_bfec0f01 : InstrItinClass;
+def tc_c4db48cb : InstrItinClass;
+def tc_c4f596e3 : InstrItinClass;
+def tc_c79a189f : InstrItinClass;
+def tc_c8ce0b5c : InstrItinClass;
+def tc_cd374165 : InstrItinClass;
+def tc_cf8126ae : InstrItinClass;
+def tc_cfd8378a : InstrItinClass;
+def tc_d08ee0f4 : InstrItinClass;
+def tc_d1aa9eaa : InstrItinClass;
+def tc_d2e63d61 : InstrItinClass;
+def tc_d5b7b0c1 : InstrItinClass;
+def tc_d5c0729a : InstrItinClass;
+def tc_d63f638c : InstrItinClass;
+def tc_d65dbf51 : InstrItinClass;
+def tc_d773585a : InstrItinClass;
+def tc_d9d43ecb : InstrItinClass;
+def tc_da4a37ed : InstrItinClass;
+def tc_da97ee82 : InstrItinClass;
+def tc_db2bce9c : InstrItinClass;
+def tc_de4df740 : InstrItinClass;
+def tc_de554571 : InstrItinClass;
+def tc_df3319ed : InstrItinClass;
+def tc_e06f432a : InstrItinClass;
+def tc_e4a7f9f0 : InstrItinClass;
+def tc_e4b3cb20 : InstrItinClass;
+def tc_e78647bd : InstrItinClass;
+def tc_e86aa961 : InstrItinClass;
+def tc_e93a3d71 : InstrItinClass;
+def tc_e95795ec : InstrItinClass;
+def tc_e9f3243f : InstrItinClass;
+def tc_f429765c : InstrItinClass;
+def tc_f675fee8 : InstrItinClass;
+def tc_f8e23f0b : InstrItinClass;
+def tc_f9058dd7 : InstrItinClass;
+def tc_fc3999b4 : InstrItinClass;
+def tc_fcc3ddf9 : InstrItinClass;
+def tc_fe211424 : InstrItinClass;
 
 class DepScalarItinV5 {
   list<InstrItinData> DepScalarItinV5_list = [
-    InstrItinData <tc_00afc57e, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_00e7c26e, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_03220ffa, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_038a1342, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_04c9decc, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_05b6c987, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_0cd51c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_0dc560de, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_0fc1ae07, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_10b97e27, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_1372bca1, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_14cd4cfa, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_15411484, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_16d0d8d5, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_181af5d0, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_1853ea6d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_1b82a277, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_1b9c9ee5, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_1d5a38a8, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_1e856f58, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_234a11a5, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_238d91d2, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_29175780, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_2a160009, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_2b2f4060, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_2b6f77c6, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_2f185f5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_2fc0c436, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_351fed2d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_3669266a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_367f7f3d, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_36c68ad1, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_395dc00f, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_3bc2c5d3, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_3cb8ea06, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_3d04548d, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_3da80ba5, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_3e07fb90, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_41d5298e, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_4403ca65, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_44126683, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_452f85af, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_481e5e5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_49eb22c8, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_4ca572d4, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_4d9914c9, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_4d99bca9, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_4f7cd700, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_513bef45, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_51b866be, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_523fcf30, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_5274e61a, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_52d7bbea, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_53bc8a6a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_53bdb2f6, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_540fdfbc, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_55050d58, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_57288781, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_594ab548, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_59a01ead, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_5acef64a, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_5ba5997d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_5eb851fc, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_5f6847a1, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_60571023, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_609d2efe, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_63fe3df7, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_66888ded, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_6792d5ff, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_681a2300, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_68cb12ce, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_6aa5711a, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_6ac37025, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_6ebb4a12, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_6efc556e, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_6fa4db47, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_73043bf4, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_746baa8e, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_74e47fd9, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_7934b9df, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_7a830544, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_7f881c76, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_84df2cd3, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_855b0b61, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_87735c3b, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_897d1a9d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_8b15472a, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_8fd5f294, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_8fe6b782, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_90f3e30c, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_976ddc4f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_97743097, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_994333cd, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_999d32db, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_99be14ca, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_9c00ce8d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_9c98e8af, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_9d5941c7, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_9ef61e5c, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_9faf76ae, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_9fdb5406, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_a21dc435, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_a27582fa, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_a46f0df5, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_a788683e, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_a8acdac0, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_a904d137, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_adb14c66, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_b13761ae, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_b166348b, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_b44c6e2a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_b77c481f, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_b7dd427e, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_b9488031, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_b9c0b731, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_b9c4623f, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_bad2bcaf, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_bcc96cee, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_bde7aaf4, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_be706f30, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c2f7d806, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c5e2426d, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_c6aa82f7, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c6ce9b3f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c6ebf8dd, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c74f796f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_c82dc1ff, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_caaebcba, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_cd7374a0, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_cde8b071, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_cf47a43f, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_cf59f215, [InstrStage<1, [SLOT3]>]>,
-    InstrItinData <tc_d088982c, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_d1090e34, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_d24b2d85, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_d580173f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_d6bf0472, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-    InstrItinData <tc_d9709180, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_d9f95eef, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_daa058fa, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_dbdffe3d, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_e0739b8c, [InstrStage<1, [SLOT2]>]>,
-    InstrItinData <tc_e1e99bfa, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_e216a5db, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_e421e012, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_e7624c08, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_e7d02c66, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_e913dc32, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_e9c822f7, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_e9fae2d6, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_ef52ed71, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_ef84f62f, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_f2704b9a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_f3eaa14b, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_f47d212f, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_f49e76f4, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_f7dd9c9f, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_f86c328a, [InstrStage<1, [SLOT0, SLOT1]>]>,
-    InstrItinData <tc_f8eeed7a, [InstrStage<1, [SLOT2, SLOT3]>]>,
-    InstrItinData <tc_fcab4871, [InstrStage<1, [SLOT0]>]>,
-    InstrItinData <tc_ff9ee76e, [InstrStage<1, [SLOT0]>]>  ];
+    InstrItinData <tc_002cb246, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_0371abea, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_05c070ec, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_05d3a09b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_0663f615, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_096199d3, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_0a705168, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_0ae0825c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_0b2be201, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_0d8f5752, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_13bfbcf9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_14b272fa, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_14b5c689, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_15aa71c5, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_174516e8, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_17e0d2cd, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_1a2fd869, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_1ad90acd, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_1ae57e39, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_1b6f7cec, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_1c4528a2, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_1c80410a, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_1d81e60e, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_1fc97744, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_20cdee80, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_2332b92e, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_24b66c99, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_25a78932, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_2b8da4c2, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_2eabeebe, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_2f7c551d, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_2ff964b4, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_30b9bb4a, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_32779c6f, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_36153880, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_362c6592, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_3962fa26, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_39dfefe8, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_3a867367, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_3b470976, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_3b5b7ef9, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_3bd75825, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_3c76b0ff, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_3d495a39, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_40116ca8, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_434c8e1e, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_4414d8b1, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_44d3da28, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_4560740b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_4837eefb, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_49a8207d, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_4ae7b58b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_4b68bce4, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_4c5ba658, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_4d5fa3a1, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_53559e35, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_56336eb0, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_56f114f4, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_57890846, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_5a2711e5, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_5abb5e3f, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_5aee39f7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_5b54b33f, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_5b7c0967, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_5bf126a6, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_5d7f5414, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_5ef37dc4, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_6132ba3d, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_61830035, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_640086b5, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_643b4717, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_67435e81, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_675e4897, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_679309b8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_6b25e783, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_703e822c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_7186d325, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_7646c131, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_76851da1, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_779080bf, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_784490da, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_785f65a7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_7a91e76a, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_838b34ea, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_85c9c08f, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_85d5d03f, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_862b3e70, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_88b4f13d, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_89e94ad3, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_8b121f4a, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_8b3e402a, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_8c945be0, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_8c99de45, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_8d9d0154, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_8fb7ab1b, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_9461ff31, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_946df596, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_9ad9998f, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_9bfd761f, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_9c3ecd83, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_9ca930f7, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_9da59d12, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_9debc299, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_9e313203, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_9fc3dae0, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_a1123dda, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_a1c00888, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_a58fd5cc, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_a5d4aeec, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_a6b1eca9, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_a813cf9a, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_a9d88b22, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_ae53734a, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_b31c2e97, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_b343892a, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_b43e7930, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_b4407292, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_b44ecf75, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_b4b5c03a, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_b51dc29a, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_b83e6d73, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_b857bf4e, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_b8bffe55, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_b90a29b1, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_b9272d6c, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_b9e09e03, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_bab0eed9, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_bafaade3, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_bcf98408, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_bd8382d1, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_bdceeac1, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_be9602ff, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_bf061958, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_bfec0f01, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_c4db48cb, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_c4f596e3, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_c79a189f, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_c8ce0b5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_cd374165, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_cf8126ae, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_cfd8378a, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_d08ee0f4, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_d1aa9eaa, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_d2e63d61, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_d5b7b0c1, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_d5c0729a, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_d63f638c, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_d65dbf51, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_d773585a, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_d9d43ecb, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_da4a37ed, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_da97ee82, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_db2bce9c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_de4df740, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+    InstrItinData <tc_de554571, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_df3319ed, [InstrStage<1, [SLOT3]>]>,
+    InstrItinData <tc_e06f432a, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_e4a7f9f0, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_e4b3cb20, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_e78647bd, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_e86aa961, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_e93a3d71, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_e95795ec, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_e9f3243f, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_f429765c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_f675fee8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_f8e23f0b, [InstrStage<1, [SLOT0, SLOT1]>]>,
+    InstrItinData <tc_f9058dd7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+    InstrItinData <tc_fc3999b4, [InstrStage<1, [SLOT2]>]>,
+    InstrItinData <tc_fcc3ddf9, [InstrStage<1, [SLOT0]>]>,
+    InstrItinData <tc_fe211424, [InstrStage<1, [SLOT0]>]>  ];
 }
 
 class DepScalarItinV55 {
   list<InstrItinData> DepScalarItinV55_list = [
-    InstrItinData <tc_00afc57e, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+    InstrItinData <tc_002cb246, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0371abea, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_00e7c26e, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_05c070ec, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_03220ffa, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_05d3a09b, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_038a1342, /*tc_3*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_0663f615, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_04c9decc, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_096199d3, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_05b6c987, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+    InstrItinData <tc_0a705168, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0cd51c76, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_0ae0825c, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0dc560de, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+    InstrItinData <tc_0b2be201, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0fc1ae07, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_10b97e27, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
+    InstrItinData <tc_0d8f5752, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1372bca1, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [4, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_14cd4cfa, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_15411484, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_13bfbcf9, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_16d0d8d5, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+    InstrItinData <tc_14b272fa, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_181af5d0, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [3, 1],
+    InstrItinData <tc_14b5c689, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1853ea6d, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_15aa71c5, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1b82a277, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+    InstrItinData <tc_174516e8, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
       [Hex_FWD]>,
 
-    InstrItinData <tc_1b9c9ee5, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_1d5a38a8, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+    InstrItinData <tc_17e0d2cd, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1e856f58, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_1a2fd869, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_234a11a5, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+    InstrItinData <tc_1ad90acd, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_238d91d2, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+    InstrItinData <tc_1ae57e39, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_29175780, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_2a160009, /*tc_2early*/
-      [InstrStage<1, [SLOT0]>], [],
+    InstrItinData <tc_1b6f7cec, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
       []>,
 
-    InstrItinData <tc_2b2f4060, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_1c4528a2, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2b6f77c6, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+    InstrItinData <tc_1c80410a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2f185f5c, /*tc_2early*/
+    InstrItinData <tc_1d81e60e, /*tc_2early*/
       [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2fc0c436, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_351fed2d, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_3669266a, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_367f7f3d, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [],
-      []>,
-
-    InstrItinData <tc_36c68ad1, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [],
-      []>,
-
-    InstrItinData <tc_395dc00f, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [4, 3, 1],
+    InstrItinData <tc_1fc97744, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3bc2c5d3, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_3cb8ea06, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
+    InstrItinData <tc_20cdee80, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3d04548d, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [4, 2],
+    InstrItinData <tc_2332b92e, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3da80ba5, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [1],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_3e07fb90, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+    InstrItinData <tc_24b66c99, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_41d5298e, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_4403ca65, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+    InstrItinData <tc_25a78932, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_44126683, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 2],
+    InstrItinData <tc_2b8da4c2, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [4, 2, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_452f85af, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_481e5e5c, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_2eabeebe, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_49eb22c8, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_2f7c551d, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4ca572d4, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [],
-      []>,
-
-    InstrItinData <tc_4d9914c9, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_2ff964b4, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4d99bca9, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1],
+    InstrItinData <tc_30b9bb4a, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4f7cd700, /*tc_3x*/
+    InstrItinData <tc_32779c6f, /*tc_3x*/
       [InstrStage<1, [SLOT3]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_513bef45, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [4, 2, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_36153880, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
 
-    InstrItinData <tc_51b866be, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2],
+    InstrItinData <tc_362c6592, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_523fcf30, /*tc_3stall*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_5274e61a, /*tc_st*/
+    InstrItinData <tc_3962fa26, /*tc_st*/
       [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_52d7bbea, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+    InstrItinData <tc_39dfefe8, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [],
       []>,
 
-    InstrItinData <tc_53bc8a6a, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_3a867367, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_53bdb2f6, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 3],
+    InstrItinData <tc_3b470976, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_540fdfbc, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_3b5b7ef9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_55050d58, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_3bd75825, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_3c76b0ff, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3d495a39, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_57288781, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_40116ca8, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_594ab548, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_434c8e1e, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_59a01ead, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [3, 2, 2],
+    InstrItinData <tc_4414d8b1, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5acef64a, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+    InstrItinData <tc_44d3da28, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5ba5997d, /*tc_2*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_4560740b, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5eb851fc, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [1, 3, 2],
+    InstrItinData <tc_4837eefb, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5f6847a1, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_49a8207d, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_60571023, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_4ae7b58b, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_609d2efe, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+    InstrItinData <tc_4b68bce4, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_63fe3df7, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
+    InstrItinData <tc_4c5ba658, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4d5fa3a1, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_53559e35, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_66888ded, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+    InstrItinData <tc_56336eb0, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6792d5ff, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_56f114f4, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_681a2300, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_68cb12ce, /*tc_1*/
+    InstrItinData <tc_57890846, /*tc_1*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6aa5711a, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [4, 1],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_6ac37025, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 2, 3],
+    InstrItinData <tc_5a2711e5, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6ebb4a12, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+    InstrItinData <tc_5abb5e3f, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6efc556e, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
-      []>,
-
-    InstrItinData <tc_6fa4db47, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_5aee39f7, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_73043bf4, /*tc_2early*/
-      [InstrStage<1, [SLOT3]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_5b54b33f, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_746baa8e, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [3, 2],
+    InstrItinData <tc_5b7c0967, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_74e47fd9, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_7934b9df, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [2, 1],
+    InstrItinData <tc_5bf126a6, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 3],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7a830544, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_7f881c76, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_5d7f5414, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_84df2cd3, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+    InstrItinData <tc_5ef37dc4, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_855b0b61, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
+    InstrItinData <tc_6132ba3d, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_87735c3b, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_61830035, /*tc_2*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_897d1a9d, /*tc_1*/
+    InstrItinData <tc_640086b5, /*tc_2early*/
       [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8b15472a, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_8fd5f294, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_643b4717, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8fe6b782, /*tc_2*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_67435e81, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_90f3e30c, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+    InstrItinData <tc_675e4897, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_976ddc4f, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_679309b8, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_97743097, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_6b25e783, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_994333cd, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 1],
+    InstrItinData <tc_703e822c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_999d32db, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_7186d325, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_99be14ca, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
+    InstrItinData <tc_7646c131, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9c00ce8d, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_76851da1, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9c98e8af, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_779080bf, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9d5941c7, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_784490da, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9ef61e5c, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_785f65a7, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9faf76ae, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_7a91e76a, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9fdb5406, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+    InstrItinData <tc_838b34ea, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a21dc435, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 1],
+    InstrItinData <tc_85c9c08f, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a27582fa, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2],
+    InstrItinData <tc_85d5d03f, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_862b3e70, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
       [Hex_FWD]>,
 
-    InstrItinData <tc_a46f0df5, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_88b4f13d, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a788683e, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+    InstrItinData <tc_89e94ad3, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a8acdac0, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8b121f4a, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [],
+      []>,
 
-    InstrItinData <tc_a904d137, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8b3e402a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_adb14c66, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8c945be0, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b13761ae, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [],
+    InstrItinData <tc_8c99de45, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [],
       []>,
 
-    InstrItinData <tc_b166348b, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+    InstrItinData <tc_8d9d0154, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_8fb7ab1b, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b44c6e2a, /*tc_2*/
+    InstrItinData <tc_9461ff31, /*tc_2*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b77c481f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+    InstrItinData <tc_946df596, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b7dd427e, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9ad9998f, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_b9488031, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_9bfd761f, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b9c0b731, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_b9c4623f, /*tc_2*/
-      [InstrStage<1, [SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_bad2bcaf, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+    InstrItinData <tc_9c3ecd83, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bcc96cee, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9ca930f7, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bde7aaf4, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+    InstrItinData <tc_9da59d12, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [4, 3, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_be706f30, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9debc299, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c2f7d806, /*tc_2*/
+    InstrItinData <tc_9e313203, /*tc_3x*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c5e2426d, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [2, 2],
+    InstrItinData <tc_9fc3dae0, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6aa82f7, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_c6ce9b3f, /*tc_3*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_a1123dda, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_c6ebf8dd, /*tc_3stall*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_a1c00888, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c74f796f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+    InstrItinData <tc_a58fd5cc, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c82dc1ff, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [1],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_caaebcba, /*tc_3*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+    InstrItinData <tc_a5d4aeec, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cd7374a0, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_a6b1eca9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cde8b071, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+    InstrItinData <tc_a813cf9a, /*tc_2*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cf47a43f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_cf59f215, /*tc_3x*/
+    InstrItinData <tc_a9d88b22, /*tc_3x*/
       [InstrStage<1, [SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d088982c, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_ae53734a, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_d1090e34, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b31c2e97, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d24b2d85, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b343892a, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [1, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d580173f, /*tc_3*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b43e7930, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d6bf0472, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b4407292, /*tc_2early*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
 
-    InstrItinData <tc_d9709180, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b44ecf75, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d9f95eef, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b4b5c03a, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_daa058fa, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [1, 1],
+    InstrItinData <tc_b51dc29a, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [3, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_dbdffe3d, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+    InstrItinData <tc_b83e6d73, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e0739b8c, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_e1e99bfa, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b857bf4e, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_e216a5db, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [4, 2, 2],
+    InstrItinData <tc_b8bffe55, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e421e012, /*tc_st*/
+    InstrItinData <tc_b90a29b1, /*tc_st*/
       [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e7624c08, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [3],
-      [Hex_FWD]>,
+    InstrItinData <tc_b9272d6c, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e7d02c66, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b9e09e03, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e913dc32, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+    InstrItinData <tc_bab0eed9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e9c822f7, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3],
-      [Hex_FWD]>,
+    InstrItinData <tc_bafaade3, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e9fae2d6, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+    InstrItinData <tc_bcf98408, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ef52ed71, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_bd8382d1, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ef84f62f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+    InstrItinData <tc_bdceeac1, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f2704b9a, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_f3eaa14b, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_be9602ff, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f47d212f, /*tc_ld*/
+    InstrItinData <tc_bf061958, /*tc_ld*/
       [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f49e76f4, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_bfec0f01, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f7dd9c9f, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_c4db48cb, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f86c328a, /*tc_st*/
+    InstrItinData <tc_c4f596e3, /*tc_st*/
       [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f8eeed7a, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+    InstrItinData <tc_c79a189f, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_fcab4871, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [],
-      []>,
+    InstrItinData <tc_c8ce0b5c, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ff9ee76e, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 3],
-      [Hex_FWD, Hex_FWD]>
-  ];
-}
+    InstrItinData <tc_cd374165, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-class DepScalarItinV60 {
-  list<InstrItinData> DepScalarItinV60_list = [
-    InstrItinData <tc_00afc57e, /*tc_2*/
+    InstrItinData <tc_cf8126ae, /*tc_2*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_00e7c26e, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_cfd8378a, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_03220ffa, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d08ee0f4, /*tc_2*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_038a1342, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+    InstrItinData <tc_d1aa9eaa, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_04c9decc, /*tc_3stall*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_d2e63d61, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_05b6c987, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d5b7b0c1, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_0cd51c76, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+    InstrItinData <tc_d5c0729a, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0dc560de, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_0fc1ae07, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [2],
+    InstrItinData <tc_d63f638c, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1],
       [Hex_FWD]>,
 
-    InstrItinData <tc_10b97e27, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d65dbf51, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1372bca1, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [4, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d773585a, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_14cd4cfa, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_d9d43ecb, /*tc_2early*/
+      [InstrStage<1, [SLOT3]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_15411484, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_da4a37ed, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_16d0d8d5, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+    InstrItinData <tc_da97ee82, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_181af5d0, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [3, 1],
+    InstrItinData <tc_db2bce9c, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1853ea6d, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+    InstrItinData <tc_de4df740, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_de554571, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_df3319ed, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e06f432a, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [3],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_e4a7f9f0, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e4b3cb20, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e78647bd, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e86aa961, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e93a3d71, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e95795ec, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e9f3243f, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f429765c, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f675fee8, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f8e23f0b, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f9058dd7, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_fc3999b4, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_fcc3ddf9, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_fe211424, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [1, 1],
+      [Hex_FWD, Hex_FWD]>
+  ];
+}
+
+class DepScalarItinV60 {
+  list<InstrItinData> DepScalarItinV60_list = [
+    InstrItinData <tc_002cb246, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0371abea, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_05c070ec, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_05d3a09b, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0663f615, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_096199d3, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0a705168, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0ae0825c, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0b2be201, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0d8f5752, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_13bfbcf9, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_14b272fa, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_14b5c689, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_15aa71c5, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_174516e8, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_17e0d2cd, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1a2fd869, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1ad90acd, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1ae57e39, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1b6f7cec, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+      []>,
+
+    InstrItinData <tc_1c4528a2, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1c80410a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1d81e60e, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1fc97744, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_20cdee80, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2332b92e, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_24b66c99, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_25a78932, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2b8da4c2, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2eabeebe, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+      []>,
+
+    InstrItinData <tc_2f7c551d, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2ff964b4, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_30b9bb4a, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_32779c6f, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_36153880, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
+
+    InstrItinData <tc_362c6592, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3962fa26, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_39dfefe8, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [],
+      []>,
+
+    InstrItinData <tc_3a867367, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3b470976, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3b5b7ef9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3bd75825, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_3c76b0ff, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3d495a39, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_40116ca8, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_434c8e1e, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4414d8b1, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_44d3da28, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4560740b, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4837eefb, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_49a8207d, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_4ae7b58b, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_4b68bce4, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4c5ba658, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4d5fa3a1, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_53559e35, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_56336eb0, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_56f114f4, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_57890846, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5a2711e5, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5abb5e3f, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5aee39f7, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5b54b33f, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5b7c0967, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5bf126a6, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 3],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5d7f5414, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_5ef37dc4, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_6132ba3d, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_61830035, /*tc_2*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_640086b5, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_643b4717, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_67435e81, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_675e4897, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_679309b8, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_6b25e783, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_703e822c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_7186d325, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1b82a277, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+    InstrItinData <tc_7646c131, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_76851da1, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_779080bf, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_784490da, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_785f65a7, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_7a91e76a, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_838b34ea, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_85c9c08f, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_85d5d03f, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_862b3e70, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_88b4f13d, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_89e94ad3, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_8b121f4a, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [],
+      []>,
+
+    InstrItinData <tc_8b3e402a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_8c945be0, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_8c99de45, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
+
+    InstrItinData <tc_8d9d0154, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_8fb7ab1b, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9461ff31, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_946df596, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9ad9998f, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [],
+      []>,
+
+    InstrItinData <tc_9bfd761f, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9c3ecd83, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9ca930f7, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9da59d12, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9debc299, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9e313203, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9fc3dae0, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a1123dda, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_a1c00888, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a58fd5cc, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a5d4aeec, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a6b1eca9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a813cf9a, /*tc_2*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a9d88b22, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_ae53734a, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_b31c2e97, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b343892a, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [2, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b43e7930, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b4407292, /*tc_2early*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
+
+    InstrItinData <tc_b44ecf75, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b4b5c03a, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b51dc29a, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [3, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b83e6d73, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b857bf4e, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_b8bffe55, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b90a29b1, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b9272d6c, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b9e09e03, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bab0eed9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bafaade3, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bcf98408, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bd8382d1, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bdceeac1, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_be9602ff, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bf061958, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bfec0f01, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c4db48cb, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c4f596e3, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c79a189f, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c8ce0b5c, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_cd374165, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_cf8126ae, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_cfd8378a, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d08ee0f4, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d1aa9eaa, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d2e63d61, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d5b7b0c1, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_d5c0729a, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d63f638c, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_d65dbf51, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d773585a, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d9d43ecb, /*tc_2early*/
+      [InstrStage<1, [SLOT3]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_da4a37ed, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_da97ee82, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_db2bce9c, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_de4df740, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_de554571, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_df3319ed, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e06f432a, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_e4a7f9f0, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e4b3cb20, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e78647bd, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e86aa961, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e93a3d71, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e95795ec, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e9f3243f, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f429765c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f675fee8, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f8e23f0b, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_f9058dd7, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_fc3999b4, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2],
       [Hex_FWD]>,
 
-    InstrItinData <tc_1b9c9ee5, /*tc_2*/
+    InstrItinData <tc_fcc3ddf9, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_fe211424, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [1, 1],
+      [Hex_FWD, Hex_FWD]>
+  ];
+}
+
+class DepScalarItinV62 {
+  list<InstrItinData> DepScalarItinV62_list = [
+    InstrItinData <tc_002cb246, /*tc_2*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1d5a38a8, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+    InstrItinData <tc_0371abea, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_05c070ec, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_05d3a09b, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0663f615, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1e856f58, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+    InstrItinData <tc_096199d3, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_234a11a5, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+    InstrItinData <tc_0a705168, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0ae0825c, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_238d91d2, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_0b2be201, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_29175780, /*tc_3x*/
+    InstrItinData <tc_0d8f5752, /*tc_3x*/
       [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2a160009, /*tc_2early*/
-      [InstrStage<1, [SLOT0]>], [],
-      []>,
+    InstrItinData <tc_13bfbcf9, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2b2f4060, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+    InstrItinData <tc_14b272fa, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2b6f77c6, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_14b5c689, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2f185f5c, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+    InstrItinData <tc_15aa71c5, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2fc0c436, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+    InstrItinData <tc_174516e8, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_17e0d2cd, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1a2fd869, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_351fed2d, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+    InstrItinData <tc_1ad90acd, /*tc_3*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3669266a, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_1ae57e39, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_367f7f3d, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [],
+    InstrItinData <tc_1b6f7cec, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
       []>,
 
-    InstrItinData <tc_36c68ad1, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [],
-      []>,
+    InstrItinData <tc_1c4528a2, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_395dc00f, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 3, 2],
+    InstrItinData <tc_1c80410a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3bc2c5d3, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_3cb8ea06, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
+    InstrItinData <tc_1d81e60e, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3d04548d, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2],
+    InstrItinData <tc_1fc97744, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_20cdee80, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3da80ba5, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_2332b92e, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3e07fb90, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+    InstrItinData <tc_24b66c99, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_41d5298e, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_4403ca65, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+    InstrItinData <tc_25a78932, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_44126683, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 2],
+    InstrItinData <tc_2b8da4c2, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_452f85af, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_481e5e5c, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_2eabeebe, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_49eb22c8, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_2f7c551d, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4ca572d4, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [],
-      []>,
-
-    InstrItinData <tc_4d9914c9, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_2ff964b4, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4d99bca9, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+    InstrItinData <tc_30b9bb4a, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4f7cd700, /*tc_3stall*/
+    InstrItinData <tc_32779c6f, /*tc_3stall*/
       [InstrStage<1, [SLOT3]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_513bef45, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_36153880, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
 
-    InstrItinData <tc_51b866be, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+    InstrItinData <tc_362c6592, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_523fcf30, /*tc_3stall*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_5274e61a, /*tc_st*/
+    InstrItinData <tc_3962fa26, /*tc_st*/
       [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_52d7bbea, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+    InstrItinData <tc_39dfefe8, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [],
       []>,
 
-    InstrItinData <tc_53bc8a6a, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_3a867367, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_53bdb2f6, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 3],
+    InstrItinData <tc_3b470976, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_540fdfbc, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_3b5b7ef9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_55050d58, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_3bd75825, /*tc_3*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_3c76b0ff, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3d495a39, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_57288781, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_40116ca8, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_594ab548, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_434c8e1e, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_59a01ead, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [3, 2, 2],
+    InstrItinData <tc_4414d8b1, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5acef64a, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+    InstrItinData <tc_44d3da28, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5ba5997d, /*tc_2*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_4560740b, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5eb851fc, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [2, 3, 2],
+    InstrItinData <tc_4837eefb, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5f6847a1, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+    InstrItinData <tc_49a8207d, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_4ae7b58b, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_4b68bce4, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_60571023, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+    InstrItinData <tc_4c5ba658, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4d5fa3a1, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_53559e35, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_609d2efe, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+    InstrItinData <tc_56336eb0, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_63fe3df7, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_56f114f4, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_66888ded, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+    InstrItinData <tc_57890846, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5a2711e5, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6792d5ff, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+    InstrItinData <tc_5abb5e3f, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_681a2300, /*tc_3stall*/
-      [InstrStage<1, [SLOT2]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_5aee39f7, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_68cb12ce, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+    InstrItinData <tc_5b54b33f, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5b7c0967, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6aa5711a, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [4, 1],
+    InstrItinData <tc_5bf126a6, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 3],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6ac37025, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 2, 3],
+    InstrItinData <tc_5d7f5414, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_5ef37dc4, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_6132ba3d, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6ebb4a12, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+    InstrItinData <tc_61830035, /*tc_2*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6efc556e, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
-      []>,
+    InstrItinData <tc_640086b5, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6fa4db47, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_643b4717, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_73043bf4, /*tc_2early*/
-      [InstrStage<1, [SLOT3]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_67435e81, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_746baa8e, /*tc_newvjump*/
+    InstrItinData <tc_675e4897, /*tc_newvjump*/
       [InstrStage<1, [SLOT0]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_74e47fd9, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_679309b8, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7934b9df, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [2, 1],
+    InstrItinData <tc_6b25e783, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_703e822c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7a830544, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+    InstrItinData <tc_7186d325, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7f881c76, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+    InstrItinData <tc_7646c131, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_76851da1, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_84df2cd3, /*tc_2*/
+    InstrItinData <tc_779080bf, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_784490da, /*tc_2*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_855b0b61, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_785f65a7, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_7a91e76a, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_87735c3b, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+    InstrItinData <tc_838b34ea, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_897d1a9d, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_8b15472a, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_85c9c08f, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8fd5f294, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_85d5d03f, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8fe6b782, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_862b3e70, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_88b4f13d, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_90f3e30c, /*tc_2early*/
+    InstrItinData <tc_89e94ad3, /*tc_st*/
       [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_976ddc4f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8b121f4a, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [],
+      []>,
 
-    InstrItinData <tc_97743097, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8b3e402a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_994333cd, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 1],
+    InstrItinData <tc_8c945be0, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_999d32db, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_8c99de45, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
 
-    InstrItinData <tc_99be14ca, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8d9d0154, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_9c00ce8d, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+    InstrItinData <tc_8fb7ab1b, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9c98e8af, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9461ff31, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9d5941c7, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_946df596, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9ef61e5c, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9ad9998f, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_9faf76ae, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_9bfd761f, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9fdb5406, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+    InstrItinData <tc_9c3ecd83, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a21dc435, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 1],
+    InstrItinData <tc_9ca930f7, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a27582fa, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_9da59d12, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a46f0df5, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9debc299, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a788683e, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+    InstrItinData <tc_9e313203, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a8acdac0, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_a904d137, /*tc_1*/
+    InstrItinData <tc_9fc3dae0, /*tc_1*/
       [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_adb14c66, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_a1123dda, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_b13761ae, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [],
-      []>,
+    InstrItinData <tc_a1c00888, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b166348b, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+    InstrItinData <tc_a58fd5cc, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b44c6e2a, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_b77c481f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_b7dd427e, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+    InstrItinData <tc_a5d4aeec, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b9488031, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_a6b1eca9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b9c0b731, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_b9c4623f, /*tc_2*/
+    InstrItinData <tc_a813cf9a, /*tc_2*/
       [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bad2bcaf, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_a9d88b22, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bcc96cee, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+    InstrItinData <tc_ae53734a, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_b31c2e97, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+    InstrItinData <tc_b343892a, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [2, 3, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_be706f30, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_c2f7d806, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+    InstrItinData <tc_b43e7930, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [4, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c5e2426d, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [2, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b4407292, /*tc_2early*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
 
-    InstrItinData <tc_c6aa82f7, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+    InstrItinData <tc_b44ecf75, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6ce9b3f, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+    InstrItinData <tc_b4b5c03a, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6ebf8dd, /*tc_3stall*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b51dc29a, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [3, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c74f796f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b83e6d73, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c82dc1ff, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [1],
+    InstrItinData <tc_b857bf4e, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1],
       [Hex_FWD]>,
 
-    InstrItinData <tc_caaebcba, /*tc_3stall*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b8bffe55, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cd7374a0, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+    InstrItinData <tc_b90a29b1, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cde8b071, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+    InstrItinData <tc_b9272d6c, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cf47a43f, /*tc_ld*/
+    InstrItinData <tc_b9e09e03, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bab0eed9, /*tc_ld*/
       [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cf59f215, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [2, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_d088982c, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_bafaade3, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d1090e34, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+    InstrItinData <tc_bcf98408, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d24b2d85, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_bd8382d1, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d580173f, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+    InstrItinData <tc_bdceeac1, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d6bf0472, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_be9602ff, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d9709180, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+    InstrItinData <tc_bf061958, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d9f95eef, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+    InstrItinData <tc_bfec0f01, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_daa058fa, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [1, 1],
+    InstrItinData <tc_c4db48cb, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_dbdffe3d, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_c4f596e3, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c79a189f, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c8ce0b5c, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e0739b8c, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
+    InstrItinData <tc_cd374165, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e1e99bfa, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+    InstrItinData <tc_cf8126ae, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e216a5db, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [4, 2, 2],
+    InstrItinData <tc_cfd8378a, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d08ee0f4, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e421e012, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+    InstrItinData <tc_d1aa9eaa, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e7624c08, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3],
-      [Hex_FWD]>,
+    InstrItinData <tc_d2e63d61, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e7d02c66, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d5b7b0c1, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_e913dc32, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d5c0729a, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e9c822f7, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+    InstrItinData <tc_d63f638c, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1],
       [Hex_FWD]>,
 
-    InstrItinData <tc_e9fae2d6, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_ef52ed71, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d65dbf51, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ef84f62f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+    InstrItinData <tc_d773585a, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f2704b9a, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_f3eaa14b, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+    InstrItinData <tc_d9d43ecb, /*tc_2early*/
+      [InstrStage<1, [SLOT3]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f47d212f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_f49e76f4, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_f7dd9c9f, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 3],
+    InstrItinData <tc_da4a37ed, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f86c328a, /*tc_st*/
+    InstrItinData <tc_da97ee82, /*tc_st*/
       [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f8eeed7a, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_db2bce9c, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_fcab4871, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [],
-      []>,
+    InstrItinData <tc_de4df740, /*tc_2early*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ff9ee76e, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 3],
-      [Hex_FWD, Hex_FWD]>
-  ];
-}
+    InstrItinData <tc_de554571, /*tc_2early*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-class DepScalarItinV62 {
-  list<InstrItinData> DepScalarItinV62_list = [
-    InstrItinData <tc_00afc57e, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+    InstrItinData <tc_df3319ed, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_00e7c26e, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1],
+    InstrItinData <tc_e06f432a, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3],
       [Hex_FWD]>,
 
-    InstrItinData <tc_03220ffa, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e4a7f9f0, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_038a1342, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+    InstrItinData <tc_e4b3cb20, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_04c9decc, /*tc_3stall*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e78647bd, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_05b6c987, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e86aa961, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e93a3d71, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e95795ec, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0cd51c76, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+    InstrItinData <tc_e9f3243f, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0dc560de, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+    InstrItinData <tc_f429765c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0fc1ae07, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_f675fee8, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_10b97e27, /*tc_3*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_f8e23f0b, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1372bca1, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [4, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_f9058dd7, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_14cd4cfa, /*tc_2early*/
+    InstrItinData <tc_fc3999b4, /*tc_2early*/
       [InstrStage<1, [SLOT2]>], [2],
       [Hex_FWD]>,
 
-    InstrItinData <tc_15411484, /*tc_3*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_16d0d8d5, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+    InstrItinData <tc_fcc3ddf9, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_181af5d0, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [3, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_fe211424, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [1, 1],
+      [Hex_FWD, Hex_FWD]>
+  ];
+}
 
-    InstrItinData <tc_1853ea6d, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+class DepScalarItinV65 {
+  list<InstrItinData> DepScalarItinV65_list = [
+    InstrItinData <tc_002cb246, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1b82a277, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3],
-      [Hex_FWD]>,
+    InstrItinData <tc_0371abea, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1b9c9ee5, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+    InstrItinData <tc_05c070ec, /*tc_2latepred*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_05d3a09b, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0663f615, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1d5a38a8, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+    InstrItinData <tc_096199d3, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1e856f58, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+    InstrItinData <tc_0a705168, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_234a11a5, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+    InstrItinData <tc_0ae0825c, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_238d91d2, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_0b2be201, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_29175780, /*tc_3x*/
+    InstrItinData <tc_0d8f5752, /*tc_3x*/
       [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2a160009, /*tc_2early*/
-      [InstrStage<1, [SLOT0]>], [],
-      []>,
+    InstrItinData <tc_13bfbcf9, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2b2f4060, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+    InstrItinData <tc_14b272fa, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2b6f77c6, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_14b5c689, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2f185f5c, /*tc_3*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+    InstrItinData <tc_15aa71c5, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2fc0c436, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+    InstrItinData <tc_174516e8, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_17e0d2cd, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1a2fd869, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_351fed2d, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+    InstrItinData <tc_1ad90acd, /*tc_3*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3669266a, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_1ae57e39, /*tc_2latepred*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_367f7f3d, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [],
+    InstrItinData <tc_1b6f7cec, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
       []>,
 
-    InstrItinData <tc_36c68ad1, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [],
-      []>,
+    InstrItinData <tc_1c4528a2, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_395dc00f, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 3, 2],
+    InstrItinData <tc_1c80410a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3bc2c5d3, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_3cb8ea06, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2],
+    InstrItinData <tc_1d81e60e, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3d04548d, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2],
+    InstrItinData <tc_1fc97744, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_20cdee80, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3da80ba5, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_2332b92e, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3e07fb90, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+    InstrItinData <tc_24b66c99, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_41d5298e, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_4403ca65, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+    InstrItinData <tc_25a78932, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_44126683, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 2],
+    InstrItinData <tc_2b8da4c2, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_452f85af, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_481e5e5c, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_2eabeebe, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_49eb22c8, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_2f7c551d, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4ca572d4, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [],
-      []>,
-
-    InstrItinData <tc_4d9914c9, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_2ff964b4, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4d99bca9, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+    InstrItinData <tc_30b9bb4a, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4f7cd700, /*tc_3stall*/
+    InstrItinData <tc_32779c6f, /*tc_3stall*/
       [InstrStage<1, [SLOT3]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_513bef45, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_36153880, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
 
-    InstrItinData <tc_51b866be, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+    InstrItinData <tc_362c6592, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_523fcf30, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_5274e61a, /*tc_st*/
+    InstrItinData <tc_3962fa26, /*tc_st*/
       [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_52d7bbea, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+    InstrItinData <tc_39dfefe8, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [],
       []>,
 
-    InstrItinData <tc_53bc8a6a, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_3a867367, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_53bdb2f6, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 3],
+    InstrItinData <tc_3b470976, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_540fdfbc, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_3b5b7ef9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_55050d58, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_3bd75825, /*tc_3*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_3c76b0ff, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3d495a39, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_57288781, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_40116ca8, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_594ab548, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_434c8e1e, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_59a01ead, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [3, 2, 2],
+    InstrItinData <tc_4414d8b1, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5acef64a, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+    InstrItinData <tc_44d3da28, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5ba5997d, /*tc_2*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_4560740b, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5eb851fc, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [2, 3, 2],
+    InstrItinData <tc_4837eefb, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5f6847a1, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+    InstrItinData <tc_49a8207d, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_4ae7b58b, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_4b68bce4, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_60571023, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+    InstrItinData <tc_4c5ba658, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4d5fa3a1, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_53559e35, /*tc_latepredstaia*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_609d2efe, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+    InstrItinData <tc_56336eb0, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_63fe3df7, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_56f114f4, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_66888ded, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+    InstrItinData <tc_57890846, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5a2711e5, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6792d5ff, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+    InstrItinData <tc_5abb5e3f, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_681a2300, /*tc_3stall*/
-      [InstrStage<1, [SLOT2]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_5aee39f7, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_68cb12ce, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+    InstrItinData <tc_5b54b33f, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5b7c0967, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6aa5711a, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [4, 1],
+    InstrItinData <tc_5bf126a6, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 3],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6ac37025, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 2, 3],
+    InstrItinData <tc_5d7f5414, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_5ef37dc4, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_6132ba3d, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6ebb4a12, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+    InstrItinData <tc_61830035, /*tc_2*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6efc556e, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
-      []>,
+    InstrItinData <tc_640086b5, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6fa4db47, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_643b4717, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_73043bf4, /*tc_2early*/
-      [InstrStage<1, [SLOT3]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_67435e81, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_746baa8e, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2],
+    InstrItinData <tc_675e4897, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_74e47fd9, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_679309b8, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7934b9df, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [2, 1],
+    InstrItinData <tc_6b25e783, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_703e822c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7a830544, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+    InstrItinData <tc_7186d325, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_7646c131, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_76851da1, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7f881c76, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+    InstrItinData <tc_779080bf, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_84df2cd3, /*tc_2*/
+    InstrItinData <tc_784490da, /*tc_2*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_855b0b61, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_785f65a7, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_87735c3b, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_7a91e76a, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_897d1a9d, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_838b34ea, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8b15472a, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_85c9c08f, /*tc_1*/
+      [InstrStage<1, [SLOT2]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8fd5f294, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_85d5d03f, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8fe6b782, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_862b3e70, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_88b4f13d, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_90f3e30c, /*tc_2early*/
+    InstrItinData <tc_89e94ad3, /*tc_st*/
       [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_976ddc4f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8b121f4a, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [],
+      []>,
 
-    InstrItinData <tc_994333cd, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8b3e402a, /*tc_2latepred*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_97743097, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
+    InstrItinData <tc_8c945be0, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_999d32db, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [1],
+    InstrItinData <tc_8c99de45, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
+
+    InstrItinData <tc_8d9d0154, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [2],
       [Hex_FWD]>,
 
-    InstrItinData <tc_99be14ca, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2, 2],
+    InstrItinData <tc_8fb7ab1b, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9461ff31, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9c00ce8d, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_946df596, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9c98e8af, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9ad9998f, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_9d5941c7, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9bfd761f, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9ef61e5c, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+    InstrItinData <tc_9c3ecd83, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9faf76ae, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_9ca930f7, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9fdb5406, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+    InstrItinData <tc_9da59d12, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9debc299, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a21dc435, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 1],
+    InstrItinData <tc_9e313203, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a27582fa, /*tc_3*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_a46f0df5, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+    InstrItinData <tc_9fc3dae0, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a788683e, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+    InstrItinData <tc_a1123dda, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_a1c00888, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a8acdac0, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+    InstrItinData <tc_a58fd5cc, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a904d137, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+    InstrItinData <tc_a5d4aeec, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a6b1eca9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_a813cf9a, /*tc_2*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_adb14c66, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_a9d88b22, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b13761ae, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [],
-      []>,
+    InstrItinData <tc_ae53734a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_b166348b, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+    InstrItinData <tc_b31c2e97, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b44c6e2a, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+    InstrItinData <tc_b343892a, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [2, 3, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b77c481f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b43e7930, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b7dd427e, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b4407292, /*tc_2early*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
 
-    InstrItinData <tc_b9488031, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_b44ecf75, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b9c0b731, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+    InstrItinData <tc_b4b5c03a, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b9c4623f, /*tc_2*/
-      [InstrStage<1, [SLOT3]>], [4, 2],
+    InstrItinData <tc_b51dc29a, /*tc_1*/
+      [InstrStage<1, [SLOT2]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bad2bcaf, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b83e6d73, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bcc96cee, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b857bf4e, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+    InstrItinData <tc_b8bffe55, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_be706f30, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_c2f7d806, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b90a29b1, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c5e2426d, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [2, 2],
+    InstrItinData <tc_b9272d6c, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6aa82f7, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+    InstrItinData <tc_b9e09e03, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [4, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6ce9b3f, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+    InstrItinData <tc_bab0eed9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6ebf8dd, /*tc_3x*/
+    InstrItinData <tc_bafaade3, /*tc_3x*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c74f796f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+    InstrItinData <tc_bcf98408, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bd8382d1, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_bdceeac1, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c82dc1ff, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_be9602ff, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_caaebcba, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_bf061958, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cd7374a0, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+    InstrItinData <tc_bfec0f01, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cde8b071, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+    InstrItinData <tc_c4db48cb, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cf47a43f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+    InstrItinData <tc_c4f596e3, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cf59f215, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [2, 2],
+    InstrItinData <tc_c79a189f, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c8ce0b5c, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_cd374165, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d088982c, /*tc_2*/
+    InstrItinData <tc_cf8126ae, /*tc_2*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d1090e34, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+    InstrItinData <tc_cfd8378a, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d24b2d85, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+    InstrItinData <tc_d08ee0f4, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d1aa9eaa, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d580173f, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d2e63d61, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d6bf0472, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d5b7b0c1, /*tc_1*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_d9709180, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d5c0729a, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d9f95eef, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+    InstrItinData <tc_d63f638c, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_d65dbf51, /*tc_latepredstaia*/
+      [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_daa058fa, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [1, 1],
+    InstrItinData <tc_d773585a, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d9d43ecb, /*tc_1*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_dbdffe3d, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_da4a37ed, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e0739b8c, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_da97ee82, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e1e99bfa, /*tc_2early*/
+    InstrItinData <tc_db2bce9c, /*tc_1*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e216a5db, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [4, 2, 2],
+    InstrItinData <tc_de4df740, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e421e012, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_de554571, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_df3319ed, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e7624c08, /*tc_newvjump*/
+    InstrItinData <tc_e06f432a, /*tc_newvjump*/
       [InstrStage<1, [SLOT0]>], [3],
       [Hex_FWD]>,
 
-    InstrItinData <tc_e7d02c66, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_e913dc32, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e4a7f9f0, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e9c822f7, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3],
-      [Hex_FWD]>,
+    InstrItinData <tc_e4b3cb20, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e9fae2d6, /*tc_2early*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+    InstrItinData <tc_e78647bd, /*tc_1*/
+      [InstrStage<1, [SLOT2]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ef52ed71, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e86aa961, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ef84f62f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e93a3d71, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f2704b9a, /*tc_2early*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e95795ec, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f3eaa14b, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e9f3243f, /*tc_latepredldaia*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f47d212f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+    InstrItinData <tc_f429765c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f49e76f4, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_f675fee8, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f7dd9c9f, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_f8e23f0b, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f86c328a, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+    InstrItinData <tc_f9058dd7, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f8eeed7a, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+    InstrItinData <tc_fc3999b4, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_fcc3ddf9, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_fcab4871, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [],
-      []>,
-
-    InstrItinData <tc_ff9ee76e, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 3],
+    InstrItinData <tc_fe211424, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [1, 1],
       [Hex_FWD, Hex_FWD]>
   ];
 }
 
-class DepScalarItinV65 {
-  list<InstrItinData> DepScalarItinV65_list = [
-    InstrItinData <tc_00afc57e, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+class DepScalarItinV66 {
+  list<InstrItinData> DepScalarItinV66_list = [
+    InstrItinData <tc_002cb246, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_0371abea, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 3],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_00e7c26e, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_05c070ec, /*tc_2latepred*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_03220ffa, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_05d3a09b, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_038a1342, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_0663f615, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_04c9decc, /*tc_3stall*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_096199d3, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_05b6c987, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+    InstrItinData <tc_0a705168, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0cd51c76, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_0ae0825c, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0dc560de, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+    InstrItinData <tc_0b2be201, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_0fc1ae07, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_0d8f5752, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_10b97e27, /*tc_3*/
-      [InstrStage<1, [SLOT2]>], [2, 1],
+    InstrItinData <tc_13bfbcf9, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_14b272fa, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_14b5c689, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1372bca1, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [4, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_15aa71c5, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_14cd4cfa, /*tc_2early*/
-      [InstrStage<1, [SLOT2]>], [2],
+    InstrItinData <tc_174516e8, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
       [Hex_FWD]>,
 
-    InstrItinData <tc_15411484, /*tc_3*/
-      [InstrStage<1, [SLOT2]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_17e0d2cd, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_16d0d8d5, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+    InstrItinData <tc_1a2fd869, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_181af5d0, /*tc_1*/
-      [InstrStage<1, [SLOT2]>], [3, 2],
+    InstrItinData <tc_1ad90acd, /*tc_3*/
+      [InstrStage<1, [SLOT2]>], [2, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1853ea6d, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+    InstrItinData <tc_1ae57e39, /*tc_2latepred*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1b82a277, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3],
-      [Hex_FWD]>,
+    InstrItinData <tc_1b6f7cec, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_1b9c9ee5, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_1c4528a2, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1d5a38a8, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+    InstrItinData <tc_1c80410a, /*tc_2*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_1e856f58, /*tc_1*/
+    InstrItinData <tc_1d81e60e, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_1fc97744, /*tc_1*/
       [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_234a11a5, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+    InstrItinData <tc_20cdee80, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_238d91d2, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_29175780, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [4, 2],
+    InstrItinData <tc_2332b92e, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2a160009, /*tc_2early*/
-      [InstrStage<1, [SLOT0]>], [],
-      []>,
+    InstrItinData <tc_24b66c99, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2b2f4060, /*tc_2latepred*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_25a78932, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2b6f77c6, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+    InstrItinData <tc_2b8da4c2, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_2f185f5c, /*tc_3*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_2eabeebe, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_2fc0c436, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+    InstrItinData <tc_2f7c551d, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_2ff964b4, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_351fed2d, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_30b9bb4a, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3669266a, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_32779c6f, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_367f7f3d, /*tc_st*/
+    InstrItinData <tc_36153880, /*tc_newvjump*/
       [InstrStage<1, [SLOT0]>], [],
       []>,
 
-    InstrItinData <tc_36c68ad1, /*tc_ld*/
+    InstrItinData <tc_362c6592, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3962fa26, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_39dfefe8, /*tc_ld*/
       [InstrStage<1, [SLOT0, SLOT1]>], [],
       []>,
 
-    InstrItinData <tc_395dc00f, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 3, 1],
+    InstrItinData <tc_3a867367, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_3b470976, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3bc2c5d3, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_3b5b7ef9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3cb8ea06, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_3bd75825, /*tc_3*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_3d04548d, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_3c76b0ff, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3da80ba5, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [1],
-      [Hex_FWD]>,
+    InstrItinData <tc_3d495a39, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_3e07fb90, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+    InstrItinData <tc_40116ca8, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_41d5298e, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+    InstrItinData <tc_434c8e1e, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_4414d8b1, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_44d3da28, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4403ca65, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+    InstrItinData <tc_4560740b, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_44126683, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 2],
+    InstrItinData <tc_4837eefb, /*tc_3stall*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_452f85af, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+    InstrItinData <tc_49a8207d, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [2],
       [Hex_FWD]>,
 
-    InstrItinData <tc_481e5e5c, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_4ae7b58b, /*tc_3*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_49eb22c8, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_4b68bce4, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4ca572d4, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [],
-      []>,
+    InstrItinData <tc_4c5ba658, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4d9914c9, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_4d5fa3a1, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_4d99bca9, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+    InstrItinData <tc_53559e35, /*tc_latepredstaia*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_56336eb0, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_4f7cd700, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [2, 1],
+    InstrItinData <tc_56f114f4, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_57890846, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_513bef45, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 1],
+    InstrItinData <tc_5a2711e5, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_51b866be, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+    InstrItinData <tc_5abb5e3f, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_523fcf30, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_5aee39f7, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5274e61a, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_5b54b33f, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_52d7bbea, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
-      []>,
+    InstrItinData <tc_5b7c0967, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_53bc8a6a, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_5bf126a6, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 3],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_5d7f5414, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_5ef37dc4, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_6132ba3d, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_53bdb2f6, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 3],
+    InstrItinData <tc_61830035, /*tc_2*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_540fdfbc, /*tc_1*/
+    InstrItinData <tc_640086b5, /*tc_1*/
       [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_55050d58, /*tc_1*/
+    InstrItinData <tc_643b4717, /*tc_1*/
       [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_57288781, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_594ab548, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+    InstrItinData <tc_67435e81, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_59a01ead, /*tc_3stall*/
-      [InstrStage<1, [SLOT2]>], [4, 1, 2],
+    InstrItinData <tc_675e4897, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_679309b8, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5acef64a, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_6b25e783, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_5ba5997d, /*tc_2*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_703e822c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5eb851fc, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [2, 3, 2],
+    InstrItinData <tc_7186d325, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_5f6847a1, /*tc_2latepred*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+    InstrItinData <tc_7646c131, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_60571023, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_609d2efe, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+    InstrItinData <tc_76851da1, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_63fe3df7, /*tc_latepredldaia*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_66888ded, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+    InstrItinData <tc_779080bf, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6792d5ff, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_784490da, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_681a2300, /*tc_3stall*/
-      [InstrStage<1, [SLOT2]>], [2],
-      [Hex_FWD]>,
+    InstrItinData <tc_785f65a7, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_68cb12ce, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_7a91e76a, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6aa5711a, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [4, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_838b34ea, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6ac37025, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_85c9c08f, /*tc_1*/
+      [InstrStage<1, [SLOT2]>], [2, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6ebb4a12, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_85d5d03f, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_6efc556e, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
-      []>,
+    InstrItinData <tc_862b3e70, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_6fa4db47, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_88b4f13d, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_73043bf4, /*tc_1*/
-      [InstrStage<1, [SLOT3]>], [2, 2],
+    InstrItinData <tc_89e94ad3, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_746baa8e, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8b121f4a, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [],
+      []>,
 
-    InstrItinData <tc_74e47fd9, /*tc_latepredstaia*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8b3e402a, /*tc_2latepred*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_7934b9df, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [2, 1],
+    InstrItinData <tc_8c945be0, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_7a830544, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8c99de45, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [],
+      []>,
 
-    InstrItinData <tc_7f881c76, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_8d9d0154, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_84df2cd3, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+    InstrItinData <tc_8fb7ab1b, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_855b0b61, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+    InstrItinData <tc_9461ff31, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_87735c3b, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_897d1a9d, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+    InstrItinData <tc_946df596, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8b15472a, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9ad9998f, /*tc_3stall*/
+      [InstrStage<1, [SLOT3]>], [],
+      []>,
 
-    InstrItinData <tc_8fd5f294, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_9bfd761f, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_8fe6b782, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_9c3ecd83, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_90f3e30c, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+    InstrItinData <tc_9ca930f7, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_976ddc4f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+    InstrItinData <tc_9da59d12, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_97743097, /*tc_1*/
-      [InstrStage<1, [SLOT2]>], [2, 2],
+    InstrItinData <tc_9debc299, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_9e313203, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_994333cd, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [4, 1],
+    InstrItinData <tc_9fc3dae0, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_999d32db, /*tc_3stall*/
-      [InstrStage<1, [SLOT2]>], [1],
+    InstrItinData <tc_a1123dda, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3],
       [Hex_FWD]>,
 
-    InstrItinData <tc_99be14ca, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_9c00ce8d, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_9c98e8af, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+    InstrItinData <tc_a1c00888, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9d5941c7, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_a58fd5cc, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9ef61e5c, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+    InstrItinData <tc_a5d4aeec, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_9faf76ae, /*tc_1*/
-      [InstrStage<1, [SLOT2]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_9fdb5406, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_a6b1eca9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a21dc435, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [4, 1],
+    InstrItinData <tc_a813cf9a, /*tc_2*/
+      [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a27582fa, /*tc_3*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [2],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_a46f0df5, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+    InstrItinData <tc_a9d88b22, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a788683e, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_ae53734a, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_a8acdac0, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+    InstrItinData <tc_b31c2e97, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_a904d137, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b343892a, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 3, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_adb14c66, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b43e7930, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [4, 1],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b13761ae, /*tc_3stall*/
-      [InstrStage<1, [SLOT2]>], [],
+    InstrItinData <tc_b4407292, /*tc_2early*/
+      [InstrStage<1, [SLOT0]>], [],
       []>,
 
-    InstrItinData <tc_b166348b, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+    InstrItinData <tc_b44ecf75, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_b4b5c03a, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b44c6e2a, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b51dc29a, /*tc_1*/
+      [InstrStage<1, [SLOT2]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b77c481f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+    InstrItinData <tc_b83e6d73, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b7dd427e, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b857bf4e, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_b9488031, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_b8bffe55, /*tc_4x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b9c0b731, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b90a29b1, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_b9c4623f, /*tc_2*/
+    InstrItinData <tc_b9272d6c, /*tc_3stall*/
       [InstrStage<1, [SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bad2bcaf, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_b9e09e03, /*tc_3stall*/
+      [InstrStage<1, [SLOT2]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bcc96cee, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+    InstrItinData <tc_bab0eed9, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_bde7aaf4, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 2],
+    InstrItinData <tc_bafaade3, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_be706f30, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_c2f7d806, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
-      [Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_c5e2426d, /*tc_3stall*/
-      [InstrStage<1, [SLOT3]>], [2, 2],
+    InstrItinData <tc_bcf98408, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [4, 1],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6aa82f7, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_bd8382d1, /*tc_newvjump*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6ce9b3f, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+    InstrItinData <tc_bdceeac1, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c6ebf8dd, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+    InstrItinData <tc_be9602ff, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c74f796f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+    InstrItinData <tc_bf061958, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_c82dc1ff, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [1],
-      [Hex_FWD]>,
-
-    InstrItinData <tc_caaebcba, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_cd7374a0, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+    InstrItinData <tc_bfec0f01, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cde8b071, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+    InstrItinData <tc_c4db48cb, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cf47a43f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+    InstrItinData <tc_c4f596e3, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_cf59f215, /*tc_3x*/
-      [InstrStage<1, [SLOT3]>], [2, 2],
+    InstrItinData <tc_c79a189f, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_c8ce0b5c, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_cd374165, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d088982c, /*tc_2*/
+    InstrItinData <tc_cf8126ae, /*tc_2*/
       [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d1090e34, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+    InstrItinData <tc_cfd8378a, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d24b2d85, /*tc_latepredstaia*/
-      [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+    InstrItinData <tc_d08ee0f4, /*tc_2*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d1aa9eaa, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d580173f, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d2e63d61, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d6bf0472, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d5b7b0c1, /*tc_1*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_d9709180, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_d5c0729a, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_d9f95eef, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+    InstrItinData <tc_d63f638c, /*tc_ld*/
+      [InstrStage<1, [SLOT0]>], [1],
+      [Hex_FWD]>,
+
+    InstrItinData <tc_d65dbf51, /*tc_latepredstaia*/
+      [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_daa058fa, /*tc_3stall*/
-      [InstrStage<1, [SLOT0]>], [1, 1],
+    InstrItinData <tc_d773585a, /*tc_3x*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_d9d43ecb, /*tc_1*/
+      [InstrStage<1, [SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_dbdffe3d, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+    InstrItinData <tc_da4a37ed, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e0739b8c, /*tc_1*/
-      [InstrStage<1, [SLOT2]>], [2, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_da97ee82, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e1e99bfa, /*tc_1*/
+    InstrItinData <tc_db2bce9c, /*tc_1*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e216a5db, /*tc_ld*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2],
+    InstrItinData <tc_de4df740, /*tc_1*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e421e012, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_de554571, /*tc_1*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+      [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e7624c08, /*tc_newvjump*/
+    InstrItinData <tc_df3319ed, /*tc_3x*/
+      [InstrStage<1, [SLOT3]>], [2, 1],
+      [Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_e06f432a, /*tc_newvjump*/
       [InstrStage<1, [SLOT0]>], [3],
       [Hex_FWD]>,
 
-    InstrItinData <tc_e7d02c66, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_e913dc32, /*tc_3x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e4a7f9f0, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e9c822f7, /*tc_2latepred*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4],
-      [Hex_FWD]>,
+    InstrItinData <tc_e4b3cb20, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_e9fae2d6, /*tc_1*/
-      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+    InstrItinData <tc_e78647bd, /*tc_1*/
+      [InstrStage<1, [SLOT2]>], [2, 2],
       [Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ef52ed71, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e86aa961, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_ef84f62f, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e93a3d71, /*tc_ld*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f2704b9a, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e95795ec, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f3eaa14b, /*tc_4x*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
-      [Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_e9f3243f, /*tc_latepredldaia*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f47d212f, /*tc_ld*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+    InstrItinData <tc_f429765c, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f49e76f4, /*tc_2*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
-    InstrItinData <tc_f7dd9c9f, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [1, 2, 3],
-      [Hex_FWD, Hex_FWD, Hex_FWD]>,
+    InstrItinData <tc_f675fee8, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f86c328a, /*tc_st*/
-      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+    InstrItinData <tc_f8e23f0b, /*tc_st*/
+      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_f8eeed7a, /*tc_1*/
-      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+    InstrItinData <tc_f9058dd7, /*tc_2*/
+      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
       [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
 
-    InstrItinData <tc_fcab4871, /*tc_newvjump*/
-      [InstrStage<1, [SLOT0]>], [],
-      []>,
+    InstrItinData <tc_fc3999b4, /*tc_2early*/
+      [InstrStage<1, [SLOT2]>], [2],
+      [Hex_FWD]>,
 
-    InstrItinData <tc_ff9ee76e, /*tc_st*/
-      [InstrStage<1, [SLOT0]>], [2, 3],
+    InstrItinData <tc_fcc3ddf9, /*tc_st*/
+      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+    InstrItinData <tc_fe211424, /*tc_3stall*/
+      [InstrStage<1, [SLOT0]>], [1, 1],
       [Hex_FWD, Hex_FWD]>
   ];
 }
diff --git a/lib/Target/Hexagon/HexagonDepITypes.h b/lib/Target/Hexagon/HexagonDepITypes.h
index 7e06ccede6e7af4f86ffedfbcef4f8215c06910e..81e3971e21d20ae17391759b0a9af940bca4087e 100644
--- a/lib/Target/Hexagon/HexagonDepITypes.h
+++ b/lib/Target/Hexagon/HexagonDepITypes.h
@@ -1,4 +1,4 @@
-//===- HexagonDepITypes.h -------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,7 +9,6 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
 namespace llvm {
 namespace HexagonII {
 enum Type {
@@ -45,6 +44,7 @@ enum Type {
   TypeCVI_VX = 29,
   TypeCVI_VX_DV = 30,
   TypeCVI_VX_LATE = 31,
+  TypeCVI_ZW = 32,
   TypeDUPLEX = 33,
   TypeENDLOOP = 34,
   TypeEXTENDER = 35,
@@ -59,7 +59,7 @@ enum Type {
   TypeS_2op = 44,
   TypeS_3op = 45,
   TypeV2LDST = 48,
-  TypeV4LDST = 49
+  TypeV4LDST = 49,
 };
 }
 }
diff --git a/lib/Target/Hexagon/HexagonDepITypes.td b/lib/Target/Hexagon/HexagonDepITypes.td
index 0a385bf938fed1c9f6625107c04d5f1d47244f9f..f694062a5232f11e3ae560b34ea0ccb995dcc370 100644
--- a/lib/Target/Hexagon/HexagonDepITypes.td
+++ b/lib/Target/Hexagon/HexagonDepITypes.td
@@ -1,4 +1,4 @@
-//===- HexagonDepITypes.td ------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,8 +9,7 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
-class IType<bits<6> t> { bits<6> Value = t; }
+class IType<bits<7> t> { bits<7> Value = t; }
 def TypeALU32_2op : IType<0>;
 def TypeALU32_3op : IType<1>;
 def TypeALU32_ADDI : IType<2>;
@@ -43,6 +42,7 @@ def TypeCVI_VS_VX : IType<28>;
 def TypeCVI_VX : IType<29>;
 def TypeCVI_VX_DV : IType<30>;
 def TypeCVI_VX_LATE : IType<31>;
+def TypeCVI_ZW : IType<32>;
 def TypeDUPLEX : IType<33>;
 def TypeENDLOOP : IType<34>;
 def TypeEXTENDER : IType<35>;
diff --git a/lib/Target/Hexagon/HexagonDepInstrFormats.td b/lib/Target/Hexagon/HexagonDepInstrFormats.td
index 9f98da3a1deedc8c3e6f8d29815871451e35f9cb..ffe212ef9d97bc4bef363596558b1202cd8edbc6 100644
--- a/lib/Target/Hexagon/HexagonDepInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -1,4 +1,4 @@
-//===- HexagonDepInstrFormats.td ------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,7 +9,6 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
 class Enc_890909 : OpcodeHexagon {
   bits <5> Rs32;
   let Inst{20-16} = Rs32{4-0};
@@ -61,14 +60,6 @@ class Enc_27b757 : OpcodeHexagon {
   bits <5> Vs32;
   let Inst{4-0} = Vs32{4-0};
 }
-class Enc_8d04c3 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_1de724 : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -87,12 +78,6 @@ class Enc_0e41fa : OpcodeHexagon {
   bits <5> Vd32;
   let Inst{4-0} = Vd32{4-0};
 }
-class Enc_2a736a : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_3d6d37 : OpcodeHexagon {
   bits <2> Qs4;
   let Inst{6-5} = Qs4{1-0};
@@ -121,14 +106,6 @@ class Enc_802dc0 : OpcodeHexagon {
   bits <2> Qv4;
   let Inst{23-22} = Qv4{1-0};
 }
-class Enc_6a4549 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_6b197f : OpcodeHexagon {
   bits <4> Ii;
   let Inst{8-5} = Ii{3-0};
@@ -137,22 +114,6 @@ class Enc_6b197f : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_1f3376 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <5> Vxx32;
-  let Inst{7-3} = Vxx32{4-0};
-}
-class Enc_1f5d8f : OpcodeHexagon {
-  bits <1> Mu2;
-  let Inst{13-13} = Mu2{0-0};
-  bits <5> Ryy32;
-  let Inst{4-0} = Ryy32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_51436c : OpcodeHexagon {
   bits <16> Ii;
   let Inst{23-22} = Ii{15-14};
@@ -249,6 +210,14 @@ class Enc_d7dc10 : OpcodeHexagon {
   bits <2> Pd4;
   let Inst{1-0} = Pd4{1-0};
 }
+class Enc_6baed4 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
 class Enc_736575 : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -291,14 +260,6 @@ class Enc_509701 : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
-class Enc_c84567 : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_830e5d : OpcodeHexagon {
   bits <8> Ii;
   let Inst{12-5} = Ii{7-0};
@@ -310,12 +271,6 @@ class Enc_830e5d : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_ae0040 : OpcodeHexagon {
-  bits <5> Rs32;
-  let Inst{20-16} = Rs32{4-0};
-  bits <6> Sd64;
-  let Inst{5-0} = Sd64{5-0};
-}
 class Enc_79b8c8 : OpcodeHexagon {
   bits <6> Ii;
   let Inst{6-3} = Ii{5-2};
@@ -336,16 +291,6 @@ class Enc_58a8bf : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_e8ddd5 : OpcodeHexagon {
-  bits <16> Ii;
-  let Inst{21-21} = Ii{15-15};
-  let Inst{13-8} = Ii{14-9};
-  let Inst{2-0} = Ii{8-6};
-  bits <5> Vss32;
-  let Inst{7-3} = Vss32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_041d7b : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -369,14 +314,6 @@ class Enc_f44229 : OpcodeHexagon {
   bits <3> Nt8;
   let Inst{10-8} = Nt8{2-0};
 }
-class Enc_fc563d : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_aad80c : OpcodeHexagon {
   bits <5> Vuu32;
   let Inst{12-8} = Vuu32{4-0};
@@ -434,6 +371,14 @@ class Enc_ee5ed0 : OpcodeHexagon {
   bits <2> n1;
   let Inst{9-8} = n1{1-0};
 }
+class Enc_bddee3 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vyyyy32;
+  let Inst{4-0} = Vyyyy32{4-0};
+  bits <3> Rx8;
+  let Inst{18-16} = Rx8{2-0};
+}
 class Enc_935d9b : OpcodeHexagon {
   bits <5> Ii;
   let Inst{6-3} = Ii{4-1};
@@ -573,6 +518,14 @@ class Enc_27fd0e : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
+class Enc_d7bc34 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <3> Rt8;
+  let Inst{18-16} = Rt8{2-0};
+  bits <5> Vyyyy32;
+  let Inst{4-0} = Vyyyy32{4-0};
+}
 class Enc_93af4c : OpcodeHexagon {
   bits <7> Ii;
   let Inst{10-4} = Ii{6-0};
@@ -620,12 +573,6 @@ class Enc_14640c : OpcodeHexagon {
   let Inst{24-22} = n1{3-1};
   let Inst{13-13} = n1{0-0};
 }
-class Enc_2516bf : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_31db33 : OpcodeHexagon {
   bits <2> Qt4;
   let Inst{6-5} = Qt4{1-0};
@@ -656,24 +603,6 @@ class Enc_784502 : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_9a9d62 : OpcodeHexagon {
-  bits <1> Mu2;
-  let Inst{13-13} = Mu2{0-0};
-  bits <5> Rt32;
-  let Inst{12-8} = Rt32{4-0};
-  bits <5> Vs32;
-  let Inst{7-3} = Vs32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
-class Enc_3a81ac : OpcodeHexagon {
-  bits <1> Mu2;
-  let Inst{13-13} = Mu2{0-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_6413b6 : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -703,13 +632,13 @@ class Enc_84bff1 : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
-class Enc_74aef2 : OpcodeHexagon {
+class Enc_f4413a : OpcodeHexagon {
   bits <4> Ii;
   let Inst{8-5} = Ii{3-0};
-  bits <1> Mu2;
-  let Inst{13-13} = Mu2{0-0};
-  bits <5> Ryy32;
-  let Inst{4-0} = Ryy32{4-0};
+  bits <2> Pt4;
+  let Inst{10-9} = Pt4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
@@ -753,16 +682,6 @@ class Enc_e39bb2 : OpcodeHexagon {
   bits <4> Rd16;
   let Inst{3-0} = Rd16{3-0};
 }
-class Enc_7db2f8 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{13-9} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{8-4} = Vv32{4-0};
-  bits <4> Vdd16;
-  let Inst{3-0} = Vdd16{3-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_1b64fb : OpcodeHexagon {
   bits <16> Ii;
   let Inst{26-25} = Ii{15-14};
@@ -772,6 +691,16 @@ class Enc_1b64fb : OpcodeHexagon {
   bits <5> Rt32;
   let Inst{12-8} = Rt32{4-0};
 }
+class Enc_c1d806 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+  bits <2> Qe4;
+  let Inst{6-5} = Qe4{1-0};
+}
 class Enc_c6220b : OpcodeHexagon {
   bits <2> Ii;
   let Inst{13-13} = Ii{1-1};
@@ -841,10 +770,6 @@ class Enc_fcf7a7 : OpcodeHexagon {
   bits <2> Pd4;
   let Inst{1-0} = Pd4{1-0};
 }
-class Enc_2c3281 : OpcodeHexagon {
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_55355c : OpcodeHexagon {
   bits <2> Ii;
   let Inst{13-13} = Ii{1-1};
@@ -877,6 +802,16 @@ class Enc_6185fe : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
+class Enc_74aef2 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{8-5} = Ii{3-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
 class Enc_cd4705 : OpcodeHexagon {
   bits <3> Ii;
   let Inst{7-5} = Ii{2-0};
@@ -920,10 +855,6 @@ class Enc_fef969 : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_b2ffce : OpcodeHexagon {
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_63eaeb : OpcodeHexagon {
   bits <2> Ii;
   let Inst{1-0} = Ii{1-0};
@@ -948,12 +879,6 @@ class Enc_372c9d : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_9e9047 : OpcodeHexagon {
-  bits <2> Pt4;
-  let Inst{9-8} = Pt4{1-0};
-  bits <5> Rs32;
-  let Inst{20-16} = Rs32{4-0};
-}
 class Enc_4dff07 : OpcodeHexagon {
   bits <2> Qv4;
   let Inst{12-11} = Qv4{1-0};
@@ -1000,16 +925,6 @@ class Enc_b388cf : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_880793 : OpcodeHexagon {
-  bits <3> Qt8;
-  let Inst{2-0} = Qt8{2-0};
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_ad1c74 : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -1086,14 +1001,6 @@ class Enc_88d4d9 : OpcodeHexagon {
   bits <5> Rs32;
   let Inst{20-16} = Rs32{4-0};
 }
-class Enc_c0cdde : OpcodeHexagon {
-  bits <9> Ii;
-  let Inst{13-5} = Ii{8-0};
-  bits <5> Rs32;
-  let Inst{20-16} = Rs32{4-0};
-  bits <2> Pd4;
-  let Inst{1-0} = Pd4{1-0};
-}
 class Enc_226535 : OpcodeHexagon {
   bits <8> Ii;
   let Inst{12-7} = Ii{7-2};
@@ -1102,14 +1009,6 @@ class Enc_226535 : OpcodeHexagon {
   bits <5> Rt32;
   let Inst{4-0} = Rt32{4-0};
 }
-class Enc_96f0fd : OpcodeHexagon {
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vx32;
-  let Inst{7-3} = Vx32{4-0};
-  bits <3> Qdd8;
-  let Inst{2-0} = Qdd8{2-0};
-}
 class Enc_31aa6a : OpcodeHexagon {
   bits <5> Ii;
   let Inst{6-3} = Ii{4-1};
@@ -1120,12 +1019,6 @@ class Enc_31aa6a : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_932b58 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-}
 class Enc_397f23 : OpcodeHexagon {
   bits <8> Ii;
   let Inst{13-13} = Ii{7-7};
@@ -1192,14 +1085,6 @@ class Enc_01d3d0 : OpcodeHexagon {
   bits <5> Vdd32;
   let Inst{4-0} = Vdd32{4-0};
 }
-class Enc_3126d7 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_b0e9d8 : OpcodeHexagon {
   bits <10> Ii;
   let Inst{21-21} = Ii{9-9};
@@ -1209,6 +1094,14 @@ class Enc_b0e9d8 : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{4-0} = Rx32{4-0};
 }
+class Enc_1bd127 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <3> Rt8;
+  let Inst{18-16} = Rt8{2-0};
+  bits <5> Vdddd32;
+  let Inst{4-0} = Vdddd32{4-0};
+}
 class Enc_3694bd : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -1276,12 +1169,6 @@ class Enc_88c16c : OpcodeHexagon {
   bits <5> Rxx32;
   let Inst{4-0} = Rxx32{4-0};
 }
-class Enc_e7408c : OpcodeHexagon {
-  bits <6> Sss64;
-  let Inst{21-16} = Sss64{5-0};
-  bits <5> Rdd32;
-  let Inst{4-0} = Rdd32{4-0};
-}
 class Enc_770858 : OpcodeHexagon {
   bits <2> Ps4;
   let Inst{6-5} = Ps4{1-0};
@@ -1323,15 +1210,14 @@ class Enc_412ff0 : OpcodeHexagon {
   bits <5> Rxx32;
   let Inst{12-8} = Rxx32{4-0};
 }
-class Enc_8e9fbd : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-  bits <5> Vy32;
-  let Inst{12-8} = Vy32{4-0};
+class Enc_ef601b : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
 }
 class Enc_c9a18e : OpcodeHexagon {
   bits <11> Ii;
@@ -1356,19 +1242,6 @@ class Enc_e6abcf : OpcodeHexagon {
   bits <5> Rtt32;
   let Inst{12-8} = Rtt32{4-0};
 }
-class Enc_6339d5 : OpcodeHexagon {
-  bits <2> Ii;
-  let Inst{13-13} = Ii{1-1};
-  let Inst{7-7} = Ii{0-0};
-  bits <2> Pv4;
-  let Inst{6-5} = Pv4{1-0};
-  bits <5> Rs32;
-  let Inst{20-16} = Rs32{4-0};
-  bits <5> Ru32;
-  let Inst{12-8} = Ru32{4-0};
-  bits <5> Rt32;
-  let Inst{4-0} = Rt32{4-0};
-}
 class Enc_d6990d : OpcodeHexagon {
   bits <5> Vuu32;
   let Inst{12-8} = Vuu32{4-0};
@@ -1377,16 +1250,6 @@ class Enc_d6990d : OpcodeHexagon {
   bits <5> Vxx32;
   let Inst{4-0} = Vxx32{4-0};
 }
-class Enc_6c4697 : OpcodeHexagon {
-  bits <1> Mu2;
-  let Inst{13-13} = Mu2{0-0};
-  bits <5> Rt32;
-  let Inst{12-8} = Rt32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_6c9440 : OpcodeHexagon {
   bits <10> Ii;
   let Inst{21-21} = Ii{9-9};
@@ -1445,15 +1308,13 @@ class Enc_9d1247 : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_f4413a : OpcodeHexagon {
-  bits <4> Ii;
-  let Inst{8-5} = Ii{3-0};
-  bits <2> Pt4;
-  let Inst{10-9} = Pt4{1-0};
-  bits <5> Rd32;
-  let Inst{4-0} = Rd32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
+class Enc_7b7ba8 : OpcodeHexagon {
+  bits <2> Qu4;
+  let Inst{9-8} = Qu4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
 }
 class Enc_f7430e : OpcodeHexagon {
   bits <4> Ii;
@@ -1531,12 +1392,6 @@ class Enc_a803e0 : OpcodeHexagon {
   bits <5> Rs32;
   let Inst{20-16} = Rs32{4-0};
 }
-class Enc_fde0e3 : OpcodeHexagon {
-  bits <5> Rtt32;
-  let Inst{20-16} = Rtt32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_45364e : OpcodeHexagon {
   bits <5> Vu32;
   let Inst{12-8} = Vu32{4-0};
@@ -1557,12 +1412,6 @@ class Enc_b909d2 : OpcodeHexagon {
   let Inst{13-13} = n1{1-1};
   let Inst{8-8} = n1{0-0};
 }
-class Enc_790d6e : OpcodeHexagon {
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_e6c957 : OpcodeHexagon {
   bits <10> Ii;
   let Inst{21-21} = Ii{9-9};
@@ -1570,15 +1419,6 @@ class Enc_e6c957 : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
-class Enc_fa3ba4 : OpcodeHexagon {
-  bits <14> Ii;
-  let Inst{26-25} = Ii{13-12};
-  let Inst{13-5} = Ii{11-3};
-  bits <5> Rs32;
-  let Inst{20-16} = Rs32{4-0};
-  bits <5> Rdd32;
-  let Inst{4-0} = Rdd32{4-0};
-}
 class Enc_0d8870 : OpcodeHexagon {
   bits <12> Ii;
   let Inst{26-25} = Ii{11-10};
@@ -1623,14 +1463,6 @@ class Enc_0ed752 : OpcodeHexagon {
   bits <5> Cdd32;
   let Inst{4-0} = Cdd32{4-0};
 }
-class Enc_908985 : OpcodeHexagon {
-  bits <1> Mu2;
-  let Inst{13-13} = Mu2{0-0};
-  bits <5> Vss32;
-  let Inst{7-3} = Vss32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_143445 : OpcodeHexagon {
   bits <13> Ii;
   let Inst{26-25} = Ii{12-11};
@@ -1658,16 +1490,6 @@ class Enc_3e3989 : OpcodeHexagon {
   let Inst{25-22} = n1{4-1};
   let Inst{8-8} = n1{0-0};
 }
-class Enc_12dd8f : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <5> Vx32;
-  let Inst{7-3} = Vx32{4-0};
-}
 class Enc_152467 : OpcodeHexagon {
   bits <5> Ii;
   let Inst{8-5} = Ii{4-1};
@@ -1676,30 +1498,31 @@ class Enc_152467 : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_6b1bc4 : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <3> Qt8;
-  let Inst{10-8} = Qt8{2-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_daea09 : OpcodeHexagon {
-  bits <17> Ii;
-  let Inst{23-22} = Ii{16-15};
-  let Inst{20-16} = Ii{14-10};
-  let Inst{13-13} = Ii{9-9};
-  let Inst{7-1} = Ii{8-2};
+class Enc_9ac432 : OpcodeHexagon {
+  bits <2> Ps4;
+  let Inst{17-16} = Ps4{1-0};
+  bits <2> Pt4;
+  let Inst{9-8} = Pt4{1-0};
   bits <2> Pu4;
-  let Inst{9-8} = Pu4{1-0};
+  let Inst{7-6} = Pu4{1-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
 }
-class Enc_f37377 : OpcodeHexagon {
-  bits <8> Ii;
-  let Inst{12-7} = Ii{7-2};
-  bits <8> II;
-  let Inst{13-13} = II{7-7};
-  let Inst{6-0} = II{6-0};
-  bits <5> Rs32;
+class Enc_a90628 : OpcodeHexagon {
+  bits <2> Qv4;
+  let Inst{23-22} = Qv4{1-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_f37377 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-7} = Ii{7-2};
+  bits <8> II;
+  let Inst{13-13} = II{7-7};
+  let Inst{6-0} = II{6-0};
+  bits <5> Rs32;
   let Inst{20-16} = Rs32{4-0};
 }
 class Enc_a198f6 : OpcodeHexagon {
@@ -1712,12 +1535,6 @@ class Enc_a198f6 : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_a265b7 : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_4e4a80 : OpcodeHexagon {
   bits <2> Qs4;
   let Inst{6-5} = Qs4{1-0};
@@ -1728,16 +1545,6 @@ class Enc_4e4a80 : OpcodeHexagon {
   bits <5> Vvv32;
   let Inst{4-0} = Vvv32{4-0};
 }
-class Enc_8d5d98 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <5> Vxx32;
-  let Inst{7-3} = Vxx32{4-0};
-}
 class Enc_3dac0b : OpcodeHexagon {
   bits <2> Qt4;
   let Inst{6-5} = Qt4{1-0};
@@ -1780,16 +1587,6 @@ class Enc_2df31d : OpcodeHexagon {
   bits <4> Rd16;
   let Inst{3-0} = Rd16{3-0};
 }
-class Enc_b0e553 : OpcodeHexagon {
-  bits <16> Ii;
-  let Inst{21-21} = Ii{15-15};
-  let Inst{13-8} = Ii{14-9};
-  let Inst{2-0} = Ii{8-6};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_25bef0 : OpcodeHexagon {
   bits <16> Ii;
   let Inst{26-25} = Ii{15-14};
@@ -1905,10 +1702,14 @@ class Enc_bd1cbc : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_d0fe02 : OpcodeHexagon {
-  bits <5> Rxx32;
-  let Inst{20-16} = Rxx32{4-0};
-  bits <0> sgp10;
+class Enc_c85e2a : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> II;
+  let Inst{22-21} = II{4-3};
+  let Inst{7-5} = II{2-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
 }
 class Enc_a30110 : OpcodeHexagon {
   bits <5> Vu32;
@@ -1920,24 +1721,12 @@ class Enc_a30110 : OpcodeHexagon {
   bits <5> Vd32;
   let Inst{4-0} = Vd32{4-0};
 }
-class Enc_f3f408 : OpcodeHexagon {
-  bits <4> Ii;
-  let Inst{13-13} = Ii{3-3};
-  let Inst{10-8} = Ii{2-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vd32;
-  let Inst{4-0} = Vd32{4-0};
-}
-class Enc_ce4c54 : OpcodeHexagon {
-  bits <16> Ii;
-  let Inst{21-21} = Ii{15-15};
-  let Inst{13-8} = Ii{14-9};
-  let Inst{2-0} = Ii{8-6};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
+class Enc_33f8ba : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-8} = Ii{7-3};
+  let Inst{4-2} = Ii{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
 }
 class Enc_690862 : OpcodeHexagon {
   bits <13> Ii;
@@ -1949,20 +1738,6 @@ class Enc_690862 : OpcodeHexagon {
   bits <3> Nt8;
   let Inst{10-8} = Nt8{2-0};
 }
-class Enc_e570b0 : OpcodeHexagon {
-  bits <5> Rtt32;
-  let Inst{20-16} = Rtt32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_3c46e8 : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{12-8} = Vuu32{4-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_2a3787 : OpcodeHexagon {
   bits <13> Ii;
   let Inst{26-25} = Ii{12-11};
@@ -2010,22 +1785,6 @@ class Enc_729ff7 : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
-class Enc_5883d0 : OpcodeHexagon {
-  bits <16> Ii;
-  let Inst{21-21} = Ii{15-15};
-  let Inst{13-8} = Ii{14-9};
-  let Inst{2-0} = Ii{8-6};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_ff0e49 : OpcodeHexagon {
-  bits <5> Rss32;
-  let Inst{20-16} = Rss32{4-0};
-  bits <6> Sdd64;
-  let Inst{5-0} = Sdd64{5-0};
-}
 class Enc_217147 : OpcodeHexagon {
   bits <2> Qv4;
   let Inst{23-22} = Qv4{1-0};
@@ -2060,14 +1819,6 @@ class Enc_541f26 : OpcodeHexagon {
   bits <5> Rt32;
   let Inst{12-8} = Rt32{4-0};
 }
-class Enc_9aae4a : OpcodeHexagon {
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vx32;
-  let Inst{7-3} = Vx32{4-0};
-  bits <3> Qd8;
-  let Inst{2-0} = Qd8{2-0};
-}
 class Enc_724154 : OpcodeHexagon {
   bits <6> II;
   let Inst{5-0} = II{5-0};
@@ -2114,16 +1865,6 @@ class Enc_b84c4c : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
-class Enc_9ac432 : OpcodeHexagon {
-  bits <2> Ps4;
-  let Inst{17-16} = Ps4{1-0};
-  bits <2> Pt4;
-  let Inst{9-8} = Pt4{1-0};
-  bits <2> Pu4;
-  let Inst{7-6} = Pu4{1-0};
-  bits <2> Pd4;
-  let Inst{1-0} = Pd4{1-0};
-}
 class Enc_8203bb : OpcodeHexagon {
   bits <6> Ii;
   let Inst{12-7} = Ii{5-0};
@@ -2228,12 +1969,6 @@ class Enc_96ce4f : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_2bbae6 : OpcodeHexagon {
-  bits <6> Ss64;
-  let Inst{21-16} = Ss64{5-0};
-  bits <5> Rd32;
-  let Inst{4-0} = Rd32{4-0};
-}
 class Enc_143a3c : OpcodeHexagon {
   bits <6> Ii;
   let Inst{13-8} = Ii{5-0};
@@ -2281,13 +2016,14 @@ class Enc_de0214 : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_a90628 : OpcodeHexagon {
-  bits <2> Qv4;
-  let Inst{23-22} = Qv4{1-0};
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
-  bits <5> Vx32;
-  let Inst{4-0} = Vx32{4-0};
+class Enc_daea09 : OpcodeHexagon {
+  bits <17> Ii;
+  let Inst{23-22} = Ii{16-15};
+  let Inst{20-16} = Ii{14-10};
+  let Inst{13-13} = Ii{9-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <2> Pu4;
+  let Inst{9-8} = Pu4{1-0};
 }
 class Enc_fda92c : OpcodeHexagon {
   bits <17> Ii;
@@ -2365,26 +2101,6 @@ class Enc_b43b67 : OpcodeHexagon {
   bits <2> Qx4;
   let Inst{6-5} = Qx4{1-0};
 }
-class Enc_1cd70f : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
-class Enc_3a527f : OpcodeHexagon {
-  bits <16> Ii;
-  let Inst{21-21} = Ii{15-15};
-  let Inst{13-8} = Ii{14-9};
-  let Inst{2-0} = Ii{8-6};
-  bits <5> Vs32;
-  let Inst{7-3} = Vs32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_4aca3a : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -2403,12 +2119,6 @@ class Enc_b38ffc : OpcodeHexagon {
   bits <4> Rt16;
   let Inst{3-0} = Rt16{3-0};
 }
-class Enc_5c3a80 : OpcodeHexagon {
-  bits <3> Qt8;
-  let Inst{10-8} = Qt8{2-0};
-  bits <3> Qd8;
-  let Inst{5-3} = Qd8{2-0};
-}
 class Enc_cda00a : OpcodeHexagon {
   bits <12> Ii;
   let Inst{19-16} = Ii{11-8};
@@ -2426,24 +2136,6 @@ class Enc_2fbf3c : OpcodeHexagon {
   bits <4> Rd16;
   let Inst{3-0} = Rd16{3-0};
 }
-class Enc_a4ae28 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <3> Qd8;
-  let Inst{5-3} = Qd8{2-0};
-}
-class Enc_dd5f9f : OpcodeHexagon {
-  bits <3> Qtt8;
-  let Inst{2-0} = Qtt8{2-0};
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <5> Vvv32;
-  let Inst{12-8} = Vvv32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_70b24b : OpcodeHexagon {
   bits <6> Ii;
   let Inst{8-5} = Ii{5-2};
@@ -2490,16 +2182,6 @@ class Enc_08d755 : OpcodeHexagon {
   bits <2> Pd4;
   let Inst{1-0} = Pd4{1-0};
 }
-class Enc_a7ca29 : OpcodeHexagon {
-  bits <3> Qt8;
-  let Inst{2-0} = Qt8{2-0};
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_1178da : OpcodeHexagon {
   bits <3> Ii;
   let Inst{7-5} = Ii{2-0};
@@ -2518,14 +2200,6 @@ class Enc_8dbe85 : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_17a474 : OpcodeHexagon {
-  bits <1> Mu2;
-  let Inst{13-13} = Mu2{0-0};
-  bits <5> Vs32;
-  let Inst{7-3} = Vs32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_5a18b3 : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -2586,14 +2260,6 @@ class Enc_12b6e9 : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
-class Enc_9a895f : OpcodeHexagon {
-  bits <1> Mu2;
-  let Inst{13-13} = Mu2{0-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_6f70ca : OpcodeHexagon {
   bits <8> Ii;
   let Inst{8-4} = Ii{7-3};
@@ -2605,12 +2271,7 @@ class Enc_7222b7 : OpcodeHexagon {
   let Inst{1-0} = Qd4{1-0};
 }
 class Enc_e3b0c4 : OpcodeHexagon {
-}
-class Enc_d7e8ba : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
+
 }
 class Enc_a255dc : OpcodeHexagon {
   bits <3> Ii;
@@ -2628,16 +2289,6 @@ class Enc_cb785b : OpcodeHexagon {
   bits <5> Vdd32;
   let Inst{4-0} = Vdd32{4-0};
 }
-class Enc_5b76ab : OpcodeHexagon {
-  bits <10> Ii;
-  let Inst{21-21} = Ii{9-9};
-  let Inst{13-8} = Ii{8-3};
-  let Inst{2-0} = Ii{2-0};
-  bits <5> Vs32;
-  let Inst{7-3} = Vs32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_cb4b4e : OpcodeHexagon {
   bits <2> Pu4;
   let Inst{6-5} = Pu4{1-0};
@@ -2648,23 +2299,13 @@ class Enc_cb4b4e : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
-class Enc_fbacc2 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <5> Vxx32;
-  let Inst{7-3} = Vxx32{4-0};
-  bits <5> Vy32;
-  let Inst{12-8} = Vy32{4-0};
-}
-class Enc_2ad23d : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <5> Vx32;
-  let Inst{7-3} = Vx32{4-0};
+class Enc_1f5d8f : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
 }
 class Enc_9cdba7 : OpcodeHexagon {
   bits <8> Ii;
@@ -2683,10 +2324,6 @@ class Enc_5cd7e9 : OpcodeHexagon {
   bits <5> Ryy32;
   let Inst{4-0} = Ryy32{4-0};
 }
-class Enc_e7c9de : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-}
 class Enc_454a26 : OpcodeHexagon {
   bits <2> Pt4;
   let Inst{9-8} = Pt4{1-0};
@@ -2786,14 +2423,6 @@ class Enc_d2c7f1 : OpcodeHexagon {
   bits <2> Pe4;
   let Inst{6-5} = Pe4{1-0};
 }
-class Enc_dcfcbb : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vvv32;
-  let Inst{12-8} = Vvv32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_3680c2 : OpcodeHexagon {
   bits <7> Ii;
   let Inst{11-5} = Ii{6-0};
@@ -2822,31 +2451,13 @@ class Enc_e957fb : OpcodeHexagon {
   bits <5> Rt32;
   let Inst{12-8} = Rt32{4-0};
 }
-class Enc_2146c1 : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <5> Vvv32;
-  let Inst{12-8} = Vvv32{4-0};
-  bits <3> Qss8;
-  let Inst{2-0} = Qss8{2-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
-class Enc_a662ae : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <5> Vvv32;
-  let Inst{12-8} = Vvv32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_8f7cc3 : OpcodeHexagon {
-  bits <3> Qtt8;
-  let Inst{10-8} = Qtt8{2-0};
-  bits <3> Qdd8;
-  let Inst{5-3} = Qdd8{2-0};
+class Enc_c0cdde : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
 }
 class Enc_c9e3bc : OpcodeHexagon {
   bits <4> Ii;
@@ -2886,33 +2497,18 @@ class Enc_6f83e7 : OpcodeHexagon {
   bits <5> Vd32;
   let Inst{4-0} = Vd32{4-0};
 }
-class Enc_46f33d : OpcodeHexagon {
-  bits <5> Rss32;
-  let Inst{20-16} = Rss32{4-0};
-  bits <5> Rt32;
-  let Inst{12-8} = Rt32{4-0};
-}
-class Enc_c1652e : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
+class Enc_6339d5 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ru32;
+  let Inst{12-8} = Ru32{4-0};
   bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <3> Qd8;
-  let Inst{5-3} = Qd8{2-0};
-}
-class Enc_b5b643 : OpcodeHexagon {
-  bits <5> Rtt32;
-  let Inst{20-16} = Rtt32{4-0};
-  bits <5> Vx32;
-  let Inst{7-3} = Vx32{4-0};
-}
-class Enc_85daf5 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
-  bits <5> Rtt32;
-  let Inst{20-16} = Rtt32{4-0};
-  bits <5> Vx32;
-  let Inst{7-3} = Vx32{4-0};
+  let Inst{4-0} = Rt32{4-0};
 }
 class Enc_d483b9 : OpcodeHexagon {
   bits <1> Ii;
@@ -2952,13 +2548,14 @@ class Enc_6c9ee0 : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_72a92d : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{12-8} = Vuu32{4-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vxx32;
-  let Inst{7-3} = Vxx32{4-0};
+class Enc_fa3ba4 : OpcodeHexagon {
+  bits <14> Ii;
+  let Inst{26-25} = Ii{13-12};
+  let Inst{13-5} = Ii{11-3};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
 }
 class Enc_44661f : OpcodeHexagon {
   bits <1> Mu2;
@@ -3006,14 +2603,6 @@ class Enc_da664b : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_7b7ba8 : OpcodeHexagon {
-  bits <2> Qu4;
-  let Inst{9-8} = Qu4{1-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vd32;
-  let Inst{4-0} = Vd32{4-0};
-}
 class Enc_47ee5e : OpcodeHexagon {
   bits <2> Ii;
   let Inst{13-13} = Ii{1-1};
@@ -3116,14 +2705,6 @@ class Enc_8e583a : OpcodeHexagon {
   let Inst{25-23} = n1{3-1};
   let Inst{13-13} = n1{0-0};
 }
-class Enc_334c2b : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{12-8} = Vuu32{4-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_b886fd : OpcodeHexagon {
   bits <5> Ii;
   let Inst{6-3} = Ii{4-1};
@@ -3177,36 +2758,12 @@ class Enc_8dbdfe : OpcodeHexagon {
   bits <3> Nt8;
   let Inst{10-8} = Nt8{2-0};
 }
-class Enc_7dc746 : OpcodeHexagon {
-  bits <3> Quu8;
-  let Inst{10-8} = Quu8{2-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <3> Qdd8;
-  let Inst{5-3} = Qdd8{2-0};
-}
 class Enc_90cd8b : OpcodeHexagon {
   bits <5> Rss32;
   let Inst{20-16} = Rss32{4-0};
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_b8513b : OpcodeHexagon {
-  bits <5> Vuu32;
-  let Inst{20-16} = Vuu32{4-0};
-  bits <5> Vvv32;
-  let Inst{12-8} = Vvv32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
-class Enc_b3bac4 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
-  bits <5> Rtt32;
-  let Inst{20-16} = Rtt32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-}
 class Enc_bd0b33 : OpcodeHexagon {
   bits <10> Ii;
   let Inst{21-21} = Ii{9-9};
@@ -3216,16 +2773,6 @@ class Enc_bd0b33 : OpcodeHexagon {
   bits <2> Pd4;
   let Inst{1-0} = Pd4{1-0};
 }
-class Enc_843e80 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vd32;
-  let Inst{7-3} = Vd32{4-0};
-  bits <3> Qxx8;
-  let Inst{2-0} = Qxx8{2-0};
-}
 class Enc_8b8927 : OpcodeHexagon {
   bits <5> Rt32;
   let Inst{20-16} = Rt32{4-0};
@@ -3359,6 +2906,16 @@ class Enc_e07374 : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
+class Enc_e0820b : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <2> Qs4;
+  let Inst{6-5} = Qs4{1-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
 class Enc_323f2d : OpcodeHexagon {
   bits <6> II;
   let Inst{11-8} = II{5-2};
@@ -3381,16 +2938,6 @@ class Enc_1a9974 : OpcodeHexagon {
   bits <5> Rtt32;
   let Inst{4-0} = Rtt32{4-0};
 }
-class Enc_9ce456 : OpcodeHexagon {
-  bits <10> Ii;
-  let Inst{21-21} = Ii{9-9};
-  let Inst{13-8} = Ii{8-3};
-  let Inst{2-0} = Ii{2-0};
-  bits <5> Vss32;
-  let Inst{7-3} = Vss32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_5de85f : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -3416,14 +2963,6 @@ class Enc_0b51ce : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_b5e54d : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
-  bits <5> Rs32;
-  let Inst{20-16} = Rs32{4-0};
-  bits <5> Rdd32;
-  let Inst{4-0} = Rdd32{4-0};
-}
 class Enc_b4e6cf : OpcodeHexagon {
   bits <10> Ii;
   let Inst{21-21} = Ii{9-9};
@@ -3479,16 +3018,6 @@ class Enc_645d54 : OpcodeHexagon {
   bits <5> Rdd32;
   let Inst{4-0} = Rdd32{4-0};
 }
-class Enc_b5d5a7 : OpcodeHexagon {
-  bits <16> Ii;
-  let Inst{21-21} = Ii{15-15};
-  let Inst{13-8} = Ii{14-9};
-  let Inst{2-0} = Ii{8-6};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vs32;
-  let Inst{7-3} = Vs32{4-0};
-}
 class Enc_667b39 : OpcodeHexagon {
   bits <5> Css32;
   let Inst{20-16} = Css32{4-0};
@@ -3511,6 +3040,14 @@ class Enc_163a3c : OpcodeHexagon {
   bits <5> Rt32;
   let Inst{4-0} = Rt32{4-0};
 }
+class Enc_a75aa6 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+}
 class Enc_b087ac : OpcodeHexagon {
   bits <5> Vu32;
   let Inst{12-8} = Vu32{4-0};
@@ -3519,6 +3056,14 @@ class Enc_b087ac : OpcodeHexagon {
   bits <5> Vd32;
   let Inst{4-0} = Vd32{4-0};
 }
+class Enc_691712 : OpcodeHexagon {
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
 class Enc_b1e1fb : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -3546,16 +3091,6 @@ class Enc_b8c967 : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_f106e0 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{8-4} = Vv32{4-0};
-  bits <5> Vt32;
-  let Inst{13-9} = Vt32{4-0};
-  bits <4> Vdd16;
-  let Inst{3-0} = Vdd16{3-0};
-}
 class Enc_fb6577 : OpcodeHexagon {
   bits <2> Pu4;
   let Inst{9-8} = Pu4{1-0};
@@ -3564,20 +3099,6 @@ class Enc_fb6577 : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_37c406 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vv32;
-  let Inst{12-8} = Vv32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <4> Vdd16;
-  let Inst{7-4} = Vdd16{3-0};
-}
-class Enc_403871 : OpcodeHexagon {
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
 class Enc_2bae10 : OpcodeHexagon {
   bits <4> Ii;
   let Inst{10-8} = Ii{3-1};
@@ -3586,22 +3107,6 @@ class Enc_2bae10 : OpcodeHexagon {
   bits <4> Rd16;
   let Inst{3-0} = Rd16{3-0};
 }
-class Enc_f3adb6 : OpcodeHexagon {
-  bits <16> Ii;
-  let Inst{21-21} = Ii{15-15};
-  let Inst{13-8} = Ii{14-9};
-  let Inst{2-0} = Ii{8-6};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
-}
-class Enc_aac08c : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <5> Vx32;
-  let Inst{7-3} = Vx32{4-0};
-}
 class Enc_c4dc92 : OpcodeHexagon {
   bits <2> Qv4;
   let Inst{23-22} = Qv4{1-0};
@@ -3743,12 +3248,14 @@ class Enc_134437 : OpcodeHexagon {
   bits <2> Qd4;
   let Inst{1-0} = Qd4{1-0};
 }
-class Enc_33f8ba : OpcodeHexagon {
-  bits <8> Ii;
-  let Inst{12-8} = Ii{7-3};
-  let Inst{4-2} = Ii{2-0};
-  bits <5> Rx32;
-  let Inst{20-16} = Rx32{4-0};
+class Enc_f3f408 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
 }
 class Enc_97d666 : OpcodeHexagon {
   bits <4> Rs16;
@@ -3766,16 +3273,6 @@ class Enc_f82eaf : OpcodeHexagon {
   bits <5> Rd32;
   let Inst{4-0} = Rd32{4-0};
 }
-class Enc_57e245 : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-  bits <5> Vy32;
-  let Inst{12-8} = Vy32{4-0};
-}
 class Enc_69d63b : OpcodeHexagon {
   bits <11> Ii;
   let Inst{21-20} = Ii{10-9};
@@ -3842,24 +3339,6 @@ class Enc_7eaeb6 : OpcodeHexagon {
   bits <5> Rx32;
   let Inst{20-16} = Rx32{4-0};
 }
-class Enc_274a4c : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{20-16} = Vu32{4-0};
-  bits <3> Rt8;
-  let Inst{2-0} = Rt8{2-0};
-  bits <5> Vx32;
-  let Inst{7-3} = Vx32{4-0};
-  bits <5> Vy32;
-  let Inst{12-8} = Vy32{4-0};
-}
-class Enc_aceeef : OpcodeHexagon {
-  bits <5> Vu32;
-  let Inst{12-8} = Vu32{4-0};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_f55a0c : OpcodeHexagon {
   bits <6> Ii;
   let Inst{11-8} = Ii{5-2};
@@ -3898,16 +3377,6 @@ class Enc_7b523d : OpcodeHexagon {
   bits <5> Vxx32;
   let Inst{4-0} = Vxx32{4-0};
 }
-class Enc_c39a8b : OpcodeHexagon {
-  bits <16> Ii;
-  let Inst{21-21} = Ii{15-15};
-  let Inst{13-8} = Ii{14-9};
-  let Inst{2-0} = Ii{8-6};
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vss32;
-  let Inst{7-3} = Vss32{4-0};
-}
 class Enc_47ef61 : OpcodeHexagon {
   bits <3> Ii;
   let Inst{7-5} = Ii{2-0};
@@ -4006,6 +3475,14 @@ class Enc_a6ce9c : OpcodeHexagon {
   bits <4> Rs16;
   let Inst{7-4} = Rs16{3-0};
 }
+class Enc_3b7631 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vdddd32;
+  let Inst{4-0} = Vdddd32{4-0};
+  bits <3> Rx8;
+  let Inst{18-16} = Rx8{2-0};
+}
 class Enc_eca7c8 : OpcodeHexagon {
   bits <2> Ii;
   let Inst{13-13} = Ii{1-1};
@@ -4017,16 +3494,6 @@ class Enc_eca7c8 : OpcodeHexagon {
   bits <5> Rt32;
   let Inst{4-0} = Rt32{4-0};
 }
-class Enc_598f6c : OpcodeHexagon {
-  bits <5> Rtt32;
-  let Inst{12-8} = Rtt32{4-0};
-}
-class Enc_41dcc3 : OpcodeHexagon {
-  bits <5> Rt32;
-  let Inst{20-16} = Rt32{4-0};
-  bits <5> Vdd32;
-  let Inst{7-3} = Vdd32{4-0};
-}
 class Enc_4b39e4 : OpcodeHexagon {
   bits <3> Ii;
   let Inst{7-5} = Ii{2-0};
diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td
index 0b5efda933da3f27d9b589f9daa2dda60dd52543..3ef1c49eb7ee0f4be2c186e03d59b9f9cbae9a26 100644
--- a/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -1,4 +1,4 @@
-//===- HexagonDepInstrInfo.td ---------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,12 +9,11 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
 def A2_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = abs($Rs32)",
-tc_c2f7d806, TypeS_2op>, Enc_5e2823 {
+tc_cf8126ae, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10001100100;
 let hasNewValue = 1;
@@ -25,7 +24,7 @@ def A2_absp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = abs($Rss32)",
-tc_c2f7d806, TypeS_2op>, Enc_b9c5fb {
+tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000000100;
 let prefersSlot3 = 1;
@@ -34,7 +33,7 @@ def A2_abssat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = abs($Rs32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_5e2823 {
+tc_cf8126ae, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10001100100;
 let hasNewValue = 1;
@@ -46,7 +45,7 @@ def A2_add : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = add($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110011000;
@@ -62,7 +61,7 @@ def A2_addh_h16_hh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.h,$Rs32.h):<<16",
-tc_897d1a9d, TypeALU64>, Enc_bd6011 {
+tc_679309b8, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101010;
@@ -74,7 +73,7 @@ def A2_addh_h16_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.h,$Rs32.l):<<16",
-tc_897d1a9d, TypeALU64>, Enc_bd6011 {
+tc_679309b8, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101010;
@@ -86,7 +85,7 @@ def A2_addh_h16_lh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.l,$Rs32.h):<<16",
-tc_897d1a9d, TypeALU64>, Enc_bd6011 {
+tc_679309b8, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101010;
@@ -98,7 +97,7 @@ def A2_addh_h16_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.l,$Rs32.l):<<16",
-tc_897d1a9d, TypeALU64>, Enc_bd6011 {
+tc_679309b8, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101010;
@@ -110,7 +109,7 @@ def A2_addh_h16_sat_hh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.h,$Rs32.h):sat:<<16",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101010;
@@ -123,7 +122,7 @@ def A2_addh_h16_sat_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.h,$Rs32.l):sat:<<16",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101010;
@@ -136,7 +135,7 @@ def A2_addh_h16_sat_lh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.l,$Rs32.h):sat:<<16",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101010;
@@ -149,7 +148,7 @@ def A2_addh_h16_sat_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.l,$Rs32.l):sat:<<16",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101010;
@@ -162,7 +161,7 @@ def A2_addh_l16_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.l,$Rs32.h)",
-tc_1b9c9ee5, TypeALU64>, Enc_bd6011 {
+tc_4414d8b1, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101000;
@@ -174,7 +173,7 @@ def A2_addh_l16_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.l,$Rs32.l)",
-tc_1b9c9ee5, TypeALU64>, Enc_bd6011 {
+tc_4414d8b1, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101000;
@@ -186,7 +185,7 @@ def A2_addh_l16_sat_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.l,$Rs32.h):sat",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101000;
@@ -199,7 +198,7 @@ def A2_addh_l16_sat_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = add($Rt32.l,$Rs32.l):sat",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101000;
@@ -212,7 +211,7 @@ def A2_addi : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rd32 = add($Rs32,#$Ii)",
-tc_b9488031, TypeALU32_ADDI>, Enc_cb9321, PredNewRel, ImmRegRel {
+tc_5a2711e5, TypeALU32_ADDI>, Enc_cb9321, PredNewRel, ImmRegRel {
 let Inst{31-28} = 0b1011;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -231,7 +230,7 @@ def A2_addp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = add($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_946df596, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011000;
@@ -242,7 +241,7 @@ def A2_addpsat : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = add($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_a56825 {
+tc_779080bf, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011011;
@@ -254,7 +253,7 @@ def A2_addsat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = add($Rs32,$Rt32):sat",
-tc_5ba5997d, TypeALU32_3op>, Enc_5ab2be {
+tc_61830035, TypeALU32_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110110010;
@@ -269,14 +268,14 @@ def A2_addsp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "$Rdd32 = add($Rs32,$Rtt32)",
-tc_897d1a9d, TypeALU64> {
+tc_679309b8, TypeALU64> {
 let isPseudo = 1;
 }
 def A2_addsph : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = add($Rss32,$Rtt32):raw:hi",
-tc_897d1a9d, TypeALU64>, Enc_a56825 {
+tc_679309b8, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011011;
@@ -286,7 +285,7 @@ def A2_addspl : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = add($Rss32,$Rtt32):raw:lo",
-tc_897d1a9d, TypeALU64>, Enc_a56825 {
+tc_679309b8, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011011;
@@ -296,7 +295,7 @@ def A2_and : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = and($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110001000;
@@ -312,7 +311,7 @@ def A2_andir : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rd32 = and($Rs32,#$Ii)",
-tc_b9488031, TypeALU32_2op>, Enc_140c83, ImmRegRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_140c83, ImmRegRel {
 let Inst{31-22} = 0b0111011000;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -328,7 +327,7 @@ def A2_andp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = and($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_946df596, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011111;
@@ -338,7 +337,7 @@ def A2_aslh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = aslh($Rs32)",
-tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01110000000;
 let hasNewValue = 1;
@@ -350,7 +349,7 @@ def A2_asrh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = asrh($Rs32)",
-tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01110000001;
 let hasNewValue = 1;
@@ -362,7 +361,7 @@ def A2_combine_hh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = combine($Rt32.h,$Rs32.h)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011 {
+tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110011100;
@@ -374,7 +373,7 @@ def A2_combine_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = combine($Rt32.h,$Rs32.l)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011 {
+tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110011101;
@@ -386,7 +385,7 @@ def A2_combine_lh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = combine($Rt32.l,$Rs32.h)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011 {
+tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110011110;
@@ -398,7 +397,7 @@ def A2_combine_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = combine($Rt32.l,$Rs32.l)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011 {
+tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110011111;
@@ -410,7 +409,7 @@ def A2_combineii : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins s32_0Imm:$Ii, s8_0Imm:$II),
 "$Rdd32 = combine(#$Ii,#$II)",
-tc_b9488031, TypeALU32_2op>, Enc_18c338 {
+tc_5a2711e5, TypeALU32_2op>, Enc_18c338 {
 let Inst{31-23} = 0b011111000;
 let isReMaterializable = 1;
 let isAsCheapAsAMove = 1;
@@ -425,7 +424,7 @@ def A2_combinew : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = combine($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_be32a5, PredNewRel {
+tc_5a2711e5, TypeALU32_3op>, Enc_be32a5, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110101000;
@@ -437,7 +436,7 @@ def A2_max : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = max($Rs32,$Rt32)",
-tc_b44c6e2a, TypeALU64>, Enc_5ab2be {
+tc_779080bf, TypeALU64>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101110;
@@ -449,7 +448,7 @@ def A2_maxp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = max($Rss32,$Rtt32)",
-tc_b44c6e2a, TypeALU64>, Enc_a56825 {
+tc_779080bf, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011110;
@@ -459,7 +458,7 @@ def A2_maxu : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = maxu($Rs32,$Rt32)",
-tc_b44c6e2a, TypeALU64>, Enc_5ab2be {
+tc_779080bf, TypeALU64>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101110;
@@ -471,7 +470,7 @@ def A2_maxup : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = maxu($Rss32,$Rtt32)",
-tc_b44c6e2a, TypeALU64>, Enc_a56825 {
+tc_779080bf, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011110;
@@ -481,7 +480,7 @@ def A2_min : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = min($Rt32,$Rs32)",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101101;
@@ -493,7 +492,7 @@ def A2_minp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = min($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011101;
@@ -503,7 +502,7 @@ def A2_minu : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = minu($Rt32,$Rs32)",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101101;
@@ -515,7 +514,7 @@ def A2_minup : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = minu($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011101;
@@ -525,7 +524,7 @@ def A2_neg : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = neg($Rs32)",
-tc_68cb12ce, TypeALU32_2op> {
+tc_57890846, TypeALU32_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -535,7 +534,7 @@ def A2_negp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = neg($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_b9c5fb {
+tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10000000100;
 }
@@ -543,7 +542,7 @@ def A2_negsat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = neg($Rs32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_5e2823 {
+tc_cf8126ae, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10001100100;
 let hasNewValue = 1;
@@ -555,7 +554,7 @@ def A2_nop : HInst<
 (outs),
 (ins),
 "nop",
-tc_6efc556e, TypeALU32_2op>, Enc_e3b0c4 {
+tc_2eabeebe, TypeALU32_2op>, Enc_e3b0c4 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-16} = 0b0111111100000000;
 }
@@ -563,7 +562,7 @@ def A2_not : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = not($Rs32)",
-tc_68cb12ce, TypeALU32_2op> {
+tc_57890846, TypeALU32_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -573,7 +572,7 @@ def A2_notp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = not($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_b9c5fb {
+tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000000100;
 }
@@ -581,7 +580,7 @@ def A2_or : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = or($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110001001;
@@ -597,7 +596,7 @@ def A2_orir : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rd32 = or($Rs32,#$Ii)",
-tc_b9488031, TypeALU32_2op>, Enc_140c83, ImmRegRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_140c83, ImmRegRel {
 let Inst{31-22} = 0b0111011010;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -613,7 +612,7 @@ def A2_orp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = or($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_946df596, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011111;
@@ -623,7 +622,7 @@ def A2_paddf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4) $Rd32 = add($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111011000;
@@ -639,7 +638,7 @@ def A2_paddfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4.new) $Rd32 = add($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111011000;
@@ -656,7 +655,7 @@ def A2_paddif : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
 "if (!$Pu4) $Rd32 = add($Rs32,#$Ii)",
-tc_d6bf0472, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
 let Inst{13-13} = 0b0;
 let Inst{31-23} = 0b011101001;
 let isPredicated = 1;
@@ -676,7 +675,7 @@ def A2_paddifnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
 "if (!$Pu4.new) $Rd32 = add($Rs32,#$Ii)",
-tc_2b2f4060, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+tc_05c070ec, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
 let Inst{13-13} = 0b1;
 let Inst{31-23} = 0b011101001;
 let isPredicated = 1;
@@ -697,7 +696,7 @@ def A2_paddit : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
 "if ($Pu4) $Rd32 = add($Rs32,#$Ii)",
-tc_d6bf0472, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
 let Inst{13-13} = 0b0;
 let Inst{31-23} = 0b011101000;
 let isPredicated = 1;
@@ -716,7 +715,7 @@ def A2_padditnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
 "if ($Pu4.new) $Rd32 = add($Rs32,#$Ii)",
-tc_2b2f4060, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+tc_05c070ec, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
 let Inst{13-13} = 0b1;
 let Inst{31-23} = 0b011101000;
 let isPredicated = 1;
@@ -736,7 +735,7 @@ def A2_paddt : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4) $Rd32 = add($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111011000;
@@ -751,7 +750,7 @@ def A2_paddtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4.new) $Rd32 = add($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111011000;
@@ -767,7 +766,7 @@ def A2_pandf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4) $Rd32 = and($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111001000;
@@ -781,7 +780,7 @@ def A2_pandfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4.new) $Rd32 = and($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111001000;
@@ -796,7 +795,7 @@ def A2_pandt : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4) $Rd32 = and($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111001000;
@@ -809,7 +808,7 @@ def A2_pandtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4.new) $Rd32 = and($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111001000;
@@ -823,7 +822,7 @@ def A2_porf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4) $Rd32 = or($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111001001;
@@ -837,7 +836,7 @@ def A2_porfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4.new) $Rd32 = or($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111001001;
@@ -852,7 +851,7 @@ def A2_port : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4) $Rd32 = or($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111001001;
@@ -865,7 +864,7 @@ def A2_portnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4.new) $Rd32 = or($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111001001;
@@ -879,7 +878,7 @@ def A2_psubf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
 "if (!$Pu4) $Rd32 = sub($Rt32,$Rs32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111011001;
@@ -893,7 +892,7 @@ def A2_psubfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
 "if (!$Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111011001;
@@ -908,7 +907,7 @@ def A2_psubt : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
 "if ($Pu4) $Rd32 = sub($Rt32,$Rs32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111011001;
@@ -921,7 +920,7 @@ def A2_psubtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
 "if ($Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111011001;
@@ -935,7 +934,7 @@ def A2_pxorf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4) $Rd32 = xor($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111001011;
@@ -949,7 +948,7 @@ def A2_pxorfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111001011;
@@ -964,7 +963,7 @@ def A2_pxort : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4) $Rd32 = xor($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111001011;
@@ -977,7 +976,7 @@ def A2_pxortnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111001011;
@@ -991,7 +990,7 @@ def A2_roundsat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = round($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_90cd8b {
+tc_cf8126ae, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000110;
 let hasNewValue = 1;
@@ -1003,7 +1002,7 @@ def A2_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = sat($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_90cd8b {
+tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001000110;
 let hasNewValue = 1;
@@ -1014,7 +1013,7 @@ def A2_satb : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = satb($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_5e2823 {
+tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10001100110;
 let hasNewValue = 1;
@@ -1025,7 +1024,7 @@ def A2_sath : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = sath($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_5e2823 {
+tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10001100110;
 let hasNewValue = 1;
@@ -1036,7 +1035,7 @@ def A2_satub : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = satub($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_5e2823 {
+tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10001100110;
 let hasNewValue = 1;
@@ -1047,7 +1046,7 @@ def A2_satuh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = satuh($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_5e2823 {
+tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10001100110;
 let hasNewValue = 1;
@@ -1058,7 +1057,7 @@ def A2_sub : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32,$Rs32)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011, PredNewRel, ImmRegRel {
+tc_5a2711e5, TypeALU32_3op>, Enc_bd6011, PredNewRel, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110011001;
@@ -1073,7 +1072,7 @@ def A2_subh_h16_hh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.h,$Rs32.h):<<16",
-tc_897d1a9d, TypeALU64>, Enc_bd6011 {
+tc_679309b8, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101011;
@@ -1085,7 +1084,7 @@ def A2_subh_h16_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.h,$Rs32.l):<<16",
-tc_897d1a9d, TypeALU64>, Enc_bd6011 {
+tc_679309b8, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101011;
@@ -1097,7 +1096,7 @@ def A2_subh_h16_lh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.l,$Rs32.h):<<16",
-tc_897d1a9d, TypeALU64>, Enc_bd6011 {
+tc_679309b8, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101011;
@@ -1109,7 +1108,7 @@ def A2_subh_h16_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.l,$Rs32.l):<<16",
-tc_897d1a9d, TypeALU64>, Enc_bd6011 {
+tc_679309b8, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101011;
@@ -1121,7 +1120,7 @@ def A2_subh_h16_sat_hh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.h,$Rs32.h):sat:<<16",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101011;
@@ -1134,7 +1133,7 @@ def A2_subh_h16_sat_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.h,$Rs32.l):sat:<<16",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101011;
@@ -1147,7 +1146,7 @@ def A2_subh_h16_sat_lh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.l,$Rs32.h):sat:<<16",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101011;
@@ -1160,7 +1159,7 @@ def A2_subh_h16_sat_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.l,$Rs32.l):sat:<<16",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101011;
@@ -1173,7 +1172,7 @@ def A2_subh_l16_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.l,$Rs32.h)",
-tc_1b9c9ee5, TypeALU64>, Enc_bd6011 {
+tc_4414d8b1, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101001;
@@ -1185,7 +1184,7 @@ def A2_subh_l16_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.l,$Rs32.l)",
-tc_1b9c9ee5, TypeALU64>, Enc_bd6011 {
+tc_4414d8b1, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101001;
@@ -1197,7 +1196,7 @@ def A2_subh_l16_sat_hl : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.l,$Rs32.h):sat",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101001;
@@ -1210,7 +1209,7 @@ def A2_subh_l16_sat_ll : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32.l,$Rs32.l):sat",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101001;
@@ -1223,7 +1222,7 @@ def A2_subp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = sub($Rtt32,$Rss32)",
-tc_540fdfbc, TypeALU64>, Enc_ea23e4 {
+tc_946df596, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011001;
@@ -1232,7 +1231,7 @@ def A2_subri : HInst<
 (outs IntRegs:$Rd32),
 (ins s32_0Imm:$Ii, IntRegs:$Rs32),
 "$Rd32 = sub(#$Ii,$Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_140c83, PredNewRel, ImmRegRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_140c83, PredNewRel, ImmRegRel {
 let Inst{31-22} = 0b0111011001;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -1248,7 +1247,7 @@ def A2_subsat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32,$Rs32):sat",
-tc_5ba5997d, TypeALU32_3op>, Enc_bd6011 {
+tc_61830035, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110110110;
@@ -1262,7 +1261,7 @@ def A2_svaddh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = vaddh($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_5ab2be {
+tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110110000;
@@ -1275,7 +1274,7 @@ def A2_svaddhs : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = vaddh($Rs32,$Rt32):sat",
-tc_5ba5997d, TypeALU32_3op>, Enc_5ab2be {
+tc_61830035, TypeALU32_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110110001;
@@ -1290,7 +1289,7 @@ def A2_svadduhs : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = vadduh($Rs32,$Rt32):sat",
-tc_5ba5997d, TypeALU32_3op>, Enc_5ab2be {
+tc_61830035, TypeALU32_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110110011;
@@ -1305,12 +1304,13 @@ def A2_svavgh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = vavgh($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_5ab2be {
+tc_1c80410a, TypeALU32_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110111000;
 let hasNewValue = 1;
 let opNewValue = 0;
+let prefersSlot3 = 1;
 let InputType = "reg";
 let isCommutable = 1;
 }
@@ -1318,12 +1318,13 @@ def A2_svavghs : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = vavgh($Rs32,$Rt32):rnd",
-tc_8fe6b782, TypeALU32_3op>, Enc_5ab2be {
+tc_d08ee0f4, TypeALU32_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110111001;
 let hasNewValue = 1;
 let opNewValue = 0;
+let prefersSlot3 = 1;
 let InputType = "reg";
 let isCommutable = 1;
 }
@@ -1331,19 +1332,20 @@ def A2_svnavgh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = vnavgh($Rt32,$Rs32)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011 {
+tc_1c80410a, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110111011;
 let hasNewValue = 1;
 let opNewValue = 0;
+let prefersSlot3 = 1;
 let InputType = "reg";
 }
 def A2_svsubh : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = vsubh($Rt32,$Rs32)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011 {
+tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110110100;
@@ -1355,7 +1357,7 @@ def A2_svsubhs : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = vsubh($Rt32,$Rs32):sat",
-tc_5ba5997d, TypeALU32_3op>, Enc_bd6011 {
+tc_61830035, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110110101;
@@ -1369,7 +1371,7 @@ def A2_svsubuhs : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = vsubuh($Rt32,$Rs32):sat",
-tc_5ba5997d, TypeALU32_3op>, Enc_bd6011 {
+tc_61830035, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110110111;
@@ -1383,7 +1385,7 @@ def A2_swiz : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = swiz($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_5e2823 {
+tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10001100100;
 let hasNewValue = 1;
@@ -1393,7 +1395,7 @@ def A2_sxtb : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = sxtb($Rs32)",
-tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01110000101;
 let hasNewValue = 1;
@@ -1405,7 +1407,7 @@ def A2_sxth : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = sxth($Rs32)",
-tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01110000111;
 let hasNewValue = 1;
@@ -1417,7 +1419,7 @@ def A2_sxtw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = sxtw($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_3a3d62 {
+tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10000100010;
 }
@@ -1425,7 +1427,7 @@ def A2_tfr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = $Rs32",
-tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01110000011;
 let hasNewValue = 1;
@@ -1438,7 +1440,7 @@ def A2_tfrcrr : HInst<
 (outs IntRegs:$Rd32),
 (ins CtrRegs:$Cs32),
 "$Rd32 = $Cs32",
-tc_29175780, TypeCR>, Enc_0cb018 {
+tc_b9272d6c, TypeCR>, Enc_0cb018 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01101010000;
 let hasNewValue = 1;
@@ -1448,7 +1450,7 @@ def A2_tfrf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) $Rd32 = $Rs32",
-tc_d6bf0472, TypeALU32_2op>, PredNewRel, ImmRegRel {
+tc_4c5ba658, TypeALU32_2op>, PredNewRel, ImmRegRel {
 let isPredicated = 1;
 let isPredicatedFalse = 1;
 let hasNewValue = 1;
@@ -1463,7 +1465,7 @@ def A2_tfrfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) $Rd32 = $Rs32",
-tc_2b2f4060, TypeALU32_2op>, PredNewRel, ImmRegRel {
+tc_05c070ec, TypeALU32_2op>, PredNewRel, ImmRegRel {
 let isPredicated = 1;
 let isPredicatedFalse = 1;
 let hasNewValue = 1;
@@ -1479,7 +1481,7 @@ def A2_tfrih : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, u16_0Imm:$Ii),
 "$Rx32.h = #$Ii",
-tc_b9488031, TypeALU32_2op>, Enc_51436c {
+tc_5a2711e5, TypeALU32_2op>, Enc_51436c {
 let Inst{21-21} = 0b1;
 let Inst{31-24} = 0b01110010;
 let hasNewValue = 1;
@@ -1490,7 +1492,7 @@ def A2_tfril : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, u16_0Imm:$Ii),
 "$Rx32.l = #$Ii",
-tc_b9488031, TypeALU32_2op>, Enc_51436c {
+tc_5a2711e5, TypeALU32_2op>, Enc_51436c {
 let Inst{21-21} = 0b1;
 let Inst{31-24} = 0b01110001;
 let hasNewValue = 1;
@@ -1501,7 +1503,7 @@ def A2_tfrp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = $Rss32",
-tc_b9488031, TypeALU32_2op>, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, PredNewRel {
 let BaseOpcode = "A2_tfrp";
 let isPredicable = 1;
 let isPseudo = 1;
@@ -1510,7 +1512,7 @@ def A2_tfrpf : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, DoubleRegs:$Rss32),
 "if (!$Pu4) $Rdd32 = $Rss32",
-tc_b9488031, TypeALU32_2op>, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, PredNewRel {
 let isPredicated = 1;
 let isPredicatedFalse = 1;
 let BaseOpcode = "A2_tfrp";
@@ -1520,7 +1522,7 @@ def A2_tfrpfnew : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, DoubleRegs:$Rss32),
 "if (!$Pu4.new) $Rdd32 = $Rss32",
-tc_5f6847a1, TypeALU32_2op>, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, PredNewRel {
 let isPredicated = 1;
 let isPredicatedFalse = 1;
 let isPredicatedNew = 1;
@@ -1531,7 +1533,7 @@ def A2_tfrpi : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins s8_0Imm:$Ii),
 "$Rdd32 = #$Ii",
-tc_b9488031, TypeALU64> {
+tc_5a2711e5, TypeALU64> {
 let isReMaterializable = 1;
 let isAsCheapAsAMove = 1;
 let isMoveImm = 1;
@@ -1541,7 +1543,7 @@ def A2_tfrpt : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, DoubleRegs:$Rss32),
 "if ($Pu4) $Rdd32 = $Rss32",
-tc_b9488031, TypeALU32_2op>, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, PredNewRel {
 let isPredicated = 1;
 let BaseOpcode = "A2_tfrp";
 let isPseudo = 1;
@@ -1550,7 +1552,7 @@ def A2_tfrptnew : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, DoubleRegs:$Rss32),
 "if ($Pu4.new) $Rdd32 = $Rss32",
-tc_5f6847a1, TypeALU32_2op>, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, PredNewRel {
 let isPredicated = 1;
 let isPredicatedNew = 1;
 let BaseOpcode = "A2_tfrp";
@@ -1560,7 +1562,7 @@ def A2_tfrrcr : HInst<
 (outs CtrRegs:$Cd32),
 (ins IntRegs:$Rs32),
 "$Cd32 = $Rs32",
-tc_a21dc435, TypeCR>, Enc_bd811a {
+tc_434c8e1e, TypeCR>, Enc_bd811a {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01100010001;
 let hasNewValue = 1;
@@ -1570,7 +1572,7 @@ def A2_tfrsi : HInst<
 (outs IntRegs:$Rd32),
 (ins s32_0Imm:$Ii),
 "$Rd32 = #$Ii",
-tc_68cb12ce, TypeALU32_2op>, Enc_5e87ce, PredNewRel, ImmRegRel {
+tc_57890846, TypeALU32_2op>, Enc_5e87ce, PredNewRel, ImmRegRel {
 let Inst{21-21} = 0b0;
 let Inst{31-24} = 0b01111000;
 let hasNewValue = 1;
@@ -1592,7 +1594,7 @@ def A2_tfrt : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) $Rd32 = $Rs32",
-tc_d6bf0472, TypeALU32_2op>, PredNewRel, ImmRegRel {
+tc_4c5ba658, TypeALU32_2op>, PredNewRel, ImmRegRel {
 let isPredicated = 1;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -1606,7 +1608,7 @@ def A2_tfrtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) $Rd32 = $Rs32",
-tc_2b2f4060, TypeALU32_2op>, PredNewRel, ImmRegRel {
+tc_05c070ec, TypeALU32_2op>, PredNewRel, ImmRegRel {
 let isPredicated = 1;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -1621,7 +1623,7 @@ def A2_vabsh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vabsh($Rss32)",
-tc_c2f7d806, TypeS_2op>, Enc_b9c5fb {
+tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000000010;
 let prefersSlot3 = 1;
@@ -1630,7 +1632,7 @@ def A2_vabshsat : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vabsh($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_b9c5fb {
+tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10000000010;
 let prefersSlot3 = 1;
@@ -1640,7 +1642,7 @@ def A2_vabsw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vabsw($Rss32)",
-tc_c2f7d806, TypeS_2op>, Enc_b9c5fb {
+tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000000010;
 let prefersSlot3 = 1;
@@ -1649,7 +1651,7 @@ def A2_vabswsat : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vabsw($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_b9c5fb {
+tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10000000010;
 let prefersSlot3 = 1;
@@ -1659,7 +1661,7 @@ def A2_vaddb_map : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vaddb($Rss32,$Rtt32)",
-tc_540fdfbc, TypeMAPPING> {
+tc_946df596, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -1667,7 +1669,7 @@ def A2_vaddh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vaddh($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_946df596, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011000;
@@ -1676,7 +1678,7 @@ def A2_vaddhs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vaddh($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_a56825 {
+tc_779080bf, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011000;
@@ -1687,7 +1689,7 @@ def A2_vaddub : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vaddub($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_946df596, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011000;
@@ -1696,7 +1698,7 @@ def A2_vaddubs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vaddub($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_a56825 {
+tc_779080bf, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011000;
@@ -1707,7 +1709,7 @@ def A2_vadduhs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vadduh($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_a56825 {
+tc_779080bf, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011000;
@@ -1718,7 +1720,7 @@ def A2_vaddw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vaddw($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_946df596, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011000;
@@ -1727,7 +1729,7 @@ def A2_vaddws : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vaddw($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_a56825 {
+tc_779080bf, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011000;
@@ -1738,16 +1740,17 @@ def A2_vavgh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavgh($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_6132ba3d, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
 }
 def A2_vavghcr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavgh($Rss32,$Rtt32):crnd",
-tc_2b6f77c6, TypeALU64>, Enc_a56825 {
+tc_002cb246, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011010;
@@ -1757,79 +1760,87 @@ def A2_vavghr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavgh($Rss32,$Rtt32):rnd",
-tc_dbdffe3d, TypeALU64>, Enc_a56825 {
+tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
 }
 def A2_vavgub : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavgub($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_6132ba3d, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
 }
 def A2_vavgubr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavgub($Rss32,$Rtt32):rnd",
-tc_dbdffe3d, TypeALU64>, Enc_a56825 {
+tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
 }
 def A2_vavguh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavguh($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_6132ba3d, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
 }
 def A2_vavguhr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavguh($Rss32,$Rtt32):rnd",
-tc_dbdffe3d, TypeALU64>, Enc_a56825 {
+tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
 }
 def A2_vavguw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavguw($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_6132ba3d, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
 }
 def A2_vavguwr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavguw($Rss32,$Rtt32):rnd",
-tc_dbdffe3d, TypeALU64>, Enc_a56825 {
+tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
 }
 def A2_vavgw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavgw($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_6132ba3d, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
 }
 def A2_vavgwcr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavgw($Rss32,$Rtt32):crnd",
-tc_2b6f77c6, TypeALU64>, Enc_a56825 {
+tc_002cb246, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011011;
@@ -1839,16 +1850,17 @@ def A2_vavgwr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vavgw($Rss32,$Rtt32):rnd",
-tc_dbdffe3d, TypeALU64>, Enc_a56825 {
+tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
 }
 def A2_vcmpbeq : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmpb.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b110000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010000;
@@ -1857,7 +1869,7 @@ def A2_vcmpbgtu : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmpb.gtu($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b111000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010000;
@@ -1866,7 +1878,7 @@ def A2_vcmpheq : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmph.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010000;
@@ -1875,7 +1887,7 @@ def A2_vcmphgt : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmph.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010000;
@@ -1884,7 +1896,7 @@ def A2_vcmphgtu : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmph.gtu($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b101000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010000;
@@ -1893,7 +1905,7 @@ def A2_vcmpweq : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmpw.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010000;
@@ -1902,7 +1914,7 @@ def A2_vcmpwgt : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmpw.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010000;
@@ -1911,7 +1923,7 @@ def A2_vcmpwgtu : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmpw.gtu($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b010000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010000;
@@ -1920,7 +1932,7 @@ def A2_vconj : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vconj($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_b9c5fb {
+tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10000000100;
 let prefersSlot3 = 1;
@@ -1930,7 +1942,7 @@ def A2_vmaxb : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vmaxb($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011110;
@@ -1940,7 +1952,7 @@ def A2_vmaxh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vmaxh($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011110;
@@ -1950,7 +1962,7 @@ def A2_vmaxub : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vmaxub($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011110;
@@ -1960,7 +1972,7 @@ def A2_vmaxuh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vmaxuh($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011110;
@@ -1970,7 +1982,7 @@ def A2_vmaxuw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vmaxuw($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011101;
@@ -1980,7 +1992,7 @@ def A2_vmaxw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vmaxw($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011110;
@@ -1990,7 +2002,7 @@ def A2_vminb : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vminb($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011110;
@@ -2000,7 +2012,7 @@ def A2_vminh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vminh($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011101;
@@ -2010,7 +2022,7 @@ def A2_vminub : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vminub($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011101;
@@ -2020,7 +2032,7 @@ def A2_vminuh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vminuh($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011101;
@@ -2030,7 +2042,7 @@ def A2_vminuw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vminuw($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011101;
@@ -2040,7 +2052,7 @@ def A2_vminw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vminw($Rtt32,$Rss32)",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011101;
@@ -2050,16 +2062,17 @@ def A2_vnavgh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vnavgh($Rtt32,$Rss32)",
-tc_540fdfbc, TypeALU64>, Enc_ea23e4 {
+tc_6132ba3d, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
 }
 def A2_vnavghcr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vnavgh($Rtt32,$Rss32):crnd:sat",
-tc_2b6f77c6, TypeALU64>, Enc_ea23e4 {
+tc_002cb246, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011100;
@@ -2070,7 +2083,7 @@ def A2_vnavghr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vnavgh($Rtt32,$Rss32):rnd:sat",
-tc_2b6f77c6, TypeALU64>, Enc_ea23e4 {
+tc_002cb246, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011100;
@@ -2081,16 +2094,17 @@ def A2_vnavgw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vnavgw($Rtt32,$Rss32)",
-tc_540fdfbc, TypeALU64>, Enc_ea23e4 {
+tc_6132ba3d, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
 }
 def A2_vnavgwcr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vnavgw($Rtt32,$Rss32):crnd:sat",
-tc_2b6f77c6, TypeALU64>, Enc_ea23e4 {
+tc_002cb246, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011100;
@@ -2101,7 +2115,7 @@ def A2_vnavgwr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vnavgw($Rtt32,$Rss32):rnd:sat",
-tc_2b6f77c6, TypeALU64>, Enc_ea23e4 {
+tc_002cb246, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011100;
@@ -2112,7 +2126,7 @@ def A2_vraddub : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vraddub($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000010;
@@ -2122,7 +2136,7 @@ def A2_vraddub_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vraddub($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010010;
@@ -2133,7 +2147,7 @@ def A2_vrsadub : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrsadub($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000010;
@@ -2143,7 +2157,7 @@ def A2_vrsadub_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrsadub($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010010;
@@ -2154,7 +2168,7 @@ def A2_vsubb_map : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vsubb($Rss32,$Rtt32)",
-tc_540fdfbc, TypeMAPPING> {
+tc_946df596, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -2162,7 +2176,7 @@ def A2_vsubh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vsubh($Rtt32,$Rss32)",
-tc_540fdfbc, TypeALU64>, Enc_ea23e4 {
+tc_946df596, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011001;
@@ -2171,7 +2185,7 @@ def A2_vsubhs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vsubh($Rtt32,$Rss32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011001;
@@ -2182,7 +2196,7 @@ def A2_vsubub : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vsubub($Rtt32,$Rss32)",
-tc_540fdfbc, TypeALU64>, Enc_ea23e4 {
+tc_946df596, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011001;
@@ -2191,7 +2205,7 @@ def A2_vsububs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vsubub($Rtt32,$Rss32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011001;
@@ -2202,7 +2216,7 @@ def A2_vsubuhs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vsubuh($Rtt32,$Rss32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011001;
@@ -2213,7 +2227,7 @@ def A2_vsubw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vsubw($Rtt32,$Rss32)",
-tc_540fdfbc, TypeALU64>, Enc_ea23e4 {
+tc_946df596, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011001;
@@ -2222,7 +2236,7 @@ def A2_vsubws : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vsubw($Rtt32,$Rss32):sat",
-tc_b44c6e2a, TypeALU64>, Enc_ea23e4 {
+tc_779080bf, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011001;
@@ -2233,7 +2247,7 @@ def A2_xor : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = xor($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_5ab2be, PredNewRel {
+tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110001011;
@@ -2248,7 +2262,7 @@ def A2_xorp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = xor($Rss32,$Rtt32)",
-tc_540fdfbc, TypeALU64>, Enc_a56825 {
+tc_946df596, TypeALU64>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011111;
@@ -2258,7 +2272,7 @@ def A2_zxtb : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = zxtb($Rs32)",
-tc_b9488031, TypeALU32_2op>, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, PredNewRel {
 let hasNewValue = 1;
 let opNewValue = 0;
 let BaseOpcode = "A2_zxtb";
@@ -2270,7 +2284,7 @@ def A2_zxth : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = zxth($Rs32)",
-tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01110000110;
 let hasNewValue = 1;
@@ -2282,7 +2296,7 @@ def A4_addp_c : HInst<
 (outs DoubleRegs:$Rdd32, PredRegs:$Px4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
 "$Rdd32 = add($Rss32,$Rtt32,$Px4):carry",
-tc_523fcf30, TypeS_3op>, Enc_2b3f60 {
+tc_9c3ecd83, TypeS_3op>, Enc_2b3f60 {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000010110;
@@ -2293,7 +2307,7 @@ def A4_andn : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = and($Rt32,~$Rs32)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011 {
+tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110001100;
@@ -2305,7 +2319,7 @@ def A4_andnp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = and($Rtt32,~$Rss32)",
-tc_540fdfbc, TypeALU64>, Enc_ea23e4 {
+tc_946df596, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011111;
@@ -2314,7 +2328,7 @@ def A4_bitsplit : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = bitsplit($Rs32,$Rt32)",
-tc_1b9c9ee5, TypeALU64>, Enc_be32a5 {
+tc_4414d8b1, TypeALU64>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010100001;
@@ -2324,7 +2338,7 @@ def A4_bitspliti : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rdd32 = bitsplit($Rs32,#$Ii)",
-tc_1b9c9ee5, TypeS_2op>, Enc_311abd {
+tc_4414d8b1, TypeS_2op>, Enc_311abd {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001000110;
@@ -2334,14 +2348,14 @@ def A4_boundscheck : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "$Pd4 = boundscheck($Rs32,$Rtt32)",
-tc_1e856f58, TypeALU64> {
+tc_85d5d03f, TypeALU64> {
 let isPseudo = 1;
 }
 def A4_boundscheck_hi : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = boundscheck($Rss32,$Rtt32):raw:hi",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b101000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11010010000;
@@ -2350,7 +2364,7 @@ def A4_boundscheck_lo : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = boundscheck($Rss32,$Rtt32):raw:lo",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11010010000;
@@ -2359,7 +2373,7 @@ def A4_cmpbeq : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmpb.eq($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b110000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111110;
@@ -2372,7 +2386,7 @@ def A4_cmpbeqi : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u8_0Imm:$Ii),
 "$Pd4 = cmpb.eq($Rs32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_08d755, ImmRegRel {
+tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel {
 let Inst{4-2} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011101000;
@@ -2385,7 +2399,7 @@ def A4_cmpbgt : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmpb.gt($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b010000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111110;
@@ -2397,7 +2411,7 @@ def A4_cmpbgti : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, s8_0Imm:$Ii),
 "$Pd4 = cmpb.gt($Rs32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_08d755, ImmRegRel {
+tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel {
 let Inst{4-2} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011101001;
@@ -2409,7 +2423,7 @@ def A4_cmpbgtu : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmpb.gtu($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b111000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111110;
@@ -2421,7 +2435,7 @@ def A4_cmpbgtui : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii),
 "$Pd4 = cmpb.gtu($Rs32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_02553a, ImmRegRel {
+tc_643b4717, TypeALU64>, Enc_02553a, ImmRegRel {
 let Inst{4-2} = 0b000;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b11011101010;
@@ -2438,7 +2452,7 @@ def A4_cmpheq : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmph.eq($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111110;
@@ -2451,7 +2465,7 @@ def A4_cmpheqi : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Pd4 = cmph.eq($Rs32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_08d755, ImmRegRel {
+tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel {
 let Inst{4-2} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011101000;
@@ -2469,7 +2483,7 @@ def A4_cmphgt : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmph.gt($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111110;
@@ -2481,7 +2495,7 @@ def A4_cmphgti : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Pd4 = cmph.gt($Rs32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_08d755, ImmRegRel {
+tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel {
 let Inst{4-2} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011101001;
@@ -2498,7 +2512,7 @@ def A4_cmphgtu : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmph.gtu($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b101000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111110;
@@ -2510,7 +2524,7 @@ def A4_cmphgtui : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii),
 "$Pd4 = cmph.gtu($Rs32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_02553a, ImmRegRel {
+tc_643b4717, TypeALU64>, Enc_02553a, ImmRegRel {
 let Inst{4-2} = 0b010;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b11011101010;
@@ -2527,7 +2541,7 @@ def A4_combineii : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins s8_0Imm:$Ii, u32_0Imm:$II),
 "$Rdd32 = combine(#$Ii,#$II)",
-tc_b9488031, TypeALU32_2op>, Enc_f0cca7 {
+tc_5a2711e5, TypeALU32_2op>, Enc_f0cca7 {
 let Inst{31-21} = 0b01111100100;
 let isExtendable = 1;
 let opExtendable = 2;
@@ -2539,7 +2553,7 @@ def A4_combineir : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins s32_0Imm:$Ii, IntRegs:$Rs32),
 "$Rdd32 = combine(#$Ii,$Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_9cdba7 {
+tc_5a2711e5, TypeALU32_2op>, Enc_9cdba7 {
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b01110011001;
 let isExtendable = 1;
@@ -2552,7 +2566,7 @@ def A4_combineri : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rdd32 = combine($Rs32,#$Ii)",
-tc_b9488031, TypeALU32_2op>, Enc_9cdba7 {
+tc_5a2711e5, TypeALU32_2op>, Enc_9cdba7 {
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b01110011000;
 let isExtendable = 1;
@@ -2565,7 +2579,7 @@ def A4_cround_ri : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = cround($Rs32,#$Ii)",
-tc_2b6f77c6, TypeS_2op>, Enc_a05677 {
+tc_002cb246, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100111;
@@ -2577,7 +2591,7 @@ def A4_cround_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = cround($Rs32,$Rt32)",
-tc_2b6f77c6, TypeS_3op>, Enc_5ab2be {
+tc_002cb246, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110110;
@@ -2589,14 +2603,14 @@ def A4_ext : HInst<
 (outs),
 (ins u26_6Imm:$Ii),
 "immext(#$Ii)",
-tc_452f85af, TypeEXTENDER>, Enc_2b518f {
+tc_862b3e70, TypeEXTENDER>, Enc_2b518f {
 let Inst{31-28} = 0b0000;
 }
 def A4_modwrapu : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = modwrap($Rs32,$Rt32)",
-tc_b44c6e2a, TypeALU64>, Enc_5ab2be {
+tc_779080bf, TypeALU64>, Enc_5ab2be {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011111;
@@ -2608,7 +2622,7 @@ def A4_orn : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = or($Rt32,~$Rs32)",
-tc_b9488031, TypeALU32_3op>, Enc_bd6011 {
+tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110001101;
@@ -2620,7 +2634,7 @@ def A4_ornp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = or($Rtt32,~$Rss32)",
-tc_540fdfbc, TypeALU64>, Enc_ea23e4 {
+tc_946df596, TypeALU64>, Enc_ea23e4 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010011111;
@@ -2629,7 +2643,7 @@ def A4_paslhf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) $Rd32 = aslh($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1010;
 let Inst{31-21} = 0b01110000000;
@@ -2643,7 +2657,7 @@ def A4_paslhfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) $Rd32 = aslh($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1011;
 let Inst{31-21} = 0b01110000000;
@@ -2658,7 +2672,7 @@ def A4_paslht : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) $Rd32 = aslh($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1000;
 let Inst{31-21} = 0b01110000000;
@@ -2671,7 +2685,7 @@ def A4_paslhtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) $Rd32 = aslh($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1001;
 let Inst{31-21} = 0b01110000000;
@@ -2685,7 +2699,7 @@ def A4_pasrhf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) $Rd32 = asrh($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1010;
 let Inst{31-21} = 0b01110000001;
@@ -2699,7 +2713,7 @@ def A4_pasrhfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) $Rd32 = asrh($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1011;
 let Inst{31-21} = 0b01110000001;
@@ -2714,7 +2728,7 @@ def A4_pasrht : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) $Rd32 = asrh($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1000;
 let Inst{31-21} = 0b01110000001;
@@ -2727,7 +2741,7 @@ def A4_pasrhtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) $Rd32 = asrh($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1001;
 let Inst{31-21} = 0b01110000001;
@@ -2741,7 +2755,7 @@ def A4_psxtbf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) $Rd32 = sxtb($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1010;
 let Inst{31-21} = 0b01110000101;
@@ -2755,7 +2769,7 @@ def A4_psxtbfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) $Rd32 = sxtb($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1011;
 let Inst{31-21} = 0b01110000101;
@@ -2770,7 +2784,7 @@ def A4_psxtbt : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) $Rd32 = sxtb($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1000;
 let Inst{31-21} = 0b01110000101;
@@ -2783,7 +2797,7 @@ def A4_psxtbtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) $Rd32 = sxtb($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1001;
 let Inst{31-21} = 0b01110000101;
@@ -2797,7 +2811,7 @@ def A4_psxthf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) $Rd32 = sxth($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1010;
 let Inst{31-21} = 0b01110000111;
@@ -2811,7 +2825,7 @@ def A4_psxthfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) $Rd32 = sxth($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1011;
 let Inst{31-21} = 0b01110000111;
@@ -2826,7 +2840,7 @@ def A4_psxtht : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) $Rd32 = sxth($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1000;
 let Inst{31-21} = 0b01110000111;
@@ -2839,7 +2853,7 @@ def A4_psxthtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) $Rd32 = sxth($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1001;
 let Inst{31-21} = 0b01110000111;
@@ -2853,7 +2867,7 @@ def A4_pzxtbf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) $Rd32 = zxtb($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1010;
 let Inst{31-21} = 0b01110000100;
@@ -2867,7 +2881,7 @@ def A4_pzxtbfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) $Rd32 = zxtb($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1011;
 let Inst{31-21} = 0b01110000100;
@@ -2882,7 +2896,7 @@ def A4_pzxtbt : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) $Rd32 = zxtb($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1000;
 let Inst{31-21} = 0b01110000100;
@@ -2895,7 +2909,7 @@ def A4_pzxtbtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) $Rd32 = zxtb($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1001;
 let Inst{31-21} = 0b01110000100;
@@ -2909,7 +2923,7 @@ def A4_pzxthf : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) $Rd32 = zxth($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1010;
 let Inst{31-21} = 0b01110000110;
@@ -2923,7 +2937,7 @@ def A4_pzxthfnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) $Rd32 = zxth($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1011;
 let Inst{31-21} = 0b01110000110;
@@ -2938,7 +2952,7 @@ def A4_pzxtht : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) $Rd32 = zxth($Rs32)",
-tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1000;
 let Inst{31-21} = 0b01110000110;
@@ -2951,7 +2965,7 @@ def A4_pzxthtnew : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) $Rd32 = zxth($Rs32)",
-tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1001;
 let Inst{31-21} = 0b01110000110;
@@ -2965,7 +2979,7 @@ def A4_rcmpeq : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = cmp.eq($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
+tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110011010;
@@ -2979,7 +2993,7 @@ def A4_rcmpeqi : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rd32 = cmp.eq($Rs32,#$Ii)",
-tc_b9488031, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b01110011010;
 let hasNewValue = 1;
@@ -2996,7 +3010,7 @@ def A4_rcmpneq : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = !cmp.eq($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
+tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110011011;
@@ -3010,7 +3024,7 @@ def A4_rcmpneqi : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rd32 = !cmp.eq($Rs32,#$Ii)",
-tc_b9488031, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b01110011011;
 let hasNewValue = 1;
@@ -3027,7 +3041,7 @@ def A4_round_ri : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = round($Rs32,#$Ii)",
-tc_2b6f77c6, TypeS_2op>, Enc_a05677 {
+tc_002cb246, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100111;
@@ -3039,7 +3053,7 @@ def A4_round_ri_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = round($Rs32,#$Ii):sat",
-tc_2b6f77c6, TypeS_2op>, Enc_a05677 {
+tc_002cb246, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100111;
@@ -3052,7 +3066,7 @@ def A4_round_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = round($Rs32,$Rt32)",
-tc_2b6f77c6, TypeS_3op>, Enc_5ab2be {
+tc_002cb246, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110110;
@@ -3064,7 +3078,7 @@ def A4_round_rr_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = round($Rs32,$Rt32):sat",
-tc_2b6f77c6, TypeS_3op>, Enc_5ab2be {
+tc_002cb246, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110110;
@@ -3077,7 +3091,7 @@ def A4_subp_c : HInst<
 (outs DoubleRegs:$Rdd32, PredRegs:$Px4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
 "$Rdd32 = sub($Rss32,$Rtt32,$Px4):carry",
-tc_523fcf30, TypeS_3op>, Enc_2b3f60 {
+tc_9c3ecd83, TypeS_3op>, Enc_2b3f60 {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000010111;
@@ -3088,7 +3102,7 @@ def A4_tfrcpp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins CtrRegs64:$Css32),
 "$Rdd32 = $Css32",
-tc_29175780, TypeCR>, Enc_667b39 {
+tc_b9272d6c, TypeCR>, Enc_667b39 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01101000000;
 }
@@ -3096,7 +3110,7 @@ def A4_tfrpcp : HInst<
 (outs CtrRegs64:$Cdd32),
 (ins DoubleRegs:$Rss32),
 "$Cdd32 = $Rss32",
-tc_a21dc435, TypeCR>, Enc_0ed752 {
+tc_434c8e1e, TypeCR>, Enc_0ed752 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01100011001;
 }
@@ -3104,7 +3118,7 @@ def A4_tlbmatch : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Pd4 = tlbmatch($Rss32,$Rt32)",
-tc_04c9decc, TypeALU64>, Enc_03833b {
+tc_4837eefb, TypeALU64>, Enc_03833b {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11010010000;
@@ -3114,7 +3128,7 @@ def A4_vcmpbeq_any : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = any8(vcmpb.eq($Rss32,$Rtt32))",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11010010000;
@@ -3123,7 +3137,7 @@ def A4_vcmpbeqi : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, u8_0Imm:$Ii),
 "$Pd4 = vcmpb.eq($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_0d8adb {
+tc_643b4717, TypeALU64>, Enc_0d8adb {
 let Inst{4-2} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011100000;
@@ -3132,7 +3146,7 @@ def A4_vcmpbgt : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = vcmpb.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b010000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11010010000;
@@ -3141,7 +3155,7 @@ def A4_vcmpbgti : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
 "$Pd4 = vcmpb.gt($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_0d8adb {
+tc_643b4717, TypeALU64>, Enc_0d8adb {
 let Inst{4-2} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011100001;
@@ -3150,7 +3164,7 @@ def A4_vcmpbgtui : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
 "$Pd4 = vcmpb.gtu($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_3680c2 {
+tc_643b4717, TypeALU64>, Enc_3680c2 {
 let Inst{4-2} = 0b000;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b11011100010;
@@ -3159,7 +3173,7 @@ def A4_vcmpheqi : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
 "$Pd4 = vcmph.eq($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_0d8adb {
+tc_643b4717, TypeALU64>, Enc_0d8adb {
 let Inst{4-2} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011100000;
@@ -3168,7 +3182,7 @@ def A4_vcmphgti : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
 "$Pd4 = vcmph.gt($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_0d8adb {
+tc_643b4717, TypeALU64>, Enc_0d8adb {
 let Inst{4-2} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011100001;
@@ -3177,7 +3191,7 @@ def A4_vcmphgtui : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
 "$Pd4 = vcmph.gtu($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_3680c2 {
+tc_643b4717, TypeALU64>, Enc_3680c2 {
 let Inst{4-2} = 0b010;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b11011100010;
@@ -3186,7 +3200,7 @@ def A4_vcmpweqi : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
 "$Pd4 = vcmpw.eq($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_0d8adb {
+tc_643b4717, TypeALU64>, Enc_0d8adb {
 let Inst{4-2} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011100000;
@@ -3195,7 +3209,7 @@ def A4_vcmpwgti : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
 "$Pd4 = vcmpw.gt($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_0d8adb {
+tc_643b4717, TypeALU64>, Enc_0d8adb {
 let Inst{4-2} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11011100001;
@@ -3204,7 +3218,7 @@ def A4_vcmpwgtui : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
 "$Pd4 = vcmpw.gtu($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_3680c2 {
+tc_643b4717, TypeALU64>, Enc_3680c2 {
 let Inst{4-2} = 0b100;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b11011100010;
@@ -3213,7 +3227,7 @@ def A4_vrmaxh : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
 "$Rxx32 = vrmaxh($Rss32,$Ru32)",
-tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 {
+tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011001;
@@ -3224,7 +3238,7 @@ def A4_vrmaxuh : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
 "$Rxx32 = vrmaxuh($Rss32,$Ru32)",
-tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 {
+tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11001011001;
@@ -3235,7 +3249,7 @@ def A4_vrmaxuw : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
 "$Rxx32 = vrmaxuw($Rss32,$Ru32)",
-tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 {
+tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11001011001;
@@ -3246,7 +3260,7 @@ def A4_vrmaxw : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
 "$Rxx32 = vrmaxw($Rss32,$Ru32)",
-tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 {
+tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011001;
@@ -3257,7 +3271,7 @@ def A4_vrminh : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
 "$Rxx32 = vrminh($Rss32,$Ru32)",
-tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 {
+tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011001;
@@ -3268,7 +3282,7 @@ def A4_vrminuh : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
 "$Rxx32 = vrminuh($Rss32,$Ru32)",
-tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 {
+tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11001011001;
@@ -3279,7 +3293,7 @@ def A4_vrminuw : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
 "$Rxx32 = vrminuw($Rss32,$Ru32)",
-tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 {
+tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11001011001;
@@ -3290,7 +3304,7 @@ def A4_vrminw : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
 "$Rxx32 = vrminw($Rss32,$Ru32)",
-tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 {
+tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011001;
@@ -3301,7 +3315,7 @@ def A5_ACS : HInst<
 (outs DoubleRegs:$Rxx32, PredRegs:$Pe4),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32,$Pe4 = vacsh($Rss32,$Rtt32)",
-tc_caaebcba, TypeM>, Enc_831a7d, Requires<[HasV55]> {
+tc_d1aa9eaa, TypeM>, Enc_831a7d, Requires<[HasV55]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010101;
@@ -3314,7 +3328,7 @@ def A5_vaddhubs : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vaddhub($Rss32,$Rtt32):sat",
-tc_2b6f77c6, TypeS_3op>, Enc_d2216a {
+tc_002cb246, TypeS_3op>, Enc_d2216a {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001010;
@@ -3327,7 +3341,7 @@ def A6_vcmpbeq_notany : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = !any8(vcmpb.eq($Rss32,$Rtt32))",
-tc_55050d58, TypeALU64>, Enc_fcf7a7, Requires<[HasV65]> {
+tc_1fc97744, TypeALU64>, Enc_fcf7a7, Requires<[HasV65]> {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11010010000;
@@ -3336,7 +3350,7 @@ def A6_vminub_RdP : HInst<
 (outs DoubleRegs:$Rdd32, PredRegs:$Pe4),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32,$Pe4 = vminub($Rtt32,$Rss32)",
-tc_ef84f62f, TypeM>, Enc_d2c7f1, Requires<[HasV62]> {
+tc_f9058dd7, TypeM>, Enc_d2c7f1, Requires<[HasV62]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010111;
@@ -3347,7 +3361,7 @@ def C2_all8 : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4),
 "$Pd4 = all8($Ps4)",
-tc_f2704b9a, TypeCR>, Enc_65d691 {
+tc_de554571, TypeCR>, Enc_65d691 {
 let Inst{13-2} = 0b000000000000;
 let Inst{31-18} = 0b01101011101000;
 }
@@ -3355,7 +3369,7 @@ def C2_and : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Pt4, PredRegs:$Ps4),
 "$Pd4 = and($Pt4,$Ps4)",
-tc_53bc8a6a, TypeCR>, Enc_454a26 {
+tc_640086b5, TypeCR>, Enc_454a26 {
 let Inst{7-2} = 0b000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011000000;
@@ -3364,7 +3378,7 @@ def C2_andn : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Pt4, PredRegs:$Ps4),
 "$Pd4 = and($Pt4,!$Ps4)",
-tc_53bc8a6a, TypeCR>, Enc_454a26 {
+tc_640086b5, TypeCR>, Enc_454a26 {
 let Inst{7-2} = 0b000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011011000;
@@ -3373,7 +3387,7 @@ def C2_any8 : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4),
 "$Pd4 = any8($Ps4)",
-tc_f2704b9a, TypeCR>, Enc_65d691 {
+tc_de554571, TypeCR>, Enc_65d691 {
 let Inst{13-2} = 0b000000000000;
 let Inst{31-18} = 0b01101011100000;
 }
@@ -3381,7 +3395,7 @@ def C2_bitsclr : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = bitsclr($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111100;
@@ -3390,7 +3404,7 @@ def C2_bitsclri : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u6_0Imm:$Ii),
 "$Pd4 = bitsclr($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_5d6c34 {
+tc_643b4717, TypeS_2op>, Enc_5d6c34 {
 let Inst{7-2} = 0b000000;
 let Inst{31-21} = 0b10000101100;
 }
@@ -3398,7 +3412,7 @@ def C2_bitsset : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = bitsset($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111010;
@@ -3407,7 +3421,7 @@ def C2_ccombinewf : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4) $Rdd32 = combine($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111101000;
@@ -3419,7 +3433,7 @@ def C2_ccombinewnewf : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111101000;
@@ -3432,7 +3446,7 @@ def C2_ccombinewnewt : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
-tc_2b2f4060, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+tc_05c070ec, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11111101000;
@@ -3444,7 +3458,7 @@ def C2_ccombinewt : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pu4) $Rdd32 = combine($Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+tc_4c5ba658, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11111101000;
@@ -3455,7 +3469,7 @@ def C2_cmoveif : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, s32_0Imm:$Ii),
 "if (!$Pu4) $Rd32 = #$Ii",
-tc_b9488031, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
 let Inst{13-13} = 0b0;
 let Inst{20-20} = 0b0;
 let Inst{31-23} = 0b011111101;
@@ -3477,7 +3491,7 @@ def C2_cmoveit : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, s32_0Imm:$Ii),
 "if ($Pu4) $Rd32 = #$Ii",
-tc_b9488031, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+tc_5a2711e5, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
 let Inst{13-13} = 0b0;
 let Inst{20-20} = 0b0;
 let Inst{31-23} = 0b011111100;
@@ -3498,7 +3512,7 @@ def C2_cmovenewif : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, s32_0Imm:$Ii),
 "if (!$Pu4.new) $Rd32 = #$Ii",
-tc_5f6847a1, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
 let Inst{13-13} = 0b1;
 let Inst{20-20} = 0b0;
 let Inst{31-23} = 0b011111101;
@@ -3521,7 +3535,7 @@ def C2_cmovenewit : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, s32_0Imm:$Ii),
 "if ($Pu4.new) $Rd32 = #$Ii",
-tc_5f6847a1, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+tc_1ae57e39, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
 let Inst{13-13} = 0b1;
 let Inst{20-20} = 0b0;
 let Inst{31-23} = 0b011111100;
@@ -3543,7 +3557,7 @@ def C2_cmpeq : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmp.eq($Rs32,$Rt32)",
-tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110010000;
@@ -3556,7 +3570,7 @@ def C2_cmpeqi : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Pd4 = cmp.eq($Rs32,#$Ii)",
-tc_6ebb4a12, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
 let Inst{4-2} = 0b000;
 let Inst{31-22} = 0b0111010100;
 let CextOpcode = "C2_cmpeq";
@@ -3572,7 +3586,7 @@ def C2_cmpeqp : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = cmp.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010100;
@@ -3583,7 +3597,7 @@ def C2_cmpgei : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, s8_0Imm:$Ii),
 "$Pd4 = cmp.ge($Rs32,#$Ii)",
-tc_6ebb4a12, TypeALU32_2op> {
+tc_56f114f4, TypeALU32_2op> {
 let isCompare = 1;
 let isPseudo = 1;
 }
@@ -3591,7 +3605,7 @@ def C2_cmpgeui : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u8_0Imm:$Ii),
 "$Pd4 = cmp.geu($Rs32,#$Ii)",
-tc_6ebb4a12, TypeALU32_2op> {
+tc_56f114f4, TypeALU32_2op> {
 let isCompare = 1;
 let isPseudo = 1;
 }
@@ -3599,7 +3613,7 @@ def C2_cmpgt : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmp.gt($Rs32,$Rt32)",
-tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110010010;
@@ -3611,7 +3625,7 @@ def C2_cmpgti : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Pd4 = cmp.gt($Rs32,#$Ii)",
-tc_6ebb4a12, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
 let Inst{4-2} = 0b000;
 let Inst{31-22} = 0b0111010101;
 let CextOpcode = "C2_cmpgt";
@@ -3627,7 +3641,7 @@ def C2_cmpgtp : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = cmp.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b010000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010100;
@@ -3637,7 +3651,7 @@ def C2_cmpgtu : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmp.gtu($Rs32,$Rt32)",
-tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110010011;
@@ -3649,7 +3663,7 @@ def C2_cmpgtui : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii),
 "$Pd4 = cmp.gtu($Rs32,#$Ii)",
-tc_6ebb4a12, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
+tc_56f114f4, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
 let Inst{4-2} = 0b000;
 let Inst{31-21} = 0b01110101100;
 let CextOpcode = "C2_cmpgtu";
@@ -3665,7 +3679,7 @@ def C2_cmpgtup : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = cmp.gtu($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010100;
@@ -3675,7 +3689,7 @@ def C2_cmplt : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmp.lt($Rs32,$Rt32)",
-tc_6ebb4a12, TypeALU32_3op> {
+tc_56f114f4, TypeALU32_3op> {
 let isCompare = 1;
 let isPseudo = 1;
 let isCodeGenOnly = 1;
@@ -3684,7 +3698,7 @@ def C2_cmpltu : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = cmp.ltu($Rs32,$Rt32)",
-tc_6ebb4a12, TypeALU32_3op> {
+tc_56f114f4, TypeALU32_3op> {
 let isCompare = 1;
 let isPseudo = 1;
 let isCodeGenOnly = 1;
@@ -3693,7 +3707,7 @@ def C2_mask : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4),
 "$Rdd32 = mask($Pt4)",
-tc_cde8b071, TypeS_2op>, Enc_78e566 {
+tc_0ae0825c, TypeS_2op>, Enc_78e566 {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b0000;
 let Inst{31-16} = 0b1000011000000000;
@@ -3702,7 +3716,7 @@ def C2_mux : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mux($Pu4,$Rs32,$Rt32)",
-tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54 {
+tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54 {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110100000;
@@ -3714,7 +3728,7 @@ def C2_muxii : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, s32_0Imm:$Ii, s8_0Imm:$II),
 "$Rd32 = mux($Pu4,#$Ii,#$II)",
-tc_d6bf0472, TypeALU32_2op>, Enc_830e5d {
+tc_4c5ba658, TypeALU32_2op>, Enc_830e5d {
 let Inst{31-25} = 0b0111101;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -3728,7 +3742,7 @@ def C2_muxir : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rd32 = mux($Pu4,$Rs32,#$Ii)",
-tc_d6bf0472, TypeALU32_2op>, Enc_e38e1f {
+tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f {
 let Inst{13-13} = 0b0;
 let Inst{31-23} = 0b011100110;
 let hasNewValue = 1;
@@ -3744,7 +3758,7 @@ def C2_muxri : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pu4, s32_0Imm:$Ii, IntRegs:$Rs32),
 "$Rd32 = mux($Pu4,#$Ii,$Rs32)",
-tc_d6bf0472, TypeALU32_2op>, Enc_e38e1f {
+tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f {
 let Inst{13-13} = 0b0;
 let Inst{31-23} = 0b011100111;
 let hasNewValue = 1;
@@ -3760,7 +3774,7 @@ def C2_not : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4),
 "$Pd4 = not($Ps4)",
-tc_f2704b9a, TypeCR>, Enc_65d691 {
+tc_de554571, TypeCR>, Enc_65d691 {
 let Inst{13-2} = 0b000000000000;
 let Inst{31-18} = 0b01101011110000;
 }
@@ -3768,7 +3782,7 @@ def C2_or : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Pt4, PredRegs:$Ps4),
 "$Pd4 = or($Pt4,$Ps4)",
-tc_53bc8a6a, TypeCR>, Enc_454a26 {
+tc_640086b5, TypeCR>, Enc_454a26 {
 let Inst{7-2} = 0b000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011001000;
@@ -3777,7 +3791,7 @@ def C2_orn : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Pt4, PredRegs:$Ps4),
 "$Pd4 = or($Pt4,!$Ps4)",
-tc_53bc8a6a, TypeCR>, Enc_454a26 {
+tc_640086b5, TypeCR>, Enc_454a26 {
 let Inst{7-2} = 0b000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011111000;
@@ -3786,7 +3800,7 @@ def C2_pxfer_map : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4),
 "$Pd4 = $Ps4",
-tc_53bc8a6a, TypeMAPPING> {
+tc_640086b5, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -3794,7 +3808,7 @@ def C2_tfrpr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Ps4),
 "$Rd32 = $Ps4",
-tc_cde8b071, TypeS_2op>, Enc_f5e933 {
+tc_0ae0825c, TypeS_2op>, Enc_f5e933 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-18} = 0b10001001010000;
 let hasNewValue = 1;
@@ -3804,7 +3818,7 @@ def C2_tfrrp : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32),
 "$Pd4 = $Rs32",
-tc_351fed2d, TypeS_2op>, Enc_48b75f {
+tc_cfd8378a, TypeS_2op>, Enc_48b75f {
 let Inst{13-2} = 0b000000000000;
 let Inst{31-21} = 0b10000101010;
 }
@@ -3812,7 +3826,7 @@ def C2_vitpack : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Ps4, PredRegs:$Pt4),
 "$Rd32 = vitpack($Ps4,$Pt4)",
-tc_1b9c9ee5, TypeS_2op>, Enc_527412 {
+tc_4414d8b1, TypeS_2op>, Enc_527412 {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b10001001000000;
@@ -3824,7 +3838,7 @@ def C2_vmux : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pu4, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmux($Pu4,$Rss32,$Rtt32)",
-tc_f8eeed7a, TypeALU64>, Enc_329361 {
+tc_b4b5c03a, TypeALU64>, Enc_329361 {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010001000;
@@ -3833,7 +3847,7 @@ def C2_xor : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4),
 "$Pd4 = xor($Ps4,$Pt4)",
-tc_53bc8a6a, TypeCR>, Enc_284ebb {
+tc_640086b5, TypeCR>, Enc_284ebb {
 let Inst{7-2} = 0b000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011010000;
@@ -3842,7 +3856,7 @@ def C4_addipc : HInst<
 (outs IntRegs:$Rd32),
 (ins u32_0Imm:$Ii),
 "$Rd32 = add(pc,#$Ii)",
-tc_b9c4623f, TypeCR>, Enc_607661 {
+tc_a813cf9a, TypeCR>, Enc_607661 {
 let Inst{6-5} = 0b00;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0110101001001001;
@@ -3858,7 +3872,7 @@ def C4_and_and : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
 "$Pd4 = and($Ps4,and($Pt4,$Pu4))",
-tc_481e5e5c, TypeCR>, Enc_9ac432 {
+tc_b31c2e97, TypeCR>, Enc_9ac432 {
 let Inst{5-2} = 0b0000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011000100;
@@ -3867,7 +3881,7 @@ def C4_and_andn : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
 "$Pd4 = and($Ps4,and($Pt4,!$Pu4))",
-tc_481e5e5c, TypeCR>, Enc_9ac432 {
+tc_b31c2e97, TypeCR>, Enc_9ac432 {
 let Inst{5-2} = 0b0000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011100100;
@@ -3876,7 +3890,7 @@ def C4_and_or : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
 "$Pd4 = and($Ps4,or($Pt4,$Pu4))",
-tc_481e5e5c, TypeCR>, Enc_9ac432 {
+tc_b31c2e97, TypeCR>, Enc_9ac432 {
 let Inst{5-2} = 0b0000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011001100;
@@ -3885,7 +3899,7 @@ def C4_and_orn : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
 "$Pd4 = and($Ps4,or($Pt4,!$Pu4))",
-tc_481e5e5c, TypeCR>, Enc_9ac432 {
+tc_b31c2e97, TypeCR>, Enc_9ac432 {
 let Inst{5-2} = 0b0000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011101100;
@@ -3894,7 +3908,7 @@ def C4_cmplte : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = !cmp.gt($Rs32,$Rt32)",
-tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b000100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110010010;
@@ -3906,7 +3920,7 @@ def C4_cmpltei : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Pd4 = !cmp.gt($Rs32,#$Ii)",
-tc_6ebb4a12, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
 let Inst{4-2} = 0b100;
 let Inst{31-22} = 0b0111010101;
 let CextOpcode = "C4_cmplte";
@@ -3922,7 +3936,7 @@ def C4_cmplteu : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = !cmp.gtu($Rs32,$Rt32)",
-tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b000100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110010011;
@@ -3934,7 +3948,7 @@ def C4_cmplteui : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii),
 "$Pd4 = !cmp.gtu($Rs32,#$Ii)",
-tc_6ebb4a12, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
+tc_56f114f4, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
 let Inst{4-2} = 0b100;
 let Inst{31-21} = 0b01110101100;
 let CextOpcode = "C4_cmplteu";
@@ -3950,7 +3964,7 @@ def C4_cmpneq : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = !cmp.eq($Rs32,$Rt32)",
-tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
 let Inst{7-2} = 0b000100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110010000;
@@ -3963,7 +3977,7 @@ def C4_cmpneqi : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Pd4 = !cmp.eq($Rs32,#$Ii)",
-tc_6ebb4a12, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
 let Inst{4-2} = 0b100;
 let Inst{31-22} = 0b0111010100;
 let CextOpcode = "C4_cmpneq";
@@ -3979,7 +3993,7 @@ def C4_fastcorner9 : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4),
 "$Pd4 = fastcorner9($Ps4,$Pt4)",
-tc_53bc8a6a, TypeCR>, Enc_284ebb {
+tc_640086b5, TypeCR>, Enc_284ebb {
 let Inst{7-2} = 0b100100;
 let Inst{13-10} = 0b1000;
 let Inst{31-18} = 0b01101011000000;
@@ -3988,7 +4002,7 @@ def C4_fastcorner9_not : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4),
 "$Pd4 = !fastcorner9($Ps4,$Pt4)",
-tc_53bc8a6a, TypeCR>, Enc_284ebb {
+tc_640086b5, TypeCR>, Enc_284ebb {
 let Inst{7-2} = 0b100100;
 let Inst{13-10} = 0b1000;
 let Inst{31-18} = 0b01101011000100;
@@ -3997,7 +4011,7 @@ def C4_nbitsclr : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = !bitsclr($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111101;
@@ -4006,7 +4020,7 @@ def C4_nbitsclri : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u6_0Imm:$Ii),
 "$Pd4 = !bitsclr($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_5d6c34 {
+tc_643b4717, TypeS_2op>, Enc_5d6c34 {
 let Inst{7-2} = 0b000000;
 let Inst{31-21} = 0b10000101101;
 }
@@ -4014,7 +4028,7 @@ def C4_nbitsset : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = !bitsset($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111011;
@@ -4023,7 +4037,7 @@ def C4_or_and : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
 "$Pd4 = or($Ps4,and($Pt4,$Pu4))",
-tc_481e5e5c, TypeCR>, Enc_9ac432 {
+tc_b31c2e97, TypeCR>, Enc_9ac432 {
 let Inst{5-2} = 0b0000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011010100;
@@ -4032,7 +4046,7 @@ def C4_or_andn : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
 "$Pd4 = or($Ps4,and($Pt4,!$Pu4))",
-tc_481e5e5c, TypeCR>, Enc_9ac432 {
+tc_b31c2e97, TypeCR>, Enc_9ac432 {
 let Inst{5-2} = 0b0000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011110100;
@@ -4041,7 +4055,7 @@ def C4_or_or : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
 "$Pd4 = or($Ps4,or($Pt4,$Pu4))",
-tc_481e5e5c, TypeCR>, Enc_9ac432 {
+tc_b31c2e97, TypeCR>, Enc_9ac432 {
 let Inst{5-2} = 0b0000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011011100;
@@ -4050,7 +4064,7 @@ def C4_or_orn : HInst<
 (outs PredRegs:$Pd4),
 (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
 "$Pd4 = or($Ps4,or($Pt4,!$Pu4))",
-tc_481e5e5c, TypeCR>, Enc_9ac432 {
+tc_b31c2e97, TypeCR>, Enc_9ac432 {
 let Inst{5-2} = 0b0000;
 let Inst{13-10} = 0b0000;
 let Inst{31-18} = 0b01101011111100;
@@ -4059,7 +4073,7 @@ def F2_conv_d2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_d2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
+tc_3a867367, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4069,7 +4083,7 @@ def F2_conv_d2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_d2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
+tc_3a867367, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000010;
 let hasNewValue = 1;
@@ -4081,7 +4095,7 @@ def F2_conv_df2d : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2d($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
+tc_3a867367, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4091,7 +4105,7 @@ def F2_conv_df2d_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2d($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
+tc_3a867367, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4101,7 +4115,7 @@ def F2_conv_df2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
+tc_3a867367, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000000;
 let hasNewValue = 1;
@@ -4113,7 +4127,7 @@ def F2_conv_df2ud : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2ud($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
+tc_3a867367, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4123,7 +4137,7 @@ def F2_conv_df2ud_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2ud($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
+tc_3a867367, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4133,7 +4147,7 @@ def F2_conv_df2uw : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2uw($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
+tc_3a867367, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000011;
 let hasNewValue = 1;
@@ -4145,7 +4159,7 @@ def F2_conv_df2uw_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2uw($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
+tc_3a867367, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000101;
 let hasNewValue = 1;
@@ -4157,7 +4171,7 @@ def F2_conv_df2w : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2w($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
+tc_3a867367, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000100;
 let hasNewValue = 1;
@@ -4169,7 +4183,7 @@ def F2_conv_df2w_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2w($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
+tc_3a867367, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000111;
 let hasNewValue = 1;
@@ -4181,7 +4195,7 @@ def F2_conv_sf2d : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2d($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
+tc_3a867367, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4191,7 +4205,7 @@ def F2_conv_sf2d_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2d($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
+tc_3a867367, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4201,7 +4215,7 @@ def F2_conv_sf2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
+tc_3a867367, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4211,7 +4225,7 @@ def F2_conv_sf2ud : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2ud($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
+tc_3a867367, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4221,7 +4235,7 @@ def F2_conv_sf2ud_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2ud($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
+tc_3a867367, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4231,7 +4245,7 @@ def F2_conv_sf2uw : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2uw($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
+tc_3a867367, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011011;
 let hasNewValue = 1;
@@ -4243,7 +4257,7 @@ def F2_conv_sf2uw_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2uw($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
+tc_3a867367, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001011011;
 let hasNewValue = 1;
@@ -4255,7 +4269,7 @@ def F2_conv_sf2w : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2w($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
+tc_3a867367, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011100;
 let hasNewValue = 1;
@@ -4267,7 +4281,7 @@ def F2_conv_sf2w_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2w($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
+tc_3a867367, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001011100;
 let hasNewValue = 1;
@@ -4279,7 +4293,7 @@ def F2_conv_ud2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_ud2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
+tc_3a867367, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4289,7 +4303,7 @@ def F2_conv_ud2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_ud2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
+tc_3a867367, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000001;
 let hasNewValue = 1;
@@ -4301,7 +4315,7 @@ def F2_conv_uw2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_uw2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
+tc_3a867367, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4311,7 +4325,7 @@ def F2_conv_uw2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_uw2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
+tc_3a867367, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011001;
 let hasNewValue = 1;
@@ -4323,7 +4337,7 @@ def F2_conv_w2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_w2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
+tc_3a867367, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4333,7 +4347,7 @@ def F2_conv_w2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_w2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
+tc_3a867367, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011010;
 let hasNewValue = 1;
@@ -4341,11 +4355,22 @@ let opNewValue = 0;
 let isFP = 1;
 let Uses = [USR];
 }
+def F2_dfadd : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = dfadd($Rss32,$Rtt32)",
+tc_2f7c551d, TypeM>, Enc_a56825, Requires<[HasV66]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let isFP = 1;
+let Uses = [USR];
+}
 def F2_dfclass : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
 "$Pd4 = dfclass($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_1f19b5 {
+tc_643b4717, TypeALU64>, Enc_1f19b5 {
 let Inst{4-2} = 0b100;
 let Inst{13-10} = 0b0000;
 let Inst{31-21} = 0b11011100100;
@@ -4356,7 +4381,7 @@ def F2_dfcmpeq : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4368,7 +4393,7 @@ def F2_dfcmpge : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b010000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4380,7 +4405,7 @@ def F2_dfcmpgt : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4392,7 +4417,7 @@ def F2_dfcmpuo : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
+tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4404,7 +4429,7 @@ def F2_dfimm_n : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins u10_0Imm:$Ii),
 "$Rdd32 = dfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_e6c957 {
+tc_9e313203, TypeALU64>, Enc_e6c957 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101100101;
 let prefersSlot3 = 1;
@@ -4413,16 +4438,27 @@ def F2_dfimm_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins u10_0Imm:$Ii),
 "$Rdd32 = dfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_e6c957 {
+tc_9e313203, TypeALU64>, Enc_e6c957 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101100100;
 let prefersSlot3 = 1;
 }
+def F2_dfsub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = dfsub($Rss32,$Rtt32)",
+tc_2f7c551d, TypeM>, Enc_a56825, Requires<[HasV66]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let isFP = 1;
+let Uses = [USR];
+}
 def F2_sfadd : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfadd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be {
+tc_3b470976, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011000;
@@ -4436,7 +4472,7 @@ def F2_sfclass : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Pd4 = sfclass($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_83ee64 {
+tc_643b4717, TypeS_2op>, Enc_83ee64 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10000101111;
@@ -4447,7 +4483,7 @@ def F2_sfcmpeq : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.eq($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4459,7 +4495,7 @@ def F2_sfcmpge : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.ge($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4471,7 +4507,7 @@ def F2_sfcmpgt : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.gt($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4483,7 +4519,7 @@ def F2_sfcmpuo : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.uo($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4495,7 +4531,7 @@ def F2_sffixupd : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sffixupd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be {
+tc_3b470976, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011110;
@@ -4507,7 +4543,7 @@ def F2_sffixupn : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sffixupn($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be {
+tc_3b470976, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011110;
@@ -4519,7 +4555,7 @@ def F2_sffixupr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = sffixupr($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
+tc_3a867367, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011101;
 let hasNewValue = 1;
@@ -4530,7 +4566,7 @@ def F2_sffma : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154 {
+tc_a58fd5cc, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4544,7 +4580,7 @@ def F2_sffma_lib : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154 {
+tc_a58fd5cc, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4558,7 +4594,7 @@ def F2_sffma_sc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
 "$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
-tc_038a1342, TypeM>, Enc_437f33 {
+tc_4560740b, TypeM>, Enc_437f33 {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111011;
@@ -4572,7 +4608,7 @@ def F2_sffms : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154 {
+tc_a58fd5cc, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4586,7 +4622,7 @@ def F2_sffms_lib : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154 {
+tc_a58fd5cc, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4600,7 +4636,7 @@ def F2_sfimm_n : HInst<
 (outs IntRegs:$Rd32),
 (ins u10_0Imm:$Ii),
 "$Rd32 = sfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_6c9440 {
+tc_9e313203, TypeALU64>, Enc_6c9440 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101011001;
 let hasNewValue = 1;
@@ -4611,7 +4647,7 @@ def F2_sfimm_p : HInst<
 (outs IntRegs:$Rd32),
 (ins u10_0Imm:$Ii),
 "$Rd32 = sfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_6c9440 {
+tc_9e313203, TypeALU64>, Enc_6c9440 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101011000;
 let hasNewValue = 1;
@@ -4622,7 +4658,7 @@ def F2_sfinvsqrta : HInst<
 (outs IntRegs:$Rd32, PredRegs:$Pe4),
 (ins IntRegs:$Rs32),
 "$Rd32,$Pe4 = sfinvsqrta($Rs32)",
-tc_4d99bca9, TypeS_2op>, Enc_890909 {
+tc_b8bffe55, TypeS_2op>, Enc_890909 {
 let Inst{13-7} = 0b0000000;
 let Inst{31-21} = 0b10001011111;
 let hasNewValue = 1;
@@ -4634,7 +4670,7 @@ def F2_sfmax : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmax($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be {
+tc_88b4f13d, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011100;
@@ -4648,7 +4684,7 @@ def F2_sfmin : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmin($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be {
+tc_88b4f13d, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011100;
@@ -4662,7 +4698,7 @@ def F2_sfmpy : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmpy($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be {
+tc_3b470976, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011010;
@@ -4676,7 +4712,7 @@ def F2_sfrecipa : HInst<
 (outs IntRegs:$Rd32, PredRegs:$Pe4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
-tc_9c00ce8d, TypeM>, Enc_a94f3b {
+tc_2ff964b4, TypeM>, Enc_a94f3b {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011111;
@@ -4689,7 +4725,7 @@ def F2_sfsub : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfsub($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be {
+tc_3b470976, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011000;
@@ -4702,7 +4738,7 @@ def G4_tfrgcpp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins GuestRegs64:$Gss32),
 "$Rdd32 = $Gss32",
-tc_6fa4db47, TypeCR>, Enc_0aa344 {
+tc_0d8f5752, TypeCR>, Enc_0aa344 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01101000001;
 }
@@ -4710,7 +4746,7 @@ def G4_tfrgcrr : HInst<
 (outs IntRegs:$Rd32),
 (ins GuestRegs:$Gs32),
 "$Rd32 = $Gs32",
-tc_6fa4db47, TypeCR>, Enc_44271f {
+tc_0d8f5752, TypeCR>, Enc_44271f {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01101010001;
 let hasNewValue = 1;
@@ -4720,7 +4756,7 @@ def G4_tfrgpcp : HInst<
 (outs GuestRegs64:$Gdd32),
 (ins DoubleRegs:$Rss32),
 "$Gdd32 = $Rss32",
-tc_994333cd, TypeCR>, Enc_ed5027 {
+tc_bcf98408, TypeCR>, Enc_ed5027 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01100011000;
 let hasNewValue = 1;
@@ -4730,7 +4766,7 @@ def G4_tfrgrcr : HInst<
 (outs GuestRegs:$Gd32),
 (ins IntRegs:$Rs32),
 "$Gd32 = $Rs32",
-tc_994333cd, TypeCR>, Enc_621fba {
+tc_bcf98408, TypeCR>, Enc_621fba {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b01100010000;
 let hasNewValue = 1;
@@ -4740,7 +4776,7 @@ def J2_call : HInst<
 (outs),
 (ins a30_2Imm:$Ii),
 "call $Ii",
-tc_a27582fa, TypeJ>, Enc_81ac1d, PredRel {
+tc_4ae7b58b, TypeJ>, Enc_81ac1d, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{31-25} = 0b0101101;
 let isCall = 1;
@@ -4762,7 +4798,7 @@ def J2_callf : HInst<
 (outs),
 (ins PredRegs:$Pu4, a30_2Imm:$Ii),
 "if (!$Pu4) call $Ii",
-tc_2f185f5c, TypeJ>, Enc_daea09, PredRel {
+tc_1d81e60e, TypeJ>, Enc_daea09, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b000;
 let Inst{21-21} = 0b1;
@@ -4789,7 +4825,7 @@ def J2_callr : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "callr $Rs32",
-tc_15411484, TypeJ>, Enc_ecbcc8 {
+tc_3bd75825, TypeJ>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b01010000101;
 let isCall = 1;
@@ -4803,7 +4839,7 @@ def J2_callrf : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) callr $Rs32",
-tc_10b97e27, TypeJ>, Enc_88d4d9 {
+tc_1ad90acd, TypeJ>, Enc_88d4d9 {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-21} = 0b01010001001;
@@ -4821,7 +4857,7 @@ def J2_callrt : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) callr $Rs32",
-tc_10b97e27, TypeJ>, Enc_88d4d9 {
+tc_1ad90acd, TypeJ>, Enc_88d4d9 {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-21} = 0b01010001000;
@@ -4838,7 +4874,7 @@ def J2_callt : HInst<
 (outs),
 (ins PredRegs:$Pu4, a30_2Imm:$Ii),
 "if ($Pu4) call $Ii",
-tc_2f185f5c, TypeJ>, Enc_daea09, PredRel {
+tc_1d81e60e, TypeJ>, Enc_daea09, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b000;
 let Inst{21-21} = 0b0;
@@ -4864,7 +4900,7 @@ def J2_endloop0 : HInst<
 (outs),
 (ins),
 "endloop0",
-tc_52d7bbea, TypeJ> {
+tc_1b6f7cec, TypeJ> {
 let Uses = [LC0, SA0];
 let Defs = [LC0, P3, PC, USR];
 let isBranch = 1;
@@ -4875,7 +4911,7 @@ def J2_endloop01 : HInst<
 (outs),
 (ins),
 "endloop01",
-tc_52d7bbea, TypeJ> {
+tc_1b6f7cec, TypeJ> {
 let Uses = [LC0, LC1, SA0, SA1];
 let Defs = [LC0, LC1, P3, PC, USR];
 let isPseudo = 1;
@@ -4884,7 +4920,7 @@ def J2_endloop1 : HInst<
 (outs),
 (ins),
 "endloop1",
-tc_52d7bbea, TypeJ> {
+tc_1b6f7cec, TypeJ> {
 let Uses = [LC1, SA1];
 let Defs = [LC1, PC];
 let isBranch = 1;
@@ -4895,7 +4931,7 @@ def J2_jump : HInst<
 (outs),
 (ins b30_2Imm:$Ii),
 "jump $Ii",
-tc_3669266a, TypeJ>, Enc_81ac1d, PredNewRel {
+tc_ae53734a, TypeJ>, Enc_81ac1d, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{31-25} = 0b0101100;
 let isTerminator = 1;
@@ -4917,7 +4953,7 @@ def J2_jumpf : HInst<
 (outs),
 (ins PredRegs:$Pu4, b30_2Imm:$Ii),
 "if (!$Pu4) jump:nt $Ii",
-tc_e9fae2d6, TypeJ>, Enc_daea09, PredNewRel {
+tc_db2bce9c, TypeJ>, Enc_daea09, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b000;
 let Inst{21-21} = 0b1;
@@ -4943,7 +4979,7 @@ def J2_jumpf_nopred_map : HInst<
 (outs),
 (ins PredRegs:$Pu4, b15_2Imm:$Ii),
 "if (!$Pu4) jump $Ii",
-tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60]> {
+tc_db2bce9c, TypeMAPPING>, Requires<[HasV60]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -4951,7 +4987,7 @@ def J2_jumpfnew : HInst<
 (outs),
 (ins PredRegs:$Pu4, b30_2Imm:$Ii),
 "if (!$Pu4.new) jump:nt $Ii",
-tc_a46f0df5, TypeJ>, Enc_daea09, PredNewRel {
+tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b010;
 let Inst{21-21} = 0b1;
@@ -4978,7 +5014,7 @@ def J2_jumpfnewpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, b30_2Imm:$Ii),
 "if (!$Pu4.new) jump:t $Ii",
-tc_a46f0df5, TypeJ>, Enc_daea09, PredNewRel {
+tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b110;
 let Inst{21-21} = 0b1;
@@ -5005,7 +5041,7 @@ def J2_jumpfpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, b30_2Imm:$Ii),
 "if (!$Pu4) jump:t $Ii",
-tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
+tc_cd374165, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b100;
 let Inst{21-21} = 0b1;
@@ -5031,7 +5067,7 @@ def J2_jumpr : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "jumpr $Rs32",
-tc_9faf76ae, TypeJ>, Enc_ecbcc8, PredNewRel {
+tc_d5b7b0c1, TypeJ>, Enc_ecbcc8, PredNewRel {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b01010010100;
 let isTerminator = 1;
@@ -5048,7 +5084,7 @@ def J2_jumprf : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) jumpr:nt $Rs32",
-tc_e0739b8c, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_85c9c08f, TypeJ>, Enc_88d4d9, PredNewRel {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-21} = 0b01010011011;
@@ -5067,7 +5103,7 @@ def J2_jumprf_nopred_map : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) jumpr $Rs32",
-tc_e0739b8c, TypeMAPPING>, Requires<[HasV60]> {
+tc_85c9c08f, TypeMAPPING>, Requires<[HasV60]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -5075,7 +5111,7 @@ def J2_jumprfnew : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) jumpr:nt $Rs32",
-tc_181af5d0, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0010;
 let Inst{31-21} = 0b01010011011;
@@ -5095,7 +5131,7 @@ def J2_jumprfnewpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4.new) jumpr:t $Rs32",
-tc_181af5d0, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0110;
 let Inst{31-21} = 0b01010011011;
@@ -5115,7 +5151,7 @@ def J2_jumprfpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if (!$Pu4) jumpr:t $Rs32",
-tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
+tc_e78647bd, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0100;
 let Inst{31-21} = 0b01010011011;
@@ -5134,7 +5170,7 @@ def J2_jumprgtez : HInst<
 (outs),
 (ins IntRegs:$Rs32, b13_2Imm:$Ii),
 "if ($Rs32>=#0) jump:nt $Ii",
-tc_73043bf4, TypeCR>, Enc_0fa531 {
+tc_d9d43ecb, TypeCR>, Enc_0fa531 {
 let Inst{0-0} = 0b0;
 let Inst{12-12} = 0b0;
 let Inst{31-22} = 0b0110000101;
@@ -5152,7 +5188,7 @@ def J2_jumprgtezpt : HInst<
 (outs),
 (ins IntRegs:$Rs32, b13_2Imm:$Ii),
 "if ($Rs32>=#0) jump:t $Ii",
-tc_73043bf4, TypeCR>, Enc_0fa531 {
+tc_d9d43ecb, TypeCR>, Enc_0fa531 {
 let Inst{0-0} = 0b0;
 let Inst{12-12} = 0b1;
 let Inst{31-22} = 0b0110000101;
@@ -5170,7 +5206,7 @@ def J2_jumprltez : HInst<
 (outs),
 (ins IntRegs:$Rs32, b13_2Imm:$Ii),
 "if ($Rs32<=#0) jump:nt $Ii",
-tc_73043bf4, TypeCR>, Enc_0fa531 {
+tc_d9d43ecb, TypeCR>, Enc_0fa531 {
 let Inst{0-0} = 0b0;
 let Inst{12-12} = 0b0;
 let Inst{31-22} = 0b0110000111;
@@ -5188,7 +5224,7 @@ def J2_jumprltezpt : HInst<
 (outs),
 (ins IntRegs:$Rs32, b13_2Imm:$Ii),
 "if ($Rs32<=#0) jump:t $Ii",
-tc_73043bf4, TypeCR>, Enc_0fa531 {
+tc_d9d43ecb, TypeCR>, Enc_0fa531 {
 let Inst{0-0} = 0b0;
 let Inst{12-12} = 0b1;
 let Inst{31-22} = 0b0110000111;
@@ -5206,7 +5242,7 @@ def J2_jumprnz : HInst<
 (outs),
 (ins IntRegs:$Rs32, b13_2Imm:$Ii),
 "if ($Rs32==#0) jump:nt $Ii",
-tc_73043bf4, TypeCR>, Enc_0fa531 {
+tc_d9d43ecb, TypeCR>, Enc_0fa531 {
 let Inst{0-0} = 0b0;
 let Inst{12-12} = 0b0;
 let Inst{31-22} = 0b0110000110;
@@ -5224,7 +5260,7 @@ def J2_jumprnzpt : HInst<
 (outs),
 (ins IntRegs:$Rs32, b13_2Imm:$Ii),
 "if ($Rs32==#0) jump:t $Ii",
-tc_73043bf4, TypeCR>, Enc_0fa531 {
+tc_d9d43ecb, TypeCR>, Enc_0fa531 {
 let Inst{0-0} = 0b0;
 let Inst{12-12} = 0b1;
 let Inst{31-22} = 0b0110000110;
@@ -5242,7 +5278,7 @@ def J2_jumprt : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) jumpr:nt $Rs32",
-tc_e0739b8c, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_85c9c08f, TypeJ>, Enc_88d4d9, PredNewRel {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0000;
 let Inst{31-21} = 0b01010011010;
@@ -5260,7 +5296,7 @@ def J2_jumprt_nopred_map : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) jumpr $Rs32",
-tc_e0739b8c, TypeMAPPING>, Requires<[HasV60]> {
+tc_85c9c08f, TypeMAPPING>, Requires<[HasV60]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -5268,7 +5304,7 @@ def J2_jumprtnew : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) jumpr:nt $Rs32",
-tc_181af5d0, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0010;
 let Inst{31-21} = 0b01010011010;
@@ -5287,7 +5323,7 @@ def J2_jumprtnewpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4.new) jumpr:t $Rs32",
-tc_181af5d0, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0110;
 let Inst{31-21} = 0b01010011010;
@@ -5306,7 +5342,7 @@ def J2_jumprtpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, IntRegs:$Rs32),
 "if ($Pu4) jumpr:t $Rs32",
-tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
+tc_e78647bd, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
 let Inst{7-0} = 0b00000000;
 let Inst{13-10} = 0b0100;
 let Inst{31-21} = 0b01010011010;
@@ -5324,7 +5360,7 @@ def J2_jumprz : HInst<
 (outs),
 (ins IntRegs:$Rs32, b13_2Imm:$Ii),
 "if ($Rs32!=#0) jump:nt $Ii",
-tc_73043bf4, TypeCR>, Enc_0fa531 {
+tc_d9d43ecb, TypeCR>, Enc_0fa531 {
 let Inst{0-0} = 0b0;
 let Inst{12-12} = 0b0;
 let Inst{31-22} = 0b0110000100;
@@ -5342,7 +5378,7 @@ def J2_jumprzpt : HInst<
 (outs),
 (ins IntRegs:$Rs32, b13_2Imm:$Ii),
 "if ($Rs32!=#0) jump:t $Ii",
-tc_73043bf4, TypeCR>, Enc_0fa531 {
+tc_d9d43ecb, TypeCR>, Enc_0fa531 {
 let Inst{0-0} = 0b0;
 let Inst{12-12} = 0b1;
 let Inst{31-22} = 0b0110000100;
@@ -5360,7 +5396,7 @@ def J2_jumpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, b30_2Imm:$Ii),
 "if ($Pu4) jump:nt $Ii",
-tc_e9fae2d6, TypeJ>, Enc_daea09, PredNewRel {
+tc_db2bce9c, TypeJ>, Enc_daea09, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b000;
 let Inst{21-21} = 0b0;
@@ -5385,7 +5421,7 @@ def J2_jumpt_nopred_map : HInst<
 (outs),
 (ins PredRegs:$Pu4, b15_2Imm:$Ii),
 "if ($Pu4) jump $Ii",
-tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60]> {
+tc_db2bce9c, TypeMAPPING>, Requires<[HasV60]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -5393,7 +5429,7 @@ def J2_jumptnew : HInst<
 (outs),
 (ins PredRegs:$Pu4, b30_2Imm:$Ii),
 "if ($Pu4.new) jump:nt $Ii",
-tc_a46f0df5, TypeJ>, Enc_daea09, PredNewRel {
+tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b010;
 let Inst{21-21} = 0b0;
@@ -5419,7 +5455,7 @@ def J2_jumptnewpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, b30_2Imm:$Ii),
 "if ($Pu4.new) jump:t $Ii",
-tc_a46f0df5, TypeJ>, Enc_daea09, PredNewRel {
+tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b110;
 let Inst{21-21} = 0b0;
@@ -5445,7 +5481,7 @@ def J2_jumptpt : HInst<
 (outs),
 (ins PredRegs:$Pu4, b30_2Imm:$Ii),
 "if ($Pu4) jump:t $Ii",
-tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
+tc_cd374165, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
 let Inst{0-0} = 0b0;
 let Inst{12-10} = 0b100;
 let Inst{21-21} = 0b0;
@@ -5470,7 +5506,7 @@ def J2_loop0i : HInst<
 (outs),
 (ins b30_2Imm:$Ii, u10_0Imm:$II),
 "loop0($Ii,#$II)",
-tc_cf59f215, TypeCR>, Enc_4dc228 {
+tc_a9d88b22, TypeCR>, Enc_4dc228 {
 let Inst{2-2} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01101001000;
@@ -5487,7 +5523,7 @@ def J2_loop0r : HInst<
 (outs),
 (ins b30_2Imm:$Ii, IntRegs:$Rs32),
 "loop0($Ii,$Rs32)",
-tc_7934b9df, TypeCR>, Enc_864a5a {
+tc_df3319ed, TypeCR>, Enc_864a5a {
 let Inst{2-0} = 0b000;
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
@@ -5505,7 +5541,7 @@ def J2_loop1i : HInst<
 (outs),
 (ins b30_2Imm:$Ii, u10_0Imm:$II),
 "loop1($Ii,#$II)",
-tc_cf59f215, TypeCR>, Enc_4dc228 {
+tc_a9d88b22, TypeCR>, Enc_4dc228 {
 let Inst{2-2} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01101001001;
@@ -5522,7 +5558,7 @@ def J2_loop1r : HInst<
 (outs),
 (ins b30_2Imm:$Ii, IntRegs:$Rs32),
 "loop1($Ii,$Rs32)",
-tc_7934b9df, TypeCR>, Enc_864a5a {
+tc_df3319ed, TypeCR>, Enc_864a5a {
 let Inst{2-0} = 0b000;
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
@@ -5540,7 +5576,7 @@ def J2_pause : HInst<
 (outs),
 (ins u8_0Imm:$Ii),
 "pause(#$Ii)",
-tc_681a2300, TypeJ>, Enc_a51a9a {
+tc_8d9d0154, TypeJ>, Enc_a51a9a {
 let Inst{1-0} = 0b00;
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
@@ -5551,7 +5587,7 @@ def J2_ploop1si : HInst<
 (outs),
 (ins b30_2Imm:$Ii, u10_0Imm:$II),
 "p3 = sp1loop0($Ii,#$II)",
-tc_c5e2426d, TypeCR>, Enc_4dc228 {
+tc_1c4528a2, TypeCR>, Enc_4dc228 {
 let Inst{2-2} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01101001101;
@@ -5569,7 +5605,7 @@ def J2_ploop1sr : HInst<
 (outs),
 (ins b30_2Imm:$Ii, IntRegs:$Rs32),
 "p3 = sp1loop0($Ii,$Rs32)",
-tc_4f7cd700, TypeCR>, Enc_864a5a {
+tc_32779c6f, TypeCR>, Enc_864a5a {
 let Inst{2-0} = 0b000;
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
@@ -5588,7 +5624,7 @@ def J2_ploop2si : HInst<
 (outs),
 (ins b30_2Imm:$Ii, u10_0Imm:$II),
 "p3 = sp2loop0($Ii,#$II)",
-tc_c5e2426d, TypeCR>, Enc_4dc228 {
+tc_1c4528a2, TypeCR>, Enc_4dc228 {
 let Inst{2-2} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01101001110;
@@ -5606,7 +5642,7 @@ def J2_ploop2sr : HInst<
 (outs),
 (ins b30_2Imm:$Ii, IntRegs:$Rs32),
 "p3 = sp2loop0($Ii,$Rs32)",
-tc_4f7cd700, TypeCR>, Enc_864a5a {
+tc_32779c6f, TypeCR>, Enc_864a5a {
 let Inst{2-0} = 0b000;
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
@@ -5625,7 +5661,7 @@ def J2_ploop3si : HInst<
 (outs),
 (ins b30_2Imm:$Ii, u10_0Imm:$II),
 "p3 = sp3loop0($Ii,#$II)",
-tc_c5e2426d, TypeCR>, Enc_4dc228 {
+tc_1c4528a2, TypeCR>, Enc_4dc228 {
 let Inst{2-2} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01101001111;
@@ -5643,7 +5679,7 @@ def J2_ploop3sr : HInst<
 (outs),
 (ins b30_2Imm:$Ii, IntRegs:$Rs32),
 "p3 = sp3loop0($Ii,$Rs32)",
-tc_4f7cd700, TypeCR>, Enc_864a5a {
+tc_32779c6f, TypeCR>, Enc_864a5a {
 let Inst{2-0} = 0b000;
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
@@ -5662,45 +5698,45 @@ def J2_trap0 : HInst<
 (outs),
 (ins u8_0Imm:$Ii),
 "trap0(#$Ii)",
-tc_14cd4cfa, TypeJ>, Enc_a51a9a {
+tc_fc3999b4, TypeJ>, Enc_a51a9a {
 let Inst{1-0} = 0b00;
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0101010000000000;
-let hasSideEffects = 1;
 let isSolo = 1;
+let hasSideEffects = 1;
 }
 def J2_trap1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, u8_0Imm:$Ii),
 "trap1($Rx32,#$Ii)",
-tc_59a01ead, TypeJ>, Enc_33f8ba {
+tc_b9e09e03, TypeJ>, Enc_33f8ba {
 let Inst{1-0} = 0b00;
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01010100100;
 let hasNewValue = 1;
-let hasSideEffects = 1;
 let opNewValue = 0;
 let isSolo = 1;
 let Uses = [GOSP];
 let Defs = [GOSP, PC];
+let hasSideEffects = 1;
 let Constraints = "$Rx32 = $Rx32in";
 }
 def J2_trap1_noregmap : HInst<
 (outs),
 (ins u8_0Imm:$Ii),
 "trap1(#$Ii)",
-tc_59a01ead, TypeMAPPING> {
+tc_b9e09e03, TypeMAPPING> {
+let hasSideEffects = 1;
 let isPseudo = 1;
 let isCodeGenOnly = 1;
-let hasSideEffects = 1;
 }
 def J4_cmpeq_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (!cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -5726,7 +5762,7 @@ def J4_cmpeq_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (!cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -5752,7 +5788,7 @@ def J4_cmpeq_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b00;
 let Inst{31-22} = 0b0001010001;
@@ -5778,7 +5814,7 @@ def J4_cmpeq_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b10;
 let Inst{31-22} = 0b0001010001;
@@ -5804,7 +5840,7 @@ def J4_cmpeq_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-22} = 0b0001010001;
@@ -5830,7 +5866,7 @@ def J4_cmpeq_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b11;
 let Inst{31-22} = 0b0001010001;
@@ -5856,7 +5892,7 @@ def J4_cmpeq_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -5881,7 +5917,7 @@ def J4_cmpeq_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -5906,7 +5942,7 @@ def J4_cmpeq_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b00;
 let Inst{31-22} = 0b0001010000;
@@ -5931,7 +5967,7 @@ def J4_cmpeq_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b10;
 let Inst{31-22} = 0b0001010000;
@@ -5956,7 +5992,7 @@ def J4_cmpeq_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-22} = 0b0001010000;
@@ -5981,7 +6017,7 @@ def J4_cmpeq_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b11;
 let Inst{31-22} = 0b0001010000;
@@ -6006,7 +6042,7 @@ def J4_cmpeqi_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (!cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -6032,7 +6068,7 @@ def J4_cmpeqi_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (!cmp.eq($Ns8.new,#$II)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -6058,7 +6094,7 @@ def J4_cmpeqi_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001000001;
@@ -6084,7 +6120,7 @@ def J4_cmpeqi_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001000001;
@@ -6110,7 +6146,7 @@ def J4_cmpeqi_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001001001;
@@ -6136,7 +6172,7 @@ def J4_cmpeqi_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001001001;
@@ -6162,7 +6198,7 @@ def J4_cmpeqi_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -6187,7 +6223,7 @@ def J4_cmpeqi_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (cmp.eq($Ns8.new,#$II)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -6212,7 +6248,7 @@ def J4_cmpeqi_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001000000;
@@ -6237,7 +6273,7 @@ def J4_cmpeqi_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001000000;
@@ -6262,7 +6298,7 @@ def J4_cmpeqi_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001001000;
@@ -6287,7 +6323,7 @@ def J4_cmpeqi_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001001000;
@@ -6312,7 +6348,7 @@ def J4_cmpeqn1_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
 "if (!cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_e90a15, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_e90a15, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{19-19} = 0b0;
@@ -6338,7 +6374,7 @@ def J4_cmpeqn1_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
 "if (!cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_5a18b3, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_5a18b3, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{19-19} = 0b0;
@@ -6364,7 +6400,7 @@ def J4_cmpeqn1_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_1de724, PredRel {
+tc_3d495a39, TypeCJ>, Enc_1de724, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{31-22} = 0b0001000111;
@@ -6390,7 +6426,7 @@ def J4_cmpeqn1_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14640c, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14640c, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{31-22} = 0b0001000111;
@@ -6416,7 +6452,7 @@ def J4_cmpeqn1_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_668704, PredRel {
+tc_3d495a39, TypeCJ>, Enc_668704, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{31-22} = 0b0001001111;
@@ -6442,7 +6478,7 @@ def J4_cmpeqn1_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_800e04, PredRel {
+tc_3d495a39, TypeCJ>, Enc_800e04, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{31-22} = 0b0001001111;
@@ -6468,7 +6504,7 @@ def J4_cmpeqn1_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
 "if (cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_4aca3a, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_4aca3a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{19-19} = 0b0;
@@ -6493,7 +6529,7 @@ def J4_cmpeqn1_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
 "if (cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_f7ea77, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_f7ea77, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{19-19} = 0b0;
@@ -6518,7 +6554,7 @@ def J4_cmpeqn1_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_405228, PredRel {
+tc_3d495a39, TypeCJ>, Enc_405228, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{31-22} = 0b0001000110;
@@ -6543,7 +6579,7 @@ def J4_cmpeqn1_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_3a2484, PredRel {
+tc_3d495a39, TypeCJ>, Enc_3a2484, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{31-22} = 0b0001000110;
@@ -6568,7 +6604,7 @@ def J4_cmpeqn1_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_736575, PredRel {
+tc_3d495a39, TypeCJ>, Enc_736575, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{31-22} = 0b0001001110;
@@ -6593,7 +6629,7 @@ def J4_cmpeqn1_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_8e583a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_8e583a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{31-22} = 0b0001001110;
@@ -6618,7 +6654,7 @@ def J4_cmpgt_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (!cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -6644,7 +6680,7 @@ def J4_cmpgt_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (!cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -6670,7 +6706,7 @@ def J4_cmpgt_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b00;
 let Inst{31-22} = 0b0001010011;
@@ -6696,7 +6732,7 @@ def J4_cmpgt_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b10;
 let Inst{31-22} = 0b0001010011;
@@ -6722,7 +6758,7 @@ def J4_cmpgt_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-22} = 0b0001010011;
@@ -6748,7 +6784,7 @@ def J4_cmpgt_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b11;
 let Inst{31-22} = 0b0001010011;
@@ -6774,7 +6810,7 @@ def J4_cmpgt_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -6799,7 +6835,7 @@ def J4_cmpgt_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -6824,7 +6860,7 @@ def J4_cmpgt_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b00;
 let Inst{31-22} = 0b0001010010;
@@ -6849,7 +6885,7 @@ def J4_cmpgt_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b10;
 let Inst{31-22} = 0b0001010010;
@@ -6874,7 +6910,7 @@ def J4_cmpgt_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-22} = 0b0001010010;
@@ -6899,7 +6935,7 @@ def J4_cmpgt_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b11;
 let Inst{31-22} = 0b0001010010;
@@ -6924,7 +6960,7 @@ def J4_cmpgti_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (!cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -6950,7 +6986,7 @@ def J4_cmpgti_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (!cmp.gt($Ns8.new,#$II)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -6976,7 +7012,7 @@ def J4_cmpgti_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001000011;
@@ -7002,7 +7038,7 @@ def J4_cmpgti_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001000011;
@@ -7028,7 +7064,7 @@ def J4_cmpgti_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001001011;
@@ -7054,7 +7090,7 @@ def J4_cmpgti_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001001011;
@@ -7080,7 +7116,7 @@ def J4_cmpgti_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -7105,7 +7141,7 @@ def J4_cmpgti_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (cmp.gt($Ns8.new,#$II)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -7130,7 +7166,7 @@ def J4_cmpgti_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001000010;
@@ -7155,7 +7191,7 @@ def J4_cmpgti_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001000010;
@@ -7180,7 +7216,7 @@ def J4_cmpgti_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001001010;
@@ -7205,7 +7241,7 @@ def J4_cmpgti_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001001010;
@@ -7230,7 +7266,7 @@ def J4_cmpgtn1_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
 "if (!cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_3694bd, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_3694bd, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{19-19} = 0b0;
@@ -7256,7 +7292,7 @@ def J4_cmpgtn1_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
 "if (!cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_a6853f, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_a6853f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{19-19} = 0b0;
@@ -7282,7 +7318,7 @@ def J4_cmpgtn1_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_a42857, PredRel {
+tc_3d495a39, TypeCJ>, Enc_a42857, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000001;
 let Inst{31-22} = 0b0001000111;
@@ -7308,7 +7344,7 @@ def J4_cmpgtn1_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_f6fe0b, PredRel {
+tc_3d495a39, TypeCJ>, Enc_f6fe0b, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100001;
 let Inst{31-22} = 0b0001000111;
@@ -7334,7 +7370,7 @@ def J4_cmpgtn1_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_3e3989, PredRel {
+tc_3d495a39, TypeCJ>, Enc_3e3989, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000001;
 let Inst{31-22} = 0b0001001111;
@@ -7360,7 +7396,7 @@ def J4_cmpgtn1_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_b909d2, PredRel {
+tc_3d495a39, TypeCJ>, Enc_b909d2, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100001;
 let Inst{31-22} = 0b0001001111;
@@ -7386,7 +7422,7 @@ def J4_cmpgtn1_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
 "if (cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_f82302, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_f82302, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{19-19} = 0b0;
@@ -7411,7 +7447,7 @@ def J4_cmpgtn1_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
 "if (cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_6413b6, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_6413b6, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{19-19} = 0b0;
@@ -7436,7 +7472,7 @@ def J4_cmpgtn1_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_b78edd, PredRel {
+tc_3d495a39, TypeCJ>, Enc_b78edd, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000001;
 let Inst{31-22} = 0b0001000110;
@@ -7461,7 +7497,7 @@ def J4_cmpgtn1_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_041d7b, PredRel {
+tc_3d495a39, TypeCJ>, Enc_041d7b, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100001;
 let Inst{31-22} = 0b0001000110;
@@ -7486,7 +7522,7 @@ def J4_cmpgtn1_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_b1e1fb, PredRel {
+tc_3d495a39, TypeCJ>, Enc_b1e1fb, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000001;
 let Inst{31-22} = 0b0001001110;
@@ -7511,7 +7547,7 @@ def J4_cmpgtn1_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
 "p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_178717, PredRel {
+tc_3d495a39, TypeCJ>, Enc_178717, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100001;
 let Inst{31-22} = 0b0001001110;
@@ -7536,7 +7572,7 @@ def J4_cmpgtu_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (!cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -7562,7 +7598,7 @@ def J4_cmpgtu_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (!cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -7588,7 +7624,7 @@ def J4_cmpgtu_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b00;
 let Inst{31-22} = 0b0001010101;
@@ -7614,7 +7650,7 @@ def J4_cmpgtu_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b10;
 let Inst{31-22} = 0b0001010101;
@@ -7640,7 +7676,7 @@ def J4_cmpgtu_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-22} = 0b0001010101;
@@ -7666,7 +7702,7 @@ def J4_cmpgtu_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b11;
 let Inst{31-22} = 0b0001010101;
@@ -7692,7 +7728,7 @@ def J4_cmpgtu_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -7717,7 +7753,7 @@ def J4_cmpgtu_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
 "if (cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
-tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -7742,7 +7778,7 @@ def J4_cmpgtu_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b00;
 let Inst{31-22} = 0b0001010100;
@@ -7767,7 +7803,7 @@ def J4_cmpgtu_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b10;
 let Inst{31-22} = 0b0001010100;
@@ -7792,7 +7828,7 @@ def J4_cmpgtu_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-22} = 0b0001010100;
@@ -7817,7 +7853,7 @@ def J4_cmpgtu_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
 "p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:t $Ii",
-tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel {
+tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b11;
 let Inst{31-22} = 0b0001010100;
@@ -7842,7 +7878,7 @@ def J4_cmpgtui_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (!cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -7868,7 +7904,7 @@ def J4_cmpgtui_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (!cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -7894,7 +7930,7 @@ def J4_cmpgtui_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001000101;
@@ -7920,7 +7956,7 @@ def J4_cmpgtui_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001000101;
@@ -7946,7 +7982,7 @@ def J4_cmpgtui_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001001101;
@@ -7972,7 +8008,7 @@ def J4_cmpgtui_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001001101;
@@ -7998,7 +8034,7 @@ def J4_cmpgtui_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -8023,7 +8059,7 @@ def J4_cmpgtui_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
 "if (cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
-tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel {
+tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -8048,7 +8084,7 @@ def J4_cmpgtui_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001000100;
@@ -8073,7 +8109,7 @@ def J4_cmpgtui_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001000100;
@@ -8098,7 +8134,7 @@ def J4_cmpgtui_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:nt $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-22} = 0b0001001100;
@@ -8123,7 +8159,7 @@ def J4_cmpgtui_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
 "p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:t $Ii",
-tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel {
+tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-22} = 0b0001001100;
@@ -8148,7 +8184,7 @@ def J4_cmplt_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (!cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
-tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel {
+tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -8174,7 +8210,7 @@ def J4_cmplt_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (!cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
-tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel {
+tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -8200,7 +8236,7 @@ def J4_cmplt_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
-tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel {
+tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -8225,7 +8261,7 @@ def J4_cmplt_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
-tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel {
+tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -8250,7 +8286,7 @@ def J4_cmpltu_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (!cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
-tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel {
+tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -8276,7 +8312,7 @@ def J4_cmpltu_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (!cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
-tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel {
+tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -8302,7 +8338,7 @@ def J4_cmpltu_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
-tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel {
+tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{19-19} = 0b0;
@@ -8327,7 +8363,7 @@ def J4_cmpltu_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
-tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel {
+tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
 let Inst{0-0} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{19-19} = 0b0;
@@ -8352,7 +8388,7 @@ def J4_hintjumpr : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "hintjr($Rs32)",
-tc_9faf76ae, TypeJ>, Enc_ecbcc8 {
+tc_d5b7b0c1, TypeJ>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b01010010101;
 let isTerminator = 1;
@@ -8364,7 +8400,7 @@ def J4_jumpseti : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins u6_0Imm:$II, b30_2Imm:$Ii),
 "$Rd16 = #$II ; jump $Ii",
-tc_49eb22c8, TypeCJ>, Enc_9e4c3f {
+tc_0663f615, TypeCJ>, Enc_9e4c3f {
 let Inst{0-0} = 0b0;
 let Inst{31-22} = 0b0001011000;
 let hasNewValue = 1;
@@ -8384,7 +8420,7 @@ def J4_jumpsetr : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "$Rd16 = $Rs16 ; jump $Ii",
-tc_49eb22c8, TypeCJ>, Enc_66bce1 {
+tc_0663f615, TypeCJ>, Enc_66bce1 {
 let Inst{0-0} = 0b0;
 let Inst{13-12} = 0b00;
 let Inst{31-22} = 0b0001011100;
@@ -8405,7 +8441,7 @@ def J4_tstbit0_f_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (!tstbit($Ns8.new,#0)) jump:nt $Ii",
-tc_746baa8e, TypeNCJ>, Enc_69d63b {
+tc_8c945be0, TypeNCJ>, Enc_69d63b {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{19-19} = 0b0;
@@ -8430,7 +8466,7 @@ def J4_tstbit0_f_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (!tstbit($Ns8.new,#0)) jump:t $Ii",
-tc_746baa8e, TypeNCJ>, Enc_69d63b {
+tc_8c945be0, TypeNCJ>, Enc_69d63b {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{19-19} = 0b0;
@@ -8455,7 +8491,7 @@ def J4_tstbit0_fp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "p0 = tstbit($Rs16,#0); if (!p0.new) jump:nt $Ii",
-tc_3cb8ea06, TypeCJ>, Enc_ad1c74 {
+tc_2332b92e, TypeCJ>, Enc_ad1c74 {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000011;
 let Inst{31-22} = 0b0001000111;
@@ -8480,7 +8516,7 @@ def J4_tstbit0_fp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "p0 = tstbit($Rs16,#0); if (!p0.new) jump:t $Ii",
-tc_3cb8ea06, TypeCJ>, Enc_ad1c74 {
+tc_2332b92e, TypeCJ>, Enc_ad1c74 {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100011;
 let Inst{31-22} = 0b0001000111;
@@ -8505,7 +8541,7 @@ def J4_tstbit0_fp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "p1 = tstbit($Rs16,#0); if (!p1.new) jump:nt $Ii",
-tc_3cb8ea06, TypeCJ>, Enc_ad1c74 {
+tc_2332b92e, TypeCJ>, Enc_ad1c74 {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000011;
 let Inst{31-22} = 0b0001001111;
@@ -8530,7 +8566,7 @@ def J4_tstbit0_fp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "p1 = tstbit($Rs16,#0); if (!p1.new) jump:t $Ii",
-tc_3cb8ea06, TypeCJ>, Enc_ad1c74 {
+tc_2332b92e, TypeCJ>, Enc_ad1c74 {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100011;
 let Inst{31-22} = 0b0001001111;
@@ -8555,7 +8591,7 @@ def J4_tstbit0_t_jumpnv_nt : HInst<
 (outs),
 (ins IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (tstbit($Ns8.new,#0)) jump:nt $Ii",
-tc_746baa8e, TypeNCJ>, Enc_69d63b {
+tc_8c945be0, TypeNCJ>, Enc_69d63b {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000000;
 let Inst{19-19} = 0b0;
@@ -8579,7 +8615,7 @@ def J4_tstbit0_t_jumpnv_t : HInst<
 (outs),
 (ins IntRegs:$Ns8, b30_2Imm:$Ii),
 "if (tstbit($Ns8.new,#0)) jump:t $Ii",
-tc_746baa8e, TypeNCJ>, Enc_69d63b {
+tc_8c945be0, TypeNCJ>, Enc_69d63b {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100000;
 let Inst{19-19} = 0b0;
@@ -8603,7 +8639,7 @@ def J4_tstbit0_tp0_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "p0 = tstbit($Rs16,#0); if (p0.new) jump:nt $Ii",
-tc_3cb8ea06, TypeCJ>, Enc_ad1c74 {
+tc_2332b92e, TypeCJ>, Enc_ad1c74 {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000011;
 let Inst{31-22} = 0b0001000110;
@@ -8627,7 +8663,7 @@ def J4_tstbit0_tp0_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "p0 = tstbit($Rs16,#0); if (p0.new) jump:t $Ii",
-tc_3cb8ea06, TypeCJ>, Enc_ad1c74 {
+tc_2332b92e, TypeCJ>, Enc_ad1c74 {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100011;
 let Inst{31-22} = 0b0001000110;
@@ -8651,7 +8687,7 @@ def J4_tstbit0_tp1_jump_nt : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "p1 = tstbit($Rs16,#0); if (p1.new) jump:nt $Ii",
-tc_3cb8ea06, TypeCJ>, Enc_ad1c74 {
+tc_2332b92e, TypeCJ>, Enc_ad1c74 {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b000011;
 let Inst{31-22} = 0b0001001110;
@@ -8675,7 +8711,7 @@ def J4_tstbit0_tp1_jump_t : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
 "p1 = tstbit($Rs16,#0); if (p1.new) jump:t $Ii",
-tc_3cb8ea06, TypeCJ>, Enc_ad1c74 {
+tc_2332b92e, TypeCJ>, Enc_ad1c74 {
 let Inst{0-0} = 0b0;
 let Inst{13-8} = 0b100011;
 let Inst{31-22} = 0b0001001110;
@@ -8699,7 +8735,7 @@ def L2_deallocframe : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = deallocframe($Rs32):raw",
-tc_d1090e34, TypeLD>, Enc_3a3d62 {
+tc_15aa71c5, TypeLD>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10010000000;
 let accessSize = DoubleWordAccess;
@@ -8711,7 +8747,7 @@ def L2_loadalignb_io : HInst<
 (outs DoubleRegs:$Ryy32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Ryy32 = memb_fifo($Rs32+#$Ii)",
-tc_ef52ed71, TypeLD>, Enc_a27588 {
+tc_5ef37dc4, TypeLD>, Enc_a27588 {
 let Inst{24-21} = 0b0100;
 let Inst{31-27} = 0b10010;
 let addrMode = BaseImmOffset;
@@ -8728,9 +8764,10 @@ def L2_loadalignb_pbr : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Ryy32 = memb_fifo($Rx32++$Mu2:brev)",
-tc_bad2bcaf, TypeLD>, Enc_1f5d8f {
+tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011110100;
+let addrMode = PostInc;
 let accessSize = ByteAccess;
 let mayLoad = 1;
 let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
@@ -8739,7 +8776,7 @@ def L2_loadalignb_pci : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
 "$Ryy32 = memb_fifo($Rx32++#$Ii:circ($Mu2))",
-tc_03220ffa, TypeLD>, Enc_74aef2 {
+tc_785f65a7, TypeLD>, Enc_74aef2 {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011000100;
 let addrMode = PostInc;
@@ -8752,7 +8789,7 @@ def L2_loadalignb_pcr : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Ryy32 = memb_fifo($Rx32++I:circ($Mu2))",
-tc_bad2bcaf, TypeLD>, Enc_1f5d8f {
+tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011000100;
 let addrMode = PostInc;
@@ -8765,7 +8802,7 @@ def L2_loadalignb_pi : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "$Ryy32 = memb_fifo($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_6b197f {
+tc_3c76b0ff, TypeLD>, Enc_6b197f {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011010100;
 let addrMode = PostInc;
@@ -8777,7 +8814,7 @@ def L2_loadalignb_pr : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Ryy32 = memb_fifo($Rx32++$Mu2)",
-tc_bad2bcaf, TypeLD>, Enc_1f5d8f {
+tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011100100;
 let addrMode = PostInc;
@@ -8789,7 +8826,7 @@ def L2_loadalignb_zomap : HInst<
 (outs DoubleRegs:$Ryy32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
 "$Ryy32 = memb_fifo($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let Constraints = "$Ryy32 = $Ryy32in";
@@ -8798,7 +8835,7 @@ def L2_loadalignh_io : HInst<
 (outs DoubleRegs:$Ryy32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s31_1Imm:$Ii),
 "$Ryy32 = memh_fifo($Rs32+#$Ii)",
-tc_ef52ed71, TypeLD>, Enc_5cd7e9 {
+tc_5ef37dc4, TypeLD>, Enc_5cd7e9 {
 let Inst{24-21} = 0b0010;
 let Inst{31-27} = 0b10010;
 let addrMode = BaseImmOffset;
@@ -8815,9 +8852,10 @@ def L2_loadalignh_pbr : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Ryy32 = memh_fifo($Rx32++$Mu2:brev)",
-tc_bad2bcaf, TypeLD>, Enc_1f5d8f {
+tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011110010;
+let addrMode = PostInc;
 let accessSize = HalfWordAccess;
 let mayLoad = 1;
 let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
@@ -8826,7 +8864,7 @@ def L2_loadalignh_pci : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
 "$Ryy32 = memh_fifo($Rx32++#$Ii:circ($Mu2))",
-tc_03220ffa, TypeLD>, Enc_9e2e1c {
+tc_785f65a7, TypeLD>, Enc_9e2e1c {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011000010;
 let addrMode = PostInc;
@@ -8839,7 +8877,7 @@ def L2_loadalignh_pcr : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Ryy32 = memh_fifo($Rx32++I:circ($Mu2))",
-tc_bad2bcaf, TypeLD>, Enc_1f5d8f {
+tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011000010;
 let addrMode = PostInc;
@@ -8852,7 +8890,7 @@ def L2_loadalignh_pi : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "$Ryy32 = memh_fifo($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_bd1cbc {
+tc_3c76b0ff, TypeLD>, Enc_bd1cbc {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011010010;
 let addrMode = PostInc;
@@ -8864,7 +8902,7 @@ def L2_loadalignh_pr : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Ryy32 = memh_fifo($Rx32++$Mu2)",
-tc_bad2bcaf, TypeLD>, Enc_1f5d8f {
+tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011100010;
 let addrMode = PostInc;
@@ -8876,7 +8914,7 @@ def L2_loadalignh_zomap : HInst<
 (outs DoubleRegs:$Ryy32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
 "$Ryy32 = memh_fifo($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let Constraints = "$Ryy32 = $Ryy32in";
@@ -8885,7 +8923,7 @@ def L2_loadbsw2_io : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s31_1Imm:$Ii),
 "$Rd32 = membh($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_de0214 {
+tc_17e0d2cd, TypeLD>, Enc_de0214 {
 let Inst{24-21} = 0b0001;
 let Inst{31-27} = 0b10010;
 let hasNewValue = 1;
@@ -8903,11 +8941,12 @@ def L2_loadbsw2_pbr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = membh($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011110001;
 let hasNewValue = 1;
 let opNewValue = 0;
+let addrMode = PostInc;
 let accessSize = HalfWordAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -8916,7 +8955,7 @@ def L2_loadbsw2_pci : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
 "$Rd32 = membh($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_e83554 {
+tc_e93a3d71, TypeLD>, Enc_e83554 {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011000001;
 let hasNewValue = 1;
@@ -8931,7 +8970,7 @@ def L2_loadbsw2_pcr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = membh($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011000001;
 let hasNewValue = 1;
@@ -8946,7 +8985,7 @@ def L2_loadbsw2_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii),
 "$Rd32 = membh($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_152467 {
+tc_44d3da28, TypeLD>, Enc_152467 {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011010001;
 let hasNewValue = 1;
@@ -8960,7 +8999,7 @@ def L2_loadbsw2_pr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = membh($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011100001;
 let hasNewValue = 1;
@@ -8974,7 +9013,7 @@ def L2_loadbsw2_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = membh($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -8984,7 +9023,7 @@ def L2_loadbsw4_io : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, s30_2Imm:$Ii),
 "$Rdd32 = membh($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_2d7491 {
+tc_17e0d2cd, TypeLD>, Enc_2d7491 {
 let Inst{24-21} = 0b0111;
 let Inst{31-27} = 0b10010;
 let addrMode = BaseImmOffset;
@@ -9000,9 +9039,10 @@ def L2_loadbsw4_pbr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = membh($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011110111;
+let addrMode = PostInc;
 let accessSize = WordAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9011,7 +9051,7 @@ def L2_loadbsw4_pci : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
 "$Rdd32 = membh($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_70b24b {
+tc_e93a3d71, TypeLD>, Enc_70b24b {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011000111;
 let addrMode = PostInc;
@@ -9024,7 +9064,7 @@ def L2_loadbsw4_pcr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = membh($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011000111;
 let addrMode = PostInc;
@@ -9037,7 +9077,7 @@ def L2_loadbsw4_pi : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii),
 "$Rdd32 = membh($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_71f1b4 {
+tc_44d3da28, TypeLD>, Enc_71f1b4 {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011010111;
 let addrMode = PostInc;
@@ -9049,7 +9089,7 @@ def L2_loadbsw4_pr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = membh($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011100111;
 let addrMode = PostInc;
@@ -9061,7 +9101,7 @@ def L2_loadbsw4_zomap : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = membh($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -9069,7 +9109,7 @@ def L2_loadbzw2_io : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s31_1Imm:$Ii),
 "$Rd32 = memubh($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_de0214 {
+tc_17e0d2cd, TypeLD>, Enc_de0214 {
 let Inst{24-21} = 0b0011;
 let Inst{31-27} = 0b10010;
 let hasNewValue = 1;
@@ -9087,11 +9127,12 @@ def L2_loadbzw2_pbr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memubh($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011110011;
 let hasNewValue = 1;
 let opNewValue = 0;
+let addrMode = PostInc;
 let accessSize = HalfWordAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9100,7 +9141,7 @@ def L2_loadbzw2_pci : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
 "$Rd32 = memubh($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_e83554 {
+tc_e93a3d71, TypeLD>, Enc_e83554 {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011000011;
 let hasNewValue = 1;
@@ -9115,7 +9156,7 @@ def L2_loadbzw2_pcr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memubh($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011000011;
 let hasNewValue = 1;
@@ -9130,7 +9171,7 @@ def L2_loadbzw2_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii),
 "$Rd32 = memubh($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_152467 {
+tc_44d3da28, TypeLD>, Enc_152467 {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011010011;
 let hasNewValue = 1;
@@ -9144,7 +9185,7 @@ def L2_loadbzw2_pr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memubh($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011100011;
 let hasNewValue = 1;
@@ -9158,7 +9199,7 @@ def L2_loadbzw2_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = memubh($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -9168,7 +9209,7 @@ def L2_loadbzw4_io : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, s30_2Imm:$Ii),
 "$Rdd32 = memubh($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_2d7491 {
+tc_17e0d2cd, TypeLD>, Enc_2d7491 {
 let Inst{24-21} = 0b0101;
 let Inst{31-27} = 0b10010;
 let addrMode = BaseImmOffset;
@@ -9184,9 +9225,10 @@ def L2_loadbzw4_pbr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = memubh($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011110101;
+let addrMode = PostInc;
 let accessSize = WordAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9195,7 +9237,7 @@ def L2_loadbzw4_pci : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
 "$Rdd32 = memubh($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_70b24b {
+tc_e93a3d71, TypeLD>, Enc_70b24b {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011000101;
 let addrMode = PostInc;
@@ -9208,7 +9250,7 @@ def L2_loadbzw4_pcr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = memubh($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011000101;
 let addrMode = PostInc;
@@ -9221,7 +9263,7 @@ def L2_loadbzw4_pi : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii),
 "$Rdd32 = memubh($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_71f1b4 {
+tc_44d3da28, TypeLD>, Enc_71f1b4 {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011010101;
 let addrMode = PostInc;
@@ -9233,7 +9275,7 @@ def L2_loadbzw4_pr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = memubh($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011100101;
 let addrMode = PostInc;
@@ -9245,7 +9287,7 @@ def L2_loadbzw4_zomap : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = memubh($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -9253,7 +9295,7 @@ def L2_loadrb_io : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rd32 = memb($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm {
+tc_17e0d2cd, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1000;
 let Inst{31-27} = 0b10010;
 let hasNewValue = 1;
@@ -9274,11 +9316,12 @@ def L2_loadrb_pbr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memb($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011111000;
 let hasNewValue = 1;
 let opNewValue = 0;
+let addrMode = PostInc;
 let accessSize = ByteAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9287,7 +9330,7 @@ def L2_loadrb_pci : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
 "$Rd32 = memb($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_e0a47a {
+tc_e93a3d71, TypeLD>, Enc_e0a47a {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011001000;
 let hasNewValue = 1;
@@ -9302,7 +9345,7 @@ def L2_loadrb_pcr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memb($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011001000;
 let hasNewValue = 1;
@@ -9317,7 +9360,7 @@ def L2_loadrb_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_0Imm:$Ii),
 "$Rd32 = memb($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm {
+tc_44d3da28, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011011000;
 let hasNewValue = 1;
@@ -9334,7 +9377,7 @@ def L2_loadrb_pr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memb($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011101000;
 let hasNewValue = 1;
@@ -9348,7 +9391,7 @@ def L2_loadrb_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = memb($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -9358,7 +9401,7 @@ def L2_loadrbgp : HInst<
 (outs IntRegs:$Rd32),
 (ins u32_0Imm:$Ii),
 "$Rd32 = memb(gp+#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel {
 let Inst{24-21} = 0b1000;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -9377,7 +9420,7 @@ def L2_loadrd_io : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, s29_3Imm:$Ii),
 "$Rdd32 = memd($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_fa3ba4, AddrModeRel, PostInc_BaseImm {
+tc_17e0d2cd, TypeLD>, Enc_fa3ba4, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1110;
 let Inst{31-27} = 0b10010;
 let addrMode = BaseImmOffset;
@@ -9396,9 +9439,10 @@ def L2_loadrd_pbr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = memd($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011111110;
+let addrMode = PostInc;
 let accessSize = DoubleWordAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9407,7 +9451,7 @@ def L2_loadrd_pci : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2),
 "$Rdd32 = memd($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_b05839 {
+tc_e93a3d71, TypeLD>, Enc_b05839 {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011001110;
 let addrMode = PostInc;
@@ -9420,7 +9464,7 @@ def L2_loadrd_pcr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = memd($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011001110;
 let addrMode = PostInc;
@@ -9433,7 +9477,7 @@ def L2_loadrd_pi : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_3Imm:$Ii),
 "$Rdd32 = memd($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_5bdd42, PredNewRel, PostInc_BaseImm {
+tc_44d3da28, TypeLD>, Enc_5bdd42, PredNewRel, PostInc_BaseImm {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011011110;
 let addrMode = PostInc;
@@ -9448,7 +9492,7 @@ def L2_loadrd_pr : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rdd32 = memd($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_7eee72 {
+tc_44d3da28, TypeLD>, Enc_7eee72 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011101110;
 let addrMode = PostInc;
@@ -9460,7 +9504,7 @@ def L2_loadrd_zomap : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = memd($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -9468,7 +9512,7 @@ def L2_loadrdgp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins u29_3Imm:$Ii),
 "$Rdd32 = memd(gp+#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_509701, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_509701, AddrModeRel {
 let Inst{24-21} = 0b1110;
 let Inst{31-27} = 0b01001;
 let accessSize = DoubleWordAccess;
@@ -9485,7 +9529,7 @@ def L2_loadrh_io : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s31_1Imm:$Ii),
 "$Rd32 = memh($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm {
+tc_17e0d2cd, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1010;
 let Inst{31-27} = 0b10010;
 let hasNewValue = 1;
@@ -9506,11 +9550,12 @@ def L2_loadrh_pbr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memh($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011111010;
 let hasNewValue = 1;
 let opNewValue = 0;
+let addrMode = PostInc;
 let accessSize = HalfWordAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9519,7 +9564,7 @@ def L2_loadrh_pci : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
 "$Rd32 = memh($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_e83554 {
+tc_e93a3d71, TypeLD>, Enc_e83554 {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011001010;
 let hasNewValue = 1;
@@ -9534,7 +9579,7 @@ def L2_loadrh_pcr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memh($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011001010;
 let hasNewValue = 1;
@@ -9549,7 +9594,7 @@ def L2_loadrh_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii),
 "$Rd32 = memh($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm {
+tc_44d3da28, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011011010;
 let hasNewValue = 1;
@@ -9566,7 +9611,7 @@ def L2_loadrh_pr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memh($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011101010;
 let hasNewValue = 1;
@@ -9580,7 +9625,7 @@ def L2_loadrh_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = memh($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -9590,7 +9635,7 @@ def L2_loadrhgp : HInst<
 (outs IntRegs:$Rd32),
 (ins u31_1Imm:$Ii),
 "$Rd32 = memh(gp+#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel {
 let Inst{24-21} = 0b1010;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -9609,7 +9654,7 @@ def L2_loadri_io : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s30_2Imm:$Ii),
 "$Rd32 = memw($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_2a3787, AddrModeRel, PostInc_BaseImm {
+tc_17e0d2cd, TypeLD>, Enc_2a3787, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1100;
 let Inst{31-27} = 0b10010;
 let hasNewValue = 1;
@@ -9630,11 +9675,12 @@ def L2_loadri_pbr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memw($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011111100;
 let hasNewValue = 1;
 let opNewValue = 0;
+let addrMode = PostInc;
 let accessSize = WordAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9643,7 +9689,7 @@ def L2_loadri_pci : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
 "$Rd32 = memw($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_27fd0e {
+tc_e93a3d71, TypeLD>, Enc_27fd0e {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011001100;
 let hasNewValue = 1;
@@ -9658,7 +9704,7 @@ def L2_loadri_pcr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memw($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011001100;
 let hasNewValue = 1;
@@ -9673,7 +9719,7 @@ def L2_loadri_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii),
 "$Rd32 = memw($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_3d920a, PredNewRel, PostInc_BaseImm {
+tc_44d3da28, TypeLD>, Enc_3d920a, PredNewRel, PostInc_BaseImm {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011011100;
 let hasNewValue = 1;
@@ -9690,7 +9736,7 @@ def L2_loadri_pr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memw($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011101100;
 let hasNewValue = 1;
@@ -9704,7 +9750,7 @@ def L2_loadri_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = memw($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -9714,7 +9760,7 @@ def L2_loadrigp : HInst<
 (outs IntRegs:$Rd32),
 (ins u30_2Imm:$Ii),
 "$Rd32 = memw(gp+#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
 let Inst{24-21} = 0b1100;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -9733,7 +9779,7 @@ def L2_loadrub_io : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rd32 = memub($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm {
+tc_17e0d2cd, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1001;
 let Inst{31-27} = 0b10010;
 let hasNewValue = 1;
@@ -9754,11 +9800,12 @@ def L2_loadrub_pbr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memub($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011111001;
 let hasNewValue = 1;
 let opNewValue = 0;
+let addrMode = PostInc;
 let accessSize = ByteAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9767,7 +9814,7 @@ def L2_loadrub_pci : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
 "$Rd32 = memub($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_e0a47a {
+tc_e93a3d71, TypeLD>, Enc_e0a47a {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011001001;
 let hasNewValue = 1;
@@ -9782,7 +9829,7 @@ def L2_loadrub_pcr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memub($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011001001;
 let hasNewValue = 1;
@@ -9797,7 +9844,7 @@ def L2_loadrub_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_0Imm:$Ii),
 "$Rd32 = memub($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm {
+tc_44d3da28, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011011001;
 let hasNewValue = 1;
@@ -9814,7 +9861,7 @@ def L2_loadrub_pr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memub($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011101001;
 let hasNewValue = 1;
@@ -9828,7 +9875,7 @@ def L2_loadrub_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = memub($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -9838,7 +9885,7 @@ def L2_loadrubgp : HInst<
 (outs IntRegs:$Rd32),
 (ins u32_0Imm:$Ii),
 "$Rd32 = memub(gp+#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel {
 let Inst{24-21} = 0b1001;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -9857,7 +9904,7 @@ def L2_loadruh_io : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s31_1Imm:$Ii),
 "$Rd32 = memuh($Rs32+#$Ii)",
-tc_7f881c76, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm {
+tc_17e0d2cd, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1011;
 let Inst{31-27} = 0b10010;
 let hasNewValue = 1;
@@ -9878,11 +9925,12 @@ def L2_loadruh_pbr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memuh($Rx32++$Mu2:brev)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011111011;
 let hasNewValue = 1;
 let opNewValue = 0;
+let addrMode = PostInc;
 let accessSize = HalfWordAccess;
 let mayLoad = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -9891,7 +9939,7 @@ def L2_loadruh_pci : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
 "$Rd32 = memuh($Rx32++#$Ii:circ($Mu2))",
-tc_4403ca65, TypeLD>, Enc_e83554 {
+tc_e93a3d71, TypeLD>, Enc_e83554 {
 let Inst{12-9} = 0b0000;
 let Inst{31-21} = 0b10011001011;
 let hasNewValue = 1;
@@ -9906,7 +9954,7 @@ def L2_loadruh_pcr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memuh($Rx32++I:circ($Mu2))",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b10011001011;
 let hasNewValue = 1;
@@ -9921,7 +9969,7 @@ def L2_loadruh_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii),
 "$Rd32 = memuh($Rx32++#$Ii)",
-tc_2fc0c436, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm {
+tc_44d3da28, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm {
 let Inst{13-9} = 0b00000;
 let Inst{31-21} = 0b10011011011;
 let hasNewValue = 1;
@@ -9938,7 +9986,7 @@ def L2_loadruh_pr : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Rd32 = memuh($Rx32++$Mu2)",
-tc_2fc0c436, TypeLD>, Enc_74d4e5 {
+tc_44d3da28, TypeLD>, Enc_74d4e5 {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b10011101011;
 let hasNewValue = 1;
@@ -9952,7 +10000,7 @@ def L2_loadruh_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = memuh($Rs32)",
-tc_7f881c76, TypeMAPPING> {
+tc_17e0d2cd, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -9962,7 +10010,7 @@ def L2_loadruhgp : HInst<
 (outs IntRegs:$Rd32),
 (ins u31_1Imm:$Ii),
 "$Rd32 = memuh(gp+#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel {
 let Inst{24-21} = 0b1011;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -9981,7 +10029,7 @@ def L2_loadw_locked : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = memw_locked($Rs32)",
-tc_6aa5711a, TypeLD>, Enc_5e2823 {
+tc_b43e7930, TypeLD>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10010010000;
 let hasNewValue = 1;
@@ -9994,7 +10042,7 @@ def L2_ploadrbf_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memb($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000101000;
 let isPredicated = 1;
@@ -10016,7 +10064,7 @@ def L2_ploadrbf_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memb($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_f4413a, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel {
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011011000;
 let isPredicated = 1;
@@ -10033,7 +10081,7 @@ def L2_ploadrbf_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4) $Rd32 = memb($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10043,7 +10091,7 @@ def L2_ploadrbfnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000111000;
 let isPredicated = 1;
@@ -10066,7 +10114,7 @@ def L2_ploadrbfnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_f4413a, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel {
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011011000;
 let isPredicated = 1;
@@ -10084,7 +10132,7 @@ def L2_ploadrbfnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4.new) $Rd32 = memb($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10094,7 +10142,7 @@ def L2_ploadrbt_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memb($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000001000;
 let isPredicated = 1;
@@ -10115,7 +10163,7 @@ def L2_ploadrbt_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memb($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_f4413a, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel {
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011011000;
 let isPredicated = 1;
@@ -10131,7 +10179,7 @@ def L2_ploadrbt_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4) $Rd32 = memb($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10141,7 +10189,7 @@ def L2_ploadrbtnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000011000;
 let isPredicated = 1;
@@ -10163,7 +10211,7 @@ def L2_ploadrbtnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_f4413a, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel {
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011011000;
 let isPredicated = 1;
@@ -10180,7 +10228,7 @@ def L2_ploadrbtnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4.new) $Rd32 = memb($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10190,7 +10238,7 @@ def L2_ploadrdf_io : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
 "if (!$Pt4) $Rdd32 = memd($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000101110;
 let isPredicated = 1;
@@ -10210,7 +10258,7 @@ def L2_ploadrdf_pi : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
 "if (!$Pt4) $Rdd32 = memd($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_9d1247, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_9d1247, PredNewRel {
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011011110;
 let isPredicated = 1;
@@ -10225,7 +10273,7 @@ def L2_ploadrdf_zomap : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4) $Rdd32 = memd($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -10233,7 +10281,7 @@ def L2_ploadrdfnew_io : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
 "if (!$Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000111110;
 let isPredicated = 1;
@@ -10254,7 +10302,7 @@ def L2_ploadrdfnew_pi : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
 "if (!$Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_9d1247, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_9d1247, PredNewRel {
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011011110;
 let isPredicated = 1;
@@ -10270,7 +10318,7 @@ def L2_ploadrdfnew_zomap : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4.new) $Rdd32 = memd($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -10278,7 +10326,7 @@ def L2_ploadrdt_io : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
 "if ($Pt4) $Rdd32 = memd($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000001110;
 let isPredicated = 1;
@@ -10297,7 +10345,7 @@ def L2_ploadrdt_pi : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
 "if ($Pt4) $Rdd32 = memd($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_9d1247, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_9d1247, PredNewRel {
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011011110;
 let isPredicated = 1;
@@ -10311,7 +10359,7 @@ def L2_ploadrdt_zomap : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4) $Rdd32 = memd($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -10319,7 +10367,7 @@ def L2_ploadrdtnew_io : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
 "if ($Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000011110;
 let isPredicated = 1;
@@ -10339,7 +10387,7 @@ def L2_ploadrdtnew_pi : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
 "if ($Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_9d1247, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_9d1247, PredNewRel {
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011011110;
 let isPredicated = 1;
@@ -10354,7 +10402,7 @@ def L2_ploadrdtnew_zomap : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4.new) $Rdd32 = memd($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -10362,7 +10410,7 @@ def L2_ploadrhf_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
 "if (!$Pt4) $Rd32 = memh($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000101010;
 let isPredicated = 1;
@@ -10384,7 +10432,7 @@ def L2_ploadrhf_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "if (!$Pt4) $Rd32 = memh($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_733b27, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel {
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011011010;
 let isPredicated = 1;
@@ -10401,7 +10449,7 @@ def L2_ploadrhf_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4) $Rd32 = memh($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10411,7 +10459,7 @@ def L2_ploadrhfnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000111010;
 let isPredicated = 1;
@@ -10434,7 +10482,7 @@ def L2_ploadrhfnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_733b27, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel {
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011011010;
 let isPredicated = 1;
@@ -10452,7 +10500,7 @@ def L2_ploadrhfnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4.new) $Rd32 = memh($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10462,7 +10510,7 @@ def L2_ploadrht_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
 "if ($Pt4) $Rd32 = memh($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000001010;
 let isPredicated = 1;
@@ -10483,7 +10531,7 @@ def L2_ploadrht_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "if ($Pt4) $Rd32 = memh($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_733b27, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel {
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011011010;
 let isPredicated = 1;
@@ -10499,7 +10547,7 @@ def L2_ploadrht_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4) $Rd32 = memh($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10509,7 +10557,7 @@ def L2_ploadrhtnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000011010;
 let isPredicated = 1;
@@ -10531,7 +10579,7 @@ def L2_ploadrhtnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_733b27, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel {
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011011010;
 let isPredicated = 1;
@@ -10548,7 +10596,7 @@ def L2_ploadrhtnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4.new) $Rd32 = memh($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10558,7 +10606,7 @@ def L2_ploadrif_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
 "if (!$Pt4) $Rd32 = memw($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000101100;
 let isPredicated = 1;
@@ -10580,7 +10628,7 @@ def L2_ploadrif_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
 "if (!$Pt4) $Rd32 = memw($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_b97f71, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_b97f71, PredNewRel {
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011011100;
 let isPredicated = 1;
@@ -10597,7 +10645,7 @@ def L2_ploadrif_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4) $Rd32 = memw($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10607,7 +10655,7 @@ def L2_ploadrifnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000111100;
 let isPredicated = 1;
@@ -10630,7 +10678,7 @@ def L2_ploadrifnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_b97f71, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_b97f71, PredNewRel {
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011011100;
 let isPredicated = 1;
@@ -10648,7 +10696,7 @@ def L2_ploadrifnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4.new) $Rd32 = memw($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10658,7 +10706,7 @@ def L2_ploadrit_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
 "if ($Pt4) $Rd32 = memw($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000001100;
 let isPredicated = 1;
@@ -10679,7 +10727,7 @@ def L2_ploadrit_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
 "if ($Pt4) $Rd32 = memw($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_b97f71, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_b97f71, PredNewRel {
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011011100;
 let isPredicated = 1;
@@ -10695,7 +10743,7 @@ def L2_ploadrit_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4) $Rd32 = memw($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10705,7 +10753,7 @@ def L2_ploadritnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000011100;
 let isPredicated = 1;
@@ -10727,7 +10775,7 @@ def L2_ploadritnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_b97f71, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_b97f71, PredNewRel {
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011011100;
 let isPredicated = 1;
@@ -10744,7 +10792,7 @@ def L2_ploadritnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4.new) $Rd32 = memw($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10754,7 +10802,7 @@ def L2_ploadrubf_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memub($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000101001;
 let isPredicated = 1;
@@ -10776,7 +10824,7 @@ def L2_ploadrubf_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memub($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_f4413a, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel {
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011011001;
 let isPredicated = 1;
@@ -10793,7 +10841,7 @@ def L2_ploadrubf_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4) $Rd32 = memub($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10803,7 +10851,7 @@ def L2_ploadrubfnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000111001;
 let isPredicated = 1;
@@ -10826,7 +10874,7 @@ def L2_ploadrubfnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_f4413a, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel {
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011011001;
 let isPredicated = 1;
@@ -10844,7 +10892,7 @@ def L2_ploadrubfnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4.new) $Rd32 = memub($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10854,7 +10902,7 @@ def L2_ploadrubt_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memub($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000001001;
 let isPredicated = 1;
@@ -10875,7 +10923,7 @@ def L2_ploadrubt_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memub($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_f4413a, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel {
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011011001;
 let isPredicated = 1;
@@ -10891,7 +10939,7 @@ def L2_ploadrubt_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4) $Rd32 = memub($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10901,7 +10949,7 @@ def L2_ploadrubtnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000011001;
 let isPredicated = 1;
@@ -10923,7 +10971,7 @@ def L2_ploadrubtnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_f4413a, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel {
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011011001;
 let isPredicated = 1;
@@ -10940,7 +10988,7 @@ def L2_ploadrubtnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4.new) $Rd32 = memub($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10950,7 +10998,7 @@ def L2_ploadruhf_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
 "if (!$Pt4) $Rd32 = memuh($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000101011;
 let isPredicated = 1;
@@ -10972,7 +11020,7 @@ def L2_ploadruhf_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "if (!$Pt4) $Rd32 = memuh($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_733b27, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel {
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011011011;
 let isPredicated = 1;
@@ -10989,7 +11037,7 @@ def L2_ploadruhf_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4) $Rd32 = memuh($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -10999,7 +11047,7 @@ def L2_ploadruhfnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000111011;
 let isPredicated = 1;
@@ -11022,7 +11070,7 @@ def L2_ploadruhfnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_733b27, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel {
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011011011;
 let isPredicated = 1;
@@ -11040,7 +11088,7 @@ def L2_ploadruhfnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if (!$Pt4.new) $Rd32 = memuh($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -11050,7 +11098,7 @@ def L2_ploadruht_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
 "if ($Pt4) $Rd32 = memuh($Rs32+#$Ii)",
-tc_ef52ed71, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000001011;
 let isPredicated = 1;
@@ -11071,7 +11119,7 @@ def L2_ploadruht_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "if ($Pt4) $Rd32 = memuh($Rx32++#$Ii)",
-tc_bad2bcaf, TypeLD>, Enc_733b27, PredNewRel {
+tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel {
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011011011;
 let isPredicated = 1;
@@ -11087,7 +11135,7 @@ def L2_ploadruht_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4) $Rd32 = memuh($Rs32)",
-tc_ef52ed71, TypeMAPPING> {
+tc_5ef37dc4, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -11097,7 +11145,7 @@ def L2_ploadruhtnew_io : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
-tc_2fc0c436, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b01000011011;
 let isPredicated = 1;
@@ -11119,7 +11167,7 @@ def L2_ploadruhtnew_pi : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Rx32),
 (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
-tc_63fe3df7, TypeLD>, Enc_733b27, PredNewRel {
+tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel {
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011011011;
 let isPredicated = 1;
@@ -11136,7 +11184,7 @@ def L2_ploadruhtnew_zomap : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, IntRegs:$Rs32),
 "if ($Pt4.new) $Rd32 = memuh($Rs32)",
-tc_2fc0c436, TypeMAPPING> {
+tc_44d3da28, TypeMAPPING> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -11146,7 +11194,7 @@ def L4_add_memopb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
 "memb($Rs32+#$Ii) += $Rt32",
-tc_44126683, TypeV4LDST>, Enc_d44e31 {
+tc_7186d325, TypeV4LDST>, Enc_d44e31 {
 let Inst{6-5} = 0b00;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110000;
@@ -11165,7 +11213,7 @@ def L4_add_memopb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memb($Rs32) += $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11173,7 +11221,7 @@ def L4_add_memoph_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "memh($Rs32+#$Ii) += $Rt32",
-tc_44126683, TypeV4LDST>, Enc_163a3c {
+tc_7186d325, TypeV4LDST>, Enc_163a3c {
 let Inst{6-5} = 0b00;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110001;
@@ -11192,7 +11240,7 @@ def L4_add_memoph_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memh($Rs32) += $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11200,7 +11248,7 @@ def L4_add_memopw_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
 "memw($Rs32+#$Ii) += $Rt32",
-tc_44126683, TypeV4LDST>, Enc_226535 {
+tc_7186d325, TypeV4LDST>, Enc_226535 {
 let Inst{6-5} = 0b00;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110010;
@@ -11219,7 +11267,7 @@ def L4_add_memopw_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memw($Rs32) += $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11227,7 +11275,7 @@ def L4_and_memopb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
 "memb($Rs32+#$Ii) &= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_d44e31 {
+tc_7186d325, TypeV4LDST>, Enc_d44e31 {
 let Inst{6-5} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110000;
@@ -11246,7 +11294,7 @@ def L4_and_memopb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memb($Rs32) &= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11254,7 +11302,7 @@ def L4_and_memoph_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "memh($Rs32+#$Ii) &= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_163a3c {
+tc_7186d325, TypeV4LDST>, Enc_163a3c {
 let Inst{6-5} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110001;
@@ -11273,7 +11321,7 @@ def L4_and_memoph_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memh($Rs32) &= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11281,7 +11329,7 @@ def L4_and_memopw_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
 "memw($Rs32+#$Ii) &= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_226535 {
+tc_7186d325, TypeV4LDST>, Enc_226535 {
 let Inst{6-5} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110010;
@@ -11300,7 +11348,7 @@ def L4_and_memopw_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memw($Rs32) &= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11308,7 +11356,7 @@ def L4_iadd_memopb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
 "memb($Rs32+#$Ii) += #$II",
-tc_44126683, TypeV4LDST>, Enc_46c951 {
+tc_096199d3, TypeV4LDST>, Enc_46c951 {
 let Inst{6-5} = 0b00;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111000;
@@ -11327,7 +11375,7 @@ def L4_iadd_memopb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memb($Rs32) += #$II",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11335,7 +11383,7 @@ def L4_iadd_memoph_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
 "memh($Rs32+#$Ii) += #$II",
-tc_44126683, TypeV4LDST>, Enc_e66a97 {
+tc_096199d3, TypeV4LDST>, Enc_e66a97 {
 let Inst{6-5} = 0b00;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111001;
@@ -11354,7 +11402,7 @@ def L4_iadd_memoph_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memh($Rs32) += #$II",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11362,7 +11410,7 @@ def L4_iadd_memopw_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
 "memw($Rs32+#$Ii) += #$II",
-tc_44126683, TypeV4LDST>, Enc_84b2cd {
+tc_096199d3, TypeV4LDST>, Enc_84b2cd {
 let Inst{6-5} = 0b00;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111010;
@@ -11381,7 +11429,7 @@ def L4_iadd_memopw_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memw($Rs32) += #$II",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11389,7 +11437,7 @@ def L4_iand_memopb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
 "memb($Rs32+#$Ii) = clrbit(#$II)",
-tc_44126683, TypeV4LDST>, Enc_46c951 {
+tc_096199d3, TypeV4LDST>, Enc_46c951 {
 let Inst{6-5} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111000;
@@ -11408,7 +11456,7 @@ def L4_iand_memopb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memb($Rs32) = clrbit(#$II)",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11416,7 +11464,7 @@ def L4_iand_memoph_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
 "memh($Rs32+#$Ii) = clrbit(#$II)",
-tc_44126683, TypeV4LDST>, Enc_e66a97 {
+tc_096199d3, TypeV4LDST>, Enc_e66a97 {
 let Inst{6-5} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111001;
@@ -11435,7 +11483,7 @@ def L4_iand_memoph_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memh($Rs32) = clrbit(#$II)",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11443,7 +11491,7 @@ def L4_iand_memopw_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
 "memw($Rs32+#$Ii) = clrbit(#$II)",
-tc_44126683, TypeV4LDST>, Enc_84b2cd {
+tc_096199d3, TypeV4LDST>, Enc_84b2cd {
 let Inst{6-5} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111010;
@@ -11462,7 +11510,7 @@ def L4_iand_memopw_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memw($Rs32) = clrbit(#$II)",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11470,7 +11518,7 @@ def L4_ior_memopb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
 "memb($Rs32+#$Ii) = setbit(#$II)",
-tc_44126683, TypeV4LDST>, Enc_46c951 {
+tc_096199d3, TypeV4LDST>, Enc_46c951 {
 let Inst{6-5} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111000;
@@ -11489,7 +11537,7 @@ def L4_ior_memopb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memb($Rs32) = setbit(#$II)",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11497,7 +11545,7 @@ def L4_ior_memoph_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
 "memh($Rs32+#$Ii) = setbit(#$II)",
-tc_44126683, TypeV4LDST>, Enc_e66a97 {
+tc_096199d3, TypeV4LDST>, Enc_e66a97 {
 let Inst{6-5} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111001;
@@ -11516,7 +11564,7 @@ def L4_ior_memoph_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memh($Rs32) = setbit(#$II)",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11524,7 +11572,7 @@ def L4_ior_memopw_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
 "memw($Rs32+#$Ii) = setbit(#$II)",
-tc_44126683, TypeV4LDST>, Enc_84b2cd {
+tc_096199d3, TypeV4LDST>, Enc_84b2cd {
 let Inst{6-5} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111010;
@@ -11543,7 +11591,7 @@ def L4_ior_memopw_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memw($Rs32) = setbit(#$II)",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11551,7 +11599,7 @@ def L4_isub_memopb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
 "memb($Rs32+#$Ii) -= #$II",
-tc_44126683, TypeV4LDST>, Enc_46c951 {
+tc_096199d3, TypeV4LDST>, Enc_46c951 {
 let Inst{6-5} = 0b01;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111000;
@@ -11570,7 +11618,7 @@ def L4_isub_memopb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memb($Rs32) -= #$II",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11578,7 +11626,7 @@ def L4_isub_memoph_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
 "memh($Rs32+#$Ii) -= #$II",
-tc_44126683, TypeV4LDST>, Enc_e66a97 {
+tc_096199d3, TypeV4LDST>, Enc_e66a97 {
 let Inst{6-5} = 0b01;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111001;
@@ -11597,7 +11645,7 @@ def L4_isub_memoph_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memh($Rs32) -= #$II",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11605,7 +11653,7 @@ def L4_isub_memopw_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
 "memw($Rs32+#$Ii) -= #$II",
-tc_44126683, TypeV4LDST>, Enc_84b2cd {
+tc_096199d3, TypeV4LDST>, Enc_84b2cd {
 let Inst{6-5} = 0b01;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111111010;
@@ -11624,7 +11672,7 @@ def L4_isub_memopw_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, u5_0Imm:$II),
 "memw($Rs32) -= #$II",
-tc_44126683, TypeMAPPING> {
+tc_096199d3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -11632,7 +11680,7 @@ def L4_loadalignb_ap : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Re32),
 (ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
 "$Ryy32 = memb_fifo($Re32=#$II)",
-tc_5acef64a, TypeLD>, Enc_f394d3 {
+tc_7a91e76a, TypeLD>, Enc_f394d3 {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011010100;
@@ -11652,7 +11700,7 @@ def L4_loadalignb_ur : HInst<
 (outs DoubleRegs:$Ryy32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Ryy32 = memb_fifo($Rt32<<#$Ii+#$II)",
-tc_0cd51c76, TypeLD>, Enc_04c959 {
+tc_a5d4aeec, TypeLD>, Enc_04c959 {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011100100;
 let addrMode = BaseLongOffset;
@@ -11672,7 +11720,7 @@ def L4_loadalignh_ap : HInst<
 (outs DoubleRegs:$Ryy32, IntRegs:$Re32),
 (ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
 "$Ryy32 = memh_fifo($Re32=#$II)",
-tc_5acef64a, TypeLD>, Enc_f394d3 {
+tc_7a91e76a, TypeLD>, Enc_f394d3 {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011010010;
@@ -11692,7 +11740,7 @@ def L4_loadalignh_ur : HInst<
 (outs DoubleRegs:$Ryy32),
 (ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Ryy32 = memh_fifo($Rt32<<#$Ii+#$II)",
-tc_0cd51c76, TypeLD>, Enc_04c959 {
+tc_a5d4aeec, TypeLD>, Enc_04c959 {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011100010;
 let addrMode = BaseLongOffset;
@@ -11712,7 +11760,7 @@ def L4_loadbsw2_ap : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rd32 = membh($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_323f2d {
+tc_3b5b7ef9, TypeLD>, Enc_323f2d {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011010001;
@@ -11733,7 +11781,7 @@ def L4_loadbsw2_ur : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rd32 = membh($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_4f677b {
+tc_bab0eed9, TypeLD>, Enc_4f677b {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011100001;
 let hasNewValue = 1;
@@ -11754,7 +11802,7 @@ def L4_loadbsw4_ap : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rdd32 = membh($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_7fa7f6 {
+tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011010111;
@@ -11773,7 +11821,7 @@ def L4_loadbsw4_ur : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rdd32 = membh($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_6185fe {
+tc_bab0eed9, TypeLD>, Enc_6185fe {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011100111;
 let addrMode = BaseLongOffset;
@@ -11792,7 +11840,7 @@ def L4_loadbzw2_ap : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rd32 = memubh($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_323f2d {
+tc_3b5b7ef9, TypeLD>, Enc_323f2d {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011010011;
@@ -11813,7 +11861,7 @@ def L4_loadbzw2_ur : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rd32 = memubh($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_4f677b {
+tc_bab0eed9, TypeLD>, Enc_4f677b {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011100011;
 let hasNewValue = 1;
@@ -11834,7 +11882,7 @@ def L4_loadbzw4_ap : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rdd32 = memubh($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_7fa7f6 {
+tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011010101;
@@ -11853,7 +11901,7 @@ def L4_loadbzw4_ur : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rdd32 = memubh($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_6185fe {
+tc_bab0eed9, TypeLD>, Enc_6185fe {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011100101;
 let addrMode = BaseLongOffset;
@@ -11872,7 +11920,7 @@ def L4_loadd_locked : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = memd_locked($Rs32)",
-tc_6aa5711a, TypeLD>, Enc_3a3d62 {
+tc_b43e7930, TypeLD>, Enc_3a3d62 {
 let Inst{13-5} = 0b010000000;
 let Inst{31-21} = 0b10010010000;
 let accessSize = DoubleWordAccess;
@@ -11883,7 +11931,7 @@ def L4_loadrb_ap : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rd32 = memb($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_323f2d {
+tc_3b5b7ef9, TypeLD>, Enc_323f2d {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011011000;
@@ -11904,7 +11952,7 @@ def L4_loadrb_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "$Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111010000;
 let hasNewValue = 1;
@@ -11921,7 +11969,7 @@ def L4_loadrb_ur : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rd32 = memb($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011101000;
 let hasNewValue = 1;
@@ -11943,7 +11991,7 @@ def L4_loadrd_ap : HInst<
 (outs DoubleRegs:$Rdd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rdd32 = memd($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_7fa7f6 {
+tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011011110;
@@ -11962,7 +12010,7 @@ def L4_loadrd_rr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "$Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_f47d212f, TypeLD>, Enc_84bff1, AddrModeRel, ImmRegShl {
+tc_bf061958, TypeLD>, Enc_84bff1, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111010110;
 let addrMode = BaseRegOffset;
@@ -11977,7 +12025,7 @@ def L4_loadrd_ur : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rdd32 = memd($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_6185fe, AddrModeRel, ImmRegShl {
+tc_bab0eed9, TypeLD>, Enc_6185fe, AddrModeRel, ImmRegShl {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011101110;
 let addrMode = BaseLongOffset;
@@ -11997,7 +12045,7 @@ def L4_loadrh_ap : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rd32 = memh($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_323f2d {
+tc_3b5b7ef9, TypeLD>, Enc_323f2d {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011011010;
@@ -12018,7 +12066,7 @@ def L4_loadrh_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "$Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111010010;
 let hasNewValue = 1;
@@ -12035,7 +12083,7 @@ def L4_loadrh_ur : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rd32 = memh($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011101010;
 let hasNewValue = 1;
@@ -12057,7 +12105,7 @@ def L4_loadri_ap : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rd32 = memw($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_323f2d {
+tc_3b5b7ef9, TypeLD>, Enc_323f2d {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011011100;
@@ -12078,7 +12126,7 @@ def L4_loadri_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "$Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111010100;
 let hasNewValue = 1;
@@ -12095,7 +12143,7 @@ def L4_loadri_ur : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rd32 = memw($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011101100;
 let hasNewValue = 1;
@@ -12117,7 +12165,7 @@ def L4_loadrub_ap : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rd32 = memub($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_323f2d {
+tc_3b5b7ef9, TypeLD>, Enc_323f2d {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011011001;
@@ -12138,7 +12186,7 @@ def L4_loadrub_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "$Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111010001;
 let hasNewValue = 1;
@@ -12155,7 +12203,7 @@ def L4_loadrub_ur : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rd32 = memub($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011101001;
 let hasNewValue = 1;
@@ -12177,7 +12225,7 @@ def L4_loadruh_ap : HInst<
 (outs IntRegs:$Rd32, IntRegs:$Re32),
 (ins u32_0Imm:$II),
 "$Rd32 = memuh($Re32=#$II)",
-tc_b77c481f, TypeLD>, Enc_323f2d {
+tc_3b5b7ef9, TypeLD>, Enc_323f2d {
 let Inst{7-7} = 0b0;
 let Inst{13-12} = 0b01;
 let Inst{31-21} = 0b10011011011;
@@ -12198,7 +12246,7 @@ def L4_loadruh_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "$Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111010011;
 let hasNewValue = 1;
@@ -12215,7 +12263,7 @@ def L4_loadruh_ur : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
 "$Rd32 = memuh($Rt32<<#$Ii+#$II)",
-tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
 let Inst{12-12} = 0b1;
 let Inst{31-21} = 0b10011101011;
 let hasNewValue = 1;
@@ -12237,7 +12285,7 @@ def L4_or_memopb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
 "memb($Rs32+#$Ii) |= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_d44e31 {
+tc_7186d325, TypeV4LDST>, Enc_d44e31 {
 let Inst{6-5} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110000;
@@ -12256,7 +12304,7 @@ def L4_or_memopb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memb($Rs32) |= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -12264,7 +12312,7 @@ def L4_or_memoph_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "memh($Rs32+#$Ii) |= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_163a3c {
+tc_7186d325, TypeV4LDST>, Enc_163a3c {
 let Inst{6-5} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110001;
@@ -12283,7 +12331,7 @@ def L4_or_memoph_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memh($Rs32) |= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -12291,7 +12339,7 @@ def L4_or_memopw_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
 "memw($Rs32+#$Ii) |= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_226535 {
+tc_7186d325, TypeV4LDST>, Enc_226535 {
 let Inst{6-5} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110010;
@@ -12310,7 +12358,7 @@ def L4_or_memopw_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memw($Rs32) |= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -12318,7 +12366,7 @@ def L4_ploadrbf_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memb(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011111000;
@@ -12343,7 +12391,7 @@ def L4_ploadrbf_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110001000;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -12360,7 +12408,7 @@ def L4_ploadrbfnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memb(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011111000;
@@ -12386,7 +12434,7 @@ def L4_ploadrbfnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110011000;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -12404,7 +12452,7 @@ def L4_ploadrbt_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memb(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011111000;
@@ -12428,7 +12476,7 @@ def L4_ploadrbt_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110000000;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -12444,7 +12492,7 @@ def L4_ploadrbtnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memb(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011111000;
@@ -12469,7 +12517,7 @@ def L4_ploadrbtnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110010000;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -12486,7 +12534,7 @@ def L4_ploadrdf_abs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4) $Rdd32 = memd(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2a7b91, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2a7b91, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011111110;
@@ -12509,7 +12557,7 @@ def L4_ploadrdf_rr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_98c0b8, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_98c0b8, AddrModeRel {
 let Inst{31-21} = 0b00110001110;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -12524,7 +12572,7 @@ def L4_ploadrdfnew_abs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4.new) $Rdd32 = memd(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2a7b91, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2a7b91, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011111110;
@@ -12548,7 +12596,7 @@ def L4_ploadrdfnew_rr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_98c0b8, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_98c0b8, AddrModeRel {
 let Inst{31-21} = 0b00110011110;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -12564,7 +12612,7 @@ def L4_ploadrdt_abs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4) $Rdd32 = memd(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2a7b91, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2a7b91, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011111110;
@@ -12586,7 +12634,7 @@ def L4_ploadrdt_rr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_98c0b8, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_98c0b8, AddrModeRel {
 let Inst{31-21} = 0b00110000110;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -12600,7 +12648,7 @@ def L4_ploadrdtnew_abs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4.new) $Rdd32 = memd(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2a7b91, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2a7b91, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011111110;
@@ -12623,7 +12671,7 @@ def L4_ploadrdtnew_rr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_98c0b8, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_98c0b8, AddrModeRel {
 let Inst{31-21} = 0b00110010110;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -12638,7 +12686,7 @@ def L4_ploadrhf_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memh(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011111010;
@@ -12663,7 +12711,7 @@ def L4_ploadrhf_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110001010;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -12680,7 +12728,7 @@ def L4_ploadrhfnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memh(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011111010;
@@ -12706,7 +12754,7 @@ def L4_ploadrhfnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110011010;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -12724,7 +12772,7 @@ def L4_ploadrht_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memh(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011111010;
@@ -12748,7 +12796,7 @@ def L4_ploadrht_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110000010;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -12764,7 +12812,7 @@ def L4_ploadrhtnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memh(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011111010;
@@ -12789,7 +12837,7 @@ def L4_ploadrhtnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110010010;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -12806,7 +12854,7 @@ def L4_ploadrif_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memw(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011111100;
@@ -12831,7 +12879,7 @@ def L4_ploadrif_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110001100;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -12848,7 +12896,7 @@ def L4_ploadrifnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memw(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011111100;
@@ -12874,7 +12922,7 @@ def L4_ploadrifnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110011100;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -12892,7 +12940,7 @@ def L4_ploadrit_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memw(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011111100;
@@ -12916,7 +12964,7 @@ def L4_ploadrit_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110000100;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -12932,7 +12980,7 @@ def L4_ploadritnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memw(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011111100;
@@ -12957,7 +13005,7 @@ def L4_ploadritnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110010100;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -12974,7 +13022,7 @@ def L4_ploadrubf_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memub(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011111001;
@@ -12999,7 +13047,7 @@ def L4_ploadrubf_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110001001;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -13016,7 +13064,7 @@ def L4_ploadrubfnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memub(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011111001;
@@ -13042,7 +13090,7 @@ def L4_ploadrubfnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110011001;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -13060,7 +13108,7 @@ def L4_ploadrubt_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memub(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011111001;
@@ -13084,7 +13132,7 @@ def L4_ploadrubt_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110000001;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -13100,7 +13148,7 @@ def L4_ploadrubtnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memub(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011111001;
@@ -13125,7 +13173,7 @@ def L4_ploadrubtnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110010001;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -13142,7 +13190,7 @@ def L4_ploadruhf_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4) $Rd32 = memuh(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b101;
 let Inst{31-21} = 0b10011111011;
@@ -13167,7 +13215,7 @@ def L4_ploadruhf_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110001011;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -13184,7 +13232,7 @@ def L4_ploadruhfnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if (!$Pt4.new) $Rd32 = memuh(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b111;
 let Inst{31-21} = 0b10011111011;
@@ -13210,7 +13258,7 @@ def L4_ploadruhfnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if (!$Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110011011;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -13228,7 +13276,7 @@ def L4_ploadruht_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4) $Rd32 = memuh(#$Ii)",
-tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b100;
 let Inst{31-21} = 0b10011111011;
@@ -13252,7 +13300,7 @@ def L4_ploadruht_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110000011;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -13268,7 +13316,7 @@ def L4_ploadruhtnew_abs : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pt4, u32_0Imm:$Ii),
 "if ($Pt4.new) $Rd32 = memuh(#$Ii)",
-tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
 let Inst{7-5} = 0b100;
 let Inst{13-11} = 0b110;
 let Inst{31-21} = 0b10011111011;
@@ -13293,7 +13341,7 @@ def L4_ploadruhtnew_rr : HInst<
 (outs IntRegs:$Rd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "if ($Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
 let Inst{31-21} = 0b00110010011;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -13310,7 +13358,7 @@ def L4_return : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = dealloc_return($Rs32):raw",
-tc_3d04548d, TypeLD>, Enc_3a3d62, PredNewRel {
+tc_675e4897, TypeLD>, Enc_3a3d62, PredNewRel {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10010110000;
 let isTerminator = 1;
@@ -13331,7 +13379,7 @@ def L4_return_f : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32),
 "if (!$Pv4) $Rdd32 = dealloc_return($Rs32):raw",
-tc_513bef45, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_2b8da4c2, TypeLD>, Enc_b7fad3, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1100;
 let Inst{31-21} = 0b10010110000;
@@ -13353,7 +13401,7 @@ def L4_return_fnew_pnt : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32),
 "if (!$Pv4.new) $Rdd32 = dealloc_return($Rs32):nt:raw",
-tc_395dc00f, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1010;
 let Inst{31-21} = 0b10010110000;
@@ -13376,7 +13424,7 @@ def L4_return_fnew_pt : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32),
 "if (!$Pv4.new) $Rdd32 = dealloc_return($Rs32):t:raw",
-tc_395dc00f, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b1110;
 let Inst{31-21} = 0b10010110000;
@@ -13399,7 +13447,7 @@ def L4_return_map_to_raw_f : HInst<
 (outs),
 (ins PredRegs:$Pv4),
 "if (!$Pv4) dealloc_return",
-tc_513bef45, TypeMAPPING>, Requires<[HasV65]> {
+tc_2b8da4c2, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13407,7 +13455,7 @@ def L4_return_map_to_raw_fnew_pnt : HInst<
 (outs),
 (ins PredRegs:$Pv4),
 "if (!$Pv4.new) dealloc_return:nt",
-tc_395dc00f, TypeMAPPING>, Requires<[HasV65]> {
+tc_9da59d12, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13415,7 +13463,7 @@ def L4_return_map_to_raw_fnew_pt : HInst<
 (outs),
 (ins PredRegs:$Pv4),
 "if (!$Pv4.new) dealloc_return:t",
-tc_395dc00f, TypeMAPPING>, Requires<[HasV65]> {
+tc_9da59d12, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13423,7 +13471,7 @@ def L4_return_map_to_raw_t : HInst<
 (outs),
 (ins PredRegs:$Pv4),
 "if ($Pv4) dealloc_return",
-tc_3bc2c5d3, TypeMAPPING>, Requires<[HasV65]> {
+tc_4d5fa3a1, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13431,7 +13479,7 @@ def L4_return_map_to_raw_tnew_pnt : HInst<
 (outs),
 (ins PredRegs:$Pv4),
 "if ($Pv4.new) dealloc_return:nt",
-tc_e7624c08, TypeMAPPING>, Requires<[HasV65]> {
+tc_e06f432a, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13439,7 +13487,7 @@ def L4_return_map_to_raw_tnew_pt : HInst<
 (outs),
 (ins PredRegs:$Pv4),
 "if ($Pv4.new) dealloc_return:t",
-tc_e7624c08, TypeMAPPING>, Requires<[HasV65]> {
+tc_e06f432a, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13447,7 +13495,7 @@ def L4_return_t : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32),
 "if ($Pv4) $Rdd32 = dealloc_return($Rs32):raw",
-tc_513bef45, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_2b8da4c2, TypeLD>, Enc_b7fad3, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b0100;
 let Inst{31-21} = 0b10010110000;
@@ -13468,7 +13516,7 @@ def L4_return_tnew_pnt : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32),
 "if ($Pv4.new) $Rdd32 = dealloc_return($Rs32):nt:raw",
-tc_395dc00f, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b0010;
 let Inst{31-21} = 0b10010110000;
@@ -13490,7 +13538,7 @@ def L4_return_tnew_pt : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins PredRegs:$Pv4, IntRegs:$Rs32),
 "if ($Pv4.new) $Rdd32 = dealloc_return($Rs32):t:raw",
-tc_395dc00f, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel {
 let Inst{7-5} = 0b000;
 let Inst{13-10} = 0b0110;
 let Inst{31-21} = 0b10010110000;
@@ -13512,7 +13560,7 @@ def L4_sub_memopb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
 "memb($Rs32+#$Ii) -= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_d44e31 {
+tc_7186d325, TypeV4LDST>, Enc_d44e31 {
 let Inst{6-5} = 0b01;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110000;
@@ -13531,7 +13579,7 @@ def L4_sub_memopb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memb($Rs32) -= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13539,7 +13587,7 @@ def L4_sub_memoph_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "memh($Rs32+#$Ii) -= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_163a3c {
+tc_7186d325, TypeV4LDST>, Enc_163a3c {
 let Inst{6-5} = 0b01;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110001;
@@ -13558,7 +13606,7 @@ def L4_sub_memoph_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memh($Rs32) -= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13566,7 +13614,7 @@ def L4_sub_memopw_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
 "memw($Rs32+#$Ii) -= $Rt32",
-tc_44126683, TypeV4LDST>, Enc_226535 {
+tc_7186d325, TypeV4LDST>, Enc_226535 {
 let Inst{6-5} = 0b01;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00111110010;
@@ -13585,7 +13633,7 @@ def L4_sub_memopw_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memw($Rs32) -= $Rt32",
-tc_44126683, TypeMAPPING> {
+tc_7186d325, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13593,15 +13641,26 @@ def L6_deallocframe_map_to_raw : HInst<
 (outs),
 (ins),
 "deallocframe",
-tc_d1090e34, TypeMAPPING>, Requires<[HasV65]> {
+tc_15aa71c5, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
+def L6_memcpy : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, ModRegs:$Mu2),
+"memcpy($Rs32,$Rt32,$Mu2)",
+tc_a6b1eca9, TypeLD>, Enc_a75aa6, Requires<[HasV66]> {
+let Inst{7-0} = 0b01000000;
+let Inst{31-21} = 0b10010010000;
+let mayLoad = 1;
+let isSolo = 1;
+let mayStore = 1;
+}
 def L6_return_map_to_raw : HInst<
 (outs),
 (ins),
 "dealloc_return",
-tc_3d04548d, TypeMAPPING>, Requires<[HasV65]> {
+tc_675e4897, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -13609,7 +13668,7 @@ def M2_acci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += add($Rs32,$Rt32)",
-tc_c74f796f, TypeM>, Enc_2ae154, ImmRegRel {
+tc_f675fee8, TypeM>, Enc_2ae154, ImmRegRel {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -13624,7 +13683,7 @@ def M2_accii : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rx32 += add($Rs32,#$Ii)",
-tc_c74f796f, TypeM>, Enc_c90aca, ImmRegRel {
+tc_f675fee8, TypeM>, Enc_c90aca, ImmRegRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100010000;
 let hasNewValue = 1;
@@ -13643,7 +13702,7 @@ def M2_cmaci_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += cmpyi($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111000;
@@ -13654,7 +13713,7 @@ def M2_cmacr_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += cmpyr($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111000;
@@ -13665,7 +13724,7 @@ def M2_cmacs_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += cmpy($Rs32,$Rt32):sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111000;
@@ -13677,7 +13736,7 @@ def M2_cmacs_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += cmpy($Rs32,$Rt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111100;
@@ -13689,7 +13748,7 @@ def M2_cmacsc_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += cmpy($Rs32,$Rt32*):sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111010;
@@ -13701,7 +13760,7 @@ def M2_cmacsc_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += cmpy($Rs32,$Rt32*):<<1:sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111110;
@@ -13713,7 +13772,7 @@ def M2_cmpyi_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = cmpyi($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101000;
@@ -13723,7 +13782,7 @@ def M2_cmpyr_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = cmpyr($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101000;
@@ -13733,7 +13792,7 @@ def M2_cmpyrs_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = cmpy($Rs32,$Rt32):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101001;
@@ -13746,7 +13805,7 @@ def M2_cmpyrs_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = cmpy($Rs32,$Rt32):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101101;
@@ -13759,7 +13818,7 @@ def M2_cmpyrsc_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = cmpy($Rs32,$Rt32*):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101011;
@@ -13772,7 +13831,7 @@ def M2_cmpyrsc_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = cmpy($Rs32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101111;
@@ -13785,7 +13844,7 @@ def M2_cmpys_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = cmpy($Rs32,$Rt32):sat",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101000;
@@ -13796,7 +13855,7 @@ def M2_cmpys_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = cmpy($Rs32,$Rt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101100;
@@ -13807,7 +13866,7 @@ def M2_cmpysc_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = cmpy($Rs32,$Rt32*):sat",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101010;
@@ -13818,7 +13877,7 @@ def M2_cmpysc_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = cmpy($Rs32,$Rt32*):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101110;
@@ -13829,7 +13888,7 @@ def M2_cnacs_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= cmpy($Rs32,$Rt32):sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111000;
@@ -13841,7 +13900,7 @@ def M2_cnacs_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= cmpy($Rs32,$Rt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111100;
@@ -13853,7 +13912,7 @@ def M2_cnacsc_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= cmpy($Rs32,$Rt32*):sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111010;
@@ -13865,7 +13924,7 @@ def M2_cnacsc_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= cmpy($Rs32,$Rt32*):<<1:sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111110;
@@ -13877,7 +13936,7 @@ def M2_dpmpyss_acc_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111000;
@@ -13888,7 +13947,7 @@ def M2_dpmpyss_nac_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111001;
@@ -13899,7 +13958,7 @@ def M2_dpmpyss_rnd_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32,$Rt32):rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101001;
@@ -13911,7 +13970,7 @@ def M2_dpmpyss_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101000;
@@ -13921,7 +13980,7 @@ def M2_dpmpyuu_acc_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111010;
@@ -13932,7 +13991,7 @@ def M2_dpmpyuu_nac_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111011;
@@ -13943,7 +14002,7 @@ def M2_dpmpyuu_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101010;
@@ -13953,7 +14012,7 @@ def M2_hmmpyh_rs1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32,$Rt32.h):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101101;
@@ -13966,7 +14025,7 @@ def M2_hmmpyh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32,$Rt32.h):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101101;
@@ -13979,7 +14038,7 @@ def M2_hmmpyl_rs1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32,$Rt32.l):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101111;
@@ -13992,7 +14051,7 @@ def M2_hmmpyl_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32,$Rt32.l):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101101;
@@ -14005,7 +14064,7 @@ def M2_maci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyi($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_2ae154, ImmRegRel {
+tc_d773585a, TypeM>, Enc_2ae154, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -14020,7 +14079,7 @@ def M2_macsin : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
 "$Rx32 -= mpyi($Rs32,#$Ii)",
-tc_16d0d8d5, TypeM>, Enc_c90aca {
+tc_05d3a09b, TypeM>, Enc_c90aca {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100001100;
 let hasNewValue = 1;
@@ -14038,7 +14097,7 @@ def M2_macsip : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
 "$Rx32 += mpyi($Rs32,#$Ii)",
-tc_16d0d8d5, TypeM>, Enc_c90aca, ImmRegRel {
+tc_05d3a09b, TypeM>, Enc_c90aca, ImmRegRel {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100001000;
 let hasNewValue = 1;
@@ -14057,7 +14116,7 @@ def M2_mmachs_rs0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpywoh($Rss32,$Rtt32):rnd:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010001;
@@ -14069,7 +14128,7 @@ def M2_mmachs_rs1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010101;
@@ -14081,7 +14140,7 @@ def M2_mmachs_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpywoh($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010000;
@@ -14093,7 +14152,7 @@ def M2_mmachs_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010100;
@@ -14105,7 +14164,7 @@ def M2_mmacls_rs0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyweh($Rss32,$Rtt32):rnd:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010001;
@@ -14117,7 +14176,7 @@ def M2_mmacls_rs1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010101;
@@ -14129,7 +14188,7 @@ def M2_mmacls_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyweh($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010000;
@@ -14141,7 +14200,7 @@ def M2_mmacls_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010100;
@@ -14153,7 +14212,7 @@ def M2_mmacuhs_rs0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpywouh($Rss32,$Rtt32):rnd:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010011;
@@ -14165,7 +14224,7 @@ def M2_mmacuhs_rs1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010111;
@@ -14177,7 +14236,7 @@ def M2_mmacuhs_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpywouh($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010010;
@@ -14189,7 +14248,7 @@ def M2_mmacuhs_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010110;
@@ -14201,7 +14260,7 @@ def M2_mmaculs_rs0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyweuh($Rss32,$Rtt32):rnd:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010011;
@@ -14213,7 +14272,7 @@ def M2_mmaculs_rs1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010111;
@@ -14225,7 +14284,7 @@ def M2_mmaculs_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyweuh($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010010;
@@ -14237,7 +14296,7 @@ def M2_mmaculs_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010110;
@@ -14249,7 +14308,7 @@ def M2_mmpyh_rs0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpywoh($Rss32,$Rtt32):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000001;
@@ -14260,7 +14319,7 @@ def M2_mmpyh_rs1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -14271,7 +14330,7 @@ def M2_mmpyh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpywoh($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000000;
@@ -14282,7 +14341,7 @@ def M2_mmpyh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000100;
@@ -14293,7 +14352,7 @@ def M2_mmpyl_rs0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyweh($Rss32,$Rtt32):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000001;
@@ -14304,7 +14363,7 @@ def M2_mmpyl_rs1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -14315,7 +14374,7 @@ def M2_mmpyl_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyweh($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000000;
@@ -14326,7 +14385,7 @@ def M2_mmpyl_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000100;
@@ -14337,7 +14396,7 @@ def M2_mmpyuh_rs0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpywouh($Rss32,$Rtt32):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000011;
@@ -14348,7 +14407,7 @@ def M2_mmpyuh_rs1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000111;
@@ -14359,7 +14418,7 @@ def M2_mmpyuh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpywouh($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000010;
@@ -14370,7 +14429,7 @@ def M2_mmpyuh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000110;
@@ -14381,7 +14440,7 @@ def M2_mmpyul_rs0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyweuh($Rss32,$Rtt32):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000011;
@@ -14392,7 +14451,7 @@ def M2_mmpyul_rs1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000111;
@@ -14403,7 +14462,7 @@ def M2_mmpyul_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyweuh($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000010;
@@ -14414,18 +14473,31 @@ def M2_mmpyul_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000110;
 let prefersSlot3 = 1;
 let Defs = [USR_OVF];
 }
+def M2_mnaci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyi($Rs32,$Rt32)",
+tc_bdceeac1, TypeM>, Enc_2ae154, Requires<[HasV66]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
 def M2_mpy_acc_hh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.h,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110000;
@@ -14438,7 +14510,7 @@ def M2_mpy_acc_hh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.h,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110100;
@@ -14451,7 +14523,7 @@ def M2_mpy_acc_hl_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.h,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110000;
@@ -14464,7 +14536,7 @@ def M2_mpy_acc_hl_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.h,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110100;
@@ -14477,7 +14549,7 @@ def M2_mpy_acc_lh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.l,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110000;
@@ -14490,7 +14562,7 @@ def M2_mpy_acc_lh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.l,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110100;
@@ -14503,7 +14575,7 @@ def M2_mpy_acc_ll_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.l,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110000;
@@ -14516,7 +14588,7 @@ def M2_mpy_acc_ll_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.l,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110100;
@@ -14529,7 +14601,7 @@ def M2_mpy_acc_sat_hh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.h,$Rt32.h):sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110000;
@@ -14543,7 +14615,7 @@ def M2_mpy_acc_sat_hh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.h,$Rt32.h):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110100;
@@ -14557,7 +14629,7 @@ def M2_mpy_acc_sat_hl_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.h,$Rt32.l):sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110000;
@@ -14571,7 +14643,7 @@ def M2_mpy_acc_sat_hl_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.h,$Rt32.l):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110100;
@@ -14585,7 +14657,7 @@ def M2_mpy_acc_sat_lh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.l,$Rt32.h):sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110000;
@@ -14599,7 +14671,7 @@ def M2_mpy_acc_sat_lh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.l,$Rt32.h):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110100;
@@ -14613,7 +14685,7 @@ def M2_mpy_acc_sat_ll_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.l,$Rt32.l):sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110000;
@@ -14627,7 +14699,7 @@ def M2_mpy_acc_sat_ll_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32.l,$Rt32.l):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110100;
@@ -14641,7 +14713,7 @@ def M2_mpy_hh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.h)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100000;
@@ -14653,7 +14725,7 @@ def M2_mpy_hh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.h):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100100;
@@ -14665,7 +14737,7 @@ def M2_mpy_hl_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.l)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100000;
@@ -14677,7 +14749,7 @@ def M2_mpy_hl_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.l):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100100;
@@ -14689,7 +14761,7 @@ def M2_mpy_lh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.h)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100000;
@@ -14701,7 +14773,7 @@ def M2_mpy_lh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.h):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100100;
@@ -14713,7 +14785,7 @@ def M2_mpy_ll_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.l)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100000;
@@ -14725,7 +14797,7 @@ def M2_mpy_ll_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.l):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100100;
@@ -14737,7 +14809,7 @@ def M2_mpy_nac_hh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.h,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110001;
@@ -14750,7 +14822,7 @@ def M2_mpy_nac_hh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110101;
@@ -14763,7 +14835,7 @@ def M2_mpy_nac_hl_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.h,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110001;
@@ -14776,7 +14848,7 @@ def M2_mpy_nac_hl_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110101;
@@ -14789,7 +14861,7 @@ def M2_mpy_nac_lh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.l,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110001;
@@ -14802,7 +14874,7 @@ def M2_mpy_nac_lh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110101;
@@ -14815,7 +14887,7 @@ def M2_mpy_nac_ll_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.l,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110001;
@@ -14828,7 +14900,7 @@ def M2_mpy_nac_ll_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110101;
@@ -14841,7 +14913,7 @@ def M2_mpy_nac_sat_hh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.h,$Rt32.h):sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110001;
@@ -14855,7 +14927,7 @@ def M2_mpy_nac_sat_hh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110101;
@@ -14869,7 +14941,7 @@ def M2_mpy_nac_sat_hl_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.h,$Rt32.l):sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110001;
@@ -14883,7 +14955,7 @@ def M2_mpy_nac_sat_hl_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110101;
@@ -14897,7 +14969,7 @@ def M2_mpy_nac_sat_lh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.l,$Rt32.h):sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110001;
@@ -14911,7 +14983,7 @@ def M2_mpy_nac_sat_lh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110101;
@@ -14925,7 +14997,7 @@ def M2_mpy_nac_sat_ll_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.l,$Rt32.l):sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110001;
@@ -14939,7 +15011,7 @@ def M2_mpy_nac_sat_ll_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110101;
@@ -14953,7 +15025,7 @@ def M2_mpy_rnd_hh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.h):rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100001;
@@ -14965,7 +15037,7 @@ def M2_mpy_rnd_hh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100101;
@@ -14977,7 +15049,7 @@ def M2_mpy_rnd_hl_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.l):rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100001;
@@ -14989,7 +15061,7 @@ def M2_mpy_rnd_hl_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100101;
@@ -15001,7 +15073,7 @@ def M2_mpy_rnd_lh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.h):rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100001;
@@ -15013,7 +15085,7 @@ def M2_mpy_rnd_lh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100101;
@@ -15025,7 +15097,7 @@ def M2_mpy_rnd_ll_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.l):rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100001;
@@ -15037,7 +15109,7 @@ def M2_mpy_rnd_ll_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100101;
@@ -15049,7 +15121,7 @@ def M2_mpy_sat_hh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.h):sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100000;
@@ -15062,7 +15134,7 @@ def M2_mpy_sat_hh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100100;
@@ -15075,7 +15147,7 @@ def M2_mpy_sat_hl_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.l):sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100000;
@@ -15088,7 +15160,7 @@ def M2_mpy_sat_hl_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100100;
@@ -15101,7 +15173,7 @@ def M2_mpy_sat_lh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.h):sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100000;
@@ -15114,7 +15186,7 @@ def M2_mpy_sat_lh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100100;
@@ -15127,7 +15199,7 @@ def M2_mpy_sat_ll_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.l):sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100000;
@@ -15140,7 +15212,7 @@ def M2_mpy_sat_ll_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100100;
@@ -15153,7 +15225,7 @@ def M2_mpy_sat_rnd_hh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.h):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100001;
@@ -15166,7 +15238,7 @@ def M2_mpy_sat_rnd_hh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100101;
@@ -15179,7 +15251,7 @@ def M2_mpy_sat_rnd_hl_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.l):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100001;
@@ -15192,7 +15264,7 @@ def M2_mpy_sat_rnd_hl_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100101;
@@ -15205,7 +15277,7 @@ def M2_mpy_sat_rnd_lh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.h):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100001;
@@ -15218,7 +15290,7 @@ def M2_mpy_sat_rnd_lh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100101;
@@ -15231,7 +15303,7 @@ def M2_mpy_sat_rnd_ll_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.l):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100001;
@@ -15244,7 +15316,7 @@ def M2_mpy_sat_rnd_ll_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100101;
@@ -15257,7 +15329,7 @@ def M2_mpy_up : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101000;
@@ -15269,7 +15341,7 @@ def M2_mpy_up_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32,$Rt32):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101101;
@@ -15281,7 +15353,7 @@ def M2_mpy_up_s1_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpy($Rs32,$Rt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101111;
@@ -15294,7 +15366,7 @@ def M2_mpyd_acc_hh_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32.h,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110000;
@@ -15305,7 +15377,7 @@ def M2_mpyd_acc_hh_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32.h,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110100;
@@ -15316,7 +15388,7 @@ def M2_mpyd_acc_hl_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32.h,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110000;
@@ -15327,7 +15399,7 @@ def M2_mpyd_acc_hl_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32.h,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110100;
@@ -15338,7 +15410,7 @@ def M2_mpyd_acc_lh_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32.l,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110000;
@@ -15349,7 +15421,7 @@ def M2_mpyd_acc_lh_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32.l,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110100;
@@ -15360,7 +15432,7 @@ def M2_mpyd_acc_ll_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32.l,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110000;
@@ -15371,7 +15443,7 @@ def M2_mpyd_acc_ll_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpy($Rs32.l,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110100;
@@ -15382,7 +15454,7 @@ def M2_mpyd_hh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.h,$Rt32.h)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100000;
@@ -15392,7 +15464,7 @@ def M2_mpyd_hh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100100;
@@ -15402,7 +15474,7 @@ def M2_mpyd_hl_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.h,$Rt32.l)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100000;
@@ -15412,7 +15484,7 @@ def M2_mpyd_hl_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100100;
@@ -15422,7 +15494,7 @@ def M2_mpyd_lh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.l,$Rt32.h)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100000;
@@ -15432,7 +15504,7 @@ def M2_mpyd_lh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100100;
@@ -15442,7 +15514,7 @@ def M2_mpyd_ll_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.l,$Rt32.l)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100000;
@@ -15452,7 +15524,7 @@ def M2_mpyd_ll_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100100;
@@ -15462,7 +15534,7 @@ def M2_mpyd_nac_hh_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32.h,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110001;
@@ -15473,7 +15545,7 @@ def M2_mpyd_nac_hh_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32.h,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110101;
@@ -15484,7 +15556,7 @@ def M2_mpyd_nac_hl_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32.h,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110001;
@@ -15495,7 +15567,7 @@ def M2_mpyd_nac_hl_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32.h,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110101;
@@ -15506,7 +15578,7 @@ def M2_mpyd_nac_lh_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32.l,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110001;
@@ -15517,7 +15589,7 @@ def M2_mpyd_nac_lh_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32.l,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110101;
@@ -15528,7 +15600,7 @@ def M2_mpyd_nac_ll_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32.l,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110001;
@@ -15539,7 +15611,7 @@ def M2_mpyd_nac_ll_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpy($Rs32.l,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110101;
@@ -15550,7 +15622,7 @@ def M2_mpyd_rnd_hh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.h,$Rt32.h):rnd",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100001;
@@ -15560,7 +15632,7 @@ def M2_mpyd_rnd_hh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100101;
@@ -15570,7 +15642,7 @@ def M2_mpyd_rnd_hl_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.h,$Rt32.l):rnd",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100001;
@@ -15580,7 +15652,7 @@ def M2_mpyd_rnd_hl_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100101;
@@ -15590,7 +15662,7 @@ def M2_mpyd_rnd_lh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.l,$Rt32.h):rnd",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100001;
@@ -15600,7 +15672,7 @@ def M2_mpyd_rnd_lh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100101;
@@ -15610,7 +15682,7 @@ def M2_mpyd_rnd_ll_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.l,$Rt32.l):rnd",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100001;
@@ -15620,7 +15692,7 @@ def M2_mpyd_rnd_ll_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100101;
@@ -15630,7 +15702,7 @@ def M2_mpyi : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyi($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_5ab2be, ImmRegRel {
+tc_bafaade3, TypeM>, Enc_5ab2be, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101000;
@@ -15644,7 +15716,7 @@ def M2_mpysin : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u8_0Imm:$Ii),
 "$Rd32 = -mpyi($Rs32,#$Ii)",
-tc_1853ea6d, TypeM>, Enc_b8c967 {
+tc_c8ce0b5c, TypeM>, Enc_b8c967 {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100000100;
 let hasNewValue = 1;
@@ -15655,7 +15727,7 @@ def M2_mpysip : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u32_0Imm:$Ii),
 "$Rd32 = +mpyi($Rs32,#$Ii)",
-tc_1853ea6d, TypeM>, Enc_b8c967 {
+tc_c8ce0b5c, TypeM>, Enc_b8c967 {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100000000;
 let hasNewValue = 1;
@@ -15671,7 +15743,7 @@ def M2_mpysmi : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, m32_0Imm:$Ii),
 "$Rd32 = mpyi($Rs32,#$Ii)",
-tc_1853ea6d, TypeM>, ImmRegRel {
+tc_c8ce0b5c, TypeM>, ImmRegRel {
 let hasNewValue = 1;
 let opNewValue = 0;
 let CextOpcode = "M2_mpyi";
@@ -15687,7 +15759,7 @@ def M2_mpysu_up : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpysu($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101011;
@@ -15699,7 +15771,7 @@ def M2_mpyu_acc_hh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyu($Rs32.h,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110010;
@@ -15712,7 +15784,7 @@ def M2_mpyu_acc_hh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyu($Rs32.h,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110110;
@@ -15725,7 +15797,7 @@ def M2_mpyu_acc_hl_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyu($Rs32.h,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110010;
@@ -15738,7 +15810,7 @@ def M2_mpyu_acc_hl_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyu($Rs32.h,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110110;
@@ -15751,7 +15823,7 @@ def M2_mpyu_acc_lh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyu($Rs32.l,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110010;
@@ -15764,7 +15836,7 @@ def M2_mpyu_acc_lh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyu($Rs32.l,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110110;
@@ -15777,7 +15849,7 @@ def M2_mpyu_acc_ll_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyu($Rs32.l,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110010;
@@ -15790,7 +15862,7 @@ def M2_mpyu_acc_ll_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpyu($Rs32.l,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110110;
@@ -15803,7 +15875,7 @@ def M2_mpyu_hh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32.h,$Rt32.h)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100010;
@@ -15815,7 +15887,7 @@ def M2_mpyu_hh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32.h,$Rt32.h):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100110;
@@ -15827,7 +15899,7 @@ def M2_mpyu_hl_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32.h,$Rt32.l)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100010;
@@ -15839,7 +15911,7 @@ def M2_mpyu_hl_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32.h,$Rt32.l):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100110;
@@ -15851,7 +15923,7 @@ def M2_mpyu_lh_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32.l,$Rt32.h)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100010;
@@ -15863,7 +15935,7 @@ def M2_mpyu_lh_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32.l,$Rt32.h):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100110;
@@ -15875,7 +15947,7 @@ def M2_mpyu_ll_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32.l,$Rt32.l)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100010;
@@ -15887,7 +15959,7 @@ def M2_mpyu_ll_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32.l,$Rt32.l):<<1",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101100110;
@@ -15899,7 +15971,7 @@ def M2_mpyu_nac_hh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpyu($Rs32.h,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110011;
@@ -15912,7 +15984,7 @@ def M2_mpyu_nac_hh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110111;
@@ -15925,7 +15997,7 @@ def M2_mpyu_nac_hl_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpyu($Rs32.h,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110011;
@@ -15938,7 +16010,7 @@ def M2_mpyu_nac_hl_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110111;
@@ -15951,7 +16023,7 @@ def M2_mpyu_nac_lh_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpyu($Rs32.l,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110011;
@@ -15964,7 +16036,7 @@ def M2_mpyu_nac_lh_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110111;
@@ -15977,7 +16049,7 @@ def M2_mpyu_nac_ll_s0 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpyu($Rs32.l,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110011;
@@ -15990,7 +16062,7 @@ def M2_mpyu_nac_ll_s1 : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101110111;
@@ -16003,7 +16075,7 @@ def M2_mpyu_up : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyu($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101010;
@@ -16015,7 +16087,7 @@ def M2_mpyud_acc_hh_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32.h,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110010;
@@ -16026,7 +16098,7 @@ def M2_mpyud_acc_hh_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32.h,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110110;
@@ -16037,7 +16109,7 @@ def M2_mpyud_acc_hl_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32.h,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110010;
@@ -16048,7 +16120,7 @@ def M2_mpyud_acc_hl_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32.h,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110110;
@@ -16059,7 +16131,7 @@ def M2_mpyud_acc_lh_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32.l,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110010;
@@ -16070,7 +16142,7 @@ def M2_mpyud_acc_lh_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32.l,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110110;
@@ -16081,7 +16153,7 @@ def M2_mpyud_acc_ll_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32.l,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110010;
@@ -16092,7 +16164,7 @@ def M2_mpyud_acc_ll_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += mpyu($Rs32.l,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110110;
@@ -16103,7 +16175,7 @@ def M2_mpyud_hh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32.h,$Rt32.h)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100010;
@@ -16113,7 +16185,7 @@ def M2_mpyud_hh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32.h,$Rt32.h):<<1",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100110;
@@ -16123,7 +16195,7 @@ def M2_mpyud_hl_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32.h,$Rt32.l)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100010;
@@ -16133,7 +16205,7 @@ def M2_mpyud_hl_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32.h,$Rt32.l):<<1",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100110;
@@ -16143,7 +16215,7 @@ def M2_mpyud_lh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32.l,$Rt32.h)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100010;
@@ -16153,7 +16225,7 @@ def M2_mpyud_lh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32.l,$Rt32.h):<<1",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100110;
@@ -16163,7 +16235,7 @@ def M2_mpyud_ll_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32.l,$Rt32.l)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100010;
@@ -16173,7 +16245,7 @@ def M2_mpyud_ll_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = mpyu($Rs32.l,$Rt32.l):<<1",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100100110;
@@ -16183,7 +16255,7 @@ def M2_mpyud_nac_hh_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32.h,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110011;
@@ -16194,7 +16266,7 @@ def M2_mpyud_nac_hh_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110111;
@@ -16205,7 +16277,7 @@ def M2_mpyud_nac_hl_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32.h,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110011;
@@ -16216,7 +16288,7 @@ def M2_mpyud_nac_hl_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110111;
@@ -16227,7 +16299,7 @@ def M2_mpyud_nac_lh_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32.l,$Rt32.h)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110011;
@@ -16238,7 +16310,7 @@ def M2_mpyud_nac_lh_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110111;
@@ -16249,7 +16321,7 @@ def M2_mpyud_nac_ll_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32.l,$Rt32.l)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110011;
@@ -16260,7 +16332,7 @@ def M2_mpyud_nac_ll_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100110111;
@@ -16271,7 +16343,7 @@ def M2_mpyui : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = mpyui($Rs32,$Rt32)",
-tc_8fd5f294, TypeM> {
+tc_bafaade3, TypeM> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -16281,7 +16353,7 @@ def M2_nacci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= add($Rs32,$Rt32)",
-tc_c74f796f, TypeM>, Enc_2ae154 {
+tc_f675fee8, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111100;
@@ -16295,7 +16367,7 @@ def M2_naccii : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rx32 -= add($Rs32,#$Ii)",
-tc_c74f796f, TypeM>, Enc_c90aca {
+tc_f675fee8, TypeM>, Enc_c90aca {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100010100;
 let hasNewValue = 1;
@@ -16313,7 +16385,7 @@ def M2_subacc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rx32 += sub($Rt32,$Rs32)",
-tc_c74f796f, TypeM>, Enc_a568d4 {
+tc_f675fee8, TypeM>, Enc_a568d4 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -16327,7 +16399,7 @@ def M2_vabsdiffh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vabsdiffh($Rtt32,$Rss32)",
-tc_2b6f77c6, TypeM>, Enc_ea23e4 {
+tc_002cb246, TypeM>, Enc_ea23e4 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000011;
@@ -16337,7 +16409,7 @@ def M2_vabsdiffw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vabsdiffw($Rtt32,$Rss32)",
-tc_2b6f77c6, TypeM>, Enc_ea23e4 {
+tc_002cb246, TypeM>, Enc_ea23e4 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000001;
@@ -16347,7 +16419,7 @@ def M2_vcmac_s0_sat_i : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vcmpyi($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010010;
@@ -16359,7 +16431,7 @@ def M2_vcmac_s0_sat_r : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vcmpyr($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010001;
@@ -16371,7 +16443,7 @@ def M2_vcmpy_s0_sat_i : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vcmpyi($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000010;
@@ -16382,7 +16454,7 @@ def M2_vcmpy_s0_sat_r : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vcmpyr($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000001;
@@ -16393,7 +16465,7 @@ def M2_vcmpy_s1_sat_i : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vcmpyi($Rss32,$Rtt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000110;
@@ -16404,7 +16476,7 @@ def M2_vcmpy_s1_sat_r : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vcmpyr($Rss32,$Rtt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -16415,7 +16487,7 @@ def M2_vdmacs_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vdmpy($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010000;
@@ -16427,7 +16499,7 @@ def M2_vdmacs_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vdmpy($Rss32,$Rtt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010100;
@@ -16439,7 +16511,7 @@ def M2_vdmpyrs_s0 : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vdmpy($Rss32,$Rtt32):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_d2216a {
+tc_bafaade3, TypeM>, Enc_d2216a {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101001000;
@@ -16452,7 +16524,7 @@ def M2_vdmpyrs_s1 : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vdmpy($Rss32,$Rtt32):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_d2216a {
+tc_bafaade3, TypeM>, Enc_d2216a {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101001100;
@@ -16465,7 +16537,7 @@ def M2_vdmpys_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vdmpy($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000000;
@@ -16476,7 +16548,7 @@ def M2_vdmpys_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vdmpy($Rss32,$Rtt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000100;
@@ -16487,7 +16559,7 @@ def M2_vmac2 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += vmpyh($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111001;
@@ -16498,7 +16570,7 @@ def M2_vmac2es : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyeh($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010001;
@@ -16509,7 +16581,7 @@ def M2_vmac2es_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyeh($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010000;
@@ -16521,7 +16593,7 @@ def M2_vmac2es_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vmpyeh($Rss32,$Rtt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010100;
@@ -16533,7 +16605,7 @@ def M2_vmac2s_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += vmpyh($Rs32,$Rt32):sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111000;
@@ -16545,7 +16617,7 @@ def M2_vmac2s_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += vmpyh($Rs32,$Rt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111100;
@@ -16557,7 +16629,7 @@ def M2_vmac2su_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += vmpyhsu($Rs32,$Rt32):sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111011;
@@ -16569,7 +16641,7 @@ def M2_vmac2su_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += vmpyhsu($Rs32,$Rt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111111;
@@ -16581,7 +16653,7 @@ def M2_vmpy2es_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyeh($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000000;
@@ -16592,7 +16664,7 @@ def M2_vmpy2es_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vmpyeh($Rss32,$Rtt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000100;
@@ -16603,7 +16675,7 @@ def M2_vmpy2s_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = vmpyh($Rs32,$Rt32):sat",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101000;
@@ -16614,7 +16686,7 @@ def M2_vmpy2s_s0pack : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = vmpyh($Rs32,$Rt32):rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101001;
@@ -16627,7 +16699,7 @@ def M2_vmpy2s_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = vmpyh($Rs32,$Rt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101100;
@@ -16638,7 +16710,7 @@ def M2_vmpy2s_s1pack : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = vmpyh($Rs32,$Rt32):<<1:rnd:sat",
-tc_8fd5f294, TypeM>, Enc_5ab2be {
+tc_bafaade3, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101101101;
@@ -16651,7 +16723,7 @@ def M2_vmpy2su_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = vmpyhsu($Rs32,$Rt32):sat",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101000;
@@ -16662,7 +16734,7 @@ def M2_vmpy2su_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = vmpyhsu($Rs32,$Rt32):<<1:sat",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101100;
@@ -16673,7 +16745,7 @@ def M2_vraddh : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vraddh($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_d2216a {
+tc_bafaade3, TypeM>, Enc_d2216a {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101001001;
@@ -16685,7 +16757,7 @@ def M2_vradduh : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vradduh($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_d2216a {
+tc_bafaade3, TypeM>, Enc_d2216a {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101001000;
@@ -16697,7 +16769,7 @@ def M2_vrcmaci_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrcmpyi($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010000;
@@ -16708,7 +16780,7 @@ def M2_vrcmaci_s0c : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrcmpyi($Rss32,$Rtt32*)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010010;
@@ -16719,7 +16791,7 @@ def M2_vrcmacr_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrcmpyr($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010000;
@@ -16730,7 +16802,7 @@ def M2_vrcmacr_s0c : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrcmpyr($Rss32,$Rtt32*)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010011;
@@ -16741,7 +16813,7 @@ def M2_vrcmpyi_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrcmpyi($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000000;
@@ -16751,7 +16823,7 @@ def M2_vrcmpyi_s0c : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrcmpyi($Rss32,$Rtt32*)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000010;
@@ -16761,7 +16833,7 @@ def M2_vrcmpyr_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrcmpyr($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000000;
@@ -16771,7 +16843,7 @@ def M2_vrcmpyr_s0c : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrcmpyr($Rss32,$Rtt32*)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000011;
@@ -16781,7 +16853,7 @@ def M2_vrcmpys_acc_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 += vrcmpys($Rss32,$Rt32):<<1:sat",
-tc_e913dc32, TypeM> {
+tc_d773585a, TypeM> {
 let isPseudo = 1;
 let Constraints = "$Rxx32 = $Rxx32in";
 }
@@ -16789,7 +16861,7 @@ def M2_vrcmpys_acc_s1_h : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010101;
@@ -16801,7 +16873,7 @@ def M2_vrcmpys_acc_s1_l : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010111;
@@ -16813,14 +16885,14 @@ def M2_vrcmpys_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vrcmpys($Rss32,$Rt32):<<1:sat",
-tc_8fd5f294, TypeM> {
+tc_bafaade3, TypeM> {
 let isPseudo = 1;
 }
 def M2_vrcmpys_s1_h : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -16831,7 +16903,7 @@ def M2_vrcmpys_s1_l : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000111;
@@ -16842,7 +16914,7 @@ def M2_vrcmpys_s1rp : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = vrcmpys($Rss32,$Rt32):<<1:rnd:sat",
-tc_8fd5f294, TypeM> {
+tc_bafaade3, TypeM> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -16851,7 +16923,7 @@ def M2_vrcmpys_s1rp_h : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:hi",
-tc_8fd5f294, TypeM>, Enc_d2216a {
+tc_bafaade3, TypeM>, Enc_d2216a {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101001101;
@@ -16864,7 +16936,7 @@ def M2_vrcmpys_s1rp_l : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:lo",
-tc_8fd5f294, TypeM>, Enc_d2216a {
+tc_bafaade3, TypeM>, Enc_d2216a {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101001101;
@@ -16877,7 +16949,7 @@ def M2_vrmac_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrmpyh($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010000;
@@ -16888,7 +16960,7 @@ def M2_vrmpy_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrmpyh($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000000;
@@ -16898,7 +16970,7 @@ def M2_xor_xacc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 ^= xor($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111100;
@@ -16912,7 +16984,7 @@ def M4_and_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 &= and($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111010;
@@ -16926,7 +16998,7 @@ def M4_and_andn : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 &= and($Rs32,~$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111001;
@@ -16940,7 +17012,7 @@ def M4_and_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 &= or($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111010;
@@ -16954,7 +17026,7 @@ def M4_and_xor : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 &= xor($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111010;
@@ -16968,7 +17040,7 @@ def M4_cmpyi_wh : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = cmpyiwh($Rss32,$Rt32):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28 {
+tc_bafaade3, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -16981,7 +17053,7 @@ def M4_cmpyi_whc : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28 {
+tc_bafaade3, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -16994,7 +17066,7 @@ def M4_cmpyr_wh : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = cmpyrwh($Rss32,$Rt32):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28 {
+tc_bafaade3, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -17007,7 +17079,7 @@ def M4_cmpyr_whc : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28 {
+tc_bafaade3, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -17020,7 +17092,7 @@ def M4_mac_up_s1_sat : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += mpy($Rs32,$Rt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111011;
@@ -17035,7 +17107,7 @@ def M4_mpyri_addi : HInst<
 (outs IntRegs:$Rd32),
 (ins u32_0Imm:$Ii, IntRegs:$Rs32, u6_0Imm:$II),
 "$Rd32 = add(#$Ii,mpyi($Rs32,#$II))",
-tc_16d0d8d5, TypeALU64>, Enc_322e1b, ImmRegRel {
+tc_05d3a09b, TypeALU64>, Enc_322e1b, ImmRegRel {
 let Inst{31-24} = 0b11011000;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -17051,7 +17123,7 @@ def M4_mpyri_addr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Ru32, IntRegs:$Rs32, u32_0Imm:$Ii),
 "$Rd32 = add($Ru32,mpyi($Rs32,#$Ii))",
-tc_16d0d8d5, TypeALU64>, Enc_420cf3, ImmRegRel {
+tc_05d3a09b, TypeALU64>, Enc_420cf3, ImmRegRel {
 let Inst{31-23} = 0b110111111;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -17068,7 +17140,7 @@ def M4_mpyri_addr_u2 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Ru32, u6_2Imm:$Ii, IntRegs:$Rs32),
 "$Rd32 = add($Ru32,mpyi(#$Ii,$Rs32))",
-tc_bcc96cee, TypeALU64>, Enc_277737 {
+tc_1a2fd869, TypeALU64>, Enc_277737 {
 let Inst{31-23} = 0b110111110;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -17078,7 +17150,7 @@ def M4_mpyrr_addi : HInst<
 (outs IntRegs:$Rd32),
 (ins u32_0Imm:$Ii, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = add(#$Ii,mpyi($Rs32,$Rt32))",
-tc_e913dc32, TypeALU64>, Enc_a7b8e8, ImmRegRel {
+tc_d773585a, TypeALU64>, Enc_a7b8e8, ImmRegRel {
 let Inst{31-23} = 0b110101110;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -17095,7 +17167,7 @@ def M4_mpyrr_addr : HInst<
 (outs IntRegs:$Ry32),
 (ins IntRegs:$Ru32, IntRegs:$Ry32in, IntRegs:$Rs32),
 "$Ry32 = add($Ru32,mpyi($Ry32in,$Rs32))",
-tc_e913dc32, TypeM>, Enc_7f1a05, ImmRegRel {
+tc_d773585a, TypeM>, Enc_7f1a05, ImmRegRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100011000;
@@ -17110,7 +17182,7 @@ def M4_nac_up_s1_sat : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= mpy($Rs32,$Rt32):<<1:sat",
-tc_e913dc32, TypeM>, Enc_2ae154 {
+tc_d773585a, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111011;
@@ -17125,7 +17197,7 @@ def M4_or_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 |= and($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111010;
@@ -17139,7 +17211,7 @@ def M4_or_andn : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 |= and($Rs32,~$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111001;
@@ -17153,7 +17225,7 @@ def M4_or_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 |= or($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111110;
@@ -17167,7 +17239,7 @@ def M4_or_xor : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 |= xor($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111110;
@@ -17181,7 +17253,7 @@ def M4_pmpyw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = pmpyw($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101010;
@@ -17191,7 +17263,7 @@ def M4_pmpyw_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 ^= pmpyw($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111001;
@@ -17202,7 +17274,7 @@ def M4_vpmpyh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = vpmpyh($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101110;
@@ -17212,7 +17284,7 @@ def M4_vpmpyh_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 ^= vpmpyh($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111101;
@@ -17223,7 +17295,7 @@ def M4_vrmpyeh_acc_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrmpyweh($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010001;
@@ -17234,7 +17306,7 @@ def M4_vrmpyeh_acc_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrmpyweh($Rss32,$Rtt32):<<1",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010101;
@@ -17245,7 +17317,7 @@ def M4_vrmpyeh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrmpyweh($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000010;
@@ -17255,7 +17327,7 @@ def M4_vrmpyeh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrmpyweh($Rss32,$Rtt32):<<1",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000110;
@@ -17265,7 +17337,7 @@ def M4_vrmpyoh_acc_s0 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrmpywoh($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010011;
@@ -17276,7 +17348,7 @@ def M4_vrmpyoh_acc_s1 : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrmpywoh($Rss32,$Rtt32):<<1",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010111;
@@ -17287,7 +17359,7 @@ def M4_vrmpyoh_s0 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrmpywoh($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000001;
@@ -17297,7 +17369,7 @@ def M4_vrmpyoh_s1 : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrmpywoh($Rss32,$Rtt32):<<1",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -17307,7 +17379,7 @@ def M4_xor_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 ^= and($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111110;
@@ -17321,7 +17393,7 @@ def M4_xor_andn : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 ^= and($Rs32,~$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111001;
@@ -17335,7 +17407,7 @@ def M4_xor_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 ^= or($Rs32,$Rt32)",
-tc_84df2cd3, TypeM>, Enc_2ae154 {
+tc_f429765c, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111110;
@@ -17349,7 +17421,7 @@ def M4_xor_xacc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 ^= xor($Rss32,$Rtt32)",
-tc_84df2cd3, TypeS_3op>, Enc_88c16c {
+tc_f429765c, TypeS_3op>, Enc_88c16c {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001010100;
@@ -17360,7 +17432,7 @@ def M5_vdmacbsu : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010001;
@@ -17372,7 +17444,7 @@ def M5_vdmpybsu : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -17383,7 +17455,7 @@ def M5_vmacbsu : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += vmpybsu($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111110;
@@ -17394,7 +17466,7 @@ def M5_vmacbuu : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rxx32 += vmpybu($Rs32,$Rt32)",
-tc_e913dc32, TypeM>, Enc_61f0b0 {
+tc_d773585a, TypeM>, Enc_61f0b0 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100111100;
@@ -17405,7 +17477,7 @@ def M5_vmpybsu : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = vmpybsu($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101010;
@@ -17415,7 +17487,7 @@ def M5_vmpybuu : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = vmpybu($Rs32,$Rt32)",
-tc_8fd5f294, TypeM>, Enc_be32a5 {
+tc_bafaade3, TypeM>, Enc_be32a5 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11100101100;
@@ -17425,7 +17497,7 @@ def M5_vrmacbsu : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrmpybsu($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010110;
@@ -17436,7 +17508,7 @@ def M5_vrmacbuu : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vrmpybu($Rss32,$Rtt32)",
-tc_e913dc32, TypeM>, Enc_88c16c {
+tc_d773585a, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010100;
@@ -17447,7 +17519,7 @@ def M5_vrmpybsu : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrmpybsu($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000110;
@@ -17457,7 +17529,7 @@ def M5_vrmpybuu : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vrmpybu($Rss32,$Rtt32)",
-tc_8fd5f294, TypeM>, Enc_a56825 {
+tc_bafaade3, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000100;
@@ -17467,7 +17539,7 @@ def M6_vabsdiffb : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vabsdiffb($Rtt32,$Rss32)",
-tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
+tc_9461ff31, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000111;
@@ -17477,7 +17549,7 @@ def M6_vabsdiffub : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = vabsdiffub($Rtt32,$Rss32)",
-tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
+tc_9461ff31, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -17487,7 +17559,7 @@ def PS_loadrbabs : HInst<
 (outs IntRegs:$Rd32),
 (ins u32_0Imm:$Ii),
 "$Rd32 = memb(#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel {
 let Inst{24-21} = 0b1000;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -17510,7 +17582,7 @@ def PS_loadrdabs : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins u29_3Imm:$Ii),
 "$Rdd32 = memd(#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_509701, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_509701, AddrModeRel {
 let Inst{24-21} = 0b1110;
 let Inst{31-27} = 0b01001;
 let addrMode = Absolute;
@@ -17531,7 +17603,7 @@ def PS_loadrhabs : HInst<
 (outs IntRegs:$Rd32),
 (ins u31_1Imm:$Ii),
 "$Rd32 = memh(#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel {
 let Inst{24-21} = 0b1010;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -17554,7 +17626,7 @@ def PS_loadriabs : HInst<
 (outs IntRegs:$Rd32),
 (ins u30_2Imm:$Ii),
 "$Rd32 = memw(#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
 let Inst{24-21} = 0b1100;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -17577,7 +17649,7 @@ def PS_loadrubabs : HInst<
 (outs IntRegs:$Rd32),
 (ins u32_0Imm:$Ii),
 "$Rd32 = memub(#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel {
 let Inst{24-21} = 0b1001;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -17600,7 +17672,7 @@ def PS_loadruhabs : HInst<
 (outs IntRegs:$Rd32),
 (ins u31_1Imm:$Ii),
 "$Rd32 = memuh(#$Ii)",
-tc_9c98e8af, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel {
 let Inst{24-21} = 0b1011;
 let Inst{31-27} = 0b01001;
 let hasNewValue = 1;
@@ -17623,7 +17695,7 @@ def PS_storerbabs : HInst<
 (outs),
 (ins u32_0Imm:$Ii, IntRegs:$Rt32),
 "memb(#$Ii) = $Rt32",
-tc_a788683e, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
 let Inst{24-21} = 0b0000;
 let Inst{31-27} = 0b01001;
 let addrMode = Absolute;
@@ -17645,7 +17717,7 @@ def PS_storerbnewabs : HInst<
 (outs),
 (ins u32_0Imm:$Ii, IntRegs:$Nt8),
 "memb(#$Ii) = $Nt8.new",
-tc_ff9ee76e, TypeV2LDST>, Enc_ad1831, AddrModeRel {
+tc_5bf126a6, TypeV2LDST>, Enc_ad1831, AddrModeRel {
 let Inst{12-11} = 0b00;
 let Inst{24-21} = 0b0101;
 let Inst{31-27} = 0b01001;
@@ -17671,7 +17743,7 @@ def PS_storerdabs : HInst<
 (outs),
 (ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
 "memd(#$Ii) = $Rtt32",
-tc_a788683e, TypeV2LDST>, Enc_5c124a, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_5c124a, AddrModeRel {
 let Inst{24-21} = 0b0110;
 let Inst{31-27} = 0b01001;
 let addrMode = Absolute;
@@ -17692,7 +17764,7 @@ def PS_storerfabs : HInst<
 (outs),
 (ins u31_1Imm:$Ii, IntRegs:$Rt32),
 "memh(#$Ii) = $Rt32.h",
-tc_a788683e, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel {
 let Inst{24-21} = 0b0011;
 let Inst{31-27} = 0b01001;
 let addrMode = Absolute;
@@ -17713,7 +17785,7 @@ def PS_storerhabs : HInst<
 (outs),
 (ins u31_1Imm:$Ii, IntRegs:$Rt32),
 "memh(#$Ii) = $Rt32",
-tc_a788683e, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel {
 let Inst{24-21} = 0b0010;
 let Inst{31-27} = 0b01001;
 let addrMode = Absolute;
@@ -17735,7 +17807,7 @@ def PS_storerhnewabs : HInst<
 (outs),
 (ins u31_1Imm:$Ii, IntRegs:$Nt8),
 "memh(#$Ii) = $Nt8.new",
-tc_ff9ee76e, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
+tc_5bf126a6, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
 let Inst{12-11} = 0b01;
 let Inst{24-21} = 0b0101;
 let Inst{31-27} = 0b01001;
@@ -17761,7 +17833,7 @@ def PS_storeriabs : HInst<
 (outs),
 (ins u30_2Imm:$Ii, IntRegs:$Rt32),
 "memw(#$Ii) = $Rt32",
-tc_a788683e, TypeV2LDST>, Enc_541f26, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_541f26, AddrModeRel {
 let Inst{24-21} = 0b0100;
 let Inst{31-27} = 0b01001;
 let addrMode = Absolute;
@@ -17783,7 +17855,7 @@ def PS_storerinewabs : HInst<
 (outs),
 (ins u30_2Imm:$Ii, IntRegs:$Nt8),
 "memw(#$Ii) = $Nt8.new",
-tc_ff9ee76e, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
+tc_5bf126a6, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
 let Inst{12-11} = 0b10;
 let Inst{24-21} = 0b0101;
 let Inst{31-27} = 0b01001;
@@ -17809,7 +17881,7 @@ def S2_addasl_rrri : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32, u3_0Imm:$Ii),
 "$Rd32 = addasl($Rt32,$Rs32,#$Ii)",
-tc_c74f796f, TypeS_3op>, Enc_47ef61 {
+tc_f675fee8, TypeS_3op>, Enc_47ef61 {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000100000;
 let hasNewValue = 1;
@@ -17820,7 +17892,7 @@ def S2_allocframe : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, u11_3Imm:$Ii),
 "allocframe($Rx32,#$Ii):raw",
-tc_e216a5db, TypeST>, Enc_22c845 {
+tc_b44ecf75, TypeST>, Enc_22c845 {
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b10100000100;
 let hasNewValue = 1;
@@ -17836,7 +17908,7 @@ def S2_asl_i_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = asl($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_5eac98 {
+tc_946df596, TypeS_2op>, Enc_5eac98 {
 let Inst{7-5} = 0b010;
 let Inst{31-21} = 0b10000000000;
 }
@@ -17844,7 +17916,7 @@ def S2_asl_i_p_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 += asl($Rss32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_70fb07 {
+tc_f675fee8, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b110;
 let Inst{31-21} = 0b10000010000;
 let prefersSlot3 = 1;
@@ -17854,7 +17926,7 @@ def S2_asl_i_p_and : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 &= asl($Rss32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_70fb07 {
+tc_f429765c, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b010;
 let Inst{31-21} = 0b10000010010;
 let prefersSlot3 = 1;
@@ -17864,7 +17936,7 @@ def S2_asl_i_p_nac : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 -= asl($Rss32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_70fb07 {
+tc_f675fee8, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b010;
 let Inst{31-21} = 0b10000010000;
 let prefersSlot3 = 1;
@@ -17874,7 +17946,7 @@ def S2_asl_i_p_or : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 |= asl($Rss32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_70fb07 {
+tc_f429765c, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b110;
 let Inst{31-21} = 0b10000010010;
 let prefersSlot3 = 1;
@@ -17884,7 +17956,7 @@ def S2_asl_i_p_xacc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 ^= asl($Rss32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_70fb07 {
+tc_f429765c, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b010;
 let Inst{31-21} = 0b10000010100;
 let prefersSlot3 = 1;
@@ -17894,7 +17966,7 @@ def S2_asl_i_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = asl($Rs32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_a05677 {
+tc_946df596, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100000;
@@ -17905,7 +17977,7 @@ def S2_asl_i_r_acc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 += asl($Rs32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_28a2dc {
+tc_f675fee8, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110000;
@@ -17918,7 +17990,7 @@ def S2_asl_i_r_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 &= asl($Rs32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_28a2dc {
+tc_f429765c, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110010;
@@ -17931,7 +18003,7 @@ def S2_asl_i_r_nac : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 -= asl($Rs32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_28a2dc {
+tc_f675fee8, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110000;
@@ -17944,7 +18016,7 @@ def S2_asl_i_r_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 |= asl($Rs32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_28a2dc {
+tc_f429765c, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110010;
@@ -17957,7 +18029,7 @@ def S2_asl_i_r_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = asl($Rs32,#$Ii):sat",
-tc_b44c6e2a, TypeS_2op>, Enc_a05677 {
+tc_779080bf, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100010;
@@ -17970,7 +18042,7 @@ def S2_asl_i_r_xacc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 ^= asl($Rs32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_28a2dc {
+tc_f429765c, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110100;
@@ -17983,7 +18055,7 @@ def S2_asl_i_vh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vaslh($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_12b6e9 {
+tc_946df596, TypeS_2op>, Enc_12b6e9 {
 let Inst{7-5} = 0b010;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10000000100;
@@ -17992,7 +18064,7 @@ def S2_asl_i_vw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
 "$Rdd32 = vaslw($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_7e5a82 {
+tc_946df596, TypeS_2op>, Enc_7e5a82 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10000000010;
@@ -18001,7 +18073,7 @@ def S2_asl_r_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = asl($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011100;
@@ -18010,7 +18082,7 @@ def S2_asl_r_p_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 += asl($Rss32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_1aa186 {
+tc_f675fee8, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011110;
@@ -18021,7 +18093,7 @@ def S2_asl_r_p_and : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 &= asl($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011010;
@@ -18032,7 +18104,7 @@ def S2_asl_r_p_nac : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 -= asl($Rss32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_1aa186 {
+tc_f675fee8, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011100;
@@ -18043,7 +18115,7 @@ def S2_asl_r_p_or : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 |= asl($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011000;
@@ -18054,7 +18126,7 @@ def S2_asl_r_p_xor : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 ^= asl($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011011;
@@ -18065,7 +18137,7 @@ def S2_asl_r_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = asl($Rs32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_5ab2be {
+tc_946df596, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110010;
@@ -18076,7 +18148,7 @@ def S2_asl_r_r_acc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += asl($Rs32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_2ae154 {
+tc_f675fee8, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100110;
@@ -18089,7 +18161,7 @@ def S2_asl_r_r_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 &= asl($Rs32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_2ae154 {
+tc_f429765c, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100010;
@@ -18102,7 +18174,7 @@ def S2_asl_r_r_nac : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= asl($Rs32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_2ae154 {
+tc_f675fee8, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100100;
@@ -18115,7 +18187,7 @@ def S2_asl_r_r_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 |= asl($Rs32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_2ae154 {
+tc_f429765c, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100000;
@@ -18128,7 +18200,7 @@ def S2_asl_r_r_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = asl($Rs32,$Rt32):sat",
-tc_b44c6e2a, TypeS_3op>, Enc_5ab2be {
+tc_779080bf, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110000;
@@ -18141,7 +18213,7 @@ def S2_asl_r_vh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vaslh($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011010;
@@ -18150,7 +18222,7 @@ def S2_asl_r_vw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vaslw($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011000;
@@ -18159,7 +18231,7 @@ def S2_asr_i_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = asr($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_5eac98 {
+tc_946df596, TypeS_2op>, Enc_5eac98 {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b10000000000;
 }
@@ -18167,7 +18239,7 @@ def S2_asr_i_p_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 += asr($Rss32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_70fb07 {
+tc_f675fee8, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b100;
 let Inst{31-21} = 0b10000010000;
 let prefersSlot3 = 1;
@@ -18177,7 +18249,7 @@ def S2_asr_i_p_and : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 &= asr($Rss32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_70fb07 {
+tc_f429765c, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b10000010010;
 let prefersSlot3 = 1;
@@ -18187,7 +18259,7 @@ def S2_asr_i_p_nac : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 -= asr($Rss32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_70fb07 {
+tc_f675fee8, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b10000010000;
 let prefersSlot3 = 1;
@@ -18197,7 +18269,7 @@ def S2_asr_i_p_or : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 |= asr($Rss32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_70fb07 {
+tc_f429765c, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b100;
 let Inst{31-21} = 0b10000010010;
 let prefersSlot3 = 1;
@@ -18207,7 +18279,7 @@ def S2_asr_i_p_rnd : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = asr($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Enc_5eac98 {
+tc_002cb246, TypeS_2op>, Enc_5eac98 {
 let Inst{7-5} = 0b111;
 let Inst{31-21} = 0b10000000110;
 let prefersSlot3 = 1;
@@ -18216,14 +18288,14 @@ def S2_asr_i_p_rnd_goodsyntax : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = asrrnd($Rss32,#$Ii)",
-tc_2b6f77c6, TypeS_2op> {
+tc_002cb246, TypeS_2op> {
 let isPseudo = 1;
 }
 def S2_asr_i_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = asr($Rs32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_a05677 {
+tc_946df596, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100000;
@@ -18234,7 +18306,7 @@ def S2_asr_i_r_acc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 += asr($Rs32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_28a2dc {
+tc_f675fee8, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110000;
@@ -18247,7 +18319,7 @@ def S2_asr_i_r_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 &= asr($Rs32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_28a2dc {
+tc_f429765c, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110010;
@@ -18260,7 +18332,7 @@ def S2_asr_i_r_nac : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 -= asr($Rs32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_28a2dc {
+tc_f675fee8, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110000;
@@ -18273,7 +18345,7 @@ def S2_asr_i_r_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 |= asr($Rs32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_28a2dc {
+tc_f429765c, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110010;
@@ -18286,7 +18358,7 @@ def S2_asr_i_r_rnd : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = asr($Rs32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Enc_a05677 {
+tc_002cb246, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100010;
@@ -18298,7 +18370,7 @@ def S2_asr_i_r_rnd_goodsyntax : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = asrrnd($Rs32,#$Ii)",
-tc_2b6f77c6, TypeS_2op> {
+tc_002cb246, TypeS_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -18307,7 +18379,7 @@ def S2_asr_i_svw_trun : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
 "$Rd32 = vasrw($Rss32,#$Ii)",
-tc_1b9c9ee5, TypeS_2op>, Enc_8dec2e {
+tc_4414d8b1, TypeS_2op>, Enc_8dec2e {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001000110;
@@ -18319,7 +18391,7 @@ def S2_asr_i_vh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vasrh($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_12b6e9 {
+tc_946df596, TypeS_2op>, Enc_12b6e9 {
 let Inst{7-5} = 0b000;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10000000100;
@@ -18328,7 +18400,7 @@ def S2_asr_i_vw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
 "$Rdd32 = vasrw($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_7e5a82 {
+tc_946df596, TypeS_2op>, Enc_7e5a82 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10000000010;
@@ -18337,7 +18409,7 @@ def S2_asr_r_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = asr($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011100;
@@ -18346,7 +18418,7 @@ def S2_asr_r_p_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 += asr($Rss32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_1aa186 {
+tc_f675fee8, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011110;
@@ -18357,7 +18429,7 @@ def S2_asr_r_p_and : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 &= asr($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011010;
@@ -18368,7 +18440,7 @@ def S2_asr_r_p_nac : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 -= asr($Rss32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_1aa186 {
+tc_f675fee8, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011100;
@@ -18379,7 +18451,7 @@ def S2_asr_r_p_or : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 |= asr($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011000;
@@ -18390,7 +18462,7 @@ def S2_asr_r_p_xor : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 ^= asr($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011011;
@@ -18401,7 +18473,7 @@ def S2_asr_r_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = asr($Rs32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_5ab2be {
+tc_946df596, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110010;
@@ -18412,7 +18484,7 @@ def S2_asr_r_r_acc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += asr($Rs32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_2ae154 {
+tc_f675fee8, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100110;
@@ -18425,7 +18497,7 @@ def S2_asr_r_r_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 &= asr($Rs32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_2ae154 {
+tc_f429765c, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100010;
@@ -18438,7 +18510,7 @@ def S2_asr_r_r_nac : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= asr($Rs32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_2ae154 {
+tc_f675fee8, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100100;
@@ -18451,7 +18523,7 @@ def S2_asr_r_r_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 |= asr($Rs32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_2ae154 {
+tc_f429765c, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100000;
@@ -18464,7 +18536,7 @@ def S2_asr_r_r_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = asr($Rs32,$Rt32):sat",
-tc_b44c6e2a, TypeS_3op>, Enc_5ab2be {
+tc_779080bf, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110000;
@@ -18477,7 +18549,7 @@ def S2_asr_r_svw_trun : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = vasrw($Rss32,$Rt32)",
-tc_1b9c9ee5, TypeS_3op>, Enc_3d5b28 {
+tc_4414d8b1, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -18489,7 +18561,7 @@ def S2_asr_r_vh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vasrh($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011010;
@@ -18498,7 +18570,7 @@ def S2_asr_r_vw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vasrw($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011000;
@@ -18507,7 +18579,7 @@ def S2_brev : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = brev($Rs32)",
-tc_d088982c, TypeS_2op>, Enc_5e2823 {
+tc_14b5c689, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10001100010;
 let hasNewValue = 1;
@@ -18518,7 +18590,7 @@ def S2_brevp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = brev($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_b9c5fb {
+tc_14b5c689, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000000110;
 let prefersSlot3 = 1;
@@ -18527,7 +18599,7 @@ def S2_cabacdecbin : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = decbin($Rss32,$Rtt32)",
-tc_c6ebf8dd, TypeS_3op>, Enc_a56825 {
+tc_76851da1, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001110;
@@ -18539,7 +18611,7 @@ def S2_cl0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = cl0($Rs32)",
-tc_d088982c, TypeS_2op>, Enc_5e2823 {
+tc_14b5c689, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10001100000;
 let hasNewValue = 1;
@@ -18550,7 +18622,7 @@ def S2_cl0p : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = cl0($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_90cd8b {
+tc_14b5c689, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10001000010;
 let hasNewValue = 1;
@@ -18561,7 +18633,7 @@ def S2_cl1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = cl1($Rs32)",
-tc_d088982c, TypeS_2op>, Enc_5e2823 {
+tc_14b5c689, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10001100000;
 let hasNewValue = 1;
@@ -18572,7 +18644,7 @@ def S2_cl1p : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = cl1($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_90cd8b {
+tc_14b5c689, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10001000010;
 let hasNewValue = 1;
@@ -18583,7 +18655,7 @@ def S2_clb : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = clb($Rs32)",
-tc_d088982c, TypeS_2op>, Enc_5e2823 {
+tc_14b5c689, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10001100000;
 let hasNewValue = 1;
@@ -18594,7 +18666,7 @@ def S2_clbnorm : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = normamt($Rs32)",
-tc_d088982c, TypeS_2op>, Enc_5e2823 {
+tc_14b5c689, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10001100000;
 let hasNewValue = 1;
@@ -18605,7 +18677,7 @@ def S2_clbp : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = clb($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_90cd8b {
+tc_14b5c689, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001000010;
 let hasNewValue = 1;
@@ -18616,7 +18688,7 @@ def S2_clrbit_i : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = clrbit($Rs32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_a05677 {
+tc_946df596, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100110;
@@ -18627,7 +18699,7 @@ def S2_clrbit_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = clrbit($Rs32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_5ab2be {
+tc_946df596, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110100;
@@ -18638,7 +18710,7 @@ def S2_ct0 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = ct0($Rs32)",
-tc_d088982c, TypeS_2op>, Enc_5e2823 {
+tc_14b5c689, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10001100010;
 let hasNewValue = 1;
@@ -18649,7 +18721,7 @@ def S2_ct0p : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = ct0($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_90cd8b {
+tc_14b5c689, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10001000111;
 let hasNewValue = 1;
@@ -18660,7 +18732,7 @@ def S2_ct1 : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = ct1($Rs32)",
-tc_d088982c, TypeS_2op>, Enc_5e2823 {
+tc_14b5c689, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10001100010;
 let hasNewValue = 1;
@@ -18671,7 +18743,7 @@ def S2_ct1p : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = ct1($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_90cd8b {
+tc_14b5c689, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10001000111;
 let hasNewValue = 1;
@@ -18682,7 +18754,7 @@ def S2_deinterleave : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = deinterleave($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_b9c5fb {
+tc_14b5c689, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000000110;
 let prefersSlot3 = 1;
@@ -18691,7 +18763,7 @@ def S2_extractu : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
 "$Rd32 = extractu($Rs32,#$Ii,#$II)",
-tc_c74f796f, TypeS_2op>, Enc_b388cf {
+tc_f675fee8, TypeS_2op>, Enc_b388cf {
 let Inst{13-13} = 0b0;
 let Inst{31-23} = 0b100011010;
 let hasNewValue = 1;
@@ -18702,7 +18774,7 @@ def S2_extractu_rp : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "$Rd32 = extractu($Rs32,$Rtt32)",
-tc_2b6f77c6, TypeS_3op>, Enc_e07374 {
+tc_002cb246, TypeS_3op>, Enc_e07374 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001001000;
@@ -18714,7 +18786,7 @@ def S2_extractup : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
 "$Rdd32 = extractu($Rss32,#$Ii,#$II)",
-tc_c74f796f, TypeS_2op>, Enc_b84c4c {
+tc_f675fee8, TypeS_2op>, Enc_b84c4c {
 let Inst{31-24} = 0b10000001;
 let prefersSlot3 = 1;
 }
@@ -18722,7 +18794,7 @@ def S2_extractup_rp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = extractu($Rss32,$Rtt32)",
-tc_2b6f77c6, TypeS_3op>, Enc_a56825 {
+tc_002cb246, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001000;
@@ -18732,7 +18804,7 @@ def S2_insert : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
 "$Rx32 = insert($Rs32,#$Ii,#$II)",
-tc_87735c3b, TypeS_2op>, Enc_a1e29d {
+tc_bfec0f01, TypeS_2op>, Enc_a1e29d {
 let Inst{13-13} = 0b0;
 let Inst{31-23} = 0b100011110;
 let hasNewValue = 1;
@@ -18744,7 +18816,7 @@ def S2_insert_rp : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "$Rx32 = insert($Rs32,$Rtt32)",
-tc_84df2cd3, TypeS_3op>, Enc_179b35 {
+tc_f429765c, TypeS_3op>, Enc_179b35 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001000000;
@@ -18757,7 +18829,7 @@ def S2_insertp : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
 "$Rxx32 = insert($Rss32,#$Ii,#$II)",
-tc_87735c3b, TypeS_2op>, Enc_143a3c {
+tc_bfec0f01, TypeS_2op>, Enc_143a3c {
 let Inst{31-24} = 0b10000011;
 let prefersSlot3 = 1;
 let Constraints = "$Rxx32 = $Rxx32in";
@@ -18766,7 +18838,7 @@ def S2_insertp_rp : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 = insert($Rss32,$Rtt32)",
-tc_84df2cd3, TypeS_3op>, Enc_88c16c {
+tc_f429765c, TypeS_3op>, Enc_88c16c {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001010000;
@@ -18777,7 +18849,7 @@ def S2_interleave : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = interleave($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_b9c5fb {
+tc_14b5c689, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10000000110;
 let prefersSlot3 = 1;
@@ -18786,7 +18858,7 @@ def S2_lfsp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = lfs($Rss32,$Rtt32)",
-tc_2b6f77c6, TypeS_3op>, Enc_a56825 {
+tc_002cb246, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001100;
@@ -18796,7 +18868,7 @@ def S2_lsl_r_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = lsl($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011100;
@@ -18805,7 +18877,7 @@ def S2_lsl_r_p_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 += lsl($Rss32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_1aa186 {
+tc_f675fee8, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011110;
@@ -18816,7 +18888,7 @@ def S2_lsl_r_p_and : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 &= lsl($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011010;
@@ -18827,7 +18899,7 @@ def S2_lsl_r_p_nac : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 -= lsl($Rss32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_1aa186 {
+tc_f675fee8, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011100;
@@ -18838,7 +18910,7 @@ def S2_lsl_r_p_or : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 |= lsl($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011000;
@@ -18849,7 +18921,7 @@ def S2_lsl_r_p_xor : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 ^= lsl($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011011;
@@ -18860,7 +18932,7 @@ def S2_lsl_r_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = lsl($Rs32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_5ab2be {
+tc_946df596, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110010;
@@ -18871,7 +18943,7 @@ def S2_lsl_r_r_acc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += lsl($Rs32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_2ae154 {
+tc_f675fee8, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100110;
@@ -18884,7 +18956,7 @@ def S2_lsl_r_r_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 &= lsl($Rs32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_2ae154 {
+tc_f429765c, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100010;
@@ -18897,7 +18969,7 @@ def S2_lsl_r_r_nac : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= lsl($Rs32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_2ae154 {
+tc_f675fee8, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100100;
@@ -18910,7 +18982,7 @@ def S2_lsl_r_r_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 |= lsl($Rs32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_2ae154 {
+tc_f429765c, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100000;
@@ -18923,7 +18995,7 @@ def S2_lsl_r_vh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vlslh($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011010;
@@ -18932,7 +19004,7 @@ def S2_lsl_r_vw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vlslw($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011000;
@@ -18941,7 +19013,7 @@ def S2_lsr_i_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = lsr($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_5eac98 {
+tc_946df596, TypeS_2op>, Enc_5eac98 {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b10000000000;
 }
@@ -18949,7 +19021,7 @@ def S2_lsr_i_p_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 += lsr($Rss32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_70fb07 {
+tc_f675fee8, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b101;
 let Inst{31-21} = 0b10000010000;
 let prefersSlot3 = 1;
@@ -18959,7 +19031,7 @@ def S2_lsr_i_p_and : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 &= lsr($Rss32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_70fb07 {
+tc_f429765c, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b10000010010;
 let prefersSlot3 = 1;
@@ -18969,7 +19041,7 @@ def S2_lsr_i_p_nac : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 -= lsr($Rss32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_70fb07 {
+tc_f675fee8, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b10000010000;
 let prefersSlot3 = 1;
@@ -18979,7 +19051,7 @@ def S2_lsr_i_p_or : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 |= lsr($Rss32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_70fb07 {
+tc_f429765c, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b101;
 let Inst{31-21} = 0b10000010010;
 let prefersSlot3 = 1;
@@ -18989,7 +19061,7 @@ def S2_lsr_i_p_xacc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 ^= lsr($Rss32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_70fb07 {
+tc_f429765c, TypeS_2op>, Enc_70fb07 {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b10000010100;
 let prefersSlot3 = 1;
@@ -18999,7 +19071,7 @@ def S2_lsr_i_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = lsr($Rs32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_a05677 {
+tc_946df596, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100000;
@@ -19010,7 +19082,7 @@ def S2_lsr_i_r_acc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 += lsr($Rs32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_28a2dc {
+tc_f675fee8, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110000;
@@ -19023,7 +19095,7 @@ def S2_lsr_i_r_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 &= lsr($Rs32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_28a2dc {
+tc_f429765c, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110010;
@@ -19036,7 +19108,7 @@ def S2_lsr_i_r_nac : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 -= lsr($Rs32,#$Ii)",
-tc_c74f796f, TypeS_2op>, Enc_28a2dc {
+tc_f675fee8, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110000;
@@ -19049,7 +19121,7 @@ def S2_lsr_i_r_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 |= lsr($Rs32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_28a2dc {
+tc_f429765c, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110010;
@@ -19062,7 +19134,7 @@ def S2_lsr_i_r_xacc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 ^= lsr($Rs32,#$Ii)",
-tc_84df2cd3, TypeS_2op>, Enc_28a2dc {
+tc_f429765c, TypeS_2op>, Enc_28a2dc {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110100;
@@ -19075,7 +19147,7 @@ def S2_lsr_i_vh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vlsrh($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_12b6e9 {
+tc_946df596, TypeS_2op>, Enc_12b6e9 {
 let Inst{7-5} = 0b001;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10000000100;
@@ -19084,7 +19156,7 @@ def S2_lsr_i_vw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
 "$Rdd32 = vlsrw($Rss32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_7e5a82 {
+tc_946df596, TypeS_2op>, Enc_7e5a82 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10000000010;
@@ -19093,7 +19165,7 @@ def S2_lsr_r_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = lsr($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011100;
@@ -19102,7 +19174,7 @@ def S2_lsr_r_p_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 += lsr($Rss32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_1aa186 {
+tc_f675fee8, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011110;
@@ -19113,7 +19185,7 @@ def S2_lsr_r_p_and : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 &= lsr($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011010;
@@ -19124,7 +19196,7 @@ def S2_lsr_r_p_nac : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 -= lsr($Rss32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_1aa186 {
+tc_f675fee8, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011100;
@@ -19135,7 +19207,7 @@ def S2_lsr_r_p_or : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 |= lsr($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011000;
@@ -19146,7 +19218,7 @@ def S2_lsr_r_p_xor : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 ^= lsr($Rss32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_1aa186 {
+tc_f429765c, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001011011;
@@ -19157,7 +19229,7 @@ def S2_lsr_r_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = lsr($Rs32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_5ab2be {
+tc_946df596, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110010;
@@ -19168,7 +19240,7 @@ def S2_lsr_r_r_acc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += lsr($Rs32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_2ae154 {
+tc_f675fee8, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100110;
@@ -19181,7 +19253,7 @@ def S2_lsr_r_r_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 &= lsr($Rs32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_2ae154 {
+tc_f429765c, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100010;
@@ -19194,7 +19266,7 @@ def S2_lsr_r_r_nac : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= lsr($Rs32,$Rt32)",
-tc_c74f796f, TypeS_3op>, Enc_2ae154 {
+tc_f675fee8, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100100;
@@ -19207,7 +19279,7 @@ def S2_lsr_r_r_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 |= lsr($Rs32,$Rt32)",
-tc_84df2cd3, TypeS_3op>, Enc_2ae154 {
+tc_f429765c, TypeS_3op>, Enc_2ae154 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001100000;
@@ -19220,7 +19292,7 @@ def S2_lsr_r_vh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vlsrh($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011010;
@@ -19229,16 +19301,28 @@ def S2_lsr_r_vw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vlsrw($Rss32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_927852 {
+tc_946df596, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011000;
 }
+def S2_mask : HInst<
+(outs IntRegs:$Rd32),
+(ins u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rd32 = mask(#$Ii,#$II)",
+tc_9461ff31, TypeS_2op>, Enc_c85e2a, Requires<[HasV66]> {
+let Inst{13-13} = 0b1;
+let Inst{20-16} = 0b00000;
+let Inst{31-23} = 0b100011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
 def S2_packhl : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = packhl($Rs32,$Rt32)",
-tc_b9488031, TypeALU32_3op>, Enc_be32a5 {
+tc_5a2711e5, TypeALU32_3op>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11110101100;
@@ -19248,7 +19332,7 @@ def S2_parityp : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = parity($Rss32,$Rtt32)",
-tc_2b6f77c6, TypeALU64>, Enc_d2216a {
+tc_002cb246, TypeALU64>, Enc_d2216a {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010000000;
@@ -19260,7 +19344,7 @@ def S2_pstorerbf_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memb($Rs32+#$Ii) = $Rt32",
-tc_8b15472a, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_da8d43, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000100000;
 let isPredicated = 1;
@@ -19282,7 +19366,7 @@ def S2_pstorerbf_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memb($Rx32++#$Ii) = $Rt32",
-tc_cd7374a0, TypeST>, Enc_cc449f, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_cc449f, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -19300,7 +19384,7 @@ def S2_pstorerbf_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pv4) memb($Rs32) = $Rt32",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -19308,7 +19392,7 @@ def S2_pstorerbfnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memb($Rx32++#$Ii) = $Rt32",
-tc_74e47fd9, TypeST>, Enc_cc449f, AddrModeRel {
+tc_53559e35, TypeST>, Enc_cc449f, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -19327,7 +19411,7 @@ def S2_pstorerbnewf_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memb($Rs32+#$Ii) = $Nt8.new",
-tc_594ab548, TypeV2LDST>, Enc_585242, AddrModeRel {
+tc_8fb7ab1b, TypeV2LDST>, Enc_585242, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b01000100101;
@@ -19353,7 +19437,7 @@ def S2_pstorerbnewf_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memb($Rx32++#$Ii) = $Nt8.new",
-tc_d9f95eef, TypeST>, Enc_52a5dd, AddrModeRel {
+tc_838b34ea, TypeST>, Enc_52a5dd, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b100;
@@ -19375,7 +19459,7 @@ def S2_pstorerbnewf_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if (!$Pv4) memb($Rs32) = $Nt8.new",
-tc_594ab548, TypeMAPPING> {
+tc_8fb7ab1b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -19384,7 +19468,7 @@ def S2_pstorerbnewfnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
-tc_d24b2d85, TypeST>, Enc_52a5dd, AddrModeRel {
+tc_d65dbf51, TypeST>, Enc_52a5dd, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b100;
@@ -19407,7 +19491,7 @@ def S2_pstorerbnewt_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memb($Rs32+#$Ii) = $Nt8.new",
-tc_594ab548, TypeV2LDST>, Enc_585242, AddrModeRel {
+tc_8fb7ab1b, TypeV2LDST>, Enc_585242, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b01000000101;
@@ -19432,7 +19516,7 @@ def S2_pstorerbnewt_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memb($Rx32++#$Ii) = $Nt8.new",
-tc_d9f95eef, TypeST>, Enc_52a5dd, AddrModeRel {
+tc_838b34ea, TypeST>, Enc_52a5dd, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b100;
@@ -19453,7 +19537,7 @@ def S2_pstorerbnewt_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if ($Pv4) memb($Rs32) = $Nt8.new",
-tc_594ab548, TypeMAPPING> {
+tc_8fb7ab1b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -19462,7 +19546,7 @@ def S2_pstorerbnewtnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
-tc_d24b2d85, TypeST>, Enc_52a5dd, AddrModeRel {
+tc_d65dbf51, TypeST>, Enc_52a5dd, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b100;
@@ -19484,7 +19568,7 @@ def S2_pstorerbt_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memb($Rs32+#$Ii) = $Rt32",
-tc_8b15472a, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_da8d43, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000000000;
 let isPredicated = 1;
@@ -19505,7 +19589,7 @@ def S2_pstorerbt_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memb($Rx32++#$Ii) = $Rt32",
-tc_cd7374a0, TypeST>, Enc_cc449f, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_cc449f, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -19522,7 +19606,7 @@ def S2_pstorerbt_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pv4) memb($Rs32) = $Rt32",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -19530,7 +19614,7 @@ def S2_pstorerbtnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memb($Rx32++#$Ii) = $Rt32",
-tc_74e47fd9, TypeST>, Enc_cc449f, AddrModeRel {
+tc_53559e35, TypeST>, Enc_cc449f, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -19548,7 +19632,7 @@ def S2_pstorerdf_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
 "if (!$Pv4) memd($Rs32+#$Ii) = $Rtt32",
-tc_8b15472a, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_57a33e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000100110;
 let isPredicated = 1;
@@ -19569,7 +19653,7 @@ def S2_pstorerdf_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
 "if (!$Pv4) memd($Rx32++#$Ii) = $Rtt32",
-tc_cd7374a0, TypeST>, Enc_9a33d5, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_9a33d5, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -19587,7 +19671,7 @@ def S2_pstorerdf_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "if (!$Pv4) memd($Rs32) = $Rtt32",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -19595,7 +19679,7 @@ def S2_pstorerdfnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
 "if (!$Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
-tc_74e47fd9, TypeST>, Enc_9a33d5, AddrModeRel {
+tc_53559e35, TypeST>, Enc_9a33d5, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -19614,7 +19698,7 @@ def S2_pstorerdt_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
 "if ($Pv4) memd($Rs32+#$Ii) = $Rtt32",
-tc_8b15472a, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_57a33e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000000110;
 let isPredicated = 1;
@@ -19634,7 +19718,7 @@ def S2_pstorerdt_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
 "if ($Pv4) memd($Rx32++#$Ii) = $Rtt32",
-tc_cd7374a0, TypeST>, Enc_9a33d5, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_9a33d5, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -19651,7 +19735,7 @@ def S2_pstorerdt_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "if ($Pv4) memd($Rs32) = $Rtt32",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -19659,7 +19743,7 @@ def S2_pstorerdtnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
 "if ($Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
-tc_74e47fd9, TypeST>, Enc_9a33d5, AddrModeRel {
+tc_53559e35, TypeST>, Enc_9a33d5, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -19677,7 +19761,7 @@ def S2_pstorerff_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memh($Rs32+#$Ii) = $Rt32.h",
-tc_8b15472a, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000100011;
 let isPredicated = 1;
@@ -19698,7 +19782,7 @@ def S2_pstorerff_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memh($Rx32++#$Ii) = $Rt32.h",
-tc_cd7374a0, TypeST>, Enc_b886fd, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -19716,7 +19800,7 @@ def S2_pstorerff_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pv4) memh($Rs32) = $Rt32.h",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -19724,7 +19808,7 @@ def S2_pstorerffnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
-tc_74e47fd9, TypeST>, Enc_b886fd, AddrModeRel {
+tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -19743,7 +19827,7 @@ def S2_pstorerft_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memh($Rs32+#$Ii) = $Rt32.h",
-tc_8b15472a, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000000011;
 let isPredicated = 1;
@@ -19763,7 +19847,7 @@ def S2_pstorerft_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memh($Rx32++#$Ii) = $Rt32.h",
-tc_cd7374a0, TypeST>, Enc_b886fd, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -19780,7 +19864,7 @@ def S2_pstorerft_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pv4) memh($Rs32) = $Rt32.h",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -19788,7 +19872,7 @@ def S2_pstorerftnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
-tc_74e47fd9, TypeST>, Enc_b886fd, AddrModeRel {
+tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -19806,7 +19890,7 @@ def S2_pstorerhf_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memh($Rs32+#$Ii) = $Rt32",
-tc_8b15472a, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000100010;
 let isPredicated = 1;
@@ -19828,7 +19912,7 @@ def S2_pstorerhf_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memh($Rx32++#$Ii) = $Rt32",
-tc_cd7374a0, TypeST>, Enc_b886fd, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -19846,7 +19930,7 @@ def S2_pstorerhf_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pv4) memh($Rs32) = $Rt32",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -19854,7 +19938,7 @@ def S2_pstorerhfnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32",
-tc_74e47fd9, TypeST>, Enc_b886fd, AddrModeRel {
+tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -19873,7 +19957,7 @@ def S2_pstorerhnewf_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memh($Rs32+#$Ii) = $Nt8.new",
-tc_594ab548, TypeV2LDST>, Enc_f44229, AddrModeRel {
+tc_8fb7ab1b, TypeV2LDST>, Enc_f44229, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b01;
 let Inst{31-21} = 0b01000100101;
@@ -19899,7 +19983,7 @@ def S2_pstorerhnewf_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memh($Rx32++#$Ii) = $Nt8.new",
-tc_d9f95eef, TypeST>, Enc_31aa6a, AddrModeRel {
+tc_838b34ea, TypeST>, Enc_31aa6a, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b101;
@@ -19921,7 +20005,7 @@ def S2_pstorerhnewf_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if (!$Pv4) memh($Rs32) = $Nt8.new",
-tc_594ab548, TypeMAPPING> {
+tc_8fb7ab1b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -19930,7 +20014,7 @@ def S2_pstorerhnewfnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
-tc_d24b2d85, TypeST>, Enc_31aa6a, AddrModeRel {
+tc_d65dbf51, TypeST>, Enc_31aa6a, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b101;
@@ -19953,7 +20037,7 @@ def S2_pstorerhnewt_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memh($Rs32+#$Ii) = $Nt8.new",
-tc_594ab548, TypeV2LDST>, Enc_f44229, AddrModeRel {
+tc_8fb7ab1b, TypeV2LDST>, Enc_f44229, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b01;
 let Inst{31-21} = 0b01000000101;
@@ -19978,7 +20062,7 @@ def S2_pstorerhnewt_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memh($Rx32++#$Ii) = $Nt8.new",
-tc_d9f95eef, TypeST>, Enc_31aa6a, AddrModeRel {
+tc_838b34ea, TypeST>, Enc_31aa6a, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b101;
@@ -19999,7 +20083,7 @@ def S2_pstorerhnewt_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if ($Pv4) memh($Rs32) = $Nt8.new",
-tc_594ab548, TypeMAPPING> {
+tc_8fb7ab1b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -20008,7 +20092,7 @@ def S2_pstorerhnewtnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
-tc_d24b2d85, TypeST>, Enc_31aa6a, AddrModeRel {
+tc_d65dbf51, TypeST>, Enc_31aa6a, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b101;
@@ -20030,7 +20114,7 @@ def S2_pstorerht_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memh($Rs32+#$Ii) = $Rt32",
-tc_8b15472a, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000000010;
 let isPredicated = 1;
@@ -20051,7 +20135,7 @@ def S2_pstorerht_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memh($Rx32++#$Ii) = $Rt32",
-tc_cd7374a0, TypeST>, Enc_b886fd, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -20068,7 +20152,7 @@ def S2_pstorerht_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pv4) memh($Rs32) = $Rt32",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -20076,7 +20160,7 @@ def S2_pstorerhtnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32",
-tc_74e47fd9, TypeST>, Enc_b886fd, AddrModeRel {
+tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -20094,7 +20178,7 @@ def S2_pstorerif_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memw($Rs32+#$Ii) = $Rt32",
-tc_8b15472a, TypeV2LDST>, Enc_397f23, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_397f23, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000100100;
 let isPredicated = 1;
@@ -20116,7 +20200,7 @@ def S2_pstorerif_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memw($Rx32++#$Ii) = $Rt32",
-tc_cd7374a0, TypeST>, Enc_7eaeb6, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_7eaeb6, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -20134,7 +20218,7 @@ def S2_pstorerif_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pv4) memw($Rs32) = $Rt32",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -20142,7 +20226,7 @@ def S2_pstorerifnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memw($Rx32++#$Ii) = $Rt32",
-tc_74e47fd9, TypeST>, Enc_7eaeb6, AddrModeRel {
+tc_53559e35, TypeST>, Enc_7eaeb6, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -20162,7 +20246,7 @@ def S2_pstorerinewf_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memw($Rs32+#$Ii) = $Nt8.new",
-tc_594ab548, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+tc_8fb7ab1b, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b10;
 let Inst{31-21} = 0b01000100101;
@@ -20188,7 +20272,7 @@ def S2_pstorerinewf_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memw($Rx32++#$Ii) = $Nt8.new",
-tc_d9f95eef, TypeST>, Enc_65f095, AddrModeRel {
+tc_838b34ea, TypeST>, Enc_65f095, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b110;
@@ -20210,7 +20294,7 @@ def S2_pstorerinewf_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if (!$Pv4) memw($Rs32) = $Nt8.new",
-tc_594ab548, TypeMAPPING> {
+tc_8fb7ab1b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -20219,7 +20303,7 @@ def S2_pstorerinewfnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
-tc_d24b2d85, TypeST>, Enc_65f095, AddrModeRel {
+tc_d65dbf51, TypeST>, Enc_65f095, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b110;
@@ -20242,7 +20326,7 @@ def S2_pstorerinewt_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memw($Rs32+#$Ii) = $Nt8.new",
-tc_594ab548, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+tc_8fb7ab1b, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b10;
 let Inst{31-21} = 0b01000000101;
@@ -20267,7 +20351,7 @@ def S2_pstorerinewt_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memw($Rx32++#$Ii) = $Nt8.new",
-tc_d9f95eef, TypeST>, Enc_65f095, AddrModeRel {
+tc_838b34ea, TypeST>, Enc_65f095, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b110;
@@ -20288,7 +20372,7 @@ def S2_pstorerinewt_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if ($Pv4) memw($Rs32) = $Nt8.new",
-tc_594ab548, TypeMAPPING> {
+tc_8fb7ab1b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -20297,7 +20381,7 @@ def S2_pstorerinewtnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
-tc_d24b2d85, TypeST>, Enc_65f095, AddrModeRel {
+tc_d65dbf51, TypeST>, Enc_65f095, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b110;
@@ -20319,7 +20403,7 @@ def S2_pstorerit_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memw($Rs32+#$Ii) = $Rt32",
-tc_8b15472a, TypeV2LDST>, Enc_397f23, AddrModeRel {
+tc_f8e23f0b, TypeV2LDST>, Enc_397f23, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000000100;
 let isPredicated = 1;
@@ -20340,7 +20424,7 @@ def S2_pstorerit_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memw($Rx32++#$Ii) = $Rt32",
-tc_cd7374a0, TypeST>, Enc_7eaeb6, AddrModeRel {
+tc_24b66c99, TypeST>, Enc_7eaeb6, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
@@ -20357,7 +20441,7 @@ def S2_pstorerit_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pv4) memw($Rs32) = $Rt32",
-tc_8b15472a, TypeMAPPING> {
+tc_f8e23f0b, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -20365,7 +20449,7 @@ def S2_pstoreritnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memw($Rx32++#$Ii) = $Rt32",
-tc_74e47fd9, TypeST>, Enc_7eaeb6, AddrModeRel {
+tc_53559e35, TypeST>, Enc_7eaeb6, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -20383,7 +20467,7 @@ def S2_setbit_i : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = setbit($Rs32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_a05677 {
+tc_946df596, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100110;
@@ -20394,7 +20478,7 @@ def S2_setbit_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = setbit($Rs32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_5ab2be {
+tc_946df596, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110100;
@@ -20405,7 +20489,7 @@ def S2_shuffeb : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = shuffeb($Rss32,$Rtt32)",
-tc_540fdfbc, TypeS_3op>, Enc_a56825 {
+tc_946df596, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001000;
@@ -20414,7 +20498,7 @@ def S2_shuffeh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = shuffeh($Rss32,$Rtt32)",
-tc_540fdfbc, TypeS_3op>, Enc_a56825 {
+tc_946df596, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001000;
@@ -20423,7 +20507,7 @@ def S2_shuffob : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = shuffob($Rtt32,$Rss32)",
-tc_540fdfbc, TypeS_3op>, Enc_ea23e4 {
+tc_946df596, TypeS_3op>, Enc_ea23e4 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001000;
@@ -20432,7 +20516,7 @@ def S2_shuffoh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
 "$Rdd32 = shuffoh($Rtt32,$Rss32)",
-tc_540fdfbc, TypeS_3op>, Enc_ea23e4 {
+tc_946df596, TypeS_3op>, Enc_ea23e4 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001100;
@@ -20441,7 +20525,7 @@ def S2_storerb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32),
 "memb($Rs32+#$Ii) = $Rt32",
-tc_05b6c987, TypeST>, Enc_448f7f, AddrModeRel, PostInc_BaseImm {
+tc_30b9bb4a, TypeST>, Enc_448f7f, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1000;
 let Inst{31-27} = 0b10100;
 let addrMode = BaseImmOffset;
@@ -20462,9 +20546,10 @@ def S2_storerb_pbr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memb($Rx32++$Mu2:brev) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101111000;
+let addrMode = PostInc;
 let accessSize = ByteAccess;
 let mayStore = 1;
 let BaseOpcode = "S2_storerb_pbr";
@@ -20475,7 +20560,7 @@ def S2_storerb_pci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
 "memb($Rx32++#$Ii:circ($Mu2)) = $Rt32",
-tc_9fdb5406, TypeST>, Enc_b15941, AddrModeRel {
+tc_e86aa961, TypeST>, Enc_b15941, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{31-21} = 0b10101001000;
@@ -20491,7 +20576,7 @@ def S2_storerb_pcr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memb($Rx32++I:circ($Mu2)) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
 let Inst{7-0} = 0b00000010;
 let Inst{31-21} = 0b10101001000;
 let addrMode = PostInc;
@@ -20506,7 +20591,7 @@ def S2_storerb_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
 "memb($Rx32++#$Ii) = $Rt32",
-tc_f86c328a, TypeST>, Enc_10bc21, AddrModeRel, PostInc_BaseImm {
+tc_da97ee82, TypeST>, Enc_10bc21, AddrModeRel, PostInc_BaseImm {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
@@ -20524,7 +20609,7 @@ def S2_storerb_pr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memb($Rx32++$Mu2) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f {
+tc_da97ee82, TypeST>, Enc_d5c73f {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101101000;
 let addrMode = PostInc;
@@ -20537,7 +20622,7 @@ def S2_storerb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memb($Rs32) = $Rt32",
-tc_05b6c987, TypeMAPPING> {
+tc_30b9bb4a, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -20545,7 +20630,7 @@ def S2_storerbgp : HInst<
 (outs),
 (ins u32_0Imm:$Ii, IntRegs:$Rt32),
 "memb(gp+#$Ii) = $Rt32",
-tc_a788683e, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
 let Inst{24-21} = 0b0000;
 let Inst{31-27} = 0b01001;
 let accessSize = ByteAccess;
@@ -20563,7 +20648,7 @@ def S2_storerbnew_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Nt8),
 "memb($Rs32+#$Ii) = $Nt8.new",
-tc_f7dd9c9f, TypeST>, Enc_4df4e9, AddrModeRel {
+tc_be9602ff, TypeST>, Enc_4df4e9, AddrModeRel {
 let Inst{12-11} = 0b00;
 let Inst{24-21} = 0b1101;
 let Inst{31-27} = 0b10100;
@@ -20588,10 +20673,11 @@ def S2_storerbnew_pbr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memb($Rx32++$Mu2:brev) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
 let Inst{7-0} = 0b00000000;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b10101111101;
+let addrMode = PostInc;
 let accessSize = ByteAccess;
 let isNVStore = 1;
 let isNewValue = 1;
@@ -20605,7 +20691,7 @@ def S2_storerbnew_pci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
 "memb($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
-tc_9d5941c7, TypeST>, Enc_96ce4f, AddrModeRel {
+tc_d5c0729a, TypeST>, Enc_96ce4f, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{12-11} = 0b00;
@@ -20625,7 +20711,7 @@ def S2_storerbnew_pcr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memb($Rx32++I:circ($Mu2)) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
 let Inst{7-0} = 0b00000010;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b10101001101;
@@ -20644,7 +20730,7 @@ def S2_storerbnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
 "memb($Rx32++#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_c7cd90, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_c7cd90, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b000;
@@ -20665,7 +20751,7 @@ def S2_storerbnew_pr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memb($Rx32++$Mu2) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85 {
+tc_c79a189f, TypeST>, Enc_8dbe85 {
 let Inst{7-0} = 0b00000000;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b10101101101;
@@ -20682,7 +20768,7 @@ def S2_storerbnew_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Nt8),
 "memb($Rs32) = $Nt8.new",
-tc_f7dd9c9f, TypeMAPPING> {
+tc_be9602ff, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 1;
@@ -20691,7 +20777,7 @@ def S2_storerbnewgp : HInst<
 (outs),
 (ins u32_0Imm:$Ii, IntRegs:$Nt8),
 "memb(gp+#$Ii) = $Nt8.new",
-tc_ff9ee76e, TypeV2LDST>, Enc_ad1831, AddrModeRel {
+tc_5bf126a6, TypeV2LDST>, Enc_ad1831, AddrModeRel {
 let Inst{12-11} = 0b00;
 let Inst{24-21} = 0b0101;
 let Inst{31-27} = 0b01001;
@@ -20713,7 +20799,7 @@ def S2_storerd_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32),
 "memd($Rs32+#$Ii) = $Rtt32",
-tc_05b6c987, TypeST>, Enc_ce6828, AddrModeRel, PostInc_BaseImm {
+tc_30b9bb4a, TypeST>, Enc_ce6828, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1110;
 let Inst{31-27} = 0b10100;
 let addrMode = BaseImmOffset;
@@ -20733,9 +20819,10 @@ def S2_storerd_pbr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
 "memd($Rx32++$Mu2:brev) = $Rtt32",
-tc_f86c328a, TypeST>, Enc_928ca1 {
+tc_da97ee82, TypeST>, Enc_928ca1 {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101111110;
+let addrMode = PostInc;
 let accessSize = DoubleWordAccess;
 let mayStore = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -20744,7 +20831,7 @@ def S2_storerd_pci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2, DoubleRegs:$Rtt32),
 "memd($Rx32++#$Ii:circ($Mu2)) = $Rtt32",
-tc_9fdb5406, TypeST>, Enc_395cc4 {
+tc_e86aa961, TypeST>, Enc_395cc4 {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{31-21} = 0b10101001110;
@@ -20758,7 +20845,7 @@ def S2_storerd_pcr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
 "memd($Rx32++I:circ($Mu2)) = $Rtt32",
-tc_f86c328a, TypeST>, Enc_928ca1 {
+tc_da97ee82, TypeST>, Enc_928ca1 {
 let Inst{7-0} = 0b00000010;
 let Inst{31-21} = 0b10101001110;
 let addrMode = PostInc;
@@ -20771,7 +20858,7 @@ def S2_storerd_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
 "memd($Rx32++#$Ii) = $Rtt32",
-tc_f86c328a, TypeST>, Enc_85bf58, AddrModeRel, PostInc_BaseImm {
+tc_da97ee82, TypeST>, Enc_85bf58, AddrModeRel, PostInc_BaseImm {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
@@ -20788,7 +20875,7 @@ def S2_storerd_pr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
 "memd($Rx32++$Mu2) = $Rtt32",
-tc_f86c328a, TypeST>, Enc_928ca1 {
+tc_da97ee82, TypeST>, Enc_928ca1 {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101101110;
 let addrMode = PostInc;
@@ -20800,7 +20887,7 @@ def S2_storerd_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "memd($Rs32) = $Rtt32",
-tc_05b6c987, TypeMAPPING> {
+tc_30b9bb4a, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -20808,7 +20895,7 @@ def S2_storerdgp : HInst<
 (outs),
 (ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
 "memd(gp+#$Ii) = $Rtt32",
-tc_a788683e, TypeV2LDST>, Enc_5c124a, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_5c124a, AddrModeRel {
 let Inst{24-21} = 0b0110;
 let Inst{31-27} = 0b01001;
 let accessSize = DoubleWordAccess;
@@ -20825,7 +20912,7 @@ def S2_storerf_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
 "memh($Rs32+#$Ii) = $Rt32.h",
-tc_05b6c987, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm {
+tc_30b9bb4a, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1011;
 let Inst{31-27} = 0b10100;
 let addrMode = BaseImmOffset;
@@ -20845,9 +20932,10 @@ def S2_storerf_pbr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memh($Rx32++$Mu2:brev) = $Rt32.h",
-tc_f86c328a, TypeST>, Enc_d5c73f {
+tc_da97ee82, TypeST>, Enc_d5c73f {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101111011;
+let addrMode = PostInc;
 let accessSize = HalfWordAccess;
 let mayStore = 1;
 let Constraints = "$Rx32 = $Rx32in";
@@ -20856,7 +20944,7 @@ def S2_storerf_pci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
 "memh($Rx32++#$Ii:circ($Mu2)) = $Rt32.h",
-tc_9fdb5406, TypeST>, Enc_935d9b {
+tc_e86aa961, TypeST>, Enc_935d9b {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{31-21} = 0b10101001011;
@@ -20870,7 +20958,7 @@ def S2_storerf_pcr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memh($Rx32++I:circ($Mu2)) = $Rt32.h",
-tc_f86c328a, TypeST>, Enc_d5c73f {
+tc_da97ee82, TypeST>, Enc_d5c73f {
 let Inst{7-0} = 0b00000010;
 let Inst{31-21} = 0b10101001011;
 let addrMode = PostInc;
@@ -20883,7 +20971,7 @@ def S2_storerf_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "memh($Rx32++#$Ii) = $Rt32.h",
-tc_f86c328a, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm {
+tc_da97ee82, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
@@ -20900,7 +20988,7 @@ def S2_storerf_pr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memh($Rx32++$Mu2) = $Rt32.h",
-tc_f86c328a, TypeST>, Enc_d5c73f {
+tc_da97ee82, TypeST>, Enc_d5c73f {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101101011;
 let addrMode = PostInc;
@@ -20912,7 +21000,7 @@ def S2_storerf_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memh($Rs32) = $Rt32.h",
-tc_05b6c987, TypeMAPPING> {
+tc_30b9bb4a, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -20920,7 +21008,7 @@ def S2_storerfgp : HInst<
 (outs),
 (ins u31_1Imm:$Ii, IntRegs:$Rt32),
 "memh(gp+#$Ii) = $Rt32.h",
-tc_a788683e, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel {
 let Inst{24-21} = 0b0011;
 let Inst{31-27} = 0b01001;
 let accessSize = HalfWordAccess;
@@ -20937,7 +21025,7 @@ def S2_storerh_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
 "memh($Rs32+#$Ii) = $Rt32",
-tc_05b6c987, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm {
+tc_30b9bb4a, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1010;
 let Inst{31-27} = 0b10100;
 let addrMode = BaseImmOffset;
@@ -20958,9 +21046,10 @@ def S2_storerh_pbr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memh($Rx32++$Mu2:brev) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101111010;
+let addrMode = PostInc;
 let accessSize = HalfWordAccess;
 let mayStore = 1;
 let BaseOpcode = "S2_storerh_pbr";
@@ -20971,7 +21060,7 @@ def S2_storerh_pci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
 "memh($Rx32++#$Ii:circ($Mu2)) = $Rt32",
-tc_9fdb5406, TypeST>, Enc_935d9b, AddrModeRel {
+tc_e86aa961, TypeST>, Enc_935d9b, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{31-21} = 0b10101001010;
@@ -20987,7 +21076,7 @@ def S2_storerh_pcr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memh($Rx32++I:circ($Mu2)) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
 let Inst{7-0} = 0b00000010;
 let Inst{31-21} = 0b10101001010;
 let addrMode = PostInc;
@@ -21002,7 +21091,7 @@ def S2_storerh_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
 "memh($Rx32++#$Ii) = $Rt32",
-tc_f86c328a, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm {
+tc_da97ee82, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
@@ -21020,7 +21109,7 @@ def S2_storerh_pr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memh($Rx32++$Mu2) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f {
+tc_da97ee82, TypeST>, Enc_d5c73f {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101101010;
 let addrMode = PostInc;
@@ -21033,7 +21122,7 @@ def S2_storerh_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memh($Rs32) = $Rt32",
-tc_05b6c987, TypeMAPPING> {
+tc_30b9bb4a, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -21041,7 +21130,7 @@ def S2_storerhgp : HInst<
 (outs),
 (ins u31_1Imm:$Ii, IntRegs:$Rt32),
 "memh(gp+#$Ii) = $Rt32",
-tc_a788683e, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel {
 let Inst{24-21} = 0b0010;
 let Inst{31-27} = 0b01001;
 let accessSize = HalfWordAccess;
@@ -21059,7 +21148,7 @@ def S2_storerhnew_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Nt8),
 "memh($Rs32+#$Ii) = $Nt8.new",
-tc_f7dd9c9f, TypeST>, Enc_0d8870, AddrModeRel {
+tc_be9602ff, TypeST>, Enc_0d8870, AddrModeRel {
 let Inst{12-11} = 0b01;
 let Inst{24-21} = 0b1101;
 let Inst{31-27} = 0b10100;
@@ -21084,10 +21173,11 @@ def S2_storerhnew_pbr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memh($Rx32++$Mu2:brev) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
 let Inst{7-0} = 0b00000000;
 let Inst{12-11} = 0b01;
 let Inst{31-21} = 0b10101111101;
+let addrMode = PostInc;
 let accessSize = HalfWordAccess;
 let isNVStore = 1;
 let isNewValue = 1;
@@ -21101,7 +21191,7 @@ def S2_storerhnew_pci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
 "memh($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
-tc_9d5941c7, TypeST>, Enc_91b9fe, AddrModeRel {
+tc_d5c0729a, TypeST>, Enc_91b9fe, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{12-11} = 0b01;
@@ -21121,7 +21211,7 @@ def S2_storerhnew_pcr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memh($Rx32++I:circ($Mu2)) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
 let Inst{7-0} = 0b00000010;
 let Inst{12-11} = 0b01;
 let Inst{31-21} = 0b10101001101;
@@ -21140,7 +21230,7 @@ def S2_storerhnew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
 "memh($Rx32++#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_e26546, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_e26546, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b001;
@@ -21161,7 +21251,7 @@ def S2_storerhnew_pr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memh($Rx32++$Mu2) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85 {
+tc_c79a189f, TypeST>, Enc_8dbe85 {
 let Inst{7-0} = 0b00000000;
 let Inst{12-11} = 0b01;
 let Inst{31-21} = 0b10101101101;
@@ -21178,7 +21268,7 @@ def S2_storerhnew_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Nt8),
 "memh($Rs32) = $Nt8.new",
-tc_f7dd9c9f, TypeMAPPING> {
+tc_be9602ff, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 1;
@@ -21187,7 +21277,7 @@ def S2_storerhnewgp : HInst<
 (outs),
 (ins u31_1Imm:$Ii, IntRegs:$Nt8),
 "memh(gp+#$Ii) = $Nt8.new",
-tc_ff9ee76e, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
+tc_5bf126a6, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
 let Inst{12-11} = 0b01;
 let Inst{24-21} = 0b0101;
 let Inst{31-27} = 0b01001;
@@ -21209,7 +21299,7 @@ def S2_storeri_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32),
 "memw($Rs32+#$Ii) = $Rt32",
-tc_05b6c987, TypeST>, Enc_143445, AddrModeRel, PostInc_BaseImm {
+tc_30b9bb4a, TypeST>, Enc_143445, AddrModeRel, PostInc_BaseImm {
 let Inst{24-21} = 0b1100;
 let Inst{31-27} = 0b10100;
 let addrMode = BaseImmOffset;
@@ -21230,9 +21320,10 @@ def S2_storeri_pbr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memw($Rx32++$Mu2:brev) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101111100;
+let addrMode = PostInc;
 let accessSize = WordAccess;
 let mayStore = 1;
 let BaseOpcode = "S2_storeri_pbr";
@@ -21243,7 +21334,7 @@ def S2_storeri_pci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
 "memw($Rx32++#$Ii:circ($Mu2)) = $Rt32",
-tc_9fdb5406, TypeST>, Enc_79b8c8, AddrModeRel {
+tc_e86aa961, TypeST>, Enc_79b8c8, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{31-21} = 0b10101001100;
@@ -21259,7 +21350,7 @@ def S2_storeri_pcr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memw($Rx32++I:circ($Mu2)) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
 let Inst{7-0} = 0b00000010;
 let Inst{31-21} = 0b10101001100;
 let addrMode = PostInc;
@@ -21274,7 +21365,7 @@ def S2_storeri_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
 "memw($Rx32++#$Ii) = $Rt32",
-tc_f86c328a, TypeST>, Enc_db40cd, AddrModeRel, PostInc_BaseImm {
+tc_da97ee82, TypeST>, Enc_db40cd, AddrModeRel, PostInc_BaseImm {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
@@ -21292,7 +21383,7 @@ def S2_storeri_pr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
 "memw($Rx32++$Mu2) = $Rt32",
-tc_f86c328a, TypeST>, Enc_d5c73f {
+tc_da97ee82, TypeST>, Enc_d5c73f {
 let Inst{7-0} = 0b00000000;
 let Inst{31-21} = 0b10101101100;
 let addrMode = PostInc;
@@ -21305,7 +21396,7 @@ def S2_storeri_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memw($Rs32) = $Rt32",
-tc_05b6c987, TypeMAPPING> {
+tc_30b9bb4a, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -21313,7 +21404,7 @@ def S2_storerigp : HInst<
 (outs),
 (ins u30_2Imm:$Ii, IntRegs:$Rt32),
 "memw(gp+#$Ii) = $Rt32",
-tc_a788683e, TypeV2LDST>, Enc_541f26, AddrModeRel {
+tc_0371abea, TypeV2LDST>, Enc_541f26, AddrModeRel {
 let Inst{24-21} = 0b0100;
 let Inst{31-27} = 0b01001;
 let accessSize = WordAccess;
@@ -21331,7 +21422,7 @@ def S2_storerinew_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Nt8),
 "memw($Rs32+#$Ii) = $Nt8.new",
-tc_f7dd9c9f, TypeST>, Enc_690862, AddrModeRel {
+tc_be9602ff, TypeST>, Enc_690862, AddrModeRel {
 let Inst{12-11} = 0b10;
 let Inst{24-21} = 0b1101;
 let Inst{31-27} = 0b10100;
@@ -21356,10 +21447,11 @@ def S2_storerinew_pbr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memw($Rx32++$Mu2:brev) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
 let Inst{7-0} = 0b00000000;
 let Inst{12-11} = 0b10;
 let Inst{31-21} = 0b10101111101;
+let addrMode = PostInc;
 let accessSize = WordAccess;
 let isNVStore = 1;
 let isNewValue = 1;
@@ -21373,7 +21465,7 @@ def S2_storerinew_pci : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
 "memw($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
-tc_9d5941c7, TypeST>, Enc_3f97c8, AddrModeRel {
+tc_d5c0729a, TypeST>, Enc_3f97c8, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{12-11} = 0b10;
@@ -21393,7 +21485,7 @@ def S2_storerinew_pcr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memw($Rx32++I:circ($Mu2)) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
 let Inst{7-0} = 0b00000010;
 let Inst{12-11} = 0b10;
 let Inst{31-21} = 0b10101001101;
@@ -21412,7 +21504,7 @@ def S2_storerinew_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
 "memw($Rx32++#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_223005, AddrModeRel {
+tc_c79a189f, TypeST>, Enc_223005, AddrModeRel {
 let Inst{2-0} = 0b000;
 let Inst{7-7} = 0b0;
 let Inst{13-11} = 0b010;
@@ -21432,7 +21524,7 @@ def S2_storerinew_pr : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
 "memw($Rx32++$Mu2) = $Nt8.new",
-tc_e7d02c66, TypeST>, Enc_8dbe85 {
+tc_c79a189f, TypeST>, Enc_8dbe85 {
 let Inst{7-0} = 0b00000000;
 let Inst{12-11} = 0b10;
 let Inst{31-21} = 0b10101101101;
@@ -21449,7 +21541,7 @@ def S2_storerinew_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Nt8),
 "memw($Rs32) = $Nt8.new",
-tc_f7dd9c9f, TypeMAPPING> {
+tc_be9602ff, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 1;
@@ -21458,7 +21550,7 @@ def S2_storerinewgp : HInst<
 (outs),
 (ins u30_2Imm:$Ii, IntRegs:$Nt8),
 "memw(gp+#$Ii) = $Nt8.new",
-tc_ff9ee76e, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
+tc_5bf126a6, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
 let Inst{12-11} = 0b10;
 let Inst{24-21} = 0b0101;
 let Inst{31-27} = 0b01001;
@@ -21480,7 +21572,7 @@ def S2_storew_locked : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "memw_locked($Rs32,$Pd4) = $Rt32",
-tc_1372bca1, TypeST>, Enc_c2b48e {
+tc_5abb5e3f, TypeST>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10100000101;
@@ -21493,7 +21585,7 @@ def S2_svsathb : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = vsathb($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_5e2823 {
+tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001100100;
 let hasNewValue = 1;
@@ -21504,7 +21596,7 @@ def S2_svsathub : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = vsathub($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_5e2823 {
+tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10001100100;
 let hasNewValue = 1;
@@ -21515,7 +21607,7 @@ def S2_tableidxb : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
 "$Rx32 = tableidxb($Rs32,#$Ii,#$II):raw",
-tc_87735c3b, TypeS_2op>, Enc_cd82bc {
+tc_bfec0f01, TypeS_2op>, Enc_cd82bc {
 let Inst{31-22} = 0b1000011100;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -21526,7 +21618,7 @@ def S2_tableidxb_goodsyntax : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
 "$Rx32 = tableidxb($Rs32,#$Ii,#$II)",
-tc_87735c3b, TypeS_2op> {
+tc_bfec0f01, TypeS_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -21537,7 +21629,7 @@ def S2_tableidxd : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
 "$Rx32 = tableidxd($Rs32,#$Ii,#$II):raw",
-tc_87735c3b, TypeS_2op>, Enc_cd82bc {
+tc_bfec0f01, TypeS_2op>, Enc_cd82bc {
 let Inst{31-22} = 0b1000011111;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -21548,7 +21640,7 @@ def S2_tableidxd_goodsyntax : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
 "$Rx32 = tableidxd($Rs32,#$Ii,#$II)",
-tc_87735c3b, TypeS_2op> {
+tc_bfec0f01, TypeS_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -21558,7 +21650,7 @@ def S2_tableidxh : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
 "$Rx32 = tableidxh($Rs32,#$Ii,#$II):raw",
-tc_87735c3b, TypeS_2op>, Enc_cd82bc {
+tc_bfec0f01, TypeS_2op>, Enc_cd82bc {
 let Inst{31-22} = 0b1000011101;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -21569,7 +21661,7 @@ def S2_tableidxh_goodsyntax : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
 "$Rx32 = tableidxh($Rs32,#$Ii,#$II)",
-tc_87735c3b, TypeS_2op> {
+tc_bfec0f01, TypeS_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -21579,7 +21671,7 @@ def S2_tableidxw : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
 "$Rx32 = tableidxw($Rs32,#$Ii,#$II):raw",
-tc_87735c3b, TypeS_2op>, Enc_cd82bc {
+tc_bfec0f01, TypeS_2op>, Enc_cd82bc {
 let Inst{31-22} = 0b1000011110;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -21590,7 +21682,7 @@ def S2_tableidxw_goodsyntax : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
 "$Rx32 = tableidxw($Rs32,#$Ii,#$II)",
-tc_87735c3b, TypeS_2op> {
+tc_bfec0f01, TypeS_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -21600,7 +21692,7 @@ def S2_togglebit_i : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = togglebit($Rs32,#$Ii)",
-tc_540fdfbc, TypeS_2op>, Enc_a05677 {
+tc_946df596, TypeS_2op>, Enc_a05677 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100110;
@@ -21611,7 +21703,7 @@ def S2_togglebit_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = togglebit($Rs32,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_5ab2be {
+tc_946df596, TypeS_3op>, Enc_5ab2be {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110100;
@@ -21622,7 +21714,7 @@ def S2_tstbit_i : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Pd4 = tstbit($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_83ee64 {
+tc_643b4717, TypeS_2op>, Enc_83ee64 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10000101000;
@@ -21631,7 +21723,7 @@ def S2_tstbit_r : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = tstbit($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111000;
@@ -21640,7 +21732,7 @@ def S2_valignib : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, u3_0Imm:$Ii),
 "$Rdd32 = valignb($Rtt32,$Rss32,#$Ii)",
-tc_f8eeed7a, TypeS_3op>, Enc_729ff7 {
+tc_b4b5c03a, TypeS_3op>, Enc_729ff7 {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000000000;
 }
@@ -21648,7 +21740,7 @@ def S2_valignrb : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, PredRegs:$Pu4),
 "$Rdd32 = valignb($Rtt32,$Rss32,$Pu4)",
-tc_f8eeed7a, TypeS_3op>, Enc_8c6530 {
+tc_b4b5c03a, TypeS_3op>, Enc_8c6530 {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000010000;
@@ -21657,7 +21749,7 @@ def S2_vcnegh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vcnegh($Rss32,$Rt32)",
-tc_b44c6e2a, TypeS_3op>, Enc_927852 {
+tc_779080bf, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011110;
@@ -21668,7 +21760,7 @@ def S2_vcrotate : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rdd32 = vcrotate($Rss32,$Rt32)",
-tc_2b6f77c6, TypeS_3op>, Enc_927852 {
+tc_002cb246, TypeS_3op>, Enc_927852 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000011110;
@@ -21679,7 +21771,7 @@ def S2_vrcnegh : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rxx32 += vrcnegh($Rss32,$Rt32)",
-tc_e913dc32, TypeS_3op>, Enc_1aa186 {
+tc_d773585a, TypeS_3op>, Enc_1aa186 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b11001011001;
@@ -21690,7 +21782,7 @@ def S2_vrndpackwh : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = vrndwh($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_90cd8b {
+tc_14b5c689, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10001000100;
 let hasNewValue = 1;
@@ -21701,7 +21793,7 @@ def S2_vrndpackwhs : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = vrndwh($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_90cd8b {
+tc_cf8126ae, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10001000100;
 let hasNewValue = 1;
@@ -21713,7 +21805,7 @@ def S2_vsathb : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = vsathb($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_90cd8b {
+tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10001000000;
 let hasNewValue = 1;
@@ -21724,7 +21816,7 @@ def S2_vsathb_nopack : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vsathb($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_b9c5fb {
+tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10000000000;
 let Defs = [USR_OVF];
@@ -21733,7 +21825,7 @@ def S2_vsathub : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = vsathub($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_90cd8b {
+tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001000000;
 let hasNewValue = 1;
@@ -21744,7 +21836,7 @@ def S2_vsathub_nopack : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vsathub($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_b9c5fb {
+tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000000000;
 let Defs = [USR_OVF];
@@ -21753,7 +21845,7 @@ def S2_vsatwh : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = vsatwh($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_90cd8b {
+tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10001000000;
 let hasNewValue = 1;
@@ -21764,7 +21856,7 @@ def S2_vsatwh_nopack : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vsatwh($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_b9c5fb {
+tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000000000;
 let Defs = [USR_OVF];
@@ -21773,7 +21865,7 @@ def S2_vsatwuh : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = vsatwuh($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_90cd8b {
+tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10001000000;
 let hasNewValue = 1;
@@ -21784,7 +21876,7 @@ def S2_vsatwuh_nopack : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = vsatwuh($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_b9c5fb {
+tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10000000000;
 let Defs = [USR_OVF];
@@ -21793,7 +21885,7 @@ def S2_vsplatrb : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = vsplatb($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_5e2823 {
+tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10001100010;
 let hasNewValue = 1;
@@ -21805,7 +21897,7 @@ def S2_vsplatrh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = vsplath($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_3a3d62 {
+tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10000100010;
 let isReMaterializable = 1;
@@ -21815,7 +21907,7 @@ def S2_vspliceib : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, u3_0Imm:$Ii),
 "$Rdd32 = vspliceb($Rss32,$Rtt32,#$Ii)",
-tc_f8eeed7a, TypeS_3op>, Enc_d50cd3 {
+tc_b4b5c03a, TypeS_3op>, Enc_d50cd3 {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000000100;
 }
@@ -21823,7 +21915,7 @@ def S2_vsplicerb : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Pu4),
 "$Rdd32 = vspliceb($Rss32,$Rtt32,$Pu4)",
-tc_f8eeed7a, TypeS_3op>, Enc_dbd70c {
+tc_b4b5c03a, TypeS_3op>, Enc_dbd70c {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000010100;
@@ -21832,7 +21924,7 @@ def S2_vsxtbh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = vsxtbh($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_3a3d62 {
+tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10000100000;
 let isReMaterializable = 1;
@@ -21842,7 +21934,7 @@ def S2_vsxthw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = vsxthw($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_3a3d62 {
+tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000100000;
 let isReMaterializable = 1;
@@ -21852,7 +21944,7 @@ def S2_vtrunehb : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = vtrunehb($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_90cd8b {
+tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10001000100;
 let hasNewValue = 1;
@@ -21862,7 +21954,7 @@ def S2_vtrunewh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vtrunewh($Rss32,$Rtt32)",
-tc_540fdfbc, TypeS_3op>, Enc_a56825 {
+tc_946df596, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001100;
@@ -21871,7 +21963,7 @@ def S2_vtrunohb : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = vtrunohb($Rss32)",
-tc_cde8b071, TypeS_2op>, Enc_90cd8b {
+tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001000100;
 let hasNewValue = 1;
@@ -21881,7 +21973,7 @@ def S2_vtrunowh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vtrunowh($Rss32,$Rtt32)",
-tc_540fdfbc, TypeS_3op>, Enc_a56825 {
+tc_946df596, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001100;
@@ -21890,7 +21982,7 @@ def S2_vzxtbh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = vzxtbh($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_3a3d62 {
+tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10000100000;
 let isReMaterializable = 1;
@@ -21900,7 +21992,7 @@ def S2_vzxthw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = vzxthw($Rs32)",
-tc_cde8b071, TypeS_2op>, Enc_3a3d62 {
+tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000100000;
 let isReMaterializable = 1;
@@ -21910,7 +22002,7 @@ def S4_addaddi : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, s32_0Imm:$Ii),
 "$Rd32 = add($Rs32,add($Ru32,#$Ii))",
-tc_c74f796f, TypeALU64>, Enc_8b8d61 {
+tc_f675fee8, TypeALU64>, Enc_8b8d61 {
 let Inst{31-23} = 0b110110110;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -21925,7 +22017,7 @@ def S4_addi_asl_ri : HInst<
 (outs IntRegs:$Rx32),
 (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
 "$Rx32 = add(#$Ii,asl($Rx32in,#$II))",
-tc_c74f796f, TypeALU64>, Enc_c31910 {
+tc_f675fee8, TypeALU64>, Enc_c31910 {
 let Inst{2-0} = 0b100;
 let Inst{4-4} = 0b0;
 let Inst{31-24} = 0b11011110;
@@ -21943,7 +22035,7 @@ def S4_addi_lsr_ri : HInst<
 (outs IntRegs:$Rx32),
 (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
 "$Rx32 = add(#$Ii,lsr($Rx32in,#$II))",
-tc_c74f796f, TypeALU64>, Enc_c31910 {
+tc_f675fee8, TypeALU64>, Enc_c31910 {
 let Inst{2-0} = 0b100;
 let Inst{4-4} = 0b1;
 let Inst{31-24} = 0b11011110;
@@ -21961,7 +22053,7 @@ def S4_andi_asl_ri : HInst<
 (outs IntRegs:$Rx32),
 (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
 "$Rx32 = and(#$Ii,asl($Rx32in,#$II))",
-tc_84df2cd3, TypeALU64>, Enc_c31910 {
+tc_f429765c, TypeALU64>, Enc_c31910 {
 let Inst{2-0} = 0b000;
 let Inst{4-4} = 0b0;
 let Inst{31-24} = 0b11011110;
@@ -21979,7 +22071,7 @@ def S4_andi_lsr_ri : HInst<
 (outs IntRegs:$Rx32),
 (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
 "$Rx32 = and(#$Ii,lsr($Rx32in,#$II))",
-tc_84df2cd3, TypeALU64>, Enc_c31910 {
+tc_f429765c, TypeALU64>, Enc_c31910 {
 let Inst{2-0} = 0b000;
 let Inst{4-4} = 0b1;
 let Inst{31-24} = 0b11011110;
@@ -21997,7 +22089,7 @@ def S4_clbaddi : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s6_0Imm:$Ii),
 "$Rd32 = add(clb($Rs32),#$Ii)",
-tc_2b6f77c6, TypeS_2op>, Enc_9fae8a {
+tc_002cb246, TypeS_2op>, Enc_9fae8a {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b10001100001;
 let hasNewValue = 1;
@@ -22008,7 +22100,7 @@ def S4_clbpaddi : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, s6_0Imm:$Ii),
 "$Rd32 = add(clb($Rss32),#$Ii)",
-tc_2b6f77c6, TypeS_2op>, Enc_a1640c {
+tc_002cb246, TypeS_2op>, Enc_a1640c {
 let Inst{7-5} = 0b010;
 let Inst{31-21} = 0b10001000011;
 let hasNewValue = 1;
@@ -22019,7 +22111,7 @@ def S4_clbpnorm : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = normamt($Rss32)",
-tc_d088982c, TypeS_2op>, Enc_90cd8b {
+tc_14b5c689, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001000011;
 let hasNewValue = 1;
@@ -22030,7 +22122,7 @@ def S4_extract : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
 "$Rd32 = extract($Rs32,#$Ii,#$II)",
-tc_c74f796f, TypeS_2op>, Enc_b388cf {
+tc_f675fee8, TypeS_2op>, Enc_b388cf {
 let Inst{13-13} = 0b0;
 let Inst{31-23} = 0b100011011;
 let hasNewValue = 1;
@@ -22041,7 +22133,7 @@ def S4_extract_rp : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "$Rd32 = extract($Rs32,$Rtt32)",
-tc_2b6f77c6, TypeS_3op>, Enc_e07374 {
+tc_002cb246, TypeS_3op>, Enc_e07374 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11001001000;
@@ -22053,7 +22145,7 @@ def S4_extractp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
 "$Rdd32 = extract($Rss32,#$Ii,#$II)",
-tc_c74f796f, TypeS_2op>, Enc_b84c4c {
+tc_f675fee8, TypeS_2op>, Enc_b84c4c {
 let Inst{31-24} = 0b10001010;
 let prefersSlot3 = 1;
 }
@@ -22061,7 +22153,7 @@ def S4_extractp_rp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = extract($Rss32,$Rtt32)",
-tc_2b6f77c6, TypeS_3op>, Enc_a56825 {
+tc_002cb246, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001110;
@@ -22071,7 +22163,7 @@ def S4_lsli : HInst<
 (outs IntRegs:$Rd32),
 (ins s6_0Imm:$Ii, IntRegs:$Rt32),
 "$Rd32 = lsl(#$Ii,$Rt32)",
-tc_540fdfbc, TypeS_3op>, Enc_fef969 {
+tc_946df596, TypeS_3op>, Enc_fef969 {
 let Inst{7-6} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000110100;
@@ -22082,7 +22174,7 @@ def S4_ntstbit_i : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Pd4 = !tstbit($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_83ee64 {
+tc_643b4717, TypeS_2op>, Enc_83ee64 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10000101001;
@@ -22091,7 +22183,7 @@ def S4_ntstbit_r : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = !tstbit($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e {
+tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111001;
@@ -22100,7 +22192,7 @@ def S4_or_andi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rx32 |= and($Rs32,#$Ii)",
-tc_84df2cd3, TypeALU64>, Enc_b0e9d8 {
+tc_f429765c, TypeALU64>, Enc_b0e9d8 {
 let Inst{31-22} = 0b1101101000;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -22117,7 +22209,7 @@ def S4_or_andix : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Ru32, IntRegs:$Rx32in, s32_0Imm:$Ii),
 "$Rx32 = or($Ru32,and($Rx32in,#$Ii))",
-tc_84df2cd3, TypeALU64>, Enc_b4e6cf {
+tc_f429765c, TypeALU64>, Enc_b4e6cf {
 let Inst{31-22} = 0b1101101001;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -22133,7 +22225,7 @@ def S4_or_ori : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
 "$Rx32 |= or($Rs32,#$Ii)",
-tc_84df2cd3, TypeALU64>, Enc_b0e9d8 {
+tc_f429765c, TypeALU64>, Enc_b0e9d8 {
 let Inst{31-22} = 0b1101101010;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -22150,7 +22242,7 @@ def S4_ori_asl_ri : HInst<
 (outs IntRegs:$Rx32),
 (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
 "$Rx32 = or(#$Ii,asl($Rx32in,#$II))",
-tc_84df2cd3, TypeALU64>, Enc_c31910 {
+tc_f429765c, TypeALU64>, Enc_c31910 {
 let Inst{2-0} = 0b010;
 let Inst{4-4} = 0b0;
 let Inst{31-24} = 0b11011110;
@@ -22168,7 +22260,7 @@ def S4_ori_lsr_ri : HInst<
 (outs IntRegs:$Rx32),
 (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
 "$Rx32 = or(#$Ii,lsr($Rx32in,#$II))",
-tc_84df2cd3, TypeALU64>, Enc_c31910 {
+tc_f429765c, TypeALU64>, Enc_c31910 {
 let Inst{2-0} = 0b010;
 let Inst{4-4} = 0b1;
 let Inst{31-24} = 0b11011110;
@@ -22186,7 +22278,7 @@ def S4_parity : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = parity($Rs32,$Rt32)",
-tc_2b6f77c6, TypeALU64>, Enc_5ab2be {
+tc_002cb246, TypeALU64>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101111;
@@ -22198,7 +22290,7 @@ def S4_pstorerbf_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memb(#$Ii) = $Rt32",
-tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -22223,7 +22315,7 @@ def S4_pstorerbf_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110101000;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -22239,7 +22331,7 @@ def S4_pstorerbfnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memb(#$Ii) = $Rt32",
-tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -22265,7 +22357,7 @@ def S4_pstorerbfnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memb($Rs32+#$Ii) = $Rt32",
-tc_f86c328a, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_da8d43, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000110000;
 let isPredicated = 1;
@@ -22288,7 +22380,7 @@ def S4_pstorerbfnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110111000;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -22305,7 +22397,7 @@ def S4_pstorerbfnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pv4.new) memb($Rs32) = $Rt32",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -22313,7 +22405,7 @@ def S4_pstorerbnewf_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memb(#$Ii) = $Nt8.new",
-tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel {
+tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b000;
@@ -22341,7 +22433,7 @@ def S4_pstorerbnewf_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b00;
 let Inst{31-21} = 0b00110101101;
 let isPredicated = 1;
@@ -22361,7 +22453,7 @@ def S4_pstorerbnewfnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memb(#$Ii) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b100;
@@ -22390,7 +22482,7 @@ def S4_pstorerbnewfnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeV2LDST>, Enc_585242, AddrModeRel {
+tc_c79a189f, TypeV2LDST>, Enc_585242, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b01000110101;
@@ -22417,7 +22509,7 @@ def S4_pstorerbnewfnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b00;
 let Inst{31-21} = 0b00110111101;
 let isPredicated = 1;
@@ -22438,7 +22530,7 @@ def S4_pstorerbnewfnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if (!$Pv4.new) memb($Rs32) = $Nt8.new",
-tc_e7d02c66, TypeMAPPING> {
+tc_c79a189f, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -22447,7 +22539,7 @@ def S4_pstorerbnewt_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memb(#$Ii) = $Nt8.new",
-tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel {
+tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b000;
@@ -22474,7 +22566,7 @@ def S4_pstorerbnewt_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b00;
 let Inst{31-21} = 0b00110100101;
 let isPredicated = 1;
@@ -22493,7 +22585,7 @@ def S4_pstorerbnewtnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memb(#$Ii) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b100;
@@ -22521,7 +22613,7 @@ def S4_pstorerbnewtnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeV2LDST>, Enc_585242, AddrModeRel {
+tc_c79a189f, TypeV2LDST>, Enc_585242, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b01000010101;
@@ -22547,7 +22639,7 @@ def S4_pstorerbnewtnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b00;
 let Inst{31-21} = 0b00110110101;
 let isPredicated = 1;
@@ -22567,7 +22659,7 @@ def S4_pstorerbnewtnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if ($Pv4.new) memb($Rs32) = $Nt8.new",
-tc_e7d02c66, TypeMAPPING> {
+tc_c79a189f, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -22576,7 +22668,7 @@ def S4_pstorerbt_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memb(#$Ii) = $Rt32",
-tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -22600,7 +22692,7 @@ def S4_pstorerbt_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110100000;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -22615,7 +22707,7 @@ def S4_pstorerbtnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memb(#$Ii) = $Rt32",
-tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -22640,7 +22732,7 @@ def S4_pstorerbtnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memb($Rs32+#$Ii) = $Rt32",
-tc_f86c328a, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_da8d43, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000010000;
 let isPredicated = 1;
@@ -22662,7 +22754,7 @@ def S4_pstorerbtnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110110000;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -22678,7 +22770,7 @@ def S4_pstorerbtnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pv4.new) memb($Rs32) = $Rt32",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -22686,7 +22778,7 @@ def S4_pstorerdf_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
 "if (!$Pv4) memd(#$Ii) = $Rtt32",
-tc_238d91d2, TypeST>, Enc_50b5ac, AddrModeRel {
+tc_362c6592, TypeST>, Enc_50b5ac, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -22710,7 +22802,7 @@ def S4_pstorerdf_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
 "if (!$Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_5274e61a, TypeST>, Enc_1a9974, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_1a9974, AddrModeRel {
 let Inst{31-21} = 0b00110101110;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -22725,7 +22817,7 @@ def S4_pstorerdfnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
 "if (!$Pv4.new) memd(#$Ii) = $Rtt32",
-tc_66888ded, TypeST>, Enc_50b5ac, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_50b5ac, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -22750,7 +22842,7 @@ def S4_pstorerdfnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
 "if (!$Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
-tc_f86c328a, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_57a33e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000110110;
 let isPredicated = 1;
@@ -22772,7 +22864,7 @@ def S4_pstorerdfnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
 "if (!$Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_3e07fb90, TypeST>, Enc_1a9974, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_1a9974, AddrModeRel {
 let Inst{31-21} = 0b00110111110;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -22788,7 +22880,7 @@ def S4_pstorerdfnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "if (!$Pv4.new) memd($Rs32) = $Rtt32",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -22796,7 +22888,7 @@ def S4_pstorerdt_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
 "if ($Pv4) memd(#$Ii) = $Rtt32",
-tc_238d91d2, TypeST>, Enc_50b5ac, AddrModeRel {
+tc_362c6592, TypeST>, Enc_50b5ac, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -22819,7 +22911,7 @@ def S4_pstorerdt_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
 "if ($Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_5274e61a, TypeST>, Enc_1a9974, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_1a9974, AddrModeRel {
 let Inst{31-21} = 0b00110100110;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -22833,7 +22925,7 @@ def S4_pstorerdtnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
 "if ($Pv4.new) memd(#$Ii) = $Rtt32",
-tc_66888ded, TypeST>, Enc_50b5ac, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_50b5ac, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -22857,7 +22949,7 @@ def S4_pstorerdtnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
 "if ($Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
-tc_f86c328a, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_57a33e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000010110;
 let isPredicated = 1;
@@ -22878,7 +22970,7 @@ def S4_pstorerdtnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
 "if ($Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_3e07fb90, TypeST>, Enc_1a9974, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_1a9974, AddrModeRel {
 let Inst{31-21} = 0b00110110110;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -22893,7 +22985,7 @@ def S4_pstorerdtnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "if ($Pv4.new) memd($Rs32) = $Rtt32",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -22901,7 +22993,7 @@ def S4_pstorerff_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memh(#$Ii) = $Rt32.h",
-tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -22925,7 +23017,7 @@ def S4_pstorerff_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110101011;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -22940,7 +23032,7 @@ def S4_pstorerffnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memh(#$Ii) = $Rt32.h",
-tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -22965,7 +23057,7 @@ def S4_pstorerffnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
-tc_f86c328a, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000110011;
 let isPredicated = 1;
@@ -22987,7 +23079,7 @@ def S4_pstorerffnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110111011;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -23003,7 +23095,7 @@ def S4_pstorerffnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pv4.new) memh($Rs32) = $Rt32.h",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -23011,7 +23103,7 @@ def S4_pstorerft_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memh(#$Ii) = $Rt32.h",
-tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -23034,7 +23126,7 @@ def S4_pstorerft_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110100011;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -23048,7 +23140,7 @@ def S4_pstorerftnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memh(#$Ii) = $Rt32.h",
-tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -23072,7 +23164,7 @@ def S4_pstorerftnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
-tc_f86c328a, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000010011;
 let isPredicated = 1;
@@ -23093,7 +23185,7 @@ def S4_pstorerftnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110110011;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -23108,7 +23200,7 @@ def S4_pstorerftnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pv4.new) memh($Rs32) = $Rt32.h",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -23116,7 +23208,7 @@ def S4_pstorerhf_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memh(#$Ii) = $Rt32",
-tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -23141,7 +23233,7 @@ def S4_pstorerhf_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110101010;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -23157,7 +23249,7 @@ def S4_pstorerhfnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memh(#$Ii) = $Rt32",
-tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -23183,7 +23275,7 @@ def S4_pstorerhfnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32",
-tc_f86c328a, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000110010;
 let isPredicated = 1;
@@ -23206,7 +23298,7 @@ def S4_pstorerhfnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110111010;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -23223,7 +23315,7 @@ def S4_pstorerhfnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pv4.new) memh($Rs32) = $Rt32",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -23231,7 +23323,7 @@ def S4_pstorerhnewf_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memh(#$Ii) = $Nt8.new",
-tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel {
+tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b001;
@@ -23259,7 +23351,7 @@ def S4_pstorerhnewf_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b01;
 let Inst{31-21} = 0b00110101101;
 let isPredicated = 1;
@@ -23279,7 +23371,7 @@ def S4_pstorerhnewfnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memh(#$Ii) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b101;
@@ -23308,7 +23400,7 @@ def S4_pstorerhnewfnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeV2LDST>, Enc_f44229, AddrModeRel {
+tc_c79a189f, TypeV2LDST>, Enc_f44229, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b01;
 let Inst{31-21} = 0b01000110101;
@@ -23335,7 +23427,7 @@ def S4_pstorerhnewfnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b01;
 let Inst{31-21} = 0b00110111101;
 let isPredicated = 1;
@@ -23356,7 +23448,7 @@ def S4_pstorerhnewfnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if (!$Pv4.new) memh($Rs32) = $Nt8.new",
-tc_e7d02c66, TypeMAPPING> {
+tc_c79a189f, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -23365,7 +23457,7 @@ def S4_pstorerhnewt_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memh(#$Ii) = $Nt8.new",
-tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel {
+tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b001;
@@ -23392,7 +23484,7 @@ def S4_pstorerhnewt_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b01;
 let Inst{31-21} = 0b00110100101;
 let isPredicated = 1;
@@ -23411,7 +23503,7 @@ def S4_pstorerhnewtnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memh(#$Ii) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b101;
@@ -23439,7 +23531,7 @@ def S4_pstorerhnewtnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeV2LDST>, Enc_f44229, AddrModeRel {
+tc_c79a189f, TypeV2LDST>, Enc_f44229, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b01;
 let Inst{31-21} = 0b01000010101;
@@ -23465,7 +23557,7 @@ def S4_pstorerhnewtnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b01;
 let Inst{31-21} = 0b00110110101;
 let isPredicated = 1;
@@ -23485,7 +23577,7 @@ def S4_pstorerhnewtnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if ($Pv4.new) memh($Rs32) = $Nt8.new",
-tc_e7d02c66, TypeMAPPING> {
+tc_c79a189f, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -23494,7 +23586,7 @@ def S4_pstorerht_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memh(#$Ii) = $Rt32",
-tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -23518,7 +23610,7 @@ def S4_pstorerht_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110100010;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -23533,7 +23625,7 @@ def S4_pstorerhtnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memh(#$Ii) = $Rt32",
-tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -23558,7 +23650,7 @@ def S4_pstorerhtnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32",
-tc_f86c328a, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000010010;
 let isPredicated = 1;
@@ -23580,7 +23672,7 @@ def S4_pstorerhtnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110110010;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -23596,7 +23688,7 @@ def S4_pstorerhtnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pv4.new) memh($Rs32) = $Rt32",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -23604,7 +23696,7 @@ def S4_pstorerif_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memw(#$Ii) = $Rt32",
-tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -23629,7 +23721,7 @@ def S4_pstorerif_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110101100;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -23645,7 +23737,7 @@ def S4_pstorerifnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memw(#$Ii) = $Rt32",
-tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -23671,7 +23763,7 @@ def S4_pstorerifnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memw($Rs32+#$Ii) = $Rt32",
-tc_f86c328a, TypeV2LDST>, Enc_397f23, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_397f23, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000110100;
 let isPredicated = 1;
@@ -23694,7 +23786,7 @@ def S4_pstorerifnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110111100;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -23711,7 +23803,7 @@ def S4_pstorerifnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if (!$Pv4.new) memw($Rs32) = $Rt32",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -23719,7 +23811,7 @@ def S4_pstorerinewf_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memw(#$Ii) = $Nt8.new",
-tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel {
+tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b010;
@@ -23747,7 +23839,7 @@ def S4_pstorerinewf_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b10;
 let Inst{31-21} = 0b00110101101;
 let isPredicated = 1;
@@ -23767,7 +23859,7 @@ def S4_pstorerinewfnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memw(#$Ii) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b1;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b110;
@@ -23796,7 +23888,7 @@ def S4_pstorerinewfnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+tc_c79a189f, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b10;
 let Inst{31-21} = 0b01000110101;
@@ -23823,7 +23915,7 @@ def S4_pstorerinewfnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b10;
 let Inst{31-21} = 0b00110111101;
 let isPredicated = 1;
@@ -23844,7 +23936,7 @@ def S4_pstorerinewfnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if (!$Pv4.new) memw($Rs32) = $Nt8.new",
-tc_e7d02c66, TypeMAPPING> {
+tc_c79a189f, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -23853,7 +23945,7 @@ def S4_pstorerinewt_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memw(#$Ii) = $Nt8.new",
-tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel {
+tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b010;
@@ -23880,7 +23972,7 @@ def S4_pstorerinewt_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b10;
 let Inst{31-21} = 0b00110100101;
 let isPredicated = 1;
@@ -23899,7 +23991,7 @@ def S4_pstorerinewtnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memw(#$Ii) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-11} = 0b110;
@@ -23927,7 +24019,7 @@ def S4_pstorerinewtnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
-tc_e7d02c66, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+tc_c79a189f, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{12-11} = 0b10;
 let Inst{31-21} = 0b01000010101;
@@ -23953,7 +24045,7 @@ def S4_pstorerinewtnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
 let Inst{4-3} = 0b10;
 let Inst{31-21} = 0b00110110101;
 let isPredicated = 1;
@@ -23973,7 +24065,7 @@ def S4_pstorerinewtnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
 "if ($Pv4.new) memw($Rs32) = $Nt8.new",
-tc_e7d02c66, TypeMAPPING> {
+tc_c79a189f, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 let opNewValue = 2;
@@ -23982,7 +24074,7 @@ def S4_pstorerit_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memw(#$Ii) = $Rt32",
-tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
@@ -24006,7 +24098,7 @@ def S4_pstorerit_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel {
+tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110100100;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -24021,7 +24113,7 @@ def S4_pstoreritnew_abs : HInst<
 (outs),
 (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memw(#$Ii) = $Rt32",
-tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
@@ -24046,7 +24138,7 @@ def S4_pstoreritnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memw($Rs32+#$Ii) = $Rt32",
-tc_f86c328a, TypeV2LDST>, Enc_397f23, AddrModeRel {
+tc_da97ee82, TypeV2LDST>, Enc_397f23, AddrModeRel {
 let Inst{2-2} = 0b0;
 let Inst{31-21} = 0b01000010100;
 let isPredicated = 1;
@@ -24068,7 +24160,7 @@ def S4_pstoreritnew_rr : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel {
+tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
 let Inst{31-21} = 0b00110110100;
 let isPredicated = 1;
 let addrMode = BaseRegOffset;
@@ -24084,7 +24176,7 @@ def S4_pstoreritnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
 "if ($Pv4.new) memw($Rs32) = $Rt32",
-tc_f86c328a, TypeMAPPING> {
+tc_da97ee82, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24092,7 +24184,7 @@ def S4_stored_locked : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "memd_locked($Rs32,$Pd4) = $Rtt32",
-tc_1372bca1, TypeST>, Enc_d7dc10 {
+tc_5abb5e3f, TypeST>, Enc_d7dc10 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10100000111;
@@ -24105,7 +24197,7 @@ def S4_storeirb_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
 "memb($Rs32+#$Ii) = #$II",
-tc_05b6c987, TypeST>, Enc_8203bb, PredNewRel {
+tc_b83e6d73, TypeST>, Enc_8203bb, PredNewRel {
 let Inst{31-21} = 0b00111100000;
 let addrMode = BaseImmOffset;
 let accessSize = ByteAccess;
@@ -24124,7 +24216,7 @@ def S4_storeirb_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, s8_0Imm:$II),
 "memb($Rs32) = #$II",
-tc_05b6c987, TypeMAPPING> {
+tc_b83e6d73, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24132,7 +24224,7 @@ def S4_storeirbf_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
 "if (!$Pv4) memb($Rs32+#$Ii) = #$II",
-tc_8b15472a, TypeST>, Enc_d7a65e, PredNewRel {
+tc_0b2be201, TypeST>, Enc_d7a65e, PredNewRel {
 let Inst{31-21} = 0b00111000100;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -24152,7 +24244,7 @@ def S4_storeirbf_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if (!$Pv4) memb($Rs32) = #$II",
-tc_8b15472a, TypeMAPPING> {
+tc_0b2be201, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24160,7 +24252,7 @@ def S4_storeirbfnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
 "if (!$Pv4.new) memb($Rs32+#$Ii) = #$II",
-tc_f86c328a, TypeST>, Enc_d7a65e, PredNewRel {
+tc_c4f596e3, TypeST>, Enc_d7a65e, PredNewRel {
 let Inst{31-21} = 0b00111001100;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -24181,7 +24273,7 @@ def S4_storeirbfnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if (!$Pv4.new) memb($Rs32) = #$II",
-tc_f86c328a, TypeMAPPING> {
+tc_c4f596e3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24189,7 +24281,7 @@ def S4_storeirbt_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
 "if ($Pv4) memb($Rs32+#$Ii) = #$II",
-tc_8b15472a, TypeST>, Enc_d7a65e, PredNewRel {
+tc_0b2be201, TypeST>, Enc_d7a65e, PredNewRel {
 let Inst{31-21} = 0b00111000000;
 let isPredicated = 1;
 let addrMode = BaseImmOffset;
@@ -24208,7 +24300,7 @@ def S4_storeirbt_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if ($Pv4) memb($Rs32) = #$II",
-tc_8b15472a, TypeMAPPING> {
+tc_0b2be201, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24216,7 +24308,7 @@ def S4_storeirbtnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
 "if ($Pv4.new) memb($Rs32+#$Ii) = #$II",
-tc_f86c328a, TypeST>, Enc_d7a65e, PredNewRel {
+tc_c4f596e3, TypeST>, Enc_d7a65e, PredNewRel {
 let Inst{31-21} = 0b00111001000;
 let isPredicated = 1;
 let addrMode = BaseImmOffset;
@@ -24236,7 +24328,7 @@ def S4_storeirbtnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if ($Pv4.new) memb($Rs32) = #$II",
-tc_f86c328a, TypeMAPPING> {
+tc_c4f596e3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24244,7 +24336,7 @@ def S4_storeirh_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
 "memh($Rs32+#$Ii) = #$II",
-tc_05b6c987, TypeST>, Enc_a803e0, PredNewRel {
+tc_b83e6d73, TypeST>, Enc_a803e0, PredNewRel {
 let Inst{31-21} = 0b00111100001;
 let addrMode = BaseImmOffset;
 let accessSize = HalfWordAccess;
@@ -24263,7 +24355,7 @@ def S4_storeirh_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, s8_0Imm:$II),
 "memh($Rs32) = #$II",
-tc_05b6c987, TypeMAPPING> {
+tc_b83e6d73, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24271,7 +24363,7 @@ def S4_storeirhf_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
 "if (!$Pv4) memh($Rs32+#$Ii) = #$II",
-tc_8b15472a, TypeST>, Enc_f20719, PredNewRel {
+tc_0b2be201, TypeST>, Enc_f20719, PredNewRel {
 let Inst{31-21} = 0b00111000101;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -24291,7 +24383,7 @@ def S4_storeirhf_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if (!$Pv4) memh($Rs32) = #$II",
-tc_8b15472a, TypeMAPPING> {
+tc_0b2be201, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24299,7 +24391,7 @@ def S4_storeirhfnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
 "if (!$Pv4.new) memh($Rs32+#$Ii) = #$II",
-tc_f86c328a, TypeST>, Enc_f20719, PredNewRel {
+tc_c4f596e3, TypeST>, Enc_f20719, PredNewRel {
 let Inst{31-21} = 0b00111001101;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -24320,7 +24412,7 @@ def S4_storeirhfnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if (!$Pv4.new) memh($Rs32) = #$II",
-tc_f86c328a, TypeMAPPING> {
+tc_c4f596e3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24328,7 +24420,7 @@ def S4_storeirht_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
 "if ($Pv4) memh($Rs32+#$Ii) = #$II",
-tc_8b15472a, TypeST>, Enc_f20719, PredNewRel {
+tc_0b2be201, TypeST>, Enc_f20719, PredNewRel {
 let Inst{31-21} = 0b00111000001;
 let isPredicated = 1;
 let addrMode = BaseImmOffset;
@@ -24347,7 +24439,7 @@ def S4_storeirht_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if ($Pv4) memh($Rs32) = #$II",
-tc_8b15472a, TypeMAPPING> {
+tc_0b2be201, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24355,7 +24447,7 @@ def S4_storeirhtnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
 "if ($Pv4.new) memh($Rs32+#$Ii) = #$II",
-tc_f86c328a, TypeST>, Enc_f20719, PredNewRel {
+tc_c4f596e3, TypeST>, Enc_f20719, PredNewRel {
 let Inst{31-21} = 0b00111001001;
 let isPredicated = 1;
 let addrMode = BaseImmOffset;
@@ -24375,7 +24467,7 @@ def S4_storeirhtnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if ($Pv4.new) memh($Rs32) = #$II",
-tc_f86c328a, TypeMAPPING> {
+tc_c4f596e3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24383,7 +24475,7 @@ def S4_storeiri_io : HInst<
 (outs),
 (ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
 "memw($Rs32+#$Ii) = #$II",
-tc_05b6c987, TypeST>, Enc_f37377, PredNewRel {
+tc_b83e6d73, TypeST>, Enc_f37377, PredNewRel {
 let Inst{31-21} = 0b00111100010;
 let addrMode = BaseImmOffset;
 let accessSize = WordAccess;
@@ -24402,7 +24494,7 @@ def S4_storeiri_zomap : HInst<
 (outs),
 (ins IntRegs:$Rs32, s8_0Imm:$II),
 "memw($Rs32) = #$II",
-tc_05b6c987, TypeMAPPING> {
+tc_b83e6d73, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24410,7 +24502,7 @@ def S4_storeirif_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
 "if (!$Pv4) memw($Rs32+#$Ii) = #$II",
-tc_8b15472a, TypeST>, Enc_5ccba9, PredNewRel {
+tc_0b2be201, TypeST>, Enc_5ccba9, PredNewRel {
 let Inst{31-21} = 0b00111000110;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -24430,7 +24522,7 @@ def S4_storeirif_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if (!$Pv4) memw($Rs32) = #$II",
-tc_8b15472a, TypeMAPPING> {
+tc_0b2be201, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24438,7 +24530,7 @@ def S4_storeirifnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
 "if (!$Pv4.new) memw($Rs32+#$Ii) = #$II",
-tc_f86c328a, TypeST>, Enc_5ccba9, PredNewRel {
+tc_c4f596e3, TypeST>, Enc_5ccba9, PredNewRel {
 let Inst{31-21} = 0b00111001110;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -24459,7 +24551,7 @@ def S4_storeirifnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if (!$Pv4.new) memw($Rs32) = #$II",
-tc_f86c328a, TypeMAPPING> {
+tc_c4f596e3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24467,7 +24559,7 @@ def S4_storeirit_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
 "if ($Pv4) memw($Rs32+#$Ii) = #$II",
-tc_8b15472a, TypeST>, Enc_5ccba9, PredNewRel {
+tc_0b2be201, TypeST>, Enc_5ccba9, PredNewRel {
 let Inst{31-21} = 0b00111000010;
 let isPredicated = 1;
 let addrMode = BaseImmOffset;
@@ -24486,7 +24578,7 @@ def S4_storeirit_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if ($Pv4) memw($Rs32) = #$II",
-tc_8b15472a, TypeMAPPING> {
+tc_0b2be201, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24494,7 +24586,7 @@ def S4_storeiritnew_io : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
 "if ($Pv4.new) memw($Rs32+#$Ii) = #$II",
-tc_f86c328a, TypeST>, Enc_5ccba9, PredNewRel {
+tc_c4f596e3, TypeST>, Enc_5ccba9, PredNewRel {
 let Inst{31-21} = 0b00111001010;
 let isPredicated = 1;
 let addrMode = BaseImmOffset;
@@ -24514,7 +24606,7 @@ def S4_storeiritnew_zomap : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
 "if ($Pv4.new) memw($Rs32) = #$II",
-tc_f86c328a, TypeMAPPING> {
+tc_c4f596e3, TypeMAPPING> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -24522,7 +24614,7 @@ def S4_storerb_ap : HInst<
 (outs IntRegs:$Re32),
 (ins u32_0Imm:$II, IntRegs:$Rt32),
 "memb($Re32=#$II) = $Rt32",
-tc_66888ded, TypeST>, Enc_8bcba4, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel {
 let Inst{7-6} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10101011000;
@@ -24543,7 +24635,7 @@ def S4_storerb_rr : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_d9709180, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111011000;
 let addrMode = BaseRegOffset;
@@ -24559,7 +24651,7 @@ def S4_storerb_ur : HInst<
 (outs),
 (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
 "memb($Ru32<<#$Ii+#$II) = $Rt32",
-tc_0dc560de, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
 let Inst{7-7} = 0b1;
 let Inst{31-21} = 0b10101101000;
 let addrMode = BaseLongOffset;
@@ -24581,7 +24673,7 @@ def S4_storerbnew_ap : HInst<
 (outs IntRegs:$Re32),
 (ins u32_0Imm:$II, IntRegs:$Nt8),
 "memb($Re32=#$II) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_724154, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel {
 let Inst{7-6} = 0b10;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b10101011101;
@@ -24605,7 +24697,7 @@ def S4_storerbnew_rr : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b166348b, TypeST>, Enc_c6220b, AddrModeRel {
+tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel {
 let Inst{6-3} = 0b0000;
 let Inst{31-21} = 0b00111011101;
 let addrMode = BaseRegOffset;
@@ -24624,7 +24716,7 @@ def S4_storerbnew_ur : HInst<
 (outs),
 (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
 "memb($Ru32<<#$Ii+#$II) = $Nt8.new",
-tc_a8acdac0, TypeST>, Enc_7eb485, AddrModeRel {
+tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel {
 let Inst{7-7} = 0b1;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b10101101101;
@@ -24649,7 +24741,7 @@ def S4_storerd_ap : HInst<
 (outs IntRegs:$Re32),
 (ins u32_0Imm:$II, DoubleRegs:$Rtt32),
 "memd($Re32=#$II) = $Rtt32",
-tc_66888ded, TypeST>, Enc_c7a204 {
+tc_da4a37ed, TypeST>, Enc_c7a204 {
 let Inst{7-6} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10101011110;
@@ -24669,7 +24761,7 @@ def S4_storerd_rr : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
 "memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_d9709180, TypeST>, Enc_55355c, AddrModeRel, ImmRegShl {
+tc_5aee39f7, TypeST>, Enc_55355c, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111011110;
 let addrMode = BaseRegOffset;
@@ -24684,7 +24776,7 @@ def S4_storerd_ur : HInst<
 (outs),
 (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, DoubleRegs:$Rtt32),
 "memd($Ru32<<#$Ii+#$II) = $Rtt32",
-tc_0dc560de, TypeST>, Enc_f79415, AddrModeRel, ImmRegShl {
+tc_14b272fa, TypeST>, Enc_f79415, AddrModeRel, ImmRegShl {
 let Inst{7-7} = 0b1;
 let Inst{31-21} = 0b10101101110;
 let addrMode = BaseLongOffset;
@@ -24705,7 +24797,7 @@ def S4_storerf_ap : HInst<
 (outs IntRegs:$Re32),
 (ins u32_0Imm:$II, IntRegs:$Rt32),
 "memh($Re32=#$II) = $Rt32.h",
-tc_66888ded, TypeST>, Enc_8bcba4 {
+tc_da4a37ed, TypeST>, Enc_8bcba4 {
 let Inst{7-6} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10101011011;
@@ -24725,7 +24817,7 @@ def S4_storerf_rr : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_d9709180, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111011011;
 let addrMode = BaseRegOffset;
@@ -24740,7 +24832,7 @@ def S4_storerf_ur : HInst<
 (outs),
 (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
 "memh($Ru32<<#$Ii+#$II) = $Rt32.h",
-tc_0dc560de, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
 let Inst{7-7} = 0b1;
 let Inst{31-21} = 0b10101101011;
 let addrMode = BaseLongOffset;
@@ -24761,7 +24853,7 @@ def S4_storerh_ap : HInst<
 (outs IntRegs:$Re32),
 (ins u32_0Imm:$II, IntRegs:$Rt32),
 "memh($Re32=#$II) = $Rt32",
-tc_66888ded, TypeST>, Enc_8bcba4, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel {
 let Inst{7-6} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10101011010;
@@ -24782,7 +24874,7 @@ def S4_storerh_rr : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_d9709180, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111011010;
 let addrMode = BaseRegOffset;
@@ -24798,7 +24890,7 @@ def S4_storerh_ur : HInst<
 (outs),
 (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
 "memh($Ru32<<#$Ii+#$II) = $Rt32",
-tc_0dc560de, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
 let Inst{7-7} = 0b1;
 let Inst{31-21} = 0b10101101010;
 let addrMode = BaseLongOffset;
@@ -24820,7 +24912,7 @@ def S4_storerhnew_ap : HInst<
 (outs IntRegs:$Re32),
 (ins u32_0Imm:$II, IntRegs:$Nt8),
 "memh($Re32=#$II) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_724154, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel {
 let Inst{7-6} = 0b10;
 let Inst{13-11} = 0b001;
 let Inst{31-21} = 0b10101011101;
@@ -24844,7 +24936,7 @@ def S4_storerhnew_rr : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b166348b, TypeST>, Enc_c6220b, AddrModeRel {
+tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel {
 let Inst{6-3} = 0b0001;
 let Inst{31-21} = 0b00111011101;
 let addrMode = BaseRegOffset;
@@ -24863,7 +24955,7 @@ def S4_storerhnew_ur : HInst<
 (outs),
 (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
 "memh($Ru32<<#$Ii+#$II) = $Nt8.new",
-tc_a8acdac0, TypeST>, Enc_7eb485, AddrModeRel {
+tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel {
 let Inst{7-7} = 0b1;
 let Inst{12-11} = 0b01;
 let Inst{31-21} = 0b10101101101;
@@ -24888,7 +24980,7 @@ def S4_storeri_ap : HInst<
 (outs IntRegs:$Re32),
 (ins u32_0Imm:$II, IntRegs:$Rt32),
 "memw($Re32=#$II) = $Rt32",
-tc_66888ded, TypeST>, Enc_8bcba4, AddrModeRel {
+tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel {
 let Inst{7-6} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10101011100;
@@ -24909,7 +25001,7 @@ def S4_storeri_rr : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
 "memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_d9709180, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
 let Inst{6-5} = 0b00;
 let Inst{31-21} = 0b00111011100;
 let addrMode = BaseRegOffset;
@@ -24925,7 +25017,7 @@ def S4_storeri_ur : HInst<
 (outs),
 (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
 "memw($Ru32<<#$Ii+#$II) = $Rt32",
-tc_0dc560de, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
 let Inst{7-7} = 0b1;
 let Inst{31-21} = 0b10101101100;
 let addrMode = BaseLongOffset;
@@ -24947,7 +25039,7 @@ def S4_storerinew_ap : HInst<
 (outs IntRegs:$Re32),
 (ins u32_0Imm:$II, IntRegs:$Nt8),
 "memw($Re32=#$II) = $Nt8.new",
-tc_53bdb2f6, TypeST>, Enc_724154, AddrModeRel {
+tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel {
 let Inst{7-6} = 0b10;
 let Inst{13-11} = 0b010;
 let Inst{31-21} = 0b10101011101;
@@ -24971,7 +25063,7 @@ def S4_storerinew_rr : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
 "memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b166348b, TypeST>, Enc_c6220b, AddrModeRel {
+tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel {
 let Inst{6-3} = 0b0010;
 let Inst{31-21} = 0b00111011101;
 let addrMode = BaseRegOffset;
@@ -24990,7 +25082,7 @@ def S4_storerinew_ur : HInst<
 (outs),
 (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
 "memw($Ru32<<#$Ii+#$II) = $Nt8.new",
-tc_a8acdac0, TypeST>, Enc_7eb485, AddrModeRel {
+tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel {
 let Inst{7-7} = 0b1;
 let Inst{12-11} = 0b10;
 let Inst{31-21} = 0b10101101101;
@@ -25015,7 +25107,7 @@ def S4_subaddi : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Ru32),
 "$Rd32 = add($Rs32,sub(#$Ii,$Ru32))",
-tc_c74f796f, TypeALU64>, Enc_8b8d61 {
+tc_f675fee8, TypeALU64>, Enc_8b8d61 {
 let Inst{31-23} = 0b110110111;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25030,7 +25122,7 @@ def S4_subi_asl_ri : HInst<
 (outs IntRegs:$Rx32),
 (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
 "$Rx32 = sub(#$Ii,asl($Rx32in,#$II))",
-tc_c74f796f, TypeALU64>, Enc_c31910 {
+tc_f675fee8, TypeALU64>, Enc_c31910 {
 let Inst{2-0} = 0b110;
 let Inst{4-4} = 0b0;
 let Inst{31-24} = 0b11011110;
@@ -25048,7 +25140,7 @@ def S4_subi_lsr_ri : HInst<
 (outs IntRegs:$Rx32),
 (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
 "$Rx32 = sub(#$Ii,lsr($Rx32in,#$II))",
-tc_c74f796f, TypeALU64>, Enc_c31910 {
+tc_f675fee8, TypeALU64>, Enc_c31910 {
 let Inst{2-0} = 0b110;
 let Inst{4-4} = 0b1;
 let Inst{31-24} = 0b11011110;
@@ -25066,7 +25158,7 @@ def S4_vrcrotate : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "$Rdd32 = vrcrotate($Rss32,$Rt32,#$Ii)",
-tc_b9c0b731, TypeS_3op>, Enc_645d54 {
+tc_13bfbcf9, TypeS_3op>, Enc_645d54 {
 let Inst{7-6} = 0b11;
 let Inst{31-21} = 0b11000011110;
 let prefersSlot3 = 1;
@@ -25075,7 +25167,7 @@ def S4_vrcrotate_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
 "$Rxx32 += vrcrotate($Rss32,$Rt32,#$Ii)",
-tc_60571023, TypeS_3op>, Enc_b72622 {
+tc_9debc299, TypeS_3op>, Enc_b72622 {
 let Inst{7-6} = 0b00;
 let Inst{31-21} = 0b11001011101;
 let prefersSlot3 = 1;
@@ -25085,7 +25177,7 @@ def S4_vxaddsubh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vxaddsubh($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeS_3op>, Enc_a56825 {
+tc_779080bf, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001010;
@@ -25096,7 +25188,7 @@ def S4_vxaddsubhr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vxaddsubh($Rss32,$Rtt32):rnd:>>1:sat",
-tc_2b6f77c6, TypeS_3op>, Enc_a56825 {
+tc_002cb246, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001110;
@@ -25107,7 +25199,7 @@ def S4_vxaddsubw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vxaddsubw($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeS_3op>, Enc_a56825 {
+tc_779080bf, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001010;
@@ -25118,7 +25210,7 @@ def S4_vxsubaddh : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vxsubaddh($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeS_3op>, Enc_a56825 {
+tc_779080bf, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001010;
@@ -25129,7 +25221,7 @@ def S4_vxsubaddhr : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vxsubaddh($Rss32,$Rtt32):rnd:>>1:sat",
-tc_2b6f77c6, TypeS_3op>, Enc_a56825 {
+tc_002cb246, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001110;
@@ -25140,7 +25232,7 @@ def S4_vxsubaddw : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vxsubaddw($Rss32,$Rtt32):sat",
-tc_b44c6e2a, TypeS_3op>, Enc_a56825 {
+tc_779080bf, TypeS_3op>, Enc_a56825 {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001010;
@@ -25151,7 +25243,7 @@ def S5_asrhub_rnd_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146 {
+tc_002cb246, TypeS_2op>, Enc_11a146 {
 let Inst{7-5} = 0b100;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10001000011;
@@ -25164,7 +25256,7 @@ def S5_asrhub_rnd_sat_goodsyntax : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
-tc_2b6f77c6, TypeS_2op> {
+tc_002cb246, TypeS_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -25173,7 +25265,7 @@ def S5_asrhub_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):sat",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146 {
+tc_002cb246, TypeS_2op>, Enc_11a146 {
 let Inst{7-5} = 0b101;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10001000011;
@@ -25186,7 +25278,7 @@ def S5_popcountp : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = popcount($Rss32)",
-tc_00afc57e, TypeS_2op>, Enc_90cd8b {
+tc_703e822c, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10001000011;
 let hasNewValue = 1;
@@ -25197,7 +25289,7 @@ def S5_vasrhrnd : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vasrh($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_12b6e9 {
+tc_002cb246, TypeS_2op>, Enc_12b6e9 {
 let Inst{7-5} = 0b000;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10000000001;
@@ -25207,14 +25299,14 @@ def S5_vasrhrnd_goodsyntax : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vasrh($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op> {
+tc_002cb246, TypeS_2op> {
 let isPseudo = 1;
 }
 def S6_allocframe_to_raw : HInst<
 (outs),
 (ins u11_3Imm:$Ii),
 "allocframe(#$Ii)",
-tc_e216a5db, TypeMAPPING>, Requires<[HasV65]> {
+tc_b44ecf75, TypeMAPPING>, Requires<[HasV65]> {
 let isPseudo = 1;
 let isCodeGenOnly = 1;
 }
@@ -25222,7 +25314,7 @@ def S6_rol_i_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = rol($Rss32,#$Ii)",
-tc_55050d58, TypeS_2op>, Enc_5eac98, Requires<[HasV60]> {
+tc_1fc97744, TypeS_2op>, Enc_5eac98, Requires<[HasV60]> {
 let Inst{7-5} = 0b011;
 let Inst{31-21} = 0b10000000000;
 }
@@ -25230,7 +25322,7 @@ def S6_rol_i_p_acc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 += rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
 let Inst{7-5} = 0b111;
 let Inst{31-21} = 0b10000010000;
 let prefersSlot3 = 1;
@@ -25240,7 +25332,7 @@ def S6_rol_i_p_and : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 &= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
 let Inst{7-5} = 0b011;
 let Inst{31-21} = 0b10000010010;
 let prefersSlot3 = 1;
@@ -25250,7 +25342,7 @@ def S6_rol_i_p_nac : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 -= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
 let Inst{7-5} = 0b011;
 let Inst{31-21} = 0b10000010000;
 let prefersSlot3 = 1;
@@ -25260,7 +25352,7 @@ def S6_rol_i_p_or : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 |= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
 let Inst{7-5} = 0b111;
 let Inst{31-21} = 0b10000010010;
 let prefersSlot3 = 1;
@@ -25270,7 +25362,7 @@ def S6_rol_i_p_xacc : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rxx32 ^= rol($Rss32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
 let Inst{7-5} = 0b011;
 let Inst{31-21} = 0b10000010100;
 let prefersSlot3 = 1;
@@ -25280,7 +25372,7 @@ def S6_rol_i_r : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rd32 = rol($Rs32,#$Ii)",
-tc_55050d58, TypeS_2op>, Enc_a05677, Requires<[HasV60]> {
+tc_1fc97744, TypeS_2op>, Enc_a05677, Requires<[HasV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001100000;
@@ -25291,7 +25383,7 @@ def S6_rol_i_r_acc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 += rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110000;
@@ -25304,7 +25396,7 @@ def S6_rol_i_r_and : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 &= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110010;
@@ -25317,7 +25409,7 @@ def S6_rol_i_r_nac : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 -= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110000;
@@ -25330,7 +25422,7 @@ def S6_rol_i_r_or : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 |= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110010;
@@ -25343,7 +25435,7 @@ def S6_rol_i_r_xacc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Rx32 ^= rol($Rs32,#$Ii)",
-tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10001110100;
@@ -25356,7 +25448,7 @@ def S6_vsplatrbp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = vsplatb($Rs32)",
-tc_be706f30, TypeS_2op>, Enc_3a3d62, Requires<[HasV62]> {
+tc_a1c00888, TypeS_2op>, Enc_3a3d62, Requires<[HasV62]> {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000100010;
 }
@@ -25364,7 +25456,7 @@ def S6_vtrunehb_ppp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vtrunehb($Rss32,$Rtt32)",
-tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
+tc_1fc97744, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001100;
@@ -25373,7 +25465,7 @@ def S6_vtrunohb_ppp : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vtrunohb($Rss32,$Rtt32)",
-tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
+tc_1fc97744, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001100;
@@ -25382,7 +25474,7 @@ def SA1_addi : HInst<
 (outs GeneralSubRegs:$Rx16),
 (ins IntRegs:$Rx16in, s32_0Imm:$Ii),
 "$Rx16 = add($Rx16in,#$Ii)",
-tc_609d2efe, TypeSUBINSN>, Enc_93af4c {
+tc_0a705168, TypeSUBINSN>, Enc_93af4c {
 let Inst{12-11} = 0b00;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25399,7 +25491,7 @@ def SA1_addrx : HInst<
 (outs GeneralSubRegs:$Rx16),
 (ins IntRegs:$Rx16in, GeneralSubRegs:$Rs16),
 "$Rx16 = add($Rx16in,$Rs16)",
-tc_609d2efe, TypeSUBINSN>, Enc_0527db {
+tc_0a705168, TypeSUBINSN>, Enc_0527db {
 let Inst{12-8} = 0b11000;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25411,7 +25503,7 @@ def SA1_addsp : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins u6_2Imm:$Ii),
 "$Rd16 = add(r29,#$Ii)",
-tc_a904d137, TypeSUBINSN>, Enc_2df31d {
+tc_9fc3dae0, TypeSUBINSN>, Enc_2df31d {
 let Inst{12-10} = 0b011;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25423,7 +25515,7 @@ def SA1_and1 : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16),
 "$Rd16 = and($Rs16,#1)",
-tc_a904d137, TypeSUBINSN>, Enc_97d666 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
 let Inst{12-8} = 0b10010;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25434,7 +25526,7 @@ def SA1_clrf : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins),
 "if (!p0) $Rd16 = #0",
-tc_1b82a277, TypeSUBINSN>, Enc_1f5ba6 {
+tc_a1123dda, TypeSUBINSN>, Enc_1f5ba6 {
 let Inst{12-4} = 0b110100111;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -25448,7 +25540,7 @@ def SA1_clrfnew : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins),
 "if (!p0.new) $Rd16 = #0",
-tc_e9c822f7, TypeSUBINSN>, Enc_1f5ba6 {
+tc_8b3e402a, TypeSUBINSN>, Enc_1f5ba6 {
 let Inst{12-4} = 0b110100101;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -25463,7 +25555,7 @@ def SA1_clrt : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins),
 "if (p0) $Rd16 = #0",
-tc_1b82a277, TypeSUBINSN>, Enc_1f5ba6 {
+tc_a1123dda, TypeSUBINSN>, Enc_1f5ba6 {
 let Inst{12-4} = 0b110100110;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -25476,7 +25568,7 @@ def SA1_clrtnew : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins),
 "if (p0.new) $Rd16 = #0",
-tc_e9c822f7, TypeSUBINSN>, Enc_1f5ba6 {
+tc_8b3e402a, TypeSUBINSN>, Enc_1f5ba6 {
 let Inst{12-4} = 0b110100100;
 let isPredicated = 1;
 let hasNewValue = 1;
@@ -25490,7 +25582,7 @@ def SA1_cmpeqi : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u2_0Imm:$Ii),
 "p0 = cmp.eq($Rs16,#$Ii)",
-tc_90f3e30c, TypeSUBINSN>, Enc_63eaeb {
+tc_5b7c0967, TypeSUBINSN>, Enc_63eaeb {
 let Inst{3-2} = 0b00;
 let Inst{12-8} = 0b11001;
 let AsmVariantName = "NonParsable";
@@ -25501,7 +25593,7 @@ def SA1_combine0i : HInst<
 (outs GeneralDoubleLow8Regs:$Rdd8),
 (ins u2_0Imm:$Ii),
 "$Rdd8 = combine(#0,#$Ii)",
-tc_a904d137, TypeSUBINSN>, Enc_ed48be {
+tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be {
 let Inst{4-3} = 0b00;
 let Inst{12-7} = 0b111000;
 let hasNewValue = 1;
@@ -25513,7 +25605,7 @@ def SA1_combine1i : HInst<
 (outs GeneralDoubleLow8Regs:$Rdd8),
 (ins u2_0Imm:$Ii),
 "$Rdd8 = combine(#1,#$Ii)",
-tc_a904d137, TypeSUBINSN>, Enc_ed48be {
+tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be {
 let Inst{4-3} = 0b01;
 let Inst{12-7} = 0b111000;
 let hasNewValue = 1;
@@ -25525,7 +25617,7 @@ def SA1_combine2i : HInst<
 (outs GeneralDoubleLow8Regs:$Rdd8),
 (ins u2_0Imm:$Ii),
 "$Rdd8 = combine(#2,#$Ii)",
-tc_a904d137, TypeSUBINSN>, Enc_ed48be {
+tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be {
 let Inst{4-3} = 0b10;
 let Inst{12-7} = 0b111000;
 let hasNewValue = 1;
@@ -25537,7 +25629,7 @@ def SA1_combine3i : HInst<
 (outs GeneralDoubleLow8Regs:$Rdd8),
 (ins u2_0Imm:$Ii),
 "$Rdd8 = combine(#3,#$Ii)",
-tc_a904d137, TypeSUBINSN>, Enc_ed48be {
+tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be {
 let Inst{4-3} = 0b11;
 let Inst{12-7} = 0b111000;
 let hasNewValue = 1;
@@ -25549,7 +25641,7 @@ def SA1_combinerz : HInst<
 (outs GeneralDoubleLow8Regs:$Rdd8),
 (ins GeneralSubRegs:$Rs16),
 "$Rdd8 = combine($Rs16,#0)",
-tc_a904d137, TypeSUBINSN>, Enc_399e12 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_399e12 {
 let Inst{3-3} = 0b1;
 let Inst{12-8} = 0b11101;
 let hasNewValue = 1;
@@ -25561,7 +25653,7 @@ def SA1_combinezr : HInst<
 (outs GeneralDoubleLow8Regs:$Rdd8),
 (ins GeneralSubRegs:$Rs16),
 "$Rdd8 = combine(#0,$Rs16)",
-tc_a904d137, TypeSUBINSN>, Enc_399e12 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_399e12 {
 let Inst{3-3} = 0b0;
 let Inst{12-8} = 0b11101;
 let hasNewValue = 1;
@@ -25573,7 +25665,7 @@ def SA1_dec : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16, n1Const:$n1),
 "$Rd16 = add($Rs16,#$n1)",
-tc_609d2efe, TypeSUBINSN>, Enc_ee5ed0 {
+tc_0a705168, TypeSUBINSN>, Enc_ee5ed0 {
 let Inst{12-8} = 0b10011;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25584,7 +25676,7 @@ def SA1_inc : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16),
 "$Rd16 = add($Rs16,#1)",
-tc_a904d137, TypeSUBINSN>, Enc_97d666 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
 let Inst{12-8} = 0b10001;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25595,7 +25687,7 @@ def SA1_seti : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins u32_0Imm:$Ii),
 "$Rd16 = #$Ii",
-tc_a904d137, TypeSUBINSN>, Enc_e39bb2 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_e39bb2 {
 let Inst{12-10} = 0b010;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25611,7 +25703,7 @@ def SA1_setin1 : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins n1Const:$n1),
 "$Rd16 = #$n1",
-tc_a904d137, TypeSUBINSN>, Enc_7a0ea6 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_7a0ea6 {
 let Inst{12-4} = 0b110100000;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25622,7 +25714,7 @@ def SA1_sxtb : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16),
 "$Rd16 = sxtb($Rs16)",
-tc_a904d137, TypeSUBINSN>, Enc_97d666 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
 let Inst{12-8} = 0b10101;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25633,7 +25725,7 @@ def SA1_sxth : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16),
 "$Rd16 = sxth($Rs16)",
-tc_a904d137, TypeSUBINSN>, Enc_97d666 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
 let Inst{12-8} = 0b10100;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25644,7 +25736,7 @@ def SA1_tfr : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16),
 "$Rd16 = $Rs16",
-tc_a904d137, TypeSUBINSN>, Enc_97d666 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
 let Inst{12-8} = 0b10000;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25655,7 +25747,7 @@ def SA1_zxtb : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16),
 "$Rd16 = and($Rs16,#255)",
-tc_a904d137, TypeSUBINSN>, Enc_97d666 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
 let Inst{12-8} = 0b10111;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25666,7 +25758,7 @@ def SA1_zxth : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16),
 "$Rd16 = zxth($Rs16)",
-tc_a904d137, TypeSUBINSN>, Enc_97d666 {
+tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
 let Inst{12-8} = 0b10110;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25677,7 +25769,7 @@ def SL1_loadri_io : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
 "$Rd16 = memw($Rs16+#$Ii)",
-tc_7f881c76, TypeSUBINSN>, Enc_53dca9 {
+tc_17e0d2cd, TypeSUBINSN>, Enc_53dca9 {
 let Inst{12-12} = 0b0;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25691,7 +25783,7 @@ def SL1_loadrub_io : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
 "$Rd16 = memub($Rs16+#$Ii)",
-tc_7f881c76, TypeSUBINSN>, Enc_c175d0 {
+tc_17e0d2cd, TypeSUBINSN>, Enc_c175d0 {
 let Inst{12-12} = 0b1;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25705,7 +25797,7 @@ def SL2_deallocframe : HInst<
 (outs),
 (ins),
 "deallocframe",
-tc_36c68ad1, TypeSUBINSN>, Enc_e3b0c4 {
+tc_39dfefe8, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111100000000;
 let accessSize = DoubleWordAccess;
 let AsmVariantName = "NonParsable";
@@ -25718,7 +25810,7 @@ def SL2_jumpr31 : HInst<
 (outs),
 (ins),
 "jumpr r31",
-tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 {
+tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111111000000;
 let isTerminator = 1;
 let isIndirectBranch = 1;
@@ -25733,7 +25825,7 @@ def SL2_jumpr31_f : HInst<
 (outs),
 (ins),
 "if (!p0) jumpr r31",
-tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 {
+tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111111000101;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -25751,7 +25843,7 @@ def SL2_jumpr31_fnew : HInst<
 (outs),
 (ins),
 "if (!p0.new) jumpr:nt r31",
-tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 {
+tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111111000111;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -25770,7 +25862,7 @@ def SL2_jumpr31_t : HInst<
 (outs),
 (ins),
 "if (p0) jumpr r31",
-tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 {
+tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111111000100;
 let isPredicated = 1;
 let isTerminator = 1;
@@ -25787,7 +25879,7 @@ def SL2_jumpr31_tnew : HInst<
 (outs),
 (ins),
 "if (p0.new) jumpr:nt r31",
-tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 {
+tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111111000110;
 let isPredicated = 1;
 let isTerminator = 1;
@@ -25805,7 +25897,7 @@ def SL2_loadrb_io : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16, u3_0Imm:$Ii),
 "$Rd16 = memb($Rs16+#$Ii)",
-tc_7f881c76, TypeSUBINSN>, Enc_2fbf3c {
+tc_17e0d2cd, TypeSUBINSN>, Enc_2fbf3c {
 let Inst{12-11} = 0b10;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25819,7 +25911,7 @@ def SL2_loadrd_sp : HInst<
 (outs GeneralDoubleLow8Regs:$Rdd8),
 (ins u5_3Imm:$Ii),
 "$Rdd8 = memd(r29+#$Ii)",
-tc_9c98e8af, TypeSUBINSN>, Enc_86a14b {
+tc_c4db48cb, TypeSUBINSN>, Enc_86a14b {
 let Inst{12-8} = 0b11110;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25834,7 +25926,7 @@ def SL2_loadrh_io : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
 "$Rd16 = memh($Rs16+#$Ii)",
-tc_7f881c76, TypeSUBINSN>, Enc_2bae10 {
+tc_17e0d2cd, TypeSUBINSN>, Enc_2bae10 {
 let Inst{12-11} = 0b00;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25848,7 +25940,7 @@ def SL2_loadri_sp : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins u5_2Imm:$Ii),
 "$Rd16 = memw(r29+#$Ii)",
-tc_9c98e8af, TypeSUBINSN>, Enc_51635c {
+tc_c4db48cb, TypeSUBINSN>, Enc_51635c {
 let Inst{12-9} = 0b1110;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25863,7 +25955,7 @@ def SL2_loadruh_io : HInst<
 (outs GeneralSubRegs:$Rd16),
 (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
 "$Rd16 = memuh($Rs16+#$Ii)",
-tc_7f881c76, TypeSUBINSN>, Enc_2bae10 {
+tc_17e0d2cd, TypeSUBINSN>, Enc_2bae10 {
 let Inst{12-11} = 0b01;
 let hasNewValue = 1;
 let opNewValue = 0;
@@ -25877,7 +25969,7 @@ def SL2_return : HInst<
 (outs),
 (ins),
 "dealloc_return",
-tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 {
+tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111101000000;
 let isTerminator = 1;
 let isIndirectBranch = 1;
@@ -25895,7 +25987,7 @@ def SL2_return_f : HInst<
 (outs),
 (ins),
 "if (!p0) dealloc_return",
-tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 {
+tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111101000101;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -25916,7 +26008,7 @@ def SL2_return_fnew : HInst<
 (outs),
 (ins),
 "if (!p0.new) dealloc_return:nt",
-tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 {
+tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111101000111;
 let isPredicated = 1;
 let isPredicatedFalse = 1;
@@ -25938,7 +26030,7 @@ def SL2_return_t : HInst<
 (outs),
 (ins),
 "if (p0) dealloc_return",
-tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 {
+tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111101000100;
 let isPredicated = 1;
 let isTerminator = 1;
@@ -25958,7 +26050,7 @@ def SL2_return_tnew : HInst<
 (outs),
 (ins),
 "if (p0.new) dealloc_return:nt",
-tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 {
+tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
 let Inst{12-0} = 0b1111101000110;
 let isPredicated = 1;
 let isTerminator = 1;
@@ -25979,7 +26071,7 @@ def SS1_storeb_io : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii, GeneralSubRegs:$Rt16),
 "memb($Rs16+#$Ii) = $Rt16",
-tc_05b6c987, TypeSUBINSN>, Enc_b38ffc {
+tc_30b9bb4a, TypeSUBINSN>, Enc_b38ffc {
 let Inst{12-12} = 0b1;
 let addrMode = BaseImmOffset;
 let accessSize = ByteAccess;
@@ -25991,7 +26083,7 @@ def SS1_storew_io : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii, GeneralSubRegs:$Rt16),
 "memw($Rs16+#$Ii) = $Rt16",
-tc_05b6c987, TypeSUBINSN>, Enc_f55a0c {
+tc_30b9bb4a, TypeSUBINSN>, Enc_f55a0c {
 let Inst{12-12} = 0b0;
 let addrMode = BaseImmOffset;
 let accessSize = WordAccess;
@@ -26003,7 +26095,7 @@ def SS2_allocframe : HInst<
 (outs),
 (ins u5_3Imm:$Ii),
 "allocframe(#$Ii)",
-tc_0fc1ae07, TypeSUBINSN>, Enc_6f70ca {
+tc_49a8207d, TypeSUBINSN>, Enc_6f70ca {
 let Inst{3-0} = 0b0000;
 let Inst{12-9} = 0b1110;
 let addrMode = BaseImmOffset;
@@ -26018,7 +26110,7 @@ def SS2_storebi0 : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
 "memb($Rs16+#$Ii) = #0",
-tc_57288781, TypeSUBINSN>, Enc_84d359 {
+tc_89e94ad3, TypeSUBINSN>, Enc_84d359 {
 let Inst{12-8} = 0b10010;
 let addrMode = BaseImmOffset;
 let accessSize = ByteAccess;
@@ -26030,7 +26122,7 @@ def SS2_storebi1 : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
 "memb($Rs16+#$Ii) = #1",
-tc_57288781, TypeSUBINSN>, Enc_84d359 {
+tc_89e94ad3, TypeSUBINSN>, Enc_84d359 {
 let Inst{12-8} = 0b10011;
 let addrMode = BaseImmOffset;
 let accessSize = ByteAccess;
@@ -26042,7 +26134,7 @@ def SS2_stored_sp : HInst<
 (outs),
 (ins s6_3Imm:$Ii, GeneralDoubleLow8Regs:$Rtt8),
 "memd(r29+#$Ii) = $Rtt8",
-tc_a788683e, TypeSUBINSN>, Enc_b8309d {
+tc_0371abea, TypeSUBINSN>, Enc_b8309d {
 let Inst{12-9} = 0b0101;
 let addrMode = BaseImmOffset;
 let accessSize = DoubleWordAccess;
@@ -26055,7 +26147,7 @@ def SS2_storeh_io : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii, GeneralSubRegs:$Rt16),
 "memh($Rs16+#$Ii) = $Rt16",
-tc_05b6c987, TypeSUBINSN>, Enc_625deb {
+tc_30b9bb4a, TypeSUBINSN>, Enc_625deb {
 let Inst{12-11} = 0b00;
 let addrMode = BaseImmOffset;
 let accessSize = HalfWordAccess;
@@ -26067,7 +26159,7 @@ def SS2_storew_sp : HInst<
 (outs),
 (ins u5_2Imm:$Ii, GeneralSubRegs:$Rt16),
 "memw(r29+#$Ii) = $Rt16",
-tc_a788683e, TypeSUBINSN>, Enc_87c142 {
+tc_0371abea, TypeSUBINSN>, Enc_87c142 {
 let Inst{12-9} = 0b0100;
 let addrMode = BaseImmOffset;
 let accessSize = WordAccess;
@@ -26080,7 +26172,7 @@ def SS2_storewi0 : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
 "memw($Rs16+#$Ii) = #0",
-tc_57288781, TypeSUBINSN>, Enc_a6ce9c {
+tc_89e94ad3, TypeSUBINSN>, Enc_a6ce9c {
 let Inst{12-8} = 0b10000;
 let addrMode = BaseImmOffset;
 let accessSize = WordAccess;
@@ -26092,7 +26184,7 @@ def SS2_storewi1 : HInst<
 (outs),
 (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
 "memw($Rs16+#$Ii) = #1",
-tc_57288781, TypeSUBINSN>, Enc_a6ce9c {
+tc_89e94ad3, TypeSUBINSN>, Enc_a6ce9c {
 let Inst{12-8} = 0b10001;
 let addrMode = BaseImmOffset;
 let accessSize = WordAccess;
@@ -26230,7 +26322,7 @@ def V6_extractw : HInst<
 (outs IntRegs:$Rd32),
 (ins HvxVR:$Vu32, IntRegs:$Rs32),
 "$Rd32 = vextract($Vu32,$Rs32)",
-tc_9777e6bf, TypeLD>, Enc_50e578, Requires<[UseHVXV60]> {
+tc_540c3da3, TypeLD>, Enc_50e578, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10010010000;
@@ -26451,7 +26543,7 @@ def V6_lvsplatb : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32),
 "$Vd32.b = vsplat($Rt32)",
-tc_6b78cf13, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> {
+tc_c4edf264, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b00011001110;
 let hasNewValue = 1;
@@ -26462,7 +26554,7 @@ def V6_lvsplath : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32),
 "$Vd32.h = vsplat($Rt32)",
-tc_6b78cf13, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> {
+tc_c4edf264, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b00011001110;
 let hasNewValue = 1;
@@ -26473,7 +26565,7 @@ def V6_lvsplatw : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32),
 "$Vd32 = vsplat($Rt32)",
-tc_6b78cf13, TypeCVI_VX_LATE>, Enc_a5ed8a, Requires<[UseHVXV60]> {
+tc_c4edf264, TypeCVI_VX_LATE>, Enc_a5ed8a, Requires<[UseHVXV60]> {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b00011001101;
 let hasNewValue = 1;
@@ -26484,7 +26576,7 @@ def V6_pred_and : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxQR:$Qs4, HvxQR:$Qt4),
 "$Qd4 = and($Qs4,$Qt4)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000000;
 let Inst{13-10} = 0b0000;
 let Inst{21-16} = 0b000011;
@@ -26497,7 +26589,7 @@ def V6_pred_and_n : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxQR:$Qs4, HvxQR:$Qt4),
 "$Qd4 = and($Qs4,!$Qt4)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000101;
 let Inst{13-10} = 0b0000;
 let Inst{21-16} = 0b000011;
@@ -26510,7 +26602,7 @@ def V6_pred_not : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxQR:$Qs4),
 "$Qd4 = not($Qs4)",
-tc_71337255, TypeCVI_VA>, Enc_bfbf03, Requires<[UseHVXV60]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_bfbf03, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000010;
 let Inst{13-10} = 0b0000;
 let Inst{31-16} = 0b0001111000000011;
@@ -26522,7 +26614,7 @@ def V6_pred_or : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxQR:$Qs4, HvxQR:$Qt4),
 "$Qd4 = or($Qs4,$Qt4)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000001;
 let Inst{13-10} = 0b0000;
 let Inst{21-16} = 0b000011;
@@ -26535,7 +26627,7 @@ def V6_pred_or_n : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxQR:$Qs4, HvxQR:$Qt4),
 "$Qd4 = or($Qs4,!$Qt4)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000100;
 let Inst{13-10} = 0b0000;
 let Inst{21-16} = 0b000011;
@@ -26548,7 +26640,7 @@ def V6_pred_scalar2 : HInst<
 (outs HvxQR:$Qd4),
 (ins IntRegs:$Rt32),
 "$Qd4 = vsetq($Rt32)",
-tc_4105d6b5, TypeCVI_VP>, Enc_7222b7, Requires<[UseHVXV60]> {
+tc_5bf8afbb, TypeCVI_VP>, Enc_7222b7, Requires<[UseHVXV60]> {
 let Inst{13-2} = 0b000000010001;
 let Inst{31-21} = 0b00011001101;
 let hasNewValue = 1;
@@ -26559,7 +26651,7 @@ def V6_pred_scalar2v2 : HInst<
 (outs HvxQR:$Qd4),
 (ins IntRegs:$Rt32),
 "$Qd4 = vsetq2($Rt32)",
-tc_4105d6b5, TypeCVI_VP>, Enc_7222b7, Requires<[UseHVXV62]> {
+tc_5bf8afbb, TypeCVI_VP>, Enc_7222b7, Requires<[UseHVXV62]> {
 let Inst{13-2} = 0b000000010011;
 let Inst{31-21} = 0b00011001101;
 let hasNewValue = 1;
@@ -26570,7 +26662,7 @@ def V6_pred_xor : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxQR:$Qs4, HvxQR:$Qt4),
 "$Qd4 = xor($Qs4,$Qt4)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000011;
 let Inst{13-10} = 0b0000;
 let Inst{21-16} = 0b000011;
@@ -26583,7 +26675,7 @@ def V6_shuffeqh : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxQR:$Qs4, HvxQR:$Qt4),
 "$Qd4.b = vshuffe($Qs4.h,$Qt4.h)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV62]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV62]> {
 let Inst{7-2} = 0b000110;
 let Inst{13-10} = 0b0000;
 let Inst{21-16} = 0b000011;
@@ -26596,7 +26688,7 @@ def V6_shuffeqw : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxQR:$Qs4, HvxQR:$Qt4),
 "$Qd4.h = vshuffe($Qs4.w,$Qt4.w)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV62]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV62]> {
 let Inst{7-2} = 0b000111;
 let Inst{13-10} = 0b0000;
 let Inst{21-16} = 0b000011;
@@ -26746,7 +26838,7 @@ def V6_vL32Ub_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii),
 "$Vd32 = vmemu($Rt32+#$Ii)",
-tc_35e92f8e, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[UseHVXV60]> {
+tc_a7e6707d, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000000;
@@ -26763,7 +26855,7 @@ def V6_vL32Ub_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii),
 "$Vd32 = vmemu($Rx32++#$Ii)",
-tc_4fd8566e, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[UseHVXV60]> {
+tc_3c56e5ce, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001000;
@@ -26782,7 +26874,7 @@ def V6_vL32Ub_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Vd32 = vmemu($Rx32++$Mu2)",
-tc_4fd8566e, TypeCVI_VM_VP_LDU>, Enc_2ebe3b, Requires<[UseHVXV60]> {
+tc_3c56e5ce, TypeCVI_VM_VP_LDU>, Enc_2ebe3b, Requires<[UseHVXV60]> {
 let Inst{12-5} = 0b00000111;
 let Inst{31-21} = 0b00101011000;
 let hasNewValue = 1;
@@ -26799,7 +26891,7 @@ def V6_vL32b_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii),
 "$Vd32 = vmem($Rt32+#$Ii)",
-tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
+tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b000;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000000;
@@ -26819,7 +26911,7 @@ def V6_vL32b_cur_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii),
 "$Vd32.cur = vmem($Rt32+#$Ii)",
-tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
+tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b001;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000000;
@@ -26839,7 +26931,7 @@ def V6_vL32b_cur_npred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
-tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b101;
 let Inst{31-21} = 0b00101000100;
 let isPredicated = 1;
@@ -26859,7 +26951,7 @@ def V6_vL32b_cur_npred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001100;
@@ -26881,7 +26973,7 @@ def V6_vL32b_cur_npred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000101;
 let Inst{31-21} = 0b00101011100;
 let isPredicated = 1;
@@ -26902,7 +26994,7 @@ def V6_vL32b_cur_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii),
 "$Vd32.cur = vmem($Rx32++#$Ii)",
-tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
+tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b001;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001000;
@@ -26923,7 +27015,7 @@ def V6_vL32b_cur_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Vd32.cur = vmem($Rx32++$Mu2)",
-tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
+tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
 let Inst{12-5} = 0b00000001;
 let Inst{31-21} = 0b00101011000;
 let hasNewValue = 1;
@@ -26943,7 +27035,7 @@ def V6_vL32b_cur_pred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
-tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b100;
 let Inst{31-21} = 0b00101000100;
 let isPredicated = 1;
@@ -26962,7 +27054,7 @@ def V6_vL32b_cur_pred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001100;
@@ -26983,7 +27075,7 @@ def V6_vL32b_cur_pred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000100;
 let Inst{31-21} = 0b00101011100;
 let isPredicated = 1;
@@ -27003,7 +27095,7 @@ def V6_vL32b_npred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii)",
-tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b011;
 let Inst{31-21} = 0b00101000100;
 let isPredicated = 1;
@@ -27022,7 +27114,7 @@ def V6_vL32b_npred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii)",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001100;
@@ -27043,7 +27135,7 @@ def V6_vL32b_npred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2)",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000011;
 let Inst{31-21} = 0b00101011100;
 let isPredicated = 1;
@@ -27063,7 +27155,7 @@ def V6_vL32b_nt_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii),
 "$Vd32 = vmem($Rt32+#$Ii):nt",
-tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
+tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b000;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000010;
@@ -27084,7 +27176,7 @@ def V6_vL32b_nt_cur_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii),
 "$Vd32.cur = vmem($Rt32+#$Ii):nt",
-tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
+tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b001;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000010;
@@ -27105,7 +27197,7 @@ def V6_vL32b_nt_cur_npred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
-tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b101;
 let Inst{31-21} = 0b00101000110;
 let isPredicated = 1;
@@ -27126,7 +27218,7 @@ def V6_vL32b_nt_cur_npred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001110;
@@ -27149,7 +27241,7 @@ def V6_vL32b_nt_cur_npred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000101;
 let Inst{31-21} = 0b00101011110;
 let isPredicated = 1;
@@ -27171,7 +27263,7 @@ def V6_vL32b_nt_cur_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii),
 "$Vd32.cur = vmem($Rx32++#$Ii):nt",
-tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
+tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b001;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001010;
@@ -27193,7 +27285,7 @@ def V6_vL32b_nt_cur_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Vd32.cur = vmem($Rx32++$Mu2):nt",
-tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
+tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
 let Inst{12-5} = 0b00000001;
 let Inst{31-21} = 0b00101011010;
 let hasNewValue = 1;
@@ -27214,7 +27306,7 @@ def V6_vL32b_nt_cur_pred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
-tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b100;
 let Inst{31-21} = 0b00101000110;
 let isPredicated = 1;
@@ -27234,7 +27326,7 @@ def V6_vL32b_nt_cur_pred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001110;
@@ -27256,7 +27348,7 @@ def V6_vL32b_nt_cur_pred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000100;
 let Inst{31-21} = 0b00101011110;
 let isPredicated = 1;
@@ -27277,7 +27369,7 @@ def V6_vL32b_nt_npred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
-tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b011;
 let Inst{31-21} = 0b00101000110;
 let isPredicated = 1;
@@ -27297,7 +27389,7 @@ def V6_vL32b_nt_npred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001110;
@@ -27319,7 +27411,7 @@ def V6_vL32b_nt_npred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000011;
 let Inst{31-21} = 0b00101011110;
 let isPredicated = 1;
@@ -27340,7 +27432,7 @@ def V6_vL32b_nt_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii),
 "$Vd32 = vmem($Rx32++#$Ii):nt",
-tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
+tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b000;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001010;
@@ -27362,7 +27454,7 @@ def V6_vL32b_nt_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Vd32 = vmem($Rx32++$Mu2):nt",
-tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
+tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b00101011010;
 let hasNewValue = 1;
@@ -27383,7 +27475,7 @@ def V6_vL32b_nt_pred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if ($Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
-tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b010;
 let Inst{31-21} = 0b00101000110;
 let isPredicated = 1;
@@ -27402,7 +27494,7 @@ def V6_vL32b_nt_pred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if ($Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001110;
@@ -27423,7 +27515,7 @@ def V6_vL32b_nt_pred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if ($Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000010;
 let Inst{31-21} = 0b00101011110;
 let isPredicated = 1;
@@ -27443,7 +27535,7 @@ def V6_vL32b_nt_tmp_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii),
 "$Vd32.tmp = vmem($Rt32+#$Ii):nt",
-tc_77a4c701, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
+tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b010;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000010;
@@ -27463,7 +27555,7 @@ def V6_vL32b_nt_tmp_npred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
-tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_3904b926, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b111;
 let Inst{31-21} = 0b00101000110;
 let isPredicated = 1;
@@ -27483,7 +27575,7 @@ def V6_vL32b_nt_tmp_npred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
-tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001110;
@@ -27505,7 +27597,7 @@ def V6_vL32b_nt_tmp_npred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
-tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000111;
 let Inst{31-21} = 0b00101011110;
 let isPredicated = 1;
@@ -27526,7 +27618,7 @@ def V6_vL32b_nt_tmp_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii),
 "$Vd32.tmp = vmem($Rx32++#$Ii):nt",
-tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
+tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b010;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001010;
@@ -27547,7 +27639,7 @@ def V6_vL32b_nt_tmp_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Vd32.tmp = vmem($Rx32++$Mu2):nt",
-tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
+tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
 let Inst{12-5} = 0b00000010;
 let Inst{31-21} = 0b00101011010;
 let hasNewValue = 1;
@@ -27567,7 +27659,7 @@ def V6_vL32b_nt_tmp_pred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
-tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_3904b926, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b110;
 let Inst{31-21} = 0b00101000110;
 let isPredicated = 1;
@@ -27586,7 +27678,7 @@ def V6_vL32b_nt_tmp_pred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
-tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001110;
@@ -27607,7 +27699,7 @@ def V6_vL32b_nt_tmp_pred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
-tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000110;
 let Inst{31-21} = 0b00101011110;
 let isPredicated = 1;
@@ -27627,7 +27719,7 @@ def V6_vL32b_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii),
 "$Vd32 = vmem($Rx32++#$Ii)",
-tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
+tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b000;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001000;
@@ -27648,7 +27740,7 @@ def V6_vL32b_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Vd32 = vmem($Rx32++$Mu2)",
-tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
+tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b00101011000;
 let hasNewValue = 1;
@@ -27668,7 +27760,7 @@ def V6_vL32b_pred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if ($Pv4) $Vd32 = vmem($Rt32+#$Ii)",
-tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b010;
 let Inst{31-21} = 0b00101000100;
 let isPredicated = 1;
@@ -27686,7 +27778,7 @@ def V6_vL32b_pred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if ($Pv4) $Vd32 = vmem($Rx32++#$Ii)",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001100;
@@ -27706,7 +27798,7 @@ def V6_vL32b_pred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if ($Pv4) $Vd32 = vmem($Rx32++$Mu2)",
-tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000010;
 let Inst{31-21} = 0b00101011100;
 let isPredicated = 1;
@@ -27725,7 +27817,7 @@ def V6_vL32b_tmp_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii),
 "$Vd32.tmp = vmem($Rt32+#$Ii)",
-tc_77a4c701, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
+tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b010;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000000;
@@ -27744,7 +27836,7 @@ def V6_vL32b_tmp_npred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
-tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_3904b926, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b111;
 let Inst{31-21} = 0b00101000100;
 let isPredicated = 1;
@@ -27763,7 +27855,7 @@ def V6_vL32b_tmp_npred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
-tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001100;
@@ -27784,7 +27876,7 @@ def V6_vL32b_tmp_npred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
-tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000111;
 let Inst{31-21} = 0b00101011100;
 let isPredicated = 1;
@@ -27804,7 +27896,7 @@ def V6_vL32b_tmp_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii),
 "$Vd32.tmp = vmem($Rx32++#$Ii)",
-tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
+tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel {
 let Inst{7-5} = 0b010;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001000;
@@ -27824,7 +27916,7 @@ def V6_vL32b_tmp_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "$Vd32.tmp = vmem($Rx32++$Mu2)",
-tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
+tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel {
 let Inst{12-5} = 0b00000010;
 let Inst{31-21} = 0b00101011000;
 let hasNewValue = 1;
@@ -27843,7 +27935,7 @@ def V6_vL32b_tmp_pred_ai : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
 "if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
-tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
+tc_3904b926, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b110;
 let Inst{31-21} = 0b00101000100;
 let isPredicated = 1;
@@ -27861,7 +27953,7 @@ def V6_vL32b_tmp_pred_pi : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
 "if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
-tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
+tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001100;
@@ -27881,7 +27973,7 @@ def V6_vL32b_tmp_pred_ppu : HInst<
 (outs HvxVR:$Vd32, IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
 "if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
-tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
+tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel {
 let Inst{10-5} = 0b000110;
 let Inst{31-21} = 0b00101011100;
 let isPredicated = 1;
@@ -27900,7 +27992,7 @@ def V6_vS32Ub_ai : HInst<
 (outs),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "vmemu($Rt32+#$Ii) = $Vs32",
-tc_354299ad, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel {
+tc_f21e8abb, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b111;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000001;
@@ -27915,7 +28007,7 @@ def V6_vS32Ub_npred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Pv4) vmemu($Rt32+#$Ii) = $Vs32",
-tc_d642eff3, TypeCVI_VM_STU>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
+tc_131f1c81, TypeCVI_VM_STU>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b111;
 let Inst{31-21} = 0b00101000101;
 let isPredicated = 1;
@@ -27930,7 +28022,7 @@ def V6_vS32Ub_npred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Pv4) vmemu($Rx32++#$Ii) = $Vs32",
-tc_6fd9ad30, TypeCVI_VM_STU>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
+tc_c7039829, TypeCVI_VM_STU>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001101;
@@ -27947,7 +28039,7 @@ def V6_vS32Ub_npred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if (!$Pv4) vmemu($Rx32++$Mu2) = $Vs32",
-tc_6fd9ad30, TypeCVI_VM_STU>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
+tc_c7039829, TypeCVI_VM_STU>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-5} = 0b000111;
 let Inst{31-21} = 0b00101011101;
 let isPredicated = 1;
@@ -27963,7 +28055,7 @@ def V6_vS32Ub_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "vmemu($Rx32++#$Ii) = $Vs32",
-tc_7fa82b08, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel {
+tc_e2d2e9e5, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b111;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001001;
@@ -27979,7 +28071,7 @@ def V6_vS32Ub_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "vmemu($Rx32++$Mu2) = $Vs32",
-tc_7fa82b08, TypeCVI_VM_STU>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel {
+tc_e2d2e9e5, TypeCVI_VM_STU>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{12-5} = 0b00000111;
 let Inst{31-21} = 0b00101011001;
 let addrMode = PostInc;
@@ -27994,7 +28086,7 @@ def V6_vS32Ub_pred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Pv4) vmemu($Rt32+#$Ii) = $Vs32",
-tc_d642eff3, TypeCVI_VM_STU>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
+tc_131f1c81, TypeCVI_VM_STU>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b110;
 let Inst{31-21} = 0b00101000101;
 let isPredicated = 1;
@@ -28008,7 +28100,7 @@ def V6_vS32Ub_pred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Pv4) vmemu($Rx32++#$Ii) = $Vs32",
-tc_6fd9ad30, TypeCVI_VM_STU>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
+tc_c7039829, TypeCVI_VM_STU>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001101;
@@ -28024,7 +28116,7 @@ def V6_vS32Ub_pred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if ($Pv4) vmemu($Rx32++$Mu2) = $Vs32",
-tc_6fd9ad30, TypeCVI_VM_STU>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
+tc_c7039829, TypeCVI_VM_STU>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-5} = 0b000110;
 let Inst{31-21} = 0b00101011101;
 let isPredicated = 1;
@@ -28039,7 +28131,7 @@ def V6_vS32b_ai : HInst<
 (outs),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "vmem($Rt32+#$Ii) = $Vs32",
-tc_e3748cdf, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel {
+tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b000;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000001;
@@ -28055,7 +28147,7 @@ def V6_vS32b_new_ai : HInst<
 (outs),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8),
 "vmem($Rt32+#$Ii) = $Os8.new",
-tc_1b93bdc6, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel {
+tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b00100;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000001;
@@ -28074,7 +28166,7 @@ def V6_vS32b_new_npred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8),
 "if (!$Pv4) vmem($Rt32+#$Ii) = $Os8.new",
-tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel {
+tc_7177e272, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b01101;
 let Inst{31-21} = 0b00101000101;
 let isPredicated = 1;
@@ -28093,7 +28185,7 @@ def V6_vS32b_new_npred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8),
 "if (!$Pv4) vmem($Rx32++#$Ii) = $Os8.new",
-tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel {
+tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b01101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001101;
@@ -28114,7 +28206,7 @@ def V6_vS32b_new_npred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8),
 "if (!$Pv4) vmem($Rx32++$Mu2) = $Os8.new",
-tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel {
+tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-3} = 0b00001101;
 let Inst{31-21} = 0b00101011101;
 let isPredicated = 1;
@@ -28134,7 +28226,7 @@ def V6_vS32b_new_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8),
 "vmem($Rx32++#$Ii) = $Os8.new",
-tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel {
+tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b00100;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001001;
@@ -28154,7 +28246,7 @@ def V6_vS32b_new_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8),
 "vmem($Rx32++$Mu2) = $Os8.new",
-tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[UseHVXV60]>, NewValueRel {
+tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{12-3} = 0b0000000100;
 let Inst{31-21} = 0b00101011001;
 let addrMode = PostInc;
@@ -28173,7 +28265,7 @@ def V6_vS32b_new_pred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8),
 "if ($Pv4) vmem($Rt32+#$Ii) = $Os8.new",
-tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel {
+tc_7177e272, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b01000;
 let Inst{31-21} = 0b00101000101;
 let isPredicated = 1;
@@ -28191,7 +28283,7 @@ def V6_vS32b_new_pred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8),
 "if ($Pv4) vmem($Rx32++#$Ii) = $Os8.new",
-tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel {
+tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b01000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001101;
@@ -28211,7 +28303,7 @@ def V6_vS32b_new_pred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8),
 "if ($Pv4) vmem($Rx32++$Mu2) = $Os8.new",
-tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel {
+tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-3} = 0b00001000;
 let Inst{31-21} = 0b00101011101;
 let isPredicated = 1;
@@ -28230,7 +28322,7 @@ def V6_vS32b_npred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Pv4) vmem($Rt32+#$Ii) = $Vs32",
-tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
+tc_a02a10a8, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b00101000101;
 let isPredicated = 1;
@@ -28246,7 +28338,7 @@ def V6_vS32b_npred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Pv4) vmem($Rx32++#$Ii) = $Vs32",
-tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
+tc_54a0dc47, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001101;
@@ -28264,7 +28356,7 @@ def V6_vS32b_npred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if (!$Pv4) vmem($Rx32++$Mu2) = $Vs32",
-tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
+tc_54a0dc47, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-5} = 0b000001;
 let Inst{31-21} = 0b00101011101;
 let isPredicated = 1;
@@ -28281,7 +28373,7 @@ def V6_vS32b_nqpred_ai : HInst<
 (outs),
 (ins HvxQR:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Qv4) vmem($Rt32+#$Ii) = $Vs32",
-tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> {
+tc_447d9895, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b00101000100;
 let addrMode = BaseImmOffset;
@@ -28293,7 +28385,7 @@ def V6_vS32b_nqpred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins HvxQR:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Qv4) vmem($Rx32++#$Ii) = $Vs32",
-tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> {
+tc_191381c1, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001100;
@@ -28307,7 +28399,7 @@ def V6_vS32b_nqpred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins HvxQR:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if (!$Qv4) vmem($Rx32++$Mu2) = $Vs32",
-tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> {
+tc_191381c1, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> {
 let Inst{10-5} = 0b000001;
 let Inst{31-21} = 0b00101011100;
 let addrMode = PostInc;
@@ -28320,7 +28412,7 @@ def V6_vS32b_nt_ai : HInst<
 (outs),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "vmem($Rt32+#$Ii):nt = $Vs32",
-tc_e3748cdf, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel {
+tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b000;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000011;
@@ -28337,7 +28429,7 @@ def V6_vS32b_nt_new_ai : HInst<
 (outs),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8),
 "vmem($Rt32+#$Ii):nt = $Os8.new",
-tc_1b93bdc6, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel {
+tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b00100;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000011;
@@ -28357,7 +28449,7 @@ def V6_vS32b_nt_new_npred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8),
 "if (!$Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
-tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel {
+tc_7177e272, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b01111;
 let Inst{31-21} = 0b00101000111;
 let isPredicated = 1;
@@ -28377,7 +28469,7 @@ def V6_vS32b_nt_new_npred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8),
 "if (!$Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
-tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel {
+tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b01111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001111;
@@ -28399,7 +28491,7 @@ def V6_vS32b_nt_new_npred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8),
 "if (!$Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
-tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel {
+tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-3} = 0b00001111;
 let Inst{31-21} = 0b00101011111;
 let isPredicated = 1;
@@ -28420,7 +28512,7 @@ def V6_vS32b_nt_new_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8),
 "vmem($Rx32++#$Ii):nt = $Os8.new",
-tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel {
+tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b00100;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001011;
@@ -28441,7 +28533,7 @@ def V6_vS32b_nt_new_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8),
 "vmem($Rx32++$Mu2):nt = $Os8.new",
-tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[UseHVXV60]>, NewValueRel {
+tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{12-3} = 0b0000000100;
 let Inst{31-21} = 0b00101011011;
 let addrMode = PostInc;
@@ -28461,7 +28553,7 @@ def V6_vS32b_nt_new_pred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8),
 "if ($Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
-tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel {
+tc_7177e272, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b01010;
 let Inst{31-21} = 0b00101000111;
 let isPredicated = 1;
@@ -28480,7 +28572,7 @@ def V6_vS32b_nt_new_pred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8),
 "if ($Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
-tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel {
+tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-3} = 0b01010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001111;
@@ -28501,7 +28593,7 @@ def V6_vS32b_nt_new_pred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8),
 "if ($Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
-tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel {
+tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-3} = 0b00001010;
 let Inst{31-21} = 0b00101011111;
 let isPredicated = 1;
@@ -28521,7 +28613,7 @@ def V6_vS32b_nt_npred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
-tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
+tc_a02a10a8, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b00101000111;
 let isPredicated = 1;
@@ -28538,7 +28630,7 @@ def V6_vS32b_nt_npred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
-tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
+tc_54a0dc47, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001111;
@@ -28557,7 +28649,7 @@ def V6_vS32b_nt_npred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if (!$Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
-tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
+tc_54a0dc47, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-5} = 0b000001;
 let Inst{31-21} = 0b00101011111;
 let isPredicated = 1;
@@ -28575,7 +28667,7 @@ def V6_vS32b_nt_nqpred_ai : HInst<
 (outs),
 (ins HvxQR:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
-tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> {
+tc_447d9895, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b00101000110;
 let addrMode = BaseImmOffset;
@@ -28588,7 +28680,7 @@ def V6_vS32b_nt_nqpred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins HvxQR:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if (!$Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
-tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> {
+tc_191381c1, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001110;
@@ -28603,7 +28695,7 @@ def V6_vS32b_nt_nqpred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins HvxQR:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if (!$Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
-tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> {
+tc_191381c1, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> {
 let Inst{10-5} = 0b000001;
 let Inst{31-21} = 0b00101011110;
 let addrMode = PostInc;
@@ -28617,7 +28709,7 @@ def V6_vS32b_nt_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "vmem($Rx32++#$Ii):nt = $Vs32",
-tc_a4c9df3b, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel {
+tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b000;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001011;
@@ -28635,7 +28727,7 @@ def V6_vS32b_nt_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "vmem($Rx32++$Mu2):nt = $Vs32",
-tc_a4c9df3b, TypeCVI_VM_ST>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel {
+tc_3e2aaafc, TypeCVI_VM_ST>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b00101011011;
 let addrMode = PostInc;
@@ -28652,7 +28744,7 @@ def V6_vS32b_nt_pred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
-tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
+tc_a02a10a8, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b00101000111;
 let isPredicated = 1;
@@ -28668,7 +28760,7 @@ def V6_vS32b_nt_pred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
-tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
+tc_54a0dc47, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001111;
@@ -28686,7 +28778,7 @@ def V6_vS32b_nt_pred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if ($Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
-tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
+tc_54a0dc47, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-5} = 0b000000;
 let Inst{31-21} = 0b00101011111;
 let isPredicated = 1;
@@ -28703,7 +28795,7 @@ def V6_vS32b_nt_qpred_ai : HInst<
 (outs),
 (ins HvxQR:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
-tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> {
+tc_447d9895, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b00101000110;
 let addrMode = BaseImmOffset;
@@ -28716,7 +28808,7 @@ def V6_vS32b_nt_qpred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins HvxQR:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
-tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> {
+tc_191381c1, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001110;
@@ -28731,7 +28823,7 @@ def V6_vS32b_nt_qpred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins HvxQR:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if ($Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
-tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> {
+tc_191381c1, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> {
 let Inst{10-5} = 0b000000;
 let Inst{31-21} = 0b00101011110;
 let addrMode = PostInc;
@@ -28745,7 +28837,7 @@ def V6_vS32b_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "vmem($Rx32++#$Ii) = $Vs32",
-tc_a4c9df3b, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel {
+tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b000;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001001;
@@ -28762,7 +28854,7 @@ def V6_vS32b_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "vmem($Rx32++$Mu2) = $Vs32",
-tc_a4c9df3b, TypeCVI_VM_ST>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel {
+tc_3e2aaafc, TypeCVI_VM_ST>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b00101011001;
 let addrMode = PostInc;
@@ -28777,7 +28869,7 @@ def V6_vS32b_pred_ai : HInst<
 (outs),
 (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Pv4) vmem($Rt32+#$Ii) = $Vs32",
-tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
+tc_a02a10a8, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b00101000101;
 let isPredicated = 1;
@@ -28792,7 +28884,7 @@ def V6_vS32b_pred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Pv4) vmem($Rx32++#$Ii) = $Vs32",
-tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
+tc_54a0dc47, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001101;
@@ -28809,7 +28901,7 @@ def V6_vS32b_pred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if ($Pv4) vmem($Rx32++$Mu2) = $Vs32",
-tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
+tc_54a0dc47, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel {
 let Inst{10-5} = 0b000000;
 let Inst{31-21} = 0b00101011101;
 let isPredicated = 1;
@@ -28825,7 +28917,7 @@ def V6_vS32b_qpred_ai : HInst<
 (outs),
 (ins HvxQR:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Qv4) vmem($Rt32+#$Ii) = $Vs32",
-tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> {
+tc_447d9895, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b00101000100;
 let addrMode = BaseImmOffset;
@@ -28837,7 +28929,7 @@ def V6_vS32b_qpred_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins HvxQR:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32),
 "if ($Qv4) vmem($Rx32++#$Ii) = $Vs32",
-tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> {
+tc_191381c1, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00101001100;
@@ -28851,7 +28943,7 @@ def V6_vS32b_qpred_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins HvxQR:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32),
 "if ($Qv4) vmem($Rx32++$Mu2) = $Vs32",
-tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> {
+tc_191381c1, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> {
 let Inst{10-5} = 0b000000;
 let Inst{31-21} = 0b00101011100;
 let addrMode = PostInc;
@@ -28864,7 +28956,7 @@ def V6_vS32b_srls_ai : HInst<
 (outs),
 (ins IntRegs:$Rt32, s4_0Imm:$Ii),
 "vmem($Rt32+#$Ii):scatter_release",
-tc_29841470, TypeCVI_SCATTER_NEW_RST>, Enc_ff3442, Requires<[UseHVXV65]> {
+tc_3ce09744, TypeCVI_SCATTER_NEW_RST>, Enc_ff3442, Requires<[UseHVXV65]> {
 let Inst{7-0} = 0b00101000;
 let Inst{12-11} = 0b00;
 let Inst{31-21} = 0b00101000001;
@@ -28878,7 +28970,7 @@ def V6_vS32b_srls_pi : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, s3_0Imm:$Ii),
 "vmem($Rx32++#$Ii):scatter_release",
-tc_5c03dc63, TypeCVI_SCATTER_NEW_RST>, Enc_6c9ee0, Requires<[UseHVXV65]> {
+tc_20a4bbec, TypeCVI_SCATTER_NEW_RST>, Enc_6c9ee0, Requires<[UseHVXV65]> {
 let Inst{7-0} = 0b00101000;
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b00101001001;
@@ -28893,7 +28985,7 @@ def V6_vS32b_srls_ppu : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, ModRegs:$Mu2),
 "vmem($Rx32++$Mu2):scatter_release",
-tc_5c03dc63, TypeCVI_SCATTER_NEW_RST>, Enc_44661f, Requires<[UseHVXV65]> {
+tc_20a4bbec, TypeCVI_SCATTER_NEW_RST>, Enc_44661f, Requires<[UseHVXV65]> {
 let Inst{12-0} = 0b0000000101000;
 let Inst{31-21} = 0b00101011001;
 let addrMode = PostInc;
@@ -28907,7 +28999,7 @@ def V6_vabsb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.b = vabs($Vu32.b)",
-tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV65]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000001;
@@ -28930,7 +29022,7 @@ def V6_vabsb_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.b = vabs($Vu32.b):sat",
-tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV65]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000001;
@@ -28953,7 +29045,7 @@ def V6_vabsdiffh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vabsdiff($Vu32.h,$Vv32.h)",
-tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100110;
@@ -28976,7 +29068,7 @@ def V6_vabsdiffub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vabsdiff($Vu32.ub,$Vv32.ub)",
-tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100110;
@@ -28999,7 +29091,7 @@ def V6_vabsdiffuh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vabsdiff($Vu32.uh,$Vv32.uh)",
-tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100110;
@@ -29022,7 +29114,7 @@ def V6_vabsdiffw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uw = vabsdiff($Vu32.w,$Vv32.w)",
-tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100110;
@@ -29045,7 +29137,7 @@ def V6_vabsh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.h = vabs($Vu32.h)",
-tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000000;
@@ -29068,7 +29160,7 @@ def V6_vabsh_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.h = vabs($Vu32.h):sat",
-tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000000;
@@ -29091,7 +29183,7 @@ def V6_vabsub_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.ub = vabs($Vu32.b)",
-tc_71337255, TypeMAPPING>, Requires<[UseHVXV65]> {
+tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -29102,7 +29194,7 @@ def V6_vabsuh_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.uh = vabs($Vu32.h)",
-tc_71337255, TypeMAPPING>, Requires<[UseHVXV65]> {
+tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -29113,7 +29205,7 @@ def V6_vabsuw_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.uw = vabs($Vu32.w)",
-tc_71337255, TypeMAPPING>, Requires<[UseHVXV65]> {
+tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -29124,7 +29216,7 @@ def V6_vabsw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.w = vabs($Vu32.w)",
-tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000000;
@@ -29147,7 +29239,7 @@ def V6_vabsw_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.w = vabs($Vu32.w):sat",
-tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000000;
@@ -29170,7 +29262,7 @@ def V6_vaddb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vadd($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111101;
@@ -29193,7 +29285,7 @@ def V6_vaddb_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.b = vadd($Vuu32.b,$Vvv32.b)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100011;
@@ -29216,7 +29308,7 @@ def V6_vaddbnq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if (!$Qv4) $Vx32.b += $Vu32.b",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000001;
@@ -29244,7 +29336,7 @@ def V6_vaddbq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if ($Qv4) $Vx32.b += $Vu32.b",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000001;
@@ -29272,7 +29364,7 @@ def V6_vaddbsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vadd($Vu32.b,$Vv32.b):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111000;
@@ -29295,7 +29387,7 @@ def V6_vaddbsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.b = vadd($Vuu32.b,$Vvv32.b):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110101;
@@ -29318,7 +29410,7 @@ def V6_vaddcarry : HInst<
 (outs HvxVR:$Vd32, HvxQR:$Qx4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, HvxQR:$Qx4in),
 "$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qx4):carry",
-tc_5a9fc4ec, TypeCVI_VA>, Enc_b43b67, Requires<[UseHVXV62]> {
+tc_7e6a3e89, TypeCVI_VA>, Enc_b43b67, Requires<[UseHVXV62]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100101;
@@ -29327,11 +29419,37 @@ let opNewValue = 0;
 let DecoderNamespace = "EXT_mmvec";
 let Constraints = "$Qx4 = $Qx4in";
 }
+def V6_vaddcarryo : HInst<
+(outs HvxVR:$Vd32, HvxQR:$Qe4),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32.w,$Qe4 = vadd($Vu32.w,$Vv32.w):carry",
+tc_e35c1e93, TypeCOPROC_VX>, Enc_c1d806, Requires<[UseHVXV66]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddcarrysat : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32, HvxQR:$Qs4),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qs4):carry:sat",
+tc_257f6f7c, TypeCVI_VA>, Enc_e0820b, Requires<[UseHVXV66]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vaddclbh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vadd(vclb($Vu32.h),$Vv32.h)",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011111000;
@@ -29343,7 +29461,7 @@ def V6_vaddclbw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vadd(vclb($Vu32.w),$Vv32.w)",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011111000;
@@ -29355,7 +29473,7 @@ def V6_vaddh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vadd($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111101;
@@ -29378,7 +29496,7 @@ def V6_vaddh_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.h = vadd($Vuu32.h,$Vvv32.h)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100011;
@@ -29401,7 +29519,7 @@ def V6_vaddhnq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if (!$Qv4) $Vx32.h += $Vu32.h",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000001;
@@ -29429,7 +29547,7 @@ def V6_vaddhq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if ($Qv4) $Vx32.h += $Vu32.h",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000001;
@@ -29457,7 +29575,7 @@ def V6_vaddhsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vadd($Vu32.h,$Vv32.h):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100010;
@@ -29480,7 +29598,7 @@ def V6_vaddhsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.h = vadd($Vuu32.h,$Vvv32.h):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100100;
@@ -29503,7 +29621,7 @@ def V6_vaddhw : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.w = vadd($Vu32.h,$Vv32.h)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100101;
@@ -29515,7 +29633,7 @@ def V6_vaddhw_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.w += vadd($Vu32.h,$Vv32.h)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100001;
@@ -29553,7 +29671,7 @@ def V6_vaddubh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.h = vadd($Vu32.ub,$Vv32.ub)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100101;
@@ -29565,7 +29683,7 @@ def V6_vaddubh_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.h += vadd($Vu32.ub,$Vv32.ub)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100010;
@@ -29603,7 +29721,7 @@ def V6_vaddubsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vadd($Vu32.ub,$Vv32.ub):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100010;
@@ -29626,7 +29744,7 @@ def V6_vaddubsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.ub = vadd($Vuu32.ub,$Vvv32.ub):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100011;
@@ -29649,7 +29767,7 @@ def V6_vaddububb_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vadd($Vu32.ub,$Vv32.b):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110101;
@@ -29661,7 +29779,7 @@ def V6_vadduhsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vadd($Vu32.uh,$Vv32.uh):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100010;
@@ -29684,7 +29802,7 @@ def V6_vadduhsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.uh = vadd($Vuu32.uh,$Vvv32.uh):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100100;
@@ -29707,7 +29825,7 @@ def V6_vadduhw : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.w = vadd($Vu32.uh,$Vv32.uh)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100101;
@@ -29719,7 +29837,7 @@ def V6_vadduhw_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.w += vadd($Vu32.uh,$Vv32.uh)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100010;
@@ -29757,7 +29875,7 @@ def V6_vadduwsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uw = vadd($Vu32.uw,$Vv32.uw):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111011;
@@ -29780,7 +29898,7 @@ def V6_vadduwsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.uw = vadd($Vuu32.uw,$Vvv32.uw):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110101;
@@ -29803,7 +29921,7 @@ def V6_vaddw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vadd($Vu32.w,$Vv32.w)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100010;
@@ -29826,7 +29944,7 @@ def V6_vaddw_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.w = vadd($Vuu32.w,$Vvv32.w)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100011;
@@ -29849,7 +29967,7 @@ def V6_vaddwnq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if (!$Qv4) $Vx32.w += $Vu32.w",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000001;
@@ -29877,7 +29995,7 @@ def V6_vaddwq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if ($Qv4) $Vx32.w += $Vu32.w",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000001;
@@ -29905,7 +30023,7 @@ def V6_vaddwsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vadd($Vu32.w,$Vv32.w):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100010;
@@ -29928,7 +30046,7 @@ def V6_vaddwsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.w = vadd($Vuu32.w,$Vvv32.w):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100100;
@@ -29951,7 +30069,7 @@ def V6_valignb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = valign($Vu32,$Vv32,$Rt8)",
-tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_56e64202, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011011;
@@ -29963,7 +30081,7 @@ def V6_valignbi : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii),
 "$Vd32 = valign($Vu32,$Vv32,#$Ii)",
-tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV60]> {
+tc_56e64202, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV60]> {
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011110001;
 let hasNewValue = 1;
@@ -29974,7 +30092,7 @@ def V6_vand : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32 = vand($Vu32,$Vv32)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100001;
@@ -29986,7 +30104,7 @@ def V6_vandnqrt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxQR:$Qu4, IntRegs:$Rt32),
 "$Vd32 = vand(!$Qu4,$Rt32)",
-tc_e231aa4f, TypeCVI_VX>, Enc_7b7ba8, Requires<[UseHVXV62]> {
+tc_ac4046bc, TypeCVI_VX>, Enc_7b7ba8, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b101;
 let Inst{13-10} = 0b0001;
 let Inst{31-21} = 0b00011001101;
@@ -29998,7 +30116,7 @@ def V6_vandnqrt_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxQR:$Qu4, IntRegs:$Rt32),
 "$Vx32 |= vand(!$Qu4,$Rt32)",
-tc_9311da3f, TypeCVI_VX>, Enc_895bd9, Requires<[UseHVXV62]> {
+tc_2e8f5f6e, TypeCVI_VX>, Enc_895bd9, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b011;
 let Inst{13-10} = 0b1001;
 let Inst{31-21} = 0b00011001011;
@@ -30036,7 +30154,7 @@ def V6_vandqrt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxQR:$Qu4, IntRegs:$Rt32),
 "$Vd32 = vand($Qu4,$Rt32)",
-tc_e231aa4f, TypeCVI_VX_LATE>, Enc_7b7ba8, Requires<[UseHVXV60]> {
+tc_ac4046bc, TypeCVI_VX_LATE>, Enc_7b7ba8, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-10} = 0b0000;
 let Inst{31-21} = 0b00011001101;
@@ -30048,7 +30166,7 @@ def V6_vandqrt_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxQR:$Qu4, IntRegs:$Rt32),
 "$Vx32 |= vand($Qu4,$Rt32)",
-tc_9311da3f, TypeCVI_VX_LATE>, Enc_895bd9, Requires<[UseHVXV60]> {
+tc_2e8f5f6e, TypeCVI_VX_LATE>, Enc_895bd9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-10} = 0b1000;
 let Inst{31-21} = 0b00011001011;
@@ -30086,7 +30204,7 @@ def V6_vandvnqv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxQR:$Qv4, HvxVR:$Vu32),
 "$Vd32 = vand(!$Qv4,$Vu32)",
-tc_bbaf280e, TypeCVI_VA>, Enc_c4dc92, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_c4dc92, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000011;
@@ -30099,7 +30217,7 @@ def V6_vandvqv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxQR:$Qv4, HvxVR:$Vu32),
 "$Vd32 = vand($Qv4,$Vu32)",
-tc_bbaf280e, TypeCVI_VA>, Enc_c4dc92, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_c4dc92, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000011;
@@ -30112,7 +30230,7 @@ def V6_vandvrt : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Qd4 = vand($Vu32,$Rt32)",
-tc_e231aa4f, TypeCVI_VX_LATE>, Enc_0f8bab, Requires<[UseHVXV60]> {
+tc_ac4046bc, TypeCVI_VX_LATE>, Enc_0f8bab, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b010010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001101;
@@ -30124,7 +30242,7 @@ def V6_vandvrt_acc : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Qx4 |= vand($Vu32,$Rt32)",
-tc_9311da3f, TypeCVI_VX_LATE>, Enc_adf111, Requires<[UseHVXV60]> {
+tc_2e8f5f6e, TypeCVI_VX_LATE>, Enc_adf111, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001011;
@@ -30158,7 +30276,7 @@ def V6_vaslh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.h = vasl($Vu32.h,$Rt32)",
-tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001100;
@@ -30170,7 +30288,7 @@ def V6_vaslh_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.h += vasl($Vu32.h,$Rt32)",
-tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV65]> {
+tc_309dbb4f, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001101;
@@ -30208,7 +30326,7 @@ def V6_vaslhv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vasl($Vu32.h,$Vv32.h)",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111101;
@@ -30231,7 +30349,7 @@ def V6_vaslw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vasl($Vu32.w,$Rt32)",
-tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001011;
@@ -30243,7 +30361,7 @@ def V6_vaslw_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vasl($Vu32.w,$Rt32)",
-tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_309dbb4f, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001011;
@@ -30281,7 +30399,7 @@ def V6_vaslwv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vasl($Vu32.w,$Vv32.w)",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111101;
@@ -30300,11 +30418,36 @@ let isPseudo = 1;
 let isCodeGenOnly = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vasr_into : HInst<
+(outs HvxWR:$Vxx32),
+(ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vxx32.w = vasrinto($Vu32.w,$Vv32.w)",
+tc_df80eeb0, TypeCVI_VP_VS>, Enc_3fc427, Requires<[UseHVXV66]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011010101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vasr_into_alt : HInst<
+(outs HvxWR:$Vxx32),
+(ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vxx32 = vasrinto($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
 def V6_vasrh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.h = vasr($Vu32.h,$Rt32)",
-tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001011;
@@ -30316,7 +30459,7 @@ def V6_vasrh_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.h += vasr($Vu32.h,$Rt32)",
-tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV65]> {
+tc_309dbb4f, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001100;
@@ -30354,7 +30497,7 @@ def V6_vasrhbrndsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011011;
@@ -30366,7 +30509,7 @@ def V6_vasrhbrndsat_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
+tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -30376,7 +30519,7 @@ def V6_vasrhbsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011000;
@@ -30388,7 +30531,7 @@ def V6_vasrhubrndsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011011;
@@ -30400,7 +30543,7 @@ def V6_vasrhubrndsat_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
+tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -30410,7 +30553,7 @@ def V6_vasrhubsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011011;
@@ -30422,7 +30565,7 @@ def V6_vasrhubsat_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
+tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -30432,7 +30575,7 @@ def V6_vasrhv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vasr($Vu32.h,$Vv32.h)",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111101;
@@ -30455,7 +30598,7 @@ def V6_vasruhubrndsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.ub = vasr($Vu32.uh,$Vv32.uh,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011000;
@@ -30467,7 +30610,7 @@ def V6_vasruhubsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.ub = vasr($Vu32.uh,$Vv32.uh,$Rt8):sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011000;
@@ -30479,7 +30622,7 @@ def V6_vasruwuhrndsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011000;
@@ -30491,7 +30634,7 @@ def V6_vasruwuhsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011000;
@@ -30503,7 +30646,7 @@ def V6_vasrw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vasr($Vu32.w,$Rt32)",
-tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001011;
@@ -30515,7 +30658,7 @@ def V6_vasrw_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vasr($Vu32.w,$Rt32)",
-tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_309dbb4f, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001011;
@@ -30553,7 +30696,7 @@ def V6_vasrwh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8)",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011011;
@@ -30565,7 +30708,7 @@ def V6_vasrwh_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
+tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -30575,7 +30718,7 @@ def V6_vasrwhrndsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011011;
@@ -30587,7 +30730,7 @@ def V6_vasrwhrndsat_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
+tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -30597,7 +30740,7 @@ def V6_vasrwhsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011011;
@@ -30609,7 +30752,7 @@ def V6_vasrwhsat_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
+tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -30619,7 +30762,7 @@ def V6_vasrwuhrndsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011000;
@@ -30631,7 +30774,7 @@ def V6_vasrwuhsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
-tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011011;
@@ -30643,7 +30786,7 @@ def V6_vasrwuhsat_alt : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat",
-tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> {
+tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -30653,7 +30796,7 @@ def V6_vasrwv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vasr($Vu32.w,$Vv32.w)",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111101;
@@ -30676,7 +30819,7 @@ def V6_vassign : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32 = $Vu32",
-tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-16} = 0b0001111000000011;
@@ -30698,7 +30841,7 @@ def V6_vavgb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vavg($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011111000;
@@ -30721,7 +30864,7 @@ def V6_vavgbrnd : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vavg($Vu32.b,$Vv32.b):rnd",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011111000;
@@ -30744,7 +30887,7 @@ def V6_vavgh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vavg($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100110;
@@ -30767,7 +30910,7 @@ def V6_vavghrnd : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vavg($Vu32.h,$Vv32.h):rnd",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100111;
@@ -30790,7 +30933,7 @@ def V6_vavgub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vavg($Vu32.ub,$Vv32.ub)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100110;
@@ -30813,7 +30956,7 @@ def V6_vavgubrnd : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vavg($Vu32.ub,$Vv32.ub):rnd",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100111;
@@ -30836,7 +30979,7 @@ def V6_vavguh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vavg($Vu32.uh,$Vv32.uh)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100110;
@@ -30859,7 +31002,7 @@ def V6_vavguhrnd : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vavg($Vu32.uh,$Vv32.uh):rnd",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100111;
@@ -30882,7 +31025,7 @@ def V6_vavguw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uw = vavg($Vu32.uw,$Vv32.uw)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011111000;
@@ -30905,7 +31048,7 @@ def V6_vavguwrnd : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uw = vavg($Vu32.uw,$Vv32.uw):rnd",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011111000;
@@ -30928,7 +31071,7 @@ def V6_vavgw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vavg($Vu32.w,$Vv32.w)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100110;
@@ -30951,7 +31094,7 @@ def V6_vavgwrnd : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vavg($Vu32.w,$Vv32.w):rnd",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100111;
@@ -30974,7 +31117,7 @@ def V6_vccombine : HInst<
 (outs HvxWR:$Vdd32),
 (ins PredRegs:$Ps4, HvxVR:$Vu32, HvxVR:$Vv32),
 "if ($Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
-tc_2171ebae, TypeCVI_VA_DV>, Enc_8c2412, Requires<[UseHVXV60]> {
+tc_af25efd9, TypeCVI_VA_DV>, Enc_8c2412, Requires<[UseHVXV60]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011010011;
@@ -30987,7 +31130,7 @@ def V6_vcl0h : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.uh = vcl0($Vu32.uh)",
-tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000010;
@@ -31010,7 +31153,7 @@ def V6_vcl0w : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.uw = vcl0($Vu32.uw)",
-tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000010;
@@ -31033,7 +31176,7 @@ def V6_vcmov : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Ps4, HvxVR:$Vu32),
 "if ($Ps4) $Vd32 = $Vu32",
-tc_b06ab583, TypeCVI_VA>, Enc_770858, Requires<[UseHVXV60]> {
+tc_3aacf4a8, TypeCVI_VA>, Enc_770858, Requires<[UseHVXV60]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001101000000000;
@@ -31046,7 +31189,7 @@ def V6_vcombine : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32 = vcombine($Vu32,$Vv32)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111010;
@@ -31070,7 +31213,7 @@ def V6_vdd0 : HInst<
 (outs HvxWR:$Vdd32),
 (ins),
 "$Vdd32 = #0",
-tc_8a6eb39a, TypeMAPPING>, Requires<[UseHVXV65]> {
+tc_718b5c53, TypeMAPPING>, Requires<[UseHVXV65]> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -31081,7 +31224,7 @@ def V6_vdeal : HInst<
 (outs HvxVR:$Vy32, HvxVR:$Vx32),
 (ins HvxVR:$Vy32in, HvxVR:$Vx32in, IntRegs:$Rt32),
 "vdeal($Vy32,$Vx32,$Rt32)",
-tc_5c120602, TypeCVI_VP_VS>, Enc_989021, Requires<[UseHVXV60]> {
+tc_561aaa58, TypeCVI_VP_VS>, Enc_989021, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001111;
@@ -31096,7 +31239,7 @@ def V6_vdealb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.b = vdeal($Vu32.b)",
-tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_946013d8, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000000;
@@ -31108,7 +31251,7 @@ def V6_vdealb4w : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vdeale($Vu32.b,$Vv32.b)",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111001;
@@ -31142,7 +31285,7 @@ def V6_vdealh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.h = vdeal($Vu32.h)",
-tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_946013d8, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000000;
@@ -31165,7 +31308,7 @@ def V6_vdealvdd : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vdd32 = vdeal($Vu32,$Vv32,$Rt8)",
-tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> {
+tc_87adc037, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011011;
@@ -31177,7 +31320,7 @@ def V6_vdelta : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32 = vdelta($Vu32,$Vv32)",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111001;
@@ -31189,7 +31332,7 @@ def V6_vdmpybus : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.h = vdmpy($Vu32.ub,$Rt32.b)",
-tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001000;
@@ -31201,7 +31344,7 @@ def V6_vdmpybus_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.h += vdmpy($Vu32.ub,$Rt32.b)",
-tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001000;
@@ -31239,7 +31382,7 @@ def V6_vdmpybus_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.h = vdmpy($Vuu32.ub,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001000;
@@ -31251,7 +31394,7 @@ def V6_vdmpybus_dv_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.h += vdmpy($Vuu32.ub,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001000;
@@ -31289,7 +31432,7 @@ def V6_vdmpyhb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vdmpy($Vu32.h,$Rt32.b)",
-tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001000;
@@ -31301,7 +31444,7 @@ def V6_vdmpyhb_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vdmpy($Vu32.h,$Rt32.b)",
-tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001000;
@@ -31339,7 +31482,7 @@ def V6_vdmpyhb_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.w = vdmpy($Vuu32.h,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001001;
@@ -31351,7 +31494,7 @@ def V6_vdmpyhb_dv_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.w += vdmpy($Vuu32.h,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001001;
@@ -31389,7 +31532,7 @@ def V6_vdmpyhisat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vd32.w = vdmpy($Vuu32.h,$Rt32.h):sat",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001001;
@@ -31401,7 +31544,7 @@ def V6_vdmpyhisat_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vx32.w += vdmpy($Vuu32.h,$Rt32.h):sat",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_cc857d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_cc857d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001001;
@@ -31439,7 +31582,7 @@ def V6_vdmpyhsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001001;
@@ -31451,7 +31594,7 @@ def V6_vdmpyhsat_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001001;
@@ -31489,7 +31632,7 @@ def V6_vdmpyhsuisat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vd32.w = vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001001;
@@ -31501,7 +31644,7 @@ def V6_vdmpyhsuisat_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vx32.w += vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_cc857d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_cc857d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001001;
@@ -31539,7 +31682,7 @@ def V6_vdmpyhsusat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001001;
@@ -31551,7 +31694,7 @@ def V6_vdmpyhsusat_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001001;
@@ -31589,7 +31732,7 @@ def V6_vdmpyhvsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100000;
@@ -31601,7 +31744,7 @@ def V6_vdmpyhvsat_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.w += vdmpy($Vu32.h,$Vv32.h):sat",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100000;
@@ -31639,7 +31782,7 @@ def V6_vdsaduh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.uw = vdsad($Vuu32.uh,$Rt32.uh)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001000;
@@ -31651,7 +31794,7 @@ def V6_vdsaduh_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.uw += vdsad($Vuu32.uh,$Rt32.uh)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001011;
@@ -31689,7 +31832,7 @@ def V6_veqb : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.eq($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -31701,7 +31844,7 @@ def V6_veqb_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.eq($Vu32.b,$Vv32.b)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31712,7 +31855,7 @@ def V6_veqb_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.eq($Vu32.b,$Vv32.b)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b010000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31724,7 +31867,7 @@ def V6_veqb_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.eq($Vu32.b,$Vv32.b)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31735,7 +31878,7 @@ def V6_veqh : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.eq($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -31747,7 +31890,7 @@ def V6_veqh_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.eq($Vu32.h,$Vv32.h)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31758,7 +31901,7 @@ def V6_veqh_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.eq($Vu32.h,$Vv32.h)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b010001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31770,7 +31913,7 @@ def V6_veqh_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.eq($Vu32.h,$Vv32.h)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b100001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31781,7 +31924,7 @@ def V6_veqw : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.eq($Vu32.w,$Vv32.w)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -31793,7 +31936,7 @@ def V6_veqw_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.eq($Vu32.w,$Vv32.w)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31804,7 +31947,7 @@ def V6_veqw_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.eq($Vu32.w,$Vv32.w)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b010010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31816,7 +31959,7 @@ def V6_veqw_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.eq($Vu32.w,$Vv32.w)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b100010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31827,7 +31970,7 @@ def V6_vgathermh : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32),
 "vtmp.h = vgather($Rt32,$Mu2,$Vv32.h).h",
-tc_66bb62ea, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> {
+tc_e8797b98, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> {
 let Inst{12-5} = 0b00001000;
 let Inst{31-21} = 0b00101111000;
 let hasNewValue = 1;
@@ -31843,7 +31986,7 @@ def V6_vgathermhq : HInst<
 (outs),
 (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32),
 "if ($Qs4) vtmp.h = vgather($Rt32,$Mu2,$Vv32.h).h",
-tc_63e3d94c, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> {
+tc_05ac6f98, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> {
 let Inst{12-7} = 0b001010;
 let Inst{31-21} = 0b00101111000;
 let hasNewValue = 1;
@@ -31859,7 +32002,7 @@ def V6_vgathermhw : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32),
 "vtmp.h = vgather($Rt32,$Mu2,$Vvv32.w).h",
-tc_bfe309d5, TypeCVI_GATHER>, Enc_28dcbb, Requires<[UseHVXV65]> {
+tc_05058f6f, TypeCVI_GATHER>, Enc_28dcbb, Requires<[UseHVXV65]> {
 let Inst{12-5} = 0b00010000;
 let Inst{31-21} = 0b00101111000;
 let hasNewValue = 1;
@@ -31875,7 +32018,7 @@ def V6_vgathermhwq : HInst<
 (outs),
 (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32),
 "if ($Qs4) vtmp.h = vgather($Rt32,$Mu2,$Vvv32.w).h",
-tc_98733e9d, TypeCVI_GATHER>, Enc_4e4a80, Requires<[UseHVXV65]> {
+tc_fd7610da, TypeCVI_GATHER>, Enc_4e4a80, Requires<[UseHVXV65]> {
 let Inst{12-7} = 0b001100;
 let Inst{31-21} = 0b00101111000;
 let hasNewValue = 1;
@@ -31891,7 +32034,7 @@ def V6_vgathermw : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32),
 "vtmp.w = vgather($Rt32,$Mu2,$Vv32.w).w",
-tc_66bb62ea, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> {
+tc_e8797b98, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> {
 let Inst{12-5} = 0b00000000;
 let Inst{31-21} = 0b00101111000;
 let hasNewValue = 1;
@@ -31907,7 +32050,7 @@ def V6_vgathermwq : HInst<
 (outs),
 (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32),
 "if ($Qs4) vtmp.w = vgather($Rt32,$Mu2,$Vv32.w).w",
-tc_63e3d94c, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> {
+tc_05ac6f98, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> {
 let Inst{12-7} = 0b001000;
 let Inst{31-21} = 0b00101111000;
 let hasNewValue = 1;
@@ -31923,7 +32066,7 @@ def V6_vgtb : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.gt($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -31935,7 +32078,7 @@ def V6_vgtb_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.gt($Vu32.b,$Vv32.b)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31946,7 +32089,7 @@ def V6_vgtb_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.gt($Vu32.b,$Vv32.b)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b010100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31958,7 +32101,7 @@ def V6_vgtb_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.gt($Vu32.b,$Vv32.b)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b100100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31969,7 +32112,7 @@ def V6_vgth : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.gt($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -31981,7 +32124,7 @@ def V6_vgth_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.gt($Vu32.h,$Vv32.h)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -31992,7 +32135,7 @@ def V6_vgth_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.gt($Vu32.h,$Vv32.h)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b010101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32004,7 +32147,7 @@ def V6_vgth_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.gt($Vu32.h,$Vv32.h)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b100101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32015,7 +32158,7 @@ def V6_vgtub : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.gt($Vu32.ub,$Vv32.ub)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -32027,7 +32170,7 @@ def V6_vgtub_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.gt($Vu32.ub,$Vv32.ub)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32038,7 +32181,7 @@ def V6_vgtub_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.gt($Vu32.ub,$Vv32.ub)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32050,7 +32193,7 @@ def V6_vgtub_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.gt($Vu32.ub,$Vv32.ub)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b101000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32061,7 +32204,7 @@ def V6_vgtuh : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.gt($Vu32.uh,$Vv32.uh)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b001001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -32073,7 +32216,7 @@ def V6_vgtuh_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.gt($Vu32.uh,$Vv32.uh)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b001001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32084,7 +32227,7 @@ def V6_vgtuh_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.gt($Vu32.uh,$Vv32.uh)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b011001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32096,7 +32239,7 @@ def V6_vgtuh_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.gt($Vu32.uh,$Vv32.uh)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b101001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32107,7 +32250,7 @@ def V6_vgtuw : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.gt($Vu32.uw,$Vv32.uw)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b001010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -32119,7 +32262,7 @@ def V6_vgtuw_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.gt($Vu32.uw,$Vv32.uw)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b001010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32130,7 +32273,7 @@ def V6_vgtuw_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.gt($Vu32.uw,$Vv32.uw)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b011010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32142,7 +32285,7 @@ def V6_vgtuw_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.gt($Vu32.uw,$Vv32.uw)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b101010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32153,7 +32296,7 @@ def V6_vgtw : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qd4 = vcmp.gt($Vu32.w,$Vv32.w)",
-tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111100;
@@ -32165,7 +32308,7 @@ def V6_vgtw_and : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 &= vcmp.gt($Vu32.w,$Vv32.w)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b000110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32176,7 +32319,7 @@ def V6_vgtw_or : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 |= vcmp.gt($Vu32.w,$Vv32.w)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b010110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32188,7 +32331,7 @@ def V6_vgtw_xor : HInst<
 (outs HvxQR:$Qx4),
 (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Qx4 ^= vcmp.gt($Vu32.w,$Vv32.w)",
-tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
 let Inst{7-2} = 0b100110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100100;
@@ -32199,7 +32342,7 @@ def V6_vhist : HInst<
 (outs),
 (ins),
 "vhist",
-tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV60]> {
+tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV60]> {
 let Inst{13-0} = 0b10000010000000;
 let Inst{31-16} = 0b0001111000000000;
 let DecoderNamespace = "EXT_mmvec";
@@ -32208,7 +32351,7 @@ def V6_vhistq : HInst<
 (outs),
 (ins HvxQR:$Qv4),
 "vhist($Qv4)",
-tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV60]> {
+tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV60]> {
 let Inst{13-0} = 0b10000010000000;
 let Inst{21-16} = 0b000010;
 let Inst{31-24} = 0b00011110;
@@ -32218,7 +32361,7 @@ def V6_vinsertwr : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, IntRegs:$Rt32),
 "$Vx32.w = vinsert($Rt32)",
-tc_e231aa4f, TypeCVI_VX_LATE>, Enc_569cfe, Requires<[UseHVXV60]> {
+tc_ac4046bc, TypeCVI_VX_LATE>, Enc_569cfe, Requires<[UseHVXV60]> {
 let Inst{13-5} = 0b100000001;
 let Inst{31-21} = 0b00011001101;
 let hasNewValue = 1;
@@ -32230,7 +32373,7 @@ def V6_vlalignb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32 = vlalign($Vu32,$Vv32,$Rt8)",
-tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_56e64202, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011011;
@@ -32242,7 +32385,7 @@ def V6_vlalignbi : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii),
 "$Vd32 = vlalign($Vu32,$Vv32,#$Ii)",
-tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV60]> {
+tc_56e64202, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV60]> {
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011110011;
 let hasNewValue = 1;
@@ -32253,7 +32396,7 @@ def V6_vlsrb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.ub = vlsr($Vu32.ub,$Rt32)",
-tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV62]> {
+tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001100;
@@ -32265,7 +32408,7 @@ def V6_vlsrh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.uh = vlsr($Vu32.uh,$Rt32)",
-tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001100;
@@ -32288,7 +32431,7 @@ def V6_vlsrhv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vlsr($Vu32.h,$Vv32.h)",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111101;
@@ -32311,7 +32454,7 @@ def V6_vlsrw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.uw = vlsr($Vu32.uw,$Rt32)",
-tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001100;
@@ -32334,7 +32477,7 @@ def V6_vlsrwv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vlsr($Vu32.w,$Vv32.w)",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111101;
@@ -32357,7 +32500,7 @@ def V6_vlut4 : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, DoubleRegs:$Rtt32),
 "$Vd32.h = vlut4($Vu32.uh,$Rtt32.h)",
-tc_fa99dc24, TypeCVI_VX_DV>, Enc_263841, Requires<[UseHVXV65]> {
+tc_f1de44ef, TypeCVI_VX_DV>, Enc_263841, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001011;
@@ -32369,7 +32512,7 @@ def V6_vlutvvb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8)",
-tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> {
+tc_56e64202, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011011;
@@ -32381,7 +32524,7 @@ def V6_vlutvvb_nm : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8):nomatch",
-tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV62]> {
+tc_56e64202, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011000;
@@ -32393,7 +32536,7 @@ def V6_vlutvvb_oracc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vx32.b |= vlut32($Vu32.b,$Vv32.b,$Rt8)",
-tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_245865, Requires<[UseHVXV60]> {
+tc_9d1dc972, TypeCVI_VP_VS>, Enc_245865, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011011;
@@ -32407,7 +32550,7 @@ def V6_vlutvvb_oracci : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii),
 "$Vx32.b |= vlut32($Vu32.b,$Vv32.b,#$Ii)",
-tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_cd4705, Requires<[UseHVXV62]> {
+tc_9d1dc972, TypeCVI_VP_VS>, Enc_cd4705, Requires<[UseHVXV62]> {
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100110;
 let hasNewValue = 1;
@@ -32420,7 +32563,7 @@ def V6_vlutvvbi : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii),
 "$Vd32.b = vlut32($Vu32.b,$Vv32.b,#$Ii)",
-tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV62]> {
+tc_56e64202, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV62]> {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110001;
 let hasNewValue = 1;
@@ -32431,7 +32574,7 @@ def V6_vlutvwh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8)",
-tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> {
+tc_87adc037, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011011;
@@ -32443,7 +32586,7 @@ def V6_vlutvwh_nm : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8):nomatch",
-tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV62]> {
+tc_87adc037, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-24} = 0b00011000;
@@ -32455,7 +32598,7 @@ def V6_vlutvwh_oracc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,$Rt8)",
-tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_7b523d, Requires<[UseHVXV60]> {
+tc_9d1dc972, TypeCVI_VP_VS>, Enc_7b523d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011011;
@@ -32469,7 +32612,7 @@ def V6_vlutvwh_oracci : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii),
 "$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,#$Ii)",
-tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_1178da, Requires<[UseHVXV62]> {
+tc_9d1dc972, TypeCVI_VP_VS>, Enc_1178da, Requires<[UseHVXV62]> {
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100111;
 let hasNewValue = 1;
@@ -32482,7 +32625,7 @@ def V6_vlutvwhi : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii),
 "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,#$Ii)",
-tc_4e2a5159, TypeCVI_VP_VS>, Enc_4b39e4, Requires<[UseHVXV62]> {
+tc_87adc037, TypeCVI_VP_VS>, Enc_4b39e4, Requires<[UseHVXV62]> {
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110011;
 let hasNewValue = 1;
@@ -32493,7 +32636,7 @@ def V6_vmaxb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vmax($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111001;
@@ -32516,7 +32659,7 @@ def V6_vmaxh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vmax($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111000;
@@ -32539,7 +32682,7 @@ def V6_vmaxub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vmax($Vu32.ub,$Vv32.ub)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111000;
@@ -32562,7 +32705,7 @@ def V6_vmaxuh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vmax($Vu32.uh,$Vv32.uh)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111000;
@@ -32585,7 +32728,7 @@ def V6_vmaxw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vmax($Vu32.w,$Vv32.w)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111001;
@@ -32608,7 +32751,7 @@ def V6_vminb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vmin($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111001;
@@ -32631,7 +32774,7 @@ def V6_vminh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vmin($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111000;
@@ -32654,7 +32797,7 @@ def V6_vminub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vmin($Vu32.ub,$Vv32.ub)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111000;
@@ -32677,7 +32820,7 @@ def V6_vminuh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vmin($Vu32.uh,$Vv32.uh)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111000;
@@ -32700,7 +32843,7 @@ def V6_vminw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vmin($Vu32.w,$Vv32.w)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111000;
@@ -32723,7 +32866,7 @@ def V6_vmpabus : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.h = vmpa($Vuu32.ub,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001001;
@@ -32735,7 +32878,7 @@ def V6_vmpabus_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.h += vmpa($Vuu32.ub,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001001;
@@ -32773,7 +32916,7 @@ def V6_vmpabusv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.b)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100001;
@@ -32796,7 +32939,7 @@ def V6_vmpabuu : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.h = vmpa($Vuu32.ub,$Rt32.ub)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV65]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001011;
@@ -32808,7 +32951,7 @@ def V6_vmpabuu_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.h += vmpa($Vuu32.ub,$Rt32.ub)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV65]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001101;
@@ -32846,7 +32989,7 @@ def V6_vmpabuuv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.ub)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100111;
@@ -32869,7 +33012,7 @@ def V6_vmpahb : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.w = vmpa($Vuu32.h,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001001;
@@ -32881,7 +33024,7 @@ def V6_vmpahb_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.w += vmpa($Vuu32.h,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001001;
@@ -32919,7 +33062,7 @@ def V6_vmpahhsat : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32),
 "$Vx32.h = vmpa($Vx32in.h,$Vu32.h,$Rtt32.h):sat",
-tc_7474003e, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> {
+tc_90bcc1db, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001100;
@@ -32932,7 +33075,7 @@ def V6_vmpauhb : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.w = vmpa($Vuu32.uh,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV62]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001100;
@@ -32944,7 +33087,7 @@ def V6_vmpauhb_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.w += vmpa($Vuu32.uh,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV62]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001100;
@@ -32982,7 +33125,7 @@ def V6_vmpauhuhsat : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32),
 "$Vx32.h = vmpa($Vx32in.h,$Vu32.uh,$Rtt32.uh):sat",
-tc_7474003e, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> {
+tc_90bcc1db, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001100;
@@ -32995,7 +33138,7 @@ def V6_vmpsuhuhsat : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32),
 "$Vx32.h = vmps($Vx32in.h,$Vu32.uh,$Rtt32.uh):sat",
-tc_7474003e, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> {
+tc_90bcc1db, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001100;
@@ -33008,7 +33151,7 @@ def V6_vmpybus : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vdd32.h = vmpy($Vu32.ub,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001001;
@@ -33020,7 +33163,7 @@ def V6_vmpybus_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vxx32.h += vmpy($Vu32.ub,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001001;
@@ -33058,7 +33201,7 @@ def V6_vmpybusv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.h = vmpy($Vu32.ub,$Vv32.b)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100000;
@@ -33070,7 +33213,7 @@ def V6_vmpybusv_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.h += vmpy($Vu32.ub,$Vv32.b)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100000;
@@ -33108,7 +33251,7 @@ def V6_vmpybv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.h = vmpy($Vu32.b,$Vv32.b)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100000;
@@ -33120,7 +33263,7 @@ def V6_vmpybv_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.h += vmpy($Vu32.b,$Vv32.b)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100000;
@@ -33158,7 +33301,7 @@ def V6_vmpyewuh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vmpye($Vu32.w,$Vv32.uh)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111111;
@@ -33170,7 +33313,7 @@ def V6_vmpyewuh_64 : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32 = vmpye($Vu32.w,$Vv32.uh)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV62]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110101;
@@ -33193,7 +33336,7 @@ def V6_vmpyh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vdd32.w = vmpy($Vu32.h,$Rt32.h)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001010;
@@ -33205,7 +33348,7 @@ def V6_vmpyh_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vxx32.w += vmpy($Vu32.h,$Rt32.h)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV65]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001101;
@@ -33243,7 +33386,7 @@ def V6_vmpyhsat_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vxx32.w += vmpy($Vu32.h,$Rt32.h):sat",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001010;
@@ -33270,7 +33413,7 @@ def V6_vmpyhsrs : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001010;
@@ -33293,7 +33436,7 @@ def V6_vmpyhss : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001010;
@@ -33316,7 +33459,7 @@ def V6_vmpyhus : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.w = vmpy($Vu32.h,$Vv32.uh)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100001;
@@ -33328,7 +33471,7 @@ def V6_vmpyhus_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.w += vmpy($Vu32.h,$Vv32.uh)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100001;
@@ -33366,7 +33509,7 @@ def V6_vmpyhv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.w = vmpy($Vu32.h,$Vv32.h)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100000;
@@ -33378,7 +33521,7 @@ def V6_vmpyhv_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.w += vmpy($Vu32.h,$Vv32.h)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100000;
@@ -33416,7 +33559,7 @@ def V6_vmpyhvsrs : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100001;
@@ -33439,7 +33582,7 @@ def V6_vmpyieoh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vmpyieo($Vu32.h,$Vv32.h)",
-tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111011;
@@ -33451,7 +33594,7 @@ def V6_vmpyiewh_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.w += vmpyie($Vu32.w,$Vv32.h)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100010;
@@ -33478,7 +33621,7 @@ def V6_vmpyiewuh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vmpyie($Vu32.w,$Vv32.uh)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111110;
@@ -33490,7 +33633,7 @@ def V6_vmpyiewuh_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.w += vmpyie($Vu32.w,$Vv32.uh)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100001;
@@ -33528,7 +33671,7 @@ def V6_vmpyih : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vmpyi($Vu32.h,$Vv32.h)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100001;
@@ -33540,7 +33683,7 @@ def V6_vmpyih_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.h += vmpyi($Vu32.h,$Vv32.h)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100001;
@@ -33578,7 +33721,7 @@ def V6_vmpyihb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.h = vmpyi($Vu32.h,$Rt32.b)",
-tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001011;
@@ -33590,7 +33733,7 @@ def V6_vmpyihb_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.h += vmpyi($Vu32.h,$Rt32.b)",
-tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001011;
@@ -33628,7 +33771,7 @@ def V6_vmpyiowh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vmpyio($Vu32.w,$Vv32.h)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111110;
@@ -33651,7 +33794,7 @@ def V6_vmpyiwb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vmpyi($Vu32.w,$Rt32.b)",
-tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001101;
@@ -33663,7 +33806,7 @@ def V6_vmpyiwb_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vmpyi($Vu32.w,$Rt32.b)",
-tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001010;
@@ -33701,7 +33844,7 @@ def V6_vmpyiwh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vmpyi($Vu32.w,$Rt32.h)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001100;
@@ -33713,7 +33856,7 @@ def V6_vmpyiwh_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vmpyi($Vu32.w,$Rt32.h)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001010;
@@ -33751,7 +33894,7 @@ def V6_vmpyiwub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vmpyi($Vu32.w,$Rt32.ub)",
-tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV62]> {
+tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001100;
@@ -33763,7 +33906,7 @@ def V6_vmpyiwub_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vmpyi($Vu32.w,$Rt32.ub)",
-tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV62]> {
+tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001100;
@@ -33801,7 +33944,7 @@ def V6_vmpyowh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:sat",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111111;
@@ -33813,7 +33956,7 @@ def V6_vmpyowh_64_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32 += vmpyo($Vu32.w,$Vv32.h)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100001;
@@ -33838,7 +33981,7 @@ def V6_vmpyowh_rnd : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111010;
@@ -33861,7 +34004,7 @@ def V6_vmpyowh_rnd_sacc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat:shift",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100001;
@@ -33887,7 +34030,7 @@ def V6_vmpyowh_sacc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:sat:shift",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100001;
@@ -33913,7 +34056,7 @@ def V6_vmpyub : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vdd32.uh = vmpy($Vu32.ub,$Rt32.ub)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001110;
@@ -33925,7 +34068,7 @@ def V6_vmpyub_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vxx32.uh += vmpy($Vu32.ub,$Rt32.ub)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001100;
@@ -33963,7 +34106,7 @@ def V6_vmpyubv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.uh = vmpy($Vu32.ub,$Vv32.ub)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100000;
@@ -33975,7 +34118,7 @@ def V6_vmpyubv_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.uh += vmpy($Vu32.ub,$Vv32.ub)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100000;
@@ -34013,7 +34156,7 @@ def V6_vmpyuh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vdd32.uw = vmpy($Vu32.uh,$Rt32.uh)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001010;
@@ -34025,7 +34168,7 @@ def V6_vmpyuh_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vxx32.uw += vmpy($Vu32.uh,$Rt32.uh)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001010;
@@ -34063,7 +34206,7 @@ def V6_vmpyuhe : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.uw = vmpye($Vu32.uh,$Rt32.uh)",
-tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV65]> {
+tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001011;
@@ -34075,7 +34218,7 @@ def V6_vmpyuhe_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.uw += vmpye($Vu32.uh,$Rt32.uh)",
-tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV65]> {
+tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001100;
@@ -34089,7 +34232,7 @@ def V6_vmpyuhv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.uw = vmpy($Vu32.uh,$Vv32.uh)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100001;
@@ -34101,7 +34244,7 @@ def V6_vmpyuhv_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vxx32.uw += vmpy($Vu32.uh,$Vv32.uh)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100001;
@@ -34139,7 +34282,7 @@ def V6_vmux : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxQR:$Qt4, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32 = vmux($Qt4,$Vu32,$Vv32)",
-tc_a3127e12, TypeCVI_VA>, Enc_31db33, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_31db33, Requires<[UseHVXV60]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011110111;
@@ -34151,7 +34294,7 @@ def V6_vnavgb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vnavg($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011111000;
@@ -34174,7 +34317,7 @@ def V6_vnavgh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vnavg($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100111;
@@ -34197,7 +34340,7 @@ def V6_vnavgub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vnavg($Vu32.ub,$Vv32.ub)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100111;
@@ -34220,7 +34363,7 @@ def V6_vnavgw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vnavg($Vu32.w,$Vv32.w)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100111;
@@ -34243,7 +34386,7 @@ def V6_vnccombine : HInst<
 (outs HvxWR:$Vdd32),
 (ins PredRegs:$Ps4, HvxVR:$Vu32, HvxVR:$Vv32),
 "if (!$Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
-tc_2171ebae, TypeCVI_VA_DV>, Enc_8c2412, Requires<[UseHVXV60]> {
+tc_af25efd9, TypeCVI_VA_DV>, Enc_8c2412, Requires<[UseHVXV60]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011010010;
@@ -34257,7 +34400,7 @@ def V6_vncmov : HInst<
 (outs HvxVR:$Vd32),
 (ins PredRegs:$Ps4, HvxVR:$Vu32),
 "if (!$Ps4) $Vd32 = $Vu32",
-tc_b06ab583, TypeCVI_VA>, Enc_770858, Requires<[UseHVXV60]> {
+tc_3aacf4a8, TypeCVI_VA>, Enc_770858, Requires<[UseHVXV60]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001101000100000;
@@ -34271,7 +34414,7 @@ def V6_vnormamth : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.h = vnormamt($Vu32.h)",
-tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000011;
@@ -34294,7 +34437,7 @@ def V6_vnormamtw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.w = vnormamt($Vu32.w)",
-tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000011;
@@ -34317,7 +34460,7 @@ def V6_vnot : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32 = vnot($Vu32)",
-tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000000;
@@ -34329,7 +34472,7 @@ def V6_vor : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32 = vor($Vu32,$Vv32)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100001;
@@ -34341,7 +34484,7 @@ def V6_vpackeb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vpacke($Vu32.h,$Vv32.h)",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111110;
@@ -34364,7 +34507,7 @@ def V6_vpackeh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vpacke($Vu32.w,$Vv32.w)",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111110;
@@ -34387,7 +34530,7 @@ def V6_vpackhb_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vpack($Vu32.h,$Vv32.h):sat",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111110;
@@ -34410,7 +34553,7 @@ def V6_vpackhub_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vpack($Vu32.h,$Vv32.h):sat",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111110;
@@ -34433,7 +34576,7 @@ def V6_vpackob : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vpacko($Vu32.h,$Vv32.h)",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111111;
@@ -34456,7 +34599,7 @@ def V6_vpackoh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vpacko($Vu32.w,$Vv32.w)",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111111;
@@ -34479,7 +34622,7 @@ def V6_vpackwh_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vpack($Vu32.w,$Vv32.w):sat",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111111;
@@ -34502,7 +34645,7 @@ def V6_vpackwuh_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vpack($Vu32.w,$Vv32.w):sat",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111110;
@@ -34525,7 +34668,7 @@ def V6_vpopcounth : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.h = vpopcount($Vu32.h)",
-tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000010;
@@ -34548,7 +34691,7 @@ def V6_vprefixqb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxQR:$Qv4),
 "$Vd32.b = prefixsum($Qv4)",
-tc_d2cb81ea, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> {
+tc_51d0ecc3, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> {
 let Inst{13-5} = 0b100000010;
 let Inst{21-16} = 0b000011;
 let Inst{31-24} = 0b00011110;
@@ -34560,7 +34703,7 @@ def V6_vprefixqh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxQR:$Qv4),
 "$Vd32.h = prefixsum($Qv4)",
-tc_d2cb81ea, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> {
+tc_51d0ecc3, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> {
 let Inst{13-5} = 0b100001010;
 let Inst{21-16} = 0b000011;
 let Inst{31-24} = 0b00011110;
@@ -34572,7 +34715,7 @@ def V6_vprefixqw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxQR:$Qv4),
 "$Vd32.w = prefixsum($Qv4)",
-tc_d2cb81ea, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> {
+tc_51d0ecc3, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> {
 let Inst{13-5} = 0b100010010;
 let Inst{21-16} = 0b000011;
 let Inst{31-24} = 0b00011110;
@@ -34584,7 +34727,7 @@ def V6_vrdelta : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32 = vrdelta($Vu32,$Vv32)",
-tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111001;
@@ -34596,7 +34739,7 @@ def V6_vrmpybub_rtt : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, DoubleRegs:$Rtt32),
 "$Vdd32.w = vrmpy($Vu32.b,$Rtt32.ub)",
-tc_a807365d, TypeCVI_VS_VX>, Enc_cb785b, Requires<[UseHVXV65]> {
+tc_cd94bfe0, TypeCVI_VS_VX>, Enc_cb785b, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001110;
@@ -34608,7 +34751,7 @@ def V6_vrmpybub_rtt_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32),
 "$Vxx32.w += vrmpy($Vu32.b,$Rtt32.ub)",
-tc_ee927c0e, TypeCVI_VS_VX>, Enc_ad9bef, Requires<[UseHVXV65]> {
+tc_15fdf750, TypeCVI_VS_VX>, Enc_ad9bef, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001101;
@@ -34646,7 +34789,7 @@ def V6_vrmpybus : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.w = vrmpy($Vu32.ub,$Rt32.b)",
-tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001000;
@@ -34658,7 +34801,7 @@ def V6_vrmpybus_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.w += vrmpy($Vu32.ub,$Rt32.b)",
-tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001000;
@@ -34696,7 +34839,7 @@ def V6_vrmpybusi : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
 "$Vdd32.w = vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
-tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> {
+tc_1ad8a370, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> {
 let Inst{7-6} = 0b10;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001010;
@@ -34708,7 +34851,7 @@ def V6_vrmpybusi_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
 "$Vxx32.w += vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
-tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> {
+tc_e675c45a, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> {
 let Inst{7-6} = 0b10;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001010;
@@ -34746,7 +34889,7 @@ def V6_vrmpybusv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vrmpy($Vu32.ub,$Vv32.b)",
-tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100000;
@@ -34758,7 +34901,7 @@ def V6_vrmpybusv_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100000;
@@ -34796,7 +34939,7 @@ def V6_vrmpybv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vrmpy($Vu32.b,$Vv32.b)",
-tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100000;
@@ -34808,7 +34951,7 @@ def V6_vrmpybv_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.w += vrmpy($Vu32.b,$Vv32.b)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100000;
@@ -34846,7 +34989,7 @@ def V6_vrmpyub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32.uw = vrmpy($Vu32.ub,$Rt32.ub)",
-tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001000;
@@ -34858,7 +35001,7 @@ def V6_vrmpyub_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vx32.uw += vrmpy($Vu32.ub,$Rt32.ub)",
-tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
+tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001000;
@@ -34896,7 +35039,7 @@ def V6_vrmpyub_rtt : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, DoubleRegs:$Rtt32),
 "$Vdd32.uw = vrmpy($Vu32.ub,$Rtt32.ub)",
-tc_a807365d, TypeCVI_VS_VX>, Enc_cb785b, Requires<[UseHVXV65]> {
+tc_cd94bfe0, TypeCVI_VS_VX>, Enc_cb785b, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001110;
@@ -34908,7 +35051,7 @@ def V6_vrmpyub_rtt_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32),
 "$Vxx32.uw += vrmpy($Vu32.ub,$Rtt32.ub)",
-tc_ee927c0e, TypeCVI_VS_VX>, Enc_ad9bef, Requires<[UseHVXV65]> {
+tc_15fdf750, TypeCVI_VS_VX>, Enc_ad9bef, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001101;
@@ -34946,7 +35089,7 @@ def V6_vrmpyubi : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
 "$Vdd32.uw = vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
-tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> {
+tc_1ad8a370, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> {
 let Inst{7-6} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001101;
@@ -34958,7 +35101,7 @@ def V6_vrmpyubi_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
 "$Vxx32.uw += vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
-tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> {
+tc_e675c45a, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> {
 let Inst{7-6} = 0b11;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001011;
@@ -34996,7 +35139,7 @@ def V6_vrmpyubv : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uw = vrmpy($Vu32.ub,$Vv32.ub)",
-tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100000;
@@ -35008,7 +35151,7 @@ def V6_vrmpyubv_acc : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)",
-tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
+tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100000;
@@ -35042,11 +35185,276 @@ let isPseudo = 1;
 let isCodeGenOnly = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vrmpyzbb_rt : HInst<
+(outs HvxVQR:$Vdddd32),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vdddd32.w = vrmpyz($Vu32.b,$Rt8.b)",
+tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyzbb_rt_acc : HInst<
+(outs HvxVQR:$Vyyyy32),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vyyyy32.w += vrmpyz($Vu32.b,$Rt8.b)",
+tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in";
+}
+def V6_vrmpyzbb_rx : HInst<
+(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
+(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vdddd32.w = vrmpyz($Vu32.b,$Rx8.b++)",
+tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx8 = $Rx8in";
+}
+def V6_vrmpyzbb_rx_acc : HInst<
+(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vyyyy32.w += vrmpyz($Vu32.b,$Rx8.b++)",
+tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
+}
+def V6_vrmpyzbub_rt : HInst<
+(outs HvxVQR:$Vdddd32),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vdddd32.w = vrmpyz($Vu32.b,$Rt8.ub)",
+tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyzbub_rt_acc : HInst<
+(outs HvxVQR:$Vyyyy32),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vyyyy32.w += vrmpyz($Vu32.b,$Rt8.ub)",
+tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in";
+}
+def V6_vrmpyzbub_rx : HInst<
+(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
+(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vdddd32.w = vrmpyz($Vu32.b,$Rx8.ub++)",
+tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx8 = $Rx8in";
+}
+def V6_vrmpyzbub_rx_acc : HInst<
+(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vyyyy32.w += vrmpyz($Vu32.b,$Rx8.ub++)",
+tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
+}
+def V6_vrmpyzcb_rt : HInst<
+(outs HvxVQR:$Vdddd32),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vdddd32.w = vr16mpyz($Vu32.c,$Rt8.b)",
+tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyzcb_rt_acc : HInst<
+(outs HvxVQR:$Vyyyy32),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vyyyy32.w += vr16mpyz($Vu32.c,$Rt8.b)",
+tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in";
+}
+def V6_vrmpyzcb_rx : HInst<
+(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
+(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vdddd32.w = vr16mpyz($Vu32.c,$Rx8.b++)",
+tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx8 = $Rx8in";
+}
+def V6_vrmpyzcb_rx_acc : HInst<
+(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vyyyy32.w += vr16mpyz($Vu32.c,$Rx8.b++)",
+tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
+}
+def V6_vrmpyzcbs_rt : HInst<
+(outs HvxVQR:$Vdddd32),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vdddd32.w = vr16mpyzs($Vu32.c,$Rt8.b)",
+tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyzcbs_rt_acc : HInst<
+(outs HvxVQR:$Vyyyy32),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vyyyy32.w += vr16mpyzs($Vu32.c,$Rt8.b)",
+tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in";
+}
+def V6_vrmpyzcbs_rx : HInst<
+(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
+(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vdddd32.w = vr16mpyzs($Vu32.c,$Rx8.b++)",
+tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx8 = $Rx8in";
+}
+def V6_vrmpyzcbs_rx_acc : HInst<
+(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vyyyy32.w += vr16mpyzs($Vu32.c,$Rx8.b++)",
+tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
+}
+def V6_vrmpyznb_rt : HInst<
+(outs HvxVQR:$Vdddd32),
+(ins HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vdddd32.w = vr8mpyz($Vu32.n,$Rt8.b)",
+tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyznb_rt_acc : HInst<
+(outs HvxVQR:$Vyyyy32),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8),
+"$Vyyyy32.w += vr8mpyz($Vu32.n,$Rt8.b)",
+tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in";
+}
+def V6_vrmpyznb_rx : HInst<
+(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8),
+(ins HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vdddd32.w = vr8mpyz($Vu32.n,$Rx8.b++)",
+tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-19} = 0b0001100111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx8 = $Rx8in";
+}
+def V6_vrmpyznb_rx_acc : HInst<
+(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8),
+(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in),
+"$Vyyyy32.w += vr8mpyz($Vu32.n,$Rx8.b++)",
+tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-19} = 0b0001100111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
+}
 def V6_vror : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, IntRegs:$Rt32),
 "$Vd32 = vror($Vu32,$Rt32)",
-tc_bf142ae2, TypeCVI_VP>, Enc_b087ac, Requires<[UseHVXV60]> {
+tc_6e7fa133, TypeCVI_VP>, Enc_b087ac, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001011;
@@ -35054,11 +35462,34 @@ let hasNewValue = 1;
 let opNewValue = 0;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vrotr : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32.uw = vrotr($Vu32.uw,$Vv32.uw)",
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV66]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011010100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrotr_alt : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32 = vrotr($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vroundhb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vround($Vu32.h,$Vv32.h):sat",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111011;
@@ -35081,7 +35512,7 @@ def V6_vroundhub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vround($Vu32.h,$Vv32.h):sat",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111011;
@@ -35104,7 +35535,7 @@ def V6_vrounduhub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vround($Vu32.uh,$Vv32.uh):sat",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111111;
@@ -35127,7 +35558,7 @@ def V6_vrounduwuh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vround($Vu32.uw,$Vv32.uw):sat",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111111;
@@ -35150,7 +35581,7 @@ def V6_vroundwh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vround($Vu32.w,$Vv32.w):sat",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111011;
@@ -35173,7 +35604,7 @@ def V6_vroundwuh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vround($Vu32.w,$Vv32.w):sat",
-tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111011;
@@ -35196,7 +35627,7 @@ def V6_vrsadubi : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
 "$Vdd32.uw = vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
-tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> {
+tc_1ad8a370, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> {
 let Inst{7-6} = 0b11;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001010;
@@ -35208,7 +35639,7 @@ def V6_vrsadubi_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
 "$Vxx32.uw += vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
-tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> {
+tc_e675c45a, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> {
 let Inst{7-6} = 0b11;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001010;
@@ -35242,11 +35673,23 @@ let isPseudo = 1;
 let isCodeGenOnly = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vsatdw : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32.w = vsatdw($Vu32.w,$Vv32.w)",
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV66]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vsathub : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vsat($Vu32.h,$Vv32.h)",
-tc_9b9642a1, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_8772086c, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111011;
@@ -35269,7 +35712,7 @@ def V6_vsatuwuh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vsat($Vu32.uw,$Vv32.uw)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111001;
@@ -35292,7 +35735,7 @@ def V6_vsatwh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vsat($Vu32.w,$Vv32.w)",
-tc_9b9642a1, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_8772086c, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111011;
@@ -35315,7 +35758,7 @@ def V6_vsb : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32),
 "$Vdd32.h = vsxt($Vu32.b)",
-tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> {
+tc_b4416217, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000010;
@@ -35338,7 +35781,7 @@ def V6_vscattermh : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
 "vscatter($Rt32,$Mu2,$Vv32.h).h = $Vw32",
-tc_4f190ba3, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
+tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b001;
 let Inst{31-21} = 0b00101111001;
 let accessSize = HalfWordAccess;
@@ -35349,7 +35792,7 @@ def V6_vscattermh_add : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
 "vscatter($Rt32,$Mu2,$Vv32.h).h += $Vw32",
-tc_4f190ba3, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
+tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b101;
 let Inst{31-21} = 0b00101111001;
 let accessSize = HalfWordAccess;
@@ -35380,7 +35823,7 @@ def V6_vscattermhq : HInst<
 (outs),
 (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
 "if ($Qs4) vscatter($Rt32,$Mu2,$Vv32.h).h = $Vw32",
-tc_df54ad52, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> {
+tc_8e420e4d, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> {
 let Inst{7-7} = 0b1;
 let Inst{31-21} = 0b00101111100;
 let accessSize = HalfWordAccess;
@@ -35400,7 +35843,7 @@ def V6_vscattermhw : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32, HvxVR:$Vw32),
 "vscatter($Rt32,$Mu2,$Vvv32.w).h = $Vw32",
-tc_ec58f88a, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> {
+tc_7273323b, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b010;
 let Inst{31-21} = 0b00101111001;
 let accessSize = HalfWordAccess;
@@ -35411,7 +35854,7 @@ def V6_vscattermhw_add : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32, HvxVR:$Vw32),
 "vscatter($Rt32,$Mu2,$Vvv32.w).h += $Vw32",
-tc_ec58f88a, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> {
+tc_7273323b, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b110;
 let Inst{31-21} = 0b00101111001;
 let accessSize = HalfWordAccess;
@@ -35423,7 +35866,7 @@ def V6_vscattermhwq : HInst<
 (outs),
 (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32, HvxVR:$Vw32),
 "if ($Qs4) vscatter($Rt32,$Mu2,$Vvv32.w).h = $Vw32",
-tc_94f43c04, TypeCVI_SCATTER_DV>, Enc_3d6d37, Requires<[UseHVXV65]> {
+tc_58d21193, TypeCVI_SCATTER_DV>, Enc_3d6d37, Requires<[UseHVXV65]> {
 let Inst{7-7} = 0b0;
 let Inst{31-21} = 0b00101111101;
 let accessSize = HalfWordAccess;
@@ -35434,7 +35877,7 @@ def V6_vscattermw : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
 "vscatter($Rt32,$Mu2,$Vv32.w).w = $Vw32",
-tc_4f190ba3, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
+tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b000;
 let Inst{31-21} = 0b00101111001;
 let accessSize = WordAccess;
@@ -35445,7 +35888,7 @@ def V6_vscattermw_add : HInst<
 (outs),
 (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
 "vscatter($Rt32,$Mu2,$Vv32.w).w += $Vw32",
-tc_4f190ba3, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
+tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
 let Inst{7-5} = 0b100;
 let Inst{31-21} = 0b00101111001;
 let accessSize = WordAccess;
@@ -35504,7 +35947,7 @@ def V6_vscattermwq : HInst<
 (outs),
 (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
 "if ($Qs4) vscatter($Rt32,$Mu2,$Vv32.w).w = $Vw32",
-tc_df54ad52, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> {
+tc_8e420e4d, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> {
 let Inst{7-7} = 0b0;
 let Inst{31-21} = 0b00101111100;
 let accessSize = WordAccess;
@@ -35524,7 +35967,7 @@ def V6_vsh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32),
 "$Vdd32.w = vsxt($Vu32.h)",
-tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> {
+tc_b4416217, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000010;
@@ -35547,7 +35990,7 @@ def V6_vshufeh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vshuffe($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111010;
@@ -35570,7 +36013,7 @@ def V6_vshuff : HInst<
 (outs HvxVR:$Vy32, HvxVR:$Vx32),
 (ins HvxVR:$Vy32in, HvxVR:$Vx32in, IntRegs:$Rt32),
 "vshuff($Vy32,$Vx32,$Rt32)",
-tc_5c120602, TypeCVI_VP_VS>, Enc_989021, Requires<[UseHVXV60]> {
+tc_561aaa58, TypeCVI_VP_VS>, Enc_989021, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001111;
@@ -35585,7 +36028,7 @@ def V6_vshuffb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.b = vshuff($Vu32.b)",
-tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_946013d8, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000010;
@@ -35608,7 +36051,7 @@ def V6_vshuffeb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vshuffe($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111010;
@@ -35631,7 +36074,7 @@ def V6_vshuffh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
 "$Vd32.h = vshuff($Vu32.h)",
-tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> {
+tc_946013d8, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000001;
@@ -35654,7 +36097,7 @@ def V6_vshuffob : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vshuffo($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111010;
@@ -35677,7 +36120,7 @@ def V6_vshuffvdd : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
 "$Vdd32 = vshuff($Vu32,$Vv32,$Rt8)",
-tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> {
+tc_87adc037, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{31-24} = 0b00011011;
@@ -35689,7 +36132,7 @@ def V6_vshufoeb : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.b = vshuffoe($Vu32.b,$Vv32.b)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111010;
@@ -35712,7 +36155,7 @@ def V6_vshufoeh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.h = vshuffoe($Vu32.h,$Vv32.h)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111010;
@@ -35735,7 +36178,7 @@ def V6_vshufoh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vshuffo($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111010;
@@ -35758,7 +36201,7 @@ def V6_vsubb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vsub($Vu32.b,$Vv32.b)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100010;
@@ -35781,7 +36224,7 @@ def V6_vsubb_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.b = vsub($Vuu32.b,$Vvv32.b)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100100;
@@ -35804,7 +36247,7 @@ def V6_vsubbnq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if (!$Qv4) $Vx32.b -= $Vu32.b",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000010;
@@ -35830,7 +36273,7 @@ def V6_vsubbq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if ($Qv4) $Vx32.b -= $Vu32.b",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000001;
@@ -35856,7 +36299,7 @@ def V6_vsubbsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.b = vsub($Vu32.b,$Vv32.b):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111001;
@@ -35879,7 +36322,7 @@ def V6_vsubbsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.b = vsub($Vuu32.b,$Vvv32.b):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110101;
@@ -35902,7 +36345,7 @@ def V6_vsubcarry : HInst<
 (outs HvxVR:$Vd32, HvxQR:$Qx4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, HvxQR:$Qx4in),
 "$Vd32.w = vsub($Vu32.w,$Vv32.w,$Qx4):carry",
-tc_5a9fc4ec, TypeCVI_VA>, Enc_b43b67, Requires<[UseHVXV62]> {
+tc_7e6a3e89, TypeCVI_VA>, Enc_b43b67, Requires<[UseHVXV62]> {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011100101;
@@ -35911,11 +36354,25 @@ let opNewValue = 0;
 let DecoderNamespace = "EXT_mmvec";
 let Constraints = "$Qx4 = $Qx4in";
 }
+def V6_vsubcarryo : HInst<
+(outs HvxVR:$Vd32, HvxQR:$Qe4),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32.w,$Qe4 = vsub($Vu32.w,$Vv32.w):carry",
+tc_e35c1e93, TypeCOPROC_VX>, Enc_c1d806, Requires<[UseHVXV66]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vsubh : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vsub($Vu32.h,$Vv32.h)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100010;
@@ -35938,7 +36395,7 @@ def V6_vsubh_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.h = vsub($Vuu32.h,$Vvv32.h)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100100;
@@ -35961,7 +36418,7 @@ def V6_vsubhnq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if (!$Qv4) $Vx32.h -= $Vu32.h",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000010;
@@ -35987,7 +36444,7 @@ def V6_vsubhq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if ($Qv4) $Vx32.h -= $Vu32.h",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000001;
@@ -36013,7 +36470,7 @@ def V6_vsubhsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.h = vsub($Vu32.h,$Vv32.h):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100011;
@@ -36036,7 +36493,7 @@ def V6_vsubhsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.h = vsub($Vuu32.h,$Vvv32.h):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100101;
@@ -36059,7 +36516,7 @@ def V6_vsubhw : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.w = vsub($Vu32.h,$Vv32.h)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100101;
@@ -36082,7 +36539,7 @@ def V6_vsububh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.h = vsub($Vu32.ub,$Vv32.ub)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100101;
@@ -36105,7 +36562,7 @@ def V6_vsububsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vsub($Vu32.ub,$Vv32.ub):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100011;
@@ -36128,7 +36585,7 @@ def V6_vsububsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.ub = vsub($Vuu32.ub,$Vvv32.ub):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100100;
@@ -36151,7 +36608,7 @@ def V6_vsubububb_sat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.ub = vsub($Vu32.ub,$Vv32.b):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110101;
@@ -36163,7 +36620,7 @@ def V6_vsubuhsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uh = vsub($Vu32.uh,$Vv32.uh):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100011;
@@ -36186,7 +36643,7 @@ def V6_vsubuhsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.uh = vsub($Vuu32.uh,$Vvv32.uh):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100100;
@@ -36209,7 +36666,7 @@ def V6_vsubuhw : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32.w = vsub($Vu32.uh,$Vv32.uh)",
-tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100101;
@@ -36232,7 +36689,7 @@ def V6_vsubuwsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.uw = vsub($Vu32.uw,$Vv32.uw):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011111110;
@@ -36255,7 +36712,7 @@ def V6_vsubuwsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.uw = vsub($Vuu32.uw,$Vvv32.uw):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011110101;
@@ -36278,7 +36735,7 @@ def V6_vsubw : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vsub($Vu32.w,$Vv32.w)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100010;
@@ -36301,7 +36758,7 @@ def V6_vsubw_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.w = vsub($Vuu32.w,$Vvv32.w)",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100100;
@@ -36324,7 +36781,7 @@ def V6_vsubwnq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if (!$Qv4) $Vx32.w -= $Vu32.w",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000010;
@@ -36350,7 +36807,7 @@ def V6_vsubwq : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32),
 "if ($Qv4) $Vx32.w -= $Vu32.w",
-tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
+tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{21-16} = 0b000010;
@@ -36376,7 +36833,7 @@ def V6_vsubwsat : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32.w = vsub($Vu32.w,$Vv32.w):sat",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100011;
@@ -36399,7 +36856,7 @@ def V6_vsubwsat_dv : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, HvxWR:$Vvv32),
 "$Vdd32.w = vsub($Vuu32.w,$Vvv32.w):sat",
-tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
+tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100101;
@@ -36422,7 +36879,7 @@ def V6_vswap : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxQR:$Qt4, HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vdd32 = vswap($Qt4,$Vu32,$Vv32)",
-tc_316c637c, TypeCVI_VA_DV>, Enc_3dac0b, Requires<[UseHVXV60]> {
+tc_71646d06, TypeCVI_VA_DV>, Enc_3dac0b, Requires<[UseHVXV60]> {
 let Inst{7-7} = 0b0;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011110101;
@@ -36434,7 +36891,7 @@ def V6_vtmpyb : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.h = vtmpy($Vuu32.b,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001000;
@@ -36446,7 +36903,7 @@ def V6_vtmpyb_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.h += vtmpy($Vuu32.b,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001000;
@@ -36484,7 +36941,7 @@ def V6_vtmpybus : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.h = vtmpy($Vuu32.ub,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001000;
@@ -36496,7 +36953,7 @@ def V6_vtmpybus_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.h += vtmpy($Vuu32.ub,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001000;
@@ -36534,7 +36991,7 @@ def V6_vtmpyhb : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vdd32.w = vtmpy($Vuu32.h,$Rt32.b)",
-tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
+tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011001101;
@@ -36546,7 +37003,7 @@ def V6_vtmpyhb_acc : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32),
 "$Vxx32.w += vtmpy($Vuu32.h,$Rt32.b)",
-tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
+tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b1;
 let Inst{31-21} = 0b00011001000;
@@ -36598,7 +37055,7 @@ def V6_vunpackb : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32),
 "$Vdd32.h = vunpack($Vu32.b)",
-tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> {
+tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000001;
@@ -36621,7 +37078,7 @@ def V6_vunpackh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32),
 "$Vdd32.w = vunpack($Vu32.h)",
-tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> {
+tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b011;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000001;
@@ -36644,7 +37101,7 @@ def V6_vunpackob : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32),
 "$Vxx32.h |= vunpacko($Vu32.b)",
-tc_72ad7b54, TypeCVI_VP_VS>, Enc_500cb0, Requires<[UseHVXV60]> {
+tc_2c745bb8, TypeCVI_VP_VS>, Enc_500cb0, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b1;
 let Inst{31-16} = 0b0001111000000000;
@@ -36670,7 +37127,7 @@ def V6_vunpackoh : HInst<
 (outs HvxWR:$Vxx32),
 (ins HvxWR:$Vxx32in, HvxVR:$Vu32),
 "$Vxx32.w |= vunpacko($Vu32.h)",
-tc_72ad7b54, TypeCVI_VP_VS>, Enc_500cb0, Requires<[UseHVXV60]> {
+tc_2c745bb8, TypeCVI_VP_VS>, Enc_500cb0, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b1;
 let Inst{31-16} = 0b0001111000000000;
@@ -36697,7 +37154,7 @@ def V6_vunpackub : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32),
 "$Vdd32.uh = vunpack($Vu32.ub)",
-tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> {
+tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000001;
@@ -36720,7 +37177,7 @@ def V6_vunpackuh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32),
 "$Vdd32.uw = vunpack($Vu32.uh)",
-tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> {
+tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000001;
@@ -36743,7 +37200,7 @@ def V6_vwhist128 : HInst<
 (outs),
 (ins),
 "vwhist128",
-tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
+tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
 let Inst{13-0} = 0b10010010000000;
 let Inst{31-16} = 0b0001111000000000;
 let DecoderNamespace = "EXT_mmvec";
@@ -36752,7 +37209,7 @@ def V6_vwhist128m : HInst<
 (outs),
 (ins u1_0Imm:$Ii),
 "vwhist128(#$Ii)",
-tc_b77635b4, TypeCVI_HIST>, Enc_efaed8, Requires<[UseHVXV62]> {
+tc_b28e51aa, TypeCVI_HIST>, Enc_efaed8, Requires<[UseHVXV62]> {
 let Inst{7-0} = 0b10000000;
 let Inst{13-9} = 0b10011;
 let Inst{31-16} = 0b0001111000000000;
@@ -36762,7 +37219,7 @@ def V6_vwhist128q : HInst<
 (outs),
 (ins HvxQR:$Qv4),
 "vwhist128($Qv4)",
-tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
+tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
 let Inst{13-0} = 0b10010010000000;
 let Inst{21-16} = 0b000010;
 let Inst{31-24} = 0b00011110;
@@ -36772,7 +37229,7 @@ def V6_vwhist128qm : HInst<
 (outs),
 (ins HvxQR:$Qv4, u1_0Imm:$Ii),
 "vwhist128($Qv4,#$Ii)",
-tc_28978789, TypeCVI_HIST>, Enc_802dc0, Requires<[UseHVXV62]> {
+tc_767c4e9d, TypeCVI_HIST>, Enc_802dc0, Requires<[UseHVXV62]> {
 let Inst{7-0} = 0b10000000;
 let Inst{13-9} = 0b10011;
 let Inst{21-16} = 0b000010;
@@ -36783,7 +37240,7 @@ def V6_vwhist256 : HInst<
 (outs),
 (ins),
 "vwhist256",
-tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
+tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
 let Inst{13-0} = 0b10001010000000;
 let Inst{31-16} = 0b0001111000000000;
 let DecoderNamespace = "EXT_mmvec";
@@ -36792,7 +37249,7 @@ def V6_vwhist256_sat : HInst<
 (outs),
 (ins),
 "vwhist256:sat",
-tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
+tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
 let Inst{13-0} = 0b10001110000000;
 let Inst{31-16} = 0b0001111000000000;
 let DecoderNamespace = "EXT_mmvec";
@@ -36801,7 +37258,7 @@ def V6_vwhist256q : HInst<
 (outs),
 (ins HvxQR:$Qv4),
 "vwhist256($Qv4)",
-tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
+tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
 let Inst{13-0} = 0b10001010000000;
 let Inst{21-16} = 0b000010;
 let Inst{31-24} = 0b00011110;
@@ -36811,7 +37268,7 @@ def V6_vwhist256q_sat : HInst<
 (outs),
 (ins HvxQR:$Qv4),
 "vwhist256($Qv4):sat",
-tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
+tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
 let Inst{13-0} = 0b10001110000000;
 let Inst{21-16} = 0b000010;
 let Inst{31-24} = 0b00011110;
@@ -36821,7 +37278,7 @@ def V6_vxor : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
 "$Vd32 = vxor($Vu32,$Vv32)",
-tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b00011100001;
@@ -36833,7 +37290,7 @@ def V6_vzb : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32),
 "$Vdd32.uh = vzxt($Vu32.ub)",
-tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> {
+tc_b4416217, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000010;
@@ -36856,7 +37313,7 @@ def V6_vzh : HInst<
 (outs HvxWR:$Vdd32),
 (ins HvxVR:$Vu32),
 "$Vdd32.uw = vzxt($Vu32.uh)",
-tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> {
+tc_b4416217, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> {
 let Inst{7-5} = 0b010;
 let Inst{13-13} = 0b0;
 let Inst{31-16} = 0b0001111000000010;
@@ -36875,11 +37332,122 @@ let isPseudo = 1;
 let isCodeGenOnly = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_zLd_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"z = vmem($Rt32+#$Ii)",
+tc_e699ae41, TypeCVI_ZW>, Enc_ff3442, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101100000;
+let addrMode = BaseImmOffset;
+let mayLoad = 1;
+let isRestrictNoSlot1Store = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_zLd_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"z = vmem($Rx32++#$Ii)",
+tc_a0dbea28, TypeCVI_ZW>, Enc_6c9ee0, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-0} = 0b00000000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101101000;
+let addrMode = PostInc;
+let mayLoad = 1;
+let isRestrictNoSlot1Store = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_zLd_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"z = vmem($Rx32++$Mu2)",
+tc_a0dbea28, TypeCVI_ZW>, Enc_44661f, Requires<[UseHVXV66,UseZReg]> {
+let Inst{12-0} = 0b0000000000001;
+let Inst{31-21} = 0b00101101000;
+let addrMode = PostInc;
+let mayLoad = 1;
+let isRestrictNoSlot1Store = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_zLd_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) z = vmem($Rt32+#$Ii)",
+tc_dd5b0695, TypeCVI_ZW>, Enc_ef601b, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b00101100100;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let mayLoad = 1;
+let isRestrictNoSlot1Store = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_zLd_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) z = vmem($Rx32++#$Ii)",
+tc_3ad719fb, TypeCVI_ZW>, Enc_6baed4, Requires<[UseHVXV66,UseZReg]> {
+let Inst{7-0} = 0b00000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101101100;
+let isPredicated = 1;
+let addrMode = PostInc;
+let mayLoad = 1;
+let isRestrictNoSlot1Store = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_zLd_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) z = vmem($Rx32++$Mu2)",
+tc_3ad719fb, TypeCVI_ZW>, Enc_691712, Requires<[UseHVXV66,UseZReg]> {
+let Inst{10-0} = 0b00000000001;
+let Inst{31-21} = 0b00101101100;
+let isPredicated = 1;
+let addrMode = PostInc;
+let mayLoad = 1;
+let isRestrictNoSlot1Store = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_zextract : HInst<
+(outs HvxVR:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = zextract($Rt32)",
+tc_5bf8afbb, TypeCVI_VP>, Enc_a5ed8a, Requires<[UseHVXV66,UseZReg]> {
+let Inst{13-5} = 0b000001001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_zld0 : HInst<
+(outs),
+(ins IntRegs:$Rt32),
+"z = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_zldp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) z = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def Y2_barrier : HInst<
 (outs),
 (ins),
 "barrier",
-tc_367f7f3d, TypeST>, Enc_e3b0c4 {
+tc_8c99de45, TypeST>, Enc_e3b0c4 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-16} = 0b1010100000000000;
 let isSoloAX = 1;
@@ -36889,7 +37457,7 @@ def Y2_break : HInst<
 (outs),
 (ins),
 "brkpt",
-tc_4ca572d4, TypeCR>, Enc_e3b0c4 {
+tc_9ad9998f, TypeCR>, Enc_e3b0c4 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-16} = 0b0110110000100000;
 let isSolo = 1;
@@ -36898,7 +37466,7 @@ def Y2_dccleana : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "dccleana($Rs32)",
-tc_00e7c26e, TypeST>, Enc_ecbcc8 {
+tc_b857bf4e, TypeST>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b10100000000;
 let isRestrictSlot1AOK = 1;
@@ -36908,7 +37476,7 @@ def Y2_dccleaninva : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "dccleaninva($Rs32)",
-tc_00e7c26e, TypeST>, Enc_ecbcc8 {
+tc_b857bf4e, TypeST>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b10100000010;
 let isRestrictSlot1AOK = 1;
@@ -36918,7 +37486,7 @@ def Y2_dcfetch : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "dcfetch($Rs32)",
-tc_3da80ba5, TypeMAPPING> {
+tc_d63f638c, TypeMAPPING> {
 let hasSideEffects = 1;
 let isPseudo = 1;
 let isCodeGenOnly = 1;
@@ -36927,7 +37495,7 @@ def Y2_dcfetchbo : HInst<
 (outs),
 (ins IntRegs:$Rs32, u11_3Imm:$Ii),
 "dcfetch($Rs32+#$Ii)",
-tc_4d9914c9, TypeLD>, Enc_2d829e {
+tc_9ca930f7, TypeLD>, Enc_2d829e {
 let Inst{13-11} = 0b000;
 let Inst{31-21} = 0b10010100000;
 let addrMode = BaseImmOffset;
@@ -36938,7 +37506,7 @@ def Y2_dcinva : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "dcinva($Rs32)",
-tc_00e7c26e, TypeST>, Enc_ecbcc8 {
+tc_b857bf4e, TypeST>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b10100000001;
 let isRestrictSlot1AOK = 1;
@@ -36948,7 +37516,7 @@ def Y2_dczeroa : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "dczeroa($Rs32)",
-tc_00e7c26e, TypeST>, Enc_ecbcc8 {
+tc_b857bf4e, TypeST>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b10100000110;
 let isRestrictSlot1AOK = 1;
@@ -36959,7 +37527,7 @@ def Y2_icinva : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "icinva($Rs32)",
-tc_999d32db, TypeJ>, Enc_ecbcc8 {
+tc_5d7f5414, TypeJ>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b01010110110;
 let isSolo = 1;
@@ -36968,7 +37536,7 @@ def Y2_isync : HInst<
 (outs),
 (ins),
 "isync",
-tc_b13761ae, TypeJ>, Enc_e3b0c4 {
+tc_8b121f4a, TypeJ>, Enc_e3b0c4 {
 let Inst{13-0} = 0b00000000000010;
 let Inst{31-16} = 0b0101011111000000;
 let isSolo = 1;
@@ -36977,16 +37545,25 @@ def Y2_syncht : HInst<
 (outs),
 (ins),
 "syncht",
-tc_367f7f3d, TypeST>, Enc_e3b0c4 {
+tc_8c99de45, TypeST>, Enc_e3b0c4 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-16} = 0b1010100001000000;
 let isSolo = 1;
 }
+def Y2_wait : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"wait($Rs32)",
+tc_174516e8, TypeCR>, Enc_ecbcc8, Requires<[HasV65]> {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01100100010;
+let isSolo = 1;
+}
 def Y4_l2fetch : HInst<
 (outs),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "l2fetch($Rs32,$Rt32)",
-tc_daa058fa, TypeST>, Enc_ca3887 {
+tc_fe211424, TypeST>, Enc_ca3887 {
 let Inst{7-0} = 0b00000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10100110000;
@@ -36998,7 +37575,7 @@ def Y4_trace : HInst<
 (outs),
 (ins IntRegs:$Rs32),
 "trace($Rs32)",
-tc_c82dc1ff, TypeCR>, Enc_ecbcc8 {
+tc_6b25e783, TypeCR>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b01100010010;
 let isSoloAX = 1;
@@ -37007,7 +37584,7 @@ def Y5_l2fetch : HInst<
 (outs),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "l2fetch($Rs32,$Rtt32)",
-tc_daa058fa, TypeST>, Enc_e6abcf {
+tc_fe211424, TypeST>, Enc_e6abcf {
 let Inst{7-0} = 0b00000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10100110100;
@@ -37019,7 +37596,7 @@ def dep_A2_addsat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = add($Rs32,$Rt32):sat:deprecated",
-tc_b44c6e2a, TypeALU64>, Enc_5ab2be {
+tc_779080bf, TypeALU64>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101100;
@@ -37032,7 +37609,7 @@ def dep_A2_subsat : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rt32, IntRegs:$Rs32),
 "$Rd32 = sub($Rt32,$Rs32):sat:deprecated",
-tc_b44c6e2a, TypeALU64>, Enc_bd6011 {
+tc_779080bf, TypeALU64>, Enc_bd6011 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010101100;
@@ -37045,7 +37622,7 @@ def dep_S2_packhl : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rdd32 = packhl($Rs32,$Rt32):deprecated",
-tc_540fdfbc, TypeALU64>, Enc_be32a5 {
+tc_946df596, TypeALU64>, Enc_be32a5 {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010100000;
diff --git a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
new file mode 100644
index 0000000000000000000000000000000000000000..2346fa5726264dfa393ebed107c098b4036dfd68
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -0,0 +1,3337 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Automatically generated file, please consult code owner before editing.
+//===----------------------------------------------------------------------===//
+
+
+// V5 Scalar Instructions.
+
+def: Pat<(int_hexagon_S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwh DoubleRegs:$src1),
+         (S2_vsatwh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysu_up IntRegs:$src1, IntRegs:$src2),
+         (M2_mpysu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpysc_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpysc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpysc_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpysc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2),
+         (M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2),
+         (F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1),
+         (A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
+         (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred:$src2),
+         (A4_combineri IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_notp DoubleRegs:$src1),
+         (A2_notp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+         (C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_brevp DoubleRegs:$src1),
+         (S2_brevp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl1 IntRegs:$src1),
+         (S2_cl1 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplte IntRegs:$src1, IntRegs:$src2),
+         (C4_cmplte IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2),
+         (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_interleave DoubleRegs:$src1),
+         (S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_abssat IntRegs:$src1),
+         (A2_abssat IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtu IntRegs:$src1, IntRegs:$src2),
+         (C2_cmpgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2),
+         (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2),
+         (A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2),
+         (C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1),
+         (F2_conv_df2uw_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpheq IntRegs:$src1, IntRegs:$src2),
+         (A4_cmpheq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3),
+         (S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2),
+         (S4_ntstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1),
+         (F2_conv_w2sf IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_not PredRegs:$src1),
+         (C2_not PredRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_tfrpr PredRegs:$src1),
+         (C2_tfrpr PredRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2),
+         (A4_cmpbgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2),
+         (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_orp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2),
+         (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2),
+         (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2),
+         (A4_cmpbgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2),
+         (A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2),
+         (A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2),
+         (A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2),
+         (S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2),
+         (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2),
+         (S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2),
+         (M4_pmpyw IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathb DoubleRegs:$src1),
+         (S2_vsathb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
+         (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_absp DoubleRegs:$src1),
+         (A2_absp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_all8 PredRegs:$src1),
+         (C2_all8 PredRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2),
+         (C2_bitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2),
+         (M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2),
+         (M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_boundscheck IntRegs:$src1, DoubleRegs:$src2),
+         (A4_boundscheck IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2),
+         (M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_fastcorner9 PredRegs:$src1, PredRegs:$src2),
+         (C4_fastcorner9 PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2),
+         (M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subsat IntRegs:$src1, IntRegs:$src2),
+         (A2_subsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r IntRegs:$src1, IntRegs:$src2),
+         (S2_asl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2),
+         (S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1),
+         (F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1),
+         (S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2),
+         (F2_sfadd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sub IntRegs:$src1, IntRegs:$src2),
+         (A2_sub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4),
+         (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2),
+         (S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2),
+         (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubhs IntRegs:$src1, IntRegs:$src2),
+         (A2_svsubhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1),
+         (F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2),
+         (C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1),
+         (A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2),
+         (S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2),
+         (S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2),
+         (S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clbp DoubleRegs:$src1),
+         (S2_clbp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_deinterleave DoubleRegs:$src1),
+         (S2_deinterleave DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_any8 PredRegs:$src1),
+         (C2_any8 PredRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2),
+         (S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1),
+         (F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1),
+         (S2_vsathb_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clrbit_r IntRegs:$src1, IntRegs:$src2),
+         (S2_clrbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+         (C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2),
+         (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2),
+         (M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred:$src1),
+         (A2_tfrsi s32_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2),
+         (A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2),
+         (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2),
+         (A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
+         (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_hh IntRegs:$src1, IntRegs:$src2),
+         (A2_combine_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_negsat IntRegs:$src1),
+         (A2_negsat IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_bitsplit IntRegs:$src1, IntRegs:$src2),
+         (A4_bitsplit IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabshsat DoubleRegs:$src1),
+         (A2_vabshsat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyui IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyui IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
+         (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupn IntRegs:$src1, IntRegs:$src2),
+         (F2_sffixupn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_xorp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_xorp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_tfrpcp DoubleRegs:$src1),
+         (A4_tfrpcp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_zxtb IntRegs:$src1),
+         (A2_zxtb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_zxth IntRegs:$src1),
+         (A2_zxth IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2),
+         (C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1),
+         (A2_sat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addsat IntRegs:$src1, IntRegs:$src2),
+         (A2_addsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2),
+         (A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2),
+         (C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_addp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+         (A4_addp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_xor PredRegs:$src1, PredRegs:$src2),
+         (C2_xor PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2ud_chop DoubleRegs:$src1),
+         (F2_conv_df2ud_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+         (C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2),
+         (S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathub DoubleRegs:$src1),
+         (S2_vsathub DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2sf DoubleRegs:$src1),
+         (F2_conv_df2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2),
+         (M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxth IntRegs:$src1),
+         (A2_sxth IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxtb IntRegs:$src1),
+         (A2_sxtb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+         (C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxtw IntRegs:$src1),
+         (A2_sxtw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl1p DoubleRegs:$src1),
+         (S2_cl1p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_andnp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A4_andnp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpeq IntRegs:$src1, IntRegs:$src2),
+         (F2_sfcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddb_map DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clbnorm IntRegs:$src1),
+         (S2_clbnorm IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3),
+         (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2),
+         (S2_tstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3),
+         (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2),
+         (S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2),
+         (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1),
+         (F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2),
+         (C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2),
+         (S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2),
+         (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1),
+         (S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3),
+         (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2),
+         (F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cmacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cmacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct1 IntRegs:$src1),
+         (S2_ct1 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct0 IntRegs:$src1),
+         (S2_ct0 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2),
+         (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1),
+         (F2_sffixupr IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2),
+         (C2_andn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2),
+         (M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
+         (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2),
+         (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2),
+         (A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2),
+         (A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2),
+         (A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2),
+         (A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineir s32_0ImmPred:$src1, IntRegs:$src2),
+         (A4_combineir s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+         (C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpeq IntRegs:$src1, IntRegs:$src2),
+         (A4_rcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2),
+         (M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1),
+         (S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2),
+         (S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combinew IntRegs:$src1, IntRegs:$src2),
+         (A2_combinew IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2),
+         (C4_nbitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_modwrapu IntRegs:$src1, IntRegs:$src2),
+         (A4_modwrapu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpneq IntRegs:$src1, IntRegs:$src2),
+         (A4_rcmpneq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred:$src1),
+         (F2_sfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred:$src1),
+         (F2_sfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2),
+         (M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1),
+         (F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2),
+         (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2),
+         (F2_sfcmpuo IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1),
+         (S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_minu IntRegs:$src1, IntRegs:$src2),
+         (A2_minu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_minp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_minp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
+         (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpge IntRegs:$src1, IntRegs:$src2),
+         (F2_sfcmpge IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmin IntRegs:$src1, IntRegs:$src2),
+         (F2_sfmin IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpgt IntRegs:$src1, IntRegs:$src2),
+         (F2_sfcmpgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vpmpyh IntRegs:$src1, IntRegs:$src2),
+         (M4_vpmpyh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_roundsat DoubleRegs:$src1),
+         (A2_roundsat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct1p DoubleRegs:$src1),
+         (S2_ct1p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2),
+         (S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2),
+         (C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_tfrcpp CtrRegs64:$src1),
+         (A4_tfrcpp CtrRegs64:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2),
+         (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2),
+         (A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgtu IntRegs:$src1, IntRegs:$src2),
+         (A4_cmphgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2),
+         (S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sath IntRegs:$src1),
+         (A2_sath IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_satb IntRegs:$src1),
+         (A2_satb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4),
+         (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2),
+         (S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2),
+         (S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_asrh IntRegs:$src1),
+         (A2_asrh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2),
+         (S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_or PredRegs:$src1, PredRegs:$src2),
+         (C2_or PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_xor IntRegs:$src1, IntRegs:$src2),
+         (A2_xor IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_add IntRegs:$src1, IntRegs:$src2),
+         (A2_add IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfinvsqrta IntRegs:$src1),
+         (F2_sfinvsqrta IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct0p DoubleRegs:$src1),
+         (S2_ct0p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svaddh IntRegs:$src1, IntRegs:$src2),
+         (A2_svaddh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vcrotate DoubleRegs:$src1, IntRegs:$src2),
+         (S2_vcrotate DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_aslh IntRegs:$src1),
+         (A2_aslh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2),
+         (M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2),
+         (S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplatrh IntRegs:$src1),
+         (S2_vsplatrh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r IntRegs:$src1, IntRegs:$src2),
+         (S2_asr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplatrb IntRegs:$src1),
+         (S2_vsplatrb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3),
+         (C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_pxfer_map PredRegs:$src1),
+         (C2_pxfer_map PredRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2),
+         (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3),
+         (M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
+         (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrcrr CtrRegs:$src1),
+         (A2_tfrcrr CtrRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
+         (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2),
+         (C2_orn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmpy IntRegs:$src1, IntRegs:$src2),
+         (F2_sfmpy IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2),
+         (S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2),
+         (S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_mask PredRegs:$src1),
+         (C2_mask PredRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2),
+         (A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrrcr IntRegs:$src1),
+         (A2_tfrrcr IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2),
+         (F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
+         (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2),
+         (A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1),
+         (S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2),
+         (C2_cmpgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2d_chop DoubleRegs:$src1),
+         (F2_conv_df2d_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1),
+         (F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2),
+         (F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3),
+         (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
+         (M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred:$src2),
+         (A2_addi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2),
+         (M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbpnorm DoubleRegs:$src1),
+         (S4_clbpnorm DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_rr_sat IntRegs:$src1, IntRegs:$src2),
+         (A4_round_rr_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2uw IntRegs:$src1),
+         (F2_conv_sf2uw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2ud IntRegs:$src1),
+         (F2_conv_sf2ud IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2uw_chop IntRegs:$src1),
+         (F2_conv_sf2uw_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2),
+         (S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwuh_nopack DoubleRegs:$src1),
+         (S2_vsatwuh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2),
+         (S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubuhs IntRegs:$src1, IntRegs:$src2),
+         (A2_svsubuhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmpybsu IntRegs:$src1, IntRegs:$src2),
+         (M5_vmpybsu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+         (C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2),
+         (S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r IntRegs:$src1, IntRegs:$src2),
+         (S2_lsr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_subp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+         (A4_subp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_vitpack PredRegs:$src1, PredRegs:$src2),
+         (C2_vitpack PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+         (S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsclr IntRegs:$src1, IntRegs:$src2),
+         (C4_nbitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2),
+         (F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2),
+         (C2_and PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1),
+         (S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3),
+         (S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1),
+         (S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2),
+         (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpneq IntRegs:$src1, IntRegs:$src2),
+         (C4_cmpneq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clb IntRegs:$src1),
+         (S2_clb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2),
+         (A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2),
+         (S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1),
+         (F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred:$src1),
+         (F2_dfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2),
+         (A4_cmphgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred:$src1),
+         (F2_dfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3),
+         (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3),
+         (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cnacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cnacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2),
+         (A2_maxu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred:$src2),
+         (A2_andir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfrecipa IntRegs:$src1, IntRegs:$src2),
+         (F2_sfrecipa IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2),
+         (A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2),
+         (A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2),
+         (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2),
+         (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2),
+         (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2),
+         (S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_or IntRegs:$src1, IntRegs:$src2),
+         (A2_or IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2),
+         (F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeq IntRegs:$src1, IntRegs:$src2),
+         (C2_cmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrp DoubleRegs:$src1),
+         (A2_tfrp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+         (C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathub_nopack DoubleRegs:$src1),
+         (S2_vsathub_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_satuh IntRegs:$src1),
+         (A2_satuh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_satub IntRegs:$src1),
+         (A2_satub IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2),
+         (M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
+         (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2),
+         (C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2),
+         (A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred:$src2),
+         (A2_tfril IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
+         (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1),
+         (S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1),
+         (A2_vabsw DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1),
+         (A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2),
+         (F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3),
+         (C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
+         (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_swiz IntRegs:$src1),
+         (A2_swiz IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2),
+         (A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2w_chop DoubleRegs:$src1),
+         (F2_conv_df2w_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_and IntRegs:$src1, IntRegs:$src2),
+         (A2_and IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3),
+         (S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1),
+         (F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfr IntRegs:$src1),
+         (A2_tfr IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subri s32_0ImmPred:$src1, IntRegs:$src2),
+         (A2_subri s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2),
+         (M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2),
+         (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_brev IntRegs:$src1),
+         (S2_brev IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2),
+         (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
+         (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1),
+         (S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4),
+         (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1),
+         (F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred:$src2),
+         (A2_orir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2),
+         (A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2),
+         (M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2),
+         (S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2df IntRegs:$src1),
+         (F2_conv_sf2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunohb DoubleRegs:$src1),
+         (S2_vtrunohb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2d_chop IntRegs:$src1),
+         (F2_conv_sf2d_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1),
+         (F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2),
+         (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1),
+         (F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svadduhs IntRegs:$src1, IntRegs:$src2),
+         (A2_svadduhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2w_chop IntRegs:$src1),
+         (F2_conv_sf2w_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_svsathub IntRegs:$src1),
+         (S2_svsathub IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_setbit_r IntRegs:$src1, IntRegs:$src2),
+         (S2_setbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4),
+         (F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2),
+         (F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1),
+         (F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1),
+         (F2_conv_df2uw DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2),
+         (C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2),
+         (C4_cmplteu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_minup DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_minup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+         (S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_lh IntRegs:$src1, IntRegs:$src2),
+         (A2_combine_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_combine_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2),
+         (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2),
+         (A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vminw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vminh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminb DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vminb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_lsli s6_0ImmPred:$src1, IntRegs:$src2),
+         (S4_lsli s6_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2),
+         (S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_tfrrp IntRegs:$src1),
+         (C2_tfrrp IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_abs IntRegs:$src1),
+         (A2_abs IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2),
+         (A4_cmpbeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1),
+         (A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwuh DoubleRegs:$src1),
+         (S2_vsatwuh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2),
+         (F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1),
+         (S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2),
+         (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2),
+         (A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2),
+         (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2),
+         (A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2),
+         (S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3),
+         (S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3),
+         (S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2),
+         (A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2),
+         (S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1),
+         (S2_vsxtbh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_subp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+         (M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_parity IntRegs:$src1, IntRegs:$src2),
+         (S4_parity IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2),
+         (A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2ud_chop IntRegs:$src1),
+         (F2_conv_sf2ud_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+         (S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2),
+         (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2),
+         (S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2),
+         (A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
+         (M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrndpackwh DoubleRegs:$src1),
+         (S2_vrndpackwh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2),
+         (A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2),
+         (C2_bitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2),
+         (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2),
+         (A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+         (C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
+         (A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2),
+         (M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2),
+         (M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2),
+         (C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2),
+         (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+
+// V55 Scalar Instructions.
+
+def: Pat<(int_hexagon_A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+         (A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV55]>;
+
+// V60 Scalar Instructions.
+
+def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
+         (S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2),
+         (S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
+         (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
+         (S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
+
+// V62 Scalar Instructions.
+
+def: Pat<(int_hexagon_S6_vtrunehb_ppp DoubleRegs:$src1, DoubleRegs:$src2),
+         (S6_vtrunehb_ppp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_V6_ldntnt0 IntRegs:$src1),
+         (V6_ldntnt0 IntRegs:$src1)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_M6_vabsdiffub DoubleRegs:$src1, DoubleRegs:$src2),
+         (M6_vabsdiffub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_S6_vtrunohb_ppp DoubleRegs:$src1, DoubleRegs:$src2),
+         (S6_vtrunohb_ppp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_M6_vabsdiffb DoubleRegs:$src1, DoubleRegs:$src2),
+         (M6_vabsdiffb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_A6_vminub_RdP DoubleRegs:$src1, DoubleRegs:$src2),
+         (A6_vminub_RdP DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_S6_vsplatrbp IntRegs:$src1),
+         (S6_vsplatrbp IntRegs:$src1)>, Requires<[HasV62]>;
+
+// V65 Scalar Instructions.
+
+def: Pat<(int_hexagon_A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2),
+         (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV65]>;
+
+// V66 Scalar Instructions.
+
+def: Pat<(int_hexagon_F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2),
+         (F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
+def: Pat<(int_hexagon_F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2),
+         (F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
+def: Pat<(int_hexagon_M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+         (M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV66]>;
+def: Pat<(int_hexagon_S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2),
+         (S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2)>, Requires<[HasV66]>;
+
+// V60 HVX Instructions.
+
+def: Pat<(int_hexagon_V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminub HvxVR:$src1, HvxVR:$src2),
+         (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhvsrs_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsathub HvxVR:$src1, HvxVR:$src2),
+         (V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsathub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
+         (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
+         (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2),
+         (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwv HvxVR:$src1, HvxVR:$src2),
+         (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2),
+         (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsuisat_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
+         (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
+         (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2),
+         (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgh HvxVR:$src1, HvxVR:$src2),
+         (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgub HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubb HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgubrnd HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgubrnd_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusv HvxVR:$src1, HvxVR:$src2),
+         (V6_vrmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vrmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundhb HvxVR:$src1, HvxVR:$src2),
+         (V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundhb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmux_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhus HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhus_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackeb HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackeb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavghrnd HvxVR:$src1, HvxVR:$src2),
+         (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavghrnd_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtran2x2_map_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdelta HvxVR:$src1, HvxVR:$src2),
+         (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdelta_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb HvxWR:$src1, IntRegs:$src2),
+         (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackob HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackob_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxh HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_or HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_or_128B HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lo HvxWR:$src1),
+         (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lo_128B HvxWR:$src1),
+         (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubb_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubb_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldu0 IntRegs:$src1),
+         (V6_ldu0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldu0_128B IntRegs:$src1),
+         (V6_ldu0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgh HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlalignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsh HvxVR:$src1),
+         (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsh_128B HvxVR:$src1),
+         (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_and_n HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_and_n_128B HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsb HvxVR:$src1),
+         (V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsb_128B HvxVR:$src1),
+         (V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundwuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundwuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhv HvxVR:$src1, HvxVR:$src2),
+         (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffh HvxVR:$src1),
+         (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffh_128B HvxVR:$src1),
+         (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgub HvxVR:$src1, HvxVR:$src2),
+         (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybv HvxVR:$src1, HvxVR:$src2),
+         (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnormamth HvxVR:$src1),
+         (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnormamth_128B HvxVR:$src1),
+         (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb HvxVR:$src1, IntRegs:$src2),
+         (V6_vdmpyhb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vdmpyhb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguh HvxVR:$src1, HvxVR:$src2),
+         (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrwv HvxVR:$src1, HvxVR:$src2),
+         (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrwv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrhv HvxVR:$src1, HvxVR:$src2),
+         (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrhv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2),
+         (V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhisat_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddw HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vzh HvxVR:$src1),
+         (V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vzh_128B HvxVR:$src1),
+         (V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddh HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxub HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoeh HvxVR:$src1, HvxVR:$src2),
+         (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoeh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh HvxVR:$src1, HvxVR:$src2),
+         (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabuuv HvxWR:$src1, HvxWR:$src2),
+         (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabuuv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vror HvxVR:$src1, IntRegs:$src2),
+         (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vror_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsh_sat HvxVR:$src1),
+         (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsh_sat_128B HvxVR:$src1),
+         (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_or_n HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_or_n_128B HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealb HvxVR:$src1),
+         (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealb_128B HvxVR:$src1),
+         (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybusv HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vzb HvxVR:$src1),
+         (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vzb_128B HvxVR:$src1),
+         (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2),
+         (V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_dv_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddb HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoeb HvxVR:$src1, HvxVR:$src2),
+         (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoeb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackhub_sat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyb HvxWR:$src1, IntRegs:$src2),
+         (V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyb_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabusv HvxWR:$src1, HvxWR:$src2),
+         (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabusv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_and HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_and_128B HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackwuh_sat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vswap_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslw HvxVR:$src1, IntRegs:$src2),
+         (V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslw_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackhb_sat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyih_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddb_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddb_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackub HvxVR:$src1),
+         (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackub_128B HvxVR:$src1),
+         (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyieoh HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyieoh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_extractw HvxVR:$src1, IntRegs:$src2),
+         (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_extractw_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgwrnd HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgwrnd_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyub HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyub_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuh HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackob HvxWR:$src1, HvxVR:$src2),
+         (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackob_128B HvxWR:$src1, HvxVR:$src2),
+         (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpahb HvxWR:$src1, IntRegs:$src2),
+         (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpahb_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandqrt HvxQR:$src1, IntRegs:$src2),
+         (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandqrt_128B HvxQR:$src1, IntRegs:$src2),
+         (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vxor HvxVR:$src1, HvxVR:$src2),
+         (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vxor_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhsat_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybus_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhw HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealb4w HvxVR:$src1, HvxVR:$src2),
+         (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealb4w_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybv HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffh HvxVR:$src1, HvxVR:$src2),
+         (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffob HvxVR:$src1, HvxVR:$src2),
+         (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffob_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyub_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnormamtw HvxVR:$src1),
+         (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnormamtw_128B HvxVR:$src1),
+         (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackuh HvxVR:$src1),
+         (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackuh_128B HvxVR:$src1),
+         (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackoh HvxWR:$src1, HvxVR:$src2),
+         (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackoh_128B HvxWR:$src1, HvxVR:$src2),
+         (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2),
+         (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyubv HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhss HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhss_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_hi HvxWR:$src1),
+         (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_hi_128B HvxWR:$src1),
+         (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw HvxVR:$src1, HvxVR:$src2),
+         (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdsaduh HvxWR:$src1, IntRegs:$src2),
+         (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubw HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubw_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubw_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyih HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyih_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybus HvxVR:$src1, IntRegs:$src2),
+         (V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybus_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybus_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
+         (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
+         (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1),
+         (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1),
+         (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrw HvxVR:$src1, IntRegs:$src2),
+         (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrw_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsh HvxVR:$src1),
+         (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsh_128B HvxVR:$src1),
+         (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrh HvxVR:$src1, IntRegs:$src2),
+         (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrh_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_valignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackoh HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackoh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcombine HvxVR:$src1, HvxVR:$src2),
+         (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcombine_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+         (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+         (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslhv HvxVR:$src1, HvxVR:$src2),
+         (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslhv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vinsertwr HvxVR:$src1, IntRegs:$src2),
+         (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vinsertwr_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubh_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubh_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffb HvxVR:$src1),
+         (V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffb_128B HvxVR:$src1),
+         (V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vand HvxVR:$src1, HvxVR:$src2),
+         (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vand_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhv HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyub HvxVR:$src1, IntRegs:$src2),
+         (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcl0h HvxVR:$src1),
+         (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcl0h_128B HvxVR:$src1),
+         (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhus_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
+         (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
+         (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufeh HvxVR:$src1, HvxVR:$src2),
+         (V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufeh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhsrs_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubh HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ld0 IntRegs:$src1),
+         (V6_ld0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ld0_128B IntRegs:$src1),
+         (V6_ld0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpopcounth HvxVR:$src1),
+         (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpopcounth_128B HvxVR:$src1),
+         (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldnt0 IntRegs:$src1),
+         (V6_ldnt0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldnt0_128B IntRegs:$src1),
+         (V6_ldnt0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackeh HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackeh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyh HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyh_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminh HvxVR:$src1, HvxVR:$src2),
+         (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2 IntRegs:$src1),
+         (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2_128B IntRegs:$src1),
+         (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealh HvxVR:$src1),
+         (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealh_128B HvxVR:$src1),
+         (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackwh_sat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslh HvxVR:$src1, IntRegs:$src2),
+         (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslh_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vor HvxVR:$src1, HvxVR:$src2),
+         (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vor_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiowh HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiowh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+         (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+         (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvrt HvxVR:$src1, IntRegs:$src2),
+         (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvrt_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhw HvxVR:$src1, HvxVR:$src2),
+         (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcl0w HvxVR:$src1),
+         (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcl0w_128B HvxVR:$src1),
+         (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyihb HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyihb_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpybus HvxWR:$src1, IntRegs:$src2),
+         (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vd0 ),
+         (V6_vd0 )>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vd0_128B ),
+         (V6_vd0 )>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpybus HvxVR:$src1, IntRegs:$src2),
+         (V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybus HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybus_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvrt_acc_128B HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vassign HvxVR:$src1),
+         (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vassign_128B HvxVR:$src1),
+         (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2),
+         (V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_dv_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackb HvxVR:$src1),
+         (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackb_128B HvxVR:$src1),
+         (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackh HvxVR:$src1),
+         (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackh_128B HvxVR:$src1),
+         (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpahb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+         (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+         (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsatwh HvxVR:$src1, HvxVR:$src2),
+         (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsatwh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyihb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrdelta HvxVR:$src1, HvxVR:$src2),
+         (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrdelta_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundwh HvxVR:$src1, HvxVR:$src2),
+         (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundwh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddw_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddw_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+         (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+         (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffub HvxVR:$src1, HvxVR:$src2),
+         (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffeb HvxVR:$src1, HvxVR:$src2),
+         (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffeb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth HvxVR:$src1, HvxVR:$src2),
+         (V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnot HvxVR:$src1),
+         (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnot_128B HvxVR:$src1),
+         (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxw HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslwv HvxVR:$src1, HvxVR:$src2),
+         (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslwv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsw_sat HvxVR:$src1),
+         (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsw_sat_128B HvxVR:$src1),
+         (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundhub HvxVR:$src1, HvxVR:$src2),
+         (V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundhub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabus HvxWR:$src1, IntRegs:$src2),
+         (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabus_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vassignp HvxWR:$src1),
+         (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vassignp_128B HvxWR:$src1),
+         (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb HvxVR:$src1, HvxVR:$src2),
+         (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububh HvxVR:$src1, HvxVR:$src2),
+         (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lvsplatw IntRegs:$src1),
+         (V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lvsplatw_128B IntRegs:$src1),
+         (V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2),
+         (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_not HvxQR:$src1),
+         (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_not_128B HvxQR:$src1),
+         (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+         (V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+         (V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgw HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhw HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv HvxVR:$src1, HvxVR:$src2),
+         (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
+         (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
+         (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2),
+         (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_xor HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_xor_128B HvxQR:$src1, HvxQR:$src2),
+         (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguhrnd HvxVR:$src1, HvxVR:$src2),
+         (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguhrnd_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
+         (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
+         (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2),
+         (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrh HvxVR:$src1, IntRegs:$src2),
+         (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrh_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhbrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffw HvxVR:$src1, HvxVR:$src2),
+         (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+
+// V62 HVX Instructions.
+
+def: Pat<(int_hexagon_V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+         (V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandnqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+         (V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddclbh HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddclbh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_64_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh_64_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsatuwuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsatuwuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_shuffeqh HvxQR:$src1, HvxQR:$src2),
+         (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_shuffeqh_128B HvxQR:$src1, HvxQR:$src2),
+         (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_shuffeqw HvxQR:$src1, HvxQR:$src2),
+         (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_shuffeqw_128B HvxQR:$src1, HvxQR:$src2),
+         (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldcnpnt0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+         (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+         (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhbsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminb HvxVR:$src1, HvxVR:$src2),
+         (V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrb HvxVR:$src1, IntRegs:$src2),
+         (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrb_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+         (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+         (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldtp0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldtp0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4),
+         (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4),
+         (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldpnt0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldpnt0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvnqv HvxQR:$src1, HvxVR:$src2),
+         (V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvnqv_128B HvxQR:$src1, HvxVR:$src2),
+         (V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lvsplatb IntRegs:$src1),
+         (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lvsplatb_128B IntRegs:$src1),
+         (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lvsplath IntRegs:$src1),
+         (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lvsplath_128B IntRegs:$src1),
+         (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldtpnt0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldnpnt0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpauhb HvxWR:$src1, IntRegs:$src2),
+         (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldtnp0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldtnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldtnp0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldtnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrounduhub HvxVR:$src1, HvxVR:$src2),
+         (V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrounduhub_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldcp0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldcp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldcp0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldcp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduwsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduwsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldtnpnt0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandnqrt HvxQR:$src1, IntRegs:$src2),
+         (V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandnqrt_128B HvxQR:$src1, IntRegs:$src2),
+         (V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxb HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvqv HvxQR:$src1, HvxVR:$src2),
+         (V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvqv_128B HvxQR:$src1, HvxVR:$src2),
+         (V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+         (V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+         (V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+         (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+         (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldnp0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldnp0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrounduwuh HvxVR:$src1, HvxVR:$src2),
+         (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrounduwuh_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2v2 IntRegs:$src1),
+         (V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2v2_128B IntRegs:$src1),
+         (V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldp0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldp0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubh_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddclbw HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddclbw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldcpnt0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2),
+         (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+         (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubububb_sat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_ldcnp0 PredRegs:$src1, IntRegs:$src2),
+         (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_ldcnp0_128B PredRegs:$src1, IntRegs:$src2),
+         (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4),
+         (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4),
+         (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+
+// V65 HVX Instructions.
+
+def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2),
+         (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_128B HvxVR:$src1, DoubleRegs:$src2),
+         (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2),
+         (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2),
+         (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2),
+         (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2),
+         (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2),
+         (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+         (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2),
+         (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_128B HvxVR:$src1, DoubleRegs:$src2),
+         (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2),
+         (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+         (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+         (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1),
+         (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1),
+         (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1),
+         (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1),
+         (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1),
+         (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1),
+         (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1),
+         (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1),
+         (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdd0 ),
+         (V6_vdd0 )>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdd0_128B ),
+         (V6_vdd0 )>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2),
+         (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2),
+         (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1),
+         (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1),
+         (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+
+// V66 HVX Instructions.
+
+def: Pat<(int_hexagon_V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+         (V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddcarrysat_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+         (V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasr_into_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsatdw HvxVR:$src1, HvxVR:$src2),
+         (V6_vsatdw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsatdw_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vsatdw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrotr HvxVR:$src1, HvxVR:$src2),
+         (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrotr_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>;
diff --git a/lib/Target/Hexagon/HexagonDepMappings.td b/lib/Target/Hexagon/HexagonDepMappings.td
index 03c504ff0b084ec781049ccdd1cf25ad22b3b1d0..b3132d41b903d607a0111a74781cda51aa9016e4 100644
--- a/lib/Target/Hexagon/HexagonDepMappings.td
+++ b/lib/Target/Hexagon/HexagonDepMappings.td
@@ -1,4 +1,4 @@
-//===- HexagonDepMappings.td ----------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,7 +9,6 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
 def A2_negAlias : InstAlias<"$Rd32 = neg($Rs32)", (A2_subri IntRegs:$Rd32, 0, IntRegs:$Rs32)>;
 def A2_notAlias : InstAlias<"$Rd32 = not($Rs32)", (A2_subri IntRegs:$Rd32, -1, IntRegs:$Rs32)>;
 def A2_tfrfAlias : InstAlias<"if (!$Pu4) $Rd32 = $Rs32", (A2_paddif IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
@@ -252,6 +251,7 @@ def V6_vaslhv_altAlias : InstAlias<"$Vd32 = vaslh($Vu32,$Vv32)", (V6_vaslhv HvxV
 def V6_vaslw_acc_altAlias : InstAlias<"$Vx32 += vaslw($Vu32,$Rt32)", (V6_vaslw_acc HvxVR:$Vx32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
 def V6_vaslw_altAlias : InstAlias<"$Vd32 = vaslw($Vu32,$Rt32)", (V6_vaslw HvxVR:$Vd32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
 def V6_vaslwv_altAlias : InstAlias<"$Vd32 = vaslw($Vu32,$Vv32)", (V6_vaslwv HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasr_into_altAlias : InstAlias<"$Vxx32 = vasrinto($Vu32,$Vv32)", (V6_vasr_into HvxWR:$Vxx32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
 def V6_vasrh_acc_altAlias : InstAlias<"$Vx32 += vasrh($Vu32,$Rt32)", (V6_vasrh_acc HvxVR:$Vx32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
 def V6_vasrh_altAlias : InstAlias<"$Vd32 = vasrh($Vu32,$Rt32)", (V6_vasrh HvxVR:$Vd32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
 def V6_vasrhbrndsat_altAlias : InstAlias<"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhbrndsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>;
@@ -402,6 +402,7 @@ def V6_vrmpyubi_acc_altAlias : InstAlias<"$Vxx32 += vrmpyub($Vuu32,$Rt32,#$Ii)",
 def V6_vrmpyubi_altAlias : InstAlias<"$Vdd32 = vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi HvxWR:$Vdd32, HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
 def V6_vrmpyubv_acc_altAlias : InstAlias<"$Vx32 += vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv_acc HvxVR:$Vx32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
 def V6_vrmpyubv_altAlias : InstAlias<"$Vd32 = vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrotr_altAlias : InstAlias<"$Vd32 = vrotr($Vu32,$Vv32)", (V6_vrotr HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
 def V6_vroundhb_altAlias : InstAlias<"$Vd32 = vroundhb($Vu32,$Vv32):sat", (V6_vroundhb HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
 def V6_vroundhub_altAlias : InstAlias<"$Vd32 = vroundhub($Vu32,$Vv32):sat", (V6_vroundhub HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
 def V6_vrounduhub_altAlias : InstAlias<"$Vd32 = vrounduhub($Vu32,$Vv32):sat", (V6_vrounduhub HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
@@ -473,4 +474,6 @@ def V6_vunpackub_altAlias : InstAlias<"$Vdd32 = vunpackub($Vu32)", (V6_vunpackub
 def V6_vunpackuh_altAlias : InstAlias<"$Vdd32 = vunpackuh($Vu32)", (V6_vunpackuh HvxWR:$Vdd32, HvxVR:$Vu32)>, Requires<[UseHVX]>;
 def V6_vzb_altAlias : InstAlias<"$Vdd32 = vzxtb($Vu32)", (V6_vzb HvxWR:$Vdd32, HvxVR:$Vu32)>, Requires<[UseHVX]>;
 def V6_vzh_altAlias : InstAlias<"$Vdd32 = vzxth($Vu32)", (V6_vzh HvxWR:$Vdd32, HvxVR:$Vu32)>, Requires<[UseHVX]>;
+def V6_zld0Alias : InstAlias<"z = vmem($Rt32)", (V6_zLd_ai IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_zldp0Alias : InstAlias<"if ($Pv4) z = vmem($Rt32)", (V6_zLd_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
 def Y2_dcfetchAlias : InstAlias<"dcfetch($Rs32)", (Y2_dcfetchbo IntRegs:$Rs32, 0)>;
diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td
index 9d960953f8f529627be4ebcf5b7dbef2e2e52736..ef2d4fa4570218fd8e5b5710366fcaf446ee1fe3 100644
--- a/lib/Target/Hexagon/HexagonDepOperands.td
+++ b/lib/Target/Hexagon/HexagonDepOperands.td
@@ -1,4 +1,4 @@
-//===- HexagonDepOperands.td ----------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,16 +9,12 @@
 // Automatically generated file, please consult code owner before editing.
 //===----------------------------------------------------------------------===//
 
-
 def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
 def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
 def s4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
 def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
 def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
 def s29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
-def s10_6ImmOperand : AsmOperandClass { let Name = "s10_6Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s10_6Imm : Operand<i32> { let ParserMatchClass = s10_6ImmOperand; let DecoderMethod = "s10_6ImmDecoder"; }
-def s10_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 6>(N->getSExtValue());}]>;
 def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
 def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
 def u6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
@@ -130,6 +126,3 @@ def u8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<8, 0>(N->getSExtVal
 def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
 def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
 def u30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
-def s10_0ImmOperand : AsmOperandClass { let Name = "s10_0Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s10_0Imm : Operand<i32> { let ParserMatchClass = s10_0ImmOperand; let DecoderMethod = "s10_0ImmDecoder"; }
-def s10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 0>(N->getSExtValue());}]>;
diff --git a/lib/Target/Hexagon/HexagonDepTimingClasses.h b/lib/Target/Hexagon/HexagonDepTimingClasses.h
index 656c83f2d0c4ef0a1d1eb74d96d7b8377bcf1b6c..0fd55e8b7997a970e2f0bd9d3d9509558040b24b 100644
--- a/lib/Target/Hexagon/HexagonDepTimingClasses.h
+++ b/lib/Target/Hexagon/HexagonDepTimingClasses.h
@@ -1,4 +1,4 @@
-//===- HexagonDepTimingClasses.h ------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,7 +10,6 @@
 //===----------------------------------------------------------------------===//
 
 
-
 #ifndef TARGET_HEXAGON_HEXAGON_DEP_TIMING_CLASSES_H
 #define TARGET_HEXAGON_HEXAGON_DEP_TIMING_CLASSES_H
 
@@ -20,19 +19,25 @@ namespace llvm {
 
 inline bool is_TC3x(unsigned SchedClass) {
   switch (SchedClass) {
-  case Hexagon::Sched::tc_16d0d8d5:
-  case Hexagon::Sched::tc_1853ea6d:
-  case Hexagon::Sched::tc_60571023:
-  case Hexagon::Sched::tc_7934b9df:
-  case Hexagon::Sched::tc_8fd5f294:
-  case Hexagon::Sched::tc_b9c0b731:
-  case Hexagon::Sched::tc_bcc96cee:
-  case Hexagon::Sched::tc_c6ce9b3f:
-  case Hexagon::Sched::tc_c6ebf8dd:
-  case Hexagon::Sched::tc_c82dc1ff:
-  case Hexagon::Sched::tc_caaebcba:
-  case Hexagon::Sched::tc_cf59f215:
-  case Hexagon::Sched::tc_e913dc32:
+  case Hexagon::Sched::tc_05d3a09b:
+  case Hexagon::Sched::tc_0d8f5752:
+  case Hexagon::Sched::tc_13bfbcf9:
+  case Hexagon::Sched::tc_174516e8:
+  case Hexagon::Sched::tc_1a2fd869:
+  case Hexagon::Sched::tc_1c4528a2:
+  case Hexagon::Sched::tc_32779c6f:
+  case Hexagon::Sched::tc_5b54b33f:
+  case Hexagon::Sched::tc_6b25e783:
+  case Hexagon::Sched::tc_76851da1:
+  case Hexagon::Sched::tc_9debc299:
+  case Hexagon::Sched::tc_a9d88b22:
+  case Hexagon::Sched::tc_bafaade3:
+  case Hexagon::Sched::tc_bcf98408:
+  case Hexagon::Sched::tc_bdceeac1:
+  case Hexagon::Sched::tc_c8ce0b5c:
+  case Hexagon::Sched::tc_d1aa9eaa:
+  case Hexagon::Sched::tc_d773585a:
+  case Hexagon::Sched::tc_df3319ed:
     return true;
   default:
     return false;
@@ -41,8 +46,8 @@ inline bool is_TC3x(unsigned SchedClass) {
 
 inline bool is_TC2early(unsigned SchedClass) {
   switch (SchedClass) {
-  case Hexagon::Sched::tc_14cd4cfa:
-  case Hexagon::Sched::tc_2a160009:
+  case Hexagon::Sched::tc_b4407292:
+  case Hexagon::Sched::tc_fc3999b4:
     return true;
   default:
     return false;
@@ -51,12 +56,13 @@ inline bool is_TC2early(unsigned SchedClass) {
 
 inline bool is_TC4x(unsigned SchedClass) {
   switch (SchedClass) {
-  case Hexagon::Sched::tc_038a1342:
-  case Hexagon::Sched::tc_4d99bca9:
-  case Hexagon::Sched::tc_6792d5ff:
-  case Hexagon::Sched::tc_9c00ce8d:
-  case Hexagon::Sched::tc_d580173f:
-  case Hexagon::Sched::tc_f3eaa14b:
+  case Hexagon::Sched::tc_2f7c551d:
+  case Hexagon::Sched::tc_2ff964b4:
+  case Hexagon::Sched::tc_3a867367:
+  case Hexagon::Sched::tc_3b470976:
+  case Hexagon::Sched::tc_4560740b:
+  case Hexagon::Sched::tc_a58fd5cc:
+  case Hexagon::Sched::tc_b8bffe55:
     return true;
   default:
     return false;
@@ -65,23 +71,27 @@ inline bool is_TC4x(unsigned SchedClass) {
 
 inline bool is_TC2(unsigned SchedClass) {
   switch (SchedClass) {
-  case Hexagon::Sched::tc_00afc57e:
-  case Hexagon::Sched::tc_1b9c9ee5:
-  case Hexagon::Sched::tc_234a11a5:
-  case Hexagon::Sched::tc_2b6f77c6:
-  case Hexagon::Sched::tc_41d5298e:
-  case Hexagon::Sched::tc_5ba5997d:
-  case Hexagon::Sched::tc_84df2cd3:
-  case Hexagon::Sched::tc_87735c3b:
-  case Hexagon::Sched::tc_897d1a9d:
-  case Hexagon::Sched::tc_976ddc4f:
-  case Hexagon::Sched::tc_b44c6e2a:
-  case Hexagon::Sched::tc_b9c4623f:
-  case Hexagon::Sched::tc_c2f7d806:
-  case Hexagon::Sched::tc_c74f796f:
-  case Hexagon::Sched::tc_d088982c:
-  case Hexagon::Sched::tc_ef84f62f:
-  case Hexagon::Sched::tc_f49e76f4:
+  case Hexagon::Sched::tc_002cb246:
+  case Hexagon::Sched::tc_14b5c689:
+  case Hexagon::Sched::tc_1c80410a:
+  case Hexagon::Sched::tc_4414d8b1:
+  case Hexagon::Sched::tc_6132ba3d:
+  case Hexagon::Sched::tc_61830035:
+  case Hexagon::Sched::tc_679309b8:
+  case Hexagon::Sched::tc_703e822c:
+  case Hexagon::Sched::tc_779080bf:
+  case Hexagon::Sched::tc_784490da:
+  case Hexagon::Sched::tc_88b4f13d:
+  case Hexagon::Sched::tc_9461ff31:
+  case Hexagon::Sched::tc_9e313203:
+  case Hexagon::Sched::tc_a813cf9a:
+  case Hexagon::Sched::tc_bfec0f01:
+  case Hexagon::Sched::tc_cf8126ae:
+  case Hexagon::Sched::tc_d08ee0f4:
+  case Hexagon::Sched::tc_e4a7f9f0:
+  case Hexagon::Sched::tc_f429765c:
+  case Hexagon::Sched::tc_f675fee8:
+  case Hexagon::Sched::tc_f9058dd7:
     return true;
   default:
     return false;
@@ -90,45 +100,43 @@ inline bool is_TC2(unsigned SchedClass) {
 
 inline bool is_TC1(unsigned SchedClass) {
   switch (SchedClass) {
-  case Hexagon::Sched::tc_181af5d0:
-  case Hexagon::Sched::tc_1b82a277:
-  case Hexagon::Sched::tc_1e856f58:
-  case Hexagon::Sched::tc_351fed2d:
-  case Hexagon::Sched::tc_3669266a:
-  case Hexagon::Sched::tc_3cb8ea06:
-  case Hexagon::Sched::tc_452f85af:
-  case Hexagon::Sched::tc_481e5e5c:
-  case Hexagon::Sched::tc_49eb22c8:
-  case Hexagon::Sched::tc_523fcf30:
-  case Hexagon::Sched::tc_52d7bbea:
-  case Hexagon::Sched::tc_53bc8a6a:
-  case Hexagon::Sched::tc_540fdfbc:
-  case Hexagon::Sched::tc_55050d58:
-  case Hexagon::Sched::tc_609d2efe:
-  case Hexagon::Sched::tc_68cb12ce:
-  case Hexagon::Sched::tc_6ebb4a12:
-  case Hexagon::Sched::tc_6efc556e:
-  case Hexagon::Sched::tc_73043bf4:
-  case Hexagon::Sched::tc_7a830544:
-  case Hexagon::Sched::tc_855b0b61:
-  case Hexagon::Sched::tc_8fe6b782:
-  case Hexagon::Sched::tc_90f3e30c:
-  case Hexagon::Sched::tc_97743097:
-  case Hexagon::Sched::tc_99be14ca:
-  case Hexagon::Sched::tc_9faf76ae:
-  case Hexagon::Sched::tc_a46f0df5:
-  case Hexagon::Sched::tc_a904d137:
-  case Hexagon::Sched::tc_b9488031:
-  case Hexagon::Sched::tc_be706f30:
-  case Hexagon::Sched::tc_c6aa82f7:
-  case Hexagon::Sched::tc_cde8b071:
-  case Hexagon::Sched::tc_d6bf0472:
-  case Hexagon::Sched::tc_dbdffe3d:
-  case Hexagon::Sched::tc_e0739b8c:
-  case Hexagon::Sched::tc_e1e99bfa:
-  case Hexagon::Sched::tc_e9fae2d6:
-  case Hexagon::Sched::tc_f2704b9a:
-  case Hexagon::Sched::tc_f8eeed7a:
+  case Hexagon::Sched::tc_0663f615:
+  case Hexagon::Sched::tc_0a705168:
+  case Hexagon::Sched::tc_0ae0825c:
+  case Hexagon::Sched::tc_1b6f7cec:
+  case Hexagon::Sched::tc_1fc97744:
+  case Hexagon::Sched::tc_20cdee80:
+  case Hexagon::Sched::tc_2332b92e:
+  case Hexagon::Sched::tc_2eabeebe:
+  case Hexagon::Sched::tc_3d495a39:
+  case Hexagon::Sched::tc_4c5ba658:
+  case Hexagon::Sched::tc_56336eb0:
+  case Hexagon::Sched::tc_56f114f4:
+  case Hexagon::Sched::tc_57890846:
+  case Hexagon::Sched::tc_5a2711e5:
+  case Hexagon::Sched::tc_5b7c0967:
+  case Hexagon::Sched::tc_640086b5:
+  case Hexagon::Sched::tc_643b4717:
+  case Hexagon::Sched::tc_85c9c08f:
+  case Hexagon::Sched::tc_85d5d03f:
+  case Hexagon::Sched::tc_862b3e70:
+  case Hexagon::Sched::tc_946df596:
+  case Hexagon::Sched::tc_9c3ecd83:
+  case Hexagon::Sched::tc_9fc3dae0:
+  case Hexagon::Sched::tc_a1123dda:
+  case Hexagon::Sched::tc_a1c00888:
+  case Hexagon::Sched::tc_ae53734a:
+  case Hexagon::Sched::tc_b31c2e97:
+  case Hexagon::Sched::tc_b4b5c03a:
+  case Hexagon::Sched::tc_b51dc29a:
+  case Hexagon::Sched::tc_cd374165:
+  case Hexagon::Sched::tc_cfd8378a:
+  case Hexagon::Sched::tc_d5b7b0c1:
+  case Hexagon::Sched::tc_d9d43ecb:
+  case Hexagon::Sched::tc_db2bce9c:
+  case Hexagon::Sched::tc_de4df740:
+  case Hexagon::Sched::tc_de554571:
+  case Hexagon::Sched::tc_e78647bd:
     return true;
   default:
     return false;
@@ -136,4 +144,4 @@ inline bool is_TC1(unsigned SchedClass) {
 }
 } // namespace llvm
 
-#endif
+#endif
\ No newline at end of file
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 2f3e18c99c54465ec8280b307f864e873b0880bd..f5736546a87cca1d7ea38ad48c628e116c66441b 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -550,6 +550,37 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF,
   }
 }
 
+/// Returns true if the target can safely skip saving callee-saved registers
+/// for noreturn nounwind functions.
+bool HexagonFrameLowering::enableCalleeSaveSkip(
+    const MachineFunction &MF) const {
+  const auto &F = MF.getFunction();
+  assert(F.hasFnAttribute(Attribute::NoReturn) &&
+         F.getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+         !F.getFunction().hasFnAttribute(Attribute::UWTable));
+  (void)F;
+
+  // No need to save callee saved registers if the function does not return.
+  return MF.getSubtarget<HexagonSubtarget>().noreturnStackElim();
+}
+
+// Helper function used to determine when to eliminate the stack frame for
+// functions marked as noreturn and when the noreturn-stack-elim options are
+// specified. When both these conditions are true, then a FP may not be needed
+// if the function makes a call. It is very similar to enableCalleeSaveSkip,
+// but it used to check if the allocframe can be eliminated as well.
+static bool enableAllocFrameElim(const MachineFunction &MF) {
+  const auto &F = MF.getFunction();
+  const auto &MFI = MF.getFrameInfo();
+  const auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  assert(!MFI.hasVarSizedObjects() &&
+         !HST.getRegisterInfo()->needsStackRealignment(MF));
+  return F.hasFnAttribute(Attribute::NoReturn) &&
+    F.hasFnAttribute(Attribute::NoUnwind) &&
+    !F.hasFnAttribute(Attribute::UWTable) && HST.noreturnStackElim() &&
+    MFI.getStackSize() == 0;
+}
+
 void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
       bool PrologueStubs) const {
   MachineFunction &MF = *MBB.getParent();
@@ -994,7 +1025,7 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
   }
 
   const auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
-  if (MFI.hasCalls() || HMFI.hasClobberLR())
+  if ((MFI.hasCalls() && !enableAllocFrameElim(MF)) || HMFI.hasClobberLR())
     return true;
 
   return false;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 988718860c5b874436ed73fe0b6dc1145434f37f..d65d870750f871dc1cd60da4af61c5faf28a064d 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -41,6 +41,8 @@ public:
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
       override {}
 
+  bool enableCalleeSaveSkip(const MachineFunction &MF) const override;
+
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
       MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI,
       const TargetRegisterInfo *TRI) const override {
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 637fb3ace6a427ea62c2a5018193b03ce943c6de..b796e442d4faf4c020c04ef63cd1b91d74e779d1 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -120,7 +120,7 @@ struct Coloring {
     return Color == ColorKind::Red ? ColorKind::Black : ColorKind::Red;
   }
 
-  void dump() const;
+  LLVM_DUMP_METHOD void dump() const;
 
 private:
   ArrayRef<Node> Order;
@@ -267,7 +267,7 @@ bool Coloring::color() {
   return true;
 }
 
-LLVM_DUMP_METHOD
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void Coloring::dump() const {
   dbgs() << "{ Order:   {";
   for (unsigned I = 0; I != Order.size(); ++I) {
@@ -309,6 +309,7 @@ void Coloring::dump() const {
     dbgs() << "    " << C.first << " -> " << ColorKindToName(C.second) << "\n";
   dbgs() << "  }\n}\n";
 }
+#endif
 
 namespace {
 // Base class of for reordering networks. They don't strictly need to be
@@ -651,6 +652,7 @@ struct OpRef {
     IndexBits = 28,
   };
 
+  LLVM_DUMP_METHOD
   void print(raw_ostream &OS, const SelectionDAG &G) const;
 
 private:
@@ -663,7 +665,7 @@ struct NodeTemplate {
   MVT Ty = MVT::Other;
   std::vector<OpRef> Ops;
 
-  void print(raw_ostream &OS, const SelectionDAG &G) const;
+  LLVM_DUMP_METHOD void print(raw_ostream &OS, const SelectionDAG &G) const;
 };
 
 struct ResultStack {
@@ -699,10 +701,12 @@ struct ResultStack {
 
   BaseType List;
 
+  LLVM_DUMP_METHOD
   void print(raw_ostream &OS, const SelectionDAG &G) const;
 };
 } // namespace
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const {
   if (isValue()) {
     OpV.getNode()->print(OS, &G);
@@ -752,6 +756,7 @@ void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const {
     OS << '\n';
   }
 }
+#endif
 
 namespace {
 struct ShuffleMask {
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 755a8539be7f1d12b65356b6c37990f0eb30be84..b5be507bb97d67984868e024a9a867681450ac5a 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1505,13 +1505,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8,  Custom);
 
-  // Subtarget-specific operation actions.
-  //
-  if (Subtarget.hasV60Ops()) {
-    setOperationAction(ISD::ROTL, MVT::i32, Custom);
-    setOperationAction(ISD::ROTL, MVT::i64, Custom);
-  }
-
   // V5+.
   setOperationAction(ISD::FMA,  MVT::f64, Expand);
   setOperationAction(ISD::FADD, MVT::f64, Expand);
@@ -1542,6 +1535,17 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     setIndexedStoreAction(ISD::POST_INC, VT, Legal);
   }
 
+  // Subtarget-specific operation actions.
+  //
+  if (Subtarget.hasV60Ops()) {
+    setOperationAction(ISD::ROTL, MVT::i32, Custom);
+    setOperationAction(ISD::ROTL, MVT::i64, Custom);
+  }
+  if (Subtarget.hasV66Ops()) {
+    setOperationAction(ISD::FADD, MVT::f64, Legal);
+    setOperationAction(ISD::FSUB, MVT::f64, Legal);
+  }
+
   if (Subtarget.useHVXOps())
     initializeHVXLowering();
 
@@ -3082,6 +3086,10 @@ HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
 
 bool HexagonTargetLowering::shouldReduceLoadWidth(SDNode *Load,
       ISD::LoadExtType ExtTy, EVT NewVT) const {
+  // TODO: This may be worth removing. Check regression tests for diffs.
+  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
+    return false;
+
   auto *L = cast<LoadSDNode>(Load);
   std::pair<SDValue,int> BO = getBaseAndOffset(L->getBasePtr());
   // Small-data object, do not shrink.
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index a1082e7a77760db4d87461ca5a1de264e61737fe..2236140d5dd7bfe71cabd52bff26adcd2b25a976 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -69,101 +69,101 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
 
   // Instruction type according to the ISA.
   IType Type = type;
-  let TSFlags{5-0} = Type.Value;
+  let TSFlags{6-0} = Type.Value;
 
   // Solo instructions, i.e., those that cannot be in a packet with others.
   bits<1> isSolo = 0;
-  let TSFlags{6} = isSolo;
+  let TSFlags{7} = isSolo;
   // Packed only with A or X-type instructions.
   bits<1> isSoloAX = 0;
-  let TSFlags{7} = isSoloAX;
+  let TSFlags{8} = isSoloAX;
   // Restricts slot 1 to ALU-only instructions.
   bits<1> isRestrictSlot1AOK = 0;
-  let TSFlags{8} = isRestrictSlot1AOK;
+  let TSFlags{9} = isRestrictSlot1AOK;
 
   // Predicated instructions.
   bits<1> isPredicated = 0;
-  let TSFlags{9} = isPredicated;
+  let TSFlags{10} = isPredicated;
   bits<1> isPredicatedFalse = 0;
-  let TSFlags{10} = isPredicatedFalse;
+  let TSFlags{11} = isPredicatedFalse;
   bits<1> isPredicatedNew = 0;
-  let TSFlags{11} = isPredicatedNew;
+  let TSFlags{12} = isPredicatedNew;
   bits<1> isPredicateLate = 0;
-  let TSFlags{12} = isPredicateLate; // Late predicate producer insn.
+  let TSFlags{13} = isPredicateLate; // Late predicate producer insn.
 
   // New-value insn helper fields.
   bits<1> isNewValue = 0;
-  let TSFlags{13} = isNewValue; // New-value consumer insn.
+  let TSFlags{14} = isNewValue; // New-value consumer insn.
   bits<1> hasNewValue = 0;
-  let TSFlags{14} = hasNewValue; // New-value producer insn.
+  let TSFlags{15} = hasNewValue; // New-value producer insn.
   bits<3> opNewValue = 0;
-  let TSFlags{17-15} = opNewValue; // New-value produced operand.
+  let TSFlags{18-16} = opNewValue; // New-value produced operand.
   bits<1> isNVStorable = 0;
-  let TSFlags{18} = isNVStorable; // Store that can become new-value store.
+  let TSFlags{19} = isNVStorable; // Store that can become new-value store.
   bits<1> isNVStore = 0;
-  let TSFlags{19} = isNVStore; // New-value store insn.
+  let TSFlags{20} = isNVStore; // New-value store insn.
   bits<1> isCVLoadable = 0;
-  let TSFlags{20} = isCVLoadable; // Load that can become cur-value load.
+  let TSFlags{21} = isCVLoadable; // Load that can become cur-value load.
   bits<1> isCVLoad = 0;
-  let TSFlags{21} = isCVLoad; // Cur-value load insn.
+  let TSFlags{22} = isCVLoad; // Cur-value load insn.
 
   // Immediate extender helper fields.
   bits<1> isExtendable = 0;
-  let TSFlags{22} = isExtendable; // Insn may be extended.
+  let TSFlags{23} = isExtendable; // Insn may be extended.
   bits<1> isExtended = 0;
-  let TSFlags{23} = isExtended; // Insn must be extended.
+  let TSFlags{24} = isExtended; // Insn must be extended.
   bits<3> opExtendable = 0;
-  let TSFlags{26-24} = opExtendable; // Which operand may be extended.
+  let TSFlags{27-25} = opExtendable; // Which operand may be extended.
   bits<1> isExtentSigned = 0;
-  let TSFlags{27} = isExtentSigned; // Signed or unsigned range.
+  let TSFlags{28} = isExtentSigned; // Signed or unsigned range.
   bits<5> opExtentBits = 0;
-  let TSFlags{32-28} = opExtentBits; //Number of bits of range before extending.
+  let TSFlags{33-29} = opExtentBits; //Number of bits of range before extending.
   bits<2> opExtentAlign = 0;
-  let TSFlags{34-33} = opExtentAlign; // Alignment exponent before extending.
+  let TSFlags{35-34} = opExtentAlign; // Alignment exponent before extending.
 
   bit cofMax1 = 0;
-  let TSFlags{35} = cofMax1;
+  let TSFlags{36} = cofMax1;
   bit cofRelax1 = 0;
-  let TSFlags{36} = cofRelax1;
+  let TSFlags{37} = cofRelax1;
   bit cofRelax2 = 0;
-  let TSFlags{37} = cofRelax2;
+  let TSFlags{38} = cofRelax2;
 
   bit isRestrictNoSlot1Store = 0;
-  let TSFlags{38} = isRestrictNoSlot1Store;
+  let TSFlags{39} = isRestrictNoSlot1Store;
 
   // Addressing mode for load/store instructions.
   AddrModeType addrMode = NoAddrMode;
-  let TSFlags{43-41} = addrMode.Value;
+  let TSFlags{44-42} = addrMode.Value;
 
   // Memory access size for mem access instructions (load/store)
   MemAccessSize accessSize = NoMemAccess;
-  let TSFlags{47-44} = accessSize.Value;
+  let TSFlags{48-45} = accessSize.Value;
 
   bits<1> isTaken = 0;
-  let TSFlags {48} = isTaken; // Branch prediction.
+  let TSFlags {49} = isTaken; // Branch prediction.
 
   bits<1> isFP = 0;
-  let TSFlags {49} = isFP; // Floating-point.
+  let TSFlags {50} = isFP; // Floating-point.
 
   bits<1> isSomeOK = 0;
-  let TSFlags {50} = isSomeOK; // Relax some grouping constraints.
+  let TSFlags {51} = isSomeOK; // Relax some grouping constraints.
 
   bits<1> hasNewValue2 = 0;
-  let TSFlags{51} = hasNewValue2; // Second New-value producer insn.
+  let TSFlags{52} = hasNewValue2; // Second New-value producer insn.
   bits<3> opNewValue2 = 0;
-  let TSFlags{54-52} = opNewValue2; // Second New-value produced operand.
+  let TSFlags{55-53} = opNewValue2; // Second New-value produced operand.
 
   bits<1> isAccumulator = 0;
-  let TSFlags{55} = isAccumulator;
+  let TSFlags{56} = isAccumulator;
 
   bits<1> prefersSlot3 = 0;
-  let TSFlags{56} = prefersSlot3; // Complex XU
+  let TSFlags{57} = prefersSlot3; // Complex XU
 
   bits<1> hasTmpDst = 0;
-  let TSFlags{59} = hasTmpDst;  // v65 : 'fake" register VTMP is set
+  let TSFlags{60} = hasTmpDst;  // v65 : 'fake" register VTMP is set
 
   bit CVINew = 0;
-  let TSFlags{61} = CVINew;
+  let TSFlags{62} = CVINew;
 
   // Fields used for relation models.
   bit isNonTemporal = 0;
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV5.td b/lib/Target/Hexagon/HexagonInstrFormatsV5.td
index 482688ab90aa72f6c940a28af61659e309c94d12..c8de5cbcc1e0ccd5525a143fe8393ff95bc2f579 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV5.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV5.td
@@ -49,39 +49,39 @@ class InstDuplex<bits<4> iClass, list<dag> pattern = [],
 
   // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
 
-  let TSFlags{5-0} = Type.Value;
+  let TSFlags{6-0} = Type.Value;
 
   // Predicated instructions.
   bits<1> isPredicated = 0;
-  let TSFlags{6} = isPredicated;
+  let TSFlags{7} = isPredicated;
   bits<1> isPredicatedFalse = 0;
-  let TSFlags{7} = isPredicatedFalse;
+  let TSFlags{8} = isPredicatedFalse;
   bits<1> isPredicatedNew = 0;
-  let TSFlags{8} = isPredicatedNew;
+  let TSFlags{9} = isPredicatedNew;
 
   // New-value insn helper fields.
   bits<1> isNewValue = 0;
-  let TSFlags{9} = isNewValue; // New-value consumer insn.
+  let TSFlags{10} = isNewValue; // New-value consumer insn.
   bits<1> hasNewValue = 0;
-  let TSFlags{10} = hasNewValue; // New-value producer insn.
+  let TSFlags{11} = hasNewValue; // New-value producer insn.
   bits<3> opNewValue = 0;
-  let TSFlags{13-11} = opNewValue; // New-value produced operand.
+  let TSFlags{14-12} = opNewValue; // New-value produced operand.
   bits<1> isNVStorable = 0;
-  let TSFlags{14} = isNVStorable; // Store that can become new-value store.
+  let TSFlags{15} = isNVStorable; // Store that can become new-value store.
   bits<1> isNVStore = 0;
-  let TSFlags{15} = isNVStore; // New-value store insn.
+  let TSFlags{16} = isNVStore; // New-value store insn.
 
   // Immediate extender helper fields.
   bits<1> isExtendable = 0;
-  let TSFlags{16} = isExtendable; // Insn may be extended.
+  let TSFlags{17} = isExtendable; // Insn may be extended.
   bits<1> isExtended = 0;
-  let TSFlags{17} = isExtended; // Insn must be extended.
+  let TSFlags{18} = isExtended; // Insn must be extended.
   bits<3> opExtendable = 0;
-  let TSFlags{20-18} = opExtendable; // Which operand may be extended.
+  let TSFlags{21-19} = opExtendable; // Which operand may be extended.
   bits<1> isExtentSigned = 0;
-  let TSFlags{21} = isExtentSigned; // Signed or unsigned range.
+  let TSFlags{22} = isExtentSigned; // Signed or unsigned range.
   bits<5> opExtentBits = 0;
-  let TSFlags{26-22} = opExtentBits; //Number of bits of range before extending.
+  let TSFlags{27-23} = opExtentBits; //Number of bits of range before extending.
   bits<2> opExtentAlign = 0;
-  let TSFlags{28-27} = opExtentAlign; // Alignment exponent before extending.
+  let TSFlags{29-28} = opExtentAlign; // Alignment exponent before extending.
 }
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index a3c160d01f88f8c97e88afd0e374d9766e1a25ce..de0d6c4d9e4e781200b184ed8adb93faea2bd342 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1293,7 +1293,6 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
         auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine))
                      .add(Op0)
                      .addReg(PReg, S)
-                     .add(Op1)
                      .addReg(SrcHi)
                      .addReg(SrcLo);
         if (IsDestLive)
@@ -2894,14 +2893,15 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
 }
 
 /// Get the base register and byte offset of a load/store instr.
-bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
-      unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI)
-      const {
+bool HexagonInstrInfo::getMemOperandWithOffset(
+    MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
+    const TargetRegisterInfo *TRI) const {
   unsigned AccessSize = 0;
-  int OffsetVal = 0;
-  BaseReg = getBaseAndOffset(LdSt, OffsetVal, AccessSize);
-  Offset = OffsetVal;
-  return BaseReg != 0;
+  BaseOp = getBaseAndOffset(LdSt, Offset, AccessSize);
+  assert((!BaseOp || BaseOp->isReg()) &&
+         "getMemOperandWithOffset only supports base "
+         "operands of type register.");
+  return BaseOp != nullptr;
 }
 
 /// Can these instructions execute at the same time in a bundle.
@@ -3108,21 +3108,22 @@ unsigned HexagonInstrInfo::getAddrMode(const MachineInstr &MI) const {
 
 // Returns the base register in a memory access (load/store). The offset is
 // returned in Offset and the access size is returned in AccessSize.
-// If the base register has a subregister or the offset field does not contain
-// an immediate value, return 0.
-unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI,
-      int &Offset, unsigned &AccessSize) const {
+// If the base operand has a subregister or the offset field does not contain
+// an immediate value, return nullptr.
+MachineOperand *HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI,
+                                                   int64_t &Offset,
+                                                   unsigned &AccessSize) const {
   // Return if it is not a base+offset type instruction or a MemOp.
   if (getAddrMode(MI) != HexagonII::BaseImmOffset &&
       getAddrMode(MI) != HexagonII::BaseLongOffset &&
       !isMemOp(MI) && !isPostIncrement(MI))
-    return 0;
+    return nullptr;
 
   AccessSize = getMemAccessSize(MI);
 
   unsigned BasePos = 0, OffsetPos = 0;
   if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
-    return 0;
+    return nullptr;
 
   // Post increment updates its EA after the mem access,
   // so we need to treat its offset as zero.
@@ -3131,14 +3132,14 @@ unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI,
   } else {
     const MachineOperand &OffsetOp = MI.getOperand(OffsetPos);
     if (!OffsetOp.isImm())
-      return 0;
+      return nullptr;
     Offset = OffsetOp.getImm();
   }
 
   const MachineOperand &BaseOp = MI.getOperand(BasePos);
   if (BaseOp.getSubReg() != 0)
-    return 0;
-  return BaseOp.getReg();
+    return nullptr;
+  return &const_cast<MachineOperand&>(BaseOp);
 }
 
 /// Return the position of the base and offset operands for this instruction.
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index fe4a2f3662e3d5fb9f3a1eda148595014e3012f2..9b840762e88ae8dff8aa60ffd04d0130f1d9fe57 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -216,9 +216,9 @@ public:
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
   /// Get the base register and byte offset of a load/store instr.
-  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
-                             int64_t &Offset,
-                             const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+                               int64_t &Offset,
+                               const TargetRegisterInfo *TRI) const override;
 
   /// Reverses the branch condition of the specified condition list,
   /// returning false on success and true if it cannot be reversed.
@@ -436,8 +436,8 @@ public:
   bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
 
   unsigned getAddrMode(const MachineInstr &MI) const;
-  unsigned getBaseAndOffset(const MachineInstr &MI, int &Offset,
-                            unsigned &AccessSize) const;
+  MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset,
+                                   unsigned &AccessSize) const;
   SmallVector<MachineInstr*,2> getBranchingInstrs(MachineBasicBlock& MBB) const;
   unsigned getCExtOpNum(const MachineInstr &MI) const;
   HexagonII::CompoundGroup
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index 206e74983d203aba49b4bf3f3f2ab95d57d2751c..9cab5748bef2292d75e2c4c782064af6c7b4e0e8 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -6,726 +6,78 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-// This is populated based on the following specs:
-// Hexagon V2 Architecture
-// Application-Level Specification
-// 80-V9418-8 Rev. B
-// March 4, 2008
-//===----------------------------------------------------------------------===//
 
-class T_I_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID imm:$Is),
-         (MI imm:$Is)>;
+// These intrinsic patterns are not auto-generated.
 
 class T_R_pat <InstHexagon MI, Intrinsic IntID>
   : Pat <(IntID I32:$Rs),
          (MI I32:$Rs)>;
 
-class T_P_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs),
-         (MI I64:$Rs)>;
-
-class T_II_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
-  : Pat<(IntID Imm1:$Is, Imm2:$It),
-        (MI Imm1:$Is, Imm2:$It)>;
-
-class T_RI_pat <InstHexagon MI, Intrinsic IntID,
-                PatLeaf ImmPred = PatLeaf<(i32 imm)>>
-  : Pat<(IntID I32:$Rs, ImmPred:$It),
-        (MI I32:$Rs, ImmPred:$It)>;
-
-class T_IR_pat <InstHexagon MI, Intrinsic IntID,
-                PatFrag ImmPred = PatLeaf<(i32 imm)>>
-  : Pat<(IntID ImmPred:$Is, I32:$Rt),
-        (MI ImmPred:$Is, I32:$Rt)>;
-
-class T_PI_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID I64:$Rs, imm:$It),
-        (MI I64:$Rs, imm:$It)>;
-
-class T_RP_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID I32:$Rs, I64:$Rt),
-        (MI I32:$Rs, I64:$Rt)>;
-
 class T_RR_pat <InstHexagon MI, Intrinsic IntID>
   : Pat <(IntID I32:$Rs, I32:$Rt),
          (MI I32:$Rs, I32:$Rt)>;
 
-class T_PP_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I64:$Rt),
-         (MI I64:$Rs, I64:$Rt)>;
-
-class T_QQ_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rs, I32:$Rt),
-         (MI (C2_tfrrp I32:$Rs), (C2_tfrrp I32:$Rt))>;
-
-class T_QII_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
-  : Pat <(IntID I32:$Rp, Imm1:$Is, Imm2:$It),
-         (MI (C2_tfrrp I32:$Rp), Imm1:$Is, Imm2:$It)>;
-
-class T_QRR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rp, I32:$Rs, I32:$Rt),
-         (MI (C2_tfrrp I32:$Rp), I32:$Rs, I32:$Rt)>;
-
-class T_QRI_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
-  : Pat <(IntID I32:$Rp, I32:$Rs, ImmPred:$Is),
-         (MI (C2_tfrrp I32:$Rp), I32:$Rs, ImmPred:$Is)>;
-
-class T_QIR_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
-  : Pat <(IntID I32:$Rp, ImmPred:$Is, I32:$Rs),
-         (MI (C2_tfrrp I32:$Rp), ImmPred:$Is, I32:$Rs)>;
-
-class T_QPP_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rp, I64:$Rs, I64:$Rt),
-         (MI (C2_tfrrp I32:$Rp), I64:$Rs, I64:$Rt)>;
-
-class T_RRI_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rs, I32:$Rt, imm:$Iu),
-         (MI I32:$Rs, I32:$Rt, imm:$Iu)>;
-
-class T_RII_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rs, imm:$It, imm:$Iu),
-         (MI I32:$Rs, imm:$It, imm:$Iu)>;
-
-class T_IRI_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID imm:$It, I32:$Rs, imm:$Iu),
-         (MI imm:$It, I32:$Rs, imm:$Iu)>;
-
-class T_IRR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID imm:$Is, I32:$Rs, I32:$Rt),
-         (MI imm:$Is, I32:$Rs, I32:$Rt)>;
-
-class T_RIR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rs, imm:$Is, I32:$Rt),
-         (MI I32:$Rs, imm:$Is, I32:$Rt)>;
-
-class T_RRR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rs, I32:$Rt, I32:$Ru),
-         (MI I32:$Rs, I32:$Rt, I32:$Ru)>;
-
-class T_PPI_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I64:$Rt, imm:$Iu),
-         (MI I64:$Rs, I64:$Rt, imm:$Iu)>;
-
-class T_PII_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, imm:$It, imm:$Iu),
-         (MI I64:$Rs, imm:$It, imm:$Iu)>;
-
-class T_PPP_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I64:$Rt, I64:$Ru),
-         (MI I64:$Rs, I64:$Rt, I64:$Ru)>;
-
-class T_PPR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Ru),
-         (MI I64:$Rs, I64:$Rt, I32:$Ru)>;
-
-class T_PRR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I32:$Rt, I32:$Ru),
-         (MI I64:$Rs, I32:$Rt, I32:$Ru)>;
-
-class T_PPQ_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Rp),
-         (MI I64:$Rs, I64:$Rt, (C2_tfrrp I32:$Rp))>;
-
-class T_PR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I32:$Rt),
-         (MI I64:$Rs, I32:$Rt)>;
-
-class T_D_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID (F64:$Rs)),
-        (MI (F64:$Rs))>;
-
-class T_DI_pat <InstHexagon MI, Intrinsic IntID,
-                PatLeaf ImmPred = PatLeaf<(i32 imm)>>
-  : Pat<(IntID F64:$Rs, ImmPred:$It),
-        (MI F64:$Rs, ImmPred:$It)>;
-
-class T_F_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID F32:$Rs),
-        (MI F32:$Rs)>;
-
-class T_FI_pat <InstHexagon MI, Intrinsic IntID,
-                PatLeaf ImmPred = PatLeaf<(i32 imm)>>
-  : Pat<(IntID F32:$Rs, ImmPred:$It),
-        (MI F32:$Rs, ImmPred:$It)>;
-
-class T_FF_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID F32:$Rs, F32:$Rt),
-        (MI F32:$Rs, F32:$Rt)>;
-
-class T_DD_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID F64:$Rs, F64:$Rt),
-        (MI F64:$Rs, F64:$Rt)>;
-
-class T_FFF_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID F32:$Rs, F32:$Rt, F32:$Ru),
-        (MI F32:$Rs, F32:$Rt, F32:$Ru)>;
-
-class T_FFFQ_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID F32:$Rs, F32:$Rt, F32:$Ru, I32:$Rp),
-         (MI F32:$Rs, F32:$Rt, F32:$Ru, (C2_tfrrp I32:$Rp))>;
-
-class T_Q_RI_pat <InstHexagon MI, Intrinsic IntID,
-                  PatLeaf ImmPred = PatLeaf<(i32 imm)>>
-  : Pat<(IntID I32:$Rs, ImmPred:$It),
-        (C2_tfrpr (MI I32:$Rs, ImmPred:$It))>;
-
-class T_Q_RR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rs, I32:$Rt),
-         (C2_tfrpr (MI I32:$Rs, I32:$Rt))>;
-
-class T_Q_RP_pat <InstHexagon MI, Intrinsic IntID>
+class T_RP_pat <InstHexagon MI, Intrinsic IntID>
   : Pat <(IntID I32:$Rs, I64:$Rt),
-         (C2_tfrpr (MI I32:$Rs, I64:$Rt))>;
-
-class T_Q_PR_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I32:$Rt),
-         (C2_tfrpr (MI I64:$Rs, I32:$Rt))>;
-
-class T_Q_PI_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID I64:$Rs, imm:$It),
-        (C2_tfrpr (MI I64:$Rs, imm:$It))>;
-
-class T_Q_PP_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I64:$Rs, I64:$Rt),
-         (C2_tfrpr (MI I64:$Rs, I64:$Rt))>;
-
-class T_Q_Q_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rp),
-         (C2_tfrpr (MI (C2_tfrrp I32:$Rp)))>;
-
-class T_Q_QQ_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rp, I32:$Rq),
-         (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq)))>;
-
-class T_Q_FF_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID F32:$Rs, F32:$Rt),
-        (C2_tfrpr (MI F32:$Rs, F32:$Rt))>;
-
-class T_Q_DD_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID F64:$Rs, F64:$Rt),
-        (C2_tfrpr (MI F64:$Rs, F64:$Rt))>;
-
-class T_Q_FI_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID F32:$Rs, imm:$It),
-        (C2_tfrpr (MI F32:$Rs, imm:$It))>;
-
-class T_Q_DI_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID F64:$Rs, imm:$It),
-        (C2_tfrpr (MI F64:$Rs, imm:$It))>;
-
-class T_Q_QQQ_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat <(IntID I32:$Rp, I32:$Rq, I32:$Rs),
-         (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq),
-                       (C2_tfrrp I32:$Rs)))>;
-
-//===----------------------------------------------------------------------===//
-// MPYS / Multipy signed/unsigned halfwords
-//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
-//===----------------------------------------------------------------------===//
-
-def : T_RR_pat <M2_mpy_ll_s1, int_hexagon_M2_mpy_ll_s1>;
-def : T_RR_pat <M2_mpy_ll_s0, int_hexagon_M2_mpy_ll_s0>;
-def : T_RR_pat <M2_mpy_lh_s1, int_hexagon_M2_mpy_lh_s1>;
-def : T_RR_pat <M2_mpy_lh_s0, int_hexagon_M2_mpy_lh_s0>;
-def : T_RR_pat <M2_mpy_hl_s1, int_hexagon_M2_mpy_hl_s1>;
-def : T_RR_pat <M2_mpy_hl_s0, int_hexagon_M2_mpy_hl_s0>;
-def : T_RR_pat <M2_mpy_hh_s1, int_hexagon_M2_mpy_hh_s1>;
-def : T_RR_pat <M2_mpy_hh_s0, int_hexagon_M2_mpy_hh_s0>;
-
-def : T_RR_pat <M2_mpyu_ll_s1, int_hexagon_M2_mpyu_ll_s1>;
-def : T_RR_pat <M2_mpyu_ll_s0, int_hexagon_M2_mpyu_ll_s0>;
-def : T_RR_pat <M2_mpyu_lh_s1, int_hexagon_M2_mpyu_lh_s1>;
-def : T_RR_pat <M2_mpyu_lh_s0, int_hexagon_M2_mpyu_lh_s0>;
-def : T_RR_pat <M2_mpyu_hl_s1, int_hexagon_M2_mpyu_hl_s1>;
-def : T_RR_pat <M2_mpyu_hl_s0, int_hexagon_M2_mpyu_hl_s0>;
-def : T_RR_pat <M2_mpyu_hh_s1, int_hexagon_M2_mpyu_hh_s1>;
-def : T_RR_pat <M2_mpyu_hh_s0, int_hexagon_M2_mpyu_hh_s0>;
-
-def : T_RR_pat <M2_mpy_sat_ll_s1, int_hexagon_M2_mpy_sat_ll_s1>;
-def : T_RR_pat <M2_mpy_sat_ll_s0, int_hexagon_M2_mpy_sat_ll_s0>;
-def : T_RR_pat <M2_mpy_sat_lh_s1, int_hexagon_M2_mpy_sat_lh_s1>;
-def : T_RR_pat <M2_mpy_sat_lh_s0, int_hexagon_M2_mpy_sat_lh_s0>;
-def : T_RR_pat <M2_mpy_sat_hl_s1, int_hexagon_M2_mpy_sat_hl_s1>;
-def : T_RR_pat <M2_mpy_sat_hl_s0, int_hexagon_M2_mpy_sat_hl_s0>;
-def : T_RR_pat <M2_mpy_sat_hh_s1, int_hexagon_M2_mpy_sat_hh_s1>;
-def : T_RR_pat <M2_mpy_sat_hh_s0, int_hexagon_M2_mpy_sat_hh_s0>;
-
-def : T_RR_pat <M2_mpy_rnd_ll_s1, int_hexagon_M2_mpy_rnd_ll_s1>;
-def : T_RR_pat <M2_mpy_rnd_ll_s0, int_hexagon_M2_mpy_rnd_ll_s0>;
-def : T_RR_pat <M2_mpy_rnd_lh_s1, int_hexagon_M2_mpy_rnd_lh_s1>;
-def : T_RR_pat <M2_mpy_rnd_lh_s0, int_hexagon_M2_mpy_rnd_lh_s0>;
-def : T_RR_pat <M2_mpy_rnd_hl_s1, int_hexagon_M2_mpy_rnd_hl_s1>;
-def : T_RR_pat <M2_mpy_rnd_hl_s0, int_hexagon_M2_mpy_rnd_hl_s0>;
-def : T_RR_pat <M2_mpy_rnd_hh_s1, int_hexagon_M2_mpy_rnd_hh_s1>;
-def : T_RR_pat <M2_mpy_rnd_hh_s0, int_hexagon_M2_mpy_rnd_hh_s0>;
-
-def : T_RR_pat <M2_mpy_sat_rnd_ll_s1, int_hexagon_M2_mpy_sat_rnd_ll_s1>;
-def : T_RR_pat <M2_mpy_sat_rnd_ll_s0, int_hexagon_M2_mpy_sat_rnd_ll_s0>;
-def : T_RR_pat <M2_mpy_sat_rnd_lh_s1, int_hexagon_M2_mpy_sat_rnd_lh_s1>;
-def : T_RR_pat <M2_mpy_sat_rnd_lh_s0, int_hexagon_M2_mpy_sat_rnd_lh_s0>;
-def : T_RR_pat <M2_mpy_sat_rnd_hl_s1, int_hexagon_M2_mpy_sat_rnd_hl_s1>;
-def : T_RR_pat <M2_mpy_sat_rnd_hl_s0, int_hexagon_M2_mpy_sat_rnd_hl_s0>;
-def : T_RR_pat <M2_mpy_sat_rnd_hh_s1, int_hexagon_M2_mpy_sat_rnd_hh_s1>;
-def : T_RR_pat <M2_mpy_sat_rnd_hh_s0, int_hexagon_M2_mpy_sat_rnd_hh_s0>;
-
-
-//===----------------------------------------------------------------------===//
-// MPYS / Multipy signed/unsigned halfwords and add/subtract the
-// result from the accumulator.
-//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//===----------------------------------------------------------------------===//
-
-def : T_RRR_pat <M2_mpy_acc_ll_s1, int_hexagon_M2_mpy_acc_ll_s1>;
-def : T_RRR_pat <M2_mpy_acc_ll_s0, int_hexagon_M2_mpy_acc_ll_s0>;
-def : T_RRR_pat <M2_mpy_acc_lh_s1, int_hexagon_M2_mpy_acc_lh_s1>;
-def : T_RRR_pat <M2_mpy_acc_lh_s0, int_hexagon_M2_mpy_acc_lh_s0>;
-def : T_RRR_pat <M2_mpy_acc_hl_s1, int_hexagon_M2_mpy_acc_hl_s1>;
-def : T_RRR_pat <M2_mpy_acc_hl_s0, int_hexagon_M2_mpy_acc_hl_s0>;
-def : T_RRR_pat <M2_mpy_acc_hh_s1, int_hexagon_M2_mpy_acc_hh_s1>;
-def : T_RRR_pat <M2_mpy_acc_hh_s0, int_hexagon_M2_mpy_acc_hh_s0>;
-
-def : T_RRR_pat <M2_mpyu_acc_ll_s1, int_hexagon_M2_mpyu_acc_ll_s1>;
-def : T_RRR_pat <M2_mpyu_acc_ll_s0, int_hexagon_M2_mpyu_acc_ll_s0>;
-def : T_RRR_pat <M2_mpyu_acc_lh_s1, int_hexagon_M2_mpyu_acc_lh_s1>;
-def : T_RRR_pat <M2_mpyu_acc_lh_s0, int_hexagon_M2_mpyu_acc_lh_s0>;
-def : T_RRR_pat <M2_mpyu_acc_hl_s1, int_hexagon_M2_mpyu_acc_hl_s1>;
-def : T_RRR_pat <M2_mpyu_acc_hl_s0, int_hexagon_M2_mpyu_acc_hl_s0>;
-def : T_RRR_pat <M2_mpyu_acc_hh_s1, int_hexagon_M2_mpyu_acc_hh_s1>;
-def : T_RRR_pat <M2_mpyu_acc_hh_s0, int_hexagon_M2_mpyu_acc_hh_s0>;
-
-def : T_RRR_pat <M2_mpy_nac_ll_s1, int_hexagon_M2_mpy_nac_ll_s1>;
-def : T_RRR_pat <M2_mpy_nac_ll_s0, int_hexagon_M2_mpy_nac_ll_s0>;
-def : T_RRR_pat <M2_mpy_nac_lh_s1, int_hexagon_M2_mpy_nac_lh_s1>;
-def : T_RRR_pat <M2_mpy_nac_lh_s0, int_hexagon_M2_mpy_nac_lh_s0>;
-def : T_RRR_pat <M2_mpy_nac_hl_s1, int_hexagon_M2_mpy_nac_hl_s1>;
-def : T_RRR_pat <M2_mpy_nac_hl_s0, int_hexagon_M2_mpy_nac_hl_s0>;
-def : T_RRR_pat <M2_mpy_nac_hh_s1, int_hexagon_M2_mpy_nac_hh_s1>;
-def : T_RRR_pat <M2_mpy_nac_hh_s0, int_hexagon_M2_mpy_nac_hh_s0>;
-
-def : T_RRR_pat <M2_mpyu_nac_ll_s1, int_hexagon_M2_mpyu_nac_ll_s1>;
-def : T_RRR_pat <M2_mpyu_nac_ll_s0, int_hexagon_M2_mpyu_nac_ll_s0>;
-def : T_RRR_pat <M2_mpyu_nac_lh_s1, int_hexagon_M2_mpyu_nac_lh_s1>;
-def : T_RRR_pat <M2_mpyu_nac_lh_s0, int_hexagon_M2_mpyu_nac_lh_s0>;
-def : T_RRR_pat <M2_mpyu_nac_hl_s1, int_hexagon_M2_mpyu_nac_hl_s1>;
-def : T_RRR_pat <M2_mpyu_nac_hl_s0, int_hexagon_M2_mpyu_nac_hl_s0>;
-def : T_RRR_pat <M2_mpyu_nac_hh_s1, int_hexagon_M2_mpyu_nac_hh_s1>;
-def : T_RRR_pat <M2_mpyu_nac_hh_s0, int_hexagon_M2_mpyu_nac_hh_s0>;
-
-def : T_RRR_pat <M2_mpy_acc_sat_ll_s1, int_hexagon_M2_mpy_acc_sat_ll_s1>;
-def : T_RRR_pat <M2_mpy_acc_sat_ll_s0, int_hexagon_M2_mpy_acc_sat_ll_s0>;
-def : T_RRR_pat <M2_mpy_acc_sat_lh_s1, int_hexagon_M2_mpy_acc_sat_lh_s1>;
-def : T_RRR_pat <M2_mpy_acc_sat_lh_s0, int_hexagon_M2_mpy_acc_sat_lh_s0>;
-def : T_RRR_pat <M2_mpy_acc_sat_hl_s1, int_hexagon_M2_mpy_acc_sat_hl_s1>;
-def : T_RRR_pat <M2_mpy_acc_sat_hl_s0, int_hexagon_M2_mpy_acc_sat_hl_s0>;
-def : T_RRR_pat <M2_mpy_acc_sat_hh_s1, int_hexagon_M2_mpy_acc_sat_hh_s1>;
-def : T_RRR_pat <M2_mpy_acc_sat_hh_s0, int_hexagon_M2_mpy_acc_sat_hh_s0>;
-
-def : T_RRR_pat <M2_mpy_nac_sat_ll_s1, int_hexagon_M2_mpy_nac_sat_ll_s1>;
-def : T_RRR_pat <M2_mpy_nac_sat_ll_s0, int_hexagon_M2_mpy_nac_sat_ll_s0>;
-def : T_RRR_pat <M2_mpy_nac_sat_lh_s1, int_hexagon_M2_mpy_nac_sat_lh_s1>;
-def : T_RRR_pat <M2_mpy_nac_sat_lh_s0, int_hexagon_M2_mpy_nac_sat_lh_s0>;
-def : T_RRR_pat <M2_mpy_nac_sat_hl_s1, int_hexagon_M2_mpy_nac_sat_hl_s1>;
-def : T_RRR_pat <M2_mpy_nac_sat_hl_s0, int_hexagon_M2_mpy_nac_sat_hl_s0>;
-def : T_RRR_pat <M2_mpy_nac_sat_hh_s1, int_hexagon_M2_mpy_nac_sat_hh_s1>;
-def : T_RRR_pat <M2_mpy_nac_sat_hh_s0, int_hexagon_M2_mpy_nac_sat_hh_s0>;
-
-
-//===----------------------------------------------------------------------===//
-// Multiply signed/unsigned halfwords with and without saturation and rounding
-// into a 64-bits destination register.
-//===----------------------------------------------------------------------===//
-
-def : T_RR_pat <M2_mpyd_hh_s0, int_hexagon_M2_mpyd_hh_s0>;
-def : T_RR_pat <M2_mpyd_hl_s0, int_hexagon_M2_mpyd_hl_s0>;
-def : T_RR_pat <M2_mpyd_lh_s0, int_hexagon_M2_mpyd_lh_s0>;
-def : T_RR_pat <M2_mpyd_ll_s0, int_hexagon_M2_mpyd_ll_s0>;
-def : T_RR_pat <M2_mpyd_hh_s1, int_hexagon_M2_mpyd_hh_s1>;
-def : T_RR_pat <M2_mpyd_hl_s1, int_hexagon_M2_mpyd_hl_s1>;
-def : T_RR_pat <M2_mpyd_lh_s1, int_hexagon_M2_mpyd_lh_s1>;
-def : T_RR_pat <M2_mpyd_ll_s1, int_hexagon_M2_mpyd_ll_s1>;
-
-def : T_RR_pat <M2_mpyd_rnd_hh_s0, int_hexagon_M2_mpyd_rnd_hh_s0>;
-def : T_RR_pat <M2_mpyd_rnd_hl_s0, int_hexagon_M2_mpyd_rnd_hl_s0>;
-def : T_RR_pat <M2_mpyd_rnd_lh_s0, int_hexagon_M2_mpyd_rnd_lh_s0>;
-def : T_RR_pat <M2_mpyd_rnd_ll_s0, int_hexagon_M2_mpyd_rnd_ll_s0>;
-def : T_RR_pat <M2_mpyd_rnd_hh_s1, int_hexagon_M2_mpyd_rnd_hh_s1>;
-def : T_RR_pat <M2_mpyd_rnd_hl_s1, int_hexagon_M2_mpyd_rnd_hl_s1>;
-def : T_RR_pat <M2_mpyd_rnd_lh_s1, int_hexagon_M2_mpyd_rnd_lh_s1>;
-def : T_RR_pat <M2_mpyd_rnd_ll_s1, int_hexagon_M2_mpyd_rnd_ll_s1>;
-
-def : T_RR_pat <M2_mpyud_hh_s0, int_hexagon_M2_mpyud_hh_s0>;
-def : T_RR_pat <M2_mpyud_hl_s0, int_hexagon_M2_mpyud_hl_s0>;
-def : T_RR_pat <M2_mpyud_lh_s0, int_hexagon_M2_mpyud_lh_s0>;
-def : T_RR_pat <M2_mpyud_ll_s0, int_hexagon_M2_mpyud_ll_s0>;
-def : T_RR_pat <M2_mpyud_hh_s1, int_hexagon_M2_mpyud_hh_s1>;
-def : T_RR_pat <M2_mpyud_hl_s1, int_hexagon_M2_mpyud_hl_s1>;
-def : T_RR_pat <M2_mpyud_lh_s1, int_hexagon_M2_mpyud_lh_s1>;
-def : T_RR_pat <M2_mpyud_ll_s1, int_hexagon_M2_mpyud_ll_s1>;
-
-//===----------------------------------------------------------------------===//
-// MPYS / Multipy signed/unsigned halfwords and add/subtract the
-// result from the 64-bit destination register.
-//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//===----------------------------------------------------------------------===//
-
-def : T_PRR_pat <M2_mpyd_acc_hh_s0, int_hexagon_M2_mpyd_acc_hh_s0>;
-def : T_PRR_pat <M2_mpyd_acc_hl_s0, int_hexagon_M2_mpyd_acc_hl_s0>;
-def : T_PRR_pat <M2_mpyd_acc_lh_s0, int_hexagon_M2_mpyd_acc_lh_s0>;
-def : T_PRR_pat <M2_mpyd_acc_ll_s0, int_hexagon_M2_mpyd_acc_ll_s0>;
-
-def : T_PRR_pat <M2_mpyd_acc_hh_s1, int_hexagon_M2_mpyd_acc_hh_s1>;
-def : T_PRR_pat <M2_mpyd_acc_hl_s1, int_hexagon_M2_mpyd_acc_hl_s1>;
-def : T_PRR_pat <M2_mpyd_acc_lh_s1, int_hexagon_M2_mpyd_acc_lh_s1>;
-def : T_PRR_pat <M2_mpyd_acc_ll_s1, int_hexagon_M2_mpyd_acc_ll_s1>;
-
-def : T_PRR_pat <M2_mpyd_nac_hh_s0, int_hexagon_M2_mpyd_nac_hh_s0>;
-def : T_PRR_pat <M2_mpyd_nac_hl_s0, int_hexagon_M2_mpyd_nac_hl_s0>;
-def : T_PRR_pat <M2_mpyd_nac_lh_s0, int_hexagon_M2_mpyd_nac_lh_s0>;
-def : T_PRR_pat <M2_mpyd_nac_ll_s0, int_hexagon_M2_mpyd_nac_ll_s0>;
-
-def : T_PRR_pat <M2_mpyd_nac_hh_s1, int_hexagon_M2_mpyd_nac_hh_s1>;
-def : T_PRR_pat <M2_mpyd_nac_hl_s1, int_hexagon_M2_mpyd_nac_hl_s1>;
-def : T_PRR_pat <M2_mpyd_nac_lh_s1, int_hexagon_M2_mpyd_nac_lh_s1>;
-def : T_PRR_pat <M2_mpyd_nac_ll_s1, int_hexagon_M2_mpyd_nac_ll_s1>;
-
-def : T_PRR_pat <M2_mpyud_acc_hh_s0, int_hexagon_M2_mpyud_acc_hh_s0>;
-def : T_PRR_pat <M2_mpyud_acc_hl_s0, int_hexagon_M2_mpyud_acc_hl_s0>;
-def : T_PRR_pat <M2_mpyud_acc_lh_s0, int_hexagon_M2_mpyud_acc_lh_s0>;
-def : T_PRR_pat <M2_mpyud_acc_ll_s0, int_hexagon_M2_mpyud_acc_ll_s0>;
-
-def : T_PRR_pat <M2_mpyud_acc_hh_s1, int_hexagon_M2_mpyud_acc_hh_s1>;
-def : T_PRR_pat <M2_mpyud_acc_hl_s1, int_hexagon_M2_mpyud_acc_hl_s1>;
-def : T_PRR_pat <M2_mpyud_acc_lh_s1, int_hexagon_M2_mpyud_acc_lh_s1>;
-def : T_PRR_pat <M2_mpyud_acc_ll_s1, int_hexagon_M2_mpyud_acc_ll_s1>;
-
-def : T_PRR_pat <M2_mpyud_nac_hh_s0, int_hexagon_M2_mpyud_nac_hh_s0>;
-def : T_PRR_pat <M2_mpyud_nac_hl_s0, int_hexagon_M2_mpyud_nac_hl_s0>;
-def : T_PRR_pat <M2_mpyud_nac_lh_s0, int_hexagon_M2_mpyud_nac_lh_s0>;
-def : T_PRR_pat <M2_mpyud_nac_ll_s0, int_hexagon_M2_mpyud_nac_ll_s0>;
-
-def : T_PRR_pat <M2_mpyud_nac_hh_s1, int_hexagon_M2_mpyud_nac_hh_s1>;
-def : T_PRR_pat <M2_mpyud_nac_hl_s1, int_hexagon_M2_mpyud_nac_hl_s1>;
-def : T_PRR_pat <M2_mpyud_nac_lh_s1, int_hexagon_M2_mpyud_nac_lh_s1>;
-def : T_PRR_pat <M2_mpyud_nac_ll_s1, int_hexagon_M2_mpyud_nac_ll_s1>;
-
-// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
-def : T_PP_pat <M2_vcmpy_s1_sat_i, int_hexagon_M2_vcmpy_s1_sat_i>;
-def : T_PP_pat <M2_vcmpy_s0_sat_i, int_hexagon_M2_vcmpy_s0_sat_i>;
-
-// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
-def : T_PP_pat <M2_vcmpy_s1_sat_r, int_hexagon_M2_vcmpy_s1_sat_r>;
-def : T_PP_pat <M2_vcmpy_s0_sat_r, int_hexagon_M2_vcmpy_s0_sat_r>;
-
-// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
-def : T_PP_pat <M2_vdmpys_s1, int_hexagon_M2_vdmpys_s1>;
-def : T_PP_pat <M2_vdmpys_s0, int_hexagon_M2_vdmpys_s0>;
-
-// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
-def : T_PP_pat <M2_vmpy2es_s1, int_hexagon_M2_vmpy2es_s1>;
-def : T_PP_pat <M2_vmpy2es_s0, int_hexagon_M2_vmpy2es_s0>;
-
-//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
-def : T_PP_pat <M2_mmpyh_s0,  int_hexagon_M2_mmpyh_s0>;
-def : T_PP_pat <M2_mmpyh_s1,  int_hexagon_M2_mmpyh_s1>;
-def : T_PP_pat <M2_mmpyh_rs0, int_hexagon_M2_mmpyh_rs0>;
-def : T_PP_pat <M2_mmpyh_rs1, int_hexagon_M2_mmpyh_rs1>;
-
-//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
-def : T_PP_pat <M2_mmpyl_s0,  int_hexagon_M2_mmpyl_s0>;
-def : T_PP_pat <M2_mmpyl_s1,  int_hexagon_M2_mmpyl_s1>;
-def : T_PP_pat <M2_mmpyl_rs0, int_hexagon_M2_mmpyl_rs0>;
-def : T_PP_pat <M2_mmpyl_rs1, int_hexagon_M2_mmpyl_rs1>;
-
-//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
-def : T_PP_pat <M2_mmpyuh_s0,  int_hexagon_M2_mmpyuh_s0>;
-def : T_PP_pat <M2_mmpyuh_s1,  int_hexagon_M2_mmpyuh_s1>;
-def : T_PP_pat <M2_mmpyuh_rs0, int_hexagon_M2_mmpyuh_rs0>;
-def : T_PP_pat <M2_mmpyuh_rs1, int_hexagon_M2_mmpyuh_rs1>;
-
-//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
-def : T_PP_pat <M2_mmpyul_s0,  int_hexagon_M2_mmpyul_s0>;
-def : T_PP_pat <M2_mmpyul_s1,  int_hexagon_M2_mmpyul_s1>;
-def : T_PP_pat <M2_mmpyul_rs0, int_hexagon_M2_mmpyul_rs0>;
-def : T_PP_pat <M2_mmpyul_rs1, int_hexagon_M2_mmpyul_rs1>;
-
-// Vector reduce add unsigned bytes: Rdd32[+]=vrmpybu(Rss32,Rtt32)
-def : T_PP_pat  <A2_vraddub,     int_hexagon_A2_vraddub>;
-def : T_PPP_pat <A2_vraddub_acc, int_hexagon_A2_vraddub_acc>;
-
-// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt)
-def : T_PP_pat  <A2_vrsadub,     int_hexagon_A2_vrsadub>;
-def : T_PPP_pat <A2_vrsadub_acc, int_hexagon_A2_vrsadub_acc>;
-
-// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
-def : T_PP_pat <M2_vabsdiffh, int_hexagon_M2_vabsdiffh>;
-
-// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
-def : T_PP_pat <M2_vabsdiffw, int_hexagon_M2_vabsdiffw>;
-
-// Vector reduce complex multiply real or imaginary:
-// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
-def : T_PP_pat  <M2_vrcmpyi_s0,  int_hexagon_M2_vrcmpyi_s0>;
-def : T_PP_pat  <M2_vrcmpyi_s0c, int_hexagon_M2_vrcmpyi_s0c>;
-def : T_PPP_pat <M2_vrcmaci_s0,  int_hexagon_M2_vrcmaci_s0>;
-def : T_PPP_pat <M2_vrcmaci_s0c, int_hexagon_M2_vrcmaci_s0c>;
-
-def : T_PP_pat  <M2_vrcmpyr_s0,  int_hexagon_M2_vrcmpyr_s0>;
-def : T_PP_pat  <M2_vrcmpyr_s0c, int_hexagon_M2_vrcmpyr_s0c>;
-def : T_PPP_pat <M2_vrcmacr_s0,  int_hexagon_M2_vrcmacr_s0>;
-def : T_PPP_pat <M2_vrcmacr_s0c, int_hexagon_M2_vrcmacr_s0c>;
-
-// Vector reduce halfwords
-// Rdd[+]=vrmpyh(Rss,Rtt)
-def : T_PP_pat  <M2_vrmpy_s0, int_hexagon_M2_vrmpy_s0>;
-def : T_PPP_pat <M2_vrmac_s0, int_hexagon_M2_vrmac_s0>;
-
-//===----------------------------------------------------------------------===//
-// Vector Multipy with accumulation
-//===----------------------------------------------------------------------===//
-
-// Vector multiply word by signed half with accumulation
-// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
-def : T_PPP_pat <M2_mmacls_s1, int_hexagon_M2_mmacls_s1>;
-def : T_PPP_pat <M2_mmacls_s0, int_hexagon_M2_mmacls_s0>;
-def : T_PPP_pat <M2_mmacls_rs1, int_hexagon_M2_mmacls_rs1>;
-def : T_PPP_pat <M2_mmacls_rs0, int_hexagon_M2_mmacls_rs0>;
-def : T_PPP_pat <M2_mmachs_s1, int_hexagon_M2_mmachs_s1>;
-def : T_PPP_pat <M2_mmachs_s0, int_hexagon_M2_mmachs_s0>;
-def : T_PPP_pat <M2_mmachs_rs1, int_hexagon_M2_mmachs_rs1>;
-def : T_PPP_pat <M2_mmachs_rs0, int_hexagon_M2_mmachs_rs0>;
-
-// Vector multiply word by unsigned half with accumulation
-// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
-def : T_PPP_pat <M2_mmaculs_s1, int_hexagon_M2_mmaculs_s1>;
-def : T_PPP_pat <M2_mmaculs_s0, int_hexagon_M2_mmaculs_s0>;
-def : T_PPP_pat <M2_mmaculs_rs1, int_hexagon_M2_mmaculs_rs1>;
-def : T_PPP_pat <M2_mmaculs_rs0, int_hexagon_M2_mmaculs_rs0>;
-def : T_PPP_pat <M2_mmacuhs_s1, int_hexagon_M2_mmacuhs_s1>;
-def : T_PPP_pat <M2_mmacuhs_s0, int_hexagon_M2_mmacuhs_s0>;
-def : T_PPP_pat <M2_mmacuhs_rs1, int_hexagon_M2_mmacuhs_rs1>;
-def : T_PPP_pat <M2_mmacuhs_rs0, int_hexagon_M2_mmacuhs_rs0>;
-
-// Vector multiply even halfwords with accumulation
-// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
-def : T_PPP_pat <M2_vmac2es, int_hexagon_M2_vmac2es>;
-def : T_PPP_pat <M2_vmac2es_s1, int_hexagon_M2_vmac2es_s1>;
-def : T_PPP_pat <M2_vmac2es_s0, int_hexagon_M2_vmac2es_s0>;
-
-// Vector dual multiply with accumulation
-// Rxx+=vdmpy(Rss,Rtt)[:sat]
-def : T_PPP_pat <M2_vdmacs_s1, int_hexagon_M2_vdmacs_s1>;
-def : T_PPP_pat <M2_vdmacs_s0, int_hexagon_M2_vdmacs_s0>;
-
-// Vector complex multiply real or imaginary with accumulation
-// Rxx+=vcmpy[ir](Rss,Rtt):sat
-def : T_PPP_pat <M2_vcmac_s0_sat_r, int_hexagon_M2_vcmac_s0_sat_r>;
-def : T_PPP_pat <M2_vcmac_s0_sat_i, int_hexagon_M2_vcmac_s0_sat_i>;
-
-//===----------------------------------------------------------------------===//
-// Add/Subtract halfword
-// Rd=add(Rt.L,Rs.[HL])[:sat]
-// Rd=sub(Rt.L,Rs.[HL])[:sat]
-// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16]
-// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16]
-//===----------------------------------------------------------------------===//
-
-//Rd=add(Rt.L,Rs.[LH])
-def : T_RR_pat <A2_addh_l16_ll,     int_hexagon_A2_addh_l16_ll>;
-def : T_RR_pat <A2_addh_l16_hl,     int_hexagon_A2_addh_l16_hl>;
-
-//Rd=add(Rt.L,Rs.[LH]):sat
-def : T_RR_pat <A2_addh_l16_sat_ll, int_hexagon_A2_addh_l16_sat_ll>;
-def : T_RR_pat <A2_addh_l16_sat_hl, int_hexagon_A2_addh_l16_sat_hl>;
-
-//Rd=sub(Rt.L,Rs.[LH])
-def : T_RR_pat <A2_subh_l16_ll,     int_hexagon_A2_subh_l16_ll>;
-def : T_RR_pat <A2_subh_l16_hl,     int_hexagon_A2_subh_l16_hl>;
-
-//Rd=sub(Rt.L,Rs.[LH]):sat
-def : T_RR_pat <A2_subh_l16_sat_ll, int_hexagon_A2_subh_l16_sat_ll>;
-def : T_RR_pat <A2_subh_l16_sat_hl, int_hexagon_A2_subh_l16_sat_hl>;
-
-//Rd=add(Rt.[LH],Rs.[LH]):<<16
-def : T_RR_pat <A2_addh_h16_ll,     int_hexagon_A2_addh_h16_ll>;
-def : T_RR_pat <A2_addh_h16_lh,     int_hexagon_A2_addh_h16_lh>;
-def : T_RR_pat <A2_addh_h16_hl,     int_hexagon_A2_addh_h16_hl>;
-def : T_RR_pat <A2_addh_h16_hh,     int_hexagon_A2_addh_h16_hh>;
-
-//Rd=sub(Rt.[LH],Rs.[LH]):<<16
-def : T_RR_pat <A2_subh_h16_ll,     int_hexagon_A2_subh_h16_ll>;
-def : T_RR_pat <A2_subh_h16_lh,     int_hexagon_A2_subh_h16_lh>;
-def : T_RR_pat <A2_subh_h16_hl,     int_hexagon_A2_subh_h16_hl>;
-def : T_RR_pat <A2_subh_h16_hh,     int_hexagon_A2_subh_h16_hh>;
-
-//Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
-def : T_RR_pat <A2_addh_h16_sat_ll, int_hexagon_A2_addh_h16_sat_ll>;
-def : T_RR_pat <A2_addh_h16_sat_lh, int_hexagon_A2_addh_h16_sat_lh>;
-def : T_RR_pat <A2_addh_h16_sat_hl, int_hexagon_A2_addh_h16_sat_hl>;
-def : T_RR_pat <A2_addh_h16_sat_hh, int_hexagon_A2_addh_h16_sat_hh>;
-
-//Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
-def : T_RR_pat <A2_subh_h16_sat_ll, int_hexagon_A2_subh_h16_sat_ll>;
-def : T_RR_pat <A2_subh_h16_sat_lh, int_hexagon_A2_subh_h16_sat_lh>;
-def : T_RR_pat <A2_subh_h16_sat_hl, int_hexagon_A2_subh_h16_sat_hl>;
-def : T_RR_pat <A2_subh_h16_sat_hh, int_hexagon_A2_subh_h16_sat_hh>;
-
-// ALU64 / ALU / min max
-def : T_RR_pat<A2_max,  int_hexagon_A2_max>;
-def : T_RR_pat<A2_min,  int_hexagon_A2_min>;
-def : T_RR_pat<A2_maxu, int_hexagon_A2_maxu>;
-def : T_RR_pat<A2_minu, int_hexagon_A2_minu>;
-
-// Shift and accumulate
-def : T_RRI_pat <S2_asr_i_r_nac,  int_hexagon_S2_asr_i_r_nac>;
-def : T_RRI_pat <S2_lsr_i_r_nac,  int_hexagon_S2_lsr_i_r_nac>;
-def : T_RRI_pat <S2_asl_i_r_nac,  int_hexagon_S2_asl_i_r_nac>;
-def : T_RRI_pat <S2_asr_i_r_acc,  int_hexagon_S2_asr_i_r_acc>;
-def : T_RRI_pat <S2_lsr_i_r_acc,  int_hexagon_S2_lsr_i_r_acc>;
-def : T_RRI_pat <S2_asl_i_r_acc,  int_hexagon_S2_asl_i_r_acc>;
-
-def : T_RRI_pat <S2_asr_i_r_and,  int_hexagon_S2_asr_i_r_and>;
-def : T_RRI_pat <S2_lsr_i_r_and,  int_hexagon_S2_lsr_i_r_and>;
-def : T_RRI_pat <S2_asl_i_r_and,  int_hexagon_S2_asl_i_r_and>;
-def : T_RRI_pat <S2_asr_i_r_or,   int_hexagon_S2_asr_i_r_or>;
-def : T_RRI_pat <S2_lsr_i_r_or,   int_hexagon_S2_lsr_i_r_or>;
-def : T_RRI_pat <S2_asl_i_r_or,   int_hexagon_S2_asl_i_r_or>;
-def : T_RRI_pat <S2_lsr_i_r_xacc, int_hexagon_S2_lsr_i_r_xacc>;
-def : T_RRI_pat <S2_asl_i_r_xacc, int_hexagon_S2_asl_i_r_xacc>;
-
-def : T_PPI_pat <S2_asr_i_p_nac,  int_hexagon_S2_asr_i_p_nac>;
-def : T_PPI_pat <S2_lsr_i_p_nac,  int_hexagon_S2_lsr_i_p_nac>;
-def : T_PPI_pat <S2_asl_i_p_nac,  int_hexagon_S2_asl_i_p_nac>;
-def : T_PPI_pat <S2_asr_i_p_acc,  int_hexagon_S2_asr_i_p_acc>;
-def : T_PPI_pat <S2_lsr_i_p_acc,  int_hexagon_S2_lsr_i_p_acc>;
-def : T_PPI_pat <S2_asl_i_p_acc,  int_hexagon_S2_asl_i_p_acc>;
-
-def : T_PPI_pat <S2_asr_i_p_and,  int_hexagon_S2_asr_i_p_and>;
-def : T_PPI_pat <S2_lsr_i_p_and,  int_hexagon_S2_lsr_i_p_and>;
-def : T_PPI_pat <S2_asl_i_p_and,  int_hexagon_S2_asl_i_p_and>;
-def : T_PPI_pat <S2_asr_i_p_or,   int_hexagon_S2_asr_i_p_or>;
-def : T_PPI_pat <S2_lsr_i_p_or,   int_hexagon_S2_lsr_i_p_or>;
-def : T_PPI_pat <S2_asl_i_p_or,   int_hexagon_S2_asl_i_p_or>;
-def : T_PPI_pat <S2_lsr_i_p_xacc, int_hexagon_S2_lsr_i_p_xacc>;
-def : T_PPI_pat <S2_asl_i_p_xacc, int_hexagon_S2_asl_i_p_xacc>;
-
-def : T_RRR_pat <S2_asr_r_r_nac,  int_hexagon_S2_asr_r_r_nac>;
-def : T_RRR_pat <S2_lsr_r_r_nac,  int_hexagon_S2_lsr_r_r_nac>;
-def : T_RRR_pat <S2_asl_r_r_nac,  int_hexagon_S2_asl_r_r_nac>;
-def : T_RRR_pat <S2_lsl_r_r_nac,  int_hexagon_S2_lsl_r_r_nac>;
-def : T_RRR_pat <S2_asr_r_r_acc,  int_hexagon_S2_asr_r_r_acc>;
-def : T_RRR_pat <S2_lsr_r_r_acc,  int_hexagon_S2_lsr_r_r_acc>;
-def : T_RRR_pat <S2_asl_r_r_acc,  int_hexagon_S2_asl_r_r_acc>;
-def : T_RRR_pat <S2_lsl_r_r_acc,  int_hexagon_S2_lsl_r_r_acc>;
-
-def : T_RRR_pat <S2_asr_r_r_and,  int_hexagon_S2_asr_r_r_and>;
-def : T_RRR_pat <S2_lsr_r_r_and,  int_hexagon_S2_lsr_r_r_and>;
-def : T_RRR_pat <S2_asl_r_r_and,  int_hexagon_S2_asl_r_r_and>;
-def : T_RRR_pat <S2_lsl_r_r_and,  int_hexagon_S2_lsl_r_r_and>;
-def : T_RRR_pat <S2_asr_r_r_or,   int_hexagon_S2_asr_r_r_or>;
-def : T_RRR_pat <S2_lsr_r_r_or,   int_hexagon_S2_lsr_r_r_or>;
-def : T_RRR_pat <S2_asl_r_r_or,   int_hexagon_S2_asl_r_r_or>;
-def : T_RRR_pat <S2_lsl_r_r_or,   int_hexagon_S2_lsl_r_r_or>;
-
-def : T_PPR_pat <S2_asr_r_p_nac,  int_hexagon_S2_asr_r_p_nac>;
-def : T_PPR_pat <S2_lsr_r_p_nac,  int_hexagon_S2_lsr_r_p_nac>;
-def : T_PPR_pat <S2_asl_r_p_nac,  int_hexagon_S2_asl_r_p_nac>;
-def : T_PPR_pat <S2_lsl_r_p_nac,  int_hexagon_S2_lsl_r_p_nac>;
-def : T_PPR_pat <S2_asr_r_p_acc,  int_hexagon_S2_asr_r_p_acc>;
-def : T_PPR_pat <S2_lsr_r_p_acc,  int_hexagon_S2_lsr_r_p_acc>;
-def : T_PPR_pat <S2_asl_r_p_acc,  int_hexagon_S2_asl_r_p_acc>;
-def : T_PPR_pat <S2_lsl_r_p_acc,  int_hexagon_S2_lsl_r_p_acc>;
-
-def : T_PPR_pat <S2_asr_r_p_and,  int_hexagon_S2_asr_r_p_and>;
-def : T_PPR_pat <S2_lsr_r_p_and,  int_hexagon_S2_lsr_r_p_and>;
-def : T_PPR_pat <S2_asl_r_p_and,  int_hexagon_S2_asl_r_p_and>;
-def : T_PPR_pat <S2_lsl_r_p_and,  int_hexagon_S2_lsl_r_p_and>;
-def : T_PPR_pat <S2_asr_r_p_or,   int_hexagon_S2_asr_r_p_or>;
-def : T_PPR_pat <S2_lsr_r_p_or,   int_hexagon_S2_lsr_r_p_or>;
-def : T_PPR_pat <S2_asl_r_p_or,   int_hexagon_S2_asl_r_p_or>;
-def : T_PPR_pat <S2_lsl_r_p_or,   int_hexagon_S2_lsl_r_p_or>;
-
-def : T_RRI_pat <S2_asr_i_r_nac,  int_hexagon_S2_asr_i_r_nac>;
-def : T_RRI_pat <S2_lsr_i_r_nac,  int_hexagon_S2_lsr_i_r_nac>;
-def : T_RRI_pat <S2_asl_i_r_nac,  int_hexagon_S2_asl_i_r_nac>;
-def : T_RRI_pat <S2_asr_i_r_acc,  int_hexagon_S2_asr_i_r_acc>;
-def : T_RRI_pat <S2_lsr_i_r_acc,  int_hexagon_S2_lsr_i_r_acc>;
-def : T_RRI_pat <S2_asl_i_r_acc,  int_hexagon_S2_asl_i_r_acc>;
-
-def : T_RRI_pat <S2_asr_i_r_and,  int_hexagon_S2_asr_i_r_and>;
-def : T_RRI_pat <S2_lsr_i_r_and,  int_hexagon_S2_lsr_i_r_and>;
-def : T_RRI_pat <S2_asl_i_r_and,  int_hexagon_S2_asl_i_r_and>;
-def : T_RRI_pat <S2_asr_i_r_or,   int_hexagon_S2_asr_i_r_or>;
-def : T_RRI_pat <S2_lsr_i_r_or,   int_hexagon_S2_lsr_i_r_or>;
-def : T_RRI_pat <S2_asl_i_r_or,   int_hexagon_S2_asl_i_r_or>;
-def : T_RRI_pat <S2_lsr_i_r_xacc, int_hexagon_S2_lsr_i_r_xacc>;
-def : T_RRI_pat <S2_asl_i_r_xacc, int_hexagon_S2_asl_i_r_xacc>;
-
-def : T_PPI_pat <S2_asr_i_p_nac,  int_hexagon_S2_asr_i_p_nac>;
-def : T_PPI_pat <S2_lsr_i_p_nac,  int_hexagon_S2_lsr_i_p_nac>;
-def : T_PPI_pat <S2_asl_i_p_nac,  int_hexagon_S2_asl_i_p_nac>;
-def : T_PPI_pat <S2_asr_i_p_acc,  int_hexagon_S2_asr_i_p_acc>;
-def : T_PPI_pat <S2_lsr_i_p_acc,  int_hexagon_S2_lsr_i_p_acc>;
-def : T_PPI_pat <S2_asl_i_p_acc,  int_hexagon_S2_asl_i_p_acc>;
-
-def : T_PPI_pat <S2_asr_i_p_and,  int_hexagon_S2_asr_i_p_and>;
-def : T_PPI_pat <S2_lsr_i_p_and,  int_hexagon_S2_lsr_i_p_and>;
-def : T_PPI_pat <S2_asl_i_p_and,  int_hexagon_S2_asl_i_p_and>;
-def : T_PPI_pat <S2_asr_i_p_or,   int_hexagon_S2_asr_i_p_or>;
-def : T_PPI_pat <S2_lsr_i_p_or,   int_hexagon_S2_lsr_i_p_or>;
-def : T_PPI_pat <S2_asl_i_p_or,   int_hexagon_S2_asl_i_p_or>;
-def : T_PPI_pat <S2_lsr_i_p_xacc, int_hexagon_S2_lsr_i_p_xacc>;
-def : T_PPI_pat <S2_asl_i_p_xacc, int_hexagon_S2_asl_i_p_xacc>;
-
-def : T_RRR_pat <S2_asr_r_r_nac,  int_hexagon_S2_asr_r_r_nac>;
-def : T_RRR_pat <S2_lsr_r_r_nac,  int_hexagon_S2_lsr_r_r_nac>;
-def : T_RRR_pat <S2_asl_r_r_nac,  int_hexagon_S2_asl_r_r_nac>;
-def : T_RRR_pat <S2_lsl_r_r_nac,  int_hexagon_S2_lsl_r_r_nac>;
-def : T_RRR_pat <S2_asr_r_r_acc,  int_hexagon_S2_asr_r_r_acc>;
-def : T_RRR_pat <S2_lsr_r_r_acc,  int_hexagon_S2_lsr_r_r_acc>;
-def : T_RRR_pat <S2_asl_r_r_acc,  int_hexagon_S2_asl_r_r_acc>;
-def : T_RRR_pat <S2_lsl_r_r_acc,  int_hexagon_S2_lsl_r_r_acc>;
-
-def : T_RRR_pat <S2_asr_r_r_and,  int_hexagon_S2_asr_r_r_and>;
-def : T_RRR_pat <S2_lsr_r_r_and,  int_hexagon_S2_lsr_r_r_and>;
-def : T_RRR_pat <S2_asl_r_r_and,  int_hexagon_S2_asl_r_r_and>;
-def : T_RRR_pat <S2_lsl_r_r_and,  int_hexagon_S2_lsl_r_r_and>;
-def : T_RRR_pat <S2_asr_r_r_or,   int_hexagon_S2_asr_r_r_or>;
-def : T_RRR_pat <S2_lsr_r_r_or,   int_hexagon_S2_lsr_r_r_or>;
-def : T_RRR_pat <S2_asl_r_r_or,   int_hexagon_S2_asl_r_r_or>;
-def : T_RRR_pat <S2_lsl_r_r_or,   int_hexagon_S2_lsl_r_r_or>;
-
-def : T_PPR_pat <S2_asr_r_p_nac,  int_hexagon_S2_asr_r_p_nac>;
-def : T_PPR_pat <S2_lsr_r_p_nac,  int_hexagon_S2_lsr_r_p_nac>;
-def : T_PPR_pat <S2_asl_r_p_nac,  int_hexagon_S2_asl_r_p_nac>;
-def : T_PPR_pat <S2_lsl_r_p_nac,  int_hexagon_S2_lsl_r_p_nac>;
-def : T_PPR_pat <S2_asr_r_p_acc,  int_hexagon_S2_asr_r_p_acc>;
-def : T_PPR_pat <S2_lsr_r_p_acc,  int_hexagon_S2_lsr_r_p_acc>;
-def : T_PPR_pat <S2_asl_r_p_acc,  int_hexagon_S2_asl_r_p_acc>;
-def : T_PPR_pat <S2_lsl_r_p_acc,  int_hexagon_S2_lsl_r_p_acc>;
-
-def : T_PPR_pat <S2_asr_r_p_and,  int_hexagon_S2_asr_r_p_and>;
-def : T_PPR_pat <S2_lsr_r_p_and,  int_hexagon_S2_lsr_r_p_and>;
-def : T_PPR_pat <S2_asl_r_p_and,  int_hexagon_S2_asl_r_p_and>;
-def : T_PPR_pat <S2_lsl_r_p_and,  int_hexagon_S2_lsl_r_p_and>;
-def : T_PPR_pat <S2_asr_r_p_or,   int_hexagon_S2_asr_r_p_or>;
-def : T_PPR_pat <S2_lsr_r_p_or,   int_hexagon_S2_lsr_r_p_or>;
-def : T_PPR_pat <S2_asl_r_p_or,   int_hexagon_S2_asl_r_p_or>;
-def : T_PPR_pat <S2_lsl_r_p_or,   int_hexagon_S2_lsl_r_p_or>;
-
-//*******************************************************************
-//           ALU32/ALU
-//*******************************************************************
-def : T_RR_pat<A2_add,      int_hexagon_A2_add>;
-def : T_RI_pat<A2_addi,     int_hexagon_A2_addi>;
-def : T_RR_pat<A2_sub,      int_hexagon_A2_sub>;
-def : T_IR_pat<A2_subri,    int_hexagon_A2_subri>;
-def : T_RR_pat<A2_and,      int_hexagon_A2_and>;
-def : T_RI_pat<A2_andir,    int_hexagon_A2_andir>;
-def : T_RR_pat<A2_or,       int_hexagon_A2_or>;
-def : T_RI_pat<A2_orir,     int_hexagon_A2_orir>;
-def : T_RR_pat<A2_xor,      int_hexagon_A2_xor>;
-def : T_RR_pat<A2_combinew, int_hexagon_A2_combinew>;
+         (MI I32:$Rs, I64:$Rt)>;
+
+def: Pat<(int_hexagon_A2_add IntRegs:$Rs, IntRegs:$Rt),
+         (A2_add IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(int_hexagon_A2_addi IntRegs:$Rs, imm:$s16),
+         (A2_addi IntRegs:$Rs, imm:$s16)>;
+def: Pat<(int_hexagon_A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt),
+         (A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+
+def: Pat<(int_hexagon_A2_sub IntRegs:$Rs, IntRegs:$Rt),
+         (A2_sub IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(int_hexagon_A2_subri imm:$s10, IntRegs:$Rs),
+         (A2_subri imm:$s10, IntRegs:$Rs)>;
+def: Pat<(int_hexagon_A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt),
+         (A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+
+def: Pat<(int_hexagon_M2_mpyi IntRegs:$Rs, IntRegs:$Rt),
+         (M2_mpyi IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(int_hexagon_M2_mpyui IntRegs:$Rs, IntRegs:$Rt), // Same as M2_mpyi
+         (M2_mpyi IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(int_hexagon_M2_mpysmi IntRegs:$Rs, imm:$s9),
+         (M2_mpysmi IntRegs:$Rs, imm:$s9)>;
+def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$Rs, IntRegs:$Rt),
+         (M2_dpmpyss_s0 IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt),
+         (M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$Rs, imm:$u5),
+         (S2_asl_i_r IntRegs:$Rs, imm:$u5)>;
+def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$Rs, imm:$u5),
+         (S2_lsr_i_r IntRegs:$Rs, imm:$u5)>;
+def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$Rs, imm:$u5),
+         (S2_asr_i_r IntRegs:$Rs, imm:$u5)>;
+def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$Rs, imm:$u6),
+         (S2_asl_i_p DoubleRegs:$Rs, imm:$u6)>;
+def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$Rs, imm:$u6),
+         (S2_lsr_i_p DoubleRegs:$Rs, imm:$u6)>;
+def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$Rs, imm:$u6),
+         (S2_asr_i_p DoubleRegs:$Rs, imm:$u6)>;
+
+def: Pat<(int_hexagon_A2_and IntRegs:$Rs, IntRegs:$Rt),
+         (A2_and IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(int_hexagon_A2_andir IntRegs:$Rs, imm:$s10),
+         (A2_andir IntRegs:$Rs, imm:$s10)>;
+def: Pat<(int_hexagon_A2_or IntRegs:$Rs, IntRegs:$Rt),
+         (A2_or IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(int_hexagon_A2_orir IntRegs:$Rs, imm:$s10),
+         (A2_orir IntRegs:$Rs, imm:$s10)>;
+def: Pat<(int_hexagon_A2_xor IntRegs:$Rs, IntRegs:$Rt),
+         (A2_xor IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(int_hexagon_A2_sxtb IntRegs:$Rs),
+         (A2_sxtb IntRegs:$Rs)>;
+def: Pat<(int_hexagon_A2_sxth IntRegs:$Rs),
+         (A2_sxth IntRegs:$Rs)>;
+def: Pat<(int_hexagon_A2_zxtb IntRegs:$Rs),
+         (A2_zxtb IntRegs:$Rs)>;
+def: Pat<(int_hexagon_A2_zxth IntRegs:$Rs),
+         (A2_zxth IntRegs:$Rs)>;
 
 // Assembler mapped from Rd32=not(Rs32) to Rd32=sub(#-1,Rs32)
 def : Pat <(int_hexagon_A2_not I32:$Rs),
@@ -757,16 +109,6 @@ def : Pat <(int_hexagon_S5_vasrhrnd_goodsyntax I64:$Rs, u4_0ImmPred:$imm),
 def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, u4_0ImmPred:$imm),
            (S5_asrhub_rnd_sat I64:$Rs, (UDEC1 u4_0ImmPred:$imm))>;
 
-// Transfer immediate
-def  : Pat <(int_hexagon_A2_tfril I32:$Rs, u16_0ImmPred:$Is),
-            (A2_tfril I32:$Rs, u16_0ImmPred:$Is)>;
-def  : Pat <(int_hexagon_A2_tfrih I32:$Rs, u16_0ImmPred:$Is),
-            (A2_tfrih I32:$Rs, u16_0ImmPred:$Is)>;
-
-//  Transfer Register/immediate.
-def : T_R_pat <A2_tfr, int_hexagon_A2_tfr>;
-def : T_I_pat <A2_tfrsi, int_hexagon_A2_tfrsi>;
-
 def ImmExt64: SDNodeXForm<imm, [{
   int64_t V = N->getSExtValue();
   return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i64);
@@ -783,49 +125,6 @@ def ImmExt64: SDNodeXForm<imm, [{
 def : Pat<(int_hexagon_A2_tfrpi imm:$Is),
           (A2_tfrpi (ImmExt64 $Is))>;
 
-// Assembler mapped from Rdd32=Rss32 to Rdd32=combine(Rss.H32,Rss.L32)
-def : Pat<(int_hexagon_A2_tfrp I64:$src),
-          (A2_combinew (HiReg I64:$src), (LoReg I64:$src))>;
-
-//*******************************************************************
-//           ALU32/PERM
-//*******************************************************************
-// Combine
-def: T_RR_pat<A2_combine_hh, int_hexagon_A2_combine_hh>;
-def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>;
-def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>;
-def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>;
-
-def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32_0ImmPred, s8_0ImmPred>;
-
-// Mux
-def : T_QRR_pat<C2_mux,   int_hexagon_C2_mux>;
-def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32_0ImmPred>;
-def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32_0ImmPred>;
-def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32_0ImmPred, s8_0ImmPred>;
-
-// Shift halfword
-def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>;
-def : T_R_pat<A2_asrh, int_hexagon_A2_asrh>;
-
-// Sign/zero extend
-def : T_R_pat<A2_sxth, int_hexagon_A2_sxth>;
-def : T_R_pat<A2_sxtb, int_hexagon_A2_sxtb>;
-def : T_R_pat<A2_zxth, int_hexagon_A2_zxth>;
-def : T_R_pat<A2_zxtb, int_hexagon_A2_zxtb>;
-
-//*******************************************************************
-//           ALU32/PRED
-//*******************************************************************
-// Compare
-def : T_Q_RR_pat<C2_cmpeq,  int_hexagon_C2_cmpeq>;
-def : T_Q_RR_pat<C2_cmpgt,  int_hexagon_C2_cmpgt>;
-def : T_Q_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>;
-
-def : T_Q_RI_pat<C2_cmpeqi,  int_hexagon_C2_cmpeqi, s32_0ImmPred>;
-def : T_Q_RI_pat<C2_cmpgti,  int_hexagon_C2_cmpgti, s32_0ImmPred>;
-def : T_Q_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32_0ImmPred>;
-
 def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred:$src2),
            (C2_tfrpr (C2_cmpgti I32:$src1, (SDEC1 s32_0ImmPred:$src2)))>;
 
@@ -839,420 +138,6 @@ def : Pat <(int_hexagon_C2_cmplt I32:$src1, I32:$src2),
 def : Pat <(int_hexagon_C2_cmpltu I32:$src1, I32:$src2),
            (C2_tfrpr (C2_cmpgtu I32:$src2, I32:$src1))>;
 
-//*******************************************************************
-//           ALU32/VH
-//*******************************************************************
-// Vector add, subtract, average halfwords
-def: T_RR_pat<A2_svaddh,   int_hexagon_A2_svaddh>;
-def: T_RR_pat<A2_svaddhs,  int_hexagon_A2_svaddhs>;
-def: T_RR_pat<A2_svadduhs, int_hexagon_A2_svadduhs>;
-
-def: T_RR_pat<A2_svsubh,   int_hexagon_A2_svsubh>;
-def: T_RR_pat<A2_svsubhs,  int_hexagon_A2_svsubhs>;
-def: T_RR_pat<A2_svsubuhs, int_hexagon_A2_svsubuhs>;
-
-def: T_RR_pat<A2_svavgh,   int_hexagon_A2_svavgh>;
-def: T_RR_pat<A2_svavghs,  int_hexagon_A2_svavghs>;
-def: T_RR_pat<A2_svnavgh,  int_hexagon_A2_svnavgh>;
-
-//*******************************************************************
-//           ALU64/ALU
-//*******************************************************************
-def: T_RR_pat<A2_addsat,     int_hexagon_A2_addsat>;
-def: T_RR_pat<A2_subsat,     int_hexagon_A2_subsat>;
-def: T_PP_pat<A2_addp,       int_hexagon_A2_addp>;
-def: T_PP_pat<A2_subp,       int_hexagon_A2_subp>;
-
-def: T_PP_pat<A2_andp,       int_hexagon_A2_andp>;
-def: T_PP_pat<A2_orp,        int_hexagon_A2_orp>;
-def: T_PP_pat<A2_xorp,       int_hexagon_A2_xorp>;
-
-def: T_Q_PP_pat<C2_cmpeqp,   int_hexagon_C2_cmpeqp>;
-def: T_Q_PP_pat<C2_cmpgtp,   int_hexagon_C2_cmpgtp>;
-def: T_Q_PP_pat<C2_cmpgtup,  int_hexagon_C2_cmpgtup>;
-
-def: T_PP_pat<S2_parityp,    int_hexagon_S2_parityp>;
-def: T_RR_pat<S2_packhl,     int_hexagon_S2_packhl>;
-
-//*******************************************************************
-//           ALU64/VB
-//*******************************************************************
-// ALU64 - Vector add
-def : T_PP_pat <A2_vaddub,   int_hexagon_A2_vaddub>;
-def : T_PP_pat <A2_vaddubs,  int_hexagon_A2_vaddubs>;
-def : T_PP_pat <A2_vaddh,    int_hexagon_A2_vaddh>;
-def : T_PP_pat <A2_vaddhs,   int_hexagon_A2_vaddhs>;
-def : T_PP_pat <A2_vadduhs,  int_hexagon_A2_vadduhs>;
-def : T_PP_pat <A2_vaddw,    int_hexagon_A2_vaddw>;
-def : T_PP_pat <A2_vaddws,   int_hexagon_A2_vaddws>;
-
-// ALU64 - Vector average
-def : T_PP_pat <A2_vavgub,   int_hexagon_A2_vavgub>;
-def : T_PP_pat <A2_vavgubr,  int_hexagon_A2_vavgubr>;
-def : T_PP_pat <A2_vavgh,    int_hexagon_A2_vavgh>;
-def : T_PP_pat <A2_vavghr,   int_hexagon_A2_vavghr>;
-def : T_PP_pat <A2_vavghcr,  int_hexagon_A2_vavghcr>;
-def : T_PP_pat <A2_vavguh,   int_hexagon_A2_vavguh>;
-def : T_PP_pat <A2_vavguhr,  int_hexagon_A2_vavguhr>;
-
-def : T_PP_pat <A2_vavgw,    int_hexagon_A2_vavgw>;
-def : T_PP_pat <A2_vavgwr,   int_hexagon_A2_vavgwr>;
-def : T_PP_pat <A2_vavgwcr,  int_hexagon_A2_vavgwcr>;
-def : T_PP_pat <A2_vavguw,   int_hexagon_A2_vavguw>;
-def : T_PP_pat <A2_vavguwr,  int_hexagon_A2_vavguwr>;
-
-// ALU64 - Vector negative average
-def : T_PP_pat <A2_vnavgh,   int_hexagon_A2_vnavgh>;
-def : T_PP_pat <A2_vnavghr,  int_hexagon_A2_vnavghr>;
-def : T_PP_pat <A2_vnavghcr, int_hexagon_A2_vnavghcr>;
-def : T_PP_pat <A2_vnavgw,   int_hexagon_A2_vnavgw>;
-def : T_PP_pat <A2_vnavgwr,  int_hexagon_A2_vnavgwr>;
-def : T_PP_pat <A2_vnavgwcr, int_hexagon_A2_vnavgwcr>;
-
-// ALU64 - Vector max
-def : T_PP_pat <A2_vmaxh,    int_hexagon_A2_vmaxh>;
-def : T_PP_pat <A2_vmaxw,    int_hexagon_A2_vmaxw>;
-def : T_PP_pat <A2_vmaxub,   int_hexagon_A2_vmaxub>;
-def : T_PP_pat <A2_vmaxuh,   int_hexagon_A2_vmaxuh>;
-def : T_PP_pat <A2_vmaxuw,   int_hexagon_A2_vmaxuw>;
-
-// ALU64 - Vector min
-def : T_PP_pat <A2_vminh,    int_hexagon_A2_vminh>;
-def : T_PP_pat <A2_vminw,    int_hexagon_A2_vminw>;
-def : T_PP_pat <A2_vminub,   int_hexagon_A2_vminub>;
-def : T_PP_pat <A2_vminuh,   int_hexagon_A2_vminuh>;
-def : T_PP_pat <A2_vminuw,   int_hexagon_A2_vminuw>;
-
-// ALU64 - Vector sub
-def : T_PP_pat <A2_vsubub,   int_hexagon_A2_vsubub>;
-def : T_PP_pat <A2_vsububs,  int_hexagon_A2_vsububs>;
-def : T_PP_pat <A2_vsubh,    int_hexagon_A2_vsubh>;
-def : T_PP_pat <A2_vsubhs,   int_hexagon_A2_vsubhs>;
-def : T_PP_pat <A2_vsubuhs,  int_hexagon_A2_vsubuhs>;
-def : T_PP_pat <A2_vsubw,    int_hexagon_A2_vsubw>;
-def : T_PP_pat <A2_vsubws,   int_hexagon_A2_vsubws>;
-
-// ALU64 - Vector compare bytes
-def : T_Q_PP_pat <A2_vcmpbeq,  int_hexagon_A2_vcmpbeq>;
-def : T_Q_PP_pat <A4_vcmpbgt,  int_hexagon_A4_vcmpbgt>;
-def : T_Q_PP_pat <A2_vcmpbgtu, int_hexagon_A2_vcmpbgtu>;
-
-// ALU64 - Vector compare halfwords
-def : T_Q_PP_pat <A2_vcmpheq,  int_hexagon_A2_vcmpheq>;
-def : T_Q_PP_pat <A2_vcmphgt,  int_hexagon_A2_vcmphgt>;
-def : T_Q_PP_pat <A2_vcmphgtu, int_hexagon_A2_vcmphgtu>;
-
-// ALU64 - Vector compare words
-def : T_Q_PP_pat <A2_vcmpweq,  int_hexagon_A2_vcmpweq>;
-def : T_Q_PP_pat <A2_vcmpwgt,  int_hexagon_A2_vcmpwgt>;
-def : T_Q_PP_pat <A2_vcmpwgtu, int_hexagon_A2_vcmpwgtu>;
-
-// ALU64 / VB / Vector mux.
-def : T_QPP_pat <C2_vmux,      int_hexagon_C2_vmux>;
-
-// MPY - Multiply and use full result
-// Rdd = mpy[u](Rs, Rt)
-def : T_RR_pat <M2_dpmpyss_s0, int_hexagon_M2_dpmpyss_s0>;
-def : T_RR_pat <M2_dpmpyuu_s0, int_hexagon_M2_dpmpyuu_s0>;
-
-// Complex multiply real or imaginary
-def : T_RR_pat <M2_cmpyi_s0,   int_hexagon_M2_cmpyi_s0>;
-def : T_RR_pat <M2_cmpyr_s0,   int_hexagon_M2_cmpyr_s0>;
-
-// Complex multiply
-def : T_RR_pat <M2_cmpys_s0,   int_hexagon_M2_cmpys_s0>;
-def : T_RR_pat <M2_cmpysc_s0,  int_hexagon_M2_cmpysc_s0>;
-def : T_RR_pat <M2_cmpys_s1,   int_hexagon_M2_cmpys_s1>;
-def : T_RR_pat <M2_cmpysc_s1,  int_hexagon_M2_cmpysc_s1>;
-
-// Vector multiply halfwords
-// Rdd=vmpyh(Rs,Rt)[:<<1]:sat
-def : T_RR_pat <M2_vmpy2s_s0,  int_hexagon_M2_vmpy2s_s0>;
-def : T_RR_pat <M2_vmpy2s_s1,  int_hexagon_M2_vmpy2s_s1>;
-
-// Rxx[+-]= mpy[u](Rs,Rt)
-def : T_PRR_pat <M2_dpmpyss_acc_s0, int_hexagon_M2_dpmpyss_acc_s0>;
-def : T_PRR_pat <M2_dpmpyss_nac_s0, int_hexagon_M2_dpmpyss_nac_s0>;
-def : T_PRR_pat <M2_dpmpyuu_acc_s0, int_hexagon_M2_dpmpyuu_acc_s0>;
-def : T_PRR_pat <M2_dpmpyuu_nac_s0, int_hexagon_M2_dpmpyuu_nac_s0>;
-
-// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
-def : T_PRR_pat <M2_cmacs_s0, int_hexagon_M2_cmacs_s0>;
-def : T_PRR_pat <M2_cnacs_s0, int_hexagon_M2_cnacs_s0>;
-def : T_PRR_pat <M2_cmacs_s1, int_hexagon_M2_cmacs_s1>;
-def : T_PRR_pat <M2_cnacs_s1, int_hexagon_M2_cnacs_s1>;
-
-// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
-def : T_PRR_pat <M2_cmacsc_s0, int_hexagon_M2_cmacsc_s0>;
-def : T_PRR_pat <M2_cnacsc_s0, int_hexagon_M2_cnacsc_s0>;
-def : T_PRR_pat <M2_cmacsc_s1, int_hexagon_M2_cmacsc_s1>;
-def : T_PRR_pat <M2_cnacsc_s1, int_hexagon_M2_cnacsc_s1>;
-
-// Rxx+=cmpy[ir](Rs,Rt)
-def : T_PRR_pat <M2_cmaci_s0, int_hexagon_M2_cmaci_s0>;
-def : T_PRR_pat <M2_cmacr_s0, int_hexagon_M2_cmacr_s0>;
-
-// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
-def : T_PRR_pat <M2_vmac2, int_hexagon_M2_vmac2>;
-def : T_PRR_pat <M2_vmac2s_s0, int_hexagon_M2_vmac2s_s0>;
-def : T_PRR_pat <M2_vmac2s_s1, int_hexagon_M2_vmac2s_s1>;
-
-//*******************************************************************
-//           CR
-//*******************************************************************
-def: T_Q_Q_pat<C2_not,       int_hexagon_C2_not>;
-def: T_Q_Q_pat<C2_all8,      int_hexagon_C2_all8>;
-def: T_Q_Q_pat<C2_any8,      int_hexagon_C2_any8>;
-def: T_Q_Q_pat<C2_pxfer_map, int_hexagon_C2_pxfer_map>;
-
-def: T_Q_QQ_pat<C2_and,      int_hexagon_C2_and>;
-def: T_Q_QQ_pat<C2_andn,     int_hexagon_C2_andn>;
-def: T_Q_QQ_pat<C2_or,       int_hexagon_C2_or>;
-def: T_Q_QQ_pat<C2_orn,      int_hexagon_C2_orn>;
-def: T_Q_QQ_pat<C2_xor,      int_hexagon_C2_xor>;
-
-// Multiply 32x32 and use lower result
-def : T_RRI_pat <M2_macsip, int_hexagon_M2_macsip>;
-def : T_RRI_pat <M2_macsin, int_hexagon_M2_macsin>;
-def : T_RRR_pat <M2_maci,   int_hexagon_M2_maci>;
-
-// Subtract and accumulate
-def : T_RRR_pat <M2_subacc, int_hexagon_M2_subacc>;
-
-// Add and accumulate
-def : T_RRR_pat <M2_acci,   int_hexagon_M2_acci>;
-def : T_RRR_pat <M2_nacci,  int_hexagon_M2_nacci>;
-def : T_RRI_pat <M2_accii,  int_hexagon_M2_accii>;
-def : T_RRI_pat <M2_naccii, int_hexagon_M2_naccii>;
-
-// XOR and XOR with destination
-def : T_RRR_pat <M2_xor_xacc, int_hexagon_M2_xor_xacc>;
-
-// Vector dual multiply with round and pack
-def : T_PP_pat <M2_vdmpyrs_s0, int_hexagon_M2_vdmpyrs_s0>;
-def : T_PP_pat <M2_vdmpyrs_s1, int_hexagon_M2_vdmpyrs_s1>;
-
-// Vector multiply halfwords with round and pack
-def : T_RR_pat <M2_vmpy2s_s0pack, int_hexagon_M2_vmpy2s_s0pack>;
-def : T_RR_pat <M2_vmpy2s_s1pack, int_hexagon_M2_vmpy2s_s1pack>;
-
-// Multiply and use lower result
-def : T_RR_pat <M2_mpyi, int_hexagon_M2_mpyi>;
-def : T_RI_pat <M2_mpysmi, int_hexagon_M2_mpysmi>;
-
-// Assembler mapped from Rd32=mpyui(Rs32,Rt32) to Rd32=mpyi(Rs32,Rt32)
-def : T_RR_pat <M2_mpyi, int_hexagon_M2_mpyui>;
-
-// Multiply and use upper result
-def : T_RR_pat <M2_mpy_up, int_hexagon_M2_mpy_up>;
-def : T_RR_pat <M2_mpyu_up, int_hexagon_M2_mpyu_up>;
-def : T_RR_pat <M2_hmmpyh_rs1, int_hexagon_M2_hmmpyh_rs1>;
-def : T_RR_pat <M2_hmmpyl_rs1, int_hexagon_M2_hmmpyl_rs1>;
-def : T_RR_pat <M2_dpmpyss_rnd_s0, int_hexagon_M2_dpmpyss_rnd_s0>;
-
-// Complex multiply with round and pack
-// Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat
-def : T_RR_pat <M2_cmpyrs_s0, int_hexagon_M2_cmpyrs_s0>;
-def : T_RR_pat <M2_cmpyrs_s1, int_hexagon_M2_cmpyrs_s1>;
-def : T_RR_pat <M2_cmpyrsc_s0, int_hexagon_M2_cmpyrsc_s0>;
-def : T_RR_pat <M2_cmpyrsc_s1, int_hexagon_M2_cmpyrsc_s1>;
-
-//*******************************************************************
-//           STYPE/ALU
-//*******************************************************************
-def : T_P_pat <A2_absp, int_hexagon_A2_absp>;
-def : T_P_pat <A2_negp, int_hexagon_A2_negp>;
-def : T_P_pat <A2_notp, int_hexagon_A2_notp>;
-
-//*******************************************************************
-//           STYPE/BIT
-//*******************************************************************
-
-// Count leading/trailing
-def: T_R_pat<S2_cl0,     int_hexagon_S2_cl0>;
-def: T_P_pat<S2_cl0p,    int_hexagon_S2_cl0p>;
-def: T_R_pat<S2_cl1,     int_hexagon_S2_cl1>;
-def: T_P_pat<S2_cl1p,    int_hexagon_S2_cl1p>;
-def: T_R_pat<S2_clb,     int_hexagon_S2_clb>;
-def: T_P_pat<S2_clbp,    int_hexagon_S2_clbp>;
-def: T_R_pat<S2_clbnorm, int_hexagon_S2_clbnorm>;
-def: T_R_pat<S2_ct0,     int_hexagon_S2_ct0>;
-def: T_R_pat<S2_ct1,     int_hexagon_S2_ct1>;
-
-// Compare bit mask
-def: T_RR_pat<C2_bitsclr,  int_hexagon_C2_bitsclr>;
-def: T_RI_pat<C2_bitsclri, int_hexagon_C2_bitsclri>;
-def: T_RR_pat<C2_bitsset,  int_hexagon_C2_bitsset>;
-
-// Vector shuffle
-def : T_PP_pat <S2_shuffeb, int_hexagon_S2_shuffeb>;
-def : T_PP_pat <S2_shuffob, int_hexagon_S2_shuffob>;
-def : T_PP_pat <S2_shuffeh, int_hexagon_S2_shuffeh>;
-def : T_PP_pat <S2_shuffoh, int_hexagon_S2_shuffoh>;
-
-// Vector truncate
-def : T_PP_pat <S2_vtrunewh, int_hexagon_S2_vtrunewh>;
-def : T_PP_pat <S2_vtrunowh, int_hexagon_S2_vtrunowh>;
-
-// Linear feedback-shift Iteration.
-def : T_PP_pat <S2_lfsp, int_hexagon_S2_lfsp>;
-
-// Vector align
-// Need custom lowering
-def : T_PPQ_pat <S2_valignrb, int_hexagon_S2_valignrb>;
-def : T_PPI_pat <S2_valignib, int_hexagon_S2_valignib>;
-
-// Vector splice
-def : T_PPQ_pat <S2_vsplicerb, int_hexagon_S2_vsplicerb>;
-def : T_PPI_pat <S2_vspliceib, int_hexagon_S2_vspliceib>;
-
-// Shift by immediate and add
-def : T_RRI_pat<S2_addasl_rrri, int_hexagon_S2_addasl_rrri>;
-
-// Extract bitfield
-def : T_PII_pat<S2_extractup,    int_hexagon_S2_extractup>;
-def : T_RII_pat<S2_extractu,     int_hexagon_S2_extractu>;
-def : T_RP_pat <S2_extractu_rp,  int_hexagon_S2_extractu_rp>;
-def : T_PP_pat <S2_extractup_rp, int_hexagon_S2_extractup_rp>;
-
-// Insert bitfield
-def : Pat <(int_hexagon_S2_insert_rp I32:$src1, I32:$src2, I64:$src3),
-           (S2_insert_rp I32:$src1, I32:$src2, I64:$src3)>;
-
-def : Pat<(i64 (int_hexagon_S2_insertp_rp I64:$src1, I64:$src2, I64:$src3)),
-          (i64 (S2_insertp_rp I64:$src1, I64:$src2, I64:$src3))>;
-
-def : Pat<(int_hexagon_S2_insert I32:$src1, I32:$src2,
-                                 u5_0ImmPred:$src3, u5_0ImmPred:$src4),
-          (S2_insert I32:$src1, I32:$src2,
-                     u5_0ImmPred:$src3, u5_0ImmPred:$src4)>;
-
-def : Pat<(i64 (int_hexagon_S2_insertp I64:$src1, I64:$src2,
-                                       u6_0ImmPred:$src3, u6_0ImmPred:$src4)),
-          (i64 (S2_insertp I64:$src1, I64:$src2,
-                           u6_0ImmPred:$src3, u6_0ImmPred:$src4))>;
-
-// Innterleave/deinterleave
-def : T_P_pat <S2_interleave, int_hexagon_S2_interleave>;
-def : T_P_pat <S2_deinterleave, int_hexagon_S2_deinterleave>;
-
-// Set/Clear/Toggle Bit
-def: T_RI_pat<S2_setbit_i,    int_hexagon_S2_setbit_i>;
-def: T_RI_pat<S2_clrbit_i,    int_hexagon_S2_clrbit_i>;
-def: T_RI_pat<S2_togglebit_i, int_hexagon_S2_togglebit_i>;
-
-def: T_RR_pat<S2_setbit_r,    int_hexagon_S2_setbit_r>;
-def: T_RR_pat<S2_clrbit_r,    int_hexagon_S2_clrbit_r>;
-def: T_RR_pat<S2_togglebit_r, int_hexagon_S2_togglebit_r>;
-
-// Test Bit
-def: T_Q_RI_pat<S2_tstbit_i,  int_hexagon_S2_tstbit_i>;
-def: T_Q_RR_pat<S2_tstbit_r,  int_hexagon_S2_tstbit_r>;
-
-//*******************************************************************
-//           STYPE/COMPLEX
-//*******************************************************************
-// Vector Complex conjugate
-def : T_P_pat <A2_vconj, int_hexagon_A2_vconj>;
-
-// Vector Complex rotate
-def : T_PR_pat <S2_vcrotate, int_hexagon_S2_vcrotate>;
-
-//*******************************************************************
-//           STYPE/PERM
-//*******************************************************************
-
-// Vector saturate without pack
-def : T_P_pat <S2_vsathb_nopack, int_hexagon_S2_vsathb_nopack>;
-def : T_P_pat <S2_vsathub_nopack, int_hexagon_S2_vsathub_nopack>;
-def : T_P_pat <S2_vsatwh_nopack, int_hexagon_S2_vsatwh_nopack>;
-def : T_P_pat <S2_vsatwuh_nopack, int_hexagon_S2_vsatwuh_nopack>;
-
-//*******************************************************************
-//           STYPE/PRED
-//*******************************************************************
-
-// Predicate transfer
-def: Pat<(i32 (int_hexagon_C2_tfrpr I32:$Rs)),
-         (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>;
-def: Pat<(i32 (int_hexagon_C2_tfrrp I32:$Rs)),
-         (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>;
-
-// Mask generate from predicate
-def: Pat<(i64 (int_hexagon_C2_mask I32:$Rs)),
-         (i64 (C2_mask (C2_tfrrp I32:$Rs)))>;
-
-// Viterbi pack even and odd predicate bits
-def: T_QQ_pat<C2_vitpack, int_hexagon_C2_vitpack>;
-
-//*******************************************************************
-//           STYPE/SHIFT
-//*******************************************************************
-
-def : T_PI_pat <S2_asr_i_p, int_hexagon_S2_asr_i_p>;
-def : T_PI_pat <S2_lsr_i_p, int_hexagon_S2_lsr_i_p>;
-def : T_PI_pat <S2_asl_i_p, int_hexagon_S2_asl_i_p>;
-
-def : T_PR_pat <S2_asr_r_p, int_hexagon_S2_asr_r_p>;
-def : T_PR_pat <S2_lsr_r_p, int_hexagon_S2_lsr_r_p>;
-def : T_PR_pat <S2_asl_r_p, int_hexagon_S2_asl_r_p>;
-def : T_PR_pat <S2_lsl_r_p, int_hexagon_S2_lsl_r_p>;
-
-def : T_RR_pat <S2_asr_r_r, int_hexagon_S2_asr_r_r>;
-def : T_RR_pat <S2_lsr_r_r, int_hexagon_S2_lsr_r_r>;
-def : T_RR_pat <S2_asl_r_r, int_hexagon_S2_asl_r_r>;
-def : T_RR_pat <S2_lsl_r_r, int_hexagon_S2_lsl_r_r>;
-
-def : T_RR_pat <S2_asr_r_r_sat, int_hexagon_S2_asr_r_r_sat>;
-def : T_RR_pat <S2_asl_r_r_sat, int_hexagon_S2_asl_r_r_sat>;
-
-def : T_R_pat <S2_vsxtbh,   int_hexagon_S2_vsxtbh>;
-def : T_R_pat <S2_vzxtbh,   int_hexagon_S2_vzxtbh>;
-def : T_R_pat <S2_vsxthw,   int_hexagon_S2_vsxthw>;
-def : T_R_pat <S2_vzxthw,   int_hexagon_S2_vzxthw>;
-def : T_R_pat <S2_vsplatrh, int_hexagon_S2_vsplatrh>;
-def : T_R_pat <A2_sxtw,     int_hexagon_A2_sxtw>;
-
-// Vector saturate and pack
-def : T_R_pat <S2_svsathb,  int_hexagon_S2_svsathb>;
-def : T_R_pat <S2_svsathub, int_hexagon_S2_svsathub>;
-def : T_P_pat <S2_vsathub,  int_hexagon_S2_vsathub>;
-def : T_P_pat <S2_vsatwh,   int_hexagon_S2_vsatwh>;
-def : T_P_pat <S2_vsatwuh,  int_hexagon_S2_vsatwuh>;
-def : T_P_pat <S2_vsathb,   int_hexagon_S2_vsathb>;
-
-def : T_P_pat <S2_vtrunohb,    int_hexagon_S2_vtrunohb>;
-def : T_P_pat <S2_vtrunehb,    int_hexagon_S2_vtrunehb>;
-def : T_P_pat <S2_vrndpackwh,  int_hexagon_S2_vrndpackwh>;
-def : T_P_pat <S2_vrndpackwhs, int_hexagon_S2_vrndpackwhs>;
-def : T_R_pat <S2_brev,        int_hexagon_S2_brev>;
-def : T_R_pat <S2_vsplatrb,    int_hexagon_S2_vsplatrb>;
-
-def : T_R_pat <A2_abs,    int_hexagon_A2_abs>;
-def : T_R_pat <A2_abssat, int_hexagon_A2_abssat>;
-def : T_R_pat <A2_negsat, int_hexagon_A2_negsat>;
-
-def : T_R_pat <A2_swiz,   int_hexagon_A2_swiz>;
-
-def : T_P_pat <A2_sat,    int_hexagon_A2_sat>;
-def : T_R_pat <A2_sath,   int_hexagon_A2_sath>;
-def : T_R_pat <A2_satuh,  int_hexagon_A2_satuh>;
-def : T_R_pat <A2_satub,  int_hexagon_A2_satub>;
-def : T_R_pat <A2_satb,   int_hexagon_A2_satb>;
-
-// Vector arithmetic shift right by immediate with truncate and pack.
-def : T_PI_pat<S2_asr_i_svw_trun, int_hexagon_S2_asr_i_svw_trun>;
-
-def : T_RI_pat <S2_asr_i_r,     int_hexagon_S2_asr_i_r>;
-def : T_RI_pat <S2_lsr_i_r,     int_hexagon_S2_lsr_i_r>;
-def : T_RI_pat <S2_asl_i_r,     int_hexagon_S2_asl_i_r>;
-def : T_RI_pat <S2_asr_i_r_rnd, int_hexagon_S2_asr_i_r_rnd>;
-def : T_RI_pat <S2_asr_i_r_rnd_goodsyntax,
-                int_hexagon_S2_asr_i_r_rnd_goodsyntax>;
-
-// Shift left by immediate with saturation.
-def : T_RI_pat <S2_asl_i_r_sat, int_hexagon_S2_asl_i_r_sat>;
-
 //===----------------------------------------------------------------------===//
 // Template 'def pat' to map tableidx[bhwd] intrinsics to :raw instructions.
 //===----------------------------------------------------------------------===//
@@ -1277,11 +162,8 @@ def SDEC3 : SDNodeXForm<imm, [{
 // values from the 4th input operand. Please note that subtraction is not
 // needed for int_hexagon_S2_tableidxb_goodsyntax.
 
-def : Pat <(int_hexagon_S2_tableidxb_goodsyntax I32:$src1, I32:$src2,
-                                              u4_0ImmPred:$src3, u5_0ImmPred:$src4),
-           (S2_tableidxb I32:$src1, I32:$src2,
-                         u4_0ImmPred:$src3, u5_0ImmPred:$src4)>;
-
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxb_goodsyntax, S2_tableidxb,
+                         IdImm>;
 def : S2op_tableidx_pat <int_hexagon_S2_tableidxh_goodsyntax, S2_tableidxh,
                          SDEC1>;
 def : S2op_tableidx_pat <int_hexagon_S2_tableidxw_goodsyntax, S2_tableidxw,
@@ -1289,52 +171,6 @@ def : S2op_tableidx_pat <int_hexagon_S2_tableidxw_goodsyntax, S2_tableidxw,
 def : S2op_tableidx_pat <int_hexagon_S2_tableidxd_goodsyntax, S2_tableidxd,
                          SDEC3>;
 
-//*******************************************************************
-//           STYPE/VH
-//*******************************************************************
-
-// Vector absolute value halfwords with and without saturation
-// Rdd64=vabsh(Rss64)[:sat]
-def : T_P_pat <A2_vabsh, int_hexagon_A2_vabsh>;
-def : T_P_pat <A2_vabshsat, int_hexagon_A2_vabshsat>;
-
-// Vector shift halfwords by immediate
-// Rdd64=[vaslh/vasrh/vlsrh](Rss64,u4)
-def : T_PI_pat <S2_asr_i_vh, int_hexagon_S2_asr_i_vh>;
-def : T_PI_pat <S2_lsr_i_vh, int_hexagon_S2_lsr_i_vh>;
-def : T_PI_pat <S2_asl_i_vh, int_hexagon_S2_asl_i_vh>;
-
-// Vector shift halfwords by register
-// Rdd64=[vaslw/vasrw/vlslw/vlsrw](Rss64,Rt32)
-def : T_PR_pat <S2_asr_r_vh, int_hexagon_S2_asr_r_vh>;
-def : T_PR_pat <S2_lsr_r_vh, int_hexagon_S2_lsr_r_vh>;
-def : T_PR_pat <S2_asl_r_vh, int_hexagon_S2_asl_r_vh>;
-def : T_PR_pat <S2_lsl_r_vh, int_hexagon_S2_lsl_r_vh>;
-
-//*******************************************************************
-//           STYPE/VW
-//*******************************************************************
-
-// Vector absolute value words with and without saturation
-def : T_P_pat <A2_vabsw, int_hexagon_A2_vabsw>;
-def : T_P_pat <A2_vabswsat, int_hexagon_A2_vabswsat>;
-
-// Vector shift words by immediate.
-// Rdd64=[vasrw/vlsrw|vaslw](Rss64,u5)
-def : T_PI_pat <S2_asr_i_vw, int_hexagon_S2_asr_i_vw>;
-def : T_PI_pat <S2_lsr_i_vw, int_hexagon_S2_lsr_i_vw>;
-def : T_PI_pat <S2_asl_i_vw, int_hexagon_S2_asl_i_vw>;
-
-// Vector shift words by register.
-// Rdd64=[vasrw/vlsrw|vaslw|vlslw](Rss64,Rt32)
-def : T_PR_pat <S2_asr_r_vw, int_hexagon_S2_asr_r_vw>;
-def : T_PR_pat <S2_lsr_r_vw, int_hexagon_S2_lsr_r_vw>;
-def : T_PR_pat <S2_asl_r_vw, int_hexagon_S2_asl_r_vw>;
-def : T_PR_pat <S2_lsl_r_vw, int_hexagon_S2_lsl_r_vw>;
-
-// Vector shift words with truncate and pack
-def : T_PR_pat <S2_asr_r_svw_trun, int_hexagon_S2_asr_r_svw_trun>;
-
 // Load/store locked.
 def : T_R_pat<L2_loadw_locked, int_hexagon_L2_loadw_locked>;
 def : T_R_pat<L4_loadd_locked, int_hexagon_L4_loadd_locked>;
@@ -1370,10 +206,13 @@ def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
 
 multiclass MaskedStore <InstHexagon MI, Intrinsic IntID> {
   def : Pat<(IntID HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
-            (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>;
+            (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>,
+        Requires<[UseHVX]>;
+
   def : Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, IntRegs:$src2,
                                              HvxVR:$src3),
-            (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>;
+            (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>,
+        Requires<[UseHVX]>;
 }
 
 defm : MaskedStore <V6_vS32b_qpred_ai, int_hexagon_V6_vmaskedstoreq>;
@@ -1398,5 +237,241 @@ def: T_R_pat<Y2_dczeroa,     int_hexagon_Y2_dczeroa>;
 def: T_RR_pat<Y4_l2fetch,    int_hexagon_Y4_l2fetch>;
 def: T_RP_pat<Y5_l2fetch,    int_hexagon_Y5_l2fetch>;
 
-include "HexagonIntrinsicsV5.td"
-include "HexagonIntrinsicsV60.td"
+//
+// Patterns for optimizing code generations for HVX.
+
+def u3_64_ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)(64 - N->getSExtValue());
+  return isUInt<3>(v);
+}]>;
+
+def u3_128_ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)(128 - N->getSExtValue());
+  return isUInt<3>(v);
+}]>;
+
+def SUB_64_VAL : SDNodeXForm<imm, [{
+   int32_t Imm = N->getSExtValue();
+   return CurDAG->getTargetConstant(64 - Imm, SDLoc(N), MVT::i32);
+}]>;
+
+def SUB_128_VAL : SDNodeXForm<imm, [{
+   int32_t Imm = N->getSExtValue();
+   return CurDAG->getTargetConstant(128 - Imm, SDLoc(N), MVT::i32);
+}]>;
+
+let AddedComplexity = 100 in {
+def : Pat <(v16i32 (int_hexagon_V6_lo (v32i32 HvxWR:$src1))),
+           (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_lo))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v16i32 (int_hexagon_V6_hi (v32i32 HvxWR:$src1))),
+           (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_hi))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v32i32 (int_hexagon_V6_lo_128B (v64i32 HvxWR:$src1))),
+           (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_lo))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v32i32 (int_hexagon_V6_hi_128B (v64i32 HvxWR:$src1))),
+           (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_hi))>,
+           Requires<[UseHVX]>;
+}
+
+def : Pat <(v512i1 (bitconvert (v16i32 HvxVR:$src1))),
+           (v512i1 (V6_vandvrt (v16i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v512i1 (bitconvert (v32i16 HvxVR:$src1))),
+           (v512i1 (V6_vandvrt (v32i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v512i1 (bitconvert (v64i8  HvxVR:$src1))),
+           (v512i1 (V6_vandvrt (v64i8  HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v16i32 (bitconvert (v512i1 HvxQR:$src1))),
+           (v16i32 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v32i16 (bitconvert (v512i1 HvxQR:$src1))),
+           (v32i16 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v64i8  (bitconvert (v512i1 HvxQR:$src1))),
+           (v64i8  (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v1024i1 (bitconvert (v32i32 HvxVR:$src1))),
+           (v1024i1 (V6_vandvrt (v32i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v1024i1 (bitconvert (v64i16 HvxVR:$src1))),
+           (v1024i1 (V6_vandvrt (v64i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v1024i1 (bitconvert (v128i8 HvxVR:$src1))),
+           (v1024i1 (V6_vandvrt (v128i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v32i32 (bitconvert (v1024i1 HvxQR:$src1))),
+           (v32i32 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v64i16 (bitconvert (v1024i1 HvxQR:$src1))),
+           (v64i16 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v128i8 (bitconvert (v1024i1 HvxQR:$src1))),
+           (v128i8 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+let AddedComplexity = 140 in {
+def : Pat <(store (v512i1 HvxQR:$src1), (i32 IntRegs:$addr)),
+           (V6_vS32b_ai IntRegs:$addr, 0,
+           (v16i32 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101))))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v512i1 (load (i32 IntRegs:$addr))),
+           (v512i1 (V6_vandvrt
+           (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(store (v1024i1 HvxQR:$src1), (i32 IntRegs:$addr)),
+           (V6_vS32b_ai IntRegs:$addr, 0,
+           (v32i32 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101))))>,
+           Requires<[UseHVX]>;
+
+def : Pat <(v1024i1 (load (i32 IntRegs:$addr))),
+           (v1024i1 (V6_vandvrt
+           (v32i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>,
+           Requires<[UseHVX]>;
+}
+
+def: Pat<(v64i16 (trunc v64i32:$Vdd)),
+         (v64i16 (V6_vpackwh_sat
+                 (v32i32 (V6_hi HvxWR:$Vdd)),
+                 (v32i32 (V6_lo HvxWR:$Vdd))))>,
+     Requires<[UseHVX]>;
+
+def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, IntRegs:$src2),
+         (S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV55]>;
+
+multiclass T_VI_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID HvxVR:$src1, u3_0ImmPred:$src2),
+           (MI    HvxVR:$src1, HvxVR:$src1, u3_0ImmPred:$src2)>,
+       Requires<[UseHVX]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, u3_0ImmPred:$src2),
+           (MI                 HvxVR:$src1, HvxVR:$src1, u3_0ImmPred:$src2)>,
+       Requires<[UseHVX]>;
+}
+
+multiclass T_VI_inv_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID HvxVR:$src1, u3_64_ImmPred:$src2),
+           (MI    HvxVR:$src1, HvxVR:$src1,
+                  (SUB_64_VAL u3_64_ImmPred:$src2))>,
+       Requires<[UseHVX]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, u3_128_ImmPred:$src2),
+           (MI HvxVR:$src1, HvxVR:$src1, (SUB_128_VAL u3_128_ImmPred:$src2))>,
+       Requires<[UseHVX]>;
+}
+
+multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
+           (MI    HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>,
+       Requires<[UseHVX]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
+                                            u3_0ImmPred:$src3),
+           (MI                              HvxVR:$src1, HvxVR:$src2,
+                                            u3_0ImmPred:$src3)>,
+       Requires<[UseHVX]>;
+}
+
+multiclass T_VVI_inv_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, u3_64_ImmPred:$src3),
+           (MI    HvxVR:$src1, HvxVR:$src2,
+                                    (SUB_64_VAL u3_64_ImmPred:$src3))>,
+       Requires<[UseHVX]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
+                                            u3_128_ImmPred:$src3),
+           (MI                              HvxVR:$src1, HvxVR:$src2,
+                                          (SUB_128_VAL u3_128_ImmPred:$src3))>,
+       Requires<[UseHVX]>;
+}
+
+multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+           (MI    HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>,
+       Requires<[UseHVX]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
+                                            IntRegs:$src3),
+           (MI                              HvxVR:$src1, HvxVR:$src2,
+                                            IntRegs:$src3)>,
+       Requires<[UseHVX]>;
+}
+
+defm : T_VI_pat <V6_valignbi, int_hexagon_V6_vror>;
+defm : T_VI_inv_pat <V6_vlalignbi, int_hexagon_V6_vror>;
+
+defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignb>;
+defm : T_VVI_inv_pat <V6_vlalignbi, int_hexagon_V6_valignbi>;
+defm : T_VVI_inv_pat <V6_vlalignbi, int_hexagon_V6_valignb>;
+defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignbi>;
+defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignb>;
+defm : T_VVI_inv_pat <V6_valignbi, int_hexagon_V6_vlalignbi>;
+defm : T_VVI_inv_pat <V6_valignbi, int_hexagon_V6_vlalignb>;
+defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignbi>;
+
+def: Pat<(int_hexagon_V6_vd0),
+         (V6_vd0)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vd0_128B ),
+         (V6_vd0)>, Requires<[HasV60, UseHVX128B]>;
+
+def: Pat<(int_hexagon_V6_vdd0),
+         (V6_vdd0)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdd0_128B),
+         (V6_vdd0)>, Requires<[HasV65, UseHVX128B]>;
+
+def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+         (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+         (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+         (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+         (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+         (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+         (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+         (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+         (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
+         (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+         (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+         (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+         (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+         (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+         (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+         (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+         (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+         (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>;
+def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
+         (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>;
+
+include "HexagonDepMapAsm2Intrin.td"
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 29c044b3b7295761ed0f20bbb4276c98bfa7dc33..c3a5bd5d57bfeee7c32697458f8dd0035d349179 100644
--- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -502,7 +502,8 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
       MIB.add(ImmOp);
       OpStart = 4;
       Changed = true;
-    } else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
+    } else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset &&
+               OldMI->getOperand(2).isImm()) {
       short NewOpCode = HII->changeAddrMode_io_abs(*OldMI);
       assert(NewOpCode >= 0 && "Invalid New opcode\n");
       MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode))
@@ -518,17 +519,19 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
 
     LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
     LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
-  } else if (ImmOpNum == 2 && OldMI->getOperand(3).getImm() == 0) {
-    short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
-    assert(NewOpCode >= 0 && "Invalid New opcode\n");
-    MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
-    MIB.add(OldMI->getOperand(0));
-    MIB.add(OldMI->getOperand(1));
-    MIB.add(ImmOp);
-    OpStart = 4;
-    Changed = true;
-    LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
-    LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+  } else if (ImmOpNum == 2) {
+    if (OldMI->getOperand(3).isImm() && OldMI->getOperand(3).getImm() == 0) {
+      short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
+      assert(NewOpCode >= 0 && "Invalid New opcode\n");
+      MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+      MIB.add(OldMI->getOperand(0));
+      MIB.add(OldMI->getOperand(1));
+      MIB.add(ImmOp);
+      OpStart = 4;
+      Changed = true;
+      LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+      LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
+    }
   }
 
   if (Changed)
@@ -758,11 +761,13 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
       // This could happen, for example, when DefR = R4, but the used
       // register is D2.
 
+      // Change UseMI if replacement is possible. If any replacement failed,
+      // or wasn't attempted, make sure to keep the TFR.
+      bool Xformed = false;
       if (UseMOnum >= 0 && InstrEvalResult[UseMI])
-        // Change UseMI if replacement is possible.
-        Changed |= xformUseMI(MI, UseMI, UseN, UseMOnum);
-      else
-        KeepTfr = true;
+        Xformed = xformUseMI(MI, UseMI, UseN, UseMOnum);
+      Changed |=  Xformed;
+      KeepTfr |= !Xformed;
     }
     if (!KeepTfr)
       Deleted.insert(MI);
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index ddf5a9ca3645f958c1c11638078548f04e7b2a5a..311b5a702d5a7d5faaf0c980e2b402b995721be5 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -218,6 +218,8 @@ def I1toI32:  OutPatFrag<(ops node:$Rs), (C2_muxii (i1 $Rs), 1, 0)>;
 def I32toI1:  OutPatFrag<(ops node:$Rs), (i1 (C2_cmpgtui (i32 $Rs), (i32 0)))>;
 def ToZext64: OutPatFrag<(ops node:$Rs), (i64 (A4_combineir 0, (i32 $Rs)))>;
 def ToSext64: OutPatFrag<(ops node:$Rs), (i64 (A2_sxtw (i32 $Rs)))>;
+def ToAext64: OutPatFrag<(ops node:$Rs),
+  (REG_SEQUENCE DoubleRegs, (i32 (IMPLICIT_DEF)), isub_hi, (i32 $Rs), isub_lo)>;
 
 def Combinew: OutPatFrag<(ops node:$Rs, node:$Rt),
   (REG_SEQUENCE DoubleRegs, $Rs, isub_hi, $Rt, isub_lo)>;
@@ -246,6 +248,9 @@ def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>;
 def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>;
 def Sext64: PatLeaf<(i64 Usxtw:$Rs)>;
 
+def azext: PatFrags<(ops node:$Rs), [(zext node:$Rs), (anyext node:$Rs)]>;
+def asext: PatFrags<(ops node:$Rs), [(sext node:$Rs), (anyext node:$Rs)]>;
+
 def: Pat<(IsOrAdd (i32 AddrFI:$Rs), s32_0ImmPred:$off),
          (PS_fi (i32 AddrFI:$Rs), imm:$off)>;
 
@@ -416,52 +421,48 @@ def: Pat<(sext_inreg I64:$Rs, i32), (A2_sxtw (LoReg $Rs))>;
 def: Pat<(sext_inreg I64:$Rs, i16), (A2_sxtw (A2_sxth (LoReg $Rs)))>;
 def: Pat<(sext_inreg I64:$Rs, i8),  (A2_sxtw (A2_sxtb (LoReg $Rs)))>;
 
-def: Pat<(i64 (sext I1:$Pu)),
-         (Combinew (C2_muxii PredRegs:$Pu, -1, 0),
-                   (C2_muxii PredRegs:$Pu, -1, 0))>;
-
-def: Pat<(i32   (sext I1:$Pu)),   (C2_muxii I1:$Pu, -1, 0)>;
-def: Pat<(i32   (zext I1:$Pu)),   (C2_muxii I1:$Pu, 1, 0)>;
-def: Pat<(i64   (zext I1:$Pu)),   (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
-def: Pat<(v2i16 (sext V2I1:$Pu)), (S2_vtrunehb (C2_mask V2I1:$Pu))>;
-def: Pat<(v2i32 (sext V2I1:$Pu)), (C2_mask V2I1:$Pu)>;
-def: Pat<(v4i8  (sext V4I1:$Pu)), (S2_vtrunehb (C2_mask V4I1:$Pu))>;
-def: Pat<(v4i16 (sext V4I1:$Pu)), (C2_mask V4I1:$Pu)>;
-def: Pat<(v8i8  (sext V8I1:$Pu)), (C2_mask V8I1:$Pu)>;
-
 def: Pat<(i64 (sext I32:$Rs)), (A2_sxtw I32:$Rs)>;
 def: Pat<(Zext64 I32:$Rs),     (ToZext64 $Rs)>;
 def: Pat<(Aext64 I32:$Rs),     (ToZext64 $Rs)>;
 
 def: Pat<(i32 (trunc I64:$Rs)), (LoReg $Rs)>;
-def: Pat<(i1 (trunc I64:$Rs)),  (C2_tfrrp (LoReg $Rs))>;
+def: Pat<(i1 (trunc I32:$Rs)),  (S2_tstbit_i I32:$Rs, 0)>;
+def: Pat<(i1 (trunc I64:$Rs)),  (S2_tstbit_i (LoReg $Rs), 0)>;
 
 let AddedComplexity = 20 in {
   def: Pat<(and I32:$Rs, 255),   (A2_zxtb I32:$Rs)>;
   def: Pat<(and I32:$Rs, 65535), (A2_zxth I32:$Rs)>;
 }
 
-def: Pat<(i32 (anyext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
-def: Pat<(i64 (anyext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+// Extensions from i1 or vectors of i1.
+def: Pat<(i32 (azext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
+def: Pat<(i64 (azext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+def: Pat<(i32  (sext I1:$Pu)), (C2_muxii I1:$Pu, -1, 0)>;
+def: Pat<(i64  (sext I1:$Pu)), (Combinew (C2_muxii PredRegs:$Pu, -1, 0),
+                                         (C2_muxii PredRegs:$Pu, -1, 0))>;
+
+def: Pat<(v2i16 (sext V2I1:$Pu)), (S2_vtrunehb (C2_mask V2I1:$Pu))>;
+def: Pat<(v2i32 (sext V2I1:$Pu)), (C2_mask V2I1:$Pu)>;
+def: Pat<(v4i8  (sext V4I1:$Pu)), (S2_vtrunehb (C2_mask V4I1:$Pu))>;
+def: Pat<(v4i16 (sext V4I1:$Pu)), (C2_mask V4I1:$Pu)>;
+def: Pat<(v8i8  (sext V8I1:$Pu)), (C2_mask V8I1:$Pu)>;
 
 def Vsplatpi: OutPatFrag<(ops node:$V),
                          (Combinew (A2_tfrsi $V), (A2_tfrsi $V))>;
-def: Pat<(v8i8 (zext V8I1:$Pu)),
-         (A2_andp (C2_mask V8I1:$Pu), (Vsplatpi (i32 0x01010101)))>;
-def: Pat<(v4i16 (zext V4I1:$Pu)),
-         (A2_andp (C2_mask V4I1:$Pu), (Vsplatpi (i32 0x00010001)))>;
-def: Pat<(v2i32 (zext V2I1:$Pu)),
-         (A2_andp (C2_mask V2I1:$Pu), (A2_combineii (i32 1), (i32 1)))>;
 
-def: Pat<(v4i8 (zext V4I1:$Pu)),
-         (A2_andir (LoReg (C2_mask V4I1:$Pu)), (i32 0x01010101))>;
-def: Pat<(v2i16 (zext V2I1:$Pu)),
+def: Pat<(v2i16 (azext V2I1:$Pu)),
          (A2_andir (LoReg (C2_mask V2I1:$Pu)), (i32 0x00010001))>;
+def: Pat<(v2i32 (azext V2I1:$Pu)),
+         (A2_andp (C2_mask V2I1:$Pu), (A2_combineii (i32 1), (i32 1)))>;
+def: Pat<(v4i8 (azext V4I1:$Pu)),
+         (A2_andir (LoReg (C2_mask V4I1:$Pu)), (i32 0x01010101))>;
+def: Pat<(v4i16 (azext V4I1:$Pu)),
+         (A2_andp (C2_mask V4I1:$Pu), (Vsplatpi (i32 0x00010001)))>;
+def: Pat<(v8i8 (azext V8I1:$Pu)),
+         (A2_andp (C2_mask V8I1:$Pu), (Vsplatpi (i32 0x01010101)))>;
 
-def: Pat<(v4i16 (zext   V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
-def: Pat<(v2i32 (zext   V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
-def: Pat<(v4i16 (anyext V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
-def: Pat<(v2i32 (anyext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
+def: Pat<(v4i16 (azext  V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (azext  V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
 def: Pat<(v4i16 (sext   V4I8:$Rs)),  (S2_vsxtbh V4I8:$Rs)>;
 def: Pat<(v2i32 (sext   V2I16:$Rs)), (S2_vsxthw V2I16:$Rs)>;
 
@@ -812,9 +813,9 @@ def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I),
          (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
 
 def: Pat<(select I1:$Pu, V4I8:$Rs, V4I8:$Rt),
-         (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
+         (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>;
 def: Pat<(select I1:$Pu, V2I16:$Rs, V2I16:$Rt),
-         (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
+         (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>;
 def: Pat<(select I1:$Pu, V2I32:$Rs, V2I32:$Rt),
          (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
                    (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
@@ -1169,6 +1170,18 @@ def: Pat<(srl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))),
 def: Pat<(shl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))),
          (S2_asl_i_vh V4I16:$b, imm:$c)>;
 
+def: Pat<(HexagonVASR V2I16:$Rs, u4_0ImmPred:$S),
+         (LoReg (S2_asr_i_vh (ToAext64 $Rs), imm:$S))>;
+def: Pat<(HexagonVASL V2I16:$Rs, u4_0ImmPred:$S),
+         (LoReg (S2_asl_i_vh (ToAext64 $Rs), imm:$S))>;
+def: Pat<(HexagonVLSR V2I16:$Rs, u4_0ImmPred:$S),
+         (LoReg (S2_lsr_i_vh (ToAext64 $Rs), imm:$S))>;
+def: Pat<(HexagonVASR V2I16:$Rs, I32:$Rt),
+         (LoReg (S2_asr_i_vh (ToAext64 $Rs), I32:$Rt))>;
+def: Pat<(HexagonVASL V2I16:$Rs, I32:$Rt),
+         (LoReg (S2_asl_i_vh (ToAext64 $Rs), I32:$Rt))>;
+def: Pat<(HexagonVLSR V2I16:$Rs, I32:$Rt),
+         (LoReg (S2_lsr_i_vh (ToAext64 $Rs), I32:$Rt))>;
 
 // --(9) Arithmetic/bitwise ----------------------------------------------
 //
@@ -1259,12 +1272,19 @@ def: OpR_RR_pat<F2_sfmpy,     pf2<fmul>,    f32, F32>;
 def: OpR_RR_pat<F2_sfmin,     pf2<fminnum>, f32, F32>;
 def: OpR_RR_pat<F2_sfmax,     pf2<fmaxnum>, f32, F32>;
 
+let Predicates = [HasV66] in {
+  def: OpR_RR_pat<F2_dfadd,     pf2<fadd>,    f64, F64>;
+  def: OpR_RR_pat<F2_dfsub,     pf2<fsub>,    f64, F64>;
+}
+
 // In expressions like a0*b0 + a1*b1 + ..., prefer to generate multiply-add,
 // over add-add with individual multiplies as inputs.
 let AddedComplexity = 10 in {
   def: AccRRI_pat<M2_macsip,    Add, Su<Mul>, I32, u32_0ImmPred>;
   def: AccRRI_pat<M2_macsin,    Sub, Su<Mul>, I32, u32_0ImmPred>;
   def: AccRRR_pat<M2_maci,      Add, Su<Mul>, I32, I32, I32>;
+  let Predicates = [HasV66] in
+  def: AccRRR_pat<M2_mnaci,     Sub, Su<Mul>, I32, I32, I32>;
 }
 
 def: AccRRI_pat<M2_naccii,    Sub, Su<Add>, I32, s32_0ImmPred>;
@@ -1506,9 +1526,9 @@ def: Pat<(add V2I32:$Rx, (mul V2I32:$Rs, V2I32:$Rt)),
 // Add/subtract two v4i8: Hexagon does not have an insn for this one, so
 // we use the double add v8i8, and use only the low part of the result.
 def: Pat<(add V4I8:$Rs, V4I8:$Rt),
-         (LoReg (A2_vaddub (ToZext64 $Rs), (ToZext64 $Rt)))>;
+         (LoReg (A2_vaddub (ToAext64 $Rs), (ToAext64 $Rt)))>;
 def: Pat<(sub V4I8:$Rs, V4I8:$Rt),
-         (LoReg (A2_vsubub (ToZext64 $Rs), (ToZext64 $Rt)))>;
+         (LoReg (A2_vsubub (ToAext64 $Rs), (ToAext64 $Rt)))>;
 
 // Use M2_vmpy2s_s0 for half-word vector multiply. It multiplies two
 // half-words, and saturates the result to a 32-bit value, except the
@@ -1857,10 +1877,10 @@ let AddedComplexity = 20 in {
 }
 
 let AddedComplexity = 30 in {
-  defm: Loadxim_pat<extloadi1,    i64, ToZext64, anyimm0, L2_loadrub_io>;
-  defm: Loadxim_pat<extloadi8,    i64, ToZext64, anyimm0, L2_loadrub_io>;
-  defm: Loadxim_pat<extloadi16,   i64, ToZext64, anyimm1, L2_loadruh_io>;
-  defm: Loadxim_pat<extloadi32,   i64, ToZext64, anyimm2, L2_loadri_io>;
+  defm: Loadxim_pat<extloadi1,    i64, ToAext64, anyimm0, L2_loadrub_io>;
+  defm: Loadxim_pat<extloadi8,    i64, ToAext64, anyimm0, L2_loadrub_io>;
+  defm: Loadxim_pat<extloadi16,   i64, ToAext64, anyimm1, L2_loadruh_io>;
+  defm: Loadxim_pat<extloadi32,   i64, ToAext64, anyimm2, L2_loadri_io>;
   defm: Loadxim_pat<zextloadi1,   i64, ToZext64, anyimm0, L2_loadrub_io>;
   defm: Loadxim_pat<zextloadi8,   i64, ToZext64, anyimm0, L2_loadrub_io>;
   defm: Loadxim_pat<zextloadi16,  i64, ToZext64, anyimm1, L2_loadruh_io>;
@@ -1895,13 +1915,13 @@ let AddedComplexity  = 60 in {
 
   def: Loadxum_pat<sextloadi8,  i64, anyimm0, ToSext64, L4_loadrb_ur>;
   def: Loadxum_pat<zextloadi8,  i64, anyimm0, ToZext64, L4_loadrub_ur>;
-  def: Loadxum_pat<extloadi8,   i64, anyimm0, ToZext64, L4_loadrub_ur>;
+  def: Loadxum_pat<extloadi8,   i64, anyimm0, ToAext64, L4_loadrub_ur>;
   def: Loadxum_pat<sextloadi16, i64, anyimm1, ToSext64, L4_loadrh_ur>;
   def: Loadxum_pat<zextloadi16, i64, anyimm1, ToZext64, L4_loadruh_ur>;
-  def: Loadxum_pat<extloadi16,  i64, anyimm1, ToZext64, L4_loadruh_ur>;
+  def: Loadxum_pat<extloadi16,  i64, anyimm1, ToAext64, L4_loadruh_ur>;
   def: Loadxum_pat<sextloadi32, i64, anyimm2, ToSext64, L4_loadri_ur>;
   def: Loadxum_pat<zextloadi32, i64, anyimm2, ToZext64, L4_loadri_ur>;
-  def: Loadxum_pat<extloadi32,  i64, anyimm2, ToZext64, L4_loadri_ur>;
+  def: Loadxum_pat<extloadi32,  i64, anyimm2, ToAext64, L4_loadri_ur>;
 }
 
 let AddedComplexity = 40 in {
@@ -1941,25 +1961,25 @@ let AddedComplexity = 20 in {
 }
 
 let AddedComplexity = 40 in {
-  def: Loadxrm_shl_pat<extloadi8,    i64, ToZext64, L4_loadrub_rr>;
+  def: Loadxrm_shl_pat<extloadi8,    i64, ToAext64, L4_loadrub_rr>;
   def: Loadxrm_shl_pat<zextloadi8,   i64, ToZext64, L4_loadrub_rr>;
   def: Loadxrm_shl_pat<sextloadi8,   i64, ToSext64, L4_loadrb_rr>;
-  def: Loadxrm_shl_pat<extloadi16,   i64, ToZext64, L4_loadruh_rr>;
+  def: Loadxrm_shl_pat<extloadi16,   i64, ToAext64, L4_loadruh_rr>;
   def: Loadxrm_shl_pat<zextloadi16,  i64, ToZext64, L4_loadruh_rr>;
   def: Loadxrm_shl_pat<sextloadi16,  i64, ToSext64, L4_loadrh_rr>;
-  def: Loadxrm_shl_pat<extloadi32,   i64, ToZext64, L4_loadri_rr>;
+  def: Loadxrm_shl_pat<extloadi32,   i64, ToAext64, L4_loadri_rr>;
   def: Loadxrm_shl_pat<zextloadi32,  i64, ToZext64, L4_loadri_rr>;
   def: Loadxrm_shl_pat<sextloadi32,  i64, ToSext64, L4_loadri_rr>;
 }
 
 let AddedComplexity = 20 in {
-  def: Loadxrm_add_pat<extloadi8,    i64, ToZext64, L4_loadrub_rr>;
+  def: Loadxrm_add_pat<extloadi8,    i64, ToAext64, L4_loadrub_rr>;
   def: Loadxrm_add_pat<zextloadi8,   i64, ToZext64, L4_loadrub_rr>;
   def: Loadxrm_add_pat<sextloadi8,   i64, ToSext64, L4_loadrb_rr>;
-  def: Loadxrm_add_pat<extloadi16,   i64, ToZext64, L4_loadruh_rr>;
+  def: Loadxrm_add_pat<extloadi16,   i64, ToAext64, L4_loadruh_rr>;
   def: Loadxrm_add_pat<zextloadi16,  i64, ToZext64, L4_loadruh_rr>;
   def: Loadxrm_add_pat<sextloadi16,  i64, ToSext64, L4_loadrh_rr>;
-  def: Loadxrm_add_pat<extloadi32,   i64, ToZext64, L4_loadri_rr>;
+  def: Loadxrm_add_pat<extloadi32,   i64, ToAext64, L4_loadri_rr>;
   def: Loadxrm_add_pat<zextloadi32,  i64, ToZext64, L4_loadri_rr>;
   def: Loadxrm_add_pat<sextloadi32,  i64, ToSext64, L4_loadri_rr>;
 }
@@ -1991,13 +2011,13 @@ let AddedComplexity  = 60 in {
 }
 
 let AddedComplexity  = 30 in {
-  def: Loadam_pat<extloadi8,      i64, anyimm0, ToZext64, PS_loadrubabs>;
+  def: Loadam_pat<extloadi8,      i64, anyimm0, ToAext64, PS_loadrubabs>;
   def: Loadam_pat<sextloadi8,     i64, anyimm0, ToSext64, PS_loadrbabs>;
   def: Loadam_pat<zextloadi8,     i64, anyimm0, ToZext64, PS_loadrubabs>;
-  def: Loadam_pat<extloadi16,     i64, anyimm1, ToZext64, PS_loadruhabs>;
+  def: Loadam_pat<extloadi16,     i64, anyimm1, ToAext64, PS_loadruhabs>;
   def: Loadam_pat<sextloadi16,    i64, anyimm1, ToSext64, PS_loadrhabs>;
   def: Loadam_pat<zextloadi16,    i64, anyimm1, ToZext64, PS_loadruhabs>;
-  def: Loadam_pat<extloadi32,     i64, anyimm2, ToZext64, PS_loadriabs>;
+  def: Loadam_pat<extloadi32,     i64, anyimm2, ToAext64, PS_loadriabs>;
   def: Loadam_pat<sextloadi32,    i64, anyimm2, ToSext64, PS_loadriabs>;
   def: Loadam_pat<zextloadi32,    i64, anyimm2, ToZext64, PS_loadriabs>;
 
@@ -2033,13 +2053,13 @@ let AddedComplexity  = 100 in {
 }
 
 let AddedComplexity  = 70 in {
-  def: Loadam_pat<extloadi8,      i64, addrgp,  ToZext64, L2_loadrubgp>;
+  def: Loadam_pat<extloadi8,      i64, addrgp,  ToAext64, L2_loadrubgp>;
   def: Loadam_pat<sextloadi8,     i64, addrgp,  ToSext64, L2_loadrbgp>;
   def: Loadam_pat<zextloadi8,     i64, addrgp,  ToZext64, L2_loadrubgp>;
-  def: Loadam_pat<extloadi16,     i64, addrgp,  ToZext64, L2_loadruhgp>;
+  def: Loadam_pat<extloadi16,     i64, addrgp,  ToAext64, L2_loadruhgp>;
   def: Loadam_pat<sextloadi16,    i64, addrgp,  ToSext64, L2_loadrhgp>;
   def: Loadam_pat<zextloadi16,    i64, addrgp,  ToZext64, L2_loadruhgp>;
-  def: Loadam_pat<extloadi32,     i64, addrgp,  ToZext64, L2_loadrigp>;
+  def: Loadam_pat<extloadi32,     i64, addrgp,  ToAext64, L2_loadrigp>;
   def: Loadam_pat<sextloadi32,    i64, addrgp,  ToSext64, L2_loadrigp>;
   def: Loadam_pat<zextloadi32,    i64, addrgp,  ToZext64, L2_loadrigp>;
 
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index 6935e3b7bebca915214d196d2bdb25a7ae515037..b9748c7e189c303dccaa6b977645160fbe4ca12e 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -526,11 +526,11 @@ let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS],
     addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in {
   def NAME#_pci : LDInst<(outs RC:$Rd32, IntRegs:$Rx32),
        (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Cs),
-       ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_4403ca65>;
+       ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_e93a3d71>;
 
   def NAME#_pcr : LDInst<(outs RC:$Rd32, IntRegs:$Rx32),
        (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Cs),
-       ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_2fc0c436>;
+       ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_44d3da28>;
 }
 }
 
@@ -547,11 +547,11 @@ let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS],
     addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in {
   def NAME#_pci : STInst<(outs IntRegs:$Rx32),
        (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs),
-       ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_9fdb5406>;
+       ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_e86aa961>;
 
   def NAME#_pcr : STInst<(outs IntRegs:$Rx32),
        (ins IntRegs:$Rx32in, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs),
-       ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_f86c328a>;
+       ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_da97ee82>;
 }
 }
 
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 545def45a1c307d669f7da63ac80cddf4b4c19b5..9b8f4e07376fca35520aee0ceca56750ded42cd3 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -312,6 +312,7 @@ unsigned HexagonRegisterInfo::getHexagonSubRegIndex(
 
   static const unsigned ISub[] = { Hexagon::isub_lo, Hexagon::isub_hi };
   static const unsigned VSub[] = { Hexagon::vsub_lo, Hexagon::vsub_hi };
+  static const unsigned WSub[] = { Hexagon::wsub_lo, Hexagon::wsub_hi };
 
   switch (RC.getID()) {
     case Hexagon::CtrRegs64RegClassID:
@@ -319,6 +320,8 @@ unsigned HexagonRegisterInfo::getHexagonSubRegIndex(
       return ISub[GenIdx];
     case Hexagon::HvxWRRegClassID:
       return VSub[GenIdx];
+    case Hexagon::HvxVQRRegClassID:
+      return WSub[GenIdx];
   }
 
   if (const TargetRegisterClass *SuperRC = *RC.getSuperClasses())
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index 1fe1ef4ac572c317083345aff0b7dfb76f7ee954..da90911e2c0514cba15f6b04ff1f2e5a8edf464d 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -82,13 +82,14 @@ let Namespace = "Hexagon" in {
   def isub_hi  : SubRegIndex<32, 32>;
   def vsub_lo  : SubRegIndex<512>;
   def vsub_hi  : SubRegIndex<512, 512>;
+  def wsub_lo  : SubRegIndex<1024>;
+  def wsub_hi  : SubRegIndex<1024, 1024>;
   def subreg_overflow : SubRegIndex<1, 0>;
 
   // Integer registers.
   foreach i = 0-28 in {
     def R#i  : Ri<i, "r"#i>,  DwarfRegNum<[i]>;
   }
-
   def R29 : Ri<29, "r29", ["sp"]>, DwarfRegNum<[29]>;
   def R30 : Ri<30, "r30", ["fp"]>, DwarfRegNum<[30]>;
   def R31 : Ri<31, "r31", ["lr"]>, DwarfRegNum<[31]>;
@@ -206,6 +207,18 @@ let Namespace = "Hexagon" in {
   def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>;
   }
 
+  // Aliases of the V* registers used to hold quad vec values.
+  let SubRegIndices = [wsub_lo, wsub_hi], CoveredBySubRegs = 1 in {
+  def VQ0  : Rd< 0, "v3:0",   [W0,  W1]>,  DwarfRegNum<[252]>;
+  def VQ1  : Rd< 4, "v7:4",   [W2,  W3]>,  DwarfRegNum<[253]>;
+  def VQ2  : Rd< 8, "v11:8",  [W4,  W5]>,  DwarfRegNum<[254]>;
+  def VQ3  : Rd<12, "v15:12", [W6,  W7]>,  DwarfRegNum<[255]>;
+  def VQ4  : Rd<16, "v19:16", [W8,  W9]>,  DwarfRegNum<[256]>;
+  def VQ5  : Rd<20, "v23:20", [W10, W11]>, DwarfRegNum<[257]>;
+  def VQ6  : Rd<24, "v27:24", [W12, W13]>, DwarfRegNum<[258]>;
+  def VQ7  : Rd<28, "v31:28", [W14, W15]>, DwarfRegNum<[259]>;
+  }
+
   // Vector Predicate registers.
   def Q0 : Rq<0, "q0">, DwarfRegNum<[131]>;
   def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>;
@@ -295,29 +308,6 @@ def VecQ32:  ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
 
 // HVX register classes
 
-// Register classes.
-//
-// FIXME: the register order should be defined in terms of the preferred
-// allocation order...
-//
-def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
-  (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28),
-       R10, R11, R29, R30, R31)>;
-
-// Registers are listed in reverse order for allocation preference reasons.
-def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
-  (add R23, R22, R21, R20, R19, R18, R17, R16,
-       R7, R6, R5, R4, R3, R2, R1, R0)>;
-
-def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
-  (add R7, R6, R5, R4, R3, R2, R1, R0)> ;
-
-def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
-  (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>;
-
-def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
-  (add D11, D10, D9, D8, D3, D2, D1, D0)>;
-
 def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512,
   (add (sequence "V%u", 0, 31), VTMP)> {
   let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode],
@@ -336,6 +326,32 @@ def HvxQR : RegisterClass<"Hexagon", [VecI1, VecQ8, VecQ16, VecQ32], 512,
     [RegInfo<512,512,512>, RegInfo<1024,1024,1024>, RegInfo<512,512,512>]>;
 }
 
+def HvxVQR : RegisterClass<"Hexagon", [untyped], 2048,
+  (add (sequence "VQ%u", 0, 7))> {
+  let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode],
+    [RegInfo<2048,2048,2048>, RegInfo<4096,4096,4096>, RegInfo<2048,2048,2048>]>;
+}
+
+// Core register classes
+
+def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
+  (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28),
+       R10, R11, R29, R30, R31)>;
+
+// Registers are listed in reverse order for allocation preference reasons.
+def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
+  (add R23, R22, R21, R20, R19, R18, R17, R16,
+       R7, R6, R5, R4, R3, R2, R1, R0)>;
+
+def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
+  (add R7, R6, R5, R4, R3, R2, R1, R0)> ;
+
+def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
+  (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>;
+
+def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
+  (add D11, D10, D9, D8, D3, D2, D1, D0)>;
+
 let Size = 32 in
 def PredRegs : RegisterClass<"Hexagon",
   [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32, (add P0, P1, P2, P3)>;
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index fa4f9ca639ccf209ef2e51716c55465c952c0517..1024198e9b3ff9f9b631d586aecb504b21cdd12a 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -27,6 +27,7 @@ def CVI_SHIFT  : FuncUnit;
 def CVI_MPY0   : FuncUnit;
 def CVI_MPY1   : FuncUnit;
 def CVI_LD     : FuncUnit;
+def CVI_ZW     : FuncUnit; // Z register write port
 
 // Combined functional units.
 def CVI_XLSHF  : FuncUnit;
@@ -84,3 +85,9 @@ include "HexagonScheduleV62.td"
 //===----------------------------------------------------------------------===//
 
 include "HexagonScheduleV65.td"
+
+//===----------------------------------------------------------------------===//
+// V66 Machine Info +
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV66.td"
diff --git a/lib/Target/Hexagon/HexagonScheduleV60.td b/lib/Target/Hexagon/HexagonScheduleV60.td
index a2544c92a72c05299b1fca4363bba1a2c0f95ab1..861a8d2b0339d7ceb960e10d7050b0e77c11fb00 100644
--- a/lib/Target/Hexagon/HexagonScheduleV60.td
+++ b/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -65,7 +65,7 @@ def HexagonItinerariesV60 :
       ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
                             CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
                             CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
-                            CVI_ALL_NOMEM],
+                            CVI_ALL_NOMEM, CVI_ZW],
                             [Hex_FWD, HVX_FWD], HexagonV60ItinList.ItinList>;
 
 def HexagonModelV60 : SchedMachineModel {
diff --git a/lib/Target/Hexagon/HexagonScheduleV62.td b/lib/Target/Hexagon/HexagonScheduleV62.td
index a0a8595f185fb0fdd9da145b90aeaa624b53ff75..1c274191277ca1dbede467254477d3c5ce48ecc0 100644
--- a/lib/Target/Hexagon/HexagonScheduleV62.td
+++ b/lib/Target/Hexagon/HexagonScheduleV62.td
@@ -21,7 +21,7 @@ def HexagonItinerariesV62 :
       ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
                             CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
                             CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
-                            CVI_ALL_NOMEM],
+                            CVI_ALL_NOMEM, CVI_ZW],
                            [Hex_FWD, HVX_FWD], HexagonV62ItinList.ItinList>;
 
 def HexagonModelV62 : SchedMachineModel {
diff --git a/lib/Target/Hexagon/HexagonScheduleV65.td b/lib/Target/Hexagon/HexagonScheduleV65.td
index e3b1313923f5c7a68f9391f653d471abdb7cc2d3..46a79d521795cef9b050e706ff7f59c09604240c 100644
--- a/lib/Target/Hexagon/HexagonScheduleV65.td
+++ b/lib/Target/Hexagon/HexagonScheduleV65.td
@@ -23,7 +23,7 @@ def HexagonItinerariesV65 :
       ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
                             CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
                             CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
-                            CVI_ALL_NOMEM],
+                            CVI_ALL_NOMEM, CVI_ZW],
                             [Hex_FWD, HVX_FWD],
                             HexagonV65ItinList.ItinList>;
 
diff --git a/lib/Target/Hexagon/HexagonScheduleV66.td b/lib/Target/Hexagon/HexagonScheduleV66.td
new file mode 100644
index 0000000000000000000000000000000000000000..38e3d21d370174133bf010918b22cb1651b35565
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonScheduleV66.td
@@ -0,0 +1,41 @@
+//=-HexagonScheduleV66.td - HexagonV66 Scheduling Definitions *- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// ScalarItin and HVXItin contain some old itineraries
+// still used by a handful of instructions. Hopefully, we will be able
+// to get rid of them soon.
+
+def HexagonV66ItinList : DepScalarItinV66, ScalarItin,
+                         DepHVXItinV66, HVXItin, PseudoItin {
+  list<InstrItinData> ItinList =
+    !listconcat(DepScalarItinV66_list, ScalarItin_list,
+                DepHVXItinV66_list, HVXItin_list, PseudoItin_list);
+}
+
+def HexagonItinerariesV66 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+                            CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+                            CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
+                            CVI_ALL_NOMEM, CVI_ZW],
+                            [Hex_FWD, HVX_FWD],
+                            HexagonV66ItinList.ItinList>;
+
+def HexagonModelV66 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV66;
+  let LoadLatency = 1;
+  let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V66 Resource Definitions -
+//===----------------------------------------------------------------------===//
+
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 68e276be0f691c15ce3831024e6eaf71afb83bda..9c77135c2f2f686580d2c12463cec38a7768a25e 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -98,6 +98,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
       {"hexagonv60", Hexagon::ArchEnum::V60},
       {"hexagonv62", Hexagon::ArchEnum::V62},
       {"hexagonv65", Hexagon::ArchEnum::V65},
+      {"hexagonv66", Hexagon::ArchEnum::V66},
   };
 
   auto FoundIt = CpuTable.find(CPUString);
@@ -275,11 +276,11 @@ void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs *DAG) {
     if (!L0.mayLoad() || L0.mayStore() ||
         HII.getAddrMode(L0) != HexagonII::BaseImmOffset)
       continue;
-    int Offset0;
+    int64_t Offset0;
     unsigned Size0;
-    unsigned Base0 = HII.getBaseAndOffset(L0, Offset0, Size0);
+    MachineOperand *BaseOp0 = HII.getBaseAndOffset(L0, Offset0, Size0);
     // Is the access size is longer than the L1 cache line, skip the check.
-    if (Base0 == 0 || Size0 >= 32)
+    if (BaseOp0 == nullptr || !BaseOp0->isReg() || Size0 >= 32)
       continue;
     // Scan only up to 32 instructions ahead (to avoid n^2 complexity).
     for (unsigned j = i+1, m = std::min(i+32, e); j != m; ++j) {
@@ -288,10 +289,11 @@ void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs *DAG) {
       if (!L1.mayLoad() || L1.mayStore() ||
           HII.getAddrMode(L1) != HexagonII::BaseImmOffset)
         continue;
-      int Offset1;
+      int64_t Offset1;
       unsigned Size1;
-      unsigned Base1 = HII.getBaseAndOffset(L1, Offset1, Size1);
-      if (Base1 == 0 || Size1 >= 32 || Base0 != Base1)
+      MachineOperand *BaseOp1 = HII.getBaseAndOffset(L1, Offset1, Size1);
+      if (BaseOp1 == nullptr || !BaseOp1->isReg() || Size1 >= 32 ||
+          BaseOp0->getReg() != BaseOp1->getReg())
         continue;
       // Check bits 3 and 4 of the offset: if they differ, a bank conflict
       // is unlikely.
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index eaae4db6ba90a6ce502d6abcbbc27eaecdd9b044..3a5acb53682cd2dee84e6f821745a12dea14b225 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -52,10 +52,12 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
   bool UseNewValueJumps = false;
   bool UseNewValueStores = false;
   bool UseSmallData = false;
+  bool UseZRegOps = false;
 
   bool HasMemNoShuf = false;
   bool EnableDuplex = false;
   bool ReservedR19 = false;
+  bool NoreturnStackElim = false;
 
 public:
   Hexagon::ArchEnum HexagonArchVersion;
@@ -150,6 +152,12 @@ public:
   bool hasV65OpsOnly() const {
     return getHexagonArchVersion() == Hexagon::ArchEnum::V65;
   }
+  bool hasV66Ops() const {
+    return getHexagonArchVersion() >= Hexagon::ArchEnum::V66;
+  }
+  bool hasV66OpsOnly() const {
+    return getHexagonArchVersion() == Hexagon::ArchEnum::V66;
+  }
 
   bool useLongCalls() const { return UseLongCalls; }
   bool useMemops() const { return UseMemops; }
@@ -157,6 +165,7 @@ public:
   bool useNewValueJumps() const { return UseNewValueJumps; }
   bool useNewValueStores() const { return UseNewValueStores; }
   bool useSmallData() const { return UseSmallData; }
+  bool useZRegOps() const { return UseZRegOps; }
 
   bool useHVXOps() const {
     return HexagonHVXVersion > Hexagon::ArchEnum::NoArch;
@@ -168,6 +177,8 @@ public:
   bool hasReservedR19() const { return ReservedR19; }
   bool usePredicatedCalls() const;
 
+  bool noreturnStackElim() const { return NoreturnStackElim; }
+
   bool useBSBScheduling() const { return UseBSBScheduling; }
   bool enableMachineScheduler() const override;
 
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 184e5b774184fd502402f79bf4d3cca600e3348f..ddfda7e27793947dfa1df82d1565de16c58f2ea8 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -180,12 +180,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 extern "C" void LLVMInitializeHexagonTarget() {
   // Register the target.
   RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
@@ -222,7 +216,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
           "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
           "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048",
           TT, CPU, FS, Options, getEffectiveRelocModel(RM),
-          getEffectiveCodeModel(CM), (HexagonNoOpt ? CodeGenOpt::None : OL)),
+          getEffectiveCodeModel(CM, CodeModel::Small),
+          (HexagonNoOpt ? CodeGenOpt::None : OL)),
       TLOF(make_unique<HexagonTargetObjectFile>()) {
   initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
   initAsmInfo();
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 386cd14c827beaf3e6cb7e03dd8b332f5162a3cf..2185bf8eebc66d0fab86786a6e68bd5ae9748e7c 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -199,11 +199,10 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
 /// section.
 bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
       const TargetMachine &TM) const {
-  if (!isSmallDataEnabled(TM)) {
-    LLVM_DEBUG(dbgs() << "Small data is not available.\n");
-    return false;
-  }
-
+  bool HaveSData = isSmallDataEnabled(TM);
+  if (!HaveSData)
+    LLVM_DEBUG(dbgs() << "Small-data allocation is disabled, but symbols "
+                         "may have explicit section assignments...\n");
   // Only global variables, not functions.
   LLVM_DEBUG(dbgs() << "Checking if value is in small-data, -G"
                     << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
@@ -223,6 +222,12 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
     return IsSmall;
   }
 
+  // If sdata is disabled, stop the checks here.
+  if (!HaveSData) {
+    LLVM_DEBUG(dbgs() << "no, small-data allocation is disabled\n");
+    return false;
+  }
+
   if (GVar->isConstant()) {
     LLVM_DEBUG(dbgs() << "no, is a constant\n");
     return false;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index cb504b5c3d5d9bbcfd0491095de8cb107ef464cf..6543d831390064346a9185984783878e5607226f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -26,11 +26,9 @@ namespace llvm {
 /// instruction info tracks.
 namespace HexagonII {
   unsigned const TypeCVI_FIRST = TypeCVI_4SLOT_MPY;
-  unsigned const TypeCVI_LAST = TypeCVI_VX_LATE;
+  unsigned const TypeCVI_LAST = TypeCVI_ZW;
 
   enum SubTarget {
-    HasV4SubT     = 0x3f,
-    HasV5SubT     = 0x3e,
     HasV55SubT    = 0x3c,
     HasV60SubT    = 0x38,
   };
@@ -57,117 +55,117 @@ namespace HexagonII {
   // MCInstrDesc TSFlags
   // *** Must match HexagonInstrFormat*.td ***
   enum {
-    // This 5-bit field describes the insn type.
-    TypePos  = 0,
-    TypeMask = 0x3f,
+    // This 7-bit field describes the insn type.
+    TypePos = 0,
+    TypeMask = 0x7f,
 
     // Solo instructions.
-    SoloPos  = 6,
+    SoloPos = 7,
     SoloMask = 0x1,
     // Packed only with A or X-type instructions.
-    SoloAXPos  = 7,
+    SoloAXPos = 8,
     SoloAXMask = 0x1,
     // Only A-type instruction in first slot or nothing.
-    RestrictSlot1AOKPos  = 8,
+    RestrictSlot1AOKPos = 9,
     RestrictSlot1AOKMask = 0x1,
 
     // Predicated instructions.
-    PredicatedPos  = 9,
+    PredicatedPos = 10,
     PredicatedMask = 0x1,
-    PredicatedFalsePos  = 10,
+    PredicatedFalsePos = 11,
     PredicatedFalseMask = 0x1,
-    PredicatedNewPos  = 11,
+    PredicatedNewPos = 12,
     PredicatedNewMask = 0x1,
-    PredicateLatePos  = 12,
+    PredicateLatePos = 13,
     PredicateLateMask = 0x1,
 
     // New-Value consumer instructions.
-    NewValuePos  = 13,
+    NewValuePos = 14,
     NewValueMask = 0x1,
     // New-Value producer instructions.
-    hasNewValuePos  = 14,
+    hasNewValuePos = 15,
     hasNewValueMask = 0x1,
     // Which operand consumes or produces a new value.
-    NewValueOpPos  = 15,
+    NewValueOpPos = 16,
     NewValueOpMask = 0x7,
     // Stores that can become new-value stores.
-    mayNVStorePos  = 18,
+    mayNVStorePos = 19,
     mayNVStoreMask = 0x1,
     // New-value store instructions.
-    NVStorePos  = 19,
+    NVStorePos = 20,
     NVStoreMask = 0x1,
     // Loads that can become current-value loads.
-    mayCVLoadPos  = 20,
+    mayCVLoadPos = 21,
     mayCVLoadMask = 0x1,
     // Current-value load instructions.
-    CVLoadPos  = 21,
+    CVLoadPos = 22,
     CVLoadMask = 0x1,
 
     // Extendable insns.
-    ExtendablePos  = 22,
+    ExtendablePos = 23,
     ExtendableMask = 0x1,
     // Insns must be extended.
-    ExtendedPos  = 23,
+    ExtendedPos = 24,
     ExtendedMask = 0x1,
     // Which operand may be extended.
-    ExtendableOpPos  = 24,
+    ExtendableOpPos = 25,
     ExtendableOpMask = 0x7,
     // Signed or unsigned range.
-    ExtentSignedPos  = 27,
+    ExtentSignedPos = 28,
     ExtentSignedMask = 0x1,
     // Number of bits of range before extending operand.
-    ExtentBitsPos  = 28,
+    ExtentBitsPos = 29,
     ExtentBitsMask = 0x1f,
     // Alignment power-of-two before extending operand.
-    ExtentAlignPos  = 33,
+    ExtentAlignPos = 34,
     ExtentAlignMask = 0x3,
 
-    CofMax1Pos = 35,
+    CofMax1Pos = 36,
     CofMax1Mask = 0x1,
-    CofRelax1Pos = 36,
+    CofRelax1Pos = 37,
     CofRelax1Mask = 0x1,
-    CofRelax2Pos = 37,
+    CofRelax2Pos = 38,
     CofRelax2Mask = 0x1,
 
-    RestrictNoSlot1StorePos  = 38,
+    RestrictNoSlot1StorePos = 39,
     RestrictNoSlot1StoreMask = 0x1,
 
     // Addressing mode for load/store instructions.
-    AddrModePos  = 41,
+    AddrModePos = 42,
     AddrModeMask = 0x7,
     // Access size for load/store instructions.
-    MemAccessSizePos = 44,
+    MemAccessSizePos = 45,
     MemAccesSizeMask = 0xf,
 
     // Branch predicted taken.
-    TakenPos = 48,
+    TakenPos = 49,
     TakenMask = 0x1,
 
     // Floating-point instructions.
-    FPPos  = 49,
+    FPPos = 50,
     FPMask = 0x1,
 
     // New-Value producer-2 instructions.
-    hasNewValuePos2  = 51,
+    hasNewValuePos2 = 52,
     hasNewValueMask2 = 0x1,
     // Which operand consumes or produces a new value.
-    NewValueOpPos2  = 52,
+    NewValueOpPos2 = 53,
     NewValueOpMask2 = 0x7,
 
     // Accumulator instructions.
-    AccumulatorPos = 55,
+    AccumulatorPos = 56,
     AccumulatorMask = 0x1,
 
     // Complex XU, prevent xu competition by preferring slot3
-    PrefersSlot3Pos = 56,
+    PrefersSlot3Pos = 57,
     PrefersSlot3Mask = 0x1,
 
     // v65
-    HasTmpDstPos = 59,
+    HasTmpDstPos = 60,
     HasTmpDstMask = 0x1,
 
-    CVINewPos = 61,
-    CVINewMask = 0x1
+    CVINewPos = 62,
+    CVINewMask = 0x1,
   };
 
   // *** The code above must match HexagonInstrFormat*.td *** //
@@ -176,7 +174,7 @@ namespace HexagonII {
   enum HexagonMOTargetFlagVal {
     // Hexagon-specific MachineOperand target flags.
     //
-    // When chaning these, make sure to update
+    // When changing these, make sure to update
     // getSerializableDirectMachineOperandTargetFlags and
     // getSerializableBitmaskMachineOperandTargetFlags if needed.
     MO_NO_FLAG,
@@ -189,7 +187,8 @@ namespace HexagonII {
     MO_GOT,
 
     // Low or high part of a symbol.
-    MO_LO16, MO_HI16,
+    MO_LO16,
+    MO_HI16,
 
     // Offset from the base of the SDA.
     MO_GPREL,
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 8f3c09e7204f5407e40462f8c781dcbba6160543..92ce7345f3584c9de0d40ce065281f9537f4c5e2 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -71,6 +71,8 @@ cl::opt<bool> MV62("mv62", cl::Hidden, cl::desc("Build for Hexagon V62"),
                    cl::init(false));
 cl::opt<bool> MV65("mv65", cl::Hidden, cl::desc("Build for Hexagon V65"),
                    cl::init(false));
+cl::opt<bool> MV66("mv66", cl::Hidden, cl::desc("Build for Hexagon V66"),
+                   cl::init(false));
 } // namespace
 
 cl::opt<Hexagon::ArchEnum>
@@ -80,9 +82,10 @@ cl::opt<Hexagon::ArchEnum>
         clEnumValN(Hexagon::ArchEnum::V60, "v60", "Build for HVX v60"),
         clEnumValN(Hexagon::ArchEnum::V62, "v62", "Build for HVX v62"),
         clEnumValN(Hexagon::ArchEnum::V65, "v65", "Build for HVX v65"),
-        // Sentinal for no value specified
+        clEnumValN(Hexagon::ArchEnum::V66, "v66", "Build for HVX v66"),
+        // Sentinel for no value specified.
         clEnumValN(Hexagon::ArchEnum::Generic, "", "")),
-      // Sentinal for flag not present
+      // Sentinel for flag not present.
       cl::init(Hexagon::ArchEnum::NoArch), cl::ValueOptional);
 
 static cl::opt<bool>
@@ -103,6 +106,8 @@ static StringRef HexagonGetArchVariant() {
     return "hexagonv62";
   if (MV65)
     return "hexagonv65";
+  if (MV66)
+    return "hexagonv66";
   return "";
 }
 
@@ -289,11 +294,15 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
   case Hexagon::ArchEnum::V65:
     Result.push_back("+hvxv65");
     break;
+  case Hexagon::ArchEnum::V66:
+    Result.push_back("+hvxv66");
+    break;
   case Hexagon::ArchEnum::Generic:{
     Result.push_back(StringSwitch<StringRef>(CPU)
              .Case("hexagonv60", "+hvxv60")
              .Case("hexagonv62", "+hvxv62")
-             .Case("hexagonv65", "+hvxv65"));
+             .Case("hexagonv65", "+hvxv65")
+             .Case("hexagonv66", "+hvxv66"));
     break;
   }
   case Hexagon::ArchEnum::NoArch:
@@ -308,7 +317,7 @@ static bool isCPUValid(std::string CPU)
 {
   std::vector<std::string> table {
     "generic",    "hexagonv5",  "hexagonv55", "hexagonv60",
-    "hexagonv62", "hexagonv65",
+    "hexagonv62", "hexagonv65", "hexagonv66",
   };
 
   return std::find(table.begin(), table.end(), CPU) != table.end();
@@ -330,7 +339,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
   // turns on hvxvNN, corresponding to the existing ArchVNN.
   FeatureBitset FB = S;
   unsigned CpuArch = ArchV5;
-  for (unsigned F : {ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
+  for (unsigned F : {ArchV66, ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
     if (!FB.test(F))
       continue;
     CpuArch = F;
@@ -344,7 +353,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
     break;
   }
   bool HasHvxVer = false;
-  for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65}) {
+  for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65,
+                     ExtensionHVXV66}) {
     if (!FB.test(F))
       continue;
     HasHvxVer = true;
@@ -357,6 +367,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
 
   // HasHvxVer is false, and UseHvx is true.
   switch (CpuArch) {
+    case ArchV66:
+      FB.set(ExtensionHVXV66);
+      LLVM_FALLTHROUGH;
     case ArchV65:
       FB.set(ExtensionHVXV65);
       LLVM_FALLTHROUGH;
@@ -400,6 +413,7 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
     {"hexagonv60", ELF::EF_HEXAGON_MACH_V60},
     {"hexagonv62", ELF::EF_HEXAGON_MACH_V62},
     {"hexagonv65", ELF::EF_HEXAGON_MACH_V65},
+    {"hexagonv66", ELF::EF_HEXAGON_MACH_V66},
   };
 
   auto F = ElfFlags.find(STI.getCPU());
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 59f3caa6af9407c72f2d2c3087d5c53310124670..f4ee2bbfaaaa548ba698e2fe06c9c546ddb19bdf 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -138,6 +138,8 @@ void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
       UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2);
   (*TUL)[HexagonII::TypeCVI_SCATTER_NEW_ST] =
       UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+  (*TUL)[HexagonII::TypeCVI_4SLOT_MPY] = UnitsAndLanes(CVI_XLANE, 4);
+  (*TUL)[HexagonII::TypeCVI_ZW] = UnitsAndLanes(CVI_ZW, 1);
 }
 
 HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL,
@@ -300,6 +302,7 @@ bool HexagonShuffler::check() {
   // Number of memory operations, loads, solo loads, stores, solo stores, single
   // stores.
   unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0;
+  unsigned NonZCVIloads = 0, AllCVIloads = 0, CVIstores = 0;
   // Number of duplex insns
   unsigned duplex = 0;
   unsigned pSlot3Cnt = 0;
@@ -331,6 +334,11 @@ bool HexagonShuffler::check() {
     case HexagonII::TypeCVI_VM_TMP_LD:
     case HexagonII::TypeCVI_GATHER:
     case HexagonII::TypeCVI_GATHER_RST:
+      ++NonZCVIloads;
+      LLVM_FALLTHROUGH;
+    case HexagonII::TypeCVI_ZW:
+      ++AllCVIloads;
+      LLVM_FALLTHROUGH;
     case HexagonII::TypeLD:
       ++loads;
       ++memory;
@@ -348,6 +356,8 @@ bool HexagonShuffler::check() {
     case HexagonII::TypeCVI_SCATTER_RST:
     case HexagonII::TypeCVI_SCATTER_NEW_RST:
     case HexagonII::TypeCVI_SCATTER_NEW_ST:
+      ++CVIstores;
+      LLVM_FALLTHROUGH;
     case HexagonII::TypeST:
       ++stores;
       ++memory;
@@ -405,7 +415,11 @@ bool HexagonShuffler::check() {
   applySlotRestrictions();
 
   // Check if the packet is legal.
-  if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory))) {
+  const unsigned ZCVIloads = AllCVIloads - NonZCVIloads;
+  const bool ValidHVXMem =
+      NonZCVIloads <= 1 && ZCVIloads <= 1 && CVIstores <= 1;
+  if ((load0 > 1 || store0 > 1 || !ValidHVXMem) ||
+      (duplex > 1 || (duplex && memory))) {
     reportError(llvm::Twine("invalid instruction packet"));
     return false;
   }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
index 37f90bc46ac70cfa6903aee169a5ee7bc425561a..ef50c5bebbfb5e6dbe0e328dc3730a7c9e98be27 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -75,7 +75,8 @@ private:
     CVI_XLANE = 1 << 0,
     CVI_SHIFT = 1 << 1,
     CVI_MPY0 = 1 << 2,
-    CVI_MPY1 = 1 << 3
+    CVI_MPY1 = 1 << 3,
+    CVI_ZW = 1 << 4
   };
 
   // Count of adjacent slots that the insn requires to be executed.
diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp
index a18352738e132a2ad75471e4d884ec32ec3f505d..196768fdc56a3f01594c4621754329543a78febd 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -101,12 +101,12 @@ bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint(
   // the width doesn't overlap the offset of a higher memory access,
   // then the memory accesses are different.
   const TargetRegisterInfo *TRI = &getRegisterInfo();
-  unsigned BaseRegA = 0, BaseRegB = 0;
+  MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
   int64_t OffsetA = 0, OffsetB = 0;
   unsigned int WidthA = 0, WidthB = 0;
-  if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
-      getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
-    if (BaseRegA == BaseRegB) {
+  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
+      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
+    if (BaseOpA->isIdenticalTo(*BaseOpB)) {
       int LowOffset = std::min(OffsetA, OffsetB);
       int HighOffset = std::max(OffsetA, OffsetB);
       int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
@@ -755,9 +755,9 @@ unsigned LanaiInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
   return 0;
 }
 
-bool LanaiInstrInfo::getMemOpBaseRegImmOfsWidth(
-    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
-    const TargetRegisterInfo * /*TRI*/) const {
+bool LanaiInstrInfo::getMemOperandWithOffsetWidth(
+    MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
+    unsigned &Width, const TargetRegisterInfo * /*TRI*/) const {
   // Handle only loads/stores with base register followed by immediate offset
   // and with add as ALU op.
   if (LdSt.getNumOperands() != 4)
@@ -787,14 +787,17 @@ bool LanaiInstrInfo::getMemOpBaseRegImmOfsWidth(
     break;
   }
 
-  BaseReg = LdSt.getOperand(1).getReg();
+  BaseOp = &LdSt.getOperand(1);
   Offset = LdSt.getOperand(2).getImm();
+  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                            "operands of type register.");
   return true;
 }
 
-bool LanaiInstrInfo::getMemOpBaseRegImmOfs(
-    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
-    const TargetRegisterInfo *TRI) const {
+bool LanaiInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
+                                        MachineOperand *&BaseOp,
+                                        int64_t &Offset,
+                                        const TargetRegisterInfo *TRI) const {
   switch (LdSt.getOpcode()) {
   default:
     return false;
@@ -808,6 +811,6 @@ bool LanaiInstrInfo::getMemOpBaseRegImmOfs(
   case Lanai::LDBs_RI:
   case Lanai::LDBz_RI:
     unsigned Width;
-    return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
+    return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
   }
 }
diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h
index fe22fde2470b76c14e39b9b109326414c407fc93..bdcf9a361b5f0a4158384d4aa5d78cfcb5ed06df 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/lib/Target/Lanai/LanaiInstrInfo.h
@@ -68,13 +68,13 @@ public:
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
-  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
-                             int64_t &Offset,
-                             const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+                               int64_t &Offset,
+                               const TargetRegisterInfo *TRI) const override;
 
-  bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
-                                  int64_t &Offset, unsigned &Width,
-                                  const TargetRegisterInfo *TRI) const;
+  bool getMemOperandWithOffsetWidth(MachineInstr &LdSt, MachineOperand *&BaseOp,
+                                    int64_t &Offset, unsigned &Width,
+                                    const TargetRegisterInfo *TRI) const;
 
   std::pair<unsigned, unsigned>
   decomposeMachineOperandsTargetFlags(unsigned TF) const override;
diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp
index 2c21a53b13bb5719886c1b6f6ad7e9b51d686738..10bd9e2c65d2f20437d5a62c0c5da0d6c790bf4c 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -53,12 +53,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Medium;
-}
-
 LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,
                                        StringRef Cpu, StringRef FeatureString,
                                        const TargetOptions &Options,
@@ -67,7 +61,8 @@ LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,
                                        CodeGenOpt::Level OptLevel, bool JIT)
     : LLVMTargetMachine(T, computeDataLayout(), TT, Cpu, FeatureString, Options,
                         getEffectiveRelocModel(RM),
-                        getEffectiveCodeModel(CodeModel), OptLevel),
+                        getEffectiveCodeModel(CodeModel, CodeModel::Medium),
+                        OptLevel),
       Subtarget(TT, Cpu, FeatureString, *this, Options, getCodeModel(),
                 OptLevel),
       TLOF(new LanaiTargetObjectFile()) {
diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index 3f7d1860e9a9388d88dedc1dab8aed6c7569d928..3cc6da2c21aaf8d6912853ff3c46ad152c4f9f27 100644
--- a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -30,7 +30,9 @@
 
 #define DEBUG_TYPE "msp430-asm-parser"
 
-namespace llvm {
+using namespace llvm;
+
+namespace {
 
 /// Parses MSP430 assembly from a stream.
 class MSP430AsmParser : public MCTargetAsmParser {
@@ -49,6 +51,7 @@ class MSP430AsmParser : public MCTargetAsmParser {
                         SMLoc NameLoc, OperandVector &Operands) override;
 
   bool ParseDirective(AsmToken DirectiveID) override;
+  bool ParseDirectiveRefSym(AsmToken DirectiveID);
 
   unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
                                       unsigned Kind) override;
@@ -244,6 +247,7 @@ public:
     }
   }
 };
+} // end anonymous namespace
 
 bool MSP430AsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
                                               OperandVector &Operands,
@@ -404,6 +408,16 @@ bool MSP430AsmParser::ParseInstruction(ParseInstructionInfo &Info,
   return false;
 }
 
+bool MSP430AsmParser::ParseDirectiveRefSym(AsmToken DirectiveID) {
+    StringRef Name;
+    if (getParser().parseIdentifier(Name))
+      return TokError("expected identifier in directive");
+
+    MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+    getStreamer().EmitSymbolAttribute(Sym, MCSA_Global);
+    return false;
+}
+
 bool MSP430AsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal.lower() == ".long") {
@@ -412,6 +426,8 @@ bool MSP430AsmParser::ParseDirective(AsmToken DirectiveID) {
     ParseLiteralValues(2, DirectiveID.getLoc());
   } else if (IDVal.lower() == ".byte") {
     ParseLiteralValues(1, DirectiveID.getLoc());
+  } else if (IDVal.lower() == ".refsym") {
+    return ParseDirectiveRefSym(DirectiveID);
   }
   return true;
 }
@@ -558,5 +574,3 @@ unsigned MSP430AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
 
   return Match_InvalidOperand;
 }
-
-} // end of namespace llvm
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
index 30d077b5b58831f1b200bc136364f1f395e2ef25..e47db2400a05345944bb9f6c97aaca4037a318d2 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
@@ -34,7 +34,7 @@ protected:
     // Translate fixup kind to ELF relocation type.
     switch ((unsigned)Fixup.getKind()) {
     case FK_Data_1:                   return ELF::R_MSP430_8;
-    case FK_Data_2:                   return ELF::R_MSP430_16;
+    case FK_Data_2:                   return ELF::R_MSP430_16_BYTE;
     case FK_Data_4:                   return ELF::R_MSP430_32;
     case MSP430::fixup_32:            return ELF::R_MSP430_32;
     case MSP430::fixup_10_pcrel:      return ELF::R_MSP430_10_PCREL;
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
index ba9f7d7a9a5e86ee6be15f2a8ed4f61fd5caa33a..adf2384f6e99515bdae8300b11b4516172e79bc1 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
@@ -91,12 +91,11 @@ void MSP430MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
   Offset = 2;
 
   uint64_t BinaryOpCode = getBinaryCodeForInstr(MI, Fixups, STI);
-  const uint16_t *Words = reinterpret_cast<uint16_t const *>(&BinaryOpCode);
   size_t WordCount = Size / 2;
 
-  for (size_t i = 0; i < WordCount; ++i) {
-    uint16_t Word = Words[i];
-    support::endian::write(OS, Word, support::little);
+  while (WordCount--) {
+    support::endian::write(OS, (uint16_t)BinaryOpCode, support::little);
+    BinaryOpCode >>= 16;
   }
 }
 
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index ac93d7efc2b97a3ebeb833c45a2a67ead08d9ff4..73c8793aa01c08d337e8b4e7219aecc5fac403a1 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -217,8 +217,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
     // { RTLIB::NEG_F64,  "__mspabi_negd", ISD::SETCC_INVALID },
     // { RTLIB::NEG_F32,  "__mspabi_negf", ISD::SETCC_INVALID },
 
-    // TODO: SLL/SRA/SRL are in libgcc, RLL isn't
-
     // Universal Integer Operations - EABI Table 9
     { RTLIB::SDIV_I16,   "__mspabi_divi", ISD::SETCC_INVALID },
     { RTLIB::SDIV_I32,   "__mspabi_divli", ISD::SETCC_INVALID },
@@ -233,6 +231,13 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
     { RTLIB::UREM_I32,   "__mspabi_remul", ISD::SETCC_INVALID },
     { RTLIB::UREM_I64,   "__mspabi_remull", ISD::SETCC_INVALID },
 
+    // Bitwise Operations - EABI Table 10
+    // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc
+    { RTLIB::SRL_I32,    "__mspabi_srll", ISD::SETCC_INVALID },
+    { RTLIB::SRA_I32,    "__mspabi_sral", ISD::SETCC_INVALID },
+    { RTLIB::SHL_I32,    "__mspabi_slll", ISD::SETCC_INVALID },
+    // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc
+
   };
 
   for (const auto &LC : LibraryCalls) {
@@ -945,10 +950,20 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
   uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
 
   // Expand the stuff into sequence of shifts.
-  // FIXME: for some shift amounts this might be done better!
-  // E.g.: foo >> (8 + N) => sxt(swpb(foo)) >> N
   SDValue Victim = N->getOperand(0);
 
+  if ((Opc == ISD::SRA || Opc == ISD::SRL) && ShiftAmount >= 8) {
+    // foo >> (8 + N) => sxt(swpb(foo)) >> N
+    assert(VT == MVT::i16 && "Can not shift i8 by 8 and more");
+    Victim = DAG.getNode(ISD::BSWAP, dl, VT, Victim);
+    if (Opc == ISD::SRA)
+      Victim = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Victim,
+                           DAG.getValueType(MVT::i8));
+    else
+      Victim = DAG.getZeroExtendInReg(Victim, dl, MVT::i8);
+    ShiftAmount -= 8;
+  }
+
   if (Opc == ISD::SRL && ShiftAmount) {
     // Emit a special goodness here:
     // srl A, 1 => clrc; rrc A
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 01f44e266d7bc10b0cc65ab7965821a648820ed2..9f6ebba75ec66d7f032eaeddee1c9514b20e243b 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -32,12 +32,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 static std::string computeDataLayout(const Triple &TT, StringRef CPU,
                                      const TargetOptions &Options) {
   return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16";
@@ -51,7 +45,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
                                          CodeGenOpt::Level OL, bool JIT)
     : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS,
                         Options, getEffectiveRelocModel(RM),
-                        getEffectiveCodeModel(CM), OL),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 79e0c001a636cfdd6ca71cac52e015cbb1b58f87..78dfed68b9228fbab51a90e23ad92613017a2490 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -39,6 +39,7 @@
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -64,6 +65,11 @@ class MCInstrInfo;
 
 } // end namespace llvm
 
+static cl::opt<bool>
+EmitJalrReloc("mips-jalr-reloc", cl::Hidden,
+              cl::desc("MIPS: Emit R_{MICRO}MIPS_JALR relocation with jalr"),
+              cl::init(true));
+
 namespace {
 
 class MipsAssemblerOptions {
@@ -2065,9 +2071,21 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
     JalrInst.addOperand(MCOperand::createReg(Mips::RA));
     JalrInst.addOperand(MCOperand::createReg(Mips::T9));
 
-    // FIXME: Add an R_(MICRO)MIPS_JALR relocation after the JALR.
-    // This relocation is supposed to be an optimization hint for the linker
-    // and is not necessary for correctness.
+    if (EmitJalrReloc) {
+      // As an optimization hint for the linker, before the JALR we add:
+      // .reloc tmplabel, R_{MICRO}MIPS_JALR, symbol
+      // tmplabel:
+      MCSymbol *TmpLabel = getContext().createTempSymbol();
+      const MCExpr *TmpExpr = MCSymbolRefExpr::create(TmpLabel, getContext());
+      const MCExpr *RelocJalrExpr =
+          MCSymbolRefExpr::create(JalSym, MCSymbolRefExpr::VK_None,
+                                  getContext(), IDLoc);
+
+      TOut.getStreamer().EmitRelocDirective(*TmpExpr,
+          inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
+          RelocJalrExpr, IDLoc, *STI);
+      TOut.getStreamer().EmitLabel(TmpLabel);
+    }
 
     Inst = JalrInst;
     ExpandedJalSym = true;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 63f9151da6babaa50cb606a1d1fe1f22352558b4..265d1141cb0b22837bd5576e28a886a603cd3bca 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -339,6 +339,8 @@ Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
             (MCFixupKind)Mips::fixup_MICROMIPS_TLS_TPREL_HI16)
       .Case("R_MICROMIPS_TLS_TPREL_LO16",
             (MCFixupKind)Mips::fixup_MICROMIPS_TLS_TPREL_LO16)
+      .Case("R_MIPS_JALR", (MCFixupKind)Mips::fixup_Mips_JALR)
+      .Case("R_MICROMIPS_JALR", (MCFixupKind)Mips::fixup_MICROMIPS_JALR)
       .Default(MCAsmBackend::getFixupKind(Name));
 }
 
@@ -417,7 +419,9 @@ getFixupKindInfo(MCFixupKind Kind) const {
     { "fixup_MICROMIPS_TLS_TPREL_HI16",  0,     16,   0 },
     { "fixup_MICROMIPS_TLS_TPREL_LO16",  0,     16,   0 },
     { "fixup_Mips_SUB",                  0,     64,   0 },
-    { "fixup_MICROMIPS_SUB",             0,     64,   0 }
+    { "fixup_MICROMIPS_SUB",             0,     64,   0 },
+    { "fixup_Mips_JALR",                 0,     32,   0 },
+    { "fixup_MICROMIPS_JALR",            0,     32,   0 }
   };
   static_assert(array_lengthof(LittleEndianInfos) == Mips::NumTargetFixupKinds,
                 "Not all MIPS little endian fixup kinds added!");
@@ -495,7 +499,9 @@ getFixupKindInfo(MCFixupKind Kind) const {
     { "fixup_MICROMIPS_TLS_TPREL_HI16",  16,     16,   0 },
     { "fixup_MICROMIPS_TLS_TPREL_LO16",  16,     16,   0 },
     { "fixup_Mips_SUB",                   0,     64,   0 },
-    { "fixup_MICROMIPS_SUB",              0,     64,   0 }
+    { "fixup_MICROMIPS_SUB",              0,     64,   0 },
+    { "fixup_Mips_JALR",                  0,     32,   0 },
+    { "fixup_MICROMIPS_JALR",             0,     32,   0 }
   };
   static_assert(array_lengthof(BigEndianInfos) == Mips::NumTargetFixupKinds,
                 "Not all MIPS big endian fixup kinds added!");
@@ -553,6 +559,7 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
   case Mips::fixup_Mips_TLSLDM:
   case Mips::fixup_Mips_TPREL_HI:
   case Mips::fixup_Mips_TPREL_LO:
+  case Mips::fixup_Mips_JALR:
   case Mips::fixup_MICROMIPS_CALL16:
   case Mips::fixup_MICROMIPS_GOT_DISP:
   case Mips::fixup_MICROMIPS_GOT_PAGE:
@@ -565,6 +572,7 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
   case Mips::fixup_MICROMIPS_TLS_LDM:
   case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
   case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+  case Mips::fixup_MICROMIPS_JALR:
     return true;
   }
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index c600382446a84c97169cdc54f705536db53e331f..0e8de0577a8a93361be17187cfb0cfaf5f410ae7 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -401,6 +401,10 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
     return ELF::R_MICROMIPS_HIGHER;
   case Mips::fixup_MICROMIPS_HIGHEST:
     return ELF::R_MICROMIPS_HIGHEST;
+  case Mips::fixup_Mips_JALR:
+    return ELF::R_MIPS_JALR;
+  case Mips::fixup_MICROMIPS_JALR:
+    return ELF::R_MICROMIPS_JALR;
   }
 
   llvm_unreachable("invalid fixup kind!");
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index d7f6cf91db73fc0ad9b9920f3e859303a7cb7a38..eedad16dddc30686de2db04367adcbd13289329d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -222,6 +222,10 @@ namespace Mips {
     fixup_Mips_SUB,
     fixup_MICROMIPS_SUB,
 
+    // resulting in - R_MIPS_JALR/R_MICROMIPS_JALR
+    fixup_Mips_JALR,
+    fixup_MICROMIPS_JALR,
+
     // Marker
     LastTargetFixupKind,
     NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 7bcda12051a515a7ea1c11b9d6b2dc1f38c4417c..1506b4a83649ea6fada9e31198324d9e8ec4cf09 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -49,25 +49,5 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
   ExceptionsType = ExceptionHandling::DwarfCFI;
   DwarfRegNumForCFI = true;
   HasMipsExpressions = true;
-
-  // Enable IAS by default for O32.
-  if (TheTriple.isMIPS32())
-    UseIntegratedAssembler = true;
-
-  // Enable IAS by default for Debian mips64/mips64el.
-  if (TheTriple.getEnvironment() == Triple::GNUABI64)
-    UseIntegratedAssembler = true;
-
-  // Enable IAS by default for Debian mipsn32/mipsn32el.
-  if (TheTriple.getEnvironment() == Triple::GNUABIN32)
-    UseIntegratedAssembler = true;
-
-  // Enable IAS by default for Android mips64el that uses N64 ABI.
-  if (TheTriple.getArch() == Triple::mips64el && TheTriple.isAndroid())
-    UseIntegratedAssembler = true;
-
-  // Enable IAS by default for FreeBSD / OpenBSD mips64/mips64el.
-  if (TheTriple.isOSFreeBSD() ||
-      TheTriple.isOSOpenBSD())
-    UseIntegratedAssembler = true;
+  UseIntegratedAssembler = true;
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 8e880b635c15d03f3a77e887ebad24f72c410f4a..f43a4d980f92aac74211faa9ed6320f9e9ec3537 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -613,6 +613,9 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
     case MipsMCExpr::MEK_Special:
       llvm_unreachable("Unhandled fixup kind!");
       break;
+    case MipsMCExpr::MEK_DTPREL:
+      llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
+      break;
     case MipsMCExpr::MEK_CALL_HI16:
       FixupKind = Mips::fixup_Mips_CALL_HI16;
       break;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 0bddba781453003840b36a23d68c7ad5289229e2..99857e083c6ca880964945fb67c0ad3ceeb6022a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -43,6 +43,9 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   case MEK_Special:
     llvm_unreachable("MEK_None and MEK_Special are invalid");
     break;
+  case MEK_DTPREL:
+    llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
+    break;
   case MEK_CALL_HI16:
     OS << "%call_hi";
     break;
@@ -157,6 +160,8 @@ MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
     case MEK_None:
     case MEK_Special:
       llvm_unreachable("MEK_None and MEK_Special are invalid");
+    case MEK_DTPREL:
+      llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
     case MEK_DTPREL_HI:
     case MEK_DTPREL_LO:
     case MEK_GOT:
@@ -244,6 +249,9 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
   case MEK_Special:
     llvm_unreachable("MEK_None and MEK_Special are invalid");
     break;
+  case MEK_DTPREL:
+    llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
+    break;
   case MEK_CALL_HI16:
   case MEK_CALL_LO16:
   case MEK_GOT:
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 495d525ccff42d92f7c76a83737d144f7c18570e..bf3274ab5d173196bc1a1c01ae1f91d891c08137 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -22,6 +22,7 @@ public:
     MEK_None,
     MEK_CALL_HI16,
     MEK_CALL_LO16,
+    MEK_DTPREL,
     MEK_DTPREL_HI,
     MEK_DTPREL_LO,
     MEK_GOT,
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 16a2481a00d8996d1ac91c9a22482112a1f0695a..362431fd42a6c0a1910b07c0876290e1688c2577 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1204,18 +1204,23 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
 
 // Emit .dtprelword or .dtpreldword directive
 // and value for debug thread local expression.
-void MipsAsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
-                                          unsigned Size) const {
-  switch (Size) {
-  case 4:
-    OutStreamer->EmitDTPRel32Value(Value);
-    break;
-  case 8:
-    OutStreamer->EmitDTPRel64Value(Value);
-    break;
-  default:
-    llvm_unreachable("Unexpected size of expression value.");
+void MipsAsmPrinter::EmitDebugValue(const MCExpr *Value, unsigned Size) const {
+  if (auto *MipsExpr = dyn_cast<MipsMCExpr>(Value)) {
+    if (MipsExpr && MipsExpr->getKind() == MipsMCExpr::MEK_DTPREL) {
+      switch (Size) {
+      case 4:
+        OutStreamer->EmitDTPRel32Value(MipsExpr->getSubExpr());
+        break;
+      case 8:
+        OutStreamer->EmitDTPRel64Value(MipsExpr->getSubExpr());
+        break;
+      default:
+        llvm_unreachable("Unexpected size of expression value.");
+      }
+      return;
+    }
   }
+  AsmPrinter::EmitDebugValue(Value, Size);
 }
 
 // Align all targets of indirect branches on bundle size.  Used only if target
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index 999b6f896baed110ceed3408c2192a76d03dd9f3..eb58234e3e77b6b15cbf7df22feac82ca6b4f4bc 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -160,7 +160,7 @@ public:
   void EmitStartOfAsmFile(Module &M) override;
   void EmitEndOfAsmFile(Module &M) override;
   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-  void EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const override;
+  void EmitDebugValue(const MCExpr *Value, unsigned Size) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp
index fbc15649c8a3da62adcab66f77e03e5a27ec28fb..4964ca9cbe15048f4553029f149461a0dae96e0b 100644
--- a/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -145,6 +145,33 @@ bool MipsInstructionSelector::select(MachineInstr &I,
              .addMemOperand(*I.memoperands_begin());
     break;
   }
+  case G_UDIV:
+  case G_UREM:
+  case G_SDIV:
+  case G_SREM: {
+    unsigned HILOReg = MRI.createVirtualRegister(&Mips::ACC64RegClass);
+    bool IsSigned = I.getOpcode() == G_SREM || I.getOpcode() == G_SDIV;
+    bool IsDiv = I.getOpcode() == G_UDIV || I.getOpcode() == G_SDIV;
+
+    MachineInstr *PseudoDIV, *PseudoMove;
+    PseudoDIV = BuildMI(MBB, I, I.getDebugLoc(),
+                        TII.get(IsSigned ? Mips::PseudoSDIV : Mips::PseudoUDIV))
+                    .addDef(HILOReg)
+                    .add(I.getOperand(1))
+                    .add(I.getOperand(2));
+    if (!constrainSelectedInstRegOperands(*PseudoDIV, TII, TRI, RBI))
+      return false;
+
+    PseudoMove = BuildMI(MBB, I, I.getDebugLoc(),
+                         TII.get(IsDiv ? Mips::PseudoMFLO : Mips::PseudoMFHI))
+                     .addDef(I.getOperand(0).getReg())
+                     .addUse(HILOReg);
+    if (!constrainSelectedInstRegOperands(*PseudoMove, TII, TRI, RBI))
+      return false;
+
+    I.eraseFromParent();
+    return true;
+  }
   case G_CONSTANT: {
     int Imm = I.getOperand(1).getCImm()->getValue().getLimitedValue();
     unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
@@ -258,8 +285,8 @@ bool MipsInstructionSelector::select(MachineInstr &I,
 
     MachineIRBuilder B(I);
     for (const struct Instr &Instruction : Instructions) {
-      MachineInstrBuilder MIB =
-          B.buildInstr(Instruction.Opcode, Instruction.Def, Instruction.LHS);
+      MachineInstrBuilder MIB = B.buildInstr(
+          Instruction.Opcode, {Instruction.Def}, {Instruction.LHS});
 
       if (Instruction.hasImm())
         MIB.addImm(Instruction.RHS);
diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp
index 02701f31e32947e4952280118350e2fd7058cece..0d80bd479d5eb7397e3fe592a45acc3863c5a9da 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -20,29 +20,40 @@ using namespace llvm;
 MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
   using namespace TargetOpcode;
 
+  const LLT s1 = LLT::scalar(1);
   const LLT s32 = LLT::scalar(32);
   const LLT s64 = LLT::scalar(64);
   const LLT p0 = LLT::pointer(0, 32);
 
   getActionDefinitionsBuilder(G_ADD)
       .legalFor({s32})
-      .minScalar(0, s32)
-      .customFor({s64});
+      .clampScalar(0, s32, s32);
+
+  getActionDefinitionsBuilder(G_UADDE)
+      .lowerFor({{s32, s1}});
 
   getActionDefinitionsBuilder({G_LOAD, G_STORE})
       .legalForCartesianProduct({p0, s32}, {p0});
 
-  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR, G_SHL, G_ASHR, G_LSHR})
+  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
+      .legalFor({s32})
+      .clampScalar(0, s32, s32);
+
+  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
       .legalFor({s32});
 
+  getActionDefinitionsBuilder({G_SDIV, G_SREM, G_UREM, G_UDIV})
+      .legalFor({s32})
+      .minScalar(0, s32)
+      .libcallFor({s64});
+
   getActionDefinitionsBuilder(G_ICMP)
       .legalFor({{s32, s32}})
       .minScalar(0, s32);
 
   getActionDefinitionsBuilder(G_CONSTANT)
       .legalFor({s32})
-      .minScalar(0, s32)
-      .customFor({s64});
+      .clampScalar(0, s32, s32);
 
   getActionDefinitionsBuilder(G_GEP)
       .legalFor({{p0, s32}});
@@ -59,64 +70,12 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
 
 bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
-                                       MachineIRBuilder &MIRBuilder) const {
+                                       MachineIRBuilder &MIRBuilder,
+                                       GISelChangeObserver &Observer) const {
 
   using namespace TargetOpcode;
 
   MIRBuilder.setInstr(MI);
 
-  switch (MI.getOpcode()) {
-  case G_ADD: {
-    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-
-    const LLT sHalf = LLT::scalar(Size / 2);
-
-    unsigned RHSLow = MRI.createGenericVirtualRegister(sHalf);
-    unsigned RHSHigh = MRI.createGenericVirtualRegister(sHalf);
-    unsigned LHSLow = MRI.createGenericVirtualRegister(sHalf);
-    unsigned LHSHigh = MRI.createGenericVirtualRegister(sHalf);
-    unsigned ResLow = MRI.createGenericVirtualRegister(sHalf);
-    unsigned ResHigh = MRI.createGenericVirtualRegister(sHalf);
-    unsigned Carry = MRI.createGenericVirtualRegister(sHalf);
-    unsigned TmpResHigh = MRI.createGenericVirtualRegister(sHalf);
-
-    MIRBuilder.buildUnmerge({RHSLow, RHSHigh}, MI.getOperand(2).getReg());
-    MIRBuilder.buildUnmerge({LHSLow, LHSHigh}, MI.getOperand(1).getReg());
-
-    MIRBuilder.buildAdd(TmpResHigh, LHSHigh, RHSHigh);
-    MIRBuilder.buildAdd(ResLow, LHSLow, RHSLow);
-    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, Carry, ResLow, LHSLow);
-    MIRBuilder.buildAdd(ResHigh, TmpResHigh, Carry);
-
-    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResLow, ResHigh});
-
-    MI.eraseFromParent();
-    break;
-  }
-  case G_CONSTANT: {
-
-    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-    const LLT sHalf = LLT::scalar(Size / 2);
-
-    const APInt &CImmValue = MI.getOperand(1).getCImm()->getValue();
-
-    unsigned ResLow = MRI.createGenericVirtualRegister(sHalf);
-    unsigned ResHigh = MRI.createGenericVirtualRegister(sHalf);
-    MIRBuilder.buildConstant(
-        ResLow, *ConstantInt::get(MI.getMF()->getFunction().getContext(),
-                                  CImmValue.trunc(Size / 2)));
-    MIRBuilder.buildConstant(
-        ResHigh, *ConstantInt::get(MI.getMF()->getFunction().getContext(),
-                                   CImmValue.lshr(Size / 2).trunc(Size / 2)));
-
-    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResLow, ResHigh});
-
-    MI.eraseFromParent();
-    break;
-  }
-  default:
-    return false;
-  }
-
-  return true;
+  return false;
 }
diff --git a/lib/Target/Mips/MipsLegalizerInfo.h b/lib/Target/Mips/MipsLegalizerInfo.h
index 96d95746ac879841a3161fd84c704812075df3de..75fadd6cf6139e179ce43163d46e2aeba0db390c 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.h
+++ b/lib/Target/Mips/MipsLegalizerInfo.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MIPSMACHINELEGALIZER_H
 #define LLVM_LIB_TARGET_MIPS_MIPSMACHINELEGALIZER_H
 
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 
 namespace llvm {
@@ -26,7 +27,8 @@ public:
   MipsLegalizerInfo(const MipsSubtarget &ST);
 
   bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
-                      MachineIRBuilder &MIRBuilder) const override;
+                      MachineIRBuilder &MIRBuilder,
+                      GISelChangeObserver &Observer) const override;
 };
 } // end namespace llvm
 #endif
diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp
index a0c80048c024c735266fab33a6f97f1c3abc8d3c..53b9e42dc63a80070bc4047a6b575c301747ba88 100644
--- a/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -93,6 +93,10 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case G_SHL:
   case G_ASHR:
   case G_LSHR:
+  case G_SDIV:
+  case G_UDIV:
+  case G_SREM:
+  case G_UREM:
     OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
     break;
   case G_CONSTANT:
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index d745ce00149011b7b850a8d523aea8d64f6160c7..a78e544c35f01b8a5a116f18b0e7967e09b084c3 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -2360,24 +2360,6 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
   }
 }
 
-/// Check if the given BuildVectorSDNode is a splat.
-/// This method currently relies on DAG nodes being reused when equivalent,
-/// so it's possible for this to return false even when isConstantSplat returns
-/// true.
-static bool isSplatVector(const BuildVectorSDNode *N) {
-  unsigned int nOps = N->getNumOperands();
-  assert(nOps > 1 && "isSplatVector has 0 or 1 sized build vector");
-
-  SDValue Operand0 = N->getOperand(0);
-
-  for (unsigned int i = 1; i < nOps; ++i) {
-    if (N->getOperand(i) != Operand0)
-      return false;
-  }
-
-  return true;
-}
-
 // Lower ISD::EXTRACT_VECTOR_ELT into MipsISD::VEXTRACT_SEXT_ELT.
 //
 // The non-value bits resulting from ISD::EXTRACT_VECTOR_ELT are undefined. We
@@ -2488,7 +2470,7 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
       Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result);
 
     return Result;
-  } else if (isSplatVector(Node))
+  } else if (DAG.isSplatValue(Op, /* AllowUndefs */ false))
     return Op;
   else if (!isConstantOrUndefBUILD_VECTOR(Node)) {
     // Use INSERT_VECTOR_ELT operations rather than expand to stores.
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index c8df44d9f0b2c79479092d8d34a52d7a201b3617..9cc91d302893d94994e62e1f0d4be1bc1037b5bc 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -101,12 +101,6 @@ static Reloc::Model getEffectiveRelocModel(bool JIT,
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 // On function prologue, the stack is created by decrementing
 // its pointer. Once decremented, all references are done with positive
 // offset from the stack/frame pointer, using StackGrowsUp enables
@@ -121,7 +115,7 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
                                      bool isLittle)
     : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
                         CPU, FS, Options, getEffectiveRelocModel(JIT, RM),
-                        getEffectiveCodeModel(CM), OL),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       isLittle(isLittle), TLOF(llvm::make_unique<MipsTargetObjectFile>()),
       ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
       Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this,
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index f767c83219883e03f90ea4bedcc57ec53885677c..f53ee0631b5e659448d994b3ee53b41846748984 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -10,6 +10,7 @@
 #include "MipsTargetObjectFile.h"
 #include "MipsSubtarget.h"
 #include "MipsTargetMachine.h"
+#include "MCTargetDesc/MipsMCExpr.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -189,6 +190,7 @@ const MCExpr *
 MipsTargetObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
   const MCExpr *Expr =
       MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
-  return MCBinaryExpr::createAdd(
+  Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(0x8000, getContext()), getContext());
+  return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL, Expr, getContext());
 }
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index aec0d7db81a8c715052dc58e59837323bfdfca8f..63e227a12edc06b479d324e95c8cc1ace62d66e6 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -219,11 +219,12 @@ void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
     return;
   }
 
+  const NVPTXSubtarget &STI = MI->getMF()->getSubtarget<NVPTXSubtarget>();
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
 
     MCOperand MCOp;
-    if (!nvptxSubtarget->hasImageHandles()) {
+    if (!STI.hasImageHandles()) {
       if (lowerImageHandleOperand(MI, i, MCOp)) {
         OutMI.addOperand(MCOp);
         continue;
@@ -329,11 +330,12 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
 
 void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
-  const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
+  const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
+  const TargetLowering *TLI = STI.getTargetLowering();
 
   Type *Ty = F->getReturnType();
 
-  bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
+  bool isABI = (STI.getSmVersion() >= 20);
 
   if (Ty->getTypeID() == Type::VoidTyID)
     return;
@@ -474,7 +476,6 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
 }
 
 bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) {
-  nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>();
   bool Result = AsmPrinter::runOnMachineFunction(F);
   // Emit closing brace for the body of function F.
   // The closing brace must be emitted here because we need to emit additional
@@ -508,8 +509,9 @@ void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
     OutStreamer->AddComment(Twine("implicit-def: ") +
                             getVirtualRegisterName(RegNo));
   } else {
+    const NVPTXSubtarget &STI = MI->getMF()->getSubtarget<NVPTXSubtarget>();
     OutStreamer->AddComment(Twine("implicit-def: ") +
-                            nvptxSubtarget->getRegisterInfo()->getName(RegNo));
+                            STI.getRegisterInfo()->getName(RegNo));
   }
   OutStreamer->AddBlankLine();
 }
@@ -1431,12 +1433,14 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
 void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
   const AttributeList &PAL = F->getAttributes();
-  const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
+  const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
+  const TargetLowering *TLI = STI.getTargetLowering();
   Function::const_arg_iterator I, E;
   unsigned paramIndex = 0;
   bool first = true;
   bool isKernelFunc = isKernelFunction(*F);
-  bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
+  bool isABI = (STI.getSmVersion() >= 20);
+  bool hasImageHandles = STI.hasImageHandles();
   MVT thePointerTy = TLI->getPointerTy(DL);
 
   if (F->arg_empty()) {
@@ -1460,7 +1464,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
         if (isImage(*I)) {
           std::string sname = I->getName();
           if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
-            if (nvptxSubtarget->hasImageHandles())
+            if (hasImageHandles)
               O << "\t.param .u64 .ptr .surfref ";
             else
               O << "\t.param .surfref ";
@@ -1468,7 +1472,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
             O << "_param_" << paramIndex;
           }
           else { // Default image is read_only
-            if (nvptxSubtarget->hasImageHandles())
+            if (hasImageHandles)
               O << "\t.param .u64 .ptr .texref ";
             else
               O << "\t.param .texref ";
@@ -1476,7 +1480,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
             O << "_param_" << paramIndex;
           }
         } else {
-          if (nvptxSubtarget->hasImageHandles())
+          if (hasImageHandles)
             O << "\t.param .u64 .ptr .samplerref ";
           else
             O << "\t.param .samplerref ";
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index efe98003b1c8f80e2204852926128b7d78fd4b5a..44a09f5fe513039be490bbcd3f2b914b184d3a6b 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -258,9 +258,6 @@ private:
   typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
   VRegRCMap VRegMapping;
 
-  // Cache the subtarget here.
-  const NVPTXSubtarget *nvptxSubtarget;
-
   // List of variables demoted to a function scope.
   std::map<const Function *, std::vector<const GlobalVariable *>> localDecls;
 
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c352b9b9c9dc66a9f4f76649056e2a1b87d1dcff..5c16c34e21d8551f1de742ebe6724260e9d0c190 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -180,6 +180,18 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
     return;
   }
 
+  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
+  if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    auto const *SL = DL.getStructLayout(STy);
+    auto ElementNum = 0;
+    for(auto *EI : STy->elements()) {
+      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
+                         StartingOffset + SL->getElementOffset(ElementNum));
+      ++ElementNum;
+    }
+    return;
+  }
+
   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
     EVT VT = TempVTs[i];
@@ -1649,7 +1661,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     }
   }
 
-  if (!Func) {
+  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
+  // between them we must rely on the call site value which is valid for
+  // indirect calls but is always null for libcalls.
+  bool isIndirectCall = !Func && CS;
+
+  if (isIndirectCall) {
     // This is indirect function call case : PTX requires a prototype of the
     // form
     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
@@ -1673,7 +1690,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
   };
   // We model convergent calls as separate opcodes.
-  unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
+  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
   if (CLI.IsConvergent)
     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
                                               : NVPTXISD::PrintConvergentCall;
@@ -1707,12 +1724,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue CallArgEndOps[] = { Chain,
-                              DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
+                              DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
                               InFlag };
   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
   InFlag = Chain.getValue(1);
 
-  if (!Func) {
+  if (isIndirectCall) {
     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue PrototypeOps[] = { Chain,
                                DAG.getConstant(uniqueCallSite, dl, MVT::i32),
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index f0fbe5033fa8fbfa70c799b012365ee5c84075cf..48db941db9b80a7489d833e670a9bf0f891e7b89 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1318,9 +1318,6 @@ def ROTR64reg_sw :
 
 // Create SDNodes so they can be used in the DAG code, e.g.
 // NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
-def SDTIntShiftDOp :
-  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
-                       SDTCisInt<0>, SDTCisInt<3>]>;
 def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
 def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
 
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index b9cce28726dd826a7b82cf2f64ccf02c3097c9ee..8c009aed887706c1238c328c604d7a6cc09a9824 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -102,12 +102,6 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
   return Ret;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                        StringRef CPU, StringRef FS,
                                        const TargetOptions &Options,
@@ -118,7 +112,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
     // specified, as it is the only relocation model currently supported.
     : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                         CPU, FS, Options, Reloc::PIC_,
-                        getEffectiveCodeModel(CM), OL),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
       TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
diff --git a/lib/Target/Nios2/LLVMBuild.txt b/lib/Target/Nios2/LLVMBuild.txt
index 0125bbedea58f264fb7674096e31246c7bdffda3..46dd93308c0412da965ee365a6198500124af7df 100644
--- a/lib/Target/Nios2/LLVMBuild.txt
+++ b/lib/Target/Nios2/LLVMBuild.txt
@@ -53,6 +53,7 @@ required_libraries = AsmPrinter
                      Core
                      GlobalISel
                      MC
+                     Nios2AsmPrinter
                      Nios2Desc
                      Nios2Info
                      SelectionDAG
diff --git a/lib/Target/Nios2/Nios2TargetMachine.cpp b/lib/Target/Nios2/Nios2TargetMachine.cpp
index b7594dde709d649773018582d3bdc289daedd75b..4f90db9e6f6a50e3d4bf6ed8482183765f16a2a1 100644
--- a/lib/Target/Nios2/Nios2TargetMachine.cpp
+++ b/lib/Target/Nios2/Nios2TargetMachine.cpp
@@ -37,23 +37,15 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
-                                              Reloc::Model RM, bool JIT) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 Nios2TargetMachine::Nios2TargetMachine(const Target &T, const Triple &TT,
                                        StringRef CPU, StringRef FS,
                                        const TargetOptions &Options,
                                        Optional<Reloc::Model> RM,
                                        Optional<CodeModel::Model> CM,
                                        CodeGenOpt::Level OL, bool JIT)
-    : LLVMTargetMachine(
-          T, computeDataLayout(), TT, CPU, FS, Options,
-          getEffectiveRelocModel(RM),
-          getEffectiveCodeModel(CM, getEffectiveRelocModel(RM), JIT), OL),
+    : LLVMTargetMachine(T, computeDataLayout(), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(RM),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(make_unique<Nios2TargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index ff27768128452107a80f090a75dab5f8367e0c72..3130d10fa5ed6eb279841576c533feb74929668b 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -10,6 +10,7 @@ tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM PPCGenExegesis.inc -gen-exegesis)
 
 add_public_tablegen_target(PowerPCCommonTableGen)
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 7c0f5f0eb30c889bb68df2fd26ac4c0b26cbc631..8c15ade6f9c4fd33405c92465475bc0937d74e9c 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -13,18 +13,13 @@
 
 #include "MCTargetDesc/PPCFixupKinds.h"
 #include "PPCInstrInfo.h"
+#include "PPCMCCodeEmitter.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCFixup.h"
-#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -39,117 +34,6 @@ using namespace llvm;
 
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 
-namespace {
-
-class PPCMCCodeEmitter : public MCCodeEmitter {
-  const MCInstrInfo &MCII;
-  const MCContext &CTX;
-  bool IsLittleEndian;
-
-public:
-  PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
-      : MCII(mcii), CTX(ctx),
-        IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {}
-  PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete;
-  void operator=(const PPCMCCodeEmitter &) = delete;
-  ~PPCMCCodeEmitter() override = default;
-
-  unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
-                               SmallVectorImpl<MCFixup> &Fixups,
-                               const MCSubtargetInfo &STI) const;
-  unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-  unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
-                                  SmallVectorImpl<MCFixup> &Fixups,
-                                  const MCSubtargetInfo &STI) const;
-  unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
-                                SmallVectorImpl<MCFixup> &Fixups,
-                                const MCSubtargetInfo &STI) const;
-  unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-  unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
-                            SmallVectorImpl<MCFixup> &Fixups,
-                            const MCSubtargetInfo &STI) const;
-  unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-  unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
-                               SmallVectorImpl<MCFixup> &Fixups,
-                               const MCSubtargetInfo &STI) const;
-  unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
-                              SmallVectorImpl<MCFixup> &Fixups,
-                              const MCSubtargetInfo &STI) const;
-  unsigned getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
-                              SmallVectorImpl<MCFixup> &Fixups,
-                              const MCSubtargetInfo &STI) const;
-  unsigned getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
-                              SmallVectorImpl<MCFixup> &Fixups,
-                              const MCSubtargetInfo &STI) const;
-  unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-  unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
-                              SmallVectorImpl<MCFixup> &Fixups,
-                              const MCSubtargetInfo &STI) const;
-  unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
-                               SmallVectorImpl<MCFixup> &Fixups,
-                               const MCSubtargetInfo &STI) const;
-
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-
-  // getBinaryCodeForInstr - TableGen'erated function for getting the
-  // binary encoding for an instruction.
-  uint64_t getBinaryCodeForInstr(const MCInst &MI,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
-                         SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const override {
-    verifyInstructionPredicates(MI,
-                                computeAvailableFeatures(STI.getFeatureBits()));
-
-    unsigned Opcode = MI.getOpcode();
-    const MCInstrDesc &Desc = MCII.get(Opcode);
-
-    uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
-
-    // Output the constant in big/little endian byte order.
-    unsigned Size = Desc.getSize();
-    support::endianness E = IsLittleEndian ? support::little : support::big;
-    switch (Size) {
-    case 0:
-      break;
-    case 4:
-      support::endian::write<uint32_t>(OS, Bits, E);
-      break;
-    case 8:
-      // If we emit a pair of instructions, the first one is
-      // always in the top 32 bits, even on little-endian.
-      support::endian::write<uint32_t>(OS, Bits >> 32, E);
-      support::endian::write<uint32_t>(OS, Bits, E);
-      break;
-    default:
-      llvm_unreachable("Invalid instruction size");
-    }
-
-    ++MCNumEmitted;  // Keep track of the # of mi's emitted.
-  }
-
-private:
-  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
-  void verifyInstructionPredicates(const MCInst &MI,
-                                   uint64_t AvailableFeatures) const;
-};
-
-} // end anonymous namespace
-
 MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII,
                                             const MCRegisterInfo &MRI,
                                             MCContext &Ctx) {
@@ -396,5 +280,42 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
   return MO.getImm();
 }
 
+void PPCMCCodeEmitter::encodeInstruction(
+    const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+
+  // Output the constant in big/little endian byte order.
+  unsigned Size = getInstSizeInBytes(MI);
+  support::endianness E = IsLittleEndian ? support::little : support::big;
+  switch (Size) {
+  case 0:
+    break;
+  case 4:
+    support::endian::write<uint32_t>(OS, Bits, E);
+    break;
+  case 8:
+    // If we emit a pair of instructions, the first one is
+    // always in the top 32 bits, even on little-endian.
+    support::endian::write<uint32_t>(OS, Bits >> 32, E);
+    support::endian::write<uint32_t>(OS, Bits, E);
+    break;
+  default:
+    llvm_unreachable("Invalid instruction size");
+  }
+
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+// Get the number of bytes used to encode the given MCInst.
+unsigned PPCMCCodeEmitter::getInstSizeInBytes(const MCInst &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MCII.get(Opcode);
+  return Desc.getSize();
+}
+
 #define ENABLE_INSTR_PREDICATE_VERIFIER
 #include "PPCGenMCCodeEmitter.inc"
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4bcff4b9450c31e6e9882bf6baa1fdcee3b1037
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
@@ -0,0 +1,109 @@
+//===-- PPCMCCodeEmitter.h - Convert PPC code to machine code -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_MCCODEEMITTER_PPCCODEEMITTER_H
+#define LLVM_LIB_TARGET_PPC_MCCODEEMITTER_PPCCODEEMITTER_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+
+namespace llvm {
+
+class PPCMCCodeEmitter : public MCCodeEmitter {
+  const MCInstrInfo &MCII;
+  const MCContext &CTX;
+  bool IsLittleEndian;
+
+public:
+  PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+      : MCII(mcii), CTX(ctx),
+        IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {}
+  PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete;
+  void operator=(const PPCMCCodeEmitter &) = delete;
+  ~PPCMCCodeEmitter() override = default;
+
+  unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+  unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+  unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+  unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  // Get the number of bytes used to encode the given MCInst.
+  unsigned getInstSizeInBytes(const MCInst &MI) const;
+
+private:
+  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+  void verifyInstructionPredicates(const MCInst &MI,
+                                   uint64_t AvailableFeatures) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_PPC_MCCODEEMITTER_PPCCODEEMITTER_H
diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td
index c6cbb9037ede1f4867fa687b29eda25f86d41189..17c37964c5622ecebe3e546fbb8354c3ea5a43e6 100644
--- a/lib/Target/PowerPC/P9InstrResources.td
+++ b/lib/Target/PowerPC/P9InstrResources.td
@@ -111,11 +111,11 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
     (instregex "CNT(L|T)Z(D|W)(8)?(o)?$"),
     (instregex "POPCNT(D|W)$"),
     (instregex "CMPB(8)?$"),
+    (instregex "SETB(8)?$"),
     XSTDIVDP,
     XSTSQRTDP,
     XSXSIGDP,
     XSCVSPDPN,
-    SETB,
     BPERMD
 )>;
 
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 80ad4962a20f4adb8f9ccdf767823d8552f19cca..98e6e98e69744a9e04f564b4b4e7ed3b440cf443 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -305,11 +305,11 @@ def : Processor<"generic", G3Itineraries, [Directive32, FeatureHardFloat,
                                            FeatureMFTB]>;
 def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL,
                                           FeatureFRES, FeatureFRSQRTE,
-                                          FeatureICBT, FeatureBookE, 
+                                          FeatureICBT, FeatureBookE,
                                           FeatureMSYNC, FeatureMFTB]>;
 def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
                                           FeatureFRES, FeatureFRSQRTE,
-                                          FeatureICBT, FeatureBookE, 
+                                          FeatureICBT, FeatureBookE,
                                           FeatureMSYNC, FeatureMFTB]>;
 def : Processor<"601", G3Itineraries, [Directive601, FeatureFPU]>;
 def : Processor<"602", G3Itineraries, [Directive602, FeatureFPU,
@@ -348,7 +348,7 @@ def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
                                             FeatureFRES, FeatureFRSQRTE,
                                             FeatureMFTB]>;
 def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
-                                           FeatureFRES, FeatureFRSQRTE, 
+                                           FeatureFRES, FeatureFRSQRTE,
                                            FeatureMFTB]>;
 
 def : ProcessorModel<"970", G5Model,
@@ -369,11 +369,11 @@ def : ProcessorModel<"e500", PPCE500Model,
                    FeatureISEL, FeatureMFTB]>;
 def : ProcessorModel<"e500mc", PPCE500mcModel,
                   [DirectiveE500mc,
-                   FeatureSTFIWX, FeatureICBT, FeatureBookE, 
+                   FeatureSTFIWX, FeatureICBT, FeatureBookE,
                    FeatureISEL, FeatureMFTB]>;
 def : ProcessorModel<"e5500", PPCE5500Model,
                   [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
-                   FeatureSTFIWX, FeatureICBT, FeatureBookE, 
+                   FeatureSTFIWX, FeatureICBT, FeatureBookE,
                    FeatureISEL, FeatureMFTB]>;
 def : ProcessorModel<"a2", PPCA2Model,
                   [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
@@ -428,7 +428,7 @@ def : ProcessorModel<"pwr6x", G5Model,
                    FeatureMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>;
 def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>;
-def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.Power9FeatureList>; 
+def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.Power9FeatureList>;
 def : Processor<"ppc", G3Itineraries, [Directive32, FeatureHardFloat,
                                        FeatureMFTB]>;
 def : Processor<"ppc32", G3Itineraries, [Directive32, FeatureHardFloat,
@@ -478,3 +478,9 @@ def PPC : Target {
   let AssemblyParserVariants = [PPCAsmParserVariant];
   let AllowRegisterRenaming = 1;
 }
+
+//===----------------------------------------------------------------------===//
+// Pfm Counters
+//===----------------------------------------------------------------------===//
+
+include "PPCPfmCounters.td"
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index a00f5bda2aab9c7ebb3addfccc2b7aca04ce817b..04aa3c9b1e22cd2d32cc8d4ac7f4d331bb2d2e15 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -327,7 +327,7 @@ MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
 }
 
 void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) {
-  SM.serializeToStackMapSection();
+  emitStackMaps(SM);
 }
 
 void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) {
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 668169839e78cc539ece36e3c38bb84e311b4890..aa55ac1f7acc428f63827295a46fa67978f85f2e 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -2354,7 +2354,8 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
         PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
     return false;
 
-  MI->eraseFromParent();
+  MachineBasicBlock::iterator I(MI);
+  removeDeadCode(I, std::next(I));
   return true;
 }
 
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 84dacf39646248b41df64fc0a88ae2ae72ccd8c3..8263954994d278862855d54e620aeb36c4116e66 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -17,6 +17,7 @@
 #include "PPCMachineFunctionInfo.h"
 #include "PPCSubtarget.h"
 #include "PPCTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -28,6 +29,16 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "framelowering"
+STATISTIC(NumNoNeedForFrame, "Number of functions without frames");
+STATISTIC(NumPESpillVSR, "Number of spills to vector in prologue");
+STATISTIC(NumPEReloadVSR, "Number of reloads from vector in epilogue");
+
+static cl::opt<bool>
+EnablePEVectorSpills("ppc-enable-pe-vector-spills",
+                     cl::desc("Enable spills in prologue to vector registers."),
+                     cl::init(false), cl::Hidden);
+
 /// VRRegNo - Map from a numbered VR register to its enum value.
 ///
 static const MCPhysReg VRRegNo[] = {
@@ -466,6 +477,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
 
   // Check whether we can skip adjusting the stack pointer (by using red zone)
   if (!DisableRedZone && CanUseRedZone && FitsInRedZone) {
+    NumNoNeedForFrame++;
     // No need for frame
     if (UpdateMF)
       MFI.setStackSize(0);
@@ -1213,11 +1225,20 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
         continue;
       }
 
-      int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
-      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
-          nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
-      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+      if (CSI[I].isSpilledToReg()) {
+        unsigned SpilledReg = CSI[I].getDstReg();
+        unsigned CFIRegister = MF.addFrameInst(MCCFIInstruction::createRegister(
+            nullptr, MRI->getDwarfRegNum(Reg, true),
+            MRI->getDwarfRegNum(SpilledReg, true)));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIRegister);
+      } else {
+        int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
+        unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      }
     }
   }
 }
@@ -1822,17 +1843,19 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
     // Move general register save area spill slots down, taking into account
     // the size of the Floating-point register save area.
     for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) {
-      int FI = GPRegs[i].getFrameIdx();
-
-      MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+      if (!GPRegs[i].isSpilledToReg()) {
+        int FI = GPRegs[i].getFrameIdx();
+        MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+      }
     }
 
     // Move general register save area spill slots down, taking into account
     // the size of the Floating-point register save area.
     for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) {
-      int FI = G8Regs[i].getFrameIdx();
-
-      MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+      if (!G8Regs[i].isSpilledToReg()) {
+        int FI = G8Regs[i].getFrameIdx();
+        MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+      }
     }
 
     unsigned MinReg =
@@ -1947,6 +1970,64 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
   }
 }
 
+// This function checks if a callee saved gpr can be spilled to a volatile
+// vector register. This occurs for leaf functions when the option
+// ppc-enable-pe-vector-spills is enabled. If there are any remaining registers
+// which were not spilled to vectors, return false so the target independent
+// code can handle them by assigning a FrameIdx to a stack slot.
+bool PPCFrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+
+  if (CSI.empty())
+    return true; // Early exit if no callee saved registers are modified!
+
+  // Early exit if cannot spill gprs to volatile vector registers.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (!EnablePEVectorSpills || MFI.hasCalls() || !Subtarget.hasP9Vector())
+    return false;
+
+  // Build a BitVector of VSRs that can be used for spilling GPRs.
+  BitVector BVAllocatable = TRI->getAllocatableSet(MF);
+  BitVector BVCalleeSaved(TRI->getNumRegs());
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    BVCalleeSaved.set(CSRegs[i]);
+
+  for (unsigned Reg : BVAllocatable.set_bits()) {
+    // Set to 0 if the register is not a volatile VF/F8 register, or if it is
+    // used in the function.
+    if (BVCalleeSaved[Reg] ||
+        (!PPC::F8RCRegClass.contains(Reg) &&
+         !PPC::VFRCRegClass.contains(Reg)) ||
+        (MF.getRegInfo().isPhysRegUsed(Reg)))
+      BVAllocatable.reset(Reg);
+  }
+
+  bool AllSpilledToReg = true;
+  for (auto &CS : CSI) {
+    if (BVAllocatable.none())
+      return false;
+
+    unsigned Reg = CS.getReg();
+    if (!PPC::G8RCRegClass.contains(Reg) && !PPC::GPRCRegClass.contains(Reg)) {
+      AllSpilledToReg = false;
+      continue;
+    }
+
+    unsigned VolatileVFReg = BVAllocatable.find_first();
+    if (VolatileVFReg < BVAllocatable.size()) {
+      CS.setDstReg(VolatileVFReg);
+      BVAllocatable.reset(VolatileVFReg);
+    } else {
+      AllSpilledToReg = false;
+    }
+  }
+  return AllSpilledToReg;
+}
+
+
 bool
 PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
@@ -2012,12 +2093,18 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                          CSI[i].getFrameIdx()));
       }
     } else {
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      // Use !IsLiveIn for the kill flag.
-      // We do not want to kill registers that are live in this function
-      // before their use because they will become undefined registers.
-      TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
-                              CSI[i].getFrameIdx(), RC, TRI);
+      if (CSI[i].isSpilledToReg()) {
+        NumPESpillVSR++;
+        BuildMI(MBB, MI, DL, TII.get(PPC::MTVSRD), CSI[i].getDstReg())
+          .addReg(Reg, getKillRegState(true));
+      } else {
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+        // Use !IsLiveIn for the kill flag.
+        // We do not want to kill registers that are live in this function
+        // before their use because they will become undefined registers.
+        TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
+                                CSI[i].getFrameIdx(), RC, TRI);
+      }
     }
   }
   return true;
@@ -2157,13 +2244,19 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
         CR2Spilled = CR3Spilled = CR4Spilled = false;
       }
 
-      // Default behavior for non-CR saves.
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
-                               RC, TRI);
-      assert(I != MBB.begin() &&
-             "loadRegFromStackSlot didn't insert any code!");
+      if (CSI[i].isSpilledToReg()) {
+        DebugLoc DL;
+        NumPEReloadVSR++;
+        BuildMI(MBB, I, DL, TII.get(PPC::MFVSRD), Reg)
+            .addReg(CSI[i].getDstReg(), getKillRegState(true));
+      } else {
+       // Default behavior for non-CR saves.
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+        TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+        assert(I != MBB.begin() &&
+               "loadRegFromStackSlot didn't insert any code!");
       }
+    }
 
     // Insert in reverse order.
     if (AtStart)
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 01c155594c442f9a953915d9a91a0b9352fa9a2b..69bd1484d6e59f6d9609bf9a30495a77a5095d51 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -99,6 +99,13 @@ public:
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
                                  const TargetRegisterInfo *TRI) const override;
+  /// This function will assign callee saved gprs to volatile vector registers
+  /// for prologue spills when applicable. It returns false if there are any
+  /// registers which were not spilled to volatile vector registers.
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
 
   MachineBasicBlock::iterator
   eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index cf3547f3cfb04f538e86ea91a78450ed694caad3..5f6966cecd61a310dfb5cafc9c33c172467fb5bc 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -103,7 +103,7 @@ bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID,
   case PPC::Sched::IIC_LdStLHA:
   case PPC::Sched::IIC_LdStLHAU:
   case PPC::Sched::IIC_LdStLWA:
-  case PPC::Sched::IIC_LdStSTDU:
+  case PPC::Sched::IIC_LdStSTU:
   case PPC::Sched::IIC_LdStSTFDU:
     NSlots = 2;
     break;
@@ -112,7 +112,7 @@ bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID,
   case PPC::Sched::IIC_LdStLHAUX:
   case PPC::Sched::IIC_LdStLWARX:
   case PPC::Sched::IIC_LdStLDARX:
-  case PPC::Sched::IIC_LdStSTDUX:
+  case PPC::Sched::IIC_LdStSTUX:
   case PPC::Sched::IIC_LdStSTDCX:
   case PPC::Sched::IIC_LdStSTWCX:
   case PPC::Sched::IIC_BrMCRX: // mtcr
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 8861de6f0d8953dbd30ada9c80cdaa4c56b2195c..4f05f50f6994fa06bb7781498a88abb6d677d6d3 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -81,6 +81,8 @@ STATISTIC(NumLogicOpsOnComparison,
           "Number of logical ops on i1 values calculated in GPR.");
 STATISTIC(OmittedForNonExtendUses,
           "Number of compares not eliminated as they have non-extending uses.");
+STATISTIC(NumP9Setb,
+          "Number of compares lowered to setb.");
 
 // FIXME: Remove this once the bug has been fixed!
 cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
@@ -327,7 +329,6 @@ private:
 
     bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
     void transferMemOperands(SDNode *N, SDNode *Result);
-    MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr);
   };
 
 } // end anonymous namespace
@@ -4138,49 +4139,144 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp});
 }
 
-/// This method returns a node after flipping the MSB of each element
-/// of vector integer type. Additionally, if SignBitVec is non-null,
-/// this method sets a node with one at MSB of all elements
-/// and zero at other bits in SignBitVec.
-MachineSDNode *
-PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) {
-  SDLoc dl(N);
-  EVT VecVT = N.getValueType();
-  if (VecVT == MVT::v4i32) {
-    if (SignBitVec) {
-      SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32);
-      *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT,
-                                        SDValue(ZV, 0));
-    }
-    return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N);
-  }
-  else if (VecVT == MVT::v8i16) {
-    SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32,
-                                     getI32Imm(0x8000, dl));
-    SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
-                                         SDValue(Hi, 0),
-                                         getI32Imm(0x8000, dl));
-    SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT,
-                                         SDValue(ScaImm, 0));
-    /*
-    Alternatively, we can do this as follow to use VRF instead of GPR.
-      vspltish 5, 1
-      vspltish 6, 15
-      vslh 5, 6, 5
-    */
-    if (SignBitVec) *SignBitVec = VecImm;
-    return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N,
-                                  SDValue(VecImm, 0));
-  }
-  else if (VecVT == MVT::v16i8) {
-    SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32,
-                                         getI32Imm(0x80, dl));
-    if (SignBitVec) *SignBitVec = VecImm;
-    return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N,
-                                  SDValue(VecImm, 0));
+static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG,
+                         bool &NeedSwapOps, bool &IsUnCmp) {
+
+  assert(N->getOpcode() == ISD::SELECT_CC && "Expecting a SELECT_CC here.");
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue TrueRes = N->getOperand(2);
+  SDValue FalseRes = N->getOperand(3);
+  ConstantSDNode *TrueConst = dyn_cast<ConstantSDNode>(TrueRes);
+  if (!TrueConst)
+    return false;
+
+  assert((N->getSimpleValueType(0) == MVT::i64 ||
+          N->getSimpleValueType(0) == MVT::i32) &&
+         "Expecting either i64 or i32 here.");
+
+  // We are looking for any of:
+  // (select_cc lhs, rhs,  1, (sext (setcc [lr]hs, [lr]hs, cc2)), cc1)
+  // (select_cc lhs, rhs, -1, (zext (setcc [lr]hs, [lr]hs, cc2)), cc1)
+  // (select_cc lhs, rhs,  0, (select_cc [lr]hs, [lr]hs,  1, -1, cc2), seteq)
+  // (select_cc lhs, rhs,  0, (select_cc [lr]hs, [lr]hs, -1,  1, cc2), seteq)
+  int64_t TrueResVal = TrueConst->getSExtValue();
+  if ((TrueResVal < -1 || TrueResVal > 1) ||
+      (TrueResVal == -1 && FalseRes.getOpcode() != ISD::ZERO_EXTEND) ||
+      (TrueResVal == 1 && FalseRes.getOpcode() != ISD::SIGN_EXTEND) ||
+      (TrueResVal == 0 &&
+       (FalseRes.getOpcode() != ISD::SELECT_CC || CC != ISD::SETEQ)))
+    return false;
+
+  bool InnerIsSel = FalseRes.getOpcode() == ISD::SELECT_CC;
+  SDValue SetOrSelCC = InnerIsSel ? FalseRes : FalseRes.getOperand(0);
+  if (SetOrSelCC.getOpcode() != ISD::SETCC &&
+      SetOrSelCC.getOpcode() != ISD::SELECT_CC)
+    return false;
+
+  // Without this setb optimization, the outer SELECT_CC will be manually
+  // selected to SELECT_CC_I4/SELECT_CC_I8 Pseudo, then expand-isel-pseudos pass
+  // transforms pseduo instruction to isel instruction. When there are more than
+  // one use for result like zext/sext, with current optimization we only see
+  // isel is replaced by setb but can't see any significant gain. Since
+  // setb has longer latency than original isel, we should avoid this. Another
+  // point is that setb requires comparison always kept, it can break the
+  // oppotunity to get the comparison away if we have in future.
+  if (!SetOrSelCC.hasOneUse() || (!InnerIsSel && !FalseRes.hasOneUse()))
+    return false;
+
+  SDValue InnerLHS = SetOrSelCC.getOperand(0);
+  SDValue InnerRHS = SetOrSelCC.getOperand(1);
+  ISD::CondCode InnerCC =
+      cast<CondCodeSDNode>(SetOrSelCC.getOperand(InnerIsSel ? 4 : 2))->get();
+  // If the inner comparison is a select_cc, make sure the true/false values are
+  // 1/-1 and canonicalize it if needed.
+  if (InnerIsSel) {
+    ConstantSDNode *SelCCTrueConst =
+        dyn_cast<ConstantSDNode>(SetOrSelCC.getOperand(2));
+    ConstantSDNode *SelCCFalseConst =
+        dyn_cast<ConstantSDNode>(SetOrSelCC.getOperand(3));
+    if (!SelCCTrueConst || !SelCCFalseConst)
+      return false;
+    int64_t SelCCTVal = SelCCTrueConst->getSExtValue();
+    int64_t SelCCFVal = SelCCFalseConst->getSExtValue();
+    // The values must be -1/1 (requiring a swap) or 1/-1.
+    if (SelCCTVal == -1 && SelCCFVal == 1) {
+      std::swap(InnerLHS, InnerRHS);
+    } else if (SelCCTVal != 1 || SelCCFVal != -1)
+      return false;
   }
-  else
-    llvm_unreachable("Unsupported vector data type for flipSignBit");
+
+  // Canonicalize unsigned case
+  if (InnerCC == ISD::SETULT || InnerCC == ISD::SETUGT) {
+    IsUnCmp = true;
+    InnerCC = (InnerCC == ISD::SETULT) ? ISD::SETLT : ISD::SETGT;
+  }
+
+  bool InnerSwapped = false;
+  if (LHS == InnerRHS && RHS == InnerLHS)
+    InnerSwapped = true;
+  else if (LHS != InnerLHS || RHS != InnerRHS)
+    return false;
+
+  switch (CC) {
+  // (select_cc lhs, rhs,  0, \
+  //     (select_cc [lr]hs, [lr]hs, 1, -1, setlt/setgt), seteq)
+  case ISD::SETEQ:
+    if (!InnerIsSel)
+      return false;
+    if (InnerCC != ISD::SETLT && InnerCC != ISD::SETGT)
+      return false;
+    NeedSwapOps = (InnerCC == ISD::SETGT) ? InnerSwapped : !InnerSwapped;
+    break;
+
+  // (select_cc lhs, rhs, -1, (zext (setcc [lr]hs, [lr]hs, setne)), setu?lt)
+  // (select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setgt)), setu?lt)
+  // (select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setlt)), setu?lt)
+  // (select_cc lhs, rhs, 1, (sext (setcc [lr]hs, [lr]hs, setne)), setu?lt)
+  // (select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setgt)), setu?lt)
+  // (select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setlt)), setu?lt)
+  case ISD::SETULT:
+    if (!IsUnCmp && InnerCC != ISD::SETNE)
+      return false;
+    IsUnCmp = true;
+    LLVM_FALLTHROUGH;
+  case ISD::SETLT:
+    if (InnerCC == ISD::SETNE || (InnerCC == ISD::SETGT && !InnerSwapped) ||
+        (InnerCC == ISD::SETLT && InnerSwapped))
+      NeedSwapOps = (TrueResVal == 1);
+    else
+      return false;
+    break;
+
+  // (select_cc lhs, rhs, 1, (sext (setcc [lr]hs, [lr]hs, setne)), setu?gt)
+  // (select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setlt)), setu?gt)
+  // (select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setgt)), setu?gt)
+  // (select_cc lhs, rhs, -1, (zext (setcc [lr]hs, [lr]hs, setne)), setu?gt)
+  // (select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setlt)), setu?gt)
+  // (select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setgt)), setu?gt)
+  case ISD::SETUGT:
+    if (!IsUnCmp && InnerCC != ISD::SETNE)
+      return false;
+    IsUnCmp = true;
+    LLVM_FALLTHROUGH;
+  case ISD::SETGT:
+    if (InnerCC == ISD::SETNE || (InnerCC == ISD::SETLT && !InnerSwapped) ||
+        (InnerCC == ISD::SETGT && InnerSwapped))
+      NeedSwapOps = (TrueResVal == -1);
+    else
+      return false;
+    break;
+
+  default:
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Found a node that can be lowered to a SETB: ");
+  LLVM_DEBUG(N->dump());
+
+  return true;
 }
 
 // Select - Convert the specified operand from a target-independent to a
@@ -4645,6 +4741,31 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
         N->getOperand(0).getValueType() == MVT::i1)
       break;
 
+    if (PPCSubTarget->isISA3_0() && PPCSubTarget->isPPC64()) {
+      bool NeedSwapOps = false;
+      bool IsUnCmp = false;
+      if (mayUseP9Setb(N, CC, CurDAG, NeedSwapOps, IsUnCmp)) {
+        SDValue LHS = N->getOperand(0);
+        SDValue RHS = N->getOperand(1);
+        if (NeedSwapOps)
+          std::swap(LHS, RHS);
+
+        // Make use of SelectCC to generate the comparison to set CR bits, for
+        // equality comparisons having one literal operand, SelectCC probably
+        // doesn't need to materialize the whole literal and just use xoris to
+        // check it first, it leads the following comparison result can't
+        // exactly represent GT/LT relationship. So to avoid this we specify
+        // SETGT/SETUGT here instead of SETEQ.
+        SDValue GenCC =
+            SelectCC(LHS, RHS, IsUnCmp ? ISD::SETUGT : ISD::SETGT, dl);
+        CurDAG->SelectNodeTo(
+            N, N->getSimpleValueType(0) == MVT::i64 ? PPC::SETB8 : PPC::SETB,
+            N->getValueType(0), GenCC);
+        NumP9Setb++;
+        return;
+      }
+    }
+
     // Handle the setcc cases here.  select_cc lhs, 0, 1, 0, cc
     if (!isPPC64)
       if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
@@ -4736,14 +4857,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops);
     return;
   }
-  case ISD::VSELECT:
-    if (PPCSubTarget->hasVSX()) {
-      SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) };
-      CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
-      return;
-    }
-    break;
-
   case ISD::VECTOR_SHUFFLE:
     if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
                                   N->getValueType(0) == MVT::v2i64)) {
@@ -4840,6 +4953,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
       case PPC::PRED_NE: Opc = PPC::CRXOR;  Swap = false; break;
       }
 
+      // A signed comparison of i1 values produces the opposite result to an
+      // unsigned one if the condition code includes less-than or greater-than.
+      // This is because 1 is the most negative signed i1 number and the most
+      // positive unsigned i1 number. The CR-logical operations used for such
+      // comparisons are non-commutative so for signed comparisons vs. unsigned
+      // ones, the input operands just need to be swapped.
+      if (ISD::isSignedIntSetCC(CC))
+        Swap = !Swap;
+
       SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1,
                                              N->getOperand(Swap ? 3 : 2),
                                              N->getOperand(Swap ? 2 : 3)), 0);
@@ -4896,9 +5018,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     SDValue TOCbase = N->getOperand(1);
     SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
                                          TOCbase, GA);
-
-    if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) ||
-        CModel == CodeModel::Large) {
+    if (PPCLowering->isAccessedAsGotIndirect(GA)) {
+      // If it is access as got-indirect, we need an extra LD to load
+      // the address.
       SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
                                           SDValue(Tmp, 0));
       transferMemOperands(N, MN);
@@ -4906,18 +5028,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
       return;
     }
 
-    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
-      const GlobalValue *GV = G->getGlobal();
-      unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
-      if (GVFlags & PPCII::MO_NLP_FLAG) {
-        SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
-                                            SDValue(Tmp, 0));
-        transferMemOperands(N, MN);
-        ReplaceNode(N, MN);
-        return;
-      }
-    }
-
+    // Build the address relative to the TOC-pointer..
     ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
                                           SDValue(Tmp, 0), GA));
     return;
@@ -5003,55 +5114,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
       return;
     }
   }
-  case ISD::ABS: {
-    assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector");
-
-    // For vector absolute difference, we use VABSDUW instruction of POWER9.
-    // Since VABSDU instructions are for unsigned integers, we need adjustment
-    // for signed integers.
-    // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000).
-    // Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1.
-    // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
-    EVT VecVT = N->getOperand(0).getValueType();
-    SDNode *AbsOp = nullptr;
-    unsigned AbsOpcode;
-
-    if (VecVT == MVT::v4i32)
-      AbsOpcode = PPC::VABSDUW;
-    else if (VecVT == MVT::v8i16)
-      AbsOpcode = PPC::VABSDUH;
-    else if (VecVT == MVT::v16i8)
-      AbsOpcode = PPC::VABSDUB;
-    else
-      llvm_unreachable("Unsupported vector data type for ISD::ABS");
-
-    // Even for signed integers, we can skip adjustment if all values are
-    // known to be positive (as signed integer) due to zero-extended inputs.
-    if (N->getOperand(0).getOpcode() == ISD::SUB &&
-        N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
-        N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) {
-      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
-                                     SDValue(N->getOperand(0)->getOperand(0)),
-                                     SDValue(N->getOperand(0)->getOperand(1)));
-      ReplaceNode(N, AbsOp);
-      return;
-    }
-    if (N->getOperand(0).getOpcode() == ISD::SUB) {
-      SDValue SubVal = N->getOperand(0);
-      SDNode *Op0 = flipSignBit(SubVal->getOperand(0));
-      SDNode *Op1 = flipSignBit(SubVal->getOperand(1));
-      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
-                                     SDValue(Op0, 0), SDValue(Op1, 0));
-    }
-    else {
-      SDNode *Op1 = nullptr;
-      SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1);
-      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0),
-                                     SDValue(Op1, 0));
-    }
-    ReplaceNode(N, AbsOp);
-    return;
-  }
   }
 
   SelectCode(N);
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index c6f0212ab404ded54197438490319b8cc6e3774c..92af82dc4b9f52cb1af665dfdbd7b532ab07a939 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -251,12 +251,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::UREM, MVT::i64, Expand);
   }
 
-  if (Subtarget.hasP9Vector()) {
-    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
-    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
-    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
-  }
-
   // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
@@ -323,12 +317,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   // to speed up scalar BSWAP64.
   // CTPOP or CTTZ were introduced in P8/P9 respectively
   setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
-  if (Subtarget.isISA3_0()) {
+  if (Subtarget.hasP9Vector())
     setOperationAction(ISD::BSWAP, MVT::i64  , Custom);
+  else
+    setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
+  if (Subtarget.isISA3_0()) {
     setOperationAction(ISD::CTTZ , MVT::i32  , Legal);
     setOperationAction(ISD::CTTZ , MVT::i64  , Legal);
   } else {
-    setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
     setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
     setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
   }
@@ -554,6 +550,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       // add/sub are legal for all supported vector VT's.
       setOperationAction(ISD::ADD, VT, Legal);
       setOperationAction(ISD::SUB, VT, Legal);
+      setOperationAction(ISD::ABS, VT, Custom);
 
       // Vector instructions introduced in P8
       if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
@@ -586,6 +583,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       AddPromotedToType (ISD::LOAD  , VT, MVT::v4i32);
       setOperationAction(ISD::SELECT, VT, Promote);
       AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
+      setOperationAction(ISD::VSELECT, VT, Legal);
       setOperationAction(ISD::SELECT_CC, VT, Promote);
       AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
       setOperationAction(ISD::STORE, VT, Promote);
@@ -626,7 +624,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
       setOperationAction(ISD::FPOW, VT, Expand);
       setOperationAction(ISD::BSWAP, VT, Expand);
-      setOperationAction(ISD::VSELECT, VT, Expand);
       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
       setOperationAction(ISD::ROTL, VT, Expand);
       setOperationAction(ISD::ROTR, VT, Expand);
@@ -659,6 +656,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
 
+    // Without hasP8Altivec set, v2i64 SMAX isn't available.
+    // But ABS custom lowering requires SMAX support.
+    if (!Subtarget.hasP8Altivec())
+      setOperationAction(ISD::ABS, MVT::v2i64, Expand);
+
     addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
     addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
     addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
@@ -727,12 +729,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
       setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
 
-      setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
-      setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
-      setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
-      setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
-      setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
-
       // Share the Altivec comparison restrictions.
       setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
       setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
@@ -1087,6 +1083,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setTargetDAGCombine(ISD::FSQRT);
   }
 
+  if (Subtarget.hasP9Altivec()) {
+    setTargetDAGCombine(ISD::ABS);
+    setTargetDAGCombine(ISD::VSELECT);
+  }
+
   // Darwin long double math library functions have $LDBL128 appended.
   if (Subtarget.isDarwin()) {
     setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
@@ -1347,6 +1348,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::RFEBB:           return "PPCISD::RFEBB";
   case PPCISD::XXSWAPD:         return "PPCISD::XXSWAPD";
   case PPCISD::SWAP_NO_CHAIN:   return "PPCISD::SWAP_NO_CHAIN";
+  case PPCISD::VABSD:           return "PPCISD::VABSD";
   case PPCISD::QVFPERM:         return "PPCISD::QVFPERM";
   case PPCISD::QVGPCI:          return "PPCISD::QVGPCI";
   case PPCISD::QVALIGNI:        return "PPCISD::QVALIGNI";
@@ -9007,35 +9009,6 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getRegister(PPC::R2, MVT::i32);
   }
 
-  // We are looking for absolute values here.
-  // The idea is to try to fit one of two patterns:
-  //  max (a, (0-a))  OR  max ((0-a), a)
-  if (Subtarget.hasP9Vector() &&
-      (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw ||
-       IntrinsicID == Intrinsic::ppc_altivec_vmaxsh ||
-       IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) {
-    SDValue V1 = Op.getOperand(1);
-    SDValue V2 = Op.getOperand(2);
-    if (V1.getSimpleValueType() == V2.getSimpleValueType() &&
-        (V1.getSimpleValueType() == MVT::v4i32 ||
-         V1.getSimpleValueType() == MVT::v8i16 ||
-         V1.getSimpleValueType() == MVT::v16i8)) {
-      if ( V1.getOpcode() == ISD::SUB &&
-           ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
-           V1.getOperand(1) == V2 ) {
-        // Generate the abs instruction with the operands
-        return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2);
-      }
-
-      if ( V2.getOpcode() == ISD::SUB &&
-           ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
-           V2.getOperand(1) == V1 ) {
-        // Generate the abs instruction with the operands
-        return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1);
-      }
-    }
-  }
-
   // If this is a lowered altivec predicate compare, CompareOpc is set to the
   // opcode number of the comparison.
   int CompareOpc;
@@ -9576,6 +9549,44 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   }
 }
 
+SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
+
+  assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");
+
+  EVT VT = Op.getValueType();
+  assert(VT.isVector() &&
+         "Only set vector abs as custom, scalar abs shouldn't reach here!");
+  assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+          VT == MVT::v16i8) &&
+         "Unexpected vector element type!");
+  assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
+         "Current subtarget doesn't support smax v2i64!");
+
+  // For vector abs, it can be lowered to:
+  // abs x
+  // ==>
+  // y = -x
+  // smax(x, y)
+
+  SDLoc dl(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
+  SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X);
+
+  // SMAX patch https://reviews.llvm.org/D47332
+  // hasn't landed yet, so use intrinsic first here.
+  // TODO: Should use SMAX directly once SMAX patch landed
+  Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw;
+  if (VT == MVT::v2i64)
+    BifID = Intrinsic::ppc_altivec_vmaxsd;
+  else if (VT == MVT::v8i16)
+    BifID = Intrinsic::ppc_altivec_vmaxsh;
+  else if (VT == MVT::v16i8)
+    BifID = Intrinsic::ppc_altivec_vmaxsb;
+  
+  return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
 ///
 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -9628,6 +9639,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::MUL:                return LowerMUL(Op, DAG);
+  case ISD::ABS:                return LowerABS(Op, DAG);
 
   // For counter-based loop handling.
   case ISD::INTRINSIC_W_CHAIN:  return SDValue();
@@ -12610,9 +12622,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
          (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
 
-      // STBRX can only handle simple types.
+      // STBRX can only handle simple types and it makes no sense to store less
+      // two bytes in byte-reversed order.
       EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
-      if (mVT.isExtended())
+      if (mVT.isExtended() || mVT.getSizeInBits() < 16)
         break;
 
       SDValue BSwapOp = N->getOperand(1).getOperand(0);
@@ -12988,6 +13001,39 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
           }
         }
       }
+
+      // Combine vmaxsw/h/b(a, a's negation) to abs(a)
+      // Expose the vabsduw/h/b opportunity for down stream
+      if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
+          (IID == Intrinsic::ppc_altivec_vmaxsw ||
+           IID == Intrinsic::ppc_altivec_vmaxsh ||
+           IID == Intrinsic::ppc_altivec_vmaxsb)) {
+        SDValue V1 = N->getOperand(1);
+        SDValue V2 = N->getOperand(2);
+        if ((V1.getSimpleValueType() == MVT::v4i32 ||
+             V1.getSimpleValueType() == MVT::v8i16 ||
+             V1.getSimpleValueType() == MVT::v16i8) &&
+            V1.getSimpleValueType() == V2.getSimpleValueType()) {
+          // (0-a, a)
+          if (V1.getOpcode() == ISD::SUB &&
+              ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
+              V1.getOperand(1) == V2) {
+            return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
+          }
+          // (a, 0-a)
+          if (V2.getOpcode() == ISD::SUB &&
+              ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
+              V2.getOperand(1) == V1) {
+            return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
+          }
+          // (x-y, y-x)
+          if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
+              V1.getOperand(0) == V2.getOperand(1) &&
+              V1.getOperand(1) == V2.getOperand(0)) {
+            return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
+          }
+        }
+      }
     }
 
     break;
@@ -13220,6 +13266,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   }
   case ISD::BUILD_VECTOR:
     return DAGCombineBuildVector(N, DCI);
+  case ISD::ABS: 
+    return combineABS(N, DCI);
+  case ISD::VSELECT: 
+    return combineVSelect(N, DCI);
   }
 
   return SDValue();
@@ -13713,6 +13763,35 @@ unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
   report_fatal_error("Invalid register name global variable");
 }
 
+bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
+  // 32-bit SVR4 ABI access everything as got-indirect.
+  if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
+    return true;
+
+  CodeModel::Model CModel = getTargetMachine().getCodeModel();
+  // If it is small or large code model, module locals are accessed
+  // indirectly by loading their address from .toc/.got. The difference
+  // is that for large code model we have ADDISTocHa + LDtocL and for
+  // small code model we simply have LDtoc.
+  if (CModel == CodeModel::Small || CModel == CodeModel::Large)
+    return true;
+
+  // JumpTable and BlockAddress are accessed as got-indirect. 
+  if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
+    return true;
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
+    const GlobalValue *GV = G->getGlobal();
+    unsigned char GVFlags = Subtarget.classifyGlobalReference(GV);
+    // The NLP flag indicates that a global access has to use an
+    // extra indirection.
+    if (GVFlags & PPCII::MO_NLP_FLAG)
+      return true;
+  }
+
+  return false;
+}
+
 bool
 PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The PowerPC target isn't yet aware of offsets.
@@ -14477,3 +14556,109 @@ isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
   // For non-constant masks, we can always use the record-form and.
   return true;
 }
+
+// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0)
+// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0)
+// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0)
+// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0)
+// Transform (abs (sub a, b) to (vabsd a b 1)) if a & b of type v4i32
+SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
+  assert(Subtarget.hasP9Altivec() &&
+         "Only combine this when P9 altivec supported!");
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  if (N->getOperand(0).getOpcode() == ISD::SUB) {
+    // Even for signed integers, if it's known to be positive (as signed
+    // integer) due to zero-extended inputs.
+    unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode();
+    unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode();
+    if ((SubOpcd0 == ISD::ZERO_EXTEND ||
+         SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) &&
+        (SubOpcd1 == ISD::ZERO_EXTEND ||
+         SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) {
+      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
+                         N->getOperand(0)->getOperand(0),
+                         N->getOperand(0)->getOperand(1),
+                         DAG.getTargetConstant(0, dl, MVT::i32));
+    }
+
+    // For type v4i32, it can be optimized with xvnegsp + vabsduw
+    if (N->getOperand(0).getValueType() == MVT::v4i32 &&
+        N->getOperand(0).hasOneUse()) {
+      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
+                         N->getOperand(0)->getOperand(0),
+                         N->getOperand(0)->getOperand(1),
+                         DAG.getTargetConstant(1, dl, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
+// For type v4i32/v8ii16/v16i8, transform
+// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b)
+// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b)
+// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b)
+// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b)
+SDValue PPCTargetLowering::combineVSelect(SDNode *N,
+                                          DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here");
+  assert(Subtarget.hasP9Altivec() &&
+         "Only combine this when P9 altivec supported!");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue TrueOpnd = N->getOperand(1);
+  SDValue FalseOpnd = N->getOperand(2);
+  EVT VT = N->getOperand(1).getValueType();
+
+  if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB ||
+      FalseOpnd.getOpcode() != ISD::SUB)
+    return SDValue();
+
+  // ABSD only available for type v4i32/v8i16/v16i8
+  if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
+    return SDValue();
+
+  // At least to save one more dependent computation
+  if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse()))
+    return SDValue();
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+  // Can only handle unsigned comparison here
+  switch (CC) {
+  default:
+    return SDValue();
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    break;
+  case ISD::SETULT:
+  case ISD::SETULE:
+    std::swap(TrueOpnd, FalseOpnd);
+    break;
+  }
+
+  SDValue CmpOpnd1 = Cond.getOperand(0);
+  SDValue CmpOpnd2 = Cond.getOperand(1);
+
+  // SETCC CmpOpnd1 CmpOpnd2 cond
+  // TrueOpnd = CmpOpnd1 - CmpOpnd2
+  // FalseOpnd = CmpOpnd2 - CmpOpnd1
+  if (TrueOpnd.getOperand(0) == CmpOpnd1 &&
+      TrueOpnd.getOperand(1) == CmpOpnd2 &&
+      FalseOpnd.getOperand(0) == CmpOpnd2 &&
+      FalseOpnd.getOperand(1) == CmpOpnd1) {
+    return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(),
+                       CmpOpnd1, CmpOpnd2,
+                       DAG.getTargetConstant(0, dl, MVT::i32));
+  }
+
+  return SDValue();
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 1020cab48c8c7bdbf2c29472a04ce81c3803acd1..09039cb77363d909561ccea7d3c6098d5c5e7d94 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -373,6 +373,21 @@ namespace llvm {
       /// An SDNode for swaps that are not associated with any loads/stores
       /// and thereby have no chain.
       SWAP_NO_CHAIN,
+      
+      /// An SDNode for Power9 vector absolute value difference.
+      /// operand #0 vector
+      /// operand #1 vector
+      /// operand #2 constant i32 0 or 1, to indicate whether needs to patch
+      /// the most significant bit for signed i32
+      ///
+      /// Power9 VABSD* instructions are designed to support unsigned integer
+      /// vectors (byte/halfword/word), if we want to make use of them for signed
+      /// integer vectors, we have to flip their sign bits first. To flip sign bit
+      /// for byte/halfword integer vector would become inefficient, but for word
+      /// integer vector, we can leverage XVNEGSP to make it efficiently. eg:
+      /// abs(sub(a,b)) => VABSDUW(a+0x80000000, b+0x80000000) 
+      ///               => VABSDUW((XVNEGSP a), (XVNEGSP b))
+      VABSD,
 
       /// QVFPERM = This corresponds to the QPX qvfperm instruction.
       QVFPERM,
@@ -789,6 +804,9 @@ namespace llvm {
       return true;
     }
 
+    // Returns true if the address of the global is stored in TOC entry.
+    bool isAccessedAsGotIndirect(SDValue N) const;
+
     bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
     bool getTgtMemIntrinsic(IntrinsicInfo &Info,
@@ -995,6 +1013,7 @@ namespace llvm {
     SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
@@ -1098,6 +1117,8 @@ namespace llvm {
     SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const;
 
     /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index b533efd0ffa4433f253575eaf48cdc3bea91e5f0..2ce6ad3293eb1bc0644ecbef8b9176d7f01c5655 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -94,7 +94,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
 }
 
 let Defs = [LR8] in
-  def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>,
+  def MovePCtoLR8 : PPCEmitTimePseudo<(outs), (ins), "#MovePCtoLR8", []>,
                     PPC970_Unit_BRU;
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
@@ -199,47 +199,45 @@ def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
 // clean this up in PPCMIPeephole with calls to
 // PPCInstrInfo::convertToImmediateForm() but we should probably not emit them
 // in the first place.
-let usesCustomInserter = 1 in {
-  let Defs = [CR0] in {
-    def ATOMIC_LOAD_ADD_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
-      [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_SUB_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
-      [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_OR_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
-      [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_XOR_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
-      [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_AND_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64",
-      [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_NAND_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
-      [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_MIN_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64",
-      [(set i64:$dst, (atomic_load_min_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_MAX_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64",
-      [(set i64:$dst, (atomic_load_max_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_UMIN_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64",
-      [(set i64:$dst, (atomic_load_umin_64 xoaddr:$ptr, i64:$incr))]>;
-    def ATOMIC_LOAD_UMAX_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64",
-      [(set i64:$dst, (atomic_load_umax_64 xoaddr:$ptr, i64:$incr))]>;
-
-    def ATOMIC_CMP_SWAP_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
-      [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
-
-    def ATOMIC_SWAP_I64 : Pseudo<
-      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
-      [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
-  }
+let Defs = [CR0] in {
+  def ATOMIC_LOAD_ADD_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
+    [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_SUB_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
+    [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_OR_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
+    [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_XOR_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
+    [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_AND_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64",
+    [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_NAND_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
+    [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_MIN_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64",
+    [(set i64:$dst, (atomic_load_min_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_MAX_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64",
+    [(set i64:$dst, (atomic_load_max_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_UMIN_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64",
+    [(set i64:$dst, (atomic_load_umin_64 xoaddr:$ptr, i64:$incr))]>;
+  def ATOMIC_LOAD_UMAX_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64",
+    [(set i64:$dst, (atomic_load_umax_64 xoaddr:$ptr, i64:$incr))]>;
+
+  def ATOMIC_CMP_SWAP_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
+    [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
+
+  def ATOMIC_SWAP_I64 : PPCCustomInserterPseudo<
+    (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
+    [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
 }
 
 // Instructions to support atomic operations
@@ -269,18 +267,18 @@ def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
 
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNdi8 :Pseudo< (outs),
+def TCRETURNdi8 :PPCEmitTimePseudo< (outs),
                         (ins calltarget:$dst, i32imm:$offset),
                  "#TC_RETURNd8 $dst $offset",
                  []>;
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNai8 :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
+def TCRETURNai8 :PPCEmitTimePseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
                  "#TC_RETURNa8 $func $offset",
                  [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
+def TCRETURNri8 : PPCEmitTimePseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
                  "#TC_RETURNr8 $dst $offset",
                  []>;
 
@@ -347,14 +345,19 @@ def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins),
 } // hasExtraSrcRegAllocReq = 1
 } // hasSideEffects = 0
 
-let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+// While longjmp is a control-flow barrier (fallthrough isn't allowed), setjmp
+// is not.
+let hasSideEffects = 1 in {
   let Defs = [CTR8] in
-  def EH_SjLj_SetJmp64  : Pseudo<(outs gprc:$dst), (ins memr:$buf),
+  def EH_SjLj_SetJmp64  : PPCCustomInserterPseudo<(outs gprc:$dst), (ins memr:$buf),
                             "#EH_SJLJ_SETJMP64",
                             [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
                           Requires<[In64BitMode]>;
+}
+
+let hasSideEffects = 1, isBarrier = 1 in {
   let isTerminator = 1 in
-  def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf),
+  def EH_SjLj_LongJmp64 : PPCCustomInserterPseudo<(outs), (ins memr:$buf),
                             "#EH_SJLJ_LONGJMP64",
                             [(PPCeh_sjlj_longjmp addr:$buf)]>,
                           Requires<[In64BitMode]>;
@@ -396,10 +399,10 @@ def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins),
 // the POWER3.
 
 let Defs = [X1], Uses = [X1] in
-def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8",
+def DYNALLOC8 : PPCEmitTimePseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8",
                        [(set i64:$result,
                              (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
-def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8",
+def DYNAREAOFFSET8 : PPCEmitTimePseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8",
                        [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
 
 let Defs = [LR8] in {
@@ -774,8 +777,12 @@ def MADDHDU : VAForm_1a<49, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC)
                        "maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
 def MADDLD : VAForm_1a<51, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
                        "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
-def SETB : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA),
-                     "setb $RT, $BFA", IIC_IntGeneral>, isPPC64;
+def SETB : XForm_44<31, 128, (outs gprc:$RT), (ins crrc:$BFA),
+                       "setb $RT, $BFA", IIC_IntGeneral>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+  def SETB8 : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA),
+                       "setb $RT, $BFA", IIC_IntGeneral>, isPPC64;
+}
 def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins i32imm:$L),
                      "darn $RT, $L", IIC_LdStLD>, isPPC64;
 def ADDPCIS : DXForm<19, 2, (outs g8rc:$RT), (ins i32imm:$D),
@@ -1019,19 +1026,19 @@ def LD   : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
 // The following four definitions are selected for small code model only.
 // Otherwise, we need to create two instructions to form a 32-bit offset,
 // so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
-def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+def LDtoc: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
                   "#LDtoc",
                   [(set i64:$rD,
                      (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
-def LDtocJTI: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+def LDtocJTI: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
                   "#LDtocJTI",
                   [(set i64:$rD,
                      (PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64;
-def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+def LDtocCPT: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
                   "#LDtocCPT",
                   [(set i64:$rD,
                      (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
-def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+def LDtocBA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
                   "#LDtocCPT",
                   [(set i64:$rD,
                      (PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
@@ -1072,40 +1079,40 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
 // Support for medium and large code model.
 let hasSideEffects = 0 in {
 let isReMaterializable = 1 in {
-def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+def ADDIStocHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
                        "#ADDIStocHA", []>, isPPC64;
-def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
                      "#ADDItocL", []>, isPPC64;
 }
 let mayLoad = 1 in
-def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
+def LDtocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
                    "#LDtocL", []>, isPPC64;
 }
 
 // Support for thread-local storage.
-def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+def ADDISgotTprelHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDISgotTprelHA",
                          [(set i64:$rD,
                            (PPCaddisGotTprelHA i64:$reg,
                                                tglobaltlsaddr:$disp))]>,
                   isPPC64;
-def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
+def LDgotTprelL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
                         "#LDgotTprelL",
                         [(set i64:$rD,
                           (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
                  isPPC64;
 
-let isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in
-def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>;
+let Defs = [CR7], Itinerary = IIC_LdStSync in
+def CFENCE8 : PPCPostRAExpPseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>;
 
 def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
           (ADD8TLS $in, tglobaltlsaddr:$g)>;
-def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+def ADDIStlsgdHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDIStlsgdHA",
                          [(set i64:$rD,
                            (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
                   isPPC64;
-def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+def ADDItlsgdL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                        "#ADDItlsgdL",
                        [(set i64:$rD,
                          (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
@@ -1116,7 +1123,7 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
 // correct because the branch select pass is relying on it.
 let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8,
     Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
-def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+def GETtlsADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
                         "#GETtlsADDR",
                         [(set i64:$rD,
                           (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
@@ -1126,7 +1133,7 @@ def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
 let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
     Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
     in
-def ADDItlsgdLADDR : Pseudo<(outs g8rc:$rD),
+def ADDItlsgdLADDR : PPCEmitTimePseudo<(outs g8rc:$rD),
                             (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
                             "#ADDItlsgdLADDR",
                             [(set i64:$rD,
@@ -1134,12 +1141,12 @@ def ADDItlsgdLADDR : Pseudo<(outs g8rc:$rD),
                                                  tglobaltlsaddr:$disp,
                                                  tglobaltlsaddr:$sym))]>,
                      isPPC64;
-def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+def ADDIStlsldHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDIStlsldHA",
                          [(set i64:$rD,
                            (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
                   isPPC64;
-def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+def ADDItlsldL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                        "#ADDItlsldL",
                        [(set i64:$rD,
                          (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
@@ -1148,7 +1155,7 @@ def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
 // explicitly defined when this op is created, so not mentioned here.
 let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
     Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
-def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+def GETtlsldADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
                           "#GETtlsldADDR",
                           [(set i64:$rD,
                             (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
@@ -1158,7 +1165,7 @@ def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
 let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
     Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
     in
-def ADDItlsldLADDR : Pseudo<(outs g8rc:$rD),
+def ADDItlsldLADDR : PPCEmitTimePseudo<(outs g8rc:$rD),
                             (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
                             "#ADDItlsldLADDR",
                             [(set i64:$rD,
@@ -1166,13 +1173,13 @@ def ADDItlsldLADDR : Pseudo<(outs g8rc:$rD),
                                                  tglobaltlsaddr:$disp,
                                                  tglobaltlsaddr:$sym))]>,
                      isPPC64;
-def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+def ADDISdtprelHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                           "#ADDISdtprelHA",
                           [(set i64:$rD,
                             (PPCaddisDtprelHA i64:$reg,
                                               tglobaltlsaddr:$disp))]>,
                    isPPC64;
-def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+def ADDIdtprelL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDIdtprelL",
                          [(set i64:$rD,
                            (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
@@ -1222,30 +1229,30 @@ def STDBRX: XForm_8_memOp<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
 let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
 def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
-                   "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   "stbu $rS, $dst", IIC_LdStSTU, []>,
                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
 def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
-                   "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   "sthu $rS, $dst", IIC_LdStSTU, []>,
                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
 def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
-                   "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   "stwu $rS, $dst", IIC_LdStSTU, []>,
                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
 
 def STBUX8: XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
                           (ins g8rc:$rS, memrr:$dst),
-                          "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+                          "stbux $rS, $dst", IIC_LdStSTUX, []>,
                           RegConstraint<"$dst.ptrreg = $ea_res">,
                           NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STHUX8: XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
                           (ins g8rc:$rS, memrr:$dst),
-                          "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+                          "sthux $rS, $dst", IIC_LdStSTUX, []>,
                           RegConstraint<"$dst.ptrreg = $ea_res">,
                           NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
                           (ins g8rc:$rS, memrr:$dst),
-                          "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+                          "stwux $rS, $dst", IIC_LdStSTUX, []>,
                           RegConstraint<"$dst.ptrreg = $ea_res">,
                           NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
@@ -1253,13 +1260,13 @@ def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
 
 def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res),
                    (ins g8rc:$rS, memrix:$dst),
-                   "stdu $rS, $dst", IIC_LdStSTDU, []>,
+                   "stdu $rS, $dst", IIC_LdStSTU, []>,
                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
                    isPPC64;
 
 def STDUX : XForm_8_memOp<31, 181, (outs ptr_rc_nor0:$ea_res),
                           (ins g8rc:$rS, memrr:$dst),
-                          "stdux $rS, $dst", IIC_LdStSTDUX, []>,
+                          "stdux $rS, $dst", IIC_LdStSTUX, []>,
                           RegConstraint<"$dst.ptrreg = $ea_res">,
                           NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked, isPPC64;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 24969d7ef853773e75f4fea921f3a2f820cd8ca3..69b19e45c3e9ccb9c118d210d1557ae1f93435ce 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1051,6 +1051,20 @@ def : Pat<(v4f32 (ftrunc v4f32:$vA)),
 def : Pat<(v4f32 (fnearbyint v4f32:$vA)),
           (VRFIN $vA)>;
 
+// Vector selection
+def : Pat<(v16i8 (vselect v16i8:$vA, v16i8:$vB, v16i8:$vC)),
+          (VSEL $vC, $vB, $vA)>;
+def : Pat<(v8i16 (vselect v8i16:$vA, v8i16:$vB, v8i16:$vC)),
+          (VSEL $vC, $vB, $vA)>;
+def : Pat<(v4i32 (vselect v4i32:$vA, v4i32:$vB, v4i32:$vC)),
+          (VSEL $vC, $vB, $vA)>;
+def : Pat<(v2i64 (vselect v2i64:$vA, v2i64:$vB, v2i64:$vC)),
+          (VSEL $vC, $vB, $vA)>;
+def : Pat<(v4f32 (vselect v4i32:$vA, v4f32:$vB, v4f32:$vC)),
+          (VSEL $vC, $vB, $vA)>;
+def : Pat<(v2f64 (vselect v2i64:$vA, v2f64:$vB, v2f64:$vC)),
+          (VSEL $vC, $vB, $vA)>;
+
 } // end HasAltivec
 
 def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">;
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index f5f4b46344cff8959e347a437cd49b894229b33c..2fe765dd99e1e90fdd104f6fcdbf737486cddfe1 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -2153,7 +2153,9 @@ class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
 }
 
 //===----------------------------------------------------------------------===//
-class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+// EmitTimePseudo won't have encoding information for the [MC]CodeEmitter
+// stuff
+class PPCEmitTimePseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
     : I<0, OOL, IOL, asmstr, NoItinerary> {
   let isCodeGenOnly = 1;
   let PPC64 = 0;
@@ -2162,6 +2164,21 @@ class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
   let hasNoSchedulingInfo = 1;
 }
 
+// Instruction that require custom insertion support
+// a.k.a. ISelPseudos, however, these won't have isPseudo set
+class PPCCustomInserterPseudo<dag OOL, dag IOL, string asmstr,
+                              list<dag> pattern>
+    : PPCEmitTimePseudo<OOL, IOL, asmstr, pattern> {
+  let usesCustomInserter = 1;
+}
+
+// PostRAPseudo will be expanded in expandPostRAPseudo, isPseudo flag in td
+// files is set only for PostRAPseudo
+class PPCPostRAExpPseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+    : PPCEmitTimePseudo<OOL, IOL, asmstr, pattern> {
+  let isPseudo = 1;
+}
+
 class PseudoXFormMemOp<dag OOL, dag IOL, string asmstr, list<dag> pattern>
-    : Pseudo<OOL, IOL, asmstr, pattern>, XFormMemOp;
+    : PPCPostRAExpPseudo<OOL, IOL, asmstr, pattern>, XFormMemOp;
 
diff --git a/lib/Target/PowerPC/PPCInstrHTM.td b/lib/Target/PowerPC/PPCInstrHTM.td
index 6c4e2129087cabb135101cadff829482a94fb6c9..0efe797c765daa7107c0fdf5e1a2a80aa925c705 100644
--- a/lib/Target/PowerPC/PPCInstrHTM.td
+++ b/lib/Target/PowerPC/PPCInstrHTM.td
@@ -20,8 +20,8 @@ def HTM_get_imm : SDNodeXForm<imm, [{
   return getI32Imm (N->getZExtValue(), SDLoc(N));
 }]>;
 
-let hasSideEffects = 1, usesCustomInserter = 1  in {
-def TCHECK_RET : Pseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>;
+let hasSideEffects = 1 in {
+def TCHECK_RET : PPCCustomInserterPseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>;
 }
 
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 559ed59bec9f1f08f9f1ba02bcf639e935aeb7a9..d26af451303b6368e0e513033860d47ffa31b104 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1429,17 +1429,15 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
                                       : (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
     } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
       MI.setDesc(get(PPC::BCLR));
-      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
-          .addReg(Pred[1].getReg());
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
     } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
       MI.setDesc(get(PPC::BCLRn));
-      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
-          .addReg(Pred[1].getReg());
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
     } else {
       MI.setDesc(get(PPC::BCCLR));
       MachineInstrBuilder(*MI.getParent()->getParent(), MI)
           .addImm(Pred[0].getImm())
-          .addReg(Pred[1].getReg());
+          .add(Pred[1]);
     }
 
     return true;
@@ -1454,7 +1452,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
 
       MI.setDesc(get(PPC::BC));
       MachineInstrBuilder(*MI.getParent()->getParent(), MI)
-          .addReg(Pred[1].getReg())
+          .add(Pred[1])
           .addMBB(MBB);
     } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
       MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
@@ -1462,7 +1460,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
 
       MI.setDesc(get(PPC::BCn));
       MachineInstrBuilder(*MI.getParent()->getParent(), MI)
-          .addReg(Pred[1].getReg())
+          .add(Pred[1])
           .addMBB(MBB);
     } else {
       MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
@@ -1471,13 +1469,13 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
       MI.setDesc(get(PPC::BCC));
       MachineInstrBuilder(*MI.getParent()->getParent(), MI)
           .addImm(Pred[0].getImm())
-          .addReg(Pred[1].getReg())
+          .add(Pred[1])
           .addMBB(MBB);
     }
 
     return true;
-  } else if (OpC == PPC::BCTR  || OpC == PPC::BCTR8 ||
-             OpC == PPC::BCTRL || OpC == PPC::BCTRL8) {
+  } else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || OpC == PPC::BCTRL ||
+             OpC == PPC::BCTRL8) {
     if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR)
       llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
 
@@ -1487,14 +1485,12 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
     if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
       MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8)
                              : (setLR ? PPC::BCCTRL : PPC::BCCTR)));
-      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
-          .addReg(Pred[1].getReg());
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
       return true;
     } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
       MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n)
                              : (setLR ? PPC::BCCTRLn : PPC::BCCTRn)));
-      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
-          .addReg(Pred[1].getReg());
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
       return true;
     }
 
@@ -1502,7 +1498,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
                            : (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
     MachineInstrBuilder(*MI.getParent()->getParent(), MI)
         .addImm(Pred[0].getImm())
-        .addReg(Pred[1].getReg());
+        .add(Pred[1]);
     return true;
   }
 
@@ -2984,8 +2980,10 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
       if (PostRA) {
         if (isVFReg(MI.getOperand(0).getReg()))
           III.ImmOpcode = PPC::LXSSP;
-        else
+        else {
           III.ImmOpcode = PPC::LFS;
+          III.ImmMustBeMultipleOf = 1;
+        }
         break;
       }
       LLVM_FALLTHROUGH;
@@ -2996,8 +2994,10 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
       if (PostRA) {
         if (isVFReg(MI.getOperand(0).getReg()))
           III.ImmOpcode = PPC::LXSD;
-        else
+        else {
           III.ImmOpcode = PPC::LFD;
+          III.ImmMustBeMultipleOf = 1;
+        }
         break;
       }
       LLVM_FALLTHROUGH;
@@ -3012,8 +3012,10 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
       if (PostRA) {
         if (isVFReg(MI.getOperand(0).getReg()))
           III.ImmOpcode = PPC::STXSSP;
-        else
+        else {
           III.ImmOpcode = PPC::STFS;
+          III.ImmMustBeMultipleOf = 1;
+        }
         break;
       }
       LLVM_FALLTHROUGH;
@@ -3024,8 +3026,10 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
       if (PostRA) {
         if (isVFReg(MI.getOperand(0).getReg()))
           III.ImmOpcode = PPC::STXSD;
-        else
+        else {
           III.ImmOpcode = PPC::STFD;
+          III.ImmMustBeMultipleOf = 1;
+        }
         break;
       }
       LLVM_FALLTHROUGH;
@@ -3484,6 +3488,7 @@ static bool isSignExtendingOp(const MachineInstr &MI) {
       Opcode == PPC::EXTSH  || Opcode == PPC::EXTSHo  ||
       Opcode == PPC::EXTSB8 || Opcode == PPC::EXTSH8  ||
       Opcode == PPC::EXTSW  || Opcode == PPC::EXTSWo  ||
+      Opcode == PPC::SETB   || Opcode == PPC::SETB8   ||
       Opcode == PPC::EXTSH8_32_64 || Opcode == PPC::EXTSW_32_64 ||
       Opcode == PPC::EXTSB8_32_64)
     return true;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index d7e32a5d89f12f13c014e26f0f5a3dfa3707397c..dd3f1ac790894961d9550ffd2380c8cbd4f52424 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1195,77 +1195,76 @@ multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
 //===----------------------------------------------------------------------===//
 // PowerPC Instruction Definitions.
 
-// Pseudo-instructions:
+// Pseudo instructions:
 
 let hasCtrlDep = 1 in {
 let Defs = [R1], Uses = [R1] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+def ADJCALLSTACKDOWN : PPCEmitTimePseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
                               "#ADJCALLSTACKDOWN $amt1 $amt2",
                               [(callseq_start timm:$amt1, timm:$amt2)]>;
-def ADJCALLSTACKUP   : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+def ADJCALLSTACKUP   : PPCEmitTimePseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
                               "#ADJCALLSTACKUP $amt1 $amt2",
                               [(callseq_end timm:$amt1, timm:$amt2)]>;
 }
 
-def UPDATE_VRSAVE    : Pseudo<(outs gprc:$rD), (ins gprc:$rS),
+def UPDATE_VRSAVE    : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$rS),
                               "UPDATE_VRSAVE $rD, $rS", []>;
 }
 
 let Defs = [R1], Uses = [R1] in
-def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
+def DYNALLOC : PPCEmitTimePseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
                        [(set i32:$result,
                              (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
-def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
+def DYNAREAOFFSET : PPCEmitTimePseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
                        [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
                          
 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
 // instruction selection into a branch sequence.
-let usesCustomInserter = 1,    // Expanded after instruction selection.
-    PPC970_Single = 1 in {
+let PPC970_Single = 1 in {
   // Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes
   // because either operand might become the first operand in an isel, and
   // that operand cannot be r0.
-  def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond,
+  def SELECT_CC_I4 : PPCCustomInserterPseudo<(outs gprc:$dst), (ins crrc:$cond,
                               gprc_nor0:$T, gprc_nor0:$F,
                               i32imm:$BROPC), "#SELECT_CC_I4",
                               []>;
-  def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond,
+  def SELECT_CC_I8 : PPCCustomInserterPseudo<(outs g8rc:$dst), (ins crrc:$cond,
                               g8rc_nox0:$T, g8rc_nox0:$F,
                               i32imm:$BROPC), "#SELECT_CC_I8",
                               []>;
-  def SELECT_CC_F4  : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F,
+  def SELECT_CC_F4  : PPCCustomInserterPseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F,
                               i32imm:$BROPC), "#SELECT_CC_F4",
                               []>;
-  def SELECT_CC_F8  : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
+  def SELECT_CC_F8  : PPCCustomInserterPseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
                               i32imm:$BROPC), "#SELECT_CC_F8",
                               []>;
-  def SELECT_CC_F16  : Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
+  def SELECT_CC_F16  : PPCCustomInserterPseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
                               i32imm:$BROPC), "#SELECT_CC_F16",
                               []>;
-  def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
+  def SELECT_CC_VRRC: PPCCustomInserterPseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
                               i32imm:$BROPC), "#SELECT_CC_VRRC",
                               []>;
 
   // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
   // register bit directly.
-  def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond,
+  def SELECT_I4 : PPCCustomInserterPseudo<(outs gprc:$dst), (ins crbitrc:$cond,
                           gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4",
                           [(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>;
-  def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
+  def SELECT_I8 : PPCCustomInserterPseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
                           g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
                           [(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
 let Predicates = [HasFPU] in {
-  def SELECT_F4  : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
+  def SELECT_F4  : PPCCustomInserterPseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
                           f4rc:$T, f4rc:$F), "#SELECT_F4",
                           [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
-  def SELECT_F8  : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
+  def SELECT_F8  : PPCCustomInserterPseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
                           f8rc:$T, f8rc:$F), "#SELECT_F8",
                           [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
-  def SELECT_F16  : Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
+  def SELECT_F16  : PPCCustomInserterPseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
                           vrrc:$T, vrrc:$F), "#SELECT_F16",
                           [(set f128:$dst, (select i1:$cond, f128:$T, f128:$F))]>;
 }
-  def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
+  def SELECT_VRRC: PPCCustomInserterPseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
                           vrrc:$T, vrrc:$F), "#SELECT_VRRC",
                           [(set v4i32:$dst,
                                 (select i1:$cond, v4i32:$T, v4i32:$F))]>;
@@ -1274,18 +1273,18 @@ let Predicates = [HasFPU] in {
 // SPILL_CR - Indicate that we're dumping the CR register, so we'll need to
 // scavenge a register for it.
 let mayStore = 1 in {
-def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F),
+def SPILL_CR : PPCEmitTimePseudo<(outs), (ins crrc:$cond, memri:$F),
                      "#SPILL_CR", []>;
-def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F),
+def SPILL_CRBIT : PPCEmitTimePseudo<(outs), (ins crbitrc:$cond, memri:$F),
                          "#SPILL_CRBIT", []>;
 }
 
 // RESTORE_CR - Indicate that we're restoring the CR register (previously
 // spilled), so we'll need to scavenge a register for it.
 let mayLoad = 1 in {
-def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F),
+def RESTORE_CR : PPCEmitTimePseudo<(outs crrc:$cond), (ins memri:$F),
                      "#RESTORE_CR", []>;
-def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F),
+def RESTORE_CRBIT : PPCEmitTimePseudo<(outs crbitrc:$cond), (ins memri:$F),
                            "#RESTORE_CRBIT", []>;
 }
 
@@ -1311,10 +1310,10 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
 }
 
 let Defs = [LR] in
-  def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
+  def MovePCtoLR : PPCEmitTimePseudo<(outs), (ins), "#MovePCtoLR", []>,
                    PPC970_Unit_BRU;
 let Defs = [LR] in
-  def MoveGOTtoLR : Pseudo<(outs), (ins), "#MoveGOTtoLR", []>,
+  def MoveGOTtoLR : PPCEmitTimePseudo<(outs), (ins), "#MoveGOTtoLR", []>,
                     PPC970_Unit_BRU;
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
@@ -1512,19 +1511,19 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
 }
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNdi :Pseudo< (outs),
+def TCRETURNdi :PPCEmitTimePseudo< (outs),
                         (ins calltarget:$dst, i32imm:$offset),
                  "#TC_RETURNd $dst $offset",
                  []>;
 
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
+def TCRETURNai :PPCEmitTimePseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
                  "#TC_RETURNa $func $offset",
                  [(PPCtc_return (i32 imm:$func), imm:$offset)]>;
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
+def TCRETURNri : PPCEmitTimePseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
                  "#TC_RETURNr $dst $offset",
                  []>;
 
@@ -1550,14 +1549,19 @@ def TAILBA   : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
 
 }
 
-let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+// While longjmp is a control-flow barrier (fallthrough isn't allowed), setjmp
+// is not.
+let hasSideEffects = 1 in {
   let Defs = [CTR] in
-  def EH_SjLj_SetJmp32  : Pseudo<(outs gprc:$dst), (ins memr:$buf),
+  def EH_SjLj_SetJmp32  : PPCCustomInserterPseudo<(outs gprc:$dst), (ins memr:$buf),
                             "#EH_SJLJ_SETJMP32",
                             [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
                           Requires<[In32BitMode]>;
+}
+
+let hasSideEffects = 1, isBarrier = 1 in {
   let isTerminator = 1 in
-  def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf),
+  def EH_SjLj_LongJmp32 : PPCCustomInserterPseudo<(outs), (ins memr:$buf),
                             "#EH_SJLJ_LONGJMP32",
                             [(PPCeh_sjlj_longjmp addr:$buf)]>,
                           Requires<[In32BitMode]>;
@@ -1567,7 +1571,7 @@ let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
 // a terminator.  Size is set to 0 to prevent the builtin assembler
 // from emitting it.
 let isBranch = 1, isTerminator = 1, Size = 0 in {
-  def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst),
+  def EH_SjLj_Setup : PPCEmitTimePseudo<(outs), (ins directbrtarget:$dst),
                         "#EH_SjLj_Setup\t$dst", []>;
 }
 
@@ -1654,119 +1658,117 @@ def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
 // clean this up in PPCMIPeephole with calls to
 // PPCInstrInfo::convertToImmediateForm() but we should probably not emit them
 // in the first place.
-let usesCustomInserter = 1 in {
-  let Defs = [CR0] in {
-    def ATOMIC_LOAD_ADD_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
-      [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_SUB_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
-      [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_AND_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
-      [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_OR_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
-      [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_XOR_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8",
-      [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_NAND_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
-      [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_MIN_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8",
-      [(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_MAX_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8",
-      [(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_UMIN_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8",
-      [(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_UMAX_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8",
-      [(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_ADD_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
-      [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_SUB_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
-      [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_AND_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
-      [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_OR_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
-      [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_XOR_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
-      [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_NAND_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
-      [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_MIN_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16",
-      [(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_MAX_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16",
-      [(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_UMIN_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16",
-      [(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_UMAX_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16",
-      [(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_ADD_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
-      [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_SUB_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
-      [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_AND_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
-      [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_OR_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
-      [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_XOR_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
-      [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_NAND_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
-      [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_MIN_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32",
-      [(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_MAX_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32",
-      [(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_UMIN_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32",
-      [(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>;
-    def ATOMIC_LOAD_UMAX_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32",
-      [(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>;
-
-    def ATOMIC_CMP_SWAP_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
-      [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
-    def ATOMIC_CMP_SWAP_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
-      [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
-    def ATOMIC_CMP_SWAP_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
-      [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
-
-    def ATOMIC_SWAP_I8 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8",
-      [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
-    def ATOMIC_SWAP_I16 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
-      [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
-    def ATOMIC_SWAP_I32 : Pseudo<
-      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
-      [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
-  }
+let Defs = [CR0] in {
+  def ATOMIC_LOAD_ADD_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
+    [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_SUB_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
+    [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_AND_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
+    [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_OR_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
+    [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_XOR_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8",
+    [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_NAND_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
+    [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_MIN_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8",
+    [(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_MAX_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8",
+    [(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_UMIN_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8",
+    [(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_UMAX_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8",
+    [(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_ADD_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
+    [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_SUB_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
+    [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_AND_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
+    [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_OR_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
+    [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_XOR_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
+    [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_NAND_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
+    [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_MIN_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16",
+    [(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_MAX_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16",
+    [(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_UMIN_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16",
+    [(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_UMAX_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16",
+    [(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_ADD_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
+    [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_SUB_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
+    [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_AND_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
+    [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_OR_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
+    [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_XOR_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
+    [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_NAND_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
+    [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_MIN_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32",
+    [(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_MAX_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32",
+    [(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_UMIN_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32",
+    [(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>;
+  def ATOMIC_LOAD_UMAX_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32",
+    [(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>;
+
+  def ATOMIC_CMP_SWAP_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
+    [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
+  def ATOMIC_CMP_SWAP_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
+    [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
+  def ATOMIC_CMP_SWAP_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
+    [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
+
+  def ATOMIC_SWAP_I8 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8",
+    [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
+  def ATOMIC_SWAP_I16 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
+    [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
+  def ATOMIC_SWAP_I32 : PPCCustomInserterPseudo<
+    (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
+    [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
 }
 
 def : Pat<(PPCatomicCmpSwap_8 xoaddr:$ptr, i32:$old, i32:$new),
@@ -1994,15 +1996,15 @@ def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
 
 // Unindexed (r+i) Stores.
 let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
-def STB  : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
-                   "stb $rS, $src", IIC_LdStStore,
-                   [(truncstorei8 i32:$rS, iaddr:$src)]>;
-def STH  : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
-                   "sth $rS, $src", IIC_LdStStore,
-                   [(truncstorei16 i32:$rS, iaddr:$src)]>;
-def STW  : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
-                   "stw $rS, $src", IIC_LdStStore,
-                   [(store i32:$rS, iaddr:$src)]>;
+def STB  : DForm_1<38, (outs), (ins gprc:$rS, memri:$dst),
+                   "stb $rS, $dst", IIC_LdStStore,
+                   [(truncstorei8 i32:$rS, iaddr:$dst)]>;
+def STH  : DForm_1<44, (outs), (ins gprc:$rS, memri:$dst),
+                   "sth $rS, $dst", IIC_LdStStore,
+                   [(truncstorei16 i32:$rS, iaddr:$dst)]>;
+def STW  : DForm_1<36, (outs), (ins gprc:$rS, memri:$dst),
+                   "stw $rS, $dst", IIC_LdStStore,
+                   [(store i32:$rS, iaddr:$dst)]>;
 let Predicates = [HasFPU] in {
 def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
                    "stfs $rS, $dst", IIC_LdStSTFD,
@@ -2016,13 +2018,13 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
 // Unindexed (r+i) Stores with Update (preinc).
 let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
 def STBU  : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
-                    "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    "stbu $rS, $dst", IIC_LdStSTU, []>,
                     RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
 def STHU  : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
-                    "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    "sthu $rS, $dst", IIC_LdStSTU, []>,
                     RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
 def STWU  : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
-                    "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    "stwu $rS, $dst", IIC_LdStSTU, []>,
                     RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
 let Predicates = [HasFPU] in {
 def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
@@ -2090,19 +2092,19 @@ def STFDX : XForm_28_memOp<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
 let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
 def STBUX : XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
                           (ins gprc:$rS, memrr:$dst),
-                          "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+                          "stbux $rS, $dst", IIC_LdStSTUX, []>,
                           RegConstraint<"$dst.ptrreg = $ea_res">,
                           NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STHUX : XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
                           (ins gprc:$rS, memrr:$dst),
-                          "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+                          "sthux $rS, $dst", IIC_LdStSTUX, []>,
                           RegConstraint<"$dst.ptrreg = $ea_res">,
                           NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STWUX : XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
                           (ins gprc:$rS, memrr:$dst),
-                          "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+                          "stwux $rS, $dst", IIC_LdStSTUX, []>,
                           RegConstraint<"$dst.ptrreg = $ea_res">,
                           NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
@@ -2549,8 +2551,8 @@ def MTPMR : XFXForm_1<31, 462, (outs), (ins i32imm:$SPR, gprc:$RT),
 
 // A pseudo-instruction used to implement the read of the 64-bit cycle counter
 // on a 32-bit target.
-let hasSideEffects = 1, usesCustomInserter = 1 in
-def ReadTB : Pseudo<(outs gprc:$lo, gprc:$hi), (ins),
+let hasSideEffects = 1 in
+def ReadTB : PPCCustomInserterPseudo<(outs gprc:$lo, gprc:$hi), (ins),
                     "#ReadTB", []>;
 
 let Uses = [CTR] in {
@@ -2609,13 +2611,13 @@ def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>;
 // SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
 // so we'll need to scavenge a register for it.
 let mayStore = 1 in
-def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
+def SPILL_VRSAVE : PPCEmitTimePseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
                      "#SPILL_VRSAVE", []>;
 
 // RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously
 // spilled), so we'll need to scavenge a register for it.
 let mayLoad = 1 in
-def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
+def RESTORE_VRSAVE : PPCEmitTimePseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
                      "#RESTORE_VRSAVE", []>;
 
 let hasSideEffects = 0 in {
@@ -2654,9 +2656,9 @@ def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
 } // hasSideEffects = 0
 
 let Predicates = [HasFPU] in {
-// Pseudo instruction to perform FADD in round-to-zero mode.
-let usesCustomInserter = 1, Uses = [RM] in {
-  def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
+// Custom inserter instruction to perform FADD in round-to-zero mode.
+let Uses = [RM] in {
+  def FADDrtz: PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
                       [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>;
 }
 
@@ -3028,23 +3030,23 @@ def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
           (ADDIS $in, tblockaddress:$g)>;
 
 // Support for thread-local storage.
-def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT", 
+def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT", 
                 [(set i32:$rD, (PPCppc32GOT))]>;
 
 // Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
 // This uses two output registers, the first as the real output, the second as a
 // temporary register, used internally in code generation.
-def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", 
+def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", 
                 []>, NoEncode<"$rT">;
 
-def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
+def LDgotTprelL32: PPCEmitTimePseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
                            "#LDgotTprelL32",
                            [(set i32:$rD,
                              (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
 def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
           (ADD4TLS $in, tglobaltlsaddr:$g)>;
 
-def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+def ADDItlsgdL32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
                          "#ADDItlsgdL32",
                          [(set i32:$rD,
                            (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>;
@@ -3052,7 +3054,7 @@ def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
 // explicitly defined when this op is created, so not mentioned here.
 let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
     Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
-def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+def GETtlsADDR32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
                           "GETtlsADDR32",
                           [(set i32:$rD,
                             (PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>;
@@ -3060,14 +3062,14 @@ def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
 // are true defines while the rest of the Defs are clobbers.
 let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
     Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
-def ADDItlsgdLADDR32 : Pseudo<(outs gprc:$rD),
+def ADDItlsgdLADDR32 : PPCEmitTimePseudo<(outs gprc:$rD),
                               (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
                               "#ADDItlsgdLADDR32",
                               [(set i32:$rD,
                                 (PPCaddiTlsgdLAddr i32:$reg,
                                                    tglobaltlsaddr:$disp,
                                                    tglobaltlsaddr:$sym))]>;
-def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+def ADDItlsldL32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
                           "#ADDItlsldL32",
                           [(set i32:$rD,
                             (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
@@ -3075,7 +3077,7 @@ def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
 // explicitly defined when this op is created, so not mentioned here.
 let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
     Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
-def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+def GETtlsldADDR32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
                             "GETtlsldADDR32",
                             [(set i32:$rD,
                               (PPCgetTlsldAddr i32:$reg,
@@ -3084,31 +3086,31 @@ def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
 // are true defines while the rest of the Defs are clobbers.
 let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
     Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
-def ADDItlsldLADDR32 : Pseudo<(outs gprc:$rD),
+def ADDItlsldLADDR32 : PPCEmitTimePseudo<(outs gprc:$rD),
                               (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
                               "#ADDItlsldLADDR32",
                               [(set i32:$rD,
                                 (PPCaddiTlsldLAddr i32:$reg,
                                                    tglobaltlsaddr:$disp,
                                                    tglobaltlsaddr:$sym))]>;
-def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+def ADDIdtprelL32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
                            "#ADDIdtprelL32",
                            [(set i32:$rD,
                              (PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>;
-def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+def ADDISdtprelHA32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
                             "#ADDISdtprelHA32",
                             [(set i32:$rD,
                               (PPCaddisDtprelHA i32:$reg,
                                                 tglobaltlsaddr:$disp))]>;
 
 // Support for Position-independent code
-def LWZtoc : Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
+def LWZtoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
                    "#LWZtoc",
                    [(set i32:$rD,
                       (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
 // Get Global (GOT) Base Register offset, from the word immediately preceding
 // the function label.
-def UpdateGBR : Pseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
+def UpdateGBR : PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
 
 
 // Standard shifts.  These are represented separately from the real shifts above
@@ -3936,21 +3938,19 @@ def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGT)),
 def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)),
           (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>;
 
-let usesCustomInserter = 1 in {
-def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+def ANDIo_1_EQ_BIT : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins gprc:$in),
                              "#ANDIo_1_EQ_BIT",
                              [(set i1:$dst, (trunc (not i32:$in)))]>;
-def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+def ANDIo_1_GT_BIT : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins gprc:$in),
                              "#ANDIo_1_GT_BIT",
                              [(set i1:$dst, (trunc i32:$in))]>;
 
-def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+def ANDIo_1_EQ_BIT8 : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins g8rc:$in),
                               "#ANDIo_1_EQ_BIT8",
                               [(set i1:$dst, (trunc (not i64:$in)))]>;
-def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+def ANDIo_1_GT_BIT8 : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins g8rc:$in),
                               "#ANDIo_1_GT_BIT8",
                               [(set i1:$dst, (trunc i64:$in))]>;
-}
 
 def : Pat<(i1 (not (trunc i32:$in))),
            (ANDIo_1_EQ_BIT $in)>;
diff --git a/lib/Target/PowerPC/PPCInstrQPX.td b/lib/Target/PowerPC/PPCInstrQPX.td
index c4bb02695b360ffb4989dd81c9a8159030d87856..ef589ad01fd7fa9e496178e7fc83be6bd887ff6f 100644
--- a/lib/Target/PowerPC/PPCInstrQPX.td
+++ b/lib/Target/PowerPC/PPCInstrQPX.td
@@ -245,32 +245,30 @@ let Uses = [RM] in {
 
   // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
   // instruction selection into a branch sequence.
-  let usesCustomInserter = 1 in {
-    def SELECT_CC_QFRC: Pseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F,
-                                i32imm:$BROPC), "#SELECT_CC_QFRC",
-                                []>;
-    def SELECT_CC_QSRC: Pseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F,
-                                i32imm:$BROPC), "#SELECT_CC_QSRC",
-                                []>;
-    def SELECT_CC_QBRC: Pseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F,
-                                i32imm:$BROPC), "#SELECT_CC_QBRC",
-                                []>;
-
-    // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
-    // register bit directly.
-    def SELECT_QFRC: Pseudo<(outs qfrc:$dst), (ins crbitrc:$cond,
-                            qfrc:$T, qfrc:$F), "#SELECT_QFRC",
-                            [(set v4f64:$dst,
-                                  (select i1:$cond, v4f64:$T, v4f64:$F))]>;
-    def SELECT_QSRC: Pseudo<(outs qsrc:$dst), (ins crbitrc:$cond,
-                            qsrc:$T, qsrc:$F), "#SELECT_QSRC",
-                            [(set v4f32:$dst,
-                                  (select i1:$cond, v4f32:$T, v4f32:$F))]>;
-    def SELECT_QBRC: Pseudo<(outs qbrc:$dst), (ins crbitrc:$cond,
-                            qbrc:$T, qbrc:$F), "#SELECT_QBRC",
-                            [(set v4i1:$dst,
-                                  (select i1:$cond, v4i1:$T, v4i1:$F))]>;
-  }
+  def SELECT_CC_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_QFRC",
+                              []>;
+  def SELECT_CC_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_QSRC",
+                              []>;
+  def SELECT_CC_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_QBRC",
+                              []>;
+
+  // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
+  // register bit directly.
+  def SELECT_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crbitrc:$cond,
+                          qfrc:$T, qfrc:$F), "#SELECT_QFRC",
+                          [(set v4f64:$dst,
+                                (select i1:$cond, v4f64:$T, v4f64:$F))]>;
+  def SELECT_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crbitrc:$cond,
+                          qsrc:$T, qsrc:$F), "#SELECT_QSRC",
+                          [(set v4f32:$dst,
+                                (select i1:$cond, v4f32:$T, v4f32:$F))]>;
+  def SELECT_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crbitrc:$cond,
+                          qbrc:$T, qbrc:$F), "#SELECT_QBRC",
+                          [(set v4i1:$dst,
+                                (select i1:$cond, v4i1:$T, v4i1:$F))]>;
 
   // Convert and Round Instructions
   def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>;
diff --git a/lib/Target/PowerPC/PPCInstrSPE.td b/lib/Target/PowerPC/PPCInstrSPE.td
index 96649efdc1bc7152d89ac3e3e99ba283a8fa9a76..9f5891a45f22346e6b4e63e8d91538c131bd4cc8 100644
--- a/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/lib/Target/PowerPC/PPCInstrSPE.td
@@ -831,22 +831,20 @@ def : Pat<(f64 (fpextend f32:$src)),
 }
 
 let Predicates = [HasSPE] in {
-  let usesCustomInserter = 1 in {
-def SELECT_CC_SPE4 : Pseudo<(outs spe4rc:$dst),
+def SELECT_CC_SPE4 : PPCCustomInserterPseudo<(outs spe4rc:$dst),
                             (ins crrc:$cond, spe4rc:$T, spe4rc:$F,
                             i32imm:$BROPC), "#SELECT_CC_SPE4",
                             []>;
-def SELECT_CC_SPE  : Pseudo<(outs sperc:$dst),
+def SELECT_CC_SPE  : PPCCustomInserterPseudo<(outs sperc:$dst),
                             (ins crrc:$cond, sperc:$T, sperc:$F, i32imm:$BROPC),
                             "#SELECT_CC_SPE",
                             []>;
-def SELECT_SPE4  : Pseudo<(outs spe4rc:$dst), (ins crbitrc:$cond,
+def SELECT_SPE4  : PPCCustomInserterPseudo<(outs spe4rc:$dst), (ins crbitrc:$cond,
                           spe4rc:$T, spe4rc:$F), "#SELECT_SPE4",
                           [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
-def SELECT_SPE   : Pseudo<(outs sperc:$dst), (ins crbitrc:$cond,
+def SELECT_SPE   : PPCCustomInserterPseudo<(outs sperc:$dst), (ins crbitrc:$cond,
                           sperc:$T, sperc:$F), "#SELECT_SPE",
                           [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
-  }
 
 def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
           (SELECT_SPE4 (CRANDC $lhs, $rhs), $tval, $fval)>;
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 9d462df6fef5f41deb24716b38fdbad8b3ec8b47..8321d7f2ecf06d0640d7147316f56cfb3cbc3941 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -67,6 +67,10 @@ def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
 def SDTVecConv : SDTypeProfile<1, 2, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
 ]>;
+def SDTVabsd : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>
+]>;
+
 
 def PPClxvd2x  : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -79,6 +83,7 @@ def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>;
 def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>;
 def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
 def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
+def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
 
 multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
                     string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -132,7 +137,7 @@ let Uses = [RM] in {
                         []>;
 
     // Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later
-    let isPseudo = 1, CodeSize = 3 in
+    let CodeSize = 3 in
       def XFLOADf64  : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
                               "#XFLOADf64",
                               [(set f64:$XT, (load xoaddr:$src))]>;
@@ -163,7 +168,7 @@ let Uses = [RM] in {
                         []>;
 
     // Pseudo instruction XFSTOREf64  will be expanded to STXSDX or STFDX later
-    let isPseudo = 1, CodeSize = 3 in
+    let CodeSize = 3 in
       def XFSTOREf64 : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
                               "#XFSTOREf64",
                               [(store f64:$XT, xoaddr:$dst)]>;
@@ -898,37 +903,36 @@ let Uses = [RM] in {
 
 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
 // instruction selection into a branch sequence.
-let usesCustomInserter = 1,    // Expanded after instruction selection.
-    PPC970_Single = 1 in {
+let PPC970_Single = 1 in {
 
-  def SELECT_CC_VSRC: Pseudo<(outs vsrc:$dst),
+  def SELECT_CC_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst),
                              (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC),
                              "#SELECT_CC_VSRC",
                              []>;
-  def SELECT_VSRC: Pseudo<(outs vsrc:$dst),
+  def SELECT_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst),
                           (ins crbitrc:$cond, vsrc:$T, vsrc:$F),
                           "#SELECT_VSRC",
                           [(set v2f64:$dst,
                                 (select i1:$cond, v2f64:$T, v2f64:$F))]>;
-  def SELECT_CC_VSFRC: Pseudo<(outs f8rc:$dst),
+  def SELECT_CC_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst),
                               (ins crrc:$cond, f8rc:$T, f8rc:$F,
                                i32imm:$BROPC), "#SELECT_CC_VSFRC",
                               []>;
-  def SELECT_VSFRC: Pseudo<(outs f8rc:$dst),
+  def SELECT_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst),
                            (ins crbitrc:$cond, f8rc:$T, f8rc:$F),
                            "#SELECT_VSFRC",
                            [(set f64:$dst,
                                  (select i1:$cond, f64:$T, f64:$F))]>;
-  def SELECT_CC_VSSRC: Pseudo<(outs f4rc:$dst),
+  def SELECT_CC_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst),
                               (ins crrc:$cond, f4rc:$T, f4rc:$F,
                                i32imm:$BROPC), "#SELECT_CC_VSSRC",
                               []>;
-  def SELECT_VSSRC: Pseudo<(outs f4rc:$dst),
+  def SELECT_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst),
                            (ins crbitrc:$cond, f4rc:$T, f4rc:$F),
                            "#SELECT_VSSRC",
                            [(set f32:$dst,
                                  (select i1:$cond, f32:$T, f32:$F))]>;
-} // usesCustomInserter
+} 
 } // AddedComplexity
 
 def : InstAlias<"xvmovdp $XT, $XB",
@@ -1152,6 +1156,26 @@ def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A),
 def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A),
           (XVRSQRTEDP $A)>;
 
+// Vector selection
+def : Pat<(v16i8 (vselect v16i8:$vA, v16i8:$vB, v16i8:$vC)),
+          (COPY_TO_REGCLASS 
+                 (XXSEL (COPY_TO_REGCLASS $vC, VSRC),
+                        (COPY_TO_REGCLASS $vB, VSRC), 
+                        (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
+def : Pat<(v8i16 (vselect v8i16:$vA, v8i16:$vB, v8i16:$vC)),
+          (COPY_TO_REGCLASS 
+                 (XXSEL (COPY_TO_REGCLASS $vC, VSRC),
+                        (COPY_TO_REGCLASS $vB, VSRC), 
+                        (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
+def : Pat<(vselect v4i32:$vA, v4i32:$vB, v4i32:$vC),
+          (XXSEL $vC, $vB, $vA)>;
+def : Pat<(vselect v2i64:$vA, v2i64:$vB, v2i64:$vC),
+          (XXSEL $vC, $vB, $vA)>;
+def : Pat<(vselect v4i32:$vA, v4f32:$vB, v4f32:$vC),
+          (XXSEL $vC, $vB, $vA)>;
+def : Pat<(vselect v2i64:$vA, v2f64:$vB, v2f64:$vC),
+          (XXSEL $vC, $vB, $vA)>;
+
 let Predicates = [IsLittleEndian] in {
 def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
           (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
@@ -1234,23 +1258,19 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
     def LXSIWZX : XX1Form_memOp<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
                           "lxsiwzx $XT, $src", IIC_LdStLFD, []>;
 
-    // Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it
-    // would cause these Pseudos are not expanded in expandPostRAPseudos()
-    let isPseudo = 1 in {
-      // Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later
-      let CodeSize = 3 in
-      def XFLOADf32  : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src),
-                              "#XFLOADf32",
-                              [(set f32:$XT, (load xoaddr:$src))]>;
-      // Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later
-      def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
-                         "#LIWAX",
-                         [(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
-      // Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later
-      def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
-                         "#LIWZX",
-                         [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
-    }
+    // Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later
+    let CodeSize = 3 in
+    def XFLOADf32  : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src),
+                            "#XFLOADf32",
+                            [(set f32:$XT, (load xoaddr:$src))]>;
+    // Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later
+    def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
+                       "#LIWAX",
+                       [(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
+    // Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later
+    def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src),
+                       "#LIWZX",
+                       [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
   } // mayLoad
 
   // VSX scalar stores introduced in ISA 2.07
@@ -1261,19 +1281,15 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
     def STXSIWX : XX1Form_memOp<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
                           "stxsiwx $XT, $dst", IIC_LdStSTFD, []>;
 
-    // Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it
-    // would cause these Pseudos are not expanded in expandPostRAPseudos()
-    let isPseudo = 1 in {
-      // Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later
-      let CodeSize = 3 in
-      def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst),
-                              "#XFSTOREf32",
-                              [(store f32:$XT, xoaddr:$dst)]>;
-      // Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later
-      def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
-                         "#STIWX",
-                        [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
-    }
+    // Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later
+    let CodeSize = 3 in
+    def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst),
+                            "#XFSTOREf32",
+                            [(store f32:$XT, xoaddr:$dst)]>;
+    // Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later
+    def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst),
+                       "#STIWX",
+                      [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
   } // mayStore
   } // UseVSXReg = 1
 
@@ -3242,20 +3258,19 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
   def : Pat<(f64 (PPCVexts f64:$A, 2)),
             (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>;
 
-  let isPseudo = 1 in {
-    def DFLOADf32  : Pseudo<(outs vssrc:$XT), (ins memrix:$src),
-                            "#DFLOADf32",
-                            [(set f32:$XT, (load ixaddr:$src))]>;
-    def DFLOADf64  : Pseudo<(outs vsfrc:$XT), (ins memrix:$src),
-                            "#DFLOADf64",
-                            [(set f64:$XT, (load ixaddr:$src))]>;
-    def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst),
-                            "#DFSTOREf32",
-                            [(store f32:$XT, ixaddr:$dst)]>;
-    def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
-                            "#DFSTOREf64",
-                            [(store f64:$XT, ixaddr:$dst)]>;
-  }
+  def DFLOADf32  : PPCPostRAExpPseudo<(outs vssrc:$XT), (ins memrix:$src),
+                          "#DFLOADf32",
+                          [(set f32:$XT, (load ixaddr:$src))]>;
+  def DFLOADf64  : PPCPostRAExpPseudo<(outs vsfrc:$XT), (ins memrix:$src),
+                          "#DFLOADf64",
+                          [(set f64:$XT, (load ixaddr:$src))]>;
+  def DFSTOREf32 : PPCPostRAExpPseudo<(outs), (ins vssrc:$XT, memrix:$dst),
+                          "#DFSTOREf32",
+                          [(store f32:$XT, ixaddr:$dst)]>;
+  def DFSTOREf64 : PPCPostRAExpPseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
+                          "#DFSTOREf64",
+                          [(store f64:$XT, ixaddr:$dst)]>;
+
   def : Pat<(f64 (extloadf32 ixaddr:$src)),
             (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>;
   def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))),
@@ -3537,22 +3552,20 @@ let AddedComplexity = 400 in {
 }
 
 let Predicates = [HasP9Vector] in {
-  let isPseudo = 1 in {
-    let mayStore = 1 in {
-      def SPILLTOVSR_STX : PseudoXFormMemOp<(outs),
-                                            (ins spilltovsrrc:$XT, memrr:$dst),
-                                            "#SPILLTOVSR_STX", []>;
-      def SPILLTOVSR_ST : Pseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst),
-                                "#SPILLTOVSR_ST", []>;
-    }
-    let mayLoad = 1 in {
-      def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT),
-                                            (ins memrr:$src),
-                                            "#SPILLTOVSR_LDX", []>;
-      def SPILLTOVSR_LD : Pseudo<(outs spilltovsrrc:$XT), (ins memrix:$src),
-                                "#SPILLTOVSR_LD", []>;
+  let mayStore = 1 in {
+    def SPILLTOVSR_STX : PseudoXFormMemOp<(outs),
+                                          (ins spilltovsrrc:$XT, memrr:$dst),
+                                          "#SPILLTOVSR_STX", []>;
+    def SPILLTOVSR_ST : PPCPostRAExpPseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst),
+                              "#SPILLTOVSR_ST", []>;
+  }
+  let mayLoad = 1 in {
+    def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT),
+                                          (ins memrr:$src),
+                                          "#SPILLTOVSR_LDX", []>;
+    def SPILLTOVSR_LD : PPCPostRAExpPseudo<(outs spilltovsrrc:$XT), (ins memrix:$src),
+                              "#SPILLTOVSR_LD", []>;
 
-    }
   }
 }
 // Integer extend helper dags 32 -> 64
@@ -4009,3 +4022,21 @@ let AddedComplexity = 400 in {
   }
 }
 
+// Put this P9Altivec related definition here since it's possible to be 
+// selected to VSX instruction xvnegsp, avoid possible undef.
+let Predicates = [HasP9Altivec] in {
+
+  def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 0))),
+            (v4i32 (VABSDUW $A, $B))>;
+
+  def : Pat<(v8i16 (PPCvabsd v8i16:$A, v8i16:$B, (i32 0))),
+            (v8i16 (VABSDUH $A, $B))>;
+
+  def : Pat<(v16i8 (PPCvabsd v16i8:$A, v16i8:$B, (i32 0))),
+            (v16i8 (VABSDUB $A, $B))>;
+
+  // As PPCVABSD description, the last operand indicates whether do the
+  // sign bit flip.
+  def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))),
+            (v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/PowerPC/PPCPfmCounters.td
similarity index 51%
rename from lib/Target/AMDGPU/AMDGPUIntrinsics.td
rename to lib/Target/PowerPC/PPCPfmCounters.td
index 230a04628504784b9892308053065c613963a57f..d2a09f30c0f3e58a826ee8645e7617137d30e3e1 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/lib/Target/PowerPC/PPCPfmCounters.td
@@ -1,4 +1,4 @@
-//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
+//===-- PPCPfmCounters.td - PPC Hardware Counters ----------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines intrinsics that are used by all hw codegen targets.
+// This describes the available hardware counters for PPC.
 //
 //===----------------------------------------------------------------------===//
 
-let TargetPrefix = "AMDGPU", isTarget = 1 in {
-  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
+def CpuCyclesPfmCounter : PfmCounter<"CYCLES">;
+
+def DefaultPfmCounters : ProcPfmCounters {
+  let CycleCounter = CpuCyclesPfmCounter;
 }
+def : PfmCountersDefaultBinding<DefaultPfmCounters>;
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 5ad0a517c1175db1480236b9b4f256fb1c91e983..c8fe7d7eea78382c61949789d28cedd11862742f 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -42,7 +42,6 @@ def IIC_LdStLoad     : InstrItinClass;
 def IIC_LdStLoadUpd  : InstrItinClass;
 def IIC_LdStLoadUpdX : InstrItinClass;
 def IIC_LdStStore    : InstrItinClass;
-def IIC_LdStStoreUpd : InstrItinClass;
 def IIC_LdStDSS      : InstrItinClass;
 def IIC_LdStICBI     : InstrItinClass;
 def IIC_LdStLD       : InstrItinClass;
@@ -63,8 +62,8 @@ def IIC_LdStSLBIA    : InstrItinClass;
 def IIC_LdStSLBIE    : InstrItinClass;
 def IIC_LdStSTD      : InstrItinClass;
 def IIC_LdStSTDCX    : InstrItinClass;
-def IIC_LdStSTDU     : InstrItinClass;
-def IIC_LdStSTDUX    : InstrItinClass;
+def IIC_LdStSTU      : InstrItinClass;
+def IIC_LdStSTUX     : InstrItinClass;
 def IIC_LdStSTFD     : InstrItinClass;
 def IIC_LdStSTFDU    : InstrItinClass;
 def IIC_LdStSTVEBX   : InstrItinClass;
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index 2455e5e52de5e385237c15040a8891ce1380e741..646822eedbe078560181d907f0516c5bb35a3fba 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -280,13 +280,6 @@ def PPC440Itineraries : ProcessorItineraries<
                                  InstrStage<2, [P440_LWB]>],
                                 [1, 1, 1],
                                 [NoBypass, P440_GPR_Bypass]>,
-  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
-                                 InstrStage<1, [P440_LRACC]>,
-                                 InstrStage<1, [P440_AGEN]>,
-                                 InstrStage<1, [P440_CRD]>,
-                                 InstrStage<2, [P440_LWB]>],
-                                [2, 1, 1, 1],
-                                [NoBypass, P440_GPR_Bypass]>,
   InstrItinData<IIC_LdStICBI,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
                                  InstrStage<1, [P440_LRACC]>,
                                  InstrStage<1, [P440_AGEN]>,
@@ -373,14 +366,14 @@ def PPC440Itineraries : ProcessorItineraries<
                                  InstrStage<2, [P440_LWB]>],
                                 [4, 1, 1],
                                 [NoBypass, P440_GPR_Bypass]>,
-  InstrItinData<IIC_LdStSTDU,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+  InstrItinData<IIC_LdStSTU,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
                                  InstrStage<1, [P440_LRACC]>,
                                  InstrStage<1, [P440_AGEN]>,
                                  InstrStage<1, [P440_CRD]>,
                                  InstrStage<2, [P440_LWB]>],
                                 [2, 1, 1, 1],
                                 [NoBypass, P440_GPR_Bypass]>,
-  InstrItinData<IIC_LdStSTDUX,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+  InstrItinData<IIC_LdStSTUX,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
                                  InstrStage<1, [P440_LRACC]>,
                                  InstrStage<1, [P440_AGEN]>,
                                  InstrStage<1, [P440_CRD]>,
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 54cfae5d74b7bcf2a825d5a77386ee2ba4fa495d..f34c1accc0fd9d2446ab1861a795a359029a9e09 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -81,8 +81,6 @@ def PPCA2Itineraries : ProcessorItineraries<
                                  [6, 0, 0]>,
   InstrItinData<IIC_LdStStore,   [InstrStage<1, [A2_XU]>],
                                  [0, 0, 0]>,
-  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [A2_XU]>],
-                                 [2, 0, 0, 0]>,
   InstrItinData<IIC_LdStICBI,    [InstrStage<1, [A2_XU]>],
                                  [16, 0, 0]>,
   InstrItinData<IIC_LdStSTFD,    [InstrStage<1, [A2_XU]>],
@@ -105,9 +103,9 @@ def PPCA2Itineraries : ProcessorItineraries<
                                  [82, 0, 0]>, // L2 latency
   InstrItinData<IIC_LdStSTD,     [InstrStage<1, [A2_XU]>],
                                  [0, 0, 0]>,
-  InstrItinData<IIC_LdStSTDU,    [InstrStage<1, [A2_XU]>],
+  InstrItinData<IIC_LdStSTU,     [InstrStage<1, [A2_XU]>],
                                  [2, 0, 0, 0]>,
-  InstrItinData<IIC_LdStSTDUX,   [InstrStage<1, [A2_XU]>],
+  InstrItinData<IIC_LdStSTUX,    [InstrStage<1, [A2_XU]>],
                                  [2, 0, 0, 0]>,
   InstrItinData<IIC_LdStSTDCX,   [InstrStage<1, [A2_XU]>],
                                  [82, 0, 0]>, // L2 latency
diff --git a/lib/Target/PowerPC/PPCScheduleE500.td b/lib/Target/PowerPC/PPCScheduleE500.td
index d7c2bd15a2587eebca1cef6da2d5effe10ff4d7c..479a970b2537d2bfc3576ffa6d0295d97e1b769d 100644
--- a/lib/Target/PowerPC/PPCScheduleE500.td
+++ b/lib/Target/PowerPC/PPCScheduleE500.td
@@ -144,7 +144,13 @@ def PPCE500Itineraries : ProcessorItineraries<
                                   InstrStage<1, [E500_LSU_0]>],
                                  [6, 1], // Latency = 3
                                  [NoBypass, E500_GPR_Bypass]>,
-  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+  InstrItinData<IIC_LdStSTU,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SU0, E500_SU1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStSTUX,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
                                   InstrStage<1, [E500_SU0, E500_SU1], 0>,
                                   InstrStage<1, [E500_LSU_0]>],
                                  [6, 1], // Latency = 3
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index 5f95f2a79f669fe94edc3433c0f05ce5c036cb2e..d8bda073833f8840d81a9293074c74ea3d94845a 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -157,7 +157,13 @@ def PPCE500mcItineraries : ProcessorItineraries<
                                   InstrStage<1, [E500mc_LSU_0]>],
                                  [6, 1], // Latency = 3
                                  [NoBypass, E500mc_GPR_Bypass]>,
-  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+  InstrItinData<IIC_LdStSTU,     [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
+                                  InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
+                                  InstrStage<1, [E500mc_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500mc_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStSTUX,    [InstrStage<1, [E500mc_DIS0, E500mc_DIS1], 0>,
                                   InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>,
                                   InstrStage<1, [E500mc_LSU_0]>],
                                  [6, 1], // Latency = 3
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
index 32f8e652dd5681a60e3c5fc5b582224c8973ca4e..3e50803955c45b42be16ad2f78648d5cb07a60f8 100644
--- a/lib/Target/PowerPC/PPCScheduleE5500.td
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -206,12 +206,6 @@ def PPCE5500Itineraries : ProcessorItineraries<
                                   InstrStage<1, [E5500_LSU_0]>],
                                  [7, 2], // Latency = 3, Repeat rate = 1
                                  [NoBypass, E5500_GPR_Bypass]>,
-  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
-                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
-                                  InstrStage<1, [E5500_LSU_0]>],
-                                 [7, 2], // Latency = 3, Repeat rate = 1
-                                 [NoBypass, E5500_GPR_Bypass],
-                                 2>, // 2 micro-ops
   InstrItinData<IIC_LdStICBI,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
                                   InstrStage<1, [E5500_LSU_0]>],
                                  [7, 2], // Latency = 3, Repeat rate = 1
@@ -281,13 +275,13 @@ def PPCE5500Itineraries : ProcessorItineraries<
                                   InstrStage<1, [E5500_LSU_0]>],
                                  [7, 2], // Latency = 3, Repeat rate = 1
                                  [NoBypass, E5500_GPR_Bypass]>,
-  InstrItinData<IIC_LdStSTDU,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+  InstrItinData<IIC_LdStSTU,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
                                   InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
                                   InstrStage<1, [E5500_LSU_0]>],
                                  [7, 2], // Latency = 3, Repeat rate = 1
                                  [NoBypass, E5500_GPR_Bypass],
                                  2>, // 2 micro-ops
-  InstrItinData<IIC_LdStSTDUX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+  InstrItinData<IIC_LdStSTUX,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
                                   InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
                                   InstrStage<1, [E5500_LSU_0]>],
                                  [7, 2], // Latency = 3, Repeat rate = 1
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
index 21efd8f8f6c927bcfa9f2ac2a7ebad7c0c210135..0995b7200d93d08e0e6cd0ffa052724d0d22b7d7 100644
--- a/lib/Target/PowerPC/PPCScheduleG3.td
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -43,7 +43,8 @@ def G3Itineraries : ProcessorItineraries<
   InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G3_SLU]>]>,  
   InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G3_SLU]>]>,  
   InstrItinData<IIC_LdStStore   , [InstrStage<2, [G3_SLU]>]>,
-  InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStSTU     , [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStSTUX    , [InstrStage<2, [G3_SLU]>]>,  
   InstrItinData<IIC_LdStICBI    , [InstrStage<3, [G3_SLU]>]>,
   InstrItinData<IIC_LdStSTFD    , [InstrStage<2, [G3_SLU]>]>,
   InstrItinData<IIC_LdStSTFDU   , [InstrStage<2, [G3_SLU]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
index 340773ef787685982cc8c21d07a80e7be2da8db2..1b15c7b3c7ad487666e2b1999aeb20e03fb77d7b 100644
--- a/lib/Target/PowerPC/PPCScheduleG4.td
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -48,7 +48,8 @@ def G4Itineraries : ProcessorItineraries<
   InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G4_SLU]>]>,
   InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G4_SLU]>]>,
   InstrItinData<IIC_LdStStore   , [InstrStage<2, [G4_SLU]>]>,
-  InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTU     , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTUX    , [InstrStage<2, [G4_SLU]>]>,
   InstrItinData<IIC_LdStDSS     , [InstrStage<2, [G4_SLU]>]>,
   InstrItinData<IIC_LdStICBI    , [InstrStage<2, [G4_SLU]>]>,
   InstrItinData<IIC_LdStSTFD    , [InstrStage<2, [G4_SLU]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
index 1d9f13fcb8500748c6c968da44e7934e3295af9d..0044c3c6a4490800301a7abf21de3297f7a29b2e 100644
--- a/lib/Target/PowerPC/PPCScheduleG4Plus.td
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -56,7 +56,6 @@ def G4PlusItineraries : ProcessorItineraries<
   InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G4P_SLU]>]>,
   InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G4P_SLU]>]>,
   InstrItinData<IIC_LdStStore   , [InstrStage<3, [G4P_SLU]>]>,
-  InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G4P_SLU]>]>,
   InstrItinData<IIC_LdStDSS     , [InstrStage<3, [G4P_SLU]>]>,
   InstrItinData<IIC_LdStICBI    , [InstrStage<3, [G4P_IU2]>]>,
   InstrItinData<IIC_LdStSTFD    , [InstrStage<3, [G4P_SLU]>]>,
@@ -73,8 +72,8 @@ def G4PlusItineraries : ProcessorItineraries<
   InstrItinData<IIC_LdStLWARX   , [InstrStage<3, [G4P_SLU]>]>,
   InstrItinData<IIC_LdStSTD     , [InstrStage<3, [G4P_SLU]>]>,
   InstrItinData<IIC_LdStSTDCX   , [InstrStage<3, [G4P_SLU]>]>,
-  InstrItinData<IIC_LdStSTDU    , [InstrStage<3, [G4P_SLU]>]>,  
-  InstrItinData<IIC_LdStSTDUX   , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStSTU     , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStSTUX    , [InstrStage<3, [G4P_SLU]>]>,  
   InstrItinData<IIC_LdStSTVEBX  , [InstrStage<3, [G4P_SLU]>]>,
   InstrItinData<IIC_LdStSTWCX   , [InstrStage<3, [G4P_SLU]>]>,
   InstrItinData<IIC_LdStSync    , [InstrStage<35, [G4P_SLU]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
index b5a9f96d45aef9956c2b0e894a27462f8ec359e5..c802b80170fb770c352d44cae50a27b18be17629 100644
--- a/lib/Target/PowerPC/PPCScheduleG5.td
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -54,7 +54,6 @@ def G5Itineraries : ProcessorItineraries<
   InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>,
   InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>,
   InstrItinData<IIC_LdStStore   , [InstrStage<3, [G5_SLU]>]>,
-  InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>,
   InstrItinData<IIC_LdStDSS     , [InstrStage<10, [G5_SLU]>]>,
   InstrItinData<IIC_LdStICBI    , [InstrStage<40, [G5_SLU]>]>,
   InstrItinData<IIC_LdStSTFD    , [InstrStage<4, [G5_SLU]>]>,
@@ -76,8 +75,8 @@ def G5Itineraries : ProcessorItineraries<
   InstrItinData<IIC_LdStSLBIA   , [InstrStage<40, [G5_SLU]>]>, // needs work
   InstrItinData<IIC_LdStSLBIE   , [InstrStage<2, [G5_SLU]>]>,
   InstrItinData<IIC_LdStSTD     , [InstrStage<3, [G5_SLU]>]>,
-  InstrItinData<IIC_LdStSTDU    , [InstrStage<3, [G5_SLU]>]>,
-  InstrItinData<IIC_LdStSTDUX   , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTU     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTUX    , [InstrStage<3, [G5_SLU]>]>,
   InstrItinData<IIC_LdStSTDCX   , [InstrStage<11, [G5_SLU]>]>,
   InstrItinData<IIC_LdStSTVEBX  , [InstrStage<5, [G5_SLU]>]>,
   InstrItinData<IIC_LdStSTWCX   , [InstrStage<11, [G5_SLU]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td
index 541f290fadfc2f19ac945013848c671362161967..1d6e509819da5337ca9c8cfe172b1db9c9a7e225 100644
--- a/lib/Target/PowerPC/PPCScheduleP7.td
+++ b/lib/Target/PowerPC/PPCScheduleP7.td
@@ -261,13 +261,13 @@ def P7Itineraries : ProcessorItineraries<
                                    InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                    InstrStage<1, [P7_FX1, P7_FX2]>],
                                   [1, 1, 1]>,
-  InstrItinData<IIC_LdStSTDU    , [InstrStage<1, [P7_DU1], 0>,
+  InstrItinData<IIC_LdStSTU     , [InstrStage<1, [P7_DU1], 0>,
                                    InstrStage<1, [P7_DU2], 0>,
                                    InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                    InstrStage<1, [P7_FX1, P7_FX2]>,
                                    InstrStage<1, [P7_FX1, P7_FX2]>],
                                   [2, 1, 1, 1]>,
-  InstrItinData<IIC_LdStSTDUX   , [InstrStage<1, [P7_DU1], 0>,
+  InstrItinData<IIC_LdStSTUX    , [InstrStage<1, [P7_DU1], 0>,
                                    InstrStage<1, [P7_DU2], 0>,
                                    InstrStage<1, [P7_DU3], 0>,
                                    InstrStage<1, [P7_DU4], 0>,
diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td
index a3fa5ff2f3de866121b24b79439d1be71c064d2e..ff39dfda7016d3e8a6d977f4d88d111a919c8f47 100644
--- a/lib/Target/PowerPC/PPCScheduleP8.td
+++ b/lib/Target/PowerPC/PPCScheduleP8.td
@@ -267,14 +267,14 @@ def P8Itineraries : ProcessorItineraries<
                                    InstrStage<1, [P8_LU1, P8_LU2,
                                                   P8_LSU1, P8_LSU2]>]
                                   [1, 1, 1]>,
-  InstrItinData<IIC_LdStSTDU    , [InstrStage<1, [P8_DU1], 0>,
+  InstrItinData<IIC_LdStSTU     , [InstrStage<1, [P8_DU1], 0>,
                                    InstrStage<1, [P8_DU2], 0>,
                                    InstrStage<1, [P8_LU1, P8_LU2,
                                                   P8_LSU1, P8_LSU2], 0>,
                                    InstrStage<1, [P8_FXU1, P8_FXU2]>],
                                   [2, 1, 1, 1]>,
   // First+last
-  InstrItinData<IIC_LdStSTDUX   , [InstrStage<1, [P8_DU1], 0>,
+  InstrItinData<IIC_LdStSTUX    , [InstrStage<1, [P8_DU1], 0>,
                                    InstrStage<1, [P8_DU2], 0>,
                                    InstrStage<1, [P8_DU3], 0>,
                                    InstrStage<1, [P8_DU4], 0>,
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 34410393ef6f7125fde6d4e4809546c4dd14a90f..580d057602f5f711e2491a24ad32b4df7f07279d 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -214,19 +214,24 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
   if (TT.isOSDarwin())
     return Reloc::DynamicNoPIC;
 
-  // Non-darwin 64-bit platforms are PIC by default.
-  if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)
+  // Big Endian PPC is PIC by default.
+  if (TT.getArch() == Triple::ppc64)
     return Reloc::PIC_;
 
-  // 32-bit is static by default.
+  // Rest are static by default.
   return Reloc::Static;
 }
 
-static CodeModel::Model getEffectiveCodeModel(const Triple &TT,
-                                              Optional<CodeModel::Model> CM,
-                                              bool JIT) {
-  if (CM)
+static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT,
+                                                 Optional<CodeModel::Model> CM,
+                                                 bool JIT) {
+  if (CM) {
+    if (*CM == CodeModel::Tiny)
+      report_fatal_error("Target does not support the tiny CodeModel");
+    if (*CM == CodeModel::Kernel)
+      report_fatal_error("Target does not support the kernel CodeModel");
     return *CM;
+  }
   if (!TT.isOSDarwin() && !JIT &&
       (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
     return CodeModel::Medium;
@@ -246,7 +251,7 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
                         computeFSAdditions(FS, OL, TT), Options,
                         getEffectiveRelocModel(TT, RM),
-                        getEffectiveCodeModel(TT, CM, JIT), OL),
+                        getEffectivePPCCodeModel(TT, CM, JIT), OL),
       TLOF(createTLOF(getTargetTriple())),
       TargetABI(computeTargetABI(TT, Options)) {
   initAsmInfo();
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index b4bf635dc2c75c4d6a77969b44f25dd51ecf7cec..7f5beca90455ac0508e11b551d29341356868cb8 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -663,3 +663,4 @@ and force instruction pairs to be scheduled together.
 More general handling of any_extend and zero_extend:
 
 See https://reviews.llvm.org/D24924#555306
+
diff --git a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index b1f1eb404010ed713fb87853c0ef70eae4b610f4..4e70ea402d8c8db1e53c015b5920a881c2859f7c 100644
--- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -7,12 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/RISCVAsmBackend.h"
 #include "MCTargetDesc/RISCVMCExpr.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "MCTargetDesc/RISCVTargetStreamer.h"
 #include "Utils/RISCVBaseInfo.h"
+#include "Utils/RISCVMatInt.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
@@ -39,6 +43,8 @@ namespace {
 struct RISCVOperand;
 
 class RISCVAsmParser : public MCTargetAsmParser {
+  SmallVector<FeatureBitset, 4> FeatureBitStack;
+
   SMLoc getLoc() const { return getParser().getTok().getLoc(); }
   bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); }
 
@@ -115,6 +121,20 @@ class RISCVAsmParser : public MCTargetAsmParser {
     }
   }
 
+  void pushFeatureBits() {
+    FeatureBitStack.push_back(getSTI().getFeatureBits());
+  }
+
+  bool popFeatureBits() {
+    if (FeatureBitStack.empty())
+      return true;
+
+    FeatureBitset FeatureBits = FeatureBitStack.pop_back_val();
+    copySTI().setFeatureBits(FeatureBits);
+    setAvailableFeatures(ComputeAvailableFeatures(FeatureBits));
+
+    return false;
+  }
 public:
   enum RISCVMatchResultTy {
     Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
@@ -1164,6 +1184,21 @@ bool RISCVAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
 bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
                                       StringRef Name, SMLoc NameLoc,
                                       OperandVector &Operands) {
+  // Ensure that if the instruction occurs when relaxation is enabled,
+  // relocations are forced for the file. Ideally this would be done when there
+  // is enough information to reliably determine if the instruction itself may
+  // cause relaxations. Unfortunately instruction processing stage occurs in the
+  // same pass as relocation emission, so it's too late to set a 'sticky bit'
+  // for the entire file.
+  if (getSTI().getFeatureBits()[RISCV::FeatureRelax]) {
+    auto *Assembler = getTargetStreamer().getStreamer().getAssemblerPtr();
+    if (Assembler != nullptr) {
+      RISCVAsmBackend &MAB =
+          static_cast<RISCVAsmBackend &>(Assembler->getBackend());
+      MAB.setForceRelocs();
+    }
+  }
+
   // First operand is token for instruction
   Operands.push_back(RISCVOperand::createToken(Name, NameLoc, isRV64()));
 
@@ -1267,6 +1302,33 @@ bool RISCVAsmParser::parseDirectiveOption() {
 
   StringRef Option = Tok.getIdentifier();
 
+  if (Option == "push") {
+    getTargetStreamer().emitDirectiveOptionPush();
+
+    Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+      return Error(Parser.getTok().getLoc(),
+                   "unexpected token, expected end of statement");
+
+    pushFeatureBits();
+    return false;
+  }
+
+  if (Option == "pop") {
+    SMLoc StartLoc = Parser.getTok().getLoc();
+    getTargetStreamer().emitDirectiveOptionPop();
+
+    Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+      return Error(Parser.getTok().getLoc(),
+                   "unexpected token, expected end of statement");
+
+    if (popFeatureBits())
+      return Error(StartLoc, ".option pop with no .option push");
+
+    return false;
+  }
+
   if (Option == "rvc") {
     getTargetStreamer().emitDirectiveOptionRVC();
 
@@ -1291,9 +1353,34 @@ bool RISCVAsmParser::parseDirectiveOption() {
     return false;
   }
 
+  if (Option == "relax") {
+    getTargetStreamer().emitDirectiveOptionRelax();
+
+    Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+      return Error(Parser.getTok().getLoc(),
+                   "unexpected token, expected end of statement");
+
+    setFeatureBits(RISCV::FeatureRelax, "relax");
+    return false;
+  }
+
+  if (Option == "norelax") {
+    getTargetStreamer().emitDirectiveOptionNoRelax();
+
+    Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+      return Error(Parser.getTok().getLoc(),
+                   "unexpected token, expected end of statement");
+
+    clearFeatureBits(RISCV::FeatureRelax, "relax");
+    return false;
+  }
+
   // Unknown option.
   Warning(Parser.getTok().getLoc(),
-          "unknown option, expected 'rvc' or 'norvc'");
+          "unknown option, expected 'push', 'pop', 'rvc', 'norvc', 'relax' or "
+          "'norelax'");
   Parser.eatToEndOfStatement();
   return false;
 }
@@ -1307,80 +1394,23 @@ void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
 
 void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value,
                                  MCStreamer &Out) {
-  if (isInt<32>(Value)) {
-    // Emits the MC instructions for loading a 32-bit constant into a register.
-    //
-    // Depending on the active bits in the immediate Value v, the following
-    // instruction sequences are emitted:
-    //
-    // v == 0                        : ADDI(W)
-    // v[0,12) != 0 && v[12,32) == 0 : ADDI(W)
-    // v[0,12) == 0 && v[12,32) != 0 : LUI
-    // v[0,32) != 0                  : LUI+ADDI(W)
-    //
-    int64_t Hi20 = ((Value + 0x800) >> 12) & 0xFFFFF;
-    int64_t Lo12 = SignExtend64<12>(Value);
-    unsigned SrcReg = RISCV::X0;
-
-    if (Hi20) {
-      emitToStreamer(Out,
-                     MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Hi20));
-      SrcReg = DestReg;
+  RISCVMatInt::InstSeq Seq;
+  RISCVMatInt::generateInstSeq(Value, isRV64(), Seq);
+
+  unsigned SrcReg = RISCV::X0;
+  for (RISCVMatInt::Inst &Inst : Seq) {
+    if (Inst.Opc == RISCV::LUI) {
+      emitToStreamer(
+          Out, MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Inst.Imm));
+    } else {
+      emitToStreamer(
+          Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm(
+                   Inst.Imm));
     }
 
-    if (Lo12 || Hi20 == 0) {
-      unsigned AddiOpcode =
-          STI->hasFeature(RISCV::Feature64Bit) ? RISCV::ADDIW : RISCV::ADDI;
-      emitToStreamer(Out, MCInstBuilder(AddiOpcode)
-                              .addReg(DestReg)
-                              .addReg(SrcReg)
-                              .addImm(Lo12));
-    }
-    return;
-  }
-  assert(STI->hasFeature(RISCV::Feature64Bit) &&
-         "Target must be 64-bit to support a >32-bit constant");
-
-  // In the worst case, for a full 64-bit constant, a sequence of 8 instructions
-  // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emmitted. Note
-  // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits
-  // while the following ADDI instructions contribute up to 12 bits each.
-  //
-  // On the first glance, implementing this seems to be possible by simply
-  // emitting the most significant 32 bits (LUI+ADDIW) followed by as many left
-  // shift (SLLI) and immediate additions (ADDI) as needed. However, due to the
-  // fact that ADDI performs a sign extended addition, doing it like that would
-  // only be possible when at most 11 bits of the ADDI instructions are used.
-  // Using all 12 bits of the ADDI instructions, like done by GAS, actually
-  // requires that the constant is processed starting with the least significant
-  // bit.
-  //
-  // In the following, constants are processed from LSB to MSB but instruction
-  // emission is performed from MSB to LSB by recursively calling
-  // emitLoadImm. In each recursion, first the lowest 12 bits are removed
-  // from the constant and the optimal shift amount, which can be greater than
-  // 12 bits if the constant is sparse, is determined. Then, the shifted
-  // remaining constant is processed recursively and gets emitted as soon as it
-  // fits into 32 bits. The emission of the shifts and additions is subsequently
-  // performed when the recursion returns.
-  //
-  int64_t Lo12 = SignExtend64<12>(Value);
-  int64_t Hi52 = (Value + 0x800) >> 12;
-  int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52);
-  Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount);
-
-  emitLoadImm(DestReg, Hi52, Out);
-
-  emitToStreamer(Out, MCInstBuilder(RISCV::SLLI)
-                          .addReg(DestReg)
-                          .addReg(DestReg)
-                          .addImm(ShiftAmount));
-
-  if (Lo12)
-    emitToStreamer(Out, MCInstBuilder(RISCV::ADDI)
-                            .addReg(DestReg)
-                            .addReg(DestReg)
-                            .addImm(Lo12));
+    // Only the first instruction has X0 as its source.
+    SrcReg = DestReg;
+  }
 }
 
 void RISCVAsmParser::emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc,
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 9ba7ebd0eb0f2a3938fc717653b8d4924209bc17..49239ac9099ee2d8c9c060b92e0fe1c8ae1055f4 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -7,115 +7,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/RISCVFixupKinds.h"
-#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "RISCVAsmBackend.h"
 #include "llvm/ADT/APInt.h"
-#include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
-namespace {
-class RISCVAsmBackend : public MCAsmBackend {
-  const MCSubtargetInfo &STI;
-  uint8_t OSABI;
-  bool Is64Bit;
-
-public:
-  RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit)
-      : MCAsmBackend(support::little), STI(STI), OSABI(OSABI),
-        Is64Bit(Is64Bit) {}
-  ~RISCVAsmBackend() override {}
-
-  // Generate diff expression relocations if the relax feature is enabled,
-  // otherwise it is safe for the assembler to calculate these internally.
-  bool requiresDiffExpressionRelocations() const override {
-    return STI.getFeatureBits()[RISCV::FeatureRelax];
-  }
-  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
-                  const MCValue &Target, MutableArrayRef<char> Data,
-                  uint64_t Value, bool IsResolved,
-                  const MCSubtargetInfo *STI) const override;
-
-  std::unique_ptr<MCObjectTargetWriter>
-  createObjectTargetWriter() const override;
-
-  // If linker relaxation is enabled, always emit relocations even if the fixup
-  // can be resolved. This is necessary for correctness as offsets may change
-  // during relaxation.
-  bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
-                             const MCValue &Target) override {
-    return STI.getFeatureBits()[RISCV::FeatureRelax];
-  }
-
-  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
-                            const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const override {
-    llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
-  }
-
-  bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
-                                    uint64_t Value,
-                                    const MCRelaxableFragment *DF,
-                                    const MCAsmLayout &Layout,
-                                    const bool WasForced) const override;
-
-  unsigned getNumFixupKinds() const override {
-    return RISCV::NumTargetFixupKinds;
-  }
-
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
-    const static MCFixupKindInfo Infos[] = {
-      // This table *must* be in the order that the fixup_* kinds are defined in
-      // RISCVFixupKinds.h.
-      //
-      // name                      offset bits  flags
-      { "fixup_riscv_hi20",         12,     20,  0 },
-      { "fixup_riscv_lo12_i",       20,     12,  0 },
-      { "fixup_riscv_lo12_s",        0,     32,  0 },
-      { "fixup_riscv_pcrel_hi20",   12,     20,  MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_riscv_pcrel_lo12_i", 20,     12,  MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_riscv_pcrel_lo12_s",  0,     32,  MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_riscv_jal",          12,     20,  MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_riscv_branch",        0,     32,  MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_riscv_rvc_jump",      2,     11,  MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_riscv_rvc_branch",    0,     16,  MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_riscv_call",          0,     64,  MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_riscv_relax",         0,      0,  0 }
-    };
-    static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds,
-                  "Not all fixup kinds added to Infos array");
-
-    if (Kind < FirstTargetFixupKind)
-      return MCAsmBackend::getFixupKindInfo(Kind);
-
-    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
-           "Invalid kind!");
-    return Infos[Kind - FirstTargetFixupKind];
-  }
-
-  bool mayNeedRelaxation(const MCInst &Inst,
-                         const MCSubtargetInfo &STI) const override;
-  unsigned getRelaxedOpcode(unsigned Op) const;
-
-  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
-                        MCInst &Res) const override;
-
-
-  bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
-};
-
-
 bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
                                                    bool Resolved,
                                                    uint64_t Value,
@@ -348,8 +253,6 @@ RISCVAsmBackend::createObjectTargetWriter() const {
   return createRISCVELFObjectWriter(OSABI, Is64Bit);
 }
 
-} // end anonymous namespace
-
 MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T,
                                           const MCSubtargetInfo &STI,
                                           const MCRegisterInfo &MRI,
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
new file mode 100644
index 0000000000000000000000000000000000000000..5601f07f9267828c92f0d4e5a6651f840c4815ac
--- /dev/null
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -0,0 +1,118 @@
+//===-- RISCVAsmBackend.h - RISCV Assembler Backend -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVASMBACKEND_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVASMBACKEND_H
+
+#include "MCTargetDesc/RISCVFixupKinds.h"
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+class MCAssembler;
+class MCObjectTargetWriter;
+class raw_ostream;
+
+class RISCVAsmBackend : public MCAsmBackend {
+  const MCSubtargetInfo &STI;
+  uint8_t OSABI;
+  bool Is64Bit;
+  bool ForceRelocs = false;
+
+public:
+  RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit)
+      : MCAsmBackend(support::little), STI(STI), OSABI(OSABI),
+        Is64Bit(Is64Bit) {}
+  ~RISCVAsmBackend() override {}
+
+  void setForceRelocs() { ForceRelocs = true; }
+
+  // Generate diff expression relocations if the relax feature is enabled or had
+  // previously been enabled, otherwise it is safe for the assembler to
+  // calculate these internally.
+  bool requiresDiffExpressionRelocations() const override {
+    return STI.getFeatureBits()[RISCV::FeatureRelax] || ForceRelocs;
+  }
+  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+                  const MCValue &Target, MutableArrayRef<char> Data,
+                  uint64_t Value, bool IsResolved,
+                  const MCSubtargetInfo *STI) const override;
+
+  std::unique_ptr<MCObjectTargetWriter>
+  createObjectTargetWriter() const override;
+
+  // If linker relaxation is enabled, or the relax option had previously been
+  // enabled, always emit relocations even if the fixup can be resolved. This is
+  // necessary for correctness as offsets may change during relaxation.
+  bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+                             const MCValue &Target) override {
+    return STI.getFeatureBits()[RISCV::FeatureRelax] || ForceRelocs;
+  }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
+  }
+
+  bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+                                    uint64_t Value,
+                                    const MCRelaxableFragment *DF,
+                                    const MCAsmLayout &Layout,
+                                    const bool WasForced) const override;
+
+  unsigned getNumFixupKinds() const override {
+    return RISCV::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[] = {
+      // This table *must* be in the order that the fixup_* kinds are defined in
+      // RISCVFixupKinds.h.
+      //
+      // name                      offset bits  flags
+      { "fixup_riscv_hi20",         12,     20,  0 },
+      { "fixup_riscv_lo12_i",       20,     12,  0 },
+      { "fixup_riscv_lo12_s",        0,     32,  0 },
+      { "fixup_riscv_pcrel_hi20",   12,     20,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_riscv_pcrel_lo12_i", 20,     12,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_riscv_pcrel_lo12_s",  0,     32,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_riscv_jal",          12,     20,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_riscv_branch",        0,     32,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_riscv_rvc_jump",      2,     11,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_riscv_rvc_branch",    0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_riscv_call",          0,     64,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_riscv_relax",         0,      0,  0 }
+    };
+    static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds,
+                  "Not all fixup kinds added to Infos array");
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst,
+                         const MCSubtargetInfo &STI) const override;
+  unsigned getRelaxedOpcode(unsigned Op) const;
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override;
+
+
+  bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+};
+}
+
+#endif
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 6428b11cfe9c355c6bad1e575855ee01f651e981..a6ba1e41e9644cb01919091c2cb2c35a46a4f0ee 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -38,5 +38,9 @@ MCELFStreamer &RISCVTargetELFStreamer::getStreamer() {
   return static_cast<MCELFStreamer &>(Streamer);
 }
 
+void RISCVTargetELFStreamer::emitDirectiveOptionPush() {}
+void RISCVTargetELFStreamer::emitDirectiveOptionPop() {}
 void RISCVTargetELFStreamer::emitDirectiveOptionRVC() {}
 void RISCVTargetELFStreamer::emitDirectiveOptionNoRVC() {}
+void RISCVTargetELFStreamer::emitDirectiveOptionRelax() {}
+void RISCVTargetELFStreamer::emitDirectiveOptionNoRelax() {}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index daa7abfe1336f850ff9bdf6babe3ed3f34b69a19..1f36bbc4388242e06c3adb04666f20d79804c3b0 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -20,8 +20,12 @@ public:
   MCELFStreamer &getStreamer();
   RISCVTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
 
+  virtual void emitDirectiveOptionPush();
+  virtual void emitDirectiveOptionPop();
   virtual void emitDirectiveOptionRVC();
   virtual void emitDirectiveOptionNoRVC();
+  virtual void emitDirectiveOptionRelax();
+  virtual void emitDirectiveOptionNoRelax();
 };
 }
 #endif
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index 2d5205aa7ef74327714577f8c2692b67b288d89c..8d5ef3dbd17f99e502b07ef2e8324495136faaf1 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -23,6 +23,14 @@ RISCVTargetAsmStreamer::RISCVTargetAsmStreamer(MCStreamer &S,
                                                formatted_raw_ostream &OS)
     : RISCVTargetStreamer(S), OS(OS) {}
 
+void RISCVTargetAsmStreamer::emitDirectiveOptionPush() {
+  OS << "\t.option\tpush\n";
+}
+
+void RISCVTargetAsmStreamer::emitDirectiveOptionPop() {
+  OS << "\t.option\tpop\n";
+}
+
 void RISCVTargetAsmStreamer::emitDirectiveOptionRVC() {
   OS << "\t.option\trvc\n";
 }
@@ -30,3 +38,11 @@ void RISCVTargetAsmStreamer::emitDirectiveOptionRVC() {
 void RISCVTargetAsmStreamer::emitDirectiveOptionNoRVC() {
   OS << "\t.option\tnorvc\n";
 }
+
+void RISCVTargetAsmStreamer::emitDirectiveOptionRelax() {
+  OS << "\t.option\trelax\n";
+}
+
+void RISCVTargetAsmStreamer::emitDirectiveOptionNoRelax() {
+  OS << "\t.option\tnorelax\n";
+}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
index 525c20810f244c1880faa37e023ec94125284db8..74ec9e303933d1260fa6947a9e325e149d718f30 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -18,8 +18,12 @@ class RISCVTargetStreamer : public MCTargetStreamer {
 public:
   RISCVTargetStreamer(MCStreamer &S);
 
+  virtual void emitDirectiveOptionPush() = 0;
+  virtual void emitDirectiveOptionPop() = 0;
   virtual void emitDirectiveOptionRVC() = 0;
   virtual void emitDirectiveOptionNoRVC() = 0;
+  virtual void emitDirectiveOptionRelax() = 0;
+  virtual void emitDirectiveOptionNoRelax() = 0;
 };
 
 // This part is for ascii assembly output
@@ -29,8 +33,12 @@ class RISCVTargetAsmStreamer : public RISCVTargetStreamer {
 public:
   RISCVTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
 
+  void emitDirectiveOptionPush() override;
+  void emitDirectiveOptionPop() override;
   void emitDirectiveOptionRVC() override;
   void emitDirectiveOptionNoRVC() override;
+  void emitDirectiveOptionRelax() override;
+  void emitDirectiveOptionNoRelax() override;
 };
 
 }
diff --git a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 1c23680abc1dceb4d9632590ac4f999682407e37..35c185aa5edd3bae2e7dcc3b889be06d0b48bdb0 100644
--- a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -52,6 +52,9 @@ private:
                             MachineBasicBlock::iterator MBBI,
                             AtomicRMWInst::BinOp, bool IsMasked, int Width,
                             MachineBasicBlock::iterator &NextMBBI);
+  bool expandAtomicCmpXchg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, bool IsMasked,
+                           int Width, MachineBasicBlock::iterator &NextMBBI);
 };
 
 char RISCVExpandPseudo::ID = 0;
@@ -106,6 +109,10 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case RISCV::PseudoMaskedAtomicLoadUMin32:
     return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, true, 32,
                                 NextMBBI);
+  case RISCV::PseudoCmpXchg32:
+    return expandAtomicCmpXchg(MBB, MBBI, false, 32, NextMBBI);
+  case RISCV::PseudoMaskedCmpXchg32:
+    return expandAtomicCmpXchg(MBB, MBBI, true, 32, NextMBBI);
   }
 
   return false;
@@ -441,6 +448,103 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp(
   return true;
 }
 
+bool RISCVExpandPseudo::expandAtomicCmpXchg(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked,
+    int Width, MachineBasicBlock::iterator &NextMBBI) {
+  assert(Width == 32 && "RV64 atomic expansion currently unsupported");
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  // Insert new MBBs.
+  MF->insert(++MBB.getIterator(), LoopHeadMBB);
+  MF->insert(++LoopHeadMBB->getIterator(), LoopTailMBB);
+  MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
+
+  // Set up successors and transfer remaining instructions to DoneMBB.
+  LoopHeadMBB->addSuccessor(LoopTailMBB);
+  LoopHeadMBB->addSuccessor(DoneMBB);
+  LoopTailMBB->addSuccessor(DoneMBB);
+  LoopTailMBB->addSuccessor(LoopHeadMBB);
+  DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
+  DoneMBB->transferSuccessors(&MBB);
+  MBB.addSuccessor(LoopHeadMBB);
+
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned ScratchReg = MI.getOperand(1).getReg();
+  unsigned AddrReg = MI.getOperand(2).getReg();
+  unsigned CmpValReg = MI.getOperand(3).getReg();
+  unsigned NewValReg = MI.getOperand(4).getReg();
+  AtomicOrdering Ordering =
+      static_cast<AtomicOrdering>(MI.getOperand(IsMasked ? 6 : 5).getImm());
+
+  if (!IsMasked) {
+    // .loophead:
+    //   lr.w dest, (addr)
+    //   bne dest, cmpval, done
+    BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
+        .addReg(AddrReg);
+    BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
+        .addReg(DestReg)
+        .addReg(CmpValReg)
+        .addMBB(DoneMBB);
+    // .looptail:
+    //   sc.w scratch, newval, (addr)
+    //   bnez scratch, loophead
+    BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg)
+        .addReg(AddrReg)
+        .addReg(NewValReg);
+    BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
+        .addReg(ScratchReg)
+        .addReg(RISCV::X0)
+        .addMBB(LoopHeadMBB);
+  } else {
+    // .loophead:
+    //   lr.w dest, (addr)
+    //   and scratch, dest, mask
+    //   bne scratch, cmpval, done
+    unsigned MaskReg = MI.getOperand(5).getReg();
+    BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
+        .addReg(AddrReg);
+    BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg)
+        .addReg(DestReg)
+        .addReg(MaskReg);
+    BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
+        .addReg(ScratchReg)
+        .addReg(CmpValReg)
+        .addMBB(DoneMBB);
+
+    // .looptail:
+    //   xor scratch, dest, newval
+    //   and scratch, scratch, mask
+    //   xor scratch, dest, scratch
+    //   sc.w scratch, scratch, (adrr)
+    //   bnez scratch, loophead
+    insertMaskedMerge(TII, DL, LoopTailMBB, ScratchReg, DestReg, NewValReg,
+                      MaskReg, ScratchReg);
+    BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg)
+        .addReg(AddrReg)
+        .addReg(ScratchReg);
+    BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
+        .addReg(ScratchReg)
+        .addReg(RISCV::X0)
+        .addMBB(LoopHeadMBB);
+  }
+
+  NextMBBI = MBB.end();
+  MI.eraseFromParent();
+
+  LivePhysRegs LiveRegs;
+  computeAndAddLiveIns(LiveRegs, *LoopHeadMBB);
+  computeAndAddLiveIns(LiveRegs, *LoopTailMBB);
+  computeAndAddLiveIns(LiveRegs, *DoneMBB);
+
+  return true;
+}
+
 } // end of anonymous namespace
 
 INITIALIZE_PASS(RISCVExpandPseudo, "riscv-expand-pseudo",
diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index e355b208a75dde556956079e051cfb6c207037d5..aa80365feb830f9e23a6f35d1cd784e1b6deab96 100644
--- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -11,9 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "RISCV.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "RISCV.h"
 #include "RISCVTargetMachine.h"
+#include "Utils/RISCVMatInt.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Support/Debug.h"
@@ -63,6 +64,38 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() {
   doPeepholeLoadStoreADDI();
 }
 
+static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
+                         MVT XLenVT) {
+  RISCVMatInt::InstSeq Seq;
+  RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq);
+
+  SDNode *Result;
+  SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT);
+  for (RISCVMatInt::Inst &Inst : Seq) {
+    SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT);
+    if (Inst.Opc == RISCV::LUI)
+      Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm);
+    else
+      Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm);
+
+    // Only the first instruction has X0 as its source.
+    SrcReg = SDValue(Result, 0);
+  }
+
+  return Result;
+}
+
+// Returns true if the Node is an ISD::AND with a constant argument. If so,
+// set Mask to that constant value.
+static bool isConstantMask(SDNode *Node, uint64_t &Mask) {
+  if (Node->getOpcode() == ISD::AND &&
+      Node->getOperand(1).getOpcode() == ISD::Constant) {
+    Mask = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
 void RISCVDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we have already selected.
   if (Node->isMachineOpcode()) {
@@ -87,6 +120,11 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       ReplaceNode(Node, New.getNode());
       return;
     }
+    int64_t Imm = ConstNode->getSExtValue();
+    if (XLenVT == MVT::i64) {
+      ReplaceNode(Node, selectImm(CurDAG, SDLoc(Node), Imm, XLenVT));
+      return;
+    }
     break;
   }
   case ISD::FrameIndex: {
@@ -96,6 +134,29 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
     return;
   }
+  case ISD::SRL: {
+    if (!Subtarget->is64Bit())
+      break;
+    SDValue Op0 = Node->getOperand(0);
+    SDValue Op1 = Node->getOperand(1);
+    uint64_t Mask;
+    // Match (srl (and val, mask), imm) where the result would be a
+    // zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result
+    // is equivalent to this (SimplifyDemandedBits may have removed lower bits
+    // from the mask that aren't necessary due to the right-shifting).
+    if (Op1.getOpcode() == ISD::Constant &&
+        isConstantMask(Op0.getNode(), Mask)) {
+      uint64_t ShAmt = cast<ConstantSDNode>(Op1.getNode())->getZExtValue();
+
+      if ((Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff) {
+        SDValue ShAmtVal =
+            CurDAG->getTargetConstant(ShAmt, SDLoc(Node), XLenVT);
+        CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0.getOperand(0),
+                             ShAmtVal);
+        return;
+      }
+    }
+  }
   }
 
   // Select the default instruction.
diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp
index 85758c0cdf8cf4f050ad31edb311131fde361a9e..e78085edac38e39073636f627adf8709056bb54d 100644
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -111,10 +111,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE,
       ISD::SETGT,  ISD::SETGE,  ISD::SETNE};
 
-  // TODO: add proper support for the various FMA variants
-  // (FMADD.S, FMSUB.S, FNMSUB.S, FNMADD.S).
   ISD::NodeType FPOpToExtend[] = {
-      ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FMA};
+      ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM};
 
   if (Subtarget.hasStdExtF()) {
     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
@@ -186,6 +184,7 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::riscv_masked_atomicrmw_min_i32:
   case Intrinsic::riscv_masked_atomicrmw_umax_i32:
   case Intrinsic::riscv_masked_atomicrmw_umin_i32:
+  case Intrinsic::riscv_masked_cmpxchg_i32:
     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(PtrTy->getElementType());
@@ -266,6 +265,10 @@ bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return TargetLowering::isZExtFree(Val, VT2);
 }
 
+bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
+  return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
+}
+
 // Changes the condition code and swaps operands if necessary, so the SetCC
 // operation matches one of the comparisons supported directly in the RISC-V
 // ISA.
@@ -1708,3 +1711,23 @@ Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
 
   return Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering});
 }
+
+TargetLowering::AtomicExpansionKind
+RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR(
+    AtomicCmpXchgInst *CI) const {
+  unsigned Size = CI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
+  if (Size == 8 || Size == 16)
+    return AtomicExpansionKind::MaskedIntrinsic;
+  return AtomicExpansionKind::None;
+}
+
+Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
+    IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
+    Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
+  Value *Ordering = Builder.getInt32(static_cast<uint32_t>(Ord));
+  Type *Tys[] = {AlignedAddr->getType()};
+  Function *MaskedCmpXchg = Intrinsic::getDeclaration(
+      CI->getModule(), Intrinsic::riscv_masked_cmpxchg_i32, Tys);
+  return Builder.CreateCall(MaskedCmpXchg,
+                            {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
+}
diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h
index fe988bb14deaf27dd809e186f0675a2ac9ed0c42..6970900bb0622af57b46eb655ada4c6c4051cc07 100644
--- a/lib/Target/RISCV/RISCVISelLowering.h
+++ b/lib/Target/RISCV/RISCVISelLowering.h
@@ -54,6 +54,7 @@ public:
   bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
   bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;
+  bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
 
   // Provide custom lowering hooks for some operations.
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -126,6 +127,13 @@ private:
   virtual Value *emitMaskedAtomicRMWIntrinsic(
       IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
       Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const override;
+  TargetLowering::AtomicExpansionKind
+  shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override;
+  virtual Value *
+  emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder, AtomicCmpXchgInst *CI,
+                                   Value *AlignedAddr, Value *CmpVal,
+                                   Value *NewVal, Value *Mask,
+                                   AtomicOrdering Ord) const override;
 };
 }
 
diff --git a/lib/Target/RISCV/RISCVInstrFormats.td b/lib/Target/RISCV/RISCVInstrFormats.td
index 529e048045c6f10c1ebc1ce4e2acddd62496100a..ebd676a6056ef366917c7e8a01548d07d498f20f 100644
--- a/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/lib/Target/RISCV/RISCVInstrFormats.td
@@ -45,11 +45,12 @@ def InstFormatCSS    : InstFormat<10>;
 def InstFormatCIW    : InstFormat<11>;
 def InstFormatCL     : InstFormat<12>;
 def InstFormatCS     : InstFormat<13>;
-def InstFormatCB     : InstFormat<14>;
-def InstFormatCJ     : InstFormat<15>;
-def InstFormatOther  : InstFormat<16>;
+def InstFormatCA     : InstFormat<14>;
+def InstFormatCB     : InstFormat<15>;
+def InstFormatCJ     : InstFormat<16>;
+def InstFormatOther  : InstFormat<17>;
 
-// The following opcode names and match those given in Table 19.1 in the
+// The following opcode names match those given in Table 19.1 in the
 // RISC-V User-level ISA specification ("RISC-V base opcode map").
 class RISCVOpcode<bits<7> val> {
   bits<7> Value = val;
diff --git a/lib/Target/RISCV/RISCVInstrFormatsC.td b/lib/Target/RISCV/RISCVInstrFormatsC.td
index 6abcbd7cc8a123fa9f466a03125726c332ebd910..bda8bbb558eb6ce32c01605ea16e0a3504e18c3b 100644
--- a/lib/Target/RISCV/RISCVInstrFormatsC.td
+++ b/lib/Target/RISCV/RISCVInstrFormatsC.td
@@ -118,6 +118,19 @@ class RVInst16CS<bits<3> funct3, bits<2> opcode, dag outs, dag ins,
   let Inst{1-0} = opcode;
 }
 
+class RVInst16CA<bits<6> funct6, bits<2> funct2, bits<2> opcode, dag outs,
+                 dag ins, string opcodestr, string argstr>
+    : RVInst16<outs, ins, opcodestr, argstr, [], InstFormatCA> {
+  bits<3> rs2;
+  bits<3> rs1;
+
+  let Inst{15-10} = funct6;
+  let Inst{9-7} = rs1;
+  let Inst{6-5} = funct2;
+  let Inst{4-2} = rs2;
+  let Inst{1-0} = opcode;
+}
+
 class RVInst16CB<bits<3> funct3, bits<2> opcode, dag outs, dag ins,
                  string opcodestr, string argstr>
     : RVInst16<outs, ins, opcodestr, argstr, [], InstFormatCB> {
diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index 631a1f7deca04010c69bdb7f14cb2dc05d9a9dda..60c0f0b96a62b4140ff9e98c3ead532116ffe42e 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -400,6 +400,15 @@ def EBREAK : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ebreak", ""> {
   let rd = 0;
   let imm12 = 1;
 }
+
+// This is a de facto standard (as set by GNU binutils) 32-bit unimplemented
+// instruction (i.e., it should always trap, if your implementation has invalid
+// instruction traps).
+def UNIMP : RVInstI<0b001, OPC_SYSTEM, (outs), (ins), "unimp", ""> {
+  let rs1 = 0;
+  let rd = 0;
+  let imm12 = 0b110000000000;
+}
 } // hasSideEffects = 1, mayLoad = 0, mayStore = 0
 
 def CSRRW : CSR_ir<0b001, "csrrw">;
@@ -568,6 +577,16 @@ def : InstAlias<"csrwi $csr, $imm", (CSRRWI X0, csr_sysreg:$csr, uimm5:$imm)>;
 def : InstAlias<"csrsi $csr, $imm", (CSRRSI X0, csr_sysreg:$csr, uimm5:$imm)>;
 def : InstAlias<"csrci $csr, $imm", (CSRRCI X0, csr_sysreg:$csr, uimm5:$imm)>;
 
+let EmitPriority = 0 in {
+def : InstAlias<"csrw $csr, $imm", (CSRRWI X0, csr_sysreg:$csr, uimm5:$imm)>;
+def : InstAlias<"csrs $csr, $imm", (CSRRSI X0, csr_sysreg:$csr, uimm5:$imm)>;
+def : InstAlias<"csrc $csr, $imm", (CSRRCI X0, csr_sysreg:$csr, uimm5:$imm)>;
+
+def : InstAlias<"csrrw $rd, $csr, $imm", (CSRRWI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>;
+def : InstAlias<"csrrs $rd, $csr, $imm", (CSRRSI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>;
+def : InstAlias<"csrrc $rd, $csr, $imm", (CSRRCI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>;
+}
+
 def : InstAlias<"sfence.vma",     (SFENCE_VMA      X0, X0)>;
 def : InstAlias<"sfence.vma $rs", (SFENCE_VMA GPR:$rs, X0)>;
 
@@ -685,7 +704,9 @@ def : PatGprSimm12<setult, SLTIU>;
 
 // Define pattern expansions for setcc operations that aren't directly
 // handled by a RISC-V instruction.
+def : Pat<(seteq GPR:$rs1, 0), (SLTIU GPR:$rs1, 1)>;
 def : Pat<(seteq GPR:$rs1, GPR:$rs2), (SLTIU (XOR GPR:$rs1, GPR:$rs2), 1)>;
+def : Pat<(setne GPR:$rs1, 0), (SLTU X0, GPR:$rs1)>;
 def : Pat<(setne GPR:$rs1, GPR:$rs2), (SLTU X0, (XOR GPR:$rs1, GPR:$rs2))>;
 def : Pat<(setugt GPR:$rs1, GPR:$rs2), (SLTU GPR:$rs2, GPR:$rs1)>;
 def : Pat<(setuge GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs1, GPR:$rs2), 1)>;
@@ -813,7 +834,7 @@ defm : LdPat<sextloadi8, LB>;
 defm : LdPat<extloadi8, LB>;
 defm : LdPat<sextloadi16, LH>;
 defm : LdPat<extloadi16, LH>;
-defm : LdPat<load, LW>;
+defm : LdPat<load, LW>, Requires<[IsRV32]>;
 defm : LdPat<zextloadi8, LBU>;
 defm : LdPat<zextloadi16, LHU>;
 
@@ -864,6 +885,45 @@ def ADJCALLSTACKUP   : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                               [(CallSeqEnd timm:$amt1, timm:$amt2)]>;
 } // Defs = [X2], Uses = [X2]
 
+/// RV64 patterns
+
+let Predicates = [IsRV64] in {
+
+/// sext and zext
+
+def : Pat<(sext_inreg GPR:$rs1, i32), (ADDIW GPR:$rs1, 0)>;
+def : Pat<(and GPR:$rs1, 0xffffffff), (SRLI (SLLI GPR:$rs1, 32), 32)>;
+
+/// ALU operations
+
+def : Pat<(sext_inreg (add GPR:$rs1, GPR:$rs2), i32),
+          (ADDW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(sext_inreg (add GPR:$rs1, simm12:$imm12), i32),
+          (ADDIW GPR:$rs1, simm12:$imm12)>;
+def : Pat<(sext_inreg (sub GPR:$rs1, GPR:$rs2), i32),
+          (SUBW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(sext_inreg (shl GPR:$rs1, uimm5:$shamt), i32),
+          (SLLIW GPR:$rs1, uimm5:$shamt)>;
+// (srl (zexti32 ...), uimm5:$shamt) is matched with custom code due to the
+// need to undo manipulation of the mask value performed by DAGCombine.
+def : Pat<(sra (sext_inreg GPR:$rs1, i32), uimm5:$shamt),
+          (SRAIW GPR:$rs1, uimm5:$shamt)>;
+
+// TODO: patterns for SLLW/SRLW/SRAW.
+
+/// Loads
+
+defm : LdPat<sextloadi32, LW>;
+defm : LdPat<extloadi32, LW>;
+defm : LdPat<zextloadi32, LWU>;
+defm : LdPat<load, LD>;
+
+/// Stores
+
+defm : StPat<truncstorei32, SW, GPR>;
+defm : StPat<store, SD, GPR>;
+} // Predicates = [IsRV64]
+
 //===----------------------------------------------------------------------===//
 // Standard extensions
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/RISCV/RISCVInstrInfoA.td b/lib/Target/RISCV/RISCVInstrInfoA.td
index 180434e9ff1363ea7eef15d01f7e4dd4ea28d74b..9cb1d2f0b627d54987c34663aa73473e23549094 100644
--- a/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -153,7 +153,7 @@ class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch),
 }
 
 def PseudoAtomicLoadNand32 : PseudoAMO;
-// Ordering constants must be kept in sync with the AtomicOrdering enum in 
+// Ordering constants must be kept in sync with the AtomicOrdering enum in
 // AtomicOrdering.h.
 def : Pat<(atomic_load_nand_32_monotonic GPR:$addr, GPR:$incr),
           (PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 2)>;
@@ -230,4 +230,49 @@ def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax_i32,
 def PseudoMaskedAtomicLoadUMin32 : PseudoMaskedAMOUMinUMax;
 def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin_i32,
                          PseudoMaskedAtomicLoadUMin32>;
+
+/// Compare and exchange
+
+class PseudoCmpXchg
+    : Pseudo<(outs GPR:$res, GPR:$scratch),
+             (ins GPR:$addr, GPR:$cmpval, GPR:$newval, i32imm:$ordering), []> {
+  let Constraints = "@earlyclobber $res,@earlyclobber $scratch";
+  let mayLoad = 1;
+  let mayStore = 1;
+  let hasSideEffects = 0;
+}
+
+// Ordering constants must be kept in sync with the AtomicOrdering enum in
+// AtomicOrdering.h.
+multiclass PseudoCmpXchgPat<string Op, Pseudo CmpXchgInst> {
+  def : Pat<(!cast<PatFrag>(Op#"_monotonic") GPR:$addr, GPR:$cmp, GPR:$new),
+            (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 2)>;
+  def : Pat<(!cast<PatFrag>(Op#"_acquire") GPR:$addr, GPR:$cmp, GPR:$new),
+            (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 4)>;
+  def : Pat<(!cast<PatFrag>(Op#"_release") GPR:$addr, GPR:$cmp, GPR:$new),
+            (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 5)>;
+  def : Pat<(!cast<PatFrag>(Op#"_acq_rel") GPR:$addr, GPR:$cmp, GPR:$new),
+            (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 6)>;
+  def : Pat<(!cast<PatFrag>(Op#"_seq_cst") GPR:$addr, GPR:$cmp, GPR:$new),
+            (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>;
+}
+
+def PseudoCmpXchg32 : PseudoCmpXchg;
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>;
+
+def PseudoMaskedCmpXchg32
+    : Pseudo<(outs GPR:$res, GPR:$scratch),
+             (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask,
+              i32imm:$ordering), []> {
+  let Constraints = "@earlyclobber $res,@earlyclobber $scratch";
+  let mayLoad = 1;
+  let mayStore = 1;
+  let hasSideEffects = 0;
+}
+
+def : Pat<(int_riscv_masked_cmpxchg_i32
+            GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering),
+          (PseudoMaskedCmpXchg32
+            GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>;
+
 } // Predicates = [HasStdExtA]
diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td b/lib/Target/RISCV/RISCVInstrInfoC.td
index 21e6cae5ef94f62c599323c0fd1de816acb29bc8..ad68b5a7dc976314e27aefdd445cb90917e6b2ed 100644
--- a/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -258,16 +258,13 @@ class Shift_right<bits<2> funct2, string OpcodeStr, RegisterClass cls,
 }
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class CS_ALU<bits<2> funct2, string OpcodeStr, RegisterClass cls,
-             bit RV64only>
-    : RVInst16CS<0b100, 0b01, (outs cls:$rd_wb), (ins cls:$rd, cls:$rs2),
+class CS_ALU<bits<6> funct6, bits<2> funct2, string OpcodeStr,
+             RegisterClass cls>
+    : RVInst16CA<funct6, funct2, 0b01, (outs cls:$rd_wb), (ins cls:$rd, cls:$rs2),
                  OpcodeStr, "$rd, $rs2"> {
   bits<3> rd;
   let Constraints = "$rd = $rd_wb";
-  let Inst{12} = RV64only;
-  let Inst{11-10} = 0b11;
   let Inst{9-7} = rd;
-  let Inst{6-5} = funct2;
 }
 
 //===----------------------------------------------------------------------===//
@@ -411,14 +408,14 @@ def C_ANDI : RVInst16CB<0b100, 0b01, (outs GPRC:$rs1_wb), (ins GPRC:$rs1, simm6:
   let Inst{6-2} = imm{4-0};
 }
 
-def C_SUB  : CS_ALU<0b00, "c.sub", GPRC, 0>;
-def C_XOR  : CS_ALU<0b01, "c.xor", GPRC, 0>;
-def C_OR   : CS_ALU<0b10, "c.or" , GPRC, 0>;
-def C_AND  : CS_ALU<0b11, "c.and", GPRC, 0>;
+def C_SUB  : CS_ALU<0b100011, 0b00, "c.sub", GPRC>;
+def C_XOR  : CS_ALU<0b100011, 0b01, "c.xor", GPRC>;
+def C_OR   : CS_ALU<0b100011, 0b10, "c.or" , GPRC>;
+def C_AND  : CS_ALU<0b100011, 0b11, "c.and", GPRC>;
 
 let Predicates = [HasStdExtC, IsRV64] in {
-def C_SUBW : CS_ALU<0b00, "c.subw", GPRC, 1>;
-def C_ADDW : CS_ALU<0b01, "c.addw", GPRC, 1>;
+def C_SUBW : CS_ALU<0b100111, 0b00, "c.subw", GPRC>;
+def C_ADDW : CS_ALU<0b100111, 0b01, "c.addw", GPRC>;
 }
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
@@ -478,7 +475,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
 def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2),
                       "c.mv", "$rs1, $rs2">;
 
-let rs1 = 0, rs2 = 0, hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+let rs1 = 0, rs2 = 0, hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
 def C_EBREAK : RVInst16CR<0b1001, 0b10, (outs), (ins), "c.ebreak", "">;
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
@@ -517,6 +514,13 @@ def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000> {
   let Inst{9-7}   = imm{8-6};
 }
 
+// The all zeros pattern isn't a valid RISC-V instruction. It's used by GNU
+// binutils as 16-bit instruction known to be unimplemented (i.e., trapping).
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
+def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther> {
+  let Inst{15-0} = 0;
+}
+
 } // Predicates = [HasStdExtC]
 
 //===----------------------------------------------------------------------===//
@@ -680,6 +684,7 @@ def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, X0),
 def : CompressPat<(ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, 0),
                   (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
 def : CompressPat<(EBREAK), (C_EBREAK)>;
+def : CompressPat<(UNIMP), (C_UNIMP)>;
 def : CompressPat<(JALR X1, GPRNoX0:$rs1, 0),
                   (C_JALR GPRNoX0:$rs1)>;
 def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs1, GPRNoX0:$rs2),
diff --git a/lib/Target/RISCV/RISCVInstrInfoD.td b/lib/Target/RISCV/RISCVInstrInfoD.td
index 73ceb2b0660e1b7793461637626b07b28f1ab389..9f1cd50de595d4592b529d6750ed6a61889d586f 100644
--- a/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -230,6 +230,22 @@ def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>;
 def : PatFpr64Fpr64<fcopysign, FSGNJ_D>;
 def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>;
 
+// fmadd: rs1 * rs2 + rs3
+def : Pat<(fma FPR64:$rs1, FPR64:$rs2, FPR64:$rs3),
+          (FMADD_D $rs1, $rs2, $rs3, 0b111)>;
+
+// fmsub: rs1 * rs2 - rs3
+def : Pat<(fma FPR64:$rs1, FPR64:$rs2, (fneg FPR64:$rs3)),
+          (FMSUB_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>;
+
+// fnmsub: -rs1 * rs2 + rs3
+def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, FPR64:$rs3),
+          (FNMSUB_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>;
+
+// fnmadd: -rs1 * rs2 - rs3
+def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, (fneg FPR64:$rs3)),
+          (FNMADD_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>;
+
 // The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the
 // canonical NaN when giving a signaling NaN. This doesn't match the LLVM
 // behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the
diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td
index a31b93b11ae8b1631862336dc35bfb9fa2ea9815..03bdac45873d0b90adf10d5fe084de97f6463f72 100644
--- a/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -270,6 +270,22 @@ def : Pat<(fabs FPR32:$rs1), (FSGNJX_S $rs1, $rs1)>;
 def : PatFpr32Fpr32<fcopysign, FSGNJ_S>;
 def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>;
 
+// fmadd: rs1 * rs2 + rs3
+def : Pat<(fma FPR32:$rs1, FPR32:$rs2, FPR32:$rs3),
+          (FMADD_S $rs1, $rs2, $rs3, 0b111)>;
+
+// fmsub: rs1 * rs2 - rs3
+def : Pat<(fma FPR32:$rs1, FPR32:$rs2, (fneg FPR32:$rs3)),
+          (FMSUB_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>;
+
+// fnmsub: -rs1 * rs2 + rs3
+def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, FPR32:$rs3),
+          (FNMSUB_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>;
+
+// fnmadd: -rs1 * rs2 - rs3
+def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, (fneg FPR32:$rs3)),
+          (FNMADD_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>;
+
 // The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the
 // canonical NaN when given a signaling NaN. This doesn't match the LLVM
 // behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the
diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp
index e75da768ee2e9ceb4c766e6f4d1bba8eafd2db92..8937ec200bd74cf8b59aed8ac1327eeddfede59d 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -47,12 +47,6 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
-  if (CM)
-    return *CM;
-  return CodeModel::Small;
-}
-
 RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
                                        StringRef CPU, StringRef FS,
                                        const TargetOptions &Options,
@@ -61,7 +55,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
                                        CodeGenOpt::Level OL, bool JIT)
     : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
                         getEffectiveRelocModel(TT, RM),
-                        getEffectiveCodeModel(CM), OL),
+                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(make_unique<RISCVELFTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
diff --git a/lib/Target/RISCV/Utils/CMakeLists.txt b/lib/Target/RISCV/Utils/CMakeLists.txt
index b7869f696307f61f86d6de8df799a40dd7f9a26a..727ab4a9fd771ac509df66bcec71b2f301330bc4 100644
--- a/lib/Target/RISCV/Utils/CMakeLists.txt
+++ b/lib/Target/RISCV/Utils/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_llvm_library(LLVMRISCVUtils
   RISCVBaseInfo.cpp
+  RISCVMatInt.cpp
   )
diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/lib/Target/RISCV/Utils/RISCVBaseInfo.h
index 86cd2d6e58a1cd672d56e261c7a8b9f5f4a63bca..372e0e80bbaf8c40cf9364744ccc1f91c192c1bf 100644
--- a/lib/Target/RISCV/Utils/RISCVBaseInfo.h
+++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.h
@@ -39,9 +39,10 @@ enum {
   InstFormatCIW = 11,
   InstFormatCL = 12,
   InstFormatCS = 13,
-  InstFormatCB = 14,
-  InstFormatCJ = 15,
-  InstFormatOther = 16,
+  InstFormatCA = 14,
+  InstFormatCB = 15,
+  InstFormatCJ = 16,
+  InstFormatOther = 17,
 
   InstFormatMask = 31
 };
diff --git a/lib/Target/RISCV/Utils/RISCVMatInt.cpp b/lib/Target/RISCV/Utils/RISCVMatInt.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3dc298246bc562ef8df573d877cb3519ca2d7c0b
--- /dev/null
+++ b/lib/Target/RISCV/Utils/RISCVMatInt.cpp
@@ -0,0 +1,79 @@
+//===- RISCVMatInt.cpp - Immediate materialisation -------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVMatInt.h"
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/MathExtras.h"
+#include <cstdint>
+
+namespace llvm {
+
+namespace RISCVMatInt {
+void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) {
+  if (isInt<32>(Val)) {
+    // Depending on the active bits in the immediate Value v, the following
+    // instruction sequences are emitted:
+    //
+    // v == 0                        : ADDI
+    // v[0,12) != 0 && v[12,32) == 0 : ADDI
+    // v[0,12) == 0 && v[12,32) != 0 : LUI
+    // v[0,32) != 0                  : LUI+ADDI(W)
+    int64_t Hi20 = ((Val + 0x800) >> 12) & 0xFFFFF;
+    int64_t Lo12 = SignExtend64<12>(Val);
+
+    if (Hi20)
+      Res.push_back(Inst(RISCV::LUI, Hi20));
+
+    if (Lo12 || Hi20 == 0) {
+      unsigned AddiOpc = (Is64Bit && Hi20) ? RISCV::ADDIW : RISCV::ADDI;
+      Res.push_back(Inst(AddiOpc, Lo12));
+    }
+    return;
+  }
+
+  assert(Is64Bit && "Can't emit >32-bit imm for non-RV64 target");
+
+  // In the worst case, for a full 64-bit constant, a sequence of 8 instructions
+  // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emmitted. Note
+  // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits
+  // while the following ADDI instructions contribute up to 12 bits each.
+  //
+  // On the first glance, implementing this seems to be possible by simply
+  // emitting the most significant 32 bits (LUI+ADDIW) followed by as many left
+  // shift (SLLI) and immediate additions (ADDI) as needed. However, due to the
+  // fact that ADDI performs a sign extended addition, doing it like that would
+  // only be possible when at most 11 bits of the ADDI instructions are used.
+  // Using all 12 bits of the ADDI instructions, like done by GAS, actually
+  // requires that the constant is processed starting with the least significant
+  // bit.
+  //
+  // In the following, constants are processed from LSB to MSB but instruction
+  // emission is performed from MSB to LSB by recursively calling
+  // generateInstSeq. In each recursion, first the lowest 12 bits are removed
+  // from the constant and the optimal shift amount, which can be greater than
+  // 12 bits if the constant is sparse, is determined. Then, the shifted
+  // remaining constant is processed recursively and gets emitted as soon as it
+  // fits into 32 bits. The emission of the shifts and additions is subsequently
+  // performed when the recursion returns.
+
+  int64_t Lo12 = SignExtend64<12>(Val);
+  int64_t Hi52 = (Val + 0x800) >> 12;
+  int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52);
+  Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount);
+
+  generateInstSeq(Hi52, Is64Bit, Res);
+
+  Res.push_back(Inst(RISCV::SLLI, ShiftAmount));
+  if (Lo12)
+    Res.push_back(Inst(RISCV::ADDI, Lo12));
+}
+} // namespace RISCVMatInt
+} // namespace llvm
diff --git a/lib/Target/RISCV/Utils/RISCVMatInt.h b/lib/Target/RISCV/Utils/RISCVMatInt.h
new file mode 100644
index 0000000000000000000000000000000000000000..49d1d89adc7a1af5532098b50ed3e45c3edda9e1
--- /dev/null
+++ b/lib/Target/RISCV/Utils/RISCVMatInt.h
@@ -0,0 +1,36 @@
+//===- RISCVMatInt.h - Immediate materialisation ---------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_MATINT_H
+#define LLVM_LIB_TARGET_RISCV_MATINT_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/MachineValueType.h"
+#include <cstdint>
+
+namespace llvm {
+
+namespace RISCVMatInt {
+struct Inst {
+  unsigned Opc;
+  int64_t Imm;
+
+  Inst(unsigned Opc, int64_t Imm) : Opc(Opc), Imm(Imm) {}
+};
+using InstSeq = SmallVector<Inst, 8>;
+
+// Helper to generate an instruction sequence that will materialise the given
+// immediate value into a register. A sequence of instructions represented by
+// a simple struct produced rather than directly emitting the instructions in
+// order to allow this helper to be used from both the MC layer and during
+// instruction selection.
+void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res);
+} // namespace RISCVMatInt
+} // namespace llvm
+#endif
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 35f52f7d279bfc1199a95bad3a1eb02c8e349f5b..691421e533eae975603700cb3bef5c88c673312f 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -78,6 +78,8 @@ class SparcAsmParser : public MCTargetAsmParser {
   // Custom parse functions for Sparc specific operands.
   OperandMatchResultTy parseMEMOperand(OperandVector &Operands);
 
+  OperandMatchResultTy parseMembarTag(OperandVector &Operands);
+
   OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name);
 
   OperandMatchResultTy
@@ -256,6 +258,7 @@ public:
   bool isMem() const override { return isMEMrr() || isMEMri(); }
   bool isMEMrr() const { return Kind == k_MemoryReg; }
   bool isMEMri() const { return Kind == k_MemoryImm; }
+  bool isMembarTag() const { return Kind == k_Immediate; }
 
   bool isIntReg() const {
     return (Kind == k_Register && Reg.Kind == rk_IntReg);
@@ -366,6 +369,12 @@ public:
     addExpr(Inst, Expr);
   }
 
+  void addMembarTagOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getImm();
+    addExpr(Inst, Expr);
+  }
+
   static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) {
     auto Op = make_unique<SparcOperand>(k_Token);
     Op->Tok.Data = Str.data();
@@ -742,6 +751,52 @@ SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
   return MatchOperand_Success;
 }
 
+OperandMatchResultTy SparcAsmParser::parseMembarTag(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const MCExpr *EVal;
+  int64_t ImmVal = 0;
+
+  std::unique_ptr<SparcOperand> Mask;
+  if (parseSparcAsmOperand(Mask) == MatchOperand_Success) {
+    if (!Mask->isImm() || !Mask->getImm()->evaluateAsAbsolute(ImmVal) ||
+        ImmVal < 0 || ImmVal > 127) {
+      Error(S, "invalid membar mask number");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  while (getLexer().getKind() == AsmToken::Hash) {
+    SMLoc TagStart = getLexer().getLoc();
+    Parser.Lex(); // Eat the '#'.
+    unsigned MaskVal = StringSwitch<unsigned>(Parser.getTok().getString())
+      .Case("LoadLoad", 0x1)
+      .Case("StoreLoad", 0x2)
+      .Case("LoadStore", 0x4)
+      .Case("StoreStore", 0x8)
+      .Case("Lookaside", 0x10)
+      .Case("MemIssue", 0x20)
+      .Case("Sync", 0x40)
+      .Default(0);
+
+    Parser.Lex(); // Eat the identifier token.
+
+    if (!MaskVal) {
+      Error(TagStart, "unknown membar tag");
+      return MatchOperand_ParseFail;
+    }
+
+    ImmVal |= MaskVal;
+
+    if (getLexer().getKind() == AsmToken::Pipe)
+      Parser.Lex(); // Eat the '|'.
+  }
+
+  EVal = MCConstantExpr::create(ImmVal, getContext());
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(SparcOperand::CreateImm(EVal, S, E));
+  return MatchOperand_Success;
+}
+
 OperandMatchResultTy
 SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
 
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
index c1512cbdc44fd502345aa56867f958c4b25585a1..d152efae6d1f6f2cac5c90fbfcd77b9a760c464e 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -195,3 +195,26 @@ bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum,
   llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX.");
   return true;
 }
+
+void SparcInstPrinter::printMembarTag(const MCInst *MI, int opNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  static const char *const TagNames[] = {
+      "#LoadLoad",  "#StoreLoad", "#LoadStore", "#StoreStore",
+      "#Lookaside", "#MemIssue",  "#Sync"};
+
+  unsigned Imm = MI->getOperand(opNum).getImm();
+
+  if (Imm > 127) {
+    O << Imm;
+    return;
+  }
+
+  bool First = true;
+  for (unsigned i = 0; i < sizeof(TagNames) / sizeof(char *); i++) {
+    if (Imm & (1 << i)) {
+      O << (First ? "" : " | ") << TagNames[i];
+      First = false;
+    }
+  }
+}
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
index 6f06d1ddae32f9bd02cee798c018716fefb03e16..89015eb137c20167a3310d39fa964c16b9b99282 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
@@ -49,6 +49,8 @@ public:
                       raw_ostream &OS);
   bool printGetPCX(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                    raw_ostream &OS);
+  void printMembarTag(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+                      raw_ostream &O);
 };
 } // end namespace llvm
 
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 307b6e3c3821c2c7e12d6ebe67104b88f13ad323..7d908bb6b3207b24c467aa1e21e05a4d30ccef3d 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -3261,23 +3261,23 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       else
         return std::make_pair(0U, &SP::IntRegsRegClass);
     case 'f':
-      if (VT == MVT::f32)
+      if (VT == MVT::f32 || VT == MVT::i32)
         return std::make_pair(0U, &SP::FPRegsRegClass);
-      else if (VT == MVT::f64)
+      else if (VT == MVT::f64 || VT == MVT::i64)
         return std::make_pair(0U, &SP::LowDFPRegsRegClass);
       else if (VT == MVT::f128)
         return std::make_pair(0U, &SP::LowQFPRegsRegClass);
-      llvm_unreachable("Unknown ValueType for f-register-type!");
-      break;
+      // This will generate an error message
+      return std::make_pair(0U, nullptr);
     case 'e':
-      if (VT == MVT::f32)
+      if (VT == MVT::f32 || VT == MVT::i32)
         return std::make_pair(0U, &SP::FPRegsRegClass);
-      else if (VT == MVT::f64)
+      else if (VT == MVT::f64 || VT == MVT::i64 )
         return std::make_pair(0U, &SP::DFPRegsRegClass);
       else if (VT == MVT::f128)
         return std::make_pair(0U, &SP::QFPRegsRegClass);
-      llvm_unreachable("Unknown ValueType for e-register-type!");
-      break;
+      // This will generate an error message
+      return std::make_pair(0U, nullptr);
     }
   } else if (!Constraint.empty() && Constraint.size() <= 5
               && Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 62f3d96af847b00bc86c506344de62f0fcda3456..558b37aeebcb895267679184c29a8d474277d1bb 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -138,6 +138,16 @@ def MEMri : Operand<iPTR> {
 
 def TLSSym : Operand<iPTR>;
 
+def SparcMembarTagAsmOperand : AsmOperandClass {
+  let Name = "MembarTag";
+  let ParserMethod = "parseMembarTag";
+}
+
+def MembarTag : Operand<i32> {
+  let PrintMethod = "printMembarTag";
+  let ParserMatchClass = SparcMembarTagAsmOperand;
+}
+
 // Branch targets have OtherVT type.
 def brtarget : Operand<OtherVT> {
   let EncoderMethod = "getBranchTargetOpValue";
@@ -1503,7 +1513,7 @@ def : Pat<(ctpop i32:$src),
           (POPCrr (SRLri $src, 0))>;
 
 let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in
- def MEMBARi : F3_2<2, 0b101000, (outs), (ins simm13Op:$simm13),
+ def MEMBARi : F3_2<2, 0b101000, (outs), (ins MembarTag:$simm13),
                     "membar $simm13", []>;
 
 // The CAS instruction, unlike other instructions, only comes in a 
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 07f9e7250bd9f7822911500b48a4410864bc9f08..5b467235f809d20743692fc1d81b577d9e3e70d5 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -70,11 +70,16 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
 // pic32  PIC_    Medium     GOT < 2^32 bytes
 //
 // All code models require that the text segment is smaller than 2GB.
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
-                                              Reloc::Model RM, bool Is64Bit,
-                                              bool JIT) {
-  if (CM)
+static CodeModel::Model
+getEffectiveSparcCodeModel(Optional<CodeModel::Model> CM, Reloc::Model RM,
+                           bool Is64Bit, bool JIT) {
+  if (CM) {
+    if (*CM == CodeModel::Tiny)
+      report_fatal_error("Target does not support the tiny CodeModel");
+    if (*CM == CodeModel::Kernel)
+      report_fatal_error("Target does not support the kernel CodeModel");
     return *CM;
+  }
   if (Is64Bit) {
     if (JIT)
       return CodeModel::Large;
@@ -88,11 +93,11 @@ SparcTargetMachine::SparcTargetMachine(
     const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
     const TargetOptions &Options, Optional<Reloc::Model> RM,
     Optional<CodeModel::Model> CM, CodeGenOpt::Level OL, bool JIT, bool is64bit)
-    : LLVMTargetMachine(
-          T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options,
-          getEffectiveRelocModel(RM),
-          getEffectiveCodeModel(CM, getEffectiveRelocModel(RM), is64bit, JIT),
-          OL),
+    : LLVMTargetMachine(T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(RM),
+                        getEffectiveSparcCodeModel(
+                            CM, getEffectiveRelocModel(RM), is64bit, JIT),
+                        OL),
       TLOF(make_unique<SparcELFTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) {
   initAsmInfo();
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index bd99fabb48c91ee5340107530d94bf724ae9279f..e2de721be56869249ec7cca62c546554618da092 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -647,7 +647,7 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 }
 
 void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
-  SM.serializeToStackMapSection();
+  emitStackMaps(SM);
 }
 
 // Force static initialization.
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index d2c33546716c051f7ab450b011ac279c0034d714..d7951ca810eecc0a1173c05c2d1ffe02643c6998 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -523,6 +523,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+  setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::FP_ROUND);
@@ -4479,6 +4480,7 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
   // Constants with undefs to get a full vector constant and use that
   // as the starting point.
   SDValue Result;
+  SDValue ReplicatedVal;
   if (NumConstants > 0) {
     for (unsigned I = 0; I < NumElements; ++I)
       if (!Constants[I].getNode())
@@ -4489,17 +4491,21 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
     // avoid a false dependency on any previous contents of the vector
     // register.
 
-    // Use a VLREP if at least one element is a load.
-    unsigned LoadElIdx = UINT_MAX;
+    // Use a VLREP if at least one element is a load. Make sure to replicate
+    // the load with the most elements having its value.
+    std::map<const SDNode*, unsigned> UseCounts;
+    SDNode *LoadMaxUses = nullptr;
     for (unsigned I = 0; I < NumElements; ++I)
       if (Elems[I].getOpcode() == ISD::LOAD &&
           cast<LoadSDNode>(Elems[I])->isUnindexed()) {
-        LoadElIdx = I;
-        break;
+        SDNode *Ld = Elems[I].getNode();
+        UseCounts[Ld]++;
+        if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
+          LoadMaxUses = Ld;
       }
-    if (LoadElIdx != UINT_MAX) {
-      Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, Elems[LoadElIdx]);
-      Done[LoadElIdx] = true;
+    if (LoadMaxUses != nullptr) {
+      ReplicatedVal = SDValue(LoadMaxUses, 0);
+      Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, ReplicatedVal);
     } else {
       // Try to use VLVGP.
       unsigned I1 = NumElements / 2 - 1;
@@ -4520,7 +4526,7 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
 
   // Use VLVGx to insert the other elements.
   for (unsigned I = 0; I < NumElements; ++I)
-    if (!Done[I] && !Elems[I].isUndef())
+    if (!Done[I] && !Elems[I].isUndef() && Elems[I] != ReplicatedVal)
       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
                            DAG.getConstant(I, DL, MVT::i32));
   return Result;
@@ -5363,6 +5369,46 @@ SDValue SystemZTargetLowering::combineMERGE(
   return SDValue();
 }
 
+SDValue SystemZTargetLowering::combineLOAD(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT LdVT = N->getValueType(0);
+  if (LdVT.isVector() || LdVT.isInteger())
+    return SDValue();
+  // Transform a scalar load that is REPLICATEd as well as having other
+  // use(s) to the form where the other use(s) use the first element of the
+  // REPLICATE instead of the load. Otherwise instruction selection will not
+  // produce a VLREP. Avoid extracting to a GPR, so only do this for floating
+  // point loads.
+
+  SDValue Replicate;
+  SmallVector<SDNode*, 8> OtherUses;
+  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() == SystemZISD::REPLICATE) {
+      if (Replicate)
+        return SDValue(); // Should never happen
+      Replicate = SDValue(*UI, 0);
+    }
+    else if (UI.getUse().getResNo() == 0)
+      OtherUses.push_back(*UI);
+  }
+  if (!Replicate || OtherUses.empty())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue Extract0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LdVT,
+                              Replicate, DAG.getConstant(0, DL, MVT::i32));
+  // Update uses of the loaded Value while preserving old chains.
+  for (SDNode *U : OtherUses) {
+    SmallVector<SDValue, 8> Ops;
+    for (SDValue Op : U->ops())
+      Ops.push_back((Op.getNode() == N && Op.getResNo() == 0) ? Extract0 : Op);
+    DAG.UpdateNodeOperands(U, Ops);
+  }
+  return SDValue(N, 0);
+}
+
 SDValue SystemZTargetLowering::combineSTORE(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -5694,6 +5740,7 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SIGN_EXTEND_INREG:  return combineSIGN_EXTEND_INREG(N, DCI);
   case SystemZISD::MERGE_HIGH:
   case SystemZISD::MERGE_LOW:   return combineMERGE(N, DCI);
+  case ISD::LOAD:               return combineLOAD(N, DCI);
   case ISD::STORE:              return combineSTORE(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI);
   case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 9bf9440794713ea81a3e21c4edfe58a9fc6721e1..172dbeec26d0fab594205743f82d22144940f1c5 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -587,6 +587,7 @@ private:
   SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSIGN_EXTEND_INREG(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineLOAD(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 76ed6f80ba55575d383d29ace0a9fe2104ec97e1..e9f9188048da8a288ab7362ede6814838b8373e2 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -63,6 +63,10 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
                                            const LiveRegMatrix *Matrix) const {
   const MachineRegisterInfo *MRI = &MF.getRegInfo();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+      VirtReg, Order, Hints, MF, VRM, Matrix);
+
   if (MRI->getRegClass(VirtReg) == &SystemZ::GRX32BitRegClass) {
     SmallVector<unsigned, 8> Worklist;
     SmallSet<unsigned, 4> DoneRegs;
@@ -84,8 +88,18 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
             TRI->getCommonSubClass(getRC32(FalseMO, VRM, MRI),
                                    getRC32(TrueMO, VRM, MRI));
           if (RC && RC != &SystemZ::GRX32BitRegClass) {
+            // Pass the registers of RC as hints while making sure that if
+            // any of these registers are copy hints, hint them first.
+            SmallSet<unsigned, 4> CopyHints;
+            CopyHints.insert(Hints.begin(), Hints.end());
+            Hints.clear();
+            for (MCPhysReg Reg : Order)
+              if (CopyHints.count(Reg) &&
+                  RC->contains(Reg) && !MRI->isReserved(Reg))
+                Hints.push_back(Reg);
             for (MCPhysReg Reg : Order)
-              if (RC->contains(Reg) && !MRI->isReserved(Reg))
+              if (!CopyHints.count(Reg) &&
+                  RC->contains(Reg) && !MRI->isReserved(Reg))
                 Hints.push_back(Reg);
             // Return true to make these hints the only regs available to
             // RA. This may mean extra spilling but since the alternative is
@@ -102,8 +116,7 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
     }
   }
 
-  return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
-                                                   VRM, Matrix);
+  return BaseImplRetVal;
 }
 
 const MCPhysReg *
@@ -270,25 +283,30 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI,
 
   // Check that the two virtual registers are local to MBB.
   MachineBasicBlock *MBB = MI->getParent();
-  if (LIS.isLiveInToMBB(IntGR128, MBB) || LIS.isLiveOutOfMBB(IntGR128, MBB) ||
-      LIS.isLiveInToMBB(IntGRNar, MBB) || LIS.isLiveOutOfMBB(IntGRNar, MBB))
+  MachineInstr *FirstMI_GR128 =
+    LIS.getInstructionFromIndex(IntGR128.beginIndex());
+  MachineInstr *FirstMI_GRNar =
+    LIS.getInstructionFromIndex(IntGRNar.beginIndex());
+  MachineInstr *LastMI_GR128 = LIS.getInstructionFromIndex(IntGR128.endIndex());
+  MachineInstr *LastMI_GRNar = LIS.getInstructionFromIndex(IntGRNar.endIndex());
+  if ((!FirstMI_GR128 || FirstMI_GR128->getParent() != MBB) ||
+      (!FirstMI_GRNar || FirstMI_GRNar->getParent() != MBB) ||
+      (!LastMI_GR128 || LastMI_GR128->getParent() != MBB) ||
+      (!LastMI_GRNar || LastMI_GRNar->getParent() != MBB))
     return false;
 
-  // Find the first and last MIs of the registers.
-  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
+  MachineBasicBlock::iterator MII = nullptr, MEE = nullptr;
   if (WideOpNo == 1) {
-    FirstMI = LIS.getInstructionFromIndex(IntGR128.beginIndex());
-    LastMI  = LIS.getInstructionFromIndex(IntGRNar.endIndex());
+    MII = FirstMI_GR128;
+    MEE = LastMI_GRNar;
   } else {
-    FirstMI = LIS.getInstructionFromIndex(IntGRNar.beginIndex());
-    LastMI  = LIS.getInstructionFromIndex(IntGR128.endIndex());
+    MII = FirstMI_GRNar;
+    MEE = LastMI_GR128;
   }
-  assert (FirstMI && LastMI && "No instruction from index?");
 
   // Check if coalescing seems safe by finding the set of clobbered physreg
   // pairs in the region.
   BitVector PhysClobbered(getNumRegs());
-  MachineBasicBlock::iterator MII = FirstMI, MEE = LastMI;
   MEE++;
   for (; MII != MEE; ++MII) {
     for (const MachineOperand &MO : MII->operands())
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index 1a87db6e6605bc52d78410f7331bbe75bad3eb71..74e1dad87908bc82fdf5e13b023745f7d652bc9a 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -609,7 +609,7 @@ def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone],
 // Compare double and swap
 def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2],
              (instregex "CDS(Y)?$")>;
-def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, GroupAlone],
+def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, GroupAlone3],
              (instregex "CDSG$")>;
 
 // Compare and swap and store
@@ -664,10 +664,10 @@ def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
 def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
 def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>;
 
-def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone],
+def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2],
              (instregex "(A|S|ZA)P$")>;
-def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone], (instregex "(M|D)P$")>;
-def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>;
 def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>;
 def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>;
 def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
@@ -684,7 +684,7 @@ def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>;
 
 // Load/store access multiple (not modeled precisely)
 def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
-def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
+def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>;
 
 //===----------------------------------------------------------------------===//
 // Program mask and addressing mode
@@ -909,7 +909,7 @@ def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>;
 
 // Test Data Class
 def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>;
-def : InstRW<[WLat10, LSU2, VecDF4, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>;
 
 //===----------------------------------------------------------------------===//
 // FP: Floating-point control register instructions
@@ -1210,7 +1210,7 @@ def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>;
 def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>;
 def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
 def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
-def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "VSTM$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM$")>;
 def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
 
 //===----------------------------------------------------------------------===//
@@ -1424,7 +1424,7 @@ def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
 //===----------------------------------------------------------------------===//
 
 def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
-def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>;
 def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
 def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
 def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
@@ -1468,8 +1468,8 @@ def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
 // System: Memory-move Instructions
 //===----------------------------------------------------------------------===//
 
-def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[WLat1, FXa, LSU5, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>;
 def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
 def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
 
diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td
index 0f26ffce0e091aa6f19508a5f6f32486285975c4..1962fdf3a1d1fabe3cfea69ef6f16439e3b6a8a7 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -620,7 +620,7 @@ def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone],
 def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2],
              (instregex "CDS(Y)?$")>;
 def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3,
-              GroupAlone], (instregex "CDSG$")>;
+              GroupAlone3], (instregex "CDSG$")>;
 
 // Compare and swap and store
 def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
@@ -684,10 +684,10 @@ def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
 def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
 def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>;
 
-def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone],
+def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2],
              (instregex "(A|S|ZA)P$")>;
-def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone], (instregex "(M|D)P$")>;
-def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone], (instregex "SRP$")>;
+def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "(M|D)P$")>;
+def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>;
 def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>;
 def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>;
 def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
@@ -704,7 +704,7 @@ def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>;
 
 // Load/store access multiple (not modeled precisely)
 def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
-def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STAM(Y)?$")>;
+def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>;
 
 //===----------------------------------------------------------------------===//
 // Program mask and addressing mode
@@ -929,7 +929,7 @@ def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>;
 
 // Test Data Class
 def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>;
-def : InstRW<[WLat10, LSU2, VecDF4, GroupAlone], (instregex "TCXB$")>;
+def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>;
 
 //===----------------------------------------------------------------------===//
 // FP: Floating-point control register instructions
@@ -1229,7 +1229,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
 def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>;
 def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
 def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
-def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "VSTM$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM$")>;
 def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
 def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>;
 
@@ -1500,7 +1500,7 @@ def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>;
 //===----------------------------------------------------------------------===//
 
 def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
-def : InstRW<[WLat20, GroupAlone], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat20, GroupAlone3], (instregex "LPSW(E)?$")>;
 def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>;
 def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
 def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
@@ -1513,7 +1513,7 @@ def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
 //===----------------------------------------------------------------------===//
 
 def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
-def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STCT(L|G)$")>;
+def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>;
 def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
 def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
 def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
@@ -1558,8 +1558,8 @@ def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
 // System: Memory-move Instructions
 //===----------------------------------------------------------------------===//
 
-def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[WLat1, FXa, LSU5, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>;
 def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
 def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
 
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index f3620dcf3b92e8af489a48332b4a154016017723..9596a2b6388d62a756ee918ccc1657ff539ee430 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -128,10 +128,16 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
 //   in range of LARL.  However, the JIT environment has no equivalent
 //   of copy relocs, so locally-binding data symbols might not be in
 //   the range of LARL.  We need the Medium model in that case.
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
-                                              Reloc::Model RM, bool JIT) {
-  if (CM)
+static CodeModel::Model
+getEffectiveSystemZCodeModel(Optional<CodeModel::Model> CM, Reloc::Model RM,
+                             bool JIT) {
+  if (CM) {
+    if (*CM == CodeModel::Tiny)
+      report_fatal_error("Target does not support the tiny CodeModel");
+    if (*CM == CodeModel::Kernel)
+      report_fatal_error("Target does not support the kernel CodeModel");
     return *CM;
+  }
   if (JIT)
     return RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
   return CodeModel::Small;
@@ -146,7 +152,8 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(
           T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
           getEffectiveRelocModel(RM),
-          getEffectiveCodeModel(CM, getEffectiveRelocModel(RM), JIT), OL),
+          getEffectiveSystemZCodeModel(CM, getEffectiveRelocModel(RM), JIT),
+          OL),
       TLOF(llvm::make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index f296d80dbf523061cb4a731bbc1885841f5d4699..129610fe095b03a45f41f1c5f8d675241d8ab59c 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -467,9 +467,6 @@ int SystemZTTIImpl::getArithmeticInstrCost(
     if (Opcode == Instruction::FRem)
       return LIBCALL_COST;
 
-    if (Opcode == Instruction::LShr || Opcode == Instruction::AShr)
-      return (ScalarBits >= 32 ? 1 : 2 /*ext*/);
-
     // Or requires one instruction, although it has custom handling for i64.
     if (Opcode == Instruction::Or)
       return 1;
@@ -484,12 +481,8 @@ int SystemZTTIImpl::getArithmeticInstrCost(
       return (SignedDivRem ? SDivPow2Cost : 1);
     if (DivRemConst)
       return DivMulSeqCost;
-    if (SignedDivRem)
-      // sext of op(s) for narrow types
-      return DivInstrCost + (ScalarBits < 32 ? 3 : (ScalarBits == 32 ? 1 : 0));
-    if (UnsignedDivRem)
-      // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
-      return DivInstrCost + (ScalarBits < 32 ? 3 : 1);
+    if (SignedDivRem || UnsignedDivRem)
+      return DivInstrCost;
   }
 
   // Fallback to the default implementation.
@@ -718,7 +711,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
           (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
         NeedsExtracts = false;
 
-      TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+      TotCost += getScalarizationOverhead(Src, false, NeedsExtracts);
+      TotCost += getScalarizationOverhead(Dst, NeedsInserts, false);
 
       // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
       if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -778,6 +772,18 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
 }
 
+// Scalar i8 / i16 operations will typically be made after first extending
+// the operands to i32.
+static unsigned getOperandsExtensionCost(const Instruction *I) {
+  unsigned ExtCost = 0;
+  for (Value *Op : I->operands())
+    // A load of i8 or i16 sign/zero extends to i32.
+    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
+      ExtCost++;
+
+  return ExtCost;
+}
+
 int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy, const Instruction *I) {
   if (ValTy->isVectorTy()) {
@@ -833,9 +839,19 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   else { // Scalar
     switch (Opcode) {
     case Instruction::ICmp: {
+      // A loaded value compared with 0 with multiple users becomes Load and
+      // Test. The load is then not foldable, so return 0 cost for the ICmp.
+      unsigned ScalarBits = ValTy->getScalarSizeInBits();
+      if (I != nullptr && ScalarBits >= 32)
+        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
+            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
+                C->getZExtValue() == 0)
+              return 0;
+
       unsigned Cost = 1;
       if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
-        Cost += 2; // extend both operands
+        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
       return Cost;
     }
     case Instruction::Select:
@@ -898,17 +914,26 @@ isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
        UserI->getOpcode() == Instruction::UDiv) &&
       UserI->getOperand(1) != FoldedValue)
     return false; // Not commutative, only RHS foldable.
+  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
+  // extension was made of the load.
+  unsigned LoadOrTruncBits =
+      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
   switch (UserI->getOpcode()) {
   case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
   case Instruction::Sub:
+  case Instruction::ICmp:
     if (LoadedBits == 32 && ZExtBits == 64)
       return true;
     LLVM_FALLTHROUGH;
   case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
-    if (LoadedBits == 16 &&
-        (SExtBits == 32 ||
-         (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
-      return true;
+    if (UserI->getOpcode() != Instruction::ICmp) {
+      if (LoadedBits == 16 &&
+          (SExtBits == 32 ||
+           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
+        return true;
+      if (LoadOrTruncBits == 16)
+        return true;
+    }
     LLVM_FALLTHROUGH;
   case Instruction::SDiv:// SE: 32->64
     if (LoadedBits == 32 && SExtBits == 64)
@@ -918,7 +943,6 @@ isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
-  case Instruction::ICmp:
     // This also makes sense for float operations, but disabled for now due
     // to regressions.
     // case Instruction::FCmp:
@@ -928,16 +952,27 @@ isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
     // case Instruction::FDiv:
 
     // All possible extensions of memory checked above.
-    if (SExtBits || ZExtBits)
-      return false;
 
-    unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);
+    // Comparison between memory and immediate.
+    if (UserI->getOpcode() == Instruction::ICmp)
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
+        if (isUInt<16>(CI->getZExtValue()))
+          return true;
     return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
     break;
   }
   return false;
 }
 
+static bool isBswapIntrinsicCall(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    if (auto *CI = dyn_cast<CallInst>(I))
+      if (auto *F = CI->getCalledFunction())
+        if (F->getIntrinsicID() == Intrinsic::bswap)
+          return true;
+  return false;
+}
+
 int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment, unsigned AddressSpace,
                                     const Instruction *I) {
@@ -974,6 +1009,22 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   unsigned NumOps =
     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
 
+  // Store/Load reversed saves one instruction.
+  if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) {
+    if (Opcode == Instruction::Load && I->hasOneUse()) {
+      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
+      // In case of load -> bswap -> store, return normal cost for the load.
+      if (isBswapIntrinsicCall(LdUser) &&
+          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
+        return 0;
+    }
+    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      const Value *StoredVal = SI->getValueOperand();
+      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
+        return 0;
+    }
+  }
+
   if (Src->getScalarSizeInBits() == 128)
     // 128 bit scalars are held in a pair of two 64 bit registers.
     NumOps *= 2;
@@ -1046,3 +1097,29 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   // Cost of load/store operations and the permutations needed.
   return NumVectorMemOps + NumPermutes;
 }
+
+static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
+  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
+    return getNumVectorRegs(RetTy); // VPERM
+  return -1;
+}
+
+int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                          ArrayRef<Value *> Args,
+                                          FastMathFlags FMF, unsigned VF) {
+  int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
+  if (Cost != -1)
+    return Cost;
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+}
+
+int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                          ArrayRef<Type *> Tys,
+                                          FastMathFlags FMF,
+                                          unsigned ScalarizationCostPassed) {
+  int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
+  if (Cost != -1)
+    return Cost;
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
+                                      FMF, ScalarizationCostPassed);
+}
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index dd85c4ea541717a4d4e0dc832b45abb1bab76587..e79bee1ea3a806dff55092e1a48650d5573278cf 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -98,6 +98,13 @@ public:
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
+
+  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1);
+  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                            ArrayRef<Type *> Tys, FastMathFlags FMF,
+                            unsigned ScalarizationCostPassed = UINT_MAX);
   /// @}
 };
 
diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index efa6793cff27603a5900c69909cb2500e6a3af3a..f463aeaf8275084008533ea9618a2b91c388ad14 100644
--- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -38,7 +38,7 @@ namespace {
 /// WebAssemblyOperand - Instances of this class represent the operands in a
 /// parsed WASM machine instruction.
 struct WebAssemblyOperand : public MCParsedAsmOperand {
-  enum KindTy { Token, Integer, Float, Symbol } Kind;
+  enum KindTy { Token, Integer, Float, Symbol, BrList } Kind;
 
   SMLoc StartLoc, EndLoc;
 
@@ -58,11 +58,16 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
     const MCExpr *Exp;
   };
 
+  struct BrLOp {
+    std::vector<unsigned> List;
+  };
+
   union {
     struct TokOp Tok;
     struct IntOp Int;
     struct FltOp Flt;
     struct SymOp Sym;
+    struct BrLOp BrL;
   };
 
   WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, TokOp T)
@@ -73,6 +78,13 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
       : Kind(K), StartLoc(Start), EndLoc(End), Flt(F) {}
   WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, SymOp S)
       : Kind(K), StartLoc(Start), EndLoc(End), Sym(S) {}
+  WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End)
+      : Kind(K), StartLoc(Start), EndLoc(End), BrL() {}
+
+  ~WebAssemblyOperand() {
+    if (isBrList())
+      BrL.~BrLOp();
+  }
 
   bool isToken() const override { return Kind == Token; }
   bool isImm() const override {
@@ -80,6 +92,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
   }
   bool isMem() const override { return false; }
   bool isReg() const override { return false; }
+  bool isBrList() const { return Kind == BrList; }
 
   unsigned getReg() const override {
     llvm_unreachable("Assembly inspects a register operand");
@@ -111,6 +124,12 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
       llvm_unreachable("Should be immediate or symbol!");
   }
 
+  void addBrListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && isBrList() && "Invalid BrList!");
+    for (auto Br : BrL.List)
+      Inst.addOperand(MCOperand::createImm(Br));
+  }
+
   void print(raw_ostream &OS) const override {
     switch (Kind) {
     case Token:
@@ -125,6 +144,9 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
     case Symbol:
       OS << "Sym:" << Sym.Exp;
       break;
+    case BrList:
+      OS << "BrList:" << BrL.List.size();
+      break;
     }
   }
 };
@@ -132,16 +154,40 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
 class WebAssemblyAsmParser final : public MCTargetAsmParser {
   MCAsmParser &Parser;
   MCAsmLexer &Lexer;
-  MCSymbolWasm *LastSymbol;
+
+  // Much like WebAssemblyAsmPrinter in the backend, we have to own these.
+  std::vector<std::unique_ptr<wasm::WasmSignature>> Signatures;
+
+  // Order of labels, directives and instructions in a .s file have no
+  // syntactical enforcement. This class is a callback from the actual parser,
+  // and yet we have to be feeding data to the streamer in a very particular
+  // order to ensure a correct binary encoding that matches the regular backend
+  // (the streamer does not enforce this). This "state machine" enum helps
+  // guarantee that correct order.
+  enum ParserState {
+    FileStart,
+    Label,
+    FunctionStart,
+    FunctionLocals,
+    Instructions,
+  } CurrentState = FileStart;
+
+  // We track this to see if a .functype following a label is the same,
+  // as this is how we recognize the start of a function.
+  MCSymbol *LastLabel = nullptr;
 
 public:
   WebAssemblyAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
                        const MCInstrInfo &MII, const MCTargetOptions &Options)
       : MCTargetAsmParser(Options, STI, MII), Parser(Parser),
-        Lexer(Parser.getLexer()), LastSymbol(nullptr) {
+        Lexer(Parser.getLexer()) {
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
+  void addSignature(std::unique_ptr<wasm::WasmSignature> &&Sig) {
+    Signatures.push_back(std::move(Sig));
+  }
+
 #define GET_ASSEMBLER_HEADER
 #include "WebAssemblyGenAsmMatcher.inc"
 
@@ -151,47 +197,66 @@ public:
     llvm_unreachable("ParseRegister is not implemented.");
   }
 
-  bool Error(const StringRef &msg, const AsmToken &tok) {
-    return Parser.Error(tok.getLoc(), msg + tok.getString());
+  bool error(const StringRef &Msg, const AsmToken &Tok) {
+    return Parser.Error(Tok.getLoc(), Msg + Tok.getString());
   }
 
-  bool IsNext(AsmToken::TokenKind Kind) {
-    auto ok = Lexer.is(Kind);
-    if (ok)
+  bool isNext(AsmToken::TokenKind Kind) {
+    auto Ok = Lexer.is(Kind);
+    if (Ok)
       Parser.Lex();
-    return ok;
+    return Ok;
   }
 
-  bool Expect(AsmToken::TokenKind Kind, const char *KindName) {
-    if (!IsNext(Kind))
-      return Error(std::string("Expected ") + KindName + ", instead got: ",
+  bool expect(AsmToken::TokenKind Kind, const char *KindName) {
+    if (!isNext(Kind))
+      return error(std::string("Expected ") + KindName + ", instead got: ",
                    Lexer.getTok());
     return false;
   }
 
+  StringRef expectIdent() {
+    if (!Lexer.is(AsmToken::Identifier)) {
+      error("Expected identifier, got: ", Lexer.getTok());
+      return StringRef();
+    }
+    auto Name = Lexer.getTok().getString();
+    Parser.Lex();
+    return Name;
+  }
 
-  std::pair<MVT::SimpleValueType, unsigned>
-  ParseRegType(const StringRef &RegType) {
-    // Derive type from .param .local decls, or the instruction itself.
-    return StringSwitch<std::pair<MVT::SimpleValueType, unsigned>>(RegType)
-        .Case("i32", {MVT::i32, wasm::WASM_TYPE_I32})
-        .Case("i64", {MVT::i64, wasm::WASM_TYPE_I64})
-        .Case("f32", {MVT::f32, wasm::WASM_TYPE_F32})
-        .Case("f64", {MVT::f64, wasm::WASM_TYPE_F64})
-        .Case("i8x16", {MVT::v16i8, wasm::WASM_TYPE_V128})
-        .Case("i16x8", {MVT::v8i16, wasm::WASM_TYPE_V128})
-        .Case("i32x4", {MVT::v4i32, wasm::WASM_TYPE_V128})
-        .Case("i64x2", {MVT::v2i64, wasm::WASM_TYPE_V128})
-        .Case("f32x4", {MVT::v4f32, wasm::WASM_TYPE_V128})
-        .Case("f64x2", {MVT::v2f64, wasm::WASM_TYPE_V128})
-        // arbitrarily chosen vector type to associate with "v128"
-        // FIXME: should these be EVTs to avoid this arbitrary hack? Do we want
-        // to accept more specific SIMD register types?
-        .Case("v128", {MVT::v16i8, wasm::WASM_TYPE_V128})
-        .Default({MVT::INVALID_SIMPLE_VALUE_TYPE, wasm::WASM_TYPE_NORESULT});
+  Optional<wasm::ValType> parseType(const StringRef &Type) {
+    // FIXME: can't use StringSwitch because wasm::ValType doesn't have a
+    // "invalid" value.
+    if (Type == "i32")
+      return wasm::ValType::I32;
+    if (Type == "i64")
+      return wasm::ValType::I64;
+    if (Type == "f32")
+      return wasm::ValType::F32;
+    if (Type == "f64")
+      return wasm::ValType::F64;
+    if (Type == "v128" || Type == "i8x16" || Type == "i16x8" ||
+        Type == "i32x4" || Type == "i64x2" || Type == "f32x4" ||
+        Type == "f64x2")
+      return wasm::ValType::V128;
+    return Optional<wasm::ValType>();
   }
 
-  void ParseSingleInteger(bool IsNegative, OperandVector &Operands) {
+  bool parseRegTypeList(SmallVectorImpl<wasm::ValType> &Types) {
+    while (Lexer.is(AsmToken::Identifier)) {
+      auto Type = parseType(Lexer.getTok().getString());
+      if (!Type)
+        return true;
+      Types.push_back(Type.getValue());
+      Parser.Lex();
+      if (!isNext(AsmToken::Comma))
+        break;
+    }
+    return false;
+  }
+
+  void parseSingleInteger(bool IsNegative, OperandVector &Operands) {
     auto &Int = Lexer.getTok();
     int64_t Val = Int.getIntVal();
     if (IsNegative)
@@ -202,9 +267,9 @@ public:
     Parser.Lex();
   }
 
-  bool ParseOperandStartingWithInteger(bool IsNegative, OperandVector &Operands,
+  bool parseOperandStartingWithInteger(bool IsNegative, OperandVector &Operands,
                                        StringRef InstName) {
-    ParseSingleInteger(IsNegative, Operands);
+    parseSingleInteger(IsNegative, Operands);
     // FIXME: there is probably a cleaner way to do this.
     auto IsLoadStore = InstName.startswith("load") ||
                        InstName.startswith("store") ||
@@ -214,7 +279,7 @@ public:
       // Parse load/store operands of the form: offset align
       auto &Offset = Lexer.getTok();
       if (Offset.is(AsmToken::Integer)) {
-        ParseSingleInteger(false, Operands);
+        parseSingleInteger(false, Operands);
       } else {
         // Alignment not specified.
         // FIXME: correctly derive a default from the instruction.
@@ -233,13 +298,15 @@ public:
     // Note: Name does NOT point into the sourcecode, but to a local, so
     // use NameLoc instead.
     Name = StringRef(NameLoc.getPointer(), Name.size());
+
     // WebAssembly has instructions with / in them, which AsmLexer parses
     // as seperate tokens, so if we find such tokens immediately adjacent (no
     // whitespace), expand the name to include them:
     for (;;) {
       auto &Sep = Lexer.getTok();
       if (Sep.getLoc().getPointer() != Name.end() ||
-          Sep.getKind() != AsmToken::Slash) break;
+          Sep.getKind() != AsmToken::Slash)
+        break;
       // Extend name with /
       Name = StringRef(Name.begin(), Name.size() + Sep.getString().size());
       Parser.Lex();
@@ -247,10 +314,11 @@ public:
       auto &Id = Lexer.getTok();
       if (Id.getKind() != AsmToken::Identifier ||
           Id.getLoc().getPointer() != Name.end())
-        return Error("Incomplete instruction name: ", Id);
+        return error("Incomplete instruction name: ", Id);
       Name = StringRef(Name.begin(), Name.size() + Id.getString().size());
       Parser.Lex();
     }
+
     // Now construct the name as first operand.
     Operands.push_back(make_unique<WebAssemblyOperand>(
         WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()),
@@ -258,6 +326,7 @@ public:
     auto NamePair = Name.split('.');
     // If no '.', there is no type prefix.
     auto BaseName = NamePair.second.empty() ? NamePair.first : NamePair.second;
+
     while (Lexer.isNot(AsmToken::EndOfStatement)) {
       auto &Tok = Lexer.getTok();
       switch (Tok.getKind()) {
@@ -266,7 +335,7 @@ public:
         const MCExpr *Val;
         SMLoc End;
         if (Parser.parsePrimaryExpr(Val, End))
-          return Error("Cannot parse symbol: ", Lexer.getTok());
+          return error("Cannot parse symbol: ", Lexer.getTok());
         Operands.push_back(make_unique<WebAssemblyOperand>(
             WebAssemblyOperand::Symbol, Id.getLoc(), Id.getEndLoc(),
             WebAssemblyOperand::SymOp{Val}));
@@ -275,33 +344,49 @@ public:
       case AsmToken::Minus:
         Parser.Lex();
         if (Lexer.isNot(AsmToken::Integer))
-          return Error("Expected integer instead got: ", Lexer.getTok());
-        if (ParseOperandStartingWithInteger(true, Operands, BaseName))
+          return error("Expected integer instead got: ", Lexer.getTok());
+        if (parseOperandStartingWithInteger(true, Operands, BaseName))
           return true;
         break;
       case AsmToken::Integer:
-        if (ParseOperandStartingWithInteger(false, Operands, BaseName))
+        if (parseOperandStartingWithInteger(false, Operands, BaseName))
           return true;
         break;
       case AsmToken::Real: {
         double Val;
         if (Tok.getString().getAsDouble(Val, false))
-          return Error("Cannot parse real: ", Tok);
+          return error("Cannot parse real: ", Tok);
         Operands.push_back(make_unique<WebAssemblyOperand>(
             WebAssemblyOperand::Float, Tok.getLoc(), Tok.getEndLoc(),
             WebAssemblyOperand::FltOp{Val}));
         Parser.Lex();
         break;
       }
+      case AsmToken::LCurly: {
+        Parser.Lex();
+        auto Op = make_unique<WebAssemblyOperand>(
+            WebAssemblyOperand::BrList, Tok.getLoc(), Tok.getEndLoc());
+        if (!Lexer.is(AsmToken::RCurly))
+          for (;;) {
+            Op->BrL.List.push_back(Lexer.getTok().getIntVal());
+            expect(AsmToken::Integer, "integer");
+            if (!isNext(AsmToken::Comma))
+              break;
+          }
+        expect(AsmToken::RCurly, "}");
+        Operands.push_back(std::move(Op));
+        break;
+      }
       default:
-        return Error("Unexpected token in operand: ", Tok);
+        return error("Unexpected token in operand: ", Tok);
       }
       if (Lexer.isNot(AsmToken::EndOfStatement)) {
-        if (Expect(AsmToken::Comma, ","))
+        if (expect(AsmToken::Comma, ","))
           return true;
       }
     }
     Parser.Lex();
+
     // Block instructions require a signature index, but these are missing in
     // assembly, so we add a dummy one explicitly (since we have no control
     // over signature tables here, we assume these will be regenerated when
@@ -315,9 +400,31 @@ public:
   }
 
   void onLabelParsed(MCSymbol *Symbol) override {
-    LastSymbol = cast<MCSymbolWasm>(Symbol);
+    LastLabel = Symbol;
+    CurrentState = Label;
+  }
+
+  bool parseSignature(wasm::WasmSignature *Signature) {
+    if (expect(AsmToken::LParen, "("))
+      return true;
+    if (parseRegTypeList(Signature->Params))
+      return true;
+    if (expect(AsmToken::RParen, ")"))
+      return true;
+    if (expect(AsmToken::MinusGreater, "->"))
+      return true;
+    if (expect(AsmToken::LParen, "("))
+      return true;
+    if (parseRegTypeList(Signature->Returns))
+      return true;
+    if (expect(AsmToken::RParen, ")"))
+      return true;
+    return false;
   }
 
+  // This function processes wasm-specific directives streamed to
+  // WebAssemblyTargetStreamer, all others go to the generic parser
+  // (see WasmAsmParser).
   bool ParseDirective(AsmToken DirectiveID) override {
     // This function has a really weird return value behavior that is different
     // from all the other parsing functions:
@@ -329,103 +436,89 @@ public:
     auto &Out = getStreamer();
     auto &TOut =
         reinterpret_cast<WebAssemblyTargetStreamer &>(*Out.getTargetStreamer());
+
     // TODO: any time we return an error, at least one token must have been
     // consumed, otherwise this will not signal an error to the caller.
-    if (DirectiveID.getString() == ".type") {
-      // This could be the start of a function, check if followed by
-      // "label,@function"
-      if (!Lexer.is(AsmToken::Identifier))
-        return Error("Expected label after .type directive, got: ",
-                     Lexer.getTok());
-      auto WasmSym = cast<MCSymbolWasm>(
-                       TOut.getStreamer().getContext().getOrCreateSymbol(
-                         Lexer.getTok().getString()));
-      Parser.Lex();
-      if (!(IsNext(AsmToken::Comma) && IsNext(AsmToken::At) &&
-            Lexer.is(AsmToken::Identifier)))
-        return Error("Expected label,@type declaration, got: ", Lexer.getTok());
-      auto TypeName = Lexer.getTok().getString();
-      if (TypeName == "function")
-        WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
-      else if (TypeName == "global")
-        WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
-      else
-        return Error("Unknown WASM symbol type: ", Lexer.getTok());
-      Parser.Lex();
-      return Expect(AsmToken::EndOfStatement, "EOL");
-    } else if (DirectiveID.getString() == ".size") {
-      if (!Lexer.is(AsmToken::Identifier))
-        return Error("Expected label after .size directive, got: ",
-                     Lexer.getTok());
-      auto WasmSym = cast<MCSymbolWasm>(
-                       TOut.getStreamer().getContext().getOrCreateSymbol(
-                         Lexer.getTok().getString()));
-      Parser.Lex();
-      if (!IsNext(AsmToken::Comma))
-        return Error("Expected `,`, got: ", Lexer.getTok());
-      const MCExpr *Exp;
-      if (Parser.parseExpression(Exp))
-        return Error("Cannot parse .size expression: ", Lexer.getTok());
-      WasmSym->setSize(Exp);
-      return Expect(AsmToken::EndOfStatement, "EOL");
-    } else if (DirectiveID.getString() == ".globaltype") {
-      if (!Lexer.is(AsmToken::Identifier))
-        return Error("Expected symbol name after .globaltype directive, got: ",
-                     Lexer.getTok());
-      auto Name = Lexer.getTok().getString();
-      Parser.Lex();
-      if (!IsNext(AsmToken::Comma))
-        return Error("Expected `,`, got: ", Lexer.getTok());
-      if (!Lexer.is(AsmToken::Identifier))
-        return Error("Expected type in .globaltype directive, got: ",
-                     Lexer.getTok());
-      auto Type = ParseRegType(Lexer.getTok().getString()).second;
-      if (Type == wasm::WASM_TYPE_NORESULT)
-        return Error("Unknown type in .globaltype directive: ",
-                     Lexer.getTok());
-      Parser.Lex();
+    if (DirectiveID.getString() == ".globaltype") {
+      auto SymName = expectIdent();
+      if (SymName.empty())
+        return true;
+      if (expect(AsmToken::Comma, ","))
+        return true;
+      auto TypeTok = Lexer.getTok();
+      auto TypeName = expectIdent();
+      if (TypeName.empty())
+        return true;
+      auto Type = parseType(TypeName);
+      if (!Type)
+        return error("Unknown type in .globaltype directive: ", TypeTok);
       // Now set this symbol with the correct type.
       auto WasmSym = cast<MCSymbolWasm>(
-                       TOut.getStreamer().getContext().getOrCreateSymbol(Name));
+          TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
       WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
-      WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), true});
+      WasmSym->setGlobalType(
+          wasm::WasmGlobalType{uint8_t(Type.getValue()), true});
       // And emit the directive again.
       TOut.emitGlobalType(WasmSym);
-      return Expect(AsmToken::EndOfStatement, "EOL");
-    } else if (DirectiveID.getString() == ".param" ||
-               DirectiveID.getString() == ".local") {
-      // Track the number of locals, needed for correct virtual register
-      // assignment elsewhere.
-      // Also output a directive to the streamer.
-      std::vector<MVT> Params;
-      std::vector<MVT> Locals;
-      while (Lexer.is(AsmToken::Identifier)) {
-        auto RegType = ParseRegType(Lexer.getTok().getString()).first;
-        if (RegType == MVT::INVALID_SIMPLE_VALUE_TYPE)
-          return true;
-        if (DirectiveID.getString() == ".param") {
-          Params.push_back(RegType);
-        } else {
-          Locals.push_back(RegType);
-        }
-        Parser.Lex();
-        if (!IsNext(AsmToken::Comma))
-          break;
+      return expect(AsmToken::EndOfStatement, "EOL");
+    }
+
+    if (DirectiveID.getString() == ".functype") {
+      // This code has to send things to the streamer similar to
+      // WebAssemblyAsmPrinter::EmitFunctionBodyStart.
+      // TODO: would be good to factor this into a common function, but the
+      // assembler and backend really don't share any common code, and this code
+      // parses the locals seperately.
+      auto SymName = expectIdent();
+      if (SymName.empty())
+        return true;
+      auto WasmSym = cast<MCSymbolWasm>(
+          TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
+      if (CurrentState == Label && WasmSym == LastLabel) {
+        // This .functype indicates a start of a function.
+        CurrentState = FunctionStart;
       }
-      assert(LastSymbol);
-      // TODO: LastSymbol isn't even used by emitParam, so could be removed.
-      TOut.emitParam(LastSymbol, Params);
+      auto Signature = make_unique<wasm::WasmSignature>();
+      if (parseSignature(Signature.get()))
+        return true;
+      WasmSym->setSignature(Signature.get());
+      addSignature(std::move(Signature));
+      WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+      TOut.emitFunctionType(WasmSym);
+      // TODO: backend also calls TOut.emitIndIdx, but that is not implemented.
+      return expect(AsmToken::EndOfStatement, "EOL");
+    }
+
+    if (DirectiveID.getString() == ".eventtype") {
+      auto SymName = expectIdent();
+      if (SymName.empty())
+        return true;
+      auto WasmSym = cast<MCSymbolWasm>(
+          TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
+      auto Signature = make_unique<wasm::WasmSignature>();
+      if (parseRegTypeList(Signature->Params))
+        return true;
+      WasmSym->setSignature(Signature.get());
+      addSignature(std::move(Signature));
+      WasmSym->setType(wasm::WASM_SYMBOL_TYPE_EVENT);
+      TOut.emitEventType(WasmSym);
+      // TODO: backend also calls TOut.emitIndIdx, but that is not implemented.
+      return expect(AsmToken::EndOfStatement, "EOL");
+    }
+
+    if (DirectiveID.getString() == ".local") {
+      if (CurrentState != FunctionStart)
+        return error(".local directive should follow the start of a function",
+                     Lexer.getTok());
+      SmallVector<wasm::ValType, 4> Locals;
+      if (parseRegTypeList(Locals))
+        return true;
       TOut.emitLocal(Locals);
-      return Expect(AsmToken::EndOfStatement, "EOL");
-    } else {
-      // TODO: remove.
-      while (Lexer.isNot(AsmToken::EndOfStatement))
-        Parser.Lex();
-      return Expect(AsmToken::EndOfStatement, "EOL");
+      CurrentState = FunctionLocals;
+      return expect(AsmToken::EndOfStatement, "EOL");
     }
-    // TODO: current ELF directive parsing is broken, fix this is a followup.
-    //return true;  // We didn't process this directive.
-    return false;
+
+    return true; // We didn't process this directive.
   }
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/,
@@ -437,6 +530,16 @@ public:
         MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
     switch (MatchResult) {
     case Match_Success: {
+      if (CurrentState == FunctionStart) {
+        // This is the first instruction in a function, but we haven't seen
+        // a .local directive yet. The streamer requires locals to be encoded
+        // as a prelude to the instructions, so emit an empty list of locals
+        // here.
+        auto &TOut = reinterpret_cast<WebAssemblyTargetStreamer &>(
+            *Out.getTargetStreamer());
+        TOut.emitLocal(SmallVector<wasm::ValType, 0>());
+      }
+      CurrentState = Instructions;
       Out.EmitInstruction(Inst, getSTI());
       return false;
     }
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 54403a562741ce22d2cf2861bbfc34600ef40a10..c068d6b43d492fbaffa043dcf447862ad0441b05 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -36,6 +36,8 @@ using DecodeStatus = MCDisassembler::DecodeStatus;
 #include "WebAssemblyGenDisassemblerTables.inc"
 
 namespace {
+static constexpr int WebAssemblyInstructionTableSize = 256;
+
 class WebAssemblyDisassembler final : public MCDisassembler {
   std::unique_ptr<const MCInstrInfo> MCII;
 
@@ -74,18 +76,26 @@ static int nextByte(ArrayRef<uint8_t> Bytes, uint64_t &Size) {
   return V;
 }
 
-static bool parseLEBImmediate(MCInst &MI, uint64_t &Size,
-                              ArrayRef<uint8_t> Bytes, bool Signed) {
+static bool nextLEB(int64_t &Val, ArrayRef<uint8_t> Bytes, uint64_t &Size,
+                    bool Signed = false) {
   unsigned N = 0;
   const char *Error = nullptr;
-  auto Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N,
-                                    Bytes.data() + Bytes.size(), &Error)
-                    : static_cast<int64_t>(
-                          decodeULEB128(Bytes.data() + Size, &N,
-                                        Bytes.data() + Bytes.size(), &Error));
+  Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N,
+                               Bytes.data() + Bytes.size(), &Error)
+               : static_cast<int64_t>(decodeULEB128(Bytes.data() + Size, &N,
+                                                    Bytes.data() + Bytes.size(),
+                                                    &Error));
   if (Error)
     return false;
   Size += N;
+  return true;
+}
+
+static bool parseLEBImmediate(MCInst &MI, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, bool Signed) {
+  int64_t Val;
+  if (!nextLEB(Val, Bytes, Size, Signed))
+    return false;
   MI.addOperand(MCOperand::createImm(Val));
   return true;
 }
@@ -111,7 +121,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
     raw_ostream & /*OS*/, raw_ostream &CS) const {
   CommentStream = &CS;
   Size = 0;
-  auto Opc = nextByte(Bytes, Size);
+  int Opc = nextByte(Bytes, Size);
   if (Opc < 0)
     return MCDisassembler::Fail;
   const auto *WasmInst = &InstructionTable0[Opc];
@@ -127,10 +137,12 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
     }
     if (!WasmInst)
       return MCDisassembler::Fail;
-    Opc = nextByte(Bytes, Size);
-    if (Opc < 0)
+    int64_t PrefixedOpc;
+    if (!nextLEB(PrefixedOpc, Bytes, Size))
+      return MCDisassembler::Fail;
+    if (PrefixedOpc < 0 || PrefixedOpc >= WebAssemblyInstructionTableSize)
       return MCDisassembler::Fail;
-    WasmInst += Opc;
+    WasmInst += PrefixedOpc;
   }
   if (WasmInst->ET == ET_Unused)
     return MCDisassembler::Fail;
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index e94faa1a21409240048f05b90647801980a48957..eaf9750792bba97c01f93247ab324882d3afd0c8 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -134,11 +134,19 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
     unsigned NumFixedOperands = Desc.NumOperands;
     SmallSet<uint64_t, 8> Printed;
     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
-      if (!(i < NumFixedOperands
-                ? (Desc.OpInfo[i].OperandType ==
-                   WebAssembly::OPERAND_BASIC_BLOCK)
-                : (Desc.TSFlags & WebAssemblyII::VariableOpImmediateIsLabel)))
-        continue;
+      // See if this operand denotes a basic block target.
+      if (i < NumFixedOperands) {
+        // A non-variable_ops operand, check its type.
+        if (Desc.OpInfo[i].OperandType != WebAssembly::OPERAND_BASIC_BLOCK)
+          continue;
+      } else {
+        // A variable_ops operand, which currently can be immediates (used in
+        // br_table) which are basic block targets, or for call instructions
+        // when using -wasm-keep-registers (in which case they are registers,
+        // and should not be processed).
+        if (!MI->getOperand(i).isImm())
+          continue;
+      }
       uint64_t Depth = MI->getOperand(i).getImm();
       if (!Printed.insert(Depth).second)
         continue;
@@ -194,9 +202,6 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                           raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isReg()) {
-    assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
-            MII.get(MI->getOpcode()).TSFlags == 0) &&
-           "WebAssembly variable_ops register ops don't use TSFlags");
     unsigned WAReg = Op.getReg();
     if (int(WAReg) >= 0)
       printRegName(O, WAReg);
@@ -210,23 +215,9 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     if (OpNo < MII.get(MI->getOpcode()).getNumDefs())
       O << '=';
   } else if (Op.isImm()) {
-    const MCInstrDesc &Desc = MII.get(MI->getOpcode());
-    assert((OpNo < Desc.getNumOperands() ||
-            (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate)) &&
-           "WebAssemblyII::VariableOpIsImmediate should be set for "
-           "variable_ops immediate ops");
-    (void)Desc;
-    // TODO: (MII.get(MI->getOpcode()).TSFlags &
-    //        WebAssemblyII::VariableOpImmediateIsLabel)
-    // can tell us whether this is an immediate referencing a label in the
-    // control flow stack, and it may be nice to pretty-print.
     O << Op.getImm();
   } else if (Op.isFPImm()) {
     const MCInstrDesc &Desc = MII.get(MI->getOpcode());
-    assert(OpNo < Desc.getNumOperands() &&
-           "Unexpected floating-point immediate as a non-fixed operand");
-    assert(Desc.TSFlags == 0 &&
-           "WebAssembly variable_ops floating point ops don't use TSFlags");
     const MCOperandInfo &Info = Desc.OpInfo[OpNo];
     if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
       // TODO: MC converts all floating point immediate operands to double.
@@ -237,16 +228,22 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       O << ::toString(APFloat(Op.getFPImm()));
     }
   } else {
-    assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
-            (MII.get(MI->getOpcode()).TSFlags &
-             WebAssemblyII::VariableOpIsImmediate)) &&
-           "WebAssemblyII::VariableOpIsImmediate should be set for "
-           "variable_ops expr ops");
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     Op.getExpr()->print(O, &MAI);
   }
 }
 
+void WebAssemblyInstPrinter::printBrList(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) {
+  O << "{";
+  for (unsigned I = OpNo, E = MI->getNumOperands(); I != E; ++I) {
+    if (I != OpNo)
+      O << ", ";
+    O << MI->getOperand(I).getImm();
+  }
+  O << "}";
+}
+
 void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
                                                             unsigned OpNo,
                                                             raw_ostream &O) {
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index ded64f9a6e9b83a046f330ec112c1a732963a2c1..fd968bc9ea304be46d670023d6365143f5273a64 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -43,6 +43,7 @@ public:
 
   // Used by tblegen code.
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBrList(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo,
                                       raw_ostream &O);
   void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index fa399371f2cba73a8918fc366aae6a26f55c2ab5..065a4dc94ca6a0bf09d0a7458cc19c63625444f2 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -67,7 +67,8 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
     OS << uint8_t(Binary);
   } else {
     assert(Binary <= UINT16_MAX && "Several-byte opcodes not supported yet");
-    OS << uint8_t(Binary >> 8) << uint8_t(Binary);
+    OS << uint8_t(Binary >> 8);
+    encodeULEB128(uint8_t(Binary), OS);
   }
 
   // For br_table instructions, encode the size of the table. In the MCInst,
@@ -85,10 +86,9 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
     const MCOperand &MO = MI.getOperand(i);
     if (MO.isReg()) {
       /* nothing to encode */
+
     } else if (MO.isImm()) {
       if (i < Desc.getNumOperands()) {
-        assert(Desc.TSFlags == 0 &&
-               "WebAssembly non-variable_ops don't use TSFlags");
         const MCOperandInfo &Info = Desc.OpInfo[i];
         LLVM_DEBUG(dbgs() << "Encoding immediate: type="
                           << int(Info.OperandType) << "\n");
@@ -123,15 +123,10 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
           encodeULEB128(uint64_t(MO.getImm()), OS);
         }
       } else {
-        assert(Desc.TSFlags == (WebAssemblyII::VariableOpIsImmediate |
-                                WebAssemblyII::VariableOpImmediateIsLabel));
         encodeULEB128(uint64_t(MO.getImm()), OS);
       }
+
     } else if (MO.isFPImm()) {
-      assert(i < Desc.getNumOperands() &&
-             "Unexpected floating-point immediate as a non-fixed operand");
-      assert(Desc.TSFlags == 0 &&
-             "WebAssembly variable_ops floating point ops don't use TSFlags");
       const MCOperandInfo &Info = Desc.OpInfo[i];
       if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
         // TODO: MC converts all floating point immediate operands to double.
@@ -143,22 +138,27 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
         double d = MO.getFPImm();
         support::endian::write<double>(OS, d, support::little);
       }
+
     } else if (MO.isExpr()) {
       const MCOperandInfo &Info = Desc.OpInfo[i];
       llvm::MCFixupKind FixupKind;
       size_t PaddedSize = 5;
-      if (Info.OperandType == WebAssembly::OPERAND_I32IMM) {
+      switch (Info.OperandType) {
+      case WebAssembly::OPERAND_I32IMM:
         FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i32);
-      } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
+        break;
+      case WebAssembly::OPERAND_I64IMM:
         FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i64);
         PaddedSize = 10;
-      } else if (Info.OperandType == WebAssembly::OPERAND_FUNCTION32 ||
-                 Info.OperandType == WebAssembly::OPERAND_OFFSET32 ||
-                 Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+        break;
+      case WebAssembly::OPERAND_FUNCTION32:
+      case WebAssembly::OPERAND_OFFSET32:
+      case WebAssembly::OPERAND_TYPEINDEX:
+      case WebAssembly::OPERAND_GLOBAL:
+      case WebAssembly::OPERAND_EVENT:
         FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32);
-      } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
-        FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32);
-      } else {
+        break;
+      default:
         llvm_unreachable("unexpected symbolic operand kind");
       }
       Fixups.push_back(MCFixup::create(OS.tell() - Start, MO.getExpr(),
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 24dd5238f71334146c357004d6aa54e6c093ae06..390f367c29785231867e031f95c1173397026694 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -90,6 +90,10 @@ static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
   return new WebAssemblyTargetAsmStreamer(S, OS);
 }
 
+static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) {
+  return new WebAssemblyTargetNullStreamer(S);
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeWebAssemblyTargetMC() {
   for (Target *T :
@@ -120,6 +124,8 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() {
                                                  createObjectTargetStreamer);
     // Register the asm target streamer.
     TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer);
+    // Register the null target streamer.
+    TargetRegistry::RegisterNullTargetStreamer(*T, createNullTargetStreamer);
   }
 }
 
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 730bb8d6c79c25d5bf3cdf3f53ec5db722da1554..4ca20f5936831b3133da63444b713052b0f14981 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -77,18 +77,12 @@ enum OperandType {
   OPERAND_SIGNATURE,
   /// type signature immediate for call_indirect.
   OPERAND_TYPEINDEX,
+  /// Event index.
+  OPERAND_EVENT,
 };
 } // end namespace WebAssembly
 
 namespace WebAssemblyII {
-enum {
-  // For variadic instructions, this flag indicates whether an operand
-  // in the variable_ops range is an immediate value.
-  VariableOpIsImmediate = (1 << 0),
-  // For immediate values in the variable_ops range, this flag indicates
-  // whether the value represents a control-flow label.
-  VariableOpImmediateIsLabel = (1 << 1)
-};
 
 /// Target Operand Flag enum.
 enum TOF {
@@ -97,7 +91,8 @@ enum TOF {
   // Flags to indicate the type of the symbol being referenced
   MO_SYMBOL_FUNCTION = 0x1,
   MO_SYMBOL_GLOBAL = 0x2,
-  MO_SYMBOL_MASK = 0x3,
+  MO_SYMBOL_EVENT = 0x4,
+  MO_SYMBOL_MASK = 0x7,
 };
 } // end namespace WebAssemblyII
 
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 4c4ca4e599c67c53b41731f058ac92ed31d933f5..70ac50272545b9cde60550bc5d3f6168e81e69d0 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -39,74 +39,80 @@ WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
 WebAssemblyTargetWasmStreamer::WebAssemblyTargetWasmStreamer(MCStreamer &S)
     : WebAssemblyTargetStreamer(S) {}
 
-static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
+static void printTypes(formatted_raw_ostream &OS,
+                       ArrayRef<wasm::ValType> Types) {
   bool First = true;
-  for (MVT Type : Types) {
+  for (auto Type : Types) {
     if (First)
       First = false;
     else
       OS << ", ";
-    OS << WebAssembly::TypeToString(WebAssembly::toValType(Type));
+    OS << WebAssembly::TypeToString(Type);
   }
   OS << '\n';
 }
 
-void WebAssemblyTargetAsmStreamer::emitParam(MCSymbol *Symbol,
-                                             ArrayRef<MVT> Types) {
+void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<wasm::ValType> Types) {
   if (!Types.empty()) {
-    OS << "\t.param  \t";
-
-    // FIXME: Currently this applies to the "current" function; it may
-    // be cleaner to specify an explicit symbol as part of the directive.
-
-    PrintTypes(OS, Types);
+    OS << "\t.local  \t";
+    printTypes(OS, Types);
   }
 }
 
-void WebAssemblyTargetAsmStreamer::emitResult(MCSymbol *Symbol,
-                                              ArrayRef<MVT> Types) {
-  if (!Types.empty()) {
-    OS << "\t.result \t";
+void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
 
-    // FIXME: Currently this applies to the "current" function; it may
-    // be cleaner to specify an explicit symbol as part of the directive.
+void WebAssemblyTargetAsmStreamer::emitSignature(
+    const wasm::WasmSignature *Sig) {
+  OS << "(";
+  emitParamList(Sig);
+  OS << ") -> (";
+  emitReturnList(Sig);
+  OS << ")";
+}
 
-    PrintTypes(OS, Types);
+void WebAssemblyTargetAsmStreamer::emitParamList(
+    const wasm::WasmSignature *Sig) {
+  auto &Params = Sig->Params;
+  for (auto &Ty : Params) {
+    if (&Ty != &Params[0])
+      OS << ", ";
+    OS << WebAssembly::TypeToString(Ty);
   }
 }
 
-void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
-  if (!Types.empty()) {
-    OS << "\t.local  \t";
-    PrintTypes(OS, Types);
+void WebAssemblyTargetAsmStreamer::emitReturnList(
+    const wasm::WasmSignature *Sig) {
+  auto &Returns = Sig->Returns;
+  for (auto &Ty : Returns) {
+    if (&Ty != &Returns[0])
+      OS << ", ";
+    OS << WebAssembly::TypeToString(Ty);
   }
 }
 
-void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
-
-void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
-    MCSymbolWasm *Symbol) {
-  OS << "\t.functype\t" << Symbol->getName();
-  if (Symbol->getSignature()->Returns.empty())
-    OS << ", void";
-  else {
-    assert(Symbol->getSignature()->Returns.size() == 1);
-    OS << ", "
-       << WebAssembly::TypeToString(Symbol->getSignature()->Returns.front());
-  }
-  for (auto Ty : Symbol->getSignature()->Params)
-    OS << ", " << WebAssembly::TypeToString(Ty);
-  OS << '\n';
+void WebAssemblyTargetAsmStreamer::emitFunctionType(const MCSymbolWasm *Sym) {
+  assert(Sym->isFunction());
+  OS << "\t.functype\t" << Sym->getName() << " ";
+  emitSignature(Sym->getSignature());
+  OS << "\n";
 }
 
-void WebAssemblyTargetAsmStreamer::emitGlobalType(MCSymbolWasm *Sym) {
+void WebAssemblyTargetAsmStreamer::emitGlobalType(const MCSymbolWasm *Sym) {
+  assert(Sym->isGlobal());
   OS << "\t.globaltype\t" << Sym->getName() << ", " <<
         WebAssembly::TypeToString(
           static_cast<wasm::ValType>(Sym->getGlobalType().Type)) <<
         '\n';
 }
 
-void WebAssemblyTargetAsmStreamer::emitImportModule(MCSymbolWasm *Sym,
+void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) {
+  assert(Sym->isEvent());
+  OS << "\t.eventtype\t" << Sym->getName() << " ";
+  emitParamList(Sym->getSignature());
+  OS << "\n";
+}
+
+void WebAssemblyTargetAsmStreamer::emitImportModule(const MCSymbolWasm *Sym,
                                                     StringRef ModuleName) {
   OS << "\t.import_module\t" << Sym->getName() << ", " << ModuleName << '\n';
 }
@@ -115,19 +121,9 @@ void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
   OS << "\t.indidx  \t" << *Value << '\n';
 }
 
-void WebAssemblyTargetWasmStreamer::emitParam(MCSymbol *Symbol,
-                                              ArrayRef<MVT> Types) {
-  // The Symbol already has its signature
-}
-
-void WebAssemblyTargetWasmStreamer::emitResult(MCSymbol *Symbol,
-                                               ArrayRef<MVT> Types) {
-  // The Symbol already has its signature
-}
-
-void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<MVT> Types) {
-  SmallVector<std::pair<MVT, uint32_t>, 4> Grouped;
-  for (MVT Type : Types) {
+void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<wasm::ValType> Types) {
+  SmallVector<std::pair<wasm::ValType, uint32_t>, 4> Grouped;
+  for (auto Type : Types) {
     if (Grouped.empty() || Grouped.back().first != Type)
       Grouped.push_back(std::make_pair(Type, 1));
     else
@@ -137,7 +133,7 @@ void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<MVT> Types) {
   Streamer.EmitULEB128IntValue(Grouped.size());
   for (auto Pair : Grouped) {
     Streamer.EmitULEB128IntValue(Pair.second);
-    emitValueType(WebAssembly::toValType(Pair.first));
+    emitValueType(Pair.first);
   }
 }
 
@@ -148,18 +144,3 @@ void WebAssemblyTargetWasmStreamer::emitEndFunc() {
 void WebAssemblyTargetWasmStreamer::emitIndIdx(const MCExpr *Value) {
   llvm_unreachable(".indidx encoding not yet implemented");
 }
-
-void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
-    MCSymbolWasm *Symbol) {
-  // Symbol already has its arguments and result set.
-  Symbol->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
-}
-
-void WebAssemblyTargetWasmStreamer::emitGlobalType(MCSymbolWasm *Sym) {
-  // Not needed.
-}
-
-void WebAssemblyTargetWasmStreamer::emitImportModule(MCSymbolWasm *Sym,
-                                                     StringRef ModuleName) {
-  Sym->setModuleName(ModuleName);
-}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index e60158b5defb90369d2113a4a29d2a58a908563b..3073938118b458be1e3e0c9e9ecffd6b91ed47ce 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -31,22 +31,21 @@ class WebAssemblyTargetStreamer : public MCTargetStreamer {
 public:
   explicit WebAssemblyTargetStreamer(MCStreamer &S);
 
-  /// .param
-  virtual void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
-  /// .result
-  virtual void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
   /// .local
-  virtual void emitLocal(ArrayRef<MVT> Types) = 0;
+  virtual void emitLocal(ArrayRef<wasm::ValType> Types) = 0;
   /// .endfunc
   virtual void emitEndFunc() = 0;
   /// .functype
-  virtual void emitIndirectFunctionType(MCSymbolWasm *Symbol) = 0;
+  virtual void emitFunctionType(const MCSymbolWasm *Sym) = 0;
   /// .indidx
   virtual void emitIndIdx(const MCExpr *Value) = 0;
   /// .globaltype
-  virtual void emitGlobalType(MCSymbolWasm *Sym) = 0;
+  virtual void emitGlobalType(const MCSymbolWasm *Sym) = 0;
+  /// .eventtype
+  virtual void emitEventType(const MCSymbolWasm *Sym) = 0;
   /// .import_module
-  virtual void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) = 0;
+  virtual void emitImportModule(const MCSymbolWasm *Sym,
+                                StringRef ModuleName) = 0;
 
 protected:
   void emitValueType(wasm::ValType Type);
@@ -55,18 +54,20 @@ protected:
 /// This part is for ascii assembly output
 class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer {
   formatted_raw_ostream &OS;
+  void emitSignature(const wasm::WasmSignature *Sig);
+  void emitParamList(const wasm::WasmSignature *Sig);
+  void emitReturnList(const wasm::WasmSignature *Sig);
 
 public:
   WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
 
-  void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
-  void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
-  void emitLocal(ArrayRef<MVT> Types) override;
+  void emitLocal(ArrayRef<wasm::ValType> Types) override;
   void emitEndFunc() override;
-  void emitIndirectFunctionType(MCSymbolWasm *Symbol) override;
+  void emitFunctionType(const MCSymbolWasm *Sym) override;
   void emitIndIdx(const MCExpr *Value) override;
-  void emitGlobalType(MCSymbolWasm *Sym) override;
-  void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
+  void emitGlobalType(const MCSymbolWasm *Sym) override;
+  void emitEventType(const MCSymbolWasm *Sym) override;
+  void emitImportModule(const MCSymbolWasm *Sym, StringRef ModuleName) override;
 };
 
 /// This part is for Wasm object output
@@ -74,14 +75,29 @@ class WebAssemblyTargetWasmStreamer final : public WebAssemblyTargetStreamer {
 public:
   explicit WebAssemblyTargetWasmStreamer(MCStreamer &S);
 
-  void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
-  void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
-  void emitLocal(ArrayRef<MVT> Types) override;
+  void emitLocal(ArrayRef<wasm::ValType> Types) override;
   void emitEndFunc() override;
-  void emitIndirectFunctionType(MCSymbolWasm *Symbol) override;
+  void emitFunctionType(const MCSymbolWasm *Sym) override {}
   void emitIndIdx(const MCExpr *Value) override;
-  void emitGlobalType(MCSymbolWasm *Sym) override;
-  void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
+  void emitGlobalType(const MCSymbolWasm *Sym) override {}
+  void emitEventType(const MCSymbolWasm *Sym) override {}
+  void emitImportModule(const MCSymbolWasm *Sym,
+                        StringRef ModuleName) override {}
+};
+
+/// This part is for null output
+class WebAssemblyTargetNullStreamer final : public WebAssemblyTargetStreamer {
+public:
+  explicit WebAssemblyTargetNullStreamer(MCStreamer &S)
+      : WebAssemblyTargetStreamer(S) {}
+
+  void emitLocal(ArrayRef<wasm::ValType>) override {}
+  void emitEndFunc() override {}
+  void emitFunctionType(const MCSymbolWasm *) override {}
+  void emitIndIdx(const MCExpr *) override {}
+  void emitGlobalType(const MCSymbolWasm *) override {}
+  void emitEventType(const MCSymbolWasm *) override {}
+  void emitImportModule(const MCSymbolWasm *, StringRef) override {}
 };
 
 } // end namespace llvm
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index d9ed146ede7d423504aec4148d7baf2c8a830959..763e30be8e02002f241450e1ccb8e967a37d9cb1 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -86,6 +86,11 @@ static bool IsGlobalType(const MCValue &Target) {
   return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_GLOBAL;
 }
 
+static bool IsEventType(const MCValue &Target) {
+  const MCSymbolRefExpr *RefA = Target.getSymA();
+  return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_EVENT;
+}
+
 unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
                                                    const MCFixup &Fixup) const {
   // WebAssembly functions are not allocated in the data address space. To
@@ -106,6 +111,8 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
       return wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB;
     if (IsFunction)
       return wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB;
+    if (IsEventType(Target))
+      return wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB;
     return wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB;
   case FK_Data_4:
     if (IsFunction)
diff --git a/lib/Target/WebAssembly/WebAssembly.td b/lib/Target/WebAssembly/WebAssembly.td
index 2f301da8e4223964f353cdcf08529c01ea0f22dd..ec9dbffde7f2f6f4ce0b08c900c791a15ec6156f 100644
--- a/lib/Target/WebAssembly/WebAssembly.td
+++ b/lib/Target/WebAssembly/WebAssembly.td
@@ -71,7 +71,8 @@ def : ProcessorModel<"generic", NoSchedModel, []>;
 
 // Latest and greatest experimental version of WebAssembly. Bugs included!
 def : ProcessorModel<"bleeding-edge", NoSchedModel,
-                      [FeatureSIMD128, FeatureAtomics]>;
+                      [FeatureSIMD128, FeatureAtomics,
+                       FeatureNontrappingFPToInt, FeatureSignExt]>;
 
 //===----------------------------------------------------------------------===//
 // Target Declaration
diff --git a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
index a4d276c7936525cfa258130bf0f08857b11d53e1..3f17160964b6e537f96c664ede240be6802940d4 100644
--- a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
@@ -90,16 +90,23 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
     Function *NewF = nullptr;
     for (Use &U : F.uses()) {
       LLVM_DEBUG(dbgs() << "prototype-less use: " << F.getName() << "\n");
-      if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) {
-        FunctionType *DestType =
-            cast<FunctionType>(BC->getDestTy()->getPointerElementType());
-
-        // Create a new function with the correct type
-        NewType = DestType;
-        NewF = Function::Create(NewType, F.getLinkage(), F.getName());
-        NewF->setAttributes(F.getAttributes());
-        NewF->removeFnAttr("no-prototype");
-        break;
+      if (auto *BC = dyn_cast<BitCastOperator>(U.getUser())) {
+        if (auto *DestType = dyn_cast<FunctionType>(
+                BC->getDestTy()->getPointerElementType())) {
+          if (!NewType) {
+            // Create a new function with the correct type
+            NewType = DestType;
+            NewF = Function::Create(NewType, F.getLinkage(), F.getName());
+            NewF->setAttributes(F.getAttributes());
+            NewF->removeFnAttr("no-prototype");
+          } else {
+            if (NewType != DestType) {
+              report_fatal_error("Prototypeless function used with "
+                                 "conflicting signatures: " +
+                                 F.getName());
+            }
+          }
+        }
       }
     }
 
@@ -110,28 +117,38 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
       continue;
     }
 
-    for (Use &U : F.uses()) {
-      if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) {
-        FunctionType *DestType =
-            cast<FunctionType>(BC->getDestTy()->getPointerElementType());
-        if (NewType != DestType) {
-          report_fatal_error(
-              "Prototypeless function used with conflicting signatures: " +
-              F.getName());
-        }
-        BC->replaceAllUsesWith(NewF);
-        Replacements.emplace_back(&F, NewF);
-      } else {
-        dbgs() << *U.getUser()->getType() << "\n";
+    SmallVector<Instruction *, 4> DeadInsts;
+
+    for (Use &US : F.uses()) {
+      User *U = US.getUser();
+      if (auto *BC = dyn_cast<BitCastOperator>(U)) {
+        if (auto *Inst = dyn_cast<BitCastInst>(U)) {
+          // Replace with a new bitcast
+          IRBuilder<> Builder(Inst);
+          Value *NewCast = Builder.CreatePointerCast(NewF, BC->getDestTy());
+          Inst->replaceAllUsesWith(NewCast);
+          DeadInsts.push_back(Inst);
+        } else if (auto *Const = dyn_cast<ConstantExpr>(U)) {
+          Constant *NewConst =
+              ConstantExpr::getPointerCast(NewF, BC->getDestTy());
+          Const->replaceAllUsesWith(NewConst);
+        } else {
+          dbgs() << *U->getType() << "\n";
 #ifndef NDEBUG
-        U.getUser()->dump();
+          U->dump();
 #endif
-        report_fatal_error(
-            "unexpected use of prototypeless function: " + F.getName() + "\n");
+          report_fatal_error("unexpected use of prototypeless function: " +
+                             F.getName() + "\n");
+        }
       }
     }
+
+    for (auto I : DeadInsts)
+      I->eraseFromParent();
+    Replacements.emplace_back(&F, NewF);
   }
 
+
   // Finally replace the old function declarations with the new ones
   for (auto &Pair : Replacements) {
     Function *Old = Pair.first;
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 1e21ab92b62997ab3488284e5eedc07fa00f2968..c4f03dfa7f9e451ed81859023d2c64cde7bb15f7 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -79,11 +79,12 @@ WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
 
 void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
   for (auto &It : OutContext.getSymbols()) {
-    // Emit a .globaltype declaration.
+    // Emit a .globaltype and .eventtype declaration.
     auto Sym = cast<MCSymbolWasm>(It.getValue());
-    if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL) {
+    if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL)
       getTargetStreamer()->emitGlobalType(Sym);
-    }
+    else if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_EVENT)
+      getTargetStreamer()->emitEventType(Sym);
   }
 
   for (const auto &F : M) {
@@ -93,6 +94,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
       SmallVector<MVT, 4> Params;
       ComputeSignatureVTs(F.getFunctionType(), F, TM, Params, Results);
       auto *Sym = cast<MCSymbolWasm>(getSymbol(&F));
+      Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
       if (!Sym->getSignature()) {
         auto Signature = SignatureFromMVTs(Results, Params);
         Sym->setSignature(Signature.get());
@@ -103,12 +105,13 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
       // infer the type from a call). With object files it applies to all
       // imports. so fix the names and the tests, or rethink how import
       // delcarations work in asm files.
-      getTargetStreamer()->emitIndirectFunctionType(Sym);
+      getTargetStreamer()->emitFunctionType(Sym);
 
       if (TM.getTargetTriple().isOSBinFormatWasm() &&
           F.hasFnAttribute("wasm-import-module")) {
         StringRef Name =
             F.getFnAttribute("wasm-import-module").getValueAsString();
+        Sym->setModuleName(Name);
         getTargetStreamer()->emitImportModule(Sym, Name);
       }
     }
@@ -163,9 +166,10 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
   auto *WasmSym = cast<MCSymbolWasm>(CurrentFnSym);
   WasmSym->setSignature(Signature.get());
   addSignature(std::move(Signature));
+  WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
 
   // FIXME: clean up how params and results are emitted (use signatures)
-  getTargetStreamer()->emitParam(CurrentFnSym, ParamVTs);
+  getTargetStreamer()->emitFunctionType(WasmSym);
 
   // Emit the function index.
   if (MDNode *Idx = F.getMetadata("wasm.index")) {
@@ -175,8 +179,9 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
         cast<ConstantAsMetadata>(Idx->getOperand(0))->getValue()));
   }
 
-  getTargetStreamer()->emitResult(CurrentFnSym, ResultVTs);
-  getTargetStreamer()->emitLocal(MFI->getLocals());
+  SmallVector<wasm::ValType, 16> Locals;
+  ValTypesFromMVTs(MFI->getLocals(), Locals);
+  getTargetStreamer()->emitLocal(Locals);
 
   AsmPrinter::EmitFunctionBodyStart();
 }
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index b1955017c687d5c56900be37994016813b767d39..f8f5f4040c86a3e9427c9ff052e672049fadae07 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -78,12 +78,11 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
   // <LOOP|TRY marker, Loop/exception bottom BB> map
   DenseMap<const MachineInstr *, MachineBasicBlock *> BeginToBottom;
 
-  // Helper functions to register / unregister scope information created by
-  // marker instructions.
+  // Helper functions to register scope information created by marker
+  // instructions.
   void registerScope(MachineInstr *Begin, MachineInstr *End);
   void registerTryScope(MachineInstr *Begin, MachineInstr *End,
                         MachineBasicBlock *EHPad);
-  void unregisterScope(MachineInstr *Begin);
 
   MachineBasicBlock *getBottom(const MachineInstr *Begin);
 
@@ -182,23 +181,6 @@ void WebAssemblyCFGStackify::registerTryScope(MachineInstr *Begin,
   EHPadToTry[EHPad] = Begin;
 }
 
-void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) {
-  assert(BeginToEnd.count(Begin));
-  MachineInstr *End = BeginToEnd[Begin];
-  assert(EndToBegin.count(End));
-  BeginToEnd.erase(Begin);
-  EndToBegin.erase(End);
-  MachineBasicBlock *EHPad = TryToEHPad.lookup(Begin);
-  if (EHPad) {
-    assert(EHPadToTry.count(EHPad));
-    TryToEHPad.erase(Begin);
-    EHPadToTry.erase(EHPad);
-  }
-  MachineBasicBlock *Bottom = BeginToBottom.lookup(Begin);
-  if (Bottom)
-    BeginToBottom.erase(Begin);
-}
-
 // Given a LOOP/TRY marker, returns its bottom BB. Use cached information if any
 // to prevent recomputation.
 MachineBasicBlock *
diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index 444a087605ef6fa92c65d30fea03fa19450e995b..e987d7f7f43a12946c8c4a3b4400df45de9a03e4 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -25,5 +25,6 @@ HANDLE_NODETYPE(SHUFFLE)
 HANDLE_NODETYPE(VEC_SHL)
 HANDLE_NODETYPE(VEC_SHR_S)
 HANDLE_NODETYPE(VEC_SHR_U)
+HANDLE_NODETYPE(THROW)
 
 // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 578d23570f85c03073cba1bec336c1930fc2a4d9..a0382157f76918dfe6ed4cf59d997b9b8a889e88 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/Function.h"
@@ -122,14 +123,22 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
       for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
         setOperationAction(Op, T, Legal);
 
-  for (auto T : {MVT::i32, MVT::i64}) {
-    // Expand unavailable integer operations.
-    for (auto Op :
-         {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU,
-          ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, ISD::SRA_PARTS,
-          ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, ISD::SUBE}) {
+  // Expand unavailable integer operations.
+  for (auto Op :
+       {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU,
+        ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, ISD::SRA_PARTS,
+        ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, ISD::SUBE}) {
+    for (auto T : {MVT::i32, MVT::i64}) {
       setOperationAction(Op, T, Expand);
     }
+    if (Subtarget->hasSIMD128()) {
+      for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
+        setOperationAction(Op, T, Expand);
+      }
+      if (EnableUnimplementedWasmSIMDInstrs) {
+        setOperationAction(Op, MVT::v2i64, Expand);
+      }
+    }
   }
 
   // There is no i64x2.mul instruction
@@ -156,14 +165,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
         setOperationAction(Op, MVT::v2i64, Custom);
   }
 
-  // There is no select instruction for vectors
-  if (Subtarget->hasSIMD128()) {
-    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
-      setOperationAction(ISD::VSELECT, T, Expand);
-    if (EnableUnimplementedWasmSIMDInstrs)
-      for (auto T : {MVT::v2i64, MVT::v2f64})
-        setOperationAction(ISD::VSELECT, T, Expand);
-  }
+  // There are no select instructions for vectors
+  if (Subtarget->hasSIMD128())
+    for (auto Op : {ISD::VSELECT, ISD::SELECT_CC, ISD::SELECT}) {
+      for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+        setOperationAction(Op, T, Expand);
+      if (EnableUnimplementedWasmSIMDInstrs)
+        for (auto T : {MVT::v2i64, MVT::v2f64})
+          setOperationAction(Op, T, Expand);
+    }
 
   // As a special case, these operators use the type to mean the type to
   // sign-extend from.
@@ -233,6 +243,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
 
   // Exception handling intrinsics
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   setMaxAtomicSizeInBitsSupported(64);
 }
@@ -881,6 +892,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
   case ISD::EXTRACT_VECTOR_ELT:
   case ISD::INSERT_VECTOR_ELT:
     return LowerAccessVectorElement(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+    return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::SHL:
@@ -1044,6 +1057,44 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
 }
 
+SDValue
+WebAssemblyTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  SDLoc DL(Op);
+
+  switch (IntNo) {
+  default:
+    return {}; // Don't custom lower most intrinsics.
+
+  case Intrinsic::wasm_throw: {
+    int Tag = cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+    switch (Tag) {
+    case CPP_EXCEPTION: {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+      const char *SymName = MF.createExternalSymbolName("__cpp_exception");
+      SDValue SymNode =
+          DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT,
+                      DAG.getTargetExternalSymbol(
+                          SymName, PtrVT, WebAssemblyII::MO_SYMBOL_EVENT));
+      return DAG.getNode(WebAssemblyISD::THROW, DL,
+                         MVT::Other, // outchain type
+                         {
+                             Op.getOperand(0), // inchain
+                             SymNode,          // exception symbol
+                             Op.getOperand(3)  // thrown value
+                         });
+    }
+    default:
+      llvm_unreachable("Invalid tag!");
+    }
+    break;
+  }
+  }
+}
+
 SDValue
 WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                SelectionDAG &DAG) const {
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 5182a58efc784839ed0216a613b7e44b8d947e29..80076818de7335293226909c1a7047b6a2cb2805 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -98,6 +98,7 @@ private:
   SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 0af94ef875518d0484e6e2671b09d9dfb53a269b..a7fc7564009007474825f53e500007dc8bf5deb7 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -33,11 +33,17 @@ def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
 def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
           (BR_UNLESS bb_op:$dst, I32:$cond)>;
 
+// A list of branch targets enclosed in {} and separated by comma.
+// Used by br_table only.
+def BrListAsmOperand : AsmOperandClass { let Name = "BrList"; }
+def brlist : Operand<i32> {
+  let ParserMatchClass = BrListAsmOperand;
+  let PrintMethod = "printBrList";
+}
+
 // TODO: SelectionDAG's lowering insists on using a pointer as the index for
 // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
 // currently.
-// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates.
-// Set TSFlags{1} to 1 to indicate that the immediates represent labels.
 // FIXME: this can't inherit from I<> since there is no way to inherit from a
 // multiclass and still have the let statements.
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
@@ -45,29 +51,19 @@ let isCodeGenOnly = 1 in
 def BR_TABLE_I32 : NI<(outs), (ins I32:$index, variable_ops),
                       [(WebAssemblybr_table I32:$index)], "false",
                       "br_table \t$index", 0x0e> {
-  let TSFlags{0} = 1;
-  let TSFlags{1} = 1;
 }
 let BaseName = "BR_TABLE_I32" in
-def BR_TABLE_I32_S : NI<(outs), (ins variable_ops),
-                        [], "true",
-                        "br_table \t", 0x0e> {
-  let TSFlags{0} = 1;
-  let TSFlags{1} = 1;
+def BR_TABLE_I32_S : NI<(outs), (ins brlist:$brl), [], "true",
+                        "br_table \t$brl", 0x0e> {
 }
 let isCodeGenOnly = 1 in
 def BR_TABLE_I64 : NI<(outs), (ins I64:$index, variable_ops),
                       [(WebAssemblybr_table I64:$index)], "false",
                       "br_table \t$index"> {
-  let TSFlags{0} = 1;
-  let TSFlags{1} = 1;
 }
 let BaseName = "BR_TABLE_I64" in
-def BR_TABLE_I64_S : NI<(outs), (ins variable_ops),
-                        [], "true",
-                        "br_table \t"> {
-  let TSFlags{0} = 1;
-  let TSFlags{1} = 1;
+def BR_TABLE_I64_S : NI<(outs), (ins brlist:$brl), [], "true",
+                        "br_table \t$brl"> {
 }
 } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
@@ -146,14 +142,16 @@ let Predicates = [HasExceptionHandling] in {
 
 // Throwing an exception: throw / rethrow
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-defm THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$val),
-                   (outs), (ins i32imm:$tag),
-                   [(int_wasm_throw imm:$tag, I32:$val)],
+defm THROW_I32 : I<(outs), (ins event_op:$tag, I32:$val),
+                   (outs), (ins event_op:$tag),
+                   [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag),
+                                      I32:$val)],
                    "throw   \t$tag, $val", "throw   \t$tag",
                    0x08>;
-defm THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$val),
-                   (outs), (ins i32imm:$tag),
-                   [(int_wasm_throw imm:$tag, I64:$val)],
+defm THROW_I64 : I<(outs), (ins event_op:$tag, I64:$val),
+                   (outs), (ins event_op:$tag),
+                   [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag),
+                                      I64:$val)],
                    "throw   \t$tag, $val", "throw   \t$tag",
                    0x08>;
 defm RETHROW : NRI<(outs), (ins bb_op:$dst), [], "rethrow \t$dst", 0x09>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 8fff924265ff3133c99054732d305c45ee137460..085c5a77e68af79de223bdd232c0341b3608b0e6 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -64,6 +64,7 @@ def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
 def SDT_WebAssemblyReturn   : SDTypeProfile<0, -1, []>;
 def SDT_WebAssemblyWrapper  : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                    SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyThrow    : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
 
 //===----------------------------------------------------------------------===//
 // WebAssembly-specific DAG Nodes.
@@ -90,6 +91,8 @@ def WebAssemblyreturn   : SDNode<"WebAssemblyISD::RETURN",
                                  SDT_WebAssemblyReturn, [SDNPHasChain]>;
 def WebAssemblywrapper  : SDNode<"WebAssemblyISD::Wrapper",
                                  SDT_WebAssemblyWrapper>;
+def WebAssemblythrow : SDNode<"WebAssemblyISD::THROW", SDT_WebAssemblyThrow,
+                              [SDNPHasChain]>;
 
 //===----------------------------------------------------------------------===//
 // WebAssembly-specific Operands.
@@ -140,6 +143,10 @@ let OperandType = "OPERAND_P2ALIGN" in {
 def P2Align : Operand<i32> {
   let PrintMethod = "printWebAssemblyP2AlignOperand";
 }
+
+let OperandType = "OPERAND_EVENT" in
+def event_op : Operand<i32>;
+
 } // OperandType = "OPERAND_P2ALIGN"
 
 let OperandType = "OPERAND_SIGNATURE" in {
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index caad638e9e387107485f5e7baa249da8c58127ee..7ac2d15c295188fbbbe28265d4a0273442a7023f 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -36,6 +36,58 @@ def ImmI#SIZE : ImmLeaf<i32,
 foreach SIZE = [2, 4, 8, 16, 32] in
 def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">;
 
+//===----------------------------------------------------------------------===//
+// Load and store
+//===----------------------------------------------------------------------===//
+
+// Load: v128.load
+multiclass SIMDLoad<ValueType vec_t> {
+  let mayLoad = 1 in
+  defm LOAD_#vec_t :
+    SIMD_I<(outs V128:$dst), (ins P2Align:$align, offset32_op:$off, I32:$addr),
+           (outs), (ins P2Align:$align, offset32_op:$off), [],
+           "v128.load\t$dst, ${off}(${addr})$align",
+           "v128.load\t$off$align", 0>;
+}
+
+foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
+defm "" : SIMDLoad<vec_t>;
+
+// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
+def : LoadPatNoOffset<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
+def : LoadPatImmOff<vec_t, load, regPlusImm, !cast<NI>("LOAD_"#vec_t)>;
+def : LoadPatImmOff<vec_t, load, or_is_add, !cast<NI>("LOAD_"#vec_t)>;
+def : LoadPatGlobalAddr<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
+def : LoadPatExternalSym<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
+def : LoadPatOffsetOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
+def : LoadPatGlobalAddrOffOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
+def : LoadPatExternSymOffOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
+}
+
+// Store: v128.store
+multiclass SIMDStore<ValueType vec_t> {
+  let mayStore = 1 in
+  defm STORE_#vec_t :
+    SIMD_I<(outs), (ins P2Align:$align, offset32_op:$off, I32:$addr, V128:$vec),
+           (outs), (ins P2Align:$align, offset32_op:$off), [],
+           "v128.store\t${off}(${addr})$align, $vec",
+           "v128.store\t$off$align", 1>;
+}
+
+foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
+defm "" : SIMDStore<vec_t>;
+
+// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
+def : StorePatNoOffset<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
+def : StorePatImmOff<vec_t, store, regPlusImm, !cast<NI>("STORE_"#vec_t)>;
+def : StorePatImmOff<vec_t, store, or_is_add, !cast<NI>("STORE_"#vec_t)>;
+def : StorePatGlobalAddr<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
+def : StorePatExternalSym<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
+def : StorePatOffsetOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
+def : StorePatGlobalAddrOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
+def : StorePatExternSymOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
+}
+
 //===----------------------------------------------------------------------===//
 // Constructing SIMD values
 //===----------------------------------------------------------------------===//
@@ -46,7 +98,7 @@ multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
   defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops,
                                   [(set V128:$dst, (vec_t pat))],
                                   "v128.const\t$dst, "#args,
-                                  "v128.const\t"#args, 0>;
+                                  "v128.const\t"#args, 2>;
 }
 
 defm "" : ConstVec<v16i8,
@@ -94,6 +146,61 @@ defm "" : ConstVec<v2f64,
                   (build_vector (f64 fpimm:$i0), (f64 fpimm:$i1)),
                   "$i0, $i1">;
 
+// Shuffle lanes: shuffle
+defm SHUFFLE :
+  SIMD_I<(outs V128:$dst),
+         (ins V128:$x, V128:$y,
+           vec_i8imm_op:$m0, vec_i8imm_op:$m1,
+           vec_i8imm_op:$m2, vec_i8imm_op:$m3,
+           vec_i8imm_op:$m4, vec_i8imm_op:$m5,
+           vec_i8imm_op:$m6, vec_i8imm_op:$m7,
+           vec_i8imm_op:$m8, vec_i8imm_op:$m9,
+           vec_i8imm_op:$mA, vec_i8imm_op:$mB,
+           vec_i8imm_op:$mC, vec_i8imm_op:$mD,
+           vec_i8imm_op:$mE, vec_i8imm_op:$mF),
+         (outs),
+         (ins
+           vec_i8imm_op:$m0, vec_i8imm_op:$m1,
+           vec_i8imm_op:$m2, vec_i8imm_op:$m3,
+           vec_i8imm_op:$m4, vec_i8imm_op:$m5,
+           vec_i8imm_op:$m6, vec_i8imm_op:$m7,
+           vec_i8imm_op:$m8, vec_i8imm_op:$m9,
+           vec_i8imm_op:$mA, vec_i8imm_op:$mB,
+           vec_i8imm_op:$mC, vec_i8imm_op:$mD,
+           vec_i8imm_op:$mE, vec_i8imm_op:$mF),
+         [],
+         "v8x16.shuffle\t$dst, $x, $y, "#
+           "$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "#
+           "$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF",
+         "v8x16.shuffle\t"#
+           "$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "#
+           "$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF",
+         3>;
+
+// Shuffles after custom lowering
+def wasm_shuffle_t : SDTypeProfile<1, 18, []>;
+def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>;
+foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
+def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
+            (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
+            (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
+            (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
+            (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7),
+            (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9),
+            (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB),
+            (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD),
+            (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF))),
+          (vec_t (SHUFFLE (vec_t V128:$x), (vec_t V128:$y),
+            (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
+            (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
+            (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
+            (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7),
+            (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9),
+            (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB),
+            (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD),
+            (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF)))>;
+}
+
 // Create vector with identical lanes: splat
 def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>;
 def splat4 : PatFrag<(ops node:$x), (build_vector
@@ -116,12 +223,12 @@ multiclass Splat<ValueType vec_t, string vec, WebAssemblyRegClass reg_t,
                              vec#".splat\t$dst, $x", vec#".splat", simdop>;
 }
 
-defm "" : Splat<v16i8, "i8x16", I32, splat16, 3>;
-defm "" : Splat<v8i16, "i16x8", I32, splat8, 4>;
-defm "" : Splat<v4i32, "i32x4", I32, splat4, 5>;
-defm "" : Splat<v2i64, "i64x2", I64, splat2, 6>;
-defm "" : Splat<v4f32, "f32x4", F32, splat4, 7>;
-defm "" : Splat<v2f64, "f64x2", F64, splat2, 8>;
+defm "" : Splat<v16i8, "i8x16", I32, splat16, 4>;
+defm "" : Splat<v8i16, "i16x8", I32, splat8, 8>;
+defm "" : Splat<v4i32, "i32x4", I32, splat4, 12>;
+defm "" : Splat<v2i64, "i64x2", I64, splat2, 15>;
+defm "" : Splat<v4f32, "f32x4", F32, splat4, 18>;
+defm "" : Splat<v2f64, "f64x2", F64, splat2, 21>;
 
 //===----------------------------------------------------------------------===//
 // Accessing lanes
@@ -164,16 +271,16 @@ defm extract_i16x8 : ExtractPat<i16, 0xffff>;
 multiclass ExtractLaneExtended<string sign, bits<32> baseInst> {
   defm "" : ExtractLane<v16i8, "i8x16", LaneIdx16, I32, baseInst, sign,
                         !cast<PatFrag>("extract_i8x16"#sign)>;
-  defm "" : ExtractLane<v8i16, "i16x8", LaneIdx8, I32, !add(baseInst, 2), sign,
+  defm "" : ExtractLane<v8i16, "i16x8", LaneIdx8, I32, !add(baseInst, 4), sign,
                         !cast<PatFrag>("extract_i16x8"#sign)>;
 }
 
-defm "" : ExtractLaneExtended<"_s", 9>;
-defm "" : ExtractLaneExtended<"_u", 10>;
+defm "" : ExtractLaneExtended<"_s", 5>;
+defm "" : ExtractLaneExtended<"_u", 6>;
 defm "" : ExtractLane<v4i32, "i32x4", LaneIdx4, I32, 13>;
-defm "" : ExtractLane<v2i64, "i64x2", LaneIdx2, I64, 14>;
-defm "" : ExtractLane<v4f32, "f32x4", LaneIdx4, F32, 15>;
-defm "" : ExtractLane<v2f64, "f64x2", LaneIdx2, F64, 16>;
+defm "" : ExtractLane<v2i64, "i64x2", LaneIdx2, I64, 16>;
+defm "" : ExtractLane<v4f32, "f32x4", LaneIdx4, F32, 19>;
+defm "" : ExtractLane<v2f64, "f64x2", LaneIdx2, F64, 22>;
 
 // Follow convention of making implicit expansions unsigned
 def : Pat<(i32 (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx))),
@@ -216,12 +323,12 @@ multiclass ReplaceLane<ValueType vec_t, string vec, ImmLeaf imm_t,
              vec#".replace_lane\t$idx", simdop>;
 }
 
-defm "" : ReplaceLane<v16i8, "i8x16", LaneIdx16, I32, i32, 17>;
-defm "" : ReplaceLane<v8i16, "i16x8", LaneIdx8, I32, i32, 18>;
-defm "" : ReplaceLane<v4i32, "i32x4", LaneIdx4, I32, i32, 19>;
-defm "" : ReplaceLane<v2i64, "i64x2", LaneIdx2, I64, i64, 20>;
-defm "" : ReplaceLane<v4f32, "f32x4", LaneIdx4, F32, f32, 21>;
-defm "" : ReplaceLane<v2f64, "f64x2", LaneIdx2, F64, f64, 22>;
+defm "" : ReplaceLane<v16i8, "i8x16", LaneIdx16, I32, i32, 7>;
+defm "" : ReplaceLane<v8i16, "i16x8", LaneIdx8, I32, i32, 11>;
+defm "" : ReplaceLane<v4i32, "i32x4", LaneIdx4, I32, i32, 14>;
+defm "" : ReplaceLane<v2i64, "i64x2", LaneIdx2, I64, i64, 17>;
+defm "" : ReplaceLane<v4f32, "f32x4", LaneIdx4, F32, f32, 20>;
+defm "" : ReplaceLane<v2f64, "f64x2", LaneIdx2, F64, f64, 23>;
 
 // Lower undef lane indices to zero
 def : Pat<(vector_insert (v16i8 V128:$vec), I32:$x, undef),
@@ -349,63 +456,80 @@ def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))),
           (v2f64 (REPLACE_LANE_v2f64
             (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>;
 
-// Shuffle lanes: shuffle
-defm SHUFFLE :
-  SIMD_I<(outs V128:$dst),
-         (ins V128:$x, V128:$y,
-           vec_i8imm_op:$m0, vec_i8imm_op:$m1,
-           vec_i8imm_op:$m2, vec_i8imm_op:$m3,
-           vec_i8imm_op:$m4, vec_i8imm_op:$m5,
-           vec_i8imm_op:$m6, vec_i8imm_op:$m7,
-           vec_i8imm_op:$m8, vec_i8imm_op:$m9,
-           vec_i8imm_op:$mA, vec_i8imm_op:$mB,
-           vec_i8imm_op:$mC, vec_i8imm_op:$mD,
-           vec_i8imm_op:$mE, vec_i8imm_op:$mF),
-         (outs),
-         (ins
-           vec_i8imm_op:$m0, vec_i8imm_op:$m1,
-           vec_i8imm_op:$m2, vec_i8imm_op:$m3,
-           vec_i8imm_op:$m4, vec_i8imm_op:$m5,
-           vec_i8imm_op:$m6, vec_i8imm_op:$m7,
-           vec_i8imm_op:$m8, vec_i8imm_op:$m9,
-           vec_i8imm_op:$mA, vec_i8imm_op:$mB,
-           vec_i8imm_op:$mC, vec_i8imm_op:$mD,
-           vec_i8imm_op:$mE, vec_i8imm_op:$mF),
-         [],
-         "v8x16.shuffle\t$dst, $x, $y, "#
-           "$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "#
-           "$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF",
-         "v8x16.shuffle\t"#
-           "$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "#
-           "$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF",
-         23>;
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
 
-// Shuffles after custom lowering
-def wasm_shuffle_t : SDTypeProfile<1, 18, []>;
-def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>;
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
-            (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
-            (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
-            (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
-            (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7),
-            (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9),
-            (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB),
-            (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD),
-            (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF))),
-          (vec_t (SHUFFLE (vec_t V128:$x), (vec_t V128:$y),
-            (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
-            (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
-            (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
-            (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7),
-            (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9),
-            (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB),
-            (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD),
-            (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF)))>;
+multiclass SIMDCondition<ValueType vec_t, ValueType out_t, string vec,
+                         string name, CondCode cond, bits<32> simdop> {
+  defm _#vec_t :
+    SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
+           [(set (out_t V128:$dst),
+             (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond)
+           )],
+           vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name, simdop>;
+}
+
+multiclass SIMDConditionInt<string name, CondCode cond, bits<32> baseInst> {
+  defm "" : SIMDCondition<v16i8, v16i8, "i8x16", name, cond, baseInst>;
+  defm "" : SIMDCondition<v8i16, v8i16, "i16x8", name, cond,
+                          !add(baseInst, 10)>;
+  defm "" : SIMDCondition<v4i32, v4i32, "i32x4", name, cond,
+                          !add(baseInst, 20)>;
+}
+
+multiclass SIMDConditionFP<string name, CondCode cond, bits<32> baseInst> {
+  defm "" : SIMDCondition<v4f32, v4i32, "f32x4", name, cond, baseInst>;
+  defm "" : SIMDCondition<v2f64, v2i64, "f64x2", name, cond,
+                          !add(baseInst, 6)>;
 }
 
+// Equality: eq
+let isCommutable = 1 in {
+defm EQ : SIMDConditionInt<"eq", SETEQ, 24>;
+defm EQ : SIMDConditionFP<"eq", SETOEQ, 64>;
+} // isCommutable = 1
+
+// Non-equality: ne
+let isCommutable = 1 in {
+defm NE : SIMDConditionInt<"ne", SETNE, 25>;
+defm NE : SIMDConditionFP<"ne", SETUNE, 65>;
+} // isCommutable = 1
+
+// Less than: lt_s / lt_u / lt
+defm LT_S : SIMDConditionInt<"lt_s", SETLT, 26>;
+defm LT_U : SIMDConditionInt<"lt_u", SETULT, 27>;
+defm LT : SIMDConditionFP<"lt", SETOLT, 66>;
+
+// Greater than: gt_s / gt_u / gt
+defm GT_S : SIMDConditionInt<"gt_s", SETGT, 28>;
+defm GT_U : SIMDConditionInt<"gt_u", SETUGT, 29>;
+defm GT : SIMDConditionFP<"gt", SETOGT, 67>;
+
+// Less than or equal: le_s / le_u / le
+defm LE_S : SIMDConditionInt<"le_s", SETLE, 30>;
+defm LE_U : SIMDConditionInt<"le_u", SETULE, 31>;
+defm LE : SIMDConditionFP<"le", SETOLE, 68>;
+
+// Greater than or equal: ge_s / ge_u / ge
+defm GE_S : SIMDConditionInt<"ge_s", SETGE, 32>;
+defm GE_U : SIMDConditionInt<"ge_u", SETUGE, 33>;
+defm GE : SIMDConditionFP<"ge", SETOGE, 69>;
+
+// Lower float comparisons that don't care about NaN to standard WebAssembly
+// float comparisons. These instructions are generated in the target-independent
+// expansion of unordered comparisons and ordered ne.
+def : Pat<(v4i32 (seteq (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
+          (v4i32 (EQ_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>;
+def : Pat<(v4i32 (setne (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
+          (v4i32 (NE_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>;
+def : Pat<(v2i64 (seteq (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
+          (v2i64 (EQ_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>;
+def : Pat<(v2i64 (setne (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
+          (v2i64 (NE_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>;
+
 //===----------------------------------------------------------------------===//
-// Integer arithmetic
+// Bitwise operations
 //===----------------------------------------------------------------------===//
 
 multiclass SIMDBinary<ValueType vec_t, string vec, SDNode node, string name,
@@ -419,66 +543,87 @@ multiclass SIMDBinary<ValueType vec_t, string vec, SDNode node, string name,
                         simdop>;
 }
 
-multiclass SIMDBinaryIntNoI64x2<SDNode node, string name, bits<32> baseInst> {
-  defm "" : SIMDBinary<v16i8, "i8x16", node, name, baseInst>;
-  defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 1)>;
-  defm "" : SIMDBinary<v4i32, "i32x4", node, name, !add(baseInst, 2)>;
+multiclass SIMDBitwise<SDNode node, string name, bits<32> simdop> {
+  defm "" : SIMDBinary<v16i8, "v128", node, name, simdop>;
+  defm "" : SIMDBinary<v8i16, "v128", node, name, simdop>;
+  defm "" : SIMDBinary<v4i32, "v128", node, name, simdop>;
+  defm "" : SIMDBinary<v2i64, "v128", node, name, simdop>;
 }
 
-multiclass SIMDBinaryInt<SDNode node, string name, bits<32> baseInst> {
-  defm "" : SIMDBinaryIntNoI64x2<node, name, baseInst>;
-  defm "" : SIMDBinary<v2i64, "i64x2", node, name, !add(baseInst, 3)>;
+multiclass SIMDUnary<ValueType vec_t, string vec, SDNode node, string name,
+                     bits<32> simdop> {
+  defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
+                        [(set (vec_t V128:$dst),
+                          (vec_t (node (vec_t V128:$vec)))
+                        )],
+                        vec#"."#name#"\t$dst, $vec", vec#"."#name, simdop>;
 }
 
-// Integer vector negation
-def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>;
-
-// Integer addition: add
-let isCommutable = 1 in
-defm ADD : SIMDBinaryInt<add, "add", 24>;
-
-// Integer subtraction: sub
-defm SUB : SIMDBinaryInt<sub, "sub", 28>;
+// Bitwise logic: v128.not
+foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in
+defm NOT: SIMDUnary<vec_t, "v128", vnot, "not", 76>;
 
-// Integer multiplication: mul
-defm MUL : SIMDBinaryIntNoI64x2<mul, "mul", 32>;
+// Bitwise logic: v128.and / v128.or / v128.xor
+let isCommutable = 1 in {
+defm AND : SIMDBitwise<and, "and", 77>;
+defm OR : SIMDBitwise<or, "or", 78>;
+defm XOR : SIMDBitwise<xor, "xor", 79>;
+} // isCommutable = 1
 
-// Integer negation: neg
-multiclass SIMDNeg<ValueType vec_t, string vec, SDNode neg, bits<32> simdop> {
-  defm NEG_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
-                           [(set (vec_t V128:$dst),
-                             (vec_t (neg (vec_t V128:$vec)))
-                           )],
-                           vec#".neg\t$dst, $vec", vec#".neg", simdop>;
-}
+// Bitwise select: v128.bitselect
+foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
+  defm BITSELECT_#vec_t :
+    SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins),
+           [(set (vec_t V128:$dst),
+             (vec_t (int_wasm_bitselect
+               (vec_t V128:$c), (vec_t V128:$v1), (vec_t V128:$v2)
+             ))
+           )],
+           "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 80>;
 
-defm "" : SIMDNeg<v16i8, "i8x16", ivneg, 36>;
-defm "" : SIMDNeg<v8i16, "i16x8", ivneg, 37>;
-defm "" : SIMDNeg<v4i32, "i32x4", ivneg, 38>;
-defm "" : SIMDNeg<v2i64, "i64x2", ivneg, 39>;
+// Bitselect is equivalent to (c & v1) | (~c & v2)
+foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in
+  def : Pat<(vec_t (or (and (vec_t V128:$c), (vec_t V128:$v1)),
+              (and (vnot V128:$c), (vec_t V128:$v2)))),
+            (!cast<Instruction>("BITSELECT_"#vec_t)
+              V128:$v1, V128:$v2, V128:$c)>;
 
 //===----------------------------------------------------------------------===//
-// Saturating integer arithmetic
+// Integer unary arithmetic
 //===----------------------------------------------------------------------===//
 
-multiclass SIMDBinarySat<SDNode node, string name, bits<32> baseInst> {
-  defm "" : SIMDBinary<v16i8, "i8x16", node, name, baseInst>;
-  defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 2)>;
+multiclass SIMDUnaryInt<SDNode node, string name, bits<32> baseInst> {
+  defm "" : SIMDUnary<v16i8, "i8x16", node, name, baseInst>;
+  defm "" : SIMDUnary<v8i16, "i16x8", node, name, !add(baseInst, 17)>;
+  defm "" : SIMDUnary<v4i32, "i32x4", node, name, !add(baseInst, 34)>;
+  defm "" : SIMDUnary<v2i64, "i64x2", node, name, !add(baseInst, 51)>;
 }
 
-// Saturating integer addition: add_saturate_s / add_saturate_u
-let isCommutable = 1 in {
-defm ADD_SAT_S :
-  SIMDBinarySat<saddsat, "add_saturate_s", 40>;
-defm ADD_SAT_U :
-  SIMDBinarySat<uaddsat, "add_saturate_u", 41>;
-} // isCommutable = 1
+multiclass SIMDReduceVec<ValueType vec_t, string vec, SDNode op, string name,
+                         bits<32> simdop> {
+  defm _#vec_t : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
+                        [(set I32:$dst, (i32 (op (vec_t V128:$vec))))],
+                        vec#"."#name#"\t$dst, $vec", vec#"."#name, simdop>;
+}
 
-// Saturating integer subtraction: sub_saturate_s / sub_saturate_u
-defm SUB_SAT_S :
-  SIMDBinarySat<int_wasm_sub_saturate_signed, "sub_saturate_s", 44>;
-defm SUB_SAT_U :
-  SIMDBinarySat<int_wasm_sub_saturate_unsigned, "sub_saturate_u", 45>;
+multiclass SIMDReduce<SDNode op, string name, bits<32> baseInst> {
+  defm "" : SIMDReduceVec<v16i8, "i8x16", op, name, baseInst>;
+  defm "" : SIMDReduceVec<v8i16, "i16x8", op, name, !add(baseInst, 17)>;
+  defm "" : SIMDReduceVec<v4i32, "i32x4", op, name, !add(baseInst, 34)>;
+  defm "" : SIMDReduceVec<v2i64, "i64x2", op, name, !add(baseInst, 51)>;
+}
+
+// Integer vector negation
+def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>;
+
+// Integer negation: neg
+defm NEG : SIMDUnaryInt<ivneg, "neg", 81>;
+
+// Any lane true: any_true
+defm ANYTRUE : SIMDReduce<int_wasm_anytrue, "any_true", 82>;
+
+// All lanes true: all_true
+defm ALLTRUE : SIMDReduce<int_wasm_alltrue, "all_true", 83>;
 
 //===----------------------------------------------------------------------===//
 // Bit shifts
@@ -493,22 +638,22 @@ multiclass SIMDShift<ValueType vec_t, string vec, SDNode node, dag shift_vec,
                         vec#"."#name#"\t$dst, $vec, $x", vec#"."#name, simdop>;
 }
 
-multiclass SIMDShiftInt<SDNode node, string name, bits<32> baseInst, int skip> {
+multiclass SIMDShiftInt<SDNode node, string name, bits<32> baseInst> {
   defm "" : SIMDShift<v16i8, "i8x16", node, (splat16 I32:$x), name, baseInst>;
   defm "" : SIMDShift<v8i16, "i16x8", node, (splat8 I32:$x), name,
-                      !add(baseInst, !if(skip, 2, 1))>;
+                      !add(baseInst, 17)>;
   defm "" : SIMDShift<v4i32, "i32x4", node, (splat4 I32:$x), name,
-                      !add(baseInst, !if(skip, 4, 2))>;
+                      !add(baseInst, 34)>;
   defm "" : SIMDShift<v2i64, "i64x2", node, (splat2 (i64 (zext I32:$x))),
-                      name, !add(baseInst, !if(skip, 6, 3))>;
+                      name, !add(baseInst, 51)>;
 }
 
 // Left shift by scalar: shl
-defm SHL : SIMDShiftInt<shl, "shl", 48, 0>;
+defm SHL : SIMDShiftInt<shl, "shl", 84>;
 
 // Right shift by scalar: shr_s / shr_u
-defm SHR_S : SIMDShiftInt<sra, "shr_s", 52, 1>;
-defm SHR_U : SIMDShiftInt<srl, "shr_u", 53, 1>;
+defm SHR_S : SIMDShiftInt<sra, "shr_s", 85>;
+defm SHR_U : SIMDShiftInt<srl, "shr_u", 86>;
 
 // Truncate i64 shift operands to i32s
 foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in
@@ -529,267 +674,87 @@ def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), I32:$x)),
           (v2i64 (shifts[1] (v2i64 V128:$vec), I32:$x))>;
 
 //===----------------------------------------------------------------------===//
-// Bitwise operations
-//===----------------------------------------------------------------------===//
-
-multiclass SIMDBitwise<SDNode node, string name, bits<32> simdop> {
-  defm "" : SIMDBinary<v16i8, "v128", node, name, simdop>;
-  defm "" : SIMDBinary<v8i16, "v128", node, name, simdop>;
-  defm "" : SIMDBinary<v4i32, "v128", node, name, simdop>;
-  defm "" : SIMDBinary<v2i64, "v128", node, name, simdop>;
-}
-
-// Bitwise logic: v128.and / v128.or / v128.xor
-let isCommutable = 1 in {
-defm AND : SIMDBitwise<and, "and", 60>;
-defm OR : SIMDBitwise<or, "or", 61>;
-defm XOR : SIMDBitwise<xor, "xor", 62>;
-} // isCommutable = 1
-
-// Bitwise logic: v128.not
-multiclass SIMDNot<ValueType vec_t> {
-  defm NOT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
-                           [(set (vec_t V128:$dst), (vec_t (vnot V128:$vec)))],
-                           "v128.not\t$dst, $vec", "v128.not", 63>;
-}
-
-defm "" : SIMDNot<v16i8>;
-defm "" : SIMDNot<v8i16>;
-defm "" : SIMDNot<v4i32>;
-defm "" : SIMDNot<v2i64>;
-
-// Bitwise select: v128.bitselect
-multiclass Bitselect<ValueType vec_t> {
-  defm BITSELECT_#vec_t :
-    SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins),
-           [(set (vec_t V128:$dst),
-             (vec_t (int_wasm_bitselect
-               (vec_t V128:$c), (vec_t V128:$v1), (vec_t V128:$v2)
-             ))
-           )],
-           "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 64>;
-}
-
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
-defm "" : Bitselect<vec_t>;
-
-// Bitselect is equivalent to (c & v1) | (~c & v2)
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in
-  def : Pat<(vec_t (or (and (vec_t V128:$c), (vec_t V128:$v1)),
-              (and (vnot V128:$c), (vec_t V128:$v2)))),
-            (!cast<Instruction>("BITSELECT_"#vec_t)
-              V128:$v1, V128:$v2, V128:$c)>;
-
-//===----------------------------------------------------------------------===//
-// Boolean horizontal reductions
+// Integer binary arithmetic
 //===----------------------------------------------------------------------===//
 
-multiclass SIMDReduceVec<ValueType vec_t, string vec, string name, SDNode op,
-                         bits<32> simdop> {
-  defm _#vec_t : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
-                        [(set I32:$dst, (i32 (op (vec_t V128:$vec))))],
-                        vec#"."#name#"\t$dst, $vec", vec#"."#name, simdop>;
-}
-
-multiclass SIMDReduce<string name, SDNode op, bits<32> baseInst> {
-  defm "" : SIMDReduceVec<v16i8, "i8x16", name, op, baseInst>;
-  defm "" : SIMDReduceVec<v8i16, "i16x8", name, op, !add(baseInst, 1)>;
-  defm "" : SIMDReduceVec<v4i32, "i32x4", name, op, !add(baseInst, 2)>;
-  defm "" : SIMDReduceVec<v2i64, "i64x2", name, op, !add(baseInst, 3)>;
-}
-
-// Any lane true: any_true
-defm ANYTRUE : SIMDReduce<"any_true", int_wasm_anytrue, 65>;
-
-// All lanes true: all_true
-defm ALLTRUE : SIMDReduce<"all_true", int_wasm_alltrue, 69>;
-
-//===----------------------------------------------------------------------===//
-// Comparisons
-//===----------------------------------------------------------------------===//
-
-multiclass SIMDCondition<ValueType vec_t, ValueType out_t, string vec,
-                         string name, CondCode cond, bits<32> simdop> {
-  defm _#vec_t :
-    SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
-           [(set (out_t V128:$dst),
-             (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond)
-           )],
-           vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name, simdop>;
+multiclass SIMDBinaryIntSmall<SDNode node, string name, bits<32> baseInst> {
+  defm "" : SIMDBinary<v16i8, "i8x16", node, name, baseInst>;
+  defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 17)>;
 }
 
-multiclass SIMDConditionInt<string name, CondCode cond, bits<32> baseInst,
-                            int step = 1> {
-  defm "" : SIMDCondition<v16i8, v16i8, "i8x16", name, cond, baseInst>;
-  defm "" : SIMDCondition<v8i16, v8i16, "i16x8", name, cond,
-                          !add(baseInst, step)>;
-  defm "" : SIMDCondition<v4i32, v4i32, "i32x4", name, cond,
-                          !add(!add(baseInst, step), step)>;
+multiclass SIMDBinaryIntNoI64x2<SDNode node, string name, bits<32> baseInst> {
+  defm "" : SIMDBinaryIntSmall<node, name, baseInst>;
+  defm "" : SIMDBinary<v4i32, "i32x4", node, name, !add(baseInst, 34)>;
 }
 
-multiclass SIMDConditionFP<string name, CondCode cond, bits<32> baseInst> {
-  defm "" : SIMDCondition<v4f32, v4i32, "f32x4", name, cond, baseInst>;
-  defm "" : SIMDCondition<v2f64, v2i64, "f64x2", name, cond,
-                          !add(baseInst, 1)>;
+multiclass SIMDBinaryInt<SDNode node, string name, bits<32> baseInst> {
+  defm "" : SIMDBinaryIntNoI64x2<node, name, baseInst>;
+  defm "" : SIMDBinary<v2i64, "i64x2", node, name, !add(baseInst, 51)>;
 }
 
-// Equality: eq
+// Integer addition: add / add_saturate_s / add_saturate_u
 let isCommutable = 1 in {
-defm EQ : SIMDConditionInt<"eq", SETEQ, 73>;
-defm EQ : SIMDConditionFP<"eq", SETOEQ, 77>;
+defm ADD : SIMDBinaryInt<add, "add", 87>;
+defm ADD_SAT_S : SIMDBinaryIntSmall<saddsat, "add_saturate_s", 88>;
+defm ADD_SAT_U : SIMDBinaryIntSmall<uaddsat, "add_saturate_u", 89>;
 } // isCommutable = 1
 
-// Non-equality: ne
-let isCommutable = 1 in {
-defm NE : SIMDConditionInt<"ne", SETNE, 79>;
-defm NE : SIMDConditionFP<"ne", SETUNE, 83>;
-} // isCommutable = 1
-
-// Less than: lt_s / lt_u / lt
-defm LT_S : SIMDConditionInt<"lt_s", SETLT, 85, 2>;
-defm LT_U : SIMDConditionInt<"lt_u", SETULT, 86, 2>;
-defm LT : SIMDConditionFP<"lt", SETOLT, 93>;
-
-// Less than or equal: le_s / le_u / le
-defm LE_S : SIMDConditionInt<"le_s", SETLE, 95, 2>;
-defm LE_U : SIMDConditionInt<"le_u", SETULE, 96, 2>;
-defm LE : SIMDConditionFP<"le", SETOLE, 103>;
-
-// Greater than: gt_s / gt_u / gt
-defm GT_S : SIMDConditionInt<"gt_s", SETGT, 105, 2>;
-defm GT_U : SIMDConditionInt<"gt_u", SETUGT, 106, 2>;
-defm GT : SIMDConditionFP<"gt", SETOGT, 113>;
-
-// Greater than or equal: ge_s / ge_u / ge
-defm GE_S : SIMDConditionInt<"ge_s", SETGE, 115, 2>;
-defm GE_U : SIMDConditionInt<"ge_u", SETUGE, 116, 2>;
-defm GE : SIMDConditionFP<"ge", SETOGE, 123>;
+// Integer subtraction: sub / sub_saturate_s / sub_saturate_u
+defm SUB : SIMDBinaryInt<sub, "sub", 90>;
+defm SUB_SAT_S :
+  SIMDBinaryIntSmall<int_wasm_sub_saturate_signed, "sub_saturate_s", 91>;
+defm SUB_SAT_U :
+  SIMDBinaryIntSmall<int_wasm_sub_saturate_unsigned, "sub_saturate_u", 92>;
 
-// Lower float comparisons that don't care about NaN to standard WebAssembly
-// float comparisons. These instructions are generated in the target-independent
-// expansion of unordered comparisons and ordered ne.
-def : Pat<(v4i32 (seteq (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
-          (v4i32 (EQ_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>;
-def : Pat<(v4i32 (setne (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
-          (v4i32 (NE_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>;
-def : Pat<(v2i64 (seteq (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
-          (v2i64 (EQ_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>;
-def : Pat<(v2i64 (setne (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
-          (v2i64 (NE_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>;
+// Integer multiplication: mul
+defm MUL : SIMDBinaryIntNoI64x2<mul, "mul", 93>;
 
 //===----------------------------------------------------------------------===//
-// Load and store
+// Floating-point unary arithmetic
 //===----------------------------------------------------------------------===//
 
-// Load: v128.load
-multiclass SIMDLoad<ValueType vec_t> {
-  let mayLoad = 1 in
-  defm LOAD_#vec_t :
-    SIMD_I<(outs V128:$dst), (ins P2Align:$align, offset32_op:$off, I32:$addr),
-           (outs), (ins P2Align:$align, offset32_op:$off), [],
-           "v128.load\t$dst, ${off}(${addr})$align",
-           "v128.load\t$off$align", 1>;
+multiclass SIMDUnaryFP<SDNode node, string name, bits<32> baseInst> {
+  defm "" : SIMDUnary<v4f32, "f32x4", node, name, baseInst>;
+  defm "" : SIMDUnary<v2f64, "f64x2", node, name, !add(baseInst, 11)>;
 }
 
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-defm "" : SIMDLoad<vec_t>;
-
-// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
-def : LoadPatNoOffset<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatImmOff<vec_t, load, regPlusImm, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatImmOff<vec_t, load, or_is_add, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatGlobalAddr<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatExternalSym<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatOffsetOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatGlobalAddrOffOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatExternSymOffOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-}
-
-// Store: v128.store
-multiclass SIMDStore<ValueType vec_t> {
-  let mayStore = 1 in
-  defm STORE_#vec_t :
-    SIMD_I<(outs), (ins P2Align:$align, offset32_op:$off, I32:$addr, V128:$vec),
-           (outs), (ins P2Align:$align, offset32_op:$off), [],
-           "v128.store\t${off}(${addr})$align, $vec",
-           "v128.store\t$off$align", 2>;
-}
-
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-defm "" : SIMDStore<vec_t>;
-
-// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
-def : StorePatNoOffset<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatImmOff<vec_t, store, regPlusImm, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatImmOff<vec_t, store, or_is_add, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatGlobalAddr<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatExternalSym<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatOffsetOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatGlobalAddrOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatExternSymOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating-point sign bit operations
-//===----------------------------------------------------------------------===//
+// Absolute value: abs
+defm ABS : SIMDUnaryFP<fabs, "abs", 149>;
 
 // Negation: neg
-defm "" : SIMDNeg<v4f32, "f32x4", fneg, 125>;
-defm "" : SIMDNeg<v2f64, "f64x2", fneg, 126>;
+defm NEG : SIMDUnaryFP<fneg, "neg", 150>;
 
-// Absolute value: abs
-multiclass SIMDAbs<ValueType vec_t, string vec, bits<32> simdop> {
-  defm ABS_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
-                           [(set (vec_t V128:$dst), (vec_t (fabs V128:$vec)))],
-                           vec#".abs\t$dst, $vec", vec#".abs", simdop>;
-}
-
-defm "" : SIMDAbs<v4f32, "f32x4", 127>;
-defm "" : SIMDAbs<v2f64, "f64x2", 128>;
+// Square root: sqrt
+defm SQRT : SIMDUnaryFP<fsqrt, "sqrt", 151>;
 
 //===----------------------------------------------------------------------===//
-// Floating-point min and max
+// Floating-point binary arithmetic
 //===----------------------------------------------------------------------===//
 
 multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
   defm "" : SIMDBinary<v4f32, "f32x4", node, name, baseInst>;
-  defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 1)>;
+  defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 11)>;
 }
 
-// NaN-propagating minimum: min
-defm MIN : SIMDBinaryFP<fminimum, "min", 129>;
-
-// NaN-propagating maximum: max
-defm MAX : SIMDBinaryFP<fmaximum, "max", 131>;
-
-//===----------------------------------------------------------------------===//
-// Floating-point arithmetic
-//===----------------------------------------------------------------------===//
-
 // Addition: add
 let isCommutable = 1 in
-defm ADD : SIMDBinaryFP<fadd, "add", 133>;
+defm ADD : SIMDBinaryFP<fadd, "add", 154>;
 
 // Subtraction: sub
-defm SUB : SIMDBinaryFP<fsub, "sub", 135>;
-
-// Division: div
-defm DIV : SIMDBinaryFP<fdiv, "div", 137>;
+defm SUB : SIMDBinaryFP<fsub, "sub", 155>;
 
 // Multiplication: mul
 let isCommutable = 1 in
-defm MUL : SIMDBinaryFP<fmul, "mul", 139>;
+defm MUL : SIMDBinaryFP<fmul, "mul", 156>;
 
-// Square root: sqrt
-multiclass SIMDSqrt<ValueType vec_t, string vec, bits<32> simdop> {
-  defm SQRT_#vec_t :
-    SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
-           [(set (vec_t V128:$dst), (vec_t (fsqrt V128:$vec)))],
-           vec#".sqrt\t$dst, $vec", vec#".sqrt", simdop>;
-}
+// Division: div
+defm DIV : SIMDBinaryFP<fdiv, "div", 157>;
 
-defm "" : SIMDSqrt<v4f32, "f32x4", 141>;
-defm "" : SIMDSqrt<v2f64, "f64x2", 142>;
+// NaN-propagating minimum: min
+defm MIN : SIMDBinaryFP<fminimum, "min", 158>;
+
+// NaN-propagating maximum: max
+defm MAX : SIMDBinaryFP<fmaximum, "max", 159>;
 
 //===----------------------------------------------------------------------===//
 // Conversions
@@ -804,16 +769,16 @@ multiclass SIMDConvert<ValueType vec_t, ValueType arg_t, SDNode op,
 }
 
 // Integer to floating point: convert_s / convert_u
-defm "" : SIMDConvert<v4f32, v4i32, sint_to_fp, "f32x4.convert_s/i32x4", 143>;
-defm "" : SIMDConvert<v4f32, v4i32, uint_to_fp, "f32x4.convert_u/i32x4", 144>;
-defm "" : SIMDConvert<v2f64, v2i64, sint_to_fp, "f64x2.convert_s/i64x2", 145>;
-defm "" : SIMDConvert<v2f64, v2i64, uint_to_fp, "f64x2.convert_u/i64x2", 146>;
+defm "" : SIMDConvert<v4f32, v4i32, sint_to_fp, "f32x4.convert_s/i32x4", 175>;
+defm "" : SIMDConvert<v4f32, v4i32, uint_to_fp, "f32x4.convert_u/i32x4", 176>;
+defm "" : SIMDConvert<v2f64, v2i64, sint_to_fp, "f64x2.convert_s/i64x2", 177>;
+defm "" : SIMDConvert<v2f64, v2i64, uint_to_fp, "f64x2.convert_u/i64x2", 178>;
 
 // Floating point to integer with saturation: trunc_sat_s / trunc_sat_u
-defm "" : SIMDConvert<v4i32, v4f32, fp_to_sint, "i32x4.trunc_sat_s/f32x4", 147>;
-defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_u/f32x4", 148>;
-defm "" : SIMDConvert<v2i64, v2f64, fp_to_sint, "i64x2.trunc_sat_s/f64x2", 149>;
-defm "" : SIMDConvert<v2i64, v2f64, fp_to_uint, "i64x2.trunc_sat_u/f64x2", 150>;
+defm "" : SIMDConvert<v4i32, v4f32, fp_to_sint, "i32x4.trunc_sat_s/f32x4", 171>;
+defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_u/f32x4", 172>;
+defm "" : SIMDConvert<v2i64, v2f64, fp_to_sint, "i64x2.trunc_sat_s/f64x2", 173>;
+defm "" : SIMDConvert<v2i64, v2f64, fp_to_uint, "i64x2.trunc_sat_u/f64x2", 174>;
 
 // Lower llvm.wasm.trunc.saturate.* to saturating instructions
 def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index 98953f0948244eec5b3f2caa040677901fef4f08..871e92082a488286ab0ef55390253830ac16f4d4 100644
--- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -31,6 +31,7 @@ class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
+  bool removeUnnecessaryUnreachables(MachineFunction &MF);
   bool replaceFuncletReturns(MachineFunction &MF);
   bool hoistCatches(MachineFunction &MF);
   bool addCatchAlls(MachineFunction &MF);
@@ -59,7 +60,7 @@ FunctionPass *llvm::createWebAssemblyLateEHPrepare() {
 // possible search paths should be the same.
 // Returns nullptr in case it does not find any EH pad in the search, or finds
 // multiple different EH pads.
-static MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) {
+static MachineBasicBlock *getMatchingEHPad(MachineInstr *MI) {
   MachineFunction *MF = MI->getParent()->getParent();
   SmallVector<MachineBasicBlock *, 2> WL;
   SmallPtrSet<MachineBasicBlock *, 2> Visited;
@@ -83,18 +84,15 @@ static MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) {
   return EHPad;
 }
 
-// Erases the given BBs and all their children from the function. If other BBs
-// have the BB as a successor, the successor relationships will be deleted as
-// well.
+// Erase the specified BBs if the BB does not have any remaining predecessors,
+// and also all its dead children.
 template <typename Container>
-static void EraseBBsAndChildren(const Container &MBBs) {
+static void eraseDeadBBsAndChildren(const Container &MBBs) {
   SmallVector<MachineBasicBlock *, 8> WL(MBBs.begin(), MBBs.end());
   while (!WL.empty()) {
     MachineBasicBlock *MBB = WL.pop_back_val();
-    SmallVector<MachineBasicBlock *, 4> Preds(MBB->pred_begin(),
-                                              MBB->pred_end());
-    for (auto *Pred : Preds)
-      Pred->removeSuccessor(MBB);
+    if (!MBB->pred_empty())
+      continue;
     SmallVector<MachineBasicBlock *, 4> Succs(MBB->succ_begin(),
                                               MBB->succ_end());
     WL.append(MBB->succ_begin(), MBB->succ_end());
@@ -110,6 +108,7 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   bool Changed = false;
+  Changed |= removeUnnecessaryUnreachables(MF);
   Changed |= addRethrows(MF);
   if (!MF.getFunction().hasPersonalityFn())
     return Changed;
@@ -122,6 +121,31 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
   return Changed;
 }
 
+bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
+    MachineFunction &MF) {
+  bool Changed = false;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (!WebAssembly::isThrow(MI))
+        continue;
+      Changed = true;
+
+      // The instruction after the throw should be an unreachable or a branch to
+      // another BB that should eventually lead to an unreachable. Delete it
+      // because throw itself is a terminator, and also delete successors if
+      // any.
+      MBB.erase(std::next(MachineBasicBlock::iterator(MI)), MBB.end());
+      SmallVector<MachineBasicBlock *, 8> Succs(MBB.succ_begin(),
+                                                MBB.succ_end());
+      for (auto *Succ : Succs)
+        MBB.removeSuccessor(Succ);
+      eraseDeadBBsAndChildren(Succs);
+    }
+  }
+
+  return Changed;
+}
+
 bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
   bool Changed = false;
   const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
@@ -183,7 +207,7 @@ bool WebAssemblyLateEHPrepare::hoistCatches(MachineFunction &MF) {
         Catches.push_back(&MI);
 
   for (auto *Catch : Catches) {
-    MachineBasicBlock *EHPad = GetMatchingEHPad(Catch);
+    MachineBasicBlock *EHPad = getMatchingEHPad(Catch);
     assert(EHPad && "No matching EH pad for catch");
     if (EHPad->begin() == Catch)
       continue;
@@ -242,7 +266,7 @@ bool WebAssemblyLateEHPrepare::addRethrows(MachineFunction &MF) {
         Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(),
                           TII.get(WebAssembly::RETHROW_TO_CALLER));
 
-      // Becasue __cxa_rethrow does not return, the instruction after the
+      // Because __cxa_rethrow does not return, the instruction after the
       // rethrow should be an unreachable or a branch to another BB that should
       // eventually lead to an unreachable. Delete it because rethrow itself is
       // a terminator, and also delete non-EH pad successors if any.
@@ -251,7 +275,9 @@ bool WebAssemblyLateEHPrepare::addRethrows(MachineFunction &MF) {
       for (auto *Succ : MBB.successors())
         if (!Succ->isEHPad())
           NonPadSuccessors.push_back(Succ);
-      EraseBBsAndChildren(NonPadSuccessors);
+      for (auto *Succ : NonPadSuccessors)
+        MBB.removeSuccessor(Succ);
+      eraseDeadBBsAndChildren(NonPadSuccessors);
     }
   return Changed;
 }
@@ -283,7 +309,7 @@ bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) {
 
   bool Changed = false;
   for (auto *Call : ClangCallTerminateCalls) {
-    MachineBasicBlock *EHPad = GetMatchingEHPad(Call);
+    MachineBasicBlock *EHPad = getMatchingEHPad(Call);
     assert(EHPad && "No matching EH pad for catch");
 
     // If it is already the form we want, skip it
@@ -308,7 +334,11 @@ bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) {
     BuildMI(*EHPad, InsertPos, Call->getDebugLoc(),
             TII.get(WebAssembly::UNREACHABLE));
     EHPad->erase(InsertPos, EHPad->end());
-    EraseBBsAndChildren(EHPad->successors());
+    SmallVector<MachineBasicBlock *, 8> Succs(EHPad->succ_begin(),
+                                              EHPad->succ_end());
+    for (auto *Succ : Succs)
+      EHPad->removeSuccessor(Succ);
+    eraseDeadBBsAndChildren(Succs);
   }
   return Changed;
 }
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index f0d24075801f31e3364e4a8aabc47e6a86f95ba5..8755198f45b52481ae73854e18d8d7248e93a380 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -50,24 +50,19 @@
 ///
 /// In detail, this pass does following things:
 ///
-/// 1) Assumes the existence of global variables: __THREW__, __threwValue, and
-///    __tempRet0.
-///    __tempRet0 will be set within __cxa_find_matching_catch() function in
-///    JS library, and __THREW__ and __threwValue will be set in invoke wrappers
+/// 1) Assumes the existence of global variables: __THREW__, __threwValue
+///    __THREW__ and __threwValue will be set in invoke wrappers
 ///    in JS glue code. For what invoke wrappers are, refer to 3). These
 ///    variables are used for both exceptions and setjmp/longjmps.
 ///    __THREW__ indicates whether an exception or a longjmp occurred or not. 0
 ///    means nothing occurred, 1 means an exception occurred, and other numbers
 ///    mean a longjmp occurred. In the case of longjmp, __threwValue variable
 ///    indicates the corresponding setjmp buffer the longjmp corresponds to.
-///    In exception handling, __tempRet0 indicates the type of an exception
-///    caught, and in setjmp/longjmp, it means the second argument to longjmp
-///    function.
 ///
 /// * Exception handling
 ///
-/// 2) We assume the existence of setThrew and setTempRet0 functions at link
-///    time.
+/// 2) We assume the existence of setThrew and setTempRet0/getTempRet0 functions
+///    at link time.
 ///    The global variables in 1) will exist in wasm address space,
 ///    but their values should be set in JS code, so these functions
 ///    as interfaces to JS glue code. These functions are equivalent to the
@@ -80,10 +75,12 @@
 ///        __threwValue = value;
 ///      }
 ///    }
+//
+///    setTempRet0 is called from __cxa_find_matching_catch() in JS glue code.
 ///
-///    function setTempRet0(value) {
-///      __tempRet0 = value;
-///    }
+///    In exception handling, getTempRet0 indicates the type of an exception
+///    caught, and in setjmp/longjmp, it means the second argument to longjmp
+///    function.
 ///
 /// 3) Lower
 ///      invoke @func(arg1, arg2) to label %invoke.cont unwind label %lpad
@@ -120,11 +117,10 @@
 ///      ... use %val ...
 ///    into
 ///      %fmc = call @__cxa_find_matching_catch_N(c1, c2, c3, ...)
-///      %val = {%fmc, __tempRet0}
+///      %val = {%fmc, getTempRet0()}
 ///      ... use %val ...
 ///    Here N is a number calculated based on the number of clauses.
-///    Global variable __tempRet0 is set within __cxa_find_matching_catch() in
-///    JS glue code.
+///    setTempRet0 is called from __cxa_find_matching_catch() in JS glue code.
 ///
 /// 5) Lower
 ///      resume {%a, %b}
@@ -140,7 +136,17 @@
 ///
 /// * Setjmp / Longjmp handling
 ///
-/// 7) In the function entry that calls setjmp, initialize setjmpTable and
+/// In case calls to longjmp() exists
+///
+/// 1) Lower
+///      longjmp(buf, value)
+///    into
+///      emscripten_longjmp_jmpbuf(buf, value)
+///    emscripten_longjmp_jmpbuf will be lowered to emscripten_longjmp later.
+///
+/// In case calls to setjmp() exists
+///
+/// 2) In the function entry that calls setjmp, initialize setjmpTable and
 ///    sejmpTableSize as follows:
 ///      setjmpTableSize = 4;
 ///      setjmpTable = (int *) malloc(40);
@@ -148,27 +154,22 @@
 ///    setjmpTable and setjmpTableSize are used in saveSetjmp() function in JS
 ///    code.
 ///
-/// 8) Lower
+/// 3) Lower
 ///      setjmp(buf)
 ///    into
 ///      setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize);
-///      setjmpTableSize = __tempRet0;
+///      setjmpTableSize = getTempRet0();
 ///    For each dynamic setjmp call, setjmpTable stores its ID (a number which
 ///    is incrementally assigned from 0) and its label (a unique number that
 ///    represents each callsite of setjmp). When we need more entries in
 ///    setjmpTable, it is reallocated in saveSetjmp() in JS code and it will
 ///    return the new table address, and assign the new table size in
-///    __tempRet0. saveSetjmp also stores the setjmp's ID into the buffer buf.
-///    A BB with setjmp is split into two after setjmp call in order to make the
-///    post-setjmp BB the possible destination of longjmp BB.
+///    setTempRet0(). saveSetjmp also stores the setjmp's ID into the buffer
+///    buf. A BB with setjmp is split into two after setjmp call in order to
+///    make the post-setjmp BB the possible destination of longjmp BB.
 ///
-/// 9) Lower
-///      longjmp(buf, value)
-///    into
-///      emscripten_longjmp_jmpbuf(buf, value)
-///    emscripten_longjmp_jmpbuf will be lowered to emscripten_longjmp later.
 ///
-/// 10) Lower every call that might longjmp into
+/// 4) Lower every call that might longjmp into
 ///      __THREW__ = 0;
 ///      call @__invoke_SIG(func, arg1, arg2)
 ///      %__THREW__.val = __THREW__;
@@ -178,32 +179,32 @@
 ///                            setjmpTableSize);
 ///        if (%label == 0)
 ///          emscripten_longjmp(%__THREW__.val, __threwValue);
-///        __tempRet0 = __threwValue;
+///        setTempRet0(__threwValue);
 ///      } else {
 ///        %label = -1;
 ///      }
-///      longjmp_result = __tempRet0;
+///      longjmp_result = getTempRet0();
 ///      switch label {
 ///        label 1: goto post-setjmp BB 1
 ///        label 2: goto post-setjmp BB 2
 ///        ...
 ///        default: goto splitted next BB
 ///      }
-///     testSetjmp examines setjmpTable to see if there is a matching setjmp
-///     call. After calling an invoke wrapper, if a longjmp occurred, __THREW__
-///     will be the address of matching jmp_buf buffer and __threwValue be the
-///     second argument to longjmp. mem[__THREW__.val] is a setjmp ID that is
-///     stored in saveSetjmp. testSetjmp returns a setjmp label, a unique ID to
-///     each setjmp callsite. Label 0 means this longjmp buffer does not
-///     correspond to one of the setjmp callsites in this function, so in this
-///     case we just chain the longjmp to the caller. (Here we call
-///     emscripten_longjmp, which is different from emscripten_longjmp_jmpbuf.
-///     emscripten_longjmp_jmpbuf takes jmp_buf as its first argument, while
-///     emscripten_longjmp takes an int. Both of them will eventually be lowered
-///     to emscripten_longjmp in s2wasm, but here we need two signatures - we
-///     can't translate an int value to a jmp_buf.)
-///     Label -1 means no longjmp occurred. Otherwise we jump to the right
-///     post-setjmp BB based on the label.
+///    testSetjmp examines setjmpTable to see if there is a matching setjmp
+///    call. After calling an invoke wrapper, if a longjmp occurred, __THREW__
+///    will be the address of matching jmp_buf buffer and __threwValue be the
+///    second argument to longjmp. mem[__THREW__.val] is a setjmp ID that is
+///    stored in saveSetjmp. testSetjmp returns a setjmp label, a unique ID to
+///    each setjmp callsite. Label 0 means this longjmp buffer does not
+///    correspond to one of the setjmp callsites in this function, so in this
+///    case we just chain the longjmp to the caller. (Here we call
+///    emscripten_longjmp, which is different from emscripten_longjmp_jmpbuf.
+///    emscripten_longjmp_jmpbuf takes jmp_buf as its first argument, while
+///    emscripten_longjmp takes an int. Both of them will eventually be lowered
+///    to emscripten_longjmp in s2wasm, but here we need two signatures - we
+///    can't translate an int value to a jmp_buf.)
+///    Label -1 means no longjmp occurred. Otherwise we jump to the right
+///    post-setjmp BB based on the label.
 ///
 ///===----------------------------------------------------------------------===//
 
@@ -241,7 +242,8 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
 
   GlobalVariable *ThrewGV;
   GlobalVariable *ThrewValueGV;
-  GlobalVariable *TempRet0GV;
+  Function *GetTempRet0Func;
+  Function *SetTempRet0Func;
   Function *ResumeF;
   Function *EHTypeIDF;
   Function *EmLongjmpF;
@@ -281,9 +283,10 @@ public:
 
   WebAssemblyLowerEmscriptenEHSjLj(bool EnableEH = true, bool EnableSjLj = true)
       : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj),
-        ThrewGV(nullptr), ThrewValueGV(nullptr), TempRet0GV(nullptr),
-        ResumeF(nullptr), EHTypeIDF(nullptr), EmLongjmpF(nullptr),
-        EmLongjmpJmpbufF(nullptr), SaveSetjmpF(nullptr), TestSetjmpF(nullptr) {
+        ThrewGV(nullptr), ThrewValueGV(nullptr), GetTempRet0Func(nullptr),
+        SetTempRet0Func(nullptr), ResumeF(nullptr), EHTypeIDF(nullptr),
+        EmLongjmpF(nullptr), EmLongjmpJmpbufF(nullptr), SaveSetjmpF(nullptr),
+        TestSetjmpF(nullptr) {
     EHWhitelistSet.insert(EHWhitelist.begin(), EHWhitelist.end());
   }
   bool runOnModule(Module &M) override;
@@ -509,7 +512,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
   Function *ThrowF = M.getFunction("__cxa_throw");
   Function *TerminateF = M.getFunction("__clang_call_terminate");
   if (Callee == BeginCatchF || Callee == EndCatchF ||
-      Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF)
+      Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF ||
+      Callee == GetTempRet0Func || Callee == SetTempRet0Func)
     return false;
 
   // Otherwise we don't know
@@ -522,11 +526,11 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
 //   %label = _testSetjmp(mem[%__THREW__.val], setjmpTable, setjmpTableSize);
 //   if (%label == 0)
 //     emscripten_longjmp(%__THREW__.val, threwValue);
-//   __tempRet0 = threwValue;
+//   setTempRet0(threwValue);
 // } else {
 //   %label = -1;
 // }
-// %longjmp_result = __tempRet0;
+// %longjmp_result = getTempRet0();
 //
 // As output parameters. returns %label, %longjmp_result, and the BB the last
 // instruction (%longjmp_result = ...) is in.
@@ -570,15 +574,15 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
   IRB.CreateCall(EmLongjmpF, {Threw, ThrewValue});
   IRB.CreateUnreachable();
 
-  // __tempRet0 = threwValue;
+  // setTempRet0(threwValue);
   IRB.SetInsertPoint(EndBB2);
-  IRB.CreateStore(ThrewValue, TempRet0GV);
+  IRB.CreateCall(SetTempRet0Func, ThrewValue);
   IRB.CreateBr(EndBB1);
 
   IRB.SetInsertPoint(ElseBB1);
   IRB.CreateBr(EndBB1);
 
-  // longjmp_result = __tempRet0;
+  // longjmp_result = getTempRet0();
   IRB.SetInsertPoint(EndBB1);
   PHINode *LabelPHI = IRB.CreatePHI(IRB.getInt32Ty(), 2, "label");
   LabelPHI->addIncoming(ThenLabel, EndBB2);
@@ -588,7 +592,7 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
   // Output parameter assignment
   Label = LabelPHI;
   EndBB = EndBB1;
-  LongjmpResult = IRB.CreateLoad(TempRet0GV, "longjmp_result");
+  LongjmpResult = IRB.CreateCall(GetTempRet0Func, None, "longjmp_result");
 }
 
 void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
@@ -628,12 +632,19 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
   bool LongjmpUsed = LongjmpF && !LongjmpF->use_empty();
   bool DoSjLj = EnableSjLj && (SetjmpUsed || LongjmpUsed);
 
-  // Declare (or get) global variables __THREW__, __threwValue, and __tempRet0,
-  // which are used in common for both exception handling and setjmp/longjmp
-  // handling
+  // Declare (or get) global variables __THREW__, __threwValue, and
+  // getTempRet0/setTempRet0 function which are used in common for both
+  // exception handling and setjmp/longjmp handling
   ThrewGV = getGlobalVariableI32(M, IRB, "__THREW__");
   ThrewValueGV = getGlobalVariableI32(M, IRB, "__threwValue");
-  TempRet0GV = getGlobalVariableI32(M, IRB, "__tempRet0");
+  GetTempRet0Func =
+      Function::Create(FunctionType::get(IRB.getInt32Ty(), false),
+                       GlobalValue::ExternalLinkage, "getTempRet0", &M);
+  SetTempRet0Func = Function::Create(
+      FunctionType::get(IRB.getVoidTy(), IRB.getInt32Ty(), false),
+      GlobalValue::ExternalLinkage, "setTempRet0", &M);
+  GetTempRet0Func->setDoesNotThrow();
+  SetTempRet0Func->setDoesNotThrow();
 
   bool Changed = false;
 
@@ -662,22 +673,6 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
   if (DoSjLj) {
     Changed = true; // We have setjmp or longjmp somewhere
 
-    // Register saveSetjmp function
-    FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
-    SmallVector<Type *, 4> Params = {SetjmpFTy->getParamType(0),
-                                     IRB.getInt32Ty(), Type::getInt32PtrTy(C),
-                                     IRB.getInt32Ty()};
-    FunctionType *FTy =
-        FunctionType::get(Type::getInt32PtrTy(C), Params, false);
-    SaveSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
-                                   SaveSetjmpFName, &M);
-
-    // Register testSetjmp function
-    Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()};
-    FTy = FunctionType::get(IRB.getInt32Ty(), Params, false);
-    TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
-                                   TestSetjmpFName, &M);
-
     if (LongjmpF) {
       // Replace all uses of longjmp with emscripten_longjmp_jmpbuf, which is
       // defined in JS code
@@ -687,20 +682,39 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
 
       LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF);
     }
-    FTy = FunctionType::get(IRB.getVoidTy(),
-                            {IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
-    EmLongjmpF =
-        Function::Create(FTy, GlobalValue::ExternalLinkage, EmLongjmpFName, &M);
-
-    // Only traverse functions that uses setjmp in order not to insert
-    // unnecessary prep / cleanup code in every function
-    SmallPtrSet<Function *, 8> SetjmpUsers;
-    for (User *U : SetjmpF->users()) {
-      auto *UI = cast<Instruction>(U);
-      SetjmpUsers.insert(UI->getFunction());
+
+    if (SetjmpF) {
+      // Register saveSetjmp function
+      FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
+      SmallVector<Type *, 4> Params = {SetjmpFTy->getParamType(0),
+                                       IRB.getInt32Ty(), Type::getInt32PtrTy(C),
+                                       IRB.getInt32Ty()};
+      FunctionType *FTy =
+          FunctionType::get(Type::getInt32PtrTy(C), Params, false);
+      SaveSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                     SaveSetjmpFName, &M);
+
+      // Register testSetjmp function
+      Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()};
+      FTy = FunctionType::get(IRB.getInt32Ty(), Params, false);
+      TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                     TestSetjmpFName, &M);
+
+      FTy = FunctionType::get(IRB.getVoidTy(),
+                              {IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
+      EmLongjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                    EmLongjmpFName, &M);
+
+      // Only traverse functions that uses setjmp in order not to insert
+      // unnecessary prep / cleanup code in every function
+      SmallPtrSet<Function *, 8> SetjmpUsers;
+      for (User *U : SetjmpF->users()) {
+        auto *UI = cast<Instruction>(U);
+        SetjmpUsers.insert(UI->getFunction());
+      }
+      for (Function *F : SetjmpUsers)
+        runSjLjOnFunction(*F);
     }
-    for (Function *F : SetjmpUsers)
-      runSjLjOnFunction(*F);
   }
 
   if (!Changed) {
@@ -840,8 +854,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
     CallInst *FMCI = IRB.CreateCall(FMCF, FMCArgs, "fmc");
     Value *Undef = UndefValue::get(LPI->getType());
     Value *Pair0 = IRB.CreateInsertValue(Undef, FMCI, 0, "pair0");
-    Value *TempRet0 =
-        IRB.CreateLoad(TempRet0GV, TempRet0GV->getName() + ".val");
+    Value *TempRet0 = IRB.CreateCall(GetTempRet0Func, None, "tempret0");
     Value *Pair1 = IRB.CreateInsertValue(Pair0, TempRet0, 1, "pair1");
 
     LPI->replaceAllUsesWith(Pair1);
@@ -922,7 +935,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
     Instruction *NewSetjmpTable =
         IRB.CreateCall(SaveSetjmpF, Args, "setjmpTable");
     Instruction *NewSetjmpTableSize =
-        IRB.CreateLoad(TempRet0GV, "setjmpTableSize");
+        IRB.CreateCall(GetTempRet0Func, None, "setjmpTableSize");
     SetjmpTableInsts.push_back(NewSetjmpTable);
     SetjmpTableSizeInsts.push_back(NewSetjmpTableSize);
     ToErase.push_back(CI);
@@ -1044,7 +1057,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
   // ...
   // somebb:
   //   setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize);
-  //   setjmpTableSize = __tempRet0;
+  //   setjmpTableSize = getTempRet0();
   // So we need to make sure the SSA for these variables is valid so that every
   // saveSetjmp and testSetjmp calls have the correct arguments.
   SSAUpdater SetjmpTableSSA;
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 1dad7b8a2890f9644c2369cb9491d84f81683435..fa862fbaa634c2b3772f9958af28049ed00d2983 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -75,10 +75,10 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
       cast<MCSymbolWasm>(Printer.GetExternalSymbolSymbol(Name));
   const WebAssemblySubtarget &Subtarget = Printer.getSubtarget();
 
-  // __stack_pointer is a global variable; all other external symbols used by
-  // CodeGen are functions.  It's OK to hardcode knowledge of specific symbols
-  // here; this method is precisely there for fetching the signatures of known
-  // Clang-provided symbols.
+  // Except for the two exceptions (__stack_pointer and __cpp_exception), all
+  // other external symbols used by CodeGen are functions. It's OK to hardcode
+  // knowledge of specific symbols here; this method is precisely there for
+  // fetching the signatures of known Clang-provided symbols.
   if (strcmp(Name, "__stack_pointer") == 0) {
     WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
     WasmSym->setGlobalType(wasm::WasmGlobalType{
@@ -90,24 +90,45 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
 
   SmallVector<wasm::ValType, 4> Returns;
   SmallVector<wasm::ValType, 4> Params;
-  GetLibcallSignature(Subtarget, Name, Returns, Params);
+  if (strcmp(Name, "__cpp_exception") == 0) {
+    WasmSym->setType(wasm::WASM_SYMBOL_TYPE_EVENT);
+    // We can't confirm its signature index for now because there can be
+    // imported exceptions. Set it to be 0 for now.
+    WasmSym->setEventType(
+        {wasm::WASM_EVENT_ATTRIBUTE_EXCEPTION, /* SigIndex */ 0});
+    // We may have multiple C++ compilation units to be linked together, each of
+    // which defines the exception symbol. To resolve them, we declare them as
+    // weak.
+    WasmSym->setWeak(true);
+    WasmSym->setExternal(true);
+
+    // All C++ exceptions are assumed to have a single i32 (for wasm32) or i64
+    // (for wasm64) param type and void return type. The reaon is, all C++
+    // exception values are pointers, and to share the type section with
+    // functions, exceptions are assumed to have void return type.
+    Params.push_back(Subtarget.hasAddr64() ? wasm::ValType::I64
+                                           : wasm::ValType::I32);
+  } else { // Function symbols
+    WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+    GetLibcallSignature(Subtarget, Name, Returns, Params);
+  }
   auto Signature =
       make_unique<wasm::WasmSignature>(std::move(Returns), std::move(Params));
   WasmSym->setSignature(Signature.get());
   Printer.addSignature(std::move(Signature));
-  WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
 
   return WasmSym;
 }
 
 MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
                                                      int64_t Offset,
-                                                     bool IsFunc,
-                                                     bool IsGlob) const {
+                                                     bool IsFunc, bool IsGlob,
+                                                     bool IsEvent) const {
   MCSymbolRefExpr::VariantKind VK =
       IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION
              : IsGlob ? MCSymbolRefExpr::VK_WebAssembly_GLOBAL
-                      : MCSymbolRefExpr::VK_None;
+                      : IsEvent ? MCSymbolRefExpr::VK_WebAssembly_EVENT
+                                : MCSymbolRefExpr::VK_None;
 
   const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx);
 
@@ -116,6 +137,8 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
       report_fatal_error("Function addresses with offsets not supported");
     if (IsGlob)
       report_fatal_error("Global indexes with offsets not supported");
+    if (IsEvent)
+      report_fatal_error("Event indexes with offsets not supported");
     Expr =
         MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx);
   }
@@ -218,7 +241,7 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
              "WebAssembly does not use target flags on GlobalAddresses");
       MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(),
                                 MO.getGlobal()->getValueType()->isFunctionTy(),
-                                false);
+                                false, false);
       break;
     case MachineOperand::MO_ExternalSymbol:
       // The target flag indicates whether this is a symbol for a
@@ -228,14 +251,16 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
       MCOp = LowerSymbolOperand(
           GetExternalSymbolSymbol(MO), /*Offset=*/0,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0,
-          (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0);
+          (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0,
+          (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_EVENT) != 0);
       break;
     case MachineOperand::MO_MCSymbol:
       // This is currently used only for LSDA symbols (GCC_except_table),
       // because global addresses or other external symbols are handled above.
       assert(MO.getTargetFlags() == 0 &&
              "WebAssembly does not use target flags on MCSymbol");
-      MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false);
+      MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false,
+                                false);
       break;
     }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index 54a55796415488953771314daa96acf8731fef18..fa7a0ea61b3b386a95a455a2174227823899f2ed 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -34,7 +34,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
   MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
   MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
   MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset, bool IsFunc,
-                               bool IsGlob) const;
+                               bool IsGlob, bool IsEvent) const;
 
 public:
   WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer)
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 073706e567e74caae137c8bec3dacc757646f305..0157af0f85109dbaa85d30d8d041f1876f1fe881 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -64,13 +64,17 @@ void llvm::ComputeSignatureVTs(const FunctionType *Ty, const Function &F,
     Params.push_back(PtrVT);
 }
 
+void llvm::ValTypesFromMVTs(const ArrayRef<MVT> &In,
+                            SmallVectorImpl<wasm::ValType> &Out) {
+  for (MVT Ty : In)
+    Out.push_back(WebAssembly::toValType(Ty));
+}
+
 std::unique_ptr<wasm::WasmSignature>
 llvm::SignatureFromMVTs(const SmallVectorImpl<MVT> &Results,
                         const SmallVectorImpl<MVT> &Params) {
   auto Sig = make_unique<wasm::WasmSignature>();
-  for (MVT Ty : Results)
-    Sig->Returns.push_back(WebAssembly::toValType(Ty));
-  for (MVT Ty : Params)
-    Sig->Params.push_back(WebAssembly::toValType(Ty));
+  ValTypesFromMVTs(Results, Sig->Returns);
+  ValTypesFromMVTs(Params, Sig->Params);
   return Sig;
 }
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index cde44d24599a209119fc720068c0c412bb941f57..4be4beb85d048b560d9809b7ca021c82faa9eecd 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -129,6 +129,9 @@ void ComputeSignatureVTs(const FunctionType *Ty, const Function &F,
                          const TargetMachine &TM, SmallVectorImpl<MVT> &Params,
                          SmallVectorImpl<MVT> &Results);
 
+void ValTypesFromMVTs(const ArrayRef<MVT> &In,
+                      SmallVectorImpl<wasm::ValType> &Out);
+
 std::unique_ptr<wasm::WasmSignature>
 SignatureFromMVTs(const SmallVectorImpl<MVT> &Results,
                   const SmallVectorImpl<MVT> &Params);
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 954c8a6b50b8e585443c32fd7c1d46af8491feff..a6fb78280dad82cd22c99d5dd1a5e3df7906a3d3 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -82,8 +82,12 @@ extern "C" void LLVMInitializeWebAssemblyTarget() {
 //===----------------------------------------------------------------------===//
 
 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
-  if (!RM.hasValue())
-    return Reloc::PIC_;
+  if (!RM.hasValue()) {
+    // Default to static relocation model.  This should always be more optimial
+    // than PIC since the static linker can determine all global addresses and
+    // assume direct function calls.
+    return Reloc::Static;
+  }
   return *RM;
 }
 
@@ -97,7 +101,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
                         TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128"
                                          : "e-m:e-p:32:32-i64:64-n32:64-S128",
                         TT, CPU, FS, Options, getEffectiveRelocModel(RM),
-                        CM ? *CM : CodeModel::Large, OL),
+                        getEffectiveCodeModel(CM, CodeModel::Large), OL),
       TLOF(new WebAssemblyTargetObjectFile()) {
   // WebAssembly type-checks instructions, but a noreturn function with a return
   // type that doesn't match the context will cause a check failure. So we lower
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 5ded1f971a035c4a997db32d145f897e9d1e8e80..524b4ae53be12f8df06ba4718ee3929ca108c560 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -30,6 +30,7 @@ set(sources
   X86CmovConversion.cpp
   X86CondBrFolding.cpp
   X86DomainReassignment.cpp
+  X86DiscriminateMemOps.cpp
   X86ExpandPseudo.cpp
   X86FastISel.cpp
   X86FixupBWInsts.cpp
@@ -44,6 +45,7 @@ set(sources
   X86ISelLowering.cpp
   X86IndirectBranchTracking.cpp
   X86InterleavedAccess.cpp
+  X86InsertPrefetch.cpp
   X86InstrFMA3Info.cpp
   X86InstrFoldTables.cpp
   X86InstrInfo.cpp
diff --git a/lib/Target/X86/LLVMBuild.txt b/lib/Target/X86/LLVMBuild.txt
index 2062163afb054fbba74b2ce79b49201cd0847e83..055336baac195e30b1e50813f507e7e7eefb1234 100644
--- a/lib/Target/X86/LLVMBuild.txt
+++ b/lib/Target/X86/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
 type = Library
 name = X86CodeGen
 parent = X86
-required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target X86AsmPrinter X86Desc X86Info X86Utils GlobalISel
+required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target X86AsmPrinter X86Desc X86Info X86Utils GlobalISel ProfileData
 add_to_library_groups = X86
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 362ca96ac0939472f15e22c957f8df81f91f7256..ea4aaf14223d1ff8545218a79e9966d8bca104df 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -39,7 +39,7 @@ using namespace llvm;
 #include "X86GenRegisterInfo.inc"
 
 #define GET_INSTRINFO_MC_DESC
-#define GET_GENINSTRINFO_MC_HELPERS
+#define GET_INSTRINFO_MC_HELPERS
 #include "X86GenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_MC_DESC
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 595c26d31e3f7920cb8656799ad8341e145fffa8..4e9f5ba60d2e4ebc16837d4a90467c7c4451222c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -134,7 +134,7 @@ unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
 // Defines symbolic names for the X86 instructions.
 //
 #define GET_INSTRINFO_ENUM
-#define GET_GENINSTRINFO_MC_DECL
+#define GET_INSTRINFO_MC_HELPER_DECLS
 #include "X86GenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_ENUM
diff --git a/lib/Target/X86/Utils/LLVMBuild.txt b/lib/Target/X86/Utils/LLVMBuild.txt
index de0a30fa19c8082c7d3bb5df0f2035ebad8d6da0..fdb886f53a082030076fb6ed61b96ba57c43718c 100644
--- a/lib/Target/X86/Utils/LLVMBuild.txt
+++ b/lib/Target/X86/Utils/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = X86Utils
 parent = X86
-required_libraries = Core Support
+required_libraries = Support
 add_to_library_groups = X86
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 19f8e35ade04bf70cc1e3982b9164ec307791872..1c8813815b863a339732d69feb4e163a5de9ed05 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -122,6 +122,13 @@ FunctionPass *createX86EvexToVexInsts();
 /// This pass creates the thunks for the retpoline feature.
 FunctionPass *createX86RetpolineThunksPass();
 
+/// This pass ensures instructions featuring a memory operand
+/// have distinctive <LineNumber, Discriminator> (with respect to eachother)
+FunctionPass *createX86DiscriminateMemOpsPass();
+
+/// This pass applies profiling information to insert cache prefetches.
+FunctionPass *createX86InsertPrefetchPass();
+
 InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   X86Subtarget &,
                                                   X86RegisterBankInfo &);
@@ -136,6 +143,7 @@ void initializeWinEHStatePassPass(PassRegistry &);
 void initializeX86AvoidSFBPassPass(PassRegistry &);
 void initializeX86CallFrameOptimizationPass(PassRegistry &);
 void initializeX86CmovConverterPassPass(PassRegistry &);
+void initializeX86CondBrFoldingPassPass(PassRegistry &);
 void initializeX86DomainReassignmentPass(PassRegistry &);
 void initializeX86ExecutionDomainFixPass(PassRegistry &);
 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 74135656528d5a73a7130f972335b70a6f5002a3..6b1749fc75005965e9e6908cc869ce965070031b 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -883,6 +883,17 @@ class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
 def : SkylakeServerProc<"skylake-avx512">;
 def : SkylakeServerProc<"skx">; // Legacy alias.
 
+def CLXFeatures : ProcessorFeatures<SKXFeatures.Value, [
+  FeatureVNNI
+]>;
+
+class CascadelakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
+                                              CLXFeatures.Value, [
+  FeatureHasFastGather,
+  FeaturePOPCNTFalseDeps
+]>;
+def : CascadelakeProc<"cascadelake">;
+
 def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [
   FeatureAVX512,
   FeatureCDI,
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 74dbdcd27930dcc65d5cee676e7724e87c447e4f..b4be9edf1666b7f7ef0bf701812c1b923b84da5a 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -88,19 +88,19 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
 void X86AsmPrinter::EmitFunctionBodyStart() {
   if (EmitFPOData) {
-    X86TargetStreamer *XTS =
-        static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
-    unsigned ParamsSize =
-        MF->getInfo<X86MachineFunctionInfo>()->getArgumentStackSize();
-    XTS->emitFPOProc(CurrentFnSym, ParamsSize);
+    if (auto *XTS =
+        static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()))
+      XTS->emitFPOProc(
+          CurrentFnSym,
+          MF->getInfo<X86MachineFunctionInfo>()->getArgumentStackSize());
   }
 }
 
 void X86AsmPrinter::EmitFunctionBodyEnd() {
   if (EmitFPOData) {
-    X86TargetStreamer *XTS =
-        static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
-    XTS->emitFPOEndProc();
+    if (auto *XTS =
+            static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()))
+      XTS->emitFPOEndProc();
   }
 }
 
@@ -674,7 +674,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
     emitNonLazyStubs(MMI, *OutStreamer);
 
     // Emit stack and fault map information.
-    SM.serializeToStackMapSection();
+    emitStackMaps(SM);
     FM.serializeToFaultMapSection();
 
     // This flag tells the linker that no global symbols contain code that fall
@@ -695,12 +695,12 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
   }
 
   if (TT.isOSBinFormatCOFF()) {
-    SM.serializeToStackMapSection();
+    emitStackMaps(SM);
     return;
   }
 
   if (TT.isOSBinFormatELF()) {
-    SM.serializeToStackMapSection();
+    emitStackMaps(SM);
     FM.serializeToFaultMapSection();
     return;
   }
diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index eb9c4b3e5977e4a0b45e4c9b45659bc6bb9d6370..2850baf7a65e7a898190f1d5d7943e4085fd43cc 100644
--- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -586,7 +586,7 @@ void X86AvoidSFBPass::breakBlockedCopies(
       StDisp2 += OverlapDelta;
       Size2 -= OverlapDelta;
     }
-    Size1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1));
+    Size1 = LdDisp2 - LdDisp1;
 
     // Build a copy for the point until the current blocking store's
     // displacement.
diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp
index 1d221930c2a46b3a4bd5eac60794181b0c95a2e9..7ce443c4656af40827ff32455a51b0eb558705cc 100644
--- a/lib/Target/X86/X86CondBrFolding.cpp
+++ b/lib/Target/X86/X86CondBrFolding.cpp
@@ -62,8 +62,9 @@ STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded");
 namespace {
 class X86CondBrFoldingPass : public MachineFunctionPass {
 public:
-  X86CondBrFoldingPass() : MachineFunctionPass(ID) {}
-
+  X86CondBrFoldingPass() : MachineFunctionPass(ID) {
+    initializeX86CondBrFoldingPassPass(*PassRegistry::getPassRegistry());
+  }
   StringRef getPassName() const override { return "X86 CondBr Folding"; }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -73,12 +74,13 @@ public:
     AU.addRequired<MachineBranchProbabilityInfo>();
   }
 
-private:
+public:
   static char ID;
 };
+} // namespace
 
 char X86CondBrFoldingPass::ID = 0;
-} // namespace
+INITIALIZE_PASS(X86CondBrFoldingPass, "X86CondBrFolding", "X86CondBrFolding", false, false)
 
 FunctionPass *llvm::createX86CondBrFolding() {
   return new X86CondBrFoldingPass();
@@ -466,7 +468,8 @@ bool X86CondBrFolding::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
     break;
   }
   SrcReg = MI.getOperand(SrcRegIndex).getReg();
-  assert(MI.getOperand(ValueIndex).isImm() && "Expecting Imm operand");
+  if (!MI.getOperand(ValueIndex).isImm())
+    return false;
   CmpValue = MI.getOperand(ValueIndex).getImm();
   return true;
 }
diff --git a/lib/Target/X86/X86DiscriminateMemOps.cpp b/lib/Target/X86/X86DiscriminateMemOps.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5653babedf8eccd67fcb41be29fd61c377c6000a
--- /dev/null
+++ b/lib/Target/X86/X86DiscriminateMemOps.cpp
@@ -0,0 +1,145 @@
+//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass aids profile-driven cache prefetch insertion by ensuring all
+/// instructions that have a memory operand are distinguishible from each other.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-discriminate-memops"
+
+namespace {
+
+using Location = std::pair<StringRef, unsigned>;
+
+Location diToLocation(const DILocation *Loc) {
+  return std::make_pair(Loc->getFilename(), Loc->getLine());
+}
+
+/// Ensure each instruction having a memory operand has a distinct <LineNumber,
+/// Discriminator> pair.
+void updateDebugInfo(MachineInstr *MI, const DILocation *Loc) {
+  DebugLoc DL(Loc);
+  MI->setDebugLoc(DL);
+}
+
+class X86DiscriminateMemOps : public MachineFunctionPass {
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override {
+    return "X86 Discriminate Memory Operands";
+  }
+
+public:
+  static char ID;
+
+  /// Default construct and initialize the pass.
+  X86DiscriminateMemOps();
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//            Implementation
+//===----------------------------------------------------------------------===//
+
+char X86DiscriminateMemOps::ID = 0;
+
+/// Default construct and initialize the pass.
+X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {}
+
+bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
+  DISubprogram *FDI = MF.getFunction().getSubprogram();
+  if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling())
+    return false;
+
+  // Have a default DILocation, if we find instructions with memops that don't
+  // have any debug info.
+  const DILocation *ReferenceDI =
+      DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI);
+
+  DenseMap<Location, unsigned> MemOpDiscriminators;
+  MemOpDiscriminators[diToLocation(ReferenceDI)] = 0;
+
+  // Figure out the largest discriminator issued for each Location. When we
+  // issue new discriminators, we can thus avoid issuing discriminators
+  // belonging to instructions that don't have memops. This isn't a requirement
+  // for the goals of this pass, however, it avoids unnecessary ambiguity.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      const auto &DI = MI.getDebugLoc();
+      if (!DI)
+        continue;
+      Location Loc = diToLocation(DI);
+      MemOpDiscriminators[Loc] =
+          std::max(MemOpDiscriminators[Loc], DI->getBaseDiscriminator());
+    }
+  }
+
+  // Keep track of the discriminators seen at each Location. If an instruction's
+  // DebugInfo has a Location and discriminator we've already seen, replace its
+  // discriminator with a new one, to guarantee uniqueness.
+  DenseMap<Location, DenseSet<unsigned>> Seen;
+
+  bool Changed = false;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0)
+        continue;
+      const DILocation *DI = MI.getDebugLoc();
+      if (!DI) {
+        DI = ReferenceDI;
+      }
+      DenseSet<unsigned> &Set = Seen[diToLocation(DI)];
+      const std::pair<DenseSet<unsigned>::iterator, bool> TryInsert =
+          Set.insert(DI->getBaseDiscriminator());
+      if (!TryInsert.second) {
+        DI = DI->setBaseDiscriminator(++MemOpDiscriminators[diToLocation(DI)]);
+        updateDebugInfo(&MI, DI);
+        Changed = true;
+        const std::pair<DenseSet<unsigned>::iterator, bool> MustInsert =
+            Set.insert(DI->getBaseDiscriminator());
+        // FIXME (mtrofin): check if the to-be inserted base discriminator can
+        // be added. This requires a new API on DILocation.
+        // The assumption is that this scenario is infrequent/OK not to support.
+        // If evidence points otherwise, we can explore synthesize unique DIs by
+        // adding fake line numbers.
+        if (!MustInsert.second) {
+          LLVM_DEBUG(dbgs()
+                     << "Unable to create a unique discriminator in "
+                     << DI->getFilename() << " Line: " << DI->getLine()
+                     << " Column: " << DI->getColumn()
+                     << ". This is likely due to a large macro expansion.\n");
+        }
+      }
+
+      // Bump the reference DI to avoid cramming discriminators on line 0.
+      // FIXME(mtrofin): pin ReferenceDI on blocks or first instruction with DI
+      // in a block. It's more consistent than just relying on the last memop
+      // instruction we happened to see.
+      ReferenceDI = DI;
+    }
+  }
+  return Changed;
+}
+
+FunctionPass *llvm::createX86DiscriminateMemOpsPass() {
+  return new X86DiscriminateMemOps();
+}
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index a49ad8bd59dfc840c7a56d8c0fed2132db8bd72d..cbfdc4b3b93b3cb7b478ac58896ec484f4388b20 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -3998,7 +3998,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
   }
 
   Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
-  MI->eraseFromParent();
+  MachineBasicBlock::iterator I(MI);
+  removeDeadCode(I, std::next(I));
   return true;
 }
 
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index e40b0f81e3306b8a690272c7429f320753192385..d722c75c4f0a516a924bca273c89483190c02caf 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -2270,9 +2270,15 @@ void X86FrameLowering::adjustForSegmentedStacks(
 
   // Do not generate a prologue for leaf functions with a stack of size zero.
   // For non-leaf functions we have to allow for the possibility that the
-  // call is to a non-split function, as in PR37807.
-  if (StackSize == 0 && !MFI.hasTailCall())
+  // callis to a non-split function, as in PR37807. This function could also
+  // take the address of a non-split function. When the linker tries to adjust
+  // its non-existent prologue, it would fail with an error. Mark the object
+  // file so that such failures are not errors. See this Go language bug-report
+  // https://go-review.googlesource.com/c/go/+/148819/
+  if (StackSize == 0 && !MFI.hasTailCall()) {
+    MF.getMMI().setHasNosplitStack(true);
     return;
+  }
 
   MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
   MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 16819f4451c2d2b23d311042281c8591b51e14ee..43d0538c4cda660d9646c6335e802ac22890029b 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2192,17 +2192,17 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
 }
 
 /// Test whether the given X86ISD::CMP node has any uses which require the SF
-/// or OF bits to be accurate.
-static bool hasNoSignedComparisonUses(SDNode *N) {
+/// flag to be accurate.
+static bool hasNoSignFlagUses(SDValue Flags) {
   // Examine each user of the node.
-  for (SDNode::use_iterator UI = N->use_begin(),
-         UE = N->use_end(); UI != UE; ++UI) {
-    // Only examine CopyToReg uses.
-    if (UI->getOpcode() != ISD::CopyToReg)
-      return false;
+  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+         UI != UE; ++UI) {
+    // Only check things that use the flags.
+    if (UI.getUse().getResNo() != Flags.getResNo())
+      continue;
     // Only examine CopyToReg uses that copy to EFLAGS.
-    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
-          X86::EFLAGS)
+    if (UI->getOpcode() != ISD::CopyToReg ||
+        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
       return false;
     // Examine each user of the CopyToReg use.
     for (SDNode::use_iterator FlagUI = UI->use_begin(),
@@ -2215,11 +2215,14 @@ static bool hasNoSignedComparisonUses(SDNode *N) {
       switch (FlagUI->getMachineOpcode()) {
       // These comparisons don't treat the most significant bit specially.
       case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
-      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+      case X86::SETEr: case X86::SETNEr: case X86::SETOr: case X86::SETNOr:
+      case X86::SETPr: case X86::SETNPr:
       case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
-      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+      case X86::SETEm: case X86::SETNEm: case X86::SETOm: case X86::SETNOm:
+      case X86::SETPm: case X86::SETNPm:
       case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
-      case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
+      case X86::JE_1: case X86::JNE_1: case X86::JO_1: case X86::JNO_1:
+      case X86::JP_1: case X86::JNP_1:
       case X86::CMOVA16rr: case X86::CMOVA16rm:
       case X86::CMOVA32rr: case X86::CMOVA32rm:
       case X86::CMOVA64rr: case X86::CMOVA64rm:
@@ -2241,6 +2244,9 @@ static bool hasNoSignedComparisonUses(SDNode *N) {
       case X86::CMOVNP16rr: case X86::CMOVNP16rm:
       case X86::CMOVNP32rr: case X86::CMOVNP32rm:
       case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+      case X86::CMOVO16rr: case X86::CMOVO16rm:
+      case X86::CMOVO32rr: case X86::CMOVO32rm:
+      case X86::CMOVO64rr: case X86::CMOVO64rm:
       case X86::CMOVP16rr: case X86::CMOVP16rm:
       case X86::CMOVP32rr: case X86::CMOVP32rm:
       case X86::CMOVP64rr: case X86::CMOVP64rm:
@@ -2255,18 +2261,16 @@ static bool hasNoSignedComparisonUses(SDNode *N) {
 
 /// Test whether the given node which sets flags has any uses which require the
 /// CF flag to be accurate.
-static bool hasNoCarryFlagUses(SDNode *N) {
+static bool hasNoCarryFlagUses(SDValue Flags) {
   // Examine each user of the node.
-  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
-       ++UI) {
+  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+         UI != UE; ++UI) {
     // Only check things that use the flags.
-    if (UI.getUse().getResNo() != 1)
+    if (UI.getUse().getResNo() != Flags.getResNo())
       continue;
-    // Only examine CopyToReg uses.
-    if (UI->getOpcode() != ISD::CopyToReg)
-      return false;
     // Only examine CopyToReg uses that copy to EFLAGS.
-    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
+    if (UI->getOpcode() != ISD::CopyToReg ||
+        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
       return false;
     // Examine each user of the CopyToReg use.
     for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
@@ -2631,7 +2635,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
             (-OperandV).getMinSignedBits() <= 8) ||
            (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 &&
             (-OperandV).getMinSignedBits() <= 32)) &&
-          hasNoCarryFlagUses(StoredVal.getNode())) {
+          hasNoCarryFlagUses(StoredVal.getValue(1))) {
         OperandV = -OperandV;
         Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
       }
@@ -2858,10 +2862,28 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
   // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
 
   // Shift NBits left by 8 bits, thus producing 'control'.
+  // This makes the low 8 bits to be zero.
   SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
   SDValue Control = CurDAG->getNode(ISD::SHL, DL, NVT, NBits, C8);
   insertDAGNode(*CurDAG, OrigNBits, Control);
-  // NOTE: could also try to extract  start  from  (x >> start)
+
+  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
+  if (X.getOpcode() == ISD::SRL) {
+    SDValue ShiftAmt = X.getOperand(1);
+    X = X.getOperand(0);
+
+    assert(ShiftAmt.getValueType() == MVT::i8 &&
+           "Expected shift amount to be i8");
+
+    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
+    SDValue OrigShiftAmt = ShiftAmt;
+    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, NVT, ShiftAmt);
+    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
+
+    // And now 'or' these low 8 bits of shift amount into the 'control'.
+    Control = CurDAG->getNode(ISD::OR, DL, NVT, Control, ShiftAmt);
+    insertDAGNode(*CurDAG, OrigNBits, Control);
+  }
 
   // And finally, form the BEXTR itself.
   SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, NVT, X, Control);
@@ -3392,14 +3414,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
 
     unsigned Opc, MOpc;
     bool isSigned = Opcode == ISD::SMUL_LOHI;
-    bool hasBMI2 = Subtarget->hasBMI2();
     if (!isSigned) {
       switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
-      case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
-                     MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
-      case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
-                     MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
+      case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+      case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
       }
     } else {
       switch (NVT.SimpleTy) {
@@ -3420,12 +3439,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     case X86::MUL64r:
       SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
       break;
-    case X86::MULX32rr:
-      SrcReg = X86::EDX; LoReg = HiReg = 0;
-      break;
-    case X86::MULX64rr:
-      SrcReg = X86::RDX; LoReg = HiReg = 0;
-      break;
     }
 
     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
@@ -3439,26 +3452,15 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
 
     SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
                                           N0, SDValue()).getValue(1);
-    SDValue ResHi, ResLo;
-
     if (foldedLoad) {
       SDValue Chain;
       MachineSDNode *CNode = nullptr;
       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                         InFlag };
-      if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
-        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
-        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
-        ResHi = SDValue(CNode, 0);
-        ResLo = SDValue(CNode, 1);
-        Chain = SDValue(CNode, 2);
-        InFlag = SDValue(CNode, 3);
-      } else {
-        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
-        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
-        Chain = SDValue(CNode, 0);
-        InFlag = SDValue(CNode, 1);
-      }
+      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+      Chain = SDValue(CNode, 0);
+      InFlag = SDValue(CNode, 1);
 
       // Update the chain.
       ReplaceUses(N1.getValue(1), Chain);
@@ -3466,39 +3468,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     } else {
       SDValue Ops[] = { N1, InFlag };
-      if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
-        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
-        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
-        ResHi = SDValue(CNode, 0);
-        ResLo = SDValue(CNode, 1);
-        InFlag = SDValue(CNode, 2);
-      } else {
-        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
-        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
-        InFlag = SDValue(CNode, 0);
-      }
+      SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+      SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+      InFlag = SDValue(CNode, 0);
     }
 
     // Copy the low half of the result, if it is needed.
     if (!SDValue(Node, 0).use_empty()) {
-      if (!ResLo.getNode()) {
-        assert(LoReg && "Register for low half is not defined!");
-        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
-                                       InFlag);
-        InFlag = ResLo.getValue(2);
-      }
+      assert(LoReg && "Register for low half is not defined!");
+      SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
+                                             NVT, InFlag);
+      InFlag = ResLo.getValue(2);
       ReplaceUses(SDValue(Node, 0), ResLo);
       LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                  dbgs() << '\n');
     }
     // Copy the high half of the result, if it is needed.
     if (!SDValue(Node, 1).use_empty()) {
-      if (!ResHi.getNode()) {
-        assert(HiReg && "Register for high half is not defined!");
-        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
-                                       InFlag);
-        InFlag = ResHi.getValue(2);
-      }
+      assert(HiReg && "Register for high half is not defined!");
+      SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
+                                             NVT, InFlag);
+      InFlag = ResHi.getValue(2);
       ReplaceUses(SDValue(Node, 1), ResHi);
       LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                  dbgs() << '\n');
@@ -3731,7 +3721,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
 
       if (isUInt<8>(Mask) &&
           (!(Mask & 0x80) || CmpVT == MVT::i8 ||
-           hasNoSignedComparisonUses(Node))) {
+           hasNoSignFlagUses(SDValue(Node, 0)))) {
         // For example, convert "testl %eax, $8" to "testb %al, $8"
         VT = MVT::i8;
         SubRegOp = X86::sub_8bit;
@@ -3739,7 +3729,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         MOpc = X86::TEST8mi;
       } else if (OptForMinSize && isUInt<16>(Mask) &&
                  (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
-                  hasNoSignedComparisonUses(Node))) {
+                  hasNoSignFlagUses(SDValue(Node, 0)))) {
         // For example, "testl %eax, $32776" to "testw %ax, $32776".
         // NOTE: We only want to form TESTW instructions if optimizing for
         // min size. Otherwise we only save one byte and possibly get a length
@@ -3753,7 +3743,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
                    // Without minsize 16-bit Cmps can get here so we need to
                    // be sure we calculate the correct sign flag if needed.
                    (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
-                  CmpVT == MVT::i32 || hasNoSignedComparisonUses(Node))) {
+                  CmpVT == MVT::i32 ||
+                  hasNoSignFlagUses(SDValue(Node, 0)))) {
         // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
         // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
         // Otherwize, we find ourselves in a position where we have to do
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 38d3a30cb19e8f6be885a339ae8957a63cfdfb33..6eb21ab7087d9c001aac3bf9e227fe76894d262e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19,7 +19,6 @@
 #include "X86InstrBuilder.h"
 #include "X86IntrinsicsInfo.h"
 #include "X86MachineFunctionInfo.h"
-#include "X86ShuffleDecodeConstantPool.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "llvm/ADT/SmallBitVector.h"
@@ -196,6 +195,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::ABS          , MVT::i64  , Custom);
   }
 
+  // Funnel shifts.
+  for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
+    setOperationAction(ShiftOp             , MVT::i16  , Custom);
+    setOperationAction(ShiftOp             , MVT::i32  , Custom);
+    if (Subtarget.is64Bit())
+      setOperationAction(ShiftOp           , MVT::i64  , Custom);
+  }
+
   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
   // operation.
   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
@@ -786,14 +793,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                     : &X86::VR128RegClass);
 
-    setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
-    setOperationAction(ISD::SREM, MVT::v2i32, Custom);
-    setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
-    setOperationAction(ISD::UREM, MVT::v2i32, Custom);
+    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
+                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
+      setOperationAction(ISD::SDIV, VT, Custom);
+      setOperationAction(ISD::SREM, VT, Custom);
+      setOperationAction(ISD::UDIV, VT, Custom);
+      setOperationAction(ISD::UREM, VT, Custom);
+    }
 
     setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
     setOperationAction(ISD::MUL,                MVT::v2i16, Custom);
     setOperationAction(ISD::MUL,                MVT::v2i32, Custom);
+    setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
+    setOperationAction(ISD::MUL,                MVT::v4i16, Custom);
+    setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);
 
     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
@@ -816,6 +829,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
     }
 
+    setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
+    setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
+    setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
+    setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
+    setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
+    setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
+    setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
+    setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
+    // Use widening instead of promotion.
+    for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
+                     MVT::v4i16, MVT::v2i16 }) {
+      setOperationAction(ISD::UADDSAT, VT, Custom);
+      setOperationAction(ISD::SADDSAT, VT, Custom);
+      setOperationAction(ISD::USUBSAT, VT, Custom);
+      setOperationAction(ISD::SSUBSAT, VT, Custom);
+    }
+
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
@@ -850,9 +880,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // scalars) and extend in-register to a legal 128-bit vector type. For sext
     // loads these must work with a single scalar load.
     for (MVT VT : MVT::integer_vector_valuetypes()) {
-      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
-      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
-      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+      if (!ExperimentalVectorWideningLegalization) {
+        // We don't want narrow result types here when widening.
+        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
+        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
+        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+      }
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
@@ -861,6 +894,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
     }
 
+    if (ExperimentalVectorWideningLegalization &&
+        !Subtarget.hasSSE41() && Subtarget.is64Bit()) {
+      // This lets DAG combine create sextloads that get split and scalarized.
+      // TODO: Does this make sense? What about v2i8->v2i64?
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8,  Custom);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8,  Custom);
+    }
+
     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
@@ -883,10 +924,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i16, Custom);
-    // Custom legalize these to avoid over promotion.
+
+    // Custom legalize these to avoid over promotion or custom promotion.
     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i8,  Custom);
-    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i16, Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i8,  Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i8,  Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i16, Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i16, Custom);
     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i8,  Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i8,  Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i8,  Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i16, Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i16, Custom);
+
+    // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into
+    // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
+    // split again based on the input type, this will cause an AssertSExt i16 to
+    // be emitted instead of an AssertZExt. This will allow packssdw followed by
+    // packuswb to be used to truncate to v8i8. This is necessary since packusdw
+    // isn't available until sse4.1.
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
 
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
@@ -906,7 +963,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
     // store.
     setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
     setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
+    setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
+    setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
+    setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);
 
     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
@@ -918,6 +981,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
 
+    if (ExperimentalVectorWideningLegalization) {
+      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+
+      setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
+      setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
+      setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
+      setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
+      setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
+      setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);
+    }
+
     // In the customized shift lowering, the legal v4i32/v2i64 cases
     // in AVX2 will be recognized.
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
@@ -928,7 +1002,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::ROTL,               MVT::v4i32, Custom);
     setOperationAction(ISD::ROTL,               MVT::v8i16, Custom);
-    setOperationAction(ISD::ROTL,               MVT::v16i8, Custom);
+
+    // With BWI, expanding (and promoting the shifts) is the better.
+    if (!Subtarget.hasBWI())
+      setOperationAction(ISD::ROTL,             MVT::v16i8, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -974,17 +1051,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
     }
 
-    for (MVT VT : MVT::integer_vector_valuetypes()) {
-      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
-      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
-      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+    if (!ExperimentalVectorWideningLegalization) {
+      // Avoid narrow result types when widening. The legal types are listed
+      // in the next loop.
+      for (MVT VT : MVT::integer_vector_valuetypes()) {
+        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+        setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+      }
     }
 
     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
-      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8,  Legal);
+      if (!ExperimentalVectorWideningLegalization)
+        setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8,  Legal);
       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
@@ -1060,9 +1142,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SRA, VT, Custom);
     }
 
+    if (ExperimentalVectorWideningLegalization) {
+      // These types need custom splitting if their input is a 128-bit vector.
+      setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
+      setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
+      setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
+      setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
+    }
+
     setOperationAction(ISD::ROTL,              MVT::v8i32,  Custom);
     setOperationAction(ISD::ROTL,              MVT::v16i16, Custom);
-    setOperationAction(ISD::ROTL,              MVT::v32i8,  Custom);
+
+    // With BWI, expanding (and promoting the shifts) is the better.
+    if (!Subtarget.hasBWI())
+      setOperationAction(ISD::ROTL,            MVT::v32i8,  Custom);
 
     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
@@ -1124,6 +1217,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
     setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
 
+    setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
+
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
@@ -1241,6 +1343,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SETCC,            VT, Custom);
       setOperationAction(ISD::SELECT,           VT, Custom);
       setOperationAction(ISD::TRUNCATE,         VT, Custom);
+      setOperationAction(ISD::UADDSAT,          VT, Custom);
+      setOperationAction(ISD::SADDSAT,          VT, Custom);
+      setOperationAction(ISD::USUBSAT,          VT, Custom);
+      setOperationAction(ISD::SSUBSAT,          VT, Custom);
 
       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -1324,6 +1430,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
 
+    if (ExperimentalVectorWideningLegalization) {
+      // Need to custom widen this if we don't have AVX512BW.
+      setOperationAction(ISD::ANY_EXTEND,         MVT::v8i8, Custom);
+      setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i8, Custom);
+      setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i8, Custom);
+    }
+
     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
       setOperationAction(ISD::FFLOOR,           VT, Legal);
       setOperationAction(ISD::FCEIL,            VT, Legal);
@@ -1332,12 +1445,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FNEARBYINT,       VT, Legal);
     }
 
-    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
-    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
-
     // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
-    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
-    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+    for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+    }
 
     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
@@ -1495,6 +1607,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SUB,                VT, Custom);
       setOperationAction(ISD::MUL,                VT, Custom);
       setOperationAction(ISD::VSELECT,            VT, Expand);
+      setOperationAction(ISD::UADDSAT,            VT, Custom);
+      setOperationAction(ISD::SADDSAT,            VT, Custom);
+      setOperationAction(ISD::USUBSAT,            VT, Custom);
+      setOperationAction(ISD::SSUBSAT,            VT, Custom);
 
       setOperationAction(ISD::TRUNCATE,           VT, Custom);
       setOperationAction(ISD::SETCC,              VT, Custom);
@@ -1555,6 +1671,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
 
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
 
     setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
 
@@ -1574,6 +1691,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SMIN,         VT, Legal);
       setOperationAction(ISD::UMIN,         VT, Legal);
       setOperationAction(ISD::SETCC,        VT, Custom);
+      setOperationAction(ISD::UADDSAT,      VT, Legal);
+      setOperationAction(ISD::SADDSAT,      VT, Legal);
+      setOperationAction(ISD::USUBSAT,      VT, Legal);
+      setOperationAction(ISD::SSUBSAT,      VT, Legal);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1736,8 +1857,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
-  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
-  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
@@ -4709,6 +4828,14 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   return true;
 }
 
+bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
+  // If we are using XMM registers in the ABI and the condition of the select is
+  // a floating-point compare and we have blendv or conditional move, then it is
+  // cheaper to select instead of doing a cross-register move and creating a
+  // load that depends on the compare result.
+  return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
+}
+
 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
   // TODO: It might be a win to ease or lift this restriction, but the generic
   // folds in DAGCombiner conflict with vector folds for an AVX512 target.
@@ -4737,6 +4864,12 @@ bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
          (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
 }
 
+bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
+                                                 bool IsSigned) const {
+  // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available.
+  return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
+}
+
 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                 unsigned Index) const {
   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
@@ -4763,7 +4896,11 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
 
 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
                                                 EVT BitcastVT) const {
-  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
+  if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
+      BitcastVT.getVectorElementType() == MVT::i1)
+    return false;
+
+  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;
 
   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
@@ -5456,25 +5593,29 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
   return DAG.getBitcast(VT, Vec);
 }
 
-static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
+static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
                               SelectionDAG &DAG) {
   EVT InVT = In.getValueType();
-  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
-
-  if (VT.is128BitVector() && InVT.is128BitVector())
-    return DAG.getNode(X86ISD::VSEXT == Opc ? ISD::SIGN_EXTEND_VECTOR_INREG
-                                            : ISD::ZERO_EXTEND_VECTOR_INREG,
-                       DL, VT, In);
+  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
 
   // For 256-bit vectors, we only need the lower (128-bit) input half.
   // For 512-bit vectors, we only need the lower input half or quarter.
-  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
-    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
+  if (InVT.getSizeInBits() > 128) {
+    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
+           "Expected VTs to be the same size!");
+    unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
     In = extractSubVector(In, 0, DAG, DL,
-                          std::max(128, (int)VT.getSizeInBits() / Scale));
+                          std::max(128U, VT.getSizeInBits() / Scale));
+    InVT = In.getValueType();
   }
 
-  return DAG.getNode(Opc, DL, VT, In);
+  if (VT.getVectorNumElements() == InVT.getVectorNumElements())
+    return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                       DL, VT, In);
+
+  return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG
+                            : ISD::ZERO_EXTEND_VECTOR_INREG,
+                     DL, VT, In);
 }
 
 /// Returns a vector_shuffle node for an unpackl operation.
@@ -5869,6 +6010,31 @@ static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
   }
 }
 
+// Split the demanded elts of a PACKSS/PACKUS node between its operands.
+static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
+                                APInt &DemandedLHS, APInt &DemandedRHS) {
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumElts = DemandedElts.getBitWidth();
+  int NumInnerElts = NumElts / 2;
+  int NumEltsPerLane = NumElts / NumLanes;
+  int NumInnerEltsPerLane = NumInnerElts / NumLanes;
+
+  DemandedLHS = APInt::getNullValue(NumInnerElts);
+  DemandedRHS = APInt::getNullValue(NumInnerElts);
+
+  // Map DemandedElts to the packed operands.
+  for (int Lane = 0; Lane != NumLanes; ++Lane) {
+    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
+      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
+      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
+      if (DemandedElts[OuterIdx])
+        DemandedLHS.setBit(InnerIdx);
+      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
+        DemandedRHS.setBit(InnerIdx);
+    }
+  }
+}
+
 /// Calculates the shuffle mask corresponding to the target-specific opcode.
 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
 /// operands in \p Ops, and returns true.
@@ -6146,8 +6312,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     Ops.push_back(N->getOperand(0));
     Ops.push_back(N->getOperand(2));
     SDValue MaskNode = N->getOperand(1);
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+                                    RawUndefs)) {
+      DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
       break;
     }
     return false;
@@ -6529,7 +6696,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     return true;
   }
   case ISD::ZERO_EXTEND_VECTOR_INREG:
-  case X86ISD::VZEXT: {
+  case ISD::ZERO_EXTEND: {
     // TODO - add support for VPMOVZX with smaller input vector types.
     SDValue Src = N.getOperand(0);
     MVT SrcVT = Src.getSimpleValueType();
@@ -6586,8 +6753,7 @@ static bool resolveTargetShuffleInputs(SDValue Op,
       return false;
 
   resolveTargetShuffleInputsAndMask(Inputs, Mask);
-  // TODO - Add support for more than 2 inputs.
-  return Inputs.size() <= 2;
+  return true;
 }
 
 /// Returns the scalar element that will make up the ith
@@ -8576,9 +8742,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   // If we are inserting one variable into a vector of non-zero constants, try
   // to avoid loading each constant element as a scalar. Load the constants as a
   // vector and then insert the variable scalar element. If insertion is not
-  // supported, we assume that we will fall back to a shuffle to get the scalar
-  // blended with the constants. Insertion into a zero vector is handled as a
-  // special-case somewhere below here.
+  // supported, fall back to a shuffle to get the scalar blended with the
+  // constants. Insertion into a zero vector is handled as a special-case
+  // somewhere below here.
   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
@@ -8616,7 +8782,21 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     MachineFunction &MF = DAG.getMachineFunction();
     MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
     SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
-    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
+    unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
+    unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
+    if (InsertC < NumEltsInLow128Bits)
+      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
+
+    // There's no good way to insert into the high elements of a >128-bit
+    // vector, so use shuffles to avoid an extract/insert sequence.
+    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
+    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
+    SmallVector<int, 8> ShuffleMask;
+    unsigned NumElts = VT.getVectorNumElements();
+    for (unsigned i = 0; i != NumElts; ++i)
+      ShuffleMask.push_back(i == InsertC ? NumElts : i);
+    SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
+    return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
   }
 
   // Special case for single non-zero, non-undef, element.
@@ -10051,6 +10231,11 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
            "256-bit byte-blends require AVX2 support!");
 
+    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
+    if (SDValue Masked =
+            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+      return Masked;
+
     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
       MVT IntegerType =
           MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
@@ -10058,11 +10243,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
     }
 
-    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
-    if (SDValue Masked =
-            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
-      return Masked;
-
     // Scale the blend by the number of bytes per element.
     int Scale = VT.getScalarSizeInBits() / 8;
 
@@ -10129,7 +10309,8 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
-                                                   SelectionDAG &DAG) {
+                                                   SelectionDAG &DAG,
+                                                   bool ImmBlends = false) {
   // We build up the blend mask while checking whether a blend is a viable way
   // to reduce the shuffle.
   SmallVector<int, 32> BlendMask(Mask.size(), -1);
@@ -10149,6 +10330,12 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
     PermuteMask[i] = Mask[i] % Size;
   }
 
+  // If only immediate blends, then bail if the blend mask can't be widened to
+  // i16.
+  unsigned EltSize = VT.getScalarSizeInBits();
+  if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
+    return SDValue();
+
   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
 }
@@ -10219,6 +10406,92 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
   return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
 }
 
+/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
+/// permuting the elements of the result in place.
+static SDValue lowerVectorShuffleAsByteRotateAndPermute(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
+      (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
+      (VT.is512BitVector() && !Subtarget.hasBWI()))
+    return SDValue();
+
+  // We don't currently support lane crossing permutes.
+  if (is128BitLaneCrossingShuffleMask(VT, Mask))
+    return SDValue();
+
+  int Scale = VT.getScalarSizeInBits() / 8;
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumElts = VT.getVectorNumElements();
+  int NumEltsPerLane = NumElts / NumLanes;
+
+  // Determine range of mask elts.
+  bool Blend1 = true;
+  bool Blend2 = true;
+  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
+  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
+  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
+    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
+      int M = Mask[Lane + Elt];
+      if (M < 0)
+        continue;
+      if (M < NumElts) {
+        Blend1 &= (M == (Lane + Elt));
+        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
+        M = M % NumEltsPerLane;
+        Range1.first = std::min(Range1.first, M);
+        Range1.second = std::max(Range1.second, M);
+      } else {
+        M -= NumElts;
+        Blend2 &= (M == (Lane + Elt));
+        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
+        M = M % NumEltsPerLane;
+        Range2.first = std::min(Range2.first, M);
+        Range2.second = std::max(Range2.second, M);
+      }
+    }
+  }
+
+  // Bail if we don't need both elements.
+  // TODO - it might be worth doing this for unary shuffles if the permute
+  // can be widened.
+  if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
+      !(0 <= Range2.first && Range2.second < NumEltsPerLane))
+    return SDValue();
+
+  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
+    return SDValue();
+
+  // Rotate the 2 ops so we can access both ranges, then permute the result.
+  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
+    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+    SDValue Rotate = DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
+                        DAG.getBitcast(ByteVT, Lo),
+                        DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));
+    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
+    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
+      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
+        int M = Mask[Lane + Elt];
+        if (M < 0)
+          continue;
+        if (M < NumElts)
+          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
+        else
+          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
+      }
+    }
+    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
+  };
+
+  // Check if the ranges are small enough to rotate from either direction.
+  if (Range2.second < Range1.first)
+    return RotateAndPermute(V1, V2, Range1.first, 0);
+  if (Range1.second < Range2.first)
+    return RotateAndPermute(V2, V1, Range2.first, NumElts);
+  return SDValue();
+}
+
 /// Generic routine to decompose a shuffle and blend into independent
 /// blends and permutes.
 ///
@@ -10226,11 +10499,9 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
 /// operations. It will try to pick the best arrangement of shuffles and
 /// blends.
-static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
-                                                          MVT VT, SDValue V1,
-                                                          SDValue V2,
-                                                          ArrayRef<int> Mask,
-                                                          SelectionDAG &DAG) {
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   // Shuffle the input elements into the desired positions in V1 and V2 and
   // blend them together.
   SmallVector<int, 32> V1Mask(Mask.size(), -1);
@@ -10245,18 +10516,26 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
       BlendMask[i] = i + Size;
     }
 
-  // Try to lower with the simpler initial blend/unpack strategies unless one of
-  // the input shuffles would be a no-op. We prefer to shuffle inputs as the
-  // shuffle may be able to fold with a load or other benefit. However, when
-  // we'll have to do 2x as many shuffles in order to achieve this,
-  // blending/unpacking first is a better strategy.
+  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
+  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
+  // the shuffle may be able to fold with a load or other benefit. However, when
+  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
+  // pre-shuffle first is a better strategy.
   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
-    if (SDValue BlendPerm =
-            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+    // Only prefer immediate blends to unpack/rotate.
+    if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+            DL, VT, V1, V2, Mask, DAG, true))
       return BlendPerm;
     if (SDValue UnpackPerm =
             lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
       return UnpackPerm;
+    if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute(
+            DL, VT, V1, V2, Mask, Subtarget, DAG))
+      return RotatePerm;
+    // Unpack/rotate failed - try again with variable blends.
+    if (SDValue BlendPerm =
+            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+      return BlendPerm;
   }
 
   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
@@ -10767,7 +11046,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
     InputV = ShuffleOffset(InputV);
-    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
+    InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG);
     return DAG.getBitcast(VT, InputV);
   }
 
@@ -11305,7 +11584,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
   SDValue BC = peekThroughBitcasts(V);
 
   // Also check the simpler case, where we can directly reuse the scalar.
-  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+  if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
 
@@ -11791,7 +12070,7 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // a permute. That will be faster than the domain cross.
   if (IsBlendSupported)
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
-                                                      Mask, DAG);
+                                                      Mask, Subtarget, DAG);
 
   // We implement this with SHUFPD which is pretty lame because it will likely
   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
@@ -12101,7 +12380,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     // a permute. That will be faster than the domain cross.
     if (IsBlendSupported)
       return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
-                                                        Mask, DAG);
+                                                        Mask, Subtarget, DAG);
 
     // Try to lower by permuting the inputs into an unpack instruction.
     if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
@@ -12816,7 +13095,7 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // We can always bit-blend if we have to so the fallback strategy is to
   // decompose into single-input permutes and blends.
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
-                                                    Mask, DAG);
+                                                    Mask, Subtarget, DAG);
 }
 
 /// Check whether a compaction lowering can be done by dropping even
@@ -12949,6 +13228,10 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
             DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
       return Broadcast;
 
+    if (SDValue V =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+      return V;
+
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
     // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -13092,6 +13375,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
       // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
       if (Subtarget.hasVBMI() && Subtarget.hasVLX())
         return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+
+      // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
+      // PALIGNR will be cheaper than the second PSHUFB+OR.
+      if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute(
+              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+        return V;
     }
 
     return PSHUFB;
@@ -13147,7 +13436,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // Handle multi-input cases by blending single-input shuffles.
   if (NumV2Elements > 0)
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
-                                                      Mask, DAG);
+                                                      Mask, Subtarget, DAG);
 
   // The fallback path for single-input shuffles widens this into two v8i16
   // vectors with unpacks, shuffles those, and then pulls them back together
@@ -13360,6 +13649,7 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                 SDValue V1, SDValue V2,
                                                 ArrayRef<int> Mask,
+                                                const X86Subtarget &Subtarget,
                                                 SelectionDAG &DAG) {
   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
          "shuffles as it could then recurse on itself.");
@@ -13386,7 +13676,7 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
   };
   if (DoBothBroadcast())
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
-                                                      DAG);
+                                                      Subtarget, DAG);
 
   // If the inputs all stem from a single 128-bit lane of each input, then we
   // split them rather than blending because the split will decompose to
@@ -13404,7 +13694,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
 
   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
   // that the decomposed single-input shuffles don't end up here.
-  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                    Subtarget, DAG);
 }
 
 /// Lower a vector shuffle crossing multiple 128-bit lanes as
@@ -14247,10 +14538,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // can fully permute the elements.
   if (Subtarget.hasAVX2())
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
-                                                      Mask, DAG);
+                                                      Mask, Subtarget, DAG);
 
   // Otherwise fall back on generic lowering.
-  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
+                                          Subtarget, DAG);
 }
 
 /// Handle lowering of 4-lane 64-bit integer shuffles.
@@ -14344,7 +14636,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Otherwise fall back on generic blend lowering.
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
-                                                    Mask, DAG);
+                                                    Mask, Subtarget, DAG);
 }
 
 /// Handle lowering of 8-lane 32-bit floating point shuffles.
@@ -14433,17 +14725,18 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // vpunpckhwd instrs than vblend.
   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
     if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
-                                                     Mask, DAG))
+                                                     Mask, Subtarget, DAG))
       return V;
 
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
   if (Subtarget.hasAVX2())
     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
-                                                      Mask, DAG);
+                                                      Mask, Subtarget, DAG);
 
   // Otherwise fall back on generic lowering.
-  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+                                          Subtarget, DAG);
 }
 
 /// Handle lowering of 8-lane 32-bit integer shuffles.
@@ -14472,8 +14765,8 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // vpunpcklwd and vpunpckhwd instrs.
   if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
       !Subtarget.hasAVX512())
-    if (SDValue V =
-            lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
+    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2,
+                                                     Mask, Subtarget, DAG))
       return V;
 
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
@@ -14556,7 +14849,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Otherwise fall back on generic blend lowering.
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
-                                                    Mask, DAG);
+                                                    Mask, Subtarget, DAG);
 }
 
 /// Handle lowering of 16-lane 16-bit integer shuffles.
@@ -14657,7 +14950,8 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return V;
 
   // Otherwise fall back on generic lowering.
-  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
+                                          Subtarget, DAG);
 }
 
 /// Handle lowering of 32-lane 8-bit integer shuffles.
@@ -14747,7 +15041,8 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return V;
 
   // Otherwise fall back on generic lowering.
-  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
+                                          Subtarget, DAG);
 }
 
 /// High-level routine to lower various 256-bit x86 vector shuffles.
@@ -15249,6 +15544,11 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
           lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
     return V;
 
+  // Use dedicated pack instructions for masks that match their pattern.
+  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
+                                             Subtarget))
+    return V;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
                                                 Zeroable, Subtarget, DAG))
@@ -15609,6 +15909,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   SmallVector<int, 16> WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
       canWidenShuffleElements(ZeroableMask, WidenedMask)) {
+    // Shuffle mask widening should not interfere with a broadcast opportunity
+    // by obfuscating the operands with bitcasts.
+    // TODO: Avoid lowering directly from this top-level function: make this
+    // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
+    if (SDValue Broadcast =
+            lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
     MVT NewEltVT = VT.isFloatingPoint()
                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
@@ -15726,7 +16034,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
     // Build a mask by testing the condition against zero.
     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
-                                getZeroVector(CondVT, Subtarget, DAG, dl),
+                                DAG.getConstant(0, dl, CondVT),
                                 ISD::SETNE);
     // Now return a new VSELECT using the mask.
     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
@@ -16716,6 +17024,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
 
 /// Lower SRA_PARTS and friends, which return two i32 values
 /// and take a 2 x i32 value to shift plus a shift amount.
+/// TODO: Can this be moved to general expansion code?
 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   MVT VT = Op.getSimpleValueType();
@@ -16725,8 +17034,8 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
-  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
-  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+  // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
+  // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
   // during isel.
   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
@@ -16736,10 +17045,10 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
 
   SDValue Tmp2, Tmp3;
   if (Op.getOpcode() == ISD::SHL_PARTS) {
-    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+    Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   } else {
-    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+    Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   }
 
@@ -16763,6 +17072,37 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   return DAG.getMergeValues({ Lo, Hi }, dl);
 }
 
+static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
+         "Unexpected funnel shift opcode!");
+  assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+         "Unexpected funnel shift type!");
+
+  SDLoc DL(Op);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+
+  // Expand slow SHLD/SHRD cases if we are not optimizing for size.
+  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+  if (!OptForSize && Subtarget.isSHLDSlow())
+    return SDValue();
+
+  bool IsFSHR = Op.getOpcode() == ISD::FSHR;
+  if (IsFSHR)
+    std::swap(Op0, Op1);
+
+  // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
+  if (VT == MVT::i16)
+    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
+                      DAG.getConstant(15, DL, Amt.getValueType()));
+
+  unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
+  return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
+}
+
 // Try to use a packed vector operation to handle i64 on 32-bit targets when
 // AVX512DQ is enabled.
 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
@@ -17450,8 +17790,19 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
           InVT.getVectorElementType() == MVT::i32) &&
          "Unexpected element type");
 
+  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
+  if (InVT == MVT::v8i8) {
+    if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
+      return SDValue();
+
+    In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
+                     MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
+    // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input.
+    return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In);
+  }
+
   if (Subtarget.hasInt256())
-    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
+    return Op;
 
   // Optimize vectors in AVX mode:
   //
@@ -17534,7 +17885,7 @@ static  SDValue LowerZERO_EXTEND_Mask(SDValue Op,
   }
 
   SDValue One = DAG.getConstant(1, DL, WideVT);
-  SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
+  SDValue Zero = DAG.getConstant(0, DL, WideVT);
 
   SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
 
@@ -17742,10 +18093,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
   }
   // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
   if (Subtarget.hasDQI())
-    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
-                       In, ISD::SETGT);
-  return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
-                      ISD::SETNE);
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
+  return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
 }
 
 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
@@ -17758,20 +18107,22 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
          "Invalid TRUNCATE operation");
 
+  // If called by the legalizer just return.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+    return SDValue();
+
   if (VT.getVectorElementType() == MVT::i1)
     return LowerTruncateVecI1(Op, DAG, Subtarget);
 
   // vpmovqb/w/d, vpmovdb/w, vpmovwb
   if (Subtarget.hasAVX512()) {
-    // word to byte only under BWI
-    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
-      // Make sure we're allowed to promote 512-bits.
-      if (Subtarget.canExtendTo512DQ())
-        return DAG.getNode(ISD::TRUNCATE, DL, VT,
-                           DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
-    } else {
+    // word to byte only under BWI. Otherwise we have to promoted to v16i32
+    // and then truncate that. But we should only do that if we haven't been
+    // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
+    // handled by isel patterns.
+    if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
+        Subtarget.canExtendTo512DQ())
       return Op;
-    }
   }
 
   unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
@@ -17859,6 +18210,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getBitcast(MVT::v8i16, res);
   }
 
+  if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
+    // Use an AND to zero uppper bits for PACKUS.
+    In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
+
+    SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
+                               DAG.getIntPtrConstant(0, DL));
+    SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
+                               DAG.getIntPtrConstant(8, DL));
+    return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
+  }
+
   // Handle truncation of V256 to V128 using shuffles.
   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
 
@@ -18213,8 +18575,8 @@ static bool hasNonFlagsUse(SDValue Op) {
 
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
-                                    SelectionDAG &DAG) const {
+static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
+                        SelectionDAG &DAG, const X86Subtarget &Subtarget) {
   // CF and OF aren't always set the way we want. Determine which
   // of these we need.
   bool NeedCF = false;
@@ -18257,27 +18619,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
   unsigned Opcode = 0;
   unsigned NumOperands = 0;
 
-  // Truncate operations may prevent the merge of the SETCC instruction
-  // and the arithmetic instruction before it. Attempt to truncate the operands
-  // of the arithmetic instruction and use a reduced bit-width instruction.
-  bool NeedTruncation = false;
   SDValue ArithOp = Op;
-  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
-    SDValue Arith = Op->getOperand(0);
-    // Both the trunc and the arithmetic op need to have one user each.
-    if (Arith->hasOneUse())
-      switch (Arith.getOpcode()) {
-        default: break;
-        case ISD::ADD:
-        case ISD::SUB:
-        case ISD::AND:
-        case ISD::OR:
-        case ISD::XOR: {
-          NeedTruncation = true;
-          ArithOp = Arith;
-        }
-      }
-  }
 
   // Sometimes flags can be set either with an AND or with an SRL/SHL
   // instruction. SRL/SHL variant should be preferred for masks longer than this
@@ -18326,27 +18668,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
     Opcode = X86ISD::ADD;
     NumOperands = 2;
     break;
-  case ISD::SHL:
-  case ISD::SRL:
-    // If we have a constant logical shift that's only used in a comparison
-    // against zero turn it into an equivalent AND. This allows turning it into
-    // a TEST instruction later.
-    if (ZeroCheck && Op->hasOneUse() &&
-        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
-      EVT VT = Op.getValueType();
-      unsigned BitWidth = VT.getSizeInBits();
-      unsigned ShAmt = Op->getConstantOperandVal(1);
-      if (ShAmt >= BitWidth) // Avoid undefined shifts.
-        break;
-      APInt Mask = ArithOp.getOpcode() == ISD::SRL
-                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
-                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
-      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
-        break;
-      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
-                       DAG.getConstant(Mask, dl, VT));
-    }
-    break;
 
   case ISD::AND:
     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
@@ -18389,8 +18710,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
         if (LeadingOnes + TrailingZeros == BitWidth) {
           assert(TrailingZeros < VT.getSizeInBits() &&
                  "Shift amount should be less than the type width");
-          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
-          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
+          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, MVT::i8);
           Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
           break;
         }
@@ -18401,8 +18721,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
         if (LeadingZeros + TrailingOnes == BitWidth) {
           assert(LeadingZeros < VT.getSizeInBits() &&
                  "Shift amount should be less than the type width");
-          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
-          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
+          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, MVT::i8);
           Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
           break;
         }
@@ -18447,36 +18766,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
     break;
   }
 
-  // If we found that truncation is beneficial, perform the truncation and
-  // update 'Op'.
-  if (NeedTruncation) {
-    EVT VT = Op.getValueType();
-    SDValue WideVal = Op->getOperand(0);
-    EVT WideVT = WideVal.getValueType();
-    unsigned ConvertedOp = 0;
-    // Use a target machine opcode to prevent further DAGCombine
-    // optimizations that may separate the arithmetic operations
-    // from the setcc node.
-    switch (WideVal.getOpcode()) {
-      default: break;
-      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
-      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
-      case ISD::AND: ConvertedOp = X86ISD::AND; break;
-      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
-      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
-    }
-
-    if (ConvertedOp) {
-      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
-        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
-        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
-        SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-        Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
-      }
-    }
-  }
-
   if (Opcode == 0) {
     // Emit a CMP with 0, which is the TEST pattern.
     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
@@ -18495,10 +18784,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                    const SDLoc &dl, SelectionDAG &DAG) const {
   if (isNullConstant(Op1))
-    return EmitTest(Op0, X86CC, dl, DAG);
-
-  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
-         "Unexpected comparison operation for MVT::i1 operands");
+    return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
 
   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
@@ -18521,6 +18807,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
     return SDValue(Sub.getNode(), 1);
   }
+  assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!");
   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
 }
 
@@ -18830,34 +19117,32 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
 }
 
-/// Try to turn a VSETULT into a VSETULE by modifying its second
-/// operand \p Op1.  If non-trivial (for example because it's not constant)
-/// return an empty value.
-static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
-                                      SelectionDAG &DAG) {
-  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
+/// Given a simple buildvector constant, return a new vector constant with each
+/// element decremented. If decrementing would result in underflow or this
+/// is not a simple vector constant, return an empty value.
+static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
+  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
   if (!BV)
     return SDValue();
 
-  MVT VT = Op1.getSimpleValueType();
-  MVT EVT = VT.getVectorElementType();
-  unsigned n = VT.getVectorNumElements();
-  SmallVector<SDValue, 8> ULTOp1;
-
-  for (unsigned i = 0; i < n; ++i) {
-    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
-    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
+  MVT VT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 8> NewVecC;
+  SDLoc DL(V);
+  for (unsigned i = 0; i < NumElts; ++i) {
+    auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
       return SDValue();
 
     // Avoid underflow.
-    APInt Val = Elt->getAPIntValue();
-    if (Val == 0)
+    if (Elt->getAPIntValue().isNullValue())
       return SDValue();
 
-    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
+    NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
   }
 
-  return DAG.getBuildVector(VT, dl, ULTOp1);
+  return DAG.getBuildVector(VT, DL, NewVecC);
 }
 
 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
@@ -18886,7 +19171,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
     // Only do this pre-AVX since vpcmp* is no longer destructive.
     if (Subtarget.hasAVX())
       return SDValue();
-    SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
+    SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
     if (!ULEOp1)
       return SDValue();
     Op1 = ULEOp1;
@@ -18900,9 +19185,9 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
     break;
   }
 
-  SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
+  SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
   return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
-                     getZeroVector(VT, Subtarget, DAG, dl));
+                     DAG.getConstant(0, dl, VT));
 }
 
 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
@@ -19065,13 +19350,26 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
 
-  // Special case: Use min/max operations for unsigned compares. We only want
-  // to do this for unsigned compares if we need to flip signs or if it allows
-  // use to avoid an invert.
+  // Special case: Use min/max operations for unsigned compares.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (ISD::isUnsignedIntSetCC(Cond) &&
       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
       TLI.isOperationLegal(ISD::UMIN, VT)) {
+    // If we have a constant operand, increment/decrement it and change the
+    // condition to avoid an invert.
+    // TODO: This could be extended to handle a non-splat constant by checking
+    // that each element of the constant is not the max/null value.
+    APInt C;
+    if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) {
+      // X > C --> X >= (C+1) --> X == umax(X, C+1)
+      Op1 = DAG.getConstant(C + 1, dl, VT);
+      Cond = ISD::SETUGE;
+    }
+    if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) {
+      // X < C --> X <= (C-1) --> X == umin(X, C-1)
+      Op1 = DAG.getConstant(C - 1, dl, VT);
+      Cond = ISD::SETULE;
+    }
     bool Invert = false;
     unsigned Opc;
     switch (Cond) {
@@ -19381,7 +19679,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       // of 3 logic instructions for size savings and potentially speed.
       // Unfortunately, there is no scalar form of VBLENDV.
 
-      // If either operand is a constant, don't try this. We can expect to
+      // If either operand is a +0.0 constant, don't try this. We can expect to
       // optimize away at least one of the logic instructions later in that
       // case, so that sequence would be faster than a variable blend.
 
@@ -19389,13 +19687,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       // uses XMM0 as the selection register. That may need just as many
       // instructions as the AND/ANDN/OR sequence due to register moves, so
       // don't bother.
-
-      if (Subtarget.hasAVX() &&
-          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
-
+      if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
+          !isNullFPConstant(Op2)) {
         // Convert to vectors, do a VSELECT, and convert back to scalar.
         // All of the conversions should be optimized away.
-
         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
@@ -19489,22 +19784,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       // (select (x == 0), 0, -1) -> neg & sbb
       if (isNullConstant(Y) &&
           (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
-        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
-        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
-        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
-                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
-                                  SDValue(Neg.getNode(), 1));
-        return Res;
+        SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
+        SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+        Zero = DAG.getConstant(0, DL, Op.getValueType());
+        return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
       }
 
       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
 
+      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+      SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
       SDValue Res =   // Res = 0 or -1.
-        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
-                    DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
+        DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
 
       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
         Res = DAG.getNOT(DL, Res, Res.getValueType());
@@ -19632,7 +19926,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   if (AddTest) {
     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
-    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
+    Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
+                   X86::COND_NE, DL, DAG);
   }
 
   // a <  b ? -1 :  0 -> RES = ~setcc_carry
@@ -19723,8 +20018,8 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
     V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
   } else {
-    SDValue NegOne = getOnesVector(WideVT, DAG, dl);
-    SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
+    SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
+    SDValue Zero = DAG.getConstant(0, dl, WideVT);
     V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
   }
 
@@ -19764,7 +20059,6 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
   SDValue In = Op->getOperand(0);
   MVT VT = Op->getSimpleValueType(0);
   MVT InVT = In.getSimpleValueType();
-  assert(VT.getSizeInBits() == InVT.getSizeInBits());
 
   MVT SVT = VT.getVectorElementType();
   MVT InSVT = InVT.getVectorElementType();
@@ -19785,11 +20079,12 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
 
   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
   // For 512-bit vectors, we need 128-bits or 256-bits.
-  if (VT.getSizeInBits() > 128) {
+  if (InVT.getSizeInBits() > 128) {
     // Input needs to be at least the same number of elements as output, and
     // at least 128-bits.
     int InSize = InSVT.getSizeInBits() * NumElts;
     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
+    InVT = In.getSimpleValueType();
   }
 
   // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
@@ -19797,8 +20092,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
   // need to be handled here for 256/512-bit results.
   if (Subtarget.hasInt256()) {
     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
+
+    if (InVT.getVectorNumElements() != NumElts)
+      return DAG.getNode(Op.getOpcode(), dl, VT, In);
+
+    // FIXME: Apparently we create inreg operations that could be regular
+    // extends.
     unsigned ExtOpc =
-        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT;
+        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
+                                             : ISD::ZERO_EXTEND;
     return DAG.getNode(ExtOpc, dl, VT, In);
   }
 
@@ -19808,7 +20110,6 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
     int HalfNumElts = NumElts / 2;
     MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
 
-    InVT = In.getSimpleValueType();
     unsigned NumSrcElts = InVT.getVectorNumElements();
     SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
     for (int i = 0; i != HalfNumElts; ++i)
@@ -19822,39 +20123,46 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
 
   // We should only get here for sign extend.
   assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
+  assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
 
   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
   SDValue Curr = In;
-  MVT CurrVT = InVT;
+  SDValue SignExt = Curr;
 
   // As SRAI is only available on i16/i32 types, we expand only up to i32
   // and handle i64 separately.
-  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
-    Curr = getUnpackl(DAG, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
-    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
-    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
-    Curr = DAG.getBitcast(CurrVT, Curr);
-  }
+  if (InVT != MVT::v4i32) {
+    MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
 
-  SDValue SignExt = Curr;
-  if (CurrVT != InVT) {
-    unsigned SignExtShift =
-        CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
-    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+    unsigned DestWidth = DestVT.getScalarSizeInBits();
+    unsigned Scale = DestWidth / InSVT.getSizeInBits();
+
+    unsigned InNumElts = InVT.getVectorNumElements();
+    unsigned DestElts = DestVT.getVectorNumElements();
+
+    // Build a shuffle mask that takes each input element and places it in the
+    // MSBs of the new element size.
+    SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
+    for (unsigned i = 0; i != DestElts; ++i)
+      Mask[i * Scale + (Scale - 1)] = i;
+
+    Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
+    Curr = DAG.getBitcast(DestVT, Curr);
+
+    unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
+    SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
                           DAG.getConstant(SignExtShift, dl, MVT::i8));
   }
 
-  if (CurrVT == VT)
-    return SignExt;
-
-  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
-    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
-                               DAG.getConstant(31, dl, MVT::i8));
-    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
-    return DAG.getBitcast(VT, Ext);
+  if (VT == MVT::v2i64) {
+    assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
+    SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
+    SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
+    SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
+    SignExt = DAG.getBitcast(VT, SignExt);
   }
 
-  return SDValue();
+  return SignExt;
 }
 
 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
@@ -19879,8 +20187,18 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
           InVT.getVectorElementType() == MVT::i32) &&
          "Unexpected element type");
 
+  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
+  if (InVT == MVT::v8i8) {
+    if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
+      return SDValue();
+
+    In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
+                     MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
+    return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
+  }
+
   if (Subtarget.hasInt256())
-    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+    return Op;
 
   // Optimize vectors in AVX mode
   // Sign extend  v8i16 to v8i32 and
@@ -19913,7 +20231,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
   SDLoc dl(St);
   SDValue StoredVal = St->getValue();
 
-  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
   if (StoredVal.getValueType().isVector() &&
       StoredVal.getValueType().getVectorElementType() == MVT::i1) {
     assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
@@ -19935,14 +20253,23 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
   if (St->isTruncatingStore())
     return SDValue();
 
-  assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT");
+  MVT StoreVT = StoredVal.getSimpleValueType();
+  assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
+         "Unexpected VT");
+  if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
+        TargetLowering::TypeWidenVector)
+    return SDValue();
 
-  // Widen the vector, cast to a v2x64 type, extract the single 64-bit
-  // element and store it.
-  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal,
-                          DAG.getUNDEF(MVT::v2f32));
-  StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal);
-  StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal,
+  // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
+  // and store it.
+  MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
+                                StoreVT.getVectorNumElements() * 2);
+  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
+                          DAG.getUNDEF(StoreVT));
+  MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
+  MVT CastVT = MVT::getVectorVT(StVT, 2);
+  StoredVal = DAG.getBitcast(CastVT, StoredVal);
+  StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
                           DAG.getIntPtrConstant(0, dl));
 
   return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
@@ -19952,7 +20279,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
 
 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
 // may emit an illegal shuffle but the expansion is still better than scalar
-// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
+// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
 // we'll emit a shuffle and a arithmetic shift.
 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
@@ -19960,16 +20287,16 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
   MVT RegVT = Op.getSimpleValueType();
-  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
+  assert(RegVT.isVector() && "We only custom lower vector loads.");
   assert(RegVT.isInteger() &&
-         "We only custom lower integer vector sext loads.");
+         "We only custom lower integer vector loads.");
 
   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   SDLoc dl(Ld);
   EVT MemVT = Ld->getMemoryVT();
 
   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
-  if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
+  if (RegVT.getVectorElementType() == MVT::i1) {
     assert(EVT(RegVT) == MemVT && "Expected non-extending load");
     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
@@ -20072,26 +20399,26 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
          "Can only lower sext loads with a single scalar load!");
 
-  unsigned loadRegZize = RegSz;
+  unsigned loadRegSize = RegSz;
   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
-    loadRegZize = 128;
+    loadRegSize = 128;
 
   // If we don't have BWI we won't be able to create the shuffle needed for
   // v8i8->v8i64.
   if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
       MemVT == MVT::v8i8)
-    loadRegZize = 128;
+    loadRegSize = 128;
 
   // Represent our vector as a sequence of elements which are the
   // largest scalar that we can load.
   EVT LoadUnitVecVT = EVT::getVectorVT(
-      *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
+      *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
 
   // Represent the data using the same element type that is stored in
   // memory. In practice, we ''widen'' MemVT.
   EVT WideVecVT =
       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                       loadRegZize / MemVT.getScalarSizeInBits());
+                       loadRegSize / MemVT.getScalarSizeInBits());
 
   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
          "Invalid vector type");
@@ -20137,25 +20464,13 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
   unsigned SizeRatio = RegSz / MemSz;
 
   if (Ext == ISD::SEXTLOAD) {
-    // If we have SSE4.1, we can directly emit a VSEXT node.
-    if (Subtarget.hasSSE41()) {
-      SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
-      return DAG.getMergeValues({Sext, TF}, dl);
-    }
-
-    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
-    // lanes.
-    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
-           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
-
-    SDValue Shuff = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, RegVT,
-                                SlicedVec);
-    return DAG.getMergeValues({Shuff, TF}, dl);
+    SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG);
+    return DAG.getMergeValues({Sext, TF}, dl);
   }
 
   if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
       MemVT == MVT::v8i8) {
-    SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
+    SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG);
     return DAG.getMergeValues({Sext, TF}, dl);
   }
 
@@ -20459,7 +20774,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   if (addTest) {
     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
-    Cond = EmitTest(Cond, X86Cond, dl, DAG);
+    Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
+                   X86Cond, dl, DAG);
   }
   Cond = ConvertCmpIfNecessary(Cond, DAG);
   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -20967,7 +21283,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
   SDLoc dl(Op);
 
   assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
-  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
+  SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
+                              DAG.getBitcast(MVT::v8i1, Mask),
+                              DAG.getIntPtrConstant(0, dl));
   if (Op.getOpcode() == X86ISD::FSETCCM ||
       Op.getOpcode() == X86ISD::FSETCCM_RND ||
       Op.getOpcode() == X86ISD::VFPCLASSS)
@@ -21595,10 +21913,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     case ADX: {
       SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
       SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
-      SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
-                                  DAG.getConstant(-1, dl, MVT::i8));
-      SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
-                                Op.getOperand(3), GenCF.getValue(1));
+
+      SDValue Res;
+      // If the carry in is zero, then we should just use ADD/SUB instead of
+      // ADC/SBB.
+      if (isNullConstant(Op.getOperand(1))) {
+        Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
+                          Op.getOperand(3));
+      } else {
+        SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
+                                    DAG.getConstant(-1, dl, MVT::i8));
+        Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
+                          Op.getOperand(3), GenCF.getValue(1));
+      }
       SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
       SDValue Results[] = { SetCC, Res };
       return DAG.getMergeValues(Results, dl);
@@ -22859,11 +23186,10 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
   // we just take the hi result (by masking the lo result to zero before the
   // add).
   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
-  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
+  SDValue Zero = DAG.getConstant(0, DL, CurrVT);
 
-  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
-  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+  SDValue Lo = Op0;
   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
   SDValue HiZ;
   if (CurrVT.is512BitVector()) {
@@ -23078,6 +23404,28 @@ static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
   return split256IntArith(Op, DAG);
 }
 
+static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  if (VT.getScalarType() == MVT::i1) {
+    SDLoc dl(Op);
+    switch (Op.getOpcode()) {
+    default: llvm_unreachable("Expected saturated arithmetic opcode");
+    case ISD::UADDSAT:
+    case ISD::SADDSAT:
+      return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1));
+    case ISD::USUBSAT:
+    case ISD::SSUBSAT:
+      return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
+                         DAG.getNOT(dl, Op.getOperand(1), VT));
+    }
+  }
+
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return split256IntArith(Op, DAG);
+}
+
 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
@@ -23155,53 +23503,49 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
   // vector pairs, multiply and truncate.
   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
-    if (Subtarget.hasInt256()) {
-      // For 512-bit vectors, split into 256-bit vectors to allow the
-      // sign-extension to occur.
-      if (VT == MVT::v64i8)
-        return split512IntArith(Op, DAG);
-
-      // For 256-bit vectors, split into 128-bit vectors to allow the
-      // sign-extension to occur. We don't need this on AVX512BW as we can
-      // safely sign-extend to v32i16.
-      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
-        return split256IntArith(Op, DAG);
+    unsigned NumElts = VT.getVectorNumElements();
 
+    if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
+        (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
       return DAG.getNode(
           ISD::TRUNCATE, dl, VT,
           DAG.getNode(ISD::MUL, dl, ExVT,
-                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
-                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
+                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
+                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
     }
 
-    assert(VT == MVT::v16i8 &&
-           "Pre-AVX2 support only supports v16i8 multiplication");
-    MVT ExVT = MVT::v8i16;
+    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
-    // Extract the lo parts and sign extend to i16
-    // We're going to mask off the low byte of each result element of the
-    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
-    // element.
-    const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
-                              4, -1, 5, -1, 6, -1, 7, -1};
-    SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
-    SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
-    ALo = DAG.getBitcast(ExVT, ALo);
-    BLo = DAG.getBitcast(ExVT, BLo);
-
-    // Extract the hi parts and sign extend to i16
+    // Extract the lo/hi parts to any extend to i16.
     // We're going to mask off the low byte of each result element of the
     // pmullw, so it doesn't matter what's in the high byte of each 16-bit
     // element.
-    const int HiShufMask[] = {8,  -1, 9,  -1, 10, -1, 11, -1,
-                              12, -1, 13, -1, 14, -1, 15, -1};
-    SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
-    SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
-    AHi = DAG.getBitcast(ExVT, AHi);
-    BHi = DAG.getBitcast(ExVT, BHi);
-
-    // Multiply, mask the lower 8bits of the lo/hi results and pack
+    SDValue Undef = DAG.getUNDEF(VT);
+    SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
+    SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
+
+    SDValue BLo, BHi;
+    if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+      // If the LHS is a constant, manually unpackl/unpackh.
+      SmallVector<SDValue, 16> LoOps, HiOps;
+      for (unsigned i = 0; i != NumElts; i += 16) {
+        for (unsigned j = 0; j != 8; ++j) {
+          LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
+                                               MVT::i16));
+          HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
+                                               MVT::i16));
+        }
+      }
+
+      BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+      BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+    } else {
+      BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
+      BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
+    }
+
+    // Multiply, mask the lower 8bits of the lo/hi results and pack.
     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
@@ -23262,7 +23606,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
   bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
   bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
 
-  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
 
   // Only multiply lo/hi halves that aren't known to be zero.
   SDValue AloBlo = Zero;
@@ -23352,11 +23696,11 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
     // If we have a signed multiply but no PMULDQ fix up the result of an
     // unsigned multiply.
     if (IsSigned && !Subtarget.hasSSE41()) {
-      SDValue ShAmt = DAG.getConstant(31, dl, VT);
+      SDValue Zero = DAG.getConstant(0, dl, VT);
       SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
-                               DAG.getNode(ISD::SRA, dl, VT, A, ShAmt), B);
+                               DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
       SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
-                               DAG.getNode(ISD::SRA, dl, VT, B, ShAmt), A);
+                               DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
 
       SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
@@ -23375,113 +23719,138 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
 
   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
   // and then ashr/lshr the upper bits down to the lower bits before multiply.
-  unsigned ExShift = IsSigned ? X86ISD::VSRAI : X86ISD::VSRLI;
   unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
 
-  // For 512-bit vectors, split into 256-bit vectors to allow the
+  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
+      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
+    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
+    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
+    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
+    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
+    Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+  }
+
+  // For signed 512-bit vectors, split into 256-bit vectors to allow the
   // sign-extension to occur.
-  if (VT == MVT::v64i8)
+  if (VT == MVT::v64i8 && IsSigned)
     return split512IntArith(Op, DAG);
 
-  // AVX2 implementations - extend xmm subvectors to ymm.
-  if (Subtarget.hasInt256()) {
+  // Signed AVX2 implementation - extend xmm subvectors to ymm.
+  if (VT == MVT::v32i8 && IsSigned) {
     SDValue Lo = DAG.getIntPtrConstant(0, dl);
     SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl);
 
-    if (VT == MVT::v32i8) {
-      if (Subtarget.canExtendTo512BW()) {
-        MVT ExVT = MVT::v32i16;
-        SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
-        SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
-        SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
-        Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
-        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+    MVT ExVT = MVT::v16i16;
+    SDValue ALo = extract128BitVector(A, 0, DAG, dl);
+    SDValue BLo = extract128BitVector(B, 0, DAG, dl);
+    SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
+    SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
+    ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
+    BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
+    AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
+    BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
+    Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+    Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+    Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
+    Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
+
+    // Bitcast back to VT and then pack all the even elements from Lo and Hi.
+    // Shuffle lowering should turn this into PACKUS+PERMQ
+    Lo = DAG.getBitcast(VT, Lo);
+    Hi = DAG.getBitcast(VT, Hi);
+    return DAG.getVectorShuffle(VT, dl, Lo, Hi,
+                                { 0,  2,  4,  6,  8, 10, 12, 14,
+                                 16, 18, 20, 22, 24, 26, 28, 30,
+                                 32, 34, 36, 38, 40, 42, 44, 46,
+                                 48, 50, 52, 54, 56, 58, 60, 62});
+  }
+
+  // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
+  // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
+  // shift the results and pack the half lane results back together.
+
+  MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
+  static const int PSHUFDMask[] = { 8,  9, 10, 11, 12, 13, 14, 15,
+                                   -1, -1, -1, -1, -1, -1, -1, -1};
+
+  // Extract the lo parts and zero/sign extend to i16.
+  // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
+  // shifts to sign extend. Using unpack for unsigned only requires an xor to
+  // create zeros and a copy due to tied registers contraints pre-avx. But using
+  // zero_extend_vector_inreg would require an additional pshufd for the high
+  // part.
+
+  SDValue ALo, AHi;
+  if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
+    ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
+
+    AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
+    AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
+  } else if (IsSigned) {
+    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
+    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
+
+    ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
+    AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
+  } else {
+    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
+                                          DAG.getConstant(0, dl, VT)));
+    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
+                                          DAG.getConstant(0, dl, VT)));
+  }
+
+  SDValue BLo, BHi;
+  if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+    // If the LHS is a constant, manually unpackl/unpackh and extend.
+    SmallVector<SDValue, 16> LoOps, HiOps;
+    for (unsigned i = 0; i != NumElts; i += 16) {
+      for (unsigned j = 0; j != 8; ++j) {
+        SDValue LoOp = B.getOperand(i + j);
+        SDValue HiOp = B.getOperand(i + j + 8);
+
+        if (IsSigned) {
+          LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
+          HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
+        } else {
+          LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
+          HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+        }
+
+        LoOps.push_back(LoOp);
+        HiOps.push_back(HiOp);
       }
-      MVT ExVT = MVT::v16i16;
-      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
-      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
-      SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
-      SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
-      ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
-      BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
-      AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
-      BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
-      Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
-      Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
-      Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
-      Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
-
-      SDValue Res = DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
-      // The ymm variant of PACKUS treats the 128-bit lanes separately, so we
-      // need to permute the final result into place.
-      Res = DAG.getBitcast(MVT::v4i64, Res);
-      Res = DAG.getVectorShuffle(MVT::v4i64, dl, Res, Res, { 0, 2, 1, 3 });
-      return DAG.getBitcast(VT, Res);
     }
 
-    assert(VT == MVT::v16i8 && "Unexpected VT");
+    BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+    BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+  } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
+    BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
 
-    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
-    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
-    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
-    Mul =
-        getTargetVShiftByConstNode(X86ISD::VSRLI, dl, MVT::v16i16, Mul, 8, DAG);
-    // If we have BWI we can use truncate instruction.
-    if (Subtarget.hasBWI())
-      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
-    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
-    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
-    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
-  }
+    BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
+    BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
+  } else if (IsSigned) {
+    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
+    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
 
-  assert(VT == MVT::v16i8 &&
-         "Pre-AVX2 support only supports v16i8 multiplication");
-  MVT ExVT = MVT::v8i16;
-  unsigned ExSSE41 = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
-                              : ISD::ZERO_EXTEND_VECTOR_INREG;
-
-  // Extract the lo parts and zero/sign extend to i16.
-  SDValue ALo, BLo;
-  if (Subtarget.hasSSE41()) {
-    ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
-    BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
-  } else {
-    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-                            -1, 4, -1, 5, -1, 6, -1, 7};
-    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
-    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-    ALo = DAG.getBitcast(ExVT, ALo);
-    BLo = DAG.getBitcast(ExVT, BLo);
-    ALo = getTargetVShiftByConstNode(ExShift, dl, ExVT, ALo, 8, DAG);
-    BLo = getTargetVShiftByConstNode(ExShift, dl, ExVT, BLo, 8, DAG);
-  }
-
-  // Extract the hi parts and zero/sign extend to i16.
-  SDValue AHi, BHi;
-  if (Subtarget.hasSSE41()) {
-    const int ShufMask[] = { 8,  9, 10, 11, 12, 13, 14, 15,
-                            -1, -1, -1, -1, -1, -1, -1, -1};
-    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
-    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-    AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
-    BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+    BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
+    BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
   } else {
-    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
-                            -1, 12, -1, 13, -1, 14, -1, 15};
-    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
-    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-    AHi = DAG.getBitcast(ExVT, AHi);
-    BHi = DAG.getBitcast(ExVT, BHi);
-    AHi = getTargetVShiftByConstNode(ExShift, dl, ExVT, AHi, 8, DAG);
-    BHi = getTargetVShiftByConstNode(ExShift, dl, ExVT, BHi, 8, DAG);
+    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
+                                          DAG.getConstant(0, dl, VT)));
+    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
+                                          DAG.getConstant(0, dl, VT)));
   }
 
   // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
-  // pack back to v16i8.
+  // pack back to vXi8.
   SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
   SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
   RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
   RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
+
+  // Bitcast back to VT and then pack all the even elements from Lo and Hi.
   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
 }
 
@@ -23605,8 +23974,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
              "Unsupported PCMPGT op");
-      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
-                         getZeroVector(VT, Subtarget, DAG, dl), R);
+      return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
     }
 
     if (ShiftAmt >= 32) {
@@ -23621,7 +23989,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                   {9, 1, 11, 3, 13, 5, 15, 7});
     } else {
-      // SRA upper i32, SHL whole i64 and select lower i32.
+      // SRA upper i32, SRL whole i64 and select lower i32.
       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                  ShiftAmt, DAG);
       SDValue Lower =
@@ -23662,7 +24030,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
 
     // ashr(R, 7)  === cmp_slt(R, 0)
     if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
-      SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+      SDValue Zeros = DAG.getConstant(0, dl, VT);
       if (VT.is512BitVector()) {
         assert(VT == MVT::v64i8 && "Unexpected element type!");
         SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
@@ -23708,60 +24076,52 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
-// Determine if V is a splat value, and return the scalar.
-static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
-                            SelectionDAG &DAG, const X86Subtarget &Subtarget,
-                            unsigned Opcode) {
-   V = peekThroughEXTRACT_SUBVECTORs(V);
+// If V is a splat value, return the source vector and splat index;
+static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) {
+  V = peekThroughEXTRACT_SUBVECTORs(V);
 
-  // Check if this is a splat build_vector node.
-  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
-    SDValue SplatAmt = BV->getSplatValue();
-    if (SplatAmt && SplatAmt.isUndef())
-      return SDValue();
-    return SplatAmt;
-  }
-
-  // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
-  if (V.getOpcode() == ISD::SUB &&
-      !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
-    SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
-    SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));
-
-    // Ensure that the corresponding splat BV element is not UNDEF.
-    BitVector UndefElts;
-    BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
-    ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
-    if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
-      unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
-      if (!UndefElts[SplatIdx])
-        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                           VT.getVectorElementType(), V,
-                           DAG.getIntPtrConstant(SplatIdx, dl));
+  EVT VT = V.getValueType();
+  unsigned Opcode = V.getOpcode();
+  switch (Opcode) {
+  default: {
+    APInt UndefElts;
+    APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+    if (DAG.isSplatValue(V, DemandedElts, UndefElts)) {
+      // Handle case where all demanded elements are UNDEF.
+      if (DemandedElts.isSubsetOf(UndefElts)) {
+        SplatIdx = 0;
+        return DAG.getUNDEF(VT);
+      }
+      SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
+      return V;
     }
+    break;
   }
-
-  // Check if this is a shuffle node doing a splat.
-  ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
-  if (!SVN || !SVN->isSplat())
-    return SDValue();
-
-  unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
-  SDValue InVec = V.getOperand(0);
-  if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
-    assert((SplatIdx < VT.getVectorNumElements()) &&
-           "Unexpected shuffle index found!");
-    return InVec.getOperand(SplatIdx);
-  } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
-      if (C->getZExtValue() == SplatIdx)
-        return InVec.getOperand(1);
+  case ISD::VECTOR_SHUFFLE: {
+    // Check if this is a shuffle node doing a splat.
+    // TODO - remove this and rely purely on SelectionDAG::isSplatValue,
+    // getTargetVShiftNode currently struggles without the splat source.
+    auto *SVN = cast<ShuffleVectorSDNode>(V);
+    if (!SVN->isSplat())
+      break;
+    int Idx = SVN->getSplatIndex();
+    int NumElts = V.getValueType().getVectorNumElements();
+    SplatIdx = Idx % NumElts;
+    return V.getOperand(Idx / NumElts);
   }
+  }
+
+  return SDValue();
+}
 
-  // Avoid introducing an extract element from a shuffle.
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                     VT.getVectorElementType(), InVec,
-                     DAG.getIntPtrConstant(SplatIdx, dl));
+static SDValue GetSplatValue(SDValue V, const SDLoc &dl,
+                             SelectionDAG &DAG) {
+  int SplatIdx;
+  if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG))
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                       SrcVector.getValueType().getScalarType(), SrcVector,
+                       DAG.getIntPtrConstant(SplatIdx, dl));
+  return SDValue();
 }
 
 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
@@ -23774,9 +24134,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
 
-  Amt = peekThroughEXTRACT_SUBVECTORs(Amt);
-
-  if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
+  if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) {
     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
       MVT EltVT = VT.getVectorElementType();
       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
@@ -23901,7 +24259,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
 
   // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
   if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
-    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+    SDValue Z = DAG.getConstant(0, dl, VT);
     SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
     SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
     Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
@@ -24095,7 +24453,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       // just zero-extending, but for SSE just duplicating the top 16-bits is
       // cheaper and has the same effect for out of range values.
       if (Subtarget.hasAVX()) {
-        SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+        SDValue Z = DAG.getConstant(0, dl, VT);
         Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
         Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
         Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
@@ -24230,7 +24588,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       // On pre-SSE41 targets we test for the sign bit by comparing to
       // zero - a negative value will set all bits of the lanes to true
       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
-      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
+      SDValue Z = DAG.getConstant(0, dl, SelVT);
       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
       return DAG.getSelect(dl, SelVT, C, V0, V1);
     };
@@ -24267,10 +24625,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       // For SRA we need to unpack each byte to the higher byte of a i16 vector
       // so we can correctly sign extend. We don't care what happens to the
       // lower byte.
-      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
-      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
-      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
-      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
+      SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
+      SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
+      SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
+      SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
       ALo = DAG.getBitcast(ExtVT, ALo);
       AHi = DAG.getBitcast(ExtVT, AHi);
       RLo = DAG.getBitcast(ExtVT, RLo);
@@ -24312,11 +24670,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
 
   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
     MVT ExtVT = MVT::v8i32;
-    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
-    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
-    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
-    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
-    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
+    SDValue Z = DAG.getConstant(0, dl, VT);
+    SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
+    SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
+    SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
+    SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
     ALo = DAG.getBitcast(ExtVT, ALo);
     AHi = DAG.getBitcast(ExtVT, AHi);
     RLo = DAG.getBitcast(ExtVT, RLo);
@@ -24408,20 +24766,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
   SDValue Amt = Op.getOperand(1);
   unsigned Opcode = Op.getOpcode();
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  int NumElts = VT.getVectorNumElements();
+
+  // Check for constant splat rotation amount.
+  APInt UndefElts;
+  SmallVector<APInt, 32> EltBits;
+  int CstSplatIndex = -1;
+  if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
+    for (int i = 0; i != NumElts; ++i)
+      if (!UndefElts[i]) {
+        if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
+          CstSplatIndex = i;
+          continue;
+        }
+        CstSplatIndex = -1;
+        break;
+      }
 
+  // AVX512 implicitly uses modulo rotation amounts.
   if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
     // Attempt to rotate by immediate.
-    APInt UndefElts;
-    SmallVector<APInt, 16> EltBits;
-    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
-      if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
-            return EltBits[0] == V;
-          })) {
-        unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
-        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
-        return DAG.getNode(Op, DL, VT, R,
-                           DAG.getConstant(RotateAmt, DL, MVT::i8));
-      }
+    if (0 <= CstSplatIndex) {
+      unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+      uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
+      return DAG.getNode(Op, DL, VT, R,
+                         DAG.getConstant(RotateAmt, DL, MVT::i8));
     }
 
     // Else, fall-back on VPROLV/VPRORV.
@@ -24432,19 +24801,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
 
   // XOP has 128-bit vector variable + immediate rotates.
   // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
+  // XOP implicitly uses modulo rotation amounts.
   if (Subtarget.hasXOP()) {
     if (VT.is256BitVector())
       return split256IntArith(Op, DAG);
     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
 
     // Attempt to rotate by immediate.
-    if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
-      if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
-        uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
-        assert(RotateAmt < EltSizeInBits && "Rotation out of range");
-        return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
-                           DAG.getConstant(RotateAmt, DL, MVT::i8));
-      }
+    if (0 <= CstSplatIndex) {
+      uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
+      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
+                         DAG.getConstant(RotateAmt, DL, MVT::i8));
     }
 
     // Use general rotate by variable (per-element).
@@ -24461,44 +24828,19 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
          "Only vXi32/vXi16/vXi8 vector rotates supported");
 
   // Rotate by an uniform constant - expand back to shifts.
-  // TODO - legalizers should be able to handle this.
-  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
-    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
-      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
-      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
-      if (RotateAmt == 0)
-        return R;
-
-      SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT);
-      SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
-      SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
-      return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
-    }
-  }
+  if (0 <= CstSplatIndex)
+    return SDValue();
 
-  // Rotate by splat - expand back to shifts.
-  // TODO - legalizers should be able to handle this.
-  if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) &&
-      IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
-    SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
-    AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
-    SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
-    SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
-    return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
-  }
+  bool IsSplatAmt = DAG.isSplatValue(Amt);
 
   // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
   // the amount bit.
-  if (EltSizeInBits == 8) {
-    if (Subtarget.hasBWI()) {
-      SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
-      AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
-      SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
-      SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
-      return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
-    }
+  if (EltSizeInBits == 8 && !IsSplatAmt) {
+    if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
+      return SDValue();
 
-    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+    // We don't need ModuloAmt here as we just peek at individual bits.
+    MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
       if (Subtarget.hasSSE41()) {
@@ -24512,7 +24854,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
       // On pre-SSE41 targets we test for the sign bit by comparing to
       // zero - a negative value will set all bits of the lanes to true
       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
-      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL);
+      SDValue Z = DAG.getConstant(0, DL, SelVT);
       SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
       return DAG.getSelect(DL, SelVT, C, V0, V1);
     };
@@ -24553,14 +24895,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     return SignBitSelect(VT, Amt, M, R);
   }
 
+  // ISD::ROT* uses modulo rotate amounts.
+  Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
+                    DAG.getConstant(EltSizeInBits - 1, DL, VT));
+
   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
   bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
                         SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
 
-  // Best to fallback for all supported variable shifts.
-  // AVX2 - best to fallback for non-constants as well.
-  // TODO - legalizers should be able to handle this.
-  if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
+  // Fallback for splats + all supported variable shifts.
+  // Fallback for non-constants AVX2 vXi16 as well.
+  if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
     SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
@@ -24938,40 +25283,32 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
       SrcVT == MVT::i64) {
     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
-    if (DstVT != MVT::f64)
+    if (DstVT != MVT::f64 && DstVT != MVT::i64 &&
+        !(DstVT == MVT::x86mmx && SrcVT.isVector()))
       // This conversion needs to be expanded.
       return SDValue();
 
-    SmallVector<SDValue, 16> Elts;
     SDLoc dl(Op);
-    unsigned NumElts;
-    MVT SVT;
     if (SrcVT.isVector()) {
-      NumElts = SrcVT.getVectorNumElements();
-      SVT = SrcVT.getVectorElementType();
-
       // Widen the vector in input in the case of MVT::v2i32.
       // Example: from MVT::v2i32 to MVT::v4i32.
-      for (unsigned i = 0, e = NumElts; i != e; ++i)
-        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
-                                   DAG.getIntPtrConstant(i, dl)));
+      MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
+                                   SrcVT.getVectorNumElements() * 2);
+      Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
+                        DAG.getUNDEF(SrcVT));
     } else {
       assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
              "Unexpected source type in LowerBITCAST");
-      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
-                                 DAG.getIntPtrConstant(0, dl)));
-      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
-                                 DAG.getIntPtrConstant(1, dl)));
-      NumElts = 2;
-      SVT = MVT::i32;
-    }
-    // Explicitly mark the extra elements as Undef.
-    Elts.append(NumElts, DAG.getUNDEF(SVT));
-
-    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
-    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
-    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
-    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+      Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
+    }
+
+    MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
+    Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
+
+    if (DstVT == MVT::x86mmx)
+      return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
+
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
                        DAG.getIntPtrConstant(0, dl));
   }
 
@@ -25014,7 +25351,7 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
   // PSADBW instruction horizontally add all bytes and leave the result in i64
   // chunks, thus directly computes the pop count for v2i64 and v4i64.
   if (EltVT == MVT::i64) {
-    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
     return DAG.getBitcast(VT, V);
@@ -25026,13 +25363,13 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
     // this is that it lines up the results of two PSADBW instructions to be
     // two v2i64 vectors which concatenated are the 4 population counts. We can
     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
-    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+    SDValue Zeros = DAG.getConstant(0, DL, VT);
     SDValue V32 = DAG.getBitcast(VT, V);
-    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
-    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
+    SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
+    SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
 
     // Do the horizontal sums into two v2i64s.
-    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    Zeros = DAG.getConstant(0, DL, ByteVecVT);
     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, Low), Zeros);
@@ -25805,6 +26142,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SHL_PARTS:
   case ISD::SRA_PARTS:
   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
+  case ISD::FSHL:
+  case ISD::FSHR:               return LowerFunnelShift(Op, Subtarget, DAG);
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
@@ -25872,6 +26211,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
   case ISD::ADD:
   case ISD::SUB:                return LowerADD_SUB(Op, DAG);
+  case ISD::UADDSAT:
+  case ISD::SADDSAT:
+  case ISD::USUBSAT:
+  case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG);
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::UMAX:
@@ -25922,8 +26265,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     llvm_unreachable("Do not know how to custom type legalize this operation!");
   case ISD::MUL: {
     EVT VT = N->getValueType(0);
-    assert(VT.isVector() && VT.getVectorNumElements() == 2 && "Unexpected VT");
-    if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
+    assert(VT.isVector() && "Unexpected VT");
+    if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
+        VT.getVectorNumElements() == 2) {
       // Promote to a pattern that will be turned into PMULUDQ.
       SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
                                N->getOperand(0));
@@ -25935,33 +26279,55 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                        DAG.getConstant(0xffffffff, dl, MVT::v2i64));
       SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v2i64, N0, N1);
       Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
+    } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+               VT.getVectorElementType() == MVT::i8) {
+      // Pre-promote these to vXi16 to avoid op legalization thinking all 16
+      // elements are needed.
+      MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+      SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
+      SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
+      SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
+      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+      unsigned NumConcats = 16 / VT.getVectorNumElements();
+      SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
+      ConcatOps[0] = Res;
+      Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
+      Results.push_back(Res);
     }
     return;
   }
-  case X86ISD::ADDUS:
-  case X86ISD::SUBUS:
+  case ISD::UADDSAT:
+  case ISD::SADDSAT:
+  case ISD::USUBSAT:
+  case ISD::SSUBSAT:
+  case X86ISD::VPMADDWD:
   case X86ISD::AVG: {
-    // Legalize types for X86ISD::AVG/ADDUS/SUBUS by widening.
+    // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
+    // X86ISD::AVG/VPMADDWD by widening.
     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
 
-    auto InVT = N->getValueType(0);
-    assert(InVT.getSizeInBits() < 128);
-    assert(128 % InVT.getSizeInBits() == 0);
+    EVT VT = N->getValueType(0);
+    EVT InVT = N->getOperand(0).getValueType();
+    assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
+           "Expected a VT that divides into 128 bits.");
     unsigned NumConcat = 128 / InVT.getSizeInBits();
 
-    EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
-                                 InVT.getVectorElementType(),
-                                 NumConcat * InVT.getVectorNumElements());
+    EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
+                                    InVT.getVectorElementType(),
+                                    NumConcat * InVT.getVectorNumElements());
+    EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
+                                  VT.getVectorElementType(),
+                                  NumConcat * VT.getVectorNumElements());
 
     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
     Ops[0] = N->getOperand(0);
-    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
     Ops[0] = N->getOperand(1);
-    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
 
-    SDValue Res = DAG.getNode(N->getOpcode(), dl, RegVT, InVec0, InVec1);
-    if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+    SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
+    if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
                         DAG.getIntPtrConstant(0, dl));
     Results.push_back(Res);
     return;
@@ -26004,25 +26370,155 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::SREM:
-  case ISD::UREM:
-    if (N->getValueType(0) == MVT::v2i32) {
-      // If we're already legalizing via widening, we don't need this since
-      // that will scalarize div/rem.
-      if (getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
-        return;
+  case ISD::UREM: {
+    EVT VT = N->getValueType(0);
+    if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) {
+      // If this RHS is a constant splat vector we can widen this and let
+      // division/remainder by constant optimize it.
+      // TODO: Can we do something for non-splat?
+      APInt SplatVal;
+      if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
+        unsigned NumConcats = 128 / VT.getSizeInBits();
+        SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
+        Ops0[0] = N->getOperand(0);
+        EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+        SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
+        SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
+        SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
+        Results.push_back(Res);
+      }
+      return;
+    }
+
+    if (VT == MVT::v2i32) {
       // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the
       // v2i64 and unroll later. But then we create i64 scalar ops which
       // might be slow in 64-bit mode or require a libcall in 32-bit mode.
       Results.push_back(DAG.UnrollVectorOp(N));
       return;
     }
+
+    if (VT.isVector())
+      return;
+
     LLVM_FALLTHROUGH;
+  }
   case ISD::SDIVREM:
   case ISD::UDIVREM: {
     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
     Results.push_back(V);
     return;
   }
+  case ISD::TRUNCATE: {
+    MVT VT = N->getSimpleValueType(0);
+    if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+      return;
+
+    // The generic legalizer will try to widen the input type to the same
+    // number of elements as the widened result type. But this isn't always
+    // the best thing so do some custom legalization to avoid some cases.
+    MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
+    SDValue In = N->getOperand(0);
+    EVT InVT = In.getValueType();
+
+    unsigned InBits = InVT.getSizeInBits();
+    if (128 % InBits == 0) {
+      // 128 bit and smaller inputs should avoid truncate all together and
+      // just use a build_vector that will become a shuffle.
+      // TODO: Widen and use a shuffle directly?
+      MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
+      EVT EltVT = VT.getVectorElementType();
+      unsigned WidenNumElts = WidenVT.getVectorNumElements();
+      SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
+      // Use the original element count so we don't do more scalar opts than
+      // necessary.
+      unsigned MinElts = VT.getVectorNumElements();
+      for (unsigned i=0; i < MinElts; ++i) {
+        SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
+                                  DAG.getIntPtrConstant(i, dl));
+        Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
+      }
+      Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
+      return;
+    }
+    // With AVX512 there are some cases that can use a target specific
+    // truncate node to go from 256/512 to less than 128 with zeros in the
+    // upper elements of the 128 bit result.
+    if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
+      // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
+      if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
+        Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
+        return;
+      }
+      // There's one case we can widen to 512 bits and use VTRUNC.
+      if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
+        In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
+                         DAG.getUNDEF(MVT::v4i64));
+        Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
+        return;
+      }
+    }
+    return;
+  }
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND: {
+    if (!ExperimentalVectorWideningLegalization)
+      return;
+
+    EVT VT = N->getValueType(0);
+    SDValue In = N->getOperand(0);
+    EVT InVT = In.getValueType();
+    if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
+        (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
+      // Custom split this so we can extend i8/i16->i32 invec. This is better
+      // since sign_extend_inreg i8/i16->i64 requires two sra operations. So
+      // this allows the first to be shared.
+      In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
+
+      // Fill a vector with sign bits for each element.
+      SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
+      SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
+
+      // Create an unpackl and unpackh to interleave the sign bits then bitcast
+      // to v2i64.
+      SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
+                                        {0, 4, 1, 5});
+      Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
+      SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
+                                        {2, 6, 3, 7});
+      Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
+
+      SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+      Results.push_back(Res);
+      return;
+    }
+
+    if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) {
+      // Perform custom splitting instead of the two stage extend we would get
+      // by default.
+      EVT LoVT, HiVT;
+      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+      assert(isTypeLegal(LoVT) && "Split VT not legal?");
+
+      bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
+
+      SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG);
+
+      // We need to shift the input over by half the number of elements.
+      unsigned NumElts = InVT.getVectorNumElements();
+      unsigned HalfNumElts = NumElts / 2;
+      SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
+      for (unsigned i = 0; i != HalfNumElts; ++i)
+        ShufMask[i] = i + HalfNumElts;
+
+      SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
+      Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG);
+
+      SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+      Results.push_back(Res);
+    }
+    return;
+  }
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT: {
     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
@@ -26033,7 +26529,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     // Promote these manually to avoid over promotion to v2i64. Type
     // legalization will revisit the v2i32 operation for more cleanup.
     if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
-        getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
+        getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
       // AVX512DQ provides instructions that produce a v2i64 result.
       if (Subtarget.hasDQI())
         return;
@@ -26048,6 +26544,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
+    if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
+      if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+        return;
+
+      // Try to create a 128 bit vector, but don't exceed a 32 bit element.
+      unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
+      MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
+                                       VT.getVectorNumElements());
+      SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
+
+      // Preserve what we know about the size of the original result. Except
+      // when the result is v2i32 since we can't widen the assert.
+      if (PromoteVT != MVT::v2i32)
+        Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
+                                                            : ISD::AssertSext,
+                          dl, PromoteVT, Res,
+                          DAG.getValueType(VT.getVectorElementType()));
+
+      // Truncate back to the original width.
+      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+
+      // Now widen to 128 bits.
+      unsigned NumConcats = 128 / VT.getSizeInBits();
+      MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
+                                      VT.getVectorNumElements() * NumConcats);
+      SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
+      ConcatOps[0] = Res;
+      Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
+      Results.push_back(Res);
+      return;
+    }
+
+
     if (VT == MVT::v2i32) {
       assert((IsSigned || Subtarget.hasAVX512()) &&
              "Can only handle signed conversion without AVX512");
@@ -26400,44 +26929,53 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         Results.push_back(Chain);
         return;
       }
-      EVT IndexVT = Index.getValueType();
-      EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
-                                        IndexVT.getScalarType(), 4);
-      // Otherwise we need to custom widen everything to avoid promotion.
-      Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
-                          DAG.getUNDEF(IndexVT));
-      Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
-                         DAG.getConstant(0, dl, MVT::v2i1));
-      SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
-                        Gather->getBasePtr(), Index, Gather->getScale() };
-      SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
-                                        Gather->getMemoryVT(), dl, Ops,
-                                        Gather->getMemOperand());
-      SDValue Chain = Res.getValue(1);
-      if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
-        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
-                          DAG.getIntPtrConstant(0, dl));
-      Results.push_back(Res);
-      Results.push_back(Chain);
-      return;
+      if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
+        EVT IndexVT = Index.getValueType();
+        EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
+                                          IndexVT.getScalarType(), 4);
+        // Otherwise we need to custom widen everything to avoid promotion.
+        Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
+                            DAG.getUNDEF(IndexVT));
+        Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+                           DAG.getConstant(0, dl, MVT::v2i1));
+        SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
+                          Gather->getBasePtr(), Index, Gather->getScale() };
+        SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
+                                          Gather->getMemoryVT(), dl, Ops,
+                                          Gather->getMemOperand());
+        SDValue Chain = Res.getValue(1);
+        if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
+          Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+                            DAG.getIntPtrConstant(0, dl));
+        Results.push_back(Res);
+        Results.push_back(Chain);
+        return;
+      }
     }
-    break;
+    return;
   }
   case ISD::LOAD: {
-    // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids
-    // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast
-    // since type legalization will try to use an i64 load.
-    assert(N->getValueType(0) == MVT::v2f32 && "Unexpected VT");
+    // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
+    // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
+    // cast since type legalization will try to use an i64 load.
+    MVT VT = N->getSimpleValueType(0);
+    assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
+    if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+      return;
     if (!ISD::isNON_EXTLoad(N))
       return;
     auto *Ld = cast<LoadSDNode>(N);
-    SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(),
+    MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
+    SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
                               Ld->getPointerInfo(),
                               Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
     SDValue Chain = Res.getValue(1);
-    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res);
-    Res = DAG.getBitcast(MVT::v4f32, Res);
+    MVT WideVT = MVT::getVectorVT(LdVT, 2);
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+    MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                  VT.getVectorNumElements() * 2);
+    Res = DAG.getBitcast(CastVT, Res);
     Results.push_back(Res);
     Results.push_back(Chain);
     return;
@@ -26499,8 +27037,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
-  case X86ISD::ADDUS:              return "X86ISD::ADDUS";
-  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   case X86ISD::HADD:               return "X86ISD::HADD";
   case X86ISD::HSUB:               return "X86ISD::HSUB";
   case X86ISD::FHADD:              return "X86ISD::FHADD";
@@ -26547,8 +27083,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::LDEC:               return "X86ISD::LDEC";
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
-  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
-  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
@@ -26716,8 +27250,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FGETEXPS_RND:       return "X86ISD::FGETEXPS_RND";
   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
   case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
-  case X86ISD::ADDS:               return "X86ISD::ADDS";
-  case X86ISD::SUBS:               return "X86ISD::SUBS";
   case X86ISD::AVG:                return "X86ISD::AVG";
   case X86ISD::MULHRS:             return "X86ISD::MULHRS";
   case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
@@ -29493,34 +30025,29 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   }
   case X86ISD::PACKUS: {
     // PACKUS is just a truncation if the upper half is zero.
-    // TODO: Add DemandedElts support.
+    APInt DemandedLHS, DemandedRHS;
+    getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+    Known.One = APInt::getAllOnesValue(BitWidth * 2);
+    Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
+
     KnownBits Known2;
-    DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
-    DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
-    Known.One &= Known2.One;
-    Known.Zero &= Known2.Zero;
+    if (!!DemandedLHS) {
+      DAG.computeKnownBits(Op.getOperand(0), Known2, DemandedLHS, Depth + 1);
+      Known.One &= Known2.One;
+      Known.Zero &= Known2.Zero;
+    }
+    if (!!DemandedRHS) {
+      DAG.computeKnownBits(Op.getOperand(1), Known2, DemandedRHS, Depth + 1);
+      Known.One &= Known2.One;
+      Known.Zero &= Known2.Zero;
+    }
+
     if (Known.countMinLeadingZeros() < BitWidth)
       Known.resetAll();
     Known = Known.trunc(BitWidth);
     break;
   }
-  case X86ISD::VZEXT: {
-    // TODO: Add DemandedElts support.
-    SDValue N0 = Op.getOperand(0);
-    unsigned NumElts = VT.getVectorNumElements();
-
-    EVT SrcVT = N0.getValueType();
-    unsigned InNumElts = SrcVT.getVectorNumElements();
-    unsigned InBitWidth = SrcVT.getScalarSizeInBits();
-    assert(InNumElts >= NumElts && "Illegal VZEXT input");
-
-    Known = KnownBits(InBitWidth);
-    APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
-    DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
-    Known = Known.zext(BitWidth);
-    Known.Zero.setBitsFrom(InBitWidth);
-    break;
-  }
   case X86ISD::CMOV: {
     DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
     // If we don't know any bits, early out.
@@ -29598,14 +30125,6 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
     return VTBits;
 
-  case X86ISD::VSEXT: {
-    // TODO: Add DemandedElts support.
-    SDValue Src = Op.getOperand(0);
-    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
-    Tmp += VTBits - Src.getScalarValueSizeInBits();
-    return Tmp;
-  }
-
   case X86ISD::VTRUNC: {
     // TODO: Add DemandedElts support.
     SDValue Src = Op.getOperand(0);
@@ -29619,10 +30138,16 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
 
   case X86ISD::PACKSS: {
     // PACKSS is just a truncation if the sign bits extend to the packed size.
-    // TODO: Add DemandedElts support.
+    APInt DemandedLHS, DemandedRHS;
+    getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
+                        DemandedRHS);
+
     unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
-    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
-    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+    unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
+    if (!!DemandedLHS)
+      Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+    if (!!DemandedRHS)
+      Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
     unsigned Tmp = std::min(Tmp0, Tmp1);
     if (Tmp > (SrcBits - VTBits))
       return Tmp - (SrcBits - VTBits);
@@ -29676,21 +30201,6 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
   return N;
 }
 
-/// Returns true (and the GlobalValue and the offset) if the node is a
-/// GlobalAddress + offset.
-bool X86TargetLowering::isGAPlusOffset(SDNode *N,
-                                       const GlobalValue* &GA,
-                                       int64_t &Offset) const {
-  if (N->getOpcode() == X86ISD::Wrapper) {
-    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
-      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
-      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
-      return true;
-    }
-  }
-  return TargetLowering::isGAPlusOffset(N, GA, Offset);
-}
-
 // Attempt to match a combined shuffle mask against supported unary shuffle
 // instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
@@ -29729,10 +30239,12 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                             MVT::getIntegerVT(MaskEltSize);
         SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
 
-        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
+        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
           V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
-          Shuffle = unsigned(X86ISD::VZEXT);
-        } else
+
+        if (SrcVT.getVectorNumElements() == NumDstElts)
+          Shuffle = unsigned(ISD::ZERO_EXTEND);
+        else
           Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
 
         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
@@ -30756,7 +31268,10 @@ static SDValue combineX86ShufflesRecursively(
   if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
     return SDValue();
 
-  assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
+  // TODO - Add support for more than 2 inputs.
+  if (2 < OpInputs.size())
+    return SDValue();
+
   SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
   SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
 
@@ -31773,11 +32288,69 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
 
   // Handle special case opcodes.
   switch (Opc) {
+  case X86ISD::VSHL:
+  case X86ISD::VSRL:
+  case X86ISD::VSRA: {
+    // We only need the bottom 64-bits of the (128-bit) shift amount.
+    SDValue Amt = Op.getOperand(1);
+    MVT AmtVT = Amt.getSimpleValueType();
+    assert(AmtVT.is128BitVector() && "Unexpected value type");
+    APInt AmtUndef, AmtZero;
+    unsigned NumAmtElts = AmtVT.getVectorNumElements();
+    APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
+    if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
+                                   Depth + 1))
+      return true;
+    LLVM_FALLTHROUGH;
+  }
+  case X86ISD::VSHLI:
+  case X86ISD::VSRLI:
+  case X86ISD::VSRAI: {
+    SDValue Src = Op.getOperand(0);
+    APInt SrcUndef;
+    if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
+                                   Depth + 1))
+      return true;
+    // TODO convert SrcUndef to KnownUndef.
+    break;
+  }
+  case X86ISD::CVTSI2P:
+  case X86ISD::CVTUI2P: {
+    SDValue Src = Op.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    APInt SrcUndef, SrcZero;
+    APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+    if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
+                                   Depth + 1))
+      return true;
+    break;
+  }
+  case X86ISD::PACKSS:
+  case X86ISD::PACKUS: {
+    APInt DemandedLHS, DemandedRHS;
+    getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+    APInt SrcUndef, SrcZero;
+    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
+                                   SrcZero, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
+                                   SrcZero, TLO, Depth + 1))
+      return true;
+    break;
+  }
   case X86ISD::VBROADCAST: {
     SDValue Src = Op.getOperand(0);
     MVT SrcVT = Src.getSimpleValueType();
     if (!SrcVT.isVector())
       return false;
+    // Don't bother broadcasting if we just need the 0'th element.
+    if (DemandedElts == 1) {
+      if(Src.getValueType() != VT)
+        Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
+                             SDLoc(Op));
+      return TLO.CombineTo(Op, Src);
+    }
     APInt SrcUndef, SrcZero;
     APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
@@ -31797,13 +32370,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   }
 
   // Simplify target shuffles.
-  if (!isTargetShuffle(Opc))
+  if (!isTargetShuffle(Opc) || !VT.isSimple())
     return false;
 
   // Get target shuffle mask.
+  bool IsUnary;
   SmallVector<int, 64> OpMask;
   SmallVector<SDValue, 2> OpInputs;
-  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, TLO.DAG))
+  if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs,
+                            OpMask, IsUnary))
     return false;
 
   // Shuffle inputs must be the same type as the result.
@@ -31811,11 +32386,19 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
                    [VT](SDValue V) { return VT != V.getValueType(); }))
     return false;
 
+  // Clear known elts that might have been set above.
+  KnownZero.clearAllBits();
+  KnownUndef.clearAllBits();
+
   // Check if shuffle mask can be simplified to undef/zero/identity.
   int NumSrcs = OpInputs.size();
-  for (int i = 0; i != NumElts; ++i)
+  for (int i = 0; i != NumElts; ++i) {
+    int &M = OpMask[i];
     if (!DemandedElts[i])
-      OpMask[i] = SM_SentinelUndef;
+      M = SM_SentinelUndef;
+    else if (0 <= M && OpInputs[M / NumElts].isUndef())
+      M = SM_SentinelUndef;
+  }
 
   if (isUndefInRange(OpMask, 0, NumElts)) {
     KnownUndef.setAllBits();
@@ -31860,8 +32443,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
 }
 
 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
-    SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
-    TargetLoweringOpt &TLO, unsigned Depth) const {
+    SDValue Op, const APInt &OriginalDemandedBits,
+    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+    unsigned Depth) const {
+  EVT VT = Op.getValueType();
   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
   unsigned Opc = Op.getOpcode();
   switch(Opc) {
@@ -31884,41 +32469,129 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
       if (ShiftImm->getAPIntValue().uge(BitWidth))
         break;
 
-      KnownBits KnownOp;
       unsigned ShAmt = ShiftImm->getZExtValue();
       APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
-      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
-                               Depth + 1))
+
+      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
+                               OriginalDemandedElts, Known, TLO, Depth + 1))
         return true;
+
+      assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+      Known.Zero <<= ShAmt;
+      Known.One <<= ShAmt;
+
+      // Low bits known zero.
+      Known.Zero.setLowBits(ShAmt);
     }
     break;
   }
-  case X86ISD::VSRAI:
   case X86ISD::VSRLI: {
     if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
       if (ShiftImm->getAPIntValue().uge(BitWidth))
         break;
 
-      KnownBits KnownOp;
       unsigned ShAmt = ShiftImm->getZExtValue();
       APInt DemandedMask = OriginalDemandedBits << ShAmt;
 
+      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
+                               OriginalDemandedElts, Known, TLO, Depth + 1))
+        return true;
+
+      assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+      Known.Zero.lshrInPlace(ShAmt);
+      Known.One.lshrInPlace(ShAmt);
+
+      // High bits known zero.
+      Known.Zero.setHighBits(ShAmt);
+    }
+    break;
+  }
+  case X86ISD::VSRAI: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
+      if (ShiftImm->getAPIntValue().uge(BitWidth))
+        break;
+
+      unsigned ShAmt = ShiftImm->getZExtValue();
+      APInt DemandedMask = OriginalDemandedBits << ShAmt;
+
+      // If we just want the sign bit then we don't need to shift it.
+      if (OriginalDemandedBits.isSignMask())
+        return TLO.CombineTo(Op, Op0);
+
+      // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
+      if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
+        SDValue Op00 = Op0.getOperand(0);
+        unsigned NumSignBits =
+            TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
+        if (ShAmt < NumSignBits)
+          return TLO.CombineTo(Op, Op00);
+      }
+
       // If any of the demanded bits are produced by the sign extension, we also
       // demand the input sign bit.
-      if (Opc == X86ISD::VSRAI &&
-          OriginalDemandedBits.countLeadingZeros() < ShAmt)
+      if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
         DemandedMask.setSignBit();
 
-      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
-                               Depth + 1))
+      if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+                               TLO, Depth + 1))
         return true;
+
+      assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+      Known.Zero.lshrInPlace(ShAmt);
+      Known.One.lshrInPlace(ShAmt);
+
+      // If the input sign bit is known to be zero, or if none of the top bits
+      // are demanded, turn this into an unsigned shift right.
+      if (Known.Zero[BitWidth - ShAmt - 1] ||
+          OriginalDemandedBits.countLeadingZeros() >= ShAmt)
+        return TLO.CombineTo(
+            Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
+
+      // High bits are known one.
+      if (Known.One[BitWidth - ShAmt - 1])
+        Known.One.setHighBits(ShAmt);
     }
     break;
   }
+  case X86ISD::MOVMSK: {
+    SDValue Src = Op.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    unsigned SrcBits = SrcVT.getScalarSizeInBits();
+    unsigned NumElts = SrcVT.getVectorNumElements();
+
+    // If we don't need the sign bits at all just return zero.
+    if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
+      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+    // Only demand the vector elements of the sign bits we need.
+    APInt KnownUndef, KnownZero;
+    APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
+    if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+                                   TLO, Depth + 1))
+      return true;
+
+    Known.Zero = KnownZero.zextOrSelf(BitWidth);
+    Known.Zero.setHighBits(BitWidth - NumElts);
+
+    // MOVMSK only uses the MSB from each vector element.
+    KnownBits KnownSrc;
+    if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
+                             KnownSrc, TLO, Depth + 1))
+      return true;
+
+    if (KnownSrc.One[SrcBits - 1])
+      Known.One.setLowBits(NumElts);
+    else if (KnownSrc.Zero[SrcBits - 1])
+      Known.Zero.setLowBits(NumElts);
+    return false;
+  }
   }
 
   return TargetLowering::SimplifyDemandedBitsForTargetNode(
-      Op, OriginalDemandedBits, Known, TLO, Depth);
+      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
 }
 
 /// Check if a vector extract from a target-specific shuffle of a load can be
@@ -31972,9 +32645,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   if (Idx == SM_SentinelUndef)
     return DAG.getUNDEF(EltVT);
 
+  // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
+  // won't handle it.
+  if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
+    return SDValue();
+
   assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
-  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
-                                         : ShuffleOps[1];
+  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1];
 
   // If inputs to shuffle are the same for both ops, then allow 2 uses
   unsigned AllowedUses =
@@ -32584,7 +33261,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
   // ready for the PHMINPOS.
   if (ExtractVT == MVT::i8) {
     SDValue Upper = DAG.getVectorShuffle(
-        SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
+        SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
         {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
     MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
   }
@@ -33165,9 +33842,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
   SDLoc DL(N);
   SDValue Cond = N->getOperand(0);
-  // Get the LHS/RHS of the select.
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
+
+  // Try simplification again because we use this function to optimize
+  // SHRUNKBLEND nodes that are not handled by the generic combiner.
+  if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
+    return V;
+
   EVT VT = LHS.getValueType();
   EVT CondVT = Cond.getValueType();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -33362,7 +34044,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   // Since SKX these selects have a proper lowering.
   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
       CondVT.getVectorElementType() == MVT::i1 &&
-      VT.getVectorNumElements() > 4 &&
+      (ExperimentalVectorWideningLegalization ||
+       VT.getVectorNumElements() > 4) &&
       (VT.getVectorElementType() == MVT::i8 ||
        VT.getVectorElementType() == MVT::i16)) {
     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
@@ -33427,33 +34110,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
       SDValue CondRHS = Cond->getOperand(1);
 
-      auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                             ArrayRef<SDValue> Ops) {
-        return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
-      };
-
       // Look for a general sub with unsigned saturation first.
       // x >= y ? x-y : 0 --> subus x, y
       // x >  y ? x-y : 0 --> subus x, y
       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
           Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
-        return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                                SUBUSBuilder);
+        return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
 
       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
         if (isa<BuildVectorSDNode>(CondRHS)) {
           // If the RHS is a constant we have to reverse the const
           // canonicalization.
           // x > C-1 ? x+-C : 0 --> subus x, C
-          auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+          // TODO: Handle build_vectors with undef elements.
+          auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
             return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
           };
           if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
-              ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
+              ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) {
             OpRHS = DAG.getNode(ISD::SUB, DL, VT,
                                 DAG.getConstant(0, DL, VT), OpRHS);
-            return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                                    SUBUSBuilder);
+            return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
           }
 
           // Another special case: If C was a sign bit, the sub has been
@@ -33465,11 +34142,10 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
             if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
                 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
                 OpRHSConst->getAPIntValue().isSignMask()) {
-              OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
               // Note that we have to rebuild the RHS constant here to ensure we
               // don't rely on particular values of undef lanes.
-              return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                                      SUBUSBuilder);
+              OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
+              return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
             }
           }
         }
@@ -33502,11 +34178,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
       SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
 
-      auto ADDUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                             ArrayRef<SDValue> Ops) {
-        return DAG.getNode(X86ISD::ADDUS, DL, Ops[0].getValueType(), Ops);
-      };
-
       // Canonicalize condition operands.
       if (CC == ISD::SETUGE) {
         std::swap(CondLHS, CondRHS);
@@ -33518,21 +34189,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
       // x+y >= x ? x+y : ~0 --> addus x, y
       if (CC == ISD::SETULE && Other == CondRHS &&
           (OpLHS == CondLHS || OpRHS == CondLHS))
-        return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                                ADDUSBuilder);
+        return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
 
       if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
           CondLHS == OpLHS) {
         // If the RHS is a constant we have to reverse the const
         // canonicalization.
         // x > ~C ? x+C : ~0 --> addus x, C
-        auto MatchADDUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+        auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
           return Cond->getAPIntValue() == ~Op->getAPIntValue();
         };
         if (CC == ISD::SETULE &&
-            ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchADDUS))
-          return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
-                                  ADDUSBuilder);
+            ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
+          return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
       }
     }
   }
@@ -34106,24 +34775,8 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
   for (unsigned i = 0; i < 2; i++) {
     SDValue Opd = N->getOperand(i);
 
-    // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
-    // compute signbits for it separately.
-    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
-      // For anyextend, it is safe to assume an appropriate number of leading
-      // sign/zero bits.
-      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
-        SignBits[i] = 25;
-      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
-               MVT::i16)
-        SignBits[i] = 17;
-      else
-        return false;
-      IsPositive[i] = true;
-    } else {
-      SignBits[i] = DAG.ComputeNumSignBits(Opd);
-      if (DAG.SignBitIsZero(Opd))
-        IsPositive[i] = true;
-    }
+    SignBits[i] = DAG.ComputeNumSignBits(Opd);
+    IsPositive[i] = DAG.SignBitIsZero(Opd);
   }
 
   bool AllPositive = IsPositive[0] && IsPositive[1];
@@ -34208,7 +34861,8 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
 
-  if (NumElts >= OpsVT.getVectorNumElements()) {
+  if (ExperimentalVectorWideningLegalization ||
+      NumElts >= OpsVT.getVectorNumElements()) {
     // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
     // lower part is needed.
     SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
@@ -34397,12 +35051,24 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
+  // Also allow v2i32 if it will be widened.
   MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+  if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) ||
+        DAG.getTargetLoweringInfo().isTypeLegal(WVT)))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+
+  // If we are zero extending two steps without SSE4.1, its better to reduce
+  // the vmul width instead.
+  if (!Subtarget.hasSSE41() &&
+      (N0.getOpcode() == ISD::ZERO_EXTEND &&
+       N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
+      (N1.getOpcode() == ISD::ZERO_EXTEND &&
+       N1.getOperand(0).getScalarValueSizeInBits() <= 8))
+    return SDValue();
+
   APInt Mask17 = APInt::getHighBitsSet(32, 17);
   if (!DAG.MaskedValueIsZero(N1, Mask17) ||
       !DAG.MaskedValueIsZero(N0, Mask17))
@@ -34819,6 +35485,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
          "Unexpected PACKSS/PACKUS input type");
 
+  bool IsSigned = (X86ISD::PACKSS == Opcode);
+
   // Constant Folding.
   APInt UndefElts0, UndefElts1;
   SmallVector<APInt, 32> EltBits0, EltBits1;
@@ -34831,7 +35499,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
     unsigned NumSrcElts = NumDstElts / 2;
     unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
     unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
-    bool IsSigned = (X86ISD::PACKSS == Opcode);
 
     APInt Undefs(NumDstElts, 0);
     SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
@@ -34875,6 +35542,25 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
     return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
   }
 
+  // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
+  // truncate to create a larger truncate.
+  if (Subtarget.hasAVX512() &&
+      N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
+      N0.getOperand(0).getValueType() == MVT::v8i32) {
+    if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
+        (!IsSigned &&
+         DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
+      if (Subtarget.hasVLX())
+        return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
+
+      // Widen input to v16i32 so we can truncate that.
+      SDLoc dl(N);
+      SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
+                                   N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
+    }
+  }
+
   // Attempt to combine as shuffle.
   SDValue Op(N, 0);
   if (SDValue Res =
@@ -34886,6 +35572,28 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget &Subtarget) {
+  assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
+          X86ISD::VSRL == N->getOpcode()) &&
+         "Unexpected shift opcode");
+  EVT VT = N->getValueType(0);
+
+  // Shift zero -> zero.
+  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+    return DAG.getConstant(0, SDLoc(N), VT);
+
+  APInt KnownUndef, KnownZero;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
@@ -34900,13 +35608,14 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
          "Unexpected value type");
+  assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
 
   // Out of range logical bit shifts are guaranteed to be zero.
   // Out of range arithmetic bit shifts splat the sign bit.
-  APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
-  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
+  unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue();
+  if (ShiftVal >= NumBitsPerElt) {
     if (LogicalShift)
-      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+      return DAG.getConstant(0, SDLoc(N), VT);
     else
       ShiftVal = NumBitsPerElt - 1;
   }
@@ -34917,26 +35626,21 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
 
   // Shift zero -> zero.
   if (ISD::isBuildVectorAllZeros(N0.getNode()))
-    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
-
-  // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
-  // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
-  // TODO - support other sra opcodes as needed.
-  if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
-      N0.getOpcode() == X86ISD::VSRAI)
-    return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
-
-  // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
-  if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
-      N1 == N0.getOperand(1)) {
-    SDValue N00 = N0.getOperand(0);
-    unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
-    if (ShiftVal.ult(NumSignBits))
-      return N00;
+    return DAG.getConstant(0, SDLoc(N), VT);
+
+  // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
+  // clamped to (NumBitsPerElt - 1).
+  if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
+    unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+    unsigned NewShiftVal = ShiftVal + ShiftVal2;
+    if (NewShiftVal >= NumBitsPerElt)
+      NewShiftVal = NumBitsPerElt - 1;
+    return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
+                       DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8));
   }
 
   // We can decode 'whole byte' logical bit shifts as shuffles.
-  if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
+  if (LogicalShift && (ShiftVal % 8) == 0) {
     SDValue Op(N, 0);
     if (SDValue Res = combineX86ShufflesRecursively(
             {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
@@ -34951,14 +35655,13 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
     assert(EltBits.size() == VT.getVectorNumElements() &&
            "Unexpected shift value type");
-    unsigned ShiftImm = ShiftVal.getZExtValue();
     for (APInt &Elt : EltBits) {
       if (X86ISD::VSHLI == Opcode)
-        Elt <<= ShiftImm;
+        Elt <<= ShiftVal;
       else if (X86ISD::VSRAI == Opcode)
-        Elt.ashrInPlace(ShiftImm);
+        Elt.ashrInPlace(ShiftVal);
       else
-        Elt.lshrInPlace(ShiftImm);
+        Elt.lshrInPlace(ShiftVal);
     }
     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
   }
@@ -35130,8 +35833,8 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
 // some of the transition sequences.
 // Even with AVX-512 this is still useful for removing casts around logical
 // operations on vXi1 mask types.
-static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
-                                   const X86Subtarget &Subtarget) {
+static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
   assert(VT.isVector() && "Expected vector type");
 
@@ -35871,10 +36574,10 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
   // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
   // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
-  unsigned Bits = VT.getSizeInBits();
+  unsigned Bits = VT.getScalarSizeInBits();
   if (ShAmt1.getOpcode() == ISD::SUB) {
     SDValue Sum = ShAmt1.getOperand(0);
-    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
+    if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
       if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
@@ -35884,8 +36587,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                            DAG.getNode(ISD::TRUNCATE, DL,
                                        MVT::i8, ShAmt0));
     }
-  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
-    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
+  } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
+    auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
     if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
       return DAG.getNode(Opc, DL, VT,
                          N0.getOperand(0), N1.getOperand(0),
@@ -35893,7 +36596,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                                        MVT::i8, ShAmt0));
   } else if (ShAmt1.getOpcode() == ISD::XOR) {
     SDValue Mask = ShAmt1.getOperand(1);
-    if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
+    if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
       unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
       SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
       if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
@@ -36592,7 +37295,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                      Mld->getBasePtr(), NewMask, WidePassThru,
                                      Mld->getMemoryVT(), Mld->getMemOperand(),
                                      ISD::NON_EXTLOAD);
-  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
+  SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG);
   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
 }
 
@@ -36740,6 +37443,18 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
+  // Convert a store of vXi1 into a store of iX and a bitcast.
+  if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
+      VT.getVectorElementType() == MVT::i1) {
+
+    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
+    StoredVal = DAG.getBitcast(NewVT, StoredVal);
+
+    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+
   // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
   // This will avoid a copy to k-register.
   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
@@ -37078,7 +37793,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
     if (!LHS.getOperand(1).isUndef())
       B = LHS.getOperand(1);
     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
-    std::copy(Mask.begin(), Mask.end(), LMask.begin());
+    llvm::copy(Mask, LMask.begin());
   } else {
     A = LHS;
     for (unsigned i = 0; i != NumElts; ++i)
@@ -37095,7 +37810,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
     if (!RHS.getOperand(1).isUndef())
       D = RHS.getOperand(1);
     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
-    std::copy(Mask.begin(), Mask.end(), RMask.begin());
+    llvm::copy(Mask, RMask.begin());
   } else {
     C = RHS;
     for (unsigned i = 0; i != NumElts; ++i)
@@ -37184,6 +37899,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
 /// the codegen.
 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
+///       anything that is guaranteed to be transformed by DAGCombiner.
 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget,
                                           const SDLoc &DL) {
@@ -37434,9 +38151,11 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
   if (!Subtarget.hasSSE2())
     return SDValue();
 
-  // Only handle vXi16 types that are at least 128-bits.
+  // Only handle vXi16 types that are at least 128-bits unless they will be
+  // widened.
   if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
-      VT.getVectorNumElements() < 8)
+      (!ExperimentalVectorWideningLegalization &&
+       VT.getVectorNumElements() < 8))
     return SDValue();
 
   // Input type should be vXi32.
@@ -38059,6 +38778,20 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
 }
 
+static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  APInt KnownUndef, KnownZero;
+  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
@@ -38071,7 +38804,7 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
 
   // ANDNP(x, 0) -> 0
   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
-    return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
+    return DAG.getConstant(0, SDLoc(N), VT);
 
   // Turn ANDNP back to AND if input is inverted.
   if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
@@ -38401,6 +39134,9 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
+  if (ExperimentalVectorWideningLegalization)
+    return SDValue();
+
   unsigned Opcode = N->getOpcode();
   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
     return SDValue();
@@ -38415,6 +39151,22 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
   EVT InVT = N0.getValueType();
   EVT InSVT = InVT.getScalarType();
 
+  // FIXME: Generic DAGCombiner previously had a bug that would cause a
+  // sign_extend of setcc to sometimes return the original node and tricked it
+  // into thinking CombineTo was used which prevented the target combines from
+  // running.
+  // Earlying out here to avoid regressions like this
+  //  (v4i32 (sext (v4i1 (setcc (v4i16)))))
+  // Becomes
+  //  (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef))))
+  // Type legalized to
+  //  (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32)))))))
+  // Leading to a packssdw+pmovsxwd
+  // We could write a DAG combine to fix this, but really we shouldn't be
+  // creating sext_invec that's forcing v8i16 into the DAG.
+  if (N0.getOpcode() == ISD::SETCC)
+    return SDValue();
+
   // Input type must be a vector and we must be extending legal integer types.
   if (!VT.isVector() || VT.getVectorNumElements() < 2)
     return SDValue();
@@ -38575,7 +39327,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
     return V;
 
   if (VT.isVector())
-    if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
+    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
       return R;
 
   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
@@ -38746,7 +39498,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
     return V;
 
   if (VT.isVector())
-    if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
+    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
       return R;
 
   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
@@ -38899,7 +39651,9 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
   // NOTE: The element count check is to ignore operand types that need to
   // go through type promotion to a 128-bit vector.
   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
-      VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
+      VT.getVectorElementType() == MVT::i1 &&
+      (ExperimentalVectorWideningLegalization ||
+       VT.getVectorNumElements() > 4) &&
       (OpVT.getVectorElementType() == MVT::i8 ||
        OpVT.getVectorElementType() == MVT::i16)) {
     SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
@@ -38920,10 +39674,11 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
   SDValue Src = N->getOperand(0);
   MVT SrcVT = Src.getSimpleValueType();
+  MVT VT = N->getSimpleValueType(0);
 
   // Perform constant folding.
   if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
-    assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
+    assert(VT== MVT::i32 && "Unexpected result type");
     APInt Imm(32, 0);
     for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
       SDValue In = Src.getOperand(Idx);
@@ -38931,7 +39686,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
           cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
         Imm.setBit(Idx);
     }
-    return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
+    return DAG.getConstant(Imm, SDLoc(N), VT);
   }
 
   // Look through int->fp bitcasts that don't change the element width.
@@ -38941,11 +39696,10 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
         EVT(SrcVT).changeVectorElementTypeToInteger())
     Src = Src.getOperand(0);
 
+  // Simplify the inputs.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
-  // MOVMSK only uses the MSB from each vector element.
-  APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
-  if (TLI.SimplifyDemandedBits(Src, DemandedMask, DCI))
+  APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
     return SDValue(N, 0);
 
   // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)).
@@ -39232,6 +39986,159 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static bool needCarryOrOverflowFlag(SDValue Flags) {
+  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
+
+  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+         UI != UE; ++UI) {
+    SDNode *User = *UI;
+
+    X86::CondCode CC;
+    switch (User->getOpcode()) {
+    default:
+      // Be conservative.
+      return true;
+    case X86ISD::SETCC:
+    case X86ISD::SETCC_CARRY:
+      CC = (X86::CondCode)User->getConstantOperandVal(0);
+      break;
+    case X86ISD::BRCOND:
+      CC = (X86::CondCode)User->getConstantOperandVal(2);
+      break;
+    case X86ISD::CMOV:
+      CC = (X86::CondCode)User->getConstantOperandVal(2);
+      break;
+    }
+
+    switch (CC) {
+    default: break;
+    case X86::COND_A: case X86::COND_AE:
+    case X86::COND_B: case X86::COND_BE:
+    case X86::COND_O: case X86::COND_NO:
+    case X86::COND_G: case X86::COND_GE:
+    case X86::COND_L: case X86::COND_LE:
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static bool onlyZeroFlagUsed(SDValue Flags) {
+  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
+
+  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+         UI != UE; ++UI) {
+    SDNode *User = *UI;
+
+    unsigned CCOpNo;
+    switch (User->getOpcode()) {
+    default:
+      // Be conservative.
+      return false;
+    case X86ISD::SETCC:       CCOpNo = 0; break;
+    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
+    case X86ISD::BRCOND:      CCOpNo = 2; break;
+    case X86ISD::CMOV:        CCOpNo = 2; break;
+    }
+
+    X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
+    if (CC != X86::COND_E && CC != X86::COND_NE)
+      return false;
+  }
+
+  return true;
+}
+
+static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
+  // Only handle test patterns.
+  if (!isNullConstant(N->getOperand(1)))
+    return SDValue();
+
+  // If we have a CMP of a truncated binop, see if we can make a smaller binop
+  // and use its flags directly.
+  // TODO: Maybe we should try promoting compares that only use the zero flag
+  // first if we can prove the upper bits with computeKnownBits?
+  SDLoc dl(N);
+  SDValue Op = N->getOperand(0);
+  EVT VT = Op.getValueType();
+
+  // If we have a constant logical shift that's only used in a comparison
+  // against zero turn it into an equivalent AND. This allows turning it into
+  // a TEST instruction later.
+  if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
+      Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
+      onlyZeroFlagUsed(SDValue(N, 0))) {
+    EVT VT = Op.getValueType();
+    unsigned BitWidth = VT.getSizeInBits();
+    unsigned ShAmt = Op.getConstantOperandVal(1);
+    if (ShAmt < BitWidth) { // Avoid undefined shifts.
+      APInt Mask = Op.getOpcode() == ISD::SRL
+                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+      if (Mask.isSignedIntN(32)) {
+        Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
+                         DAG.getConstant(Mask, dl, VT));
+        return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                           DAG.getConstant(0, dl, VT));
+      }
+    }
+  }
+
+
+  // Look for a truncate with a single use.
+  if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
+    return SDValue();
+
+  Op = Op.getOperand(0);
+
+  // Arithmetic op can only have one use.
+  if (!Op.hasOneUse())
+    return SDValue();
+
+  unsigned NewOpc;
+  switch (Op.getOpcode()) {
+  default: return SDValue();
+  case ISD::AND:
+    // Skip and with constant. We have special handling for and with immediate
+    // during isel to generate test instructions.
+    if (isa<ConstantSDNode>(Op.getOperand(1)))
+      return SDValue();
+    NewOpc = X86ISD::AND;
+    break;
+  case ISD::OR:  NewOpc = X86ISD::OR;  break;
+  case ISD::XOR: NewOpc = X86ISD::XOR; break;
+  case ISD::ADD:
+    // If the carry or overflow flag is used, we can't truncate.
+    if (needCarryOrOverflowFlag(SDValue(N, 0)))
+      return SDValue();
+    NewOpc = X86ISD::ADD;
+    break;
+  case ISD::SUB:
+    // If the carry or overflow flag is used, we can't truncate.
+    if (needCarryOrOverflowFlag(SDValue(N, 0)))
+      return SDValue();
+    NewOpc = X86ISD::SUB;
+    break;
+  }
+
+  // We found an op we can narrow. Truncate its inputs.
+  SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
+  SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
+
+  // Use a X86 specific opcode to avoid DAG combine messing with it.
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+  Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
+
+  // For AND, keep a CMP so that we can match the test pattern.
+  if (NewOpc == X86ISD::AND)
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, dl, VT));
+
+  // Return the flags.
+  return Op.getValue(1);
+}
+
 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
     MVT VT = N->getSimpleValueType(0);
@@ -39278,21 +40185,6 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
-/// which is more useful than 0/1 in some cases.
-static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
-  SDLoc DL(N);
-  // "Condition code B" is also known as "the carry flag" (CF).
-  SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
-  SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
-  MVT VT = N->getSimpleValueType(0);
-  if (VT == MVT::i8)
-    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
-
-  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
-  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
-}
-
 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
 /// with CMP+{ADC, SBB}.
@@ -39363,13 +40255,11 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
   }
 
   if (CC == X86::COND_B) {
-    // X + SETB Z --> X + (mask SBB Z, Z)
-    // X - SETB Z --> X - (mask SBB Z, Z)
-    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
-    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
-    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
-      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
-    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+    // X + SETB Z --> adc X, 0
+    // X - SETB Z --> sbb X, 0
+    return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
+                       DAG.getVTList(VT, MVT::i32), X,
+                       DAG.getConstant(0, DL, VT), Y.getOperand(1));
   }
 
   if (CC == X86::COND_A) {
@@ -39387,10 +40277,9 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
                                    EFLAGS.getNode()->getVTList(),
                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
-      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
-      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
-        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
-      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+      return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
+                         DAG.getVTList(VT, MVT::i32), X,
+                         DAG.getConstant(0, DL, VT), NewEFLAGS);
     }
   }
 
@@ -39728,6 +40617,39 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
                           PMADDBuilder);
 }
 
+// Try to turn (add (umax X, C), -C) into (psubus X, C)
+static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // psubus is available in SSE2 for i8 and i16 vectors.
+  if (!VT.isVector() || VT.getVectorNumElements() < 2 ||
+      !isPowerOf2_32(VT.getVectorNumElements()) ||
+      !(VT.getVectorElementType() == MVT::i8 ||
+        VT.getVectorElementType() == MVT::i16))
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() != ISD::UMAX)
+    return SDValue();
+
+  // The add should have a constant that is the negative of the max.
+  // TODO: Handle build_vectors with undef elements.
+  auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
+    return Max->getAPIntValue() == (-Op->getAPIntValue());
+  };
+  if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT))
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0),
+                     Op0.getOperand(1));
+}
+
 // Attempt to turn this pattern into PMADDWD.
 // (mul (add (zext (build_vector)), (zext (build_vector))),
 //      (add (zext (build_vector)), (zext (build_vector)))
@@ -39883,6 +40805,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineIncDecVector(N, DAG))
     return V;
 
+  if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget))
+    return V;
+
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
 
@@ -39928,16 +40853,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
   } else
     return SDValue();
 
-  auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                         ArrayRef<SDValue> Ops) {
-    return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
+  auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                           ArrayRef<SDValue> Ops) {
+    return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
   };
 
   // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
   // special preprocessing in some cases.
   if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
-                            { SubusLHS, SubusRHS }, SUBUSBuilder);
+                            { SubusLHS, SubusRHS }, USUBSATBuilder);
 
   // Special preprocessing case can be only applied
   // if the value was zero extended from 16 bit,
@@ -39969,7 +40894,7 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
   SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
   SDValue Psubus =
       SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
-                       { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
+                       { NewSubusLHS, NewSubusRHS }, USUBSATBuilder);
   // Zero extend the result, it may be used somewhere as 32 bit,
   // if not zext and following trunc will shrink.
   return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
@@ -40022,98 +40947,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
 
-static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
-                             TargetLowering::DAGCombinerInfo &DCI,
-                             const X86Subtarget &Subtarget) {
-  if (DCI.isBeforeLegalize())
-    return SDValue();
-
-  SDLoc DL(N);
-  unsigned Opcode = N->getOpcode();
-  MVT VT = N->getSimpleValueType(0);
-  MVT SVT = VT.getVectorElementType();
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned EltSizeInBits = SVT.getSizeInBits();
-
-  SDValue Op = N->getOperand(0);
-  MVT OpVT = Op.getSimpleValueType();
-  MVT OpEltVT = OpVT.getVectorElementType();
-  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
-  unsigned InputBits = OpEltSizeInBits * NumElts;
-
-  // Perform any constant folding.
-  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
-  APInt UndefElts;
-  SmallVector<APInt, 64> EltBits;
-  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
-    APInt Undefs(NumElts, 0);
-    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
-    bool IsZEXT =
-        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
-    for (unsigned i = 0; i != NumElts; ++i) {
-      if (UndefElts[i]) {
-        Undefs.setBit(i);
-        continue;
-      }
-      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
-                       : EltBits[i].sextOrTrunc(EltSizeInBits);
-    }
-    return getConstVector(Vals, Undefs, VT, DAG, DL);
-  }
-
-  // (vzext (bitcast (vzext (x)) -> (vzext x)
-  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
-  SDValue V = peekThroughBitcasts(Op);
-  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
-    MVT InnerVT = V.getSimpleValueType();
-    MVT InnerEltVT = InnerVT.getVectorElementType();
-
-    // If the element sizes match exactly, we can just do one larger vzext. This
-    // is always an exact type match as vzext operates on integer types.
-    if (OpEltVT == InnerEltVT) {
-      assert(OpVT == InnerVT && "Types must match for vzext!");
-      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
-    }
-
-    // The only other way we can combine them is if only a single element of the
-    // inner vzext is used in the input to the outer vzext.
-    if (InnerEltVT.getSizeInBits() < InputBits)
-      return SDValue();
-
-    // In this case, the inner vzext is completely dead because we're going to
-    // only look at bits inside of the low element. Just do the outer vzext on
-    // a bitcast of the input to the inner.
-    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
-  }
-
-  // Check if we can bypass extracting and re-inserting an element of an input
-  // vector. Essentially:
-  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
-  // TODO: Add X86ISD::VSEXT support
-  if (Opcode == X86ISD::VZEXT &&
-      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
-    SDValue ExtractedV = V.getOperand(0);
-    SDValue OrigV = ExtractedV.getOperand(0);
-    if (isNullConstant(ExtractedV.getOperand(1))) {
-        MVT OrigVT = OrigV.getSimpleValueType();
-        // Extract a subvector if necessary...
-        if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
-          int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
-          OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
-                                    OrigVT.getVectorNumElements() / Ratio);
-          OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
-                              DAG.getIntPtrConstant(0, DL));
-        }
-        Op = DAG.getBitcast(OpVT, OrigV);
-        return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
-      }
-  }
-
-  return SDValue();
-}
-
 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
   MVT VT = N->getSimpleValueType(0);
@@ -40121,9 +40954,9 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
 
   if (N->getOperand(0) == N->getOperand(1)) {
     if (N->getOpcode() == X86ISD::PCMPEQ)
-      return getOnesVector(VT, DAG, DL);
+      return DAG.getConstant(-1, DL, VT);
     if (N->getOpcode() == X86ISD::PCMPGT)
-      return getZeroVector(VT, Subtarget, DAG, DL);
+      return DAG.getConstant(0, DL, VT);
   }
 
   return SDValue();
@@ -40364,13 +41197,20 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
         return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
       }
     }
-    if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) &&
+    if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) &&
         OpVT.is128BitVector() &&
         InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
-      unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG
-                                                 : ISD::SIGN_EXTEND_VECTOR_INREG;
+      unsigned ExtOp =
+        InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG
+                                     : ISD::SIGN_EXTEND_VECTOR_INREG;
       return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
     }
+    if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+         InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+        OpVT.is128BitVector() &&
+        InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
+      return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0));
+    }
     if (InOpcode == ISD::BITCAST) {
       // TODO - do this for target shuffles in general.
       SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
@@ -40448,6 +41288,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
   case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
+  case X86ISD::CMP:         return combineCMP(N, DAG);
   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
   case X86ISD::SBB:         return combineSBB(N, DAG);
@@ -40479,6 +41320,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
   case ISD::FMINNUM:
   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
+  case X86ISD::CVTSI2P:  
+  case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
   case X86ISD::BT:          return combineBT(N, DAG, DCI);
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
@@ -40489,14 +41332,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
   case X86ISD::PACKSS:
   case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
+  case X86ISD::VSHL:
+  case X86ISD::VSRA:
+  case X86ISD::VSRL:
+    return combineVectorShiftVar(N, DAG, DCI, Subtarget);
   case X86ISD::VSHLI:
   case X86ISD::VSRAI:
   case X86ISD::VSRLI:
     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
-  case ISD::SIGN_EXTEND_VECTOR_INREG:
-  case ISD::ZERO_EXTEND_VECTOR_INREG:
-  case X86ISD::VSEXT:
-  case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
   case X86ISD::PINSRB:
   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
@@ -40558,10 +41401,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
-/// Return true if the target has native support for the specified value type
-/// and it is 'desirable' to use the type for the given node type. e.g. On x86
-/// i16 is legal, but undesirable since i16 instruction encodings are longer and
-/// some i16 instructions are slow.
 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   if (!isTypeLegal(VT))
     return false;
@@ -40570,26 +41409,37 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
     return false;
 
-  if (VT != MVT::i16)
-    return true;
-
-  switch (Opc) {
-  default:
-    return true;
-  case ISD::LOAD:
-  case ISD::SIGN_EXTEND:
-  case ISD::ZERO_EXTEND:
-  case ISD::ANY_EXTEND:
-  case ISD::SHL:
-  case ISD::SRL:
-  case ISD::SUB:
-  case ISD::ADD:
-  case ISD::MUL:
-  case ISD::AND:
-  case ISD::OR:
-  case ISD::XOR:
+  // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
+  // we have specializations to turn 32-bit multiply into LEA or other ops.
+  // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
+  // check for a constant operand to the multiply.
+  if (Opc == ISD::MUL && VT == MVT::i8)
     return false;
+
+  // i16 instruction encodings are longer and some i16 instructions are slow,
+  // so those are not desirable.
+  if (VT == MVT::i16) {
+    switch (Opc) {
+    default:
+      break;
+    case ISD::LOAD:
+    case ISD::SIGN_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::ANY_EXTEND:
+    case ISD::SHL:
+    case ISD::SRL:
+    case ISD::SUB:
+    case ISD::ADD:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+      return false;
+    }
   }
+
+  // Any legal type not explicitly accounted for above here is desirable.
+  return true;
 }
 
 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
@@ -40608,12 +41458,16 @@ SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
 }
 
-/// This method query the target whether it is beneficial for dag combiner to
-/// promote the specified node. If true, it should return the desired promotion
-/// type by reference.
 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   EVT VT = Op.getValueType();
-  if (VT != MVT::i16)
+  bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
+                             isa<ConstantSDNode>(Op.getOperand(1));
+
+  // i16 is legal, but undesirable since i16 instruction encodings are longer
+  // and some i16 instructions are slow.
+  // 8-bit multiply-by-constant can usually be expanded to something cheaper
+  // using LEA and/or other ALU ops.
+  if (VT != MVT::i16 && !Is8BitMulByConstant)
     return false;
 
   auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 7cda0259bf27bdd837cdf1d0314deb816776354c..17fd315a2b4c5f5e50cb9dd083dda5c0068ff14f 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -226,14 +226,6 @@ namespace llvm {
       SCALEF,
       SCALEFS,
 
-      // Integer add/sub with unsigned saturation.
-      ADDUS,
-      SUBUS,
-
-      // Integer add/sub with signed saturation.
-      ADDS,
-      SUBS,
-
       // Unsigned Integer average.
       AVG,
 
@@ -295,11 +287,6 @@ namespace llvm {
       // Vector move to low scalar and zero higher vector elements.
       VZEXT_MOVL,
 
-      // Vector integer zero-extend.
-      VZEXT,
-      // Vector integer signed-extend.
-      VSEXT,
-
       // Vector integer truncate.
       VTRUNC,
       // Vector integer truncate with unsigned/signed saturation.
@@ -876,15 +863,13 @@ namespace llvm {
 
     bool SimplifyDemandedBitsForTargetNode(SDValue Op,
                                            const APInt &DemandedBits,
+                                           const APInt &DemandedElts,
                                            KnownBits &Known,
                                            TargetLoweringOpt &TLO,
                                            unsigned Depth) const override;
 
     SDValue unwrapAddress(SDValue N) const override;
 
-    bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
-                        int64_t &Offset) const override;
-
     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
 
     bool ExpandInlineAsm(CallInst *CI) const override;
@@ -1046,10 +1031,15 @@ namespace llvm {
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override;
+
     bool convertSelectOfConstantsToMath(EVT VT) const override;
 
     bool decomposeMulByConstant(EVT VT, SDValue C) const override;
 
+    bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
+                                  bool IsSigned) const override;
+
     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
     /// with this index.
     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
@@ -1365,11 +1355,6 @@ namespace llvm {
     MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const;
 
-    /// Emit nodes that will be selected as "test Op0,Op0", or something
-    /// equivalent, for use with the given x86 condition code.
-    SDValue EmitTest(SDValue Op0, unsigned X86CC, const SDLoc &dl,
-                     SelectionDAG &DAG) const;
-
     /// Emit nodes that will be selected as "cmp Op0,Op1", or something
     /// equivalent, for use with the given x86 condition code.
     SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl,
diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..30b46a09ef0f90131127c47f366eb09ce8ce4530
--- /dev/null
+++ b/lib/Target/X86/X86InsertPrefetch.cpp
@@ -0,0 +1,253 @@
+//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass applies cache prefetch instructions based on a profile. The pass
+// assumes DiscriminateMemOps ran immediately before, to ensure debug info
+// matches the one used at profile generation time. The profile is encoded in
+// afdo format (text or binary). It contains prefetch hints recommendations.
+// Each recommendation is made in terms of debug info locations, a type (i.e.
+// nta, t{0|1|2}) and a delta. The debug info identifies an instruction with a
+// memory operand (see X86DiscriminateMemOps). The prefetch will be made for
+// a location at that memory operand + the delta specified in the
+// recommendation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
+using namespace llvm;
+using namespace sampleprof;
+
+static cl::opt<std::string>
+    PrefetchHintsFile("prefetch-hints-file",
+                      cl::desc("Path to the prefetch hints profile."),
+                      cl::Hidden);
+namespace {
+
+class X86InsertPrefetch : public MachineFunctionPass {
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool doInitialization(Module &) override;
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  struct PrefetchInfo {
+    unsigned InstructionID;
+    int64_t Delta;
+  };
+  typedef SmallVectorImpl<PrefetchInfo> Prefetches;
+  bool findPrefetchInfo(const FunctionSamples *Samples, const MachineInstr &MI,
+                        Prefetches &prefetches) const;
+
+public:
+  static char ID;
+  X86InsertPrefetch(const std::string &PrefetchHintsFilename);
+  StringRef getPassName() const override {
+    return "X86 Insert Cache Prefetches";
+  }
+
+private:
+  std::string Filename;
+  std::unique_ptr<SampleProfileReader> Reader;
+};
+
+using PrefetchHints = SampleRecord::CallTargetMap;
+
+// Return any prefetching hints for the specified MachineInstruction. The hints
+// are returned as pairs (name, delta).
+ErrorOr<PrefetchHints> getPrefetchHints(const FunctionSamples *TopSamples,
+                                        const MachineInstr &MI) {
+  if (const auto &Loc = MI.getDebugLoc())
+    if (const auto *Samples = TopSamples->findFunctionSamples(Loc))
+      return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc),
+                                          Loc->getBaseDiscriminator());
+  return std::error_code();
+}
+
+// The prefetch instruction can't take memory operands involving vector
+// registers.
+bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) {
+  unsigned BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg();
+  unsigned IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg();
+  return (BaseReg == 0 ||
+          X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) ||
+          X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) &&
+         (IndexReg == 0 ||
+          X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
+          X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg));
+}
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//            Implementation
+//===----------------------------------------------------------------------===//
+
+char X86InsertPrefetch::ID = 0;
+
+X86InsertPrefetch::X86InsertPrefetch(const std::string &PrefetchHintsFilename)
+    : MachineFunctionPass(ID), Filename(PrefetchHintsFilename) {}
+
+/// Return true if the provided MachineInstruction has cache prefetch hints. In
+/// that case, the prefetch hints are stored, in order, in the Prefetches
+/// vector.
+bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples,
+                                         const MachineInstr &MI,
+                                         Prefetches &Prefetches) const {
+  assert(Prefetches.empty() &&
+         "Expected caller passed empty PrefetchInfo vector.");
+  static const std::pair<const StringRef, unsigned> HintTypes[] = {
+      {"_nta_", X86::PREFETCHNTA},
+      {"_t0_", X86::PREFETCHT0},
+      {"_t1_", X86::PREFETCHT1},
+      {"_t2_", X86::PREFETCHT2},
+  };
+  static const char *SerializedPrefetchPrefix = "__prefetch";
+
+  const ErrorOr<PrefetchHints> T = getPrefetchHints(TopSamples, MI);
+  if (!T)
+    return false;
+  int16_t max_index = -1;
+  // Convert serialized prefetch hints into PrefetchInfo objects, and populate
+  // the Prefetches vector.
+  for (const auto &S_V : *T) {
+    StringRef Name = S_V.getKey();
+    if (Name.consume_front(SerializedPrefetchPrefix)) {
+      int64_t D = static_cast<int64_t>(S_V.second);
+      unsigned IID = 0;
+      for (const auto &HintType : HintTypes) {
+        if (Name.startswith(HintType.first)) {
+          Name = Name.drop_front(HintType.first.size());
+          IID = HintType.second;
+          break;
+        }
+      }
+      if (IID == 0)
+        return false;
+      uint8_t index = 0;
+      Name.consumeInteger(10, index);
+
+      if (index >= Prefetches.size())
+        Prefetches.resize(index + 1);
+      Prefetches[index] = {IID, D};
+      max_index = std::max(max_index, static_cast<int16_t>(index));
+    }
+  }
+  assert(max_index + 1 >= 0 &&
+         "Possible overflow: max_index + 1 should be positive.");
+  assert(static_cast<size_t>(max_index + 1) == Prefetches.size() &&
+         "The number of prefetch hints received should match the number of "
+         "PrefetchInfo objects returned");
+  return !Prefetches.empty();
+}
+
+bool X86InsertPrefetch::doInitialization(Module &M) {
+  if (Filename.empty())
+    return false;
+
+  LLVMContext &Ctx = M.getContext();
+  ErrorOr<std::unique_ptr<SampleProfileReader>> ReaderOrErr =
+      SampleProfileReader::create(Filename, Ctx);
+  if (std::error_code EC = ReaderOrErr.getError()) {
+    std::string Msg = "Could not open profile: " + EC.message();
+    Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg,
+                                             DiagnosticSeverity::DS_Warning));
+    return false;
+  }
+  Reader = std::move(ReaderOrErr.get());
+  Reader->read();
+  return true;
+}
+
+void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineModuleInfo>();
+}
+
+bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
+  if (!Reader)
+    return false;
+  const FunctionSamples *Samples = Reader->getSamplesFor(MF.getFunction());
+  if (!Samples)
+    return false;
+
+  bool Changed = false;
+
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  SmallVector<PrefetchInfo, 4> Prefetches;
+  for (auto &MBB : MF) {
+    for (auto MI = MBB.instr_begin(); MI != MBB.instr_end();) {
+      auto Current = MI;
+      ++MI;
+
+      int Offset = X86II::getMemoryOperandNo(Current->getDesc().TSFlags);
+      if (Offset < 0)
+        continue;
+      unsigned Bias = X86II::getOperandBias(Current->getDesc());
+      int MemOpOffset = Offset + Bias;
+      // FIXME(mtrofin): ORE message when the recommendation cannot be taken.
+      if (!IsMemOpCompatibleWithPrefetch(*Current, MemOpOffset))
+        continue;
+      Prefetches.clear();
+      if (!findPrefetchInfo(Samples, *Current, Prefetches))
+        continue;
+      assert(!Prefetches.empty() &&
+             "The Prefetches vector should contain at least a value if "
+             "findPrefetchInfo returned true.");
+      for (auto &PrefInfo : Prefetches) {
+        unsigned PFetchInstrID = PrefInfo.InstructionID;
+        int64_t Delta = PrefInfo.Delta;
+        const MCInstrDesc &Desc = TII->get(PFetchInstrID);
+        MachineInstr *PFetch =
+            MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true);
+        MachineInstrBuilder MIB(MF, PFetch);
+
+        assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 &&
+               X86::AddrIndexReg == 2 && X86::AddrDisp == 3 &&
+               X86::AddrSegmentReg == 4 &&
+               "Unexpected change in X86 operand offset order.");
+
+        // This assumes X86::AddBaseReg = 0, {...}ScaleAmt = 1, etc.
+        // FIXME(mtrofin): consider adding a:
+        //     MachineInstrBuilder::set(unsigned offset, op).
+        MIB.addReg(Current->getOperand(MemOpOffset + X86::AddrBaseReg).getReg())
+            .addImm(
+                Current->getOperand(MemOpOffset + X86::AddrScaleAmt).getImm())
+            .addReg(
+                Current->getOperand(MemOpOffset + X86::AddrIndexReg).getReg())
+            .addImm(Current->getOperand(MemOpOffset + X86::AddrDisp).getImm() +
+                    Delta)
+            .addReg(Current->getOperand(MemOpOffset + X86::AddrSegmentReg)
+                        .getReg());
+
+        if (!Current->memoperands_empty()) {
+          MachineMemOperand *CurrentOp = *(Current->memoperands_begin());
+          MIB.addMemOperand(MF.getMachineMemOperand(
+              CurrentOp, CurrentOp->getOffset() + Delta, CurrentOp->getSize()));
+        }
+
+        // Insert before Current. This is because Current may clobber some of
+        // the registers used to describe the input memory operand.
+        MBB.insert(Current, PFetch);
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+FunctionPass *llvm::createX86InsertPrefetchPass() {
+  return new X86InsertPrefetch(PrefetchHintsFile);
+}
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index f8ade37f8dfce5351e772587071d72d8b7957426..7e60b9caf05b8d83d7c79853e4983b5997649cca 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -4830,13 +4830,13 @@ defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
                                     SchedWriteVecALU, 1>;
 defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
                                     SchedWriteVecALU, 0>;
-defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
+defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", saddsat,
                                     SchedWriteVecALU, HasBWI, 1>;
-defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
+defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", ssubsat,
                                     SchedWriteVecALU, HasBWI, 0>;
-defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
+defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", uaddsat,
                                      SchedWriteVecALU, HasBWI, 1>;
-defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
+defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", usubsat,
                                      SchedWriteVecALU, HasBWI, 0>;
 defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
                                     SchedWritePMULLD, HasAVX512, 1>, T8PD;
@@ -8790,7 +8790,7 @@ let Predicates = [HasVLX] in {
                (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >;
 }
 
-//  Unordered/Ordered scalar fp compare with Sea and set EFLAGS
+//  Unordered/Ordered scalar fp compare with Sae and set EFLAGS
 multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
                             string OpcodeStr, X86FoldableSchedWrite sched> {
   let hasSideEffects = 0 in
@@ -9528,7 +9528,7 @@ multiclass WriteShuffle256_BD<bits<8> opc, string OpcodeStr,
                          EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
 
     defm Z256:  WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info,
-                   v16i8x_info, i64mem, LdFrag, OpNode>,
+                   v16i8x_info, i64mem, LdFrag, InVecNode>,
                          EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
   }
   let Predicates = [HasAVX512] in {
@@ -9547,12 +9547,12 @@ multiclass WriteShuffle256_BQ<bits<8> opc, string OpcodeStr,
                      EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG;
 
     defm Z256:  WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
-                   v16i8x_info, i32mem, LdFrag, OpNode>,
+                   v16i8x_info, i32mem, LdFrag, InVecNode>,
                      EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG;
   }
   let Predicates = [HasAVX512] in {
     defm Z   :  WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
-                   v16i8x_info, i64mem, LdFrag, OpNode>,
+                   v16i8x_info, i64mem, LdFrag, InVecNode>,
                      EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG;
   }
 }
@@ -9585,7 +9585,7 @@ multiclass WriteShuffle256_WQ<bits<8> opc, string OpcodeStr,
                      EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
 
     defm Z256:  WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
-                   v8i16x_info, i64mem, LdFrag, OpNode>,
+                   v8i16x_info, i64mem, LdFrag, InVecNode>,
                      EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
   }
   let Predicates = [HasAVX512] in {
@@ -9615,23 +9615,107 @@ multiclass WriteShuffle256_DQ<bits<8> opc, string OpcodeStr,
   }
 }
 
-defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", zext, zext_invec, "z", WriteShuffle256>;
+
+defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", sext, sext_invec, "s", WriteShuffle256>;
+
+
+// Patterns that we also need any extend versions of. aext_vector_inreg
+// is currently legalized to zext_vector_inreg.
+multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> {
+  // 256-bit patterns
+  let Predicates = [HasVLX, HasBWI] in {
+    def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+              (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+    def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+    def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+  }
+
+  let Predicates = [HasVLX] in {
+    def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+              (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+    def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+    def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+
+    def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+              (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+    def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+    def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+  }
+
+  // 512-bit patterns
+  let Predicates = [HasBWI] in {
+    def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
+              (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
+  }
+  let Predicates = [HasAVX512] in {
+    def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
+              (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
+    def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
+              (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
+
+    def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
+              (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
+
+    def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
+              (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
+  }
+}
+
+multiclass AVX512_pmovx_patterns_aext<string OpcPrefix, SDNode ExtOp> :
+    AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
+  let Predicates = [HasVLX, HasBWI] in {
+    def : Pat<(v16i16 (ExtOp (v16i8 VR128X:$src))),
+              (!cast<I>(OpcPrefix#BWZ256rr) VR128X:$src)>;
+  }
 
-defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", WriteShuffle256>;
+  let Predicates = [HasVLX] in {
+    def : Pat<(v8i32 (ExtOp (v8i16 VR128X:$src))),
+              (!cast<I>(OpcPrefix#WDZ256rr) VR128X:$src)>;
+
+    def : Pat<(v4i64 (ExtOp (v4i32 VR128X:$src))),
+              (!cast<I>(OpcPrefix#DQZ256rr) VR128X:$src)>;
+  }
+
+  // 512-bit patterns
+  let Predicates = [HasBWI] in {
+    def : Pat<(v32i16 (ExtOp (v32i8 VR256X:$src))),
+              (!cast<I>(OpcPrefix#BWZrr) VR256X:$src)>;
+  }
+  let Predicates = [HasAVX512] in {
+    def : Pat<(v16i32 (ExtOp (v16i8 VR128X:$src))),
+              (!cast<I>(OpcPrefix#BDZrr) VR128X:$src)>;
+    def : Pat<(v16i32 (ExtOp (v16i16 VR256X:$src))),
+              (!cast<I>(OpcPrefix#WDZrr) VR256X:$src)>;
+
+    def : Pat<(v8i64 (ExtOp (v8i16 VR128X:$src))),
+              (!cast<I>(OpcPrefix#WQZrr) VR128X:$src)>;
+
+    def : Pat<(v8i64 (ExtOp (v8i32 VR256X:$src))),
+              (!cast<I>(OpcPrefix#DQZrr) VR256X:$src)>;
+  }
+}
 
 
 multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
-                                 SDNode InVecOp> {
+                                 SDNode InVecOp> :
+    AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
   // 128-bit patterns
   let Predicates = [HasVLX, HasBWI] in {
   def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9695,84 +9779,70 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
   def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   }
-  // 256-bit patterns
-  let Predicates = [HasVLX, HasBWI] in {
-  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
-            (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
-  }
   let Predicates = [HasVLX] in {
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
-            (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
-            (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
   }
   // 512-bit patterns
-  let Predicates = [HasBWI] in {
-  def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
-            (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
-  }
   let Predicates = [HasAVX512] in {
-  def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
-            (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
-
-  def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
-  def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+  }
+}
 
-  def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
-            (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
-
-  def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
-            (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
+defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>;
+defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>;
 
-  def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
-            (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
-  }
+// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
+// ext+trunc aggresively making it impossible to legalize the DAG to this
+// pattern directly.
+let Predicates = [HasAVX512, NoBWI] in {
+def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
+         (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
+def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))),
+         (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
+def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst),
+         (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
 }
 
-defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec>;
-defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec>;
+// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
+// ext+trunc aggresively making it impossible to legalize the DAG to this
+// pattern directly.
+let Predicates = [HasAVX512, NoBWI] in {
+def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
+         (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
+def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))),
+         (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
+def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst),
+         (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
+}
 
 //===----------------------------------------------------------------------===//
 // GATHER - SCATTER Operations
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 94e460ff36283a2a20f2f2f87a80c7e42cf98d3d..3288fe22f65ac003fa501d6d9712a7f9d3ae90aa 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -913,8 +913,8 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
   let Defs = [EFLAGS] in {
     let Constraints = "$src1 = $dst" in {
       let isCommutable = CommutableRR in {
-        def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
         let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+          def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
           def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
           def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
           def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
@@ -931,9 +931,9 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
       def NAME#32rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
       def NAME#64rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
 
-      def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
-
       let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+
         // NOTE: These are order specific, we want the ri8 forms to be listed
         // first so that they are slightly preferred to the ri forms.
         def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 71b43a38dc2cd994070cbad16728338cd3995ee6..529d054d081a82c9bacd2cbd335126fdb0f56a12 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -143,7 +143,7 @@ def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
 // These instructions XOR the frame pointer into a GPR. They are used in some
 // stack protection schemes. These are post-RA pseudos because we only know the
 // frame register after register allocation.
-let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in {
+let Constraints = "$src = $dst", isMoveImm = 1, isPseudo = 1, Defs = [EFLAGS] in {
   def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
                   "xorl\t$$FP, $src", []>,
                   Requires<[NotLP64]>, Sched<[WriteALU]>;
@@ -270,7 +270,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
 // Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isPseudo = 1, AddedComplexity = 10 in
+    isPseudo = 1, isMoveImm = 1, AddedComplexity = 10 in
 def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                  [(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
 
@@ -362,29 +362,20 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
           (SETBr)>;
 
-// (add OP, SETB) -> (adc OP, 0)
-def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
-          (ADC8ri GR8:$op, 0)>;
-def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
-          (ADC32ri8 GR32:$op, 0)>;
-def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
-          (ADC64ri8 GR64:$op, 0)>;
-
-// (sub OP, SETB) -> (sbb OP, 0)
-def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
-          (SBB8ri GR8:$op, 0)>;
-def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
-          (SBB32ri8 GR32:$op, 0)>;
-def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
-          (SBB64ri8 GR64:$op, 0)>;
-
-// (sub OP, SETCC_CARRY) -> (adc OP, 0)
-def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
-          (ADC8ri GR8:$op, 0)>;
-def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
-          (ADC32ri8 GR32:$op, 0)>;
-def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
-          (ADC64ri8 GR64:$op, 0)>;
+// Patterns to give priority when both inputs are zero so that we don't use
+// an immediate for the RHS.
+// TODO: Should we use a 32-bit sbb for 8/16 to push the extract_subreg out?
+def : Pat<(X86sbb_flag (i8 0), (i8 0), EFLAGS),
+          (SBB8rr (EXTRACT_SUBREG (MOV32r0), sub_8bit),
+                  (EXTRACT_SUBREG (MOV32r0), sub_8bit))>;
+def : Pat<(X86sbb_flag (i16 0), (i16 0), EFLAGS),
+          (SBB16rr (EXTRACT_SUBREG (MOV32r0), sub_16bit),
+                   (EXTRACT_SUBREG (MOV32r0), sub_16bit))>;
+def : Pat<(X86sbb_flag (i32 0), (i32 0), EFLAGS),
+          (SBB32rr (MOV32r0), (MOV32r0))>;
+def : Pat<(X86sbb_flag (i64 0), (i64 0), EFLAGS),
+          (SBB64rr (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit),
+                   (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit))>;
 
 //===----------------------------------------------------------------------===//
 // String Pseudo Instructions
@@ -1320,6 +1311,15 @@ def : Pat<(i64 (anyext GR16:$src)),
 def : Pat<(i64 (anyext GR32:$src)),
           (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, sub_32bit)>;
 
+// If this is an anyext of the remainder of an 8-bit sdivrem, use a MOVSX
+// instead of a MOVZX. The sdivrem lowering will emit emit a MOVSX to move
+// %ah to the lower byte of a register. By using a MOVSX here we allow a
+// post-isel peephole to merge the two MOVSX instructions into one.
+def anyext_sdiv : PatFrag<(ops node:$lhs), (anyext node:$lhs),[{
+  return (N->getOperand(0).getOpcode() == ISD::SDIVREM &&
+          N->getOperand(0).getResNo() == 1);
+}]>;
+def : Pat<(i32 (anyext_sdiv GR8:$src)), (MOVSX32rr8 GR8:$src)>;
 
 // Any instruction that defines a 32-bit result leaves the high half of the
 // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 7bc8d0aa5309ad385a4e199ba76d12c5a8cd2c48..0b98abaa7a267b4592765fdbd9f9e8099594cfeb 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -103,16 +103,6 @@ def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 
-def X86vzext   : SDNode<"X86ISD::VZEXT",
-                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
-                                              SDTCisInt<0>, SDTCisInt<1>,
-                                              SDTCisOpSmallerThanOp<1, 0>]>>;
-
-def X86vsext   : SDNode<"X86ISD::VSEXT",
-                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
-                                              SDTCisInt<0>, SDTCisInt<1>,
-                                              SDTCisOpSmallerThanOp<1, 0>]>>;
-
 def SDTVtrunc    : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                        SDTCisInt<0>, SDTCisInt<1>,
                                        SDTCisOpSmallerThanOp<0, 1>]>;
@@ -237,10 +227,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
                                           SDTCisVec<1>,
                                           SDTCisSameAs<2, 1>]>;
 
-def X86addus   : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>;
-def X86subus   : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
-def X86adds    : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>;
-def X86subs    : SDNode<"X86ISD::SUBS", SDTIntBinOp>;
 def X86mulhrs  : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
 def X86avg     : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>;
 def X86ptest   : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index fe26389050c2e4a2b3399dae11ca8fe9983e3641..40de049bfe8bab671a64ff346ff0f8e358ab7c0d 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -718,7 +718,7 @@ bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
 }
 
 /// Check whether the shift count for a machine operand is non-zero.
-inline static unsigned getTruncatedShiftCount(MachineInstr &MI,
+inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
                                               unsigned ShiftAmtOperandIdx) {
   // The shift count is six bits with the REX.W prefix and five bits without.
   unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
@@ -739,8 +739,7 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
 
 bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
                                   unsigned Opc, bool AllowSP, unsigned &NewSrc,
-                                  bool &isKill, bool &isUndef,
-                                  MachineOperand &ImplicitOp,
+                                  bool &isKill, MachineOperand &ImplicitOp,
                                   LiveVariables *LV) const {
   MachineFunction &MF = *MI.getParent()->getParent();
   const TargetRegisterClass *RC;
@@ -757,7 +756,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
   if (Opc != X86::LEA64_32r) {
     NewSrc = SrcReg;
     isKill = Src.isKill();
-    isUndef = Src.isUndef();
+    assert(!Src.isUndef() && "Undef op doesn't need optimization");
 
     if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
         !MF.getRegInfo().constrainRegClass(NewSrc, RC))
@@ -774,7 +773,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
 
     NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
     isKill = Src.isKill();
-    isUndef = Src.isUndef();
+    assert(!Src.isUndef() && "Undef op doesn't need optimization");
   } else {
     // Virtual register of the wrong class, we have to create a temporary 64-bit
     // vreg to feed into the LEA.
@@ -786,7 +785,6 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
 
     // Which is obviously going to be dead after we're done with it.
     isKill = true;
-    isUndef = false;
 
     if (LV)
       LV->replaceKillInstruction(SrcReg, MI, *Copy);
@@ -796,88 +794,99 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
   return true;
 }
 
-/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit
-/// LEA to form 3-address code by promoting to a 32-bit superregister and then
-/// truncating back down to a 16-bit subregister.
 MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
     unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
     LiveVariables *LV) const {
-  MachineBasicBlock::iterator MBBI = MI.getIterator();
-  unsigned Dest = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
-  bool isDead = MI.getOperand(0).isDead();
-  bool isKill = MI.getOperand(1).isKill();
-
+  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
+  bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri);
   MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
-  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
-  unsigned Opc, leaInReg;
-  if (Subtarget.is64Bit()) {
-    Opc = X86::LEA64_32r;
-    leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
-  } else {
-    Opc = X86::LEA32r;
-    leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
-  }
+  assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
+              *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
+         "Unexpected type for LEA transform");
+
+  // TODO: For a 32-bit target, we need to adjust the LEA variables with
+  // something like this:
+  //   Opcode = X86::LEA32r;
+  //   InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  //   OutRegLEA =
+  //       Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
+  //                : RegInfo.createVirtualRegister(&X86::GR32RegClass);
+  if (!Subtarget.is64Bit())
+    return nullptr;
+
+  unsigned Opcode = X86::LEA64_32r;
+  unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+  unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
 
   // Build and insert into an implicit UNDEF value. This is OK because
-  // well be shifting and then extracting the lower 16-bits.
+  // we will be shifting and then extracting the lower 8/16-bits.
   // This has the potential to cause partial register stall. e.g.
   //   movw    (%rbp,%rcx,2), %dx
   //   leal    -65(%rdx), %esi
   // But testing has shown this *does* help performance in 64-bit mode (at
   // least on modern x86 machines).
-  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  unsigned Dest = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+  bool IsDead = MI.getOperand(0).isDead();
+  bool IsKill = MI.getOperand(1).isKill();
+  unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit;
+  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
+  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
   MachineInstr *InsMI =
       BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
-          .addReg(leaInReg, RegState::Define, X86::sub_16bit)
-          .addReg(Src, getKillRegState(isKill));
+          .addReg(InRegLEA, RegState::Define, SubReg)
+          .addReg(Src, getKillRegState(IsKill));
 
   MachineInstrBuilder MIB =
-      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opc), leaOutReg);
+      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
   switch (MIOpc) {
   default: llvm_unreachable("Unreachable!");
   case X86::SHL16ri: {
     unsigned ShAmt = MI.getOperand(2).getImm();
     MIB.addReg(0).addImm(1ULL << ShAmt)
-       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+       .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
     break;
   }
   case X86::INC16r:
-    addRegOffset(MIB, leaInReg, true, 1);
+    addRegOffset(MIB, InRegLEA, true, 1);
     break;
   case X86::DEC16r:
-    addRegOffset(MIB, leaInReg, true, -1);
+    addRegOffset(MIB, InRegLEA, true, -1);
     break;
+  case X86::ADD8ri:
   case X86::ADD16ri:
   case X86::ADD16ri8:
   case X86::ADD16ri_DB:
   case X86::ADD16ri8_DB:
-    addRegOffset(MIB, leaInReg, true, MI.getOperand(2).getImm());
+    addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
     break;
+  case X86::ADD8rr:
   case X86::ADD16rr:
   case X86::ADD16rr_DB: {
     unsigned Src2 = MI.getOperand(2).getReg();
-    bool isKill2 = MI.getOperand(2).isKill();
-    unsigned leaInReg2 = 0;
+    bool IsKill2 = MI.getOperand(2).isKill();
+    assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
+    unsigned InRegLEA2 = 0;
     MachineInstr *InsMI2 = nullptr;
     if (Src == Src2) {
-      // ADD16rr killed %reg1028, %reg1028
+      // ADD8rr/ADD16rr killed %reg1028, %reg1028
       // just a single insert_subreg.
-      addRegReg(MIB, leaInReg, true, leaInReg, false);
+      addRegReg(MIB, InRegLEA, true, InRegLEA, false);
     } else {
       if (Subtarget.is64Bit())
-        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
       else
-        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
       // Build and insert into an implicit UNDEF value. This is OK because
-      // well be shifting and then extracting the lower 16-bits.
-      BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
+      // we will be shifting and then extracting the lower 8/16-bits.
+      BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
       InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
-                   .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
-                   .addReg(Src2, getKillRegState(isKill2));
-      addRegReg(MIB, leaInReg, true, leaInReg2, true);
+                   .addReg(InRegLEA2, RegState::Define, SubReg)
+                   .addReg(Src2, getKillRegState(IsKill2));
+      addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
     }
-    if (LV && isKill2 && InsMI2)
+    if (LV && IsKill2 && InsMI2)
       LV->replaceKillInstruction(Src2, MI, *InsMI2);
     break;
   }
@@ -886,16 +895,16 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
   MachineInstr *NewMI = MIB;
   MachineInstr *ExtMI =
       BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
-          .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-          .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+          .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
+          .addReg(OutRegLEA, RegState::Kill, SubReg);
 
   if (LV) {
-    // Update live variables
-    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
-    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
-    if (isKill)
+    // Update live variables.
+    LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
+    LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
+    if (IsKill)
       LV->replaceKillInstruction(Src, MI, *InsMI);
-    if (isDead)
+    if (IsDead)
       LV->replaceKillInstruction(Dest, MI, *ExtMI);
   }
 
@@ -926,12 +935,18 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   const MachineOperand &Dest = MI.getOperand(0);
   const MachineOperand &Src = MI.getOperand(1);
 
+  // Ideally, operations with undef should be folded before we get here, but we
+  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
+  // Without this, we have to forward undef state to new register operands to
+  // avoid machine verifier errors.
+  if (Src.isUndef())
+    return nullptr;
+  if (MI.getNumOperands() > 2)
+    if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
+      return nullptr;
+
   MachineInstr *NewMI = nullptr;
-  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
-  // we have better subtarget support, enable the 16-bit LEA generation here.
-  // 16-bit LEA is also slow on Core2.
-  bool DisableLEA16 = true;
-  bool is64Bit = Subtarget.is64Bit();
+  bool Is64Bit = Subtarget.is64Bit();
 
   unsigned MIOpc = MI.getOpcode();
   switch (MIOpc) {
@@ -961,14 +976,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
     if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
 
-    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
     // LEA can't handle ESP.
-    bool isKill, isUndef;
+    bool isKill;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
-                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+                        SrcReg, isKill, ImplicitOp, LV))
       return nullptr;
 
     MachineInstrBuilder MIB =
@@ -976,7 +991,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
             .add(Dest)
             .addReg(0)
             .addImm(1ULL << ShAmt)
-            .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+            .addReg(SrcReg, getKillRegState(isKill))
             .addImm(0)
             .addReg(0);
     if (ImplicitOp.getReg() != 0)
@@ -988,37 +1003,26 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   case X86::SHL16ri: {
     assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
-
-    if (DisableLEA16)
-      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
-                     : nullptr;
-    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
-                .add(Dest)
-                .addReg(0)
-                .addImm(1ULL << ShAmt)
-                .add(Src)
-                .addImm(0)
-                .addReg(0);
-    break;
+    if (!isTruncatedShiftCountForLEA(ShAmt))
+      return nullptr;
+    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
   }
   case X86::INC64r:
   case X86::INC32r: {
     assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
-    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
-      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-    bool isKill, isUndef;
+    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
+        (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+    bool isKill;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
-                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
+                        ImplicitOp, LV))
       return nullptr;
 
     MachineInstrBuilder MIB =
         BuildMI(MF, MI.getDebugLoc(), get(Opc))
             .add(Dest)
-            .addReg(SrcReg,
-                    getKillRegState(isKill) | getUndefRegState(isUndef));
+            .addReg(SrcReg, getKillRegState(isKill));
     if (ImplicitOp.getReg() != 0)
       MIB.add(ImplicitOp);
 
@@ -1026,30 +1030,23 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     break;
   }
   case X86::INC16r:
-    if (DisableLEA16)
-      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
-                     : nullptr;
-    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
-    NewMI = addOffset(
-        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), 1);
-    break;
+    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
   case X86::DEC64r:
   case X86::DEC32r: {
     assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
     unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
-      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+        : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
 
-    bool isKill, isUndef;
+    bool isKill;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
-                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
+                        ImplicitOp, LV))
       return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                   .add(Dest)
-                                  .addReg(SrcReg, getUndefRegState(isUndef) |
-                                                      getKillRegState(isKill));
+                                  .addReg(SrcReg, getKillRegState(isKill));
     if (ImplicitOp.getReg() != 0)
       MIB.add(ImplicitOp);
 
@@ -1058,13 +1055,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     break;
   }
   case X86::DEC16r:
-    if (DisableLEA16)
-      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
-                     : nullptr;
-    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
-    NewMI = addOffset(
-        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), -1);
-    break;
+    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
   case X86::ADD64rr:
   case X86::ADD64rr_DB:
   case X86::ADD32rr:
@@ -1074,21 +1065,21 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
       Opc = X86::LEA64r;
     else
-      Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+      Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
-    bool isKill, isUndef;
+    bool isKill;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
-                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+                        SrcReg, isKill, ImplicitOp, LV))
       return nullptr;
 
     const MachineOperand &Src2 = MI.getOperand(2);
-    bool isKill2, isUndef2;
+    bool isKill2;
     unsigned SrcReg2;
     MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
-                        SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
+                        SrcReg2, isKill2, ImplicitOp2, LV))
       return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
@@ -1098,36 +1089,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       MIB.add(ImplicitOp2);
 
     NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
-
-    // Preserve undefness of the operands.
-    NewMI->getOperand(1).setIsUndef(isUndef);
-    NewMI->getOperand(3).setIsUndef(isUndef2);
-
     if (LV && Src2.isKill())
       LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
     break;
   }
+  case X86::ADD8rr:
   case X86::ADD16rr:
-  case X86::ADD16rr_DB: {
-    if (DisableLEA16)
-      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
-                     : nullptr;
-    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
-    unsigned Src2 = MI.getOperand(2).getReg();
-    bool isKill2 = MI.getOperand(2).isKill();
-    NewMI = addRegReg(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest),
-                      Src.getReg(), Src.isKill(), Src2, isKill2);
-
-    // Preserve undefness of the operands.
-    bool isUndef = MI.getOperand(1).isUndef();
-    bool isUndef2 = MI.getOperand(2).isUndef();
-    NewMI->getOperand(1).setIsUndef(isUndef);
-    NewMI->getOperand(3).setIsUndef(isUndef2);
-
-    if (LV && isKill2)
-      LV->replaceKillInstruction(Src2, MI, *NewMI);
-    break;
-  }
+  case X86::ADD16rr_DB:
+    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
   case X86::ADD64ri32:
   case X86::ADD64ri8:
   case X86::ADD64ri32_DB:
@@ -1142,38 +1111,30 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   case X86::ADD32ri_DB:
   case X86::ADD32ri8_DB: {
     assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
-    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
-    bool isKill, isUndef;
+    bool isKill;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
-                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+                        SrcReg, isKill, ImplicitOp, LV))
       return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                   .add(Dest)
-                                  .addReg(SrcReg, getUndefRegState(isUndef) |
-                                                      getKillRegState(isKill));
+                                  .addReg(SrcReg, getKillRegState(isKill));
     if (ImplicitOp.getReg() != 0)
       MIB.add(ImplicitOp);
 
     NewMI = addOffset(MIB, MI.getOperand(2));
     break;
   }
+  case X86::ADD8ri:
   case X86::ADD16ri:
   case X86::ADD16ri8:
   case X86::ADD16ri_DB:
   case X86::ADD16ri8_DB:
-    if (DisableLEA16)
-      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
-                     : nullptr;
-    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
-    NewMI = addOffset(
-        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src),
-        MI.getOperand(2));
-    break;
-
+    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
   case X86::VMOVDQU8Z128rmk:
   case X86::VMOVDQU8Z256rmk:
   case X86::VMOVDQU8Zrmk:
@@ -3257,9 +3218,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
   }
 }
 
-bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
-                                         int64_t &Offset,
-                                         const TargetRegisterInfo *TRI) const {
+bool X86InstrInfo::getMemOperandWithOffset(
+    MachineInstr &MemOp, MachineOperand *&BaseOp, int64_t &Offset,
+    const TargetRegisterInfo *TRI) const {
   const MCInstrDesc &Desc = MemOp.getDesc();
   int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
   if (MemRefBegin < 0)
@@ -3267,11 +3228,10 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
 
   MemRefBegin += X86II::getOperandBias(Desc);
 
-  MachineOperand &BaseMO = MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
-  if (!BaseMO.isReg()) // Can be an MO_FrameIndex
+  BaseOp = &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
+  if (!BaseOp->isReg()) // Can be an MO_FrameIndex
     return false;
 
-  BaseReg = BaseMO.getReg();
   if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
     return false;
 
@@ -3287,6 +3247,8 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
 
   Offset = DispMO.getImm();
 
+  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+                            "operands of type register.");
   return true;
 }
 
@@ -3459,9 +3421,10 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
 /// This function can be extended later on.
 /// SrcReg, SrcRegs: register operands for FlagI.
 /// ImmValue: immediate for FlagI if it takes an immediate.
-inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
-                                        unsigned SrcReg2, int ImmMask,
-                                        int ImmValue, MachineInstr &OI) {
+inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
+                                        unsigned SrcReg, unsigned SrcReg2,
+                                        int ImmMask, int ImmValue,
+                                        const MachineInstr &OI) {
   if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
        (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
        (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
@@ -3492,7 +3455,7 @@ inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
 
 /// Check whether the definition can be converted
 /// to remove a comparison against zero.
-inline static bool isDefConvertible(MachineInstr &MI) {
+inline static bool isDefConvertible(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default: return false;
 
@@ -3592,12 +3555,16 @@ inline static bool isDefConvertible(MachineInstr &MI) {
   case X86::BLSFILL64rr: case X86::BLSFILL64rm:
   case X86::BLSIC32rr:   case X86::BLSIC32rm:
   case X86::BLSIC64rr:   case X86::BLSIC64rm:
+  case X86::T1MSKC32rr:  case X86::T1MSKC32rm:
+  case X86::T1MSKC64rr:  case X86::T1MSKC64rm:
+  case X86::TZMSK32rr:   case X86::TZMSK32rm:
+  case X86::TZMSK64rr:   case X86::TZMSK64rm:
     return true;
   }
 }
 
 /// Check whether the use can be converted to remove a comparison against zero.
-static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
+static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default: return X86::COND_INVALID;
   case X86::LZCNT16rr: case X86::LZCNT16rm:
@@ -3612,12 +3579,12 @@ static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
   case X86::TZCNT32rr: case X86::TZCNT32rm:
   case X86::TZCNT64rr: case X86::TZCNT64rm:
     return X86::COND_B;
-  case X86::BSF16rr:
-  case X86::BSF16rm:
-  case X86::BSF32rr:
-  case X86::BSF32rm:
-  case X86::BSF64rr:
-  case X86::BSF64rm:
+  case X86::BSF16rr: case X86::BSF16rm:
+  case X86::BSF32rr: case X86::BSF32rm:
+  case X86::BSF64rr: case X86::BSF64rm:
+  case X86::BSR16rr: case X86::BSR16rm:
+  case X86::BSR32rr: case X86::BSR32rm:
+  case X86::BSR64rr: case X86::BSR64rm:
     return X86::COND_E;
   }
 }
@@ -7830,5 +7797,5 @@ X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
   return It;
 }
 
-#define GET_TII_HELPERS
+#define GET_INSTRINFO_HELPERS
 #include "X86GenInstrInfo.inc"
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index f3965db4fe7c5c114b110650d0a4fbdd95f49c46..159cb50afc5c017fcb10ffb52f85f0528d88a1a1 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -258,7 +258,7 @@ public:
   /// operand to the LEA instruction.
   bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
                       unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
-                      bool &isKill, bool &isUndef, MachineOperand &ImplicitOp,
+                      bool &isKill, MachineOperand &ImplicitOp,
                       LiveVariables *LV) const;
 
   /// convertToThreeAddress - This method must be implemented by targets that
@@ -327,9 +327,9 @@ public:
                      SmallVectorImpl<MachineOperand> &Cond,
                      bool AllowModify) const override;
 
-  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
-                             int64_t &Offset,
-                             const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+                               int64_t &Offset,
+                               const TargetRegisterInfo *TRI) const override;
   bool analyzeBranchPredicate(MachineBasicBlock &MBB,
                               TargetInstrInfo::MachineBranchPredicate &MBP,
                               bool AllowModify = false) const override;
@@ -558,7 +558,7 @@ public:
                      MachineBasicBlock::iterator &It, MachineFunction &MF,
                      const outliner::Candidate &C) const override;
 
-#define GET_TII_HELPER_DECLS
+#define GET_INSTRINFO_HELPER_DECLS
 #include "X86GenInstrInfo.inc"
 
 protected:
@@ -584,6 +584,9 @@ protected:
                        const MachineOperand *&Destination) const override;
 
 private:
+  /// This is a helper for convertToThreeAddress for 8 and 16-bit instructions.
+  /// We use 32-bit LEA to form 3-address code by promoting to a 32-bit
+  /// super-register and then truncating back down to a 8/16-bit sub-register.
   MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
                                              MachineFunction::iterator &MFI,
                                              MachineInstr &MI,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 992e9543b33dac2ccea6bf90d6b48afb7f98726f..fe9b530e8ea1a33e17dc054c5ec8bc6a3c0d74ec 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -17,10 +17,6 @@
 // X86 specific DAG Nodes.
 //
 
-def SDTIntShiftDOp: SDTypeProfile<1, 3,
-                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
-                                   SDTCisInt<0>, SDTCisInt<3>]>;
-
 def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
 
 def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
@@ -1493,7 +1489,7 @@ def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
                  "mov{q}\t{$src, $dst|$dst, $src}", []>;
 }
 
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
 def MOV8ri  : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
                    "mov{b}\t{$src, $dst|$dst, $src}",
                    [(set GR8:$dst, imm:$src)]>;
@@ -1507,7 +1503,7 @@ def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
                        "mov{q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, i64immSExt32:$src)]>;
 }
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isMoveImm = 1 in {
 def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
                     "movabs{q}\t{$src, $dst|$dst, $src}",
                     [(set GR64:$dst, relocImm:$src)]>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 85e4fd3856339c3eb9e89d083353872e2fa96478..94cd5a611f2ca78cb3c8da3c2c682bd1365d714e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3623,13 +3623,13 @@ defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                              SchedWriteVecALU, 1, NoVLX>;
 defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                              SchedWriteVecALU, 1, NoVLX>;
-defm PADDSB  : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
+defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PADDSW  : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
+defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
+defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
+defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
 defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                              SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
@@ -3645,13 +3645,13 @@ defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                              SchedWriteVecALU, 0, NoVLX>;
 defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                              SchedWriteVecALU, 0, NoVLX>;
-defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
+defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
-defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
+defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
-defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
-defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
                              SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
 defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                              SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
@@ -5198,34 +5198,72 @@ defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
 
 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
 
-// AVX2 Patterns
-multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
+// Patterns that we also need for any_extend.
+// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg.
+multiclass SS41I_pmovx_avx2_patterns_base<string OpcPrefix, SDNode ExtOp> {
   // Register-Register patterns
-  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
-            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+    def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+              (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
   }
-  let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
+
+  let Predicates = [HasAVX2, NoVLX] in {
+    def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+              (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
+
+    def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+              (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
+  }
+
+  // AVX2 Register-Memory patterns
+  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+    def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+    def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+    def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  }
+
+  let Predicates = [HasAVX2, NoVLX] in {
+    def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+    def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+    def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+
+    def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+    def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+    def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+  }
+}
+
+// AVX2 Patterns
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
+                                     SDNode ExtOp, SDNode InVecOp> :
+    SS41I_pmovx_avx2_patterns_base<OpcPrefix, ExtOp> {
+
+  // Register-Register patterns
+  let Predicates = [HasAVX2, NoVLX] in {
+  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
             (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
+  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
             (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
-            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
+  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
             (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
-            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
   }
 
   // Simple Register-Memory patterns
-  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
   }
-  let Predicates = [HasAVX, NoVLX] in {
+  let Predicates = [HasAVX2, NoVLX] in {
   def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
   def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
@@ -5241,60 +5279,39 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
   }
 
   // AVX2 Register-Memory patterns
-  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
-            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
-  }
-  let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  let Predicates = [HasAVX2, NoVLX] in {
+  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
-            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
-            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
   }
 }
 
-defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
-defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
+defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
+defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
+defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>;
 
 // SSE4.1/AVX patterns.
 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index e592330ce1688f631a0c1aef21a07fe11c005fe4..c20336387b2db2ac990b4ddc4020b07b9fd697c6 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -373,6 +373,7 @@ bool X86InstructionSelector::select(MachineInstr &I,
   case TargetOpcode::G_UNMERGE_VALUES:
     return selectUnmergeValues(I, MRI, MF, CoverageInfo);
   case TargetOpcode::G_MERGE_VALUES:
+  case TargetOpcode::G_CONCAT_VECTORS:
     return selectMergeValues(I, MRI, MF, CoverageInfo);
   case TargetOpcode::G_EXTRACT:
     return selectExtract(I, MRI, MF);
@@ -1349,7 +1350,8 @@ bool X86InstructionSelector::selectUnmergeValues(
 bool X86InstructionSelector::selectMergeValues(
     MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF,
     CodeGenCoverage &CoverageInfo) const {
-  assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES) &&
+  assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES ||
+          I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS) &&
          "unexpected instruction");
 
   // Split to inserts.
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 252d64808f0d8a8c30cc7ffb4a645662252c270a..fdcd31c3f9dd16187b38df1200468ee517113da8 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -286,10 +286,8 @@ static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
  * the alphabetical order.
  */
 static const IntrinsicData  IntrinsicsWithoutChain[] = {
-  X86_INTRINSIC_DATA(addcarry_u32,      ADX, X86ISD::ADC, 0),
-  X86_INTRINSIC_DATA(addcarry_u64,      ADX, X86ISD::ADC, 0),
-  X86_INTRINSIC_DATA(addcarryx_u32,     ADX, X86ISD::ADC, 0),
-  X86_INTRINSIC_DATA(addcarryx_u64,     ADX, X86ISD::ADC, 0),
+  X86_INTRINSIC_DATA(addcarry_32,       ADX, X86ISD::ADC, X86ISD::ADD),
+  X86_INTRINSIC_DATA(addcarry_64,       ADX, X86ISD::ADC, X86ISD::ADD),
   X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
   X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
   X86_INTRINSIC_DATA(avx_cmp_pd_256,    INTR_TYPE_3OP, X86ISD::CMPP, 0),
@@ -321,8 +319,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, ISD::SADDSAT, 0),
+  X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -363,8 +361,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
+  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
@@ -922,8 +920,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_padds_b_512, INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(avx512_padds_w_512, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_padds_b_512, INTR_TYPE_2OP, ISD::SADDSAT, 0),
+  X86_INTRINSIC_DATA(avx512_padds_w_512, INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -1006,8 +1004,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx512_psubs_b_512, INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx512_psubs_w_512, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_psubs_b_512, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
+  X86_INTRINSIC_DATA(avx512_psubs_w_512, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
@@ -1170,8 +1168,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(sse2_padds_b,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(sse2_padds_w,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(sse2_padds_b,      INTR_TYPE_2OP, ISD::SADDSAT, 0),
+  X86_INTRINSIC_DATA(sse2_padds_w,      INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd,     INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w,      INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1193,8 +1191,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_psrli_d,      VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_q,      VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_w,      VSHIFT, X86ISD::VSRLI, 0),
-  X86_INTRINSIC_DATA(sse2_psubs_b,      INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(sse2_psubs_w,      INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(sse2_psubs_b,      INTR_TYPE_2OP, ISD::SSUBSAT, 0),
+  X86_INTRINSIC_DATA(sse2_psubs_w,      INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd,   COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd,   COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_ucomigt_sd,   COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1223,8 +1221,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
   X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
   X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
-  X86_INTRINSIC_DATA(subborrow_u32,     ADX, X86ISD::SBB, 0),
-  X86_INTRINSIC_DATA(subborrow_u64,     ADX, X86ISD::SBB, 0),
+  X86_INTRINSIC_DATA(subborrow_32,      ADX, X86ISD::SBB, X86ISD::SUB),
+  X86_INTRINSIC_DATA(subborrow_64,      ADX, X86ISD::SBB, X86ISD::SUB),
   X86_INTRINSIC_DATA(tbm_bextri_u32,    INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(tbm_bextri_u64,    INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(vcvtph2ps_128,     INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 4f59e0f79a72cc76042f107e3c476371576df0ae..4a49fa68dd0606721b1b8eb0f7507a42e84d3b10 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -271,7 +271,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE1() {
 
   // Merge/Unmerge
   for (const auto &Ty : {v4s32, v2s64}) {
-    setAction({G_MERGE_VALUES, Ty}, Legal);
+    setAction({G_CONCAT_VECTORS, Ty}, Legal);
     setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
   }
   setAction({G_MERGE_VALUES, 1, s64}, Legal);
@@ -316,11 +316,11 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
   // Merge/Unmerge
   for (const auto &Ty :
        {v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) {
-    setAction({G_MERGE_VALUES, Ty}, Legal);
+    setAction({G_CONCAT_VECTORS, Ty}, Legal);
     setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
   }
   for (const auto &Ty : {v16s8, v8s16, v4s32, v2s64}) {
-    setAction({G_MERGE_VALUES, 1, Ty}, Legal);
+    setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
     setAction({G_UNMERGE_VALUES, Ty}, Legal);
   }
 }
@@ -367,12 +367,12 @@ void X86LegalizerInfo::setLegalizerInfoAVX() {
   // Merge/Unmerge
   for (const auto &Ty :
        {v32s8, v64s8, v16s16, v32s16, v8s32, v16s32, v4s64, v8s64}) {
-    setAction({G_MERGE_VALUES, Ty}, Legal);
+    setAction({G_CONCAT_VECTORS, Ty}, Legal);
     setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
   }
   for (const auto &Ty :
        {v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) {
-    setAction({G_MERGE_VALUES, 1, Ty}, Legal);
+    setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
     setAction({G_UNMERGE_VALUES, Ty}, Legal);
   }
 }
@@ -400,11 +400,11 @@ void X86LegalizerInfo::setLegalizerInfoAVX2() {
 
   // Merge/Unmerge
   for (const auto &Ty : {v64s8, v32s16, v16s32, v8s64}) {
-    setAction({G_MERGE_VALUES, Ty}, Legal);
+    setAction({G_CONCAT_VECTORS, Ty}, Legal);
     setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
   }
   for (const auto &Ty : {v32s8, v16s16, v8s32, v4s64}) {
-    setAction({G_MERGE_VALUES, 1, Ty}, Legal);
+    setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
     setAction({G_UNMERGE_VALUES, Ty}, Legal);
   }
 }
diff --git a/lib/Target/X86/X86PfmCounters.td b/lib/Target/X86/X86PfmCounters.td
index c57798e621e53a56b1467b938c61fb61b22e7ee7..a1a4210b5ebf8e4ef226218f19b58e4568ab8f31 100644
--- a/lib/Target/X86/X86PfmCounters.td
+++ b/lib/Target/X86/X86PfmCounters.td
@@ -14,6 +14,56 @@
 def UnhaltedCoreCyclesPfmCounter : PfmCounter<"unhalted_core_cycles">;
 def UopsIssuedPfmCounter : PfmCounter<"uops_issued:any">;
 
+// No default counters on X86.
+def DefaultPfmCounters : ProcPfmCounters {}
+def : PfmCountersDefaultBinding<DefaultPfmCounters>;
+
+// Intel X86 Counters.
+def PentiumPfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+  let UopsCounter = PfmCounter<"uops_retired">;
+}
+def : PfmCountersBinding<"pentiumpro", PentiumPfmCounters>;
+def : PfmCountersBinding<"pentium2", PentiumPfmCounters>;
+def : PfmCountersBinding<"pentium3", PentiumPfmCounters>;
+def : PfmCountersBinding<"pentium3m", PentiumPfmCounters>;
+def : PfmCountersBinding<"pentium-m", PentiumPfmCounters>;
+
+def CorePfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = PfmCounter<"uops_retired:any">;
+}
+def : PfmCountersBinding<"yonah", CorePfmCounters>;
+def : PfmCountersBinding<"prescott", CorePfmCounters>;
+def : PfmCountersBinding<"core2", CorePfmCounters>;
+def : PfmCountersBinding<"penryn", CorePfmCounters>;
+def : PfmCountersBinding<"nehalem", CorePfmCounters>;
+def : PfmCountersBinding<"corei7", CorePfmCounters>;
+def : PfmCountersBinding<"westmere", CorePfmCounters>;
+
+def AtomPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = PfmCounter<"uops_retired:any">;
+}
+def : PfmCountersBinding<"bonnell", AtomPfmCounters>;
+def : PfmCountersBinding<"atom", AtomPfmCounters>;
+
+def SLMPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = PfmCounter<"uops_retired:any">;
+}
+def : PfmCountersBinding<"silvermont", SLMPfmCounters>;
+def : PfmCountersBinding<"goldmont", SLMPfmCounters>;
+def : PfmCountersBinding<"goldmont-plus", SLMPfmCounters>;
+def : PfmCountersBinding<"tremont", SLMPfmCounters>;
+
+def KnightPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = PfmCounter<"uops_retired:all">;
+}
+def : PfmCountersBinding<"knl", KnightPfmCounters>;
+def : PfmCountersBinding<"knm", KnightPfmCounters>;
+
 def SandyBridgePfmCounters : ProcPfmCounters {
   let CycleCounter = UnhaltedCoreCyclesPfmCounter;
   let UopsCounter = UopsIssuedPfmCounter;
@@ -26,6 +76,7 @@ def SandyBridgePfmCounters : ProcPfmCounters {
   ];
 }
 def : PfmCountersBinding<"sandybridge", SandyBridgePfmCounters>;
+def : PfmCountersBinding<"ivybridge", SandyBridgePfmCounters>;
 
 def HaswellPfmCounters : ProcPfmCounters {
   let CycleCounter = UnhaltedCoreCyclesPfmCounter;
@@ -90,6 +141,31 @@ def SkylakeServerPfmCounters : ProcPfmCounters {
   ];
 }
 def : PfmCountersBinding<"skylake-avx512", SkylakeServerPfmCounters>;
+def : PfmCountersBinding<"cascadelake", SkylakeServerPfmCounters>;
+def : PfmCountersBinding<"cannonlake", SkylakeServerPfmCounters>;
+def : PfmCountersBinding<"icelake-client", SkylakeServerPfmCounters>;
+def : PfmCountersBinding<"icelake-server", SkylakeServerPfmCounters>;
+
+// AMD X86 Counters.
+// Set basic counters for AMD cpus that we know libpfm4 supports.
+def DefaultAMDPfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+  let UopsCounter = PfmCounter<"retired_uops">;
+}
+def : PfmCountersBinding<"athlon", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-tbird", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-4", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-xp", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-mp", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"k8", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"opteron", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon64", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-fx", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"k8-sse3", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"opteron-sse3", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon64-sse3", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"amdfam10", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"barcelona", DefaultAMDPfmCounters>;
 
 def BdVer2PfmCounters : ProcPfmCounters {
   let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
@@ -101,8 +177,31 @@ def BdVer2PfmCounters : ProcPfmCounters {
     PfmIssueCounter<"PdFPU3", "dispatched_fpu_ops:ops_pipe3 + dispatched_fpu_ops:ops_dual_pipe3">
   ];
 }
+def : PfmCountersBinding<"bdver1", BdVer2PfmCounters>;
 def : PfmCountersBinding<"bdver2", BdVer2PfmCounters>;
 
+def BdVer3PfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+  let UopsCounter = PfmCounter<"retired_uops">;
+  let IssueCounters = [
+    PfmIssueCounter<"SrFPU0", "dispatched_fpu_ops:ops_pipe0 + dispatched_fpu_ops:ops_dual_pipe0">,
+    PfmIssueCounter<"SrFPU1", "dispatched_fpu_ops:ops_pipe1 + dispatched_fpu_ops:ops_dual_pipe1">,
+    PfmIssueCounter<"SrFPU2", "dispatched_fpu_ops:ops_pipe2 + dispatched_fpu_ops:ops_dual_pipe2">
+  ];
+}
+def : PfmCountersBinding<"bdver3", BdVer3PfmCounters>;
+def : PfmCountersBinding<"bdver4", BdVer3PfmCounters>;
+
+def BtVer1PfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+  let UopsCounter = PfmCounter<"retired_uops">;
+  let IssueCounters = [
+    PfmIssueCounter<"BtFPU0", "dispatched_fpu:pipe0">,
+    PfmIssueCounter<"BtFPU1", "dispatched_fpu:pipe1">
+  ];
+}
+def : PfmCountersBinding<"btver1", BtVer1PfmCounters>;
+
 def BtVer2PfmCounters : ProcPfmCounters {
   let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
   let UopsCounter = PfmCounter<"retired_uops">;
@@ -112,3 +211,16 @@ def BtVer2PfmCounters : ProcPfmCounters {
   ];
 }
 def : PfmCountersBinding<"btver2", BtVer2PfmCounters>;
+
+def ZnVer1PfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cycles_not_in_halt">;
+  let UopsCounter = PfmCounter<"retired_uops">;
+  let IssueCounters = [
+    PfmIssueCounter<"ZnFPU0", "fpu_pipe_assignment:total0">,
+    PfmIssueCounter<"ZnFPU1", "fpu_pipe_assignment:total1">,
+    PfmIssueCounter<"ZnFPU2", "fpu_pipe_assignment:total2">,
+    PfmIssueCounter<"ZnFPU3", "fpu_pipe_assignment:total3">,
+    PfmIssueCounter<"ZnDivider", "div_op_count">
+  ];
+}
+def : PfmCountersBinding<"znver1", ZnVer1PfmCounters>;
diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td
index 028729057fe6ef7ba97c2d5ec83d152eb76e3e48..971a50196e45a477d2b30d9fd45daf4e1a09be3c 100644
--- a/lib/Target/X86/X86SchedBroadwell.td
+++ b/lib/Target/X86/X86SchedBroadwell.td
@@ -878,10 +878,10 @@ def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> {
 }
 def: InstRW<[BWWriteResGroup45], (instrs FNCLEX)>;
 
-def BWWriteResGroup46 : SchedWriteRes<[BWPort015,BWPort0156]> {
-  let Latency = 4;
+def BWWriteResGroup46 : SchedWriteRes<[]> {
+  let Latency = 0;
   let NumMicroOps = 4;
-  let ResourceCycles = [1,3];
+  let ResourceCycles = [];
 }
 def: InstRW<[BWWriteResGroup46], (instrs VZEROUPPER)>;
 
@@ -1229,7 +1229,7 @@ def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
   let NumMicroOps = 5;
   let ResourceCycles = [1,1,3];
 }
-def: InstRW<[BWWriteResGroup112], (instregex "RDRAND(16|32|64)r")>;
+def: InstRW<[BWWriteResGroup112], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>;
 
 def BWWriteResGroup113 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
   let Latency = 9;
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 799ba1efdde3a7e12fde57d6ed9625f99e129040..06a32fb0b1cdfb490ee7d8cce6872465bfaf739d 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -720,7 +720,7 @@ def HWWriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
   let NumMicroOps = 17;
   let ResourceCycles = [1, 16];
 }
-def : InstRW<[HWWriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
+def : InstRW<[HWWriteRDRAND], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>;
 
 //=== Floating Point x87 Instructions ===//
 //-- Move instructions --//
@@ -1408,10 +1408,10 @@ def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> {
 }
 def: InstRW<[HWWriteResGroup81], (instrs FNCLEX)>;
 
-def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> {
-  let Latency = 4;
+def HWWriteResGroup82 : SchedWriteRes<[]> {
+  let Latency = 0;
   let NumMicroOps = 4;
-  let ResourceCycles = [1,3];
+  let ResourceCycles = [];
 }
 def: InstRW<[HWWriteResGroup82], (instrs VZEROUPPER)>;
 
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index 569ae366eaafdfa1d0614d820253211aab45d73e..9dbf0976989f83595ca0d715bd12bd3bd95b544c 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -1112,6 +1112,13 @@ def SBWriteResGroupVzeroall : SchedWriteRes<[SBPort5]> {
 }
 def: InstRW<[SBWriteResGroupVzeroall], (instrs VZEROALL)>;
 
+def SBWriteResGroupVzeroupper : SchedWriteRes<[]> {
+  let Latency = 1;
+  let NumMicroOps = 4;
+  let ResourceCycles = [];
+}
+def: InstRW<[SBWriteResGroupVzeroupper], (instrs VZEROUPPER)>;
+
 def: InstRW<[WriteZero], (instrs CLC)>;
 
 // Intruction variants handled by the renamer. These might not need execution
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index d4a3eb07b982771643611326db277f43b25c7c49..2c9eb7516085c1d6e63da3a91a142e8c2fc314be 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -897,10 +897,10 @@ def SKLWriteResGroup55 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
 }
 def: InstRW<[SKLWriteResGroup55], (instrs PAUSE)>;
 
-def SKLWriteResGroup56 : SchedWriteRes<[SKLPort015,SKLPort0156]> {
-  let Latency = 4;
+def SKLWriteResGroup56 : SchedWriteRes<[]> {
+  let Latency = 0;
   let NumMicroOps = 4;
-  let ResourceCycles = [1,3];
+  let ResourceCycles = [];
 }
 def: InstRW<[SKLWriteResGroup56], (instrs VZEROUPPER)>;
 
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index cbcb6a6e58bb2b0aadb2bc276f414035b39312ee..ec8e4db02d8aa59474082ea9cc3a6e7d99cd49f2 100644
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -1009,10 +1009,10 @@ def SKXWriteResGroup55 : SchedWriteRes<[SKXPort0156]> {
 }
 def: InstRW<[SKXWriteResGroup55], (instrs FNCLEX)>;
 
-def SKXWriteResGroup56 : SchedWriteRes<[SKXPort015,SKXPort0156]> {
-  let Latency = 4;
+def SKXWriteResGroup56 : SchedWriteRes<[]> {
+  let Latency = 0;
   let NumMicroOps = 4;
-  let ResourceCycles = [1,3];
+  let ResourceCycles = [];
 }
 def: InstRW<[SKXWriteResGroup56], (instrs VZEROUPPER)>;
 
diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td
index bc5d112c2f4f23895d007f7a10869eeba1ab5a4f..5798e1b2671b9581558c5e7e764485466821246c 100644
--- a/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/lib/Target/X86/X86ScheduleBdVer2.td
@@ -130,18 +130,22 @@ def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> {
 // Load-Store Units
 //
 
-// FIXME: does this even make sense?
-
-def PdLoad  : ProcResGroup<[PdAGLU01]> {
+let Super = PdAGLU01 in
+def PdLoad  : ProcResource<2> {
   // For Piledriver, the load queue is 40 entries deep.
   let BufferSize = 40;
 }
 
-def PdStore : ProcResGroup<[PdAGLU01]> {
+def PdLoadQueue : LoadQueue<PdLoad>;
+
+let Super = PdAGLU01 in
+def PdStore : ProcResource<1> {
   // For Piledriver, the store queue is 24 entries deep.
   let BufferSize = 24;
 }
 
+def PdStoreQueue : StoreQueue<PdStore>;
+
 //===----------------------------------------------------------------------===//
 // Integer Execution Units
 //
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td
index 23113e0f0df94cdffbda4b4c1994350586687bc4..a866f843106b641439136fbf30324a3d3a2b3fd9 100644
--- a/lib/Target/X86/X86ScheduleZnver1.td
+++ b/lib/Target/X86/X86ScheduleZnver1.td
@@ -790,7 +790,7 @@ def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>;
 def : InstRW<[WriteMicrocoded], (instrs RDPMC)>;
 
 // RDRAND.
-def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>;
+def : InstRW<[WriteMicrocoded], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>;
 
 // XGETBV.
 def : InstRW<[WriteMicrocoded], (instrs XGETBV)>;
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index b8cb11fb862eb5402037e7780f7410f3e8c54adc..a729161a1beb5c0fe90210ba3a368fe9f74cb157 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -1141,7 +1141,9 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
     unsigned TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);
 
     // Insert a comparison of the incoming target register with this block's
-    // address.
+    // address. This also requires us to mark the block as having its address
+    // taken explicitly.
+    MBB.setHasAddressTaken();
     auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
     if (MF.getTarget().getCodeModel() == CodeModel::Small &&
         !Subtarget->isPositionIndependent()) {
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 6426e5b076cc8c2cc0b2793cccac324fbafdb801..afcb49dc22632f8c475f1dac861c2822c16601c1 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -57,7 +57,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
 static cl::opt<bool> EnableCondBrFoldingPass("x86-condbr-folding",
                                cl::desc("Enable the conditional branch "
                                         "folding pass"),
-                               cl::init(true), cl::Hidden);
+                               cl::init(false), cl::Hidden);
 
 extern "C" void LLVMInitializeX86Target() {
   // Register the target.
@@ -78,6 +78,7 @@ extern "C" void LLVMInitializeX86Target() {
   initializeX86AvoidSFBPassPass(PR);
   initializeX86SpeculativeLoadHardeningPassPass(PR);
   initializeX86FlagsCopyLoweringPassPass(PR);
+  initializeX86CondBrFoldingPassPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -189,10 +190,13 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
-                                              bool JIT, bool Is64Bit) {
-  if (CM)
+static CodeModel::Model getEffectiveX86CodeModel(Optional<CodeModel::Model> CM,
+                                                 bool JIT, bool Is64Bit) {
+  if (CM) {
+    if (*CM == CodeModel::Tiny)
+      report_fatal_error("Target does not support the tiny CodeModel");
     return *CM;
+  }
   if (JIT)
     return Is64Bit ? CodeModel::Large : CodeModel::Small;
   return CodeModel::Small;
@@ -209,7 +213,8 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(
           T, computeDataLayout(TT), TT, CPU, FS, Options,
           getEffectiveRelocModel(TT, JIT, RM),
-          getEffectiveCodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL),
+          getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
+          OL),
       TLOF(createTLOF(getTargetTriple())) {
   // Windows stack unwinder gets confused when execution flow "falls through"
   // after a call to 'noreturn' function.
@@ -497,6 +502,8 @@ void X86PassConfig::addPreEmitPass() {
     addPass(createX86FixupLEAs());
     addPass(createX86EvexToVexInsts());
   }
+  addPass(createX86DiscriminateMemOpsPass());
+  addPass(createX86InsertPrefetchPass());
 }
 
 void X86PassConfig::addPreEmitPass2() {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index ebb8aca5fb146a17f41d1f29c3f103650fc26937..7889322159262d3962050614431e93a9798f4473 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -612,9 +612,18 @@ int X86TTIImpl::getArithmeticInstrCost(
   };
 
   // Look for XOP lowering tricks.
-  if (ST->hasXOP())
-    if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
+  if (ST->hasXOP()) {
+    // If the right shift is constant then we'll fold the negation so
+    // it's as cheap as a left shift.
+    int ShiftISD = ISD;
+    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
+        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+      ShiftISD = ISD::SHL;
+    if (const auto *Entry =
+            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
       return LT.first * Entry->Cost;
+  }
 
   static const CostTblEntry SSE2UniformShiftCostTable[] = {
     // Uniform splats are cheaper for the following instructions.
@@ -872,6 +881,20 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   if (Kind == TTI::SK_Broadcast)
     LT.first = 1;
 
+  // Subvector extractions are free if they start at the beginning of a
+  // vector and cheap if the subvectors are aligned.
+  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
+    int NumElts = LT.second.getVectorNumElements();
+    if ((Index % NumElts) == 0)
+      return 0;
+    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
+    if (SubLT.second.isVector()) {
+      int NumSubElts = SubLT.second.getVectorNumElements();
+      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
+        return SubLT.first;
+    }
+  }
+
   // We are going to permute multiple sources and the result will be in multiple
   // destinations. Providing an accurate cost only for splits where the element
   // type remains the same.
@@ -909,15 +932,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   }
 
   static const CostTblEntry AVX512VBMIShuffleTbl[] = {
-    { TTI::SK_Reverse,          MVT::v64i8,  1 }, // vpermb
-    { TTI::SK_Reverse,          MVT::v32i8,  1 }, // vpermb
+      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
+      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
 
-    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  1 }, // vpermb
-    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  1 }, // vpermb
+      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
+      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
 
-    { TTI::SK_PermuteTwoSrc,    MVT::v64i8,  1 }, // vpermt2b
-    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  1 }, // vpermt2b
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  1 }  // vpermt2b
+      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
+      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
+      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}  // vpermt2b
   };
 
   if (ST->hasVBMI())
@@ -926,25 +949,25 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry AVX512BWShuffleTbl[] = {
-    { TTI::SK_Broadcast,        MVT::v32i16, 1 }, // vpbroadcastw
-    { TTI::SK_Broadcast,        MVT::v64i8,  1 }, // vpbroadcastb
+      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb
 
-    { TTI::SK_Reverse,          MVT::v32i16, 1 }, // vpermw
-    { TTI::SK_Reverse,          MVT::v16i16, 1 }, // vpermw
-    { TTI::SK_Reverse,          MVT::v64i8,  2 }, // pshufb + vshufi64x2
+      {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
+      {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
+      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2
 
-    { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
-    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
-    { TTI::SK_PermuteSingleSrc, MVT::v8i16,  1 }, // vpermw
-    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  8 }, // extend to v32i16
-    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  3 }, // vpermw + zext/trunc
+      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
+      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
+      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1},  // vpermw
+      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16
+      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3},  // vpermw + zext/trunc
 
-    { TTI::SK_PermuteTwoSrc,    MVT::v32i16, 1 }, // vpermt2w
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 1 }, // vpermt2w
-    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  1 }, // vpermt2w
-    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  3 }, // zext + vpermt2w + trunc
-    { TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19 }, // 6 * v32i8 + 1
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  3 }  // zext + vpermt2w + trunc
+      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
+      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
+      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpermt2w
+      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3},  // zext + vpermt2w + trunc
+      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
+      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}   // zext + vpermt2w + trunc
   };
 
   if (ST->hasBWI())
@@ -953,42 +976,42 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry AVX512ShuffleTbl[] = {
-    { TTI::SK_Broadcast,        MVT::v8f64,  1 }, // vbroadcastpd
-    { TTI::SK_Broadcast,        MVT::v16f32, 1 }, // vbroadcastps
-    { TTI::SK_Broadcast,        MVT::v8i64,  1 }, // vpbroadcastq
-    { TTI::SK_Broadcast,        MVT::v16i32, 1 }, // vpbroadcastd
-
-    { TTI::SK_Reverse,          MVT::v8f64,  1 }, // vpermpd
-    { TTI::SK_Reverse,          MVT::v16f32, 1 }, // vpermps
-    { TTI::SK_Reverse,          MVT::v8i64,  1 }, // vpermq
-    { TTI::SK_Reverse,          MVT::v16i32, 1 }, // vpermd
-
-    { TTI::SK_PermuteSingleSrc, MVT::v8f64,  1 }, // vpermpd
-    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
-    { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // vpermpd
-    { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
-    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
-    { TTI::SK_PermuteSingleSrc, MVT::v4f32,  1 }, // vpermps
-    { TTI::SK_PermuteSingleSrc, MVT::v8i64,  1 }, // vpermq
-    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
-    { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // vpermq
-    { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
-    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
-    { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // vpermd
-    { TTI::SK_PermuteSingleSrc, MVT::v16i8,  1 }, // pshufb
-
-    { TTI::SK_PermuteTwoSrc,    MVT::v8f64,  1 }, // vpermt2pd
-    { TTI::SK_PermuteTwoSrc,    MVT::v16f32, 1 }, // vpermt2ps
-    { TTI::SK_PermuteTwoSrc,    MVT::v8i64,  1 }, // vpermt2q
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i32, 1 }, // vpermt2d
-    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  1 }, // vpermt2pd
-    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  1 }, // vpermt2ps
-    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  1 }, // vpermt2q
-    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  1 }, // vpermt2d
-    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // vpermt2pd
-    { TTI::SK_PermuteTwoSrc,    MVT::v4f32,  1 }, // vpermt2ps
-    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // vpermt2q
-    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  1 }  // vpermt2d
+      {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
+      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
+      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
+      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
+
+      {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
+      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
+      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
+      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
+
+      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1},  // vpermpd
+      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
+      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // vpermpd
+      {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
+      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
+      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1},  // vpermps
+      {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1},  // vpermq
+      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
+      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // vpermq
+      {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
+      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
+      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // vpermd
+      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1},  // pshufb
+
+      {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1},  // vpermt2pd
+      {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
+      {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1},  // vpermt2q
+      {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
+      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1},  // vpermt2pd
+      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1},  // vpermt2ps
+      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1},  // vpermt2q
+      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1},  // vpermt2d
+      {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // vpermt2pd
+      {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1},  // vpermt2ps
+      {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // vpermt2q
+      {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}   // vpermt2d
   };
 
   if (ST->hasAVX512())
@@ -996,40 +1019,40 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry AVX2ShuffleTbl[] = {
-    { TTI::SK_Broadcast, MVT::v4f64,  1 }, // vbroadcastpd
-    { TTI::SK_Broadcast, MVT::v8f32,  1 }, // vbroadcastps
-    { TTI::SK_Broadcast, MVT::v4i64,  1 }, // vpbroadcastq
-    { TTI::SK_Broadcast, MVT::v8i32,  1 }, // vpbroadcastd
-    { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
-    { TTI::SK_Broadcast, MVT::v32i8,  1 }, // vpbroadcastb
-
-    { TTI::SK_Reverse,   MVT::v4f64,  1 }, // vpermpd
-    { TTI::SK_Reverse,   MVT::v8f32,  1 }, // vpermps
-    { TTI::SK_Reverse,   MVT::v4i64,  1 }, // vpermq
-    { TTI::SK_Reverse,   MVT::v8i32,  1 }, // vpermd
-    { TTI::SK_Reverse,   MVT::v16i16, 2 }, // vperm2i128 + pshufb
-    { TTI::SK_Reverse,   MVT::v32i8,  2 }, // vperm2i128 + pshufb
-
-    { TTI::SK_Select,    MVT::v16i16, 1 }, // vpblendvb
-    { TTI::SK_Select,    MVT::v32i8,  1 }, // vpblendvb
-
-    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
-    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
-    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
-    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
-    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
+      {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastpd
+      {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastps
+      {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
+      {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
+      {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
+      {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb
+
+      {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
+      {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
+      {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
+      {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
+      {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
+      {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb
+
+      {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
+      {TTI::SK_Select, MVT::v32i8, 1},  // vpblendvb
+
+      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
+      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
+      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
+      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
+      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                   // + vpblendvb
-    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }, // vperm2i128 + 2*vpshufb
+      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
                                                   // + vpblendvb
 
-    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  3 }, // 2*vpermpd + vblendpd
-    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  3 }, // 2*vpermps + vblendps
-    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  3 }, // 2*vpermq + vpblendd
-    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  3 }, // 2*vpermd + vpblendd
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
-                                                  // + vpblendvb
-    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  7 }, // 2*vperm2i128 + 4*vpshufb
-                                                  // + vpblendvb
+      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
+      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
+      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
+      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
+      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
+                                               // + vpblendvb
+      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
+                                               // + vpblendvb
   };
 
   if (ST->hasAVX2())
@@ -1037,21 +1060,21 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry XOPShuffleTbl[] = {
-    { TTI::SK_PermuteSingleSrc, MVT::v4f64,   2 }, // vperm2f128 + vpermil2pd
-    { TTI::SK_PermuteSingleSrc, MVT::v8f32,   2 }, // vperm2f128 + vpermil2ps
-    { TTI::SK_PermuteSingleSrc, MVT::v4i64,   2 }, // vperm2f128 + vpermil2pd
-    { TTI::SK_PermuteSingleSrc, MVT::v8i32,   2 }, // vperm2f128 + vpermil2ps
-    { TTI::SK_PermuteSingleSrc, MVT::v16i16,  4 }, // vextractf128 + 2*vpperm
-                                                   // + vinsertf128
-    { TTI::SK_PermuteSingleSrc, MVT::v32i8,   4 }, // vextractf128 + 2*vpperm
-                                                   // + vinsertf128
-
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i16,  9 }, // 2*vextractf128 + 6*vpperm
-                                                   // + vinsertf128
-    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,   1 }, // vpperm
-    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,   9 }, // 2*vextractf128 + 6*vpperm
-                                                   // + vinsertf128
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,   1 }, // vpperm
+      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
+      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
+      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
+      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
+      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
+                                                  // + vinsertf128
+      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
+                                                  // + vinsertf128
+
+      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
+                                               // + vinsertf128
+      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
+      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
+                                               // + vinsertf128
+      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
   };
 
   if (ST->hasXOP())
@@ -1059,46 +1082,46 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry AVX1ShuffleTbl[] = {
-    { TTI::SK_Broadcast, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
-    { TTI::SK_Broadcast, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
-    { TTI::SK_Broadcast, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
-    { TTI::SK_Broadcast, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
-    { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
-    { TTI::SK_Broadcast, MVT::v32i8,  2 }, // vpshufb + vinsertf128
-
-    { TTI::SK_Reverse,   MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
-    { TTI::SK_Reverse,   MVT::v8f32,  2 }, // vperm2f128 + vpermilps
-    { TTI::SK_Reverse,   MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
-    { TTI::SK_Reverse,   MVT::v8i32,  2 }, // vperm2f128 + vpermilps
-    { TTI::SK_Reverse,   MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
-                                           // + vinsertf128
-    { TTI::SK_Reverse,   MVT::v32i8,  4 }, // vextractf128 + 2*pshufb
-                                           // + vinsertf128
-
-    { TTI::SK_Select,    MVT::v4i64,  1 }, // vblendpd
-    { TTI::SK_Select,    MVT::v4f64,  1 }, // vblendpd
-    { TTI::SK_Select,    MVT::v8i32,  1 }, // vblendps
-    { TTI::SK_Select,    MVT::v8f32,  1 }, // vblendps
-    { TTI::SK_Select,    MVT::v16i16, 3 }, // vpand + vpandn + vpor
-    { TTI::SK_Select,    MVT::v32i8,  3 }, // vpand + vpandn + vpor
-
-    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  2 }, // vperm2f128 + vshufpd
-    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  2 }, // vperm2f128 + vshufpd
-    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  4 }, // 2*vperm2f128 + 2*vshufps
-    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  4 }, // 2*vperm2f128 + 2*vshufps
-    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
+      {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
+      {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
+      {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
+      {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
+      {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
+      {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128
+
+      {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
+      {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
+      {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
+      {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
+      {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
+                                         // + vinsertf128
+      {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
+                                         // + vinsertf128
+
+      {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
+      {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
+      {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
+      {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
+      {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
+      {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor
+
+      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
+      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
+      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
+      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
+      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
                                                   // + 2*por + vinsertf128
-    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  8 }, // vextractf128 + 4*pshufb
+      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
                                                   // + 2*por + vinsertf128
 
-    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,   3 }, // 2*vperm2f128 + vshufpd
-    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,   3 }, // 2*vperm2f128 + vshufpd
-    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,   4 }, // 2*vperm2f128 + 2*vshufps
-    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,   4 }, // 2*vperm2f128 + 2*vshufps
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
-                                                   // + 4*por + vinsertf128
-    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  15 }, // 2*vextractf128 + 8*pshufb
-                                                   // + 4*por + vinsertf128
+      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
+      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
+      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
+      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
+      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
+                                                // + 4*por + vinsertf128
+      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
+                                                // + 4*por + vinsertf128
   };
 
   if (ST->hasAVX())
@@ -1106,12 +1129,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry SSE41ShuffleTbl[] = {
-    { TTI::SK_Select,    MVT::v2i64,  1 }, // pblendw
-    { TTI::SK_Select,    MVT::v2f64,  1 }, // movsd
-    { TTI::SK_Select,    MVT::v4i32,  1 }, // pblendw
-    { TTI::SK_Select,    MVT::v4f32,  1 }, // blendps
-    { TTI::SK_Select,    MVT::v8i16,  1 }, // pblendw
-    { TTI::SK_Select,    MVT::v16i8,  1 }  // pblendvb
+      {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
+      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
+      {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
+      {TTI::SK_Select, MVT::v4f32, 1}, // blendps
+      {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
+      {TTI::SK_Select, MVT::v16i8, 1}  // pblendvb
   };
 
   if (ST->hasSSE41())
@@ -1119,20 +1142,20 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry SSSE3ShuffleTbl[] = {
-    { TTI::SK_Broadcast, MVT::v8i16,  1 }, // pshufb
-    { TTI::SK_Broadcast, MVT::v16i8,  1 }, // pshufb
+      {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
+      {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
 
-    { TTI::SK_Reverse,   MVT::v8i16,  1 }, // pshufb
-    { TTI::SK_Reverse,   MVT::v16i8,  1 }, // pshufb
+      {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
+      {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
 
-    { TTI::SK_Select,    MVT::v8i16,  3 }, // 2*pshufb + por
-    { TTI::SK_Select,    MVT::v16i8,  3 }, // 2*pshufb + por
+      {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
+      {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
 
-    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
-    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
+      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
+      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
 
-    { TTI::SK_PermuteTwoSrc,    MVT::v8i16, 3 }, // 2*pshufb + por
-    { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 3 }, // 2*pshufb + por
+      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
+      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
   };
 
   if (ST->hasSSSE3())
@@ -1140,29 +1163,29 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry SSE2ShuffleTbl[] = {
-    { TTI::SK_Broadcast, MVT::v2f64,  1 }, // shufpd
-    { TTI::SK_Broadcast, MVT::v2i64,  1 }, // pshufd
-    { TTI::SK_Broadcast, MVT::v4i32,  1 }, // pshufd
-    { TTI::SK_Broadcast, MVT::v8i16,  2 }, // pshuflw + pshufd
-    { TTI::SK_Broadcast, MVT::v16i8,  3 }, // unpck + pshuflw + pshufd
-
-    { TTI::SK_Reverse,   MVT::v2f64,  1 }, // shufpd
-    { TTI::SK_Reverse,   MVT::v2i64,  1 }, // pshufd
-    { TTI::SK_Reverse,   MVT::v4i32,  1 }, // pshufd
-    { TTI::SK_Reverse,   MVT::v8i16,  3 }, // pshuflw + pshufhw + pshufd
-    { TTI::SK_Reverse,   MVT::v16i8,  9 }, // 2*pshuflw + 2*pshufhw
-                                           // + 2*pshufd + 2*unpck + packus
-
-    { TTI::SK_Select,    MVT::v2i64,  1 }, // movsd
-    { TTI::SK_Select,    MVT::v2f64,  1 }, // movsd
-    { TTI::SK_Select,    MVT::v4i32,  2 }, // 2*shufps
-    { TTI::SK_Select,    MVT::v8i16,  3 }, // pand + pandn + por
-    { TTI::SK_Select,    MVT::v16i8,  3 }, // pand + pandn + por
-
-    { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // shufpd
-    { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // pshufd
-    { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // pshufd
-    { TTI::SK_PermuteSingleSrc, MVT::v8i16,  5 }, // 2*pshuflw + 2*pshufhw
+      {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
+      {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
+      {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
+      {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
+      {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
+
+      {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
+      {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
+      {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
+      {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
+      {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
+                                        // + 2*pshufd + 2*unpck + packus
+
+      {TTI::SK_Select, MVT::v2i64, 1}, // movsd
+      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
+      {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
+      {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
+      {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
+
+      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
+      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
+      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
+      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
                                                   // + pshufd/unpck
     { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
                                                   // + 2*pshufd + 2*unpck + 2*packus
@@ -1201,6 +1224,27 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   // FIXME: Need a better design of the cost table to handle non-simple types of
   // potential massive combinations (elem_num x src_type x dst_type).
 
+  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
+    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
+
+    // Mask sign extend has an instruction.
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,  1 },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 1 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1, 1 },
+    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
+    { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1, 1 },
+
+    // Mask zero extend is a load + broadcast.
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,  2 },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 2 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1, 2 },
+    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
+    { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1, 2 },
+  };
+
   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
     { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
@@ -1526,43 +1570,51 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   if (!SrcTy.isSimple() || !DstTy.isSimple())
     return BaseT::getCastInstrCost(Opcode, Dst, Src);
 
-  if (ST->hasDQI())
-    if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
-                                                   DstTy.getSimpleVT(),
-                                                   SrcTy.getSimpleVT()))
-      return Entry->Cost;
+  MVT SimpleSrcTy = SrcTy.getSimpleVT();
+  MVT SimpleDstTy = DstTy.getSimpleVT();
 
-  if (ST->hasAVX512())
-    if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
-                                                   DstTy.getSimpleVT(),
-                                                   SrcTy.getSimpleVT()))
-      return Entry->Cost;
+  // Make sure that neither type is going to be split before using the
+  // AVX512 tables. This handles -mprefer-vector-width=256
+  // with -min-legal-vector-width<=256
+  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
+      TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
+    if (ST->hasBWI())
+      if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
+                                                     SimpleDstTy, SimpleSrcTy))
+        return Entry->Cost;
+
+    if (ST->hasDQI())
+      if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
+                                                     SimpleDstTy, SimpleSrcTy))
+        return Entry->Cost;
+
+    if (ST->hasAVX512())
+      if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+                                                     SimpleDstTy, SimpleSrcTy))
+        return Entry->Cost;
+  }
 
   if (ST->hasAVX2()) {
     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
-                                                   DstTy.getSimpleVT(),
-                                                   SrcTy.getSimpleVT()))
+                                                   SimpleDstTy, SimpleSrcTy))
       return Entry->Cost;
   }
 
   if (ST->hasAVX()) {
     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
-                                                   DstTy.getSimpleVT(),
-                                                   SrcTy.getSimpleVT()))
+                                                   SimpleDstTy, SimpleSrcTy))
       return Entry->Cost;
   }
 
   if (ST->hasSSE41()) {
     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
-                                                   DstTy.getSimpleVT(),
-                                                   SrcTy.getSimpleVT()))
+                                                   SimpleDstTy, SimpleSrcTy))
       return Entry->Cost;
   }
 
   if (ST->hasSSE2()) {
     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
-                                                   DstTy.getSimpleVT(),
-                                                   SrcTy.getSimpleVT()))
+                                                   SimpleDstTy, SimpleSrcTy))
       return Entry->Cost;
   }
 
@@ -1834,7 +1886,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
   };
   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
-    { ISD::BITREVERSE, MVT::i64,    14 }
+    { ISD::BITREVERSE, MVT::i64,    14 } 
   };
   static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
     { ISD::BITREVERSE, MVT::i32,    14 },
@@ -1866,71 +1918,163 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     break;
   }
 
-  // Legalize the type.
-  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
-  MVT MTy = LT.second;
+  if (ISD != ISD::DELETED_NODE) {
+    // Legalize the type.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+    MVT MTy = LT.second;
 
-  // Attempt to lookup cost.
-  if (ST->isGLM())
-    if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    // Attempt to lookup cost.
+    if (ST->isGLM())
+      if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->isSLM())
-    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->isSLM())
+      if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasCDI())
-    if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasCDI())
+      if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasBWI())
-    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasAVX512())
-    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasAVX512())
+      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasXOP())
-    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasXOP())
+      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasAVX2())
-    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasAVX2())
+      if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasAVX())
-    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasAVX())
+      if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasSSE42())
-    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasSSE42())
+      if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasSSSE3())
-    if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasSSSE3())
+      if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasSSE2())
-    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasSSE2())
+      if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->hasSSE1())
-    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->hasSSE1())
+      if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (ST->is64Bit())
-    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
-      return LT.first * Entry->Cost;
+    if (ST->is64Bit())
+      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
 
-  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
-    return LT.first * Entry->Cost;
+    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+  }
 
   return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                     ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+                                      ArrayRef<Value *> Args, FastMathFlags FMF,
+                                      unsigned VF) {
+  static const CostTblEntry AVX512CostTbl[] = {
+    { ISD::ROTL,       MVT::v8i64,   1 },
+    { ISD::ROTL,       MVT::v4i64,   1 },
+    { ISD::ROTL,       MVT::v2i64,   1 },
+    { ISD::ROTL,       MVT::v16i32,  1 },
+    { ISD::ROTL,       MVT::v8i32,   1 },
+    { ISD::ROTL,       MVT::v4i32,   1 },
+    { ISD::ROTR,       MVT::v8i64,   1 },
+    { ISD::ROTR,       MVT::v4i64,   1 },
+    { ISD::ROTR,       MVT::v2i64,   1 },
+    { ISD::ROTR,       MVT::v16i32,  1 },
+    { ISD::ROTR,       MVT::v8i32,   1 },
+    { ISD::ROTR,       MVT::v4i32,   1 }
+  };
+  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
+  static const CostTblEntry XOPCostTbl[] = {
+    { ISD::ROTL,       MVT::v4i64,   4 },
+    { ISD::ROTL,       MVT::v8i32,   4 },
+    { ISD::ROTL,       MVT::v16i16,  4 },
+    { ISD::ROTL,       MVT::v32i8,   4 },
+    { ISD::ROTL,       MVT::v2i64,   1 },
+    { ISD::ROTL,       MVT::v4i32,   1 },
+    { ISD::ROTL,       MVT::v8i16,   1 },
+    { ISD::ROTL,       MVT::v16i8,   1 },
+    { ISD::ROTR,       MVT::v4i64,   6 },
+    { ISD::ROTR,       MVT::v8i32,   6 },
+    { ISD::ROTR,       MVT::v16i16,  6 },
+    { ISD::ROTR,       MVT::v32i8,   6 },
+    { ISD::ROTR,       MVT::v2i64,   2 },
+    { ISD::ROTR,       MVT::v4i32,   2 },
+    { ISD::ROTR,       MVT::v8i16,   2 },
+    { ISD::ROTR,       MVT::v16i8,   2 }
+  };
+  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+    { ISD::ROTL,       MVT::i64,     1 },
+    { ISD::ROTR,       MVT::i64,     1 },
+    { ISD::FSHL,       MVT::i64,     4 }
+  };
+  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+    { ISD::ROTL,       MVT::i32,     1 },
+    { ISD::ROTL,       MVT::i16,     1 },
+    { ISD::ROTL,       MVT::i8,      1 },
+    { ISD::ROTR,       MVT::i32,     1 },
+    { ISD::ROTR,       MVT::i16,     1 },
+    { ISD::ROTR,       MVT::i8,      1 },
+    { ISD::FSHL,       MVT::i32,     4 },
+    { ISD::FSHL,       MVT::i16,     4 },
+    { ISD::FSHL,       MVT::i8,      4 }
+  };
+
+  unsigned ISD = ISD::DELETED_NODE;
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::fshl:
+    ISD = ISD::FSHL;
+    if (Args[0] == Args[1])
+      ISD = ISD::ROTL;
+    break;
+  case Intrinsic::fshr:
+    // FSHR has same costs so don't duplicate.
+    ISD = ISD::FSHL;
+    if (Args[0] == Args[1])
+      ISD = ISD::ROTR;
+    break;
+  }
+
+  if (ISD != ISD::DELETED_NODE) {
+    // Legalize the type.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+    MVT MTy = LT.second;
+
+    // Attempt to lookup cost.
+    if (ST->hasAVX512())
+      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (ST->hasXOP())
+      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (ST->is64Bit())
+      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+  }
+
   return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
 }
 
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 38925bfd51b0273a3be194aa8830d53112843c5b..2aa9932e2465a51e1a15eed7ea68c55404b02088 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -31,7 +31,8 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return *RM;
 }
 
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
+static CodeModel::Model
+getEffectiveXCoreCodeModel(Optional<CodeModel::Model> CM) {
   if (CM) {
     if (*CM != CodeModel::Small && *CM != CodeModel::Large)
       report_fatal_error("Target only supports CodeModel Small or Large");
@@ -51,7 +52,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(
           T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
           TT, CPU, FS, Options, getEffectiveRelocModel(RM),
-          getEffectiveCodeModel(CM), OL),
+          getEffectiveXCoreCodeModel(CM), OL),
       TLOF(llvm::make_unique<XCoreTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
diff --git a/lib/TextAPI/CMakeLists.txt b/lib/TextAPI/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7912e91fbf1c69c5441767f11545b8e59ddfa313
--- /dev/null
+++ b/lib/TextAPI/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_llvm_library(LLVMTextAPI
+  ELF/ELFStub.cpp
+  ELF/TBEHandler.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  "${LLVM_MAIN_INCLUDE_DIR}/llvm/TextAPI"
+)
diff --git a/lib/TextAPI/ELF/ELFStub.cpp b/lib/TextAPI/ELF/ELFStub.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..248a078a2404dbb1b0082421f315d1456f708ccc
--- /dev/null
+++ b/lib/TextAPI/ELF/ELFStub.cpp
@@ -0,0 +1,29 @@
+//===- ELFStub.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===-----------------------------------------------------------------------===/
+
+#include "llvm/TextAPI/ELF/ELFStub.h"
+
+using namespace llvm;
+using namespace llvm::elfabi;
+
+ELFStub::ELFStub(ELFStub const &Stub) {
+  TbeVersion = Stub.TbeVersion;
+  Arch = Stub.Arch;
+  SoName = Stub.SoName;
+  NeededLibs = Stub.NeededLibs;
+  Symbols = Stub.Symbols;
+}
+
+ELFStub::ELFStub(ELFStub &&Stub) {
+  TbeVersion = std::move(Stub.TbeVersion);
+  Arch = std::move(Stub.Arch);
+  SoName = std::move(Stub.SoName);
+  NeededLibs = std::move(Stub.NeededLibs);
+  Symbols = std::move(Stub.Symbols);
+}
diff --git a/lib/TextAPI/ELF/TBEHandler.cpp b/lib/TextAPI/ELF/TBEHandler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dde980292a167623ee52fd15fbee0370b74648cf
--- /dev/null
+++ b/lib/TextAPI/ELF/TBEHandler.cpp
@@ -0,0 +1,160 @@
+//===- TBEHandler.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===-----------------------------------------------------------------------===/
+
+#include "llvm/TextAPI/ELF/TBEHandler.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/TextAPI/ELF/ELFStub.h"
+
+using namespace llvm;
+using namespace llvm::elfabi;
+
+LLVM_YAML_STRONG_TYPEDEF(ELFArch, ELFArchMapper)
+
+namespace llvm {
+namespace yaml {
+
+/// YAML traits for ELFSymbolType.
+template <> struct ScalarEnumerationTraits<ELFSymbolType> {
+  static void enumeration(IO &IO, ELFSymbolType &SymbolType) {
+    IO.enumCase(SymbolType, "NoType", ELFSymbolType::NoType);
+    IO.enumCase(SymbolType, "Func", ELFSymbolType::Func);
+    IO.enumCase(SymbolType, "Object", ELFSymbolType::Object);
+    IO.enumCase(SymbolType, "TLS", ELFSymbolType::TLS);
+    IO.enumCase(SymbolType, "Unknown", ELFSymbolType::Unknown);
+    // Treat other symbol types as noise, and map to Unknown.
+    if (!IO.outputting() && IO.matchEnumFallback())
+      SymbolType = ELFSymbolType::Unknown;
+  }
+};
+
+/// YAML traits for ELFArch.
+template <> struct ScalarTraits<ELFArchMapper> {
+  static void output(const ELFArchMapper &Value, void *,
+                     llvm::raw_ostream &Out) {
+    // Map from integer to architecture string.
+    switch (Value) {
+    case (ELFArch)ELF::EM_X86_64:
+      Out << "x86_64";
+      break;
+    case (ELFArch)ELF::EM_AARCH64:
+      Out << "AArch64";
+      break;
+    case (ELFArch)ELF::EM_NONE:
+    default:
+      Out << "Unknown";
+    }
+  }
+
+  static StringRef input(StringRef Scalar, void *, ELFArchMapper &Value) {
+    // Map from architecture string to integer.
+    Value = StringSwitch<ELFArch>(Scalar)
+                .Case("x86_64", ELF::EM_X86_64)
+                .Case("AArch64", ELF::EM_AARCH64)
+                .Case("Unknown", ELF::EM_NONE)
+                .Default(ELF::EM_NONE);
+
+    // Returning empty StringRef indicates successful parse.
+    return StringRef();
+  }
+
+  // Don't place quotation marks around architecture value.
+  static QuotingType mustQuote(StringRef) { return QuotingType::None; }
+};
+
+/// YAML traits for TbeVersion.
+template <> struct ScalarTraits<VersionTuple> {
+  static void output(const VersionTuple &Value, void *,
+                     llvm::raw_ostream &Out) {
+    Out << Value.getAsString();
+  }
+
+  static StringRef input(StringRef Scalar, void *, VersionTuple &Value) {
+    if (Value.tryParse(Scalar))
+      return StringRef("Can't parse version: invalid version format.");
+
+    if (Value > TBEVersionCurrent)
+      return StringRef("Unsupported TBE version.");
+
+    // Returning empty StringRef indicates successful parse.
+    return StringRef();
+  }
+
+  // Don't place quotation marks around version value.
+  static QuotingType mustQuote(StringRef) { return QuotingType::None; }
+};
+
+/// YAML traits for ELFSymbol.
+template <> struct MappingTraits<ELFSymbol> {
+  static void mapping(IO &IO, ELFSymbol &Symbol) {
+    IO.mapRequired("Type", Symbol.Type);
+    // The need for symbol size depends on the symbol type.
+    if (Symbol.Type == ELFSymbolType::NoType) {
+      IO.mapOptional("Size", Symbol.Size, (uint64_t)0);
+    } else if (Symbol.Type == ELFSymbolType::Func) {
+      Symbol.Size = 0;
+    } else {
+      IO.mapRequired("Size", Symbol.Size);
+    }
+    IO.mapOptional("Undefined", Symbol.Undefined, false);
+    IO.mapOptional("Warning", Symbol.Warning);
+  }
+
+  // Compacts symbol information into a single line.
+  static const bool flow = true;
+};
+
+/// YAML traits for set of ELFSymbols.
+template <> struct CustomMappingTraits<std::set<ELFSymbol>> {
+  static void inputOne(IO &IO, StringRef Key, std::set<ELFSymbol> &Set) {
+    ELFSymbol Sym(Key.str());
+    IO.mapRequired(Key.str().c_str(), Sym);
+    Set.insert(Sym);
+  }
+
+  static void output(IO &IO, std::set<ELFSymbol> &Set) {
+    for (auto &Sym : Set)
+      IO.mapRequired(Sym.Name.c_str(), const_cast<ELFSymbol &>(Sym));
+  }
+};
+
+/// YAML traits for ELFStub objects.
+template <> struct MappingTraits<ELFStub> {
+  static void mapping(IO &IO, ELFStub &Stub) {
+    if (!IO.mapTag("!tapi-tbe", true))
+      IO.setError("Not a .tbe YAML file.");
+    IO.mapRequired("TbeVersion", Stub.TbeVersion);
+    IO.mapOptional("SoName", Stub.SoName);
+    IO.mapRequired("Arch", (ELFArchMapper &)Stub.Arch);
+    IO.mapOptional("NeededLibs", Stub.NeededLibs);
+    IO.mapRequired("Symbols", Stub.Symbols);
+  }
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+Expected<std::unique_ptr<ELFStub>> elfabi::readTBEFromBuffer(StringRef Buf) {
+  yaml::Input YamlIn(Buf);
+  std::unique_ptr<ELFStub> Stub(new ELFStub());
+  YamlIn >> *Stub;
+  if (std::error_code Err = YamlIn.error())
+    return createStringError(Err, "YAML failed reading as TBE");
+
+  return std::move(Stub);
+}
+
+Error elfabi::writeTBEToOutputStream(raw_ostream &OS, const ELFStub &Stub) {
+  yaml::Output YamlOut(OS, NULL, /*WrapColumn =*/0);
+
+  YamlOut << const_cast<ELFStub &>(Stub);
+  return Error::success();
+}
diff --git a/lib/TextAPI/LLVMBuild.txt b/lib/TextAPI/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..83733a72f506b93729748ddca4696be291deb881
--- /dev/null
+++ b/lib/TextAPI/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/TextAPI/LLVMBuild.txt ------------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = TextAPI
+parent = Libraries
+required_libraries = Support BinaryFormat
diff --git a/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7560060b7604de4e8d04f6431783b4feb96d7417..a9442255e34c1432818bbbc2b16bd932431f8faa 100644
--- a/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -59,6 +59,99 @@ public:
 };
 } // namespace
 
+/// Match a pattern for a bitwise rotate operation that partially guards
+/// against undefined behavior by branching around the rotation when the shift
+/// amount is 0.
+static bool foldGuardedRotateToFunnelShift(Instruction &I) {
+  if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
+    return false;
+
+  // As with the one-use checks below, this is not strictly necessary, but we
+  // are being cautious to avoid potential perf regressions on targets that
+  // do not actually have a rotate instruction (where the funnel shift would be
+  // expanded back into math/shift/logic ops).
+  if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
+    return false;
+
+  // Match V to funnel shift left/right and capture the source operand and
+  // shift amount in X and Y.
+  auto matchRotate = [](Value *V, Value *&X, Value *&Y) {
+    Value *L0, *L1, *R0, *R1;
+    unsigned Width = V->getType()->getScalarSizeInBits();
+    auto Sub = m_Sub(m_SpecificInt(Width), m_Value(R1));
+
+    // rotate_left(X, Y) == (X << Y) | (X >> (Width - Y))
+    auto RotL = m_OneUse(m_c_Or(m_Shl(m_Value(L0), m_Value(L1)),
+                                m_LShr(m_Value(R0), Sub)));
+    if (RotL.match(V) && L0 == R0 && L1 == R1) {
+      X = L0;
+      Y = L1;
+      return Intrinsic::fshl;
+    }
+
+    // rotate_right(X, Y) == (X >> Y) | (X << (Width - Y))
+    auto RotR = m_OneUse(m_c_Or(m_LShr(m_Value(L0), m_Value(L1)),
+                                m_Shl(m_Value(R0), Sub)));
+    if (RotR.match(V) && L0 == R0 && L1 == R1) {
+      X = L0;
+      Y = L1;
+      return Intrinsic::fshr;
+    }
+
+    return Intrinsic::not_intrinsic;
+  };
+
+  // One phi operand must be a rotate operation, and the other phi operand must
+  // be the source value of that rotate operation:
+  // phi [ rotate(RotSrc, RotAmt), RotBB ], [ RotSrc, GuardBB ]
+  PHINode &Phi = cast<PHINode>(I);
+  Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
+  Value *RotSrc, *RotAmt;
+  Intrinsic::ID IID = matchRotate(P0, RotSrc, RotAmt);
+  if (IID == Intrinsic::not_intrinsic || RotSrc != P1) {
+    IID = matchRotate(P1, RotSrc, RotAmt);
+    if (IID == Intrinsic::not_intrinsic || RotSrc != P0)
+      return false;
+    assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
+           "Pattern must match funnel shift left or right");
+  }
+
+  // The incoming block with our source operand must be the "guard" block.
+  // That must contain a cmp+branch to avoid the rotate when the shift amount
+  // is equal to 0. The other incoming block is the block with the rotate.
+  BasicBlock *GuardBB = Phi.getIncomingBlock(RotSrc == P1);
+  BasicBlock *RotBB = Phi.getIncomingBlock(RotSrc != P1);
+  Instruction *TermI = GuardBB->getTerminator();
+  BasicBlock *TrueBB, *FalseBB;
+  ICmpInst::Predicate Pred;
+  if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()),
+                         TrueBB, FalseBB)))
+    return false;
+
+  BasicBlock *PhiBB = Phi.getParent();
+  if (Pred != CmpInst::ICMP_EQ || TrueBB != PhiBB || FalseBB != RotBB)
+    return false;
+
+  // We matched a variation of this IR pattern:
+  // GuardBB:
+  //   %cmp = icmp eq i32 %RotAmt, 0
+  //   br i1 %cmp, label %PhiBB, label %RotBB
+  // RotBB:
+  //   %sub = sub i32 32, %RotAmt
+  //   %shr = lshr i32 %X, %sub
+  //   %shl = shl i32 %X, %RotAmt
+  //   %rot = or i32 %shr, %shl
+  //   br label %PhiBB
+  // PhiBB:
+  //   %cond = phi i32 [ %rot, %RotBB ], [ %X, %GuardBB ]
+  // -->
+  // llvm.fshl.i32(i32 %X, i32 %RotAmt)
+  IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
+  Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
+  Phi.replaceAllUsesWith(Builder.CreateCall(F, {RotSrc, RotSrc, RotAmt}));
+  return true;
+}
+
 /// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
 /// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
 /// of 'and' ops, then we also need to capture the fact that we saw an
@@ -174,8 +267,10 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
     // Also, we want to avoid matching partial patterns.
     // TODO: It would be more efficient if we removed dead instructions
     // iteratively in this loop rather than waiting until the end.
-    for (Instruction &I : make_range(BB.rbegin(), BB.rend()))
+    for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldAnyOrAllBitsSet(I);
+      MadeChange |= foldGuardedRotateToFunnelShift(I);
+    }
   }
 
   // We're done with transforms, so remove dead instructions.
diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp
index 2462ae4abd3d914c7800992b9be43491c2a10c04..9eeceb217ba8bc03e353b8f9971eb24e9681e41d 100644
--- a/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -538,43 +538,92 @@ static void handleNoSuspendCoroutine(CoroBeginInst *CoroBegin, Type *FrameTy) {
   CoroBegin->eraseFromParent();
 }
 
-// look for a very simple pattern
-//    coro.save
-//    no other calls
-//    resume or destroy call
-//    coro.suspend
-//
-// If there are other calls between coro.save and coro.suspend, they can
-// potentially resume or destroy the coroutine, so it is unsafe to eliminate a
-// suspend point.
-static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
-                                 CoroBeginInst *CoroBegin) {
-  auto *Save = Suspend->getCoroSave();
-  auto *BB = Suspend->getParent();
-  if (BB != Save->getParent())
-    return false;
+// SimplifySuspendPoint needs to check that there is no calls between
+// coro_save and coro_suspend, since any of the calls may potentially resume
+// the coroutine and if that is the case we cannot eliminate the suspend point.
+static bool hasCallsInBlockBetween(Instruction *From, Instruction *To) {
+  for (Instruction *I = From; I != To; I = I->getNextNode()) {
+    // Assume that no intrinsic can resume the coroutine.
+    if (isa<IntrinsicInst>(I))
+      continue;
 
-  CallSite SingleCallSite;
+    if (CallSite(I))
+      return true;
+  }
+  return false;
+}
 
-  // Check that we have only one CallSite.
-  for (Instruction *I = Save->getNextNode(); I != Suspend;
-       I = I->getNextNode()) {
-    if (isa<CoroFrameInst>(I))
-      continue;
-    if (isa<CoroSubFnInst>(I))
-      continue;
-    if (CallSite CS = CallSite(I)) {
-      if (SingleCallSite)
-        return false;
-      else
-        SingleCallSite = CS;
-    }
+static bool hasCallsInBlocksBetween(BasicBlock *SaveBB, BasicBlock *ResDesBB) {
+  SmallPtrSet<BasicBlock *, 8> Set;
+  SmallVector<BasicBlock *, 8> Worklist;
+
+  Set.insert(SaveBB);
+  Worklist.push_back(ResDesBB);
+
+  // Accumulate all blocks between SaveBB and ResDesBB. Because CoroSaveIntr
+  // returns a token consumed by suspend instruction, all blocks in between
+  // will have to eventually hit SaveBB when going backwards from ResDesBB.
+  while (!Worklist.empty()) {
+    auto *BB = Worklist.pop_back_val();
+    Set.insert(BB);
+    for (auto *Pred : predecessors(BB))
+      if (Set.count(Pred) == 0)
+        Worklist.push_back(Pred);
   }
-  auto *CallInstr = SingleCallSite.getInstruction();
-  if (!CallInstr)
+
+  // SaveBB and ResDesBB are checked separately in hasCallsBetween.
+  Set.erase(SaveBB);
+  Set.erase(ResDesBB);
+
+  for (auto *BB : Set)
+    if (hasCallsInBlockBetween(BB->getFirstNonPHI(), nullptr))
+      return true;
+
+  return false;
+}
+
+static bool hasCallsBetween(Instruction *Save, Instruction *ResumeOrDestroy) {
+  auto *SaveBB = Save->getParent();
+  auto *ResumeOrDestroyBB = ResumeOrDestroy->getParent();
+
+  if (SaveBB == ResumeOrDestroyBB)
+    return hasCallsInBlockBetween(Save->getNextNode(), ResumeOrDestroy);
+
+  // Any calls from Save to the end of the block?
+  if (hasCallsInBlockBetween(Save->getNextNode(), nullptr))
+    return true;
+
+  // Any calls from begging of the block up to ResumeOrDestroy?
+  if (hasCallsInBlockBetween(ResumeOrDestroyBB->getFirstNonPHI(),
+                             ResumeOrDestroy))
+    return true;
+
+  // Any calls in all of the blocks between SaveBB and ResumeOrDestroyBB?
+  if (hasCallsInBlocksBetween(SaveBB, ResumeOrDestroyBB))
+    return true;
+
+  return false;
+}
+
+// If a SuspendIntrin is preceded by Resume or Destroy, we can eliminate the
+// suspend point and replace it with nornal control flow.
+static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
+                                 CoroBeginInst *CoroBegin) {
+  Instruction *Prev = Suspend->getPrevNode();
+  if (!Prev) {
+    auto *Pred = Suspend->getParent()->getSinglePredecessor();
+    if (!Pred)
+      return false;
+    Prev = Pred->getTerminator();
+  }
+
+  CallSite CS{Prev};
+  if (!CS)
     return false;
 
-  auto *Callee = SingleCallSite.getCalledValue()->stripPointerCasts();
+  auto *CallInstr = CS.getInstruction();
+
+  auto *Callee = CS.getCalledValue()->stripPointerCasts();
 
   // See if the callsite is for resumption or destruction of the coroutine.
   auto *SubFn = dyn_cast<CoroSubFnInst>(Callee);
@@ -585,6 +634,13 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
   if (SubFn->getFrame() != CoroBegin)
     return false;
 
+  // See if the transformation is safe. Specifically, see if there are any
+  // calls in between Save and CallInstr. They can potenitally resume the
+  // coroutine rendering this optimization unsafe.
+  auto *Save = Suspend->getCoroSave();
+  if (hasCallsBetween(Save, CallInstr))
+    return false;
+
   // Replace llvm.coro.suspend with the value that results in resumption over
   // the resume or cleanup path.
   Suspend->replaceAllUsesWith(SubFn->getRawIndex());
@@ -592,8 +648,20 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
   Save->eraseFromParent();
 
   // No longer need a call to coro.resume or coro.destroy.
+  if (auto *Invoke = dyn_cast<InvokeInst>(CallInstr)) {
+    BranchInst::Create(Invoke->getNormalDest(), Invoke);
+  }
+
+  // Grab the CalledValue from CS before erasing the CallInstr.
+  auto *CalledValue = CS.getCalledValue();
   CallInstr->eraseFromParent();
 
+  // If no more users remove it. Usually it is a bitcast of SubFn.
+  if (CalledValue != SubFn && CalledValue->user_empty())
+    if (auto *I = dyn_cast<Instruction>(CalledValue))
+      I->eraseFromParent();
+
+  // Now we are good to remove SubFn.
   if (SubFn->user_empty())
     SubFn->eraseFromParent();
 
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index f2c2b55b1c5b82c528a0780c14eda7197427eeea..517a9c082a4cf6f4fe46c79a7be73ca77d4a0a75 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -213,7 +213,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
   FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
 
   // Create the new function body and insert it into the module.
-  Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
+  Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(),
+                                  F->getName());
   NF->copyAttributesFrom(F);
 
   // Patch the pointer to LLVM function in debug info descriptor.
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 0cf238b5d6065943eb7b76ea5943cccdb86934d3..cb30e8f46a54dfc92f38f955caccaabf288c8abf 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -24,7 +24,6 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
-#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -165,7 +164,7 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
   unsigned NumArgs = Params.size();
 
   // Create the new function body and insert it into the module...
-  Function *NF = Function::Create(NFTy, Fn.getLinkage());
+  Function *NF = Function::Create(NFTy, Fn.getLinkage(), Fn.getAddressSpace());
   NF->copyAttributesFrom(&Fn);
   NF->setComdat(Fn.getComdat());
   Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);
@@ -864,7 +863,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
     return false;
 
   // Create the new function body and insert it into the module...
-  Function *NF = Function::Create(NFTy, F->getLinkage());
+  Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace());
   NF->copyAttributesFrom(F);
   NF->setComdat(F->getComdat());
   NF->setAttributes(NewPAL);
@@ -954,16 +953,16 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
     ArgAttrVec.clear();
 
     Instruction *New = NewCS.getInstruction();
-    if (!Call->use_empty()) {
+    if (!Call->use_empty() || Call->isUsedByMetadata()) {
       if (New->getType() == Call->getType()) {
         // Return type not changed? Just replace users then.
         Call->replaceAllUsesWith(New);
         New->takeName(Call);
       } else if (New->getType()->isVoidTy()) {
-        // Our return value has uses, but they will get removed later on.
-        // Replace by null for now.
+        // If the return value is dead, replace any uses of it with undef
+        // (any non-debug value uses will get removed later on).
         if (!Call->getType()->isX86_MMXTy())
-          Call->replaceAllUsesWith(Constant::getNullValue(Call->getType()));
+          Call->replaceAllUsesWith(UndefValue::get(Call->getType()));
       } else {
         assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
                "Return type changed, but not into a void. The old return type"
@@ -1023,10 +1022,10 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
       I2->takeName(&*I);
       ++I2;
     } else {
-      // If this argument is dead, replace any uses of it with null constants
-      // (these are guaranteed to become unused later on).
+      // If this argument is dead, replace any uses of it with undef
+      // (any non-debug value uses will get removed later on).
       if (!I->getType()->isX86_MMXTy())
-        I->replaceAllUsesWith(Constant::getNullValue(I->getType()));
+        I->replaceAllUsesWith(UndefValue::get(I->getType()));
     }
 
   // If we change the return value of the function we must rewrite any return
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index d45a8832391095f3b5f9584fdf428cd2830a6a57..a744d7f2d2d9597202c32c98183d8b80ed845827 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -135,6 +135,7 @@ namespace {
           llvm::Value *Declaration;
           if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
             Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                           CurI->getAddressSpace(),
                                            CurI->getName(), &M);
 
           } else {
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index f01c6a4e99bf10be8c9d7eaa63be70a384d73fc4..3a04f7a00d413b65b8ca8af08a6e246cdbb719da 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -41,6 +41,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Type.h"
@@ -1307,13 +1308,14 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
   // If all of the calls in F are identifiable and are to norecurse functions, F
   // is norecurse. This check also detects self-recursion as F is not currently
   // marked norecurse, so any called from F to F will not be marked norecurse.
-  for (Instruction &I : instructions(*F))
-    if (auto CS = CallSite(&I)) {
-      Function *Callee = CS.getCalledFunction();
-      if (!Callee || Callee == F || !Callee->doesNotRecurse())
-        // Function calls a potentially recursive function.
-        return false;
-    }
+  for (auto &BB : *F)
+    for (auto &I : BB.instructionsWithoutDebug())
+      if (auto CS = CallSite(&I)) {
+        Function *Callee = CS.getCalledFunction();
+        if (!Callee || Callee == F || !Callee->doesNotRecurse())
+          // Function calls a potentially recursive function.
+          return false;
+      }
 
   // Every call was to a non-recursive function other than this function, and
   // we have no indirect recursion as the SCC size is one. This function cannot
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 31531beea5e78d76a95d856b0119d123d301a8d8..531a7c19e360075466d83a3dbd7d030ff432cb46 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -293,11 +293,21 @@ static void computeImportForReferencedGlobals(
 
     LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
 
+    // If this is a local variable, make sure we import the copy
+    // in the caller's module. The only time a local variable can
+    // share an entry in the index is if there is a local with the same name
+    // in another module that had the same source file name (in a different
+    // directory), where each was compiled in their own directory so there
+    // was not distinguishing path.
+    auto LocalNotInModule = [&](const GlobalValueSummary *RefSummary) -> bool {
+      return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
+             RefSummary->modulePath() != Summary.modulePath();
+    };
+
     for (auto &RefSummary : VI.getSummaryList())
-      if (RefSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind &&
-          !RefSummary->notEligibleToImport() &&
-          !GlobalValue::isInterposableLinkage(RefSummary->linkage()) &&
-          RefSummary->refs().empty()) {
+      if (isa<GlobalVarSummary>(RefSummary.get()) &&
+          canImportGlobalVar(RefSummary.get()) &&
+          !LocalNotInModule(RefSummary.get())) {
         auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
         // Only update stat if we haven't already imported this variable.
         if (ILI.second)
@@ -824,6 +834,25 @@ void llvm::computeDeadSymbols(
   NumLiveSymbols += LiveSymbols;
 }
 
+// Compute dead symbols and propagate constants in combined index.
+void llvm::computeDeadSymbolsWithConstProp(
+    ModuleSummaryIndex &Index,
+    const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+    function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
+    bool ImportEnabled) {
+  computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing);
+  if (ImportEnabled) {
+    Index.propagateConstants(GUIDPreservedSymbols);
+  } else {
+    // If import is disabled we should drop read-only attribute
+    // from all summaries to prevent internalization.
+    for (auto &P : Index)
+      for (auto &S : P.second.SummaryList)
+        if (auto *GVS = dyn_cast<GlobalVarSummary>(S.get()))
+          GVS->setReadOnly(false);
+  }
+}
+
 /// Compute the set of summaries needed for a ThinLTO backend compilation of
 /// \p ModulePath.
 void llvm::gatherImportedSummariesForModule(
@@ -882,7 +911,8 @@ bool llvm::convertToDeclaration(GlobalValue &GV) {
     if (GV.getValueType()->isFunctionTy())
       NewGV =
           Function::Create(cast<FunctionType>(GV.getValueType()),
-                           GlobalValue::ExternalLinkage, "", GV.getParent());
+                           GlobalValue::ExternalLinkage, GV.getAddressSpace(),
+                           "", GV.getParent());
     else
       NewGV =
           new GlobalVariable(*GV.getParent(), GV.getValueType(),
@@ -897,8 +927,8 @@ bool llvm::convertToDeclaration(GlobalValue &GV) {
   return true;
 }
 
-/// Fixup WeakForLinker linkages in \p TheModule based on summary analysis.
-void llvm::thinLTOResolveWeakForLinkerModule(
+/// Fixup prevailing symbol linkages in \p TheModule based on summary analysis.
+void llvm::thinLTOResolvePrevailingInModule(
     Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
   auto updateLinkage = [&](GlobalValue &GV) {
     // See if the global summary analysis computed a new resolved linkage.
@@ -915,13 +945,15 @@ void llvm::thinLTOResolveWeakForLinkerModule(
     // as we need access to the resolution vectors for each input file in
     // order to find which symbols have been redefined.
     // We may consider reorganizing this code and moving the linkage recording
-    // somewhere else, e.g. in thinLTOResolveWeakForLinkerInIndex.
+    // somewhere else, e.g. in thinLTOResolvePrevailingInIndex.
     if (NewLinkage == GlobalValue::WeakAnyLinkage) {
       GV.setLinkage(NewLinkage);
       return;
     }
 
-    if (!GlobalValue::isWeakForLinker(GV.getLinkage()))
+    if (GlobalValue::isLocalLinkage(GV.getLinkage()) ||
+        // In case it was dead and already converted to declaration.
+        GV.isDeclaration())
       return;
     // Check for a non-prevailing def that has interposable linkage
     // (e.g. non-odr weak or linkonce). In that case we can't simply
@@ -932,7 +964,7 @@ void llvm::thinLTOResolveWeakForLinkerModule(
         GlobalValue::isInterposableLinkage(GV.getLinkage())) {
       if (!convertToDeclaration(GV))
         // FIXME: Change this to collect replaced GVs and later erase
-        // them from the parent module once thinLTOResolveWeakForLinkerGUID is
+        // them from the parent module once thinLTOResolvePrevailingGUID is
         // changed to enable this for aliases.
         llvm_unreachable("Expected GV to be converted");
     } else {
@@ -1018,6 +1050,18 @@ static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
   return NewFn;
 }
 
+// Internalize values that we marked with specific attribute
+// in processGlobalForThinLTO.
+static void internalizeImmutableGVs(Module &M) {
+  for (auto &GV : M.globals())
+    // Skip GVs which have been converted to declarations
+    // by dropDeadSymbols.
+    if (!GV.isDeclaration() && GV.hasAttribute("thinlto-internalize")) {
+      GV.setLinkage(GlobalValue::InternalLinkage);
+      GV.setVisibility(GlobalValue::DefaultVisibility);
+    }
+}
+
 // Automatically import functions in Module \p DestModule based on the summaries
 // index.
 Expected<bool> FunctionImporter::importFunctions(
@@ -1141,6 +1185,8 @@ Expected<bool> FunctionImporter::importFunctions(
     NumImportedModules++;
   }
 
+  internalizeImmutableGVs(DestModule);
+
   NumImportedFunctions += (ImportedCount - ImportedGVCount);
   NumImportedGlobalVars += ImportedGVCount;
 
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index c77ea04102c7a69ac5e96a4c78deece37040d45c..34de87433367c8c576dde9e853efebc4ca1b1b77 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/IPO.h"
@@ -75,13 +76,17 @@ ModulePass *llvm::createGlobalDCEPass() {
   return new GlobalDCELegacyPass();
 }
 
-/// Returns true if F contains only a single "ret" instruction.
+/// Returns true if F is effectively empty.
 static bool isEmptyFunction(Function *F) {
   BasicBlock &Entry = F->getEntryBlock();
-  if (Entry.size() != 1 || !isa<ReturnInst>(Entry.front()))
-    return false;
-  ReturnInst &RI = cast<ReturnInst>(Entry.front());
-  return RI.getReturnValue() == nullptr;
+  for (auto &I : Entry) {
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+    if (auto *RI = dyn_cast<ReturnInst>(&I))
+      return !RI->getReturnValue();
+    break;
+  }
+  return false;
 }
 
 /// Compute the set of GlobalValue that depends from V.
diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index 621ac7dc8ab80ed1524e2d433ace813bc4c8bbd9..5d989a40f65f2058485aac8d98b10ed3ab34cc8f 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -13,6 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -25,6 +26,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
@@ -97,49 +99,43 @@ bool blockEndsInUnreachable(const BasicBlock &BB) {
   return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
 }
 
-static bool exceptionHandlingFunctions(const CallInst *CI) {
-  auto F = CI->getCalledFunction();
-  if (!F)
-    return false;
-  auto FName = F->getName();
-  return FName == "__cxa_begin_catch" ||
-         FName == "__cxa_free_exception" ||
-         FName == "__cxa_allocate_exception" ||
-         FName == "__cxa_begin_catch" ||
-         FName == "__cxa_end_catch";
-}
-
-static bool unlikelyExecuted(const BasicBlock &BB) {
-  if (blockEndsInUnreachable(BB))
-    return true;
+bool unlikelyExecuted(BasicBlock &BB) {
   // Exception handling blocks are unlikely executed.
   if (BB.isEHPad())
     return true;
-  for (const Instruction &I : BB)
-    if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
-      // The block is cold if it calls functions tagged as cold or noreturn.
-      if (CI->hasFnAttr(Attribute::Cold) ||
-          CI->hasFnAttr(Attribute::NoReturn) ||
-          exceptionHandlingFunctions(CI))
+
+  // The block is cold if it calls/invokes a cold function.
+  for (Instruction &I : BB)
+    if (auto CS = CallSite(&I))
+      if (CS.hasFnAttr(Attribute::Cold))
         return true;
 
-      // Assume that inline assembly is hot code.
-      if (isa<InlineAsm>(CI->getCalledValue()))
+  // The block is cold if it has an unreachable terminator, unless it's
+  // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
+  if (blockEndsInUnreachable(BB)) {
+    if (auto *CI =
+            dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
+      if (CI->hasFnAttr(Attribute::NoReturn))
         return false;
-    }
+    return true;
+  }
+
   return false;
 }
 
 /// Check whether it's safe to outline \p BB.
 static bool mayExtractBlock(const BasicBlock &BB) {
-  return !BB.hasAddressTaken();
+  return !BB.hasAddressTaken() && !BB.isEHPad();
 }
 
-/// Check whether \p BB is profitable to outline (i.e. its code size cost meets
-/// the threshold set in \p MinOutliningThreshold).
-static bool isProfitableToOutline(const BasicBlock &BB,
+/// Check whether \p Region is profitable to outline.
+static bool isProfitableToOutline(const BlockSequence &Region,
                                   TargetTransformInfo &TTI) {
+  if (Region.size() > 1)
+    return true;
+
   int Cost = 0;
+  const BasicBlock &BB = *Region[0];
   for (const Instruction &I : BB) {
     if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator())
       continue;
@@ -152,151 +148,16 @@ static bool isProfitableToOutline(const BasicBlock &BB,
   return false;
 }
 
-/// Identify the maximal region of cold blocks which includes \p SinkBB.
-///
-/// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all
-/// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which
-/// cannot be outlined.
-///
-/// Return an empty sequence if the cold region is too small to outline, or if
-/// the cold region has no warm predecessors.
-static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB,
-                                           TargetTransformInfo &TTI,
-                                           DominatorTree &DT,
-                                           PostDomTree &PDT) {
-  // The maximal cold region.
-  BlockSequence ColdRegion = {};
-
-  // The ancestor farthest-away from SinkBB, and also post-dominated by it.
-  BasicBlock *MaxAncestor = &SinkBB;
-  unsigned MaxAncestorHeight = 0;
-
-  // Visit SinkBB's ancestors using inverse DFS.
-  auto PredIt = ++idf_begin(&SinkBB);
-  auto PredEnd = idf_end(&SinkBB);
-  while (PredIt != PredEnd) {
-    BasicBlock &PredBB = **PredIt;
-    bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
-
-    // If SinkBB does not post-dominate a predecessor, do not mark the
-    // predecessor (or any of its predecessors) cold.
-    if (!SinkPostDom || !mayExtractBlock(PredBB)) {
-      PredIt.skipChildren();
-      continue;
-    }
-
-    // Keep track of the post-dominated ancestor farthest away from the sink.
-    unsigned AncestorHeight = PredIt.getPathLength();
-    if (AncestorHeight > MaxAncestorHeight) {
-      MaxAncestor = &PredBB;
-      MaxAncestorHeight = AncestorHeight;
-    }
-
-    ColdRegion.push_back(&PredBB);
-    ++PredIt;
-  }
-
-  // CodeExtractor requires that all blocks to be extracted must be dominated
-  // by the first block to be extracted.
-  //
-  // To avoid spurious or repeated outlining, require that the max ancestor
-  // has a predecessor. By construction this predecessor is not in the cold
-  // region, i.e. its existence implies we don't outline the whole function.
-  //
-  // TODO: If MaxAncestor has no predecessors, we may be able to outline the
-  // second largest cold region that has a predecessor.
-  if (pred_empty(MaxAncestor) ||
-      MaxAncestor->getSinglePredecessor() == MaxAncestor)
-    return {};
-
-  // Filter out predecessors not dominated by the max ancestor.
-  //
-  // TODO: Blocks not dominated by the max ancestor could be extracted as
-  // other cold regions. Marking outlined calls as noreturn when appropriate
-  // and outlining more than once per function could achieve most of the win.
-  auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) {
-    return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB);
-  });
-  ColdRegion.erase(EraseIt, ColdRegion.end());
-
-  // Add SinkBB to the cold region.
-  ColdRegion.push_back(&SinkBB);
-
-  // Ensure that the first extracted block is the max ancestor.
-  if (ColdRegion[0] != MaxAncestor) {
-    auto AncestorIt = find(ColdRegion, MaxAncestor);
-    *AncestorIt = ColdRegion[0];
-    ColdRegion[0] = MaxAncestor;
-  }
-
-  // Find all successors of SinkBB dominated by SinkBB using DFS.
-  auto SuccIt = ++df_begin(&SinkBB);
-  auto SuccEnd = df_end(&SinkBB);
-  while (SuccIt != SuccEnd) {
-    BasicBlock &SuccBB = **SuccIt;
-    bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
-
-    // If SinkBB does not dominate a successor, do not mark the successor (or
-    // any of its successors) cold.
-    if (!SinkDom || !mayExtractBlock(SuccBB)) {
-      SuccIt.skipChildren();
-      continue;
-    }
-
-    ColdRegion.push_back(&SuccBB);
-    ++SuccIt;
-  }
-
-  if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI))
-    return {};
-
-  return ColdRegion;
-}
-
-/// Get the largest cold region in \p F.
-static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
-                                          BlockFrequencyInfo *BFI,
-                                          TargetTransformInfo &TTI,
-                                          DominatorTree &DT, PostDomTree &PDT) {
-  // Keep track of the largest cold region.
-  BlockSequence LargestColdRegion = {};
-
-  for (BasicBlock &BB : F) {
-    // Identify cold blocks.
-    if (!mayExtractBlock(BB))
-      continue;
-    bool Cold =
-        PSI.isColdBB(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB));
-    if (!Cold)
-      continue;
-
-    LLVM_DEBUG({
-      dbgs() << "Found cold block:\n";
-      BB.dump();
-    });
-
-    // Find a maximal cold region we can outline.
-    BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT);
-    if (ColdRegion.empty()) {
-      LLVM_DEBUG(dbgs() << "  Skipping (block not profitable to extract)\n");
-      continue;
-    }
-
-    ++NumColdRegionsFound;
-
-    LLVM_DEBUG({
-      llvm::dbgs() << "Identified cold region with " << ColdRegion.size()
-                   << " blocks:\n";
-      for (BasicBlock *BB : ColdRegion)
-        BB->dump();
-    });
-
-    // TODO: Outline more than one region.
-    if (ColdRegion.size() > LargestColdRegion.size())
-      LargestColdRegion = std::move(ColdRegion);
+/// Mark \p F cold. Return true if it's changed.
+static bool markEntireFunctionCold(Function &F) {
+  assert(!F.hasFnAttribute(Attribute::OptimizeNone) && "Can't mark this cold");
+  bool Changed = false;
+  if (!F.hasFnAttribute(Attribute::MinSize)) {
+    F.addFnAttr(Attribute::MinSize);
+    Changed = true;
   }
-
-  return LargestColdRegion;
+  // TODO: Move this function into a cold section.
+  return Changed;
 }
 
 class HotColdSplitting {
@@ -310,6 +171,10 @@ public:
 
 private:
   bool shouldOutlineFrom(const Function &F) const;
+  bool outlineColdRegions(Function &F, ProfileSummaryInfo &PSI,
+                          BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
+                          DominatorTree &DT, PostDomTree &PDT,
+                          OptimizationRemarkEmitter &ORE);
   Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT,
                               BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
                               OptimizationRemarkEmitter &ORE, unsigned Count);
@@ -375,8 +240,6 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
                                               OptimizationRemarkEmitter &ORE,
                                               unsigned Count) {
   assert(!Region.empty());
-  LLVM_DEBUG(for (auto *BB : Region)
-          llvm::dbgs() << "\nExtracting: " << *BB;);
 
   // TODO: Pass BFI and BPI to update profile information.
   CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
@@ -408,9 +271,7 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
 
     // Try to make the outlined code as small as possible on the assumption
     // that it's cold.
-    assert(!OutF->hasFnAttribute(Attribute::OptimizeNone) &&
-           "An outlined function should never be marked optnone");
-    OutF->addFnAttr(Attribute::MinSize);
+    markEntireFunctionCold(*OutF);
 
     LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
     ORE.emit([&]() {
@@ -431,32 +292,285 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
   return nullptr;
 }
 
+/// A pair of (basic block, score).
+using BlockTy = std::pair<BasicBlock *, unsigned>;
+
+/// A maximal outlining region. This contains all blocks post-dominated by a
+/// sink block, the sink block itself, and all blocks dominated by the sink.
+class OutliningRegion {
+  /// A list of (block, score) pairs. A block's score is non-zero iff it's a
+  /// viable sub-region entry point. Blocks with higher scores are better entry
+  /// points (i.e. they are more distant ancestors of the sink block).
+  SmallVector<BlockTy, 0> Blocks = {};
+
+  /// The suggested entry point into the region. If the region has multiple
+  /// entry points, all blocks within the region may not be reachable from this
+  /// entry point.
+  BasicBlock *SuggestedEntryPoint = nullptr;
+
+  /// Whether the entire function is cold.
+  bool EntireFunctionCold = false;
+
+  /// Whether or not \p BB could be the entry point of an extracted region.
+  static bool isViableEntryPoint(BasicBlock &BB) { return !BB.isEHPad(); }
+
+  /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
+  static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
+    return isViableEntryPoint(BB) ? Score : 0;
+  }
+
+  /// These scores should be lower than the score for predecessor blocks,
+  /// because regions starting at predecessor blocks are typically larger.
+  static constexpr unsigned ScoreForSuccBlock = 1;
+  static constexpr unsigned ScoreForSinkBlock = 1;
+
+  OutliningRegion(const OutliningRegion &) = delete;
+  OutliningRegion &operator=(const OutliningRegion &) = delete;
+
+public:
+  OutliningRegion() = default;
+  OutliningRegion(OutliningRegion &&) = default;
+  OutliningRegion &operator=(OutliningRegion &&) = default;
+
+  static OutliningRegion create(BasicBlock &SinkBB, const DominatorTree &DT,
+                                const PostDomTree &PDT) {
+    OutliningRegion ColdRegion;
+
+    SmallPtrSet<BasicBlock *, 4> RegionBlocks;
+
+    auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
+      RegionBlocks.insert(BB);
+      ColdRegion.Blocks.emplace_back(BB, Score);
+      assert(RegionBlocks.size() == ColdRegion.Blocks.size() && "Duplicate BB");
+    };
+
+    // The ancestor farthest-away from SinkBB, and also post-dominated by it.
+    unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
+    ColdRegion.SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
+    unsigned BestScore = SinkScore;
+
+    // Visit SinkBB's ancestors using inverse DFS.
+    auto PredIt = ++idf_begin(&SinkBB);
+    auto PredEnd = idf_end(&SinkBB);
+    while (PredIt != PredEnd) {
+      BasicBlock &PredBB = **PredIt;
+      bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
+
+      // If the predecessor is cold and has no predecessors, the entire
+      // function must be cold.
+      if (SinkPostDom && pred_empty(&PredBB)) {
+        ColdRegion.EntireFunctionCold = true;
+        return ColdRegion;
+      }
+
+      // If SinkBB does not post-dominate a predecessor, do not mark the
+      // predecessor (or any of its predecessors) cold.
+      if (!SinkPostDom || !mayExtractBlock(PredBB)) {
+        PredIt.skipChildren();
+        continue;
+      }
+
+      // Keep track of the post-dominated ancestor farthest away from the sink.
+      // The path length is always >= 2, ensuring that predecessor blocks are
+      // considered as entry points before the sink block.
+      unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
+      if (PredScore > BestScore) {
+        ColdRegion.SuggestedEntryPoint = &PredBB;
+        BestScore = PredScore;
+      }
+
+      addBlockToRegion(&PredBB, PredScore);
+      ++PredIt;
+    }
+
+    // Add SinkBB to the cold region. It's considered as an entry point before
+    // any sink-successor blocks.
+    addBlockToRegion(&SinkBB, SinkScore);
+
+    // Find all successors of SinkBB dominated by SinkBB using DFS.
+    auto SuccIt = ++df_begin(&SinkBB);
+    auto SuccEnd = df_end(&SinkBB);
+    while (SuccIt != SuccEnd) {
+      BasicBlock &SuccBB = **SuccIt;
+      bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
+
+      // Don't allow the backwards & forwards DFSes to mark the same block.
+      bool DuplicateBlock = RegionBlocks.count(&SuccBB);
+
+      // If SinkBB does not dominate a successor, do not mark the successor (or
+      // any of its successors) cold.
+      if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
+        SuccIt.skipChildren();
+        continue;
+      }
+
+      unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
+      if (SuccScore > BestScore) {
+        ColdRegion.SuggestedEntryPoint = &SuccBB;
+        BestScore = SuccScore;
+      }
+
+      addBlockToRegion(&SuccBB, SuccScore);
+      ++SuccIt;
+    }
+
+    return ColdRegion;
+  }
+
+  /// Whether this region has nothing to extract.
+  bool empty() const { return !SuggestedEntryPoint; }
+
+  /// The blocks in this region.
+  ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
+
+  /// Whether the entire function containing this region is cold.
+  bool isEntireFunctionCold() const { return EntireFunctionCold; }
+
+  /// Remove a sub-region from this region and return it as a block sequence.
+  BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
+    assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
+
+    // Remove blocks dominated by the suggested entry point from this region.
+    // During the removal, identify the next best entry point into the region.
+    // Ensure that the first extracted block is the suggested entry point.
+    BlockSequence SubRegion = {SuggestedEntryPoint};
+    BasicBlock *NextEntryPoint = nullptr;
+    unsigned NextScore = 0;
+    auto RegionEndIt = Blocks.end();
+    auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
+      BasicBlock *BB = Block.first;
+      unsigned Score = Block.second;
+      bool InSubRegion =
+          BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
+      if (!InSubRegion && Score > NextScore) {
+        NextEntryPoint = BB;
+        NextScore = Score;
+      }
+      if (InSubRegion && BB != SuggestedEntryPoint)
+        SubRegion.push_back(BB);
+      return InSubRegion;
+    });
+    Blocks.erase(RegionStartIt, RegionEndIt);
+
+    // Update the suggested entry point.
+    SuggestedEntryPoint = NextEntryPoint;
+
+    return SubRegion;
+  }
+};
+
+bool HotColdSplitting::outlineColdRegions(Function &F, ProfileSummaryInfo &PSI,
+                                          BlockFrequencyInfo *BFI,
+                                          TargetTransformInfo &TTI,
+                                          DominatorTree &DT, PostDomTree &PDT,
+                                          OptimizationRemarkEmitter &ORE) {
+  bool Changed = false;
+
+  // The set of cold blocks.
+  SmallPtrSet<BasicBlock *, 4> ColdBlocks;
+
+  // The worklist of non-intersecting regions left to outline.
+  SmallVector<OutliningRegion, 2> OutliningWorklist;
+
+  // Set up an RPO traversal. Experimentally, this performs better (outlines
+  // more) than a PO traversal, because we prevent region overlap by keeping
+  // the first region to contain a block.
+  ReversePostOrderTraversal<Function *> RPOT(&F);
+
+  // Find all cold regions.
+  for (BasicBlock *BB : RPOT) {
+    // Skip blocks which can't be outlined.
+    if (!mayExtractBlock(*BB))
+      continue;
+
+    // This block is already part of some outlining region.
+    if (ColdBlocks.count(BB))
+      continue;
+
+    bool Cold = PSI.isColdBlock(BB, BFI) ||
+                (EnableStaticAnalyis && unlikelyExecuted(*BB));
+    if (!Cold)
+      continue;
+
+    LLVM_DEBUG({
+      dbgs() << "Found a cold block:\n";
+      BB->dump();
+    });
+
+    auto Region = OutliningRegion::create(*BB, DT, PDT);
+    if (Region.empty())
+      continue;
+
+    if (Region.isEntireFunctionCold()) {
+      LLVM_DEBUG(dbgs() << "Entire function is cold\n");
+      return markEntireFunctionCold(F);
+    }
+
+    // If this outlining region intersects with another, drop the new region.
+    //
+    // TODO: It's theoretically possible to outline more by only keeping the
+    // largest region which contains a block, but the extra bookkeeping to do
+    // this is tricky/expensive.
+    bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
+      return !ColdBlocks.insert(Block.first).second;
+    });
+    if (RegionsOverlap)
+      continue;
+
+    OutliningWorklist.emplace_back(std::move(Region));
+    ++NumColdRegionsFound;
+  }
+
+  // Outline single-entry cold regions, splitting up larger regions as needed.
+  unsigned OutlinedFunctionID = 1;
+  while (!OutliningWorklist.empty()) {
+    OutliningRegion Region = OutliningWorklist.pop_back_val();
+    assert(!Region.empty() && "Empty outlining region in worklist");
+    do {
+      BlockSequence SubRegion = Region.takeSingleEntrySubRegion(DT);
+      if (!isProfitableToOutline(SubRegion, TTI)) {
+        LLVM_DEBUG({
+          dbgs() << "Skipping outlining; not profitable to outline\n";
+          SubRegion[0]->dump();
+        });
+        continue;
+      }
+
+      LLVM_DEBUG({
+        dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
+        for (BasicBlock *BB : SubRegion)
+          BB->dump();
+      });
+
+      Function *Outlined =
+          extractColdRegion(SubRegion, DT, BFI, TTI, ORE, OutlinedFunctionID);
+      if (Outlined) {
+        ++OutlinedFunctionID;
+        OutlinedFunctions.insert(Outlined);
+        Changed = true;
+      }
+    } while (!Region.empty());
+  }
+
+  return Changed;
+}
+
 bool HotColdSplitting::run(Module &M) {
   bool Changed = false;
+  OutlinedFunctions.clear();
   for (auto &F : M) {
     if (!shouldOutlineFrom(F)) {
-      LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
       continue;
     }
-
     LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
     DominatorTree DT(F);
     PostDomTree PDT(F);
     PDT.recalculate(F);
     BlockFrequencyInfo *BFI = GetBFI(F);
     TargetTransformInfo &TTI = GetTTI(F);
-
-    BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT);
-    if (ColdRegion.empty())
-      continue;
-
     OptimizationRemarkEmitter &ORE = (*GetORE)(F);
-    Function *Outlined =
-        extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1);
-    if (Outlined) {
-      OutlinedFunctions.insert(Outlined);
-      Changed = true;
-    }
+    Changed |= outlineColdRegions(F, *PSI, BFI, TTI, DT, PDT, ORE);
   }
   return Changed;
 }
@@ -465,7 +579,7 @@ bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
   if (skipModule(M))
     return false;
   ProfileSummaryInfo *PSI =
-      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   auto GTTI = [this](Function &F) -> TargetTransformInfo & {
     return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   };
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 66aea45323fea9a817be7946e0928dcfbe8d8c01..44001b5779cc354dbc89bfc4eea60d9bc5c666a5 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -746,7 +746,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
 bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
   CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
   ACT = &getAnalysis<AssumptionCacheTracker>();
-  PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
     return ACT->getAssumptionCache(F);
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 2d29e93b1ca1de6f15ed015434f7fe2ae3feadb6..e4dcd4d4dd7758cf690d3ca182a214db483f8b6a 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -989,6 +989,7 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
     if (F->isDSOLocal()) {
       Function *RealF = Function::Create(F->getFunctionType(),
                                          GlobalValue::ExternalLinkage,
+                                         F->getAddressSpace(),
                                          Name + ".cfi", &M);
       RealF->setVisibility(GlobalVariable::HiddenVisibility);
       replaceDirectCalls(F, RealF);
@@ -1000,13 +1001,13 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
   if (F->isDeclarationForLinker() && !isDefinition) {
     // Declaration of an external function.
     FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
-                             Name + ".cfi_jt", &M);
+                             F->getAddressSpace(), Name + ".cfi_jt", &M);
     FDecl->setVisibility(GlobalValue::HiddenVisibility);
   } else if (isDefinition) {
     F->setName(Name + ".cfi");
     F->setLinkage(GlobalValue::ExternalLinkage);
     FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
-                             Name, &M);
+                             F->getAddressSpace(), Name, &M);
     FDecl->setVisibility(Visibility);
     Visibility = GlobalValue::HiddenVisibility;
 
@@ -1016,7 +1017,8 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
     for (auto &U : F->uses()) {
       if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) {
         Function *AliasDecl = Function::Create(
-            F->getFunctionType(), GlobalValue::ExternalLinkage, "", &M);
+            F->getFunctionType(), GlobalValue::ExternalLinkage,
+            F->getAddressSpace(), "", &M);
         AliasDecl->takeName(A);
         A->replaceAllUsesWith(AliasDecl);
         ToErase.push_back(A);
@@ -1191,7 +1193,9 @@ void LowerTypeTestsModule::moveInitializerToModuleConstructor(
     WeakInitializerFn = Function::Create(
         FunctionType::get(Type::getVoidTy(M.getContext()),
                           /* IsVarArg */ false),
-        GlobalValue::InternalLinkage, "__cfi_global_var_init", &M);
+        GlobalValue::InternalLinkage,
+        M.getDataLayout().getProgramAddressSpace(),
+        "__cfi_global_var_init", &M);
     BasicBlock *BB =
         BasicBlock::Create(M.getContext(), "entry", WeakInitializerFn);
     ReturnInst::Create(M.getContext(), BB);
@@ -1234,7 +1238,8 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
   // placeholder first.
   Function *PlaceholderFn =
       Function::Create(cast<FunctionType>(F->getValueType()),
-                       GlobalValue::ExternalWeakLinkage, "", &M);
+                       GlobalValue::ExternalWeakLinkage,
+                       F->getAddressSpace(), "", &M);
   replaceCfiUses(F, PlaceholderFn, IsDefinition);
 
   Constant *Target = ConstantExpr::getSelect(
@@ -1424,7 +1429,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
   Function *JumpTableFn =
       Function::Create(FunctionType::get(Type::getVoidTy(M.getContext()),
                                          /* IsVarArg */ false),
-                       GlobalValue::PrivateLinkage, ".cfi.jumptable", &M);
+                       GlobalValue::PrivateLinkage,
+                       M.getDataLayout().getProgramAddressSpace(),
+                       ".cfi.jumptable", &M);
   ArrayType *JumpTableType =
       ArrayType::get(getJumpTableEntryType(), Functions.size());
   auto JumpTable =
@@ -1813,7 +1820,8 @@ bool LowerTypeTestsModule::lower() {
         if (!F)
           F = Function::Create(
               FunctionType::get(Type::getVoidTy(M.getContext()), false),
-              GlobalVariable::ExternalLinkage, FunctionName, &M);
+              GlobalVariable::ExternalLinkage,
+              M.getDataLayout().getProgramAddressSpace(), FunctionName, &M);
 
         // If the function is available_externally, remove its definition so
         // that it is handled the same way as a declaration. Later we will try
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 4c51cd131a1015d654e1de9f8b5232b390d097af..550750d17438cfb408f0c5d1ccb7b123ab9bfb89 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -136,6 +136,7 @@ using namespace llvm;
 
 STATISTIC(NumFunctionsMerged, "Number of functions merged");
 STATISTIC(NumThunksWritten, "Number of thunks generated");
+STATISTIC(NumAliasesWritten, "Number of aliases generated");
 STATISTIC(NumDoubleWeak, "Number of new functions created");
 
 static cl::opt<unsigned> NumFunctionsForSanityCheck(
@@ -165,6 +166,11 @@ static cl::opt<bool>
                       cl::desc("Preserve debug info in thunk when mergefunc "
                                "transformations are made."));
 
+static cl::opt<bool>
+    MergeFunctionsAliases("mergefunc-use-aliases", cl::Hidden,
+                          cl::init(false),
+                          cl::desc("Allow mergefunc to create aliases"));
+
 namespace {
 
 class FunctionNode {
@@ -272,6 +278,13 @@ private:
   /// delete G.
   void writeThunk(Function *F, Function *G);
 
+  // Replace G with an alias to F (deleting function G)
+  void writeAlias(Function *F, Function *G);
+
+  // Replace G with an alias to F if possible, or a thunk to F if
+  // profitable. Returns false if neither is the case.
+  bool writeThunkOrAlias(Function *F, Function *G);
+
   /// Replace function F with function G in the function tree.
   void replaceFunctionInTree(const FunctionNode &FN, Function *G);
 
@@ -680,8 +693,8 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
     GEntryBlock->getTerminator()->eraseFromParent();
     BB = GEntryBlock;
   } else {
-    NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "",
-                            G->getParent());
+    NewG = Function::Create(G->getFunctionType(), G->getLinkage(),
+                            G->getAddressSpace(), "", G->getParent());
     BB = BasicBlock::Create(F->getContext(), "", NewG);
   }
 
@@ -735,27 +748,76 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
   ++NumThunksWritten;
 }
 
+// Whether this function may be replaced by an alias
+static bool canCreateAliasFor(Function *F) {
+  if (!MergeFunctionsAliases || !F->hasGlobalUnnamedAddr())
+    return false;
+
+  // We should only see linkages supported by aliases here
+  assert(F->hasLocalLinkage() || F->hasExternalLinkage()
+      || F->hasWeakLinkage() || F->hasLinkOnceLinkage());
+  return true;
+}
+
+// Replace G with an alias to F (deleting function G)
+void MergeFunctions::writeAlias(Function *F, Function *G) {
+  Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+  PointerType *PtrType = G->getType();
+  auto *GA = GlobalAlias::create(
+      PtrType->getElementType(), PtrType->getAddressSpace(),
+      G->getLinkage(), "", BitcastF, G->getParent());
+
+  F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+  GA->takeName(G);
+  GA->setVisibility(G->getVisibility());
+  GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+  removeUsers(G);
+  G->replaceAllUsesWith(GA);
+  G->eraseFromParent();
+
+  LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
+  ++NumAliasesWritten;
+}
+
+// Replace G with an alias to F if possible, or a thunk to F if
+// profitable. Returns false if neither is the case.
+bool MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
+  if (canCreateAliasFor(G)) {
+    writeAlias(F, G);
+    return true;
+  }
+  if (isThunkProfitable(F)) {
+    writeThunk(F, G);
+    return true;
+  }
+  return false;
+}
+
 // Merge two equivalent functions. Upon completion, Function G is deleted.
 void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
   if (F->isInterposable()) {
     assert(G->isInterposable());
 
-    if (!isThunkProfitable(F)) {
+    // Both writeThunkOrAlias() calls below must succeed, either because we can
+    // create aliases for G and NewF, or because a thunk for F is profitable.
+    // F here has the same signature as NewF below, so that's what we check.
+    if (!isThunkProfitable(F) && (!canCreateAliasFor(F) || !canCreateAliasFor(G))) {
       return;
     }
 
     // Make them both thunks to the same internal function.
-    Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "",
-                                   F->getParent());
-    H->copyAttributesFrom(F);
-    H->takeName(F);
+    Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
+                                      F->getAddressSpace(), "", F->getParent());
+    NewF->copyAttributesFrom(F);
+    NewF->takeName(F);
     removeUsers(F);
-    F->replaceAllUsesWith(H);
+    F->replaceAllUsesWith(NewF);
 
-    unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment());
+    unsigned MaxAlignment = std::max(G->getAlignment(), NewF->getAlignment());
 
-    writeThunk(F, G);
-    writeThunk(F, H);
+    writeThunkOrAlias(F, G);
+    writeThunkOrAlias(F, NewF);
 
     F->setAlignment(MaxAlignment);
     F->setLinkage(GlobalValue::PrivateLinkage);
@@ -789,12 +851,9 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
       return;
     }
 
-    if (!isThunkProfitable(F)) {
-      return;
+    if (writeThunkOrAlias(F, G)) {
+      ++NumFunctionsMerged;
     }
-
-    writeThunk(F, G);
-    ++NumFunctionsMerged;
   }
 }
 
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index bcb19af85b29b9bd501be3b53a3c5f98a61add49..917582a706b36c5bc294a7e7951dbd490b34e615 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -359,7 +359,7 @@ struct PartialInlinerLegacyPass : public ModulePass {
     TargetTransformInfoWrapperPass *TTIWP =
         &getAnalysis<TargetTransformInfoWrapperPass>();
     ProfileSummaryInfo *PSI =
-        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+        &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 
     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
         [&ACT](Function &F) -> AssumptionCache & {
@@ -403,7 +403,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
 
   auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
     BasicBlock *Dom = BlockList.front();
-    return BlockList.size() > 1 && pred_size(Dom) == 1;
+    return BlockList.size() > 1 && Dom->hasNPredecessors(1);
   };
 
   auto IsSingleExit =
@@ -468,7 +468,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
     // Only consider regions with predecessor blocks that are considered
     // not-cold (default: part of the top 99.99% of all block counters)
     // AND greater than our minimum block execution count (default: 100).
-    if (PSI->isColdBB(thisBB, BFI) ||
+    if (PSI->isColdBlock(thisBB, BFI) ||
         BBProfileCount(thisBB) < MinBlockCounterExecution)
       continue;
     for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
@@ -834,42 +834,41 @@ bool PartialInlinerImpl::shouldPartialInline(
 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
   int InlineCost = 0;
   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-    if (isa<DbgInfoIntrinsic>(I))
-      continue;
-
-    switch (I->getOpcode()) {
+  for (Instruction &I : BB->instructionsWithoutDebug()) {
+    // Skip free instructions.
+    switch (I.getOpcode()) {
     case Instruction::BitCast:
     case Instruction::PtrToInt:
     case Instruction::IntToPtr:
     case Instruction::Alloca:
+    case Instruction::PHI:
       continue;
     case Instruction::GetElementPtr:
-      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
+      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
         continue;
       break;
     default:
       break;
     }
 
-    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
+    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&I);
     if (IntrInst) {
       if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
           IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
         continue;
     }
 
-    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
       InlineCost += getCallsiteCost(CallSite(CI), DL);
       continue;
     }
 
-    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
       InlineCost += getCallsiteCost(CallSite(II), DL);
       continue;
     }
 
-    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
       continue;
     }
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 19ff2a21cd2186cb0532822b1e0e405c9dc0e256..60bd6def6cefea8923acc78d57988cbe5773b0f8 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -378,8 +378,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   if (EnableLoopInterchange)
     MPM.add(createLoopInterchangePass()); // Interchange loops
 
-  if (!DisableUnrollLoops)
-    MPM.add(createSimpleLoopUnrollPass(OptLevel));    // Unroll small loops
+  MPM.add(createSimpleLoopUnrollPass(OptLevel,
+                                     DisableUnrollLoops)); // Unroll small loops
   addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
   // This ends the loop pass pipelines.
 
@@ -637,7 +637,7 @@ void PassManagerBuilder::populateModulePassManager(
   // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
   MPM.add(createLoopDistributePass());
 
-  MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
+  MPM.add(createLoopVectorizePass(DisableUnrollLoops, !LoopVectorize));
 
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
@@ -682,16 +682,17 @@ void PassManagerBuilder::populateModulePassManager(
   addExtensionsToPM(EP_Peephole, MPM);
   addInstructionCombiningPass(MPM);
 
-  if (!DisableUnrollLoops) {
-    if (EnableUnrollAndJam) {
-      // Unroll and Jam. We do this before unroll but need to be in a separate
-      // loop pass manager in order for the outer loop to be processed by
-      // unroll and jam before the inner loop is unrolled.
-      MPM.add(createLoopUnrollAndJamPass(OptLevel));
-    }
+  if (EnableUnrollAndJam && !DisableUnrollLoops) {
+    // Unroll and Jam. We do this before unroll but need to be in a separate
+    // loop pass manager in order for the outer loop to be processed by
+    // unroll and jam before the inner loop is unrolled.
+    MPM.add(createLoopUnrollAndJamPass(OptLevel));
+  }
 
-    MPM.add(createLoopUnrollPass(OptLevel));    // Unroll small loops
+  MPM.add(createLoopUnrollPass(OptLevel,
+                               DisableUnrollLoops)); // Unroll small loops
 
+  if (!DisableUnrollLoops) {
     // LoopUnroll may generate some redundency to cleanup.
     addInstructionCombiningPass(MPM);
 
@@ -700,7 +701,9 @@ void PassManagerBuilder::populateModulePassManager(
     // outer loop. LICM pass can help to promote the runtime check out if the
     // checked value is loop invariant.
     MPM.add(createLICMPass());
- }
+  }
+
+  MPM.add(createWarnMissedTransformationsPass());
 
   // After vectorization and unrolling, assume intrinsics may tell us more
   // about pointer alignments.
@@ -747,6 +750,12 @@ void PassManagerBuilder::populateModulePassManager(
 }
 
 void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
+  // Load sample profile before running the LTO optimization pipeline.
+  if (!PGOSampleUse.empty()) {
+    PM.add(createPruneEHPass());
+    PM.add(createSampleProfileLoaderPass(PGOSampleUse));
+  }
+
   // Remove unused virtual tables to improve the quality of code generated by
   // whole-program devirtualization and bitset lowering.
   PM.add(createGlobalDCEPass());
@@ -864,12 +873,13 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   if (EnableLoopInterchange)
     PM.add(createLoopInterchangePass());
 
-  if (!DisableUnrollLoops)
-    PM.add(createSimpleLoopUnrollPass(OptLevel));   // Unroll small loops
-  PM.add(createLoopVectorizePass(true, LoopVectorize));
+  PM.add(createSimpleLoopUnrollPass(OptLevel,
+                                    DisableUnrollLoops)); // Unroll small loops
+  PM.add(createLoopVectorizePass(true, !LoopVectorize));
   // The vectorizer may have significantly shortened a loop body; unroll again.
-  if (!DisableUnrollLoops)
-    PM.add(createLoopUnrollPass(OptLevel));
+  PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops));
+
+  PM.add(createWarnMissedTransformationsPass());
 
   // Now that we've optimized loops (in particular loop induction variables),
   // we may have exposed more scalar opportunities. Run parts of the scalar
diff --git a/lib/Transforms/IPO/SCCP.cpp b/lib/Transforms/IPO/SCCP.cpp
index e2bb6f185c3bf5b76fb4e58ce3b14684a6881147..d2c34abfc13270f25dba1203155a375cc470832f 100644
--- a/lib/Transforms/IPO/SCCP.cpp
+++ b/lib/Transforms/IPO/SCCP.cpp
@@ -1,5 +1,6 @@
 #include "llvm/Transforms/IPO/SCCP.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar/SCCP.h"
@@ -10,16 +11,21 @@ PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
   const DataLayout &DL = M.getDataLayout();
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  auto getPredicateInfo =
-      [&FAM](Function &F) -> std::unique_ptr<PredicateInfo> {
-    return make_unique<PredicateInfo>(F,
-                                      FAM.getResult<DominatorTreeAnalysis>(F),
-                                      FAM.getResult<AssumptionAnalysis>(F));
+  auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn {
+    DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+    return {
+        make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)),
+        &DT, FAM.getCachedResult<PostDominatorTreeAnalysis>(F)};
   };
 
-  if (!runIPSCCP(M, DL, &TLI, getPredicateInfo))
+  if (!runIPSCCP(M, DL, &TLI, getAnalysis))
     return PreservedAnalyses::all();
-  return PreservedAnalyses::none();
+
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
+  PA.preserve<FunctionAnalysisManagerModuleProxy>();
+  return PA;
 }
 
 namespace {
@@ -44,14 +50,19 @@ public:
     const TargetLibraryInfo *TLI =
         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
-    auto getPredicateInfo =
-        [this](Function &F) -> std::unique_ptr<PredicateInfo> {
-      return make_unique<PredicateInfo>(
-          F, this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(),
-          this->getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+    auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn {
+      DominatorTree &DT =
+          this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+      return {
+          make_unique<PredicateInfo>(
+              F, DT,
+              this->getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+                  F)),
+          nullptr,  // We cannot preserve the DT or PDT with the legacy pass
+          nullptr}; // manager, so set them to nullptr.
     };
 
-    return runIPSCCP(M, DL, TLI, getPredicateInfo);
+    return runIPSCCP(M, DL, TLI, getAnalysis);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index a78e0d459c891b909d7f9766667f0f8c7cecb7d4..06a1ce89827f65265ecce14a60021fce6504415f 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -123,6 +123,12 @@ static cl::opt<bool> NoWarnSampleUnused(
     cl::desc("Use this option to turn off/on warnings about function with "
              "samples but without debug information to use those samples. "));
 
+static cl::opt<bool> ProfileSampleAccurate(
+    "profile-sample-accurate", cl::Hidden, cl::init(false),
+    cl::desc("If the sample profile is accurate, we will mark all un-sampled "
+             "callsite and function as having 0 samples. Otherwise, treat "
+             "un-sampled callsites and functions conservatively as unknown. "));
+
 namespace {
 
 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
@@ -1599,15 +1605,23 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
   ACT = &getAnalysis<AssumptionCacheTracker>();
   TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
   ProfileSummaryInfo *PSI =
-      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   return SampleLoader.runOnModule(M, nullptr, PSI);
 }
 
 bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
-  // Initialize the entry count to -1, which will be treated conservatively
-  // by getEntryCount as the same as unknown (None). If we have samples this
-  // will be overwritten in emitAnnotations.
-  F.setEntryCount(ProfileCount(-1, Function::PCT_Real));
+  // By default the entry count is initialized to -1, which will be treated
+  // conservatively by getEntryCount as the same as unknown (None). This is
+  // to avoid newly added code to be treated as cold. If we have samples
+  // this will be overwritten in emitAnnotations.
+  // If ProfileSampleAccurate is true or F has profile-sample-accurate
+  // attribute, initialize the entry count to 0 so callsites or functions
+  // unsampled will be treated as cold.
+  uint64_t initialEntryCount =
+      (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate"))
+          ? 0
+          : -1;
+  F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
   std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
   if (AM) {
     auto &FAM =
diff --git a/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
index 3c5ad37bced1997800b4e72a0070acd1984ba387..64837d4f5d6196a60c2a2625f35c15e9c7c10c26 100644
--- a/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
+++ b/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
@@ -46,7 +46,7 @@ using ProfileCount = Function::ProfileCount;
 #define DEBUG_TYPE "synthetic-counts-propagation"
 
 /// Initial synthetic count assigned to functions.
-static cl::opt<int>
+cl::opt<int>
     InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10),
                           cl::ZeroOrMore,
                           cl::desc("Initial value of synthetic entry count."));
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index bfab96a1ddb9536280e2e456837c7925c6598e54..a5382c4638d8baf7d4c71f8a8b3699d0adbc60b1 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -154,7 +154,8 @@ void simplifyExternals(Module &M) {
       continue;
 
     Function *NewF =
-        Function::Create(EmptyFT, GlobalValue::ExternalLinkage, "", &M);
+        Function::Create(EmptyFT, GlobalValue::ExternalLinkage,
+                         F.getAddressSpace(), "", &M);
     NewF->setVisibility(F.getVisibility());
     NewF->takeName(&F);
     F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType()));
diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp
index b8f68d4d15d4055c1fb607735fa4704be2ab6274..37905daee4cbd792f83b47ff93f1fb3946b8874a 100644
--- a/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -864,10 +864,13 @@ void DevirtModule::tryICallBranchFunnel(
   Function *JT;
   if (isa<MDString>(Slot.TypeID)) {
     JT = Function::Create(FT, Function::ExternalLinkage,
+                          M.getDataLayout().getProgramAddressSpace(),
                           getGlobalName(Slot, {}, "branch_funnel"), &M);
     JT->setVisibility(GlobalValue::HiddenVisibility);
   } else {
-    JT = Function::Create(FT, Function::InternalLinkage, "branch_funnel", &M);
+    JT = Function::Create(FT, Function::InternalLinkage,
+                          M.getDataLayout().getProgramAddressSpace(),
+                          "branch_funnel", &M);
   }
   JT->addAttribute(1, Attribute::Nest);
 
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 7e7a515bfc8de4869966455018cfdecc070db285..24a82ba9ae40049de5fe15b691ddaa2e9494ac7c 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -53,11 +53,11 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC) {
 /// operands into either a constant true or false, or a brand new ICmp
 /// instruction. The sign is passed in to determine which kind of predicate to
 /// use in the new icmp instruction.
-static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
+static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS,
                               InstCombiner::BuilderTy &Builder) {
   ICmpInst::Predicate NewPred;
-  if (Value *NewConstant = getICmpValue(Sign, Code, LHS, RHS, NewPred))
-    return NewConstant;
+  if (Constant *TorF = getPredForICmpCode(Code, Sign, LHS->getType(), NewPred))
+    return TorF;
   return Builder.CreateICmp(NewPred, LHS, RHS);
 }
 
@@ -1033,7 +1033,7 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
 
   // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
-  if (PredicatesFoldable(PredL, PredR)) {
+  if (predicatesFoldable(PredL, PredR)) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
         LHS->getOperand(1) == RHS->getOperand(0))
       LHS->swapOperands();
@@ -1041,8 +1041,8 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
         LHS->getOperand(1) == RHS->getOperand(1)) {
       Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
       unsigned Code = getICmpCode(LHS) & getICmpCode(RHS);
-      bool isSigned = LHS->isSigned() || RHS->isSigned();
-      return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+      bool IsSigned = LHS->isSigned() || RHS->isSigned();
+      return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
     }
   }
 
@@ -1131,7 +1131,7 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
     return nullptr;
 
   // We can't fold (ugt x, C) & (sgt x, C2).
-  if (!PredicatesFoldable(PredL, PredR))
+  if (!predicatesFoldable(PredL, PredR))
     return nullptr;
 
   // Ensure that the larger constant is on the RHS.
@@ -1762,10 +1762,9 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
   return nullptr;
 }
 
-/// Given an OR instruction, check to see if this is a bswap idiom. If so,
-/// insert the new intrinsic and return it.
-Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
-  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+Instruction *InstCombiner::matchBSwap(BinaryOperator &Or) {
+  assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
+  Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
 
   // Look through zero extends.
   if (Instruction *Ext = dyn_cast<ZExtInst>(Op0))
@@ -1801,7 +1800,7 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
     return nullptr;
 
   SmallVector<Instruction*, 4> Insts;
-  if (!recognizeBSwapOrBitReverseIdiom(&I, true, false, Insts))
+  if (!recognizeBSwapOrBitReverseIdiom(&Or, true, false, Insts))
     return nullptr;
   Instruction *LastInst = Insts.pop_back_val();
   LastInst->removeFromParent();
@@ -1977,7 +1976,7 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   }
 
   // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
-  if (PredicatesFoldable(PredL, PredR)) {
+  if (predicatesFoldable(PredL, PredR)) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
         LHS->getOperand(1) == RHS->getOperand(0))
       LHS->swapOperands();
@@ -1985,8 +1984,8 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
         LHS->getOperand(1) == RHS->getOperand(1)) {
       Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
       unsigned Code = getICmpCode(LHS) | getICmpCode(RHS);
-      bool isSigned = LHS->isSigned() || RHS->isSigned();
-      return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+      bool IsSigned = LHS->isSigned() || RHS->isSigned();
+      return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
     }
   }
 
@@ -2067,7 +2066,7 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
     return nullptr;
 
   // We can't fold (ugt x, C) | (sgt x, C2).
-  if (!PredicatesFoldable(PredL, PredR))
+  if (!predicatesFoldable(PredL, PredR))
     return nullptr;
 
   // Ensure that the larger constant is on the RHS.
@@ -2168,7 +2167,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
     return FoldedLogic;
 
-  if (Instruction *BSwap = MatchBSwap(I))
+  if (Instruction *BSwap = matchBSwap(I))
     return BSwap;
 
   Value *X, *Y;
@@ -2463,7 +2462,7 @@ static Instruction *foldXorToXor(BinaryOperator &I,
 }
 
 Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
-  if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
+  if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
         LHS->getOperand(1) == RHS->getOperand(0))
       LHS->swapOperands();
@@ -2472,8 +2471,8 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
       // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
       Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
       unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
-      bool isSigned = LHS->isSigned() || RHS->isSigned();
-      return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+      bool IsSigned = LHS->isSigned() || RHS->isSigned();
+      return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
     }
   }
 
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index bdd9a43be2721a9e441456060a1e127805877230..ae158aeabdff6c9a206f305e0ccad9bd2c7ff473 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -736,7 +736,8 @@ static Value *simplifyX86round(IntrinsicInst &II,
   return Builder.CreateInsertElement(Dst, Res, (uint64_t)0);
 }
 
-static Value *simplifyX86movmsk(const IntrinsicInst &II) {
+static Value *simplifyX86movmsk(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder) {
   Value *Arg = II.getArgOperand(0);
   Type *ResTy = II.getType();
   Type *ArgTy = Arg->getType();
@@ -749,29 +750,46 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II) {
   if (!ArgTy->isVectorTy())
     return nullptr;
 
-  auto *C = dyn_cast<Constant>(Arg);
-  if (!C)
-    return nullptr;
+  if (auto *C = dyn_cast<Constant>(Arg)) {
+    // Extract signbits of the vector input and pack into integer result.
+    APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
+    for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
+      auto *COp = C->getAggregateElement(I);
+      if (!COp)
+        return nullptr;
+      if (isa<UndefValue>(COp))
+        continue;
 
-  // Extract signbits of the vector input and pack into integer result.
-  APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
-  for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
-    auto *COp = C->getAggregateElement(I);
-    if (!COp)
-      return nullptr;
-    if (isa<UndefValue>(COp))
-      continue;
+      auto *CInt = dyn_cast<ConstantInt>(COp);
+      auto *CFp = dyn_cast<ConstantFP>(COp);
+      if (!CInt && !CFp)
+        return nullptr;
 
-    auto *CInt = dyn_cast<ConstantInt>(COp);
-    auto *CFp = dyn_cast<ConstantFP>(COp);
-    if (!CInt && !CFp)
-      return nullptr;
+      if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
+        Result.setBit(I);
+    }
+    return Constant::getIntegerValue(ResTy, Result);
+  }
 
-    if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
-      Result.setBit(I);
+  // Look for a sign-extended boolean source vector as the argument to this
+  // movmsk. If the argument is bitcast, look through that, but make sure the
+  // source of that bitcast is still a vector with the same number of elements.
+  // TODO: We can also convert a bitcast with wider elements, but that requires
+  // duplicating the bool source sign bits to match the number of elements
+  // expected by the movmsk call.
+  Arg = peekThroughBitcast(Arg);
+  Value *X;
+  if (Arg->getType()->isVectorTy() &&
+      Arg->getType()->getVectorNumElements() == ArgTy->getVectorNumElements() &&
+      match(Arg, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+    // call iM movmsk(sext <N x i1> X) --> zext (bitcast <N x i1> X to iN) to iM
+    unsigned NumElts = X->getType()->getVectorNumElements();
+    Type *ScalarTy = Type::getIntNTy(Arg->getContext(), NumElts);
+    Value *BC = Builder.CreateBitCast(X, ScalarTy);
+    return Builder.CreateZExtOrTrunc(BC, ResTy);
   }
 
-  return Constant::getIntegerValue(ResTy, Result);
+  return nullptr;
 }
 
 static Value *simplifyX86insertps(const IntrinsicInst &II,
@@ -1837,6 +1855,17 @@ Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
   return nullptr;
 }
 
+static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) {
+  assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
+  Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
+  if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
+    Call.setArgOperand(0, Arg1);
+    Call.setArgOperand(1, Arg0);
+    return &Call;
+  }
+  return nullptr;
+}
+
 /// CallInst simplification. This mostly only handles folding of intrinsic
 /// instructions. For normal calls, it allows visitCallSite to do the heavy
 /// lifting.
@@ -1990,18 +2019,49 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return I;
     break;
 
+  case Intrinsic::fshl:
+  case Intrinsic::fshr: {
+    const APInt *SA;
+    if (match(II->getArgOperand(2), m_APInt(SA))) {
+      Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
+      unsigned BitWidth = SA->getBitWidth();
+      uint64_t ShiftAmt = SA->urem(BitWidth);
+      assert(ShiftAmt != 0 && "SimplifyCall should have handled zero shift");
+      // Normalize to funnel shift left.
+      if (II->getIntrinsicID() == Intrinsic::fshr)
+        ShiftAmt = BitWidth - ShiftAmt;
+
+      // fshl(X, 0, C) -> shl X, C
+      // fshl(X, undef, C) -> shl X, C
+      if (match(Op1, m_Zero()) || match(Op1, m_Undef()))
+        return BinaryOperator::CreateShl(
+            Op0, ConstantInt::get(II->getType(), ShiftAmt));
+
+      // fshl(0, X, C) -> lshr X, (BW-C)
+      // fshl(undef, X, C) -> lshr X, (BW-C)
+      if (match(Op0, m_Zero()) || match(Op0, m_Undef()))
+        return BinaryOperator::CreateLShr(
+            Op1, ConstantInt::get(II->getType(), BitWidth - ShiftAmt));
+    }
+
+    // The shift amount (operand 2) of a funnel shift is modulo the bitwidth,
+    // so only the low bits of the shift amount are demanded if the bitwidth is
+    // a power-of-2.
+    unsigned BitWidth = II->getType()->getScalarSizeInBits();
+    if (!isPowerOf2_32(BitWidth))
+      break;
+    APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth));
+    KnownBits Op2Known(BitWidth);
+    if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known))
+      return &CI;
+    break;
+  }
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::umul_with_overflow:
   case Intrinsic::smul_with_overflow:
-    if (isa<Constant>(II->getArgOperand(0)) &&
-        !isa<Constant>(II->getArgOperand(1))) {
-      // Canonicalize constants into the RHS.
-      Value *LHS = II->getArgOperand(0);
-      II->setArgOperand(0, II->getArgOperand(1));
-      II->setArgOperand(1, LHS);
-      return II;
-    }
+    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
+      return I;
     LLVM_FALLTHROUGH;
 
   case Intrinsic::usub_with_overflow:
@@ -2019,19 +2079,102 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     break;
   }
 
+  case Intrinsic::uadd_sat:
+  case Intrinsic::sadd_sat:
+    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
+      return I;
+    LLVM_FALLTHROUGH;
+  case Intrinsic::usub_sat:
+  case Intrinsic::ssub_sat: {
+    Value *Arg0 = II->getArgOperand(0);
+    Value *Arg1 = II->getArgOperand(1);
+    Intrinsic::ID IID = II->getIntrinsicID();
+
+    // Make use of known overflow information.
+    OverflowResult OR;
+    switch (IID) {
+    default:
+      llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::uadd_sat:
+      OR = computeOverflowForUnsignedAdd(Arg0, Arg1, II);
+      if (OR == OverflowResult::NeverOverflows)
+        return BinaryOperator::CreateNUWAdd(Arg0, Arg1);
+      if (OR == OverflowResult::AlwaysOverflows)
+        return replaceInstUsesWith(*II,
+                                   ConstantInt::getAllOnesValue(II->getType()));
+      break;
+    case Intrinsic::usub_sat:
+      OR = computeOverflowForUnsignedSub(Arg0, Arg1, II);
+      if (OR == OverflowResult::NeverOverflows)
+        return BinaryOperator::CreateNUWSub(Arg0, Arg1);
+      if (OR == OverflowResult::AlwaysOverflows)
+        return replaceInstUsesWith(*II,
+                                   ConstantInt::getNullValue(II->getType()));
+      break;
+    case Intrinsic::sadd_sat:
+      if (willNotOverflowSignedAdd(Arg0, Arg1, *II))
+        return BinaryOperator::CreateNSWAdd(Arg0, Arg1);
+      break;
+    case Intrinsic::ssub_sat:
+      if (willNotOverflowSignedSub(Arg0, Arg1, *II))
+        return BinaryOperator::CreateNSWSub(Arg0, Arg1);
+      break;
+    }
+
+    // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN
+    Constant *C;
+    if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) &&
+        C->isNotMinSignedValue()) {
+      Value *NegVal = ConstantExpr::getNeg(C);
+      return replaceInstUsesWith(
+          *II, Builder.CreateBinaryIntrinsic(
+              Intrinsic::sadd_sat, Arg0, NegVal));
+    }
+
+    // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2))
+    // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2))
+    // if Val and Val2 have the same sign
+    if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) {
+      Value *X;
+      const APInt *Val, *Val2;
+      APInt NewVal;
+      bool IsUnsigned =
+          IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat;
+      if (Other->getIntrinsicID() == II->getIntrinsicID() &&
+          match(Arg1, m_APInt(Val)) &&
+          match(Other->getArgOperand(0), m_Value(X)) &&
+          match(Other->getArgOperand(1), m_APInt(Val2))) {
+        if (IsUnsigned)
+          NewVal = Val->uadd_sat(*Val2);
+        else if (Val->isNonNegative() == Val2->isNonNegative()) {
+          bool Overflow;
+          NewVal = Val->sadd_ov(*Val2, Overflow);
+          if (Overflow) {
+            // Both adds together may add more than SignedMaxValue
+            // without saturating the final result.
+            break;
+          }
+        } else {
+          // Cannot fold saturated addition with different signs.
+          break;
+        }
+
+        return replaceInstUsesWith(
+            *II, Builder.CreateBinaryIntrinsic(
+                     IID, X, ConstantInt::get(II->getType(), NewVal)));
+      }
+    }
+    break;
+  }
+
   case Intrinsic::minnum:
   case Intrinsic::maxnum:
   case Intrinsic::minimum:
   case Intrinsic::maximum: {
+    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
+      return I;
     Value *Arg0 = II->getArgOperand(0);
     Value *Arg1 = II->getArgOperand(1);
-    // Canonicalize constants to the RHS.
-    if (isa<ConstantFP>(Arg0) && !isa<ConstantFP>(Arg1)) {
-      II->setArgOperand(0, Arg1);
-      II->setArgOperand(1, Arg0);
-      return II;
-    }
-
     Intrinsic::ID IID = II->getIntrinsicID();
     Value *X, *Y;
     if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
@@ -2111,17 +2254,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     LLVM_FALLTHROUGH;
   }
   case Intrinsic::fma: {
-    Value *Src0 = II->getArgOperand(0);
-    Value *Src1 = II->getArgOperand(1);
-
-    // Canonicalize constant multiply operand to Src1.
-    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
-      II->setArgOperand(0, Src1);
-      II->setArgOperand(1, Src0);
-      std::swap(Src0, Src1);
-    }
+    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
+      return I;
 
     // fma fneg(x), fneg(y), z -> fma x, y, z
+    Value *Src0 = II->getArgOperand(0);
+    Value *Src1 = II->getArgOperand(1);
     Value *X, *Y;
     if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
       II->setArgOperand(0, X);
@@ -2423,7 +2561,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::x86_avx_movmsk_pd_256:
   case Intrinsic::x86_avx_movmsk_ps_256:
   case Intrinsic::x86_avx2_pmovmskb:
-    if (Value *V = simplifyX86movmsk(*II))
+    if (Value *V = simplifyX86movmsk(*II, Builder))
       return replaceInstUsesWith(*II, V);
     break;
 
@@ -3479,22 +3617,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
 
     bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe;
 
-    // TODO: Also emit sub if only width is constant.
-    if (!CWidth && COffset && Offset == 0) {
-      Constant *KSize = ConstantInt::get(COffset->getType(), IntSize);
-      Value *ShiftVal = Builder.CreateSub(KSize, II->getArgOperand(2));
-      ShiftVal = Builder.CreateZExt(ShiftVal, II->getType());
-
-      Value *Shl = Builder.CreateShl(Src, ShiftVal);
-      Value *RightShift = Signed ? Builder.CreateAShr(Shl, ShiftVal)
-                                 : Builder.CreateLShr(Shl, ShiftVal);
-      RightShift->takeName(II);
-      return replaceInstUsesWith(*II, RightShift);
-    }
-
     if (!CWidth || !COffset)
       break;
 
+    // The case of Width == 0 is handled above, which makes this tranformation
+    // safe.  If Width == 0, then the ashr and lshr instructions become poison
+    // value since the shift amount would be equal to the bit size.
+    assert(Width != 0);
+
     // TODO: This allows folding to undef when the hardware has specific
     // behavior?
     if (Offset + Width < IntSize) {
@@ -3910,8 +4040,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
         return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));
 
       // isKnownNonNull -> nonnull attribute
-      if (isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT))
+      if (!II->hasRetAttr(Attribute::NonNull) &&
+          isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) {
         II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+        return II;
+      }
     }
 
     // TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 9fa27d89911dbea920ab88f843f0c08b3151b2c6..7a8c762d494fb481862277ba4d68150cf62ef723 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -498,6 +498,13 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
           shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) &&
          "Don't narrow to an illegal scalar type");
 
+  // Bail out on strange types. It is possible to handle some of these patterns
+  // even with non-power-of-2 sizes, but it is not a likely scenario.
+  Type *DestTy = Trunc.getType();
+  unsigned NarrowWidth = DestTy->getScalarSizeInBits();
+  if (!isPowerOf2_32(NarrowWidth))
+    return nullptr;
+
   // First, find an or'd pair of opposite shifts with the same shifted operand:
   // trunc (or (lshr ShVal, ShAmt0), (shl ShVal, ShAmt1))
   Value *Or0, *Or1;
@@ -514,22 +521,38 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
   if (ShiftOpcode0 == ShiftOpcode1)
     return nullptr;
 
-  // The shift amounts must add up to the narrow bit width.
-  Value *ShAmt;
-  bool SubIsOnLHS;
-  Type *DestTy = Trunc.getType();
-  unsigned NarrowWidth = DestTy->getScalarSizeInBits();
-  if (match(ShAmt0,
-            m_OneUse(m_Sub(m_SpecificInt(NarrowWidth), m_Specific(ShAmt1))))) {
-    ShAmt = ShAmt1;
-    SubIsOnLHS = true;
-  } else if (match(ShAmt1, m_OneUse(m_Sub(m_SpecificInt(NarrowWidth),
-                                          m_Specific(ShAmt0))))) {
-    ShAmt = ShAmt0;
-    SubIsOnLHS = false;
-  } else {
+  // Match the shift amount operands for a rotate pattern. This always matches
+  // a subtraction on the R operand.
+  auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * {
+    // The shift amounts may add up to the narrow bit width:
+    // (shl ShVal, L) | (lshr ShVal, Width - L)
+    if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L)))))
+      return L;
+
+    // The shift amount may be masked with negation:
+    // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
+    Value *X;
+    unsigned Mask = Width - 1;
+    if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
+        match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
+      return X;
+
+    // Same as above, but the shift amount may be extended after masking:
+    if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
+        match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
+      return X;
+
     return nullptr;
+  };
+
+  Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth);
+  bool SubIsOnLHS = false;
+  if (!ShAmt) {
+    ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth);
+    SubIsOnLHS = true;
   }
+  if (!ShAmt)
+    return nullptr;
 
   // The shifted value must have high zeros in the wide type. Typically, this
   // will be a zext, but it could also be the result of an 'and' or 'shift'.
@@ -1084,12 +1107,9 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
   Value *Src = CI.getOperand(0);
   Type *SrcTy = Src->getType(), *DestTy = CI.getType();
 
-  // Attempt to extend the entire input expression tree to the destination
-  // type.   Only do this if the dest type is a simple type, don't convert the
-  // expression tree to something weird like i93 unless the source is also
-  // strange.
+  // Try to extend the entire expression tree to the wide destination type.
   unsigned BitsToClear;
-  if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
+  if (shouldChangeType(SrcTy, DestTy) &&
       canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
     assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
            "Can't clear more bits than in SrcTy");
@@ -1366,12 +1386,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
     return replaceInstUsesWith(CI, ZExt);
   }
 
-  // Attempt to extend the entire input expression tree to the destination
-  // type.   Only do this if the dest type is a simple type, don't convert the
-  // expression tree to something weird like i93 unless the source is also
-  // strange.
-  if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
-      canEvaluateSExtd(Src, DestTy)) {
+  // Try to extend the entire expression tree to the wide destination type.
+  if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
     // Okay, we can transform this!  Insert the new expression now.
     LLVM_DEBUG(
         dbgs() << "ICE: EvaluateInDifferentType converting expression type"
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 2ba1174517ffe662b7b8258ffa5421963b45c597..b5bbb09935e2731e378164a89565f1ea8346522e 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -522,11 +522,9 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
   }
 
   // Otherwise, there is an index.  The computation we will do will be modulo
-  // the pointer size, so get it.
-  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
-
-  Offset &= PtrSizeMask;
-  VariableScale &= PtrSizeMask;
+  // the pointer size.
+  Offset = SignExtend64(Offset, IntPtrWidth);
+  VariableScale = SignExtend64(VariableScale, IntPtrWidth);
 
   // To do this transformation, any constant index must be a multiple of the
   // variable scale factor.  For example, we can evaluate "12 + 4*i" as "3 + i",
@@ -1335,17 +1333,12 @@ Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) {
   return nullptr;
 }
 
-// Fold icmp Pred X, C.
+/// Fold icmp Pred X, C.
+/// TODO: This code structure does not make sense. The saturating add fold
+/// should be moved to some other helper and extended as noted below (it is also
+/// possible that code has been made unnecessary - do we canonicalize IR to
+/// overflow/saturating intrinsics or not?).
 Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
-  CmpInst::Predicate Pred = Cmp.getPredicate();
-  Value *X = Cmp.getOperand(0);
-
-  const APInt *C;
-  if (!match(Cmp.getOperand(1), m_APInt(C)))
-    return nullptr;
-
-  Value *A = nullptr, *B = nullptr;
-
   // Match the following pattern, which is a common idiom when writing
   // overflow-safe integer arithmetic functions. The source performs an addition
   // in wider type and explicitly checks for overflow using comparisons against
@@ -1357,37 +1350,62 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
   //
   // sum = a + b
   // if (sum+128 >u 255)  ...  -> llvm.sadd.with.overflow.i8
-  {
-    ConstantInt *CI2; // I = icmp ugt (add (add A, B), CI2), CI
-    if (Pred == ICmpInst::ICMP_UGT &&
-        match(X, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
-      if (Instruction *Res = processUGT_ADDCST_ADD(
-              Cmp, A, B, CI2, cast<ConstantInt>(Cmp.getOperand(1)), *this))
-        return Res;
-  }
+  CmpInst::Predicate Pred = Cmp.getPredicate();
+  Value *Op0 = Cmp.getOperand(0), *Op1 = Cmp.getOperand(1);
+  Value *A, *B;
+  ConstantInt *CI, *CI2; // I = icmp ugt (add (add A, B), CI2), CI
+  if (Pred == ICmpInst::ICMP_UGT && match(Op1, m_ConstantInt(CI)) &&
+      match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
+    if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this))
+      return Res;
 
-  // FIXME: Use m_APInt to allow folds for splat constants.
-  ConstantInt *CI = dyn_cast<ConstantInt>(Cmp.getOperand(1));
-  if (!CI)
+  return nullptr;
+}
+
+/// Canonicalize icmp instructions based on dominating conditions.
+Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
+  // This is a cheap/incomplete check for dominance - just match a single
+  // predecessor with a conditional branch.
+  BasicBlock *CmpBB = Cmp.getParent();
+  BasicBlock *DomBB = CmpBB->getSinglePredecessor();
+  if (!DomBB)
     return nullptr;
 
-  // Canonicalize icmp instructions based on dominating conditions.
-  BasicBlock *Parent = Cmp.getParent();
-  BasicBlock *Dom = Parent->getSinglePredecessor();
-  auto *BI = Dom ? dyn_cast<BranchInst>(Dom->getTerminator()) : nullptr;
-  ICmpInst::Predicate Pred2;
+  Value *DomCond;
   BasicBlock *TrueBB, *FalseBB;
-  ConstantInt *CI2;
-  if (BI && match(BI, m_Br(m_ICmp(Pred2, m_Specific(X), m_ConstantInt(CI2)),
-                           TrueBB, FalseBB)) &&
-      TrueBB != FalseBB) {
-    ConstantRange CR =
-        ConstantRange::makeAllowedICmpRegion(Pred, CI->getValue());
+  if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
+    return nullptr;
+
+  assert((TrueBB == CmpBB || FalseBB == CmpBB) &&
+         "Predecessor block does not point to successor?");
+
+  // The branch should get simplified. Don't bother simplifying this condition.
+  if (TrueBB == FalseBB)
+    return nullptr;
+
+  // Try to simplify this compare to T/F based on the dominating condition.
+  Optional<bool> Imp = isImpliedCondition(DomCond, &Cmp, DL, TrueBB == CmpBB);
+  if (Imp)
+    return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Imp));
+
+  CmpInst::Predicate Pred = Cmp.getPredicate();
+  Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1);
+  ICmpInst::Predicate DomPred;
+  const APInt *C, *DomC;
+  if (match(DomCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) &&
+      match(Y, m_APInt(C))) {
+    // We have 2 compares of a variable with constants. Calculate the constant
+    // ranges of those compares to see if we can transform the 2nd compare:
+    // DomBB:
+    //   DomCond = icmp DomPred X, DomC
+    //   br DomCond, CmpBB, FalseBB
+    // CmpBB:
+    //   Cmp = icmp Pred X, C
+    ConstantRange CR = ConstantRange::makeAllowedICmpRegion(Pred, *C);
     ConstantRange DominatingCR =
-        (Parent == TrueBB)
-            ? ConstantRange::makeExactICmpRegion(Pred2, CI2->getValue())
-            : ConstantRange::makeExactICmpRegion(
-                  CmpInst::getInversePredicate(Pred2), CI2->getValue());
+        (CmpBB == TrueBB) ? ConstantRange::makeExactICmpRegion(DomPred, *DomC)
+                          : ConstantRange::makeExactICmpRegion(
+                                CmpInst::getInversePredicate(DomPred), *DomC);
     ConstantRange Intersection = DominatingCR.intersectWith(CR);
     ConstantRange Difference = DominatingCR.difference(CR);
     if (Intersection.isEmptySet())
@@ -1395,23 +1413,20 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
     if (Difference.isEmptySet())
       return replaceInstUsesWith(Cmp, Builder.getTrue());
 
-    // If this is a normal comparison, it demands all bits. If it is a sign
-    // bit comparison, it only demands the sign bit.
-    bool UnusedBit;
-    bool IsSignBit = isSignBitCheck(Pred, CI->getValue(), UnusedBit);
-
     // Canonicalizing a sign bit comparison that gets used in a branch,
     // pessimizes codegen by generating branch on zero instruction instead
     // of a test and branch. So we avoid canonicalizing in such situations
     // because test and branch instruction has better branch displacement
     // than compare and branch instruction.
+    bool UnusedBit;
+    bool IsSignBit = isSignBitCheck(Pred, *C, UnusedBit);
     if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
       return nullptr;
 
-    if (auto *AI = Intersection.getSingleElement())
-      return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*AI));
-    if (auto *AD = Difference.getSingleElement())
-      return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*AD));
+    if (const APInt *EqC = Intersection.getSingleElement())
+      return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*EqC));
+    if (const APInt *NeC = Difference.getSingleElement())
+      return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*NeC));
   }
 
   return nullptr;
@@ -2750,6 +2765,7 @@ Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
 
   // Handle icmp {eq|ne} <intrinsic>, Constant.
   Type *Ty = II->getType();
+  unsigned BitWidth = C.getBitWidth();
   switch (II->getIntrinsicID()) {
   case Intrinsic::bswap:
     Worklist.Add(II);
@@ -2758,21 +2774,39 @@ Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
     return &Cmp;
 
   case Intrinsic::ctlz:
-  case Intrinsic::cttz:
+  case Intrinsic::cttz: {
     // ctz(A) == bitwidth(A)  ->  A == 0 and likewise for !=
-    if (C == C.getBitWidth()) {
+    if (C == BitWidth) {
       Worklist.Add(II);
       Cmp.setOperand(0, II->getArgOperand(0));
       Cmp.setOperand(1, ConstantInt::getNullValue(Ty));
       return &Cmp;
     }
+
+    // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
+    // and Mask1 has bits 0..C+1 set. Similar for ctl, but for high bits.
+    // Limit to one use to ensure we don't increase instruction count.
+    unsigned Num = C.getLimitedValue(BitWidth);
+    if (Num != BitWidth && II->hasOneUse()) {
+      bool IsTrailing = II->getIntrinsicID() == Intrinsic::cttz;
+      APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1)
+                               : APInt::getHighBitsSet(BitWidth, Num + 1);
+      APInt Mask2 = IsTrailing
+        ? APInt::getOneBitSet(BitWidth, Num)
+        : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
+      Cmp.setOperand(0, Builder.CreateAnd(II->getArgOperand(0), Mask1));
+      Cmp.setOperand(1, ConstantInt::get(Ty, Mask2));
+      Worklist.Add(II);
+      return &Cmp;
+    }
     break;
+  }
 
   case Intrinsic::ctpop: {
     // popcount(A) == 0  ->  A == 0 and likewise for !=
     // popcount(A) == bitwidth(A)  ->  A == -1 and likewise for !=
     bool IsZero = C.isNullValue();
-    if (IsZero || C == C.getBitWidth()) {
+    if (IsZero || C == BitWidth) {
       Worklist.Add(II);
       Cmp.setOperand(0, II->getArgOperand(0));
       auto *NewOp =
@@ -2955,12 +2989,20 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
     //  x & (-1 >> y) s>= x    ->    x s<= (-1 >> y)
     if (X != I.getOperand(1)) // X must be on RHS of comparison!
       return nullptr;         // Ignore the other case.
+    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
+      return nullptr;
+    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
+      return nullptr;
     DstPred = ICmpInst::Predicate::ICMP_SLE;
     break;
   case ICmpInst::Predicate::ICMP_SLT:
     //  x & (-1 >> y) s< x    ->    x s> (-1 >> y)
     if (X != I.getOperand(1)) // X must be on RHS of comparison!
       return nullptr;         // Ignore the other case.
+    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
+      return nullptr;
+    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
+      return nullptr;
     DstPred = ICmpInst::Predicate::ICMP_SGT;
     break;
   case ICmpInst::Predicate::ICMP_SLE:
@@ -4765,6 +4807,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
   if (Instruction *Res = foldICmpWithConstant(I))
     return Res;
 
+  if (Instruction *Res = foldICmpWithDominatingICmp(I))
+    return Res;
+
   if (Instruction *Res = foldICmpUsingKnownBits(I))
     return Res;
 
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 431856c9e00609f45623383138556585d8f8b0fe..e507630f9ea0b2722bb356913c02c7ab868b611d 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -857,6 +857,7 @@ private:
   Instruction *foldICmpWithCastAndCast(ICmpInst &ICI);
 
   Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
+  Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp);
   Instruction *foldICmpWithConstant(ICmpInst &Cmp);
   Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
   Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
@@ -919,8 +920,11 @@ private:
   Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
                          bool isSigned, bool Inside);
   Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
-  Instruction *MatchBSwap(BinaryOperator &I);
-  bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+  bool mergeStoreIntoSuccessor(StoreInst &SI);
+
+  /// Given an 'or' instruction, check to see if it is part of a bswap idiom.
+  /// If so, return the equivalent bswap intrinsic.
+  Instruction *matchBSwap(BinaryOperator &Or);
 
   Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
   Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI);
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 1ff7037c8d2d2b67ff8163ba0d35321bc9f14846..02ebb5ef294681473a8a4592ccba602c4b195944 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
@@ -1498,64 +1499,45 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
   if (isa<UndefValue>(Val))
     return eraseInstFromFunction(SI);
 
-  // If this store is the last instruction in the basic block (possibly
-  // excepting debug info instructions), and if the block ends with an
-  // unconditional branch, try to move it to the successor block.
+  // If this store is the second-to-last instruction in the basic block
+  // (excluding debug info and bitcasts of pointers) and if the block ends with
+  // an unconditional branch, try to move the store to the successor block.
   BBI = SI.getIterator();
   do {
     ++BBI;
   } while (isa<DbgInfoIntrinsic>(BBI) ||
            (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy()));
+
   if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
     if (BI->isUnconditional())
-      if (SimplifyStoreAtEndOfBlock(SI))
-        return nullptr;  // xform done!
+      mergeStoreIntoSuccessor(SI);
 
   return nullptr;
 }
 
-/// SimplifyStoreAtEndOfBlock - Turn things like:
+/// Try to transform:
 ///   if () { *P = v1; } else { *P = v2 }
-/// into a phi node with a store in the successor.
-///
-/// Simplify things like:
+/// or:
 ///   *P = v1; if () { *P = v2; }
 /// into a phi node with a store in the successor.
-///
-bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
+bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) {
   assert(SI.isUnordered() &&
-         "this code has not been auditted for volatile or ordered store case");
+         "This code has not been audited for volatile or ordered store case.");
 
+  // Check if the successor block has exactly 2 incoming edges.
   BasicBlock *StoreBB = SI.getParent();
-
-  // Check to see if the successor block has exactly two incoming edges.  If
-  // so, see if the other predecessor contains a store to the same location.
-  // if so, insert a PHI node (if needed) and move the stores down.
   BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
-
-  // Determine whether Dest has exactly two predecessors and, if so, compute
-  // the other predecessor.
-  pred_iterator PI = pred_begin(DestBB);
-  BasicBlock *P = *PI;
-  BasicBlock *OtherBB = nullptr;
-
-  if (P != StoreBB)
-    OtherBB = P;
-
-  if (++PI == pred_end(DestBB))
+  if (!DestBB->hasNPredecessors(2))
     return false;
 
-  P = *PI;
-  if (P != StoreBB) {
-    if (OtherBB)
-      return false;
-    OtherBB = P;
-  }
-  if (++PI != pred_end(DestBB))
-    return false;
+  // Capture the other block (the block that doesn't contain our store).
+  pred_iterator PredIter = pred_begin(DestBB);
+  if (*PredIter == StoreBB)
+    ++PredIter;
+  BasicBlock *OtherBB = *PredIter;
 
-  // Bail out if all the relevant blocks aren't distinct (this can happen,
-  // for example, if SI is in an infinite loop)
+  // Bail out if all of the relevant blocks aren't distinct. This can happen,
+  // for example, if SI is in an infinite loop.
   if (StoreBB == DestBB || OtherBB == DestBB)
     return false;
 
@@ -1566,7 +1548,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
     return false;
 
   // If the other block ends in an unconditional branch, check for the 'if then
-  // else' case.  there is an instruction before the branch.
+  // else' case. There is an instruction before the branch.
   StoreInst *OtherStore = nullptr;
   if (OtherBr->isUnconditional()) {
     --BBI;
@@ -1591,7 +1573,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
       return false;
 
     // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
-    // if/then triangle.  See if there is a store to the same ptr as SI that
+    // if/then triangle. See if there is a store to the same ptr as SI that
     // lives in OtherBB.
     for (;; --BBI) {
       // Check to see if we find the matching store.
@@ -1602,15 +1584,14 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
         break;
       }
       // If we find something that may be using or overwriting the stored
-      // value, or if we run out of instructions, we can't do the xform.
+      // value, or if we run out of instructions, we can't do the transform.
       if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
           BBI->mayWriteToMemory() || BBI == OtherBB->begin())
         return false;
     }
 
-    // In order to eliminate the store in OtherBr, we have to
-    // make sure nothing reads or overwrites the stored value in
-    // StoreBB.
+    // In order to eliminate the store in OtherBr, we have to make sure nothing
+    // reads or overwrites the stored value in StoreBB.
     for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
       // FIXME: This should really be AA driven.
       if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
@@ -1620,24 +1601,24 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
 
   // Insert a PHI node now if we need it.
   Value *MergedVal = OtherStore->getOperand(0);
+  // The debug locations of the original instructions might differ. Merge them.
+  DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
+                                                     OtherStore->getDebugLoc());
   if (MergedVal != SI.getOperand(0)) {
     PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
     PN->addIncoming(SI.getOperand(0), SI.getParent());
     PN->addIncoming(OtherStore->getOperand(0), OtherBB);
     MergedVal = InsertNewInstBefore(PN, DestBB->front());
+    PN->setDebugLoc(MergedLoc);
   }
 
-  // Advance to a place where it is safe to insert the new store and
-  // insert it.
+  // Advance to a place where it is safe to insert the new store and insert it.
   BBI = DestBB->getFirstInsertionPt();
   StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1),
-                                   SI.isVolatile(),
-                                   SI.getAlignment(),
-                                   SI.getOrdering(),
-                                   SI.getSyncScopeID());
+                                   SI.isVolatile(), SI.getAlignment(),
+                                   SI.getOrdering(), SI.getSyncScopeID());
   InsertNewInstBefore(NewSI, *BBI);
-  // The debug locations of the original instructions might differ; merge them.
-  NewSI->applyMergedLocation(SI.getDebugLoc(), OtherStore->getDebugLoc());
+  NewSI->setDebugLoc(MergedLoc);
 
   // If the two stores had AA tags, merge them.
   AAMDNodes AATags;
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 88a72bb8eb57e1e87d3fe6ce4019d64fdb1947c4..19858ae149aba951db98330eb0823061342cc022 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1546,6 +1546,66 @@ static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS,
   return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp);
 }
 
+/// Try to reduce a rotate pattern that includes a compare and select into a
+/// sequence of ALU ops only. Example:
+/// rotl32(a, b) --> (b == 0 ? a : ((a >> (32 - b)) | (a << b)))
+///              --> (a >> (-b & 31)) | (a << (b & 31))
+static Instruction *foldSelectRotate(SelectInst &Sel,
+                                     InstCombiner::BuilderTy &Builder) {
+  // The false value of the select must be a rotate of the true value.
+  Value *Or0, *Or1;
+  if (!match(Sel.getFalseValue(), m_OneUse(m_Or(m_Value(Or0), m_Value(Or1)))))
+    return nullptr;
+
+  Value *TVal = Sel.getTrueValue();
+  Value *SA0, *SA1;
+  if (!match(Or0, m_OneUse(m_LogicalShift(m_Specific(TVal), m_Value(SA0)))) ||
+      !match(Or1, m_OneUse(m_LogicalShift(m_Specific(TVal), m_Value(SA1)))))
+    return nullptr;
+
+  auto ShiftOpcode0 = cast<BinaryOperator>(Or0)->getOpcode();
+  auto ShiftOpcode1 = cast<BinaryOperator>(Or1)->getOpcode();
+  if (ShiftOpcode0 == ShiftOpcode1)
+    return nullptr;
+
+  // We have one of these patterns so far:
+  // select ?, TVal, (or (lshr TVal, SA0), (shl TVal, SA1))
+  // select ?, TVal, (or (shl TVal, SA0), (lshr TVal, SA1))
+  // This must be a power-of-2 rotate for a bitmasking transform to be valid.
+  unsigned Width = Sel.getType()->getScalarSizeInBits();
+  if (!isPowerOf2_32(Width))
+    return nullptr;
+
+  // Check the shift amounts to see if they are an opposite pair.
+  Value *ShAmt;
+  if (match(SA1, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA0)))))
+    ShAmt = SA0;
+  else if (match(SA0, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA1)))))
+    ShAmt = SA1;
+  else
+    return nullptr;
+
+  // Finally, see if the select is filtering out a shift-by-zero.
+  Value *Cond = Sel.getCondition();
+  ICmpInst::Predicate Pred;
+  if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) ||
+      Pred != ICmpInst::ICMP_EQ)
+    return nullptr;
+
+  // This is a rotate that avoids shift-by-bitwidth UB in a suboptimal way.
+  // Convert to safely bitmasked shifts.
+  // TODO: When we can canonicalize to funnel shift intrinsics without risk of
+  // performance regressions, replace this sequence with that call.
+  Value *NegShAmt = Builder.CreateNeg(ShAmt);
+  Value *MaskedShAmt = Builder.CreateAnd(ShAmt, Width - 1);
+  Value *MaskedNegShAmt = Builder.CreateAnd(NegShAmt, Width - 1);
+  Value *NewSA0 = ShAmt == SA0 ? MaskedShAmt : MaskedNegShAmt;
+  Value *NewSA1 = ShAmt == SA1 ? MaskedShAmt : MaskedNegShAmt;
+  Value *NewSh0 = Builder.CreateBinOp(ShiftOpcode0, TVal, NewSA0);
+  Value *NewSh1 = Builder.CreateBinOp(ShiftOpcode1, TVal, NewSA1);
+  return BinaryOperator::CreateOr(NewSh0, NewSh1);
+}
+
 Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   Value *CondVal = SI.getCondition();
   Value *TrueVal = SI.getTrueValue();
@@ -1806,8 +1866,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
       // MAX(~a, C)  -> ~MIN(a, ~C)
       // MIN(~a, ~b) -> ~MAX(a, b)
       // MIN(~a, C)  -> ~MAX(a, ~C)
-      auto moveNotAfterMinMax = [&](Value *X, Value *Y,
-                                    bool Swapped) -> Instruction * {
+      auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * {
         Value *A;
         if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) &&
             !IsFreeToInvert(A, A->hasOneUse()) &&
@@ -1820,14 +1879,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
           if (MDNode *MD = SI.getMetadata(LLVMContext::MD_prof)) {
             cast<SelectInst>(NewMinMax)->setMetadata(LLVMContext::MD_prof, MD);
             // Swap the metadata if the operands are swapped.
-            if (Swapped) {
-              assert(X == SI.getFalseValue() && Y == SI.getTrueValue() &&
-                     "Unexpected operands.");
+            if (X == SI.getFalseValue() && Y == SI.getTrueValue())
               cast<SelectInst>(NewMinMax)->swapProfMetadata();
-            } else {
-              assert(X == SI.getTrueValue() && Y == SI.getFalseValue() &&
-                     "Unexpected operands.");
-            }
           }
 
           return BinaryOperator::CreateNot(NewMinMax);
@@ -1836,9 +1889,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
         return nullptr;
       };
 
-      if (Instruction *I = moveNotAfterMinMax(LHS, RHS, /*Swapped*/false))
+      if (Instruction *I = moveNotAfterMinMax(LHS, RHS))
         return I;
-      if (Instruction *I = moveNotAfterMinMax(RHS, LHS, /*Swapped*/true))
+      if (Instruction *I = moveNotAfterMinMax(RHS, LHS))
         return I;
 
       if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder))
@@ -1968,24 +2021,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
     }
   }
 
-  // See if we can determine the result of this select based on a dominating
-  // condition.
-  BasicBlock *Parent = SI.getParent();
-  if (BasicBlock *Dom = Parent->getSinglePredecessor()) {
-    auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator());
-    if (PBI && PBI->isConditional() &&
-        PBI->getSuccessor(0) != PBI->getSuccessor(1) &&
-        (PBI->getSuccessor(0) == Parent || PBI->getSuccessor(1) == Parent)) {
-      bool CondIsTrue = PBI->getSuccessor(0) == Parent;
-      Optional<bool> Implication = isImpliedCondition(
-          PBI->getCondition(), SI.getCondition(), DL, CondIsTrue);
-      if (Implication) {
-        Value *V = *Implication ? TrueVal : FalseVal;
-        return replaceInstUsesWith(SI, V);
-      }
-    }
-  }
-
   // If we can compute the condition, there's no need for a select.
   // Like the above fold, we are attempting to reduce compile-time cost by
   // putting this fold here with limitations rather than in InstSimplify.
@@ -2010,5 +2045,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI))
     return Select;
 
+  if (Instruction *Rot = foldSelectRotate(SI, Builder))
+    return Rot;
+
   return nullptr;
 }
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 45cacc73d63becb324ffabe1d4d5425a349e2873..a193dde1c39517e4c971aca666ac9e9709a67328 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -690,6 +690,30 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         // TODO: Could compute known zero/one bits based on the input.
         break;
       }
+      case Intrinsic::fshr:
+      case Intrinsic::fshl: {
+        const APInt *SA;
+        if (!match(I->getOperand(2), m_APInt(SA)))
+          break;
+
+        // Normalize to funnel shift left. APInt shifts of BitWidth are well-
+        // defined, so no need to special-case zero shifts here.
+        uint64_t ShiftAmt = SA->urem(BitWidth);
+        if (II->getIntrinsicID() == Intrinsic::fshr)
+          ShiftAmt = BitWidth - ShiftAmt;
+
+        APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
+        APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
+        if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
+            SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
+          return I;
+
+        Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
+                     RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
+        Known.One = LHSKnown.One.shl(ShiftAmt) |
+                    RHSKnown.One.lshr(BitWidth - ShiftAmt);
+        break;
+      }
       case Intrinsic::x86_mmx_pmovmskb:
       case Intrinsic::x86_sse_movmsk_ps:
       case Intrinsic::x86_sse2_movmsk_pd:
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 21dd7ed227af476fac845aeb974e4b62dd98164c..0ad1fc0e791f140a28c910ae44bf17570143652d 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -46,40 +46,34 @@ using namespace PatternMatch;
 #define DEBUG_TYPE "instcombine"
 
 /// Return true if the value is cheaper to scalarize than it is to leave as a
-/// vector operation. isConstant indicates whether we're extracting one known
-/// element. If false we're extracting a variable index.
-static bool cheapToScalarize(Value *V, bool isConstant) {
-  if (Constant *C = dyn_cast<Constant>(V)) {
-    if (isConstant) return true;
+/// vector operation. IsConstantExtractIndex indicates whether we are extracting
+/// one known element from a vector constant.
+///
+/// FIXME: It's possible to create more instructions than previously existed.
+static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
+  // If we can pick a scalar constant value out of a vector, that is free.
+  if (auto *C = dyn_cast<Constant>(V))
+    return IsConstantExtractIndex || C->getSplatValue();
+
+  // An insertelement to the same constant index as our extract will simplify
+  // to the scalar inserted element. An insertelement to a different constant
+  // index is irrelevant to our extract.
+  if (match(V, m_InsertElement(m_Value(), m_Value(), m_ConstantInt())))
+    return IsConstantExtractIndex;
+
+  if (match(V, m_OneUse(m_Load(m_Value()))))
+    return true;
 
-    // If all elts are the same, we can extract it and use any of the values.
-    if (Constant *Op0 = C->getAggregateElement(0U)) {
-      for (unsigned i = 1, e = V->getType()->getVectorNumElements(); i != e;
-           ++i)
-        if (C->getAggregateElement(i) != Op0)
-          return false;
+  Value *V0, *V1;
+  if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
+    if (cheapToScalarize(V0, IsConstantExtractIndex) ||
+        cheapToScalarize(V1, IsConstantExtractIndex))
       return true;
-    }
-  }
-  Instruction *I = dyn_cast<Instruction>(V);
-  if (!I) return false;
 
-  // Insert element gets simplified to the inserted element or is deleted if
-  // this is constant idx extract element and its a constant idx insertelt.
-  if (I->getOpcode() == Instruction::InsertElement && isConstant &&
-      isa<ConstantInt>(I->getOperand(2)))
-    return true;
-  if (I->getOpcode() == Instruction::Load && I->hasOneUse())
-    return true;
-  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
-    if (BO->hasOneUse() &&
-        (cheapToScalarize(BO->getOperand(0), isConstant) ||
-         cheapToScalarize(BO->getOperand(1), isConstant)))
-      return true;
-  if (CmpInst *CI = dyn_cast<CmpInst>(I))
-    if (CI->hasOneUse() &&
-        (cheapToScalarize(CI->getOperand(0), isConstant) ||
-         cheapToScalarize(CI->getOperand(1), isConstant)))
+  CmpInst::Predicate UnusedPred;
+  if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
+    if (cheapToScalarize(V0, IsConstantExtractIndex) ||
+        cheapToScalarize(V1, IsConstantExtractIndex))
       return true;
 
   return false;
@@ -261,34 +255,30 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
 }
 
 Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
-  if (Value *V = SimplifyExtractElementInst(EI.getVectorOperand(),
-                                            EI.getIndexOperand(),
+  Value *SrcVec = EI.getVectorOperand();
+  Value *Index = EI.getIndexOperand();
+  if (Value *V = SimplifyExtractElementInst(SrcVec, Index,
                                             SQ.getWithInstruction(&EI)))
     return replaceInstUsesWith(EI, V);
 
-  // If vector val is constant with all elements the same, replace EI with
-  // that element.  We handle a known element # below.
-  if (Constant *C = dyn_cast<Constant>(EI.getOperand(0)))
-    if (cheapToScalarize(C, false))
-      return replaceInstUsesWith(EI, C->getAggregateElement(0U));
-
   // If extracting a specified index from the vector, see if we can recursively
   // find a previously computed scalar that was inserted into the vector.
-  if (ConstantInt *IdxC = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+  auto *IndexC = dyn_cast<ConstantInt>(Index);
+  if (IndexC) {
     unsigned NumElts = EI.getVectorOperandType()->getNumElements();
 
     // InstSimplify should handle cases where the index is invalid.
-    if (!IdxC->getValue().ule(NumElts))
+    if (!IndexC->getValue().ule(NumElts))
       return nullptr;
 
     // This instruction only demands the single element from the input vector.
     // If the input vector has a single use, simplify it based on this use
     // property.
-    if (EI.getOperand(0)->hasOneUse() && NumElts != 1) {
+    if (SrcVec->hasOneUse() && NumElts != 1) {
       APInt UndefElts(NumElts, 0);
-      APInt DemandedMask(NumElts, 0);
-      DemandedMask.setBit(IdxC->getZExtValue());
-      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), DemandedMask,
+      APInt DemandedElts(NumElts, 0);
+      DemandedElts.setBit(IndexC->getZExtValue());
+      if (Value *V = SimplifyDemandedVectorElts(SrcVec, DemandedElts,
                                                 UndefElts)) {
         EI.setOperand(0, V);
         return &EI;
@@ -300,43 +290,46 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
 
     // If there's a vector PHI feeding a scalar use through this extractelement
     // instruction, try to scalarize the PHI.
-    if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) {
-      Instruction *scalarPHI = scalarizePHI(EI, PN);
-      if (scalarPHI)
-        return scalarPHI;
-    }
+    if (auto *Phi = dyn_cast<PHINode>(SrcVec))
+      if (Instruction *ScalarPHI = scalarizePHI(EI, Phi))
+        return ScalarPHI;
   }
 
-  if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
-    // Push extractelement into predecessor operation if legal and
-    // profitable to do so.
-    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
-      if (I->hasOneUse() &&
-          cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {
-        Value *newEI0 =
-          Builder.CreateExtractElement(BO->getOperand(0), EI.getOperand(1),
-                                       EI.getName()+".lhs");
-        Value *newEI1 =
-          Builder.CreateExtractElement(BO->getOperand(1), EI.getOperand(1),
-                                       EI.getName()+".rhs");
-        return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(),
-                                                     newEI0, newEI1, BO);
-      }
-    } else if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
+  BinaryOperator *BO;
+  if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
+    // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
+    Value *X = BO->getOperand(0), *Y = BO->getOperand(1);
+    Value *E0 = Builder.CreateExtractElement(X, Index);
+    Value *E1 = Builder.CreateExtractElement(Y, Index);
+    return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO);
+  }
+
+  Value *X, *Y;
+  CmpInst::Predicate Pred;
+  if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
+      cheapToScalarize(SrcVec, IndexC)) {
+    // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
+    Value *E0 = Builder.CreateExtractElement(X, Index);
+    Value *E1 = Builder.CreateExtractElement(Y, Index);
+    return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1);
+  }
+
+  if (auto *I = dyn_cast<Instruction>(SrcVec)) {
+    if (auto *IE = dyn_cast<InsertElementInst>(I)) {
       // Extracting the inserted element?
-      if (IE->getOperand(2) == EI.getOperand(1))
+      if (IE->getOperand(2) == Index)
         return replaceInstUsesWith(EI, IE->getOperand(1));
       // If the inserted and extracted elements are constants, they must not
       // be the same value, extract from the pre-inserted value instead.
-      if (isa<Constant>(IE->getOperand(2)) && isa<Constant>(EI.getOperand(1))) {
-        Worklist.AddValue(EI.getOperand(0));
+      if (isa<Constant>(IE->getOperand(2)) && IndexC) {
+        Worklist.AddValue(SrcVec);
         EI.setOperand(0, IE->getOperand(0));
         return &EI;
       }
-    } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+    } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
       // If this is extracting an element from a shufflevector, figure out where
       // it came from and extract from the appropriate input element instead.
-      if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+      if (auto *Elt = dyn_cast<ConstantInt>(Index)) {
         int SrcIdx = SVI->getMaskValue(Elt->getZExtValue());
         Value *Src;
         unsigned LHSWidth =
@@ -355,13 +348,12 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
                                           ConstantInt::get(Int32Ty,
                                                            SrcIdx, false));
       }
-    } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    } else if (auto *CI = dyn_cast<CastInst>(I)) {
       // Canonicalize extractelement(cast) -> cast(extractelement).
       // Bitcasts can change the number of vector elements, and they cost
       // nothing.
       if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
-        Value *EE = Builder.CreateExtractElement(CI->getOperand(0),
-                                                 EI.getIndexOperand());
+        Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
         Worklist.AddValue(EE);
         return CastInst::Create(CI->getOpcode(), EE, EI.getType());
       }
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index a3962a04b500cbfa0fbf169f1e9cf6aa074a3c19..be7d43bbcf2c32fbfe25920e40f20b24fdef4369 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -183,7 +183,10 @@ bool InstCombiner::shouldChangeType(unsigned FromWidth,
 /// a fundamental type in IR, and there are many specialized optimizations for
 /// i1 types.
 bool InstCombiner::shouldChangeType(Type *From, Type *To) const {
-  assert(From->isIntegerTy() && To->isIntegerTy());
+  // TODO: This could be extended to allow vectors. Datalayout changes might be
+  // needed to properly support that.
+  if (!From->isIntegerTy() || !To->isIntegerTy())
+    return false;
 
   unsigned FromWidth = From->getPrimitiveSizeInBits();
   unsigned ToWidth = To->getPrimitiveSizeInBits();
@@ -1424,21 +1427,23 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
       V1->getType()->getVectorNumElements() <= NumElts) {
     assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() &&
            "Shuffle should not change scalar type");
-    unsigned V1Width = V1->getType()->getVectorNumElements();
+
     // Find constant NewC that has property:
     //   shuffle(NewC, ShMask) = C
     // If such constant does not exist (example: ShMask=<0,0> and C=<1,2>)
     // reorder is not possible. A 1-to-1 mapping is not required. Example:
     // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
+    bool ConstOp1 = isa<Constant>(RHS);
     SmallVector<int, 16> ShMask;
     ShuffleVectorInst::getShuffleMask(Mask, ShMask);
-    SmallVector<Constant *, 16>
-        NewVecC(V1Width, UndefValue::get(C->getType()->getScalarType()));
+    unsigned SrcVecNumElts = V1->getType()->getVectorNumElements();
+    UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType());
+    SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar);
     bool MayChange = true;
     for (unsigned I = 0; I < NumElts; ++I) {
+      Constant *CElt = C->getAggregateElement(I);
       if (ShMask[I] >= 0) {
         assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle");
-        Constant *CElt = C->getAggregateElement(I);
         Constant *NewCElt = NewVecC[ShMask[I]];
         // Bail out if:
         // 1. The constant vector contains a constant expression.
@@ -1447,26 +1452,41 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
         // 3. This is a widening shuffle that copies elements of V1 into the
         //    extended elements (extending with undef is allowed).
         if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt) ||
-            I >= V1Width) {
+            I >= SrcVecNumElts) {
           MayChange = false;
           break;
         }
         NewVecC[ShMask[I]] = CElt;
       }
+      // If this is a widening shuffle, we must be able to extend with undef
+      // elements. If the original binop does not produce an undef in the high
+      // lanes, then this transform is not safe.
+      // TODO: We could shuffle those non-undef constant values into the
+      //       result by using a constant vector (rather than an undef vector)
+      //       as operand 1 of the new binop, but that might be too aggressive
+      //       for target-independent shuffle creation.
+      if (I >= SrcVecNumElts) {
+        Constant *MaybeUndef =
+            ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt)
+                     : ConstantExpr::get(Opcode, CElt, UndefScalar);
+        if (!isa<UndefValue>(MaybeUndef)) {
+          MayChange = false;
+          break;
+        }
+      }
     }
     if (MayChange) {
       Constant *NewC = ConstantVector::get(NewVecC);
       // It may not be safe to execute a binop on a vector with undef elements
       // because the entire instruction can be folded to undef or create poison
       // that did not exist in the original code.
-      bool ConstOp1 = isa<Constant>(Inst.getOperand(1));
       if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
         NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1);
 
       // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
       // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
-      Value *NewLHS = isa<Constant>(LHS) ? NewC : V1;
-      Value *NewRHS = isa<Constant>(LHS) ? V1 : NewC;
+      Value *NewLHS = ConstOp1 ? V1 : NewC;
+      Value *NewRHS = ConstOp1 ? NewC : V1;
       return createBinOpShuffle(NewLHS, NewRHS, Mask);
     }
   }
@@ -2244,9 +2264,16 @@ static bool isAllocSiteRemovable(Instruction *AI,
 }
 
 Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
-  // If we have a malloc call which is only used in any amount of comparisons
-  // to null and free calls, delete the calls and replace the comparisons with
-  // true or false as appropriate.
+  // If we have a malloc call which is only used in any amount of comparisons to
+  // null and free calls, delete the calls and replace the comparisons with true
+  // or false as appropriate.
+
+  // This is based on the principle that we can substitute our own allocation
+  // function (which will never return null) rather than knowledge of the
+  // specific function being called. In some sense this can change the permitted
+  // outputs of a program (when we convert a malloc to an alloca, the fact that
+  // the allocation is now on the stack is potentially visible, for example),
+  // but we believe in a permissible manner.
   SmallVector<WeakTrackingVH, 64> Users;
 
   // If we are removing an alloca with a dbg.declare, insert dbg.value calls
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 42b8179f80052704ef6f9e828625f2de34ce8f05..2ede46774b24ebff28080fefea5e8d0581710326 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -109,6 +109,7 @@ static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
 static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
 static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30;
 static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kNetBSDKasan_ShadowOffset64 = 0xdfff900000000000;
 static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
 static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
 
@@ -344,10 +345,14 @@ static cl::opt<uint32_t> ClForceExperiment(
     cl::init(0));
 
 static cl::opt<bool>
-    ClUsePrivateAliasForGlobals("asan-use-private-alias",
-                                cl::desc("Use private aliases for global"
-                                         " variables"),
-                                cl::Hidden, cl::init(false));
+    ClUsePrivateAlias("asan-use-private-alias",
+                      cl::desc("Use private aliases for global variables"),
+                      cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+    ClUseOdrIndicator("asan-use-odr-indicator",
+                      cl::desc("Use odr indicators to improve ODR reporting"),
+                      cl::Hidden, cl::init(false));
 
 static cl::opt<bool>
     ClUseGlobalsGC("asan-globals-live-support",
@@ -436,8 +441,11 @@ public:
     for (auto MDN : Globals->operands()) {
       // Metadata node contains the global and the fields of "Entry".
       assert(MDN->getNumOperands() == 5);
-      auto *GV = mdconst::extract_or_null<GlobalVariable>(MDN->getOperand(0));
+      auto *V = mdconst::extract_or_null<Constant>(MDN->getOperand(0));
       // The optimizer may optimize away a global entirely.
+      if (!V) continue;
+      auto *StrippedV = V->stripPointerCasts();
+      auto *GV = dyn_cast<GlobalVariable>(StrippedV);
       if (!GV) continue;
       // We can already have an entry for GV if it was merged with another
       // global.
@@ -540,9 +548,12 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
       Mapping.Offset = kSystemZ_ShadowOffset64;
     else if (IsFreeBSD && !IsMIPS64)
       Mapping.Offset = kFreeBSD_ShadowOffset64;
-    else if (IsNetBSD)
-      Mapping.Offset = kNetBSD_ShadowOffset64;
-    else if (IsPS4CPU)
+    else if (IsNetBSD) {
+      if (IsKasan)
+        Mapping.Offset = kNetBSDKasan_ShadowOffset64;
+      else
+        Mapping.Offset = kNetBSD_ShadowOffset64;
+    } else if (IsPS4CPU)
       Mapping.Offset = kPS4CPU_ShadowOffset64;
     else if (IsLinux && IsX86_64) {
       if (IsKasan)
@@ -731,9 +742,12 @@ public:
 
   explicit AddressSanitizerModule(bool CompileKernel = false,
                                   bool Recover = false,
-                                  bool UseGlobalsGC = true)
-      : ModulePass(ID),
-        UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+                                  bool UseGlobalsGC = true,
+                                  bool UseOdrIndicator = false)
+      : ModulePass(ID), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+        // Enable aliases as they should have no downside with ODR indicators.
+        UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias),
+        UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator),
         // Not a typo: ClWithComdat is almost completely pointless without
         // ClUseGlobalsGC (because then it only works on modules without
         // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
@@ -742,11 +756,10 @@ public:
         // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
         // do globals-gc.
         UseCtorComdat(UseGlobalsGC && ClWithComdat) {
-          this->Recover = ClRecover.getNumOccurrences() > 0 ?
-              ClRecover : Recover;
-          this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
-              ClEnableKasan : CompileKernel;
-	}
+    this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+    this->CompileKernel =
+        ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel;
+  }
 
   bool runOnModule(Module &M) override;
   StringRef getPassName() const override { return "AddressSanitizerModule"; }
@@ -790,6 +803,8 @@ private:
   bool CompileKernel;
   bool Recover;
   bool UseGlobalsGC;
+  bool UsePrivateAlias;
+  bool UseOdrIndicator;
   bool UseCtorComdat;
   Type *IntptrTy;
   LLVMContext *C;
@@ -1089,9 +1104,11 @@ INITIALIZE_PASS(
 
 ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel,
                                                    bool Recover,
-                                                   bool UseGlobalsGC) {
+                                                   bool UseGlobalsGC,
+                                                   bool UseOdrIndicator) {
   assert(!CompileKernel || Recover);
-  return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC);
+  return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC,
+                                    UseOdrIndicator);
 }
 
 static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
@@ -2173,12 +2190,20 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
     bool CanUsePrivateAliases =
         TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
         TargetTriple.isOSBinFormatWasm();
-    if (CanUsePrivateAliases && ClUsePrivateAliasForGlobals) {
+    if (CanUsePrivateAliases && UsePrivateAlias) {
       // Create local alias for NewGlobal to avoid crash on ODR between
       // instrumented and non-instrumented libraries.
-      auto *GA = GlobalAlias::create(GlobalValue::InternalLinkage,
-                                     NameForGlobal + M.getName(), NewGlobal);
+      InstrumentedGlobal =
+          GlobalAlias::create(GlobalValue::PrivateLinkage, "", NewGlobal);
+    }
 
+    // ODR check is not useful for the following, but we see false reports
+    // caused by linker optimizations.
+    if (NewGlobal->hasLocalLinkage() || NewGlobal->hasLinkOnceODRLinkage() ||
+        NewGlobal->hasWeakODRLinkage()) {
+      ODRIndicator = ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1),
+                                               IRB.getInt8PtrTy());
+    } else if (UseOdrIndicator) {
       // With local aliases, we need to provide another externally visible
       // symbol __odr_asan_XXX to detect ODR violation.
       auto *ODRIndicatorSym =
@@ -2192,7 +2217,6 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
       ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
       ODRIndicatorSym->setAlignment(1);
       ODRIndicator = ODRIndicatorSym;
-      InstrumentedGlobal = GA;
     }
 
     Constant *Initializer = ConstantStruct::get(
diff --git a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 8f4159d3d191ce61ade22b1cd8b6aa03ccfe653a..1ada0b7130927aeed0efd624dde2d837de362e09 100644
--- a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -1416,7 +1416,7 @@ bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) {
 void CHR::sortScopes(SmallVectorImpl<CHRScope *> &Input,
                      SmallVectorImpl<CHRScope *> &Output) {
   Output.resize(Input.size());
-  std::copy(Input.begin(), Input.end(), Output.begin());
+  llvm::copy(Input, Output.begin());
   std::stable_sort(Output.begin(), Output.end(), CHRScopeSorter);
 }
 
@@ -2040,7 +2040,7 @@ bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) {
       getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   ProfileSummaryInfo &PSI =
-      *getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo();
   std::unique_ptr<OptimizationRemarkEmitter> OwnedORE =
       llvm::make_unique<OptimizationRemarkEmitter>(&F);
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 084e6b7e43690d50033ce4d2d32396a3d123c5e9..829123911c61b4f6c54bc0a832d72b9bccf36925 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -36,6 +36,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Regex.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
@@ -96,6 +97,11 @@ private:
   // profiling runtime to emit .gcda files when run.
   bool emitProfileArcs();
 
+  bool isFunctionInstrumented(const Function &F);
+  std::vector<Regex> createRegexesFromString(StringRef RegexesStr);
+  static bool doesFilenameMatchARegex(StringRef Filename,
+                                      std::vector<Regex> &Regexes);
+
   // Get pointers to the functions in the runtime library.
   Constant *getStartFileFunc();
   Constant *getEmitFunctionFunc();
@@ -125,6 +131,9 @@ private:
   const TargetLibraryInfo *TLI;
   LLVMContext *Ctx;
   SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
+  std::vector<Regex> FilterRe;
+  std::vector<Regex> ExcludeRe;
+  StringMap<bool> InstrumentedFiles;
 };
 
 class GCOVProfilerLegacyPass : public ModulePass {
@@ -171,6 +180,21 @@ static StringRef getFunctionName(const DISubprogram *SP) {
   return SP->getName();
 }
 
+/// Extract a filename for a DISubprogram.
+///
+/// Prefer relative paths in the coverage notes. Clang also may split
+/// up absolute paths into a directory and filename component. When
+/// the relative path doesn't exist, reconstruct the absolute path.
+SmallString<128> getFilename(const DISubprogram *SP) {
+  SmallString<128> Path;
+  StringRef RelPath = SP->getFilename();
+  if (sys::fs::exists(RelPath))
+    Path = RelPath;
+  else
+    sys::path::append(Path, SP->getDirectory(), SP->getFilename());
+  return Path;
+}
+
 namespace {
   class GCOVRecord {
    protected:
@@ -247,7 +271,7 @@ namespace {
     }
 
    private:
-    StringRef Filename;
+    std::string Filename;
     SmallVector<uint32_t, 32> Lines;
   };
 
@@ -368,8 +392,9 @@ namespace {
 
     void writeOut() {
       writeBytes(FunctionTag, 4);
+      SmallString<128> Filename = getFilename(SP);
       uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) +
-                          1 + lengthOfGCOVString(SP->getFilename()) + 1;
+                          1 + lengthOfGCOVString(Filename) + 1;
       if (UseCfgChecksum)
         ++BlockLen;
       write(BlockLen);
@@ -378,7 +403,7 @@ namespace {
       if (UseCfgChecksum)
         write(CfgChecksum);
       writeGCOVString(getFunctionName(SP));
-      writeGCOVString(SP->getFilename());
+      writeGCOVString(Filename);
       write(SP->getLine());
 
       // Emit count of blocks.
@@ -423,6 +448,72 @@ namespace {
   };
 }
 
+// RegexesStr is a string containing differents regex separated by a semi-colon.
+// For example "foo\..*$;bar\..*$".
+std::vector<Regex> GCOVProfiler::createRegexesFromString(StringRef RegexesStr) {
+  std::vector<Regex> Regexes;
+  while (!RegexesStr.empty()) {
+    std::pair<StringRef, StringRef> HeadTail = RegexesStr.split(';');
+    if (!HeadTail.first.empty()) {
+      Regex Re(HeadTail.first);
+      std::string Err;
+      if (!Re.isValid(Err)) {
+        Ctx->emitError(Twine("Regex ") + HeadTail.first +
+                       " is not valid: " + Err);
+      }
+      Regexes.emplace_back(std::move(Re));
+    }
+    RegexesStr = HeadTail.second;
+  }
+  return Regexes;
+}
+
+bool GCOVProfiler::doesFilenameMatchARegex(StringRef Filename,
+                                           std::vector<Regex> &Regexes) {
+  for (Regex &Re : Regexes) {
+    if (Re.match(Filename)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
+  if (FilterRe.empty() && ExcludeRe.empty()) {
+    return true;
+  }
+  SmallString<128> Filename = getFilename(F.getSubprogram());
+  auto It = InstrumentedFiles.find(Filename);
+  if (It != InstrumentedFiles.end()) {
+    return It->second;
+  }
+
+  SmallString<256> RealPath;
+  StringRef RealFilename;
+
+  // Path can be
+  // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for
+  // such a case we must get the real_path.
+  if (sys::fs::real_path(Filename, RealPath)) {
+    // real_path can fail with path like "foo.c".
+    RealFilename = Filename;
+  } else {
+    RealFilename = RealPath;
+  }
+
+  bool ShouldInstrument;
+  if (FilterRe.empty()) {
+    ShouldInstrument = !doesFilenameMatchARegex(RealFilename, ExcludeRe);
+  } else if (ExcludeRe.empty()) {
+    ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe);
+  } else {
+    ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe) &&
+                       !doesFilenameMatchARegex(RealFilename, ExcludeRe);
+  }
+  InstrumentedFiles[Filename] = ShouldInstrument;
+  return ShouldInstrument;
+}
+
 std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
                                      GCovFileType OutputType) {
   bool Notes = OutputType == GCovFileType::GCNO;
@@ -472,6 +563,9 @@ bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) {
 
   AddFlushBeforeForkAndExec();
 
+  FilterRe = createRegexesFromString(Options.Filter);
+  ExcludeRe = createRegexesFromString(Options.Exclude);
+
   if (Options.EmitNotes) emitProfileNotes();
   if (Options.EmitData) return emitProfileArcs();
   return false;
@@ -589,7 +683,8 @@ void GCOVProfiler::emitProfileNotes() {
     for (auto &F : M->functions()) {
       DISubprogram *SP = F.getSubprogram();
       if (!SP) continue;
-      if (!functionHasLines(F)) continue;
+      if (!functionHasLines(F) || !isFunctionInstrumented(F))
+        continue;
       // TODO: Functions using scope-based EH are currently not supported.
       if (isUsingScopeBasedEH(F)) continue;
 
@@ -609,7 +704,8 @@ void GCOVProfiler::emitProfileNotes() {
       // Add the function line number to the lines of the entry block
       // to have a counter for the function definition.
       uint32_t Line = SP->getLine();
-      Func.getBlock(&EntryBlock).getFile(SP->getFilename()).addLine(Line);
+      auto Filename = getFilename(SP);
+      Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line);
 
       for (auto &BB : F) {
         GCOVBlock &Block = Func.getBlock(&BB);
@@ -640,7 +736,7 @@ void GCOVProfiler::emitProfileNotes() {
           if (SP != getDISubprogram(Loc.getScope()))
             continue;
 
-          GCOVLines &Lines = Block.getFile(SP->getFilename());
+          GCOVLines &Lines = Block.getFile(Filename);
           Lines.addLine(Loc.getLine());
         }
         Line = 0;
@@ -673,7 +769,8 @@ bool GCOVProfiler::emitProfileArcs() {
     for (auto &F : M->functions()) {
       DISubprogram *SP = F.getSubprogram();
       if (!SP) continue;
-      if (!functionHasLines(F)) continue;
+      if (!functionHasLines(F) || !isFunctionInstrumented(F))
+        continue;
       // TODO: Functions using scope-based EH are currently not supported.
       if (isUsingScopeBasedEH(F)) continue;
       if (!Result) Result = true;
diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 91021604169a6936e642567291202fe15ec31b50..042e8ea0d2945fea576259ec523aa97442ef75b0 100644
--- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -309,15 +309,24 @@ bool HWAddressSanitizer::doInitialization(Module &M) {
                                             kHwasanInitName,
                                             /*InitArgTypes=*/{},
                                             /*InitArgs=*/{});
-    appendToGlobalCtors(M, HwasanCtorFunction, 0);
-  }
+    Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName);
+    HwasanCtorFunction->setComdat(CtorComdat);
+    appendToGlobalCtors(M, HwasanCtorFunction, 0, HwasanCtorFunction);
+
+    // Create a zero-length global in __hwasan_frame so that the linker will
+    // always create start and stop symbols.
+    //
+    // N.B. If we ever start creating associated metadata in this pass this
+    // global will need to be associated with the ctor.
+    Type *Int8Arr0Ty = ArrayType::get(Int8Ty, 0);
+    auto GV =
+        new GlobalVariable(M, Int8Arr0Ty, /*isConstantGlobal*/ true,
+                           GlobalVariable::PrivateLinkage,
+                           Constant::getNullValue(Int8Arr0Ty), "__hwasan");
+    GV->setSection(getFrameSection());
+    GV->setComdat(CtorComdat);
+    appendToCompilerUsed(M, GV);
 
-  // Create a call to __hwasan_init_frames.
-  if (HwasanCtorFunction) {
-    // Create a dummy frame description for the CTOR function.
-    // W/o it we would have to create the call to __hwasan_init_frames after
-    // all functions are instrumented (i.e. need to have a ModulePass).
-    createFrameGlobal(*HwasanCtorFunction, "");
     IRBuilder<> IRBCtor(HwasanCtorFunction->getEntryBlock().getTerminator());
     IRBCtor.CreateCall(
         declareSanitizerInitFunction(M, "__hwasan_init_frames",
@@ -742,10 +751,9 @@ void HWAddressSanitizer::createFrameGlobal(Function &F,
   GV->setSection(getFrameSection());
   appendToCompilerUsed(M, GV);
   // Put GV into the F's Comadat so that if F is deleted GV can be deleted too.
-  if (&F != HwasanCtorFunction)
-    if (auto Comdat =
-            GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
-      GV->setComdat(Comdat);
+  if (auto Comdat =
+          GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
+    GV->setComdat(Comdat);
 }
 
 Value *HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB,
diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index d839dd21a239c4fd379d7c9ab7a638dce9c8853f..7edc53d07c7565fd3892d709744ad2bffef9a975 100644
--- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -427,7 +427,7 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
 
 bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
   ProfileSummaryInfo *PSI =
-      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 
   // Command-line option has the priority for InLTO.
   return promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 62da93002539e45d56c12fe285ba4a286ed8415f..15b94388cbe51429b513478eaee6f1af8e251742 100644
--- a/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -701,6 +701,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Module &M) {
   // Use linker script magic to get data/cnts/name start/end.
   if (Triple(M.getTargetTriple()).isOSLinux() ||
       Triple(M.getTargetTriple()).isOSFreeBSD() ||
+      Triple(M.getTargetTriple()).isOSNetBSD() ||
       Triple(M.getTargetTriple()).isOSFuchsia() ||
       Triple(M.getTargetTriple()).isPS4CPU())
     return false;
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 960c1f42900c63ddb4e02a7bf5cc342fd0eb45f5..0bbf3a90b9560c13a59af88d3f574a2168634bec 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -255,10 +255,13 @@ static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
 // passed into an assembly call. Note that this may cause false positives.
 // Because it's impossible to figure out the array sizes, we can only unpoison
 // the first sizeof(type) bytes for each type* pointer.
+// The instrumentation is only enabled in KMSAN builds, and only if
+// -msan-handle-asm-conservative is on. This is done because we may want to
+// quickly disable assembly instrumentation when it breaks.
 static cl::opt<bool> ClHandleAsmConservative(
     "msan-handle-asm-conservative",
     cl::desc("conservative handling of inline assembly"), cl::Hidden,
-    cl::init(false));
+    cl::init(true));
 
 // This flag controls whether we check the shadow of the address
 // operand of load or store. Such bugs are very rare, since load from
@@ -3118,7 +3121,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       // outputs as clean. Note that any side effects of the inline asm that are
       // not immediately visible in its constraints are not handled.
       if (Call->isInlineAsm()) {
-        if (ClHandleAsmConservative)
+        if (ClHandleAsmConservative && MS.CompileKernel)
           visitAsmInstruction(I);
         else
           visitInstruction(I);
diff --git a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index ba4924c9cb2d35d0aacfc1c79bad4550d3431382..7f6b157304a3808589fe4245e6a120bc9469af18 100644
--- a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -26,6 +26,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -74,27 +75,27 @@ public:
 
     switch (kind) {
     case ARCRuntimeEntryPointKind::AutoreleaseRV:
-      return getI8XRetI8XEntryPoint(AutoreleaseRV,
-                                    "objc_autoreleaseReturnValue", true);
+      return getIntrinsicEntryPoint(AutoreleaseRV,
+                                    Intrinsic::objc_autoreleaseReturnValue);
     case ARCRuntimeEntryPointKind::Release:
-      return getVoidRetI8XEntryPoint(Release, "objc_release");
+      return getIntrinsicEntryPoint(Release, Intrinsic::objc_release);
     case ARCRuntimeEntryPointKind::Retain:
-      return getI8XRetI8XEntryPoint(Retain, "objc_retain", true);
+      return getIntrinsicEntryPoint(Retain, Intrinsic::objc_retain);
     case ARCRuntimeEntryPointKind::RetainBlock:
-      return getI8XRetI8XEntryPoint(RetainBlock, "objc_retainBlock", false);
+      return getIntrinsicEntryPoint(RetainBlock, Intrinsic::objc_retainBlock);
     case ARCRuntimeEntryPointKind::Autorelease:
-      return getI8XRetI8XEntryPoint(Autorelease, "objc_autorelease", true);
+      return getIntrinsicEntryPoint(Autorelease, Intrinsic::objc_autorelease);
     case ARCRuntimeEntryPointKind::StoreStrong:
-      return getI8XRetI8XXI8XEntryPoint(StoreStrong, "objc_storeStrong");
+      return getIntrinsicEntryPoint(StoreStrong, Intrinsic::objc_storeStrong);
     case ARCRuntimeEntryPointKind::RetainRV:
-      return getI8XRetI8XEntryPoint(RetainRV,
-                                    "objc_retainAutoreleasedReturnValue", true);
+      return getIntrinsicEntryPoint(RetainRV,
+                                Intrinsic::objc_retainAutoreleasedReturnValue);
     case ARCRuntimeEntryPointKind::RetainAutorelease:
-      return getI8XRetI8XEntryPoint(RetainAutorelease, "objc_retainAutorelease",
-                                    true);
+      return getIntrinsicEntryPoint(RetainAutorelease,
+                                    Intrinsic::objc_retainAutorelease);
     case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
-      return getI8XRetI8XEntryPoint(RetainAutoreleaseRV,
-                                    "objc_retainAutoreleaseReturnValue", true);
+      return getIntrinsicEntryPoint(RetainAutoreleaseRV,
+                                Intrinsic::objc_retainAutoreleaseReturnValue);
     }
 
     llvm_unreachable("Switch should be a covered switch.");
@@ -131,54 +132,11 @@ private:
   /// Declaration for objc_retainAutoreleaseReturnValue().
   Constant *RetainAutoreleaseRV = nullptr;
 
-  Constant *getVoidRetI8XEntryPoint(Constant *&Decl, StringRef Name) {
+  Constant *getIntrinsicEntryPoint(Constant *&Decl, Intrinsic::ID IntID) {
     if (Decl)
       return Decl;
 
-    LLVMContext &C = TheModule->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttributeList Attr = AttributeList().addAttribute(
-        C, AttributeList::FunctionIndex, Attribute::NoUnwind);
-    FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
-                                          /*isVarArg=*/false);
-    return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
-  }
-
-  Constant *getI8XRetI8XEntryPoint(Constant *&Decl, StringRef Name,
-                                   bool NoUnwind = false) {
-    if (Decl)
-      return Decl;
-
-    LLVMContext &C = TheModule->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *Params[] = { I8X };
-    FunctionType *Fty = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeList Attr = AttributeList();
-
-    if (NoUnwind)
-      Attr = Attr.addAttribute(C, AttributeList::FunctionIndex,
-                               Attribute::NoUnwind);
-
-    return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
-  }
-
-  Constant *getI8XRetI8XXI8XEntryPoint(Constant *&Decl, StringRef Name) {
-    if (Decl)
-      return Decl;
-
-    LLVMContext &C = TheModule->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *I8XX = PointerType::getUnqual(I8X);
-    Type *Params[] = { I8XX, I8X };
-
-    AttributeList Attr = AttributeList().addAttribute(
-        C, AttributeList::FunctionIndex, Attribute::NoUnwind);
-    Attr = Attr.addParamAttribute(C, 0, Attribute::NoCapture);
-
-    FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
-                                          /*isVarArg=*/false);
-
-    return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
+    return Decl = Intrinsic::getDeclaration(TheModule, IntID);
   }
 };
 
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 1f1ea9f5873923020ed3be9b7a1e576ce059640a..abe2871c0b8f97f7f411de2a0f68a4a4a848e2fe 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -522,7 +522,7 @@ bool ObjCARCContract::tryToPeepholeInstruction(
         TailOkForStoreStrongs = false;
       return true;
     case ARCInstKind::IntrinsicUser:
-      // Remove calls to @clang.arc.use(...).
+      // Remove calls to @llvm.objc.clang.arc.use(...).
       Inst->eraseFromParent();
       return true;
     default:
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index 3a8ef073cb48ade72987474e753c14c9ce76eef7..f63182e57c1a3186015848994ed9cf3a947efc5a 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -38,7 +38,8 @@ STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
 /// instruction may need to be cleared of assumptions that can no longer be
 /// guaranteed correct.
 static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
-  assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?");
+  assert(I->getType()->isIntOrIntVectorTy() &&
+         "Trivializing a non-integer value?");
 
   // Initialize the worklist with eligible direct users.
   SmallVector<Instruction *, 16> WorkList;
@@ -46,13 +47,13 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
     // If all bits of a user are demanded, then we know that nothing below that
     // in the def-use chain needs to be changed.
     auto *J = dyn_cast<Instruction>(JU);
-    if (J && J->getType()->isSized() &&
+    if (J && J->getType()->isIntOrIntVectorTy() &&
         !DB.getDemandedBits(J).isAllOnesValue())
       WorkList.push_back(J);
 
-    // Note that we need to check for unsized types above before asking for
+    // Note that we need to check for non-int types above before asking for
     // demanded bits. Normally, the only way to reach an instruction with an
-    // unsized type is via an instruction that has side effects (or otherwise
+    // non-int type is via an instruction that has side effects (or otherwise
     // will demand its input bits). However, if we have a readnone function
     // that returns an unsized type (e.g., void), we must avoid asking for the
     // demanded bits of the function call's return value. A void-returning
@@ -78,7 +79,7 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
       // If all bits of a user are demanded, then we know that nothing below
       // that in the def-use chain needs to be changed.
       auto *K = dyn_cast<Instruction>(KU);
-      if (K && !Visited.count(K) && K->getType()->isSized() &&
+      if (K && !Visited.count(K) && K->getType()->isIntOrIntVectorTy() &&
           !DB.getDemandedBits(K).isAllOnesValue())
         WorkList.push_back(K);
     }
@@ -95,7 +96,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
     if (I.mayHaveSideEffects() && I.use_empty())
       continue;
 
-    if (I.getType()->isIntegerTy() &&
+    if (I.getType()->isIntOrIntVectorTy() &&
         !DB.getDemandedBits(&I).getBoolValue()) {
       // For live instructions that have all dead bits, first make them dead by
       // replacing all uses with something else. Then, if they don't need to
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index fce37d4bffb8f509079b14f7b9192e3cb408aa54..e3548ce5cd0afdd7472a9107d66939de49709ff6 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -45,6 +45,7 @@ add_llvm_library(LLVMScalarOpts
   LowerAtomic.cpp
   LowerExpectIntrinsic.cpp
   LowerGuardIntrinsic.cpp
+  MakeGuardsExplicit.cpp
   MemCpyOptimizer.cpp
   MergeICmps.cpp
   MergedLoadStoreMotion.cpp
@@ -68,6 +69,7 @@ add_llvm_library(LLVMScalarOpts
   StraightLineStrengthReduce.cpp
   StructurizeCFG.cpp
   TailRecursionElimination.cpp
+  WarnMissedTransforms.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
index b9e8e3424ccfe61e80587adad135b83daf69b056..a806d6faed60a6333eedf4529647366b41f9322a 100644
--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -149,14 +149,14 @@ static void recordCondition(CallSite CS, BasicBlock *From, BasicBlock *To,
 
 /// Record ICmp conditions relevant to any argument in CS following Pred's
 /// single predecessors. If there are conflicting conditions along a path, like
-/// x == 1 and x == 0, the first condition will be used.
+/// x == 1 and x == 0, the first condition will be used. We stop once we reach
+/// an edge to StopAt.
 static void recordConditions(CallSite CS, BasicBlock *Pred,
-                             ConditionsTy &Conditions) {
-  recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
+                             ConditionsTy &Conditions, BasicBlock *StopAt) {
   BasicBlock *From = Pred;
   BasicBlock *To = Pred;
   SmallPtrSet<BasicBlock *, 4> Visited;
-  while (!Visited.count(From->getSinglePredecessor()) &&
+  while (To != StopAt && !Visited.count(From->getSinglePredecessor()) &&
          (From = From->getSinglePredecessor())) {
     recordCondition(CS, From, To, Conditions);
     Visited.insert(From);
@@ -302,7 +302,7 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
 static void splitCallSite(
     CallSite CS,
     const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
-    DominatorTree *DT) {
+    DomTreeUpdater &DTU) {
   Instruction *Instr = CS.getInstruction();
   BasicBlock *TailBB = Instr->getParent();
   bool IsMustTailCall = CS.isMustTailCall();
@@ -327,7 +327,7 @@ static void splitCallSite(
     BasicBlock *PredBB = Preds[i].first;
     BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
         TailBB, PredBB, &*std::next(Instr->getIterator()), ValueToValueMaps[i],
-        DT);
+        DTU);
     assert(SplitBlock && "Unexpected new basic block split.");
 
     Instruction *NewCI =
@@ -365,11 +365,13 @@ static void splitCallSite(
     // attempting removal.
     SmallVector<BasicBlock *, 2> Splits(predecessors((TailBB)));
     assert(Splits.size() == 2 && "Expected exactly 2 splits!");
-    for (unsigned i = 0; i < Splits.size(); i++)
+    for (unsigned i = 0; i < Splits.size(); i++) {
       Splits[i]->getTerminator()->eraseFromParent();
+      DTU.deleteEdge(Splits[i], TailBB);
+    }
 
     // Erase the tail block once done with musttail patching
-    TailBB->eraseFromParent();
+    DTU.deleteBB(TailBB);
     return;
   }
 
@@ -438,48 +440,73 @@ static bool isPredicatedOnPHI(CallSite CS) {
   return false;
 }
 
-static bool tryToSplitOnPHIPredicatedArgument(CallSite CS, DominatorTree *DT) {
+using PredsWithCondsTy = SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2>;
+
+// Check if any of the arguments in CS are predicated on a PHI node and return
+// the set of predecessors we should use for splitting.
+static PredsWithCondsTy shouldSplitOnPHIPredicatedArgument(CallSite CS) {
   if (!isPredicatedOnPHI(CS))
-    return false;
+    return {};
 
   auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
-  SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS = {
-      {Preds[0], {}}, {Preds[1], {}}};
-  splitCallSite(CS, PredsCS, DT);
-  return true;
+  return {{Preds[0], {}}, {Preds[1], {}}};
 }
 
-static bool tryToSplitOnPredicatedArgument(CallSite CS, DominatorTree *DT) {
+// Checks if any of the arguments in CS are predicated in a predecessor and
+// returns a list of predecessors with the conditions that hold on their edges
+// to CS.
+static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallSite CS,
+                                                        DomTreeUpdater &DTU) {
   auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
   if (Preds[0] == Preds[1])
-    return false;
+    return {};
+
+  // We can stop recording conditions once we reached the immediate dominator
+  // for the block containing the call site. Conditions in predecessors of the
+  // that node will be the same for all paths to the call site and splitting
+  // is not beneficial.
+  assert(DTU.hasDomTree() && "We need a DTU with a valid DT!");
+  auto *CSDTNode = DTU.getDomTree().getNode(CS.getInstruction()->getParent());
+  BasicBlock *StopAt = CSDTNode ? CSDTNode->getIDom()->getBlock() : nullptr;
 
   SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
   for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
     ConditionsTy Conditions;
-    recordConditions(CS, Pred, Conditions);
+    // Record condition on edge BB(CS) <- Pred
+    recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
+    // Record conditions following Pred's single predecessors.
+    recordConditions(CS, Pred, Conditions, StopAt);
     PredsCS.push_back({Pred, Conditions});
   }
 
   if (all_of(PredsCS, [](const std::pair<BasicBlock *, ConditionsTy> &P) {
         return P.second.empty();
       }))
-    return false;
+    return {};
 
-  splitCallSite(CS, PredsCS, DT);
-  return true;
+  return PredsCS;
 }
 
 static bool tryToSplitCallSite(CallSite CS, TargetTransformInfo &TTI,
-                               DominatorTree *DT) {
+                               DomTreeUpdater &DTU) {
+  // Check if we can split the call site.
   if (!CS.arg_size() || !canSplitCallSite(CS, TTI))
     return false;
-  return tryToSplitOnPredicatedArgument(CS, DT) ||
-         tryToSplitOnPHIPredicatedArgument(CS, DT);
+
+  auto PredsWithConds = shouldSplitOnPredicatedArgument(CS, DTU);
+  if (PredsWithConds.empty())
+    PredsWithConds = shouldSplitOnPHIPredicatedArgument(CS);
+  if (PredsWithConds.empty())
+    return false;
+
+  splitCallSite(CS, PredsWithConds, DTU);
+  return true;
 }
 
 static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
-                                TargetTransformInfo &TTI, DominatorTree *DT) {
+                                TargetTransformInfo &TTI, DominatorTree &DT) {
+
+  DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
   bool Changed = false;
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
     BasicBlock &BB = *BI++;
@@ -503,7 +530,7 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
       // Check if such path is possible before attempting the splitting.
       bool IsMustTail = CS.isMustTailCall();
 
-      Changed |= tryToSplitCallSite(CS, TTI, DT);
+      Changed |= tryToSplitCallSite(CS, TTI, DTU);
 
       // There're no interesting instructions after this. The call site
       // itself might have been erased on splitting.
@@ -524,6 +551,7 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
@@ -534,9 +562,8 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
 
     auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
     auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-    return doCallSiteSplitting(F, TLI, TTI,
-                               DTWP ? &DTWP->getDomTree() : nullptr);
+    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    return doCallSiteSplitting(F, TLI, TTI, DT);
   }
 };
 } // namespace
@@ -546,6 +573,7 @@ INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
                       "Call-site splitting", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
                     "Call-site splitting", false, false)
 FunctionPass *llvm::createCallSiteSplittingPass() {
@@ -556,7 +584,7 @@ PreservedAnalyses CallSiteSplittingPass::run(Function &F,
                                              FunctionAnalysisManager &AM) {
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
-  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
 
   if (!doCallSiteSplitting(F, TLI, TTI, DT))
     return PreservedAnalyses::all();
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index 46915889ce7cd6da8101ce791c4caf85cd1fc460..51032b0625f886a3c0a466fecf75c6493d0464e1 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -18,21 +18,25 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/DebugCounter.h"
 #include "llvm/Transforms/Scalar.h"
-#include <set>
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "constprop"
 
 STATISTIC(NumInstKilled, "Number of instructions killed");
+DEBUG_COUNTER(CPCounter, "constprop-transform",
+              "Controls which instructions are killed");
 
 namespace {
   struct ConstantPropagation : public FunctionPass {
@@ -66,9 +70,15 @@ bool ConstantPropagation::runOnFunction(Function &F) {
     return false;
 
   // Initialize the worklist to all of the instructions ready to process...
-  std::set<Instruction*> WorkList;
-  for (Instruction &I: instructions(&F))
+  SmallPtrSet<Instruction *, 16> WorkList;
+  // The SmallVector of WorkList ensures that we do iteration at stable order.
+  // We use two containers rather than one SetVector, since remove is
+  // linear-time, and we don't care enough to remove from Vec.
+  SmallVector<Instruction *, 16> WorkListVec;
+  for (Instruction &I : instructions(&F)) {
     WorkList.insert(&I);
+    WorkListVec.push_back(&I);
+  }
 
   bool Changed = false;
   const DataLayout &DL = F.getParent()->getDataLayout();
@@ -76,29 +86,36 @@ bool ConstantPropagation::runOnFunction(Function &F) {
       &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   while (!WorkList.empty()) {
-    Instruction *I = *WorkList.begin();
-    WorkList.erase(WorkList.begin());    // Get an element from the worklist...
-
-    if (!I->use_empty())                 // Don't muck with dead instructions...
-      if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
-        // Add all of the users of this instruction to the worklist, they might
-        // be constant propagatable now...
-        for (User *U : I->users())
-          WorkList.insert(cast<Instruction>(U));
-
-        // Replace all of the uses of a variable with uses of the constant.
-        I->replaceAllUsesWith(C);
-
-        // Remove the dead instruction.
-        WorkList.erase(I);
-        if (isInstructionTriviallyDead(I, TLI)) {
-          I->eraseFromParent();
-          ++NumInstKilled;
+    SmallVector<Instruction*, 16> NewWorkListVec;
+    for (auto *I : WorkListVec) {
+      WorkList.erase(I); // Remove element from the worklist...
+
+      if (!I->use_empty()) // Don't muck with dead instructions...
+        if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
+          if (!DebugCounter::shouldExecute(CPCounter))
+            continue;
+
+          // Add all of the users of this instruction to the worklist, they might
+          // be constant propagatable now...
+          for (User *U : I->users()) {
+            // If user not in the set, then add it to the vector.
+            if (WorkList.insert(cast<Instruction>(U)).second)
+              NewWorkListVec.push_back(cast<Instruction>(U));
+          }
+
+          // Replace all of the uses of a variable with uses of the constant.
+          I->replaceAllUsesWith(C);
+
+          if (isInstructionTriviallyDead(I, TLI)) {
+            I->eraseFromParent();
+            ++NumInstKilled;
+          }
+
+          // We made a change to the function...
+          Changed = true;
         }
-
-        // We made a change to the function...
-        Changed = true;
-      }
+    }
+    WorkListVec = std::move(NewWorkListVec);
   }
   return Changed;
 }
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 99402985f28dc0dc186649561845fcf80690378e..d0105701c73fcc3478fe0eb6b3fd884bad83f7b7 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -273,10 +273,11 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
 /// information is sufficient to prove this comparison. Even for local
 /// conditions, this can sometimes prove conditions instcombine can't by
 /// exploiting range information.
-static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
-  Value *Op0 = C->getOperand(0);
-  Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
-  if (!Op1) return false;
+static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
+  Value *Op0 = Cmp->getOperand(0);
+  auto *C = dyn_cast<Constant>(Cmp->getOperand(1));
+  if (!C)
+    return false;
 
   // As a policy choice, we choose not to waste compile time on anything where
   // the comparison is testing local values.  While LVI can sometimes reason
@@ -284,20 +285,18 @@ static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
   // the block local query for uses from terminator instructions, but that's
   // handled in the code for each terminator.
   auto *I = dyn_cast<Instruction>(Op0);
-  if (I && I->getParent() == C->getParent())
+  if (I && I->getParent() == Cmp->getParent())
     return false;
 
   LazyValueInfo::Tristate Result =
-    LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);
-  if (Result == LazyValueInfo::Unknown) return false;
+      LVI->getPredicateAt(Cmp->getPredicate(), Op0, C, Cmp);
+  if (Result == LazyValueInfo::Unknown)
+    return false;
 
   ++NumCmps;
-  if (Result == LazyValueInfo::True)
-    C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
-  else
-    C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
-  C->eraseFromParent();
-
+  Constant *TorF = ConstantInt::get(Type::getInt1Ty(Cmp->getContext()), Result);
+  Cmp->replaceAllUsesWith(TorF);
+  Cmp->eraseFromParent();
   return true;
 }
 
@@ -308,7 +307,8 @@ static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
 /// that cannot fire no matter what the incoming edge can safely be removed. If
 /// a case fires on every incoming edge then the entire switch can be removed
 /// and replaced with a branch to the case destination.
-static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI, DominatorTree *DT) {
+static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI,
+                          DominatorTree *DT) {
   DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
   Value *Cond = SI->getCondition();
   BasicBlock *BB = SI->getParent();
@@ -430,23 +430,21 @@ static bool willNotOverflow(IntrinsicInst *II, LazyValueInfo *LVI) {
 }
 
 static void processOverflowIntrinsic(IntrinsicInst *II) {
+  IRBuilder<> B(II);
   Value *NewOp = nullptr;
   switch (II->getIntrinsicID()) {
   default:
     llvm_unreachable("Unexpected instruction.");
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::sadd_with_overflow:
-    NewOp = BinaryOperator::CreateAdd(II->getOperand(0), II->getOperand(1),
-                                      II->getName(), II);
+    NewOp = B.CreateAdd(II->getOperand(0), II->getOperand(1), II->getName());
     break;
   case Intrinsic::usub_with_overflow:
   case Intrinsic::ssub_with_overflow:
-    NewOp = BinaryOperator::CreateSub(II->getOperand(0), II->getOperand(1),
-                                      II->getName(), II);
+    NewOp = B.CreateSub(II->getOperand(0), II->getOperand(1), II->getName());
     break;
   }
   ++NumOverflows;
-  IRBuilder<> B(II);
   Value *NewI = B.CreateInsertValue(UndefValue::get(II->getType()), NewOp, 0);
   NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(II->getContext()), 1);
   II->replaceAllUsesWith(NewI);
@@ -528,17 +526,17 @@ static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
     return false;
 
   ++NumUDivs;
+  IRBuilder<> B{Instr};
   auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
-  auto *LHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(0), TruncTy,
-                               Instr->getName() + ".lhs.trunc", Instr);
-  auto *RHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(1), TruncTy,
-                               Instr->getName() + ".rhs.trunc", Instr);
-  auto *BO =
-      BinaryOperator::Create(Instr->getOpcode(), LHS, RHS, Instr->getName(), Instr);
-  auto *Zext = CastInst::Create(Instruction::ZExt, BO, Instr->getType(),
-                                Instr->getName() + ".zext", Instr);
-  if (BO->getOpcode() == Instruction::UDiv)
-    BO->setIsExact(Instr->isExact());
+  auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy,
+                                     Instr->getName() + ".lhs.trunc");
+  auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy,
+                                     Instr->getName() + ".rhs.trunc");
+  auto *BO = B.CreateBinOp(Instr->getOpcode(), LHS, RHS, Instr->getName());
+  auto *Zext = B.CreateZExt(BO, Instr->getType(), Instr->getName() + ".zext");
+  if (auto *BinOp = dyn_cast<BinaryOperator>(BO))
+    if (BinOp->getOpcode() == Instruction::UDiv)
+      BinOp->setIsExact(Instr->isExact());
 
   Instr->replaceAllUsesWith(Zext);
   Instr->eraseFromParent();
@@ -552,6 +550,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
   ++NumSRems;
   auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1),
                                         SDI->getName(), SDI);
+  BO->setDebugLoc(SDI->getDebugLoc());
   SDI->replaceAllUsesWith(BO);
   SDI->eraseFromParent();
 
@@ -573,6 +572,7 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
   ++NumSDivs;
   auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1),
                                         SDI->getName(), SDI);
+  BO->setDebugLoc(SDI->getDebugLoc());
   BO->setIsExact(SDI->isExact());
   SDI->replaceAllUsesWith(BO);
   SDI->eraseFromParent();
@@ -595,6 +595,7 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
   ++NumAShrs;
   auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
                                         SDI->getName(), SDI);
+  BO->setDebugLoc(SDI->getDebugLoc());
   BO->setIsExact(SDI->isExact());
   SDI->replaceAllUsesWith(BO);
   SDI->eraseFromParent();
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 11009d501a9c89a6351af7fde3981589a5cebd6d..1f09979b33824da181180b37bac02b36d2ec9d11 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -812,7 +812,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
         LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
         continue;
       }
-      salvageDebugInfo(*Inst);
+      if (!salvageDebugInfo(*Inst))
+        replaceDbgUsesWithUndef(Inst);
       removeMSSA(Inst);
       Inst->eraseFromParent();
       Changed = true;
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index c080c2a1813de93c49048cf0f401bf7570ff224e..440ea4a5bc789668d2f4cf39d14d20ef37460ce0 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -2156,6 +2156,16 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
   if (isa<CmpInst>(CurInst))
     return false;
 
+  // Don't do PRE on GEPs. The inserted PHI would prevent CodeGenPrepare from
+  // sinking the addressing mode computation back to its uses. Extending the
+  // GEP's live range increases the register pressure, and therefore it can
+  // introduce unnecessary spills.
+  //
+  // This doesn't prevent Load PRE. PHI translation will make the GEP available
+  // to the load by moving it to the predecessor block if necessary.
+  if (isa<GetElementPtrInst>(CurInst))
+    return false;
+
   // We don't currently value number ANY inline asm calls.
   if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
     if (CallI->isInlineAsm())
diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp
index 187c668c338584a935ab8266b35e86ac3aeac7dd..1df5f5400c148f508f1f965311caa6755b5abc5f 100644
--- a/lib/Transforms/Scalar/GVNSink.cpp
+++ b/lib/Transforms/Scalar/GVNSink.cpp
@@ -258,14 +258,14 @@ public:
   /// Create a PHI from an array of incoming values and incoming blocks.
   template <typename VArray, typename BArray>
   ModelledPHI(const VArray &V, const BArray &B) {
-    std::copy(V.begin(), V.end(), std::back_inserter(Values));
-    std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+    llvm::copy(V, std::back_inserter(Values));
+    llvm::copy(B, std::back_inserter(Blocks));
   }
 
   /// Create a PHI from [I[OpNum] for I in Insts].
   template <typename BArray>
   ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
-    std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+    llvm::copy(B, std::back_inserter(Blocks));
     for (auto *I : Insts)
       Values.push_back(I->getOperand(OpNum));
   }
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index ec51ad71abc517e92c16ddca63d1437a6f0a0441..48d8e457ba7c1d561b28a6b935c5b5e51749ed64 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -145,6 +145,7 @@ class IndVarSimplify {
   bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
   bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
   bool rewriteFirstIterationLoopExitValues(Loop *L);
+  bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) const;
 
   bool linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
                                  PHINode *IndVar, SCEVExpander &Rewriter);
@@ -524,6 +525,29 @@ struct RewritePhi {
 // As a side effect, reduces the amount of IV processing within the loop.
 //===----------------------------------------------------------------------===//
 
+bool IndVarSimplify::hasHardUserWithinLoop(const Loop *L, const Instruction *I) const {
+  SmallPtrSet<const Instruction *, 8> Visited;
+  SmallVector<const Instruction *, 8> WorkList;
+  Visited.insert(I);
+  WorkList.push_back(I);
+  while (!WorkList.empty()) {
+    const Instruction *Curr = WorkList.pop_back_val();
+    // This use is outside the loop, nothing to do.
+    if (!L->contains(Curr))
+      continue;
+    // Do we assume it is a "hard" use which will not be eliminated easily?
+    if (Curr->mayHaveSideEffects())
+      return true;
+    // Otherwise, add all its users to worklist.
+    for (auto U : Curr->users()) {
+      auto *UI = cast<Instruction>(U);
+      if (Visited.insert(UI).second)
+        WorkList.push_back(UI);
+    }
+  }
+  return false;
+}
+
 /// Check to see if this loop has a computable loop-invariant execution count.
 /// If so, this means that we can compute the final value of any expressions
 /// that are recurrent in the loop, and substitute the exit values from the loop
@@ -598,19 +622,8 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
         // Computing the value outside of the loop brings no benefit if it is
         // definitely used inside the loop in a way which can not be optimized
         // away.
-        if (ExitValue->getSCEVType()>=scMulExpr) {
-          bool HasHardInternalUses = false;
-          for (auto *IB : Inst->users()) {
-            Instruction *UseInstr = cast<Instruction>(IB);
-            unsigned Opc = UseInstr->getOpcode();
-            if (L->contains(UseInstr) && Opc == Instruction::Call) {
-              HasHardInternalUses = true;
-              break;
-            }
-          }
-          if (HasHardInternalUses)
-            continue;
-        }
+        if (!isa<SCEVConstant>(ExitValue) && hasHardUserWithinLoop(L, Inst))
+          continue;
 
         bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
         Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 849ff71e198b6033402d0db5e8712e44cbbbd83a..14296293ddaa67609b9565c1d513379b0d335c37 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -574,9 +574,11 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
 /// BB in the result vector.
 ///
 /// This returns true if there were any known values.
-bool JumpThreadingPass::ComputeValueKnownInPredecessors(
+bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
     Value *V, BasicBlock *BB, PredValueInfo &Result,
-    ConstantPreference Preference, Instruction *CxtI) {
+    ConstantPreference Preference,
+    DenseSet<std::pair<Value *, BasicBlock *>> &RecursionSet,
+    Instruction *CxtI) {
   // This method walks up use-def chains recursively.  Because of this, we could
   // get into an infinite loop going around loops in the use-def chain.  To
   // prevent this, keep track of what (value, block) pairs we've already visited
@@ -584,10 +586,6 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
   if (!RecursionSet.insert(std::make_pair(V, BB)).second)
     return false;
 
-  // An RAII help to remove this pair from the recursion set once the recursion
-  // stack pops back out again.
-  RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB));
-
   // If V is a constant, then it is known in all predecessors.
   if (Constant *KC = getKnownConstant(V, Preference)) {
     for (BasicBlock *Pred : predecessors(BB))
@@ -657,7 +655,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
     Value *Source = CI->getOperand(0);
     if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
       return false;
-    ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI);
+    ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
+                                        RecursionSet, CxtI);
     if (Result.empty())
       return false;
 
@@ -677,10 +676,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
         I->getOpcode() == Instruction::And) {
       PredValueInfoTy LHSVals, RHSVals;
 
-      ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
-                                      WantInteger, CxtI);
-      ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
-                                      WantInteger, CxtI);
+      ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
+                                      WantInteger, RecursionSet, CxtI);
+      ComputeValueKnownInPredecessorsImpl(I->getOperand(1), BB, RHSVals,
+                                          WantInteger, RecursionSet, CxtI);
 
       if (LHSVals.empty() && RHSVals.empty())
         return false;
@@ -715,8 +714,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
     if (I->getOpcode() == Instruction::Xor &&
         isa<ConstantInt>(I->getOperand(1)) &&
         cast<ConstantInt>(I->getOperand(1))->isOne()) {
-      ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
-                                      WantInteger, CxtI);
+      ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result,
+                                          WantInteger, RecursionSet, CxtI);
       if (Result.empty())
         return false;
 
@@ -733,8 +732,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
             && "A binary operator creating a block address?");
     if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
       PredValueInfoTy LHSVals;
-      ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
-                                      WantInteger, CxtI);
+      ComputeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
+                                          WantInteger, RecursionSet, CxtI);
 
       // Try to use constant folding to simplify the binary operator.
       for (const auto &LHSVal : LHSVals) {
@@ -879,8 +878,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
       // Try to find a constant value for the LHS of a comparison,
       // and evaluate it statically if we can.
       PredValueInfoTy LHSVals;
-      ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
-                                      WantInteger, CxtI);
+      ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
+                                          WantInteger, RecursionSet, CxtI);
 
       for (const auto &LHSVal : LHSVals) {
         Constant *V = LHSVal.first;
@@ -900,8 +899,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
     Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
     PredValueInfoTy Conds;
     if ((TrueVal || FalseVal) &&
-        ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
-                                        WantInteger, CxtI)) {
+        ComputeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds,
+                                            WantInteger, RecursionSet, CxtI)) {
       for (auto &C : Conds) {
         Constant *Cond = C.first;
 
@@ -2655,28 +2654,16 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
   // Duplicate all instructions before the guard and the guard itself to the
   // branch where implication is not proved.
   BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
-      BB, PredGuardedBlock, AfterGuard, GuardedMapping);
+      BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU);
   assert(GuardedBlock && "Could not create the guarded block?");
   // Duplicate all instructions before the guard in the unguarded branch.
   // Since we have successfully duplicated the guarded block and this block
   // has fewer instructions, we expect it to succeed.
   BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
-      BB, PredUnguardedBlock, Guard, UnguardedMapping);
+      BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU);
   assert(UnguardedBlock && "Could not create the unguarded block?");
   LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
                     << GuardedBlock->getName() << "\n");
-  // DuplicateInstructionsInSplitBetween inserts a new block "BB.split" between
-  // PredBB and BB. We need to perform two inserts and one delete for each of
-  // the above calls to update Dominators.
-  DTU->applyUpdates(
-      {// Guarded block split.
-       {DominatorTree::Delete, PredGuardedBlock, BB},
-       {DominatorTree::Insert, PredGuardedBlock, GuardedBlock},
-       {DominatorTree::Insert, GuardedBlock, BB},
-       // Unguarded block split.
-       {DominatorTree::Delete, PredUnguardedBlock, BB},
-       {DominatorTree::Insert, PredUnguardedBlock, UnguardedBlock},
-       {DominatorTree::Insert, UnguardedBlock, BB}});
   // Some instructions before the guard may still have uses. For them, we need
   // to create Phi nodes merging their copies in both guarded and unguarded
   // branches. Those instructions that have no uses can be just removed.
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 7789cb923450d6c377b8fb7c5a0bc5ee3b1d0214..695eaf6cf4a73aa42196ab555cf2574b41084469 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -31,6 +31,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
@@ -41,6 +42,7 @@
 #include "llvm/Analysis/GuardUtils.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemorySSA.h"
@@ -75,6 +77,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "licm"
 
+STATISTIC(NumCreatedBlocks, "Number of blocks created");
+STATISTIC(NumClonedBranches, "Number of branches cloned");
 STATISTIC(NumSunk, "Number of instructions sunk out of loop");
 STATISTIC(NumHoisted, "Number of instructions hoisted out of loop");
 STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
@@ -86,6 +90,10 @@ static cl::opt<bool>
     DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
                      cl::desc("Disable memory promotion in LICM pass"));
 
+static cl::opt<bool> ControlFlowHoisting(
+    "licm-control-flow-hoisting", cl::Hidden, cl::init(false),
+    cl::desc("Enable control flow (and PHI) hoisting in LICM"));
+
 static cl::opt<uint32_t> MaxNumUsesTraversed(
     "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
     cl::desc("Max num uses visited for identifying load "
@@ -103,7 +111,7 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
                                   const LoopSafetyInfo *SafetyInfo,
                                   TargetTransformInfo *TTI, bool &FreeInLoop);
 static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
-                  ICFLoopSafetyInfo *SafetyInfo,
+                  BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE);
 static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
                  const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
@@ -126,6 +134,9 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
 static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
                              AliasSetTracker *AST);
 
+static void moveInstructionBefore(Instruction &I, Instruction &Dest,
+                                  ICFLoopSafetyInfo &SafetyInfo);
+
 namespace {
 struct LoopInvariantCodeMotion {
   using ASTrackerMapTy = DenseMap<Loop *, std::unique_ptr<AliasSetTracker>>;
@@ -434,6 +445,228 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
   return Changed;
 }
 
+// This is a helper class for hoistRegion to make it able to hoist control flow
+// in order to be able to hoist phis. The way this works is that we initially
+// start hoisting to the loop preheader, and when we see a loop invariant branch
+// we make note of this. When we then come to hoist an instruction that's
+// conditional on such a branch we duplicate the branch and the relevant control
+// flow, then hoist the instruction into the block corresponding to its original
+// block in the duplicated control flow.
+class ControlFlowHoister {
+private:
+  // Information about the loop we are hoisting from
+  LoopInfo *LI;
+  DominatorTree *DT;
+  Loop *CurLoop;
+
+  // A map of blocks in the loop to the block their instructions will be hoisted
+  // to.
+  DenseMap<BasicBlock *, BasicBlock *> HoistDestinationMap;
+
+  // The branches that we can hoist, mapped to the block that marks a
+  // convergence point of their control flow.
+  DenseMap<BranchInst *, BasicBlock *> HoistableBranches;
+
+public:
+  ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop)
+      : LI(LI), DT(DT), CurLoop(CurLoop) {}
+
+  void registerPossiblyHoistableBranch(BranchInst *BI) {
+    // We can only hoist conditional branches with loop invariant operands.
+    if (!ControlFlowHoisting || !BI->isConditional() ||
+        !CurLoop->hasLoopInvariantOperands(BI))
+      return;
+
+    // The branch destinations need to be in the loop, and we don't gain
+    // anything by duplicating conditional branches with duplicate successors,
+    // as it's essentially the same as an unconditional branch.
+    BasicBlock *TrueDest = BI->getSuccessor(0);
+    BasicBlock *FalseDest = BI->getSuccessor(1);
+    if (!CurLoop->contains(TrueDest) || !CurLoop->contains(FalseDest) ||
+        TrueDest == FalseDest)
+      return;
+
+    // We can hoist BI if one branch destination is the successor of the other,
+    // or both have common successor which we check by seeing if the
+    // intersection of their successors is non-empty.
+    // TODO: This could be expanded to allowing branches where both ends
+    // eventually converge to a single block.
+    SmallPtrSet<BasicBlock *, 4> TrueDestSucc, FalseDestSucc;
+    TrueDestSucc.insert(succ_begin(TrueDest), succ_end(TrueDest));
+    FalseDestSucc.insert(succ_begin(FalseDest), succ_end(FalseDest));
+    BasicBlock *CommonSucc = nullptr;
+    if (TrueDestSucc.count(FalseDest)) {
+      CommonSucc = FalseDest;
+    } else if (FalseDestSucc.count(TrueDest)) {
+      CommonSucc = TrueDest;
+    } else {
+      set_intersect(TrueDestSucc, FalseDestSucc);
+      // If there's one common successor use that.
+      if (TrueDestSucc.size() == 1)
+        CommonSucc = *TrueDestSucc.begin();
+      // If there's more than one pick whichever appears first in the block list
+      // (we can't use the value returned by TrueDestSucc.begin() as it's
+      // unpredicatable which element gets returned).
+      else if (!TrueDestSucc.empty()) {
+        Function *F = TrueDest->getParent();
+        auto IsSucc = [&](BasicBlock &BB) { return TrueDestSucc.count(&BB); };
+        auto It = std::find_if(F->begin(), F->end(), IsSucc);
+        assert(It != F->end() && "Could not find successor in function");
+        CommonSucc = &*It;
+      }
+    }
+    // The common successor has to be dominated by the branch, as otherwise
+    // there will be some other path to the successor that will not be
+    // controlled by this branch so any phi we hoist would be controlled by the
+    // wrong condition. This also takes care of avoiding hoisting of loop back
+    // edges.
+    // TODO: In some cases this could be relaxed if the successor is dominated
+    // by another block that's been hoisted and we can guarantee that the
+    // control flow has been replicated exactly.
+    if (CommonSucc && DT->dominates(BI, CommonSucc))
+      HoistableBranches[BI] = CommonSucc;
+  }
+
+  bool canHoistPHI(PHINode *PN) {
+    // The phi must have loop invariant operands.
+    if (!ControlFlowHoisting || !CurLoop->hasLoopInvariantOperands(PN))
+      return false;
+    // We can hoist phis if the block they are in is the target of hoistable
+    // branches which cover all of the predecessors of the block.
+    SmallPtrSet<BasicBlock *, 8> PredecessorBlocks;
+    BasicBlock *BB = PN->getParent();
+    for (BasicBlock *PredBB : predecessors(BB))
+      PredecessorBlocks.insert(PredBB);
+    // If we have less predecessor blocks than predecessors then the phi will
+    // have more than one incoming value for the same block which we can't
+    // handle.
+    // TODO: This could be handled be erasing some of the duplicate incoming
+    // values.
+    if (PredecessorBlocks.size() != pred_size(BB))
+      return false;
+    for (auto &Pair : HoistableBranches) {
+      if (Pair.second == BB) {
+        // Which blocks are predecessors via this branch depends on if the
+        // branch is triangle-like or diamond-like.
+        if (Pair.first->getSuccessor(0) == BB) {
+          PredecessorBlocks.erase(Pair.first->getParent());
+          PredecessorBlocks.erase(Pair.first->getSuccessor(1));
+        } else if (Pair.first->getSuccessor(1) == BB) {
+          PredecessorBlocks.erase(Pair.first->getParent());
+          PredecessorBlocks.erase(Pair.first->getSuccessor(0));
+        } else {
+          PredecessorBlocks.erase(Pair.first->getSuccessor(0));
+          PredecessorBlocks.erase(Pair.first->getSuccessor(1));
+        }
+      }
+    }
+    // PredecessorBlocks will now be empty if for every predecessor of BB we
+    // found a hoistable branch source.
+    return PredecessorBlocks.empty();
+  }
+
+  BasicBlock *getOrCreateHoistedBlock(BasicBlock *BB) {
+    if (!ControlFlowHoisting)
+      return CurLoop->getLoopPreheader();
+    // If BB has already been hoisted, return that
+    if (HoistDestinationMap.count(BB))
+      return HoistDestinationMap[BB];
+
+    // Check if this block is conditional based on a pending branch
+    auto HasBBAsSuccessor =
+        [&](DenseMap<BranchInst *, BasicBlock *>::value_type &Pair) {
+          return BB != Pair.second && (Pair.first->getSuccessor(0) == BB ||
+                                       Pair.first->getSuccessor(1) == BB);
+        };
+    auto It = std::find_if(HoistableBranches.begin(), HoistableBranches.end(),
+                           HasBBAsSuccessor);
+
+    // If not involved in a pending branch, hoist to preheader
+    BasicBlock *InitialPreheader = CurLoop->getLoopPreheader();
+    if (It == HoistableBranches.end()) {
+      LLVM_DEBUG(dbgs() << "LICM using " << InitialPreheader->getName()
+                        << " as hoist destination for " << BB->getName()
+                        << "\n");
+      HoistDestinationMap[BB] = InitialPreheader;
+      return InitialPreheader;
+    }
+    BranchInst *BI = It->first;
+    assert(std::find_if(++It, HoistableBranches.end(), HasBBAsSuccessor) ==
+               HoistableBranches.end() &&
+           "BB is expected to be the target of at most one branch");
+
+    LLVMContext &C = BB->getContext();
+    BasicBlock *TrueDest = BI->getSuccessor(0);
+    BasicBlock *FalseDest = BI->getSuccessor(1);
+    BasicBlock *CommonSucc = HoistableBranches[BI];
+    BasicBlock *HoistTarget = getOrCreateHoistedBlock(BI->getParent());
+
+    // Create hoisted versions of blocks that currently don't have them
+    auto CreateHoistedBlock = [&](BasicBlock *Orig) {
+      if (HoistDestinationMap.count(Orig))
+        return HoistDestinationMap[Orig];
+      BasicBlock *New =
+          BasicBlock::Create(C, Orig->getName() + ".licm", Orig->getParent());
+      HoistDestinationMap[Orig] = New;
+      DT->addNewBlock(New, HoistTarget);
+      if (CurLoop->getParentLoop())
+        CurLoop->getParentLoop()->addBasicBlockToLoop(New, *LI);
+      ++NumCreatedBlocks;
+      LLVM_DEBUG(dbgs() << "LICM created " << New->getName()
+                        << " as hoist destination for " << Orig->getName()
+                        << "\n");
+      return New;
+    };
+    BasicBlock *HoistTrueDest = CreateHoistedBlock(TrueDest);
+    BasicBlock *HoistFalseDest = CreateHoistedBlock(FalseDest);
+    BasicBlock *HoistCommonSucc = CreateHoistedBlock(CommonSucc);
+
+    // Link up these blocks with branches.
+    if (!HoistCommonSucc->getTerminator()) {
+      // The new common successor we've generated will branch to whatever that
+      // hoist target branched to.
+      BasicBlock *TargetSucc = HoistTarget->getSingleSuccessor();
+      assert(TargetSucc && "Expected hoist target to have a single successor");
+      HoistCommonSucc->moveBefore(TargetSucc);
+      BranchInst::Create(TargetSucc, HoistCommonSucc);
+    }
+    if (!HoistTrueDest->getTerminator()) {
+      HoistTrueDest->moveBefore(HoistCommonSucc);
+      BranchInst::Create(HoistCommonSucc, HoistTrueDest);
+    }
+    if (!HoistFalseDest->getTerminator()) {
+      HoistFalseDest->moveBefore(HoistCommonSucc);
+      BranchInst::Create(HoistCommonSucc, HoistFalseDest);
+    }
+
+    // If BI is being cloned to what was originally the preheader then
+    // HoistCommonSucc will now be the new preheader.
+    if (HoistTarget == InitialPreheader) {
+      // Phis in the loop header now need to use the new preheader.
+      InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc);
+      // The new preheader dominates the loop header.
+      DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc);
+      DomTreeNode *HeaderNode = DT->getNode(CurLoop->getHeader());
+      DT->changeImmediateDominator(HeaderNode, PreheaderNode);
+      // The preheader hoist destination is now the new preheader, with the
+      // exception of the hoist destination of this branch.
+      for (auto &Pair : HoistDestinationMap)
+        if (Pair.second == InitialPreheader && Pair.first != BI->getParent())
+          Pair.second = HoistCommonSucc;
+    }
+
+    // Now finally clone BI.
+    ReplaceInstWithInst(
+        HoistTarget->getTerminator(),
+        BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition()));
+    ++NumClonedBranches;
+
+    assert(CurLoop->getLoopPreheader() &&
+           "Hoisting blocks should not have destroyed preheader");
+    return HoistDestinationMap[BB];
+  }
+};
+
 /// Walk the specified region of the CFG (defined by all blocks dominated by
 /// the specified block, and that are in the current loop) in depth first
 /// order w.r.t the DominatorTree.  This allows us to visit definitions before
@@ -448,24 +681,24 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
          CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
          "Unexpected input to hoistRegion");
 
-  // We want to visit parents before children. We will enque all the parents
-  // before their children in the worklist and process the worklist in order.
-  SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
+  ControlFlowHoister CFH(LI, DT, CurLoop);
 
+  // Keep track of instructions that have been hoisted, as they may need to be
+  // re-hoisted if they end up not dominating all of their uses.
+  SmallVector<Instruction *, 16> HoistedInstructions;
+
+  // For PHI hoisting to work we need to hoist blocks before their successors.
+  // We can do this by iterating through the blocks in the loop in reverse
+  // post-order.
+  LoopBlocksRPO Worklist(CurLoop);
+  Worklist.perform(LI);
   bool Changed = false;
-  for (DomTreeNode *DTN : Worklist) {
-    BasicBlock *BB = DTN->getBlock();
+  for (BasicBlock *BB : Worklist) {
     // Only need to process the contents of this block if it is not part of a
     // subloop (which would already have been processed).
     if (inSubLoop(BB, CurLoop, LI))
       continue;
 
-    // Keep track of whether the prefix instructions could have written memory.
-    // TODO: This may be done smarter if we keep track of all throwing and
-    // mem-writing operations in every block, e.g. using something similar to
-    // isGuaranteedToExecute.
-    bool IsMemoryNotModified = CurLoop->getHeader() == BB;
-
     for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
       Instruction &I = *II++;
       // Try constant folding this instruction.  If all the operands are
@@ -486,13 +719,16 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       // Try hoisting the instruction out to the preheader.  We can only do
       // this if all of the operands of the instruction are loop invariant and
       // if it is safe to hoist the instruction.
-      //
+      // TODO: It may be safe to hoist if we are hoisting to a conditional block
+      // and we have accurately duplicated the control flow from the loop header
+      // to that block.
       if (CurLoop->hasLoopInvariantOperands(&I) &&
           canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, true, ORE) &&
           isSafeToExecuteUnconditionally(
               I, DT, CurLoop, SafetyInfo, ORE,
               CurLoop->getLoopPreheader()->getTerminator())) {
-        hoist(I, DT, CurLoop, SafetyInfo, ORE);
+        hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, ORE);
+        HoistedInstructions.push_back(&I);
         Changed = true;
         continue;
       }
@@ -517,7 +753,9 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         I.replaceAllUsesWith(Product);
         eraseInstruction(I, *SafetyInfo, CurAST);
 
-        hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+        hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
+              SafetyInfo, ORE);
+        HoistedInstructions.push_back(ReciprocalDivisor);
         Changed = true;
         continue;
       }
@@ -526,18 +764,75 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       if (((I.use_empty() &&
             match(&I, m_Intrinsic<Intrinsic::invariant_start>())) ||
            isGuard(&I)) &&
-          IsMemoryNotModified && CurLoop->hasLoopInvariantOperands(&I) &&
-          SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) {
-        hoist(I, DT, CurLoop, SafetyInfo, ORE);
+          CurLoop->hasLoopInvariantOperands(&I) &&
+          SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
+          SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop)) {
+        hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, ORE);
+        HoistedInstructions.push_back(&I);
         Changed = true;
         continue;
       }
 
-      if (IsMemoryNotModified)
-        IsMemoryNotModified = !I.mayWriteToMemory();
+      if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+        if (CFH.canHoistPHI(PN)) {
+          // Redirect incoming blocks first to ensure that we create hoisted
+          // versions of those blocks before we hoist the phi.
+          for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i)
+            PN->setIncomingBlock(
+                i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i)));
+          hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+                ORE);
+          assert(DT->dominates(PN, BB) && "Conditional PHIs not expected");
+          Changed = true;
+          continue;
+        }
+      }
+
+      // Remember possibly hoistable branches so we can actually hoist them
+      // later if needed.
+      if (BranchInst *BI = dyn_cast<BranchInst>(&I))
+        CFH.registerPossiblyHoistableBranch(BI);
     }
   }
 
+  // If we hoisted instructions to a conditional block they may not dominate
+  // their uses that weren't hoisted (such as phis where some operands are not
+  // loop invariant). If so make them unconditional by moving them to their
+  // immediate dominator. We iterate through the instructions in reverse order
+  // which ensures that when we rehoist an instruction we rehoist its operands,
+  // and also keep track of where in the block we are rehoisting to to make sure
+  // that we rehoist instructions before the instructions that use them.
+  Instruction *HoistPoint = nullptr;
+  if (ControlFlowHoisting) {
+    for (Instruction *I : reverse(HoistedInstructions)) {
+      if (!llvm::all_of(I->uses(),
+                        [&](Use &U) { return DT->dominates(I, U); })) {
+        BasicBlock *Dominator =
+            DT->getNode(I->getParent())->getIDom()->getBlock();
+        LLVM_DEBUG(dbgs() << "LICM rehoisting to " << Dominator->getName()
+                          << ": " << *I << "\n");
+        if (!HoistPoint || HoistPoint->getParent() != Dominator) {
+          if (HoistPoint)
+            assert(DT->dominates(Dominator, HoistPoint->getParent()) &&
+                   "New hoist point expected to dominate old hoist point");
+          HoistPoint = Dominator->getTerminator();
+        }
+        moveInstructionBefore(*I, *HoistPoint, *SafetyInfo);
+        HoistPoint = I;
+        Changed = true;
+      }
+    }
+  }
+
+  // Now that we've finished hoisting make sure that LI and DT are still valid.
+#ifndef NDEBUG
+  if (Changed) {
+    assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
+           "Dominator tree verification failed");
+    LI->verify(*DT);
+  }
+#endif
+
   return Changed;
 }
 
@@ -890,6 +1185,13 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
   I.eraseFromParent();
 }
 
+static void moveInstructionBefore(Instruction &I, Instruction &Dest,
+                                  ICFLoopSafetyInfo &SafetyInfo) {
+  SafetyInfo.removeInstruction(&I);
+  SafetyInfo.insertInstructionTo(Dest.getParent());
+  I.moveBefore(&Dest);
+}
+
 static Instruction *sinkThroughTriviallyReplaceablePHI(
     PHINode *TPN, Instruction *I, LoopInfo *LI,
     SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
@@ -1098,9 +1400,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
 /// is safe to hoist, this instruction is called to do the dirty work.
 ///
 static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
-                  ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
-  auto *Preheader = CurLoop->getLoopPreheader();
-  LLVM_DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
+                  BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
+                  OptimizationRemarkEmitter *ORE) {
+  LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getName() << ": " << I
                     << "\n");
   ORE->emit([&]() {
     return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
@@ -1118,10 +1420,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
       !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
     I.dropUnknownNonDebugMetadata();
 
-  SafetyInfo->removeInstruction(&I);
-  SafetyInfo->insertInstructionTo(Preheader);
-  // Move the new node to the Preheader, before its terminator.
-  I.moveBefore(Preheader->getTerminator());
+  if (isa<PHINode>(I))
+    // Move the new node to the end of the phi list in the destination block.
+    moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo);
+  else
+    // Move the new node to the destination block, before its terminator.
+    moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo);
 
   // Do not retain debug locations when we are moving instructions to different
   // basic blocks, because we want to avoid jumpy line tables. Calls, however,
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index 06083a4f5086c1c99e876b1d1f0a095af817490a..d797c9dc9e726e02deb175f54387475f9b3f87ed 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -78,6 +78,18 @@ using namespace llvm;
 #define LDIST_NAME "loop-distribute"
 #define DEBUG_TYPE LDIST_NAME
 
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopDistributeFollowupAll =
+    "llvm.loop.distribute.followup_all";
+static const char *const LLVMLoopDistributeFollowupCoincident =
+    "llvm.loop.distribute.followup_coincident";
+static const char *const LLVMLoopDistributeFollowupSequential =
+    "llvm.loop.distribute.followup_sequential";
+static const char *const LLVMLoopDistributeFollowupFallback =
+    "llvm.loop.distribute.followup_fallback";
+/// @}
+
 static cl::opt<bool>
     LDistVerify("loop-distribute-verify", cl::Hidden,
                 cl::desc("Turn on DominatorTree and LoopInfo verification "
@@ -186,7 +198,7 @@ public:
   /// Returns the loop where this partition ends up after distribution.
   /// If this partition is mapped to the original loop then use the block from
   /// the loop.
-  const Loop *getDistributedLoop() const {
+  Loop *getDistributedLoop() const {
     return ClonedLoop ? ClonedLoop : OrigLoop;
   }
 
@@ -443,6 +455,9 @@ public:
     assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
            "preheader not empty");
 
+    // Preserve the original loop ID for use after the transformation.
+    MDNode *OrigLoopID = L->getLoopID();
+
     // Create a loop for each partition except the last.  Clone the original
     // loop before PH along with adding a preheader for the cloned loop.  Then
     // update PH to point to the newly added preheader.
@@ -457,9 +472,13 @@ public:
 
       Part->getVMap()[ExitBlock] = TopPH;
       Part->remapInstructions();
+      setNewLoopID(OrigLoopID, Part);
     }
     Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
 
+    // Also set a new loop ID for the last loop.
+    setNewLoopID(OrigLoopID, &PartitionContainer.back());
+
     // Now go in forward order and update the immediate dominator for the
     // preheaders with the exiting block of the previous loop.  Dominance
     // within the loop is updated in cloneLoopWithPreheader.
@@ -575,6 +594,19 @@ private:
       }
     }
   }
+
+  /// Assign new LoopIDs for the partition's cloned loop.
+  void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
+    Optional<MDNode *> PartitionID = makeFollowupLoopID(
+        OrigLoopID,
+        {LLVMLoopDistributeFollowupAll,
+         Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
+                             : LLVMLoopDistributeFollowupCoincident});
+    if (PartitionID.hasValue()) {
+      Loop *NewLoop = Part->getDistributedLoop();
+      NewLoop->setLoopID(PartitionID.getValue());
+    }
+  }
 };
 
 /// For each memory instruction, this class maintains difference of the
@@ -743,6 +775,9 @@ public:
       return fail("TooManySCEVRuntimeChecks",
                   "too many SCEV run-time checks needed.\n");
 
+    if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L))
+      return fail("HeuristicDisabled", "distribution heuristic disabled");
+
     LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
     // We're done forming the partitions set up the reverse mapping from
     // instructions to partitions.
@@ -762,6 +797,8 @@ public:
                                                   RtPtrChecking);
 
     if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+      MDNode *OrigLoopID = L->getLoopID();
+
       LLVM_DEBUG(dbgs() << "\nPointers:\n");
       LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
       LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
@@ -769,6 +806,17 @@ public:
       LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
       LVer.versionLoop(DefsUsedOutside);
       LVer.annotateLoopWithNoAlias();
+
+      // The unversioned loop will not be changed, so we inherit all attributes
+      // from the original loop, but remove the loop distribution metadata to
+      // avoid to distribute it again.
+      MDNode *UnversionedLoopID =
+          makeFollowupLoopID(OrigLoopID,
+                             {LLVMLoopDistributeFollowupAll,
+                              LLVMLoopDistributeFollowupFallback},
+                             "llvm.loop.distribute.", true)
+              .getValue();
+      LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
     }
 
     // Create identical copies of the original loop for each partition and hook
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 7a4ae2eb30307a590ad805f4da4c37e17203456f..766e39b439a0d762fb9f7e39d5267159040f0cc9 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -339,10 +339,21 @@ public:
 
   bool currentLimitations();
 
+  const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
+    return OuterInnerReductions;
+  }
+
 private:
   bool tightlyNested(Loop *Outer, Loop *Inner);
   bool containsUnsafeInstructions(BasicBlock *BB);
-  bool findInductions(Loop *L, SmallVector<PHINode *, 8> &Inductions);
+
+  /// Discover induction and reduction PHIs in the header of \p L. Induction
+  /// PHIs are added to \p Inductions, reductions are added to
+  /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
+  /// to be passed as \p InnerLoop.
+  bool findInductionAndReductions(Loop *L,
+                                  SmallVector<PHINode *, 8> &Inductions,
+                                  Loop *InnerLoop);
 
   Loop *OuterLoop;
   Loop *InnerLoop;
@@ -352,6 +363,9 @@ private:
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
+  /// Set of reduction PHIs taking part of a reduction across the inner and
+  /// outer loop.
+  SmallPtrSet<PHINode *, 4> OuterInnerReductions;
 };
 
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -384,9 +398,10 @@ class LoopInterchangeTransform {
 public:
   LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
-                           BasicBlock *LoopNestExit)
+                           BasicBlock *LoopNestExit,
+                           const LoopInterchangeLegality &LIL)
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
-        LoopExit(LoopNestExit) {}
+        LoopExit(LoopNestExit), LIL(LIL) {}
 
   /// Interchange OuterLoop and InnerLoop.
   bool transform();
@@ -411,6 +426,8 @@ private:
   LoopInfo *LI;
   DominatorTree *DT;
   BasicBlock *LoopExit;
+
+  const LoopInterchangeLegality &LIL;
 };
 
 // Main LoopInterchange Pass.
@@ -560,8 +577,8 @@ struct LoopInterchange : public LoopPass {
              << "Loop interchanged with enclosing loop.";
     });
 
-    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
-                                 LoopNestExit);
+    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit,
+                                 LIL);
     LIT.transform();
     LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
     LoopsInterchanged++;
@@ -633,8 +650,36 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood(
   return true;
 }
 
-bool LoopInterchangeLegality::findInductions(
-    Loop *L, SmallVector<PHINode *, 8> &Inductions) {
+// If SV is a LCSSA PHI node with a single incoming value, return the incoming
+// value.
+static Value *followLCSSA(Value *SV) {
+  PHINode *PHI = dyn_cast<PHINode>(SV);
+  if (!PHI)
+    return SV;
+
+  if (PHI->getNumIncomingValues() != 1)
+    return SV;
+  return followLCSSA(PHI->getIncomingValue(0));
+}
+
+// Check V's users to see if it is involved in a reduction in L.
+static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
+  for (Value *User : V->users()) {
+    if (PHINode *PHI = dyn_cast<PHINode>(User)) {
+      if (PHI->getNumIncomingValues() == 1)
+        continue;
+      RecurrenceDescriptor RD;
+      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+        return PHI;
+      return nullptr;
+    }
+  }
+
+  return nullptr;
+}
+
+bool LoopInterchangeLegality::findInductionAndReductions(
+    Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
   if (!L->getLoopLatch() || !L->getLoopPredecessor())
     return false;
   for (PHINode &PHI : L->getHeader()->phis()) {
@@ -643,8 +688,32 @@ bool LoopInterchangeLegality::findInductions(
     if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
       Inductions.push_back(&PHI);
     else {
-      LLVM_DEBUG(dbgs() << "Failed to recognize PHI as an induction.\n");
-      return false;
+      // PHIs in inner loops need to be part of a reduction in the outer loop,
+      // discovered when checking the PHIs of the outer loop earlier.
+      if (!InnerLoop) {
+        if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) {
+          LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
+                               "across the outer loop.\n");
+          return false;
+        }
+      } else {
+        assert(PHI.getNumIncomingValues() == 2 &&
+               "Phis in loop header should have exactly 2 incoming values");
+        // Check if we have a PHI node in the outer loop that has a reduction
+        // result from the inner loop as an incoming value.
+        Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
+        PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
+        if (!InnerRedPhi ||
+            !llvm::any_of(InnerRedPhi->incoming_values(),
+                          [&PHI](Value *V) { return V == &PHI; })) {
+          LLVM_DEBUG(
+              dbgs()
+              << "Failed to recognize PHI as an induction or reduction.\n");
+          return false;
+        }
+        OuterInnerReductions.insert(&PHI);
+        OuterInnerReductions.insert(InnerRedPhi);
+      }
     }
   }
   return true;
@@ -693,63 +762,64 @@ bool LoopInterchangeLegality::currentLimitations() {
 
   PHINode *InnerInductionVar;
   SmallVector<PHINode *, 8> Inductions;
-  if (!findInductions(InnerLoop, Inductions)) {
+  if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
     LLVM_DEBUG(
-        dbgs() << "Only inner loops with induction or reduction PHI nodes "
+        dbgs() << "Only outer loops with induction or reduction PHI nodes "
                << "are supported currently.\n");
     ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
-                                      InnerLoop->getStartLoc(),
-                                      InnerLoop->getHeader())
-             << "Only inner loops with induction or reduction PHI nodes can be"
-                " interchange currently.";
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Only outer loops with induction or reduction PHI nodes can be"
+                " interchanged currently.";
     });
     return true;
   }
 
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
-    LLVM_DEBUG(
-        dbgs() << "We currently only support loops with 1 induction variable."
-               << "Failed to interchange due to current limitation\n");
+    LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
+                      << "supported currently.\n");
     ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
-                                      InnerLoop->getStartLoc(),
-                                      InnerLoop->getHeader())
-             << "Only inner loops with 1 induction variable can be "
+      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Only outer loops with 1 induction variable can be "
                 "interchanged currently.";
     });
     return true;
   }
 
-  InnerInductionVar = Inductions.pop_back_val();
-  if (!findInductions(OuterLoop, Inductions)) {
+  Inductions.clear();
+  if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
     LLVM_DEBUG(
-        dbgs() << "Only outer loops with induction or reduction PHI nodes "
+        dbgs() << "Only inner loops with induction or reduction PHI nodes "
                << "are supported currently.\n");
     ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
-                                      OuterLoop->getStartLoc(),
-                                      OuterLoop->getHeader())
-             << "Only outer loops with induction or reduction PHI nodes can be"
-                " interchanged currently.";
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Only inner loops with induction or reduction PHI nodes can be"
+                " interchange currently.";
     });
     return true;
   }
 
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
-    LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
-                      << "supported currently.\n");
+    LLVM_DEBUG(
+        dbgs() << "We currently only support loops with 1 induction variable."
+               << "Failed to interchange due to current limitation\n");
     ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
-                                      OuterLoop->getStartLoc(),
-                                      OuterLoop->getHeader())
-             << "Only outer loops with 1 induction variable can be "
+      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Only inner loops with 1 induction variable can be "
                 "interchanged currently.";
     });
     return true;
   }
+  InnerInductionVar = Inductions.pop_back_val();
 
   // TODO: Triangular loops are not handled for now.
   if (!isLoopStructureUnderstood(InnerInductionVar)) {
@@ -1387,13 +1457,29 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   // replaced by Inners'.
   updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
 
-  // Make sure we have no other PHIs.
-  auto InnerPhis = drop_begin(InnerLoopHeader->phis(), 1);
-  auto OuterPhis = drop_begin(OuterLoopHeader->phis(), 1);
-  (void) InnerPhis;
-  (void) OuterPhis;
-  assert(begin(InnerPhis) == end(InnerPhis) && "Unexpected PHIs in inner loop");
-  assert(begin(OuterPhis) == end(OuterPhis) && "Unexpected PHis in outer loop");
+  // Now update the reduction PHIs in the inner and outer loop headers.
+  SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
+  for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
+    InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+  for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
+    OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+
+  auto &OuterInnerReductions = LIL.getOuterInnerReductions();
+  (void)OuterInnerReductions;
+
+  // Now move the remaining reduction PHIs from outer to inner loop header and
+  // vice versa. The PHI nodes must be part of a reduction across the inner and
+  // outer loop and all the remains to do is and updating the incoming blocks.
+  for (PHINode *PHI : OuterLoopPHIs) {
+    PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
+    assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() &&
+           "Expected a reduction PHI node");
+  }
+  for (PHINode *PHI : InnerLoopPHIs) {
+    PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
+    assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() &&
+           "Expected a reduction PHI node");
+  }
 
   // Update the incoming blocks for moved PHI nodes.
   updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader);
diff --git a/lib/Transforms/Scalar/LoopPassManager.cpp b/lib/Transforms/Scalar/LoopPassManager.cpp
index 6759bd2537c6fbaa0364edd9a16895031f7ef6ce..774ad7b945a0f282fff4a27f16142c0c1d8cce0d 100644
--- a/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -44,7 +44,11 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
 
     PreservedAnalyses PassPA = Pass->run(L, AM, AR, U);
 
-    PI.runAfterPass<Loop>(*Pass, L);
+    // do not pass deleted Loop into the instrumentation
+    if (U.skipCurrentLoop())
+      PI.runAfterPassInvalidated<Loop>(*Pass);
+    else
+      PI.runAfterPass<Loop>(*Pass, L);
 
     // If the loop was deleted, abort the run and return to the outer walk.
     if (U.skipCurrentLoop()) {
diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 6cac3787311794de4dd69c9ecd8c4c09e035e2c4..c370efa120772714e2d3f56bc4e3c60e3cfcc71d 100644
--- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -41,6 +41,389 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loop-simplifycfg"
 
+static cl::opt<bool> EnableTermFolding("enable-loop-simplifycfg-term-folding",
+                                       cl::init(true));
+
+STATISTIC(NumTerminatorsFolded,
+          "Number of terminators folded to unconditional branches");
+
+/// If \p BB is a switch or a conditional branch, but only one of its successors
+/// can be reached from this block in runtime, return this successor. Otherwise,
+/// return nullptr.
+static BasicBlock *getOnlyLiveSuccessor(BasicBlock *BB) {
+  Instruction *TI = BB->getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isUnconditional())
+      return nullptr;
+    if (BI->getSuccessor(0) == BI->getSuccessor(1))
+      return BI->getSuccessor(0);
+    ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+    if (!Cond)
+      return nullptr;
+    return Cond->isZero() ? BI->getSuccessor(1) : BI->getSuccessor(0);
+  }
+
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
+    if (!CI)
+      return nullptr;
+    for (auto Case : SI->cases())
+      if (Case.getCaseValue() == CI)
+        return Case.getCaseSuccessor();
+    return SI->getDefaultDest();
+  }
+
+  return nullptr;
+}
+
+/// Helper class that can turn branches and switches with constant conditions
+/// into unconditional branches.
+class ConstantTerminatorFoldingImpl {
+private:
+  Loop &L;
+  LoopInfo &LI;
+  DominatorTree &DT;
+  MemorySSAUpdater *MSSAU;
+
+  // Whether or not the current loop has irreducible CFG.
+  bool HasIrreducibleCFG = false;
+  // Whether or not the current loop will still exist after terminator constant
+  // folding will be done. In theory, there are two ways how it can happen:
+  // 1. Loop's latch(es) become unreachable from loop header;
+  // 2. Loop's header becomes unreachable from method entry.
+  // In practice, the second situation is impossible because we only modify the
+  // current loop and its preheader and do not affect preheader's reachibility
+  // from any other block. So this variable set to true means that loop's latch
+  // has become unreachable from loop header.
+  bool DeleteCurrentLoop = false;
+
+  // The blocks of the original loop that will still be reachable from entry
+  // after the constant folding.
+  SmallPtrSet<BasicBlock *, 8> LiveLoopBlocks;
+  // The blocks of the original loop that will become unreachable from entry
+  // after the constant folding.
+  SmallPtrSet<BasicBlock *, 8> DeadLoopBlocks;
+  // The exits of the original loop that will still be reachable from entry
+  // after the constant folding.
+  SmallPtrSet<BasicBlock *, 8> LiveExitBlocks;
+  // The exits of the original loop that will become unreachable from entry
+  // after the constant folding.
+  SmallVector<BasicBlock *, 8> DeadExitBlocks;
+  // The blocks that will still be a part of the current loop after folding.
+  SmallPtrSet<BasicBlock *, 8> BlocksInLoopAfterFolding;
+  // The blocks that have terminators with constant condition that can be
+  // folded. Note: fold candidates should be in L but not in any of its
+  // subloops to avoid complex LI updates.
+  SmallVector<BasicBlock *, 8> FoldCandidates;
+
+  void dump() const {
+    dbgs() << "Constant terminator folding for loop " << L << "\n";
+    dbgs() << "After terminator constant-folding, the loop will";
+    if (!DeleteCurrentLoop)
+      dbgs() << " not";
+    dbgs() << " be destroyed\n";
+    auto PrintOutVector = [&](const char *Message,
+                           const SmallVectorImpl<BasicBlock *> &S) {
+      dbgs() << Message << "\n";
+      for (const BasicBlock *BB : S)
+        dbgs() << "\t" << BB->getName() << "\n";
+    };
+    auto PrintOutSet = [&](const char *Message,
+                           const SmallPtrSetImpl<BasicBlock *> &S) {
+      dbgs() << Message << "\n";
+      for (const BasicBlock *BB : S)
+        dbgs() << "\t" << BB->getName() << "\n";
+    };
+    PrintOutVector("Blocks in which we can constant-fold terminator:",
+                   FoldCandidates);
+    PrintOutSet("Live blocks from the original loop:", LiveLoopBlocks);
+    PrintOutSet("Dead blocks from the original loop:", DeadLoopBlocks);
+    PrintOutSet("Live exit blocks:", LiveExitBlocks);
+    PrintOutVector("Dead exit blocks:", DeadExitBlocks);
+    if (!DeleteCurrentLoop)
+      PrintOutSet("The following blocks will still be part of the loop:",
+                  BlocksInLoopAfterFolding);
+  }
+
+  /// Whether or not the current loop has irreducible CFG.
+  bool hasIrreducibleCFG(LoopBlocksDFS &DFS) {
+    assert(DFS.isComplete() && "DFS is expected to be finished");
+    // Index of a basic block in RPO traversal.
+    DenseMap<const BasicBlock *, unsigned> RPO;
+    unsigned Current = 0;
+    for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I)
+      RPO[*I] = Current++;
+
+    for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
+      BasicBlock *BB = *I;
+      for (auto *Succ : successors(BB))
+        if (L.contains(Succ) && !LI.isLoopHeader(Succ) && RPO[BB] > RPO[Succ])
+          // If an edge goes from a block with greater order number into a block
+          // with lesses number, and it is not a loop backedge, then it can only
+          // be a part of irreducible non-loop cycle.
+          return true;
+    }
+    return false;
+  }
+
+  /// Fill all information about status of blocks and exits of the current loop
+  /// if constant folding of all branches will be done.
+  void analyze() {
+    LoopBlocksDFS DFS(&L);
+    DFS.perform(&LI);
+    assert(DFS.isComplete() && "DFS is expected to be finished");
+
+    // TODO: The algorithm below relies on both RPO and Postorder traversals.
+    // When the loop has only reducible CFG inside, then the invariant "all
+    // predecessors of X are processed before X in RPO" is preserved. However
+    // an irreducible loop can break this invariant (e.g. latch does not have to
+    // be the last block in the traversal in this case, and the algorithm relies
+    // on this). We can later decide to support such cases by altering the
+    // algorithms, but so far we just give up analyzing them.
+    if (hasIrreducibleCFG(DFS)) {
+      HasIrreducibleCFG = true;
+      return;
+    }
+
+    // Collect live and dead loop blocks and exits.
+    LiveLoopBlocks.insert(L.getHeader());
+    for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
+      BasicBlock *BB = *I;
+
+      // If a loop block wasn't marked as live so far, then it's dead.
+      if (!LiveLoopBlocks.count(BB)) {
+        DeadLoopBlocks.insert(BB);
+        continue;
+      }
+
+      BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
+
+      // If a block has only one live successor, it's a candidate on constant
+      // folding. Only handle blocks from current loop: branches in child loops
+      // are skipped because if they can be folded, they should be folded during
+      // the processing of child loops.
+      if (TheOnlySucc && LI.getLoopFor(BB) == &L)
+        FoldCandidates.push_back(BB);
+
+      // Handle successors.
+      for (BasicBlock *Succ : successors(BB))
+        if (!TheOnlySucc || TheOnlySucc == Succ) {
+          if (L.contains(Succ))
+            LiveLoopBlocks.insert(Succ);
+          else
+            LiveExitBlocks.insert(Succ);
+        }
+    }
+
+    // Sanity check: amount of dead and live loop blocks should match the total
+    // number of blocks in loop.
+    assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() &&
+           "Malformed block sets?");
+
+    // Now, all exit blocks that are not marked as live are dead.
+    SmallVector<BasicBlock *, 8> ExitBlocks;
+    L.getExitBlocks(ExitBlocks);
+    for (auto *ExitBlock : ExitBlocks)
+      if (!LiveExitBlocks.count(ExitBlock))
+        DeadExitBlocks.push_back(ExitBlock);
+
+    // Whether or not the edge From->To will still be present in graph after the
+    // folding.
+    auto IsEdgeLive = [&](BasicBlock *From, BasicBlock *To) {
+      if (!LiveLoopBlocks.count(From))
+        return false;
+      BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(From);
+      return !TheOnlySucc || TheOnlySucc == To;
+    };
+
+    // The loop will not be destroyed if its latch is live.
+    DeleteCurrentLoop = !IsEdgeLive(L.getLoopLatch(), L.getHeader());
+
+    // If we are going to delete the current loop completely, no extra analysis
+    // is needed.
+    if (DeleteCurrentLoop)
+      return;
+
+    // Otherwise, we should check which blocks will still be a part of the
+    // current loop after the transform.
+    BlocksInLoopAfterFolding.insert(L.getLoopLatch());
+    // If the loop is live, then we should compute what blocks are still in
+    // loop after all branch folding has been done. A block is in loop if
+    // it has a live edge to another block that is in the loop; by definition,
+    // latch is in the loop.
+    auto BlockIsInLoop = [&](BasicBlock *BB) {
+      return any_of(successors(BB), [&](BasicBlock *Succ) {
+        return BlocksInLoopAfterFolding.count(Succ) && IsEdgeLive(BB, Succ);
+      });
+    };
+    for (auto I = DFS.beginPostorder(), E = DFS.endPostorder(); I != E; ++I) {
+      BasicBlock *BB = *I;
+      if (BlockIsInLoop(BB))
+        BlocksInLoopAfterFolding.insert(BB);
+    }
+
+    // Sanity check: header must be in loop.
+    assert(BlocksInLoopAfterFolding.count(L.getHeader()) &&
+           "Header not in loop?");
+    assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() &&
+           "All blocks that stay in loop should be live!");
+  }
+
+  /// Constant-fold terminators of blocks acculumated in FoldCandidates into the
+  /// unconditional branches.
+  void foldTerminators() {
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+
+    for (BasicBlock *BB : FoldCandidates) {
+      assert(LI.getLoopFor(BB) == &L && "Should be a loop block!");
+      BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
+      assert(TheOnlySucc && "Should have one live successor!");
+
+      LLVM_DEBUG(dbgs() << "Replacing terminator of " << BB->getName()
+                        << " with an unconditional branch to the block "
+                        << TheOnlySucc->getName() << "\n");
+
+      SmallPtrSet<BasicBlock *, 2> DeadSuccessors;
+      // Remove all BB's successors except for the live one.
+      unsigned TheOnlySuccDuplicates = 0;
+      for (auto *Succ : successors(BB))
+        if (Succ != TheOnlySucc) {
+          DeadSuccessors.insert(Succ);
+          // If our successor lies in a different loop, we don't want to remove
+          // the one-input Phi because it is a LCSSA Phi.
+          bool PreserveLCSSAPhi = !L.contains(Succ);
+          Succ->removePredecessor(BB, PreserveLCSSAPhi);
+          if (MSSAU)
+            MSSAU->removeEdge(BB, Succ);
+        } else
+          ++TheOnlySuccDuplicates;
+
+      assert(TheOnlySuccDuplicates > 0 && "Should be!");
+      // If TheOnlySucc was BB's successor more than once, after transform it
+      // will be its successor only once. Remove redundant inputs from
+      // TheOnlySucc's Phis.
+      bool PreserveLCSSAPhi = !L.contains(TheOnlySucc);
+      for (unsigned Dup = 1; Dup < TheOnlySuccDuplicates; ++Dup)
+        TheOnlySucc->removePredecessor(BB, PreserveLCSSAPhi);
+      if (MSSAU && TheOnlySuccDuplicates > 1)
+        MSSAU->removeDuplicatePhiEdgesBetween(BB, TheOnlySucc);
+
+      IRBuilder<> Builder(BB->getContext());
+      Instruction *Term = BB->getTerminator();
+      Builder.SetInsertPoint(Term);
+      Builder.CreateBr(TheOnlySucc);
+      Term->eraseFromParent();
+
+      for (auto *DeadSucc : DeadSuccessors)
+        DTU.deleteEdge(BB, DeadSucc);
+
+      ++NumTerminatorsFolded;
+    }
+  }
+
+public:
+  ConstantTerminatorFoldingImpl(Loop &L, LoopInfo &LI, DominatorTree &DT,
+                                MemorySSAUpdater *MSSAU)
+      : L(L), LI(LI), DT(DT), MSSAU(MSSAU) {}
+  bool run() {
+    assert(L.getLoopLatch() && "Should be single latch!");
+
+    // Collect all available information about status of blocks after constant
+    // folding.
+    analyze();
+
+    LLVM_DEBUG(dbgs() << "In function " << L.getHeader()->getParent()->getName()
+                      << ": ");
+
+    if (HasIrreducibleCFG) {
+      LLVM_DEBUG(dbgs() << "Loops with irreducible CFG are not supported!\n");
+      return false;
+    }
+
+    // Nothing to constant-fold.
+    if (FoldCandidates.empty()) {
+      LLVM_DEBUG(
+          dbgs() << "No constant terminator folding candidates found in loop "
+                 << L.getHeader()->getName() << "\n");
+      return false;
+    }
+
+    // TODO: Support deletion of the current loop.
+    if (DeleteCurrentLoop) {
+      LLVM_DEBUG(
+          dbgs()
+          << "Give up constant terminator folding in loop "
+          << L.getHeader()->getName()
+          << ": we don't currently support deletion of the current loop.\n");
+      return false;
+    }
+
+    // TODO: Support deletion of dead loop blocks.
+    if (!DeadLoopBlocks.empty()) {
+      LLVM_DEBUG(dbgs() << "Give up constant terminator folding in loop "
+                        << L.getHeader()->getName()
+                        << ": we don't currently"
+                           " support deletion of dead in-loop blocks.\n");
+      return false;
+    }
+
+    // TODO: Support dead loop exits.
+    if (!DeadExitBlocks.empty()) {
+      LLVM_DEBUG(dbgs() << "Give up constant terminator folding in loop "
+                        << L.getHeader()->getName()
+                        << ": we don't currently support dead loop exits.\n");
+      return false;
+    }
+
+    // TODO: Support blocks that are not dead, but also not in loop after the
+    // folding.
+    if (BlocksInLoopAfterFolding.size() != L.getNumBlocks()) {
+      LLVM_DEBUG(
+          dbgs() << "Give up constant terminator folding in loop "
+                 << L.getHeader()->getName()
+                 << ": we don't currently"
+                    " support blocks that are not dead, but will stop "
+                    "being a part of the loop after constant-folding.\n");
+      return false;
+    }
+
+    // Dump analysis results.
+    LLVM_DEBUG(dump());
+
+    LLVM_DEBUG(dbgs() << "Constant-folding " << FoldCandidates.size()
+                      << " terminators in loop " << L.getHeader()->getName()
+                      << "\n");
+
+    // Make the actual transforms.
+    foldTerminators();
+
+#ifndef NDEBUG
+    // Make sure that we have preserved all data structures after the transform.
+    DT.verify();
+    assert(DT.isReachableFromEntry(L.getHeader()));
+    LI.verify(DT);
+#endif
+
+    return true;
+  }
+};
+
+/// Turn branches and switches with known constant conditions into unconditional
+/// branches.
+static bool constantFoldTerminators(Loop &L, DominatorTree &DT, LoopInfo &LI,
+                                    MemorySSAUpdater *MSSAU) {
+  if (!EnableTermFolding)
+    return false;
+
+  // To keep things simple, only process loops with single latch. We
+  // canonicalize most loops to this form. We can support multi-latch if needed.
+  if (!L.getLoopLatch())
+    return false;
+
+  ConstantTerminatorFoldingImpl BranchFolder(L, LI, DT, MSSAU);
+  return BranchFolder.run();
+}
+
 static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT,
                                         LoopInfo &LI, MemorySSAUpdater *MSSAU) {
   bool Changed = false;
@@ -73,6 +456,9 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
                             ScalarEvolution &SE, MemorySSAUpdater *MSSAU) {
   bool Changed = false;
 
+  // Constant-fold terminators with known constant conditions.
+  Changed |= constantFoldTerminators(L, DT, LI, MSSAU);
+
   // Eliminate unconditional branches by merging blocks into their predecessors.
   Changed |= mergeBlocksIntoPredecessors(L, DT, LI, MSSAU);
 
diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp
index ce6ecea2dc2a8dcf6bdd3059175b095811d32b4f..540d19fc7939199b63c7c753fe1b9880583e4f84 100644
--- a/lib/Transforms/Scalar/LoopSink.cpp
+++ b/lib/Transforms/Scalar/LoopSink.cpp
@@ -280,6 +280,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
   // Compute alias set.
   for (BasicBlock *BB : L.blocks())
     CurAST.add(*BB);
+  CurAST.add(*Preheader);
 
   // Sort loop's basic blocks by frequency
   SmallVector<BasicBlock *, 10> ColdLoopBBs;
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 857b83da96d8cf84502e6bd4c3d3374bc2b0a04b..773ffb9df0a2804a7d70898945c73f99dcf8ef90 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -155,6 +155,11 @@ static cl::opt<bool> FilterSameScaledReg(
     cl::desc("Narrow LSR search space by filtering non-optimal formulae"
              " with the same ScaledReg and Scale"));
 
+static cl::opt<unsigned> ComplexityLimit(
+  "lsr-complexity-limit", cl::Hidden,
+  cl::init(std::numeric_limits<uint16_t>::max()),
+  cl::desc("LSR search space complexity limit"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -3638,32 +3643,62 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
                                        Formula Base) {
   // This method is only interesting on a plurality of registers.
-  if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
+  if (Base.BaseRegs.size() + (Base.Scale == 1) +
+      (Base.UnfoldedOffset != 0) <= 1)
     return;
 
   // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
   // processing the formula.
   Base.unscale();
-  Formula F = Base;
-  F.BaseRegs.clear();
   SmallVector<const SCEV *, 4> Ops;
+  Formula NewBase = Base;
+  NewBase.BaseRegs.clear();
+  Type *CombinedIntegerType = nullptr;
   for (const SCEV *BaseReg : Base.BaseRegs) {
     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
-        !SE.hasComputableLoopEvolution(BaseReg, L))
+        !SE.hasComputableLoopEvolution(BaseReg, L)) {
+      if (!CombinedIntegerType)
+        CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
       Ops.push_back(BaseReg);
+    }
     else
-      F.BaseRegs.push_back(BaseReg);
+      NewBase.BaseRegs.push_back(BaseReg);
   }
-  if (Ops.size() > 1) {
-    const SCEV *Sum = SE.getAddExpr(Ops);
+
+  // If no register is relevant, we're done.
+  if (Ops.size() == 0)
+    return;
+
+  // Utility function for generating the required variants of the combined
+  // registers.
+  auto GenerateFormula = [&](const SCEV *Sum) {
+    Formula F = NewBase;
+
     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
     // opportunity to fold something. For now, just ignore such cases
     // rather than proceed with zero in a register.
-    if (!Sum->isZero()) {
-      F.BaseRegs.push_back(Sum);
-      F.canonicalize(*L);
-      (void)InsertFormula(LU, LUIdx, F);
-    }
+    if (Sum->isZero())
+      return;
+
+    F.BaseRegs.push_back(Sum);
+    F.canonicalize(*L);
+    (void)InsertFormula(LU, LUIdx, F);
+  };
+
+  // If we collected at least two registers, generate a formula combining them.
+  if (Ops.size() > 1) {
+    SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
+    GenerateFormula(SE.getAddExpr(OpsCopy));
+  }
+
+  // If we have an unfolded offset, generate a formula combining it with the
+  // registers collected.
+  if (NewBase.UnfoldedOffset) {
+    assert(CombinedIntegerType && "Missing a type for the unfolded offset");
+    Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
+                                 true));
+    NewBase.UnfoldedOffset = 0;
+    GenerateFormula(SE.getAddExpr(Ops));
   }
 }
 
@@ -4281,9 +4316,6 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
   });
 }
 
-// This is a rough guess that seems to work fairly well.
-static const size_t ComplexityLimit = std::numeric_limits<uint16_t>::max();
-
 /// Estimate the worst-case number of solutions the solver might have to
 /// consider. It almost never considers this many solutions because it prune the
 /// search space, but the pruning isn't always sufficient.
diff --git a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 30dfb9b5dd26459d1e7652b8d5090a84a314c50a..da46210b6fdd21facd3c53d0c59801e880101619 100644
--- a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -56,6 +56,20 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loop-unroll-and-jam"
 
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopUnrollAndJamFollowupAll =
+    "llvm.loop.unroll_and_jam.followup_all";
+static const char *const LLVMLoopUnrollAndJamFollowupInner =
+    "llvm.loop.unroll_and_jam.followup_inner";
+static const char *const LLVMLoopUnrollAndJamFollowupOuter =
+    "llvm.loop.unroll_and_jam.followup_outer";
+static const char *const LLVMLoopUnrollAndJamFollowupRemainderInner =
+    "llvm.loop.unroll_and_jam.followup_remainder_inner";
+static const char *const LLVMLoopUnrollAndJamFollowupRemainderOuter =
+    "llvm.loop.unroll_and_jam.followup_remainder_outer";
+/// @}
+
 static cl::opt<bool>
     AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
                       cl::desc("Allows loops to be unroll-and-jammed."));
@@ -112,11 +126,6 @@ static bool HasUnrollAndJamEnablePragma(const Loop *L) {
   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
 }
 
-// Returns true if the loop has an unroll_and_jam(disable) pragma.
-static bool HasUnrollAndJamDisablePragma(const Loop *L) {
-  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable");
-}
-
 // If loop has an unroll_and_jam_count pragma return the (necessarily
 // positive) value from the pragma.  Otherwise return 0.
 static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
@@ -299,13 +308,16 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
                     << L->getHeader()->getParent()->getName() << "] Loop %"
                     << L->getHeader()->getName() << "\n");
 
+  TransformationMode EnableMode = hasUnrollAndJamTransformation(L);
+  if (EnableMode & TM_Disable)
+    return LoopUnrollResult::Unmodified;
+
   // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
   // the unroller, so long as it does not explicitly have unroll_and_jam
   // metadata. This means #pragma nounroll will disable unroll and jam as well
   // as unrolling
-  if (HasUnrollAndJamDisablePragma(L) ||
-      (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
-       !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
+  if (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+      !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam.")) {
     LLVM_DEBUG(dbgs() << "  Disabled due to pragma.\n");
     return LoopUnrollResult::Unmodified;
   }
@@ -344,6 +356,19 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
     return LoopUnrollResult::Unmodified;
   }
 
+  // Save original loop IDs for after the transformation.
+  MDNode *OrigOuterLoopID = L->getLoopID();
+  MDNode *OrigSubLoopID = SubLoop->getLoopID();
+
+  // To assign the loop id of the epilogue, assign it before unrolling it so it
+  // is applied to every inner loop of the epilogue. We later apply the loop ID
+  // for the jammed inner loop.
+  Optional<MDNode *> NewInnerEpilogueLoopID = makeFollowupLoopID(
+      OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+                        LLVMLoopUnrollAndJamFollowupRemainderInner});
+  if (NewInnerEpilogueLoopID.hasValue())
+    SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());
+
   // Find trip count and trip multiple
   unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
   unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
@@ -359,9 +384,39 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
   if (OuterTripCount && UP.Count > OuterTripCount)
     UP.Count = OuterTripCount;
 
-  LoopUnrollResult UnrollResult =
-      UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
-                       UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);
+  Loop *EpilogueOuterLoop = nullptr;
+  LoopUnrollResult UnrollResult = UnrollAndJamLoop(
+      L, UP.Count, OuterTripCount, OuterTripMultiple, UP.UnrollRemainder, LI,
+      &SE, &DT, &AC, &ORE, &EpilogueOuterLoop);
+
+  // Assign new loop attributes.
+  if (EpilogueOuterLoop) {
+    Optional<MDNode *> NewOuterEpilogueLoopID = makeFollowupLoopID(
+        OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+                          LLVMLoopUnrollAndJamFollowupRemainderOuter});
+    if (NewOuterEpilogueLoopID.hasValue())
+      EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue());
+  }
+
+  Optional<MDNode *> NewInnerLoopID =
+      makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+                                           LLVMLoopUnrollAndJamFollowupInner});
+  if (NewInnerLoopID.hasValue())
+    SubLoop->setLoopID(NewInnerLoopID.getValue());
+  else
+    SubLoop->setLoopID(OrigSubLoopID);
+
+  if (UnrollResult == LoopUnrollResult::PartiallyUnrolled) {
+    Optional<MDNode *> NewOuterLoopID = makeFollowupLoopID(
+        OrigOuterLoopID,
+        {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter});
+    if (NewOuterLoopID.hasValue()) {
+      L->setLoopID(NewOuterLoopID.getValue());
+
+      // Do not setLoopAlreadyUnrolled if a followup was given.
+      return UnrollResult;
+    }
+  }
 
   // If loop has an unroll count pragma or unrolled by explicitly set count
   // mark loop as unrolled to prevent unrolling beyond that requested.
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index d10dae124a79477cd383c4c41584e2d942a386fa..38b80f48ed0e2ccf5bb86432db3905d9e6b915b4 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -661,11 +661,6 @@ static bool HasUnrollEnablePragma(const Loop *L) {
   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
 }
 
-// Returns true if the loop has an unroll(disable) pragma.
-static bool HasUnrollDisablePragma(const Loop *L) {
-  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
-}
-
 // Returns true if the loop has an runtime unroll(disable) pragma.
 static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
@@ -713,12 +708,19 @@ static uint64_t getUnrolledLoopSize(
 
 // Returns true if unroll count was set explicitly.
 // Calculates unroll count and writes it to UP.Count.
+// Unless IgnoreUser is true, will also use metadata and command-line options
+// that are specific to to the LoopUnroll pass (which, for instance, are
+// irrelevant for the LoopUnrollAndJam pass).
+// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
+// many LoopUnroll-specific options. The shared functionality should be
+// refactored into it own function.
 bool llvm::computeUnrollCount(
     Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
     ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
     OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
     unsigned &TripMultiple, unsigned LoopSize,
     TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
+
   // Check for explicit Count.
   // 1st priority is unroll count set by "unroll-count" option.
   bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
@@ -963,13 +965,15 @@ static LoopUnrollResult tryToUnrollLoop(
     Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
     const TargetTransformInfo &TTI, AssumptionCache &AC,
     OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel,
-    Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold,
-    Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime,
-    Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) {
+    bool OnlyWhenForced, Optional<unsigned> ProvidedCount,
+    Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
+    Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
+    Optional<bool> ProvidedAllowPeeling) {
   LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                     << L->getHeader()->getParent()->getName() << "] Loop %"
                     << L->getHeader()->getName() << "\n");
-  if (HasUnrollDisablePragma(L))
+  TransformationMode TM = hasUnrollTransformation(L);
+  if (TM & TM_Disable)
     return LoopUnrollResult::Unmodified;
   if (!L->isLoopSimplifyForm()) {
     LLVM_DEBUG(
@@ -977,6 +981,11 @@ static LoopUnrollResult tryToUnrollLoop(
     return LoopUnrollResult::Unmodified;
   }
 
+  // When automtatic unrolling is disabled, do not unroll unless overridden for
+  // this loop.
+  if (OnlyWhenForced && !(TM & TM_Enable))
+    return LoopUnrollResult::Unmodified;
+
   unsigned NumInlineCandidates;
   bool NotDuplicatable;
   bool Convergent;
@@ -1066,14 +1075,39 @@ static LoopUnrollResult tryToUnrollLoop(
   if (TripCount && UP.Count > TripCount)
     UP.Count = TripCount;
 
+  // Save loop properties before it is transformed.
+  MDNode *OrigLoopID = L->getLoopID();
+
   // Unroll the loop.
+  Loop *RemainderLoop = nullptr;
   LoopUnrollResult UnrollResult = UnrollLoop(
       L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
       UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
-      LI, &SE, &DT, &AC, &ORE, PreserveLCSSA);
+      LI, &SE, &DT, &AC, &ORE, PreserveLCSSA, &RemainderLoop);
   if (UnrollResult == LoopUnrollResult::Unmodified)
     return LoopUnrollResult::Unmodified;
 
+  if (RemainderLoop) {
+    Optional<MDNode *> RemainderLoopID =
+        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
+                                        LLVMLoopUnrollFollowupRemainder});
+    if (RemainderLoopID.hasValue())
+      RemainderLoop->setLoopID(RemainderLoopID.getValue());
+  }
+
+  if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
+    Optional<MDNode *> NewLoopID =
+        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
+                                        LLVMLoopUnrollFollowupUnrolled});
+    if (NewLoopID.hasValue()) {
+      L->setLoopID(NewLoopID.getValue());
+
+      // Do not setLoopAlreadyUnrolled if loop attributes have been specified
+      // explicitly.
+      return UnrollResult;
+    }
+  }
+
   // If loop has an unroll count pragma or unrolled by explicitly set count
   // mark loop as unrolled to prevent unrolling beyond that requested.
   // If the loop was peeled, we already "used up" the profile information
@@ -1092,6 +1126,12 @@ public:
   static char ID; // Pass ID, replacement for typeid
 
   int OptLevel;
+
+  /// If false, use a cost model to determine whether unrolling of a loop is
+  /// profitable. If true, only loops that explicitly request unrolling via
+  /// metadata are considered. All other loops are skipped.
+  bool OnlyWhenForced;
+
   Optional<unsigned> ProvidedCount;
   Optional<unsigned> ProvidedThreshold;
   Optional<bool> ProvidedAllowPartial;
@@ -1099,15 +1139,16 @@ public:
   Optional<bool> ProvidedUpperBound;
   Optional<bool> ProvidedAllowPeeling;
 
-  LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
+  LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
+             Optional<unsigned> Threshold = None,
              Optional<unsigned> Count = None,
              Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
              Optional<bool> UpperBound = None,
              Optional<bool> AllowPeeling = None)
-      : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
-        ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
-        ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
-        ProvidedAllowPeeling(AllowPeeling) {
+      : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
+        ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold),
+        ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime),
+        ProvidedUpperBound(UpperBound), ProvidedAllowPeeling(AllowPeeling) {
     initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
   }
 
@@ -1130,8 +1171,8 @@ public:
     bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
 
     LoopUnrollResult Result = tryToUnrollLoop(
-        L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount,
-        ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
+        L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, OnlyWhenForced,
+        ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
         ProvidedUpperBound, ProvidedAllowPeeling);
 
     if (Result == LoopUnrollResult::FullyUnrolled)
@@ -1161,14 +1202,16 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 
-Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
-                                 int AllowPartial, int Runtime, int UpperBound,
+Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
+                                 int Threshold, int Count, int AllowPartial,
+                                 int Runtime, int UpperBound,
                                  int AllowPeeling) {
   // TODO: It would make more sense for this function to take the optionals
   // directly, but that's dangerous since it would silently break out of tree
   // callers.
   return new LoopUnroll(
-      OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
+      OptLevel, OnlyWhenForced,
+      Threshold == -1 ? None : Optional<unsigned>(Threshold),
       Count == -1 ? None : Optional<unsigned>(Count),
       AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
       Runtime == -1 ? None : Optional<bool>(Runtime),
@@ -1176,8 +1219,8 @@ Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
       AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
 }
 
-Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
-  return createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0, 0);
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced) {
+  return createLoopUnrollPass(OptLevel, OnlyWhenForced, -1, -1, 0, 0, 0, 0);
 }
 
 PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
@@ -1207,7 +1250,8 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
 
   bool Changed =
       tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
-                      /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+                      /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced,
+                      /*Count*/ None,
                       /*Threshold*/ None, /*AllowPartial*/ false,
                       /*Runtime*/ false, /*UpperBound*/ false,
                       /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified;
@@ -1344,7 +1388,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
     // flavors of unrolling during construction time (by setting UnrollOpts).
     LoopUnrollResult Result = tryToUnrollLoop(
         &L, DT, &LI, SE, TTI, AC, ORE,
-        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*Count*/ None,
+        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
+        /*Count*/ None,
         /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
         UnrollOpts.AllowUpperBound, LocalAllowPeeling);
     Changed |= Result != LoopUnrollResult::Unmodified;
diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 06e86081e8a0c31cc286a1aefb66f776d9bdb6d5..c0c59d24dffc708b61d582e4ef951a24f91ce322 100644
--- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -594,6 +594,11 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
 
   if (skipLoop(L))
     return false;
+
+  // Do not do the transformation if disabled by metadata.
+  if (hasLICMVersioningTransformation(L) & TM_Disable)
+    return false;
+
   // Get Analysis information.
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
diff --git a/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ba3994eba0ed57aac65aa2a1d98357f9cf257d1
--- /dev/null
+++ b/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
@@ -0,0 +1,120 @@
+//===- MakeGuardsExplicit.cpp - Turn guard intrinsics into guard branches -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the @llvm.experimental.guard intrinsic to the new form of
+// guard represented as widenable explicit branch to the deopt block. The
+// difference between this pass and LowerGuardIntrinsic is that after this pass
+// the guard represented as intrinsic:
+//
+//   call void(i1, ...) @llvm.experimental.guard(i1 %old_cond) [ "deopt"() ]
+//
+// transforms to a guard represented as widenable explicit branch:
+//
+//   %widenable_cond = call i1 @llvm.experimental.widenable.condition()
+//   br i1 (%old_cond & %widenable_cond), label %guarded, label %deopt
+//
+// Here:
+//   - The semantics of @llvm.experimental.widenable.condition allows to replace
+//     %widenable_cond with the construction (%widenable_cond & %any_other_cond)
+//     without loss of correctness;
+//   - %guarded is the lower part of old guard intrinsic's parent block split by
+//     the intrinsic call;
+//   - %deopt is a block containing a sole call to @llvm.experimental.deoptimize
+//     intrinsic.
+//
+// Therefore, this branch preserves the property of widenability.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MakeGuardsExplicit.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+
+using namespace llvm;
+
+namespace {
+struct MakeGuardsExplicitLegacyPass : public FunctionPass {
+  static char ID;
+  MakeGuardsExplicitLegacyPass() : FunctionPass(ID) {
+    initializeMakeGuardsExplicitLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+}
+
+static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) {
+  // Replace the guard with an explicit branch (just like in GuardWidening).
+  BasicBlock *BB = Guard->getParent();
+  makeGuardControlFlowExplicit(DeoptIntrinsic, Guard);
+  BranchInst *ExplicitGuard = cast<BranchInst>(BB->getTerminator());
+  assert(ExplicitGuard->isConditional() && "Must be!");
+
+  // We want the guard to be expressed as explicit control flow, but still be
+  // widenable. For that, we add Widenable Condition intrinsic call to the
+  // guard's condition.
+  IRBuilder<> B(ExplicitGuard);
+  auto *WidenableCondition =
+      B.CreateIntrinsic(Intrinsic::experimental_widenable_condition,
+                        {}, {}, nullptr, "widenable_cond");
+  WidenableCondition->setCallingConv(Guard->getCallingConv());
+  auto *NewCond =
+      B.CreateAnd(ExplicitGuard->getCondition(), WidenableCondition);
+  NewCond->setName("exiplicit_guard_cond");
+  ExplicitGuard->setCondition(NewCond);
+  Guard->eraseFromParent();
+}
+
+static bool explicifyGuards(Function &F) {
+  // Check if we can cheaply rule out the possibility of not having any work to
+  // do.
+  auto *GuardDecl = F.getParent()->getFunction(
+      Intrinsic::getName(Intrinsic::experimental_guard));
+  if (!GuardDecl || GuardDecl->use_empty())
+    return false;
+
+  SmallVector<CallInst *, 8> GuardIntrinsics;
+  for (auto &I : instructions(F))
+    if (isGuard(&I))
+      GuardIntrinsics.push_back(cast<CallInst>(&I));
+
+  if (GuardIntrinsics.empty())
+    return false;
+
+  auto *DeoptIntrinsic = Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
+  DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
+
+  for (auto *Guard : GuardIntrinsics)
+    turnToExplicitForm(Guard, DeoptIntrinsic);
+
+  return true;
+}
+
+bool MakeGuardsExplicitLegacyPass::runOnFunction(Function &F) {
+  return explicifyGuards(F);
+}
+
+char MakeGuardsExplicitLegacyPass::ID = 0;
+INITIALIZE_PASS(MakeGuardsExplicitLegacyPass, "make-guards-explicit",
+                "Lower the guard intrinsic to explicit control flow form",
+                false, false)
+
+PreservedAnalyses MakeGuardsExplicitPass::run(Function &F,
+                                           FunctionAnalysisManager &) {
+  if (explicifyGuards(F))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 4e82e2bd42c008ffefe5e5527bca37c06ee68314..8756a1afcddfb52f2ffec90e56f3df0ebeb37e40 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1144,6 +1144,21 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
   return true;
 }
 
+/// Determine whether the instruction has undefined content for the given Size,
+/// either because it was freshly alloca'd or started its lifetime.
+static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
+  if (isa<AllocaInst>(I))
+    return true;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+        if (LTSize->getZExtValue() >= Size->getZExtValue())
+          return true;
+
+  return false;
+}
+
 /// Transform memcpy to memset when its source was just memset.
 /// In other words, turn:
 /// \code
@@ -1167,12 +1182,27 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
   if (!AA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
     return false;
 
-  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+  // A known memset size is required.
   ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+  if (!MemSetSize)
+    return false;
+
   // Make sure the memcpy doesn't read any more than what the memset wrote.
   // Don't worry about sizes larger than i64.
-  if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
-    return false;
+  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+  if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
+    // If the memcpy is larger than the memset, but the memory was undef prior
+    // to the memset, we can just ignore the tail. Technically we're only
+    // interested in the bytes from MemSetSize..CopySize here, but as we can't
+    // easily represent this location, we use the full 0..CopySize range.
+    MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
+    MemDepResult DepInfo = MD->getPointerDependencyFrom(
+        MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
+    if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
+      CopySize = MemSetSize;
+    else
+      return false;
+  }
 
   IRBuilder<> Builder(MemCpy);
   Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
@@ -1252,19 +1282,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep);
   } else if (SrcDepInfo.isDef()) {
-    Instruction *I = SrcDepInfo.getInst();
-    bool hasUndefContents = false;
-
-    if (isa<AllocaInst>(I)) {
-      hasUndefContents = true;
-    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      if (II->getIntrinsicID() == Intrinsic::lifetime_start)
-        if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
-          if (LTSize->getZExtValue() >= CopySize->getZExtValue())
-            hasUndefContents = true;
-    }
-
-    if (hasUndefContents) {
+    if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
       MD->removeInstruction(M);
       M->eraseFromParent();
       ++NumMemCpyInstr;
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 9803bcb485d26994f17c82a20a8809bdd2f9b818..7cbb0fe70f820762001e702f0b8ab81418e3af37 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -3175,8 +3175,7 @@ bool NewGVN::singleReachablePHIPath(
   auto FilteredPhiArgs =
       make_filter_range(MP->operands(), ReachableOperandPred);
   SmallVector<const Value *, 32> OperandList;
-  std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
-            std::back_inserter(OperandList));
+  llvm::copy(FilteredPhiArgs, std::back_inserter(OperandList));
   bool Okay = is_splat(OperandList);
   if (Okay)
     return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
@@ -4094,10 +4093,13 @@ bool NewGVN::eliminateInstructions(Function &F) {
           // It's about to be alive again.
           if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
             ProbablyDead.erase(cast<Instruction>(DominatingLeader));
-          // Copy instructions, however, are still dead because we use their
-          // operand as the leader.
-          if (LeaderUseCount == 0 && isSSACopy)
-            ProbablyDead.insert(II);
+          // For copy instructions, we use their operand as a leader,
+          // which means we remove a user of the copy and it may become dead.
+          if (isSSACopy) {
+            unsigned &IIUseCount = UseCounts[II];
+            if (--IIUseCount == 0)
+              ProbablyDead.insert(II);
+          }
           ++LeaderUseCount;
           AnythingReplaced = true;
         }
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 61b6d7ca259300f1f5435fe30752c878b2590d01..cb893eab1654f6968cd3812758b26628f0495789 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -789,13 +789,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
       // Discard any debug info related to the expressions that has changed (we
       // can leave debug infor related to the root, since the result of the
       // expression tree should be the same even after reassociation).
-      SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
-      findDbgUsers(DbgUsers, ExpressionChanged);
-      for (auto *DII : DbgUsers) {
-        Value *Undef = UndefValue::get(ExpressionChanged->getType());
-        DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
-                                                ValueAsMetadata::get(Undef)));
-      }
+      replaceDbgUsesWithUndef(ExpressionChanged);
 
       ExpressionChanged->moveBefore(I);
       ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 1f98128f923a6301593d5c69b97af4d8b9d4d874..2f6ed05c023b1e6bb63835fb59682ebc334b1786 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -247,19 +247,25 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
   using Edge = std::pair<BasicBlock *, BasicBlock *>;
   DenseSet<Edge> KnownFeasibleEdges;
 
-  DenseMap<Function *, std::unique_ptr<PredicateInfo>> PredInfos;
+  DenseMap<Function *, AnalysisResultsForFn> AnalysisResults;
   DenseMap<Value *, SmallPtrSet<User *, 2>> AdditionalUsers;
 
 public:
-  void addPredInfo(Function &F, std::unique_ptr<PredicateInfo> PI) {
-    PredInfos[&F] = std::move(PI);
+  void addAnalysis(Function &F, AnalysisResultsForFn A) {
+    AnalysisResults.insert({&F, std::move(A)});
   }
 
   const PredicateBase *getPredicateInfoFor(Instruction *I) {
-    auto PI = PredInfos.find(I->getFunction());
-    if (PI == PredInfos.end())
+    auto A = AnalysisResults.find(I->getParent()->getParent());
+    if (A == AnalysisResults.end())
       return nullptr;
-    return PI->second->getPredicateInfoFor(I);
+    return A->second.PredInfo->getPredicateInfoFor(I);
+  }
+
+  DomTreeUpdater getDTU(Function &F) {
+    auto A = AnalysisResults.find(&F);
+    assert(A != AnalysisResults.end() && "Need analysis results for function.");
+    return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy};
   }
 
   SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
@@ -1165,26 +1171,26 @@ void SCCPSolver::visitCallSite(CallSite CS) {
       if (!PI)
         return;
 
-      auto *PBranch = dyn_cast<PredicateBranch>(getPredicateInfoFor(I));
+      Value *CopyOf = I->getOperand(0);
+      auto *PBranch = dyn_cast<PredicateBranch>(PI);
       if (!PBranch) {
-        mergeInValue(ValueState[I], I, getValueState(PI->OriginalOp));
+        mergeInValue(ValueState[I], I, getValueState(CopyOf));
         return;
       }
 
-      Value *CopyOf = I->getOperand(0);
       Value *Cond = PBranch->Condition;
 
       // Everything below relies on the condition being a comparison.
       auto *Cmp = dyn_cast<CmpInst>(Cond);
       if (!Cmp) {
-        mergeInValue(ValueState[I], I, getValueState(PI->OriginalOp));
+        mergeInValue(ValueState[I], I, getValueState(CopyOf));
         return;
       }
 
       Value *CmpOp0 = Cmp->getOperand(0);
       Value *CmpOp1 = Cmp->getOperand(1);
       if (CopyOf != CmpOp0 && CopyOf != CmpOp1) {
-        mergeInValue(ValueState[I], I, getValueState(PI->OriginalOp));
+        mergeInValue(ValueState[I], I, getValueState(CopyOf));
         return;
       }
 
@@ -1211,7 +1217,7 @@ void SCCPSolver::visitCallSite(CallSite CS) {
         return;
       }
 
-      return (void)mergeInValue(IV, I, getValueState(PBranch->OriginalOp));
+      return (void)mergeInValue(IV, I, getValueState(CopyOf));
     }
   }
 
@@ -1927,10 +1933,9 @@ static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) {
   }
 }
 
-
 bool llvm::runIPSCCP(
     Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI,
-    function_ref<std::unique_ptr<PredicateInfo>(Function &)> getPredicateInfo) {
+    function_ref<AnalysisResultsForFn(Function &)> getAnalysis) {
   SCCPSolver Solver(DL, TLI);
 
   // Loop over all functions, marking arguments to those with their addresses
@@ -1939,7 +1944,8 @@ bool llvm::runIPSCCP(
     if (F.isDeclaration())
       continue;
 
-    Solver.addPredInfo(F, getPredicateInfo(F));
+    Solver.addAnalysis(F, getAnalysis(F));
+
     // Determine if we can track the function's return values. If so, add the
     // function to the solver's set of return-tracked functions.
     if (canTrackReturnsInterprocedurally(&F))
@@ -1988,12 +1994,13 @@ bool llvm::runIPSCCP(
 
   // Iterate over all of the instructions in the module, replacing them with
   // constants if we have found them to be of constant values.
-  SmallVector<BasicBlock*, 512> BlocksToErase;
 
   for (Function &F : M) {
     if (F.isDeclaration())
       continue;
 
+    SmallVector<BasicBlock *, 512> BlocksToErase;
+
     if (Solver.isBlockExecutable(&F.front()))
       for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
            ++AI) {
@@ -2029,23 +2036,26 @@ bool llvm::runIPSCCP(
       }
     }
 
-    // Change dead blocks to unreachable. We do it after replacing constants in
-    // all executable blocks, because changeToUnreachable may remove PHI nodes
-    // in executable blocks we found values for. The function's entry block is
-    // not part of BlocksToErase, so we have to handle it separately.
-    for (BasicBlock *BB : BlocksToErase)
+    DomTreeUpdater DTU = Solver.getDTU(F);
+    // Change dead blocks to unreachable. We do it after replacing constants
+    // in all executable blocks, because changeToUnreachable may remove PHI
+    // nodes in executable blocks we found values for. The function's entry
+    // block is not part of BlocksToErase, so we have to handle it separately.
+    for (BasicBlock *BB : BlocksToErase) {
       NumInstRemoved +=
-          changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
+          changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false,
+                              /*PreserveLCSSA=*/false, &DTU);
+    }
     if (!Solver.isBlockExecutable(&F.front()))
       NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
-                                            /*UseLLVMTrap=*/false);
+                                            /*UseLLVMTrap=*/false,
+                                            /*PreserveLCSSA=*/false, &DTU);
 
-    // Now that all instructions in the function are constant folded, erase dead
-    // blocks, because we can now use ConstantFoldTerminator to get rid of
-    // in-edges.
-    for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+    // Now that all instructions in the function are constant folded,
+    // use ConstantFoldTerminator to get rid of in-edges, record DT updates and
+    // delete dead BBs.
+    for (BasicBlock *DeadBB : BlocksToErase) {
       // If there are any PHI nodes in this successor, drop entries for BB now.
-      BasicBlock *DeadBB = BlocksToErase[i];
       for (Value::user_iterator UI = DeadBB->user_begin(),
                                 UE = DeadBB->user_end();
            UI != UE;) {
@@ -2060,16 +2070,16 @@ bool llvm::runIPSCCP(
         // If we have forced an edge for an indeterminate value, then force the
         // terminator to fold to that edge.
         forceIndeterminateEdge(I, Solver);
-        bool Folded = ConstantFoldTerminator(I->getParent());
+        bool Folded = ConstantFoldTerminator(I->getParent(),
+                                             /*DeleteDeadConditions=*/false,
+                                             /*TLI=*/nullptr, &DTU);
         assert(Folded &&
               "Expect TermInst on constantint or blockaddress to be folded");
         (void) Folded;
       }
-
-      // Finally, delete the basic block.
-      F.getBasicBlockList().erase(DeadBB);
+      // Mark dead BB for deletion.
+      DTU.deleteBB(DeadBB);
     }
-    BlocksToErase.clear();
 
     for (BasicBlock &BB : F) {
       for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index a8b9ee566395a3b6cabc74ba05eaa2afef4795ad..81eea0dee4e4425fb192aae64e92d86bd5efa0ec 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -3164,7 +3164,12 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
   /// value (as opposed to the user).
   Use *U;
 
+  /// Used to calculate offsets, and hence alignment, of subobjects.
+  const DataLayout &DL;
+
 public:
+  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
+
   /// Rewrite loads and stores through a pointer and all pointers derived from
   /// it.
   bool rewrite(Instruction &I) {
@@ -3208,10 +3213,22 @@ private:
     /// split operations.
     Value *Ptr;
 
+    /// The base pointee type being GEPed into.
+    Type *BaseTy;
+
+    /// Known alignment of the base pointer.
+    unsigned BaseAlign;
+
+    /// To calculate offset of each component so we can correctly deduce
+    /// alignments.
+    const DataLayout &DL;
+
     /// Initialize the splitter with an insertion point, Ptr and start with a
     /// single zero GEP index.
-    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
-        : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+    OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+               unsigned BaseAlign, const DataLayout &DL)
+        : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
+          BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
 
   public:
     /// Generic recursive split emission routine.
@@ -3228,8 +3245,11 @@ private:
     /// \param Agg The aggregate value being built up or stored, depending on
     /// whether this is splitting a load or a store respectively.
     void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
-      if (Ty->isSingleValueType())
-        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+      if (Ty->isSingleValueType()) {
+        unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
+        return static_cast<Derived *>(this)->emitFunc(
+            Ty, Agg, MinAlign(BaseAlign, Offset), Name);
+      }
 
       if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
         unsigned OldSize = Indices.size();
@@ -3268,17 +3288,19 @@ private:
   struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
     AAMDNodes AATags;
 
-    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
-        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
+    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+                   AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL)
+        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
+                                     DL), AATags(AATags) {}
 
     /// Emit a leaf load of a single value. This is called at the leaves of the
     /// recursive emission to actually load values.
-    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+    void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) {
       assert(Ty->isSingleValueType());
       // Load the single value and insert it using the indices.
       Value *GEP =
           IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
-      LoadInst *Load = IRB.CreateLoad(GEP, Name + ".load");
+      LoadInst *Load = IRB.CreateAlignedLoad(GEP, Align, Name + ".load");
       if (AATags)
         Load->setAAMetadata(AATags);
       Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
@@ -3295,7 +3317,8 @@ private:
     LLVM_DEBUG(dbgs() << "    original: " << LI << "\n");
     AAMDNodes AATags;
     LI.getAAMetadata(AATags);
-    LoadOpSplitter Splitter(&LI, *U, AATags);
+    LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
+                            getAdjustedAlignment(&LI, 0, DL), DL);
     Value *V = UndefValue::get(LI.getType());
     Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
     LI.replaceAllUsesWith(V);
@@ -3304,13 +3327,15 @@ private:
   }
 
   struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
-    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags)
-        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {}
+    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+                    AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL)
+        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
+                                      DL),
+          AATags(AATags) {}
     AAMDNodes AATags;
-
     /// Emit a leaf store of a single value. This is called at the leaves of the
     /// recursive emission to actually produce stores.
-    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+    void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) {
       assert(Ty->isSingleValueType());
       // Extract the single value and store it using the indices.
       //
@@ -3320,7 +3345,8 @@ private:
           IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
       Value *InBoundsGEP =
           IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
-      StoreInst *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
+      StoreInst *Store =
+          IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Align);
       if (AATags)
         Store->setAAMetadata(AATags);
       LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
@@ -3338,7 +3364,8 @@ private:
     LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
     AAMDNodes AATags;
     SI.getAAMetadata(AATags);
-    StoreOpSplitter Splitter(&SI, *U, AATags);
+    StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
+                             getAdjustedAlignment(&SI, 0, DL), DL);
     Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
     SI.eraseFromParent();
     return true;
@@ -4356,7 +4383,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
 
   // First, split any FCA loads and stores touching this alloca to promote
   // better splitting and promotion opportunities.
-  AggLoadStoreRewriter AggRewriter;
+  AggLoadStoreRewriter AggRewriter(DL);
   Changed |= AggRewriter.rewrite(AI);
 
   // Build the slices using a recursive instruction-visiting builder.
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 1b140acbaeee1fa6d8c24a813d37cef9d9d129e4..976daf4c78c2fd2c3ab86e755c1e2055aadfd9de 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/Verifier.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 
@@ -43,7 +44,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeDCELegacyPassPass(Registry);
   initializeDeadInstEliminationPass(Registry);
   initializeDivRemPairsLegacyPassPass(Registry);
-  initializeScalarizerPass(Registry);
+  initializeScalarizerLegacyPassPass(Registry);
   initializeDSELegacyPassPass(Registry);
   initializeGuardWideningLegacyPassPass(Registry);
   initializeLoopGuardWideningLegacyPassPass(Registry);
@@ -51,6 +52,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeNewGVNLegacyPassPass(Registry);
   initializeEarlyCSELegacyPassPass(Registry);
   initializeEarlyCSEMemSSALegacyPassPass(Registry);
+  initializeMakeGuardsExplicitLegacyPassPass(Registry);
   initializeGVNHoistLegacyPassPass(Registry);
   initializeGVNSinkLegacyPassPass(Registry);
   initializeFlattenCFGPassPass(Registry);
@@ -73,6 +75,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLoopUnrollPass(Registry);
   initializeLoopUnrollAndJamPass(Registry);
   initializeLoopUnswitchPass(Registry);
+  initializeWarnMissedTransformationsLegacyPass(Registry);
   initializeLoopVersioningLICMPass(Registry);
   initializeLoopIdiomRecognizeLegacyPassPass(Registry);
   initializeLowerAtomicLegacyPassPass(Registry);
diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
index 2f873abca66a95fb3ac9975d6bd1abcbd5006390..3a6f8e6b09595833481e78ff883b3355a0ad3887 100644
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@@ -39,6 +39,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Options.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
 #include <cassert>
 #include <cstdint>
 #include <iterator>
@@ -49,6 +50,13 @@ using namespace llvm;
 
 #define DEBUG_TYPE "scalarizer"
 
+// This is disabled by default because having separate loads and stores
+// makes it more likely that the -combiner-alias-analysis limits will be
+// reached.
+static cl::opt<bool>
+    ScalarizeLoadStore("scalarize-load-store", cl::init(false), cl::Hidden,
+                       cl::desc("Allow the scalarizer pass to scalarize loads and store"));
+
 namespace {
 
 // Used to store the scattered form of a vector.
@@ -152,17 +160,13 @@ struct VectorLayout {
   uint64_t ElemSize = 0;
 };
 
-class Scalarizer : public FunctionPass,
-                   public InstVisitor<Scalarizer, bool> {
+class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
 public:
-  static char ID;
-
-  Scalarizer() : FunctionPass(ID) {
-    initializeScalarizerPass(*PassRegistry::getPassRegistry());
+  ScalarizerVisitor(unsigned ParallelLoopAccessMDKind)
+    : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind) {
   }
 
-  bool doInitialization(Module &M) override;
-  bool runOnFunction(Function &F) override;
+  bool visit(Function &F);
 
   // InstVisitor methods.  They return true if the instruction was scalarized,
   // false if nothing changed.
@@ -180,16 +184,6 @@ public:
   bool visitStoreInst(StoreInst &SI);
   bool visitCallInst(CallInst &ICI);
 
-  static void registerOptions() {
-    // This is disabled by default because having separate loads and stores
-    // makes it more likely that the -combiner-alias-analysis limits will be
-    // reached.
-    OptionRegistry::registerOption<bool, Scalarizer,
-                                 &Scalarizer::ScalarizeLoadStore>(
-        "scalarize-load-store",
-        "Allow the scalarizer pass to scalarize loads and store", false);
-  }
-
 private:
   Scatterer scatter(Instruction *Point, Value *V);
   void gather(Instruction *Op, const ValueVector &CV);
@@ -205,16 +199,28 @@ private:
 
   ScatterMap Scattered;
   GatherList Gathered;
+
   unsigned ParallelLoopAccessMDKind;
-  bool ScalarizeLoadStore;
 };
 
-} // end anonymous namespace
+class ScalarizerLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  ScalarizerLegacyPass() : FunctionPass(ID) {
+    initializeScalarizerLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+};
 
-char Scalarizer::ID = 0;
+} // end anonymous namespace
 
-INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
-                             "Scalarize vector operations", false, false)
+char ScalarizerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
+                      "Scalarize vector operations", false, false)
+INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
+                    "Scalarize vector operations", false, false)
 
 Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
                      ValueVector *cachePtr)
@@ -278,17 +284,22 @@ Value *Scatterer::operator[](unsigned I) {
   return CV[I];
 }
 
-bool Scalarizer::doInitialization(Module &M) {
-  ParallelLoopAccessMDKind =
+bool ScalarizerLegacyPass::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  Module &M = *F.getParent();
+  unsigned ParallelLoopAccessMDKind =
       M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
-  ScalarizeLoadStore =
-      M.getContext().getOption<bool, Scalarizer, &Scalarizer::ScalarizeLoadStore>();
-  return false;
+  ScalarizerVisitor Impl(ParallelLoopAccessMDKind);
+  return Impl.visit(F);
 }
 
-bool Scalarizer::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
+FunctionPass *llvm::createScalarizerPass() {
+  return new ScalarizerLegacyPass();
+}
+
+bool ScalarizerVisitor::visit(Function &F) {
   assert(Gathered.empty() && Scattered.empty());
 
   // To ensure we replace gathered components correctly we need to do an ordered
@@ -297,7 +308,7 @@ bool Scalarizer::runOnFunction(Function &F) {
   for (BasicBlock *BB : RPOT) {
     for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
       Instruction *I = &*II;
-      bool Done = visit(I);
+      bool Done = InstVisitor::visit(I);
       ++II;
       if (Done && I->getType()->isVoidTy())
         I->eraseFromParent();
@@ -308,7 +319,7 @@ bool Scalarizer::runOnFunction(Function &F) {
 
 // Return a scattered form of V that can be accessed by Point.  V must be a
 // vector or a pointer to a vector.
-Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
+Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) {
   if (Argument *VArg = dyn_cast<Argument>(V)) {
     // Put the scattered form of arguments in the entry block,
     // so that it can be used everywhere.
@@ -332,7 +343,7 @@ Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
 // deletion of Op and creation of the gathered form to the end of the pass,
 // so that we can avoid creating the gathered form if all uses of Op are
 // replaced with uses of CV.
-void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
+void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) {
   // Since we're not deleting Op yet, stub out its operands, so that it
   // doesn't make anything live unnecessarily.
   for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
@@ -361,7 +372,7 @@ void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
 
 // Return true if it is safe to transfer the given metadata tag from
 // vector to scalar instructions.
-bool Scalarizer::canTransferMetadata(unsigned Tag) {
+bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) {
   return (Tag == LLVMContext::MD_tbaa
           || Tag == LLVMContext::MD_fpmath
           || Tag == LLVMContext::MD_tbaa_struct
@@ -373,7 +384,7 @@ bool Scalarizer::canTransferMetadata(unsigned Tag) {
 
 // Transfer metadata from Op to the instructions in CV if it is known
 // to be safe to do so.
-void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
+void ScalarizerVisitor::transferMetadata(Instruction *Op, const ValueVector &CV) {
   SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
   Op->getAllMetadataOtherThanDebugLoc(MDs);
   for (unsigned I = 0, E = CV.size(); I != E; ++I) {
@@ -389,7 +400,7 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
 
 // Try to fill in Layout from Ty, returning true on success.  Alignment is
 // the alignment of the vector, or 0 if the ABI default should be used.
-bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
+bool ScalarizerVisitor::getVectorLayout(Type *Ty, unsigned Alignment,
                                  VectorLayout &Layout, const DataLayout &DL) {
   // Make sure we're dealing with a vector.
   Layout.VecTy = dyn_cast<VectorType>(Ty);
@@ -413,7 +424,7 @@ bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
 // Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
 // to create an instruction like I with operands X and Y and name Name.
 template<typename Splitter>
-bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) {
+bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
   VectorType *VT = dyn_cast<VectorType>(I.getType());
   if (!VT)
     return false;
@@ -446,7 +457,7 @@ static Function *getScalarIntrinsicDeclaration(Module *M,
 
 /// If a call to a vector typed intrinsic function, split into a scalar call per
 /// element if possible for the intrinsic.
-bool Scalarizer::splitCall(CallInst &CI) {
+bool ScalarizerVisitor::splitCall(CallInst &CI) {
   VectorType *VT = dyn_cast<VectorType>(CI.getType());
   if (!VT)
     return false;
@@ -504,7 +515,7 @@ bool Scalarizer::splitCall(CallInst &CI) {
   return true;
 }
 
-bool Scalarizer::visitSelectInst(SelectInst &SI) {
+bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) {
   VectorType *VT = dyn_cast<VectorType>(SI.getType());
   if (!VT)
     return false;
@@ -534,19 +545,19 @@ bool Scalarizer::visitSelectInst(SelectInst &SI) {
   return true;
 }
 
-bool Scalarizer::visitICmpInst(ICmpInst &ICI) {
+bool ScalarizerVisitor::visitICmpInst(ICmpInst &ICI) {
   return splitBinary(ICI, ICmpSplitter(ICI));
 }
 
-bool Scalarizer::visitFCmpInst(FCmpInst &FCI) {
+bool ScalarizerVisitor::visitFCmpInst(FCmpInst &FCI) {
   return splitBinary(FCI, FCmpSplitter(FCI));
 }
 
-bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) {
+bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) {
   return splitBinary(BO, BinarySplitter(BO));
 }
 
-bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
   VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
   if (!VT)
     return false;
@@ -592,7 +603,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
   return true;
 }
 
-bool Scalarizer::visitCastInst(CastInst &CI) {
+bool ScalarizerVisitor::visitCastInst(CastInst &CI) {
   VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
   if (!VT)
     return false;
@@ -610,7 +621,7 @@ bool Scalarizer::visitCastInst(CastInst &CI) {
   return true;
 }
 
-bool Scalarizer::visitBitCastInst(BitCastInst &BCI) {
+bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
   VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
   VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
   if (!DstVT || !SrcVT)
@@ -665,7 +676,7 @@ bool Scalarizer::visitBitCastInst(BitCastInst &BCI) {
   return true;
 }
 
-bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   VectorType *VT = dyn_cast<VectorType>(SVI.getType());
   if (!VT)
     return false;
@@ -689,7 +700,7 @@ bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   return true;
 }
 
-bool Scalarizer::visitPHINode(PHINode &PHI) {
+bool ScalarizerVisitor::visitPHINode(PHINode &PHI) {
   VectorType *VT = dyn_cast<VectorType>(PHI.getType());
   if (!VT)
     return false;
@@ -714,7 +725,7 @@ bool Scalarizer::visitPHINode(PHINode &PHI) {
   return true;
 }
 
-bool Scalarizer::visitLoadInst(LoadInst &LI) {
+bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) {
   if (!ScalarizeLoadStore)
     return false;
   if (!LI.isSimple())
@@ -738,7 +749,7 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) {
   return true;
 }
 
-bool Scalarizer::visitStoreInst(StoreInst &SI) {
+bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) {
   if (!ScalarizeLoadStore)
     return false;
   if (!SI.isSimple())
@@ -765,13 +776,13 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) {
   return true;
 }
 
-bool Scalarizer::visitCallInst(CallInst &CI) {
+bool ScalarizerVisitor::visitCallInst(CallInst &CI) {
   return splitCall(CI);
 }
 
 // Delete the instructions that we scalarized.  If a full vector result
 // is still needed, recreate it using InsertElements.
-bool Scalarizer::finish() {
+bool ScalarizerVisitor::finish() {
   // The presence of data in Gathered or Scattered indicates changes
   // made to the Function.
   if (Gathered.empty() && Scattered.empty())
@@ -802,6 +813,11 @@ bool Scalarizer::finish() {
   return true;
 }
 
-FunctionPass *llvm::createScalarizerPass() {
-  return new Scalarizer();
+PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+  Module &M = *F.getParent();
+  unsigned ParallelLoopAccessMDKind =
+      M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+  ScalarizerVisitor Impl(ParallelLoopAccessMDKind);
+  bool Changed = Impl.visit(F);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 368f0925abac0011e658acf2c11ecff2265f4b4c..abec9183f0fe13ae92c6e016dade5d2968966f10 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -25,6 +25,8 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/Utils/Local.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
@@ -62,6 +64,9 @@ STATISTIC(NumBranches, "Number of branches unswitched");
 STATISTIC(NumSwitches, "Number of switches unswitched");
 STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+STATISTIC(
+    NumCostMultiplierSkipped,
+    "Number of unswitch candidates that had their cost multiplier skipped");
 
 static cl::opt<bool> EnableNonTrivialUnswitch(
     "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
@@ -72,6 +77,17 @@ static cl::opt<int>
     UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
                       cl::desc("The cost threshold for unswitching a loop."));
 
+static cl::opt<bool> EnableUnswitchCostMultiplier(
+    "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden,
+    cl::desc("Enable unswitch cost multiplier that prohibits exponential "
+             "explosion in nontrivial unswitch."));
+static cl::opt<int> UnswitchSiblingsToplevelDiv(
+    "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
+    cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
+    "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
+    cl::desc("Number of unswitch candidates that are ignored when calculating "
+             "cost multiplier."));
 static cl::opt<bool> UnswitchGuards(
     "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
     cl::desc("If enabled, simple loop unswitching will also consider "
@@ -335,7 +351,8 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
 /// If `SE` is not null, it will be updated based on the potential loop SCEVs
 /// invalidated by this.
 static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
-                                  LoopInfo &LI, ScalarEvolution *SE) {
+                                  LoopInfo &LI, ScalarEvolution *SE,
+                                  MemorySSAUpdater *MSSAU) {
   assert(BI.isConditional() && "Can only unswitch a conditional branch!");
   LLVM_DEBUG(dbgs() << "  Trying to unswitch branch: " << BI << "\n");
 
@@ -409,11 +426,14 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
       SE->forgetTopmostLoop(&L);
   }
 
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // Split the preheader, so that we know that there is a safe place to insert
   // the conditional branch. We will change the preheader to have a conditional
   // branch on LoopCond.
   BasicBlock *OldPH = L.getLoopPreheader();
-  BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+  BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
 
   // Now that we have a place to insert the conditional branch, create a place
   // to branch to: this is the exit block out of the loop that we are
@@ -425,9 +445,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
            "A branch's parent isn't a predecessor!");
     UnswitchedBB = LoopExitBB;
   } else {
-    UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
+    UnswitchedBB =
+        SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI, MSSAU);
   }
 
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // Actually move the invariant uses into the unswitched position. If possible,
   // we do this by moving the instructions, but when doing partial unswitching
   // we do it by building a new merge of the values in the unswitched position.
@@ -438,12 +462,17 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
     // its successors.
     OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
                                 BI);
+    if (MSSAU) {
+      // Temporarily clone the terminator, to make MSSA update cheaper by
+      // separating "insert edge" updates from "remove edge" ones.
+      ParentBB->getInstList().push_back(BI.clone());
+    } else {
+      // Create a new unconditional branch that will continue the loop as a new
+      // terminator.
+      BranchInst::Create(ContinueBB, ParentBB);
+    }
     BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
     BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
-
-    // Create a new unconditional branch that will continue the loop as a new
-    // terminator.
-    BranchInst::Create(ContinueBB, ParentBB);
   } else {
     // Only unswitching a subset of inputs to the condition, so we will need to
     // build a new branch that merges the invariant inputs.
@@ -459,6 +488,32 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
                                           *UnswitchedBB, *NewPH);
   }
 
+  // Update the dominator tree with the added edge.
+  DT.insertEdge(OldPH, UnswitchedBB);
+
+  // After the dominator tree was updated with the added edge, update MemorySSA
+  // if available.
+  if (MSSAU) {
+    SmallVector<CFGUpdate, 1> Updates;
+    Updates.push_back({cfg::UpdateKind::Insert, OldPH, UnswitchedBB});
+    MSSAU->applyInsertUpdates(Updates, DT);
+  }
+
+  // Finish updating dominator tree and memory ssa for full unswitch.
+  if (FullUnswitch) {
+    if (MSSAU) {
+      // Remove the cloned branch instruction.
+      ParentBB->getTerminator()->eraseFromParent();
+      // Create unconditional branch now.
+      BranchInst::Create(ContinueBB, ParentBB);
+      MSSAU->removeEdge(ParentBB, LoopExitBB);
+    }
+    DT.deleteEdge(ParentBB, LoopExitBB);
+  }
+
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // Rewrite the relevant PHI nodes.
   if (UnswitchedBB == LoopExitBB)
     rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
@@ -466,13 +521,6 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
     rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
                                               *ParentBB, *OldPH, FullUnswitch);
 
-  // Now we need to update the dominator tree.
-  SmallVector<DominatorTree::UpdateType, 2> DTUpdates;
-  DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB});
-  if (FullUnswitch)
-    DTUpdates.push_back({DT.Delete, ParentBB, LoopExitBB});
-  DT.applyUpdates(DTUpdates);
-
   // The constant we can replace all of our invariants with inside the loop
   // body. If any of the invariants have a value other than this the loop won't
   // be entered.
@@ -523,7 +571,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
 /// If `SE` is not null, it will be updated based on the potential loop SCEVs
 /// invalidated by this.
 static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
-                                  LoopInfo &LI, ScalarEvolution *SE) {
+                                  LoopInfo &LI, ScalarEvolution *SE,
+                                  MemorySSAUpdater *MSSAU) {
   LLVM_DEBUG(dbgs() << "  Trying to unswitch switch: " << SI << "\n");
   Value *LoopCond = SI.getCondition();
 
@@ -550,6 +599,9 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
 
   LLVM_DEBUG(dbgs() << "    unswitching trivial switch...\n");
 
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // We may need to invalidate SCEVs for the outermost loop reached by any of
   // the exits.
   Loop *OuterL = &L;
@@ -612,7 +664,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
   // Split the preheader, so that we know that there is a safe place to insert
   // the switch.
   BasicBlock *OldPH = L.getLoopPreheader();
-  BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+  BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
   OldPH->getTerminator()->eraseFromParent();
 
   // Now add the unswitched switch.
@@ -635,9 +687,10 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
       rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
     } else {
       auto *SplitBB =
-          SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
-      rewritePHINodesForExitAndUnswitchedBlocks(
-          *DefaultExitBB, *SplitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true);
+          SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI, MSSAU);
+      rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+                                                *ParentBB, *OldPH,
+                                                /*FullUnswitch*/ true);
       DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
     }
   }
@@ -661,9 +714,10 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
     BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
     if (!SplitExitBB) {
       // If this is the first time we see this, do the split and remember it.
-      SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
-      rewritePHINodesForExitAndUnswitchedBlocks(
-          *ExitBB, *SplitExitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true);
+      SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
+      rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+                                                *ParentBB, *OldPH,
+                                                /*FullUnswitch*/ true);
     }
     // Update the case pair to point to the split block.
     CasePair.second = SplitExitBB;
@@ -740,6 +794,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
     DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB});
   }
   DT.applyUpdates(DTUpdates);
+
+  if (MSSAU) {
+    MSSAU->applyUpdates(DTUpdates, DT);
+    if (VerifyMemorySSA)
+      MSSAU->getMemorySSA()->verifyMemorySSA();
+  }
+
   assert(DT.verify(DominatorTree::VerificationLevel::Fast));
 
   // We may have changed the nesting relationship for this loop so hoist it to
@@ -765,7 +826,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
 /// If `SE` is not null, it will be updated based on the potential loop SCEVs
 /// invalidated by this.
 static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
-                                         LoopInfo &LI, ScalarEvolution *SE) {
+                                         LoopInfo &LI, ScalarEvolution *SE,
+                                         MemorySSAUpdater *MSSAU) {
   bool Changed = false;
 
   // If loop header has only one reachable successor we should keep looking for
@@ -799,7 +861,7 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
       if (isa<Constant>(SI->getCondition()))
         return Changed;
 
-      if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE))
+      if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE, MSSAU))
         // Couldn't unswitch this one so we're done.
         return Changed;
 
@@ -831,7 +893,7 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
 
     // Found a trivial condition candidate: non-foldable conditional branch. If
     // we fail to unswitch this, we can't do anything else that is trivial.
-    if (!unswitchTrivialBranch(L, *BI, DT, LI, SE))
+    if (!unswitchTrivialBranch(L, *BI, DT, LI, SE, MSSAU))
       return Changed;
 
     // Mark that we managed to unswitch something.
@@ -884,7 +946,7 @@ static BasicBlock *buildClonedLoopBlocks(
     const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
     ValueToValueMapTy &VMap,
     SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
-    DominatorTree &DT, LoopInfo &LI) {
+    DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
   SmallVector<BasicBlock *, 4> NewBlocks;
   NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
 
@@ -929,7 +991,7 @@ static BasicBlock *buildClonedLoopBlocks(
     // place to merge the CFG, so split the exit first. This is always safe to
     // do because there cannot be any non-loop predecessors of a loop exit in
     // loop simplified form.
-    auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+    auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
 
     // Rearrange the names to make it easier to write test cases by having the
     // exit block carry the suffix rather than the merge block carrying the
@@ -1360,7 +1422,7 @@ static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
 static void
 deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
                        ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
-                       DominatorTree &DT) {
+                       DominatorTree &DT, MemorySSAUpdater *MSSAU) {
   // Find all the dead clones, and remove them from their successors.
   SmallVector<BasicBlock *, 16> DeadBlocks;
   for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
@@ -1372,6 +1434,13 @@ deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
           DeadBlocks.push_back(ClonedBB);
         }
 
+  // Remove all MemorySSA in the dead blocks
+  if (MSSAU) {
+    SmallPtrSet<BasicBlock *, 16> DeadBlockSet(DeadBlocks.begin(),
+                                               DeadBlocks.end());
+    MSSAU->removeBlocks(DeadBlockSet);
+  }
+
   // Drop any remaining references to break cycles.
   for (BasicBlock *BB : DeadBlocks)
     BB->dropAllReferences();
@@ -1380,10 +1449,10 @@ deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
     BB->eraseFromParent();
 }
 
-static void
-deleteDeadBlocksFromLoop(Loop &L,
-                         SmallVectorImpl<BasicBlock *> &ExitBlocks,
-                         DominatorTree &DT, LoopInfo &LI) {
+static void deleteDeadBlocksFromLoop(Loop &L,
+                                     SmallVectorImpl<BasicBlock *> &ExitBlocks,
+                                     DominatorTree &DT, LoopInfo &LI,
+                                     MemorySSAUpdater *MSSAU) {
   // Find all the dead blocks tied to this loop, and remove them from their
   // successors.
   SmallPtrSet<BasicBlock *, 16> DeadBlockSet;
@@ -1404,6 +1473,10 @@ deleteDeadBlocksFromLoop(Loop &L,
     }
   }
 
+  // Remove all MemorySSA in the dead blocks
+  if (MSSAU)
+    MSSAU->removeBlocks(DeadBlockSet);
+
   // Filter out the dead blocks from the exit blocks list so that it can be
   // used in the caller.
   llvm::erase_if(ExitBlocks,
@@ -1803,7 +1876,7 @@ static void unswitchNontrivialInvariants(
     Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
     SmallVectorImpl<BasicBlock *> &ExitBlocks, DominatorTree &DT, LoopInfo &LI,
     AssumptionCache &AC, function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
-    ScalarEvolution *SE) {
+    ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
   auto *ParentBB = TI.getParent();
   BranchInst *BI = dyn_cast<BranchInst>(&TI);
   SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
@@ -1820,6 +1893,9 @@ static void unswitchNontrivialInvariants(
     assert(isa<Instruction>(BI->getCondition()) &&
            "Partial unswitching requires an instruction as the condition!");
 
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // Constant and BBs tracking the cloned and continuing successor. When we are
   // unswitching the entire condition, this can just be trivially chosen to
   // unswitch towards `true`. However, when we are unswitching a set of
@@ -1860,6 +1936,10 @@ static void unswitchNontrivialInvariants(
 
   // Compute the parent loop now before we start hacking on things.
   Loop *ParentL = L.getParentLoop();
+  // Get blocks in RPO order for MSSA update, before changing the CFG.
+  LoopBlocksRPO LBRPO(&L);
+  if (MSSAU)
+    LBRPO.perform(&LI);
 
   // Compute the outer-most loop containing one of our exit blocks. This is the
   // furthest up our loopnest which can be mutated, which we will use below to
@@ -1909,7 +1989,7 @@ static void unswitchNontrivialInvariants(
   // between the unswitched versions, and we will have a new preheader for the
   // original loop.
   BasicBlock *SplitBB = L.getLoopPreheader();
-  BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI);
+  BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI, MSSAU);
 
   // Keep track of the dominator tree updates needed.
   SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
@@ -1922,7 +2002,7 @@ static void unswitchNontrivialInvariants(
     VMaps.emplace_back(new ValueToValueMapTy());
     ClonedPHs[SuccBB] = buildClonedLoopBlocks(
         L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
-        DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI);
+        DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU);
   }
 
   // The stitching of the branched code back together depends on whether we're
@@ -1930,7 +2010,63 @@ static void unswitchNontrivialInvariants(
   // nuke the initial terminator placed in the split block.
   SplitBB->getTerminator()->eraseFromParent();
   if (FullUnswitch) {
-    // First we need to unhook the successor relationship as we'll be replacing
+    // Splice the terminator from the original loop and rewrite its
+    // successors.
+    SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
+
+    // Keep a clone of the terminator for MSSA updates.
+    Instruction *NewTI = TI.clone();
+    ParentBB->getInstList().push_back(NewTI);
+
+    // First wire up the moved terminator to the preheaders.
+    if (BI) {
+      BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+      BI->setSuccessor(ClonedSucc, ClonedPH);
+      BI->setSuccessor(1 - ClonedSucc, LoopPH);
+      DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+    } else {
+      assert(SI && "Must either be a branch or switch!");
+
+      // Walk the cases and directly update their successors.
+      assert(SI->getDefaultDest() == RetainedSuccBB &&
+             "Not retaining default successor!");
+      SI->setDefaultDest(LoopPH);
+      for (auto &Case : SI->cases())
+        if (Case.getCaseSuccessor() == RetainedSuccBB)
+          Case.setSuccessor(LoopPH);
+        else
+          Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
+
+      // We need to use the set to populate domtree updates as even when there
+      // are multiple cases pointing at the same successor we only want to
+      // remove and insert one edge in the domtree.
+      for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+        DTUpdates.push_back(
+            {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
+    }
+
+    if (MSSAU) {
+      DT.applyUpdates(DTUpdates);
+      DTUpdates.clear();
+
+      // Remove all but one edge to the retained block and all unswitched
+      // blocks. This is to avoid having duplicate entries in the cloned Phis,
+      // when we know we only keep a single edge for each case.
+      MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, RetainedSuccBB);
+      for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+        MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, SuccBB);
+
+      for (auto &VMap : VMaps)
+        MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
+                                   /*IgnoreIncomingWithNoClones=*/true);
+      MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
+
+      // Remove all edges to unswitched blocks.
+      for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+        MSSAU->removeEdge(ParentBB, SuccBB);
+    }
+
+    // Now unhook the successor relationship as we'll be replacing
     // the terminator with a direct branch. This is much simpler for branches
     // than switches so we handle those first.
     if (BI) {
@@ -1948,9 +2084,10 @@ static void unswitchNontrivialInvariants(
       // is a duplicate edge to the retained successor as the retained successor
       // is always the default successor and as we'll replace this with a direct
       // branch we no longer need the duplicate entries in the PHI nodes.
-      assert(SI->getDefaultDest() == RetainedSuccBB &&
+      SwitchInst *NewSI = cast<SwitchInst>(NewTI);
+      assert(NewSI->getDefaultDest() == RetainedSuccBB &&
              "Not retaining default successor!");
-      for (auto &Case : SI->cases())
+      for (auto &Case : NewSI->cases())
         Case.getCaseSuccessor()->removePredecessor(
             ParentBB,
             /*DontDeleteUselessPHIs*/ true);
@@ -1962,34 +2099,8 @@ static void unswitchNontrivialInvariants(
         DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
     }
 
-    // Now that we've unhooked the successor relationship, splice the terminator
-    // from the original loop to the split.
-    SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
-
-    // Now wire up the terminator to the preheaders.
-    if (BI) {
-      BasicBlock *ClonedPH = ClonedPHs.begin()->second;
-      BI->setSuccessor(ClonedSucc, ClonedPH);
-      BI->setSuccessor(1 - ClonedSucc, LoopPH);
-      DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
-    } else {
-      assert(SI && "Must either be a branch or switch!");
-
-      // Walk the cases and directly update their successors.
-      SI->setDefaultDest(LoopPH);
-      for (auto &Case : SI->cases())
-        if (Case.getCaseSuccessor() == RetainedSuccBB)
-          Case.setSuccessor(LoopPH);
-        else
-          Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
-
-      // We need to use the set to populate domtree updates as even when there
-      // are multiple cases pointing at the same successor we only want to
-      // remove and insert one edge in the domtree.
-      for (BasicBlock *SuccBB : UnswitchedSuccBBs)
-        DTUpdates.push_back(
-            {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
-    }
+    // After MSSAU update, remove the cloned terminator instruction NewTI.
+    ParentBB->getTerminator()->eraseFromParent();
 
     // Create a new unconditional branch to the continuing block (as opposed to
     // the one cloned).
@@ -2008,12 +2119,19 @@ static void unswitchNontrivialInvariants(
 
   // Apply the updates accumulated above to get an up-to-date dominator tree.
   DT.applyUpdates(DTUpdates);
+  if (!FullUnswitch && MSSAU) {
+    // Update MSSA for partial unswitch, after DT update.
+    SmallVector<CFGUpdate, 1> Updates;
+    Updates.push_back(
+        {cfg::UpdateKind::Insert, SplitBB, ClonedPHs.begin()->second});
+    MSSAU->applyInsertUpdates(Updates, DT);
+  }
 
   // Now that we have an accurate dominator tree, first delete the dead cloned
   // blocks so that we can accurately build any cloned loops. It is important to
   // not delete the blocks from the original loop yet because we still want to
   // reference the original loop to understand the cloned loop's structure.
-  deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT);
+  deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU);
 
   // Build the cloned loop structure itself. This may be substantially
   // different from the original structure due to the simplified CFG. This also
@@ -2025,10 +2143,17 @@ static void unswitchNontrivialInvariants(
   // Now that our cloned loops have been built, we can update the original loop.
   // First we delete the dead blocks from it and then we rebuild the loop
   // structure taking these deletions into account.
-  deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI);
+  deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU);
+
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   SmallVector<Loop *, 4> HoistedLoops;
   bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
 
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // This transformation has a high risk of corrupting the dominator tree, and
   // the below steps to rebuild loop structures will result in hard to debug
   // errors in that case so verify that the dominator tree is sane first.
@@ -2153,6 +2278,9 @@ static void unswitchNontrivialInvariants(
       SibLoops.push_back(UpdatedL);
   UnswitchCB(IsStillLoop, SibLoops);
 
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   ++NumBranches;
 }
 
@@ -2213,11 +2341,14 @@ computeDomSubtreeCost(DomTreeNode &N,
 static BranchInst *
 turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
                     SmallVectorImpl<BasicBlock *> &ExitBlocks,
-                    DominatorTree &DT, LoopInfo &LI) {
+                    DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
   SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
   LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
   BasicBlock *CheckBB = GI->getParent();
 
+  if (MSSAU && VerifyMemorySSA)
+     MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // Remove all CheckBB's successors from DomTree. A block can be seen among
   // successors more than once, but for DomTree it should be added only once.
   SmallPtrSet<BasicBlock *, 4> Successors;
@@ -2235,10 +2366,14 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
   BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
   GuardedBlock->setName("guarded");
   CheckBI->getSuccessor(1)->setName("deopt");
+  BasicBlock *DeoptBlock = CheckBI->getSuccessor(1);
 
   // We now have a new exit block.
   ExitBlocks.push_back(CheckBI->getSuccessor(1));
 
+  if (MSSAU)
+    MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI);
+
   GI->moveBefore(DeoptBlockTerm);
   GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
 
@@ -2256,15 +2391,107 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
   // Inform LI of a new loop block.
   L.addBasicBlockToLoop(GuardedBlock, LI);
 
+  if (MSSAU) {
+    MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI));
+    MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::End);
+    if (VerifyMemorySSA)
+      MSSAU->getMemorySSA()->verifyMemorySSA();
+  }
+
   ++NumGuards;
   return CheckBI;
 }
 
+/// Cost multiplier is a way to limit potentially exponential behavior
+/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
+/// candidates available. Also accounting for the number of "sibling" loops with
+/// the idea to account for previous unswitches that already happened on this
+/// cluster of loops. There was an attempt to keep this formula simple,
+/// just enough to limit the worst case behavior. Even if it is not that simple
+/// now it is still not an attempt to provide a detailed heuristic size
+/// prediction.
+///
+/// TODO: Make a proper accounting of "explosion" effect for all kinds of
+/// unswitch candidates, making adequate predictions instead of wild guesses.
+/// That requires knowing not just the number of "remaining" candidates but
+/// also costs of unswitching for each of these candidates.
+static int calculateUnswitchCostMultiplier(
+    Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
+    ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>>
+        UnswitchCandidates) {
+
+  // Guards and other exiting conditions do not contribute to exponential
+  // explosion as soon as they dominate the latch (otherwise there might be
+  // another path to the latch remaining that does not allow to eliminate the
+  // loop copy on unswitch).
+  BasicBlock *Latch = L.getLoopLatch();
+  BasicBlock *CondBlock = TI.getParent();
+  if (DT.dominates(CondBlock, Latch) &&
+      (isGuard(&TI) ||
+       llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) {
+         return L.contains(SuccBB);
+       }) <= 1)) {
+    NumCostMultiplierSkipped++;
+    return 1;
+  }
+
+  auto *ParentL = L.getParentLoop();
+  int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
+                               : std::distance(LI.begin(), LI.end()));
+  // Count amount of clones that all the candidates might cause during
+  // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases.
+  int UnswitchedClones = 0;
+  for (auto Candidate : UnswitchCandidates) {
+    Instruction *CI = Candidate.first;
+    BasicBlock *CondBlock = CI->getParent();
+    bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
+    if (isGuard(CI)) {
+      if (!SkipExitingSuccessors)
+        UnswitchedClones++;
+      continue;
+    }
+    int NonExitingSuccessors = llvm::count_if(
+        successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) {
+          return !SkipExitingSuccessors || L.contains(SuccBB);
+        });
+    UnswitchedClones += Log2_32(NonExitingSuccessors);
+  }
+
+  // Ignore up to the "unscaled candidates" number of unswitch candidates
+  // when calculating the power-of-two scaling of the cost. The main idea
+  // with this control is to allow a small number of unswitches to happen
+  // and rely more on siblings multiplier (see below) when the number
+  // of candidates is small.
+  unsigned ClonesPower =
+      std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0);
+
+  // Allowing top-level loops to spread a bit more than nested ones.
+  int SiblingsMultiplier =
+      std::max((ParentL ? SiblingsCount
+                        : SiblingsCount / (int)UnswitchSiblingsToplevelDiv),
+               1);
+  // Compute the cost multiplier in a way that won't overflow by saturating
+  // at an upper bound.
+  int CostMultiplier;
+  if (ClonesPower > Log2_32(UnswitchThreshold) ||
+      SiblingsMultiplier > UnswitchThreshold)
+    CostMultiplier = UnswitchThreshold;
+  else
+    CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
+                              (int)UnswitchThreshold);
+
+  LLVM_DEBUG(dbgs() << "  Computed multiplier  " << CostMultiplier
+                    << " (siblings " << SiblingsMultiplier << " * clones "
+                    << (1 << ClonesPower) << ")"
+                    << " for unswitch candidate: " << TI << "\n");
+  return CostMultiplier;
+}
+
 static bool
 unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
                       AssumptionCache &AC, TargetTransformInfo &TTI,
                       function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
-                      ScalarEvolution *SE) {
+                      ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
   // Collect all invariant conditions within this loop (as opposed to an inner
   // loop which would be handled when visiting that inner loop).
   SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
@@ -2473,8 +2700,23 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
     int CandidateCost = ComputeUnswitchedCost(
         TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
                                      Invariants[0] == BI->getCondition()));
-    LLVM_DEBUG(dbgs() << "  Computed cost of " << CandidateCost
-                      << " for unswitch candidate: " << TI << "\n");
+    // Calculate cost multiplier which is a tool to limit potentially
+    // exponential behavior of loop-unswitch.
+    if (EnableUnswitchCostMultiplier) {
+      int CostMultiplier =
+          calculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
+      assert(
+          (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
+          "cost multiplier needs to be in the range of 1..UnswitchThreshold");
+      CandidateCost *= CostMultiplier;
+      LLVM_DEBUG(dbgs() << "  Computed cost of " << CandidateCost
+                        << " (multiplier: " << CostMultiplier << ")"
+                        << " for unswitch candidate: " << TI << "\n");
+    } else {
+      LLVM_DEBUG(dbgs() << "  Computed cost of " << CandidateCost
+                        << " for unswitch candidate: " << TI << "\n");
+    }
+
     if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
       BestUnswitchTI = &TI;
       BestUnswitchCost = CandidateCost;
@@ -2491,13 +2733,13 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
   // If the best candidate is a guard, turn it into a branch.
   if (isGuard(BestUnswitchTI))
     BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
-                                         ExitBlocks, DT, LI);
+                                         ExitBlocks, DT, LI, MSSAU);
 
   LLVM_DEBUG(dbgs() << "  Unswitching non-trivial (cost = "
                     << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
                     << "\n");
   unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
-                               ExitBlocks, DT, LI, AC, UnswitchCB, SE);
+                               ExitBlocks, DT, LI, AC, UnswitchCB, SE, MSSAU);
   return true;
 }
 
@@ -2510,6 +2752,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
 ///
 /// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also
 /// updated based on the unswitch.
+/// The `MSSA` analysis is also updated if valid (i.e. its use is enabled).
 ///
 /// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
 /// true, we will attempt to do non-trivial unswitching as well as trivial
@@ -2525,7 +2768,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
                          AssumptionCache &AC, TargetTransformInfo &TTI,
                          bool NonTrivial,
                          function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
-                         ScalarEvolution *SE) {
+                         ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
   assert(L.isRecursivelyLCSSAForm(DT, LI) &&
          "Loops must be in LCSSA form before unswitching.");
   bool Changed = false;
@@ -2535,7 +2778,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
     return false;
 
   // Try trivial unswitch first before loop over other basic blocks in the loop.
-  if (unswitchAllTrivialConditions(L, DT, LI, SE)) {
+  if (unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) {
     // If we unswitched successfully we will want to clean up the loop before
     // processing it further so just mark it as unswitched and return.
     UnswitchCB(/*CurrentLoopValid*/ true, {});
@@ -2556,7 +2799,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
 
   // Try to unswitch the best invariant condition. We prefer this full unswitch to
   // a partial unswitch when possible below the threshold.
-  if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE))
+  if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE, MSSAU))
     return true;
 
   // No other opportunities to unswitch.
@@ -2590,10 +2833,19 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
       U.markLoopAsDeleted(L, LoopName);
   };
 
+  Optional<MemorySSAUpdater> MSSAU;
+  if (AR.MSSA) {
+    MSSAU = MemorySSAUpdater(AR.MSSA);
+    if (VerifyMemorySSA)
+      AR.MSSA->verifyMemorySSA();
+  }
   if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB,
-                    &AR.SE))
+                    &AR.SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
     return PreservedAnalyses::all();
 
+  if (AR.MSSA && VerifyMemorySSA)
+    AR.MSSA->verifyMemorySSA();
+
   // Historically this pass has had issues with the dominator tree so verify it
   // in asserts builds.
   assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
@@ -2619,6 +2871,10 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    if (EnableMSSALoopDependency) {
+      AU.addRequired<MemorySSAWrapperPass>();
+      AU.addPreserved<MemorySSAWrapperPass>();
+    }
     getLoopAnalysisUsage(AU);
   }
 };
@@ -2638,6 +2894,12 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  MemorySSA *MSSA = nullptr;
+  Optional<MemorySSAUpdater> MSSAU;
+  if (EnableMSSALoopDependency) {
+    MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+    MSSAU = MemorySSAUpdater(MSSA);
+  }
 
   auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
   auto *SE = SEWP ? &SEWP->getSE() : nullptr;
@@ -2657,7 +2919,14 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
       LPM.markLoopAsDeleted(*L);
   };
 
-  bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE);
+  if (MSSA && VerifyMemorySSA)
+    MSSA->verifyMemorySSA();
+
+  bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE,
+                              MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
+
+  if (MSSA && VerifyMemorySSA)
+    MSSA->verifyMemorySSA();
 
   // If anything was unswitched, also clear any cached information about this
   // loop.
@@ -2677,6 +2946,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
                     "Simple unswitch loops", false, false)
diff --git a/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/lib/Transforms/Scalar/WarnMissedTransforms.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..80f761e537747462adcd7bdbf477a8e5c035c957
--- /dev/null
+++ b/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@@ -0,0 +1,149 @@
+//===- LoopTransformWarning.cpp -  ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Emit warnings if forced code transformations have not been performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "transform-warning"
+
+/// Emit warnings for forced (i.e. user-defined) loop transformations which have
+/// still not been performed.
+static void warnAboutLeftoverTransformations(Loop *L,
+                                             OptimizationRemarkEmitter *ORE) {
+  if (hasUnrollTransformation(L) == TM_ForcedByUser) {
+    LLVM_DEBUG(dbgs() << "Leftover unroll transformation\n");
+    ORE->emit(
+        DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                          "FailedRequestedUnrolling",
+                                          L->getStartLoc(), L->getHeader())
+        << "loop not unrolled: the optimizer was unable to perform the "
+           "requested transformation; the transformation might be disabled or "
+           "specified as part of an unsupported transformation ordering");
+  }
+
+  if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser) {
+    LLVM_DEBUG(dbgs() << "Leftover unroll-and-jam transformation\n");
+    ORE->emit(
+        DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                          "FailedRequestedUnrollAndJamming",
+                                          L->getStartLoc(), L->getHeader())
+        << "loop not unroll-and-jammed: the optimizer was unable to perform "
+           "the requested transformation; the transformation might be disabled "
+           "or specified as part of an unsupported transformation ordering");
+  }
+
+  if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
+    LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
+    Optional<int> VectorizeWidth =
+        getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width");
+    Optional<int> InterleaveCount =
+        getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
+
+    if (VectorizeWidth.getValueOr(0) != 1)
+      ORE->emit(
+          DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                            "FailedRequestedVectorization",
+                                            L->getStartLoc(), L->getHeader())
+          << "loop not vectorized: the optimizer was unable to perform the "
+             "requested transformation; the transformation might be disabled "
+             "or specified as part of an unsupported transformation ordering");
+    else if (InterleaveCount.getValueOr(0) != 1)
+      ORE->emit(
+          DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                            "FailedRequestedInterleaving",
+                                            L->getStartLoc(), L->getHeader())
+          << "loop not interleaved: the optimizer was unable to perform the "
+             "requested transformation; the transformation might be disabled "
+             "or specified as part of an unsupported transformation ordering");
+  }
+
+  if (hasDistributeTransformation(L) == TM_ForcedByUser) {
+    LLVM_DEBUG(dbgs() << "Leftover distribute transformation\n");
+    ORE->emit(
+        DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                          "FailedRequestedDistribution",
+                                          L->getStartLoc(), L->getHeader())
+        << "loop not distributed: the optimizer was unable to perform the "
+           "requested transformation; the transformation might be disabled or "
+           "specified as part of an unsupported transformation ordering");
+  }
+}
+
+static void warnAboutLeftoverTransformations(Function *F, LoopInfo *LI,
+                                             OptimizationRemarkEmitter *ORE) {
+  for (auto *L : LI->getLoopsInPreorder())
+    warnAboutLeftoverTransformations(L, ORE);
+}
+
+// New pass manager boilerplate
+PreservedAnalyses
+WarnMissedTransformationsPass::run(Function &F, FunctionAnalysisManager &AM) {
+  // Do not warn about not applied transformations if optimizations are
+  // disabled.
+  if (F.hasFnAttribute(Attribute::OptimizeNone))
+    return PreservedAnalyses::all();
+
+  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+
+  warnAboutLeftoverTransformations(&F, &LI, &ORE);
+
+  return PreservedAnalyses::all();
+}
+
+// Legacy pass manager boilerplate
+namespace {
+class WarnMissedTransformationsLegacy : public FunctionPass {
+public:
+  static char ID;
+
+  explicit WarnMissedTransformationsLegacy() : FunctionPass(ID) {
+    initializeWarnMissedTransformationsLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+    warnAboutLeftoverTransformations(&F, &LI, &ORE);
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+
+    AU.setPreservesAll();
+  }
+};
+} // end anonymous namespace
+
+char WarnMissedTransformationsLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(WarnMissedTransformationsLegacy, "transform-warning",
+                      "Warn about non-applied transformations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(WarnMissedTransformationsLegacy, "transform-warning",
+                    "Warn about non-applied transformations", false, false)
+
+Pass *llvm::createWarnMissedTransformationsPass() {
+  return new WarnMissedTransformationsLegacy();
+}
diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp
index 4db579156d90aa538008434d1b90c0dec6765cd6..e58ddcf34667d5bf6be9cab699d216885c5f1162 100644
--- a/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -393,6 +393,13 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
   // to the correct type.
   auto CalleeType = Callee->getFunctionType();
   auto CalleeParamNum = CalleeType->getNumParams();
+
+  LLVMContext &Ctx = Callee->getContext();
+  const AttributeList &CallerPAL = CS.getAttributes();
+  // The new list of argument attributes.
+  SmallVector<AttributeSet, 4> NewArgAttrs;
+  bool AttributeChanged = false;
+
   for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) {
     auto *Arg = CS.getArgument(ArgNo);
     Type *FormalTy = CalleeType->getParamType(ArgNo);
@@ -401,13 +408,31 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
       auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "",
                                                     CS.getInstruction());
       CS.setArgument(ArgNo, Cast);
-    }
+
+      // Remove any incompatible attributes for the argument.
+      AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo));
+      ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy));
+      NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs));
+      AttributeChanged = true;
+    } else
+      NewArgAttrs.push_back(CallerPAL.getParamAttributes(ArgNo));
   }
 
   // If the return type of the call site doesn't match that of the callee, cast
   // the returned value to the appropriate type.
-  if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy)
+  // Remove any incompatible return value attribute.
+  AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+  if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) {
     createRetBitCast(CS, CallSiteRetTy, RetBitCast);
+    RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy));
+    AttributeChanged = true;
+  }
+
+  // Set the new callsite attribute.
+  if (AttributeChanged)
+    CS.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(),
+                                        AttributeSet::get(Ctx, RAttrs),
+                                        NewArgAttrs));
 
   return CS.getInstruction();
 }
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 000af808945ae599d2089d5b5a3610a91989a610..8f8c601f5f136b9ff86258710ad15357e2003d8d 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -18,11 +18,11 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DomTreeUpdater.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
@@ -32,6 +32,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <map>
 using namespace llvm;
@@ -795,11 +796,12 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
 
 /// Duplicate non-Phi instructions from the beginning of block up to
 /// StopAt instruction into a split block between BB and its predecessor.
-BasicBlock *
-llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
-                                          Instruction *StopAt,
-                                          ValueToValueMapTy &ValueMapping,
-                                          DominatorTree *DT) {
+BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
+    BasicBlock *BB, BasicBlock *PredBB, Instruction *StopAt,
+    ValueToValueMapTy &ValueMapping, DomTreeUpdater &DTU) {
+
+  assert(count(successors(PredBB), BB) == 1 &&
+         "There must be a single edge between PredBB and BB!");
   // We are going to have to map operands from the original BB block to the new
   // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to
   // account for entry from PredBB.
@@ -807,10 +809,16 @@ llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
   for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
     ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
 
-  BasicBlock *NewBB = SplitEdge(PredBB, BB, DT);
+  BasicBlock *NewBB = SplitEdge(PredBB, BB);
   NewBB->setName(PredBB->getName() + ".split");
   Instruction *NewTerm = NewBB->getTerminator();
 
+  // FIXME: SplitEdge does not yet take a DTU, so we include the split edge
+  //        in the update set here.
+  DTU.applyUpdates({{DominatorTree::Delete, PredBB, BB},
+                    {DominatorTree::Insert, PredBB, NewBB},
+                    {DominatorTree::Insert, NewBB, BB}});
+
   // Clone the non-phi instructions of BB into NewBB, keeping track of the
   // mapping and using it to remap operands in the cloned instructions.
   // Stop once we see the terminator too. This covers the case where BB's
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 419e1db08bfd5320e654215daffb999812295431..a6b01107752e5b5ce6d7a08a2980ed7047e34a6f 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -531,10 +531,10 @@ void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs,
   }
 }
 
-/// severSplitPHINodes - If a PHI node has multiple inputs from outside of the
-/// region, we need to split the entry block of the region so that the PHI node
-/// is easier to deal with.
-void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
+/// severSplitPHINodesOfEntry - If a PHI node has multiple inputs from outside
+/// of the region, we need to split the entry block of the region so that the
+/// PHI node is easier to deal with.
+void CodeExtractor::severSplitPHINodesOfEntry(BasicBlock *&Header) {
   unsigned NumPredsFromRegion = 0;
   unsigned NumPredsOutsideRegion = 0;
 
@@ -606,6 +606,56 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
   }
 }
 
+/// severSplitPHINodesOfExits - if PHI nodes in exit blocks have inputs from
+/// outlined region, we split these PHIs on two: one with inputs from region
+/// and other with remaining incoming blocks; then first PHIs are placed in
+/// outlined region.
+void CodeExtractor::severSplitPHINodesOfExits(
+    const SmallPtrSetImpl<BasicBlock *> &Exits) {
+  for (BasicBlock *ExitBB : Exits) {
+    BasicBlock *NewBB = nullptr;
+
+    for (PHINode &PN : ExitBB->phis()) {
+      // Find all incoming values from the outlining region.
+      SmallVector<unsigned, 2> IncomingVals;
+      for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
+        if (Blocks.count(PN.getIncomingBlock(i)))
+          IncomingVals.push_back(i);
+
+      // Do not process PHI if there is one (or fewer) predecessor from region.
+      // If PHI has exactly one predecessor from region, only this one incoming
+      // will be replaced on codeRepl block, so it should be safe to skip PHI.
+      if (IncomingVals.size() <= 1)
+        continue;
+
+      // Create block for new PHIs and add it to the list of outlined if it
+      // wasn't done before.
+      if (!NewBB) {
+        NewBB = BasicBlock::Create(ExitBB->getContext(),
+                                   ExitBB->getName() + ".split",
+                                   ExitBB->getParent(), ExitBB);
+        SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBB),
+                                           pred_end(ExitBB));
+        for (BasicBlock *PredBB : Preds)
+          if (Blocks.count(PredBB))
+            PredBB->getTerminator()->replaceUsesOfWith(ExitBB, NewBB);
+        BranchInst::Create(ExitBB, NewBB);
+        Blocks.insert(NewBB);
+      }
+
+      // Split this PHI.
+      PHINode *NewPN =
+          PHINode::Create(PN.getType(), IncomingVals.size(),
+                          PN.getName() + ".ce", NewBB->getFirstNonPHI());
+      for (unsigned i : IncomingVals)
+        NewPN->addIncoming(PN.getIncomingValue(i), PN.getIncomingBlock(i));
+      for (unsigned i : reverse(IncomingVals))
+        PN.removeIncomingValue(i, false);
+      PN.addIncoming(NewPN, NewBB);
+    }
+  }
+}
+
 void CodeExtractor::splitReturnBlocks() {
   for (BasicBlock *Block : Blocks)
     if (ReturnInst *RI = dyn_cast<ReturnInst>(Block->getTerminator())) {
@@ -942,17 +992,15 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
       continue;
 
     // Find proper insertion point.
-    Instruction *InsertPt;
+    BasicBlock::iterator InsertPt;
     // In case OutI is an invoke, we insert the store at the beginning in the
     // 'normal destination' BB. Otherwise we insert the store right after OutI.
     if (auto *InvokeI = dyn_cast<InvokeInst>(OutI))
-      InsertPt = InvokeI->getNormalDest()->getFirstNonPHI();
+      InsertPt = InvokeI->getNormalDest()->getFirstInsertionPt();
+    else if (auto *Phi = dyn_cast<PHINode>(OutI))
+      InsertPt = Phi->getParent()->getFirstInsertionPt();
     else
-      InsertPt = OutI->getNextNode();
-
-    // Let's assume that there is no other guy interleave non-PHI in PHIs.
-    if (isa<PHINode>(InsertPt))
-      InsertPt = InsertPt->getParent()->getFirstNonPHI();
+      InsertPt = std::next(OutI->getIterator());
 
     assert(OAI != newFunction->arg_end() &&
            "Number of output arguments should match "
@@ -962,13 +1010,13 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
       Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
       Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
       GetElementPtrInst *GEP = GetElementPtrInst::Create(
-          StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(), InsertPt);
-      new StoreInst(outputs[i], GEP, InsertPt);
+          StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(), &*InsertPt);
+      new StoreInst(outputs[i], GEP, &*InsertPt);
       // Since there should be only one struct argument aggregating
       // all the output values, we shouldn't increment OAI, which always
       // points to the struct argument, in this case.
     } else {
-      new StoreInst(outputs[i], &*OAI, InsertPt);
+      new StoreInst(outputs[i], &*OAI, &*InsertPt);
       ++OAI;
     }
   }
@@ -1173,13 +1221,33 @@ Function *CodeExtractor::extractCodeRegion() {
     }
   }
 
-  // If we have to split PHI nodes or the entry block, do so now.
-  severSplitPHINodes(header);
-
   // If we have any return instructions in the region, split those blocks so
   // that the return is not in the region.
   splitReturnBlocks();
 
+  // Calculate the exit blocks for the extracted region and the total exit
+  // weights for each of those blocks.
+  DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
+  SmallPtrSet<BasicBlock *, 1> ExitBlocks;
+  for (BasicBlock *Block : Blocks) {
+    for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
+         ++SI) {
+      if (!Blocks.count(*SI)) {
+        // Update the branch weight for this successor.
+        if (BFI) {
+          BlockFrequency &BF = ExitWeights[*SI];
+          BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI);
+        }
+        ExitBlocks.insert(*SI);
+      }
+    }
+  }
+  NumExitBlocks = ExitBlocks.size();
+
+  // If we have to split PHI nodes of the entry or exit blocks, do so now.
+  severSplitPHINodesOfEntry(header);
+  severSplitPHINodesOfExits(ExitBlocks);
+
   // This takes place of the original loop
   BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(),
                                                 "codeRepl", oldFunction,
@@ -1224,25 +1292,6 @@ Function *CodeExtractor::extractCodeRegion() {
       cast<Instruction>(II)->moveBefore(TI);
   }
 
-  // Calculate the exit blocks for the extracted region and the total exit
-  // weights for each of those blocks.
-  DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
-  SmallPtrSet<BasicBlock *, 1> ExitBlocks;
-  for (BasicBlock *Block : Blocks) {
-    for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
-         ++SI) {
-      if (!Blocks.count(*SI)) {
-        // Update the branch weight for this successor.
-        if (BFI) {
-          BlockFrequency &BF = ExitWeights[*SI];
-          BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI);
-        }
-        ExitBlocks.insert(*SI);
-      }
-    }
-  }
-  NumExitBlocks = ExitBlocks.size();
-
   // Construct new function based on inputs/outputs & add allocas for all defs.
   Function *newFunction = constructFunction(inputs, outputs, header,
                                             newFuncRoot,
@@ -1270,8 +1319,8 @@ Function *CodeExtractor::extractCodeRegion() {
   if (BFI && NumExitBlocks > 1)
     calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
 
-  // Loop over all of the PHI nodes in the header block, and change any
-  // references to the old incoming edge to be the new incoming edge.
+  // Loop over all of the PHI nodes in the header and exit blocks, and change
+  // any references to the old incoming edge to be the new incoming edge.
   for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
     PHINode *PN = cast<PHINode>(I);
     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
@@ -1279,35 +1328,23 @@ Function *CodeExtractor::extractCodeRegion() {
         PN->setIncomingBlock(i, newFuncRoot);
   }
 
-  // Look at all successors of the codeReplacer block.  If any of these blocks
-  // had PHI nodes in them, we need to update the "from" block to be the code
-  // replacer, not the original block in the extracted region.
-  for (BasicBlock *SuccBB : successors(codeReplacer)) {
-    for (PHINode &PN : SuccBB->phis()) {
+  for (BasicBlock *ExitBB : ExitBlocks)
+    for (PHINode &PN : ExitBB->phis()) {
       Value *IncomingCodeReplacerVal = nullptr;
-      SmallVector<unsigned, 2> IncomingValsToRemove;
-      for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) {
-        BasicBlock *IncomingBB = PN.getIncomingBlock(I);
-
+      for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
         // Ignore incoming values from outside of the extracted region.
-        if (!Blocks.count(IncomingBB))
+        if (!Blocks.count(PN.getIncomingBlock(i)))
           continue;
 
         // Ensure that there is only one incoming value from codeReplacer.
         if (!IncomingCodeReplacerVal) {
-          PN.setIncomingBlock(I, codeReplacer);
-          IncomingCodeReplacerVal = PN.getIncomingValue(I);
-        } else {
-          assert(IncomingCodeReplacerVal == PN.getIncomingValue(I) &&
+          PN.setIncomingBlock(i, codeReplacer);
+          IncomingCodeReplacerVal = PN.getIncomingValue(i);
+        } else
+          assert(IncomingCodeReplacerVal == PN.getIncomingValue(i) &&
                  "PHI has two incompatbile incoming values from codeRepl");
-          IncomingValsToRemove.push_back(I);
-        }
       }
-
-      for (unsigned I : reverse(IncomingValsToRemove))
-        PN.removeIncomingValue(I, /*DeletePHIIfEmpty=*/false);
     }
-  }
 
   // Erase debug info intrinsics. Variable updates within the new function are
   // invisible to debuggers. This could be improved by defining a DISubprogram
@@ -1330,7 +1367,21 @@ Function *CodeExtractor::extractCodeRegion() {
       DVI->eraseFromParent();
   }
 
-  LLVM_DEBUG(if (verifyFunction(*newFunction))
-                 report_fatal_error("verifyFunction failed!"));
+  // Mark the new function `noreturn` if applicable. Terminators which resume
+  // exception propagation are treated as returning instructions. This is to
+  // avoid inserting traps after calls to outlined functions which unwind.
+  bool doesNotReturn = none_of(*newFunction, [](const BasicBlock &BB) {
+    const Instruction *Term = BB.getTerminator();
+    return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
+  });
+  if (doesNotReturn)
+    newFunction->setDoesNotReturn();
+
+  LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) {
+    newFunction->dump();
+    report_fatal_error("verification of newFunction failed!");
+  });
+  LLVM_DEBUG(if (verifyFunction(*oldFunction))
+             report_fatal_error("verification of oldFunction failed!"));
   return newFunction;
 }
diff --git a/lib/Transforms/Utils/CtorUtils.cpp b/lib/Transforms/Utils/CtorUtils.cpp
index 9a0240144d0808cf492a7c1f18080ed844b0845f..4e7da7d0449f9213c6c6d0a4fe871789c652d901 100644
--- a/lib/Transforms/Utils/CtorUtils.cpp
+++ b/lib/Transforms/Utils/CtorUtils.cpp
@@ -22,11 +22,10 @@
 
 #define DEBUG_TYPE "ctor_utils"
 
-namespace llvm {
+using namespace llvm;
 
-namespace {
 /// Given a specified llvm.global_ctors list, remove the listed elements.
-void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
+static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
   // Filter out the initializer elements to remove.
   ConstantArray *OldCA = cast<ConstantArray>(GCL->getInitializer());
   SmallVector<Constant *, 10> CAList;
@@ -64,7 +63,7 @@ void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
 
 /// Given a llvm.global_ctors list that we can understand,
 /// return a list of the functions and null terminator as a vector.
-std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
+static std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
   if (GV->getInitializer()->isNullValue())
     return std::vector<Function *>();
   ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
@@ -79,7 +78,7 @@ std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
 
 /// Find the llvm.global_ctors list, verifying that all initializers have an
 /// init priority of 65535.
-GlobalVariable *findGlobalCtors(Module &M) {
+static GlobalVariable *findGlobalCtors(Module &M) {
   GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
   if (!GV)
     return nullptr;
@@ -112,12 +111,11 @@ GlobalVariable *findGlobalCtors(Module &M) {
 
   return GV;
 }
-} // namespace
 
 /// Call "ShouldRemove" for every entry in M's global_ctor list and remove the
 /// entries for which it returns true.  Return true if anything changed.
-bool optimizeGlobalCtorsList(Module &M,
-                             function_ref<bool(Function *)> ShouldRemove) {
+bool llvm::optimizeGlobalCtorsList(
+    Module &M, function_ref<bool(Function *)> ShouldRemove) {
   GlobalVariable *GlobalCtors = findGlobalCtors(M);
   if (!GlobalCtors)
     return false;
@@ -160,5 +158,3 @@ bool optimizeGlobalCtorsList(Module &M,
   removeGlobalCtors(GlobalCtors, CtorsToRemove);
   return true;
 }
-
-} // End llvm namespace
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index 479816a339d0afac328b7d077ca316821f9c67f6..a9772e31da5095ccfa3415c52d1d6a4d6eaa6424 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -124,7 +124,6 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
     return SGV->getLinkage();
 
   switch (SGV->getLinkage()) {
-  case GlobalValue::LinkOnceAnyLinkage:
   case GlobalValue::LinkOnceODRLinkage:
   case GlobalValue::ExternalLinkage:
     // External and linkonce definitions are converted to available_externally
@@ -144,11 +143,13 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
     // An imported available_externally declaration stays that way.
     return SGV->getLinkage();
 
+  case GlobalValue::LinkOnceAnyLinkage:
   case GlobalValue::WeakAnyLinkage:
-    // Can't import weak_any definitions correctly, or we might change the
-    // program semantics, since the linker will pick the first weak_any
-    // definition and importing would change the order they are seen by the
-    // linker. The module linking caller needs to enforce this.
+    // Can't import linkonce_any/weak_any definitions correctly, or we might
+    // change the program semantics, since the linker will pick the first
+    // linkonce_any/weak_any definition and importing would change the order
+    // they are seen by the linker. The module linking caller needs to enforce
+    // this.
     assert(!doImportAsDefinition(SGV));
     // If imported as a declaration, it becomes external_weak.
     return SGV->getLinkage();
@@ -202,10 +203,26 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
 
 void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
 
-  // Check the summaries to see if the symbol gets resolved to a known local
-  // definition.
+  ValueInfo VI;
   if (GV.hasName()) {
-    ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID());
+    VI = ImportIndex.getValueInfo(GV.getGUID());
+    // Set synthetic function entry counts.
+    if (VI && ImportIndex.hasSyntheticEntryCounts()) {
+      if (Function *F = dyn_cast<Function>(&GV)) {
+        if (!F->isDeclaration()) {
+          for (auto &S : VI.getSummaryList()) {
+            FunctionSummary *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
+            if (FS->modulePath() == M.getModuleIdentifier()) {
+              F->setEntryCount(Function::ProfileCount(FS->entryCount(),
+                                                      Function::PCT_Synthetic));
+              break;
+            }
+          }
+        }
+      }
+    }
+    // Check the summaries to see if the symbol gets resolved to a known local
+    // definition.
     if (VI && VI.isDSOLocal()) {
       GV.setDSOLocal(true);
       if (GV.hasDLLImportStorageClass())
@@ -213,6 +230,22 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
     }
   }
 
+  // Mark read-only variables which can be imported with specific attribute.
+  // We can't internalize them now because IRMover will fail to link variable
+  // definitions to their external declarations during ThinLTO import. We'll
+  // internalize read-only variables later, after import is finished.
+  // See internalizeImmutableGVs.
+  //
+  // If global value dead stripping is not enabled in summary then
+  // propagateConstants hasn't been run. We can't internalize GV
+  // in such case.
+  if (!GV.isDeclaration() && VI && ImportIndex.withGlobalValueDeadStripping()) {
+    const auto &SL = VI.getSummaryList();
+    auto *GVS = SL.empty() ? nullptr : dyn_cast<GlobalVarSummary>(SL[0].get());
+    if (GVS && GVS->isReadOnly())
+      cast<GlobalVariable>(&GV)->addAttribute("thinlto-internalize");
+  }
+
   bool DoPromote = false;
   if (GV.hasLocalLinkage() &&
       ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) {
@@ -230,7 +263,7 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
   // Remove functions imported as available externally defs from comdats,
   // as this is a declaration for the linker, and will be dropped eventually.
   // It is illegal for comdats to contain declarations.
-  auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
+  auto *GO = dyn_cast<GlobalObject>(&GV);
   if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
     // The IRMover should not have placed any imported declarations in
     // a comdat, so the only declaration that should be in a comdat
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 0dcd7371210d75a475719a6fde4c19b37f5d63a1..79343a90a0e3af394cefead8ab3d0ccd821e4a3b 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -476,6 +476,17 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions(
   }
 }
 
+bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
+  SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
+  findDbgUsers(DbgUsers, I);
+  for (auto *DII : DbgUsers) {
+    Value *Undef = UndefValue::get(I->getType());
+    DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+                                            ValueAsMetadata::get(Undef)));
+  }
+  return !DbgUsers.empty();
+}
+
 /// areAllUsesEqual - Check whether the uses of a value are all the same.
 /// This is similar to Instruction::hasOneUse() except this will also return
 /// true when there are no uses or multiple uses that all refer to the same
@@ -682,10 +693,9 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB,
 
   // DTU updates: Collect all the edges that enter
   // PredBB. These dominator edges will be redirected to DestBB.
-  std::vector <DominatorTree::UpdateType> Updates;
+  SmallVector<DominatorTree::UpdateType, 32> Updates;
 
   if (DTU) {
-    Updates.reserve(1 + (2 * pred_size(PredBB)));
     Updates.push_back({DominatorTree::Delete, PredBB, DestBB});
     for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) {
       Updates.push_back({DominatorTree::Delete, *I, PredBB});
@@ -989,9 +999,8 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
 
   LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
 
-  std::vector<DominatorTree::UpdateType> Updates;
+  SmallVector<DominatorTree::UpdateType, 32> Updates;
   if (DTU) {
-    Updates.reserve(1 + (2 * pred_size(BB)));
     Updates.push_back({DominatorTree::Delete, BB, Succ});
     // All predecessors of BB will be moved to Succ.
     for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
@@ -1288,33 +1297,6 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
     return;
   }
 
-  // If an argument is zero extended then use argument directly. The ZExt
-  // may be zapped by an optimization pass in future.
-  Argument *ExtendedArg = nullptr;
-  if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
-    ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0));
-  if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
-    ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
-  if (ExtendedArg) {
-    // If this DII was already describing only a fragment of a variable, ensure
-    // that fragment is appropriately narrowed here.
-    // But if a fragment wasn't used, describe the value as the original
-    // argument (rather than the zext or sext) so that it remains described even
-    // if the sext/zext is optimized away. This widens the variable description,
-    // leaving it up to the consumer to know how the smaller value may be
-    // represented in a larger register.
-    if (auto Fragment = DIExpr->getFragmentInfo()) {
-      unsigned FragmentOffset = Fragment->OffsetInBits;
-      SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(),
-                                   DIExpr->elements_end() - 3);
-      Ops.push_back(dwarf::DW_OP_LLVM_fragment);
-      Ops.push_back(FragmentOffset);
-      const DataLayout &DL = DII->getModule()->getDataLayout();
-      Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
-      DIExpr = Builder.createExpression(Ops);
-    }
-    DV = ExtendedArg;
-  }
   if (!LdStHasDebugValue(DIVar, DIExpr, SI))
     Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, DII->getDebugLoc(),
                                     SI);
@@ -1961,6 +1943,7 @@ static void changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr) {
   NewCall->setCallingConv(II->getCallingConv());
   NewCall->setAttributes(II->getAttributes());
   NewCall->setDebugLoc(II->getDebugLoc());
+  NewCall->copyMetadata(*II);
   II->replaceAllUsesWith(NewCall);
 
   // Follow the call by a branch to the normal destination.
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index 877e0e4dcf900a54e6a409421910ae855c9a4eee..efd8b92e814d0d7cd2651235c90d2fa3293414bc 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -329,12 +329,15 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
 ///
 /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
 /// DominatorTree if they are non-null.
+///
+/// If RemainderLoop is non-null, it will receive the remainder loop (if
+/// required and not fully unrolled).
 LoopUnrollResult llvm::UnrollLoop(
     Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime,
     bool AllowExpensiveTripCount, bool PreserveCondBr, bool PreserveOnlyFirst,
     unsigned TripMultiple, unsigned PeelCount, bool UnrollRemainder,
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
-    OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) {
+    OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop) {
 
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader) {
@@ -468,7 +471,7 @@ LoopUnrollResult llvm::UnrollLoop(
   if (RuntimeTripCount && TripMultiple % Count != 0 &&
       !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
                                   EpilogProfitability, UnrollRemainder, LI, SE,
-                                  DT, AC, PreserveLCSSA)) {
+                                  DT, AC, PreserveLCSSA, RemainderLoop)) {
     if (Force)
       RuntimeTripCount = false;
     else {
diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 8949c603a841e79d3c02d3f517c90911410642ce..b5d80f669fbf669eae07d12f0d3fb598f105b467 100644
--- a/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -167,12 +167,14 @@ static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
 
   isSafeToUnrollAndJam should be used prior to calling this to make sure the
   unrolling will be valid. Checking profitablility is also advisable.
+
+  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
+  necessary to create one and not fully unrolled).
 */
-LoopUnrollResult
-llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
-                       unsigned TripMultiple, bool UnrollRemainder,
-                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
-                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {
+LoopUnrollResult llvm::UnrollAndJamLoop(
+    Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
+    bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+    AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
 
   // When we enter here we should have already checked that it is safe
   BasicBlock *Header = L->getHeader();
@@ -196,7 +198,8 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
   if (TripMultiple == 1 || TripMultiple % Count != 0) {
     if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                     /*UseEpilogRemainder*/ true,
-                                    UnrollRemainder, LI, SE, DT, AC, true)) {
+                                    UnrollRemainder, LI, SE, DT, AC, true,
+                                    EpilogueLoop)) {
       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                            "generated when assuming runtime trip count\n");
       return LoopUnrollResult::Unmodified;
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 3361883acd0c088b7e8d709bc35bf9c85c30b336..3606ec4b9fcaa9be8edaaf2748f8381dcf0a4cc4 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -380,6 +380,7 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
   }
   if (CreateRemainderLoop) {
     Loop *NewLoop = NewLoops[L];
+    MDNode *LoopID = NewLoop->getLoopID();
     assert(NewLoop && "L should have been cloned");
 
     // Only add loop metadata if the loop is not going to be completely
@@ -387,6 +388,16 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
     if (UnrollRemainder)
       return NewLoop;
 
+    Optional<MDNode *> NewLoopID = makeFollowupLoopID(
+        LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
+    if (NewLoopID.hasValue()) {
+      NewLoop->setLoopID(NewLoopID.getValue());
+
+      // Do not setLoopAlreadyUnrolled if loop attributes have been defined
+      // explicitly.
+      return NewLoop;
+    }
+
     // Add unroll disable metadata to disable future unrolling for this loop.
     NewLoop->setLoopAlreadyUnrolled();
     return NewLoop;
@@ -525,10 +536,10 @@ static bool canProfitablyUnrollMultiExitLoop(
 bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
                                       bool AllowExpensiveTripCount,
                                       bool UseEpilogRemainder,
-                                      bool UnrollRemainder,
-                                      LoopInfo *LI, ScalarEvolution *SE,
-                                      DominatorTree *DT, AssumptionCache *AC,
-                                      bool PreserveLCSSA) {
+                                      bool UnrollRemainder, LoopInfo *LI,
+                                      ScalarEvolution *SE, DominatorTree *DT,
+                                      AssumptionCache *AC, bool PreserveLCSSA,
+                                      Loop **ResultLoop) {
   LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   LLVM_DEBUG(L->dump());
   LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -911,16 +922,20 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
       formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA);
   }
 
+  auto UnrollResult = LoopUnrollResult::Unmodified;
   if (remainderLoop && UnrollRemainder) {
     LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
-    UnrollLoop(remainderLoop, /*Count*/ Count - 1, /*TripCount*/ Count - 1,
-               /*Force*/ false, /*AllowRuntime*/ false,
-               /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
-               /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
-               /*PeelCount*/ 0, /*UnrollRemainder*/ false, LI, SE, DT, AC,
-               /*ORE*/ nullptr, PreserveLCSSA);
+    UnrollResult =
+        UnrollLoop(remainderLoop, /*Count*/ Count - 1, /*TripCount*/ Count - 1,
+                   /*Force*/ false, /*AllowRuntime*/ false,
+                   /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
+                   /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
+                   /*PeelCount*/ 0, /*UnrollRemainder*/ false, LI, SE, DT, AC,
+                   /*ORE*/ nullptr, PreserveLCSSA);
   }
 
+  if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
+    *ResultLoop = remainderLoop;
   NumRuntimeUnrolled++;
   return true;
 }
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 249869e1bdeecd27f26fbae41a8c4e2177e7e919..1d4f07ff1b9a9548899fe3f88238664786b288e0 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -26,9 +26,11 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DomTreeUpdater.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
@@ -42,6 +44,8 @@ using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "loop-utils"
 
+static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
+
 bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
                                    bool PreserveLCSSA) {
   bool Changed = false;
@@ -183,14 +187,8 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) {
   INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 }
 
-/// Find string metadata for loop
-///
-/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an
-/// operand or null otherwise.  If the string metadata is not found return
-/// Optional's not-a-value.
-Optional<const MDOperand *> llvm::findStringMetadataForLoop(Loop *TheLoop,
-                                                            StringRef Name) {
-  MDNode *LoopID = TheLoop->getLoopID();
+static Optional<MDNode *> findOptionMDForLoopID(MDNode *LoopID,
+                                                StringRef Name) {
   // Return none if LoopID is false.
   if (!LoopID)
     return None;
@@ -209,18 +207,253 @@ Optional<const MDOperand *> llvm::findStringMetadataForLoop(Loop *TheLoop,
       continue;
     // Return true if MDString holds expected MetaData.
     if (Name.equals(S->getString()))
-      switch (MD->getNumOperands()) {
-      case 1:
-        return nullptr;
-      case 2:
-        return &MD->getOperand(1);
-      default:
-        llvm_unreachable("loop metadata has 0 or 1 operand");
-      }
+      return MD;
   }
   return None;
 }
 
+static Optional<MDNode *> findOptionMDForLoop(const Loop *TheLoop,
+                                              StringRef Name) {
+  return findOptionMDForLoopID(TheLoop->getLoopID(), Name);
+}
+
+/// Find string metadata for loop
+///
+/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an
+/// operand or null otherwise.  If the string metadata is not found return
+/// Optional's not-a-value.
+Optional<const MDOperand *> llvm::findStringMetadataForLoop(Loop *TheLoop,
+                                                            StringRef Name) {
+  auto MD = findOptionMDForLoop(TheLoop, Name).getValueOr(nullptr);
+  if (!MD)
+    return None;
+  switch (MD->getNumOperands()) {
+  case 1:
+    return nullptr;
+  case 2:
+    return &MD->getOperand(1);
+  default:
+    llvm_unreachable("loop metadata has 0 or 1 operand");
+  }
+}
+
+static Optional<bool> getOptionalBoolLoopAttribute(const Loop *TheLoop,
+                                                   StringRef Name) {
+  Optional<MDNode *> MD = findOptionMDForLoop(TheLoop, Name);
+  if (!MD.hasValue())
+    return None;
+  MDNode *OptionNode = MD.getValue();
+  if (OptionNode == nullptr)
+    return None;
+  switch (OptionNode->getNumOperands()) {
+  case 1:
+    // When the value is absent it is interpreted as 'attribute set'.
+    return true;
+  case 2:
+    return mdconst::extract_or_null<ConstantInt>(
+        OptionNode->getOperand(1).get());
+  }
+  llvm_unreachable("unexpected number of options");
+}
+
+static bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name) {
+  return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false);
+}
+
+llvm::Optional<int> llvm::getOptionalIntLoopAttribute(Loop *TheLoop,
+                                                      StringRef Name) {
+  const MDOperand *AttrMD =
+      findStringMetadataForLoop(TheLoop, Name).getValueOr(nullptr);
+  if (!AttrMD)
+    return None;
+
+  ConstantInt *IntMD = mdconst::extract_or_null<ConstantInt>(AttrMD->get());
+  if (!IntMD)
+    return None;
+
+  return IntMD->getSExtValue();
+}
+
+Optional<MDNode *> llvm::makeFollowupLoopID(
+    MDNode *OrigLoopID, ArrayRef<StringRef> FollowupOptions,
+    const char *InheritOptionsExceptPrefix, bool AlwaysNew) {
+  if (!OrigLoopID) {
+    if (AlwaysNew)
+      return nullptr;
+    return None;
+  }
+
+  assert(OrigLoopID->getOperand(0) == OrigLoopID);
+
+  bool InheritAllAttrs = !InheritOptionsExceptPrefix;
+  bool InheritSomeAttrs =
+      InheritOptionsExceptPrefix && InheritOptionsExceptPrefix[0] != '\0';
+  SmallVector<Metadata *, 8> MDs;
+  MDs.push_back(nullptr);
+
+  bool Changed = false;
+  if (InheritAllAttrs || InheritSomeAttrs) {
+    for (const MDOperand &Existing : drop_begin(OrigLoopID->operands(), 1)) {
+      MDNode *Op = cast<MDNode>(Existing.get());
+
+      auto InheritThisAttribute = [InheritSomeAttrs,
+                                   InheritOptionsExceptPrefix](MDNode *Op) {
+        if (!InheritSomeAttrs)
+          return false;
+
+        // Skip malformatted attribute metadata nodes.
+        if (Op->getNumOperands() == 0)
+          return true;
+        Metadata *NameMD = Op->getOperand(0).get();
+        if (!isa<MDString>(NameMD))
+          return true;
+        StringRef AttrName = cast<MDString>(NameMD)->getString();
+
+        // Do not inherit excluded attributes.
+        return !AttrName.startswith(InheritOptionsExceptPrefix);
+      };
+
+      if (InheritThisAttribute(Op))
+        MDs.push_back(Op);
+      else
+        Changed = true;
+    }
+  } else {
+    // Modified if we dropped at least one attribute.
+    Changed = OrigLoopID->getNumOperands() > 1;
+  }
+
+  bool HasAnyFollowup = false;
+  for (StringRef OptionName : FollowupOptions) {
+    MDNode *FollowupNode =
+        findOptionMDForLoopID(OrigLoopID, OptionName).getValueOr(nullptr);
+    if (!FollowupNode)
+      continue;
+
+    HasAnyFollowup = true;
+    for (const MDOperand &Option : drop_begin(FollowupNode->operands(), 1)) {
+      MDs.push_back(Option.get());
+      Changed = true;
+    }
+  }
+
+  // Attributes of the followup loop not specified explicity, so signal to the
+  // transformation pass to add suitable attributes.
+  if (!AlwaysNew && !HasAnyFollowup)
+    return None;
+
+  // If no attributes were added or remove, the previous loop Id can be reused.
+  if (!AlwaysNew && !Changed)
+    return OrigLoopID;
+
+  // No attributes is equivalent to having no !llvm.loop metadata at all.
+  if (MDs.size() == 1)
+    return nullptr;
+
+  // Build the new loop ID.
+  MDTuple *FollowupLoopID = MDNode::get(OrigLoopID->getContext(), MDs);
+  FollowupLoopID->replaceOperandWith(0, FollowupLoopID);
+  return FollowupLoopID;
+}
+
+bool llvm::hasDisableAllTransformsHint(const Loop *L) {
+  return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced);
+}
+
+TransformationMode llvm::hasUnrollTransformation(Loop *L) {
+  if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable"))
+    return TM_SuppressedByUser;
+
+  Optional<int> Count =
+      getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count");
+  if (Count.hasValue())
+    return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
+
+  if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable"))
+    return TM_ForcedByUser;
+
+  if (getBooleanLoopAttribute(L, "llvm.loop.unroll.full"))
+    return TM_ForcedByUser;
+
+  if (hasDisableAllTransformsHint(L))
+    return TM_Disable;
+
+  return TM_Unspecified;
+}
+
+TransformationMode llvm::hasUnrollAndJamTransformation(Loop *L) {
+  if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.disable"))
+    return TM_SuppressedByUser;
+
+  Optional<int> Count =
+      getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count");
+  if (Count.hasValue())
+    return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
+
+  if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable"))
+    return TM_ForcedByUser;
+
+  if (hasDisableAllTransformsHint(L))
+    return TM_Disable;
+
+  return TM_Unspecified;
+}
+
+TransformationMode llvm::hasVectorizeTransformation(Loop *L) {
+  Optional<bool> Enable =
+      getOptionalBoolLoopAttribute(L, "llvm.loop.vectorize.enable");
+
+  if (Enable == false)
+    return TM_SuppressedByUser;
+
+  Optional<int> VectorizeWidth =
+      getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width");
+  Optional<int> InterleaveCount =
+      getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
+
+  if (Enable == true) {
+    // 'Forcing' vector width and interleave count to one effectively disables
+    // this tranformation.
+    if (VectorizeWidth == 1 && InterleaveCount == 1)
+      return TM_SuppressedByUser;
+    return TM_ForcedByUser;
+  }
+
+  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
+    return TM_Disable;
+
+  if (VectorizeWidth == 1 && InterleaveCount == 1)
+    return TM_Disable;
+
+  if (VectorizeWidth > 1 || InterleaveCount > 1)
+    return TM_Enable;
+
+  if (hasDisableAllTransformsHint(L))
+    return TM_Disable;
+
+  return TM_Unspecified;
+}
+
+TransformationMode llvm::hasDistributeTransformation(Loop *L) {
+  if (getBooleanLoopAttribute(L, "llvm.loop.distribute.enable"))
+    return TM_ForcedByUser;
+
+  if (hasDisableAllTransformsHint(L))
+    return TM_Disable;
+
+  return TM_Unspecified;
+}
+
+TransformationMode llvm::hasLICMVersioningTransformation(Loop *L) {
+  if (getBooleanLoopAttribute(L, "llvm.loop.licm_versioning.disable"))
+    return TM_SuppressedByUser;
+
+  if (hasDisableAllTransformsHint(L))
+    return TM_Disable;
+
+  return TM_Unspecified;
+}
+
 /// Does a BFS from a given node to all of its children inside a given loop.
 /// The returned vector of nodes includes the starting point.
 SmallVector<DomTreeNode *, 16>
@@ -336,6 +569,10 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
     DTU.deleteEdge(Preheader, L->getHeader());
   }
 
+  // Use a map to unique and a vector to guarantee deterministic ordering.
+  llvm::SmallDenseSet<std::pair<DIVariable *, DIExpression *>, 4> DeadDebugSet;
+  llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
+
   // Given LCSSA form is satisfied, we should not have users of instructions
   // within the dead loop outside of the loop. However, LCSSA doesn't take
   // unreachable uses into account. We handle them here.
@@ -360,8 +597,27 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
                  "Unexpected user in reachable block");
         U.set(Undef);
       }
+      auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I);
+      if (!DVI)
+        continue;
+      auto Key = DeadDebugSet.find({DVI->getVariable(), DVI->getExpression()});
+      if (Key != DeadDebugSet.end())
+        continue;
+      DeadDebugSet.insert({DVI->getVariable(), DVI->getExpression()});
+      DeadDebugInst.push_back(DVI);
     }
 
+  // After the loop has been deleted all the values defined and modified
+  // inside the loop are going to be unavailable.
+  // Since debug values in the loop have been deleted, inserting an undef
+  // dbg.value truncates the range of any dbg.value before the loop where the
+  // loop used to be. This is particularly important for constant values.
+  DIBuilder DIB(*ExitBlock->getModule());
+  for (auto *DVI : DeadDebugInst)
+    DIB.insertDbgValueIntrinsic(
+        UndefValue::get(Builder.getInt32Ty()), DVI->getVariable(),
+        DVI->getExpression(), DVI->getDebugLoc(), ExitBlock->getFirstNonPHI());
+
   // Remove the block from the reference counting scheme, so that we can
   // delete it freely later.
   for (auto *Block : L->blocks())
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 6773ad7bfc70ad0236d3d3d9f41db16b8ac23937..a53083c31e3f12afa3b8fd7ed98f2020a26b52f3 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -751,14 +751,18 @@ void PromoteMem2Reg::run() {
     // Ok, now we know that all of the PHI nodes are missing entries for some
     // basic blocks.  Start by sorting the incoming predecessors for efficient
     // access.
-    llvm::sort(Preds);
+    auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) {
+      return BBNumbers.lookup(A) < BBNumbers.lookup(B);
+    };
+    llvm::sort(Preds, CompareBBNumbers);
 
     // Now we loop through all BB's which have entries in SomePHI and remove
     // them from the Preds list.
     for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
       // Do a log(n) search of the Preds list for the entry we want.
       SmallVectorImpl<BasicBlock *>::iterator EntIt = std::lower_bound(
-          Preds.begin(), Preds.end(), SomePHI->getIncomingBlock(i));
+          Preds.begin(), Preds.end(), SomePHI->getIncomingBlock(i),
+          CompareBBNumbers);
       assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
              "PHI node has entry for a block which is not a predecessor!");
 
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 849f9ee1d19dde4749c66a67148f7e5f1c308bb1..b98f2fff65eaeb96fd0daad9c6561533ceeed348 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -693,7 +693,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
   if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
     // Do not permit merging of large switch instructions into their
     // predecessors unless there is only one predecessor.
-    if (SI->getNumSuccessors() * pred_size(SI->getParent()) <= 128)
+    if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors()))
       CV = SI->getCondition();
   } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
     if (BI->isConditional() && BI->getCondition()->hasOneUse())
@@ -1372,6 +1372,14 @@ HoistTerminator:
     }
   }
 
+  // As the parent basic block terminator is a branch instruction which is
+  // removed at the end of the current transformation, use its previous
+  // non-debug instruction, as the reference insertion point, which will
+  // provide the debug location for generated select instructions. For BBs
+  // with only debug instructions, use an empty debug location.
+  Instruction *InsertPt =
+      BIParent->getTerminator()->getPrevNonDebugInstruction();
+
   // Okay, it is safe to hoist the terminator.
   Instruction *NT = I1->clone();
   BIParent->getInstList().insert(BI->getIterator(), NT);
@@ -1381,7 +1389,16 @@ HoistTerminator:
     NT->takeName(I1);
   }
 
+  // Ensure terminator gets a debug location, even an unknown one, in case
+  // it involves inlinable calls.
+  NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+
   IRBuilder<NoFolder> Builder(NT);
+  // If an earlier instruction in this BB had a location, adopt it, otherwise
+  // clear debug locations.
+  Builder.SetCurrentDebugLocation(InsertPt ? InsertPt->getDebugLoc()
+                                           : DebugLoc());
+
   // Hoisting one of the terminators from our successor is a great thing.
   // Unfortunately, the successors of the if/else blocks may have PHI nodes in
   // them.  If they do, all PHI entries for BB1/BB2 must agree for all PHI
@@ -2874,7 +2891,7 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
       if (!AlternativeV)
         break;
 
-      assert(pred_size(Succ) == 2);
+      assert(Succ->hasNPredecessors(2));
       auto PredI = pred_begin(Succ);
       BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI;
       if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV)
@@ -5758,7 +5775,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,
   // backedge, so we can eliminate BB.
   bool NeedCanonicalLoop =
       Options.NeedCanonicalLoop &&
-      (LoopHeaders && pred_size(BB) > 1 &&
+      (LoopHeaders && BB->hasNPredecessorsOrMore(2) &&
        (LoopHeaders->count(BB) || LoopHeaders->count(Succ)));
   BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
   if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
@@ -5837,28 +5854,17 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   if (SimplifyBranchOnICmpChain(BI, Builder, DL))
     return true;
 
-  // If this basic block has a single dominating predecessor block and the
-  // dominating block's condition implies BI's condition, we know the direction
-  // of the BI branch.
-  if (BasicBlock *Dom = BB->getSinglePredecessor()) {
-    auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator());
-    if (PBI && PBI->isConditional() &&
-        PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
-      assert(PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB);
-      bool CondIsTrue = PBI->getSuccessor(0) == BB;
-      Optional<bool> Implication = isImpliedCondition(
-          PBI->getCondition(), BI->getCondition(), DL, CondIsTrue);
-      if (Implication) {
-        // Turn this into a branch on constant.
-        auto *OldCond = BI->getCondition();
-        ConstantInt *CI = *Implication
-                              ? ConstantInt::getTrue(BB->getContext())
-                              : ConstantInt::getFalse(BB->getContext());
-        BI->setCondition(CI);
-        RecursivelyDeleteTriviallyDeadInstructions(OldCond);
-        return requestResimplify();
-      }
-    }
+  // If this basic block has dominating predecessor blocks and the dominating
+  // blocks' conditions imply BI's condition, we know the direction of BI.
+  Optional<bool> Imp = isImpliedByDomCondition(BI->getCondition(), BI, DL);
+  if (Imp) {
+    // Turn this into a branch on constant.
+    auto *OldCond = BI->getCondition();
+    ConstantInt *TorF = *Imp ? ConstantInt::getTrue(BB->getContext())
+                             : ConstantInt::getFalse(BB->getContext());
+    BI->setCondition(TorF);
+    RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+    return requestResimplify();
   }
 
   // If this basic block is ONLY a compare and a branch, and if a predecessor
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 27a4d241b320080bb52964385e3f979edee3c619..06eaadf58c3fce1f6b9c3531335d23d6ff06d370 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_library(LLVMVectorize
   VPlan.cpp
   VPlanHCFGBuilder.cpp
   VPlanHCFGTransforms.cpp
+  VPlanSLP.cpp
   VPlanVerifier.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index f2344fec3cd3bb3d8611b2d7f89060531b8e18a0..9ff18328c219d4eba51b0f342e5868076ba5caea 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -79,6 +79,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Vectorize.h"
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
@@ -205,12 +206,12 @@ private:
                           unsigned Alignment);
 };
 
-class LoadStoreVectorizer : public FunctionPass {
+class LoadStoreVectorizerLegacyPass : public FunctionPass {
 public:
   static char ID;
 
-  LoadStoreVectorizer() : FunctionPass(ID) {
-    initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
+  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
+    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
   bool runOnFunction(Function &F) override;
@@ -230,30 +231,23 @@ public:
 
 } // end anonymous namespace
 
-char LoadStoreVectorizer::ID = 0;
+char LoadStoreVectorizerLegacyPass::ID = 0;
 
-INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                       "Vectorize load and Store instructions", false, false)
 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
+INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                     "Vectorize load and store instructions", false, false)
 
 Pass *llvm::createLoadStoreVectorizerPass() {
-  return new LoadStoreVectorizer();
+  return new LoadStoreVectorizerLegacyPass();
 }
 
-// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
-// vectors of Instructions.
-static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
-  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
-  propagateMetadata(I, VL);
-}
-
-bool LoadStoreVectorizer::runOnFunction(Function &F) {
+bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
   // Don't vectorize when the attribute NoImplicitFloat is used.
   if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
     return false;
@@ -268,6 +262,30 @@ bool LoadStoreVectorizer::runOnFunction(Function &F) {
   return V.run();
 }
 
+PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+  // Don't vectorize when the attribute NoImplicitFloat is used.
+  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+    return PreservedAnalyses::all();
+
+  AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+  Vectorizer V(F, AA, DT, SE, TTI);
+  bool Changed = V.run();
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return Changed ? PA : PreservedAnalyses::all();
+}
+
+// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
+// vectors of Instructions.
+static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
+  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
+  propagateMetadata(I, VL);
+}
+
 // Vectorizer Implementation
 bool Vectorizer::run() {
   bool Changed = false;
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 755ad32a7bf5079b871a0b09c53a9f756d508bfe..ac989dd66f72d228d974df9f7b045aab38940dec 100644
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -80,10 +80,11 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
   return false;
 }
 
-LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
+LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
+                                       bool InterleaveOnlyWhenForced,
                                        OptimizationRemarkEmitter &ORE)
     : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
-      Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
       Force("vectorize.enable", FK_Undefined, HK_FORCE),
       IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
   // Populate values with existing loop metadata.
@@ -98,19 +99,19 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
     // consider the loop to have been already vectorized because there's
     // nothing more that we can do.
     IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
-  LLVM_DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+  LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
              << "LV: Interleaving disabled by the pass manager\n");
 }
 
-bool LoopVectorizeHints::allowVectorization(Function *F, Loop *L,
-                                            bool AlwaysVectorize) const {
+bool LoopVectorizeHints::allowVectorization(
+    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
   if (getForce() == LoopVectorizeHints::FK_Disabled) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
     emitRemarkWithHints();
     return false;
   }
 
-  if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
     emitRemarkWithHints();
     return false;
@@ -817,15 +818,14 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasMultipleStoresToLoopInvariantAddress()) {
+  if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "multiple writes to a loop invariant address could not "
+              << "write to a loop invariant address could not "
                  "be vectorized");
     LLVM_DEBUG(
-        dbgs() << "LV: We don't allow multiple stores to a uniform address\n");
+        dbgs() << "LV: Non vectorizable stores to a uniform address\n");
     return false;
   }
-
   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
   PSE.addPredicate(LAI->getPSE().getUnionPredicate());
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c9c70b5c5364a0d9522cae79ebb7bd763a251f87..c74352cf7039b49e70066eac51a10bd250e4b21c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -152,6 +152,16 @@ using namespace llvm;
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
 
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopVectorizeFollowupAll =
+    "llvm.loop.vectorize.followup_all";
+static const char *const LLVMLoopVectorizeFollowupVectorized =
+    "llvm.loop.vectorize.followup_vectorized";
+static const char *const LLVMLoopVectorizeFollowupEpilogue =
+    "llvm.loop.vectorize.followup_epilogue";
+/// @}
+
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 
@@ -796,27 +806,6 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
   }
 }
 
-static void emitMissedWarning(Function *F, Loop *L,
-                              const LoopVectorizeHints &LH,
-                              OptimizationRemarkEmitter *ORE) {
-  LH.emitRemarkWithHints();
-
-  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
-    if (LH.getWidth() != 1)
-      ORE->emit(DiagnosticInfoOptimizationFailure(
-                    DEBUG_TYPE, "FailedRequestedVectorization",
-                    L->getStartLoc(), L->getHeader())
-                << "loop not vectorized: "
-                << "failed explicitly specified loop vectorization");
-    else if (LH.getInterleave() != 1)
-      ORE->emit(DiagnosticInfoOptimizationFailure(
-                    DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
-                    L->getHeader())
-                << "loop not interleaved: "
-                << "failed explicitly specified loop interleaving");
-  }
-}
-
 namespace llvm {
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -976,7 +965,7 @@ public:
 
   /// Save vectorization decision \p W and \p Cost taken by the cost model for
   /// interleaving group \p Grp and vector width \p VF.
-  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                            InstWidening W, unsigned Cost) {
     assert(VF >= 2 && "Expected VF >=2");
     /// Broadcast this decicion to all instructions inside the group.
@@ -1131,7 +1120,8 @@ public:
   }
 
   /// Get the interleaved access group that \p Instr belongs to.
-  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
+  const InterleaveGroup<Instruction> *
+  getInterleavedAccessGroup(Instruction *Instr) {
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
 
@@ -1369,14 +1359,15 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp,
     return false;
 
   Function *Fn = OuterLp->getHeader()->getParent();
-  if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
+  if (!Hints.allowVectorization(Fn, OuterLp,
+                                true /*VectorizeOnlyWhenForced*/)) {
     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
     return false;
   }
 
   if (!Hints.getWidth()) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
-    emitMissedWarning(Fn, OuterLp, Hints, ORE);
+    Hints.emitRemarkWithHints();
     return false;
   }
 
@@ -1384,7 +1375,7 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp,
     // TODO: Interleave support is future work.
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                          "outer loops.\n");
-    emitMissedWarning(Fn, OuterLp, Hints, ORE);
+    Hints.emitRemarkWithHints();
     return false;
   }
 
@@ -1425,10 +1416,11 @@ struct LoopVectorize : public FunctionPass {
 
   LoopVectorizePass Impl;
 
-  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
+  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
+                         bool VectorizeOnlyWhenForced = false)
       : FunctionPass(ID) {
-    Impl.DisableUnrolling = NoUnrolling;
-    Impl.AlwaysVectorize = AlwaysVectorize;
+    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
+    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   }
 
@@ -1994,7 +1986,8 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                    VectorParts *BlockInMask) {
-  const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
+  const InterleaveGroup<Instruction> *Group =
+      Cost->getInterleavedAccessGroup(Instr);
   assert(Group && "Fail to get an interleaved access group.");
 
   // Skip if current instruction is not the insert position.
@@ -2737,6 +2730,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+  MDNode *OrigLoopID = OrigLoop->getLoopID();
   assert(VectorPH && "Invalid loop structure");
   assert(ExitBlock && "Must have an exit block");
 
@@ -2880,6 +2874,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   LoopVectorBody = VecBody;
   LoopScalarBody = OldBasicBlock;
 
+  Optional<MDNode *> VectorizedLoopID =
+      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+                                      LLVMLoopVectorizeFollowupVectorized});
+  if (VectorizedLoopID.hasValue()) {
+    Lp->setLoopID(VectorizedLoopID.getValue());
+
+    // Do not setAlreadyVectorized if loop attributes have been defined
+    // explicitly.
+    return LoopVectorPreHeader;
+  }
+
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
   if (MDNode *LID = OrigLoop->getLoopID())
@@ -5826,9 +5831,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     auto *Phi = cast<PHINode>(I);
 
     // First-order recurrences are replaced by vector shuffles inside the loop.
+    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                VectorTy, VF - 1, ToVectorTy(RetTy, 1));
+                                VectorTy, VF - 1, VectorType::get(RetTy, 1));
 
     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
     // converted into select instructions. We require N - 1 selects per phi
@@ -6018,8 +6024,9 @@ INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
 
-Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
-  return new LoopVectorize(NoUnrolling, AlwaysVectorize);
+Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
+                              bool VectorizeOnlyWhenForced) {
+  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
 }
 
 } // end namespace llvm
@@ -6376,7 +6383,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
                                                            VFRange &Range,
                                                            VPlanPtr &Plan) {
-  const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
+  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
   if (!IG)
     return nullptr;
 
@@ -6792,7 +6799,8 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes(
 
       // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
       // member of the IG, do not construct any Recipe for it.
-      const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr);
+      const InterleaveGroup<Instruction> *IG =
+          CM.getInterleavedAccessGroup(Instr);
       if (IG && Instr != IG->getInsertPos() &&
           Range.Start >= 2 && // Query is illegal for VF == 1
           CM.getWideningDecision(Instr, Range.Start) ==
@@ -7136,7 +7144,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                     << L->getHeader()->getParent()->getName() << "\" from "
                     << DebugLocStr << "\n");
 
-  LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
+  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
 
   LLVM_DEBUG(
       dbgs() << "LV: Loop hints:"
@@ -7160,7 +7168,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // less verbose reporting vectorized loops and unvectorized loops that may
   // benefit from vectorization, respectively.
 
-  if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
+  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
     return false;
   }
@@ -7173,7 +7181,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                 &Requirements, &Hints, DB, AC);
   if (!LVL.canVectorize(EnableVPlanNativePath)) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
-    emitMissedWarning(F, L, Hints, ORE);
+    Hints.emitRemarkWithHints();
     return false;
   }
 
@@ -7246,7 +7254,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
                                      "NoImplicitFloat", L)
               << "loop not vectorized due to NoImplicitFloat attribute");
-    emitMissedWarning(F, L, Hints, ORE);
+    Hints.emitRemarkWithHints();
     return false;
   }
 
@@ -7261,7 +7269,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     ORE->emit(
         createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
         << "loop not vectorized due to unsafe FP support.");
-    emitMissedWarning(F, L, Hints, ORE);
+    Hints.emitRemarkWithHints();
     return false;
   }
 
@@ -7303,7 +7311,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (Requirements.doesNotMeet(F, L, Hints)) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                          "requirements.\n");
-    emitMissedWarning(F, L, Hints, ORE);
+    Hints.emitRemarkWithHints();
     return false;
   }
 
@@ -7380,6 +7388,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   LVP.setBestPlan(VF.Width, IC);
 
   using namespace ore;
+  bool DisableRuntimeUnroll = false;
+  MDNode *OrigLoopID = L->getLoopID();
 
   if (!VectorizeLoop) {
     assert(IC > 1 && "interleave count should not be 1 or 0");
@@ -7406,7 +7416,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // no runtime checks about strides and memory. A scalar loop that is
     // rarely used is not worth unrolling.
     if (!LB.areSafetyChecksAdded())
-      AddRuntimeUnrollDisableMetaData(L);
+      DisableRuntimeUnroll = true;
 
     // Report the vectorization decision.
     ORE->emit([&]() {
@@ -7418,8 +7428,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     });
   }
 
-  // Mark the loop as already vectorized to avoid vectorizing again.
-  Hints.setAlreadyVectorized();
+  Optional<MDNode *> RemainderLoopID =
+      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+                                      LLVMLoopVectorizeFollowupEpilogue});
+  if (RemainderLoopID.hasValue()) {
+    L->setLoopID(RemainderLoopID.getValue());
+  } else {
+    if (DisableRuntimeUnroll)
+      AddRuntimeUnrollDisableMetaData(L);
+
+    // Mark the loop as already vectorized to avoid vectorizing again.
+    Hints.setAlreadyVectorized();
+  }
 
   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
   return true;
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3592df3ede3d13f187717c122af372fa98694a69..87a861970d10a0e33c9d2ef427c6b50792db24a1 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3643,6 +3643,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
       auto &Locs = ExternallyUsedValues[Scalar];
       ExternallyUsedValues.insert({Ex, Locs});
       ExternallyUsedValues.erase(Scalar);
+      // Required to update internally referenced instructions.
+      Scalar->replaceAllUsesWith(Ex);
       continue;
     }
 
@@ -5453,7 +5455,7 @@ class HorizontalReduction {
     }
   };
 
-  Instruction *ReductionRoot = nullptr;
+  WeakTrackingVH ReductionRoot;
 
   /// The operation data of the reduction operation.
   OperationData ReductionData;
@@ -5738,7 +5740,7 @@ public:
     unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
 
     Value *VectorizedTree = nullptr;
-    IRBuilder<> Builder(ReductionRoot);
+    IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
     FastMathFlags Unsafe;
     Unsafe.setFast();
     Builder.setFastMathFlags(Unsafe);
@@ -5747,8 +5749,13 @@ public:
     BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
     // The same extra argument may be used several time, so log each attempt
     // to use it.
-    for (auto &Pair : ExtraArgs)
+    for (auto &Pair : ExtraArgs) {
+      assert(Pair.first && "DebugLoc must be set.");
       ExternallyUsedValues[Pair.second].push_back(Pair.first);
+    }
+    // The reduction root is used as the insertion point for new instructions,
+    // so set it as externally used to prevent it from being deleted.
+    ExternallyUsedValues[ReductionRoot];
     SmallVector<Value *, 16> IgnoreList;
     for (auto &V : ReductionOps)
       IgnoreList.append(V.begin(), V.end());
@@ -5800,6 +5807,7 @@ public:
       Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
 
       // Emit a reduction.
+      Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
       Value *ReducedSubTree =
           emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
       if (VectorizedTree) {
@@ -5826,8 +5834,6 @@ public:
         VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
       }
       for (auto &Pair : ExternallyUsedValues) {
-        assert(!Pair.second.empty() &&
-               "At least one DebugLoc must be inserted");
         // Add each externally used value to the final reduction.
         for (auto *I : Pair.second) {
           Builder.SetCurrentDebugLocation(I->getDebugLoc());
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index a3c15a36b05d77d8ba344d07fe7880942cbc1109..05a5400beb4e69e49e26fe6ff988150229c33a4e 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -338,6 +338,12 @@ void VPInstruction::print(raw_ostream &O) const {
   case VPInstruction::ICmpULE:
     O << "icmp ule";
     break;
+  case VPInstruction::SLPLoad:
+    O << "combined load";
+    break;
+  case VPInstruction::SLPStore:
+    O << "combined store";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -462,7 +468,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
            "One successor of a basic block does not lead to the other.");
     assert(InterimSucc->getSinglePredecessor() &&
            "Interim successor has more than one predecessor.");
-    assert(pred_size(PostDomSucc) == 2 &&
+    assert(PostDomSucc->hasNPredecessors(2) &&
            "PostDom successor has more than two predecessors.");
     DT->addNewBlock(InterimSucc, BB);
     DT->addNewBlock(PostDomSucc, BB);
@@ -680,3 +686,55 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O,
 }
 
 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
+
+void VPValue::replaceAllUsesWith(VPValue *New) {
+  for (VPUser *User : users())
+    for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+      if (User->getOperand(I) == this)
+        User->setOperand(I, New);
+}
+
+void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
+                                          Old2NewTy &Old2New,
+                                          InterleavedAccessInfo &IAI) {
+  ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
+  for (VPBlockBase *Base : RPOT) {
+    visitBlock(Base, Old2New, IAI);
+  }
+}
+
+void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+                                         InterleavedAccessInfo &IAI) {
+  if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
+    for (VPRecipeBase &VPI : *VPBB) {
+      assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
+      auto *VPInst = cast<VPInstruction>(&VPI);
+      auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+      auto *IG = IAI.getInterleaveGroup(Inst);
+      if (!IG)
+        continue;
+
+      auto NewIGIter = Old2New.find(IG);
+      if (NewIGIter == Old2New.end())
+        Old2New[IG] = new InterleaveGroup<VPInstruction>(
+            IG->getFactor(), IG->isReverse(), IG->getAlignment());
+
+      if (Inst == IG->getInsertPos())
+        Old2New[IG]->setInsertPos(VPInst);
+
+      InterleaveGroupMap[VPInst] = Old2New[IG];
+      InterleaveGroupMap[VPInst]->insertMember(
+          VPInst, IG->getIndex(Inst),
+          IG->isReverse() ? (-1) * int(IG->getFactor()) : IG->getFactor());
+    }
+  } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+    visitRegion(Region, Old2New, IAI);
+  else
+    llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
+VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
+                                                 InterleavedAccessInfo &IAI) {
+  Old2NewTy Old2New;
+  visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
+}
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index 9daaea1acdedd082396cc69d92742f38f5797b3e..5c1b4a83c30e10991dce1adba2ceac4decba0741 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -38,6 +38,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/ilist.h"
 #include "llvm/ADT/ilist_node.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IRBuilder.h"
 #include <algorithm>
 #include <cassert>
@@ -52,12 +53,14 @@ class LoopVectorizationCostModel;
 class BasicBlock;
 class DominatorTree;
 class InnerLoopVectorizer;
-class InterleaveGroup;
+template <class T> class InterleaveGroup;
+class LoopInfo;
 class raw_ostream;
 class Value;
 class VPBasicBlock;
 class VPRegionBlock;
 class VPlan;
+class VPlanSlp;
 
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
@@ -607,10 +610,16 @@ public:
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPUser, public VPRecipeBase {
   friend class VPlanHCFGTransforms;
+  friend class VPlanSlp;
 
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatics instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1, ICmpULE };
+  enum {
+    Not = Instruction::OtherOpsEnd + 1,
+    ICmpULE,
+    SLPLoad,
+    SLPStore,
+  };
 
 private:
   typedef unsigned char OpcodeTy;
@@ -620,6 +629,13 @@ private:
   /// modeled instruction.
   void generateInstruction(VPTransformState &State, unsigned Part);
 
+protected:
+  Instruction *getUnderlyingInstr() {
+    return cast_or_null<Instruction>(getUnderlyingValue());
+  }
+
+  void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
+
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
       : VPUser(VPValue::VPInstructionSC, Operands),
@@ -633,6 +649,11 @@ public:
     return V->getVPValueID() == VPValue::VPInstructionSC;
   }
 
+  VPInstruction *clone() const {
+    SmallVector<VPValue *, 2> Operands(operands());
+    return new VPInstruction(Opcode, Operands);
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPRecipeBase *R) {
     return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
@@ -650,6 +671,14 @@ public:
 
   /// Print the VPInstruction.
   void print(raw_ostream &O) const;
+
+  /// Return true if this instruction may modify memory.
+  bool mayWriteToMemory() const {
+    // TODO: we can use attributes of the called function to rule out memory
+    //       modifications.
+    return Opcode == Instruction::Store || Opcode == Instruction::Call ||
+           Opcode == Instruction::Invoke || Opcode == SLPStore;
+  }
 };
 
 /// VPWidenRecipe is a recipe for producing a copy of vector type for each
@@ -771,11 +800,11 @@ public:
 /// or stores into one wide load/store and shuffles.
 class VPInterleaveRecipe : public VPRecipeBase {
 private:
-  const InterleaveGroup *IG;
+  const InterleaveGroup<Instruction> *IG;
   std::unique_ptr<VPUser> User;
 
 public:
-  VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask)
+  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Mask)
       : VPRecipeBase(VPInterleaveSC), IG(IG) {
     if (Mask) // Create a VPInstruction to register as a user of the mask.
       User.reset(new VPUser({Mask}));
@@ -793,7 +822,7 @@ public:
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent) const override;
 
-  const InterleaveGroup *getInterleaveGroup() { return IG; }
+  const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
 };
 
 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
@@ -1464,6 +1493,144 @@ public:
   }
 };
 
+class VPInterleavedAccessInfo {
+private:
+  DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
+      InterleaveGroupMap;
+
+  /// Type for mapping of instruction based interleave groups to VPInstruction
+  /// interleave groups
+  using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
+                             InterleaveGroup<VPInstruction> *>;
+
+  /// Recursively \p Region and populate VPlan based interleave groups based on
+  /// \p IAI.
+  void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
+                   InterleavedAccessInfo &IAI);
+  /// Recursively traverse \p Block and populate VPlan based interleave groups
+  /// based on \p IAI.
+  void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+                  InterleavedAccessInfo &IAI);
+
+public:
+  VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
+
+  ~VPInterleavedAccessInfo() {
+    SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
+    // Avoid releasing a pointer twice.
+    for (auto &I : InterleaveGroupMap)
+      DelSet.insert(I.second);
+    for (auto *Ptr : DelSet)
+      delete Ptr;
+  }
+
+  /// Get the interleave group that \p Instr belongs to.
+  ///
+  /// \returns nullptr if doesn't have such group.
+  InterleaveGroup<VPInstruction> *
+  getInterleaveGroup(VPInstruction *Instr) const {
+    if (InterleaveGroupMap.count(Instr))
+      return InterleaveGroupMap.find(Instr)->second;
+    return nullptr;
+  }
+};
+
+/// Class that maps (parts of) an existing VPlan to trees of combined
+/// VPInstructions.
+class VPlanSlp {
+private:
+  enum class OpMode { Failed, Load, Opcode };
+
+  /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
+  /// DenseMap keys.
+  struct BundleDenseMapInfo {
+    static SmallVector<VPValue *, 4> getEmptyKey() {
+      return {reinterpret_cast<VPValue *>(-1)};
+    }
+
+    static SmallVector<VPValue *, 4> getTombstoneKey() {
+      return {reinterpret_cast<VPValue *>(-2)};
+    }
+
+    static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
+      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+    }
+
+    static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
+                        const SmallVector<VPValue *, 4> &RHS) {
+      return LHS == RHS;
+    }
+  };
+
+  /// Mapping of values in the original VPlan to a combined VPInstruction.
+  DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
+      BundleToCombined;
+
+  VPInterleavedAccessInfo &IAI;
+
+  /// Basic block to operate on. For now, only instructions in a single BB are
+  /// considered.
+  const VPBasicBlock &BB;
+
+  /// Indicates whether we managed to combine all visited instructions or not.
+  bool CompletelySLP = true;
+
+  /// Width of the widest combined bundle in bits.
+  unsigned WidestBundleBits = 0;
+
+  using MultiNodeOpTy =
+      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+
+  // Input operand bundles for the current multi node. Each multi node operand
+  // bundle contains values not matching the multi node's opcode. They will
+  // be reordered in reorderMultiNodeOps, once we completed building a
+  // multi node.
+  SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
+
+  /// Indicates whether we are building a multi node currently.
+  bool MultiNodeActive = false;
+
+  /// Check if we can vectorize Operands together.
+  bool areVectorizable(ArrayRef<VPValue *> Operands) const;
+
+  /// Add combined instruction \p New for the bundle \p Operands.
+  void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
+
+  /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
+  VPInstruction *markFailed();
+
+  /// Reorder operands in the multi node to maximize sequential memory access
+  /// and commutative operations.
+  SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
+
+  /// Choose the best candidate to use for the lane after \p Last. The set of
+  /// candidates to choose from are values with an opcode matching \p Last's
+  /// or loads consecutive to \p Last.
+  std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
+                                       SmallPtrSetImpl<VPValue *> &Candidates,
+                                       VPInterleavedAccessInfo &IAI);
+
+  /// Print bundle \p Values to dbgs().
+  void dumpBundle(ArrayRef<VPValue *> Values);
+
+public:
+  VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
+
+  ~VPlanSlp() {
+    for (auto &KV : BundleToCombined)
+      delete KV.second;
+  }
+
+  /// Tries to build an SLP tree rooted at \p Operands and returns a
+  /// VPInstruction combining \p Operands, if they can be combined.
+  VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
+
+  /// Return the width of the widest combined bundle in bits.
+  unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+  /// Return true if all visited instruction can be combined.
+  bool isCompletelySLP() const { return CompletelySLP; }
+};
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/lib/Transforms/Vectorize/VPlanSLP.cpp b/lib/Transforms/Vectorize/VPlanSLP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad3a85a6f760fca5cc6e8dae88f10fe73018a512
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -0,0 +1,468 @@
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+///   Look-ahead SLP: auto-vectorization in the presence of commutative
+///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+///   Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Number of levels to look ahead when re-ordering multi node operands.
+static unsigned LookaheadMaxDepth = 5;
+
+VPInstruction *VPlanSlp::markFailed() {
+  // FIXME: Currently this is used to signal we hit instructions we cannot
+  //        trivially SLP'ize.
+  CompletelySLP = false;
+  return nullptr;
+}
+
+void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
+  if (all_of(Operands, [](VPValue *V) {
+        return cast<VPInstruction>(V)->getUnderlyingInstr();
+      })) {
+    unsigned BundleSize = 0;
+    for (VPValue *V : Operands) {
+      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+      assert(!T->isVectorTy() && "Only scalar types supported for now");
+      BundleSize += T->getScalarSizeInBits();
+    }
+    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+  }
+
+  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
+  assert(Res.second &&
+         "Already created a combined instruction for the operand bundle");
+  (void)Res;
+}
+
+bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
+  // Currently we only support VPInstructions.
+  if (!all_of(Operands, [](VPValue *Op) {
+        return Op && isa<VPInstruction>(Op) &&
+               cast<VPInstruction>(Op)->getUnderlyingInstr();
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+    return false;
+  }
+
+  // Check if opcodes and type width agree for all instructions in the bundle.
+  // FIXME: Differing widths/opcodes can be handled by inserting additional
+  //        instructions.
+  // FIXME: Deal with non-primitive types.
+  const Instruction *OriginalInstr =
+      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
+  unsigned Opcode = OriginalInstr->getOpcode();
+  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
+        return I->getOpcode() == Opcode &&
+               I->getType()->getPrimitiveSizeInBits() == Width;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
+    return false;
+  }
+
+  // For now, all operands must be defined in the same BB.
+  if (any_of(Operands, [this](VPValue *Op) {
+        return cast<VPInstruction>(Op)->getParent() != &this->BB;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
+    return false;
+  }
+
+  if (any_of(Operands,
+             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+    return false;
+  }
+
+  // For loads, check that there are no instructions writing to memory in
+  // between them.
+  // TODO: we only have to forbid instructions writing to memory that could
+  //       interfere with any of the loads in the bundle
+  if (Opcode == Instruction::Load) {
+    unsigned LoadsSeen = 0;
+    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+    for (auto &I : *Parent) {
+      auto *VPI = cast<VPInstruction>(&I);
+      if (VPI->getOpcode() == Instruction::Load &&
+          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
+        LoadsSeen++;
+
+      if (LoadsSeen == Operands.size())
+        break;
+      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
+        LLVM_DEBUG(
+            dbgs() << "VPSLP: instruction modifying memory between loads\n");
+        return false;
+      }
+    }
+
+    if (!all_of(Operands, [](VPValue *Op) {
+          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+              ->isSimple();
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
+      return false;
+    }
+  }
+
+  if (Opcode == Instruction::Store)
+    if (!all_of(Operands, [](VPValue *Op) {
+          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+              ->isSimple();
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
+      return false;
+    }
+
+  return true;
+}
+
+static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
+                                             unsigned OperandIndex) {
+  SmallVector<VPValue *, 4> Operands;
+  for (VPValue *V : Values) {
+    auto *U = cast<VPUser>(V);
+    Operands.push_back(U->getOperand(OperandIndex));
+  }
+  return Operands;
+}
+
+static bool areCommutative(ArrayRef<VPValue *> Values) {
+  return Instruction::isCommutative(
+      cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(ArrayRef<VPValue *> Values) {
+  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+  auto *VPI = cast<VPInstruction>(Values[0]);
+
+  switch (VPI->getOpcode()) {
+  case Instruction::Load:
+    llvm_unreachable("Loads terminate a tree, no need to get operands");
+  case Instruction::Store:
+    Result.push_back(getOperands(Values, 0));
+    break;
+  default:
+    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+      Result.push_back(getOperands(Values, I));
+    break;
+  }
+
+  return Result;
+}
+
+/// Returns the opcode of Values or ~0 if they do not all agree.
+static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
+  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+  if (any_of(Values, [Opcode](VPValue *V) {
+        return cast<VPInstruction>(V)->getOpcode() != Opcode;
+      }))
+    return None;
+  return {Opcode};
+}
+
+/// Returns true if A and B access sequential memory if they are loads or
+/// stores or if they have identical opcodes otherwise.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+                                  VPInterleavedAccessInfo &IAI) {
+  if (A->getOpcode() != B->getOpcode())
+    return false;
+
+  if (A->getOpcode() != Instruction::Load &&
+      A->getOpcode() != Instruction::Store)
+    return true;
+  auto *GA = IAI.getInterleaveGroup(A);
+  auto *GB = IAI.getInterleaveGroup(B);
+
+  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+                           VPInterleavedAccessInfo &IAI) {
+  if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
+    return 0;
+
+  if (MaxLevel == 0)
+    return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
+                                           cast<VPInstruction>(V2), IAI);
+
+  unsigned Score = 0;
+  for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
+    for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
+      Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
+                          cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
+  return Score;
+}
+
+std::pair<VPlanSlp::OpMode, VPValue *>
+VPlanSlp::getBest(OpMode Mode, VPValue *Last,
+                  SmallPtrSetImpl<VPValue *> &Candidates,
+                  VPInterleavedAccessInfo &IAI) {
+  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
+         "Currently we only handle load and commutative opcodes");
+  LLVM_DEBUG(dbgs() << "      getBest\n");
+
+  SmallVector<VPValue *, 4> BestCandidates;
+  LLVM_DEBUG(dbgs() << "        Candidates  for "
+                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+  for (auto *Candidate : Candidates) {
+    auto *LastI = cast<VPInstruction>(Last);
+    auto *CandidateI = cast<VPInstruction>(Candidate);
+    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+                        << " ");
+      BestCandidates.push_back(Candidate);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+
+  if (BestCandidates.empty())
+    return {OpMode::Failed, nullptr};
+
+  if (BestCandidates.size() == 1)
+    return {Mode, BestCandidates[0]};
+
+  VPValue *Best = nullptr;
+  unsigned BestScore = 0;
+  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
+    unsigned PrevScore = ~0u;
+    bool AllSame = true;
+
+    // FIXME: Avoid visiting the same operands multiple times.
+    for (auto *Candidate : BestCandidates) {
+      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+      if (PrevScore == ~0u)
+        PrevScore = Score;
+      if (PrevScore != Score)
+        AllSame = false;
+      PrevScore = Score;
+
+      if (Score > BestScore) {
+        BestScore = Score;
+        Best = Candidate;
+      }
+    }
+    if (!AllSame)
+      break;
+  }
+  LLVM_DEBUG(dbgs() << "Found best "
+                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+                    << "\n");
+  Candidates.erase(Best);
+
+  return {Mode, Best};
+}
+
+SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
+  SmallVector<MultiNodeOpTy, 4> FinalOrder;
+  SmallVector<OpMode, 4> Mode;
+  FinalOrder.reserve(MultiNodeOps.size());
+  Mode.reserve(MultiNodeOps.size());
+
+  LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+  for (auto &Operands : MultiNodeOps) {
+    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+        Instruction::Load)
+      Mode.push_back(OpMode::Load);
+    else
+      Mode.push_back(OpMode::Opcode);
+  }
+
+  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
+    SmallPtrSet<VPValue *, 4> Candidates;
+    LLVM_DEBUG(dbgs() << "  Candidates  ");
+    for (auto Ops : MultiNodeOps) {
+      LLVM_DEBUG(
+          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+                 << " ");
+      Candidates.insert(Ops.second[Lane]);
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+
+    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
+      if (Mode[Op] == OpMode::Failed)
+        continue;
+
+      VPValue *Last = FinalOrder[Op].second[Lane - 1];
+      std::pair<OpMode, VPValue *> Res =
+          getBest(Mode[Op], Last, Candidates, IAI);
+      if (Res.second)
+        FinalOrder[Op].second.push_back(Res.second);
+      else
+        // TODO: handle this case
+        FinalOrder[Op].second.push_back(markFailed());
+    }
+  }
+
+  return FinalOrder;
+}
+
+void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
+  dbgs() << " Ops: ";
+  for (auto Op : Values)
+    if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
+      dbgs() << *Instr << " | ";
+    else
+      dbgs() << " nullptr | ";
+  dbgs() << "\n";
+}
+
+VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
+  assert(!Values.empty() && "Need some operands!");
+
+  // If we already visited this instruction bundle, re-use the existing node
+  auto I = BundleToCombined.find(to_vector<4>(Values));
+  if (I != BundleToCombined.end()) {
+#ifndef NDEBUG
+    // Check that the resulting graph is a tree. If we re-use a node, this means
+    // its values have multiple users. We only allow this, if all users of each
+    // value are the same instruction.
+    for (auto *V : Values) {
+      auto UI = V->user_begin();
+      auto *FirstUser = *UI++;
+      while (UI != V->user_end()) {
+        assert(*UI == FirstUser && "Currently we only support SLP trees.");
+        UI++;
+      }
+    }
+#endif
+    return I->second;
+  }
+
+  // Dump inputs
+  LLVM_DEBUG({
+    dbgs() << "buildGraph: ";
+    dumpBundle(Values);
+  });
+
+  if (!areVectorizable(Values))
+    return markFailed();
+
+  assert(getOpcode(Values) && "Opcodes for all values must match");
+  unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+  SmallVector<VPValue *, 4> CombinedOperands;
+  if (areCommutative(Values)) {
+    bool MultiNodeRoot = !MultiNodeActive;
+    MultiNodeActive = true;
+    for (auto &Operands : getOperands(Values)) {
+      LLVM_DEBUG({
+        dbgs() << "  Visiting Commutative";
+        dumpBundle(Operands);
+      });
+
+      auto OperandsOpcode = getOpcode(Operands);
+      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
+        CombinedOperands.push_back(buildGraph(Operands));
+      } else {
+        LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
+        // Create dummy VPInstruction, which will we replace later by the
+        // re-ordered operand.
+        VPInstruction *Op = new VPInstruction(0, {});
+        CombinedOperands.push_back(Op);
+        MultiNodeOps.emplace_back(Op, Operands);
+      }
+    }
+
+    if (MultiNodeRoot) {
+      LLVM_DEBUG(dbgs() << "Reorder \n");
+      MultiNodeActive = false;
+
+      auto FinalOrder = reorderMultiNodeOps();
+
+      MultiNodeOps.clear();
+      for (auto &Ops : FinalOrder) {
+        VPInstruction *NewOp = buildGraph(Ops.second);
+        Ops.first->replaceAllUsesWith(NewOp);
+        for (unsigned i = 0; i < CombinedOperands.size(); i++)
+          if (CombinedOperands[i] == Ops.first)
+            CombinedOperands[i] = NewOp;
+        delete Ops.first;
+        Ops.first = NewOp;
+      }
+      LLVM_DEBUG(dbgs() << "Found final order\n");
+    }
+  } else {
+    LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
+    if (ValuesOpcode == Instruction::Load)
+      for (VPValue *V : Values)
+        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+    else
+      for (auto &Operands : getOperands(Values))
+        CombinedOperands.push_back(buildGraph(Operands));
+  }
+
+  unsigned Opcode;
+  switch (ValuesOpcode) {
+  case Instruction::Load:
+    Opcode = VPInstruction::SLPLoad;
+    break;
+  case Instruction::Store:
+    Opcode = VPInstruction::SLPStore;
+    break;
+  default:
+    Opcode = ValuesOpcode;
+    break;
+  }
+
+  if (!CompletelySLP)
+    return markFailed();
+
+  assert(CombinedOperands.size() > 0 && "Need more some operands");
+  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
+  LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
+             cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
+  addCombined(Values, VPI);
+  return VPI;
+}
diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h
index a81044a5e1b8d5e66eee1b1673c3eb504d377991..b473579b699fb751b595bc3f0de79a1fd9da5aed 100644
--- a/lib/Transforms/Vectorize/VPlanValue.h
+++ b/lib/Transforms/Vectorize/VPlanValue.h
@@ -40,6 +40,7 @@ class VPValue {
   friend class VPBuilder;
   friend class VPlanHCFGTransforms;
   friend class VPBasicBlock;
+  friend class VPInterleavedAccessInfo;
 
 private:
   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -105,6 +106,20 @@ public:
   const_user_range users() const {
     return const_user_range(user_begin(), user_end());
   }
+
+  /// Returns true if the value has more than one unique user.
+  bool hasMoreThanOneUniqueUser() {
+    if (getNumUsers() == 0)
+      return false;
+
+    // Check if all users match the first user.
+    auto Current = std::next(user_begin());
+    while (Current != user_end() && *user_begin() == *Current)
+      Current++;
+    return Current != user_end();
+  }
+
+  void replaceAllUsesWith(VPValue *New);
 };
 
 typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
@@ -150,6 +165,8 @@ public:
     return Operands[N];
   }
 
+  void setOperand(unsigned I, VPValue *New) { Operands[I] = New; }
+
   typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
   typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
   typedef iterator_range<operand_iterator> operand_range;
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index f62a885583285e25f1a00525e66000033386d09a..559ab1968844e4b7af5c9b85a6f1368add3a5ef5 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -27,7 +27,7 @@ using namespace llvm;
 void llvm::initializeVectorization(PassRegistry &Registry) {
   initializeLoopVectorizePass(Registry);
   initializeSLPVectorizerPass(Registry);
-  initializeLoadStoreVectorizerPass(Registry);
+  initializeLoadStoreVectorizerLegacyPassPass(Registry);
 }
 
 void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
diff --git a/lib/XRay/FDRRecordProducer.cpp b/lib/XRay/FDRRecordProducer.cpp
index 122578010c435acbed71546374a45224f6c7630a..25b3ee8af219353a86901ec489a737e87f228417 100644
--- a/lib/XRay/FDRRecordProducer.cpp
+++ b/lib/XRay/FDRRecordProducer.cpp
@@ -9,29 +9,32 @@
 #include "llvm/XRay/FDRRecordProducer.h"
 #include "llvm/Support/DataExtractor.h"
 
+#include <cstdint>
+
 namespace llvm {
 namespace xray {
 
 namespace {
 
+// Keep this in sync with the values written in the XRay FDR mode runtime in
+// compiler-rt.
+enum MetadataRecordKinds : uint8_t {
+  NewBufferKind,
+  EndOfBufferKind,
+  NewCPUIdKind,
+  TSCWrapKind,
+  WalltimeMarkerKind,
+  CustomEventMarkerKind,
+  CallArgumentKind,
+  BufferExtentsKind,
+  TypedEventMarkerKind,
+  PidKind,
+  // This is an end marker, used to identify the upper bound for this enum.
+  EnumEndMarker,
+};
+
 Expected<std::unique_ptr<Record>>
 metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
-  // Keep this in sync with the values written in the XRay FDR mode runtime in
-  // compiler-rt.
-  enum MetadataRecordKinds : uint8_t {
-    NewBufferKind,
-    EndOfBufferKind,
-    NewCPUIdKind,
-    TSCWrapKind,
-    WalltimeMarkerKind,
-    CustomEventMarkerKind,
-    CallArgumentKind,
-    BufferExtentsKind,
-    TypedEventMarkerKind,
-    PidKind,
-    // This is an end marker, used to identify the upper bound for this enum.
-    EnumEndMarker,
-  };
 
   if (T >= static_cast<uint8_t>(MetadataRecordKinds::EnumEndMarker))
     return createStringError(std::make_error_code(std::errc::invalid_argument),
@@ -70,9 +73,70 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
   llvm_unreachable("Unhandled MetadataRecordKinds enum value");
 }
 
+constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
+  return FirstByte & 0x01u;
+}
+
 } // namespace
 
+Expected<std::unique_ptr<Record>>
+FileBasedRecordProducer::findNextBufferExtent() {
+  // We seek one byte at a time until we find a suitable buffer extents metadata
+  // record introducer.
+  std::unique_ptr<Record> R;
+  while (!R) {
+    auto PreReadOffset = OffsetPtr;
+    uint8_t FirstByte = E.getU8(&OffsetPtr);
+    if (OffsetPtr == PreReadOffset)
+      return createStringError(
+          std::make_error_code(std::errc::executable_format_error),
+          "Failed reading one byte from offset %d.", OffsetPtr);
+
+    if (isMetadataIntroducer(FirstByte)) {
+      auto LoadedType = FirstByte >> 1;
+      if (LoadedType == MetadataRecordKinds::BufferExtentsKind) {
+        auto MetadataRecordOrErr = metadataRecordType(Header, LoadedType);
+        if (!MetadataRecordOrErr)
+          return MetadataRecordOrErr.takeError();
+
+        R = std::move(MetadataRecordOrErr.get());
+        RecordInitializer RI(E, OffsetPtr);
+        if (auto Err = R->apply(RI))
+          return std::move(Err);
+        return std::move(R);
+      }
+    }
+  }
+  llvm_unreachable("Must always terminate with either an error or a record.");
+}
+
 Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
+  // First, we set up our result record.
+  std::unique_ptr<Record> R;
+
+  // Before we do any further reading, we should check whether we're at the end
+  // of the current buffer we're been consuming. In FDR logs version >= 3, we
+  // rely on the buffer extents record to determine how many bytes we should be
+  // considering as valid records.
+  if (Header.Version >= 3 && CurrentBufferBytes == 0) {
+    // Find the next buffer extents record.
+    auto BufferExtentsOrError = findNextBufferExtent();
+    if (!BufferExtentsOrError)
+      return joinErrors(
+          BufferExtentsOrError.takeError(),
+          createStringError(
+              std::make_error_code(std::errc::executable_format_error),
+              "Failed to find the next BufferExtents record."));
+
+    R = std::move(BufferExtentsOrError.get());
+    assert(R != nullptr);
+    assert(isa<BufferExtents>(R.get()));
+    auto BE = dyn_cast<BufferExtents>(R.get());
+    CurrentBufferBytes = BE->size();
+    return std::move(R);
+  }
+
+  //
   // At the top level, we read one byte to determine the type of the record to
   // create. This byte will comprise of the following bits:
   //
@@ -90,11 +154,8 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
         std::make_error_code(std::errc::executable_format_error),
         "Failed reading one byte from offset %d.", OffsetPtr);
 
-  // Set up our result record.
-  std::unique_ptr<Record> R;
-
   // For metadata records, handle especially here.
-  if (FirstByte & 0x01) {
+  if (isMetadataIntroducer(FirstByte)) {
     auto LoadedType = FirstByte >> 1;
     auto MetadataRecordOrErr = metadataRecordType(Header, LoadedType);
     if (!MetadataRecordOrErr)
@@ -113,6 +174,22 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
   if (auto Err = R->apply(RI))
     return std::move(Err);
 
+  // If we encountered a BufferExtents record, we should record the remaining
+  // bytes for the current buffer, to determine when we should start ignoring
+  // potentially malformed data and looking for buffer extents records.
+  if (auto BE = dyn_cast<BufferExtents>(R.get())) {
+    CurrentBufferBytes = BE->size();
+  } else if (Header.Version >= 3) {
+    if (OffsetPtr - PreReadOffset > CurrentBufferBytes)
+      return createStringError(
+          std::make_error_code(std::errc::executable_format_error),
+          "Buffer over-read at offset %d (over-read by %d bytes); Record Type "
+          "= %s.",
+          OffsetPtr, (OffsetPtr - PreReadOffset) - CurrentBufferBytes,
+          Record::kindToString(R->getRecordType()).data());
+
+    CurrentBufferBytes -= OffsetPtr - PreReadOffset;
+  }
   assert(R != nullptr);
   return std::move(R);
 }
diff --git a/lib/XRay/FDRRecords.cpp b/lib/XRay/FDRRecords.cpp
index 2b68a73686fbf95713cb71e2e96922e633c2f3ea..2a40d5e0622937d6a42208452a167b6a4f01629b 100644
--- a/lib/XRay/FDRRecords.cpp
+++ b/lib/XRay/FDRRecords.cpp
@@ -29,5 +29,39 @@ Error FunctionRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 Error CustomEventRecordV5::apply(RecordVisitor &V) { return V.visit(*this); }
 Error TypedEventRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 
+StringRef Record::kindToString(RecordKind K) {
+  switch (K) {
+  case RecordKind::RK_Metadata:
+    return "Metadata";
+  case RecordKind::RK_Metadata_BufferExtents:
+    return "Metadata:BufferExtents";
+  case RecordKind::RK_Metadata_WallClockTime:
+    return "Metadata:WallClockTime";
+  case RecordKind::RK_Metadata_NewCPUId:
+    return "Metadata:NewCPUId";
+  case RecordKind::RK_Metadata_TSCWrap:
+    return "Metadata:TSCWrap";
+  case RecordKind::RK_Metadata_CustomEvent:
+    return "Metadata:CustomEvent";
+  case RecordKind::RK_Metadata_CustomEventV5:
+    return "Metadata:CustomEventV5";
+  case RecordKind::RK_Metadata_CallArg:
+    return "Metadata:CallArg";
+  case RecordKind::RK_Metadata_PIDEntry:
+    return "Metadata:PIDEntry";
+  case RecordKind::RK_Metadata_NewBuffer:
+    return "Metadata:NewBuffer";
+  case RecordKind::RK_Metadata_EndOfBuffer:
+    return "Metadata:EndOfBuffer";
+  case RecordKind::RK_Metadata_TypedEvent:
+    return "Metadata:TypedEvent";
+  case RecordKind::RK_Metadata_LastMetadata:
+    return "Metadata:LastMetadata";
+  case RecordKind::RK_Function:
+    return "Function";
+  }
+  return "Unknown";
+}
+
 } // namespace xray
 } // namespace llvm
diff --git a/lib/XRay/FDRTraceWriter.cpp b/lib/XRay/FDRTraceWriter.cpp
index d5f9697998638396c53aa04d1f2daf55635503f0..c5224f4be094d30c7fa551ea08418d884de51457 100644
--- a/lib/XRay/FDRTraceWriter.cpp
+++ b/lib/XRay/FDRTraceWriter.cpp
@@ -43,7 +43,9 @@ template <size_t Index> struct IndexedWriter {
 
 template <uint8_t Kind, class... Values>
 Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
-  uint8_t FirstByte = (Kind << 1) | uint8_t{0x01};
+  // The first bit in the first byte of metadata records is always set to 1, so
+  // we ensure this is the case when we write out the first byte of the record.
+  uint8_t FirstByte = (static_cast<uint8_t>(Kind) << 1) | uint8_t{0x01u};
   auto T = std::make_tuple(std::forward<Values>(std::move(Ds))...);
   // Write in field order.
   OS.write(FirstByte);
@@ -112,7 +114,7 @@ Error FDRTraceWriter::visit(CustomEventRecordV5 &R) {
 }
 
 Error FDRTraceWriter::visit(TypedEventRecord &R) {
-  if (auto E = writeMetadata<7u>(OS, R.size(), R.delta(), R.eventType()))
+  if (auto E = writeMetadata<8u>(OS, R.size(), R.delta(), R.eventType()))
     return E;
   auto D = R.data();
   ArrayRef<char> Bytes(D.data(), D.size());
diff --git a/lib/XRay/InstrumentationMap.cpp b/lib/XRay/InstrumentationMap.cpp
index 84317708d3f41aa422b3982fef68af52140ceebd..9f2b179486f07dd1b023947c50d7cb751a41c9d7 100644
--- a/lib/XRay/InstrumentationMap.cpp
+++ b/lib/XRay/InstrumentationMap.cpp
@@ -12,12 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/XRay/InstrumentationMap.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Object/Binary.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Error.h"
@@ -46,6 +48,8 @@ Optional<uint64_t> InstrumentationMap::getFunctionAddr(int32_t FuncId) const {
   return None;
 }
 
+using RelocMap = DenseMap<uint64_t, uint64_t>;
+
 static Error
 loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
           InstrumentationMap::SledContainer &Sleds,
@@ -79,6 +83,31 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
     return errorCodeToError(
         std::make_error_code(std::errc::executable_format_error));
 
+  RelocMap Relocs;
+  if (ObjFile.getBinary()->isELF()) {
+    uint32_t RelativeRelocation = [](object::ObjectFile *ObjFile) {
+      if (const auto *ELFObj = dyn_cast<object::ELF32LEObjectFile>(ObjFile))
+        return ELFObj->getELFFile()->getRelativeRelocationType();
+      else if (const auto *ELFObj = dyn_cast<object::ELF32BEObjectFile>(ObjFile))
+        return ELFObj->getELFFile()->getRelativeRelocationType();
+      else if (const auto *ELFObj = dyn_cast<object::ELF64LEObjectFile>(ObjFile))
+        return ELFObj->getELFFile()->getRelativeRelocationType();
+      else if (const auto *ELFObj = dyn_cast<object::ELF64BEObjectFile>(ObjFile))
+        return ELFObj->getELFFile()->getRelativeRelocationType();
+      else
+        return static_cast<uint32_t>(0);
+    }(ObjFile.getBinary());
+
+    for (const object::SectionRef &Section : Sections) {
+      for (const object::RelocationRef &Reloc : Section.relocations()) {
+        if (Reloc.getType() != RelativeRelocation)
+          continue;
+        if (auto AddendOrErr = object::ELFRelocationRef(Reloc).getAddend())
+          Relocs.insert({Reloc.getOffset(), *AddendOrErr});
+      }
+    }
+  }
+
   // Copy the instrumentation map data into the Sleds data structure.
   auto C = Contents.bytes_begin();
   static constexpr size_t ELF64SledEntrySize = 32;
@@ -89,6 +118,16 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
               "an XRay sled entry in ELF64."),
         std::make_error_code(std::errc::executable_format_error));
 
+  auto RelocateOrElse = [&](uint32_t Offset, uint64_t Address) {
+    if (!Address) {
+      uint64_t A = I->getAddress() + C - Contents.bytes_begin() + Offset;
+      RelocMap::const_iterator R = Relocs.find(A);
+      if (R != Relocs.end())
+        return R->second;
+    }
+    return Address;
+  };
+
   int32_t FuncId = 1;
   uint64_t CurFn = 0;
   for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) {
@@ -98,8 +137,10 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
     Sleds.push_back({});
     auto &Entry = Sleds.back();
     uint32_t OffsetPtr = 0;
-    Entry.Address = Extractor.getU64(&OffsetPtr);
-    Entry.Function = Extractor.getU64(&OffsetPtr);
+    uint32_t AddrOff = OffsetPtr;
+    Entry.Address = RelocateOrElse(AddrOff, Extractor.getU64(&OffsetPtr));
+    uint32_t FuncOff = OffsetPtr;
+    Entry.Function = RelocateOrElse(FuncOff, Extractor.getU64(&OffsetPtr));
     auto Kind = Extractor.getU8(&OffsetPtr);
     static constexpr SledEntry::FunctionKinds Kinds[] = {
         SledEntry::FunctionKinds::ENTRY, SledEntry::FunctionKinds::EXIT,
diff --git a/lib/XRay/RecordInitializer.cpp b/lib/XRay/RecordInitializer.cpp
index cc9dd4609498c28e47c8a174d0bae0c249b3226e..f136a1e456b77381c53b765c3ed555aac54d38a9 100644
--- a/lib/XRay/RecordInitializer.cpp
+++ b/lib/XRay/RecordInitializer.cpp
@@ -111,6 +111,12 @@ Error RecordInitializer::visit(CustomEventRecord &R) {
         std::make_error_code(std::errc::invalid_argument),
         "Cannot read a custom event record size field offset %d.", OffsetPtr);
 
+  if (R.Size <= 0)
+    return createStringError(
+        std::make_error_code(std::errc::bad_address),
+        "Invalid size for custom event (size = %d) at offset %d.", R.Size,
+        OffsetPtr);
+
   PreReadOffset = OffsetPtr;
   R.TSC = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
@@ -142,11 +148,21 @@ Error RecordInitializer::visit(CustomEventRecord &R) {
 
   std::vector<uint8_t> Buffer;
   Buffer.resize(R.Size);
+  PreReadOffset = OffsetPtr;
   if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
     return createStringError(
         std::make_error_code(std::errc::invalid_argument),
         "Failed reading data into buffer of size %d at offset %d.", R.Size,
         OffsetPtr);
+
+  assert(OffsetPtr >= PreReadOffset);
+  if (OffsetPtr - PreReadOffset != static_cast<uint32_t>(R.Size))
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Failed reading enough bytes for the custom event payload -- read %d "
+        "expecting %d bytes at offset %d.",
+        OffsetPtr - PreReadOffset, R.Size, PreReadOffset);
+
   R.Data.assign(Buffer.begin(), Buffer.end());
   return Error::success();
 }
@@ -167,6 +183,12 @@ Error RecordInitializer::visit(CustomEventRecordV5 &R) {
         std::make_error_code(std::errc::invalid_argument),
         "Cannot read a custom event record size field offset %d.", OffsetPtr);
 
+  if (R.Size <= 0)
+    return createStringError(
+        std::make_error_code(std::errc::bad_address),
+        "Invalid size for custom event (size = %d) at offset %d.", R.Size,
+        OffsetPtr);
+
   PreReadOffset = OffsetPtr;
   R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t));
   if (PreReadOffset == OffsetPtr)
@@ -188,11 +210,21 @@ Error RecordInitializer::visit(CustomEventRecordV5 &R) {
 
   std::vector<uint8_t> Buffer;
   Buffer.resize(R.Size);
+  PreReadOffset = OffsetPtr;
   if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
     return createStringError(
         std::make_error_code(std::errc::invalid_argument),
         "Failed reading data into buffer of size %d at offset %d.", R.Size,
         OffsetPtr);
+
+  assert(OffsetPtr >= PreReadOffset);
+  if (OffsetPtr - PreReadOffset != static_cast<uint32_t>(R.Size))
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Failed reading enough bytes for the custom event payload -- read %d "
+        "expecting %d bytes at offset %d.",
+        OffsetPtr - PreReadOffset, R.Size, PreReadOffset);
+
   R.Data.assign(Buffer.begin(), Buffer.end());
   return Error::success();
 }
@@ -213,6 +245,12 @@ Error RecordInitializer::visit(TypedEventRecord &R) {
         std::make_error_code(std::errc::invalid_argument),
         "Cannot read a typed event record size field offset %d.", OffsetPtr);
 
+  if (R.Size <= 0)
+    return createStringError(
+        std::make_error_code(std::errc::bad_address),
+        "Invalid size for typed event (size = %d) at offset %d.", R.Size,
+        OffsetPtr);
+
   PreReadOffset = OffsetPtr;
   R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t));
   if (PreReadOffset == OffsetPtr)
@@ -241,11 +279,21 @@ Error RecordInitializer::visit(TypedEventRecord &R) {
 
   std::vector<uint8_t> Buffer;
   Buffer.resize(R.Size);
+  PreReadOffset = OffsetPtr;
   if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
     return createStringError(
         std::make_error_code(std::errc::invalid_argument),
         "Failed reading data into buffer of size %d at offset %d.", R.Size,
         OffsetPtr);
+
+  assert(OffsetPtr >= PreReadOffset);
+  if (OffsetPtr - PreReadOffset != static_cast<uint32_t>(R.Size))
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Failed reading enough bytes for the typed event payload -- read %d "
+        "expecting %d bytes at offset %d.",
+        OffsetPtr - PreReadOffset, R.Size, PreReadOffset);
+
   R.Data.assign(Buffer.begin(), Buffer.end());
   return Error::success();
 }
@@ -337,7 +385,11 @@ Error RecordInitializer::visit(FunctionRecord &R) {
     return createStringError(std::make_error_code(std::errc::bad_address),
                              "Cannot read function id field from offset %d.",
                              OffsetPtr);
-  unsigned FunctionType = (Buffer >> 1) & 0x07;
+
+  // To get the function record type, we shift the buffer one to the right
+  // (truncating the function record indicator) then take the three bits
+  // (0b0111) to get the record type as an unsigned value.
+  unsigned FunctionType = (Buffer >> 1) & 0x07u;
   switch (FunctionType) {
   case static_cast<unsigned>(RecordTypes::ENTER):
   case static_cast<unsigned>(RecordTypes::ENTER_ARG):
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index 32617fd4ba62979a59379bfb18cc9c90022c1266..9afc30c8928e3cbd33fda03e6102cae3ea2e2eb7 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -13,7 +13,8 @@ foreach(entry ${entries})
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/parallel-libs) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/openmp) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/debuginfo-tests))
-      add_subdirectory(${entry})
+      get_filename_component(entry_name "${entry}" NAME)
+      add_llvm_external_project(${entry_name})
     endif()
   endif()
 endforeach(entry)
diff --git a/test/Analysis/ConstantFolding/func-and-folding.ll b/test/Analysis/ConstantFolding/func-and-folding.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2dbbe673601d574b9f01b2768cde9f3d5dbe71c0
--- /dev/null
+++ b/test/Analysis/ConstantFolding/func-and-folding.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -constprop -S -o - | FileCheck %s
+
+; Function Attrs: minsize norecurse nounwind optsize readnone
+define void @foo1() #0 {
+entry:
+  ret void
+}
+
+; Function Attrs: minsize norecurse nounwind optsize readnone
+define void @foo2() align 4 {
+entry:
+  ret void
+}
+
+; Function Attrs: minsize nounwind optsize
+define i32 @main() local_unnamed_addr #1 {
+entry:
+; CHECK: ptrtoint
+  %call = tail call i32 bitcast (i32 (...)* @process to i32 (i32)*)(i32 and (i32 ptrtoint (void ()* @foo1 to i32), i32 2)) #3
+; CHECK-NEXT: ptrtoint
+  %call2 = tail call i32 bitcast (i32 (...)* @process to i32 (i32)*)(i32 and (i32 ptrtoint (void ()* @foo2 to i32), i32 2)) #3
+  ret i32 0
+}
+
+; Function Attrs: minsize optsize
+declare i32 @process(...) local_unnamed_addr #2
+
diff --git a/test/Analysis/ConstantFolding/saturating-add-sub.ll b/test/Analysis/ConstantFolding/saturating-add-sub.ll
new file mode 100644
index 0000000000000000000000000000000000000000..14c6a9fabfade7ad69911da348bfa6ca81221252
--- /dev/null
+++ b/test/Analysis/ConstantFolding/saturating-add-sub.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -constprop -S | FileCheck %s
+
+declare void @dummy(i8)
+declare void @dummy_vec(<2 x i8>)
+
+declare i8 @llvm.uadd.sat.i8(i8, i8)
+declare i8 @llvm.sadd.sat.i8(i8, i8)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i8 @llvm.usub.sat.i8(i8, i8)
+declare i8 @llvm.ssub.sat.i8(i8, i8)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+define void @test_add_scalar() {
+; CHECK-LABEL: @test_add_scalar(
+; CHECK-NEXT:    call void @dummy(i8 30)
+; CHECK-NEXT:    call void @dummy(i8 -1)
+; CHECK-NEXT:    call void @dummy(i8 -10)
+; CHECK-NEXT:    call void @dummy(i8 127)
+; CHECK-NEXT:    call void @dummy(i8 -128)
+; CHECK-NEXT:    ret void
+;
+  %x1 = call i8 @llvm.uadd.sat.i8(i8 10, i8 20)
+  call void @dummy(i8 %x1)
+  %x2 = call i8 @llvm.uadd.sat.i8(i8 250, i8 100)
+  call void @dummy(i8 %x2)
+
+  %y1 = call i8 @llvm.sadd.sat.i8(i8 10, i8 -20)
+  call void @dummy(i8 %y1)
+  %y2 = call i8 @llvm.sadd.sat.i8(i8 120, i8 10)
+  call void @dummy(i8 %y2)
+  %y3 = call i8 @llvm.sadd.sat.i8(i8 -120, i8 -10)
+  call void @dummy(i8 %y3)
+
+  ret void
+}
+
+define void @test_add_vector(<2 x i8> %a) {
+; CHECK-LABEL: @test_add_vector(
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 20, i8 30>)
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 -1, i8 -1>)
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 -10, i8 -30>)
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 127, i8 127>)
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 -128, i8 -128>)
+; CHECK-NEXT:    ret void
+;
+  %x1 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> <i8 10, i8 15>, <2 x i8> <i8 10, i8 15>)
+  call void @dummy_vec(<2 x i8> %x1)
+  %x2 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> <i8 100, i8 200>, <2 x i8> <i8 250, i8 100>)
+  call void @dummy_vec(<2 x i8> %x2)
+
+  %y1 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> <i8 10, i8 -15>, <2 x i8> <i8 -20, i8 -15>)
+  call void @dummy_vec(<2 x i8> %y1)
+  %y2 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> <i8 100, i8 10>, <2 x i8> <i8 30, i8 120>)
+  call void @dummy_vec(<2 x i8> %y2)
+  %y3 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> <i8 -100, i8 -10>, <2 x i8> <i8 -30, i8 -120>)
+  call void @dummy_vec(<2 x i8> %y3)
+
+  ret void
+}
+
+define void @test_usub_ssub_scalar() {
+; CHECK-LABEL: @test_usub_ssub_scalar(
+; CHECK-NEXT:    call void @dummy(i8 10)
+; CHECK-NEXT:    call void @dummy(i8 0)
+; CHECK-NEXT:    call void @dummy(i8 -30)
+; CHECK-NEXT:    call void @dummy(i8 127)
+; CHECK-NEXT:    call void @dummy(i8 -128)
+; CHECK-NEXT:    ret void
+;
+  %x1 = call i8 @llvm.usub.sat.i8(i8 20, i8 10)
+  call void @dummy(i8 %x1)
+  %x2 = call i8 @llvm.usub.sat.i8(i8 200, i8 250)
+  call void @dummy(i8 %x2)
+
+  %y1 = call i8 @llvm.ssub.sat.i8(i8 -10, i8 20)
+  call void @dummy(i8 %y1)
+  %y2 = call i8 @llvm.ssub.sat.i8(i8 120, i8 -10)
+  call void @dummy(i8 %y2)
+  %y3 = call i8 @llvm.ssub.sat.i8(i8 -120, i8 10)
+  call void @dummy(i8 %y3)
+
+  ret void
+}
+
+define void @test_sub_vector(<2 x i8> %a) {
+; CHECK-LABEL: @test_sub_vector(
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 10, i8 5>)
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> zeroinitializer)
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 30, i8 0>)
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 127, i8 127>)
+; CHECK-NEXT:    call void @dummy_vec(<2 x i8> <i8 -128, i8 -128>)
+; CHECK-NEXT:    ret void
+;
+  %x1 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> <i8 20, i8 15>, <2 x i8> <i8 10, i8 10>)
+  call void @dummy_vec(<2 x i8> %x1)
+  %x2 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> <i8 100, i8 200>, <2 x i8> <i8 150, i8 250>)
+  call void @dummy_vec(<2 x i8> %x2)
+
+  %y1 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> <i8 10, i8 -15>, <2 x i8> <i8 -20, i8 -15>)
+  call void @dummy_vec(<2 x i8> %y1)
+  %y2 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> <i8 100, i8 10>, <2 x i8> <i8 -30, i8 -120>)
+  call void @dummy_vec(<2 x i8> %y2)
+  %y3 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> <i8 -100, i8 -10>, <2 x i8> <i8 30, i8 120>)
+  call void @dummy_vec(<2 x i8> %y3)
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/AArch64/vector-reduce.ll b/test/Analysis/CostModel/AArch64/vector-reduce.ll
index c268a18e7f8c6d0c439569c976530cd788c67048..8cc6eea4ec4c13d67ecface8fb06d1fb6630f6c0 100644
--- a/test/Analysis/CostModel/AArch64/vector-reduce.ll
+++ b/test/Analysis/CostModel/AArch64/vector-reduce.ll
@@ -47,7 +47,7 @@ define i32 @add.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: umin.i8.v8i8
-; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: umin.i8.v8i8
 ; CODE:       uminv b0, v0.8b
 define i8 @umin.i8.v8i8(<8 x i8> %v) {
@@ -56,7 +56,7 @@ define i8 @umin.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: umin.i8.v16i8
-; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: umin.i8.v16i8
 ; CODE:       uminv b0, v0.16b
 define i8 @umin.i8.v16i8(<16 x i8> %v) {
@@ -65,7 +65,7 @@ define i8 @umin.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: umin.i16.v4i16
-; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: umin.i16.v4i16
 ; CODE:       uminv h0, v0.4h
 define i16 @umin.i16.v4i16(<4 x i16> %v) {
@@ -74,7 +74,7 @@ define i16 @umin.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: umin.i16.v8i16
-; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: umin.i16.v8i16
 ; CODE:       uminv h0, v0.8h
 define i16 @umin.i16.v8i16(<8 x i16> %v) {
@@ -83,7 +83,7 @@ define i16 @umin.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: umin.i32.v4i32
-; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: umin.i32.v4i32
 ; CODE:       uminv s0, v0.4s
 define i32 @umin.i32.v4i32(<4 x i32> %v) {
@@ -92,7 +92,7 @@ define i32 @umin.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: umax.i8.v8i8
-; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: umax.i8.v8i8
 ; CODE:       umaxv b0, v0.8b
 define i8 @umax.i8.v8i8(<8 x i8> %v) {
@@ -101,7 +101,7 @@ define i8 @umax.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: umax.i8.v16i8
-; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: umax.i8.v16i8
 ; CODE:       umaxv b0, v0.16b
 define i8 @umax.i8.v16i8(<16 x i8> %v) {
@@ -110,7 +110,7 @@ define i8 @umax.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: umax.i16.v4i16
-; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: umax.i16.v4i16
 ; CODE:       umaxv h0, v0.4h
 define i16 @umax.i16.v4i16(<4 x i16> %v) {
@@ -119,7 +119,7 @@ define i16 @umax.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: umax.i16.v8i16
-; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: umax.i16.v8i16
 ; CODE:       umaxv h0, v0.8h
 define i16 @umax.i16.v8i16(<8 x i16> %v) {
@@ -128,7 +128,7 @@ define i16 @umax.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: umax.i32.v4i32
-; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: umax.i32.v4i32
 ; CODE:       umaxv s0, v0.4s
 define i32 @umax.i32.v4i32(<4 x i32> %v) {
@@ -137,7 +137,7 @@ define i32 @umax.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: smin.i8.v8i8
-; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: smin.i8.v8i8
 ; CODE:       sminv b0, v0.8b
 define i8 @smin.i8.v8i8(<8 x i8> %v) {
@@ -146,7 +146,7 @@ define i8 @smin.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: smin.i8.v16i8
-; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: smin.i8.v16i8
 ; CODE:       sminv b0, v0.16b
 define i8 @smin.i8.v16i8(<16 x i8> %v) {
@@ -155,7 +155,7 @@ define i8 @smin.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: smin.i16.v4i16
-; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: smin.i16.v4i16
 ; CODE:       sminv h0, v0.4h
 define i16 @smin.i16.v4i16(<4 x i16> %v) {
@@ -164,7 +164,7 @@ define i16 @smin.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: smin.i16.v8i16
-; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: smin.i16.v8i16
 ; CODE:       sminv h0, v0.8h
 define i16 @smin.i16.v8i16(<8 x i16> %v) {
@@ -173,7 +173,7 @@ define i16 @smin.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: smin.i32.v4i32
-; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: smin.i32.v4i32
 ; CODE:       sminv s0, v0.4s
 define i32 @smin.i32.v4i32(<4 x i32> %v) {
@@ -182,7 +182,7 @@ define i32 @smin.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: smax.i8.v8i8
-; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: smax.i8.v8i8
 ; CODE:       smaxv b0, v0.8b
 define i8 @smax.i8.v8i8(<8 x i8> %v) {
@@ -191,7 +191,7 @@ define i8 @smax.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: smax.i8.v16i8
-; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: smax.i8.v16i8
 ; CODE:       smaxv b0, v0.16b
 define i8 @smax.i8.v16i8(<16 x i8> %v) {
@@ -200,7 +200,7 @@ define i8 @smax.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: smax.i16.v4i16
-; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: smax.i16.v4i16
 ; CODE:       smaxv h0, v0.4h
 define i16 @smax.i16.v4i16(<4 x i16> %v) {
@@ -209,7 +209,7 @@ define i16 @smax.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: smax.i16.v8i16
-; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: smax.i16.v8i16
 ; CODE:       smaxv h0, v0.8h
 define i16 @smax.i16.v8i16(<8 x i16> %v) {
@@ -218,7 +218,7 @@ define i16 @smax.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: smax.i32.v4i32
-; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: smax.i32.v4i32
 ; CODE:       smaxv s0, v0.4s
 define i32 @smax.i32.v4i32(<4 x i32> %v) {
@@ -227,7 +227,7 @@ define i32 @smax.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: fmin.f32.v4f32
-; COST:       Found an estimated cost of 62 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v)
 ; CODE-LABEL: fmin.f32.v4f32
 ; CODE:       fminnmv s0, v0.4s
 define float @fmin.f32.v4f32(<4 x float> %v) {
@@ -236,7 +236,7 @@ define float @fmin.f32.v4f32(<4 x float> %v) {
 }
 
 ; COST-LABEL: fmax.f32.v4f32
-; COST:       Found an estimated cost of 62 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v)
 ; CODE-LABEL: fmax.f32.v4f32
 ; CODE:       fmaxnmv s0, v0.4s
 define float @fmax.f32.v4f32(<4 x float> %v) {
diff --git a/test/Analysis/CostModel/SystemZ/cmp-mem.ll b/test/Analysis/CostModel/SystemZ/cmp-mem.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4f92d5b191a92b971d22f1d096a2570e6604ee9a
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/cmp-mem.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Test costs for i8 and i16 comparisons against memory with a small immediate.
+
+define i32 @fun0(i8* %Src, i8* %Dst, i8 %Val) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun0':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %Ld = load i8, i8* %Src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %Cmp = icmp eq i8 %Ld, 123
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %Ret = zext i1 %Cmp to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   ret i32 %Ret
+  %Ld = load i8, i8* %Src
+  %Cmp = icmp eq i8 %Ld, 123
+  %Ret = zext i1 %Cmp to i32
+  ret i32 %Ret
+}
+
+define i32 @fun1(i16* %Src, i16* %Dst, i16 %Val) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun1':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %Ld = load i16, i16* %Src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %Cmp = icmp eq i16
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %Ret = zext i1 %Cmp to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   ret i32 %Ret
+  %Ld = load i16, i16* %Src
+  %Cmp = icmp eq i16 %Ld, 1234
+  %Ret = zext i1 %Cmp to i32
+  ret i32 %Ret
+}
diff --git a/test/Analysis/CostModel/SystemZ/divrem-reg.ll b/test/Analysis/CostModel/SystemZ/divrem-reg.ll
index 0cb1293cf3bf75d1c28274f9cd4ace36ed15268c..669d3e9064dd5b8fc882053e3661bef86306fee6 100644
--- a/test/Analysis/CostModel/SystemZ/divrem-reg.ll
+++ b/test/Analysis/CostModel/SystemZ/divrem-reg.ll
@@ -16,19 +16,19 @@ define i64 @fun0(i64 %a, i64 %b) {
 define i32 @fun1(i32 %a, i32 %b) {
   %r = sdiv i32 %a, %b
   ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = sdiv i32 %a, %b
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = sdiv i32 %a, %b
 }
 
 define i16 @fun2(i16 %a, i16 %b) {
   %r = sdiv i16 %a, %b
   ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = sdiv i16 %a, %b
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = sdiv i16 %a, %b
 }
 
 define i8 @fun3(i8 %a, i8 %b) {
   %r = sdiv i8 %a, %b
   ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = sdiv i8 %a, %b
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = sdiv i8 %a, %b
 }
 
 ; Vector sdiv
@@ -42,13 +42,13 @@ define <2 x i64> @fun4(<2 x i64> %a, <2 x i64> %b) {
 define <4 x i32> @fun5(<4 x i32> %a, <4 x i32> %b) {
   %r = sdiv <4 x i32> %a, %b
   ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = sdiv <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 94 for instruction:   %r = sdiv <4 x i32>
 }
 
 define <2 x i32> @fun6(<2 x i32> %a, <2 x i32> %b) {
   %r = sdiv <2 x i32> %a, %b
   ret <2 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = sdiv <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %r = sdiv <2 x i32>
 }
 
 define <8 x i16> @fun7(<8 x i16> %a, <8 x i16> %b) {
@@ -60,7 +60,7 @@ define <8 x i16> @fun7(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i16> @fun8(<4 x i16> %a, <4 x i16> %b) {
   %r = sdiv <4 x i16> %a, %b
   ret <4 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = sdiv <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 94 for instruction:   %r = sdiv <4 x i16>
 }
 
 define <16 x i8> @fun9(<16 x i8> %a, <16 x i8> %b) {
@@ -80,25 +80,25 @@ define <8 x i8> @fun10(<8 x i8> %a, <8 x i8> %b) {
 define i64 @fun11(i64 %a, i64 %b) {
   %r = udiv i64 %a, %b
   ret i64 %r
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = udiv i64 %a, %b
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = udiv i64 %a, %b
 }
 
 define i32 @fun12(i32 %a, i32 %b) {
   %r = udiv i32 %a, %b
   ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = udiv i32
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = udiv i32
 }
 
 define i16 @fun13(i16 %a, i16 %b) {
   %r = udiv i16 %a, %b
   ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = udiv i16
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = udiv i16
 }
 
 define i8 @fun14(i8 %a, i8 %b) {
   %r = udiv i8 %a, %b
   ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = udiv i8
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = udiv i8
 }
 
 ; Vector udiv
@@ -106,19 +106,19 @@ define i8 @fun14(i8 %a, i8 %b) {
 define <2 x i64> @fun15(<2 x i64> %a, <2 x i64> %b) {
   %r = udiv <2 x i64> %a, %b
   ret <2 x i64> %r
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %r = udiv <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 47 for instruction:   %r = udiv <2 x i64>
 }
 
 define <4 x i32> @fun16(<4 x i32> %a, <4 x i32> %b) {
   %r = udiv <4 x i32> %a, %b
   ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = udiv <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 94 for instruction:   %r = udiv <4 x i32>
 }
 
 define <2 x i32> @fun17(<2 x i32> %a, <2 x i32> %b) {
   %r = udiv <2 x i32> %a, %b
   ret <2 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = udiv <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %r = udiv <2 x i32>
 }
 
 define <8 x i16> @fun18(<8 x i16> %a, <8 x i16> %b) {
@@ -130,7 +130,7 @@ define <8 x i16> @fun18(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i16> @fun19(<4 x i16> %a, <4 x i16> %b) {
   %r = udiv <4 x i16> %a, %b
   ret <4 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = udiv <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 94 for instruction:   %r = udiv <4 x i16>
 }
 
 define <16 x i8> @fun20(<16 x i8> %a, <16 x i8> %b) {
@@ -156,19 +156,19 @@ define i64 @fun22(i64 %a, i64 %b) {
 define i32 @fun23(i32 %a, i32 %b) {
   %r = srem i32 %a, %b
   ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = srem i32
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = srem i32
 }
 
 define i16 @fun24(i16 %a, i16 %b) {
   %r = srem i16 %a, %b
   ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = srem i16
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = srem i16
 }
 
 define i8 @fun25(i8 %a, i8 %b) {
   %r = srem i8 %a, %b
   ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = srem i8
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = srem i8
 }
 
 ; Vector srem
@@ -182,13 +182,13 @@ define <2 x i64> @fun26(<2 x i64> %a, <2 x i64> %b) {
 define <4 x i32> @fun27(<4 x i32> %a, <4 x i32> %b) {
   %r = srem <4 x i32> %a, %b
   ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = srem <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 94 for instruction:   %r = srem <4 x i32>
 }
 
 define <2 x i32> @fun28(<2 x i32> %a, <2 x i32> %b) {
   %r = srem <2 x i32> %a, %b
   ret <2 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = srem <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %r = srem <2 x i32>
 }
 
 define <8 x i16> @fun29(<8 x i16> %a, <8 x i16> %b) {
@@ -200,7 +200,7 @@ define <8 x i16> @fun29(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i16> @fun30(<4 x i16> %a, <4 x i16> %b) {
   %r = srem <4 x i16> %a, %b
   ret <4 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = srem <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 94 for instruction:   %r = srem <4 x i16>
 }
 
 define <16 x i8> @fun31(<16 x i8> %a, <16 x i8> %b) {
@@ -220,25 +220,25 @@ define <8 x i8> @fun32(<8 x i8> %a, <8 x i8> %b) {
 define i64 @fun33(i64 %a, i64 %b) {
   %r = urem i64 %a, %b
   ret i64 %r
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = urem i64
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = urem i64
 }
 
 define i32 @fun34(i32 %a, i32 %b) {
   %r = urem i32 %a, %b
   ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = urem i32
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = urem i32
 }
 
 define i16 @fun35(i16 %a, i16 %b) {
   %r = urem i16 %a, %b
   ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = urem i16
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = urem i16
 }
 
 define i8 @fun36(i8 %a, i8 %b) {
   %r = urem i8 %a, %b
   ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = urem i8
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = urem i8
 }
 
 ; Vector urem
@@ -246,19 +246,19 @@ define i8 @fun36(i8 %a, i8 %b) {
 define <2 x i64> @fun37(<2 x i64> %a, <2 x i64> %b) {
   %r = urem <2 x i64> %a, %b
   ret <2 x i64> %r
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %r = urem <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 47 for instruction:   %r = urem <2 x i64>
 }
 
 define <4 x i32> @fun38(<4 x i32> %a, <4 x i32> %b) {
   %r = urem <4 x i32> %a, %b
   ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = urem <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 94 for instruction:   %r = urem <4 x i32>
 }
 
 define <2 x i32> @fun39(<2 x i32> %a, <2 x i32> %b) {
   %r = urem <2 x i32> %a, %b
   ret <2 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = urem <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %r = urem <2 x i32>
 }
 
 define <8 x i16> @fun40(<8 x i16> %a, <8 x i16> %b) {
@@ -270,7 +270,7 @@ define <8 x i16> @fun40(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i16> @fun41(<4 x i16> %a, <4 x i16> %b) {
   %r = urem <4 x i16> %a, %b
   ret <4 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = urem <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 94 for instruction:   %r = urem <4 x i16>
 }
 
 define <16 x i8> @fun42(<16 x i8> %a, <16 x i8> %b) {
diff --git a/test/Analysis/CostModel/SystemZ/fp-cast.ll b/test/Analysis/CostModel/SystemZ/fp-cast.ll
index 20feefb8025946412f37a1648adf8d55f76dd92f..cbad825e486469d75048aa757132fca9bec9c05a 100644
--- a/test/Analysis/CostModel/SystemZ/fp-cast.ll
+++ b/test/Analysis/CostModel/SystemZ/fp-cast.ll
@@ -110,45 +110,45 @@ define void @fptosi() {
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v14 = fptosi <2 x fp128> undef to <2 x i16>
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = fptosi <2 x fp128> undef to <2 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v16 = fptosi <2 x double> undef to <2 x i64>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v17 = fptosi <2 x double> undef to <2 x i32>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v18 = fptosi <2 x double> undef to <2 x i16>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v19 = fptosi <2 x double> undef to <2 x i8>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v20 = fptosi <2 x float> undef to <2 x i64>
-; CHECK: Cost Model: Found an estimated cost of 14 for instruction:   %v21 = fptosi <2 x float> undef to <2 x i32>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v22 = fptosi <2 x float> undef to <2 x i16>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v23 = fptosi <2 x float> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v17 = fptosi <2 x double> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = fptosi <2 x double> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v19 = fptosi <2 x double> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v20 = fptosi <2 x float> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v21 = fptosi <2 x float> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v22 = fptosi <2 x float> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v23 = fptosi <2 x float> undef to <2 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v24 = fptosi <4 x fp128> undef to <4 x i64>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v25 = fptosi <4 x fp128> undef to <4 x i32>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v26 = fptosi <4 x fp128> undef to <4 x i16>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = fptosi <4 x fp128> undef to <4 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v28 = fptosi <4 x double> undef to <4 x i64>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v29 = fptosi <4 x double> undef to <4 x i32>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v30 = fptosi <4 x double> undef to <4 x i16>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v31 = fptosi <4 x double> undef to <4 x i8>
-; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v32 = fptosi <4 x float> undef to <4 x i64>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v33 = fptosi <4 x float> undef to <4 x i32>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v34 = fptosi <4 x float> undef to <4 x i16>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v35 = fptosi <4 x float> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = fptosi <4 x double> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = fptosi <4 x double> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v31 = fptosi <4 x double> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %v32 = fptosi <4 x float> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = fptosi <4 x float> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v34 = fptosi <4 x float> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v35 = fptosi <4 x float> undef to <4 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v36 = fptosi <8 x fp128> undef to <8 x i64>
 ; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v37 = fptosi <8 x fp128> undef to <8 x i32>
 ; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v38 = fptosi <8 x fp128> undef to <8 x i16>
 ; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = fptosi <8 x fp128> undef to <8 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v40 = fptosi <8 x double> undef to <8 x i64>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v41 = fptosi <8 x double> undef to <8 x i32>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v42 = fptosi <8 x double> undef to <8 x i16>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v43 = fptosi <8 x double> undef to <8 x i8>
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %v44 = fptosi <8 x float> undef to <8 x i64>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v45 = fptosi <8 x float> undef to <8 x i32>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v46 = fptosi <8 x float> undef to <8 x i16>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v47 = fptosi <8 x float> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = fptosi <8 x double> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = fptosi <8 x double> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v43 = fptosi <8 x double> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %v44 = fptosi <8 x float> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = fptosi <8 x float> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v46 = fptosi <8 x float> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v47 = fptosi <8 x float> undef to <8 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = fptosi <16 x double> undef to <16 x i64>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v49 = fptosi <16 x double> undef to <16 x i32>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v50 = fptosi <16 x double> undef to <16 x i16>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v51 = fptosi <16 x double> undef to <16 x i8>
-; CHECK: Cost Model: Found an estimated cost of 41 for instruction:   %v52 = fptosi <16 x float> undef to <16 x i64>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v53 = fptosi <16 x float> undef to <16 x i32>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v54 = fptosi <16 x float> undef to <16 x i16>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v55 = fptosi <16 x float> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = fptosi <16 x double> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = fptosi <16 x double> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = fptosi <16 x double> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %v52 = fptosi <16 x float> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v53 = fptosi <16 x float> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v54 = fptosi <16 x float> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v55 = fptosi <16 x float> undef to <16 x i8>
 
   ret void;
 }
@@ -229,45 +229,45 @@ define void @fptoui() {
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v14 = fptoui <2 x fp128> undef to <2 x i16>
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = fptoui <2 x fp128> undef to <2 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v16 = fptoui <2 x double> undef to <2 x i64>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v17 = fptoui <2 x double> undef to <2 x i32>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v18 = fptoui <2 x double> undef to <2 x i16>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v19 = fptoui <2 x double> undef to <2 x i8>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v20 = fptoui <2 x float> undef to <2 x i64>
-; CHECK: Cost Model: Found an estimated cost of 14 for instruction:   %v21 = fptoui <2 x float> undef to <2 x i32>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v22 = fptoui <2 x float> undef to <2 x i16>
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v23 = fptoui <2 x float> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v17 = fptoui <2 x double> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = fptoui <2 x double> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v19 = fptoui <2 x double> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v20 = fptoui <2 x float> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v21 = fptoui <2 x float> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v22 = fptoui <2 x float> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v23 = fptoui <2 x float> undef to <2 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v24 = fptoui <4 x fp128> undef to <4 x i64>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v25 = fptoui <4 x fp128> undef to <4 x i32>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v26 = fptoui <4 x fp128> undef to <4 x i16>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = fptoui <4 x fp128> undef to <4 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v28 = fptoui <4 x double> undef to <4 x i64>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v29 = fptoui <4 x double> undef to <4 x i32>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v30 = fptoui <4 x double> undef to <4 x i16>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v31 = fptoui <4 x double> undef to <4 x i8>
-; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v32 = fptoui <4 x float> undef to <4 x i64>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v33 = fptoui <4 x float> undef to <4 x i32>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v34 = fptoui <4 x float> undef to <4 x i16>
-; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v35 = fptoui <4 x float> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = fptoui <4 x double> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = fptoui <4 x double> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v31 = fptoui <4 x double> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %v32 = fptoui <4 x float> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = fptoui <4 x float> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v34 = fptoui <4 x float> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v35 = fptoui <4 x float> undef to <4 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v36 = fptoui <8 x fp128> undef to <8 x i64>
 ; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v37 = fptoui <8 x fp128> undef to <8 x i32>
 ; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v38 = fptoui <8 x fp128> undef to <8 x i16>
 ; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = fptoui <8 x fp128> undef to <8 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v40 = fptoui <8 x double> undef to <8 x i64>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v41 = fptoui <8 x double> undef to <8 x i32>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v42 = fptoui <8 x double> undef to <8 x i16>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v43 = fptoui <8 x double> undef to <8 x i8>
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %v44 = fptoui <8 x float> undef to <8 x i64>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v45 = fptoui <8 x float> undef to <8 x i32>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v46 = fptoui <8 x float> undef to <8 x i16>
-; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v47 = fptoui <8 x float> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = fptoui <8 x double> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = fptoui <8 x double> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v43 = fptoui <8 x double> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %v44 = fptoui <8 x float> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = fptoui <8 x float> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v46 = fptoui <8 x float> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v47 = fptoui <8 x float> undef to <8 x i8>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = fptoui <16 x double> undef to <16 x i64>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v49 = fptoui <16 x double> undef to <16 x i32>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v50 = fptoui <16 x double> undef to <16 x i16>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v51 = fptoui <16 x double> undef to <16 x i8>
-; CHECK: Cost Model: Found an estimated cost of 41 for instruction:   %v52 = fptoui <16 x float> undef to <16 x i64>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v53 = fptoui <16 x float> undef to <16 x i32>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v54 = fptoui <16 x float> undef to <16 x i16>
-; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v55 = fptoui <16 x float> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = fptoui <16 x double> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = fptoui <16 x double> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = fptoui <16 x double> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %v52 = fptoui <16 x float> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v53 = fptoui <16 x float> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v54 = fptoui <16 x float> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v55 = fptoui <16 x float> undef to <16 x i8>
 
   ret void;
 }
@@ -374,50 +374,50 @@ define void @sitofp() {
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v9 = sitofp i8 undef to fp128
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = sitofp i8 undef to double
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v11 = sitofp i8 undef to float
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v12 = sitofp <2 x i64> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v12 = sitofp <2 x i64> undef to <2 x fp128>
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v13 = sitofp <2 x i64> undef to <2 x double>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v14 = sitofp <2 x i64> undef to <2 x float>
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = sitofp <2 x i32> undef to <2 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v16 = sitofp <2 x i32> undef to <2 x double>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v17 = sitofp <2 x i32> undef to <2 x float>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = sitofp <2 x i16> undef to <2 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v19 = sitofp <2 x i16> undef to <2 x double>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v20 = sitofp <2 x i16> undef to <2 x float>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v21 = sitofp <2 x i8> undef to <2 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v22 = sitofp <2 x i8> undef to <2 x double>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v23 = sitofp <2 x i8> undef to <2 x float>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v24 = sitofp <4 x i64> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v14 = sitofp <2 x i64> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v15 = sitofp <2 x i32> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v16 = sitofp <2 x i32> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 14 for instruction:   %v17 = sitofp <2 x i32> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v18 = sitofp <2 x i16> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v19 = sitofp <2 x i16> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v20 = sitofp <2 x i16> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v21 = sitofp <2 x i8> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v22 = sitofp <2 x i8> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v23 = sitofp <2 x i8> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v24 = sitofp <4 x i64> undef to <4 x fp128>
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v25 = sitofp <4 x i64> undef to <4 x double>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v26 = sitofp <4 x i64> undef to <4 x float>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = sitofp <4 x i32> undef to <4 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v28 = sitofp <4 x i32> undef to <4 x double>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = sitofp <4 x i32> undef to <4 x float>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = sitofp <4 x i16> undef to <4 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v31 = sitofp <4 x i16> undef to <4 x double>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v32 = sitofp <4 x i16> undef to <4 x float>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = sitofp <4 x i8> undef to <4 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v34 = sitofp <4 x i8> undef to <4 x double>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v35 = sitofp <4 x i8> undef to <4 x float>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v36 = sitofp <8 x i64> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v26 = sitofp <4 x i64> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v27 = sitofp <4 x i32> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v28 = sitofp <4 x i32> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v29 = sitofp <4 x i32> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v30 = sitofp <4 x i16> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v31 = sitofp <4 x i16> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v32 = sitofp <4 x i16> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v33 = sitofp <4 x i8> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v34 = sitofp <4 x i8> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v35 = sitofp <4 x i8> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v36 = sitofp <8 x i64> undef to <8 x fp128>
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v37 = sitofp <8 x i64> undef to <8 x double>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v38 = sitofp <8 x i64> undef to <8 x float>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = sitofp <8 x i32> undef to <8 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v40 = sitofp <8 x i32> undef to <8 x double>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = sitofp <8 x i32> undef to <8 x float>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = sitofp <8 x i16> undef to <8 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v43 = sitofp <8 x i16> undef to <8 x double>
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v44 = sitofp <8 x i16> undef to <8 x float>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = sitofp <8 x i8> undef to <8 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v46 = sitofp <8 x i8> undef to <8 x double>
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v47 = sitofp <8 x i8> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v38 = sitofp <8 x i64> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v39 = sitofp <8 x i32> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v40 = sitofp <8 x i32> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v41 = sitofp <8 x i32> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v42 = sitofp <8 x i16> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 33 for instruction:   %v43 = sitofp <8 x i16> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 33 for instruction:   %v44 = sitofp <8 x i16> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v45 = sitofp <8 x i8> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 33 for instruction:   %v46 = sitofp <8 x i8> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 33 for instruction:   %v47 = sitofp <8 x i8> undef to <8 x float>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = sitofp <16 x i64> undef to <16 x double>
-; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = sitofp <16 x i64> undef to <16 x float>
-; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = sitofp <16 x i32> undef to <16 x double>
-; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = sitofp <16 x i32> undef to <16 x float>
-; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v52 = sitofp <16 x i16> undef to <16 x double>
-; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v53 = sitofp <16 x i16> undef to <16 x float>
-; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v54 = sitofp <16 x i8> undef to <16 x double>
-; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v55 = sitofp <16 x i8> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v49 = sitofp <16 x i64> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v50 = sitofp <16 x i32> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v51 = sitofp <16 x i32> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 65 for instruction:   %v52 = sitofp <16 x i16> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 65 for instruction:   %v53 = sitofp <16 x i16> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 65 for instruction:   %v54 = sitofp <16 x i8> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 65 for instruction:   %v55 = sitofp <16 x i8> undef to <16 x float>
 
   ret void;
 }
@@ -492,50 +492,50 @@ define void @uitofp() {
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v9 = uitofp i8 undef to fp128
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = uitofp i8 undef to double
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v11 = uitofp i8 undef to float
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v12 = uitofp <2 x i64> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v12 = uitofp <2 x i64> undef to <2 x fp128>
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v13 = uitofp <2 x i64> undef to <2 x double>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v14 = uitofp <2 x i64> undef to <2 x float>
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = uitofp <2 x i32> undef to <2 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v16 = uitofp <2 x i32> undef to <2 x double>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v17 = uitofp <2 x i32> undef to <2 x float>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = uitofp <2 x i16> undef to <2 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v19 = uitofp <2 x i16> undef to <2 x double>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v20 = uitofp <2 x i16> undef to <2 x float>
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v21 = uitofp <2 x i8> undef to <2 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v22 = uitofp <2 x i8> undef to <2 x double>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v23 = uitofp <2 x i8> undef to <2 x float>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v24 = uitofp <4 x i64> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v14 = uitofp <2 x i64> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v15 = uitofp <2 x i32> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v16 = uitofp <2 x i32> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 14 for instruction:   %v17 = uitofp <2 x i32> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v18 = uitofp <2 x i16> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v19 = uitofp <2 x i16> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v20 = uitofp <2 x i16> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v21 = uitofp <2 x i8> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v22 = uitofp <2 x i8> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v23 = uitofp <2 x i8> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v24 = uitofp <4 x i64> undef to <4 x fp128>
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v25 = uitofp <4 x i64> undef to <4 x double>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v26 = uitofp <4 x i64> undef to <4 x float>
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = uitofp <4 x i32> undef to <4 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v28 = uitofp <4 x i32> undef to <4 x double>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = uitofp <4 x i32> undef to <4 x float>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = uitofp <4 x i16> undef to <4 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v31 = uitofp <4 x i16> undef to <4 x double>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v32 = uitofp <4 x i16> undef to <4 x float>
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = uitofp <4 x i8> undef to <4 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v34 = uitofp <4 x i8> undef to <4 x double>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v35 = uitofp <4 x i8> undef to <4 x float>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v36 = uitofp <8 x i64> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v26 = uitofp <4 x i64> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 9 for instruction:   %v27 = uitofp <4 x i32> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v28 = uitofp <4 x i32> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v29 = uitofp <4 x i32> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v30 = uitofp <4 x i16> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v31 = uitofp <4 x i16> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v32 = uitofp <4 x i16> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v33 = uitofp <4 x i8> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v34 = uitofp <4 x i8> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v35 = uitofp <4 x i8> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v36 = uitofp <8 x i64> undef to <8 x fp128>
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v37 = uitofp <8 x i64> undef to <8 x double>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v38 = uitofp <8 x i64> undef to <8 x float>
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = uitofp <8 x i32> undef to <8 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v40 = uitofp <8 x i32> undef to <8 x double>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = uitofp <8 x i32> undef to <8 x float>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = uitofp <8 x i16> undef to <8 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v43 = uitofp <8 x i16> undef to <8 x double>
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v44 = uitofp <8 x i16> undef to <8 x float>
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = uitofp <8 x i8> undef to <8 x fp128>
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v46 = uitofp <8 x i8> undef to <8 x double>
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v47 = uitofp <8 x i8> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v38 = uitofp <8 x i64> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 17 for instruction:   %v39 = uitofp <8 x i32> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v40 = uitofp <8 x i32> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v41 = uitofp <8 x i32> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v42 = uitofp <8 x i16> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 33 for instruction:   %v43 = uitofp <8 x i16> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 33 for instruction:   %v44 = uitofp <8 x i16> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v45 = uitofp <8 x i8> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 33 for instruction:   %v46 = uitofp <8 x i8> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 33 for instruction:   %v47 = uitofp <8 x i8> undef to <8 x float>
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = uitofp <16 x i64> undef to <16 x double>
-; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = uitofp <16 x i64> undef to <16 x float>
-; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = uitofp <16 x i32> undef to <16 x double>
-; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = uitofp <16 x i32> undef to <16 x float>
-; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v52 = uitofp <16 x i16> undef to <16 x double>
-; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v53 = uitofp <16 x i16> undef to <16 x float>
-; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v54 = uitofp <16 x i8> undef to <16 x double>
-; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v55 = uitofp <16 x i8> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v49 = uitofp <16 x i64> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v50 = uitofp <16 x i32> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v51 = uitofp <16 x i32> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 65 for instruction:   %v52 = uitofp <16 x i16> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 65 for instruction:   %v53 = uitofp <16 x i16> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 65 for instruction:   %v54 = uitofp <16 x i8> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 65 for instruction:   %v55 = uitofp <16 x i8> undef to <16 x float>
 
   ret void;
 }
diff --git a/test/Analysis/CostModel/SystemZ/int-operands-extcost.ll b/test/Analysis/CostModel/SystemZ/int-operands-extcost.ll
new file mode 100644
index 0000000000000000000000000000000000000000..138f48f42c78f79ceb8c6b92c2311885955034cf
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/int-operands-extcost.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck %s
+;
+; Test that i8/i16 operands get extra costs for extensions to i32 only in
+; cases where this is needed.
+
+define void @icmp() {
+  %li8_0 = load i8, i8* undef
+  %li8_1 = load i8, i8* undef
+  icmp slt i8 %li8_0, %li8_1
+
+  %a0 = add i8 %li8_0, 1
+  %a1 = add i8 %li8_1, 1
+  icmp slt i8 %a0, %a1
+
+  icmp slt i8 %a0, 123
+
+  %li16_0 = load i16, i16* undef
+  %li16_1 = load i16, i16* undef
+  icmp slt i16 %li16_0, %li16_1
+
+  %a2 = add i16 %li16_0, 1
+  %a3 = add i16 %li16_1, 1
+  icmp slt i16 %a2, %a3
+
+  icmp slt i16 %a2, 123
+
+  ret void;
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'icmp':
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li8_0 = load i8, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li8_1 = load i8, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = icmp slt i8 %li8_0, %li8_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %a0 = add i8 %li8_0, 1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %a1 = add i8 %li8_1, 1
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %2 = icmp slt i8 %a0, %a1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %3 = icmp slt i8 %a0, 123
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = icmp slt i16 %li16_0, %li16_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %a2 = add i16 %li16_0, 1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %a3 = add i16 %li16_1, 1
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %5 = icmp slt i16 %a2, %a3
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %6 = icmp slt i16 %a2, 123
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   ret void
+}
diff --git a/test/Analysis/CostModel/SystemZ/intrinsics.ll b/test/Analysis/CostModel/SystemZ/intrinsics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f83cf5a7c3d4d39d411b77c6fd9113fb6f893aba
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/intrinsics.ll
@@ -0,0 +1,124 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+define void @bswap_i64(i64 %arg, <2 x i64> %arg2) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i64':
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp1 = tail call i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp2 = tail call <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %swp4 = tail call <4 x i64>
+  %swp1 = tail call i64 @llvm.bswap.i64(i64 %arg)
+  %swp2 = tail call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %arg2)
+  %swp4 = tail call <4 x i64> @llvm.bswap.v4i64(<4 x i64> undef)
+  ret void
+}
+
+define void @bswap_i32(i32 %arg, <2 x i32> %arg2, <4 x i32> %arg4) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i32':
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp1 = tail call i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp2 = tail call <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp4 = tail call <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %swp8 = tail call <8 x i32>
+  %swp1 = tail call i32 @llvm.bswap.i32(i32 %arg)
+  %swp2 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %arg2)
+  %swp4 = tail call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %arg4)
+  %swp8 = tail call <8 x i32> @llvm.bswap.v8i32(<8 x i32> undef)
+  ret void
+}
+
+define void @bswap_i16(i16 %arg, <2 x i16> %arg2, <4 x i16> %arg4,
+                       <8 x i16> %arg8) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i16':
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp1 = tail call i16 @llvm.bswap.i16(i16 %arg)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp2 = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %arg2)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp4 = tail call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %arg4)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp8 = tail call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %arg8)
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %swp16 = tail call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef)
+  %swp1 = tail call i16 @llvm.bswap.i16(i16 %arg)
+  %swp2 = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %arg2)
+  %swp4 = tail call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %arg4)
+  %swp8 = tail call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %arg8)
+  %swp16 = tail call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef)
+  ret void
+}
+
+; Test that store/load reversed is reflected in costs.
+define void @bswap_i64_mem(i64* %src, i64 %arg, i64* %dst) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i64_mem':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %Ld1 = load i64, i64* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp1 = tail call i64 @llvm.bswap.i64(i64 %Ld1)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp2 = tail call i64 @llvm.bswap.i64(i64 %arg)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   store i64 %swp2, i64* %dst
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %Ld2 = load i64, i64* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp3 = tail call i64 @llvm.bswap.i64(i64 %Ld2)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   store i64 %swp3, i64* %dst
+  %Ld1  = load i64, i64* %src
+  %swp1 = tail call i64 @llvm.bswap.i64(i64 %Ld1)
+
+  %swp2 = tail call i64 @llvm.bswap.i64(i64 %arg)
+  store i64 %swp2, i64* %dst
+
+  %Ld2  = load i64, i64* %src
+  %swp3 = tail call i64 @llvm.bswap.i64(i64 %Ld2)
+  store i64 %swp3, i64* %dst
+
+  ret void
+}
+
+define void @bswap_i32_mem(i32* %src, i32 %arg, i32* %dst) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i32_mem':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %Ld1 = load i32, i32* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp1 = tail call i32 @llvm.bswap.i32(i32 %Ld1)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp2 = tail call i32 @llvm.bswap.i32(i32 %arg)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   store i32 %swp2, i32* %dst
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %Ld2 = load i32, i32* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp3 = tail call i32 @llvm.bswap.i32(i32 %Ld2)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   store i32 %swp3, i32* %dst
+  %Ld1  = load i32, i32* %src
+  %swp1 = tail call i32 @llvm.bswap.i32(i32 %Ld1)
+
+  %swp2 = tail call i32 @llvm.bswap.i32(i32 %arg)
+  store i32 %swp2, i32* %dst
+
+  %Ld2  = load i32, i32* %src
+  %swp3 = tail call i32 @llvm.bswap.i32(i32 %Ld2)
+  store i32 %swp3, i32* %dst
+
+  ret void
+}
+
+define void @bswap_i16_mem(i16* %src, i16 %arg, i16* %dst) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i16_mem':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %Ld1 = load i16, i16* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp1 = tail call i16 @llvm.bswap.i16(i16 %Ld1)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp2 = tail call i16 @llvm.bswap.i16(i16 %arg)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   store i16 %swp2, i16* %dst
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %Ld2 = load i16, i16* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp3 = tail call i16 @llvm.bswap.i16(i16 %Ld2)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   store i16 %swp3, i16* %dst
+  %Ld1  = load i16, i16* %src
+  %swp1 = tail call i16 @llvm.bswap.i16(i16 %Ld1)
+
+  %swp2 = tail call i16 @llvm.bswap.i16(i16 %arg)
+  store i16 %swp2, i16* %dst
+
+  %Ld2  = load i16, i16* %src
+  %swp3 = tail call i16 @llvm.bswap.i16(i16 %Ld2)
+  store i16 %swp3, i16* %dst
+
+  ret void
+}
+
+
+declare i64 @llvm.bswap.i64(i64)
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
+declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
+
+declare i32 @llvm.bswap.i32(i32)
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
+
+declare i16 @llvm.bswap.i16(i16)
+declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
+declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
+declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
+declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
diff --git a/test/Analysis/CostModel/SystemZ/load-and-test.ll b/test/Analysis/CostModel/SystemZ/load-and-test.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f0695688889e3abfe57502c6da102f9d9506c2aa
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/load-and-test.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Test that load and test results in 0 cost for the compare.
+
+define i64 @fun0(i64* %Src, i64 %Arg) {
+  %Ld1 = load i64, i64* %Src
+  %Cmp = icmp eq i64 %Ld1, 0
+  %S   = select i1 %Cmp, i64 %Arg, i64 %Ld1
+  ret i64 %S
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun0':
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %Ld1 = load i64, i64* %Src
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %Cmp = icmp eq i64 %Ld1, 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %S = select
+}
+
+define i32 @fun1(i32* %Src, i32 %Arg) {
+  %Ld1 = load i32, i32* %Src
+  %Cmp = icmp eq i32 %Ld1, 0
+  %S   = select i1 %Cmp, i32 %Arg, i32 %Ld1
+  ret i32 %S
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun1':
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %Ld1 = load i32, i32* %Src
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %Cmp = icmp eq i32 %Ld1, 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %S = select
+}
diff --git a/test/Analysis/CostModel/SystemZ/logical.ll b/test/Analysis/CostModel/SystemZ/logical.ll
index 41984e0a29c4206107a5981b729a2e7135ac508e..c4d1fc2b92d44e056266e74dbf6bbfa304873101 100644
--- a/test/Analysis/CostModel/SystemZ/logical.ll
+++ b/test/Analysis/CostModel/SystemZ/logical.ll
@@ -68,8 +68,8 @@ define void @ashr() {
   %res18 = ashr <16 x i32> undef, undef
   %res19 = ashr <16 x i64> undef, undef
 
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res0 = ashr i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res1 = ashr i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = ashr i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = ashr i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = ashr i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = ashr i64 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = ashr <2 x i8> undef, undef
@@ -114,8 +114,8 @@ define void @lshr() {
   %res18 = lshr <16 x i32> undef, undef
   %res19 = lshr <16 x i64> undef, undef
 
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res0 = lshr i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res1 = lshr i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = lshr i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = lshr i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = lshr i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = lshr i64 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = lshr <2 x i8> undef, undef
diff --git a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
index d5c097ced6254bd86033b4012db8d58271afc720..59999e3bc9e07499579116a09da81362e4dcf342 100644
--- a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
@@ -85,6 +85,37 @@ define void @add() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = add i32 %sext_3, undef
 }
 
+define void @add_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
+  %L1 = load i16, i16* %Src1
+  %S0 = add i16 %L1, %Arg
+  store volatile i16 %S0, i16* %Dst
+
+  %L2 = load i16, i16* %Src1
+  %L3 = load i16, i16* %Src2
+  %S1 = add i16 %L2, %L3
+  store volatile i16 %S1, i16* %Dst
+
+  ; Truncated load
+  %L32 = load i32, i32* %Src32
+  %tr = trunc i32 %L32 to i16
+  %S2 = add i16 %tr, %Arg
+  store volatile i16 %S2, i16* %Dst
+
+  ret void
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'add_i16_mem16':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L1 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %S0 = add i16 %L1, %Arg
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %S0, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L2 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %L3 = load i16, i16* %Src2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %S1 = add i16 %L2, %L3
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %S1, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L32 = load i32, i32* %Src32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i32 %L32 to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %S2 = add i16 %tr, %Arg
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %S2, i16* %Dst
+}
+
 define void @sub_lhs_mem() {
   %li32 = load i32, i32* undef
   sub i32 %li32, undef
@@ -228,6 +259,37 @@ define void @sub_rhs_mem() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = sub i32 undef, %sext_3
 }
 
+define void @sub_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
+  %L1 = load i16, i16* %Src1
+  %D0 = sub i16 %Arg, %L1
+  store volatile i16 %D0, i16* %Dst
+
+  %L2 = load i16, i16* %Src1
+  %L3 = load i16, i16* %Src2
+  %D1 = sub i16 %L2, %L3
+  store volatile i16 %D1, i16* %Dst
+
+  ; Truncated load
+  %L32 = load i32, i32* %Src32
+  %tr = trunc i32 %L32 to i16
+  %D2 = sub i16 %Arg, %tr
+  store volatile i16 %D2, i16* %Dst
+
+  ret void
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'sub_i16_mem16':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L1 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %D0 = sub i16 %Arg, %L1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %D0, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %L2 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L3 = load i16, i16* %Src2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %D1 = sub i16 %L2, %L3
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %D1, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L32 = load i32, i32* %Src32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i32 %L32 to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %D2 = sub i16 %Arg, %tr
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %D2, i16* %Dst
+}
+
 define void @mul() {
   %li32 = load i32, i32* undef
   mul i32 %li32, undef
@@ -305,6 +367,37 @@ define void @mul() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = mul i32 %sext_3, undef
 }
 
+define void @mul_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
+  %L1 = load i16, i16* %Src1
+  %P0 = mul i16 %Arg, %L1
+  store volatile i16 %P0, i16* %Dst
+
+  %L2 = load i16, i16* %Src1
+  %L3 = load i16, i16* %Src2
+  %P1 = mul i16 %L2, %L3
+  store volatile i16 %P1, i16* %Dst
+
+  ; Truncated load
+  %L32 = load i32, i32* %Src32
+  %tr = trunc i32 %L32 to i16
+  %P2 = mul i16 %Arg, %tr
+  store volatile i16 %P2, i16* %Dst
+
+  ret void
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'mul_i16_mem16':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L1 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %P0 = mul i16 %Arg, %L1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %P0, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L2 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %L3 = load i16, i16* %Src2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %P1 = mul i16 %L2, %L3
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %P1, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L32 = load i32, i32* %Src32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i32 %L32 to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %P2 = mul i16 %Arg, %tr
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %P2, i16* %Dst
+}
+
 define void @sdiv_lhs(i32 %arg32, i64 %arg64) {
   %li32 = load i32, i32* undef
   sdiv i32 %li32, %arg32
@@ -340,10 +433,10 @@ define void @sdiv_lhs(i32 %arg32, i64 %arg64) {
 
 ; An sdiv loaded dividend (lhs) operand is *not* foldable.
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = sdiv i32 %li32, %arg32
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %1 = sdiv i32 %li32, %arg32
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_0 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_1 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = sdiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %2 = sdiv i32 %li32_0, %li32_1
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %3 = sdiv i64 %li64, %arg64
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_0 = load i64, i64* undef
@@ -388,12 +481,12 @@ define void @sdiv_rhs(i32 %arg32, i64 %arg64) {
 
 ; An sdiv loaded divisor (rhs) operand is foldable.
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = sdiv i32 %arg32, %li32
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %1 = sdiv i32 %arg32, %li32
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %2 = sdiv i64 %arg64, %li64
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = sdiv i32 undef, %tr
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %3 = sdiv i32 undef, %tr
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i32 %li32_2 to i64
 ; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %4 = sdiv i64 undef, %sext_0
@@ -432,15 +525,15 @@ define void @udiv_lhs(i32 %arg32, i64 %arg64) {
 
 ; An udiv loaded dividend (lhs) operand is *not* foldable.
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = udiv i32 %li32, %arg32
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %1 = udiv i32 %li32, %arg32
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_0 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_1 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = udiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %2 = udiv i32 %li32_0, %li32_1
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = udiv i64 %li64, %arg64
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %3 = udiv i64 %li64, %arg64
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_1 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %4 = udiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %4 = udiv i64 %li64_0, %li64_1
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_2 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
 ; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = udiv i32 %tr_0, undef
@@ -470,15 +563,15 @@ define void @udiv_rhs(i32 %arg32, i64 %arg64) {
 
 ; An udiv loaded divisor (rhs) operand is foldable.
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = udiv i32 %arg32, %li32
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %1 = udiv i32 %arg32, %li32
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = udiv i64 %arg64, %li64
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %2 = udiv i64 %arg64, %li64
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = udiv i32 undef, %tr_0
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %3 = udiv i32 undef, %tr_0
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %4 = udiv i64 undef, %li64_3
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %4 = udiv i64 undef, %li64_3
 }
 
 define void @and() {
@@ -633,6 +726,16 @@ define void @icmp() {
   %tr_0 = trunc i64 %li64_2 to i32
   icmp eq i32 %tr_0, undef
 
+  ; Sign-extended load
+  %li32_2 = load i32, i32* undef
+  %sext = sext i32 %li32_2 to i64
+  icmp eq i64 %sext, undef
+
+  ; Zero-extended load
+  %li32_3 = load i32, i32* undef
+  %zext = zext i32 %li32_3 to i64
+  icmp eq i64 %zext, undef
+
   ; Loads with multiple uses are *not* folded
   %li64_3 = load i64, i64* undef
   %tr_1 = trunc i64 %li64_3 to i32
@@ -652,7 +755,13 @@ define void @icmp() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = icmp eq i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = icmp eq i64 %sext, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext = zext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = icmp eq i64 %zext, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = icmp eq i64 %li64_3, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = icmp eq i64 %li64_3, undef
 }
diff --git a/test/Analysis/CostModel/X86/cast-widen.ll b/test/Analysis/CostModel/X86/cast-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..05304f4927f4b82fb8e33949025a7d976a7f0faf
--- /dev/null
+++ b/test/Analysis/CostModel/X86/cast-widen.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s  -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s  -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: opt < %s  -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s  -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s  -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s  -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s  -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @add(i32 %arg) {
+; SSE-LABEL: 'add'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A = zext <4 x i1> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %D = zext <8 x i1> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %E = sext <8 x i1> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F = trunc <8 x i32> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'add'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A = zext <4 x i1> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D = zext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %E = sext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'add'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A = zext <4 x i1> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %D = zext <8 x i1> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %E = sext <8 x i1> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'add'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %A = zext <4 x i1> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %B = sext <4 x i1> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %D = zext <8 x i1> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %E = sext <8 x i1> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  ; -- Same size registeres --
+  %A = zext <4 x i1> undef to <4 x i32>
+  %B = sext <4 x i1> undef to <4 x i32>
+  %C = trunc <4 x i32> undef to <4 x i1>
+
+  ; -- Different size registers --
+  %D = zext <8 x i1> undef to <8 x i32>
+  %E = sext <8 x i1> undef to <8 x i32>
+  %F = trunc <8 x i32> undef to <8 x i1>
+
+  ; -- scalars --
+  %G = zext i1 undef to i32
+  %H = trunc i32 undef to i1
+
+  ret i32 undef
+}
+
+define i32 @zext_sext(<8 x i1> %in) {
+; SSE2-LABEL: 'zext_sext'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %A = sext <8 x i16> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i16> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C = sext <4 x i32> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %D = zext <4 x i32> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D3 = zext <16 x i16> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %D4 = zext <16 x i8> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %D5 = zext <16 x i1> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'zext_sext'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %A = sext <8 x i16> undef to <8 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B = zext <8 x i16> undef to <8 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C = sext <4 x i32> undef to <4 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D = zext <4 x i32> undef to <4 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D3 = zext <16 x i16> undef to <16 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D4 = zext <16 x i8> undef to <16 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %D5 = zext <16 x i1> undef to <16 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'zext_sext'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %A = sext <8 x i16> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %B = zext <8 x i16> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %C = sext <4 x i32> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D = zext <4 x i32> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D3 = zext <16 x i16> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D4 = zext <16 x i8> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %D5 = zext <16 x i1> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'zext_sext'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i16> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %C = sext <4 x i32> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D = zext <4 x i32> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D3 = zext <16 x i16> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D4 = zext <16 x i8> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %D5 = zext <16 x i1> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'zext_sext'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i16> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %C = sext <4 x i32> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D = zext <4 x i32> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D3 = zext <16 x i16> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D4 = zext <16 x i8> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D5 = zext <16 x i1> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %Z = zext <8 x i1> %in to <8 x i32>
+  %S = sext <8 x i1> %in to <8 x i32>
+
+  %A1 = zext <16 x i8> undef to <16 x i16>
+  %A2 = sext <16 x i8> undef to <16 x i16>
+  %A = sext <8 x i16> undef to <8 x i32>
+  %B = zext <8 x i16> undef to <8 x i32>
+  %C = sext <4 x i32> undef to <4 x i64>
+
+  %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
+  %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+  %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
+  %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+
+  %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
+  %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+
+  %D = zext <4 x i32> undef to <4 x i64>
+
+  %D1 = zext <8 x i32> undef to <8 x i64>
+
+  %D2 = sext <8 x i32> undef to <8 x i64>
+
+  %D3 = zext <16 x i16> undef to <16 x i32>
+  %D4 = zext <16 x i8> undef to <16 x i32>
+  %D5 = zext <16 x i1> undef to <16 x i32>
+
+  %E = trunc <4 x i64> undef to <4 x i32>
+  %F = trunc <8 x i32> undef to <8 x i16>
+  %F1 = trunc <16 x i16> undef to <16 x i8>
+  %F2 = trunc <8 x i32> undef to <8 x i8>
+  %F3 = trunc <4 x i64> undef to <4 x i8>
+
+  %G = trunc <8 x i64> undef to <8 x i32>
+  %G1 = trunc <16 x i32> undef to <16 x i16>
+  %G2 = trunc <16 x i32> undef to <16 x i8>
+  ret i32 undef
+}
+
+define i32 @masks8(<8 x i1> %in) {
+; SSE-LABEL: 'masks8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'masks8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'masks8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'masks8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %Z = zext <8 x i1> %in to <8 x i32>
+  %S = sext <8 x i1> %in to <8 x i32>
+  ret i32 undef
+}
+
+define i32 @masks4(<4 x i1> %in) {
+; SSE-LABEL: 'masks4'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %S = sext <4 x i1> %in to <4 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'masks4'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %S = sext <4 x i1> %in to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'masks4'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %S = sext <4 x i1> %in to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'masks4'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %S = sext <4 x i1> %in to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %Z = zext <4 x i1> %in to <4 x i64>
+  %S = sext <4 x i1> %in to <4 x i64>
+  ret i32 undef
+}
+
+define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
+; SSE-LABEL: 'sitofp4'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'sitofp4'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'sitofp4'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A1 = sitofp <4 x i1> %a to <4 x float>
+  %A2 = sitofp <4 x i1> %a to <4 x double>
+  %B1 = sitofp <4 x i8> %b to <4 x float>
+  %B2 = sitofp <4 x i8> %b to <4 x double>
+  %C1 = sitofp <4 x i16> %c to <4 x float>
+  %C2 = sitofp <4 x i16> %c to <4 x double>
+  %D1 = sitofp <4 x i32> %d to <4 x float>
+  %D2 = sitofp <4 x i32> %d to <4 x double>
+  ret void
+}
+
+define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
+; SSE-LABEL: 'sitofp8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'sitofp8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'sitofp8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A1 = sitofp <8 x i1> %a to <8 x float>
+  %B1 = sitofp <8 x i8> %b to <8 x float>
+  %C1 = sitofp <8 x i16> %c to <8 x float>
+  %D1 = sitofp <8 x i32> %d to <8 x float>
+  ret void
+}
+
+define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
+; SSE-LABEL: 'uitofp4'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'uitofp4'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'uitofp4'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A1 = uitofp <4 x i1> %a to <4 x float>
+  %A2 = uitofp <4 x i1> %a to <4 x double>
+  %B1 = uitofp <4 x i8> %b to <4 x float>
+  %B2 = uitofp <4 x i8> %b to <4 x double>
+  %C1 = uitofp <4 x i16> %c to <4 x float>
+  %C2 = uitofp <4 x i16> %c to <4 x double>
+  %D1 = uitofp <4 x i32> %d to <4 x float>
+  %D2 = uitofp <4 x i32> %d to <4 x double>
+  ret void
+}
+
+define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
+; SSE-LABEL: 'uitofp8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'uitofp8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'uitofp8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'uitofp8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A1 = uitofp <8 x i1> %a to <8 x float>
+  %B1 = uitofp <8 x i8> %b to <8 x float>
+  %C1 = uitofp <8 x i16> %c to <8 x float>
+  %D1 = uitofp <8 x i32> %d to <8 x float>
+  ret void
+}
+
+define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) {
+; SSE-LABEL: 'fp_conv'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = fpext <4 x float> %c to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A2 = fpext <8 x float> %a to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'fp_conv'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A1 = fpext <4 x float> %c to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A2 = fpext <8 x float> %a to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'fp_conv'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A1 = fpext <4 x float> %c to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A2 = fpext <8 x float> %a to <8 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A1 = fpext <4 x float> %c to <4 x double>
+  %A2 = fpext <8 x float> %a to <8 x double>
+  %A3 = fptrunc <4 x double> undef to <4 x float>
+  %A4 = fptrunc <8 x double> undef to <8 x float>
+  ret void
+}
diff --git a/test/Analysis/CostModel/X86/extend.ll b/test/Analysis/CostModel/X86/extend.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1de0b0b20e1bbb14253511f7e3ed9e64fcbba282
--- /dev/null
+++ b/test/Analysis/CostModel/X86/extend.ll
@@ -0,0 +1,860 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+;
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+define i32 @zext_vXi32() {
+; SSE2-LABEL: 'zext_vXi32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'zext_vXi32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'zext_vXi32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'zext_vXi32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'zext_vXi32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'zext_vXi32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'zext_vXi32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = zext <2 x i32> undef to <2 x i64>
+  %V4i64 = zext <4 x i32> undef to <4 x i64>
+  %V8i64 = zext <8 x i32> undef to <8 x i64>
+
+  ret i32 undef
+}
+
+define i32 @zext_vXi16() {
+; SSE2-LABEL: 'zext_vXi16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'zext_vXi16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'zext_vXi16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'zext_vXi16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'zext_vXi16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'zext_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'zext_vXi16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = zext <2 x i16> undef to <2 x i64>
+  %V4i64 = zext <4 x i16> undef to <4 x i64>
+  %V8i64 = zext <8 x i16> undef to <8 x i64>
+
+  %V2i32 = zext <2 x i16> undef to <2 x i32>
+  %V4i32 = zext <4 x i16> undef to <4 x i32>
+  %V8i32 = zext <8 x i16> undef to <8 x i32>
+  %V16i32 = zext <16 x i16> undef to <16 x i32>
+
+  ret i32 undef
+}
+
+define i32 @zext_vXi8() {
+; SSE2-LABEL: 'zext_vXi8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'zext_vXi8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'zext_vXi8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'zext_vXi8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'zext_vXi8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'zext_vXi8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'zext_vXi8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'zext_vXi8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = zext <2 x i8> undef to <2 x i64>
+  %V4i64 = zext <4 x i8> undef to <4 x i64>
+  %V8i64 = zext <8 x i8> undef to <8 x i64>
+
+  %V2i32 = zext <2 x i8> undef to <2 x i32>
+  %V4i32 = zext <4 x i8> undef to <4 x i32>
+  %V8i32 = zext <8 x i8> undef to <8 x i32>
+  %V16i32 = zext <16 x i8> undef to <16 x i32>
+
+  %V2i16 = zext <2 x i8> undef to <2 x i16>
+  %V4i16 = zext <4 x i8> undef to <4 x i16>
+  %V8i16 = zext <8 x i8> undef to <8 x i16>
+  %V16i16 = zext <16 x i8> undef to <16 x i16>
+  %V32i16 = zext <32 x i8> undef to <32 x i16>
+
+  ret i32 undef
+}
+
+define i32 @zext_vXi1() {
+; SSE-LABEL: 'zext_vXi1'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'zext_vXi1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'zext_vXi1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'zext_vXi1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'zext_vXi1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'zext_vXi1'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = zext <2 x i1> undef to <2 x i64>
+  %V4i64 = zext <4 x i1> undef to <4 x i64>
+  %V8i64 = zext <8 x i1> undef to <8 x i64>
+
+  %V2i32 = zext <2 x i1> undef to <2 x i32>
+  %V4i32 = zext <4 x i1> undef to <4 x i32>
+  %V8i32 = zext <8 x i1> undef to <8 x i32>
+  %V16i32 = zext <16 x i1> undef to <16 x i32>
+
+  %V2i16 = zext <2 x i1> undef to <2 x i16>
+  %V4i16 = zext <4 x i1> undef to <4 x i16>
+  %V8i16 = zext <8 x i1> undef to <8 x i16>
+  %V16i16 = zext <16 x i1> undef to <16 x i16>
+  %V32i16 = zext <32 x i1> undef to <32 x i16>
+
+  %V2i8 = zext <2 x i1> undef to <2 x i8>
+  %V4i8 = zext <4 x i1> undef to <4 x i8>
+  %V8i8 = zext <8 x i1> undef to <8 x i8>
+  %V16i8 = zext <16 x i1> undef to <16 x i8>
+  %V32i8 = zext <32 x i1> undef to <32 x i8>
+  %V64i8 = zext <64 x i1> undef to <64 x i8>
+
+  ret i32 undef
+}
+
+define i32 @sext_vXi32() {
+; SSE2-LABEL: 'sext_vXi32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'sext_vXi32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'sext_vXi32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'sext_vXi32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'sext_vXi32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'sext_vXi32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sext_vXi32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = sext <2 x i32> undef to <2 x i64>
+  %V4i64 = sext <4 x i32> undef to <4 x i64>
+  %V8i64 = sext <8 x i32> undef to <8 x i64>
+
+  ret i32 undef
+}
+
+define i32 @sext_vXi16() {
+; SSE2-LABEL: 'sext_vXi16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'sext_vXi16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'sext_vXi16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'sext_vXi16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'sext_vXi16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'sext_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sext_vXi16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = sext <2 x i16> undef to <2 x i64>
+  %V4i64 = sext <4 x i16> undef to <4 x i64>
+  %V8i64 = sext <8 x i16> undef to <8 x i64>
+
+  %V2i32 = sext <2 x i16> undef to <2 x i32>
+  %V4i32 = sext <4 x i16> undef to <4 x i32>
+  %V8i32 = sext <8 x i16> undef to <8 x i32>
+  %V16i32 = sext <16 x i16> undef to <16 x i32>
+
+  ret i32 undef
+}
+
+define i32 @sext_vXi8() {
+; SSE2-LABEL: 'sext_vXi8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'sext_vXi8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'sext_vXi8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'sext_vXi8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'sext_vXi8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'sext_vXi8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'sext_vXi8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sext_vXi8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = sext <2 x i8> undef to <2 x i64>
+  %V4i64 = sext <4 x i8> undef to <4 x i64>
+  %V8i64 = sext <8 x i8> undef to <8 x i64>
+
+  %V2i32 = sext <2 x i8> undef to <2 x i32>
+  %V4i32 = sext <4 x i8> undef to <4 x i32>
+  %V8i32 = sext <8 x i8> undef to <8 x i32>
+  %V16i32 = sext <16 x i8> undef to <16 x i32>
+
+  %V2i16 = sext <2 x i8> undef to <2 x i16>
+  %V4i16 = sext <4 x i8> undef to <4 x i16>
+  %V8i16 = sext <8 x i8> undef to <8 x i16>
+  %V16i16 = sext <16 x i8> undef to <16 x i16>
+  %V32i16 = sext <32 x i8> undef to <32 x i16>
+
+  ret i32 undef
+}
+
+define i32 @sext_vXi1() {
+; SSE-LABEL: 'sext_vXi1'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'sext_vXi1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'sext_vXi1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'sext_vXi1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'sext_vXi1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sext_vXi1'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = sext <2 x i1> undef to <2 x i64>
+  %V4i64 = sext <4 x i1> undef to <4 x i64>
+  %V8i64 = sext <8 x i1> undef to <8 x i64>
+
+  %V2i32 = sext <2 x i1> undef to <2 x i32>
+  %V4i32 = sext <4 x i1> undef to <4 x i32>
+  %V8i32 = sext <8 x i1> undef to <8 x i32>
+  %V16i32 = sext <16 x i1> undef to <16 x i32>
+
+  %V2i16 = sext <2 x i1> undef to <2 x i16>
+  %V4i16 = sext <4 x i1> undef to <4 x i16>
+  %V8i16 = sext <8 x i1> undef to <8 x i16>
+  %V16i16 = sext <16 x i1> undef to <16 x i16>
+  %V32i16 = sext <32 x i1> undef to <32 x i16>
+
+  %V2i8 = sext <2 x i1> undef to <2 x i8>
+  %V4i8 = sext <4 x i1> undef to <4 x i8>
+  %V8i8 = sext <8 x i1> undef to <8 x i8>
+  %V16i8 = sext <16 x i1> undef to <16 x i8>
+  %V32i8 = sext <32 x i1> undef to <32 x i8>
+  %V64i8 = sext <64 x i1> undef to <64 x i8>
+
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/fptosi-widen.ll b/test/Analysis/CostModel/X86/fptosi-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2135fef504fe1e6345e3d99a01ed0264c24e0a7a
--- /dev/null
+++ b/test/Analysis/CostModel/X86/fptosi-widen.ll
@@ -0,0 +1,305 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+;
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+define i32 @fptosi_double_i64(i32 %arg) {
+; SSE-LABEL: 'fptosi_double_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptosi_double_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'fptosi_double_i64'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'fptosi_double_i64'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptosi_double_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = fptosi double undef to i64
+  %V2I64 = fptosi <2 x double> undef to <2 x i64>
+  %V4I64 = fptosi <4 x double> undef to <4 x i64>
+  %V8I64 = fptosi <8 x double> undef to <8 x i64>
+  ret i32 undef
+}
+
+define i32 @fptosi_double_i32(i32 %arg) {
+; SSE-LABEL: 'fptosi_double_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptosi_double_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptosi_double_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptosi_double_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I32 = fptosi double undef to i32
+  %V2I32 = fptosi <2 x double> undef to <2 x i32>
+  %V4I32 = fptosi <4 x double> undef to <4 x i32>
+  %V8I32 = fptosi <8 x double> undef to <8 x i32>
+  ret i32 undef
+}
+
+define i32 @fptosi_double_i16(i32 %arg) {
+; SSE-LABEL: 'fptosi_double_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptosi_double_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptosi_double_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptosi_double_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I16 = fptosi double undef to i16
+  %V2I16 = fptosi <2 x double> undef to <2 x i16>
+  %V4I16 = fptosi <4 x double> undef to <4 x i16>
+  %V8I16 = fptosi <8 x double> undef to <8 x i16>
+  ret i32 undef
+}
+
+define i32 @fptosi_double_i8(i32 %arg) {
+; SSE-LABEL: 'fptosi_double_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptosi_double_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptosi_double_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptosi_double_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I8 = fptosi double undef to i8
+  %V2I8 = fptosi <2 x double> undef to <2 x i8>
+  %V4I8 = fptosi <4 x double> undef to <4 x i8>
+  %V8I8 = fptosi <8 x double> undef to <8 x i8>
+  ret i32 undef
+}
+
+define i32 @fptosi_float_i64(i32 %arg) {
+; SSE-LABEL: 'fptosi_float_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptosi_float_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'fptosi_float_i64'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'fptosi_float_i64'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptosi_float_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = fptosi float undef to i64
+  %V2I64 = fptosi <2 x float> undef to <2 x i64>
+  %V4I64 = fptosi <4 x float> undef to <4 x i64>
+  %V8I64 = fptosi <8 x float> undef to <8 x i64>
+  %V16I64 = fptosi <16 x float> undef to <16 x i64>
+  ret i32 undef
+}
+
+define i32 @fptosi_float_i32(i32 %arg) {
+; CHECK-LABEL: 'fptosi_float_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptosi_float_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I32 = fptosi float undef to i32
+  %V4I32 = fptosi <4 x float> undef to <4 x i32>
+  %V8I32 = fptosi <8 x float> undef to <8 x i32>
+  %V16I32 = fptosi <16 x float> undef to <16 x i32>
+  ret i32 undef
+}
+
+define i32 @fptosi_float_i16(i32 %arg) {
+; SSE-LABEL: 'fptosi_float_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptosi_float_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptosi_float_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptosi_float_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I16 = fptosi float undef to i16
+  %V4I16 = fptosi <4 x float> undef to <4 x i16>
+  %V8I16 = fptosi <8 x float> undef to <8 x i16>
+  %V16I16 = fptosi <16 x float> undef to <16 x i16>
+  ret i32 undef
+}
+
+define i32 @fptosi_float_i8(i32 %arg) {
+; SSE-LABEL: 'fptosi_float_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptosi_float_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptosi_float_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptosi_float_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I8 = fptosi float undef to i8
+  %V4I8 = fptosi <4 x float> undef to <4 x i8>
+  %V8I8 = fptosi <8 x float> undef to <8 x i8>
+  %V16I8 = fptosi <16 x float> undef to <16 x i8>
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/fptoui-widen.ll b/test/Analysis/CostModel/X86/fptoui-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d5b0482c162677b4fc0b97ee1fb208bacf785c44
--- /dev/null
+++ b/test/Analysis/CostModel/X86/fptoui-widen.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+;
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+define i32 @fptoui_double_i64(i32 %arg) {
+; SSE-LABEL: 'fptoui_double_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptoui_double_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'fptoui_double_i64'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'fptoui_double_i64'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptoui_double_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = fptoui double undef to i64
+  %V2I64 = fptoui <2 x double> undef to <2 x i64>
+  %V4I64 = fptoui <4 x double> undef to <4 x i64>
+  %V8I64 = fptoui <8 x double> undef to <8 x i64>
+  ret i32 undef
+}
+
+define i32 @fptoui_double_i32(i32 %arg) {
+; SSE-LABEL: 'fptoui_double_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptoui_double_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptoui_double_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptoui_double_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I32 = fptoui double undef to i32
+  %V2I32 = fptoui <2 x double> undef to <2 x i32>
+  %V4I32 = fptoui <4 x double> undef to <4 x i32>
+  %V8I32 = fptoui <8 x double> undef to <8 x i32>
+  ret i32 undef
+}
+
+define i32 @fptoui_double_i16(i32 %arg) {
+; SSE-LABEL: 'fptoui_double_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptoui_double_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptoui_double_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptoui_double_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I16 = fptoui double undef to i16
+  %V2I16 = fptoui <2 x double> undef to <2 x i16>
+  %V4I16 = fptoui <4 x double> undef to <4 x i16>
+  %V8I16 = fptoui <8 x double> undef to <8 x i16>
+  ret i32 undef
+}
+
+define i32 @fptoui_double_i8(i32 %arg) {
+; SSE-LABEL: 'fptoui_double_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptoui_double_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptoui_double_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptoui_double_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I8 = fptoui double undef to i8
+  %V2I8 = fptoui <2 x double> undef to <2 x i8>
+  %V4I8 = fptoui <4 x double> undef to <4 x i8>
+  %V8I8 = fptoui <8 x double> undef to <8 x i8>
+  ret i32 undef
+}
+
+define i32 @fptoui_float_i64(i32 %arg) {
+; SSE-LABEL: 'fptoui_float_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptoui_float_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'fptoui_float_i64'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'fptoui_float_i64'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptoui_float_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = fptoui float undef to i64
+  %V2I64 = fptoui <2 x float> undef to <2 x i64>
+  %V4I64 = fptoui <4 x float> undef to <4 x i64>
+  %V8I64 = fptoui <8 x float> undef to <8 x i64>
+  %V16I64 = fptoui <16 x float> undef to <16 x i64>
+  ret i32 undef
+}
+
+define i32 @fptoui_float_i32(i32 %arg) {
+; SSE-LABEL: 'fptoui_float_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptoui_float_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptoui_float_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptoui_float_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I32 = fptoui float undef to i32
+  %V4I32 = fptoui <4 x float> undef to <4 x i32>
+  %V8I32 = fptoui <8 x float> undef to <8 x i32>
+  %V16I32 = fptoui <16 x float> undef to <16 x i32>
+  ret i32 undef
+}
+
+define i32 @fptoui_float_i16(i32 %arg) {
+; SSE-LABEL: 'fptoui_float_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptoui_float_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptoui_float_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptoui_float_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I16 = fptoui float undef to i16
+  %V4I16 = fptoui <4 x float> undef to <4 x i16>
+  %V8I16 = fptoui <8 x float> undef to <8 x i16>
+  %V16I16 = fptoui <16 x float> undef to <16 x i16>
+  ret i32 undef
+}
+
+define i32 @fptoui_float_i8(i32 %arg) {
+; SSE-LABEL: 'fptoui_float_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'fptoui_float_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'fptoui_float_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'fptoui_float_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I8 = fptoui float undef to i8
+  %V4I8 = fptoui <4 x float> undef to <4 x i8>
+  %V8I8 = fptoui <8 x float> undef to <8 x i8>
+  %V16I8 = fptoui <16 x float> undef to <16 x i8>
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/fshl.ll b/test/Analysis/CostModel/X86/fshl.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d5da8e7ae4da41c356aad2139bae9929de1a0fea
--- /dev/null
+++ b/test/Analysis/CostModel/X86/fshl.ll
@@ -0,0 +1,2632 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+;
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=BDVER2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;
+; Variable Funnel Shifts
+;
+
+define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) {
+; SSSE3-LABEL: 'var_funnel_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_funnel_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_funnel_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_funnel_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'var_funnel_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_funnel_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_funnel_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_funnel_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_funnel_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+  %V2I64  = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+  %V4I64  = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+  %V8I64  = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+  ret void
+}
+
+define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) {
+; SSSE3-LABEL: 'var_funnel_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_funnel_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_funnel_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_funnel_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'var_funnel_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_funnel_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_funnel_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_funnel_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_funnel_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+  %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+  %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+  %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+  ret void
+}
+
+define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
+; SSSE3-LABEL: 'var_funnel_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_funnel_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_funnel_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_funnel_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'var_funnel_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'var_funnel_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'var_funnel_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_funnel_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_funnel_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_funnel_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_funnel_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+  %V8I16  = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+  %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+  %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+  ret void
+}
+
+define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
+; SSSE3-LABEL: 'var_funnel_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_funnel_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_funnel_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_funnel_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'var_funnel_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'var_funnel_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'var_funnel_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_funnel_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_funnel_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_funnel_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_funnel_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+  %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+  %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+  %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+  ret void
+}
+
+;
+; Uniform Variable Funnel Shifts
+;
+
+define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) {
+; SSSE3-LABEL: 'splatvar_funnel_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'splatvar_funnel_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_funnel_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_funnel_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatvar_funnel_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_funnel_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_funnel_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_funnel_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_funnel_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+  %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+  %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+  %V2I64  = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+  %V4I64  = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+  %V8I64  = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+  ret void
+}
+
+define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) {
+; SSE-LABEL: 'splatvar_funnel_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_funnel_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_funnel_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatvar_funnel_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_funnel_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_funnel_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_funnel_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_funnel_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+  %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+  %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+  %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+  %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+  %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+  ret void
+}
+
+define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
+; SSE-LABEL: 'splatvar_funnel_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_funnel_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_funnel_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatvar_funnel_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatvar_funnel_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatvar_funnel_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_funnel_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_funnel_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_funnel_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_funnel_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+  %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+  %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+  %V8I16  = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+  %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+  %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+  ret void
+}
+
+define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
+; SSSE3-LABEL: 'splatvar_funnel_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'splatvar_funnel_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_funnel_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_funnel_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatvar_funnel_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatvar_funnel_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatvar_funnel_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_funnel_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_funnel_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_funnel_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_funnel_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+  %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+  %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+  %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+  %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+  %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+  ret void
+}
+
+;
+; Constant Funnel Shifts
+;
+
+define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) {
+; SSSE3-LABEL: 'constant_funnel_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_funnel_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_funnel_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_funnel_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'constant_funnel_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_funnel_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_funnel_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_funnel_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_funnel_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+  %V2I64  = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+  %V4I64  = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+  %V8I64  = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+  ret void
+}
+
+define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) {
+; SSSE3-LABEL: 'constant_funnel_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_funnel_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_funnel_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_funnel_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'constant_funnel_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_funnel_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_funnel_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_funnel_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_funnel_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+  %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+  %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+  %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+  ret void
+}
+
+define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) {
+; SSSE3-LABEL: 'constant_funnel_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_funnel_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_funnel_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_funnel_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'constant_funnel_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'constant_funnel_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'constant_funnel_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_funnel_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_funnel_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_funnel_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_funnel_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+  %V8I16  = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret void
+}
+
+define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) {
+; SSSE3-LABEL: 'constant_funnel_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_funnel_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_funnel_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_funnel_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'constant_funnel_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'constant_funnel_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'constant_funnel_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_funnel_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_funnel_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_funnel_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_funnel_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+  %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret void
+}
+
+;
+; Uniform Constant Funnel Shifts
+;
+
+define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) {
+; SSSE3-LABEL: 'splatconstant_funnel_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'splatconstant_funnel_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_funnel_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_funnel_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatconstant_funnel_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_funnel_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_funnel_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_funnel_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_funnel_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+  %V2I64  = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+  %V4I64  = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+  %V8I64  = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+  ret void
+}
+
+define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) {
+; SSE-LABEL: 'splatconstant_funnel_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_funnel_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_funnel_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatconstant_funnel_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_funnel_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_funnel_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_funnel_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_funnel_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+  %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+  %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+  %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+  ret void
+}
+
+define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) {
+; SSE-LABEL: 'splatconstant_funnel_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_funnel_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_funnel_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatconstant_funnel_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatconstant_funnel_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatconstant_funnel_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_funnel_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_funnel_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_funnel_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_funnel_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+  %V8I16  = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  ret void
+}
+
+define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) {
+; SSE-LABEL: 'splatconstant_funnel_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_funnel_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_funnel_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatconstant_funnel_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatconstant_funnel_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatconstant_funnel_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_funnel_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_funnel_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_funnel_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_funnel_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+  %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  ret void
+}
+
+;
+; Variable Unary Funnel Shifts (Rotates)
+;
+
+define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) {
+; SSE-LABEL: 'var_rotate_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_rotate_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_rotate_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'var_rotate_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_rotate_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_rotate_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_rotate_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_rotate_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+  %V2I64  = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+  %V4I64  = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+  %V8I64  = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+  ret void
+}
+
+define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) {
+; SSSE3-LABEL: 'var_rotate_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_rotate_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_rotate_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_rotate_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'var_rotate_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_rotate_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_rotate_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_rotate_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_rotate_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32)
+  %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+  %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+  %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+  ret void
+}
+
+define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
+; SSSE3-LABEL: 'var_rotate_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 268 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_rotate_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_rotate_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_rotate_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'var_rotate_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'var_rotate_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'var_rotate_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_rotate_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_rotate_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_rotate_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_rotate_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+  %V8I16  = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+  %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+  %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+  ret void
+}
+
+define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
+; SSSE3-LABEL: 'var_rotate_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_rotate_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_rotate_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_rotate_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'var_rotate_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'var_rotate_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'var_rotate_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_rotate_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_rotate_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_rotate_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_rotate_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+  %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+  %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+  %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+  ret void
+}
+
+;
+; Uniform Variable Unary Funnel Shifts (Rotates)
+;
+
+define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) {
+; SSE-LABEL: 'splatvar_rotate_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_rotate_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_rotate_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatvar_rotate_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_rotate_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_rotate_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_rotate_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_rotate_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+  %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+  %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+  %V2I64  = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+  %V4I64  = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+  %V8I64  = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+  ret void
+}
+
+define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) {
+; SSE-LABEL: 'splatvar_rotate_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_rotate_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_rotate_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatvar_rotate_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_rotate_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_rotate_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_rotate_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_rotate_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+  %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+  %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+  %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+  %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+  %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+  ret void
+}
+
+define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
+; SSE-LABEL: 'splatvar_rotate_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_rotate_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_rotate_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatvar_rotate_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatvar_rotate_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatvar_rotate_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_rotate_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_rotate_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_rotate_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_rotate_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+  %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+  %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+  %V8I16  = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+  %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+  %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+  ret void
+}
+
+define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
+; SSSE3-LABEL: 'splatvar_rotate_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'splatvar_rotate_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_rotate_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_rotate_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatvar_rotate_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatvar_rotate_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatvar_rotate_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_rotate_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_rotate_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_rotate_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_rotate_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+  %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+  %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+  %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+  %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+  %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+  ret void
+}
+
+;
+; Constant Unary Funnel Shifts (Rotates)
+;
+
+define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
+; SSE-LABEL: 'constant_rotate_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_rotate_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_rotate_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'constant_rotate_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_rotate_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_rotate_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_rotate_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_rotate_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+  %V2I64  = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+  %V4I64  = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+  %V8I64  = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+  ret void
+}
+
+define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
+; SSSE3-LABEL: 'constant_rotate_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_rotate_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_rotate_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_rotate_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'constant_rotate_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_rotate_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_rotate_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_rotate_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_rotate_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
+  %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+  %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+  %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+  ret void
+}
+
+define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
+; SSSE3-LABEL: 'constant_rotate_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_rotate_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_rotate_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_rotate_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'constant_rotate_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'constant_rotate_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'constant_rotate_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_rotate_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_rotate_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_rotate_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_rotate_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+  %V8I16  = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret void
+}
+
+define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
+; SSSE3-LABEL: 'constant_rotate_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_rotate_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_rotate_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_rotate_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'constant_rotate_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'constant_rotate_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'constant_rotate_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_rotate_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_rotate_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_rotate_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_rotate_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+  %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret void
+}
+
+;
+; Uniform Constant Unary Funnel Shifts (Rotates)
+;
+
+define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
+; SSE-LABEL: 'splatconstant_rotate_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_rotate_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_rotate_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatconstant_rotate_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_rotate_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_rotate_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_rotate_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_rotate_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
+  %V2I64  = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+  %V4I64  = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+  %V8I64  = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+  ret void
+}
+
+define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
+; SSE-LABEL: 'splatconstant_rotate_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_rotate_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_rotate_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatconstant_rotate_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_rotate_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_rotate_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_rotate_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_rotate_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
+  %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+  %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+  %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+  ret void
+}
+
+define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
+; SSE-LABEL: 'splatconstant_rotate_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_rotate_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_rotate_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatconstant_rotate_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatconstant_rotate_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatconstant_rotate_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_rotate_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_rotate_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_rotate_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_rotate_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+  %V8I16  = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  ret void
+}
+
+define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
+; SSE-LABEL: 'splatconstant_rotate_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_rotate_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_rotate_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatconstant_rotate_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatconstant_rotate_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatconstant_rotate_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_rotate_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_rotate_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_rotate_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_rotate_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+  %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  ret void
+}
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare i16 @llvm.fshl.i16(i16, i16, i16)
+declare i8  @llvm.fshl.i8 (i8,  i8,  i8)
+
+declare <2 x i64>  @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i32>  @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i16>  @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <16 x i8>  @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <4 x i64>  @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <8 x i32>  @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
+declare <32 x i8>  @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
+declare <8 x i64>  @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
+declare <64 x i8>  @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
diff --git a/test/Analysis/CostModel/X86/fshr.ll b/test/Analysis/CostModel/X86/fshr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7450fc73d40703a1959b5cf18fb582dd5be21663
--- /dev/null
+++ b/test/Analysis/CostModel/X86/fshr.ll
@@ -0,0 +1,2633 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+;
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=BDVER2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;
+; Variable Funnel Shifts
+;
+
+define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) {
+; SSSE3-LABEL: 'var_funnel_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_funnel_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_funnel_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_funnel_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'var_funnel_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_funnel_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_funnel_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_funnel_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_funnel_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+  %V2I64  = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+  %V4I64  = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+  %V8I64  = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+  ret void
+}
+
+define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) {
+; SSSE3-LABEL: 'var_funnel_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_funnel_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_funnel_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_funnel_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'var_funnel_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_funnel_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_funnel_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_funnel_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_funnel_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+  %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+  %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+  %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+  ret void
+}
+
+define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
+; SSSE3-LABEL: 'var_funnel_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_funnel_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_funnel_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_funnel_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'var_funnel_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'var_funnel_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'var_funnel_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_funnel_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_funnel_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_funnel_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_funnel_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+  %V8I16  = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+  %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+  %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+  ret void
+}
+
+define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
+; SSSE3-LABEL: 'var_funnel_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_funnel_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_funnel_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_funnel_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'var_funnel_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'var_funnel_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'var_funnel_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_funnel_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_funnel_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_funnel_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_funnel_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+  %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+  %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+  %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+  ret void
+}
+
+;
+; Uniform Variable Funnel Shifts
+;
+
+define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) {
+; SSSE3-LABEL: 'splatvar_funnel_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'splatvar_funnel_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_funnel_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_funnel_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatvar_funnel_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_funnel_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_funnel_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_funnel_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_funnel_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+  %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+  %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+  %V2I64  = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+  %V4I64  = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+  %V8I64  = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+  ret void
+}
+
+define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) {
+; SSE-LABEL: 'splatvar_funnel_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_funnel_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_funnel_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatvar_funnel_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_funnel_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_funnel_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_funnel_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_funnel_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+  %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+  %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+  %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+  %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+  %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+  ret void
+}
+
+define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
+; SSE-LABEL: 'splatvar_funnel_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_funnel_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_funnel_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatvar_funnel_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatvar_funnel_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatvar_funnel_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_funnel_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_funnel_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_funnel_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_funnel_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+  %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+  %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+  %V8I16  = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+  %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+  %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+  ret void
+}
+
+define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
+; SSSE3-LABEL: 'splatvar_funnel_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'splatvar_funnel_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_funnel_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_funnel_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatvar_funnel_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatvar_funnel_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatvar_funnel_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_funnel_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_funnel_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_funnel_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_funnel_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+  %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+  %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+  %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+  %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+  %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+  ret void
+}
+
+;
+; Constant Funnel Shifts
+;
+
+define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) {
+; SSSE3-LABEL: 'constant_funnel_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_funnel_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_funnel_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_funnel_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'constant_funnel_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_funnel_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_funnel_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_funnel_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_funnel_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+  %V2I64  = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+  %V4I64  = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+  %V8I64  = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+  ret void
+}
+
+define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) {
+; SSSE3-LABEL: 'constant_funnel_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_funnel_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_funnel_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_funnel_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'constant_funnel_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_funnel_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_funnel_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_funnel_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_funnel_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+  %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+  %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+  %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+  ret void
+}
+
+define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) {
+; SSSE3-LABEL: 'constant_funnel_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_funnel_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_funnel_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_funnel_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'constant_funnel_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'constant_funnel_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'constant_funnel_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_funnel_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_funnel_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_funnel_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_funnel_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+  %V8I16  = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret void
+}
+
+define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) {
+; SSSE3-LABEL: 'constant_funnel_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_funnel_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_funnel_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_funnel_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'constant_funnel_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'constant_funnel_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'constant_funnel_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_funnel_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_funnel_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_funnel_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_funnel_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+  %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret void
+}
+
+;
+; Uniform Constant Funnel Shifts
+;
+
+define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) {
+; SSSE3-LABEL: 'splatconstant_funnel_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'splatconstant_funnel_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_funnel_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_funnel_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatconstant_funnel_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_funnel_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_funnel_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_funnel_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_funnel_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+  %V2I64  = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+  %V4I64  = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+  %V8I64  = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+  ret void
+}
+
+define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) {
+; SSE-LABEL: 'splatconstant_funnel_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_funnel_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_funnel_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatconstant_funnel_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_funnel_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_funnel_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_funnel_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_funnel_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+  %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+  %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+  %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+  ret void
+}
+
+define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) {
+; SSE-LABEL: 'splatconstant_funnel_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_funnel_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_funnel_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatconstant_funnel_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatconstant_funnel_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatconstant_funnel_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_funnel_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_funnel_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_funnel_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_funnel_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+  %V8I16  = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  ret void
+}
+
+define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) {
+; SSE-LABEL: 'splatconstant_funnel_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_funnel_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_funnel_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatconstant_funnel_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatconstant_funnel_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatconstant_funnel_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_funnel_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_funnel_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_funnel_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_funnel_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+  %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  ret void
+}
+
+;
+; Variable Unary Funnel Shifts (Rotates)
+;
+; TODO - Add constant and uniform-constant tests
+
+define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) {
+; SSE-LABEL: 'var_rotate_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_rotate_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_rotate_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'var_rotate_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_rotate_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_rotate_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_rotate_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_rotate_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+  %V2I64  = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+  %V4I64  = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+  %V8I64  = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+  ret void
+}
+
+define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) {
+; SSSE3-LABEL: 'var_rotate_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_rotate_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_rotate_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_rotate_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'var_rotate_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_rotate_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_rotate_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_rotate_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_rotate_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+  %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+  %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+  %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+  ret void
+}
+
+define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
+; SSSE3-LABEL: 'var_rotate_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 268 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_rotate_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_rotate_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_rotate_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'var_rotate_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'var_rotate_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'var_rotate_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_rotate_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_rotate_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_rotate_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_rotate_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+  %V8I16  = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+  %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+  %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+  ret void
+}
+
+define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
+; SSSE3-LABEL: 'var_rotate_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'var_rotate_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'var_rotate_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'var_rotate_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'var_rotate_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'var_rotate_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'var_rotate_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'var_rotate_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'var_rotate_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'var_rotate_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'var_rotate_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+  %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+  %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+  %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+  ret void
+}
+
+;
+; Uniform Variable Unary Funnel Shifts (Rotates)
+;
+
+define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) {
+; SSE-LABEL: 'splatvar_rotate_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_rotate_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_rotate_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatvar_rotate_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_rotate_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_rotate_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_rotate_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_rotate_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+  %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+  %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+  %V2I64  = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+  %V4I64  = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+  %V8I64  = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+  ret void
+}
+
+define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) {
+; SSE-LABEL: 'splatvar_rotate_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_rotate_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_rotate_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatvar_rotate_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_rotate_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_rotate_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_rotate_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_rotate_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+  %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+  %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+  %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+  %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+  %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+  ret void
+}
+
+define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
+; SSE-LABEL: 'splatvar_rotate_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_rotate_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_rotate_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatvar_rotate_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatvar_rotate_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatvar_rotate_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_rotate_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_rotate_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_rotate_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_rotate_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+  %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+  %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+  %V8I16  = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+  %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+  %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+  ret void
+}
+
+define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
+; SSSE3-LABEL: 'splatvar_rotate_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'splatvar_rotate_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatvar_rotate_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatvar_rotate_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatvar_rotate_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatvar_rotate_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatvar_rotate_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatvar_rotate_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatvar_rotate_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatvar_rotate_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatvar_rotate_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+  %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+  %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+  %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+  %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+  %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+  ret void
+}
+
+;
+; Constant Unary Funnel Shifts (Rotates)
+;
+
+define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
+; SSE-LABEL: 'constant_rotate_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_rotate_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_rotate_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'constant_rotate_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_rotate_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_rotate_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_rotate_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_rotate_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+  %V2I64  = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+  %V4I64  = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+  %V8I64  = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+  ret void
+}
+
+define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
+; SSSE3-LABEL: 'constant_rotate_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_rotate_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_rotate_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_rotate_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'constant_rotate_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_rotate_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_rotate_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_rotate_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_rotate_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
+  %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+  %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+  %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+  ret void
+}
+
+define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
+; SSSE3-LABEL: 'constant_rotate_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_rotate_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_rotate_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_rotate_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'constant_rotate_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'constant_rotate_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'constant_rotate_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_rotate_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_rotate_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_rotate_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_rotate_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+  %V8I16  = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret void
+}
+
+define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
+; SSSE3-LABEL: 'constant_rotate_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'constant_rotate_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'constant_rotate_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'constant_rotate_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'constant_rotate_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'constant_rotate_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'constant_rotate_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'constant_rotate_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'constant_rotate_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'constant_rotate_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'constant_rotate_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+  %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret void
+}
+
+;
+; Uniform Constant Unary Funnel Shifts (Rotates)
+;
+
+define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
+; SSE-LABEL: 'splatconstant_rotate_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_rotate_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_rotate_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatconstant_rotate_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_rotate_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_rotate_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_rotate_i64'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_rotate_i64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I64    = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+  %V2I64  = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+  %V4I64  = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+  %V8I64  = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+  ret void
+}
+
+define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
+; SSE-LABEL: 'splatconstant_rotate_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_rotate_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_rotate_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'splatconstant_rotate_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_rotate_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_rotate_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_rotate_i32'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_rotate_i32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I32   = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
+  %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
+  %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+  %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
+  ret void
+}
+
+define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
+; SSE-LABEL: 'splatconstant_rotate_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_rotate_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_rotate_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatconstant_rotate_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatconstant_rotate_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatconstant_rotate_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_rotate_i16'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_rotate_i16'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_rotate_i16'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_rotate_i16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I16    = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+  %V8I16  = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  ret void
+}
+
+define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
+; SSE-LABEL: 'splatconstant_rotate_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'splatconstant_rotate_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'splatconstant_rotate_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'splatconstant_rotate_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'splatconstant_rotate_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'splatconstant_rotate_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLM-LABEL: 'splatconstant_rotate_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; GLM-LABEL: 'splatconstant_rotate_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BDVER2-LABEL: 'splatconstant_rotate_i8'
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BDVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'splatconstant_rotate_i8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+  %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  ret void
+}
+
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+declare i16 @llvm.fshr.i16(i16, i16, i16)
+declare i8  @llvm.fshr.i8 (i8,  i8,  i8)
+
+declare <2 x i64>  @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i32>  @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i16>  @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <16 x i8>  @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <4 x i64>  @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <8 x i32>  @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
+declare <32 x i8>  @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
+declare <8 x i64>  @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
+declare <64 x i8>  @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
diff --git a/test/Analysis/CostModel/X86/interleave-load-i32.ll b/test/Analysis/CostModel/X86/interleave-load-i32.ll
index 3c94d8c446f95605d9c9b4e637716d83fd2ee378..697b5000870aa88c2a6eb5f4b32bd545c4c70e01 100755
--- a/test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ b/test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s 
+; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s 
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Analysis/CostModel/X86/interleave-store-i32.ll b/test/Analysis/CostModel/X86/interleave-store-i32.ll
index e3076bfa294b8178f84bc22bbcd1fd6481f62510..d2901d0e37898cbae8e79437122a9996daf4abf7 100755
--- a/test/Analysis/CostModel/X86/interleave-store-i32.ll
+++ b/test/Analysis/CostModel/X86/interleave-store-i32.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Analysis/CostModel/X86/masked-intrinsic-cost-widen.ll b/test/Analysis/CostModel/X86/masked-intrinsic-cost-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7ca1c4ec6df0e701b3e3b8c1f49db25ba8e01f06
--- /dev/null
+++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost-widen.ll
@@ -0,0 +1,606 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze | FileCheck %s --check-prefix=AVX2
+; RUN: opt < %s -S -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=skylake -cost-model -analyze | FileCheck %s --check-prefix=SKL
+; RUN: opt < %s -S -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze | FileCheck %s --check-prefix=KNL
+; RUN: opt < %s -S -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze | FileCheck %s --check-prefix=SKX
+
+
+define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
+; AVX2-LABEL: 'test1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test1'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+; KNL-LABEL: 'test1'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+; SKX-LABEL: 'test1'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+  ret <2 x double> %res
+}
+
+define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
+; AVX2-LABEL: 'test2'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test2'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test2'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test2'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+  ret <4 x i32> %res
+}
+
+define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+; AVX2-LABEL: 'test3'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKL-LABEL: 'test3'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; KNL-LABEL: 'test3'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'test3'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
+  ret void
+}
+
+define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
+;
+; KNL-LABEL: 'test4'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
+;
+; SKX-LABEL: 'test4'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
+;
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+  ret <8 x float> %res
+}
+
+define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
+; AVX2-LABEL: 'test5'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKL-LABEL: 'test5'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; KNL-LABEL: 'test5'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'test5'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
+; AVX2-LABEL: 'test6'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKL-LABEL: 'test6'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; KNL-LABEL: 'test6'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'test6'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
+; AVX2-LABEL: 'test7'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+;
+; SKL-LABEL: 'test7'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+;
+; KNL-LABEL: 'test7'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+;
+; SKX-LABEL: 'test7'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+;
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+  ret <2 x float> %res
+}
+
+define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
+; AVX2-LABEL: 'test8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+;
+; SKL-LABEL: 'test8'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+;
+; KNL-LABEL: 'test8'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+;
+; SKX-LABEL: 'test8'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+;
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+  ret <2 x i32> %res
+}
+
+define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0)  {
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+; KNL-LABEL: 'test_gather_2f64'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+; SKX-LABEL: 'test_gather_2f64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+
+
+
+
+%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
+
+define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0)  {
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+
+
+
+
+%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0)  {
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+
+
+
+
+%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)
+
+define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; KNL-LABEL: 'test_gather_16f32_const_mask'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; SKX-LABEL: 'test_gather_16f32_const_mask'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) {
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; KNL-LABEL: 'test_gather_16f32_var_mask'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; SKX-LABEL: 'test_gather_16f32_var_mask'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; KNL-LABEL: 'test_gather_16f32_ra_var_mask'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; SKX-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; KNL-LABEL: 'test_gather_16f32_const_mask2'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+; SKX-LABEL: 'test_gather_16f32_const_mask2'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
+;
+  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_16i32'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_16i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+
+  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+  %imask = bitcast i16 %mask to <16 x i1>
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+  ret void
+}
+
+define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
+; AVX2-LABEL: 'test_scatter_8i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_8i32'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_8i32'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_8i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)
+
+define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
+; AVX2-LABEL: 'test_scatter_4i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_4i32'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) {
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
+;
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
+;
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> )
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
+declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
+declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
diff --git a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 18dc250de814394b4053826c75257ce8b58f5529..5f2a51c7f938bd8e387cb0746c599cb992fb13a2 100644
--- a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -112,22 +112,22 @@ define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %d
 define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
 ; AVX2-LABEL: 'test5'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SKL-LABEL: 'test5'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; KNL-LABEL: 'test5'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SKX-LABEL: 'test5'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -164,22 +164,22 @@ define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
 define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
 ; AVX2-LABEL: 'test7'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
 ;
 ; SKL-LABEL: 'test7'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
 ;
 ; KNL-LABEL: 'test7'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
 ;
 ; SKX-LABEL: 'test7'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
 ;
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
diff --git a/test/Analysis/CostModel/X86/min-legal-vector-width.ll b/test/Analysis/CostModel/X86/min-legal-vector-width.ll
new file mode 100644
index 0000000000000000000000000000000000000000..028ed87a9896084c83e536bbb94822b09cad38e2
--- /dev/null
+++ b/test/Analysis/CostModel/X86/min-legal-vector-width.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,VEC256,AVX
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512vl,+avx512bw,+avx512dq,+prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC256,SKX256
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512vl,+avx512bw,+avx512dq,-prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC512
+
+define void @zext256() "min-legal-vector-width"="256" {
+; VEC256-LABEL: 'zext256'
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A = zext <8 x i16> undef to <8 x i64>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i32> undef to <8 x i64>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %C = zext <16 x i8> undef to <16 x i32>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D = zext <16 x i16> undef to <16 x i32>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %E = zext <32 x i8> undef to <32 x i16>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VEC512-LABEL: 'zext256'
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A = zext <8 x i16> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i32> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %C = zext <16 x i8> undef to <16 x i32>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D = zext <16 x i16> undef to <16 x i32>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %E = zext <32 x i8> undef to <32 x i16>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A = zext <8 x i16> undef to <8 x i64>
+  %B = zext <8 x i32> undef to <8 x i64>
+  %C = zext <16 x i8> undef to <16 x i32>
+  %D = zext <16 x i16> undef to <16 x i32>
+  %E = zext <32 x i8> undef to <32 x i16>
+  ret void
+}
+
+define void @zext512() "min-legal-vector-width"="512" {
+; AVX-LABEL: 'zext512'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A = zext <8 x i16> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i32> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %C = zext <16 x i8> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D = zext <16 x i16> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %E = zext <32 x i8> undef to <32 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX256-LABEL: 'zext512'
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A = zext <8 x i16> undef to <8 x i64>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i32> undef to <8 x i64>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %C = zext <16 x i8> undef to <16 x i32>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D = zext <16 x i16> undef to <16 x i32>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %E = zext <32 x i8> undef to <32 x i16>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VEC512-LABEL: 'zext512'
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A = zext <8 x i16> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i32> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %C = zext <16 x i8> undef to <16 x i32>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D = zext <16 x i16> undef to <16 x i32>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %E = zext <32 x i8> undef to <32 x i16>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A = zext <8 x i16> undef to <8 x i64>
+  %B = zext <8 x i32> undef to <8 x i64>
+  %C = zext <16 x i8> undef to <16 x i32>
+  %D = zext <16 x i16> undef to <16 x i32>
+  %E = zext <32 x i8> undef to <32 x i16>
+  ret void
+}
+
+define void @sext256() "min-legal-vector-width"="256" {
+; VEC256-LABEL: 'sext256'
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %B = sext <8 x i16> undef to <8 x i64>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C = sext <8 x i32> undef to <8 x i64>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D = sext <16 x i8> undef to <16 x i32>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %E = sext <16 x i16> undef to <16 x i32>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F = sext <32 x i8> undef to <32 x i16>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VEC512-LABEL: 'sext256'
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %E = sext <16 x i16> undef to <16 x i32>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F = sext <32 x i8> undef to <32 x i16>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A = sext <8 x i8> undef to <8 x i64>
+  %B = sext <8 x i16> undef to <8 x i64>
+  %C = sext <8 x i32> undef to <8 x i64>
+  %D = sext <16 x i8> undef to <16 x i32>
+  %E = sext <16 x i16> undef to <16 x i32>
+  %F = sext <32 x i8> undef to <32 x i16>
+  ret void
+}
+
+define void @sext512() "min-legal-vector-width"="512" {
+; AVX-LABEL: 'sext512'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %B = sext <8 x i16> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C = sext <8 x i32> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D = sext <16 x i8> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %E = sext <16 x i16> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F = sext <32 x i8> undef to <32 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX256-LABEL: 'sext512'
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %E = sext <16 x i16> undef to <16 x i32>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F = sext <32 x i8> undef to <32 x i16>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VEC512-LABEL: 'sext512'
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %E = sext <16 x i16> undef to <16 x i32>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F = sext <32 x i8> undef to <32 x i16>
+; VEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %A = sext <8 x i8> undef to <8 x i64>
+  %B = sext <8 x i16> undef to <8 x i64>
+  %C = sext <8 x i32> undef to <8 x i64>
+  %D = sext <16 x i8> undef to <16 x i32>
+  %E = sext <16 x i16> undef to <16 x i32>
+  %F = sext <32 x i8> undef to <32 x i16>
+  ret void
+}
diff --git a/test/Analysis/CostModel/X86/reduce-add-widen.ll b/test/Analysis/CostModel/X86/reduce-add-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5230511b11eadc48430460ab2bb00c9ce924ef9e
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-add-widen.ll
@@ -0,0 +1,307 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-add.ll b/test/Analysis/CostModel/X86/reduce-add.ll
index 97f7a75ffa2c9e106c98213e0e99a974c5464275..bb762233385906d8a810ce008fecc99dd0dc1669 100644
--- a/test/Analysis/CostModel/X86/reduce-add.ll
+++ b/test/Analysis/CostModel/X86/reduce-add.ll
@@ -1,28 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
@@ -45,8 +45,8 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1  = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
@@ -59,19 +59,19 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i32'
@@ -94,8 +94,8 @@ define i32 @reduce_i32(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2  = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
@@ -108,22 +108,25 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
@@ -132,45 +135,51 @@ define i32 @reduce_i16(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
@@ -181,69 +190,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
@@ -264,12 +291,15 @@ declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-and-widen.ll b/test/Analysis/CostModel/X86/reduce-and-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6f9db08b7076850b95573153e96b1023ad63befb
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-and-widen.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1>)
diff --git a/test/Analysis/CostModel/X86/reduce-and.ll b/test/Analysis/CostModel/X86/reduce-and.ll
index 1dfa0953c286e2d5ceec9f3be962d39102f30da8..35caf489c0045ed8b79d31e7b85b0cf7bd0005bd 100644
--- a/test/Analysis/CostModel/X86/reduce-and.ll
+++ b/test/Analysis/CostModel/X86/reduce-and.ll
@@ -1,44 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1  = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
@@ -51,35 +51,35 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2  = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
@@ -92,69 +92,78 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
@@ -165,69 +174,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
@@ -239,90 +266,90 @@ define i32 @reduce_i8(i32 %arg) {
 define i32 @reduce_i1(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i1'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i1'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
@@ -348,12 +375,15 @@ declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-mul-widen.ll b/test/Analysis/CostModel/X86/reduce-mul-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1331f78f0b4b65e2680eb19adfff1c25e6c1e374
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-mul-widen.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i64'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i64'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i64'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 171 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 197 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 249 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 123 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 157 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-mul.ll b/test/Analysis/CostModel/X86/reduce-mul.ll
index 97e67a92f8f351cd8198deb74d735c1bcd313cc7..68f94cdfec9901692d4ab6e2494dd8f699b076ba 100644
--- a/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -1,60 +1,60 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 154 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i64'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i64'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1  = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
@@ -67,67 +67,67 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i32'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i32'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i32'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2  = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
@@ -140,69 +140,78 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
@@ -213,69 +222,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 275 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 171 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 197 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 249 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 241 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 123 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 157 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 197 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 197 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
@@ -296,12 +323,15 @@ declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-or-widen.ll b/test/Analysis/CostModel/X86/reduce-or-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8cdef17e438a4be3a367fd67c517b419495cb8aa
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-or-widen.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1>)
diff --git a/test/Analysis/CostModel/X86/reduce-or.ll b/test/Analysis/CostModel/X86/reduce-or.ll
index 13814ac2b76efb094876d36ce723ba3a7d09dc71..ed0ef620482c9c4ac25c3b454142b687a0e0db08 100644
--- a/test/Analysis/CostModel/X86/reduce-or.ll
+++ b/test/Analysis/CostModel/X86/reduce-or.ll
@@ -1,44 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1  = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
@@ -51,35 +51,35 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2  = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
@@ -92,69 +92,78 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
@@ -165,69 +174,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
@@ -239,90 +266,90 @@ define i32 @reduce_i8(i32 %arg) {
 define i32 @reduce_i1(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i1'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i1'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
@@ -348,12 +375,15 @@ declare i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-smax-widen.ll b/test/Analysis/CostModel/X86/reduce-smax-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..44b01d3707c8ce74607a9e28103558d4594628b2
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-smax-widen.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-smax.ll b/test/Analysis/CostModel/X86/reduce-smax.ll
index 5426c7f9c80b072d9df9ee54f5318c0fedd3891b..b4b9a3ba6379802b4c29809f953d94c51013e1fe 100644
--- a/test/Analysis/CostModel/X86/reduce-smax.ll
+++ b/test/Analysis/CostModel/X86/reduce-smax.ll
@@ -1,32 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
@@ -34,15 +34,15 @@ define i32 @reduce_i64(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
@@ -50,7 +50,7 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
@@ -67,19 +67,19 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i32'
@@ -124,22 +124,25 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
@@ -148,14 +151,16 @@ define i32 @reduce_i16(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
@@ -164,6 +169,7 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
@@ -172,14 +178,16 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
@@ -187,6 +195,7 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
@@ -197,69 +206,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
@@ -280,12 +307,15 @@ declare i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-smin-widen.ll b/test/Analysis/CostModel/X86/reduce-smin-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..18dd54cf026ed04f10eb0309e803c0608ef7bdee
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-smin-widen.ll
@@ -0,0 +1,314 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-smin.ll b/test/Analysis/CostModel/X86/reduce-smin.ll
index b8076a98513266b217ada124e11a1986815292c1..f420ad736ed35820afd271489c81be1b96f45c29 100644
--- a/test/Analysis/CostModel/X86/reduce-smin.ll
+++ b/test/Analysis/CostModel/X86/reduce-smin.ll
@@ -1,32 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
@@ -34,15 +34,15 @@ define i32 @reduce_i64(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
@@ -50,7 +50,7 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
@@ -67,19 +67,19 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i32'
@@ -124,22 +124,25 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
@@ -148,14 +151,16 @@ define i32 @reduce_i16(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
@@ -164,6 +169,7 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
@@ -172,14 +178,16 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
@@ -187,6 +195,7 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
@@ -197,69 +206,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
@@ -280,12 +307,15 @@ declare i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-umax-widen.ll b/test/Analysis/CostModel/X86/reduce-umax-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c48cfdf9778bec8855b978b5988e4f6c84249abe
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-umax-widen.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-umax.ll b/test/Analysis/CostModel/X86/reduce-umax.ll
index 6b947ebc225b6e077a809c22e0fce796359d776d..490b0e469b1a3264cfca75f7c25b2e48116eb610 100644
--- a/test/Analysis/CostModel/X86/reduce-umax.ll
+++ b/test/Analysis/CostModel/X86/reduce-umax.ll
@@ -1,32 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
@@ -34,15 +34,15 @@ define i32 @reduce_i64(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
@@ -50,7 +50,7 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
@@ -67,19 +67,19 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i32'
@@ -124,22 +124,25 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
@@ -148,14 +151,16 @@ define i32 @reduce_i16(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
@@ -164,6 +169,7 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
@@ -172,14 +178,16 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
@@ -187,6 +195,7 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
@@ -197,69 +206,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
@@ -280,12 +307,15 @@ declare i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-umin-widen.ll b/test/Analysis/CostModel/X86/reduce-umin-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b59e2f3712ed5d1f8ade8d789b9dbd84305b45a0
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-umin-widen.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-umin.ll b/test/Analysis/CostModel/X86/reduce-umin.ll
index 0fe9029bc826c41755be3c97abfadbaef19abc8a..f2f519c6023df5b4384c7c020d99bb1af28a4643 100644
--- a/test/Analysis/CostModel/X86/reduce-umin.ll
+++ b/test/Analysis/CostModel/X86/reduce-umin.ll
@@ -1,32 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
@@ -34,15 +34,15 @@ define i32 @reduce_i64(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
@@ -50,7 +50,7 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
@@ -67,19 +67,19 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i32'
@@ -124,22 +124,25 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
@@ -148,14 +151,16 @@ define i32 @reduce_i16(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
@@ -164,6 +169,7 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
@@ -172,14 +178,16 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
@@ -187,6 +195,7 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
@@ -197,69 +206,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
@@ -280,12 +307,15 @@ declare i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-xor-widen.ll b/test/Analysis/CostModel/X86/reduce-xor-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ff0d276c2213e3bf83a6189b25eee214f6cbf016
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-xor-widen.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2   = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1>)
diff --git a/test/Analysis/CostModel/X86/reduce-xor.ll b/test/Analysis/CostModel/X86/reduce-xor.ll
index f8e82d05aa7ccaecaba8cb75ee734d7c86386a05..8f7dfdd02fb64d0f16cabd09c74050b849afefe5 100644
--- a/test/Analysis/CostModel/X86/reduce-xor.ll
+++ b/test/Analysis/CostModel/X86/reduce-xor.ll
@@ -1,44 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1  = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
@@ -51,35 +51,35 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2  = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
@@ -92,69 +92,78 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2  = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
   %V8  = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
   %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
@@ -165,69 +174,87 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %V2   = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef)
   %V8   = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
   %V16  = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
   %V32  = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
@@ -239,90 +266,90 @@ define i32 @reduce_i8(i32 %arg) {
 define i32 @reduce_i1(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i1'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i1'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
@@ -348,12 +375,15 @@ declare i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8>)
 declare i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll
index 04e40d72246944b136af64e5df051fbe1914fc61..0acb16c0cd9e3065d789c55cb4906cdaf4330082 100644
--- a/test/Analysis/CostModel/X86/reduction.ll
+++ b/test/Analysis/CostModel/X86/reduction.ll
@@ -15,7 +15,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'reduction_cost_float'
@@ -23,7 +23,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'reduction_cost_float'
@@ -52,35 +52,15 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 }
 
 define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
-; SSE2-LABEL: 'reduction_cost_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
-;
-; SSSE3-LABEL: 'reduction_cost_int'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
-;
-; SSE42-LABEL: 'reduction_cost_int'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SSE-LABEL: 'reduction_cost_int'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; AVX1-LABEL: 'reduction_cost_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -127,7 +107,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -138,7 +118,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -188,7 +168,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -199,7 +179,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -248,7 +228,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -258,7 +238,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -300,13 +280,13 @@ define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1
 ; SSE2-LABEL: 'no_pairwise_reduction2double'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction2double'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction2double'
@@ -334,7 +314,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction4float'
@@ -342,7 +322,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction4float'
@@ -376,7 +356,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction4double'
@@ -384,7 +364,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction4double'
@@ -428,7 +408,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction8float'
@@ -438,7 +418,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction8float'
@@ -486,13 +466,13 @@ define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
 ; SSE2-LABEL: 'no_pairwise_reduction2i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction2i64'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction2i64'
@@ -520,7 +500,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction4i32'
@@ -528,7 +508,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction4i32'
@@ -557,29 +537,13 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 }
 
 define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
-; SSE2-LABEL: 'no_pairwise_reduction4i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
-;
-; SSSE3-LABEL: 'no_pairwise_reduction4i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
-;
-; SSE42-LABEL: 'no_pairwise_reduction4i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SSE-LABEL: 'no_pairwise_reduction4i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction4i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -614,7 +578,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction8i16'
@@ -624,7 +588,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction8i16'
@@ -659,35 +623,15 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 }
 
 define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
-; SSE2-LABEL: 'no_pairwise_reduction8i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
-;
-; SSSE3-LABEL: 'no_pairwise_reduction8i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
-;
-; SSE42-LABEL: 'no_pairwise_reduction8i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SSE-LABEL: 'no_pairwise_reduction8i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction8i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -725,14 +669,14 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction2double'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction2double'
@@ -765,7 +709,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction4float'
@@ -775,7 +719,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction4float'
@@ -817,7 +761,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction4double'
@@ -827,7 +771,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction4double'
@@ -882,7 +826,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8float'
@@ -895,7 +839,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8float'
@@ -956,14 +900,14 @@ define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction2i64'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction2i64'
@@ -996,7 +940,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction4i32'
@@ -1006,7 +950,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction4i32'
@@ -1041,35 +985,15 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 }
 
 define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
-; SSE2-LABEL: 'pairwise_reduction4i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
-;
-; SSSE3-LABEL: 'pairwise_reduction4i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
-;
-; SSE42-LABEL: 'pairwise_reduction4i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SSE-LABEL: 'pairwise_reduction4i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction4i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -1113,7 +1037,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8i16'
@@ -1126,7 +1050,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8i16'
@@ -1180,7 +1104,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8i32'
@@ -1193,7 +1117,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8i32'
diff --git a/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
index 5bb2e1a756d619c4d8fe75eb7cd9c805c54a596d..f74b30cf1d37f52a4c99e43225d4a99c8da054e7 100644
--- a/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
+++ b/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
@@ -17,28 +17,52 @@
 ;
 
 define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
-; CHECK-LABEL: 'test_vXf64'
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE-LABEL: 'test_vXf64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXf64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BTVER2-LABEL: 'test_vXf64'
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -54,28 +78,52 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
 }
 
 define void @test_vXfi64(<4 x i64> %src256, <8 x i64> %src512) {
-; CHECK-LABEL: 'test_vXfi64'
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE-LABEL: 'test_vXfi64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXfi64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXfi64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BTVER2-LABEL: 'test_vXfi64'
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
diff --git a/test/Analysis/CostModel/X86/sitofp-widen.ll b/test/Analysis/CostModel/X86/sitofp-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3672f393c49e4c319b0ef2e1a8ecb0b6cd1bad64
--- /dev/null
+++ b/test/Analysis/CostModel/X86/sitofp-widen.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+;
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+define i32 @sitofp_i8_double() {
+; SSE-LABEL: 'sitofp_i8_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'sitofp_i8_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'sitofp_i8_double'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sitofp_i8_double'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i8_f64 = sitofp i8 undef to double
+  %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+  %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+  %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+  ret i32 undef
+}
+
+define i32 @sitofp_i16_double() {
+; SSE-LABEL: 'sitofp_i16_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'sitofp_i16_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'sitofp_i16_double'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sitofp_i16_double'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i16_f64 = sitofp i16 undef to double
+  %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+  %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+  %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+  ret i32 undef
+}
+
+define i32 @sitofp_i32_double() {
+; SSE-LABEL: 'sitofp_i32_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'sitofp_i32_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'sitofp_i32_double'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sitofp_i32_double'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i32_f64 = sitofp i32 undef to double
+  %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+  %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+  %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+  ret i32 undef
+}
+
+define i32 @sitofp_i64_double() {
+; SSE-LABEL: 'sitofp_i64_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'sitofp_i64_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'sitofp_i64_double'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'sitofp_i64_double'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sitofp_i64_double'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i64_f64 = sitofp i64 undef to double
+  %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+  %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+  %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+  ret i32 undef
+}
+
+define i32 @sitofp_i8_float() {
+; SSE-LABEL: 'sitofp_i8_float'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'sitofp_i8_float'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'sitofp_i8_float'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sitofp_i8_float'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i8_f32 = sitofp i8 undef to float
+  %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+  %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+  %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+  ret i32 undef
+}
+
+define i32 @sitofp_i16_float() {
+; SSE-LABEL: 'sitofp_i16_float'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'sitofp_i16_float'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'sitofp_i16_float'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sitofp_i16_float'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i16_f32 = sitofp i16 undef to float
+  %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+  %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+  %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+  ret i32 undef
+}
+
+define i32 @sitofp_i32_float() {
+; SSE-LABEL: 'sitofp_i32_float'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'sitofp_i32_float'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'sitofp_i32_float'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sitofp_i32_float'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i32_f32 = sitofp i32 undef to float
+  %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+  %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+  %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+  ret i32 undef
+}
+
+define i32 @sitofp_i64_float() {
+; SSE-LABEL: 'sitofp_i64_float'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'sitofp_i64_float'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'sitofp_i64_float'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'sitofp_i64_float'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'sitofp_i64_float'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i64_f32 = sitofp i64 undef to float
+  %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+  %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+  %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+  %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/strided-load-i16.ll b/test/Analysis/CostModel/X86/strided-load-i16.ll
index ef786e7509503d1c3ee57b2bc64c8e1df837846e..8061d8abcd791e17ac139f42567d2167dec928be 100755
--- a/test/Analysis/CostModel/X86/strided-load-i16.ll
+++ b/test/Analysis/CostModel/X86/strided-load-i16.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512bw --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Analysis/CostModel/X86/strided-load-i32.ll b/test/Analysis/CostModel/X86/strided-load-i32.ll
index fad9def2bf7849fa02009cd7cfaa419658725209..9f9447a63c87b2c998da13074415cff2b5d19ad0 100755
--- a/test/Analysis/CostModel/X86/strided-load-i32.ll
+++ b/test/Analysis/CostModel/X86/strided-load-i32.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Analysis/CostModel/X86/strided-load-i64.ll b/test/Analysis/CostModel/X86/strided-load-i64.ll
index a7f593017d6035c5d226e8eae884a73e04d60304..8b5091b3b6dd6d746eeccfe787bb19e1d8240c9f 100755
--- a/test/Analysis/CostModel/X86/strided-load-i64.ll
+++ b/test/Analysis/CostModel/X86/strided-load-i64.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Analysis/CostModel/X86/strided-load-i8.ll b/test/Analysis/CostModel/X86/strided-load-i8.ll
index 72c9398fe2d449052d2ed7b804ee63f03adceb4b..3c88b38edec7de7510902464041c3bb541a972d2 100755
--- a/test/Analysis/CostModel/X86/strided-load-i8.ll
+++ b/test/Analysis/CostModel/X86/strided-load-i8.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512bw --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Analysis/CostModel/X86/testshiftashr-widen.ll b/test/Analysis/CostModel/X86/testshiftashr-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dcd70ecf1599b7e63f6ea7d0f176690cf38ee9e6
--- /dev/null
+++ b/test/Analysis/CostModel/X86/testshiftashr-widen.ll
@@ -0,0 +1,531 @@
+; RUN: llc -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s
+; RUN: opt -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
+
+%shifttype = type <2 x i16>
+define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
+entry:
+  ; SSE2-LABEL: shift2i16
+  ; SSE2: cost of 32 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift2i16
+  ; SSE2-CODEGEN: psraw
+
+  %0 = ashr %shifttype %a , %b
+  ret %shifttype %0
+}
+
+%shifttype4i16 = type <4 x i16>
+define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
+entry:
+  ; SSE2-LABEL: shift4i16
+  ; SSE2: cost of 32 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift4i16
+  ; SSE2-CODEGEN: psraw
+
+  %0 = ashr %shifttype4i16 %a , %b
+  ret %shifttype4i16 %0
+}
+
+%shifttype8i16 = type <8 x i16>
+define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
+entry:
+  ; SSE2-LABEL: shift8i16
+  ; SSE2: cost of 32 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift8i16
+  ; SSE2-CODEGEN: psraw
+
+  %0 = ashr %shifttype8i16 %a , %b
+  ret %shifttype8i16 %0
+}
+
+%shifttype16i16 = type <16 x i16>
+define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
+entry:
+  ; SSE2-LABEL: shift16i16
+  ; SSE2: cost of 64 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift16i16
+  ; SSE2-CODEGEN: psraw
+
+  %0 = ashr %shifttype16i16 %a , %b
+  ret %shifttype16i16 %0
+}
+
+%shifttype32i16 = type <32 x i16>
+define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
+entry:
+  ; SSE2-LABEL: shift32i16
+  ; SSE2: cost of 128 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift32i16
+  ; SSE2-CODEGEN: psraw
+
+  %0 = ashr %shifttype32i16 %a , %b
+  ret %shifttype32i16 %0
+}
+
+%shifttype2i32 = type <2 x i32>
+define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
+entry:
+  ; SSE2-LABEL: shift2i32
+  ; SSE2: cost of 16 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift2i32
+  ; SSE2-CODEGEN: psrad
+
+  %0 = ashr %shifttype2i32 %a , %b
+  ret %shifttype2i32 %0
+}
+
+%shifttype4i32 = type <4 x i32>
+define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
+entry:
+  ; SSE2-LABEL: shift4i32
+  ; SSE2: cost of 16 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift4i32
+  ; SSE2-CODEGEN: psrad
+
+  %0 = ashr %shifttype4i32 %a , %b
+  ret %shifttype4i32 %0
+}
+
+%shifttype8i32 = type <8 x i32>
+define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
+entry:
+  ; SSE2-LABEL: shift8i32
+  ; SSE2: cost of 32 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift8i32
+  ; SSE2-CODEGEN: psrad
+
+  %0 = ashr %shifttype8i32 %a , %b
+  ret %shifttype8i32 %0
+}
+
+%shifttype16i32 = type <16 x i32>
+define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
+entry:
+  ; SSE2-LABEL: shift16i32
+  ; SSE2: cost of 64 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift16i32
+  ; SSE2-CODEGEN: psrad
+
+  %0 = ashr %shifttype16i32 %a , %b
+  ret %shifttype16i32 %0
+}
+
+%shifttype32i32 = type <32 x i32>
+define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
+entry:
+  ; SSE2-LABEL: shift32i32
+  ; SSE2: cost of 128 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift32i32
+  ; SSE2-CODEGEN: psrad
+
+  %0 = ashr %shifttype32i32 %a , %b
+  ret %shifttype32i32 %0
+}
+
+%shifttype2i64 = type <2 x i64>
+define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
+entry:
+  ; SSE2-LABEL: shift2i64
+  ; SSE2: cost of 12 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift2i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = ashr %shifttype2i64 %a , %b
+  ret %shifttype2i64 %0
+}
+
+%shifttype4i64 = type <4 x i64>
+define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
+entry:
+  ; SSE2-LABEL: shift4i64
+  ; SSE2: cost of 24 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift4i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = ashr %shifttype4i64 %a , %b
+  ret %shifttype4i64 %0
+}
+
+%shifttype8i64 = type <8 x i64>
+define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
+entry:
+  ; SSE2-LABEL: shift8i64
+  ; SSE2: cost of 48 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift8i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = ashr %shifttype8i64 %a , %b
+  ret %shifttype8i64 %0
+}
+
+%shifttype16i64 = type <16 x i64>
+define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
+entry:
+  ; SSE2-LABEL: shift16i64
+  ; SSE2: cost of 96 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift16i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = ashr %shifttype16i64 %a , %b
+  ret %shifttype16i64 %0
+}
+
+%shifttype32i64 = type <32 x i64>
+define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
+entry:
+  ; SSE2-LABEL: shift32i64
+  ; SSE2: cost of 192 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift32i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = ashr %shifttype32i64 %a , %b
+  ret %shifttype32i64 %0
+}
+
+%shifttype2i8 = type <2 x i8>
+define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
+entry:
+  ; SSE2-LABEL: shift2i8
+  ; SSE2: cost of 54 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift2i8
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = ashr %shifttype2i8 %a , %b
+  ret %shifttype2i8 %0
+}
+
+%shifttype4i8 = type <4 x i8>
+define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
+entry:
+  ; SSE2-LABEL: shift4i8
+  ; SSE2: cost of 54 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift4i8
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = ashr %shifttype4i8 %a , %b
+  ret %shifttype4i8 %0
+}
+
+%shifttype8i8 = type <8 x i8>
+define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
+entry:
+  ; SSE2-LABEL: shift8i8
+  ; SSE2: cost of 54 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift8i8
+  ; SSE2-CODEGEN: psraw
+
+  %0 = ashr %shifttype8i8 %a , %b
+  ret %shifttype8i8 %0
+}
+
+%shifttype16i8 = type <16 x i8>
+define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
+entry:
+  ; SSE2-LABEL: shift16i8
+  ; SSE2: cost of 54 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift16i8
+  ; SSE2-CODEGEN: psraw
+
+  %0 = ashr %shifttype16i8 %a , %b
+  ret %shifttype16i8 %0
+}
+
+%shifttype32i8 = type <32 x i8>
+define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
+entry:
+  ; SSE2-LABEL: shift32i8
+  ; SSE2: cost of 108 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift32i8
+  ; SSE2-CODEGEN: psraw
+
+  %0 = ashr %shifttype32i8 %a , %b
+  ret %shifttype32i8 %0
+}
+
+; Test shift by a constant a value.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2-LABEL: shift2i16const
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift2i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2-LABEL: shift4i16const
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift4i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2-LABEL: shift8i16const
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift8i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                  i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2-LABEL: shift16i16const
+  ; SSE2: cost of 2 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift16i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                        %shifttypec32i16 %b) {
+entry:
+  ; SSE2-LABEL: shift32i16const
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift32i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2-LABEL: shift2i32c
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift2i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2-LABEL: shift4i32c
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift4i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2-LABEL: shift8i32c
+  ; SSE2: cost of 2 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift8i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                  i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2-LABEL: shift16i32c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift16i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2-LABEL: shift32i32c
+  ; getTypeConversion fails here and promotes this to a i64.
+  ; SSE2: cost of 8 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift32i32c
+  ; SSE2-CODEGEN: psrad $3
+  %0 = ashr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2-LABEL: shift2i64c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift2i64c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2-LABEL: shift4i64c
+  ; SSE2: cost of 8 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift4i64c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2-LABEL: shift8i64c
+  ; SSE2: cost of 16 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift8i64c
+  ; SSE2-CODEGEN: psrad $3
+
+ %0 = ashr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                 i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2-LABEL: shift16i64c
+  ; SSE2: cost of 32 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift16i64c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2-LABEL: shift32i64c
+  ; SSE2: cost of 64 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift32i64c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2-LABEL: shift2i8c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift2i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2-LABEL: shift4i8c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift4i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2-LABEL: shift8i8c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift8i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                 i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2-LABEL: shift16i8c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift16i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2-LABEL: shift32i8c
+  ; SSE2: cost of 8 {{.*}} ashr
+  ; SSE2-CODEGEN-LABEL: shift32i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
+
diff --git a/test/Analysis/CostModel/X86/testshiftashr.ll b/test/Analysis/CostModel/X86/testshiftashr.ll
index 864ea2e5559e5ebca13f0e6152b035d64d5af245..7f588a7488909e8ab9ff9365340ea8d259e3b776 100644
--- a/test/Analysis/CostModel/X86/testshiftashr.ll
+++ b/test/Analysis/CostModel/X86/testshiftashr.ll
@@ -261,7 +261,7 @@ entry:
   ; SSE2-LABEL: shift4i16const
   ; SSE2: cost of 1 {{.*}} ashr
   ; SSE2-CODEGEN-LABEL: shift4i16const
-  ; SSE2-CODEGEN: psrad $3
+  ; SSE2-CODEGEN: psrad $19
 
   %0 = ashr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
   ret %shifttypec4i16 %0
@@ -476,7 +476,7 @@ entry:
   ; SSE2-LABEL: shift4i8c
   ; SSE2: cost of 1 {{.*}} ashr
   ; SSE2-CODEGEN-LABEL: shift4i8c
-  ; SSE2-CODEGEN: psrad $3
+  ; SSE2-CODEGEN: psrad $27
 
   %0 = ashr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
   ret %shifttypec4i8 %0
@@ -488,7 +488,7 @@ entry:
   ; SSE2-LABEL: shift8i8c
   ; SSE2: cost of 1 {{.*}} ashr
   ; SSE2-CODEGEN-LABEL: shift8i8c
-  ; SSE2-CODEGEN: psraw $3
+  ; SSE2-CODEGEN: psraw $11
 
   %0 = ashr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/CostModel/X86/testshiftlshr-widen.ll b/test/Analysis/CostModel/X86/testshiftlshr-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..96a0ef75c066069f4ce8aef03374023436224f8c
--- /dev/null
+++ b/test/Analysis/CostModel/X86/testshiftlshr-widen.ll
@@ -0,0 +1,529 @@
+; RUN: llc -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s
+; RUN: opt -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
+
+%shifttype = type <2 x i16>
+define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
+entry:
+  ; SSE2-LABEL: shift2i16
+  ; SSE2: cost of 32 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift2i16
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype %a , %b
+  ret %shifttype %0
+}
+
+%shifttype4i16 = type <4 x i16>
+define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
+entry:
+  ; SSE2-LABEL: shift4i16
+  ; SSE2: cost of 32 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift4i16
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype4i16 %a , %b
+  ret %shifttype4i16 %0
+}
+
+%shifttype8i16 = type <8 x i16>
+define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
+entry:
+  ; SSE2-LABEL: shift8i16
+  ; SSE2: cost of 32 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift8i16
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype8i16 %a , %b
+  ret %shifttype8i16 %0
+}
+
+%shifttype16i16 = type <16 x i16>
+define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
+entry:
+  ; SSE2-LABEL: shift16i16
+  ; SSE2: cost of 64 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift16i16
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype16i16 %a , %b
+  ret %shifttype16i16 %0
+}
+
+%shifttype32i16 = type <32 x i16>
+define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
+entry:
+  ; SSE2-LABEL: shift32i16
+  ; SSE2: cost of 128 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift32i16
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype32i16 %a , %b
+  ret %shifttype32i16 %0
+}
+
+%shifttype2i32 = type <2 x i32>
+define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
+entry:
+  ; SSE2-LABEL: shift2i32
+  ; SSE2: cost of 16 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift2i32
+  ; SSE2-CODEGEN: psrld
+
+  %0 = lshr %shifttype2i32 %a , %b
+  ret %shifttype2i32 %0
+}
+
+%shifttype4i32 = type <4 x i32>
+define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
+entry:
+  ; SSE2-LABEL: shift4i32
+  ; SSE2: cost of 16 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift4i32
+  ; SSE2-CODEGEN: psrld
+
+  %0 = lshr %shifttype4i32 %a , %b
+  ret %shifttype4i32 %0
+}
+
+%shifttype8i32 = type <8 x i32>
+define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
+entry:
+  ; SSE2-LABEL: shift8i32
+  ; SSE2: cost of 32 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift8i32
+  ; SSE2-CODEGEN: psrld
+
+  %0 = lshr %shifttype8i32 %a , %b
+  ret %shifttype8i32 %0
+}
+
+%shifttype16i32 = type <16 x i32>
+define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
+entry:
+  ; SSE2-LABEL: shift16i32
+  ; SSE2: cost of 64 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift16i32
+  ; SSE2-CODEGEN: psrld
+
+  %0 = lshr %shifttype16i32 %a , %b
+  ret %shifttype16i32 %0
+}
+
+%shifttype32i32 = type <32 x i32>
+define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
+entry:
+  ; SSE2-LABEL: shift32i32
+  ; SSE2: cost of 128 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift32i32
+  ; SSE2-CODEGEN: psrld
+
+  %0 = lshr %shifttype32i32 %a , %b
+  ret %shifttype32i32 %0
+}
+
+%shifttype2i64 = type <2 x i64>
+define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
+entry:
+  ; SSE2-LABEL: shift2i64
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift2i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = lshr %shifttype2i64 %a , %b
+  ret %shifttype2i64 %0
+}
+
+%shifttype4i64 = type <4 x i64>
+define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
+entry:
+  ; SSE2-LABEL: shift4i64
+  ; SSE2: cost of 8 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift4i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = lshr %shifttype4i64 %a , %b
+  ret %shifttype4i64 %0
+}
+
+%shifttype8i64 = type <8 x i64>
+define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
+entry:
+  ; SSE2-LABEL: shift8i64
+  ; SSE2: cost of 16 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift8i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = lshr %shifttype8i64 %a , %b
+  ret %shifttype8i64 %0
+}
+
+%shifttype16i64 = type <16 x i64>
+define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
+entry:
+  ; SSE2-LABEL: shift16i64
+  ; SSE2: cost of 32 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift16i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = lshr %shifttype16i64 %a , %b
+  ret %shifttype16i64 %0
+}
+
+%shifttype32i64 = type <32 x i64>
+define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
+entry:
+  ; SSE2-LABEL: shift32i64
+  ; SSE2: cost of 64 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift32i64
+  ; SSE2-CODEGEN: psrlq
+
+  %0 = lshr %shifttype32i64 %a , %b
+  ret %shifttype32i64 %0
+}
+
+%shifttype2i8 = type <2 x i8>
+define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
+entry:
+  ; SSE2-LABEL: shift2i8
+  ; SSE2: cost of 26 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift2i8
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype2i8 %a , %b
+  ret %shifttype2i8 %0
+}
+
+%shifttype4i8 = type <4 x i8>
+define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
+entry:
+  ; SSE2-LABEL: shift4i8
+  ; SSE2: cost of 26 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift4i8
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype4i8 %a , %b
+  ret %shifttype4i8 %0
+}
+
+%shifttype8i8 = type <8 x i8>
+define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
+entry:
+  ; SSE2-LABEL: shift8i8
+  ; SSE2: cost of 26 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift8i8
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype8i8 %a , %b
+  ret %shifttype8i8 %0
+}
+
+%shifttype16i8 = type <16 x i8>
+define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
+entry:
+  ; SSE2-LABEL: shift16i8
+  ; SSE2: cost of 26 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift16i8
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype16i8 %a , %b
+  ret %shifttype16i8 %0
+}
+
+%shifttype32i8 = type <32 x i8>
+define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
+entry:
+  ; SSE2-LABEL: shift32i8
+  ; SSE2: cost of 52 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift32i8
+  ; SSE2-CODEGEN: psrlw
+
+  %0 = lshr %shifttype32i8 %a , %b
+  ret %shifttype32i8 %0
+}
+
+; Test shift by a constant vector.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2-LABEL: shift2i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift2i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2-LABEL: shift4i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift4i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2-LABEL: shift8i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift8i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                  i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2-LABEL: shift16i16const
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift16i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                        %shifttypec32i16 %b) {
+entry:
+  ; SSE2-LABEL: shift32i16const
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift32i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2-LABEL: shift2i32c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift2i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2-LABEL: shift4i32c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift4i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2-LABEL: shift8i32c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift8i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                  i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2-LABEL: shift16i32c
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift16i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2-LABEL: shift32i32c
+  ; SSE2: cost of 8 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift32i32c
+  ; SSE2-CODEGEN: psrld $3
+  %0 = lshr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2-LABEL: shift2i64c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift2i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2-LABEL: shift4i64c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift4i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2-LABEL: shift8i64c
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift8i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+ %0 = lshr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                 i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2-LABEL: shift16i64c
+  ; SSE2: cost of 8 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift16i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2-LABEL: shift32i64c
+  ; SSE2: cost of 16 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift32i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2-LABEL: shift2i8c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift2i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2-LABEL: shift4i8c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift4i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2-LABEL: shift8i8c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift8i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                 i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2-LABEL: shift16i8c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift16i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2-LABEL: shift32i8c
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN-LABEL: shift32i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
diff --git a/test/Analysis/CostModel/X86/testshiftshl-widen.ll b/test/Analysis/CostModel/X86/testshiftshl-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..28276e5835fd44b01da83b708c499dccb03e4a67
--- /dev/null
+++ b/test/Analysis/CostModel/X86/testshiftshl-widen.ll
@@ -0,0 +1,529 @@
+; RUN: llc -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s
+; RUN: opt -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
+
+%shifttype = type <2 x i16>
+define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
+entry:
+  ; SSE2-LABEL: shift2i16
+  ; SSE2: cost of 32 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift2i16
+  ; SSE2-CODEGEN: pmullw
+
+  %0 = shl %shifttype %a , %b
+  ret %shifttype %0
+}
+
+%shifttype4i16 = type <4 x i16>
+define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
+entry:
+  ; SSE2-LABEL: shift4i16
+  ; SSE2: cost of 32 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift4i16
+  ; SSE2-CODEGEN: pmullw
+
+  %0 = shl %shifttype4i16 %a , %b
+  ret %shifttype4i16 %0
+}
+
+%shifttype8i16 = type <8 x i16>
+define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
+entry:
+  ; SSE2-LABEL: shift8i16
+  ; SSE2: cost of 32 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift8i16
+  ; SSE2-CODEGEN: pmullw
+
+  %0 = shl %shifttype8i16 %a , %b
+  ret %shifttype8i16 %0
+}
+
+%shifttype16i16 = type <16 x i16>
+define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
+entry:
+  ; SSE2-LABEL: shift16i16
+  ; SSE2: cost of 64 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift16i16
+  ; SSE2-CODEGEN: pmullw
+
+  %0 = shl %shifttype16i16 %a , %b
+  ret %shifttype16i16 %0
+}
+
+%shifttype32i16 = type <32 x i16>
+define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
+entry:
+  ; SSE2-LABEL: shift32i16
+  ; SSE2: cost of 128 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift32i16
+  ; SSE2-CODEGEN: pmullw
+
+  %0 = shl %shifttype32i16 %a , %b
+  ret %shifttype32i16 %0
+}
+
+%shifttype2i32 = type <2 x i32>
+define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
+entry:
+  ; SSE2-LABEL: shift2i32
+  ; SSE2: cost of 10 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift2i32
+  ; SSE2-CODEGEN: pmuludq
+
+  %0 = shl %shifttype2i32 %a , %b
+  ret %shifttype2i32 %0
+}
+
+%shifttype4i32 = type <4 x i32>
+define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
+entry:
+  ; SSE2-LABEL: shift4i32
+  ; SSE2: cost of 10 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift4i32
+  ; SSE2-CODEGEN: pmuludq
+
+  %0 = shl %shifttype4i32 %a , %b
+  ret %shifttype4i32 %0
+}
+
+%shifttype8i32 = type <8 x i32>
+define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
+entry:
+  ; SSE2-LABEL: shift8i32
+  ; SSE2: cost of 20 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift8i32
+  ; SSE2-CODEGEN: pmuludq
+
+  %0 = shl %shifttype8i32 %a , %b
+  ret %shifttype8i32 %0
+}
+
+%shifttype16i32 = type <16 x i32>
+define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
+entry:
+  ; SSE2-LABEL: shift16i32
+  ; SSE2: cost of 40 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift16i32
+  ; SSE2-CODEGEN: pmuludq
+
+  %0 = shl %shifttype16i32 %a , %b
+  ret %shifttype16i32 %0
+}
+
+%shifttype32i32 = type <32 x i32>
+define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
+entry:
+  ; SSE2-LABEL: shift32i32
+  ; SSE2: cost of 80 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift32i32
+  ; SSE2-CODEGEN: pmuludq
+
+  %0 = shl %shifttype32i32 %a , %b
+  ret %shifttype32i32 %0
+}
+
+%shifttype2i64 = type <2 x i64>
+define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
+entry:
+  ; SSE2-LABEL: shift2i64
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift2i64
+  ; SSE2-CODEGEN: psllq
+
+  %0 = shl %shifttype2i64 %a , %b
+  ret %shifttype2i64 %0
+}
+
+%shifttype4i64 = type <4 x i64>
+define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
+entry:
+  ; SSE2-LABEL: shift4i64
+  ; SSE2: cost of 8 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift4i64
+  ; SSE2-CODEGEN: psllq
+
+  %0 = shl %shifttype4i64 %a , %b
+  ret %shifttype4i64 %0
+}
+
+%shifttype8i64 = type <8 x i64>
+define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
+entry:
+  ; SSE2-LABEL: shift8i64
+  ; SSE2: cost of 16 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift8i64
+  ; SSE2-CODEGEN: psllq
+
+  %0 = shl %shifttype8i64 %a , %b
+  ret %shifttype8i64 %0
+}
+
+%shifttype16i64 = type <16 x i64>
+define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
+entry:
+  ; SSE2-LABEL: shift16i64
+  ; SSE2: cost of 32 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift16i64
+  ; SSE2-CODEGEN: psllq
+
+  %0 = shl %shifttype16i64 %a , %b
+  ret %shifttype16i64 %0
+}
+
+%shifttype32i64 = type <32 x i64>
+define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
+entry:
+  ; SSE2-LABEL: shift32i64
+  ; SSE2: cost of 64 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift32i64
+  ; SSE2-CODEGEN: psllq
+
+  %0 = shl %shifttype32i64 %a , %b
+  ret %shifttype32i64 %0
+}
+
+%shifttype2i8 = type <2 x i8>
+define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
+entry:
+  ; SSE2-LABEL: shift2i8
+  ; SSE2: cost of 26 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift2i8
+  ; SSE2-CODEGEN: psllw
+
+  %0 = shl %shifttype2i8 %a , %b
+  ret %shifttype2i8 %0
+}
+
+%shifttype4i8 = type <4 x i8>
+define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
+entry:
+  ; SSE2-LABEL: shift4i8
+  ; SSE2: cost of 26 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift4i8
+  ; SSE2-CODEGEN: psllw
+
+  %0 = shl %shifttype4i8 %a , %b
+  ret %shifttype4i8 %0
+}
+
+%shifttype8i8 = type <8 x i8>
+define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
+entry:
+  ; SSE2-LABEL: shift8i8
+  ; SSE2: cost of 26 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift8i8
+  ; SSE2-CODEGEN: psllw
+
+  %0 = shl %shifttype8i8 %a , %b
+  ret %shifttype8i8 %0
+}
+
+%shifttype16i8 = type <16 x i8>
+define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
+entry:
+  ; SSE2-LABEL: shift16i8
+  ; SSE2: cost of 26 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift16i8
+  ; SSE2-CODEGEN: psllw
+
+  %0 = shl %shifttype16i8 %a , %b
+  ret %shifttype16i8 %0
+}
+
+%shifttype32i8 = type <32 x i8>
+define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
+entry:
+  ; SSE2-LABEL: shift32i8
+  ; SSE2: cost of 52 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift32i8
+  ; SSE2-CODEGEN: psllw
+
+  %0 = shl %shifttype32i8 %a , %b
+  ret %shifttype32i8 %0
+}
+
+; Test shift by a constant vector.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2-LABEL: shift2i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift2i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2-LABEL: shift4i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift4i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2-LABEL: shift8i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift8i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                  i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2-LABEL: shift16i16const
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift16i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                        %shifttypec32i16 %b) {
+entry:
+  ; SSE2-LABEL: shift32i16const
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift32i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2-LABEL: shift2i32c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift2i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2-LABEL: shift4i32c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift4i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2-LABEL: shift8i32c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift8i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                  i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2-LABEL: shift16i32c
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift16i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2-LABEL: shift32i32c
+  ; SSE2: cost of 8 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift32i32c
+  ; SSE2-CODEGEN: pslld $3
+  %0 = shl %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2-LABEL: shift2i64c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift2i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2-LABEL: shift4i64c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift4i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2-LABEL: shift8i64c
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift8i64c
+  ; SSE2-CODEGEN: psllq $3
+
+ %0 = shl %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                 i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2-LABEL: shift16i64c
+  ; SSE2: cost of 8 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift16i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2-LABEL: shift32i64c
+  ; SSE2: cost of 16 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift32i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2-LABEL: shift2i8c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift2i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2-LABEL: shift4i8c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift4i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2-LABEL: shift8i8c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift8i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                 i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2-LABEL: shift16i8c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift16i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2-LABEL: shift32i8c
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN-LABEL: shift32i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
diff --git a/test/Analysis/CostModel/X86/trunc.ll b/test/Analysis/CostModel/X86/trunc.ll
index 2e0dc8e5dafdc2c8b65db693bb3e4d5b9b072898..e2b2cfa732d4193879272b9bf8bad830d343dcc7 100644
--- a/test/Analysis/CostModel/X86/trunc.ll
+++ b/test/Analysis/CostModel/X86/trunc.ll
@@ -53,6 +53,7 @@ define i32 @trunc_vXi16() {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
@@ -62,6 +63,7 @@ define i32 @trunc_vXi16() {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
@@ -71,6 +73,7 @@ define i32 @trunc_vXi16() {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
@@ -80,6 +83,7 @@ define i32 @trunc_vXi16() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
@@ -89,6 +93,7 @@ define i32 @trunc_vXi16() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
@@ -98,6 +103,7 @@ define i32 @trunc_vXi16() {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
@@ -107,6 +113,7 @@ define i32 @trunc_vXi16() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
@@ -115,6 +122,7 @@ define i32 @trunc_vXi16() {
   %V2i64 = trunc <2 x i64> undef to <2 x i16>
   %V4i64 = trunc <4 x i64> undef to <4 x i16>
   %V8i64 = trunc <8 x i64> undef to <8 x i16>
+  %V2i32 = trunc <2 x i32> undef to <2 x i16>
   %V4i32 = trunc <4 x i32> undef to <4 x i16>
   %V8i32 = trunc <8 x i32> undef to <8 x i16>
   %V16i32 = trunc <16 x i32> undef to <16 x i16>
@@ -259,3 +267,93 @@ define i32 @trunc_vXi8() {
 
   ret i32 undef
 }
+
+define i32 @trunc_vXi1() {
+; SSE-LABEL: 'trunc_vXi1'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi1'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi1'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'trunc_vXi1'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2i64 = trunc <2 x i64> undef to <2 x i1>
+  %V4i64 = trunc <4 x i64> undef to <4 x i1>
+  %V8i64 = trunc <8 x i64> undef to <8 x i1>
+
+  %V2i32 = trunc <2 x i32> undef to <2 x i1>
+  %V4i32 = trunc <4 x i32> undef to <4 x i1>
+  %V8i32 = trunc <8 x i32> undef to <8 x i1>
+  %V16i32 = trunc <16 x i32> undef to <16 x i1>
+
+  %V2i16 = trunc <2 x i16> undef to <2 x i1>
+  %V4i16 = trunc <4 x i16> undef to <4 x i1>
+  %V8i16 = trunc <8 x i16> undef to <8 x i1>
+  %V16i16 = trunc <16 x i16> undef to <16 x i1>
+  %V32i16 = trunc <32 x i16> undef to <32 x i1>
+
+  %V32i8 = trunc <32 x i8> undef to <32 x i1>
+  %V64i8 = trunc <64 x i8> undef to <64 x i1>
+
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/uitofp-widen.ll b/test/Analysis/CostModel/X86/uitofp-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7513d1cdcb5c3e45d66dc3d4b64e000641cf4c7c
--- /dev/null
+++ b/test/Analysis/CostModel/X86/uitofp-widen.ll
@@ -0,0 +1,326 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+;
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+define i32 @uitofp_i8_double() {
+; SSE-LABEL: 'uitofp_i8_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'uitofp_i8_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'uitofp_i8_double'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'uitofp_i8_double'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i8_f64 = uitofp i8 undef to double
+  %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+  %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+  %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+  ret i32 undef
+}
+
+define i32 @uitofp_i16_double() {
+; SSE-LABEL: 'uitofp_i16_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'uitofp_i16_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'uitofp_i16_double'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'uitofp_i16_double'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i16_f64 = uitofp i16 undef to double
+  %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+  %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+  %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+  ret i32 undef
+}
+
+define i32 @uitofp_i32_double() {
+; SSE-LABEL: 'uitofp_i32_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'uitofp_i32_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'uitofp_i32_double'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'uitofp_i32_double'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i32_f64 = uitofp i32 undef to double
+  %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+  %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+  %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+  ret i32 undef
+}
+
+define i32 @uitofp_i64_double() {
+; SSE-LABEL: 'uitofp_i64_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'uitofp_i64_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'uitofp_i64_double'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'uitofp_i64_double'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'uitofp_i64_double'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i64_f64 = uitofp i64 undef to double
+  %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+  %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+  %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+  ret i32 undef
+}
+
+define i32 @uitofp_i8_float() {
+; SSE-LABEL: 'uitofp_i8_float'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'uitofp_i8_float'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'uitofp_i8_float'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'uitofp_i8_float'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i8_f32 = uitofp i8 undef to float
+  %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+  %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+  %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+  ret i32 undef
+}
+
+define i32 @uitofp_i16_float() {
+; SSE-LABEL: 'uitofp_i16_float'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'uitofp_i16_float'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'uitofp_i16_float'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'uitofp_i16_float'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i16_f32 = uitofp i16 undef to float
+  %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+  %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+  %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+  ret i32 undef
+}
+
+define i32 @uitofp_i32_float() {
+; SSE-LABEL: 'uitofp_i32_float'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'uitofp_i32_float'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'uitofp_i32_float'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'uitofp_i32_float'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'uitofp_i32_float'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i32_f32 = uitofp i32 undef to float
+  %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+  %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+  %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+  ret i32 undef
+}
+
+define i32 @uitofp_i64_float() {
+; SSE-LABEL: 'uitofp_i64_float'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'uitofp_i64_float'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'uitofp_i64_float'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'uitofp_i64_float'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; BTVER2-LABEL: 'uitofp_i64_float'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %cvt_i64_f32 = uitofp i64 undef to float
+  %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+  %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+  %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+  %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index 1d1aafabe924fb67bcc29944c3bd3136b96cc174..75515730846ab8b8dc7d30a39117255fd56ea21a 100644
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -1074,7 +1074,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v2i64'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <2 x i64> %a, <i64 1, i64 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <2 x i64> %a, <i64 1, i64 7>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v2i64'
@@ -1103,7 +1103,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v4i64'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v4i64'
@@ -1132,7 +1132,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v8i64'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v8i64'
@@ -1164,13 +1164,9 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
-; XOPAVX1-LABEL: 'constant_shift_v4i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
-;
-; XOPAVX2-LABEL: 'constant_shift_v4i32'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+; XOP-LABEL: 'constant_shift_v4i32'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v4i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
@@ -1202,7 +1198,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; XOPAVX1-LABEL: 'constant_shift_v8i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; XOPAVX2-LABEL: 'constant_shift_v8i32'
@@ -1239,7 +1235,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; XOPAVX1-LABEL: 'constant_shift_v16i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; XOPAVX2-LABEL: 'constant_shift_v16i32'
@@ -1272,7 +1268,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v8i16'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 ; AVX512F-LABEL: 'constant_shift_v8i16'
@@ -1317,7 +1313,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v16i16'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 ; AVX512F-LABEL: 'constant_shift_v16i16'
@@ -1362,7 +1358,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v32i16'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 ; AVX512F-LABEL: 'constant_shift_v32i16'
@@ -1403,7 +1399,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v16i8'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v16i8'
@@ -1436,7 +1432,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v32i8'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v32i8'
@@ -1469,7 +1465,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v64i8'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512F-LABEL: 'constant_shift_v64i8'
@@ -1510,7 +1506,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
 ; XOP-LABEL: 'splatconstant_shift_v2i64'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <2 x i64> %a, <i64 7, i64 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <2 x i64> %a, <i64 7, i64 7>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
 ; AVX512-LABEL: 'splatconstant_shift_v2i64'
@@ -1538,13 +1534,9 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
-; XOPAVX1-LABEL: 'splatconstant_shift_v4i64'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
-;
-; XOPAVX2-LABEL: 'splatconstant_shift_v4i64'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+; XOP-LABEL: 'splatconstant_shift_v4i64'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; AVX512-LABEL: 'splatconstant_shift_v4i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
@@ -1571,13 +1563,9 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
-; XOPAVX1-LABEL: 'splatconstant_shift_v8i64'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
-;
-; XOPAVX2-LABEL: 'splatconstant_shift_v8i64'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+; XOP-LABEL: 'splatconstant_shift_v8i64'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; AVX512-LABEL: 'splatconstant_shift_v8i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
@@ -1618,7 +1606,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v8i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v8i32'
@@ -1651,7 +1639,7 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v16i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v16i32'
@@ -1697,7 +1685,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v16i16'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v16i16'
@@ -1730,7 +1718,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v32i16'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v32i16'
@@ -1771,7 +1759,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 ; XOP-LABEL: 'splatconstant_shift_v16i8'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 ; AVX512-LABEL: 'splatconstant_shift_v16i8'
@@ -1799,13 +1787,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
-; XOPAVX1-LABEL: 'splatconstant_shift_v32i8'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
-;
-; XOPAVX2-LABEL: 'splatconstant_shift_v32i8'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+; XOP-LABEL: 'splatconstant_shift_v32i8'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 ; AVX512-LABEL: 'splatconstant_shift_v32i8'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1832,13 +1816,9 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
-; XOPAVX1-LABEL: 'splatconstant_shift_v64i8'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
-;
-; XOPAVX2-LABEL: 'splatconstant_shift_v64i8'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+; XOP-LABEL: 'splatconstant_shift_v64i8'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512F-LABEL: 'splatconstant_shift_v64i8'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
index 8ebe6f821653fb5fa1d4a7d13280a993720f06a8..096699be31dae357d88a7e5a9d052106f07007af 100644
--- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -1067,13 +1067,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, <i64 1, i64 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
-; XOPAVX1-LABEL: 'constant_shift_v2i64'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <2 x i64> %a, <i64 1, i64 7>
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
-;
-; XOPAVX2-LABEL: 'constant_shift_v2i64'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, <i64 1, i64 7>
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+; XOP-LABEL: 'constant_shift_v2i64'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, <i64 1, i64 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v2i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, <i64 1, i64 7>
@@ -1101,7 +1097,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; XOPAVX1-LABEL: 'constant_shift_v4i64'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; XOPAVX2-LABEL: 'constant_shift_v4i64'
@@ -1134,7 +1130,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; XOPAVX1-LABEL: 'constant_shift_v8i64'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; XOPAVX2-LABEL: 'constant_shift_v8i64'
@@ -1170,13 +1166,9 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
-; XOPAVX1-LABEL: 'constant_shift_v4i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
-;
-; XOPAVX2-LABEL: 'constant_shift_v4i32'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+; XOP-LABEL: 'constant_shift_v4i32'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v4i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
@@ -1208,7 +1200,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; XOPAVX1-LABEL: 'constant_shift_v8i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; XOPAVX2-LABEL: 'constant_shift_v8i32'
@@ -1245,7 +1237,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; XOPAVX1-LABEL: 'constant_shift_v16i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; XOPAVX2-LABEL: 'constant_shift_v16i32'
@@ -1278,7 +1270,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v8i16'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 ; AVX512F-LABEL: 'constant_shift_v8i16'
@@ -1323,7 +1315,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v16i16'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 ; AVX512F-LABEL: 'constant_shift_v16i16'
@@ -1368,7 +1360,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v32i16'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 ; AVX512F-LABEL: 'constant_shift_v32i16'
@@ -1409,7 +1401,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v16i8'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v16i8'
@@ -1442,7 +1434,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v32i8'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 ; AVX512-LABEL: 'constant_shift_v32i8'
@@ -1475,7 +1467,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 ; XOP-LABEL: 'constant_shift_v64i8'
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512F-LABEL: 'constant_shift_v64i8'
@@ -1533,7 +1525,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v4i64'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v4i64'
@@ -1566,7 +1558,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v8i64'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v8i64'
@@ -1612,7 +1604,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v8i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v8i32'
@@ -1645,7 +1637,7 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v16i32'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v16i32'
@@ -1691,7 +1683,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v16i16'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v16i16'
@@ -1724,7 +1716,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v32i16'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v32i16'
@@ -1756,9 +1748,21 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 }
 
 define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
-; CHECK-LABEL: 'splatconstant_shift_v16i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+; SSE-LABEL: 'splatconstant_shift_v16i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX-LABEL: 'splatconstant_shift_v16i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; XOP-LABEL: 'splatconstant_shift_v16i8'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512-LABEL: 'splatconstant_shift_v16i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 ; BTVER2-LABEL: 'splatconstant_shift_v16i8'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1782,7 +1786,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v32i8'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v32i8'
@@ -1815,7 +1819,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 ; XOPAVX1-LABEL: 'splatconstant_shift_v64i8'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 ; XOPAVX2-LABEL: 'splatconstant_shift_v64i8'
diff --git a/test/Analysis/DemandedBits/intrinsics.ll b/test/Analysis/DemandedBits/intrinsics.ll
index 48f6d4624422dfb97d2ba40bfadcd21c981f561e..6987f14f8b1ba814e1079a17bb9d2f265d00c252 100644
--- a/test/Analysis/DemandedBits/intrinsics.ll
+++ b/test/Analysis/DemandedBits/intrinsics.ll
@@ -23,3 +23,82 @@ define i8 @test_bitreverse(i32 %x) {
 }
 declare i32 @llvm.bitreverse.i32(i32)
 
+; Funnel shifts
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare i33 @llvm.fshr.i33(i33, i33, i33)
+
+; CHECK-DAG: DemandedBits: 0xff for   %x2 = or i32 %x, 1
+; CHECK-DAG: DemandedBits: 0xff000000 for   %y2 = or i32 %y, 1
+; CHECK-DAG: DemandedBits: 0xffff for   %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 8)
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and i32 %z, 65535
+define i32 @test_fshl(i32 %x, i32 %y) {
+  %x2 = or i32 %x, 1
+  %y2 = or i32 %y, 1
+  %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 8)
+  %r = and i32 %z, 65535
+  ret i32 %r
+}
+
+; CHECK-DAG: DemandedBits: 0xff for   %x2 = or i33 %x, 1
+; CHECK-DAG: DemandedBits: 0x1fe000000 for   %y2 = or i33 %y, 1
+; CHECK-DAG: DemandedBits: 0xffff for   %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 25)
+; CHECK-DAG: DemandedBits: 0x1ffffffff for   %r = and i33 %z, 65535
+define i33 @test_fshr(i33 %x, i33 %y) {
+  %x2 = or i33 %x, 1
+  %y2 = or i33 %y, 1
+  %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 25)
+  %r = and i33 %z, 65535
+  ret i33 %r
+}
+
+; CHECK-DAG: DemandedBits: 0xffff for   %x2 = or i32 %x, 1
+; CHECK-DAG: DemandedBits: 0x0 for   %y2 = or i32 %y, 1
+; CHECK-DAG: DemandedBits: 0xffff for   %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 0)
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and i32 %z, 65535
+define i32 @test_fshl_zero_shift(i32 %x, i32 %y) {
+  %x2 = or i32 %x, 1
+  %y2 = or i32 %y, 1
+  %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 0)
+  %r = and i32 %z, 65535
+  ret i32 %r
+}
+
+; CHECK-DAG: DemandedBits: 0x0 for   %x2 = or i33 %x, 1
+; CHECK-DAG: DemandedBits: 0xffff for   %y2 = or i33 %y, 1
+; CHECK-DAG: DemandedBits: 0xffff for   %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 33)
+; CHECK-DAG: DemandedBits: 0x1ffffffff for   %r = and i33 %z, 65535
+define i33 @test_fshr_full_shift(i33 %x, i33 %y) {
+  %x2 = or i33 %x, 1
+  %y2 = or i33 %y, 1
+  %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 33)
+  %r = and i33 %z, 65535
+  ret i33 %r
+}
+
+; CHECK-DAG: DemandedBits: 0xffffffff for   %x2 = or i32 %x, 1
+; CHECK-DAG: DemandedBits: 0xffffffff for   %y2 = or i32 %y, 1
+; CHECK-DAG: DemandedBits: 0x1f for   %z2 = or i32 %z, 1
+; CHECK-DAG: DemandedBits: 0xffff for   %f = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 %z2)
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and i32 %f, 65535
+define i32 @test_fshl_pow2_bitwidth(i32 %x, i32 %y, i32 %z) {
+  %x2 = or i32 %x, 1
+  %y2 = or i32 %y, 1
+  %z2 = or i32 %z, 1
+  %f = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 %z2)
+  %r = and i32 %f, 65535
+  ret i32 %r
+}
+
+; CHECK-DAG: DemandedBits: 0x1ffffffff for   %x2 = or i33 %x, 1
+; CHECK-DAG: DemandedBits: 0x1ffffffff for   %y2 = or i33 %y, 1
+; CHECK-DAG: DemandedBits: 0x1ffffffff for   %z2 = or i33 %z, 1
+; CHECK-DAG: DemandedBits: 0xffff for   %f = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 %z2)
+; CHECK-DAG: DemandedBits: 0x1ffffffff for   %r = and i33 %f, 65535
+define i33 @test_fshr_non_pow2_bitwidth(i33 %x, i33 %y, i33 %z) {
+  %x2 = or i33 %x, 1
+  %y2 = or i33 %y, 1
+  %z2 = or i33 %z, 1
+  %f = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 %z2)
+  %r = and i33 %f, 65535
+  ret i33 %r
+}
diff --git a/test/Analysis/DemandedBits/vectors.ll b/test/Analysis/DemandedBits/vectors.ll
new file mode 100644
index 0000000000000000000000000000000000000000..36cde05fb7c6221c94834919ad8fde9062db2d80
--- /dev/null
+++ b/test/Analysis/DemandedBits/vectors.ll
@@ -0,0 +1,136 @@
+; RUN: opt -S -demanded-bits -analyze < %s | FileCheck %s
+; RUN: opt -S -disable-output -passes="print<demanded-bits>" < %s 2>&1 | FileCheck %s
+
+; CHECK-DAG: DemandedBits: 0xff00 for   %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff00 for   %y = or <2 x i32> %b, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff00 for   %z = or <2 x i32> %x, %y
+; CHECK-DAG: DemandedBits: 0xff for   %u = lshr <2 x i32> %z, <i32 8, i32 8>
+; CHECK-DAG: DemandedBits: 0xff for   %r = trunc <2 x i32> %u to <2 x i8>
+define <2 x i8> @test_basic(<2 x i32> %a, <2 x i32> %b) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = or <2 x i32> %b, zeroinitializer
+  %z = or <2 x i32> %x, %y
+  %u = lshr <2 x i32> %z, <i32 8, i32 8>
+  %r = trunc <2 x i32> %u to <2 x i8>
+  ret <2 x i8> %r
+}
+
+; Vector-specific instructions
+
+; CHECK-DAG: DemandedBits: 0xff for   %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0 for   %z = extractelement <2 x i32> %x, i32 1
+; CHECK-DAG: DemandedBits: 0xf for   %y = extractelement <2 x i32> %x, i32 0
+; CHECK-DAG: DemandedBits: 0xffffffff for   %u = and i32 %y, 15
+; CHECK-DAG: DemandedBits: 0xffffffff for   %v = and i32 %z, 240
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = or i32 %u, %v
+define i32 @test_extractelement(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = extractelement <2 x i32> %x, i32 0
+  %z = extractelement <2 x i32> %x, i32 1
+  %u = and i32 %y, 15
+  %v = and i32 %z, 240
+  %r = or i32 %u, %v
+  ret i32 %r
+}
+
+; CHECK-DAG: DemandedBits: 0xff for   %x = or i32 %a, 0
+; CHECK-DAG: DemandedBits: 0xff for   %y = or i32 %b, 0
+; CHECK-DAG: DemandedBits: 0xff for   %z = insertelement <2 x i32> undef, i32 %x, i32 0
+; CHECK-DAG: DemandedBits: 0xff for   %u = insertelement <2 x i32> %z, i32 %y, i32 1
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and <2 x i32> %u, <i32 255, i32 127>
+define <2 x i32> @test_insertelement(i32 %a, i32 %b) {
+  %x = or i32 %a, 0
+  %y = or i32 %b, 0
+  %z = insertelement <2 x i32> undef, i32 %x, i32 0
+  %u = insertelement <2 x i32> %z, i32 %y, i32 1
+  %r = and <2 x i32> %u, <i32 255, i32 127>
+  ret <2 x i32> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xff for   %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff for   %y = or <2 x i32> %b, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff for   %z = shufflevector <2 x i32> %x, <2 x i32> %y, <3 x i32> <i32 0, i32 3, i32 1>
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and <3 x i32> %z, <i32 255, i32 127, i32 0>
+define <3 x i32> @test_shufflevector(<2 x i32> %a, <2 x i32> %b) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = or <2 x i32> %b, zeroinitializer
+  %z = shufflevector <2 x i32> %x, <2 x i32> %y, <3 x i32> <i32 0, i32 3, i32 1>
+  %r = and <3 x i32> %z, <i32 255, i32 127, i32 0>
+  ret <3 x i32> %r
+}
+
+; Shifts with splat shift amounts
+
+; CHECK-DAG: DemandedBits: 0xf for   %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0 for   %y = shl <2 x i32> %x, <i32 4, i32 4>
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and <2 x i32> %y, <i32 240, i32 240>
+define <2 x i32> @test_shl(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = shl <2 x i32> %x, <i32 4, i32 4>
+  %r = and <2 x i32> %y, <i32 240, i32 240>
+  ret <2 x i32> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xf00 for   %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0 for   %y = ashr <2 x i32> %x, <i32 4, i32 4>
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and <2 x i32> %y, <i32 240, i32 240>
+define <2 x i32> @test_ashr(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = ashr <2 x i32> %x, <i32 4, i32 4>
+  %r = and <2 x i32> %y, <i32 240, i32 240>
+  ret <2 x i32> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xf00 for   %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0 for   %y = lshr <2 x i32> %x, <i32 4, i32 4>
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and <2 x i32> %y, <i32 240, i32 240>
+define <2 x i32> @test_lshr(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = lshr <2 x i32> %x, <i32 4, i32 4>
+  %r = and <2 x i32> %y, <i32 240, i32 240>
+  ret <2 x i32> %r
+}
+
+declare <2 x i32> @llvm.fshl.i32(<2 x i32>, <2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.fshr.i32(<2 x i32>, <2 x i32>, <2 x i32>)
+
+; CHECK-DAG: DemandedBits: 0xf for   %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0000000 for   %y = or <2 x i32> %b, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff for   %z = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 4>)
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and <2 x i32> %z, <i32 255, i32 255>
+define <2 x i32> @test_fshl(<2 x i32> %a, <2 x i32> %b) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = or <2 x i32> %b, zeroinitializer
+  %z = call <2 x i32> @llvm.fshl.i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 4>)
+  %r = and <2 x i32> %z, <i32 255, i32 255>
+  ret <2 x i32> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xf for   %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0000000 for   %y = or <2 x i32> %b, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff for   %z = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 28, i32 28>)
+; CHECK-DAG: DemandedBits: 0xffffffff for   %r = and <2 x i32> %z, <i32 255, i32 255>
+define <2 x i32> @test_fshr(<2 x i32> %a, <2 x i32> %b) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = or <2 x i32> %b, zeroinitializer
+  %z = call <2 x i32> @llvm.fshr.i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 28, i32 28>)
+  %r = and <2 x i32> %z, <i32 255, i32 255>
+  ret <2 x i32> %r
+}
+
+; FP / Int conversion. These have different input / output types.
+
+; CHECK-DAG: DemandedBits: 0xffffffff for   %x = or <2 x i32> %a, zeroinitializer
+define <2 x float> @test_uitofp(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %r = uitofp <2 x i32> %x to <2 x float>
+  ret <2 x float> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xffffffff for   %y = fptoui <2 x float> %x to <2 x i32>
+define <2 x i32> @test_fptoui(<2 x float> %a) {
+  %x = fadd <2 x float> %a, <float 1.0, float 1.0>
+  %y = fptoui <2 x float> %x to <2 x i32>
+  %r = and <2 x i32> %y, <i32 255, i32 255>
+  ret <2 x i32> %y
+}
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eaac9ce526c4be351ad813c4ce6392cd2e445f32
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
@@ -0,0 +1,14 @@
+; RUN: opt  -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+define amdgpu_kernel void @workitem_id_x() #1 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
+; CHECK-NOT: DIVERGENT:  %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..521d528d7952111600d3970c1d81eb3a72950713
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
@@ -0,0 +1,45 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
+define i32 @test1(i32* %ptr, i32 %val) #0 {
+  %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
+  ret i32 %orig
+}
+
+; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) {
+  %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+  ret {i32, i1} %orig
+}
+
+; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
+  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
+  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
+  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
+  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind argmemonly }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..660c4f573f1df0f8c3778032b0b066ea26035570
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
@@ -0,0 +1,26 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %cond.var = icmp slt i32 %tid, 0
+  br i1 %cond.var, label %B, label %C ; divergent
+; CHECK: DIVERGENT: br i1 %cond.var,
+B:
+  %cond.uni = icmp slt i32 %n, 0
+  br i1 %cond.uni, label %C, label %merge ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
+C:
+  %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B  ]
+; CHECK: DIVERGENT: %phi.var.hidden = phi i32
+  br label %merge
+merge:
+  %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ]
+; CHECK: DIVERGENT: %phi.ipd = phi i32
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..12e2b0ffd443802c9aa25231feff811cd3d1a332
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
@@ -0,0 +1,223 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; divergent loop (H<header><exiting to X>, B<exiting to Y>)
+; the divergent join point in %exit is obscured by uniform control joining in %X
+define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_loop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %X, label %H  ; uniform
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ]
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %B ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+B:
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exity = icmp sgt i32 %tid, 0
+  br i1 %div.exity, label %Y, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exity =  
+; CHECK: DIVERGENT: br i1 %div.exity, 
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi
+  br i1 %uni.cond, label %Y, label %exit
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+  br label %exit
+; CHECK: DIVERGENT: %div.merge.y =
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (H<header><exiting to X>, B<exiting to Y>)
+; the phi nodes in X and Y don't actually receive divergent values
+define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unobserved_loop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %X, label %H  ; uniform
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ]
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %B ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+B:
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exity = icmp sgt i32 %tid, 0
+  br i1 %div.exity, label %Y, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exity =  
+; CHECK: DIVERGENT: br i1 %div.exity, 
+
+X:
+  %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] 
+  br label %exit
+
+Y:
+  %uni.merge.y = phi i32 [ %b, %B ]
+  br label %exit
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (G<header>, L<exiting to D>) inside divergent loop (H<header>, B<exiting to X>, C<exiting to Y>, D, G, L)
+; the inner loop has no exit to top level.
+; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D.
+define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_nestedloop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  br i1 %uni.cond, label %X, label %C 
+
+C:
+  br i1 %uni.cond, label %Y, label %D
+
+D:
+  %uni.inc = add i32 %uni.merge.h, 1
+  br label %H
+
+G:
+  br i1 %div.exitx, label %C, label %L
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  br i1 %uni.cond, label %D, label %G
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi
+  br i1 %uni.cond, label %Y, label %exit
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ]
+  br label %exit
+; CHECK: DIVERGENT: %div.merge.y =
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (G<header>, L<exiting to X>) in divergent loop (H<header>, B<exiting to C>, C, G, L)
+; the outer loop has no immediately divergent exiting edge.
+; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent.
+define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_doublebreak_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  br i1 %uni.cond, label %Y, label %C 
+
+C:
+  %uni.inc = add i32 %uni.merge.h, 1
+  br label %H
+
+G:
+  br i1 %div.exitx, label %X, label %L ; two-level break
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  br i1 %uni.cond, label %C, label %G
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence
+  br label %Y
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.y =
+}
+
+; divergent loop (G<header>, L<exiting to D>) contained inside a uniform loop (H<header>, B, G, L , D<exiting to x>)
+define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_containedloop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ]
+  br label %D
+; CHECK: DIVERGENT: %div.merge.b =
+
+G:
+  %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ]
+  br i1 %div.exitx, label %B, label %L
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  %uni.inc.l = add i32 %uni.merge.g, 1
+  br i1 %uni.cond, label %G, label %D
+
+D:
+  %uni.inc.d = add i32 %uni.merge.h, 1
+  br i1 %uni.cond, label %X, label %H
+
+X:
+  %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ]
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1b84fd5262df5d9a1132832047cfcfafdf3c0610
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
@@ -0,0 +1,13 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
+  %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
+  store i32 %swizzle, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
+
+attributes #0 = { nounwind convergent }
+attributes #1 = { nounwind readnone convergent }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9a94328be67658de4feebce8681425c29af9ac96
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll
@@ -0,0 +1,48 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; This test contains an unstructured loop.
+;           +-------------- entry ----------------+
+;           |                                     |
+;           V                                     V
+; i1 = phi(0, i3)                            i2 = phi(0, i3)
+;     j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
+;           ^                 |                   ^
+;           |                 V                   |
+;           +-------- switch (tid / i3) ----------+
+;                             |
+;                             V
+;                        if (i3 == 5) // divergent
+; because sync dependent on (tid / i3).
+define i32 @unstructured_loop(i1 %entry_cond) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
+loop_entry_1:
+  %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j1 = add i32 %i1, 1
+  br label %loop_body
+loop_entry_2:
+  %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j2 = add i32 %i2, 2
+  br label %loop_body
+loop_body:
+  %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
+  br label %loop_latch
+loop_latch:
+  %div = sdiv i32 %tid, %i3
+  switch i32 %div, label %branch [ i32 1, label %loop_entry_1
+                                   i32 2, label %loop_entry_2 ]
+branch:
+  %cmp = icmp eq i32 %i3, 5
+  br i1 %cmp, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cmp,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d7e050de1f3a2b6789e9c242b8408e2a64faee49
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
@@ -0,0 +1,41 @@
+; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps':
+; CHECK: DIVERGENT:
+; CHECK-NOT: %arg0
+; CHECK-NOT: %arg1
+; CHECK-NOT: %arg2
+; CHECK: <2 x i32> %arg3
+; CHECK: DIVERGENT:  <3 x i32> %arg4
+; CHECK: DIVERGENT:  float %arg5
+; CHECK: DIVERGENT:  i32 %arg6
+
+define amdgpu_ps void @test_amdgpu_ps([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
+  ret void
+}
+
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel':
+; CHECK-NOT: %arg0
+; CHECK-NOT: %arg1
+; CHECK-NOT: %arg2
+; CHECK-NOT: %arg3
+; CHECK-NOT: %arg4
+; CHECK-NOT: %arg5
+; CHECK-NOT: %arg6
+define amdgpu_kernel void @test_amdgpu_kernel([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
+  ret void
+}
+
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c':
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+define void @test_c([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg b/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2a665f06be72e5515ca6e27018facb35daa201be
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..508a25406527f4190fe68d9bf7ac57e764b8bde9
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -0,0 +1,103 @@
+;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(
+define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add(
+define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(
+define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(
+define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(
+define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(
+define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(
+define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and(
+define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or(
+define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(
+define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(
+define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..97ef984dc816b63485aecaadf25bacba1be20e47
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
@@ -0,0 +1,131 @@
+;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(
+define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(
+define float @image_atomic_add(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(
+define float @image_atomic_sub(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(
+define float @image_atomic_smin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(
+define float @image_atomic_umin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(
+define float @image_atomic_smax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(
+define float @image_atomic_umax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(
+define float @image_atomic_and(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(
+define float @image_atomic_or(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(
+define float @image_atomic_xor(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(
+define float @image_atomic_inc(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(
+define float @image_atomic_dec(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(
+define float @image_atomic_cmpswap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data, i32 inreg %cmp) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %data, i32 %cmp, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(
+define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0
+
+declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fb7c041e2d1841c7c918be1783d8be50fb377240
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+; CHECK: DIVERGENT:  %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2
+; CHECK: DIVERGENT:  %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4
+; CHECK: DIVERGENT:  %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
+
+; The post dominator tree does not have a root node in this case
+define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
+bb0:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tmp2 = sext i32 %tmp to i64
+  %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2
+  %tmp6 = load volatile float, float addrspace(1)* %tmp5, align 4
+  %tmp8 = fcmp olt float %tmp6, 0.000000e+00
+  br i1 %tmp8, label %bb1, label %bb2
+
+bb1:
+  %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4
+  br label %bb2
+
+bb2:
+  %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
+  br label %bb1
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
new file mode 100644
index 0000000000000000000000000000000000000000..978bc4232b1d0c05caffd9f2d069b01aa3db8fbe
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
@@ -0,0 +1,31 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; CHECK-LABEL: 'test1':
+; CHECK-NEXT: DIVERGENT: i32 %bound
+; CHECK: {{^  *}}%counter =
+; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound
+; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body
+; CHECK: {{^  *}}%counter.next =
+; CHECK: {{^  *}}%counter.footer =
+; CHECK: DIVERGENT: br i1 %break, label %end, label %header
+; Note: %counter is not divergent!
+define amdgpu_ps void @test1(i32 %bound) {
+entry:
+  br label %header
+
+header:
+  %counter = phi i32 [ 0, %entry ], [ %counter.footer, %footer ]
+  %break = icmp sge i32 %counter, %bound
+  br i1 %break, label %footer, label %body
+
+body:
+  %counter.next = add i32 %counter, 1
+  br label %footer
+
+footer:
+  %counter.footer = phi i32 [ %counter.next, %body ], [ undef, %header ]
+  br i1 %break, label %end, label %header
+
+end:
+  ret void
+}
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4211ca28ad66ee8412e3f4d119b170c29466f91f
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll
@@ -0,0 +1,154 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; temporal-divergent use of value carried by divergent loop
+define amdgpu_kernel void @temporal_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+X:
+  %div.user = add i32 %uni.inc, 5
+  ret void
+}
+
+; temporal-divergent use of value carried by divergent loop inside a top-level loop
+define amdgpu_kernel void @temporal_diverge_inloop(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_inloop':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %G
+
+G:
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+X:
+  %div.user = add i32 %uni.inc, 5
+  br i1 %uni.cond, label %G, label %Y
+
+Y:
+  %div.alsouser = add i32 %uni.inc, 5
+  ret void
+}
+
+
+; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop
+define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_uniform_indivloop':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %G
+
+G:
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  br i1 %uni.cond, label %X, label %H ; divergent branch
+
+X:
+  %uni.user = add i32 %uni.inc, 5
+  %div.exity = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.exity =  
+  br i1 %div.exity, label %G, label %Y
+; CHECK: DIVERGENT: br i1 %div.exity, 
+
+Y:
+  %div.alsouser = add i32 %uni.inc, 5
+  ret void
+}
+
+
+; temporal-divergent use of value carried by divergent loop, user is inside sibling loop
+define amdgpu_kernel void @temporal_diverge_loopuser(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+X:
+  br label %G
+
+G:
+  %div.user = add i32 %uni.inc, 5
+  br i1 %uni.cond, label %G, label %Y
+
+Y:
+  ret void
+}
+
+; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop
+define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser_nested':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+X:
+  br label %G
+
+G:
+  %div.user = add i32 %uni.inc, 5
+  br i1 %uni.cond, label %G, label %Y
+
+Y:
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b22c5f11abe601480ff952f64a2109d49523daff
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -0,0 +1,45 @@
+; RUN: opt  -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
+
+; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @workitem_id_x() #1 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %id.x, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: DIVERGENT:  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+define amdgpu_kernel void @workitem_id_y() #1 {
+  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %id.y, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: DIVERGENT:  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+define amdgpu_kernel void @workitem_id_z() #1 {
+  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %id.z, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: DIVERGENT:  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
+define amdgpu_kernel void @mbcnt_lo() #1 {
+  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
+  store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: DIVERGENT:  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+define amdgpu_kernel void @mbcnt_hi() #1 {
+  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+  store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll b/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
new file mode 100644
index 0000000000000000000000000000000000000000..89954b6f7c06d11c80fae4257c57dd68bac16b8d
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
@@ -0,0 +1,47 @@
+; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @daorder(i32 %n) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'daorder'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond = icmp slt i32 %tid, 0
+  br i1 %cond, label %A, label %B ; divergent
+; CHECK: DIVERGENT: br i1 %cond,
+A:
+  %defAtA = add i32 %n, 1 ; uniform
+; CHECK-NOT: DIVERGENT: %defAtA = 
+  br label %C
+B:
+  %defAtB = add i32 %n, 2 ; uniform
+; CHECK-NOT: DIVERGENT: %defAtB = 
+  br label %C
+C:
+  %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent
+; CHECK: DIVERGENT: %defAtC =
+  br label %D
+
+D:
+  %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform
+; CHECK-NOT: DIVERGENT: %i = phi
+  br label %E
+
+E:
+  %i.inc = add i32 %i, 1
+  %loopCnt = icmp slt i32 %i.inc, %n
+; CHECK-NOT: DIVERGENT: %loopCnt =
+  br i1 %loopCnt, label %D, label %exit
+
+exit:
+  ret i32 %n
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+!nvvm.annotations = !{!0}
+!0 = !{i32 (i32)* @daorder, !"kernel", i32 1}
diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll b/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e2e547282205fecb15a06abfe95fc3200ea4b7b6
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
@@ -0,0 +1,175 @@
+; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
+define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond = icmp slt i32 %n, 0
+  br i1 %cond, label %then, label %else ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond,
+then:
+  %a1 = add i32 %a, %tid
+  br label %merge
+else:
+  %b2 = add i32 %b, %tid
+  br label %merge
+merge:
+  %c = phi i32 [ %a1, %then ], [ %b2, %else ]
+  ret i32 %c
+}
+
+; c = a;
+; if (threadIdx.x < 5)    // divergent: data dependent
+;   c = b;
+; return c;               // c is divergent: sync dependent
+define i32 @sync(i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync'
+bb1:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  %cond = icmp slt i32 %tid, 5
+  br i1 %cond, label %bb2, label %bb3
+; CHECK: DIVERGENT: br i1 %cond,
+bb2:
+  br label %bb3
+bb3:
+  %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid
+; CHECK: DIVERGENT: %c =
+  ret i32 %c
+}
+
+; c = 0;
+; if (threadIdx.x >= 5) {  // divergent
+;   c = (n < 0 ? a : b);  // c here is uniform because n is uniform
+; }
+; // c here is divergent because it is sync dependent on threadIdx.x >= 5
+; return c;
+define i32 @mixed(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed'
+bb1:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+  %cond = icmp slt i32 %tid, 5
+  br i1 %cond, label %bb6, label %bb2
+; CHECK: DIVERGENT: br i1 %cond,
+bb2:
+  %cond2 = icmp slt i32 %n, 0
+  br i1 %cond2, label %bb4, label %bb3
+bb3:
+  br label %bb5
+bb4:
+  br label %bb5
+bb5:
+  %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ]
+; CHECK-NOT: DIVERGENT: %c =
+  br label %bb6
+bb6:
+  %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ]
+; CHECK: DIVERGENT: %c2 =
+  ret i32 %c2
+}
+
+; We conservatively treats all parameters of a __device__ function as divergent.
+define i32 @device(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device'
+; CHECK: DIVERGENT: i32 %n
+; CHECK: DIVERGENT: i32 %a
+; CHECK: DIVERGENT: i32 %b
+entry:
+  %cond = icmp slt i32 %n, 0
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  br label %merge
+else:
+  br label %merge
+merge:
+  %c = phi i32 [ %a, %then ], [ %b, %else ]
+  ret i32 %c
+}
+
+; int i = 0;
+; do {
+;   i++;                  // i here is uniform
+; } while (i < laneid);
+; return i == 10 ? 0 : 1; // i here is divergent
+;
+; The i defined in the loop is used outside.
+define i32 @loop() {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop'
+entry:
+  %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+; CHECK-NOT: DIVERGENT: %i =
+  %i1 = add i32 %i, 1
+  %exit_cond = icmp sge i32 %i1, %laneid
+  br i1 %exit_cond, label %loop_exit, label %loop
+loop_exit:
+  %cond = icmp eq i32 %i, 10
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+; Same as @loop, but the loop is in the LCSSA form.
+define i32 @lcssa() {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+; CHECK-NOT: DIVERGENT: %i =
+  %i1 = add i32 %i, 1
+  %exit_cond = icmp sge i32 %i1, %tid
+  br i1 %exit_cond, label %loop_exit, label %loop
+loop_exit:
+  %i.lcssa = phi i32 [ %i, %loop ]
+; CHECK: DIVERGENT: %i.lcssa =
+  %cond = icmp eq i32 %i.lcssa, 10
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+; Verifies sync-dependence is computed correctly in the absense of loops.
+define i32 @sync_no_loop(i32 %arg) {
+entry:
+  %0 = add i32 %arg, 1
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %1 = icmp sge i32 %tid, 10
+  br i1 %1, label %bb1, label %bb2
+
+bb1:
+  br label %bb3
+
+bb2:
+  br label %bb3
+
+bb3:
+  %2 = add i32 %0, 2
+  ; CHECK-NOT: DIVERGENT: %2
+  ret i32 %2
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4}
+!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1}
+!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1}
+!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1}
+!3 = !{i32 ()* @loop, !"kernel", i32 1}
+!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1}
diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll b/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3d61986657e0bb690bc521185d2762eba27b44c0
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond.var = icmp slt i32 %tid, 0
+  br i1 %cond.var, label %B, label %C ; divergent
+; CHECK: DIVERGENT: br i1 %cond.var,
+B:
+  %cond.uni = icmp slt i32 %n, 0
+  br i1 %cond.uni, label %C, label %merge ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
+C:
+  %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B  ]
+; CHECK: DIVERGENT: %phi.var.hidden = phi i32
+  br label %merge
+merge:
+  %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ]
+; CHECK: DIVERGENT: %phi.ipd = phi i32
+  ret i32 %phi.ipd
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+
+!nvvm.annotations = !{!0}
+!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1}
diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll b/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2e1686a446d4a9dc59699dc1e0d98889b27f5178
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll
@@ -0,0 +1,55 @@
+; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; This test contains an unstructured loop.
+;           +-------------- entry ----------------+
+;           |                                     |
+;           V                                     V
+; i1 = phi(0, i3)                            i2 = phi(0, i3)
+;     j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
+;           ^                 |                   ^
+;           |                 V                   |
+;           +-------- switch (tid / i3) ----------+
+;                             |
+;                             V
+;                        if (i3 == 5) // divergent
+; because sync dependent on (tid / i3).
+define i32 @unstructured_loop(i1 %entry_cond) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
+loop_entry_1:
+  %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j1 = add i32 %i1, 1
+  br label %loop_body
+loop_entry_2:
+  %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j2 = add i32 %i2, 2
+  br label %loop_body
+loop_body:
+  %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
+  br label %loop_latch
+loop_latch:
+  %div = sdiv i32 %tid, %i3
+  switch i32 %div, label %branch [ i32 1, label %loop_entry_1
+                                   i32 2, label %loop_entry_2 ]
+branch:
+  %cmp = icmp eq i32 %i3, 5
+  br i1 %cmp, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cmp,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+!nvvm.annotations = !{!0}
+!0 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1}
diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg b/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2cb98eb371b21bc47c99d369adaffefd84d4a625
--- /dev/null
+++ b/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll b/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll
index 27cd2263beaa258abb543bf67320c0411cfeeb8b..fbc2eb135d75da5bebc1d16556e4b7b5345cedd7 100644
--- a/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll
+++ b/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll
@@ -72,7 +72,7 @@ loop:
   %cnd1 = icmp sge i32 %iv, 0
   %cnd2 = icmp sgt i32 %iv2, 0
 ; CHECK:       %cnd2 = icmp sgt i32 %iv2, 0
-; CHECK:         ; LatticeVal for: '  %cnd = and i1 %cnd1, %cnd2' in BB: '%loop' is: overdefined
+; CHECK:         ; LatticeVal for: '  %cnd = and i1 %cnd1, %cnd2' in BB: '%loop' is: constantrange<-1, -1>
 ; CHECK-DAG:     ; LatticeVal for: '  %cnd = and i1 %cnd1, %cnd2' in BB: '%backedge' is: constantrange<-1, 0>
 ; CHECK-DAG:     ; LatticeVal for: '  %cnd = and i1 %cnd1, %cnd2' in BB: '%exit' is: overdefined
 ; CHECK-NEXT:  %cnd = and i1 %cnd1, %cnd2
@@ -92,7 +92,7 @@ backedge:
 ; CHECK-NEXT:    ; LatticeVal for: '  %cont2 = icmp sgt i32 %iv2.next, 0' in BB: '%backedge' is: overdefined
 ; CHECK-NEXT:  %cont2 = icmp sgt i32 %iv2.next, 0
   %cont2 = icmp sgt i32 %iv2.next, 0
-; CHECK-NEXT:    ; LatticeVal for: '  %cont = and i1 %cont1, %cont2' in BB: '%backedge' is: overdefined
+; CHECK-NEXT:    ; LatticeVal for: '  %cont = and i1 %cont1, %cont2' in BB: '%backedge' is: constantrange<-1, -1>
 ; CHECK-NEXT:  %cont = and i1 %cont1, %cont2
   %cont = and i1 %cont1, %cont2
   br i1 %cont, label %loop, label %exit
diff --git a/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll b/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
index 0d0fe65694c800503c88f990aa35a2ae7db997a9..cb1b7edb3d10127c624f1108fe5df2bca13f71c7 100644
--- a/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ b/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -39,7 +39,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 ; CHECK-NEXT:      Group
 ; CHECK-NEXT:        (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT:          Member: {%b,+,4}<%for.body>
-; CHECK:         Multiple stores to invariant address were not found in loop.
+; CHECK:         Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:    SCEV assumptions:
 ; CHECK-NEXT:    {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:    {0,+,1}<%for.body> Added Flags: <nusw>
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
index f24211d1e0dfe1aae30949aa33c9fff51c8e9887..611e957168ffdcc5a34e7132c324f991221e019e 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
@@ -14,14 +14,14 @@
 ; The LAA with the new PM is a loop pass so we go from inner to outer loops.
 
 ; OLDPM: for.cond1.preheader:
-; OLDPM:   Multiple stores to invariant address were not found in loop.
+; OLDPM:   Non vectorizable stores to invariant address were not found in loop.
 ; OLDPM: for.body3:
-; OLDPM:   Multiple stores to invariant address were found in loop.
+; OLDPM:   Non vectorizable stores to invariant address were found in loop.
 
 ; NEWPM: for.body3:
-; NEWPM:   Multiple stores to invariant address were found in loop.
+; NEWPM:   Non vectorizable stores to invariant address were found in loop.
 ; NEWPM: for.cond1.preheader:
-; NEWPM:   Multiple stores to invariant address were not found in loop.
+; NEWPM:   Non vectorizable stores to invariant address were not found in loop.
 
 define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
 entry:
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
index 07bcdcc5c669cf224334fdeec852e031ba36e7b2..d21cc6926c3b131775dc2c3f056a17f83bc8e702 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
@@ -10,8 +10,8 @@
 ;    }
 ;  }
 
-; CHECK: Multiple stores to invariant address were not found in loop.
-; CHECK-NOT: Multiple stores to invariant address were found in loop.
+; CHECK: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NOT: Non vectorizable stores to invariant address were found in loop.
 
 
 define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 {
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
index 8d7452471f5c9d93e1a4d0117da8d2bba8d7c9db..b25d79b3d0394bb66be3515afd0f962b0704ccec 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
@@ -10,7 +10,7 @@
 ;    }
 ;  }
 
-; CHECK: Multiple stores to invariant address were not found in loop.
+; CHECK: Non vectorizable stores to invariant address were not found in loop.
 
 define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 {
 entry:
diff --git a/test/Analysis/ScalarEvolution/pr28705.ll b/test/Analysis/ScalarEvolution/pr28705.ll
index 8fbc08e3ca63ec85eadf29e2d4fb98b36d031996..9a8487a6c662172a3743de91dd51a4ccf67a9dd7 100644
--- a/test/Analysis/ScalarEvolution/pr28705.ll
+++ b/test/Analysis/ScalarEvolution/pr28705.ll
@@ -1,11 +1,11 @@
 ; PR28705
 ; RUN: opt < %s -indvars -S | FileCheck %s
 
-; Check IndVarSimplify replaces the exitval use of the induction var "%inc.i.i"
-; with "%.sroa.speculated + 1".
+; Check IndVarSimplify doesn't replace external use of the induction var
+; "%inc.i.i" with "%.sroa.speculated + 1" because it is not profitable.
 ;
 ; CHECK-LABEL: @foo(
-; CHECK: %[[EXIT:.+]] = sub i32 %.sroa.speculated, -1
+; CHECK: %[[EXIT:.+]] = phi i32 [ %inc.i.i, %for.body650 ]
 ; CHECK: %DB.sroa.9.0.lcssa = phi i32 [ 1, %entry ], [ %[[EXIT]], %loopexit ]
 ;
 define void @foo(i32 %sub.ptr.div.i, i8* %ref.i1174) local_unnamed_addr {
diff --git a/test/Analysis/ScalarEvolution/solve-quadratic.ll b/test/Analysis/ScalarEvolution/solve-quadratic.ll
index cc4f807108dc8187f82b5824b7b8f18e2bb39022..5d5e02efd0e4a39e511b7acf51125a6dbc4ae8e8 100644
--- a/test/Analysis/ScalarEvolution/solve-quadratic.ll
+++ b/test/Analysis/ScalarEvolution/solve-quadratic.ll
@@ -41,13 +41,13 @@
 ; {14,+,14,+,14} -> X=0, Y=14, Z=14
 ;
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test01'
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {-2,+,-2,+,-2}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 4
-; CHECK: GetQuadraticEquation: equation -2x^2 + -2x + -4, coeff bw: 5, multiplied by 2
-; CHECK: SolveQuadraticAddRecExact: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving -2x^2 + -2x + -4, rw:5
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 2x^2 + 2x + -28, rw:5
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 4
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {-2,+,-2,+,-2}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 4
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation -2x^2 + -2x + -4, coeff bw: 5, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -2x^2 + -2x + -4, rw:5
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 2x^2 + 2x + -28, rw:5
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 4
 ; CHECK: Loop %loop: Unpredictable backedge-taken count
 define signext i32 @test01() {
 entry:
@@ -73,25 +73,25 @@ exit:
 ; {-72,+,-36,+,1} -> X=-37, Y=-35, Z=1
 ;
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test02':
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,-36,+,1}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 32
-; CHECK: GetQuadraticEquation: equation 1x^2 + -73x + 0, coeff bw: 33, multiplied by 2
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -73x + 4294967154, rw:32
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -73x + -142, rw:32
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 75
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -73x + 4294967154, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -73x + -4294967438, rw:33
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 65573
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -73x + -146, rw:32
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -73x + -146, rw:32
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 75
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -73x + -146, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -73x + -146, rw:33
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 75
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,-36,+,1}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 1x^2 + -73x + 0, coeff bw: 33, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -73x + 4294967154, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -73x + -142, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 75
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -73x + 4294967154, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -73x + -4294967438, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 65573
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -73x + -146, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -73x + -146, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 75
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -73x + -146, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -73x + -146, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 75
 ; CHECK: Loop %loop: backedge-taken count is 75
 define signext i32 @test02() {
 entry:
@@ -117,13 +117,13 @@ exit:
 ; {17,+,-1,+,2} -> X=-3, Y=20, Z=2
 ;
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test03':
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {1,+,-1,+,2}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 4
-; CHECK: GetQuadraticEquation: equation 2x^2 + -4x + 2, coeff bw: 5, multiplied by 2
-; CHECK: SolveQuadraticAddRecExact: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 2x^2 + -4x + 2, rw:5
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 2x^2 + -4x + 2, rw:5
-; CHECK: SolveQuadraticEquationWrap: solution (root): 1
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {1,+,-1,+,2}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 4
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 2x^2 + -4x + 2, coeff bw: 5, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 2x^2 + -4x + 2, rw:5
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 2x^2 + -4x + 2, rw:5
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (root): 1
 ; CHECK: Loop %loop: backedge-taken count is 1
 define signext i32 @test03() {
 entry:
@@ -174,32 +174,32 @@ exit:
 ;
 
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test04':
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,3,+,4}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 16
-; CHECK: GetQuadraticEquation: equation 4x^2 + 2x + 0, coeff bw: 17, multiplied by 2
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:16
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -65534, rw:16
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 128
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:17
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -131070, rw:17
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 181
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:16
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -65534, rw:16
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 128
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:17
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -131070, rw:17
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 181
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {1,+,3,+,4}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 16
-; CHECK: GetQuadraticEquation: equation 4x^2 + 2x + 2, coeff bw: 17, multiplied by 2
-; CHECK: SolveQuadraticAddRecExact: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:17
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -131070, rw:17
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 181
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,3,+,4}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 16
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 4x^2 + 2x + 0, coeff bw: 17, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:16
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -65534, rw:16
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 128
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:17
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -131070, rw:17
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 181
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:16
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -65534, rw:16
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 128
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:17
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -131070, rw:17
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 181
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {1,+,3,+,4}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 16
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 4x^2 + 2x + 2, coeff bw: 17, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:17
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -131070, rw:17
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 181
 ; CHECK: Loop %loop: Unpredictable backedge-taken count.
 define signext i32 @test04() {
 entry:
@@ -224,25 +224,25 @@ exit:
 ; A case with signed arithmetic, but unsigned comparison.
 
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test05':
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,-1,+,-1}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 32
-; CHECK: GetQuadraticEquation: equation -1x^2 + -1x + 0, coeff bw: 33, multiplied by 2
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving -1x^2 + -1x + 4, rw:32
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + 1x + -4, rw:32
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 2
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving -1x^2 + -1x + 4, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + 1x + -4, rw:33
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 2
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving -1x^2 + -1x + -2, rw:32
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + 1x + -4294967294, rw:32
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 65536
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving -1x^2 + -1x + -2, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + 1x + -8589934590, rw:33
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 92682
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,-1,+,-1}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation -1x^2 + -1x + 0, coeff bw: 33, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -1x^2 + -1x + 4, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + 1x + -4, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 2
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -1x^2 + -1x + 4, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + 1x + -4, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 2
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -1x^2 + -1x + -2, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + 1x + -4294967294, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 65536
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -1x^2 + -1x + -2, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + 1x + -8589934590, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 92682
 ; CHECK: Loop %loop: backedge-taken count is 2
 
 define signext i32 @test05() {
@@ -268,25 +268,25 @@ exit:
 ; A test that used to crash with one of the earlier versions of the code.
 
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test06':
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,-99999,+,1}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 32
-; CHECK: GetQuadraticEquation: equation 1x^2 + -199999x + 0, coeff bw: 33, multiplied by 2
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -199999x + -4294967294, rw:32
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -199999x + 2, rw:32
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 1
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -199999x + -4294967294, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -199999x + 4294967298, rw:33
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 24469
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -199999x + -12, rw:32
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -199999x + 4294967284, rw:32
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 24469
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -199999x + -12, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -199999x + 8589934580, rw:33
-; CHECK: SolveQuadraticEquationWrap: solution (wrap): 62450
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,-99999,+,1}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 1x^2 + -199999x + 0, coeff bw: 33, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -199999x + -4294967294, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -199999x + 2, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 1
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -199999x + -4294967294, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -199999x + 4294967298, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 24469
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -199999x + -12, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -199999x + 4294967284, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 24469
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -199999x + -12, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -199999x + 8589934580, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 62450
 ; CHECK: Loop %loop: backedge-taken count is 24469
 define signext i32 @test06() {
 entry:
@@ -315,32 +315,32 @@ exit:
 ; solves the equation exactly, or changes the sign of it between n and n+1.
 
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test07':
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,40811489,+,532052752}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 32
-; CHECK: GetQuadraticEquation: equation 532052752x^2 + -450429774x + 0, coeff bw: 33, multiplied by 2
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:32
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:32
-; CHECK: SolveQuadraticEquationWrap: no valid solution
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33
-; CHECK: SolveQuadraticEquationWrap: no valid solution
-; CHECK: SolveQuadraticAddRecRange: solving for signed overflow
-; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:32
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:32
-; CHECK: SolveQuadraticEquationWrap: no valid solution
-; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33
-; CHECK: SolveQuadraticEquationWrap: no valid solution
-; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {35594207,+,40811489,+,532052752}<%loop>
-; CHECK: GetQuadraticEquation: addrec coeff bw: 32
-; CHECK: GetQuadraticEquation: equation 532052752x^2 + -450429774x + 71188414, coeff bw: 33, multiplied by 2
-; CHECK: SolveQuadraticAddRecExact: solving for unsigned overflow
-; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:33
-; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33
-; CHECK: SolveQuadraticEquationWrap: no valid solution
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,40811489,+,532052752}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 532052752x^2 + -450429774x + 0, coeff bw: 33, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:32
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution
+; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {35594207,+,40811489,+,532052752}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 532052752x^2 + -450429774x + 71188414, coeff bw: 33, multiplied by 2
+; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33
+; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution
 ; CHECK: Loop %loop: Unpredictable backedge-taken count.
 define signext i32 @test07() {
 entry:
diff --git a/test/Analysis/StackSafetyAnalysis/Inputs/ipa-alias.ll b/test/Analysis/StackSafetyAnalysis/Inputs/ipa-alias.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bd290a8321aa382f9753648ce9cf17a95759a88c
--- /dev/null
+++ b/test/Analysis/StackSafetyAnalysis/Inputs/ipa-alias.ll
@@ -0,0 +1,18 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@InterposableAliasWrite1 = linkonce dso_local alias void(i8*), void(i8*)* @Write1
+
+@PreemptableAliasWrite1 = dso_preemptable alias void(i8*), void(i8*)* @Write1
+@AliasToPreemptableAliasWrite1 = dso_local alias void(i8*), void(i8*)* @PreemptableAliasWrite1
+
+@AliasWrite1 = dso_local alias void(i8*), void(i8*)* @Write1
+
+@BitcastAliasWrite1 = dso_local alias void(i32*), bitcast (void(i8*)* @Write1 to void(i32*)*)
+@AliasToBitcastAliasWrite1 = dso_local alias void(i8*), bitcast (void(i32*)* @BitcastAliasWrite1 to void(i8*)*)
+
+define dso_local void @Write1(i8* %p) {
+entry:
+  store i8 0, i8* %p, align 1
+  ret void
+}
diff --git a/test/Analysis/StackSafetyAnalysis/Inputs/ipa.ll b/test/Analysis/StackSafetyAnalysis/Inputs/ipa.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fb789345692d93535db92b96a8aa17e570992e0a
--- /dev/null
+++ b/test/Analysis/StackSafetyAnalysis/Inputs/ipa.ll
@@ -0,0 +1,118 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @Write1(i8* %p) {
+entry:
+  store i8 0, i8* %p, align 1
+  ret void
+}
+
+define dso_local void @Write4(i8* %p) {
+entry:
+  %0 = bitcast i8* %p to i32*
+  store i32 0, i32* %0, align 1
+  ret void
+}
+
+define dso_local void @Write4_2(i8* %p, i8* %q) {
+entry:
+  %0 = bitcast i8* %p to i32*
+  store i32 0, i32* %0, align 1
+  %1 = bitcast i8* %q to i32*
+  store i32 0, i32* %1, align 1
+  ret void
+}
+
+define dso_local void @Write8(i8* %p) {
+entry:
+  %0 = bitcast i8* %p to i64*
+  store i64 0, i64* %0, align 1
+  ret void
+}
+
+define dso_local i8* @WriteAndReturn8(i8* %p) {
+entry:
+  store i8 0, i8* %p, align 1
+  ret i8* %p
+}
+
+declare dso_local void @ExternalCall(i8* %p)
+
+define dso_preemptable void @PreemptableWrite1(i8* %p) {
+entry:
+  store i8 0, i8* %p, align 1
+  ret void
+}
+
+define linkonce dso_local void @InterposableWrite1(i8* %p) {
+entry:
+  store i8 0, i8* %p, align 1
+  ret void
+}
+
+define dso_local i8* @ReturnDependent(i8* %p) {
+entry:
+  %p2 = getelementptr i8, i8* %p, i64 2
+  ret i8* %p2
+}
+
+; access range [2, 6)
+define dso_local void @Rec0(i8* %p) {
+entry:
+  %p1 = getelementptr i8, i8* %p, i64 2
+  call void @Write4(i8* %p1)
+  ret void
+}
+
+; access range [3, 7)
+define dso_local void @Rec1(i8* %p) {
+entry:
+  %p1 = getelementptr i8, i8* %p, i64 1
+  call void @Rec0(i8* %p1)
+  ret void
+}
+
+; access range [-2, 2)
+define dso_local void @Rec2(i8* %p) {
+entry:
+  %p1 = getelementptr i8, i8* %p, i64 -5
+  call void @Rec1(i8* %p1)
+  ret void
+}
+
+; Recursive function that passes %acc unchanged => access range [0, 4).
+define dso_local void @RecursiveNoOffset(i32* %p, i32 %size, i32* %acc) {
+entry:
+  %cmp = icmp eq i32 %size, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:
+  %0 = load i32, i32* %p, align 4
+  %1 = load i32, i32* %acc, align 4
+  %add = add nsw i32 %1, %0
+  store i32 %add, i32* %acc, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %sub = add nsw i32 %size, -1
+  tail call void @RecursiveNoOffset(i32* %add.ptr, i32 %sub, i32* %acc)
+  ret void
+
+return:
+  ret void
+}
+
+; Recursive function that advances %acc on each iteration => access range unlimited.
+define dso_local void @RecursiveWithOffset(i32 %size, i32* %acc) {
+entry:
+  %cmp = icmp eq i32 %size, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:
+  store i32 0, i32* %acc, align 4
+  %acc2 = getelementptr inbounds i32, i32* %acc, i64 1
+  %sub = add nsw i32 %size, -1
+  tail call void @RecursiveWithOffset(i32 %sub, i32* %acc2)
+  ret void
+
+return:
+  ret void
+}
diff --git a/test/Analysis/StackSafetyAnalysis/ipa-alias.ll b/test/Analysis/StackSafetyAnalysis/ipa-alias.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d77e7a68925a28e41cb71dc00b7a5ec57064031c
--- /dev/null
+++ b/test/Analysis/StackSafetyAnalysis/ipa-alias.ll
@@ -0,0 +1,133 @@
+; Test IPA over a single combined file
+; RUN: llvm-as %s -o %t0.bc
+; RUN: llvm-as %S/Inputs/ipa-alias.ll -o %t1.bc
+; RUN: llvm-link %t0.bc %t1.bc -o %t.combined.bc
+; RUN: opt -S -analyze -stack-safety-local %t.combined.bc | FileCheck %s --check-prefixes=CHECK,LOCAL
+; RUN: opt -S -passes="print<stack-safety-local>" -disable-output %t.combined.bc 2>&1 | FileCheck %s --check-prefixes=CHECK,LOCAL
+; RUN: opt -S -analyze -stack-safety %t.combined.bc | FileCheck %s --check-prefixes=CHECK,GLOBAL
+; RUN: opt -S -passes="print-stack-safety" -disable-output %t.combined.bc 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @PreemptableAliasWrite1(i8* %p)
+declare void @AliasToPreemptableAliasWrite1(i8* %p)
+
+declare void @InterposableAliasWrite1(i8* %p)
+; Aliases to interposable aliases are not allowed
+
+declare void @AliasWrite1(i8* %p)
+
+declare void @BitcastAliasWrite1(i32* %p)
+declare void @AliasToBitcastAliasWrite1(i8* %p)
+
+; Call to dso_preemptable alias to a dso_local aliasee
+define void @PreemptableAliasCall() {
+; CHECK-LABEL: @PreemptableAliasCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x1[1]: empty-set, @PreemptableAliasWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x1[1]: full-set, @PreemptableAliasWrite1(arg0, [0,1)){{$}}
+; LOCAL-NEXT: x2[1]: empty-set, @AliasToPreemptableAliasWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x2[1]: [0,1), @AliasToPreemptableAliasWrite1(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x1 = alloca i8
+  call void @PreemptableAliasWrite1(i8* %x1)
+
+  %x2 = alloca i8
+; Alias to a preemptable alias is not preemptable
+  call void @AliasToPreemptableAliasWrite1(i8* %x2)
+  ret void
+}
+
+; Call to an interposable alias to a non-interposable aliasee
+define void @InterposableAliasCall() {
+; CHECK-LABEL: @InterposableAliasCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[1]: empty-set, @InterposableAliasWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[1]: full-set, @InterposableAliasWrite1(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i8
+; ThinLTO can resolve the prevailing implementation for interposable definitions.
+  call void @InterposableAliasWrite1(i8* %x)
+  ret void
+}
+
+; Call to a dso_local/non-interposable alias/aliasee
+define void @AliasCall() {
+; CHECK-LABEL: @AliasCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[1]: empty-set, @AliasWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[1]: [0,1), @AliasWrite1(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i8
+  call void @AliasWrite1(i8* %x)
+  ret void
+}
+
+; Call to a bitcasted dso_local/non-interposable alias/aliasee
+define void @BitcastAliasCall() {
+; CHECK-LABEL: @BitcastAliasCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x1[4]: empty-set, @BitcastAliasWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x1[4]: [0,1), @BitcastAliasWrite1(arg0, [0,1)){{$}}
+; LOCAL-NEXT: x2[1]: empty-set, @AliasToBitcastAliasWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x2[1]: [0,1), @AliasToBitcastAliasWrite1(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x1 = alloca i32
+  call void @BitcastAliasWrite1(i32* %x1)
+  %x2 = alloca i8
+  call void @AliasToBitcastAliasWrite1(i8* %x2)
+  ret void
+}
+
+; The rest is from Inputs/ipa-alias.ll
+
+; CHECK-LABEL: @Write1{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: [0,1){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; GLOBAL-LABEL: @InterposableAliasWrite1 interposable{{$}}
+; GLOBAL-NEXT: args uses:
+; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: allocas uses:
+; GLOBAL-NOT: ]:
+
+; GLOBAL-LABEL: @PreemptableAliasWrite1 dso_preemptable{{$}}
+; GLOBAL-NEXT: args uses:
+; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: allocas uses:
+; GLOBAL-NOT: ]:
+
+; GLOBAL-LABEL: @AliasToPreemptableAliasWrite1{{$}}
+; GLOBAL-NEXT: args uses:
+; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: allocas uses:
+; GLOBAL-NOT: ]:
+
+; GLOBAL-LABEL: @AliasWrite1{{$}}
+; GLOBAL-NEXT: args uses:
+; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: allocas uses:
+; GLOBAL-NOT: ]:
+
+; GLOBAL-LABEL: @BitcastAliasWrite1{{$}}
+; GLOBAL-NEXT: args uses:
+; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: allocas uses:
+; GLOBAL-NOT: ]:
+
+; GLOBAL-LABEL: @AliasToBitcastAliasWrite1{{$}}
+; GLOBAL-NEXT: args uses:
+; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: allocas uses:
+; GLOBAL-NOT: ]:
diff --git a/test/Analysis/StackSafetyAnalysis/ipa.ll b/test/Analysis/StackSafetyAnalysis/ipa.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6791dd0866b85bb1bf7d49b48d539e091406575f
--- /dev/null
+++ b/test/Analysis/StackSafetyAnalysis/ipa.ll
@@ -0,0 +1,448 @@
+; RUN: llvm-as %s -o %t0.bc
+; RUN: llvm-as %S/Inputs/ipa.ll -o %t1.bc
+; RUN: llvm-link -disable-lazy-loading %t0.bc %t1.bc -o %t.combined.bc
+; RUN: opt -S -analyze -stack-safety-local %t.combined.bc | FileCheck %s --check-prefixes=CHECK,LOCAL
+; RUN: opt -S -passes="print<stack-safety-local>" -disable-output %t.combined.bc 2>&1 | FileCheck %s --check-prefixes=CHECK,LOCAL
+; RUN: opt -S -analyze -stack-safety %t.combined.bc | FileCheck %s --check-prefixes=CHECK,GLOBAL
+; RUN: opt -S -passes="print-stack-safety" -disable-output %t.combined.bc 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @Write1(i8* %p)
+declare void @Write4(i8* %p)
+declare void @Write4_2(i8* %p, i8* %q)
+declare void @Write8(i8* %p)
+declare dso_local i8* @WriteAndReturn8(i8* %p)
+declare dso_local void @ExternalCall(i8* %p)
+declare void @PreemptableWrite1(i8* %p)
+declare void @InterposableWrite1(i8* %p)
+declare i8* @ReturnDependent(i8* %p)
+declare void @Rec2(i8* %p)
+declare void @RecursiveNoOffset(i32* %p, i32 %size, i32* %acc)
+declare void @RecursiveWithOffset(i32 %size, i32* %acc)
+
+; Basic out-of-bounds.
+define void @f1() {
+; CHECK-LABEL: @f1 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @Write8(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[4]: [0,8), @Write8(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void @Write8(i8* %x1)
+  ret void
+}
+
+; Basic in-bounds.
+define void @f2() {
+; CHECK-LABEL: @f2 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @Write1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[4]: [0,1), @Write1(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void @Write1(i8* %x1)
+  ret void
+}
+
+; Another basic in-bounds.
+define void @f3() {
+; CHECK-LABEL: @f3 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @Write4(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[4]: [0,4), @Write4(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void @Write4(i8* %x1)
+  ret void
+}
+
+; In-bounds with offset.
+define void @f4() {
+; CHECK-LABEL: @f4 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @Write1(arg0, [1,2)){{$}}
+; GLOBAL-NEXT: x[4]: [1,2), @Write1(arg0, [1,2)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 1
+  call void @Write1(i8* %x2)
+  ret void
+}
+
+; Out-of-bounds with offset.
+define void @f5() {
+; CHECK-LABEL: @f5 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: empty-set, @Write4(arg0, [1,2)){{$}}
+; GLOBAL-NEXT: [1,5), @Write4(arg0, [1,2)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 1
+  call void @Write4(i8* %x2)
+  ret void
+}
+
+; External call.
+define void @f6() {
+; CHECK-LABEL: @f6 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @ExternalCall(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[4]: full-set, @ExternalCall(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void @ExternalCall(i8* %x1)
+  ret void
+}
+
+; Call to dso_preemptable function
+define void @PreemptableCall() {
+; CHECK-LABEL: @PreemptableCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @PreemptableWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[4]: full-set, @PreemptableWrite1(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void @PreemptableWrite1(i8* %x1)
+  ret void
+}
+
+; Call to function with interposable linkage
+define void @InterposableCall() {
+; CHECK-LABEL: @InterposableCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @InterposableWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[4]: full-set, @InterposableWrite1(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void @InterposableWrite1(i8* %x1)
+  ret void
+}
+
+; Call to function with private linkage
+define void @PrivateCall() {
+; CHECK-LABEL: @PrivateCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @PrivateWrite1(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[4]: [0,1), @PrivateWrite1(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void @PrivateWrite1(i8* %x1)
+  ret void
+}
+
+define private void @PrivateWrite1(i8* %p) {
+; CHECK-LABEL: @PrivateWrite1{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: [0,1){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+entry:
+  store i8 0, i8* %p, align 1
+  ret void
+}
+
+; Caller returns a dependent value.
+; FIXME: alloca considered unsafe even if the return value is unused.
+define void @f7() {
+; CHECK-LABEL: @f7 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[4]: empty-set, @ReturnDependent(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[4]: full-set, @ReturnDependent(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = call i8* @ReturnDependent(i8* %x1)
+  ret void
+}
+
+define void @f8left() {
+; CHECK-LABEL: @f8left dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Rec2(arg0, [2,3)){{$}}
+; GLOBAL-NEXT: x[8]: [0,4), @Rec2(arg0, [2,3)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 2
+; 2 + [-2, 2) = [0, 4) => OK
+  call void @Rec2(i8* %x2)
+  ret void
+}
+
+define void @f8right() {
+; CHECK-LABEL: @f8right dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Rec2(arg0, [6,7)){{$}}
+; GLOBAL-NEXT: x[8]: [4,8), @Rec2(arg0, [6,7)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 6
+; 6 + [-2, 2) = [4, 8) => OK
+  call void @Rec2(i8* %x2)
+  ret void
+}
+
+define void @f8oobleft() {
+; CHECK-LABEL: @f8oobleft dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Rec2(arg0, [1,2)){{$}}
+; GLOBAL-NEXT: x[8]: [-1,3), @Rec2(arg0, [1,2)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 1
+; 1 + [-2, 2) = [-1, 3) => NOT OK
+  call void @Rec2(i8* %x2)
+  ret void
+}
+
+define void @f8oobright() {
+; CHECK-LABEL: @f8oobright dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Rec2(arg0, [7,8)){{$}}
+; GLOBAL-NEXT: x[8]: [5,9), @Rec2(arg0, [7,8)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 7
+; 7 + [-2, 2) = [5, 9) => NOT OK
+  call void @Rec2(i8* %x2)
+  ret void
+}
+
+define void @TwoArguments() {
+; CHECK-LABEL: @TwoArguments dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Write4_2(arg1, [0,1)), @Write4_2(arg0, [4,5)){{$}}
+; GLOBAL-NEXT: x[8]: [0,8), @Write4_2(arg1, [0,1)), @Write4_2(arg0, [4,5)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 4
+  call void @Write4_2(i8* %x2, i8* %x1)
+  ret void
+}
+
+define void @TwoArgumentsOOBOne() {
+; CHECK-LABEL: @TwoArgumentsOOBOne dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Write4_2(arg1, [0,1)), @Write4_2(arg0, [5,6)){{$}}
+; GLOBAL-NEXT: x[8]: [0,9), @Write4_2(arg1, [0,1)), @Write4_2(arg0, [5,6)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 5
+  call void @Write4_2(i8* %x2, i8* %x1)
+  ret void
+}
+
+define void @TwoArgumentsOOBOther() {
+; CHECK-LABEL: @TwoArgumentsOOBOther dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Write4_2(arg1, [-1,0)), @Write4_2(arg0, [4,5)){{$}}
+; GLOBAL-NEXT: x[8]: [-1,8), @Write4_2(arg1, [-1,0)), @Write4_2(arg0, [4,5)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x0 = bitcast i64* %x to i8*
+  %x1 = getelementptr i8, i8* %x0, i64 -1
+  %x2 = getelementptr i8, i8* %x0, i64 4
+  call void @Write4_2(i8* %x2, i8* %x1)
+  ret void
+}
+
+define void @TwoArgumentsOOBBoth() {
+; CHECK-LABEL: @TwoArgumentsOOBBoth dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Write4_2(arg1, [-1,0)), @Write4_2(arg0, [5,6)){{$}}
+; GLOBAL-NEXT: x[8]: [-1,9), @Write4_2(arg1, [-1,0)), @Write4_2(arg0, [5,6)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x0 = bitcast i64* %x to i8*
+  %x1 = getelementptr i8, i8* %x0, i64 -1
+  %x2 = getelementptr i8, i8* %x0, i64 5
+  call void @Write4_2(i8* %x2, i8* %x1)
+  ret void
+}
+
+define i32 @TestRecursiveNoOffset(i32* %p, i32 %size) {
+; CHECK-LABEL: @TestRecursiveNoOffset dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; LOCAL-NEXT: p[]: empty-set, @RecursiveNoOffset(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: p[]: full-set, @RecursiveNoOffset(arg0, [0,1)){{$}}
+; CHECK-NEXT: size[]: empty-set, @RecursiveNoOffset(arg1, [0,1)){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: sum[4]: [0,4), @RecursiveNoOffset(arg2, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %sum = alloca i32, align 4
+  %0 = bitcast i32* %sum to i8*
+  store i32 0, i32* %sum, align 4
+  call void @RecursiveNoOffset(i32* %p, i32 %size, i32* %sum)
+  %1 = load i32, i32* %sum, align 4
+  ret i32 %1
+}
+
+define void @TestRecursiveWithOffset(i32 %size) {
+; CHECK-LABEL: @TestRecursiveWithOffset dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: size[]: empty-set, @RecursiveWithOffset(arg0, [0,1)){{$}}
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: sum[64]: empty-set, @RecursiveWithOffset(arg1, [0,1)){{$}}
+; GLOBAL-NEXT: sum[64]: full-set, @RecursiveWithOffset(arg1, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %sum = alloca i32, i64 16, align 4
+  call void @RecursiveWithOffset(i32 %size, i32* %sum)
+  ret void
+}
+
+; FIXME: IPA should detect that access is safe
+define void @TestUpdateArg() {
+; CHECK-LABEL: @TestUpdateArg dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[16]: empty-set, @WriteAndReturn8(arg0, [0,1)){{$}}
+; GLOBAL-NEXT: x[16]: full-set, @WriteAndReturn8(arg0, [0,1)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i8, i64 16, align 4
+  %0 = call i8* @WriteAndReturn8(i8* %x)
+  ret void
+}
+
+; The rest is from Inputs/ipa.ll
+
+; CHECK-LABEL: @Write1{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: [0,1){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @Write4{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: [0,4){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @Write4_2{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: [0,4){{$}}
+; CHECK-NEXT: q[]: [0,4){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @Write8{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: [0,8){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @WriteAndReturn8{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: full-set{{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @PreemptableWrite1 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: [0,1){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @InterposableWrite1 interposable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: [0,1){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @ReturnDependent{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: full-set{{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @Rec0{{$}}
+; CHECK-NEXT: args uses:
+; LOCAL-NEXT: p[]: empty-set, @Write4(arg0, [2,3)){{$}}
+; GLOBAL-NEXT: p[]: [2,6), @Write4(arg0, [2,3)){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @Rec1{{$}}
+; CHECK-NEXT: args uses:
+; LOCAL-NEXT: p[]: empty-set, @Rec0(arg0, [1,2)){{$}}
+; GLOBAL-NEXT: p[]: [3,7), @Rec0(arg0, [1,2)){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @Rec2{{$}}
+; CHECK-NEXT: args uses:
+; LOCAL-NEXT: p[]: empty-set, @Rec1(arg0, [-5,-4)){{$}}
+; GLOBAL-NEXT: p[]: [-2,2), @Rec1(arg0, [-5,-4)){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @RecursiveNoOffset{{$}}
+; CHECK-NEXT: args uses:
+; LOCAL-NEXT: p[]: [0,4), @RecursiveNoOffset(arg0, [4,5)){{$}}
+; GLOBAL-NEXT: p[]: full-set, @RecursiveNoOffset(arg0, [4,5)){{$}}
+; CHECK-NEXT: size[]: empty-set, @RecursiveNoOffset(arg1, [4294967295,4294967296)){{$}}
+; CHECK-NEXT: acc[]: [0,4), @RecursiveNoOffset(arg2, [0,1)){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+
+; CHECK-LABEL: @RecursiveWithOffset{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: size[]: empty-set, @RecursiveWithOffset(arg0, [4294967295,4294967296)){{$}}
+; LOCAL-NEXT: acc[]: [0,4), @RecursiveWithOffset(arg1, [4,5)){{$}}
+; GLOBAL-NEXT: acc[]: full-set, @RecursiveWithOffset(arg1, [4,5)){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
diff --git a/test/Analysis/StackSafetyAnalysis/local.ll b/test/Analysis/StackSafetyAnalysis/local.ll
new file mode 100644
index 0000000000000000000000000000000000000000..814b97d29638493eeba1b728da03a55b3c0bae06
--- /dev/null
+++ b/test/Analysis/StackSafetyAnalysis/local.ll
@@ -0,0 +1,351 @@
+; RUN: opt -S -analyze -stack-safety-local < %s | FileCheck %s --check-prefixes=CHECK,LOCAL
+; RUN: opt -S -passes="print<stack-safety-local>" -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LOCAL
+; RUN: opt -S -analyze -stack-safety < %s | FileCheck %s --check-prefixes=CHECK,GLOBAL
+; RUN: opt -S -passes="print-stack-safety" -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@sink = global i8* null, align 8
+
+declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i1 %isvolatile)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 %isvolatile)
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 %isvolatile)
+
+; Address leaked.
+define void @LeakAddress() {
+; CHECK-LABEL: @LeakAddress dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: full-set{{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  store i8* %x1, i8** @sink, align 8
+  ret void
+}
+
+define void @StoreInBounds() {
+; CHECK-LABEL: @StoreInBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,1){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  store i8 0, i8* %x1, align 1
+  ret void
+}
+
+define void @StoreInBounds2() {
+; CHECK-LABEL: @StoreInBounds2 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,4){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  store i32 0, i32* %x, align 4
+  ret void
+}
+
+define void @StoreInBounds3() {
+; CHECK-LABEL: @StoreInBounds3 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [2,3){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 2
+  store i8 0, i8* %x2, align 1
+  ret void
+}
+
+; FIXME: ScalarEvolution does not look through ptrtoint/inttoptr.
+define void @StoreInBounds4() {
+; CHECK-LABEL: @StoreInBounds4 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [2,-1){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = ptrtoint i32* %x to i64
+  %x2 = add i64 %x1, 2
+  %x3 = inttoptr i64 %x2 to i8*
+  store i8 0, i8* %x3, align 1
+  ret void
+}
+
+define void @StoreOutOfBounds() {
+; CHECK-LABEL: @StoreOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [2,6){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 2
+  %x3 = bitcast i8* %x2 to i32*
+  store i32 0, i32* %x3, align 1
+  ret void
+}
+
+; There is no difference in load vs store handling.
+define void @LoadInBounds() {
+; CHECK-LABEL: @LoadInBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,1){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %v = load i8, i8* %x1, align 1
+  ret void
+}
+
+define void @LoadOutOfBounds() {
+; CHECK-LABEL: @LoadOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [2,6){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 2
+  %x3 = bitcast i8* %x2 to i32*
+  %v = load i32, i32* %x3, align 1
+  ret void
+}
+
+; Leak through ret.
+define i8* @Ret() {
+; CHECK-LABEL: @Ret dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: full-set{{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 2
+  ret i8* %x2
+}
+
+declare void @Foo(i16* %p)
+
+define void @DirectCall() {
+; CHECK-LABEL: @DirectCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[8]: empty-set, @Foo(arg0, [2,3)){{$}}
+; GLOBAL-NEXT: x[8]: full-set, @Foo(arg0, [2,3)){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i16*
+  %x2 = getelementptr i16, i16* %x1, i64 1
+  call void @Foo(i16* %x2);
+  ret void
+}
+
+; Indirect calls can not be analyzed (yet).
+; FIXME: %p[]: full-set looks invalid
+define void @IndirectCall(void (i8*)* %p) {
+; CHECK-LABEL: @IndirectCall dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: p[]: full-set{{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: full-set{{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void %p(i8* %x1);
+  ret void
+}
+
+define void @NonConstantOffset(i1 zeroext %z) {
+; CHECK-LABEL: @NonConstantOffset dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: z[]: full-set{{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,4){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %idx = select i1 %z, i64 1, i64 2
+  %x2 = getelementptr i8, i8* %x1, i64 %idx
+  store i8 0, i8* %x2, align 1
+  ret void
+}
+
+define void @NonConstantOffsetOOB(i1 zeroext %z) {
+; CHECK-LABEL: @NonConstantOffsetOOB dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: z[]: full-set{{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,6){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %idx = select i1 %z, i64 1, i64 4
+  %x2 = getelementptr i8, i8* %x1, i64 %idx
+  store i8 0, i8* %x2, align 1
+  ret void
+}
+
+define void @ArrayAlloca() {
+; CHECK-LABEL: @ArrayAlloca dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[40]: [36,40){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, i32 10, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 36
+  %x3 = bitcast i8* %x2 to i32*
+  store i32 0, i32* %x3, align 1
+  ret void
+}
+
+define void @ArrayAllocaOOB() {
+; CHECK-LABEL: @ArrayAllocaOOB dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[40]: [37,41){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, i32 10, align 4
+  %x1 = bitcast i32* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 37
+  %x3 = bitcast i8* %x2 to i32*
+  store i32 0, i32* %x3, align 1
+  ret void
+}
+
+define void @DynamicAllocaUnused(i64 %size) {
+; CHECK-LABEL: @DynamicAllocaUnused dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: size[]: empty-set{{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[0]: empty-set{{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, i64 %size, align 16
+  ret void
+}
+
+; Dynamic alloca with unknown size.
+define void @DynamicAlloca(i64 %size) {
+; CHECK-LABEL: @DynamicAlloca dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: size[]: [0,-12){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[0]: [0,4){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, i64 %size, align 16
+  store i32 0, i32* %x, align 1
+  ret void
+}
+
+; Dynamic alloca with limited size.
+; FIXME: could be proved safe. Implement.
+define void @DynamicAllocaFiniteSizeRange(i1 zeroext %z) {
+; CHECK-LABEL: @DynamicAllocaFiniteSizeRange dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: z[]: [0,-12){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[0]: [0,4){{$}}
+; CHECK-NOT: ]:
+entry:
+  %size = select i1 %z, i64 3, i64 5
+  %x = alloca i32, i64 %size, align 16
+  store i32 0, i32* %x, align 1
+  ret void
+}
+
+define signext i8 @SimpleLoop() {
+; CHECK-LABEL: @SimpleLoop dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[10]: [0,10){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca [10 x i8], align 1
+  %0 = getelementptr inbounds [10 x i8], [10 x i8]* %x, i64 0, i64 0
+  %lftr.limit = getelementptr inbounds [10 x i8], [10 x i8]* %x, i64 0, i64 10
+  br label %for.body
+
+for.body:
+  %sum.010 = phi i8 [ 0, %entry ], [ %add, %for.body ]
+  %p.09 = phi i8* [ %0, %entry ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %p.09, i64 1
+  %1 = load volatile i8, i8* %p.09, align 1
+  %add = add i8 %1, %sum.010
+  %exitcond = icmp eq i8* %incdec.ptr, %lftr.limit
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i8 %add
+}
+
+; OOB in a loop.
+define signext i8 @SimpleLoopOOB() {
+; CHECK-LABEL: @SimpleLoopOOB dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[10]: [0,11){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca [10 x i8], align 1
+  %0 = getelementptr inbounds [10 x i8], [10 x i8]* %x, i64 0, i64 0
+ ; 11 iterations
+  %lftr.limit = getelementptr inbounds [10 x i8], [10 x i8]* %x, i64 0, i64 11
+  br label %for.body
+
+for.body:
+  %sum.010 = phi i8 [ 0, %entry ], [ %add, %for.body ]
+  %p.09 = phi i8* [ %0, %entry ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %p.09, i64 1
+  %1 = load volatile i8, i8* %p.09, align 1
+  %add = add i8 %1, %sum.010
+  %exitcond = icmp eq i8* %incdec.ptr, %lftr.limit
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i8 %add
+}
+
+; FIXME: we don't understand that %sz in the memset call is limited to 128 by the preceding check.
+define dso_local void @SizeCheck(i32 %sz) {
+; CHECK-LABEL: @SizeCheck{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: sz[]: [0,1){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x1[128]: full-set{{$}}
+; CHECK-NOT: ]:
+entry:
+  %x1 = alloca [128 x i8], align 16
+  %x1.sub = getelementptr inbounds [128 x i8], [128 x i8]* %x1, i64 0, i64 0
+  %cmp = icmp slt i32 %sz, 129
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  call void @llvm.memset.p0i8.i32(i8* nonnull align 16 %x1.sub, i8 0, i32 %sz, i1 false)
+  br label %if.end
+
+if.end:
+  ret void
+}
diff --git a/test/Analysis/StackSafetyAnalysis/memintrin.ll b/test/Analysis/StackSafetyAnalysis/memintrin.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2eea9ea74bdf9fce190d63b5bbf4d111bd9367ee
--- /dev/null
+++ b/test/Analysis/StackSafetyAnalysis/memintrin.ll
@@ -0,0 +1,202 @@
+; RUN: opt -S -analyze -stack-safety-local < %s | FileCheck %s
+; RUN: opt -S -passes="print<stack-safety-local>" -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -S -analyze -stack-safety < %s | FileCheck %s
+; RUN: opt -S -passes="print-stack-safety" -disable-output < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i1 %isvolatile)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 %isvolatile)
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 %isvolatile)
+
+define void @MemsetInBounds() {
+; CHECK-LABEL: MemsetInBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,4){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 4, i1 false)
+  ret void
+}
+
+; Volatile does not matter for access bounds.
+define void @VolatileMemsetInBounds() {
+; CHECK-LABEL: VolatileMemsetInBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,4){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+    %x1 = bitcast i32* %x to i8*
+  call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 4, i1 true)
+  ret void
+}
+
+define void @MemsetOutOfBounds() {
+; CHECK-LABEL: MemsetOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,5){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+    %x1 = bitcast i32* %x to i8*
+  call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 5, i1 false)
+  ret void
+}
+
+define void @MemsetNonConst(i32 %size) {
+; CHECK-LABEL: MemsetNonConst dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: size[]: [0,1){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: full-set{{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+    %x1 = bitcast i32* %x to i8*
+  call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 %size, i1 false)
+  ret void
+}
+
+; FIXME: memintrinsics should look at size range when possible
+; Right now we refuse any non-constant size.
+define void @MemsetNonConstInBounds(i1 zeroext %z) {
+; CHECK-LABEL: MemsetNonConstInBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: z[]: [0,1){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: full-set{{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %size = select i1 %z, i32 3, i32 4
+  call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 %size, i1 false)
+  ret void
+}
+
+define void @MemcpyInBounds() {
+; CHECK-LABEL: MemcpyInBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,4){{$}}
+; CHECK-NEXT: y[4]: [0,4){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %y = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %y1 = bitcast i32* %y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %y1, i32 4, i1 false)
+  ret void
+}
+
+define void @MemcpySrcOutOfBounds() {
+; CHECK-LABEL: MemcpySrcOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[8]: [0,5){{$}}
+; CHECK-NEXT: y[4]: [0,5){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %y = alloca i32, align 4
+  %x1 = bitcast i64* %x to i8*
+  %y1 = bitcast i32* %y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %y1, i32 5, i1 false)
+  ret void
+}
+
+define void @MemcpyDstOutOfBounds() {
+; CHECK-LABEL: MemcpyDstOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,5){{$}}
+; CHECK-NEXT: y[8]: [0,5){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %y = alloca i64, align 4
+  %x1 = bitcast i32* %x to i8*
+  %y1 = bitcast i64* %y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %y1, i32 5, i1 false)
+  ret void
+}
+
+define void @MemcpyBothOutOfBounds() {
+; CHECK-LABEL: MemcpyBothOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,9){{$}}
+; CHECK-NEXT: y[8]: [0,9){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %y = alloca i64, align 4
+  %x1 = bitcast i32* %x to i8*
+  %y1 = bitcast i64* %y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %y1, i32 9, i1 false)
+  ret void
+}
+
+define void @MemcpySelfInBounds() {
+; CHECK-LABEL: MemcpySelfInBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[8]: [0,8){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 5
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %x2, i32 3, i1 false)
+  ret void
+}
+
+define void @MemcpySelfSrcOutOfBounds() {
+; CHECK-LABEL: MemcpySelfSrcOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[8]: [0,9){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 5
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %x2, i32 4, i1 false)
+  ret void
+}
+
+define void @MemcpySelfDstOutOfBounds() {
+; CHECK-LABEL: MemcpySelfDstOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[8]: [0,9){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 5
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x2, i8* %x1, i32 4, i1 false)
+  ret void
+}
+
+define void @MemmoveSelfBothOutOfBounds() {
+; CHECK-LABEL: MemmoveSelfBothOutOfBounds dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[8]: [0,14){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i64, align 4
+  %x1 = bitcast i64* %x to i8*
+  %x2 = getelementptr i8, i8* %x1, i64 5
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %x1, i8* %x2, i32 9, i1 false)
+  ret void
+}
diff --git a/test/Analysis/StackSafetyAnalysis/scev-udiv.ll b/test/Analysis/StackSafetyAnalysis/scev-udiv.ll
new file mode 100644
index 0000000000000000000000000000000000000000..95a663a11fcc232ae28232b10e170b4220feff67
--- /dev/null
+++ b/test/Analysis/StackSafetyAnalysis/scev-udiv.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -analyze -stack-safety-local < %s | FileCheck %s --check-prefixes=CHECK,LOCAL
+; RUN: opt -S -passes="print<stack-safety-local>" -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LOCAL
+; RUN: opt -S -analyze -stack-safety < %s | FileCheck %s --check-prefixes=CHECK,GLOBAL
+; RUN: opt -S -passes="print-stack-safety" -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL
+
+; Regression test that exercises a case when a AllocaOffsetRewritten SCEV
+; could return an empty-set range. This could occur with udiv SCEVs where the
+; RHS was re-written to 0.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @ExternalFn(i64)
+
+define void @Test1() {
+; CHECK-LABEL: @Test1 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[1]: empty-set, @Divide1(arg0, full-set){{$}}
+; GLOBAL-NEXT: x[1]: full-set, @Divide1(arg0, full-set){{$}}
+; CHECK-NOT: ]:
+  %x = alloca i8
+  %int = ptrtoint i8* %x to i64
+  call void @Divide1(i64 %int)
+  ret void
+}
+
+define dso_local void @Divide1(i64 %arg) {
+; CHECK-LABEL: @Divide1{{$}}
+; CHECK-NEXT: args uses:
+; LOCAL-NEXT: arg[]: empty-set, @ExternalFn(arg0, full-set){{$}}
+; GLOBAL-NEXT: arg[]: full-set, @ExternalFn(arg0, full-set){{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+  %quotient = udiv i64 undef, %arg
+  call void @ExternalFn(i64 %quotient)
+  unreachable
+}
+
+define void @Test2(i64 %arg) {
+; CHECK-LABEL: @Test2 dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: arg[]: empty-set{{$}}
+; CHECK-NEXT: allocas uses:
+; LOCAL-NEXT: x[1]: empty-set, @Divide2(arg0, full-set){{$}}
+; GLOBAL-NEXT: x[1]: full-set, @Divide2(arg0, full-set){{$}}
+; CHECK-NOT: ]:
+  %x = alloca i8
+  %int = ptrtoint i8* %x to i64
+  call void @Divide2(i64 %int)
+  ret void
+}
+
+define dso_local void @Divide2(i64 %arg) {
+; CHECK-LABEL: @Divide2{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: arg[]: full-set{{$}}
+; CHECK-NEXT: allocas uses:
+; CHECK-NOT: ]:
+  %x = inttoptr i64 %arg to i8*
+  %quotient = udiv i64 undef, %arg
+  %arrayidx = getelementptr i8, i8* %x, i64 %quotient
+  load i8, i8* %arrayidx
+  unreachable
+}
diff --git a/test/Assembler/2004-03-07-FunctionAddressAlignment.ll b/test/Assembler/2004-03-07-FunctionAddressAlignment.ll
deleted file mode 100644
index 7fa0802db405c9852be6867c60785bf849fcc49a..0000000000000000000000000000000000000000
--- a/test/Assembler/2004-03-07-FunctionAddressAlignment.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis | not grep ptrtoint
-; RUN: verify-uselistorder %s
-; All of these should be eliminable
-
-
-define i32 @foo() {
-	ret i32 and (i32 ptrtoint (i32()* @foo to i32), i32 1)
-}
-
-define i32 @foo2() {
-	ret i32 and (i32 1, i32 ptrtoint (i32()* @foo2 to i32))
-}
-
-define i1 @foo3() {
-	ret i1 icmp ne (i1()* @foo3, i1()* null)
-}
diff --git a/test/Assembler/debug-info-source-invalid.ll b/test/Assembler/debug-info-source-invalid.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d746e9e25fc3d12556d231d521cf82a03c04461a
--- /dev/null
+++ b/test/Assembler/debug-info-source-invalid.ll
@@ -0,0 +1,27 @@
+; RUN: llvm-as < %s 2>&1 >/dev/null | FileCheck %s
+
+; Ensure we reject debug info where the DIFiles of a DICompileUnit mix source
+; and no-source.
+
+define dso_local void @foo() !dbg !5 {
+  ret void
+}
+
+define dso_local void @bar() !dbg !6 {
+  ret void
+}
+
+!llvm.dbg.cu = !{!4}
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 2, !"Dwarf Version", i32 5}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+
+!2 = !DIFile(filename: "foo.c", directory: "dir", source: "void foo() { }\0A")
+; CHECK: inconsistent use of embedded source
+; CHECK: warning: ignoring invalid debug info
+!3 = !DIFile(filename: "bar.h", directory: "dir")
+
+!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2)
+!5 = distinct !DISubprogram(name: "foo", file: !2, unit: !4)
+!6 = distinct !DISubprogram(name: "bar", file: !3, unit: !4)
diff --git a/test/Assembler/debug-info-source.ll b/test/Assembler/debug-info-source.ll
new file mode 100644
index 0000000000000000000000000000000000000000..381603ef35c38399164625dde6e0c2ebac2d0706
--- /dev/null
+++ b/test/Assembler/debug-info-source.ll
@@ -0,0 +1,41 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; Ensure we accept debug info where DIFiles within a DICompileUnit either all
+; have source, or none have source.
+
+define dso_local void @foo() !dbg !6 {
+  ret void
+}
+
+define dso_local void @bar() !dbg !7 {
+  ret void
+}
+
+define dso_local void @baz() !dbg !9 {
+  ret void
+}
+
+define dso_local void @qux() !dbg !11 {
+  ret void
+}
+
+!llvm.dbg.cu = !{!0, !2}
+!llvm.module.flags = !{!4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1)
+; CHECK: !1 = !DIFile(filename: "foo.c", directory: "dir", source: "void foo() { }\0A")
+!1 = !DIFile(filename: "foo.c", directory: "dir", source: "void foo() { }\0A")
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3)
+; CHECK: !3 = !DIFile(filename: "qux.h", directory: "dir")
+!3 = !DIFile(filename: "qux.h", directory: "dir")
+!4 = !{i32 2, !"Dwarf Version", i32 5}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", file: !1, unit: !0)
+!7 = distinct !DISubprogram(name: "bar", file: !8, unit: !0)
+; CHECK: !8 = !DIFile(filename: "bar.h", directory: "dir", source: "void bar() { }\0A")
+!8 = !DIFile(filename: "bar.h", directory: "dir", source: "void bar() { }\0A")
+!9 = distinct !DISubprogram(name: "baz", file: !10, unit: !2)
+; CHECK: !10 = !DIFile(filename: "baz.c", directory: "dir")
+!10 = !DIFile(filename: "baz.c", directory: "dir")
+!11 = distinct !DISubprogram(name: "qux", file: !3, unit: !2)
diff --git a/test/Assembler/disubprogram.ll b/test/Assembler/disubprogram.ll
index a407f3da8bd8d5747fb35e9a355e280e23ffe8c4..df97efb417c7a85051ba8402ef4f48919a5fa065 100644
--- a/test/Assembler/disubprogram.ll
+++ b/test/Assembler/disubprogram.ll
@@ -6,8 +6,8 @@ define void @_Z3foov() !dbg !9 {
   ret void
 }
 
-; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14}
-!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14}
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
 
 !0 = !{null}
 !1 = distinct !DICompositeType(tag: DW_TAG_structure_type)
@@ -17,7 +17,7 @@ define void @_Z3foov() !dbg !9 {
 !5 = distinct !{}
 !6 = distinct !{}
 
-; CHECK: !7 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !8)
+; CHECK: !7 = distinct !DISubprogram(scope: null, spFlags: DISPFlagDefinition, unit: !8)
 !7 = distinct !DISubprogram(unit: !8)
 
 !8 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
@@ -25,10 +25,10 @@ define void @_Z3foov() !dbg !9 {
                              isOptimized: true, flags: "-O2",
                              splitDebugFilename: "abc.debug", emissionKind: 2)
 
-; CHECK: !9 = !DISubprogram(scope: null, isLocal: false, isDefinition: false, isOptimized: false)
+; CHECK: !9 = !DISubprogram(scope: null, spFlags: 0)
 !9 = !DISubprogram(isDefinition: false)
 
-; CHECK: !10 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, file: !2, line: 7, type: !3, isLocal: true, isDefinition: true, scopeLine: 8, containingType: !4, virtuality: DW_VIRTUALITY_pure_virtual, virtualIndex: 10, thisAdjustment: 3, flags: DIFlagPrototyped | DIFlagNoReturn, isOptimized: true, unit: !8, templateParams: !5, declaration: !9, retainedNodes: !6)
+; CHECK: !10 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, file: !2, line: 7, type: !3, scopeLine: 8, containingType: !4, virtualIndex: 10, thisAdjustment: 3, flags: DIFlagPrototyped | DIFlagNoReturn, spFlags: DISPFlagPureVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !8, templateParams: !5, declaration: !9, retainedNodes: !6)
 !10 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
                             file: !2, line: 7, type: !3, isLocal: true,
                             isDefinition: true, scopeLine: 8,
@@ -63,12 +63,21 @@ define void @_Z3foov() !dbg !9 {
 
 !13 = !{!4}
 ; CHECK: !13 = !{!4}
-; CHECK: !14 = distinct !DISubprogram(name: "foo", scope: !1, file: !2, line: 1, type: !3, isLocal: true, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !8, thrownTypes: !13)
+; CHECK: !14 = distinct !DISubprogram(name: "foo", scope: !1, file: !2, line: 1, type: !3, scopeLine: 2, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !8, thrownTypes: !13)
 !14 = distinct !DISubprogram(name: "foo", scope: !1,
                             file: !2, line: 1, type: !3, isLocal: true,
                             isDefinition: true, scopeLine: 2, isOptimized: false,
                             unit: !8, thrownTypes: !13)
 
-!15 = !{i32 1, !"Debug Info Version", i32 3}
-!llvm.module.flags = !{!15}
+; CHECK: !15 = distinct !DISubprogram({{.*}}, flags: DIFlagPrototyped, spFlags: DISPFlagPureVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized,
+!15 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                             file: !2, line: 7, type: !3, scopeLine: 8,
+							 containingType: !4, virtualIndex: 0,
+							 flags: DIFlagPrototyped,
+							 spFlags: DISPFlagPureVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized,
+							 unit: !8, templateParams: !5, declaration: !9,
+							 retainedNodes: !6)
+
+!16 = !{i32 1, !"Debug Info Version", i32 3}
+!llvm.module.flags = !{!16}
 !llvm.dbg.cu = !{!8}
diff --git a/test/Assembler/fast-math-flags.ll b/test/Assembler/fast-math-flags.ll
index edff26e6d685f98a9af7671898428db56c6be760..1af2320f0a108658d90ee21c94e57970a6c9c8f9 100644
--- a/test/Assembler/fast-math-flags.ll
+++ b/test/Assembler/fast-math-flags.ll
@@ -38,8 +38,12 @@ entry:
   %e = frem float %x, %y
 ; CHECK:  %e_vec = frem <3 x float> %vec, %vec
   %e_vec = frem <3 x float> %vec, %vec
-; CHECK:  ret float %e
-  ret float %e
+; CHECK:  %f = fneg float %x
+  %f = fneg float %x
+; CHECK:  %f_vec = fneg <3 x float> %vec
+  %f_vec = fneg <3 x float> %vec
+; CHECK:  ret float %f
+  ret  float %f
 }
 
 ; CHECK: no_nan
@@ -72,8 +76,12 @@ entry:
   %e = frem nnan float %x, %y
 ; CHECK:  %e_vec = frem nnan <3 x float> %vec, %vec
   %e_vec = frem nnan <3 x float> %vec, %vec
-; CHECK:  ret float %e
-  ret float %e
+; CHECK:  %f = fneg nnan float %x
+  %f = fneg nnan float %x
+; CHECK:  %f_vec = fneg nnan <3 x float> %vec
+  %f_vec = fneg nnan <3 x float> %vec
+; CHECK:  ret float %f
+  ret float %f
 }
 
 ; CHECK: @contract(
@@ -174,6 +182,10 @@ entry:
   %e = frem nnan nsz float %x, %y
 ; CHECK:  %e_vec = frem nnan <3 x float> %vec, %vec
   %e_vec = frem nnan <3 x float> %vec, %vec
-; CHECK:  ret float %e
-  ret float %e
+; CHECK:  %f = fneg nnan nsz float %x
+  %f = fneg nnan nsz float %x
+; CHECK:  %f_vec = fneg fast <3 x float> %vec
+  %f_vec = fneg fast <3 x float> %vec
+; CHECK:  ret float %f
+  ret float %f
 }
diff --git a/test/Assembler/invalid-disubprogram-uniqued-definition.ll b/test/Assembler/invalid-disubprogram-uniqued-definition.ll
index c146883d6649f7567e0d94ff433a8d25550fbe90..6641e6dc21d72a07dfe9827c0f465285b08e68dd 100644
--- a/test/Assembler/invalid-disubprogram-uniqued-definition.ll
+++ b/test/Assembler/invalid-disubprogram-uniqued-definition.ll
@@ -1,4 +1,4 @@
 ; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
 
-; CHECK: <stdin>:[[@LINE+1]]:6: error: missing 'distinct', required for !DISubprogram when 'isDefinition'
+; CHECK: <stdin>:[[@LINE+1]]:6: error: missing 'distinct', required for !DISubprogram that is a Definition
 !0 = !DISubprogram(isDefinition: true)
diff --git a/test/Assembler/thinlto-summary.ll b/test/Assembler/thinlto-summary.ll
index 64af835ae2b101bde9c5ea62673670588060d3cc..4d1c0dd74d2bb0289ceb01f5f58a532e76ad5c3f 100644
--- a/test/Assembler/thinlto-summary.ll
+++ b/test/Assembler/thinlto-summary.ll
@@ -9,7 +9,7 @@
 ; Check a function that makes several calls with various profile hotness, and a
 ; reference (also tests forward references to function and variables in calls
 ; and refs).
-^2 = gv: (guid: 1, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15, hotness: hot), (callee: ^17, hotness: cold), (callee: ^16, hotness: none)), refs: (^13))))
+^2 = gv: (guid: 1, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15, hotness: hot), (callee: ^17, hotness: cold), (callee: ^16, hotness: none)), refs: (readonly ^13, ^14))))
 
 ; Function with a call that has relative block frequency instead of profile
 ; hotness.
@@ -24,16 +24,16 @@
 ^8 = gv: (guid: 7, summaries: (function: (module: ^0, flags: (linkage: linkonce_odr, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1)))
 ^9 = gv: (guid: 8, summaries: (function: (module: ^0, flags: (linkage: weak_odr, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1)))
 ^10 = gv: (guid: 9, summaries: (function: (module: ^0, flags: (linkage: weak, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1)))
-^11 = gv: (guid: 10, summaries: (variable: (module: ^0, flags: (linkage: common, notEligibleToImport: 0, live: 0, dsoLocal: 0))))
+^11 = gv: (guid: 10, summaries: (variable: (module: ^0, flags: (linkage: common, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 0))))
 ; Test appending globel variable with reference (tests backward reference on
 ; refs).
-^12 = gv: (guid: 11, summaries: (variable: (module: ^0, flags: (linkage: appending, notEligibleToImport: 0, live: 0, dsoLocal: 0), refs: (^4))))
+^12 = gv: (guid: 11, summaries: (variable: (module: ^0, flags: (linkage: appending, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 0), refs: (^4))))
 
 ; Test a referenced global variable.
-^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0))))
+^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 1))))
 
 ; Test a dsoLocal variable.
-^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1))))
+^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1), varFlags: (readonly: 0))))
 
 ; Functions with various flag combinations (notEligibleToImport, Live,
 ; combinations of optional function flags).
@@ -67,7 +67,7 @@
 ; Make sure we get back from llvm-dis essentially what we put in via llvm-as.
 ; CHECK: ^0 = module: (path: "thinlto-summary1.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049))
 ; CHECK: ^1 = module: (path: "thinlto-summary2.o", hash: (2998369023, 4283347029, 1195487472, 2757298015, 1852134156))
-; CHECK: ^2 = gv: (guid: 1, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15, hotness: hot), (callee: ^17, hotness: cold), (callee: ^16, hotness: none)), refs: (^13))))
+; CHECK: ^2 = gv: (guid: 1, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15, hotness: hot), (callee: ^17, hotness: cold), (callee: ^16, hotness: none)), refs: (^14, readonly ^13))))
 ; CHECK: ^3 = gv: (guid: 2, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15)))))
 ; CHECK: ^4 = gv: (guid: 3, summaries: (function: (module: ^0, flags: (linkage: internal, notEligibleToImport: 0, live: 0, dsoLocal: 1), insts: 1)))
 ; CHECK: ^5 = gv: (guid: 4, summaries: (alias: (module: ^0, flags: (linkage: private, notEligibleToImport: 0, live: 0, dsoLocal: 1), aliasee: ^14)))
@@ -76,10 +76,10 @@
 ; CHECK: ^8 = gv: (guid: 7, summaries: (function: (module: ^0, flags: (linkage: linkonce_odr, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1)))
 ; CHECK: ^9 = gv: (guid: 8, summaries: (function: (module: ^0, flags: (linkage: weak_odr, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1)))
 ; CHECK: ^10 = gv: (guid: 9, summaries: (function: (module: ^0, flags: (linkage: weak, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1)))
-; CHECK: ^11 = gv: (guid: 10, summaries: (variable: (module: ^0, flags: (linkage: common, notEligibleToImport: 0, live: 0, dsoLocal: 0))))
-; CHECK: ^12 = gv: (guid: 11, summaries: (variable: (module: ^0, flags: (linkage: appending, notEligibleToImport: 0, live: 0, dsoLocal: 0), refs: (^4))))
-; CHECK: ^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0))))
-; CHECK: ^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1))))
+; CHECK: ^11 = gv: (guid: 10, summaries: (variable: (module: ^0, flags: (linkage: common, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 0))))
+; CHECK: ^12 = gv: (guid: 11, summaries: (variable: (module: ^0, flags: (linkage: appending, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 0), refs: (^4))))
+; CHECK: ^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 1))))
+; CHECK: ^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1), varFlags: (readonly: 0))))
 ; CHECK: ^15 = gv: (guid: 14, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 1, live: 1, dsoLocal: 0), insts: 1)))
 ; CHECK: ^16 = gv: (guid: 15, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 1, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0))))
 ; CHECK: ^17 = gv: (guid: 16, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 0, readOnly: 1, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0), calls: ((callee: ^15)))))
diff --git a/test/Bindings/llvm-c/debug_info.ll b/test/Bindings/llvm-c/debug_info.ll
index 46fb0dd16ced19b4f9dbe89de087b6f0305a892e..84635f041a31976bc5b5bd5359efacfe522a7d4f 100644
--- a/test/Bindings/llvm-c/debug_info.ll
+++ b/test/Bindings/llvm-c/debug_info.ll
@@ -43,7 +43,7 @@
 ; CHECK-NEXT: !17 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !18, file: !1, size: 192, elements: !19, runtimeLang: DW_LANG_C89, identifier: "MyStruct")
 ; CHECK-NEXT: !18 = !DINamespace(name: "NameSpace", scope: !6)
 ; CHECK-NEXT: !19 = !{!11, !11, !11}
-; CHECK-NEXT: !20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !21, isLocal: true, isDefinition: true, scopeLine: 42, isOptimized: false, unit: !0, retainedNodes: !26)
+; CHECK-NEXT: !20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !21, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !26)
 ; CHECK-NEXT: !21 = !DISubroutineType(types: !22)
 ; CHECK-NEXT: !22 = !{!11, !11, !23}
 ; CHECK-NEXT: !23 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, size: 640, flags: DIFlagVector, elements: !24)
diff --git a/test/Bitcode/DISubprogram-distinct-definitions.ll b/test/Bitcode/DISubprogram-distinct-definitions.ll
index bcb9a4ec4da6670f9d72b4e2647fa316afd763b0..9d141d52de1c8d7306806633e504242309379ed9 100644
--- a/test/Bitcode/DISubprogram-distinct-definitions.ll
+++ b/test/Bitcode/DISubprogram-distinct-definitions.ll
@@ -9,6 +9,6 @@ define void @f() !dbg !3 { ret void }
 !0 = distinct !DICompileUnit(language: 12, file: !1, subprograms: !{!3})
 !1 = !DIFile(filename: "path/to/file", directory: "/path/to/dir")
 
-; CHECK: = distinct !DISubprogram({{.*}}, isDefinition: true
+; CHECK: = distinct !DISubprogram({{.*}} DISPFlagDefinition
 !3 = !DISubprogram(name: "foo", isDefinition: true)
 !4 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/test/Bitcode/DISubprogram-v4.ll b/test/Bitcode/DISubprogram-v4.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ce5521002cafb989d606a7f152748a8d517f67a1
--- /dev/null
+++ b/test/Bitcode/DISubprogram-v4.ll
@@ -0,0 +1,86 @@
+; The .bc file was generated from this source using llvm-as from r347766.
+; A 7.0 release version should work to recreate it if necessary.
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; CHECK: define void @_Z3foov() !dbg !9
+define void @_Z3foov() !dbg !9 {
+  ret void
+}
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18}
+
+!0 = !{null}
+!1 = distinct !DICompositeType(tag: DW_TAG_structure_type)
+!2 = !DIFile(filename: "path/to/file", directory: "/path/to/dir")
+!3 = !DISubroutineType(types: !0)
+!4 = distinct !DICompositeType(tag: DW_TAG_structure_type)
+!5 = distinct !{}
+!6 = distinct !{}
+
+; CHECK: !7 = distinct !DISubprogram(scope: null, spFlags: DISPFlagDefinition, unit: !8)
+!7 = distinct !DISubprogram(unit: !8)
+
+!8 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
+                             file: !2, isOptimized: true, flags: "-O2")
+
+; CHECK: !9 = !DISubprogram(scope: null, spFlags: 0)
+!9 = !DISubprogram(isDefinition: false)
+
+; CHECK: !10 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagPureVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized,
+!10 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: true, isDefinition: true, isOptimized: true,
+							virtuality: DW_VIRTUALITY_pure_virtual,
+                            unit: !8)
+
+; CHECK: !11 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized,
+!11 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: true, isDefinition: true, isOptimized: true,
+							virtuality: DW_VIRTUALITY_virtual,
+                            unit: !8)
+
+; CHECK: !12 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized,
+!12 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: true, isDefinition: true, isOptimized: true,
+							virtuality: DW_VIRTUALITY_none,
+                            unit: !8)
+
+; CHECK: !13 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagDefinition | DISPFlagOptimized,
+!13 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: false, isDefinition: true, isOptimized: true,
+                            unit: !8)
+
+; CHECK: !14 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized)
+!14 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: true, isDefinition: false, isOptimized: true)
+
+; CHECK: !15 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition,
+!15 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: true, isDefinition: true, isOptimized: false,
+                            unit: !8)
+
+; CHECK: !16 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagLocalToUnit)
+!16 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: true, isDefinition: false, isOptimized: false)
+
+; CHECK: !17 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagDefinition,
+!17 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: false, isDefinition: true, isOptimized: false,
+                            unit: !8)
+
+; CHECK: !18 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagOptimized)
+!18 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
+                            file: !2, line: 7, type: !3, containingType: !4,
+							isLocal: false, isDefinition: false, isOptimized: true)
+
+!19 = !{i32 1, !"Debug Info Version", i32 3}
+!llvm.module.flags = !{!19}
+!llvm.dbg.cu = !{!8}
diff --git a/test/Bitcode/DISubprogram-v4.ll.bc b/test/Bitcode/DISubprogram-v4.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..392df4aae74d3fb0df0c2196191496a4eb4045dd
Binary files /dev/null and b/test/Bitcode/DISubprogram-v4.ll.bc differ
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index 8c0471a1134efd85165254b48e9bdb63ad994b8f..1d57f759391af70c5daa041a3ab660ed1b2293d5 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -762,7 +762,27 @@ define void @atomics(i32* %word) {
 }
 
 ;; Fast Math Flags
-define void @fastmathflags(float %op1, float %op2) {
+define void @fastmathflags_unop(float %op1) {
+  %f.nnan = fneg nnan float %op1
+  ; CHECK: %f.nnan = fneg nnan float %op1
+  %f.ninf = fneg ninf float %op1
+  ; CHECK: %f.ninf = fneg ninf float %op1
+  %f.nsz = fneg nsz float %op1
+  ; CHECK: %f.nsz = fneg nsz float %op1
+  %f.arcp = fneg arcp float %op1
+  ; CHECK: %f.arcp = fneg arcp float %op1
+  %f.contract = fneg contract float %op1
+  ; CHECK: %f.contract = fneg contract float %op1
+  %f.afn = fneg afn float %op1
+  ; CHECK: %f.afn = fneg afn float %op1
+  %f.reassoc = fneg reassoc float %op1
+  ; CHECK: %f.reassoc = fneg reassoc float %op1
+  %f.fast = fneg fast float %op1
+  ; CHECK: %f.fast = fneg fast float %op1
+  ret void
+}
+
+define void @fastmathflags_binops(float %op1, float %op2) {
   %f.nnan = fadd nnan float %op1, %op2
   ; CHECK: %f.nnan = fadd nnan float %op1, %op2
   %f.ninf = fadd ninf float %op1, %op2
@@ -997,6 +1017,13 @@ continue:
   ret i32 0
 }
 
+; Instructions -- Unary Operations
+define void @instructions.unops(double %op1) {
+  fneg double %op1
+  ; CHECK: fneg double %op1
+  ret void
+}
+
 ; Instructions -- Binary Operations
 define void @instructions.binops(i8 %op1, i8 %op2) {
   ; nuw x nsw
diff --git a/test/Bitcode/function-encoding-rel-operands.ll b/test/Bitcode/function-encoding-rel-operands.ll
index 1307dd4833770c3b5cc3816a605ec5c34760f720..9486e04a0d5aa856547affb94cc0f783c5c9027d 100644
--- a/test/Bitcode/function-encoding-rel-operands.ll
+++ b/test/Bitcode/function-encoding-rel-operands.ll
@@ -3,6 +3,15 @@
 ; RUN: llvm-as < %s | llvm-bcanalyzer -dump | FileCheck %s
 ; RUN: verify-uselistorder < %s
 
+; CHECK: FUNCTION_BLOCK
+; CHECK: INST_UNOP {{.*}}op0=1
+; CHECK: INST_RET {{.*}}op0=1
+define double @test_float_unops(double %a) nounwind {
+  %1 = fneg double %a
+  ret double %1
+}
+
+
 ; CHECK: FUNCTION_BLOCK
 ; CHECK: INST_BINOP {{.*}}op0=1 op1=1
 ; CHECK: INST_BINOP {{.*}}op0=1 op1=1
diff --git a/test/Bitcode/summary_version.ll b/test/Bitcode/summary_version.ll
index b285da7a6f4e6dcd74f06d53a6931676680be278..fc3b3bd4877718f6cda3f2c1575fd9b12373119a 100644
--- a/test/Bitcode/summary_version.ll
+++ b/test/Bitcode/summary_version.ll
@@ -2,7 +2,7 @@
 ; RUN: opt  -module-summary  %s -o - | llvm-bcanalyzer -dump | FileCheck %s
 
 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK
-; CHECK: <VERSION op0=4/>
+; CHECK: <VERSION op0=6/>
 
 
 
diff --git a/test/Bitcode/thinlto-alias.ll b/test/Bitcode/thinlto-alias.ll
index 05de932faeebed7a5a2136520dcff4891854dd46..835d720c69e5d76564c025cca1f31e235b81e034 100644
--- a/test/Bitcode/thinlto-alias.ll
+++ b/test/Bitcode/thinlto-alias.ll
@@ -20,7 +20,7 @@
 ; CHECK-NEXT:    <VERSION
 ; See if the call to func is registered.
 ; The value id 1 matches the second FUNCTION record above.
-; CHECK-NEXT:    <PERMODULE {{.*}} op5=1/>
+; CHECK-NEXT:    <PERMODULE {{.*}} op6=1/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
@@ -33,7 +33,7 @@
 ; COMBINED-NEXT:    <VALUE_GUID op0=[[ALIASID:[0-9]+]] op1=-5751648690987223394/>
 ; COMBINED-NEXT:    <VALUE_GUID
 ; COMBINED-NEXT:    <VALUE_GUID op0=[[ALIASEEID:[0-9]+]] op1=-1039159065113703048/>
-; COMBINED-NEXT:    <COMBINED {{.*}} op6=[[ALIASID]]/>
+; COMBINED-NEXT:    <COMBINED {{.*}} op8=[[ALIASID]]/>
 ; COMBINED-NEXT:    <COMBINED {{.*}}
 ; COMBINED-NEXT:    <COMBINED_ALIAS  {{.*}} op3=[[ALIASEEID]]
 ; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK
diff --git a/test/Bitcode/thinlto-alias2.ll b/test/Bitcode/thinlto-alias2.ll
index 90e886570d8f5b6f7876de85b0319b939e48ed96..3d68e3fd4ba6393070f740e1c469b5a13ce173e1 100644
--- a/test/Bitcode/thinlto-alias2.ll
+++ b/test/Bitcode/thinlto-alias2.ll
@@ -4,7 +4,7 @@
 
 ; CHECK:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
-; CHECK-NEXT:    <PERMODULE {{.*}} op4=0 op5=[[ALIASID:[0-9]+]]/>
+; CHECK-NEXT:    <PERMODULE {{.*}} op4=0 op5=0 op6=[[ALIASID:[0-9]+]]/>
 ; CHECK-NEXT:    <PERMODULE {{.*}} op0=[[ALIASEEID:[0-9]+]]
 ; CHECK-NEXT:    <ALIAS {{.*}} op0=[[ALIASID]] {{.*}} op2=[[ALIASEEID]]/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-cast.ll b/test/Bitcode/thinlto-function-summary-callgraph-cast.ll
index 45801c9a74dd20a9961c477f5840c441a2ceda12..79644403d38cf4edcabc29a52e06d96d8a452194 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-cast.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-cast.ll
@@ -6,9 +6,9 @@
 ; CHECK:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; "op7" is a call to "callee" function.
-; CHECK-NEXT:    <PERMODULE {{.*}} op7=3 op8=[[ALIASID:[0-9]+]]/>
+; CHECK-NEXT:    <PERMODULE {{.*}} op8=3 op9=[[ALIASID:[0-9]+]]/>
 ; "another_caller" has only references but no calls.
-; CHECK-NEXT:    <PERMODULE {{.*}} op4=3 {{.*}} op7={{[0-9]+}}/>
+; CHECK-NEXT:    <PERMODULE {{.*}} op4=3 {{.*}} op8={{[0-9]+}}/>
 ; CHECK-NEXT:    <PERMODULE {{.*}} op0=[[ALIASEEID:[0-9]+]]
 ; CHECK-NEXT:    <ALIAS {{.*}} op0=[[ALIASID]] {{.*}} op2=[[ALIASEEID]]/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll b/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
index bb3e8e978356b9d50868c88d572eaf2fb4d47deb..e332224343e96b52ff8d0e94396ef1065e469565 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
@@ -17,7 +17,7 @@
 ; CHECK:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; See if the call to func is registered, using the expected hotness type.
-; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op5=1 op6=2/>
+; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op6=1 op7=2/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 ; CHECK: <STRTAB_BLOCK
 ; CHECK-NEXT: blob data = 'mainfunc{{.*}}'
@@ -30,7 +30,7 @@
 ; COMBINED-NEXT:    <COMBINED
 ; See if the call to func is registered, using the expected hotness type.
 ; op6=2 which is hotnessType::None.
-; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op6=[[FUNCID]] op7=2/>
+; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op8=[[FUNCID]] op9=2/>
 ; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; ModuleID = 'thinlto-function-summary-callgraph.ll'
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
index fc3c5c90ab97a96f57a1446d50034e85654beb43..31c99c189ac014ae85e6bd850b9421b2d4ddef36 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
@@ -48,7 +48,7 @@
 ; CHECK-NEXT:    <VERSION
 ; CHECK-NEXT:    <VALUE_GUID op0=25 op1=123/>
 ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
-; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op5=1 op6=3 op7=5 op8=1 op9=2 op10=3 op11=4 op12=1 op13=6 op14=2 op15=3 op16=3 op17=7 op18=2 op19=8 op20=2 op21=25 op22=4/>
+; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op6=1 op7=3 op8=5 op9=1 op10=2 op11=3 op12=4 op13=1 op14=6 op15=2 op16=3 op17=3 op18=7 op19=2 op20=8 op21=2 op22=25 op23=4/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
@@ -71,7 +71,7 @@
 ; COMBINED-NEXT:    <COMBINED abbrevid=
 ; COMBINED-NEXT:    <COMBINED abbrevid=
 ; COMBINED-NEXT:    <COMBINED abbrevid=
-; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op6=[[HOT1:.*]] op7=3 op8=[[COLD:.*]] op9=1 op10=[[HOT2:.*]] op11=3 op12=[[NONE1:.*]] op13=2 op14=[[HOT3:.*]] op15=3 op16=[[NONE2:.*]] op17=2 op18=[[NONE3:.*]] op19=2/>
+; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op8=[[HOT1:.*]] op9=3 op10=[[COLD:.*]] op11=1 op12=[[HOT2:.*]] op13=3 op14=[[NONE1:.*]] op15=2 op16=[[HOT3:.*]] op17=3 op18=[[NONE2:.*]] op19=2 op20=[[NONE3:.*]] op21=2/>
 ; COMBINED_NEXT:    <COMBINED abbrevid=
 ; COMBINED_NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll b/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll
index d84517865a8ccbbe060408da64d6ee76e3881116..6c14465222569d449442e225b0c922c95d740e7c 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll
@@ -13,7 +13,7 @@
 ; CHECK:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; See if the call to func is registered.
-; CHECK-NEXT:    <PERMODULE_RELBF {{.*}} op4=1 {{.*}} op7=256
+; CHECK-NEXT:    <PERMODULE_RELBF {{.*}} op4=1 {{.*}} op8=256
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 ; CHECK: <STRTAB_BLOCK
 ; CHECK-NEXT: blob data = 'undefinedglobmainfunc{{.*}}'
@@ -38,5 +38,5 @@ declare void @func(...) #1
 
 ; DIS: ^0 = module: (path: "{{.*}}", hash: (0, 0, 0, 0, 0))
 ; DIS: ^1 = gv: (name: "func") ; guid = 7289175272376759421
-; DIS: ^2 = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 3, calls: ((callee: ^1, relbf: 256)), refs: (^3)))) ; guid = 15822663052811949562
+; DIS: ^2 = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 3, calls: ((callee: ^1, relbf: 256)), refs: (readonly ^3)))) ; guid = 15822663052811949562
 ; DIS: ^3 = gv: (name: "undefinedglob") ; guid = 18036901804029949403
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
index 8793a08d3bfbcde8a6f50b4f86433cce00956cff..d1f980ab5f6a7f864ab03af031c94904d73d1f10 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
@@ -31,7 +31,7 @@
 ; CHECK-NEXT:    <VERSION
 ; CHECK-NEXT:    <VALUE_GUID op0=26 op1=123/>
 ; op4=none1 op6=hot1 op8=cold1 op10=none2 op12=hot2 op14=cold2 op16=none3 op18=hot3 op20=cold3 op22=123
-; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op5=7 op6=0 op7=1 op8=3 op9=4 op10=1 op11=8 op12=0 op13=2 op14=3 op15=5 op16=1 op17=9 op18=0 op19=3 op20=3 op21=6 op22=1 op23=26 op24=4/>
+; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op6=7 op7=0 op8=1 op9=3 op10=4 op11=1 op12=8 op13=0 op14=2 op15=3 op16=5 op17=1 op18=9 op19=0 op20=3 op21=3 op22=6 op23=1 op24=26 op25=4/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
@@ -58,7 +58,7 @@
 ; COMBINED-NEXT:    <COMBINED abbrevid=
 ; COMBINED-NEXT:    <COMBINED abbrevid=
 ; COMBINED-NEXT:    <COMBINED abbrevid=
-; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op6=[[NONE1:.*]] op7=0 op8=[[HOT1:.*]] op9=3 op10=[[COLD1:.*]] op11=1 op12=[[NONE2:.*]] op13=0 op14=[[HOT2:.*]] op15=3 op16=[[COLD2:.*]] op17=1 op18=[[NONE3:.*]] op19=0 op20=[[HOT3:.*]] op21=3 op22=[[COLD3:.*]] op23=1/>
+; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op8=[[NONE1:.*]] op9=0 op10=[[HOT1:.*]] op11=3 op12=[[COLD1:.*]] op13=1 op14=[[NONE2:.*]] op15=0 op16=[[HOT2:.*]] op17=3 op18=[[COLD2:.*]] op19=1 op20=[[NONE3:.*]] op21=0 op22=[[HOT3:.*]] op23=3 op24=[[COLD3:.*]] op25=1/>
 ; COMBINED_NEXT:    <COMBINED abbrevid=
 ; COMBINED_NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
diff --git a/test/Bitcode/thinlto-function-summary-callgraph.ll b/test/Bitcode/thinlto-function-summary-callgraph.ll
index 8025eee5929ba9a6770c3aaf41ece19d584c21ae..a605b7ec2219b8c79bf73980057b71b3e5b066a9 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph.ll
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: <FUNCTION op0=17 op1=4
 ; CHECK:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
-; See if the call to func is registered.
+; See if the call to func is registered
 ; CHECK-NEXT:    <PERMODULE {{.*}} op4=1
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 ; CHECK: <STRTAB_BLOCK
@@ -33,7 +33,7 @@
 ; COMBINED-NEXT:    <VALUE_GUID
 ; COMBINED-NEXT:    <COMBINED
 ; See if the call to func is registered.
-; COMBINED-NEXT:    <COMBINED {{.*}} op6=[[FUNCID]]/>
+; COMBINED-NEXT:    <COMBINED {{.*}} op8=[[FUNCID]]/>
 ; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; ModuleID = 'thinlto-function-summary-callgraph.ll'
diff --git a/test/Bitcode/thinlto-function-summary-refgraph.ll b/test/Bitcode/thinlto-function-summary-refgraph.ll
index 848598fa6860477a2d4ee8fdaf175920f402cb16..e8a2a770f61df0550b597a8352050a550ce96c1a 100644
--- a/test/Bitcode/thinlto-function-summary-refgraph.ll
+++ b/test/Bitcode/thinlto-function-summary-refgraph.ll
@@ -41,27 +41,27 @@
 ; CHECK:       <GLOBALVAL_SUMMARY_BLOCK
 ; Function main contains call to func, as well as address reference to func:
 ; op0=main op4=func op5=func
-; CHECK-DAG:    <PERMODULE {{.*}} op0=11 op1=0 {{.*}} op4=1 op5=2 op6=2/>
+; CHECK-DAG:    <PERMODULE {{.*}} op0=11 op1=0 {{.*}} op4=1 op5=0 op6=2 op7=2/>
 ; Function W contains a call to func3 as well as a reference to globalvar:
 ; op0=W op4=globalvar op5=func3
-; CHECK-DAG:    <PERMODULE {{.*}} op0=6 op1=5 {{.*}} op4=1 op5=1 op6=5/>
+; CHECK-DAG:    <PERMODULE {{.*}} op0=6 op1=5 {{.*}} op4=1 op5=0 op6=1 op7=5/>
 ; Function X contains call to foo, as well as address reference to foo
 ; which is in the same instruction as the call:
 ; op0=X op4=foo op5=foo
-; CHECK-DAG:    <PERMODULE {{.*}} op0=7 op1=1 {{.*}} op4=1 op5=4 op6=4/>
+; CHECK-DAG:    <PERMODULE {{.*}} op0=7 op1=1 {{.*}} op4=1 op5=0 op6=4 op7=4/>
 ; Function Y contains call to func2, and ensures we don't incorrectly add
 ; a reference to it when reached while earlier analyzing the phi using its
 ; return value:
 ; op0=Y op4=func2
-; CHECK-DAG:    <PERMODULE {{.*}} op0=8 op1=72 {{.*}} op4=0 op5=3/>
+; CHECK-DAG:    <PERMODULE {{.*}} op0=8 op1=72 {{.*}} op4=0 op5=0 op6=3/>
 ; Function Z contains call to func2, and ensures we don't incorrectly add
 ; a reference to it when reached while analyzing subsequent use of its return
 ; value:
 ; op0=Z op4=func2
-; CHECK-DAG:    <PERMODULE {{.*}} op0=9 op1=3 {{.*}} op4=0 op5=3/>
+; CHECK-DAG:    <PERMODULE {{.*}} op0=9 op1=3 {{.*}} op4=0 op5=0 op6=3/>
 ; Variable bar initialization contains address reference to func:
 ; op0=bar op2=func
-; CHECK-DAG:    <PERMODULE_GLOBALVAR_INIT_REFS {{.*}} op0=0 op1=0 op2=2/>
+; CHECK-DAG:    <PERMODULE_GLOBALVAR_INIT_REFS {{.*}} op0=0 op1=0 op2=1 op3=2/>
 ; CHECK:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
@@ -154,11 +154,11 @@ entry:
 ; DIS-DAG: = gv: (name: "foo") ; guid = 6699318081062747564
 ; DIS-DAG: = gv: (name: "func") ; guid = 7289175272376759421
 ; DIS-DAG: = gv: (name: "func3") ; guid = 11517462787082255043
-; DIS-DAG: = gv: (name: "globalvar", summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0)))) ; guid = 12887606300320728018
+; DIS-DAG: = gv: (name: "globalvar", summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 1)))) ; guid = 12887606300320728018
 ; DIS-DAG: = gv: (name: "func2") ; guid = 14069196320850861797
 ; DIS-DAG: = gv: (name: "llvm.ctpop.i8") ; guid = 15254915475081819833
 ; DIS-DAG: = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 9, calls: ((callee: ^{{.*}})), refs: (^{{.*}})))) ; guid = 15822663052811949562
-; DIS-DAG: = gv: (name: "bar", summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), refs: (^{{.*}})))) ; guid = 16434608426314478903
+; DIS-DAG: = gv: (name: "bar", summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 1),  refs: (^{{.*}})))) ; guid = 16434608426314478903
 ; Don't try to match the exact GUID. Since it is private, the file path
 ; will get hashed, and that will be test dependent.
 ; DIS-DAG: = gv: (name: "Y", summaries: (function: (module: ^0, flags: (linkage: private, notEligibleToImport: 0, live: 0, dsoLocal: 1), insts: 14, calls: ((callee: ^{{.*}}))))) ; guid =
diff --git a/test/Bitcode/thinlto-function-summary.ll b/test/Bitcode/thinlto-function-summary.ll
index 7f59eeabd9c42e97d9ddddaa7634e7c10b54b063..be7e9749086b06337d34ee86909011316e315b0b 100644
--- a/test/Bitcode/thinlto-function-summary.ll
+++ b/test/Bitcode/thinlto-function-summary.ll
@@ -13,18 +13,20 @@
 ; BC-NEXT: <FUNCTION op0=7 op1=39
 ; "variadic"
 ; BC-NEXT: <FUNCTION op0=46 op1=8
+; "llvm.va_start"
+; BC-NEXT: <FUNCTION op0=54 op1=13
 ; "f"
-; BC-NEXT: <ALIAS op0=54 op1=1
+; BC-NEXT: <ALIAS op0=67 op1=1
 ; BC: <GLOBALVAL_SUMMARY_BLOCK
 ; BC-NEXT: <VERSION
 ; BC-NEXT: <PERMODULE {{.*}} op0=1 op1=0
 ; BC-NEXT: <PERMODULE {{.*}} op0=2 op1=0
 ; BC-NEXT: <PERMODULE {{.*}} op0=3 op1=7
-; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=0 op2=1 op3=16
-; BC-NEXT: <ALIAS {{.*}} op0=5 op1=0 op2=3
+; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=0 op2=4 op3=0
+; BC-NEXT: <ALIAS {{.*}} op0=6 op1=0 op2=3
 ; BC-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 ; BC: <STRTAB_BLOCK
-; BC-NEXT: blob data = 'hfoobaranon.{{................................}}.0variadicf{{.*}}'
+; BC-NEXT: blob data = 'hfoobaranon.{{................................}}.0variadicllvm.va_startf{{.*}}'
 
 
 ; RUN: opt -name-anon-globals -module-summary < %s | llvm-dis | FileCheck %s
@@ -68,5 +70,10 @@ return:         ; preds = %entry
 }
 
 define i32 @variadic(...) {
+    %ap = alloca i8*, align 8
+    %ap.0 = bitcast i8** %ap to i8*
+    call void @llvm.va_start(i8* %ap.0)
     ret i32 42
 }
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/Bitcode/thinlto-synthetic-count-flag.ll b/test/Bitcode/thinlto-synthetic-count-flag.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eb18a025b94ac60ceace1f7ac150da8d1925e30a
--- /dev/null
+++ b/test/Bitcode/thinlto-synthetic-count-flag.ll
@@ -0,0 +1,21 @@
+; REQUIRES: x86-registered-target
+; RUN: opt -module-summary %s -o %t.o
+
+; Ensure synthetic entry count flag is not set on distributed index
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN:		-r %t.o,glob,plx -compute-dead=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NOSYNTHETIC
+; NOSYNTHETIC: <FLAGS op0=0/>
+
+; Ensure synthetic entry count flag is set on distributed index
+; when option used to enable synthetic count propagation
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN:		-r %t.o,glob,plx -thinlto-synthesize-entry-counts \
+; RUN:          -compute-dead=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=HASSYNTHETIC
+; HASSYNTHETIC: <FLAGS op0=4/>
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@glob = global i32 0
diff --git a/test/BugPoint/compile-custom.ll b/test/BugPoint/compile-custom.ll
index cb82a01383e1120ee291a5c7f96810be092779ad..847d1184f016b6360355f59527dc87de89da46bb 100644
--- a/test/BugPoint/compile-custom.ll
+++ b/test/BugPoint/compile-custom.ll
@@ -1,4 +1,4 @@
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%/s.py arg1 arg2" --opt-command opt --output-prefix %t %s | FileCheck %s
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%python %/s.py arg1 arg2" --output-prefix %t %s | FileCheck %s
 ; REQUIRES: loadable_module
 
 ; Test that arguments are correctly passed in --compile-command.  The output
diff --git a/test/BugPoint/crash-narrowfunctiontest.ll b/test/BugPoint/crash-narrowfunctiontest.ll
index ef78eb2b0232fb30531f6ac86807d10711e71917..d080d9dd4b0cac54d64b4fe42e9853ebddc2c2e2 100644
--- a/test/BugPoint/crash-narrowfunctiontest.ll
+++ b/test/BugPoint/crash-narrowfunctiontest.ll
@@ -1,6 +1,6 @@
 ; Test that bugpoint can narrow down the testcase to the important function
 ;
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls --opt-command opt -silence-passes > /dev/null
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null
 ; REQUIRES: loadable_module
 
 define i32 @foo() { ret i32 1 }
diff --git a/test/BugPoint/func-attrs-keyval.ll b/test/BugPoint/func-attrs-keyval.ll
new file mode 100644
index 0000000000000000000000000000000000000000..830d0964cde1a3523dcc6b8bea852268fb59b0f8
--- /dev/null
+++ b/test/BugPoint/func-attrs-keyval.ll
@@ -0,0 +1,11 @@
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashfuncattr -silence-passes
+; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
+; REQUIRES: loadable_module
+
+; CHECK: f() #[[ATTRS:[0-9]+]]
+define void @f() #0 {
+  ret void
+}
+
+; CHECK: attributes #[[ATTRS]] = { "bugpoint-crash"="sure" }
+attributes #0 = { "bugpoint-crash"="sure" noreturn "no-frame-pointer-elim-non-leaf" }
diff --git a/test/BugPoint/func-attrs.ll b/test/BugPoint/func-attrs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3941e73c217c83b882c8a06f543518364ea2824c
--- /dev/null
+++ b/test/BugPoint/func-attrs.ll
@@ -0,0 +1,11 @@
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashfuncattr -silence-passes
+; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
+; REQUIRES: loadable_module
+
+; CHECK: f() #[[ATTRS:[0-9]+]]
+define void @f() #0 {
+  ret void
+}
+
+; CHECK: attributes #[[ATTRS]] = { "bugpoint-crash" }
+attributes #0 = { noinline "bugpoint-crash" "no-frame-pointer-elim-non-leaf" }
diff --git a/test/BugPoint/invalid-debuginfo.ll b/test/BugPoint/invalid-debuginfo.ll
index f703667982c02a79a04c8b227023ea89fa796033..2005a13b67578de17d186fd7f4163c21bf3cec91 100644
--- a/test/BugPoint/invalid-debuginfo.ll
+++ b/test/BugPoint/invalid-debuginfo.ll
@@ -1,4 +1,4 @@
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes --opt-command opt 2>&1 | FileCheck %s
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes 2>&1 | FileCheck %s
 ; REQUIRES: loadable_module
 ; CHECK: DICompileUnit not listed in llvm.dbg.cu
 
diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll
index e5986494e16feb0184b8dea64d256b417efe316f..ac77b9e5a7d1772f6418dfae461a956418315fad 100644
--- a/test/BugPoint/metadata.ll
+++ b/test/BugPoint/metadata.ll
@@ -1,11 +1,11 @@
 ; REQUIRES: loadable_module
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes -disable-namedmd-remove -disable-strip-debuginfo -disable-strip-debug-types --opt-command opt > /dev/null
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes -disable-namedmd-remove -disable-strip-debuginfo -disable-strip-debug-types > /dev/null
 ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
 ;
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t-nodebug -bugpoint-crashcalls -silence-passes -disable-namedmd-remove --opt-command opt > /dev/null
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t-nodebug -bugpoint-crashcalls -silence-passes -disable-namedmd-remove > /dev/null
 ; RUN: llvm-dis %t-nodebug-reduced-simplified.bc -o - | FileCheck %s --check-prefix=NODEBUG
 ;
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t-notype -bugpoint-crashcalls -silence-passes -disable-namedmd-remove -disable-strip-debuginfo --opt-command opt > /dev/null
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t-notype -bugpoint-crashcalls -silence-passes -disable-namedmd-remove -disable-strip-debuginfo > /dev/null
 ; RUN: llvm-dis %t-notype-reduced-simplified.bc -o - | FileCheck %s --check-prefix=NOTYPE
 ;
 ; Bugpoint should keep the call's metadata attached to the call.
diff --git a/test/BugPoint/named-md.ll b/test/BugPoint/named-md.ll
index 21b65af6f73eb697e22d8b13cf3c9f8eea3438e5..1ed34435fbebba40ac8b3c3c59bdf934c86cd7e4 100644
--- a/test/BugPoint/named-md.ll
+++ b/test/BugPoint/named-md.ll
@@ -1,4 +1,4 @@
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes -disable-strip-debuginfo --opt-command opt > /dev/null
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes -disable-strip-debuginfo > /dev/null
 ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
 ; RUN-DISABLE: bugpoint -disable-namedmd-remove -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes > /dev/null
 ; RUN-DISABLE: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
diff --git a/test/BugPoint/remove_arguments_test.ll b/test/BugPoint/remove_arguments_test.ll
index d5967675ef868ae77af27eac060285f83b0c2609..72be4fe559363d61da57e2a76f20d9114136880b 100644
--- a/test/BugPoint/remove_arguments_test.ll
+++ b/test/BugPoint/remove_arguments_test.ll
@@ -1,4 +1,4 @@
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes --opt-command opt
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes
 ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
 ; REQUIRES: loadable_module
 
diff --git a/test/BugPoint/replace-funcs-with-null.ll b/test/BugPoint/replace-funcs-with-null.ll
index b6976c9a2bbb77fe12d74cfa19c2709cc29224f2..622f9eb67a29faa7ee22256804f678f8ef5e9f2d 100644
--- a/test/BugPoint/replace-funcs-with-null.ll
+++ b/test/BugPoint/replace-funcs-with-null.ll
@@ -1,6 +1,6 @@
 ; Test that bugpoint can reduce the set of functions by replacing them with null.
 ;
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -replace-funcs-with-null -bugpoint-crash-decl-funcs -silence-passes -safe-run-llc --opt-command opt
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -replace-funcs-with-null -bugpoint-crash-decl-funcs -silence-passes -safe-run-llc
 ; REQUIRES: loadable_module
 
 @foo2 = alias i32 (), i32 ()* @foo
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll
index 62abf3d81d58009ef1eaf4e1a5f533fcfa23f47b..ad01264f1a43f19362aec25d92e7682a25feb39c 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll
@@ -3,6 +3,9 @@
 
 ; CHECK: name: test_stack_guard
 
+; CHECK: frameInfo:
+; CHECK: stackProtector:  '%stack.0.StackGuardSlot'
+
 ; CHECK: stack:
 ; CHECK:  - { id: 0, name: StackGuardSlot,  type: default, offset: 0, size: 8, alignment: 8,
 ; CHECK-NOT: id: 1
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 2997c5350ebc1cce4dfee1d22f3685d8dc1184a6..74de033447fcf5cc4ba95aea45da8cc81213a8f4 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1173,12 +1173,12 @@ define void @test_constant_float(float* %addr) {
 ; CHECK: [[BOOLADDR:%[0-9]+]]:_(p0) = COPY $x2
 ; CHECK: [[LHS:%[0-9]+]]:_(s32) = G_LOAD [[LHSADDR]](p0)
 ; CHECK: [[RHS:%[0-9]+]]:_(s32) = G_LOAD [[RHSADDR]](p0)
-; CHECK: [[TST:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[LHS]](s32), [[RHS]]
+; CHECK: [[TST:%[0-9]+]]:_(s1) = nnan ninf nsz arcp contract afn reassoc G_FCMP floatpred(oge), [[LHS]](s32), [[RHS]]
 ; CHECK: G_STORE [[TST]](s1), [[BOOLADDR]](p0)
 define void @float_comparison(float* %a.addr, float* %b.addr, i1* %bool.addr) {
   %a = load float, float* %a.addr
   %b = load float, float* %b.addr
-  %res = fcmp oge float %a, %b
+  %res = fcmp nnan ninf nsz arcp contract afn reassoc oge float %a, %b
   store i1 %res, i1* %bool.addr
   ret void
 }
@@ -1338,9 +1338,9 @@ define float @test_pow_intrin(float %l, float %r) {
 ; CHECK-LABEL: name: test_pow_intrin
 ; CHECK: [[LHS:%[0-9]+]]:_(s32) = COPY $s0
 ; CHECK: [[RHS:%[0-9]+]]:_(s32) = COPY $s1
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FPOW [[LHS]], [[RHS]]
+; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FPOW [[LHS]], [[RHS]]
 ; CHECK: $s0 = COPY [[RES]]
-  %res = call float @llvm.pow.f32(float %l, float %r)
+  %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.pow.f32(float %l, float %r)
   ret float %res
 }
 
@@ -1350,9 +1350,9 @@ define float @test_fma_intrin(float %a, float %b, float %c) {
 ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0
 ; CHECK: [[B:%[0-9]+]]:_(s32) = COPY $s1
 ; CHECK: [[C:%[0-9]+]]:_(s32) = COPY $s2
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FMA [[A]], [[B]], [[C]]
+; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FMA [[A]], [[B]], [[C]]
 ; CHECK: $s0 = COPY [[RES]]
-  %res = call float @llvm.fma.f32(float %a, float %b, float %c)
+  %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %res
 }
 
@@ -1360,9 +1360,9 @@ declare float @llvm.exp.f32(float)
 define float @test_exp_intrin(float %a) {
 ; CHECK-LABEL: name: test_exp_intrin
 ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FEXP [[A]]
+; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FEXP [[A]]
 ; CHECK: $s0 = COPY [[RES]]
-  %res = call float @llvm.exp.f32(float %a)
+  %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.exp.f32(float %a)
   ret float %res
 }
 
@@ -1370,9 +1370,9 @@ declare float @llvm.exp2.f32(float)
 define float @test_exp2_intrin(float %a) {
 ; CHECK-LABEL: name: test_exp2_intrin
 ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FEXP2 [[A]]
+; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FEXP2 [[A]]
 ; CHECK: $s0 = COPY [[RES]]
-  %res = call float @llvm.exp2.f32(float %a)
+  %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.exp2.f32(float %a)
   ret float %res
 }
 
@@ -1380,9 +1380,9 @@ declare float @llvm.log.f32(float)
 define float @test_log_intrin(float %a) {
 ; CHECK-LABEL: name: test_log_intrin
 ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FLOG [[A]]
+; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FLOG [[A]]
 ; CHECK: $s0 = COPY [[RES]]
-  %res = call float @llvm.log.f32(float %a)
+  %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.log.f32(float %a)
   ret float %res
 }
 
@@ -1396,13 +1396,23 @@ define float @test_log2_intrin(float %a) {
   ret float %res
 }
 
+declare float @llvm.log10.f32(float)
+define float @test_log10_intrin(float %a) {
+; CHECK-LABEL: name: test_log10_intrin
+; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0
+; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FLOG10 [[A]]
+; CHECK: $s0 = COPY [[RES]]
+  %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.log10.f32(float %a)
+  ret float %res
+}
+
 declare float @llvm.fabs.f32(float)
 define float @test_fabs_intrin(float %a) {
 ; CHECK-LABEL: name: test_fabs_intrin
 ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FABS [[A]]
+; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FABS [[A]]
 ; CHECK: $s0 = COPY [[RES]]
-  %res = call float @llvm.fabs.f32(float %a)
+  %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.fabs.f32(float %a)
   ret float %res
 }
 
@@ -1561,7 +1571,7 @@ define i32 @test_singleelementvector(i32 %elt){
 define <2 x i32> @test_constantaggzerovector_v2i32() {
 ; CHECK-LABEL: name: test_constantaggzerovector_v2i32
 ; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ZERO]](s32), [[ZERO]](s32)
 ; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
   ret <2 x i32> zeroinitializer
 }
@@ -1569,7 +1579,7 @@ define <2 x i32> @test_constantaggzerovector_v2i32() {
 define <2 x float> @test_constantaggzerovector_v2f32() {
 ; CHECK-LABEL: name: test_constantaggzerovector_v2f32
 ; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ZERO]](s32), [[ZERO]](s32)
 ; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
   ret <2 x float> zeroinitializer
 }
@@ -1577,7 +1587,7 @@ define <2 x float> @test_constantaggzerovector_v2f32() {
 define i32 @test_constantaggzerovector_v3i32() {
 ; CHECK-LABEL: name: test_constantaggzerovector_v3i32
 ; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32), [[ZERO]](s32)
+; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ZERO]](s32), [[ZERO]](s32), [[ZERO]](s32)
 ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>)
   %elt = extractelement <3 x i32> zeroinitializer, i32 1
   ret i32 %elt
@@ -1587,7 +1597,7 @@ define <2 x i32> @test_constantdatavector_v2i32() {
 ; CHECK-LABEL: name: test_constantdatavector_v2i32
 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32)
 ; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
   ret <2 x i32> <i32 1, i32 2>
 }
@@ -1597,7 +1607,7 @@ define i32 @test_constantdatavector_v3i32() {
 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
-; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32), [[C3]](s32)
+; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C3]](s32)
 ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>)
   %elt = extractelement <3 x i32> <i32 1, i32 2, i32 3>, i32 1
   ret i32 %elt
@@ -1609,7 +1619,7 @@ define <4 x i32> @test_constantdatavector_v4i32() {
 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
 ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-; CHECK: [[VEC:%[0-9]+]]:_(<4 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32)
+; CHECK: [[VEC:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32)
 ; CHECK: $q0 = COPY [[VEC]](<4 x s32>)
   ret <4 x i32> <i32 1, i32 2, i32 3, i32 4>
 }
@@ -1618,7 +1628,7 @@ define <2 x double> @test_constantdatavector_v2f64() {
 ; CHECK-LABEL: name: test_constantdatavector_v2f64
 ; CHECK: [[FC1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
 ; CHECK: [[FC2:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s64>) = G_MERGE_VALUES [[FC1]](s64), [[FC2]](s64)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FC1]](s64), [[FC2]](s64)
 ; CHECK: $q0 = COPY [[VEC]](<2 x s64>)
   ret <2 x double> <double 1.0, double 2.0>
 }
@@ -1662,7 +1672,7 @@ define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) {
 ; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $w0
 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32)
+; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C0]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], [[MASK]](<2 x s32>)
 ; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
   %vec = insertelement <1 x i32> undef, i32 %arg, i32 0
@@ -1688,7 +1698,7 @@ define <2 x i32> @test_shufflevector_v2s32_v2s32(<2 x i32> %arg) {
 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32)
+; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C0]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[MASK]](<2 x s32>)
 ; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
   %res = shufflevector <2 x i32> %arg, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
@@ -1701,7 +1711,7 @@ define i32 @test_shufflevector_v2s32_v3s32(<2 x i32> %arg) {
 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-; CHECK-DAG: [[MASK:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32), [[C1]](s32)
+; CHECK-DAG: [[MASK:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C0]](s32), [[C1]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[MASK]](<3 x s32>)
 ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>)
   %vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
@@ -1717,7 +1727,7 @@ define <4 x i32> @test_shufflevector_v2s32_v4s32(<2 x i32> %arg1, <2 x i32> %arg
 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
-; CHECK: [[MASK:%[0-9]+]]:_(<4 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32)
+; CHECK: [[MASK:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[ARG1]](<2 x s32>), [[ARG2]], [[MASK]](<4 x s32>)
 ; CHECK: $q0 = COPY [[VEC]](<4 x s32>)
   %res = shufflevector <2 x i32> %arg1, <2 x i32> %arg2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1730,7 +1740,7 @@ define <2 x i32> @test_shufflevector_v4s32_v2s32(<4 x i32> %arg) {
 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK-DAG: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
-; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C3]](s32)
+; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C3]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<4 x s32>), [[UNDEF]], [[MASK]](<2 x s32>)
 ; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
   %res = shufflevector <4 x i32> %arg, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
@@ -1758,7 +1768,7 @@ define <16 x i8> @test_shufflevector_v8s8_v16s8(<8 x i8> %arg1, <8 x i8> %arg2)
 ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
 ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
 ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
-; CHECK: [[MASK:%[0-9]+]]:_(<16 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C8]](s32), [[C1]](s32), [[C9]](s32), [[C2]](s32), [[C10]](s32), [[C3]](s32), [[C11]](s32), [[C4]](s32), [[C12]](s32), [[C5]](s32), [[C13]](s32), [[C6]](s32), [[C14]](s32), [[C7]](s32), [[C15]](s32)
+; CHECK: [[MASK:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C8]](s32), [[C1]](s32), [[C9]](s32), [[C2]](s32), [[C10]](s32), [[C3]](s32), [[C11]](s32), [[C4]](s32), [[C12]](s32), [[C5]](s32), [[C13]](s32), [[C6]](s32), [[C14]](s32), [[C7]](s32), [[C15]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[ARG1]](<8 x s8>), [[ARG2]], [[MASK]](<16 x s32>)
 ; CHECK: $q0 = COPY [[VEC]](<16 x s8>)
   %res = shufflevector <8 x i8> %arg1, <8 x i8> %arg2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1768,7 +1778,7 @@ define <16 x i8> @test_shufflevector_v8s8_v16s8(<8 x i8> %arg1, <8 x i8> %arg2)
 ; CHECK-LABEL: test_constant_vector
 ; CHECK: [[UNDEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
 ; CHECK: [[F:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
-; CHECK: [[M:%[0-9]+]]:_(<4 x s16>) = G_MERGE_VALUES [[UNDEF]](s16), [[UNDEF]](s16), [[UNDEF]](s16), [[F]](s16)
+; CHECK: [[M:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UNDEF]](s16), [[UNDEF]](s16), [[UNDEF]](s16), [[F]](s16)
 ; CHECK: $d0 = COPY [[M]](<4 x s16>)
 define <4 x half> @test_constant_vector() {
   ret <4 x half> <half undef, half undef, half undef, half 0xH3C00>
diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-stackprotect-check.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-stackprotect-check.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3559c8284a46ff3a9544dcb519db5d43722ae95f
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-stackprotect-check.ll
@@ -0,0 +1,50 @@
+; RUN: llc -O0 -stop-before=irtranslator -global-isel %s -o - | FileCheck %s
+; RUN: llc -O0 -stop-after=irtranslator -verify-machineinstrs -global-isel %s -o - | FileCheck --check-prefixes CHECK,CHECK-MIR %s
+
+; Check that when using GlobalISel, the StackProtector pass currently inserts
+; both prologue and epilogue instrumentation because GlobalISel does not have
+; the same epilogue insertion/optimization as SelectionDAG.
+
+target triple = "aarch64-none-unknown-eabi"
+
+define void @foo() ssp {
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %StackGuardSlot = alloca i8*
+; CHECK-NEXT:   %0 = call i8* @llvm.stackguard()
+; CHECK-NEXT:   call void @llvm.stackprotector(i8* %0, i8** %StackGuardSlot)
+; CHECK-NEXT:   %buf = alloca [8 x i8], align 1
+; CHECK-NEXT:   %1 = call i8* @llvm.stackguard()
+; CHECK-NEXT:   %2 = load volatile i8*, i8** %StackGuardSlot
+; CHECK-NEXT:   %3 = icmp eq i8* %1, %2
+; CHECK-NEXT:   br i1 %3, label %SP_return, label %CallStackCheckFailBlk, !prof !0
+;
+; CHECK: SP_return:
+; CHECK-NEXT:   ret void
+;
+; CHECK: CallStackCheckFailBlk:
+; CHECK-NEXT:   call void @__stack_chk_fail()
+; CHECK-NEXT:   unreachable
+
+; CHECK-MIR: bb.1.entry:
+; CHECK-MIR:   %0:_(p0) = G_FRAME_INDEX %stack.0.StackGuardSlot
+; CHECK-MIR-NEXT:   %1:gpr64sp(p0) = LOAD_STACK_GUARD :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+; CHECK-MIR-NEXT:   %2:gpr64sp(p0) = LOAD_STACK_GUARD :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+; CHECK-MIR-NEXT:   G_STORE %2(p0), %0(p0) :: (volatile store 8 into %stack.0.StackGuardSlot)
+; CHECK-MIR-NEXT:   %3:_(p0) = G_FRAME_INDEX %stack.1.buf
+; CHECK-MIR-NEXT:   %4:gpr64sp(p0) = LOAD_STACK_GUARD :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+; CHECK-MIR-NEXT:   %5:_(p0) = G_LOAD %0(p0) :: (volatile load 8 from %ir.StackGuardSlot)
+; CHECK-MIR-NEXT:   %6:_(s1) = G_ICMP intpred(eq), %4(p0), %5
+; CHECK-MIR-NEXT:   G_BRCOND %6(s1), %bb.2
+; CHECK-MIR-NEXT:   G_BR %bb.3
+;
+; CHECK-MIR: bb.2.SP_return:
+; CHECK-MIR-NEXT:   RET_ReallyLR
+;
+; CHECK-MIR: bb.3.CallStackCheckFailBlk:
+; CHECK-MIR-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+; CHECK-MIR-NEXT:   BL @__stack_chk_fail, csr_aarch64_aapcs, implicit-def $lr, implicit $sp
+; CHECK-MIR-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+entry:
+  %buf = alloca [8 x i8], align 1
+  ret void
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll
index 9bda39c9fca7fa5305b2686f5110d67e780db485..099a9e71e42dfa565aabbb48e6afef1a5f50e7bf 100644
--- a/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll
@@ -3,7 +3,7 @@
 @g = global i16 0, align 2
 declare void @bar(i32)
 
-; Check that only one load is generated. We fall back to
+; Check that only one load is generated for an extending volatile load.
 define hidden void @foo() {
 ; CHECK-NOT: ldrh
 ; CHECK: ldrsh
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
index fe6079c0db407e686c273d2e737d75c69dbb33e4..c3773f50cb0d0204a7a0bbbeec5d46637d490372 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
@@ -95,8 +95,8 @@ body:             |
     %1:_(<2 x s64>) = COPY $q1
     %2:_(<2 x s64>) = COPY $q2
     %3:_(<2 x s64>) = COPY $q3
-    %4:_(<4 x s64>) = G_MERGE_VALUES %0(<2 x s64>), %1(<2 x s64>)
-    %5:_(<4 x s64>) = G_MERGE_VALUES %2(<2 x s64>), %3(<2 x s64>)
+    %4:_(<4 x s64>) = G_CONCAT_VECTORS %0, %1
+    %5:_(<4 x s64>) = G_CONCAT_VECTORS %2, %3
     %6:_(<4 x s64>) = G_ADD %4, %5
     %7:_(<2 x s64>), %8:_(<2 x s64>) = G_UNMERGE_VALUES %6(<4 x s64>)
     $q0 = COPY %7(<2 x s64>)
@@ -122,12 +122,11 @@ body:             |
     %1:_(<2 x s64>) = COPY $q1
     %2:_(<2 x s64>) = COPY $q2
     %3:_(<2 x s64>) = COPY $q3
-    %4:_(<6 x s64>) = G_MERGE_VALUES %0(<2 x s64>), %1(<2 x s64>), %2(<2 x s64>)
-    %5:_(<6 x s64>) = G_MERGE_VALUES %1(<2 x s64>), %2(<2 x s64>), %3(<2 x s64>)
+    %4:_(<6 x s64>) = G_CONCAT_VECTORS %0(<2 x s64>), %1(<2 x s64>), %2(<2 x s64>)
+    %5:_(<6 x s64>) = G_CONCAT_VECTORS %1(<2 x s64>), %2(<2 x s64>), %3(<2 x s64>)
     %6:_(<6 x s64>) = G_ADD %4, %5
     %7:_(<2 x s64>), %8:_(<2 x s64>), %9:_(<2 x s64>) = G_UNMERGE_VALUES %6(<6 x s64>)
     $q0 = COPY %7(<2 x s64>)
     $q1 = COPY %8(<2 x s64>)
     $q2 = COPY %9(<2 x s64>)
-
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir b/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir
new file mode 100644
index 0000000000000000000000000000000000000000..226a469682e7a853eb4ffdb1a93def69cb353d62
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name:            legal_v4s32
+body: |
+  bb.0:
+    liveins: $w0, $w1, $w2, $w3
+    ; CHECK-LABEL: name: legal_v4s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK: $q0 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK: RET_ReallyLR
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %3:_(s32) = COPY $w3
+    %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR
+...
+---
+name:            legal_v2s64
+body: |
+  bb.0:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: legal_v2s64
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
+    ; CHECK: $q0 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; CHECK: RET_ReallyLR
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64)
+    $q0 = COPY %2(<2 x s64>)
+    RET_ReallyLR
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
index b1be33cbeb5c24939a241e407ba36e9f776554aa..a8926832e096ca586fb2a631d4197c955cbca50a 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
@@ -51,10 +51,8 @@ body:             |
     ; CHECK: $w0 = COPY [[ASHR2]](s32)
     ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK: [[TRUNC10:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
-    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC10]], [[COPY5]]
-    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND3]](s32)
-    ; CHECK: $w0 = COPY [[COPY6]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC10]], [[C6]]
+    ; CHECK: $w0 = COPY [[AND3]](s32)
     ; CHECK: [[TRUNC11:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
     ; CHECK: $w0 = COPY [[TRUNC11]](s32)
     ; CHECK: [[TRUNC12:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
@@ -118,3 +116,59 @@ body:             |
     $w0 = COPY %27(s32)
 
 ...
+---
+name:            test_anyext_anyext
+body:             |
+  bb.0:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: test_anyext_anyext
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: $w0 = COPY [[COPY1]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s1) = G_TRUNC %0(s32)
+    %2:_(s8) = G_ANYEXT %1(s1)
+    %3:_(s32) = G_ANYEXT %2(s8)
+    $w0 = COPY %3(s32)
+
+...
+---
+name:            test_anyext_sext
+body:             |
+  bb.0:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: test_anyext_sext
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]]
+    ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]]
+    ; CHECK: $w0 = COPY [[ASHR]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s1) = G_TRUNC %0(s32)
+    %2:_(s8) = G_SEXT %1(s1)
+    %3:_(s32) = G_ANYEXT %2(s8)
+    $w0 = COPY %3(s32)
+
+...
+---
+name:            test_anyext_zext
+body:             |
+  bb.0:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: test_anyext_zext
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+    ; CHECK: $w0 = COPY [[AND]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s1) = G_TRUNC %0(s32)
+    %2:_(s8) = G_ZEXT %1(s1)
+    %3:_(s32) = G_ANYEXT %2(s8)
+    $w0 = COPY %3(s32)
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir b/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir
new file mode 100644
index 0000000000000000000000000000000000000000..ecba4f226301ec0e5025e8a43486fbeb40568503
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir
@@ -0,0 +1,21 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name:            test_eve_1
+body: |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name: test_eve_1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[C]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[SEXT]](s64)
+    ; CHECK: $x0 = COPY [[EVEC]](s64)
+    ; CHECK: RET_ReallyLR
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(s64) = G_EXTRACT_VECTOR_ELT %0(<2 x s64>), %1(s32)
+    $x0 = COPY %2(s64)
+    RET_ReallyLR
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir
deleted file mode 100644
index 7f42f6e6c336942dbcd1924f889992a5a2d54b1a..0000000000000000000000000000000000000000
--- a/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir
+++ /dev/null
@@ -1,39 +0,0 @@
-# RUN: llc -march=aarch64 -o - -run-pass=legalizer -global-isel-abort=0 -debug-only=legalizer 2>&1 %s | FileCheck %s
-# REQUIRES: asserts
-
-# CHECK: Legalize Machine IR for: load_v4s32
-# CHECK-NEXT: %{{[0-9]+}}:_(<4 x s32>) = G_LOAD %{{[0-9]+}}:_(p0)
-# CHECK-NEXT: Reduce number of elements
----
-name:            load_v4s32
-legalized:       false
-tracksRegLiveness: true
-body:             |
-  bb.1:
-    liveins: $x0
-
-    %0:_(p0) = COPY $x0
-    %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16, align 4)
-    %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>)
-    $w0 = COPY %5(s32)
-
-...
-
-# Make sure we are able to scalarize v2s64.
-# CHECK: Legalize Machine IR for: load_v2s64
-# CHECK-NEXT: %{{[0-9]+}}:_(<2 x s64>) = G_LOAD %{{[0-9]+}}:_(p0)
-# CHECK-NEXT: Reduce number of elements
----
-name:            load_v2s64
-legalized:       false
-tracksRegLiveness: true
-body:             |
-  bb.1:
-    liveins: $x0
-
-    %0:_(p0) = COPY $x0
-    %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load 16)
-    %2:_(s64), %3:_(s64) = G_UNMERGE_VALUES %1(<2 x s64>)
-    $x0 = COPY %3(s64)
-
-...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store-fewerElts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store-fewerElts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..27791881ef3b18cb09010e8e914d6ad2e1776a23
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store-fewerElts.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -o - -run-pass=legalizer %s | FileCheck %s
+---
+name:            load_v4s32
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: load_v4s32
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load 8, align 16)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP]](p0) :: (load 8)
+    ; CHECK: G_STORE [[LOAD]](<2 x s32>), [[COPY1]](p0) :: (store 8, align 16)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64)
+    ; CHECK: G_STORE [[LOAD1]](<2 x s32>), [[GEP1]](p0) :: (store 8)
+    %0:_(p0) = COPY $x0
+    %1:_(p0) = COPY $x1
+    %2:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16)
+    G_STORE %2(<4 x s32>), %1(p0) :: (store 16)
+
+...
+---
+name:            load_v2s64
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: load_v2s64
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8, align 16)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP]](p0) :: (load 8)
+    ; CHECK: G_STORE [[LOAD]](s64), [[COPY1]](p0) :: (store 8, align 16)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64)
+    ; CHECK: G_STORE [[LOAD1]](s64), [[GEP1]](p0) :: (store 8)
+    %0:_(p0) = COPY $x0
+    %1:_(p0) = COPY $x1
+    %2:_(<2 x s64>) = G_LOAD %0(p0) :: (load 16)
+    G_STORE %2(<2 x s64>), %1(p0) :: (store 16)
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir b/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir
deleted file mode 100644
index f0a45bf3cf0cb251577f6c8a826bf6643467fbaf..0000000000000000000000000000000000000000
--- a/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir
+++ /dev/null
@@ -1,33 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_legalize_merge_v3s64() {
-    ret void
-  }
-...
----
-name:            test_legalize_merge_v3s64
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
-  bb.0:
-    liveins: $w0
-    ; CHECK-LABEL: name: test_legalize_merge_v3s64
-    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK: [[MV:%[0-9]+]]:_(<3 x s64>) = G_MERGE_VALUES [[COPY]](s64), [[COPY]](s64), [[COPY]](s64)
-    ; CHECK: $x0 = COPY [[COPY]](s64)
-    ; CHECK: $noreg = PATCHABLE_RET [[MV]](<3 x s64>)
-    %0(s64) = COPY $x0
-    %1(<3 x s64>) = G_MERGE_VALUES %0(s64), %0(s64), %0(s64)
-    %2(s64), %3(s64), %4(s64) = G_UNMERGE_VALUES %1(<3 x s64>)
-    $x0 = COPY %3(s64)
-    $noreg = PATCHABLE_RET %1(<3 x s64>)
-...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir b/test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b07445041450f4df07ef3ddbc403b140e36d121f
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir
@@ -0,0 +1,25 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--"
+  define void @test_unmerge() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_unmerge
+body:             |
+  bb.1:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_unmerge
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: $w0 = COPY [[COPY]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %0(s32), %0(s32), %0(s32)
+    %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>)
+    $w0 = COPY %2(s32)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index ca059cf15443d26c666dba2c0bd07ec3b8c48b5a..0c75cd3ce3b60ef9686079fa371d88b7a5eeae3c 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -69,6 +69,15 @@
 # DEBUG-NEXT: G_MERGE_VALUES (opcode {{[0-9]+}}): 2 type indices
 # DEBUG:      .. type index coverage check SKIPPED: user-defined predicate detected
 #
+# DEBUG-NEXT: G_BUILD_VECTOR (opcode {{[0-9]+}}): 2 type indices
+# DEBUG:      .. type index coverage check SKIPPED: user-defined predicate detected
+#
+# DEBUG-NEXT: G_BUILD_VECTOR_TRUNC (opcode {{[0-9]+}}): 2 type indices
+# DEBUG:      .. type index coverage check SKIPPED: no rules defined
+#
+# DEBUG-NEXT: G_CONCAT_VECTORS (opcode {{[0-9]+}}): 2 type indices
+# DEBUG:      .. type index coverage check SKIPPED: no rules defined
+#
 # DEBUG-NEXT: G_PTRTOINT (opcode {{[0-9]+}}): 2 type indices
 # DEBUG:      .. the first uncovered type index: 2, OK
 #
@@ -258,6 +267,9 @@
 # DEBUG-NEXT: G_FLOG2 (opcode {{[0-9]+}}): 1 type index
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
 #
+# DEBUG-NEXT: G_FLOG10 (opcode {{[0-9]+}}): 1 type index
+# DEBUG:      .. type index coverage check SKIPPED: no rules defined
+#
 # DEBUG-NEXT: G_FNEG (opcode {{[0-9]+}}): 1 type index
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
 #
diff --git a/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir b/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir
index 8a78ed962e38b94ef2fb49808fbca2ffde1d37f6..c41405c5a0dd968b4f6ee1cce39e2b1d7b070015 100644
--- a/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir
+++ b/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir
@@ -1,4 +1,9 @@
 # RUN: llc -O0 -run-pass=aarch64-prelegalizer-combiner -global-isel %s -o - | FileCheck %s
+# RUN: llc -O0 -run-pass=aarch64-prelegalizer-combiner -global-isel %s -o - \
+# RUN:   -debug-only=aarch64-prelegalizer-combiner,gi-combiner 2>&1 >/dev/null \
+# RUN:   | FileCheck %s --check-prefix=CHECK-WORKLIST
+
+# REQUIRES: asserts
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -102,6 +107,23 @@ body: |
     ; CHECK: $w1 = COPY [[T6]](s32)
     $w0 = COPY %3
     $w1 = COPY %9
+
+# Check that we report the correct modifications to the observer. This acts as
+# a test of the debug output and a test.
+#
+# CHECK-WORKLIST-LABEL: Generic MI Combiner for: multiple_copies
+# CHECK-WORKLIST:       Try combining [[IN0:%[0-9]+]]:_(s8) = G_LOAD [[IN1:%[0-9]+]]:_(p0) :: (load 1 from %ir.addr)
+# CHECK-WORKLIST:       Preferred use is: [[IN2:%[0-9]+]]:_(s32) = G_SEXT [[IN0]]:_(s8)
+# CHECK-WORKLIST-DAG:   Changing: [[IN0]]:_(s8) = G_LOAD [[IN1]]:_(p0) :: (load 1 from %ir.addr)
+# CHECK-WORKLIST-DAG:   Changing: [[IN3:%[0-9]+]]:_(s8) = G_ADD [[IN0]]:_, [[IN4:%[0-9]+]]:_
+# CHECK-WORKLIST-DAG:   Changed: [[IN3]]:_(s8) = G_ADD [[NEW1:%[0-9]+]]:_, [[IN4]]:_
+# CHECK-WORKLIST-DAG:   Changing: [[IN5:%[0-9]+]]:_(s8) = G_SUB [[IN0]]:_, [[IN6:%[0-9]+]]:_
+# CHECK-WORKLIST-DAG:   Changed: [[IN5]]:_(s8) = G_SUB [[NEW2:%[0-9]+]]:_, [[IN6]]:_
+# CHECK-WORKLIST-DAG:   Erased: [[IN2]]:_(s32) = G_SEXT [[IN0]]:_(s8)
+# CHECK-WORKLIST-DAG:   Changed: [[IN2]]:_(s32) = G_SEXTLOAD [[IN1]]:_(p0) :: (load 1 from %ir.addr)
+# CHECK-WORKLIST-DAG:   Created: [[NEW1]]:_(s8) = G_TRUNC [[IN2]]:_(s32)
+# CHECK-WORKLIST-DAG:   Created: [[NEW2]]:_(s8) = G_TRUNC [[IN2]]:_(s32)
+# CHECK-WORKLIST:       Try combining
 ...
 
 ---
@@ -157,37 +179,52 @@ body: |
     %0:_(p0) = COPY $x0
     %1:_(s32) = COPY $w1
     %2:_(s8) = G_LOAD %0 :: (load 1 from %ir.addr)
-    %3:_(s32) = G_SEXT %2
-    %4:_(s32) = G_CONSTANT i32 1
-    %5:_(s1) = G_ICMP intpred(ne), %1:_(s32), %4:_
-    G_BRCOND %5:_(s1), %bb.1
+    %3:_(s32) = G_CONSTANT i32 1
+    %4:_(s1) = G_ICMP intpred(ne), %1:_(s32), %3:_
+    G_BRCOND %4:_(s1), %bb.1
     G_BR %bb.2.else
   bb.1.if:
   ; CHECK: bb.1.if:
     successors: %bb.3(0x80000000)
-    %10:_(s8) = G_CONSTANT i8 1
+    %5:_(s8) = G_CONSTANT i8 1
     ; CHECK: [[T1:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32)
-    %6:_(s8) = G_ADD %2, %10
+    %6:_(s8) = G_ADD %2, %5
     ; CHECK: [[T2:%[0-9]+]]:_(s8) = G_ADD [[T1]], {{.*}}
     G_BR %bb.3.exit
   bb.2.else:
   ; CHECK: bb.2.else:
     successors: %bb.3(0x80000000)
-    %11:_(s8) = G_CONSTANT i8 1
+    %7:_(s8) = G_CONSTANT i8 1
     ; CHECK: [[T3:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32)
-    %7:_(s8) = G_SUB %2, %11
+    %8:_(s8) = G_SUB %2, %7
     ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_SUB [[T3]], {{.*}}
     G_BR %bb.3.exit
   bb.3.exit:
   ; CHECK: bb.3.exit:
-    %8:_(s8) = G_PHI %6:_(s8), %bb.1, %7:_(s8), %bb.2
+    %9:_(s8) = G_PHI %6:_(s8), %bb.1, %8:_(s8), %bb.2
     ; CHECK: [[T5:%[0-9]+]]:_(s8) = G_PHI [[T2]](s8), %bb.1, [[T4]](s8)
-    %9:_(s32) = G_ZEXT %8
+    %10:_(s32) = G_SEXT %2
+    %11:_(s32) = G_ZEXT %9
     ; CHECK: [[T6:%[0-9]+]]:_(s32) = G_ZEXT [[T5]](s8)
     ; CHECK: $w0 = COPY [[T0]](s32)
     ; CHECK: $w1 = COPY [[T6]](s32)
-    $w0 = COPY %3
-    $w1 = COPY %9
+    $w0 = COPY %10
+    $w1 = COPY %11
+# CHECK-WORKLIST-LABEL: Generic MI Combiner for: sink_to_phi_nondominating
+# CHECK-WORKLIST:       Try combining [[IN0:%[0-9]+]]:_(s8) = G_LOAD [[IN1:%[0-9]+]]:_(p0) :: (load 1 from %ir.addr)
+# CHECK-WORKLIST:       Preferred use is: [[IN2:%[0-9]+]]:_(s32) = G_SEXT [[IN0]]:_(s8)
+# CHECK-WORKLIST-DAG:   Changing: [[IN0]]:_(s8) = G_LOAD [[IN1]]:_(p0) :: (load 1 from %ir.addr)
+# CHECK-WORKLIST-DAG:   Creating: G_TRUNC
+# CHECK-WORKLIST-DAG:   Changing: [[IN3:%[0-9]+]]:_(s8) = G_ADD [[IN0]]:_, [[IN4:%[0-9]+]]:_
+# CHECK-WORKLIST-DAG:   Changed: [[IN3]]:_(s8) = G_ADD [[OUT1:%[0-9]+]]:_, [[IN4]]:_
+# CHECK-WORKLIST-DAG:   Creating: G_TRUNC
+# CHECK-WORKLIST-DAG:   Changing: [[IN5:%[0-9]+]]:_(s8) = G_SUB [[IN0]]:_, [[IN6:%[0-9]+]]:_
+# CHECK-WORKLIST-DAG:   Changed: [[IN5]]:_(s8) = G_SUB [[OUT2:%[0-9]+]]:_, [[IN6]]:_
+# CHECK-WORKLIST-DAG:   Erased: [[IN2]]:_(s32) = G_SEXT [[IN0]]:_(s8)
+# CHECK-WORKLIST-DAG:   Changed: [[IN2]]:_(s32) = G_SEXTLOAD [[IN1]]:_(p0) :: (load 1 from %ir.addr)
+# CHECK-WORKLIST-DAG:   Created: [[OUT1]]:_(s8) = G_TRUNC [[IN2]]:_(s32)
+# CHECK-WORKLIST-DAG:   Created: [[OUT2]]:_(s8) = G_TRUNC [[IN2]]:_(s32)
+# CHECK-WORKLIST:       Try combining
 ...
 
 ---
diff --git a/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir b/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
new file mode 100644
index 0000000000000000000000000000000000000000..5f3abcc627dde6a29de25345bed46a871fd00b07
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
@@ -0,0 +1,301 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64"
+
+  define <4 x float> @test_f32(float %a, float %b, float %c, float %d) {
+    ret <4 x float> undef
+  }
+
+  define <2 x double> @test_f64(double %a, double %b) {
+    ret <2 x double> undef
+  }
+
+  define <4 x i32> @test_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
+    ret <4 x i32> undef
+  }
+
+  define <2 x i64> @test_i64(i64 %a, i64 %b) {
+    ret <2 x i64> undef
+  }
+
+...
+---
+name:            test_f32
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr, preferred-register: '' }
+  - { id: 1, class: fpr, preferred-register: '' }
+  - { id: 2, class: fpr, preferred-register: '' }
+  - { id: 3, class: fpr, preferred-register: '' }
+  - { id: 4, class: fpr, preferred-register: '' }
+  - { id: 5, class: _, preferred-register: '' }
+  - { id: 6, class: _, preferred-register: '' }
+  - { id: 7, class: _, preferred-register: '' }
+  - { id: 8, class: _, preferred-register: '' }
+  - { id: 9, class: _, preferred-register: '' }
+  - { id: 10, class: _, preferred-register: '' }
+  - { id: 11, class: _, preferred-register: '' }
+  - { id: 12, class: _, preferred-register: '' }
+  - { id: 13, class: gpr, preferred-register: '' }
+  - { id: 14, class: gpr, preferred-register: '' }
+  - { id: 15, class: gpr, preferred-register: '' }
+  - { id: 16, class: gpr, preferred-register: '' }
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $s0, $s1, $s2, $s3
+
+    ; CHECK-LABEL: name: test_f32
+    ; CHECK: liveins: $s0, $s1, $s2, $s3
+    ; CHECK: [[COPY:%[0-9]+]]:fpr32 = COPY $s0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr32 = COPY $s1
+    ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY $s2
+    ; CHECK: [[COPY3:%[0-9]+]]:fpr32 = COPY $s3
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub
+    ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.ssub
+    ; CHECK: [[INSvi32lane:%[0-9]+]]:fpr128 = INSvi32lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0
+    ; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY2]], %subreg.ssub
+    ; CHECK: [[INSvi32lane1:%[0-9]+]]:fpr128 = INSvi32lane [[INSvi32lane]], 2, [[INSERT_SUBREG2]], 0
+    ; CHECK: [[DEF3:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF3]], [[COPY3]], %subreg.ssub
+    ; CHECK: [[INSvi32lane2:%[0-9]+]]:fpr128 = INSvi32lane [[INSvi32lane1]], 3, [[INSERT_SUBREG3]], 0
+    ; CHECK: $q0 = COPY [[INSvi32lane2]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(s32) = COPY $s0
+    %1:fpr(s32) = COPY $s1
+    %2:fpr(s32) = COPY $s2
+    %3:fpr(s32) = COPY $s3
+    %4:fpr(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_f64
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr, preferred-register: '' }
+  - { id: 1, class: fpr, preferred-register: '' }
+  - { id: 2, class: fpr, preferred-register: '' }
+  - { id: 3, class: fpr, preferred-register: '' }
+  - { id: 4, class: fpr, preferred-register: '' }
+  - { id: 5, class: _, preferred-register: '' }
+  - { id: 6, class: _, preferred-register: '' }
+  - { id: 7, class: _, preferred-register: '' }
+  - { id: 8, class: _, preferred-register: '' }
+  - { id: 9, class: gpr, preferred-register: '' }
+  - { id: 10, class: gpr, preferred-register: '' }
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $d0, $d1, $d2, $d3
+
+    ; CHECK-LABEL: name: test_f64
+    ; CHECK: liveins: $d0, $d1, $d2, $d3
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
+    ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
+    ; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0
+    ; CHECK: $q0 = COPY [[INSvi64lane]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(s64) = COPY $d0
+    %1:fpr(s64) = COPY $d1
+    %4:fpr(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_i32
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gpr, preferred-register: '' }
+  - { id: 1, class: gpr, preferred-register: '' }
+  - { id: 2, class: gpr, preferred-register: '' }
+  - { id: 3, class: gpr, preferred-register: '' }
+  - { id: 4, class: fpr, preferred-register: '' }
+  - { id: 5, class: _, preferred-register: '' }
+  - { id: 6, class: _, preferred-register: '' }
+  - { id: 7, class: _, preferred-register: '' }
+  - { id: 8, class: _, preferred-register: '' }
+  - { id: 9, class: _, preferred-register: '' }
+  - { id: 10, class: _, preferred-register: '' }
+  - { id: 11, class: _, preferred-register: '' }
+  - { id: 12, class: _, preferred-register: '' }
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $w0, $w1, $w2, $w3
+
+    ; CHECK-LABEL: name: test_i32
+    ; CHECK: liveins: $w0, $w1, $w2, $w3
+    ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY $w0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
+    ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY $w2
+    ; CHECK: [[COPY3:%[0-9]+]]:gpr32 = COPY $w3
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub
+    ; CHECK: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, [[COPY1]]
+    ; CHECK: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSvi32gpr]], 2, [[COPY2]]
+    ; CHECK: [[INSvi32gpr2:%[0-9]+]]:fpr128 = INSvi32gpr [[INSvi32gpr1]], 3, [[COPY3]]
+    ; CHECK: $q0 = COPY [[INSvi32gpr2]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:gpr(s32) = COPY $w0
+    %1:gpr(s32) = COPY $w1
+    %2:gpr(s32) = COPY $w2
+    %3:gpr(s32) = COPY $w3
+    %4:fpr(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32)
+    $q0 = COPY %4(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name:            test_i64
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gpr, preferred-register: '' }
+  - { id: 1, class: gpr, preferred-register: '' }
+  - { id: 2, class: gpr, preferred-register: '' }
+  - { id: 3, class: gpr, preferred-register: '' }
+  - { id: 4, class: fpr, preferred-register: '' }
+  - { id: 5, class: _, preferred-register: '' }
+  - { id: 6, class: _, preferred-register: '' }
+  - { id: 7, class: _, preferred-register: '' }
+  - { id: 8, class: _, preferred-register: '' }
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $x0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: test_i64
+    ; CHECK: liveins: $x0, $x1, $x2, $x3
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64all = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
+    ; CHECK: [[INSvi64gpr:%[0-9]+]]:fpr128 = INSvi64gpr [[INSERT_SUBREG]], 1, [[COPY1]]
+    ; CHECK: $q0 = COPY [[INSvi64gpr]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = COPY $x1
+    %4:fpr(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
diff --git a/test/CodeGen/AArch64/O0-pipeline.ll b/test/CodeGen/AArch64/O0-pipeline.ll
index d85d126883c3260f746a59d3f4d63837d91dd999..6d0aa91272bd058c71155db579e6d9160a3ea4fe 100644
--- a/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/test/CodeGen/AArch64/O0-pipeline.ll
@@ -50,6 +50,7 @@
 ; CHECK-NEXT:       Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT:       Post-RA pseudo instruction expansion pass
 ; CHECK-NEXT:       AArch64 pseudo instruction expansion pass
+; CHECK-NEXT:       AArch64 speculation hardening pass
 ; CHECK-NEXT:       Analyze Machine Code For Garbage Collection
 ; CHECK-NEXT:       Branch relaxation pass
 ; CHECK-NEXT:       AArch64 Branch Targets
diff --git a/test/CodeGen/AArch64/O3-pipeline.ll b/test/CodeGen/AArch64/O3-pipeline.ll
index dc2316987d33f1c9efbf9e7b776b321ad6ea789a..98cef01b6a906f5c0a113a4bc423d3b01ffeb536 100644
--- a/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/test/CodeGen/AArch64/O3-pipeline.ll
@@ -49,6 +49,11 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Memory SSA
+; CHECK-NEXT:       Interleaved Load Combine Pass
+; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Interleaved Access Pass
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       CodeGen Prepare
@@ -141,6 +146,7 @@
 ; CHECK-NEXT:       Post-RA pseudo instruction expansion pass
 ; CHECK-NEXT:       AArch64 pseudo instruction expansion pass
 ; CHECK-NEXT:       AArch64 load / store optimization pass
+; CHECK-NEXT:       AArch64 speculation hardening pass
 ; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       Machine Natural Loop Construction
 ; CHECK-NEXT:       Falkor HW Prefetch Fix Late Phase
@@ -149,6 +155,7 @@
 ; CHECK-NEXT:       Machine Block Frequency Analysis
 ; CHECK-NEXT:       MachinePostDominator Tree Construction
 ; CHECK-NEXT:       Branch Probability Basic Block Placement
+; CHECK-NEXT:       AArch64 load / store optimization pass
 ; CHECK-NEXT:       Branch relaxation pass
 ; CHECK-NEXT:       AArch64 Branch Targets
 ; CHECK-NEXT:       AArch64 Compress Jump Tables
diff --git a/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll b/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll
new file mode 100644
index 0000000000000000000000000000000000000000..970422164a49dc854b7e7ec57b7cd8a89a31cf52
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll
@@ -0,0 +1,406 @@
+; RUN: llc < %s | FileCheck --check-prefix AS %s
+; RUN: opt -S -interleaved-load-combine < %s | FileCheck %s
+
+; ModuleID = 'aarch64_interleaved-ld-combine.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnu"
+
+; This should be lowered into LD4
+define void @aarch64_ilc_const(<4 x float>* %ptr) {
+entry:
+
+;;; Check LLVM transformation
+; CHECK-LABEL: @aarch64_ilc_const(
+; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
+; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>*
+; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 16
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK: ret void
+
+;;; Check if it gets lowerd
+; AS-LABEL: aarch64_ilc_const
+; AS: ld4
+; AS: ret
+
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  2
+  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  3
+  %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  4
+  %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  5
+  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
+  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+  %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
+  %ld4 = load <4 x float>, <4 x float>* %gep4, align 16
+  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %m8_11  = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+  store <4 x float> %m8_11, <4 x float>* %gep3, align 16
+  store <4 x float> %m12_15, <4 x float>* %gep4, align 16
+  ret void
+}
+
+; This should be lowered into LD4
+define void @aarch64_ilc_idx(<4 x float>* %ptr, i64 %idx) {
+entry:
+
+;;; Check LLVM transformation
+; CHECK-LABEL: @aarch64_ilc_idx(
+; CHECK-DAG: [[ADD:%.+]] = add i64 %idx, 16
+; CHECK-DAG: [[LSHR:%.+]] = lshr i64 [[ADD]], 2
+; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]]
+; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>*
+; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 16
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK: ret void
+
+; AS-LABEL: aarch64_ilc_idx
+; AS-DAG: lsl [[LSL:x[0-9]+]], x1, #2
+; AS-DAG: add [[ADD:x[0-9]+]], [[LSL]], #64
+; AS-DAG: and [[AND:x[0-9]+]], [[ADD]], #0xfffffffffffffff0
+; AS-DAG: add [[ADR:x[0-9]+]], x0, [[AND]]
+; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, {{\[}}[[ADR]]{{\]}}
+; AS-DAG: str q[[V0]]
+; AS-DAG: str q[[V1]]
+; AS-DAG: str q[[V2]]
+; AS-DAG: str q[[V3]]
+; AS: ret
+
+  %a2 = add i64 %idx, 20
+  %idx2 = lshr i64 %a2, 2
+  %a3 = add i64 %idx, 24
+  %a1 = add i64 %idx, 16
+  %idx1 = lshr i64 %a1, 2
+  %idx3 = lshr i64 %a3, 2
+  %a4 = add i64 %idx, 28
+  %idx4 = lshr i64 %a4, 2
+
+  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx2
+  %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx4
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx1
+  %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx3
+  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
+  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+  %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
+  %ld4 = load <4 x float>, <4 x float>* %gep4, align 16
+  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %m8_11  = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+  store <4 x float> %m8_11, <4 x float>* %gep3, align 16
+  store <4 x float> %m12_15, <4 x float>* %gep4, align 16
+  ret void
+}
+
+; This should be lowered into LD4, a offset of has to be taken into account
+%struct.ilc = type <{ float, [0 x <4 x float>] }>
+define void @aarch64_ilc_struct(%struct.ilc* %ptr, i64 %idx) {
+entry:
+
+;;; Check LLVM transformation
+; CHECK-LABEL: @aarch64_ilc_struct(
+; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
+; CHECK-DAG: [[GEP:%.+]] = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 [[LSHR]]
+; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>*
+; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 4
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK: ret void
+
+; AS-LABEL: aarch64_ilc_struct
+; AS-DAG: lsl [[LSL:x[0-9]+]], x1, #2
+; AS-DAG: add [[ADD:x[0-9]+]], x0, #4
+; AS-DAG: and [[AND:x[0-9]+]], [[LSL]], #0xfffffffffffffff0
+; AS-DAG: add [[ADR:x[0-9]+]], [[ADD]], [[AND]]
+; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, {{\[}}[[ADR]]{{\]}}
+; AS-DAG: str q[[V0]]
+; AS-DAG: str q[[V1]]
+; AS-DAG: str q[[V2]]
+; AS-DAG: str q[[V3]]
+; AS: ret
+
+  %a1 = add i64 %idx, 4
+  %idx2 = lshr i64 %a1, 2
+  %a2 = add i64 %idx, 8
+  %idx3 = lshr i64 %a2, 2
+  %a3 = add i64 %idx, 12
+  %idx4 = lshr i64 %a3, 2
+
+  %gep2 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx2
+  %gep3 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx3
+  %gep4 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx4
+  %idx1 = lshr i64 %idx, 2
+  %gep1 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx1
+  %ld1 = load <4 x float>, <4 x float>* %gep1, align 4
+  %ld2 = load <4 x float>, <4 x float>* %gep2, align 4
+  %ld3 = load <4 x float>, <4 x float>* %gep3, align 4
+  %ld4 = load <4 x float>, <4 x float>* %gep4, align 4
+  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %m8_11  = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+  store <4 x float> %m8_11, <4 x float>* %gep3, align 16
+  store <4 x float> %m12_15, <4 x float>* %gep4, align 16
+  ret void
+}
+
+; This should be lowered into LD2
+define void @aarch64_ilc_idx_ld2(<4 x float>* %ptr, i64 %idx) {
+entry:
+; CHECK-LABEL: @aarch64_ilc_idx_ld2(
+; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
+; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]]
+; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <8 x float>*
+; CHECK-DAG: [[LOAD:%.+]] = load <8 x float>, <8 x float>* [[CAST]], align 16
+; CHECK: %{{.* }}= shufflevector <8 x float> [[LOAD]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %{{.* }}= shufflevector <8 x float> [[LOAD]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-DAG: ret void
+
+; AS-LABEL: aarch64_ilc_idx_ld2
+; AS: ld2
+; AS: ret
+
+  %idx1 = lshr i64 %idx, 2
+  %a1 = add i64 %idx, 4
+  %idx2 = lshr i64 %a1, 2
+
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx1
+  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx2
+  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
+  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+  %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  store <4 x float> %m0_3, <4 x float>* %gep1
+  store <4 x float> %m4_7, <4 x float>* %gep2
+  ret void
+}
+
+; This should be lowered into LD3
+define void @aarch64_ilc_idx_ld3(<4 x float>* %ptr, i64 %idx) {
+entry:
+; CHECK-LABEL: @aarch64_ilc_idx_ld3(
+; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
+; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]]
+; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <12 x float>*
+; CHECK-DAG: [[LOAD:%.+]] = load <12 x float>, <12 x float>* [[CAST]], align 16
+; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-DAG: ret void
+
+; AS-LABEL: aarch64_ilc_idx_ld3
+; AS: ld3
+; AS: ret
+
+  %idx1 = lshr i64 %idx, 2
+  %a1 = add i64 %idx, 4
+  %idx2 = lshr i64 %a1, 2
+  %a2 = add i64 %idx, 8
+  %idx3 = lshr i64 %a2, 2
+
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx1
+  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx2
+  %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64  %idx3
+  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
+  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+  %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
+
+  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 3, i32 6, i32 undef>
+  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 4, i32 7, i32 undef>
+  %sv3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
+  %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %ld3, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  %m4_7 = shufflevector <4 x float> %sv2, <4 x float> %ld3, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  %m8_11 = shufflevector <4 x float> %sv3, <4 x float> %ld3, <4 x i32> <i32 0, i32 1, i32 4, i32 7>
+
+  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+  store <4 x float> %m8_11, <4 x float>* %gep3, align 16
+  ret void
+}
+;  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
+
+; This must not be lowered
+define void @aarch64_ilc_i32_idx(<4 x float>* %ptr, i32 %idx) {
+; CHECK-LABEL: @aarch64_ilc_i32_idx(
+; CHECK: %idx1 = lshr i32 %idx, 2
+; CHECK-NEXT: %a1 = add i32 %idx, 4
+; CHECK-NEXT: %idx2 = lshr i32 %a1, 2
+; CHECK-NEXT: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx1
+; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx2
+; CHECK-NEXT: %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
+; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+; CHECK-NEXT: ret void
+
+; AS-LABEL: aarch64_ilc_i32_idx
+; AS-DAG: @function
+; AS-NOT: ld2
+; AS-NOT: ld3
+; AS-NOT: ld4
+; AS-DAG: ret
+
+entry:
+  %idx1 = lshr i32 %idx, 2
+  %a1 = add i32 %idx, 4
+  %idx2 = lshr i32 %a1, 2
+
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx1
+  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx2
+  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
+  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+  %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+  ret void
+}
+
+; Volatile loads must not be lowered
+define void @aarch64_ilc_volatile(<4 x float>* %ptr) {
+; CHECK-LABEL: @aarch64_ilc_volatile(
+; CHECK: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0
+; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1
+; CHECK-NEXT: %ld1 = load volatile <4 x float>, <4 x float>* %gep1, align 16
+; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+; CHECK-NEXT: ret void
+
+; AS-LABEL: aarch64_ilc_volatile
+; AS-DAG: @function
+; AS-NOT: ld2
+; AS-NOT: ld3
+; AS-NOT: ld4
+; AS-DAG: ret
+
+entry:
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0
+  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1
+  %ld1 = load volatile <4 x float>, <4 x float>* %gep1, align 16
+  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+  %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+  ret void
+}
+
+; This must not be lowered
+define void @aarch64_ilc_depmem(<4 x float>* %ptr, i32 %idx) {
+entry:
+; CHECK-LABEL: @aarch64_ilc_depmem(
+; CHECK: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0
+; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1
+; CHECK-NEXT: %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
+; CHECK-NEXT: store <4 x float> %ld1, <4 x float>* %gep2, align 16
+; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+; CHECK-NEXT: ret void
+
+; AS-LABEL: aarch64_ilc_depmem
+; AS-DAG: @function
+; AS-NOT: ld2
+; AS-NOT: ld3
+; AS-NOT: ld4
+; AS-DAG: ret
+
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0
+  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1
+  %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
+  store <4 x float> %ld1, <4 x float>* %gep2, align 16
+  %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
+  %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  store <4 x float> %m0_3, <4 x float>* %gep1, align 16
+  store <4 x float> %m4_7, <4 x float>* %gep2, align 16
+  ret void
+}
+
+; This cannot be converted - insertion position cannot be determined
+define void @aarch64_no_insertion_pos(float* %ptr) {
+entry:
+; CHECK-LABEL: @aarch64_no_insertion_pos(
+; CHECK: %p0 = getelementptr inbounds float, float* %ptr, i32 0
+; CHECK-NEXT: %p1 = getelementptr inbounds float, float* %ptr, i32 4
+; CHECK-NEXT: %b0 = bitcast float* %p0 to <5 x float>*
+; CHECK-NEXT: %b1 = bitcast float* %p1 to <5 x float>*
+; CHECK-NEXT: %l0 = load <5 x float>, <5 x float>* %b0
+; CHECK-NEXT: %l1 = load <5 x float>, <5 x float>* %b1
+; CHECK-NEXT: %s0 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> <i32 1, i32 3, i32 6, i32 8>
+; CHECK-NEXT: %s1 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> <i32 2, i32 4, i32 7, i32 9>
+; CHECK-NEXT: ret void
+
+  %p0 = getelementptr inbounds float, float* %ptr, i32 0
+  %p1 = getelementptr inbounds float, float* %ptr, i32 4
+  %b0 = bitcast float* %p0 to <5 x float>*
+  %b1 = bitcast float* %p1 to <5 x float>*
+  %l0 = load <5 x float>, <5 x float>* %b0
+  %l1 = load <5 x float>, <5 x float>* %b1
+  %s0 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> <i32 1, i32 3, i32 6, i32 8>
+  %s1 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> <i32 2, i32 4, i32 7, i32 9>
+  ret void
+}
+
+; This cannot be converted - the insertion position does not dominate all
+; uses
+define void @aarch64_insertpos_does_not_dominate(float* %ptr) {
+entry:
+; CHECK-LABEL: @aarch64_insertpos_does_not_dominate(
+; CHECK: %p0 = getelementptr inbounds float, float* %ptr, i32 0
+; CHECK-NEXT: %p1 = getelementptr inbounds float, float* %ptr, i32 1
+; CHECK-NEXT: %b0 = bitcast float* %p0 to <7 x float>*
+; CHECK-NEXT: %b1 = bitcast float* %p1 to <7 x float>*
+; CHECK-NEXT: %l1 = load <7 x float>, <7 x float>* %b1
+; CHECK-NEXT: %s1 = shufflevector <7 x float> %l1, <7 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: %l0 = load <7 x float>, <7 x float>* %b0
+; CHECK-NEXT: %s0 = shufflevector <7 x float> %l0, <7 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: ret void
+  %p0 = getelementptr inbounds float, float* %ptr, i32 0
+  %p1 = getelementptr inbounds float, float* %ptr, i32 1
+  %b0 = bitcast float* %p0 to <7 x float>*
+  %b1 = bitcast float* %p1 to <7 x float>*
+  %l1 = load <7 x float>, <7 x float>* %b1
+  %s1 = shufflevector <7 x float> %l1, <7 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %l0 = load <7 x float>, <7 x float>* %b0
+  %s0 = shufflevector <7 x float> %l0, <7 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret void
+}
diff --git a/test/CodeGen/AArch64/aarch64-smull.ll b/test/CodeGen/AArch64/aarch64-smull.ll
index 1c8d13a00b2a2f82cc2c04cfc91293a44f051bb6..8922ae971c2bc84e961a6796fe6149a940e3adea 100644
--- a/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/test/CodeGen/AArch64/aarch64-smull.ll
@@ -291,15 +291,16 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
   ret <2 x i64> %tmp4
 }
 
-define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
+define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) {
 ; If one operand has a zero-extend and the other a sign-extend, smull
 ; cannot be used.
 ; CHECK-LABEL: smullWithInconsistentExtensions:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  %1 = sext <8 x i8> %vec to <8 x i16>
-  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %3 = extractelement <8 x i16> %2, i32 0
-  ret i16 %3
+  %s = sext <8 x i8> %x to <8 x i16>
+  %z = zext <8 x i8> %y to <8 x i16>
+  %m = mul <8 x i16> %s, %z
+  %r = extractelement <8 x i16> %m, i32 0
+  ret i16 %r
 }
 
 define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
index 92c320a82102f211fef1af29a9412fa1fad4678d..b0a4256552726b203636ce49f78f41a55f8e3b83 100644
--- a/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -7,14 +7,13 @@ define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
 ; CHECK-LABEL: fn9:
 ; 9th fixed argument
 ; CHECK: ldr {{w[0-9]+}}, [sp, #64]
-; CHECK: add [[ARGS:x[0-9]+]], sp, #72
-; CHECK: add {{x[0-9]+}}, [[ARGS]], #8
+; CHECK-DAG: add [[ARGS:x[0-9]+]], sp, #72
 ; First vararg
-; CHECK: ldr {{w[0-9]+}}, [sp, #72]
+; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #72]
 ; Second vararg
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8
+; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #80]
 ; Third vararg
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8
+; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #88]
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   %3 = alloca i32, align 4
diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll
index d7fe9c6d68bfd086bf1e95a96e9129ff06cd62e6..d268f761c9a5db51749806c6aac2923855217328 100644
--- a/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -53,3 +53,25 @@ define void @widen_f16_build_vector(half* %addr) {
   store <2 x half> <half 0xH33EE, half 0xH33EE>, <2 x half>* %1, align 2
   ret void
 }
+
+; Check that a single element vector is constructed with a mov
+define <1 x i64> @single_element_vector_i64(<1 x i64> %arg) {
+; CHECK-LABEL: single_element_vector_i64
+; CHECK: orr w[[GREG:[0-9]+]], wzr, #0x1
+; CHECK: fmov d[[DREG:[0-9]+]], x[[GREG]]
+; CHECK: add d0, d0, d[[DREG]]
+; CHECK: ret
+entry:
+  %add = add <1 x i64> %arg, <i64 1>
+  ret <1 x i64> %add
+}
+
+define <1 x double> @single_element_vector_double(<1 x double> %arg) {
+; CHECK-LABEL: single_element_vector_double
+; CHECK: fmov d[[DREG:[0-9]+]], #1.00000000
+; CHECK: fadd d0, d0, d[[DREG]]
+; CHECK: ret
+entry:
+  %add = fadd <1 x double> %arg, <double 1.0>
+  ret <1 x double> %add
+}
diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll
index b18e638a3a947b59dd1fe20ff332085d425ea8a1..6b497e8f7bfda1ac98b9118ff76f5ab23120b104 100644
--- a/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -526,8 +526,8 @@ define i32 @select_or_olt_one(double %v0, double %v1, double %v2, double %v3, i3
 ; CHECK-LABEL: select_or_one_olt:
 ; CHECK-LABEL: ; %bb.0:
 ; CHECK-NEXT: fcmp d0, d1
-; CHECK-NEXT: fccmp d0, d1, #1, ne
-; CHECK-NEXT: fccmp d2, d3, #8, vs
+; CHECK-NEXT: fccmp d0, d1, #8, le
+; CHECK-NEXT: fccmp d2, d3, #8, pl
 ; CHECK-NEXT: csel w0, w0, w1, mi
 ; CHECK-NEXT: ret
 define i32 @select_or_one_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
@@ -556,8 +556,8 @@ define i32 @select_or_olt_ueq(double %v0, double %v1, double %v2, double %v3, i3
 ; CHECK-LABEL: select_or_ueq_olt:
 ; CHECK-LABEL: ; %bb.0:
 ; CHECK-NEXT: fcmp d0, d1
-; CHECK-NEXT: fccmp d0, d1, #8, le
-; CHECK-NEXT: fccmp d2, d3, #8, mi
+; CHECK-NEXT: fccmp d0, d1, #1, ne
+; CHECK-NEXT: fccmp d2, d3, #8, vc
 ; CHECK-NEXT: csel w0, w0, w1, mi
 ; CHECK-NEXT: ret
 define i32 @select_or_ueq_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
@@ -656,4 +656,68 @@ define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3,
   ret i32 %sel
 }
 
+; This testcase resembles the core problem of http://llvm.org/PR39550
+; (an OR operation is 2 levels deep but needs to be implemented first)
+; CHECK-LABEL: deep_or
+; CHECK: cmp w2, #20
+; CHECK-NEXT: ccmp w2, #15, #4, ne
+; CHECK-NEXT: ccmp w1, #0, #4, eq
+; CHECK-NEXT: ccmp w0, #0, #4, ne
+; CHECK-NEXT: csel w0, w4, w5, ne
+; CHECK-NEXT: ret
+define i32 @deep_or(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) {
+  %c0 = icmp ne i32 %a0, 0
+  %c1 = icmp ne i32 %a1, 0
+  %c2 = icmp eq i32 %a2, 15
+  %c3 = icmp eq i32 %a2, 20
+
+  %or = or i1 %c2, %c3
+  %and0 = and i1 %or, %c1
+  %and1 = and i1 %and0, %c0
+  %sel = select i1 %and1, i32 %x, i32 %y
+  ret i32 %sel
+}
+
+; Variation of deep_or, we still need to implement the OR first though.
+; CHECK-LABEL: deep_or1
+; CHECK: cmp w2, #20
+; CHECK-NEXT: ccmp w2, #15, #4, ne
+; CHECK-NEXT: ccmp w0, #0, #4, eq
+; CHECK-NEXT: ccmp w1, #0, #4, ne
+; CHECK-NEXT: csel w0, w4, w5, ne
+; CHECK-NEXT: ret
+define i32 @deep_or1(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) {
+  %c0 = icmp ne i32 %a0, 0
+  %c1 = icmp ne i32 %a1, 0
+  %c2 = icmp eq i32 %a2, 15
+  %c3 = icmp eq i32 %a2, 20
+
+  %or = or i1 %c2, %c3
+  %and0 = and i1 %c0, %or
+  %and1 = and i1 %and0, %c1
+  %sel = select i1 %and1, i32 %x, i32 %y
+  ret i32 %sel
+}
+
+; Variation of deep_or, we still need to implement the OR first though.
+; CHECK-LABEL: deep_or2
+; CHECK: cmp w2, #20
+; CHECK-NEXT: ccmp w2, #15, #4, ne
+; CHECK-NEXT: ccmp w1, #0, #4, eq
+; CHECK-NEXT: ccmp w0, #0, #4, ne
+; CHECK-NEXT: csel w0, w4, w5, ne
+; CHECK-NEXT: ret
+define i32 @deep_or2(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) {
+  %c0 = icmp ne i32 %a0, 0
+  %c1 = icmp ne i32 %a1, 0
+  %c2 = icmp eq i32 %a2, 15
+  %c3 = icmp eq i32 %a2, 20
+
+  %or = or i1 %c2, %c3
+  %and0 = and i1 %c0, %c1
+  %and1 = and i1 %and0, %or
+  %sel = select i1 %and1, i32 %x, i32 %y
+  ret i32 %sel
+}
+
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index b9dbfc7745f8d3909c52cd01951dfcd703e7c8e9..ba7bdc4488c71e9afc8ba339d96e8e42fcdcdbf6 100644
--- a/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -2,30 +2,30 @@
 
 
 define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
-; CHECK: fptosi_v4f64_to_v4i16
+; CHECK-LABEL: fptosi_v4f64_to_v4i16
 ; CHECK-DAG: fcvtzs  v[[LHS:[0-9]+]].2d, v0.2d
 ; CHECK-DAG: fcvtzs  v[[RHS:[0-9]+]].2d, v1.2d
-; CHECK-DAG: xtn  v[[MID:[0-9]+]].2s, v[[LHS]].2d
-; CHECK-DAG: xtn2  v[[MID]].4s, v[[RHS]].2d
-; CHECK:     xtn  v0.4h, v[[MID]].4s
+; CHECK-DAG: xtn  v[[XTN0:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn  v[[XTN1:[0-9]+]].2s, v[[RHS]].2d
+; CHECK:     uzp1  v0.4h, v[[XTN1]].4h, v[[XTN0]].4h
   %tmp1 = load <4 x double>, <4 x double>* %ptr
   %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
   ret <4 x i16> %tmp2
 }
 
 define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
-; CHECK: fptosi_v4f64_to_v4i8
+; CHECK-LABEL: fptosi_v4f64_to_v4i8
 ; CHECK-DAG:  fcvtzs  v[[CONV0:[0-9]+]].2d, v0.2d
 ; CHECK-DAG:  fcvtzs  v[[CONV1:[0-9]+]].2d, v1.2d
 ; CHECK-DAG:  fcvtzs  v[[CONV2:[0-9]+]].2d, v2.2d
 ; CHECK-DAG:  fcvtzs  v[[CONV3:[0-9]+]].2d, v3.2d
-; CHECK-DAG:  xtn  v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
-; CHECK-DAG:  xtn2  v[[NA2]].4s, v[[CONV3]].2d
-; CHECK-DAG:  xtn  v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
-; CHECK-DAG:  xtn2  v[[NA0]].4s, v[[CONV1]].2d
-; CHECK-DAG:  xtn  v[[TMP1:[0-9]+]].4h, v[[NA2]].4s
-; CHECK-DAG:  xtn2  v[[TMP1]].8h, v[[NA0]].4s
-; CHECK:      xtn  v0.8b, v[[TMP1]].8h
+; CHECK-DAG:  xtn  v[[XTN0:[0-9]+]].2s, v[[CONV0]].2d
+; CHECK-DAG:  xtn  v[[XTN1:[0-9]+]].2s, v[[CONV1]].2d
+; CHECK-DAG:  xtn  v[[XTN2:[0-9]+]].2s, v[[CONV2]].2d
+; CHECK-DAG:  xtn  v[[XTN3:[0-9]+]].2s, v[[CONV3]].2d
+; CHECK-DAG:  uzp1 v[[UZP0:[0-9]+]].4h, v[[XTN1]].4h, v[[XTN0]].4h
+; CHECK-DAG:  uzp1 v[[UZP1:[0-9]+]].4h, v[[XTN3]].4h, v[[XTN2]].4h
+; CHECK:      uzp1  v0.8b, v[[UZP1:[0-9]+]].8b, v[[UZP0:[0-9]+]].8b
   %tmp1 = load <8 x double>, <8 x double>* %ptr
   %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
   ret <8 x i8> %tmp2
@@ -54,12 +54,12 @@ define <4 x i16> @trunc_v4i64_to_v4i16(<4 x i64>* %ptr) {
 }
 
 define <4 x i16> @fptoui_v4f64_to_v4i16(<4 x double>* %ptr) {
-; CHECK: fptoui_v4f64_to_v4i16
-; CHECK-DAG: fcvtzu  v[[LHS:[0-9]+]].2d, v0.2d
-; CHECK-DAG: fcvtzu  v[[RHS:[0-9]+]].2d, v1.2d
-; CHECK-DAG: xtn  v[[MID:[0-9]+]].2s, v[[LHS]].2d
-; CHECK-DAG: xtn2  v[[MID]].4s, v[[RHS]].2d
-; CHECK:     xtn  v0.4h, v[[MID]].4s
+; CHECK-LABEL: fptoui_v4f64_to_v4i16
+; CHECK-DAG: fcvtzs  v[[LHS:[0-9]+]].2d, v0.2d
+; CHECK-DAG: fcvtzs  v[[RHS:[0-9]+]].2d, v1.2d
+; CHECK-DAG: xtn  v[[XTN0:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn  v[[XTN1:[0-9]+]].2s, v[[RHS]].2d
+; CHECK:     uzp1  v0.4h, v[[XTN1]].4h, v[[XTN0]].4h
   %tmp1 = load <4 x double>, <4 x double>* %ptr
   %tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
   ret <4 x i16> %tmp2
diff --git a/test/CodeGen/AArch64/arm64-ld1.ll b/test/CodeGen/AArch64/arm64-ld1.ll
index 5f1caa2d67f89a2a7d2de3106972b5d1e175f494..6b119932cb7378136b188d9f2cb99d73606d22c8 100644
--- a/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/test/CodeGen/AArch64/arm64-ld1.ll
@@ -915,7 +915,9 @@ entry:
 ; CHECK: ld1r_2s_from_dup
 ; CHECK: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0]
 ; CHECK-NEXT: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1]
-; CHECK-NEXT: usubl.8h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
+; CHECK-NEXT: ushll.8h [[ARG1]], [[ARG1]], #0
+; CHECK-NEXT: ushll.8h [[ARG2]], [[ARG2]], #0
+; CHECK-NEXT: sub.4h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: str d[[RESREGNUM]], [x2]
 ; CHECK-NEXT: ret
   %tmp = bitcast i8* %a to i32*
diff --git a/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll
index 951076c10b843209926dbbaf6a1c4ae85007827b..629cf37926aeb1a8009b4ef5db9dfee9a3ff5ea9 100644
--- a/test/CodeGen/AArch64/arm64-memcpy-inline.ll
+++ b/test/CodeGen/AArch64/arm64-memcpy-inline.ll
@@ -16,10 +16,8 @@
 define i32 @t0() {
 entry:
 ; CHECK-LABEL: t0:
-; CHECK: ldrb [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #10]
-; CHECK: strb [[REG0]], [x[[BASEREG2:[0-9]+]], #10]
-; CHECK: ldrh [[REG1:w[0-9]+]], [x[[BASEREG]], #8]
-; CHECK: strh [[REG1]], [x[[BASEREG2]], #8]
+; CHECK: ldur [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #7]
+; CHECK: stur [[REG0]], [x[[BASEREG2:[0-9]+]], #7]
 ; CHECK: ldr [[REG2:x[0-9]+]],
 ; CHECK: str [[REG2]],
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @dst, i32 0, i32 0), i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @src, i32 0, i32 0), i32 11, i1 false)
@@ -74,9 +72,9 @@ entry:
 define void @t5(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t5:
-; CHECK: strb wzr, [x0, #6]
-; CHECK: mov [[REG7:w[0-9]+]], #21587
-; CHECK: strh [[REG7]], [x0, #4]
+; CHECK: mov [[REG7:w[0-9]+]], #21337
+; CHECK: movk [[REG7]],
+; CHECK: stur [[REG7]], [x0, #3]
 ; CHECK: mov [[REG8:w[0-9]+]],
 ; CHECK: movk [[REG8]],
 ; CHECK: str [[REG8]], [x0]
diff --git a/test/CodeGen/AArch64/arm64-memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll
index 7a9f3b2fa97f90e8eb09091e679f3dc63ed575a8..460ccc25a270ab0a2fb16576075642320a0a537f 100644
--- a/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -113,9 +113,9 @@ define void @bzero_20_stack() {
 
 define void @bzero_26_stack() {
 ; CHECK-LABEL: bzero_26_stack:
-; CHECK:       stp xzr, xzr, [sp]
+; CHECK:       stp xzr, xzr, [sp, #8]
+; CHECK-NEXT:  str xzr, [sp]
 ; CHECK-NEXT:  strh wzr, [sp, #24]
-; CHECK-NEXT:  str xzr, [sp, #16]
 ; CHECK-NEXT:  bl something
   %buf = alloca [26 x i8], align 1
   %cast = bitcast [26 x i8]* %buf to i8*
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index 0b6132b1be64bd3e229895de38c59e45876410db..0d4d2c7460071ef18ac71142c2c8550ebc0091a7 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1,58 +1,81 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
 ; CHECK-LABEL: ins16bw:
-; CHECK: mov {{v[0-9]+}}.b[15], {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.b[15], w0
+; CHECK-NEXT:    ret
   %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15
   ret <16 x i8> %tmp3
 }
 
 define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) {
 ; CHECK-LABEL: ins8hw:
-; CHECK: mov {{v[0-9]+}}.h[6], {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.h[6], w0
+; CHECK-NEXT:    ret
   %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6
   ret <8 x i16> %tmp3
 }
 
 define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) {
 ; CHECK-LABEL: ins4sw:
-; CHECK: mov {{v[0-9]+}}.s[2], {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.s[2], w0
+; CHECK-NEXT:    ret
   %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2
   ret <4 x i32> %tmp3
 }
 
 define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) {
 ; CHECK-LABEL: ins2dw:
-; CHECK: mov {{v[0-9]+}}.d[1], {{x[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.d[1], x0
+; CHECK-NEXT:    ret
   %tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1
   ret <2 x i64> %tmp3
 }
 
 define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) {
 ; CHECK-LABEL: ins8bw:
-; CHECK: mov {{v[0-9]+}}.b[5], {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.b[5], w0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5
   ret <8 x i8> %tmp3
 }
 
 define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) {
 ; CHECK-LABEL: ins4hw:
-; CHECK: mov {{v[0-9]+}}.h[3], {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.h[3], w0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3
   ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) {
 ; CHECK-LABEL: ins2sw:
-; CHECK: mov {{v[0-9]+}}.s[1], {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.s[1], w0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
   ret <2 x i32> %tmp3
 }
 
 define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
 ; CHECK-LABEL: ins16b16:
-; CHECK: mov {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.b[15], v0.b[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <16 x i8> %tmp1, i32 2
   %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
   ret <16 x i8> %tmp4
@@ -60,7 +83,10 @@ define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
 
 define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
 ; CHECK-LABEL: ins8h8:
-; CHECK: mov {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.h[7], v0.h[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i16> %tmp1, i32 2
   %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
   ret <8 x i16> %tmp4
@@ -68,7 +94,10 @@ define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
 
 define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
 ; CHECK-LABEL: ins4s4:
-; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.s[1], v0.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i32> %tmp1, i32 2
   %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
   ret <4 x i32> %tmp4
@@ -76,7 +105,10 @@ define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
 
 define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
 ; CHECK-LABEL: ins2d2:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x i64> %tmp1, i32 0
   %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
   ret <2 x i64> %tmp4
@@ -84,7 +116,10 @@ define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
 
 define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
 ; CHECK-LABEL: ins4f4:
-; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.s[1], v0.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x float> %tmp1, i32 2
   %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
   ret <4 x float> %tmp4
@@ -92,7 +127,10 @@ define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
 
 define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
 ; CHECK-LABEL: ins2df2:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x double> %tmp1, i32 0
   %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
   ret <2 x double> %tmp4
@@ -100,7 +138,11 @@ define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
 
 define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
 ; CHECK-LABEL: ins8b16:
-; CHECK: mov {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.b[15], v0.b[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i8> %tmp1, i32 2
   %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
   ret <16 x i8> %tmp4
@@ -108,7 +150,11 @@ define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
 
 define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
 ; CHECK-LABEL: ins4h8:
-; CHECK: mov {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.h[7], v0.h[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i16> %tmp1, i32 2
   %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
   ret <8 x i16> %tmp4
@@ -116,7 +162,11 @@ define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
 
 define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
 ; CHECK-LABEL: ins2s4:
-; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x i32> %tmp1, i32 1
   %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
   ret <4 x i32> %tmp4
@@ -124,7 +174,11 @@ define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
 
 define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
 ; CHECK-LABEL: ins1d2:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <1 x i64> %tmp1, i32 0
   %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
   ret <2 x i64> %tmp4
@@ -132,7 +186,11 @@ define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
 
 define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
 ; CHECK-LABEL: ins2f4:
-; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x float> %tmp1, i32 1
   %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
   ret <4 x float> %tmp4
@@ -140,7 +198,10 @@ define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
 
 define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
 ; CHECK-LABEL: ins1f2:
-; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <1 x double> %tmp1, i32 0
   %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
   ret <2 x double> %tmp4
@@ -148,7 +209,11 @@ define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
 
 define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
 ; CHECK-LABEL: ins16b8:
-; CHECK: mov {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v1.b[7], v0.b[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <16 x i8> %tmp1, i32 2
   %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
   ret <8 x i8> %tmp4
@@ -156,7 +221,11 @@ define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
 
 define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
 ; CHECK-LABEL: ins8h4:
-; CHECK: mov {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v1.h[3], v0.h[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i16> %tmp1, i32 2
   %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
   ret <4 x i16> %tmp4
@@ -164,7 +233,11 @@ define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
 
 define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
 ; CHECK-LABEL: ins4s2:
-; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v1.s[1], v0.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i32> %tmp1, i32 2
   %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
   ret <2 x i32> %tmp4
@@ -172,7 +245,11 @@ define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
 
 define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
 ; CHECK-LABEL: ins2d1:
-; CHECK: mov {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v1.d[0], v0.d[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x i64> %tmp1, i32 0
   %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
   ret <1 x i64> %tmp4
@@ -180,7 +257,11 @@ define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
 
 define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
 ; CHECK-LABEL: ins4f2:
-; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v1.s[1], v0.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x float> %tmp1, i32 2
   %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
   ret <2 x float> %tmp4
@@ -188,7 +269,10 @@ define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
 
 define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
 ; CHECK-LABEL: ins2f1:
-; CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2d, v0.d[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x double> %tmp1, i32 1
   %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
   ret <1 x double> %tmp4
@@ -196,7 +280,12 @@ define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
 
 define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
 ; CHECK-LABEL: ins8b8:
-; CHECK: mov {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.b[4], v0.b[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i8> %tmp1, i32 2
   %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4
   ret <8 x i8> %tmp4
@@ -204,7 +293,12 @@ define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
 
 define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
 ; CHECK-LABEL: ins4h4:
-; CHECK: mov {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.h[3], v0.h[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i16> %tmp1, i32 2
   %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
   ret <4 x i16> %tmp4
@@ -212,7 +306,12 @@ define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
 
 define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) {
 ; CHECK-LABEL: ins2s2:
-; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x i32> %tmp1, i32 0
   %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
   ret <2 x i32> %tmp4
@@ -220,7 +319,12 @@ define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) {
 
 define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) {
 ; CHECK-LABEL: ins1d1:
-; CHECK: mov {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.d[0], v0.d[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <1 x i64> %tmp1, i32 0
   %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
   ret <1 x i64> %tmp4
@@ -228,7 +332,12 @@ define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) {
 
 define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) {
 ; CHECK-LABEL: ins2f2:
-; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x float> %tmp1, i32 0
   %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
   ret <2 x float> %tmp4
@@ -236,7 +345,8 @@ define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) {
 
 define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) {
 ; CHECK-LABEL: ins1df1:
-; CHECK-NOT: mov {{v[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <1 x double> %tmp1, i32 0
   %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
   ret <1 x double> %tmp4
@@ -244,7 +354,9 @@ define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) {
 
 define i32 @umovw16b(<16 x i8> %tmp1) {
 ; CHECK-LABEL: umovw16b:
-; CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w0, v0.b[8]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <16 x i8> %tmp1, i32 8
   %tmp4 = zext i8 %tmp3 to i32
   ret i32 %tmp4
@@ -252,7 +364,9 @@ define i32 @umovw16b(<16 x i8> %tmp1) {
 
 define i32 @umovw8h(<8 x i16> %tmp1) {
 ; CHECK-LABEL: umovw8h:
-; CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w0, v0.h[2]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i16> %tmp1, i32 2
   %tmp4 = zext i16 %tmp3 to i32
   ret i32 %tmp4
@@ -260,21 +374,28 @@ define i32 @umovw8h(<8 x i16> %tmp1) {
 
 define i32 @umovw4s(<4 x i32> %tmp1) {
 ; CHECK-LABEL: umovw4s:
-; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.s[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, v0.s[2]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i32> %tmp1, i32 2
   ret i32 %tmp3
 }
 
 define i64 @umovx2d(<2 x i64> %tmp1) {
 ; CHECK-LABEL: umovx2d:
-; CHECK: mov {{x[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, v0.d[1]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x i64> %tmp1, i32 1
   ret i64 %tmp3
 }
 
 define i32 @umovw8b(<8 x i8> %tmp1) {
 ; CHECK-LABEL: umovw8b:
-; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.b[7]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    umov w0, v0.b[7]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i8> %tmp1, i32 7
   %tmp4 = zext i8 %tmp3 to i32
   ret i32 %tmp4
@@ -282,7 +403,10 @@ define i32 @umovw8b(<8 x i8> %tmp1) {
 
 define i32 @umovw4h(<4 x i16> %tmp1) {
 ; CHECK-LABEL: umovw4h:
-; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    umov w0, v0.h[2]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i16> %tmp1, i32 2
   %tmp4 = zext i16 %tmp3 to i32
   ret i32 %tmp4
@@ -290,21 +414,30 @@ define i32 @umovw4h(<4 x i16> %tmp1) {
 
 define i32 @umovw2s(<2 x i32> %tmp1) {
 ; CHECK-LABEL: umovw2s:
-; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov w0, v0.s[1]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x i32> %tmp1, i32 1
   ret i32 %tmp3
 }
 
 define i64 @umovx1d(<1 x i64> %tmp1) {
 ; CHECK-LABEL: umovx1d:
-; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <1 x i64> %tmp1, i32 0
   ret i64 %tmp3
 }
 
 define i32 @smovw16b(<16 x i8> %tmp1) {
 ; CHECK-LABEL: smovw16b:
-; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smov w8, v0.b[8]
+; CHECK-NEXT:    add w0, w8, w8
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <16 x i8> %tmp1, i32 8
   %tmp4 = sext i8 %tmp3 to i32
   %tmp5 = add i32 %tmp4, %tmp4
@@ -313,7 +446,10 @@ define i32 @smovw16b(<16 x i8> %tmp1) {
 
 define i32 @smovw8h(<8 x i16> %tmp1) {
 ; CHECK-LABEL: smovw8h:
-; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smov w8, v0.h[2]
+; CHECK-NEXT:    add w0, w8, w8
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i16> %tmp1, i32 2
   %tmp4 = sext i16 %tmp3 to i32
   %tmp5 = add i32 %tmp4, %tmp4
@@ -322,7 +458,9 @@ define i32 @smovw8h(<8 x i16> %tmp1) {
 
 define i64 @smovx16b(<16 x i8> %tmp1) {
 ; CHECK-LABEL: smovx16b:
-; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smov x0, v0.b[8]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <16 x i8> %tmp1, i32 8
   %tmp4 = sext i8 %tmp3 to i64
   ret i64 %tmp4
@@ -330,7 +468,9 @@ define i64 @smovx16b(<16 x i8> %tmp1) {
 
 define i64 @smovx8h(<8 x i16> %tmp1) {
 ; CHECK-LABEL: smovx8h:
-; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smov x0, v0.h[2]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i16> %tmp1, i32 2
   %tmp4 = sext i16 %tmp3 to i64
   ret i64 %tmp4
@@ -338,7 +478,9 @@ define i64 @smovx8h(<8 x i16> %tmp1) {
 
 define i64 @smovx4s(<4 x i32> %tmp1) {
 ; CHECK-LABEL: smovx4s:
-; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smov x0, v0.s[2]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i32> %tmp1, i32 2
   %tmp4 = sext i32 %tmp3 to i64
   ret i64 %tmp4
@@ -346,7 +488,11 @@ define i64 @smovx4s(<4 x i32> %tmp1) {
 
 define i32 @smovw8b(<8 x i8> %tmp1) {
 ; CHECK-LABEL: smovw8b:
-; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.b[4]
+; CHECK-NEXT:    add w0, w8, w8
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i8> %tmp1, i32 4
   %tmp4 = sext i8 %tmp3 to i32
   %tmp5 = add i32 %tmp4, %tmp4
@@ -355,7 +501,11 @@ define i32 @smovw8b(<8 x i8> %tmp1) {
 
 define i32 @smovw4h(<4 x i16> %tmp1) {
 ; CHECK-LABEL: smovw4h:
-; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[2]
+; CHECK-NEXT:    add w0, w8, w8
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i16> %tmp1, i32 2
   %tmp4 = sext i16 %tmp3 to i32
   %tmp5 = add i32 %tmp4, %tmp4
@@ -364,7 +514,10 @@ define i32 @smovw4h(<4 x i16> %tmp1) {
 
 define i32 @smovx8b(<8 x i8> %tmp1) {
 ; CHECK-LABEL: smovx8b:
-; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[6]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w0, v0.b[6]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <8 x i8> %tmp1, i32 6
   %tmp4 = sext i8 %tmp3 to i32
   ret i32 %tmp4
@@ -372,7 +525,10 @@ define i32 @smovx8b(<8 x i8> %tmp1) {
 
 define i32 @smovx4h(<4 x i16> %tmp1) {
 ; CHECK-LABEL: smovx4h:
-; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w0, v0.h[2]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <4 x i16> %tmp1, i32 2
   %tmp4 = sext i16 %tmp3 to i32
   ret i32 %tmp4
@@ -380,7 +536,10 @@ define i32 @smovx4h(<4 x i16> %tmp1) {
 
 define i64 @smovx2s(<2 x i32> %tmp1) {
 ; CHECK-LABEL: smovx2s:
-; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov x0, v0.s[1]
+; CHECK-NEXT:    ret
   %tmp3 = extractelement <2 x i32> %tmp1, i32 1
   %tmp4 = sext i32 %tmp3 to i64
   ret i64 %tmp4
@@ -388,35 +547,52 @@ define i64 @smovx2s(<2 x i32> %tmp1) {
 
 define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
 ; CHECK-LABEL: test_vcopy_lane_s8:
-; CHECK: mov  {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.b[5], v1.b[3]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
   ret <8 x i8> %vset_lane
 }
 
 define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
 ; CHECK-LABEL: test_vcopyq_laneq_s8:
-; CHECK: mov  {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.b[14], v1.b[6]
+; CHECK-NEXT:    ret
   %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
   ret <16 x i8> %vset_lane
 }
 
 define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
 ; CHECK-LABEL: test_vcopy_lane_swap_s8:
-; CHECK: mov {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v1.b[7], v0.b[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
   ret <8 x i8> %vset_lane
 }
 
 define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
 ; CHECK-LABEL: test_vcopyq_laneq_swap_s8:
-; CHECK: mov {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.b[0], v0.b[15]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i8> %vset_lane
 }
 
 define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
 ; CHECK-LABEL: test_vdup_n_u8:
-; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8b, w0
+; CHECK-NEXT:    ret
   %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
   %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
   %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
@@ -430,7 +606,9 @@ define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
 
 define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
 ; CHECK-LABEL: test_vdup_n_u16:
-; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4h, w0
+; CHECK-NEXT:    ret
   %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
   %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
   %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
@@ -440,7 +618,9 @@ define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
 
 define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
 ; CHECK-LABEL: test_vdup_n_u32:
-; CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2s, w0
+; CHECK-NEXT:    ret
   %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
   %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
   ret <2 x i32> %vecinit1.i
@@ -448,14 +628,18 @@ define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
 
 define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
 ; CHECK-LABEL: test_vdup_n_u64:
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
   %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
   ret <1 x i64> %vecinit.i
 }
 
 define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
 ; CHECK-LABEL: test_vdupq_n_u8:
-; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.16b, w0
+; CHECK-NEXT:    ret
   %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
   %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
   %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
@@ -477,7 +661,9 @@ define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
 
 define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
 ; CHECK-LABEL: test_vdupq_n_u16:
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8h, w0
+; CHECK-NEXT:    ret
   %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
   %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
   %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
@@ -491,7 +677,9 @@ define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
 
 define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
 ; CHECK-LABEL: test_vdupq_n_u32:
-; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4s, w0
+; CHECK-NEXT:    ret
   %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
   %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
   %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
@@ -501,7 +689,9 @@ define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
 
 define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
 ; CHECK-LABEL: test_vdupq_n_u64:
-; CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2d, x0
+; CHECK-NEXT:    ret
   %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
   %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
   ret <2 x i64> %vecinit1.i
@@ -509,190 +699,252 @@ define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
 
 define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
 ; CHECK-LABEL: test_vdup_lane_s8:
-; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.8b, v0.b[5]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
   ret <8 x i8> %shuffle
 }
 
 define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
 ; CHECK-LABEL: test_vdup_lane_s16:
-; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[2]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i16> %shuffle
 }
 
 define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
 ; CHECK-LABEL: test_vdup_lane_s32:
-; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.2s, v0.s[1]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x i32> %shuffle
 }
 
 define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
 ; CHECK-LABEL: test_vdupq_lane_s8:
-; CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.16b, v0.b[5]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
   ret <16 x i8> %shuffle
 }
 
 define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
 ; CHECK-LABEL: test_vdupq_lane_s16:
-; CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.8h, v0.h[2]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   ret <8 x i16> %shuffle
 }
 
 define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
 ; CHECK-LABEL: test_vdupq_lane_s32:
-; CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4s, v0.s[1]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x i32> %shuffle
 }
 
 define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
 ; CHECK-LABEL: test_vdupq_lane_s64:
-; CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.2d, v0.d[0]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %shuffle
 }
 
 define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
 ; CHECK-LABEL: test_vdup_laneq_s8:
-; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8b, v0.b[5]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
   ret <8 x i8> %shuffle
 }
 
 define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
 ; CHECK-LABEL: test_vdup_laneq_s16:
-; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4h, v0.h[2]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i16> %shuffle
 }
 
 define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
 ; CHECK-LABEL: test_vdup_laneq_s32:
-; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2s, v0.s[1]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x i32> %shuffle
 }
 
 define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
 ; CHECK-LABEL: test_vdupq_laneq_s8:
-; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.16b, v0.b[5]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
   ret <16 x i8> %shuffle
 }
 
 define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
 ; CHECK-LABEL: test_vdupq_laneq_s16:
-; CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8h, v0.h[2]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   ret <8 x i16> %shuffle
 }
 
 define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
 ; CHECK-LABEL: test_vdupq_laneq_s32:
-; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4s, v0.s[1]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x i32> %shuffle
 }
 
 define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
 ; CHECK-LABEL: test_vdupq_laneq_s64:
-; CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2d, v0.d[0]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %shuffle
 }
 
 define i64 @test_bitcastv8i8toi64(<8 x i8> %in) {
 ; CHECK-LABEL: test_bitcastv8i8toi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
    %res = bitcast <8 x i8> %in to i64
-; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
    ret i64 %res
 }
 
 define i64 @test_bitcastv4i16toi64(<4 x i16> %in) {
 ; CHECK-LABEL: test_bitcastv4i16toi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
    %res = bitcast <4 x i16> %in to i64
-; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
    ret i64 %res
 }
 
 define i64 @test_bitcastv2i32toi64(<2 x i32> %in) {
 ; CHECK-LABEL: test_bitcastv2i32toi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
    %res = bitcast <2 x i32> %in to i64
-; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
    ret i64 %res
 }
 
 define i64 @test_bitcastv2f32toi64(<2 x float> %in) {
 ; CHECK-LABEL: test_bitcastv2f32toi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
    %res = bitcast <2 x float> %in to i64
-; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
    ret i64 %res
 }
 
 define i64 @test_bitcastv1i64toi64(<1 x i64> %in) {
 ; CHECK-LABEL: test_bitcastv1i64toi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
    %res = bitcast <1 x i64> %in to i64
-; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
    ret i64 %res
 }
 
 define i64 @test_bitcastv1f64toi64(<1 x double> %in) {
 ; CHECK-LABEL: test_bitcastv1f64toi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
    %res = bitcast <1 x double> %in to i64
-; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
    ret i64 %res
 }
 
 define <8 x i8> @test_bitcasti64tov8i8(i64 %in) {
 ; CHECK-LABEL: test_bitcasti64tov8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
    %res = bitcast i64 %in to <8 x i8>
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
    ret <8 x i8> %res
 }
 
 define <4 x i16> @test_bitcasti64tov4i16(i64 %in) {
 ; CHECK-LABEL: test_bitcasti64tov4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
    %res = bitcast i64 %in to <4 x i16>
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
    ret <4 x i16> %res
 }
 
 define <2 x i32> @test_bitcasti64tov2i32(i64 %in) {
 ; CHECK-LABEL: test_bitcasti64tov2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
    %res = bitcast i64 %in to <2 x i32>
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
    ret <2 x i32> %res
 }
 
 define <2 x float> @test_bitcasti64tov2f32(i64 %in) {
 ; CHECK-LABEL: test_bitcasti64tov2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
    %res = bitcast i64 %in to <2 x float>
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
    ret <2 x float> %res
 }
 
 define <1 x i64> @test_bitcasti64tov1i64(i64 %in) {
 ; CHECK-LABEL: test_bitcasti64tov1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
    %res = bitcast i64 %in to <1 x i64>
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
    ret <1 x i64> %res
 }
 
 define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
 ; CHECK-LABEL: test_bitcasti64tov1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
    %res = bitcast i64 %in to <1 x double>
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
    ret <1 x double> %res
 }
 
 define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
 ; CHECK-LABEL: test_bitcastv8i8tov1f64:
-; CHECK: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.8b, v0.8b
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %sub.i = sub <8 x i8> zeroinitializer, %a
   %1 = bitcast <8 x i8> %sub.i to <1 x double>
   %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -701,8 +953,11 @@ define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
 
 define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
 ; CHECK-LABEL: test_bitcastv4i16tov1f64:
-; CHECK: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-; CHECK-NEXT: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4h, v0.4h
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %sub.i = sub <4 x i16> zeroinitializer, %a
   %1 = bitcast <4 x i16> %sub.i to <1 x double>
   %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -711,8 +966,11 @@ define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
 
 define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
 ; CHECK-LABEL: test_bitcastv2i32tov1f64:
-; CHECK: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.2s, v0.2s
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %sub.i = sub <2 x i32> zeroinitializer, %a
   %1 = bitcast <2 x i32> %sub.i to <1 x double>
   %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -721,8 +979,11 @@ define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
 
 define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test_bitcastv1i64tov1f64:
-; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg d0, d0
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %sub.i = sub <1 x i64> zeroinitializer, %a
   %1 = bitcast <1 x i64> %sub.i to <1 x double>
   %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -731,8 +992,11 @@ define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
 
 define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_bitcastv2f32tov1f64:
-; CHECK: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fneg v0.2s, v0.2s
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
   %1 = bitcast <2 x float> %sub.i to <1 x double>
   %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -741,8 +1005,12 @@ define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 {
 
 define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test_bitcastv1f64tov8i8:
-; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
-; CHECK-NEXT: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    neg v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %vcvt.i = sitofp <1 x i64> %a to <1 x double>
   %1 = bitcast <1 x double> %vcvt.i to <8 x i8>
   %sub.i = sub <8 x i8> zeroinitializer, %1
@@ -751,8 +1019,12 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
 
 define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test_bitcastv1f64tov4i16:
-; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
-; CHECK-NEXT: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    neg v0.4h, v0.4h
+; CHECK-NEXT:    ret
   %vcvt.i = sitofp <1 x i64> %a to <1 x double>
   %1 = bitcast <1 x double> %vcvt.i to <4 x i16>
   %sub.i = sub <4 x i16> zeroinitializer, %1
@@ -761,8 +1033,12 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
 
 define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test_bitcastv1f64tov2i32:
-; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
-; CHECK-NEXT: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    neg v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %vcvt.i = sitofp <1 x i64> %a to <1 x double>
   %1 = bitcast <1 x double> %vcvt.i to <2 x i32>
   %sub.i = sub <2 x i32> zeroinitializer, %1
@@ -771,8 +1047,12 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
 
 define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test_bitcastv1f64tov1i64:
-; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
-; CHECK-NEXT: neg {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    neg d0, d0
+; CHECK-NEXT:    ret
   %vcvt.i = sitofp <1 x i64> %a to <1 x double>
   %1 = bitcast <1 x double> %vcvt.i to <1 x i64>
   %sub.i = sub <1 x i64> zeroinitializer, %1
@@ -781,8 +1061,12 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
 
 define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
 ; CHECK-LABEL: test_bitcastv1f64tov2f32:
-; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
-; CHECK-NEXT: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    fneg v0.2s, v0.2s
+; CHECK-NEXT:    ret
   %vcvt.i = sitofp <1 x i64> %a to <1 x double>
   %1 = bitcast <1 x double> %vcvt.i to <2 x float>
   %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %1
@@ -882,7 +1166,9 @@ define <4 x i32> @testDUP.v1i32(<1 x i32> %a) {
 
 define <8 x i8> @getl(<16 x i8> %x) #0 {
 ; CHECK-LABEL: getl:
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %vecext = extractelement <16 x i8> %x, i32 0
   %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0
   %vecext1 = extractelement <16 x i8> %x, i32 1
@@ -902,15 +1188,22 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
   ret <8 x i8> %vecinit14
 }
 
-; CHECK-LABEL: test_extracts_inserts_varidx_extract:
-; CHECK: str q0
-; CHECK-DAG: and [[MASKED_IDX:x[0-9]+]], x0, #0x7
-; CHECK: bfi [[PTR:x[0-9]+]], [[MASKED_IDX]], #1, #3
-; CHECK-DAG: ldr h[[R:[0-9]+]], {{\[}}[[PTR]]{{\]}}
-; CHECK-DAG: mov v[[R]].h[1], v0.h[1]
-; CHECK-DAG: mov v[[R]].h[2], v0.h[2]
-; CHECK-DAG: mov v[[R]].h[3], v0.h[3]
 define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
+; CHECK-LABEL: test_extracts_inserts_varidx_extract:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    str q0, [sp, #-16]!
+; CHECK-NEXT:    and x8, x0, #0x7
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    bfi x9, x8, #1, #3
+; CHECK-NEXT:    ldr h1, [x9]
+; CHECK-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    ret
   %tmp = extractelement <8 x i16> %x, i32 %idx
   %tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 0
   %tmp3 = extractelement <8 x i16> %x, i32 1
@@ -922,15 +1215,23 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
   ret <4 x i16> %tmp8
 }
 
-; CHECK-LABEL: test_extracts_inserts_varidx_insert:
-; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3
-; CHECK: bfi x9, [[MASKED_IDX]], #1, #2
-; CHECK: str h0, [x9]
-; CHECK-DAG: ldr d[[R:[0-9]+]]
-; CHECK-DAG: mov v[[R]].h[1], v0.h[1]
-; CHECK-DAG: mov v[[R]].h[2], v0.h[2]
-; CHECK-DAG: mov v[[R]].h[3], v0.h[3]
 define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) {
+; CHECK-LABEL: test_extracts_inserts_varidx_insert:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16 // =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0x3
+; CHECK-NEXT:    add x9, sp, #8 // =8
+; CHECK-NEXT:    bfi x9, x8, #1, #2
+; CHECK-NEXT:    str h0, [x9]
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    ret
   %tmp = extractelement <8 x i16> %x, i32 0
   %tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 %idx
   %tmp3 = extractelement <8 x i16> %x, i32 1
@@ -944,7 +1245,10 @@ define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) {
 
 define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) {
 ; CHECK-LABEL: test_dup_v2i32_v4i16:
-; CHECK: dup v0.4h, v0.h[2]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[2]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <2 x i32> %a, i32 1
   %vget_lane = trunc i32 %x to i16
@@ -957,7 +1261,9 @@ entry:
 
 define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) {
 ; CHECK-LABEL: test_dup_v4i32_v8i16:
-; CHECK: dup v0.8h, v0.h[6]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v0.8h, v0.h[6]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <4 x i32> %a, i32 3
   %vget_lane = trunc i32 %x to i16
@@ -974,7 +1280,10 @@ entry:
 
 define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) {
 ; CHECK-LABEL: test_dup_v1i64_v4i16:
-; CHECK: dup v0.4h, v0.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <1 x i64> %a, i32 0
   %vget_lane = trunc i64 %x to i16
@@ -987,7 +1296,10 @@ entry:
 
 define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) {
 ; CHECK-LABEL: test_dup_v1i64_v2i32:
-; CHECK: dup v0.2s, v0.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <1 x i64> %a, i32 0
   %vget_lane = trunc i64 %x to i32
@@ -998,7 +1310,9 @@ entry:
 
 define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) {
 ; CHECK-LABEL: test_dup_v2i64_v8i16:
-; CHECK: dup v0.8h, v0.h[4]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v0.8h, v0.h[4]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <2 x i64> %a, i32 1
   %vget_lane = trunc i64 %x to i16
@@ -1015,7 +1329,9 @@ entry:
 
 define <4 x i32> @test_dup_v2i64_v4i32(<2 x i64> %a) {
 ; CHECK-LABEL: test_dup_v2i64_v4i32:
-; CHECK: dup v0.4s, v0.s[2]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v0.4s, v0.s[2]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <2 x i64> %a, i32 1
   %vget_lane = trunc i64 %x to i32
@@ -1028,7 +1344,9 @@ entry:
 
 define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) {
 ; CHECK-LABEL: test_dup_v4i32_v4i16:
-; CHECK: dup v0.4h, v0.h[2]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v0.4h, v0.h[2]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <4 x i32> %a, i32 1
   %vget_lane = trunc i32 %x to i16
@@ -1041,7 +1359,9 @@ entry:
 
 define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) {
 ; CHECK-LABEL: test_dup_v2i64_v4i16:
-; CHECK: dup v0.4h, v0.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <2 x i64> %a, i32 0
   %vget_lane = trunc i64 %x to i16
@@ -1054,7 +1374,9 @@ entry:
 
 define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) {
 ; CHECK-LABEL: test_dup_v2i64_v2i32:
-; CHECK: dup v0.2s, v0.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
 entry:
   %x = extractelement <2 x i64> %a, i32 0
   %vget_lane = trunc i64 %x to i32
@@ -1066,8 +1388,9 @@ entry:
 
 define <2 x float> @test_scalar_to_vector_f32_to_v2f32(<2 x float> %a) {
 ; CHECK-LABEL: test_scalar_to_vector_f32_to_v2f32:
-; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmaxp s0, v0.2s
+; CHECK-NEXT:    ret
 entry:
   %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
   %1 = insertelement <1 x float> undef, float %0, i32 0
@@ -1078,8 +1401,9 @@ entry:
 
 define <4 x float> @test_scalar_to_vector_f32_to_v4f32(<2 x float> %a) {
 ; CHECK-LABEL: test_scalar_to_vector_f32_to_v4f32:
-; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmaxp s0, v0.2s
+; CHECK-NEXT:    ret
 entry:
   %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
   %1 = insertelement <1 x float> undef, float %0, i32 0
@@ -1092,7 +1416,10 @@ declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>)
 
 define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) {
 ; CHECK-LABEL: test_concat_undef_v1i32:
-; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
 entry:
   %0 = extractelement <2 x i32> %a, i32 0
   %vecinit1.i = insertelement <2 x i32> undef, i32 %0, i32 1
@@ -1103,8 +1430,10 @@ declare i32 @llvm.aarch64.neon.sqabs.i32(i32) #4
 
 define <2 x i32> @test_concat_v1i32_undef(i32 %a) {
 ; CHECK-LABEL: test_concat_v1i32_undef:
-; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    sqabs s0, s0
+; CHECK-NEXT:    ret
 entry:
   %b = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
   %vecinit.i432 = insertelement <2 x i32> undef, i32 %b, i32 0
@@ -1113,7 +1442,10 @@ entry:
 
 define <2 x i32> @test_concat_same_v1i32_v1i32(<2 x i32> %a) {
 ; CHECK-LABEL: test_concat_same_v1i32_v1i32:
-; CHECK: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
 entry:
   %0 = extractelement <2 x i32> %a, i32 0
   %vecinit.i = insertelement <2 x i32> undef, i32 %0, i32 0
@@ -1123,9 +1455,15 @@ entry:
 
 define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) {
 ; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
-; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: mov {{v[0-9]+}}.s[1], w{{[0-9]+}}
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s1, w1
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    sqabs s1, s1
+; CHECK-NEXT:    sqabs s0, s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
   %d = insertelement <2 x i32> undef, i32 %c, i32 0
@@ -1137,7 +1475,9 @@ entry:
 
 define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 {
 ; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %vecinit30
@@ -1145,7 +1485,10 @@ entry:
 
 define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
 ; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <8 x i8> %x, i32 0
   %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
@@ -1169,7 +1512,10 @@ entry:
 
 define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 {
 ; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <16 x i8> %x, i32 0
   %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
@@ -1208,7 +1554,11 @@ entry:
 
 define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
 ; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <8 x i8> %x, i32 0
   %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
@@ -1247,7 +1597,9 @@ entry:
 
 define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 {
 ; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i16> %vecinit14
@@ -1255,7 +1607,10 @@ entry:
 
 define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
 ; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <4 x i16> %x, i32 0
   %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
@@ -1271,7 +1626,10 @@ entry:
 
 define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <8 x i16> %x, i32 0
   %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
@@ -1294,7 +1652,11 @@ entry:
 
 define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <4 x i16> %x, i32 0
   %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
@@ -1317,7 +1679,9 @@ entry:
 
 define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x i32> %vecinit6
@@ -1325,7 +1689,10 @@ entry:
 
 define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <2 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -1337,7 +1704,10 @@ entry:
 
 define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 {
 ; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -1352,7 +1722,11 @@ entry:
 
 define <4 x i32> @test_concat_v4i32_v2i32_v2i32(<2 x i32> %x, <2 x i32> %y) #0 {
 ; CHECK-LABEL: test_concat_v4i32_v2i32_v2i32:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecinit6 = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %vecinit6
@@ -1360,7 +1734,9 @@ entry:
 
 define <2 x i64> @test_concat_v2i64_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) #0 {
 ; CHECK-LABEL: test_concat_v2i64_v2i64_v2i64:
-; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
 entry:
   %vecinit2 = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %vecinit2
@@ -1368,7 +1744,10 @@ entry:
 
 define <2 x i64> @test_concat_v2i64_v1i64_v2i64(<1 x i64> %x, <2 x i64> %y) #0 {
 ; CHECK-LABEL: test_concat_v2i64_v1i64_v2i64:
-; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <1 x i64> %x, i32 0
   %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
@@ -1378,7 +1757,10 @@ entry:
 
 define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 {
 ; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
-; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <2 x i64> %x, i32 0
   %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
@@ -1389,7 +1771,11 @@ entry:
 
 define <2 x i64> @test_concat_v2i64_v1i64_v1i64(<1 x i64> %x, <1 x i64> %y) #0 {
 ; CHECK-LABEL: test_concat_v2i64_v1i64_v1i64:
-; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecext = extractelement <1 x i64> %x, i32 0
   %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
@@ -1401,84 +1787,113 @@ entry:
 
 define <4 x i16> @concat_vector_v4i16_const() {
 ; CHECK-LABEL: concat_vector_v4i16_const:
-; CHECK: movi {{v[0-9]+}}.2d, #0
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %r
 }
 
 define <4 x i16> @concat_vector_v4i16_const_one() {
 ; CHECK-LABEL: concat_vector_v4i16_const_one:
-; CHECK: movi {{v[0-9]+}}.4h, #1
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4h, #1
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %r
 }
 
 define <4 x i32> @concat_vector_v4i32_const() {
 ; CHECK-LABEL: concat_vector_v4i32_const:
-; CHECK: movi {{v[0-9]+}}.2d, #0
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i32> zeroinitializer, <1 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %r
 }
 
 define <8 x i8> @concat_vector_v8i8_const() {
 ; CHECK-LABEL: concat_vector_v8i8_const:
-; CHECK: movi {{v[0-9]+}}.2d, #0
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %r
 }
 
 define <8 x i16> @concat_vector_v8i16_const() {
 ; CHECK-LABEL: concat_vector_v8i16_const:
-; CHECK: movi {{v[0-9]+}}.2d, #0
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %r
 }
 
 define <8 x i16> @concat_vector_v8i16_const_one() {
 ; CHECK-LABEL: concat_vector_v8i16_const_one:
-; CHECK: movi {{v[0-9]+}}.8h, #1
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.8h, #1
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %r
 }
 
 define <16 x i8> @concat_vector_v16i8_const() {
 ; CHECK-LABEL: concat_vector_v16i8_const:
-; CHECK: movi {{v[0-9]+}}.2d, #0
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %r
 }
 
 define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) {
 ; CHECK-LABEL: concat_vector_v4i16:
-; CHECK: dup v0.4h, v0.h[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %r
 }
 
 define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) {
 ; CHECK-LABEL: concat_vector_v4i32:
-; CHECK: dup v0.4s, v0.s[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %r
 }
 
 define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
 ; CHECK-LABEL: concat_vector_v8i8:
-; CHECK: dup v0.8b, v0.b[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %r
 }
 
 define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) {
 ; CHECK-LABEL: concat_vector_v8i16:
-; CHECK: dup v0.8h, v0.h[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %r
 }
 
 define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
 ; CHECK-LABEL: concat_vector_v16i8:
-; CHECK: dup v0.16b, v0.b[0]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.16b, v0.b[0]
+; CHECK-NEXT:    ret
  %r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %r
 }
diff --git a/test/CodeGen/AArch64/cluster-frame-index.mir b/test/CodeGen/AArch64/cluster-frame-index.mir
new file mode 100644
index 0000000000000000000000000000000000000000..75ccdc4559ba51263bf7c68eb329bd380a4d2428
--- /dev/null
+++ b/test/CodeGen/AArch64/cluster-frame-index.mir
@@ -0,0 +1,27 @@
+#RUN: llc -mtriple=aarch64-- -mcpu=cyclone -run-pass machine-scheduler -o - %s | FileCheck %s
+...
+---
+name:            merge_stack
+# CHECK-LABEL: name: merge_stack
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 64, alignment: 8 }
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+
+    %0:gpr32 = COPY $w0
+    %1:gpr32 = COPY $w1
+    undef %3.sub_32:gpr64 = ORRWrs $wzr, %0, 0
+    STRXui %3, %stack.0, 0 :: (store 8)
+    undef %5.sub_32:gpr64 = ORRWrs $wzr, %1, 0
+    STRXui %5, %stack.0, 1 :: (store 8)
+    RET_ReallyLR
+
+    ; CHECK: COPY
+    ; CHECK-NEXT: COPY
+    ; CHECK-NEXT: ORRWrs
+    ; CHECK-NEXT: ORRWrs
+    ; CHECK-NEXT: STRXui
+    ; CHECK-NEXT: STRXui
+    ; CHECK-NEXT: RET
diff --git a/test/CodeGen/AArch64/cmp-to-cmn.ll b/test/CodeGen/AArch64/cmp-to-cmn.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6275da67fbfc45843809edd53097dde19e71290b
--- /dev/null
+++ b/test/CodeGen/AArch64/cmp-to-cmn.ll
@@ -0,0 +1,399 @@
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "arm64"
+
+define i1 @test_EQ_IllEbT(i64 %a, i64 %b) {
+; CHECK-LABEL: test_EQ_IllEbT
+; CHECK:      cmn	x1, x0
+; CHECK-NEXT: cset	w0, eq
+; CHECK-NEXT: ret
+entry:
+  %add = sub i64 0, %b
+  %cmp = icmp eq i64 %add, %a
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IliEbT(i64 %a, i32 %b) {
+; CHECK-LABEL: test_EQ_IliEbT
+; CHECK:      cmn	x0, w1, sxtw
+; CHECK-NEXT: cset	w0, eq
+; CHECK-NEXT: ret
+entry:
+  %conv = sext i32 %b to i64
+  %add = sub i64 0, %a
+  %cmp = icmp eq i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IlsEbT(i64 %a, i16 %b) {
+; CHECK-LABEL: test_EQ_IlsEbT
+; CHECK:      cmn	x0, w1, sxth
+; CHECK-NEXT: cset	w0, eq
+; CHECK-NEXT: ret
+entry:
+  %conv = sext i16 %b to i64
+  %add = sub i64 0, %a
+  %cmp = icmp eq i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IlcEbT(i64 %a, i8 %b) {
+; CHECK-LABEL: test_EQ_IlcEbT
+; CHECK: 	cmn	x0, w1, uxtb
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %conv = zext i8 %b to i64
+  %add = sub i64 0, %a
+  %cmp = icmp eq i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IilEbT(i32 %a, i64 %b) {
+; CHECK-LABEL: test_EQ_IilEbT
+; CHECK: 	cmn	x1, w0, sxtw
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i32 %a to i64
+  %add = sub i64 0, %b
+  %cmp = icmp eq i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IiiEbT(i32 %a, i32 %b) {
+; CHECK-LABEL: test_EQ_IiiEbT
+; CHECK: 	cmn	w1, w0
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %add = sub i32 0, %b
+  %cmp = icmp eq i32 %add, %a
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IisEbT(i32 %a, i16 %b) {
+; CHECK-LABEL: test_EQ_IisEbT
+; CHECK: 	cmn	w0, w1, sxth
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i16 %b to i32
+  %add = sub i32 0, %a
+  %cmp = icmp eq i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IicEbT(i32 %a, i8 %b) {
+; CHECK-LABEL: test_EQ_IicEbT
+; CHECK: 	cmn	w0, w1, uxtb
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %conv = zext i8 %b to i32
+  %add = sub i32 0, %a
+  %cmp = icmp eq i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IslEbT(i16 %a, i64 %b) {
+; CHECK-LABEL: test_EQ_IslEbT
+; CHECK: 	cmn	x1, w0, sxth
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i16 %a to i64
+  %add = sub i64 0, %b
+  %cmp = icmp eq i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IsiEbT(i16 %a, i32 %b) {
+; CHECK-LABEL: test_EQ_IsiEbT
+; CHECK: 	cmn	w1, w0, sxth
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i16 %a to i32
+  %add = sub i32 0, %b
+  %cmp = icmp eq i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IssEbT(i16 %a, i16 %b) {
+; CHECK-LABEL: test_EQ_IssEbT
+; CHECK: 	sxth	w8, w1
+; CHECK-NEXT: 	cmn	w8, w0, sxth
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT;   ret
+entry:
+  %conv = sext i16 %a to i32
+  %conv1 = sext i16 %b to i32
+  %add = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IscEbT(i16 %a, i8 %b) {
+; CHECK-LABEL: test_EQ_IscEbT
+; CHECK: 	and	w8, w1, #0xff
+; CHECK-NEXT: 	cmn	w8, w0, sxth
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT;   ret
+entry:
+  %conv = sext i16 %a to i32
+  %conv1 = zext i8 %b to i32
+  %add = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IclEbT(i8 %a, i64 %b) {
+; CHECK-LABEL: test_EQ_IclEbT
+; CHECK:      	cmn	x1, w0, uxtb
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %conv = zext i8 %a to i64
+  %add = sub i64 0, %b
+  %cmp = icmp eq i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IciEbT(i8 %a, i32 %b) {
+; CHECK-LABEL: test_EQ_IciEbT
+; CHECK:      	cmn	w1, w0, uxtb
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT: 	ret
+entry:
+  %conv = zext i8 %a to i32
+  %add = sub i32 0, %b
+  %cmp = icmp eq i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IcsEbT(i8 %a, i16 %b) {
+; CHECK-LABEL: test_EQ_IcsEbT
+; CHECK:      	sxth	w8, w1
+; CHECK-NEXT: 	cmn	w8, w0, uxtb
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT:	ret
+entry:
+  %conv = zext i8 %a to i32
+  %conv1 = sext i16 %b to i32
+  %add = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_EQ_IccEbT(i8 %a, i8 %b) {
+; CHECK-LABEL: test_EQ_IccEbT
+; CHECK:      	and	w8, w1, #0xff
+; CHECK-NEXT: 	cmn	w8, w0, uxtb
+; CHECK-NEXT: 	cset	w0, eq
+; CHECK-NEXT:	ret
+entry:
+  %conv = zext i8 %a to i32
+  %conv1 = zext i8 %b to i32
+  %add = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IllEbT(i64 %a, i64 %b) {
+; CHECK-LABEL: test_NE_IllEbT
+; CHECK:      	cmn	x1, x0
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %add = sub i64 0, %b
+  %cmp = icmp ne i64 %add, %a
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IliEbT(i64 %a, i32 %b) {
+; CHECK-LABEL: test_NE_IliEbT
+; CHECK:      	cmn	x0, w1, sxtw
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i32 %b to i64
+  %add = sub i64 0, %a
+  %cmp = icmp ne i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IlsEbT(i64 %a, i16 %b) {
+; CHECK-LABEL: test_NE_IlsEbT
+; CHECK:      	cmn	x0, w1, sxth
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i16 %b to i64
+  %add = sub i64 0, %a
+  %cmp = icmp ne i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IlcEbT(i64 %a, i8 %b) {
+; CHECK-LABEL: test_NE_IlcEbT
+; CHECK:      	cmn	x0, w1, uxtb
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = zext i8 %b to i64
+  %add = sub i64 0, %a
+  %cmp = icmp ne i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IilEbT(i32 %a, i64 %b) {
+; CHECK-LABEL: test_NE_IilEbT
+; CHECK:      	cmn	x1, w0, sxtw
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i32 %a to i64
+  %add = sub i64 0, %b
+  %cmp = icmp ne i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IiiEbT(i32 %a, i32 %b) {
+; CHECK-LABEL: test_NE_IiiEbT
+; CHECK:      	cmn	w1, w0
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %add = sub i32 0, %b
+  %cmp = icmp ne i32 %add, %a
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IisEbT(i32 %a, i16 %b) {
+; CHECK-LABEL: test_NE_IisEbT
+; CHECK:      	cmn	w0, w1, sxth
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i16 %b to i32
+  %add = sub i32 0, %a
+  %cmp = icmp ne i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IicEbT(i32 %a, i8 %b) {
+; CHECK-LABEL: test_NE_IicEbT
+; CHECK:      	cmn	w0, w1, uxtb
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = zext i8 %b to i32
+  %add = sub i32 0, %a
+  %cmp = icmp ne i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IslEbT(i16 %a, i64 %b) {
+; CHECK-LABEL: test_NE_IslEbT
+; CHECK:      	cmn	x1, w0, sxth
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i16 %a to i64
+  %add = sub i64 0, %b
+  %cmp = icmp ne i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IsiEbT(i16 %a, i32 %b) {
+; CHECK-LABEL: test_NE_IsiEbT
+; CHECK:      	cmn	w1, w0, sxth
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = sext i16 %a to i32
+  %add = sub i32 0, %b
+  %cmp = icmp ne i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IssEbT(i16 %a, i16 %b) {
+; CHECK-LABEL:test_NE_IssEbT
+; CHECK:      	sxth	w8, w1
+; CHECK-NEXT: 	cmn	w8, w0, sxth
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT:	ret
+entry:
+  %conv = sext i16 %a to i32
+  %conv1 = sext i16 %b to i32
+  %add = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IscEbT(i16 %a, i8 %b) {
+; CHECK-LABEL:test_NE_IscEbT
+; CHECK:      	and	w8, w1, #0xff
+; CHECK-NEXT: 	cmn	w8, w0, sxth
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT:	ret
+entry:
+  %conv = sext i16 %a to i32
+  %conv1 = zext i8 %b to i32
+  %add = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IclEbT(i8 %a, i64 %b) {
+; CHECK-LABEL:test_NE_IclEbT
+; CHECK:      	cmn	x1, w0, uxtb
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = zext i8 %a to i64
+  %add = sub i64 0, %b
+  %cmp = icmp ne i64 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IciEbT(i8 %a, i32 %b) {
+; CHECK-LABEL:test_NE_IciEbT
+; CHECK:      	cmn	w1, w0, uxtb
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT: 	ret
+entry:
+  %conv = zext i8 %a to i32
+  %add = sub i32 0, %b
+  %cmp = icmp ne i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IcsEbT(i8 %a, i16 %b) {
+; CHECK-LABEL:test_NE_IcsEbT
+; CHECK:      	sxth	w8, w1
+; CHECK-NEXT: 	cmn	w8, w0, uxtb
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT:	ret
+entry:
+  %conv = zext i8 %a to i32
+  %conv1 = sext i16 %b to i32
+  %add = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %add
+  ret i1 %cmp
+}
+
+define i1 @test_NE_IccEbT(i8 %a, i8 %b) {
+; CHECK-LABEL:test_NE_IccEbT
+; CHECK:      	and	w8, w1, #0xff
+; CHECK-NEXT: 	cmn	w8, w0, uxtb
+; CHECK-NEXT: 	cset	w0, ne
+; CHECK-NEXT:	ret
+entry:
+  %conv = zext i8 %a to i32
+  %conv1 = zext i8 %b to i32
+  %add = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %add
+  ret i1 %cmp
+}
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index 0e21903c2760f223ab26c79d549578fabc1da71e..d4a32bd278caa513c0c74bc91beb41523a7ffb14 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -17,6 +17,7 @@
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=saphira 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=kryo 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=thunderx2t99 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=tsv110 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
 
 ; CHECK-NOT: {{.*}}  is not a recognized processor for this target
diff --git a/test/CodeGen/AArch64/fast-isel-erase.ll b/test/CodeGen/AArch64/fast-isel-erase.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e8265bce9fec3b2d6769302d53b0be4c872b7b68
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-erase.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=arm64-apple-ios -o - %s -fast-isel=1 -O0 | FileCheck %s
+
+; The zext can be folded into the load and removed, but doing so can invalidate
+; pointers internal to FastISel and cause a crash so it must be done carefully.
+define i32 @test() {
+; CHECK-LABEL: test:
+; CHECK: ldrh
+; CHECK: bl _callee
+; CHECK-NOT: uxth
+
+entry:
+  store i32 undef, i32* undef, align 4
+  %t81 = load i16, i16* undef, align 2
+  call void @callee()
+  %t82 = zext i16 %t81 to i32
+  %t83 = shl i32 %t82, 16
+  %t84 = or i32 undef, %t83
+  br label %end
+
+end:
+  %val = phi i32 [%t84, %entry]
+  ret i32 %val
+}
+
+declare void @callee()
diff --git a/test/CodeGen/AArch64/funclet-local-stack-size.ll b/test/CodeGen/AArch64/funclet-local-stack-size.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3b6ca6a94c19e8e381daea0280db497af96d8799
--- /dev/null
+++ b/test/CodeGen/AArch64/funclet-local-stack-size.ll
@@ -0,0 +1,53 @@
+; RUN: llc -o - %s -mtriple=aarch64-windows | FileCheck %s
+; Check that the local stack size is computed correctly for a funclet contained
+; within a varargs function.  The varargs component shouldn't be included in the
+; local stack size computation.
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+
+$"??_R0H@8" = comdat any
+
+@"??_7type_info@@6B@" = external constant i8*
+@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+
+; CHECK-LABEL: ?catch$2@?0??func@@YAHHHZZ@4HA
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: ldp x29, x30, [sp], #16
+; Function Attrs: uwtable
+define dso_local i32 @"?func@@YAHHHZZ"(i32 %a, i32, ...) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %arr = alloca [10 x i32], align 4
+  %a2 = alloca i32, align 4
+  %1 = bitcast [10 x i32]* %arr to i8*
+  %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %arr, i64 0, i64 0
+  %call = call i32 @"?init@@YAHPEAH@Z"(i32* nonnull %arraydecay)
+  %call1 = invoke i32 @"?func2@@YAHXZ"()
+          to label %cleanup unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %2 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %3 = catchpad within %2 [%rtti.TypeDescriptor2* @"??_R0H@8", i32 0, i32* %a2]
+  %4 = load i32, i32* %a2, align 4
+  %add = add nsw i32 %4, 1
+  catchret from %3 to label %cleanup
+
+cleanup:                                          ; preds = %entry, %catch
+  %retval.0 = phi i32 [ %add, %catch ], [ %call1, %entry ]
+  ret i32 %retval.0
+}
+
+declare dso_local i32 @"?init@@YAHPEAH@Z"(i32*)
+
+declare dso_local i32 @"?func2@@YAHXZ"()
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+
+attributes #0 = { uwtable }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
diff --git a/test/CodeGen/AArch64/half.ll b/test/CodeGen/AArch64/half.ll
index 154d85c9bb610757e9524e598d6effb50716b337..171019080744cf7b33def2640780ad48aab974ce 100644
--- a/test/CodeGen/AArch64/half.ll
+++ b/test/CodeGen/AArch64/half.ll
@@ -87,9 +87,9 @@ define i16 @test_fccmp(i1 %a) {
 ;CHECK: fcmp
   %cmp0 = fcmp ogt half 0xH3333, undef
   %cmp1 = fcmp ogt half 0xH2222, undef
-  %x = select i1 %cmp0, i16 0, i16 undef
+  %x = select i1 %cmp0, i16 0, i16 1
   %or = or i1 %cmp1, %cmp0
-  %y = select i1 %or, i16 4, i16 undef
+  %y = select i1 %or, i16 4, i16 1
   %r = add i16 %x, %y
   ret i16 %r
 }
diff --git a/test/CodeGen/AArch64/ldst-opt-after-block-placement.ll b/test/CodeGen/AArch64/ldst-opt-after-block-placement.ll
new file mode 100644
index 0000000000000000000000000000000000000000..468f77309d367abdc8c58d427c61c7646c8365cd
--- /dev/null
+++ b/test/CodeGen/AArch64/ldst-opt-after-block-placement.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -mtriple=aarch64-arm < %s | FileCheck %s
+
+; Run at O3 to make sure we can optimize load/store instructions after Machine
+; Block Placement takes place using Tail Duplication Threshold = 4.
+
+define void @foo(i1 %cond, i64* %ptr) {
+; CHECK-LABEL: foo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tbz w0, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    ldp x9, x8, [x1, #8]
+; CHECK-NEXT:    str xzr, [x1, #16]
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    b.lt .LBB0_3
+; CHECK-NEXT:    b .LBB0_4
+; CHECK-NEXT:  .LBB0_2: // %if.else
+; CHECK-NEXT:    ldp x8, x9, [x1]
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    b.ge .LBB0_4
+; CHECK-NEXT:  .LBB0_3: // %exit1
+; CHECK-NEXT:    str xzr, [x1, #8]
+; CHECK-NEXT:  .LBB0_4: // %exit2
+; CHECK-NEXT:    ret
+entry:
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %0 = getelementptr inbounds i64, i64* %ptr, i64 2
+  %1 = load i64, i64* %0, align 8
+  store i64 0, i64* %0, align 8
+  br label %if.end
+
+if.else:
+  %2 = load i64, i64* %ptr, align 8
+  br label %if.end
+
+if.end:
+  %3 = phi i64 [ %1, %if.then ], [ %2, %if.else ]
+  %4 = getelementptr inbounds i64, i64* %ptr, i64 1
+  %5 = load i64, i64* %4, align 8
+  %6 = icmp slt i64 %3, %5
+  br i1 %6, label %exit1, label %exit2
+
+exit1:
+  store i64 0, i64* %4, align 8
+  ret void
+
+exit2:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll
index ae3f59ee8f5dc0f2cf7105bc6c16ba135052d031..7f6cba2133f05a69f2c6885e7f211f3f1979d539 100644
--- a/test/CodeGen/AArch64/ldst-opt.ll
+++ b/test/CodeGen/AArch64/ldst-opt.ll
@@ -1465,10 +1465,10 @@ entry:
 define void @merge_zr32_3vec(<3 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_3vec:
 ; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
 ; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
+; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #4]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <3 x i32> zeroinitializer, <3 x i32>* %p
@@ -1480,8 +1480,8 @@ define void @merge_zr32_4vec(<4 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_4vec:
 ; CHECK: // %entry
 ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <4 x i32> zeroinitializer, <4 x i32>* %p
@@ -1505,8 +1505,8 @@ define void @merge_zr32_4vecf(<4 x float>* %p) {
 ; CHECK-LABEL: merge_zr32_4vecf:
 ; CHECK: // %entry
 ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <4 x float> zeroinitializer, <4 x float>* %p
@@ -1589,8 +1589,8 @@ entry:
 define void @merge_zr64_3vec(<3 x i64>* %p) {
 ; CHECK-LABEL: merge_zr64_3vec:
 ; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; CHECK-NEXT: str xzr, [x{{[0-9]+}}, #16]
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #8]
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <3 x i64> zeroinitializer, <3 x i64>* %p
diff --git a/test/CodeGen/AArch64/machine-outliner-all-stack.mir b/test/CodeGen/AArch64/machine-outliner-all-stack.mir
new file mode 100644
index 0000000000000000000000000000000000000000..83b170a3c9894d030f74919e2a76c71df1b9dac0
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-outliner-all-stack.mir
@@ -0,0 +1,112 @@
+# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner \
+# RUN: -verify-machineinstrs %s -o - | FileCheck %s
+
+# Show that, when instructions that use the stack are present, it's possible
+# for us to outline everything as the default outlining type.
+# It's possible for reg-save-possible to outline by storing LR to a register,
+# but most candidates in this case require us to modify the stack. The outliner
+# should see that it's more beneficial to fix up instructions and save LR to
+# the stack in this case.
+
+--- |
+  define void @reg-save-possible() #0 { ret void }
+  define void @stack-save1() #0 { ret void }
+  define void @stack-save2() #0 { ret void }
+  define void @stack-save3() #0 { ret void }
+  attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" }
+...
+---
+
+name:            reg-save-possible
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr
+  $lr = ORRXri $xzr, 1
+  $x19 = ORRXri $xzr, 1
+  $x20 = ORRXri $xzr, 1
+  bb.1:
+    liveins: $lr
+    ; CHECK-LABEL: name:            reg-save-possible
+    ; CHECK: $sp = STRXpre $lr, $sp, -16
+    ; CHECK-NEXT: BL [[FN:@OUTLINED_FUNCTION_[0-9]+]]
+    ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+  bb.2:
+    RET undef $lr
+
+...
+---
+
+name:            stack-save1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+  $lr = ORRXri $xzr, 1
+  bb.1:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    ; CHECK-LABEL: name:            stack-save1
+    ; CHECK: $sp = STRXpre $lr, $sp, -16
+    ; CHECK-NEXT: BL [[FN]]
+    ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+  bb.2:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    RET undef $lr
+
+...
+---
+
+name:            stack-save2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+  $lr = ORRXri $xzr, 1
+  bb.1:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    ; CHECK-LABEL: name:            stack-save2
+    ; CHECK: $sp = STRXpre $lr, $sp, -16
+    ; CHECK-NEXT: BL [[FN]]
+    ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+  bb.2:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    RET undef $lr
+
+...
+---
+
+name:            stack-save3
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+  $lr = ORRXri $xzr, 1
+  bb.1:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    ; CHECK-LABEL: name:            stack-save3
+    ; CHECK: $sp = STRXpre $lr, $sp, -16
+    ; CHECK-NEXT: BL [[FN]]
+    ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+    $x20, $x19 = LDPXi $sp, 10
+  bb.2:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    RET undef $lr
diff --git a/test/CodeGen/AArch64/machine-outliner-compatible-candidates.mir b/test/CodeGen/AArch64/machine-outliner-compatible-candidates.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b153b4c5de2771ffd5789f40fe88812225a8ee5b
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-outliner-compatible-candidates.mir
@@ -0,0 +1,103 @@
+# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner \
+# RUN: -verify-machineinstrs %s -o - | FileCheck %s
+
+# Ensure that we can outline candidates with compatible call/frame classes.
+#
+# - Save/restores that don't impact the stack can be outlined together.
+# - Save/restores that impact the stack if the outlined sequence doesn't use
+#   the stack.
+
+--- |
+  define void @no-save1() #0 { ret void }
+  define void @no-save2() #0 { ret void }
+  define void @reg-save() #0 { ret void }
+  define void @stack-save() #0 { ret void }
+  attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" }
+...
+---
+
+name:            no-save1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr
+  $lr = ORRXri $xzr, 1
+  bb.1:
+    ; CHECK-LABEL: name:            no-save1
+    ; CHECK: BL [[FN:@OUTLINED_FUNCTION_[0-9]+]]
+    ; CHECK-NOT: STRXpre
+    ; CHECK-NOT: $lr =
+    ; CHECK-NOT: ORRXrs
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+  bb.2:
+    RET undef $lr
+
+...
+---
+
+name:            no-save2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr
+  $lr = ORRXri $xzr, 1
+  bb.1:
+    ; CHECK-LABEL: name:            no-save2
+    ; CHECK: BL [[FN]]
+    ; CHECK-NOT: STRXpre
+    ; CHECK-NOT: $lr =
+    ; CHECK-NOT: ORRXrs
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+  bb.2:
+    RET undef $lr
+...
+---
+
+name:            reg-save
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr
+  $lr = ORRXri $xzr, 1
+  bb.1:
+  liveins: $lr
+    ; CHECK-LABEL: name:            reg-save
+    ; CHECK: $[[REG:x[0-9]+]] = ORRXrs $xzr, $lr, 0
+    ; CHECK-NEXT: BL [[FN]]
+    ; CHECK-NEXT: $lr = ORRXrs $xzr, $[[REG]], 0
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+  bb.2:
+  liveins: $lr
+    RET undef $lr
+
+...
+---
+
+name:            stack-save
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+  $lr = ORRXri $xzr, 1
+  bb.1:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    ; CHECK-LABEL: name:            stack-save
+    ; CHECK: $sp = STRXpre $lr, $sp, -16
+    ; CHECK-NEXT: BL [[FN]]
+    ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+  bb.2:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    RET undef $lr
diff --git a/test/CodeGen/AArch64/machine-outliner-drop-stack.mir b/test/CodeGen/AArch64/machine-outliner-drop-stack.mir
new file mode 100644
index 0000000000000000000000000000000000000000..eb17dab5c10df1f0a378f5d194bd75d383b29342
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-outliner-drop-stack.mir
@@ -0,0 +1,99 @@
+# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner \
+# RUN: -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  define void @no-save1() #0 { ret void }
+  define void @no-save2() #0 { ret void }
+  define void @reg-save() #0 { ret void }
+  define void @stack-save() #0 { ret void }
+  attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" }
+...
+---
+
+name:            no-save1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr
+  $lr = ORRXri $xzr, 1
+  bb.1:
+    ; CHECK-LABEL: name:            no-save1
+    ; CHECK: BL [[FN:@OUTLINED_FUNCTION_[0-9]+]]
+    ; CHECK-NOT: STRXpre
+    ; CHECK-NOT: $lr =
+    ; CHECK-NOT: ORRXrs
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+  bb.2:
+    RET undef $lr
+
+...
+---
+
+name:            no-save2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr
+  $lr = ORRXri $xzr, 1
+  bb.1:
+    ; CHECK-LABEL: name:            no-save2
+    ; CHECK: BL [[FN]]
+    ; CHECK-NOT: STRXpre
+    ; CHECK-NOT: $lr =
+    ; CHECK-NOT: ORRXrs
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+  bb.2:
+    RET undef $lr
+...
+---
+
+name:            reg-save
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr
+  $lr = ORRXri $xzr, 1
+  bb.1:
+  liveins: $lr
+    ; CHECK-LABEL: name:            reg-save
+    ; CHECK: $[[REG:x[0-9]+]] = ORRXrs $xzr, $lr, 0
+    ; CHECK-NEXT: BL [[FN]]
+    ; CHECK-NEXT: $lr = ORRXrs $xzr, $[[REG]], 0
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+  bb.2:
+  liveins: $lr
+    RET undef $lr
+
+...
+---
+
+name:            stack-save
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+  $lr = ORRXri $xzr, 1
+  bb.1:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    ; CHECK-LABEL: name:            stack-save
+    ; CHECK-NOT: BL
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+  bb.2:
+  liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp
+    RET undef $lr
diff --git a/test/CodeGen/AArch64/machine-outliner-ordering.mir b/test/CodeGen/AArch64/machine-outliner-ordering.mir
new file mode 100644
index 0000000000000000000000000000000000000000..7395df641f5542dc84e01d8ac3c2135d39800cf4
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-outliner-ordering.mir
@@ -0,0 +1,106 @@
+# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=machine-outliner \
+# RUN: -verify-machineinstrs %s -o - | FileCheck %s
+
+# Ensure that outlined function names appear as expected. Currently, they are
+# output in order of benefit.
+
+--- |
+  define void @should_have_fn2() #0 { ret void }
+  define void @should_have_fn0() #0 { ret void }
+  define void @should_have_fn1() #0 { ret void }
+  attributes #0 = { noredzone optsize minsize }
+...
+---
+
+name: should_have_fn2
+tracksRegLiveness: true
+body:             |
+    bb.0:
+        ; CHECK-LABEL: name:        should_have_fn2
+        ; CHECK-NOT: OUTLINED_FUNCTION_1
+        ; CHECK-NOT: OUTLINED_FUNCTION_0
+        ; CHECK: OUTLINED_FUNCTION_2
+        $w0 = ORRWri $wzr, 1
+        $w0 = ORRWri $wzr, 1
+        $w0 = ORRWri $wzr, 1
+    bb.1:
+        ; CHECK-DAG: OUTLINED_FUNCTION_2
+        $w0 = ORRWri $wzr, 1
+        $w0 = ORRWri $wzr, 1
+        $w0 = ORRWri $wzr, 1
+    bb.2:
+        ; CHECK-DAG: OUTLINED_FUNCTION_2
+        $w0 = ORRWri $wzr, 1
+        $w0 = ORRWri $wzr, 1
+        $w0 = ORRWri $wzr, 1
+    bb.3:
+        ; CHECK-DAG: OUTLINED_FUNCTION_2
+        $w0 = ORRWri $wzr, 1
+        $w0 = ORRWri $wzr, 1
+        $w0 = ORRWri $wzr, 1
+    bb.4:
+        RET undef $lr
+
+...
+---
+
+name: should_have_fn0
+
+tracksRegLiveness: true
+body:             |
+    bb.0:
+        ; CHECK-LABEL: name: should_have_fn0
+        ; CHECK-NOT: OUTLINED_FUNCTION_1
+        ; CHECK-NOT: OUTLINED_FUNCTION_2
+        ; CHECK: OUTLINED_FUNCTION_0
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+    bb.1:
+        ; CHECK-DAG: OUTLINED_FUNCTION_0
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+    bb.3:
+        ; CHECK-DAG: OUTLINED_FUNCTION_0
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+        $w1 = ORRWri $wzr, 1
+    bb.4:
+        RET undef $lr
+
+...
+---
+
+name: should_have_fn1
+tracksRegLiveness: true
+body:             |
+    bb.0:
+        ; CHECK-LABEL: name: should_have_fn1
+        ; CHECK-NOT: OUTLINED_FUNCTION_0
+        ; CHECK-NOT: OUTLINED_FUNCTION_2
+        ; CHECK: OUTLINED_FUNCTION_1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+    bb.1:
+        ; CHECK-DAG: OUTLINED_FUNCTION_1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+    bb.3:
+        ; CHECK-DAG: OUTLINED_FUNCTION_1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+        $w2 = ORRWri $wzr, 1
+    bb.4:
+        RET undef $lr
diff --git a/test/CodeGen/AArch64/machine-outliner-regsave.mir b/test/CodeGen/AArch64/machine-outliner-regsave.mir
index 6d00bd39cde7063d257ef59cff30493be78d6571..d05cbddbb327164f3a1ab8bbba3ab29b0d179c4b 100644
--- a/test/CodeGen/AArch64/machine-outliner-regsave.mir
+++ b/test/CodeGen/AArch64/machine-outliner-regsave.mir
@@ -2,8 +2,8 @@
 # RUN: -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s
 # Check that we save LR to a callee-saved register when possible.
 # foo() should use a callee-saved register. However, bar() should not.
---- |  
-  
+--- |
+
   define void @foo() #0 {
     ret void
   }
@@ -17,33 +17,40 @@
 ---
 # Make sure that when we outline and a register is available, we
 # use it to save + restore LR instead of SP.
+# Also make sure that we can call functions that require no save as the same
+# outlined function.
 # CHECK: name:            foo
-# CHECK-DAG: bb.0
+# CHECK-DAG: bb.1
 # CHECK-DAG: $x[[REG:[0-9]+]] = ORRXrs $xzr, $lr, 0
 # CHECK-NEXT: BL
 # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0
-# CHECK-DAG: bb.1
+# CHECK-DAG: bb.2
 # CHECK-DAG: $x[[REG]] = ORRXrs $xzr, $lr, 0
 # CHECK-NEXT: BL
 # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0
-# CHECK-DAG: bb.2
+# CHECK-DAG: bb.3
 # CHECK-DAG: $x[[REG]] = ORRXrs $xzr, $lr, 0
 # CHECK-NEXT: BL
 # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0
+# CHECK-DAG: bb.4
+# CHECK-NOT: $x[[REG]] = ORRXrs $xzr, $lr, 0
+# CHECK-DAG: BL
 name:            foo
 tracksRegLiveness: true
-fixedStack:      
+fixedStack:
 body:             |
   bb.0:
-    liveins: $lr, $w9
     $x25 = ORRXri $xzr, 1
+    $lr = ORRXri $xzr, 1
+  bb.1:
+    liveins: $lr, $w9
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 2
-  bb.1:
+  bb.2:
     liveins: $lr, $w9
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 1
@@ -51,7 +58,15 @@ body:             |
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 2
-  bb.2:
+  bb.3:
+    liveins: $lr, $w9
+    $w9 = ORRWri $wzr, 1
+    $w9 = ORRWri $wzr, 1
+    $w9 = ORRWri $wzr, 1
+    $w9 = ORRWri $wzr, 1
+    $w9 = ORRWri $wzr, 1
+    $w9 = ORRWri $wzr, 2
+  bb.4:
     liveins: $lr, $w9
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 1
@@ -59,6 +74,8 @@ body:             |
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 1
     $w9 = ORRWri $wzr, 2
+  bb.5:
+    liveins: $w9
     RET undef $lr
 
 ...
@@ -109,4 +126,3 @@ body:             |
   bb.3:
     liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28
     RET undef $lr
-
diff --git a/test/CodeGen/AArch64/machine-outliner-remarks.ll b/test/CodeGen/AArch64/machine-outliner-remarks.ll
index 29872d9518afc3b5257e32e0208d82380d7a3451..19351262b82b16be43bd1c37a12477825b35dc87 100644
--- a/test/CodeGen/AArch64/machine-outliner-remarks.ll
+++ b/test/CodeGen/AArch64/machine-outliner-remarks.ll
@@ -1,10 +1,10 @@
 ; RUN: llc %s -enable-machine-outliner -mtriple=aarch64-unknown-unknown -pass-remarks=machine-outliner -pass-remarks-missed=machine-outliner -o /dev/null 2>&1 | FileCheck %s
 ; CHECK: <unknown>:0:0:
 ; CHECK-SAME: Did not outline 2 instructions from 2 locations.
-; CHECK-SAME: Bytes from outlining all occurrences (36) >=
+; CHECK-SAME: Bytes from outlining all occurrences (16) >=
 ; CHECK-SAME: Unoutlined instruction bytes (16)
 ; CHECK-SAME: (Also found at: <UNKNOWN LOCATION>)
-; CHECK: remark: <unknown>:0:0: Saved 20 bytes by outlining 12 instructions
+; CHECK: remark: <unknown>:0:0: Saved 48 bytes by outlining 14 instructions
 ; CHECK-SAME: from 2 locations. (Found at: <UNKNOWN LOCATION>,
 ; CHECK-SAME: <UNKNOWN LOCATION>)
 ; RUN: llc %s -enable-machine-outliner -mtriple=aarch64-unknown-unknown -o /dev/null -pass-remarks-missed=machine-outliner -pass-remarks-output=%t.yaml
@@ -16,7 +16,7 @@
 ; YAML-NEXT: Pass:            machine-outliner
 ; YAML-NEXT: Name:            NotOutliningCheaper
 ; YAML-NEXT: Function:
-; YAML-NEXT: Args:            
+; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Did not outline '
 ; YAML-NEXT:   - Length:          '2'
 ; YAML-NEXT:   - String:          ' instructions'
@@ -24,7 +24,7 @@
 ; YAML-NEXT:   - NumOccurrences:  '2'
 ; YAML-NEXT:   - String:          ' locations.'
 ; YAML-NEXT:   - String:          ' Bytes from outlining all occurrences ('
-; YAML-NEXT:   - OutliningCost:   '36'
+; YAML-NEXT:   - OutliningCost:   '16'
 ; YAML-NEXT:   - String:          ')'
 ; YAML-NEXT:   - String:          ' >= Unoutlined instruction bytes ('
 ; YAML-NEXT:   - NotOutliningCost: '16'
@@ -36,12 +36,12 @@
 ; YAML-NEXT: Pass:            machine-outliner
 ; YAML-NEXT: Name:            OutlinedFunction
 ; YAML-NEXT: Function:        OUTLINED_FUNCTION_0
-; YAML-NEXT: Args:            
+; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Saved '
-; YAML-NEXT:   - OutliningBenefit: '20'
+; YAML-NEXT:   - OutliningBenefit: '48'
 ; YAML-NEXT:   - String:          ' bytes by '
 ; YAML-NEXT:   - String:          'outlining '
-; YAML-NEXT:   - Length:          '12'
+; YAML-NEXT:   - Length:          '14'
 ; YAML-NEXT:   - String:          ' instructions '
 ; YAML-NEXT:   - String:          'from '
 ; YAML-NEXT:   - NumOccurrences:  '2'
diff --git a/test/CodeGen/AArch64/machine-outliner-unsafe-stack-call.mir b/test/CodeGen/AArch64/machine-outliner-unsafe-stack-call.mir
new file mode 100644
index 0000000000000000000000000000000000000000..330d61eb03d4474e1358aac6e62a518628f0e9d6
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-outliner-unsafe-stack-call.mir
@@ -0,0 +1,72 @@
+# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner \
+# RUN: -verify-machineinstrs %s -o - | FileCheck %s
+
+# Ensure that we never outline calls into sequences where unsafe stack
+# instructions are present.
+
+--- |
+  define void @foo() #0 { ret void }
+  define void @bar() #0 { ret void }
+  define void @baz() #0 { ret void }
+  define void @f1() #0 { ret void }
+  define void @f2() #0 { ret void }
+  attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" }
+...
+---
+
+name:            f1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+  liveins: $lr
+    ; CHECK-LABEL: name:            f1
+    ; CHECK: foo
+    ; CHECK-DAG: bar
+    ; CHECK-DAG: baz
+    $lr = ORRXri $xzr, 1
+    BL @foo, implicit-def dead $lr, implicit $sp
+    $x20, $x19 = LDPXi $sp, 255
+    $x20, $x19 = LDPXi $sp, 255
+    $x20, $x19 = LDPXi $sp, 255
+    $x20, $x19 = LDPXi $sp, 255
+  bb.1:
+    BL @bar, implicit-def dead $lr, implicit $sp
+    $x11 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x13 = ADDXri $sp, 48, 0;
+    $x14 = ADDXri $sp, 48, 0;
+  bb.2:
+    BL @baz, implicit-def dead $lr, implicit $sp
+    $x0 = ADDXri $sp, 48, 0;
+    $x1 = ADDXri $sp, 48, 0;
+    RET undef $lr
+
+...
+---
+
+name:            f2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $lr
+    ; CHECK-LABEL: name:            f2
+    ; CHECK: foo
+    ; CHECK-DAG: bar
+    ; CHECK-DAG: baz
+    $lr = ORRXri $xzr, 1
+    BL @foo, implicit-def dead $lr, implicit $sp
+    $x20, $x19 = LDPXi $sp, 255
+    $x20, $x19 = LDPXi $sp, 255
+    $x20, $x19 = LDPXi $sp, 255
+    $x20, $x19 = LDPXi $sp, 255
+  bb.1:
+    BL @bar, implicit-def dead $lr, implicit $sp
+    $x11 = ADDXri $sp, 48, 0;
+    $x12 = ADDXri $sp, 48, 0;
+    $x13 = ADDXri $sp, 48, 0;
+    $x14 = ADDXri $sp, 48, 0;
+  bb.2:
+    BL @baz, implicit-def dead $lr, implicit $sp
+    $x0 = ADDXri $sp, 48, 0;
+    $x1 = ADDXri $sp, 48, 0;
+    RET undef $lr
diff --git a/test/CodeGen/AArch64/machine-outliner.ll b/test/CodeGen/AArch64/machine-outliner.ll
index 19be14d8d39136628953dd96cb650a19b649b40a..42d0a09f0282ba36fb275252540074bdfb8b5d38 100644
--- a/test/CodeGen/AArch64/machine-outliner.ll
+++ b/test/CodeGen/AArch64/machine-outliner.ll
@@ -103,6 +103,7 @@ define void @dog() #0 {
 ; CHECK-NEXT: str     w8, [sp, #12]
 ; CHECK-NEXT: orr     w8, wzr, #0x6
 ; CHECK-NEXT: str     w8, [sp, #8]
+; CHECK-NEXT: add     sp, sp, #32
 ; CHECK-NEXT: ret
 
 attributes #0 = { noredzone "target-cpu"="cyclone" "target-features"="+sse" }
diff --git a/test/CodeGen/AArch64/machine-outliner.mir b/test/CodeGen/AArch64/machine-outliner.mir
index bd1abdccd44c21302ca27fa82a8bffb900baa021..625d6a826848cafc28274aa94df25717b145cda1 100644
--- a/test/CodeGen/AArch64/machine-outliner.mir
+++ b/test/CodeGen/AArch64/machine-outliner.mir
@@ -10,11 +10,11 @@
   define i32 @main() #0 {
     ret i32 0
   }
-  
+
   define void @bar(i32 %a) #0 {
     ret void
   }
-  
+
   attributes #0 = { noinline noredzone "no-frame-pointer-elim"="true" }
 ...
 ---
@@ -22,79 +22,79 @@
 # - Create outlined functions
 # - Don't outline anything to do with LR or W30
 # - Save LR when it's not available
-# - Don't outline stack instructions when we might need to save + restore
 # - Functions whose addresses are taken can still be outlined
 #
-# CHECK-LABEL: name: main
-
-# CHECK: BL @OUTLINED_FUNCTION_[[F0:[0-9]+]]
+# CHECK-LABEL: main
+# CHECK-LABEL: bb.1:
+# CHECK-DAG: BL @OUTLINED_FUNCTION_[[F0:[0-9]+]]
 # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG:[0-9]+]], 0
-# CHECK-NEXT: $x16 = ADDXri $sp, 48, 0
-# CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1
+# CHECK-NEXT: STRHHroW $w12, $x9, $w30, 1, 1
 # CHECK-NEXT: $lr = ORRXri $xzr, 1
 
+# CHECK-DAG: bb.2
 # CHECK: BL @OUTLINED_FUNCTION_[[F0]]
 # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0
-# CHECK-NEXT: $x16 = ADDXri $sp, 48, 0
-# CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1
+# CHECK-NEXT: STRHHroW $w12, $x9, $w30, 1, 1
 # CHECK-NEXT: $lr = ORRXri $xzr, 1
 
+# CHECK-DAG: bb.3
 # CHECK: BL @OUTLINED_FUNCTION_[[F0]]
 # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0
-# CHECK-NEXT: $x16 = ADDXri $sp, 48, 0
-# CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1
+# CHECK-NEXT: STRHHroW $w12, $x9, $w30, 1, 1
 # CHECK-NEXT: $lr = ORRXri $xzr, 1
 name:            main
 tracksRegLiveness: true
 body:             |
   bb.0:
+  liveins: $lr
     $sp = frame-setup SUBXri $sp, 16, 0
     renamable $x9 = ADRP target-flags(aarch64-page) @bar
     $x9 = ORRXri $xzr, 1
-    $w16 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
     $w30 = ORRWri $wzr, 1
     $lr = ORRXri $xzr, 1
-
+  bb.1:
+  liveins: $lr
     $x20, $x19 = LDPXi $sp, 10
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
     renamable $x9 = ADRP target-flags(aarch64-page) @x
-    $x16 = ADDXri $sp, 48, 0;
-    STRHHroW $w16, $x9, $w30, 1, 1
+    $x12 = ADDXri $sp, 48, 0;
+    STRHHroW $w12, $x9, $w30, 1, 1
     $lr = ORRXri $xzr, 1
-    $w3 = ORRWri $wzr, 1993
-
+  bb.2:
+  liveins: $lr
     $x20, $x19 = LDPXi $sp, 10
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
     renamable $x9 = ADRP target-flags(aarch64-page) @x
-    $x16 = ADDXri $sp, 48, 0;
-    STRHHroW $w16, $x9, $w30, 1, 1
-    $lr = ORRXri $xzr, 1 
-
-    $w4 = ORRWri $wzr, 1994
-
+    $x12 = ADDXri $sp, 48, 0;
+    STRHHroW $w12, $x9, $w30, 1, 1
+    $lr = ORRXri $xzr, 1
+  bb.3:
+  liveins: $lr
     $x20, $x19 = LDPXi $sp, 10
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
-    $w16 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
+    $w12 = ORRWri $wzr, 1
     renamable $x9 = ADRP target-flags(aarch64-page) @x
-    $x16 = ADDXri $sp, 48, 0;
-    STRHHroW $w16, $x9, $w30, 1, 1
+    $x12 = ADDXri $sp, 48, 0;
+    STRHHroW $w12, $x9, $w30, 1, 1
     $lr = ORRXri $xzr, 1
-
     $sp = ADDXri $sp, 16, 0
+  bb.4:
+  liveins: $lr
     RET undef $lr
 
 ...
@@ -104,10 +104,10 @@ body:             |
 # CHECK-LABEL: bb.1:
 # CHECK-NOT: BL @baz, implicit-def dead $lr, implicit $sp
 # CHECK: BL @OUTLINED_FUNCTION_[[F1:[0-9]+]], implicit-def $lr, implicit $sp
-# CHECK-NEXT: $w17 = ORRWri $wzr, 2
+# CHECK-NEXT: $w11 = ORRWri $wzr, 2
 # CHECK-NEXT: BL @OUTLINED_FUNCTION_[[F1]], implicit-def $lr, implicit $sp
 # CHECK-NEXT: $w8 = ORRWri $wzr, 0
-# CHECK-NOT: $w17 = KILL renamable $w17, implicit killed $w17
+# CHECK-NOT: $w11 = KILL renamable $w11, implicit killed $w11
 name:            bar
 tracksRegLiveness: true
 body:             |
@@ -118,25 +118,25 @@ body:             |
 
   bb.1:
     BL @baz, implicit-def dead $lr, implicit $sp
-    $w17 = ORRWri $wzr, 1
-    $w17 = ORRWri $wzr, 1
-    $w17 = KILL renamable $w17, implicit killed $w17
-    $w17 = ORRWri $wzr, 1
-    $w17 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = KILL renamable $w11, implicit killed $w11
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
     BL @baz, implicit-def dead $lr, implicit $sp
-    $w17 = ORRWri $wzr, 1
-    $w17 = ORRWri $wzr, 1
-    $w17 = ORRWri $wzr, 2
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 2
     BL @baz, implicit-def dead $lr, implicit $sp
-    $w17 = ORRWri $wzr, 1
-    $w17 = ORRWri $wzr, 1
-    $w17 = ORRWri $wzr, 1
-    $w17 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
     BL @baz, implicit-def dead $lr, implicit $sp
-    $w17 = ORRWri $wzr, 1
-    $w17 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
+    $w11 = ORRWri $wzr, 1
     $w8 = ORRWri $wzr, 0
-    
+
   bb.2:
     $w15 = ORRWri $wzr, 1
     $w15 = ORRWri $wzr, 1
@@ -150,7 +150,7 @@ body:             |
     $w15 = ORRWri $wzr, 1
     $x15 = ADDXri $sp, 48, 0;
     $w8 = ORRWri $wzr, 0
-    
+
   bb.3:
     $fp, $lr = LDPXi $sp, 2
     $sp = ADDXri $sp, 32, 0
diff --git a/test/CodeGen/AArch64/pr40091.ll b/test/CodeGen/AArch64/pr40091.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8cf51f4beb5f44a87cfecda9360103abe108a20c
--- /dev/null
+++ b/test/CodeGen/AArch64/pr40091.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-arm-none-eabi | FileCheck %s
+
+define i64 @test(i64 %aa) {
+; CHECK-LABEL: test:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %a = bitcast i64 %aa to  <1 x i64>
+  %k = icmp sgt <1 x i64> %a, zeroinitializer
+  %l = zext <1 x i1> %k to <1 x i64>
+  %o = and <1 x i64> %l, %a
+  %p = xor <1 x i64> %l, <i64 -1>
+  %q = and <1 x i64> %p, <i64 81985529216486895>
+  %r = or <1 x i64> %q, %o
+  %s = bitcast <1 x i64> %r to <8 x i8>
+  %t = shufflevector <8 x i8> %s, <8 x i8> %s, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %u = bitcast <8 x i8> %t to i64
+  ret i64 %u
+}
diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll
index ed1d415067e82e2fb012b4b0bcb2af8eccfe04a5..52ebe876014491566da199e907f4750603adca85 100644
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@@ -13,6 +13,7 @@
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=saphira -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx2t99 -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=tsv110 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mattr=+custom-cheap-as-move -o - %s | FileCheck %s
 
 %X = type { i64, i64, i64 }
diff --git a/test/CodeGen/AArch64/select_cc.ll b/test/CodeGen/AArch64/select_cc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..785e6b800ed74acc160efd04b554a40f674a6131
--- /dev/null
+++ b/test/CodeGen/AArch64/select_cc.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+define i64 @select_ogt_float(float %a, float %b) {
+; CHECK-LABEL: select_ogt_float:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    lsl x0, x8, #2
+; CHECK-NEXT:    ret
+entry:
+  %cc = fcmp ogt float %a, %b
+  %sel = select i1 %cc, i64 4, i64 0
+  ret i64 %sel
+}
+
+define i64 @select_ule_float_inverse(float %a, float %b) {
+; CHECK-LABEL: select_ule_float_inverse:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    lsl x0, x8, #2
+; CHECK-NEXT:    ret
+entry:
+  %cc = fcmp ule float %a, %b
+  %sel = select i1 %cc, i64 0, i64 4
+  ret i64 %sel
+}
+
+define i64 @select_eq_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: select_eq_i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    lsl x0, x8, #2
+; CHECK-NEXT:    ret
+entry:
+  %cc = icmp eq i32 %a, %b
+  %sel = select i1 %cc, i64 4, i64 0
+  ret i64 %sel
+}
+
+define i64 @select_ne_i32_inverse(i32 %a, i32 %b) {
+; CHECK-LABEL: select_ne_i32_inverse:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    lsl x0, x8, #2
+; CHECK-NEXT:    ret
+entry:
+  %cc = icmp ne i32 %a, %b
+  %sel = select i1 %cc, i64 0, i64 4
+  ret i64 %sel
+}
diff --git a/test/CodeGen/AArch64/shadow-call-stack.ll b/test/CodeGen/AArch64/shadow-call-stack.ll
index dbd44fd3cd17dfbbf50e7780edbe9b65a6b33aa1..060d81b7fc218d5b00525fabcf9dce55b17d33cf 100644
--- a/test/CodeGen/AArch64/shadow-call-stack.ll
+++ b/test/CodeGen/AArch64/shadow-call-stack.ll
@@ -22,6 +22,7 @@ declare i32 @bar()
 define i32 @f3() shadowcallstack {
   ; CHECK: f3:
   ; CHECK: str x30, [x18], #8
+  ; CHECK: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78
   ; CHECK: str x30, [sp, #-16]!
   %res = call i32 @bar()
   %res1 = add i32 %res, 1
@@ -45,3 +46,11 @@ define i32 @f4() shadowcallstack {
   ; CHECK: ret
   ret i32 %res1234
 }
+
+define i32 @f5() shadowcallstack nounwind {
+  ; CHECK: f5:
+  ; CHECK-NOT: .cfi_escape
+  %res = call i32 @bar()
+  %res1 = add i32 %res, 1
+  ret i32 %res
+}
diff --git a/test/CodeGen/AArch64/sign-return-address.ll b/test/CodeGen/AArch64/sign-return-address.ll
index c057c815acfd4b6ae5c7ef0d0e755a1a993d9c61..dfd52f87fdeb18f20aa16201ecd7939cfac6dc27 100644
--- a/test/CodeGen/AArch64/sign-return-address.ll
+++ b/test/CodeGen/AArch64/sign-return-address.ll
@@ -24,17 +24,17 @@ define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
 ; CHECK-LABEL: @leaf_sign_all
 ; CHECK: paciasp
 ; CHECK: autiasp
-; CHECK-NEXT: ret
+; CHECK: ret
 define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" {
   ret i32 %x
 }
 
 ; CHECK: @leaf_clobbers_lr
 ; CHECK: paciasp
-; CHECK-NEXT: str x30, [sp, #-16]!
+; CHECK: str x30, [sp, #-16]!
 ; CHECK: ldr  x30, [sp], #16
 ; CHECK-NEXT: autiasp
-; CHECK-NEXT: ret
+; CHECK: ret
 define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf"  {
   call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1
   ret i64 %x
@@ -45,7 +45,7 @@ declare i32 @foo(i32)
 ; CHECK: @non_leaf_sign_all
 ; CHECK: paciasp
 ; CHECK: autiasp
-; CHECK-NEXT: ret
+; CHECK: ret
 define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
   %call = call i32 @foo(i32 %x)
   ret i32 %call
@@ -53,10 +53,10 @@ define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
 
 ; CHECK: @non_leaf_sign_non_leaf
 ; CHECK: paciasp
-; CHECK-NEXT: str x30, [sp, #-16]!
+; CHECK: str x30, [sp, #-16]!
 ; CHECK: ldr  x30, [sp], #16
-; CHECK-NEXT: autiasp
-; CHECK-NEXT: ret
+; CHECK: autiasp
+; CHECK: ret
 define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
   %call = call i32 @foo(i32 %x)
   ret i32 %call
@@ -65,7 +65,7 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
 ; CHECK-LABEL: @leaf_sign_all_v83
 ; CHECK: paciasp
 ; CHECK-NOT: ret
-; CHECK-NEXT: retaa
+; CHECK: retaa
 ; CHECK-NOT: ret
 define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" {
   ret i32 %x
@@ -75,10 +75,10 @@ declare fastcc i64 @bar(i64)
 
 ; CHECK-LABEL: @spill_lr_and_tail_call
 ; CHECK: paciasp
-; CHECK-NEXT: str x30, [sp, #-16]!
+; CHECK: str x30, [sp, #-16]!
 ; CHECK: ldr  x30, [sp], #16
-; CHECK-NEXT: autiasp
-; CHECK-NEXT: b  bar
+; CHECK: autiasp
+; CHECK: b  bar
 define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
   call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1
   tail call fastcc i64 @bar(i64 %x)
diff --git a/test/CodeGen/AArch64/speculation-hardening-dagisel.ll b/test/CodeGen/AArch64/speculation-hardening-dagisel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4d13d98441e0d1a76d543e274a644ba21d7021a5
--- /dev/null
+++ b/test/CodeGen/AArch64/speculation-hardening-dagisel.ll
@@ -0,0 +1,71 @@
+; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure
+; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure
+
+declare i64 @g(i64, i64) local_unnamed_addr
+define i64 @f_using_reserved_reg_x16(i64 %a, i64 %b) local_unnamed_addr SLHATTR {
+; CHECK-LABEL: f_using_reserved_reg_x16
+; SLH: dsb sy
+; SLH: isb
+; NOSLH-NOT: dsb sy
+; NOSLH-NOT: isb
+entry:
+  %cmp = icmp ugt i64 %a, %b
+  br i1 %cmp, label %if.then, label %cleanup
+
+; CHECK: b.ls
+; SLH: dsb sy
+; SLH: isb
+; NOSLH-NOT: dsb sy
+; NOSLH-NOT: isb
+if.then:
+  %0 = tail call i64 asm "autia1716", "={x17},{x16},0"(i64 %b, i64 %a)
+; CHECK: bl g
+; SLH: dsb sy
+; SLH: isb
+; NOSLH-NOT: dsb sy
+; NOSLH-NOT: isb
+; CHECK: ret
+  %call = tail call i64 @g(i64 %a, i64 %b) #3
+  %add = add i64 %call, %0
+  br label %cleanup
+
+cleanup:
+; SLH: dsb sy
+; SLH: isb
+; NOSLH-NOT: dsb sy
+; NOSLH-NOT: isb
+; SLH: ret
+  %retval.0 = phi i64 [ %add, %if.then ], [ %b, %entry ]
+  ret i64 %retval.0
+}
+
+define i32 @f_clobbered_reg_w16(i32 %a, i32 %b) local_unnamed_addr SLHATTR {
+; CHECK-LABEL: f_clobbered_reg_w16
+entry:
+; SLH: dsb sy
+; SLH: isb
+; NOSLH-NOT: dsb sy
+; NOSLH-NOT: isb
+  %cmp = icmp sgt i32 %a, %b
+  br i1 %cmp, label %if.then, label %if.end
+; CHECK: b.le
+
+if.then:
+; SLH: dsb sy
+; SLH: isb
+; NOSLH-NOT: dsb sy
+; NOSLH-NOT: isb
+; CHECK: mov w16, w0
+  tail call void asm sideeffect "mov w16, ${0:w}", "r,~{w16}"(i32 %a)
+  br label %if.end
+; SLH: ret
+
+if.end:
+  %add = add nsw i32 %b, %a
+  ret i32 %add
+; SLH: dsb sy
+; SLH: isb
+; NOSLH-NOT: dsb sy
+; NOSLH-NOT: isb
+; SLH: ret
+}
diff --git a/test/CodeGen/AArch64/speculation-hardening.ll b/test/CodeGen/AArch64/speculation-hardening.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3535b63c32cc8f11bd11845c7b68840e7ab3fcc0
--- /dev/null
+++ b/test/CodeGen/AArch64/speculation-hardening.ll
@@ -0,0 +1,156 @@
+; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure
+; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure
+; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure
+; RUN sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure
+; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -fast-isel | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure
+; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -fast-isel | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure
+
+define i32 @f(i8* nocapture readonly %p, i32 %i, i32 %N) local_unnamed_addr SLHATTR {
+; CHECK-LABEL: f
+entry:
+; SLH:  cmp sp, #0
+; SLH:  csetm x16, ne
+; NOSLH-NOT:  cmp sp, #0
+; NOSLH-NOT:  csetm x16, ne
+
+; SLH:  mov x17, sp
+; SLH:  and x17, x17, x16
+; SLH:  mov sp, x17
+; NOSLH-NOT:  mov x17, sp
+; NOSLH-NOT:  and x17, x17, x16
+; NOSLH-NOT:  mov sp, x17
+  %call = tail call i32 @tail_callee(i32 %i)
+; SLH:  cmp sp, #0
+; SLH:  csetm x16, ne
+; NOSLH-NOT:  cmp sp, #0
+; NOSLH-NOT:  csetm x16, ne
+  %cmp = icmp slt i32 %call, %N
+  br i1 %cmp, label %if.then, label %return
+; GlobalISel lowers the branch to a b.ne sometimes instead of b.ge as expected..
+; CHECK: b.[[COND:(ge)|(lt)|(ne)]]
+
+if.then:                                          ; preds = %entry
+; NOSLH-NOT: csel x16, x16, xzr, {{(lt)|(ge)|(eq)}}
+; SLH-DAG: csel x16, x16, xzr, {{(lt)|(ge)|(eq)}}
+  %idxprom = sext i32 %i to i64
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %idxprom
+  %0 = load i8, i8* %arrayidx, align 1
+; CHECK-DAG:      ldrb [[LOADED:w[0-9]+]],
+  %conv = zext i8 %0 to i32
+  br label %return
+
+; SLH-DAG: csel x16, x16, xzr, [[COND]]
+; NOSLH-NOT: csel x16, x16, xzr, [[COND]]
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %conv, %if.then ], [ 0, %entry ]
+; SLH:  mov x17, sp
+; SLH:  and x17, x17, x16
+; SLH:  mov sp, x17
+; NOSLH-NOT:  mov x17, sp
+; NOSLH-NOT:  and x17, x17, x16
+; NOSLH-NOT:  mov sp, x17
+  ret i32 %retval.0
+}
+
+; Make sure that for a tail call, taint doesn't get put into SP twice.
+define i32 @tail_caller(i32 %a) local_unnamed_addr SLHATTR {
+; CHECK-LABEL: tail_caller:
+; SLH:     mov     x17, sp
+; SLH:     and     x17, x17, x16
+; SLH:     mov     sp, x17
+; NOSLH-NOT:     mov     x17, sp
+; NOSLH-NOT:     and     x17, x17, x16
+; NOSLH-NOT:     mov     sp, x17
+;  GlobalISel doesn't optimize tail calls (yet?), so only check that
+;  cross-call taint register setup code is missing if a tail call was
+;  actually produced.
+; SLH:     {{(bl tail_callee[[:space:]] cmp sp, #0)|(b tail_callee)}}
+; SLH-NOT: cmp sp, #0
+  %call = tail call i32 @tail_callee(i32 %a)
+  ret i32 %call
+}
+
+declare i32 @tail_callee(i32) local_unnamed_addr
+
+; Verify that no cb(n)z/tb(n)z instructions are produced when implementing
+; SLH
+define i32 @compare_branch_zero(i32, i32) SLHATTR {
+; CHECK-LABEL: compare_branch_zero
+  %3 = icmp eq i32 %0, 0
+  br i1 %3, label %then, label %else
+;SLH-NOT:   cb{{n?}}z
+;NOSLH:     cb{{n?}}z
+then:
+  %4 = sdiv i32 5, %1
+  ret i32 %4
+else:
+  %5 = sdiv i32 %1, %0
+  ret i32 %5
+}
+
+define i32 @test_branch_zero(i32, i32) SLHATTR {
+; CHECK-LABEL: test_branch_zero
+  %3 = and i32 %0, 16
+  %4 = icmp eq i32 %3, 0
+  br i1 %4, label %then, label %else
+;SLH-NOT:   tb{{n?}}z
+;NOSLH:     tb{{n?}}z
+then:
+  %5 = sdiv i32 5, %1
+  ret i32 %5
+else:
+  %6 = sdiv i32 %1, %0
+  ret i32 %6
+}
+
+define i32 @landingpad(i32 %l0, i32 %l1) SLHATTR personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK-LABEL: landingpad
+entry:
+; SLH:  cmp sp, #0
+; SLH:  csetm x16, ne
+; NOSLH-NOT:  cmp sp, #0
+; NOSLH-NOT:  csetm x16, ne
+; CHECK: bl _Z10throwing_fv
+  invoke void @_Z10throwing_fv()
+          to label %exit unwind label %lpad
+; SLH:  cmp sp, #0
+; SLH:  csetm x16, ne
+
+lpad:
+  %l4 = landingpad { i8*, i32 }
+          catch i8* null
+; SLH:  cmp sp, #0
+; SLH:  csetm x16, ne
+; NOSLH-NOT:  cmp sp, #0
+; NOSLH-NOT:  csetm x16, ne
+  %l5 = extractvalue { i8*, i32 } %l4, 0
+  %l6 = tail call i8* @__cxa_begin_catch(i8* %l5)
+  %l7 = icmp sgt i32 %l0, %l1
+  br i1 %l7, label %then, label %else
+; GlobalISel lowers the branch to a b.ne sometimes instead of b.ge as expected..
+; CHECK: b.[[COND:(le)|(gt)|(ne)]]
+
+then:
+; SLH-DAG: csel x16, x16, xzr, [[COND]]
+  %l9 = sdiv i32 %l0, %l1
+  br label %postif
+
+else:
+; SLH-DAG: csel x16, x16, xzr, {{(gt)|(le)|(eq)}}
+  %l11 = sdiv i32 %l1, %l0
+  br label %postif
+
+postif:
+  %l13 = phi i32 [ %l9, %then ], [ %l11, %else ]
+  tail call void @__cxa_end_catch()
+  br label %exit
+
+exit:
+  %l15 = phi i32 [ %l13, %postif ], [ 0, %entry ]
+  ret i32 %l15
+}
+
+declare i32 @__gxx_personality_v0(...)
+declare void @_Z10throwing_fv() local_unnamed_addr
+declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr
+declare void @__cxa_end_catch() local_unnamed_addr
diff --git a/test/CodeGen/AArch64/speculation-hardening.mir b/test/CodeGen/AArch64/speculation-hardening.mir
new file mode 100644
index 0000000000000000000000000000000000000000..cf8357d9558b0064b54c9f618e97411ff830bde9
--- /dev/null
+++ b/test/CodeGen/AArch64/speculation-hardening.mir
@@ -0,0 +1,117 @@
+# RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu \
+# RUN:     -start-before aarch64-speculation-hardening -o - %s \
+# RUN:   | FileCheck %s --dump-input-on-failure
+
+# Check that the speculation hardening pass generates code as expected for
+# basic blocks ending with a variety of branch patterns:
+# - (1) no branches (fallthrough)
+# - (2) one unconditional branch
+# - (3) one conditional branch + fall-through
+# - (4) one conditional branch + one unconditional branch
+# - other direct branches don't seem to be generated by the AArch64 codegen
+--- |
+  define void @nobranch_fallthrough(i32 %a, i32 %b) speculative_load_hardening {
+   ret void
+  }
+  define void @uncondbranch(i32 %a, i32 %b) speculative_load_hardening {
+   ret void
+  }
+  define void @condbranch_fallthrough(i32 %a, i32 %b) speculative_load_hardening {
+   ret void
+  }
+  define void @condbranch_uncondbranch(i32 %a, i32 %b) speculative_load_hardening {
+   ret void
+  }
+  define void @indirectbranch(i32 %a, i32 %b) speculative_load_hardening {
+   ret void
+  }
+...
+---
+name:            nobranch_fallthrough
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: nobranch_fallthrough
+  bb.0:
+    successors: %bb.1
+    liveins: $w0, $w1
+  ; CHECK-NOT: csel
+  bb.1:
+    liveins: $w0
+   RET undef $lr, implicit $w0
+...
+---
+name:            uncondbranch
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: uncondbranch
+  bb.0:
+    successors: %bb.1
+    liveins: $w0, $w1
+    B %bb.1
+  ; CHECK-NOT: csel
+  bb.1:
+   liveins: $w0
+   RET undef $lr, implicit $w0
+...
+---
+name:            condbranch_fallthrough
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: condbranch_fallthrough
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $w0, $w1
+    $wzr = SUBSWrs renamable $w0, renamable $w1, 0, implicit-def $nzcv, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+  ; CHECK: b.lt [[BB_LT_T:\.LBB[0-9_]+]]
+
+  bb.1:
+    liveins: $nzcv, $w0
+  ; CHECK: csel x16, x16, xzr, ge
+    RET undef $lr, implicit $w0
+  bb.2:
+    liveins: $nzcv, $w0
+  ; CHECK: csel x16, x16, xzr, lt
+    RET undef $lr, implicit $w0
+...
+---
+name:            condbranch_uncondbranch
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: condbranch_uncondbranch
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $w0, $w1
+    $wzr = SUBSWrs renamable $w0, renamable $w1, 0, implicit-def $nzcv, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1, implicit $nzcv
+  ; CHECK: b.lt [[BB_LT_T:\.LBB[0-9_]+]]
+
+  bb.1:
+    liveins: $nzcv, $w0
+  ; CHECK: csel x16, x16, xzr, ge
+    RET undef $lr, implicit $w0
+  bb.2:
+    liveins: $nzcv, $w0
+  ; CHECK: csel x16, x16, xzr, lt
+    RET undef $lr, implicit $w0
+...
+---
+name:            indirectbranch
+tracksRegLiveness: true
+body:             |
+  ; Check that no instrumentation is done on indirect branches (for now).
+  ; CHECK-LABEL: indirectbranch
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $x0
+    BR $x0
+  bb.1:
+   liveins: $x0
+  ; CHECK-NOT: csel
+   RET undef $lr, implicit $x0
+  bb.2:
+   liveins: $x0
+  ; CHECK-NOT: csel
+   RET undef $lr, implicit $x0
+...
diff --git a/test/CodeGen/AArch64/stack-protector-target.ll b/test/CodeGen/AArch64/stack-protector-target.ll
index 787e4a76ec01b7192a5b4fcab6dbc7d3fd0f5777..e54fb8c4b0a13e434308ade52827f38db93ee747 100644
--- a/test/CodeGen/AArch64/stack-protector-target.ll
+++ b/test/CodeGen/AArch64/stack-protector-target.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefix=ANDROID-AARCH64 %s
 ; RUN: llc -mtriple=aarch64-fuchsia < %s -o - | FileCheck --check-prefixes=FUCHSIA-AARCH64-COMMON,FUCHSIA-AARCH64-USER %s
 ; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel < %s -o - | FileCheck --check-prefixes=FUCHSIA-AARCH64-COMMON,FUCHSIA-AARCH64-KERNEL %s
+; RUN: llc -mtriple=aarch64-windows < %s -o - | FileCheck --check-prefix=WINDOWS-AARCH64 %s
 
 define void @_Z1fv() sspreq {
 entry:
@@ -27,3 +28,10 @@ declare void @_Z7CapturePi(i32*)
 ; FUCHSIA-AARCH64-COMMON: ldur [[C:.*]], {{\[}}[[A]], #-16]
 ; FUCHSIA-AARCH64-COMMON: ldr [[D:.*]], [sp,
 ; FUCHSIA-AARCH64-COMMON: cmp [[C]], [[D]]
+
+; WINDOWS-AARCH64: adrp x8, __security_cookie
+; WINDOWS-AARCH64: ldr x8, [x8, __security_cookie]
+; WINDOWS-AARCH64: str x8, [sp, #8]
+; WINDOWS-AARCH64: bl  _Z7CapturePi
+; WINDOWS-AARCH64: ldr x0, [sp, #8]
+; WINDOWS-AARCH64: bl  __security_check_cookie
diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll
index 637ff3e2e29bb3325fe19577a074b07c1db3f107..8ea89464ab0e9107bba35f260f754ed5e22e9e8f 100644
--- a/test/CodeGen/AArch64/swifterror.ll
+++ b/test/CodeGen/AArch64/swifterror.ll
@@ -314,13 +314,12 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) {
 ; CHECK-APPLE-DAG: strb [[ID]], [x0, #8]
 
 ; First vararg
-; CHECK-APPLE-DAG: orr {{x[0-9]+}}, [[ARGS]], #0x8
 ; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #16]
 ; Second vararg
-; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8
+; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #24]
 ; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #16
 ; Third vararg
-; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8
+; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #32]
 
 ; CHECK-APPLE: mov x21, x0
 ; CHECK-APPLE-NOT: x21
diff --git a/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll b/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
index 41de7e8af195970f9a4262914b64387f6cc374c9..f3dd80f71fe0e410d5c7957e9d346ad2bf37885e 100644
--- a/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
+++ b/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
@@ -65,9 +65,9 @@ define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 define i8 @in8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-LABEL: in8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, w2
-; CHECK-NEXT:    bic w9, w1, w2
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    bic w8, w1, w2
+; CHECK-NEXT:    and w9, w0, w2
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %n0 = xor i8 %x, %y
   %n1 = and i8 %n0, %mask
@@ -78,9 +78,9 @@ define i8 @in8(i8 %x, i8 %y, i8 %mask) {
 define i16 @in16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-LABEL: in16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, w2
-; CHECK-NEXT:    bic w9, w1, w2
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    bic w8, w1, w2
+; CHECK-NEXT:    and w9, w0, w2
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %n0 = xor i16 %x, %y
   %n1 = and i16 %n0, %mask
diff --git a/test/CodeGen/AArch64/vcvt-oversize.ll b/test/CodeGen/AArch64/vcvt-oversize.ll
index b6e25cfadaa918a1571fce84a03adc068ebaa89d..823fe44744c52f0ff4858146233206dab6d5f839 100644
--- a/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -2,14 +2,14 @@
 
 define <8 x i8> @float_to_i8(<8 x float>* %in) {
 ; CHECK-LABEL: float_to_i8:
-; CHECK: ldp     q1, q0, [x0]
-; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v1.4s, v1.4s
-; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v0.4s, v0.4s
-; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
-; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
+; CHECK: ldp     q0, q1, [x0]
+; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
+; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
+; CHECK-DAG: fcvtzs v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
+; CHECK-DAG: fcvtzs v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
 ; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB]].4s
-; CHECK-DAG: xtn2 v[[TMP]].8h, v[[MSB]].4s
-; CHECK-DAG: xtn v0.8b, v[[TMP]].8h
+; CHECK-DAG: xtn v[[TMP2:[0-9]+]].4h, v[[MSB]].4s
+; CHECK-DAG: uzp1 v0.8b, v[[TMP]].8b, v[[TMP2]].8b
   %l = load <8 x float>, <8 x float>* %in
   %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
   %conv = fptoui <8 x float> %scale to <8 x i8>
diff --git a/test/CodeGen/AArch64/windows-SEH-support.ll b/test/CodeGen/AArch64/windows-SEH-support.ll
new file mode 100644
index 0000000000000000000000000000000000000000..499ae782beb1c5dcf615418fa73423aea855ab62
--- /dev/null
+++ b/test/CodeGen/AArch64/windows-SEH-support.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype asm -o /dev/null %s
+
+declare dllimport void @f() local_unnamed_addr
+
+declare dso_local i32 @__C_specific_handler(...)
+
+define hidden swiftcc void @g() unnamed_addr personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+  invoke void @f() to label %__try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %__except] unwind to caller
+
+__except:
+  %1 = catchpad within %0 [i8* null]              ; preds = %catch.dispatch
+  catchret from %1 to label %__try.cont
+
+__try.cont:                                       ; preds = %__except, %entry
+  ret void
+}
+
+define hidden fastcc void @h() unnamed_addr personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+  invoke void @f() to label %__try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %__except] unwind to caller
+
+__except:                                         ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* null]
+  catchret from %1 to label %__try.cont
+
+__try.cont:                                       ; preds = %__except, %entry
+  ret void
+}
+
diff --git a/test/CodeGen/AArch64/wineh-mingw.ll b/test/CodeGen/AArch64/wineh-mingw.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ae26b06cfa3857d7c513ddac3a28a12fd5aa21df
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-mingw.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -exception-model=wineh -mtriple=aarch64-pc-mingw32 | FileCheck %s -check-prefix=WINEH
+; RUN: llc < %s -exception-model=wineh -mtriple=aarch64-pc-mingw32 -filetype=obj | llvm-readobj -s | FileCheck %s -check-prefix=WINEH-SECTIONS
+
+; Check emission of eh handler and handler data
+declare i32 @_d_eh_personality(i32, i32, i64, i8*, i8*)
+declare void @_d_eh_resume_unwind(i8*)
+
+declare i32 @bar()
+
+define i32 @foo4() #0 personality i32 (i32, i32, i64, i8*, i8*)* @_d_eh_personality {
+entry:
+  %step = alloca i32, align 4
+  store i32 0, i32* %step
+  %tmp = load i32, i32* %step
+
+  %tmp1 = invoke i32 @bar()
+          to label %finally unwind label %landingpad
+
+finally:
+  store i32 1, i32* %step
+  br label %endtryfinally
+
+landingpad:
+  %landing_pad = landingpad { i8*, i32 }
+          cleanup
+  %tmp3 = extractvalue { i8*, i32 } %landing_pad, 0
+  store i32 2, i32* %step
+  call void @_d_eh_resume_unwind(i8* %tmp3)
+  unreachable
+
+endtryfinally:
+  %tmp10 = load i32, i32* %step
+  ret i32 %tmp10
+}
+; WINEH-LABEL: foo4:
+; WINEH: .seh_proc foo4
+; WINEH: .seh_handler _d_eh_personality, @unwind, @except
+; WINEH: ret
+; WINEH: .section .xdata,"dr"
+; WINEH-NEXT: .seh_handlerdata
+; WINEH-NEXT: .text
+; WINEH-NEXT: .seh_endproc
+; WINEH: .section .xdata,"dr"
+; WINEH-NEXT: .p2align 2
+; WINEH-NEXT: GCC_except_table0:
+
+; WINEH-SECTIONS: Name: .xdata
+; WINEH-SECTIONS-NOT: Name: .gcc_except_table
diff --git a/test/CodeGen/AArch64/wineh-try-catch-nobase.ll b/test/CodeGen/AArch64/wineh-try-catch-nobase.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9d3af8043d7ad6b837d11a6b385efacefdb0628f
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-try-catch-nobase.ll
@@ -0,0 +1,49 @@
+; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s
+
+; Make sure we don't have a base pointer.
+; CHECK-LABEL: "?a@@YAXXZ":
+; CHECK-NOT: x19
+
+; Check that we compute the address relative to fp.
+; CHECK-LABEL: "?catch$2@?0??a@@YAXXZ@4HA":
+; CHECK:             stp     x29, x30, [sp, #-16]!   ; 16-byte Folded Spill
+; CHECK-NEXT:        sub     x0, x29, #16            ; =16
+; CHECK-NEXT:        mov     x1, xzr
+; CHECK-NEXT:        bl      "?bb@@YAXPEAHH@Z"
+; CHECK-NEXT:        adrp    x0, .LBB0_1
+; CHECK-NEXT:        add     x0, x0, .LBB0_1
+; CHECK-NEXT:        ldp     x29, x30, [sp], #16     ; 16-byte Folded Reload
+; CHECK-NEXT:        ret
+
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+define dso_local void @"?a@@YAXXZ"(i64 %p1) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %a = alloca i32, align 16
+  %0 = bitcast i32* %a to i8*  
+  store i32 305419896, i32* %a, align 16
+  invoke void @"?bb@@YAXPEAHH@Z"(i32* nonnull %a, i32* null)
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %1 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %2 = catchpad within %1 [i8* null, i32 64, i8* null]
+  call void @"?bb@@YAXPEAHH@Z"(i32* nonnull %a, i32* null) [ "funclet"(token %2) ]
+  catchret from %2 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch
+  call void @"?cc@@YAXXZ"()
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)
+
+declare dso_local void @"?bb@@YAXPEAHH@Z"(i32*, i32*)
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+
+declare dso_local void @"?cc@@YAXXZ"()
+
diff --git a/test/CodeGen/AArch64/wineh-try-catch-realign.ll b/test/CodeGen/AArch64/wineh-try-catch-realign.ll
new file mode 100644
index 0000000000000000000000000000000000000000..78255fbb16691de1388bd88d48b353e99d729eed
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-try-catch-realign.ll
@@ -0,0 +1,59 @@
+; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s
+
+; Make sure we have a base pointer.
+; CHECK-LABEL: "?a@@YAXXZ":
+; CHECK: and     sp, x9, #0xffffffffffffffc0
+; CHECK: mov     x19, sp
+
+; Make sure the funclet prologue/epilogue are correct: specifically,
+; it shouldn't access the parent's frame via sp, and the prologue and
+; epilogue should be symmetrical.
+; CHECK-LABEL: "?catch$2@?0??a@@YAXXZ@4HA":
+; CHECK:      str     x28, [sp, #-32]!
+; CHECK-NEXT: str     x19, [sp, #8]
+; CHECK-NEXT: stp     x29, x30, [sp, #16]
+; CHECK-NEXT: add     x0, x19, #64
+; CHECK-NEXT: mov     w1, wzr
+; CHECK-NEXT: bl      "?bb@@YAXPEAHH@Z"
+; CHECK-NEXT: adrp    x0, .LBB0_1
+; CHECK-NEXT: add     x0, x0, .LBB0_1
+; CHECK-NEXT: ldp     x29, x30, [sp, #16]
+; CHECK-NEXT: ldr     x19, [sp, #8]
+; CHECK-NEXT: ldr     x28, [sp], #32
+; CHECK-NEXT: ret
+
+
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+define dso_local void @"?a@@YAXXZ"() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %a = alloca [100 x i32], align 64
+  %0 = bitcast [100 x i32]* %a to i8*  
+  call void @llvm.memset.p0i8.i64(i8* nonnull align 64 %0, i8 0, i64 400, i1 false)
+  %1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i64 0, i64 0
+  store i32 305419896, i32* %1, align 64
+  invoke void @"?bb@@YAXPEAHH@Z"(i32* nonnull %1, i32 1)
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %2 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %3 = catchpad within %2 [i8* null, i32 64, i8* null]
+  call void @"?bb@@YAXPEAHH@Z"(i32* nonnull %1, i32 0) [ "funclet"(token %3) ]
+  catchret from %3 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch
+  call void @"?cc@@YAXXZ"()
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)
+
+declare dso_local void @"?bb@@YAXPEAHH@Z"(i32*, i32)
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+
+declare dso_local void @"?cc@@YAXXZ"()
+
diff --git a/test/CodeGen/AArch64/wineh-try-catch-vla.ll b/test/CodeGen/AArch64/wineh-try-catch-vla.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7ae5ca54ff913d9aabe8b6c9e407cd9b238a7c5f
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-try-catch-vla.ll
@@ -0,0 +1,52 @@
+; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s
+
+; Check we don't have a base pointer.
+; CHECK-LABEL: "?a@@YAXXZ":
+; CHECK-NOT: x19
+
+; Make sure the prologue and epilogue are sane. Make sure the
+; frame index is relative to the FP, since there is no base pointer.
+; (Funclets aren't allowed to contain dynamic allocas.)
+; CHECK-LABEL: "?catch$2@?0??a@@YAXXZ@4HA":
+; CHECK:      stp     x29, x30, [sp, #-16]!
+; CHECK-NEXT: ldur    x0, [x29, #-8]
+; CHECK-NEXT: mov     x1, x0
+; CHECK-NEXT: bl      "?bb@@YAXPEAHH@Z"
+; CHECK-NEXT: adrp    x0, .LBB0_1
+; CHECK-NEXT: add     x0, x0, .LBB0_1
+; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+define dso_local void @"?a@@YAXXZ"(i64 %p1) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %a = alloca i32, i64 %p1, align 16
+  %0 = bitcast i32* %a to i8*  
+  call void @llvm.memset.p0i8.i64(i8* nonnull align 16 %0, i8 0, i64 400, i1 false)
+  store i32 305419896, i32* %a, align 16
+  invoke void @"?bb@@YAXPEAHH@Z"(i32* nonnull %a, i32* null)
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %1 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %2 = catchpad within %1 [i8* null, i32 64, i8* null]
+  call void @"?bb@@YAXPEAHH@Z"(i32* nonnull %a, i32* %a) [ "funclet"(token %2) ]
+  catchret from %2 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch
+  call void @"?cc@@YAXXZ"()
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)
+
+declare dso_local void @"?bb@@YAXPEAHH@Z"(i32*, i32*)
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+
+declare dso_local void @"?cc@@YAXXZ"()
+
diff --git a/test/CodeGen/AArch64/wineh-try-catch.ll b/test/CodeGen/AArch64/wineh-try-catch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..940a86282d39a1a7b0642c9dabb5f81657642083
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-try-catch.ll
@@ -0,0 +1,197 @@
+; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s
+; RUN: llc -o %t -filetype=obj %s -mtriple=aarch64-windows
+; RUN: llvm-readobj -unwind %t | FileCheck %s -check-prefix=UNWIND
+
+; We test the following
+; 1) That the unwind help object is created and that its offset from the stack
+;    pointer on entry is patched into the table fed to __CxxFrameHandler3
+; 2) That the stack update for the catch funclet only includes the callee saved
+;    registers
+; 3) That the locals are accessed using the frame pointer in both the funclet
+;    and the parent function.
+
+; The following checks that the unwind help object has -2 stored into it at
+; fp - 400 - 256 = fp - 656, which is on-entry sp - 48 + 32 - 656 =
+; on-entry sp - 672.  We check this offset in the table later on.
+
+; CHECK-LABEL: "?func@@YAHXZ":
+; CHECK:       str     x28, [sp, #-48]!
+; CHECK:       str     x21, [sp, #8]
+; CHECK:       stp     x19, x20, [sp, #16]
+; CHECK:       stp     x29, x30, [sp, #32]
+; CHECK:       add     x29, sp, #32
+; CHECK:       sub     sp, sp, #624
+; CHECK:       mov     x19, sp
+; CHECK:       orr     x1, xzr, #0xfffffffffffffffe
+; CHECK:       stur    x1, [x19]
+
+; Now check that x is stored at fp - 20.  We check that this is the same
+; location accessed from the funclet to retrieve x.
+; CHECK:       orr     w8, wzr, #0x1
+; CHECK:       stur    w8, [x29, [[X_OFFSET:#-[1-9][0-9]+]]
+
+; Check the offset off the frame pointer at which B is located.
+; Check the same offset is used to pass the address of B to init2 in the
+; funclet.
+; CHECK:       sub     x0, x29, [[B_OFFSET:#[1-9][0-9]+]]
+; CHECK:       bl      "?init@@YAXPEAH@Z"
+
+; This is the label for the throw that is encoded in the ip2state.
+; We are inside the try block, where we make a call to func2
+; CHECK-LABEL: .Ltmp0:
+; CHECK:       bl      "?func2@@YAHXZ
+
+; CHECK:        [[CATCHRETDEST:.LBB0_[0-9]+]]:      ; %catchret.dest
+
+; Check the catch funclet.
+; CHECK-LABEL: "?catch$2@?0??func@@YAHXZ@4HA":
+
+; Check that the stack space is allocated only for the callee saved registers.
+; CHECK:       str     x28, [sp, #-48]!
+; CHECK:       str     x21, [sp, #8]
+; CHECK:       stp     x19, x20, [sp, #16]
+; CHECK:       stp     x29, x30, [sp, #32]
+; CHECK:       add     x20, x19, #12
+
+; Check that there are no further stack updates.
+; CHECK-NOT:   sub     sp, sp
+
+; Check that the stack address passed to init2 is off the frame pointer, and
+; that it matches the address of B in the parent function.
+; CHECK:       sub     x0, x29, [[B_OFFSET]]
+; CHECK:       bl      "?init2@@YAXPEAH@Z"
+
+; Check that are storing x back to the same location off the frame pointer as in
+; the parent function.
+; CHECK:       stur    w8, [x29, [[X_OFFSET]]]
+
+; Check that the funclet branches back to the catchret destination
+; CHECK:       adrp    x0, .LBB0_3
+; CHECK-NEXT:  add     x0, x0, [[CATCHRETDEST]]
+
+
+; Now check that the offset of the unwind help object from the stack pointer on
+; entry to func is encoded in cppxdata that is passed to __CxxFrameHandler3.  As
+; computed above, this comes to -672.
+; CHECK-LABEL:        "$cppxdata$?func@@YAHXZ":
+; CHECK-NEXT:         .word   429065506               ; MagicNumber
+; CHECK-NEXT:         .word   2                       ; MaxState
+; CHECK-NEXT:         .word   ("$stateUnwindMap$?func@@YAHXZ")@IMGREL ; UnwindMap
+; CHECK-NEXT:         .word   1                       ; NumTryBlocks
+; CHECK-NEXT:         .word   ("$tryMap$?func@@YAHXZ")@IMGREL ; TryBlockMap
+; CHECK-NEXT:         .word   4                       ; IPMapEntries
+; CHECK-NEXT:         .word   ("$ip2state$?func@@YAHXZ")@IMGREL ; IPToStateXData
+; CHECK-NEXT:         .word   -672                    ; UnwindHelp
+
+; UNWIND: Function: ?func@@YAHXZ (0x0)
+; UNWIND: Prologue [
+; UNWIND-NEXT: ; nop
+; UNWIND-NEXT: ; sub sp, #624
+; UNWIND-NEXT: ; add fp, sp, #32
+; UNWIND-NEXT: ; stp x29, x30, [sp, #32]
+; UNWIND-NEXT: ; stp x19, x20, [sp, #16]
+; UNWIND-NEXT: ; str x21, [sp, #8]
+; UNWIND-NEXT: ; str x28, [sp, #48]!
+; UNWIND-NEXT: ; end
+; UNWIND: Function: ?catch$2@?0??func@@YAHXZ@4HA
+; UNWIND: Prologue [
+; UNWIND-NEXT: ; stp x29, x30, [sp, #32]
+; UNWIND-NEXT: ; stp x19, x20, [sp, #16]
+; UNWIND-NEXT: ; str x21, [sp, #8]
+; UNWIND-NEXT: ; str x28, [sp, #48]!
+; UNWIND-NEXT: ; end
+
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.CatchableType = type { i32, i32, i32, i32, i32, i32, i32 }
+%eh.CatchableTypeArray.1 = type { i32, [1 x i32] }
+%eh.ThrowInfo = type { i32, i32, i32, i32 }
+
+$"??_R0H@8" = comdat any
+
+$"_CT??_R0H@84" = comdat any
+
+$_CTA1H = comdat any
+
+$_TI1H = comdat any
+
+@"??_7type_info@@6B@" = external constant i8*
+@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+@__ImageBase = external dso_local constant i8
+@"_CT??_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor2* @"??_R0H@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32), i32 0, i32 -1, i32 0, i32 4, i32 0 }, section ".xdata", comdat
+@_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x i32] [i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableType* @"_CT??_R0H@84" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32)] }, section ".xdata", comdat
+@_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i32 0, i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableTypeArray.1* @_CTA1H to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section ".xdata", comdat
+
+; Function Attrs: noinline optnone
+define dso_local i32 @"?func@@YAHXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %B = alloca [50 x i32], align 4
+  %x = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %i = alloca i32, align 4
+  %C = alloca [100 x i32], align 4
+  store i32 1, i32* %x, align 4
+  %arraydecay = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 0, i32 0
+  call void @"?init@@YAXPEAH@Z"(i32* %arraydecay)
+  %call = invoke i32 @"?func2@@YAHXZ"()
+          to label %invoke.cont unwind label %catch.dispatch
+
+invoke.cont:                                      ; preds = %entry
+  store i32 %call, i32* %tmp, align 4
+  %0 = bitcast i32* %tmp to i8*
+  invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #2
+          to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %invoke.cont, %entry
+  %1 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %2 = catchpad within %1 [%rtti.TypeDescriptor2* @"??_R0H@8", i32 0, i32* %i]
+  %arraydecay1 = getelementptr inbounds [100 x i32], [100 x i32]* %C, i32 0, i32 0
+  call void @"?init@@YAXPEAH@Z"(i32* %arraydecay1) [ "funclet"(token %2) ]
+  %arraydecay2 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 0, i32 0
+  call void @"?init2@@YAXPEAH@Z"(i32* %arraydecay2) [ "funclet"(token %2) ]
+  %3 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds [50 x i32], [50 x i32]* %B, i64 0, i64 %idxprom
+  %4 = load i32, i32* %arrayidx, align 4
+  %5 = load i32, i32* %i, align 4
+  %idxprom3 = sext i32 %5 to i64
+  %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* %C, i64 0, i64 %idxprom3
+  %6 = load i32, i32* %arrayidx4, align 4
+  %add = add nsw i32 %4, %6
+  %7 = load i32, i32* %i, align 4
+  %8 = load i32, i32* %i, align 4
+  %mul = mul nsw i32 %7, %8
+  %add5 = add nsw i32 %add, %mul
+  store i32 %add5, i32* %x, align 4
+  catchret from %2 to label %catchret.dest
+
+catchret.dest:                                    ; preds = %catch
+  br label %try.cont
+
+try.cont:                                         ; preds = %catchret.dest
+  %arrayidx6 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i64 0, i64 2
+  %9 = load i32, i32* %arrayidx6, align 4
+  %10 = load i32, i32* %x, align 4
+  %add7 = add nsw i32 %9, %10
+  ret i32 %add7
+
+unreachable:                                      ; preds = %invoke.cont
+  unreachable
+}
+
+declare dso_local void @"?init@@YAXPEAH@Z"(i32*)
+
+declare dso_local i32 @"?func2@@YAHXZ"()
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+
+declare dllimport void @_CxxThrowException(i8*, %eh.ThrowInfo*)
+
+declare dso_local void @"?init2@@YAXPEAH@Z"(i32*)
+
+attributes #0 = { noinline optnone }
+attributes #2 = { noreturn }
diff --git a/test/CodeGen/AArch64/xor.ll b/test/CodeGen/AArch64/xor.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f7cd8bf0c4c83c2fe4fdae826406dd8a88f8a218
--- /dev/null
+++ b/test/CodeGen/AArch64/xor.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+
+define i32 @PR39657(i8* %p, i64 %x) {
+; CHECK-LABEL: PR39657:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x8, x1
+; CHECK-NEXT:    ldr w0, [x0, x8, lsl #2]
+; CHECK-NEXT:    ret
+  %sh = shl i64 %x, 2
+  %mul = xor i64 %sh, -4
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 %mul
+  %bc = bitcast i8* %add.ptr to i32*
+  %load = load i32, i32* %bc, align 4
+  ret i32 %load
+}
+
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir
index 3f8d58522caa4bbf0c26de828bde7809184e3888..57e1148bc59eefbce645dcac8a2a43023a156fb8 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir
@@ -1,16 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_add() { ret void }
-...
-
 ---
 name:            test_add
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
@@ -19,8 +11,8 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
-    %0(s32) = COPY $vgpr0
-    %1(s32) = COPY $vgpr1
-    %2(s32) = G_ADD %0, %1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_ADD %0, %1
     $vgpr0 = COPY %2
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
index 0282ff40837d6b5984d574859ba7040c8cb44a8a..f561cc7d092d6fae8ad64c306f850b86284a27f5 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
@@ -1,16 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_and() { ret void }
-...
-
 ---
 name:            test_and
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
@@ -19,8 +11,8 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[COPY1]]
-    %0(s32) = COPY $vgpr0
-    %1(s32) = COPY $vgpr1
-    %2(s32) = G_AND %0, %1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_AND %0, %1
     $vgpr0 = COPY %2
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
index 71a9de8e6bfbc009d70630dc6e723e7804585c2a..59b5ac993ce47fc2c2708c0bc170eec6bd79c90b 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
@@ -3,10 +3,6 @@
 
 ---
 name:            test_ashr
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0.entry:
     liveins: $vgpr0, $vgpr1
@@ -15,8 +11,8 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[COPY1]]
-    %0(s32) = COPY $vgpr0
-    %1(s32) = COPY $vgpr1
-    %2(s32) = G_ASHR %0, %1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_ASHR %0, %1
     $vgpr0 = COPY %2
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir
index 5bf39e38c2cdaf78759d9677f6dee83cc33904d6..492da48a93f0913aeb20395be257f8b245eaa0ca 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir
@@ -1,16 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_bitcast() { ret void }
-...
-
 ---
 name:            test_bitcast
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0:
     liveins: $vgpr0
@@ -19,8 +11,8 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY]](s32)
     ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>)
-    %0(s32) = COPY $vgpr0
-    %1(<2 x s16>) = G_BITCAST %0
-    %2(s32) = G_BITCAST %1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(<2 x s16>) = G_BITCAST %0
+    %2:_(s32) = G_BITCAST %1
     $vgpr0 = COPY %2
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-block-addr.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-block-addr.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c532d117e07b3e05d52a98d0b46d84eb42cd5f02
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-block-addr.mir
@@ -0,0 +1,28 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+
+  @addr = global i8* null
+
+  define void @test_blockaddress() {
+    store i8* blockaddress(@test_blockaddress, %block), i8** @addr
+    indirectbr i8* blockaddress(@test_blockaddress, %block), [label %block]
+
+  block:
+    ret void
+  }
+
+...
+---
+name:            test_blockaddress
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    ; CHECK-LABEL: name: test_blockaddress
+    ; CHECK: [[BLOCK_ADDR:%[0-9]+]]:_(p0) = G_BLOCK_ADDR blockaddress(@test_blockaddress, %ir-block.block)
+    ; CHECK: S_ENDPGM implicit [[BLOCK_ADDR]](p0)
+    %0:_(p0) = G_BLOCK_ADDR blockaddress(@test_blockaddress, %ir-block.block)
+    S_ENDPGM implicit %0
+
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
index 194d14afe24a6b5bf8900da471dc206ffc7b21cd..ec609f296391f119fa0eb49a2be7235a95e43279 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir
@@ -58,12 +58,12 @@ body: |
     liveins: $vgpr0
     ; CHECK-LABEL: name: extract_vector_elt_0_v5i32
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK: [[MV:%[0-9]+]]:_(<5 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<5 x s32>), [[C]](s32)
     ; CHECK: $vgpr0 = COPY [[EVEC]](s32)
     %0:_(s32) = COPY $vgpr0
-    %1:_(<5 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0
+    %1:_(<5 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0
     %2:_(s32) = G_CONSTANT i32 0
     %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2
     $vgpr0 = COPY %3
@@ -77,12 +77,12 @@ body: |
     liveins: $vgpr0
     ; CHECK-LABEL: name: extract_vector_elt_0_v6i32
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK: [[MV:%[0-9]+]]:_(<6 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<6 x s32>), [[C]](s32)
     ; CHECK: $vgpr0 = COPY [[EVEC]](s32)
     %0:_(s32) = COPY $vgpr0
-    %1:_(<6 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0, %0
+    %1:_(<6 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0, %0
     %2:_(s32) = G_CONSTANT i32 0
     %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2
     $vgpr0 = COPY %3
@@ -96,12 +96,12 @@ body: |
     liveins: $vgpr0
     ; CHECK-LABEL: name: extract_vector_elt_0_v7i32
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK: [[MV:%[0-9]+]]:_(<7 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<7 x s32>), [[C]](s32)
     ; CHECK: $vgpr0 = COPY [[EVEC]](s32)
     %0:_(s32) = COPY $vgpr0
-    %1:_(<7 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0, %0, %0
+    %1:_(<7 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0, %0, %0
     %2:_(s32) = G_CONSTANT i32 0
     %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2
     $vgpr0 = COPY %3
@@ -115,12 +115,12 @@ body: |
     liveins: $vgpr0
     ; CHECK-LABEL: name: extract_vector_elt_0_v8i32
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK: [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<8 x s32>), [[C]](s32)
     ; CHECK: $vgpr0 = COPY [[EVEC]](s32)
     %0:_(s32) = COPY $vgpr0
-    %1:_(<8 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0, %0, %0, %0
+    %1:_(<8 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0, %0, %0, %0
     %2:_(s32) = G_CONSTANT i32 0
     %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2
     $vgpr0 = COPY %3
@@ -134,12 +134,12 @@ body: |
     liveins: $vgpr0
     ; CHECK-LABEL: name: extract_vector_elt_0_v16i32
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK: [[MV:%[0-9]+]]:_(<16 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<16 x s32>), [[C]](s32)
     ; CHECK: $vgpr0 = COPY [[EVEC]](s32)
     %0:_(s32) = COPY $vgpr0
-    %1:_(<16 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0
+    %1:_(<16 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0
     %2:_(s32) = G_CONSTANT i32 0
     %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2
     $vgpr0 = COPY %3
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir
new file mode 100644
index 0000000000000000000000000000000000000000..31f1e41775fe73bc48c788c2b378d53d692cd958
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir
@@ -0,0 +1,25 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=legalizer -global-isel %s -o - | FileCheck  %s
+
+---
+name: test_fabs_f32
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fabs_f32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FABS %0
+...
+---
+name: test_fabs_f64
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fabs_f64
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_FABS %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir
index 1a66bbabe8cf8c82fa70a0eabed2b6c5f4badb41..85ed234bf47fb51790d14ad8fcbcad6293ff2e29 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir
@@ -1,27 +1,26 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_fadd() {
-  entry:
-    ret void
-  }
-
-...
-
 ---
-name:            test_fadd
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
+name: test_fadd_f32
 body: |
   bb.0.entry:
     liveins: $vgpr0, $vgpr1
     ; CHECK-LABEL: name: test_fadd
     ; CHECK: %2:_(s32) = G_FADD %0, %1
 
-    %0(s32) = COPY $vgpr0
-    %1(s32) = COPY $vgpr1
-    %2(s32) = G_FADD %0, %1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FADD %0, %1
     $vgpr0 = COPY %2
 ...
+---
+name: test_fadd_f64
+body: |
+  bb.0.entry:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s64) = G_FADD %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir
index bcd372e05873f47439adfb1d437e7185a8c250b4..c09903f25730767a8b278415c455e441168bc204 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir
@@ -3,33 +3,25 @@
 
 ---
 name: test_fcmp_f32
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0:
     liveins: $vgpr0
     ; CHECK-LABEL: name: test_fcmp_f32
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    %0(s32) = G_CONSTANT i32 0
-    %1(s32) = COPY $vgpr0
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = COPY $vgpr0
 
-    %2(s1) = G_FCMP floatpred(uge), %0, %1
+    %2:_(s1) = G_FCMP floatpred(uge), %0, %1
 ...
 ---
 name: test_fcmp_f64
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1
     ; CHECK-LABEL: name: test_fcmp_f64
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
-    %0(s64) = G_CONSTANT i64 0
-    %1(s64) = COPY $vgpr0_vgpr1
+    %0:_(s64) = G_CONSTANT i64 0
+    %1:_(s64) = COPY $vgpr0_vgpr1
 
-    %2(s1) = G_FCMP floatpred(uge), %0, %1
+    %2:_(s1) = G_FCMP floatpred(uge), %0, %1
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir
new file mode 100644
index 0000000000000000000000000000000000000000..954859e8cdaf309916a93b0182a8910b9911bdf6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: test_fma_f32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fma
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = G_FMA %0, %1, %2
+    $vgpr0 = COPY %3
+...
+---
+name: test_fma_f64
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3,  $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: test_fma
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; CHECK: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[COPY]], [[COPY1]]
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s64) = COPY $vgpr4_vgpr5
+    %3:_(s64) = G_FMA %0, %1, %2
+    $vgpr0_vgpr1 = COPY %3
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir
index e68b1bed5fb49efff64876b4abdf8cb5f6b996ee..8a0ce1c6e91a5c69b3010b17ddde4e00695977e7 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir
@@ -1,16 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_fmul() { ret void }
-...
-
 ---
-name:            test_fmul
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
+name: test_fmul_f32
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
@@ -19,8 +11,23 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
-    %0(s32) = COPY $vgpr0
-    %1(s32) = COPY $vgpr1
-    %2(s32) = G_FMUL %0, %1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FMUL %0, %1
     $vgpr0 = COPY %2
 ...
+---
+name: test_fmul_f64
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: test_fmul
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; CHECK: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[COPY1]]
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s64) = G_FMUL %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir
new file mode 100644
index 0000000000000000000000000000000000000000..7d280f7f9c11781490a514e78cb0efbe0717a51c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir
@@ -0,0 +1,25 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=legalizer -global-isel %s -o - | FileCheck  %s
+
+---
+name: test_fneg_f32
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fneg_f32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FNEG %0
+...
+---
+name: test_fneg_f64
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fneg_f64
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_FNEG %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir
index d83eb1ae2a8da40578d78e2d6a7b1edb924782d5..aede6af2603590517f71fe17103590369c327e49 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir
@@ -1,22 +1,14 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_fptoui() { ret void }
-...
-
 ---
 name:            test_fptoui
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0:
     liveins: $vgpr0
     ; CHECK-LABEL: name: test_fptoui
     ; CHECK: %1:_(s32) = G_FPTOUI %0
 
-    %0(s32) = COPY $vgpr0
-    %1(s32) = G_FPTOUI %0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FPTOUI %0
     $vgpr0 = COPY %1
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir
new file mode 100644
index 0000000000000000000000000000000000000000..13012f459ed3bfb245fab07f8d2cf1cae3548e8a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: test_fsub_f32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fsub_f32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY]], [[COPY1]]
+    ; CHECK: $vgpr0 = COPY [[FSUB]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FSUB %0, %1
+    $vgpr0 = COPY %2
+...
+---
+name: test_fsub_f64
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: test_fsub_f64
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; CHECK: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY1]]
+    ; CHECK: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[COPY]], [[FNEG]]
+    ; CHECK: $vgpr0_vgpr1 = COPY [[FADD]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s64) = G_FSUB %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir
index 0549c685ce46442abc154b98fc75e98aaf4ec759..00c0c96335074a36be17ebf207cfba35f1303e49 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir
@@ -1,19 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -O0 -march=amdgcn -mcpu=fiji  -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_icmp() {
-  entry:
-    ret void
-  }
-...
-
 ---
 name:            test_icmp
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0.entry:
     liveins: $vgpr0
@@ -23,9 +12,9 @@ body: |
     ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[C]](s32), [[COPY]]
     ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C]], [[COPY]]
     ; CHECK: $vgpr0 = COPY [[SELECT]](s32)
-    %0(s32) = G_CONSTANT i32 0
-    %1(s32) = COPY $vgpr0
-    %2(s1) = G_ICMP intpred(ne), %0, %1
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s1) = G_ICMP intpred(ne), %0, %1
     %3:_(s32) = G_SELECT %2(s1), %0(s32), %1(s32)
     $vgpr0 = COPY %3
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir
similarity index 86%
rename from test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir
rename to test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir
index c9d8a071a90dc80a1a4c6f864e1b15ea6864a35b..df966affcf264f51cc43c335516dff4827113ec0 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir
@@ -23,11 +23,11 @@ body: |
     ; CHECK-LABEL: name: test_merge_s32_s32_v2s32
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK: [[MV:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32)
     ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](<2 x s32>)
     %0:_(s32) = G_CONSTANT i32 0
     %1:_(s32) = G_CONSTANT i32 1
-    %2:_(<2 x s32>) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+    %2:_(<2 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32)
     $vgpr0_vgpr1 = COPY %2(<2 x s32>)
 ...
 
@@ -39,12 +39,12 @@ body: |
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; CHECK: [[MV:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[C]](s32), [[C1]](s32), [[C2]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32)
     ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](<3 x s32>)
     %0:_(s32) = G_CONSTANT i32 0
     %1:_(s32) = G_CONSTANT i32 1
     %2:_(s32) = G_CONSTANT i32 2
-    %3:_(<3 x s32>) = G_MERGE_VALUES %0:_(s32), %1:_(s32), %2:_(s32)
+    %3:_(<3 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %2:_(s32)
     $vgpr0_vgpr1_vgpr2 = COPY %3(<3 x s32>)
 ...
 
@@ -55,11 +55,11 @@ body: |
     ; CHECK-LABEL: name: test_merge_s64_s64_s128
     ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; CHECK: [[MV:%[0-9]+]]:_(<2 x s64>) = G_MERGE_VALUES [[C]](s64), [[C1]](s64)
+    ; CHECK: [[MV:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64)
     ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](<2 x s64>)
     %0:_(s64) = G_CONSTANT i64 0
     %1:_(s64) = G_CONSTANT i64 1
-    %2:_(<2 x s64>) = G_MERGE_VALUES %0(s64), %1(s64)
+    %2:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2(<2 x s64>)
 ...
 
@@ -72,13 +72,13 @@ body: |
     ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
-    ; CHECK: [[MV:%[0-9]+]]:_(<4 x s64>) = G_MERGE_VALUES [[C]](s64), [[C1]](s64), [[C2]](s64), [[C3]](s64)
+    ; CHECK: [[MV:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64), [[C2]](s64), [[C3]](s64)
     ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[MV]](<4 x s64>)
     %0:_(s64) = G_CONSTANT i64 0
     %1:_(s64) = G_CONSTANT i64 1
     %2:_(s64) = G_CONSTANT i64 2
     %3:_(s64) = G_CONSTANT i64 3
-    %4:_(<4 x s64>) = G_MERGE_VALUES %0(s64), %1(s64), %2(s64), %3(s64)
+    %4:_(<4 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64), %2(s64), %3(s64)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %4(<4 x s64>)
 ...
 
@@ -109,6 +109,6 @@ body: |
 
 #     %16:_(s32) = G_CONSTANT i32 16
 
-#     %17:_(<17 x s32>) = G_MERGE_VALUES %0:_(s32), %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32), %6:_(s32), %7:_(s32), %8:_(s32), %9:_(s32), %10:_(s32), %11:_(s32), %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32), %16:_(s32)
+#     %17:_(<17 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32), %6:_(s32), %7:_(s32), %8:_(s32), %9:_(s32), %10:_(s32), %11:_(s32), %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32), %16:_(s32)
 #     S_ENDPGM implicit %17(<17 x s32>)
 # ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir
index 5b176e408d27d78aa5ac8bed78501fcfe5f6ac2b..6e988dabe172f278923f2c64c9bd694a2df71e8a 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir
@@ -1,15 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_or() { ret void }
-...
 ---
 name:            test_or
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
@@ -18,8 +11,8 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[COPY1]]
-    %0(s32) = COPY $vgpr0
-    %1(s32) = COPY $vgpr1
-    %2(s32) = G_OR %0, %1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_OR %0, %1
     $vgpr0 = COPY %2
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir
index ec223539cfc5170299886854a7971200178023fd..4c04612240e81435c5866b296683611fc1a0edeb 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir
@@ -1,19 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -O0 -march=amdgcn -mcpu=fiji  -run-pass=legalizer %s -o - | FileCheck %s
 
---- |
-  define void @test_select() { ret void }
-...
-
 ---
 name:            test_select
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
 body: |
   bb.0:
     liveins: $vgpr0
@@ -24,13 +13,13 @@ body: |
     ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[C2]]
-    %0(s32) = G_CONSTANT i32 0
-    %1(s32) = COPY $vgpr0
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = COPY $vgpr0
 
-    %2(s1) = G_ICMP intpred(ne), %0, %1
-    %3(s32) = G_CONSTANT i32 1
-    %4(s32) = G_CONSTANT i32 2
-    %5(s32) = G_SELECT %2, %3, %4
+    %2:_(s1) = G_ICMP intpred(ne), %0, %1
+    %3:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_CONSTANT i32 2
+    %5:_(s32) = G_SELECT %2, %3, %4
     $vgpr0 = COPY %5
 
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
index 5d2ec5f7fb6243115bfc4ae3a5ed81c657db8d85..cd24d48a0dbe4f51b668a3d81f51ecd3020fd8f6 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
@@ -3,10 +3,6 @@
 
 ---
 name:            test_shl
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body: |
   bb.0.entry:
     liveins: $vgpr0, $vgpr1
@@ -15,8 +11,8 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[COPY1]]
-    %0(s32) = COPY $vgpr0
-    %1(s32) = COPY $vgpr1
-    %2(s32) = G_SHL %0, %1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_SHL %0, %1
     $vgpr0 = COPY %2
 ...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
index 1f4a21b73561c327dfe47388310c3408e9506367..1e004ac478b3e21e0f4bfb58098eb006eebaa044 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
@@ -16,18 +16,3 @@ body: |
     $vgpr2 = COPY %2(s32)
 ...
 
----
-name: test_unmerge_v2s32_s32
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1
-    ; CHECK-LABEL: name: test_unmerge_v2s32_s32
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; CHECK: $vgpr0 = COPY [[UV]](s32)
-    ; CHECK: $vgpr2 = COPY [[UV1]](s32)
-    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0:_(<2 x s32>)
-    $vgpr0 = COPY %1(s32)
-    $vgpr2 = COPY %2(s32)
-...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
new file mode 100644
index 0000000000000000000000000000000000000000..828e5e41e2ebccb15e0812431a8876367f429143
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: anyext_i32_to_i64_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: anyext_i32_to_i64_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[SEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[COPY]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s64) = G_ANYEXT %0
+...
+
+---
+name: anyext_i32_to_i64_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: anyext_i32_to_i64_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[SEXT:%[0-9]+]]:vgpr(s64) = G_ANYEXT [[COPY]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s64) = G_ANYEXT %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir
new file mode 100644
index 0000000000000000000000000000000000000000..10a9bea6a8743b1c3d27503dd3184ecaf3cb233e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir
@@ -0,0 +1,29 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass=regbankselect %s -o - | FileCheck %s
+
+--- |
+
+  @addr = global i8* null
+
+  define void @test_blockaddress() {
+    store i8* blockaddress(@test_blockaddress, %block), i8** @addr
+    indirectbr i8* blockaddress(@test_blockaddress, %block), [label %block]
+
+  block:                                            ; preds = %0
+    ret void
+  }
+
+...
+---
+name:            test_blockaddress
+alignment:       4
+legalized: true
+body:             |
+  bb.1 (%ir-block.0):
+    ; CHECK-LABEL: name: test_blockaddress
+    ; CHECK: [[BLOCK_ADDR:%[0-9]+]]:sgpr(p0) = G_BLOCK_ADDR blockaddress(@test_blockaddress, %ir-block.block)
+    ; CHECK: S_ENDPGM implicit [[BLOCK_ADDR]](p0)
+    %0:_(p0) = G_BLOCK_ADDR blockaddress(@test_blockaddress, %ir-block.block)
+    S_ENDPGM implicit %0
+
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir
new file mode 100644
index 0000000000000000000000000000000000000000..9850b87959af380d36c8da2b9bb91e7fd5c90bec
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: bswap_i32_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: bswap_i32_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[BSWAP:%[0-9]+]]:sgpr(s32) = G_BSWAP [[COPY]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_BSWAP %0
+...
+
+---
+name: bswap_i32_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: bswap_i32_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[BSWAP:%[0-9]+]]:vgpr(s32) = G_BSWAP [[COPY]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_BSWAP %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
new file mode 100644
index 0000000000000000000000000000000000000000..19b64b93499beb8eb0a1e73acecdb247f54b3197
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: ctlz_zero_undef_i32_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: ctlz_zero_undef_i32_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:sgpr(s32) = G_CTLZ_ZERO_UNDEF [[COPY]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_CTLZ_ZERO_UNDEF %0
+...
+
+---
+name: ctlz_zero_undef_i32_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: ctlz_zero_undef_i32_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[COPY]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CTLZ_ZERO_UNDEF %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz.mir
new file mode 100644
index 0000000000000000000000000000000000000000..441bc894f2bab43d71256959bc63ff9551e6b3d9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: ctlz_i32_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: ctlz_i32_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[CTLZ:%[0-9]+]]:sgpr(s32) = G_CTLZ [[COPY]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_CTLZ %0
+...
+
+---
+name: ctlz_i32_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: ctlz_i32_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[CTLZ:%[0-9]+]]:vgpr(s32) = G_CTLZ [[COPY]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CTLZ %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctpop.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctpop.mir
new file mode 100644
index 0000000000000000000000000000000000000000..e4694371805aa176863cd19677282fdf505d2ee1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctpop.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: ctpop_i32_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: ctpop_i32_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[CTPOP:%[0-9]+]]:sgpr(s32) = G_CTPOP [[COPY]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_CTPOP %0
+...
+
+---
+name: ctpop_i32_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: ctpop_i32_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[CTPOP:%[0-9]+]]:vgpr(s32) = G_CTPOP [[COPY]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CTPOP %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
new file mode 100644
index 0000000000000000000000000000000000000000..127c7fb8a84ec48592e4a47d734e9d5645ff35ca
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: cttz_zero_undef_i32_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: cttz_zero_undef_i32_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:sgpr(s32) = G_CTTZ_ZERO_UNDEF [[COPY]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_CTTZ_ZERO_UNDEF %0
+...
+
+---
+name: cttz_zero_undef_i32_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: cttz_zero_undef_i32_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[COPY]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CTTZ_ZERO_UNDEF %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c75d89bdaffbac34101a10e631b77faba5037d69
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: cttz_i32_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: cttz_i32_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[CTTZ:%[0-9]+]]:sgpr(s32) = G_CTTZ [[COPY]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_CTTZ %0
+...
+
+---
+name: cttz_i32_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: cttz_i32_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[CTTZ:%[0-9]+]]:vgpr(s32) = G_CTTZ [[COPY]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CTTZ %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c48d10191d57c40b60003f6584c1317c2a6c6248
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: fabs_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: fabs_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[FABS:%[0-9]+]]:sgpr(s32) = G_FABS [[COPY]]
+    ; CHECK: $vgpr0 = COPY [[FABS]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_FABS %0
+    $vgpr0 = COPY %1
+...
+
+---
+name: fabs_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: fabs_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[FABS:%[0-9]+]]:vgpr(s32) = G_FABS [[COPY]]
+    ; CHECK: $vgpr0 = COPY [[FABS]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FABS %0
+    $vgpr0 = COPY %1
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir
new file mode 100644
index 0000000000000000000000000000000000000000..3f0dc2237dca154d3f53e2d89afb06b81ea18f12
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir
@@ -0,0 +1,148 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: fma_sss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    ; CHECK-LABEL: name: fma_sss
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY3]], [[COPY4]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s32) = G_FMA %0, %1, %2
+...
+---
+name: fma_vss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: fma_vss
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY3]], [[COPY4]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr0
+    %2:_(s32) = COPY $sgpr1
+    %3:_(s32) = G_FMA %0, %1, %2
+...
+---
+name: fma_svs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0, $sgpr1
+    ; CHECK-LABEL: name: fma_svs
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY3]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s32) = COPY $sgpr1
+    %3:_(s32) = G_FMA %0, %1, %2
+...
+---
+name: fma_ssv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $vgpr0
+    ; CHECK-LABEL: name: fma_ssv
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY3]], [[COPY2]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $vgpr0
+    %3:_(s32) = G_FMA %0, %1, %2
+...
+---
+name: fma_vvs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0
+    ; CHECK-LABEL: name: fma_vvs
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
+    ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY3]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $sgpr0
+    %3:_(s32) = G_FMA %0, %1, %2
+...
+---
+name: fma_vsv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0, $vgpr1
+    ; CHECK-LABEL: name: fma_vsv
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY3]], [[COPY2]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $vgpr1
+    %3:_(s32) = G_FMA %0, %1, %2
+...
+---
+name: fma_svv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: fma_svv
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s32) = COPY $vgpr1
+    %3:_(s32) = G_FMA %0, %1, %2
+...
+---
+name: fma_vvv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: fma_vvv
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+    ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = G_FMA %0, %1, %2
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir
new file mode 100644
index 0000000000000000000000000000000000000000..3438275fbe38d371bb957c13c6efe6b17e8dd907
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: fneg_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: fneg_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[FNEG:%[0-9]+]]:sgpr(s32) = G_FNEG [[COPY]]
+    ; CHECK: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_FNEG %0
+    $vgpr0 = COPY %1
+...
+
+---
+name: fneg_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: fneg_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[FNEG:%[0-9]+]]:vgpr(s32) = G_FNEG [[COPY]]
+    ; CHECK: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FNEG %0
+    $vgpr0 = COPY %1
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
new file mode 100644
index 0000000000000000000000000000000000000000..77a444d6d14e9264b457c615852d97d6c578937c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
@@ -0,0 +1,23 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+--- |
+  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+  define void @test_frame_index_p5() {
+      %ptr0 = alloca i32, addrspace(5)
+     ret void
+    }
+...
+---
+name: test_frame_index_p5
+legalized:       true
+stack:
+  - { id: 0, name: ptr0, offset: 0, size: 4, alignment: 4 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: test_frame_index_p5
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0.ptr0
+    %0:_(p5) = G_FRAME_INDEX %stack.0.ptr0
+
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fsub.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fsub.mir
new file mode 100644
index 0000000000000000000000000000000000000000..a924e5ee49769ea7a72e93eb8679dee4b854e556
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fsub.mir
@@ -0,0 +1,69 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: fsub_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: fsub_ss
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[FSUB:%[0-9]+]]:vgpr(s32) = G_FSUB [[COPY]], [[COPY2]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = G_FSUB %0, %1
+...
+
+---
+name: fsub_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: fsub_sv
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[FSUB:%[0-9]+]]:vgpr(s32) = G_FSUB [[COPY]], [[COPY1]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s32) = G_FSUB %0, %1
+...
+
+---
+name: fsub_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: fsub_vs
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[FSUB:%[0-9]+]]:vgpr(s32) = G_FSUB [[COPY]], [[COPY2]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr0
+    %2:_(s32) = G_FSUB %0, %1
+...
+
+---
+name: fsub_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: fsub_vv
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[FSUB:%[0-9]+]]:vgpr(s32) = G_FSUB [[COPY]], [[COPY1]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FSUB %0, %1
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
new file mode 100644
index 0000000000000000000000000000000000000000..83d8994d0fe310eaf6cd8d135160d76427e39e36
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: ptrtoint_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: ptrtoint_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK: [[PTRTOINT:%[0-9]+]]:sgpr(p4) = G_PTRTOINT [[COPY]](s64)
+    %0:_(s64) = COPY $sgpr0_sgpr1
+    %1:_(p4) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: ptrtoint_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[PTRTOINT:%[0-9]+]]:vgpr(p0) = G_PTRTOINT [[COPY]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(p0) = G_PTRTOINT %0
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
new file mode 100644
index 0000000000000000000000000000000000000000..9c3e2f218d2c45a2ce120dddc3ce08264ea8d419
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: zext_i32_to_i64_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: zext_i32_to_i64_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[SEXT:%[0-9]+]]:sgpr(s64) = G_SEXT [[COPY]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s64) = G_SEXT %0
+...
+
+---
+name: zext_i32_to_i64_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: zext_i32_to_i64_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[SEXT:%[0-9]+]]:vgpr(s64) = G_SEXT [[COPY]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s64) = G_SEXT %0
+...
diff --git a/test/CodeGen/AMDGPU/add3.ll b/test/CodeGen/AMDGPU/add3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..35055190b348e628033c1489105a425b31a20afb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/add3.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+
+; ===================================================================================
+; V_ADD3_U32
+; ===================================================================================
+
+define amdgpu_ps float @add3(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: add3:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add3_u32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+; ThreeOp instruction variant not used due to Constant Bus Limitations
+; TODO: with reassociation it is possible to replace a v_add_u32_e32 with a s_add_i32
+define amdgpu_ps float @add3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
+; VI-LABEL: add3_vgpr_b:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s3, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add3_vgpr_b:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @add3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: add3_vgpr_all2:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add3_vgpr_all2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add3_u32 v0, v1, v2, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %b, %c
+  %result = add i32 %a, %x
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @add3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
+; VI-LABEL: add3_vgpr_bc:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add3_vgpr_bc:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add3_u32 v0, s2, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @add3_vgpr_const(i32 %a, i32 %b) {
+; VI-LABEL: add3_vgpr_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add3_vgpr_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add3_u32 v0, v0, v1, 16
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, %b
+  %result = add i32 %x, 16
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps <2 x float> @add3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x) {
+; VI-LABEL: add3_multiuse_outer:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mul_lo_i32 v1, v0, v3
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add3_multiuse_outer:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add3_u32 v0, v0, v1, v2
+; GFX9-NEXT:    v_mul_lo_i32 v1, v0, v3
+; GFX9-NEXT:    ; return to shader part epilog
+  %inner = add i32 %a, %b
+  %outer = add i32 %inner, %c
+  %x1 = mul i32 %outer, %x
+  %r1 = insertelement <2 x i32> undef, i32 %outer, i32 0
+  %r0 = insertelement <2 x i32> %r1, i32 %x1, i32 1
+  %bc = bitcast <2 x i32> %r0 to <2 x float>
+  ret <2 x float> %bc
+}
+
+define amdgpu_ps <2 x float> @add3_multiuse_inner(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: add3_multiuse_inner:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add3_multiuse_inner:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v0, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %inner = add i32 %a, %b
+  %outer = add i32 %inner, %c
+  %r1 = insertelement <2 x i32> undef, i32 %inner, i32 0
+  %r0 = insertelement <2 x i32> %r1, i32 %outer, i32 1
+  %bc = bitcast <2 x i32> %r0 to <2 x float>
+  ret <2 x float> %bc
+}
+
+; A case where uniform values end up in VGPRs -- we could use v_add3_u32 here,
+; but we don't.
+define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) {
+; VI-LABEL: add3_uniform_vgpr:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; VI-NEXT:    v_add_f32_e64 v0, s2, 1.0
+; VI-NEXT:    v_add_f32_e64 v1, s3, 2.0
+; VI-NEXT:    v_add_f32_e32 v2, s4, v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add3_uniform_vgpr:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GFX9-NEXT:    v_add_f32_e64 v0, s2, 1.0
+; GFX9-NEXT:    v_add_f32_e64 v1, s3, 2.0
+; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %a1 = fadd float %a, 1.0
+  %b2 = fadd float %b, 2.0
+  %c3 = fadd float %c, 3.0
+  %bc.a = bitcast float %a1 to i32
+  %bc.b = bitcast float %b2 to i32
+  %bc.c = bitcast float %c3 to i32
+  %x = add i32 %bc.a, %bc.b
+  %result = add i32 %x, %bc.c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
diff --git a/test/CodeGen/AMDGPU/add_shl.ll b/test/CodeGen/AMDGPU/add_shl.ll
new file mode 100644
index 0000000000000000000000000000000000000000..88ae042e86726785482801232971c458f9bee14a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/add_shl.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+
+; ===================================================================================
+; V_ADD_LSHL_U32
+; ===================================================================================
+
+define amdgpu_ps float @add_shl(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: add_shl:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add_shl:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add_lshl_u32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, %b
+  %result = shl i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @add_shl_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) {
+; VI-LABEL: add_shl_vgpr_c:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s2, s2, s3
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add_shl_vgpr_c:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, %b
+  %result = shl i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @add_shl_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) {
+; VI-LABEL: add_shl_vgpr_ac:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add_shl_vgpr_ac:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add_lshl_u32 v0, v0, s2, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, %b
+  %result = shl i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) {
+; VI-LABEL: add_shl_vgpr_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add_shl_vgpr_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add_lshl_u32 v0, v0, v1, 9
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, %b
+  %result = shl i32 %x, 9
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
+; VI-LABEL: add_shl_vgpr_const_inline_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7e800, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add_shl_vgpr_const_inline_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e800
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 9, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, 1012
+  %result = shl i32 %x, 9
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+; TODO: Non-optimal code generation because SelectionDAG combines
+;       (shl (add x, CONST), y) ---> (add (shl x, y), CONST').
+;
+define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) {
+; VI-LABEL: add_shl_vgpr_inline_const_x2:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x600, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: add_shl_vgpr_inline_const_x2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x600
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 9, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = add i32 %a, 3
+  %result = shl i32 %x, 9
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
diff --git a/test/CodeGen/AMDGPU/addrspacecast-captured.ll b/test/CodeGen/AMDGPU/addrspacecast-captured.ll
index 138bc36b9e1b8bcaee5f4fb6516ec732841e3889..64ff0e5fac82a412f50b07c345832bbfa625e9f2 100644
--- a/test/CodeGen/AMDGPU/addrspacecast-captured.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast-captured.ll
@@ -1,45 +1,46 @@
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
-
 ; Nothing should be done if the addrspacecast is captured.
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 declare void @consume_ptr2int(i32) #0
 
 ; CHECK-LABEL: @addrspacecast_captured(
-; CHECK: %data = alloca i32, align 4
-; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
-; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
+; CHECK: %data = alloca i32, align 4, addrspace(5)
+; CHECK: %cast = addrspacecast i32 addrspace(5)* %data to i32*
+; CHECK: %ptr2int = ptrtoint i32* %cast to i32
 ; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out
 define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
 entry:
-  %data = alloca i32, align 4
-  %cast = addrspacecast i32* %data to i32 addrspace(4)*
-  %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
+  %data = alloca i32, align 4, addrspace(5)
+  %cast = addrspacecast i32 addrspace(5)* %data to i32*
+  %ptr2int = ptrtoint i32* %cast to i32
   store i32 %ptr2int, i32 addrspace(1)* %out
   ret void
 }
 
 ; CHECK-LABEL: @addrspacecast_captured_store(
-; CHECK: %data = alloca i32, align 4
-; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
-; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
-define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
+; CHECK: %data = alloca i32, align 4, addrspace(5)
+; CHECK: %cast = addrspacecast i32 addrspace(5)* %data to i32*
+; CHECK: store i32* %cast, i32* addrspace(1)* %out
+define amdgpu_kernel void @addrspacecast_captured_store(i32* addrspace(1)* %out) #0 {
 entry:
-  %data = alloca i32, align 4
-  %cast = addrspacecast i32* %data to i32 addrspace(4)*
-  store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
+  %data = alloca i32, align 4, addrspace(5)
+  %cast = addrspacecast i32 addrspace(5)* %data to i32*
+  store i32* %cast, i32* addrspace(1)* %out
   ret void
 }
 
 ; CHECK-LABEL: @addrspacecast_captured_call(
-; CHECK: %data = alloca i32, align 4
-; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
-; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
+; CHECK: %data = alloca i32, align 4, addrspace(5)
+; CHECK: %cast = addrspacecast i32 addrspace(5)* %data to i32*
+; CHECK: %ptr2int = ptrtoint i32* %cast to i32
 ; CHECK: call void @consume_ptr2int(i32 %ptr2int)
 define amdgpu_kernel void @addrspacecast_captured_call() #0 {
 entry:
-  %data = alloca i32, align 4
-  %cast = addrspacecast i32* %data to i32 addrspace(4)*
-  %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
+  %data = alloca i32, align 4, addrspace(5)
+  %cast = addrspacecast i32 addrspace(5)* %data to i32*
+  %ptr2int = ptrtoint i32* %cast to i32
   call void @consume_ptr2int(i32 %ptr2int)
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index 95bbe958e9321ee64bd59e2f4f8bf84e85aaf87e..ea40cda4fa6bed1736133c62d0936af86e40cfc8 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s
 
 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
 ; HSA: enable_sgpr_private_segment_buffer = 1
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 023c19915c7a175e441dab83e9301b34b0ad0187..199a96c64435ca7abbb1746abd0ad0005a39f48f 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -1,10 +1,10 @@
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
 
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s
 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s
diff --git a/test/CodeGen/AMDGPU/and_or.ll b/test/CodeGen/AMDGPU/and_or.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c8ff582976c643b8a4d2ab97ccab0b2b5a27fc7f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/and_or.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
+;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+
+; ===================================================================================
+; V_AND_OR_B32
+; ===================================================================================
+
+define amdgpu_ps float @and_or(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: and_or:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_and_b32_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: and_or:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = and i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+; ThreeOp instruction variant not used due to Constant Bus Limitations
+define amdgpu_ps float @and_or_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
+; VI-LABEL: and_or_vgpr_b:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_and_b32_e32 v0, s2, v0
+; VI-NEXT:    v_or_b32_e32 v0, s3, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: and_or_vgpr_b:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = and i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @and_or_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
+; VI-LABEL: and_or_vgpr_ab:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_and_b32_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v0, s2, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: and_or_vgpr_ab:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = and i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @and_or_vgpr_const(i32 %a, i32 %b) {
+; VI-LABEL: and_or_vgpr_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_and_b32_e32 v0, 4, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: and_or_vgpr_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_or_b32 v0, v0, 4, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = and i32 4, %a
+  %result = or i32 %x, %b
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @and_or_vgpr_const_inline_const(i32 %a) {
+; VI-LABEL: and_or_vgpr_const_inline_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_and_b32_e32 v0, 20, v0
+; VI-NEXT:    v_or_b32_e32 v0, 0x808, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: and_or_vgpr_const_inline_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x808
+; GFX9-NEXT:    v_and_or_b32 v0, v0, 20, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = and i32 20, %a
+  %result = or i32 %x, 2056
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @and_or_vgpr_inline_const_x2(i32 %a) {
+; VI-LABEL: and_or_vgpr_inline_const_x2:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_and_b32_e32 v0, 4, v0
+; VI-NEXT:    v_or_b32_e32 v0, 1, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: and_or_vgpr_inline_const_x2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_or_b32 v0, v0, 4, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = and i32 4, %a
+  %result = or i32 %x, 1
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
diff --git a/test/CodeGen/AMDGPU/andorbitset.ll b/test/CodeGen/AMDGPU/andorbitset.ll
new file mode 100644
index 0000000000000000000000000000000000000000..95bba7d54382f067a50aa51754fe248cb890938a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/andorbitset.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_clear_msb:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_clear_msb(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 2147483647
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_msb:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_set_msb(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 2147483648
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_clear_lsb:
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2
+define amdgpu_kernel void @s_clear_lsb(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 4294967294
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_lsb:
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @s_set_lsb(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 1
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_clear_midbit:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_clear_midbit(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 4294967039
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_midbit:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_set_midbit(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 256
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/andorn2.ll b/test/CodeGen/AMDGPU/andorn2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..390c103f3676a37ab17a39b790e262f53f4bf103
--- /dev/null
+++ b/test/CodeGen/AMDGPU/andorn2.ll
@@ -0,0 +1,103 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s
+
+; GCN-LABEL: {{^}}scalar_andn2_i32_one_use
+; GCN: s_andn2_b32
+define amdgpu_kernel void @scalar_andn2_i32_one_use(
+    i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+  %nb = xor i32 %b, -1
+  %r0.val = and i32 %a, %nb
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_andn2_i64_one_use
+; GCN: s_andn2_b64
+define amdgpu_kernel void @scalar_andn2_i64_one_use(
+    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+  %nb = xor i64 %b, -1
+  %r0.val = and i64 %a, %nb
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
+; GCN: s_orn2_b32
+define amdgpu_kernel void @scalar_orn2_i32_one_use(
+    i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+  %nb = xor i32 %b, -1
+  %r0.val = or i32 %a, %nb
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_orn2_i64_one_use
+; GCN: s_orn2_b64
+define amdgpu_kernel void @scalar_orn2_i64_one_use(
+    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+  %nb = xor i64 %b, -1
+  %r0.val = or i64 %a, %nb
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
+; GCN: v_not_b32
+; GCN: v_and_b32
+define amdgpu_kernel void @vector_andn2_i32_s_v_one_use(
+    i32 addrspace(1)* %r0, i32 %s) {
+entry:
+  %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %not = xor i32 %v, -1
+  %r0.val = and i32 %s, %not
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_andn2_i32_v_s_one_use
+; GCN: s_not_b32
+; GCN: v_and_b32
+define amdgpu_kernel void @vector_andn2_i32_v_s_one_use(
+    i32 addrspace(1)* %r0, i32 %s) {
+entry:
+  %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %not = xor i32 %s, -1
+  %r0.val = and i32 %v, %not
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_orn2_i32_s_v_one_use
+; GCN: v_not_b32
+; GCN: v_or_b32
+define amdgpu_kernel void @vector_orn2_i32_s_v_one_use(
+    i32 addrspace(1)* %r0, i32 %s) {
+entry:
+  %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %not = xor i32 %v, -1
+  %r0.val = or i32 %s, %not
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_orn2_i32_v_s_one_use
+; GCN: s_not_b32
+; GCN: v_or_b32
+define amdgpu_kernel void @vector_orn2_i32_v_s_one_use(
+    i32 addrspace(1)* %r0, i32 %s) {
+entry:
+  %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %not = xor i32 %s, -1
+  %r0.val = or i32 %v, %not
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/test/CodeGen/AMDGPU/andorxorinvimm.ll b/test/CodeGen/AMDGPU/andorxorinvimm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a16fd534247ec2e15566736062389ea38e7dbf98
--- /dev/null
+++ b/test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_or_to_orn2:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_or_to_orn2_imm0:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2_imm0:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor(i32 addrspace(1)* %out, i32 %in) {
+  %x = xor i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor_imm0:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = xor i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e57ce963e3c61a221b94bb111e1b94435b942708
--- /dev/null
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
@@ -0,0 +1,145 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+code-object-v3 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -amdgpu-verify-hsa-metadata -filetype=obj -mattr=+code-object-v3 -o /dev/null < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK-LABEL: {{^}}min_64_max_64:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_64_max_64() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
+
+; CHECK-LABEL: {{^}}min_64_max_128:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_64_max_128() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
+
+; CHECK-LABEL: {{^}}min_128_max_128:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_128_max_128() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
+
+; CHECK-LABEL: {{^}}min_1024_max_2048
+; CHECK: SGPRBlocks: 1
+; CHECK: VGPRBlocks: 7
+; CHECK: NumSGPRsForWavesPerEU: 12
+; CHECK: NumVGPRsForWavesPerEU: 32
+@var = addrspace(1) global float 0.0
+define amdgpu_kernel void @min_1024_max_2048() #3 {
+  %val0 = load volatile float, float addrspace(1)* @var
+  %val1 = load volatile float, float addrspace(1)* @var
+  %val2 = load volatile float, float addrspace(1)* @var
+  %val3 = load volatile float, float addrspace(1)* @var
+  %val4 = load volatile float, float addrspace(1)* @var
+  %val5 = load volatile float, float addrspace(1)* @var
+  %val6 = load volatile float, float addrspace(1)* @var
+  %val7 = load volatile float, float addrspace(1)* @var
+  %val8 = load volatile float, float addrspace(1)* @var
+  %val9 = load volatile float, float addrspace(1)* @var
+  %val10 = load volatile float, float addrspace(1)* @var
+  %val11 = load volatile float, float addrspace(1)* @var
+  %val12 = load volatile float, float addrspace(1)* @var
+  %val13 = load volatile float, float addrspace(1)* @var
+  %val14 = load volatile float, float addrspace(1)* @var
+  %val15 = load volatile float, float addrspace(1)* @var
+  %val16 = load volatile float, float addrspace(1)* @var
+  %val17 = load volatile float, float addrspace(1)* @var
+  %val18 = load volatile float, float addrspace(1)* @var
+  %val19 = load volatile float, float addrspace(1)* @var
+  %val20 = load volatile float, float addrspace(1)* @var
+  %val21 = load volatile float, float addrspace(1)* @var
+  %val22 = load volatile float, float addrspace(1)* @var
+  %val23 = load volatile float, float addrspace(1)* @var
+  %val24 = load volatile float, float addrspace(1)* @var
+  %val25 = load volatile float, float addrspace(1)* @var
+  %val26 = load volatile float, float addrspace(1)* @var
+  %val27 = load volatile float, float addrspace(1)* @var
+  %val28 = load volatile float, float addrspace(1)* @var
+  %val29 = load volatile float, float addrspace(1)* @var
+  %val30 = load volatile float, float addrspace(1)* @var
+  %val31 = load volatile float, float addrspace(1)* @var
+  %val32 = load volatile float, float addrspace(1)* @var
+  %val33 = load volatile float, float addrspace(1)* @var
+  %val34 = load volatile float, float addrspace(1)* @var
+  %val35 = load volatile float, float addrspace(1)* @var
+  %val36 = load volatile float, float addrspace(1)* @var
+  %val37 = load volatile float, float addrspace(1)* @var
+  %val38 = load volatile float, float addrspace(1)* @var
+  %val39 = load volatile float, float addrspace(1)* @var
+  %val40 = load volatile float, float addrspace(1)* @var
+
+  store volatile float %val0, float addrspace(1)* @var
+  store volatile float %val1, float addrspace(1)* @var
+  store volatile float %val2, float addrspace(1)* @var
+  store volatile float %val3, float addrspace(1)* @var
+  store volatile float %val4, float addrspace(1)* @var
+  store volatile float %val5, float addrspace(1)* @var
+  store volatile float %val6, float addrspace(1)* @var
+  store volatile float %val7, float addrspace(1)* @var
+  store volatile float %val8, float addrspace(1)* @var
+  store volatile float %val9, float addrspace(1)* @var
+  store volatile float %val10, float addrspace(1)* @var
+  store volatile float %val11, float addrspace(1)* @var
+  store volatile float %val12, float addrspace(1)* @var
+  store volatile float %val13, float addrspace(1)* @var
+  store volatile float %val14, float addrspace(1)* @var
+  store volatile float %val15, float addrspace(1)* @var
+  store volatile float %val16, float addrspace(1)* @var
+  store volatile float %val17, float addrspace(1)* @var
+  store volatile float %val18, float addrspace(1)* @var
+  store volatile float %val19, float addrspace(1)* @var
+  store volatile float %val20, float addrspace(1)* @var
+  store volatile float %val21, float addrspace(1)* @var
+  store volatile float %val22, float addrspace(1)* @var
+  store volatile float %val23, float addrspace(1)* @var
+  store volatile float %val24, float addrspace(1)* @var
+  store volatile float %val25, float addrspace(1)* @var
+  store volatile float %val26, float addrspace(1)* @var
+  store volatile float %val27, float addrspace(1)* @var
+  store volatile float %val28, float addrspace(1)* @var
+  store volatile float %val29, float addrspace(1)* @var
+  store volatile float %val30, float addrspace(1)* @var
+  store volatile float %val31, float addrspace(1)* @var
+  store volatile float %val32, float addrspace(1)* @var
+  store volatile float %val33, float addrspace(1)* @var
+  store volatile float %val34, float addrspace(1)* @var
+  store volatile float %val35, float addrspace(1)* @var
+  store volatile float %val36, float addrspace(1)* @var
+  store volatile float %val37, float addrspace(1)* @var
+  store volatile float %val38, float addrspace(1)* @var
+  store volatile float %val39, float addrspace(1)* @var
+  store volatile float %val40, float addrspace(1)* @var
+
+  ret void
+}
+attributes #3 = {"amdgpu-flat-work-group-size"="1024,2048"}
+
+; CHECK: amdhsa.kernels:
+; CHECK:   .max_flat_workgroup_size: 64
+; CHECK:   .name:                 min_64_max_64
+; CHECK:   .max_flat_workgroup_size: 128
+; CHECK:   .name:                 min_64_max_128
+; CHECK:   .max_flat_workgroup_size: 128
+; CHECK:   .name:                 min_128_max_128
+; CHECK:   .max_flat_workgroup_size: 2048
+; CHECK:   .name:                 min_1024_max_2048
+; CHECK: amdhsa.version:
+; CHECK:   - 1
+; CHECK:   - 0
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 7fe5604c3ec724cbb4d9841f9398490cf072ecbc..2f281cab48cfc36f87ff265cac28a3f1dad5d7ab 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=HSAMD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=HSAMD %s
 
 ; CHECK-LABEL: {{^}}min_64_max_64:
 ; CHECK: SGPRBlocks: 0
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index 72c983d5d9788c881ce541254ba7e4e0a5fe29d4..45ed056567c2e2463d329694a9a8418e82ab089b 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -444,7 +444,7 @@ endif:
 ; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]]
 ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 
-; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop
+; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop{{$}}
 ; GCN: ;;#ASMSTART
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
diff --git a/test/CodeGen/AMDGPU/byval-frame-setup.ll b/test/CodeGen/AMDGPU/byval-frame-setup.ll
index bf2f5d38babc73145f576160fd5a2ae1bb2d9c56..3a48ffdda9ed22deb0b0adc6cc268903ae866e3b 100644
--- a/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -48,8 +48,8 @@ entry:
 
 ; GCN: v_readlane_b32
 ; GCN-NOT: v_readlane_b32 s32
-; GCN: buffer_load_dword v32,
-; GCN: buffer_load_dword v33,
+; GCN-DAG: buffer_load_dword v32,
+; GCN-DAG: buffer_load_dword v33,
 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
 ; GCN: s_setpc_b64
 define void  @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll
index 84d327b6f37980e9919da43330d60be12dd0a2f8..67614adc9152d6426895dca0eb6be40062182629 100644
--- a/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -61,11 +61,11 @@ declare void @external_void_func_v16i8(<16 x i8>) #0
 
 ; MESA-DAG: s_mov_b64 s[0:1], s[36:37]
 
-; GCN: v_mov_b32_e32 v0, 1{{$}}
+; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
+; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
+; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
 ; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
-; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
@@ -123,12 +123,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
-; GCN: v_mov_b32_e32 v0, 0x7b
-; HSA-DAG: s_mov_b32 s4, s33{{$}}
-; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
+; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
+; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
+; GCN-DAG: v_mov_b32_e32 v0, 0x7b
 
+; HSA-DAG: s_mov_b32 s4, s33{{$}}
 ; GCN-DAG: s_mov_b32 s32, s33{{$}}
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -144,11 +144,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: buffer_load_sbyte v0
-; GCN: s_mov_b32 s4, s33
-; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
+; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
+; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
 
+; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s3
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -165,10 +165,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; HSA-DAG: s_mov_b32 s33, s9{{$}}
 
 ; GCN-DAG: buffer_load_ubyte v0
-; GCN: s_mov_b32 s4, s33
-; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
+; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
+; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
 
 ; GCN-DAG: s_mov_b32 s32, s33
 
@@ -197,10 +196,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: buffer_load_sshort v0
-; GCN: s_mov_b32 s4, s33
-; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
+; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
+; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
 
 ; GCN-DAG: s_mov_b32 s32, s33
 
@@ -217,11 +215,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 
-; GCN-DAG: buffer_load_ushort v0
-; GCN: s_mov_b32 s4, s33
-; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
+; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
+; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
 
 ; GCN-DAG: s_mov_b32 s32, s33
 
@@ -237,11 +233,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
-; GCN: v_mov_b32_e32 v0, 42
-; GCN: s_mov_b32 s4, s33
-; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
+; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
+; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
+; GCN-DAG: v_mov_b32_e32 v0, 42
+; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -688,7 +684,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GCN-NOT: s_add_u32 [[SP]]
 ; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
 ; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
-; GCN-NEXT: s_swappc_b64
+; GCN: s_swappc_b64
 ; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
 ; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
 ; GCN-NOT: s_sub_u32 [[SP]]
diff --git a/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 21c69d9bee7d0eeb2b69ba366379cd1015d8bca5..c4c30a6675518d1eadcde687a562238ef13d931e 100644
--- a/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
 
 ; Make sure to run a GPU with the SGPR allocation bug.
 
diff --git a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index 79abb96cccf16221d0011a2512c309f0d5c853ba..5060c0fed1a9bcd7a0a6aa5b24d704de92300b47 100644
--- a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 ; GCN-LABEL: {{^}}use_dispatch_ptr:
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
@@ -383,13 +383,12 @@ define void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 0
 ; GCN: enable_sgpr_workgroup_id_z = 0
 
+; GCN-NOT: s6
 ; GCN-DAG: s_mov_b32 s33, s7
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-
-; GCN-NOT: s6
-; GCN: s_mov_b32 s4, s33
-; GCN-NOT: s6
+; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
+; GCN-NOT: s6
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
   call void @other_arg_use_workgroup_id_x(i32 555)
@@ -578,16 +577,16 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
 
 ; GCN: s_swappc_b64
 
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_X]]
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_X]]
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Y]]
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Y]]
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Z]]
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Z]]
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
+; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]]
+; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]]
+; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[LO2:[0-9]+]], s[[LO_Y]]
+; GCN-DAG: v_mov_b32_e32 v[[HI2:[0-9]+]], s[[HI_Y]]
+; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO2]]:[[HI2]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[LO3:[0-9]+]], s[[LO_Z]]
+; GCN-DAG: v_mov_b32_e32 v[[HI3:[0-9]+]], s[[HI_Z]]
+; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO3]]:[[HI3]]{{\]}}
 ; GCN: ; use
 ; GCN: ; use [[SAVE_X]]
 ; GCN: ; use [[SAVE_Y]]
diff --git a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 7f14a24d6da65eb931c10240d236f1a2c6d43325..750a0203c9bfc4b5a424c317eb84bc140e57b1c7 100644
--- a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}use_workitem_id_x:
 ; GCN: s_waitcnt
diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 04ad3bcccd3f33ebee62871b006af8ed9b7a8479..f6c30f8af1c37244dacda586d2ce8512b1463d3d 100644
--- a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -125,11 +125,11 @@ ret:
 ; GCN: s_cbranch_scc1
 
 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
-; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0xff
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0xff
 
 ; GCN: BB2_2:
 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
-; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0x7f
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0x7f
 
 ; GCN: BB2_3:
 ; GCN: buffer_store_short
diff --git a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 789decb45f27b6727d0699ad225a09e35d707222..faa468c974c0829afaf8aa06dbb65d3928b9d55a 100644
--- a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -32,7 +32,6 @@ bb2:
 
 ; GCN-LABEL: {{^}}preserve_condition_undef_flag:
 ; GCN-NOT: vcc
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
 define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
@@ -40,7 +39,7 @@ bb0:
   %tmp4 = select i1 %undef, float %arg, float 1.000000e+00
   %tmp5 = fcmp ogt float %arg2, 0.000000e+00
   %tmp6 = fcmp olt float %arg2, 1.000000e+00
-  %tmp7 = fcmp olt float %arg, %tmp4
+  %tmp7 = fcmp olt float %arg, undef
   %tmp8 = and i1 %tmp5, %tmp6
   %tmp9 = and i1 %tmp8, %tmp7
   br i1 %tmp9, label %bb1, label %bb2
diff --git a/test/CodeGen/AMDGPU/code-object-v3.ll b/test/CodeGen/AMDGPU/code-object-v3.ll
index 3f13cebc1583889b62c3019ec32ea29b4e1adbb4..d84325539e81918b393b8d5a0b000c3ddf654b91 100644
--- a/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -3,6 +3,8 @@
 
 ; ALL-ASM-LABEL: {{^}}fadd:
 
+; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_version
+; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_isa
 ; OSABI-AMDHSA-ASM-NOT: .amdgpu_hsa_kernel
 ; OSABI-AMDHSA-ASM-NOT: .amd_kernel_code_t
 
@@ -57,7 +59,8 @@
 ; OSABI-AMDHSA-ELF: {{[0-9]+}}: 0000000000000000 64         OBJECT GLOBAL DEFAULT {{[0-9]+}} fadd.kd
 ; OSABI-AMDHSA-ELF: {{[0-9]+}}: 0000000000000040 64         OBJECT GLOBAL DEFAULT {{[0-9]+}} fsub.kd
 
-; OSABI-AMDHSA-ELF-NOT: Displaying notes found
+; OSABI-AMDHSA-ELF: Displaying notes found at file offset
+; OSABI-AMDHSA-ELF: AMDGPU 0x{{[0-9a-f]+}} NT_AMDGPU_METADATA (AMDGPU Metadata)
 
 define amdgpu_kernel void @fadd(
     float addrspace(1)* %r,
diff --git a/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll b/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll
new file mode 100644
index 0000000000000000000000000000000000000000..06049b77b2136735264895b53bd79adcbe14124e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: s_waitcnt
+define <2 x i16> @main(<2 x float>) #0 {
+  %2 = bitcast <2 x float> %0 to <4 x i16>
+  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <2 x i32> <i32 0, i32 undef>
+  %4 = extractelement <4 x i16> %2, i32 0
+  %5 = insertelement <2 x i16> %3, i16 %4, i32 0
+  ret <2 x i16> %5
+}
+
diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 09d4b2c8bd7744969280786861f94228761761e2..8611cd080e15d05752ea79ad7a0eb5ffd109a200 100644
--- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -107,7 +107,7 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
 ; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
 ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
 ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
-; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]]
+; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
 define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 0b28d8912a3d08b6581aa3155db431b5a7d330cd..d8126fa635af8367d45aa9d06ec7b25906e40094 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -36,10 +36,10 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias
 ; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
 ; GCN-NOT: v_cvt_f32_ubyte3_e32
-; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
index 5bcdaa0fc0abb6b2489b409a528e4fc9aedcdcb7..9e1c58bcef7e280c5cca26d89280073f72246535 100644
--- a/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
+++ b/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
@@ -1,19 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -O0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}eq_t:
 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
-; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
-; GCN:     v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
+; GCN:     v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], 1.0{{$}}
 ; GCN-NOT: 0xddd5
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp_eq_u32
-; GCN-NOT: v_cndmask_b32
-; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
-; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
-; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
-; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC]]
+; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], 2.0, 4.0, [[CC]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
 define amdgpu_kernel void @eq_t(float %x) {
   %c1 = fcmp olt float %x, 1.0
@@ -26,18 +19,11 @@ define amdgpu_kernel void @eq_t(float %x) {
 
 ; GCN-LABEL: {{^}}ne_t:
 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
-; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
-; GCN:     v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
+; GCN:     v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], 1.0{{$}}
 ; GCN-NOT: 0xddd5
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp_eq_u32
-; GCN-NOT: v_cndmask_b32
-; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
-; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
-; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
-; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VFOUR]], [[VTWO]], [[CC]]
+; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], 4.0, 2.0, [[CC]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
 define amdgpu_kernel void @ne_t(float %x) {
   %c1 = fcmp olt float %x, 1.0
@@ -50,18 +36,11 @@ define amdgpu_kernel void @ne_t(float %x) {
 
 ; GCN-LABEL: {{^}}eq_f:
 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
-; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
-; GCN:     v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
+; GCN:     v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], 1.0{{$}}
 ; GCN-NOT: 0xddd5
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp_eq_u32
-; GCN-NOT: v_cndmask_b32
-; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
-; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
-; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
-; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VFOUR]], [[VTWO]], [[CC]]
+; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], 4.0, 2.0, [[CC]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
 define amdgpu_kernel void @eq_f(float %x) {
   %c1 = fcmp olt float %x, 1.0
@@ -74,18 +53,11 @@ define amdgpu_kernel void @eq_f(float %x) {
 
 ; GCN-LABEL: {{^}}ne_f:
 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
-; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
-; GCN:     v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
+; GCN:     v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], 1.0{{$}}
 ; GCN-NOT: 0xddd5
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp_eq_u32
-; GCN-NOT: v_cndmask_b32
-; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
-; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
-; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
-; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC]]
+; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], 2.0, 4.0, [[CC]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
 define amdgpu_kernel void @ne_f(float %x) {
   %c1 = fcmp olt float %x, 1.0
@@ -97,18 +69,8 @@ define amdgpu_kernel void @ne_f(float %x) {
 }
 
 ; GCN-LABEL: {{^}}different_constants:
-; GCN-DAG: s_load_dword [[X:s[0-9]+]]
-; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
-; GCN-DAG: v_cmp_lt_f32_e{{32|64}} [[CC1:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[CND1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
-; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC2:s\[[0-9]+:[0-9]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
-; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
-; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
-; GCN:     v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC2]]
-; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
+; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 2.0
+; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
 define amdgpu_kernel void @different_constants(float %x) {
   %c1 = fcmp olt float %x, 1.0
   %s1 = select i1 %c1, i32 56789, i32 1
diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
index 46d81e5706592cc301a419ec688db11fb225f292..b416537b9f8d818d680a8a430b6529ca547edee6 100644
--- a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
+++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR
 target datalayout = "A5"
 
 ; CHECK: debug_wavefront_private_segment_offset_sgpr = [[SOFF:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll
index 6867afee1706e3ddeea466e2339d6c764ce0cf9e..94d518e63ce242479c709b76b894b49cf1e950f0 100644
--- a/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -86,12 +86,10 @@ define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0,
 ; GCN: IeeeMode: 0
 define amdgpu_gs void @kill_gs_const() {
 main_body:
-  %0 = icmp ule i32 0, 3
-  %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %1)
-  %2 = icmp ule i32 3, 0
-  %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %3)
+  %cmp0 = icmp ule i32 0, 3
+  call void @llvm.amdgcn.kill(i1 %cmp0)
+  %cmp1 = icmp ule i32 3, 0
+  call void @llvm.amdgcn.kill(i1 %cmp1)
   ret void
 }
 
@@ -100,12 +98,12 @@ main_body:
 define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(4)* byval, [17 x <16 x i8>] addrspace(4)* byval, [17 x <4 x i32>] addrspace(4)* byval, [34 x <8 x i32>] addrspace(4)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
 entry:
   %tmp0 = fcmp olt float %13, 0.0
-  call void @llvm.AMDGPU.kill(float %14)
+  call void @llvm.amdgcn.kill(i1 %tmp0)
   %tmp1 = select i1 %tmp0, float 1.0, float 0.0
   ret float %tmp1
 }
 
-declare void @llvm.AMDGPU.kill(float)
+declare void @llvm.amdgcn.kill(i1)
 
 attributes #0 = { nounwind "target-cpu"="tahiti" }
 attributes #1 = { nounwind "target-cpu"="fiji" }
diff --git a/test/CodeGen/AMDGPU/dpp_combine.ll b/test/CodeGen/AMDGPU/dpp_combine.ll
new file mode 100644
index 0000000000000000000000000000000000000000..36356a72a772705fed2eb772e75141b282010dcc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -0,0 +1,185 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine -verify-machineinstrs < %s | FileCheck %s
+
+; VOP2 with literal cannot be combined
+; CHECK-LABEL: {{^}}dpp_combine_i32_literal:
+; CHECK: v_mov_b32_dpp [[OLD:v[0-9]+]], {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x1 bound_ctrl:0
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, vcc, 42, [[OLD]]
+define amdgpu_kernel void @dpp_combine_i32_literal(i32 addrspace(1)* %out, i32 %in) {
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 2, i32 1, i1 1) #0
+  %res = add nsw i32 %dpp, 42
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_i32_bz:
+; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define amdgpu_kernel void @dpp_combine_i32_bz(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  %res = add nsw i32 %dpp, %x
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_i32_boff_undef:
+; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+define amdgpu_kernel void @dpp_combine_i32_boff_undef(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
+  %res = add nsw i32 %dpp, %x
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_i32_boff_0:
+; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define amdgpu_kernel void @dpp_combine_i32_boff_0(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
+  %res = add nsw i32 %dpp, %x
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_i32_boff_max:
+; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], -2
+; CHECK: v_max_i32_dpp [[OLD]], {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+define amdgpu_kernel void @dpp_combine_i32_boff_max(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 2147483647, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
+  %cmp = icmp sge i32 %dpp, %x
+  %res = select i1 %cmp, i32 %dpp, i32 %x
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_i32_boff_min:
+; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], 1
+; CHECK: v_min_i32_dpp [[OLD]], {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+define amdgpu_kernel void @dpp_combine_i32_boff_min(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
+  %cmp = icmp sle i32 %dpp, %x
+  %res = select i1 %cmp, i32 %dpp, i32 %x
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_i32_boff_mul:
+; CHECK: v_mul_i32_i24_dpp v0, v3, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+define amdgpu_kernel void @dpp_combine_i32_boff_mul(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 1, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
+
+  %dpp.shl = shl i32 %dpp, 8
+  %dpp.24 = ashr i32 %dpp.shl, 8
+  %x.shl = shl i32 %x, 8
+  %x.24 = ashr i32 %x.shl, 8
+  %res = mul i32 %dpp.24, %x.24
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_i32_commute:
+; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define amdgpu_kernel void @dpp_combine_i32_commute(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 2, i32 1, i32 1, i1 1) #0
+  %res = sub nsw i32 %x, %dpp
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_f32:
+; CHECK: v_add_f32_dpp {{v[0-9]+}}, {{v[0-9]+}}, v0  quad_perm:[3,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define amdgpu_kernel void @dpp_combine_f32(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 3, i32 1, i32 1, i1 1) #0
+  %dpp.f32 = bitcast i32 %dpp to float
+  %x.f32 = bitcast i32 %x to float
+  %res.f32 = fadd float %x.f32, %dpp.f32
+  %res = bitcast float %res.f32 to i32
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_test_f32_mods:
+; CHECK: v_mul_f32_dpp {{v[0-9]+}}, |{{v[0-9]+}}|, -v0  quad_perm:[0,1,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define amdgpu_kernel void @dpp_combine_test_f32_mods(i32 addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 4, i32 1, i32 1, i1 1) #0
+
+  %x.f32 = bitcast i32 %x to float
+  %x.f32.neg = fsub float -0.000000e+00, %x.f32
+
+  %dpp.f32 = bitcast i32 %dpp to float
+  %dpp.f32.cmp = fcmp fast olt float %dpp.f32, 0.000000e+00
+  %dpp.f32.sign = select i1 %dpp.f32.cmp, float -1.000000e+00, float 1.000000e+00
+  %dpp.f32.abs = fmul fast float %dpp.f32, %dpp.f32.sign
+
+  %res.f32 = fmul float %x.f32.neg, %dpp.f32.abs
+  %res = bitcast float %res.f32 to i32
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_mac:
+; CHECK: v_mac_f32_dpp v0, {{v[0-9]+}}, v1  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define amdgpu_kernel void @dpp_combine_mac(float addrspace(1)* %out, i32 %in) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  %dpp.f32 = bitcast i32 %dpp to float
+  %x.f32 = bitcast i32 %x to float
+  %y.f32 = bitcast i32 %y to float
+
+  %mult = fmul float %dpp.f32, %y.f32
+  %res = fadd float %mult, %x.f32
+  store float %res, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_sequence:
+define amdgpu_kernel void @dpp_combine_sequence(i32 addrspace(1)* %out, i32 %in, i1 %cmp) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+  %resadd = add nsw i32 %dpp, %x
+  br label %bb3
+bb2:
+; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+  %ressub = sub nsw i32 %x, %dpp
+  br label %bb3
+bb3:
+  %res = phi i32 [%resadd, %bb1], [%ressub, %bb2]
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}dpp_combine_sequence_negative:
+; CHECK: v_mov_b32_dpp v1, v1  quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define amdgpu_kernel void @dpp_combine_sequence_negative(i32 addrspace(1)* %out, i32 %in, i1 %cmp) {
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+  %resadd = add nsw i32 %dpp, %x
+  br label %bb3
+bb2:
+  %ressub = sub nsw i32 2, %dpp ; break seq
+  br label %bb3
+bb3:
+  %res = phi i32 [%resadd, %bb1], [%ressub, %bb2]
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/dpp_combine_subregs.mir b/test/CodeGen/AMDGPU/dpp_combine_subregs.mir
new file mode 100644
index 0000000000000000000000000000000000000000..83f992f492a7bb72b6e1a7a8e0fe69680ee1bb26
--- /dev/null
+++ b/test/CodeGen/AMDGPU/dpp_combine_subregs.mir
@@ -0,0 +1,143 @@
+# RUN: llc -march=amdgcn -mcpu=tonga  -run-pass=gcn-dpp-combine  -o - %s | FileCheck %s
+
+# test if $old definition is correctly tracked through subreg manipulation pseudos
+
+---
+# CHECK-LABEL: name: mul_old_subreg
+# CHECK: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
+
+name:            mul_old_subreg
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: vreg_64 }
+  - { id: 5, class: vreg_64 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$vgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vreg_64 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
+    %4 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+    %5 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4
+    %6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec
+    %7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec
+...
+
+# CHECK-LABEL: name: add_old_subreg
+# CHECK: [[OLD:\%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+# CHECK: %5:vgpr_32 = V_ADD_U32_dpp [[OLD]], %1, %0.sub1, 1, 1, 1, 1, implicit $exec
+
+name:            add_old_subreg
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vreg_64 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: vgpr_32 }
+
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$vgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vreg_64 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted
+    %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
+    %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
+...
+
+# CHECK-LABEL: name: add_old_subreg_undef
+# CHECK: %5:vgpr_32 = V_ADD_U32_dpp %3.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
+
+name:            add_old_subreg_undef
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vreg_64 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: vgpr_32 }
+
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$vgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vreg_64 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef
+    %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
+    %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
+...
+
+# CHECK-LABEL: name: add_f32_e64
+# CHECK: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
+# CHECK: %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
+# CHECK: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 1, 1, 1, implicit $exec
+# CHECK: %7:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 1, 1, 1, implicit $exec
+# CHECK: %9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec
+
+name:            add_f32_e64
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: vgpr_32 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: vgpr_32 }
+  - { id: 9, class: vgpr_32 }
+
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$vgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
+
+    ; this shouldn't be combined as omod is set
+    %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
+
+    %5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
+
+    ; this should be combined as all modifiers are default
+    %6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $exec
+
+    ; this should be combined as modifiers other than abs|neg are default
+    %7:vgpr_32 = V_ADD_F32_e64 1, %5, 2, %0, 0, 0, implicit $exec
+
+    %8:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
+
+    ; this shouldn't be combined as modifiers aren't abs|neg
+    %9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec
+...
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
index 7cb070c12b65df4f44069de79814992893004971..03436f6b3a3c2ce048451362950b3f385076313f 100644
--- a/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
 
 @lds = addrspace(3) global [512 x float] undef, align 4
 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
@@ -31,8 +31,8 @@ define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, flo
 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
@@ -177,8 +177,8 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)*
 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
@@ -362,8 +362,8 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace
 ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 
-; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
 
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll
index 54f2500afab4f7a366b4f042dd7b1f6e83d30a92..2a405352f25352f86c19acc8b9d313f4446d22ee 100644
--- a/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 @lds = addrspace(3) global [512 x float] undef, align 4
 
@@ -30,8 +30,8 @@ define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)*
 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
 
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
@@ -59,8 +59,8 @@ define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)*
 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
 ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
@@ -87,8 +87,8 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrsp
 ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 
-; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
 ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
index 626a6e2c5b82854815e57ab6bf6e31300616c660..b8bb10632acb61d6e81db8b39b275ffaa06a7d03 100644
--- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -60,8 +60,7 @@ endif:
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 
-; GCN-DAG: buffer_store_dword v
-; GCN-DAG: buffer_store_dwordx2
+; GCN-DAG: buffer_store_dwordx3
 define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/elf-notes.ll b/test/CodeGen/AMDGPU/elf-notes.ll
index b81292bfdb96d8e15edf7c1c6602722d217d5b12..43e569de65cfc110307157bc23157f1f0e56d41e 100644
--- a/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/test/CodeGen/AMDGPU/elf-notes.ll
@@ -1,12 +1,12 @@
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ELF --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ELF --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ELF --check-prefix=GFX802 %s
 ; RUN: llc -march=r600 < %s | FileCheck --check-prefix=R600 %s
 
 ; OSABI-UNK-NOT: .hsa_code_object_version
diff --git a/test/CodeGen/AMDGPU/elf.metadata.ll b/test/CodeGen/AMDGPU/elf.metadata.ll
new file mode 100644
index 0000000000000000000000000000000000000000..097310a167392b43a3b2dbf395d9d37285091bd5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/elf.metadata.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=amdgcn -mcpu=fiji -filetype=obj | llvm-readobj -symbols -s -sd - | FileCheck %s
+
+; CHECK: Section {
+; CHECK: Name: .AMDGPU.metadata.info_1
+; CHECK: Type: SHT_PROGBITS (0x1)
+; CHECK: Flags [ (0x0)
+; CHECK: Size: 16
+; CHECK: SectionData (
+; CHECK: 0000: 414D4431 414D4431 414D4431 414D4431  |AMD1AMD1AMD1AMD1|
+; CHECK: )
+; CHECK: }
+
+; CHECK: Section {
+; CHECK: Name: .AMDGPU.metadata.info_2
+; CHECK: Type: SHT_PROGBITS (0x1)
+; CHECK: Flags [ (0x0)
+; CHECK: Size: 16
+; CHECK: SectionData (
+; CHECK: 0000: 414D4432 414D4432 414D4432 414D4432  |AMD2AMD2AMD2AMD2|
+; CHECK: )
+; CHECK: }
+
+; CHECK: Section {
+; CHECK: Name: .AMDGPU.metadata.info_3
+; CHECK: Type: SHT_PROGBITS (0x1)
+; CHECK: Flags [ (0x0)
+; CHECK: Size: 16
+; CHECK: SectionData (
+; CHECK: 0000: 414D4433 414D4433 414D4433 414D4433  |AMD3AMD3AMD3AMD3|
+; CHECK: )
+; CHECK: }
+
+; CHECK: Symbol {
+; CHECK: Name: metadata_info_var_1
+; CHECK: Size: 16
+; CHECK: Binding: Local
+; CHECK: Section: .AMDGPU.metadata.info_1
+; CHECK: }
+
+; CHECK: Symbol {
+; CHECK: Name: metadata_info_var_2
+; CHECK: Size: 16
+; CHECK: Binding: Global
+; CHECK: Section: .AMDGPU.metadata.info_2
+; CHECK: }
+
+; CHECK: Symbol {
+; CHECK: Name: metadata_info_var_3
+; CHECK: Size: 16
+; CHECK: Binding: Global
+; CHECK: Section: .AMDGPU.metadata.info_3
+; CHECK: }
+
+@metadata_info_var_1 = internal global [4 x i32][i32 826559809, i32 826559809, i32 826559809, i32 826559809], align 1, section ".AMDGPU.metadata.info_1"
+@metadata_info_var_2 = constant [4 x i32][i32 843337025, i32 843337025, i32 843337025, i32 843337025], align 1, section ".AMDGPU.metadata.info_2"
+@metadata_info_var_3 = global [4 x i32][i32 860114241, i32 860114241, i32 860114241, i32 860114241], align 1, section ".AMDGPU.metadata.info_3"
diff --git a/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a2b68cc4214a9a24a3d19bfafa9e38aa77541a34
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -0,0 +1,321 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}float4_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]]
+; GCN: store_dword v[{{[0-9:]+}}], [[V3]]
+define amdgpu_kernel void @float4_extelt(float addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}int4_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4, [[V2]], [[C3]]
+; GCN: store_dword v[{{[0-9:]+}}], [[V3]]
+define amdgpu_kernel void @int4_extelt(i32 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double4_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
+define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half4_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
+; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 4
+; GCN:     s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]]
+; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
+; GCN:     store_short v[{{[0-9:]+}}], v[[VRL]]
+define amdgpu_kernel void @half4_extelt(half addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
+  store half %ext, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float2_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
+; GCN: store_dword v[{{[0-9:]+}}], [[V1]]
+define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double2_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
+define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half8_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
+; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
+define amdgpu_kernel void @half8_extelt(half addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
+  store half %ext, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}short8_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
+; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
+define amdgpu_kernel void @short8_extelt(i16 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i32 %sel
+  store i16 %ext, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float8_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
+; GCN:     store_dword v[{{[0-9:]+}}], [[V7]]
+define amdgpu_kernel void @float8_extelt(float addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float16_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: s_mov_b32 m0,
+; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
+; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
+; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
+define amdgpu_kernel void @float16_extelt(float addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double16_extelt:
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: store_dword
+define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}byte8_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x8070605
+; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 3
+; GCN:     s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]]
+; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
+; GCN:     store_byte v[{{[0-9:]+}}], v[[VRL]]
+define amdgpu_kernel void @byte8_extelt(i8 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i32 %sel
+  store i8 %ext, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}byte16_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: v_cmp_ne_u32_e64 [[C8:[^,]+]], [[IDX]], 8
+; GCN-DAG: v_cmp_ne_u32_e64 [[C9:[^,]+]], [[IDX]], 9
+; GCN-DAG: v_cmp_ne_u32_e64 [[C10:[^,]+]], [[IDX]], 10
+; GCN-DAG: v_cmp_ne_u32_e64 [[C11:[^,]+]], [[IDX]], 11
+; GCN-DAG: v_cmp_ne_u32_e64 [[C12:[^,]+]], [[IDX]], 12
+; GCN-DAG: v_cmp_ne_u32_e64 [[C13:[^,]+]], [[IDX]], 13
+; GCN-DAG: v_cmp_ne_u32_e64 [[C14:[^,]+]], [[IDX]], 14
+; GCN-DAG: v_cmp_ne_u32_e64 [[C15:[^,]+]], [[IDX]], 15
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V8:v[0-9]+]], {{[^,]+}}, [[V7]], [[C8]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V9:v[0-9]+]], {{[^,]+}}, [[V8]], [[C8]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V10:v[0-9]+]], {{[^,]+}}, [[V9]], [[C10]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V11:v[0-9]+]], {{[^,]+}}, [[V10]], [[C11]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V12:v[0-9]+]], {{[^,]+}}, [[V11]], [[C12]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V13:v[0-9]+]], {{[^,]+}}, [[V12]], [[C13]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V14:v[0-9]+]], {{[^,]+}}, [[V13]], [[C14]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V15:v[0-9]+]], {{[^,]+}}, [[V14]], [[C15]]
+; GCN:     store_byte v[{{[0-9:]+}}], [[V15]]
+define amdgpu_kernel void @byte16_extelt(i8 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, i32 %sel
+  store i8 %ext, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}bit4_extelt:
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN-DAG: buffer_store_byte [[ZERO]],
+; GCN-DAG: buffer_store_byte [[ONE]],
+; GCN-DAG: buffer_store_byte [[ZERO]],
+; GCN-DAG: buffer_store_byte [[ONE]],
+; GCN:     buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[LOAD]]
+; GCN:     flat_store_dword v[{{[0-9:]+}}], [[RES]]
+define amdgpu_kernel void @bit4_extelt(i32 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <4 x i1> <i1 0, i1 1, i1 0, i1 1>, i32 %sel
+  %zext = zext i1 %ext to i32
+  store i32 %zext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}bit128_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 1, 0,
+; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
+; GCN-DAG: v_cmp_ne_u32_e32 [[CL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, v{{[0-9]+}}, [[CL]]
+; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]]
+; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
+define amdgpu_kernel void @bit128_extelt(i32 addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <128 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, i32 %sel
+  %zext = zext i1 %ext to i32
+  store i32 %zext, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index 203c813f4fb452e5f5ab1556444dd62999f7959e..1dc85cab8fc8a622378699af78d73b927882bb7e 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2:
 ; GCN: buffer_load_dwordx4
@@ -13,6 +13,14 @@ define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out,
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
@@ -20,6 +28,17 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %ou
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index 96c16d320153a21a9990b9e50b2e2db9c499b8ef..62144ba0437632b8fb6aca8ef32b08f6970aaae5 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -30,6 +30,11 @@ define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
@@ -37,6 +42,12 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out,
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
+; GCN:     buffer_load_dwordx4
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
   %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
   %or = or <2 x i64> %load, %arst
@@ -46,6 +57,14 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
@@ -53,6 +72,17 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out,
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
+; GCN-NOT: buffer_load
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll
index ba72969b82617e16c452934a0cdbaa38e83b7046..f96019dba6dcc99ea446106959b1f2206ae5ec40 100644
--- a/test/CodeGen/AMDGPU/fabs.ll
+++ b/test/CodeGen/AMDGPU/fabs.ll
@@ -11,7 +11,8 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
@@ -23,7 +24,8 @@ define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
@@ -34,7 +36,8 @@ define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
 ; FUNC-LABEL: {{^}}s_fabs_f32:
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   store float %fabs, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll
index ba26ed2383296e34057dda632980f0c73e45a13a..da852af3f2303dd87b4e3550e663ffcc5b8a71a8 100644
--- a/test/CodeGen/AMDGPU/fceil64.ll
+++ b/test/CodeGen/AMDGPU/fceil64.ll
@@ -17,8 +17,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ;        are not always followed.
 ; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01
 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]]
-; SI-DAG: s_not_b64
-; SI-DAG: s_and_b64
+; SI-DAG: s_andn2_b64
 ; SI-DAG: cmp_gt_i32
 ; SI-DAG: cndmask_b32
 ; SI-DAG: cndmask_b32
diff --git a/test/CodeGen/AMDGPU/fdot2.ll b/test/CodeGen/AMDGPU/fdot2.ll
index 35364d340601e86c576d4b3db3052d7a4aa9c698..da573abe86a52bfe83c2a6b4a08ddda5b50a9ff5 100644
--- a/test/CodeGen/AMDGPU/fdot2.ll
+++ b/test/CodeGen/AMDGPU/fdot2.ll
@@ -8,16 +8,16 @@
 ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
 ; are not converted from f16 to f32.
 ; GCN-LABEL: {{^}}dotproduct_f16
-; GFX900: v_fma_legacy_f16
-; GCN900: v_fma_legacy_f16
+; GFX900: v_fma_f16
+; GFX900: v_fma_f16
 
 ; GFX906: v_mul_f16_e32
 ; GFX906: v_mul_f16_e32
 
-; GFX906-UNSAFE:  v_fma_legacy_f16
+; GFX906-UNSAFE:  v_fma_f16
 
 ; GFX906-CONTRACT: v_mac_f16_e32
-; GFX906-DENORM-CONTRACT: v_fma_legacy_f16
+; GFX906-DENORM-CONTRACT: v_fma_f16
 define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
                                           <2 x half> addrspace(1)* %src2,
                                           half addrspace(1)* nocapture %dst) {
@@ -45,7 +45,7 @@ entry:
 ; and the vectors are of type <2 x half>
 ; GCN-LABEL: {{^}}dotproduct_f16_f32
 ; GFX900: v_mad_mix_f32
-; GCN900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
 
 ; GFX906: v_mad_f32
 ; GFX906: v_mac_f32_e32
@@ -85,7 +85,7 @@ entry:
 ; and the vectors are of type <2 x half>
 ; GCN-LABEL: {{^}}dotproduct_diffvecorder
 ; GFX900: v_mad_mix_f32
-; GCN900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
 
 ; GFX906: v_mad_f32
 ; GFX906: v_mac_f32_e32
@@ -159,7 +159,7 @@ entry:
 
 ; GCN-LABEL: {{^}}NotAdotproduct
 ; GFX900: v_mad_mix_f32
-; GCN900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
 
 ; GFX906: v_mad_f32
 ; GFX906: v_mac_f32_e32
@@ -196,7 +196,7 @@ entry:
 
 ; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
 ; GFX900: v_mad_mix_f32
-; GCN900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
 
 ; GFX906: v_mad_f32
 ; GFX906: v_mac_f32_e32
@@ -229,4 +229,4 @@ entry:
   %acc2 = fadd float %mul1, %acc1
   store float %acc2, float addrspace(1)* %dst, align 4
   ret void
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index b2ac534a7d655d82acc96c90e2de5b0d43a8649f..17f557b3a6cd59605f49b6672f848f6830dd7a9a 100644
--- a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=HSA-NOADDR64 -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index a7664c399fbb0c505ffa1d82e71a65b4a0fd7f32..38909d3e3e9c1a0840fe70b1a5753694ec6a66c9 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -7,9 +7,9 @@
 ; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=stoney  -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 ; HSA-CI: is_xnack_enabled = 0
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll
index c72dab08e38676b5c91c3548582bc5907d20de06..0ff5d9652c1047bada4f4b480d4884e52732e92c 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,5 +1,5 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
@@ -35,6 +35,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x
 ; R600: -PV
 
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; VI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
diff --git a/test/CodeGen/AMDGPU/fpext-free.ll b/test/CodeGen/AMDGPU/fpext-free.ll
index 0a504b3e03e4e833ec1a9bfd2b7b7e31d3b597ca..1bd4caa180a4b853e0d4f9f9f5c04d1266f9e7aa 100644
--- a/test/CodeGen/AMDGPU/fpext-free.ll
+++ b/test/CodeGen/AMDGPU/fpext-free.ll
@@ -171,7 +171,7 @@ entry:
 
 ; GCN-LABEL: {{^}}fadd_fpext_fmuladd_f16_to_f32:
 ; GFX9: v_mul_f16
-; GFX9: v_fma_legacy_f16
+; GFX9: v_fma_f16
 ; GFX9: v_cvt_f32_f16
 ; GFX9: v_add_f32_e32
 define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
@@ -185,7 +185,7 @@ entry:
 
 ; GCN-LABEL: {{^}}fadd_fpext_fma_f16_to_f32:
 ; GFX9: v_mul_f16
-; GFX9: v_fma_legacy_f16
+; GFX9: v_fma_f16
 ; GFX9: v_cvt_f32_f16
 ; GFX9: v_add_f32_e32
 define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
@@ -199,7 +199,7 @@ entry:
 
 ; GCN-LABEL: {{^}}fadd_fpext_fma_f16_to_f32_commute:
 ; GFX9: v_mul_f16
-; GFX9: v_fma_legacy_f16
+; GFX9: v_fma_f16
 ; GFX9: v_cvt_f32_f16
 ; GFX9: v_add_f32_e32
 define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
@@ -322,7 +322,7 @@ entry:
 
 ; GCN-LABEL: {{^}}fsub_fpext_muladd_mul_f16_to_f32:
 ; GFX9: v_mul_f16
-; GFX9: v_fma_legacy_f16
+; GFX9: v_fma_f16
 ; GFX9: v_cvt_f32_f16
 ; GFX9: v_sub_f32
 ; GCN: s_setpc_b64
@@ -363,7 +363,7 @@ entry:
 ; GCN-LABEL: {{^}}fsub_fpext_muladd_mul_f16_to_f32_commute:
 ; GCN: s_waitcnt
 ; GFX9-NEXT: v_mul_f16_e32 v3, v3, v4
-; GFX9-NEXT: v_fma_legacy_f16 v1, v1, v2, v3
+; GFX9-NEXT: v_fma_f16 v1, v1, v2, v3
 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64
diff --git a/test/CodeGen/AMDGPU/frame-index-elimination.ll b/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 44a69185f43c65e58d4d5891b35996159e826e99..114710d1e92a2d93d43040b3eb4a79f451b5007f 100644
--- a/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -167,13 +167,13 @@ ret:
 ; GCN-DAG: s_movk_i32 s6, 0x204
 
 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
-; CI: v_add_i32_e64 v0, s[6:7], s6, [[SCALED]]
+; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], s6, [[SCALED]]
 
 ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
-; GFX9: v_add_u32_e32 v0, s6, [[SCALED]]
+; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], s6, [[SCALED]]
 
-; GCN: v_mul_lo_i32 v0, v0, 9
-; GCN: ds_write_b32 v0, v0
+; GCN: v_mul_lo_i32 [[VZ]], [[VZ]], 9
+; GCN: ds_write_b32 v0, [[VZ]]
 define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
@@ -191,13 +191,13 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
 ; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204
 
 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
-; CI: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
+; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
 
 ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]]
-; GFX9: v_add_u32_e32 v0, [[OFFSET]], [[SCALED]]
+; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[OFFSET]], [[SCALED]]
 
-; GCN: v_mul_lo_i32 v0, v0, 9
-; GCN: ds_write_b32 v0, v0
+; GCN: v_mul_lo_i32 [[VZ]], [[VZ]], 9
+; GCN: ds_write_b32 v0, [[VZ]]
 define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 6fc4c8b7d243b737f768a6cadef22df56658f1cb..226125335c38dfc3fd4c5432f16a5e402c459e89 100644
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -27,8 +27,7 @@ define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrsp
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
-; SI-DAG: s_not_b64
-; SI-DAG: s_and_b64
+; SI-DAG: s_andn2_b64
 ; SI-DAG: cmp_gt_i32
 ; SI-DAG: cndmask_b32
 ; SI-DAG: cndmask_b32
diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll
index 7fb47e08ea58c51b6e2d36c7e59b8b4952033a96..b2fd9f6acf9d95939ec929d20c1f5822f52180ec 100644
--- a/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %arr
 ; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
 ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
 ; SI, which is why it is being OR'd with the base pointer.
-; SI: s_or_b32
+; SI: s_bitset1_b32
 ; CI: s_add_i32
 ; CHECK: ds_write_b32
 define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
diff --git a/test/CodeGen/AMDGPU/gfx902-without-xnack.ll b/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
index 445e112a3016a40711333714460076ed47af999e..8577382cff5a303f433423eae1d4ed0b206a886f 100644
--- a/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
+++ b/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-code-object-v3,-xnack < %s | FileCheck %s
 
 ; CHECK: .hsa_code_object_isa 9,0,2,"AMD","AMDGPU"
 define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
diff --git a/test/CodeGen/AMDGPU/global-load-store-atomics.mir b/test/CodeGen/AMDGPU/global-load-store-atomics.mir
new file mode 100644
index 0000000000000000000000000000000000000000..8ac7c3e14a172d337e706fd662208de6474f4ff8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-load-store-atomics.mir
@@ -0,0 +1,249 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-fixup-vector-isel -amdgpu-enable-global-sgpr-addr %s -o - | FileCheck -check-prefix=GCN %s
+
+# Coverage tests for GLOBAL_* to their _SADDR equivalent.
+
+# GCN-LABEL: name: global_load_store_atomics
+# GCN:      GLOBAL_LOAD_DWORD_SADDR
+# GCN:      GLOBAL_STORE_DWORD_SADDR
+# GCN:      GLOBAL_LOAD_DWORDX2_SADDR
+# GCN:      GLOBAL_STORE_DWORDX2_SADDR
+# GCN:      GLOBAL_LOAD_DWORDX3_SADDR
+# GCN:      GLOBAL_STORE_DWORDX3_SADDR
+# GCN:      GLOBAL_LOAD_DWORDX4_SADDR
+# GCN:      GLOBAL_STORE_DWORDX4_SADDR
+# GCN:      GLOBAL_LOAD_SSHORT_SADDR
+# GCN:      GLOBAL_STORE_SHORT_SADDR
+# GCN:      GLOBAL_LOAD_USHORT_SADDR
+# GCN:      GLOBAL_STORE_SHORT_SADDR
+# GCN:      GLOBAL_LOAD_UBYTE_SADDR
+# GCN:      GLOBAL_STORE_BYTE_SADDR
+# GCN:      GLOBAL_LOAD_SBYTE_SADDR
+# GCN:      GLOBAL_STORE_BYTE_SADDR
+# GCN:      GLOBAL_LOAD_SBYTE_D16_SADDR
+# GCN:      GLOBAL_STORE_BYTE_D16_HI_SADDR
+# GCN:      GLOBAL_LOAD_UBYTE_D16_SADDR
+# GCN:      GLOBAL_STORE_BYTE_D16_HI_SADDR
+# GCN:      GLOBAL_LOAD_SBYTE_D16_HI_SADDR
+# GCN:      GLOBAL_STORE_BYTE_D16_HI_SADDR
+# GCN:      GLOBAL_LOAD_UBYTE_D16_HI_SADDR
+# GCN:      GLOBAL_STORE_BYTE_D16_HI_SADDR
+# GCN:      GLOBAL_LOAD_SHORT_D16_HI_SADDR
+# GCN:      GLOBAL_STORE_SHORT_D16_HI_SADDR
+# GCN:      GLOBAL_LOAD_SHORT_D16_SADDR
+# GCN:      GLOBAL_STORE_SHORT_D16_HI_SADDR
+
+# GCN:      GLOBAL_ATOMIC_XOR_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_XOR_SADDR %
+# GCN:      GLOBAL_ATOMIC_SMIN_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_SMIN_SADDR %
+# GCN:      GLOBAL_ATOMIC_AND_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_AND_SADDR %
+# GCN:      GLOBAL_ATOMIC_SWAP_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_SWAP_SADDR %
+# GCN:      GLOBAL_ATOMIC_SMAX_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_SMAX_SADDR %
+# GCN:      GLOBAL_ATOMIC_UMIN_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_UMIN_SADDR %
+# GCN:      GLOBAL_ATOMIC_UMAX_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_UMAX_SADDR %
+# GCN:      GLOBAL_ATOMIC_OR_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_OR_SADDR %
+# GCN:      GLOBAL_ATOMIC_ADD_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_ADD_SADDR %
+# GCN:      GLOBAL_ATOMIC_SUB_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_SUB_SADDR %
+# GCN:      GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_CMPSWAP_SADDR %
+# GCN:      GLOBAL_ATOMIC_INC_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_INC_SADDR %
+# GCN:      GLOBAL_ATOMIC_DEC_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_DEC_SADDR %
+
+# GCN:      GLOBAL_ATOMIC_OR_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_OR_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_XOR_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_XOR_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_AND_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_AND_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_ADD_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_ADD_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_SUB_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_SUB_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_DEC_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_DEC_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_INC_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_INC_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_SMIN_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_SMIN_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_SWAP_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_SWAP_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_SMAX_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_SMAX_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_UMIN_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_UMIN_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_UMAX_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_UMAX_X2_SADDR %
+# GCN:      GLOBAL_ATOMIC_CMPSWAP_X2_SADDR_RTN
+# GCN:      GLOBAL_ATOMIC_CMPSWAP_X2_SADDR %
+
+name:            global_load_store_atomics
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 36, 0 :: (dereferenceable invariant load 8 )
+    %5:sreg_32_xm0 = S_MOV_B32 2
+    %6:vgpr_32 = V_LSHLREV_B32_e64 killed %5, %0, implicit $exec
+    %7:sreg_32_xm0 = S_MOV_B32 0
+    %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %14:vreg_64 = REG_SEQUENCE killed %6, %subreg.sub0, killed %15, %subreg.sub1
+    %21:sgpr_32 = COPY %4.sub0
+    %22:vgpr_32 = COPY %14.sub0
+    %23:sgpr_32 = COPY %4.sub1
+    %24:vgpr_32 = COPY %14.sub1
+    %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21, %22, implicit $exec
+    %25:vgpr_32 = COPY %23
+    %18:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %25, %24, killed %19, implicit $exec
+    %16:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %18, %subreg.sub1
+    %11:vreg_64 = COPY %16
+
+    %10:vgpr_32 = GLOBAL_LOAD_DWORD %11, 16, 0, 0, implicit $exec :: (load 4)
+    GLOBAL_STORE_DWORD %11, %10, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %40:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 16, 0, 0, implicit $exec :: (load 4)
+    GLOBAL_STORE_DWORDX2 %11, %40, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %41:vreg_96 = GLOBAL_LOAD_DWORDX3 %11, 16, 0, 0, implicit $exec :: (load 4)
+    GLOBAL_STORE_DWORDX3 %11, %41, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %42:vreg_128 = GLOBAL_LOAD_DWORDX4 %11, 16, 0, 0, implicit $exec :: (load 4)
+    GLOBAL_STORE_DWORDX4 %11, %42, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %43:vgpr_32 = GLOBAL_LOAD_SSHORT %11, 16, 0, 0, implicit $exec :: (load 4)
+    GLOBAL_STORE_SHORT %11, %43, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %44:vgpr_32 = GLOBAL_LOAD_USHORT %11, 16, 0, 0, implicit $exec :: (load 4)
+    GLOBAL_STORE_SHORT %11, %44, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %45:vgpr_32 = GLOBAL_LOAD_UBYTE %11, 16, 0, 0, implicit $exec :: (load 4)
+    GLOBAL_STORE_BYTE %11, %45, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %46:vgpr_32 = GLOBAL_LOAD_SBYTE %11, 16, 0, 0, implicit $exec :: (load 4)
+    GLOBAL_STORE_BYTE %11, %46, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %47:vgpr_32 = GLOBAL_LOAD_SBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+    GLOBAL_STORE_BYTE_D16_HI %11, %47, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %48:vgpr_32 = GLOBAL_LOAD_UBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+    GLOBAL_STORE_BYTE_D16_HI %11, %48, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %49:vgpr_32 = GLOBAL_LOAD_SBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+    GLOBAL_STORE_BYTE_D16_HI %11, %49, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %50:vgpr_32 = GLOBAL_LOAD_UBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+    GLOBAL_STORE_BYTE_D16_HI %11, %50, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %51:vgpr_32 = GLOBAL_LOAD_SHORT_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+    GLOBAL_STORE_SHORT_D16_HI %11, %51, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    %52:vgpr_32 = GLOBAL_LOAD_SHORT_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+    GLOBAL_STORE_SHORT_D16_HI %11, %52, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+
+    %53:vgpr_32 = GLOBAL_ATOMIC_XOR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %53, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_XOR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %54:vgpr_32 = GLOBAL_ATOMIC_SMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %54, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_SMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %55:vgpr_32 = GLOBAL_ATOMIC_AND_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %55, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_AND %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %56:vgpr_32 = GLOBAL_ATOMIC_SWAP_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %56, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_SWAP %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %57:vgpr_32 = GLOBAL_ATOMIC_SMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %57, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_SMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %58:vgpr_32 = GLOBAL_ATOMIC_UMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %58, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_UMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %59:vgpr_32 = GLOBAL_ATOMIC_UMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %59, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_UMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %60:vgpr_32 = GLOBAL_ATOMIC_OR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %60, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_OR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %61:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %61, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_ADD %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %62:vgpr_32 = GLOBAL_ATOMIC_SUB_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %62, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_SUB %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %63:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %63, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_CMPSWAP %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %64:vgpr_32 = GLOBAL_ATOMIC_INC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %64, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_INC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %65:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORD %11, %65, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_DEC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %66:vreg_64 = GLOBAL_ATOMIC_OR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %66, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_OR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %67:vreg_64 = GLOBAL_ATOMIC_XOR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %67, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_XOR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %68:vreg_64 = GLOBAL_ATOMIC_AND_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %68, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_AND_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %69:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %69, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_ADD_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %70:vreg_64 = GLOBAL_ATOMIC_SUB_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %70, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_SUB_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %71:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %71, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_DEC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %72:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %72, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_INC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %73:vreg_64 = GLOBAL_ATOMIC_SMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %73, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_SMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %74:vreg_64 = GLOBAL_ATOMIC_SWAP_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %74, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_SWAP_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %75:vreg_64 = GLOBAL_ATOMIC_SMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %75, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_SMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %76:vreg_64 = GLOBAL_ATOMIC_UMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %76, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_UMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %77:vreg_64 = GLOBAL_ATOMIC_UMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %77, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_UMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    %79:sreg_128 = REG_SEQUENCE %4, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3
+    %80:vreg_128 = COPY %79
+
+    %78:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %11, %78, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    GLOBAL_ATOMIC_CMPSWAP_X2 %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+    S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/global-saddr.ll b/test/CodeGen/AMDGPU/global-saddr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b21fd9852267908909c299eb983f6502a58ed6be
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-saddr.ll
@@ -0,0 +1,102 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GFX9 %s
+
+; Test for a conv2d like sequence of loads.
+
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}}
+; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
+
+define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %idx = zext i32 %id to i64
+  %gep = getelementptr i64, i64 addrspace(1)* %src_image, i64 %idx
+  %ptr0 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1
+  %load0 = load i64, i64 addrspace(1)* %ptr0
+  %ptr1 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 2
+  %load1 = load i64, i64 addrspace(1)* %ptr1
+  %ptr2 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 3
+  %load2 = load i64, i64 addrspace(1)* %ptr2
+  %ptr3 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 4
+  %load3 = load i64, i64 addrspace(1)* %ptr3
+  %ptr4 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -4
+  %load4 = load i64, i64 addrspace(1)* %ptr4
+  %ptr5 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -3
+  %load5 = load i64, i64 addrspace(1)* %ptr5
+  %ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2
+  %load6 = load i64, i64 addrspace(1)* %ptr6
+  %ptr7 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1
+  %load7 = load i64, i64 addrspace(1)* %ptr7
+  %ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 0
+  %load8 = load i64, i64 addrspace(1)* %ptr8
+  %add0 = add i64 %load1, %load0
+  %add1 = add i64 %load3, %load2
+  %add2 = add i64 %load5, %load4
+  %add3 = add i64 %load7, %load6
+  %add4 = add i64 %add0, %load8
+  %add5 = add i64 %add2, %add1
+  %add6 = add i64 %add4, %add3
+  %add7 = add i64 %add6, %add5
+  %gep9 = getelementptr i64, i64 addrspace(1)* %dst_image, i64 %idx
+  %ptr9 = getelementptr inbounds i64, i64 addrspace(1)* %gep9, i64 1
+  store volatile i64 %add7, i64 addrspace(1)* %ptr9
+
+; Test various offset boundaries.
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
+  %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
+  %load11 = load i64, i64 addrspace(1)* %gep11
+  %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023
+  %load12 = load i64, i64 addrspace(1)* %gep12
+  %gep13 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
+  %load13 = load i64, i64 addrspace(1)* %gep13
+  %add11 = add i64 %load11, %load12
+  %add12 = add i64 %add11, %load13
+  store volatile i64 %add12, i64 addrspace(1)* undef
+
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
+  %gep21 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1024
+  %load21 = load i64, i64 addrspace(1)* %gep21
+  %gep22 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2048
+  %load22 = load i64, i64 addrspace(1)* %gep22
+  %gep23 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -512
+  %load23 = load i64, i64 addrspace(1)* %gep23
+  %add21 = add i64 %load22, %load21
+  %add22 = add i64 %add21, %load23
+  store volatile i64 %add22, i64 addrspace(1)* undef
+
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
+  %gep31 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 257
+  %load31 = load i64, i64 addrspace(1)* %gep31
+  %gep32 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 256
+  %load32 = load i64, i64 addrspace(1)* %gep32
+  %gep33 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
+  %load33 = load i64, i64 addrspace(1)* %gep33
+  %add34 = add i64 %load32, %load31
+  %add35 = add i64 %add34, %load33
+  store volatile i64 %add35, i64 addrspace(1)* undef
+  ret void
+}
+
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}}
+; GFX9-NEXT: s_waitcnt
+; NGFX9-NOT: global_load_dword
+
+define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) {
+bb:
+  %tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)*
+  %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16
+  store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
index b1901cf894b08ef4f02b02ea800b83c50cd9bae1..a454fa02579be15dbae02c9c46c944621d4e18eb 100644
--- a/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -70,10 +70,10 @@ define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, doub
   ret void
 }
 
-attributes #0 = { nounwind "target-cpu"="kaveri" }
-attributes #1 = { nounwind "target-cpu"="fiji" }
-attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" }
-attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" }
-attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
-attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
-attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" }
+attributes #0 = { nounwind "target-cpu"="kaveri" "target-features"="-code-object-v3" }
+attributes #1 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3" }
+attributes #2 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,+fp64-fp16-denormals" }
+attributes #3 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,-fp64-fp16-denormals" }
+attributes #4 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,+fp64-fp16-denormals" }
+attributes #5 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,-fp64-fp16-denormals" }
+attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3,-dx10-clamp" }
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
index d117cf59ee15c238a6ee58c02e09b41ec94bd46d..76a17215b7fa1224728bb0357c1323176a826090 100644
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-deduce-ro-arg-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-deduce-ro-arg-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..15b6286bba77d6c85eb40b6f2c6f0019ecc82e5d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-deduce-ro-arg-v3.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s
+
+; CHECK:        .symbol:          test_ro_arg.kd
+; CHECK:        .name:            test_ro_arg
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:       'float*'
+; CHECK-NEXT:       .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            in
+; CHECK-NEXT:       .access:          read_only
+; CHECK-NEXT:       .offset:          0
+; CHECK-NEXT:       .is_const:        true
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .is_restrict:     true
+; CHECK-NEXT:       .value_type:      f32
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .type_name:       'float*'
+; CHECK-NEXT:       .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            out
+; CHECK-NEXT:       .offset:          8
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      f32
+; CHECK-NEXT:       .address_space:   global
+
+define amdgpu_kernel void @test_ro_arg(float addrspace(1)* noalias readonly %in, float addrspace(1)* %out)
+    !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2
+    !kernel_arg_base_type !2 !kernel_arg_type_qual !3 {
+  ret void
+}
+
+!0 = !{i32 1, i32 1}
+!1 = !{!"none", !"none"}
+!2 = !{!"float*", !"float*"}
+!3 = !{!"const restrict", !""}
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d72f3f6d745f503a8df8477b859110043ff35ad7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel-v3.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+; CHECK:        .symbol:          test_non_enqueue_kernel_caller.kd
+; CHECK:        .name:            test_non_enqueue_kernel_caller
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:     char
+; CHECK-NEXT:       .value_kind:    by_value
+; CHECK-NEXT:       .offset:        0
+; CHECK-NEXT:       .size:          1
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .name:          a
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_x
+; CHECK-NEXT:       .offset:        8
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_y
+; CHECK-NEXT:       .offset:        16
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_z
+; CHECK-NEXT:       .offset:        24
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NOT:        .value_kind:    hidden_default_queue
+; CHECK-NOT:        .value_kind:    hidden_completion_action
+define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_enqueue_kernel_caller.kd
+; CHECK:        .name:            test_enqueue_kernel_caller
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:     char
+; CHECK-NEXT:       .value_kind:    by_value
+; CHECK-NEXT:       .offset:        0
+; CHECK-NEXT:       .size:          1
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .name:          a
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_x
+; CHECK-NEXT:       .offset:        8
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_y
+; CHECK-NEXT:       .offset:        16
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_z
+; CHECK-NEXT:       .offset:        24
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_none
+; CHECK-NEXT:       .offset:        32
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .address_space: global
+; CHECK-NEXT:     - .value_kind:    hidden_default_queue
+; CHECK-NEXT:       .offset:        40
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .address_space: global
+; CHECK-NEXT:     - .value_kind:    hidden_completion_action
+; CHECK-NEXT:       .offset:        48
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .address_space: global
+define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK-NOT:  amdhsa.printf:
+
+attributes #0 = { "calls-enqueue-kernel" }
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"char"}
+!4 = !{!""}
+
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll b/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
index 77624aaafad122007be6918140aee1ba3a83a047..f6e3d94b4dce16faf517549e241d3fdadcbb7d76 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
 
 ; CHECK: ---
 ; CHECK:  Version: [ 1, 0 ]
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..85ee9b4858b4defa898193ecc5835518ebb1cab1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
@@ -0,0 +1,1453 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+%struct.A = type { i8, float }
+%opencl.image1d_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+%opencl.queue_t = type opaque
+%opencl.pipe_t = type opaque
+%struct.B = type { i32 addrspace(1)*}
+%opencl.clk_event_t = type opaque
+
+@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+; CHECK:        .symbol:          test_char.kd
+; CHECK:        .name:            test_char
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      char
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           1
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NOT:        .value_kind:     hidden_default_queue
+; CHECK-NOT:        .value_kind:     hidden_completion_action
+define amdgpu_kernel void @test_char(i8 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
+    !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_ushort2.kd
+; CHECK:        .name:            test_ushort2
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      ushort2
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     u16
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_ushort2(<2 x i16> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10
+    !kernel_arg_base_type !10 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_int3.kd
+; CHECK:        .name:            test_int3
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int3
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           16
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_int3(<3 x i32> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11
+    !kernel_arg_base_type !11 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_ulong4.kd
+; CHECK:        .name:            test_ulong4
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      ulong4
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           32
+; CHECK-NEXT:       .value_type:     u64
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         56
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_ulong4(<4 x i64> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12
+    !kernel_arg_base_type !12 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_half8.kd
+; CHECK:        .name:            test_half8
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      half8
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           16
+; CHECK-NEXT:       .value_type:     f16
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_half8(<8 x half> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13
+    !kernel_arg_base_type !13 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_float16.kd
+; CHECK:        .name:            test_float16
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      float16
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           64
+; CHECK-NEXT:       .value_type:     f32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         64
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         72
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         80
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         88
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_float16(<16 x float> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14
+    !kernel_arg_base_type !14 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_double16.kd
+; CHECK:        .name:            test_double16
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      double16
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           128
+; CHECK-NEXT:       .value_type:     f64
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         128
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         136
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         144
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         152
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_double16(<16 x double> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15
+    !kernel_arg_base_type !15 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_pointer.kd
+; CHECK:        .name:            test_pointer
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16
+    !kernel_arg_base_type !16 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_image.kd
+; CHECK:        .name:            test_image
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      image2d_t
+; CHECK-NEXT:       .value_kind:     image
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17
+    !kernel_arg_base_type !17 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_sampler.kd
+; CHECK:        .name:            test_sampler
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      sampler_t
+; CHECK-NEXT:       .value_kind:     sampler
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_sampler(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18
+    !kernel_arg_base_type !18 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_queue.kd
+; CHECK:        .name:            test_queue
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      queue_t
+; CHECK-NEXT:       .value_kind:     queue
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19
+    !kernel_arg_base_type !19 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_struct.kd
+; CHECK:        .name:            test_struct
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      struct A
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space: private
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_struct(%struct.A addrspace(5)* byval %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20
+    !kernel_arg_base_type !20 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_i128.kd
+; CHECK:        .name:            test_i128
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      i128
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           16
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_i128(i128 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21
+    !kernel_arg_base_type !21 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_multi_arg.kd
+; CHECK:        .name:            test_multi_arg
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .type_name:      short2
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         4
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i16
+; CHECK-NEXT:       .name:           b
+; CHECK-NEXT:     - .type_name:      char3
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .name:           c
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c)
+    !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24
+    !kernel_arg_base_type !24 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_addr_space.kd
+; CHECK:        .name:            test_addr_space
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           g
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           c
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space: constant
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           l
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .pointee_align:  4
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g,
+                                           i32 addrspace(4)* %c,
+                                           i32 addrspace(3)* %l)
+    !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51
+    !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_type_qual.kd
+; CHECK:        .name:            test_type_qual
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .is_volatile:    true
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           b
+; CHECK-NEXT:       .is_const:       true
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .is_restrict:    true
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     pipe
+; CHECK-NEXT:       .name:           c
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .is_pipe:        true
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a,
+                                          i32 addrspace(1)* %b,
+                                          %opencl.pipe_t addrspace(1)* %c)
+    !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51
+    !kernel_arg_base_type !51 !kernel_arg_type_qual !70 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_access_qual.kd
+; CHECK:        .name:            test_access_qual
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      image1d_t
+; CHECK-NEXT:       .value_kind:     image
+; CHECK-NEXT:       .name:           ro
+; CHECK-NEXT:       .access:         read_only
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      image2d_t
+; CHECK-NEXT:       .value_kind:     image
+; CHECK-NEXT:       .name:           wo
+; CHECK-NEXT:       .access:         write_only
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      image3d_t
+; CHECK-NEXT:       .value_kind:     image
+; CHECK-NEXT:       .name:           rw
+; CHECK-NEXT:       .access:         read_write
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro,
+                                            %opencl.image2d_t addrspace(1)* %wo,
+                                            %opencl.image3d_t addrspace(1)* %rw)
+    !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62
+    !kernel_arg_base_type !62 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_half.kd
+; CHECK:        .name:            test_vec_type_hint_half
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   half
+define amdgpu_kernel void @test_vec_type_hint_half(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_float.kd
+; CHECK:        .name:            test_vec_type_hint_float
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   float
+define amdgpu_kernel void @test_vec_type_hint_float(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_double.kd
+; CHECK:        .name:            test_vec_type_hint_double
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   double
+define amdgpu_kernel void @test_vec_type_hint_double(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_char.kd
+; CHECK:        .name:            test_vec_type_hint_char
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   char
+define amdgpu_kernel void @test_vec_type_hint_char(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_short.kd
+; CHECK:        .name:            test_vec_type_hint_short
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   short
+define amdgpu_kernel void @test_vec_type_hint_short(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_long.kd
+; CHECK:        .name:            test_vec_type_hint_long
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   long
+define amdgpu_kernel void @test_vec_type_hint_long(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_unknown.kd
+; CHECK:        .name:            test_vec_type_hint_unknown
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   unknown
+define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 {
+  ret void
+}
+
+; CHECK:        .reqd_workgroup_size:
+; CHECK-NEXT:     - 1
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 4
+; CHECK:        .symbol:          test_reqd_wgs_vec_type_hint.kd
+; CHECK:        .name:            test_reqd_wgs_vec_type_hint
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:          int
+; CHECK-NEXT:       .value_kind:         by_value
+; CHECK-NEXT:       .offset:             0
+; CHECK-NEXT:       .size:               4
+; CHECK-NEXT:       .value_type:         i32
+; CHECK-NEXT:       .name:               a
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_x
+; CHECK-NEXT:       .offset:             8
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_y
+; CHECK-NEXT:       .offset:             16
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_z
+; CHECK-NEXT:       .offset:             24
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_printf_buffer
+; CHECK-NEXT:       .offset:             32
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i8
+; CHECK-NEXT:       .address_space:      global
+; CHECK:        .vec_type_hint:       int
+define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5
+    !reqd_work_group_size !6 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_wgs_hint_vec_type_hint.kd
+; CHECK:        .workgroup_size_hint:
+; CHECK-NEXT:     - 8
+; CHECK-NEXT:     - 16
+; CHECK-NEXT:     - 32
+; CHECK:        .name:            test_wgs_hint_vec_type_hint
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:          int
+; CHECK-NEXT:       .value_kind:         by_value
+; CHECK-NEXT:       .offset:             0
+; CHECK-NEXT:       .size:               4
+; CHECK-NEXT:       .value_type:         i32
+; CHECK-NEXT:       .name:               a
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_x
+; CHECK-NEXT:       .offset:             8
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_y
+; CHECK-NEXT:       .offset:             16
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_z
+; CHECK-NEXT:       .offset:             24
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_printf_buffer
+; CHECK-NEXT:       .offset:             32
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i8
+; CHECK-NEXT:       .address_space:      global
+; CHECK:        .vec_type_hint:       uint4
+define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7
+    !work_group_size_hint !8 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_arg_ptr_to_ptr.kd
+; CHECK:        .name:            test_arg_ptr_to_ptr
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)* addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 addrspace(5)* addrspace(1)* %a)
+    !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80
+    !kernel_arg_base_type !80 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_arg_struct_contains_ptr.kd
+; CHECK:        .name:            test_arg_struct_contains_ptr
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      struct B
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space: private
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B addrspace(5)* byval %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82
+    !kernel_arg_base_type !82 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK:        .symbol:          test_arg_vector_of_ptr.kd
+; CHECK:        .name:            test_arg_vector_of_ptr
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'global int addrspace(5)* __attribute__((ext_vector_type(2)))'
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           16
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83
+    !kernel_arg_base_type !83 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_arg_unknown_builtin_type.kd
+; CHECK:        .name:            test_arg_unknown_builtin_type
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      clk_event_t
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_arg_unknown_builtin_type(
+    %opencl.clk_event_t addrspace(1)* %a)
+    !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84
+    !kernel_arg_base_type !84 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_pointee_align.kd
+; CHECK:        .name:            test_pointee_align
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'long  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      'char  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           b
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  1
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char2  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           c
+; CHECK-NEXT:       .offset:         12
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  2
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char3  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           d
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  4
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char4  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           e
+; CHECK-NEXT:       .offset:         20
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  4
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char8  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           f
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  8
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char16  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           g
+; CHECK-NEXT:       .offset:         28
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  16
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         56
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
+                                              i8 addrspace(3)* %b,
+                                              <2 x i8> addrspace(3)* %c,
+                                              <3 x i8> addrspace(3)* %d,
+                                              <4 x i8> addrspace(3)* %e,
+                                              <8 x i8> addrspace(3)* %f,
+                                              <16 x i8> addrspace(3)* %g)
+    !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93
+    !kernel_arg_base_type !93 !kernel_arg_type_qual !94 {
+  ret void
+}
+
+; CHECK:        .symbol:          __test_block_invoke_kernel.kd
+; CHECK:        .device_enqueue_symbol: __test_block_invoke_kernel_runtime_handle
+; CHECK:        .name:            __test_block_invoke_kernel
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      __block_literal
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           25
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .name:           arg
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         56
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @__test_block_invoke_kernel(
+    <{ i32, i32, i8*, i8 addrspace(1)*, i8 }> %arg) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110
+    !kernel_arg_base_type !110 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_enqueue_kernel_caller.kd
+; CHECK:        .name:            test_enqueue_kernel_caller
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      char
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           1
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_default_queue
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_completion_action
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
+    !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK: .symbol:            unknown_addrspace_kernarg.kd
+; CHECK: .name:              unknown_addrspace_kernarg
+; CHECK: .args:
+; CHECK-NEXT: .value_kind:      global_buffer
+; CHECK-NEXT: .offset:          0
+; CHECK-NEXT: .size:            8
+; CHECK-NEXT: .value_type:      i32
+; CHECK-NEXT: .name:            ptr
+define amdgpu_kernel void @unknown_addrspace_kernarg(i32 addrspace(12345)* %ptr) #0 {
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK:  amdhsa.printf:
+; CHECK-NEXT: - '1:1:4:%d\n'
+; CHECK-NEXT: - '2:1:8:%g\n'
+
+attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
+attributes #1 = { "calls-enqueue-kernel" }
+
+!llvm.printf.fmts = !{!100, !101}
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"int"}
+!4 = !{!""}
+!5 = !{i32 undef, i32 1}
+!6 = !{i32 1, i32 2, i32 4}
+!7 = !{<4 x i32> undef, i32 0}
+!8 = !{i32 8, i32 16, i32 32}
+!9 = !{!"char"}
+!10 = !{!"ushort2"}
+!11 = !{!"int3"}
+!12 = !{!"ulong4"}
+!13 = !{!"half8"}
+!14 = !{!"float16"}
+!15 = !{!"double16"}
+!16 = !{!"int  addrspace(5)*"}
+!17 = !{!"image2d_t"}
+!18 = !{!"sampler_t"}
+!19 = !{!"queue_t"}
+!20 = !{!"struct A"}
+!21 = !{!"i128"}
+!22 = !{i32 0, i32 0, i32 0}
+!23 = !{!"none", !"none", !"none"}
+!24 = !{!"int", !"short2", !"char3"}
+!25 = !{!"", !"", !""}
+!26 = !{half undef, i32 1}
+!27 = !{float undef, i32 1}
+!28 = !{double undef, i32 1}
+!29 = !{i8 undef, i32 1}
+!30 = !{i16 undef, i32 1}
+!31 = !{i64 undef, i32 1}
+!32 = !{i32  addrspace(5)*undef, i32 1}
+!50 = !{i32 1, i32 2, i32 3}
+!51 = !{!"int  addrspace(5)*", !"int  addrspace(5)*", !"int  addrspace(5)*"}
+!60 = !{i32 1, i32 1, i32 1}
+!61 = !{!"read_only", !"write_only", !"read_write"}
+!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"}
+!70 = !{!"volatile", !"const restrict", !"pipe"}
+!80 = !{!"int  addrspace(5)* addrspace(5)*"}
+!81 = !{i32 1}
+!82 = !{!"struct B"}
+!83 = !{!"global int addrspace(5)* __attribute__((ext_vector_type(2)))"}
+!84 = !{!"clk_event_t"}
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3}
+!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"}
+!93 = !{!"long  addrspace(5)*", !"char  addrspace(5)*", !"char2  addrspace(5)*", !"char3  addrspace(5)*", !"char4  addrspace(5)*", !"char8  addrspace(5)*", !"char16  addrspace(5)*"}
+!94 = !{!"", !"", !"", !"", !"", !"", !""}
+!100 = !{!"1:1:4:%d\5Cn"}
+!101 = !{!"2:1:8:%g\5Cn"}
+!110 = !{!"__block_literal"}
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
index 485e02da7d9008f7774118bce0ad6d5244301246..4dce2bf832edabffb4d570e65c67ffa9475d96d4 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
 
 %struct.A = type { i8, float }
 %opencl.image1d_t = type opaque
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fa8a182cca94bfd0ce9ffa27a7c231833597512e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+; CHECK:        .symbol:     test.kd
+; CHECK:        .name:       test
+; CHECK:        .args:
+; CHECK-NEXT:     - .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            r
+; CHECK-NEXT:       .offset:          0
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      f16
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            a
+; CHECK-NEXT:       .offset:          8
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      f16
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            b
+; CHECK-NEXT:       .offset:          16
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      f16
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      hidden_global_offset_x
+; CHECK-NEXT:       .offset:          24
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i64
+; CHECK-NEXT:     - .value_kind:      hidden_global_offset_y
+; CHECK-NEXT:       .offset:          32
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i64
+; CHECK-NEXT:     - .value_kind:      hidden_global_offset_z
+; CHECK-NEXT:       .offset:          40
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i64
+; CHECK-NEXT:     - .value_kind:      hidden_none
+; CHECK-NEXT:       .offset:          48
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i8
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      hidden_none
+; CHECK-NEXT:       .offset:          56
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i8
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      hidden_none
+; CHECK-NEXT:       .offset:          64
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i8
+; CHECK-NEXT:       .address_space:   global
+define amdgpu_kernel void @test(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+
+!opencl.ocl.version = !{!0}
+!0 = !{i32 2, i32 0}
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
index ed2e79684c6de511cd37922b2e8bab05f1ada46d..6dbc1e2523d249caaf40954671c97aa6aa6def4f 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 ; CHECK: ---
 ; CHECK:  Version: [ 1, 0 ]
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7a9c810541ef8f90d48dd51069dac5cbfd35d3ee
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
@@ -0,0 +1,95 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+%opencl.image1d_t = type opaque
+%opencl.image1d_array_t = type opaque
+%opencl.image1d_buffer_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image2d_array_t = type opaque
+%opencl.image2d_array_depth_t = type opaque
+%opencl.image2d_array_msaa_t = type opaque
+%opencl.image2d_array_msaa_depth_t = type opaque
+%opencl.image2d_depth_t = type opaque
+%opencl.image2d_msaa_t = type opaque
+%opencl.image2d_msaa_depth_t = type opaque
+%opencl.image3d_t = type opaque
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+; CHECK:      .symbol:     test.kd
+; CHECK:      .name:       test
+; CHECK:      .args:
+; CHECK:        - .type_name:  image1d_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       a
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image1d_array_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       b
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image1d_buffer_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       c
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       d
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_array_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       e
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_array_depth_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       f
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_array_msaa_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       g
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_array_msaa_depth_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       h
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_depth_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       i
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_msaa_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       j
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_msaa_depth_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       k
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image3d_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       l
+; CHECK:          .size:       8
+define amdgpu_kernel void @test(%opencl.image1d_t addrspace(1)* %a,
+                                %opencl.image1d_array_t addrspace(1)* %b,
+                                %opencl.image1d_buffer_t addrspace(1)* %c,
+                                %opencl.image2d_t addrspace(1)* %d,
+                                %opencl.image2d_array_t addrspace(1)* %e,
+                                %opencl.image2d_array_depth_t addrspace(1)* %f,
+                                %opencl.image2d_array_msaa_t addrspace(1)* %g,
+                                %opencl.image2d_array_msaa_depth_t addrspace(1)* %h,
+                                %opencl.image2d_depth_t addrspace(1)* %i,
+                                %opencl.image2d_msaa_t addrspace(1)* %j,
+                                %opencl.image2d_msaa_depth_t addrspace(1)* %k,
+                                %opencl.image3d_t addrspace(1)* %l)
+    !kernel_arg_type !1 !kernel_arg_base_type !1 {
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+
+!1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t",
+       !"image2d_t", !"image2d_array_t", !"image2d_array_depth_t",
+       !"image2d_array_msaa_t", !"image2d_array_msaa_depth_t",
+       !"image2d_depth_t", !"image2d_msaa_t", !"image2d_msaa_depth_t",
+       !"image3d_t"}
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-images.ll b/test/CodeGen/AMDGPU/hsa-metadata-images.ll
index 00dee3b6c69420ffb757fef4422da539523eb48c..fd0159984297409d3c76239fe82dd7964a0878b4 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-images.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-images.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 %opencl.image1d_t = type opaque
 %opencl.image1d_array_t = type opaque
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fefb22e939e29e7ab0f7c1d621b24bd2480e302c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK: ...
+
+!opencl.ocl.version = !{}
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-2-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-2-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0d9f920ade1a418f692305b1c656bc40595fe7d8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-2-v3.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{}
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5382e66b6beaf46010e36f9aae3a4d76bea00613
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{i32 1}
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1c23b2e72058af3190d4cbf03dc52f129b120a07
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -0,0 +1,146 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+@var = addrspace(1) global float 0.0
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+
+; CHECK: - .max_flat_workgroup_size:    256
+; CHECK:   .kernarg_segment_size:       24
+; CHECK:   .private_segment_fixed_size: 0
+; CHECK:   .wavefront_size:             64
+; CHECK:   .symbol:     test.kd
+; CHECK:   .name:       test
+; CHECK:   .sgpr_count:                 8
+; CHECK:   .kernarg_segment_align:      8
+; CHECK:   .vgpr_count:                 6
+; CHECK:   .group_segment_fixed_size:   0
+define amdgpu_kernel void @test(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; CHECK:   .symbol:     num_spilled_sgprs.kd
+; CHECK:   .name:       num_spilled_sgprs
+; GFX700:   .sgpr_spill_count: 40
+; GFX803:   .sgpr_spill_count: 24
+; GFX900:   .sgpr_spill_count: 24
+define amdgpu_kernel void @num_spilled_sgprs(
+    i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
+    i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
+    i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32],
+    i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32],
+    i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32],
+    i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32],
+    i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32],
+    i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32],
+    i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32],
+    i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32],
+    i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32],
+    i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
+entry:
+  store i32 %in0, i32 addrspace(1)* %out0
+  store i32 %in1, i32 addrspace(1)* %out1
+  store i32 %in2, i32 addrspace(1)* %out2
+  store i32 %in3, i32 addrspace(1)* %out3
+  store i32 %in4, i32 addrspace(1)* %out4
+  store i32 %in5, i32 addrspace(1)* %out5
+  store i32 %in6, i32 addrspace(1)* %out6
+  store i32 %in7, i32 addrspace(1)* %out7
+  store i32 %in8, i32 addrspace(1)* %out8
+  store i32 %in9, i32 addrspace(1)* %out9
+  store i32 %ina, i32 addrspace(1)* %outa
+  store i32 %inb, i32 addrspace(1)* %outb
+  store i32 %inc, i32 addrspace(1)* %outc
+  store i32 %ind, i32 addrspace(1)* %outd
+  store i32 %ine, i32 addrspace(1)* %oute
+  store i32 %inf, i32 addrspace(1)* %outf
+  ret void
+}
+
+; CHECK:   .symbol:     num_spilled_vgprs.kd
+; CHECK:   .name:       num_spilled_vgprs
+; CHECK:   .vgpr_spill_count: 14
+define amdgpu_kernel void @num_spilled_vgprs() #1 {
+  %val0 = load volatile float, float addrspace(1)* @var
+  %val1 = load volatile float, float addrspace(1)* @var
+  %val2 = load volatile float, float addrspace(1)* @var
+  %val3 = load volatile float, float addrspace(1)* @var
+  %val4 = load volatile float, float addrspace(1)* @var
+  %val5 = load volatile float, float addrspace(1)* @var
+  %val6 = load volatile float, float addrspace(1)* @var
+  %val7 = load volatile float, float addrspace(1)* @var
+  %val8 = load volatile float, float addrspace(1)* @var
+  %val9 = load volatile float, float addrspace(1)* @var
+  %val10 = load volatile float, float addrspace(1)* @var
+  %val11 = load volatile float, float addrspace(1)* @var
+  %val12 = load volatile float, float addrspace(1)* @var
+  %val13 = load volatile float, float addrspace(1)* @var
+  %val14 = load volatile float, float addrspace(1)* @var
+  %val15 = load volatile float, float addrspace(1)* @var
+  %val16 = load volatile float, float addrspace(1)* @var
+  %val17 = load volatile float, float addrspace(1)* @var
+  %val18 = load volatile float, float addrspace(1)* @var
+  %val19 = load volatile float, float addrspace(1)* @var
+  %val20 = load volatile float, float addrspace(1)* @var
+  %val21 = load volatile float, float addrspace(1)* @var
+  %val22 = load volatile float, float addrspace(1)* @var
+  %val23 = load volatile float, float addrspace(1)* @var
+  %val24 = load volatile float, float addrspace(1)* @var
+  %val25 = load volatile float, float addrspace(1)* @var
+  %val26 = load volatile float, float addrspace(1)* @var
+  %val27 = load volatile float, float addrspace(1)* @var
+  %val28 = load volatile float, float addrspace(1)* @var
+  %val29 = load volatile float, float addrspace(1)* @var
+  %val30 = load volatile float, float addrspace(1)* @var
+
+  store volatile float %val0, float addrspace(1)* @var
+  store volatile float %val1, float addrspace(1)* @var
+  store volatile float %val2, float addrspace(1)* @var
+  store volatile float %val3, float addrspace(1)* @var
+  store volatile float %val4, float addrspace(1)* @var
+  store volatile float %val5, float addrspace(1)* @var
+  store volatile float %val6, float addrspace(1)* @var
+  store volatile float %val7, float addrspace(1)* @var
+  store volatile float %val8, float addrspace(1)* @var
+  store volatile float %val9, float addrspace(1)* @var
+  store volatile float %val10, float addrspace(1)* @var
+  store volatile float %val11, float addrspace(1)* @var
+  store volatile float %val12, float addrspace(1)* @var
+  store volatile float %val13, float addrspace(1)* @var
+  store volatile float %val14, float addrspace(1)* @var
+  store volatile float %val15, float addrspace(1)* @var
+  store volatile float %val16, float addrspace(1)* @var
+  store volatile float %val17, float addrspace(1)* @var
+  store volatile float %val18, float addrspace(1)* @var
+  store volatile float %val19, float addrspace(1)* @var
+  store volatile float %val20, float addrspace(1)* @var
+  store volatile float %val21, float addrspace(1)* @var
+  store volatile float %val22, float addrspace(1)* @var
+  store volatile float %val23, float addrspace(1)* @var
+  store volatile float %val24, float addrspace(1)* @var
+  store volatile float %val25, float addrspace(1)* @var
+  store volatile float %val26, float addrspace(1)* @var
+  store volatile float %val27, float addrspace(1)* @var
+  store volatile float %val28, float addrspace(1)* @var
+  store volatile float %val29, float addrspace(1)* @var
+  store volatile float %val30, float addrspace(1)* @var
+
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+
+attributes #0 = { "amdgpu-num-sgpr"="14" }
+attributes #1 = { "amdgpu-num-vgpr"="20" }
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 3dc3f320db851bc772342f3af3631242ca58daba..b5b6aa450bfbc6214dfe1538e1399b0a10a76fd7 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 @var = addrspace(1) global float 0.0
 
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
index fab086e6cb11bb84c16a546cfcf6064ea587e929..7eacdc1cdaba0068e473c1dc2e5da7f9b07aff17 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 target datalayout = "A5"
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata)
@@ -32,7 +32,7 @@ entry:
   ret void, !dbg !25
 }
 
-attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
 !opencl.ocl.version = !{!3}
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index e937aaca66f21db299434bce43d2981ab1c1c695..4cc9e3c112a28f90d66a0923ce18d9e6d0d4ae80 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -1,29 +1,29 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx600 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI600 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx601 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI601 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx700 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx701 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx702 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx703 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx704 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=bonaire | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=mullins | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=hawaii | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kabini | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris10 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris11 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx801 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx802 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx600 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI600 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx601 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI601 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx700 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx701 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx702 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx703 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx704 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=bonaire -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=mullins -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=hawaii -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kabini -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris10 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris11 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx801 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx802 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
 
 ; HSA: .hsa_code_object_version 2,1
 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
index 0b19fbe7d70ce3faa648991db3c6ca192db6a76e..e23b2d922a37ebb2b64ebd2179b4aa0b397b81cd 100644
--- a/test/CodeGen/AMDGPU/hsa.ll
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
diff --git a/test/CodeGen/AMDGPU/idot8.ll b/test/CodeGen/AMDGPU/idot8.ll
index e0cd2ad506be985e24f3a0ca36129cb49b2f7e26..edb88f1912efc322ff9b6ae66c8eb6c2e47990f3 100644
--- a/test/CodeGen/AMDGPU/idot8.ll
+++ b/test/CodeGen/AMDGPU/idot8.ll
@@ -3656,67 +3656,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s0, 0
-; GFX7-NEXT:    s_mov_b32 s12, s0
-; GFX7-NEXT:    s_mov_b32 s14, s0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s38, s[4:5], 0x0
-; GFX7-NEXT:    s_mov_b32 s16, s0
-; GFX7-NEXT:    s_mov_b32 s18, s0
+; GFX7-NEXT:    s_load_dword s9, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s13, s1, 4
-; GFX7-NEXT:    s_lshl_b32 s15, s1, 12
-; GFX7-NEXT:    s_lshl_b32 s17, s1, 16
-; GFX7-NEXT:    s_lshl_b32 s19, s1, 20
-; GFX7-NEXT:    s_ashr_i64 s[10:11], s[12:13], 60
+; GFX7-NEXT:    s_ashr_i64 s[10:11], s[0:1], 60
+; GFX7-NEXT:    s_lshl_b32 s11, s1, 4
+; GFX7-NEXT:    s_ashr_i64 s[14:15], s[10:11], 60
+; GFX7-NEXT:    s_lshl_b32 s11, s1, 12
+; GFX7-NEXT:    s_ashr_i64 s[16:17], s[10:11], 60
+; GFX7-NEXT:    s_lshl_b32 s11, s1, 16
+; GFX7-NEXT:    s_ashr_i64 s[18:19], s[10:11], 60
+; GFX7-NEXT:    s_lshl_b32 s11, s1, 20
 ; GFX7-NEXT:    s_lshl_b32 s13, s1, 8
-; GFX7-NEXT:    s_ashr_i64 s[8:9], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s21, s1, 24
+; GFX7-NEXT:    s_ashr_i64 s[20:21], s[10:11], 60
+; GFX7-NEXT:    s_lshl_b32 s11, s1, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 28
-; GFX7-NEXT:    s_ashr_i64 s[22:23], s[0:1], 60
-; GFX7-NEXT:    s_mov_b32 s1, s2
-; GFX7-NEXT:    s_ashr_i64 s[24:25], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s2, 4
+; GFX7-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
+; GFX7-NEXT:    s_lshl_b32 s1, s9, 4
 ; GFX7-NEXT:    s_ashr_i64 s[26:27], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s2, 8
+; GFX7-NEXT:    s_lshl_b32 s1, s9, 8
 ; GFX7-NEXT:    s_ashr_i64 s[28:29], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s2, 12
+; GFX7-NEXT:    s_lshl_b32 s1, s9, 12
 ; GFX7-NEXT:    s_ashr_i64 s[30:31], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s2, 16
+; GFX7-NEXT:    s_lshl_b32 s1, s9, 16
 ; GFX7-NEXT:    s_ashr_i64 s[32:33], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s2, 20
+; GFX7-NEXT:    s_lshl_b32 s1, s9, 20
 ; GFX7-NEXT:    s_ashr_i64 s[34:35], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s9, 24
 ; GFX7-NEXT:    s_ashr_i64 s[36:37], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s2, 28
-; GFX7-NEXT:    s_mov_b32 s20, s0
-; GFX7-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s38
-; GFX7-NEXT:    v_mad_i32_i24 v0, s22, v0, v1
-; GFX7-NEXT:    s_ashr_i64 s[20:21], s[20:21], 60
+; GFX7-NEXT:    s_lshl_b32 s1, s9, 28
+; GFX7-NEXT:    s_ashr_i64 s[24:25], s[8:9], 60
+; GFX7-NEXT:    s_ashr_i64 s[8:9], s[0:1], 60
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v0, v1
+; GFX7-NEXT:    s_ashr_i64 s[22:23], s[10:11], 60
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s36
-; GFX7-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
-; GFX7-NEXT:    s_ashr_i64 s[18:19], s[18:19], 60
+; GFX7-NEXT:    v_mad_i32_i24 v0, s22, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s34
-; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
-; GFX7-NEXT:    s_ashr_i64 s[16:17], s[16:17], 60
+; GFX7-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s32
-; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
-; GFX7-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s30
-; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
 ; GFX7-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s28
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s26
-; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s24
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -3724,67 +3717,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT:    s_mov_b32 s8, 0
-; GFX8-NEXT:    s_mov_b32 s10, s8
-; GFX8-NEXT:    s_mov_b32 s12, s8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s9, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s36, s[0:1], 0x0
-; GFX8-NEXT:    s_mov_b32 s14, s8
-; GFX8-NEXT:    s_mov_b32 s16, s8
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s11, s9, 4
-; GFX8-NEXT:    s_lshl_b32 s13, s9, 8
-; GFX8-NEXT:    s_lshl_b32 s15, s9, 16
-; GFX8-NEXT:    s_lshl_b32 s17, s9, 20
-; GFX8-NEXT:    s_ashr_i64 s[6:7], s[10:11], 60
-; GFX8-NEXT:    s_ashr_i64 s[10:11], s[12:13], 60
-; GFX8-NEXT:    s_lshl_b32 s13, s9, 12
-; GFX8-NEXT:    s_ashr_i64 s[4:5], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s19, s9, 24
-; GFX8-NEXT:    s_lshl_b32 s9, s9, 28
-; GFX8-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
-; GFX8-NEXT:    s_mov_b32 s9, s2
-; GFX8-NEXT:    s_ashr_i64 s[22:23], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s2, 4
-; GFX8-NEXT:    s_ashr_i64 s[24:25], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s2, 8
-; GFX8-NEXT:    s_ashr_i64 s[26:27], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s2, 12
-; GFX8-NEXT:    s_ashr_i64 s[28:29], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s2, 16
-; GFX8-NEXT:    s_ashr_i64 s[30:31], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s2, 20
-; GFX8-NEXT:    s_ashr_i64 s[32:33], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s2, 24
-; GFX8-NEXT:    s_ashr_i64 s[34:35], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s2, 28
-; GFX8-NEXT:    s_mov_b32 s18, s8
-; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    v_mov_b32_e32 v1, s36
-; GFX8-NEXT:    v_mad_i32_i24 v0, s20, v0, v1
-; GFX8-NEXT:    s_ashr_i64 s[18:19], s[18:19], 60
-; GFX8-NEXT:    v_mov_b32_e32 v1, s34
-; GFX8-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
-; GFX8-NEXT:    s_ashr_i64 s[16:17], s[16:17], 60
-; GFX8-NEXT:    v_mov_b32_e32 v1, s32
-; GFX8-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
-; GFX8-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
-; GFX8-NEXT:    v_mov_b32_e32 v1, s30
-; GFX8-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
-; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX8-NEXT:    v_mov_b32_e32 v1, s28
-; GFX8-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s26
-; GFX8-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s24
-; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s22
-; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
+; GFX8-NEXT:    s_load_dword s5, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s7, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_ashr_i64 s[0:1], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s5, 4
+; GFX8-NEXT:    s_ashr_i64 s[12:13], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX8-NEXT:    s_ashr_i64 s[14:15], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s5, 20
+; GFX8-NEXT:    s_ashr_i64 s[16:17], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s5, 24
+; GFX8-NEXT:    s_ashr_i64 s[18:19], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s5, 28
+; GFX8-NEXT:    s_lshl_b32 s9, s5, 8
+; GFX8-NEXT:    s_lshl_b32 s11, s5, 12
+; GFX8-NEXT:    s_ashr_i64 s[4:5], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s7, 4
+; GFX8-NEXT:    s_ashr_i64 s[22:23], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s7, 8
+; GFX8-NEXT:    s_ashr_i64 s[24:25], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s7, 12
+; GFX8-NEXT:    s_ashr_i64 s[26:27], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s7, 16
+; GFX8-NEXT:    s_ashr_i64 s[28:29], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s7, 20
+; GFX8-NEXT:    s_ashr_i64 s[30:31], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s7, 24
+; GFX8-NEXT:    s_ashr_i64 s[32:33], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s1, s7, 28
+; GFX8-NEXT:    s_ashr_i64 s[20:21], s[6:7], 60
+; GFX8-NEXT:    s_ashr_i64 s[6:7], s[0:1], 60
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s32
+; GFX8-NEXT:    v_mad_i32_i24 v2, s18, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s30
+; GFX8-NEXT:    v_mad_i32_i24 v2, s16, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s28
+; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
+; GFX8-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX8-NEXT:    v_mov_b32_e32 v3, s26
+; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v3, v2
+; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
+; GFX8-NEXT:    v_mov_b32_e32 v3, s24
+; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s22
+; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s20
+; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -3792,67 +3778,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s36, s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s16, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s11, s9, 4
-; GFX9-NEXT:    s_lshl_b32 s13, s9, 8
-; GFX9-NEXT:    s_lshl_b32 s15, s9, 16
-; GFX9-NEXT:    s_lshl_b32 s17, s9, 20
-; GFX9-NEXT:    s_ashr_i64 s[6:7], s[10:11], 60
-; GFX9-NEXT:    s_ashr_i64 s[10:11], s[12:13], 60
-; GFX9-NEXT:    s_lshl_b32 s13, s9, 12
-; GFX9-NEXT:    s_ashr_i64 s[4:5], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s19, s9, 24
-; GFX9-NEXT:    s_lshl_b32 s9, s9, 28
-; GFX9-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
-; GFX9-NEXT:    s_mov_b32 s9, s2
-; GFX9-NEXT:    s_ashr_i64 s[22:23], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s2, 4
-; GFX9-NEXT:    s_ashr_i64 s[24:25], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s2, 8
-; GFX9-NEXT:    s_ashr_i64 s[26:27], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s2, 12
-; GFX9-NEXT:    s_ashr_i64 s[28:29], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s2, 16
-; GFX9-NEXT:    s_ashr_i64 s[30:31], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s2, 20
-; GFX9-NEXT:    s_ashr_i64 s[32:33], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s2, 24
-; GFX9-NEXT:    s_ashr_i64 s[34:35], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s2, 28
-; GFX9-NEXT:    s_mov_b32 s18, s8
-; GFX9-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s36
-; GFX9-NEXT:    v_mad_i32_i24 v0, s20, v0, v1
-; GFX9-NEXT:    s_ashr_i64 s[18:19], s[18:19], 60
-; GFX9-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
-; GFX9-NEXT:    s_ashr_i64 s[16:17], s[16:17], 60
-; GFX9-NEXT:    v_mov_b32_e32 v1, s32
-; GFX9-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
-; GFX9-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
-; GFX9-NEXT:    v_mov_b32_e32 v1, s30
-; GFX9-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
-; GFX9-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX9-NEXT:    v_mov_b32_e32 v1, s28
-; GFX9-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s26
-; GFX9-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s24
-; GFX9-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s22
-; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
+; GFX9-NEXT:    s_load_dword s5, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s7, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i64 s[0:1], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s5, 4
+; GFX9-NEXT:    s_ashr_i64 s[12:13], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX9-NEXT:    s_ashr_i64 s[14:15], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s5, 20
+; GFX9-NEXT:    s_ashr_i64 s[16:17], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s5, 24
+; GFX9-NEXT:    s_ashr_i64 s[18:19], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s5, 28
+; GFX9-NEXT:    s_lshl_b32 s9, s5, 8
+; GFX9-NEXT:    s_lshl_b32 s11, s5, 12
+; GFX9-NEXT:    s_ashr_i64 s[4:5], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s7, 4
+; GFX9-NEXT:    s_ashr_i64 s[22:23], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s7, 8
+; GFX9-NEXT:    s_ashr_i64 s[24:25], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s7, 12
+; GFX9-NEXT:    s_ashr_i64 s[26:27], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s7, 16
+; GFX9-NEXT:    s_ashr_i64 s[28:29], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s7, 20
+; GFX9-NEXT:    s_ashr_i64 s[30:31], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s7, 24
+; GFX9-NEXT:    s_ashr_i64 s[32:33], s[0:1], 60
+; GFX9-NEXT:    s_lshl_b32 s1, s7, 28
+; GFX9-NEXT:    s_ashr_i64 s[20:21], s[6:7], 60
+; GFX9-NEXT:    s_ashr_i64 s[6:7], s[0:1], 60
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s32
+; GFX9-NEXT:    v_mad_i32_i24 v2, s18, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s30
+; GFX9-NEXT:    v_mad_i32_i24 v2, s16, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s28
+; GFX9-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
+; GFX9-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX9-NEXT:    v_mov_b32_e32 v3, s26
+; GFX9-NEXT:    v_mad_i32_i24 v2, s10, v3, v2
+; GFX9-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
+; GFX9-NEXT:    v_mov_b32_e32 v3, s24
+; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s22
+; GFX9-NEXT:    v_mad_i32_i24 v2, s12, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s20
+; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3860,67 +3839,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s8, 0
-; GFX9-DL-NEXT:    s_mov_b32 s10, s8
-; GFX9-DL-NEXT:    s_mov_b32 s12, s8
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s9, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s36, s[0:1], 0x0
-; GFX9-DL-NEXT:    s_mov_b32 s14, s8
-; GFX9-DL-NEXT:    s_mov_b32 s16, s8
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshl_b32 s11, s9, 4
-; GFX9-DL-NEXT:    s_lshl_b32 s13, s9, 8
-; GFX9-DL-NEXT:    s_lshl_b32 s15, s9, 16
-; GFX9-DL-NEXT:    s_lshl_b32 s17, s9, 20
-; GFX9-DL-NEXT:    s_ashr_i64 s[6:7], s[10:11], 60
-; GFX9-DL-NEXT:    s_ashr_i64 s[10:11], s[12:13], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s13, s9, 12
-; GFX9-DL-NEXT:    s_ashr_i64 s[4:5], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s19, s9, 24
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s9, 28
-; GFX9-DL-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
-; GFX9-DL-NEXT:    s_mov_b32 s9, s2
-; GFX9-DL-NEXT:    s_ashr_i64 s[22:23], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s2, 4
-; GFX9-DL-NEXT:    s_ashr_i64 s[24:25], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s2, 8
-; GFX9-DL-NEXT:    s_ashr_i64 s[26:27], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s2, 12
-; GFX9-DL-NEXT:    s_ashr_i64 s[28:29], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s2, 16
-; GFX9-DL-NEXT:    s_ashr_i64 s[30:31], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s2, 20
-; GFX9-DL-NEXT:    s_ashr_i64 s[32:33], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s2, 24
-; GFX9-DL-NEXT:    s_ashr_i64 s[34:35], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s2, 28
-; GFX9-DL-NEXT:    s_mov_b32 s18, s8
-; GFX9-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s36
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s20, v0, v1
-; GFX9-DL-NEXT:    s_ashr_i64 s[18:19], s[18:19], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
-; GFX9-DL-NEXT:    s_ashr_i64 s[16:17], s[16:17], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s32
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
-; GFX9-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s30
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
-; GFX9-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s28
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s26
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s24
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s22
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
+; GFX9-DL-NEXT:    s_load_dword s5, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s7, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_ashr_i64 s[0:1], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 4
+; GFX9-DL-NEXT:    s_ashr_i64 s[12:13], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX9-DL-NEXT:    s_ashr_i64 s[14:15], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 20
+; GFX9-DL-NEXT:    s_ashr_i64 s[16:17], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 24
+; GFX9-DL-NEXT:    s_ashr_i64 s[18:19], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 28
+; GFX9-DL-NEXT:    s_lshl_b32 s9, s5, 8
+; GFX9-DL-NEXT:    s_lshl_b32 s11, s5, 12
+; GFX9-DL-NEXT:    s_ashr_i64 s[4:5], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 4
+; GFX9-DL-NEXT:    s_ashr_i64 s[22:23], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 8
+; GFX9-DL-NEXT:    s_ashr_i64 s[24:25], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 12
+; GFX9-DL-NEXT:    s_ashr_i64 s[26:27], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 16
+; GFX9-DL-NEXT:    s_ashr_i64 s[28:29], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 20
+; GFX9-DL-NEXT:    s_ashr_i64 s[30:31], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 24
+; GFX9-DL-NEXT:    s_ashr_i64 s[32:33], s[0:1], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 28
+; GFX9-DL-NEXT:    s_ashr_i64 s[20:21], s[6:7], 60
+; GFX9-DL-NEXT:    s_ashr_i64 s[6:7], s[0:1], 60
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s32
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s18, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s30
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s16, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s28
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
+; GFX9-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s26
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s10, v3, v2
+; GFX9-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s24
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s22
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s12, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s20
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c8f75518518be3124bc27e67875a5dc1a7110e98
--- /dev/null
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
@@ -0,0 +1,83 @@
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s
+
+; indexing of vectors.
+
+; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll
+; to avoid gfx9 scheduling induced issues.
+
+
+; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
+; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
+
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
+
+; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
+; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
+
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
+; IDXMODE: s_set_gpr_idx_off
+
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
+
+; GCN: s_mov_b64 [[MASK]], exec
+
+; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
+; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63
+
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63
+; IDXMODE: s_set_gpr_idx_off
+
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN: s_cbranch_execnz [[LOOP1]]
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
+
+; GCN: buffer_store_dword [[INS0]]
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id.ext = zext i32 %id to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %idx1 = add i32 %idx0, 1
+  %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
+  %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
+  %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
+  store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
+  %cmp = icmp eq i32 %id, 0
+  br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+  store volatile i32 %live.out.val, i32 addrspace(1)* undef
+  br label %bb2
+
+bb2:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind convergent }
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
index 63384f5e445083cef0c57a7831d0e5cac27db966..693d06ce68d5cade67dfe7dfc60d2daac8925ba4 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
@@ -9,11 +9,14 @@
 ; CHECK: s_load_dword [[IN:s[0-9]+]]
 ; CHECK: s_mov_b32 m0, [[IN]]
 ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
-; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+; CHECK: buffer_store_dwordx4
+; CHECK: buffer_store_dwordx4
+; CHECK: buffer_store_dwordx4
+; CHECK: buffer_store_dwordx4
+define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
 entry:
-  %ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
-  store <4 x float> %ins, <4 x float> addrspace(1)* %out
+  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
+  store <16 x float> %ins, <16 x float> addrspace(1)* %out
   ret void
 }
 
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
new file mode 100644
index 0000000000000000000000000000000000000000..544ab990fee8457de8356d44d8a8f42434eee699
--- /dev/null
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
@@ -0,0 +1,86 @@
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
+
+; Tests for indirect addressing on SI, which is implemented using dynamic
+; indexing of vectors.
+
+; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll
+; to avoid gfx9 scheduling induced issues.
+
+
+; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
+; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
+
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
+
+; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
+; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
+
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
+; IDXMODE: s_set_gpr_idx_off
+
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
+
+; GCN: s_mov_b64 [[MASK]], exec
+
+; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
+; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63
+
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63
+; IDXMODE: s_set_gpr_idx_off
+
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN: s_cbranch_execnz [[LOOP1]]
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
+
+; GCN: buffer_store_dword [[INS0]]
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id.ext = zext i32 %id to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %idx1 = add i32 %idx0, 1
+  %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
+  %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
+  %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
+  store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
+  %cmp = icmp eq i32 %id, 0
+  br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+  store volatile i32 %live.out.val, i32 addrspace(1)* undef
+  br label %bb2
+
+bb2:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind convergent }
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 8e02303377c001354b7acbe1d24ffa2e2e2490f4..081f623ac13d5f43bb79297ddedc80fb0772a9a0 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -22,7 +22,7 @@
 define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %idx = add i32 %in, 1
-  %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %idx
+  %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
   store float %elt, float addrspace(1)* %out
   ret void
 }
@@ -44,11 +44,11 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
+define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <16 x i32> %or.val) {
 entry:
   %idx = add i32 %in, 1
-  %vec = or <4 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4>
-  %elt = extractelement <4 x i32> %vec, i32 %idx
+  %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  %elt = extractelement <16 x i32> %vec, i32 %idx
   store i32 %elt, i32 addrspace(1)* %out
   ret void
 }
@@ -68,7 +68,7 @@ entry:
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
-  %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
+  %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
   store float %elt, float addrspace(1)* %out
   ret void
 }
@@ -79,15 +79,15 @@ entry:
 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
-; IDXMODE: v_mov_b32_e32 v2, 2
-; IDXMODE: v_mov_b32_e32 v3, 3
+; IDXMODE: v_mov_b32_e32 v14, 15
+; IDXMODE: v_mov_b32_e32 v15, 16
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
-  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -102,14 +102,26 @@ entry:
 ; IDXMODE: v_mov_b32_e32 v1,
 ; IDXMODE: v_mov_b32_e32 v2,
 ; IDXMODE: v_mov_b32_e32 v3,
+; IDXMODE: v_mov_b32_e32 v4,
+; IDXMODE: v_mov_b32_e32 v5,
+; IDXMODE: v_mov_b32_e32 v6,
+; IDXMODE: v_mov_b32_e32 v7,
+; IDXMODE: v_mov_b32_e32 v8,
+; IDXMODE: v_mov_b32_e32 v9,
+; IDXMODE: v_mov_b32_e32 v10,
+; IDXMODE: v_mov_b32_e32 v11,
+; IDXMODE: v_mov_b32_e32 v12,
+; IDXMODE: v_mov_b32_e32 v13,
+; IDXMODE: v_mov_b32_e32 v14,
+; IDXMODE: v_mov_b32_e32 v15,
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
-  %or = or <4 x i32> %vec0, %vec1
-  %value = extractelement <4 x i32> %or, i32 %index
+  %or = or <16 x i32> %vec0, %vec1
+  %value = extractelement <16 x i32> %or, i32 %index
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -138,7 +150,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
-  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -170,15 +182,16 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
 ; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
-; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000
+; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
+; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000
 
 ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
 ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
-define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
 entry:
-  %0 = add i32 %in, 1
-  %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
-  store <4 x float> %1, <4 x float> addrspace(1)* %out
+  %add = add i32 %in, 1
+  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
+  store <16 x float> %ins, <16 x float> addrspace(1)* %out
   ret void
 }
 
@@ -193,27 +206,27 @@ entry:
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
 entry:
-  %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
+  store <16 x float> %ins, <16 x float> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
 ; The offset depends on the register that holds the first element of the vector.
 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; MOVREL: v_movreld_b32_e32 v0, 5
+; MOVREL: v_movreld_b32_e32 v0, 16
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
-; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
+; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
-  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
+  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
   ret void
 }
 
@@ -229,11 +242,11 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
-  %value = insertelement <4 x i32> %vec, i32 5, i32 %index
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  %value = insertelement <16 x i32> %vec, i32 5, i32 %index
+  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
   ret void
 }
 
@@ -244,6 +257,18 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}}
 
 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
@@ -251,23 +276,23 @@ entry:
 ; GCN: s_and_saveexec_b64 vcc, vcc
 
 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
-; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
+; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 33
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
+; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 33
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: s_cbranch_execnz [[LOOPBB]]
 ; GCN: s_mov_b64 exec, [[SAVEEXEC]]
 
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
-  %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 5, i32 %index
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
+  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
   ret void
 }
 
@@ -277,6 +302,18 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
 
 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
@@ -293,12 +330,12 @@ entry:
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: s_cbranch_execnz
-define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -16
-  %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 500, i32 %index
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
+  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
   ret void
 }
 
@@ -364,9 +401,9 @@ entry:
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
   %idx0 = load volatile i32, i32 addrspace(1)* %gep
   %idx1 = add i32 %idx0, 1
-  %val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
+  %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
-  %val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
+  %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1
   store volatile i32 %val0, i32 addrspace(1)* %out0
   store volatile i32 %val1, i32 addrspace(1)* %out0
   %cmp = icmp eq i32 %id, 0
@@ -380,76 +417,9 @@ bb2:
   ret void
 }
 
-; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
-; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
-; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
-
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
-
-; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
-; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
-; GCN: s_and_saveexec_b64 vcc, vcc
-
-; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
-
-; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
-; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
-; IDXMODE: s_set_gpr_idx_off
-
-; GCN-NEXT: s_xor_b64 exec, exec, vcc
-; GCN: s_cbranch_execnz [[LOOP0]]
-
-; FIXME: Redundant copy
-; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
-
-; GCN: s_mov_b64 [[MASK]], exec
-
-; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
-; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
-; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
-; GCN: s_and_saveexec_b64 vcc, vcc
-
-; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
-
-; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
-; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
-; IDXMODE: s_set_gpr_idx_off
-
-; GCN-NEXT: s_xor_b64 exec, exec, vcc
-; GCN: s_cbranch_execnz [[LOOP1]]
-
-; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
-
-; GCN: buffer_store_dword [[INS0]]
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
-entry:
-  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %id.ext = zext i32 %id to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
-  %idx0 = load volatile i32, i32 addrspace(1)* %gep
-  %idx1 = add i32 %idx0, 1
-  %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
-  %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
-  %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
-  store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
-  %cmp = icmp eq i32 %id, 0
-  br i1 %cmp, label %bb1, label %bb2
-
-bb1:
-  store volatile i32 %live.out.val, i32 addrspace(1)* undef
-  br label %bb2
-
-bb2:
-  ret void
-}
+; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to
+; avoid very different schedule induced isses with gfx9.
+; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
 
 
 ; GCN-LABEL: {{^}}insert_adjacent_blocks:
@@ -483,10 +453,10 @@ bb7:                                              ; preds = %bb4, %bb1
 ; GCN: s_load_dword [[ARG:s[0-9]+]]
 
 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
-; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 ; MOVREL: s_waitcnt
 ; MOVREL: s_add_i32 m0, [[ARG]], -16
 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
+; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
 ; MOVREL: s_mov_b32 m0, -1
 
@@ -508,13 +478,13 @@ bb7:                                              ; preds = %bb4, %bb1
 define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
 bb:
   %tmp1 = add i32 %arg, -16
-  %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 4.000000e+00, i32 %tmp1
+  %tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1
   %tmp3 = add i32 %arg, -16
-  %tmp4 = insertelement <6 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000>, float -4.0, i32 %tmp3
-  %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
-  %tmp6 = extractelement <6 x i32> %tmp5, i32 1
-  %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
-  %tmp8 = extractelement <6 x i32> %tmp7, i32 5
+  %tmp4 = insertelement <9 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000, float 0x40371999A0000000, float 0x40381999A0000000, float 0x40391999A0000000>, float -4.0, i32 %tmp3
+  %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
+  %tmp6 = extractelement <9 x i32> %tmp5, i32 1
+  %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
+  %tmp8 = extractelement <9 x i32> %tmp7, i32 5
   store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
   store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
   ret void
@@ -522,7 +492,7 @@ bb:
 
 ; offset puts outside of superegister bounaries, so clamp to 1st element.
 ; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
-; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\].* offset:48}}
 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
 ; MOVREL: s_mov_b32 m0, [[IDX]]
 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
@@ -532,11 +502,11 @@ bb:
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
-  %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
-  %offset = add i32 %idx, 3
-  %value = extractelement <4 x i32> %ld, i32 %offset
+  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
+  %offset = add i32 %idx, 15
+  %value = extractelement <16 x i32> %ld, i32 %offset
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -544,20 +514,20 @@ entry:
 ; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
 ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
-; MOVREL: s_add_i32 m0, [[IDX]], 4
+; MOVREL: s_add_i32 m0, [[IDX]], 16
 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
 
-; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 4
+; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
-  %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
-  %offset = add i32 %idx, 4
-  %value = extractelement <4 x i32> %ld, i32 %offset
+  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
+  %offset = add i32 %idx, 16
+  %value = extractelement <16 x i32> %ld, i32 %offset
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -565,7 +535,7 @@ entry:
 ; Test that the or is folded into the base address register instead of
 ; added to m0
 
-; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
+; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
 ; GCN-NOT: [[IDX_SHL]]
@@ -576,17 +546,17 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
 entry:
-  %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
+  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
-  %value = extractelement <4 x i32> %ld, i32 %idx
+  %value = extractelement <16 x i32> %ld, i32 %idx
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
+; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
 ; GCN-NOT: [[IDX_SHL]]
@@ -597,11 +567,11 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
-  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
-  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
+  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
   ret void
 }
 
@@ -636,9 +606,9 @@ bb2:                                              ; preds = %bb4, %bb
 
 bb4:                                              ; preds = %bb2
   %vgpr = load volatile i32, i32 addrspace(1)* undef
-  %tmp5 = insertelement <8 x i32> undef, i32 undef, i32 %vgpr
-  %tmp6 = insertelement <8 x i32> %tmp5, i32 %arg1, i32 %vgpr
-  %tmp7 = extractelement <8 x i32> %tmp6, i32 0
+  %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
+  %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
+  %tmp7 = extractelement <16 x i32> %tmp6, i32 0
   br label %bb2
 
 bb8:                                              ; preds = %bb2
diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll
index 5005b781ca3af75233a2155ca2696501687cd757..75ad58df43b347a52932d403868246fb90ca997a 100644
--- a/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -32,8 +32,8 @@ loop:
 ; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]]
 
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
-; SI: [[LOOP:BB[0-9]+_[0-9]+]]:  ; %loop
 ; SI: s_and_b64 vcc, exec, -1
+; SI: [[LOOP:BB[0-9]+_[0-9]+]]:  ; %loop
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: buffer_store_dword [[REG]]
 ; SI: s_cbranch_vccnz [[LOOP]]
diff --git a/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir b/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
new file mode 100644
index 0000000000000000000000000000000000000000..9427b5cd25420fac63ce0749c6efd11166bc11e4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
@@ -0,0 +1,320 @@
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+# GCN-LABEL: name: and_execz_mov_vccz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_AND_
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_imm_vccz
+# GCN-NOT: S_AND_
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_imm_vccz
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execnz_imm_vccnz
+# GCN-NOT: S_AND_
+# GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+name:            and_execnz_imm_vccnz
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_imm_vccz_live_scc
+# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_imm_vccz_live_scc
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_live_scc
+# GCN-NOT: S_MOV_
+# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_live_scc
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_live_sreg
+# GCN:      $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_live_sreg
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_live_sreg_commute
+# GCN:      $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_live_sreg_commute
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $sgpr0_sgpr1, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_live_scc_commute
+# GCN-NOT: S_MOV_
+# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_live_scc_commute
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 killed $sgpr0_sgpr1, $exec, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_commute
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_AND_
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_commute
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 killed $sgpr0_sgpr1, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_exec_vccz
+# GCN:      $exec = S_MOV_B64 -1
+# GCN-NEXT: S_ENDPGM
+name:            and_execz_mov_exec_vccz
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $exec = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_exec_vccnz
+# GCN:      $exec = S_MOV_B64 -1
+# GCN-NEXT: S_BRANCH %bb.1{{$}}
+name:            and_execz_mov_exec_vccnz
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $exec = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_reads_sreg_early
+# GCN:      $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr1
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_reads_sreg_early
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $sgpr2 = S_MOV_B32 $sgpr1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_reads_sreg_late
+# GCN:      $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr1
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_reads_sreg_late
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, $sgpr0_sgpr1, implicit-def dead $scc
+    $sgpr2 = S_MOV_B32 $sgpr1
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+# GCN-LABEL: name: and_execz_mov_vccz_reads_writes_sreg_early
+# GCN:      $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: $sgpr1 = S_MOV_B32 $sgpr0
+# GCN-NEXT: $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+name:            and_execz_mov_vccz_reads_writes_sreg_early
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $sgpr1 = S_MOV_B32 $sgpr0
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_reads_cond
+# GCN:      $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+# GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_reads_cond
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    $sgpr2 = S_MOV_B32 $vcc_lo
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_modifies_sreg
+# GCN:      $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: $sgpr0 = S_MOV_B32 0
+# GCN-NEXT: $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+name:            and_execz_mov_vccz_modifies_sreg
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $sgpr0 = S_MOV_B32 0
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_imm_vccz_liveout_scc
+# GCN:      $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+# GCN-NEXT  S_ENDPGM implicit $scc
+name:            and_execz_imm_vccz_liveout_scc
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM implicit $scc
+...
diff --git a/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..80309b40e17335da2f5f0294b3ed1670c3d9b5d0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -0,0 +1,312 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}float4_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
+; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
+define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
+entry:
+  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
+  store <4 x float> %v, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float4_inselt_undef:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-NOT: v_cmp_
+; GCN-NOT: v_cndmask_
+; GCN:     v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
+; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
+; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
+define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
+  store <4 x float> %v, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}int4_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1, v{{[0-9]+}}, [[CC4]]
+; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
+define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
+entry:
+  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
+  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float2_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
+; GCN:     flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
+define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
+entry:
+  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
+  store <2 x float> %v, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float8_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
+; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
+; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
+define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
+entry:
+  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
+  store <8 x float> %v, <8 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float16_inselt:
+; GCN: v_movreld_b32
+define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
+entry:
+  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
+  store <16 x float> %v, <16 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half4_inselt:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
+; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00
+define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
+entry:
+  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
+  store <4 x half> %v, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half2_inselt:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
+; GCN:     v_bfi_b32 v{{[0-9]+}}, [[V]], v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
+entry:
+  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
+  store <2 x half> %v, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half8_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
+entry:
+  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
+  store <8 x half> %v, <8 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}short2_inselt:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
+; GCN:     v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}}
+define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
+entry:
+  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
+  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}short4_inselt:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
+; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
+entry:
+  %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
+  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}byte8_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
+; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
+; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
+entry:
+  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
+  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}byte16_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
+entry:
+  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
+  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double2_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
+entry:
+  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
+  store <2 x double> %v, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double8_inselt:
+; GCN-NOT: v_cndmask
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
+entry:
+  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
+  store <8 x double> %v, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}bit4_inselt:
+; GCN: buffer_store_byte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
+entry:
+  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
+  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}bit128_inselt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
+; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
+define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
+entry:
+  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
+  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 692696ff7302b8b0483f3e9c1067d5f13cf9d51a..93ee16ea85d85369e49a75a8fdf3f81794e5c890 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -83,8 +83,11 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
-; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
@@ -93,10 +96,14 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
-; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
-; GCN-DAG: buffer_store_dword v
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: buffer_store_dwordx3 v
 define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
@@ -104,8 +111,15 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
-; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC4]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
 ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
 define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
@@ -114,7 +128,11 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CCL]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
@@ -136,8 +154,11 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
-; GCN: v_movreld_b32
-; GCN: buffer_store_dwordx2
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5, v{{[0-9]+}}, [[CC1]]
+; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
   store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
@@ -145,9 +166,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
-; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5
-; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
-; GCN-DAG: buffer_store_dword v
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: buffer_store_dwordx3 v
 define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
   store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
@@ -156,8 +181,15 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
 ; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
+; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC4]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}},  v{{[0-9]+}}, [[VVAL]], [[CC3]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC1]]
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
@@ -166,7 +198,10 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
-; GCN: v_movreld_b32
+; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
@@ -229,8 +264,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
-; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
-; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]]
+; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]],  [[SHIFTED_MASK]]
 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
 ; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
 
@@ -269,8 +303,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
 ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
-; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
-; VI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[NOT_MASK]], [[VEC]]
+; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
 ; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
 ; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
 ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
@@ -288,24 +321,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
 ; GCN: s_load_dwordx4
 ; GCN: s_load_dword s
 
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-
-; GCN: buffer_store_byte
+; GCN-NOT: buffer_store_byte
+
+; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 15
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
+
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
@@ -343,23 +365,18 @@ endif:
 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}
 
-; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
-
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
 
-; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
-; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
-
-; Increment to next element folded into base register, but FileCheck
-; can't do math expressions
-
-; FIXME: Should be able to manipulate m0 directly instead of s_lshl_b32 + copy to m0
-
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
 
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
@@ -371,8 +388,12 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:
 
-; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 5
-; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
 
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
@@ -383,34 +404,41 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}},  v{{[0-9]+}}, 5, [[CC3]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}},  v{{[0-9]+}}, 0, [[CC3]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
 define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
   store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
   ret void
 }
 
-; FIXME: Should be able to do without stack access. The used stack
-; space is also 2x what should be required.
-
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
 
-; Stack store
-
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
-
-; Write element
-; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
-
-; Stack reload
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40200000
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC4]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC4]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}},  v{{[0-9]+}}, [[CONST]], [[CC3]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}},  v{{[0-9]+}}, 0, [[CC3]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
 
-; Store result
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-; GCN: ScratchSize: 64
+; GCN: ScratchSize: 0
 
 define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index d9b253328196b8cdf410b8a84265e50956e75302..b4fb59983cbbe934cb8083274cef57a991ab60ae 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -444,40 +444,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
   ret void
 }
 
-; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
-; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
-
-; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
-; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-
-; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
-; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
-
-; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
-; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
-
-; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %idx = load i32, i32 addrspace(1)* %idx.gep
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
-  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
-  ret void
-}
-
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
 ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
 
-; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
-; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
+; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e35cf7a349ac572da1bd3985d7b81259205c0718
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+
+; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
+; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
+
+; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
+
+; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
+
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %idx = load i32, i32 addrspace(1)* %idx.gep
+  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
+  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..07ee65526c9610c2badeda24bb719fb8dab72c23
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
+
+; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
+; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
+
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+
+; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
+
+; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
+
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %idx = load i32, i32 addrspace(1)* %idx.gep
+  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
+  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 11067522f8573f11303d908cc7abd17811a0f540..6b6473124bb9f468b7f6beec31d3d22a609f71a6 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,19 +1,19 @@
 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-GFX9,FUNC %s
 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s
 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s
 
 ; FUNC-LABEL: {{^}}i8_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 
-; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
-; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
+; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 
 
 ; EGCM: VTX_READ_8{{.*}} #3
@@ -25,13 +25,13 @@ define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) noun
 }
 
 ; FUNC-LABEL: {{^}}i8_zext_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
-; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
-; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
+; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 
 
 ; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
@@ -49,15 +49,15 @@ define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zero
 }
 
 ; FUNC-LABEL: {{^}}i8_sext_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
-; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
-; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
-; HSA-VI: flat_store_dword
+; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-GFX9: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
+; HSA-GFX9: global_store_dword
 
 
 ; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
@@ -75,17 +75,17 @@ define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 sign
 }
 
 ; FUNC-LABEL: {{^}}i16_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
 
 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 
-; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
-; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
-; HSA-VI: flat_store_dword
+; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
+; HSA-GFX9: global_store_dword
 
 ; EGCM: VTX_READ_16
 ; EGCM: KC0[2].Y
@@ -96,15 +96,15 @@ define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) no
 }
 
 ; FUNC-LABEL: {{^}}i16_zext_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
-; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
-; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
-; HSA-VI: flat_store_dword
+; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
+; HSA-GFX9: global_store_dword
 
 ; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
@@ -121,16 +121,16 @@ define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 ze
 }
 
 ; FUNC-LABEL: {{^}}i16_sext_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 
-; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
-; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
-; HSA-VI: flat_store_dword
+; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-GFX9: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
+; HSA-GFX9: global_store_dword
 
 ; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
@@ -147,13 +147,13 @@ define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 si
 }
 
 ; FUNC-LABEL: {{^}}i32_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
+; HSA-GFX9: s_load_dword s{{[0-9]}}, s[4:5], 0x8
 define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
 entry:
   store i32 %in, i32 addrspace(1)* %out, align 4
@@ -161,12 +161,12 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}f32_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
 define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
 entry:
   store float %in, float addrspace(1)* %out, align 4
@@ -174,8 +174,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v2i8_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; EGCM: VTX_READ_8
 ; EGCM: VTX_READ_8
@@ -189,15 +189,15 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v2i16_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; EGCM: VTX_READ_16
 ; EGCM: VTX_READ_16
 
 ; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
@@ -205,14 +205,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v2i32_arg:
-; HSA-VI: kernarg_segment_byte_size = 16
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 16
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
-; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
+; HSA-GFX9: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
 define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
@@ -220,14 +220,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v2f32_arg:
-; HSA-VI: kernarg_segment_byte_size = 16
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 16
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
 define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
 entry:
   store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
@@ -235,8 +235,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v3i8_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
@@ -253,8 +253,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v3i16_arg:
-; HSA-VI: kernarg_segment_byte_size = 16
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 16
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
@@ -271,14 +271,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v3i32_arg:
-; HSA-VI: kernarg_segment_byte_size = 32
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 32
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
-; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
 define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
 entry:
   store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
@@ -286,14 +286,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v3f32_arg:
-; HSA-VI: kernarg_segment_byte_size = 32
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 32
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
-; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
 define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
 entry:
   store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
@@ -301,8 +301,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v4i8_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM: VTX_READ_8
 ; EGCM: VTX_READ_8
 ; EGCM: VTX_READ_8
@@ -317,8 +317,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v4i16_arg:
-; HSA-VI: kernarg_segment_byte_size = 16
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 16
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM: VTX_READ_16
 ; EGCM: VTX_READ_16
 ; EGCM: VTX_READ_16
@@ -334,8 +334,8 @@ entry:
 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
 
-; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
-; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@@ -343,8 +343,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v4i32_arg:
-; HSA-VI: kernarg_segment_byte_size = 32
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 32
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
@@ -352,7 +352,7 @@ entry:
 
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
-; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
 define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
@@ -360,15 +360,15 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v4f32_arg:
-; HSA-VI: kernarg_segment_byte_size = 32
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 32
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
-; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
 define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
@@ -377,8 +377,8 @@ entry:
 
 ; FIXME: Lots of unpack and re-pack junk on VI
 ; FUNC-LABEL: {{^}}v8i8_arg:
-; HSA-VI: kernarg_segment_byte_size = 16
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 16
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM: VTX_READ_8
 ; EGCM: VTX_READ_8
 ; EGCM: VTX_READ_8
@@ -405,8 +405,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v8i16_arg:
-; HSA-VI: kernarg_segment_byte_size = 32
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 32
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM: VTX_READ_16
 ; EGCM: VTX_READ_16
 ; EGCM: VTX_READ_16
@@ -423,7 +423,7 @@ entry:
 
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
 
-; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+; HSA-GFX9: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
@@ -431,8 +431,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v8i32_arg:
-; HSA-VI: kernarg_segment_byte_size = 64
-; HSA-VI: kernarg_segment_alignment = 5
+; HSA-GFX9: kernarg_segment_byte_size = 64
+; HSA-GFX9: kernarg_segment_alignment = 5
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -444,7 +444,7 @@ entry:
 
 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
-; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
+; HSA-GFX9: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
 define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
@@ -452,8 +452,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v8f32_arg:
-; HSA-VI: kernarg_segment_byte_size = 64
-; HSA-VI: kernarg_segment_alignment = 5
+; HSA-GFX9: kernarg_segment_byte_size = 64
+; HSA-GFX9: kernarg_segment_alignment = 5
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -472,8 +472,8 @@ entry:
 ; FIXME: Pack/repack on VI
 
 ; FUNC-LABEL: {{^}}v16i8_arg:
-; HSA-VI: kernarg_segment_byte_size = 32
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 32
+; HSA-GFX9: kernarg_segment_alignment = 4
 ; EGCM: VTX_READ_8
 ; EGCM: VTX_READ_8
 ; EGCM: VTX_READ_8
@@ -508,8 +508,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v16i16_arg:
-; HSA-VI: kernarg_segment_byte_size = 64
-; HSA-VI: kernarg_segment_alignment = 5
+; HSA-GFX9: kernarg_segment_byte_size = 64
+; HSA-GFX9: kernarg_segment_alignment = 5
 ; EGCM: VTX_READ_16
 ; EGCM: VTX_READ_16
 ; EGCM: VTX_READ_16
@@ -535,7 +535,7 @@ entry:
 
 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
 
-; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-GFX9: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
@@ -543,8 +543,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v16i32_arg:
-; HSA-VI: kernarg_segment_byte_size = 128
-; HSA-VI: kernarg_segment_alignment = 6
+; HSA-GFX9: kernarg_segment_byte_size = 128
+; HSA-GFX9: kernarg_segment_alignment = 6
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -563,7 +563,7 @@ entry:
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
-; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
 define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
@@ -571,8 +571,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}v16f32_arg:
-; HSA-VI: kernarg_segment_byte_size = 128
-; HSA-VI: kernarg_segment_alignment = 6
+; HSA-GFX9: kernarg_segment_byte_size = 128
+; HSA-GFX9: kernarg_segment_alignment = 6
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -591,7 +591,7 @@ entry:
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
-; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
 define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
@@ -600,7 +600,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}kernel_arg_i64:
 ; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
-; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 
 ; MESA-GCN: buffer_store_dwordx2
 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
@@ -613,7 +613,7 @@ define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwi
 ; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
 ; MESA-GCN: buffer_store_dwordx2
 
-; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
+; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
 entry:
   store double %in, double addrspace(1)* %out
@@ -630,10 +630,10 @@ entry:
 ; }
 
 ; FUNC-LABEL: {{^}}i65_arg:
-; HSA-VI: kernarg_segment_byte_size = 24
-; HSA-VI: kernarg_segment_alignment = 4
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-GFX9: kernarg_segment_byte_size = 24
+; HSA-GFX9: kernarg_segment_alignment = 4
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
 define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
 entry:
   store i65 %in, i65 addrspace(1)* %out, align 4
@@ -641,20 +641,20 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}i1_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; GCN: s_load_dword s
 ; GCN: s_and_b32
-; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat|global}}_store_byte
 define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
   store i1 %x, i1 addrspace(1)* %out, align 1
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i1_arg_zext_i32:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; GCN: s_load_dword
 ; SGCN: buffer_store_dword
@@ -665,11 +665,11 @@ define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwi
 }
 
 ; FUNC-LABEL: {{^}}i1_arg_zext_i64:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; GCN: s_load_dword s
-; GCN: {{buffer|flat}}_store_dwordx2
+; GCN: {{buffer|flat|global}}_store_dwordx2
 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
@@ -677,11 +677,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 }
 
 ; FUNC-LABEL: {{^}}i1_arg_sext_i32:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; GCN: s_load_dword
-; GCN: {{buffer|flat}}_store_dword
+; GCN: {{buffer|flat|global}}_store_dword
 define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i32
   store i32 %ext, i32addrspace(1)* %out, align 4
@@ -689,12 +689,12 @@ define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwi
 }
 
 ; FUNC-LABEL: {{^}}i1_arg_sext_i64:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
+; HSA-GFX9: kernarg_segment_byte_size = 12
+; HSA-GFX9: kernarg_segment_alignment = 4
 
 ; GCN: s_load_dword
 ; GCN: s_bfe_i64
-; GCN: {{buffer|flat}}_store_dwordx2
+; GCN: {{buffer|flat|global}}_store_dwordx2
 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
@@ -702,7 +702,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 }
 
 ; FUNC-LABEL: {{^}}empty_struct_arg:
-; HSA-VI: kernarg_segment_byte_size = 0
+; HSA-GFX9: kernarg_segment_byte_size = 0
 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
   ret void
 }
@@ -718,11 +718,11 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
 
 ; FIXME: Total argument size is computed wrong
 ; FUNC-LABEL: {{^}}struct_argument_alignment:
-; HSA-VI: kernarg_segment_byte_size = 40
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-GFX9: kernarg_segment_byte_size = 40
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
   %val0 = extractvalue {i32, i64} %arg0, 0
   %val1 = extractvalue {i32, i64} %arg0, 1
@@ -738,11 +738,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; No padding between i8 and next struct, but round up at end to 4 byte
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
-; HSA-VI: kernarg_segment_byte_size = 28
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+; HSA-GFX9: kernarg_segment_byte_size = 28
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -756,12 +756,12 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
 }
 
 ; GCN-LABEL: {{^}}struct_argument_alignment_after:
-; HSA-VI: kernarg_segment_byte_size = 64
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
-; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
+; HSA-GFX9: kernarg_segment_byte_size = 64
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-GFX9: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
 define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
   %val0 = extractvalue {i32, i64} %arg0, 0
   %val1 = extractvalue {i32, i64} %arg0, 1
@@ -776,10 +776,10 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
 }
 
 ; GCN-LABEL: {{^}}array_3xi32:
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
   store volatile i16 %arg0, i16 addrspace(1)* undef
   store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
@@ -788,13 +788,19 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
 
 ; FIXME: Why not all scalar loads?
 ; GCN-LABEL: {{^}}array_3xi16:
-; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
-; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
-; HSA-VI: flat_load_ushort
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:2
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:6
 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
   store volatile i8 %arg0, i8 addrspace(1)* undef
   store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
   ret void
 }
+
+; GCN-LABEL: {{^}}small_array_round_down_offset:
+; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:1
+define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
+  %val = extractvalue [1 x i8] %arg, 0
+  store volatile i8 %val, i8 addrspace(1)* undef
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index a1bb6c28e7402bed39aa12da3631becacaf2cd3b..b7344cfb33c63c975e3b9fc49b3020d446217a37 100644
--- a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
 
 ; Repeat of some problematic tests in kernel-args.ll, with the IR
 ; argument lowering pass disabled. Struct padding needs to be
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index d8cf52341e39fbbb3b692a7ab96fbbd42dac1f49..0343052601f430d1637dabba3e58e722f38ac473 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
 
 ; FIXME: align on alloca seems to be ignored for private_segment_alignment
 
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
deleted file mode 100644
index 6374c9e02edde48eea02d7b7cc421cad2c5b3242..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}kill_gs_const:
-; SI-NOT: v_cmpx_le_f32
-; SI: s_mov_b64 exec, 0
-define amdgpu_gs void @kill_gs_const() {
-main_body:
-  %tmp = icmp ule i32 0, 3
-  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %tmp1)
-  %tmp2 = icmp ule i32 3, 0
-  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %tmp3)
-  ret void
-}
-
-; SI-LABEL: {{^}}kill_vcc_implicit_def:
-; SI-NOT: v_cmp_gt_f32_e32 vcc,
-; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
-; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(4)* byval %arg, [17 x <16 x i8>] addrspace(4)* byval %arg1, [17 x <4 x i32>] addrspace(4)* byval %arg2, [34 x <8 x i32>] addrspace(4)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
-entry:
-  %tmp0 = fcmp olt float %arg13, 0.000000e+00
-  call void @llvm.AMDGPU.kill(float %arg14)
-  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
-  ret void
-}
-
-declare void @llvm.AMDGPU.kill(float) #0
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
deleted file mode 100644
index 1d370aba6da3496929725f665470c1aad7cb6ee3..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
-
-; Example of a simple geometry shader loading vertex attributes from the
-; ESGS ring buffer
-
-; FIXME: Out of bounds immediate offset crashes
-
-; CHECK-LABEL: {{^}}main:
-; CHECK: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc
-; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc
-
-define amdgpu_vs void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <32 x i8>] addrspace(4)* byval %arg2, [2 x <4 x i32>] addrspace(4)* byval %arg3, [17 x <4 x i32>] addrspace(4)* inreg %arg4, [17 x <4 x i32>] addrspace(4)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) {
-main_body:
-  %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(4)* %arg3, i64 0, i32 1
-  %tmp10 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
-  %tmp11 = shl i32 %arg6, 2
-  %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
-  %tmp13 = bitcast i32 %tmp12 to float
-  %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0)
-  %tmp15 = bitcast i32 %tmp14 to float
-  %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0)
-  %tmp17 = bitcast i32 %tmp16 to float
-  %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0)
-  %tmp19 = bitcast i32 %tmp18 to float
-
-  %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0)
-  %tmp21 = bitcast i32 %tmp20 to float
-
-  %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0)
-  %tmp23 = bitcast i32 %tmp22 to float
-
-  call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false)
-  call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp21, float %tmp23, float %tmp23, float %tmp23, i1 true, i1 false)
-  ret void
-}
-
-; Function Attrs: nounwind readonly
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-; Function Attrs: nounwind readonly
-declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
-
-attributes #0 = { nounwind readonly }
-attributes #1 = { nounwind inaccessiblememonly }
-
-!0 = !{!"const", !1, i32 1}
-!1 = !{!"tbaa root"}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
deleted file mode 100644
index 01b76422c03f83772c771e73149c9cebea9d988b..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
+++ /dev/null
@@ -1,75 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}test1:
-;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc
-define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
-    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-    call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
-        i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
-        i32 1, i32 0)
-    ret void
-}
-
-;CHECK-LABEL: {{^}}test1_idx:
-;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc
-define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) {
-    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-    call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
-        i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1,
-        i32 1, i32 0)
-    ret void
-}
-
-;CHECK-LABEL: {{^}}test1_scalar_offset:
-;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc
-define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) {
-    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-    call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
-        i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1,
-        i32 1, i32 0)
-    ret void
-}
-
-;CHECK-LABEL: {{^}}test1_no_glc_slc:
-;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32
-define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) {
-    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-    call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
-        i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0,
-        i32 0, i32 0)
-    ret void
-}
-
-;CHECK-LABEL: {{^}}test2:
-;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc
-define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
-    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-    call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
-        i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
-        i32 1, i32 0)
-    ret void
-}
-
-;CHECK-LABEL: {{^}}test3:
-;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc
-define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
-    %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
-    call void @llvm.SI.tbuffer.store.v2i32(<4 x i32> undef, <2 x i32> %vdata,
-        i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
-        i32 1, i32 0)
-    ret void
-}
-
-;CHECK-LABEL: {{^}}test4:
-;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc
-define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) {
-    call void @llvm.SI.tbuffer.store.i32(<4 x i32> undef, i32 %vdata,
-        i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
-        i32 1, i32 0)
-    ret void
-}
-
-declare void @llvm.SI.tbuffer.store.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-declare void @llvm.SI.tbuffer.store.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-declare void @llvm.SI.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index 49ca7d405724b6693c2235b59199c0b1010e1ff6..ba9f206f1512ca0110c0bce7a746c65ca526933e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -193,6 +193,22 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+  %a1 = add i32 %a, 4
+  %a2 = add i32 %a, 12
+  %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+  %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+  %r1 = extractelement <2 x float> %vr1, i32 0
+  %r2 = extractelement <2 x float> %vr1, i32 1
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true)
+  ret void
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged:
 ;CHECK-NEXT: %bb.
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
@@ -227,6 +243,20 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
+main_body:
+  %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
+  %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
+  %r1 = extractelement <2 x float> %vr1, i32 0
+  %r2 = extractelement <2 x float> %vr1, i32 1
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true)
+  ret void
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
index 48a30c7f57511556be18ba7223f3fdef3f7b3c87..7e2996ea92f419266d4f388c6db8ab8a44eb1584 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -147,6 +147,41 @@ define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
+define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) {
+  %a1 = add i32 %a, 28
+  %a2 = add i32 %a, 32
+  %a3 = add i32 %a, 36
+  call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) {
+  %a1 = add i32 %a, 4
+  %a2 = add i32 %a, 12
+  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) {
+  %a1 = add i32 %a, 4
+  %a2 = add i32 %a, 8
+  call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+  ret void
+}
+
 ;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged:
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
@@ -164,12 +199,40 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, floa
 ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) {
   call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) {
+  call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) {
+  call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8
+define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) {
+  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index b6f9f951d9bda2931737f88dbd3519e2bdaa9fd4..a2f2ced72e7adc97fecc0e9bee73612dc95ab63e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 5853d8d8e4e1d296e2b5e959b6ebbf2e6eb2d49c..ee039a392e2644658b3914b08a9b99f87a6d1475 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s
 ; RUN: llc -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
index f8c60451ac73656215edb696f9200b75d027e9f4..6866d9537b3e55018df4c1edb49a491b9efdc00d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eae81fee4393690e44d99399d74c3cc26d041ed9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -0,0 +1,114 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}s_buffer_load_imm:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4
+define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
+main_body:
+  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
+  %bitcast = bitcast i32 %load to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_load_index:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) {
+main_body:
+  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
+  %bitcast = bitcast i32 %load to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_loadx2_imm:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
+define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
+main_body:
+  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0)
+  %bitcast = bitcast <2 x i32> %load to <2 x float>
+  %x = extractelement <2 x float> %bitcast, i32 0
+  %y = extractelement <2 x float> %bitcast, i32 1
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_loadx2_index:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) {
+main_body:
+  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
+  %bitcast = bitcast <2 x i32> %load to <2 x float>
+  %x = extractelement <2 x float> %bitcast, i32 0
+  %y = extractelement <2 x float> %bitcast, i32 1
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_loadx4_imm:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8
+define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
+main_body:
+  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0)
+  %bitcast = bitcast <4 x i32> %load to <4 x float>
+  %x = extractelement <4 x float> %bitcast, i32 0
+  %y = extractelement <4 x float> %bitcast, i32 1
+  %z = extractelement <4 x float> %bitcast, i32 2
+  %w = extractelement <4 x float> %bitcast, i32 3
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_loadx4_index:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) {
+main_body:
+  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
+  %bitcast = bitcast <4 x i32> %load to <4 x float>
+  %x = extractelement <4 x float> %bitcast, i32 0
+  %y = extractelement <4 x float> %bitcast, i32 1
+  %z = extractelement <4 x float> %bitcast, i32 2
+  %w = extractelement <4 x float> %bitcast, i32 3
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4
+define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
+main_body:
+  %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
+  %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
+  %x = bitcast i32 %load0 to float
+  %y = bitcast i32 %load1 to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8
+define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
+main_body:
+  %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
+  %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0)
+  %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0)
+  %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0)
+  %x = bitcast i32 %load0 to float
+  %y = bitcast i32 %load1 to float
+  %z = bitcast i32 %load2 to float
+  %w = bitcast i32 %load3 to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
+declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index 9fe0c0f8fc1fda1ad9efe9641fd245aced9433d8..18abf607aea5a2ddef0df877bd05145a0a1a69dd 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s
 
 ; VI-LABEL: {{^}}dpp_test:
 ; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
index 349e7f0f0e8da3c9fbb82cc39277bc7632f5ea00..377785e0ca2d14d4c9c709afde3aa39f1b27f5ef 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=UNKNOWN-OS -check-prefix=SI-MESA %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=UNKNOWN-OS -check-prefix=VI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 declare i32 @llvm.amdgcn.workgroup.id.y() #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 8b80998cab6fe90f1bae91f2ba69864072555db0..13e204b03a039dd3b9def4ab44e4ce98ecf3f596 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
index 1946e6a3618673d69a4373e6264f80d8f57ffc30..57af85b67bf3be099740af8de09b67a6d19a9a15 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
@@ -33,20 +33,17 @@ main_body:
 ;CHECK-LABEL: {{^}}kill:
 ;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1
 ;CHECK: s_wqm_b64 [[WQM:[^,]+]], [[CMP]]
-;FIXME: This could just be: s_and_b64 exec, exec, [[WQM]]
-;CHECK: v_cndmask_b32_e64 [[KILL:[^,]+]], -1.0, 1.0, [[WQM]]
-;CHECK: v_cmpx_le_f32_e32 {{[^,]+}}, 0, [[KILL]]
+;CHECK: s_and_b64 exec, exec, [[WQM]]
 ;CHECK: s_endpgm
 define amdgpu_ps void @kill(i32 %v0, i32 %v1) #1 {
 main_body:
   %c = icmp eq i32 %v0, %v1
   %w = call i1 @llvm.amdgcn.wqm.vote(i1 %c)
-  %r = select i1 %w, float 1.0, float -1.0
-  call void @llvm.AMDGPU.kill(float %r)
+  call void @llvm.amdgcn.kill(i1 %w)
   ret void
 }
 
-declare void @llvm.AMDGPU.kill(float) #1
+declare void @llvm.amdgcn.kill(i1) #1
 declare i1 @llvm.amdgcn.wqm.vote(i1)
 
 attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-lo16.ll b/test/CodeGen/AMDGPU/load-lo16.ll
index eec9144cda285f49631e24f46088437d87a31aba..b7512f24c207743f6142a0b389954264fa282f63 100644
--- a/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/test/CodeGen/AMDGPU/load-lo16.ll
@@ -305,8 +305,11 @@ entry:
 ; GFX9-NEXT: s_waitcnt
 ; GFX9-NEXT: s_setpc_b64
 
-; VI: flat_load_ubyte v{{[0-9]+}}
-; VI: v_or_b32_e32
+; VI: flat_load_ubyte [[LO:v[0-9]+]]
+; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2
+; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00
+; VI: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]]
+; VI: flat_store_dword v[0:1], [[RES]]
 define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll
index 87c18a7fc449fbf78100e07a99d9b709072f0a79..f0dca0780aaef82c1da1ca87aeb4603f098f564f 100644
--- a/test/CodeGen/AMDGPU/local-64.ll
+++ b/test/CodeGen/AMDGPU/local-64.ll
@@ -48,7 +48,7 @@ define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i
 
 ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
 ; SI, which is why it is being OR'd with the base pointer.
-; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; SI-DAG: s_bitset1_b32 [[ADDR:s[0-9]+]], 16
 ; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll
index fc7f9f7615ad443a634782bb133e0eae202d7e75..af64f2b1d578ee16eec77410793042a3c6e58a74 100644
--- a/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -364,7 +364,7 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) noun
 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
 ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
 ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
-; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
 ; GCN: s_endpgm
 define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 3528eaff8187d6336e467e2b8e4b6df32409a9ff..df1ef1cba71c8e5c21d33ae1aba4a4c7b5289f66 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare float @llvm.fabs.f32(float) nounwind readnone
@@ -8,8 +8,10 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; GCN-LABEL: {{^}}madak_f32:
 ; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
 ; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; GCN:    v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -88,8 +90,10 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %o
 ; GCN-LABEL: {{^}}madak_inline_imm_f32:
 ; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
 ; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; GCN:    v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll b/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f692c763c0b94e95bf02779c6aaaaa5676b58d86
--- /dev/null
+++ b/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll
@@ -0,0 +1,222 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx800 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
+
+; FUNC-LABEL: {{^}}system_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_acquire() {
+entry:
+  fence acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_release() {
+entry:
+  fence release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_acq_rel() {
+entry:
+  fence acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_seq_cst() {
+entry:
+  fence seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_acquire() {
+entry:
+  fence syncscope("singlethread") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_release() {
+entry:
+  fence syncscope("singlethread") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_acq_rel() {
+entry:
+  fence syncscope("singlethread") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_seq_cst() {
+entry:
+  fence syncscope("singlethread") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_acquire() {
+entry:
+  fence syncscope("agent") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_release() {
+entry:
+  fence syncscope("agent") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_acq_rel() {
+entry:
+  fence syncscope("agent") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_seq_cst() {
+entry:
+  fence syncscope("agent") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_acquire() {
+entry:
+  fence syncscope("workgroup") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_release:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_release() {
+entry:
+  fence syncscope("workgroup") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_acq_rel() {
+entry:
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_seq_cst() {
+entry:
+  fence syncscope("workgroup") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_acquire() {
+entry:
+  fence syncscope("wavefront") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_release() {
+entry:
+  fence syncscope("wavefront") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_acq_rel() {
+entry:
+  fence syncscope("wavefront") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_seq_cst() {
+entry:
+  fence syncscope("wavefront") seq_cst
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/test/CodeGen/AMDGPU/memory-legalizer-load.ll
index 76ab6259b4253ea4a952b0ee132d2c6841db2114..179cb3f625d9853c69ea48e6ead5e2f8cedd47a0 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-load.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-load.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -319,7 +319,7 @@ entry:
 
 ; GCN-LABEL: {{^}}nontemporal_global_1:
 ; GFX8:  flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
-; GFX9:  global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
+; GFX9:  global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
 define amdgpu_kernel void @nontemporal_global_1(
     i32 addrspace(1)* %in, i32* %out) {
 entry:
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll b/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll
new file mode 100644
index 0000000000000000000000000000000000000000..609dc0400c490034e20fe58c51a86dafa8631e05
--- /dev/null
+++ b/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll
@@ -0,0 +1,222 @@
+; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
+; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
+; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx800 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
+; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
+
+; FUNC-LABEL: {{^}}system_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_acquire() {
+entry:
+  fence acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_release() {
+entry:
+  fence release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_acq_rel() {
+entry:
+  fence acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_seq_cst() {
+entry:
+  fence seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_acquire() {
+entry:
+  fence syncscope("singlethread") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_release() {
+entry:
+  fence syncscope("singlethread") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_acq_rel() {
+entry:
+  fence syncscope("singlethread") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_seq_cst() {
+entry:
+  fence syncscope("singlethread") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_acquire() {
+entry:
+  fence syncscope("agent") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_release() {
+entry:
+  fence syncscope("agent") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_acq_rel() {
+entry:
+  fence syncscope("agent") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_seq_cst() {
+entry:
+  fence syncscope("agent") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_acquire() {
+entry:
+  fence syncscope("workgroup") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_release:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_release() {
+entry:
+  fence syncscope("workgroup") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_acq_rel() {
+entry:
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_seq_cst() {
+entry:
+  fence syncscope("workgroup") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_acquire() {
+entry:
+  fence syncscope("wavefront") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_release() {
+entry:
+  fence syncscope("wavefront") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_acq_rel() {
+entry:
+  fence syncscope("wavefront") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_seq_cst() {
+entry:
+  fence syncscope("wavefront") seq_cst
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/test/CodeGen/AMDGPU/memory-legalizer-store.ll
index 7bec74d36c81e215b1e2a1c33c630a3cb01583eb..87c43949df6a7559e281980dec3167906e83544c 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-store.ll
+++ b/test/CodeGen/AMDGPU/memory-legalizer-store.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -240,7 +240,7 @@ entry:
 
 ; GCN-LABEL: {{^}}nontemporal_global_1:
 ; GFX8:  flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
-; GFX9:  global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
+; GFX9:  global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
 define amdgpu_kernel void @nontemporal_global_1(
     i32* %in, i32 addrspace(1)* %out) {
 entry:
diff --git a/test/CodeGen/AMDGPU/memory_clause.ll b/test/CodeGen/AMDGPU/memory_clause.ll
index f4a66f83fd94b6af0a977c4ce3a566ef09880d37..9ae068a4340d557ed45ff2c1dcd7e7092866d6f9 100644
--- a/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/test/CodeGen/AMDGPU/memory_clause.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}vector_clause:
 ; GCN:      global_load_dwordx4
@@ -105,7 +105,7 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}vector_clause_indirect:
-; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], off
+; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}]
 ; GCN-NEXT: s_nop 0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_nop 0
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
index 2b2f67cd1b3d59cb1829fe515eda1cfbbde27aa0..7d0c0db22b7e3750630fe230265dcd7619bf1111 100644
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -164,8 +164,8 @@ define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float ad
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dword
+; SI-DAG: buffer_store_dwordx3
+; SI-NOT: buffer_store_dwordx2
 ; SI-NOT: buffer_store_dword
 ; GCN: s_endpgm
 define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
@@ -274,11 +274,9 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
-; SI-DAG: buffer_load_dwordx2
-; SI-DAG: buffer_load_dword v
+; SI-DAG: buffer_load_dwordx3
 ; GCN: s_waitcnt
-; SI-DAG: buffer_store_dword v
-; SI-DAG: buffer_store_dwordx2 v
+; SI-DAG: buffer_store_dwordx3 v
 ; GCN: s_endpgm
 define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -563,8 +561,7 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)*
 
 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
 ; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx2
-; GCN: buffer_store_dword v
+; GCN: buffer_store_dwordx3
 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
@@ -611,13 +608,11 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)*
 
 ; GCN-LABEL: {{^}}copy_v3i32_align4:
 ; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-NOT: offen
 ; GCN: s_waitcnt vmcnt
 ; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
 ; GCN: ScratchSize: 0{{$}}
 define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
@@ -644,13 +639,11 @@ define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %ou
 
 ; GCN-LABEL: {{^}}copy_v3f32_align4:
 ; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-NOT: offen
 ; GCN: s_waitcnt vmcnt
 ; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: ScratchSize: 0{{$}}
 define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/mode-register.mir b/test/CodeGen/AMDGPU/mode-register.mir
new file mode 100644
index 0000000000000000000000000000000000000000..bf06a01d4c3c5685507bff8ca14c457a0867ea55
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mode-register.mir
@@ -0,0 +1,459 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-mode-register  %s -o - | FileCheck %s
+
+---
+# check that the mode is changed to rtz from default rtn for interp f16
+# CHECK-LABEL: name: interp_f16_default
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_INTERP_P1LL_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_default
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is not changed for interp f16 when the mode is already RTZ
+# CHECK-LABEL: name: interp_f16_explicit_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_MOV_B32_e32
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_explicit_rtz
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that explicit RTN mode change is registered
+# CHECK-LABEL: name: explicit_rtn
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_INTERP_P1LL_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: explicit_rtn
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    S_SETREG_IMM32_B32 0, 2177
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is unchanged from RTN for F64 instruction
+# CHECK-LABEL: name: rtn_default
+# CHECK-LABEL: bb.0:
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK: V_FRACT_F64
+
+name: rtn_default
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for F64 instruction
+# CHECK-LABEL: name: rtn_from_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_FRACT_F64
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: rtn_from_rtz
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
+---
+# CHECK-LABEL: name: rtz_from_rtn
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: rtz_from_rtn
+
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for F64 instruction
+# and back again for remaining interp instruction
+# CHECK-LABEL: name: interp_f16_plus_sqrt_f64
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+
+name: interp_f16_plus_sqrt_f64
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that an explicit change to the single precision mode has no effect
+# CHECK-LABEL: name: single_precision_mode_change
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+
+name: single_precision_mode_change
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_SETREG_IMM32_B32 2, 2049
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that mode is propagated back to start of loop - first instruction is RTN but needs
+# setreg as RTZ is set in loop
+# CHECK-LABEL: name: loop
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM
+...
+---
+# two back-edges to same node with different modes
+# CHECK-LABEL: name: double_loop
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+
+name: double_loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_NOP 1
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_NOP 1
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+    S_NOP 1
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.1, %bb.6
+    S_SETREG_IMM32_B32 3, 2177
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.6
+
+  bb.6:
+    S_ENDPGM
+...
+---
+# check that mode is propagated back to start of loop and through a block that
+# neither sets or uses the mode.
+# CHECK-LABEL: name: loop_indirect
+# CHECK_NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop_indirect
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_NOP 1
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    S_NOP 1
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.1, %bb.4
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated to a block that uses the mode
+# CHECK-LABEL: name: multiple_mode_direct
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_direct
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated through a block that neither
+# sets or uses the mode.
+# CHECK-LABEL: name: multiple_mode_indirect
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_indirect
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_NOP 1
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM
+...
+---
+# CHECK-LABEL: name: pass_through_blocks
+# CHECK-LABEL: bb.0:
+# CHECK: V_FRACT_F64_e32
+# CHECK-NEXT: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: pass_through_blocks
+
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_BRANCH %bb.4
+
+  bb.4:
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated
+# CHECK-LABEL: name: if_then_else
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: if_then_else
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.3, implicit $vcc
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/movreld-bug.ll b/test/CodeGen/AMDGPU/movreld-bug.ll
index 0cc8a5271bd7383be5c7f15666f11b58365dc102..d07cb5f3fa63ca086e0304491d6c47ce7a38fa02 100644
--- a/test/CodeGen/AMDGPU/movreld-bug.ll
+++ b/test/CodeGen/AMDGPU/movreld-bug.ll
@@ -7,8 +7,8 @@
 ; GCN: ; return
 define amdgpu_ps float @main(i32 inreg %arg) #0 {
 main_body:
-  %tmp24 = insertelement <2 x float> undef, float 0.000000e+00, i32 %arg
-  %tmp25 = extractelement <2 x float> %tmp24, i32 1
+  %tmp24 = insertelement <16 x float> undef, float 0.000000e+00, i32 %arg
+  %tmp25 = extractelement <16 x float> %tmp24, i32 1
   ret float %tmp25
 }
 
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index d5c22d22cca5ca27e0cab452bf3c6145d3c38b02..9cc48b41d6d7eaa5df9913592ae9527a48184b5a 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -60,7 +60,7 @@ main_body:
   %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
   %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
   %tmp2 = shl i32 %6, 2
-  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1)
   %tmp4 = add i32 %6, 16
   %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32>
   call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
@@ -79,7 +79,7 @@ main_body:
   %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
   %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
   %tmp2 = shl i32 %6, 2
-  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1)
   %tmp4 = add i32 %6, 16
   %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32>
   call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
@@ -176,7 +176,7 @@ define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
   ret void
 }
 
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
 
 attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 679fd7c987038ba7060e19c0ec5811923ad245d9..7988a246a0f792b8d7754c4e6b9fb45cac4b6355 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -60,11 +60,11 @@
 
 ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
 
-; GCN:      s_mov_b64           [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
-; GCN:      v_cmp_lt_i32_e32    vcc, 1,
-; GCN:      s_mov_b64           [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
-; GCN:      s_and_saveexec_b64
-; GCN:      s_xor_b64
+; GCN-DAG:  s_mov_b64           [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
+; GCN-DAG:  v_cmp_lt_i32_e32    vcc, 1,
+; GCN-DAG:  s_mov_b64           [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
+; GCN-DAG:  s_and_saveexec_b64
+; GCN-DAG:  s_xor_b64
 
 ; GCN: ; %LeafBlock1
 ; GCN-NEXT: s_mov_b64           [[EXIT0]], exec
@@ -92,8 +92,8 @@
 ; GCN-NEXT: s_xor_b64
 
 ; GCN: ; %exit1
-; GCN:      ds_write_b32
-; GCN:      s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
+; GCN-DAG:  ds_write_b32
+; GCN-DAG:  s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
 
 ; GCN: ; %Flow5
 ; GCN-NEXT: s_or_b64            exec, exec,
diff --git a/test/CodeGen/AMDGPU/nand.ll b/test/CodeGen/AMDGPU/nand.ll
new file mode 100644
index 0000000000000000000000000000000000000000..be7d9f677ec022691f8d654d936160e297ffe4bb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/nand.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s
+
+; GCN-LABEL: {{^}}scalar_nand_i32_one_use
+; GCN: s_nand_b32
+define amdgpu_kernel void @scalar_nand_i32_one_use(
+    i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+  %and = and i32 %a, %b
+  %r0.val = xor i32 %and, -1
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i32_mul_use
+; GCN-NOT: s_nand_b32
+; GCN: s_and_b32
+; GCN: s_not_b32
+; GCN: s_add_i32
+define amdgpu_kernel void @scalar_nand_i32_mul_use(
+    i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) {
+entry:
+  %and = and i32 %a, %b
+  %r0.val = xor i32 %and, -1
+  %r1.val = add i32 %and, %a
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  store i32 %r1.val, i32 addrspace(1)* %r1
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i64_one_use
+; GCN: s_nand_b64
+define amdgpu_kernel void @scalar_nand_i64_one_use(
+    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+  %and = and i64 %a, %b
+  %r0.val = xor i64 %and, -1
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i64_mul_use
+; GCN-NOT: s_nand_b64
+; GCN: s_and_b64
+; GCN: s_not_b64
+; GCN: s_add_u32
+; GCN: s_addc_u32
+define amdgpu_kernel void @scalar_nand_i64_mul_use(
+    i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) {
+entry:
+  %and = and i64 %a, %b
+  %r0.val = xor i64 %and, -1
+  %r1.val = add i64 %and, %a
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r1.val, i64 addrspace(1)* %r1
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_nand_i32_one_use
+; GCN-NOT: s_nand_b32
+; GCN: v_and_b32
+; GCN: v_not_b32
+define i32 @vector_nand_i32_one_use(i32 %a, i32 %b) {
+entry:
+  %and = and i32 %a, %b
+  %r = xor i32 %and, -1
+  ret i32 %r
+}
+
+; GCN-LABEL: {{^}}vector_nand_i64_one_use
+; GCN-NOT: s_nand_b64
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_not_b32
+; GCN: v_not_b32
+define i64 @vector_nand_i64_one_use(i64 %a, i64 %b) {
+entry:
+  %and = and i64 %a, %b
+  %r = xor i64 %and, -1
+  ret i64 %r
+}
diff --git a/test/CodeGen/AMDGPU/nop-data.ll b/test/CodeGen/AMDGPU/nop-data.ll
index 790e31c781aea2458b9aa5fd93ee69e16766c395..4e836a398ee9db5e821c39e9b277a6a8a4976e3d 100644
--- a/test/CodeGen/AMDGPU/nop-data.ll
+++ b/test/CodeGen/AMDGPU/nop-data.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s
 
 ; CHECK: kernel0:
 ; CHECK-NEXT: s_endpgm
diff --git a/test/CodeGen/AMDGPU/nor.ll b/test/CodeGen/AMDGPU/nor.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8fddd39cad3ce876d800141e1c77230660c550e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/nor.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s
+
+; GCN-LABEL: {{^}}scalar_nor_i32_one_use
+; GCN: s_nor_b32
+define amdgpu_kernel void @scalar_nor_i32_one_use(
+    i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+  %or = or i32 %a, %b
+  %r0.val = xor i32 %or, -1
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nor_i32_mul_use
+; GCN-NOT: s_nor_b32
+; GCN: s_or_b32
+; GCN: s_not_b32
+; GCN: s_add_i32
+define amdgpu_kernel void @scalar_nor_i32_mul_use(
+    i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) {
+entry:
+  %or = or i32 %a, %b
+  %r0.val = xor i32 %or, -1
+  %r1.val = add i32 %or, %a
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  store i32 %r1.val, i32 addrspace(1)* %r1
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nor_i64_one_use
+; GCN: s_nor_b64
+define amdgpu_kernel void @scalar_nor_i64_one_use(
+    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+  %or = or i64 %a, %b
+  %r0.val = xor i64 %or, -1
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nor_i64_mul_use
+; GCN-NOT: s_nor_b64
+; GCN: s_or_b64
+; GCN: s_not_b64
+; GCN: s_add_u32
+; GCN: s_addc_u32
+define amdgpu_kernel void @scalar_nor_i64_mul_use(
+    i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) {
+entry:
+  %or = or i64 %a, %b
+  %r0.val = xor i64 %or, -1
+  %r1.val = add i64 %or, %a
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r1.val, i64 addrspace(1)* %r1
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_nor_i32_one_use
+; GCN-NOT: s_nor_b32
+; GCN: v_or_b32
+; GCN: v_not_b32
+define i32 @vector_nor_i32_one_use(i32 %a, i32 %b) {
+entry:
+  %or = or i32 %a, %b
+  %r = xor i32 %or, -1
+  ret i32 %r
+}
+
+; GCN-LABEL: {{^}}vector_nor_i64_one_use
+; GCN-NOT: s_nor_b64
+; GCN: v_or_b32
+; GCN: v_or_b32
+; GCN: v_not_b32
+; GCN: v_not_b32
+define i64 @vector_nor_i64_one_use(i64 %a, i64 %b) {
+entry:
+  %or = or i64 %a, %b
+  %r = xor i64 %or, -1
+  ret i64 %r
+}
diff --git a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
index 576c1ce9874cb312769975f878a3315ab05b204f..f8a7b229c61b4446ed5e4c8e6b608069c5d67c81 100644
--- a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
+++ b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
@@ -1,29 +1,21 @@
 # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-optimize-exec-masking -o -  %s | FileCheck %s
 
 --- |
-  define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) {
   main_body:
-    %id = call i32 @llvm.amdgcn.workitem.id.x()
-    %cc = icmp eq i32 %id, 0
-    %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %cc)
-    %1 = extractvalue { i1, i64 } %0, 0
-    %2 = extractvalue { i1, i64 } %0, 1
-    br i1 %1, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:                                               ; preds = %main_body
     %v.if = load volatile i32, i32 addrspace(1)* undef
     br label %end
 
   end:                                              ; preds = %if, %main_body
-    %r = phi i32 [ 4, %main_body ], [ %v.if, %if ]
-    call void @llvm.amdgcn.end.cf(i64 %2)
-    store i32 %r, i32 addrspace(1)* undef
     ret void
   }
 
-  define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v) {
   main_body:
-      br i1 undef, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:
     br label %end
@@ -32,9 +24,9 @@
     ret void
   }
 
-  define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v) {
   main_body:
-      br i1 undef, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:
     br label %end
@@ -43,31 +35,20 @@
     ret void
   }
 
-
-  define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) {
   main_body:
-    %id = call i32 @llvm.amdgcn.workitem.id.x()
-    %cc = icmp eq i32 %id, 0
-    %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %cc)
-    %1 = extractvalue { i1, i64 } %0, 0
-    %2 = extractvalue { i1, i64 } %0, 1
-    store i32 %id, i32 addrspace(1)* undef
-    br i1 %1, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:                                               ; preds = %main_body
-    %v.if = load volatile i32, i32 addrspace(1)* undef
     br label %end
 
   end:                                              ; preds = %if, %main_body
-    %r = phi i32 [ 4, %main_body ], [ %v.if, %if ]
-    call void @llvm.amdgcn.end.cf(i64 %2)
-    store i32 %r, i32 addrspace(1)* undef
     ret void
   }
 
-  define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) {
   main_body:
-      br i1 undef, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:
     br label %end
@@ -76,9 +57,9 @@
     ret void
   }
 
-  define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) {
   main_body:
-      br i1 undef, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:
     br label %end
@@ -87,9 +68,9 @@
     ret void
   }
 
-  define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) {
   main_body:
-      br i1 undef, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:
     br label %end
@@ -98,9 +79,9 @@
     ret void
   }
 
-  define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v) {
   main_body:
-      br i1 undef, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:
     br label %end
@@ -109,9 +90,9 @@
     ret void
   }
 
-  define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v) {
   main_body:
-      br i1 undef, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:
     br label %end
@@ -120,9 +101,9 @@
     ret void
   }
 
-  define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) {
   main_body:
-      br i1 undef, label %if, label %end
+    br i1 undef, label %if, label %end
 
   if:
     br label %end
@@ -131,16 +112,16 @@
     ret void
   }
 
-  ; Function Attrs: nounwind readnone
-  declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-  declare { i1, i64 } @llvm.amdgcn.if(i1)
-
-  declare void @llvm.amdgcn.end.cf(i64)
+  define amdgpu_kernel void @if_and_xor_read_exec_copy_subreg() {
+  main_body:
+    br i1 undef, label %if, label %end
 
+  if:                                               ; preds = %main_body
+    br label %end
 
-  attributes #0 = { nounwind }
-  attributes #1 = { nounwind readnone }
+  end:                                              ; preds = %if, %main_body
+    ret void
+  }
 
 ...
 ---
@@ -150,28 +131,8 @@
 # CHECK-NEXT: SI_MASK_BRANCH
 
 name:            optimize_if_and_saveexec_xor
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -190,7 +151,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -198,7 +159,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
@@ -208,28 +169,8 @@ body:             |
 # CHECK-NEXT: SI_MASK_BRANCH
 
 name:            optimize_if_and_saveexec
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -247,7 +188,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -255,7 +196,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
@@ -265,28 +206,8 @@ body:             |
 # CHECK-NEXT: SI_MASK_BRANCH
 
 name:            optimize_if_or_saveexec
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -304,7 +225,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -312,40 +233,20 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
 ---
 # CHECK-LABEL: name: optimize_if_and_saveexec_xor_valu_middle
 # CHECK: $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc
-# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
 # CHECK-NEXT: $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc
 # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3
 # CHECK-NEXT: SI_MASK_BRANCH
 name:            optimize_if_and_saveexec_xor_valu_middle
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -354,7 +255,7 @@ body:             |
     $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec
     $vgpr0 = V_MOV_B32_e32 4, implicit $exec
     $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc
-    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc
     $exec = S_MOV_B64_term killed $sgpr2_sgpr3
     SI_MASK_BRANCH %bb.2, implicit $exec
@@ -365,7 +266,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -373,7 +274,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
@@ -384,28 +285,8 @@ body:             |
 # CHECK-NEXT: $exec = COPY $sgpr0_sgpr1
 # CHECK-NEXT: SI_MASK_BRANCH %bb.2, implicit $exec
 name:            optimize_if_and_saveexec_xor_wrong_reg
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -423,7 +304,7 @@ body:             |
 
   bb.1.if:
     liveins: $sgpr0_sgpr1 , $sgpr4_sgpr5_sgpr6_sgpr7
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1, $sgpr4_sgpr5_sgpr6_sgpr7
@@ -431,7 +312,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
@@ -444,28 +325,8 @@ body:             |
 # CHECK-NEXT: SI_MASK_BRANCH %bb.2, implicit $exec
 
 name:            optimize_if_and_saveexec_xor_modify_copy_to_exec
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -485,7 +346,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -495,7 +356,7 @@ body:             |
     $sgpr1 = S_MOV_B32 1
     $sgpr2 = S_MOV_B32 -1
     $sgpr3 = S_MOV_B32 61440
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
@@ -506,28 +367,8 @@ body:             |
 # CHECK-NEXT: $exec = COPY $sgpr2_sgpr3
 # CHECK-NEXT: SI_MASK_BRANCH
 name:            optimize_if_and_saveexec_xor_live_out_setexec
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -546,7 +387,7 @@ body:             |
     S_SLEEP 0, implicit $sgpr2_sgpr3
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -554,7 +395,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
@@ -566,28 +407,8 @@ body:             |
 # CHECK-NEXT: SI_MASK_BRANCH %bb.2, implicit $exec
 
 name:            optimize_if_unknown_saveexec
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -605,7 +426,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -613,7 +434,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
@@ -623,28 +444,8 @@ body:             |
 # CHECK-NEXT: SI_MASK_BRANCH
 
 name:            optimize_if_andn2_saveexec
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -662,7 +463,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -670,7 +471,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
@@ -680,28 +481,8 @@ body:             |
 # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3
 # CHECK-NEXT: SI_MASK_BRANCH %bb.2, implicit $exec
 name:            optimize_if_andn2_saveexec_no_commute
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
 liveins:
   - { reg: '$vgpr0' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.main_body:
     liveins: $vgpr0
@@ -719,7 +500,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -727,7 +508,45 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM
 
 ...
+---
+# A read from exec copy subreg prevents optimization
+# CHECK-LABEL: name: if_and_xor_read_exec_copy_subreg{{$}}
+# CHECK: $sgpr0_sgpr1 = COPY $exec
+# CHECK-NEXT: $sgpr4 = S_MOV_B32 $sgpr1
+name:            if_and_xor_read_exec_copy_subreg
+liveins:
+  - { reg: '$vgpr0' }
+body:             |
+  bb.0.main_body:
+    liveins: $vgpr0
+
+    $sgpr0_sgpr1 = COPY $exec
+    $sgpr4 = S_MOV_B32 $sgpr1
+    $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 4, implicit $exec
+    $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc
+    $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc
+    $exec = S_MOV_B64_term killed $sgpr2_sgpr3
+    SI_MASK_BRANCH %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1.if:
+    liveins: $sgpr0_sgpr1
+
+    $sgpr7 = S_MOV_B32 61440
+    $sgpr6 = S_MOV_B32 -1
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.2.end:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
+    $sgpr3 = S_MOV_B32 61440
+    $sgpr2 = S_MOV_B32 -1
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking.mir
new file mode 100644
index 0000000000000000000000000000000000000000..61e6f44348706a31a04741f410b706483d8fc728
--- /dev/null
+++ b/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking.mir
@@ -0,0 +1,465 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=si-optimize-exec-masking-pre-ra -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN: name: negated_cond_vop2
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop2_redef_vcc1
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+# GCN-NEXT: $vcc_lo = COPY $sgpr0
+# GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_redef_vcc1
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc_lo = COPY $sgpr0
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop2_redef_vcc2
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+# GCN-NEXT: $vcc_hi = COPY $sgpr0
+# GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_redef_vcc2
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc_hi = COPY $sgpr0
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3_redef_cmp
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+# GCN-NEXT: %2.sub1:sreg_64_xexec = COPY $sgpr0
+# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_redef_cmp
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+    %2.sub1 = COPY $sgpr0
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_undef_vcc
+# GCN:      $vcc = S_AND_B64 $exec, undef $vcc, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_undef_vcc
+body:             |
+  bb.0:
+    $vcc = S_AND_B64 $exec, undef $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3_imp_vcc
+# GCN:      $vcc = IMPLICIT_DEF
+# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, $vcc, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_imp_vcc
+body:             |
+  bb.0:
+    $vcc = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, $vcc, implicit $exec
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop2_imp_vcc
+# GCN:      $vcc = IMPLICIT_DEF
+# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, $vcc, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_imp_vcc
+body:             |
+  bb.0:
+    $vcc = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, $vcc, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 killed $vcc, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3_redef_sel
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: %1:vgpr_32 = COPY $vgpr0
+# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_redef_sel
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %1:vgpr_32 = COPY $vgpr0
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop2_used_sel
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_used_sel
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    $vgpr0 = COPY %1
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop2_used_vcc
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+# GCN-NEXT: $sgpr0_sgpr1 = COPY $vcc
+# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_used_vcc
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $sgpr0_sgpr1 = COPY $vcc
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3_sel_wrong_subreg1
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
+# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_wrong_subreg1
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1.sub1 = IMPLICIT_DEF
+    %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3_sel_wrong_subreg2
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
+# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_wrong_subreg2
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %1.sub1 = IMPLICIT_DEF
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3_sel_right_subreg1
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_right_subreg1
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1.sub1 = IMPLICIT_DEF
+    %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub0, 1, implicit $exec
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3_sel_right_subreg2
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_right_subreg2
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %1.sub1 = IMPLICIT_DEF
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub0, 1, implicit $exec
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop3_sel_subreg_overlap
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: %1.sub2:vreg_128 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN-NEXT: %1.sub2_sub3:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub2, 1, implicit $exec
+# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_subreg_overlap
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1.sub2:vreg_128 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %1.sub2_sub3 = IMPLICIT_DEF
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub2, 1, implicit $exec
+    $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop2_dominated_blocks
+# GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN:      $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+---
+name:            negated_cond_vop2_dominated_blocks
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+
+  bb.1:
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_BRANCH %bb.1
+
+  bb.3:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop2_different_blocks_cmp_and
+# GCN:      %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+# GCN:      $vcc = S_AND_B64 $exec, %2, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+---
+name:            negated_cond_vop2_different_blocks_cmp_and
+body:             |
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+    %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+
+  bb.1:
+    $vcc = S_AND_B64 $exec, killed %2, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_BRANCH %bb.1
+
+  bb.3:
+    S_ENDPGM
+...
+
+# GCN: name: negated_cond_vop2_not_dominated_blocks
+# GCN:      V_CNDMASK_B32_e64 0, 1,
+# GCN:      $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.4, implicit $vcc
+---
+name:            negated_cond_vop2_not_dominated_blocks
+body:             |
+  bb.0:
+    $vcc = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec
+
+  bb.2:
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.4, implicit killed $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_BRANCH %bb.2
+
+  bb.4:
+    S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/test/CodeGen/AMDGPU/optimize-negated-cond.ll
new file mode 100644
index 0000000000000000000000000000000000000000..be5d8d4720509896ecfaca05bff29a079e30b390
--- /dev/null
+++ b/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}negated_cond:
+; GCN: BB0_1:
+; GCN:   v_cmp_eq_u32_e64 [[CC:[^,]+]],
+; GCN: BB0_2:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_cmp
+; GCN:   s_andn2_b64 vcc, exec, [[CC]]
+; GCN:   s_cbranch_vccnz BB0_4
+define amdgpu_kernel void @negated_cond(i32 addrspace(1)* %arg1) {
+bb:
+  br label %bb1
+
+bb1:
+  %tmp1 = load i32, i32 addrspace(1)* %arg1
+  %tmp2 = icmp eq i32 %tmp1, 0
+  br label %bb2
+
+bb2:
+  %tmp3 = phi i32 [ 0, %bb1 ], [ %tmp6, %bb4 ]
+  %tmp4 = shl i32 %tmp3, 5
+  br i1 %tmp2, label %bb3, label %bb4
+
+bb3:
+  %tmp5 = add i32 %tmp4, 1
+  br label %bb4
+
+bb4:
+  %tmp6 = phi i32 [ %tmp5, %bb3 ], [ %tmp4, %bb2 ]
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp6
+  store i32 0, i32 addrspace(1)* %gep
+  %tmp7 = icmp eq i32 %tmp6, 32
+  br i1 %tmp7, label %bb1, label %bb2
+}
+
+; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
+; GCN:   v_cmp_eq_u32_e64 [[CC:[^,]+]],
+; GCN: BB1_1:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_cmp
+; GCN:   s_andn2_b64 vcc, exec, [[CC]]
+; GCN:   s_cbranch_vccz BB1_3
+define amdgpu_kernel void @negated_cond_dominated_blocks(i32 addrspace(1)* %arg1) {
+bb:
+  br label %bb2
+
+bb2:
+  %tmp1 = load i32, i32 addrspace(1)* %arg1
+  %tmp2 = icmp eq i32 %tmp1, 0
+  br label %bb4
+
+bb3:
+  ret void
+
+bb4:
+  %tmp3 = phi i32 [ 0, %bb2 ], [ %tmp7, %bb7 ]
+  %tmp4 = shl i32 %tmp3, 5
+  br i1 %tmp2, label %bb5, label %bb6
+
+bb5:
+  %tmp5 = add i32 %tmp4, 1
+  br label %bb7
+
+bb6:
+  %tmp6 = add i32 %tmp3, 1
+  br label %bb7
+
+bb7:
+  %tmp7 = phi i32 [ %tmp5, %bb5 ], [ %tmp6, %bb6 ]
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp7
+  store i32 0, i32 addrspace(1)* %gep
+  %tmp8 = icmp eq i32 %tmp7, 32
+  br i1 %tmp8, label %bb3, label %bb4
+}
diff --git a/test/CodeGen/AMDGPU/or3.ll b/test/CodeGen/AMDGPU/or3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..400d9c4b0103a8b22a05654e80ad5389dcf3925b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/or3.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+
+; ===================================================================================
+; V_OR3_B32
+; ===================================================================================
+
+define amdgpu_ps float @or3(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: or3:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: or3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = or i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+; ThreeOp instruction variant not used due to Constant Bus Limitations
+; TODO: with reassociation it is possible to replace a v_or_b32_e32 with an s_or_b32
+define amdgpu_ps float @or3_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
+; VI-LABEL: or3_vgpr_a:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_or_b32_e32 v0, s2, v0
+; VI-NEXT:    v_or_b32_e32 v0, s3, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: or3_vgpr_a:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = or i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @or3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: or3_vgpr_all2:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: or3_vgpr_all2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = or i32 %b, %c
+  %result = or i32 %a, %x
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @or3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
+; VI-LABEL: or3_vgpr_bc:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_or_b32_e32 v0, s2, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: or3_vgpr_bc:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_or3_b32 v0, s2, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = or i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @or3_vgpr_const(i32 %a, i32 %b) {
+; VI-LABEL: or3_vgpr_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_or_b32_e32 v0, 64, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: or3_vgpr_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_or3_b32 v0, v1, v0, 64
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = or i32 64, %b
+  %result = or i32 %x, %a
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
index 6a41c3ad2e88fda950b8023afa33685b0ff25d0d..d7e38a602ffa80b9642993a02a97be410c775f53 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
-; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
+; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-code-object-v3,+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
+; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-code-object-v3,+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
 
 ; ALL-LABEL: {{^}}promote_alloca_i32_array_array:
 ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
index e8dcb50a3c1fe400275464db7584aea4d3b639d5..83a608ad5f31cece6e597268a4ee418db4d14129 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
 
 ; This shows that the amount of LDS estimate is sensitive to the order
 ; of the LDS globals.
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
index ebeed0dd4435b1cdf7db4d07a63945237dd602b9..28e4925f95061b4453e7b943acedbcae9a59b9bf 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -1,12 +1,14 @@
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 ; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
 ; CHECK: %alloca = alloca i32
-; CHECK: select i1 undef, i32* undef, i32* %alloca
+; CHECK: select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %alloca
 define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
-  %alloca = alloca i32, align 4
-  %select = select i1 undef, i32* undef, i32* %alloca
-  store i32 0, i32* %select, align 4
+  %alloca = alloca i32, align 4, addrspace(5)
+  %select = select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %alloca
+  store i32 0, i32 addrspace(5)* %select, align 4
   ret void
 }
 
@@ -17,11 +19,11 @@ define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand()
 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
 define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
-  %alloca = alloca [16 x i32], align 4
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
-  %select = select i1 undef, i32* %ptr0, i32* %ptr1
-  store i32 0, i32* %select, align 4
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
+  %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
+  store i32 0, i32 addrspace(5)* %select, align 4
   ret void
 }
 
@@ -30,16 +32,16 @@ define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a
 ; CHECK-LABEL: @lds_promote_alloca_select_two_allocas(
 ; CHECK: %alloca0 = alloca i32, i32 16, align 4
 ; CHECK: %alloca1 = alloca i32, i32 16, align 4
-; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
-; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
-; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
+; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(5)* %alloca0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %alloca1, i32 %b
+; CHECK: %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
 define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
-  %alloca0 = alloca i32, i32 16, align 4
-  %alloca1 = alloca i32, i32 16, align 4
-  %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
-  %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
-  %select = select i1 undef, i32* %ptr0, i32* %ptr1
-  store i32 0, i32* %select, align 4
+  %alloca0 = alloca i32, i32 16, align 4, addrspace(5)
+  %alloca1 = alloca i32, i32 16, align 4, addrspace(5)
+  %ptr0 = getelementptr inbounds i32, i32 addrspace(5)* %alloca0, i32 %a
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %alloca1, i32 %b
+  %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
+  store i32 0, i32 addrspace(5)* %select, align 4
   ret void
 }
 
@@ -51,11 +53,11 @@ define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b)
 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
 define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
-  %alloca = alloca [16 x i32], align 4
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3
-  %select = select i1 undef, i32* %ptr0, i32* %ptr1
-  store i32 0, i32* %select, align 4
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 3
+  %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
+  store i32 0, i32 addrspace(5)* %select, align 4
   ret void
 }
 
@@ -68,34 +70,34 @@ define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointe
 ; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
 ; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
 define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
-  %alloca = alloca [16 x i32], align 4
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
-  %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
-  %select0 = select i1 undef, i32* %ptr0, i32* %ptr1
-  %select1 = select i1 undef, i32* %select0, i32* %ptr2
-  store i32 0, i32* %select1, align 4
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
+  %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %c
+  %select0 = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
+  %select1 = select i1 undef, i32 addrspace(5)* %select0, i32 addrspace(5)* %ptr2
+  store i32 0, i32 addrspace(5)* %select1, align 4
   ret void
 }
 
 define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
 entry:
-  %alloca = alloca [16 x i32], align 4
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
-  store i32 0, i32* %ptr0
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
+  store i32 0, i32 addrspace(5)* %ptr0
   br i1 undef, label %bb1, label %bb2
 
 bb1:
-  %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
-  %select0 = select i1 undef, i32* undef, i32* %ptr2
-  store i32 0, i32* %ptr1
+  %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %c
+  %select0 = select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %ptr2
+  store i32 0, i32 addrspace(5)* %ptr1
   br label %bb2
 
 bb2:
-  %phi.ptr = phi i32* [ %ptr0, %entry ], [ %select0, %bb1 ]
-  %select1 = select i1 undef, i32* %phi.ptr, i32* %ptr1
-  store i32 0, i32* %select1, align 4
+  %phi.ptr = phi i32 addrspace(5)* [ %ptr0, %entry ], [ %select0, %bb1 ]
+  %select1 = select i1 undef, i32 addrspace(5)* %phi.ptr, i32 addrspace(5)* %ptr1
+  store i32 0, i32 addrspace(5)* %select1, align 4
   ret void
 }
 
@@ -104,12 +106,12 @@ bb2:
 ; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null
 define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
 bb:
-  %tmp = alloca double, align 8
-  store double 0.000000e+00, double* %tmp, align 8
+  %tmp = alloca double, align 8, addrspace(5)
+  store double 0.000000e+00, double addrspace(5)* %tmp, align 8
   %tmp2 = icmp eq i32 %arg1, 0
-  %tmp3 = select i1 %tmp2, double* %tmp, double* null
-  store double 1.000000e+00, double* %tmp3, align 8
-  %tmp4 = load double, double* %tmp, align 8
+  %tmp3 = select i1 %tmp2, double addrspace(5)* %tmp, double addrspace(5)* null
+  store double 1.000000e+00, double addrspace(5)* %tmp3, align 8
+  %tmp4 = load double, double addrspace(5)* %tmp, align 8
   store double %tmp4, double addrspace(1)* %arg
   ret void
 }
@@ -119,12 +121,12 @@ bb:
 ; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}}
 define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
 bb:
-  %tmp = alloca double, align 8
-  store double 0.000000e+00, double* %tmp, align 8
+  %tmp = alloca double, align 8, addrspace(5)
+  store double 0.000000e+00, double addrspace(5)* %tmp, align 8
   %tmp2 = icmp eq i32 %arg1, 0
-  %tmp3 = select i1 %tmp2, double* null, double* %tmp
-  store double 1.000000e+00, double* %tmp3, align 8
-  %tmp4 = load double, double* %tmp, align 8
+  %tmp3 = select i1 %tmp2, double addrspace(5)* null, double addrspace(5)* %tmp
+  store double 1.000000e+00, double addrspace(5)* %tmp3, align 8
+  %tmp4 = load double, double addrspace(5)* %tmp, align 8
   store double %tmp4, double addrspace(1)* %arg
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index 80112160412c1b96f1e005a788b74f1db64e3335..15da72db4abb7ff9dfa13d7cebf472fb6f6f67d9 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
 
-; GFX-NOT: buffer_
-; GCN:  v_readfirstlane_b32
-; GFX8: v_movrels_b32
-; GFX9: s_set_gpr_idx_on
-; GFX9: s_set_gpr_idx_off
+; GCN-NOT: buffer_
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
+; GCN: store_dword v[{{[0-9:]+}}], [[RES]]
 
 ; OPT:  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
 ; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
@@ -36,11 +36,15 @@ entry:
 ; GCN-LABEL: {{^}}float4_alloca_load4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4
 
-; GFX-NOT: buffer_
-; GCN:  v_readfirstlane_b32
-; GFX8: v_movreld_b32
-; GFX9: s_set_gpr_idx_on
-; GFX9: s_set_gpr_idx_off
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-NOT: v_cmp_
+; GCN-NOT: v_cndmask_
+; GCN:     v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
+; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
+; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
+; GCN:     store_dwordx4 v[{{[0-9:]+}}],
 
 ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
@@ -68,7 +72,7 @@ entry:
 ; GCN-LABEL: {{^}}half4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4
 
-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
@@ -98,7 +102,7 @@ entry:
 ; GCN-LABEL: {{^}}half4_alloca_load4:
 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
 
-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
 
@@ -128,7 +132,7 @@ entry:
 ; GCN-LABEL: {{^}}short4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4
 
-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
@@ -158,7 +162,7 @@ entry:
 ; GCN-LABEL: {{^}}short4_alloca_load4:
 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
 
-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
 
diff --git a/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a9fc318ce0e110fcc7acffcc9214ed9dc1e43b2c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -0,0 +1,485 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+declare i64 @_Z13get_global_idj(i32)
+
+define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)*  %buffer) {
+; GCN-LABEL: clmem_read_simplified:
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
+  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
+  %add.1 = add i64 %load2, %load1
+
+  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
+  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
+  %add.2 = add i64 %load3, %add.1
+  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
+  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+  %add.3 = add i64 %load4, %add.2
+
+  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
+  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
+  %add.4 = add i64 %load5, %add.3
+  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
+  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
+  %add.5 = add i64 %load6, %add.4
+
+  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
+  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
+  %add.6 = add i64 %load7, %add.5
+  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
+  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
+  %add.7 = add i64 %load8, %add.6
+
+  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
+  ret void
+}
+
+define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
+; GCN-LABEL: clmem_read:
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 17
+  %idx.ext11 = and i64 %a0, 4261412864
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
+  br label %for.cond.preheader
+
+while.cond.loopexit:                              ; preds = %for.body
+  %dec = add nsw i32 %dec31, -1
+  %tobool = icmp eq i32 %dec31, 0
+  br i1 %tobool, label %while.end, label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
+  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
+  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.cond.preheader
+  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
+  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
+  %conv3 = zext i32 %block.029 to i64
+  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
+  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
+  %add = add i64 %load1, %sum.128
+
+  %add9 = or i32 %block.029, 256
+  %conv3.1 = zext i32 %add9 to i64
+  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
+  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
+  %add.1 = add i64 %load2, %add
+
+  %add9.1 = or i32 %block.029, 512
+  %conv3.2 = zext i32 %add9.1 to i64
+  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
+  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
+  %add.2 = add i64 %l3, %add.1
+
+  %add9.2 = or i32 %block.029, 768
+  %conv3.3 = zext i32 %add9.2 to i64
+  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
+  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+  %add.3 = add i64 %l4, %add.2
+
+  %add9.3 = or i32 %block.029, 1024
+  %conv3.4 = zext i32 %add9.3 to i64
+  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
+  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
+  %add.4 = add i64 %l5, %add.3
+
+  %add9.4 = or i32 %block.029, 1280
+  %conv3.5 = zext i32 %add9.4 to i64
+  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
+  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
+  %add.5 = add i64 %l6, %add.4
+
+  %add9.5 = or i32 %block.029, 1536
+  %conv3.6 = zext i32 %add9.5 to i64
+  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
+  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
+  %add.6 = add i64 %load7, %add.5
+
+  %add9.6 = or i32 %block.029, 1792
+  %conv3.7 = zext i32 %add9.6 to i64
+  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
+  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
+  %add.7 = add i64 %load8, %add.6
+
+  %add9.7 = or i32 %block.029, 2048
+  %conv3.8 = zext i32 %add9.7 to i64
+  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
+  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
+  %add.8 = add i64 %load9, %add.7
+
+  %add9.8 = or i32 %block.029, 2304
+  %conv3.9 = zext i32 %add9.8 to i64
+  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
+  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
+  %add.9 = add i64 %load10, %add.8
+
+  %add9.9 = or i32 %block.029, 2560
+  %conv3.10 = zext i32 %add9.9 to i64
+  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
+  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
+  %add.10 = add i64 %load11, %add.9
+
+  %add9.31 = add nuw nsw i32 %block.029, 8192
+  %cmp.31 = icmp ult i32 %add9.31, 4194304
+  br i1 %cmp.31, label %for.body, label %while.cond.loopexit
+
+while.end:                                        ; preds = %while.cond.loopexit
+  store i64 %add.10, i64 addrspace(1)* %a1, align 8
+  ret void
+}
+
+; using 32bit address.
+define amdgpu_kernel void @Address32(i8 addrspace(1)*  %buffer) {
+; GCN-LABEL: Address32:
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+;
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+entry:
+   %call = tail call i64 @_Z13get_global_idj(i32 0)
+   %conv = and i64 %call, 255
+   %id = shl i64 %call, 7
+   %idx.ext11 = and i64 %id, 4294934528
+   %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+   %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*
+
+   %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
+   %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4
+
+   %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
+   %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
+   %add.1 = add i32 %load2, %load1
+
+   %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
+   %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
+   %add.2 = add i32 %load3, %add.1
+
+   %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
+   %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
+   %add.3 = add i32 %load4, %add.2
+
+   %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
+   %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
+   %add.4 = add i32 %load5, %add.3
+
+   %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
+   %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
+   %add.5 = add i32 %load6, %add.4
+
+   %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
+   %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
+   %add.6 = add i32 %load7, %add.5
+
+   %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
+   %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
+   %add.7 = add i32 %load8, %add.6
+
+   %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
+   %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
+   %add.8 = add i32 %load9, %add.7
+
+   %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
+   %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
+   %add.9 = add i32 %load10, %add.8
+
+   store i32 %add.9, i32 addrspace(1)* %addr, align 4
+   ret void
+}
+
+define amdgpu_kernel void @Offset64(i8 addrspace(1)*  %buffer) {
+; GCN-LABEL: Offset64:
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+
+  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
+  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
+
+  %add1 = add i64 %load2, %load1
+
+  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
+  %load3 = load i64, i64 addrspace(1)* %addr3, align 8
+
+  %add2 = add i64 %load3, %add1
+
+  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
+  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
+  %add4 = add i64 %load4, %add2
+
+  store i64 %add4, i64 addrspace(1)* %saddr, align 8
+  ret void
+}
+
+; TODO: Support load4 as anchor instruction.
+define amdgpu_kernel void @p32Offset64(i8 addrspace(1)*  %buffer) {
+; GCN-LABEL: p32Offset64:
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+;
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
+  %load1 = load i32, i32 addrspace(1)* %addr1, align 8
+
+  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
+  %load2 = load i32, i32 addrspace(1)* %addr2, align 8
+
+  %add1 = add i32 %load2, %load1
+
+  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
+  %load3 = load i32, i32 addrspace(1)* %addr3, align 8
+
+  %add2 = add i32 %load3, %add1
+
+  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
+  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
+  %add4 = add i32 %load4, %add2
+
+  store i32 %add4, i32 addrspace(1)* %saddr, align 8
+  ret void
+}
+
+define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
+; GCN-LABEL: DiffBase:
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+                                    i8 addrspace(1)* %buffer2) {
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
+  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
+  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+  %add1 = add i64 %load2, %load1
+  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
+  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
+  %add2 = add i64 %load3, %add1
+
+  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
+  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
+
+  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
+  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
+  %add3 = add i64 %load5, %load4
+
+  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
+  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
+  %add4 = add i64 %load6, %add3
+
+  %add5 = add i64 %add2, %add4
+
+  store i64 %add5, i64 addrspace(1)* %saddr, align 8
+  ret void
+}
+
+define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
+; GCN-LABEL: ReverseOrder:
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+
+  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
+  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
+  %add7 = add i64 %load8, %load1
+
+  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
+  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
+  %add6 = add i64 %load7, %add7
+
+  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
+  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
+  %add5 = add i64 %load6, %add6
+
+  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
+  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
+  %add4 = add i64 %load5, %add5
+
+  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
+  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+  %add3 = add i64 %load4, %add4
+
+  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
+  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
+  %add2 = add i64 %load3, %add3
+
+  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
+  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
+  %add1 = add i64 %load2, %add2
+
+  store i64 %add1, i64 addrspace(1)* %saddr, align 8
+  ret void
+}
+
+define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
+; GCN-LABEL: negativeoffset:
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
+  %conv = and i64 %call, 255
+  %0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+
+  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
+  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
+
+
+  %add = add i64 %load2, %load1
+
+  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir b/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b5e4032a56f7917bf0d98974d06d92cb5ba3c61c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
@@ -0,0 +1,190 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
+
+# GFX9-LABEL: name: diffoporder_add
+# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, -2048, 0, 0
+# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, 0, 0, 0
+
+name: diffoporder_add
+body:             |
+  bb.0.entry:
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
+    %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+    %4:sreg_32_xm0 = COPY $sgpr101
+    %5:sreg_32_xm0 = S_MOV_B32 0
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
+    $sgpr4 = COPY %4
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    %6:vreg_64 = COPY $vgpr0_vgpr1
+    %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
+    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
+    %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
+    %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
+    %12:sgpr_32 = COPY %1.sub1
+    %13:vgpr_32 = COPY %5
+    %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
+    %16:vgpr_32 = COPY %12
+    %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
+    %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
+    %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
+    %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
+    %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
+    %25:sgpr_32 = S_MOV_B32 4096
+    %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %25, %21, implicit $exec
+    %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
+    %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
+    %32:sgpr_32 = S_MOV_B32 6144
+    %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
+    %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
+    %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
+...
+---
+
+# GFX9-LABEL: name: LowestInMiddle
+# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 11200
+# GFX9: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]]
+# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]]
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -3200, 0, 0
+#
+# GFX9: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 6400
+# GFX9: [[BASE1_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_7:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_2]]
+# GFX9: [[BASE1_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_7]]
+# GFX9: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE1_LO]], %subreg.sub0, [[BASE1_HI]], %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE3]], 0, 0, 0,
+# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0,
+
+name: LowestInMiddle
+body:             |
+  bb.0.entry:
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
+    %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+    %4:sreg_32_xm0 = COPY $sgpr101
+    %5:sreg_32_xm0 = S_MOV_B32 0
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
+    $sgpr4 = COPY %4
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    %6:vreg_64 = COPY $vgpr0_vgpr1
+    %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
+    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
+    %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
+    %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
+    %12:sgpr_32 = COPY %1.sub1
+    %13:vgpr_32 = COPY %5
+    %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
+    %16:vgpr_32 = COPY %12
+    %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
+    %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
+    %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
+    %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
+    %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
+    %25:sgpr_32 = S_MOV_B32 8000
+    %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec
+    %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
+    %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
+    %32:sgpr_32 = S_MOV_B32 6400
+    %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
+    %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
+    %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
+    %39:sgpr_32 = S_MOV_B32 11200
+    %40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec
+    %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec
+    %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
+    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec
+...
+---
+
+# GFX9-LABEL: name: NegativeDistance
+# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 10240
+# GFX9: [[V_ADD_I32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]]
+# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]]
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_4]], %subreg.sub0, [[BASE_HI]], %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -4096, 0, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -2048, 0, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0
+
+name: NegativeDistance
+body:             |
+  bb.0.entry:
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
+    %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+    %4:sreg_32_xm0 = COPY $sgpr101
+    %5:sreg_32_xm0 = S_MOV_B32 0
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
+    $sgpr4 = COPY %4
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    %6:vreg_64 = COPY $vgpr0_vgpr1
+    %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
+    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
+    %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
+    %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
+    %12:sgpr_32 = COPY %1.sub1
+    %13:vgpr_32 = COPY %5
+    %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
+    %16:vgpr_32 = COPY %12
+    %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
+    %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
+    %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
+    %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
+    %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
+    %25:sgpr_32 = S_MOV_B32 6144
+    %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec
+    %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
+    %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
+    %32:sgpr_32 = S_MOV_B32 8192
+    %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
+    %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
+    %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
+    %39:sgpr_32 = S_MOV_B32 10240
+    %40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec
+    %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec
+    %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
+    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec
+...
+---
+
+# Tests for a successful compilation.
+name: assert_hit
+body:             |
+    bb.0.entry:
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
+    %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+    %4:sreg_32_xm0 = COPY $sgpr101
+    %5:sreg_32_xm0 = S_MOV_B32 0
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
+    $sgpr4 = COPY %4
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    %6:vreg_64 = COPY $vgpr0_vgpr1
+    %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
+    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
+    %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
+    %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
+    %12:sgpr_32 = COPY %1.sub1
+    %13:vgpr_32 = COPY %5
+    %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
+    %16:vgpr_32 = COPY %12
+    %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
+    %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
+    %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
+    %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
+    %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
+
+    %25:sgpr_32 = S_MOV_B32 6144
+    %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec
+    %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 4294967295, killed %27, implicit $exec
+    %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
+...
diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll
index 265d37e8d641601c67579e9fab4f43752dfd5b61..b7d202638dbce46e7b79d5c2eea0ee4258589db2 100644
--- a/test/CodeGen/AMDGPU/ret.ll
+++ b/test/CodeGen/AMDGPU/ret.ll
@@ -17,14 +17,13 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}vgpr_literal:
-; GCN: v_mov_b32_e32 v4, v0
-; GCN: exp mrt0 v4, v4, v4, v4 done vm
+; GCN: exp mrt0 v0, v0, v0, v0 done vm
 
 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
 ; GCN-DAG: v_mov_b32_e32 v3, -1.0
-; GCN: s_waitcnt expcnt(0)
+; GCN-DAG: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
 define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
@@ -226,15 +225,14 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}structure_literal:
-; GCN: v_mov_b32_e32 v3, v0
-; GCN: exp mrt0 v3, v3, v3, v3 done vm
+; GCN: exp mrt0 v0, v0, v0, v0 done vm
 
 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: s_mov_b32 s0, 2
 ; GCN-DAG: s_mov_b32 s1, 3
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
-; GCN: s_waitcnt expcnt(0)
+; GCN-DAG: s_waitcnt expcnt(0)
 define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll b/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6e7d24cdc3c73c9bc888a29bf61495e8b13158f5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-OPT %s
+; RUN: llc -march=amdgcn -mcpu=fiji -O0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-NOOPT %s
+
+; GCN-LABEL: {{^}}scalar_to_vector_i16:
+; GCN-NOOPT: s_mov_b32 [[S:s[0-9]+]], 42
+; GCN-NOOPT: v_mov_b32_e32 [[V:v[0-9]+]], [[S]]
+; GCN-OPT:   v_mov_b32_e32 [[V:v[0-9]+]], 42
+; GCN: buffer_store_short [[V]],
+define void @scalar_to_vector_i16() {
+  %tmp = load <2 x i16>, <2 x i16> addrspace(5)* undef
+  %tmp1 = insertelement <2 x i16> %tmp, i16 42, i64 0
+  store <2 x i16> %tmp1, <2 x i16> addrspace(5)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_to_vector_f16:
+; GCN-NOOPT: s_movk_i32 [[S:s[0-9]+]], 0x3c00
+; GCN-NOOPT: v_mov_b32_e32 [[V:v[0-9]+]], [[S]]
+; GCN-OPT:   v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00
+; GCN: buffer_store_short [[V]],
+define void @scalar_to_vector_f16() {
+  %tmp = load <2 x half>, <2 x half> addrspace(5)* undef
+  %tmp1 = insertelement <2 x half> %tmp, half 1.0, i64 0
+  store <2 x half> %tmp1, <2 x half> addrspace(5)* undef
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
index 5edc2c5c9b713eeadd0f6b1550c2b36bd6b819c3..d93cde6a885f76fc2f9b60f8f4a45898963d9b8f 100644
--- a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
+++ b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
@@ -24,7 +24,7 @@ main_body:
   %array_vector9 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp1, i32 1
   %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2
   %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3
-  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0)
   call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1)
   %bc = bitcast <4 x float> %array_vector3 to <4 x i32>
   %tmp4 = extractelement <4 x i32> %bc, i32 undef
@@ -46,7 +46,7 @@ main_body:
 }
 
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2
 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3
 
 attributes #0 = { nounwind "target-cpu"="tonga" }
diff --git a/test/CodeGen/AMDGPU/sdwa-op64-test.ll b/test/CodeGen/AMDGPU/sdwa-op64-test.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6ed12078bddb46a15e25f4a65d1a63484f554af3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sdwa-op64-test.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI,GCN %s
+
+; GCN-LABEL: {{^}}test_add_co_sdwa:
+; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = add nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}test_sub_co_sdwa:
+; GFX9: v_sub_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+; FIJI: v_sub_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; FIJI: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = sub nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}test1_add_co_sdwa:
+; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
+define amdgpu_kernel void @test1_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1, i64 addrspace(1)* %arg2) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = and i32 %tmp4, 255
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
+  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp9 = add nsw i64 %tmp8, %tmp6
+  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
+  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
+  %tmp15 = and i32 %tmp14, 255
+  %tmp16 = zext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds i64, i64 addrspace(1)* %arg2, i32 %tmp
+  %tmp18 = load i64, i64 addrspace(1)* %tmp17, align 8
+  %tmp19 = add nsw i64 %tmp18, %tmp16
+  store i64 %tmp19, i64 addrspace(1)* %tmp17, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/sdwa-ops.mir b/test/CodeGen/AMDGPU/sdwa-ops.mir
new file mode 100644
index 0000000000000000000000000000000000000000..9660c88bbb027dfcc7420ad002ec7e4af304ac39
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sdwa-ops.mir
@@ -0,0 +1,390 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
+
+# test for 3 consecutive _sdwa's
+# GFX9-LABEL: name:            test1_add_co_sdwa
+# GFX9: V_ADD_I32_sdwa
+# GFX9-NEXT: V_ADDC_U32_e32
+# GFX9: V_ADD_I32_sdwa
+# GFX9-NEXT: V_ADDC_U32_e32
+# GFX9: V_ADD_I32_sdwa
+# GFX9-NEXT: V_ADDC_U32_e32
+---
+name:            test1_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+    %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec
+    %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec
+    %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %162, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+    %171:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %173:vgpr_32, %175:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %171, implicit $exec
+    %174:vgpr_32, dead %176:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %175, implicit $exec
+    %172:vreg_64 = REG_SEQUENCE %173, %subreg.sub0, %174, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %172, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+...
+
+# test for VCC interference on sdwa, should generate 1 xform only
+# GFX9-LABEL: name:            test2_add_co_sdwa
+# GFX9: V_ADD_I32_sdwa
+# GFX9: V_ADDC_U32_e32
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9-NOT: V_ADDC_U32_e32
+---
+name:            test2_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+
+    %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec
+    %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec
+    %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1
+
+    %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+    %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec
+    %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec
+    %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %162, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+...
+
+# test for CarryOut used, should reject
+# GFX9-LABEL: name:            test3_add_co_sdwa
+# GFX9: V_ADD_I32_e64
+# GFX9: V_ADDC_U32_e64
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9-NOT: V_ADDC_U32_e32
+---
+name:            test3_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %66, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+...
+
+# test for CarryIn used more than once, should reject
+# GFX9-LABEL: name:            test4_add_co_sdwa
+# GFX9: V_ADD_I32_e64
+# GFX9: V_ADDC_U32_e64
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9-NOT: V_ADDC_U32_e32
+---
+name:            test4_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %65, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+
+...
+
+# test for simple example, should generate sdwa
+# GFX9-LABEL: name:            test5_add_co_sdwa
+# GFX9: V_ADD_I32_sdwa
+# GFX9: V_ADDC_U32_e32
+---
+name:            test5_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+
+...
+
+# test for V_ADD_I32_e64 only, should reject
+# GFX9-LABEL: name:            test6_add_co_sdwa
+# GFX9: V_ADD_I32_e64
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9-NOT: V_ADDC_U32_e32
+---
+name:            test6_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %23, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+
+...
+
+# test for V_ADDC_U32_e64 only, should reject
+# GFX9-LABEL: name:            test7_add_co_sdwa
+# GFX9: V_ADDC_U32_e64
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9-NOT: V_ADDC_U32_e32
+---
+name:            test7_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %24:sreg_64_xexec = COPY $sgpr0_sgpr1
+
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %24, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %23, %subreg.sub0, %23, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+
+...
+
+# test for $vcc defined between two adds, should not generate
+# GFX9-LABEL: name:            test8_add_co_sdwa
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9: V_ADDC_U32_e64
+---
+name:            test8_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    $vcc = COPY %30
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec
+    %31:vreg_64 = COPY $vcc
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %31, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+
+...
+
+# test for non dead $vcc, should not generate
+# GFX9-LABEL: name:            test9_add_co_sdwa
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9: V_ADDC_U32_e64
+---
+name:            test9_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    $vcc = COPY %30
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec
+    %31:vreg_64 = COPY $vcc
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %31, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+...
+
+# test for def $vcc_lo, should not generate
+# GFX9-LABEL: name:            test10_add_co_sdwa
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9: V_ADDC_U32_e64
+---
+name:            test10_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    $vcc_lo = COPY %30.sub0
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    %31:vgpr_32 = COPY $vcc_lo
+    %32:vreg_64 = REG_SEQUENCE %31, %subreg.sub0, %23, %subreg.sub1
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %32, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+...
+
+# test for read $vcc_hi, should not generate
+# GFX9-LABEL: name:            test11_add_co_sdwa
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9: V_ADDC_U32_e64
+---
+name:            test11_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    $vcc_hi = COPY %30.sub0
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    %31:vgpr_32 = COPY $vcc_hi
+    %32:vreg_64 = REG_SEQUENCE %31, %subreg.sub0, %23, %subreg.sub1
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %32, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
+...
+
+# test for $vcc defined and used between adds, should not generate
+# GFX9-LABEL: name:            test12_add_co_sdwa
+# GFX9-NOT: V_ADD_I32_sdwa
+# GFX9: V_ADDC_U32_e64
+---
+name:            test12_add_co_sdwa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %22:sreg_32_xm0 = S_MOV_B32 255
+    %30:vreg_64 = COPY $sgpr0_sgpr1
+    %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
+    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec
+    $vcc = COPY %30
+    %31:vreg_64 = COPY killed $vcc
+    %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec
+    %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %31, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 3c92e8e5cba3d65a4c294c2fc1d2bd96ee2944b4..6beac4904a704ef4251019c7f06faa95bc10c287 100644
--- a/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -501,7 +501,12 @@ entry:
 ; GCN-LABEL: {{^}}sdwa_crash_inlineasm_def:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x10000,
+;
+; TODO: Why is the constant not peepholed into the v_or_b32_e32?
+;
+; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, s0,
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000,
 define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
 bb:
   br label %bb1
diff --git a/test/CodeGen/AMDGPU/select-undef.ll b/test/CodeGen/AMDGPU/select-undef.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6597d6784e0c2367ba62c024bf3c1f520831c681
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-undef.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}select_undef_lhs:
+; GCN: s_waitcnt
+; GCN-NOT: v_cmp
+; GCN-NOT: v_cndmask
+; GCN-NEXT: s_setpc_b64
+define float @select_undef_lhs(float %val, i1 %cond) {
+  %undef = call float @llvm.amdgcn.rcp.f32(float undef)
+  %sel = select i1 %cond, float %undef, float %val
+  ret float %sel
+}
+
+; GCN-LABEL: {{^}}select_undef_rhs:
+; GCN: s_waitcnt
+; GCN-NOT: v_cmp
+; GCN-NOT: v_cndmask
+; GCN-NEXT: s_setpc_b64
+define float @select_undef_rhs(float %val, i1 %cond) {
+  %undef = call float @llvm.amdgcn.rcp.f32(float undef)
+  %sel = select i1 %cond, float %val, float %undef
+  ret float %sel
+}
+
+; GCN-LABEL: {{^}}select_undef_n1:
+; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
+; GCN: store_dword {{[^,]+}}, [[RES]]
+define void @select_undef_n1(float addrspace(1)* %a, i32 %c) {
+  %cc = icmp eq i32 %c, 0
+  %sel = select i1 %cc, float 1.000000e+00, float undef
+  store float %sel, float addrspace(1)* %a
+  ret void
+}
+
+; GCN-LABEL: {{^}}select_undef_n2:
+; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
+; GCN: store_dword {{[^,]+}}, [[RES]]
+define void @select_undef_n2(float addrspace(1)* %a, i32 %c) {
+  %cc = icmp eq i32 %c, 0
+  %sel = select i1 %cc, float undef, float 1.000000e+00
+  store float %sel, float addrspace(1)* %a
+  ret void
+}
+
+declare float @llvm.amdgcn.rcp.f32(float)
diff --git a/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
index be8b207a74083cd06d2babf67d6c174a9e0289ee..9481e98502d6e095676ca5e807adcc210213d275 100644
--- a/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
+++ b/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
@@ -75,23 +75,22 @@
 name:            sgpr_spill_wrong_stack_id
 tracksRegLiveness: true
 frameInfo:
-  adjustsStack:    false
   hasCalls:        true
 body:             |
-  bb.0.bb:
-    %8:sreg_32_xm0 = COPY $sgpr5
-    %4:vreg_64 = IMPLICIT_DEF
-    %3:vgpr_32 = FLAT_LOAD_DWORD %4, 0, 0, 0, implicit $exec, implicit $flat_scr
-    %5:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
+  bb.0:
+    %0:sreg_32_xm0 = COPY $sgpr5
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit $exec, implicit $flat_scr
+    %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
     ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5
-    dead $sgpr30_sgpr31 = SI_CALL %5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0
-    $sgpr5 = COPY %8
-    %12:sreg_32_xm0 = COPY $sgpr5
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0
+    $sgpr5 = COPY %0
+    %4:sreg_32_xm0 = COPY $sgpr5
     ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5
     ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5
-    $vgpr0 = COPY %3
-    dead $sgpr30_sgpr31 = SI_CALL %5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0
-    $sgpr5 = COPY %12
+    $vgpr0 = COPY %2
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0
+    $sgpr5 = COPY %4
     ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5
 
 ...
diff --git a/test/CodeGen/AMDGPU/shl_add.ll b/test/CodeGen/AMDGPU/shl_add.ll
new file mode 100644
index 0000000000000000000000000000000000000000..062198a8cc8a0d0ba83bd56490225034a228c55e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl_add.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+
+; ===================================================================================
+; V_LSHL_ADD_U32
+; ===================================================================================
+
+define amdgpu_ps float @shl_add(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: shl_add:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_add:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+; ThreeOp instruction variant not used due to Constant Bus Limitations
+define amdgpu_ps float @shl_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
+; VI-LABEL: shl_add_vgpr_a:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s3, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_add_vgpr_a:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_add_vgpr_all(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: shl_add_vgpr_all:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_add_vgpr_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
+; VI-LABEL: shl_add_vgpr_ab:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_add_vgpr_ab:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, v1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_add_vgpr_const(i32 %a, i32 %b) {
+; VI-LABEL: shl_add_vgpr_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_add_vgpr_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, 3
+  %result = add i32 %x, %b
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
diff --git a/test/CodeGen/AMDGPU/shl_or.ll b/test/CodeGen/AMDGPU/shl_or.ll
new file mode 100644
index 0000000000000000000000000000000000000000..24d34efa28b1c26991364c11088c3de5a9341b03
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl_or.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+
+; ===================================================================================
+; V_LSHL_OR_B32
+; ===================================================================================
+
+define amdgpu_ps float @shl_or(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: shl_or:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_or:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_or_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) {
+; VI-LABEL: shl_or_vgpr_c:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_lshl_b32 s0, s2, s3
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_or_vgpr_c:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshl_b32 s0, s2, s3
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_or_vgpr_all2(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: shl_or_vgpr_all2:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_or_vgpr_all2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = or i32 %c, %x
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_or_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) {
+; VI-LABEL: shl_or_vgpr_ac:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_or_vgpr_ac:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, s2, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = or i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_or_vgpr_const(i32 %a, i32 %b) {
+; VI-LABEL: shl_or_vgpr_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_or_b32_e32 v0, 6, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_or_vgpr_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v1, 6
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, %b
+  %result = or i32 %x, 6
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_or_vgpr_const2(i32 %a, i32 %b) {
+; VI-LABEL: shl_or_vgpr_const2:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_or_vgpr_const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 6, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, 6
+  %result = or i32 %x, %b
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_or_vgpr_const_scalar1(i32 inreg %a, i32 %b) {
+; VI-LABEL: shl_or_vgpr_const_scalar1:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_lshl_b32 s0, s2, 6
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_or_vgpr_const_scalar1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_or_b32 v0, s2, 6, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, 6
+  %result = or i32 %x, %b
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @shl_or_vgpr_const_scalar2(i32 %a, i32 inreg %b) {
+; VI-LABEL: shl_or_vgpr_const_scalar2:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
+; VI-NEXT:    v_or_b32_e32 v0, s2, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: shl_or_vgpr_const_scalar2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 6, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = shl i32 %a, 6
+  %result = or i32 %x, %b
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 7c0f5b9a4711d618eb48acd41f8b985e25bf6b98..e9a6ba9894261660eca007becf62a088cc3ce4bd 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
 declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
@@ -265,16 +265,15 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o
 ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
 ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
 
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44
 
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52
 
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}}
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20
-
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52
 define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
diff --git a/test/CodeGen/AMDGPU/sibling-call.ll b/test/CodeGen/AMDGPU/sibling-call.ll
index efc01083a3673734fcf1c68ec330d61f5530f751..260953b184c3e23e103d973e2b97c357595c2d87 100644
--- a/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/test/CodeGen/AMDGPU/sibling-call.ll
@@ -136,8 +136,7 @@ entry:
 
 
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9: v_add_u32_e32 v0, v0, [[LOAD_0]]
-; GFX9: v_add_u32_e32 v0, v0, [[LOAD_1]]
+; GFX9: v_add3_u32 v0, v0, v3, v2
 
 ; GCN-NEXT: s_setpc_b64
 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
diff --git a/test/CodeGen/AMDGPU/simplify-libcalls.ll b/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 24117e2c83689377063b698540ebfd2baba62b2e..859f848d228c4448b092c389b67f7462f8ab9cda 100644
--- a/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -783,5 +783,10 @@ entry:
   ret void
 }
 
+; GCN-PRELINK: declare float @_Z4fabsf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]]
+; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]]
+; GCN-PRELINK: declare float @_Z11native_sqrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]]
+
 ; CGN-PRELINK: attributes #[[$NOUNWIND]] = { nounwind }
+; GCN-PRELINK: attributes #[[$NOUNWIND_READONLY]] = { nounwind readonly }
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
index 42a28b9527391240b33604c5932e5d02132618b4..a02e6f8349cd3b7a623bd39f123e90fc2ffdd74b 100644
--- a/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -2,9 +2,10 @@
 
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
 ; CHECK-NEXT: ; %bb.0:
+; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
-  call void @llvm.AMDGPU.kill(float 0.0)
+  call void @llvm.amdgcn.kill(i1 true)
   ret void
 }
 
@@ -14,7 +15,7 @@ define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
-  call void @llvm.AMDGPU.kill(float -0.0)
+  call void @llvm.amdgcn.kill(i1 false)
   ret void
 }
 
@@ -27,58 +28,62 @@ define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
 ; CHECK-NEXT: ; %bb.2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
-  call void @llvm.AMDGPU.kill(float -0.0)
-  call void @llvm.AMDGPU.kill(float -1.0)
+  call void @llvm.amdgcn.kill(i1 false)
+  call void @llvm.amdgcn.kill(i1 false)
   ret void
 }
 
 ; CHECK-LABEL: {{^}}test_kill_depth_var:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
-  call void @llvm.AMDGPU.kill(float %x)
+  %cmp = fcmp olt float %x, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp)
   ret void
 }
 
 ; FIXME: Ideally only one would be emitted
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
 ; CHECK-NEXT: ; %bb.1:
-; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
 ; CHECK-NEXT: ; %bb.2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
-  call void @llvm.AMDGPU.kill(float %x)
-  call void @llvm.AMDGPU.kill(float %x)
+  %cmp = fcmp olt float %x, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp)
+  call void @llvm.amdgcn.kill(i1 %cmp)
   ret void
 }
 
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
 ; CHECK-NEXT: ; %bb.1:
-; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
 ; CHECK-NEXT: ; %bb.2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
-  call void @llvm.AMDGPU.kill(float %x)
-  call void @llvm.AMDGPU.kill(float %y)
+  %cmp.x = fcmp olt float %x, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.x)
+  %cmp.y = fcmp olt float %y, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.y)
   ret void
 }
 
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_cbranch_execnz BB6_2
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: exp
 ; CHECK-NEXT: s_endpgm
 ; CHECK-NEXT: BB6_2:
 ; CHECK: v_mov_b32_e64 v7, -1
-; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
 ; CHECK-NEXT: s_cbranch_execnz BB6_4
 ; CHECK-NEXT: ; %bb.3:
 ; CHECK-NEXT: exp
@@ -86,9 +91,11 @@ define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
 ; CHECK-NEXT: BB6_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
-  call void @llvm.AMDGPU.kill(float %x)
+  %cmp.x = fcmp olt float %x, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.x)
   %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={v7}"()
-  call void @llvm.AMDGPU.kill(float %y)
+  %cmp.y = fcmp olt float %y, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.y)
   ret void
 }
 
@@ -111,7 +118,7 @@ define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
 ; CHECK: v_nop_e64
 ; CHECK: v_nop_e64
 
-; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: ; %bb.2:
 ; CHECK-NEXT: exp null off, off, off, off done vm
@@ -137,7 +144,8 @@ bb:
     v_nop_e64
     v_nop_e64
     v_nop_e64", "={v7}"()
-  call void @llvm.AMDGPU.kill(float %var)
+  %cmp.var = fcmp olt float %var, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.var)
   br label %exit
 
 exit:
@@ -162,7 +170,7 @@ exit:
 ; CHECK: ;;#ASMEND
 ; CHECK: v_mov_b32_e64 v8, -1
 ; CHECK: ;;#ASMEND
-; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
 
 ; CHECK-NEXT: ; %bb.2:
@@ -196,7 +204,8 @@ bb:
     v_nop_e64
     v_nop_e64", "={v7}"()
   %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={v8}"()
-  call void @llvm.AMDGPU.kill(float %var)
+  %cmp.var = fcmp olt float %var, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.var)
   store volatile float %live.across, float addrspace(1)* undef
   %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={v9}"()
   br label %exit
@@ -221,7 +230,7 @@ exit:
 
 ; CHECK: v_mov_b32_e64 v7, -1
 ; CHECK: v_nop_e64
-; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
 
 ; CHECK-NEXT: ; %bb.3:
 ; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
@@ -251,7 +260,8 @@ bb:
     v_nop_e64
     v_nop_e64
     v_nop_e64", "={v7}"()
-  call void @llvm.AMDGPU.kill(float %var)
+  %cmp.var = fcmp olt float %var, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.var)
   %vgpr = load volatile i32, i32 addrspace(1)* undef
   %loop.cond = icmp eq i32 %vgpr, 0
   br i1 %loop.cond, label %bb, label %exit
@@ -264,7 +274,7 @@ exit:
 ; bug 28550
 ; CHECK-LABEL: {{^}}phi_use_def_before_kill:
 ; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
-; CHECK: v_cmpx_le_f32_e32 vcc, 0,
+; CHECK: v_cmpx_lt_f32_e32 vcc, 0,
 ; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]]
 
 ; CHECK: exp
@@ -288,7 +298,8 @@ bb:
   %tmp = fadd float %x, 1.000000e+00
   %tmp1 = fcmp olt float 0.000000e+00, %tmp
   %tmp2 = select i1 %tmp1, float -1.000000e+00, float 0.000000e+00
-  call void @llvm.AMDGPU.kill(float %tmp2)
+  %cmp.tmp2 = fcmp olt float %tmp2, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.tmp2)
   br i1 undef, label %phibb, label %bb8
 
 phibb:
@@ -335,7 +346,7 @@ bb5:                                              ; preds = %bb4, %bb3
   unreachable
 
 bb6:                                              ; preds = %bb
-  call void @llvm.AMDGPU.kill(float -1.000000e+00)
+  call void @llvm.amdgcn.kill(i1 false)
   unreachable
 
 bb7:                                              ; preds = %bb4
@@ -348,7 +359,7 @@ bb7:                                              ; preds = %bb4
 ; CHECK: s_xor_b64
 ; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
 
-; CHECK: v_cmpx_le_f32_e32 vcc, 0,
+; CHECK: v_cmpx_gt_f32_e32 vcc, 0,
 ; CHECK: [[BB4]]:
 ; CHECK: s_or_b64 exec, exec
 ; CHECK: image_sample_c
@@ -369,7 +380,8 @@ bb:
   br i1 %tmp, label %bb3, label %bb4
 
 bb3:                                              ; preds = %bb
-  call void @llvm.AMDGPU.kill(float %arg)
+  %cmp.arg = fcmp olt float %arg, 0.0
+  call void @llvm.amdgcn.kill(i1 %cmp.arg)
   br label %bb4
 
 bb4:                                              ; preds = %bb3, %bb
@@ -387,7 +399,7 @@ bb9:                                              ; preds = %bb4
 }
 
 declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
-declare void @llvm.AMDGPU.kill(float) #0
+declare void @llvm.amdgcn.kill(i1) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll
index 22f11da4648b5cc2ede211559003c2c1a2a1047c..22c3b2d42f35af1822894ebb5336c0180c02f67c 100644
--- a/test/CodeGen/AMDGPU/smed3.ll
+++ b/test/CodeGen/AMDGPU/smed3.ll
@@ -364,6 +364,211 @@ bb:
   ret void
 }
 
+; 16 combinations
+
+; 16: min(max(x, y), max(min(x, y), z))
+; 17: min(max(x, y), max(min(y, x), z))
+; 18: min(max(x, y), max(z, min(x, y)))
+; 19: min(max(x, y), max(z, min(y, x)))
+; 20: min(max(y, x), max(min(x, y), z))
+; 21: min(max(y, x), max(min(y, x), z))
+; 22: min(max(y, x), max(z, min(x, y)))
+; 23: min(max(y, x), max(z, min(y, x)))
+;
+; + commute outermost min
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_16:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_16(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %x, i32 %y)
+  %tmp1 = call i32 @smax(i32 %x, i32 %y)
+  %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_17:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_17(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %y, i32 %x)
+  %tmp1 = call i32 @smax(i32 %x, i32 %y)
+  %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_18:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_18(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %x, i32 %y)
+  %tmp1 = call i32 @smax(i32 %x, i32 %y)
+  %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_19:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_19(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %y, i32 %x)
+  %tmp1 = call i32 @smax(i32 %x, i32 %y)
+  %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_20:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_20(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %x, i32 %y)
+  %tmp1 = call i32 @smax(i32 %y, i32 %x)
+  %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_21:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_21(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %y, i32 %x)
+  %tmp1 = call i32 @smax(i32 %y, i32 %x)
+  %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_22:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_22(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %x, i32 %y)
+  %tmp1 = call i32 @smax(i32 %y, i32 %x)
+  %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_23:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_23(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %y, i32 %x)
+  %tmp1 = call i32 @smax(i32 %y, i32 %x)
+  %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_24:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_24(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %x, i32 %y)
+  %tmp1 = call i32 @smax(i32 %x, i32 %y)
+  %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_25:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_25(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %y, i32 %x)
+  %tmp1 = call i32 @smax(i32 %x, i32 %y)
+  %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_26:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_26(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %x, i32 %y)
+  %tmp1 = call i32 @smax(i32 %x, i32 %y)
+  %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_27:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_27(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %y, i32 %x)
+  %tmp1 = call i32 @smax(i32 %x, i32 %y)
+  %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_28:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_28(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %x, i32 %y)
+  %tmp1 = call i32 @smax(i32 %y, i32 %x)
+  %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_29:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_29(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %y, i32 %x)
+  %tmp1 = call i32 @smax(i32 %y, i32 %x)
+  %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_30:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_30(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %x, i32 %y)
+  %tmp1 = call i32 @smax(i32 %y, i32 %x)
+  %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_31:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_smed3_i32_pat_31(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @smin(i32 %y, i32 %x)
+  %tmp1 = call i32 @smax(i32 %y, i32 %x)
+  %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
 ; FIXME: Should keep scalar or not promote
 ; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0:
 ; GCN: s_sext_i32_i16
@@ -476,6 +681,28 @@ bb:
   ret void
 }
 
+; GCN-LABEL: {{^}}v_test_smed3_i16_pat_1:
+; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define amdgpu_kernel void @v_test_smed3_i16_pat_1(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
+  %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
+  %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
+  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %x = load i16, i16 addrspace(1)* %gep0
+  %y = load i16, i16 addrspace(1)* %gep1
+  %z = load i16, i16 addrspace(1)* %gep2
+
+  %tmp0 = call i16 @smin16(i16 %x, i16 %y)
+  %tmp1 = call i16 @smax16(i16 %x, i16 %y)
+  %tmp2 = call i16 @smax16(i16 %tmp0, i16 %z)
+  %tmp3 = call i16 @smin16(i16 %tmp1, i16 %tmp2)
+  store i16 %tmp3, i16 addrspace(1)* %out.gep
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readnone alwaysinline }
diff --git a/test/CodeGen/AMDGPU/smrd-fold-offset.mir b/test/CodeGen/AMDGPU/smrd-fold-offset.mir
index 44954f0652313a4cf80f8790c248de28759c8972..10601ccaeb77dbcd1e4ad8bfeccce23a5f5cd39b 100644
--- a/test/CodeGen/AMDGPU/smrd-fold-offset.mir
+++ b/test/CodeGen/AMDGPU/smrd-fold-offset.mir
@@ -1,6 +1,8 @@
 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
 
-# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
+# GCN-LABEL: name: smrd_vgpr_offset_imm
+# GCN: V_READFIRSTLANE_B32
+# GCN: S_BUFFER_LOAD_DWORD_SGPR
 ---
 name:            smrd_vgpr_offset_imm
 body:             |
@@ -22,7 +24,9 @@ body:             |
     SI_RETURN_TO_EPILOG $vgpr0
 ...
 
-# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
+# GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
+# GCN: V_READFIRSTLANE_B32
+# GCN: S_BUFFER_LOAD_DWORD_SGPR
 ---
 name:            smrd_vgpr_offset_imm_add_u32
 body:             |
diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
index 3e0fd566cc7bb47cd37afc4ee86f28533c8b12c6..823785dc752dc17809e381543219fd47c42963b2 100644
--- a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -5,7 +5,7 @@
 ; GCN-FUNC: {{^}}vccz_workaround:
 ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0
 ; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0{{$}}
-; VCCZ-BUG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VCCZ-BUG: s_waitcnt lgkmcnt(0)
 ; VCCZ-BUG: s_mov_b64 vcc, vcc
 ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc
 ; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index c87145a1a5ba7a740dcc8be4a03ad6ad7893bcfe..22ee62ef427606039e07d2bc6a026cccd485b881 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -292,18 +292,19 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
 ; GCN-NEXT: %bb.
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
 define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
-  %off = add i32 %offset, 4095
+  %off = add i32 %offset, 4092
   %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
   ret float %r
 }
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
 ; GCN-NEXT: %bb.
-; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
+; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
 define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
   %off = add i32 %offset, 4096
@@ -332,7 +333,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_imm_merge_m0:
 ;
-; SICIVI: s_buffer_load_dwordx2
+; GCN: s_buffer_load_dwordx2
 ; SICIVI: s_mov_b32 m0
 ; SICIVI_DAG: v_interp_p1_f32
 ; SICIVI_DAG: v_interp_p1_f32
@@ -340,13 +341,15 @@ main_body:
 ; SICIVI_DAG: v_interp_p2_f32
 ; SICIVI_DAG: v_interp_p2_f32
 ; SICIVI_DAG: v_interp_p2_f32
-; SICIVI: s_mov_b32 m0
-; SICIVI: v_movrels_b32_e32
+;
+; extractelement does not result in movrels anymore for vectors gitting 8 dwords
+; SICIVI-NOT: s_mov_b32 m0
+; SICIVI-NOT: v_movrels_b32_e32
+; v_cndmask_b32_e32
+; v_cndmask_b32_e32
 ;
 ; Merging is still thwarted on GFX9 due to s_set_gpr_idx
 ;
-; GFX9: s_buffer_load_dword
-; GFX9: s_buffer_load_dword
 define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
 main_body:
   %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
@@ -510,12 +513,15 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}smrd_load_nonconst4:
-; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
-; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ;
-; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
 ; GCN: ; return to shader part epilog
 define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
 main_body:
@@ -526,12 +532,16 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}smrd_load_nonconst5:
-; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0
-; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0
-; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9: s_movk_i32 s4, 0xfc0
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
 ; GCN: ; return to shader part epilog
 define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
 main_body:
@@ -559,9 +569,10 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_uniform_loop:
 ;
-; TODO: this should use an s_buffer_load
+; TODO: we should keep the loop counter in an SGPR
 ;
-; GCN: buffer_load_dword
+; GCN: v_readfirstlane_b32
+; GCN: s_buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
 main_body:
   br label %loop
@@ -614,6 +625,68 @@ exit:
   ret float %sum.next
 }
 
+; This test checks that the load after some control flow with an offset based
+; on a divergent shader input is correctly recognized as divergent. This was
+; reduced from an actual regression. Yes, the %unused argument matters, as
+; well as the fact that %arg4 is a vector.
+;
+; GCN-LABEL: {{^}}arg_divergence:
+; GCN: buffer_load_dword v0, v0,
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: ; return to shader part epilog
+define amdgpu_cs float @arg_divergence(i32 inreg %unused, <3 x i32> %arg4) #0 {
+main_body:
+  br i1 undef, label %if1, label %endif1
+
+if1:                                              ; preds = %main_body
+  store i32 0, i32 addrspace(3)* undef, align 4
+  br label %endif1
+
+endif1:                                           ; preds = %if1, %main_body
+  %tmp13 = extractelement <3 x i32> %arg4, i32 0
+  %tmp97 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 %tmp13)
+  ret float %tmp97
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_f32:
+; GCN: s_buffer_load_dword s0, s[0:3], s4
+define amdgpu_ps void @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(float %sgpr)
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_v2f32:
+; GCN: s_buffer_load_dwordx2 s[0:1], s[0:3], s4
+define amdgpu_ps void @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(<2 x float> %sgpr)
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_v4f32:
+; GCN: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
+define amdgpu_ps void @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(<4 x float> %sgpr)
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_v8f32:
+; GCN: s_buffer_load_dwordx8 s[0:7], s[0:3], s4
+define amdgpu_ps void @s_buffer_load_v8f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(<8 x float> %sgpr)
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_v16f32:
+; GCN: s_buffer_load_dwordx16 s[0:15], s[0:3], s4
+define amdgpu_ps void @s_buffer_load_v16f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(<16 x float> %sgpr)
+  ret void
+}
 
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
@@ -625,6 +698,12 @@ declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
 declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)
 declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32)
 
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
+declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32)
+declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32)
+declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32)
+declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32)
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
index 32b4f158a011c31fc792444ab0ebc25f0e60e0b4..384042f5a6351f80ed1987bdbd0e68accb3fb501 100644
--- a/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
+++ b/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa-opencl -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s
 # https://bugs.llvm.org/show_bug.cgi?id=33620
 
 ---
@@ -10,7 +10,7 @@
 # CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $exec
 # CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
 # CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
-# CHECK-NEXT: dead %2:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $exec
+# CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $exec
 
 # CHECK: S_NOP 0, implicit %6.sub1
 # CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
@@ -19,20 +19,16 @@
 
 name: expecting_non_empty_interval
 tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_64, preferred-register: '' }
-  - { id: 1, class: vgpr_32, preferred-register: '' }
-  - { id: 2, class: vgpr_32, preferred-register: '' }
-  - { id: 3, class: vreg_64, preferred-register: '' }
 body:             |
   bb.0:
     successors: %bb.1
-    undef %0.sub1 = V_MAC_F32_e32 0, undef %1, undef %0.sub1, implicit $exec
-    undef %3.sub1 = V_MOV_B32_e32 1786773504, implicit $exec
-    dead %2 = V_MUL_F32_e32 0, %3.sub1, implicit $exec
+
+    undef %0.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %0.sub1, implicit $exec
+    undef %2.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
+    dead %3:vgpr_32 = V_MUL_F32_e32 0, %2.sub1, implicit $exec
 
   bb.1:
-    S_NOP 0, implicit %3.sub1
+    S_NOP 0, implicit %2.sub1
     S_NOP 0, implicit %0.sub1
     S_NOP 0, implicit undef %0.sub0
 
@@ -44,29 +40,24 @@ body:             |
 # CHECK-LABEL: name: rematerialize_empty_interval_has_reference
 
 # CHECK-NOT: MOV
-# CHECK: undef %3.sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec
+# CHECK: undef %1.sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec
 
 # CHECK: bb.1:
-# CHECK-NEXT: S_NOP 0, implicit %3.sub2
-# CHECK-NEXT: S_NOP 0, implicit undef %6.sub0
-# CHECK-NEXT: undef %4.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec
-# CHECK-NEXT: S_NOP 0, implicit %4.sub2
+# CHECK-NEXT: S_NOP 0, implicit %1.sub2
+# CHECK-NEXT: S_NOP 0, implicit undef %4.sub0
+# CHECK-NEXT: undef %2.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: S_NOP 0, implicit %2.sub2
 name: rematerialize_empty_interval_has_reference
 tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_128, preferred-register: '' }
-  - { id: 1, class: vgpr_32, preferred-register: '' }
-  - { id: 2, class: vgpr_32, preferred-register: '' }
-  - { id: 3, class: vreg_128, preferred-register: '' }
 body:             |
   bb.0:
     successors: %bb.1
 
-    undef %0.sub2 = V_MOV_B32_e32 0, implicit $exec
-    undef %3.sub2 = V_MOV_B32_e32 1786773504, implicit $exec
+    undef %0.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+    undef %1.sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec
 
   bb.1:
-    S_NOP 0, implicit %3.sub2
+    S_NOP 0, implicit %1.sub2
     S_NOP 0, implicit undef %0.sub0
     S_NOP 0, implicit %0.sub2
 
diff --git a/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
index 68c8b31a95f5578f7b12ea0fe3588d1b5f942631..559c0d438ef00e1f27cd1fcd80b48c399a65ce9d 100644
--- a/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
+++ b/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
@@ -17,18 +17,12 @@
 
 name: no_merge_sgpr_vgpr_spill_slot
 tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32 }
-  - { id: 1, class: sreg_32_xm0_xexec }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: sreg_32_xm0_xexec }
-
 body: |
   bb.0:
-    %0 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec
-    %2 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec
+      %0:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec
+    %2:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec
     S_NOP 0, implicit %0
-    %1 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0
-    %3 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0
+    %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0
     S_NOP 0, implicit %1
 ...
diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll
index 5040ab3bc29a9719349b6b38720951c82ac5bae2..50f6b81d7ac71cadbc229b1fd9b9fea9b15cf5a8 100644
--- a/test/CodeGen/AMDGPU/store-global.ll
+++ b/test/CodeGen/AMDGPU/store-global.ll
@@ -273,8 +273,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}store_v3i32:
-; SIVI-DAG: buffer_store_dwordx2
-; SIVI-DAG: buffer_store_dword v
+; SIVI-DAG: buffer_store_dwordx3
 
 ; GFX9-DAG: global_store_dwordx2
 ; GFX9-DAG: global_store_dword v
diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll
index da905b82ab4baf636243e2484a4875673a4aea82..534347f3fae96b0674e938d22151ce9044c5884d 100644
--- a/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -89,8 +89,7 @@ define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
-; GCN-DAG: buffer_store_dwordx2
-; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx3
 define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i32>
   store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/store-weird-sizes.ll b/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 5bcf78f11ac5724075a365168b2e7f4edb148b04..13380e03e32f1ba5fc497b960262928b9e851101 100644
--- a/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -1,64 +1,235 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,HAWAII %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FIJI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-; GCN-LABEL: {{^}}local_store_i56:
-; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
-
-; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
-; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
-
-
 define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
+; CIVI-LABEL: local_store_i56:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_mov_b32 m0, -1
+; CIVI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; CIVI-NEXT:    ds_write_b16 v0, v2 offset:4
+; CIVI-NEXT:    ds_write_b32 v0, v1
+; CIVI-NEXT:    ds_write_b8 v0, v3 offset:6
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: local_store_i56:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
+; GFX9-NEXT:    ds_write_b16 v0, v2 offset:4
+; GFX9-NEXT:    ds_write_b32 v0, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   store i56 %arg, i56 addrspace(3)* %ptr, align 8
   ret void
 }
 
-; GCN-LABEL: {{^}}local_store_i55:
-; CIVI-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:6
-; CIVI-DAG: ds_write_b16 v{{[0-9]+}}, v{{[0-9]+}} offset:4
-; CIVI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+$}}
-
-; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
-; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
+; HAWAII-LABEL: local_store_i55:
+; HAWAII:       ; %bb.0:
+; HAWAII-NEXT:    s_add_u32 s0, s4, 14
+; HAWAII-NEXT:    s_addc_u32 s1, s5, 0
+; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
+; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
+; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x0
+; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x3
+; HAWAII-NEXT:    s_mov_b32 m0, -1
+; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s0
+; HAWAII-NEXT:    v_mov_b32_e32 v3, s1
+; HAWAII-NEXT:    v_mov_b32_e32 v2, s2
+; HAWAII-NEXT:    ds_write_b16 v1, v2 offset:4
+; HAWAII-NEXT:    s_waitcnt vmcnt(0)
+; HAWAII-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; HAWAII-NEXT:    ds_write_b8 v1, v0 offset:6
+; HAWAII-NEXT:    ds_write_b32 v1, v3
+; HAWAII-NEXT:    s_endpgm
+;
+; FIJI-LABEL: local_store_i55:
+; FIJI:       ; %bb.0:
+; FIJI-NEXT:    s_add_u32 s0, s4, 14
+; FIJI-NEXT:    s_addc_u32 s1, s5, 0
+; FIJI-NEXT:    v_mov_b32_e32 v0, s0
+; FIJI-NEXT:    v_mov_b32_e32 v1, s1
+; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
+; FIJI-NEXT:    s_load_dword s0, s[4:5], 0x0
+; FIJI-NEXT:    s_load_dword s1, s[4:5], 0x8
+; FIJI-NEXT:    s_load_dword s2, s[4:5], 0xc
+; FIJI-NEXT:    s_mov_b32 m0, -1
+; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
+; FIJI-NEXT:    v_mov_b32_e32 v1, s0
+; FIJI-NEXT:    v_mov_b32_e32 v3, s1
+; FIJI-NEXT:    v_mov_b32_e32 v2, s2
+; FIJI-NEXT:    ds_write_b16 v1, v2 offset:4
+; FIJI-NEXT:    s_waitcnt vmcnt(0)
+; FIJI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; FIJI-NEXT:    ds_write_b8 v1, v0 offset:6
+; FIJI-NEXT:    ds_write_b32 v1, v3
+; FIJI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_store_i55:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_load_ubyte_d16_hi v0, v[0:1], off offset:14
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    ds_write_b16 v1, v2 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f0000, v0
+; GFX9-NEXT:    ds_write_b8_d16_hi v1, v0 offset:6
+; GFX9-NEXT:    ds_write_b32 v1, v3
+; GFX9-NEXT:    s_endpgm
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
   ret void
 }
 
-; GCN-LABEL: {{^}}local_store_i48:
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
 define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 {
+; HAWAII-LABEL: local_store_i48:
+; HAWAII:       ; %bb.0:
+; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x0
+; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x3
+; HAWAII-NEXT:    s_mov_b32 m0, -1
+; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
+; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
+; HAWAII-NEXT:    v_mov_b32_e32 v2, s2
+; HAWAII-NEXT:    ds_write_b16 v0, v2 offset:4
+; HAWAII-NEXT:    ds_write_b32 v0, v1
+; HAWAII-NEXT:    s_endpgm
+;
+; FIJI-LABEL: local_store_i48:
+; FIJI:       ; %bb.0:
+; FIJI-NEXT:    s_load_dword s0, s[4:5], 0x0
+; FIJI-NEXT:    s_load_dword s1, s[4:5], 0x8
+; FIJI-NEXT:    s_load_dword s2, s[4:5], 0xc
+; FIJI-NEXT:    s_mov_b32 m0, -1
+; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
+; FIJI-NEXT:    v_mov_b32_e32 v0, s0
+; FIJI-NEXT:    v_mov_b32_e32 v1, s1
+; FIJI-NEXT:    v_mov_b32_e32 v2, s2
+; FIJI-NEXT:    ds_write_b16 v0, v2 offset:4
+; FIJI-NEXT:    ds_write_b32 v0, v1
+; FIJI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_store_i48:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    ds_write_b16 v0, v2 offset:4
+; GFX9-NEXT:    ds_write_b32 v0, v1
+; GFX9-NEXT:    s_endpgm
   store i48 %arg, i48 addrspace(3)* %ptr, align 8
   ret void
 }
 
-; GCN-LABEL: {{^}}local_store_i65:
-; GCN-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:8
-; GCN-DAG: ds_write_b64
 define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 {
+; HAWAII-LABEL: local_store_i65:
+; HAWAII:       ; %bb.0:
+; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x0
+; HAWAII-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; HAWAII-NEXT:    s_load_dword s3, s[4:5], 0x4
+; HAWAII-NEXT:    s_mov_b32 m0, -1
+; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
+; HAWAII-NEXT:    v_mov_b32_e32 v2, s2
+; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
+; HAWAII-NEXT:    s_and_b32 s0, s3, 1
+; HAWAII-NEXT:    v_mov_b32_e32 v3, s0
+; HAWAII-NEXT:    ds_write_b8 v2, v3 offset:8
+; HAWAII-NEXT:    ds_write_b64 v2, v[0:1]
+; HAWAII-NEXT:    s_endpgm
+;
+; FIJI-LABEL: local_store_i65:
+; FIJI:       ; %bb.0:
+; FIJI-NEXT:    s_load_dword s2, s[4:5], 0x0
+; FIJI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; FIJI-NEXT:    s_load_dword s3, s[4:5], 0x10
+; FIJI-NEXT:    s_mov_b32 m0, -1
+; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
+; FIJI-NEXT:    v_mov_b32_e32 v2, s2
+; FIJI-NEXT:    v_mov_b32_e32 v0, s0
+; FIJI-NEXT:    v_mov_b32_e32 v1, s1
+; FIJI-NEXT:    s_and_b32 s0, s3, 1
+; FIJI-NEXT:    v_mov_b32_e32 v3, s0
+; FIJI-NEXT:    ds_write_b8 v2, v3 offset:8
+; FIJI-NEXT:    ds_write_b64 v2, v[0:1]
+; FIJI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_store_i65:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x10
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_and_b32 s0, s3, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    ds_write_b8 v2, v3 offset:8
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    s_endpgm
   store i65 %arg, i65 addrspace(3)* %ptr, align 8
   ret void
 }
 
-; GCN-LABEL: {{^}}local_store_i13:
-; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x1fff, v1
-; GCN: ds_write_b16 v0, [[TRUNC]]
 define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 {
+; CIVI-LABEL: local_store_i13:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
+; CIVI-NEXT:    s_mov_b32 m0, -1
+; CIVI-NEXT:    ds_write_b16 v0, v1
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: local_store_i13:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
+; GFX9-NEXT:    ds_write_b16 v0, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   store i13 %arg, i13 addrspace(3)* %ptr, align 8
   ret void
 }
 
-; GCN-LABEL: {{^}}local_store_i17:
-; GCN: ds_write_b16 v0
-; CIVI: ds_write_b8 v0, v{{[0-9]+}} offset:2
-; GFX9: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:2
 define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
+; CIVI-LABEL: local_store_i17:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; CIVI-NEXT:    s_mov_b32 m0, -1
+; CIVI-NEXT:    ds_write_b16 v0, v1
+; CIVI-NEXT:    ds_write_b8 v0, v2 offset:2
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: local_store_i17:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b16 v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x1ffff, v1
+; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   store i17 %arg, i17 addrspace(3)* %ptr, align 8
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
index 4458a049618e723ea44da615544865c8b4d84ddf..26228dfb1c34e200d1cb2f3dd8f3a3fd439ee5fb 100644
--- a/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
+++ b/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
@@ -102,6 +102,13 @@ body: |
 
   bb.7:
     successors: %bb.13(0x80000000)
+
+    ; In reality we are checking that this code doesn't assert when splitting
+    ; and inserting a spill. Here we just check that the point where the error
+    ; occurs we see a correctly generated spill.
+    ; GCN-LABEL: bb.7:
+    ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec
+
     undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec
     %15.sub1:vreg_128 = COPY %15.sub0
     %15.sub2:vreg_128 = COPY %15.sub0
@@ -114,6 +121,10 @@ body: |
 
   bb.9:
     successors: %bb.12(0x80000000)
+
+    ; GCN-LABEL: bb.9:
+    ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec
+
     undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec
     %15.sub1:vreg_128 = COPY %15.sub0
     %15.sub2:vreg_128 = COPY %15.sub0
@@ -121,6 +132,10 @@ body: |
 
   bb.10:
     successors: %bb.12(0x80000000)
+
+    ; GCN-LABEL: bb.10:
+    ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec
+
     undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec
     %15.sub1:vreg_128 = COPY %15.sub0
     %15.sub2:vreg_128 = COPY %15.sub0
@@ -143,12 +158,6 @@ body: |
   bb.13:
     successors: %bb.15(0x40000000), %bb.14(0x40000000)
 
-    ; In reality we are checking that this code doesn't assert when splitting
-    ; and inserting a spill. Here we just check that the point where the error
-    ; occurs we see a correctly generated spill.
-    ; GCN-LABEL: bb.13:
-    ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec
-
     %18:vgpr_32 = V_MAD_F32 0, %10.sub0, 0, target-flags(amdgpu-gotprel) 1073741824, 0, -1082130432, 0, 0, implicit $exec
     %19:vgpr_32 = V_MAD_F32 0, %12.sub0, 0, target-flags(amdgpu-gotprel) 0, 0, 0, 0, 0, implicit $exec
     %20:sreg_128 = S_BUFFER_LOAD_DWORDX4_IMM undef %21:sreg_128, 1040, 0 :: (dereferenceable invariant load 16)
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index 0241490de7a5a51411a1adf87d0fa312c5835c1d..1c92f74ec4bdf6224e333d6db0701e96535d06a3 100644
--- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -51,8 +51,7 @@ define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x
 ;   t21: v2i32,ch = load<LD8[%in(addrspace=1)]> t12, t10, undef:i64
 ;        t23: i64 = bitcast t21
 ;      t30: i16 = truncate t23
-; SI: buffer_load_dword v[[VAL:[0-9]+]]
-; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
+; GCN: buffer_load_dword v[[VAL:[0-9]+]]
 ; GCN: buffer_store_short v[[VAL]], off
 define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll
index 071fc07ea597db0eb2d325f683c11d50020cfd3b..2f6685921df84d40444bffefb2bc9624b2393f7f 100644
--- a/test/CodeGen/AMDGPU/umed3.ll
+++ b/test/CodeGen/AMDGPU/umed3.ll
@@ -363,6 +363,212 @@ bb:
   ret void
 }
 
+; 16 combinations
+
+; 16: min(max(x, y), max(min(x, y), z))
+; 17: min(max(x, y), max(min(y, x), z))
+; 18: min(max(x, y), max(z, min(x, y)))
+; 19: min(max(x, y), max(z, min(y, x)))
+; 20: min(max(y, x), max(min(x, y), z))
+; 21: min(max(y, x), max(min(y, x), z))
+; 22: min(max(y, x), max(z, min(x, y)))
+; 23: min(max(y, x), max(z, min(y, x)))
+;
+; + commute outermost min
+
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_16:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_16(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %x, i32 %y)
+  %tmp1 = call i32 @umax(i32 %x, i32 %y)
+  %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_17:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_17(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %y, i32 %x)
+  %tmp1 = call i32 @umax(i32 %x, i32 %y)
+  %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_18:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_18(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %x, i32 %y)
+  %tmp1 = call i32 @umax(i32 %x, i32 %y)
+  %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_19:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_19(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %y, i32 %x)
+  %tmp1 = call i32 @umax(i32 %x, i32 %y)
+  %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_20:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_20(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %x, i32 %y)
+  %tmp1 = call i32 @umax(i32 %y, i32 %x)
+  %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_21:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_21(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %y, i32 %x)
+  %tmp1 = call i32 @umax(i32 %y, i32 %x)
+  %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_22:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_22(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %x, i32 %y)
+  %tmp1 = call i32 @umax(i32 %y, i32 %x)
+  %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_23:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_23(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %y, i32 %x)
+  %tmp1 = call i32 @umax(i32 %y, i32 %x)
+  %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_24:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_24(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %x, i32 %y)
+  %tmp1 = call i32 @umax(i32 %x, i32 %y)
+  %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_25:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_25(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %y, i32 %x)
+  %tmp1 = call i32 @umax(i32 %x, i32 %y)
+  %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_26:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_26(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %x, i32 %y)
+  %tmp1 = call i32 @umax(i32 %x, i32 %y)
+  %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_27:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_27(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %y, i32 %x)
+  %tmp1 = call i32 @umax(i32 %x, i32 %y)
+  %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_28:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_28(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %x, i32 %y)
+  %tmp1 = call i32 @umax(i32 %y, i32 %x)
+  %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_29:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_29(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %y, i32 %x)
+  %tmp1 = call i32 @umax(i32 %y, i32 %x)
+  %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
+  %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_30:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_30(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %x, i32 %y)
+  %tmp1 = call i32 @umax(i32 %y, i32 %x)
+  %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_31:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_test_umed3_i32_pat_31(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+  %tmp0 = call i32 @umin(i32 %y, i32 %x)
+  %tmp1 = call i32 @umax(i32 %y, i32 %x)
+  %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
+  %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
+  store i32 %tmp3, i32 addrspace(1)* %arg
+  ret void
+}
+
 ; GCN-LABEL: {{^}}s_test_umed3_i16_pat_0:
 ; GCN: s_and_b32
 ; GCN: s_and_b32
@@ -510,6 +716,27 @@ bb:
   ret void
 }
 
+; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1:
+; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_umed3_i16_pat_1(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
+  %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
+  %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
+  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %x = load i16, i16 addrspace(1)* %gep0
+  %y = load i16, i16 addrspace(1)* %gep1
+  %z = load i16, i16 addrspace(1)* %gep2
+
+  %tmp0 = call i16 @umin16(i16 %x, i16 %y)
+  %tmp1 = call i16 @umax16(i16 %x, i16 %y)
+  %tmp2 = call i16 @umax16(i16 %tmp0, i16 %z)
+  %tmp3 = call i16 @umin16(i16 %tmp1, i16 %tmp2)
+  store i16 %tmp3, i16 addrspace(1)* %out.gep
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readnone alwaysinline }
diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 3dfc2a39cbcee90bdbaab3379248f43c5b600989..edcdf4cb62923503515ccb41845b2aff9bc9b14d 100644
--- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -47,7 +47,7 @@
 ---
 # CHECK-LABEL: name: vccz_corrupt_workaround
 # CHECK: $vcc = V_CMP_EQ_F32
-# CHECK-NEXT: S_WAITCNT 0
+# CHECK-NEXT: S_WAITCNT 127
 # CHECK-NEXT: $vcc = S_MOV_B64 $vcc
 # CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit killed $vcc
 
diff --git a/test/CodeGen/AMDGPU/vector-extract-insert.ll b/test/CodeGen/AMDGPU/vector-extract-insert.ll
index e3027d09309988246bd074f3c61b5146dbc2461d..5bb4159ee9f845e90ca2ccef8dfe5bb3d62da9fe 100644
--- a/test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ b/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -27,8 +27,13 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}extract_insert_different_dynelt_v4i32:
 ; GCN: buffer_load_dwordx4
-; GCN: v_movreld_b32
-; GCN: v_movrels_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
 ; GCN: buffer_store_dword v
 define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/verifier-pseudo-terminators.mir b/test/CodeGen/AMDGPU/verifier-pseudo-terminators.mir
new file mode 100644
index 0000000000000000000000000000000000000000..d9e8aa71f32aebf95ada14bf04e0f76f85e85861
--- /dev/null
+++ b/test/CodeGen/AMDGPU/verifier-pseudo-terminators.mir
@@ -0,0 +1,23 @@
+# RUN: not llc -march=amdgcn -run-pass=verify %s 2>&1 | FileCheck %s
+# Make sure that mismatched successors are caught when a _term
+# instruction is used
+
+# CHECK: *** Bad machine code: MBB exits via unconditional branch but the CFG successor doesn't match the actual successor! ***
+
+---
+name: verifier_pseudo_terminators
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    %0:sreg_64 = S_XOR_B64_term undef %1:sreg_64, undef %2:sreg_64, implicit-def $scc
+    $exec = S_MOV_B64_term %0
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_SETPC_B64_return undef $sgpr30_sgpr31
+
+  bb.2:
+    S_SETPC_B64_return undef $sgpr30_sgpr31
+
+...
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index 32607c75e67db09d5c52dc0537e0276765f309ee..bcebc9f2d8b3c82c59bc3ff636d7af531169c901 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
-; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
-; RUN: llc -march=amdgcn  -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn  -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
 ; This ends up using all 256 registers and requires register
 ; scavenging which will fail to find an unsued register.
diff --git a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
index 8d66c346ed5b8314c64488ef5eb08f2817f77bfe..d6b3f035a3e3b97534adff38440efb033466c491 100644
--- a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
@@ -17,7 +17,7 @@ attributes #1 = { nounwind }
 !llvm.module.flags = !{!2, !3}
 
 !0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
-!1 = !DIFile(filename: "foo.cl", directory: "/dev/null")
+!1 = !DIFile(filename: "foo.cl", directory: ".")
 !2 = !{i32 2, !"Dwarf Version", i32 4}
 !3 = !{i32 2, !"Debug Info Version", i32 3}
 !4 = !DILocation(line: 1, column: 42, scope: !5)
diff --git a/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir b/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b0ca67febbf8ac384d012b1251cd14f098bb9e87
--- /dev/null
+++ b/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
@@ -0,0 +1,47 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s
+
+# GCN-LABEL: name: irreducible_loop{{$}}
+# GCN: S_LOAD_DWORDX4_IMM
+# GCN: S_WAITCNT 127{{$}}
+# GCN: S_BUFFER_LOAD_DWORD_IMM
+# GCN: S_WAITCNT 127{{$}}
+# GCN: S_CMP_GE_I32
+--- |
+
+  define amdgpu_ps void @irreducible_loop() {
+  main:
+    ret void
+  }
+
+...
+---
+name:            irreducible_loop
+body:             |
+  bb.0:
+    successors: %bb.3, %bb.2
+
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.1:
+    successors: %bb.3, %bb.2
+
+    S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+
+  bb.2:
+    successors: %bb.3
+
+    renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0
+    renamable $sgpr3 = S_BUFFER_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0
+
+  bb.3:
+    successors: %bb.1, %bb.4
+
+    S_CMP_GE_I32 renamable $sgpr2, renamable $sgpr3, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.1, implicit killed $scc
+
+  bb.4:
+
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/waitcnt-preexisting.mir b/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
new file mode 100644
index 0000000000000000000000000000000000000000..211e8bce939353abc04c02cfce7e8e26db7ab37f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
@@ -0,0 +1,37 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s
+
+# GCN-LABEL: name: test{{$}}
+# GCN: S_WAITCNT -16257
+# GGN: DS_READ2_B32
+# GGN: DS_READ2_B32
+# GCN: S_WAITCNT 383{{$}}
+# GCN-NEXT: $vgpr1 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+# GCN-NEXT: S_WAITCNT 127{{$}}
+# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
+--- |
+  define amdgpu_cs void @test() {
+    ret void
+  }
+...
+---
+name:            test
+body:             |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $vgpr0
+
+    renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 480, 0
+    renamable $vgpr13 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
+    S_WAITCNT -16257
+    renamable $vgpr0_vgpr1 = DS_READ2_B32 renamable $vgpr13, 0, 1, 0, implicit $m0, implicit $exec
+    renamable $vgpr2_vgpr3 = DS_READ2_B32 renamable $vgpr13, 2, 3, 0, implicit $m0, implicit $exec
+    renamable $vgpr1 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec
+    renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+    renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
+    renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
+    IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll
index 81232df43d176ddd083073cfcc59d5315dccf077..3d1818368487d44978863c15a8575119eaa27fca 100644
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -586,7 +586,8 @@ main_body:
   %data.0 = extractelement <2 x float> %data, i32 0
   call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)
 
-  call void @llvm.AMDGPU.kill(float %z)
+  %z.cmp = fcmp olt float %z, 0.0
+  call void @llvm.amdgcn.kill(i1 %z.cmp)
 
   %idx.1 = extractelement <2 x i32> %idx, i32 1
   %data.1 = extractelement <2 x float> %data, i32 1
@@ -619,7 +620,8 @@ main_body:
 
   call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
 
-  call void @llvm.AMDGPU.kill(float %z)
+  %z.cmp = fcmp olt float %z, 0.0
+  call void @llvm.amdgcn.kill(i1 %z.cmp)
 
   ret <4 x float> %dtex
 }
@@ -826,7 +828,7 @@ declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i3
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
-declare void @llvm.AMDGPU.kill(float) #1
+declare void @llvm.amdgcn.kill(i1) #1
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
 declare float @llvm.amdgcn.wwm.f32(float) #3
diff --git a/test/CodeGen/AMDGPU/xnor.ll b/test/CodeGen/AMDGPU/xnor.ll
index 0371cc68f040956f0ad41f5fc5a1fa2fa3c4fda6..716bcb5922e9a52d49ad62843d573ddeb34cd5d8 100644
--- a/test/CodeGen/AMDGPU/xnor.ll
+++ b/test/CodeGen/AMDGPU/xnor.ll
@@ -61,8 +61,8 @@ entry:
 
 ; GCN-LABEL: {{^}}vector_xnor_i32_one_use
 ; GCN-NOT: s_xnor_b32
-; GCN: v_xor_b32
 ; GCN: v_not_b32
+; GCN: v_xor_b32
 ; GCN-DL: v_xnor_b32
 define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) {
 entry:
@@ -73,10 +73,10 @@ entry:
 
 ; GCN-LABEL: {{^}}vector_xnor_i64_one_use
 ; GCN-NOT: s_xnor_b64
-; GCN: v_xor_b32
-; GCN: v_xor_b32
 ; GCN: v_not_b32
 ; GCN: v_not_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {
@@ -85,3 +85,114 @@ entry:
   %r = xor i64 %xor, -1
   ret i64 %r
 }
+
+; GCN-LABEL: {{^}}xnor_s_v_i32_one_use
+; GCN-NOT: s_xnor_b32
+; GCN: s_not_b32
+; GCN: v_xor_b32
+define amdgpu_kernel void @xnor_s_v_i32_one_use(i32 addrspace(1)* %out, i32 %s) {
+  %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %xor = xor i32 %s, %v
+  %d = xor i32 %xor, -1
+  store i32 %d, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}xnor_v_s_i32_one_use
+; GCN-NOT: s_xnor_b32
+; GCN: s_not_b32
+; GCN: v_xor_b32
+define amdgpu_kernel void @xnor_v_s_i32_one_use(i32 addrspace(1)* %out, i32 %s) {
+  %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %xor = xor i32 %v, %s
+  %d = xor i32 %xor, -1
+  store i32 %d, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}xnor_i64_s_v_one_use
+; GCN-NOT: s_xnor_b64
+; GCN: s_not_b64
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN-DL: v_xnor_b32
+; GCN-DL: v_xnor_b32
+define amdgpu_kernel void @xnor_i64_s_v_one_use(
+  i64 addrspace(1)* %r0, i64 %a) {
+entry:
+  %b32 = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %b64 = zext i32 %b32 to i64
+  %b = shl i64 %b64, 29
+  %xor = xor i64 %a, %b
+  %r0.val = xor i64 %xor, -1
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}xnor_i64_v_s_one_use
+; GCN-NOT: s_xnor_b64
+; GCN: s_not_b64
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN-DL: v_xnor_b32
+; GCN-DL: v_xnor_b32
+define amdgpu_kernel void @xnor_i64_v_s_one_use(
+  i64 addrspace(1)* %r0, i64 %a) {
+entry:
+  %b32 = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %b64 = zext i32 %b32 to i64
+  %b = shl i64 %b64, 29
+  %xor = xor i64 %b, %a
+  %r0.val = xor i64 %xor, -1
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_xor_na_b_i32_one_use
+; GCN-NOT: s_xnor_b32
+; GCN: v_not_b32
+; GCN: v_xor_b32
+; GCN-DL: v_xnor_b32
+define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) {
+entry:
+  %na = xor i32 %a, -1
+  %r = xor i32 %na, %b
+  ret i32 %r
+}
+
+; GCN-LABEL: {{^}}vector_xor_a_nb_i32_one_use
+; GCN-NOT: s_xnor_b32
+; GCN: v_not_b32
+; GCN: v_xor_b32
+; GCN-DL: v_xnor_b32
+define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) {
+entry:
+  %nb = xor i32 %b, -1
+  %r = xor i32 %a, %nb
+  ret i32 %r
+}
+
+; GCN-LABEL: {{^}}scalar_xor_a_nb_i64_one_use
+; GCN: s_xnor_b64
+define amdgpu_kernel void @scalar_xor_a_nb_i64_one_use(
+    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+  %nb = xor i64 %b, -1
+  %r0.val = xor i64 %a, %nb
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_xor_na_b_i64_one_use
+; GCN: s_xnor_b64
+define amdgpu_kernel void @scalar_xor_na_b_i64_one_use(
+    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+  %na = xor i64 %a, -1
+  %r0.val = xor i64 %na, %b
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/test/CodeGen/AMDGPU/xor_add.ll b/test/CodeGen/AMDGPU/xor_add.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1a13fc51d44a3fa6ecef31385e281642b5d15719
--- /dev/null
+++ b/test/CodeGen/AMDGPU/xor_add.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+
+; ===================================================================================
+; V_XAD_U32
+; ===================================================================================
+
+define amdgpu_ps float @xor_add(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: xor_add:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_xor_b32_e32 v0, v0, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: xor_add:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_xad_u32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = xor i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+; ThreeOp instruction variant not used due to Constant Bus Limitations
+define amdgpu_ps float @xor_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
+; VI-LABEL: xor_add_vgpr_a:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s3, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: xor_add_vgpr_a:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = xor i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @xor_add_vgpr_all(i32 %a, i32 %b, i32 %c) {
+; VI-LABEL: xor_add_vgpr_all:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_xor_b32_e32 v0, v0, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: xor_add_vgpr_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_xad_u32 v0, v0, v1, v2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = xor i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @xor_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
+; VI-LABEL: xor_add_vgpr_ab:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_xor_b32_e32 v0, v0, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: xor_add_vgpr_ab:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_xad_u32 v0, v0, v1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = xor i32 %a, %b
+  %result = add i32 %x, %c
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
+
+define amdgpu_ps float @xor_add_vgpr_const(i32 %a, i32 %b) {
+; VI-LABEL: xor_add_vgpr_const:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_xor_b32_e32 v0, 3, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: xor_add_vgpr_const:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_xad_u32 v0, v0, 3, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %x = xor i32 %a, 3
+  %result = add i32 %x, %b
+  %bc = bitcast i32 %result to float
+  ret float %bc
+}
diff --git a/test/CodeGen/ARM/CGP/arm-cgp-calls.ll b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
index 10cd6671ffc470844b8afdaee8665a874195775b..4c95ade534dc070a17f6e6b619f231e015d1360b 100644
--- a/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
@@ -22,12 +22,10 @@ define i16 @test_call(i8 zeroext %arg) {
   ret i16 %conv
 }
 
-; Test that the transformation bails when it finds that i16 is larger than i8.
-; TODO: We should be able to remove the uxtb in these cases.
 ; CHECK-LABEL: promote_i8_sink_i16_1
 ; CHECK: bl dummy_i8
 ; CHECK: add{{.*}} r0, #1
-; CHECK: uxtb r0, r0
+; CHECK-NOT: uxt
 ; CHECK: cmp r0
 define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) {
   %call = tail call zeroext i8 @dummy_i8(i8 %arg0)
@@ -179,10 +177,46 @@ entry:
   ret i1 %tobool
 }
 
+; CHECK-LABEL: i1_zeroext_call
+; CHECK: uxt
+define i1 @i1_zeroext_call(i16* %ts, i32 %a, i16* %b, i8* %c) {
+entry:
+  %0 = load i16, i16* %ts, align 2
+  %conv.i860 = trunc i32 %a to i16
+  store i16 %conv.i860, i16* %b, align 2
+  %call.i848 = call zeroext i1 @i1_zeroext(i8* %c, i32 64, i16 zeroext %conv.i860)
+  br i1 %call.i848, label %if.then223, label %if.else227
+
+if.then223:
+  %cmp235 = icmp eq i16 %0, %conv.i860
+  br label %exit
+
+if.else227:
+  %cmp236 = icmp ult i16 %0, %conv.i860
+  br label %exit
+
+exit:
+  %retval = phi i1 [ %cmp235, %if.then223 ], [ %cmp236, %if.else227 ]
+  ret i1 %retval
+}
+
+; CHECK-LABEL: promote_arg_pass_to_call
+; CHECK-NOT: uxt
+define i16 @promote_arg_pass_to_call(i16 zeroext %arg1, i16 zeroext %arg2) {
+  %conv = add nuw i16 %arg1, 15
+  %mul = mul nuw nsw i16 %conv, 3
+  %cmp = icmp ult i16 %mul, %arg2
+  %trunc = trunc i16 %arg1 to i8
+  %res = call zeroext i16 @dummy4(i1 %cmp, i8 %trunc, i16 %arg1)
+  ret i16 %res
+}
+
+
 declare i32 @assert(...)
 declare i8 @dummy_i8(i8)
 declare i8 @dummy2(i8*, i8, i8)
 declare i16 @dummy3(i16)
+declare i16 @dummy4(i1, i8, i16)
 
 declare dso_local i32 @e(...) local_unnamed_addr #1
 declare dso_local zeroext i16 @f(...) local_unnamed_addr #1
@@ -193,3 +227,4 @@ declare fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %si2)
 declare dso_local fastcc i64 @safe_sub_func_int64_t_s_s(i64, i64)
 declare dso_local fastcc zeroext i8 @safe_lshift_func(i8 zeroext, i32)
 declare dso_local fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 returned zeroext)
+declare i1 @i1_zeroext(i8*, i32, i16 zeroext)
diff --git a/test/CodeGen/ARM/CGP/arm-cgp-casts.ll b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
index 431846482c60c9622ef96250294841ed5296009f..273fea370a1f647dbf05e639b660bdb9228d1f24 100644
--- a/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
@@ -104,9 +104,7 @@ entry:
 
 ; CHECK-COMMON-LABEL: or_icmp_ugt:
 ; CHECK-COMMON:     ldrb
-; CHECK-COMMON:     sub.w
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON:     cmp.w
+; CHECK-COMMON:     subs.w
 ; CHECK-COMMON-NOT: uxt
 ; CHECK-COMMON:     cmp
 define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) {
@@ -122,36 +120,6 @@ entry:
   ret i1 %or
 }
 
-; CHECK-COMMON-LABEL: icmp_switch_trunc:
-; CHECK-COMMON-NOT: uxt
-define i16 @icmp_switch_trunc(i16 zeroext %arg) {
-entry:
-  %conv = add nuw i16 %arg, 15
-  %mul = mul nuw nsw i16 %conv, 3
-  %trunc = trunc i16 %arg to i3
-  switch i3 %trunc, label %default [
-    i3 0, label %sw.bb
-    i3 1, label %sw.bb.i
-  ]
-
-sw.bb:
-  %cmp0 = icmp ult i16 %mul, 127
-  %select = select i1 %cmp0, i16 %mul, i16 127
-  br label %exit
-
-sw.bb.i:
-  %cmp1 = icmp ugt i16 %mul, 34
-  %select.i = select i1 %cmp1, i16 %mul, i16 34
-  br label %exit
-
-default:
-  br label %exit
-
-exit:
-  %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
-  ret i16 %res
-}
-
 ; We currently only handle truncs as sinks, so a uxt will still be needed for
 ; the icmp ugt instruction.
 ; CHECK-COMMON-LABEL: urem_trunc_icmps
@@ -187,47 +155,6 @@ exit:
   ret void
 }
 
-; CHECK-COMMON-LABEL: phi_feeding_switch
-; CHECK-COMMON: ldrb
-; CHECK-COMMON: uxtb
-define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) {
-entry:
-  %pre = load i8, i8* %memblock, align 1
-  %conv = trunc i16 %arg to i8
-  br label %header
-
-header:
-  %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ]
-  %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ]
-  %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ]
-  switch i8 %phi.0, label %default [
-    i8 43, label %for.inc.i
-    i8 45, label %for.inc.i.i
-  ]
-
-for.inc.i:
-  %xor = xor i8 %phi.1, 1
-  br label %latch
-
-for.inc.i.i:
-  %and = and i8 %phi.1, 3
-  br label %latch
-
-default:
-  %sub = sub i8 %phi.0, 1
-  %cmp2 = icmp ugt i8 %sub, 4
-  br i1 %cmp2, label %latch, label %exit
-
-latch:
-  %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ]
-  %count = add nuw i8 %phi.2, 1
-  store i8 %count, i8* %store, align 1
-  br label %header
-
-exit:
-  ret void
-}
-
 ; Check that %exp requires uxth in all cases, and will also be required to
 ; promote %1 for the call - unless we can generate a uadd16.
 ; CHECK-COMMON-LABEL: zext_load_sink_call:
@@ -254,40 +181,6 @@ exit:
   ret i32 %exitval
 }
 
-%class.ae = type { i8 }
-%class.x = type { i8 }
-%class.v = type { %class.q }
-%class.q = type { i16 }
-
-; CHECK-COMMON-LABEL: trunc_i16_i9_switch
-; CHECK-COMMON-NOT: uxt
-define i32 @trunc_i16_i9_switch(%class.ae* %this) {
-entry:
-  %call = tail call %class.x* @_ZNK2ae2afEv(%class.ae* %this)
-  %call2 = tail call %class.v* @_ZN1x2acEv(%class.x* %call)
-  %0 = getelementptr inbounds %class.v, %class.v* %call2, i32 0, i32 0, i32 0
-  %1 = load i16, i16* %0, align 2
-  %2 = trunc i16 %1 to i9
-  %trunc = and i9 %2, -64
-  switch i9 %trunc, label %cleanup.fold.split [
-    i9 0, label %cleanup
-    i9 -256, label %if.then7
-  ]
-
-if.then7:
-  %3 = and i16 %1, 7
-  %tobool = icmp eq i16 %3, 0
-  %cond = select i1 %tobool, i32 2, i32 1
-  br label %cleanup
-
-cleanup.fold.split:
-  br label %cleanup
-
-cleanup:
-  %retval.0 = phi i32 [ %cond, %if.then7 ], [ 0, %entry ], [ 2, %cleanup.fold.split ]
-  ret i32 %retval.0
-}
-
 ; CHECK-COMMON-LABEL: bitcast_i16
 ; CHECK-COMMON-NOT: uxt
 define i16 @bitcast_i16(i16 zeroext %arg0, i16 zeroext %arg1) {
@@ -332,16 +225,14 @@ entry:
   ret i8 %res
 }
 
-declare %class.x* @_ZNK2ae2afEv(%class.ae*) local_unnamed_addr
-declare %class.v* @_ZN1x2acEv(%class.x*) local_unnamed_addr
 declare i32 @dummy(i32, i32)
 
 @d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1
 @sh1 = hidden local_unnamed_addr global i16 0, align 2
 @d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2
 
-; CHECK-LABEL: two_stage_zext_trunc_mix
-; CHECK-NOT: uxt
+; CHECK-COMMON-LABEL: two_stage_zext_trunc_mix
+; CHECK-COMMON-NOT: uxt
 define i8* @two_stage_zext_trunc_mix(i32* %this, i32 %__pos1, i32 %__n1, i32** %__str, i32 %__pos2, i32 %__n2) {
 entry:
   %__size_.i.i.i.i = bitcast i32** %__str to i8*
@@ -363,3 +254,337 @@ entry:
   %res = select i1 %tobool.i.i.i.i.i,  i8* %7, i8* %8
   ret i8* %res
 }
+
+; CHECK-COMMON-LABEL: search_through_zext_1
+; CHECK-COMMON-NOT: uxt
+define i8 @search_through_zext_1(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i16
+  %cmp2 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxtb here. The transform fails because
+; the icmp ugt uses an i32, which is too large... but this doesn't matter
+; because it won't be writing a large value to a register as a result.
+; CHECK-COMMON-LABEL: search_through_zext_2
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+define i8 @search_through_zext_2(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i32
+  %cmp2 = icmp ugt i32 %conv2, %d
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxtb here as all the calculations are
+; performed on i8s. The promotion of i8 to i16 and then the later truncation
+; results in the uxtb.
+; CHECK-COMMON-LABEL: search_through_zext_3
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+define i8 @search_through_zext_3(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) {
+entry:
+  %add = add nuw i8 %a, %b
+  %conv = zext i8 %add to i16
+  %cmp = icmp ult i16 %conv, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %conv to i8
+  %sub = sub nuw i8 %b, %trunc
+  %conv2 = zext i8 %sub to i32
+  %cmp2 = icmp ugt i32 %conv2, %d
+  %res = select i1 %cmp2, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; TODO: We should be able to remove the uxt that gets introduced for %conv2
+; CHECK-COMMON-LABEL: search_through_zext_cmp
+; CHECK-COMMON: uxt
+define i8 @search_through_zext_cmp(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %cmp = icmp ne i8 %a, %b
+  %conv = zext i1 %cmp to i16
+  %cmp1 = icmp ult i16 %conv, %c
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %a
+  %conv2 = zext i8 %sub to i16
+  %cmp3 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp3, i8 %a, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: search_through_zext_load
+; CHECK-COMMON-NOT: uxt
+define i8 @search_through_zext_load(i8* %a, i8 zeroext %b, i16 zeroext %c) {
+entry:
+  %load = load i8, i8* %a
+  %conv = zext i8 %load to i16
+  %cmp1 = icmp ult i16 %conv, %c
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %sub = sub nuw i8 %b, %load
+  %conv2 = zext i8 %sub to i16
+  %cmp3 = icmp ugt i16 %conv2, %c
+  %res = select i1 %cmp3, i8 %load, i8 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than
+; CHECK-COMMON-NOT: uxth
+; CHECK-COMMON: cmp
+; CHECK-COMMON: uxtb
+define i16 @trunc_sink_less_than_cmp(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, 1
+  %cmp2 = icmp ugt i8 %trunc, %add
+  %res = select i1 %cmp2, i16 %a, i16 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %res, %if.then ]
+  ret i16 %retval
+}
+
+; TODO: We should be able to remove the uxth introduced to handle %sub
+; CHECK-COMMON-LABEL: trunc_sink_less_than_arith
+; CHECK-COMMON: uxth
+; CHECK-COMMON: cmp
+; CHECK-COMMON: uxtb
+define i16 @trunc_sink_less_than_arith(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  %cmp2 = icmp ugt i8 %e, %add
+  %res = select i1 %cmp2, i16 %a, i16 %b
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %res, %if.then ]
+  ret i16 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_store
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: cmp
+; CHECK-COMMON-NOT: uxt
+define i16 @trunc_sink_less_than_store(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8* %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  store i8 %add, i8* %e
+  br label %if.end
+
+if.end:
+  %retval = phi i16 [ 0, %entry ], [ %sub, %if.then ]
+  ret i16 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_ret
+; CHECK-COMMON-NOT: uxt
+define i8 @trunc_sink_less_than_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %add, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: trunc_sink_less_than_zext_ret
+; CHECK-COMMON-NOT: uxth
+; CHECK-COMMON: sub
+; CHECK-COMMON: uxtb
+define zeroext i8 @trunc_sink_less_than_zext_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
+entry:
+  %sub = sub nuw i16 %b, %a
+  %cmp = icmp ult i16 %sub, %c
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %trunc = trunc i16 %sub to i8
+  %add = add nuw i8 %d, %trunc
+  br label %if.end
+
+if.end:
+  %retval = phi i8 [ 0, %entry ], [ %add, %if.then ]
+  ret i8 %retval
+}
+
+; CHECK-COMMON-LABEL: bitcast_i1
+; CHECK-COMMON-NOT: uxt
+define i32 @bitcast_i1(i16 zeroext %a, i32 %b, i32 %c) {
+entry:
+  %0 = bitcast i1 1 to i1
+  %1 = trunc i16 %a to i1
+  %cmp = icmp eq i1 %1, %0
+  br i1 %cmp, label %if.then, label %exit
+
+if.then:
+  %conv = zext i1 %0 to i16
+  %conv1 = zext i1 %1 to i16
+  %cmp1 = icmp uge i16 %conv, %conv1
+  %select = select i1 %cmp1, i32 %b, i32 %c
+  br label %exit
+
+exit:
+  %retval = phi i32 [ %select, %if.then ], [ 0, %entry ]
+  ret i32 %retval
+}
+
+; CHECK-COMMON-LABEL: search_back_through_trunc
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: cmp
+; CHECK-COMMON: strb
+; CHECK-COMMON: strb
+define void @search_back_through_trunc(i8* %a, i8* %b, i8* %c, i8* %d, i16* %e) {
+entry:
+  %0 = load i8, i8* %a, align 1
+  %conv106 = zext i8 %0 to i16
+  %shl = shl nuw i16 %conv106, 8
+  %1 = load i8, i8* %b, align 1
+  %conv108 = zext i8 %1 to i16
+  %or109 = or i16 %shl, %conv108
+  %2 = load i8, i8* %c, align 1
+  %conv119 = zext i8 %2 to i16
+  %shl120 = shl nuw i16 %conv119, 8
+  %3 = load i8, i8* %d, align 1
+  %conv122 = zext i8 %3 to i16
+  %or123 = or i16 %shl120, %conv122
+  %cmp133 = icmp eq i16 %or109, %or123
+  br i1 %cmp133, label %if.end183, label %if.else136
+
+if.else136:
+  %4 = load i16, i16* %e, align 2
+  %extract.t854 = trunc i16 %4 to i8
+  %extract856 = lshr i16 %4, 8
+  %extract.t857 = trunc i16 %extract856 to i8
+  br label %if.end183
+
+if.end183:
+  %w.0.off0 = phi i8 [ %extract.t854, %if.else136 ], [ %1, %entry ]
+  %w.0.off8 = phi i8 [ %extract.t857, %if.else136 ], [ %2, %entry ]
+  store i8 %w.0.off8, i8* %c, align 1
+  store i8 %w.0.off0, i8* %d, align 1
+  ret void
+}
+
+@c = common dso_local local_unnamed_addr global i16 0, align 2
+@b = common dso_local local_unnamed_addr global i16 0, align 2
+@f = common dso_local local_unnamed_addr global i32 0, align 4
+@e = common dso_local local_unnamed_addr global i8 0, align 1
+@a = common dso_local local_unnamed_addr global i8 0, align 1
+@d = common dso_local local_unnamed_addr global i32 0, align 4
+
+; CHECK-LABEL: and_trunc
+; CHECK: ldrh
+; CHECK: sxth
+; CHECK: uxtb
+define void @and_trunc_two_zext() {
+entry:
+  %0 = load i16, i16* @c, align 2
+  %1 = load i16, i16* @b, align 2
+  %conv = sext i16 %1 to i32
+  store i32 %conv, i32* @f, align 4
+  %2 = trunc i16 %1 to i8
+  %conv1 = and i8 %2, 1
+  store i8 %conv1, i8* @e, align 1
+  %3 = load i8, i8* @a, align 1
+  %narrow = mul nuw i8 %3, %conv1
+  %mul = zext i8 %narrow to i32
+  store i32 %mul, i32* @d, align 4
+  %4 = zext i8 %narrow to i16
+  %conv5 = or i16 %0, %4
+  %tobool = icmp eq i16 %conv5, 0
+  br i1 %tobool, label %if.end, label %for.cond
+
+for.cond:
+  br label %for.cond
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: zext_urem_trunc
+; CHECK-NOT: uxt
+define void @zext_urem_trunc() {
+entry:
+  %0 = load i16, i16* @c, align 2
+  %cmp = icmp eq i16 %0, 0
+  %1 = load i8, i8* @e, align 1
+  br i1 %cmp, label %cond.end, label %cond.false
+
+cond.false:
+  %rem.lhs.trunc = zext i8 %1 to i16
+  %rem7 = urem i16 %rem.lhs.trunc, %0
+  %rem.zext = trunc i16 %rem7 to i8
+  br label %cond.end
+
+cond.end:
+  %cond = phi i8 [ %rem.zext, %cond.false ], [ %1, %entry ]
+  store i8 %cond, i8* @a, align 1
+  ret void
+}
diff --git a/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll b/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
index 65588b84f7ccc6cab0e80c2759d33f919658f380..e7adc5ae2491b2517c5109cd670081e13f63bb59 100644
--- a/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=thumbv7m -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
-; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
-; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false -arm-enable-scalar-dsp=true -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
-; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
+; RUN: llc -mtriple=thumbv7m -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON
+; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON
+; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false -arm-enable-scalar-dsp=true -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON
+; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON
 
 ; Test that ARMCodeGenPrepare can handle:
 ; - loops
@@ -172,3 +172,15 @@ if.end:
 exit:
   ret i16 %unrelated
 }
+
+; CHECK-COMMON-LABEL: promote_arg_return
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: strb
+define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, i8* %res) {
+  %add = add nuw i16 %arg1, 15
+  %mul = mul nuw nsw i16 %add, 3
+  %cmp = icmp ult i16 %mul, %arg2
+  %conv = zext i1 %cmp to i8
+  store i8 %conv, i8* %res
+  ret i16 %arg1
+}
diff --git a/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll b/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
index 85a10ba80809b491eb7bf9ae8f9b6f25a8446257..98794f500d4c9bb76d6c8ed4482a311657be7f97 100644
--- a/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
@@ -11,18 +11,17 @@
 ; CHECK-NODSP: sxtb
 ; CHECK-NODSP: cmp
 
-; CHECK-DSP: add
-; CHECK-DSP: uxtb
+; CHECK-DSP: uadd8
+; CHECK-DSP: sub
 ; CHECK-DSP: cmp
 ; CHECK-DSP: sxtb
-; CHECK-DSP: sub
 ; CHECK-DSP: sxtb
 ; CHECK-DSP: cmp
 
 ; CHECK-DSP-IMM: uadd8 [[ADD:r[0-9]+]],
 ; CHECK-DSP-IMM: cmp [[ADD]],
+; CHECK-DSP-IMM: subs [[SUB:r[0-9]+]],
 ; CHECK-DSP-IMM: sxtb [[SEXT0:r[0-9]+]], [[ADD]]
-; CHECK-DSP-IMM: usub8 [[SUB:r[0-9]+]],
 ; CHECK-DSP-IMM: sxtb [[SEXT1:r[0-9]+]], [[SUB]]
 ; CHECK-DSP-IMM: cmp [[SEXT1]], [[SEXT0]]
 define i8 @eq_sgt(i8* %x, i8 *%y, i8 zeroext %z) {
diff --git a/test/CodeGen/ARM/CGP/arm-cgp-switch.ll b/test/CodeGen/ARM/CGP/arm-cgp-switch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..29c35fbc96e001da000d00d392328ff718adacb4
--- /dev/null
+++ b/test/CodeGen/ARM/CGP/arm-cgp-switch.ll
@@ -0,0 +1,168 @@
+; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7-linux-android %s -arm-disable-cgp=false -o - | FileCheck %s
+
+; CHECK-LABEL: truncate_source_phi_switch
+; CHECK: ldrb
+; CHECK: uxtb
+define void @truncate_source_phi_switch(i8* %memblock, i8* %store, i16 %arg) {
+entry:
+  %pre = load i8, i8* %memblock, align 1
+  %conv = trunc i16 %arg to i8
+  br label %header
+
+header:
+  %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ]
+  %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ]
+  %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ]
+  switch i8 %phi.0, label %default [
+    i8 43, label %for.inc.i
+    i8 45, label %for.inc.i.i
+  ]
+
+for.inc.i:
+  %xor = xor i8 %phi.1, 1
+  br label %latch
+
+for.inc.i.i:
+  %and = and i8 %phi.1, 3
+  br label %latch
+
+default:
+  %sub = sub i8 %phi.0, 1
+  %cmp2 = icmp ugt i8 %sub, 4
+  br i1 %cmp2, label %latch, label %exit
+
+latch:
+  %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ]
+  %count = add nuw i8 %phi.2, 1
+  store i8 %count, i8* %store, align 1
+  br label %header
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: icmp_switch_source:
+; CHECK-NOT: uxt
+define i16 @icmp_switch_source(i16 zeroext %arg) {
+entry:
+  %conv = add nuw i16 %arg, 15
+  %mul = mul nuw nsw i16 %conv, 3
+  switch i16 %arg, label %default [
+    i16 0, label %sw.bb
+    i16 1, label %sw.bb.i
+  ]
+
+sw.bb:
+  %cmp0 = icmp ult i16 %mul, 127
+  %select = select i1 %cmp0, i16 %mul, i16 127
+  br label %exit
+
+sw.bb.i:
+  %cmp1 = icmp ugt i16 %mul, 34
+  %select.i = select i1 %cmp1, i16 %mul, i16 34
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
+  ret i16 %res
+}
+
+; CHECK-LABEL: icmp_switch_narrow_source:
+; CHECK-NOT: uxt
+define i16 @icmp_switch_narrow_source(i8 zeroext %arg) {
+entry:
+  %conv = zext i8 %arg to i16
+  %add = add nuw i16 %conv, 15
+  %mul = mul nuw nsw i16 %add, 3
+  switch i8 %arg, label %default [
+    i8 0, label %sw.bb
+    i8 1, label %sw.bb.i
+  ]
+
+sw.bb:
+  %cmp0 = icmp ult i16 %mul, 127
+  %select = select i1 %cmp0, i16 %mul, i16 127
+  br label %exit
+
+sw.bb.i:
+  %cmp1 = icmp ugt i16 %mul, 34
+  %select.i = select i1 %cmp1, i16 %mul, i16 34
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
+  ret i16 %res
+}
+
+; CHECK-LABEL: icmp_switch_trunc:
+; CHECK-NOT: uxt
+define i16 @icmp_switch_trunc(i16 zeroext %arg) {
+entry:
+  %conv = add nuw i16 %arg, 15
+  %mul = mul nuw nsw i16 %conv, 3
+  %trunc = trunc i16 %arg to i3
+  switch i3 %trunc, label %default [
+    i3 0, label %sw.bb
+    i3 1, label %sw.bb.i
+  ]
+
+sw.bb:
+  %cmp0 = icmp ult i16 %mul, 127
+  %select = select i1 %cmp0, i16 %mul, i16 127
+  br label %exit
+
+sw.bb.i:
+  %cmp1 = icmp ugt i16 %mul, 34
+  %select.i = select i1 %cmp1, i16 %mul, i16 34
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ]
+  ret i16 %res
+}
+
+%class.ae = type { i8 }
+%class.x = type { i8 }
+%class.v = type { %class.q }
+%class.q = type { i16 }
+declare %class.x* @_ZNK2ae2afEv(%class.ae*) local_unnamed_addr
+declare %class.v* @_ZN1x2acEv(%class.x*) local_unnamed_addr
+
+; CHECK-LABEL: trunc_i16_i9_switch
+; CHECK-NOT: uxt
+define i32 @trunc_i16_i9_switch(%class.ae* %this) {
+entry:
+  %call = tail call %class.x* @_ZNK2ae2afEv(%class.ae* %this)
+  %call2 = tail call %class.v* @_ZN1x2acEv(%class.x* %call)
+  %0 = getelementptr inbounds %class.v, %class.v* %call2, i32 0, i32 0, i32 0
+  %1 = load i16, i16* %0, align 2
+  %2 = trunc i16 %1 to i9
+  %trunc = and i9 %2, -64
+  switch i9 %trunc, label %cleanup.fold.split [
+    i9 0, label %cleanup
+    i9 -256, label %if.then7
+  ]
+
+if.then7:
+  %3 = and i16 %1, 7
+  %tobool = icmp eq i16 %3, 0
+  %cond = select i1 %tobool, i32 2, i32 1
+  br label %cleanup
+
+cleanup.fold.split:
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ %cond, %if.then7 ], [ 0, %entry ], [ 2, %cleanup.fold.split ]
+  ret i32 %retval.0
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-call-lowering.ll b/test/CodeGen/ARM/GlobalISel/arm-call-lowering.ll
index d667f1756eb083d2b9f5306201fc7944d9452b95..84602a3c4738c05e619853eff4a322f791a76d3c 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-call-lowering.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-call-lowering.ll
@@ -1,13 +1,16 @@
-; RUN: llc -mtriple arm-unknown -mattr=-v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,NOV4T
-; RUN: llc -mtriple arm-unknown -mattr=+v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,V4T
-; RUN: llc -mtriple arm-unknown -mattr=+v5t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,V5T
+; RUN: llc -mtriple arm-unknown -mattr=-v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,NOV4T,ARM
+; RUN: llc -mtriple arm-unknown -mattr=+v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,V4T,ARM
+; RUN: llc -mtriple arm-unknown -mattr=+v5t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,V5T,ARM
+; RUN: llc -mtriple thumb-unknown -mattr=+v6t2 -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,THUMB
 
 define arm_aapcscc void @test_indirect_call(void() *%fptr) {
 ; CHECK-LABEL: name: test_indirect_call
+; THUMB: %[[FPTR:[0-9]+]]:gpr(p0) = COPY $r0
 ; V5T: %[[FPTR:[0-9]+]]:gpr(p0) = COPY $r0
 ; V4T: %[[FPTR:[0-9]+]]:tgpr(p0) = COPY $r0
 ; NOV4T: %[[FPTR:[0-9]+]]:tgpr(p0) = COPY $r0
 ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
+; THUMB: tBLXr 14, $noreg, %[[FPTR]](p0), csr_aapcs, implicit-def $lr, implicit $sp
 ; V5T: BLX %[[FPTR]](p0), csr_aapcs, implicit-def $lr, implicit $sp
 ; V4T: BX_CALL %[[FPTR]](p0), csr_aapcs, implicit-def $lr, implicit $sp
 ; NOV4T: BMOVPCRX_CALL %[[FPTR]](p0), csr_aapcs, implicit-def $lr, implicit $sp
@@ -22,7 +25,8 @@ declare arm_aapcscc void @call_target()
 define arm_aapcscc void @test_direct_call() {
 ; CHECK-LABEL: name: test_direct_call
 ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
-; CHECK: BL @call_target, csr_aapcs, implicit-def $lr, implicit $sp
+; THUMB: tBL 14, $noreg, @call_target, csr_aapcs, implicit-def $lr, implicit $sp
+; ARM: BL @call_target, csr_aapcs, implicit-def $lr, implicit $sp
 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
 entry:
   notail call arm_aapcscc void @call_target()
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
index b361023067de68049ab33e8f825e340a14645ac4..d3e9796b72f5282270359acbc532cc2584fee714 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
@@ -42,6 +42,9 @@
 
   define void @test_vfnmss() #4 { ret void }
 
+  define void @test_bfc() #2 { ret void }
+  define void @test_no_bfc_bad_mask() #2 { ret void }
+
   attributes #0 = { "target-features"="+v6" }
   attributes #1 = { "target-features"="-v6" }
   attributes #2 = { "target-features"="+v6t2" }
@@ -1142,3 +1145,57 @@ body:             |
     BX_RET 14, $noreg, implicit $s0
     ; CHECK: BX_RET 14, $noreg, implicit $s0
 ...
+---
+name:            test_bfc
+# CHECK-LABEL: name: test_bfc
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    %1(s32) = G_CONSTANT i32 -65529 ; 0xFFFF0007
+    %2(s32) = G_AND %0, %1
+    ; CHECK: [[RS:%[0-9]+]]:gpr = COPY $r0
+    ; CHECK: [[RD:%[0-9]+]]:gpr = BFC [[RS]], -65529, 14, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[RD]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_no_bfc_bad_mask
+# CHECK-LABEL: name: test_no_bfc_bad_mask
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    %1(s32) = G_CONSTANT i32 6 ; 0x00000006
+    %2(s32) = G_AND %0, %1
+    ; CHECK: [[RS:%[0-9]+]]:gpr = COPY $r0
+    ; CHECK-NOT: BFC
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[RD]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index 95c454cdc1545beffb6827e258fdbddfecbcd664..175673740cddc73252be7c88e8b09029d5de9d7b 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -440,7 +440,7 @@ define i32 @test_shufflevector_s32_v2s32(i32 %arg) {
 ; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $r0
 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32)
+; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C0]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], [[MASK]](<2 x s32>)
 ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>)
   %vec = insertelement <1 x i32> undef, i32 %arg, i32 0
@@ -456,7 +456,7 @@ define i32 @test_shufflevector_v2s32_v3s32(i32 %arg1, i32 %arg2) {
 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
 ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-; CHECK-DAG: [[MASK:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32), [[C1]](s32)
+; CHECK-DAG: [[MASK:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C0]](s32), [[C1]](s32)
 ; CHECK-DAG: [[V1:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32)
 ; CHECK-DAG: [[V2:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_SHUFFLE_VECTOR [[V2]](<2 x s32>), [[UNDEF]], [[MASK]](<3 x s32>)
@@ -476,7 +476,7 @@ define i32 @test_shufflevector_v2s32_v4s32(i32 %arg1, i32 %arg2) {
 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
 ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-; CHECK-DAG: [[MASK:%[0-9]+]]:_(<4 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32), [[C0]](s32), [[C0]](s32)
+; CHECK-DAG: [[MASK:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C0]](s32), [[C0]](s32), [[C0]](s32)
 ; CHECK-DAG: [[V1:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32)
 ; CHECK-DAG: [[V2:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32)
 ; CHECK: [[VEC:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[V2]](<2 x s32>), [[UNDEF]], [[MASK]](<4 x s32>)
@@ -499,7 +499,7 @@ define i32 @test_shufflevector_v4s32_v2s32(i32 %arg1, i32 %arg2, i32 %arg3, i32
 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK-DAG: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; CHECK-DAG: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
-; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C3]](s32)
+; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C3]](s32)
 ; CHECK-DAG: [[V1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32)
 ; CHECK-DAG: [[V2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32)
 ; CHECK-DAG: [[V3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[V2]], [[ARG3]](s32), [[C2]](s32)
@@ -521,7 +521,7 @@ define i32 @test_constantstruct_v2s32() {
 ; CHECK-LABEL: name: test_constantstruct_v2s32
 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32)
 ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[C3]]
   %vec = extractvalue %struct.v2s32 {<2 x i32><i32 1, i32 2>}, 0
@@ -535,7 +535,7 @@ define i32 @test_constantstruct_v2s32_s32_s32() {
 ; CHECK-LABEL: name: test_constantstruct_v2s32_s32_s32
 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32)
 ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
 ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-binops.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-binops.mir
new file mode 100644
index 0000000000000000000000000000000000000000..51783caee3c66948147adc9e079568995e240b19
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-binops.mir
@@ -0,0 +1,561 @@
+# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+  define void @test_add_s8() { ret void }
+  define void @test_add_s16() { ret void }
+  define void @test_add_s32() { ret void }
+
+  define void @test_sub_s8() { ret void }
+  define void @test_sub_s16() { ret void }
+  define void @test_sub_s32() { ret void }
+
+  define void @test_mul_s8() { ret void }
+  define void @test_mul_s16() { ret void }
+  define void @test_mul_s32() { ret void }
+
+  define void @test_and_s8() { ret void }
+  define void @test_and_s16() { ret void }
+  define void @test_and_s32() { ret void }
+
+  define void @test_or_s8() { ret void }
+  define void @test_or_s16() { ret void }
+  define void @test_or_s32() { ret void }
+
+  define void @test_xor_s8() { ret void }
+  define void @test_xor_s16() { ret void }
+  define void @test_xor_s32() { ret void }
+...
+---
+name:            test_add_s8
+# CHECK-LABEL: name: test_add_s8
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s8) = G_LOAD %0 :: (load 1)
+    %2(p0) = COPY $r0
+    %3(s8) = G_LOAD %2 :: (load 1)
+    %4(s8) = G_ADD %1, %3
+    ; G_ADD with s8 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s8)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_add_s16
+# CHECK-LABEL: name: test_add_s16
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s16) = G_LOAD %0 :: (load 2)
+    %2(p0) = COPY $r0
+    %3(s16) = G_LOAD %2 :: (load 2)
+    %4(s16) = G_ADD %1, %3
+    ; G_ADD with s16 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s16)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_add_s32
+# CHECK-LABEL: name: test_add_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    %1(s32) = COPY $r1
+    %2(s32) = G_ADD %0, %1
+    ; G_ADD with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+
+...
+---
+name:            test_sub_s8
+# CHECK-LABEL: name: test_sub_s8
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s8) = G_LOAD %0 :: (load 1)
+    %2(p0) = COPY $r0
+    %3(s8) = G_LOAD %2 :: (load 1)
+    %4(s8) = G_SUB %1, %3
+    ; G_SUB with s8 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s8)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_sub_s16
+# CHECK-LABEL: name: test_sub_s16
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s16) = G_LOAD %0 :: (load 2)
+    %2(p0) = COPY $r0
+    %3(s16) = G_LOAD %2 :: (load 2)
+    %4(s16) = G_SUB %1, %3
+    ; G_SUB with s16 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s16)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_sub_s32
+# CHECK-LABEL: name: test_sub_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    %1(s32) = COPY $r1
+    %2(s32) = G_SUB %0, %1
+    ; G_SUB with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+
+...
+---
+name:            test_mul_s8
+# CHECK-LABEL: name: test_mul_s8
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s8) = G_LOAD %0 :: (load 1)
+    %2(p0) = COPY $r0
+    %3(s8) = G_LOAD %2 :: (load 1)
+    %4(s8) = G_MUL %1, %3
+    ; G_MUL with s8 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s8)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_mul_s16
+# CHECK-LABEL: name: test_mul_s16
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s16) = G_LOAD %0 :: (load 2)
+    %2(p0) = COPY $r0
+    %3(s16) = G_LOAD %2 :: (load 2)
+    %4(s16) = G_MUL %1, %3
+    ; G_MUL with s16 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s16)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_mul_s32
+# CHECK-LABEL: name: test_mul_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    %1(s32) = COPY $r1
+    %2(s32) = G_MUL %0, %1
+    ; G_MUL with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+
+...
+---
+name:            test_and_s8
+# CHECK-LABEL: name: test_and_s8
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s8) = G_LOAD %0 :: (load 1)
+    %2(p0) = COPY $r0
+    %3(s8) = G_LOAD %2 :: (load 1)
+    %4(s8) = G_AND %1, %3
+    ; G_AND with s8 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_AND {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_AND {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s8)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_and_s16
+# CHECK-LABEL: name: test_and_s16
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s16) = G_LOAD %0 :: (load 2)
+    %2(p0) = COPY $r0
+    %3(s16) = G_LOAD %2 :: (load 2)
+    %4(s16) = G_AND %1, %3
+    ; G_AND with s16 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_AND {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_AND {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s16)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_and_s32
+# CHECK-LABEL: name: test_and_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    %1(s32) = COPY $r1
+    %2(s32) = G_AND %0, %1
+    ; G_AND with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+
+...
+---
+name:            test_or_s8
+# CHECK-LABEL: name: test_or_s8
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s8) = G_LOAD %0 :: (load 1)
+    %2(p0) = COPY $r0
+    %3(s8) = G_LOAD %2 :: (load 1)
+    %4(s8) = G_OR %1, %3
+    ; G_OR with s8 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_OR {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_OR {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s8)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_or_s16
+# CHECK-LABEL: name: test_or_s16
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s16) = G_LOAD %0 :: (load 2)
+    %2(p0) = COPY $r0
+    %3(s16) = G_LOAD %2 :: (load 2)
+    %4(s16) = G_OR %1, %3
+    ; G_OR with s16 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_OR {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_OR {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s16)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_or_s32
+# CHECK-LABEL: name: test_or_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    %1(s32) = COPY $r1
+    %2(s32) = G_OR %0, %1
+    ; G_OR with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+
+...
+---
+name:            test_xor_s8
+# CHECK-LABEL: name: test_xor_s8
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s8) = G_LOAD %0 :: (load 1)
+    %2(p0) = COPY $r0
+    %3(s8) = G_LOAD %2 :: (load 1)
+    %4(s8) = G_XOR %1, %3
+    ; G_XOR with s8 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_XOR {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_XOR {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s8)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_xor_s16
+# CHECK-LABEL: name: test_xor_s16
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(p0) = COPY $r0
+    %1(s16) = G_LOAD %0 :: (load 2)
+    %2(p0) = COPY $r0
+    %3(s16) = G_LOAD %2 :: (load 2)
+    %4(s16) = G_XOR %1, %3
+    ; G_XOR with s16 should widen
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_XOR {{%[0-9]+, %[0-9]+}}
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
+    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_XOR {{%[0-9]+, %[0-9]+}}
+    %5(s32) = G_SEXT %4(s16)
+    $r0 = COPY %5(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_xor_s32
+# CHECK-LABEL: name: test_xor_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    %1(s32) = COPY $r1
+    %2(s32) = G_XOR %0, %1
+    ; G_XOR with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c9323eedced43e8ab5d91f7c3761296933cb8a6a
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
@@ -0,0 +1,177 @@
+# RUN: llc -mtriple arm-linux-gnueabi -mattr=+v5t -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,CLZ
+# RUN: llc -mtriple arm-linux-gnueabi -mattr=-v5t -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,LIBCALLS
+--- |
+  define void @test_ctlz_s32() { ret void }
+  define void @test_ctlz_zero_undef_s32() { ret void }
+
+  ; same as above but with extensions
+  define void @test_ctlz_s16() { ret void }
+  define void @test_ctlz_zero_undef_s8() { ret void }
+...
+---
+name:            test_ctlz_s32
+# CHECK-LABEL: name: test_ctlz_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $r0
+    %0(s32) = COPY $r0
+
+    ; CLZ: [[R:%[0-9]+]]:_(s32) = G_CTLZ [[X]]
+    ; LIBCALLS-NOT: G_CTLZ
+    ; LIBCALLS: ADJCALLSTACKDOWN
+    ; LIBCALLS: $r0 = COPY [[X]]
+    ; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0
+    ; LIBCALLS: [[COUNT:%[0-9]+]]:_(s32) = COPY $r0
+    ; LIBCALLS: ADJCALLSTACKUP
+    ; LIBCALLS-NOT: G_CTLZ
+    ; LIBCALLS: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; LIBCALLS: [[BITS:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; LIBCALLS: [[CMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[X]](s32), [[ZERO]]
+    ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_SELECT [[CMP]](s1), [[BITS]], [[COUNT]]
+    ; LIBCALLS-NOT: G_CTLZ
+    %1(s32) = G_CTLZ %0
+
+    ; CHECK: $r0 = COPY [[R]]
+    $r0 = COPY %1(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_ctlz_zero_undef_s32
+# CHECK-LABEL: name: test_ctlz_zero_undef_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $r0
+    %0(s32) = COPY $r0
+
+    ; CLZ: [[R:%[0-9]+]]:_(s32) = G_CTLZ [[X]]
+    ; LIBCALLS-NOT: G_CTLZ
+    ; LIBCALLS: ADJCALLSTACKDOWN
+    ; LIBCALLS: $r0 = COPY [[X]]
+    ; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0
+    ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = COPY $r0
+    ; LIBCALLS: ADJCALLSTACKUP
+    ; LIBCALLS-NOT: G_CTLZ
+    %1(s32) = G_CTLZ_ZERO_UNDEF %0
+
+    ; CHECK: $r0 = COPY [[R]]
+    $r0 = COPY %1(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_ctlz_s16
+# CHECK-LABEL: name: test_ctlz_s16
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $r0
+    ; CHECK: [[BITMASK:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[XAGAIN:%[0-9]+]]:_(s32) = COPY [[X]]
+    ; CHECK: [[X32:%[0-9]+]]:_(s32) = G_AND [[XAGAIN]], [[BITMASK]]
+    %0(s32) = COPY $r0
+    %1(s16) = G_TRUNC %0(s32)
+
+    ; Check that the operation is performed for 32 bits
+    ; CLZ: [[COUNT:%[0-9]+]]:_(s32) = G_CTLZ [[X32]]
+    ; LIBCALLS-NOT: G_CTLZ
+    ; LIBCALLS: ADJCALLSTACKDOWN
+    ; LIBCALLS: $r0 = COPY [[X32]]
+    ; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0
+    ; LIBCALLS: [[UNDEFCOUNT:%[0-9]+]]:_(s32) = COPY $r0
+    ; LIBCALLS: ADJCALLSTACKUP
+    ; LIBCALLS-NOT: G_CTLZ
+    ; LIBCALLS: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; LIBCALLS: [[BITS:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; LIBCALLS: [[CMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), {{%[0-9]+}}(s32), [[ZERO]]
+    ; LIBCALLS: [[COUNT:%[0-9]+]]:_(s32) = G_SELECT [[CMP]](s1), [[BITS]], [[UNDEFCOUNT]]
+    ; LIBCALLS-NOT: G_CTLZ
+    ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]]
+    %2(s16) = G_CTLZ %1
+
+    ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[RAGAIN:%[0-9]+]]:_(s32) = COPY [[R32]]
+    ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[RAGAIN]], [[BITDIFF]]
+    ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
+    ; CHECK: $r0 = COPY [[R]]
+    %3(s32) = G_SEXT %2(s16)
+    $r0 = COPY %3(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_ctlz_zero_undef_s8
+# CHECK-LABEL: name: test_ctlz_zero_undef_s8
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $r0
+    ; CHECK: [[BITMASK:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[XAGAIN:%[0-9]+]]:_(s32) = COPY [[X]]
+    ; CHECK: [[X32:%[0-9]+]]:_(s32) = G_AND [[XAGAIN]], [[BITMASK]]
+    %0(s32) = COPY $r0
+    %1(s8) = G_TRUNC %0(s32)
+
+    ; Check that the operation is performed for 32 bits
+    ; CLZ: [[COUNT:%[0-9]+]]:_(s32) = G_CTLZ
+    ; CLZ-NOT: G_CTLZ_ZERO_UNDEF
+    ; LIBCALLS-NOT: G_CTLZ
+    ; LIBCALLS: ADJCALLSTACKDOWN
+    ; LIBCALLS: $r0 = COPY [[X32]]
+    ; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0
+    ; LIBCALLS: [[COUNT:%[0-9]+]]:_(s32) = COPY $r0
+    ; LIBCALLS: ADJCALLSTACKUP
+    ; LIBCALLS-NOT: G_CTLZ
+    ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]]
+    %2(s8) = G_CTLZ_ZERO_UNDEF %1
+
+    ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[RAGAIN:%[0-9]+]]:_(s32) = COPY [[R32]]
+    ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[RAGAIN]], [[BITDIFF]]
+    ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
+    ; CHECK: $r0 = COPY [[R]]
+    %3(s32) = G_SEXT %2(s8)
+    $r0 = COPY %3(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-casts.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-casts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c45bc3d5385d7573ecdfc171e8a34b566234fdf7
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-casts.mir
@@ -0,0 +1,50 @@
+# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+  define void @test_inttoptr_s32() { ret void }
+  define void @test_ptrtoint_s32() { ret void }
+...
+---
+name:            test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    %1(p0) = G_INTTOPTR %0(s32)
+    ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}}
+    $r0 = COPY %1(p0)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(p0) = COPY $r0
+    %1(s32) = G_PTRTOINT %0(p0)
+    ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}}
+    $r0 = COPY %1(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..d3248fe63a024864bb4f33ccc9cc650b7b651218
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir
@@ -0,0 +1,57 @@
+# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+  define void @test_constants() { ret void }
+...
+---
+name:            test_constants
+# CHECK-LABEL: name: test_constants
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %4(p0) = COPY $r0
+
+    %0(s32) = G_CONSTANT 42
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_CONSTANT 42
+
+    %1(s16) = G_CONSTANT i16 21
+    G_STORE %1(s16), %4(p0) :: (store 2)
+    ; CHECK-NOT: G_CONSTANT i16
+    ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 21
+    ; CHECK: {{%[0-9]+}}:_(s16) = G_TRUNC [[EXT]](s32)
+    ; CHECK-NOT: G_CONSTANT i16
+
+    %2(s8) = G_CONSTANT i8 10
+    G_STORE %2(s8), %4(p0) :: (store 1)
+    ; CHECK-NOT: G_CONSTANT i8
+    ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; CHECK: {{%[0-9]+}}:_(s8) = G_TRUNC [[EXT]](s32)
+    ; CHECK-NOT: G_CONSTANT i8
+
+    %3(s1) = G_CONSTANT i1 1
+    G_STORE %3(s1), %4(p0) :: (store 1)
+    ; CHECK-NOT: G_CONSTANT i1
+    ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32)
+    ; CHECK-NOT: G_CONSTANT i1
+
+    %5(p0) = G_CONSTANT 0
+    G_STORE %5(p0), %4(p0) :: (store 4)
+    ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0
+
+    $r0 = COPY %0(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-exts.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-exts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..672b42671a154d0bbcc4b718ca1d7d2a3055656e
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-exts.mir
@@ -0,0 +1,79 @@
+# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+  define void @test_zext_s16() { ret void }
+  define void @test_sext_s8() { ret void }
+  define void @test_anyext_s1() { ret void }
+...
+---
+name:            test_zext_s16
+# CHECK-LABEL: name: test_zext_s16
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(p0) = COPY $r0
+    %1(s16) = G_LOAD %0 :: (load 2)
+    %2(s32) = G_ZEXT %1
+    ; G_ZEXT with s16 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_ZEXT {{%[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_sext_s8
+# CHECK-LABEL: name: test_sext_s8
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(p0) = COPY $r0
+    %1(s8) = G_LOAD %0(p0) :: (load 1)
+    %2(s32) = G_SEXT %1
+    ; G_SEXT with s8 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_SEXT {{%[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_anyext_s1
+# CHECK-LABEL: name: test_anyext_s1
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(p0) = COPY $r0
+    %1(s1) = G_LOAD %0(p0) :: (load 1)
+    %2(s32) = G_ANYEXT %1
+    ; G_ANYEXT with s1 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_ANYEXT {{%[0-9]+}}
+    $r0 = COPY %2(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir
new file mode 100644
index 0000000000000000000000000000000000000000..a955af9a97ffbe27fc3f54c88c7578d3d0e70f33
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir
@@ -0,0 +1,49 @@
+# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+  define void @test_legal_loads_stores() { ret void }
+...
+---
+name:            test_legal_loads_stores
+# CHECK-LABEL: name: test_legal_loads_stores
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+  - { id: 6, class: _ }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    ; These are all legal, so we should find them unchanged in the output
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s32), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s16), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s8), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s1), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(p0), %0(p0)
+    ; CHECK-DAG: {{%[0-9]+}}:_(s32) = G_LOAD %0(p0)
+    ; CHECK-DAG: {{%[0-9]+}}:_(s16) = G_LOAD %0(p0)
+    ; CHECK-DAG: {{%[0-9]+}}:_(s8) = G_LOAD %0(p0)
+    ; CHECK-DAG: {{%[0-9]+}}:_(s1) = G_LOAD %0(p0)
+    ; CHECK-DAG: {{%[0-9]+}}:_(p0) = G_LOAD %0(p0)
+    %0(p0) = COPY $r0
+    %2(s32) = G_LOAD %0(p0) :: (load 4)
+    G_STORE %2(s32), %0(p0) :: (store 4)
+    %3(s16) = G_LOAD %0(p0) :: (load 2)
+    G_STORE %3(s16), %0(p0) :: (store 2)
+    %4(s8) = G_LOAD %0(p0) :: (load 1)
+    G_STORE %4(s8), %0(p0) :: (store 1)
+    %5(s1) = G_LOAD %0(p0) :: (load 1)
+    G_STORE %5(s1), %0(p0) :: (store 1)
+    %6(p0) = G_LOAD %0(p0) :: (load 4)
+    G_STORE %6(p0), %0(p0) :: (store 4)
+    BX_RET 14, $noreg
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index 4d61593b47a16b07d830e38b8b3a3c010107d7e2..bee8b7bbbabd2b801c2ae01c8357f4fec3ab56f0 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -1,45 +1,15 @@
 # RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s
 --- |
-  define void @test_sext_s8() { ret void }
-  define void @test_zext_s16() { ret void }
-
-  define void @test_inttoptr_s32() { ret void }
-  define void @test_ptrtoint_s32() { ret void }
-
-  define void @test_add_s8() { ret void }
-  define void @test_add_s16() { ret void }
-  define void @test_add_s32() { ret void }
-
-  define void @test_sub_s8() { ret void }
-  define void @test_sub_s16() { ret void }
-  define void @test_sub_s32() { ret void }
-
-  define void @test_mul_s8() { ret void }
-  define void @test_mul_s16() { ret void }
-  define void @test_mul_s32() { ret void }
-
-  define void @test_and_s8() { ret void }
-  define void @test_and_s16() { ret void }
-  define void @test_and_s32() { ret void }
-
-  define void @test_or_s8() { ret void }
-  define void @test_or_s16() { ret void }
-  define void @test_or_s32() { ret void }
-
-  define void @test_xor_s8() { ret void }
-  define void @test_xor_s16() { ret void }
-  define void @test_xor_s32() { ret void }
-
   define void @test_lshr_s32() { ret void }
   define void @test_ashr_s32() { ret void }
   define void @test_shl_s32() { ret void }
 
   define void @test_load_from_stack() { ret void }
-  define void @test_legal_loads_stores() #0 { ret void }
+  define void @test_load_store_64() #0 { ret void }
 
   define void @test_gep() { ret void }
 
-  define void @test_constants() { ret void }
+  define void @test_constants_s64() { ret void }
 
   define void @test_icmp_s8() { ret void }
   define void @test_icmp_s16() { ret void }
@@ -61,632 +31,6 @@
   attributes #0 = { "target-features"="+vfp2" }
 ...
 ---
-name:            test_sext_s8
-# CHECK-LABEL: name: test_sext_s8
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0
-
-    %0(p0) = COPY $r0
-    %1(s8) = G_LOAD %0(p0) :: (load 1)
-    %2(s32) = G_SEXT %1
-    ; G_SEXT with s8 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_SEXT {{%[0-9]+}}
-    $r0 = COPY %2(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_zext_s16
-# CHECK-LABEL: name: test_zext_s16
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0
-
-    %0(p0) = COPY $r0
-    %1(s16) = G_LOAD %0 :: (load 2)
-    %2(s32) = G_ZEXT %1
-    ; G_ZEXT with s16 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_ZEXT {{%[0-9]+}}
-    $r0 = COPY %2(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_inttoptr_s32
-# CHECK-LABEL: name: test_inttoptr_s32
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0
-
-    %0(s32) = COPY $r0
-    %1(p0) = G_INTTOPTR %0(s32)
-    ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}}
-    $r0 = COPY %1(p0)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_ptrtoint_s32
-# CHECK-LABEL: name: test_ptrtoint_s32
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0
-
-    %0(p0) = COPY $r0
-    %1(s32) = G_PTRTOINT %0(p0)
-    ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}}
-    $r0 = COPY %1(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_add_s8
-# CHECK-LABEL: name: test_add_s8
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s8) = G_LOAD %0 :: (load 1)
-    %2(p0) = COPY $r0
-    %3(s8) = G_LOAD %2 :: (load 1)
-    %4(s8) = G_ADD %1, %3
-    ; G_ADD with s8 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s8)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_add_s16
-# CHECK-LABEL: name: test_add_s16
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s16) = G_LOAD %0 :: (load 2)
-    %2(p0) = COPY $r0
-    %3(s16) = G_LOAD %2 :: (load 2)
-    %4(s16) = G_ADD %1, %3
-    ; G_ADD with s16 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s16)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_add_s32
-# CHECK-LABEL: name: test_add_s32
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(s32) = COPY $r0
-    %1(s32) = COPY $r1
-    %2(s32) = G_ADD %0, %1
-    ; G_ADD with s32 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
-    $r0 = COPY %2(s32)
-    BX_RET 14, $noreg, implicit $r0
-
-...
----
-name:            test_sub_s8
-# CHECK-LABEL: name: test_sub_s8
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s8) = G_LOAD %0 :: (load 1)
-    %2(p0) = COPY $r0
-    %3(s8) = G_LOAD %2 :: (load 1)
-    %4(s8) = G_SUB %1, %3
-    ; G_SUB with s8 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s8)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_sub_s16
-# CHECK-LABEL: name: test_sub_s16
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s16) = G_LOAD %0 :: (load 2)
-    %2(p0) = COPY $r0
-    %3(s16) = G_LOAD %2 :: (load 2)
-    %4(s16) = G_SUB %1, %3
-    ; G_SUB with s16 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s16)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_sub_s32
-# CHECK-LABEL: name: test_sub_s32
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(s32) = COPY $r0
-    %1(s32) = COPY $r1
-    %2(s32) = G_SUB %0, %1
-    ; G_SUB with s32 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
-    $r0 = COPY %2(s32)
-    BX_RET 14, $noreg, implicit $r0
-
-...
----
-name:            test_mul_s8
-# CHECK-LABEL: name: test_mul_s8
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s8) = G_LOAD %0 :: (load 1)
-    %2(p0) = COPY $r0
-    %3(s8) = G_LOAD %2 :: (load 1)
-    %4(s8) = G_MUL %1, %3
-    ; G_MUL with s8 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s8)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_mul_s16
-# CHECK-LABEL: name: test_mul_s16
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s16) = G_LOAD %0 :: (load 2)
-    %2(p0) = COPY $r0
-    %3(s16) = G_LOAD %2 :: (load 2)
-    %4(s16) = G_MUL %1, %3
-    ; G_MUL with s16 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s16)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_mul_s32
-# CHECK-LABEL: name: test_mul_s32
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(s32) = COPY $r0
-    %1(s32) = COPY $r1
-    %2(s32) = G_MUL %0, %1
-    ; G_MUL with s32 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
-    $r0 = COPY %2(s32)
-    BX_RET 14, $noreg, implicit $r0
-
-...
----
-name:            test_and_s8
-# CHECK-LABEL: name: test_and_s8
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s8) = G_LOAD %0 :: (load 1)
-    %2(p0) = COPY $r0
-    %3(s8) = G_LOAD %2 :: (load 1)
-    %4(s8) = G_AND %1, %3
-    ; G_AND with s8 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_AND {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_AND {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s8)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_and_s16
-# CHECK-LABEL: name: test_and_s16
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s16) = G_LOAD %0 :: (load 2)
-    %2(p0) = COPY $r0
-    %3(s16) = G_LOAD %2 :: (load 2)
-    %4(s16) = G_AND %1, %3
-    ; G_AND with s16 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_AND {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_AND {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s16)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_and_s32
-# CHECK-LABEL: name: test_and_s32
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(s32) = COPY $r0
-    %1(s32) = COPY $r1
-    %2(s32) = G_AND %0, %1
-    ; G_AND with s32 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}}
-    $r0 = COPY %2(s32)
-    BX_RET 14, $noreg, implicit $r0
-
-...
----
-name:            test_or_s8
-# CHECK-LABEL: name: test_or_s8
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s8) = G_LOAD %0 :: (load 1)
-    %2(p0) = COPY $r0
-    %3(s8) = G_LOAD %2 :: (load 1)
-    %4(s8) = G_OR %1, %3
-    ; G_OR with s8 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_OR {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_OR {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s8)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_or_s16
-# CHECK-LABEL: name: test_or_s16
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s16) = G_LOAD %0 :: (load 2)
-    %2(p0) = COPY $r0
-    %3(s16) = G_LOAD %2 :: (load 2)
-    %4(s16) = G_OR %1, %3
-    ; G_OR with s16 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_OR {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_OR {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s16)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_or_s32
-# CHECK-LABEL: name: test_or_s32
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(s32) = COPY $r0
-    %1(s32) = COPY $r1
-    %2(s32) = G_OR %0, %1
-    ; G_OR with s32 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}}
-    $r0 = COPY %2(s32)
-    BX_RET 14, $noreg, implicit $r0
-
-...
----
-name:            test_xor_s8
-# CHECK-LABEL: name: test_xor_s8
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s8) = G_LOAD %0 :: (load 1)
-    %2(p0) = COPY $r0
-    %3(s8) = G_LOAD %2 :: (load 1)
-    %4(s8) = G_XOR %1, %3
-    ; G_XOR with s8 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_XOR {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_XOR {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s8)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_xor_s16
-# CHECK-LABEL: name: test_xor_s16
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(p0) = COPY $r0
-    %1(s16) = G_LOAD %0 :: (load 2)
-    %2(p0) = COPY $r0
-    %3(s16) = G_LOAD %2 :: (load 2)
-    %4(s16) = G_XOR %1, %3
-    ; G_XOR with s16 should widen
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_XOR {{%[0-9]+, %[0-9]+}}
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
-    ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_XOR {{%[0-9]+, %[0-9]+}}
-    %5(s32) = G_SEXT %4(s16)
-    $r0 = COPY %5(s32)
-    BX_RET 14, $noreg, implicit $r0
-...
----
-name:            test_xor_s32
-# CHECK-LABEL: name: test_xor_s32
-legalized:       false
-# CHECK: legalized: true
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body:             |
-  bb.0:
-    liveins: $r0, $r1
-
-    %0(s32) = COPY $r0
-    %1(s32) = COPY $r1
-    %2(s32) = G_XOR %0, %1
-    ; G_XOR with s32 is legal, so we should find it unchanged in the output
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
-    $r0 = COPY %2(s32)
-    BX_RET 14, $noreg, implicit $r0
-
-...
----
 name:            test_lshr_s32
 # CHECK-LABEL: name: test_lshr_s32
 legalized:       false
@@ -792,8 +136,8 @@ body:             |
     BX_RET 14, $noreg, implicit $r0
 ...
 ---
-name:            test_legal_loads_stores
-# CHECK-LABEL: name: test_legal_loads_stores
+name:            test_load_store_64
+# CHECK-LABEL: name: test_load_store_64
 legalized:       false
 # CHECK: legalized: true
 regBankSelected: false
@@ -811,32 +155,12 @@ body:             |
   bb.0:
     liveins: $r0
 
-    ; These are all legal, so we should find them unchanged in the output
+    ; These are legal, so we should find them unchanged in the output
     ; CHECK-DAG: G_STORE {{%[0-9]+}}(s64), %0(p0)
-    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s32), %0(p0)
-    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s16), %0(p0)
-    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s8), %0(p0)
-    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s1), %0(p0)
-    ; CHECK-DAG: G_STORE {{%[0-9]+}}(p0), %0(p0)
     ; CHECK-DAG: {{%[0-9]+}}:_(s64) = G_LOAD %0(p0)
-    ; CHECK-DAG: {{%[0-9]+}}:_(s32) = G_LOAD %0(p0)
-    ; CHECK-DAG: {{%[0-9]+}}:_(s16) = G_LOAD %0(p0)
-    ; CHECK-DAG: {{%[0-9]+}}:_(s8) = G_LOAD %0(p0)
-    ; CHECK-DAG: {{%[0-9]+}}:_(s1) = G_LOAD %0(p0)
-    ; CHECK-DAG: {{%[0-9]+}}:_(p0) = G_LOAD %0(p0)
     %0(p0) = COPY $r0
     %1(s64) = G_LOAD %0(p0) :: (load 8)
     G_STORE %1(s64), %0(p0) :: (store 8)
-    %2(s32) = G_LOAD %0(p0) :: (load 4)
-    G_STORE %2(s32), %0(p0) :: (store 4)
-    %3(s16) = G_LOAD %0(p0) :: (load 2)
-    G_STORE %3(s16), %0(p0) :: (store 2)
-    %4(s8) = G_LOAD %0(p0) :: (load 1)
-    G_STORE %4(s8), %0(p0) :: (store 1)
-    %5(s1) = G_LOAD %0(p0) :: (load 1)
-    G_STORE %5(s1), %0(p0) :: (store 1)
-    %6(p0) = G_LOAD %0(p0) :: (load 4)
-    G_STORE %6(p0), %0(p0) :: (store 4)
     BX_RET 14, $noreg
 ...
 ---
@@ -865,8 +189,8 @@ body:             |
     BX_RET 14, $noreg, implicit $r0
 ...
 ---
-name:            test_constants
-# CHECK-LABEL: name: test_constants
+name:            test_constants_s64
+# CHECK-LABEL: name: test_constants_s64
 legalized:       false
 # CHECK: legalized: true
 regBankSelected: false
@@ -877,55 +201,21 @@ registers:
   - { id: 1, class: _ }
   - { id: 2, class: _ }
   - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
 body:             |
   bb.0:
     liveins: $r0
 
-    %4(p0) = COPY $r0
-
-    %0(s32) = G_CONSTANT 42
-    ; CHECK: {{%[0-9]+}}:_(s32) = G_CONSTANT 42
-
-    %1(s16) = G_CONSTANT i16 21
-    G_STORE %1(s16), %4(p0) :: (store 2)
-    ; CHECK-NOT: G_CONSTANT i16
-    ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 21
-    ; CHECK: {{%[0-9]+}}:_(s16) = G_TRUNC [[EXT]](s32)
-    ; CHECK-NOT: G_CONSTANT i16
-
-    %2(s8) = G_CONSTANT i8 10
-    G_STORE %2(s8), %4(p0) :: (store 1)
-    ; CHECK-NOT: G_CONSTANT i8
-    ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
-    ; CHECK: {{%[0-9]+}}:_(s8) = G_TRUNC [[EXT]](s32)
-    ; CHECK-NOT: G_CONSTANT i8
-
-    %3(s1) = G_CONSTANT i1 1
-    G_STORE %3(s1), %4(p0) :: (store 1)
-    ; CHECK-NOT: G_CONSTANT i1
-    ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32)
-    ; CHECK-NOT: G_CONSTANT i1
-
-    %5(p0) = G_CONSTANT 0
-    G_STORE %5(p0), %4(p0) :: (store 4)
-    ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0
+    %0(p0) = COPY $r0
 
-    %6(s64) = G_CONSTANT i64 17179869200 ; = 4 * 2 ^ 32 + 16
-    %7(s32), %8(s32) = G_UNMERGE_VALUES %6(s64)
-    G_STORE %7(s32), %4(p0) :: (store 4)
-    G_STORE %8(s32), %4(p0) :: (store 4)
+    %1(s64) = G_CONSTANT i64 17179869200 ; = 4 * 2 ^ 32 + 16
+    %2(s32), %3(s32) = G_UNMERGE_VALUES %1(s64)
+    G_STORE %2(s32), %0(p0) :: (store 4)
+    G_STORE %3(s32), %0(p0) :: (store 4)
     ; CHECK-DAG: {{%[0-9]+}}:_(s32) = G_CONSTANT i32 4
     ; CHECK-DAG: {{%[0-9]+}}:_(s32) = G_CONSTANT i32 16
     ; CHECK-NOT: G_CONSTANT i64
 
-    $r0 = COPY %0(s32)
-    BX_RET 14, $noreg, implicit $r0
+    BX_RET 14, $noreg
 ...
 ---
 name:            test_icmp_s8
diff --git a/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll b/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll
index 14ddeda39f01d7f966fb48948f5c29249a77c871..3c3da0edd53e7884dace11c30146034d1850b8b7 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll
@@ -1,6 +1,7 @@
-; RUN: llc -mtriple arm-unknown -mattr=+vfp2,+v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LITTLE
-; RUN: llc -mtriple armeb-unknown -mattr=+vfp2,+v4t -global-isel -global-isel-abort=0 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=BIG
+; RUN: llc -mtriple arm-unknown -mattr=+vfp2,+v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=ARM -check-prefix=LITTLE
+; RUN: llc -mtriple armeb-unknown -mattr=+vfp2,+v4t -global-isel -global-isel-abort=0 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=ARM -check-prefix=BIG
 ; XFAIL: armeb
+; RUN: llc -mtriple thumb-unknown -mattr=+vfp2,+v6t2 -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LITTLE -check-prefix=THUMB
 
 declare arm_aapcscc i32* @simple_reg_params_target(i32, i32*)
 
@@ -11,11 +12,13 @@ define arm_aapcscc i32* @test_call_simple_reg_params(i32 *%a, i32 %b) {
 ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
 ; CHECK-DAG: $r0 = COPY [[BVREG]]
 ; CHECK-DAG: $r1 = COPY [[AVREG]]
-; CHECK: BL @simple_reg_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0
+; ARM: BL @simple_reg_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0
+; THUMB: tBL 14, $noreg, @simple_reg_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0
 ; CHECK: [[RVREG:%[0-9]+]]:_(p0) = COPY $r0
 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
 ; CHECK: $r0 = COPY [[RVREG]]
-; CHECK: BX_RET 14, $noreg, implicit $r0
+; ARM: BX_RET 14, $noreg, implicit $r0
+; THUMB: tBX_RET 14, $noreg, implicit $r0
 entry:
   %r = notail call arm_aapcscc i32 *@simple_reg_params_target(i32 %b, i32 *%a)
   ret i32 *%r
@@ -40,11 +43,13 @@ define arm_aapcscc i32* @test_call_simple_stack_params(i32 *%a, i32 %b) {
 ; CHECK: [[OFF2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; CHECK: [[FI2:%[0-9]+]]:_(p0) = G_GEP [[SP2]], [[OFF2]](s32)
 ; CHECK: G_STORE [[AVREG]](p0), [[FI2]](p0){{.*}}store 4
-; CHECK: BL @simple_stack_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
+; ARM: BL @simple_stack_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
+; THUMB: tBL 14, $noreg, @simple_stack_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
 ; CHECK: [[RVREG:%[0-9]+]]:_(p0) = COPY $r0
 ; CHECK: ADJCALLSTACKUP 8, 0, 14, $noreg, implicit-def $sp, implicit $sp
 ; CHECK: $r0 = COPY [[RVREG]]
-; CHECK: BX_RET 14, $noreg, implicit $r0
+; ARM: BX_RET 14, $noreg, implicit $r0
+; THUMB: tBX_RET 14, $noreg, implicit $r0
 entry:
   %r = notail call arm_aapcscc i32 *@simple_stack_params_target(i32 %b, i32 *%a, i32 %b, i32 *%a, i32 %b, i32 *%a)
   ret i32 *%r
@@ -94,13 +99,15 @@ define arm_aapcscc signext i16 @test_call_ext_params(i8 %a, i16 %b, i1 %c) {
 ; CHECK: [[FI5:%[0-9]+]]:_(p0) = G_GEP [[SP5]], [[OFF5]](s32)
 ; CHECK: [[ZEXTC:%[0-9]+]]:_(s32) = G_ZEXT [[CVREG]]
 ; CHECK: G_STORE [[ZEXTC]](s32), [[FI5]](p0){{.*}}store 4
-; CHECK: BL @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
+; ARM: BL @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
+; THUMB: tBL 14, $noreg, @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
 ; CHECK: [[R0VREG:%[0-9]+]]:_(s32) = COPY $r0
 ; CHECK: [[RVREG:%[0-9]+]]:_(s16) = G_TRUNC [[R0VREG]]
 ; CHECK: ADJCALLSTACKUP 20, 0, 14, $noreg, implicit-def $sp, implicit $sp
 ; CHECK: [[RExtVREG:%[0-9]+]]:_(s32) = G_SEXT [[RVREG]]
 ; CHECK: $r0 = COPY [[RExtVREG]]
-; CHECK: BX_RET 14, $noreg, implicit $r0
+; ARM: BX_RET 14, $noreg, implicit $r0
+; THUMB: tBX_RET 14, $noreg, implicit $r0
 entry:
   %r = notail call arm_aapcscc signext i16 @ext_target(i8 signext %a, i8 zeroext %a, i16 signext %b, i16 zeroext %b, i8 signext %a, i8 zeroext %a, i16 signext %b, i16 zeroext %b, i1 zeroext %c)
   ret i16 %r
@@ -115,11 +122,13 @@ define arm_aapcs_vfpcc double @test_call_vfpcc_fp_params(double %a, float %b) {
 ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
 ; CHECK-DAG: $s0 = COPY [[BVREG]]
 ; CHECK-DAG: $d1 = COPY [[AVREG]]
-; CHECK: BL @vfpcc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $d1, implicit-def $d0
+; ARM: BL @vfpcc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $d1, implicit-def $d0
+; THUMB: tBL 14, $noreg, @vfpcc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $d1, implicit-def $d0
 ; CHECK: [[RVREG:%[0-9]+]]:_(s64) = COPY $d0
 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
 ; CHECK: $d0 = COPY [[RVREG]]
-; CHECK: BX_RET 14, $noreg, implicit $d0
+; ARM: BX_RET 14, $noreg, implicit $d0
+; THUMB: tBX_RET 14, $noreg, implicit $d0
 entry:
   %r = notail call arm_aapcs_vfpcc double @vfpcc_fp_target(float %b, double %a)
   ret double %r
@@ -149,7 +158,8 @@ define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) {
 ; CHECK: [[OFF2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; CHECK: [[FI2:%[0-9]+]]:_(p0) = G_GEP [[SP2]], [[OFF2]](s32)
 ; CHECK: G_STORE [[AVREG]](s64), [[FI2]](p0){{.*}}store 8
-; CHECK: BL @aapcscc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
+; ARM: BL @aapcscc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
+; THUMB: tBL 14, $noreg, @aapcscc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
 ; CHECK-DAG: [[R1:%[0-9]+]]:_(s32) = COPY $r0
 ; CHECK-DAG: [[R2:%[0-9]+]]:_(s32) = COPY $r1
 ; LITTLE: [[RVREG:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[R1]](s32), [[R2]](s32)
@@ -160,7 +170,8 @@ define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) {
 ; LITTLE-DAG: $r1 = COPY [[R2]]
 ; BIG-DAG: $r0 = COPY [[R2]]
 ; BIG-DAG: $r1 = COPY [[R1]]
-; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1
+; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1
+; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1
 entry:
   %r = notail call arm_aapcscc double @aapcscc_fp_target(float %b, double %a, float %b, double %a)
   ret double %r
@@ -173,11 +184,13 @@ define arm_aapcs_vfpcc float @test_call_different_call_conv(float %x) {
 ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $s0
 ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
 ; CHECK: $r0 = COPY [[X]]
-; CHECK: BL @different_call_conv_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit-def $r0
+; ARM: BL @different_call_conv_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit-def $r0
+; THUMB: tBL 14, $noreg, @different_call_conv_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit-def $r0
 ; CHECK: [[R:%[0-9]+]]:_(s32) = COPY $r0
 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
 ; CHECK: $s0 = COPY [[R]]
-; CHECK: BX_RET 14, $noreg, implicit $s0
+; ARM: BX_RET 14, $noreg, implicit $s0
+; THUMB: tBX_RET 14, $noreg, implicit $s0
 entry:
   %r = notail call arm_aapcscc float @different_call_conv_target(float %x)
   ret float %r
@@ -200,7 +213,8 @@ define arm_aapcscc [3 x i32] @test_tiny_int_arrays([2 x i32] %arr) {
 ; CHECK: [[R0:%[0-9]+]]:_(s32), [[R1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INS2]](s64)
 ; CHECK: $r0 = COPY [[R0]]
 ; CHECK: $r1 = COPY [[R1]]
-; CHECK: BL @tiny_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1
+; ARM: BL @tiny_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1
+; THUMB: tBL 14, $noreg, @tiny_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1
 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $r0
 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $r1
 ; CHECK: [[R2:%[0-9]+]]:_(s32) = COPY $r2
@@ -215,7 +229,8 @@ define arm_aapcscc [3 x i32] @test_tiny_int_arrays([2 x i32] %arr) {
 ; CHECK: $r0 = COPY [[EXT3]]
 ; CHECK: $r1 = COPY [[EXT4]]
 ; CHECK: $r2 = COPY [[EXT5]]
-; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1, implicit $r2
+; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1, implicit $r2
+; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1, implicit $r2
 entry:
   %r = notail call arm_aapcscc [3 x i32] @tiny_int_arrays_target([2 x i32] %arr)
   ret [3 x i32] %r
@@ -249,9 +264,11 @@ define arm_aapcscc void @test_multiple_int_arrays([2 x i32] %arr0, [2 x i32] %ar
 ; CHECK: $r1 = COPY [[R1]]
 ; CHECK: $r2 = COPY [[R2]]
 ; CHECK: $r3 = COPY [[R3]]
-; CHECK: BL @multiple_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
+; ARM: BL @multiple_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
+; THUMB: tBL 14, $noreg, @multiple_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp
-; CHECK: BX_RET 14, $noreg
+; ARM: BX_RET 14, $noreg
+; THUMB: tBX_RET 14, $noreg
 entry:
   notail call arm_aapcscc void @multiple_int_arrays_target([2 x i32] %arr0, [2 x i32] %arr1)
   ret void
@@ -293,9 +310,11 @@ define arm_aapcscc void @test_large_int_arrays([20 x i32] %arr) {
 ; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]]:_(s32) = G_CONSTANT i32 60
 ; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]]:_(p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32)
 ; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4
-; CHECK: BL @large_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
+; ARM: BL @large_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
+; THUMB: tBL 14, $noreg, @large_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
 ; CHECK: ADJCALLSTACKUP 64, 0, 14, $noreg, implicit-def $sp, implicit $sp
-; CHECK: BX_RET 14, $noreg
+; ARM: BX_RET 14, $noreg
+; THUMB: tBX_RET 14, $noreg
 entry:
   notail call arm_aapcscc void @large_int_arrays_target([20 x i32] %arr)
   ret void
@@ -342,7 +361,8 @@ define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) {
 ; CHECK: [[ARR2_OFFSET:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; CHECK: [[ARR2_ADDR:%[0-9]+]]:_(p0) = G_GEP [[SP]], [[ARR2_OFFSET]](s32)
 ; CHECK: G_STORE [[ARR2]](s64), [[ARR2_ADDR]](p0){{.*}}store 8
-; CHECK: BL @fp_arrays_aapcs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
+; ARM: BL @fp_arrays_aapcs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
+; THUMB: tBL 14, $noreg, @fp_arrays_aapcs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $r0
 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $r1
 ; CHECK: [[R_MERGED:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32)
@@ -351,7 +371,8 @@ define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) {
 ; CHECK: [[EXT5:%[0-9]+]]:_(s32) = G_EXTRACT [[R_MERGED]](s64), 32
 ; CHECK: $r0 = COPY [[EXT4]]
 ; CHECK: $r1 = COPY [[EXT5]]
-; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1
+; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1
+; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1
 entry:
   %r = notail call arm_aapcscc [2 x float] @fp_arrays_aapcs_target([3 x double] %arr)
   ret [2 x float] %r
@@ -433,7 +454,8 @@ define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3
 ; CHECK: [[Z3_OFFSET:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
 ; CHECK: [[Z3_ADDR:%[0-9]+]]:_(p0) = G_GEP [[SP]], [[Z3_OFFSET]](s32)
 ; CHECK: G_STORE [[Z3]](s64), [[Z3_ADDR]](p0){{.*}}store 8
-; CHECK: BL @fp_arrays_aapcs_vfp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit $d2, implicit $s6, implicit $s7, implicit $s8, implicit-def $s0, implicit-def $s1, implicit-def $s2, implicit-def $s3
+; ARM: BL @fp_arrays_aapcs_vfp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit $d2, implicit $s6, implicit $s7, implicit $s8, implicit-def $s0, implicit-def $s1, implicit-def $s2, implicit-def $s3
+; THUMB: tBL 14, $noreg, @fp_arrays_aapcs_vfp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit $d2, implicit $s6, implicit $s7, implicit $s8, implicit-def $s0, implicit-def $s1, implicit-def $s2, implicit-def $s3
 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $s0
 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $s1
 ; CHECK: [[R2:%[0-9]+]]:_(s32) = COPY $s2
@@ -448,7 +470,8 @@ define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3
 ; CHECK: $s1 = COPY [[EXT12]]
 ; CHECK: $s2 = COPY [[EXT13]]
 ; CHECK: $s3 = COPY [[EXT14]]
-; CHECK: BX_RET 14, $noreg, implicit $s0, implicit $s1, implicit $s2, implicit $s3
+; ARM: BX_RET 14, $noreg, implicit $s0, implicit $s1, implicit $s2, implicit $s3
+; THUMB: tBX_RET 14, $noreg, implicit $s0, implicit $s1, implicit $s2, implicit $s3
 entry:
   %r = notail call arm_aapcs_vfpcc [4 x float] @fp_arrays_aapcs_vfp_target([3 x double] %x, [3 x float] %y, [4 x double] %z)
   ret [4 x float] %r
@@ -490,7 +513,8 @@ define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) {
 ; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]]:_(s32) = G_CONSTANT i32 76
 ; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]]:_(p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32)
 ; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4
-; CHECK: BL @tough_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
+; ARM: BL @tough_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
+; THUMB: tBL 14, $noreg, @tough_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $r0
 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $r1
 ; CHECK: [[RES_ARR:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32)
@@ -499,7 +523,8 @@ define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) {
 ; CHECK: [[EXT2:%[0-9]+]]:_(p0) = G_EXTRACT [[RES_ARR]](s64), 32
 ; CHECK: $r0 = COPY [[EXT1]]
 ; CHECK: $r1 = COPY [[EXT2]]
-; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1
+; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1
+; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1
 entry:
   %r = notail call arm_aapcscc [2 x i32*] @tough_arrays_target([6 x [4 x i32]] %arr)
   ret [2 x i32*] %r
@@ -522,7 +547,8 @@ define arm_aapcscc {i32, i32} @test_structs({i32, i32} %x) {
 ; CHECK: [[X0:%[0-9]+]]:_(s32), [[X1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INS2]](s64)
 ; CHECK-DAG: $r0 = COPY [[X0]](s32)
 ; CHECK-DAG: $r1 = COPY [[X1]](s32)
-; CHECK: BL @structs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1
+; ARM: BL @structs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1
+; THUMB: tBL 14, $noreg, @structs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1
 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $r0
 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $r1
 ; CHECK: [[R:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32)
@@ -531,7 +557,8 @@ define arm_aapcscc {i32, i32} @test_structs({i32, i32} %x) {
 ; CHECK: [[EXT4:%[0-9]+]]:_(s32) = G_EXTRACT [[R]](s64), 32
 ; CHECK: $r0 = COPY [[EXT3]](s32)
 ; CHECK: $r1 = COPY [[EXT4]](s32)
-; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1
+; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1
+; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1
   %r = notail call arm_aapcscc {i32, i32} @structs_target({i32, i32} %x)
   ret {i32, i32} %r
 }
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index 4634e5a0d9dfa5095af4b30337a93d7664a6947c..281218192a18826c0b1322fbe3394089693c24ac 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -27,6 +27,8 @@
   define void @test_inttoptr_s32() { ret void }
   define void @test_ptrtoint_s32() { ret void }
 
+  define void @test_ctlz_s32() #3 { ret void }
+
   @a_global = global float 1.0
   define void @test_globals() { ret void }
 
@@ -83,6 +85,7 @@
   attributes #0 = { "target-features"="+vfp2"}
   attributes #1 = { "target-features"="+hwdiv-arm" }
   attributes #2 = { "target-features"="+vfp4"}
+  attributes #3 = { "target-features"="+v5t"}
 ...
 ---
 name:            test_add_s32
@@ -561,6 +564,25 @@ body:             |
     BX_RET 14, $noreg, implicit $r0
 ...
 ---
+name:            test_ctlz_s32
+# CHECK-LABEL: name: test_ctlz_s32
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.0:
+    %0(s32) = COPY $r0
+    %1(s32) = G_CTLZ %0(s32)
+    $r0 = COPY %1(s32)
+    BX_RET 14, $noreg, implicit $r0
+...
+---
 name:            test_globals
 # CHECK-LABEL: name: test_globals
 legalized:       true
diff --git a/test/CodeGen/ARM/GlobalISel/arm-select-globals-ropi-rwpi.mir b/test/CodeGen/ARM/GlobalISel/arm-select-globals-ropi-rwpi.mir
index fa15f09497e51c3c50c6b9ffe80a8222176ccbd4..dc017cc2654e5d6a7735d6689a75c7fac6a7b545 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-select-globals-ropi-rwpi.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-select-globals-ropi-rwpi.mir
@@ -1,9 +1,9 @@
-# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RW-DEFAULT-MOVT,RW-DEFAULT,ROPI-MOVT,ROPI
-# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RW-DEFAULT-NOMOVT,RW-DEFAULT,ROPI-NOMOVT,ROPI
-# RUN: llc -O0 -mtriple arm-linux -relocation-model=rwpi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-MOVT,RWPI,RO-DEFAULT-MOVT,RO-DEFAULT
-# RUN: llc -O0 -mtriple arm-linux -relocation-model=rwpi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-NOMOVT,RWPI,RO-DEFAULT-NOMOVT,RO-DEFAULT
-# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi-rwpi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-MOVT,RWPI,ROPI-MOVT,ROPI
-# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi-rwpi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-NOMOVT,RWPI,ROPI-NOMOVT,ROPI
+# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RW-DEFAULT-MOVT,ROPI-MOVT
+# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RW-DEFAULT-NOMOVT,ROPI-NOMOVT
+# RUN: llc -O0 -mtriple arm-linux -relocation-model=rwpi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-MOVT,RWPI,RO-DEFAULT-MOVT
+# RUN: llc -O0 -mtriple arm-linux -relocation-model=rwpi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-NOMOVT,RWPI,RO-DEFAULT-NOMOVT
+# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi-rwpi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-MOVT,RWPI,ROPI-MOVT
+# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi-rwpi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-NOMOVT,RWPI,ROPI-NOMOVT
 --- |
   @internal_global = internal global i32 42
   define void @test_internal_global() { ret void }
diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-arithmetic-ops.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-arithmetic-ops.mir
new file mode 100644
index 0000000000000000000000000000000000000000..cf42a815cd4b8b4668da0f1324b21865a09d1e4b
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/thumb-select-arithmetic-ops.mir
@@ -0,0 +1,251 @@
+# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  define void @test_add_regs() { ret void }
+  define void @test_add_fold_imm() { ret void }
+  define void @test_add_fold_imm12() { ret void }
+  define void @test_add_no_fold_imm() { ret void }
+
+  define void @test_sub_imm_lhs() { ret void }
+  define void @test_sub_imm_rhs() { ret void }
+
+  define void @test_mul() { ret void }
+  define void @test_mla() { ret void }
+...
+---
+name:            test_add_regs
+# CHECK-LABEL: name: test_add_regs
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY $r0
+
+    %1(s32) = COPY $r1
+    ; CHECK: [[VREGY:%[0-9]+]]:rgpr = COPY $r1
+
+    %2(s32) = G_ADD %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2ADDrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_add_fold_imm
+# CHECK-LABEL: name: test_add_fold_imm
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c
+    %2(s32) = G_ADD %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2ADDri [[VREGX]], 786444, 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_add_fold_imm12
+# CHECK-LABEL: name: test_add_fold_imm12
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 4093
+    %2(s32) = G_ADD %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2ADDri12 [[VREGX]], 4093, 14, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_add_no_fold_imm
+# CHECK-LABEL: name: test_add_no_fold_imm
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 185470479 ; 0x0b0e0e0f
+    ; CHECK: [[VREGY:%[0-9]+]]:rgpr = t2MOVi32imm 185470479
+
+    %2(s32) = G_ADD %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2ADDrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_sub_imm_lhs
+# CHECK-LABEL: name: test_sub_imm_lhs
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c
+    %2(s32) = G_SUB %1, %0
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2RSBri [[VREGX]], 786444, 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_sub_imm_rhs
+# CHECK-LABEL: name: test_sub_imm_rhs
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c
+    %2(s32) = G_SUB %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2SUBri [[VREGX]], 786444, 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_mul
+# CHECK-LABEL: name: test_mul
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+
+    %1(s32) = COPY $r1
+    ; CHECK: [[VREGY:%[0-9]+]]:rgpr = COPY $r1
+
+    %2(s32) = G_MUL %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MUL [[VREGX]], [[VREGY]], 14, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_mla
+# CHECK-LABEL: name: test_mla
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+  - { id: 3, class: gprb }
+  - { id: 4, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0, $r1, $r2
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+
+    %1(s32) = COPY $r1
+    ; CHECK: [[VREGY:%[0-9]+]]:rgpr = COPY $r1
+
+    %2(s32) = COPY $r2
+    ; CHECK: [[VREGZ:%[0-9]+]]:rgpr = COPY $r2
+
+    %3(s32) = G_MUL %0, %1
+    %4(s32) = G_ADD %3, %2
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MLA [[VREGX]], [[VREGY]], [[VREGZ]], 14, $noreg
+
+    $r0 = COPY %4(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-casts.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-casts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..df5417e7a917ef76d126383acd1bc8bd2e039e1a
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/thumb-select-casts.mir
@@ -0,0 +1,51 @@
+# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  define void @test_inttoptr_s32() { ret void }
+  define void @test_ptrtoint_s32() { ret void }
+...
+---
+name:            test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    %1(p0) = G_INTTOPTR %0(s32)
+    ; CHECK: [[INT:%[0-9]+]]:gpr = COPY $r0
+
+    $r0 = COPY %1(p0)
+    ; CHECK: $r0 = COPY [[INT]]
+
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(p0) = COPY $r0
+    %1(s32) = G_PTRTOINT %0(p0)
+    ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY $r0
+
+    $r0 = COPY %1(s32)
+    ; CHECK: $r0 = COPY [[PTR]]
+
+    BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-exts.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-exts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..365a614d6ec397b84286deefc26339aa5fff2bd0
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/thumb-select-exts.mir
@@ -0,0 +1,288 @@
+# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  define void @test_trunc_and_zext_s1() { ret void }
+  define void @test_trunc_and_sext_s1() { ret void }
+  define void @test_trunc_and_anyext_s1() { ret void }
+
+  define void @test_trunc_and_zext_s8() { ret void }
+  define void @test_trunc_and_sext_s8() { ret void }
+  define void @test_trunc_and_anyext_s8() { ret void }
+
+  define void @test_trunc_and_zext_s16() { ret void }
+  define void @test_trunc_and_sext_s16() { ret void }
+  define void @test_trunc_and_anyext_s16() { ret void }
+...
+---
+name:            test_trunc_and_zext_s1
+# CHECK-LABEL: name: test_trunc_and_zext_s1
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s1) = G_TRUNC %0(s32)
+
+    %2(s32) = G_ZEXT %1(s1)
+    ; CHECK: [[RVREG:%[0-9]+]]:rgpr = COPY [[VREG]]
+    ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2ANDri [[RVREG]], 1, 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGEXT]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_trunc_and_sext_s1
+# CHECK-LABEL: name: test_trunc_and_sext_s1
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s1) = G_TRUNC %0(s32)
+
+    %2(s32) = G_SEXT %1(s1)
+    ; CHECK: [[RVREG:%[0-9]+]]:rgpr = COPY [[VREG]]
+    ; CHECK: [[VREGAND:%[0-9]+]]:rgpr = t2ANDri [[RVREG]], 1, 14, $noreg, $noreg
+    ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2RSBri [[VREGAND]], 0, 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGEXT]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_trunc_and_anyext_s1
+# CHECK-LABEL: name: test_trunc_and_anyext_s1
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s1) = G_TRUNC %0(s32)
+
+    %2(s32) = G_ANYEXT %1(s1)
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREG]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_trunc_and_zext_s8
+# CHECK-LABEL: name: test_trunc_and_zext_s8
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s8) = G_TRUNC %0(s32)
+    ; CHECK: [[VREGTRUNC:%[0-9]+]]:rgpr = COPY [[VREG]]
+
+    %2(s32) = G_ZEXT %1(s8)
+    ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2UXTB [[VREGTRUNC]], 0, 14, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGEXT]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_trunc_and_sext_s8
+# CHECK-LABEL: name: test_trunc_and_sext_s8
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s8) = G_TRUNC %0(s32)
+    ; CHECK: [[VREGTRUNC:%[0-9]+]]:rgpr = COPY [[VREG]]
+
+    %2(s32) = G_SEXT %1(s8)
+    ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2SXTB [[VREGTRUNC]], 0, 14, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGEXT]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_trunc_and_anyext_s8
+# CHECK-LABEL: name: test_trunc_and_anyext_s8
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s8) = G_TRUNC %0(s32)
+
+    %2(s32) = G_ANYEXT %1(s8)
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREG]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_trunc_and_zext_s16
+# CHECK-LABEL: name: test_trunc_and_zext_s16
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s16) = G_TRUNC %0(s32)
+    ; CHECK: [[VREGTRUNC:%[0-9]+]]:rgpr = COPY [[VREG]]
+
+    %2(s32) = G_ZEXT %1(s16)
+    ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2UXTH [[VREGTRUNC]], 0, 14, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGEXT]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_trunc_and_sext_s16
+# CHECK-LABEL: name: test_trunc_and_sext_s16
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s16) = G_TRUNC %0(s32)
+    ; CHECK: [[VREGTRUNC:%[0-9]+]]:rgpr = COPY [[VREG]]
+
+    %2(s32) = G_SEXT %1(s16)
+    ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2SXTH [[VREGTRUNC]], 0, 14, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGEXT]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_trunc_and_anyext_s16
+# CHECK-LABEL: name: test_trunc_and_anyext_s16
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0
+
+    %1(s16) = G_TRUNC %0(s32)
+
+    %2(s32) = G_ANYEXT %1(s16)
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREG]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-imm.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-imm.mir
new file mode 100644
index 0000000000000000000000000000000000000000..4979491aca03e89d641bc0ec03972cb0bd99e446
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/thumb-select-imm.mir
@@ -0,0 +1,66 @@
+# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  define void @test_movi() { ret void }
+  define void @test_movi16() { ret void }
+  define void @test_movi32() { ret void }
+...
+---
+name:            test_movi
+# CHECK-LABEL: name: test_movi
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+body:             |
+  bb.0:
+    %0(s32) = G_CONSTANT i32 786444 ; 0x000c000c
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MOVi 786444, 14, $noreg, $noreg
+
+    $r0 = COPY %0(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_movi16
+# CHECK-LABEL: name: test_movi16
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+body:             |
+  bb.0:
+    %0(s32) = G_CONSTANT i32 65533
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MOVi16 65533, 14, $noreg
+
+    $r0 = COPY %0(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_movi32
+# CHECK-LABEL: name: test_movi32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+body:             |
+  bb.0:
+    %0(s32) = G_CONSTANT i32 185470479 ; 0x0b0e0e0f
+    ; CHECK: [[VREGY:%[0-9]+]]:rgpr = t2MOVi32imm 185470479
+
+    $r0 = COPY %0(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-load-store.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-load-store.mir
new file mode 100644
index 0000000000000000000000000000000000000000..170c26e996cb512731422e4a85431360416a807b
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/thumb-select-load-store.mir
@@ -0,0 +1,84 @@
+# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  define void @test_s8() { ret void }
+  define void @test_s16() { ret void }
+  define void @test_s32() { ret void }
+...
+---
+name:            test_s8
+# CHECK-LABEL: name: test_s8
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(p0) = COPY $r0
+    ; CHECK: %[[P:[0-9]+]]:gpr = COPY $r0
+
+    %1(s8) = G_LOAD %0(p0) :: (load 1)
+    ; CHECK: %[[V:[0-9]+]]:rgpr = t2LDRBi12 %[[P]], 0, 14, $noreg :: (load 1)
+
+    G_STORE %1(s8), %0(p0) :: (store 1)
+    ; CHECK: t2STRBi12 %[[V]], %[[P]], 0, 14, $noreg :: (store 1)
+
+    BX_RET 14, $noreg
+    ; CHECK: BX_RET 14, $noreg
+...
+---
+name:            test_s16
+# CHECK-LABEL: name: test_s16
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(p0) = COPY $r0
+    ; CHECK: %[[P:[0-9]+]]:gpr = COPY $r0
+
+    %1(s16) = G_LOAD %0(p0) :: (load 2)
+    ; CHECK: %[[V:[0-9]+]]:rgpr = t2LDRHi12 %[[P]], 0, 14, $noreg :: (load 2)
+
+    G_STORE %1(s16), %0(p0) :: (store 2)
+    ; CHECK: t2STRHi12 %[[V]], %[[P]], 0, 14, $noreg :: (store 2)
+
+    BX_RET 14, $noreg
+    ; CHECK: BX_RET 14, $noreg
+...
+---
+name:            test_s32
+# CHECK-LABEL: name: test_s32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(p0) = COPY $r0
+    ; CHECK: %[[P:[0-9]+]]:gpr = COPY $r0
+
+    %1(s32) = G_LOAD %0(p0) :: (load 4)
+    ; CHECK: %[[V:[0-9]+]]:gpr = t2LDRi12 %[[P]], 0, 14, $noreg :: (load 4)
+
+    G_STORE %1(s32), %0(p0) :: (store 4)
+    ; CHECK: t2STRi12 %[[V]], %[[P]], 0, 14, $noreg :: (store 4)
+
+    BX_RET 14, $noreg
+    ; CHECK: BX_RET 14, $noreg
+...
diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-logical-ops.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-logical-ops.mir
new file mode 100644
index 0000000000000000000000000000000000000000..d63c59942b784ab7c681f577270a00d0df2202be
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/thumb-select-logical-ops.mir
@@ -0,0 +1,219 @@
+# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  define void @test_and_regs() { ret void }
+  define void @test_and_imm() { ret void }
+
+  define void @test_bfc() { ret void }
+  define void @test_no_bfc_bad_mask() { ret void }
+
+  define void @test_mvn() { ret void }
+  define void @test_bic() { ret void }
+  define void @test_orn() { ret void }
+...
+---
+name:            test_and_regs
+# CHECK-LABEL: name: test_and_regs
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+
+    %1(s32) = COPY $r1
+    ; CHECK: [[VREGY:%[0-9]+]]:rgpr = COPY $r1
+
+    %2(s32) = G_AND %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2ANDrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_and_imm
+# CHECK-LABEL: name: test_and_imm
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c
+    %2(s32) = G_AND %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2ANDri [[VREGX]], 786444, 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_bfc
+# CHECK-LABEL: name: test_bfc
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 -65529 ; 0xFFFF0007
+    %2(s32) = G_AND %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2BFC [[VREGX]], -65529, 14, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_no_bfc_bad_mask
+# CHECK-LABEL: name: test_no_bfc_bad_mask
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c
+    %2(s32) = G_AND %0, %1
+    ; CHECK-NOT: t2BFC
+
+    $r0 = COPY %2(s32)
+
+    BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_mvn
+# CHECK-LABEL: name: test_mvn
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0
+
+    %0(s32) = COPY $r0
+    ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+
+    %1(s32) = G_CONSTANT i32 -1
+    %2(s32) = G_XOR %0, %1
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MVNr [[VREGX]], 14, $noreg, $noreg
+
+    $r0 = COPY %2(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_bic
+# CHECK-LABEL: name: test_bic
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+  - { id: 3, class: gprb }
+  - { id: 4, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    %1(s32) = COPY $r1
+    ; CHECK-DAG: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+    ; CHECK-DAG: [[VREGY:%[0-9]+]]:rgpr = COPY $r1
+
+    %2(s32) = G_CONSTANT i32 -1
+    %3(s32) = G_XOR %1, %2
+
+    %4(s32) = G_AND %0, %3
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2BICrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg
+
+    $r0 = COPY %4(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
+---
+name:            test_orn
+# CHECK-LABEL: name: test_orn
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+  - { id: 3, class: gprb }
+  - { id: 4, class: gprb }
+body:             |
+  bb.0:
+    liveins: $r0, $r1
+
+    %0(s32) = COPY $r0
+    %1(s32) = COPY $r1
+    ; CHECK-DAG: [[VREGX:%[0-9]+]]:rgpr = COPY $r0
+    ; CHECK-DAG: [[VREGY:%[0-9]+]]:rgpr = COPY $r1
+
+    %2(s32) = G_CONSTANT i32 -1
+    %3(s32) = G_XOR %1, %2
+
+    %4(s32) = G_OR %0, %3
+    ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2ORNrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg
+
+    $r0 = COPY %4(s32)
+    ; CHECK: $r0 = COPY [[VREGRES]]
+
+    BX_RET 14, $noreg, implicit $r0
+    ; CHECK: BX_RET 14, $noreg, implicit $r0
+...
diff --git a/test/CodeGen/ARM/alloca-align.ll b/test/CodeGen/ARM/alloca-align.ll
index 6186d137ef7fdd64421a03db1b8cffa603b70fc0..3326d361c07f07111e522e48d10dc655a1402df5 100644
--- a/test/CodeGen/ARM/alloca-align.ll
+++ b/test/CodeGen/ARM/alloca-align.ll
@@ -12,8 +12,7 @@ declare void @bar(i32*, [20000 x i8]* byval)
 ; And a base pointer getting used.
 ; CHECK: mov r6, sp
 ; Which is passed to the call
-; CHECK: add [[REG:r[0-9]+|lr]], r6, #19456
-; CHECK: add r0, [[REG]], #536
+; CHECK: mov r0, r6
 ; CHECK: bl bar
 define void @foo([20000 x i8]* %addr) {
   %tmp = alloca [4 x i32], align 32
diff --git a/test/CodeGen/ARM/arm-storebytesmerge.ll b/test/CodeGen/ARM/arm-storebytesmerge.ll
index edc25302f7c5701a8798e67594539d9be80de224..00c5914b34b8bd6aca2652a34c1eafd0d90597a3 100644
--- a/test/CodeGen/ARM/arm-storebytesmerge.ll
+++ b/test/CodeGen/ARM/arm-storebytesmerge.ll
@@ -8,101 +8,95 @@ target triple = "thumbv7em-arm-none-eabi"
 define arm_aapcs_vfpcc void @test(i8* %v50) #0 {
 ; CHECK-LABEL: test:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    movw r1, #35722
-; CHECK-NEXT:    movt r1, #36236
-; CHECK-NEXT:    str.w r1, [r0, #394]
-; CHECK-NEXT:    movw r1, #36750
-; CHECK-NEXT:    movt r1, #37264
-; CHECK-NEXT:    str.w r1, [r0, #398]
-; CHECK-NEXT:    movw r1, #37778
-; CHECK-NEXT:    movt r1, #38292
-; CHECK-NEXT:    str.w r1, [r0, #402]
-; CHECK-NEXT:    movw r1, #38806
-; CHECK-NEXT:    movt r1, #39320
-; CHECK-NEXT:    str.w r1, [r0, #406]
-; CHECK-NEXT:    movw r1, #39834
-; CHECK-NEXT:    strh.w r1, [r0, #410]
-; CHECK-NEXT:    movw r1, #40348
-; CHECK-NEXT:    movt r1, #40862
-; CHECK-NEXT:    str.w r1, [r0, #412]
-; CHECK-NEXT:    movw r1, #41376
-; CHECK-NEXT:    movt r1, #41890
-; CHECK-NEXT:    str.w r1, [r0, #416]
-; CHECK-NEXT:    movw r1, #42404
-; CHECK-NEXT:    movt r1, #42918
-; CHECK-NEXT:    str.w r1, [r0, #420]
-; CHECK-NEXT:    movw r1, #43432
-; CHECK-NEXT:    movt r1, #43946
-; CHECK-NEXT:    str.w r1, [r0, #424]
-; CHECK-NEXT:    movw r1, #44460
-; CHECK-NEXT:    movt r1, #44974
-; CHECK-NEXT:    str.w r1, [r0, #428]
-; CHECK-NEXT:    movw r1, #45488
-; CHECK-NEXT:    strh.w r1, [r0, #432]
+; CHECK-NEXT:    movw r1, #65534
+; CHECK-NEXT:    strh.w r1, [r0, #510]
+; CHECK-NEXT:    movw r1, #64506
+; CHECK-NEXT:    movt r1, #65020
+; CHECK-NEXT:    str.w r1, [r0, #506]
+; CHECK-NEXT:    movw r1, #63478
+; CHECK-NEXT:    movt r1, #63992
+; CHECK-NEXT:    str.w r1, [r0, #502]
+; CHECK-NEXT:    movw r1, #62450
+; CHECK-NEXT:    movt r1, #62964
+; CHECK-NEXT:    str.w r1, [r0, #498]
+; CHECK-NEXT:    movw r1, #61422
+; CHECK-NEXT:    movt r1, #61936
+; CHECK-NEXT:    str.w r1, [r0, #494]
+; CHECK-NEXT:    movw r1, #60394
+; CHECK-NEXT:    movt r1, #60908
+; CHECK-NEXT:    str.w r1, [r0, #490]
+; CHECK-NEXT:    movw r1, #59366
+; CHECK-NEXT:    movt r1, #59880
+; CHECK-NEXT:    str.w r1, [r0, #486]
+; CHECK-NEXT:    movw r1, #58338
+; CHECK-NEXT:    movt r1, #58852
+; CHECK-NEXT:    str.w r1, [r0, #482]
+; CHECK-NEXT:    movw r1, #57310
+; CHECK-NEXT:    movt r1, #57824
+; CHECK-NEXT:    str.w r1, [r0, #478]
+; CHECK-NEXT:    movw r1, #56282
+; CHECK-NEXT:    movt r1, #56796
+; CHECK-NEXT:    str.w r1, [r0, #474]
+; CHECK-NEXT:    movw r1, #55254
+; CHECK-NEXT:    movt r1, #55768
+; CHECK-NEXT:    str.w r1, [r0, #470]
+; CHECK-NEXT:    movw r1, #54226
+; CHECK-NEXT:    movt r1, #54740
+; CHECK-NEXT:    str.w r1, [r0, #466]
+; CHECK-NEXT:    movw r1, #53198
+; CHECK-NEXT:    movt r1, #53712
+; CHECK-NEXT:    str.w r1, [r0, #462]
+; CHECK-NEXT:    movw r1, #52170
+; CHECK-NEXT:    movt r1, #52684
+; CHECK-NEXT:    str.w r1, [r0, #458]
+; CHECK-NEXT:    movw r1, #51142
+; CHECK-NEXT:    movt r1, #51656
+; CHECK-NEXT:    str.w r1, [r0, #454]
+; CHECK-NEXT:    movw r1, #50114
+; CHECK-NEXT:    movt r1, #50628
+; CHECK-NEXT:    str.w r1, [r0, #450]
+; CHECK-NEXT:    movw r1, #49086
+; CHECK-NEXT:    movt r1, #49600
+; CHECK-NEXT:    str.w r1, [r0, #446]
+; CHECK-NEXT:    movw r1, #48058
+; CHECK-NEXT:    movt r1, #48572
+; CHECK-NEXT:    str.w r1, [r0, #442]
+; CHECK-NEXT:    movw r1, #47030
+; CHECK-NEXT:    movt r1, #47544
+; CHECK-NEXT:    str.w r1, [r0, #438]
 ; CHECK-NEXT:    movw r1, #46002
 ; CHECK-NEXT:    movt r1, #46516
 ; CHECK-NEXT:    str.w r1, [r0, #434]
-; CHECK-NEXT:    movw r1, #47030
-; CHECK-NEXT:    strh.w r1, [r0, #438]
-; CHECK-NEXT:    movw r1, #47544
-; CHECK-NEXT:    movt r1, #48058
-; CHECK-NEXT:    str.w r1, [r0, #440]
-; CHECK-NEXT:    movw r1, #48572
-; CHECK-NEXT:    movt r1, #49086
-; CHECK-NEXT:    str.w r1, [r0, #444]
-; CHECK-NEXT:    movw r1, #49600
-; CHECK-NEXT:    strh.w r1, [r0, #448]
-; CHECK-NEXT:    movs r1, #194
-; CHECK-NEXT:    strb.w r1, [r0, #450]
-; CHECK-NEXT:    movw r1, #50371
-; CHECK-NEXT:    movt r1, #50885
-; CHECK-NEXT:    str.w r1, [r0, #451]
-; CHECK-NEXT:    movw r1, #51399
-; CHECK-NEXT:    movt r1, #51913
-; CHECK-NEXT:    str.w r1, [r0, #455]
-; CHECK-NEXT:    movw r1, #52427
-; CHECK-NEXT:    movt r1, #52941
-; CHECK-NEXT:    str.w r1, [r0, #459]
-; CHECK-NEXT:    movw r1, #53455
-; CHECK-NEXT:    movt r1, #53969
-; CHECK-NEXT:    str.w r1, [r0, #463]
-; CHECK-NEXT:    movw r1, #54483
-; CHECK-NEXT:    strh.w r1, [r0, #467]
-; CHECK-NEXT:    movw r1, #54997
-; CHECK-NEXT:    movt r1, #55511
-; CHECK-NEXT:    str.w r1, [r0, #469]
-; CHECK-NEXT:    movw r1, #56025
-; CHECK-NEXT:    movt r1, #56539
-; CHECK-NEXT:    str.w r1, [r0, #473]
-; CHECK-NEXT:    movw r1, #57053
-; CHECK-NEXT:    movt r1, #57567
-; CHECK-NEXT:    str.w r1, [r0, #477]
-; CHECK-NEXT:    movw r1, #58081
-; CHECK-NEXT:    movt r1, #58595
-; CHECK-NEXT:    str.w r1, [r0, #481]
-; CHECK-NEXT:    movw r1, #59109
-; CHECK-NEXT:    movt r1, #59623
-; CHECK-NEXT:    str.w r1, [r0, #485]
-; CHECK-NEXT:    movw r1, #60137
-; CHECK-NEXT:    strh.w r1, [r0, #489]
-; CHECK-NEXT:    movw r1, #60651
-; CHECK-NEXT:    movt r1, #61165
-; CHECK-NEXT:    str.w r1, [r0, #491]
-; CHECK-NEXT:    movw r1, #61679
-; CHECK-NEXT:    strh.w r1, [r0, #495]
-; CHECK-NEXT:    movw r1, #62193
-; CHECK-NEXT:    movt r1, #62707
-; CHECK-NEXT:    str.w r1, [r0, #497]
-; CHECK-NEXT:    movw r1, #63221
-; CHECK-NEXT:    movt r1, #63735
-; CHECK-NEXT:    str.w r1, [r0, #501]
-; CHECK-NEXT:    movw r1, #64249
-; CHECK-NEXT:    strh.w r1, [r0, #505]
-; CHECK-NEXT:    movs r1, #251
-; CHECK-NEXT:    strb.w r1, [r0, #507]
-; CHECK-NEXT:    movw r1, #65020
-; CHECK-NEXT:    movt r1, #65534
-; CHECK-NEXT:    str.w r1, [r0, #508]
+; CHECK-NEXT:    movw r1, #44974
+; CHECK-NEXT:    movt r1, #45488
+; CHECK-NEXT:    str.w r1, [r0, #430]
+; CHECK-NEXT:    movw r1, #43946
+; CHECK-NEXT:    movt r1, #44460
+; CHECK-NEXT:    str.w r1, [r0, #426]
+; CHECK-NEXT:    movw r1, #42918
+; CHECK-NEXT:    movt r1, #43432
+; CHECK-NEXT:    str.w r1, [r0, #422]
+; CHECK-NEXT:    movw r1, #41890
+; CHECK-NEXT:    movt r1, #42404
+; CHECK-NEXT:    str.w r1, [r0, #418]
+; CHECK-NEXT:    movw r1, #40862
+; CHECK-NEXT:    movt r1, #41376
+; CHECK-NEXT:    str.w r1, [r0, #414]
+; CHECK-NEXT:    movw r1, #39834
+; CHECK-NEXT:    movt r1, #40348
+; CHECK-NEXT:    str.w r1, [r0, #410]
+; CHECK-NEXT:    movw r1, #38806
+; CHECK-NEXT:    movt r1, #39320
+; CHECK-NEXT:    str.w r1, [r0, #406]
+; CHECK-NEXT:    movw r1, #37778
+; CHECK-NEXT:    movt r1, #38292
+; CHECK-NEXT:    str.w r1, [r0, #402]
+; CHECK-NEXT:    movw r1, #36750
+; CHECK-NEXT:    movt r1, #37264
+; CHECK-NEXT:    str.w r1, [r0, #398]
+; CHECK-NEXT:    movw r1, #35722
+; CHECK-NEXT:    movt r1, #36236
+; CHECK-NEXT:    str.w r1, [r0, #394]
 ; CHECK-NEXT:    bx lr
   %v190 = getelementptr inbounds i8, i8* %v50, i32 394
   store i8 -118, i8* %v190, align 1
diff --git a/test/CodeGen/ARM/atomic-ops-m33.ll b/test/CodeGen/ARM/atomic-ops-m33.ll
new file mode 100644
index 0000000000000000000000000000000000000000..474ad8960cf5b1860690ce851ecab5462d711b64
--- /dev/null
+++ b/test/CodeGen/ARM/atomic-ops-m33.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=thumbv7-none-eabi -mcpu=cortex-m33 -verify-machineinstrs -o -  %s | FileCheck %s
+
+define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i8:
+  %old = atomicrmw add i8* @var8, i8 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+  ret i8 %old
+}
+
+define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i16:
+  %old = atomicrmw add i16* @var16, i16 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+  ret i16 %old
+}
+
+define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i32:
+  %old = atomicrmw add i32* @var32, i32 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+  ret i32 %old
+}
+
+define void @test_atomic_load_add_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i64:
+; CHECK: bl __sync_fetch_and_add_8
+   %old = atomicrmw add i64* @var64, i64 %offset monotonic
+  store i64 %old, i64* @var64
+  ret void
+}
+
+define i8 @test_load_acquire_i8(i8* %ptr) {
+; CHECK-LABEL: test_load_acquire_i8:
+; CHECK: ldab r0, [r0]
+  %val = load atomic i8, i8* %ptr seq_cst, align 1
+  ret i8 %val
+}
+
+define i16 @test_load_acquire_i16(i16* %ptr) {
+; CHECK-LABEL: test_load_acquire_i16:
+; CHECK: ldah r0, [r0]
+  %val = load atomic i16, i16* %ptr acquire, align 2
+  ret i16 %val
+}
+
+define i32 @test_load_acquire_i32(i32* %ptr) {
+; CHECK-LABEL: test_load_acquire_i32:
+; CHECK: lda r0, [r0]
+  %val = load atomic i32, i32* %ptr acquire, align 4
+  ret i32 %val
+}
+
+define i64 @test_load_acquire_i64(i64* %ptr) {
+; CHECK-LABEL: test_load_acquire_i64:
+; CHECK: bl __atomic_load
+  %val = load atomic i64, i64* %ptr acquire, align 4
+  ret i64 %val
+}
+
+define void @test_store_release_i8(i8 %val, i8* %ptr) {
+; CHECK-LABEL: test_store_release_i8:
+; CHECK: stlb r0, [r1]
+  store atomic i8 %val, i8* %ptr seq_cst, align 1
+  ret void
+}
+
+define void @test_store_release_i16(i16 %val, i16* %ptr) {
+; CHECK-LABEL: test_store_release_i16:
+; CHECK: stlh r0, [r1]
+  store atomic i16 %val, i16* %ptr release, align 2
+  ret void
+}
+
+define void @test_store_release_i32(i32 %val, i32* %ptr) {
+; CHECK-LABEL: test_store_release_i32:
+; CHECK: stl r0, [r1]
+  store atomic i32 %val, i32* %ptr seq_cst, align 4
+  ret void
+}
+
+define void @test_store_release_i64(i64 %val, i64* %ptr) {
+; CHECK-LABEL: test_store_release_i64:
+; CHECK: bl __atomic_store
+  store atomic i64 %val, i64* %ptr seq_cst, align 4
+  ret void
+}
+
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
diff --git a/test/CodeGen/ARM/cmp.ll b/test/CodeGen/ARM/cmp.ll
index 5c1630912579b08ed09353c5d22936a9dc1cf8e0..2e6b20cce7323d6410fb56dd98c5b5824fa0f12d 100644
--- a/test/CodeGen/ARM/cmp.ll
+++ b/test/CodeGen/ARM/cmp.ll
@@ -39,15 +39,11 @@ define i1 @f6(i32 %a, i32 %b) {
 
 define i1 @f7(i32 %a, i32 %b) {
 ; CHECK-LABEL: f7:
-; CHECK: sub     r2, r0, r1, lsr #6
-; CHECK: cmp     r0, r1, lsr #6
-; CHECK: movwne  r2, #1
-; CHECK: mov     r0, r2
-; CHECK-T2: sub.w   r2, r0, r1, lsr #6
-; CHECK-T2: cmp.w   r0, r1, lsr #6
+; CHECK: subs    r0, r0, r1, lsr #6
+; CHECK: movwne  r0, #1
+; CHECK-T2: subs.w   r0, r0, r1, lsr #6
 ; CHECK-T2: it      ne
-; CHECK-T2: movne   r2, #1
-; CHECK-T2: mov     r0, r2
+; CHECK-T2: movne   r0, #1
     %tmp = lshr i32 %b, 6
     %tmp1 = icmp ne i32 %a, %tmp
     ret i1 %tmp1
@@ -68,15 +64,11 @@ define i1 @f8(i32 %a, i32 %b) {
 
 define i1 @f9(i32 %a) {
 ; CHECK-LABEL: f9:
-; CHECK: sub     r1, r0, r0, ror #8
-; CHECK: cmp     r0, r0, ror #8
-; CHECK: movwne  r1, #1
-; CHECK: mov     r0, r1
-; CHECK-T2: sub.w   r1, r0, r0, ror #8
-; CHECK-T2: cmp.w   r0, r0, ror #8
+; CHECK: subs    r0, r0, r0, ror #8
+; CHECK: movwne  r0, #1
+; CHECK-T2: subs.w   r0, r0, r0, ror #8
 ; CHECK-T2: it      ne
-; CHECK-T2: movne   r1, #1
-; CHECK-T2: mov     r0, r1
+; CHECK-T2: movne   r0, #1
     %l8 = shl i32 %a, 24
     %r8 = lshr i32 %a, 8
     %tmp = or i32 %l8, %r8
diff --git a/test/CodeGen/ARM/codemodel.ll b/test/CodeGen/ARM/codemodel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ec9982faba12bfd536f6ba8992c48bfa19ecccd1
--- /dev/null
+++ b/test/CodeGen/ARM/codemodel.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -verify-machineinstrs -o - -mtriple=arm-none-eabi -code-model=tiny < %s 2>&1 | FileCheck %s --check-prefix=TINY
+; RUN: not llc -verify-machineinstrs -o - -mtriple=arm-none-eabi -code-model=kernel < %s 2>&1 | FileCheck %s --check-prefix=KERNEL
+
+; TINY:    Target does not support the tiny CodeModel
+; KERNEL:    Target does not support the kernel CodeModel
+
+define void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index fa4d79c604af2727ab3e52e44c688340c653b8f7..67628dd2a3a2dc106121befa490abaabb37eed92 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -17,12 +17,12 @@ target triple = "thumbv7-apple-macosx10.6.7"
 
 declare <4 x float> @test0001(float) nounwind readnone ssp
 
-define i32 @main(i32 %argc, i8** nocapture %argv, <4 x float> %x) nounwind ssp !dbg !10 {
+define i32 @main(i32 %argc, i8** nocapture %argv, <4 x float> %x, <4 x float> %y) nounwind ssp !dbg !10 {
 entry:
   br label %for.body9
 
 for.body9:                                        ; preds = %for.body9, %entry
-  %add19 = fadd <4 x float> %x, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
+  %add19 = fadd <4 x float> %x, %y, !dbg !39
   br i1 undef, label %for.end54, label %for.body9, !dbg !44
 
 for.end54:                                        ; preds = %for.body9
diff --git a/test/CodeGen/ARM/float-helpers.s b/test/CodeGen/ARM/float-helpers.s
index 0a9f2490d20d01e016af2f860e4155f5d2004e89..42ab56084d45cdeb966add4f00931e693c5d88dc 100644
--- a/test/CodeGen/ARM/float-helpers.s
+++ b/test/CodeGen/ARM/float-helpers.s
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -asm-verbose=false -mattr=-vfp2 -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-SOFT
-; RUN: llc -asm-verbose=false -mattr=-vfp2 -mtriple=arm-eabi -meabi=gnu < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-SOFT
-; RUN: llc -asm-verbose=false -mattr=+vfp3 -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-SOFTFP
-; RUN: llc -asm-verbose=false -mattr=+vfp3 -meabi=gnu -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-SOFTFP
-; RUN: llc -asm-verbose=false -mattr=+vfp3 -float-abi=hard -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-DP
-; RUN: llc -asm-verbose=false -mattr=+vfp3 -float-abi=hard -meabi=gnu -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-DP
-; RUN: llc -asm-verbose=false -mattr=+vfp3,+fp-only-sp -float-abi=hard -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-SPONLY
-; RUN: llc -asm-verbose=false -mattr=+vfp3,+fp-only-sp -float-abi=hard -mtriple=arm-eabi -meabi=gnu < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-SPONLY
+; RUN: llc -asm-verbose=false -mattr=-vfp2 -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-SOFT
+; RUN: llc -asm-verbose=false -mattr=-vfp2 -mtriple=arm-eabi -meabi=gnu < %s | FileCheck %s -check-prefix=CHECK-SOFT
+; RUN: llc -asm-verbose=false -mattr=+vfp3 -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-SOFTFP
+; RUN: llc -asm-verbose=false -mattr=+vfp3 -meabi=gnu -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-SOFTFP
+; RUN: llc -asm-verbose=false -mattr=+vfp3 -float-abi=hard -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-DP
+; RUN: llc -asm-verbose=false -mattr=+vfp3 -float-abi=hard -meabi=gnu -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-DP
+; RUN: llc -asm-verbose=false -mattr=+vfp3,+fp-only-sp -float-abi=hard -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-SPONLY
+; RUN: llc -asm-verbose=false -mattr=+vfp3,+fp-only-sp -float-abi=hard -mtriple=arm-eabi -meabi=gnu < %s | FileCheck %s -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-SPONLY
 
 ; The Runtime ABI for the ARM Architecture IHI0043 section 4.1.2 The
 ; floating-point helper functions to always use the base AAPCS (soft-float)
diff --git a/test/CodeGen/ARM/fp16-vld.ll b/test/CodeGen/ARM/fp16-vld.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5052b99e6c9dac2a1474a69c947e52506ee181a2
--- /dev/null
+++ b/test/CodeGen/ARM/fp16-vld.ll
@@ -0,0 +1,48 @@
+; RUN: llc -asm-verbose=false < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8.2a-arm-unknown-eabihf"
+
+define dso_local void @vec8(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 {
+; CHECK:      .LBB0_1:
+; CHECK-NEXT: vld1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: subs r1, r1, #8
+; CHECK-NEXT: bne .LBB0_1
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds half, half* %V, i32 %index
+  %1 = bitcast half* %0 to <8 x half>*
+  %wide.load = load volatile <8 x half>, <8 x half>* %1, align 2
+  %index.next = add i32 %index, 8
+  %cmp = icmp eq i32 %index.next, %N
+  br i1 %cmp, label %byeblock, label %vector.body
+
+byeblock:
+  ret void
+}
+
+define dso_local void @vec4(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 {
+; CHECK:      .LBB1_1:
+; CHECK-NEXT: vld1.16 {d16}, [r0]!
+; CHECK-NEXT: subs r1, r1, #4
+; CHECK-NEXT:	bne	.LBB1_1
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds half, half* %V, i32 %index
+  %1 = bitcast half* %0 to <4 x half>*
+  %wide.load = load volatile <4 x half>, <4 x half>* %1, align 2
+  %index.next = add i32 %index, 4
+  %cmp = icmp eq i32 %index.next, %N
+  br i1 %cmp, label %byeblock, label %vector.body
+
+byeblock:
+  ret void
+}
+
+attributes #0 = { norecurse nounwind readonly "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "target-cpu"="generic" "target-features"="+armv8.2-a,+fullfp16,+strict-align,-thumb-mode" "unsafe-fp-math"="true" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/ldrcppic.ll b/test/CodeGen/ARM/ldrcppic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c7727290d842b74c90ab47b842a8d8e6aaed47b1
--- /dev/null
+++ b/test/CodeGen/ARM/ldrcppic.ll
@@ -0,0 +1,56 @@
+; The failure is caused by ARM LDRcp/PICADD pairs. In PIC mode, the constant pool
+; need a label to do address computation. This label is emitted when backend emits
+; PICADD. When the target becomes dead, PICADD will be deleted. without this patch
+; LDRcp is dead but not being deleted. This will cause a dead contant pool entry
+; using a non existing label. This will cause an error in MC object emitting pass.
+
+; RUN: llc -relocation-model=pic -mcpu=cortex-a53 %s -filetype=obj -o - | llvm-nm - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-unknown-linux-android"
+
+@_ZN15UsecaseSelector25AllowedImplDefinedFormatsE = external dso_local unnamed_addr constant <{ i32, i32, i32, i32, [12 x i32] }>, align 4
+
+; Function Attrs: noinline nounwind optnone sspstrong uwtable
+define dso_local fastcc void @_ZN15UsecaseSelector26IsAllowedImplDefinedFormatE15ChiBufferFormatj() unnamed_addr #1 align 2 {
+  br label %1
+
+; <label>:1:                                      ; preds = %13, %0
+  %2 = icmp ult i32 undef, 4
+  br i1 %2, label %3, label %14
+
+; <label>:3:                                      ; preds = %1
+  br i1 undef, label %4, label %13
+
+; <label>:4:                                      ; preds = %3
+  %5 = getelementptr inbounds [16 x i32], [16 x i32]* bitcast (<{ i32, i32, i32, i32, [12 x i32] }>* @_ZN15UsecaseSelector25AllowedImplDefinedFormatsE to [16 x i32]*), i32 0, i32 undef
+  %6 = load i32, i32* %5, align 4
+  %7 = icmp eq i32 10, %6
+  br i1 %7, label %9, label %8
+
+; <label>:8:                                      ; preds = %4
+  br i1 undef, label %9, label %12
+
+; <label>:9:                                      ; preds = %8, %4
+  br i1 undef, label %10, label %13
+
+; <label>:10:                                     ; preds = %9
+  br i1 undef, label %11, label %13
+
+; <label>:11:                                     ; preds = %10
+  br label %14
+
+; <label>:12:                                     ; preds = %8
+  br label %14
+
+; <label>:13:                                     ; preds = %10, %9, %3
+  br label %1
+
+; <label>:14:                                     ; preds = %12, %11, %1
+  ret void
+}
+
+attributes #1 = { noinline optnone }
+
+; CHECK: _ZN15UsecaseSelector26IsAllowedImplDefinedFormatE15ChiBufferFormatj
+
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index 7cb3b407bf1646080ef4696df31f31bc1b8dbb70..4cdafa72f62e8de0c6dc16541ddeefbd0e79455e 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -3,7 +3,7 @@
 ; rdar://6949835
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK -check-prefix=NORMAL
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK -check-prefix=NORMAL
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=CHECK -check-prefix=NORMAL
 
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=CHECK -check-prefix=CONSERVATIVE
 
diff --git a/test/CodeGen/ARM/ldstrex-m.ll b/test/CodeGen/ARM/ldstrex-m.ll
index 5b717f7f1ae95d694e309ac034fa2f3cdd77ff3a..713fb9e3df011ab1b64bcb746398dc82bb1ceafb 100644
--- a/test/CodeGen/ARM/ldstrex-m.ll
+++ b/test/CodeGen/ARM/ldstrex-m.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8m.main-none-eabi | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8m.base-none-eabi | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V7
+; RUN: llc < %s -mtriple=thumbv8m.main-none-eabi | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8
+; RUN: llc < %s -mtriple=thumbv8m.base-none-eabi | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8
 
 ; CHECK-LABEL: f0:
 ; CHECK-NOT: ldrexd
@@ -28,7 +28,8 @@ entry:
 }
 
 ; CHECK-LABEL: f3:
-; CHECK: ldr
+; CHECK-V7: ldr
+; CHECK-V8: lda
 define i32 @f3(i32* %p) nounwind readonly {
 entry:
   %0 = load atomic i32, i32* %p seq_cst, align 4
@@ -36,7 +37,8 @@ entry:
 }
 
 ; CHECK-LABEL: f4:
-; CHECK: ldrb
+; CHECK-V7: ldrb
+; CHECK-V8: ldab
 define i8 @f4(i8* %p) nounwind readonly {
 entry:
   %0 = load atomic i8, i8* %p seq_cst, align 4
@@ -44,7 +46,8 @@ entry:
 }
 
 ; CHECK-LABEL: f5:
-; CHECK: str
+; CHECK-V7: str
+; CHECK-V8: stl
 define void @f5(i32* %p) nounwind readonly {
 entry:
   store atomic i32 0, i32* %p seq_cst, align 4
@@ -52,8 +55,10 @@ entry:
 }
 
 ; CHECK-LABEL: f6:
-; CHECK: ldrex
-; CHECK: strex
+; CHECK-V7: ldrex
+; CHECK-V7: strex
+; CHECK-V8: ldaex
+; CHECK-V8: stlex
 define i32 @f6(i32* %p) nounwind readonly {
 entry:
   %0 = atomicrmw add i32* %p, i32 1 seq_cst
diff --git a/test/CodeGen/ARM/load_store_opt_clobber_cpsr.mir b/test/CodeGen/ARM/load_store_opt_clobber_cpsr.mir
new file mode 100644
index 0000000000000000000000000000000000000000..7a4db88479ba79da0637f8715e6b8b1aaf216c2a
--- /dev/null
+++ b/test/CodeGen/ARM/load_store_opt_clobber_cpsr.mir
@@ -0,0 +1,33 @@
+# RUN: llc -mtriple=thumbv6m--eabi -verify-machineinstrs -run-pass=arm-ldst-opt %s -o - | FileCheck %s
+
+# Make sure bb.0 isn't transformed: it would incorrectly clobber CPSR.
+#
+# Make sure bb.1 is transformed, so the test doesn't accidentally break.
+
+# CHECK-LABEL: bb.0:
+# CHECK: renamable $r0 = tLDRi renamable $r4, 0, 14, $noreg :: (load 4)
+# CHECK: renamable $r1 = tLDRi renamable $r4, 1, 14, $noreg :: (load 4)
+
+# CHECK-LABEL: bb.1:
+# CHECK: $r4 = tLDMIA_UPD $r4, 14, $noreg, def $r0, def $r1
+# CHECK: $r4, dead $cpsr = tSUBi8 $r4, 8, 14, $noreg
+
+name: foo
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r2, $r4
+    renamable $r0 = tLDRi renamable $r2, 4, 14, $noreg :: (load 4)
+    dead renamable $r0, $cpsr = tADDi3 killed renamable $r0, 1, 14, $noreg
+    renamable $r0 = tLDRi renamable $r4, 0, 14, $noreg :: (load 4)
+    renamable $r1 = tLDRi renamable $r4, 1, 14, $noreg :: (load 4)
+    tBcc %bb.1, 0, killed $cpsr
+  bb.1:
+    liveins: $r2, $r4
+    renamable $r0 = tLDRi renamable $r2, 4, 14, $noreg :: (load 4)
+    dead renamable $r0, $cpsr = tADDi3 killed renamable $r0, 1, 14, $noreg
+    renamable $r0 = tLDRi renamable $r4, 0, 14, $noreg :: (load 4)
+    renamable $r1 = tLDRi renamable $r4, 1, 14, $noreg :: (load 4)
+  bb.2:
+    liveins: $r4
+    TRAP
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index c410403a9f11a4b359e503a2c8081eba44a2d98c..8dab9b67d5593adeb291d4b08d581b2aacd0a254 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -86,10 +86,9 @@ entry:
 define void @t5(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t5:
-; CHECK: movs [[REG5:r[0-9]+]], #0
-; CHECK: strb [[REG5]], [r0, #6]
-; CHECK: movw [[REG6:r[0-9]+]], #21587
-; CHECK: strh [[REG6]], [r0, #4]
+; CHECK: movw [[REG5:r[0-9]+]], #21337
+; CHECK: movt [[REG5]], #84
+; CHECK: str.w [[REG5]], [r0, #3]
 ; CHECK: movw [[REG7:r[0-9]+]], #18500
 ; CHECK: movt [[REG7:r[0-9]+]], #22866
 ; CHECK: str [[REG7]]
diff --git a/test/CodeGen/ARM/memcpy-ldm-stm.ll b/test/CodeGen/ARM/memcpy-ldm-stm.ll
index 314f559e357ac7c2b258d8b5b719e8a62f4281d3..4009de5bd46a4b696f2b817049bdd088776178d8 100644
--- a/test/CodeGen/ARM/memcpy-ldm-stm.ll
+++ b/test/CodeGen/ARM/memcpy-ldm-stm.ll
@@ -34,14 +34,16 @@ entry:
 ; CHECK-LABEL: t2:
 ; CHECKV6: ldr [[LB:r[0-7]]],
 ; CHECKV6-NEXT: ldr [[SB:r[0-7]]],
+; CHECKV6-NEXT: ldm{{(\.w)?}} [[LB]]!,
+; CHECKV6-NEXT: stm{{(\.w)?}} [[SB]]!,
+; CHECKV6-NEXT: ldrh{{(\.w)?}} {{.*}}, {{\[}}[[LB]]]
+; CHECKV6-NEXT: ldrb{{(\.w)?}} {{.*}}, {{\[}}[[LB]], #2]
+; CHECKV6-NEXT: strb{{(\.w)?}} {{.*}}, {{\[}}[[SB]], #2]
+; CHECKV6-NEXT: strh{{(\.w)?}} {{.*}}, {{\[}}[[SB]]]
 ; CHECKV7: movt [[LB:[rl0-9]+]], :upper16:d
 ; CHECKV7-NEXT: movt [[SB:[rl0-9]+]], :upper16:s
-; CHECK-NEXT: ldm{{(\.w)?}} [[LB]]!,
-; CHECK-NEXT: stm{{(\.w)?}} [[SB]]!,
-; CHECK-NEXT: ldrh{{(\.w)?}} {{.*}}, {{\[}}[[LB]]]
-; CHECK-NEXT: ldrb{{(\.w)?}} {{.*}}, {{\[}}[[LB]], #2]
-; CHECK-NEXT: strb{{(\.w)?}} {{.*}}, {{\[}}[[SB]], #2]
-; CHECK-NEXT: strh{{(\.w)?}} {{.*}}, {{\[}}[[SB]]]
+; CHECKV7: ldr{{(\.w)?}} {{.*}}, {{\[}}[[LB]], #11]
+; CHECKV7-NEXT: str{{(\.w)?}} {{.*}}, {{\[}}[[SB]], #11]
     tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast ([64 x i32]* @s to i8*), i8* align 4 bitcast ([64 x i32]* @d to i8*), i32 15, i1 false)
     ret void
 }
diff --git a/test/CodeGen/ARM/misched-fusion-aes.ll b/test/CodeGen/ARM/misched-fusion-aes.ll
index 483f26cc8e007cf649ad8d56c5cf8787147943f9..b6ca49646f83c07d470eb7cd1600ca63e4e100e9 100644
--- a/test/CodeGen/ARM/misched-fusion-aes.ll
+++ b/test/CodeGen/ARM/misched-fusion-aes.ll
@@ -72,20 +72,27 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
 ; CHECK-LABEL: aesea:
 ; CHECK: aese.8 [[QA:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QA]]
+
 ; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
+
 ; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]]
+
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]]
+
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]]
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
+
 ; CHECK: aese.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QF]]
+
 ; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]]
+
 ; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]]
@@ -160,14 +167,14 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QA]]
 ; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]]
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]]
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QF]]
 ; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
diff --git a/test/CodeGen/ARM/nonreserved-callframe-with-basereg.mir b/test/CodeGen/ARM/nonreserved-callframe-with-basereg.mir
new file mode 100644
index 0000000000000000000000000000000000000000..a262594473ff6e96c8af124f306bbb013df8d139
--- /dev/null
+++ b/test/CodeGen/ARM/nonreserved-callframe-with-basereg.mir
@@ -0,0 +1,54 @@
+# RUN: llc -run-pass=prologepilog %s -o -  | FileCheck %s
+
+# Make sure we use the correct offset for stack accesses using the base pointer
+# within call frame blocks. Key points of test:
+#    + A large SP in ADJCALLSTACKDOWN forces each call to get its own adjustment.
+#    + An over-aligned stack variable means that we must use r6 rather than fp
+#      to access this variables.
+#
+# Under these circumstances, the ADJCALLSTACKDOWN must not apply to r6 offsets.
+
+--- |
+  ; ModuleID = 'simple.ll'
+  source_filename = "simple.ll"
+  target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128"
+  target triple = "thumbv7k-apple-ios"
+
+  declare void @bar([4 x i32], i32)
+
+  define void @foo(i32 %n) {
+    ret void
+  }
+
+...
+---
+name:            foo
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+frameInfo:
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 2276
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: 0, alignment: 32,  size: 4 }
+constants:       []
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $r0
+
+    ; CHECK: t2STRi12 killed $r0, $r6, [[OFFSET:[0-9]+]]
+    t2STRi12 killed $r0, %stack.0, 0, 14, $noreg :: (store 4 into %stack.0)
+
+    ADJCALLSTACKDOWN 2276, 0, 14, $noreg, implicit-def dead $sp, implicit $sp
+
+    ; CHECK: renamable $r0 = t2LDRi12 $r6, [[OFFSET]]
+    renamable $r0 = t2LDRi12 %stack.0, 0, 14, $noreg, :: (load 4 from %stack.0)
+    renamable $r1 = IMPLICIT_DEF
+    renamable $r2 = IMPLICIT_DEF
+    renamable $r3 = IMPLICIT_DEF
+    tBL 14, $noreg, @bar, csr_ios, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit killed $r1, implicit killed $r2, implicit killed $r3, implicit-def $sp
+
+    ADJCALLSTACKUP 2276, 0, 14, $noreg, implicit-def dead $sp, implicit $sp
+    tBX_RET 14, $noreg
+
+...
diff --git a/test/CodeGen/ARM/pr36577.ll b/test/CodeGen/ARM/pr36577.ll
index 2309ce21292a98974d9aa4254fe010e192ead9b0..11805cb5d84eaa458de15ca79c5b3de8fd6e2749 100644
--- a/test/CodeGen/ARM/pr36577.ll
+++ b/test/CodeGen/ARM/pr36577.ll
@@ -9,12 +9,11 @@
 
 ; CHECK-LABEL: pr36577
 ; CHECK: ldrh r0, [r0]
-; CHECK: bic r0, r1, r0, lsr #5
-; CHECK: mvn r1, #7
-; CHECK: orr r0, r0, r1
+; CHECK: mvn	r0, r0, lsr #7
+; CHECK: orr r0, r1, r0, lsl #2
 ; CHECK-T2: ldrh r0, [r0]
-; CHECK-T2: bic.w r0, r1, r0, lsr #5
-; CHECK-T2: orn r0, r0, #7
+; CHECK-T2: mvn.w	r0, r0, lsr #7
+; CHECK-T2: orr.w	r0, r1, r0, lsl #2
 define dso_local arm_aapcscc i32** @pr36577() {
 entry:
   %0 = load i16, i16* @a, align 2
diff --git a/test/CodeGen/ARM/pr39571.ll b/test/CodeGen/ARM/pr39571.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fbc910a557af6ea0260aaf8e4f6f19daf8d19211
--- /dev/null
+++ b/test/CodeGen/ARM/pr39571.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple armv4t-unknown-linux-gnueabi -mattr=+strict-align
+
+; Avoid crash from forwarding indexed-loads back to store.
+%struct.anon = type { %struct.ma*, %struct.mb }
+%struct.ma = type { i8 }
+%struct.mb = type { i8, i8 }
+%struct.anon.0 = type { %struct.anon.1 }
+%struct.anon.1 = type { %struct.ds }
+%struct.ds = type <{ i8, %union.ie }>
+%union.ie = type { %struct.ib }
+%struct.ib = type { i8, i8, i16 }
+
+@a = common dso_local local_unnamed_addr global %struct.anon* null, align 4
+@b = common dso_local local_unnamed_addr global %struct.anon.0 zeroinitializer, align 1
+
+; Function Attrs: norecurse nounwind
+define dso_local void @func() local_unnamed_addr {
+entry:
+  %0 = load %struct.anon*, %struct.anon** @a, align 4
+  %ad = getelementptr inbounds %struct.anon, %struct.anon* %0, i32 0, i32 0
+  %1 = load %struct.ma*, %struct.ma** %ad, align 4
+  %c.sroa.0.0..sroa_idx = getelementptr inbounds %struct.ma, %struct.ma* %1, i32 0, i32 0
+  %c.sroa.0.0.copyload = load i8, i8* %c.sroa.0.0..sroa_idx, align 1
+  %cb = getelementptr inbounds %struct.anon, %struct.anon* %0, i32 0, i32 1
+  %band = getelementptr inbounds %struct.anon, %struct.anon* %0, i32 0, i32 1, i32 1
+  store i8 %c.sroa.0.0.copyload, i8* %band, align 4
+  store i8 6, i8* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @b, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0), align 1
+  store i8 2, i8* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @b, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1), align 1
+  %2 = bitcast %struct.mb* %cb to i32*
+  %3 = load i32, i32* bitcast (i8* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @b, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) to i32*), align 1
+  store i32 %3, i32* %2, align 1
+  ret void
+}
diff --git a/test/CodeGen/ARM/sdiv-pow2-arm-size.ll b/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a9eda31e729e2c632ace978d0356e7b93dc570da
--- /dev/null
+++ b/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mtriple=armv7a -mattr=+hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,DIV
+; RUN: llc -mtriple=armv7a -mattr=-hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,NODIV
+
+; Check SREM
+define dso_local i32 @test_rem(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: test_rem 
+; CHECK:       asr   r1, r0, #31
+; CHECK-NEXT:  add   r1, r0, r1, lsr #30
+; CHECK-NEXT:  bic   r1, r1, #3
+; CHECK-NEXT:  sub   r0, r0, r1
+
+entry:
+  %div = srem i32 %F, 4
+  ret i32 %div
+}
+
+; Try an i16 sdiv, with a small immediate.
+define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f0
+
+; DIV:         mov     r1, #2
+; DIV-NEXT:    sdiv    r0, r0, r1
+; DIV-NEXT:    sxth    r0, r0
+; DIV-NEXT:    bx      lr
+
+; NODIV:       uxth r1, r0
+; NODIV-NEXT:  add  r0, r0, r1, lsr #15
+; NODIV-NEXT:  sxth r0, r0
+; NODIV-NEXT:  asr  r0, r0, #1
+; NODIV-NEXT:  bx   lr
+
+entry:
+  %0 = sdiv i16 %F, 2
+  ret i16 %0
+}
+
+; Try an i32 sdiv, with a small immediate.
+define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f1
+
+; DIV:       mov     r1, #4
+; DIV-NEXT:  sdiv    r0, r0, r1
+; DIV-NEXT:  bx      lr
+
+; NODIV:       asr  r1, r0, #31
+; NODIV-NEXT:  add  r0, r0, r1, lsr #30
+; NODIV-NEXT:  asr  r0, r0, #2
+; NODIV-NEXT:  bx	  lr
+
+entry:
+  %div = sdiv i32 %F, 4
+  ret i32 %div
+}
+
+; Try a large power of 2 immediate, which should also be materialised with 1
+; move immediate instruction.
+define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL:  f2
+; DIV:        mov r1, #131072
+; DIV-NEXT:   sdiv  r0, r0, r1
+; DIV-NEXT:   bx  lr
+entry:
+  %div = sdiv i32 %F, 131072
+  ret i32 %div
+}
+
+; MinSize not set, so should expand to the faster but longer sequence.
+define dso_local i32 @f3(i32 %F) {
+; CHECK-LABEL: f3
+; CHECK:       asr r1, r0, #31
+; CHECK-NEXT:  add r0, r0, r1, lsr #30
+; CHECK-NEXT:  asr r0, r0, #2
+; CHECK-NEXT:  bx  lr
+entry:
+  %div = sdiv i32 %F, 4
+  ret i32 %div
+}
+
+attributes #0 = { minsize norecurse nounwind optsize readnone }
diff --git a/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll b/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4b0419577cdf01b92f194dabd02f95ecd1c88c4f
--- /dev/null
+++ b/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
@@ -0,0 +1,105 @@
+; RUN: llc -mtriple=thumbv8 %s -o -       | FileCheck %s --check-prefixes=CHECK,T2
+; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefixes=CHECK,T2
+; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefixes=CHECK,T1
+; RUN: llc -mtriple=thumbv7em %s -o -     | FileCheck %s --check-prefixes=CHECK,T2
+; RUN: llc -mtriple=thumbv6m %s -o -      | FileCheck %s --check-prefixes=V6M
+
+; Armv6m targets don't have a sdiv instruction, so sdiv should not appear at
+; all in the output:
+
+; V6M: .file {{.*}}
+; V6M-NOT:  sdiv
+; V6M-NOT:  idiv
+
+; Test sdiv i16
+define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f0
+; CHECK:       movs    r1, #2
+; CHECK-NEXT:  sdiv    r0, r0, r1
+; CHECK-NEXT:  sxth    r0, r0
+; CHECK-NEXT:  bx      lr
+
+entry:
+  %0 = sdiv i16 %F, 2
+  ret i16 %0
+}
+
+; Same as above, but now with i32
+define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f1
+; CHECK:       movs    r1, #4
+; CHECK-NEXT:  sdiv    r0, r0, r1
+; CHECK-NEXT:  bx      lr
+
+entry:
+  %div = sdiv i32 %F, 4
+  ret i32 %div
+}
+
+; The immediate is not a power of 2, so we expect a sdiv.
+define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f2
+; CHECK:       movs    r1, #5
+; CHECK-NEXT:  sdiv    r0, r0, r1
+; CHECK-NEXT:  bx      lr
+
+entry:
+  %div = sdiv i32 %F, 5
+  ret i32 %div
+}
+
+; Try a larger power of 2 immediate: immediates larger than
+; 128 don't give any code size savings.
+define dso_local i32 @f3(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL:  f3
+; CHECK-NOT:    sdiv
+entry:
+  %div = sdiv i32 %F, 256
+  ret i32 %div
+}
+
+attributes #0 = { minsize norecurse nounwind optsize readnone }
+
+
+; These functions don't have the minsize attribute set, so should not lower
+; the sdiv to sdiv, but to the faster instruction sequence.
+
+define dso_local signext i16 @f4(i16 signext %F) {
+; T2-LABEL:  f4
+; T2:        uxth    r1, r0
+; T2-NEXT:   add.w   r0, r0, r1, lsr #15
+; T2-NEXT:   sxth    r0, r0
+; T2-NEXT:   asrs    r0, r0, #1
+; T2-NEXT:   bx      lr
+
+; T1-LABEL: f4
+; T1: 	    uxth  r1, r0
+; T1-NEXT: 	lsrs  r1, r1, #15
+; T1-NEXT: 	adds  r0, r0, r1
+; T1-NEXT: 	sxth  r0, r0
+; T1-NEXT: 	asrs  r0, r0, #1
+; T1-NEXT: 	bx	lr
+
+entry:
+  %0 = sdiv i16 %F, 2
+  ret i16 %0
+}
+
+define dso_local i32 @f5(i32 %F) {
+; T2-LABEL: f5
+; T2:       asrs  r1, r0, #31
+; T2-NEXT:  add.w   r0, r0, r1, lsr #30
+; T2-NEXT:  asrs    r0, r0, #2
+; T2-NEXT:  bx      lr
+
+; T1-LABEL: f5
+; T1: 	    asrs r1, r0, #31
+; T1-NEXT:	lsrs  r1, r1, #30
+; T1-NEXT:	adds  r0, r0, r1
+; T1-NEXT:	asrs  r0, r0, #2
+; T1-NEXT:	bx  lr
+
+entry:
+  %div = sdiv i32 %F, 4
+  ret i32 %div
+}
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 639b88183cc421db2ba400e570c39c9a5e4e8a63..54bc1b9a9925b8c2a4c6cd776a4c2b37778252c2 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -142,3 +142,14 @@ define float @f12(i32 %a, i32 %b) nounwind uwtable readnone ssp {
   ret float %2
 }
 
+; CHECK-LABEL: test_overflow_recombine:
+define i1 @test_overflow_recombine(i32 %in) {
+; CHECK: smull [[LO:r[0-9]+]], [[HI:r[0-9]+]]
+; CHECK: subs [[ZERO:r[0-9]+]], [[HI]], [[LO]], asr #31
+; CHECK: movne [[ZERO]], #1
+  %prod = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 0, i32 %in)
+  %overflow = extractvalue { i32, i1 } %prod, 1
+  ret i1 %overflow
+}
+
+declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32)
diff --git a/test/CodeGen/ARM/setcc-logic.ll b/test/CodeGen/ARM/setcc-logic.ll
index 2c2792ed9868ff912001d410f88dcbab99186f1b..cf482f39f2b5beece7511c44cff8fd8d782a3e1e 100644
--- a/test/CodeGen/ARM/setcc-logic.ll
+++ b/test/CodeGen/ARM/setcc-logic.ll
@@ -61,9 +61,8 @@ define <4 x i1> @and_eq_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32>
 ; CHECK-NEXT:    vceq.i32 q8, q9, q8
 ; CHECK-NEXT:    vld1.64 {d22, d23}, [r0]
 ; CHECK-NEXT:    vceq.i32 q9, q11, q10
+; CHECK-NEXT:    vand q8, q8, q9
 ; CHECK-NEXT:    vmovn.i32 d16, q8
-; CHECK-NEXT:    vmovn.i32 d17, q9
-; CHECK-NEXT:    vand d16, d16, d17
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    pop {r11, pc}
   %cmp1 = icmp eq <4 x i32> %a, %b
diff --git a/test/CodeGen/ARM/smlad0.ll b/test/CodeGen/ARM/smlad0.ll
index b9278b4c22b1e16635b60c4bc4391ea186161ba5..477f5659c162135733e8c40a91a487aa4cb09e51 100644
--- a/test/CodeGen/ARM/smlad0.ll
+++ b/test/CodeGen/ARM/smlad0.ll
@@ -130,3 +130,83 @@ for.body:
   %cmp = icmp slt i32 %add29, %arg
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
+
+define i32 @one_zext(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+; CHECK-LABEL: @one_zext
+; CHECK-NOT: call i32 @llvm.arm.smlad
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = zext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = zext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+define i32 @two_zext(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+; CHECK-LABEL: @two_zext
+; CHECK-NOT: call i32 @llvm.arm.smlad
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = zext i16 %2 to i32
+  %conv4 = zext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = zext i16 %3 to i32
+  %conv8 = zext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
diff --git a/test/CodeGen/ARM/smlald0.ll b/test/CodeGen/ARM/smlald0.ll
index 6d98c227cfe1a26e4142ea664fde7682a92c6961..97177366d5664e51a4a27949ff27e57b88c73eeb 100644
--- a/test/CodeGen/ARM/smlald0.ll
+++ b/test/CodeGen/ARM/smlald0.ll
@@ -130,3 +130,44 @@ for.body:
   %cmp = icmp slt i32 %add29, %arg
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
+
+define i64 @reduction_zext(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+; CHECK-LABEL: @reduction_zext
+; CHECK-NOT: call i64 @llvm.arm.smlald
+; CHECK-NOT: call i32 @llvm.arm.smlad
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i64 [ 0, %entry ], [ %add11, %for.body ]
+  ret i64 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i64 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i64
+  %conv4 = zext i16 %0 to i64
+  %mul = mul nsw i64 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i64
+  %conv8 = zext i16 %1 to i64
+  %mul9 = mul nsw i64 %conv7, %conv8
+  %add10 = add i64 %mul, %mac1.026
+  %add11 = add i64 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
diff --git a/test/CodeGen/ARM/smlald2.ll b/test/CodeGen/ARM/smlald2.ll
index bf70489f9790503379f43b980610ea37d7df52ec..517a9456c0eca82c09fabbd1da995fdda7279efa 100644
--- a/test/CodeGen/ARM/smlald2.ll
+++ b/test/CodeGen/ARM/smlald2.ll
@@ -136,3 +136,89 @@ for.body:
   %cmp = icmp slt i32 %add29, %arg
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
+
+define i64 @zext_mul_reduction(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+; CHECK-LABEL: @zext_mul_reduction
+; CHECK-NOT: call i64 @llvm.arm.smlald
+; CHECK-NOT: call i32 @llvm.arm.smlad
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i64 [ 0, %entry ], [ %add11, %for.body ]
+  ret i64 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i64 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = zext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %sext0 = sext i32 %mul to i64
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = zext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %sext1 = sext i32 %mul9 to i64
+  %add10 = add i64 %sext0, %mac1.026
+  %add11 = add i64 %sext1, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+define i64 @zext_add_reduction(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+; CHECK-LABEL: @zext_add_reduction
+; CHECK-NOT: call i64 @llvm.arm.smlald
+; CHECK-NOT: call i32 @llvm.arm.smlad
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i64 [ 0, %entry ], [ %add11, %for.body ]
+  ret i64 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i64 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %sext0 = zext i32 %mul to i64
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %sext1 = zext i32 %mul9 to i64
+  %add10 = add i64 %sext0, %mac1.026
+  %add11 = add i64 %sext1, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll
index 79048348e9bb645f9fdc2bf03bf12200c040b494..af34000b2a0ba157a7c33967621f575379acdecf 100644
--- a/test/CodeGen/ARM/smml.ll
+++ b/test/CodeGen/ARM/smml.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6T2
 ; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6T2
 ; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V4
-; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6T2
+; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6T2
 
 define i32 @Test0(i32 %a, i32 %b, i32 %c) nounwind readnone ssp {
 entry:
diff --git a/test/CodeGen/ARM/softfp-fabs-fneg.ll b/test/CodeGen/ARM/softfp-fabs-fneg.ll
index b7c684d35b5719a8b58ff763dfca89cb1c4ca122..9777995fe64e1243027a6484dae18f7466501934 100644
--- a/test/CodeGen/ARM/softfp-fabs-fneg.ll
+++ b/test/CodeGen/ARM/softfp-fabs-fneg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=armv7 < %s | FileCheck %s --check-prefix=CHECK-ARM --check-prefix=CHECK
-; RUN: llc -mtriple=thumbv7 < %s | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK
+; RUN: llc -mtriple=armv7 < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7 < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv7--"
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
index 45964e534114de9b48957427551dc97497b62063..9720df795eb6d847decdc550e867dcc389918b79 100644
--- a/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -75,7 +75,7 @@ if.else:
 ; CHECK: cmp
 define i32 @bc_raise(i1 %cond) nounwind ssp {
 entry:
-  %val.2.i = select i1 %cond, i32 0, i32 undef
+  %val.2.i = select i1 %cond, i32 0, i32 1
   %sub.i = sub nsw i32 0, %val.2.i
   %retval.0.i = select i1 %cond, i32 %val.2.i, i32 %sub.i
   %cmp1 = icmp eq i32 %retval.0.i, 0
diff --git a/test/CodeGen/ARM/tls-models.ll b/test/CodeGen/ARM/tls-models.ll
index 0a82f4d6c8f25c88b90b8b50840d283bddeec3d7..33c85299bdcc56483d697a20506edb6d8e7caa85 100644
--- a/test/CodeGen/ARM/tls-models.ll
+++ b/test/CodeGen/ARM/tls-models.ll
@@ -3,9 +3,9 @@
 ; RUN: llc -mtriple=arm-linux-gnueabi -relocation-model=pic < %s \
 ; RUN:     | FileCheck -check-prefix=CHECK-PIC  -check-prefix=COMMON %s
 ; RUN: llc -emulated-tls -mtriple=arm-linux-gnueabi < %s \
-; RUN:     | FileCheck -check-prefix=EMUNONPIC -check-prefix=EMU -check-prefix=COMMON %s
+; RUN:     | FileCheck -check-prefix=EMU -check-prefix=COMMON %s
 ; RUN: llc -emulated-tls -mtriple=arm-linux-gnueabi -relocation-model=pic < %s \
-; RUN:     | FileCheck -check-prefix=EMUPIC -check-prefix=EMU -check-prefix=COMMON %s
+; RUN:     | FileCheck -check-prefix=EMU -check-prefix=COMMON %s
 
 
 @external_gd = external thread_local global i32
diff --git a/test/CodeGen/ARM/umulo-32.ll b/test/CodeGen/ARM/umulo-32.ll
index 1c8357314c288f16f9d41e9889b79b109f33622f..cfd132aebcc0dde78765a030c8e94bd5ba37db8b 100644
--- a/test/CodeGen/ARM/umulo-32.ll
+++ b/test/CodeGen/ARM/umulo-32.ll
@@ -2,12 +2,12 @@
 
 %umul.ty = type { i32, i1 }
 
-define i32 @test1(i32 %a) nounwind {
+define i32 @test1(i32 %a, i1 %x) nounwind {
 ; CHECK: test1:
 ; CHECK: muldi3
   %tmp0 = tail call %umul.ty @llvm.umul.with.overflow.i32(i32 %a, i32 37)
   %tmp1 = extractvalue %umul.ty %tmp0, 0
-  %tmp2 = select i1 undef, i32 -1, i32 %tmp1
+  %tmp2 = select i1 %x, i32 -1, i32 %tmp1
   ret i32 %tmp2
 }
 
diff --git a/test/CodeGen/ARM/vcvt.ll b/test/CodeGen/ARM/vcvt.ll
index 7052607bf80f4453dc582af67361c47dfa155927..f16c8dc3a151f1d3ecb7b1af99d5d69417ee06b1 100644
--- a/test/CodeGen/ARM/vcvt.ll
+++ b/test/CodeGen/ARM/vcvt.ll
@@ -293,14 +293,14 @@ define <4 x i16> @fix_double_to_i16(<4 x double> %in) {
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
 ; CHECK-NEXT:    vmov d19, r2, r3
 ; CHECK-NEXT:    vadd.f64 d18, d18, d18
-; CHECK-NEXT:    vcvt.u32.f64 s0, d18
+; CHECK-NEXT:    vcvt.s32.f64 s0, d18
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vadd.f64 d20, d16, d16
 ; CHECK-NEXT:    vadd.f64 d19, d19, d19
 ; CHECK-NEXT:    vadd.f64 d16, d17, d17
-; CHECK-NEXT:    vcvt.u32.f64 s2, d20
-; CHECK-NEXT:    vcvt.u32.f64 s4, d19
-; CHECK-NEXT:    vcvt.u32.f64 s6, d16
+; CHECK-NEXT:    vcvt.s32.f64 s2, d20
+; CHECK-NEXT:    vcvt.s32.f64 s4, d19
+; CHECK-NEXT:    vcvt.s32.f64 s6, d16
 ; CHECK-NEXT:    vmov.32 d16[0], r0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov.32 d17[0], r0
@@ -308,7 +308,7 @@ define <4 x i16> @fix_double_to_i16(<4 x double> %in) {
 ; CHECK-NEXT:    vmov.32 d16[1], r0
 ; CHECK-NEXT:    vmov r0, s6
 ; CHECK-NEXT:    vmov.32 d17[1], r0
-; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vuzp.16 d16, d17
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 
diff --git a/test/CodeGen/ARM/vector-DAGCombine.ll b/test/CodeGen/ARM/vector-DAGCombine.ll
index 8623d2c164bae76776aed403996f2ed028704ccb..ba045a8c5303e79b3a22c28746928066bbfd4aa3 100644
--- a/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -237,17 +237,10 @@ entry:
 ; illegal type to a legal type.
 define <2 x i8> @test_truncate(<2 x i128> %in) {
 ; CHECK-LABEL: test_truncate:
-; REG2 Should map on the same Q register as REG1, i.e., REG2 = REG1 - 1, but we
-; cannot express that.
-; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r0
+; CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
 ; CHECK-NEXT: mov [[BASE:r[0-9]+]], sp
-; CHECK-NEXT: vld1.32 {[[REG1:d[0-9]+]][0]}, {{\[}}[[BASE]]:32]
-; CHECK-NEXT: add [[BASE2:r[0-9]+]], [[BASE]], #4
-; CHECK-NEXT: vmov.32 [[REG2]][1], r1
-; CHECK-NEXT: vld1.32 {[[REG1]][1]}, {{\[}}[[BASE2]]:32]
-; The Q register used here should match floor(REG1/2), but we cannot express that.
-; CHECK-NEXT: vmovn.i64 [[RES:d[0-9]+]], q{{[0-9]+}}
-; CHECK-NEXT: vmov r0, r1, [[RES]]
+; CHECK-NEXT: vld1.32 {[[REG]][1]}, {{\[}}[[BASE]]:32]
+; CHECK-NEXT: vmov r0, r1, [[REG]]
 entry:
   %res = trunc <2 x i128> %in to <2 x i8>
   ret <2 x i8> %res
diff --git a/test/CodeGen/AVR/directmem.ll b/test/CodeGen/AVR/directmem.ll
index d0674cbc459510a2ecdb944b6d6f26b2b18b7a7c..6d2ddc536d2a7e4a239b9373f7100a489fa35999 100644
--- a/test/CodeGen/AVR/directmem.ll
+++ b/test/CodeGen/AVR/directmem.ll
@@ -84,6 +84,11 @@ define i16 @global16_load() {
 define void @array16_store() {
 ; CHECK-LABEL: array16_store:
 
+; CHECK: ldi [[REG1:r[0-9]+]], 221
+; CHECK: ldi [[REG2:r[0-9]+]], 170
+; CHECK: sts int.array+5, [[REG2]]
+; CHECK: sts int.array+4, [[REG1]]
+
 ; CHECK: ldi [[REG1:r[0-9]+]], 204
 ; CHECK: ldi [[REG2:r[0-9]+]], 170
 ; CHECK: sts int.array+3, [[REG2]]
@@ -93,12 +98,6 @@ define void @array16_store() {
 ; CHECK: ldi [[REG2:r[0-9]+]], 170
 ; CHECK: sts int.array+1, [[REG2]]
 ; CHECK: sts int.array, [[REG1]]
-
-
-; CHECK: ldi [[REG1:r[0-9]+]], 221
-; CHECK: ldi [[REG2:r[0-9]+]], 170
-; CHECK: sts int.array+5, [[REG2]]
-; CHECK: sts int.array+4, [[REG1]]
   store i16 43707, i16* getelementptr inbounds ([3 x i16], [3 x i16]* @int.array, i32 0, i64 0)
   store i16 43724, i16* getelementptr inbounds ([3 x i16], [3 x i16]* @int.array, i32 0, i64 1)
   store i16 43741, i16* getelementptr inbounds ([3 x i16], [3 x i16]* @int.array, i32 0, i64 2)
@@ -152,30 +151,36 @@ define i32 @global32_load() {
 
 define void @array32_store() {
 ; CHECK-LABEL: array32_store:
+
+; CHECK: ldi [[REG1:r[0-9]+]], 170
+; CHECK: ldi [[REG2:r[0-9]+]], 153
+; CHECK: sts long.array+11, [[REG2]]
+; CHECK: sts long.array+10, [[REG1]]
+
+; CHECK: ldi [[REG1:r[0-9]+]], 204
+; CHECK: ldi [[REG2:r[0-9]+]], 187
+; CHECK: sts long.array+9, [[REG2]]
+; CHECK: sts long.array+8, [[REG1]]
+
 ; CHECK: ldi [[REG1:r[0-9]+]], 102
 ; CHECK: ldi [[REG2:r[0-9]+]], 85
 ; CHECK: sts long.array+7, [[REG2]]
 ; CHECK: sts long.array+6, [[REG1]]
+
 ; CHECK: ldi [[REG1:r[0-9]+]], 136
 ; CHECK: ldi [[REG2:r[0-9]+]], 119
 ; CHECK: sts long.array+5, [[REG2]]
 ; CHECK: sts long.array+4, [[REG1]]
+
 ; CHECK: ldi [[REG1:r[0-9]+]], 27
 ; CHECK: ldi [[REG2:r[0-9]+]], 172
 ; CHECK: sts long.array+3, [[REG2]]
 ; CHECK: sts long.array+2, [[REG1]]
+
 ; CHECK: ldi [[REG1:r[0-9]+]], 68
 ; CHECK: ldi [[REG2:r[0-9]+]], 13
 ; CHECK: sts long.array+1, [[REG2]]
 ; CHECK: sts long.array, [[REG1]]
-; CHECK: ldi [[REG1:r[0-9]+]], 170
-; CHECK: ldi [[REG2:r[0-9]+]], 153
-; CHECK: sts long.array+11, [[REG2]]
-; CHECK: sts long.array+10, [[REG1]]
-; CHECK: ldi [[REG1:r[0-9]+]], 204
-; CHECK: ldi [[REG2:r[0-9]+]], 187
-; CHECK: sts long.array+9, [[REG2]]
-; CHECK: sts long.array+8, [[REG1]]
   store i32 2887454020, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @long.array, i32 0, i64 0)
   store i32 1432778632, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @long.array, i32 0, i64 1)
   store i32 2578103244, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @long.array, i32 0, i64 2)
diff --git a/test/CodeGen/Generic/fneg-fabs.ll b/test/CodeGen/Generic/fneg-fabs.ll
index 2f2f59762cb9b08d9c33fa9fcd40e84cae7d41a1..cafc1f132b57c138baba223ce87671d1669431f5 100644
--- a/test/CodeGen/Generic/fneg-fabs.ll
+++ b/test/CodeGen/Generic/fneg-fabs.ll
@@ -10,6 +10,21 @@ define float @fnegf(float %X) {
         ret float %Y
 }
 
+define double @real_fneg(double %X) {
+        %Y = fneg double %X               ; <double> [#uses=1]
+        ret double %Y
+}
+
+define double @real_fneg_constant() {
+        %Y = fneg double -2.0             ; <double> [#uses=1]
+        ret double %Y
+}
+
+define float @real_fnegf(float %X) {
+        %Y = fneg float %X                ; <float> [#uses=1]
+        ret float %Y
+}
+
 declare double @fabs(double)
 
 declare float @fabsf(float)
diff --git a/test/CodeGen/Generic/llc-start-stop-instance-errors.ll b/test/CodeGen/Generic/llc-start-stop-instance-errors.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cb1b30f3a8197d99cd19a7152e9c0096524c582d
--- /dev/null
+++ b/test/CodeGen/Generic/llc-start-stop-instance-errors.ll
@@ -0,0 +1,4 @@
+; RUN: not llc -debug-pass=Structure -stop-after=dead-mi-elimination,arst %s -o /dev/null 2>&1 \
+; RUN:   | FileCheck -check-prefix=NOT-NUM %s
+
+; NOT-NUM: LLVM ERROR: invalid pass instance specifier dead-mi-elimination,arst
diff --git a/test/CodeGen/Hexagon/addrmode-immop.mir b/test/CodeGen/Hexagon/addrmode-immop.mir
new file mode 100644
index 0000000000000000000000000000000000000000..6aa72fd3414bf97453309f795eb0c01721772451
--- /dev/null
+++ b/test/CodeGen/Hexagon/addrmode-immop.mir
@@ -0,0 +1,40 @@
+# RUN: llc -march=hexagon -run-pass amode-opt -verify-machineinstrs %s -o - | FileCheck %s
+# REQUIRES: asserts
+
+# Check that this doesn't crash.
+# CHECK: $r2 = L2_loadri_io killed $r2, @f1 - 1
+
+--- |
+  target triple = "hexagon-unknown-unknown-elf"
+
+  %s.0 = type { i32 (...)**, i32, i32, %s.1 }
+  %s.1 = type { i32, i32 }
+
+  @g0 = external dso_local unnamed_addr constant { [3 x i8*], [3 x i8*] }, align 4
+
+  ; Function Attrs: norecurse
+  define void @f0() #0 {
+  b0:
+    %v0 = load i32 (%s.0*)*, i32 (%s.0*)** bitcast (i8* getelementptr (i8, i8* bitcast (i8** getelementptr inbounds ({ [3 x i8*], [3 x i8*] }, { [3 x i8*], [3 x i8*] }* @g0, i32 0, inrange i32 0, i32 3) to i8*), i32 sub (i32 ptrtoint (i32 (%s.0*)* @f1 to i32), i32 1)) to i32 (%s.0*)**), align 4
+    %v1 = call i32 %v0(%s.0* nonnull undef)
+    unreachable
+  }
+
+  ; Function Attrs: norecurse nounwind
+  declare dso_local i32 @f1(%s.0*) #1 align 2
+
+  attributes #0 = { norecurse "target-cpu"="hexagonv60" }
+  attributes #1 = { norecurse nounwind "target-cpu"="hexagonv60" }
+...
+
+---
+name: f0
+tracksRegLiveness: true
+body: |
+  bb.0.b0:
+    $r2 = A2_tfrsi @g0 + 12
+    $r2 = L2_loadri_io killed $r2, @f1 - 1 :: (load 4 from `i32 (%s.0*)** bitcast (i8* getelementptr (i8, i8* bitcast (i8** getelementptr inbounds ({ [3 x i8*], [3 x i8*] }, { [3 x i8*], [3 x i8*] }* @g0, i32 0, inrange i32 0, i32 3) to i8*), i32 sub (i32 ptrtoint (i32 (%s.0*)* @f1 to i32), i32 1)) to i32 (%s.0*)**)`)
+    ADJCALLSTACKDOWN 0, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
+    PS_callr_nr killed $r2, hexagoncsr, implicit undef $r0, implicit-def $r29, implicit-def dead $r0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
+...
diff --git a/test/CodeGen/Hexagon/autohvx/bitwise-pred-128b.ll b/test/CodeGen/Hexagon/autohvx/bitwise-pred-128b.ll
index 129c95205b94265a5d7d59aea5c1848e857b2d8c..0fc8ba4bf1dce1bfdcd68aaf0c585ca324373e4a 100644
--- a/test/CodeGen/Hexagon/autohvx/bitwise-pred-128b.ll
+++ b/test/CodeGen/Hexagon/autohvx/bitwise-pred-128b.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; CHECK-LABEL: t00
-; CHECK: and(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vand(v{{[0-9:]+}},v{{[0-9:]+}})
 define <128 x i8> @t00(<128 x i8> %a0, <128 x i8> %a1) #0 {
   %q0 = trunc <128 x i8> %a0 to <128 x i1>
   %q1 = trunc <128 x i8> %a1 to <128 x i1>
@@ -13,7 +13,7 @@ define <128 x i8> @t00(<128 x i8> %a0, <128 x i8> %a1) #0 {
 declare <1024 x i1> @llvm.hexagon.vandvrt.128B(<128 x i8>, i32)
 
 ; CHECK-LABEL: t01
-; CHECK: or(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <128 x i8> @t01(<128 x i8> %a0, <128 x i8> %a1) #0 {
   %q0 = trunc <128 x i8> %a0 to <128 x i1>
   %q1 = trunc <128 x i8> %a1 to <128 x i1>
@@ -23,7 +23,7 @@ define <128 x i8> @t01(<128 x i8> %a0, <128 x i8> %a1) #0 {
 }
 
 ; CHECK-LABEL: t02
-; CHECK: xor(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vxor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <128 x i8> @t02(<128 x i8> %a0, <128 x i8> %a1) #0 {
   %q0 = trunc <128 x i8> %a0 to <128 x i1>
   %q1 = trunc <128 x i8> %a1 to <128 x i1>
@@ -33,7 +33,7 @@ define <128 x i8> @t02(<128 x i8> %a0, <128 x i8> %a1) #0 {
 }
 
 ; CHECK-LABEL: t10
-; CHECK: and(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vand(v{{[0-9:]+}},v{{[0-9:]+}})
 define <64 x i16> @t10(<64 x i16> %a0, <64 x i16> %a1) #0 {
   %q0 = trunc <64 x i16> %a0 to <64 x i1>
   %q1 = trunc <64 x i16> %a1 to <64 x i1>
@@ -43,7 +43,7 @@ define <64 x i16> @t10(<64 x i16> %a0, <64 x i16> %a1) #0 {
 }
 
 ; CHECK-LABEL: t11
-; CHECK: or(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <64 x i16> @t11(<64 x i16> %a0, <64 x i16> %a1) #0 {
   %q0 = trunc <64 x i16> %a0 to <64 x i1>
   %q1 = trunc <64 x i16> %a1 to <64 x i1>
@@ -53,7 +53,7 @@ define <64 x i16> @t11(<64 x i16> %a0, <64 x i16> %a1) #0 {
 }
 
 ; CHECK-LABEL: t12
-; CHECK: xor(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vxor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <64 x i16> @t12(<64 x i16> %a0, <64 x i16> %a1) #0 {
   %q0 = trunc <64 x i16> %a0 to <64 x i1>
   %q1 = trunc <64 x i16> %a1 to <64 x i1>
@@ -63,7 +63,7 @@ define <64 x i16> @t12(<64 x i16> %a0, <64 x i16> %a1) #0 {
 }
 
 ; CHECK-LABEL: t20
-; CHECK: and(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vand(v{{[0-9:]+}},v{{[0-9:]+}})
 define <32 x i32> @t20(<32 x i32> %a0, <32 x i32> %a1) #0 {
   %q0 = trunc <32 x i32> %a0 to <32 x i1>
   %q1 = trunc <32 x i32> %a1 to <32 x i1>
@@ -73,7 +73,7 @@ define <32 x i32> @t20(<32 x i32> %a0, <32 x i32> %a1) #0 {
 }
 
 ; CHECK-LABEL: t21
-; CHECK: or(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <32 x i32> @t21(<32 x i32> %a0, <32 x i32> %a1) #0 {
   %q0 = trunc <32 x i32> %a0 to <32 x i1>
   %q1 = trunc <32 x i32> %a1 to <32 x i1>
@@ -83,7 +83,7 @@ define <32 x i32> @t21(<32 x i32> %a0, <32 x i32> %a1) #0 {
 }
 
 ; CHECK-LABEL: t22
-; CHECK: xor(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vxor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <32 x i32> @t22(<32 x i32> %a0, <32 x i32> %a1) #0 {
   %q0 = trunc <32 x i32> %a0 to <32 x i1>
   %q1 = trunc <32 x i32> %a1 to <32 x i1>
diff --git a/test/CodeGen/Hexagon/autohvx/bitwise-pred-64b.ll b/test/CodeGen/Hexagon/autohvx/bitwise-pred-64b.ll
index 1b547f7801733bf04fbc7f69bba4a07339690846..3895670a04437f1b221a0a964aef865cdf6ab337 100644
--- a/test/CodeGen/Hexagon/autohvx/bitwise-pred-64b.ll
+++ b/test/CodeGen/Hexagon/autohvx/bitwise-pred-64b.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; CHECK-LABEL: t00
-; CHECK: and(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vand(v{{[0-9:]+}},v{{[0-9:]+}})
 define <64 x i8> @t00(<64 x i8> %a0, <64 x i8> %a1) #0 {
   %q0 = trunc <64 x i8> %a0 to <64 x i1>
   %q1 = trunc <64 x i8> %a1 to <64 x i1>
@@ -11,7 +11,7 @@ define <64 x i8> @t00(<64 x i8> %a0, <64 x i8> %a1) #0 {
 }
 
 ; CHECK-LABEL: t01
-; CHECK: or(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <64 x i8> @t01(<64 x i8> %a0, <64 x i8> %a1) #0 {
   %q0 = trunc <64 x i8> %a0 to <64 x i1>
   %q1 = trunc <64 x i8> %a1 to <64 x i1>
@@ -21,7 +21,7 @@ define <64 x i8> @t01(<64 x i8> %a0, <64 x i8> %a1) #0 {
 }
 
 ; CHECK-LABEL: t02
-; CHECK: xor(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vxor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <64 x i8> @t02(<64 x i8> %a0, <64 x i8> %a1) #0 {
   %q0 = trunc <64 x i8> %a0 to <64 x i1>
   %q1 = trunc <64 x i8> %a1 to <64 x i1>
@@ -31,7 +31,7 @@ define <64 x i8> @t02(<64 x i8> %a0, <64 x i8> %a1) #0 {
 }
 
 ; CHECK-LABEL: t10
-; CHECK: and(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vand(v{{[0-9:]+}},v{{[0-9:]+}})
 define <32 x i16> @t10(<32 x i16> %a0, <32 x i16> %a1) #0 {
   %q0 = trunc <32 x i16> %a0 to <32 x i1>
   %q1 = trunc <32 x i16> %a1 to <32 x i1>
@@ -41,7 +41,7 @@ define <32 x i16> @t10(<32 x i16> %a0, <32 x i16> %a1) #0 {
 }
 
 ; CHECK-LABEL: t11
-; CHECK: or(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <32 x i16> @t11(<32 x i16> %a0, <32 x i16> %a1) #0 {
   %q0 = trunc <32 x i16> %a0 to <32 x i1>
   %q1 = trunc <32 x i16> %a1 to <32 x i1>
@@ -51,7 +51,7 @@ define <32 x i16> @t11(<32 x i16> %a0, <32 x i16> %a1) #0 {
 }
 
 ; CHECK-LABEL: t12
-; CHECK: xor(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vxor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <32 x i16> @t12(<32 x i16> %a0, <32 x i16> %a1) #0 {
   %q0 = trunc <32 x i16> %a0 to <32 x i1>
   %q1 = trunc <32 x i16> %a1 to <32 x i1>
@@ -61,7 +61,7 @@ define <32 x i16> @t12(<32 x i16> %a0, <32 x i16> %a1) #0 {
 }
 
 ; CHECK-LABEL: t20
-; CHECK: and(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vand(v{{[0-9:]+}},v{{[0-9:]+}})
 define <16 x i32> @t20(<16 x i32> %a0, <16 x i32> %a1) #0 {
   %q0 = trunc <16 x i32> %a0 to <16 x i1>
   %q1 = trunc <16 x i32> %a1 to <16 x i1>
@@ -71,7 +71,7 @@ define <16 x i32> @t20(<16 x i32> %a0, <16 x i32> %a1) #0 {
 }
 
 ; CHECK-LABEL: t21
-; CHECK: or(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <16 x i32> @t21(<16 x i32> %a0, <16 x i32> %a1) #0 {
   %q0 = trunc <16 x i32> %a0 to <16 x i1>
   %q1 = trunc <16 x i32> %a1 to <16 x i1>
@@ -81,7 +81,7 @@ define <16 x i32> @t21(<16 x i32> %a0, <16 x i32> %a1) #0 {
 }
 
 ; CHECK-LABEL: t22
-; CHECK: xor(q{{[0-3]}},q{{[0-3]}})
+; CHECK: vxor(v{{[0-9:]+}},v{{[0-9:]+}})
 define <16 x i32> @t22(<16 x i32> %a0, <16 x i32> %a1) #0 {
   %q0 = trunc <16 x i32> %a0 to <16 x i1>
   %q1 = trunc <16 x i32> %a1 to <16 x i1>
diff --git a/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll
index 86687b3c047e5e76deefe7542e1875e5f9c98df6..c79525442c6a7a00d531eb048b20345c0649ff70 100644
--- a/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll
+++ b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll
@@ -9,11 +9,11 @@ target triple = "hexagon"
 @g0 = global <16 x float> zeroinitializer, align 8
 @g1 = global <16 x i32> zeroinitializer, align 8
 
-define void @fred() #0 {
+define void @fred(<16 x i16> %x) #0 {
 b0:
   %v1 = load <16 x float>, <16 x float>* @g0, align 8
   %v2 = fcmp olt <16 x float> undef, %v1
-  %v3 = select <16 x i1> %v2, <16 x i16> undef, <16 x i16> zeroinitializer
+  %v3 = select <16 x i1> %v2, <16 x i16> %x, <16 x i16> zeroinitializer
   %v4 = sext <16 x i16> %v3 to <16 x i32>
   store <16 x i32> %v4, <16 x i32>* @g1, align 64
   ret void
diff --git a/test/CodeGen/Hexagon/autohvx/isel-concat-multiple.ll b/test/CodeGen/Hexagon/autohvx/isel-concat-multiple.ll
index 67d8e6663cbd50d7adb289541ef33c892f97e992..4645ebce5de6b5b4f62b03080931436fb409154e 100644
--- a/test/CodeGen/Hexagon/autohvx/isel-concat-multiple.ll
+++ b/test/CodeGen/Hexagon/autohvx/isel-concat-multiple.ll
@@ -15,16 +15,16 @@ b0:
   %v4 = sub nsw i32 0, %v3
   %v5 = load i32, i32* %a1, align 4
   %v6 = insertelement <2 x i32> undef, i32 %v5, i32 1
-  %v7 = add nsw <2 x i32> undef, %v6
+  %v7 = add nsw <2 x i32> %v6, %v6
   %v8 = extractelement <2 x i32> %v7, i32 0
   %v9 = insertelement <4 x i32> undef, i32 %v4, i32 2
   %v10 = insertelement <4 x i32> %v9, i32 undef, i32 3
-  %v11 = add <4 x i32> %v10, <i32 131072, i32 131072, i32 131072, i32 131072>
+  %v11 = add <4 x i32> %v10, %v10
   %v12 = sub <4 x i32> %v11, zeroinitializer
   %v13 = shufflevector <4 x i32> %v12, <4 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
   %v14 = shufflevector <8 x i32> undef, <8 x i32> %v13, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   %v15 = lshr <8 x i32> %v14, <i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18>
-  %v16 = and <8 x i32> %v15, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023>
+  %v16 = and <8 x i32> %v15, %v14
   %v17 = extractelement <8 x i32> %v16, i32 5
   %v18 = getelementptr inbounds i8, i8* null, i32 %v17
   %v19 = load i8, i8* %v18, align 1
diff --git a/test/CodeGen/Hexagon/autohvx/isel-extractelt-illegal-type.ll b/test/CodeGen/Hexagon/autohvx/isel-extractelt-illegal-type.ll
index 2e915e017f50d1221c3db00bc38c56cf120f18b9..cc44c149ca7247707d8f91d87dfd5e9d5f82fa76 100644
--- a/test/CodeGen/Hexagon/autohvx/isel-extractelt-illegal-type.ll
+++ b/test/CodeGen/Hexagon/autohvx/isel-extractelt-illegal-type.ll
@@ -21,7 +21,7 @@ b0:
   %v7 = tail call i32 @llvm.hexagon.A2.subh.l16.sat.ll(i32 %v6, i32 16)
   %v8 = trunc i32 %v7 to i16
   %v9 = icmp sgt i16 %v8, -1
-  %v10 = select i1 %v9, i16 0, i16 undef
+  %v10 = select i1 %v9, i16 0, i16 1
   ret i16 %v10
 }
 
diff --git a/test/CodeGen/Hexagon/dfp.ll b/test/CodeGen/Hexagon/dfp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e4fe10bad0bd75611cff975cebafb15f9b9349bc
--- /dev/null
+++ b/test/CodeGen/Hexagon/dfp.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; CHECK-LABEL: df_add:
+; CHECK: dfadd
+define double @df_add(double %x, double %y) local_unnamed_addr #0 {
+entry:
+  %add = fadd double %x, %y
+  ret double %add
+}
+
+; CHECK-LABEL: df_sub:
+; CHECK: dfsub
+define double @df_sub(double %x, double %y) local_unnamed_addr #0 {
+entry:
+  %sub = fsub double %x, %y
+  ret double %sub
+}
+
+attributes #0 = { norecurse nounwind readnone "target-cpu"="hexagonv66" }
diff --git a/test/CodeGen/Hexagon/expand-condsets-pred-undef2.ll b/test/CodeGen/Hexagon/expand-condsets-pred-undef2.ll
index e581506109d805bbef99f1c83a2d63682a533f77..5201f3e30ebc91bd132bb7dad23f4f50b8775b09 100644
--- a/test/CodeGen/Hexagon/expand-condsets-pred-undef2.ll
+++ b/test/CodeGen/Hexagon/expand-condsets-pred-undef2.ll
@@ -3,11 +3,11 @@
 ; CHECK: if{{.*}}sub
 
 ; Function Attrs: nounwind
-define i32 @f0(i32 %a0, i32 %a1, i32 %a2) #0 {
+define i32 @f0(i32 %a0, i32 %a1, i32 %a2, i1 %x) #0 {
 b0:
   %v0 = add i32 %a0, %a2
   %v1 = sub i32 %a1, %a2
-  %v2 = select i1 undef, i32 %v0, i32 %v1
+  %v2 = select i1 %x, i32 %v0, i32 %v1
   ret i32 %v2
 }
 
diff --git a/test/CodeGen/Hexagon/expand-wselect.mir b/test/CodeGen/Hexagon/expand-wselect.mir
new file mode 100644
index 0000000000000000000000000000000000000000..cb3dbc25aa5ec3640c625e70371249ec1d90b190
--- /dev/null
+++ b/test/CodeGen/Hexagon/expand-wselect.mir
@@ -0,0 +1,13 @@
+# RUN: llc -march=hexagon -run-pass postrapseudos %s -o - | FileCheck %s
+
+# Check that this doesn't crash.
+# CHECK: $w2 = V6_vccombine $p0, $v1, $v0
+# CHECK: $w2 = V6_vnccombine killed $p0, $v3, $v2, implicit $w2
+
+name: fred
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $w0, $w1
+    $w2 = PS_wselect killed $p0, killed $w0, killed $w1
+---
diff --git a/test/CodeGen/Hexagon/intrinsics-v66.ll b/test/CodeGen/Hexagon/intrinsics-v66.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2732b9cb39611362e991a95bad520d7c722c91c7
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics-v66.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv66 < %s | FileCheck %s
+
+; CHECK-LABEL: @test1
+; CHECK: r0 -= mpyi(r1,r2)
+define i32 @test1(i32 %rx, i32 %rs, i32 %rt) local_unnamed_addr #0 {
+entry:
+  %v0 = tail call i32 @llvm.hexagon.M2.mnaci(i32 %rx, i32 %rs, i32 %rt)
+  ret i32 %v0
+}
+
+declare i32 @llvm.hexagon.M2.mnaci(i32, i32, i32) #1
+
+; CHECK-LABEL: @test2
+; CHECK: r1:0 = dfadd(r1:0,r3:2)
+define double @test2(double %rss, double %rtt) local_unnamed_addr #0 {
+entry:
+  %v0 = tail call double @llvm.hexagon.F2.dfadd(double %rss, double %rtt)
+  ret double %v0
+}
+
+declare double @llvm.hexagon.F2.dfadd(double, double) #1
+
+; CHECK-LABEL: @test3
+; CHECK: r1:0 = dfsub(r1:0,r3:2)
+define double @test3(double %rss, double %rtt) local_unnamed_addr #0 {
+entry:
+  %v0 = tail call double @llvm.hexagon.F2.dfsub(double %rss, double %rtt)
+  ret double %v0
+}
+
+declare double @llvm.hexagon.F2.dfsub(double, double) #1
+
+; CHECK-LABEL: @test4
+; CHECK: r0 = mask(#1,#2)
+define i32 @test4() local_unnamed_addr #0 {
+entry:
+  %v0 = tail call i32 @llvm.hexagon.S2.mask(i32 1, i32 2)
+  ret i32 %v0
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.S2.mask(i32, i32) #1
+
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv66" "target-features"="-hvx,-long-calls" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/isel-global-offset-alignment.ll b/test/CodeGen/Hexagon/isel-global-offset-alignment.ll
index 3c56db4d90d9a447f3f37164e2844a12064598e7..ed37687a15949f514d217f6c164c88c4137fd1a1 100644
--- a/test/CodeGen/Hexagon/isel-global-offset-alignment.ll
+++ b/test/CodeGen/Hexagon/isel-global-offset-alignment.ll
@@ -3,17 +3,17 @@
 ; This should compile without errors, and the offsets with respect to the
 ; beginning of the global "array" don't need to be multiples of 8.
 ;
-; CHECK-DAG: memd(r0+##array+174)
-; CHECK-DAG: memd(r0+##array+182)
+; CHECK-DAG: memd(r2+##array+174)
+; CHECK-DAG: memd(r2+##array+182)
 
 target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
 target triple = "hexagon"
 
 @array = external global [1000000 x i16], align 8
 
-define void @fred() #0 {
+define void @fred(i1 %x) #0 {
 b0:
-  br i1 undef, label %b3, label %b1
+  br i1 %x, label %b3, label %b1
 
 b1:                                               ; preds = %b0
   %v2 = add i32 0, 512
diff --git a/test/CodeGen/Hexagon/isel-vlsr-v2i16.ll b/test/CodeGen/Hexagon/isel-vlsr-v2i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..995ce06129d810d9d69a34dff8b6943a1ab3f1a6
--- /dev/null
+++ b/test/CodeGen/Hexagon/isel-vlsr-v2i16.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; This used to crash with "cannot select" error.
+; CHECK: vlsrh(r1:0,#4)
+
+target triple = "hexagon-unknown-linux-gnu"
+
+define <2 x i16> @foo(<2 x i32>* nocapture %v) nounwind {
+  %vec = load <2 x i32>, <2 x i32>* %v, align 8
+  %trunc = trunc <2 x i32> %vec to <2 x i16>
+  %r = lshr <2 x i16> %trunc, <i16 4, i16 4>
+  ret <2 x i16> %r
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
+
diff --git a/test/CodeGen/Hexagon/mnaci_v66.ll b/test/CodeGen/Hexagon/mnaci_v66.ll
new file mode 100644
index 0000000000000000000000000000000000000000..63f3788fe565ec58adf7ed19b1da9a10e6a31de7
--- /dev/null
+++ b/test/CodeGen/Hexagon/mnaci_v66.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; This test validates the generation of v66 only instruction M2_mnaci
+; CHECK: r{{[0-9]+}} -= mpyi(r{{[0-9]+}},r{{[0-9]+}})
+
+target triple = "hexagon-unknown--elf"
+
+; Function Attrs: norecurse nounwind readnone
+define i32 @_Z4testiii(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+  %mul = mul nsw i32 %c, %b
+  %sub = sub nsw i32 %a, %mul
+  ret i32 %sub
+}
+
+attributes #0 = { norecurse nounwind readnone "target-cpu"="hexagonv66" "target-features"="-hvx,-long-calls" }
diff --git a/test/CodeGen/Hexagon/multi-cycle.ll b/test/CodeGen/Hexagon/multi-cycle.ll
index 920ca9fddb4a33ded5c989b5efe7be585965ffd6..1e2ea26a7931cc7e9f714f9e3468c7c32d5d16be 100644
--- a/test/CodeGen/Hexagon/multi-cycle.ll
+++ b/test/CodeGen/Hexagon/multi-cycle.ll
@@ -3,10 +3,10 @@
 ; CHECK: v{{[0-9]+}}.h = vadd(v{{[0-9]+}}.h,v{{[0-9]+}}.h)
 ; CHECK: }
 ; CHECK: {
-; CHECK: v{{[0-9]+}} = valign(v{{[0-9]+}},v{{[0-9]+}},r{{[0-9]+}})
+; CHECK: v{{[0-9]+}} = valign(v{{[0-9]+}},v{{[0-9]+}},#2)
 ; CHECK: }
 ; CHECK: {
-; CHECK: v{{[0-9]+}} = valign(v{{[0-9]+}},v{{[0-9]+}},r{{[0-9]+}})
+; CHECK: v{{[0-9]+}} = valign(v{{[0-9]+}},v{{[0-9]+}},#2)
 
 target triple = "hexagon"
 
diff --git a/test/CodeGen/Hexagon/noreturn-stack-elim.ll b/test/CodeGen/Hexagon/noreturn-stack-elim.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e58215e62e03ce37ad9cd8b9c4927d4621b3cc71
--- /dev/null
+++ b/test/CodeGen/Hexagon/noreturn-stack-elim.ll
@@ -0,0 +1,94 @@
+; RUN: llc -mtriple=hexagon-unknown--elf -hexagon-initial-cfg-cleanup=false < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon-unknown--elf -hexagon-initial-cfg-cleanup=false -mattr=+noreturn-stack-elim < %s | FileCheck %s --check-prefix=CHECK-FLAG
+
+; Test the noreturn stack elimination feature. We've added a new flag/feature
+; that attempts to eliminate the local stack for noreturn nounwind functions.
+; The optimization eliminates the need to save callee saved registers, and
+; eliminates the allocframe, when no local stack space is needed.
+
+%struct.A = type { i32, i32 }
+
+; Test the case when noreturn-stack-elim determins that both callee saved
+; register do not need to be saved, and the allocframe can be eliminated.
+
+; CHECK-LABEL: test1
+; CHECK: memd(r29+#-16) = r17:16
+; CHECK: allocframe
+
+; CHECK-FLAG-LABEL: test1
+; CHECK-FLAG-NOT: memd(r29+#-16) = r17:16
+; CHECK-FLAG-NOT: allocframe
+
+define dso_local void @test1(i32 %a, %struct.A* %b) local_unnamed_addr #0 {
+entry:
+  %n = getelementptr inbounds %struct.A, %struct.A* %b, i32 0, i32 0
+  store i32 %a, i32* %n, align 4
+  tail call void @f1() #3
+  tail call void @nrf1(%struct.A* %b) #4
+  unreachable
+}
+
+; Test that noreturn-stack-elim doesn't eliminate the local stack, when
+; a function needs to allocate a local variable.
+
+; CHECK-LABEL: test2
+; CHECK: allocframe
+
+; CHECK-FLAG-LABEL: test2
+; CHECK-FLAG: allocframe
+
+define dso_local void @test2() local_unnamed_addr #0 {
+entry:
+  %a = alloca i32, align 4
+  %0 = bitcast i32* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
+  call void @f3(i32* nonnull %a) #4
+  unreachable
+}
+
+; Test that noreturn-stack-elim can elimnate the allocframe when no locals
+; are allocated on the stack.
+
+; CHECK-LABEL: test3
+; CHECK: allocframe
+
+; CHECK-FLAG-LABEL: test3
+; CHECK-FLAG-NOT: allocframe
+
+define dso_local void @test3(i32 %a) local_unnamed_addr #0 {
+entry:
+  %add = add nsw i32 %a, 5
+  call void @f2(i32 %add)
+  unreachable
+}
+
+; Test that nothing is optimized when an alloca is needed for local stack.
+
+; CHECK-LABEL: test4
+; CHECK: allocframe
+
+; CHECK-FLAG-LABEL: test4
+; CHECK-FLAG: allocframe
+
+define dso_local void @test4(i32 %n) local_unnamed_addr #0 {
+entry:
+  %vla = alloca i32, i32 %n, align 8
+  call void @f3(i32* nonnull %vla) #4
+  unreachable
+}
+
+
+declare dso_local void @f1() local_unnamed_addr
+declare dso_local void @f2(i32) local_unnamed_addr
+declare dso_local void @f3(i32*) local_unnamed_addr
+
+declare dso_local void @nrf1(%struct.A*) local_unnamed_addr #2
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #5
+
+attributes #0 = { noreturn nounwind }
+attributes #2 = { noreturn }
+attributes #3 = { nounwind }
+attributes #4 = { noreturn nounwind }
+attributes #5 = { argmemonly nounwind }
+
diff --git a/test/CodeGen/Hexagon/packetize-debug-loc.mir b/test/CodeGen/Hexagon/packetize-debug-loc.mir
index 9bcc5ba069317746b9246a667ca1ff63b61d99ba..f86fb05ac670a8fcd5441b10e562a304143de23e 100644
--- a/test/CodeGen/Hexagon/packetize-debug-loc.mir
+++ b/test/CodeGen/Hexagon/packetize-debug-loc.mir
@@ -41,9 +41,9 @@ body: |
 # CHECK-LABEL: name: test
 
 # CHECK: BUNDLE
-# CHECK-SAME: debug-location [[DL1:[0-9x<>]+]]
+# CHECK-SAME: debug-location [[DL1:!DILocation([^)]+)]]
 # CHECK-NEXT: L2_loadri_io $r1, 0, debug-location [[DL1]]
-# CHECK-NEXT: L2_loadri_io $r1, 0, debug-location [[DL2:[0-9x<>]+]]
+# CHECK-NEXT: L2_loadri_io $r1, 0, debug-location [[DL2:!DILocation([^)]+)]]
 
 # CHECK: BUNDLE
 # CHECK-SAME: debug-location [[DL1]]
diff --git a/test/CodeGen/Hexagon/sdata-explicit-section.ll b/test/CodeGen/Hexagon/sdata-explicit-section.ll
new file mode 100644
index 0000000000000000000000000000000000000000..807474f43826c2f1dea24e9fe7eebc6bc7c85bab
--- /dev/null
+++ b/test/CodeGen/Hexagon/sdata-explicit-section.ll
@@ -0,0 +1,6 @@
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -relocation-model=pic < %s | FileCheck %s
+
+; CHECK: .section .sdata.4,"aws",@progbits
+@g0 = global i32 zeroinitializer, section ".sdata"
+
diff --git a/test/CodeGen/Hexagon/swp-const-tc1.ll b/test/CodeGen/Hexagon/swp-const-tc1.ll
index 26ad82a20c424aeefa6d0ba04554704af38f2c9e..95dfc37e3062068cb05f864203f1ad0e2d5ac27c 100644
--- a/test/CodeGen/Hexagon/swp-const-tc1.ll
+++ b/test/CodeGen/Hexagon/swp-const-tc1.ll
@@ -13,7 +13,7 @@
 ; CHECK: memb(r{{[0-9]+}}+#0) =
 
 ; Function Attrs: nounwind optsize
-define void @f0() #0 {
+define void @f0(i1 %x) #0 {
 b0:
   br label %b1
 
@@ -34,7 +34,7 @@ b3:                                               ; preds = %b3, %b2
   %v7 = add i32 %v6, undef
   %v8 = icmp slt i32 undef, %v7
   %v9 = add nsw i32 %v7, 1
-  %v10 = select i1 undef, i32 undef, i32 %v9
+  %v10 = select i1 %x, i32 1, i32 %v9
   %v11 = add i32 %v10, 0
   %v12 = getelementptr inbounds i8, i8* null, i32 %v11
   %v13 = load i8, i8* %v12, align 1, !tbaa !4
diff --git a/test/CodeGen/Lanai/codemodel.ll b/test/CodeGen/Lanai/codemodel.ll
index 6bc0dc4d398c137f4ad0e815ddc1e821218ff574..72d1d65daf52d24809f0538eddbbf05822f72813 100644
--- a/test/CodeGen/Lanai/codemodel.ll
+++ b/test/CodeGen/Lanai/codemodel.ll
@@ -1,5 +1,10 @@
 ; RUN: llc -march=lanai < %s | FileCheck %s
 ; RUN: llc -march=lanai < %s -code-model=small  | FileCheck -check-prefix CHECK-SMALL %s
+; RUN: not llc -march=lanai < %s -code-model=tiny 2>&1 | FileCheck -check-prefix CHECK-TINY %s
+; RUN: not llc -march=lanai < %s -code-model=kernel 2>&1 | FileCheck -check-prefix CHECK-KERNEL %s
+
+; CHECK-TINY: Target does not support the tiny CodeModel
+; CHECK-KERNEL: Target does not support the kernel CodeModel
 
 @data = external global [0 x i32]		; <[0 x i32]*> [#uses=5]
 
diff --git a/test/CodeGen/MIR/AArch64/cfi.mir b/test/CodeGen/MIR/AArch64/cfi.mir
index 747d58eb32019e6df315f44bf9ea2247d72af87a..04380e07f3ee3c53f215c06cb739a8e4b0ff3e47 100644
--- a/test/CodeGen/MIR/AArch64/cfi.mir
+++ b/test/CodeGen/MIR/AArch64/cfi.mir
@@ -45,4 +45,6 @@ body: |
     ; CHECK: CFI_INSTRUCTION escape 0x61, 0x62, 0x63
     CFI_INSTRUCTION window_save
     ; CHECK: CFI_INSTRUCTION window_save
+    CFI_INSTRUCTION negate_ra_sign_state
+    ; CHECK: CFI_INSTRUCTION negate_ra_sign_state
     RET_ReallyLR
diff --git a/test/CodeGen/MIR/PowerPC/prolog_vec_spills.mir b/test/CodeGen/MIR/PowerPC/prolog_vec_spills.mir
new file mode 100644
index 0000000000000000000000000000000000000000..15ff4dd48340505e71ca53707d8474f723dc4bef
--- /dev/null
+++ b/test/CodeGen/MIR/PowerPC/prolog_vec_spills.mir
@@ -0,0 +1,62 @@
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=prologepilog -ppc-enable-pe-vector-spills %s -o - | FileCheck %s
+
+---
+name:            test1BB
+alignment:       4
+tracksRegLiveness: true
+liveins:
+body:             |
+  bb.0.entry:
+    $r14 = IMPLICIT_DEF
+    $r15 = IMPLICIT_DEF
+    $r16 = IMPLICIT_DEF
+    $f0 = IMPLICIT_DEF
+    $v20 = IMPLICIT_DEF
+    BLR8 implicit undef $lr8, implicit undef $rm
+
+# CHECK-LABEL: name:            test1BB
+# CHECK: body:             |
+# CHECK: $f1 = MTVSRD killed $x14
+# CHECK-NEXT: $f2 = MTVSRD killed $x15
+# CHECK-NEXT: $f3 = MTVSRD killed $x16
+# CHECK: $x16 = MFVSRD killed $f3
+# CHECK-NEXT: $x15 = MFVSRD killed $f2
+# CHECK-NEXT: $x14 = MFVSRD killed $f1
+...
+
+---
+name:            test2BBs
+alignment:       4
+tracksRegLiveness: true
+liveins:
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+
+    $cr0 = IMPLICIT_DEF
+    BCC 4, killed renamable $cr0, %bb.2
+    B %bb.1
+
+  bb.1:
+    $r14 = IMPLICIT_DEF
+    $r15 = IMPLICIT_DEF
+    $r16 = IMPLICIT_DEF
+    $r3 = IMPLICIT_DEF
+    B %bb.3
+
+  bb.2:
+    liveins: $x3
+    $r3 = IMPLICIT_DEF
+
+  bb.3:
+    BLR8 implicit undef $lr8, implicit undef $rm
+
+# CHECK-LABEL: name:            test2BB
+# CHECK: body:             |
+# CHECK: $f0 = MTVSRD killed $x14
+# CHECK-NEXT: $f1 = MTVSRD killed $x15
+# CHECK-NEXT: $f2 = MTVSRD killed $x16
+# CHECK: $x16 = MFVSRD killed $f2
+# CHECK-NEXT: $x15 = MFVSRD killed $f1
+# CHECK-NEXT: $x14 = MFVSRD killed $f0
+...
diff --git a/test/CodeGen/MIR/X86/branch-folder-with-label.mir b/test/CodeGen/MIR/X86/branch-folder-with-label.mir
new file mode 100644
index 0000000000000000000000000000000000000000..f9180ac6764cc73b8f3dd8d2f96f7b58bba8ea02
--- /dev/null
+++ b/test/CodeGen/MIR/X86/branch-folder-with-label.mir
@@ -0,0 +1,397 @@
+# Generated with
+#
+# clang -g -O1 -S -emit-llvm test.c
+# llc -stop-before=branch-folder test.ll
+#
+# typedef struct bar {
+#   int data;
+# } bar;
+#
+# int foo(int a)
+# {
+#   return a;
+# }
+#
+# int baz(int *out)
+# {
+#   int ret;
+#
+#   if ((ret = foo(*out)) < 0)
+#     return ret;
+#   if (out)
+#     *out = 1;
+#
+#   return 0;
+# }
+#
+# int test(bar *s)
+# {
+#   int idx, ret;
+#
+# retry:
+#   do {
+#     ret = baz(&idx);
+#     if (ret < 0)
+#       return ret;
+#   } while (idx < 0 || !s->data);
+#
+#   goto retry;
+# }
+#
+# RUN: llc -o - %s -mtriple=x86_64-- -run-pass=branch-folder | FileCheck %s
+--- |
+  ; ModuleID = 'test.ll'
+  source_filename = "test.ll"
+  
+  %struct.bar = type { i32 }
+  
+  define i32 @foo(i32 returned %a) local_unnamed_addr {
+  entry:
+    ret i32 %a
+  }
+  
+  define i32 @baz(i32* %out) local_unnamed_addr !dbg !4 {
+  entry:
+    %0 = load i32, i32* %out, align 4
+    %call = tail call i32 @foo(i32 %0), !dbg !9
+    %cmp = icmp slt i32 %call, 0
+    br i1 %cmp, label %cleanup, label %if.then1
+  
+  if.then1:                                         ; preds = %entry
+    store i32 1, i32* %out, align 4
+    br label %cleanup
+  
+  cleanup:                                          ; preds = %if.then1, %entry
+    %retval.0 = phi i32 [ %call, %entry ], [ 0, %if.then1 ]
+    ret i32 %retval.0
+  }
+  
+  define i32 @test(%struct.bar* nocapture readonly %s) local_unnamed_addr !dbg !11 {
+  entry:
+    %idx = alloca i32, align 4
+    call void @llvm.dbg.label(metadata !20), !dbg !21
+    %call58 = call i32 @baz(i32* nonnull %idx), !dbg !22
+    %cmp69 = icmp slt i32 %call58, 0
+    br i1 %cmp69, label %if.then, label %do.cond.lr.ph.lr.ph
+  
+  do.cond.lr.ph.lr.ph:                              ; preds = %entry
+    br label %do.cond
+  
+  retry.loopexit:                                   ; preds = %lor.rhs
+    call void @llvm.dbg.label(metadata !20), !dbg !21
+    %call5 = call i32 @baz(i32* nonnull %idx), !dbg !22
+    %cmp6 = icmp slt i32 %call5, 0
+    br i1 %cmp6, label %if.then, label %do.cond
+  
+  if.then:                                          ; preds = %do.body.backedge, %retry.loopexit, %entry
+    %call.lcssa = phi i32 [ %call58, %entry ], [ %call, %do.body.backedge ], [ %call5, %retry.loopexit ]
+    ret i32 %call.lcssa
+  
+  do.cond:                                          ; preds = %retry.loopexit, %do.body.backedge, %do.cond.lr.ph.lr.ph
+    %0 = load i32, i32* %idx, align 4
+    %cmp1 = icmp slt i32 %0, 0
+    br i1 %cmp1, label %do.body.backedge, label %lor.rhs
+  
+  lor.rhs:                                          ; preds = %do.cond
+    %1 = bitcast %struct.bar* %s to i32*
+    %2 = load i32, i32* %1, align 4
+    %tobool = icmp eq i32 %2, 0
+    br i1 %tobool, label %do.body.backedge, label %retry.loopexit
+  
+  do.body.backedge:                                 ; preds = %lor.rhs, %do.cond
+    %call = call i32 @baz(i32* nonnull %idx), !dbg !22
+    %cmp = icmp slt i32 %call, 0
+    br i1 %cmp, label %if.then, label %do.cond
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.label(metadata) #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #1
+  
+  attributes #0 = { nounwind readnone speculatable }
+  attributes #1 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: GNU)
+  !1 = !DIFile(filename: "test.c", directory: "/home/users")
+  !2 = !{}
+  !3 = !{i32 2, !"Debug Info Version", i32 3}
+  !4 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 10, type: !5, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+  !5 = !DISubroutineType(types: !6)
+  !6 = !{!7, !8}
+  !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64)
+  !9 = !DILocation(line: 14, column: 14, scope: !10)
+  !10 = distinct !DILexicalBlock(scope: !4, file: !1, line: 14, column: 7)
+  !11 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 22, type: !12, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !19)
+  !12 = !DISubroutineType(types: !13)
+  !13 = !{!7, !14}
+  !14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !15, size: 64)
+  !15 = !DIDerivedType(tag: DW_TAG_typedef, name: "bar", file: !1, line: 3, baseType: !16)
+  !16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "bar", file: !1, line: 1, size: 32, elements: !17)
+  !17 = !{!18}
+  !18 = !DIDerivedType(tag: DW_TAG_member, name: "data", scope: !16, file: !1, line: 2, baseType: !7, size: 32)
+  !19 = !{!20}
+  !20 = !DILabel(scope: !11, name: "retry", file: !1, line: 26)
+  !21 = !DILocation(line: 26, column: 1, scope: !11)
+  !22 = !DILocation(line: 28, column: 11, scope: !23)
+  !23 = distinct !DILexicalBlock(scope: !11, file: !1, line: 27, column: 6)
+
+...
+---
+name:            foo
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       
+liveins:         
+  - { reg: '$edi', virtual-reg: '' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      
+stack:           
+constants:       
+body:             |
+  bb.0.entry:
+    liveins: $edi
+  
+    renamable $eax = COPY $edi
+    RET 0, $eax
+
+...
+---
+name:            baz
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       
+liveins:         
+  - { reg: '$rdi', virtual-reg: '' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 8
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: 0, 
+      callee-saved-register: '$rbx', callee-saved-restored: true, debug-info-variable: '', 
+      debug-info-expression: '', debug-info-location: '' }
+stack:           
+constants:       
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
+    liveins: $rdi, $rbx
+  
+    frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    CFI_INSTRUCTION offset $rbx, -16
+    renamable $rbx = COPY $rdi
+    renamable $edi = MOV32rm $rdi, 1, $noreg, 0, $noreg :: (load 4 from %ir.out)
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !9
+    TEST32rr renamable $eax, renamable $eax, implicit-def $eflags
+    JNS_1 %bb.2, implicit killed $eflags
+  ; CHECK: JS_1 %bb.2, implicit $eflags
+  
+  bb.1:
+    successors: %bb.3(0x80000000)
+    liveins: $eax
+  
+    JMP_1 %bb.3
+  
+  bb.2.if.then1:
+    successors: %bb.3(0x80000000)
+    liveins: $rbx
+  
+    MOV32mi killed renamable $rbx, 1, $noreg, 0, $noreg, 1 :: (store 4 into %ir.out)
+    renamable $eax = MOV32r0 implicit-def dead $eflags
+  
+  bb.3.cleanup:
+    liveins: $eax
+  
+    $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 8
+    RET 0, $eax
+
+...
+---
+name:            test
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       
+liveins:         
+  - { reg: '$rdi', virtual-reg: '' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       24
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 16
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$rbx', callee-saved-restored: true, debug-info-variable: '', 
+      debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: 0, 
+      callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '', 
+      debug-info-expression: '', debug-info-location: '' }
+stack:           
+  - { id: 0, name: idx, type: default, offset: -28, size: 4, alignment: 4, 
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true, 
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:       
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
+    liveins: $rdi, $r14, $rbx
+  
+    frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 24
+    frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 32
+    CFI_INSTRUCTION offset $rbx, -24
+    CFI_INSTRUCTION offset $r14, -16
+    renamable $rbx = COPY $rdi
+    DBG_LABEL !20, debug-location !21
+    renamable $rdi = LEA64r $rsp, 1, $noreg, 4, $noreg
+    CALL64pcrel32 @baz, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !22
+    TEST32rr renamable $eax, renamable $eax, implicit-def $eflags
+    JNS_1 %bb.2, implicit killed $eflags
+  ; CHECK: JS_1 %bb.5, implicit $eflags
+  
+  bb.1:
+    successors: %bb.5(0x80000000)
+    liveins: $eax
+  
+    JMP_1 %bb.5
+  
+  bb.2:
+    successors: %bb.6(0x80000000)
+    liveins: $rbx
+  
+    renamable $r14 = LEA64r $rsp, 1, $noreg, 4, $noreg
+    JMP_1 %bb.6
+  
+  bb.3.retry.loopexit:
+    successors: %bb.4(0x04000000), %bb.6(0x7c000000)
+    liveins: $rbx, $r14
+  
+    DBG_LABEL !20, debug-location !21
+    $rdi = COPY renamable $r14, debug-location !22
+    CALL64pcrel32 @baz, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !22
+    TEST32rr renamable $eax, renamable $eax, implicit-def $eflags
+    JNS_1 %bb.6, implicit killed $eflags
+  
+  bb.4:
+    successors: %bb.5(0x80000000)
+    liveins: $eax
+  
+  
+  bb.5.if.then:
+    liveins: $eax
+  
+    $rsp = frame-destroy ADD64ri8 $rsp, 8, implicit-def dead $eflags
+    CFI_INSTRUCTION def_cfa_offset 24
+    $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    $r14 = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 8
+    RET 0, $eax
+  
+  bb.6.do.cond:
+    successors: %bb.8(0x30000000), %bb.7(0x50000000)
+    liveins: $rbx, $r14
+  
+    CMP32mi8 $rsp, 1, $noreg, 4, $noreg, 0, implicit-def $eflags :: (dereferenceable load 4 from %ir.idx)
+    JS_1 %bb.8, implicit killed $eflags
+    JMP_1 %bb.7
+  
+  bb.7.lor.rhs:
+    successors: %bb.8(0x30000000), %bb.3(0x50000000)
+    liveins: $rbx, $r14
+  
+    CMP32mi8 renamable $rbx, 1, $noreg, 0, $noreg, 0, implicit-def $eflags :: (load 4 from %ir.1)
+    JNE_1 %bb.3, implicit killed $eflags
+    JMP_1 %bb.8
+  
+  bb.8.do.body.backedge:
+    successors: %bb.9(0x04000000), %bb.6(0x7c000000)
+    liveins: $rbx, $r14
+  
+    $rdi = COPY renamable $r14, debug-location !22
+    CALL64pcrel32 @baz, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !22
+    TEST32rr renamable $eax, renamable $eax, implicit-def $eflags
+    JNS_1 %bb.6, implicit killed $eflags
+  
+  bb.9:
+    successors: %bb.5(0x80000000)
+    liveins: $eax
+  
+    JMP_1 %bb.5
+
+...
diff --git a/test/CodeGen/MIR/X86/instructions-debug-location.mir b/test/CodeGen/MIR/X86/instructions-debug-location.mir
index 8b6c5cbf5261b8f36e25a0a1bf849f37ac3b8c02..fe74f5c0535d1b2711dc81764a7df7b577ce4ff7 100644
--- a/test/CodeGen/MIR/X86/instructions-debug-location.mir
+++ b/test/CodeGen/MIR/X86/instructions-debug-location.mir
@@ -22,6 +22,17 @@
     ret i32 %0, !dbg !14
   }
 
+  define i32 @test_mir_created(i32 %x) #0 !dbg !15 {
+  entry:
+    %x.addr = alloca i32, align 4
+    store i32 %x, i32* %x.addr, align 4
+    %0 = load i32, i32* %x.addr, align 4
+    %1 = load i32, i32* %x.addr, align 4
+    %2 = load i32, i32* %x.addr, align 4
+    %3 = load i32, i32* %x.addr, align 4
+    ret i32 %0, !dbg !16
+  }
+
   declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
   attributes #0 = { nounwind "no-frame-pointer-elim"="false" }
@@ -45,6 +56,8 @@
   !12 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !5, line: 4, type: !8)
   !13 = !DILocation(line: 4, scope: !4)
   !14 = !DILocation(line: 8, scope: !4)
+  !15 = distinct !DISubprogram(name: "test_mir_created", scope: !5, file: !5, line: 10, type: !6, isLocal: false, isDefinition: true, scopeLine: 10, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !16 = !DILocation(line: 20, scope: !15)
 
 ...
 ---
@@ -96,3 +109,31 @@ body: |
     $eax = COPY %0
     RETQ $eax
 ...
+---
+name:            test_mir_created
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr32 }
+frameInfo:
+  maxAlignment:  4
+stack:
+  - { id: 0, name: x.addr, size: 4, alignment: 4 }
+body: |
+  bb.0.entry:
+    liveins: $edi
+
+    %0 = COPY $edi
+  ; CHECK-LABEL: name: test_mir_created
+  ; CHECK:       MOV32mr %stack.0.x.addr, 1, $noreg, 0, $noreg, %0, debug-location !DILocation(line: 1, scope: !14)
+  ; CHECK:       MOV32mr %stack.0.x.addr, 1, $noreg, 0, $noreg, %0, debug-location !DILocation(line: 2, column: 2, scope: !14)
+  ; CHECK:       MOV32mr %stack.0.x.addr, 1, $noreg, 0, $noreg, %0, debug-location !DILocation(line: 3, column: 2, scope: !14, isImplicitCode: true)
+  ; CHECK:       MOV32mr %stack.0.x.addr, 1, $noreg, 0, $noreg, %0, debug-location !DILocation(line: 4, scope: !14, inlinedAt: !15)
+  ; CHECK:       MOV32mr %stack.0.x.addr, 1, $noreg, 0, $noreg, %0, debug-location !DILocation(line: 5, scope: !14, inlinedAt: !DILocation(line: 4, scope: !14))
+    MOV32mr %stack.0.x.addr, 1, _, 0, _, %0, debug-location !DILocation(line: 1, scope: !15)
+    MOV32mr %stack.0.x.addr, 1, _, 0, _, %0, debug-location !DILocation(line: 2, column: 2, scope: !15)
+    MOV32mr %stack.0.x.addr, 1, _, 0, _, %0, debug-location !DILocation(line: 3, column: 2, scope: !15, isImplicitCode: true)
+    MOV32mr %stack.0.x.addr, 1, _, 0, _, %0, debug-location !DILocation(line: 4, scope: !15, inlinedAt: !16)
+    MOV32mr %stack.0.x.addr, 1, _, 0, _, %0, debug-location !DILocation(line: 5, scope: !15, inlinedAt: !DILocation(line: 4, scope: !15))
+    $eax = COPY %0
+    RETQ $eax
+...
diff --git a/test/CodeGen/MSP430/callee-saved.ll b/test/CodeGen/MSP430/callee-saved.ll
new file mode 100644
index 0000000000000000000000000000000000000000..76db4dcdfe886724e5c1a222a577d86d51773a70
--- /dev/null
+++ b/test/CodeGen/MSP430/callee-saved.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"
+target triple = "msp430-generic-generic"
+
+; Test the r4-r10 callee-saved registers (MSP430 EABI p. 3.2.2).
+
+@g = global float 0.0
+
+define void @foo() {
+; CHECK-LABEL: foo:
+; CHECK-NOT: push	r15
+; CHECK-NOT: push	r14
+; CHECK-NOT: push	r13
+; CHECK-NOT: push	r12
+; CHECK-NOT: push	r11
+; CHECK: push	r10
+; CHECK: push	r9
+; CHECK: push	r8
+; CHECK: push	r7
+; CHECK: push	r6
+; CHECK: push	r5
+; CHECK: push	r4
+  %t1 = load volatile float, float* @g
+  %t2 = load volatile float, float* @g
+  %t3 = load volatile float, float* @g
+  %t4 = load volatile float, float* @g
+  %t5 = load volatile float, float* @g
+  %t6 = load volatile float, float* @g
+  %t7 = load volatile float, float* @g
+  store volatile float %t1, float* @g
+  store volatile float %t2, float* @g
+  store volatile float %t3, float* @g
+  store volatile float %t4, float* @g
+  store volatile float %t5, float* @g
+  store volatile float %t6, float* @g
+  ret void
+}
diff --git a/test/CodeGen/MSP430/calls.ll b/test/CodeGen/MSP430/calls.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c5540bf1f99913296d9d9b471be3841ad5eaa922
--- /dev/null
+++ b/test/CodeGen/MSP430/calls.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+declare i32 @direct(i32 %a)
+
+define i32 @test_direct(i32 %a) nounwind {
+; CHECK-LABEL: test_direct:
+; CHECK: call #direct
+  %1 = call i32 @direct(i32 %a)
+  ret i32 %1
+}
+
+define i16 @test_indirect(i16 (i16)* %a, i16 %b) nounwind {
+; CHECK-LABEL: test_indirect:
+; CHECK: mov	r12, r14
+; CHECK: mov	r13, r12
+; CHECK: call	r14
+  %1 = call i16 %a(i16 %b)
+  ret i16 %1
+}
diff --git a/test/CodeGen/MSP430/cc_args.ll b/test/CodeGen/MSP430/cc_args.ll
index eb7e470a9b61450348a1f1c66659ab3413d5d8d1..c8164f1291ddeb266053d583cd0e242521df651b 100644
--- a/test/CodeGen/MSP430/cc_args.ll
+++ b/test/CodeGen/MSP430/cc_args.ll
@@ -54,6 +54,29 @@ entry:
 ; CHECK: call #f_i16_i64_i16
   call void @f_i16_i64_i16(i16 1, i64 72623859790382856, i16 2)
 
+; Check that r15 is not used and the last i32 argument passed through the stack.
+; CHECK: mov	#258, 10(r1)
+; CHECK: mov	#772, 8(r1)
+; CHECK: mov	#258, 6(r1)
+; CHECK: mov	#772, 4(r1)
+; CHECK: mov	#1286, 2(r1)
+; CHECK: mov	#1800, 0(r1)
+; CHECK: mov	#1, r12
+; CHECK: mov	#772, r13
+; CHECK: mov	#258, r14
+  call void @f_i16_i64_i32_i32(i16 1, i64 72623859790382856, i32 16909060, i32 16909060)
+
+; CHECK: mov	#258, 6(r1)
+; CHECK: mov	#772, 4(r1)
+; CHECK: mov	#1286, 2(r1)
+; CHECK: mov	#1800, 0(r1)
+; CHECK: mov	#1800, r12
+; CHECK: mov	#1286, r13
+; CHECK: mov	#772, r14
+; CHECK: mov	#258, r15
+; CHECK: call	#f_i64_i64
+  call void @f_i64_i64(i64 72623859790382856, i64 72623859790382856)
+
   ret void
 }
 
@@ -136,4 +159,60 @@ define void @f_i16_i64_i16(i16 %a, i64 %b, i16 %c) #0 {
   ret void
 }
 
+define void @f_i64_i64(i64 %a, i64 %b) #0 {
+; CHECK: f_i64_i64:
+; CHECK: mov	r15, &g_i64+6
+; CHECK: mov	r14, &g_i64+4
+; CHECK: mov	r13, &g_i64+2
+; CHECK: mov	r12, &g_i64
+  store volatile i64 %a, i64* @g_i64, align 2
+; CHECK: mov	10(r4), &g_i64+6
+; CHECK: mov	8(r4), &g_i64+4
+; CHECK: mov	6(r4), &g_i64+2
+; CHECK: mov	4(r4), &g_i64
+  store volatile i64 %b, i64* @g_i64, align 2
+  ret void
+}
+
+define void @f_i16_i64_i32_i32(i16 %a, i64 %b, i32 %c, i32 %d) #0 {
+; CHECK-LABEL: f_i16_i64_i32_i32:
+; CHECK: mov	r12, &g_i16
+  store volatile i16 %a, i16* @g_i16, align 2
+; CHECK: mov	10(r4), &g_i64+6
+; CHECK: mov	8(r4), &g_i64+4
+; CHECK: mov	6(r4), &g_i64+2
+; CHECK: mov	4(r4), &g_i64
+  store volatile i64 %b, i64* @g_i64, align 2
+; CHECK: mov	r14, &g_i32+2
+; CHECK: mov	r13, &g_i32
+  store volatile i32 %c, i32* @g_i32, align 2
+; CHECK: mov	14(r4), &g_i32+2
+; CHECK: mov	12(r4), &g_i32
+  store volatile i32 %d, i32* @g_i32, align 2
+  ret void
+}
+; MSP430 EABI p. 6.3
+; For helper functions which take two long long arguments
+; the first argument is passed in R8::R11 and the second argument
+; is in R12::R15.
+
+@g_i64_2 = common global i64 0, align 2
+
+define i64 @helper_call_i64() #0 {
+  %1 = load i64, i64* @g_i64, align 2
+  %2 = load i64, i64* @g_i64_2, align 2
+; CHECK-LABEL: helper_call_i64:
+; CHECK: mov	&g_i64, r8
+; CHECK: mov	&g_i64+2, r9
+; CHECK: mov	&g_i64+4, r10
+; CHECK: mov	&g_i64+6, r11
+; CHECK: mov	&g_i64_2, r12
+; CHECK: mov	&g_i64_2+2, r13
+; CHECK: mov	&g_i64_2+4, r14
+; CHECK: mov	&g_i64_2+6, r15
+; CHECK: call	#__mspabi_divlli
+  %3 = sdiv i64 %1, %2
+  ret i64 %3
+}
+
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/MSP430/interrupt.ll b/test/CodeGen/MSP430/interrupt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5fa0c849c2602a6e4e01392e663a855739cb415a
--- /dev/null
+++ b/test/CodeGen/MSP430/interrupt.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"
+target triple = "msp430-generic-generic"
+
+@llvm.used = appending global [1 x i8*] [i8* bitcast (void ()* @ISR to i8*)], section "llvm.metadata"
+
+; MSP430 EABI p. 3.9
+; Interrupt functions must save all the registers that are used, even those
+; that are normally considered callee-saved.
+
+; To return from an interrupt function, the function must execute the special
+; instruction RETI, which restores the SR register and branches to the PC where
+; the interrupt occurred.
+
+@g = global float 0.0
+
+define msp430_intrcc void @ISR() #0 {
+entry:
+; CHECK-LABEL: ISR:
+; CHECK: push	r15
+; CHECK: push	r14
+; CHECK: push	r13
+; CHECK: push	r12
+; CHECK: push	r11
+; CHECK: push	r10
+; CHECK: push	r9
+; CHECK: push	r8
+; CHECK: push	r7
+; CHECK: push	r6
+; CHECK: push	r5
+; CHECK: push	r4
+  %t1 = load volatile float, float* @g
+  %t2 = load volatile float, float* @g
+  %t3 = load volatile float, float* @g
+  %t4 = load volatile float, float* @g
+  %t5 = load volatile float, float* @g
+  %t6 = load volatile float, float* @g
+  %t7 = load volatile float, float* @g
+  store volatile float %t1, float* @g
+  store volatile float %t2, float* @g
+  store volatile float %t3, float* @g
+  store volatile float %t4, float* @g
+  store volatile float %t5, float* @g
+  store volatile float %t6, float* @g
+; CHECK: reti
+  ret void
+}
+
diff --git a/test/CodeGen/MSP430/libcalls.ll b/test/CodeGen/MSP430/libcalls.ll
index 3040237781306af3d64c23ad3198e7197267c2bd..b6e24db6c551abf3a8d89d9c13467d6f4d15907f 100644
--- a/test/CodeGen/MSP430/libcalls.ll
+++ b/test/CodeGen/MSP430/libcalls.ll
@@ -604,4 +604,39 @@ entry:
   ret i64 %1
 }
 
+@i = external global i32, align 2
+
+define i32 @srll() #0 {
+entry:
+; CHECK-LABEL: srll:
+; CHECK: call #__mspabi_srll
+  %0 = load volatile i32, i32* @g_i32, align 2
+  %1 = load volatile i32, i32* @i, align 2
+  %shr = lshr i32 %0, %1
+
+  ret i32 %shr
+}
+
+define i32 @sral() #0 {
+entry:
+; CHECK-LABEL: sral:
+; CHECK: call #__mspabi_sral
+  %0 = load volatile i32, i32* @g_i32, align 2
+  %1 = load volatile i32, i32* @i, align 2
+  %shr = ashr i32 %0, %1
+
+  ret i32 %shr
+}
+
+define i32 @slll() #0 {
+entry:
+; CHECK-LABEL: slll:
+; CHECK: call #__mspabi_slll
+  %0 = load volatile i32, i32* @g_i32, align 2
+  %1 = load volatile i32, i32* @i, align 2
+  %shr = shl i32 %0, %1
+
+  ret i32 %shr
+}
+
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/MSP430/shifts.ll b/test/CodeGen/MSP430/shifts.ll
index 6d4050f42bef5e8de51ba36cc408ea079fa0c4a2..073251943d01b0074246a2b151b8b9d90bd606c6 100644
--- a/test/CodeGen/MSP430/shifts.ll
+++ b/test/CodeGen/MSP430/shifts.ll
@@ -5,6 +5,7 @@ target triple = "msp430-elf"
 define zeroext i8 @lshr8(i8 zeroext %a, i8 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK-LABEL: lshr8:
+; CHECK: clrc
 ; CHECK: rrc.b
   %shr = lshr i8 %a, %cnt
   ret i8 %shr
@@ -29,6 +30,7 @@ entry:
 define zeroext i16 @lshr16(i16 zeroext %a, i16 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK-LABEL: lshr16:
+; CHECK: clrc
 ; CHECK: rrc
   %shr = lshr i16 %a, %cnt
   ret i16 %shr
@@ -49,3 +51,26 @@ entry:
   %shl = shl i16 %a, %cnt
   ret i16 %shl
 }
+
+define i16 @ashr10_i16(i16 %a) #0 {
+entry:
+; CHECK-LABEL: ashr10_i16:
+; CHECK:      swpb	r12
+; CHECK-NEXT: sxt	r12
+; CHECK-NEXT: rra	r12
+; CHECK-NEXT: rra	r12
+  %shr = ashr i16 %a, 10
+  ret i16 %shr
+}
+
+define i16 @lshr10_i16(i16 %a) #0 {
+entry:
+; CHECK-LABEL: lshr10_i16:
+; CHECK:      swpb	r12
+; CHECK-NEXT: mov.b	r12, r12
+; CHECK-NEXT: clrc
+; CHECK-NEXT: rrc	r12
+; CHECK-NEXT: rra	r12
+  %shr = lshr i16 %a, 10
+  ret i16 %shr
+}
diff --git a/test/CodeGen/MSP430/struct-return.ll b/test/CodeGen/MSP430/struct-return.ll
index a52ea1b702a3bc55c6e382bc7bd70dc058739099..8f662897a2919697fe2f6791d55bd30fd2972005 100644
--- a/test/CodeGen/MSP430/struct-return.ll
+++ b/test/CodeGen/MSP430/struct-return.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
 target triple = "msp430---elf"
 
-; Allow simple structures to be returned by value.
+; Pass large structures by reference (MSP430 EABI p. 3.5)
 
 %s = type { i64, i64 }
 
@@ -17,7 +17,39 @@ define %s @fred() #0 {
 ; CHECK: mov	#772, 4(r12)
 ; CHECK: mov	#1286, 2(r12)
 ; CHECK: mov	#1800, 0(r12)
-  ret %s {i64 72623859790382856, i64 651345242494996224} 
+  ret %s {i64 72623859790382856, i64 651345242494996224}
+}
+
+%struct.S = type { i16, i16, i16 }
+
+@a = common global i16 0, align 2
+@b = common global i16 0, align 2
+@c = common global i16 0, align 2
+
+define void @test() #1 {
+; CHECK-LABEL: test:
+  %1 = alloca %struct.S, align 2
+; CHECK:      mov	r1, r12
+; CHECK-NEXT: call	#sret
+  call void @sret(%struct.S* nonnull sret %1) #3
+  ret void
+}
+
+define void @sret(%struct.S* noalias nocapture sret) #0 {
+; CHECK-LABEL: sret:
+; CHECK: mov	&a, 0(r12)
+; CHECK: mov	&b, 2(r12)
+; CHECK: mov	&c, 4(r12)
+  %2 = getelementptr inbounds %struct.S, %struct.S* %0, i16 0, i32 0
+  %3 = load i16, i16* @a, align 2
+  store i16 %3, i16* %2, align 2
+  %4 = getelementptr inbounds %struct.S, %struct.S* %0, i16 0, i32 1
+  %5 = load i16, i16* @b, align 2
+  store i16 %5, i16* %4, align 2
+  %6 = getelementptr inbounds %struct.S, %struct.S* %0, i16 0, i32 2
+  %7 = load i16, i16* @c, align 2
+  store i16 %7, i16* %6, align 2
+  ret void
 }
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/Mips/GlobalISel/instruction-select/bitwise.mir b/test/CodeGen/Mips/GlobalISel/instruction-select/bitwise.mir
index 1939811e6261183e28241e63c314e7dc654f15cb..710b00ef7a65a1bf638ae0daa862de451273287a 100644
--- a/test/CodeGen/Mips/GlobalISel/instruction-select/bitwise.mir
+++ b/test/CodeGen/Mips/GlobalISel/instruction-select/bitwise.mir
@@ -2,9 +2,9 @@
 # RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
 --- |
 
-  define void @and(i32, i32) {entry: ret void}
-  define void @or(i32, i32) {entry: ret void}
-  define void @xor(i32, i32) {entry: ret void}
+  define void @and_i32() {entry: ret void}
+  define void @or_i32() {entry: ret void}
+  define void @xor_i32() {entry: ret void}
   define void @shl(i32) {entry: ret void}
   define void @ashr(i32) {entry: ret void}
   define void @lshr(i32) {entry: ret void}
@@ -14,7 +14,7 @@
 
 ...
 ---
-name:            and
+name:            and_i32
 alignment:       2
 legalized:       true
 regBankSelected: true
@@ -23,7 +23,7 @@ body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: and
+    ; MIPS32-LABEL: name: and_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
@@ -38,7 +38,7 @@ body:             |
 
 ...
 ---
-name:            or
+name:            or_i32
 alignment:       2
 legalized:       true
 regBankSelected: true
@@ -47,7 +47,7 @@ body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: or
+    ; MIPS32-LABEL: name: or_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
@@ -62,7 +62,7 @@ body:             |
 
 ...
 ---
-name:            xor
+name:            xor_i32
 alignment:       2
 legalized:       true
 regBankSelected: true
@@ -71,7 +71,7 @@ body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: xor
+    ; MIPS32-LABEL: name: xor_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
diff --git a/test/CodeGen/Mips/GlobalISel/instruction-select/rem_and_div.mir b/test/CodeGen/Mips/GlobalISel/instruction-select/rem_and_div.mir
new file mode 100644
index 0000000000000000000000000000000000000000..55594330836ac5e5600dc6248c5a905d3f753858
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/instruction-select/rem_and_div.mir
@@ -0,0 +1,110 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
+--- |
+
+  define void @sdiv_i32() {entry: ret void}
+  define void @srem_i32() {entry: ret void}
+  define void @udiv_i32() {entry: ret void}
+  define void @urem_i32() {entry: ret void}
+
+...
+---
+name:            sdiv_i32
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: sdiv_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
+    ; MIPS32: [[PseudoSDIV:%[0-9]+]]:acc64 = PseudoSDIV [[COPY1]], [[COPY]]
+    ; MIPS32: [[PseudoMFLO:%[0-9]+]]:gpr32 = PseudoMFLO [[PseudoSDIV]]
+    ; MIPS32: $v0 = COPY [[PseudoMFLO]]
+    ; MIPS32: RetRA implicit $v0
+    %0:gprb(s32) = COPY $a0
+    %1:gprb(s32) = COPY $a1
+    %2:gprb(s32) = G_SDIV %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            srem_i32
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: srem_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
+    ; MIPS32: [[PseudoSDIV:%[0-9]+]]:acc64 = PseudoSDIV [[COPY1]], [[COPY]]
+    ; MIPS32: [[PseudoMFHI:%[0-9]+]]:gpr32 = PseudoMFHI [[PseudoSDIV]]
+    ; MIPS32: $v0 = COPY [[PseudoMFHI]]
+    ; MIPS32: RetRA implicit $v0
+    %0:gprb(s32) = COPY $a0
+    %1:gprb(s32) = COPY $a1
+    %2:gprb(s32) = G_SREM %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            udiv_i32
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: udiv_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
+    ; MIPS32: [[PseudoUDIV:%[0-9]+]]:acc64 = PseudoUDIV [[COPY1]], [[COPY]]
+    ; MIPS32: [[PseudoMFLO:%[0-9]+]]:gpr32 = PseudoMFLO [[PseudoUDIV]]
+    ; MIPS32: $v0 = COPY [[PseudoMFLO]]
+    ; MIPS32: RetRA implicit $v0
+    %0:gprb(s32) = COPY $a0
+    %1:gprb(s32) = COPY $a1
+    %2:gprb(s32) = G_UDIV %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            urem_i32
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: urem_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
+    ; MIPS32: [[PseudoUDIV:%[0-9]+]]:acc64 = PseudoUDIV [[COPY1]], [[COPY]]
+    ; MIPS32: [[PseudoMFHI:%[0-9]+]]:gpr32 = PseudoMFHI [[PseudoUDIV]]
+    ; MIPS32: $v0 = COPY [[PseudoMFHI]]
+    ; MIPS32: RetRA implicit $v0
+    %0:gprb(s32) = COPY $a0
+    %1:gprb(s32) = COPY $a1
+    %2:gprb(s32) = G_UREM %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
diff --git a/test/CodeGen/Mips/GlobalISel/irtranslator/bitwise.ll b/test/CodeGen/Mips/GlobalISel/irtranslator/bitwise.ll
index 917a67ac9bcc4ac7141d0f79ae53d19838a67b4f..fa34ea403085e9e1c074d2157980abf5b35c93d8 100644
--- a/test/CodeGen/Mips/GlobalISel/irtranslator/bitwise.ll
+++ b/test/CodeGen/Mips/GlobalISel/irtranslator/bitwise.ll
@@ -1,48 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -O0 -mtriple=mipsel-linux-gnu -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
 
-define i32 @and(i32 %a, i32 %b) {
-  ; MIPS32-LABEL: name: and
-  ; MIPS32: bb.1.entry:
-  ; MIPS32:   liveins: $a0, $a1
-  ; MIPS32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
-  ; MIPS32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
-  ; MIPS32:   [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[COPY]]
-  ; MIPS32:   $v0 = COPY [[AND]](s32)
-  ; MIPS32:   RetRA implicit $v0
-entry:
-  %and = and i32 %b, %a
-  ret i32 %and
-}
-
-define i32 @or(i32 %a, i32 %b) {
-  ; MIPS32-LABEL: name: or
-  ; MIPS32: bb.1.entry:
-  ; MIPS32:   liveins: $a0, $a1
-  ; MIPS32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
-  ; MIPS32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
-  ; MIPS32:   [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[COPY]]
-  ; MIPS32:   $v0 = COPY [[OR]](s32)
-  ; MIPS32:   RetRA implicit $v0
-entry:
-  %or = or i32 %b, %a
-  ret i32 %or
-}
-
-define i32 @xor(i32 %a, i32 %b) {
-  ; MIPS32-LABEL: name: xor
-  ; MIPS32: bb.1.entry:
-  ; MIPS32:   liveins: $a0, $a1
-  ; MIPS32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
-  ; MIPS32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
-  ; MIPS32:   [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY1]], [[COPY]]
-  ; MIPS32:   $v0 = COPY [[XOR]](s32)
-  ; MIPS32:   RetRA implicit $v0
-entry:
-  %xor = xor i32 %b, %a
-  ret i32 %xor
-}
-
 define i32 @shl(i32 %a) {
   ; MIPS32-LABEL: name: shl
   ; MIPS32: bb.1.entry:
diff --git a/test/CodeGen/Mips/GlobalISel/legalizer/add.mir b/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
index ff9ae06a9374cdfb6b485971e0f334aca283be5d..c90b8e6d86c6ab7a506581c55edb62bb31c4736a 100644
--- a/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
@@ -10,6 +10,7 @@
   define void @add_i16_zext() {entry: ret void}
   define void @add_i16_aext() {entry: ret void}
   define void @add_i64() {entry: ret void}
+  define void @add_i128() {entry: ret void}
 
 ...
 ---
@@ -226,11 +227,19 @@ body:             |
     ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
     ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
     ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
-    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY]]
-    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY1]]
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY1]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
+    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND]]
     ; MIPS32: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[COPY3]]
-    ; MIPS32: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[ICMP]]
-    ; MIPS32: $v0 = COPY [[ADD2]](s32)
+    ; MIPS32: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ICMP]](s32)
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]]
+    ; MIPS32: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[AND1]]
+    ; MIPS32: $v0 = COPY [[ADD3]](s32)
     ; MIPS32: $v1 = COPY [[ADD1]](s32)
     ; MIPS32: RetRA implicit $v0, implicit $v1
     %2:_(s32) = COPY $a0
@@ -246,3 +255,82 @@ body:             |
     RetRA implicit $v0, implicit $v1
 
 ...
+---
+name:            add_i128
+alignment:       2
+tracksRegLiveness: true
+fixedStack:
+  - { id: 0, offset: 28, size: 4, alignment: 4, stack-id: 0, isImmutable: true }
+  - { id: 1, offset: 24, size: 4, alignment: 8, stack-id: 0, isImmutable: true }
+  - { id: 2, offset: 20, size: 4, alignment: 4, stack-id: 0, isImmutable: true }
+  - { id: 3, offset: 16, size: 4, alignment: 8, stack-id: 0, isImmutable: true }
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1, $a2, $a3
+
+    ; MIPS32-LABEL: name: add_i128
+    ; MIPS32: liveins: $a0, $a1, $a2, $a3
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
+    ; MIPS32: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
+    ; MIPS32: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load 4 from %fixed-stack.0, align 0)
+    ; MIPS32: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.1
+    ; MIPS32: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load 4 from %fixed-stack.1, align 0)
+    ; MIPS32: [[FRAME_INDEX2:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.2
+    ; MIPS32: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p0) :: (load 4 from %fixed-stack.2, align 0)
+    ; MIPS32: [[FRAME_INDEX3:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.3
+    ; MIPS32: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p0) :: (load 4 from %fixed-stack.3, align 0)
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[COPY]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
+    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND]]
+    ; MIPS32: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[LOAD]]
+    ; MIPS32: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[COPY1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ICMP]](s32)
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]]
+    ; MIPS32: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[AND1]]
+    ; MIPS32: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[LOAD1]]
+    ; MIPS32: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[COPY2]]
+    ; MIPS32: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ICMP1]](s32)
+    ; MIPS32: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]]
+    ; MIPS32: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[AND2]]
+    ; MIPS32: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD5]](s32), [[LOAD2]]
+    ; MIPS32: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[LOAD3]], [[COPY3]]
+    ; MIPS32: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ICMP2]](s32)
+    ; MIPS32: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]]
+    ; MIPS32: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[AND3]]
+    ; MIPS32: $v0 = COPY [[ADD1]](s32)
+    ; MIPS32: $v1 = COPY [[ADD3]](s32)
+    ; MIPS32: $a0 = COPY [[ADD5]](s32)
+    ; MIPS32: $a1 = COPY [[ADD7]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1, implicit $a0, implicit $a1
+    %2:_(s32) = COPY $a0
+    %3:_(s32) = COPY $a1
+    %4:_(s32) = COPY $a2
+    %5:_(s32) = COPY $a3
+    %0:_(s128) = G_MERGE_VALUES %2(s32), %3(s32), %4(s32), %5(s32)
+    %10:_(p0) = G_FRAME_INDEX %fixed-stack.3
+    %6:_(s32) = G_LOAD %10(p0) :: (load 4 from %fixed-stack.3, align 0)
+    %11:_(p0) = G_FRAME_INDEX %fixed-stack.2
+    %7:_(s32) = G_LOAD %11(p0) :: (load 4 from %fixed-stack.2, align 0)
+    %12:_(p0) = G_FRAME_INDEX %fixed-stack.1
+    %8:_(s32) = G_LOAD %12(p0) :: (load 4 from %fixed-stack.1, align 0)
+    %13:_(p0) = G_FRAME_INDEX %fixed-stack.0
+    %9:_(s32) = G_LOAD %13(p0) :: (load 4 from %fixed-stack.0, align 0)
+    %1:_(s128) = G_MERGE_VALUES %6(s32), %7(s32), %8(s32), %9(s32)
+    %14:_(s128) = G_ADD %1, %0
+    %15:_(s32), %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(s128)
+    $v0 = COPY %15(s32)
+    $v1 = COPY %16(s32)
+    $a0 = COPY %17(s32)
+    $a1 = COPY %18(s32)
+    RetRA implicit $v0, implicit $v1, implicit $a0, implicit $a1
+
+...
diff --git a/test/CodeGen/Mips/GlobalISel/legalizer/bitwise.mir b/test/CodeGen/Mips/GlobalISel/legalizer/bitwise.mir
index 80944aaac0f83c09648f46b9eff64a5e6c8b07ec..1e5d50f3d51ae5733d26fb1c8a3e6adc9998158a 100644
--- a/test/CodeGen/Mips/GlobalISel/legalizer/bitwise.mir
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/bitwise.mir
@@ -2,9 +2,21 @@
 # RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
 --- |
 
-  define void @and(i32, i32) {entry: ret void}
-  define void @or(i32, i32) {entry: ret void}
-  define void @xor(i32, i32) {entry: ret void}
+  define void @and_i1() {entry: ret void}
+  define void @and_i8() {entry: ret void}
+  define void @and_i16() {entry: ret void}
+  define void @and_i32() {entry: ret void}
+  define void @and_i64() {entry: ret void}
+  define void @or_i1() {entry: ret void}
+  define void @or_i8() {entry: ret void}
+  define void @or_i16() {entry: ret void}
+  define void @or_i32() {entry: ret void}
+  define void @or_i64() {entry: ret void}
+  define void @xor_i1() {entry: ret void}
+  define void @xor_i8() {entry: ret void}
+  define void @xor_i16() {entry: ret void}
+  define void @xor_i32() {entry: ret void}
+  define void @xor_i64() {entry: ret void}
   define void @shl(i32) {entry: ret void}
   define void @ashr(i32) {entry: ret void}
   define void @lshr(i32) {entry: ret void}
@@ -14,14 +26,98 @@
 
 ...
 ---
-name:            and
+name:            and_i1
 alignment:       2
 tracksRegLiveness: true
 body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: and
+    ; MIPS32-LABEL: name: and_i1
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s1) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s1) = G_TRUNC %3(s32)
+    %4:_(s1) = G_AND %1, %0
+    %5:_(s32) = G_ANYEXT %4(s1)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            and_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: and_i8
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s8) = G_TRUNC %3(s32)
+    %4:_(s8) = G_AND %1, %0
+    %5:_(s32) = G_ANYEXT %4(s8)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            and_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: and_i16
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s16) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s16) = G_TRUNC %3(s32)
+    %4:_(s16) = G_AND %1, %0
+    %5:_(s32) = G_ANYEXT %4(s16)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            and_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: and_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
@@ -36,14 +132,130 @@ body:             |
 
 ...
 ---
-name:            or
+name:            and_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1, $a2, $a3
+
+    ; MIPS32-LABEL: name: and_i64
+    ; MIPS32: liveins: $a0, $a1, $a2, $a3
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[COPY]]
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: $v1 = COPY [[AND1]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %2:_(s32) = COPY $a0
+    %3:_(s32) = COPY $a1
+    %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+    %4:_(s32) = COPY $a2
+    %5:_(s32) = COPY $a3
+    %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+    %6:_(s64) = G_AND %1, %0
+    %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+    $v0 = COPY %7(s32)
+    $v1 = COPY %8(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
+---
+name:            or_i1
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: or_i1
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s1) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s1) = G_TRUNC %3(s32)
+    %4:_(s1) = G_OR %1, %0
+    %5:_(s32) = G_ANYEXT %4(s1)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            or_i8
 alignment:       2
 tracksRegLiveness: true
 body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: or
+    ; MIPS32-LABEL: name: or_i8
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s8) = G_TRUNC %3(s32)
+    %4:_(s8) = G_OR %1, %0
+    %5:_(s32) = G_ANYEXT %4(s8)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            or_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: or_i16
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s16) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s16) = G_TRUNC %3(s32)
+    %4:_(s16) = G_OR %1, %0
+    %5:_(s32) = G_ANYEXT %4(s16)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            or_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: or_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
@@ -58,14 +270,130 @@ body:             |
 
 ...
 ---
-name:            xor
+name:            or_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1, $a2, $a3
+
+    ; MIPS32-LABEL: name: or_i64
+    ; MIPS32: liveins: $a0, $a1, $a2, $a3
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
+    ; MIPS32: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[COPY]]
+    ; MIPS32: [[OR1:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[COPY1]]
+    ; MIPS32: $v0 = COPY [[OR]](s32)
+    ; MIPS32: $v1 = COPY [[OR1]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %2:_(s32) = COPY $a0
+    %3:_(s32) = COPY $a1
+    %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+    %4:_(s32) = COPY $a2
+    %5:_(s32) = COPY $a3
+    %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+    %6:_(s64) = G_OR %1, %0
+    %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+    $v0 = COPY %7(s32)
+    $v1 = COPY %8(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
+---
+name:            xor_i1
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: xor_i1
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s1) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s1) = G_TRUNC %3(s32)
+    %4:_(s1) = G_XOR %1, %0
+    %5:_(s32) = G_ANYEXT %4(s1)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            xor_i8
 alignment:       2
 tracksRegLiveness: true
 body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: xor
+    ; MIPS32-LABEL: name: xor_i8
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s8) = G_TRUNC %3(s32)
+    %4:_(s8) = G_XOR %1, %0
+    %5:_(s32) = G_ANYEXT %4(s8)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            xor_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: xor_i16
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[COPY3]]
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s16) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s16) = G_TRUNC %3(s32)
+    %4:_(s16) = G_XOR %1, %0
+    %5:_(s32) = G_ANYEXT %4(s16)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            xor_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: xor_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
@@ -78,6 +406,38 @@ body:             |
     $v0 = COPY %2(s32)
     RetRA implicit $v0
 
+...
+---
+name:            xor_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1, $a2, $a3
+
+    ; MIPS32-LABEL: name: xor_i64
+    ; MIPS32: liveins: $a0, $a1, $a2, $a3
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
+    ; MIPS32: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[COPY]]
+    ; MIPS32: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[COPY3]], [[COPY1]]
+    ; MIPS32: $v0 = COPY [[XOR]](s32)
+    ; MIPS32: $v1 = COPY [[XOR1]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %2:_(s32) = COPY $a0
+    %3:_(s32) = COPY $a1
+    %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+    %4:_(s32) = COPY $a2
+    %5:_(s32) = COPY $a3
+    %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+    %6:_(s64) = G_XOR %1, %0
+    %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+    $v0 = COPY %7(s32)
+    $v1 = COPY %8(s32)
+    RetRA implicit $v0, implicit $v1
+
 ...
 ---
 name:            shl
diff --git a/test/CodeGen/Mips/GlobalISel/legalizer/rem_and_div.mir b/test/CodeGen/Mips/GlobalISel/legalizer/rem_and_div.mir
new file mode 100644
index 0000000000000000000000000000000000000000..f722e8e46f05d5b804e06fa17f8b00951dcfa4fd
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/rem_and_div.mir
@@ -0,0 +1,554 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
+--- |
+
+  define void @sdiv_i8() {entry: ret void}
+  define void @sdiv_i16() {entry: ret void}
+  define void @sdiv_i32() {entry: ret void}
+  define void @sdiv_i64() {entry: ret void}
+  define void @srem_i8() {entry: ret void}
+  define void @srem_i16() {entry: ret void}
+  define void @srem_i32() {entry: ret void}
+  define void @srem_i64() {entry: ret void}
+  define void @udiv_i8() {entry: ret void}
+  define void @udiv_i16() {entry: ret void}
+  define void @udiv_i32() {entry: ret void}
+  define void @udiv_i64() {entry: ret void}
+  define void @urem_i8() {entry: ret void}
+  define void @urem_i16() {entry: ret void}
+  define void @urem_i32() {entry: ret void}
+  define void @urem_i64() {entry: ret void}
+
+...
+---
+name:            sdiv_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: sdiv_i8
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C1]]
+    ; MIPS32: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C1]]
+    ; MIPS32: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[ASHR]], [[ASHR1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
+    ; MIPS32: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]]
+    ; MIPS32: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SHL2]], [[C2]]
+    ; MIPS32: $v0 = COPY [[ASHR2]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s8) = G_TRUNC %3(s32)
+    %4:_(s8) = G_SDIV %1, %0
+    %5:_(s32) = G_SEXT %4(s8)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            sdiv_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: sdiv_i16
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C1]]
+    ; MIPS32: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C1]]
+    ; MIPS32: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[ASHR]], [[ASHR1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
+    ; MIPS32: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]]
+    ; MIPS32: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SHL2]], [[C2]]
+    ; MIPS32: $v0 = COPY [[ASHR2]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s16) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s16) = G_TRUNC %3(s32)
+    %4:_(s16) = G_SDIV %1, %0
+    %5:_(s32) = G_SEXT %4(s16)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            sdiv_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: sdiv_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[COPY1]], [[COPY]]
+    ; MIPS32: $v0 = COPY [[SDIV]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = COPY $a1
+    %2:_(s32) = G_SDIV %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            sdiv_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1, $a2, $a3
+
+    ; MIPS32-LABEL: name: sdiv_i64
+    ; MIPS32: liveins: $a0, $a1, $a2, $a3
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
+    ; MIPS32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; MIPS32: $a0 = COPY [[COPY2]](s32)
+    ; MIPS32: $a1 = COPY [[COPY3]](s32)
+    ; MIPS32: $a2 = COPY [[COPY]](s32)
+    ; MIPS32: $a3 = COPY [[COPY1]](s32)
+    ; MIPS32: JAL &__divdi3, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit $a1, implicit $a2, implicit $a3, implicit-def $v0, implicit-def $v1
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY $v0
+    ; MIPS32: [[COPY5:%[0-9]+]]:_(s32) = COPY $v1
+    ; MIPS32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: $v1 = COPY [[COPY5]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %2:_(s32) = COPY $a0
+    %3:_(s32) = COPY $a1
+    %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+    %4:_(s32) = COPY $a2
+    %5:_(s32) = COPY $a3
+    %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+    %6:_(s64) = G_SDIV %1, %0
+    %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+    $v0 = COPY %7(s32)
+    $v1 = COPY %8(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
+---
+name:            srem_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: srem_i8
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C1]]
+    ; MIPS32: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C1]]
+    ; MIPS32: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[ASHR]], [[ASHR1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
+    ; MIPS32: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]]
+    ; MIPS32: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SHL2]], [[C2]]
+    ; MIPS32: $v0 = COPY [[ASHR2]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s8) = G_TRUNC %3(s32)
+    %4:_(s8) = G_SREM %1, %0
+    %5:_(s32) = G_SEXT %4(s8)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            srem_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: srem_i16
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C1]]
+    ; MIPS32: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C1]]
+    ; MIPS32: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[ASHR]], [[ASHR1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
+    ; MIPS32: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]]
+    ; MIPS32: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SHL2]], [[C2]]
+    ; MIPS32: $v0 = COPY [[ASHR2]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s16) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s16) = G_TRUNC %3(s32)
+    %4:_(s16) = G_SREM %1, %0
+    %5:_(s32) = G_SEXT %4(s16)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            srem_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: srem_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[COPY1]], [[COPY]]
+    ; MIPS32: $v0 = COPY [[SREM]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = COPY $a1
+    %2:_(s32) = G_SREM %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            srem_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1, $a2, $a3
+
+    ; MIPS32-LABEL: name: srem_i64
+    ; MIPS32: liveins: $a0, $a1, $a2, $a3
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
+    ; MIPS32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; MIPS32: $a0 = COPY [[COPY2]](s32)
+    ; MIPS32: $a1 = COPY [[COPY3]](s32)
+    ; MIPS32: $a2 = COPY [[COPY]](s32)
+    ; MIPS32: $a3 = COPY [[COPY1]](s32)
+    ; MIPS32: JAL &__moddi3, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit $a1, implicit $a2, implicit $a3, implicit-def $v0, implicit-def $v1
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY $v0
+    ; MIPS32: [[COPY5:%[0-9]+]]:_(s32) = COPY $v1
+    ; MIPS32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: $v1 = COPY [[COPY5]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %2:_(s32) = COPY $a0
+    %3:_(s32) = COPY $a1
+    %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+    %4:_(s32) = COPY $a2
+    %5:_(s32) = COPY $a3
+    %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+    %6:_(s64) = G_SREM %1, %0
+    %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+    $v0 = COPY %7(s32)
+    $v1 = COPY %8(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
+---
+name:            udiv_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: udiv_i8
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; MIPS32: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C2]]
+    ; MIPS32: $v0 = COPY [[ASHR]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s8) = G_TRUNC %3(s32)
+    %4:_(s8) = G_UDIV %1, %0
+    %5:_(s32) = G_SEXT %4(s8)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            udiv_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: udiv_i16
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; MIPS32: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C2]]
+    ; MIPS32: $v0 = COPY [[ASHR]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s16) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s16) = G_TRUNC %3(s32)
+    %4:_(s16) = G_UDIV %1, %0
+    %5:_(s32) = G_SEXT %4(s16)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            udiv_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: udiv_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[COPY1]], [[COPY]]
+    ; MIPS32: $v0 = COPY [[UDIV]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = COPY $a1
+    %2:_(s32) = G_UDIV %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            udiv_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1, $a2, $a3
+
+    ; MIPS32-LABEL: name: udiv_i64
+    ; MIPS32: liveins: $a0, $a1, $a2, $a3
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
+    ; MIPS32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; MIPS32: $a0 = COPY [[COPY2]](s32)
+    ; MIPS32: $a1 = COPY [[COPY3]](s32)
+    ; MIPS32: $a2 = COPY [[COPY]](s32)
+    ; MIPS32: $a3 = COPY [[COPY1]](s32)
+    ; MIPS32: JAL &__udivdi3, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit $a1, implicit $a2, implicit $a3, implicit-def $v0, implicit-def $v1
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY $v0
+    ; MIPS32: [[COPY5:%[0-9]+]]:_(s32) = COPY $v1
+    ; MIPS32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: $v1 = COPY [[COPY5]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %2:_(s32) = COPY $a0
+    %3:_(s32) = COPY $a1
+    %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+    %4:_(s32) = COPY $a2
+    %5:_(s32) = COPY $a3
+    %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+    %6:_(s64) = G_UDIV %1, %0
+    %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+    $v0 = COPY %7(s32)
+    $v1 = COPY %8(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
+---
+name:            urem_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: urem_i8
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; MIPS32: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C2]]
+    ; MIPS32: $v0 = COPY [[ASHR]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s8) = G_TRUNC %3(s32)
+    %4:_(s8) = G_UREM %1, %0
+    %5:_(s32) = G_SEXT %4(s8)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            urem_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: urem_i16
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; MIPS32: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
+    ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C2]]
+    ; MIPS32: $v0 = COPY [[ASHR]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %2:_(s32) = COPY $a0
+    %0:_(s16) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $a1
+    %1:_(s16) = G_TRUNC %3(s32)
+    %4:_(s16) = G_UREM %1, %0
+    %5:_(s32) = G_SEXT %4(s16)
+    $v0 = COPY %5(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            urem_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: urem_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[COPY1]], [[COPY]]
+    ; MIPS32: $v0 = COPY [[UREM]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = COPY $a1
+    %2:_(s32) = G_UREM %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            urem_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1, $a2, $a3
+
+    ; MIPS32-LABEL: name: urem_i64
+    ; MIPS32: liveins: $a0, $a1, $a2, $a3
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
+    ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
+    ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
+    ; MIPS32: ADJCALLSTACKDOWN 16, 0, implicit-def $sp, implicit $sp
+    ; MIPS32: $a0 = COPY [[COPY2]](s32)
+    ; MIPS32: $a1 = COPY [[COPY3]](s32)
+    ; MIPS32: $a2 = COPY [[COPY]](s32)
+    ; MIPS32: $a3 = COPY [[COPY1]](s32)
+    ; MIPS32: JAL &__umoddi3, csr_o32, implicit-def $ra, implicit-def $sp, implicit $a0, implicit $a1, implicit $a2, implicit $a3, implicit-def $v0, implicit-def $v1
+    ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY $v0
+    ; MIPS32: [[COPY5:%[0-9]+]]:_(s32) = COPY $v1
+    ; MIPS32: ADJCALLSTACKUP 16, 0, implicit-def $sp, implicit $sp
+    ; MIPS32: $v0 = COPY [[COPY4]](s32)
+    ; MIPS32: $v1 = COPY [[COPY5]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %2:_(s32) = COPY $a0
+    %3:_(s32) = COPY $a1
+    %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+    %4:_(s32) = COPY $a2
+    %5:_(s32) = COPY $a3
+    %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+    %6:_(s64) = G_UREM %1, %0
+    %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+    $v0 = COPY %7(s32)
+    $v1 = COPY %8(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
diff --git a/test/CodeGen/Mips/GlobalISel/llvm-ir/add.ll b/test/CodeGen/Mips/GlobalISel/llvm-ir/add.ll
index cf6d1e2c561f9ff0625010384227eee6916469b8..a1fb026f16b31d2ab5a85d3b7a2b69890a7a1543 100644
--- a/test/CodeGen/Mips/GlobalISel/llvm-ir/add.ll
+++ b/test/CodeGen/Mips/GlobalISel/llvm-ir/add.ll
@@ -90,14 +90,73 @@ entry:
 define i64 @add_i64(i64 %a, i64 %b) {
 ; MIPS32-LABEL: add_i64:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    addu $5, $7, $5
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $1, $1, 0
 ; MIPS32-NEXT:    addu $4, $6, $4
-; MIPS32-NEXT:    sltu $6, $4, $6
-; MIPS32-NEXT:    addu $3, $5, $6
-; MIPS32-NEXT:    move $2, $4
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 1
+; MIPS32-NEXT:    and $1, $1, $2
+; MIPS32-NEXT:    addu $1, $4, $1
+; MIPS32-NEXT:    sltu $2, $1, $6
+; MIPS32-NEXT:    addu $4, $7, $5
+; MIPS32-NEXT:    lui $5, 0
+; MIPS32-NEXT:    ori $5, $5, 1
+; MIPS32-NEXT:    and $2, $2, $5
+; MIPS32-NEXT:    addu $3, $4, $2
+; MIPS32-NEXT:    move $2, $1
 ; MIPS32-NEXT:    jr $ra
 ; MIPS32-NEXT:    nop
 entry:
   %add = add i64 %b, %a
   ret i64 %add
-}
\ No newline at end of file
+}
+
+define i128 @add_i128(i128 %a, i128 %b) {
+; MIPS32-LABEL: add_i128:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    addiu $sp, $sp, -8
+; MIPS32-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32-NEXT:    addiu $1, $sp, 24
+; MIPS32-NEXT:    lw $1, 0($1)
+; MIPS32-NEXT:    addiu $2, $sp, 28
+; MIPS32-NEXT:    lw $2, 0($2)
+; MIPS32-NEXT:    addiu $3, $sp, 32
+; MIPS32-NEXT:    lw $3, 0($3)
+; MIPS32-NEXT:    addiu $8, $sp, 36
+; MIPS32-NEXT:    lw $8, 0($8)
+; MIPS32-NEXT:    lui $9, 0
+; MIPS32-NEXT:    ori $9, $9, 0
+; MIPS32-NEXT:    addu $4, $1, $4
+; MIPS32-NEXT:    lui $10, 0
+; MIPS32-NEXT:    ori $10, $10, 1
+; MIPS32-NEXT:    and $9, $9, $10
+; MIPS32-NEXT:    addu $4, $4, $9
+; MIPS32-NEXT:    sltu $1, $4, $1
+; MIPS32-NEXT:    addu $5, $2, $5
+; MIPS32-NEXT:    lui $9, 0
+; MIPS32-NEXT:    ori $9, $9, 1
+; MIPS32-NEXT:    and $1, $1, $9
+; MIPS32-NEXT:    addu $1, $5, $1
+; MIPS32-NEXT:    sltu $2, $1, $2
+; MIPS32-NEXT:    addu $5, $3, $6
+; MIPS32-NEXT:    lui $6, 0
+; MIPS32-NEXT:    ori $6, $6, 1
+; MIPS32-NEXT:    and $2, $2, $6
+; MIPS32-NEXT:    addu $2, $5, $2
+; MIPS32-NEXT:    sltu $3, $2, $3
+; MIPS32-NEXT:    addu $5, $8, $7
+; MIPS32-NEXT:    lui $6, 0
+; MIPS32-NEXT:    ori $6, $6, 1
+; MIPS32-NEXT:    and $3, $3, $6
+; MIPS32-NEXT:    addu $5, $5, $3
+; MIPS32-NEXT:    sw $2, 4($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $2, $4
+; MIPS32-NEXT:    move $3, $1
+; MIPS32-NEXT:    lw $4, 4($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    addiu $sp, $sp, 8
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %add = add i128 %b, %a
+  ret i128 %add
+}
diff --git a/test/CodeGen/Mips/GlobalISel/llvm-ir/bitwise.ll b/test/CodeGen/Mips/GlobalISel/llvm-ir/bitwise.ll
index 0ecd52d174ccc687162bb4b73d2c8d7f00a0ffa7..9d8671eed3c194b833f1af29245b745fba733bdf 100644
--- a/test/CodeGen/Mips/GlobalISel/llvm-ir/bitwise.ll
+++ b/test/CodeGen/Mips/GlobalISel/llvm-ir/bitwise.ll
@@ -1,9 +1,41 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-
 ; RUN: llc  -O0 -mtriple=mipsel-linux-gnu -global-isel  -verify-machineinstrs %s -o -| FileCheck %s -check-prefixes=MIPS32
 
-define i32 @and(i32 %a, i32 %b) {
-; MIPS32-LABEL: and:
+define i1 @and_i1(i1 %a, i1 %b) {
+; MIPS32-LABEL: and_i1:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    and $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %and = and i1 %b, %a
+  ret i1 %and
+}
+
+define i8 @and_i8(i8 %a, i8 %b) {
+; MIPS32-LABEL: and_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    and $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %and = and i8 %b, %a
+  ret i8 %and
+}
+
+define i16 @and_i16(i16 %a, i16 %b) {
+; MIPS32-LABEL: and_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    and $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %and = and i16 %b, %a
+  ret i16 %and
+}
+
+define i32 @and_i32(i32 %a, i32 %b) {
+; MIPS32-LABEL: and_i32:
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    and $2, $5, $4
 ; MIPS32-NEXT:    jr $ra
@@ -13,8 +45,53 @@ entry:
   ret i32 %and
 }
 
-define i32 @or(i32 %a, i32 %b) {
-; MIPS32-LABEL: or:
+define i64 @and_i64(i64 %a, i64 %b) {
+; MIPS32-LABEL: and_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    and $2, $6, $4
+; MIPS32-NEXT:    and $3, $7, $5
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %and = and i64 %b, %a
+  ret i64 %and
+}
+
+define i1 @or_i1(i1 %a, i1 %b) {
+; MIPS32-LABEL: or_i1:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    or $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %or = or i1 %b, %a
+  ret i1 %or
+}
+
+define i8 @or_i8(i8 %a, i8 %b) {
+; MIPS32-LABEL: or_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    or $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %or = or i8 %b, %a
+  ret i8 %or
+}
+
+define i16 @or_i16(i16 %a, i16 %b) {
+; MIPS32-LABEL: or_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    or $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %or = or i16 %b, %a
+  ret i16 %or
+}
+
+define i32 @or_i32(i32 %a, i32 %b) {
+; MIPS32-LABEL: or_i32:
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    or $2, $5, $4
 ; MIPS32-NEXT:    jr $ra
@@ -24,8 +101,53 @@ entry:
   ret i32 %or
 }
 
-define i32 @xor(i32 %a, i32 %b) {
-; MIPS32-LABEL: xor:
+define i64 @or_i64(i64 %a, i64 %b) {
+; MIPS32-LABEL: or_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    or $2, $6, $4
+; MIPS32-NEXT:    or $3, $7, $5
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %or = or i64 %b, %a
+  ret i64 %or
+}
+
+define i1 @xor_i1(i1 %a, i1 %b) {
+; MIPS32-LABEL: xor_i1:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    xor $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %xor = xor i1 %b, %a
+  ret i1 %xor
+}
+
+define i8 @xor_i8(i8 %a, i8 %b) {
+; MIPS32-LABEL: xor_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    xor $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %xor = xor i8 %b, %a
+  ret i8 %xor
+}
+
+define i16 @xor_i16(i16 %a, i16 %b) {
+; MIPS32-LABEL: xor_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    xor $2, $5, $4
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %xor = xor i16 %b, %a
+  ret i16 %xor
+}
+
+define i32 @xor_i32(i32 %a, i32 %b) {
+; MIPS32-LABEL: xor_i32:
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    xor $2, $5, $4
 ; MIPS32-NEXT:    jr $ra
@@ -35,6 +157,18 @@ entry:
   ret i32 %xor
 }
 
+define i64 @xor_i64(i64 %a, i64 %b) {
+; MIPS32-LABEL: xor_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    xor $2, $6, $4
+; MIPS32-NEXT:    xor $3, $7, $5
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %xor = xor i64 %b, %a
+  ret i64 %xor
+}
+
 define i32 @shl(i32 %a) {
 ; MIPS32-LABEL: shl:
 ; MIPS32:       # %bb.0: # %entry
diff --git a/test/CodeGen/Mips/GlobalISel/llvm-ir/rem_and_div.ll b/test/CodeGen/Mips/GlobalISel/llvm-ir/rem_and_div.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b17c6037bd3072d88dd570c622e6e7861d44c295
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/llvm-ir/rem_and_div.ll
@@ -0,0 +1,314 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc  -O0 -mtriple=mipsel-linux-gnu -global-isel  -verify-machineinstrs %s -o -| FileCheck %s -check-prefixes=MIPS32
+
+; sdiv
+define signext i8 @sdiv_i8(i8 signext %a, i8 signext %b) {
+; MIPS32-LABEL: sdiv_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    sll $5, $5, 24
+; MIPS32-NEXT:    sra $5, $5, 24
+; MIPS32-NEXT:    sll $4, $4, 24
+; MIPS32-NEXT:    sra $4, $4, 24
+; MIPS32-NEXT:    div $zero, $5, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mflo $4
+; MIPS32-NEXT:    sll $4, $4, 24
+; MIPS32-NEXT:    sra $2, $4, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = sdiv i8 %b, %a
+  ret i8 %div
+}
+
+define signext i16 @sdiv_i16(i16 signext %a, i16 signext %b) {
+; MIPS32-LABEL: sdiv_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    sll $5, $5, 16
+; MIPS32-NEXT:    sra $5, $5, 16
+; MIPS32-NEXT:    sll $4, $4, 16
+; MIPS32-NEXT:    sra $4, $4, 16
+; MIPS32-NEXT:    div $zero, $5, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mflo $4
+; MIPS32-NEXT:    sll $4, $4, 16
+; MIPS32-NEXT:    sra $2, $4, 16
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = sdiv i16 %b, %a
+  ret i16 %div
+}
+
+define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b) {
+; MIPS32-LABEL: sdiv_i32:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    div $zero, $5, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mflo $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = sdiv i32 %b, %a
+  ret i32 %div
+}
+
+define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) {
+; MIPS32-LABEL: sdiv_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    sw $4, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $4, $6
+; MIPS32-NEXT:    sw $5, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $5, $7
+; MIPS32-NEXT:    lw $6, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $7, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jal __divdi3
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    addiu $sp, $sp, 32
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = sdiv i64 %b, %a
+  ret i64 %div
+}
+
+; srem
+define signext i8 @srem_i8(i8 signext %a, i8 signext %b) {
+; MIPS32-LABEL: srem_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    sll $5, $5, 24
+; MIPS32-NEXT:    sra $5, $5, 24
+; MIPS32-NEXT:    sll $4, $4, 24
+; MIPS32-NEXT:    sra $4, $4, 24
+; MIPS32-NEXT:    div $zero, $5, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mflo $4
+; MIPS32-NEXT:    sll $4, $4, 24
+; MIPS32-NEXT:    sra $2, $4, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = sdiv i8 %b, %a
+  ret i8 %div
+}
+
+define signext i16 @srem_i16(i16 signext %a, i16 signext %b) {
+; MIPS32-LABEL: srem_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    sll $5, $5, 16
+; MIPS32-NEXT:    sra $5, $5, 16
+; MIPS32-NEXT:    sll $4, $4, 16
+; MIPS32-NEXT:    sra $4, $4, 16
+; MIPS32-NEXT:    div $zero, $5, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mfhi $4
+; MIPS32-NEXT:    sll $4, $4, 16
+; MIPS32-NEXT:    sra $2, $4, 16
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %rem = srem i16 %b, %a
+  ret i16 %rem
+}
+
+define signext i32 @srem_i32(i32 signext %a, i32 signext %b) {
+; MIPS32-LABEL: srem_i32:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    div $zero, $5, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mfhi $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %rem = srem i32 %b, %a
+  ret i32 %rem
+}
+
+define signext i64 @srem_i64(i64 signext %a, i64 signext %b) {
+; MIPS32-LABEL: srem_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    sw $4, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $4, $6
+; MIPS32-NEXT:    sw $5, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $5, $7
+; MIPS32-NEXT:    lw $6, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $7, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jal __moddi3
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    addiu $sp, $sp, 32
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %rem = srem i64 %b, %a
+  ret i64 %rem
+}
+
+; udiv
+define signext i8 @udiv_i8(i8 signext %a, i8 signext %b) {
+; MIPS32-LABEL: udiv_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $1, $1, 255
+; MIPS32-NEXT:    and $1, $5, $1
+; MIPS32-NEXT:    lui $5, 0
+; MIPS32-NEXT:    ori $5, $5, 255
+; MIPS32-NEXT:    and $4, $4, $5
+; MIPS32-NEXT:    divu $zero, $1, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mflo $1
+; MIPS32-NEXT:    sll $1, $1, 24
+; MIPS32-NEXT:    sra $2, $1, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = udiv i8 %b, %a
+  ret i8 %div
+}
+
+define signext i16 @udiv_i16(i16 signext %a, i16 signext %b) {
+; MIPS32-LABEL: udiv_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $1, $1, 65535
+; MIPS32-NEXT:    and $1, $5, $1
+; MIPS32-NEXT:    lui $5, 0
+; MIPS32-NEXT:    ori $5, $5, 65535
+; MIPS32-NEXT:    and $4, $4, $5
+; MIPS32-NEXT:    divu $zero, $1, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mflo $1
+; MIPS32-NEXT:    sll $1, $1, 16
+; MIPS32-NEXT:    sra $2, $1, 16
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = udiv i16 %b, %a
+  ret i16 %div
+}
+
+define signext i32 @udiv_i32(i32 signext %a, i32 signext %b) {
+; MIPS32-LABEL: udiv_i32:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    divu $zero, $5, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mflo $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = udiv i32 %b, %a
+  ret i32 %div
+}
+
+define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) {
+; MIPS32-LABEL: udiv_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    sw $4, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $4, $6
+; MIPS32-NEXT:    sw $5, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $5, $7
+; MIPS32-NEXT:    lw $6, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $7, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jal __udivdi3
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    addiu $sp, $sp, 32
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %div = udiv i64 %b, %a
+  ret i64 %div
+}
+
+; urem
+define signext i8 @urem_i8(i8 signext %a, i8 signext %b) {
+; MIPS32-LABEL: urem_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $1, $1, 255
+; MIPS32-NEXT:    and $1, $5, $1
+; MIPS32-NEXT:    lui $5, 0
+; MIPS32-NEXT:    ori $5, $5, 255
+; MIPS32-NEXT:    and $4, $4, $5
+; MIPS32-NEXT:    divu $zero, $1, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mfhi $1
+; MIPS32-NEXT:    sll $1, $1, 24
+; MIPS32-NEXT:    sra $2, $1, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %rem = urem i8 %b, %a
+  ret i8 %rem
+}
+
+define signext i16 @urem_i16(i16 signext %a, i16 signext %b) {
+; MIPS32-LABEL: urem_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $1, $1, 65535
+; MIPS32-NEXT:    and $1, $5, $1
+; MIPS32-NEXT:    lui $5, 0
+; MIPS32-NEXT:    ori $5, $5, 65535
+; MIPS32-NEXT:    and $4, $4, $5
+; MIPS32-NEXT:    divu $zero, $1, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mfhi $1
+; MIPS32-NEXT:    sll $1, $1, 16
+; MIPS32-NEXT:    sra $2, $1, 16
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %rem = urem i16 %b, %a
+  ret i16 %rem
+}
+
+define signext i32 @urem_i32(i32 signext %a, i32 signext %b) {
+; MIPS32-LABEL: urem_i32:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    divu $zero, $5, $4
+; MIPS32-NEXT:    teq $4, $zero, 7
+; MIPS32-NEXT:    mfhi $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %rem = urem i32 %b, %a
+  ret i32 %rem
+}
+
+define signext i64 @urem_i64(i64 signext %a, i64 signext %b) {
+; MIPS32-LABEL: urem_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    sw $4, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $4, $6
+; MIPS32-NEXT:    sw $5, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $5, $7
+; MIPS32-NEXT:    lw $6, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $7, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jal __umoddi3
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    addiu $sp, $sp, 32
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  %rem = urem i64 %b, %a
+  ret i64 %rem
+}
diff --git a/test/CodeGen/Mips/GlobalISel/regbankselect/bitwise.mir b/test/CodeGen/Mips/GlobalISel/regbankselect/bitwise.mir
index 5df2c201f74546091009bb84b5e85c3cf46ab92e..595b4a18124b319314b13901e4acc6759e8a8f74 100644
--- a/test/CodeGen/Mips/GlobalISel/regbankselect/bitwise.mir
+++ b/test/CodeGen/Mips/GlobalISel/regbankselect/bitwise.mir
@@ -3,9 +3,9 @@
 --- |
 
 
-  define void @and(i32, i32) {entry: ret void}
-  define void @or(i32, i32) {entry: ret void}
-  define void @xor(i32, i32) {entry: ret void}
+  define void @and_i32() {entry: ret void}
+  define void @or_i32() {entry: ret void}
+  define void @xor_i32() {entry: ret void}
   define void @shl(i32) {entry: ret void}
   define void @ashr(i32) {entry: ret void}
   define void @lshr(i32) {entry: ret void}
@@ -15,7 +15,7 @@
 
 ...
 ---
-name:            and
+name:            and_i32
 alignment:       2
 legalized:       true
 tracksRegLiveness: true
@@ -23,7 +23,7 @@ body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: and
+    ; MIPS32-LABEL: name: and_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:gprb(s32) = COPY $a1
@@ -38,7 +38,7 @@ body:             |
 
 ...
 ---
-name:            or
+name:            or_i32
 alignment:       2
 legalized:       true
 tracksRegLiveness: true
@@ -46,7 +46,7 @@ body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: or
+    ; MIPS32-LABEL: name: or_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:gprb(s32) = COPY $a1
@@ -61,7 +61,7 @@ body:             |
 
 ...
 ---
-name:            xor
+name:            xor_i32
 alignment:       2
 legalized:       true
 tracksRegLiveness: true
@@ -69,7 +69,7 @@ body:             |
   bb.1.entry:
     liveins: $a0, $a1
 
-    ; MIPS32-LABEL: name: xor
+    ; MIPS32-LABEL: name: xor_i32
     ; MIPS32: liveins: $a0, $a1
     ; MIPS32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
     ; MIPS32: [[COPY1:%[0-9]+]]:gprb(s32) = COPY $a1
diff --git a/test/CodeGen/Mips/GlobalISel/regbankselect/rem_and_div.mir b/test/CodeGen/Mips/GlobalISel/regbankselect/rem_and_div.mir
new file mode 100644
index 0000000000000000000000000000000000000000..bee36309a9fb97509843e3217b7876f067e50b0c
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/regbankselect/rem_and_div.mir
@@ -0,0 +1,102 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=regbankselect -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
+--- |
+
+  define void @sdiv_i32() {entry: ret void}
+  define void @srem_i32() {entry: ret void}
+  define void @udiv_i32() {entry: ret void}
+  define void @urem_i32() {entry: ret void}
+
+...
+---
+name:            sdiv_i32
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: sdiv_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:gprb(s32) = COPY $a1
+    ; MIPS32: [[SDIV:%[0-9]+]]:gprb(s32) = G_SDIV [[COPY1]], [[COPY]]
+    ; MIPS32: $v0 = COPY [[SDIV]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = COPY $a1
+    %2:_(s32) = G_SDIV %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            srem_i32
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: srem_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:gprb(s32) = COPY $a1
+    ; MIPS32: [[SREM:%[0-9]+]]:gprb(s32) = G_SREM [[COPY1]], [[COPY]]
+    ; MIPS32: $v0 = COPY [[SREM]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = COPY $a1
+    %2:_(s32) = G_SREM %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            udiv_i32
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: udiv_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:gprb(s32) = COPY $a1
+    ; MIPS32: [[UDIV:%[0-9]+]]:gprb(s32) = G_UDIV [[COPY1]], [[COPY]]
+    ; MIPS32: $v0 = COPY [[UDIV]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = COPY $a1
+    %2:_(s32) = G_UDIV %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            urem_i32
+alignment:       2
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $a0, $a1
+
+    ; MIPS32-LABEL: name: urem_i32
+    ; MIPS32: liveins: $a0, $a1
+    ; MIPS32: [[COPY:%[0-9]+]]:gprb(s32) = COPY $a0
+    ; MIPS32: [[COPY1:%[0-9]+]]:gprb(s32) = COPY $a1
+    ; MIPS32: [[UREM:%[0-9]+]]:gprb(s32) = G_UREM [[COPY1]], [[COPY]]
+    ; MIPS32: $v0 = COPY [[UREM]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = COPY $a0
+    %1:_(s32) = COPY $a1
+    %2:_(s32) = G_UREM %1, %0
+    $v0 = COPY %2(s32)
+    RetRA implicit $v0
+
+...
diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll
index 9a55285feae207000197642c02f556d827aa0060..6a07c4f34564deb54d111dd191fb0a2091ecdddf 100644
--- a/test/CodeGen/Mips/cconv/vector.ll
+++ b/test/CodeGen/Mips/cconv/vector.ll
@@ -61,19 +61,15 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) {
 ; MIPS32R5EB-NEXT:    sw $5, 36($sp)
 ; MIPS32R5EB-NEXT:    sw $4, 40($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 37($sp)
-; MIPS32R5EB-NEXT:    sw $1, 20($sp)
+; MIPS32R5EB-NEXT:    sw $1, 28($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 36($sp)
-; MIPS32R5EB-NEXT:    sw $1, 16($sp)
+; MIPS32R5EB-NEXT:    sw $1, 20($sp)
+; MIPS32R5EB-NEXT:    lbu $1, 41($sp)
+; MIPS32R5EB-NEXT:    sw $1, 12($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 40($sp)
-; MIPS32R5EB-NEXT:    lbu $2, 41($sp)
-; MIPS32R5EB-NEXT:    sw $2, 4($sp)
-; MIPS32R5EB-NEXT:    sw $1, 0($sp)
-; MIPS32R5EB-NEXT:    ld.w $w0, 16($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w0, $w0, $w0
-; MIPS32R5EB-NEXT:    shf.w $w0, $w0, 177
-; MIPS32R5EB-NEXT:    ld.w $w1, 0($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT:    sw $1, 4($sp)
+; MIPS32R5EB-NEXT:    ld.d $w0, 16($sp)
+; MIPS32R5EB-NEXT:    ld.d $w1, 0($sp)
 ; MIPS32R5EB-NEXT:    addv.d $w0, $w1, $w0
 ; MIPS32R5EB-NEXT:    shf.w $w0, $w0, 177
 ; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[1]
@@ -166,17 +162,15 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) {
 ; MIPS32R5EL-NEXT:    sw $5, 36($sp)
 ; MIPS32R5EL-NEXT:    sw $4, 40($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 37($sp)
-; MIPS32R5EL-NEXT:    sw $1, 20($sp)
+; MIPS32R5EL-NEXT:    sw $1, 24($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 36($sp)
 ; MIPS32R5EL-NEXT:    sw $1, 16($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 41($sp)
-; MIPS32R5EL-NEXT:    sw $1, 4($sp)
+; MIPS32R5EL-NEXT:    sw $1, 8($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 40($sp)
 ; MIPS32R5EL-NEXT:    sw $1, 0($sp)
-; MIPS32R5EL-NEXT:    ld.w $w0, 16($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w0, $w0, $w0
-; MIPS32R5EL-NEXT:    ld.w $w1, 0($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT:    ld.d $w0, 16($sp)
+; MIPS32R5EL-NEXT:    ld.d $w1, 0($sp)
 ; MIPS32R5EL-NEXT:    addv.d $w0, $w1, $w0
 ; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[0]
 ; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[2]
@@ -327,61 +321,47 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x
 ; MIPS32R5EB-NEXT:    sw $5, 132($sp)
 ; MIPS32R5EB-NEXT:    sw $4, 136($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 133($sp)
-; MIPS32R5EB-NEXT:    sw $1, 68($sp)
+; MIPS32R5EB-NEXT:    sw $1, 76($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 132($sp)
-; MIPS32R5EB-NEXT:    sw $1, 64($sp)
+; MIPS32R5EB-NEXT:    sw $1, 68($sp)
+; MIPS32R5EB-NEXT:    lbu $1, 137($sp)
+; MIPS32R5EB-NEXT:    sw $1, 60($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 136($sp)
-; MIPS32R5EB-NEXT:    lbu $2, 137($sp)
-; MIPS32R5EB-NEXT:    sw $2, 52($sp)
-; MIPS32R5EB-NEXT:    sw $1, 48($sp)
-; MIPS32R5EB-NEXT:    ld.w $w0, 64($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w0, $w0, $w0
-; MIPS32R5EB-NEXT:    shf.w $w0, $w0, 177
-; MIPS32R5EB-NEXT:    ld.w $w1, 48($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT:    sw $1, 52($sp)
+; MIPS32R5EB-NEXT:    ld.d $w0, 64($sp)
+; MIPS32R5EB-NEXT:    ld.d $w1, 48($sp)
 ; MIPS32R5EB-NEXT:    addv.d $w0, $w1, $w0
 ; MIPS32R5EB-NEXT:    sw $6, 128($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 129($sp)
-; MIPS32R5EB-NEXT:    sw $1, 84($sp)
+; MIPS32R5EB-NEXT:    sw $1, 92($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 128($sp)
-; MIPS32R5EB-NEXT:    sw $1, 80($sp)
-; MIPS32R5EB-NEXT:    ld.w $w1, 80($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT:    sw $1, 84($sp)
+; MIPS32R5EB-NEXT:    ld.d $w1, 80($sp)
 ; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EB-NEXT:    sw $7, 124($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 125($sp)
-; MIPS32R5EB-NEXT:    sw $1, 100($sp)
+; MIPS32R5EB-NEXT:    sw $1, 108($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 124($sp)
-; MIPS32R5EB-NEXT:    sw $1, 96($sp)
-; MIPS32R5EB-NEXT:    ld.w $w1, 96($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT:    sw $1, 100($sp)
+; MIPS32R5EB-NEXT:    ld.d $w1, 96($sp)
 ; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EB-NEXT:    lbu $1, 161($fp)
-; MIPS32R5EB-NEXT:    sw $1, 4($sp)
+; MIPS32R5EB-NEXT:    sw $1, 12($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 160($fp)
-; MIPS32R5EB-NEXT:    sw $1, 0($sp)
-; MIPS32R5EB-NEXT:    ld.w $w1, 0($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT:    sw $1, 4($sp)
+; MIPS32R5EB-NEXT:    ld.d $w1, 0($sp)
 ; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EB-NEXT:    lbu $1, 165($fp)
-; MIPS32R5EB-NEXT:    sw $1, 20($sp)
+; MIPS32R5EB-NEXT:    sw $1, 28($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 164($fp)
-; MIPS32R5EB-NEXT:    sw $1, 16($sp)
-; MIPS32R5EB-NEXT:    ld.w $w1, 16($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT:    sw $1, 20($sp)
+; MIPS32R5EB-NEXT:    ld.d $w1, 16($sp)
 ; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EB-NEXT:    lbu $1, 169($fp)
-; MIPS32R5EB-NEXT:    sw $1, 36($sp)
+; MIPS32R5EB-NEXT:    sw $1, 44($sp)
 ; MIPS32R5EB-NEXT:    lbu $1, 168($fp)
-; MIPS32R5EB-NEXT:    sw $1, 32($sp)
-; MIPS32R5EB-NEXT:    ld.w $w1, 32($sp)
-; MIPS32R5EB-NEXT:    ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT:    sw $1, 36($sp)
+; MIPS32R5EB-NEXT:    ld.d $w1, 32($sp)
 ; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EB-NEXT:    shf.w $w0, $w0, 177
 ; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[1]
@@ -579,54 +559,47 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x
 ; MIPS32R5EL-NEXT:    sw $5, 132($sp)
 ; MIPS32R5EL-NEXT:    sw $4, 136($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 133($sp)
-; MIPS32R5EL-NEXT:    sw $1, 68($sp)
+; MIPS32R5EL-NEXT:    sw $1, 72($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 132($sp)
 ; MIPS32R5EL-NEXT:    sw $1, 64($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 137($sp)
-; MIPS32R5EL-NEXT:    sw $1, 52($sp)
+; MIPS32R5EL-NEXT:    sw $1, 56($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 136($sp)
 ; MIPS32R5EL-NEXT:    sw $1, 48($sp)
-; MIPS32R5EL-NEXT:    ld.w $w0, 64($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w0, $w0, $w0
-; MIPS32R5EL-NEXT:    ld.w $w1, 48($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT:    ld.d $w0, 64($sp)
+; MIPS32R5EL-NEXT:    ld.d $w1, 48($sp)
 ; MIPS32R5EL-NEXT:    addv.d $w0, $w1, $w0
 ; MIPS32R5EL-NEXT:    sw $6, 128($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 129($sp)
-; MIPS32R5EL-NEXT:    sw $1, 84($sp)
+; MIPS32R5EL-NEXT:    sw $1, 88($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 128($sp)
 ; MIPS32R5EL-NEXT:    sw $1, 80($sp)
-; MIPS32R5EL-NEXT:    ld.w $w1, 80($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT:    ld.d $w1, 80($sp)
 ; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EL-NEXT:    sw $7, 124($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 125($sp)
-; MIPS32R5EL-NEXT:    sw $1, 100($sp)
+; MIPS32R5EL-NEXT:    sw $1, 104($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 124($sp)
 ; MIPS32R5EL-NEXT:    sw $1, 96($sp)
-; MIPS32R5EL-NEXT:    ld.w $w1, 96($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT:    ld.d $w1, 96($sp)
 ; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EL-NEXT:    lbu $1, 161($fp)
-; MIPS32R5EL-NEXT:    sw $1, 4($sp)
+; MIPS32R5EL-NEXT:    sw $1, 8($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 160($fp)
 ; MIPS32R5EL-NEXT:    sw $1, 0($sp)
-; MIPS32R5EL-NEXT:    ld.w $w1, 0($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT:    ld.d $w1, 0($sp)
 ; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EL-NEXT:    lbu $1, 165($fp)
-; MIPS32R5EL-NEXT:    sw $1, 20($sp)
+; MIPS32R5EL-NEXT:    sw $1, 24($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 164($fp)
 ; MIPS32R5EL-NEXT:    sw $1, 16($sp)
-; MIPS32R5EL-NEXT:    ld.w $w1, 16($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT:    ld.d $w1, 16($sp)
 ; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EL-NEXT:    lbu $1, 169($fp)
-; MIPS32R5EL-NEXT:    sw $1, 36($sp)
+; MIPS32R5EL-NEXT:    sw $1, 40($sp)
 ; MIPS32R5EL-NEXT:    lbu $1, 168($fp)
 ; MIPS32R5EL-NEXT:    sw $1, 32($sp)
-; MIPS32R5EL-NEXT:    ld.w $w1, 32($sp)
-; MIPS32R5EL-NEXT:    ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT:    ld.d $w1, 32($sp)
 ; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
 ; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[0]
 ; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[2]
diff --git a/test/CodeGen/Mips/fastcc.ll b/test/CodeGen/Mips/fastcc.ll
index fb1bc4d9a8ab5efe7d4902d1308ec30c4ad967d6..e48dee4721d8452139732f015edb14e2962be1f2 100644
--- a/test/CodeGen/Mips/fastcc.ll
+++ b/test/CodeGen/Mips/fastcc.ll
@@ -223,24 +223,24 @@ entry:
 define internal fastcc void @callee1(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8, float %a9, float %a10, float %a11, float %a12, float %a13, float %a14, float %a15, float %a16, float %a17, float %a18, float %a19, float %a20) nounwind noinline {
 entry:
 ; CHECK-LABEL: callee1:
-; CHECK-DAG: swc1  $f0
-; CHECK-DAG: swc1  $f1
-; CHECK-DAG: swc1  $f2
-; CHECK-DAG: swc1  $f3
-; CHECK-DAG: swc1  $f4
-; CHECK-DAG: swc1  $f5
-; CHECK-DAG: swc1  $f6
-; CHECK-DAG: swc1  $f7
-; CHECK-DAG: swc1  $f8
-; CHECK-DAG: swc1  $f9
-; CHECK-DAG: swc1  $f10
-; CHECK-DAG: swc1  $f11
-; CHECK-DAG: swc1  $f12
-; CHECK-DAG: swc1  $f13
-; CHECK-DAG: swc1  $f14
-; CHECK-DAG: swc1  $f15
-; CHECK-DAG: swc1  $f16
 ; CHECK-DAG: swc1  $f17
+; CHECK-DAG: swc1  $f16
+; CHECK-DAG: swc1  $f15
+; CHECK-DAG: swc1  $f14
+; CHECK-DAG: swc1  $f13
+; CHECK-DAG: swc1  $f12
+; CHECK-DAG: swc1  $f11
+; CHECK-DAG: swc1  $f10
+; CHECK-DAG: swc1  $f9
+; CHECK-DAG: swc1  $f8
+; CHECK-DAG: swc1  $f7
+; CHECK-DAG: swc1  $f6
+; CHECK-DAG: swc1  $f5
+; CHECK-DAG: swc1  $f4
+; CHECK-DAG: swc1  $f3
+; CHECK-DAG: swc1  $f2
+; CHECK-DAG: swc1  $f1
+; CHECK-DAG: swc1  $f0
 ; CHECK-DAG: swc1  $f18
 ; CHECK-DAG: swc1  $f19
 
@@ -330,7 +330,7 @@ entry:
 ; NOODDSPREG-DAG:    swc1    $f16, 32($[[R0]])
 ; NOODDSPREG-DAG:    swc1    $f18, 36($[[R0]])
 
-; NOODDSPREG-DAG:    lwc1    $[[F0:f[0-9]*[02468]]], 0($sp)
+; NOODDSPREG-DAG:    lwc1    $[[F0:f[0-9]*[02468]]], {{[0-9]+}}($sp)
 ; NOODDSPREG-DAG:    swc1    $[[F0]], 40($[[R0]])
 
   store float %a0, float* getelementptr ([11 x float], [11 x float]* @fa, i32 0, i32 0), align 4
diff --git a/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index 4618c96d879900c9b82918ba285bd7bf9ea2ed26..9105e9249d4f021619229b719460500451e37800 100644
--- a/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -251,260 +251,60 @@ entry:
   ret void
 }
 
+; Entire fp16 (unsigned) range fits into (signed) i32.
 define i32 @ffptoui() {
-; MIPS32-O32-LABEL: ffptoui:
-; MIPS32-O32:       # %bb.0: # %entry
-; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
-; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
-; MIPS32-O32-NEXT:    addu $1, $2, $25
-; MIPS32-O32-NEXT:    lw $2, %got(h)($1)
-; MIPS32-O32-NEXT:    lw $3, %got($CPI3_0)($1)
-; MIPS32-O32-NEXT:    lwc1 $f0, %lo($CPI3_0)($3)
-; MIPS32-O32-NEXT:    lh $2, 0($2)
-; MIPS32-O32-NEXT:    fill.h $w1, $2
-; MIPS32-O32-NEXT:    fexupr.w $w1, $w1
-; MIPS32-O32-NEXT:    copy_s.w $2, $w1[0]
-; MIPS32-O32-NEXT:    mtc1 $2, $f2
-; MIPS32-O32-NEXT:    sub.s $f0, $f2, $f0
-; MIPS32-O32-NEXT:    mfc1 $2, $f0
-; MIPS32-O32-NEXT:    fill.w $w0, $2
-; MIPS32-O32-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
-; MIPS32-O32-NEXT:    fexupr.d $w0, $w0
-; MIPS32-O32-NEXT:    copy_s.w $2, $w0[0]
-; MIPS32-O32-NEXT:    mtc1 $2, $f3
-; MIPS32-O32-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32-O32-NEXT:    mthc1 $2, $f3
-; MIPS32-O32-NEXT:    trunc.w.d $f0, $f3
-; MIPS32-O32-NEXT:    mfc1 $2, $f0
-; MIPS32-O32-NEXT:    fexupr.d $w0, $w1
-; MIPS32-O32-NEXT:    copy_s.w $3, $w0[0]
-; MIPS32-O32-NEXT:    mtc1 $3, $f1
-; MIPS32-O32-NEXT:    copy_s.w $3, $w0[1]
-; MIPS32-O32-NEXT:    mthc1 $3, $f1
-; MIPS32-O32-NEXT:    trunc.w.d $f0, $f1
-; MIPS32-O32-NEXT:    mfc1 $3, $f0
-; MIPS32-O32-NEXT:    lw $1, %got($CPI3_1)($1)
-; MIPS32-O32-NEXT:    addiu $1, $1, %lo($CPI3_1)
-; MIPS32-O32-NEXT:    lui $4, 32768
-; MIPS32-O32-NEXT:    xor $2, $2, $4
-; MIPS32-O32-NEXT:    lh $1, 0($1)
-; MIPS32-O32-NEXT:    fill.h $w0, $1
-; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
-; MIPS32-O32-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32-O32-NEXT:    mtc1 $1, $f0
-; MIPS32-O32-NEXT:    c.olt.s $f2, $f0
-; MIPS32-O32-NEXT:    jr $ra
-; MIPS32-O32-NEXT:    movt $2, $3, $fcc0
-;
-; MIPS64R5-N32-LABEL: ffptoui:
-; MIPS64R5-N32:       # %bb.0: # %entry
-; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
-; MIPS64R5-N32-NEXT:    addu $1, $1, $25
-; MIPS64R5-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
-; MIPS64R5-N32-NEXT:    lw $2, %got_disp(h)($1)
-; MIPS64R5-N32-NEXT:    lw $3, %got_page(.LCPI3_0)($1)
-; MIPS64R5-N32-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($3)
-; MIPS64R5-N32-NEXT:    lh $2, 0($2)
-; MIPS64R5-N32-NEXT:    fill.h $w1, $2
-; MIPS64R5-N32-NEXT:    fexupr.w $w1, $w1
-; MIPS64R5-N32-NEXT:    copy_s.w $2, $w1[0]
-; MIPS64R5-N32-NEXT:    mtc1 $2, $f2
-; MIPS64R5-N32-NEXT:    sub.s $f0, $f2, $f0
-; MIPS64R5-N32-NEXT:    mfc1 $2, $f0
-; MIPS64R5-N32-NEXT:    fill.w $w0, $2
-; MIPS64R5-N32-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
-; MIPS64R5-N32-NEXT:    fexupr.d $w0, $w0
-; MIPS64R5-N32-NEXT:    copy_s.d $2, $w0[0]
-; MIPS64R5-N32-NEXT:    dmtc1 $2, $f0
-; MIPS64R5-N32-NEXT:    trunc.w.d $f0, $f0
-; MIPS64R5-N32-NEXT:    mfc1 $2, $f0
-; MIPS64R5-N32-NEXT:    fexupr.d $w0, $w1
-; MIPS64R5-N32-NEXT:    copy_s.d $3, $w0[0]
-; MIPS64R5-N32-NEXT:    dmtc1 $3, $f0
-; MIPS64R5-N32-NEXT:    trunc.w.d $f0, $f0
-; MIPS64R5-N32-NEXT:    mfc1 $3, $f0
-; MIPS64R5-N32-NEXT:    lw $1, %got_page(.LCPI3_1)($1)
-; MIPS64R5-N32-NEXT:    addiu $1, $1, %got_ofst(.LCPI3_1)
-; MIPS64R5-N32-NEXT:    lui $4, 32768
-; MIPS64R5-N32-NEXT:    xor $2, $2, $4
-; MIPS64R5-N32-NEXT:    lh $1, 0($1)
-; MIPS64R5-N32-NEXT:    fill.h $w0, $1
-; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
-; MIPS64R5-N32-NEXT:    copy_s.w $1, $w0[0]
-; MIPS64R5-N32-NEXT:    mtc1 $1, $f0
-; MIPS64R5-N32-NEXT:    c.olt.s $f2, $f0
-; MIPS64R5-N32-NEXT:    jr $ra
-; MIPS64R5-N32-NEXT:    movt $2, $3, $fcc0
-;
-; MIPS64R5-N64-LABEL: ffptoui:
-; MIPS64R5-N64:       # %bb.0: # %entry
-; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
-; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
-; MIPS64R5-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
-; MIPS64R5-N64-NEXT:    ld $2, %got_disp(h)($1)
-; MIPS64R5-N64-NEXT:    ld $3, %got_page(.LCPI3_0)($1)
-; MIPS64R5-N64-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($3)
-; MIPS64R5-N64-NEXT:    lh $2, 0($2)
-; MIPS64R5-N64-NEXT:    fill.h $w1, $2
-; MIPS64R5-N64-NEXT:    fexupr.w $w1, $w1
-; MIPS64R5-N64-NEXT:    copy_s.w $2, $w1[0]
-; MIPS64R5-N64-NEXT:    mtc1 $2, $f2
-; MIPS64R5-N64-NEXT:    sub.s $f0, $f2, $f0
-; MIPS64R5-N64-NEXT:    mfc1 $2, $f0
-; MIPS64R5-N64-NEXT:    fill.w $w0, $2
-; MIPS64R5-N64-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
-; MIPS64R5-N64-NEXT:    fexupr.d $w0, $w0
-; MIPS64R5-N64-NEXT:    copy_s.d $2, $w0[0]
-; MIPS64R5-N64-NEXT:    dmtc1 $2, $f0
-; MIPS64R5-N64-NEXT:    trunc.w.d $f0, $f0
-; MIPS64R5-N64-NEXT:    mfc1 $2, $f0
-; MIPS64R5-N64-NEXT:    fexupr.d $w0, $w1
-; MIPS64R5-N64-NEXT:    copy_s.d $3, $w0[0]
-; MIPS64R5-N64-NEXT:    dmtc1 $3, $f0
-; MIPS64R5-N64-NEXT:    trunc.w.d $f0, $f0
-; MIPS64R5-N64-NEXT:    mfc1 $3, $f0
-; MIPS64R5-N64-NEXT:    ld $1, %got_page(.LCPI3_1)($1)
-; MIPS64R5-N64-NEXT:    daddiu $1, $1, %got_ofst(.LCPI3_1)
-; MIPS64R5-N64-NEXT:    lui $4, 32768
-; MIPS64R5-N64-NEXT:    xor $2, $2, $4
-; MIPS64R5-N64-NEXT:    lh $1, 0($1)
-; MIPS64R5-N64-NEXT:    fill.h $w0, $1
-; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
-; MIPS64R5-N64-NEXT:    copy_s.w $1, $w0[0]
-; MIPS64R5-N64-NEXT:    mtc1 $1, $f0
-; MIPS64R5-N64-NEXT:    c.olt.s $f2, $f0
-; MIPS64R5-N64-NEXT:    jr $ra
-; MIPS64R5-N64-NEXT:    movt $2, $3, $fcc0
-;
-; MIPSR6-O32-LABEL: ffptoui:
-; MIPSR6-O32:       # %bb.0: # %entry
-; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
-; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
-; MIPSR6-O32-NEXT:    addu $1, $2, $25
-; MIPSR6-O32-NEXT:    lw $2, %got(h)($1)
-; MIPSR6-O32-NEXT:    lw $1, %got($CPI3_0)($1)
-; MIPSR6-O32-NEXT:    lwc1 $f0, %lo($CPI3_0)($1)
-; MIPSR6-O32-NEXT:    lh $1, 0($2)
-; MIPSR6-O32-NEXT:    fill.h $w1, $1
-; MIPSR6-O32-NEXT:    fexupr.w $w1, $w1
-; MIPSR6-O32-NEXT:    copy_s.w $1, $w1[0]
-; MIPSR6-O32-NEXT:    mtc1 $1, $f2
-; MIPSR6-O32-NEXT:    cmp.lt.s $f3, $f2, $f0
-; MIPSR6-O32-NEXT:    sub.s $f0, $f2, $f0
-; MIPSR6-O32-NEXT:    mfc1 $1, $f0
-; MIPSR6-O32-NEXT:    fill.w $w0, $1
-; MIPSR6-O32-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
-; MIPSR6-O32-NEXT:    fexupr.d $w0, $w0
-; MIPSR6-O32-NEXT:    copy_s.w $1, $w0[0]
-; MIPSR6-O32-NEXT:    mtc1 $1, $f2
-; MIPSR6-O32-NEXT:    copy_s.w $1, $w0[1]
-; MIPSR6-O32-NEXT:    mthc1 $1, $f2
-; MIPSR6-O32-NEXT:    trunc.w.d $f0, $f2
-; MIPSR6-O32-NEXT:    mfc1 $1, $f0
-; MIPSR6-O32-NEXT:    fexupr.d $w0, $w1
-; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
-; MIPSR6-O32-NEXT:    mtc1 $2, $f1
-; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[1]
-; MIPSR6-O32-NEXT:    mthc1 $2, $f1
-; MIPSR6-O32-NEXT:    trunc.w.d $f0, $f1
-; MIPSR6-O32-NEXT:    mfc1 $2, $f0
-; MIPSR6-O32-NEXT:    lui $3, 32768
-; MIPSR6-O32-NEXT:    xor $1, $1, $3
-; MIPSR6-O32-NEXT:    mfc1 $3, $f3
-; MIPSR6-O32-NEXT:    seleqz $1, $1, $3
-; MIPSR6-O32-NEXT:    selnez $2, $2, $3
-; MIPSR6-O32-NEXT:    jr $ra
-; MIPSR6-O32-NEXT:    or $2, $2, $1
+; MIPS32-LABEL: ffptoui:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(h)($1)
+; MIPS32-NEXT:    lh $1, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    fexupr.d $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    mtc1 $1, $f1
+; MIPS32-NEXT:    copy_s.w $1, $w0[1]
+; MIPS32-NEXT:    mthc1 $1, $f1
+; MIPS32-NEXT:    trunc.w.d $f0, $f1
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    mfc1 $2, $f0
 ;
-; MIPSR6-N32-LABEL: ffptoui:
-; MIPSR6-N32:       # %bb.0: # %entry
-; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
-; MIPSR6-N32-NEXT:    addu $1, $1, $25
-; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
-; MIPSR6-N32-NEXT:    lw $2, %got_disp(h)($1)
-; MIPSR6-N32-NEXT:    lw $1, %got_page(.LCPI3_0)($1)
-; MIPSR6-N32-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($1)
-; MIPSR6-N32-NEXT:    lh $1, 0($2)
-; MIPSR6-N32-NEXT:    fill.h $w1, $1
-; MIPSR6-N32-NEXT:    fexupr.w $w1, $w1
-; MIPSR6-N32-NEXT:    copy_s.w $1, $w1[0]
-; MIPSR6-N32-NEXT:    mtc1 $1, $f2
-; MIPSR6-N32-NEXT:    cmp.lt.s $f3, $f2, $f0
-; MIPSR6-N32-NEXT:    sub.s $f0, $f2, $f0
-; MIPSR6-N32-NEXT:    mfc1 $1, $f0
-; MIPSR6-N32-NEXT:    fill.w $w0, $1
-; MIPSR6-N32-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
-; MIPSR6-N32-NEXT:    fexupr.d $w0, $w0
-; MIPSR6-N32-NEXT:    copy_s.d $1, $w0[0]
-; MIPSR6-N32-NEXT:    dmtc1 $1, $f0
-; MIPSR6-N32-NEXT:    trunc.w.d $f0, $f0
-; MIPSR6-N32-NEXT:    mfc1 $1, $f0
-; MIPSR6-N32-NEXT:    fexupr.d $w0, $w1
-; MIPSR6-N32-NEXT:    copy_s.d $2, $w0[0]
-; MIPSR6-N32-NEXT:    dmtc1 $2, $f0
-; MIPSR6-N32-NEXT:    trunc.w.d $f0, $f0
-; MIPSR6-N32-NEXT:    mfc1 $2, $f0
-; MIPSR6-N32-NEXT:    lui $3, 32768
-; MIPSR6-N32-NEXT:    xor $1, $1, $3
-; MIPSR6-N32-NEXT:    mfc1 $3, $f3
-; MIPSR6-N32-NEXT:    seleqz $1, $1, $3
-; MIPSR6-N32-NEXT:    selnez $2, $2, $3
-; MIPSR6-N32-NEXT:    jr $ra
-; MIPSR6-N32-NEXT:    or $2, $2, $1
+; MIPS64-N32-LABEL: ffptoui:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(h)($1)
+; MIPS64-N32-NEXT:    lh $1, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64-N32-NEXT:    dmtc1 $1, $f0
+; MIPS64-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
 ;
-; MIPSR6-N64-LABEL: ffptoui:
-; MIPSR6-N64:       # %bb.0: # %entry
-; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
-; MIPSR6-N64-NEXT:    daddu $1, $1, $25
-; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
-; MIPSR6-N64-NEXT:    ld $2, %got_disp(h)($1)
-; MIPSR6-N64-NEXT:    ld $1, %got_page(.LCPI3_0)($1)
-; MIPSR6-N64-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($1)
-; MIPSR6-N64-NEXT:    lh $1, 0($2)
-; MIPSR6-N64-NEXT:    fill.h $w1, $1
-; MIPSR6-N64-NEXT:    fexupr.w $w1, $w1
-; MIPSR6-N64-NEXT:    copy_s.w $1, $w1[0]
-; MIPSR6-N64-NEXT:    mtc1 $1, $f2
-; MIPSR6-N64-NEXT:    cmp.lt.s $f3, $f2, $f0
-; MIPSR6-N64-NEXT:    sub.s $f0, $f2, $f0
-; MIPSR6-N64-NEXT:    mfc1 $1, $f0
-; MIPSR6-N64-NEXT:    fill.w $w0, $1
-; MIPSR6-N64-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
-; MIPSR6-N64-NEXT:    fexupr.d $w0, $w0
-; MIPSR6-N64-NEXT:    copy_s.d $1, $w0[0]
-; MIPSR6-N64-NEXT:    dmtc1 $1, $f0
-; MIPSR6-N64-NEXT:    trunc.w.d $f0, $f0
-; MIPSR6-N64-NEXT:    mfc1 $1, $f0
-; MIPSR6-N64-NEXT:    fexupr.d $w0, $w1
-; MIPSR6-N64-NEXT:    copy_s.d $2, $w0[0]
-; MIPSR6-N64-NEXT:    dmtc1 $2, $f0
-; MIPSR6-N64-NEXT:    trunc.w.d $f0, $f0
-; MIPSR6-N64-NEXT:    mfc1 $2, $f0
-; MIPSR6-N64-NEXT:    lui $3, 32768
-; MIPSR6-N64-NEXT:    xor $1, $1, $3
-; MIPSR6-N64-NEXT:    mfc1 $3, $f3
-; MIPSR6-N64-NEXT:    seleqz $1, $1, $3
-; MIPSR6-N64-NEXT:    selnez $2, $2, $3
-; MIPSR6-N64-NEXT:    jr $ra
-; MIPSR6-N64-NEXT:    or $2, $2, $1
+; MIPS64-N64-LABEL: ffptoui:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(h)($1)
+; MIPS64-N64-NEXT:    lh $1, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64-N64-NEXT:    dmtc1 $1, $f0
+; MIPS64-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
 entry:
   %0 = load half, half * @h, align 2
   %1 = fptoui half %0 to i32
-
-
-
-
-
-
-
-
-
   ret i32 %1
 }
 
diff --git a/test/CodeGen/NVPTX/f16x2-instructions.ll b/test/CodeGen/NVPTX/f16x2-instructions.ll
index 1dae1de580b0b4cac5b139f12eb110fd071f4d2b..77ce45a9dc3a5a798622343e0f0b81ebdbfc6870 100644
--- a/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -823,10 +823,8 @@ define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
 
 ; CHECK-LABEL: test_uitofp_2xi64(
 ; CHECK:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0];
-; CHECK-DAG:  cvt.rn.f32.u64  [[F0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f32.u64  [[F1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[F0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[F1]];
+; CHECK-DAG:  cvt.rn.f16.u64  [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.u64  [[R1:%h[0-9]+]], [[A1]];
 ; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
@@ -849,10 +847,8 @@ define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
 
 ; CHECK-LABEL: test_sitofp_2xi64(
 ; CHECK:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0];
-; CHECK-DAG:  cvt.rn.f32.s64  [[F0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f32.s64  [[F1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[F0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[F1]];
+; CHECK-DAG:  cvt.rn.f16.s64  [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.s64  [[R1:%h[0-9]+]], [[A1]];
 ; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK:      st.param.b32    [func_retval0+0], [[R]];
 ; CHECK:      ret;
diff --git a/test/CodeGen/NVPTX/i128-struct.ll b/test/CodeGen/NVPTX/i128-struct.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c619be40f369446418ef7c4d98f402247bbfaf25
--- /dev/null
+++ b/test/CodeGen/NVPTX/i128-struct.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[32]) foo(
+define { i128, i128 } @foo(i64 %a, i32 %b) {
+  %1 = sext i64 %a to i128
+  %2 = sext i32 %b to i128
+  %3 = insertvalue { i128, i128 } undef, i128 %1, 0
+  %4 = insertvalue { i128, i128 } %3, i128 %2, 1
+
+  ; CHECK: st.param.v2.b64 [func_retval0+0],  {%[[REG1:rd[0-9]+]], %[[REG2:rd[0-9]+]]};
+  ; CHECK: st.param.v2.b64 [func_retval0+16], {%[[REG3:rd[0-9]+]], %[[REG4:rd[0-9]+]]};
+  ret { i128, i128 } %4
+}
diff --git a/test/CodeGen/NVPTX/libcall-instruction.ll b/test/CodeGen/NVPTX/libcall-instruction.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0c2cab7eaa5a1b435b900f91fe1fff03d7e892d5
--- /dev/null
+++ b/test/CodeGen/NVPTX/libcall-instruction.ll
@@ -0,0 +1,8 @@
+; RUN: not llc < %s -march=nvptx 2>&1 | FileCheck %s
+; used to panic on failed assetion and now fails with a "Cannot select"
+
+; CHECK: LLVM ERROR: Cannot select: {{t28|0x[0-9a-f]+}}: i32 = ExternalSymbol'__umodti3'
+define hidden i128 @remainder(i128, i128) {
+  %3 = urem i128 %0, %1
+  ret i128 %3
+}
diff --git a/test/CodeGen/NVPTX/nofunc.ll b/test/CodeGen/NVPTX/nofunc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e9dcffbdb7faef84cde0a00c24e5be0de50d7754
--- /dev/null
+++ b/test/CodeGen/NVPTX/nofunc.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; Test that we don't crash if we're compiling a module with function references,
+; but without any functions in it.
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+@Funcs = local_unnamed_addr addrspace(1) externally_initialized
+         global [1 x void (i8*)*] [void (i8*)* @func], align 8
+
+declare void @func(i8*)
+
+; CHECK: Funcs[1] = {func}
diff --git a/test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll b/test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll
index fa5916aa98e1173fd12831d8ae494187fa2b86a9..38b7d51b4a7457a969212821cea15da4ceac877e 100644
--- a/test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll
+++ b/test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
 ; The instructions addis,addi, bl are used to calculate the address of TLS
 ; thread local variables. These TLS access code sequences are generated
 ; repeatedly every time the thread local variable is accessed. By communicating
diff --git a/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
index d31a5553bf9f684dd499cd91c850d64008e093c3..2e1a4cec2b52be3cc1dacdfaa34a0825719e2613 100644
--- a/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
+++ b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O2 < %s | FileCheck %s
+; RUN: llc -O2 -verify-machineinstrs < %s | FileCheck %s
 ; ModuleID = 'bugpoint-reduced-simplified.bc'
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
diff --git a/test/CodeGen/PowerPC/addegluecrash.ll b/test/CodeGen/PowerPC/addegluecrash.ll
index a1d9805458368bdca9f4d58f9c1bf3ab1ca392bc..a7653735eaac51e0a5c8443204ab8253c98432aa 100644
--- a/test/CodeGen/PowerPC/addegluecrash.ll
+++ b/test/CodeGen/PowerPC/addegluecrash.ll
@@ -27,6 +27,7 @@ define void @bn_mul_comba8(i64* nocapture %r, i64* nocapture readonly %a, i64* n
 ; CHECK-NEXT:    mr 4, 10
 ; CHECK-NEXT:    clrldi 4, 4, 32
 ; CHECK-NEXT:    std 4, 0(3)
+; CHECK-NEXT:    std 6, -8(1) # 8-byte Folded Spill
 ; CHECK-NEXT:    blr
   %1 = load i64, i64* %a, align 8
   %conv = zext i64 %1 to i128
diff --git a/test/CodeGen/PowerPC/atomics-constant.ll b/test/CodeGen/PowerPC/atomics-constant.ll
index 559cd9eb656ac8531e09cd04d77d35b1df97d5f1..ac0b16b55f7c1a27c2e14ebc6d8b2ad4cf8df00c 100644
--- a/test/CodeGen/PowerPC/atomics-constant.ll
+++ b/test/CodeGen/PowerPC/atomics-constant.ll
@@ -8,14 +8,14 @@ target triple = "powerpc64le-unknown-linux-gnu"
 define i64 @foo() {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis 3, 2, .LC0@toc@ha
+; CHECK-NEXT:    addis 3, 2, a@toc@ha
 ; CHECK-NEXT:    li 4, 0
-; CHECK-NEXT:    ld 3, .LC0@toc@l(3)
+; CHECK-NEXT:    addi 3, 3, a@toc@l
 ; CHECK-NEXT:    cmpd 7, 4, 4
 ; CHECK-NEXT:    ld 3, 0(3)
+; CHECK-NEXT:    li 3, 0
 ; CHECK-NEXT:    bne- 7, .+4
 ; CHECK-NEXT:    isync
-; CHECK-NEXT:    li 3, 0
 ; CHECK-NEXT:    blr
 entry:
   %value = load atomic i64, i64* @a acquire, align 8
diff --git a/test/CodeGen/PowerPC/brcond.ll b/test/CodeGen/PowerPC/brcond.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5bf8c6c1f35550fa9f124c66aa28bc2281b311ae
--- /dev/null
+++ b/test/CodeGen/PowerPC/brcond.ll
@@ -0,0 +1,602 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
+
+define signext i32 @testi32slt(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32slt
+; CHECK: crorc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp slt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32ult(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32ult
+; CHECK: crorc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp ult i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32sle(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32sle
+; CHECK: crandc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp sle i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32ule(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32ule
+; CHECK: crandc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp ule i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32eq(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32eq:
+; CHECK: crxor [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp eq i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32sge(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32sge:
+; CHECK: crandc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp sge i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32uge(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32uge:
+; CHECK: crandc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp uge i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32sgt(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32sgt:
+; CHECK: crorc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32ugt(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32ugt:
+; CHECK: crorc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define signext i32 @testi32ne(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 {
+; CHECK-LABEL: testi32ne:
+; CHECK: creqv [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i32 %c3, %c4
+  %cmp3tmp = icmp eq i32 %c1, %c2
+  %cmp3 = icmp ne i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i32 %a1
+iffalse:
+  ret i32 %a2
+}
+
+define i64 @testi64slt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64slt
+; CHECK: crorc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp slt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64ult(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64ult
+; CHECK: crorc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp ult i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64sle(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64sle
+; CHECK: crandc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp sle i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64ule(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64ule
+; CHECK: crandc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp ule i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64eq(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64eq
+; CHECK: crxor [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp eq i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64sge(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64sge
+; CHECK: crandc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp sge i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64uge(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64uge
+; CHECK: crandc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp uge i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64sgt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64sgt
+; CHECK: crorc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64ugt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64ugt
+; CHECK: crorc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define i64 @testi64ne(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 {
+; CHECK-LABEL: testi64ne
+; CHECK: creqv [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = icmp eq i64 %c3, %c4
+  %cmp3tmp = icmp eq i64 %c1, %c2
+  %cmp3 = icmp ne i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret i64 %a1
+iffalse:
+  ret i64 %a2
+}
+
+define float @testfloatslt(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatslt
+; CHECK: crorc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp slt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloatult(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatult
+; CHECK: crorc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp ult i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloatsle(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatsle
+; CHECK: crandc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp sle i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloatule(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatule
+; CHECK: crandc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp ule i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloateq(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloateq
+; CHECK: crxor [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp eq i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloatsge(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatsge
+; CHECK: crandc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp sge i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloatuge(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatuge
+; CHECK: crandc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp uge i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloatsgt(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatsgt
+; CHECK: crorc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloatugt(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatugt
+; CHECK: crorc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define float @testfloatne(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 {
+; CHECK-LABEL: testfloatne
+; CHECK: creqv [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq float %c3, %c4
+  %cmp3tmp = fcmp oeq float %c1, %c2
+  %cmp3 = icmp ne i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret float %a1
+iffalse:
+  ret float %a2
+}
+
+define double @testdoubleslt(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoubleslt
+; CHECK: crorc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp slt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoubleult(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoubleult:
+; CHECK: crorc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp ult i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoublesle(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoublesle
+; CHECK: crandc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp sle i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoubleule(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoubleule:
+; CHECK: crandc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp ule i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoubleeq(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoubleeq
+; CHECK: crxor [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp eq i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoublesge(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoublesge
+; CHECK: crandc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp sge i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoubleuge(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoubleuge
+; CHECK: crandc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp uge i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoublesgt(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoublesgt:
+; CHECK: crorc [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoubleugt(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoubleugt
+; CHECK: crorc [[REG:[0-9]+]], 2, 6
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
+
+define double @testdoublene(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 {
+; CHECK-LABEL: testdoublene
+; CHECK: creqv [[REG:[0-9]+]], 6, 2
+; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}}
+entry:
+  %cmp1 = fcmp oeq double %c3, %c4
+  %cmp3tmp = fcmp oeq double %c1, %c2
+  %cmp3 = icmp ne i1 %cmp3tmp, %cmp1
+  br i1 %cmp3, label %iftrue, label %iffalse
+iftrue:
+  ret double %a1
+iffalse:
+  ret double %a2
+}
diff --git a/test/CodeGen/PowerPC/bswap64.ll b/test/CodeGen/PowerPC/bswap64.ll
index 0a78aa2dc548b301e210dc769cf02ff80c1e5a68..afc69515b2443038a8de481a61b0688a50f04d9a 100644
--- a/test/CodeGen/PowerPC/bswap64.ll
+++ b/test/CodeGen/PowerPC/bswap64.ll
@@ -1,11 +1,35 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64le-- -mcpu=pwr9 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -mcpu=pwr9 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unkknown-unkknown \
+; RUN:   -mcpu=pwr9 -mattr=-altivec | FileCheck %s --check-prefix=NO-ALTIVEC
 
 declare i64 @llvm.bswap.i64(i64)
 
-; CHECK: mtvsrdd
-; CHECK: xxbrd
-; CHECK: mfvsrd
 define i64 @bswap64(i64 %x) {
+; CHECK-LABEL: bswap64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mtvsrdd 34, 3, 3
+; CHECK-NEXT:    xxbrd 0, 34
+; CHECK-NEXT:    mfvsrd 3, 0
+; CHECK-NEXT:    blr
+;
+; NO-ALTIVEC-LABEL: bswap64:
+; NO-ALTIVEC:       # %bb.0: # %entry
+; NO-ALTIVEC-NEXT:    rotldi 5, 3, 16
+; NO-ALTIVEC-NEXT:    rotldi 4, 3, 8
+; NO-ALTIVEC-NEXT:    rldimi 4, 5, 8, 48
+; NO-ALTIVEC-NEXT:    rotldi 5, 3, 24
+; NO-ALTIVEC-NEXT:    rldimi 4, 5, 16, 40
+; NO-ALTIVEC-NEXT:    rotldi 5, 3, 32
+; NO-ALTIVEC-NEXT:    rldimi 4, 5, 24, 32
+; NO-ALTIVEC-NEXT:    rotldi 5, 3, 48
+; NO-ALTIVEC-NEXT:    rldimi 4, 5, 40, 16
+; NO-ALTIVEC-NEXT:    rotldi 5, 3, 56
+; NO-ALTIVEC-NEXT:    rldimi 4, 5, 48, 8
+; NO-ALTIVEC-NEXT:    rldimi 4, 3, 56, 0
+; NO-ALTIVEC-NEXT:    mr 3, 4
+; NO-ALTIVEC-NEXT:    blr
 entry:
   %0 = call i64 @llvm.bswap.i64(i64 %x)
   ret i64 %0
diff --git a/test/CodeGen/PowerPC/codemodel.ll b/test/CodeGen/PowerPC/codemodel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ee3ceae6df234f8c2ba98a9a63da4c924e95da25
--- /dev/null
+++ b/test/CodeGen/PowerPC/codemodel.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -verify-machineinstrs -o - -mtriple=powerpc-pc-linux -code-model=tiny < %s 2>&1 | FileCheck %s --check-prefix=TINY
+; RUN: not llc -verify-machineinstrs -o - -mtriple=powerpc-pc-linux -code-model=kernel < %s 2>&1 | FileCheck %s --check-prefix=KERNEL
+
+; TINY:    Target does not support the tiny CodeModel
+; KERNEL:    Target does not support the kernel CodeModel
+
+define void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/convert-rr-to-ri-p9-vector.mir b/test/CodeGen/PowerPC/convert-rr-to-ri-p9-vector.mir
new file mode 100644
index 0000000000000000000000000000000000000000..be3e8e39ef423f1a41708b4ca7fb630fee42b1a2
--- /dev/null
+++ b/test/CodeGen/PowerPC/convert-rr-to-ri-p9-vector.mir
@@ -0,0 +1,162 @@
+# RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu -start-after \ 
+# RUN:   ppc-mi-peepholes -ppc-late-peephole %s -o - | FileCheck %s 
+
+---
+name:            testLXSSPX
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 1, class: g8rc, preferred-register: '' }
+  - { id: 2, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 3, class: gprc, preferred-register: '' }
+  - { id: 4, class: g8rc, preferred-register: '' }
+  - { id: 5, class: g8rc, preferred-register: '' }
+  - { id: 6, class: g8rc, preferred-register: '' }
+  - { id: 7, class: vssrc, preferred-register: '' }
+  - { id: 8, class: gprc, preferred-register: '' }
+  - { id: 9, class: g8rc, preferred-register: '' }
+  - { id: 10, class: g8rc, preferred-register: '' }
+  - { id: 11, class: g8rc, preferred-register: '' }
+  - { id: 12, class: vssrc, preferred-register: '' }
+  - { id: 13, class: vssrc, preferred-register: '' }
+liveins:
+  - { reg: '$x3', virtual-reg: '%0' }
+  - { reg: '$x4', virtual-reg: '%1' }
+body:             |
+  bb.0.entry:
+    liveins: $x3, $x4
+
+    %1 = COPY $x4
+    %0 = COPY $x3
+    %2 = COPY %1.sub_32
+    %3 = ADDI %2, 1
+    %5 = IMPLICIT_DEF
+    %4 = INSERT_SUBREG %5, killed %3, 1
+    %6 = LI8 97
+    %7 = LXSSPX %0, killed %6, implicit $rm
+    ; CHECK:  lfs [[REG1:[0-9]+]], 97(3)
+    %8 = ADDI %2, 2
+    %10 = IMPLICIT_DEF
+    %9 = INSERT_SUBREG %10, killed %8, 1
+    %11 = LI8 -92
+    %12 = LXSSPX %0, killed %11, implicit $rm
+    ; CHECK-NEXT:  lfs [[REG2:[0-9]+]], -92(3)
+    %13 = XSADDSP killed %7, killed %12
+    ; CHECK-NEXT:  xsaddsp {{[0-9]+}}, [[REG1]], [[REG2]]
+    $f1 = COPY %13
+    BLR8 implicit $lr8, implicit $rm, implicit $f1
+...
+
+
+---
+name:            testLXSDX
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 1, class: g8rc, preferred-register: '' }
+  - { id: 2, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 3, class: gprc, preferred-register: '' }
+  - { id: 4, class: g8rc, preferred-register: '' }
+  - { id: 5, class: g8rc, preferred-register: '' }
+  - { id: 6, class: g8rc, preferred-register: '' }
+  - { id: 7, class: vsfrc, preferred-register: '' }
+  - { id: 8, class: gprc, preferred-register: '' }
+  - { id: 9, class: g8rc, preferred-register: '' }
+  - { id: 10, class: g8rc, preferred-register: '' }
+  - { id: 11, class: g8rc, preferred-register: '' }
+  - { id: 12, class: vsfrc, preferred-register: '' }
+  - { id: 13, class: vsfrc, preferred-register: '' }
+liveins:         
+  - { reg: '$x3', virtual-reg: '%0' }
+  - { reg: '$x4', virtual-reg: '%1' }
+body:             |
+  bb.0.entry:
+    liveins: $x3, $x4
+  
+    %1 = COPY $x4
+    %0 = COPY $x3
+    %2 = COPY %1.sub_32
+    %3 = ADDI %2, 1
+    %5 = IMPLICIT_DEF
+    %4 = INSERT_SUBREG %5, killed %3, 1
+    %6 = LI8 99
+    %7 = LXSDX %0, killed %6, implicit $rm
+    ; CHECK:  lfd [[REG1:[0-9]+]], 99(3)
+    %8 = ADDI %2, 2
+    %10 = IMPLICIT_DEF
+    %9 = INSERT_SUBREG %10, killed %8, 1
+    %11 = LI8 -120
+    %12 = LXSDX %0, killed %11, implicit $rm
+    ; CHECK-NEXT:  lfd [[REG2:[0-9]+]], -120(3)
+    %13 = XSADDDP killed %7, killed %12, implicit $rm
+    ; CHECK-NEXT:  xsadddp {{[0-9]+}}, [[REG1]], [[REG2]]
+    $f1 = COPY %13
+    BLR8 implicit $lr8, implicit $rm, implicit $f1
+...
+
+
+---
+name:            testSTXSSPX
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 1, class: vssrc, preferred-register: '' }
+  - { id: 2, class: g8rc, preferred-register: '' }
+  - { id: 3, class: g8rc, preferred-register: '' }
+liveins:
+  - { reg: '$x3', virtual-reg: '%0' }
+  - { reg: '$f1', virtual-reg: '%1' }
+  - { reg: '$x5', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: $x3, $f1, $x5
+
+    %2 = COPY $x5
+    %1 = COPY $f1
+    %0 = COPY $x3
+    %3 = LI8 443
+    STXSSPX %1, %0, killed %3, implicit $rm
+    ; CHECK: stfs {{[0-9]+}}, 443(3)
+    BLR8 implicit $lr8, implicit $rm
+...
+
+
+---
+name:            testSTXSDX
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 1, class: vsfrc, preferred-register: '' }
+  - { id: 2, class: g8rc, preferred-register: '' }
+  - { id: 3, class: g8rc, preferred-register: '' }
+liveins:
+  - { reg: '$x3', virtual-reg: '%0' }
+  - { reg: '$f1', virtual-reg: '%1' }
+  - { reg: '$x5', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: $x3, $f1, $x5
+
+    %2 = COPY $x5
+    %1 = COPY $f1
+    %0 = COPY $x3
+    %3 = LI8  7
+    STXSDX %1, %0, killed %3, implicit $rm
+    ; CHECK: stfd {{[0-9]+}}, 7(3)
+    BLR8 implicit $lr8, implicit $rm
+...
diff --git a/test/CodeGen/PowerPC/cr-spills.ll b/test/CodeGen/PowerPC/cr-spills.ll
index 1a903115c0da3974842baddd19a60284acdf8bc1..170744679c8895e0e5b1ab1561b3b38b26d60b0a 100644
--- a/test/CodeGen/PowerPC/cr-spills.ll
+++ b/test/CodeGen/PowerPC/cr-spills.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7
 
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/ctr-cleanup.ll b/test/CodeGen/PowerPC/ctr-cleanup.ll
index 1a669eb051d86a0396fbb42ec23a3c7f0f588da7..0824bb2766f814ed23d38d0175fc37e6681818c6 100644
--- a/test/CodeGen/PowerPC/ctr-cleanup.ll
+++ b/test/CodeGen/PowerPC/ctr-cleanup.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=a2 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mcpu=a2 | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/ctrloop-large-ec.ll b/test/CodeGen/PowerPC/ctrloop-large-ec.ll
index cce23fabf63e5c6165bb0b1b32db3d4ff8125d49..0d139c3d549658efbeae8ef67e78c302169fb4ef 100644
--- a/test/CodeGen/PowerPC/ctrloop-large-ec.ll
+++ b/test/CodeGen/PowerPC/ctrloop-large-ec.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=ppc32 < %s | FileCheck %s
+; RUN: llc -mcpu=ppc32 -verify-machineinstrs < %s | FileCheck %s
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
 target triple = "powerpc-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/ctrloop-udivti3.ll b/test/CodeGen/PowerPC/ctrloop-udivti3.ll
index 54abd181f82cfafdfa56722c32306675f85b882a..c94233106b65d7b7b63d184023f74474b1e2b33e 100644
--- a/test/CodeGen/PowerPC/ctrloop-udivti3.ll
+++ b/test/CodeGen/PowerPC/ctrloop-udivti3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/early-ret2.ll b/test/CodeGen/PowerPC/early-ret2.ll
index f9758d3f7d4c5d9cd92df49be4c9f9b36e91e139..67a496adf14a0108a05f7eff1613b0b88e4c99f3 100644
--- a/test/CodeGen/PowerPC/early-ret2.ll
+++ b/test/CodeGen/PowerPC/early-ret2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-crbits | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s -check-prefix=CHECK-CRB
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-crbits | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s -check-prefix=CHECK-CRB
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/f128-aggregates.ll b/test/CodeGen/PowerPC/f128-aggregates.ll
index 8c934ba65862f02d4dbb5857be153832e5ca73c1..d671c76f5f48f728016bf4ed2252a49a48c0a38a 100644
--- a/test/CodeGen/PowerPC/f128-aggregates.ll
+++ b/test/CodeGen/PowerPC/f128-aggregates.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
+; RUN: llc -relocation-model=pic -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
 ; RUN:   -enable-ppc-quad-precision -verify-machineinstrs \
 ; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s
-; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-unknown \
+; RUN: llc -relocation-model=pic -mcpu=pwr9 -mtriple=powerpc64-unknown-unknown \
 ; RUN:   -enable-ppc-quad-precision -verify-machineinstrs \
 ; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s \
 ; RUN:   | FileCheck -check-prefix=CHECK-BE %s
diff --git a/test/CodeGen/PowerPC/f128-conv.ll b/test/CodeGen/PowerPC/f128-conv.ll
index 6c8d5964cb321928444b2a79fc3fb0e462fa5ad7..ef433bd3e45ecbd00f4d313eadb6a7b6e552a1ab 100644
--- a/test/CodeGen/PowerPC/f128-conv.ll
+++ b/test/CodeGen/PowerPC/f128-conv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
+; RUN: llc -relocation-model=pic -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
 ; RUN:   -enable-ppc-quad-precision -ppc-vsr-nums-as-vr \
 ; RUN:   -verify-machineinstrs -ppc-asm-full-reg-names < %s | FileCheck %s
 
diff --git a/test/CodeGen/PowerPC/f128-truncateNconv.ll b/test/CodeGen/PowerPC/f128-truncateNconv.ll
index d5ffea69fac21886f7f8ba116a1455b4893bbdb3..d346683d502263ffbb8e4b9f5bdfe5604b6cbca0 100644
--- a/test/CodeGen/PowerPC/f128-truncateNconv.ll
+++ b/test/CodeGen/PowerPC/f128-truncateNconv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
+; RUN: llc -relocation-model=pic -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
 ; RUN:   -verify-machineinstrs -enable-ppc-quad-precision \
 ; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s
 
diff --git a/test/CodeGen/PowerPC/f128-vecExtractNconv.ll b/test/CodeGen/PowerPC/f128-vecExtractNconv.ll
index 3ab5ee941421dc8fc4d7bf4ecfa0225770d84d90..bae676cd09cd34a8fdc2fbaf277a391b6a41eda1 100644
--- a/test/CodeGen/PowerPC/f128-vecExtractNconv.ll
+++ b/test/CodeGen/PowerPC/f128-vecExtractNconv.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown -ppc-vsr-nums-as-vr \
-; RUN:   -ppc-asm-full-reg-names -verify-machineinstrs \
+; RUN:   -relocation-model=pic -ppc-asm-full-reg-names -verify-machineinstrs \
 ; RUN:   -enable-ppc-quad-precision < %s | FileCheck %s
 ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-unknown -ppc-vsr-nums-as-vr \
 ; RUN:   -ppc-asm-full-reg-names -verify-machineinstrs \
diff --git a/test/CodeGen/PowerPC/fast-isel-call.ll b/test/CodeGen/PowerPC/fast-isel-call.ll
index a080baedd8e43f9f807b2e7cc5f9b15d1a69d66f..8823beb96844b841af220743cd9aca6ca7443f5d 100644
--- a/test/CodeGen/PowerPC/fast-isel-call.ll
+++ b/test/CodeGen/PowerPC/fast-isel-call.ll
@@ -2,7 +2,7 @@
 ; registers and with -fast-isel-abort=1 turned on the test case will then fail.
 ; When fastisel better supports VSX fix up this test case.
 ;
-; RUN: llc < %s -O0 -verify-machineinstrs -mattr=-vsx -fast-isel-abort=1 -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -ppc-late-peephole=true | FileCheck %s --check-prefix=ELF64
+; RUN: llc < %s -O0 -relocation-model=pic -verify-machineinstrs -mattr=-vsx -fast-isel-abort=1 -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -ppc-late-peephole=true | FileCheck %s --check-prefix=ELF64
 
 define i32 @t1(i8 signext %a) nounwind {
   %1 = sext i8 %a to i32
diff --git a/test/CodeGen/PowerPC/hoist-logic.ll b/test/CodeGen/PowerPC/hoist-logic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e9b052e94cd40498d6ab956a6b0091c113fd1c08
--- /dev/null
+++ b/test/CodeGen/PowerPC/hoist-logic.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
+
+; This is good - eliminate an op by hoisting logic.
+
+define i32 @lshr_or(i32 %x, i32 %y, i32 %z, i32* %p1, i32* %p2) {
+; CHECK-LABEL: lshr_or:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    srw 3, 3, 5
+; CHECK-NEXT:    blr
+  %xt = lshr i32 %x, %z
+  %yt = lshr i32 %y, %z
+  %r = or i32 %xt, %yt
+  ret i32 %r
+}
+
+; This is questionable - hoisting doesn't eliminate anything.
+; It might result in an extra register move.
+
+define i32 @lshr_or_multiuse1(i32 %x, i32 %y, i32 %z, i32* %p1, i32* %p2) {
+; CHECK-LABEL: lshr_or_multiuse1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srw 7, 3, 5
+; CHECK-NEXT:    srw 3, 4, 5
+; CHECK-NEXT:    or 3, 7, 3
+; CHECK-NEXT:    stw 7, 0(6)
+; CHECK-NEXT:    blr
+  %xt = lshr i32 %x, %z
+  %yt = lshr i32 %y, %z
+  store i32 %xt, i32* %p1
+  %r = or i32 %xt, %yt
+  ret i32 %r
+}
+
+; This is questionable - hoisting doesn't eliminate anything.
+
+define i32 @lshr_multiuse2(i32 %x, i32 %y, i32 %z, i32* %p1, i32* %p2) {
+; CHECK-LABEL: lshr_multiuse2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srw 3, 3, 5
+; CHECK-NEXT:    srw 4, 4, 5
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    stw 4, 0(7)
+; CHECK-NEXT:    blr
+  %xt = lshr i32 %x, %z
+  %yt = lshr i32 %y, %z
+  store i32 %yt, i32* %p2
+  %r = or i32 %xt, %yt
+  ret i32 %r
+}
+
+; This is not profitable to hoist. We need an extra shift instruction.
+
+define i32 @lshr_multiuse3(i32 %x, i32 %y, i32 %z, i32* %p1, i32* %p2) {
+; CHECK-LABEL: lshr_multiuse3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srw 3, 3, 5
+; CHECK-NEXT:    srw 4, 4, 5
+; CHECK-NEXT:    stw 3, 0(6)
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    stw 4, 0(7)
+; CHECK-NEXT:    blr
+  %xt = lshr i32 %x, %z
+  %yt = lshr i32 %y, %z
+  store i32 %xt, i32* %p1
+  store i32 %yt, i32* %p2
+  %r = or i32 %xt, %yt
+  ret i32 %r
+}
+
diff --git a/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll b/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll
index baa8d87562cf34caf649990369b0a7ab9ad4854d..de75469f16baf07c78ca18d715b9d35baf47c7c7 100644
--- a/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll
+++ b/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll
@@ -1,5 +1,5 @@
 ; ModuleID = 'bugpoint-reduced-instructions.bc'
-; RUN: llc -O2 -o - %s | FileCheck %s
+; RUN: llc -O2 -o - %s -verify-machineinstrs | FileCheck %s
 source_filename = "bugpoint-output-9ad75f8.bc"
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/jaggedstructs.ll b/test/CodeGen/PowerPC/jaggedstructs.ll
index 6128316f45fa3d0712cab94d96acb36fa405d0e8..45e043a669d74fccdcc8dddd24ed60c8c2c32172 100644
--- a/test/CodeGen/PowerPC/jaggedstructs.ll
+++ b/test/CodeGen/PowerPC/jaggedstructs.ll
@@ -34,11 +34,9 @@ entry:
 ; CHECK-DAG: lwz {{[0-9]+}}, 178(1)
 ; CHECK-DAG: sth {{[0-9]+}}, 70(1)
 ; CHECK-DAG: stw {{[0-9]+}}, 66(1)
-; CHECK-DAG: lbz {{[0-9]+}}, 191(1)
-; CHECK-DAG: lhz {{[0-9]+}}, 189(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 188(1)
 ; CHECK-DAG: lwz {{[0-9]+}}, 185(1)
-; CHECK-DAG: stb {{[0-9]+}}, 79(1)
-; CHECK-DAG: sth {{[0-9]+}}, 77(1)
+; CHECK-DAG: stw {{[0-9]+}}, 76(1)
 ; CHECK-DAG: stw {{[0-9]+}}, 73(1)
 ; CHECK-DAG: ld 6, 72(1)
 ; CHECK-DAG: ld 5, 64(1)
diff --git a/test/CodeGen/PowerPC/mcm-13.ll b/test/CodeGen/PowerPC/mcm-13.ll
index 6f69b43194a4a8ca0f2ec081d240fc48ff867cd3..d7c50efbbae590de09f64e084313124ad9d6df2e 100644
--- a/test/CodeGen/PowerPC/mcm-13.ll
+++ b/test/CodeGen/PowerPC/mcm-13.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck %s
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck %s
 
 ; Test correct code generation for medium and large code model
 ; for loading and storing a weak variable
diff --git a/test/CodeGen/PowerPC/mcm-6.ll b/test/CodeGen/PowerPC/mcm-6.ll
index b1ad8c2b43cdcddf0e1dd8a6d47ed1fc336d2e3f..a8fe9c332b16bc0d3cc242096d91607f4f125e61 100644
--- a/test/CodeGen/PowerPC/mcm-6.ll
+++ b/test/CodeGen/PowerPC/mcm-6.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -O0 -code-model=medium < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -O0 -code-model=large < %s | FileCheck %s
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mcpu=pwr7 -O0 -code-model=medium < %s | FileCheck %s
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mcpu=pwr7 -O0 -code-model=large < %s | FileCheck %s
 
 ; Test correct code generation for medium and large code model
 ; for loading and storing a tentatively defined variable.
diff --git a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
index eb26209ee2a2e9b6fae9862c7c181ad94c4551e1..30aff0e32b8dadbf940445192803ebc5131cd797 100644
--- a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
+++ b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -147,9 +147,8 @@ define signext i32 @zeroEqualityTest05() {
 ; CHECK-NEXT:    li 4, -1
 ; CHECK-NEXT:    isel 5, 4, 3, 0
 ; CHECK-NEXT:  .LBB4_3: # %endblock
-; CHECK-NEXT:    srwi 3, 5, 31
-; CHECK-NEXT:    xori 3, 3, 1
-; CHECK-NEXT:    clrldi 3, 3, 32
+; CHECK-NEXT:    nor 3, 5, 5
+; CHECK-NEXT:    rlwinm 3, 3, 1, 31, 31
 ; CHECK-NEXT:    blr
   %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16)
   %call.lobit = lshr i32 %call, 31
diff --git a/test/CodeGen/PowerPC/merge-st-chain-op.ll b/test/CodeGen/PowerPC/merge-st-chain-op.ll
index 4d5b9170ece078ef34458bf515b4b0188eb6a8e5..0b9ba93c43560248220d933558086ae606d1601c 100644
--- a/test/CodeGen/PowerPC/merge-st-chain-op.ll
+++ b/test/CodeGen/PowerPC/merge-st-chain-op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/mi-scheduling-lhs.ll b/test/CodeGen/PowerPC/mi-scheduling-lhs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4de384dce6ec4b421c3d16b49acaec45f3475aeb
--- /dev/null
+++ b/test/CodeGen/PowerPC/mi-scheduling-lhs.ll
@@ -0,0 +1,49 @@
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 <%s | FileCheck %s
+
+%struct.Record = type { %struct.Record*, i32 }
+
+@n = local_unnamed_addr global i32 500000000, align 4
+@m = common global %struct.Record zeroinitializer, align 8
+@a = hidden local_unnamed_addr global %struct.Record* @m, align 8
+@o = common global %struct.Record zeroinitializer, align 8
+@b = hidden local_unnamed_addr global %struct.Record* @o, align 8
+
+define signext i32 @foo() local_unnamed_addr {
+entry:
+  %0 = load i64, i64* bitcast (%struct.Record** @b to i64*), align 8
+  %1 = load i64*, i64** bitcast (%struct.Record** @a to i64**), align 8
+  store i64 %0, i64* %1, align 8
+  %2 = load i32, i32* @n, align 4
+  %cmp9 = icmp eq i32 %2, 0
+  br i1 %cmp9, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %3 = load %struct.Record*, %struct.Record** @a, align 8
+  %IntComp = getelementptr inbounds %struct.Record, %struct.Record* %3, i64 0, i32 1
+  store i32 5, i32* %IntComp, align 8
+  %PtrComp2 = getelementptr inbounds %struct.Record, %struct.Record* %3, i64 0, i32 0
+  %4 = load %struct.Record*, %struct.Record** %PtrComp2, align 8
+  %IntComp3 = getelementptr inbounds %struct.Record, %struct.Record* %4, i64 0, i32 1
+  store i32 5, i32* %IntComp3, align 8
+  %PtrComp6 = getelementptr inbounds %struct.Record, %struct.Record* %4, i64 0, i32 0
+  store %struct.Record* %4, %struct.Record** %PtrComp6, align 8
+  %inc = add nuw i32 %i.010, 1
+  %cmp = icmp ult i32 %inc, %2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 0
+
+; CHECK-LABEL: foo
+; CHECK: addis [[REG1:[0-9]+]], 2, a@toc@ha
+; CHECK: li [[REG4:[0-9]+]], 5
+; CHECK: [[LAB:[a-z0-9A-Z_.]+]]:
+; CHECK: ld [[REG2:[0-9]+]], a@toc@l([[REG1]])
+; CHECK: ld [[REG3:[0-9]+]], 0([[REG2]])
+; CHECK: stw [[REG4]], 8([[REG2]])
+; CHECK: stw [[REG4]], 8([[REG3]]) 
+; CHECK: std [[REG3]], 0([[REG3]])
+; CHECK: bdnz [[LAB]]
+}
+
diff --git a/test/CodeGen/PowerPC/negctr.ll b/test/CodeGen/PowerPC/negctr.ll
index 2e649930da6112fc5eac63b9de755e518d7f3bcf..7ed4b6ae4b84c5ca1f8d7a899c56f0e88c4efc87 100644
--- a/test/CodeGen/PowerPC/negctr.ll
+++ b/test/CodeGen/PowerPC/negctr.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=a2 | FileCheck %s
-; RUN: llc < %s -mcpu=a2 -disable-lsr | FileCheck -check-prefix=NOLSR %s
+; RUN: llc < %s -mcpu=a2 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=a2 -disable-lsr -verify-machineinstrs | FileCheck -check-prefix=NOLSR %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
index 12585c2b7fddbdf7578bde614018e27227f09847..1b11bfd2b47e7d7cb4a7501ffb448a475016d7c2 100644
--- a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \
-; RUN:       -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr8 \
+; RUN:       -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr8 -relocation-model=pic \
 ; RUN:       | FileCheck %s
 ; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN:       -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr8 \
+; RUN:       -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr8 -relocation-model=pic \
 ; RUN:       | FileCheck %s -check-prefix=CHECK-LE
 
 ; The build[csilf] functions simply test the scalar_to_vector handling with
diff --git a/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll b/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
index 621bda04159aa3ee2957513c6bc1fb9bc02b0d26..8a5aa6a9119cae93a91b7a20b07a4e8178bd3284 100644
--- a/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
+++ b/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
-; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu %s -o - -enable-shrink-wrap=false |  FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu %s -o - -enable-shrink-wrap=false -verify-machineinstrs |  FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
 ;
 ; Note: Lots of tests use inline asm instead of regular calls.
 ; This allows to have a better control on what the allocation will do.
diff --git a/test/CodeGen/PowerPC/ppc-vaarg-agg.ll b/test/CodeGen/PowerPC/ppc-vaarg-agg.ll
index 3468a91a1b631001095a2b92af34eac8c1165fd5..19b85efbf0625d256c22b0a45a9fa1364f0c06ad 100644
--- a/test/CodeGen/PowerPC/ppc-vaarg-agg.ll
+++ b/test/CodeGen/PowerPC/ppc-vaarg-agg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
 target triple = "powerpc-montavista-linux-gnuspe"
 
diff --git a/test/CodeGen/PowerPC/ppc64-P9-setb.ll b/test/CodeGen/PowerPC/ppc64-P9-setb.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0006c07fa3addf8b127876a764287e868227b78f
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-P9-setb.ll
@@ -0,0 +1,1330 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -enable-ppc-quad-precision -ppc-asm-full-reg-names < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names < %s | FileCheck %s -check-prefix=CHECK-PWR8    \
+; RUN:   -implicit-check-not "\<setb\>"
+
+; Test different patterns with type i64
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setne)), setlt
+define i64 @setb1(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %a, %b
+  %t2 = icmp ne i64 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb1:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: addic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb1
+; CHECK-PWR8-DAG: xor
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: addic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setne)), setgt
+define i64 @setb2(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %b, %a
+  %t2 = icmp ne i64 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb2:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: addic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb2
+; CHECK-PWR8-DAG: xor
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: addic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setne)), setlt
+define i64 @setb3(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %a, %b
+  %t2 = icmp ne i64 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb3:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: addic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb3
+; CHECK-PWR8-DAG: xor
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: addic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setne)), setgt
+define i64 @setb4(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %b, %a
+  %t2 = icmp ne i64 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb4:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: addic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb4
+; CHECK-PWR8-DAG: xor
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: addic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setgt)), setlt
+define i64 @setb5(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %a, %b
+  %t2 = icmp sgt i64 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb5:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb5
+; CHECK-PWR8-DAG: sradi
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setgt)), setgt
+define i64 @setb6(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %b, %a
+  %t2 = icmp sgt i64 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb6:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb6
+; CHECK-PWR8-DAG: sradi
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setlt)), setlt
+define i64 @setb7(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %a, %b
+  %t2 = icmp slt i64 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb7:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb7
+; CHECK-PWR8-DAG: sradi
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setlt)), setgt
+define i64 @setb8(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %b, %a
+  %t2 = icmp slt i64 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb8:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb8
+; CHECK-PWR8-DAG: sradi
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setne)), setgt
+define i64 @setb9(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %a, %b
+  %t2 = icmp ne i64 %a, %b
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb9:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb9
+; CHECK-PWR8-DAG: xor
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setne)), setlt
+define i64 @setb10(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %b, %a
+  %t2 = icmp ne i64 %a, %b
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb10:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb10
+; CHECK-PWR8-DAG: xor
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setne)), setgt
+define i64 @setb11(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %a, %b
+  %t2 = icmp ne i64 %b, %a
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb11:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb11
+; CHECK-PWR8-DAG: xor
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setne)), setlt
+define i64 @setb12(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %b, %a
+  %t2 = icmp ne i64 %b, %a
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb12:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb12
+; CHECK-PWR8-DAG: xor
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setlt)), setgt
+define i64 @setb13(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %a, %b
+  %t2 = icmp slt i64 %a, %b
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb13:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: neg
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb13
+; CHECK-PWR8-DAG: sradi
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8-DAG: neg
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setlt)), setlt
+define i64 @setb14(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %b, %a
+  %t2 = icmp slt i64 %a, %b
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb14:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: neg
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb14
+; CHECK-PWR8-DAG: sradi
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8-DAG: neg
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setgt)), setgt
+define i64 @setb15(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %a, %b
+  %t2 = icmp sgt i64 %b, %a
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb15:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: neg
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb15
+; CHECK-PWR8-DAG: sradi
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8-DAG: neg
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setgt)), setlt
+define i64 @setb16(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %b, %a
+  %t2 = icmp sgt i64 %b, %a
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb16:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: neg
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb16
+; CHECK-PWR8-DAG: sradi
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8-DAG: neg
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc lhs, rhs, 1, -1, setgt), seteq
+define i64 @setb17(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %a, %b
+  %t2 = icmp sgt i64 %a, %b
+  %t3 = select i1 %t2, i64 1, i64 -1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb17:
+; CHECK-NOT: li
+; CHECK-NOT: cmpld
+; CHECK-NOT: isel
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb17
+; CHECK-PWR8: cmpd
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmpld
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc rhs, lhs, 1, -1, setgt), seteq
+define i64 @setb18(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %b, %a
+  %t2 = icmp sgt i64 %a, %b
+  %t3 = select i1 %t2, i64 1, i64 -1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb18:
+; CHECK-NOT: li
+; CHECK-NOT: cmpld
+; CHECK-NOT: isel
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb18
+; CHECK-PWR8: cmpd
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmpld
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc rhs, lhs, 1, -1, setlt), seteq
+define i64 @setb19(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %a, %b
+  %t2 = icmp slt i64 %b, %a
+  %t3 = select i1 %t2, i64 1, i64 -1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb19:
+; CHECK-NOT: li
+; CHECK-NOT: cmpld
+; CHECK-NOT: isel
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb19
+; CHECK-PWR8: cmpd
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmpld
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc lhs, rhs, 1, -1, setlt), seteq
+define i64 @setb20(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %b, %a
+  %t2 = icmp slt i64 %b, %a
+  %t3 = select i1 %t2, i64 1, i64 -1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb20:
+; CHECK-NOT: li
+; CHECK-NOT: cmpld
+; CHECK-NOT: isel
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb20
+; CHECK-PWR8: cmpd
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmpld
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc lhs, rhs, -1, 1, setlt), seteq
+define i64 @setb21(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %a, %b
+  %t2 = icmp slt i64 %a, %b
+  %t3 = select i1 %t2, i64 -1, i64 1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb21:
+; CHECK-NOT: li
+; CHECK-NOT: cmpld
+; CHECK-NOT: isel
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb21
+; CHECK-PWR8: cmpd
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmpld
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc rhs, lhs, -1, 1, setlt), seteq
+define i64 @setb22(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %b, %a
+  %t2 = icmp slt i64 %a, %b
+  %t3 = select i1 %t2, i64 -1, i64 1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb22:
+; CHECK-NOT: li
+; CHECK-NOT: cmpld
+; CHECK-NOT: isel
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb22
+; CHECK-PWR8: cmpd
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmpld
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc rhs, lhs, -1, 1, setgt), seteq
+define i64 @setb23(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %a, %b
+  %t2 = icmp sgt i64 %b, %a
+  %t3 = select i1 %t2, i64 -1, i64 1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb23:
+; CHECK-NOT: li
+; CHECK-NOT: cmpld
+; CHECK-NOT: isel
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb23
+; CHECK-PWR8: cmpd
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmpld
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc lhs, rhs, -1, 1, setgt), seteq
+define i64 @setb24(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %b, %a
+  %t2 = icmp sgt i64 %b, %a
+  %t3 = select i1 %t2, i64 -1, i64 1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb24:
+; CHECK-NOT: li
+; CHECK-NOT: cmpld
+; CHECK-NOT: isel
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb24
+; CHECK-PWR8: cmpd
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmpld
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+; end all patterns testing for i64
+
+; Test with swapping the input parameters
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setne)), setlt
+define i64 @setb25(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %b, %a
+  %t2 = icmp ne i64 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb25:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK-NOT: cmpd
+; CHECK: cmpd {{c?r?(0, )?}}r4, r3
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: addic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb25
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: addic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setne)), setgt
+define i64 @setb26(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %a, %b
+  %t2 = icmp ne i64 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setb26:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r4, r3
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: addic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb26
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: addic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; Test with different scalar integer type for selected value
+; i32/i16/i8 rather than i64 above
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setne)), setlt
+define i64 @setb27(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %a, %b
+  %t2 = icmp ne i64 %b, %a
+  %t3 = zext i1 %t2 to i32
+  %t4 = select i1 %t1, i32 -1, i32 %t3
+  %t5 = sext i32 %t4 to i64
+  ret i64 %t5
+; CHECK-LABEL: setb27:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: addic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: extsw
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb27
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: addic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: extsw
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setne)), setgt
+define i64 @setb28(i64 %a, i64 %b) {
+  %t1 = icmp sgt i64 %b, %a
+  %t2 = icmp ne i64 %b, %a
+  %t3 = zext i1 %t2 to i16
+  %t4 = select i1 %t1, i16 -1, i16 %t3
+  %t5 = sext i16 %t4 to i64
+  ret i64 %t5
+; CHECK-LABEL: setb28:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: addic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: extsh
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb28
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: addic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: extsh
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setgt)), setlt
+define i64 @setb29(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %a, %b
+  %t2 = icmp sgt i64 %a, %b
+  %t3 = zext i1 %t2 to i8
+  %t4 = select i1 %t1, i8 -1, i8 %t3
+  %t5 = zext i8 %t4 to i64
+  ret i64 %t5
+; CHECK-LABEL: setb29:
+; CHECK-NOT: sradi
+; CHECK-NOT: rldicl
+; CHECK-NOT: li
+; CHECK: cmpd {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: adde
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setb29
+; CHECK-PWR8-DAG: cmpd
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: adde
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; Testings to cover different comparison opcodes
+; Test with integer type i32/i16/i8 for input parameter
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setne)), setlt
+define i64 @setbsw1(i32 %a, i32 %b) {
+  %t1 = icmp slt i32 %a, %b
+  %t2 = icmp ne i32 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbsw1:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpw {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: cntlzw
+; CHECK-NOT: srwi
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbsw1
+; CHECK-PWR8-DAG: cntlzw
+; CHECK-PWR8-DAG: cmpw
+; CHECK-PWR8-DAG: srwi
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setne)), setgt
+define i64 @setbsw2(i32 %a, i32 %b) {
+  %t1 = icmp sgt i32 %b, %a
+  %t2 = icmp ne i32 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbsw2:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpw {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: cntlzw
+; CHECK-NOT: srwi
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbsw2
+; CHECK-PWR8-DAG: cntlzw
+; CHECK-PWR8-DAG: cmpw
+; CHECK-PWR8-DAG: srwi
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc rhs, lhs, -1, 1, setgt), seteq
+define i64 @setbsw3(i32 %a, i32 %b) {
+  %t1 = icmp eq i32 %a, %b
+  %t2 = icmp sgt i32 %b, %a
+  %t3 = select i1 %t2, i64 -1, i64 1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbsw3:
+; CHECK-NOT: li
+; CHECK: cmpw {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK-NOT: cmplw
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbsw3
+; CHECK-PWR8: cmpw
+; CHECK-PWR8: isel
+; CHECK-PWR8: cmplw
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setne)), setlt
+define i64 @setbsh1(i16 signext %a, i16 signext %b) {
+  %t1 = icmp slt i16 %a, %b
+  %t2 = icmp ne i16 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbsh1:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpw {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: cntlzw
+; CHECK-NOT: srwi
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbsh1
+; CHECK-PWR8-DAG: cntlzw
+; CHECK-PWR8-DAG: cmpw
+; CHECK-PWR8-DAG: srwi
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setne)), setgt
+define i64 @setbsh2(i16 signext %a, i16 signext %b) {
+  %t1 = icmp sgt i16 %b, %a
+  %t2 = icmp ne i16 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbsh2:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpw {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: cntlzw
+; CHECK-NOT: srwi
+; CHECK-NOT: xori
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbsh2
+; CHECK-PWR8-DAG: cmpw
+; CHECK-PWR8-DAG: cntlzw
+; CHECK-PWR8-DAG: srwi
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setgt)), setlt
+define i64 @setbsc1(i8 %a, i8 %b) {
+  %t1 = icmp slt i8 %a, %b
+  %t2 = icmp sgt i8 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbsc1:
+; CHECK-DAG: extsb [[RA:r[0-9]+]], r3
+; CHECK-DAG: extsb [[RB:r[0-9]+]], r4
+; CHECK-NOT: li
+; CHECK-NOT: sub
+; CHECK: cmpw {{c?r?(0, )?}}[[RA]], [[RB]]
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: rldicl
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbsc1
+; CHECK-PWR8-DAG: extsb
+; CHECK-PWR8-DAG: extsb
+; CHECK-PWR8-DAG: extsw
+; CHECK-PWR8-DAG: extsw
+; CHECK-PWR8-DAG: cmpw
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setgt)), setgt
+define i64 @setbsc2(i8 %a, i8 %b) {
+  %t1 = icmp sgt i8 %b, %a
+  %t2 = icmp sgt i8 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbsc2:
+; CHECK-DAG: extsb [[RA:r[0-9]+]], r3
+; CHECK-DAG: extsb [[RB:r[0-9]+]], r4
+; CHECK-NOT: li
+; CHECK-NOT: sub
+; CHECK: cmpw {{c?r?(0, )?}}[[RA]], [[RB]]
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: rldicl
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbsc2
+; CHECK-PWR8-DAG: extsb
+; CHECK-PWR8-DAG: extsb
+; CHECK-PWR8-DAG: extsw
+; CHECK-PWR8-DAG: extsw
+; CHECK-PWR8-DAG: cmpw
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setlt)), setlt
+define i64 @setbsc3(i4 %a, i4 %b) {
+  %t1 = icmp slt i4 %a, %b
+  %t2 = icmp slt i4 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbsc3:
+; CHECK-DAG: slwi [[RA:r[0-9]+]], r3, 28
+; CHECK-DAG: slwi [[RB:r[0-9]+]], r4, 28
+; CHECK-NOT: li
+; CHECK-DAG: srawi [[RA1:r[0-9]+]], [[RA]], 28
+; CHECK-DAG: srawi [[RB1:r[0-9]+]], [[RB]], 28
+; CHECK-NOT: sub
+; CHECK: cmpw {{c?r?(0, )?}}[[RA1]], [[RB1]]
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: rldicl
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbsc3
+; CHECK-PWR8-DAG: slwi
+; CHECK-PWR8-DAG: slwi
+; CHECK-PWR8-DAG: srawi
+; CHECK-PWR8-DAG: srawi
+; CHECK-PWR8-DAG: extsw
+; CHECK-PWR8-DAG: extsw
+; CHECK-PWR8-DAG: cmpw
+; CHECK-PWR8-DAG: rldicl
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; Test with unsigned integer type i64/i32/i16/i8 for input parameter
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setult)), setugt
+define i64 @setbud1(i64 %a, i64 %b) {
+  %t1 = icmp ugt i64 %b, %a
+  %t2 = icmp ult i64 %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbud1:
+; CHECK-NOT: li
+; CHECK: cmpld {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfc
+; CHECK-NOT: subfe
+; CHECK-NOT: neg
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbud1
+; CHECK-PWR8-DAG: subfc
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8-DAG: cmpld
+; CHECK-PWR8-DAG: neg
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setne)), setugt
+define i64 @setbud2(i64 %a, i64 %b) {
+  %t1 = icmp ugt i64 %a, %b
+  %t2 = icmp ne i64 %a, %b
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbud2:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmpld {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: subfic
+; CHECK-NOT: subfe
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbud2
+; CHECK-PWR8-DAG: cmpld
+; CHECK-PWR8-DAG: subfic
+; CHECK-PWR8-DAG: subfe
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc lhs, rhs, -1, 1, setugt), seteq
+define i64 @setbud3(i64 %a, i64 %b) {
+  %t1 = icmp eq i64 %b, %a
+  %t2 = icmp ugt i64 %b, %a
+  %t3 = select i1 %t2, i64 -1, i64 1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbud3:
+; CHECK-NOT: li
+; CHECK: cmpld {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: li
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbud3
+; CHECK-PWR8-DAG: cmpld
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8-DAG: li
+; CHECK-PWR8: isel
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setne)), setult
+define i64 @setbuw1(i32 %a, i32 %b) {
+  %t1 = icmp ult i32 %b, %a
+  %t2 = icmp ne i32 %a, %b
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbuw1:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmplw {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: cntlzw
+; CHECK-NOT: srwi
+; CHECK-NOT: xori
+; CHECK-NOT: neg
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbuw1
+; CHECK-PWR8-DAG: cntlzw
+; CHECK-PWR8-DAG: cmplw
+; CHECK-PWR8-DAG: srwi
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8-DAG: neg
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setne)), setugt
+define i64 @setbuw2(i32 %a, i32 %b) {
+  %t1 = icmp ugt i32 %a, %b
+  %t2 = icmp ne i32 %b, %a
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbuw2:
+; CHECK-NOT: xor
+; CHECK-NOT: li
+; CHECK: cmplw {{c?r?(0, )?}}r3, r4
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: cntlzw
+; CHECK-NOT: srwi
+; CHECK-NOT: xori
+; CHECK-NOT: neg
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbuw2
+; CHECK-PWR8-DAG: cntlzw
+; CHECK-PWR8-DAG: cmplw
+; CHECK-PWR8-DAG: srwi
+; CHECK-PWR8-DAG: xori
+; CHECK-PWR8-DAG: neg
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setne)), setult
+define i64 @setbuh(i16 %a, i16 %b) {
+  %t1 = icmp ult i16 %b, %a
+  %t2 = icmp ne i16 %b, %a
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbuh:
+; CHECK-DAG: rlwinm [[RA:r[0-9]+]], r3, 0, 16, 31
+; CHECK-DAG: rlwinm [[RB:r[0-9]+]], r4, 0, 16, 31
+; CHECK-NOT: li
+; CHECK-NOT: xor
+; CHECK: cmplw {{c?r?(0, )?}}[[RA]], [[RB]]
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: cntlzw
+; CHECK-NOT: srwi
+; CHECK-NOT: xori
+; CHECK-NOT: neg
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbuh
+; CHECK-PWR8: rlwinm
+; CHECK-PWR8: rlwinm
+; CHECK-PWR8-DAG: cmplw
+; CHECK-PWR8-DAG: cntlzw
+; CHECK-PWR8: srwi
+; CHECK-PWR8: xori
+; CHECK-PWR8: neg
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setult)), setugt
+define i64 @setbuc(i8 %a, i8 %b) {
+  %t1 = icmp ugt i8 %a, %b
+  %t2 = icmp ult i8 %a, %b
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbuc:
+; CHECK-DAG: rlwinm [[RA:r[0-9]+]], r3, 0, 24, 31
+; CHECK-DAG: rlwinm [[RB:r[0-9]+]], r4, 0, 24, 31
+; CHECK-NOT: li
+; CHECK-NOT: clrldi
+; CHECK: cmplw {{c?r?(0, )?}}[[RA]], [[RB]]
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: sub
+; CHECK-NOT: sradi
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbuc
+; CHECK-PWR8: rlwinm
+; CHECK-PWR8: rlwinm
+; CHECK-PWR8-DAG: clrldi
+; CHECK-PWR8-DAG: clrldi
+; CHECK-PWR8-DAG: cmplw
+; CHECK-PWR8: sradi
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; Test with float/double/float128 for input parameter
+
+; select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setlt)), setlt
+define i64 @setbf1(float %a, float %b) {
+  %t1 = fcmp fast olt float %a, %b
+  %t2 = fcmp fast olt float %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbf1:
+; CHECK-NOT: li
+; CHECK: fcmpu cr0, f1, f2
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK-NOT: li
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbf1
+; CHECK-PWR8: isel
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setlt)), setgt
+define i64 @setbf2(float %a, float %b) {
+  %t1 = fcmp fast ogt float %b, %a
+  %t2 = fcmp fast olt float %b, %a
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbf2:
+; CHECK-NOT: li
+; CHECK: fcmpu cr0, f1, f2
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK-NOT: li
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbf2
+; CHECK-PWR8: isel
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 0, (select_cc lhs, rhs, -1, 1, setgt), seteq
+define i64 @setbdf1(double %a, double %b) {
+  %t1 = fcmp fast oeq double %b, %a
+  %t2 = fcmp fast ogt double %b, %a
+  %t3 = select i1 %t2, i64 -1, i64 1
+  %t4 = select i1 %t1, i64 0, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbdf1:
+; CHECK: xscmpudp cr0, f1, f2
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: li
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbdf1
+; CHECK-PWR8: isel
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setgt)), setlt
+define i64 @setbdf2(double %a, double %b) {
+  %t1 = fcmp fast olt double %b, %a
+  %t2 = fcmp fast ogt double %b, %a
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbdf2:
+; CHECK-NOT: fcmpu
+; CHECK-NOT: li
+; CHECK: xscmpudp cr0, f1, f2
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: li
+; CHECK-NOT: isel
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbdf2
+; CHECK-PWR8: isel
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+define i64 @setbf128(fp128 %a, fp128 %b) {
+  %t1 = fcmp fast ogt fp128 %a, %b
+  %t2 = fcmp fast olt fp128 %a, %b
+  %t3 = sext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbf128:
+; CHECK-NOT: li
+; CHECK: xscmpuqp cr0, v2, v3
+; CHECK-NEXT: setb r3, cr0
+; CHECK-NOT: isel
+; CHECK-NOT: li
+; CHECK: blr
+; CHECK-PWR8-LABEL: setbf128
+; CHECK-PWR8: isel
+; CHECK-PWR8: blr
+}
+
+; Some cases we can't leverage setb
+
+define i64 @setbn1(i64 %a, i64 %b) {
+  %t1 = icmp slt i64 %a, %b
+  %t2 = icmp eq i64 %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbn1:
+; CHECK-NOT: {{\<setb\>}}
+; CHECK: isel
+; CHECK: blr
+}
+
+define i64 @setbn2(double %a, double %b) {
+  %t1 = fcmp olt double %a, %b
+  %t2 = fcmp one double %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbn2:
+; CHECK-NOT: {{\<setb\>}}
+; CHECK: isel
+; CHECK: blr
+}
+
+define i64 @setbn3(float %a, float %b) {
+  %t1 = fcmp ult float %a, %b
+  %t2 = fcmp une float %a, %b
+  %t3 = zext i1 %t2 to i64
+  %t4 = select i1 %t1, i64 -1, i64 %t3
+  ret i64 %t4
+; CHECK-LABEL: setbn3:
+; CHECK-NOT: {{\<setb\>}}
+; CHECK: isel
+; CHECK: blr
+}
diff --git a/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 5768f0ce4e3e58b67c58b31a3904a555d4b185aa..653b2121e40ab5d39896b22f036d36ea36c1af89 100644
--- a/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -1,37 +1,41 @@
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 -implicit-check-not vabsdu
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 -implicit-check-not vabsdu
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR7 -implicit-check-not vmaxsd
 
-; Function Attrs: nounwind readnone
 define <4 x i32> @simple_absv_32(<4 x i32> %a) local_unnamed_addr {
 entry:
   %sub.i = sub <4 x i32> zeroinitializer, %a
   %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %a, <4 x i32> %sub.i)
   ret <4 x i32> %0
 ; CHECK-LABEL: simple_absv_32
-; CHECK-DAG: vxor {{[0-9]+}}, [[REG:[0-9]+]], [[REG]]
-; CHECK-DAG: xvnegsp 34, 34
-; CHECK-DAG: xvnegsp 35, {{[0-9]+}}
-; CHECK-NEXT: vabsduw 2, 2, {{[0-9]+}}
+; CHECK-NOT:  vxor 
+; CHECK-NOT:  vabsduw
+; CHECK:      vnegw v[[REG:[0-9]+]], v2
+; CHECK-NEXT: vmaxsw v2, v2, v[[REG]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_32
 ; CHECK-PWR8: xxlxor
 ; CHECK-PWR8: vsubuwm
 ; CHECK-PWR8: vmaxsw
 ; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_32
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsubuwm
+; CHECK-PWR7: vmaxsw
+; CHECK-PWR7: blr
 }
 
-; Function Attrs: nounwind readnone
 define <4 x i32> @simple_absv_32_swap(<4 x i32> %a) local_unnamed_addr {
 entry:
   %sub.i = sub <4 x i32> zeroinitializer, %a
   %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub.i, <4 x i32> %a)
   ret <4 x i32> %0
 ; CHECK-LABEL: simple_absv_32_swap
-; CHECK-DAG: vxor {{[0-9]+}}, [[REG:[0-9]+]], [[REG]]
-; CHECK-DAG: xvnegsp 34, 34
-; CHECK-DAG: xvnegsp 35, {{[0-9]+}}
-; CHECK-NEXT: vabsduw 2, 2, {{[0-9]+}}
+; CHECK-NOT:  vxor 
+; CHECK-NOT:  vabsduw
+; CHECK:      vnegw  v[[REG:[0-9]+]], v2
+; CHECK-NEXT: vmaxsw v2, v2, v[[REG]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_32_swap
 ; CHECK-PWR8: xxlxor
@@ -46,37 +50,72 @@ entry:
   %0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %a, <8 x i16> %sub.i)
   ret <8 x i16> %0
 ; CHECK-LABEL: simple_absv_16
-; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
-; CHECK-NEXT: vadduhm 2, 2, [[IMM:[0-9]+]]
-; CHECK-NEXT: vabsduh 2, 2, [[IMM]]
+; CHECK-NOT:  mtvsrws
+; CHECK-NOT:  vabsduh
+; CHECK:      xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-NEXT: vsubuhm v[[REG:[0-9]+]], v[[ZERO]], v2
+; CHECK-NEXT: vmaxsh v2, v2, v[[REG]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_16
 ; CHECK-PWR8: xxlxor
 ; CHECK-PWR8: vsubuhm
 ; CHECK-PWR8: vmaxsh
 ; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_16
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsubuhm
+; CHECK-PWR7: vmaxsh
+; CHECK-PWR7: blr
 }
 
-; Function Attrs: nounwind readnone
 define <16 x i8> @simple_absv_8(<16 x i8> %a) local_unnamed_addr {
 entry:
   %sub.i = sub <16 x i8> zeroinitializer, %a
   %0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %a, <16 x i8> %sub.i)
   ret <16 x i8> %0
 ; CHECK-LABEL: simple_absv_8
-; CHECK: xxspltib {{[0-9]+}}, 128
-; CHECK-NEXT: vaddubm 2, 2, [[IMM:[0-9]+]]
-; CHECK-NEXT: vabsdub 2, 2, [[IMM]]
+; CHECK-NOT:  xxspltib
+; CHECK-NOT:  vabsdub
+; CHECK:      xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-NEXT: vsububm v[[REG:[0-9]+]], v[[ZERO]], v2
+; CHECK-NEXT: vmaxsb v2, v2, v[[REG]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_8
 ; CHECK-PWR8: xxlxor
 ; CHECK-PWR8: vsububm
 ; CHECK-PWR8: vmaxsb
 ; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_8
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsububm
+; CHECK-PWR7: vmaxsb
+; CHECK-PWR7: blr
+}
+
+; v2i64 vmax isn't avaiable on pwr7 
+define <2 x i64> @sub_absv_64(<2 x i64> %a, <2 x i64> %b) local_unnamed_addr {
+entry:
+  %0 = sub nsw <2 x i64> %a, %b
+  %1 = icmp sgt <2 x i64> %0, <i64 -1, i64 -1>
+  %2 = sub <2 x i64> zeroinitializer, %0
+  %3 = select <2 x i1> %1, <2 x i64> %0, <2 x i64> %2
+  ret <2 x i64> %3
+; CHECK-LABEL: sub_absv_64
+; CHECK: vsubudm
+; CHECK: vnegd
+; CHECK: vmaxsd
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_64
+; CHECK-PWR8-DAG: vsubudm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8: vmaxsd
+; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: sub_absv_64
+; CHECK-PWR7-NOT: vmaxsd
+; CHECK-PWR7: blr
 }
 
 ; The select pattern can only be detected for v4i32.
-; Function Attrs: norecurse nounwind readnone
 define <4 x i32> @sub_absv_32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr {
 entry:
   %0 = sub nsw <4 x i32> %a, %b
@@ -85,14 +124,77 @@ entry:
   %3 = select <4 x i1> %1, <4 x i32> %0, <4 x i32> %2
   ret <4 x i32> %3
 ; CHECK-LABEL: sub_absv_32
-; CHECK-DAG: xvnegsp 34, 34
-; CHECK-DAG: xvnegsp 35, 35
-; CHECK-NEXT: vabsduw 2, 2, 3
+; CHECK-NOT:  vsubuwm
+; CHECK-NOT:  vnegw
+; CHECK-NOT:  vmaxsw
+; CHECK-DAG:  xvnegsp v2, v2
+; CHECK-DAG:  xvnegsp v3, v3
+; CHECK-NEXT: vabsduw v2, v{{[23]}}, v{{[23]}}
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_32
-; CHECK-PWR8: vsubuwm
-; CHECK-PWR8: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8: vmaxsw
 ; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: sub_absv_32
+; CHECK-PWR7-DAG: vsubuwm
+; CHECK-PWR7-DAG: xxlxor
+; CHECK-PWR7: vmaxsw
+; CHECK-PWR7: blr
+}
+
+define <8 x i16> @sub_absv_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
+entry:
+  %0 = sub nsw <8 x i16> %a, %b
+  %1 = icmp sgt <8 x i16> %0, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %2 = sub <8 x i16> zeroinitializer, %0
+  %3 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> %2
+  ret <8 x i16> %3
+; CHECK-LABEL: sub_absv_16
+; CHECK-NOT:  vabsduh
+; CHECK-DAG:  xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG:  vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK:      vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_16
+; CHECK-PWR8-DAG:  xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-PWR8-DAG:  vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK-PWR8:      vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-PWR8-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
+; CHECK-PWR8-NEXT: blr
+; CHECK-PWR7-LABEL: sub_absv_16
+; CHECK-PWR7-DAG: vsubuhm
+; CHECK-PWR7-DAG: xxlxor
+; CHECK-PWR7: vmaxsh
+; CHECK-PWR7-NEXT: blr
+}
+
+define <16 x i8> @sub_absv_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
+entry:
+  %0 = sub nsw <16 x i8> %a, %b
+  %1 = icmp sgt <16 x i8> %0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %2 = sub <16 x i8> zeroinitializer, %0
+  %3 = select <16 x i1> %1, <16 x i8> %0, <16 x i8> %2
+  ret <16 x i8> %3
+; CHECK-LABEL: sub_absv_8
+; CHECK-NOT:  vabsdub
+; CHECK-DAG:  xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG:  vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK:      vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_8
+; CHECK-PWR8-DAG:  xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-PWR8-DAG:  vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK-PWR8:      vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-PWR8-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
+; CHECK-PWR8-NEXT: blr
+; CHECK-PWR7-LABEL: sub_absv_8
+; CHECK-PWR7-DAG:  xxlxor
+; CHECK-PWR7-DAG:  vsububm
+; CHECK-PWR7: vmaxsb
+; CHECK-PWR7-NEXT: blr
 }
 
 ; FIXME: This does not produce the ISD::ABS that we are looking for.
@@ -100,8 +202,7 @@ entry:
 ; We do manage to find the word version of ABS but not the halfword.
 ; Threfore, we end up doing more work than is required with a pair of abs for word
 ;  instead of just one for the halfword.
-; Function Attrs: norecurse nounwind readnone
-define <8 x i16> @sub_absv_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
+define <8 x i16> @sub_absv_16_ext(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
 entry:
   %0 = sext <8 x i16> %a to <8 x i32>
   %1 = sext <8 x i16> %b to <8 x i32>
@@ -111,23 +212,25 @@ entry:
   %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
   %6 = trunc <8 x i32> %5 to <8 x i16>
   ret <8 x i16> %6
-; CHECK-LABEL: sub_absv_16
+; CHECK-LABEL: sub_absv_16_ext
 ; CHECK-NOT: vabsduh
 ; CHECK: vabsduw
+; CHECK-NOT: vnegw
 ; CHECK-NOT: vabsduh
 ; CHECK: vabsduw
+; CHECK-NOT: vnegw
 ; CHECK-NOT: vabsduh
 ; CHECK: blr
 ; CHECK-PWR8-LABEL: sub_absv_16
-; CHECK-PWR8: vsubuwm
-; CHECK-PWR8: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
 ; CHECK-PWR8: blr
 }
 
 ; FIXME: This does not produce ISD::ABS. This does not even vectorize correctly!
 ; This function should look like sub_absv_32 and sub_absv_16 except that the type is v16i8.
 ; Function Attrs: norecurse nounwind readnone
-define <16 x i8> @sub_absv_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
+define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
 entry:
   %vecext = extractelement <16 x i8> %a, i32 0
   %conv = zext i8 %vecext to i32
@@ -290,20 +393,19 @@ entry:
   %conv122 = trunc i32 %15 to i8
   %vecins123 = insertelement <16 x i8> %vecins115, i8 %conv122, i32 15
   ret <16 x i8> %vecins123
-; CHECK-LABEL: sub_absv_8
+; CHECK-LABEL: sub_absv_8_ext
 ; CHECK-NOT: vabsdub
 ; CHECK: subf
 ; CHECK-NOT: vabsdub
 ; CHECK: xor
 ; CHECK-NOT: vabsdub
 ; CHECK: blr
-; CHECK-PWR8-LABEL: sub_absv_8
+; CHECK-PWR8-LABEL: sub_absv_8_ext
 ; CHECK-PWR8: subf
 ; CHECK-PWR8: xor
 ; CHECK-PWR8: blr
 }
 
-; Function Attrs: nounwind readnone
 define <4 x i32> @sub_absv_vec_32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr {
 entry:
   %sub = sub <4 x i32> %a, %b
@@ -311,16 +413,20 @@ entry:
   %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub, <4 x i32> %sub.i)
   ret <4 x i32> %0
 ; CHECK-LABEL: sub_absv_vec_32
-; CHECK: vabsduw 2, 2, 3
+; CHECK-NOT:  vsubuwm
+; CHECK-NOT:  vnegw
+; CHECK-NOT:  vmaxsw
+; CHECK-DAG:  xvnegsp v2, v2
+; CHECK-DAG:  xvnegsp v3, v3
+; CHECK-NEXT: vabsduw v2, v{{[23]}}, v{{[23]}}
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_vec_32
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
 ; CHECK-PWR8: vmaxsw
 ; CHECK-PWR8: blr
 }
 
-; Function Attrs: nounwind readnone
 define <8 x i16> @sub_absv_vec_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
 entry:
   %sub = sub <8 x i16> %a, %b
@@ -328,16 +434,20 @@ entry:
   %0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %sub, <8 x i16> %sub.i)
   ret <8 x i16> %0
 ; CHECK-LABEL: sub_absv_vec_16
-; CHECK: vabsduh 2, 2, 3
+; CHECK-NOT:  mtvsrws
+; CHECK-NOT:  vabsduh
+; CHECK-DAG:  xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG:  vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK:      vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_vec_16
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsubuhm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsubuhm
 ; CHECK-PWR8: vmaxsh
 ; CHECK-PWR8: blr
 }
 
-; Function Attrs: nounwind readnone
 define <16 x i8> @sub_absv_vec_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
 entry:
   %sub = sub <16 x i8> %a, %b
@@ -345,22 +455,313 @@ entry:
   %0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %sub, <16 x i8> %sub.i)
   ret <16 x i8> %0
 ; CHECK-LABEL: sub_absv_vec_8
-; CHECK: vabsdub 2, 2, 3
+; CHECK-NOT:  xxspltib
+; CHECK-NOT:  vabsdub
+; CHECK-DAG:  xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG:  vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK:      vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_vec_8
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsububm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsububm
 ; CHECK-PWR8: vmaxsb
 ; CHECK-PWR8: blr
 }
 
+define <4 x i32> @zext_sub_absd32(<4 x i16>, <4 x i16>) local_unnamed_addr {
+    %3 = zext <4 x i16> %0 to <4 x i32>
+    %4 = zext <4 x i16> %1 to <4 x i32>
+    %5 = sub <4 x i32> %3, %4
+    %6 = sub <4 x i32> zeroinitializer, %5
+    %7 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %5, <4 x i32> %6)
+    ret <4 x i32> %7
+; CHECK-LABEL: zext_sub_absd32
+; CHECK-NOT: xvnegsp
+; CHECK:     vabsduw
+; CHECK:     blr
+; CHECK-PWR8-LABEL: zext_sub_absd32
+; CHECK-PWR8: vmaxsw
+; CHECK-PWR8: blr
+}
+
+define <8 x i16> @zext_sub_absd16(<8 x i8>, <8 x i8>) local_unnamed_addr {
+    %3 = zext <8 x i8> %0 to <8 x i16>
+    %4 = zext <8 x i8> %1 to <8 x i16>
+    %5 = sub <8 x i16> %3, %4
+    %6 = sub <8 x i16> zeroinitializer, %5
+    %7 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %5, <8 x i16> %6)
+    ret <8 x i16> %7
+; CHECK-LABEL: zext_sub_absd16
+; CHECK-NOT: vadduhm
+; CHECK:     vabsduh
+; CHECK:     blr
+; CHECK-PWR8-LABEL: zext_sub_absd16
+; CHECK-PWR8: vmaxsh
+; CHECK-PWR8: blr
+}
+
+define <16 x i8> @zext_sub_absd8(<16 x i4>, <16 x i4>) local_unnamed_addr {
+    %3 = zext <16 x i4> %0 to <16 x i8>
+    %4 = zext <16 x i4> %1 to <16 x i8>
+    %5 = sub <16 x i8> %3, %4
+    %6 = sub <16 x i8> zeroinitializer, %5
+    %7 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %5, <16 x i8> %6)
+    ret <16 x i8> %7
+; CHECK-LABEL: zext_sub_absd8
+; CHECK-NOT: vaddubm
+; CHECK:     vabsdub
+; CHECK:     blr
+; CHECK-PWR8-LABEL: zext_sub_absd8
+; CHECK-PWR8: vmaxsb
+; CHECK-PWR8: blr
+}
+
+; To verify vabsdu* exploitation for ucmp + sub + select sequence
+
+define <4 x i32> @absd_int32_ugt(<4 x i32>, <4 x i32>) {
+  %3 = icmp ugt <4 x i32> %0, %1
+  %4 = sub <4 x i32> %0, %1
+  %5 = sub <4 x i32> %1, %0
+  %6 = select <4 x i1> %3, <4 x i32> %4, <4 x i32> %5
+  ret <4 x i32> %6
+; CHECK-LABEL: absd_int32_ugt
+; CHECK-NOT: vcmpgtuw
+; CHECK-NOT: xxsel
+; CHECK: vabsduw v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int32_ugt
+; CHECK-PWR8: vcmpgtuw
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <4 x i32> @absd_int32_uge(<4 x i32>, <4 x i32>) {
+  %3 = icmp uge <4 x i32> %0, %1
+  %4 = sub <4 x i32> %0, %1
+  %5 = sub <4 x i32> %1, %0
+  %6 = select <4 x i1> %3, <4 x i32> %4, <4 x i32> %5
+  ret <4 x i32> %6
+; CHECK-LABEL: absd_int32_uge
+; CHECK-NOT: vcmpgtuw
+; CHECK-NOT: xxsel
+; CHECK: vabsduw v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int32_uge
+; CHECK-PWR8: vcmpgtuw
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <4 x i32> @absd_int32_ult(<4 x i32>, <4 x i32>) {
+  %3 = icmp ult <4 x i32> %0, %1
+  %4 = sub <4 x i32> %0, %1
+  %5 = sub <4 x i32> %1, %0
+  %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> %4
+  ret <4 x i32> %6
+; CHECK-LABEL: absd_int32_ult
+; CHECK-NOT: vcmpgtuw
+; CHECK-NOT: xxsel
+; CHECK: vabsduw v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int32_ult
+; CHECK-PWR8: vcmpgtuw
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <4 x i32> @absd_int32_ule(<4 x i32>, <4 x i32>) {
+  %3 = icmp ule <4 x i32> %0, %1
+  %4 = sub <4 x i32> %0, %1
+  %5 = sub <4 x i32> %1, %0
+  %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> %4
+  ret <4 x i32> %6
+; CHECK-LABEL: absd_int32_ule
+; CHECK-NOT: vcmpgtuw
+; CHECK-NOT: xxsel
+; CHECK: vabsduw v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int32_ule
+; CHECK-PWR8: vcmpgtuw
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <8 x i16> @absd_int16_ugt(<8 x i16>, <8 x i16>) {
+  %3 = icmp ugt <8 x i16> %0, %1
+  %4 = sub <8 x i16> %0, %1
+  %5 = sub <8 x i16> %1, %0
+  %6 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> %5
+  ret <8 x i16> %6
+; CHECK-LABEL: absd_int16_ugt
+; CHECK-NOT: vcmpgtuh
+; CHECK-NOT: xxsel
+; CHECK: vabsduh v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int16_ugt
+; CHECK-PWR8: vcmpgtuh
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <8 x i16> @absd_int16_uge(<8 x i16>, <8 x i16>) {
+  %3 = icmp uge <8 x i16> %0, %1
+  %4 = sub <8 x i16> %0, %1
+  %5 = sub <8 x i16> %1, %0
+  %6 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> %5
+  ret <8 x i16> %6
+; CHECK-LABEL: absd_int16_uge
+; CHECK-NOT: vcmpgtuh
+; CHECK-NOT: xxsel
+; CHECK: vabsduh v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int16_uge
+; CHECK-PWR8: vcmpgtuh
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <8 x i16> @absd_int16_ult(<8 x i16>, <8 x i16>) {
+  %3 = icmp ult <8 x i16> %0, %1
+  %4 = sub <8 x i16> %0, %1
+  %5 = sub <8 x i16> %1, %0
+  %6 = select <8 x i1> %3, <8 x i16> %5, <8 x i16> %4
+  ret <8 x i16> %6
+; CHECK-LABEL: absd_int16_ult
+; CHECK-NOT: vcmpgtuh
+; CHECK-NOT: xxsel
+; CHECK: vabsduh v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int16_ult
+; CHECK-PWR8: vcmpgtuh
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <8 x i16> @absd_int16_ule(<8 x i16>, <8 x i16>) {
+  %3 = icmp ule <8 x i16> %0, %1
+  %4 = sub <8 x i16> %0, %1
+  %5 = sub <8 x i16> %1, %0
+  %6 = select <8 x i1> %3, <8 x i16> %5, <8 x i16> %4
+  ret <8 x i16> %6
+; CHECK-LABEL: absd_int16_ule
+; CHECK-NOT: vcmpgtuh
+; CHECK-NOT: xxsel
+; CHECK: vabsduh v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int16_ule
+; CHECK-PWR8: vcmpgtuh
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <16 x i8> @absd_int8_ugt(<16 x i8>, <16 x i8>) {
+  %3 = icmp ugt <16 x i8> %0, %1
+  %4 = sub <16 x i8> %0, %1
+  %5 = sub <16 x i8> %1, %0
+  %6 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> %5
+  ret <16 x i8> %6
+; CHECK-LABEL: absd_int8_ugt
+; CHECK-NOT: vcmpgtub
+; CHECK-NOT: xxsel
+; CHECK: vabsdub v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int8_ugt
+; CHECK-PWR8: vcmpgtub
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <16 x i8> @absd_int8_uge(<16 x i8>, <16 x i8>) {
+  %3 = icmp uge <16 x i8> %0, %1
+  %4 = sub <16 x i8> %0, %1
+  %5 = sub <16 x i8> %1, %0
+  %6 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> %5
+  ret <16 x i8> %6
+; CHECK-LABEL: absd_int8_uge
+; CHECK-NOT: vcmpgtub
+; CHECK-NOT: xxsel
+; CHECK: vabsdub v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int8_uge
+; CHECK-PWR8: vcmpgtub
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <16 x i8> @absd_int8_ult(<16 x i8>, <16 x i8>) {
+  %3 = icmp ult <16 x i8> %0, %1
+  %4 = sub <16 x i8> %0, %1
+  %5 = sub <16 x i8> %1, %0
+  %6 = select <16 x i1> %3, <16 x i8> %5, <16 x i8> %4
+  ret <16 x i8> %6
+; CHECK-LABEL: absd_int8_ult
+; CHECK-NOT: vcmpgtub
+; CHECK-NOT: xxsel
+; CHECK: vabsdub v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int8_ult
+; CHECK-PWR8: vcmpgtub
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <16 x i8> @absd_int8_ule(<16 x i8>, <16 x i8>) {
+  %3 = icmp ule <16 x i8> %0, %1
+  %4 = sub <16 x i8> %0, %1
+  %5 = sub <16 x i8> %1, %0
+  %6 = select <16 x i1> %3, <16 x i8> %5, <16 x i8> %4
+  ret <16 x i8> %6
+; CHECK-LABEL: absd_int8_ule
+; CHECK-NOT: vcmpgtub
+; CHECK-NOT: xxsel
+; CHECK: vabsdub v2, v2, v3
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: absd_int8_ule
+; CHECK-PWR8: vcmpgtub
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+; some cases we are unable to optimize
+; check whether goes beyond the scope
+define <4 x i32> @absd_int32_ugt_opp(<4 x i32>, <4 x i32>) {
+  %3 = icmp ugt <4 x i32> %0, %1
+  %4 = sub <4 x i32> %0, %1
+  %5 = sub <4 x i32> %1, %0
+  %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> %4
+  ret <4 x i32> %6
+; CHECK-LABEL: absd_int32_ugt_opp
+; CHECK-NOT: vabsduw
+; CHECK: vcmpgtuw
+; CHECK: xxsel
+; CHECK: blr
+; CHECK-PWR8-LABEL: absd_int32_ugt_opp
+; CHECK-PWR8: vcmpgtuw
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
+
+define <2 x i64> @absd_int64_ugt(<2 x i64>, <2 x i64>) {
+  %3 = icmp ugt <2 x i64> %0, %1
+  %4 = sub <2 x i64> %0, %1
+  %5 = sub <2 x i64> %1, %0
+  %6 = select <2 x i1> %3, <2 x i64> %4, <2 x i64> %5
+  ret <2 x i64> %6
+; CHECK-LABEL: absd_int64_ugt
+; CHECK-NOT: vabsduw
+; CHECK: vcmpgtud
+; CHECK: xxsel
+; CHECK: blr
+; CHECK-PWR8-LABEL: absd_int64_ugt
+; CHECK-PWR8: vcmpgtud
+; CHECK-PWR8: xxsel
+; CHECK-PWR8: blr
+}
 
-; Function Attrs: nounwind readnone
 declare <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32>, <4 x i32>)
 
-; Function Attrs: nounwind readnone
 declare <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16>, <8 x i16>)
 
-; Function Attrs: nounwind readnone
 declare <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8>, <16 x i8>)
 
diff --git a/test/CodeGen/PowerPC/ppc64-blnop.ll b/test/CodeGen/PowerPC/ppc64-blnop.ll
index 3b3d9add183e26582b5ade7863e9e89888027ddc..6c3b55d59879c060d47a69e990e4abe267a8e5e5 100644
--- a/test/CodeGen/PowerPC/ppc64-blnop.ll
+++ b/test/CodeGen/PowerPC/ppc64-blnop.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
 ; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
-; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS
+; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -function-sections -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS
 ; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
-; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS
-; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: llc < %s -relocation-model=pic -function-sections -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS
+; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN: -code-model=small -mcpu=pwr8 | FileCheck %s -check-prefix=SCM
 
 %class.T = type { [2 x i8] }
diff --git a/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/test/CodeGen/PowerPC/ppc64-i128-abi.ll
index f5319b24d457e5fa6c1e23ea186561be125dcc57..2593fa4c00039ea66ce0b4f399937c45bcb922b4 100644
--- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll
+++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll
@@ -1,33 +1,33 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE \
 ; RUN:   --implicit-check-not xxswapd
 
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-BE
 
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX
 
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX \
 ; RUN:   --implicit-check-not xxswapd
 
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-BE-NOVSX
 
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr8 -mattr=-vsx < %s | \
 ; RUN:   FileCheck %s -check-prefix=CHECK-LE-NOVSX --implicit-check-not xxswapd
 
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
 ; RUN:   FileCheck %s -check-prefix=CHECK-P9 --implicit-check-not xxswapd
 
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr9 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX \
 ; RUN:   --implicit-check-not xxswapd
 
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr9 -mattr=-power9-vector -mattr=-direct-move < %s | \
 ; RUN:   FileCheck %s -check-prefix=CHECK-LE --implicit-check-not xxswapd
 
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
index a35250526c7cb62bb18cdef1a1b4f2b30ba119c6..3819e2654f2aee560cdc9a6a6baef8169f286721 100644
--- a/test/CodeGen/PowerPC/ppc64le-aggregates.ll
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -1,8 +1,8 @@
-; RUN: llc -verify-machineinstrs < %s -mcpu=pwr8 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs < %s -mcpu=pwr8 \
 ; RUN:   -mattr=+altivec -mattr=-vsx | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mattr=+altivec \
+; RUN: llc -relocation-model=pic -verify-machineinstrs < %s -mattr=+altivec \
 ; RUN:   -mattr=-vsx | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mcpu=pwr9 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs < %s -mcpu=pwr9 \
 ; RUN:   -mattr=-direct-move -mattr=+altivec | FileCheck %s
 
 ; Currently VSX support is disabled for this test because we generate lxsdx
diff --git a/test/CodeGen/PowerPC/ppcf128-endian.ll b/test/CodeGen/PowerPC/ppcf128-endian.ll
index 738577a412c51fdd16e2b4476088865dcc9fbcb9..851942caf4c07216dc56563dc84aa6ffb24350fb 100644
--- a/test/CodeGen/PowerPC/ppcf128-endian.ll
+++ b/test/CodeGen/PowerPC/ppcf128-endian.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=+altivec -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mcpu=pwr7 -mattr=+altivec -mattr=-vsx < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/pr16556.ll b/test/CodeGen/PowerPC/pr16556.ll
index dc36f0b6eafc56db6ed4b59eb619d4965b8a1c62..eea2db4501edfe03a5e008d4a8cc975d42fdf9e7 100644
--- a/test/CodeGen/PowerPC/pr16556.ll
+++ b/test/CodeGen/PowerPC/pr16556.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -verify-machineinstrs < %s
 
 ; This test formerly failed due to no handling for a ppc_fp128 undef.
 
diff --git a/test/CodeGen/PowerPC/pr25157-peephole.ll b/test/CodeGen/PowerPC/pr25157-peephole.ll
index cb958d416306a0ed385ce027023c416ca3bc894d..4c10c3813fb5ca6d6d68b23c2253984da9b4dd4f 100644
--- a/test/CodeGen/PowerPC/pr25157-peephole.ll
+++ b/test/CodeGen/PowerPC/pr25157-peephole.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
-; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck \
 ; RUN:   %s --check-prefix=CHECK-P9
 
 ; Verify peephole simplification of splats and swaps.  Bugpoint-reduced
diff --git a/test/CodeGen/PowerPC/pr25157.ll b/test/CodeGen/PowerPC/pr25157.ll
index 7d89d29af443d91dfddffb6028261d7266f49f49..982dfcd74a8e8a0ec937f789de85ceca8c4f87ea 100644
--- a/test/CodeGen/PowerPC/pr25157.ll
+++ b/test/CodeGen/PowerPC/pr25157.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
-; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck \
 ; RUN:   --check-prefix=CHECK-P9 %s
 
 ; Verify correct generation of an lxsspx rather than an invalid optimization
diff --git a/test/CodeGen/PowerPC/pr32140.ll b/test/CodeGen/PowerPC/pr32140.ll
index 3feb9bd9c9e194c2245443a2d3339665ac4a354a..ddcf27232398e3ead29095c592e8e8ea4aefa352 100644
--- a/test/CodeGen/PowerPC/pr32140.ll
+++ b/test/CodeGen/PowerPC/pr32140.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=powerpc64le-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
-; RUN: llc -mtriple=powerpc64-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-linux-gnu -mcpu=pwr8 < %s | FileCheck %s --check-prefix CHECK-LE
+; RUN: llc -mtriple=powerpc64-linux-gnu -mcpu=pwr8 < %s | FileCheck %s --check-prefix CHECK-BE
 
 @as = common local_unnamed_addr global i16 0, align 2
 @bs = common local_unnamed_addr global i16 0, align 2
@@ -10,10 +10,33 @@
 define void @bswapStorei64Toi32() {
 ; CHECK-LABEL: bswapStorei64Toi32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK:         lwa 3, 0(3)
+; CHECK-NEXT:    addis 3, 2, ai@toc@ha
+; CHECK-NEXT:    addis 4, 2, bi@toc@ha
+; CHECK-NEXT:    lwa 3, ai@toc@l(3)
+; CHECK-NEXT:    addi 4, 4, bi@toc@l
 ; CHECK-NEXT:    rldicl 3, 3, 32, 32
 ; CHECK-NEXT:    stwbrx 3, 0, 4
 ; CHECK-NEXT:    blr
+; CHECK-LE-LABEL: bswapStorei64Toi32:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addis 3, 2, ai@toc@ha
+; CHECK-LE-NEXT:    addis 4, 2, bi@toc@ha
+; CHECK-LE-NEXT:    lwa 3, ai@toc@l(3)
+; CHECK-LE-NEXT:    addi 4, 4, bi@toc@l
+; CHECK-LE-NEXT:    rldicl 3, 3, 32, 32
+; CHECK-LE-NEXT:    stwbrx 3, 0, 4
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: bswapStorei64Toi32:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addis 4, 2, .LC1@toc@ha
+; CHECK-BE-NEXT:    ld 3, .LC0@toc@l(3)
+; CHECK-BE-NEXT:    ld 4, .LC1@toc@l(4)
+; CHECK-BE-NEXT:    lwa 3, 0(3)
+; CHECK-BE-NEXT:    rldicl 3, 3, 32, 32
+; CHECK-BE-NEXT:    stwbrx 3, 0, 4
+; CHECK-BE-NEXT:    blr
 entry:
   %0 = load i32, i32* @ai, align 4
   %conv.i = sext i32 %0 to i64
@@ -26,10 +49,33 @@ entry:
 define void @bswapStorei32Toi16() {
 ; CHECK-LABEL: bswapStorei32Toi16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK:         lha 3, 0(3)
+; CHECK-NEXT:    addis 3, 2, as@toc@ha
+; CHECK-NEXT:    addis 4, 2, bs@toc@ha
+; CHECK-NEXT:    lha 3, as@toc@l(3)
+; CHECK-NEXT:    addi 4, 4, bs@toc@l
 ; CHECK-NEXT:    srwi 3, 3, 16
 ; CHECK-NEXT:    sthbrx 3, 0, 4
 ; CHECK-NEXT:    blr
+; CHECK-LE-LABEL: bswapStorei32Toi16:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addis 3, 2, as@toc@ha
+; CHECK-LE-NEXT:    addis 4, 2, bs@toc@ha
+; CHECK-LE-NEXT:    lha 3, as@toc@l(3)
+; CHECK-LE-NEXT:    addi 4, 4, bs@toc@l
+; CHECK-LE-NEXT:    srwi 3, 3, 16
+; CHECK-LE-NEXT:    sthbrx 3, 0, 4
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: bswapStorei32Toi16:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LC2@toc@ha
+; CHECK-BE-NEXT:    addis 4, 2, .LC3@toc@ha
+; CHECK-BE-NEXT:    ld 3, .LC2@toc@l(3)
+; CHECK-BE-NEXT:    ld 4, .LC3@toc@l(4)
+; CHECK-BE-NEXT:    lha 3, 0(3)
+; CHECK-BE-NEXT:    srwi 3, 3, 16
+; CHECK-BE-NEXT:    sthbrx 3, 0, 4
+; CHECK-BE-NEXT:    blr
 entry:
   %0 = load i16, i16* @as, align 2
   %conv.i = sext i16 %0 to i32
@@ -42,10 +88,33 @@ entry:
 define void @bswapStorei64Toi16() {
 ; CHECK-LABEL: bswapStorei64Toi16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK:         lha 3, 0(3)
+; CHECK-NEXT:    addis 3, 2, as@toc@ha
+; CHECK-NEXT:    addis 4, 2, bs@toc@ha
+; CHECK-NEXT:    lha 3, as@toc@l(3)
+; CHECK-NEXT:    addi 4, 4, bs@toc@l
 ; CHECK-NEXT:    rldicl 3, 3, 16, 48
 ; CHECK-NEXT:    sthbrx 3, 0, 4
 ; CHECK-NEXT:    blr
+; CHECK-LE-LABEL: bswapStorei64Toi16:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addis 3, 2, as@toc@ha
+; CHECK-LE-NEXT:    addis 4, 2, bs@toc@ha
+; CHECK-LE-NEXT:    lha 3, as@toc@l(3)
+; CHECK-LE-NEXT:    addi 4, 4, bs@toc@l
+; CHECK-LE-NEXT:    rldicl 3, 3, 16, 48
+; CHECK-LE-NEXT:    sthbrx 3, 0, 4
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: bswapStorei64Toi16:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LC2@toc@ha
+; CHECK-BE-NEXT:    addis 4, 2, .LC3@toc@ha
+; CHECK-BE-NEXT:    ld 3, .LC2@toc@l(3)
+; CHECK-BE-NEXT:    ld 4, .LC3@toc@l(4)
+; CHECK-BE-NEXT:    lha 3, 0(3)
+; CHECK-BE-NEXT:    rldicl 3, 3, 16, 48
+; CHECK-BE-NEXT:    sthbrx 3, 0, 4
+; CHECK-BE-NEXT:    blr
 entry:
   %0 = load i16, i16* @as, align 2
   %conv.i = sext i16 %0 to i64
diff --git a/test/CodeGen/PowerPC/pr39478.ll b/test/CodeGen/PowerPC/pr39478.ll
new file mode 100644
index 0000000000000000000000000000000000000000..defb85608ad8e012dd8ecd3ec5ad1b6f892f1c53
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr39478.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-unknown -verify-machineinstrs < %s | \
+; RUN:   FileCheck %s
+
+define void @pr39478() {
+; CHECK-LABEL: pr39478:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lwz 3, 0(3)
+; CHECK-NEXT:    stb 3, 0(3)
+; CHECK-NEXT:    blr
+entry:
+  %tmp32 = load i64, i64* undef, align 8
+  %tmp33 = load i32, i32* undef, align 4
+  %tmp34 = and i32 %tmp33, -256
+  %tmp35 = lshr i64 %tmp32, 32
+  %tmp36 = shl nuw nsw i64 %tmp35, 24
+  %tmp37 = trunc i64 %tmp36 to i32
+  %tmp38 = call i32 @llvm.bswap.i32(i32 %tmp37)
+  %tmp39 = or i32 %tmp38, %tmp34
+  store i32 %tmp39, i32* undef, align 4
+  ret void
+}
+
+declare i32 @llvm.bswap.i32(i32)
diff --git a/test/CodeGen/PowerPC/pre-inc-disable.ll b/test/CodeGen/PowerPC/pre-inc-disable.ll
index f7b829436c7f9a3ca9af07e9c507a95141b3f208..52fc7fe8746cffc2206fe1d0b1906f33aee71657 100644
--- a/test/CodeGen/PowerPC/pre-inc-disable.ll
+++ b/test/CodeGen/PowerPC/pre-inc-disable.ll
@@ -19,20 +19,20 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
 ; CHECK:    lxvx v2, 0, r5
 ; CHECK:    lxvx v3, 0, r6
 ; CHECK:    xxpermdi v5, f0, f0, 2
-; CHECK:    vperm v0, v4, v5, v2
-; CHECK:    vperm v5, v5, v4, v3
-; CHECK:    xvnegsp v5, v5
-; CHECK:    xvnegsp v0, v0
+; CHECK-DAG: vperm v[[VR1:[0-9]+]], v4, v5, v2
+; CHECK-DAG: vperm v[[VR2:[0-9]+]], v5, v4, v3
+; CHECK-DAG: xvnegsp v[[VR3:[0-9]+]], v[[VR1]]
+; CHECK-DAG: xvnegsp v[[VR4:[0-9]+]], v[[VR2]]
 
 ; CHECK:  .LBB0_1: # %for.cond1.preheader
 ; CHECK:    lfd f0, 0(r3)
 ; CHECK:    xxpermdi v1, f0, f0, 2
 ; CHECK:    vperm v6, v1, v4, v3
 ; CHECK:    vperm v1, v4, v1, v2
-; CHECK:    xvnegsp v6, v6
-; CHECK:    xvnegsp v1, v1
-; CHECK:    vabsduw v1, v1, v0
-; CHECK:    vabsduw v6, v6, v5
+; CHECK-DAG:    xvnegsp v6, v6
+; CHECK-DAG:    xvnegsp v1, v1
+; CHECK-DAG: vabsduw v1, v1, v[[VR3]]
+; CHECK-DAG: vabsduw v6, v6, v[[VR4]]
 ; CHECK:    vadduwm v1, v6, v1
 ; CHECK:    xxswapd v6, v1
 ; CHECK:    vadduwm v1, v1, v6
@@ -46,10 +46,10 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
 ; CHECK:    xxswapd v1, vs0
 ; CHECK:    vperm v6, v1, v4, v3
 ; CHECK:    vperm v1, v4, v1, v2
-; CHECK:    xvnegsp v6, v6
-; CHECK:    xvnegsp v1, v1
-; CHECK:    vabsduw v1, v1, v0
-; CHECK:    vabsduw v6, v6, v5
+; CHECK-DAG: xvnegsp v6, v6
+; CHECK-DAG: xvnegsp v1, v1
+; CHECK-DAG: vabsduw v1, v1, v[[VR3]]
+; CHECK-DAG: vabsduw v6, v6, v[[VR4]]
 ; CHECK:    vadduwm v1, v6, v1
 ; CHECK:    xxswapd v6, v1
 ; CHECK:    vadduwm v1, v1, v6
@@ -72,36 +72,36 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
 ; P9BE:    lxvx v3, 0, r6
 ; P9BE:    xxlor v5, vs0, vs0
 ; P9BE:    li r6, 0
-; P9BE:    vperm v0, v4, v5, v2
-; P9BE:    vperm v5, v4, v5, v3
-; P9BE:    xvnegsp v5, v5
-; P9BE:    xvnegsp v0, v0
+; P9BE-DAG: vperm v[[VR1:[0-9]+]], v4, v5, v2
+; P9BE-DAG: vperm v[[VR2:[0-9]+]], v4, v5, v3
+; P9BE-DAG: xvnegsp v[[VR3:[0-9]+]], v[[VR1]]
+; P9BE-DAG: xvnegsp v[[VR4:[0-9]+]], v[[VR2]]
 
 ; P9BE:  .LBB0_1: # %for.cond1.preheader
 ; P9BE:    lfd f0, 0(r3)
 ; P9BE:    xxlor v1, vs0, vs0
 ; P9BE:    vperm v6, v4, v1, v3
 ; P9BE:    vperm v1, v4, v1, v2
-; P9BE:    xvnegsp v6, v6
-; P9BE:    xvnegsp v1, v1
-; P9BE:    vabsduw v1, v1, v0
-; P9BE:    vabsduw v6, v6, v5
+; P9BE-DAG: xvnegsp v6, v6
+; P9BE-DAG: xvnegsp v1, v1
+; P9BE-DAG: vabsduw v1, v1, v[[VR3]]
+; P9BE-DAG: vabsduw v6, v6, v[[VR4]]
 ; P9BE:    vadduwm v1, v6, v1
 ; P9BE:    xxswapd v6, v1
 ; P9BE:    vadduwm v1, v1, v6
 ; P9BE:    xxspltw v6, v1, 1
 ; P9BE:    vadduwm v1, v1, v6
-; P9BE:    vextuwlx r7, r6, v1
-; P9BE:    ldux r8, r3, r4
+; P9BE:    vextuwlx r[[GR1:[0-9]+]], r6, v1
+; P9BE:    ldux r[[GR2:[0-9]+]], r3, r4
 ; P9BE:    add r3, r3, r4
-; P9BE:    add r5, r7, r5
-; P9BE:    mtvsrd v1, r8
+; P9BE:    add r5, r[[GR1]], r5
+; P9BE:    mtvsrd v1, r[[GR2]]
 ; P9BE:    vperm v6, v4, v1, v3
 ; P9BE:    vperm v1, v4, v1, v2
-; P9BE:    xvnegsp v6, v6
-; P9BE:    xvnegsp v1, v1
-; P9BE:    vabsduw v1, v1, v0
-; P9BE:    vabsduw v6, v6, v5
+; P9BE-DAG: xvnegsp v6, v6
+; P9BE-DAG: xvnegsp v1, v1
+; P9BE-DAG: vabsduw v1, v1, v[[VR3]]
+; P9BE-DAG: vabsduw v6, v6, v[[VR4]]
 ; P9BE:    vadduwm v1, v6, v1
 ; P9BE:    xxswapd v6, v1
 ; P9BE:    vadduwm v1, v1, v6
@@ -181,12 +181,8 @@ define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* noc
 ; CHECK:    vperm v4, v0, v4, v3
 ; CHECK:    vperm v2, v5, v0, v2
 ; CHECK:    vperm v3, v0, v5, v3
-; CHECK:    xvnegsp v5, v1
-; CHECK:    xvnegsp v4, v4
-; CHECK:    xvnegsp v2, v2
-; CHECK:    xvnegsp v3, v3
 ; CHECK:    vabsduw v3, v4, v3
-; CHECK:    vabsduw v2, v5, v2
+; CHECK:    vabsduw v2, v1, v2
 ; CHECK:    vadduwm v2, v2, v3
 ; CHECK:    xxswapd v3, v2
 ; CHECK:    vadduwm v2, v2, v3
@@ -212,12 +208,8 @@ define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* noc
 ; P9BE:    vperm v4, v5, v4, v3
 ; P9BE:    vperm v2, v5, v0, v2
 ; P9BE:    vperm v3, v5, v0, v3
-; P9BE:    xvnegsp v5, v1
-; P9BE:    xvnegsp v4, v4
-; P9BE:    xvnegsp v2, v2
-; P9BE:    xvnegsp v3, v3
 ; P9BE:    vabsduw v3, v4, v3
-; P9BE:    vabsduw v2, v5, v2
+; P9BE:    vabsduw v2, v1, v2
 ; P9BE:    vadduwm v2, v2, v3
 ; P9BE:    xxswapd v3, v2
 ; P9BE:    vadduwm v2, v2, v3
diff --git a/test/CodeGen/PowerPC/preemption.ll b/test/CodeGen/PowerPC/preemption.ll
index 5652f6d730a143057b0029902ead65ef1edc135b..9a0d81fbbd78be5c39c12f01b00768f8976ade64 100644
--- a/test/CodeGen/PowerPC/preemption.ll
+++ b/test/CodeGen/PowerPC/preemption.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple powerpc64le-unkown-gnu-linux  < %s |  FileCheck %s
+; RUN: llc -mtriple powerpc64le-unkown-gnu-linux  -relocation-model=pic \
+; RUN: < %s |  FileCheck %s
 ; RUN: llc -mtriple powerpc64le-unkown-gnu-linux -relocation-model=static \
 ; RUN: < %s |  FileCheck --check-prefix=STATIC %s
 ; RUN: llc -mtriple powerpc64le-unkown-gnu-linux -relocation-model=pic \
diff --git a/test/CodeGen/PowerPC/save-bp.ll b/test/CodeGen/PowerPC/save-bp.ll
index 2e403cb80f4af359855a15cd2da8067976de6f1a..8a7cef69263fbe44b339a73181349e08e5bded40 100644
--- a/test/CodeGen/PowerPC/save-bp.ll
+++ b/test/CodeGen/PowerPC/save-bp.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=ppc64-- -ppc-always-use-base-pointer < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC64
-; RUN: llc -ppc-always-use-base-pointer < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC32
-; RUN: llc -ppc-always-use-base-pointer -relocation-model pic < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC32PIC
+; RUN: llc -ppc-always-use-base-pointer -relocation-model=static < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC32
+; RUN: llc -ppc-always-use-base-pointer -relocation-model=pic < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC32PIC
 
 ; CHECK-LABEL: fred:
 
diff --git a/test/CodeGen/PowerPC/setcc-logic.ll b/test/CodeGen/PowerPC/setcc-logic.ll
index 4abfa36535c6a9d3952fd62856d6400c03acf2bf..e883531d0094be910905b76dfd839666b0269004 100644
--- a/test/CodeGen/PowerPC/setcc-logic.ll
+++ b/test/CodeGen/PowerPC/setcc-logic.ll
@@ -435,9 +435,9 @@ define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) {
 define zeroext i1 @and_eq(i16 zeroext  %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d) {
 ; CHECK-LABEL: and_eq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xor 5, 5, 6
 ; CHECK-NEXT:    xor 3, 3, 4
-; CHECK-NEXT:    or 3, 3, 5
+; CHECK-NEXT:    xor 4, 5, 6
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    cntlzw 3, 3
 ; CHECK-NEXT:    srwi 3, 3, 5
 ; CHECK-NEXT:    blr
@@ -450,9 +450,9 @@ define zeroext i1 @and_eq(i16 zeroext  %a, i16 zeroext %b, i16 zeroext %c, i16 z
 define zeroext i1 @or_ne(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: or_ne:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xor 5, 5, 6
 ; CHECK-NEXT:    xor 3, 3, 4
-; CHECK-NEXT:    or 3, 3, 5
+; CHECK-NEXT:    xor 4, 5, 6
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    cntlzw 3, 3
 ; CHECK-NEXT:    srwi 3, 3, 5
 ; CHECK-NEXT:    xori 3, 3, 1
diff --git a/test/CodeGen/PowerPC/setcc-to-sub.ll b/test/CodeGen/PowerPC/setcc-to-sub.ll
index a143d73c7c0cac28671ddc233ec170e37fc4150f..c190591665405d1f2f0c9c6759c5c19dddbb25a2 100644
--- a/test/CodeGen/PowerPC/setcc-to-sub.ll
+++ b/test/CodeGen/PowerPC/setcc-to-sub.ll
@@ -36,8 +36,8 @@ define zeroext i1 @test2(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr
 ; CHECK-NEXT:    rlwinm 3, 3, 0, 28, 28
 ; CHECK-NEXT:    rlwinm 4, 4, 0, 28, 28
 ; CHECK-NEXT:    sub 3, 4, 3
+; CHECK-NEXT:    not 3, 3
 ; CHECK-NEXT:    rldicl 3, 3, 1, 63
-; CHECK-NEXT:    xori 3, 3, 1
 ; CHECK-NEXT:    blr
 entry:
   %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
@@ -81,8 +81,8 @@ define zeroext i1 @test4(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr
 ; CHECK-NEXT:    rlwinm 3, 3, 0, 28, 28
 ; CHECK-NEXT:    rlwinm 4, 4, 0, 28, 28
 ; CHECK-NEXT:    sub 3, 3, 4
+; CHECK-NEXT:    not 3, 3
 ; CHECK-NEXT:    rldicl 3, 3, 1, 63
-; CHECK-NEXT:    xori 3, 3, 1
 ; CHECK-NEXT:    blr
 entry:
   %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
diff --git a/test/CodeGen/PowerPC/sj-ctr-loop.ll b/test/CodeGen/PowerPC/sj-ctr-loop.ll
index 1866bcd174208460bea7893379b0856720838c22..7e485e26123a117ab6123db99b27790048239c92 100644
--- a/test/CodeGen/PowerPC/sj-ctr-loop.ll
+++ b/test/CodeGen/PowerPC/sj-ctr-loop.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/sjlj.ll b/test/CodeGen/PowerPC/sjlj.ll
index 68b53417f01ec804089b98cf3afce0435246ca14..c5dba12c1e2ebcdf1227be05e0525c61c662cdde 100644
--- a/test/CodeGen/PowerPC/sjlj.ll
+++ b/test/CodeGen/PowerPC/sjlj.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 | FileCheck -check-prefix=CHECK-NOAV %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -verify-machineinstrs | FileCheck -check-prefix=CHECK-NOAV %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -7,6 +7,7 @@ target triple = "powerpc64-unknown-linux-gnu"
 %struct.__sigset_t = type { [16 x i64] }
 
 @env_sigill = internal global [1 x %struct.__jmp_buf_tag] zeroinitializer, align 16
+@cond = external global i8, align 1
 
 define void @foo() #0 {
 entry:
@@ -145,6 +146,24 @@ return:                                           ; preds = %if.end, %if.then
 ; CHECK: blr
 }
 
+define void @test_sjlj_setjmp() #0 {
+entry:
+  %0 = load i8, i8* @cond, align 1
+  %tobool = trunc i8 %0 to i1
+  br i1 %tobool, label %return, label %end
+
+end:
+  %1 = call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([1 x %struct.__jmp_buf_tag]* @env_sigill to i8*))
+  br label %return
+
+return:
+  ret void
+
+; CHECK-LABEL: test_sjlj_setjmp:
+; intrinsic llvm.eh.sjlj.setjmp does not call builtin function _setjmp.
+; CHECK-NOT: bl _setjmp
+}
+
 declare void @bar(i8*) #3
 
 declare i8* @llvm.frameaddress(i32) #2
diff --git a/test/CodeGen/PowerPC/sjlj_no0x.ll b/test/CodeGen/PowerPC/sjlj_no0x.ll
index 2018bcbbc9312a377c517c47197235604e3691aa..01053c4070f1c3aa093f2dffb71dd7df25c455b5 100644
--- a/test/CodeGen/PowerPC/sjlj_no0x.ll
+++ b/test/CodeGen/PowerPC/sjlj_no0x.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/structsinmem.ll b/test/CodeGen/PowerPC/structsinmem.ll
index c8ea3be7cceb3f117493d4051208ba02e62a6413..bbe82894a67aa49a52c0f71cc51c99733c0ef91d 100644
--- a/test/CodeGen/PowerPC/structsinmem.ll
+++ b/test/CodeGen/PowerPC/structsinmem.ll
@@ -157,8 +157,7 @@ entry:
 ; CHECK: stw {{[0-9]+}}, 147(1)
 ; CHECK: sth {{[0-9]+}}, 158(1)
 ; CHECK: stw {{[0-9]+}}, 154(1)
-; CHECK: stb {{[0-9]+}}, 167(1)
-; CHECK: sth {{[0-9]+}}, 165(1)
+; CHECK: stw {{[0-9]+}}, 164(1)
 ; CHECK: stw {{[0-9]+}}, 161(1)
 }
 
diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll
index d8afc8f8559cf01c3de94758223811d09141f1f6..52976ca784533a1b9e1880de6ffef3e5d93e3eca 100644
--- a/test/CodeGen/PowerPC/structsinregs.ll
+++ b/test/CodeGen/PowerPC/structsinregs.ll
@@ -148,8 +148,7 @@ entry:
 ; CHECK: stw {{[0-9]+}}, 83(1)
 ; CHECK: sth {{[0-9]+}}, 94(1)
 ; CHECK: stw {{[0-9]+}}, 90(1)
-; CHECK: stb {{[0-9]+}}, 103(1)
-; CHECK: sth {{[0-9]+}}, 101(1)
+; CHECK: stw {{[0-9]+}}, 100(1)
 ; CHECK: stw {{[0-9]+}}, 97(1)
 ; CHECK: ld 9, 96(1)
 ; CHECK: ld 8, 88(1)
diff --git a/test/CodeGen/PowerPC/stwu-sched.ll b/test/CodeGen/PowerPC/stwu-sched.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a8a8896570682a34fba6eb670576df7cc1ab994a
--- /dev/null
+++ b/test/CodeGen/PowerPC/stwu-sched.ll
@@ -0,0 +1,72 @@
+; RUN: llc  -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc  -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc  -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck %s \
+; RUN: --check-prefix=CHECK-ITIN
+; RUN: llc  -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s -verify-machineinstrs | FileCheck %s \
+; RUN: --check-prefix=CHECK-ITIN
+
+
+%0 = type { i32, i32 }
+
+; Function Attrs: norecurse nounwind writeonly
+define void @initCombList(%0* nocapture, i32 signext) local_unnamed_addr #0 {
+; CHECK-LABEL: initCombList:
+; CHECK:   addi 3, 3, -8
+; CHECK-NEXT: stwu 5, 64(4)
+
+; CHECK-ITIN-LABEL: initCombList:
+; CHECK-ITIN: stwu 5, 64(4)
+; CHECK-ITIN-NEXT:   addi 3, 3, -8
+
+
+  %3 = zext i32 %1 to i64
+  br i1 undef, label %6, label %4
+
+; <label>:4:                                      ; preds = %2
+  store i32 0, i32* undef, align 4, !tbaa !1
+  %5 = add nuw nsw i64 0, 1
+  br label %6
+
+; <label>:6:                                      ; preds = %4, %2
+  %7 = phi i64 [ 0, %2 ], [ %5, %4 ]
+  br i1 undef, label %23, label %8
+
+; <label>:8:                                      ; preds = %8, %6
+  %9 = phi i64 [ %21, %8 ], [ %7, %6 ]
+  %10 = getelementptr inbounds %0, %0* %0, i64 %9, i32 1
+  store i32 0, i32* %10, align 4, !tbaa !1
+  %11 = add nuw nsw i64 %9, 1
+  %12 = getelementptr inbounds %0, %0* %0, i64 %11, i32 1
+  store i32 0, i32* %12, align 4, !tbaa !1
+  %13 = add nsw i64 %9, 2
+  %14 = getelementptr inbounds %0, %0* %0, i64 %13, i32 1
+  store i32 0, i32* %14, align 4, !tbaa !1
+  %15 = add nsw i64 %9, 3
+  %16 = getelementptr inbounds %0, %0* %0, i64 %15, i32 1
+  store i32 0, i32* %16, align 4, !tbaa !1
+  %17 = add nsw i64 %9, 4
+  %18 = getelementptr inbounds %0, %0* %0, i64 %17, i32 1
+  store i32 0, i32* %18, align 4, !tbaa !1
+  %19 = add nsw i64 %9, 6
+  %20 = getelementptr inbounds %0, %0* %0, i64 %19, i32 1
+  store i32 0, i32* %20, align 4, !tbaa !1
+  %21 = add nsw i64 %9, 8
+  %22 = icmp eq i64 %21, %3
+  br i1 %22, label %23, label %8, !llvm.loop !6
+
+; <label>:23:                                     ; preds = %8, %6
+  ret void
+}
+
+attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 8.0.0 "}
+!1 = !{!2, !3, i64 4}
+!2 = !{!"", !3, i64 0, !3, i64 4}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/test/CodeGen/PowerPC/stwux.ll b/test/CodeGen/PowerPC/stwux.ll
index 4f83c9f64ea598cbb2ba4723e65fb647aca70f34..157e23e35c5dd909d83419f3875a3903d4ef8e03 100644
--- a/test/CodeGen/PowerPC/stwux.ll
+++ b/test/CodeGen/PowerPC/stwux.ll
@@ -1,6 +1,6 @@
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
 
 @multvec_i = external unnamed_addr global [100 x i32], align 4
 
diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll
index ac0bcc740681db82ad3a3f43ed3ceda4a25ce43a..0811287a792a2ecd3ad3fd8c962f43da481e549c 100644
--- a/test/CodeGen/PowerPC/swaps-le-6.ll
+++ b/test/CodeGen/PowerPC/swaps-le-6.ll
@@ -1,13 +1,13 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-vsr-nums-as-vr \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mcpu=pwr8 -ppc-vsr-nums-as-vr \
 ; RUN:   -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -O3 < %s | FileCheck %s
 
-; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -O3 \
+; RUN: llc -relocation-model=pic -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -O3 \
 ; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \
 ; RUN:   < %s | FileCheck %s --check-prefix=CHECK-P9 \
 ; RUN:   --implicit-check-not xxswapd
 
-; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -O3 \
+; RUN: llc -relocation-model=pic -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -O3 \
 ; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \
 ; RUN:   -mattr=-power9-vector < %s | FileCheck %s
 
diff --git a/test/CodeGen/PowerPC/testComparesi32gtu.ll b/test/CodeGen/PowerPC/testComparesi32gtu.ll
index 4341b59390e70e0b52b6ba87f35752f055d5ea86..62d66c5747b912e48f681a5a65dd219f9aced1b5 100644
--- a/test/CodeGen/PowerPC/testComparesi32gtu.ll
+++ b/test/CodeGen/PowerPC/testComparesi32gtu.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesi32ltu.ll b/test/CodeGen/PowerPC/testComparesi32ltu.ll
index 9623a63e9bc374163d9e39da3130fd2297725557..2b9d0ef1faaf4077904dbe3fdf289d37a432afee 100644
--- a/test/CodeGen/PowerPC/testComparesi32ltu.ll
+++ b/test/CodeGen/PowerPC/testComparesi32ltu.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesieqsc.ll b/test/CodeGen/PowerPC/testComparesieqsc.ll
index 513caa30bc604ef35ff93bb6e04186e84f13e3b9..893b374d8521e20be78671e95c1300a51df163e3 100644
--- a/test/CodeGen/PowerPC/testComparesieqsc.ll
+++ b/test/CodeGen/PowerPC/testComparesieqsc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testComparesieqsc.c'
 
@@ -17,6 +17,19 @@ define signext i32 @test_ieqsc(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -32,6 +45,21 @@ define signext i32 @test_ieqsc_sext(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %sub = sext i1 %cmp to i32
@@ -45,6 +73,17 @@ define signext i32 @test_ieqsc_z(i8 signext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsc_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsc_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv1 = zext i1 %cmp to i32
@@ -59,6 +98,19 @@ define signext i32 @test_ieqsc_sext_z(i8 signext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsc_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsc_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %sub = sext i1 %cmp to i32
@@ -69,13 +121,30 @@ entry:
 define void @test_ieqsc_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_ieqsc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -87,14 +156,33 @@ entry:
 define void @test_ieqsc_sext_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_ieqsc_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = sext i1 %cmp to i8
@@ -106,12 +194,27 @@ entry:
 define void @test_ieqsc_z_store(i8 signext %a) {
 ; CHECK-LABEL: test_ieqsc_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsc_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsc_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = zext i1 %cmp to i8
@@ -123,13 +226,30 @@ entry:
 define void @test_ieqsc_sext_z_store(i8 signext %a) {
 ; CHECK-LABEL: test_ieqsc_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsc_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsc_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testComparesieqsi.ll b/test/CodeGen/PowerPC/testComparesieqsi.ll
index 97fd7445527996b380a1c6a76303c3ce010c8a9e..d75485fcb6ff9af9fb43794f28e1057dd95bab69 100644
--- a/test/CodeGen/PowerPC/testComparesieqsi.ll
+++ b/test/CodeGen/PowerPC/testComparesieqsi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testComparesieqsi.c'
 
@@ -17,6 +17,19 @@ define signext i32 @test_ieqsi(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -32,6 +45,21 @@ define signext i32 @test_ieqsi_sext(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -45,6 +73,17 @@ define signext i32 @test_ieqsi_z(i32 signext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsi_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsi_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -59,6 +98,19 @@ define signext i32 @test_ieqsi_sext_z(i32 signext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsi_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsi_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %sub = sext i1 %cmp to i32
@@ -69,13 +121,30 @@ entry:
 define void @test_ieqsi_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_ieqsi_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -87,14 +156,33 @@ entry:
 define void @test_ieqsi_sext_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_ieqsi_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -106,12 +194,27 @@ entry:
 define void @test_ieqsi_z_store(i32 signext %a) {
 ; CHECK-LABEL: test_ieqsi_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsi_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsi_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -123,13 +226,30 @@ entry:
 define void @test_ieqsi_sext_z_store(i32 signext %a) {
 ; CHECK-LABEL: test_ieqsi_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsi_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsi_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testComparesieqsll.ll b/test/CodeGen/PowerPC/testComparesieqsll.ll
index bb0d6ca986771aa11003d5704243e87a90914354..f5627ff766dcc7cef03f27db3351a1354128ff3a 100644
--- a/test/CodeGen/PowerPC/testComparesieqsll.ll
+++ b/test/CodeGen/PowerPC/testComparesieqsll.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testComparesieqsll.c'
 
@@ -17,6 +17,19 @@ define signext i32 @test_ieqsll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv = zext i1 %cmp to i32
@@ -31,6 +44,19 @@ define signext i32 @test_ieqsll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %sub = sext i1 %cmp to i32
@@ -44,6 +70,17 @@ define signext i32 @test_ieqsll_z(i64 %a) {
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv = zext i1 %cmp to i32
@@ -57,6 +94,17 @@ define signext i32 @test_ieqsll_sext_z(i64 %a) {
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %sub = sext i1 %cmp to i32
@@ -67,13 +115,30 @@ entry:
 define void @test_ieqsll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_ieqsll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -85,13 +150,30 @@ entry:
 define void @test_ieqsll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_ieqsll_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -103,12 +185,27 @@ entry:
 define void @test_ieqsll_z_store(i64 %a) {
 ; CHECK-LABEL: test_ieqsll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzd r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -120,12 +217,27 @@ entry:
 define void @test_ieqsll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_ieqsll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addic r3, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqsll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqsll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesieqss.ll b/test/CodeGen/PowerPC/testComparesieqss.ll
index 24cee5a381e9f9ea760ed24c0a8f60a0a6f3b81c..2a5ee0c0f9d2824a8447181f39fc8cf930d95786 100644
--- a/test/CodeGen/PowerPC/testComparesieqss.ll
+++ b/test/CodeGen/PowerPC/testComparesieqss.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testComparesieqss.c'
 
@@ -17,6 +17,19 @@ define signext i32 @test_ieqss(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqss:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqss:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -32,6 +45,21 @@ define signext i32 @test_ieqss_sext(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqss_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqss_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %sub = sext i1 %cmp to i32
@@ -45,6 +73,17 @@ define signext i32 @test_ieqss_z(i16 signext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqss_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqss_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv1 = zext i1 %cmp to i32
@@ -59,6 +98,19 @@ define signext i32 @test_ieqss_sext_z(i16 signext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqss_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqss_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %sub = sext i1 %cmp to i32
@@ -69,13 +121,30 @@ entry:
 define void @test_ieqss_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_ieqss_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqss_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqss_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -87,14 +156,33 @@ entry:
 define void @test_ieqss_sext_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_ieqss_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqss_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqss_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = sext i1 %cmp to i16
@@ -106,12 +194,27 @@ entry:
 define void @test_ieqss_z_store(i16 signext %a) {
 ; CHECK-LABEL: test_ieqss_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqss_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqss_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = zext i1 %cmp to i16
@@ -123,13 +226,30 @@ entry:
 define void @test_ieqss_sext_z_store(i16 signext %a) {
 ; CHECK-LABEL: test_ieqss_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ieqss_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ieqss_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesiequc.ll b/test/CodeGen/PowerPC/testComparesiequc.ll
index 4ce9747156e169a0f404f9e314186eb99ece4909..18243d36619efa9ff77d475f23866687d575dec3 100644
--- a/test/CodeGen/PowerPC/testComparesiequc.ll
+++ b/test/CodeGen/PowerPC/testComparesiequc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testComparesiequc.c'
 
@@ -17,6 +17,19 @@ define signext i32 @test_iequc(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -32,6 +45,21 @@ define signext i32 @test_iequc_sext(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %sub = sext i1 %cmp to i32
@@ -45,6 +73,17 @@ define signext i32 @test_iequc_z(i8 zeroext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequc_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequc_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv1 = zext i1 %cmp to i32
@@ -59,6 +98,19 @@ define signext i32 @test_iequc_sext_z(i8 zeroext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequc_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequc_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %sub = sext i1 %cmp to i32
@@ -69,13 +121,30 @@ entry:
 define void @test_iequc_store(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: test_iequc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -87,14 +156,33 @@ entry:
 define void @test_iequc_sext_store(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: test_iequc_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = sext i1 %cmp to i8
@@ -106,12 +194,27 @@ entry:
 define void @test_iequc_z_store(i8 zeroext %a) {
 ; CHECK-LABEL: test_iequc_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequc_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequc_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = zext i1 %cmp to i8
@@ -123,13 +226,30 @@ entry:
 define void @test_iequc_sext_z_store(i8 zeroext %a) {
 ; CHECK-LABEL: test_iequc_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequc_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequc_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testComparesiequi.ll b/test/CodeGen/PowerPC/testComparesiequi.ll
index a0dc89048b63edf3c2dbd57586f5457b3bd4d480..afa447525efb303141f5c48f7363506c93d1bdab 100644
--- a/test/CodeGen/PowerPC/testComparesiequi.ll
+++ b/test/CodeGen/PowerPC/testComparesiequi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testComparesiequi.c'
 
@@ -17,6 +17,19 @@ define signext i32 @test_iequi(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -32,6 +45,21 @@ define signext i32 @test_iequi_sext(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -45,6 +73,17 @@ define signext i32 @test_iequi_z(i32 zeroext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequi_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequi_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -59,6 +98,19 @@ define signext i32 @test_iequi_sext_z(i32 zeroext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequi_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequi_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %sub = sext i1 %cmp to i32
@@ -69,13 +121,30 @@ entry:
 define void @test_iequi_store(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: test_iequi_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -87,14 +156,33 @@ entry:
 define void @test_iequi_sext_store(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: test_iequi_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -106,12 +194,27 @@ entry:
 define void @test_iequi_z_store(i32 zeroext %a) {
 ; CHECK-LABEL: test_iequi_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequi_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequi_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -123,13 +226,30 @@ entry:
 define void @test_iequi_sext_z_store(i32 zeroext %a) {
 ; CHECK-LABEL: test_iequi_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequi_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequi_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testComparesiequll.ll b/test/CodeGen/PowerPC/testComparesiequll.ll
index 60e11e6b61c2012ca4c0be39032ca07293ae3caa..16399715386c97c99d59aaffad3f24c91db9a905 100644
--- a/test/CodeGen/PowerPC/testComparesiequll.ll
+++ b/test/CodeGen/PowerPC/testComparesiequll.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testComparesiequll.c'
 
@@ -17,6 +17,19 @@ define signext i32 @test_iequll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv = zext i1 %cmp to i32
@@ -31,6 +44,19 @@ define signext i32 @test_iequll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %sub = sext i1 %cmp to i32
@@ -44,6 +70,17 @@ define signext i32 @test_iequll_z(i64 %a) {
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv = zext i1 %cmp to i32
@@ -57,6 +94,17 @@ define signext i32 @test_iequll_sext_z(i64 %a) {
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %sub = sext i1 %cmp to i32
@@ -67,13 +115,30 @@ entry:
 define void @test_iequll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_iequll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -85,13 +150,30 @@ entry:
 define void @test_iequll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_iequll_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -103,12 +185,27 @@ entry:
 define void @test_iequll_z_store(i64 %a) {
 ; CHECK-LABEL: test_iequll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzd r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -120,12 +217,27 @@ entry:
 define void @test_iequll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_iequll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addic r3, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesiequs.ll b/test/CodeGen/PowerPC/testComparesiequs.ll
index 710eaf5e232b0b7d6d261f7f47a748b6be3ee609..99b388313e12102e6db44b5bbab35914003101b7 100644
--- a/test/CodeGen/PowerPC/testComparesiequs.ll
+++ b/test/CodeGen/PowerPC/testComparesiequs.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testComparesiequs.c'
 
@@ -17,6 +17,19 @@ define signext i32 @test_iequs(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequs:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequs:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -32,6 +45,21 @@ define signext i32 @test_iequs_sext(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequs_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequs_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %sub = sext i1 %cmp to i32
@@ -45,6 +73,17 @@ define signext i32 @test_iequs_z(i16 zeroext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequs_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequs_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv1 = zext i1 %cmp to i32
@@ -59,6 +98,19 @@ define signext i32 @test_iequs_sext_z(i16 zeroext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequs_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequs_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %sub = sext i1 %cmp to i32
@@ -69,13 +121,30 @@ entry:
 define void @test_iequs_store(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: test_iequs_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequs_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequs_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -87,14 +156,33 @@ entry:
 define void @test_iequs_sext_store(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: test_iequs_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequs_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequs_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = sext i1 %cmp to i16
@@ -106,12 +194,27 @@ entry:
 define void @test_iequs_z_store(i16 zeroext %a) {
 ; CHECK-LABEL: test_iequs_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequs_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequs_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = zext i1 %cmp to i16
@@ -123,13 +226,30 @@ entry:
 define void @test_iequs_sext_z_store(i16 zeroext %a) {
 ; CHECK-LABEL: test_iequs_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iequs_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iequs_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesigesc.ll b/test/CodeGen/PowerPC/testComparesigesc.ll
index 80152f84fc34ab684b18342f446600abdf9a3543..eb457951fb784d85085e34ed2cd682873dd6b9fe 100644
--- a/test/CodeGen/PowerPC/testComparesigesc.ll
+++ b/test/CodeGen/PowerPC/testComparesigesc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i8 0, align 1
 
@@ -14,6 +14,19 @@ define signext i32 @test_igesc(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i8 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -27,6 +40,19 @@ define signext i32 @test_igesc_sext(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i8 %a, %b
   %sub = sext i1 %cmp to i32
@@ -36,13 +62,30 @@ entry:
 define void @test_igesc_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_igesc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -53,13 +96,30 @@ entry:
 define void @test_igesc_sext_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_igesc_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i8 %a, %b
   %conv3 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testComparesigesi.ll b/test/CodeGen/PowerPC/testComparesigesi.ll
index d5a194e447fe999b0760bc04400bc8e94dffc240..93e6c1e1f9cf69ed7aee51773bab1a49560ae7da 100644
--- a/test/CodeGen/PowerPC/testComparesigesi.ll
+++ b/test/CodeGen/PowerPC/testComparesigesi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i32 0, align 4
 
@@ -14,6 +14,19 @@ define signext i32 @test_igesi(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -27,6 +40,19 @@ define signext i32 @test_igesi_sext(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -36,13 +62,30 @@ entry:
 define void @test_igesi_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_igesi_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -53,13 +96,30 @@ entry:
 define void @test_igesi_sext_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_igesi_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i32 %a, %b
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testComparesigesll.ll b/test/CodeGen/PowerPC/testComparesigesll.ll
index 0926d9eb212117173405535102a6b3c72c882de1..2edbdc5bb8096b4c9c70d3e8164d7f22a7ac62dd 100644
--- a/test/CodeGen/PowerPC/testComparesigesll.ll
+++ b/test/CodeGen/PowerPC/testComparesigesll.ll
@@ -1,10 +1,10 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 @glob = common local_unnamed_addr global i64 0, align 8
 
 define signext i32 @test_igesll(i64 %a, i64 %b) {
@@ -15,6 +15,21 @@ define signext i32 @test_igesll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    subfc r3, r4, r3
 ; CHECK-NEXT:    adde r3, r5, r6
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r5, r3, 63
+; CHECK-BE-NEXT:    rldicl r6, r4, 1, 63
+; CHECK-BE-NEXT:    subfc r3, r4, r3
+; CHECK-BE-NEXT:    adde r3, r5, r6
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r5, r3, 63
+; CHECK-LE-NEXT:    rldicl r6, r4, 1, 63
+; CHECK-LE-NEXT:    subfc r3, r4, r3
+; CHECK-LE-NEXT:    adde r3, r5, r6
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i64 %a, %b
   %conv = zext i1 %cmp to i32
@@ -30,6 +45,23 @@ define signext i32 @test_igesll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    adde r3, r5, r6
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r5, r3, 63
+; CHECK-BE-NEXT:    rldicl r6, r4, 1, 63
+; CHECK-BE-NEXT:    subfc r3, r4, r3
+; CHECK-BE-NEXT:    adde r3, r5, r6
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r5, r3, 63
+; CHECK-LE-NEXT:    rldicl r6, r4, 1, 63
+; CHECK-LE-NEXT:    subfc r3, r4, r3
+; CHECK-LE-NEXT:    adde r3, r5, r6
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i64 %a, %b
   %sub = sext i1 %cmp to i32
@@ -39,9 +71,20 @@ entry:
 define signext i32 @test_igesll_z(i64 %a) {
 ; CHECK-LABEL: test_igesll_z:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    not r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
-; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    not r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    not r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sgt i64 %a, -1
   %conv = zext i1 %cmp to i32
@@ -54,6 +97,17 @@ define signext i32 @test_igesll_sext_z(i64 %a) {
 ; CHECK-NEXT:    sradi r3, r3, 63
 ; CHECK-NEXT:    not r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r3, r3, 63
+; CHECK-BE-NEXT:    not r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r3, r3, 63
+; CHECK-LE-NEXT:    not r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sgt i64 %a, -1
   %sub = sext i1 %cmp to i32
@@ -63,12 +117,33 @@ entry:
 define void @test_igesll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_igesll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK:    sradi r6, r3, 63
-; CHECK:    subfc r3, r4, r3
-; CHECK:    rldicl r3, r4, 1, 63
-; CHECK:    adde r3, r6, r3
-; CHECK:    std r3
+; CHECK-NEXT:    sradi r6, r3, 63
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    subfc r3, r4, r3
+; CHECK-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-NEXT:    adde r3, r6, r3
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sradi r6, r3, 63
+; CHECK-BE-NEXT:    ld r5, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    subfc r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-BE-NEXT:    adde r3, r6, r3
+; CHECK-BE-NEXT:    std r3, 0(r5)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r6, r3, 63
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfc r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-LE-NEXT:    adde r3, r6, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -80,14 +155,35 @@ define void @test_igesll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_igesll_sext_store:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    sradi r6, r3, 63
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfc r3, r4, r3
 ; CHECK-NEXT:    rldicl r3, r4, 1, 63
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    adde r3, r6, r3
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r6, r3, 63
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    subfc r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    adde r3, r6, r3
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r6, r3, 63
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfc r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-LE-NEXT:    adde r3, r6, r3
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -98,12 +194,27 @@ entry:
 define void @test_igesll_z_store(i64 %a) {
 ; CHECK-LABEL: test_igesll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    not r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    not r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    not r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sgt i64 %a, -1
   %conv1 = zext i1 %cmp to i64
@@ -114,12 +225,27 @@ entry:
 define void @test_igesll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_igesll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    not r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    sradi r3, r3, 63
-; CHECK-NEXT:    std r3,
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igesll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    not r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    sradi r3, r3, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igesll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    not r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    sradi r3, r3, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sgt i64 %a, -1
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesigess.ll b/test/CodeGen/PowerPC/testComparesigess.ll
index 8dcdafb5b85a1c3b14ce05dcdfed0496f9f6ccd7..ac04e9e86a2e4b83cbe201449e93ed56a06d5419 100644
--- a/test/CodeGen/PowerPC/testComparesigess.ll
+++ b/test/CodeGen/PowerPC/testComparesigess.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i16 0, align 2
 
@@ -14,6 +14,19 @@ define signext i32 @test_igess(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igess:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igess:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i16 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -27,6 +40,19 @@ define signext i32 @test_igess_sext(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igess_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igess_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i16 %a, %b
   %sub = sext i1 %cmp to i32
@@ -36,13 +62,30 @@ entry:
 define void @test_igess_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_igess_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igess_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igess_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -53,13 +96,30 @@ entry:
 define void @test_igess_sext_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_igess_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_igess_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_igess_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i16 %a, %b
   %conv3 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesigeuc.ll b/test/CodeGen/PowerPC/testComparesigeuc.ll
index 907cda39a5af775d58cbb310a8f007e6ce13a01e..c36b6a5e2e0327bad3707c49e7f19776fd6f08db 100644
--- a/test/CodeGen/PowerPC/testComparesigeuc.ll
+++ b/test/CodeGen/PowerPC/testComparesigeuc.ll
@@ -15,8 +15,8 @@ entry:
   ret i32 %conv2
 ; CHECK-LABEL: test_igeuc:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -65,8 +65,8 @@ entry:
   ret void
 ; CHECK_LABEL: test_igeuc_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesigeui.ll b/test/CodeGen/PowerPC/testComparesigeui.ll
index ac55500432f1b421bba47935a27d0ef94b825c22..bec1e21c586f89f6e0866741ebefcbbc0b68c8f4 100644
--- a/test/CodeGen/PowerPC/testComparesigeui.ll
+++ b/test/CodeGen/PowerPC/testComparesigeui.ll
@@ -15,8 +15,8 @@ entry:
   ret i32 %conv
 ; CHECK-LABEL: test_igeui:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -64,8 +64,8 @@ entry:
   ret void
 ; CHECK_LABEL: test_igeuc_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesigeus.ll b/test/CodeGen/PowerPC/testComparesigeus.ll
index ab8790746d68f01ca3269c4a95ec7ae96d3a12aa..33266e04a11365113ee5c0a97de07b3c5a34ee50 100644
--- a/test/CodeGen/PowerPC/testComparesigeus.ll
+++ b/test/CodeGen/PowerPC/testComparesigeus.ll
@@ -15,8 +15,8 @@ entry:
   ret i32 %conv2
 ; CHECK-LABEL: test_igeus:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -64,8 +64,8 @@ entry:
   ret void
 ; CHECK_LABEL: test_igeus_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesigtsc.ll b/test/CodeGen/PowerPC/testComparesigtsc.ll
index 8009043c45d95ca7ab4b47ac089f62dae5621ee0..c669d694c435a50f796838c8d549bff6f3d23a1f 100644
--- a/test/CodeGen/PowerPC/testComparesigtsc.ll
+++ b/test/CodeGen/PowerPC/testComparesigtsc.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesigtsi.ll b/test/CodeGen/PowerPC/testComparesigtsi.ll
index 77dfc3583f1ae0f9a556bc3671a719264c2148e5..9b5f1f43c7e7b7256235874b15214553f2f79541 100644
--- a/test/CodeGen/PowerPC/testComparesigtsi.ll
+++ b/test/CodeGen/PowerPC/testComparesigtsi.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesigtsll.ll b/test/CodeGen/PowerPC/testComparesigtsll.ll
index 75314d708f5253fd6b20741c61bf06fa959599e4..c2198d10bc50ec3713d032dd88e5759db09d0fa1 100644
--- a/test/CodeGen/PowerPC/testComparesigtsll.ll
+++ b/test/CodeGen/PowerPC/testComparesigtsll.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesigtss.ll b/test/CodeGen/PowerPC/testComparesigtss.ll
index 23ddbe30f7e458e43d62f0eb046a3ca6e5341821..93e6ccd53bd265c22b2d0b399be6aebfff57e253 100644
--- a/test/CodeGen/PowerPC/testComparesigtss.ll
+++ b/test/CodeGen/PowerPC/testComparesigtss.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesigtuc.ll b/test/CodeGen/PowerPC/testComparesigtuc.ll
index 540b82001c2cfc38587077dff5b9498149c446af..2886130dfba4404a25c27141805f5f2afe06fb76 100644
--- a/test/CodeGen/PowerPC/testComparesigtuc.ll
+++ b/test/CodeGen/PowerPC/testComparesigtuc.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesigtui.ll b/test/CodeGen/PowerPC/testComparesigtui.ll
index 6fef78c6b0b96aa6d64e8a3a4779b33175c074db..a81a1a64f617ed1a4e7edf278806695556c4ffeb 100644
--- a/test/CodeGen/PowerPC/testComparesigtui.ll
+++ b/test/CodeGen/PowerPC/testComparesigtui.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesigtus.ll b/test/CodeGen/PowerPC/testComparesigtus.ll
index 07e810115f977e9ba3ba940ed4468c699ac374ae..7beca852c166d9ca405a2f061067ddbccfaac99f 100644
--- a/test/CodeGen/PowerPC/testComparesigtus.ll
+++ b/test/CodeGen/PowerPC/testComparesigtus.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesilesc.ll b/test/CodeGen/PowerPC/testComparesilesc.ll
index c625dca9a0e19877cea600c9f8c9d7252d1694e7..94118d642eda79f69441d981ae2b13866cef7773 100644
--- a/test/CodeGen/PowerPC/testComparesilesc.ll
+++ b/test/CodeGen/PowerPC/testComparesilesc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i8 0, align 1
 
@@ -14,6 +14,19 @@ define signext i32 @test_ilesc(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i8 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -27,6 +40,19 @@ define signext i32 @test_ilesc_sext(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i8 %a, %b
   %sub = sext i1 %cmp to i32
@@ -36,13 +62,30 @@ entry:
 define void @test_ilesc_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_ilesc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -53,13 +96,30 @@ entry:
 define void @test_ilesc_sext_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_ilesc_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i8 %a, %b
   %conv3 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testComparesilesi.ll b/test/CodeGen/PowerPC/testComparesilesi.ll
index 343aa51533b9db8e1dc4b1a6942c4cb624cd7730..a5f2f84a5a695401308caa500a4c1b93a0fe866d 100644
--- a/test/CodeGen/PowerPC/testComparesilesi.ll
+++ b/test/CodeGen/PowerPC/testComparesilesi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i32 0, align 4
 
@@ -14,6 +14,19 @@ define signext i32 @test_ilesi(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -27,6 +40,19 @@ define signext i32 @test_ilesi_sext(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -36,13 +62,30 @@ entry:
 define void @test_ilesi_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_ilesi_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -53,13 +96,30 @@ entry:
 define void @test_ilesi_sext_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_ilesi_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i32 %a, %b
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testComparesilesll.ll b/test/CodeGen/PowerPC/testComparesilesll.ll
index bd51ad65e3570b178eaa35b721fb7d78e1b3809d..532df4c3cdbc3b7e36fe65eaae7a3ad57dd9f11c 100644
--- a/test/CodeGen/PowerPC/testComparesilesll.ll
+++ b/test/CodeGen/PowerPC/testComparesilesll.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i64 0, align 8
 
@@ -15,6 +15,21 @@ define signext i32 @test_ilesll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    subfc r3, r3, r4
 ; CHECK-NEXT:    adde r3, r5, r6
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r5, r4, 63
+; CHECK-BE-NEXT:    rldicl r6, r3, 1, 63
+; CHECK-BE-NEXT:    subfc r3, r3, r4
+; CHECK-BE-NEXT:    adde r3, r5, r6
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r5, r4, 63
+; CHECK-LE-NEXT:    rldicl r6, r3, 1, 63
+; CHECK-LE-NEXT:    subfc r3, r3, r4
+; CHECK-LE-NEXT:    adde r3, r5, r6
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i64 %a, %b
   %conv = zext i1 %cmp to i32
@@ -30,6 +45,23 @@ define signext i32 @test_ilesll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    adde r3, r5, r6
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r5, r4, 63
+; CHECK-BE-NEXT:    rldicl r6, r3, 1, 63
+; CHECK-BE-NEXT:    subfc r3, r3, r4
+; CHECK-BE-NEXT:    adde r3, r5, r6
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r5, r4, 63
+; CHECK-LE-NEXT:    rldicl r6, r3, 1, 63
+; CHECK-LE-NEXT:    subfc r3, r3, r4
+; CHECK-LE-NEXT:    adde r3, r5, r6
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i64 %a, %b
   %sub = sext i1 %cmp to i32
@@ -43,6 +75,19 @@ define signext i32 @test_ilesll_z(i64 %a) {
 ; CHECK-NEXT:    or r3, r4, r3
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addi r4, r3, -1
+; CHECK-BE-NEXT:    or r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addi r4, r3, -1
+; CHECK-LE-NEXT:    or r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp slt i64 %a, 1
   %conv = zext i1 %cmp to i32
@@ -56,6 +101,19 @@ define signext i32 @test_ilesll_sext_z(i64 %a) {
 ; CHECK-NEXT:    or r3, r4, r3
 ; CHECK-NEXT:    sradi r3, r3, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addi r4, r3, -1
+; CHECK-BE-NEXT:    or r3, r4, r3
+; CHECK-BE-NEXT:    sradi r3, r3, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addi r4, r3, -1
+; CHECK-LE-NEXT:    or r3, r4, r3
+; CHECK-LE-NEXT:    sradi r3, r3, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp slt i64 %a, 1
   %sub = sext i1 %cmp to i32
@@ -65,14 +123,33 @@ entry:
 define void @test_ilesll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_ilesll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sradi r6, r4, 63
-; CHECK-NEXT:    ld r5, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfc r4, r3, r4
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    adde r3, r6, r3
-; CHECK-NEXT:    std r3, 0(r5)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sradi r6, r4, 63
+; CHECK-BE-NEXT:    ld r5, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    subfc r4, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    adde r3, r6, r3
+; CHECK-BE-NEXT:    std r3, 0(r5)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r6, r4, 63
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfc r4, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    adde r3, r6, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -84,14 +161,35 @@ define void @test_ilesll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_ilesll_sext_store:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    sradi r6, r4, 63
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfc r4, r3, r4
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    adde r3, r6, r3
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r6, r4, 63
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    subfc r4, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    adde r3, r6, r3
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r6, r4, 63
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfc r4, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    adde r3, r6, r3
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -102,13 +200,30 @@ entry:
 define void @test_ilesll_z_store(i64 %a) {
 ; CHECK-LABEL: test_ilesll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addi r5, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    or r3, r5, r3
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addi r5, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    or r3, r5, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addi r5, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    or r3, r5, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp slt i64 %a, 1
   %conv1 = zext i1 %cmp to i64
@@ -119,13 +234,30 @@ entry:
 define void @test_ilesll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_ilesll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addi r5, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    or r3, r5, r3
 ; CHECK-NEXT:    sradi r3, r3, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ilesll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addi r5, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    or r3, r5, r3
+; CHECK-BE-NEXT:    sradi r3, r3, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ilesll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addi r5, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    or r3, r5, r3
+; CHECK-LE-NEXT:    sradi r3, r3, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp slt i64 %a, 1
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesiless.ll b/test/CodeGen/PowerPC/testComparesiless.ll
index 10e7b39dd8112ee59543038d30d843ad44fec1a1..45c887d21adcab78e1df3a3af4480893af9cfa70 100644
--- a/test/CodeGen/PowerPC/testComparesiless.ll
+++ b/test/CodeGen/PowerPC/testComparesiless.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i16 0, align 2
 
@@ -14,6 +14,19 @@ define signext i32 @test_iless(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iless:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iless:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i16 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -27,6 +40,19 @@ define signext i32 @test_iless_sext(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iless_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iless_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i16 %a, %b
   %sub = sext i1 %cmp to i32
@@ -36,13 +62,30 @@ entry:
 define void @test_iless_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_iless_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iless_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iless_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -53,13 +96,30 @@ entry:
 define void @test_iless_sext_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_iless_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iless_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iless_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i16 %a, %b
   %conv3 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesileuc.ll b/test/CodeGen/PowerPC/testComparesileuc.ll
index b387d4b867f7eff2519025fc90bb7fb8b12133b4..3084b7ee9ed29d201ea332f571c05eda2b047c4d 100644
--- a/test/CodeGen/PowerPC/testComparesileuc.ll
+++ b/test/CodeGen/PowerPC/testComparesileuc.ll
@@ -15,8 +15,8 @@ entry:
   ret i32 %conv2
 ; CHECK-LABEL: test_ileuc:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -67,8 +67,8 @@ entry:
   ret void
 ; CHECK-LABEL: test_ileuc_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesileui.ll b/test/CodeGen/PowerPC/testComparesileui.ll
index fd80167e50aef4c4f9bf105699be921a077b989b..addaa9b991ed0c1d3586949814f6924a03e678f6 100644
--- a/test/CodeGen/PowerPC/testComparesileui.ll
+++ b/test/CodeGen/PowerPC/testComparesileui.ll
@@ -15,8 +15,8 @@ entry:
   ret i32 %sub
 ; CHECK-LABEL: test_ileui:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -67,8 +67,8 @@ entry:
   ret void
 ; CHECK-LABEL: test_ileui_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesileus.ll b/test/CodeGen/PowerPC/testComparesileus.ll
index 6e54c5a521005d1a72bd760e31595445b4b210ac..122d46f24c60b3275b18fd4cdc4e2710b3b8afa9 100644
--- a/test/CodeGen/PowerPC/testComparesileus.ll
+++ b/test/CodeGen/PowerPC/testComparesileus.ll
@@ -15,8 +15,8 @@ entry:
   ret i32 %conv2
 ; CHECK-LABEL: test_ileus:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK-NEXT: blr
 }
 
@@ -67,8 +67,8 @@ entry:
   ret void
 ; CHECK-LABEL: test_ileus_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesinesc.ll b/test/CodeGen/PowerPC/testComparesinesc.ll
index a498f64462224d86c2c0afede26fd2d1d901863a..f5ad934409e5f196915dcdcada7807e17c8ef54c 100644
--- a/test/CodeGen/PowerPC/testComparesinesc.ll
+++ b/test/CodeGen/PowerPC/testComparesinesc.ll
@@ -1,20 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
 @glob = common local_unnamed_addr global i8 0, align 1
 
 define signext i32 @test_inesc(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_inesc:
-; CHECK:    xor r3, r3, r4
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -23,12 +39,30 @@ entry:
 
 define signext i32 @test_inesc_sext(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_inesc_sext:
-; CHECK:    xor r3, r3, r4
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, %b
   %sub = sext i1 %cmp to i32
@@ -37,10 +71,24 @@ entry:
 
 define signext i32 @test_inesc_z(i8 signext %a) {
 ; CHECK-LABEL: test_inesc_z:
-; CHECK:    cntlzw r3, r3
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesc_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesc_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, 0
   %conv1 = zext i1 %cmp to i32
@@ -49,11 +97,27 @@ entry:
 
 define signext i32 @test_inesc_sext_z(i8 signext %a) {
 ; CHECK-LABEL: test_inesc_sext_z:
-; CHECK:    cntlzw r3, r3
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesc_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesc_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, 0
   %sub = sext i1 %cmp to i32
@@ -62,12 +126,34 @@ entry:
 
 define void @test_inesc_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_inesc_store:
-; CHECK:    xor r3, r3, r4
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    stb r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -77,13 +163,37 @@ entry:
 
 define void @test_inesc_sext_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_inesc_sext_store:
-; CHECK:    xor r3, r3, r4
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    neg r3, r3
-; CHECK:    stb r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, %b
   %conv3 = sext i1 %cmp to i8
@@ -93,11 +203,31 @@ entry:
 
 define void @test_inesc_z_store(i8 signext %a) {
 ; CHECK-LABEL: test_inesc_z_store:
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    stb r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesc_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesc_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, 0
   %conv2 = zext i1 %cmp to i8
@@ -107,12 +237,34 @@ entry:
 
 define void @test_inesc_sext_z_store(i8 signext %a) {
 ; CHECK-LABEL: test_inesc_sext_z_store:
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    neg r3, r3
-; CHECK:    stb r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesc_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesc_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, 0
   %conv2 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testComparesinesi.ll b/test/CodeGen/PowerPC/testComparesinesi.ll
index b47f6c808495e737905c3124498a246209e44877..3b0fe78e99c84957a3403ffafd2d8088bece2080 100644
--- a/test/CodeGen/PowerPC/testComparesinesi.ll
+++ b/test/CodeGen/PowerPC/testComparesinesi.ll
@@ -1,20 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
 @glob = common local_unnamed_addr global i32 0, align 4
 
 define signext i32 @test_inesi(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_inesi:
-; CHECK:    xor r3, r3, r4
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -23,12 +39,30 @@ entry:
 
 define signext i32 @test_inesi_sext(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_inesi_sext:
-; CHECK:    xor r3, r3, r4
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -37,10 +71,24 @@ entry:
 
 define signext i32 @test_inesi_z(i32 signext %a) {
 ; CHECK-LABEL: test_inesi_z:
-; CHECK:    cntlzw r3, r3
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesi_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesi_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -49,11 +97,27 @@ entry:
 
 define signext i32 @test_inesi_sext_z(i32 signext %a) {
 ; CHECK-LABEL: test_inesi_sext_z:
-; CHECK:    cntlzw r3, r3
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesi_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesi_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, 0
   %sub = sext i1 %cmp to i32
@@ -62,12 +126,34 @@ entry:
 
 define void @test_inesi_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_inesi_store:
-; CHECK:    xor r3, r3, r4
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    stw r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -77,13 +163,37 @@ entry:
 
 define void @test_inesi_sext_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_inesi_sext_store:
-; CHECK:    xor r3, r3, r4
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    neg r3, r3
-; CHECK:    stw r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -93,11 +203,31 @@ entry:
 
 define void @test_inesi_z_store(i32 signext %a) {
 ; CHECK-LABEL: test_inesi_z_store:
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    stw r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesi_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesi_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -107,12 +237,34 @@ entry:
 
 define void @test_inesi_sext_z_store(i32 signext %a) {
 ; CHECK-LABEL: test_inesi_sext_z_store:
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    neg r3, r3
-; CHECK:    stw r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesi_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesi_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, 0
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testComparesinesll.ll b/test/CodeGen/PowerPC/testComparesinesll.ll
index 33416a06d02159198d2c048851a768f14c666322..bfc0b9d3fbb473ce42918de05b369dd83b2ff8ea 100644
--- a/test/CodeGen/PowerPC/testComparesinesll.ll
+++ b/test/CodeGen/PowerPC/testComparesinesll.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i64 0, align 8
@@ -15,6 +15,19 @@ define signext i32 @test_inesll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addic r4, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r4, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv = zext i1 %cmp to i32
@@ -28,6 +41,19 @@ define signext i32 @test_inesll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %sub = sext i1 %cmp to i32
@@ -40,6 +66,17 @@ define signext i32 @test_inesll_z(i64 %a) {
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addic r4, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r4, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv = zext i1 %cmp to i32
@@ -52,6 +89,17 @@ define signext i32 @test_inesll_sext_z(i64 %a) {
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %sub = sext i1 %cmp to i32
@@ -61,13 +109,30 @@ entry:
 define void @test_inesll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_inesll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
-; CHECK-NEXT:    addic r5, r3, -1
-; CHECK-NEXT:    subfe r3, r5, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    addic r5, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r5, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -78,13 +143,30 @@ entry:
 define void @test_inesll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_inesll_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -95,12 +177,27 @@ entry:
 define void @test_inesll_z_store(i64 %a) {
 ; CHECK-LABEL: test_inesll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addic r5, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r5, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addic r5, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r5, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r5, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r5, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -111,12 +208,27 @@ entry:
 define void @test_inesll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_inesll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    subfic r3, r3, 0
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_inesll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_inesll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesiness.ll b/test/CodeGen/PowerPC/testComparesiness.ll
index 66c95cd0d91d00abe576ec9dc29fba64992b99ec..9e570949e1273d2ae15180ec8548d7b5c0db0636 100644
--- a/test/CodeGen/PowerPC/testComparesiness.ll
+++ b/test/CodeGen/PowerPC/testComparesiness.ll
@@ -1,20 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
 @glob = common local_unnamed_addr global i16 0, align 2
 
 define signext i32 @test_iness(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_iness:
-; CHECK:    xor r3, r3, r4
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iness:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iness:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -23,12 +39,30 @@ entry:
 
 define signext i32 @test_iness_sext(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_iness_sext:
-; CHECK:    xor r3, r3, r4
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iness_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iness_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, %b
   %sub = sext i1 %cmp to i32
@@ -37,10 +71,24 @@ entry:
 
 define signext i32 @test_iness_z(i16 signext %a) {
 ; CHECK-LABEL: test_iness_z:
-; CHECK:    cntlzw r3, r3
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iness_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iness_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, 0
   %conv1 = zext i1 %cmp to i32
@@ -49,11 +97,27 @@ entry:
 
 define signext i32 @test_iness_sext_z(i16 signext %a) {
 ; CHECK-LABEL: test_iness_sext_z:
-; CHECK:    cntlzw r3, r3
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iness_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iness_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, 0
   %sub = sext i1 %cmp to i32
@@ -62,12 +126,34 @@ entry:
 
 define void @test_iness_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_iness_store:
-; CHECK:    xor r3, r3, r4
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    sth r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iness_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iness_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -77,13 +163,37 @@ entry:
 
 define void @test_iness_sext_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_iness_sext_store:
-; CHECK:    xor r3, r3, r4
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    neg r3, r3
-; CHECK:    sth r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iness_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iness_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, %b
   %conv3 = sext i1 %cmp to i16
@@ -93,11 +203,31 @@ entry:
 
 define void @test_iness_z_store(i16 signext %a) {
 ; CHECK-LABEL: test_iness_z_store:
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    sth r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iness_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iness_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, 0
   %conv2 = zext i1 %cmp to i16
@@ -107,12 +237,34 @@ entry:
 
 define void @test_iness_sext_z_store(i16 signext %a) {
 ; CHECK-LABEL: test_iness_sext_z_store:
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    neg r3, r3
-; CHECK:    sth r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_iness_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_iness_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, 0
   %conv2 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesineuc.ll b/test/CodeGen/PowerPC/testComparesineuc.ll
index fe91449f3c53a2e1e5186f37fb9e88ee753f4917..4e7f5c05cf6de5ce66b117928717e6b93f4e032b 100644
--- a/test/CodeGen/PowerPC/testComparesineuc.ll
+++ b/test/CodeGen/PowerPC/testComparesineuc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i8 0, align 1
 
@@ -15,6 +15,21 @@ define signext i32 @test_ineuc(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineuc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineuc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -30,6 +45,23 @@ define signext i32 @test_ineuc_sext(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineuc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineuc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, %b
   %sub = sext i1 %cmp to i32
@@ -43,6 +75,19 @@ define signext i32 @test_ineuc_z(i8 zeroext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineuc_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineuc_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, 0
   %conv1 = zext i1 %cmp to i32
@@ -57,6 +102,21 @@ define signext i32 @test_ineuc_sext_z(i8 zeroext %a) {
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineuc_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineuc_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, 0
   %sub = sext i1 %cmp to i32
@@ -66,14 +126,33 @@ entry:
 define void @test_ineuc_store(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: test_ineuc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineuc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineuc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -85,14 +164,35 @@ define void @test_ineuc_sext_store(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: test_ineuc_sext_store:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineuc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineuc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, %b
   %conv3 = sext i1 %cmp to i8
@@ -103,13 +203,30 @@ entry:
 define void @test_ineuc_z_store(i8 zeroext %a) {
 ; CHECK-LABEL: test_ineuc_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineuc_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineuc_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, 0
   %conv2 = zext i1 %cmp to i8
@@ -120,14 +237,33 @@ entry:
 define void @test_ineuc_sext_z_store(i8 zeroext %a) {
 ; CHECK-LABEL: test_ineuc_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineuc_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineuc_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i8 %a, 0
   %conv2 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testComparesineui.ll b/test/CodeGen/PowerPC/testComparesineui.ll
index ef126270f418ba95a4d7726d15a8877d9c841ea6..e8fb3dd85be77806d2619a3d84e1a2610b28c677 100644
--- a/test/CodeGen/PowerPC/testComparesineui.ll
+++ b/test/CodeGen/PowerPC/testComparesineui.ll
@@ -1,20 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
 @glob = common local_unnamed_addr global i32 0, align 4
 
 define signext i32 @test_ineui(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: test_ineui:
-; CHECK:    xor r3, r3, r4
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineui:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineui:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -23,12 +39,30 @@ entry:
 
 define signext i32 @test_ineui_sext(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: test_ineui_sext:
-; CHECK:    xor r3, r3, r4
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineui_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineui_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -37,10 +71,24 @@ entry:
 
 define signext i32 @test_ineui_z(i32 zeroext %a) {
 ; CHECK-LABEL: test_ineui_z:
-; CHECK:    cntlzw r3, r3
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineui_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineui_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -49,11 +97,27 @@ entry:
 
 define signext i32 @test_ineui_sext_z(i32 zeroext %a) {
 ; CHECK-LABEL: test_ineui_sext_z:
-; CHECK:    cntlzw r3, r3
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineui_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineui_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, 0
   %sub = sext i1 %cmp to i32
@@ -62,12 +126,34 @@ entry:
 
 define void @test_ineui_store(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: test_ineui_store:
-; CHECK:    xor r3, r3, r4
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    stw r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineui_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineui_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -77,13 +163,37 @@ entry:
 
 define void @test_ineui_sext_store(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: test_ineui_sext_store:
-; CHECK:    xor r3, r3, r4
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    neg r3, r3
-; CHECK:    stw r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineui_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineui_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -93,11 +203,31 @@ entry:
 
 define void @test_ineui_z_store(i32 zeroext %a) {
 ; CHECK-LABEL: test_ineui_z_store:
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    stw r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineui_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineui_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -107,12 +237,34 @@ entry:
 
 define void @test_ineui_sext_z_store(i32 zeroext %a) {
 ; CHECK-LABEL: test_ineui_sext_z_store:
-; CHECK:    cntlzw r3, r3
-; CHECK:    srwi r3, r3, 5
-; CHECK:    xori r3, r3, 1
-; CHECK:    neg r3, r3
-; CHECK:    stw r3, 0(r4)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineui_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineui_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i32 %a, 0
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testComparesineull.ll b/test/CodeGen/PowerPC/testComparesineull.ll
index 7f80de420fcd80693ceca3390bc7a5bc8af6e2bd..e288988bdcde6d22f641c2b4a8b423e4ef93bddf 100644
--- a/test/CodeGen/PowerPC/testComparesineull.ll
+++ b/test/CodeGen/PowerPC/testComparesineull.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i64 0, align 8
@@ -15,6 +15,19 @@ define signext i32 @test_ineull(i64 %a, i64 %b) {
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineull:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addic r4, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r4, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineull:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv = zext i1 %cmp to i32
@@ -28,6 +41,19 @@ define signext i32 @test_ineull_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineull_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineull_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %sub = sext i1 %cmp to i32
@@ -40,6 +66,17 @@ define signext i32 @test_ineull_z(i64 %a) {
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineull_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addic r4, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r4, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineull_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv = zext i1 %cmp to i32
@@ -52,6 +89,17 @@ define signext i32 @test_ineull_sext_z(i64 %a) {
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineull_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineull_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %sub = sext i1 %cmp to i32
@@ -61,13 +109,30 @@ entry:
 define void @test_ineull_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_ineull_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
-; CHECK-NEXT:    addic r5, r3, -1
-; CHECK-NEXT:    subfe r3, r5, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineull_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    addic r5, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r5, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineull_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -78,13 +143,30 @@ entry:
 define void @test_ineull_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_ineull_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineull_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineull_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -95,12 +177,27 @@ entry:
 define void @test_ineull_z_store(i64 %a) {
 ; CHECK-LABEL: test_ineull_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addic r5, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r5, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineull_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addic r5, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r5, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineull_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r5, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r5, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -111,12 +208,27 @@ entry:
 define void @test_ineull_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_ineull_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    subfic r3, r3, 0
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineull_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineull_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesineus.ll b/test/CodeGen/PowerPC/testComparesineus.ll
index 9efd5d65df61d2d0b43290a9bc308e7376f6490b..4a2f851fa647c4aad533970418e0585c7d88d093 100644
--- a/test/CodeGen/PowerPC/testComparesineus.ll
+++ b/test/CodeGen/PowerPC/testComparesineus.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i16 0, align 2
@@ -16,6 +16,21 @@ define signext i32 @test_ineus(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineus:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineus:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, %b
   %conv2 = zext i1 %cmp to i32
@@ -31,6 +46,23 @@ define signext i32 @test_ineus_sext(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineus_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineus_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, %b
   %sub = sext i1 %cmp to i32
@@ -44,6 +76,19 @@ define signext i32 @test_ineus_z(i16 zeroext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineus_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineus_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, 0
   %conv1 = zext i1 %cmp to i32
@@ -58,6 +103,21 @@ define signext i32 @test_ineus_sext_z(i16 zeroext %a) {
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineus_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineus_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, 0
   %sub = sext i1 %cmp to i32
@@ -67,14 +127,33 @@ entry:
 define void @test_ineus_store(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: test_ineus_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineus_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineus_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -86,14 +165,35 @@ define void @test_ineus_sext_store(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: test_ineus_sext_store:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineus_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineus_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, %b
   %conv3 = sext i1 %cmp to i16
@@ -104,13 +204,30 @@ entry:
 define void @test_ineus_z_store(i16 zeroext %a) {
 ; CHECK-LABEL: test_ineus_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineus_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineus_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, 0
   %conv2 = zext i1 %cmp to i16
@@ -121,14 +238,33 @@ entry:
 define void @test_ineus_sext_z_store(i16 zeroext %a) {
 ; CHECK-LABEL: test_ineus_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_ineus_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_ineus_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i16 %a, 0
   %conv2 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testCompareslleqsc.ll b/test/CodeGen/PowerPC/testCompareslleqsc.ll
index bdd4568c8e5c724cb450455d4ed56ec9fb5b5944..3e519edcd0cd3cf90f1f5f99f086b23fb7af9353 100644
--- a/test/CodeGen/PowerPC/testCompareslleqsc.ll
+++ b/test/CodeGen/PowerPC/testCompareslleqsc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; ModuleID = 'ComparisonTestCases/testCompareslleqsc.c'
 
@@ -17,6 +17,19 @@ define i64 @test_lleqsc(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = zext i1 %cmp to i64
@@ -32,6 +45,21 @@ define i64 @test_lleqsc_sext(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = sext i1 %cmp to i64
@@ -45,6 +73,17 @@ define i64 @test_lleqsc_z(i8 signext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsc_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsc_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = zext i1 %cmp to i64
@@ -59,6 +98,19 @@ define i64 @test_lleqsc_sext_z(i8 signext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsc_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsc_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = sext i1 %cmp to i64
@@ -69,13 +121,30 @@ entry:
 define void @test_lleqsc_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_lleqsc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -87,14 +156,33 @@ entry:
 define void @test_lleqsc_sext_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_lleqsc_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = sext i1 %cmp to i8
@@ -106,12 +194,27 @@ entry:
 define void @test_lleqsc_z_store(i8 signext %a) {
 ; CHECK-LABEL: test_lleqsc_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsc_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsc_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = zext i1 %cmp to i8
@@ -123,13 +226,30 @@ entry:
 define void @test_lleqsc_sext_z_store(i8 signext %a) {
 ; CHECK-LABEL: test_lleqsc_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsc_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsc_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testCompareslleqsi.ll b/test/CodeGen/PowerPC/testCompareslleqsi.ll
index 6d879c691ced53ff58760cf7f26176e772c64a29..4de1d588238cf90cd5d368e1e98e01eec7521bc1 100644
--- a/test/CodeGen/PowerPC/testCompareslleqsi.ll
+++ b/test/CodeGen/PowerPC/testCompareslleqsi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i32 0, align 4
@@ -16,6 +16,19 @@ define i64 @test_lleqsi(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -31,6 +44,21 @@ define i64 @test_lleqsi_sext(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -44,6 +72,17 @@ define i64 @test_lleqsi_z(i32 signext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsi_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsi_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -58,6 +97,19 @@ define i64 @test_lleqsi_sext_z(i32 signext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsi_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsi_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv1 = sext i1 %cmp to i64
@@ -68,13 +120,30 @@ entry:
 define void @test_lleqsi_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_lleqsi_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -86,14 +155,33 @@ entry:
 define void @test_lleqsi_sext_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_lleqsi_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -105,13 +193,28 @@ entry:
 define void @test_lleqsi_z_store(i32 signext %a) {
 ; CHECK-LABEL: test_lleqsi_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
 ; CHECKNEXT:    blr
+; CHECK-BE-LABEL: test_lleqsi_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsi_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -123,13 +226,30 @@ entry:
 define void @test_lleqsi_sext_z_store(i32 signext %a) {
 ; CHECK-LABEL: test_lleqsi_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsi_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsi_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testCompareslleqsll.ll b/test/CodeGen/PowerPC/testCompareslleqsll.ll
index f997bd900cabea9904d9f55660915921511b46dd..ac92d171a68bbb16930d092947175c16264c6dd7 100644
--- a/test/CodeGen/PowerPC/testCompareslleqsll.ll
+++ b/test/CodeGen/PowerPC/testCompareslleqsll.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i64 0, align 8
@@ -16,6 +16,19 @@ define i64 @test_lleqsll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -30,6 +43,19 @@ define i64 @test_lleqsll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -43,6 +69,17 @@ define i64 @test_lleqsll_z(i64 %a) {
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -56,6 +93,17 @@ define i64 @test_lleqsll_sext_z(i64 %a) {
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = sext i1 %cmp to i64
@@ -66,13 +114,30 @@ entry:
 define void @test_lleqsll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_lleqsll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -84,13 +149,30 @@ entry:
 define void @test_lleqsll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_lleqsll_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -102,12 +184,27 @@ entry:
 define void @test_lleqsll_z_store(i64 %a) {
 ; CHECK-LABEL: test_lleqsll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzd r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -119,12 +216,27 @@ entry:
 define void @test_lleqsll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_lleqsll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addic r3, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqsll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqsll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testCompareslleqss.ll b/test/CodeGen/PowerPC/testCompareslleqss.ll
index 0c4edc33a729d41f1ef64c07b4ac08f240d75c03..d65990f9dc936c11b7b1b89d1c5f74529387e3f5 100644
--- a/test/CodeGen/PowerPC/testCompareslleqss.ll
+++ b/test/CodeGen/PowerPC/testCompareslleqss.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i16 0, align 2
@@ -16,6 +16,19 @@ define i64 @test_lleqss(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqss:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqss:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = zext i1 %cmp to i64
@@ -31,6 +44,21 @@ define i64 @test_lleqss_sext(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqss_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqss_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = sext i1 %cmp to i64
@@ -44,6 +72,17 @@ define i64 @test_lleqss_z(i16 signext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqss_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqss_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = zext i1 %cmp to i64
@@ -58,6 +97,19 @@ define i64 @test_lleqss_sext_z(i16 signext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqss_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqss_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = sext i1 %cmp to i64
@@ -68,13 +120,30 @@ entry:
 define void @test_lleqss_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_lleqss_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqss_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqss_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -86,14 +155,33 @@ entry:
 define void @test_lleqss_sext_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_lleqss_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqss_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqss_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = sext i1 %cmp to i16
@@ -105,12 +193,27 @@ entry:
 define void @test_lleqss_z_store(i16 signext %a) {
 ; CHECK-LABEL: test_lleqss_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqss_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqss_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = zext i1 %cmp to i16
@@ -122,13 +225,30 @@ entry:
 define void @test_lleqss_sext_z_store(i16 signext %a) {
 ; CHECK-LABEL: test_lleqss_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lleqss_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lleqss_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesllequc.ll b/test/CodeGen/PowerPC/testComparesllequc.ll
index 85523f05fb87af8f5a58ffb38ea3a04198ae76dc..1672d19a4a3da6a36b2459cbb6c2cf155513f6d7 100644
--- a/test/CodeGen/PowerPC/testComparesllequc.ll
+++ b/test/CodeGen/PowerPC/testComparesllequc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i8 0, align 1
@@ -16,6 +16,19 @@ define i64 @test_llequc(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = zext i1 %cmp to i64
@@ -31,6 +44,21 @@ define i64 @test_llequc_sext(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = sext i1 %cmp to i64
@@ -44,6 +72,17 @@ define i64 @test_llequc_z(i8 zeroext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequc_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequc_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = zext i1 %cmp to i64
@@ -58,6 +97,19 @@ define i64 @test_llequc_sext_z(i8 zeroext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequc_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequc_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = sext i1 %cmp to i64
@@ -68,13 +120,30 @@ entry:
 define void @test_llequc_store(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: test_llequc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -86,14 +155,33 @@ entry:
 define void @test_llequc_sext_store(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: test_llequc_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, %b
   %conv3 = sext i1 %cmp to i8
@@ -105,12 +193,27 @@ entry:
 define void @test_llequc_z_store(i8 zeroext %a) {
 ; CHECK-LABEL: test_llequc_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequc_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequc_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = zext i1 %cmp to i8
@@ -122,13 +225,30 @@ entry:
 define void @test_llequc_sext_z_store(i8 zeroext %a) {
 ; CHECK-LABEL: test_llequc_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequc_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequc_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i8 %a, 0
   %conv2 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testComparesllequi.ll b/test/CodeGen/PowerPC/testComparesllequi.ll
index cb7be180ee6788da2bd0992ac25be3f8e2711940..fd1d13bae7f6b929480f1817178941d8be8a6f62 100644
--- a/test/CodeGen/PowerPC/testComparesllequi.ll
+++ b/test/CodeGen/PowerPC/testComparesllequi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i32 0, align 4
@@ -16,6 +16,19 @@ define i64 @test_llequi(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -31,6 +44,21 @@ define i64 @test_llequi_sext(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -44,6 +72,17 @@ define i64 @test_llequi_z(i32 zeroext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequi_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequi_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -58,6 +97,19 @@ define i64 @test_llequi_sext_z(i32 zeroext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequi_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequi_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv1 = sext i1 %cmp to i64
@@ -68,13 +120,30 @@ entry:
 define void @test_llequi_store(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: test_llequi_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -86,14 +155,33 @@ entry:
 define void @test_llequi_sext_store(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: test_llequi_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, %b
   %sub = sext i1 %cmp to i32
@@ -105,12 +193,27 @@ entry:
 define void @test_llequi_z_store(i32 zeroext %a) {
 ; CHECK-LABEL: test_llequi_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequi_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequi_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -122,13 +225,30 @@ entry:
 define void @test_llequi_sext_z_store(i32 zeroext %a) {
 ; CHECK-LABEL: test_llequi_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequi_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequi_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i32 %a, 0
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testComparesllequll.ll b/test/CodeGen/PowerPC/testComparesllequll.ll
index 01136f13a4ee24782c99e474b01c2cbe4773688e..a42b5e4c7b7b148ebe39bdbf10b9e6e38dd1cd77 100644
--- a/test/CodeGen/PowerPC/testComparesllequll.ll
+++ b/test/CodeGen/PowerPC/testComparesllequll.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i64 0, align 8
@@ -16,6 +16,19 @@ define i64 @test_llequll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -30,6 +43,19 @@ define i64 @test_llequll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -43,6 +69,17 @@ define i64 @test_llequll_z(i64 %a) {
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -56,6 +93,17 @@ define i64 @test_llequll_sext_z(i64 %a) {
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = sext i1 %cmp to i64
@@ -66,13 +114,30 @@ entry:
 define void @test_llequll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_llequll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -84,13 +149,30 @@ entry:
 define void @test_llequll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_llequll_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    addic r3, r3, -1
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -102,12 +184,27 @@ entry:
 define void @test_llequll_z_store(i64 %a) {
 ; CHECK-LABEL: test_llequll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzd r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzd r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzd r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -119,12 +216,27 @@ entry:
 define void @test_llequll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_llequll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addic r3, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addic r3, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r3, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i64 %a, 0
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesllequs.ll b/test/CodeGen/PowerPC/testComparesllequs.ll
index 459df8baf4608d91d2d1c92bc592da54fde72bdb..e3dde77aba7e6799e0696f333a1a174387daaec8 100644
--- a/test/CodeGen/PowerPC/testComparesllequs.ll
+++ b/test/CodeGen/PowerPC/testComparesllequs.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i16 0, align 2
@@ -16,6 +16,19 @@ define i64 @test_llequs(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequs:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequs:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = zext i1 %cmp to i64
@@ -31,6 +44,21 @@ define i64 @test_llequs_sext(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequs_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequs_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = sext i1 %cmp to i64
@@ -44,6 +72,17 @@ define i64 @test_llequs_z(i16 zeroext %a) {
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequs_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequs_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = zext i1 %cmp to i64
@@ -58,6 +97,19 @@ define i64 @test_llequs_sext_z(i16 zeroext %a) {
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequs_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequs_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = sext i1 %cmp to i64
@@ -68,13 +120,30 @@ entry:
 define void @test_llequs_store(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: test_llequs_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequs_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequs_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -86,14 +155,33 @@ entry:
 define void @test_llequs_sext_store(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: test_llequs_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequs_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequs_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, %b
   %conv3 = sext i1 %cmp to i16
@@ -105,12 +193,27 @@ entry:
 define void @test_llequs_z_store(i16 zeroext %a) {
 ; CHECK-LABEL: test_llequs_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequs_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequs_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = zext i1 %cmp to i16
@@ -122,13 +225,30 @@ entry:
 define void @test_llequs_sext_z_store(i16 zeroext %a) {
 ; CHECK-LABEL: test_llequs_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llequs_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    cntlzw r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    srwi r3, r3, 5
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llequs_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    cntlzw r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    srwi r3, r3, 5
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp eq i16 %a, 0
   %conv2 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesllgesc.ll b/test/CodeGen/PowerPC/testComparesllgesc.ll
index 39499a6e895473353a4c63f97ef89e6cbe245ecc..b97da250323bc3432b7fa1a8cd6b8a9db52af4ed 100644
--- a/test/CodeGen/PowerPC/testComparesllgesc.ll
+++ b/test/CodeGen/PowerPC/testComparesllgesc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i8 0, align 1
 
@@ -14,6 +14,19 @@ define i64 @test_llgesc(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i8 %a, %b
   %conv3 = zext i1 %cmp to i64
@@ -27,6 +40,19 @@ define i64 @test_llgesc_sext(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i8 %a, %b
   %conv3 = sext i1 %cmp to i64
@@ -36,13 +62,30 @@ entry:
 define void @test_llgesc_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_llgesc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -53,13 +96,30 @@ entry:
 define void @test_llgesc_sext_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_llgesc_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i8 %a, %b
   %conv3 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testComparesllgesi.ll b/test/CodeGen/PowerPC/testComparesllgesi.ll
index d0200001befad2d96efd42d4df90877d735fc412..cc23babc00055421689df953e619d33406b70538 100644
--- a/test/CodeGen/PowerPC/testComparesllgesi.ll
+++ b/test/CodeGen/PowerPC/testComparesllgesi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i32 0, align 4
 
@@ -14,6 +14,19 @@ define i64 @test_llgesi(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i32 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -27,6 +40,19 @@ define i64 @test_llgesi_sext(i32 signext %a, i32 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i32 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -36,13 +62,30 @@ entry:
 define void @test_llgesi_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_llgesi_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -53,13 +96,30 @@ entry:
 define void @test_llgesi_sext_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_llgesi_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i32 %a, %b
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testComparesllgesll.ll b/test/CodeGen/PowerPC/testComparesllgesll.ll
index f2096c28ee208576a134e536684418bb3624ee31..1c119b950a35a40d546594b9277b6da28ff5c517 100644
--- a/test/CodeGen/PowerPC/testComparesllgesll.ll
+++ b/test/CodeGen/PowerPC/testComparesllgesll.ll
@@ -1,10 +1,10 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 @glob = common local_unnamed_addr global i64 0, align 8
 
 define i64 @test_llgesll(i64 %a, i64 %b) {
@@ -15,6 +15,21 @@ define i64 @test_llgesll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    subfc r3, r4, r3
 ; CHECK-NEXT:    adde r3, r5, r6
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r5, r3, 63
+; CHECK-BE-NEXT:    rldicl r6, r4, 1, 63
+; CHECK-BE-NEXT:    subfc r3, r4, r3
+; CHECK-BE-NEXT:    adde r3, r5, r6
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r5, r3, 63
+; CHECK-LE-NEXT:    rldicl r6, r4, 1, 63
+; CHECK-LE-NEXT:    subfc r3, r4, r3
+; CHECK-LE-NEXT:    adde r3, r5, r6
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -30,6 +45,23 @@ define i64 @test_llgesll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    adde r3, r5, r6
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r5, r3, 63
+; CHECK-BE-NEXT:    rldicl r6, r4, 1, 63
+; CHECK-BE-NEXT:    subfc r3, r4, r3
+; CHECK-BE-NEXT:    adde r3, r5, r6
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r5, r3, 63
+; CHECK-LE-NEXT:    rldicl r6, r4, 1, 63
+; CHECK-LE-NEXT:    subfc r3, r4, r3
+; CHECK-LE-NEXT:    adde r3, r5, r6
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -42,6 +74,17 @@ define i64 @test_llgesll_z(i64 %a) {
 ; CHECK-NEXT:    not r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    not r3, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    not r3, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sgt i64 %a, -1
   %conv1 = zext i1 %cmp to i64
@@ -54,6 +97,17 @@ define i64 @test_llgesll_sext_z(i64 %a) {
 ; CHECK-NEXT:    not r3, r3
 ; CHECK-NEXT:    sradi r3, r3, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    not r3, r3
+; CHECK-BE-NEXT:    sradi r3, r3, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    not r3, r3
+; CHECK-LE-NEXT:    sradi r3, r3, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sgt i64 %a, -1
   %conv1 = sext i1 %cmp to i64
@@ -63,12 +117,33 @@ entry:
 define void @test_llgesll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_llgesll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK:    sradi r6, r3, 63
-; CHECK:    subfc r3, r4, r3
-; CHECK:    rldicl r3, r4, 1, 63
-; CHECK:    adde r3, r6, r3
-; CHECK:    std r3,
+; CHECK-NEXT:    sradi r6, r3, 63
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    subfc r3, r4, r3
+; CHECK-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-NEXT:    adde r3, r6, r3
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sradi r6, r3, 63
+; CHECK-BE-NEXT:    ld r5, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    subfc r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-BE-NEXT:    adde r3, r6, r3
+; CHECK-BE-NEXT:    std r3, 0(r5)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r6, r3, 63
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfc r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-LE-NEXT:    adde r3, r6, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -80,14 +155,35 @@ define void @test_llgesll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_llgesll_sext_store:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    sradi r6, r3, 63
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfc r3, r4, r3
 ; CHECK-NEXT:    rldicl r3, r4, 1, 63
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    adde r3, r6, r3
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r6, r3, 63
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    subfc r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    adde r3, r6, r3
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r6, r3, 63
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfc r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r4, 1, 63
+; CHECK-LE-NEXT:    adde r3, r6, r3
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -98,12 +194,27 @@ entry:
 define void @test_llgesll_z_store(i64 %a) {
 ; CHECK-LABEL: test_llgesll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    not r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    not r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    not r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sgt i64 %a, -1
   %conv1 = zext i1 %cmp to i64
@@ -114,12 +225,27 @@ entry:
 define void @test_llgesll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_llgesll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    not r3, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    sradi r3, r3, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgesll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    not r3, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    sradi r3, r3, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgesll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    not r3, r3
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    sradi r3, r3, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sgt i64 %a, -1
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesllgess.ll b/test/CodeGen/PowerPC/testComparesllgess.ll
index 71af3d34ce5d8bbcb8ee9b240593bdb058054d7e..400c11bef4b61b1ba85df3c98051333553bdfd5c 100644
--- a/test/CodeGen/PowerPC/testComparesllgess.ll
+++ b/test/CodeGen/PowerPC/testComparesllgess.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i16 0, align 2
 
@@ -14,6 +14,19 @@ define i64 @test_llgess(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgess:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgess:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i16 %a, %b
   %conv3 = zext i1 %cmp to i64
@@ -27,6 +40,19 @@ define i64 @test_llgess_sext(i16 signext %a, i16 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgess_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgess_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i16 %a, %b
   %conv3 = sext i1 %cmp to i64
@@ -36,13 +62,30 @@ entry:
 define void @test_llgess_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_llgess_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgess_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgess_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -53,13 +96,30 @@ entry:
 define void @test_llgess_sext_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_llgess_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llgess_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llgess_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sge i16 %a, %b
   %conv3 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesllgeuc.ll b/test/CodeGen/PowerPC/testComparesllgeuc.ll
index e56e09b37ac3c20bcc33127f093d3fd70c3282ae..f1f50dd5173f7dd523305a74e34fb9d5a47007c1 100644
--- a/test/CodeGen/PowerPC/testComparesllgeuc.ll
+++ b/test/CodeGen/PowerPC/testComparesllgeuc.ll
@@ -15,8 +15,8 @@ entry:
   ret i64 %conv3
 ; CHECK-LABEL: test_llgeuc:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -64,8 +64,8 @@ entry:
   ret void
 ; CHECK_LABEL: test_llgeuc_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesllgeui.ll b/test/CodeGen/PowerPC/testComparesllgeui.ll
index 9e971b140de4fa852f8559645e08a92391ec5e05..116cc946467ecf2696782e57932031c437e99454 100644
--- a/test/CodeGen/PowerPC/testComparesllgeui.ll
+++ b/test/CodeGen/PowerPC/testComparesllgeui.ll
@@ -15,8 +15,8 @@ entry:
   ret i64 %conv1
 ; CHECK-LABEL: test_llgeui:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -64,8 +64,8 @@ entry:
   ret void
 ; CHECK_LABEL: test_igeuc_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesllgeus.ll b/test/CodeGen/PowerPC/testComparesllgeus.ll
index 4520ca3ead5973ac34aa229ce963cc47e9912b46..67d6d190602792bacd51431fd7b68d37a0557b85 100644
--- a/test/CodeGen/PowerPC/testComparesllgeus.ll
+++ b/test/CodeGen/PowerPC/testComparesllgeus.ll
@@ -15,8 +15,8 @@ entry:
   ret i64 %conv3
 ; CHECK-LABEL: test_llgeus:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -64,8 +64,8 @@ entry:
   ret void
 ; CHECK_LABEL: test_llgeus_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r3, r4
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG2]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesllgtsll.ll b/test/CodeGen/PowerPC/testComparesllgtsll.ll
index 0dc1374374f792d20eec86bd96698bc77539c24c..deb9ad5621d02b1c5cd9f45e0a3a316411b5fb80 100644
--- a/test/CodeGen/PowerPC/testComparesllgtsll.ll
+++ b/test/CodeGen/PowerPC/testComparesllgtsll.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesllgtuc.ll b/test/CodeGen/PowerPC/testComparesllgtuc.ll
index ba70713d61e58352f3a3bf9d9761097083dd2627..685026ad03d1cb5b3e122bc677426293a59a2913 100644
--- a/test/CodeGen/PowerPC/testComparesllgtuc.ll
+++ b/test/CodeGen/PowerPC/testComparesllgtuc.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesllgtui.ll b/test/CodeGen/PowerPC/testComparesllgtui.ll
index d07e85972f2dc01eaf39098466c7f77279294b24..a1c8777e3832a473491f17f735a00acd66e4b707 100644
--- a/test/CodeGen/PowerPC/testComparesllgtui.ll
+++ b/test/CodeGen/PowerPC/testComparesllgtui.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesllgtus.ll b/test/CodeGen/PowerPC/testComparesllgtus.ll
index 3758e8e097cee35df01b093fd590a7b6d28565e6..94bb145460642d9c9ea1fc8549e2d4c944729607 100644
--- a/test/CodeGen/PowerPC/testComparesllgtus.ll
+++ b/test/CodeGen/PowerPC/testComparesllgtus.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testCompareslllesc.ll b/test/CodeGen/PowerPC/testCompareslllesc.ll
index 575451ec7fdbeabe1fbabc86a461cbe646e02be8..1a5ad44ee080b14b1a79fa13c7cedf2b61c39009 100644
--- a/test/CodeGen/PowerPC/testCompareslllesc.ll
+++ b/test/CodeGen/PowerPC/testCompareslllesc.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i8 0, align 1
@@ -15,6 +15,19 @@ define i64 @test_lllesc(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesc:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesc:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i8 %a, %b
   %conv3 = zext i1 %cmp to i64
@@ -28,6 +41,19 @@ define i64 @test_lllesc_sext(i8 signext %a, i8 signext %b) {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesc_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesc_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i8 %a, %b
   %conv3 = sext i1 %cmp to i64
@@ -37,13 +63,30 @@ entry:
 define void @test_lllesc_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_lllesc_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesc_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesc_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i8 %a, %b
   %conv3 = zext i1 %cmp to i8
@@ -54,13 +97,30 @@ entry:
 define void @test_lllesc_sext_store(i8 signext %a, i8 signext %b) {
 ; CHECK-LABEL: test_lllesc_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    stb r3, 0(r4)
+; CHECK-NEXT:    stb r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesc_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    stb r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesc_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    stb r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i8 %a, %b
   %conv3 = sext i1 %cmp to i8
diff --git a/test/CodeGen/PowerPC/testCompareslllesi.ll b/test/CodeGen/PowerPC/testCompareslllesi.ll
index e04641d608ed8286cd642129df5c9424eebdcd3f..9b79c1739a5c42e6a8c9d4fa9dc639ae356a9b7c 100644
--- a/test/CodeGen/PowerPC/testCompareslllesi.ll
+++ b/test/CodeGen/PowerPC/testCompareslllesi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i32 0, align 4
@@ -15,6 +15,19 @@ define i64 @test_lllesi(i32 signext %a, i32 signext %b)  {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesi:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesi:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i32 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -28,6 +41,19 @@ define i64 @test_lllesi_sext(i32 signext %a, i32 signext %b)  {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesi_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesi_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i32 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -37,13 +63,30 @@ entry:
 define void @test_lllesi_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_lllesi_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesi_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesi_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i32 %a, %b
   %conv = zext i1 %cmp to i32
@@ -54,13 +97,30 @@ entry:
 define void @test_lllesi_sext_store(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: test_lllesi_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    stw r3, 0(r4)
+; CHECK-NEXT:    stw r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesi_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    stw r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesi_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    stw r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i32 %a, %b
   %sub = sext i1 %cmp to i32
diff --git a/test/CodeGen/PowerPC/testCompareslllesll.ll b/test/CodeGen/PowerPC/testCompareslllesll.ll
index b5c340d7d58fb97bb805d1e7c011b579f09b112c..83ce2f812ae14056b6611d9333f927aa7e5050fb 100644
--- a/test/CodeGen/PowerPC/testCompareslllesll.ll
+++ b/test/CodeGen/PowerPC/testCompareslllesll.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 @glob = common local_unnamed_addr global i64 0, align 8
 
@@ -16,6 +16,21 @@ define i64 @test_lllesll(i64 %a, i64 %b)  {
 ; CHECK-NEXT:    subfc r3, r3, r4
 ; CHECK-NEXT:    adde r3, r5, r6
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r5, r4, 63
+; CHECK-BE-NEXT:    rldicl r6, r3, 1, 63
+; CHECK-BE-NEXT:    subfc r3, r3, r4
+; CHECK-BE-NEXT:    adde r3, r5, r6
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r5, r4, 63
+; CHECK-LE-NEXT:    rldicl r6, r3, 1, 63
+; CHECK-LE-NEXT:    subfc r3, r3, r4
+; CHECK-LE-NEXT:    adde r3, r5, r6
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -32,6 +47,23 @@ define i64 @test_lllesll_sext(i64 %a, i64 %b)  {
 ; CHECK-NEXT:    adde r3, r5, r6
 ; CHECK-NEXT:    neg r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r5, r4, 63
+; CHECK-BE-NEXT:    rldicl r6, r3, 1, 63
+; CHECK-BE-NEXT:    subfc r3, r3, r4
+; CHECK-BE-NEXT:    adde r3, r5, r6
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r5, r4, 63
+; CHECK-LE-NEXT:    rldicl r6, r3, 1, 63
+; CHECK-LE-NEXT:    subfc r3, r3, r4
+; CHECK-LE-NEXT:    adde r3, r5, r6
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -46,6 +78,19 @@ define i64 @test_lllesll_z(i64 %a)  {
 ; CHECK-NEXT:    or r3, r4, r3
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addi r4, r3, -1
+; CHECK-BE-NEXT:    or r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addi r4, r3, -1
+; CHECK-LE-NEXT:    or r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp slt i64 %a, 1
   %conv1 = zext i1 %cmp to i64
@@ -60,6 +105,19 @@ define i64 @test_lllesll_sext_z(i64 %a)  {
 ; CHECK-NEXT:    or r3, r4, r3
 ; CHECK-NEXT:    sradi r3, r3, 63
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addi r4, r3, -1
+; CHECK-BE-NEXT:    or r3, r4, r3
+; CHECK-BE-NEXT:    sradi r3, r3, 63
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addi r4, r3, -1
+; CHECK-LE-NEXT:    or r3, r4, r3
+; CHECK-LE-NEXT:    sradi r3, r3, 63
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp slt i64 %a, 1
   %conv1 = sext i1 %cmp to i64
@@ -70,14 +128,33 @@ entry:
 define void @test_lllesll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_lllesll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sradi r6, r4, 63
-; CHECK-NEXT:    ld r5, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfc r4, r3, r4
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    adde r3, r6, r3
-; CHECK-NEXT:    std r3, 0(r5)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sradi r6, r4, 63
+; CHECK-BE-NEXT:    ld r5, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    subfc r4, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    adde r3, r6, r3
+; CHECK-BE-NEXT:    std r3, 0(r5)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r6, r4, 63
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfc r4, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    adde r3, r6, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -90,14 +167,35 @@ define void @test_lllesll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_lllesll_sext_store:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    sradi r6, r4, 63
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfc r4, r3, r4
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
 ; CHECK-NEXT:    adde r3, r6, r3
 ; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sradi r6, r4, 63
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    subfc r4, r3, r4
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    adde r3, r6, r3
+; CHECK-BE-NEXT:    neg r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sradi r6, r4, 63
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfc r4, r3, r4
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    adde r3, r6, r3
+; CHECK-LE-NEXT:    neg r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -109,13 +207,30 @@ entry:
 define void @test_lllesll_z_store(i64 %a) {
 ; CHECK-LABEL: test_lllesll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addi r5, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    or r3, r5, r3
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addi r5, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    or r3, r5, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addi r5, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    or r3, r5, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp slt i64 %a, 1
   %conv1 = zext i1 %cmp to i64
@@ -127,13 +242,30 @@ entry:
 define void @test_lllesll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_lllesll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addi r5, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    or r3, r5, r3
 ; CHECK-NEXT:    sradi r3, r3, 63
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_lllesll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addi r5, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    or r3, r5, r3
+; CHECK-BE-NEXT:    sradi r3, r3, 63
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_lllesll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addi r5, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    or r3, r5, r3
+; CHECK-LE-NEXT:    sradi r3, r3, 63
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp slt i64 %a, 1
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesllless.ll b/test/CodeGen/PowerPC/testComparesllless.ll
index 1f066524cc46a3b171aa34f82e0e3ba4d29a8c7a..6f8657dac4b1d7a0c584e043a57a09d01b58841e 100644
--- a/test/CodeGen/PowerPC/testComparesllless.ll
+++ b/test/CodeGen/PowerPC/testComparesllless.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:   --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i16 0, align 2
@@ -15,6 +15,19 @@ define i64 @test_llless(i16 signext %a, i16 signext %b)  {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llless:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llless:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i16 %a, %b
   %conv3 = zext i1 %cmp to i64
@@ -28,6 +41,19 @@ define i64 @test_llless_sext(i16 signext %a, i16 signext %b)  {
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llless_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llless_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i16 %a, %b
   %conv3 = sext i1 %cmp to i64
@@ -37,13 +63,30 @@ entry:
 define void @test_llless_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_llless_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llless_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    xori r3, r3, 1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llless_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    xori r3, r3, 1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i16 %a, %b
   %conv3 = zext i1 %cmp to i16
@@ -54,13 +97,30 @@ entry:
 define void @test_llless_sext_store(i16 signext %a, i16 signext %b) {
 ; CHECK-LABEL: test_llless_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    sub r3, r4, r3
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    rldicl r3, r3, 1, 63
 ; CHECK-NEXT:    addi r3, r3, -1
-; CHECK-NEXT:    sth r3, 0(r4)
+; CHECK-NEXT:    sth r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llless_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    sub r3, r4, r3
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-BE-NEXT:    addi r3, r3, -1
+; CHECK-BE-NEXT:    sth r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llless_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    sub r3, r4, r3
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    rldicl r3, r3, 1, 63
+; CHECK-LE-NEXT:    addi r3, r3, -1
+; CHECK-LE-NEXT:    sth r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp sle i16 %a, %b
   %conv3 = sext i1 %cmp to i16
diff --git a/test/CodeGen/PowerPC/testComparesllleuc.ll b/test/CodeGen/PowerPC/testComparesllleuc.ll
index 3e4beb73b6fba1b51db2d6ef8b15cac50640c130..b2df0167f4864147144fac07a56c1d73856fa620 100644
--- a/test/CodeGen/PowerPC/testComparesllleuc.ll
+++ b/test/CodeGen/PowerPC/testComparesllleuc.ll
@@ -15,8 +15,8 @@ entry:
   ret i64 %conv3
 ; CHECK-LABEL: test_llleuc:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK-NEXT: blr
 }
 
@@ -67,8 +67,8 @@ entry:
   ret void
 ; CHECK-LABEL: test_llleuc_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesllleui.ll b/test/CodeGen/PowerPC/testComparesllleui.ll
index 0dca0dcb56fae235f9f4234ca6ea1efec0207791..fa77495d7e548082ec15f0c5141e2272dabdd4a8 100644
--- a/test/CodeGen/PowerPC/testComparesllleui.ll
+++ b/test/CodeGen/PowerPC/testComparesllleui.ll
@@ -15,8 +15,8 @@ entry:
   ret i64 %conv1
 ; CHECK-LABEL: test_llleui:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -67,8 +67,8 @@ entry:
   ret void
 ; CHECK-LABEL: test_llleui_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesllleus.ll b/test/CodeGen/PowerPC/testComparesllleus.ll
index 422a2b3f0f26c9a202d6bf2c9c7b8978dd164b08..51ebb5e572d9b0ae57fcb72454dd106cc1391291 100644
--- a/test/CodeGen/PowerPC/testComparesllleus.ll
+++ b/test/CodeGen/PowerPC/testComparesllleus.ll
@@ -15,8 +15,8 @@ entry:
   ret i64 %conv3
 ; CHECK-LABEL: test_llleus:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK-NEXT: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK-NEXT: xori r3, [[REG2]], 1
+; CHECK-NEXT: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
@@ -67,8 +67,8 @@ entry:
   ret void
 ; CHECK-LABEL: test_llleus_store:
 ; CHECK: sub [[REG1:r[0-9]+]], r4, r3
-; CHECK: rldicl [[REG2:r[0-9]+]], [[REG1]], 1, 63
-; CHECK: xori {{r[0-9]+}}, [[REG2]], 1
+; CHECK: not [[REG2:r[0-9]+]], [[REG1]]
+; CHECK-NEXT: rldicl r3, [[REG2]], 1, 63
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/testComparesllltui.ll b/test/CodeGen/PowerPC/testComparesllltui.ll
index e785942b3c9fff1885975d5cab4f33f4a5dc7855..126d110d04cb99fbcf6d420fc21d7796de8a0d25 100644
--- a/test/CodeGen/PowerPC/testComparesllltui.ll
+++ b/test/CodeGen/PowerPC/testComparesllltui.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: llc --relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
 ; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
diff --git a/test/CodeGen/PowerPC/testComparesllnesll.ll b/test/CodeGen/PowerPC/testComparesllnesll.ll
index 47545e94bfff607a5bcf7003e663cb692218a770..10bff2f7d94e421467bee1813f537fa6b39a6016 100644
--- a/test/CodeGen/PowerPC/testComparesllnesll.ll
+++ b/test/CodeGen/PowerPC/testComparesllnesll.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i64 0, align 8
@@ -15,6 +15,19 @@ define i64 @test_llnesll(i64 %a, i64 %b) {
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llnesll:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addic r4, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r4, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llnesll:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -28,6 +41,19 @@ define i64 @test_llnesll_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llnesll_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llnesll_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -40,6 +66,17 @@ define i64 @test_llnesll_z(i64 %a) {
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llnesll_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addic r4, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r4, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llnesll_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -52,6 +89,17 @@ define i64 @test_llnesll_sext_z(i64 %a) {
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llnesll_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llnesll_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = sext i1 %cmp to i64
@@ -61,13 +109,30 @@ entry:
 define void @test_llnesll_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_llnesll_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
-; CHECK-NEXT:    addic r5, r3, -1
-; CHECK-NEXT:    subfe r3, r5, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llnesll_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    addic r5, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r5, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llnesll_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -78,13 +143,30 @@ entry:
 define void @test_llnesll_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_llnesll_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llnesll_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llnesll_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -95,12 +177,27 @@ entry:
 define void @test_llnesll_z_store(i64 %a) {
 ; CHECK-LABEL: test_llnesll_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addic r5, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r5, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llnesll_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addic r5, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r5, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llnesll_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r5, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r5, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -111,12 +208,27 @@ entry:
 define void @test_llnesll_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_llnesll_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    subfic r3, r3, 0
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llnesll_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llnesll_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/testComparesllneull.ll b/test/CodeGen/PowerPC/testComparesllneull.ll
index b2c5086fcba7cefeb1a6784afd0556234164db9b..9e6be97e8c176ce1942245dc91916a54e8475089 100644
--- a/test/CodeGen/PowerPC/testComparesllneull.ll
+++ b/test/CodeGen/PowerPC/testComparesllneull.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-BE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN:   -ppc-gpr-icmps=all -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-LE \
 ; RUN:  --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
 
 @glob = common local_unnamed_addr global i64 0, align 8
@@ -15,6 +15,19 @@ define i64 @test_llneull(i64 %a, i64 %b) {
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llneull:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    addic r4, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r4, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llneull:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -28,6 +41,19 @@ define i64 @test_llneull_sext(i64 %a, i64 %b) {
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llneull_sext:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llneull_sext:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -40,6 +66,17 @@ define i64 @test_llneull_z(i64 %a) {
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llneull_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addic r4, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r4, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llneull_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -52,6 +89,17 @@ define i64 @test_llneull_sext_z(i64 %a) {
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llneull_sext_z:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llneull_sext_z:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = sext i1 %cmp to i64
@@ -61,13 +109,30 @@ entry:
 define void @test_llneull_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_llneull_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
-; CHECK-NEXT:    addic r5, r3, -1
-; CHECK-NEXT:    subfe r3, r5, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llneull_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    addic r5, r3, -1
+; CHECK-BE-NEXT:    subfe r3, r5, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llneull_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    addic r4, r3, -1
+; CHECK-LE-NEXT:    subfe r3, r4, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = zext i1 %cmp to i64
@@ -78,13 +143,30 @@ entry:
 define void @test_llneull_sext_store(i64 %a, i64 %b) {
 ; CHECK-LABEL: test_llneull_sext_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT:    addis r5, r2, glob@toc@ha
 ; CHECK-NEXT:    subfic r3, r3, 0
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r5)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llneull_sext_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    xor r3, r3, r4
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r5)
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llneull_sext_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xor r3, r3, r4
+; CHECK-LE-NEXT:    addis r5, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r5)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, %b
   %conv1 = sext i1 %cmp to i64
@@ -95,12 +177,27 @@ entry:
 define void @test_llneull_z_store(i64 %a) {
 ; CHECK-LABEL: test_llneull_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    addic r5, r3, -1
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r5, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llneull_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    addic r5, r3, -1
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r5, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llneull_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    addic r5, r3, -1
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r5, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = zext i1 %cmp to i64
@@ -111,12 +208,27 @@ entry:
 define void @test_llneull_sext_z_store(i64 %a) {
 ; CHECK-LABEL: test_llneull_sext_z_store:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r4, r2, .LC0@toc@ha
 ; CHECK-NEXT:    subfic r3, r3, 0
-; CHECK-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT:    addis r4, r2, glob@toc@ha
 ; CHECK-NEXT:    subfe r3, r3, r3
-; CHECK-NEXT:    std r3, 0(r4)
+; CHECK-NEXT:    std r3, glob@toc@l(r4)
 ; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: test_llneull_sext_z_store:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LC0@toc@ha
+; CHECK-BE-NEXT:    subfic r3, r3, 0
+; CHECK-BE-NEXT:    ld r4, .LC0@toc@l(r4)
+; CHECK-BE-NEXT:    subfe r3, r3, r3
+; CHECK-BE-NEXT:    std r3, 0(r4)
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: test_llneull_sext_z_store:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    subfic r3, r3, 0
+; CHECK-LE-NEXT:    addis r4, r2, glob@toc@ha
+; CHECK-LE-NEXT:    subfe r3, r3, r3
+; CHECK-LE-NEXT:    std r3, glob@toc@l(r4)
+; CHECK-LE-NEXT:    blr
 entry:
   %cmp = icmp ne i64 %a, 0
   %conv1 = sext i1 %cmp to i64
diff --git a/test/CodeGen/PowerPC/toc-float.ll b/test/CodeGen/PowerPC/toc-float.ll
index 4f5c34110eafd0d935d4efce2bf7dfe648faf3de..5ecc6a5c6821cd1364e2e34252bc49f4e7a5c467 100644
--- a/test/CodeGen/PowerPC/toc-float.ll
+++ b/test/CodeGen/PowerPC/toc-float.ll
@@ -1,14 +1,15 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 <%s | FileCheck -check-prefix=CHECK-P9 %s
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 <%s | FileCheck -check-prefix=CHECK-P8 %s
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 <%s | FileCheck -check-prefix=CHECK-P9 %s
+; RUN: llc -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 <%s | FileCheck -check-prefix=CHECK-P8 %s
 
 ; As the constant could be represented as float, a float is
 ; loaded from constant pool.
 define double @doubleConstant1() {
   ret double 1.400000e+01
 
-; CHECK-LABEL: doubleConstant1:
+; CHECK-P9-LABEL: doubleConstant1:
 ; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P9: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P8-LABEL: doubleConstant1:
 ; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P8: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
 }
@@ -18,9 +19,10 @@ define double @doubleConstant1() {
 define double @doubleConstant2() {
   ret double 2.408904e+01
 
-; CHECK-LABEL: doubleConstant2:
+; CHECK-P9-LABEL: doubleConstant2:
 ; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P9: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P8-LABEL: doubleConstant2:
 ; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P8: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
 }
@@ -32,9 +34,10 @@ define float @floatConstantArray() local_unnamed_addr  {
   %2 = fadd float %1, 0x400B333340000000
   ret float %2
 
-; CHECK-LABEL: floatConstantArray 
+; CHECK-P9-LABEL: floatConstantArray 
 ; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha+[[REG2:[0-9]+]]
 ; CHECK-P9: lfs {{[0-9]+}}, [[VAR]]@toc@l+[[REG2]]([[REG1]])
+; CHECK-P8-LABEL: floatConstantArray 
 ; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P8: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
 ; CHECK-P8: lfs {{[0-9]+}}, 12([[REG2]])
@@ -43,9 +46,10 @@ define float @floatConstantArray() local_unnamed_addr  {
 define float @floatConstant() {
   ret float 0x400470A3E0000000
 
-; CHECK-LABEL: floatConstant:
+; CHECK-P9-LABEL: floatConstant:
 ; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P9: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P8-LABEL: floatConstant:
 ; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P8: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
 }
@@ -59,9 +63,10 @@ define double @doubleConstantArray()  {
   %2 = fadd double %1, 6.880000e+00
   ret double %2
 
-; CHECK-LABEL: doubleConstantArray
+; CHECK-P9-LABEL: doubleConstantArray
 ; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha+[[REG2:[0-9]+]]
 ; CHECK-P9: lfd {{[0-9]+}}, [[VAR]]@toc@l+[[REG2]]([[REG1]])
+; CHECK-P8-LABEL: doubleConstantArray
 ; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P8: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
 ; CHECK-P8: lfd {{[0-9]+}}, 24([[REG2]])
@@ -75,12 +80,13 @@ define double @doubleLargeConstantArray()  {
   ret double %2
 
 ; Access an element with an offset that doesn't fit in the displacement field of LFD. 
-; CHECK-LABEL: doubleLargeConstantArray
+; CHECK-P9-LABEL: doubleLargeConstantArray
 ; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P9: li [[REG2:[0-9]+]], 0 
 ; CHECK-P9: addi [[REG3:[0-9]+]], [[REG1]], [[VAR:[a-z0-9A-Z_.]+]]@toc@l
 ; CHECK-P9: ori [[REG4:[0-9]+]], [[REG2]], 32768 
 ; CHECK-P9: lfdx {{[0-9]+}}, [[REG3]], [[REG4]] 
+; CHECK-P8-LABEL: doubleLargeConstantArray
 ; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P8: li [[REG2:[0-9]+]], 0 
 ; CHECK-P8: addi [[REG3:[0-9]+]], [[REG1]], [[VAR:[a-z0-9A-Z_.]+]]@toc@l
@@ -95,10 +101,11 @@ entry:
   %0 = load <4 x i32>, <4 x i32>* getelementptr inbounds ([10 x <4 x i32>], [10 x <4 x i32>]* @vec_arr, i64 0, i64 2), align 16
   ret <4 x i32> %0
 
-; CHECK-LABEL: vectorArray
+; CHECK-P9-LABEL: vectorArray
 ; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P9: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
 ; CHECK-P9: lxv {{[0-9]+}}, 32([[REG2]])
+; CHECK-P8-LABEL: vectorArray
 ; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
 ; CHECK-P8: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
 ; CHECK-P8: addi [[REG3:[0-9]+]], [[REG2]], 32
diff --git a/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll b/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9977410f9979366eac61b476cbac900d0b52010f
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll
@@ -0,0 +1,1474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i32 @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vextuwrx r3, r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xxsldwi vs1, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    mfvsrwz r3, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    mfvsrwz r4, f1
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    li r3, 0
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    vmrghh v2, v2, v3
+; CHECK-BE-NEXT:    vextuwlx r3, r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x float>
+  %1 = fptoui <2 x float> %0 to <2 x i16>
+  %2 = bitcast <2 x i16> %1 to i32
+  ret i32 %2
+}
+
+define i64 @test4elt(<4 x float> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    mtvsrd f3, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs2
+; CHECK-P8-NEXT:    xxswapd v5, vs3
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglh v3, v4, v5
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P9-NEXT:    xxswapd vs1, v2
+; CHECK-P9-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, v2
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    mfvsrwz r5, f3
+; CHECK-P9-NEXT:    mfvsrwz r3, f0
+; CHECK-P9-NEXT:    mfvsrwz r4, f1
+; CHECK-P9-NEXT:    mfvsrwz r6, f2
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglh v3, v4, v5
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-BE-NEXT:    xxswapd vs1, v2
+; CHECK-BE-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-BE-NEXT:    xscvspdpn f3, v2
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    mfvsrwz r5, f3
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    mfvsrwz r3, f0
+; CHECK-BE-NEXT:    mfvsrwz r4, f1
+; CHECK-BE-NEXT:    mfvsrwz r6, f2
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v4, v5
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <4 x float> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to i64
+  ret i64 %1
+}
+
+define <8 x i16> @test8elt(<8 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lvx v5, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xxsldwi vs2, v5, v5, 3
+; CHECK-P8-NEXT:    xscvspdpn f4, v5
+; CHECK-P8-NEXT:    xxswapd vs3, v5
+; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    mfvsrwz r6, f1
+; CHECK-P8-NEXT:    mfvsrwz r5, f0
+; CHECK-P8-NEXT:    mtvsrd f1, r6
+; CHECK-P8-NEXT:    mtvsrd f0, r5
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 1
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, v2
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v1, vs4
+; CHECK-P8-NEXT:    vmrglh v2, v4, v3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    xxswapd v5, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v6, vs3
+; CHECK-P8-NEXT:    xxswapd v0, vs0
+; CHECK-P8-NEXT:    vmrglh v3, v3, v4
+; CHECK-P8-NEXT:    vmrglh v4, v0, v5
+; CHECK-P8-NEXT:    vmrglh v5, v1, v6
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs3, vs1
+; CHECK-P9-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd vs6, vs0
+; CHECK-P9-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    mfvsrwz r5, f1
+; CHECK-P9-NEXT:    mfvsrwz r9, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f2
+; CHECK-P9-NEXT:    mfvsrwz r4, f3
+; CHECK-P9-NEXT:    mfvsrwz r6, f4
+; CHECK-P9-NEXT:    mfvsrwz r7, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r10, f7
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v6, vs6
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    xxswapd v0, vs4
+; CHECK-P9-NEXT:    xxswapd v1, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs7
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglh v3, v4, v5
+; CHECK-P9-NEXT:    vmrglh v4, v1, v0
+; CHECK-P9-NEXT:    vmrglh v5, v6, v7
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    vmrglw v3, v5, v4
+; CHECK-P9-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs3, vs1
+; CHECK-BE-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd vs6, vs0
+; CHECK-BE-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    mfvsrwz r5, f1
+; CHECK-BE-NEXT:    mfvsrwz r9, f0
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    mfvsrwz r3, f2
+; CHECK-BE-NEXT:    mfvsrwz r4, f3
+; CHECK-BE-NEXT:    mfvsrwz r6, f4
+; CHECK-BE-NEXT:    mfvsrwz r7, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r10, f7
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v4, v5
+; CHECK-BE-NEXT:    vmrghh v4, v1, v0
+; CHECK-BE-NEXT:    vmrghh v5, v6, v7
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x float>, <8 x float>* %0, align 32
+  %1 = fptoui <8 x float> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define void @test16elt(<16 x i16>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lvx v5, 0, r4
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v2, r4, r6
+; CHECK-P8-NEXT:    lvx v3, r4, r5
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    xscvspdpn f0, v5
+; CHECK-P8-NEXT:    xxsldwi vs1, v5, v5, 3
+; CHECK-P8-NEXT:    lvx v4, r4, r6
+; CHECK-P8-NEXT:    xscvspdpn f4, v2
+; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
+; CHECK-P8-NEXT:    xscvspdpn f2, v3
+; CHECK-P8-NEXT:    xxswapd vs3, v5
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xxswapd vs8, v3
+; CHECK-P8-NEXT:    xscvspdpn f6, v4
+; CHECK-P8-NEXT:    xxsldwi vs7, v3, v3, 3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xxsldwi vs10, v2, v2, 3
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xxsldwi vs9, v3, v3, 1
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xxsldwi vs12, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f8, vs8
+; CHECK-P8-NEXT:    xxswapd vs11, v2
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xxswapd v2, v4
+; CHECK-P8-NEXT:    xscvspdpn f7, vs7
+; CHECK-P8-NEXT:    xxsldwi vs13, v4, v4, 3
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xxsldwi v3, v4, v4, 1
+; CHECK-P8-NEXT:    xscvspdpn f10, vs10
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvspdpn f9, vs9
+; CHECK-P8-NEXT:    xscvdpsxws f6, f6
+; CHECK-P8-NEXT:    xscvspdpn f12, vs12
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvspdpn f11, vs11
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvspdpn v2, v2
+; CHECK-P8-NEXT:    xscvdpsxws f8, f8
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f7, f7
+; CHECK-P8-NEXT:    mfvsrwz r6, f2
+; CHECK-P8-NEXT:    xscvspdpn f13, vs13
+; CHECK-P8-NEXT:    xscvspdpn v3, v3
+; CHECK-P8-NEXT:    xscvdpsxws f10, f10
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xscvdpsxws f9, f9
+; CHECK-P8-NEXT:    mtvsrd f2, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f6
+; CHECK-P8-NEXT:    xscvdpsxws f12, f12
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    xscvdpsxws f11, f11
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f6, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f3
+; CHECK-P8-NEXT:    xscvdpsxws v2, v2
+; CHECK-P8-NEXT:    xxswapd v9, vs6
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f8
+; CHECK-P8-NEXT:    mtvsrd f3, r6
+; CHECK-P8-NEXT:    xxswapd v0, vs5
+; CHECK-P8-NEXT:    mfvsrwz r6, f7
+; CHECK-P8-NEXT:    xscvdpsxws f13, f13
+; CHECK-P8-NEXT:    xxswapd v5, vs3
+; CHECK-P8-NEXT:    xscvdpsxws v3, v3
+; CHECK-P8-NEXT:    mtvsrd f8, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f10
+; CHECK-P8-NEXT:    mtvsrd f7, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f9
+; CHECK-P8-NEXT:    mtvsrd f10, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f12
+; CHECK-P8-NEXT:    mtvsrd f9, r6
+; CHECK-P8-NEXT:    xxswapd v6, vs10
+; CHECK-P8-NEXT:    mfvsrwz r6, f11
+; CHECK-P8-NEXT:    mtvsrd f12, r4
+; CHECK-P8-NEXT:    xxswapd v1, vs9
+; CHECK-P8-NEXT:    mfvsrwz r4, v2
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    mtvsrd f11, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f13
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    xxswapd v7, vs11
+; CHECK-P8-NEXT:    mfvsrwz r4, v3
+; CHECK-P8-NEXT:    vmrglh v3, v5, v4
+; CHECK-P8-NEXT:    xxswapd v4, vs7
+; CHECK-P8-NEXT:    vmrglh v2, v2, v0
+; CHECK-P8-NEXT:    xxswapd v5, vs8
+; CHECK-P8-NEXT:    xxswapd v0, vs2
+; CHECK-P8-NEXT:    mtvsrd f13, r6
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    vmrglh v4, v5, v4
+; CHECK-P8-NEXT:    vmrglh v5, v0, v1
+; CHECK-P8-NEXT:    xxswapd v1, vs4
+; CHECK-P8-NEXT:    vmrglh v0, v7, v6
+; CHECK-P8-NEXT:    xxswapd v6, vs12
+; CHECK-P8-NEXT:    xxswapd v7, vs13
+; CHECK-P8-NEXT:    xxswapd v10, vs1
+; CHECK-P8-NEXT:    vmrglw v2, v2, v3
+; CHECK-P8-NEXT:    vmrglh v1, v1, v6
+; CHECK-P8-NEXT:    vmrglh v6, v8, v7
+; CHECK-P8-NEXT:    vmrglh v7, v9, v10
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v4, v1, v0
+; CHECK-P8-NEXT:    vmrglw v5, v7, v6
+; CHECK-P8-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P8-NEXT:    stvx v2, 0, r3
+; CHECK-P8-NEXT:    xxmrgld v3, v5, v4
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxsldwi vs4, vs3, vs3, 3
+; CHECK-P9-NEXT:    xxswapd vs5, vs3
+; CHECK-P9-NEXT:    xxsldwi vs6, vs3, vs3, 1
+; CHECK-P9-NEXT:    xxsldwi vs7, vs2, vs2, 3
+; CHECK-P9-NEXT:    xxswapd vs8, vs2
+; CHECK-P9-NEXT:    xxsldwi vs9, vs2, vs2, 1
+; CHECK-P9-NEXT:    xxsldwi vs10, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs11, vs1
+; CHECK-P9-NEXT:    xxsldwi vs12, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs13, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxsldwi v3, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvspdpn f8, vs8
+; CHECK-P9-NEXT:    xscvspdpn f9, vs9
+; CHECK-P9-NEXT:    xscvspdpn f10, vs10
+; CHECK-P9-NEXT:    xscvspdpn f11, vs11
+; CHECK-P9-NEXT:    xscvspdpn f12, vs12
+; CHECK-P9-NEXT:    xscvspdpn f13, vs13
+; CHECK-P9-NEXT:    xscvspdpn v2, v2
+; CHECK-P9-NEXT:    xscvspdpn v3, v3
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    xscvdpsxws f8, f8
+; CHECK-P9-NEXT:    xscvdpsxws f9, f9
+; CHECK-P9-NEXT:    xscvdpsxws f10, f10
+; CHECK-P9-NEXT:    xscvdpsxws f11, f11
+; CHECK-P9-NEXT:    xscvdpsxws f12, f12
+; CHECK-P9-NEXT:    xscvdpsxws f13, f13
+; CHECK-P9-NEXT:    xscvdpsxws v2, v2
+; CHECK-P9-NEXT:    xscvdpsxws v3, v3
+; CHECK-P9-NEXT:    mfvsrwz r4, f3
+; CHECK-P9-NEXT:    mfvsrwz r5, f2
+; CHECK-P9-NEXT:    mfvsrwz r12, f1
+; CHECK-P9-NEXT:    mfvsrwz r0, f0
+; CHECK-P9-NEXT:    mfvsrwz r6, f4
+; CHECK-P9-NEXT:    mfvsrwz r7, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r9, f7
+; CHECK-P9-NEXT:    mfvsrwz r10, f8
+; CHECK-P9-NEXT:    mfvsrwz r11, f9
+; CHECK-P9-NEXT:    mfvsrwz r30, f10
+; CHECK-P9-NEXT:    mfvsrwz r29, f11
+; CHECK-P9-NEXT:    mfvsrwz r28, f12
+; CHECK-P9-NEXT:    mfvsrwz r27, f13
+; CHECK-P9-NEXT:    mfvsrwz r26, v2
+; CHECK-P9-NEXT:    mfvsrwz r25, v3
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    mtvsrd f1, r5
+; CHECK-P9-NEXT:    mtvsrd f8, r12
+; CHECK-P9-NEXT:    mtvsrd f9, r0
+; CHECK-P9-NEXT:    mtvsrd f2, r6
+; CHECK-P9-NEXT:    mtvsrd f3, r7
+; CHECK-P9-NEXT:    mtvsrd f4, r8
+; CHECK-P9-NEXT:    mtvsrd f5, r9
+; CHECK-P9-NEXT:    mtvsrd f6, r10
+; CHECK-P9-NEXT:    mtvsrd f7, r11
+; CHECK-P9-NEXT:    mtvsrd f10, r30
+; CHECK-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f11, r29
+; CHECK-P9-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f12, r28
+; CHECK-P9-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f13, r27
+; CHECK-P9-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd v2, r26
+; CHECK-P9-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd v3, r25
+; CHECK-P9-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    xxswapd v5, vs2
+; CHECK-P9-NEXT:    xxswapd v0, vs3
+; CHECK-P9-NEXT:    xxswapd v1, vs4
+; CHECK-P9-NEXT:    xxswapd v6, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs6
+; CHECK-P9-NEXT:    xxswapd v8, vs1
+; CHECK-P9-NEXT:    xxswapd v9, vs7
+; CHECK-P9-NEXT:    xxswapd v10, vs10
+; CHECK-P9-NEXT:    xxswapd v11, vs11
+; CHECK-P9-NEXT:    xxswapd v12, vs8
+; CHECK-P9-NEXT:    xxswapd v13, vs12
+; CHECK-P9-NEXT:    xxswapd v14, vs13
+; CHECK-P9-NEXT:    xxswapd v2, v2
+; CHECK-P9-NEXT:    xxswapd v15, vs9
+; CHECK-P9-NEXT:    xxswapd v3, v3
+; CHECK-P9-NEXT:    vmrglh v5, v0, v5
+; CHECK-P9-NEXT:    vmrglh v4, v4, v1
+; CHECK-P9-NEXT:    vmrglh v0, v7, v6
+; CHECK-P9-NEXT:    vmrglh v1, v8, v9
+; CHECK-P9-NEXT:    vmrglh v6, v11, v10
+; CHECK-P9-NEXT:    vmrglh v7, v12, v13
+; CHECK-P9-NEXT:    vmrglh v2, v2, v14
+; CHECK-P9-NEXT:    vmrglh v3, v15, v3
+; CHECK-P9-NEXT:    vmrglw v4, v4, v5
+; CHECK-P9-NEXT:    vmrglw v5, v1, v0
+; CHECK-P9-NEXT:    vmrglw v0, v7, v6
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    xxmrgld vs0, v5, v4
+; CHECK-P9-NEXT:    xxmrgld vs1, v2, v0
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs3, 16(r4)
+; CHECK-BE-NEXT:    lxv vs0, 32(r4)
+; CHECK-BE-NEXT:    lxv vs1, 48(r4)
+; CHECK-BE-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxsldwi vs4, vs3, vs3, 3
+; CHECK-BE-NEXT:    xxswapd vs5, vs3
+; CHECK-BE-NEXT:    xxsldwi vs6, vs3, vs3, 1
+; CHECK-BE-NEXT:    xxsldwi vs7, vs2, vs2, 3
+; CHECK-BE-NEXT:    xxswapd vs8, vs2
+; CHECK-BE-NEXT:    xxsldwi vs9, vs2, vs2, 1
+; CHECK-BE-NEXT:    xxsldwi vs10, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs11, vs1
+; CHECK-BE-NEXT:    xxsldwi vs12, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs13, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd v2, vs0
+; CHECK-BE-NEXT:    xxsldwi v3, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvspdpn f8, vs8
+; CHECK-BE-NEXT:    xscvspdpn f9, vs9
+; CHECK-BE-NEXT:    xscvspdpn f10, vs10
+; CHECK-BE-NEXT:    xscvspdpn f11, vs11
+; CHECK-BE-NEXT:    xscvspdpn f12, vs12
+; CHECK-BE-NEXT:    xscvspdpn f13, vs13
+; CHECK-BE-NEXT:    xscvspdpn v2, v2
+; CHECK-BE-NEXT:    xscvspdpn v3, v3
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    xscvdpsxws f8, f8
+; CHECK-BE-NEXT:    xscvdpsxws f9, f9
+; CHECK-BE-NEXT:    xscvdpsxws f10, f10
+; CHECK-BE-NEXT:    xscvdpsxws f11, f11
+; CHECK-BE-NEXT:    xscvdpsxws f12, f12
+; CHECK-BE-NEXT:    xscvdpsxws f13, f13
+; CHECK-BE-NEXT:    xscvdpsxws v2, v2
+; CHECK-BE-NEXT:    xscvdpsxws v3, v3
+; CHECK-BE-NEXT:    mfvsrwz r4, f3
+; CHECK-BE-NEXT:    mfvsrwz r5, f2
+; CHECK-BE-NEXT:    mfvsrwz r12, f1
+; CHECK-BE-NEXT:    mfvsrwz r0, f0
+; CHECK-BE-NEXT:    mfvsrwz r6, f4
+; CHECK-BE-NEXT:    mfvsrwz r7, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r9, f7
+; CHECK-BE-NEXT:    mfvsrwz r10, f8
+; CHECK-BE-NEXT:    mfvsrwz r11, f9
+; CHECK-BE-NEXT:    mfvsrwz r30, f10
+; CHECK-BE-NEXT:    mfvsrwz r29, f11
+; CHECK-BE-NEXT:    mfvsrwz r28, f12
+; CHECK-BE-NEXT:    mfvsrwz r27, f13
+; CHECK-BE-NEXT:    mfvsrwz r26, v2
+; CHECK-BE-NEXT:    mfvsrwz r25, v3
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r12, r12, 48
+; CHECK-BE-NEXT:    sldi r0, r0, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    sldi r11, r11, 48
+; CHECK-BE-NEXT:    sldi r30, r30, 48
+; CHECK-BE-NEXT:    sldi r29, r29, 48
+; CHECK-BE-NEXT:    sldi r28, r28, 48
+; CHECK-BE-NEXT:    sldi r27, r27, 48
+; CHECK-BE-NEXT:    sldi r26, r26, 48
+; CHECK-BE-NEXT:    sldi r25, r25, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r4
+; CHECK-BE-NEXT:    mtvsrd v3, r5
+; CHECK-BE-NEXT:    mtvsrd v10, r12
+; CHECK-BE-NEXT:    mtvsrd v14, r0
+; CHECK-BE-NEXT:    mtvsrd v4, r6
+; CHECK-BE-NEXT:    mtvsrd v5, r7
+; CHECK-BE-NEXT:    mtvsrd v0, r8
+; CHECK-BE-NEXT:    mtvsrd v1, r9
+; CHECK-BE-NEXT:    mtvsrd v6, r10
+; CHECK-BE-NEXT:    mtvsrd v7, r11
+; CHECK-BE-NEXT:    mtvsrd v8, r30
+; CHECK-BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v9, r29
+; CHECK-BE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v11, r28
+; CHECK-BE-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v12, r27
+; CHECK-BE-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v13, r26
+; CHECK-BE-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v15, r25
+; CHECK-BE-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    vmrghh v4, v5, v4
+; CHECK-BE-NEXT:    vmrghh v2, v2, v0
+; CHECK-BE-NEXT:    vmrghh v5, v6, v1
+; CHECK-BE-NEXT:    vmrghh v3, v3, v7
+; CHECK-BE-NEXT:    vmrghh v0, v9, v8
+; CHECK-BE-NEXT:    vmrghh v1, v10, v11
+; CHECK-BE-NEXT:    vmrghh v6, v13, v12
+; CHECK-BE-NEXT:    vmrghh v7, v14, v15
+; CHECK-BE-NEXT:    vmrghw v2, v2, v4
+; CHECK-BE-NEXT:    vmrghw v3, v3, v5
+; CHECK-BE-NEXT:    vmrghw v4, v1, v0
+; CHECK-BE-NEXT:    vmrghw v5, v7, v6
+; CHECK-BE-NEXT:    xxmrghd vs0, v3, v2
+; CHECK-BE-NEXT:    xxmrghd vs1, v5, v4
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x float>, <16 x float>* %0, align 64
+  %1 = fptoui <16 x float> %a to <16 x i16>
+  store <16 x i16> %1, <16 x i16>* %agg.result, align 32
+  ret void
+}
+
+define i32 @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vextuwrx r3, r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xxsldwi vs1, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    mfvsrwz r3, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    mfvsrwz r4, f1
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    li r3, 0
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    vmrghh v2, v2, v3
+; CHECK-BE-NEXT:    vextuwlx r3, r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x float>
+  %1 = fptosi <2 x float> %0 to <2 x i16>
+  %2 = bitcast <2 x i16> %1 to i32
+  ret i32 %2
+}
+
+define i64 @test4elt_signed(<4 x float> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    mtvsrd f3, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs2
+; CHECK-P8-NEXT:    xxswapd v5, vs3
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglh v3, v4, v5
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P9-NEXT:    xxswapd vs1, v2
+; CHECK-P9-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, v2
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    mfvsrwz r5, f3
+; CHECK-P9-NEXT:    mfvsrwz r3, f0
+; CHECK-P9-NEXT:    mfvsrwz r4, f1
+; CHECK-P9-NEXT:    mfvsrwz r6, f2
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglh v3, v4, v5
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-BE-NEXT:    xxswapd vs1, v2
+; CHECK-BE-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-BE-NEXT:    xscvspdpn f3, v2
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    mfvsrwz r5, f3
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    mfvsrwz r3, f0
+; CHECK-BE-NEXT:    mfvsrwz r4, f1
+; CHECK-BE-NEXT:    mfvsrwz r6, f2
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v4, v5
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptosi <4 x float> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to i64
+  ret i64 %1
+}
+
+define <8 x i16> @test8elt_signed(<8 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lvx v5, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xxsldwi vs2, v5, v5, 3
+; CHECK-P8-NEXT:    xscvspdpn f4, v5
+; CHECK-P8-NEXT:    xxswapd vs3, v5
+; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    mfvsrwz r6, f1
+; CHECK-P8-NEXT:    mfvsrwz r5, f0
+; CHECK-P8-NEXT:    mtvsrd f1, r6
+; CHECK-P8-NEXT:    mtvsrd f0, r5
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 1
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, v2
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v1, vs4
+; CHECK-P8-NEXT:    vmrglh v2, v4, v3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    xxswapd v5, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v6, vs3
+; CHECK-P8-NEXT:    xxswapd v0, vs0
+; CHECK-P8-NEXT:    vmrglh v3, v3, v4
+; CHECK-P8-NEXT:    vmrglh v4, v0, v5
+; CHECK-P8-NEXT:    vmrglh v5, v1, v6
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs3, vs1
+; CHECK-P9-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd vs6, vs0
+; CHECK-P9-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    mfvsrwz r5, f1
+; CHECK-P9-NEXT:    mfvsrwz r9, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f2
+; CHECK-P9-NEXT:    mfvsrwz r4, f3
+; CHECK-P9-NEXT:    mfvsrwz r6, f4
+; CHECK-P9-NEXT:    mfvsrwz r7, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r10, f7
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v6, vs6
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    xxswapd v0, vs4
+; CHECK-P9-NEXT:    xxswapd v1, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs7
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglh v3, v4, v5
+; CHECK-P9-NEXT:    vmrglh v4, v1, v0
+; CHECK-P9-NEXT:    vmrglh v5, v6, v7
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    vmrglw v3, v5, v4
+; CHECK-P9-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs3, vs1
+; CHECK-BE-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd vs6, vs0
+; CHECK-BE-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    mfvsrwz r5, f1
+; CHECK-BE-NEXT:    mfvsrwz r9, f0
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    mfvsrwz r3, f2
+; CHECK-BE-NEXT:    mfvsrwz r4, f3
+; CHECK-BE-NEXT:    mfvsrwz r6, f4
+; CHECK-BE-NEXT:    mfvsrwz r7, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r10, f7
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v4, v5
+; CHECK-BE-NEXT:    vmrghh v4, v1, v0
+; CHECK-BE-NEXT:    vmrghh v5, v6, v7
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x float>, <8 x float>* %0, align 32
+  %1 = fptosi <8 x float> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define void @test16elt_signed(<16 x i16>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lvx v5, 0, r4
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v2, r4, r6
+; CHECK-P8-NEXT:    lvx v3, r4, r5
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    xscvspdpn f0, v5
+; CHECK-P8-NEXT:    xxsldwi vs1, v5, v5, 3
+; CHECK-P8-NEXT:    lvx v4, r4, r6
+; CHECK-P8-NEXT:    xscvspdpn f4, v2
+; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
+; CHECK-P8-NEXT:    xscvspdpn f2, v3
+; CHECK-P8-NEXT:    xxswapd vs3, v5
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xxswapd vs8, v3
+; CHECK-P8-NEXT:    xscvspdpn f6, v4
+; CHECK-P8-NEXT:    xxsldwi vs7, v3, v3, 3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xxsldwi vs10, v2, v2, 3
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xxsldwi vs9, v3, v3, 1
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xxsldwi vs12, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f8, vs8
+; CHECK-P8-NEXT:    xxswapd vs11, v2
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xxswapd v2, v4
+; CHECK-P8-NEXT:    xscvspdpn f7, vs7
+; CHECK-P8-NEXT:    xxsldwi vs13, v4, v4, 3
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xxsldwi v3, v4, v4, 1
+; CHECK-P8-NEXT:    xscvspdpn f10, vs10
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvspdpn f9, vs9
+; CHECK-P8-NEXT:    xscvdpsxws f6, f6
+; CHECK-P8-NEXT:    xscvspdpn f12, vs12
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvspdpn f11, vs11
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvspdpn v2, v2
+; CHECK-P8-NEXT:    xscvdpsxws f8, f8
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f7, f7
+; CHECK-P8-NEXT:    mfvsrwz r6, f2
+; CHECK-P8-NEXT:    xscvspdpn f13, vs13
+; CHECK-P8-NEXT:    xscvspdpn v3, v3
+; CHECK-P8-NEXT:    xscvdpsxws f10, f10
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xscvdpsxws f9, f9
+; CHECK-P8-NEXT:    mtvsrd f2, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f6
+; CHECK-P8-NEXT:    xscvdpsxws f12, f12
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    xscvdpsxws f11, f11
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f6, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f3
+; CHECK-P8-NEXT:    xscvdpsxws v2, v2
+; CHECK-P8-NEXT:    xxswapd v9, vs6
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f8
+; CHECK-P8-NEXT:    mtvsrd f3, r6
+; CHECK-P8-NEXT:    xxswapd v0, vs5
+; CHECK-P8-NEXT:    mfvsrwz r6, f7
+; CHECK-P8-NEXT:    xscvdpsxws f13, f13
+; CHECK-P8-NEXT:    xxswapd v5, vs3
+; CHECK-P8-NEXT:    xscvdpsxws v3, v3
+; CHECK-P8-NEXT:    mtvsrd f8, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f10
+; CHECK-P8-NEXT:    mtvsrd f7, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f9
+; CHECK-P8-NEXT:    mtvsrd f10, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f12
+; CHECK-P8-NEXT:    mtvsrd f9, r6
+; CHECK-P8-NEXT:    xxswapd v6, vs10
+; CHECK-P8-NEXT:    mfvsrwz r6, f11
+; CHECK-P8-NEXT:    mtvsrd f12, r4
+; CHECK-P8-NEXT:    xxswapd v1, vs9
+; CHECK-P8-NEXT:    mfvsrwz r4, v2
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    mtvsrd f11, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f13
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    xxswapd v7, vs11
+; CHECK-P8-NEXT:    mfvsrwz r4, v3
+; CHECK-P8-NEXT:    vmrglh v3, v5, v4
+; CHECK-P8-NEXT:    xxswapd v4, vs7
+; CHECK-P8-NEXT:    vmrglh v2, v2, v0
+; CHECK-P8-NEXT:    xxswapd v5, vs8
+; CHECK-P8-NEXT:    xxswapd v0, vs2
+; CHECK-P8-NEXT:    mtvsrd f13, r6
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    vmrglh v4, v5, v4
+; CHECK-P8-NEXT:    vmrglh v5, v0, v1
+; CHECK-P8-NEXT:    xxswapd v1, vs4
+; CHECK-P8-NEXT:    vmrglh v0, v7, v6
+; CHECK-P8-NEXT:    xxswapd v6, vs12
+; CHECK-P8-NEXT:    xxswapd v7, vs13
+; CHECK-P8-NEXT:    xxswapd v10, vs1
+; CHECK-P8-NEXT:    vmrglw v2, v2, v3
+; CHECK-P8-NEXT:    vmrglh v1, v1, v6
+; CHECK-P8-NEXT:    vmrglh v6, v8, v7
+; CHECK-P8-NEXT:    vmrglh v7, v9, v10
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v4, v1, v0
+; CHECK-P8-NEXT:    vmrglw v5, v7, v6
+; CHECK-P8-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P8-NEXT:    stvx v2, 0, r3
+; CHECK-P8-NEXT:    xxmrgld v3, v5, v4
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxsldwi vs4, vs3, vs3, 3
+; CHECK-P9-NEXT:    xxswapd vs5, vs3
+; CHECK-P9-NEXT:    xxsldwi vs6, vs3, vs3, 1
+; CHECK-P9-NEXT:    xxsldwi vs7, vs2, vs2, 3
+; CHECK-P9-NEXT:    xxswapd vs8, vs2
+; CHECK-P9-NEXT:    xxsldwi vs9, vs2, vs2, 1
+; CHECK-P9-NEXT:    xxsldwi vs10, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs11, vs1
+; CHECK-P9-NEXT:    xxsldwi vs12, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs13, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxsldwi v3, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvspdpn f8, vs8
+; CHECK-P9-NEXT:    xscvspdpn f9, vs9
+; CHECK-P9-NEXT:    xscvspdpn f10, vs10
+; CHECK-P9-NEXT:    xscvspdpn f11, vs11
+; CHECK-P9-NEXT:    xscvspdpn f12, vs12
+; CHECK-P9-NEXT:    xscvspdpn f13, vs13
+; CHECK-P9-NEXT:    xscvspdpn v2, v2
+; CHECK-P9-NEXT:    xscvspdpn v3, v3
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    xscvdpsxws f8, f8
+; CHECK-P9-NEXT:    xscvdpsxws f9, f9
+; CHECK-P9-NEXT:    xscvdpsxws f10, f10
+; CHECK-P9-NEXT:    xscvdpsxws f11, f11
+; CHECK-P9-NEXT:    xscvdpsxws f12, f12
+; CHECK-P9-NEXT:    xscvdpsxws f13, f13
+; CHECK-P9-NEXT:    xscvdpsxws v2, v2
+; CHECK-P9-NEXT:    xscvdpsxws v3, v3
+; CHECK-P9-NEXT:    mfvsrwz r4, f3
+; CHECK-P9-NEXT:    mfvsrwz r5, f2
+; CHECK-P9-NEXT:    mfvsrwz r12, f1
+; CHECK-P9-NEXT:    mfvsrwz r0, f0
+; CHECK-P9-NEXT:    mfvsrwz r6, f4
+; CHECK-P9-NEXT:    mfvsrwz r7, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r9, f7
+; CHECK-P9-NEXT:    mfvsrwz r10, f8
+; CHECK-P9-NEXT:    mfvsrwz r11, f9
+; CHECK-P9-NEXT:    mfvsrwz r30, f10
+; CHECK-P9-NEXT:    mfvsrwz r29, f11
+; CHECK-P9-NEXT:    mfvsrwz r28, f12
+; CHECK-P9-NEXT:    mfvsrwz r27, f13
+; CHECK-P9-NEXT:    mfvsrwz r26, v2
+; CHECK-P9-NEXT:    mfvsrwz r25, v3
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    mtvsrd f1, r5
+; CHECK-P9-NEXT:    mtvsrd f8, r12
+; CHECK-P9-NEXT:    mtvsrd f9, r0
+; CHECK-P9-NEXT:    mtvsrd f2, r6
+; CHECK-P9-NEXT:    mtvsrd f3, r7
+; CHECK-P9-NEXT:    mtvsrd f4, r8
+; CHECK-P9-NEXT:    mtvsrd f5, r9
+; CHECK-P9-NEXT:    mtvsrd f6, r10
+; CHECK-P9-NEXT:    mtvsrd f7, r11
+; CHECK-P9-NEXT:    mtvsrd f10, r30
+; CHECK-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f11, r29
+; CHECK-P9-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f12, r28
+; CHECK-P9-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f13, r27
+; CHECK-P9-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd v2, r26
+; CHECK-P9-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd v3, r25
+; CHECK-P9-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    xxswapd v5, vs2
+; CHECK-P9-NEXT:    xxswapd v0, vs3
+; CHECK-P9-NEXT:    xxswapd v1, vs4
+; CHECK-P9-NEXT:    xxswapd v6, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs6
+; CHECK-P9-NEXT:    xxswapd v8, vs1
+; CHECK-P9-NEXT:    xxswapd v9, vs7
+; CHECK-P9-NEXT:    xxswapd v10, vs10
+; CHECK-P9-NEXT:    xxswapd v11, vs11
+; CHECK-P9-NEXT:    xxswapd v12, vs8
+; CHECK-P9-NEXT:    xxswapd v13, vs12
+; CHECK-P9-NEXT:    xxswapd v14, vs13
+; CHECK-P9-NEXT:    xxswapd v2, v2
+; CHECK-P9-NEXT:    xxswapd v15, vs9
+; CHECK-P9-NEXT:    xxswapd v3, v3
+; CHECK-P9-NEXT:    vmrglh v5, v0, v5
+; CHECK-P9-NEXT:    vmrglh v4, v4, v1
+; CHECK-P9-NEXT:    vmrglh v0, v7, v6
+; CHECK-P9-NEXT:    vmrglh v1, v8, v9
+; CHECK-P9-NEXT:    vmrglh v6, v11, v10
+; CHECK-P9-NEXT:    vmrglh v7, v12, v13
+; CHECK-P9-NEXT:    vmrglh v2, v2, v14
+; CHECK-P9-NEXT:    vmrglh v3, v15, v3
+; CHECK-P9-NEXT:    vmrglw v4, v4, v5
+; CHECK-P9-NEXT:    vmrglw v5, v1, v0
+; CHECK-P9-NEXT:    vmrglw v0, v7, v6
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    xxmrgld vs0, v5, v4
+; CHECK-P9-NEXT:    xxmrgld vs1, v2, v0
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs3, 16(r4)
+; CHECK-BE-NEXT:    lxv vs0, 32(r4)
+; CHECK-BE-NEXT:    lxv vs1, 48(r4)
+; CHECK-BE-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxsldwi vs4, vs3, vs3, 3
+; CHECK-BE-NEXT:    xxswapd vs5, vs3
+; CHECK-BE-NEXT:    xxsldwi vs6, vs3, vs3, 1
+; CHECK-BE-NEXT:    xxsldwi vs7, vs2, vs2, 3
+; CHECK-BE-NEXT:    xxswapd vs8, vs2
+; CHECK-BE-NEXT:    xxsldwi vs9, vs2, vs2, 1
+; CHECK-BE-NEXT:    xxsldwi vs10, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs11, vs1
+; CHECK-BE-NEXT:    xxsldwi vs12, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs13, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd v2, vs0
+; CHECK-BE-NEXT:    xxsldwi v3, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvspdpn f8, vs8
+; CHECK-BE-NEXT:    xscvspdpn f9, vs9
+; CHECK-BE-NEXT:    xscvspdpn f10, vs10
+; CHECK-BE-NEXT:    xscvspdpn f11, vs11
+; CHECK-BE-NEXT:    xscvspdpn f12, vs12
+; CHECK-BE-NEXT:    xscvspdpn f13, vs13
+; CHECK-BE-NEXT:    xscvspdpn v2, v2
+; CHECK-BE-NEXT:    xscvspdpn v3, v3
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    xscvdpsxws f8, f8
+; CHECK-BE-NEXT:    xscvdpsxws f9, f9
+; CHECK-BE-NEXT:    xscvdpsxws f10, f10
+; CHECK-BE-NEXT:    xscvdpsxws f11, f11
+; CHECK-BE-NEXT:    xscvdpsxws f12, f12
+; CHECK-BE-NEXT:    xscvdpsxws f13, f13
+; CHECK-BE-NEXT:    xscvdpsxws v2, v2
+; CHECK-BE-NEXT:    xscvdpsxws v3, v3
+; CHECK-BE-NEXT:    mfvsrwz r4, f3
+; CHECK-BE-NEXT:    mfvsrwz r5, f2
+; CHECK-BE-NEXT:    mfvsrwz r12, f1
+; CHECK-BE-NEXT:    mfvsrwz r0, f0
+; CHECK-BE-NEXT:    mfvsrwz r6, f4
+; CHECK-BE-NEXT:    mfvsrwz r7, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r9, f7
+; CHECK-BE-NEXT:    mfvsrwz r10, f8
+; CHECK-BE-NEXT:    mfvsrwz r11, f9
+; CHECK-BE-NEXT:    mfvsrwz r30, f10
+; CHECK-BE-NEXT:    mfvsrwz r29, f11
+; CHECK-BE-NEXT:    mfvsrwz r28, f12
+; CHECK-BE-NEXT:    mfvsrwz r27, f13
+; CHECK-BE-NEXT:    mfvsrwz r26, v2
+; CHECK-BE-NEXT:    mfvsrwz r25, v3
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r12, r12, 48
+; CHECK-BE-NEXT:    sldi r0, r0, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    sldi r11, r11, 48
+; CHECK-BE-NEXT:    sldi r30, r30, 48
+; CHECK-BE-NEXT:    sldi r29, r29, 48
+; CHECK-BE-NEXT:    sldi r28, r28, 48
+; CHECK-BE-NEXT:    sldi r27, r27, 48
+; CHECK-BE-NEXT:    sldi r26, r26, 48
+; CHECK-BE-NEXT:    sldi r25, r25, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r4
+; CHECK-BE-NEXT:    mtvsrd v3, r5
+; CHECK-BE-NEXT:    mtvsrd v10, r12
+; CHECK-BE-NEXT:    mtvsrd v14, r0
+; CHECK-BE-NEXT:    mtvsrd v4, r6
+; CHECK-BE-NEXT:    mtvsrd v5, r7
+; CHECK-BE-NEXT:    mtvsrd v0, r8
+; CHECK-BE-NEXT:    mtvsrd v1, r9
+; CHECK-BE-NEXT:    mtvsrd v6, r10
+; CHECK-BE-NEXT:    mtvsrd v7, r11
+; CHECK-BE-NEXT:    mtvsrd v8, r30
+; CHECK-BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v9, r29
+; CHECK-BE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v11, r28
+; CHECK-BE-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v12, r27
+; CHECK-BE-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v13, r26
+; CHECK-BE-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v15, r25
+; CHECK-BE-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    vmrghh v4, v5, v4
+; CHECK-BE-NEXT:    vmrghh v2, v2, v0
+; CHECK-BE-NEXT:    vmrghh v5, v6, v1
+; CHECK-BE-NEXT:    vmrghh v3, v3, v7
+; CHECK-BE-NEXT:    vmrghh v0, v9, v8
+; CHECK-BE-NEXT:    vmrghh v1, v10, v11
+; CHECK-BE-NEXT:    vmrghh v6, v13, v12
+; CHECK-BE-NEXT:    vmrghh v7, v14, v15
+; CHECK-BE-NEXT:    vmrghw v2, v2, v4
+; CHECK-BE-NEXT:    vmrghw v3, v3, v5
+; CHECK-BE-NEXT:    vmrghw v4, v1, v0
+; CHECK-BE-NEXT:    vmrghw v5, v7, v6
+; CHECK-BE-NEXT:    xxmrghd vs0, v3, v2
+; CHECK-BE-NEXT:    xxmrghd vs1, v5, v4
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x float>, <16 x float>* %0, align 64
+  %1 = fptosi <16 x float> %a to <16 x i16>
+  store <16 x i16> %1, <16 x i16>* %agg.result, align 32
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll b/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3a733f0e934a000b30318eeedb39cebfe329c91f
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll
@@ -0,0 +1,846 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define <2 x i64> @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xxsldwi vs1, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x float>
+  %1 = fptoui <2 x float> %0 to <2 x i64>
+  ret <2 x i64> %1
+}
+
+define void @test4elt(<4 x i64>* noalias nocapture sret %agg.result, <4 x float> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f2, v2
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P8-NEXT:    xvcvdpuxds v3, vs1
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v3
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P9-NEXT:    xxswapd vs1, v2
+; CHECK-P9-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, v2
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsldwi vs0, v2, v2, 1
+; CHECK-BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-BE-NEXT:    xxswapd vs2, v2
+; CHECK-BE-NEXT:    xscvspdpn f3, v2
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xxmrghd vs0, vs3, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs2, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <4 x float> %a to <4 x i64>
+  store <4 x i64> %0, <4 x i64>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt(<8 x i64>* noalias nocapture sret %agg.result, <8 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xxsldwi vs5, v3, v3, 3
+; CHECK-P8-NEXT:    xxswapd vs6, v3
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xxsldwi vs7, v3, v3, 1
+; CHECK-P8-NEXT:    xscvspdpn f2, v2
+; CHECK-P8-NEXT:    xscvspdpn f4, v3
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvspdpn f6, vs6
+; CHECK-P8-NEXT:    xscvspdpn f7, vs7
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xxmrghd vs2, vs6, vs5
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs7
+; CHECK-P8-NEXT:    xvcvdpuxds v3, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v4, vs2
+; CHECK-P8-NEXT:    xvcvdpuxds v5, vs3
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v3
+; CHECK-P8-NEXT:    xxswapd vs3, v4
+; CHECK-P8-NEXT:    xxswapd vs2, v5
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs3, vs1
+; CHECK-P9-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd vs6, vs0
+; CHECK-P9-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs1, vs1, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs6, vs5
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs7
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs3, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs2, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xxsldwi vs2, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs3, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs4, vs1
+; CHECK-BE-NEXT:    xxsldwi vs5, vs0, vs0, 1
+; CHECK-BE-NEXT:    xxsldwi vs6, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd vs7, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs4, vs3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs5
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs0, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x float>, <8 x float>* %0, align 32
+  %1 = fptoui <8 x float> %a to <8 x i64>
+  store <8 x i64> %1, <8 x i64>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt(<16 x i64>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    lvx v4, 0, r4
+; CHECK-P8-NEXT:    li r8, 64
+; CHECK-P8-NEXT:    lvx v5, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r7
+; CHECK-P8-NEXT:    lvx v2, r4, r6
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    xxsldwi vs13, v4, v4, 3
+; CHECK-P8-NEXT:    xscvspdpn f6, v4
+; CHECK-P8-NEXT:    xxsldwi vs1, v5, v5, 3
+; CHECK-P8-NEXT:    xxswapd vs3, v5
+; CHECK-P8-NEXT:    xxsldwi vs9, v3, v3, 1
+; CHECK-P8-NEXT:    xscvspdpn f4, v3
+; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
+; CHECK-P8-NEXT:    xxsldwi vs10, v3, v3, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xxswapd vs11, v3
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xxsldwi vs7, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f9, vs9
+; CHECK-P8-NEXT:    xxswapd vs8, v2
+; CHECK-P8-NEXT:    xscvspdpn f0, v5
+; CHECK-P8-NEXT:    xxsldwi vs12, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f2, v2
+; CHECK-P8-NEXT:    xxswapd v2, v4
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xxsldwi v3, v4, v4, 1
+; CHECK-P8-NEXT:    xscvspdpn f10, vs10
+; CHECK-P8-NEXT:    xscvspdpn f11, vs11
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs1
+; CHECK-P8-NEXT:    xscvspdpn f7, vs7
+; CHECK-P8-NEXT:    xxmrghd vs4, vs4, vs9
+; CHECK-P8-NEXT:    xscvspdpn f8, vs8
+; CHECK-P8-NEXT:    xscvspdpn f12, vs12
+; CHECK-P8-NEXT:    xscvspdpn f13, vs13
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs5
+; CHECK-P8-NEXT:    xscvspdpn f3, v2
+; CHECK-P8-NEXT:    xscvspdpn f9, v3
+; CHECK-P8-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-P8-NEXT:    xvcvdpuxds v3, vs4
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs12
+; CHECK-P8-NEXT:    xxmrghd vs2, vs8, vs7
+; CHECK-P8-NEXT:    xvcvdpuxds v4, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs3, vs13
+; CHECK-P8-NEXT:    xvcvdpuxds v5, vs5
+; CHECK-P8-NEXT:    xxmrghd vs3, vs6, vs9
+; CHECK-P8-NEXT:    xvcvdpuxds v0, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v1, vs2
+; CHECK-P8-NEXT:    xvcvdpuxds v6, vs0
+; CHECK-P8-NEXT:    xxswapd vs0, v3
+; CHECK-P8-NEXT:    xvcvdpuxds v7, vs3
+; CHECK-P8-NEXT:    xxswapd vs4, v2
+; CHECK-P8-NEXT:    xxswapd vs3, v4
+; CHECK-P8-NEXT:    xxswapd vs1, v5
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xxswapd vs2, v0
+; CHECK-P8-NEXT:    xxswapd vs0, v1
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs5, v6
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs1, v7
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r8
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    lxv vs2, 48(r4)
+; CHECK-P9-NEXT:    lxv vs3, 32(r4)
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxsldwi vs4, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs5, vs1
+; CHECK-P9-NEXT:    xxsldwi vs6, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs7, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd vs8, vs0
+; CHECK-P9-NEXT:    xxsldwi vs9, vs0, vs0, 1
+; CHECK-P9-NEXT:    xxsldwi vs10, vs3, vs3, 3
+; CHECK-P9-NEXT:    xxswapd vs11, vs3
+; CHECK-P9-NEXT:    xxsldwi vs12, vs3, vs3, 1
+; CHECK-P9-NEXT:    xxsldwi vs13, vs2, vs2, 3
+; CHECK-P9-NEXT:    xxswapd v2, vs2
+; CHECK-P9-NEXT:    xxsldwi v3, vs2, vs2, 1
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvspdpn f8, vs8
+; CHECK-P9-NEXT:    xscvspdpn f9, vs9
+; CHECK-P9-NEXT:    xscvspdpn f10, vs10
+; CHECK-P9-NEXT:    xscvspdpn f11, vs11
+; CHECK-P9-NEXT:    xscvspdpn f12, vs12
+; CHECK-P9-NEXT:    xscvspdpn f13, vs13
+; CHECK-P9-NEXT:    xscvspdpn f31, v2
+; CHECK-P9-NEXT:    xscvspdpn f30, v3
+; CHECK-P9-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs1, vs1, vs6
+; CHECK-P9-NEXT:    xxmrghd vs5, vs8, vs7
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs9
+; CHECK-P9-NEXT:    xxmrghd vs6, vs11, vs10
+; CHECK-P9-NEXT:    xxmrghd vs3, vs3, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs31, vs13
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxmrghd vs2, vs2, vs30
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xvcvdpuxds vs4, vs4
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds vs5, vs5
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpuxds vs7, vs7
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs5, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs4, 0(r3)
+; CHECK-P9-NEXT:    stxv vs2, 112(r3)
+; CHECK-P9-NEXT:    stxv vs7, 96(r3)
+; CHECK-P9-NEXT:    stxv vs3, 80(r3)
+; CHECK-P9-NEXT:    stxv vs6, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    lxv vs2, 48(r4)
+; CHECK-BE-NEXT:    lxv vs3, 32(r4)
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs5, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs6, vs1
+; CHECK-BE-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-BE-NEXT:    xxsldwi vs8, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd vs9, vs0
+; CHECK-BE-NEXT:    xxsldwi vs10, vs3, vs3, 1
+; CHECK-BE-NEXT:    xxsldwi vs11, vs3, vs3, 3
+; CHECK-BE-NEXT:    xxswapd vs12, vs3
+; CHECK-BE-NEXT:    xxsldwi vs13, vs2, vs2, 1
+; CHECK-BE-NEXT:    xxsldwi v2, vs2, vs2, 3
+; CHECK-BE-NEXT:    xxswapd v3, vs2
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvspdpn f8, vs8
+; CHECK-BE-NEXT:    xscvspdpn f9, vs9
+; CHECK-BE-NEXT:    xscvspdpn f10, vs10
+; CHECK-BE-NEXT:    xscvspdpn f11, vs11
+; CHECK-BE-NEXT:    xscvspdpn f12, vs12
+; CHECK-BE-NEXT:    xscvspdpn f13, vs13
+; CHECK-BE-NEXT:    xscvspdpn f31, v2
+; CHECK-BE-NEXT:    xscvspdpn f30, v3
+; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs4
+; CHECK-BE-NEXT:    xxmrghd vs4, vs6, vs5
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs7
+; CHECK-BE-NEXT:    xxmrghd vs5, vs9, vs8
+; CHECK-BE-NEXT:    xxmrghd vs3, vs3, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs12, vs11
+; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs13
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs4, vs4
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs5, vs5
+; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-BE-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-BE-NEXT:    xvcvdpuxds vs7, vs7
+; CHECK-BE-NEXT:    stxv vs5, 48(r3)
+; CHECK-BE-NEXT:    stxv vs0, 32(r3)
+; CHECK-BE-NEXT:    stxv vs4, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs2, 96(r3)
+; CHECK-BE-NEXT:    stxv vs6, 80(r3)
+; CHECK-BE-NEXT:    stxv vs3, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x float>, <16 x float>* %0, align 64
+  %1 = fptoui <16 x float> %a to <16 x i64>
+  store <16 x i64> %1, <16 x i64>* %agg.result, align 128
+  ret void
+}
+
+define <2 x i64> @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xxsldwi vs1, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x float>
+  %1 = fptoui <2 x float> %0 to <2 x i64>
+  ret <2 x i64> %1
+}
+
+define void @test4elt_signed(<4 x i64>* noalias nocapture sret %agg.result, <4 x float> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f2, v2
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P8-NEXT:    xvcvdpuxds v3, vs1
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v3
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P9-NEXT:    xxswapd vs1, v2
+; CHECK-P9-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, v2
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsldwi vs0, v2, v2, 1
+; CHECK-BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-BE-NEXT:    xxswapd vs2, v2
+; CHECK-BE-NEXT:    xscvspdpn f3, v2
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xxmrghd vs0, vs3, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs2, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <4 x float> %a to <4 x i64>
+  store <4 x i64> %0, <4 x i64>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt_signed(<8 x i64>* noalias nocapture sret %agg.result, <8 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xxsldwi vs5, v3, v3, 3
+; CHECK-P8-NEXT:    xxswapd vs6, v3
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xxsldwi vs7, v3, v3, 1
+; CHECK-P8-NEXT:    xscvspdpn f2, v2
+; CHECK-P8-NEXT:    xscvspdpn f4, v3
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvspdpn f6, vs6
+; CHECK-P8-NEXT:    xscvspdpn f7, vs7
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xxmrghd vs2, vs6, vs5
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs7
+; CHECK-P8-NEXT:    xvcvdpuxds v3, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v4, vs2
+; CHECK-P8-NEXT:    xvcvdpuxds v5, vs3
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v3
+; CHECK-P8-NEXT:    xxswapd vs3, v4
+; CHECK-P8-NEXT:    xxswapd vs2, v5
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs3, vs1
+; CHECK-P9-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd vs6, vs0
+; CHECK-P9-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs1, vs1, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs6, vs5
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs7
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs3, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs2, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xxsldwi vs2, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs3, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs4, vs1
+; CHECK-BE-NEXT:    xxsldwi vs5, vs0, vs0, 1
+; CHECK-BE-NEXT:    xxsldwi vs6, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd vs7, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs4, vs3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs5
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs0, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x float>, <8 x float>* %0, align 32
+  %1 = fptoui <8 x float> %a to <8 x i64>
+  store <8 x i64> %1, <8 x i64>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt_signed(<16 x i64>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    lvx v4, 0, r4
+; CHECK-P8-NEXT:    li r8, 64
+; CHECK-P8-NEXT:    lvx v5, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r7
+; CHECK-P8-NEXT:    lvx v2, r4, r6
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    xxsldwi vs13, v4, v4, 3
+; CHECK-P8-NEXT:    xscvspdpn f6, v4
+; CHECK-P8-NEXT:    xxsldwi vs1, v5, v5, 3
+; CHECK-P8-NEXT:    xxswapd vs3, v5
+; CHECK-P8-NEXT:    xxsldwi vs9, v3, v3, 1
+; CHECK-P8-NEXT:    xscvspdpn f4, v3
+; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
+; CHECK-P8-NEXT:    xxsldwi vs10, v3, v3, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xxswapd vs11, v3
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xxsldwi vs7, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f9, vs9
+; CHECK-P8-NEXT:    xxswapd vs8, v2
+; CHECK-P8-NEXT:    xscvspdpn f0, v5
+; CHECK-P8-NEXT:    xxsldwi vs12, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f2, v2
+; CHECK-P8-NEXT:    xxswapd v2, v4
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xxsldwi v3, v4, v4, 1
+; CHECK-P8-NEXT:    xscvspdpn f10, vs10
+; CHECK-P8-NEXT:    xscvspdpn f11, vs11
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs1
+; CHECK-P8-NEXT:    xscvspdpn f7, vs7
+; CHECK-P8-NEXT:    xxmrghd vs4, vs4, vs9
+; CHECK-P8-NEXT:    xscvspdpn f8, vs8
+; CHECK-P8-NEXT:    xscvspdpn f12, vs12
+; CHECK-P8-NEXT:    xscvspdpn f13, vs13
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs5
+; CHECK-P8-NEXT:    xscvspdpn f3, v2
+; CHECK-P8-NEXT:    xscvspdpn f9, v3
+; CHECK-P8-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-P8-NEXT:    xvcvdpuxds v3, vs4
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs12
+; CHECK-P8-NEXT:    xxmrghd vs2, vs8, vs7
+; CHECK-P8-NEXT:    xvcvdpuxds v4, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs3, vs13
+; CHECK-P8-NEXT:    xvcvdpuxds v5, vs5
+; CHECK-P8-NEXT:    xxmrghd vs3, vs6, vs9
+; CHECK-P8-NEXT:    xvcvdpuxds v0, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v1, vs2
+; CHECK-P8-NEXT:    xvcvdpuxds v6, vs0
+; CHECK-P8-NEXT:    xxswapd vs0, v3
+; CHECK-P8-NEXT:    xvcvdpuxds v7, vs3
+; CHECK-P8-NEXT:    xxswapd vs4, v2
+; CHECK-P8-NEXT:    xxswapd vs3, v4
+; CHECK-P8-NEXT:    xxswapd vs1, v5
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xxswapd vs2, v0
+; CHECK-P8-NEXT:    xxswapd vs0, v1
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs5, v6
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs1, v7
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r8
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    lxv vs2, 48(r4)
+; CHECK-P9-NEXT:    lxv vs3, 32(r4)
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxsldwi vs4, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs5, vs1
+; CHECK-P9-NEXT:    xxsldwi vs6, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs7, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd vs8, vs0
+; CHECK-P9-NEXT:    xxsldwi vs9, vs0, vs0, 1
+; CHECK-P9-NEXT:    xxsldwi vs10, vs3, vs3, 3
+; CHECK-P9-NEXT:    xxswapd vs11, vs3
+; CHECK-P9-NEXT:    xxsldwi vs12, vs3, vs3, 1
+; CHECK-P9-NEXT:    xxsldwi vs13, vs2, vs2, 3
+; CHECK-P9-NEXT:    xxswapd v2, vs2
+; CHECK-P9-NEXT:    xxsldwi v3, vs2, vs2, 1
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvspdpn f8, vs8
+; CHECK-P9-NEXT:    xscvspdpn f9, vs9
+; CHECK-P9-NEXT:    xscvspdpn f10, vs10
+; CHECK-P9-NEXT:    xscvspdpn f11, vs11
+; CHECK-P9-NEXT:    xscvspdpn f12, vs12
+; CHECK-P9-NEXT:    xscvspdpn f13, vs13
+; CHECK-P9-NEXT:    xscvspdpn f31, v2
+; CHECK-P9-NEXT:    xscvspdpn f30, v3
+; CHECK-P9-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs1, vs1, vs6
+; CHECK-P9-NEXT:    xxmrghd vs5, vs8, vs7
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs9
+; CHECK-P9-NEXT:    xxmrghd vs6, vs11, vs10
+; CHECK-P9-NEXT:    xxmrghd vs3, vs3, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs31, vs13
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxmrghd vs2, vs2, vs30
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xvcvdpuxds vs4, vs4
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds vs5, vs5
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpuxds vs7, vs7
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs5, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs4, 0(r3)
+; CHECK-P9-NEXT:    stxv vs2, 112(r3)
+; CHECK-P9-NEXT:    stxv vs7, 96(r3)
+; CHECK-P9-NEXT:    stxv vs3, 80(r3)
+; CHECK-P9-NEXT:    stxv vs6, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    lxv vs2, 48(r4)
+; CHECK-BE-NEXT:    lxv vs3, 32(r4)
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs5, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs6, vs1
+; CHECK-BE-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-BE-NEXT:    xxsldwi vs8, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd vs9, vs0
+; CHECK-BE-NEXT:    xxsldwi vs10, vs3, vs3, 1
+; CHECK-BE-NEXT:    xxsldwi vs11, vs3, vs3, 3
+; CHECK-BE-NEXT:    xxswapd vs12, vs3
+; CHECK-BE-NEXT:    xxsldwi vs13, vs2, vs2, 1
+; CHECK-BE-NEXT:    xxsldwi v2, vs2, vs2, 3
+; CHECK-BE-NEXT:    xxswapd v3, vs2
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvspdpn f8, vs8
+; CHECK-BE-NEXT:    xscvspdpn f9, vs9
+; CHECK-BE-NEXT:    xscvspdpn f10, vs10
+; CHECK-BE-NEXT:    xscvspdpn f11, vs11
+; CHECK-BE-NEXT:    xscvspdpn f12, vs12
+; CHECK-BE-NEXT:    xscvspdpn f13, vs13
+; CHECK-BE-NEXT:    xscvspdpn f31, v2
+; CHECK-BE-NEXT:    xscvspdpn f30, v3
+; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs4
+; CHECK-BE-NEXT:    xxmrghd vs4, vs6, vs5
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs7
+; CHECK-BE-NEXT:    xxmrghd vs5, vs9, vs8
+; CHECK-BE-NEXT:    xxmrghd vs3, vs3, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs12, vs11
+; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs13
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs4, vs4
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs5, vs5
+; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-BE-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-BE-NEXT:    xvcvdpuxds vs7, vs7
+; CHECK-BE-NEXT:    stxv vs5, 48(r3)
+; CHECK-BE-NEXT:    stxv vs0, 32(r3)
+; CHECK-BE-NEXT:    stxv vs4, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs2, 96(r3)
+; CHECK-BE-NEXT:    stxv vs6, 80(r3)
+; CHECK-BE-NEXT:    stxv vs3, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x float>, <16 x float>* %0, align 64
+  %1 = fptoui <16 x float> %a to <16 x i64>
+  store <16 x i64> %1, <16 x i64>* %agg.result, align 128
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll b/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..708e844b549df704d556a0f3f8281080307e0543
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll
@@ -0,0 +1,1486 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i16 @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    vmrglb v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r3, r3, 48
+; CHECK-P8-NEXT:    sth r3, -2(r1)
+; CHECK-P8-NEXT:    lhz r3, -2(r1)
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    addi r3, r1, -2
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vmrglb v2, v3, v2
+; CHECK-P9-NEXT:    vsldoi v2, v2, v2, 8
+; CHECK-P9-NEXT:    stxsihx v2, 0, r3
+; CHECK-P9-NEXT:    lhz r3, -2(r1)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xxsldwi vs1, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    mfvsrwz r3, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    mfvsrwz r4, f1
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    addi r3, r1, -2
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    vmrghb v2, v2, v3
+; CHECK-BE-NEXT:    vsldoi v2, v2, v2, 10
+; CHECK-BE-NEXT:    stxsihx v2, 0, r3
+; CHECK-BE-NEXT:    lhz r3, -2(r1)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x float>
+  %1 = fptoui <2 x float> %0 to <2 x i8>
+  %2 = bitcast <2 x i8> %1 to i16
+  ret i16 %2
+}
+
+define i32 @test4elt(<4 x float> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    mtvsrd f3, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs2
+; CHECK-P8-NEXT:    xxswapd v5, vs3
+; CHECK-P8-NEXT:    vmrglb v2, v3, v2
+; CHECK-P8-NEXT:    vmrglb v3, v4, v5
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P9-NEXT:    xxswapd vs1, v2
+; CHECK-P9-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, v2
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    mfvsrwz r5, f3
+; CHECK-P9-NEXT:    mfvsrwz r3, f0
+; CHECK-P9-NEXT:    mfvsrwz r4, f1
+; CHECK-P9-NEXT:    mfvsrwz r6, f2
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    vmrglb v2, v3, v2
+; CHECK-P9-NEXT:    vmrglb v3, v4, v5
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vextuwrx r3, r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-BE-NEXT:    xxswapd vs1, v2
+; CHECK-BE-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-BE-NEXT:    xscvspdpn f3, v2
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    mfvsrwz r5, f3
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    mfvsrwz r3, f0
+; CHECK-BE-NEXT:    mfvsrwz r4, f1
+; CHECK-BE-NEXT:    mfvsrwz r6, f2
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    li r3, 0
+; CHECK-BE-NEXT:    vmrghb v2, v3, v2
+; CHECK-BE-NEXT:    vmrghb v3, v4, v5
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vextuwlx r3, r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <4 x float> %a to <4 x i8>
+  %1 = bitcast <4 x i8> %0 to i32
+  ret i32 %1
+}
+
+define i64 @test8elt(<8 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lvx v5, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xxsldwi vs2, v5, v5, 3
+; CHECK-P8-NEXT:    xscvspdpn f4, v5
+; CHECK-P8-NEXT:    xxswapd vs3, v5
+; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    mfvsrwz r6, f1
+; CHECK-P8-NEXT:    mfvsrwz r5, f0
+; CHECK-P8-NEXT:    mtvsrd f1, r6
+; CHECK-P8-NEXT:    mtvsrd f0, r5
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 1
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, v2
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v1, vs4
+; CHECK-P8-NEXT:    vmrglb v2, v4, v3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    xxswapd v5, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v6, vs3
+; CHECK-P8-NEXT:    xxswapd v0, vs0
+; CHECK-P8-NEXT:    vmrglb v3, v3, v4
+; CHECK-P8-NEXT:    vmrglb v4, v0, v5
+; CHECK-P8-NEXT:    vmrglb v5, v1, v6
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglh v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs3, vs1
+; CHECK-P9-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd vs6, vs0
+; CHECK-P9-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    mfvsrwz r5, f1
+; CHECK-P9-NEXT:    mfvsrwz r9, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f2
+; CHECK-P9-NEXT:    mfvsrwz r4, f3
+; CHECK-P9-NEXT:    mfvsrwz r6, f4
+; CHECK-P9-NEXT:    mfvsrwz r7, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r10, f7
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v6, vs6
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    xxswapd v0, vs4
+; CHECK-P9-NEXT:    xxswapd v1, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs7
+; CHECK-P9-NEXT:    vmrglb v2, v3, v2
+; CHECK-P9-NEXT:    vmrglb v3, v4, v5
+; CHECK-P9-NEXT:    vmrglb v4, v1, v0
+; CHECK-P9-NEXT:    vmrglb v5, v6, v7
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglh v3, v5, v4
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs3, vs1
+; CHECK-BE-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd vs6, vs0
+; CHECK-BE-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    mfvsrwz r5, f1
+; CHECK-BE-NEXT:    mfvsrwz r9, f0
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    sldi r9, r9, 56
+; CHECK-BE-NEXT:    mfvsrwz r3, f2
+; CHECK-BE-NEXT:    mfvsrwz r4, f3
+; CHECK-BE-NEXT:    mfvsrwz r6, f4
+; CHECK-BE-NEXT:    mfvsrwz r7, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r10, f7
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    sldi r7, r7, 56
+; CHECK-BE-NEXT:    sldi r8, r8, 56
+; CHECK-BE-NEXT:    sldi r10, r10, 56
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    vmrghb v2, v3, v2
+; CHECK-BE-NEXT:    vmrghb v3, v4, v5
+; CHECK-BE-NEXT:    vmrghb v4, v1, v0
+; CHECK-BE-NEXT:    vmrghb v5, v6, v7
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v5, v4
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x float>, <8 x float>* %0, align 32
+  %1 = fptoui <8 x float> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to i64
+  ret i64 %2
+}
+
+define <16 x i8> @test16elt(<16 x float>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lvx v3, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    xscvspdpn f2, v2
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f4, v3
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xxsldwi vs5, v3, v3, 3
+; CHECK-P8-NEXT:    lvx v2, r3, r4
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xxswapd vs6, v3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xxsldwi vs7, v3, v3, 1
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xxsldwi vs8, v2, v2, 3
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xxswapd vs9, v2
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvspdpn f6, vs6
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xscvspdpn f7, vs7
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvspdpn f8, vs8
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvdpsxws f0, f5
+; CHECK-P8-NEXT:    xxswapd v0, vs4
+; CHECK-P8-NEXT:    xscvspdpn f9, vs9
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xscvdpsxws f1, f6
+; CHECK-P8-NEXT:    xxswapd v3, vs5
+; CHECK-P8-NEXT:    mtvsrd f6, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    xscvdpsxws f3, f7
+; CHECK-P8-NEXT:    xxswapd v4, vs6
+; CHECK-P8-NEXT:    mtvsrd f7, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvdpsxws f0, f8
+; CHECK-P8-NEXT:    xxswapd v5, vs7
+; CHECK-P8-NEXT:    mtvsrd f8, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xscvdpsxws f1, f9
+; CHECK-P8-NEXT:    xxswapd v1, vs8
+; CHECK-P8-NEXT:    mtvsrd f9, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    vmrglb v3, v4, v3
+; CHECK-P8-NEXT:    xxswapd v4, vs2
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    xxswapd v6, vs9
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvspdpn f0, v2
+; CHECK-P8-NEXT:    xxswapd v7, vs3
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    vmrglb v4, v4, v5
+; CHECK-P8-NEXT:    xxswapd v5, vs5
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    lvx v9, r3, r4
+; CHECK-P8-NEXT:    vmrglb v1, v6, v1
+; CHECK-P8-NEXT:    xxswapd v8, vs1
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 1
+; CHECK-P8-NEXT:    xxsldwi vs2, v9, v9, 3
+; CHECK-P8-NEXT:    xscvspdpn f4, v9
+; CHECK-P8-NEXT:    xxswapd vs3, v9
+; CHECK-P8-NEXT:    xxsldwi vs5, v9, v9, 1
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v9, vs4
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xxswapd v6, vs1
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    vmrglb v2, v0, v7
+; CHECK-P8-NEXT:    xxswapd v0, vs0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v7, vs2
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    vmrglb v5, v8, v5
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    xxswapd v10, vs3
+; CHECK-P8-NEXT:    vmrglb v0, v0, v6
+; CHECK-P8-NEXT:    vmrglh v3, v4, v3
+; CHECK-P8-NEXT:    vmrglb v6, v8, v7
+; CHECK-P8-NEXT:    vmrglb v7, v9, v10
+; CHECK-P8-NEXT:    vmrglh v2, v2, v1
+; CHECK-P8-NEXT:    vmrglh v4, v0, v5
+; CHECK-P8-NEXT:    vmrglh v5, v7, v6
+; CHECK-P8-NEXT:    vmrglw v2, v2, v3
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs2, 16(r3)
+; CHECK-P9-NEXT:    lxv vs3, 0(r3)
+; CHECK-P9-NEXT:    lxv vs0, 48(r3)
+; CHECK-P9-NEXT:    lxv vs1, 32(r3)
+; CHECK-P9-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxsldwi vs4, vs3, vs3, 3
+; CHECK-P9-NEXT:    xxswapd vs5, vs3
+; CHECK-P9-NEXT:    xxsldwi vs6, vs3, vs3, 1
+; CHECK-P9-NEXT:    xxsldwi vs7, vs2, vs2, 3
+; CHECK-P9-NEXT:    xxswapd vs8, vs2
+; CHECK-P9-NEXT:    xxsldwi vs9, vs2, vs2, 1
+; CHECK-P9-NEXT:    xxsldwi vs10, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs11, vs1
+; CHECK-P9-NEXT:    xxsldwi vs12, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs13, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxsldwi v3, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvspdpn f8, vs8
+; CHECK-P9-NEXT:    xscvspdpn f9, vs9
+; CHECK-P9-NEXT:    xscvspdpn f10, vs10
+; CHECK-P9-NEXT:    xscvspdpn f11, vs11
+; CHECK-P9-NEXT:    xscvspdpn f12, vs12
+; CHECK-P9-NEXT:    xscvspdpn f13, vs13
+; CHECK-P9-NEXT:    xscvspdpn v2, v2
+; CHECK-P9-NEXT:    xscvspdpn v3, v3
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    xscvdpsxws f8, f8
+; CHECK-P9-NEXT:    xscvdpsxws f9, f9
+; CHECK-P9-NEXT:    xscvdpsxws f10, f10
+; CHECK-P9-NEXT:    xscvdpsxws f11, f11
+; CHECK-P9-NEXT:    xscvdpsxws f12, f12
+; CHECK-P9-NEXT:    xscvdpsxws f13, f13
+; CHECK-P9-NEXT:    xscvdpsxws v2, v2
+; CHECK-P9-NEXT:    xscvdpsxws v3, v3
+; CHECK-P9-NEXT:    mfvsrwz r3, f3
+; CHECK-P9-NEXT:    mfvsrwz r4, f2
+; CHECK-P9-NEXT:    mfvsrwz r11, f1
+; CHECK-P9-NEXT:    mfvsrwz r12, f0
+; CHECK-P9-NEXT:    mfvsrwz r5, f4
+; CHECK-P9-NEXT:    mfvsrwz r6, f5
+; CHECK-P9-NEXT:    mfvsrwz r7, f6
+; CHECK-P9-NEXT:    mfvsrwz r8, f7
+; CHECK-P9-NEXT:    mfvsrwz r9, f8
+; CHECK-P9-NEXT:    mfvsrwz r10, f9
+; CHECK-P9-NEXT:    mfvsrwz r0, f10
+; CHECK-P9-NEXT:    mfvsrwz r30, f11
+; CHECK-P9-NEXT:    mfvsrwz r29, f12
+; CHECK-P9-NEXT:    mfvsrwz r28, f13
+; CHECK-P9-NEXT:    mfvsrwz r27, v2
+; CHECK-P9-NEXT:    mfvsrwz r26, v3
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f8, r11
+; CHECK-P9-NEXT:    mtvsrd f9, r12
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    mtvsrd f10, r0
+; CHECK-P9-NEXT:    mtvsrd f11, r30
+; CHECK-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f12, r29
+; CHECK-P9-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f13, r28
+; CHECK-P9-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd v2, r27
+; CHECK-P9-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd v3, r26
+; CHECK-P9-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    xxswapd v5, vs2
+; CHECK-P9-NEXT:    xxswapd v0, vs3
+; CHECK-P9-NEXT:    xxswapd v1, vs4
+; CHECK-P9-NEXT:    xxswapd v6, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs6
+; CHECK-P9-NEXT:    xxswapd v8, vs1
+; CHECK-P9-NEXT:    xxswapd v9, vs7
+; CHECK-P9-NEXT:    xxswapd v10, vs10
+; CHECK-P9-NEXT:    xxswapd v11, vs11
+; CHECK-P9-NEXT:    xxswapd v12, vs8
+; CHECK-P9-NEXT:    xxswapd v13, vs12
+; CHECK-P9-NEXT:    xxswapd v14, vs13
+; CHECK-P9-NEXT:    xxswapd v2, v2
+; CHECK-P9-NEXT:    xxswapd v15, vs9
+; CHECK-P9-NEXT:    xxswapd v3, v3
+; CHECK-P9-NEXT:    vmrglb v5, v0, v5
+; CHECK-P9-NEXT:    vmrglb v4, v4, v1
+; CHECK-P9-NEXT:    vmrglb v0, v7, v6
+; CHECK-P9-NEXT:    vmrglb v1, v8, v9
+; CHECK-P9-NEXT:    vmrglb v6, v11, v10
+; CHECK-P9-NEXT:    vmrglb v7, v12, v13
+; CHECK-P9-NEXT:    vmrglb v2, v2, v14
+; CHECK-P9-NEXT:    vmrglb v3, v15, v3
+; CHECK-P9-NEXT:    vmrglh v4, v4, v5
+; CHECK-P9-NEXT:    vmrglh v5, v1, v0
+; CHECK-P9-NEXT:    vmrglh v0, v7, v6
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglw v3, v5, v4
+; CHECK-P9-NEXT:    vmrglw v2, v2, v0
+; CHECK-P9-NEXT:    xxmrgld v2, v2, v3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxsldwi vs4, vs3, vs3, 3
+; CHECK-BE-NEXT:    xxswapd vs5, vs3
+; CHECK-BE-NEXT:    xxsldwi vs6, vs3, vs3, 1
+; CHECK-BE-NEXT:    xxsldwi vs7, vs2, vs2, 3
+; CHECK-BE-NEXT:    xxswapd vs8, vs2
+; CHECK-BE-NEXT:    xxsldwi vs9, vs2, vs2, 1
+; CHECK-BE-NEXT:    xxsldwi vs10, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs11, vs1
+; CHECK-BE-NEXT:    xxsldwi vs12, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs13, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd v2, vs0
+; CHECK-BE-NEXT:    xxsldwi v3, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvspdpn f8, vs8
+; CHECK-BE-NEXT:    xscvspdpn f9, vs9
+; CHECK-BE-NEXT:    xscvspdpn f10, vs10
+; CHECK-BE-NEXT:    xscvspdpn f11, vs11
+; CHECK-BE-NEXT:    xscvspdpn f12, vs12
+; CHECK-BE-NEXT:    xscvspdpn f13, vs13
+; CHECK-BE-NEXT:    xscvspdpn v2, v2
+; CHECK-BE-NEXT:    xscvspdpn v3, v3
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    xscvdpsxws f8, f8
+; CHECK-BE-NEXT:    xscvdpsxws f9, f9
+; CHECK-BE-NEXT:    xscvdpsxws f10, f10
+; CHECK-BE-NEXT:    xscvdpsxws f11, f11
+; CHECK-BE-NEXT:    xscvdpsxws f12, f12
+; CHECK-BE-NEXT:    xscvdpsxws f13, f13
+; CHECK-BE-NEXT:    xscvdpsxws v2, v2
+; CHECK-BE-NEXT:    xscvdpsxws v3, v3
+; CHECK-BE-NEXT:    mfvsrwz r3, f3
+; CHECK-BE-NEXT:    mfvsrwz r4, f2
+; CHECK-BE-NEXT:    mfvsrwz r11, f1
+; CHECK-BE-NEXT:    mfvsrwz r12, f0
+; CHECK-BE-NEXT:    mfvsrwz r5, f4
+; CHECK-BE-NEXT:    mfvsrwz r6, f5
+; CHECK-BE-NEXT:    mfvsrwz r7, f6
+; CHECK-BE-NEXT:    mfvsrwz r8, f7
+; CHECK-BE-NEXT:    mfvsrwz r9, f8
+; CHECK-BE-NEXT:    mfvsrwz r10, f9
+; CHECK-BE-NEXT:    mfvsrwz r0, f10
+; CHECK-BE-NEXT:    mfvsrwz r30, f11
+; CHECK-BE-NEXT:    mfvsrwz r29, f12
+; CHECK-BE-NEXT:    mfvsrwz r28, f13
+; CHECK-BE-NEXT:    mfvsrwz r27, v2
+; CHECK-BE-NEXT:    mfvsrwz r26, v3
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r11, r11, 56
+; CHECK-BE-NEXT:    sldi r12, r12, 56
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    sldi r7, r7, 56
+; CHECK-BE-NEXT:    sldi r8, r8, 56
+; CHECK-BE-NEXT:    sldi r9, r9, 56
+; CHECK-BE-NEXT:    sldi r10, r10, 56
+; CHECK-BE-NEXT:    sldi r0, r0, 56
+; CHECK-BE-NEXT:    sldi r30, r30, 56
+; CHECK-BE-NEXT:    sldi r29, r29, 56
+; CHECK-BE-NEXT:    sldi r28, r28, 56
+; CHECK-BE-NEXT:    sldi r27, r27, 56
+; CHECK-BE-NEXT:    sldi r26, r26, 56
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v10, r11
+; CHECK-BE-NEXT:    mtvsrd v14, r12
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    mtvsrd v8, r0
+; CHECK-BE-NEXT:    mtvsrd v9, r30
+; CHECK-BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v11, r29
+; CHECK-BE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v12, r28
+; CHECK-BE-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v13, r27
+; CHECK-BE-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v15, r26
+; CHECK-BE-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    vmrghb v4, v5, v4
+; CHECK-BE-NEXT:    vmrghb v2, v2, v0
+; CHECK-BE-NEXT:    vmrghb v5, v6, v1
+; CHECK-BE-NEXT:    vmrghb v3, v3, v7
+; CHECK-BE-NEXT:    vmrghb v0, v9, v8
+; CHECK-BE-NEXT:    vmrghb v1, v10, v11
+; CHECK-BE-NEXT:    vmrghb v6, v13, v12
+; CHECK-BE-NEXT:    vmrghb v7, v14, v15
+; CHECK-BE-NEXT:    vmrghh v2, v2, v4
+; CHECK-BE-NEXT:    vmrghh v3, v3, v5
+; CHECK-BE-NEXT:    vmrghh v4, v1, v0
+; CHECK-BE-NEXT:    vmrghh v5, v7, v6
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x float>, <16 x float>* %0, align 64
+  %1 = fptoui <16 x float> %a to <16 x i8>
+  ret <16 x i8> %1
+}
+
+define i16 @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    vmrglb v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r3, r3, 48
+; CHECK-P8-NEXT:    sth r3, -2(r1)
+; CHECK-P8-NEXT:    lhz r3, -2(r1)
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    addi r3, r1, -2
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vmrglb v2, v3, v2
+; CHECK-P9-NEXT:    vsldoi v2, v2, v2, 8
+; CHECK-P9-NEXT:    stxsihx v2, 0, r3
+; CHECK-P9-NEXT:    lhz r3, -2(r1)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xxsldwi vs1, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    mfvsrwz r3, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    mfvsrwz r4, f1
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    addi r3, r1, -2
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    vmrghb v2, v2, v3
+; CHECK-BE-NEXT:    vsldoi v2, v2, v2, 10
+; CHECK-BE-NEXT:    stxsihx v2, 0, r3
+; CHECK-BE-NEXT:    lhz r3, -2(r1)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x float>
+  %1 = fptosi <2 x float> %0 to <2 x i8>
+  %2 = bitcast <2 x i8> %1 to i16
+  ret i16 %2
+}
+
+define i32 @test4elt_signed(<4 x float> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f1, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    mtvsrd f3, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs2
+; CHECK-P8-NEXT:    xxswapd v5, vs3
+; CHECK-P8-NEXT:    vmrglb v2, v3, v2
+; CHECK-P8-NEXT:    vmrglb v3, v4, v5
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P9-NEXT:    xxswapd vs1, v2
+; CHECK-P9-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, v2
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    mfvsrwz r5, f3
+; CHECK-P9-NEXT:    mfvsrwz r3, f0
+; CHECK-P9-NEXT:    mfvsrwz r4, f1
+; CHECK-P9-NEXT:    mfvsrwz r6, f2
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    vmrglb v2, v3, v2
+; CHECK-P9-NEXT:    vmrglb v3, v4, v5
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vextuwrx r3, r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-BE-NEXT:    xxswapd vs1, v2
+; CHECK-BE-NEXT:    xxsldwi vs2, v2, v2, 1
+; CHECK-BE-NEXT:    xscvspdpn f3, v2
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    mfvsrwz r5, f3
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    mfvsrwz r3, f0
+; CHECK-BE-NEXT:    mfvsrwz r4, f1
+; CHECK-BE-NEXT:    mfvsrwz r6, f2
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    li r3, 0
+; CHECK-BE-NEXT:    vmrghb v2, v3, v2
+; CHECK-BE-NEXT:    vmrghb v3, v4, v5
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vextuwlx r3, r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptosi <4 x float> %a to <4 x i8>
+  %1 = bitcast <4 x i8> %0 to i32
+  ret i32 %1
+}
+
+define i64 @test8elt_signed(<8 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lvx v5, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xxsldwi vs2, v5, v5, 3
+; CHECK-P8-NEXT:    xscvspdpn f4, v5
+; CHECK-P8-NEXT:    xxswapd vs3, v5
+; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    mfvsrwz r6, f1
+; CHECK-P8-NEXT:    mfvsrwz r5, f0
+; CHECK-P8-NEXT:    mtvsrd f1, r6
+; CHECK-P8-NEXT:    mtvsrd f0, r5
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 1
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    xscvspdpn f0, v2
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v1, vs4
+; CHECK-P8-NEXT:    vmrglb v2, v4, v3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    xxswapd v5, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    xxswapd v4, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v6, vs3
+; CHECK-P8-NEXT:    xxswapd v0, vs0
+; CHECK-P8-NEXT:    vmrglb v3, v3, v4
+; CHECK-P8-NEXT:    vmrglb v4, v0, v5
+; CHECK-P8-NEXT:    vmrglb v5, v1, v6
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglh v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs3, vs1
+; CHECK-P9-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd vs6, vs0
+; CHECK-P9-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    mfvsrwz r5, f1
+; CHECK-P9-NEXT:    mfvsrwz r9, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f2
+; CHECK-P9-NEXT:    mfvsrwz r4, f3
+; CHECK-P9-NEXT:    mfvsrwz r6, f4
+; CHECK-P9-NEXT:    mfvsrwz r7, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r10, f7
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v6, vs6
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    xxswapd v0, vs4
+; CHECK-P9-NEXT:    xxswapd v1, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs7
+; CHECK-P9-NEXT:    vmrglb v2, v3, v2
+; CHECK-P9-NEXT:    vmrglb v3, v4, v5
+; CHECK-P9-NEXT:    vmrglb v4, v1, v0
+; CHECK-P9-NEXT:    vmrglb v5, v6, v7
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglh v3, v5, v4
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxsldwi vs2, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs3, vs1
+; CHECK-BE-NEXT:    xxsldwi vs4, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs5, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd vs6, vs0
+; CHECK-BE-NEXT:    xxsldwi vs7, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    mfvsrwz r5, f1
+; CHECK-BE-NEXT:    mfvsrwz r9, f0
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    sldi r9, r9, 56
+; CHECK-BE-NEXT:    mfvsrwz r3, f2
+; CHECK-BE-NEXT:    mfvsrwz r4, f3
+; CHECK-BE-NEXT:    mfvsrwz r6, f4
+; CHECK-BE-NEXT:    mfvsrwz r7, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r10, f7
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    sldi r7, r7, 56
+; CHECK-BE-NEXT:    sldi r8, r8, 56
+; CHECK-BE-NEXT:    sldi r10, r10, 56
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    vmrghb v2, v3, v2
+; CHECK-BE-NEXT:    vmrghb v3, v4, v5
+; CHECK-BE-NEXT:    vmrghb v4, v1, v0
+; CHECK-BE-NEXT:    vmrghb v5, v6, v7
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v5, v4
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x float>, <8 x float>* %0, align 32
+  %1 = fptosi <8 x float> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to i64
+  ret i64 %2
+}
+
+define <16 x i8> @test16elt_signed(<16 x float>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lvx v3, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    xscvspdpn f2, v2
+; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P8-NEXT:    xscvspdpn f4, v3
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT:    xxsldwi vs5, v3, v3, 3
+; CHECK-P8-NEXT:    lvx v2, r3, r4
+; CHECK-P8-NEXT:    xscvspdpn f0, vs0
+; CHECK-P8-NEXT:    xxswapd vs6, v3
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xxsldwi vs7, v3, v3, 1
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xxsldwi vs8, v2, v2, 3
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xxswapd vs9, v2
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvspdpn f6, vs6
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xscvspdpn f7, vs7
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvspdpn f8, vs8
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvdpsxws f0, f5
+; CHECK-P8-NEXT:    xxswapd v0, vs4
+; CHECK-P8-NEXT:    xscvspdpn f9, vs9
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xscvdpsxws f1, f6
+; CHECK-P8-NEXT:    xxswapd v3, vs5
+; CHECK-P8-NEXT:    mtvsrd f6, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    xscvdpsxws f3, f7
+; CHECK-P8-NEXT:    xxswapd v4, vs6
+; CHECK-P8-NEXT:    mtvsrd f7, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvdpsxws f0, f8
+; CHECK-P8-NEXT:    xxswapd v5, vs7
+; CHECK-P8-NEXT:    mtvsrd f8, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xscvdpsxws f1, f9
+; CHECK-P8-NEXT:    xxswapd v1, vs8
+; CHECK-P8-NEXT:    mtvsrd f9, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    vmrglb v3, v4, v3
+; CHECK-P8-NEXT:    xxswapd v4, vs2
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    xxswapd v6, vs9
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvspdpn f0, v2
+; CHECK-P8-NEXT:    xxswapd v7, vs3
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    vmrglb v4, v4, v5
+; CHECK-P8-NEXT:    xxswapd v5, vs5
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    lvx v9, r3, r4
+; CHECK-P8-NEXT:    vmrglb v1, v6, v1
+; CHECK-P8-NEXT:    xxswapd v8, vs1
+; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 1
+; CHECK-P8-NEXT:    xxsldwi vs2, v9, v9, 3
+; CHECK-P8-NEXT:    xscvspdpn f4, v9
+; CHECK-P8-NEXT:    xxswapd vs3, v9
+; CHECK-P8-NEXT:    xxsldwi vs5, v9, v9, 1
+; CHECK-P8-NEXT:    xscvspdpn f1, vs1
+; CHECK-P8-NEXT:    xscvspdpn f2, vs2
+; CHECK-P8-NEXT:    xscvspdpn f3, vs3
+; CHECK-P8-NEXT:    xscvspdpn f5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f4, f4
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    xxswapd v9, vs4
+; CHECK-P8-NEXT:    mtvsrd f1, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    xxswapd v6, vs1
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    vmrglb v2, v0, v7
+; CHECK-P8-NEXT:    xxswapd v0, vs0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v7, vs2
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    vmrglb v5, v8, v5
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    xxswapd v10, vs3
+; CHECK-P8-NEXT:    vmrglb v0, v0, v6
+; CHECK-P8-NEXT:    vmrglh v3, v4, v3
+; CHECK-P8-NEXT:    vmrglb v6, v8, v7
+; CHECK-P8-NEXT:    vmrglb v7, v9, v10
+; CHECK-P8-NEXT:    vmrglh v2, v2, v1
+; CHECK-P8-NEXT:    vmrglh v4, v0, v5
+; CHECK-P8-NEXT:    vmrglh v5, v7, v6
+; CHECK-P8-NEXT:    vmrglw v2, v2, v3
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs2, 16(r3)
+; CHECK-P9-NEXT:    lxv vs3, 0(r3)
+; CHECK-P9-NEXT:    lxv vs0, 48(r3)
+; CHECK-P9-NEXT:    lxv vs1, 32(r3)
+; CHECK-P9-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxsldwi vs4, vs3, vs3, 3
+; CHECK-P9-NEXT:    xxswapd vs5, vs3
+; CHECK-P9-NEXT:    xxsldwi vs6, vs3, vs3, 1
+; CHECK-P9-NEXT:    xxsldwi vs7, vs2, vs2, 3
+; CHECK-P9-NEXT:    xxswapd vs8, vs2
+; CHECK-P9-NEXT:    xxsldwi vs9, vs2, vs2, 1
+; CHECK-P9-NEXT:    xxsldwi vs10, vs1, vs1, 3
+; CHECK-P9-NEXT:    xxswapd vs11, vs1
+; CHECK-P9-NEXT:    xxsldwi vs12, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi vs13, vs0, vs0, 3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxsldwi v3, vs0, vs0, 1
+; CHECK-P9-NEXT:    xscvspdpn f3, vs3
+; CHECK-P9-NEXT:    xscvspdpn f2, vs2
+; CHECK-P9-NEXT:    xscvspdpn f1, vs1
+; CHECK-P9-NEXT:    xscvspdpn f0, vs0
+; CHECK-P9-NEXT:    xscvspdpn f4, vs4
+; CHECK-P9-NEXT:    xscvspdpn f5, vs5
+; CHECK-P9-NEXT:    xscvspdpn f6, vs6
+; CHECK-P9-NEXT:    xscvspdpn f7, vs7
+; CHECK-P9-NEXT:    xscvspdpn f8, vs8
+; CHECK-P9-NEXT:    xscvspdpn f9, vs9
+; CHECK-P9-NEXT:    xscvspdpn f10, vs10
+; CHECK-P9-NEXT:    xscvspdpn f11, vs11
+; CHECK-P9-NEXT:    xscvspdpn f12, vs12
+; CHECK-P9-NEXT:    xscvspdpn f13, vs13
+; CHECK-P9-NEXT:    xscvspdpn v2, v2
+; CHECK-P9-NEXT:    xscvspdpn v3, v3
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    xscvdpsxws f8, f8
+; CHECK-P9-NEXT:    xscvdpsxws f9, f9
+; CHECK-P9-NEXT:    xscvdpsxws f10, f10
+; CHECK-P9-NEXT:    xscvdpsxws f11, f11
+; CHECK-P9-NEXT:    xscvdpsxws f12, f12
+; CHECK-P9-NEXT:    xscvdpsxws f13, f13
+; CHECK-P9-NEXT:    xscvdpsxws v2, v2
+; CHECK-P9-NEXT:    xscvdpsxws v3, v3
+; CHECK-P9-NEXT:    mfvsrwz r3, f3
+; CHECK-P9-NEXT:    mfvsrwz r4, f2
+; CHECK-P9-NEXT:    mfvsrwz r11, f1
+; CHECK-P9-NEXT:    mfvsrwz r12, f0
+; CHECK-P9-NEXT:    mfvsrwz r5, f4
+; CHECK-P9-NEXT:    mfvsrwz r6, f5
+; CHECK-P9-NEXT:    mfvsrwz r7, f6
+; CHECK-P9-NEXT:    mfvsrwz r8, f7
+; CHECK-P9-NEXT:    mfvsrwz r9, f8
+; CHECK-P9-NEXT:    mfvsrwz r10, f9
+; CHECK-P9-NEXT:    mfvsrwz r0, f10
+; CHECK-P9-NEXT:    mfvsrwz r30, f11
+; CHECK-P9-NEXT:    mfvsrwz r29, f12
+; CHECK-P9-NEXT:    mfvsrwz r28, f13
+; CHECK-P9-NEXT:    mfvsrwz r27, v2
+; CHECK-P9-NEXT:    mfvsrwz r26, v3
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f8, r11
+; CHECK-P9-NEXT:    mtvsrd f9, r12
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    mtvsrd f10, r0
+; CHECK-P9-NEXT:    mtvsrd f11, r30
+; CHECK-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f12, r29
+; CHECK-P9-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd f13, r28
+; CHECK-P9-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd v2, r27
+; CHECK-P9-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrd v3, r26
+; CHECK-P9-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    xxswapd v5, vs2
+; CHECK-P9-NEXT:    xxswapd v0, vs3
+; CHECK-P9-NEXT:    xxswapd v1, vs4
+; CHECK-P9-NEXT:    xxswapd v6, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs6
+; CHECK-P9-NEXT:    xxswapd v8, vs1
+; CHECK-P9-NEXT:    xxswapd v9, vs7
+; CHECK-P9-NEXT:    xxswapd v10, vs10
+; CHECK-P9-NEXT:    xxswapd v11, vs11
+; CHECK-P9-NEXT:    xxswapd v12, vs8
+; CHECK-P9-NEXT:    xxswapd v13, vs12
+; CHECK-P9-NEXT:    xxswapd v14, vs13
+; CHECK-P9-NEXT:    xxswapd v2, v2
+; CHECK-P9-NEXT:    xxswapd v15, vs9
+; CHECK-P9-NEXT:    xxswapd v3, v3
+; CHECK-P9-NEXT:    vmrglb v5, v0, v5
+; CHECK-P9-NEXT:    vmrglb v4, v4, v1
+; CHECK-P9-NEXT:    vmrglb v0, v7, v6
+; CHECK-P9-NEXT:    vmrglb v1, v8, v9
+; CHECK-P9-NEXT:    vmrglb v6, v11, v10
+; CHECK-P9-NEXT:    vmrglb v7, v12, v13
+; CHECK-P9-NEXT:    vmrglb v2, v2, v14
+; CHECK-P9-NEXT:    vmrglb v3, v15, v3
+; CHECK-P9-NEXT:    vmrglh v4, v4, v5
+; CHECK-P9-NEXT:    vmrglh v5, v1, v0
+; CHECK-P9-NEXT:    vmrglh v0, v7, v6
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglw v3, v5, v4
+; CHECK-P9-NEXT:    vmrglw v2, v2, v0
+; CHECK-P9-NEXT:    xxmrgld v2, v2, v3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxsldwi vs4, vs3, vs3, 3
+; CHECK-BE-NEXT:    xxswapd vs5, vs3
+; CHECK-BE-NEXT:    xxsldwi vs6, vs3, vs3, 1
+; CHECK-BE-NEXT:    xxsldwi vs7, vs2, vs2, 3
+; CHECK-BE-NEXT:    xxswapd vs8, vs2
+; CHECK-BE-NEXT:    xxsldwi vs9, vs2, vs2, 1
+; CHECK-BE-NEXT:    xxsldwi vs10, vs1, vs1, 3
+; CHECK-BE-NEXT:    xxswapd vs11, vs1
+; CHECK-BE-NEXT:    xxsldwi vs12, vs1, vs1, 1
+; CHECK-BE-NEXT:    xxsldwi vs13, vs0, vs0, 3
+; CHECK-BE-NEXT:    xxswapd v2, vs0
+; CHECK-BE-NEXT:    xxsldwi v3, vs0, vs0, 1
+; CHECK-BE-NEXT:    xscvspdpn f3, vs3
+; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xscvspdpn f1, vs1
+; CHECK-BE-NEXT:    xscvspdpn f0, vs0
+; CHECK-BE-NEXT:    xscvspdpn f4, vs4
+; CHECK-BE-NEXT:    xscvspdpn f5, vs5
+; CHECK-BE-NEXT:    xscvspdpn f6, vs6
+; CHECK-BE-NEXT:    xscvspdpn f7, vs7
+; CHECK-BE-NEXT:    xscvspdpn f8, vs8
+; CHECK-BE-NEXT:    xscvspdpn f9, vs9
+; CHECK-BE-NEXT:    xscvspdpn f10, vs10
+; CHECK-BE-NEXT:    xscvspdpn f11, vs11
+; CHECK-BE-NEXT:    xscvspdpn f12, vs12
+; CHECK-BE-NEXT:    xscvspdpn f13, vs13
+; CHECK-BE-NEXT:    xscvspdpn v2, v2
+; CHECK-BE-NEXT:    xscvspdpn v3, v3
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    xscvdpsxws f8, f8
+; CHECK-BE-NEXT:    xscvdpsxws f9, f9
+; CHECK-BE-NEXT:    xscvdpsxws f10, f10
+; CHECK-BE-NEXT:    xscvdpsxws f11, f11
+; CHECK-BE-NEXT:    xscvdpsxws f12, f12
+; CHECK-BE-NEXT:    xscvdpsxws f13, f13
+; CHECK-BE-NEXT:    xscvdpsxws v2, v2
+; CHECK-BE-NEXT:    xscvdpsxws v3, v3
+; CHECK-BE-NEXT:    mfvsrwz r3, f3
+; CHECK-BE-NEXT:    mfvsrwz r4, f2
+; CHECK-BE-NEXT:    mfvsrwz r11, f1
+; CHECK-BE-NEXT:    mfvsrwz r12, f0
+; CHECK-BE-NEXT:    mfvsrwz r5, f4
+; CHECK-BE-NEXT:    mfvsrwz r6, f5
+; CHECK-BE-NEXT:    mfvsrwz r7, f6
+; CHECK-BE-NEXT:    mfvsrwz r8, f7
+; CHECK-BE-NEXT:    mfvsrwz r9, f8
+; CHECK-BE-NEXT:    mfvsrwz r10, f9
+; CHECK-BE-NEXT:    mfvsrwz r0, f10
+; CHECK-BE-NEXT:    mfvsrwz r30, f11
+; CHECK-BE-NEXT:    mfvsrwz r29, f12
+; CHECK-BE-NEXT:    mfvsrwz r28, f13
+; CHECK-BE-NEXT:    mfvsrwz r27, v2
+; CHECK-BE-NEXT:    mfvsrwz r26, v3
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r11, r11, 56
+; CHECK-BE-NEXT:    sldi r12, r12, 56
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    sldi r7, r7, 56
+; CHECK-BE-NEXT:    sldi r8, r8, 56
+; CHECK-BE-NEXT:    sldi r9, r9, 56
+; CHECK-BE-NEXT:    sldi r10, r10, 56
+; CHECK-BE-NEXT:    sldi r0, r0, 56
+; CHECK-BE-NEXT:    sldi r30, r30, 56
+; CHECK-BE-NEXT:    sldi r29, r29, 56
+; CHECK-BE-NEXT:    sldi r28, r28, 56
+; CHECK-BE-NEXT:    sldi r27, r27, 56
+; CHECK-BE-NEXT:    sldi r26, r26, 56
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v10, r11
+; CHECK-BE-NEXT:    mtvsrd v14, r12
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    mtvsrd v8, r0
+; CHECK-BE-NEXT:    mtvsrd v9, r30
+; CHECK-BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v11, r29
+; CHECK-BE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v12, r28
+; CHECK-BE-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v13, r27
+; CHECK-BE-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v15, r26
+; CHECK-BE-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    vmrghb v4, v5, v4
+; CHECK-BE-NEXT:    vmrghb v2, v2, v0
+; CHECK-BE-NEXT:    vmrghb v5, v6, v1
+; CHECK-BE-NEXT:    vmrghb v3, v3, v7
+; CHECK-BE-NEXT:    vmrghb v0, v9, v8
+; CHECK-BE-NEXT:    vmrghb v1, v10, v11
+; CHECK-BE-NEXT:    vmrghb v6, v13, v12
+; CHECK-BE-NEXT:    vmrghb v7, v14, v15
+; CHECK-BE-NEXT:    vmrghh v2, v2, v4
+; CHECK-BE-NEXT:    vmrghh v3, v3, v5
+; CHECK-BE-NEXT:    vmrghh v4, v1, v0
+; CHECK-BE-NEXT:    vmrghh v5, v7, v6
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x float>, <16 x float>* %0, align 64
+  %1 = fptosi <16 x float> %a to <16 x i8>
+  ret <16 x i8> %1
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll b/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a713b52a740a8c51e86228952dfe7a64568e1d86
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll
@@ -0,0 +1,1304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i32 @test2elt(<2 x double> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    xscvdpsxws f1, v2
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    vmrglh v2, v2, v3
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxswapd vs0, v2
+; CHECK-P9-NEXT:    xscvdpsxws f1, v2
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    vmrglh v2, v2, v3
+; CHECK-P9-NEXT:    vextuwrx r3, r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxswapd vs0, v2
+; CHECK-BE-NEXT:    xscvdpsxws f1, v2
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    mfvsrwz r4, f0
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    li r3, 0
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    vmrghh v2, v2, v3
+; CHECK-BE-NEXT:    vextuwlx r3, r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <2 x double> %a to <2 x i16>
+  %1 = bitcast <2 x i16> %0 to i32
+  ret i32 %1
+}
+
+define i64 @test4elt(<4 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f3, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    xxswapd v2, vs2
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xxswapd v4, vs3
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    xxswapd v5, vs1
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglh v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs2, vs1
+; CHECK-P9-NEXT:    xxswapd vs3, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mfvsrwz r5, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mfvsrwz r4, f2
+; CHECK-P9-NEXT:    mfvsrwz r6, f3
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    vmrglh v2, v2, v3
+; CHECK-P9-NEXT:    vmrglh v3, v4, v5
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxswapd vs2, vs1
+; CHECK-BE-NEXT:    xxswapd vs3, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    mfvsrwz r5, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    mfvsrwz r4, f2
+; CHECK-BE-NEXT:    mfvsrwz r6, f3
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    vmrghh v2, v2, v3
+; CHECK-BE-NEXT:    vmrghh v3, v4, v5
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x double>, <4 x double>* %0, align 32
+  %1 = fptoui <4 x double> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to i64
+  ret i64 %2
+}
+
+define <8 x i16> @test8elt(<8 x double>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    lxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f4, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f5, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f6, f2
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f7, f3
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    mtvsrd f4, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f6
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs4
+; CHECK-P8-NEXT:    mfvsrwz r4, f7
+; CHECK-P8-NEXT:    mtvsrd f6, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs5
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mtvsrd f7, r4
+; CHECK-P8-NEXT:    xxswapd v4, vs6
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v1, vs7
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v5, vs0
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    xxswapd v0, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    xxswapd v6, vs2
+; CHECK-P8-NEXT:    vmrglh v2, v5, v2
+; CHECK-P8-NEXT:    xxswapd v5, vs0
+; CHECK-P8-NEXT:    vmrglh v3, v0, v3
+; CHECK-P8-NEXT:    vmrglh v4, v6, v4
+; CHECK-P8-NEXT:    vmrglh v5, v5, v1
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r3)
+; CHECK-P9-NEXT:    lxv vs1, 32(r3)
+; CHECK-P9-NEXT:    lxv vs2, 16(r3)
+; CHECK-P9-NEXT:    lxv vs3, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs4, vs3
+; CHECK-P9-NEXT:    xxswapd vs5, vs2
+; CHECK-P9-NEXT:    xxswapd vs6, vs1
+; CHECK-P9-NEXT:    xxswapd vs7, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    mfvsrwz r3, f3
+; CHECK-P9-NEXT:    mfvsrwz r5, f2
+; CHECK-P9-NEXT:    mfvsrwz r7, f1
+; CHECK-P9-NEXT:    mfvsrwz r9, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mfvsrwz r4, f4
+; CHECK-P9-NEXT:    mfvsrwz r6, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r10, f7
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    xxswapd v0, vs4
+; CHECK-P9-NEXT:    xxswapd v6, vs6
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    xxswapd v1, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs7
+; CHECK-P9-NEXT:    vmrglh v2, v2, v3
+; CHECK-P9-NEXT:    vmrglh v3, v4, v5
+; CHECK-P9-NEXT:    vmrglh v4, v0, v1
+; CHECK-P9-NEXT:    vmrglh v5, v6, v7
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    vmrglw v3, v5, v4
+; CHECK-P9-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    xxswapd vs4, vs3
+; CHECK-BE-NEXT:    xxswapd vs5, vs2
+; CHECK-BE-NEXT:    xxswapd vs6, vs1
+; CHECK-BE-NEXT:    xxswapd vs7, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    mfvsrwz r3, f3
+; CHECK-BE-NEXT:    mfvsrwz r5, f2
+; CHECK-BE-NEXT:    mfvsrwz r7, f1
+; CHECK-BE-NEXT:    mfvsrwz r9, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    mfvsrwz r4, f4
+; CHECK-BE-NEXT:    mfvsrwz r6, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r10, f7
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    vmrghh v2, v2, v3
+; CHECK-BE-NEXT:    vmrghh v3, v4, v5
+; CHECK-BE-NEXT:    vmrghh v4, v0, v1
+; CHECK-BE-NEXT:    vmrghh v5, v6, v7
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x double>, <8 x double>* %0, align 64
+  %1 = fptoui <8 x double> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define void @test16elt(<16 x i16>* noalias nocapture sret %agg.result, <16 x double>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r4
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r6
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r6
+; CHECK-P8-NEXT:    li r6, 64
+; CHECK-P8-NEXT:    xscvdpsxws f4, f0
+; CHECK-P8-NEXT:    lxvd2x vs5, r4, r6
+; CHECK-P8-NEXT:    li r6, 80
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f6, f1
+; CHECK-P8-NEXT:    lxvd2x vs7, r4, r6
+; CHECK-P8-NEXT:    li r6, 96
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f8, f2
+; CHECK-P8-NEXT:    lxvd2x vs9, r4, r6
+; CHECK-P8-NEXT:    li r6, 112
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f10, f3
+; CHECK-P8-NEXT:    lxvd2x vs11, r4, r6
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f12, f5
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f13, f7
+; CHECK-P8-NEXT:    xxswapd vs7, vs7
+; CHECK-P8-NEXT:    xscvdpsxws v2, f9
+; CHECK-P8-NEXT:    xxswapd vs9, vs9
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    xscvdpsxws v3, f11
+; CHECK-P8-NEXT:    xxswapd vs11, vs11
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r6, f6
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f8
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xxswapd v4, vs4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    mtvsrd f6, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f10
+; CHECK-P8-NEXT:    mtvsrd f8, r4
+; CHECK-P8-NEXT:    xxswapd v5, vs6
+; CHECK-P8-NEXT:    mfvsrwz r4, f12
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    xxswapd v0, vs8
+; CHECK-P8-NEXT:    mtvsrd f10, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f13
+; CHECK-P8-NEXT:    mtvsrd f12, r4
+; CHECK-P8-NEXT:    xxswapd v1, vs10
+; CHECK-P8-NEXT:    mfvsrwz r4, v2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xxswapd v6, vs12
+; CHECK-P8-NEXT:    xscvdpsxws f9, f9
+; CHECK-P8-NEXT:    mtvsrd f13, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, v3
+; CHECK-P8-NEXT:    mtvsrd v2, r4
+; CHECK-P8-NEXT:    xxswapd v7, vs13
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvdpsxws f7, f7
+; CHECK-P8-NEXT:    xxswapd v2, v2
+; CHECK-P8-NEXT:    xscvdpsxws f11, f11
+; CHECK-P8-NEXT:    mtvsrd v3, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f1
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    xxswapd v3, v3
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    mtvsrd f1, r6
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    xxswapd v9, vs1
+; CHECK-P8-NEXT:    mfvsrwz r6, f3
+; CHECK-P8-NEXT:    xxswapd v10, vs2
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f9
+; CHECK-P8-NEXT:    mtvsrd f3, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f7
+; CHECK-P8-NEXT:    mtvsrd f9, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f11
+; CHECK-P8-NEXT:    vmrglh v4, v8, v4
+; CHECK-P8-NEXT:    xxswapd v8, vs3
+; CHECK-P8-NEXT:    vmrglh v5, v9, v5
+; CHECK-P8-NEXT:    xxswapd v9, vs5
+; CHECK-P8-NEXT:    mtvsrd f7, r6
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    vmrglh v0, v10, v0
+; CHECK-P8-NEXT:    xxswapd v10, vs7
+; CHECK-P8-NEXT:    vmrglh v1, v8, v1
+; CHECK-P8-NEXT:    xxswapd v8, vs9
+; CHECK-P8-NEXT:    vmrglh v6, v9, v6
+; CHECK-P8-NEXT:    xxswapd v9, vs0
+; CHECK-P8-NEXT:    vmrglh v7, v10, v7
+; CHECK-P8-NEXT:    vmrglh v2, v8, v2
+; CHECK-P8-NEXT:    vmrglh v3, v9, v3
+; CHECK-P8-NEXT:    vmrglw v4, v5, v4
+; CHECK-P8-NEXT:    vmrglw v5, v1, v0
+; CHECK-P8-NEXT:    vmrglw v0, v7, v6
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxmrgld v3, v5, v4
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    xxmrgld v2, v2, v0
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs2, 48(r4)
+; CHECK-P9-NEXT:    lxv vs4, 32(r4)
+; CHECK-P9-NEXT:    lxv vs5, 16(r4)
+; CHECK-P9-NEXT:    lxv vs6, 0(r4)
+; CHECK-P9-NEXT:    lxv vs0, 112(r4)
+; CHECK-P9-NEXT:    lxv vs1, 96(r4)
+; CHECK-P9-NEXT:    lxv vs3, 80(r4)
+; CHECK-P9-NEXT:    lxv vs7, 64(r4)
+; CHECK-P9-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxswapd vs8, vs6
+; CHECK-P9-NEXT:    xxswapd vs9, vs5
+; CHECK-P9-NEXT:    xxswapd vs10, vs4
+; CHECK-P9-NEXT:    xxswapd vs11, vs2
+; CHECK-P9-NEXT:    xxswapd vs12, vs7
+; CHECK-P9-NEXT:    xxswapd vs13, vs3
+; CHECK-P9-NEXT:    xxswapd v2, vs1
+; CHECK-P9-NEXT:    xxswapd v3, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f8, f8
+; CHECK-P9-NEXT:    xscvdpsxws f9, f9
+; CHECK-P9-NEXT:    xscvdpsxws f10, f10
+; CHECK-P9-NEXT:    xscvdpsxws f11, f11
+; CHECK-P9-NEXT:    xscvdpsxws f12, f12
+; CHECK-P9-NEXT:    xscvdpsxws f13, f13
+; CHECK-P9-NEXT:    xscvdpsxws v2, v2
+; CHECK-P9-NEXT:    xscvdpsxws v3, v3
+; CHECK-P9-NEXT:    mfvsrwz r4, f6
+; CHECK-P9-NEXT:    mfvsrwz r5, f5
+; CHECK-P9-NEXT:    mfvsrwz r6, f4
+; CHECK-P9-NEXT:    mfvsrwz r7, f2
+; CHECK-P9-NEXT:    mfvsrwz r12, f7
+; CHECK-P9-NEXT:    mfvsrwz r0, f3
+; CHECK-P9-NEXT:    mfvsrwz r30, f1
+; CHECK-P9-NEXT:    mfvsrwz r29, f0
+; CHECK-P9-NEXT:    mfvsrwz r8, f8
+; CHECK-P9-NEXT:    mfvsrwz r9, f9
+; CHECK-P9-NEXT:    mfvsrwz r10, f10
+; CHECK-P9-NEXT:    mfvsrwz r11, f11
+; CHECK-P9-NEXT:    mfvsrwz r28, f12
+; CHECK-P9-NEXT:    mfvsrwz r27, f13
+; CHECK-P9-NEXT:    mfvsrwz r26, v2
+; CHECK-P9-NEXT:    mfvsrwz r25, v3
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    mtvsrd f1, r5
+; CHECK-P9-NEXT:    mtvsrd f2, r6
+; CHECK-P9-NEXT:    mtvsrd f3, r7
+; CHECK-P9-NEXT:    mtvsrd f8, r12
+; CHECK-P9-NEXT:    mtvsrd f9, r0
+; CHECK-P9-NEXT:    mtvsrd f10, r30
+; CHECK-P9-NEXT:    mtvsrd f11, r29
+; CHECK-P9-NEXT:    mtvsrd f4, r8
+; CHECK-P9-NEXT:    mtvsrd f5, r9
+; CHECK-P9-NEXT:    mtvsrd f6, r10
+; CHECK-P9-NEXT:    mtvsrd f7, r11
+; CHECK-P9-NEXT:    mtvsrd f12, r28
+; CHECK-P9-NEXT:    mtvsrd f13, r27
+; CHECK-P9-NEXT:    mtvsrd v2, r26
+; CHECK-P9-NEXT:    mtvsrd v3, r25
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    xxswapd v5, vs1
+; CHECK-P9-NEXT:    xxswapd v0, vs2
+; CHECK-P9-NEXT:    xxswapd v1, vs3
+; CHECK-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v6, vs4
+; CHECK-P9-NEXT:    xxswapd v7, vs5
+; CHECK-P9-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v8, vs6
+; CHECK-P9-NEXT:    xxswapd v9, vs7
+; CHECK-P9-NEXT:    xxswapd v10, vs8
+; CHECK-P9-NEXT:    xxswapd v11, vs12
+; CHECK-P9-NEXT:    xxswapd v12, vs9
+; CHECK-P9-NEXT:    xxswapd v13, vs13
+; CHECK-P9-NEXT:    xxswapd v14, vs10
+; CHECK-P9-NEXT:    xxswapd v2, v2
+; CHECK-P9-NEXT:    xxswapd v15, vs11
+; CHECK-P9-NEXT:    xxswapd v3, v3
+; CHECK-P9-NEXT:    vmrglh v4, v4, v6
+; CHECK-P9-NEXT:    vmrglh v5, v5, v7
+; CHECK-P9-NEXT:    vmrglh v0, v0, v8
+; CHECK-P9-NEXT:    vmrglh v1, v1, v9
+; CHECK-P9-NEXT:    vmrglh v6, v10, v11
+; CHECK-P9-NEXT:    vmrglh v7, v12, v13
+; CHECK-P9-NEXT:    vmrglh v2, v14, v2
+; CHECK-P9-NEXT:    vmrglh v3, v15, v3
+; CHECK-P9-NEXT:    vmrglw v4, v5, v4
+; CHECK-P9-NEXT:    vmrglw v5, v1, v0
+; CHECK-P9-NEXT:    vmrglw v0, v7, v6
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    xxmrgld vs0, v5, v4
+; CHECK-P9-NEXT:    xxmrgld vs1, v2, v0
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs4, 16(r4)
+; CHECK-BE-NEXT:    lxv vs5, 32(r4)
+; CHECK-BE-NEXT:    lxv vs6, 48(r4)
+; CHECK-BE-NEXT:    lxv vs0, 64(r4)
+; CHECK-BE-NEXT:    lxv vs1, 80(r4)
+; CHECK-BE-NEXT:    lxv vs3, 96(r4)
+; CHECK-BE-NEXT:    lxv vs7, 112(r4)
+; CHECK-BE-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxswapd vs8, vs6
+; CHECK-BE-NEXT:    xxswapd vs9, vs5
+; CHECK-BE-NEXT:    xxswapd vs10, vs4
+; CHECK-BE-NEXT:    xxswapd vs11, vs2
+; CHECK-BE-NEXT:    xxswapd vs12, vs7
+; CHECK-BE-NEXT:    xxswapd vs13, vs3
+; CHECK-BE-NEXT:    xxswapd v2, vs1
+; CHECK-BE-NEXT:    xxswapd v3, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f8, f8
+; CHECK-BE-NEXT:    xscvdpsxws f9, f9
+; CHECK-BE-NEXT:    xscvdpsxws f10, f10
+; CHECK-BE-NEXT:    xscvdpsxws f11, f11
+; CHECK-BE-NEXT:    xscvdpsxws f12, f12
+; CHECK-BE-NEXT:    xscvdpsxws f13, f13
+; CHECK-BE-NEXT:    xscvdpsxws v2, v2
+; CHECK-BE-NEXT:    xscvdpsxws v3, v3
+; CHECK-BE-NEXT:    mfvsrwz r4, f6
+; CHECK-BE-NEXT:    mfvsrwz r5, f5
+; CHECK-BE-NEXT:    mfvsrwz r6, f4
+; CHECK-BE-NEXT:    mfvsrwz r7, f2
+; CHECK-BE-NEXT:    mfvsrwz r12, f7
+; CHECK-BE-NEXT:    mfvsrwz r0, f3
+; CHECK-BE-NEXT:    mfvsrwz r30, f1
+; CHECK-BE-NEXT:    mfvsrwz r29, f0
+; CHECK-BE-NEXT:    mfvsrwz r8, f8
+; CHECK-BE-NEXT:    mfvsrwz r9, f9
+; CHECK-BE-NEXT:    mfvsrwz r10, f10
+; CHECK-BE-NEXT:    mfvsrwz r11, f11
+; CHECK-BE-NEXT:    mfvsrwz r28, f12
+; CHECK-BE-NEXT:    mfvsrwz r27, f13
+; CHECK-BE-NEXT:    mfvsrwz r26, v2
+; CHECK-BE-NEXT:    mfvsrwz r25, v3
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r12, r12, 48
+; CHECK-BE-NEXT:    sldi r0, r0, 48
+; CHECK-BE-NEXT:    sldi r30, r30, 48
+; CHECK-BE-NEXT:    sldi r29, r29, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    sldi r11, r11, 48
+; CHECK-BE-NEXT:    sldi r28, r28, 48
+; CHECK-BE-NEXT:    sldi r27, r27, 48
+; CHECK-BE-NEXT:    sldi r26, r26, 48
+; CHECK-BE-NEXT:    sldi r25, r25, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r4
+; CHECK-BE-NEXT:    mtvsrd v3, r5
+; CHECK-BE-NEXT:    mtvsrd v4, r6
+; CHECK-BE-NEXT:    mtvsrd v5, r7
+; CHECK-BE-NEXT:    mtvsrd v8, r12
+; CHECK-BE-NEXT:    mtvsrd v10, r0
+; CHECK-BE-NEXT:    mtvsrd v12, r30
+; CHECK-BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v0, r8
+; CHECK-BE-NEXT:    mtvsrd v1, r9
+; CHECK-BE-NEXT:    mtvsrd v6, r10
+; CHECK-BE-NEXT:    mtvsrd v7, r11
+; CHECK-BE-NEXT:    mtvsrd v9, r28
+; CHECK-BE-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v11, r27
+; CHECK-BE-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v13, r26
+; CHECK-BE-NEXT:    mtvsrd v14, r29
+; CHECK-BE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v15, r25
+; CHECK-BE-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    vmrghh v2, v2, v0
+; CHECK-BE-NEXT:    vmrghh v3, v3, v1
+; CHECK-BE-NEXT:    vmrghh v4, v4, v6
+; CHECK-BE-NEXT:    vmrghh v5, v5, v7
+; CHECK-BE-NEXT:    vmrghh v0, v8, v9
+; CHECK-BE-NEXT:    vmrghh v1, v10, v11
+; CHECK-BE-NEXT:    vmrghh v6, v12, v13
+; CHECK-BE-NEXT:    vmrghh v7, v14, v15
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    vmrghw v4, v1, v0
+; CHECK-BE-NEXT:    vmrghw v5, v7, v6
+; CHECK-BE-NEXT:    xxmrghd vs0, v3, v2
+; CHECK-BE-NEXT:    xxmrghd vs1, v5, v4
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x double>, <16 x double>* %0, align 128
+  %1 = fptoui <16 x double> %a to <16 x i16>
+  store <16 x i16> %1, <16 x i16>* %agg.result, align 32
+  ret void
+}
+
+define i32 @test2elt_signed(<2 x double> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    xscvdpsxws f1, v2
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    vmrglh v2, v2, v3
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxswapd vs0, v2
+; CHECK-P9-NEXT:    xscvdpsxws f1, v2
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    vmrglh v2, v2, v3
+; CHECK-P9-NEXT:    vextuwrx r3, r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxswapd vs0, v2
+; CHECK-BE-NEXT:    xscvdpsxws f1, v2
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    mfvsrwz r4, f0
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    li r3, 0
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    vmrghh v2, v2, v3
+; CHECK-BE-NEXT:    vextuwlx r3, r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptosi <2 x double> %a to <2 x i16>
+  %1 = bitcast <2 x i16> %0 to i32
+  ret i32 %1
+}
+
+define i64 @test4elt_signed(<4 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f3, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    xxswapd v2, vs2
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xxswapd v4, vs3
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    xxswapd v5, vs1
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglh v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs2, vs1
+; CHECK-P9-NEXT:    xxswapd vs3, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mfvsrwz r5, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mfvsrwz r4, f2
+; CHECK-P9-NEXT:    mfvsrwz r6, f3
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    vmrglh v2, v2, v3
+; CHECK-P9-NEXT:    vmrglh v3, v4, v5
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxswapd vs2, vs1
+; CHECK-BE-NEXT:    xxswapd vs3, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    mfvsrwz r5, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    mfvsrwz r4, f2
+; CHECK-BE-NEXT:    mfvsrwz r6, f3
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    vmrghh v2, v2, v3
+; CHECK-BE-NEXT:    vmrghh v3, v4, v5
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x double>, <4 x double>* %0, align 32
+  %1 = fptosi <4 x double> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to i64
+  ret i64 %2
+}
+
+define <8 x i16> @test8elt_signed(<8 x double>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    lxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f4, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f5, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f6, f2
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f7, f3
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    mtvsrd f4, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f6
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs4
+; CHECK-P8-NEXT:    mfvsrwz r4, f7
+; CHECK-P8-NEXT:    mtvsrd f6, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs5
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mtvsrd f7, r4
+; CHECK-P8-NEXT:    xxswapd v4, vs6
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v1, vs7
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v5, vs0
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    xxswapd v0, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    xxswapd v6, vs2
+; CHECK-P8-NEXT:    vmrglh v2, v5, v2
+; CHECK-P8-NEXT:    xxswapd v5, vs0
+; CHECK-P8-NEXT:    vmrglh v3, v0, v3
+; CHECK-P8-NEXT:    vmrglh v4, v6, v4
+; CHECK-P8-NEXT:    vmrglh v5, v5, v1
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r3)
+; CHECK-P9-NEXT:    lxv vs1, 32(r3)
+; CHECK-P9-NEXT:    lxv vs2, 16(r3)
+; CHECK-P9-NEXT:    lxv vs3, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs4, vs3
+; CHECK-P9-NEXT:    xxswapd vs5, vs2
+; CHECK-P9-NEXT:    xxswapd vs6, vs1
+; CHECK-P9-NEXT:    xxswapd vs7, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    mfvsrwz r3, f3
+; CHECK-P9-NEXT:    mfvsrwz r5, f2
+; CHECK-P9-NEXT:    mfvsrwz r7, f1
+; CHECK-P9-NEXT:    mfvsrwz r9, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mfvsrwz r4, f4
+; CHECK-P9-NEXT:    mfvsrwz r6, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r10, f7
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    xxswapd v0, vs4
+; CHECK-P9-NEXT:    xxswapd v6, vs6
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    xxswapd v1, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs7
+; CHECK-P9-NEXT:    vmrglh v2, v2, v3
+; CHECK-P9-NEXT:    vmrglh v3, v4, v5
+; CHECK-P9-NEXT:    vmrglh v4, v0, v1
+; CHECK-P9-NEXT:    vmrglh v5, v6, v7
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    vmrglw v3, v5, v4
+; CHECK-P9-NEXT:    xxmrgld v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    xxswapd vs4, vs3
+; CHECK-BE-NEXT:    xxswapd vs5, vs2
+; CHECK-BE-NEXT:    xxswapd vs6, vs1
+; CHECK-BE-NEXT:    xxswapd vs7, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    mfvsrwz r3, f3
+; CHECK-BE-NEXT:    mfvsrwz r5, f2
+; CHECK-BE-NEXT:    mfvsrwz r7, f1
+; CHECK-BE-NEXT:    mfvsrwz r9, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    mfvsrwz r4, f4
+; CHECK-BE-NEXT:    mfvsrwz r6, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r10, f7
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    vmrghh v2, v2, v3
+; CHECK-BE-NEXT:    vmrghh v3, v4, v5
+; CHECK-BE-NEXT:    vmrghh v4, v0, v1
+; CHECK-BE-NEXT:    vmrghh v5, v6, v7
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x double>, <8 x double>* %0, align 64
+  %1 = fptosi <8 x double> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define void @test16elt_signed(<16 x i16>* noalias nocapture sret %agg.result, <16 x double>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r4
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r6
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r6
+; CHECK-P8-NEXT:    li r6, 64
+; CHECK-P8-NEXT:    xscvdpsxws f4, f0
+; CHECK-P8-NEXT:    lxvd2x vs5, r4, r6
+; CHECK-P8-NEXT:    li r6, 80
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f6, f1
+; CHECK-P8-NEXT:    lxvd2x vs7, r4, r6
+; CHECK-P8-NEXT:    li r6, 96
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f8, f2
+; CHECK-P8-NEXT:    lxvd2x vs9, r4, r6
+; CHECK-P8-NEXT:    li r6, 112
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f10, f3
+; CHECK-P8-NEXT:    lxvd2x vs11, r4, r6
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f12, f5
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f13, f7
+; CHECK-P8-NEXT:    xxswapd vs7, vs7
+; CHECK-P8-NEXT:    xscvdpsxws v2, f9
+; CHECK-P8-NEXT:    xxswapd vs9, vs9
+; CHECK-P8-NEXT:    mfvsrwz r4, f4
+; CHECK-P8-NEXT:    xscvdpsxws v3, f11
+; CHECK-P8-NEXT:    xxswapd vs11, vs11
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r6, f6
+; CHECK-P8-NEXT:    mtvsrd f4, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f8
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xxswapd v4, vs4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    mtvsrd f6, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f10
+; CHECK-P8-NEXT:    mtvsrd f8, r4
+; CHECK-P8-NEXT:    xxswapd v5, vs6
+; CHECK-P8-NEXT:    mfvsrwz r4, f12
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    xxswapd v0, vs8
+; CHECK-P8-NEXT:    mtvsrd f10, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f13
+; CHECK-P8-NEXT:    mtvsrd f12, r4
+; CHECK-P8-NEXT:    xxswapd v1, vs10
+; CHECK-P8-NEXT:    mfvsrwz r4, v2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xxswapd v6, vs12
+; CHECK-P8-NEXT:    xscvdpsxws f9, f9
+; CHECK-P8-NEXT:    mtvsrd f13, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, v3
+; CHECK-P8-NEXT:    mtvsrd v2, r4
+; CHECK-P8-NEXT:    xxswapd v7, vs13
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    xscvdpsxws f7, f7
+; CHECK-P8-NEXT:    xxswapd v2, v2
+; CHECK-P8-NEXT:    xscvdpsxws f11, f11
+; CHECK-P8-NEXT:    mtvsrd v3, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f1
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    xxswapd v3, v3
+; CHECK-P8-NEXT:    mfvsrwz r4, f2
+; CHECK-P8-NEXT:    mtvsrd f1, r6
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    mtvsrd f2, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    xxswapd v9, vs1
+; CHECK-P8-NEXT:    mfvsrwz r6, f3
+; CHECK-P8-NEXT:    xxswapd v10, vs2
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f9
+; CHECK-P8-NEXT:    mtvsrd f3, r6
+; CHECK-P8-NEXT:    mfvsrwz r6, f7
+; CHECK-P8-NEXT:    mtvsrd f9, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f11
+; CHECK-P8-NEXT:    vmrglh v4, v8, v4
+; CHECK-P8-NEXT:    xxswapd v8, vs3
+; CHECK-P8-NEXT:    vmrglh v5, v9, v5
+; CHECK-P8-NEXT:    xxswapd v9, vs5
+; CHECK-P8-NEXT:    mtvsrd f7, r6
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    vmrglh v0, v10, v0
+; CHECK-P8-NEXT:    xxswapd v10, vs7
+; CHECK-P8-NEXT:    vmrglh v1, v8, v1
+; CHECK-P8-NEXT:    xxswapd v8, vs9
+; CHECK-P8-NEXT:    vmrglh v6, v9, v6
+; CHECK-P8-NEXT:    xxswapd v9, vs0
+; CHECK-P8-NEXT:    vmrglh v7, v10, v7
+; CHECK-P8-NEXT:    vmrglh v2, v8, v2
+; CHECK-P8-NEXT:    vmrglh v3, v9, v3
+; CHECK-P8-NEXT:    vmrglw v4, v5, v4
+; CHECK-P8-NEXT:    vmrglw v5, v1, v0
+; CHECK-P8-NEXT:    vmrglw v0, v7, v6
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxmrgld v3, v5, v4
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    xxmrgld v2, v2, v0
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs2, 48(r4)
+; CHECK-P9-NEXT:    lxv vs4, 32(r4)
+; CHECK-P9-NEXT:    lxv vs5, 16(r4)
+; CHECK-P9-NEXT:    lxv vs6, 0(r4)
+; CHECK-P9-NEXT:    lxv vs0, 112(r4)
+; CHECK-P9-NEXT:    lxv vs1, 96(r4)
+; CHECK-P9-NEXT:    lxv vs3, 80(r4)
+; CHECK-P9-NEXT:    lxv vs7, 64(r4)
+; CHECK-P9-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxswapd vs8, vs6
+; CHECK-P9-NEXT:    xxswapd vs9, vs5
+; CHECK-P9-NEXT:    xxswapd vs10, vs4
+; CHECK-P9-NEXT:    xxswapd vs11, vs2
+; CHECK-P9-NEXT:    xxswapd vs12, vs7
+; CHECK-P9-NEXT:    xxswapd vs13, vs3
+; CHECK-P9-NEXT:    xxswapd v2, vs1
+; CHECK-P9-NEXT:    xxswapd v3, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f8, f8
+; CHECK-P9-NEXT:    xscvdpsxws f9, f9
+; CHECK-P9-NEXT:    xscvdpsxws f10, f10
+; CHECK-P9-NEXT:    xscvdpsxws f11, f11
+; CHECK-P9-NEXT:    xscvdpsxws f12, f12
+; CHECK-P9-NEXT:    xscvdpsxws f13, f13
+; CHECK-P9-NEXT:    xscvdpsxws v2, v2
+; CHECK-P9-NEXT:    xscvdpsxws v3, v3
+; CHECK-P9-NEXT:    mfvsrwz r4, f6
+; CHECK-P9-NEXT:    mfvsrwz r5, f5
+; CHECK-P9-NEXT:    mfvsrwz r6, f4
+; CHECK-P9-NEXT:    mfvsrwz r7, f2
+; CHECK-P9-NEXT:    mfvsrwz r12, f7
+; CHECK-P9-NEXT:    mfvsrwz r0, f3
+; CHECK-P9-NEXT:    mfvsrwz r30, f1
+; CHECK-P9-NEXT:    mfvsrwz r29, f0
+; CHECK-P9-NEXT:    mfvsrwz r8, f8
+; CHECK-P9-NEXT:    mfvsrwz r9, f9
+; CHECK-P9-NEXT:    mfvsrwz r10, f10
+; CHECK-P9-NEXT:    mfvsrwz r11, f11
+; CHECK-P9-NEXT:    mfvsrwz r28, f12
+; CHECK-P9-NEXT:    mfvsrwz r27, f13
+; CHECK-P9-NEXT:    mfvsrwz r26, v2
+; CHECK-P9-NEXT:    mfvsrwz r25, v3
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    mtvsrd f1, r5
+; CHECK-P9-NEXT:    mtvsrd f2, r6
+; CHECK-P9-NEXT:    mtvsrd f3, r7
+; CHECK-P9-NEXT:    mtvsrd f8, r12
+; CHECK-P9-NEXT:    mtvsrd f9, r0
+; CHECK-P9-NEXT:    mtvsrd f10, r30
+; CHECK-P9-NEXT:    mtvsrd f11, r29
+; CHECK-P9-NEXT:    mtvsrd f4, r8
+; CHECK-P9-NEXT:    mtvsrd f5, r9
+; CHECK-P9-NEXT:    mtvsrd f6, r10
+; CHECK-P9-NEXT:    mtvsrd f7, r11
+; CHECK-P9-NEXT:    mtvsrd f12, r28
+; CHECK-P9-NEXT:    mtvsrd f13, r27
+; CHECK-P9-NEXT:    mtvsrd v2, r26
+; CHECK-P9-NEXT:    mtvsrd v3, r25
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    xxswapd v5, vs1
+; CHECK-P9-NEXT:    xxswapd v0, vs2
+; CHECK-P9-NEXT:    xxswapd v1, vs3
+; CHECK-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v6, vs4
+; CHECK-P9-NEXT:    xxswapd v7, vs5
+; CHECK-P9-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v8, vs6
+; CHECK-P9-NEXT:    xxswapd v9, vs7
+; CHECK-P9-NEXT:    xxswapd v10, vs8
+; CHECK-P9-NEXT:    xxswapd v11, vs12
+; CHECK-P9-NEXT:    xxswapd v12, vs9
+; CHECK-P9-NEXT:    xxswapd v13, vs13
+; CHECK-P9-NEXT:    xxswapd v14, vs10
+; CHECK-P9-NEXT:    xxswapd v2, v2
+; CHECK-P9-NEXT:    xxswapd v15, vs11
+; CHECK-P9-NEXT:    xxswapd v3, v3
+; CHECK-P9-NEXT:    vmrglh v4, v4, v6
+; CHECK-P9-NEXT:    vmrglh v5, v5, v7
+; CHECK-P9-NEXT:    vmrglh v0, v0, v8
+; CHECK-P9-NEXT:    vmrglh v1, v1, v9
+; CHECK-P9-NEXT:    vmrglh v6, v10, v11
+; CHECK-P9-NEXT:    vmrglh v7, v12, v13
+; CHECK-P9-NEXT:    vmrglh v2, v14, v2
+; CHECK-P9-NEXT:    vmrglh v3, v15, v3
+; CHECK-P9-NEXT:    vmrglw v4, v5, v4
+; CHECK-P9-NEXT:    vmrglw v5, v1, v0
+; CHECK-P9-NEXT:    vmrglw v0, v7, v6
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    xxmrgld vs0, v5, v4
+; CHECK-P9-NEXT:    xxmrgld vs1, v2, v0
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs4, 16(r4)
+; CHECK-BE-NEXT:    lxv vs5, 32(r4)
+; CHECK-BE-NEXT:    lxv vs6, 48(r4)
+; CHECK-BE-NEXT:    lxv vs0, 64(r4)
+; CHECK-BE-NEXT:    lxv vs1, 80(r4)
+; CHECK-BE-NEXT:    lxv vs3, 96(r4)
+; CHECK-BE-NEXT:    lxv vs7, 112(r4)
+; CHECK-BE-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxswapd vs8, vs6
+; CHECK-BE-NEXT:    xxswapd vs9, vs5
+; CHECK-BE-NEXT:    xxswapd vs10, vs4
+; CHECK-BE-NEXT:    xxswapd vs11, vs2
+; CHECK-BE-NEXT:    xxswapd vs12, vs7
+; CHECK-BE-NEXT:    xxswapd vs13, vs3
+; CHECK-BE-NEXT:    xxswapd v2, vs1
+; CHECK-BE-NEXT:    xxswapd v3, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f8, f8
+; CHECK-BE-NEXT:    xscvdpsxws f9, f9
+; CHECK-BE-NEXT:    xscvdpsxws f10, f10
+; CHECK-BE-NEXT:    xscvdpsxws f11, f11
+; CHECK-BE-NEXT:    xscvdpsxws f12, f12
+; CHECK-BE-NEXT:    xscvdpsxws f13, f13
+; CHECK-BE-NEXT:    xscvdpsxws v2, v2
+; CHECK-BE-NEXT:    xscvdpsxws v3, v3
+; CHECK-BE-NEXT:    mfvsrwz r4, f6
+; CHECK-BE-NEXT:    mfvsrwz r5, f5
+; CHECK-BE-NEXT:    mfvsrwz r6, f4
+; CHECK-BE-NEXT:    mfvsrwz r7, f2
+; CHECK-BE-NEXT:    mfvsrwz r12, f7
+; CHECK-BE-NEXT:    mfvsrwz r0, f3
+; CHECK-BE-NEXT:    mfvsrwz r30, f1
+; CHECK-BE-NEXT:    mfvsrwz r29, f0
+; CHECK-BE-NEXT:    mfvsrwz r8, f8
+; CHECK-BE-NEXT:    mfvsrwz r9, f9
+; CHECK-BE-NEXT:    mfvsrwz r10, f10
+; CHECK-BE-NEXT:    mfvsrwz r11, f11
+; CHECK-BE-NEXT:    mfvsrwz r28, f12
+; CHECK-BE-NEXT:    mfvsrwz r27, f13
+; CHECK-BE-NEXT:    mfvsrwz r26, v2
+; CHECK-BE-NEXT:    mfvsrwz r25, v3
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r12, r12, 48
+; CHECK-BE-NEXT:    sldi r0, r0, 48
+; CHECK-BE-NEXT:    sldi r30, r30, 48
+; CHECK-BE-NEXT:    sldi r29, r29, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    sldi r11, r11, 48
+; CHECK-BE-NEXT:    sldi r28, r28, 48
+; CHECK-BE-NEXT:    sldi r27, r27, 48
+; CHECK-BE-NEXT:    sldi r26, r26, 48
+; CHECK-BE-NEXT:    sldi r25, r25, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r4
+; CHECK-BE-NEXT:    mtvsrd v3, r5
+; CHECK-BE-NEXT:    mtvsrd v4, r6
+; CHECK-BE-NEXT:    mtvsrd v5, r7
+; CHECK-BE-NEXT:    mtvsrd v8, r12
+; CHECK-BE-NEXT:    mtvsrd v10, r0
+; CHECK-BE-NEXT:    mtvsrd v12, r30
+; CHECK-BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v0, r8
+; CHECK-BE-NEXT:    mtvsrd v1, r9
+; CHECK-BE-NEXT:    mtvsrd v6, r10
+; CHECK-BE-NEXT:    mtvsrd v7, r11
+; CHECK-BE-NEXT:    mtvsrd v9, r28
+; CHECK-BE-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v11, r27
+; CHECK-BE-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v13, r26
+; CHECK-BE-NEXT:    mtvsrd v14, r29
+; CHECK-BE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v15, r25
+; CHECK-BE-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    vmrghh v2, v2, v0
+; CHECK-BE-NEXT:    vmrghh v3, v3, v1
+; CHECK-BE-NEXT:    vmrghh v4, v4, v6
+; CHECK-BE-NEXT:    vmrghh v5, v5, v7
+; CHECK-BE-NEXT:    vmrghh v0, v8, v9
+; CHECK-BE-NEXT:    vmrghh v1, v10, v11
+; CHECK-BE-NEXT:    vmrghh v6, v12, v13
+; CHECK-BE-NEXT:    vmrghh v7, v14, v15
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    vmrghw v4, v1, v0
+; CHECK-BE-NEXT:    vmrghw v5, v7, v6
+; CHECK-BE-NEXT:    xxmrghd vs0, v3, v2
+; CHECK-BE-NEXT:    xxmrghd vs1, v5, v4
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x double>, <16 x double>* %0, align 128
+  %1 = fptosi <16 x double> %a to <16 x i16>
+  store <16 x i16> %1, <16 x i16>* %agg.result, align 32
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll b/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5f35f808fb89d3862ed9043578ef2e588f66bfa2
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll
@@ -0,0 +1,598 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i64 @test2elt(<2 x double> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    xscvdpuxws f1, v2
+; CHECK-P8-NEXT:    xscvdpuxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    vmrglw v2, v2, v3
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxswapd vs0, v2
+; CHECK-P9-NEXT:    xscvdpuxws f1, v2
+; CHECK-P9-NEXT:    xscvdpuxws f0, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mtvsrws v3, r4
+; CHECK-P9-NEXT:    vmrglw v2, v2, v3
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxswapd vs0, v2
+; CHECK-BE-NEXT:    xscvdpuxws f1, v2
+; CHECK-BE-NEXT:    xscvdpuxws f0, f0
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    mfvsrwz r4, f0
+; CHECK-BE-NEXT:    mtvsrws v3, r4
+; CHECK-BE-NEXT:    vmrghw v2, v2, v3
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <2 x double> %a to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to i64
+  ret i64 %1
+}
+
+define <4 x i32> @test4elt(<4 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxmrgld vs2, vs0, vs1
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xvcvdpuxws v2, vs2
+; CHECK-P8-NEXT:    xvcvdpuxws v3, vs0
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 0(r3)
+; CHECK-P9-NEXT:    lxv vs1, 16(r3)
+; CHECK-P9-NEXT:    xxmrgld vs2, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xvcvdpuxws v2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxws v3, vs0
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r3)
+; CHECK-BE-NEXT:    lxv vs1, 0(r3)
+; CHECK-BE-NEXT:    xxmrgld vs2, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xvcvdpuxws v2, vs2
+; CHECK-BE-NEXT:    xvcvdpuxws v3, vs0
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x double>, <4 x double>* %0, align 32
+  %1 = fptoui <4 x double> %a to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define void @test8elt(<8 x i32>* noalias nocapture sret %agg.result, <8 x double>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r5
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxmrgld vs4, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrgld vs1, vs2, vs3
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs3
+; CHECK-P8-NEXT:    xvcvdpuxws v2, vs4
+; CHECK-P8-NEXT:    xvcvdpuxws v3, vs0
+; CHECK-P8-NEXT:    xvcvdpuxws v4, vs1
+; CHECK-P8-NEXT:    xvcvdpuxws v5, vs2
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 32(r4)
+; CHECK-P9-NEXT:    lxv vs1, 48(r4)
+; CHECK-P9-NEXT:    lxv vs2, 0(r4)
+; CHECK-P9-NEXT:    lxv vs3, 16(r4)
+; CHECK-P9-NEXT:    xxmrgld vs4, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P9-NEXT:    xxmrgld vs3, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xvcvdpuxws v2, vs4
+; CHECK-P9-NEXT:    xvcvdpuxws v3, vs2
+; CHECK-P9-NEXT:    xvcvdpuxws v4, vs3
+; CHECK-P9-NEXT:    xvcvdpuxws v5, vs0
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    xxmrgld vs4, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-BE-NEXT:    xxmrgld vs3, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xvcvdpuxws v2, vs4
+; CHECK-BE-NEXT:    xvcvdpuxws v3, vs2
+; CHECK-BE-NEXT:    xvcvdpuxws v4, vs3
+; CHECK-BE-NEXT:    xvcvdpuxws v5, vs0
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x double>, <8 x double>* %0, align 64
+  %1 = fptoui <8 x double> %a to <8 x i32>
+  store <8 x i32> %1, <8 x i32>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt(<16 x i32>* noalias nocapture sret %agg.result, <16 x double>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    li r8, 64
+; CHECK-P8-NEXT:    li r7, 16
+; CHECK-P8-NEXT:    li r9, 80
+; CHECK-P8-NEXT:    lxvd2x vs7, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r8
+; CHECK-P8-NEXT:    li r8, 96
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    lxvd2x vs5, r4, r8
+; CHECK-P8-NEXT:    li r8, 112
+; CHECK-P8-NEXT:    lxvd2x vs4, r4, r9
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    lxvd2x vs6, r4, r8
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    xxmrgld vs8, vs1, vs0
+; CHECK-P8-NEXT:    xxswapd vs6, vs6
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs7
+; CHECK-P8-NEXT:    xxmrgld vs7, vs4, vs3
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    xxmrgld vs4, vs6, vs5
+; CHECK-P8-NEXT:    xvcvdpuxws v2, vs8
+; CHECK-P8-NEXT:    xvcvdpuxws v3, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs6, vs5
+; CHECK-P8-NEXT:    xxmrgld vs5, vs2, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs1
+; CHECK-P8-NEXT:    xvcvdpuxws v4, vs7
+; CHECK-P8-NEXT:    xvcvdpuxws v5, vs3
+; CHECK-P8-NEXT:    xvcvdpuxws v0, vs4
+; CHECK-P8-NEXT:    xvcvdpuxws v1, vs0
+; CHECK-P8-NEXT:    xvcvdpuxws v6, vs5
+; CHECK-P8-NEXT:    xvcvdpuxws v7, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    vmrgew v4, v1, v0
+; CHECK-P8-NEXT:    vmrgew v5, v7, v6
+; CHECK-P8-NEXT:    stvx v2, r3, r7
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    stvx v4, r3, r6
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 32(r4)
+; CHECK-P9-NEXT:    lxv vs1, 48(r4)
+; CHECK-P9-NEXT:    lxv vs2, 0(r4)
+; CHECK-P9-NEXT:    lxv vs3, 16(r4)
+; CHECK-P9-NEXT:    lxv vs4, 96(r4)
+; CHECK-P9-NEXT:    lxv vs5, 112(r4)
+; CHECK-P9-NEXT:    lxv vs6, 64(r4)
+; CHECK-P9-NEXT:    lxv vs7, 80(r4)
+; CHECK-P9-NEXT:    xxmrgld vs8, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P9-NEXT:    xxmrgld vs3, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrgld vs1, vs7, vs6
+; CHECK-P9-NEXT:    xxmrghd vs6, vs7, vs6
+; CHECK-P9-NEXT:    xxmrgld vs7, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-P9-NEXT:    xvcvdpuxws v2, vs8
+; CHECK-P9-NEXT:    xvcvdpuxws v3, vs2
+; CHECK-P9-NEXT:    xvcvdpuxws v4, vs3
+; CHECK-P9-NEXT:    xvcvdpuxws v5, vs0
+; CHECK-P9-NEXT:    xvcvdpuxws v0, vs1
+; CHECK-P9-NEXT:    xvcvdpuxws v1, vs6
+; CHECK-P9-NEXT:    xvcvdpuxws v6, vs7
+; CHECK-P9-NEXT:    xvcvdpuxws v7, vs4
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    vmrgew v4, v1, v0
+; CHECK-P9-NEXT:    vmrgew v5, v7, v6
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    stxv v5, 48(r3)
+; CHECK-P9-NEXT:    stxv v4, 32(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    lxv vs4, 112(r4)
+; CHECK-BE-NEXT:    lxv vs5, 96(r4)
+; CHECK-BE-NEXT:    lxv vs6, 80(r4)
+; CHECK-BE-NEXT:    lxv vs7, 64(r4)
+; CHECK-BE-NEXT:    xxmrgld vs8, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-BE-NEXT:    xxmrgld vs3, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrgld vs1, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd vs6, vs7, vs6
+; CHECK-BE-NEXT:    xxmrgld vs7, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-BE-NEXT:    xvcvdpuxws v2, vs8
+; CHECK-BE-NEXT:    xvcvdpuxws v3, vs2
+; CHECK-BE-NEXT:    xvcvdpuxws v4, vs3
+; CHECK-BE-NEXT:    xvcvdpuxws v5, vs0
+; CHECK-BE-NEXT:    xvcvdpuxws v0, vs1
+; CHECK-BE-NEXT:    xvcvdpuxws v1, vs6
+; CHECK-BE-NEXT:    xvcvdpuxws v6, vs7
+; CHECK-BE-NEXT:    xvcvdpuxws v7, vs4
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    vmrgew v4, v1, v0
+; CHECK-BE-NEXT:    vmrgew v5, v7, v6
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    stxv v5, 48(r3)
+; CHECK-BE-NEXT:    stxv v4, 32(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x double>, <16 x double>* %0, align 128
+  %1 = fptoui <16 x double> %a to <16 x i32>
+  store <16 x i32> %1, <16 x i32>* %agg.result, align 64
+  ret void
+}
+
+define i64 @test2elt_signed(<2 x double> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    xscvdpsxws f1, v2
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    vmrglw v2, v2, v3
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxswapd vs0, v2
+; CHECK-P9-NEXT:    xscvdpsxws f1, v2
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mtvsrws v3, r4
+; CHECK-P9-NEXT:    vmrglw v2, v2, v3
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxswapd vs0, v2
+; CHECK-BE-NEXT:    xscvdpsxws f1, v2
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    mfvsrwz r4, f0
+; CHECK-BE-NEXT:    mtvsrws v3, r4
+; CHECK-BE-NEXT:    vmrghw v2, v2, v3
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptosi <2 x double> %a to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to i64
+  ret i64 %1
+}
+
+define <4 x i32> @test4elt_signed(<4 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxmrgld vs2, vs0, vs1
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xvcvdpsxws v2, vs2
+; CHECK-P8-NEXT:    xvcvdpsxws v3, vs0
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 0(r3)
+; CHECK-P9-NEXT:    lxv vs1, 16(r3)
+; CHECK-P9-NEXT:    xxmrgld vs2, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xvcvdpsxws v2, vs2
+; CHECK-P9-NEXT:    xvcvdpsxws v3, vs0
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r3)
+; CHECK-BE-NEXT:    lxv vs1, 0(r3)
+; CHECK-BE-NEXT:    xxmrgld vs2, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xvcvdpsxws v2, vs2
+; CHECK-BE-NEXT:    xvcvdpsxws v3, vs0
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x double>, <4 x double>* %0, align 32
+  %1 = fptosi <4 x double> %a to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define void @test8elt_signed(<8 x i32>* noalias nocapture sret %agg.result, <8 x double>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r5
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxmrgld vs4, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrgld vs1, vs2, vs3
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs3
+; CHECK-P8-NEXT:    xvcvdpsxws v2, vs4
+; CHECK-P8-NEXT:    xvcvdpsxws v3, vs0
+; CHECK-P8-NEXT:    xvcvdpsxws v4, vs1
+; CHECK-P8-NEXT:    xvcvdpsxws v5, vs2
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 32(r4)
+; CHECK-P9-NEXT:    lxv vs1, 48(r4)
+; CHECK-P9-NEXT:    lxv vs2, 0(r4)
+; CHECK-P9-NEXT:    lxv vs3, 16(r4)
+; CHECK-P9-NEXT:    xxmrgld vs4, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P9-NEXT:    xxmrgld vs3, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xvcvdpsxws v2, vs4
+; CHECK-P9-NEXT:    xvcvdpsxws v3, vs2
+; CHECK-P9-NEXT:    xvcvdpsxws v4, vs3
+; CHECK-P9-NEXT:    xvcvdpsxws v5, vs0
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    xxmrgld vs4, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-BE-NEXT:    xxmrgld vs3, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xvcvdpsxws v2, vs4
+; CHECK-BE-NEXT:    xvcvdpsxws v3, vs2
+; CHECK-BE-NEXT:    xvcvdpsxws v4, vs3
+; CHECK-BE-NEXT:    xvcvdpsxws v5, vs0
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x double>, <8 x double>* %0, align 64
+  %1 = fptosi <8 x double> %a to <8 x i32>
+  store <8 x i32> %1, <8 x i32>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt_signed(<16 x i32>* noalias nocapture sret %agg.result, <16 x double>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    li r8, 64
+; CHECK-P8-NEXT:    li r7, 16
+; CHECK-P8-NEXT:    li r9, 80
+; CHECK-P8-NEXT:    lxvd2x vs7, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r8
+; CHECK-P8-NEXT:    li r8, 96
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    lxvd2x vs5, r4, r8
+; CHECK-P8-NEXT:    li r8, 112
+; CHECK-P8-NEXT:    lxvd2x vs4, r4, r9
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    lxvd2x vs6, r4, r8
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    xxmrgld vs8, vs1, vs0
+; CHECK-P8-NEXT:    xxswapd vs6, vs6
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs7
+; CHECK-P8-NEXT:    xxmrgld vs7, vs4, vs3
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    xxmrgld vs4, vs6, vs5
+; CHECK-P8-NEXT:    xvcvdpsxws v2, vs8
+; CHECK-P8-NEXT:    xvcvdpsxws v3, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs6, vs5
+; CHECK-P8-NEXT:    xxmrgld vs5, vs2, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs1
+; CHECK-P8-NEXT:    xvcvdpsxws v4, vs7
+; CHECK-P8-NEXT:    xvcvdpsxws v5, vs3
+; CHECK-P8-NEXT:    xvcvdpsxws v0, vs4
+; CHECK-P8-NEXT:    xvcvdpsxws v1, vs0
+; CHECK-P8-NEXT:    xvcvdpsxws v6, vs5
+; CHECK-P8-NEXT:    xvcvdpsxws v7, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    vmrgew v4, v1, v0
+; CHECK-P8-NEXT:    vmrgew v5, v7, v6
+; CHECK-P8-NEXT:    stvx v2, r3, r7
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    stvx v4, r3, r6
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 32(r4)
+; CHECK-P9-NEXT:    lxv vs1, 48(r4)
+; CHECK-P9-NEXT:    lxv vs2, 0(r4)
+; CHECK-P9-NEXT:    lxv vs3, 16(r4)
+; CHECK-P9-NEXT:    lxv vs4, 96(r4)
+; CHECK-P9-NEXT:    lxv vs5, 112(r4)
+; CHECK-P9-NEXT:    lxv vs6, 64(r4)
+; CHECK-P9-NEXT:    lxv vs7, 80(r4)
+; CHECK-P9-NEXT:    xxmrgld vs8, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P9-NEXT:    xxmrgld vs3, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrgld vs1, vs7, vs6
+; CHECK-P9-NEXT:    xxmrghd vs6, vs7, vs6
+; CHECK-P9-NEXT:    xxmrgld vs7, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-P9-NEXT:    xvcvdpsxws v2, vs8
+; CHECK-P9-NEXT:    xvcvdpsxws v3, vs2
+; CHECK-P9-NEXT:    xvcvdpsxws v4, vs3
+; CHECK-P9-NEXT:    xvcvdpsxws v5, vs0
+; CHECK-P9-NEXT:    xvcvdpsxws v0, vs1
+; CHECK-P9-NEXT:    xvcvdpsxws v1, vs6
+; CHECK-P9-NEXT:    xvcvdpsxws v6, vs7
+; CHECK-P9-NEXT:    xvcvdpsxws v7, vs4
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    vmrgew v4, v1, v0
+; CHECK-P9-NEXT:    vmrgew v5, v7, v6
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    stxv v5, 48(r3)
+; CHECK-P9-NEXT:    stxv v4, 32(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    lxv vs4, 112(r4)
+; CHECK-BE-NEXT:    lxv vs5, 96(r4)
+; CHECK-BE-NEXT:    lxv vs6, 80(r4)
+; CHECK-BE-NEXT:    lxv vs7, 64(r4)
+; CHECK-BE-NEXT:    xxmrgld vs8, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-BE-NEXT:    xxmrgld vs3, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrgld vs1, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd vs6, vs7, vs6
+; CHECK-BE-NEXT:    xxmrgld vs7, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-BE-NEXT:    xvcvdpsxws v2, vs8
+; CHECK-BE-NEXT:    xvcvdpsxws v3, vs2
+; CHECK-BE-NEXT:    xvcvdpsxws v4, vs3
+; CHECK-BE-NEXT:    xvcvdpsxws v5, vs0
+; CHECK-BE-NEXT:    xvcvdpsxws v0, vs1
+; CHECK-BE-NEXT:    xvcvdpsxws v1, vs6
+; CHECK-BE-NEXT:    xvcvdpsxws v6, vs7
+; CHECK-BE-NEXT:    xvcvdpsxws v7, vs4
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    vmrgew v4, v1, v0
+; CHECK-BE-NEXT:    vmrgew v5, v7, v6
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    stxv v5, 48(r3)
+; CHECK-BE-NEXT:    stxv v4, 32(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x double>, <16 x double>* %0, align 128
+  %1 = fptosi <16 x double> %a to <16 x i32>
+  store <16 x i32> %1, <16 x i32>* %agg.result, align 64
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll b/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ef7b9c190bc23d331201b2c4fb83a11ac3ea72fc
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll
@@ -0,0 +1,1316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i16 @test2elt(<2 x double> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    xscvdpsxws f1, v2
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    vmrglb v2, v2, v3
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r3, r3, 48
+; CHECK-P8-NEXT:    sth r3, -2(r1)
+; CHECK-P8-NEXT:    lhz r3, -2(r1)
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxswapd vs0, v2
+; CHECK-P9-NEXT:    xscvdpsxws f1, v2
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    addi r3, r1, -2
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    vmrglb v2, v2, v3
+; CHECK-P9-NEXT:    vsldoi v2, v2, v2, 8
+; CHECK-P9-NEXT:    stxsihx v2, 0, r3
+; CHECK-P9-NEXT:    lhz r3, -2(r1)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxswapd vs0, v2
+; CHECK-BE-NEXT:    xscvdpsxws f1, v2
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    mfvsrwz r4, f0
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    addi r3, r1, -2
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    vmrghb v2, v2, v3
+; CHECK-BE-NEXT:    vsldoi v2, v2, v2, 10
+; CHECK-BE-NEXT:    stxsihx v2, 0, r3
+; CHECK-BE-NEXT:    lhz r3, -2(r1)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <2 x double> %a to <2 x i8>
+  %1 = bitcast <2 x i8> %0 to i16
+  ret i16 %1
+}
+
+define i32 @test4elt(<4 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f3, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    xxswapd v2, vs2
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xxswapd v4, vs3
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    xxswapd v5, vs1
+; CHECK-P8-NEXT:    vmrglb v2, v3, v2
+; CHECK-P8-NEXT:    vmrglb v3, v5, v4
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs2, vs1
+; CHECK-P9-NEXT:    xxswapd vs3, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mfvsrwz r5, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    mfvsrwz r4, f2
+; CHECK-P9-NEXT:    mfvsrwz r6, f3
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    vmrglb v2, v2, v3
+; CHECK-P9-NEXT:    vmrglb v3, v4, v5
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vextuwrx r3, r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxswapd vs2, vs1
+; CHECK-BE-NEXT:    xxswapd vs3, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    mfvsrwz r5, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    mfvsrwz r4, f2
+; CHECK-BE-NEXT:    mfvsrwz r6, f3
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    li r3, 0
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    vmrghb v2, v2, v3
+; CHECK-BE-NEXT:    vmrghb v3, v4, v5
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vextuwlx r3, r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x double>, <4 x double>* %0, align 32
+  %1 = fptoui <4 x double> %a to <4 x i8>
+  %2 = bitcast <4 x i8> %1 to i32
+  ret i32 %2
+}
+
+define i64 @test8elt(<8 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    lxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f4, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f5, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f6, f2
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f7, f3
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    mtvsrd f4, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f6
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs4
+; CHECK-P8-NEXT:    mfvsrwz r4, f7
+; CHECK-P8-NEXT:    mtvsrd f6, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs5
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mtvsrd f7, r4
+; CHECK-P8-NEXT:    xxswapd v4, vs6
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v1, vs7
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v5, vs0
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    xxswapd v0, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    xxswapd v6, vs2
+; CHECK-P8-NEXT:    vmrglb v2, v5, v2
+; CHECK-P8-NEXT:    xxswapd v5, vs0
+; CHECK-P8-NEXT:    vmrglb v3, v0, v3
+; CHECK-P8-NEXT:    vmrglb v4, v6, v4
+; CHECK-P8-NEXT:    vmrglb v5, v5, v1
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglh v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r3)
+; CHECK-P9-NEXT:    lxv vs1, 32(r3)
+; CHECK-P9-NEXT:    lxv vs2, 16(r3)
+; CHECK-P9-NEXT:    lxv vs3, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs4, vs3
+; CHECK-P9-NEXT:    xxswapd vs5, vs2
+; CHECK-P9-NEXT:    xxswapd vs6, vs1
+; CHECK-P9-NEXT:    xxswapd vs7, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    mfvsrwz r3, f3
+; CHECK-P9-NEXT:    mfvsrwz r5, f2
+; CHECK-P9-NEXT:    mfvsrwz r7, f1
+; CHECK-P9-NEXT:    mfvsrwz r9, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mfvsrwz r4, f4
+; CHECK-P9-NEXT:    mfvsrwz r6, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r10, f7
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    xxswapd v0, vs4
+; CHECK-P9-NEXT:    xxswapd v6, vs6
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    xxswapd v1, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs7
+; CHECK-P9-NEXT:    vmrglb v2, v2, v3
+; CHECK-P9-NEXT:    vmrglb v3, v4, v5
+; CHECK-P9-NEXT:    vmrglb v4, v0, v1
+; CHECK-P9-NEXT:    vmrglb v5, v6, v7
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglh v3, v5, v4
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    xxswapd vs4, vs3
+; CHECK-BE-NEXT:    xxswapd vs5, vs2
+; CHECK-BE-NEXT:    xxswapd vs6, vs1
+; CHECK-BE-NEXT:    xxswapd vs7, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    mfvsrwz r3, f3
+; CHECK-BE-NEXT:    mfvsrwz r5, f2
+; CHECK-BE-NEXT:    mfvsrwz r7, f1
+; CHECK-BE-NEXT:    mfvsrwz r9, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    sldi r7, r7, 56
+; CHECK-BE-NEXT:    sldi r9, r9, 56
+; CHECK-BE-NEXT:    mfvsrwz r4, f4
+; CHECK-BE-NEXT:    mfvsrwz r6, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r10, f7
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    sldi r8, r8, 56
+; CHECK-BE-NEXT:    sldi r10, r10, 56
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    vmrghb v2, v2, v3
+; CHECK-BE-NEXT:    vmrghb v3, v4, v5
+; CHECK-BE-NEXT:    vmrghb v4, v0, v1
+; CHECK-BE-NEXT:    vmrghb v5, v6, v7
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v5, v4
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x double>, <8 x double>* %0, align 64
+  %1 = fptoui <8 x double> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to i64
+  ret i64 %2
+}
+
+define <16 x i8> @test16elt(<16 x double>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    lxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    li r4, 64
+; CHECK-P8-NEXT:    xscvdpsxws f4, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    lxvd2x vs5, r3, r4
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xscvdpsxws f6, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    lxvd2x vs7, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xscvdpsxws f8, f2
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    lxvd2x vs9, r3, r4
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    xscvdpsxws f10, f3
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    lxvd2x vs11, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f12, f5
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f13, f7
+; CHECK-P8-NEXT:    xxswapd vs7, vs7
+; CHECK-P8-NEXT:    xscvdpsxws v2, f9
+; CHECK-P8-NEXT:    xxswapd vs9, vs9
+; CHECK-P8-NEXT:    mfvsrwz r3, f4
+; CHECK-P8-NEXT:    xscvdpsxws v3, f11
+; CHECK-P8-NEXT:    xxswapd vs11, vs11
+; CHECK-P8-NEXT:    mfvsrwz r4, f6
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mtvsrd f4, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f8
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xxswapd v4, vs4
+; CHECK-P8-NEXT:    mtvsrd f6, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f10
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xxswapd v5, vs6
+; CHECK-P8-NEXT:    mtvsrd f8, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f12
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xxswapd v0, vs8
+; CHECK-P8-NEXT:    mtvsrd f10, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f13
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    xxswapd v1, vs10
+; CHECK-P8-NEXT:    mtvsrd f12, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, v2
+; CHECK-P8-NEXT:    xscvdpsxws f7, f7
+; CHECK-P8-NEXT:    xxswapd v6, vs12
+; CHECK-P8-NEXT:    mtvsrd f13, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, v3
+; CHECK-P8-NEXT:    mtvsrd v2, r3
+; CHECK-P8-NEXT:    xxswapd v7, vs13
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    xscvdpsxws f9, f9
+; CHECK-P8-NEXT:    xxswapd v2, v2
+; CHECK-P8-NEXT:    xscvdpsxws f11, f11
+; CHECK-P8-NEXT:    mtvsrd v3, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v3, v3
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    xxswapd v9, vs1
+; CHECK-P8-NEXT:    mfvsrwz r3, f5
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    xxswapd v10, vs2
+; CHECK-P8-NEXT:    mfvsrwz r4, f7
+; CHECK-P8-NEXT:    mtvsrd f5, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f9
+; CHECK-P8-NEXT:    mtvsrd f7, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f11
+; CHECK-P8-NEXT:    vmrglb v4, v8, v4
+; CHECK-P8-NEXT:    xxswapd v8, vs3
+; CHECK-P8-NEXT:    vmrglb v5, v9, v5
+; CHECK-P8-NEXT:    xxswapd v9, vs5
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    vmrglb v0, v10, v0
+; CHECK-P8-NEXT:    xxswapd v10, vs7
+; CHECK-P8-NEXT:    vmrglb v1, v8, v1
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    vmrglb v6, v9, v6
+; CHECK-P8-NEXT:    xxswapd v9, vs1
+; CHECK-P8-NEXT:    vmrglb v7, v10, v7
+; CHECK-P8-NEXT:    vmrglb v2, v8, v2
+; CHECK-P8-NEXT:    vmrglb v3, v9, v3
+; CHECK-P8-NEXT:    vmrglh v4, v5, v4
+; CHECK-P8-NEXT:    vmrglh v5, v1, v0
+; CHECK-P8-NEXT:    vmrglh v0, v7, v6
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v2, v2, v0
+; CHECK-P8-NEXT:    xxmrgld v2, v2, v3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs2, 48(r3)
+; CHECK-P9-NEXT:    lxv vs3, 32(r3)
+; CHECK-P9-NEXT:    lxv vs4, 16(r3)
+; CHECK-P9-NEXT:    lxv vs5, 0(r3)
+; CHECK-P9-NEXT:    lxv vs0, 112(r3)
+; CHECK-P9-NEXT:    lxv vs1, 96(r3)
+; CHECK-P9-NEXT:    lxv vs6, 80(r3)
+; CHECK-P9-NEXT:    lxv vs7, 64(r3)
+; CHECK-P9-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxswapd vs8, vs5
+; CHECK-P9-NEXT:    xxswapd vs9, vs4
+; CHECK-P9-NEXT:    xxswapd vs10, vs3
+; CHECK-P9-NEXT:    xxswapd vs11, vs2
+; CHECK-P9-NEXT:    xxswapd vs12, vs7
+; CHECK-P9-NEXT:    xxswapd vs13, vs6
+; CHECK-P9-NEXT:    xxswapd v2, vs1
+; CHECK-P9-NEXT:    xxswapd v3, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f8, f8
+; CHECK-P9-NEXT:    xscvdpsxws f9, f9
+; CHECK-P9-NEXT:    xscvdpsxws f10, f10
+; CHECK-P9-NEXT:    xscvdpsxws f11, f11
+; CHECK-P9-NEXT:    xscvdpsxws f12, f12
+; CHECK-P9-NEXT:    xscvdpsxws f13, f13
+; CHECK-P9-NEXT:    xscvdpsxws v2, v2
+; CHECK-P9-NEXT:    xscvdpsxws v3, v3
+; CHECK-P9-NEXT:    mfvsrwz r3, f5
+; CHECK-P9-NEXT:    mfvsrwz r4, f4
+; CHECK-P9-NEXT:    mfvsrwz r5, f3
+; CHECK-P9-NEXT:    mfvsrwz r6, f2
+; CHECK-P9-NEXT:    mfvsrwz r11, f7
+; CHECK-P9-NEXT:    mfvsrwz r12, f6
+; CHECK-P9-NEXT:    mfvsrwz r0, f1
+; CHECK-P9-NEXT:    mfvsrwz r30, f0
+; CHECK-P9-NEXT:    mfvsrwz r7, f8
+; CHECK-P9-NEXT:    mfvsrwz r8, f9
+; CHECK-P9-NEXT:    mfvsrwz r9, f10
+; CHECK-P9-NEXT:    mfvsrwz r10, f11
+; CHECK-P9-NEXT:    mfvsrwz r29, f12
+; CHECK-P9-NEXT:    mfvsrwz r28, f13
+; CHECK-P9-NEXT:    mfvsrwz r27, v2
+; CHECK-P9-NEXT:    mfvsrwz r26, v3
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f8, r11
+; CHECK-P9-NEXT:    mtvsrd f9, r12
+; CHECK-P9-NEXT:    mtvsrd f10, r0
+; CHECK-P9-NEXT:    mtvsrd f11, r30
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    mtvsrd f12, r29
+; CHECK-P9-NEXT:    mtvsrd f13, r28
+; CHECK-P9-NEXT:    mtvsrd v2, r27
+; CHECK-P9-NEXT:    mtvsrd v3, r26
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    xxswapd v5, vs1
+; CHECK-P9-NEXT:    xxswapd v0, vs2
+; CHECK-P9-NEXT:    xxswapd v1, vs3
+; CHECK-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v6, vs4
+; CHECK-P9-NEXT:    xxswapd v7, vs5
+; CHECK-P9-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v8, vs6
+; CHECK-P9-NEXT:    xxswapd v9, vs7
+; CHECK-P9-NEXT:    xxswapd v10, vs8
+; CHECK-P9-NEXT:    xxswapd v11, vs12
+; CHECK-P9-NEXT:    xxswapd v12, vs9
+; CHECK-P9-NEXT:    xxswapd v13, vs13
+; CHECK-P9-NEXT:    xxswapd v14, vs10
+; CHECK-P9-NEXT:    xxswapd v2, v2
+; CHECK-P9-NEXT:    xxswapd v15, vs11
+; CHECK-P9-NEXT:    xxswapd v3, v3
+; CHECK-P9-NEXT:    vmrglb v4, v4, v6
+; CHECK-P9-NEXT:    vmrglb v5, v5, v7
+; CHECK-P9-NEXT:    vmrglb v0, v0, v8
+; CHECK-P9-NEXT:    vmrglb v1, v1, v9
+; CHECK-P9-NEXT:    vmrglb v6, v10, v11
+; CHECK-P9-NEXT:    vmrglb v7, v12, v13
+; CHECK-P9-NEXT:    vmrglb v2, v14, v2
+; CHECK-P9-NEXT:    vmrglb v3, v15, v3
+; CHECK-P9-NEXT:    vmrglh v4, v5, v4
+; CHECK-P9-NEXT:    vmrglh v5, v1, v0
+; CHECK-P9-NEXT:    vmrglh v0, v7, v6
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglw v3, v5, v4
+; CHECK-P9-NEXT:    vmrglw v2, v2, v0
+; CHECK-P9-NEXT:    xxmrgld v2, v2, v3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 64(r3)
+; CHECK-BE-NEXT:    lxv vs3, 80(r3)
+; CHECK-BE-NEXT:    lxv vs4, 96(r3)
+; CHECK-BE-NEXT:    lxv vs5, 112(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs6, 32(r3)
+; CHECK-BE-NEXT:    lxv vs7, 48(r3)
+; CHECK-BE-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxswapd vs8, vs5
+; CHECK-BE-NEXT:    xxswapd vs9, vs4
+; CHECK-BE-NEXT:    xxswapd vs10, vs3
+; CHECK-BE-NEXT:    xxswapd vs11, vs2
+; CHECK-BE-NEXT:    xxswapd vs12, vs7
+; CHECK-BE-NEXT:    xxswapd vs13, vs6
+; CHECK-BE-NEXT:    xxswapd v2, vs1
+; CHECK-BE-NEXT:    xxswapd v3, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f8, f8
+; CHECK-BE-NEXT:    xscvdpsxws f9, f9
+; CHECK-BE-NEXT:    xscvdpsxws f10, f10
+; CHECK-BE-NEXT:    xscvdpsxws f11, f11
+; CHECK-BE-NEXT:    xscvdpsxws f12, f12
+; CHECK-BE-NEXT:    xscvdpsxws f13, f13
+; CHECK-BE-NEXT:    xscvdpsxws v2, v2
+; CHECK-BE-NEXT:    xscvdpsxws v3, v3
+; CHECK-BE-NEXT:    mfvsrwz r3, f5
+; CHECK-BE-NEXT:    mfvsrwz r4, f4
+; CHECK-BE-NEXT:    mfvsrwz r5, f3
+; CHECK-BE-NEXT:    mfvsrwz r6, f2
+; CHECK-BE-NEXT:    mfvsrwz r11, f7
+; CHECK-BE-NEXT:    mfvsrwz r12, f6
+; CHECK-BE-NEXT:    mfvsrwz r0, f1
+; CHECK-BE-NEXT:    mfvsrwz r30, f0
+; CHECK-BE-NEXT:    mfvsrwz r7, f8
+; CHECK-BE-NEXT:    mfvsrwz r8, f9
+; CHECK-BE-NEXT:    mfvsrwz r9, f10
+; CHECK-BE-NEXT:    mfvsrwz r10, f11
+; CHECK-BE-NEXT:    mfvsrwz r29, f12
+; CHECK-BE-NEXT:    mfvsrwz r28, f13
+; CHECK-BE-NEXT:    mfvsrwz r27, v2
+; CHECK-BE-NEXT:    mfvsrwz r26, v3
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    sldi r11, r11, 56
+; CHECK-BE-NEXT:    sldi r12, r12, 56
+; CHECK-BE-NEXT:    sldi r0, r0, 56
+; CHECK-BE-NEXT:    sldi r30, r30, 56
+; CHECK-BE-NEXT:    sldi r7, r7, 56
+; CHECK-BE-NEXT:    sldi r8, r8, 56
+; CHECK-BE-NEXT:    sldi r9, r9, 56
+; CHECK-BE-NEXT:    sldi r10, r10, 56
+; CHECK-BE-NEXT:    sldi r29, r29, 56
+; CHECK-BE-NEXT:    sldi r28, r28, 56
+; CHECK-BE-NEXT:    sldi r27, r27, 56
+; CHECK-BE-NEXT:    sldi r26, r26, 56
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v8, r11
+; CHECK-BE-NEXT:    mtvsrd v10, r12
+; CHECK-BE-NEXT:    mtvsrd v12, r0
+; CHECK-BE-NEXT:    mtvsrd v14, r30
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v9, r29
+; CHECK-BE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v11, r28
+; CHECK-BE-NEXT:    mtvsrd v13, r27
+; CHECK-BE-NEXT:    mtvsrd v15, r26
+; CHECK-BE-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    vmrghb v2, v2, v0
+; CHECK-BE-NEXT:    vmrghb v3, v3, v1
+; CHECK-BE-NEXT:    vmrghb v4, v4, v6
+; CHECK-BE-NEXT:    vmrghb v5, v5, v7
+; CHECK-BE-NEXT:    vmrghb v0, v8, v9
+; CHECK-BE-NEXT:    vmrghb v1, v10, v11
+; CHECK-BE-NEXT:    vmrghb v6, v12, v13
+; CHECK-BE-NEXT:    vmrghb v7, v14, v15
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v5, v4
+; CHECK-BE-NEXT:    vmrghh v4, v1, v0
+; CHECK-BE-NEXT:    vmrghh v5, v7, v6
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x double>, <16 x double>* %0, align 128
+  %1 = fptoui <16 x double> %a to <16 x i8>
+  ret <16 x i8> %1
+}
+
+define i16 @test2elt_signed(<2 x double> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    xscvdpsxws f1, v2
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mfvsrwz r3, f1
+; CHECK-P8-NEXT:    mfvsrwz r4, f0
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xxswapd v3, vs1
+; CHECK-P8-NEXT:    vmrglb v2, v2, v3
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r3, r3, 48
+; CHECK-P8-NEXT:    sth r3, -2(r1)
+; CHECK-P8-NEXT:    lhz r3, -2(r1)
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxswapd vs0, v2
+; CHECK-P9-NEXT:    xscvdpsxws f1, v2
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mfvsrwz r4, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    addi r3, r1, -2
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    vmrglb v2, v2, v3
+; CHECK-P9-NEXT:    vsldoi v2, v2, v2, 8
+; CHECK-P9-NEXT:    stxsihx v2, 0, r3
+; CHECK-P9-NEXT:    lhz r3, -2(r1)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxswapd vs0, v2
+; CHECK-BE-NEXT:    xscvdpsxws f1, v2
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    mfvsrwz r4, f0
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    addi r3, r1, -2
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    vmrghb v2, v2, v3
+; CHECK-BE-NEXT:    vsldoi v2, v2, v2, 10
+; CHECK-BE-NEXT:    stxsihx v2, 0, r3
+; CHECK-BE-NEXT:    lhz r3, -2(r1)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptosi <2 x double> %a to <2 x i8>
+  %1 = bitcast <2 x i8> %0 to i16
+  ret i16 %1
+}
+
+define i32 @test4elt_signed(<4 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f3, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    xxswapd v2, vs2
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    xxswapd v4, vs3
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    xxswapd v5, vs1
+; CHECK-P8-NEXT:    vmrglb v2, v3, v2
+; CHECK-P8-NEXT:    vmrglb v3, v5, v4
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs2, vs1
+; CHECK-P9-NEXT:    xxswapd vs3, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    mfvsrwz r3, f1
+; CHECK-P9-NEXT:    mfvsrwz r5, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    mfvsrwz r4, f2
+; CHECK-P9-NEXT:    mfvsrwz r6, f3
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    vmrglb v2, v2, v3
+; CHECK-P9-NEXT:    vmrglb v3, v4, v5
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vextuwrx r3, r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxswapd vs2, vs1
+; CHECK-BE-NEXT:    xxswapd vs3, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    mfvsrwz r3, f1
+; CHECK-BE-NEXT:    mfvsrwz r5, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    mfvsrwz r4, f2
+; CHECK-BE-NEXT:    mfvsrwz r6, f3
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    li r3, 0
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    vmrghb v2, v2, v3
+; CHECK-BE-NEXT:    vmrghb v3, v4, v5
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vextuwlx r3, r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x double>, <4 x double>* %0, align 32
+  %1 = fptosi <4 x double> %a to <4 x i8>
+  %2 = bitcast <4 x i8> %1 to i32
+  ret i32 %2
+}
+
+define i64 @test8elt_signed(<8 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    lxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f4, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvdpsxws f5, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvdpsxws f6, f2
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvdpsxws f7, f3
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    mfvsrwz r3, f4
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    mfvsrwz r4, f5
+; CHECK-P8-NEXT:    mtvsrd f4, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f6
+; CHECK-P8-NEXT:    mtvsrd f5, r4
+; CHECK-P8-NEXT:    xxswapd v2, vs4
+; CHECK-P8-NEXT:    mfvsrwz r4, f7
+; CHECK-P8-NEXT:    mtvsrd f6, r3
+; CHECK-P8-NEXT:    xxswapd v3, vs5
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    mtvsrd f7, r4
+; CHECK-P8-NEXT:    xxswapd v4, vs6
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v1, vs7
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v5, vs0
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    xxswapd v0, vs1
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    xxswapd v6, vs2
+; CHECK-P8-NEXT:    vmrglb v2, v5, v2
+; CHECK-P8-NEXT:    xxswapd v5, vs0
+; CHECK-P8-NEXT:    vmrglb v3, v0, v3
+; CHECK-P8-NEXT:    vmrglb v4, v6, v4
+; CHECK-P8-NEXT:    vmrglb v5, v5, v1
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglh v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r3)
+; CHECK-P9-NEXT:    lxv vs1, 32(r3)
+; CHECK-P9-NEXT:    lxv vs2, 16(r3)
+; CHECK-P9-NEXT:    lxv vs3, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs4, vs3
+; CHECK-P9-NEXT:    xxswapd vs5, vs2
+; CHECK-P9-NEXT:    xxswapd vs6, vs1
+; CHECK-P9-NEXT:    xxswapd vs7, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    mfvsrwz r3, f3
+; CHECK-P9-NEXT:    mfvsrwz r5, f2
+; CHECK-P9-NEXT:    mfvsrwz r7, f1
+; CHECK-P9-NEXT:    mfvsrwz r9, f0
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mfvsrwz r4, f4
+; CHECK-P9-NEXT:    mfvsrwz r6, f5
+; CHECK-P9-NEXT:    mfvsrwz r8, f6
+; CHECK-P9-NEXT:    mfvsrwz r10, f7
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxswapd v4, vs2
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    xxswapd v0, vs4
+; CHECK-P9-NEXT:    xxswapd v6, vs6
+; CHECK-P9-NEXT:    xxswapd v3, vs1
+; CHECK-P9-NEXT:    xxswapd v5, vs3
+; CHECK-P9-NEXT:    xxswapd v1, vs5
+; CHECK-P9-NEXT:    xxswapd v7, vs7
+; CHECK-P9-NEXT:    vmrglb v2, v2, v3
+; CHECK-P9-NEXT:    vmrglb v3, v4, v5
+; CHECK-P9-NEXT:    vmrglb v4, v0, v1
+; CHECK-P9-NEXT:    vmrglb v5, v6, v7
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglh v3, v5, v4
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    xxswapd vs4, vs3
+; CHECK-BE-NEXT:    xxswapd vs5, vs2
+; CHECK-BE-NEXT:    xxswapd vs6, vs1
+; CHECK-BE-NEXT:    xxswapd vs7, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    mfvsrwz r3, f3
+; CHECK-BE-NEXT:    mfvsrwz r5, f2
+; CHECK-BE-NEXT:    mfvsrwz r7, f1
+; CHECK-BE-NEXT:    mfvsrwz r9, f0
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    sldi r7, r7, 56
+; CHECK-BE-NEXT:    sldi r9, r9, 56
+; CHECK-BE-NEXT:    mfvsrwz r4, f4
+; CHECK-BE-NEXT:    mfvsrwz r6, f5
+; CHECK-BE-NEXT:    mfvsrwz r8, f6
+; CHECK-BE-NEXT:    mfvsrwz r10, f7
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    sldi r8, r8, 56
+; CHECK-BE-NEXT:    sldi r10, r10, 56
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    vmrghb v2, v2, v3
+; CHECK-BE-NEXT:    vmrghb v3, v4, v5
+; CHECK-BE-NEXT:    vmrghb v4, v0, v1
+; CHECK-BE-NEXT:    vmrghb v5, v6, v7
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v5, v4
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x double>, <8 x double>* %0, align 64
+  %1 = fptosi <8 x double> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to i64
+  ret i64 %2
+}
+
+define <16 x i8> @test16elt_signed(<16 x double>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    lxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    li r4, 64
+; CHECK-P8-NEXT:    xscvdpsxws f4, f0
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    lxvd2x vs5, r3, r4
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xscvdpsxws f6, f1
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    lxvd2x vs7, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xscvdpsxws f8, f2
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    lxvd2x vs9, r3, r4
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    xscvdpsxws f10, f3
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    lxvd2x vs11, r3, r4
+; CHECK-P8-NEXT:    xscvdpsxws f12, f5
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    xscvdpsxws f13, f7
+; CHECK-P8-NEXT:    xxswapd vs7, vs7
+; CHECK-P8-NEXT:    xscvdpsxws v2, f9
+; CHECK-P8-NEXT:    xxswapd vs9, vs9
+; CHECK-P8-NEXT:    mfvsrwz r3, f4
+; CHECK-P8-NEXT:    xscvdpsxws v3, f11
+; CHECK-P8-NEXT:    xxswapd vs11, vs11
+; CHECK-P8-NEXT:    mfvsrwz r4, f6
+; CHECK-P8-NEXT:    xscvdpsxws f0, f0
+; CHECK-P8-NEXT:    mtvsrd f4, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f8
+; CHECK-P8-NEXT:    xscvdpsxws f1, f1
+; CHECK-P8-NEXT:    xxswapd v4, vs4
+; CHECK-P8-NEXT:    mtvsrd f6, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f10
+; CHECK-P8-NEXT:    xscvdpsxws f2, f2
+; CHECK-P8-NEXT:    xxswapd v5, vs6
+; CHECK-P8-NEXT:    mtvsrd f8, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f12
+; CHECK-P8-NEXT:    xscvdpsxws f3, f3
+; CHECK-P8-NEXT:    xxswapd v0, vs8
+; CHECK-P8-NEXT:    mtvsrd f10, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f13
+; CHECK-P8-NEXT:    xscvdpsxws f5, f5
+; CHECK-P8-NEXT:    xxswapd v1, vs10
+; CHECK-P8-NEXT:    mtvsrd f12, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, v2
+; CHECK-P8-NEXT:    xscvdpsxws f7, f7
+; CHECK-P8-NEXT:    xxswapd v6, vs12
+; CHECK-P8-NEXT:    mtvsrd f13, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, v3
+; CHECK-P8-NEXT:    mtvsrd v2, r3
+; CHECK-P8-NEXT:    xxswapd v7, vs13
+; CHECK-P8-NEXT:    mfvsrwz r3, f0
+; CHECK-P8-NEXT:    xscvdpsxws f9, f9
+; CHECK-P8-NEXT:    xxswapd v2, v2
+; CHECK-P8-NEXT:    xscvdpsxws f11, f11
+; CHECK-P8-NEXT:    mtvsrd v3, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f1
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v3, v3
+; CHECK-P8-NEXT:    mfvsrwz r3, f2
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    mfvsrwz r4, f3
+; CHECK-P8-NEXT:    mtvsrd f2, r3
+; CHECK-P8-NEXT:    xxswapd v9, vs1
+; CHECK-P8-NEXT:    mfvsrwz r3, f5
+; CHECK-P8-NEXT:    mtvsrd f3, r4
+; CHECK-P8-NEXT:    xxswapd v10, vs2
+; CHECK-P8-NEXT:    mfvsrwz r4, f7
+; CHECK-P8-NEXT:    mtvsrd f5, r3
+; CHECK-P8-NEXT:    mfvsrwz r3, f9
+; CHECK-P8-NEXT:    mtvsrd f7, r4
+; CHECK-P8-NEXT:    mfvsrwz r4, f11
+; CHECK-P8-NEXT:    vmrglb v4, v8, v4
+; CHECK-P8-NEXT:    xxswapd v8, vs3
+; CHECK-P8-NEXT:    vmrglb v5, v9, v5
+; CHECK-P8-NEXT:    xxswapd v9, vs5
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mtvsrd f1, r4
+; CHECK-P8-NEXT:    vmrglb v0, v10, v0
+; CHECK-P8-NEXT:    xxswapd v10, vs7
+; CHECK-P8-NEXT:    vmrglb v1, v8, v1
+; CHECK-P8-NEXT:    xxswapd v8, vs0
+; CHECK-P8-NEXT:    vmrglb v6, v9, v6
+; CHECK-P8-NEXT:    xxswapd v9, vs1
+; CHECK-P8-NEXT:    vmrglb v7, v10, v7
+; CHECK-P8-NEXT:    vmrglb v2, v8, v2
+; CHECK-P8-NEXT:    vmrglb v3, v9, v3
+; CHECK-P8-NEXT:    vmrglh v4, v5, v4
+; CHECK-P8-NEXT:    vmrglh v5, v1, v0
+; CHECK-P8-NEXT:    vmrglh v0, v7, v6
+; CHECK-P8-NEXT:    vmrglh v2, v3, v2
+; CHECK-P8-NEXT:    vmrglw v3, v5, v4
+; CHECK-P8-NEXT:    vmrglw v2, v2, v0
+; CHECK-P8-NEXT:    xxmrgld v2, v2, v3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs2, 48(r3)
+; CHECK-P9-NEXT:    lxv vs3, 32(r3)
+; CHECK-P9-NEXT:    lxv vs4, 16(r3)
+; CHECK-P9-NEXT:    lxv vs5, 0(r3)
+; CHECK-P9-NEXT:    lxv vs0, 112(r3)
+; CHECK-P9-NEXT:    lxv vs1, 96(r3)
+; CHECK-P9-NEXT:    lxv vs6, 80(r3)
+; CHECK-P9-NEXT:    lxv vs7, 64(r3)
+; CHECK-P9-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxswapd vs8, vs5
+; CHECK-P9-NEXT:    xxswapd vs9, vs4
+; CHECK-P9-NEXT:    xxswapd vs10, vs3
+; CHECK-P9-NEXT:    xxswapd vs11, vs2
+; CHECK-P9-NEXT:    xxswapd vs12, vs7
+; CHECK-P9-NEXT:    xxswapd vs13, vs6
+; CHECK-P9-NEXT:    xxswapd v2, vs1
+; CHECK-P9-NEXT:    xxswapd v3, vs0
+; CHECK-P9-NEXT:    xscvdpsxws f5, f5
+; CHECK-P9-NEXT:    xscvdpsxws f4, f4
+; CHECK-P9-NEXT:    xscvdpsxws f3, f3
+; CHECK-P9-NEXT:    xscvdpsxws f2, f2
+; CHECK-P9-NEXT:    xscvdpsxws f7, f7
+; CHECK-P9-NEXT:    xscvdpsxws f6, f6
+; CHECK-P9-NEXT:    xscvdpsxws f1, f1
+; CHECK-P9-NEXT:    xscvdpsxws f0, f0
+; CHECK-P9-NEXT:    xscvdpsxws f8, f8
+; CHECK-P9-NEXT:    xscvdpsxws f9, f9
+; CHECK-P9-NEXT:    xscvdpsxws f10, f10
+; CHECK-P9-NEXT:    xscvdpsxws f11, f11
+; CHECK-P9-NEXT:    xscvdpsxws f12, f12
+; CHECK-P9-NEXT:    xscvdpsxws f13, f13
+; CHECK-P9-NEXT:    xscvdpsxws v2, v2
+; CHECK-P9-NEXT:    xscvdpsxws v3, v3
+; CHECK-P9-NEXT:    mfvsrwz r3, f5
+; CHECK-P9-NEXT:    mfvsrwz r4, f4
+; CHECK-P9-NEXT:    mfvsrwz r5, f3
+; CHECK-P9-NEXT:    mfvsrwz r6, f2
+; CHECK-P9-NEXT:    mfvsrwz r11, f7
+; CHECK-P9-NEXT:    mfvsrwz r12, f6
+; CHECK-P9-NEXT:    mfvsrwz r0, f1
+; CHECK-P9-NEXT:    mfvsrwz r30, f0
+; CHECK-P9-NEXT:    mfvsrwz r7, f8
+; CHECK-P9-NEXT:    mfvsrwz r8, f9
+; CHECK-P9-NEXT:    mfvsrwz r9, f10
+; CHECK-P9-NEXT:    mfvsrwz r10, f11
+; CHECK-P9-NEXT:    mfvsrwz r29, f12
+; CHECK-P9-NEXT:    mfvsrwz r28, f13
+; CHECK-P9-NEXT:    mfvsrwz r27, v2
+; CHECK-P9-NEXT:    mfvsrwz r26, v3
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    mtvsrd f1, r4
+; CHECK-P9-NEXT:    mtvsrd f2, r5
+; CHECK-P9-NEXT:    mtvsrd f3, r6
+; CHECK-P9-NEXT:    mtvsrd f8, r11
+; CHECK-P9-NEXT:    mtvsrd f9, r12
+; CHECK-P9-NEXT:    mtvsrd f10, r0
+; CHECK-P9-NEXT:    mtvsrd f11, r30
+; CHECK-P9-NEXT:    mtvsrd f4, r7
+; CHECK-P9-NEXT:    mtvsrd f5, r8
+; CHECK-P9-NEXT:    mtvsrd f6, r9
+; CHECK-P9-NEXT:    mtvsrd f7, r10
+; CHECK-P9-NEXT:    mtvsrd f12, r29
+; CHECK-P9-NEXT:    mtvsrd f13, r28
+; CHECK-P9-NEXT:    mtvsrd v2, r27
+; CHECK-P9-NEXT:    mtvsrd v3, r26
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    xxswapd v5, vs1
+; CHECK-P9-NEXT:    xxswapd v0, vs2
+; CHECK-P9-NEXT:    xxswapd v1, vs3
+; CHECK-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v6, vs4
+; CHECK-P9-NEXT:    xxswapd v7, vs5
+; CHECK-P9-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xxswapd v8, vs6
+; CHECK-P9-NEXT:    xxswapd v9, vs7
+; CHECK-P9-NEXT:    xxswapd v10, vs8
+; CHECK-P9-NEXT:    xxswapd v11, vs12
+; CHECK-P9-NEXT:    xxswapd v12, vs9
+; CHECK-P9-NEXT:    xxswapd v13, vs13
+; CHECK-P9-NEXT:    xxswapd v14, vs10
+; CHECK-P9-NEXT:    xxswapd v2, v2
+; CHECK-P9-NEXT:    xxswapd v15, vs11
+; CHECK-P9-NEXT:    xxswapd v3, v3
+; CHECK-P9-NEXT:    vmrglb v4, v4, v6
+; CHECK-P9-NEXT:    vmrglb v5, v5, v7
+; CHECK-P9-NEXT:    vmrglb v0, v0, v8
+; CHECK-P9-NEXT:    vmrglb v1, v1, v9
+; CHECK-P9-NEXT:    vmrglb v6, v10, v11
+; CHECK-P9-NEXT:    vmrglb v7, v12, v13
+; CHECK-P9-NEXT:    vmrglb v2, v14, v2
+; CHECK-P9-NEXT:    vmrglb v3, v15, v3
+; CHECK-P9-NEXT:    vmrglh v4, v5, v4
+; CHECK-P9-NEXT:    vmrglh v5, v1, v0
+; CHECK-P9-NEXT:    vmrglh v0, v7, v6
+; CHECK-P9-NEXT:    vmrglh v2, v3, v2
+; CHECK-P9-NEXT:    vmrglw v3, v5, v4
+; CHECK-P9-NEXT:    vmrglw v2, v2, v0
+; CHECK-P9-NEXT:    xxmrgld v2, v2, v3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 64(r3)
+; CHECK-BE-NEXT:    lxv vs3, 80(r3)
+; CHECK-BE-NEXT:    lxv vs4, 96(r3)
+; CHECK-BE-NEXT:    lxv vs5, 112(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs6, 32(r3)
+; CHECK-BE-NEXT:    lxv vs7, 48(r3)
+; CHECK-BE-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxswapd vs8, vs5
+; CHECK-BE-NEXT:    xxswapd vs9, vs4
+; CHECK-BE-NEXT:    xxswapd vs10, vs3
+; CHECK-BE-NEXT:    xxswapd vs11, vs2
+; CHECK-BE-NEXT:    xxswapd vs12, vs7
+; CHECK-BE-NEXT:    xxswapd vs13, vs6
+; CHECK-BE-NEXT:    xxswapd v2, vs1
+; CHECK-BE-NEXT:    xxswapd v3, vs0
+; CHECK-BE-NEXT:    xscvdpsxws f5, f5
+; CHECK-BE-NEXT:    xscvdpsxws f4, f4
+; CHECK-BE-NEXT:    xscvdpsxws f3, f3
+; CHECK-BE-NEXT:    xscvdpsxws f2, f2
+; CHECK-BE-NEXT:    xscvdpsxws f7, f7
+; CHECK-BE-NEXT:    xscvdpsxws f6, f6
+; CHECK-BE-NEXT:    xscvdpsxws f1, f1
+; CHECK-BE-NEXT:    xscvdpsxws f0, f0
+; CHECK-BE-NEXT:    xscvdpsxws f8, f8
+; CHECK-BE-NEXT:    xscvdpsxws f9, f9
+; CHECK-BE-NEXT:    xscvdpsxws f10, f10
+; CHECK-BE-NEXT:    xscvdpsxws f11, f11
+; CHECK-BE-NEXT:    xscvdpsxws f12, f12
+; CHECK-BE-NEXT:    xscvdpsxws f13, f13
+; CHECK-BE-NEXT:    xscvdpsxws v2, v2
+; CHECK-BE-NEXT:    xscvdpsxws v3, v3
+; CHECK-BE-NEXT:    mfvsrwz r3, f5
+; CHECK-BE-NEXT:    mfvsrwz r4, f4
+; CHECK-BE-NEXT:    mfvsrwz r5, f3
+; CHECK-BE-NEXT:    mfvsrwz r6, f2
+; CHECK-BE-NEXT:    mfvsrwz r11, f7
+; CHECK-BE-NEXT:    mfvsrwz r12, f6
+; CHECK-BE-NEXT:    mfvsrwz r0, f1
+; CHECK-BE-NEXT:    mfvsrwz r30, f0
+; CHECK-BE-NEXT:    mfvsrwz r7, f8
+; CHECK-BE-NEXT:    mfvsrwz r8, f9
+; CHECK-BE-NEXT:    mfvsrwz r9, f10
+; CHECK-BE-NEXT:    mfvsrwz r10, f11
+; CHECK-BE-NEXT:    mfvsrwz r29, f12
+; CHECK-BE-NEXT:    mfvsrwz r28, f13
+; CHECK-BE-NEXT:    mfvsrwz r27, v2
+; CHECK-BE-NEXT:    mfvsrwz r26, v3
+; CHECK-BE-NEXT:    sldi r3, r3, 56
+; CHECK-BE-NEXT:    sldi r4, r4, 56
+; CHECK-BE-NEXT:    sldi r5, r5, 56
+; CHECK-BE-NEXT:    sldi r6, r6, 56
+; CHECK-BE-NEXT:    sldi r11, r11, 56
+; CHECK-BE-NEXT:    sldi r12, r12, 56
+; CHECK-BE-NEXT:    sldi r0, r0, 56
+; CHECK-BE-NEXT:    sldi r30, r30, 56
+; CHECK-BE-NEXT:    sldi r7, r7, 56
+; CHECK-BE-NEXT:    sldi r8, r8, 56
+; CHECK-BE-NEXT:    sldi r9, r9, 56
+; CHECK-BE-NEXT:    sldi r10, r10, 56
+; CHECK-BE-NEXT:    sldi r29, r29, 56
+; CHECK-BE-NEXT:    sldi r28, r28, 56
+; CHECK-BE-NEXT:    sldi r27, r27, 56
+; CHECK-BE-NEXT:    sldi r26, r26, 56
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    mtvsrd v3, r4
+; CHECK-BE-NEXT:    mtvsrd v4, r5
+; CHECK-BE-NEXT:    mtvsrd v5, r6
+; CHECK-BE-NEXT:    mtvsrd v8, r11
+; CHECK-BE-NEXT:    mtvsrd v10, r12
+; CHECK-BE-NEXT:    mtvsrd v12, r0
+; CHECK-BE-NEXT:    mtvsrd v14, r30
+; CHECK-BE-NEXT:    mtvsrd v0, r7
+; CHECK-BE-NEXT:    mtvsrd v1, r8
+; CHECK-BE-NEXT:    mtvsrd v6, r9
+; CHECK-BE-NEXT:    mtvsrd v7, r10
+; CHECK-BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v9, r29
+; CHECK-BE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrd v11, r28
+; CHECK-BE-NEXT:    mtvsrd v13, r27
+; CHECK-BE-NEXT:    mtvsrd v15, r26
+; CHECK-BE-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    vmrghb v2, v2, v0
+; CHECK-BE-NEXT:    vmrghb v3, v3, v1
+; CHECK-BE-NEXT:    vmrghb v4, v4, v6
+; CHECK-BE-NEXT:    vmrghb v5, v5, v7
+; CHECK-BE-NEXT:    vmrghb v0, v8, v9
+; CHECK-BE-NEXT:    vmrghb v1, v10, v11
+; CHECK-BE-NEXT:    vmrghb v6, v12, v13
+; CHECK-BE-NEXT:    vmrghb v7, v14, v15
+; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vmrghh v3, v5, v4
+; CHECK-BE-NEXT:    vmrghh v4, v1, v0
+; CHECK-BE-NEXT:    vmrghh v5, v7, v6
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    vmrghw v3, v5, v4
+; CHECK-BE-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x double>, <16 x double>* %0, align 128
+  %1 = fptosi <16 x double> %a to <16 x i8>
+  ret <16 x i8> %1
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_fp_to_i_4byte_elts.ll b/test/CodeGen/PowerPC/vec_conv_fp_to_i_4byte_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b1ef3d0d994f03ae84d34ea68688d96d998a2eee
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_fp_to_i_4byte_elts.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i64 @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xvcvspuxws vs0, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xvcvspuxws vs0, v2
+; CHECK-P9-NEXT:    mfvsrld r3, vs0
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xvcvspuxws vs0, vs0
+; CHECK-BE-NEXT:    mfvsrd r3, f0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x float>
+  %1 = fptoui <2 x float> %0 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to i64
+  ret i64 %2
+}
+
+define <4 x i32> @test4elt(<4 x float> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xvcvspuxws v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xvcvspuxws v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xvcvspuxws v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <4 x float> %a to <4 x i32>
+  ret <4 x i32> %0
+}
+
+define void @test8elt(<8 x i32>* noalias nocapture sret %agg.result, <8 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    xvcvspuxws v3, v3
+; CHECK-P8-NEXT:    xvcvspuxws v2, v2
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xvcvspuxws vs1, vs1
+; CHECK-P9-NEXT:    xvcvspuxws vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 16(r3)
+; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xvcvspuxws vs1, vs1
+; CHECK-BE-NEXT:    xvcvspuxws vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x float>, <8 x float>* %0, align 32
+  %1 = fptoui <8 x float> %a to <8 x i32>
+  store <8 x i32> %1, <8 x i32>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt(<16 x i32>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    lvx v5, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r6
+; CHECK-P8-NEXT:    lvx v4, r4, r7
+; CHECK-P8-NEXT:    xvcvspuxws v5, v5
+; CHECK-P8-NEXT:    xvcvspuxws v2, v2
+; CHECK-P8-NEXT:    xvcvspuxws v3, v3
+; CHECK-P8-NEXT:    xvcvspuxws v4, v4
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    stvx v3, r3, r6
+; CHECK-P8-NEXT:    stvx v4, r3, r7
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    xvcvspuxws vs3, vs3
+; CHECK-P9-NEXT:    xvcvspuxws vs2, vs2
+; CHECK-P9-NEXT:    xvcvspuxws vs1, vs1
+; CHECK-P9-NEXT:    xvcvspuxws vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs1, 32(r3)
+; CHECK-P9-NEXT:    stxv vs2, 16(r3)
+; CHECK-P9-NEXT:    stxv vs3, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    xvcvspuxws vs3, vs3
+; CHECK-BE-NEXT:    xvcvspuxws vs2, vs2
+; CHECK-BE-NEXT:    xvcvspuxws vs1, vs1
+; CHECK-BE-NEXT:    xvcvspuxws vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs3, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x float>, <16 x float>* %0, align 64
+  %1 = fptoui <16 x float> %a to <16 x i32>
+  store <16 x i32> %1, <16 x i32>* %agg.result, align 64
+  ret void
+}
+
+define i64 @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xvcvspsxws vs0, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xvcvspsxws vs0, v2
+; CHECK-P9-NEXT:    mfvsrld r3, vs0
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xvcvspsxws vs0, vs0
+; CHECK-BE-NEXT:    mfvsrd r3, f0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x float>
+  %1 = fptosi <2 x float> %0 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to i64
+  ret i64 %2
+}
+
+define <4 x i32> @test4elt_signed(<4 x float> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xvcvspsxws v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xvcvspsxws v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xvcvspsxws v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptosi <4 x float> %a to <4 x i32>
+  ret <4 x i32> %0
+}
+
+define void @test8elt_signed(<8 x i32>* noalias nocapture sret %agg.result, <8 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    xvcvspsxws v3, v3
+; CHECK-P8-NEXT:    xvcvspsxws v2, v2
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xvcvspsxws vs1, vs1
+; CHECK-P9-NEXT:    xvcvspsxws vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 16(r3)
+; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xvcvspsxws vs1, vs1
+; CHECK-BE-NEXT:    xvcvspsxws vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x float>, <8 x float>* %0, align 32
+  %1 = fptosi <8 x float> %a to <8 x i32>
+  store <8 x i32> %1, <8 x i32>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt_signed(<16 x i32>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    lvx v5, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r6
+; CHECK-P8-NEXT:    lvx v4, r4, r7
+; CHECK-P8-NEXT:    xvcvspsxws v5, v5
+; CHECK-P8-NEXT:    xvcvspsxws v2, v2
+; CHECK-P8-NEXT:    xvcvspsxws v3, v3
+; CHECK-P8-NEXT:    xvcvspsxws v4, v4
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    stvx v3, r3, r6
+; CHECK-P8-NEXT:    stvx v4, r3, r7
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    xvcvspsxws vs3, vs3
+; CHECK-P9-NEXT:    xvcvspsxws vs2, vs2
+; CHECK-P9-NEXT:    xvcvspsxws vs1, vs1
+; CHECK-P9-NEXT:    xvcvspsxws vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs1, 32(r3)
+; CHECK-P9-NEXT:    stxv vs2, 16(r3)
+; CHECK-P9-NEXT:    stxv vs3, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    xvcvspsxws vs3, vs3
+; CHECK-BE-NEXT:    xvcvspsxws vs2, vs2
+; CHECK-BE-NEXT:    xvcvspsxws vs1, vs1
+; CHECK-BE-NEXT:    xvcvspsxws vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs3, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x float>, <16 x float>* %0, align 64
+  %1 = fptosi <16 x float> %a to <16 x i32>
+  store <16 x i32> %1, <16 x i32>* %agg.result, align 64
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_fp_to_i_8byte_elts.ll b/test/CodeGen/PowerPC/vec_conv_fp_to_i_8byte_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a5097b070c134eff1ad0a2964a77304ebf050316
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_fp_to_i_8byte_elts.ll
@@ -0,0 +1,438 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define <2 x i64> @test2elt(<2 x double> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xvcvdpuxds v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xvcvdpuxds v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xvcvdpuxds v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptoui <2 x double> %a to <2 x i64>
+  ret <2 x i64> %0
+}
+
+define void @test4elt(<4 x i64>* noalias nocapture sret %agg.result, <4 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 16(r3)
+; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x double>, <4 x double>* %0, align 32
+  %1 = fptoui <4 x double> %a to <4 x i64>
+  store <4 x i64> %1, <4 x i64>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt(<8 x i64>* noalias nocapture sret %agg.result, <8 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P8-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P8-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs1, 32(r3)
+; CHECK-P9-NEXT:    stxv vs2, 16(r3)
+; CHECK-P9-NEXT:    stxv vs3, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs3, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x double>, <8 x double>* %0, align 64
+  %1 = fptoui <8 x double> %a to <8 x i64>
+  store <8 x i64> %1, <8 x i64>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt(<16 x i64>* noalias nocapture sret %agg.result, <16 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 64
+; CHECK-P8-NEXT:    li r8, 96
+; CHECK-P8-NEXT:    li r9, 112
+; CHECK-P8-NEXT:    li r10, 80
+; CHECK-P8-NEXT:    li r11, 48
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r8
+; CHECK-P8-NEXT:    lxvd2x vs4, r4, r9
+; CHECK-P8-NEXT:    lxvd2x vs5, r4, r10
+; CHECK-P8-NEXT:    lxvd2x vs6, r4, r11
+; CHECK-P8-NEXT:    lxvd2x vs7, 0, r4
+; CHECK-P8-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P8-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P8-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P8-NEXT:    xvcvdpuxds vs4, vs4
+; CHECK-P8-NEXT:    xvcvdpuxds vs5, vs5
+; CHECK-P8-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-P8-NEXT:    xvcvdpuxds vs7, vs7
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r9
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r8
+; CHECK-P8-NEXT:    stxvd2x vs5, r3, r10
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs6, r3, r11
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs7, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    lxv vs4, 112(r4)
+; CHECK-P9-NEXT:    lxv vs5, 96(r4)
+; CHECK-P9-NEXT:    lxv vs6, 80(r4)
+; CHECK-P9-NEXT:    lxv vs7, 64(r4)
+; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs7, vs7
+; CHECK-P9-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-P9-NEXT:    xvcvdpuxds vs5, vs5
+; CHECK-P9-NEXT:    xvcvdpuxds vs4, vs4
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs1, 32(r3)
+; CHECK-P9-NEXT:    stxv vs2, 16(r3)
+; CHECK-P9-NEXT:    stxv vs3, 0(r3)
+; CHECK-P9-NEXT:    stxv vs4, 112(r3)
+; CHECK-P9-NEXT:    stxv vs5, 96(r3)
+; CHECK-P9-NEXT:    stxv vs6, 80(r3)
+; CHECK-P9-NEXT:    stxv vs7, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    lxv vs4, 112(r4)
+; CHECK-BE-NEXT:    lxv vs5, 96(r4)
+; CHECK-BE-NEXT:    lxv vs6, 80(r4)
+; CHECK-BE-NEXT:    lxv vs7, 64(r4)
+; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs7, vs7
+; CHECK-BE-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-BE-NEXT:    xvcvdpuxds vs5, vs5
+; CHECK-BE-NEXT:    xvcvdpuxds vs4, vs4
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs3, 0(r3)
+; CHECK-BE-NEXT:    stxv vs4, 112(r3)
+; CHECK-BE-NEXT:    stxv vs5, 96(r3)
+; CHECK-BE-NEXT:    stxv vs6, 80(r3)
+; CHECK-BE-NEXT:    stxv vs7, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x double>, <16 x double>* %0, align 128
+  %1 = fptoui <16 x double> %a to <16 x i64>
+  store <16 x i64> %1, <16 x i64>* %agg.result, align 128
+  ret void
+}
+
+define <2 x i64> @test2elt_signed(<2 x double> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xvcvdpsxds v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xvcvdpsxds v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xvcvdpsxds v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = fptosi <2 x double> %a to <2 x i64>
+  ret <2 x i64> %0
+}
+
+define void @test4elt_signed(<4 x i64>* noalias nocapture sret %agg.result, <4 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-P8-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 16(r3)
+; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x double>, <4 x double>* %0, align 32
+  %1 = fptosi <4 x double> %a to <4 x i64>
+  store <4 x i64> %1, <4 x i64>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt_signed(<8 x i64>* noalias nocapture sret %agg.result, <8 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    xvcvdpsxds vs3, vs3
+; CHECK-P8-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-P8-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-P8-NEXT:    xvcvdpsxds vs2, vs2
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    xvcvdpsxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpsxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs1, 32(r3)
+; CHECK-P9-NEXT:    stxv vs2, 16(r3)
+; CHECK-P9-NEXT:    stxv vs3, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    xvcvdpsxds vs3, vs3
+; CHECK-BE-NEXT:    xvcvdpsxds vs2, vs2
+; CHECK-BE-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs3, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x double>, <8 x double>* %0, align 64
+  %1 = fptosi <8 x double> %a to <8 x i64>
+  store <8 x i64> %1, <8 x i64>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt_signed(<16 x i64>* noalias nocapture sret %agg.result, <16 x double>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 64
+; CHECK-P8-NEXT:    li r8, 96
+; CHECK-P8-NEXT:    li r9, 112
+; CHECK-P8-NEXT:    li r10, 80
+; CHECK-P8-NEXT:    li r11, 48
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r8
+; CHECK-P8-NEXT:    lxvd2x vs4, r4, r9
+; CHECK-P8-NEXT:    lxvd2x vs5, r4, r10
+; CHECK-P8-NEXT:    lxvd2x vs6, r4, r11
+; CHECK-P8-NEXT:    lxvd2x vs7, 0, r4
+; CHECK-P8-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-P8-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-P8-NEXT:    xvcvdpsxds vs2, vs2
+; CHECK-P8-NEXT:    xvcvdpsxds vs3, vs3
+; CHECK-P8-NEXT:    xvcvdpsxds vs4, vs4
+; CHECK-P8-NEXT:    xvcvdpsxds vs5, vs5
+; CHECK-P8-NEXT:    xvcvdpsxds vs6, vs6
+; CHECK-P8-NEXT:    xvcvdpsxds vs7, vs7
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r9
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r8
+; CHECK-P8-NEXT:    stxvd2x vs5, r3, r10
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs6, r3, r11
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs7, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    lxv vs4, 112(r4)
+; CHECK-P9-NEXT:    lxv vs5, 96(r4)
+; CHECK-P9-NEXT:    lxv vs6, 80(r4)
+; CHECK-P9-NEXT:    lxv vs7, 64(r4)
+; CHECK-P9-NEXT:    xvcvdpsxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpsxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-P9-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpsxds vs7, vs7
+; CHECK-P9-NEXT:    xvcvdpsxds vs6, vs6
+; CHECK-P9-NEXT:    xvcvdpsxds vs5, vs5
+; CHECK-P9-NEXT:    xvcvdpsxds vs4, vs4
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs1, 32(r3)
+; CHECK-P9-NEXT:    stxv vs2, 16(r3)
+; CHECK-P9-NEXT:    stxv vs3, 0(r3)
+; CHECK-P9-NEXT:    stxv vs4, 112(r3)
+; CHECK-P9-NEXT:    stxv vs5, 96(r3)
+; CHECK-P9-NEXT:    stxv vs6, 80(r3)
+; CHECK-P9-NEXT:    stxv vs7, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    lxv vs4, 112(r4)
+; CHECK-BE-NEXT:    lxv vs5, 96(r4)
+; CHECK-BE-NEXT:    lxv vs6, 80(r4)
+; CHECK-BE-NEXT:    lxv vs7, 64(r4)
+; CHECK-BE-NEXT:    xvcvdpsxds vs3, vs3
+; CHECK-BE-NEXT:    xvcvdpsxds vs2, vs2
+; CHECK-BE-NEXT:    xvcvdpsxds vs1, vs1
+; CHECK-BE-NEXT:    xvcvdpsxds vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpsxds vs7, vs7
+; CHECK-BE-NEXT:    xvcvdpsxds vs6, vs6
+; CHECK-BE-NEXT:    xvcvdpsxds vs5, vs5
+; CHECK-BE-NEXT:    xvcvdpsxds vs4, vs4
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs3, 0(r3)
+; CHECK-BE-NEXT:    stxv vs4, 112(r3)
+; CHECK-BE-NEXT:    stxv vs5, 96(r3)
+; CHECK-BE-NEXT:    stxv vs6, 80(r3)
+; CHECK-BE-NEXT:    stxv vs7, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x double>, <16 x double>* %0, align 128
+  %1 = fptosi <16 x double> %a to <16 x i64>
+  store <16 x i64> %1, <16 x i64>* %agg.result, align 128
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll b/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b5221552ecd03ba21455aa50b09e9b577edf803c
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll
@@ -0,0 +1,1366 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i64 @test2elt(i32 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 48
+; CHECK-P8-NEXT:    rldicl r3, r3, 48, 48
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    rlwinm r3, r3, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r4
+; CHECK-P8-NEXT:    mtvsrwz f1, r3
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xscvdpspn vs0, f0
+; CHECK-P8-NEXT:    xscvdpspn vs1, f1
+; CHECK-P8-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P8-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 2
+; CHECK-P9-NEXT:    vextuhrx r3, r3, v2
+; CHECK-P9-NEXT:    vextuhrx r4, r4, v2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r3
+; CHECK-P9-NEXT:    mtvsrwz f1, r4
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvdpspn vs0, f0
+; CHECK-P9-NEXT:    xscvdpspn vs1, f1
+; CHECK-P9-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P9-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    li r3, 2
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    vextuhlx r3, r3, v2
+; CHECK-BE-NEXT:    vextuhlx r4, r4, v2
+; CHECK-BE-NEXT:    rlwinm r3, r3, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r3
+; CHECK-BE-NEXT:    mtvsrwz f1, r4
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvdpspn v2, f0
+; CHECK-BE-NEXT:    xscvdpspn v3, f1
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i32 %a.coerce to <2 x i16>
+  %1 = uitofp <2 x i16> %0 to <2 x float>
+  %2 = bitcast <2 x float> %1 to i64
+  ret i64 %2
+}
+
+define <4 x float> @test4elt(i64 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 48
+; CHECK-P8-NEXT:    rldicl r5, r3, 32, 48
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r4
+; CHECK-P8-NEXT:    rldicl r4, r3, 48, 48
+; CHECK-P8-NEXT:    rldicl r3, r3, 16, 48
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    rlwinm r3, r3, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f1, r5
+; CHECK-P8-NEXT:    mtvsrwz f2, r4
+; CHECK-P8-NEXT:    mtvsrwz f3, r3
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 4
+; CHECK-P9-NEXT:    li r5, 2
+; CHECK-P9-NEXT:    li r6, 6
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vextuhrx r3, r3, v2
+; CHECK-P9-NEXT:    vextuhrx r4, r4, v2
+; CHECK-P9-NEXT:    vextuhrx r5, r5, v2
+; CHECK-P9-NEXT:    vextuhrx r6, r6, v2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r3
+; CHECK-P9-NEXT:    mtvsrwz f1, r4
+; CHECK-P9-NEXT:    mtvsrwz f2, r5
+; CHECK-P9-NEXT:    mtvsrwz f3, r6
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r4, 6
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    li r3, 2
+; CHECK-BE-NEXT:    li r5, 4
+; CHECK-BE-NEXT:    li r6, 0
+; CHECK-BE-NEXT:    vextuhlx r4, r4, v2
+; CHECK-BE-NEXT:    vextuhlx r3, r3, v2
+; CHECK-BE-NEXT:    vextuhlx r5, r5, v2
+; CHECK-BE-NEXT:    vextuhlx r6, r6, v2
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r3, r3, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r4
+; CHECK-BE-NEXT:    mtvsrwz f1, r3
+; CHECK-BE-NEXT:    mtvsrwz f2, r5
+; CHECK-BE-NEXT:    mtvsrwz f3, r6
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <4 x i16>
+  %1 = uitofp <4 x i16> %0 to <4 x float>
+  ret <4 x float> %1
+}
+
+define void @test8elt(<8 x float>* noalias nocapture sret %agg.result, <8 x i16> %a) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mfvsrd r5, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    clrldi r6, r5, 48
+; CHECK-P8-NEXT:    rldicl r7, r5, 32, 48
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    mfvsrd r8, f0
+; CHECK-P8-NEXT:    rlwinm r7, r7, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f1, r6
+; CHECK-P8-NEXT:    rldicl r6, r5, 48, 48
+; CHECK-P8-NEXT:    rldicl r5, r5, 16, 48
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r7
+; CHECK-P8-NEXT:    mtvsrwz f2, r6
+; CHECK-P8-NEXT:    clrldi r6, r8, 48
+; CHECK-P8-NEXT:    mtvsrwz f3, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 32, 48
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 48, 48
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f5, r5
+; CHECK-P8-NEXT:    rlwinm r5, r6, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f6, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 16, 48
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    mtvsrwz f7, r5
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    xscvuxdsp f4, f4
+; CHECK-P8-NEXT:    xscvuxdsp f5, f5
+; CHECK-P8-NEXT:    xscvuxdsp f6, f6
+; CHECK-P8-NEXT:    xscvuxdsp f7, f7
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs5, vs4
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs4, vs7, vs6
+; CHECK-P8-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs4
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs2
+; CHECK-P8-NEXT:    vmrgew v3, v4, v3
+; CHECK-P8-NEXT:    vmrgew v2, v5, v2
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r4
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    li r4, 8
+; CHECK-P9-NEXT:    li r5, 12
+; CHECK-P9-NEXT:    li r6, 10
+; CHECK-P9-NEXT:    li r7, 14
+; CHECK-P9-NEXT:    li r8, 0
+; CHECK-P9-NEXT:    li r9, 4
+; CHECK-P9-NEXT:    li r10, 2
+; CHECK-P9-NEXT:    li r11, 6
+; CHECK-P9-NEXT:    vextuhrx r4, r4, v2
+; CHECK-P9-NEXT:    vextuhrx r5, r5, v2
+; CHECK-P9-NEXT:    vextuhrx r6, r6, v2
+; CHECK-P9-NEXT:    vextuhrx r7, r7, v2
+; CHECK-P9-NEXT:    vextuhrx r8, r8, v2
+; CHECK-P9-NEXT:    vextuhrx r9, r9, v2
+; CHECK-P9-NEXT:    vextuhrx r10, r10, v2
+; CHECK-P9-NEXT:    vextuhrx r11, r11, v2
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r7, r7, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r8, r8, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r9, r9, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r10, r10, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r11, r11, 0, 16, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r4
+; CHECK-P9-NEXT:    mtvsrwz f1, r5
+; CHECK-P9-NEXT:    mtvsrwz f2, r6
+; CHECK-P9-NEXT:    mtvsrwz f3, r7
+; CHECK-P9-NEXT:    mtvsrwz f4, r8
+; CHECK-P9-NEXT:    mtvsrwz f5, r9
+; CHECK-P9-NEXT:    mtvsrwz f6, r10
+; CHECK-P9-NEXT:    mtvsrwz f7, r11
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xscvuxdsp f4, f4
+; CHECK-P9-NEXT:    xscvuxdsp f5, f5
+; CHECK-P9-NEXT:    xscvuxdsp f6, f6
+; CHECK-P9-NEXT:    xscvuxdsp f7, f7
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    stxv v3, 0(r3)
+; CHECK-P9-NEXT:    stxv v2, 16(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r4, 12
+; CHECK-BE-NEXT:    li r5, 8
+; CHECK-BE-NEXT:    li r6, 10
+; CHECK-BE-NEXT:    li r7, 14
+; CHECK-BE-NEXT:    li r8, 6
+; CHECK-BE-NEXT:    li r9, 2
+; CHECK-BE-NEXT:    li r10, 4
+; CHECK-BE-NEXT:    li r11, 0
+; CHECK-BE-NEXT:    vextuhlx r4, r4, v2
+; CHECK-BE-NEXT:    vextuhlx r5, r5, v2
+; CHECK-BE-NEXT:    vextuhlx r6, r6, v2
+; CHECK-BE-NEXT:    vextuhlx r7, r7, v2
+; CHECK-BE-NEXT:    vextuhlx r8, r8, v2
+; CHECK-BE-NEXT:    vextuhlx r9, r9, v2
+; CHECK-BE-NEXT:    vextuhlx r10, r10, v2
+; CHECK-BE-NEXT:    vextuhlx r11, r11, v2
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r7, r7, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r8, r8, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r9, r9, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r10, r10, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r11, r11, 0, 16, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r4
+; CHECK-BE-NEXT:    mtvsrwz f1, r5
+; CHECK-BE-NEXT:    mtvsrwz f2, r6
+; CHECK-BE-NEXT:    mtvsrwz f3, r7
+; CHECK-BE-NEXT:    mtvsrwz f4, r8
+; CHECK-BE-NEXT:    mtvsrwz f5, r9
+; CHECK-BE-NEXT:    mtvsrwz f6, r10
+; CHECK-BE-NEXT:    mtvsrwz f7, r11
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xscvuxdsp f4, f4
+; CHECK-BE-NEXT:    xscvuxdsp f5, f5
+; CHECK-BE-NEXT:    xscvuxdsp f6, f6
+; CHECK-BE-NEXT:    xscvuxdsp f7, f7
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs3
+; CHECK-BE-NEXT:    vmrgew v2, v2, v3
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    stxv v3, 0(r3)
+; CHECK-BE-NEXT:    stxv v2, 16(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = uitofp <8 x i16> %a to <8 x float>
+  store <8 x float> %0, <8 x float>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt(<16 x float>* noalias nocapture sret %agg.result, <16 x i16>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    mfvsrd r7, v3
+; CHECK-P8-NEXT:    xxswapd vs8, v3
+; CHECK-P8-NEXT:    mfvsrd r6, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    clrldi r4, r6, 48
+; CHECK-P8-NEXT:    rldicl r8, r6, 32, 48
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    rlwinm r8, r8, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r4
+; CHECK-P8-NEXT:    rldicl r4, r6, 48, 48
+; CHECK-P8-NEXT:    rldicl r6, r6, 16, 48
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f1, r8
+; CHECK-P8-NEXT:    clrldi r8, r7, 48
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f3, r4
+; CHECK-P8-NEXT:    rlwinm r4, r8, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r7, 32, 48
+; CHECK-P8-NEXT:    mtvsrwz f5, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 48, 48
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    mfvsrd r8, f2
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f2, r6
+; CHECK-P8-NEXT:    rldicl r6, r7, 16, 48
+; CHECK-P8-NEXT:    mtvsrwz f6, r4
+; CHECK-P8-NEXT:    clrldi r4, r8, 48
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f7, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 32, 48
+; CHECK-P8-NEXT:    mtvsrwz f9, r4
+; CHECK-P8-NEXT:    rldicl r4, r8, 48, 48
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    mtvsrwz f10, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 16, 48
+; CHECK-P8-NEXT:    mtvsrwz f11, r4
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    mfvsrd r4, f8
+; CHECK-P8-NEXT:    mtvsrwz f8, r6
+; CHECK-P8-NEXT:    clrldi r6, r4, 48
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xscvuxdsp f4, f4
+; CHECK-P8-NEXT:    mtvsrwz f12, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 32, 48
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    xscvuxdsp f5, f5
+; CHECK-P8-NEXT:    mtvsrwz f13, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 48, 48
+; CHECK-P8-NEXT:    rldicl r4, r4, 16, 48
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    mtvsrwz v2, r6
+; CHECK-P8-NEXT:    mtvsrwz v3, r4
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xscvuxdsp f6, f6
+; CHECK-P8-NEXT:    xscvuxdsp f7, f7
+; CHECK-P8-NEXT:    xscvuxdsp f9, f9
+; CHECK-P8-NEXT:    xscvuxdsp f10, f10
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs5
+; CHECK-P8-NEXT:    xscvuxdsp f11, f11
+; CHECK-P8-NEXT:    xscvuxdsp f8, f8
+; CHECK-P8-NEXT:    xscvuxdsp f12, f12
+; CHECK-P8-NEXT:    xscvuxdsp f13, f13
+; CHECK-P8-NEXT:    xxmrghd vs5, vs7, vs6
+; CHECK-P8-NEXT:    xscvuxdsp f1, v2
+; CHECK-P8-NEXT:    xscvuxdsp f4, v3
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs10, vs9
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs3
+; CHECK-P8-NEXT:    xxmrghd vs3, vs8, vs11
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P8-NEXT:    xxmrghd vs2, vs13, vs12
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs5
+; CHECK-P8-NEXT:    xvcvdpsp v0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs4, vs1
+; CHECK-P8-NEXT:    xvcvdpsp v1, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v6, vs2
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    xvcvdpsp v7, vs1
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    vmrgew v4, v1, v0
+; CHECK-P8-NEXT:    stvx v2, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    vmrgew v5, v7, v6
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    stvx v4, r3, r4
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv v3, 0(r4)
+; CHECK-P9-NEXT:    lxv v2, 16(r4)
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    li r5, 4
+; CHECK-P9-NEXT:    li r6, 2
+; CHECK-P9-NEXT:    li r7, 6
+; CHECK-P9-NEXT:    li r8, 8
+; CHECK-P9-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r9, 12
+; CHECK-P9-NEXT:    li r10, 10
+; CHECK-P9-NEXT:    li r11, 14
+; CHECK-P9-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    vextuhrx r12, r4, v3
+; CHECK-P9-NEXT:    vextuhrx r0, r5, v3
+; CHECK-P9-NEXT:    vextuhrx r30, r6, v3
+; CHECK-P9-NEXT:    vextuhrx r29, r7, v3
+; CHECK-P9-NEXT:    vextuhrx r28, r8, v3
+; CHECK-P9-NEXT:    vextuhrx r27, r9, v3
+; CHECK-P9-NEXT:    vextuhrx r26, r10, v3
+; CHECK-P9-NEXT:    vextuhrx r25, r11, v3
+; CHECK-P9-NEXT:    vextuhrx r4, r4, v2
+; CHECK-P9-NEXT:    vextuhrx r5, r5, v2
+; CHECK-P9-NEXT:    vextuhrx r6, r6, v2
+; CHECK-P9-NEXT:    vextuhrx r7, r7, v2
+; CHECK-P9-NEXT:    vextuhrx r8, r8, v2
+; CHECK-P9-NEXT:    vextuhrx r9, r9, v2
+; CHECK-P9-NEXT:    vextuhrx r10, r10, v2
+; CHECK-P9-NEXT:    vextuhrx r11, r11, v2
+; CHECK-P9-NEXT:    rlwinm r12, r12, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r0, r0, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r30, r30, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r29, r29, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r28, r28, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r27, r27, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r26, r26, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r25, r25, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r7, r7, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r8, r8, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r9, r9, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r10, r10, 0, 16, 31
+; CHECK-P9-NEXT:    rlwinm r11, r11, 0, 16, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r12
+; CHECK-P9-NEXT:    mtvsrwz f1, r0
+; CHECK-P9-NEXT:    mtvsrwz f2, r30
+; CHECK-P9-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f3, r29
+; CHECK-P9-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f4, r28
+; CHECK-P9-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f5, r27
+; CHECK-P9-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f6, r26
+; CHECK-P9-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f7, r25
+; CHECK-P9-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f8, r4
+; CHECK-P9-NEXT:    mtvsrwz f9, r5
+; CHECK-P9-NEXT:    mtvsrwz f10, r6
+; CHECK-P9-NEXT:    mtvsrwz f11, r7
+; CHECK-P9-NEXT:    mtvsrwz f12, r8
+; CHECK-P9-NEXT:    mtvsrwz f13, r9
+; CHECK-P9-NEXT:    mtvsrwz v2, r10
+; CHECK-P9-NEXT:    mtvsrwz v3, r11
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xscvuxdsp f4, f4
+; CHECK-P9-NEXT:    xscvuxdsp f5, f5
+; CHECK-P9-NEXT:    xscvuxdsp f6, f6
+; CHECK-P9-NEXT:    xscvuxdsp f7, f7
+; CHECK-P9-NEXT:    xscvuxdsp f8, f8
+; CHECK-P9-NEXT:    xscvuxdsp f9, f9
+; CHECK-P9-NEXT:    xscvuxdsp f10, f10
+; CHECK-P9-NEXT:    xscvuxdsp f11, f11
+; CHECK-P9-NEXT:    xscvuxdsp f12, f12
+; CHECK-P9-NEXT:    xscvuxdsp f13, f13
+; CHECK-P9-NEXT:    xscvuxdsp f31, v2
+; CHECK-P9-NEXT:    xscvuxdsp f30, v3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-P9-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-P9-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P9-NEXT:    xvcvdpsp v0, vs4
+; CHECK-P9-NEXT:    xvcvdpsp v1, vs5
+; CHECK-P9-NEXT:    xvcvdpsp v6, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v7, vs7
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    vmrgew v4, v1, v0
+; CHECK-P9-NEXT:    vmrgew v5, v7, v6
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    stxv v5, 48(r3)
+; CHECK-P9-NEXT:    stxv v4, 32(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
+; CHECK-BE-NEXT:    lxv v2, 16(r4)
+; CHECK-BE-NEXT:    li r4, 6
+; CHECK-BE-NEXT:    li r5, 2
+; CHECK-BE-NEXT:    li r6, 4
+; CHECK-BE-NEXT:    li r7, 0
+; CHECK-BE-NEXT:    li r8, 14
+; CHECK-BE-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r9, 10
+; CHECK-BE-NEXT:    li r10, 12
+; CHECK-BE-NEXT:    li r11, 8
+; CHECK-BE-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    vextuhlx r12, r4, v3
+; CHECK-BE-NEXT:    vextuhlx r0, r5, v3
+; CHECK-BE-NEXT:    vextuhlx r30, r6, v3
+; CHECK-BE-NEXT:    vextuhlx r29, r7, v3
+; CHECK-BE-NEXT:    vextuhlx r28, r8, v3
+; CHECK-BE-NEXT:    vextuhlx r27, r9, v3
+; CHECK-BE-NEXT:    vextuhlx r26, r10, v3
+; CHECK-BE-NEXT:    vextuhlx r25, r11, v3
+; CHECK-BE-NEXT:    vextuhlx r4, r4, v2
+; CHECK-BE-NEXT:    vextuhlx r5, r5, v2
+; CHECK-BE-NEXT:    vextuhlx r6, r6, v2
+; CHECK-BE-NEXT:    vextuhlx r7, r7, v2
+; CHECK-BE-NEXT:    vextuhlx r8, r8, v2
+; CHECK-BE-NEXT:    vextuhlx r9, r9, v2
+; CHECK-BE-NEXT:    vextuhlx r10, r10, v2
+; CHECK-BE-NEXT:    vextuhlx r11, r11, v2
+; CHECK-BE-NEXT:    rlwinm r12, r12, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r0, r0, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r30, r30, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r29, r29, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r28, r28, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r27, r27, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r26, r26, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r25, r25, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r7, r7, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r8, r8, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r9, r9, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r10, r10, 0, 16, 31
+; CHECK-BE-NEXT:    rlwinm r11, r11, 0, 16, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r12
+; CHECK-BE-NEXT:    mtvsrwz f1, r0
+; CHECK-BE-NEXT:    mtvsrwz f2, r30
+; CHECK-BE-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f3, r29
+; CHECK-BE-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f4, r28
+; CHECK-BE-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f5, r27
+; CHECK-BE-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f6, r26
+; CHECK-BE-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f7, r25
+; CHECK-BE-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f8, r4
+; CHECK-BE-NEXT:    mtvsrwz f9, r5
+; CHECK-BE-NEXT:    mtvsrwz f10, r6
+; CHECK-BE-NEXT:    mtvsrwz f11, r7
+; CHECK-BE-NEXT:    mtvsrwz f12, r8
+; CHECK-BE-NEXT:    mtvsrwz f13, r9
+; CHECK-BE-NEXT:    mtvsrwz v2, r10
+; CHECK-BE-NEXT:    mtvsrwz v3, r11
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xscvuxdsp f4, f4
+; CHECK-BE-NEXT:    xscvuxdsp f5, f5
+; CHECK-BE-NEXT:    xscvuxdsp f6, f6
+; CHECK-BE-NEXT:    xscvuxdsp f7, f7
+; CHECK-BE-NEXT:    xscvuxdsp f8, f8
+; CHECK-BE-NEXT:    xscvuxdsp f9, f9
+; CHECK-BE-NEXT:    xscvuxdsp f10, f10
+; CHECK-BE-NEXT:    xscvuxdsp f11, f11
+; CHECK-BE-NEXT:    xscvuxdsp f12, f12
+; CHECK-BE-NEXT:    xscvuxdsp f13, f13
+; CHECK-BE-NEXT:    xscvuxdsp f31, v2
+; CHECK-BE-NEXT:    xscvuxdsp f30, v3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-BE-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs3
+; CHECK-BE-NEXT:    xvcvdpsp v0, vs4
+; CHECK-BE-NEXT:    xvcvdpsp v1, vs5
+; CHECK-BE-NEXT:    xvcvdpsp v6, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v7, vs7
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    vmrgew v4, v1, v0
+; CHECK-BE-NEXT:    vmrgew v5, v7, v6
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    stxv v5, 48(r3)
+; CHECK-BE-NEXT:    stxv v4, 32(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i16>, <16 x i16>* %0, align 32
+  %1 = uitofp <16 x i16> %a to <16 x float>
+  store <16 x float> %1, <16 x float>* %agg.result, align 64
+  ret void
+}
+
+define i64 @test2elt_signed(i32 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 48
+; CHECK-P8-NEXT:    rldicl r3, r3, 48, 48
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    extsh r3, r3
+; CHECK-P8-NEXT:    mtvsrwa f0, r4
+; CHECK-P8-NEXT:    mtvsrwa f1, r3
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xscvdpspn vs0, f0
+; CHECK-P8-NEXT:    xscvdpspn vs1, f1
+; CHECK-P8-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P8-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 2
+; CHECK-P9-NEXT:    vextuhrx r3, r3, v2
+; CHECK-P9-NEXT:    vextuhrx r4, r4, v2
+; CHECK-P9-NEXT:    extsh r3, r3
+; CHECK-P9-NEXT:    extsh r4, r4
+; CHECK-P9-NEXT:    mtvsrwa f0, r3
+; CHECK-P9-NEXT:    mtvsrwa f1, r4
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvdpspn vs0, f0
+; CHECK-P9-NEXT:    xscvdpspn vs1, f1
+; CHECK-P9-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P9-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    li r3, 2
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    vextuhlx r3, r3, v2
+; CHECK-BE-NEXT:    vextuhlx r4, r4, v2
+; CHECK-BE-NEXT:    extsh r3, r3
+; CHECK-BE-NEXT:    extsh r4, r4
+; CHECK-BE-NEXT:    mtvsrwa f0, r3
+; CHECK-BE-NEXT:    mtvsrwa f1, r4
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvdpspn v2, f0
+; CHECK-BE-NEXT:    xscvdpspn v3, f1
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i32 %a.coerce to <2 x i16>
+  %1 = sitofp <2 x i16> %0 to <2 x float>
+  %2 = bitcast <2 x float> %1 to i64
+  ret i64 %2
+}
+
+define <4 x float> @test4elt_signed(i64 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 48
+; CHECK-P8-NEXT:    rldicl r5, r3, 32, 48
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f0, r4
+; CHECK-P8-NEXT:    rldicl r4, r3, 48, 48
+; CHECK-P8-NEXT:    rldicl r3, r3, 16, 48
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    extsh r3, r3
+; CHECK-P8-NEXT:    mtvsrwa f1, r5
+; CHECK-P8-NEXT:    mtvsrwa f2, r4
+; CHECK-P8-NEXT:    mtvsrwa f3, r3
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 4
+; CHECK-P9-NEXT:    li r5, 2
+; CHECK-P9-NEXT:    li r6, 6
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vextuhrx r3, r3, v2
+; CHECK-P9-NEXT:    vextuhrx r4, r4, v2
+; CHECK-P9-NEXT:    vextuhrx r5, r5, v2
+; CHECK-P9-NEXT:    vextuhrx r6, r6, v2
+; CHECK-P9-NEXT:    extsh r3, r3
+; CHECK-P9-NEXT:    extsh r4, r4
+; CHECK-P9-NEXT:    extsh r5, r5
+; CHECK-P9-NEXT:    extsh r6, r6
+; CHECK-P9-NEXT:    mtvsrwa f0, r3
+; CHECK-P9-NEXT:    mtvsrwa f1, r4
+; CHECK-P9-NEXT:    mtvsrwa f2, r5
+; CHECK-P9-NEXT:    mtvsrwa f3, r6
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r4, 6
+; CHECK-BE-NEXT:    mtvsrd v2, r3
+; CHECK-BE-NEXT:    li r3, 2
+; CHECK-BE-NEXT:    li r5, 4
+; CHECK-BE-NEXT:    li r6, 0
+; CHECK-BE-NEXT:    vextuhlx r4, r4, v2
+; CHECK-BE-NEXT:    vextuhlx r3, r3, v2
+; CHECK-BE-NEXT:    vextuhlx r5, r5, v2
+; CHECK-BE-NEXT:    vextuhlx r6, r6, v2
+; CHECK-BE-NEXT:    extsh r4, r4
+; CHECK-BE-NEXT:    extsh r3, r3
+; CHECK-BE-NEXT:    extsh r5, r5
+; CHECK-BE-NEXT:    extsh r6, r6
+; CHECK-BE-NEXT:    mtvsrwa f0, r4
+; CHECK-BE-NEXT:    mtvsrwa f1, r3
+; CHECK-BE-NEXT:    mtvsrwa f2, r5
+; CHECK-BE-NEXT:    mtvsrwa f3, r6
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <4 x i16>
+  %1 = sitofp <4 x i16> %0 to <4 x float>
+  ret <4 x float> %1
+}
+
+define void @test8elt_signed(<8 x float>* noalias nocapture sret %agg.result, <8 x i16> %a) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mfvsrd r5, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    clrldi r6, r5, 48
+; CHECK-P8-NEXT:    rldicl r7, r5, 32, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mfvsrd r8, f0
+; CHECK-P8-NEXT:    extsh r7, r7
+; CHECK-P8-NEXT:    mtvsrwa f1, r6
+; CHECK-P8-NEXT:    rldicl r6, r5, 48, 48
+; CHECK-P8-NEXT:    rldicl r5, r5, 16, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f0, r7
+; CHECK-P8-NEXT:    mtvsrwa f2, r6
+; CHECK-P8-NEXT:    clrldi r6, r8, 48
+; CHECK-P8-NEXT:    mtvsrwa f3, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 32, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 48, 48
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f5, r5
+; CHECK-P8-NEXT:    extsh r5, r6
+; CHECK-P8-NEXT:    mtvsrwa f6, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 16, 48
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    mtvsrwa f7, r5
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    xscvsxdsp f4, f4
+; CHECK-P8-NEXT:    xscvsxdsp f5, f5
+; CHECK-P8-NEXT:    xscvsxdsp f6, f6
+; CHECK-P8-NEXT:    xscvsxdsp f7, f7
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs5, vs4
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs4, vs7, vs6
+; CHECK-P8-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs4
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs2
+; CHECK-P8-NEXT:    vmrgew v3, v4, v3
+; CHECK-P8-NEXT:    vmrgew v2, v5, v2
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r4
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    li r4, 8
+; CHECK-P9-NEXT:    li r5, 12
+; CHECK-P9-NEXT:    li r6, 10
+; CHECK-P9-NEXT:    li r7, 14
+; CHECK-P9-NEXT:    li r8, 0
+; CHECK-P9-NEXT:    li r9, 4
+; CHECK-P9-NEXT:    li r10, 2
+; CHECK-P9-NEXT:    li r11, 6
+; CHECK-P9-NEXT:    vextuhrx r4, r4, v2
+; CHECK-P9-NEXT:    vextuhrx r5, r5, v2
+; CHECK-P9-NEXT:    vextuhrx r6, r6, v2
+; CHECK-P9-NEXT:    vextuhrx r7, r7, v2
+; CHECK-P9-NEXT:    vextuhrx r8, r8, v2
+; CHECK-P9-NEXT:    vextuhrx r9, r9, v2
+; CHECK-P9-NEXT:    vextuhrx r10, r10, v2
+; CHECK-P9-NEXT:    vextuhrx r11, r11, v2
+; CHECK-P9-NEXT:    extsh r4, r4
+; CHECK-P9-NEXT:    extsh r5, r5
+; CHECK-P9-NEXT:    extsh r6, r6
+; CHECK-P9-NEXT:    extsh r7, r7
+; CHECK-P9-NEXT:    extsh r8, r8
+; CHECK-P9-NEXT:    extsh r9, r9
+; CHECK-P9-NEXT:    extsh r10, r10
+; CHECK-P9-NEXT:    extsh r11, r11
+; CHECK-P9-NEXT:    mtvsrwa f0, r4
+; CHECK-P9-NEXT:    mtvsrwa f1, r5
+; CHECK-P9-NEXT:    mtvsrwa f2, r6
+; CHECK-P9-NEXT:    mtvsrwa f3, r7
+; CHECK-P9-NEXT:    mtvsrwa f4, r8
+; CHECK-P9-NEXT:    mtvsrwa f5, r9
+; CHECK-P9-NEXT:    mtvsrwa f6, r10
+; CHECK-P9-NEXT:    mtvsrwa f7, r11
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xscvsxdsp f4, f4
+; CHECK-P9-NEXT:    xscvsxdsp f5, f5
+; CHECK-P9-NEXT:    xscvsxdsp f6, f6
+; CHECK-P9-NEXT:    xscvsxdsp f7, f7
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    stxv v3, 0(r3)
+; CHECK-P9-NEXT:    stxv v2, 16(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r4, 12
+; CHECK-BE-NEXT:    li r5, 8
+; CHECK-BE-NEXT:    li r6, 10
+; CHECK-BE-NEXT:    li r7, 14
+; CHECK-BE-NEXT:    li r8, 6
+; CHECK-BE-NEXT:    li r9, 2
+; CHECK-BE-NEXT:    li r10, 4
+; CHECK-BE-NEXT:    li r11, 0
+; CHECK-BE-NEXT:    vextuhlx r4, r4, v2
+; CHECK-BE-NEXT:    vextuhlx r5, r5, v2
+; CHECK-BE-NEXT:    vextuhlx r6, r6, v2
+; CHECK-BE-NEXT:    vextuhlx r7, r7, v2
+; CHECK-BE-NEXT:    vextuhlx r8, r8, v2
+; CHECK-BE-NEXT:    vextuhlx r9, r9, v2
+; CHECK-BE-NEXT:    vextuhlx r10, r10, v2
+; CHECK-BE-NEXT:    vextuhlx r11, r11, v2
+; CHECK-BE-NEXT:    extsh r4, r4
+; CHECK-BE-NEXT:    extsh r5, r5
+; CHECK-BE-NEXT:    extsh r6, r6
+; CHECK-BE-NEXT:    extsh r7, r7
+; CHECK-BE-NEXT:    extsh r8, r8
+; CHECK-BE-NEXT:    extsh r9, r9
+; CHECK-BE-NEXT:    extsh r10, r10
+; CHECK-BE-NEXT:    extsh r11, r11
+; CHECK-BE-NEXT:    mtvsrwa f0, r4
+; CHECK-BE-NEXT:    mtvsrwa f1, r5
+; CHECK-BE-NEXT:    mtvsrwa f2, r6
+; CHECK-BE-NEXT:    mtvsrwa f3, r7
+; CHECK-BE-NEXT:    mtvsrwa f4, r8
+; CHECK-BE-NEXT:    mtvsrwa f5, r9
+; CHECK-BE-NEXT:    mtvsrwa f6, r10
+; CHECK-BE-NEXT:    mtvsrwa f7, r11
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xscvsxdsp f4, f4
+; CHECK-BE-NEXT:    xscvsxdsp f5, f5
+; CHECK-BE-NEXT:    xscvsxdsp f6, f6
+; CHECK-BE-NEXT:    xscvsxdsp f7, f7
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs3
+; CHECK-BE-NEXT:    vmrgew v2, v2, v3
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    stxv v3, 0(r3)
+; CHECK-BE-NEXT:    stxv v2, 16(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = sitofp <8 x i16> %a to <8 x float>
+  store <8 x float> %0, <8 x float>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt_signed(<16 x float>* noalias nocapture sret %agg.result, <16 x i16>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    mfvsrd r7, v3
+; CHECK-P8-NEXT:    xxswapd vs8, v3
+; CHECK-P8-NEXT:    mfvsrd r6, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    clrldi r4, r6, 48
+; CHECK-P8-NEXT:    rldicl r8, r6, 32, 48
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    extsh r8, r8
+; CHECK-P8-NEXT:    mtvsrwa f0, r4
+; CHECK-P8-NEXT:    rldicl r4, r6, 48, 48
+; CHECK-P8-NEXT:    rldicl r6, r6, 16, 48
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f1, r8
+; CHECK-P8-NEXT:    clrldi r8, r7, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f3, r4
+; CHECK-P8-NEXT:    extsh r4, r8
+; CHECK-P8-NEXT:    mtvsrwa f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r7, 32, 48
+; CHECK-P8-NEXT:    mtvsrwa f5, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 48, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mfvsrd r8, f2
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f2, r6
+; CHECK-P8-NEXT:    rldicl r6, r7, 16, 48
+; CHECK-P8-NEXT:    mtvsrwa f6, r4
+; CHECK-P8-NEXT:    clrldi r4, r8, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f7, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 32, 48
+; CHECK-P8-NEXT:    mtvsrwa f9, r4
+; CHECK-P8-NEXT:    rldicl r4, r8, 48, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f10, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 16, 48
+; CHECK-P8-NEXT:    mtvsrwa f11, r4
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mfvsrd r4, f8
+; CHECK-P8-NEXT:    mtvsrwa f8, r6
+; CHECK-P8-NEXT:    clrldi r6, r4, 48
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xscvsxdsp f4, f4
+; CHECK-P8-NEXT:    mtvsrwa f12, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 32, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    xscvsxdsp f5, f5
+; CHECK-P8-NEXT:    mtvsrwa f13, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 48, 48
+; CHECK-P8-NEXT:    rldicl r4, r4, 16, 48
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    mtvsrwa v2, r6
+; CHECK-P8-NEXT:    mtvsrwa v3, r4
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xscvsxdsp f6, f6
+; CHECK-P8-NEXT:    xscvsxdsp f7, f7
+; CHECK-P8-NEXT:    xscvsxdsp f9, f9
+; CHECK-P8-NEXT:    xscvsxdsp f10, f10
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs5
+; CHECK-P8-NEXT:    xscvsxdsp f11, f11
+; CHECK-P8-NEXT:    xscvsxdsp f8, f8
+; CHECK-P8-NEXT:    xscvsxdsp f12, f12
+; CHECK-P8-NEXT:    xscvsxdsp f13, f13
+; CHECK-P8-NEXT:    xxmrghd vs5, vs7, vs6
+; CHECK-P8-NEXT:    xscvsxdsp f1, v2
+; CHECK-P8-NEXT:    xscvsxdsp f4, v3
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs10, vs9
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs3
+; CHECK-P8-NEXT:    xxmrghd vs3, vs8, vs11
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P8-NEXT:    xxmrghd vs2, vs13, vs12
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs5
+; CHECK-P8-NEXT:    xvcvdpsp v0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs4, vs1
+; CHECK-P8-NEXT:    xvcvdpsp v1, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v6, vs2
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    xvcvdpsp v7, vs1
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    vmrgew v4, v1, v0
+; CHECK-P8-NEXT:    stvx v2, r3, r4
+; CHECK-P8-NEXT:    li r4, 32
+; CHECK-P8-NEXT:    vmrgew v5, v7, v6
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    stvx v4, r3, r4
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv v3, 0(r4)
+; CHECK-P9-NEXT:    lxv v2, 16(r4)
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    li r5, 4
+; CHECK-P9-NEXT:    li r6, 2
+; CHECK-P9-NEXT:    li r7, 6
+; CHECK-P9-NEXT:    li r8, 8
+; CHECK-P9-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r9, 12
+; CHECK-P9-NEXT:    li r10, 10
+; CHECK-P9-NEXT:    li r11, 14
+; CHECK-P9-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    vextuhrx r12, r4, v3
+; CHECK-P9-NEXT:    vextuhrx r0, r5, v3
+; CHECK-P9-NEXT:    vextuhrx r30, r6, v3
+; CHECK-P9-NEXT:    vextuhrx r29, r7, v3
+; CHECK-P9-NEXT:    vextuhrx r28, r8, v3
+; CHECK-P9-NEXT:    vextuhrx r27, r9, v3
+; CHECK-P9-NEXT:    vextuhrx r26, r10, v3
+; CHECK-P9-NEXT:    vextuhrx r25, r11, v3
+; CHECK-P9-NEXT:    vextuhrx r4, r4, v2
+; CHECK-P9-NEXT:    vextuhrx r5, r5, v2
+; CHECK-P9-NEXT:    vextuhrx r6, r6, v2
+; CHECK-P9-NEXT:    vextuhrx r7, r7, v2
+; CHECK-P9-NEXT:    vextuhrx r8, r8, v2
+; CHECK-P9-NEXT:    vextuhrx r9, r9, v2
+; CHECK-P9-NEXT:    vextuhrx r10, r10, v2
+; CHECK-P9-NEXT:    vextuhrx r11, r11, v2
+; CHECK-P9-NEXT:    extsh r12, r12
+; CHECK-P9-NEXT:    extsh r0, r0
+; CHECK-P9-NEXT:    extsh r30, r30
+; CHECK-P9-NEXT:    extsh r29, r29
+; CHECK-P9-NEXT:    extsh r28, r28
+; CHECK-P9-NEXT:    extsh r27, r27
+; CHECK-P9-NEXT:    extsh r26, r26
+; CHECK-P9-NEXT:    extsh r25, r25
+; CHECK-P9-NEXT:    extsh r4, r4
+; CHECK-P9-NEXT:    extsh r5, r5
+; CHECK-P9-NEXT:    extsh r6, r6
+; CHECK-P9-NEXT:    extsh r7, r7
+; CHECK-P9-NEXT:    extsh r8, r8
+; CHECK-P9-NEXT:    extsh r9, r9
+; CHECK-P9-NEXT:    extsh r10, r10
+; CHECK-P9-NEXT:    extsh r11, r11
+; CHECK-P9-NEXT:    mtvsrwa f0, r12
+; CHECK-P9-NEXT:    mtvsrwa f1, r0
+; CHECK-P9-NEXT:    mtvsrwa f2, r30
+; CHECK-P9-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f3, r29
+; CHECK-P9-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f4, r28
+; CHECK-P9-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f5, r27
+; CHECK-P9-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f6, r26
+; CHECK-P9-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f7, r25
+; CHECK-P9-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f8, r4
+; CHECK-P9-NEXT:    mtvsrwa f9, r5
+; CHECK-P9-NEXT:    mtvsrwa f10, r6
+; CHECK-P9-NEXT:    mtvsrwa f11, r7
+; CHECK-P9-NEXT:    mtvsrwa f12, r8
+; CHECK-P9-NEXT:    mtvsrwa f13, r9
+; CHECK-P9-NEXT:    mtvsrwa v2, r10
+; CHECK-P9-NEXT:    mtvsrwa v3, r11
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xscvsxdsp f4, f4
+; CHECK-P9-NEXT:    xscvsxdsp f5, f5
+; CHECK-P9-NEXT:    xscvsxdsp f6, f6
+; CHECK-P9-NEXT:    xscvsxdsp f7, f7
+; CHECK-P9-NEXT:    xscvsxdsp f8, f8
+; CHECK-P9-NEXT:    xscvsxdsp f9, f9
+; CHECK-P9-NEXT:    xscvsxdsp f10, f10
+; CHECK-P9-NEXT:    xscvsxdsp f11, f11
+; CHECK-P9-NEXT:    xscvsxdsp f12, f12
+; CHECK-P9-NEXT:    xscvsxdsp f13, f13
+; CHECK-P9-NEXT:    xscvsxdsp f31, v2
+; CHECK-P9-NEXT:    xscvsxdsp f30, v3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-P9-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-P9-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P9-NEXT:    xvcvdpsp v0, vs4
+; CHECK-P9-NEXT:    xvcvdpsp v1, vs5
+; CHECK-P9-NEXT:    xvcvdpsp v6, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v7, vs7
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    vmrgew v4, v1, v0
+; CHECK-P9-NEXT:    vmrgew v5, v7, v6
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    stxv v5, 48(r3)
+; CHECK-P9-NEXT:    stxv v4, 32(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
+; CHECK-BE-NEXT:    lxv v2, 16(r4)
+; CHECK-BE-NEXT:    li r4, 6
+; CHECK-BE-NEXT:    li r5, 2
+; CHECK-BE-NEXT:    li r6, 4
+; CHECK-BE-NEXT:    li r7, 0
+; CHECK-BE-NEXT:    li r8, 14
+; CHECK-BE-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r9, 10
+; CHECK-BE-NEXT:    li r10, 12
+; CHECK-BE-NEXT:    li r11, 8
+; CHECK-BE-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    vextuhlx r12, r4, v3
+; CHECK-BE-NEXT:    vextuhlx r0, r5, v3
+; CHECK-BE-NEXT:    vextuhlx r30, r6, v3
+; CHECK-BE-NEXT:    vextuhlx r29, r7, v3
+; CHECK-BE-NEXT:    vextuhlx r28, r8, v3
+; CHECK-BE-NEXT:    vextuhlx r27, r9, v3
+; CHECK-BE-NEXT:    vextuhlx r26, r10, v3
+; CHECK-BE-NEXT:    vextuhlx r25, r11, v3
+; CHECK-BE-NEXT:    vextuhlx r4, r4, v2
+; CHECK-BE-NEXT:    vextuhlx r5, r5, v2
+; CHECK-BE-NEXT:    vextuhlx r6, r6, v2
+; CHECK-BE-NEXT:    vextuhlx r7, r7, v2
+; CHECK-BE-NEXT:    vextuhlx r8, r8, v2
+; CHECK-BE-NEXT:    vextuhlx r9, r9, v2
+; CHECK-BE-NEXT:    vextuhlx r10, r10, v2
+; CHECK-BE-NEXT:    vextuhlx r11, r11, v2
+; CHECK-BE-NEXT:    extsh r12, r12
+; CHECK-BE-NEXT:    extsh r0, r0
+; CHECK-BE-NEXT:    extsh r30, r30
+; CHECK-BE-NEXT:    extsh r29, r29
+; CHECK-BE-NEXT:    extsh r28, r28
+; CHECK-BE-NEXT:    extsh r27, r27
+; CHECK-BE-NEXT:    extsh r26, r26
+; CHECK-BE-NEXT:    extsh r25, r25
+; CHECK-BE-NEXT:    extsh r4, r4
+; CHECK-BE-NEXT:    extsh r5, r5
+; CHECK-BE-NEXT:    extsh r6, r6
+; CHECK-BE-NEXT:    extsh r7, r7
+; CHECK-BE-NEXT:    extsh r8, r8
+; CHECK-BE-NEXT:    extsh r9, r9
+; CHECK-BE-NEXT:    extsh r10, r10
+; CHECK-BE-NEXT:    extsh r11, r11
+; CHECK-BE-NEXT:    mtvsrwa f0, r12
+; CHECK-BE-NEXT:    mtvsrwa f1, r0
+; CHECK-BE-NEXT:    mtvsrwa f2, r30
+; CHECK-BE-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f3, r29
+; CHECK-BE-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f4, r28
+; CHECK-BE-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f5, r27
+; CHECK-BE-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f6, r26
+; CHECK-BE-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f7, r25
+; CHECK-BE-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f8, r4
+; CHECK-BE-NEXT:    mtvsrwa f9, r5
+; CHECK-BE-NEXT:    mtvsrwa f10, r6
+; CHECK-BE-NEXT:    mtvsrwa f11, r7
+; CHECK-BE-NEXT:    mtvsrwa f12, r8
+; CHECK-BE-NEXT:    mtvsrwa f13, r9
+; CHECK-BE-NEXT:    mtvsrwa v2, r10
+; CHECK-BE-NEXT:    mtvsrwa v3, r11
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xscvsxdsp f4, f4
+; CHECK-BE-NEXT:    xscvsxdsp f5, f5
+; CHECK-BE-NEXT:    xscvsxdsp f6, f6
+; CHECK-BE-NEXT:    xscvsxdsp f7, f7
+; CHECK-BE-NEXT:    xscvsxdsp f8, f8
+; CHECK-BE-NEXT:    xscvsxdsp f9, f9
+; CHECK-BE-NEXT:    xscvsxdsp f10, f10
+; CHECK-BE-NEXT:    xscvsxdsp f11, f11
+; CHECK-BE-NEXT:    xscvsxdsp f12, f12
+; CHECK-BE-NEXT:    xscvsxdsp f13, f13
+; CHECK-BE-NEXT:    xscvsxdsp f31, v2
+; CHECK-BE-NEXT:    xscvsxdsp f30, v3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-BE-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs3
+; CHECK-BE-NEXT:    xvcvdpsp v0, vs4
+; CHECK-BE-NEXT:    xvcvdpsp v1, vs5
+; CHECK-BE-NEXT:    xvcvdpsp v6, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v7, vs7
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    vmrgew v4, v1, v0
+; CHECK-BE-NEXT:    vmrgew v5, v7, v6
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    stxv v5, 48(r3)
+; CHECK-BE-NEXT:    stxv v4, 32(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i16>, <16 x i16>* %0, align 32
+  %1 = sitofp <16 x i16> %a to <16 x float>
+  store <16 x float> %1, <16 x float>* %agg.result, align 64
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll b/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a8f814c52189da26c16e7c3c3dbf7239034daf84
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
@@ -0,0 +1,828 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define <2 x double> @test2elt(i32 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI0_0@toc@ha
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    addi r3, r4, .LCPI0_0@toc@l
+; CHECK-P8-NEXT:    xxlxor v4, v4, v4
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    lvx v3, 0, r3
+; CHECK-P8-NEXT:    vperm v2, v4, v2, v3
+; CHECK-P8-NEXT:    xvcvuxddp v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addis r4, r2, .LCPI0_0@toc@ha
+; CHECK-P9-NEXT:    mtvsrws v3, r3
+; CHECK-P9-NEXT:    xxlxor v4, v4, v4
+; CHECK-P9-NEXT:    addi r4, r4, .LCPI0_0@toc@l
+; CHECK-P9-NEXT:    lxvx v2, 0, r4
+; CHECK-P9-NEXT:    vperm v2, v4, v3, v2
+; CHECK-P9-NEXT:    xvcvuxddp v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LCPI0_0@toc@ha
+; CHECK-BE-NEXT:    mtvsrws v3, r3
+; CHECK-BE-NEXT:    xxlxor v4, v4, v4
+; CHECK-BE-NEXT:    addi r4, r4, .LCPI0_0@toc@l
+; CHECK-BE-NEXT:    lxvx v2, 0, r4
+; CHECK-BE-NEXT:    vperm v2, v3, v4, v2
+; CHECK-BE-NEXT:    xvcvuxddp v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i32 %a.coerce to <2 x i16>
+  %1 = uitofp <2 x i16> %0 to <2 x double>
+  ret <2 x double> %1
+}
+
+define void @test4elt(<4 x double>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    addis r5, r2, .LCPI1_0@toc@ha
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI1_1@toc@ha
+; CHECK-P8-NEXT:    addi r5, r5, .LCPI1_0@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, .LCPI1_1@toc@l
+; CHECK-P8-NEXT:    xxlxor v4, v4, v4
+; CHECK-P8-NEXT:    lvx v2, 0, r5
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    lvx v5, 0, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    vperm v2, v4, v3, v2
+; CHECK-P8-NEXT:    vperm v3, v4, v3, v5
+; CHECK-P8-NEXT:    xvcvuxddp vs0, v2
+; CHECK-P8-NEXT:    xvcvuxddp vs1, v3
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addis r5, r2, .LCPI1_0@toc@ha
+; CHECK-P9-NEXT:    addis r6, r2, .LCPI1_1@toc@ha
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    xxlxor v5, v5, v5
+; CHECK-P9-NEXT:    addi r5, r5, .LCPI1_0@toc@l
+; CHECK-P9-NEXT:    addi r6, r6, .LCPI1_1@toc@l
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    lxvx v2, 0, r5
+; CHECK-P9-NEXT:    lxvx v3, 0, r6
+; CHECK-P9-NEXT:    vperm v2, v5, v4, v2
+; CHECK-P9-NEXT:    vperm v3, v5, v4, v3
+; CHECK-P9-NEXT:    xvcvuxddp vs0, v2
+; CHECK-P9-NEXT:    xvcvuxddp vs1, v3
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI1_0@toc@ha
+; CHECK-BE-NEXT:    addis r6, r2, .LCPI1_1@toc@ha
+; CHECK-BE-NEXT:    mtvsrd v4, r4
+; CHECK-BE-NEXT:    xxlxor v5, v5, v5
+; CHECK-BE-NEXT:    addi r5, r5, .LCPI1_0@toc@l
+; CHECK-BE-NEXT:    addi r6, r6, .LCPI1_1@toc@l
+; CHECK-BE-NEXT:    lxvx v2, 0, r5
+; CHECK-BE-NEXT:    lxvx v3, 0, r6
+; CHECK-BE-NEXT:    vperm v2, v4, v5, v2
+; CHECK-BE-NEXT:    vperm v3, v5, v4, v3
+; CHECK-BE-NEXT:    xvcvuxddp vs0, v2
+; CHECK-BE-NEXT:    xvcvuxddp vs1, v3
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <4 x i16>
+  %1 = uitofp <4 x i16> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt(<8 x double>* noalias nocapture sret %agg.result, <8 x i16> %a) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-P8-NEXT:    addis r5, r2, .LCPI2_2@toc@ha
+; CHECK-P8-NEXT:    xxlxor v4, v4, v4
+; CHECK-P8-NEXT:    addi r4, r4, .LCPI2_0@toc@l
+; CHECK-P8-NEXT:    addi r5, r5, .LCPI2_2@toc@l
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI2_3@toc@ha
+; CHECK-P8-NEXT:    lvx v5, 0, r5
+; CHECK-P8-NEXT:    addis r5, r2, .LCPI2_1@toc@ha
+; CHECK-P8-NEXT:    addi r4, r4, .LCPI2_3@toc@l
+; CHECK-P8-NEXT:    addi r5, r5, .LCPI2_1@toc@l
+; CHECK-P8-NEXT:    lvx v0, 0, r4
+; CHECK-P8-NEXT:    lvx v1, 0, r5
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    vperm v3, v4, v2, v3
+; CHECK-P8-NEXT:    vperm v5, v4, v2, v5
+; CHECK-P8-NEXT:    vperm v0, v4, v2, v0
+; CHECK-P8-NEXT:    vperm v2, v4, v2, v1
+; CHECK-P8-NEXT:    xvcvuxddp vs0, v3
+; CHECK-P8-NEXT:    xvcvuxddp vs1, v5
+; CHECK-P8-NEXT:    xvcvuxddp vs2, v0
+; CHECK-P8-NEXT:    xvcvuxddp vs3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-P9-NEXT:    addis r5, r2, .LCPI2_1@toc@ha
+; CHECK-P9-NEXT:    addis r6, r2, .LCPI2_2@toc@ha
+; CHECK-P9-NEXT:    addis r7, r2, .LCPI2_3@toc@ha
+; CHECK-P9-NEXT:    xxlxor v1, v1, v1
+; CHECK-P9-NEXT:    addi r4, r4, .LCPI2_0@toc@l
+; CHECK-P9-NEXT:    addi r5, r5, .LCPI2_1@toc@l
+; CHECK-P9-NEXT:    addi r6, r6, .LCPI2_2@toc@l
+; CHECK-P9-NEXT:    addi r7, r7, .LCPI2_3@toc@l
+; CHECK-P9-NEXT:    lxvx v3, 0, r4
+; CHECK-P9-NEXT:    lxvx v4, 0, r5
+; CHECK-P9-NEXT:    lxvx v5, 0, r6
+; CHECK-P9-NEXT:    lxvx v0, 0, r7
+; CHECK-P9-NEXT:    vperm v3, v1, v2, v3
+; CHECK-P9-NEXT:    vperm v4, v1, v2, v4
+; CHECK-P9-NEXT:    vperm v5, v1, v2, v5
+; CHECK-P9-NEXT:    vperm v2, v1, v2, v0
+; CHECK-P9-NEXT:    xvcvuxddp vs0, v3
+; CHECK-P9-NEXT:    xvcvuxddp vs1, v4
+; CHECK-P9-NEXT:    xvcvuxddp vs2, v5
+; CHECK-P9-NEXT:    xvcvuxddp vs3, v2
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI2_1@toc@ha
+; CHECK-BE-NEXT:    addis r6, r2, .LCPI2_2@toc@ha
+; CHECK-BE-NEXT:    addis r7, r2, .LCPI2_3@toc@ha
+; CHECK-BE-NEXT:    xxlxor v1, v1, v1
+; CHECK-BE-NEXT:    addi r4, r4, .LCPI2_0@toc@l
+; CHECK-BE-NEXT:    addi r5, r5, .LCPI2_1@toc@l
+; CHECK-BE-NEXT:    addi r6, r6, .LCPI2_2@toc@l
+; CHECK-BE-NEXT:    addi r7, r7, .LCPI2_3@toc@l
+; CHECK-BE-NEXT:    lxvx v3, 0, r4
+; CHECK-BE-NEXT:    lxvx v4, 0, r5
+; CHECK-BE-NEXT:    lxvx v5, 0, r6
+; CHECK-BE-NEXT:    lxvx v0, 0, r7
+; CHECK-BE-NEXT:    vperm v3, v2, v1, v3
+; CHECK-BE-NEXT:    vperm v4, v1, v2, v4
+; CHECK-BE-NEXT:    vperm v5, v1, v2, v5
+; CHECK-BE-NEXT:    vperm v2, v1, v2, v0
+; CHECK-BE-NEXT:    xvcvuxddp vs0, v3
+; CHECK-BE-NEXT:    xvcvuxddp vs1, v4
+; CHECK-BE-NEXT:    xvcvuxddp vs2, v5
+; CHECK-BE-NEXT:    xvcvuxddp vs3, v2
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = uitofp <8 x i16> %a to <8 x double>
+  store <8 x double> %0, <8 x double>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt(<16 x double>* noalias nocapture sret %agg.result, <16 x i16>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    addis r6, r2, .LCPI3_2@toc@ha
+; CHECK-P8-NEXT:    addis r5, r2, .LCPI3_0@toc@ha
+; CHECK-P8-NEXT:    lvx v4, 0, r4
+; CHECK-P8-NEXT:    xxlxor v3, v3, v3
+; CHECK-P8-NEXT:    addi r6, r6, .LCPI3_2@toc@l
+; CHECK-P8-NEXT:    addi r5, r5, .LCPI3_0@toc@l
+; CHECK-P8-NEXT:    lvx v5, 0, r6
+; CHECK-P8-NEXT:    li r6, 16
+; CHECK-P8-NEXT:    lvx v2, 0, r5
+; CHECK-P8-NEXT:    addis r5, r2, .LCPI3_1@toc@ha
+; CHECK-P8-NEXT:    lvx v0, r4, r6
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI3_3@toc@ha
+; CHECK-P8-NEXT:    addi r5, r5, .LCPI3_1@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, .LCPI3_3@toc@l
+; CHECK-P8-NEXT:    lvx v1, 0, r5
+; CHECK-P8-NEXT:    li r5, 96
+; CHECK-P8-NEXT:    lvx v8, 0, r4
+; CHECK-P8-NEXT:    vperm v6, v3, v4, v2
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    vperm v7, v3, v4, v5
+; CHECK-P8-NEXT:    vperm v2, v3, v0, v2
+; CHECK-P8-NEXT:    vperm v9, v3, v0, v1
+; CHECK-P8-NEXT:    vperm v5, v3, v0, v5
+; CHECK-P8-NEXT:    vperm v0, v3, v0, v8
+; CHECK-P8-NEXT:    vperm v1, v3, v4, v1
+; CHECK-P8-NEXT:    vperm v3, v3, v4, v8
+; CHECK-P8-NEXT:    xvcvuxddp vs1, v2
+; CHECK-P8-NEXT:    xvcvuxddp vs4, v9
+; CHECK-P8-NEXT:    xvcvuxddp vs2, v5
+; CHECK-P8-NEXT:    xvcvuxddp vs3, v0
+; CHECK-P8-NEXT:    xvcvuxddp vs0, v7
+; CHECK-P8-NEXT:    xvcvuxddp vs5, v3
+; CHECK-P8-NEXT:    xvcvuxddp vs6, v6
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xvcvuxddp vs7, v1
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r5
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    li r5, 64
+; CHECK-P8-NEXT:    xxswapd vs2, vs7
+; CHECK-P8-NEXT:    xxswapd vs3, vs6
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r5
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    stxvd2x vs5, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addis r5, r2, .LCPI3_0@toc@ha
+; CHECK-P9-NEXT:    addis r6, r2, .LCPI3_1@toc@ha
+; CHECK-P9-NEXT:    addis r7, r2, .LCPI3_2@toc@ha
+; CHECK-P9-NEXT:    addis r8, r2, .LCPI3_3@toc@ha
+; CHECK-P9-NEXT:    lxv v0, 0(r4)
+; CHECK-P9-NEXT:    lxv v1, 16(r4)
+; CHECK-P9-NEXT:    xxlxor v6, v6, v6
+; CHECK-P9-NEXT:    addi r5, r5, .LCPI3_0@toc@l
+; CHECK-P9-NEXT:    addi r6, r6, .LCPI3_1@toc@l
+; CHECK-P9-NEXT:    addi r7, r7, .LCPI3_2@toc@l
+; CHECK-P9-NEXT:    addi r8, r8, .LCPI3_3@toc@l
+; CHECK-P9-NEXT:    lxvx v2, 0, r5
+; CHECK-P9-NEXT:    lxvx v3, 0, r6
+; CHECK-P9-NEXT:    lxvx v4, 0, r7
+; CHECK-P9-NEXT:    lxvx v5, 0, r8
+; CHECK-P9-NEXT:    vperm v7, v6, v0, v2
+; CHECK-P9-NEXT:    vperm v8, v6, v0, v3
+; CHECK-P9-NEXT:    vperm v9, v6, v0, v4
+; CHECK-P9-NEXT:    vperm v0, v6, v0, v5
+; CHECK-P9-NEXT:    vperm v2, v6, v1, v2
+; CHECK-P9-NEXT:    vperm v3, v6, v1, v3
+; CHECK-P9-NEXT:    vperm v4, v6, v1, v4
+; CHECK-P9-NEXT:    vperm v5, v6, v1, v5
+; CHECK-P9-NEXT:    xvcvuxddp vs0, v7
+; CHECK-P9-NEXT:    xvcvuxddp vs1, v8
+; CHECK-P9-NEXT:    xvcvuxddp vs2, v9
+; CHECK-P9-NEXT:    xvcvuxddp vs3, v0
+; CHECK-P9-NEXT:    xvcvuxddp vs4, v2
+; CHECK-P9-NEXT:    xvcvuxddp vs5, v3
+; CHECK-P9-NEXT:    xvcvuxddp vs6, v4
+; CHECK-P9-NEXT:    xvcvuxddp vs7, v5
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs7, 112(r3)
+; CHECK-P9-NEXT:    stxv vs6, 96(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    stxv vs4, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI3_0@toc@ha
+; CHECK-BE-NEXT:    addis r6, r2, .LCPI3_1@toc@ha
+; CHECK-BE-NEXT:    addis r7, r2, .LCPI3_2@toc@ha
+; CHECK-BE-NEXT:    addis r8, r2, .LCPI3_3@toc@ha
+; CHECK-BE-NEXT:    lxv v0, 0(r4)
+; CHECK-BE-NEXT:    lxv v1, 16(r4)
+; CHECK-BE-NEXT:    xxlxor v6, v6, v6
+; CHECK-BE-NEXT:    addi r5, r5, .LCPI3_0@toc@l
+; CHECK-BE-NEXT:    addi r6, r6, .LCPI3_1@toc@l
+; CHECK-BE-NEXT:    addi r7, r7, .LCPI3_2@toc@l
+; CHECK-BE-NEXT:    addi r8, r8, .LCPI3_3@toc@l
+; CHECK-BE-NEXT:    lxvx v2, 0, r5
+; CHECK-BE-NEXT:    lxvx v3, 0, r6
+; CHECK-BE-NEXT:    lxvx v4, 0, r7
+; CHECK-BE-NEXT:    lxvx v5, 0, r8
+; CHECK-BE-NEXT:    vperm v7, v0, v6, v2
+; CHECK-BE-NEXT:    vperm v8, v6, v0, v3
+; CHECK-BE-NEXT:    vperm v9, v6, v0, v4
+; CHECK-BE-NEXT:    vperm v0, v6, v0, v5
+; CHECK-BE-NEXT:    vperm v2, v1, v6, v2
+; CHECK-BE-NEXT:    vperm v3, v6, v1, v3
+; CHECK-BE-NEXT:    vperm v4, v6, v1, v4
+; CHECK-BE-NEXT:    vperm v5, v6, v1, v5
+; CHECK-BE-NEXT:    xvcvuxddp vs0, v7
+; CHECK-BE-NEXT:    xvcvuxddp vs1, v8
+; CHECK-BE-NEXT:    xvcvuxddp vs2, v9
+; CHECK-BE-NEXT:    xvcvuxddp vs3, v0
+; CHECK-BE-NEXT:    xvcvuxddp vs4, v2
+; CHECK-BE-NEXT:    xvcvuxddp vs5, v3
+; CHECK-BE-NEXT:    xvcvuxddp vs6, v4
+; CHECK-BE-NEXT:    xvcvuxddp vs7, v5
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i16>, <16 x i16>* %0, align 32
+  %1 = uitofp <16 x i16> %a to <16 x double>
+  store <16 x double> %1, <16 x double>* %agg.result, align 128
+  ret void
+}
+
+define <2 x double> @test2elt_signed(i32 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 48
+; CHECK-P8-NEXT:    rldicl r3, r3, 48, 48
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    extsh r3, r3
+; CHECK-P8-NEXT:    mtvsrwa f0, r4
+; CHECK-P8-NEXT:    mtvsrwa f1, r3
+; CHECK-P8-NEXT:    xscvsxddp f0, f0
+; CHECK-P8-NEXT:    xscvsxddp f1, f1
+; CHECK-P8-NEXT:    xxmrghd v2, vs1, vs0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addis r4, r2, .LCPI4_0@toc@ha
+; CHECK-P9-NEXT:    mtvsrws v3, r3
+; CHECK-P9-NEXT:    addi r4, r4, .LCPI4_0@toc@l
+; CHECK-P9-NEXT:    lxvx v2, 0, r4
+; CHECK-P9-NEXT:    vperm v2, v3, v3, v2
+; CHECK-P9-NEXT:    vextsh2d v2, v2
+; CHECK-P9-NEXT:    xvcvsxddp v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LCPI4_0@toc@ha
+; CHECK-BE-NEXT:    mtvsrws v3, r3
+; CHECK-BE-NEXT:    addi r4, r4, .LCPI4_0@toc@l
+; CHECK-BE-NEXT:    lxvx v2, 0, r4
+; CHECK-BE-NEXT:    vperm v2, v3, v3, v2
+; CHECK-BE-NEXT:    vextsh2d v2, v2
+; CHECK-BE-NEXT:    xvcvsxddp v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i32 %a.coerce to <2 x i16>
+  %1 = sitofp <2 x i16> %0 to <2 x double>
+  ret <2 x double> %1
+}
+
+define void @test4elt_signed(<4 x double>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    mfvsrd r4, f0
+; CHECK-P8-NEXT:    clrldi r5, r4, 48
+; CHECK-P8-NEXT:    rldicl r6, r4, 48, 48
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f0, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 32, 48
+; CHECK-P8-NEXT:    rldicl r4, r4, 16, 48
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f1, r6
+; CHECK-P8-NEXT:    mtvsrwa f2, r5
+; CHECK-P8-NEXT:    mtvsrwa f3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xscvsxddp f0, f0
+; CHECK-P8-NEXT:    xscvsxddp f1, f1
+; CHECK-P8-NEXT:    xscvsxddp f2, f2
+; CHECK-P8-NEXT:    xscvsxddp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addis r5, r2, .LCPI5_0@toc@ha
+; CHECK-P9-NEXT:    addis r6, r2, .LCPI5_1@toc@ha
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    addi r5, r5, .LCPI5_0@toc@l
+; CHECK-P9-NEXT:    addi r6, r6, .LCPI5_1@toc@l
+; CHECK-P9-NEXT:    xxswapd v4, vs0
+; CHECK-P9-NEXT:    lxvx v2, 0, r5
+; CHECK-P9-NEXT:    lxvx v3, 0, r6
+; CHECK-P9-NEXT:    vperm v2, v4, v4, v2
+; CHECK-P9-NEXT:    vperm v3, v4, v4, v3
+; CHECK-P9-NEXT:    vextsh2d v2, v2
+; CHECK-P9-NEXT:    vextsh2d v3, v3
+; CHECK-P9-NEXT:    xvcvsxddp vs0, v2
+; CHECK-P9-NEXT:    xvcvsxddp vs1, v3
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI5_0@toc@ha
+; CHECK-BE-NEXT:    addis r6, r2, .LCPI5_1@toc@ha
+; CHECK-BE-NEXT:    mtvsrd v4, r4
+; CHECK-BE-NEXT:    xxlxor v5, v5, v5
+; CHECK-BE-NEXT:    addi r5, r5, .LCPI5_0@toc@l
+; CHECK-BE-NEXT:    addi r6, r6, .LCPI5_1@toc@l
+; CHECK-BE-NEXT:    lxvx v2, 0, r5
+; CHECK-BE-NEXT:    lxvx v3, 0, r6
+; CHECK-BE-NEXT:    vperm v2, v5, v4, v2
+; CHECK-BE-NEXT:    vperm v3, v4, v4, v3
+; CHECK-BE-NEXT:    vextsh2d v2, v2
+; CHECK-BE-NEXT:    vextsh2d v3, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs0, v2
+; CHECK-BE-NEXT:    xvcvsxddp vs1, v3
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <4 x i16>
+  %1 = sitofp <4 x i16> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt_signed(<8 x double>* noalias nocapture sret %agg.result, <8 x i16> %a) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mfvsrd r4, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    clrldi r5, r4, 48
+; CHECK-P8-NEXT:    rldicl r6, r4, 48, 48
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    mfvsrd r7, f0
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f1, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 32, 48
+; CHECK-P8-NEXT:    rldicl r4, r4, 16, 48
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f0, r6
+; CHECK-P8-NEXT:    mtvsrwa f2, r5
+; CHECK-P8-NEXT:    clrldi r5, r7, 48
+; CHECK-P8-NEXT:    mtvsrwa f3, r4
+; CHECK-P8-NEXT:    extsh r4, r5
+; CHECK-P8-NEXT:    rldicl r5, r7, 16, 48
+; CHECK-P8-NEXT:    mtvsrwa f4, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 48, 48
+; CHECK-P8-NEXT:    extsh r5, r5
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f7, r5
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    mtvsrwa f5, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 32, 48
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    xscvsxddp f1, f1
+; CHECK-P8-NEXT:    mtvsrwa f6, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xscvsxddp f0, f0
+; CHECK-P8-NEXT:    xscvsxddp f2, f2
+; CHECK-P8-NEXT:    xscvsxddp f3, f3
+; CHECK-P8-NEXT:    xscvsxddp f4, f4
+; CHECK-P8-NEXT:    xscvsxddp f5, f5
+; CHECK-P8-NEXT:    xscvsxddp f6, f6
+; CHECK-P8-NEXT:    xscvsxddp f7, f7
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs2, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addis r4, r2, .LCPI6_0@toc@ha
+; CHECK-P9-NEXT:    addis r5, r2, .LCPI6_1@toc@ha
+; CHECK-P9-NEXT:    addis r6, r2, .LCPI6_2@toc@ha
+; CHECK-P9-NEXT:    addis r7, r2, .LCPI6_3@toc@ha
+; CHECK-P9-NEXT:    addi r4, r4, .LCPI6_0@toc@l
+; CHECK-P9-NEXT:    addi r5, r5, .LCPI6_1@toc@l
+; CHECK-P9-NEXT:    addi r6, r6, .LCPI6_2@toc@l
+; CHECK-P9-NEXT:    addi r7, r7, .LCPI6_3@toc@l
+; CHECK-P9-NEXT:    lxvx v3, 0, r4
+; CHECK-P9-NEXT:    lxvx v4, 0, r5
+; CHECK-P9-NEXT:    lxvx v5, 0, r6
+; CHECK-P9-NEXT:    lxvx v0, 0, r7
+; CHECK-P9-NEXT:    vperm v3, v2, v2, v3
+; CHECK-P9-NEXT:    vperm v4, v2, v2, v4
+; CHECK-P9-NEXT:    vperm v5, v2, v2, v5
+; CHECK-P9-NEXT:    vperm v2, v2, v2, v0
+; CHECK-P9-NEXT:    vextsh2d v3, v3
+; CHECK-P9-NEXT:    vextsh2d v4, v4
+; CHECK-P9-NEXT:    vextsh2d v5, v5
+; CHECK-P9-NEXT:    vextsh2d v2, v2
+; CHECK-P9-NEXT:    xvcvsxddp vs0, v3
+; CHECK-P9-NEXT:    xvcvsxddp vs1, v4
+; CHECK-P9-NEXT:    xvcvsxddp vs2, v5
+; CHECK-P9-NEXT:    xvcvsxddp vs3, v2
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_0@toc@ha
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI6_1@toc@ha
+; CHECK-BE-NEXT:    addis r6, r2, .LCPI6_2@toc@ha
+; CHECK-BE-NEXT:    addis r7, r2, .LCPI6_3@toc@ha
+; CHECK-BE-NEXT:    xxlxor v1, v1, v1
+; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_0@toc@l
+; CHECK-BE-NEXT:    addi r5, r5, .LCPI6_1@toc@l
+; CHECK-BE-NEXT:    addi r6, r6, .LCPI6_2@toc@l
+; CHECK-BE-NEXT:    addi r7, r7, .LCPI6_3@toc@l
+; CHECK-BE-NEXT:    lxvx v3, 0, r4
+; CHECK-BE-NEXT:    lxvx v4, 0, r5
+; CHECK-BE-NEXT:    lxvx v5, 0, r6
+; CHECK-BE-NEXT:    lxvx v0, 0, r7
+; CHECK-BE-NEXT:    vperm v3, v1, v2, v3
+; CHECK-BE-NEXT:    vperm v4, v1, v2, v4
+; CHECK-BE-NEXT:    vperm v5, v2, v2, v5
+; CHECK-BE-NEXT:    vperm v2, v2, v2, v0
+; CHECK-BE-NEXT:    vextsh2d v3, v3
+; CHECK-BE-NEXT:    vextsh2d v4, v4
+; CHECK-BE-NEXT:    vextsh2d v5, v5
+; CHECK-BE-NEXT:    vextsh2d v2, v2
+; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs1, v4
+; CHECK-BE-NEXT:    xvcvsxddp vs2, v5
+; CHECK-BE-NEXT:    xvcvsxddp vs3, v2
+; CHECK-BE-NEXT:    stxv vs1, 48(r3)
+; CHECK-BE-NEXT:    stxv vs3, 32(r3)
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs2, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = sitofp <8 x i16> %a to <8 x double>
+  store <8 x double> %0, <8 x double>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt_signed(<16 x double>* noalias nocapture sret %agg.result, <16 x i16>* nocapture readonly) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    mfvsrd r7, v3
+; CHECK-P8-NEXT:    xxswapd vs8, v3
+; CHECK-P8-NEXT:    mfvsrd r6, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    clrldi r4, r6, 48
+; CHECK-P8-NEXT:    rldicl r8, r6, 48, 48
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    extsh r8, r8
+; CHECK-P8-NEXT:    mtvsrwa f0, r4
+; CHECK-P8-NEXT:    rldicl r4, r6, 32, 48
+; CHECK-P8-NEXT:    rldicl r6, r6, 16, 48
+; CHECK-P8-NEXT:    mtvsrwa f1, r8
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    clrldi r8, r7, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f3, r4
+; CHECK-P8-NEXT:    extsh r4, r8
+; CHECK-P8-NEXT:    mtvsrwa f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r7, 48, 48
+; CHECK-P8-NEXT:    mtvsrwa f5, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 32, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mfvsrd r8, f2
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f2, r6
+; CHECK-P8-NEXT:    rldicl r6, r7, 16, 48
+; CHECK-P8-NEXT:    mtvsrwa f6, r4
+; CHECK-P8-NEXT:    clrldi r4, r8, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f7, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 48, 48
+; CHECK-P8-NEXT:    mtvsrwa f9, r4
+; CHECK-P8-NEXT:    rldicl r4, r8, 32, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f10, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 16, 48
+; CHECK-P8-NEXT:    mtvsrwa f11, r4
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    mfvsrd r4, f8
+; CHECK-P8-NEXT:    mtvsrwa f8, r6
+; CHECK-P8-NEXT:    clrldi r6, r4, 48
+; CHECK-P8-NEXT:    xscvsxddp f3, f3
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    xscvsxddp f4, f4
+; CHECK-P8-NEXT:    mtvsrwa f12, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 48, 48
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    xscvsxddp f0, f0
+; CHECK-P8-NEXT:    mtvsrwa f13, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 32, 48
+; CHECK-P8-NEXT:    rldicl r4, r4, 16, 48
+; CHECK-P8-NEXT:    xscvsxddp f1, f1
+; CHECK-P8-NEXT:    extsh r6, r6
+; CHECK-P8-NEXT:    extsh r4, r4
+; CHECK-P8-NEXT:    xscvsxddp f5, f5
+; CHECK-P8-NEXT:    xscvsxddp f2, f2
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    mtvsrwa v2, r6
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    mtvsrwa v3, r4
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    xscvsxddp f6, f6
+; CHECK-P8-NEXT:    xscvsxddp f7, f7
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xscvsxddp f9, f9
+; CHECK-P8-NEXT:    xscvsxddp f10, f10
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs5
+; CHECK-P8-NEXT:    xscvsxddp f11, f11
+; CHECK-P8-NEXT:    xxswapd vs2, vs3
+; CHECK-P8-NEXT:    xscvsxddp f8, f8
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xscvsxddp f12, f12
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xscvsxddp f13, f13
+; CHECK-P8-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P8-NEXT:    xscvsxddp f4, v2
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xscvsxddp f31, v3
+; CHECK-P8-NEXT:    xxmrghd vs5, vs10, vs9
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xxmrghd vs6, vs8, vs11
+; CHECK-P8-NEXT:    xxmrghd vs7, vs13, vs12
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs0, vs6
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    li r6, 64
+; CHECK-P8-NEXT:    xxmrghd vs2, vs31, vs4
+; CHECK-P8-NEXT:    xxswapd vs4, vs5
+; CHECK-P8-NEXT:    xxswapd vs5, vs7
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs5, 0, r3
+; CHECK-P8-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addis r5, r2, .LCPI7_0@toc@ha
+; CHECK-P9-NEXT:    addis r6, r2, .LCPI7_1@toc@ha
+; CHECK-P9-NEXT:    addis r7, r2, .LCPI7_2@toc@ha
+; CHECK-P9-NEXT:    addis r8, r2, .LCPI7_3@toc@ha
+; CHECK-P9-NEXT:    lxv v0, 0(r4)
+; CHECK-P9-NEXT:    lxv v1, 16(r4)
+; CHECK-P9-NEXT:    addi r5, r5, .LCPI7_0@toc@l
+; CHECK-P9-NEXT:    addi r6, r6, .LCPI7_1@toc@l
+; CHECK-P9-NEXT:    addi r7, r7, .LCPI7_2@toc@l
+; CHECK-P9-NEXT:    addi r8, r8, .LCPI7_3@toc@l
+; CHECK-P9-NEXT:    lxvx v2, 0, r5
+; CHECK-P9-NEXT:    lxvx v3, 0, r6
+; CHECK-P9-NEXT:    lxvx v4, 0, r7
+; CHECK-P9-NEXT:    lxvx v5, 0, r8
+; CHECK-P9-NEXT:    vperm v6, v0, v0, v2
+; CHECK-P9-NEXT:    vperm v7, v0, v0, v3
+; CHECK-P9-NEXT:    vperm v8, v0, v0, v4
+; CHECK-P9-NEXT:    vperm v0, v0, v0, v5
+; CHECK-P9-NEXT:    vperm v2, v1, v1, v2
+; CHECK-P9-NEXT:    vperm v3, v1, v1, v3
+; CHECK-P9-NEXT:    vperm v4, v1, v1, v4
+; CHECK-P9-NEXT:    vperm v5, v1, v1, v5
+; CHECK-P9-NEXT:    vextsh2d v1, v6
+; CHECK-P9-NEXT:    vextsh2d v6, v7
+; CHECK-P9-NEXT:    vextsh2d v7, v8
+; CHECK-P9-NEXT:    vextsh2d v0, v0
+; CHECK-P9-NEXT:    vextsh2d v2, v2
+; CHECK-P9-NEXT:    vextsh2d v3, v3
+; CHECK-P9-NEXT:    vextsh2d v4, v4
+; CHECK-P9-NEXT:    vextsh2d v5, v5
+; CHECK-P9-NEXT:    xvcvsxddp vs0, v1
+; CHECK-P9-NEXT:    xvcvsxddp vs1, v6
+; CHECK-P9-NEXT:    xvcvsxddp vs2, v7
+; CHECK-P9-NEXT:    xvcvsxddp vs3, v0
+; CHECK-P9-NEXT:    xvcvsxddp vs4, v2
+; CHECK-P9-NEXT:    xvcvsxddp vs5, v3
+; CHECK-P9-NEXT:    xvcvsxddp vs6, v4
+; CHECK-P9-NEXT:    xvcvsxddp vs7, v5
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs7, 112(r3)
+; CHECK-P9-NEXT:    stxv vs6, 96(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    stxv vs4, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI7_0@toc@ha
+; CHECK-BE-NEXT:    addis r6, r2, .LCPI7_1@toc@ha
+; CHECK-BE-NEXT:    addis r7, r2, .LCPI7_2@toc@ha
+; CHECK-BE-NEXT:    addis r8, r2, .LCPI7_3@toc@ha
+; CHECK-BE-NEXT:    lxv v2, 16(r4)
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
+; CHECK-BE-NEXT:    xxlxor v6, v6, v6
+; CHECK-BE-NEXT:    addi r5, r5, .LCPI7_0@toc@l
+; CHECK-BE-NEXT:    addi r6, r6, .LCPI7_1@toc@l
+; CHECK-BE-NEXT:    addi r7, r7, .LCPI7_2@toc@l
+; CHECK-BE-NEXT:    addi r8, r8, .LCPI7_3@toc@l
+; CHECK-BE-NEXT:    lxvx v4, 0, r5
+; CHECK-BE-NEXT:    lxvx v5, 0, r6
+; CHECK-BE-NEXT:    lxvx v0, 0, r7
+; CHECK-BE-NEXT:    lxvx v1, 0, r8
+; CHECK-BE-NEXT:    vperm v7, v6, v3, v4
+; CHECK-BE-NEXT:    vperm v8, v6, v3, v5
+; CHECK-BE-NEXT:    vperm v4, v6, v2, v4
+; CHECK-BE-NEXT:    vperm v5, v6, v2, v5
+; CHECK-BE-NEXT:    vperm v6, v3, v3, v0
+; CHECK-BE-NEXT:    vperm v3, v3, v3, v1
+; CHECK-BE-NEXT:    vperm v0, v2, v2, v0
+; CHECK-BE-NEXT:    vperm v2, v2, v2, v1
+; CHECK-BE-NEXT:    vextsh2d v1, v7
+; CHECK-BE-NEXT:    vextsh2d v7, v8
+; CHECK-BE-NEXT:    vextsh2d v4, v4
+; CHECK-BE-NEXT:    vextsh2d v5, v5
+; CHECK-BE-NEXT:    vextsh2d v6, v6
+; CHECK-BE-NEXT:    vextsh2d v3, v3
+; CHECK-BE-NEXT:    vextsh2d v0, v0
+; CHECK-BE-NEXT:    vextsh2d v2, v2
+; CHECK-BE-NEXT:    xvcvsxddp vs0, v1
+; CHECK-BE-NEXT:    xvcvsxddp vs1, v7
+; CHECK-BE-NEXT:    xvcvsxddp vs2, v4
+; CHECK-BE-NEXT:    xvcvsxddp vs3, v5
+; CHECK-BE-NEXT:    xvcvsxddp vs4, v6
+; CHECK-BE-NEXT:    xvcvsxddp vs5, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs6, v0
+; CHECK-BE-NEXT:    xvcvsxddp vs7, v2
+; CHECK-BE-NEXT:    stxv vs3, 112(r3)
+; CHECK-BE-NEXT:    stxv vs2, 80(r3)
+; CHECK-BE-NEXT:    stxv vs1, 48(r3)
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs7, 96(r3)
+; CHECK-BE-NEXT:    stxv vs6, 64(r3)
+; CHECK-BE-NEXT:    stxv vs5, 32(r3)
+; CHECK-BE-NEXT:    stxv vs4, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i16>, <16 x i16>* %0, align 32
+  %1 = sitofp <16 x i16> %a to <16 x double>
+  store <16 x double> %1, <16 x double>* %agg.result, align 128
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll b/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..39b88bab3986d5c119d9c2bd00aa23e8202e2a7d
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll
@@ -0,0 +1,518 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define <2 x double> @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xxmrglw v2, v2, v2
+; CHECK-P8-NEXT:    xvcvuxwdp v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxmrglw v2, v2, v2
+; CHECK-P9-NEXT:    xvcvuxwdp v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xxmrghw v2, vs0, vs0
+; CHECK-BE-NEXT:    xvcvuxwdp v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x i32>
+  %1 = uitofp <2 x i32> %0 to <2 x double>
+  ret <2 x double> %1
+}
+
+define void @test4elt(<4 x double>* noalias nocapture sret %agg.result, <4 x i32> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxmrglw v3, v2, v2
+; CHECK-P8-NEXT:    xxmrghw v2, v2, v2
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xvcvuxwdp vs0, v3
+; CHECK-P8-NEXT:    xvcvuxwdp vs1, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxmrglw v3, v2, v2
+; CHECK-P9-NEXT:    xxmrghw v2, v2, v2
+; CHECK-P9-NEXT:    xvcvuxwdp vs0, v3
+; CHECK-P9-NEXT:    xvcvuxwdp vs1, v2
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxmrghw v3, v2, v2
+; CHECK-BE-NEXT:    xxmrglw v2, v2, v2
+; CHECK-BE-NEXT:    xvcvuxwdp vs0, v3
+; CHECK-BE-NEXT:    xvcvuxwdp vs1, v2
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = uitofp <4 x i32> %a to <4 x double>
+  store <4 x double> %0, <4 x double>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt(<8 x double>* noalias nocapture sret %agg.result, <8 x i32>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xxmrglw v5, v3, v3
+; CHECK-P8-NEXT:    xxmrghw v3, v3, v3
+; CHECK-P8-NEXT:    xxmrglw v4, v2, v2
+; CHECK-P8-NEXT:    xxmrghw v2, v2, v2
+; CHECK-P8-NEXT:    xvcvuxwdp vs2, v5
+; CHECK-P8-NEXT:    xvcvuxwdp vs0, v4
+; CHECK-P8-NEXT:    xvcvuxwdp vs1, v2
+; CHECK-P8-NEXT:    xvcvuxwdp vs3, v3
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs2, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xxmrglw v2, vs1, vs1
+; CHECK-P9-NEXT:    xxmrghw v3, vs1, vs1
+; CHECK-P9-NEXT:    xxmrglw v4, vs0, vs0
+; CHECK-P9-NEXT:    xxmrghw v5, vs0, vs0
+; CHECK-P9-NEXT:    xvcvuxwdp vs0, v2
+; CHECK-P9-NEXT:    xvcvuxwdp vs1, v3
+; CHECK-P9-NEXT:    xvcvuxwdp vs2, v4
+; CHECK-P9-NEXT:    xvcvuxwdp vs3, v5
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xxmrghw v2, vs1, vs1
+; CHECK-BE-NEXT:    xxmrglw v3, vs1, vs1
+; CHECK-BE-NEXT:    xxmrghw v4, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw v5, vs0, vs0
+; CHECK-BE-NEXT:    xvcvuxwdp vs0, v2
+; CHECK-BE-NEXT:    xvcvuxwdp vs1, v3
+; CHECK-BE-NEXT:    xvcvuxwdp vs2, v4
+; CHECK-BE-NEXT:    xvcvuxwdp vs3, v5
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x i32>, <8 x i32>* %0, align 32
+  %1 = uitofp <8 x i32> %a to <8 x double>
+  store <8 x double> %1, <8 x double>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt(<16 x double>* noalias nocapture sret %agg.result, <16 x i32>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    li r7, 32
+; CHECK-P8-NEXT:    li r8, 64
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r6
+; CHECK-P8-NEXT:    lvx v0, r4, r7
+; CHECK-P8-NEXT:    xxmrglw v4, v2, v2
+; CHECK-P8-NEXT:    xxmrghw v5, v3, v3
+; CHECK-P8-NEXT:    xxmrghw v2, v2, v2
+; CHECK-P8-NEXT:    xxmrglw v3, v3, v3
+; CHECK-P8-NEXT:    xvcvuxwdp vs0, v4
+; CHECK-P8-NEXT:    lvx v4, 0, r4
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    xvcvuxwdp vs1, v5
+; CHECK-P8-NEXT:    xxmrghw v5, v0, v0
+; CHECK-P8-NEXT:    xxmrglw v0, v0, v0
+; CHECK-P8-NEXT:    xvcvuxwdp vs2, v2
+; CHECK-P8-NEXT:    xxmrglw v2, v4, v4
+; CHECK-P8-NEXT:    xvcvuxwdp vs3, v3
+; CHECK-P8-NEXT:    xxmrghw v3, v4, v4
+; CHECK-P8-NEXT:    xvcvuxwdp vs4, v5
+; CHECK-P8-NEXT:    xvcvuxwdp vs5, v0
+; CHECK-P8-NEXT:    xvcvuxwdp vs6, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xvcvuxwdp vs7, v3
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xxswapd vs1, vs5
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs5, vs6
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs3, vs7
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r8
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    lxv vs2, 48(r4)
+; CHECK-P9-NEXT:    lxv vs3, 32(r4)
+; CHECK-P9-NEXT:    xxmrglw v2, vs1, vs1
+; CHECK-P9-NEXT:    xxmrghw v3, vs1, vs1
+; CHECK-P9-NEXT:    xxmrglw v4, vs0, vs0
+; CHECK-P9-NEXT:    xxmrghw v5, vs0, vs0
+; CHECK-P9-NEXT:    xxmrglw v0, vs3, vs3
+; CHECK-P9-NEXT:    xxmrghw v1, vs3, vs3
+; CHECK-P9-NEXT:    xxmrglw v6, vs2, vs2
+; CHECK-P9-NEXT:    xxmrghw v7, vs2, vs2
+; CHECK-P9-NEXT:    xvcvuxwdp vs0, v2
+; CHECK-P9-NEXT:    xvcvuxwdp vs1, v3
+; CHECK-P9-NEXT:    xvcvuxwdp vs2, v4
+; CHECK-P9-NEXT:    xvcvuxwdp vs3, v5
+; CHECK-P9-NEXT:    xvcvuxwdp vs4, v0
+; CHECK-P9-NEXT:    xvcvuxwdp vs5, v1
+; CHECK-P9-NEXT:    xvcvuxwdp vs6, v6
+; CHECK-P9-NEXT:    xvcvuxwdp vs7, v7
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs7, 112(r3)
+; CHECK-P9-NEXT:    stxv vs6, 96(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    stxv vs4, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    lxv vs2, 48(r4)
+; CHECK-BE-NEXT:    lxv vs3, 32(r4)
+; CHECK-BE-NEXT:    xxmrghw v2, vs1, vs1
+; CHECK-BE-NEXT:    xxmrglw v3, vs1, vs1
+; CHECK-BE-NEXT:    xxmrghw v4, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw v5, vs0, vs0
+; CHECK-BE-NEXT:    xxmrghw v0, vs3, vs3
+; CHECK-BE-NEXT:    xxmrglw v1, vs3, vs3
+; CHECK-BE-NEXT:    xxmrghw v6, vs2, vs2
+; CHECK-BE-NEXT:    xxmrglw v7, vs2, vs2
+; CHECK-BE-NEXT:    xvcvuxwdp vs0, v2
+; CHECK-BE-NEXT:    xvcvuxwdp vs1, v3
+; CHECK-BE-NEXT:    xvcvuxwdp vs2, v4
+; CHECK-BE-NEXT:    xvcvuxwdp vs3, v5
+; CHECK-BE-NEXT:    xvcvuxwdp vs4, v0
+; CHECK-BE-NEXT:    xvcvuxwdp vs5, v1
+; CHECK-BE-NEXT:    xvcvuxwdp vs6, v6
+; CHECK-BE-NEXT:    xvcvuxwdp vs7, v7
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i32>, <16 x i32>* %0, align 64
+  %1 = uitofp <16 x i32> %a to <16 x double>
+  store <16 x double> %1, <16 x double>* %agg.result, align 128
+  ret void
+}
+
+define <2 x double> @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xxmrglw v2, v2, v2
+; CHECK-P8-NEXT:    xvcvsxwdp v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xxmrglw v2, v2, v2
+; CHECK-P9-NEXT:    xvcvsxwdp v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xxmrghw v2, vs0, vs0
+; CHECK-BE-NEXT:    xvcvsxwdp v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x i32>
+  %1 = sitofp <2 x i32> %0 to <2 x double>
+  ret <2 x double> %1
+}
+
+define void @test4elt_signed(<4 x double>* noalias nocapture sret %agg.result, <4 x i32> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxmrglw v3, v2, v2
+; CHECK-P8-NEXT:    xxmrghw v2, v2, v2
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xvcvsxwdp vs0, v3
+; CHECK-P8-NEXT:    xvcvsxwdp vs1, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxmrglw v3, v2, v2
+; CHECK-P9-NEXT:    xxmrghw v2, v2, v2
+; CHECK-P9-NEXT:    xvcvsxwdp vs0, v3
+; CHECK-P9-NEXT:    xvcvsxwdp vs1, v2
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxmrghw v3, v2, v2
+; CHECK-BE-NEXT:    xxmrglw v2, v2, v2
+; CHECK-BE-NEXT:    xvcvsxwdp vs0, v3
+; CHECK-BE-NEXT:    xvcvsxwdp vs1, v2
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = sitofp <4 x i32> %a to <4 x double>
+  store <4 x double> %0, <4 x double>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt_signed(<8 x double>* noalias nocapture sret %agg.result, <8 x i32>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xxmrglw v5, v3, v3
+; CHECK-P8-NEXT:    xxmrghw v3, v3, v3
+; CHECK-P8-NEXT:    xxmrglw v4, v2, v2
+; CHECK-P8-NEXT:    xxmrghw v2, v2, v2
+; CHECK-P8-NEXT:    xvcvsxwdp vs2, v5
+; CHECK-P8-NEXT:    xvcvsxwdp vs0, v4
+; CHECK-P8-NEXT:    xvcvsxwdp vs1, v2
+; CHECK-P8-NEXT:    xvcvsxwdp vs3, v3
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs2, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xxmrglw v2, vs1, vs1
+; CHECK-P9-NEXT:    xxmrghw v3, vs1, vs1
+; CHECK-P9-NEXT:    xxmrglw v4, vs0, vs0
+; CHECK-P9-NEXT:    xxmrghw v5, vs0, vs0
+; CHECK-P9-NEXT:    xvcvsxwdp vs0, v2
+; CHECK-P9-NEXT:    xvcvsxwdp vs1, v3
+; CHECK-P9-NEXT:    xvcvsxwdp vs2, v4
+; CHECK-P9-NEXT:    xvcvsxwdp vs3, v5
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xxmrghw v2, vs1, vs1
+; CHECK-BE-NEXT:    xxmrglw v3, vs1, vs1
+; CHECK-BE-NEXT:    xxmrghw v4, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw v5, vs0, vs0
+; CHECK-BE-NEXT:    xvcvsxwdp vs0, v2
+; CHECK-BE-NEXT:    xvcvsxwdp vs1, v3
+; CHECK-BE-NEXT:    xvcvsxwdp vs2, v4
+; CHECK-BE-NEXT:    xvcvsxwdp vs3, v5
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x i32>, <8 x i32>* %0, align 32
+  %1 = sitofp <8 x i32> %a to <8 x double>
+  store <8 x double> %1, <8 x double>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt_signed(<16 x double>* noalias nocapture sret %agg.result, <16 x i32>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    li r7, 32
+; CHECK-P8-NEXT:    li r8, 64
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r6
+; CHECK-P8-NEXT:    lvx v0, r4, r7
+; CHECK-P8-NEXT:    xxmrglw v4, v2, v2
+; CHECK-P8-NEXT:    xxmrghw v5, v3, v3
+; CHECK-P8-NEXT:    xxmrghw v2, v2, v2
+; CHECK-P8-NEXT:    xxmrglw v3, v3, v3
+; CHECK-P8-NEXT:    xvcvsxwdp vs0, v4
+; CHECK-P8-NEXT:    lvx v4, 0, r4
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    xvcvsxwdp vs1, v5
+; CHECK-P8-NEXT:    xxmrghw v5, v0, v0
+; CHECK-P8-NEXT:    xxmrglw v0, v0, v0
+; CHECK-P8-NEXT:    xvcvsxwdp vs2, v2
+; CHECK-P8-NEXT:    xxmrglw v2, v4, v4
+; CHECK-P8-NEXT:    xvcvsxwdp vs3, v3
+; CHECK-P8-NEXT:    xxmrghw v3, v4, v4
+; CHECK-P8-NEXT:    xvcvsxwdp vs4, v5
+; CHECK-P8-NEXT:    xvcvsxwdp vs5, v0
+; CHECK-P8-NEXT:    xvcvsxwdp vs6, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xvcvsxwdp vs7, v3
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xxswapd vs1, vs5
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs5, vs6
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs3, vs7
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r8
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    lxv vs2, 48(r4)
+; CHECK-P9-NEXT:    lxv vs3, 32(r4)
+; CHECK-P9-NEXT:    xxmrglw v2, vs1, vs1
+; CHECK-P9-NEXT:    xxmrghw v3, vs1, vs1
+; CHECK-P9-NEXT:    xxmrglw v4, vs0, vs0
+; CHECK-P9-NEXT:    xxmrghw v5, vs0, vs0
+; CHECK-P9-NEXT:    xxmrglw v0, vs3, vs3
+; CHECK-P9-NEXT:    xxmrghw v1, vs3, vs3
+; CHECK-P9-NEXT:    xxmrglw v6, vs2, vs2
+; CHECK-P9-NEXT:    xxmrghw v7, vs2, vs2
+; CHECK-P9-NEXT:    xvcvsxwdp vs0, v2
+; CHECK-P9-NEXT:    xvcvsxwdp vs1, v3
+; CHECK-P9-NEXT:    xvcvsxwdp vs2, v4
+; CHECK-P9-NEXT:    xvcvsxwdp vs3, v5
+; CHECK-P9-NEXT:    xvcvsxwdp vs4, v0
+; CHECK-P9-NEXT:    xvcvsxwdp vs5, v1
+; CHECK-P9-NEXT:    xvcvsxwdp vs6, v6
+; CHECK-P9-NEXT:    xvcvsxwdp vs7, v7
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs7, 112(r3)
+; CHECK-P9-NEXT:    stxv vs6, 96(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    stxv vs4, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    lxv vs2, 48(r4)
+; CHECK-BE-NEXT:    lxv vs3, 32(r4)
+; CHECK-BE-NEXT:    xxmrghw v2, vs1, vs1
+; CHECK-BE-NEXT:    xxmrglw v3, vs1, vs1
+; CHECK-BE-NEXT:    xxmrghw v4, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw v5, vs0, vs0
+; CHECK-BE-NEXT:    xxmrghw v0, vs3, vs3
+; CHECK-BE-NEXT:    xxmrglw v1, vs3, vs3
+; CHECK-BE-NEXT:    xxmrghw v6, vs2, vs2
+; CHECK-BE-NEXT:    xxmrglw v7, vs2, vs2
+; CHECK-BE-NEXT:    xvcvsxwdp vs0, v2
+; CHECK-BE-NEXT:    xvcvsxwdp vs1, v3
+; CHECK-BE-NEXT:    xvcvsxwdp vs2, v4
+; CHECK-BE-NEXT:    xvcvsxwdp vs3, v5
+; CHECK-BE-NEXT:    xvcvsxwdp vs4, v0
+; CHECK-BE-NEXT:    xvcvsxwdp vs5, v1
+; CHECK-BE-NEXT:    xvcvsxwdp vs6, v6
+; CHECK-BE-NEXT:    xvcvsxwdp vs7, v7
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i32>, <16 x i32>* %0, align 64
+  %1 = sitofp <16 x i32> %a to <16 x double>
+  store <16 x double> %1, <16 x double>* %agg.result, align 128
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll b/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cae556e27c630a9e7763fdbe5cb8445484adc81b
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll
@@ -0,0 +1,844 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i64 @test2elt(<2 x i64> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    xxlor vs1, v2, v2
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvdpspn vs1, f1
+; CHECK-P8-NEXT:    xscvdpspn vs0, f0
+; CHECK-P8-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P8-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxswapd vs0, v2
+; CHECK-P9-NEXT:    xxlor vs1, v2, v2
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvdpspn vs1, f1
+; CHECK-P9-NEXT:    xscvdpspn vs0, f0
+; CHECK-P9-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxswapd vs0, v2
+; CHECK-BE-NEXT:    xxlor vs1, v2, v2
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvdpspn v2, f1
+; CHECK-BE-NEXT:    xscvdpspn v3, f0
+; CHECK-BE-NEXT:    vmrghw v2, v2, v3
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = uitofp <2 x i64> %a to <2 x float>
+  %1 = bitcast <2 x float> %0 to i64
+  ret i64 %1
+}
+
+define <4 x float> @test4elt(<4 x i64>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs3, vs1
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xxswapd vs2, vs0
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs2, vs1
+; CHECK-P9-NEXT:    xxswapd vs3, vs0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P9-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs2
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxswapd vs2, vs1
+; CHECK-BE-NEXT:    xxswapd vs3, vs0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    vmrgew v2, v2, v3
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x i64>, <4 x i64>* %0, align 32
+  %1 = uitofp <4 x i64> %a to <4 x float>
+  ret <4 x float> %1
+}
+
+define void @test8elt(<8 x float>* noalias nocapture sret %agg.result, <8 x i64>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r5
+; CHECK-P8-NEXT:    xxswapd vs7, vs3
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xxswapd vs4, vs0
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xxswapd vs5, vs1
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xxswapd vs6, vs2
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    xscvuxdsp f4, f4
+; CHECK-P8-NEXT:    xscvuxdsp f5, f5
+; CHECK-P8-NEXT:    xscvuxdsp f6, f6
+; CHECK-P8-NEXT:    xscvuxdsp f7, f7
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    xxmrghd vs0, vs6, vs7
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs0
+; CHECK-P8-NEXT:    vmrgew v2, v4, v2
+; CHECK-P8-NEXT:    vmrgew v3, v5, v3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    xxswapd vs4, vs3
+; CHECK-P9-NEXT:    xxswapd vs5, vs2
+; CHECK-P9-NEXT:    xxswapd vs6, vs1
+; CHECK-P9-NEXT:    xxswapd vs7, vs0
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f4, f4
+; CHECK-P9-NEXT:    xscvuxdsp f5, f5
+; CHECK-P9-NEXT:    xscvuxdsp f6, f6
+; CHECK-P9-NEXT:    xscvuxdsp f7, f7
+; CHECK-P9-NEXT:    xxmrghd vs2, vs2, vs3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P9-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs4
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs3
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 32(r4)
+; CHECK-BE-NEXT:    lxv vs1, 48(r4)
+; CHECK-BE-NEXT:    lxv vs2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs3, 16(r4)
+; CHECK-BE-NEXT:    xxswapd vs4, vs3
+; CHECK-BE-NEXT:    xxswapd vs5, vs2
+; CHECK-BE-NEXT:    xxswapd vs6, vs1
+; CHECK-BE-NEXT:    xxswapd vs7, vs0
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f4, f4
+; CHECK-BE-NEXT:    xscvuxdsp f5, f5
+; CHECK-BE-NEXT:    xscvuxdsp f6, f6
+; CHECK-BE-NEXT:    xscvuxdsp f7, f7
+; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-BE-NEXT:    xxmrghd vs3, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs1, vs7, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs3
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs1
+; CHECK-BE-NEXT:    vmrgew v2, v2, v3
+; CHECK-BE-NEXT:    vmrgew v3, v4, v5
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x i64>, <8 x i64>* %0, align 64
+  %1 = uitofp <8 x i64> %a to <8 x float>
+  store <8 x float> %1, <8 x float>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt(<16 x float>* noalias nocapture sret %agg.result, <16 x i64>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r7, 64
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    lxvd2x vs11, 0, r4
+; CHECK-P8-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    lxvd2x vs8, r4, r7
+; CHECK-P8-NEXT:    li r7, 80
+; CHECK-P8-NEXT:    lxvd2x vs6, r4, r5
+; CHECK-P8-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    lxvd2x vs7, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    li r7, 96
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r7
+; CHECK-P8-NEXT:    li r7, 112
+; CHECK-P8-NEXT:    xscvuxdsp f30, f11
+; CHECK-P8-NEXT:    xxswapd vs11, vs11
+; CHECK-P8-NEXT:    lxvd2x vs4, r4, r7
+; CHECK-P8-NEXT:    li r7, 16
+; CHECK-P8-NEXT:    xscvuxdsp f0, f6
+; CHECK-P8-NEXT:    xxswapd vs6, vs6
+; CHECK-P8-NEXT:    xscvuxdsp f1, f7
+; CHECK-P8-NEXT:    lxvd2x vs9, r4, r7
+; CHECK-P8-NEXT:    xxswapd vs7, vs7
+; CHECK-P8-NEXT:    xscvuxdsp f5, f8
+; CHECK-P8-NEXT:    xxswapd vs8, vs8
+; CHECK-P8-NEXT:    xscvuxdsp f10, f2
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvuxdsp f12, f3
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xscvuxdsp f13, f4
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xscvuxdsp f31, f9
+; CHECK-P8-NEXT:    xxswapd vs9, vs9
+; CHECK-P8-NEXT:    xscvuxdsp f6, f6
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xscvuxdsp f7, f7
+; CHECK-P8-NEXT:    xscvuxdsp f8, f8
+; CHECK-P8-NEXT:    xxmrghd vs5, vs10, vs5
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs10, vs13, vs12
+; CHECK-P8-NEXT:    xscvuxdsp f4, f4
+; CHECK-P8-NEXT:    xscvuxdsp f1, f9
+; CHECK-P8-NEXT:    xscvuxdsp f9, f11
+; CHECK-P8-NEXT:    xxmrghd vs11, vs31, vs30
+; CHECK-P8-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs7, vs6
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs8
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs5
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs10
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs11
+; CHECK-P8-NEXT:    xvcvdpsp v0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs1, vs9
+; CHECK-P8-NEXT:    xvcvdpsp v1, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v6, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v7, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v0, v2
+; CHECK-P8-NEXT:    vmrgew v3, v1, v3
+; CHECK-P8-NEXT:    vmrgew v4, v6, v4
+; CHECK-P8-NEXT:    vmrgew v5, v7, v5
+; CHECK-P8-NEXT:    stvx v2, r3, r7
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    stvx v4, r3, r6
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs4, 48(r4)
+; CHECK-P9-NEXT:    lxv vs5, 32(r4)
+; CHECK-P9-NEXT:    lxv vs6, 16(r4)
+; CHECK-P9-NEXT:    lxv vs7, 0(r4)
+; CHECK-P9-NEXT:    lxv vs8, 112(r4)
+; CHECK-P9-NEXT:    lxv vs9, 96(r4)
+; CHECK-P9-NEXT:    lxv vs10, 80(r4)
+; CHECK-P9-NEXT:    lxv vs11, 64(r4)
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxswapd vs0, vs7
+; CHECK-P9-NEXT:    xxswapd vs1, vs6
+; CHECK-P9-NEXT:    xxswapd vs2, vs5
+; CHECK-P9-NEXT:    xxswapd vs3, vs4
+; CHECK-P9-NEXT:    xxswapd vs12, vs11
+; CHECK-P9-NEXT:    xxswapd vs13, vs10
+; CHECK-P9-NEXT:    xxswapd vs31, vs9
+; CHECK-P9-NEXT:    xxswapd vs30, vs8
+; CHECK-P9-NEXT:    xscvuxdsp f7, f7
+; CHECK-P9-NEXT:    xscvuxdsp f6, f6
+; CHECK-P9-NEXT:    xscvuxdsp f5, f5
+; CHECK-P9-NEXT:    xscvuxdsp f4, f4
+; CHECK-P9-NEXT:    xscvuxdsp f11, f11
+; CHECK-P9-NEXT:    xscvuxdsp f10, f10
+; CHECK-P9-NEXT:    xscvuxdsp f9, f9
+; CHECK-P9-NEXT:    xscvuxdsp f8, f8
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xscvuxdsp f12, f12
+; CHECK-P9-NEXT:    xscvuxdsp f13, f13
+; CHECK-P9-NEXT:    xscvuxdsp f31, f31
+; CHECK-P9-NEXT:    xscvuxdsp f30, f30
+; CHECK-P9-NEXT:    xxmrghd vs6, vs6, vs7
+; CHECK-P9-NEXT:    xxmrghd vs4, vs4, vs5
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs10, vs11
+; CHECK-P9-NEXT:    xxmrghd vs3, vs8, vs9
+; CHECK-P9-NEXT:    xxmrghd vs5, vs13, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs4
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v0, vs5
+; CHECK-P9-NEXT:    xvcvdpsp v1, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v6, vs7
+; CHECK-P9-NEXT:    xvcvdpsp v7, vs3
+; CHECK-P9-NEXT:    vmrgew v2, v2, v4
+; CHECK-P9-NEXT:    vmrgew v3, v3, v5
+; CHECK-P9-NEXT:    vmrgew v4, v1, v0
+; CHECK-P9-NEXT:    vmrgew v5, v7, v6
+; CHECK-P9-NEXT:    stxv v4, 32(r3)
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    stxv v5, 48(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 32(r4)
+; CHECK-BE-NEXT:    lxv vs3, 48(r4)
+; CHECK-BE-NEXT:    lxv vs4, 0(r4)
+; CHECK-BE-NEXT:    lxv vs5, 16(r4)
+; CHECK-BE-NEXT:    lxv vs6, 96(r4)
+; CHECK-BE-NEXT:    lxv vs7, 112(r4)
+; CHECK-BE-NEXT:    lxv vs8, 64(r4)
+; CHECK-BE-NEXT:    lxv vs9, 80(r4)
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxswapd vs0, vs5
+; CHECK-BE-NEXT:    xxswapd vs1, vs4
+; CHECK-BE-NEXT:    xxswapd vs10, vs3
+; CHECK-BE-NEXT:    xxswapd vs11, vs2
+; CHECK-BE-NEXT:    xxswapd vs12, vs9
+; CHECK-BE-NEXT:    xxswapd vs13, vs8
+; CHECK-BE-NEXT:    xxswapd vs31, vs7
+; CHECK-BE-NEXT:    xxswapd vs30, vs6
+; CHECK-BE-NEXT:    xscvuxdsp f5, f5
+; CHECK-BE-NEXT:    xscvuxdsp f4, f4
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f9, f9
+; CHECK-BE-NEXT:    xscvuxdsp f8, f8
+; CHECK-BE-NEXT:    xscvuxdsp f7, f7
+; CHECK-BE-NEXT:    xscvuxdsp f6, f6
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f10, f10
+; CHECK-BE-NEXT:    xscvuxdsp f11, f11
+; CHECK-BE-NEXT:    xscvuxdsp f12, f12
+; CHECK-BE-NEXT:    xscvuxdsp f13, f13
+; CHECK-BE-NEXT:    xscvuxdsp f31, f31
+; CHECK-BE-NEXT:    xscvuxdsp f30, f30
+; CHECK-BE-NEXT:    xxmrghd vs4, vs4, vs5
+; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs3
+; CHECK-BE-NEXT:    xxmrghd vs3, vs8, vs9
+; CHECK-BE-NEXT:    xxmrghd vs5, vs6, vs7
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs11, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs4
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v0, vs3
+; CHECK-BE-NEXT:    xvcvdpsp v6, vs5
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v1, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v7, vs7
+; CHECK-BE-NEXT:    vmrgew v2, v2, v4
+; CHECK-BE-NEXT:    vmrgew v3, v3, v5
+; CHECK-BE-NEXT:    vmrgew v4, v0, v1
+; CHECK-BE-NEXT:    vmrgew v5, v6, v7
+; CHECK-BE-NEXT:    stxv v5, 48(r3)
+; CHECK-BE-NEXT:    stxv v4, 32(r3)
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i64>, <16 x i64>* %0, align 128
+  %1 = uitofp <16 x i64> %a to <16 x float>
+  store <16 x float> %1, <16 x float>* %agg.result, align 64
+  ret void
+}
+
+define i64 @test2elt_signed(<2 x i64> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    xxlor vs1, v2, v2
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xscvdpspn vs1, f1
+; CHECK-P8-NEXT:    xscvdpspn vs0, f0
+; CHECK-P8-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P8-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xxswapd vs0, v2
+; CHECK-P9-NEXT:    xxlor vs1, v2, v2
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvdpspn vs1, f1
+; CHECK-P9-NEXT:    xscvdpspn vs0, f0
+; CHECK-P9-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P9-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxswapd vs0, v2
+; CHECK-BE-NEXT:    xxlor vs1, v2, v2
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvdpspn v2, f1
+; CHECK-BE-NEXT:    xscvdpspn v3, f0
+; CHECK-BE-NEXT:    vmrghw v2, v2, v3
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = sitofp <2 x i64> %a to <2 x float>
+  %1 = bitcast <2 x float> %0 to i64
+  ret i64 %1
+}
+
+define <4 x float> @test4elt_signed(<4 x i64>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    lxvd2x vs0, r3, r4
+; CHECK-P8-NEXT:    xxswapd vs3, vs1
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xxswapd vs2, vs0
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r3)
+; CHECK-P9-NEXT:    lxv vs1, 0(r3)
+; CHECK-P9-NEXT:    xxswapd vs2, vs1
+; CHECK-P9-NEXT:    xxswapd vs3, vs0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P9-NEXT:    xxmrghd vs2, vs3, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs2
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxswapd vs2, vs1
+; CHECK-BE-NEXT:    xxswapd vs3, vs0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    vmrgew v2, v2, v3
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x i64>, <4 x i64>* %0, align 32
+  %1 = sitofp <4 x i64> %a to <4 x float>
+  ret <4 x float> %1
+}
+
+define void @test8elt_signed(<8 x float>* noalias nocapture sret %agg.result, <8 x i64>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r5
+; CHECK-P8-NEXT:    xxswapd vs7, vs3
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xxswapd vs4, vs0
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xxswapd vs5, vs1
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xxswapd vs6, vs2
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    xscvsxdsp f4, f4
+; CHECK-P8-NEXT:    xscvsxdsp f5, f5
+; CHECK-P8-NEXT:    xscvsxdsp f6, f6
+; CHECK-P8-NEXT:    xscvsxdsp f7, f7
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    xxmrghd vs0, vs6, vs7
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs0
+; CHECK-P8-NEXT:    vmrgew v2, v4, v2
+; CHECK-P8-NEXT:    vmrgew v3, v5, v3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    xxswapd vs4, vs3
+; CHECK-P9-NEXT:    xxswapd vs5, vs2
+; CHECK-P9-NEXT:    xxswapd vs6, vs1
+; CHECK-P9-NEXT:    xxswapd vs7, vs0
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f4, f4
+; CHECK-P9-NEXT:    xscvsxdsp f5, f5
+; CHECK-P9-NEXT:    xscvsxdsp f6, f6
+; CHECK-P9-NEXT:    xscvsxdsp f7, f7
+; CHECK-P9-NEXT:    xxmrghd vs2, vs2, vs3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P9-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs4
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs3
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 32(r4)
+; CHECK-BE-NEXT:    lxv vs1, 48(r4)
+; CHECK-BE-NEXT:    lxv vs2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs3, 16(r4)
+; CHECK-BE-NEXT:    xxswapd vs4, vs3
+; CHECK-BE-NEXT:    xxswapd vs5, vs2
+; CHECK-BE-NEXT:    xxswapd vs6, vs1
+; CHECK-BE-NEXT:    xxswapd vs7, vs0
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f4, f4
+; CHECK-BE-NEXT:    xscvsxdsp f5, f5
+; CHECK-BE-NEXT:    xscvsxdsp f6, f6
+; CHECK-BE-NEXT:    xscvsxdsp f7, f7
+; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-BE-NEXT:    xxmrghd vs3, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs1, vs7, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs3
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs1
+; CHECK-BE-NEXT:    vmrgew v2, v2, v3
+; CHECK-BE-NEXT:    vmrgew v3, v4, v5
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x i64>, <8 x i64>* %0, align 64
+  %1 = sitofp <8 x i64> %a to <8 x float>
+  store <8 x float> %1, <8 x float>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt_signed(<16 x float>* noalias nocapture sret %agg.result, <16 x i64>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r7, 64
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    li r6, 48
+; CHECK-P8-NEXT:    lxvd2x vs11, 0, r4
+; CHECK-P8-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    lxvd2x vs8, r4, r7
+; CHECK-P8-NEXT:    li r7, 80
+; CHECK-P8-NEXT:    lxvd2x vs6, r4, r5
+; CHECK-P8-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    lxvd2x vs7, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    li r7, 96
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r7
+; CHECK-P8-NEXT:    li r7, 112
+; CHECK-P8-NEXT:    xscvsxdsp f30, f11
+; CHECK-P8-NEXT:    xxswapd vs11, vs11
+; CHECK-P8-NEXT:    lxvd2x vs4, r4, r7
+; CHECK-P8-NEXT:    li r7, 16
+; CHECK-P8-NEXT:    xscvsxdsp f0, f6
+; CHECK-P8-NEXT:    xxswapd vs6, vs6
+; CHECK-P8-NEXT:    xscvsxdsp f1, f7
+; CHECK-P8-NEXT:    lxvd2x vs9, r4, r7
+; CHECK-P8-NEXT:    xxswapd vs7, vs7
+; CHECK-P8-NEXT:    xscvsxdsp f5, f8
+; CHECK-P8-NEXT:    xxswapd vs8, vs8
+; CHECK-P8-NEXT:    xscvsxdsp f10, f2
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvsxdsp f12, f3
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xscvsxdsp f13, f4
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xscvsxdsp f31, f9
+; CHECK-P8-NEXT:    xxswapd vs9, vs9
+; CHECK-P8-NEXT:    xscvsxdsp f6, f6
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xscvsxdsp f7, f7
+; CHECK-P8-NEXT:    xscvsxdsp f8, f8
+; CHECK-P8-NEXT:    xxmrghd vs5, vs10, vs5
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs10, vs13, vs12
+; CHECK-P8-NEXT:    xscvsxdsp f4, f4
+; CHECK-P8-NEXT:    xscvsxdsp f1, f9
+; CHECK-P8-NEXT:    xscvsxdsp f9, f11
+; CHECK-P8-NEXT:    xxmrghd vs11, vs31, vs30
+; CHECK-P8-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs7, vs6
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs8
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs5
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs10
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs11
+; CHECK-P8-NEXT:    xvcvdpsp v0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs1, vs9
+; CHECK-P8-NEXT:    xvcvdpsp v1, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v6, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v7, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v0, v2
+; CHECK-P8-NEXT:    vmrgew v3, v1, v3
+; CHECK-P8-NEXT:    vmrgew v4, v6, v4
+; CHECK-P8-NEXT:    vmrgew v5, v7, v5
+; CHECK-P8-NEXT:    stvx v2, r3, r7
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    stvx v4, r3, r6
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs4, 48(r4)
+; CHECK-P9-NEXT:    lxv vs5, 32(r4)
+; CHECK-P9-NEXT:    lxv vs6, 16(r4)
+; CHECK-P9-NEXT:    lxv vs7, 0(r4)
+; CHECK-P9-NEXT:    lxv vs8, 112(r4)
+; CHECK-P9-NEXT:    lxv vs9, 96(r4)
+; CHECK-P9-NEXT:    lxv vs10, 80(r4)
+; CHECK-P9-NEXT:    lxv vs11, 64(r4)
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    xxswapd vs0, vs7
+; CHECK-P9-NEXT:    xxswapd vs1, vs6
+; CHECK-P9-NEXT:    xxswapd vs2, vs5
+; CHECK-P9-NEXT:    xxswapd vs3, vs4
+; CHECK-P9-NEXT:    xxswapd vs12, vs11
+; CHECK-P9-NEXT:    xxswapd vs13, vs10
+; CHECK-P9-NEXT:    xxswapd vs31, vs9
+; CHECK-P9-NEXT:    xxswapd vs30, vs8
+; CHECK-P9-NEXT:    xscvsxdsp f7, f7
+; CHECK-P9-NEXT:    xscvsxdsp f6, f6
+; CHECK-P9-NEXT:    xscvsxdsp f5, f5
+; CHECK-P9-NEXT:    xscvsxdsp f4, f4
+; CHECK-P9-NEXT:    xscvsxdsp f11, f11
+; CHECK-P9-NEXT:    xscvsxdsp f10, f10
+; CHECK-P9-NEXT:    xscvsxdsp f9, f9
+; CHECK-P9-NEXT:    xscvsxdsp f8, f8
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xscvsxdsp f12, f12
+; CHECK-P9-NEXT:    xscvsxdsp f13, f13
+; CHECK-P9-NEXT:    xscvsxdsp f31, f31
+; CHECK-P9-NEXT:    xscvsxdsp f30, f30
+; CHECK-P9-NEXT:    xxmrghd vs6, vs6, vs7
+; CHECK-P9-NEXT:    xxmrghd vs4, vs4, vs5
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs10, vs11
+; CHECK-P9-NEXT:    xxmrghd vs3, vs8, vs9
+; CHECK-P9-NEXT:    xxmrghd vs5, vs13, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs4
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v0, vs5
+; CHECK-P9-NEXT:    xvcvdpsp v1, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v6, vs7
+; CHECK-P9-NEXT:    xvcvdpsp v7, vs3
+; CHECK-P9-NEXT:    vmrgew v2, v2, v4
+; CHECK-P9-NEXT:    vmrgew v3, v3, v5
+; CHECK-P9-NEXT:    vmrgew v4, v1, v0
+; CHECK-P9-NEXT:    vmrgew v5, v7, v6
+; CHECK-P9-NEXT:    stxv v4, 32(r3)
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    stxv v5, 48(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs2, 32(r4)
+; CHECK-BE-NEXT:    lxv vs3, 48(r4)
+; CHECK-BE-NEXT:    lxv vs4, 0(r4)
+; CHECK-BE-NEXT:    lxv vs5, 16(r4)
+; CHECK-BE-NEXT:    lxv vs6, 96(r4)
+; CHECK-BE-NEXT:    lxv vs7, 112(r4)
+; CHECK-BE-NEXT:    lxv vs8, 64(r4)
+; CHECK-BE-NEXT:    lxv vs9, 80(r4)
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    xxswapd vs0, vs5
+; CHECK-BE-NEXT:    xxswapd vs1, vs4
+; CHECK-BE-NEXT:    xxswapd vs10, vs3
+; CHECK-BE-NEXT:    xxswapd vs11, vs2
+; CHECK-BE-NEXT:    xxswapd vs12, vs9
+; CHECK-BE-NEXT:    xxswapd vs13, vs8
+; CHECK-BE-NEXT:    xxswapd vs31, vs7
+; CHECK-BE-NEXT:    xxswapd vs30, vs6
+; CHECK-BE-NEXT:    xscvsxdsp f5, f5
+; CHECK-BE-NEXT:    xscvsxdsp f4, f4
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f9, f9
+; CHECK-BE-NEXT:    xscvsxdsp f8, f8
+; CHECK-BE-NEXT:    xscvsxdsp f7, f7
+; CHECK-BE-NEXT:    xscvsxdsp f6, f6
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f10, f10
+; CHECK-BE-NEXT:    xscvsxdsp f11, f11
+; CHECK-BE-NEXT:    xscvsxdsp f12, f12
+; CHECK-BE-NEXT:    xscvsxdsp f13, f13
+; CHECK-BE-NEXT:    xscvsxdsp f31, f31
+; CHECK-BE-NEXT:    xscvsxdsp f30, f30
+; CHECK-BE-NEXT:    xxmrghd vs4, vs4, vs5
+; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs3
+; CHECK-BE-NEXT:    xxmrghd vs3, vs8, vs9
+; CHECK-BE-NEXT:    xxmrghd vs5, vs6, vs7
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs11, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs4
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v0, vs3
+; CHECK-BE-NEXT:    xvcvdpsp v6, vs5
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v1, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v7, vs7
+; CHECK-BE-NEXT:    vmrgew v2, v2, v4
+; CHECK-BE-NEXT:    vmrgew v3, v3, v5
+; CHECK-BE-NEXT:    vmrgew v4, v0, v1
+; CHECK-BE-NEXT:    vmrgew v5, v6, v7
+; CHECK-BE-NEXT:    stxv v5, 48(r3)
+; CHECK-BE-NEXT:    stxv v4, 32(r3)
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i64>, <16 x i64>* %0, align 128
+  %1 = sitofp <16 x i64> %a to <16 x float>
+  store <16 x float> %1, <16 x float>* %agg.result, align 64
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll b/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9d44a88c6a23e0e70d488cc62ebc1ccacf9838e6
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
@@ -0,0 +1,1382 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i64 @test2elt(i16 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 56
+; CHECK-P8-NEXT:    rldicl r3, r3, 56, 56
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r4
+; CHECK-P8-NEXT:    mtvsrwz f1, r3
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xscvdpspn vs0, f0
+; CHECK-P8-NEXT:    xscvdpspn vs1, f1
+; CHECK-P8-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P8-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 1
+; CHECK-P9-NEXT:    vextubrx r3, r3, v2
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r3
+; CHECK-P9-NEXT:    mtvsrwz f1, r4
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvdpspn vs0, f0
+; CHECK-P9-NEXT:    xscvdpspn vs1, f1
+; CHECK-P9-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P9-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    li r3, 1
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    vextublx r3, r3, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r3
+; CHECK-BE-NEXT:    mtvsrwz f1, r4
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvdpspn v2, f0
+; CHECK-BE-NEXT:    xscvdpspn v3, f1
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i16 %a.coerce to <2 x i8>
+  %1 = uitofp <2 x i8> %0 to <2 x float>
+  %2 = bitcast <2 x float> %1 to i64
+  ret i64 %2
+}
+
+define <4 x float> @test4elt(i32 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 56
+; CHECK-P8-NEXT:    rldicl r5, r3, 48, 56
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r4
+; CHECK-P8-NEXT:    rldicl r4, r3, 56, 56
+; CHECK-P8-NEXT:    rldicl r3, r3, 40, 56
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f1, r5
+; CHECK-P8-NEXT:    mtvsrwz f2, r4
+; CHECK-P8-NEXT:    mtvsrwz f3, r3
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 2
+; CHECK-P9-NEXT:    li r5, 1
+; CHECK-P9-NEXT:    li r6, 3
+; CHECK-P9-NEXT:    vextubrx r3, r3, v2
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r3
+; CHECK-P9-NEXT:    mtvsrwz f1, r4
+; CHECK-P9-NEXT:    mtvsrwz f2, r5
+; CHECK-P9-NEXT:    mtvsrwz f3, r6
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    li r3, 3
+; CHECK-BE-NEXT:    li r4, 1
+; CHECK-BE-NEXT:    li r5, 2
+; CHECK-BE-NEXT:    li r6, 0
+; CHECK-BE-NEXT:    vextublx r3, r3, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r3
+; CHECK-BE-NEXT:    mtvsrwz f1, r4
+; CHECK-BE-NEXT:    mtvsrwz f2, r5
+; CHECK-BE-NEXT:    mtvsrwz f3, r6
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i32 %a.coerce to <4 x i8>
+  %1 = uitofp <4 x i8> %0 to <4 x float>
+  ret <4 x float> %1
+}
+
+define void @test8elt(<8 x float>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    mfvsrd r4, f0
+; CHECK-P8-NEXT:    clrldi r6, r4, 56
+; CHECK-P8-NEXT:    rldicl r7, r4, 48, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 56, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f1, r7
+; CHECK-P8-NEXT:    rldicl r7, r4, 40, 56
+; CHECK-P8-NEXT:    mtvsrwz f2, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 32, 56
+; CHECK-P8-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f3, r7
+; CHECK-P8-NEXT:    rldicl r7, r4, 16, 56
+; CHECK-P8-NEXT:    mtvsrwz f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 24, 56
+; CHECK-P8-NEXT:    rldicl r4, r4, 8, 56
+; CHECK-P8-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f5, r7
+; CHECK-P8-NEXT:    mtvsrwz f6, r6
+; CHECK-P8-NEXT:    mtvsrwz f7, r4
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xscvuxdsp f4, f4
+; CHECK-P8-NEXT:    xscvuxdsp f5, f5
+; CHECK-P8-NEXT:    xscvuxdsp f6, f6
+; CHECK-P8-NEXT:    xscvuxdsp f7, f7
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    stvx v2, 0, r3
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    li r5, 2
+; CHECK-P9-NEXT:    li r6, 1
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    li r8, 4
+; CHECK-P9-NEXT:    li r9, 6
+; CHECK-P9-NEXT:    li r10, 5
+; CHECK-P9-NEXT:    li r11, 7
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    vextubrx r8, r8, v2
+; CHECK-P9-NEXT:    vextubrx r9, r9, v2
+; CHECK-P9-NEXT:    vextubrx r10, r10, v2
+; CHECK-P9-NEXT:    vextubrx r11, r11, v2
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r9, r9, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r10, r10, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r11, r11, 0, 24, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r4
+; CHECK-P9-NEXT:    mtvsrwz f1, r5
+; CHECK-P9-NEXT:    mtvsrwz f2, r6
+; CHECK-P9-NEXT:    mtvsrwz f3, r7
+; CHECK-P9-NEXT:    mtvsrwz f4, r8
+; CHECK-P9-NEXT:    mtvsrwz f5, r9
+; CHECK-P9-NEXT:    mtvsrwz f6, r10
+; CHECK-P9-NEXT:    mtvsrwz f7, r11
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xscvuxdsp f4, f4
+; CHECK-P9-NEXT:    xscvuxdsp f5, f5
+; CHECK-P9-NEXT:    xscvuxdsp f6, f6
+; CHECK-P9-NEXT:    xscvuxdsp f7, f7
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r5, 3
+; CHECK-BE-NEXT:    mtvsrd v2, r4
+; CHECK-BE-NEXT:    li r4, 1
+; CHECK-BE-NEXT:    li r6, 2
+; CHECK-BE-NEXT:    li r7, 0
+; CHECK-BE-NEXT:    li r8, 7
+; CHECK-BE-NEXT:    li r9, 5
+; CHECK-BE-NEXT:    li r10, 6
+; CHECK-BE-NEXT:    li r11, 4
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    vextublx r8, r8, v2
+; CHECK-BE-NEXT:    vextublx r9, r9, v2
+; CHECK-BE-NEXT:    vextublx r10, r10, v2
+; CHECK-BE-NEXT:    vextublx r11, r11, v2
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r9, r9, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r10, r10, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r11, r11, 0, 24, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r5
+; CHECK-BE-NEXT:    mtvsrwz f1, r4
+; CHECK-BE-NEXT:    mtvsrwz f2, r6
+; CHECK-BE-NEXT:    mtvsrwz f3, r7
+; CHECK-BE-NEXT:    mtvsrwz f4, r8
+; CHECK-BE-NEXT:    mtvsrwz f5, r9
+; CHECK-BE-NEXT:    mtvsrwz f6, r10
+; CHECK-BE-NEXT:    mtvsrwz f7, r11
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xscvuxdsp f4, f4
+; CHECK-BE-NEXT:    xscvuxdsp f5, f5
+; CHECK-BE-NEXT:    xscvuxdsp f6, f6
+; CHECK-BE-NEXT:    xscvuxdsp f7, f7
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs3
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <8 x i8>
+  %1 = uitofp <8 x i8> %0 to <8 x float>
+  store <8 x float> %1, <8 x float>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt(<16 x float>* noalias nocapture sret %agg.result, <16 x i8> %a) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mfvsrd r4, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    clrldi r5, r4, 56
+; CHECK-P8-NEXT:    rldicl r6, r4, 48, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 40, 56
+; CHECK-P8-NEXT:    rldicl r7, r4, 56, 56
+; CHECK-P8-NEXT:    mtvsrwz f1, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 32, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f4, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 16, 56
+; CHECK-P8-NEXT:    mtvsrwz f3, r7
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f5, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 24, 56
+; CHECK-P8-NEXT:    rldicl r4, r4, 8, 56
+; CHECK-P8-NEXT:    mfvsrd r7, f2
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f2, r5
+; CHECK-P8-NEXT:    rlwinm r5, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f6, r5
+; CHECK-P8-NEXT:    clrldi r5, r7, 56
+; CHECK-P8-NEXT:    mtvsrwz f7, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 48, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f8, r5
+; CHECK-P8-NEXT:    rldicl r5, r7, 56, 56
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f9, r4
+; CHECK-P8-NEXT:    rlwinm r4, r5, 0, 24, 31
+; CHECK-P8-NEXT:    rldicl r5, r7, 8, 56
+; CHECK-P8-NEXT:    mtvsrwz f10, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 40, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    mtvsrwz f11, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 32, 56
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xscvuxdsp f3, f3
+; CHECK-P8-NEXT:    xscvuxdsp f4, f4
+; CHECK-P8-NEXT:    mtvsrwz f12, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 16, 56
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    xscvuxdsp f5, f5
+; CHECK-P8-NEXT:    mtvsrwz f13, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 24, 56
+; CHECK-P8-NEXT:    xscvuxdsp f2, f2
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    mtvsrwz v2, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    mtvsrwz v3, r5
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    xscvuxdsp f6, f6
+; CHECK-P8-NEXT:    xscvuxdsp f7, f7
+; CHECK-P8-NEXT:    xscvuxdsp f8, f8
+; CHECK-P8-NEXT:    xscvuxdsp f9, f9
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs5
+; CHECK-P8-NEXT:    xscvuxdsp f10, f10
+; CHECK-P8-NEXT:    xscvuxdsp f11, f11
+; CHECK-P8-NEXT:    xscvuxdsp f12, f12
+; CHECK-P8-NEXT:    xscvuxdsp f13, f13
+; CHECK-P8-NEXT:    xxmrghd vs5, vs7, vs6
+; CHECK-P8-NEXT:    xscvuxdsp f1, v2
+; CHECK-P8-NEXT:    xscvuxdsp f4, v3
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs9, vs8
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs3
+; CHECK-P8-NEXT:    xxmrghd vs3, vs11, vs10
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P8-NEXT:    xxmrghd vs2, vs13, vs12
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs5
+; CHECK-P8-NEXT:    xvcvdpsp v0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs4, vs1
+; CHECK-P8-NEXT:    xvcvdpsp v1, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v6, vs2
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    xvcvdpsp v7, vs1
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    vmrgew v4, v1, v0
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    vmrgew v5, v7, v6
+; CHECK-P8-NEXT:    stvx v3, r3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    stvx v4, 0, r3
+; CHECK-P8-NEXT:    stvx v5, r3, r4
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r5, 2
+; CHECK-P9-NEXT:    li r6, 1
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    li r8, 4
+; CHECK-P9-NEXT:    li r9, 6
+; CHECK-P9-NEXT:    li r10, 5
+; CHECK-P9-NEXT:    li r11, 7
+; CHECK-P9-NEXT:    li r12, 8
+; CHECK-P9-NEXT:    li r0, 10
+; CHECK-P9-NEXT:    li r30, 9
+; CHECK-P9-NEXT:    li r29, 11
+; CHECK-P9-NEXT:    li r28, 12
+; CHECK-P9-NEXT:    li r27, 14
+; CHECK-P9-NEXT:    li r26, 13
+; CHECK-P9-NEXT:    li r25, 15
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    vextubrx r8, r8, v2
+; CHECK-P9-NEXT:    vextubrx r9, r9, v2
+; CHECK-P9-NEXT:    vextubrx r10, r10, v2
+; CHECK-P9-NEXT:    vextubrx r11, r11, v2
+; CHECK-P9-NEXT:    vextubrx r12, r12, v2
+; CHECK-P9-NEXT:    vextubrx r0, r0, v2
+; CHECK-P9-NEXT:    vextubrx r30, r30, v2
+; CHECK-P9-NEXT:    vextubrx r29, r29, v2
+; CHECK-P9-NEXT:    vextubrx r28, r28, v2
+; CHECK-P9-NEXT:    vextubrx r27, r27, v2
+; CHECK-P9-NEXT:    vextubrx r26, r26, v2
+; CHECK-P9-NEXT:    vextubrx r25, r25, v2
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r9, r9, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r10, r10, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r11, r11, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r12, r12, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r0, r0, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r30, r30, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r29, r29, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r28, r28, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r27, r27, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r26, r26, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r25, r25, 0, 24, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r4
+; CHECK-P9-NEXT:    mtvsrwz f1, r5
+; CHECK-P9-NEXT:    mtvsrwz f2, r6
+; CHECK-P9-NEXT:    mtvsrwz f3, r7
+; CHECK-P9-NEXT:    mtvsrwz f4, r8
+; CHECK-P9-NEXT:    mtvsrwz f5, r9
+; CHECK-P9-NEXT:    mtvsrwz f6, r10
+; CHECK-P9-NEXT:    mtvsrwz f7, r11
+; CHECK-P9-NEXT:    mtvsrwz f8, r12
+; CHECK-P9-NEXT:    mtvsrwz f9, r0
+; CHECK-P9-NEXT:    mtvsrwz f10, r30
+; CHECK-P9-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f11, r29
+; CHECK-P9-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f12, r28
+; CHECK-P9-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f13, r27
+; CHECK-P9-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz v2, r26
+; CHECK-P9-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz v3, r25
+; CHECK-P9-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xscvuxdsp f2, f2
+; CHECK-P9-NEXT:    xscvuxdsp f3, f3
+; CHECK-P9-NEXT:    xscvuxdsp f4, f4
+; CHECK-P9-NEXT:    xscvuxdsp f5, f5
+; CHECK-P9-NEXT:    xscvuxdsp f6, f6
+; CHECK-P9-NEXT:    xscvuxdsp f7, f7
+; CHECK-P9-NEXT:    xscvuxdsp f8, f8
+; CHECK-P9-NEXT:    xscvuxdsp f9, f9
+; CHECK-P9-NEXT:    xscvuxdsp f10, f10
+; CHECK-P9-NEXT:    xscvuxdsp f11, f11
+; CHECK-P9-NEXT:    xscvuxdsp f12, f12
+; CHECK-P9-NEXT:    xscvuxdsp f13, f13
+; CHECK-P9-NEXT:    xscvuxdsp f31, v2
+; CHECK-P9-NEXT:    xscvuxdsp f30, v3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-P9-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-P9-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P9-NEXT:    xvcvdpsp v0, vs4
+; CHECK-P9-NEXT:    xvcvdpsp v1, vs5
+; CHECK-P9-NEXT:    xvcvdpsp v6, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v7, vs7
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    vmrgew v4, v1, v0
+; CHECK-P9-NEXT:    vmrgew v5, v7, v6
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    stxv v5, 48(r3)
+; CHECK-P9-NEXT:    stxv v4, 32(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r4, 3
+; CHECK-BE-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r5, 1
+; CHECK-BE-NEXT:    li r6, 2
+; CHECK-BE-NEXT:    li r7, 0
+; CHECK-BE-NEXT:    li r8, 7
+; CHECK-BE-NEXT:    li r9, 5
+; CHECK-BE-NEXT:    li r10, 6
+; CHECK-BE-NEXT:    li r11, 4
+; CHECK-BE-NEXT:    li r12, 11
+; CHECK-BE-NEXT:    li r0, 9
+; CHECK-BE-NEXT:    li r30, 10
+; CHECK-BE-NEXT:    li r29, 8
+; CHECK-BE-NEXT:    li r28, 15
+; CHECK-BE-NEXT:    li r27, 13
+; CHECK-BE-NEXT:    li r26, 14
+; CHECK-BE-NEXT:    li r25, 12
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    vextublx r8, r8, v2
+; CHECK-BE-NEXT:    vextublx r9, r9, v2
+; CHECK-BE-NEXT:    vextublx r10, r10, v2
+; CHECK-BE-NEXT:    vextublx r11, r11, v2
+; CHECK-BE-NEXT:    vextublx r12, r12, v2
+; CHECK-BE-NEXT:    vextublx r0, r0, v2
+; CHECK-BE-NEXT:    vextublx r30, r30, v2
+; CHECK-BE-NEXT:    vextublx r29, r29, v2
+; CHECK-BE-NEXT:    vextublx r28, r28, v2
+; CHECK-BE-NEXT:    vextublx r27, r27, v2
+; CHECK-BE-NEXT:    vextublx r26, r26, v2
+; CHECK-BE-NEXT:    vextublx r25, r25, v2
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r9, r9, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r10, r10, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r11, r11, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r12, r12, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r0, r0, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r30, r30, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r29, r29, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r28, r28, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r27, r27, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r26, r26, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r25, r25, 0, 24, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r4
+; CHECK-BE-NEXT:    mtvsrwz f1, r5
+; CHECK-BE-NEXT:    mtvsrwz f2, r6
+; CHECK-BE-NEXT:    mtvsrwz f3, r7
+; CHECK-BE-NEXT:    mtvsrwz f4, r8
+; CHECK-BE-NEXT:    mtvsrwz f5, r9
+; CHECK-BE-NEXT:    mtvsrwz f6, r10
+; CHECK-BE-NEXT:    mtvsrwz f7, r11
+; CHECK-BE-NEXT:    mtvsrwz f8, r12
+; CHECK-BE-NEXT:    mtvsrwz f9, r0
+; CHECK-BE-NEXT:    mtvsrwz f10, r30
+; CHECK-BE-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f11, r29
+; CHECK-BE-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f12, r28
+; CHECK-BE-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f13, r27
+; CHECK-BE-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz v2, r26
+; CHECK-BE-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz v3, r25
+; CHECK-BE-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xscvuxdsp f0, f0
+; CHECK-BE-NEXT:    xscvuxdsp f1, f1
+; CHECK-BE-NEXT:    xscvuxdsp f2, f2
+; CHECK-BE-NEXT:    xscvuxdsp f3, f3
+; CHECK-BE-NEXT:    xscvuxdsp f4, f4
+; CHECK-BE-NEXT:    xscvuxdsp f5, f5
+; CHECK-BE-NEXT:    xscvuxdsp f6, f6
+; CHECK-BE-NEXT:    xscvuxdsp f7, f7
+; CHECK-BE-NEXT:    xscvuxdsp f8, f8
+; CHECK-BE-NEXT:    xscvuxdsp f9, f9
+; CHECK-BE-NEXT:    xscvuxdsp f10, f10
+; CHECK-BE-NEXT:    xscvuxdsp f11, f11
+; CHECK-BE-NEXT:    xscvuxdsp f12, f12
+; CHECK-BE-NEXT:    xscvuxdsp f13, f13
+; CHECK-BE-NEXT:    xscvuxdsp f31, v2
+; CHECK-BE-NEXT:    xscvuxdsp f30, v3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-BE-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs3
+; CHECK-BE-NEXT:    xvcvdpsp v0, vs4
+; CHECK-BE-NEXT:    xvcvdpsp v1, vs5
+; CHECK-BE-NEXT:    xvcvdpsp v6, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v7, vs7
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    vmrgew v4, v1, v0
+; CHECK-BE-NEXT:    vmrgew v5, v7, v6
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    stxv v5, 48(r3)
+; CHECK-BE-NEXT:    stxv v4, 32(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = uitofp <16 x i8> %a to <16 x float>
+  store <16 x float> %0, <16 x float>* %agg.result, align 64
+  ret void
+}
+
+define i64 @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 56
+; CHECK-P8-NEXT:    rldicl r3, r3, 56, 56
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    extsb r3, r3
+; CHECK-P8-NEXT:    mtvsrwa f0, r4
+; CHECK-P8-NEXT:    mtvsrwa f1, r3
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xscvdpspn vs0, f0
+; CHECK-P8-NEXT:    xscvdpspn vs1, f1
+; CHECK-P8-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P8-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P8-NEXT:    vmrglw v2, v3, v2
+; CHECK-P8-NEXT:    xxswapd vs0, v2
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 1
+; CHECK-P9-NEXT:    vextubrx r3, r3, v2
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    extsb r3, r3
+; CHECK-P9-NEXT:    extsb r4, r4
+; CHECK-P9-NEXT:    mtvsrwa f0, r3
+; CHECK-P9-NEXT:    mtvsrwa f1, r4
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvdpspn vs0, f0
+; CHECK-P9-NEXT:    xscvdpspn vs1, f1
+; CHECK-P9-NEXT:    xxsldwi v2, vs0, vs0, 1
+; CHECK-P9-NEXT:    xxsldwi v3, vs1, vs1, 1
+; CHECK-P9-NEXT:    vmrglw v2, v3, v2
+; CHECK-P9-NEXT:    mfvsrld r3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    li r3, 1
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    vextublx r3, r3, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    extsb r3, r3
+; CHECK-BE-NEXT:    extsb r4, r4
+; CHECK-BE-NEXT:    mtvsrwa f0, r3
+; CHECK-BE-NEXT:    mtvsrwa f1, r4
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvdpspn v2, f0
+; CHECK-BE-NEXT:    xscvdpspn v3, f1
+; CHECK-BE-NEXT:    vmrghw v2, v3, v2
+; CHECK-BE-NEXT:    mfvsrd r3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i16 %a.coerce to <2 x i8>
+  %1 = sitofp <2 x i8> %0 to <2 x float>
+  %2 = bitcast <2 x float> %1 to i64
+  ret i64 %2
+}
+
+define <4 x float> @test4elt_signed(i32 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 56
+; CHECK-P8-NEXT:    rldicl r5, r3, 48, 56
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f0, r4
+; CHECK-P8-NEXT:    rldicl r4, r3, 56, 56
+; CHECK-P8-NEXT:    rldicl r3, r3, 40, 56
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    extsb r3, r3
+; CHECK-P8-NEXT:    mtvsrwa f1, r5
+; CHECK-P8-NEXT:    mtvsrwa f2, r4
+; CHECK-P8-NEXT:    mtvsrwa f3, r3
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 2
+; CHECK-P9-NEXT:    li r5, 1
+; CHECK-P9-NEXT:    li r6, 3
+; CHECK-P9-NEXT:    vextubrx r3, r3, v2
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    extsb r3, r3
+; CHECK-P9-NEXT:    extsb r4, r4
+; CHECK-P9-NEXT:    extsb r5, r5
+; CHECK-P9-NEXT:    extsb r6, r6
+; CHECK-P9-NEXT:    mtvsrwa f0, r3
+; CHECK-P9-NEXT:    mtvsrwa f1, r4
+; CHECK-P9-NEXT:    mtvsrwa f2, r5
+; CHECK-P9-NEXT:    mtvsrwa f3, r6
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    li r3, 3
+; CHECK-BE-NEXT:    li r4, 1
+; CHECK-BE-NEXT:    li r5, 2
+; CHECK-BE-NEXT:    li r6, 0
+; CHECK-BE-NEXT:    vextublx r3, r3, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    extsb r3, r3
+; CHECK-BE-NEXT:    extsb r4, r4
+; CHECK-BE-NEXT:    extsb r5, r5
+; CHECK-BE-NEXT:    extsb r6, r6
+; CHECK-BE-NEXT:    mtvsrwa f0, r3
+; CHECK-BE-NEXT:    mtvsrwa f1, r4
+; CHECK-BE-NEXT:    mtvsrwa f2, r5
+; CHECK-BE-NEXT:    mtvsrwa f3, r6
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i32 %a.coerce to <4 x i8>
+  %1 = sitofp <4 x i8> %0 to <4 x float>
+  ret <4 x float> %1
+}
+
+define void @test8elt_signed(<8 x float>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    mfvsrd r4, f0
+; CHECK-P8-NEXT:    clrldi r6, r4, 56
+; CHECK-P8-NEXT:    rldicl r7, r4, 48, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    extsb r7, r7
+; CHECK-P8-NEXT:    mtvsrwa f0, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 56, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f1, r7
+; CHECK-P8-NEXT:    rldicl r7, r4, 40, 56
+; CHECK-P8-NEXT:    mtvsrwa f2, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 32, 56
+; CHECK-P8-NEXT:    extsb r7, r7
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f3, r7
+; CHECK-P8-NEXT:    rldicl r7, r4, 16, 56
+; CHECK-P8-NEXT:    mtvsrwa f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 24, 56
+; CHECK-P8-NEXT:    rldicl r4, r4, 8, 56
+; CHECK-P8-NEXT:    extsb r7, r7
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f5, r7
+; CHECK-P8-NEXT:    mtvsrwa f6, r6
+; CHECK-P8-NEXT:    mtvsrwa f7, r4
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xscvsxdsp f4, f4
+; CHECK-P8-NEXT:    xscvsxdsp f5, f5
+; CHECK-P8-NEXT:    xscvsxdsp f6, f6
+; CHECK-P8-NEXT:    xscvsxdsp f7, f7
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P8-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    stvx v2, 0, r3
+; CHECK-P8-NEXT:    stvx v3, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    li r5, 2
+; CHECK-P9-NEXT:    li r6, 1
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    li r8, 4
+; CHECK-P9-NEXT:    li r9, 6
+; CHECK-P9-NEXT:    li r10, 5
+; CHECK-P9-NEXT:    li r11, 7
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    vextubrx r8, r8, v2
+; CHECK-P9-NEXT:    vextubrx r9, r9, v2
+; CHECK-P9-NEXT:    vextubrx r10, r10, v2
+; CHECK-P9-NEXT:    vextubrx r11, r11, v2
+; CHECK-P9-NEXT:    extsb r4, r4
+; CHECK-P9-NEXT:    extsb r5, r5
+; CHECK-P9-NEXT:    extsb r6, r6
+; CHECK-P9-NEXT:    extsb r7, r7
+; CHECK-P9-NEXT:    extsb r8, r8
+; CHECK-P9-NEXT:    extsb r9, r9
+; CHECK-P9-NEXT:    extsb r10, r10
+; CHECK-P9-NEXT:    extsb r11, r11
+; CHECK-P9-NEXT:    mtvsrwa f0, r4
+; CHECK-P9-NEXT:    mtvsrwa f1, r5
+; CHECK-P9-NEXT:    mtvsrwa f2, r6
+; CHECK-P9-NEXT:    mtvsrwa f3, r7
+; CHECK-P9-NEXT:    mtvsrwa f4, r8
+; CHECK-P9-NEXT:    mtvsrwa f5, r9
+; CHECK-P9-NEXT:    mtvsrwa f6, r10
+; CHECK-P9-NEXT:    mtvsrwa f7, r11
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xscvsxdsp f4, f4
+; CHECK-P9-NEXT:    xscvsxdsp f5, f5
+; CHECK-P9-NEXT:    xscvsxdsp f6, f6
+; CHECK-P9-NEXT:    xscvsxdsp f7, f7
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r5, 3
+; CHECK-BE-NEXT:    mtvsrd v2, r4
+; CHECK-BE-NEXT:    li r4, 1
+; CHECK-BE-NEXT:    li r6, 2
+; CHECK-BE-NEXT:    li r7, 0
+; CHECK-BE-NEXT:    li r8, 7
+; CHECK-BE-NEXT:    li r9, 5
+; CHECK-BE-NEXT:    li r10, 6
+; CHECK-BE-NEXT:    li r11, 4
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    vextublx r8, r8, v2
+; CHECK-BE-NEXT:    vextublx r9, r9, v2
+; CHECK-BE-NEXT:    vextublx r10, r10, v2
+; CHECK-BE-NEXT:    vextublx r11, r11, v2
+; CHECK-BE-NEXT:    extsb r5, r5
+; CHECK-BE-NEXT:    extsb r4, r4
+; CHECK-BE-NEXT:    extsb r6, r6
+; CHECK-BE-NEXT:    extsb r7, r7
+; CHECK-BE-NEXT:    extsb r8, r8
+; CHECK-BE-NEXT:    extsb r9, r9
+; CHECK-BE-NEXT:    extsb r10, r10
+; CHECK-BE-NEXT:    extsb r11, r11
+; CHECK-BE-NEXT:    mtvsrwa f0, r5
+; CHECK-BE-NEXT:    mtvsrwa f1, r4
+; CHECK-BE-NEXT:    mtvsrwa f2, r6
+; CHECK-BE-NEXT:    mtvsrwa f3, r7
+; CHECK-BE-NEXT:    mtvsrwa f4, r8
+; CHECK-BE-NEXT:    mtvsrwa f5, r9
+; CHECK-BE-NEXT:    mtvsrwa f6, r10
+; CHECK-BE-NEXT:    mtvsrwa f7, r11
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xscvsxdsp f4, f4
+; CHECK-BE-NEXT:    xscvsxdsp f5, f5
+; CHECK-BE-NEXT:    xscvsxdsp f6, f6
+; CHECK-BE-NEXT:    xscvsxdsp f7, f7
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs3
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <8 x i8>
+  %1 = sitofp <8 x i8> %0 to <8 x float>
+  store <8 x float> %1, <8 x float>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt_signed(<16 x float>* noalias nocapture sret %agg.result, <16 x i8> %a) local_unnamed_addr #3 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mfvsrd r4, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    clrldi r5, r4, 56
+; CHECK-P8-NEXT:    rldicl r6, r4, 48, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f0, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 40, 56
+; CHECK-P8-NEXT:    rldicl r7, r4, 56, 56
+; CHECK-P8-NEXT:    mtvsrwa f1, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 32, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    extsb r7, r7
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f4, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 16, 56
+; CHECK-P8-NEXT:    mtvsrwa f3, r7
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f5, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 24, 56
+; CHECK-P8-NEXT:    rldicl r4, r4, 8, 56
+; CHECK-P8-NEXT:    mfvsrd r7, f2
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f2, r5
+; CHECK-P8-NEXT:    extsb r5, r6
+; CHECK-P8-NEXT:    mtvsrwa f6, r5
+; CHECK-P8-NEXT:    clrldi r5, r7, 56
+; CHECK-P8-NEXT:    mtvsrwa f7, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 48, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f8, r5
+; CHECK-P8-NEXT:    rldicl r5, r7, 56, 56
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f9, r4
+; CHECK-P8-NEXT:    extsb r4, r5
+; CHECK-P8-NEXT:    rldicl r5, r7, 8, 56
+; CHECK-P8-NEXT:    mtvsrwa f10, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 40, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    xscvsxdsp f0, f0
+; CHECK-P8-NEXT:    mtvsrwa f11, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 32, 56
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    xscvsxdsp f1, f1
+; CHECK-P8-NEXT:    xscvsxdsp f3, f3
+; CHECK-P8-NEXT:    xscvsxdsp f4, f4
+; CHECK-P8-NEXT:    mtvsrwa f12, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 16, 56
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    xscvsxdsp f5, f5
+; CHECK-P8-NEXT:    mtvsrwa f13, r4
+; CHECK-P8-NEXT:    rldicl r4, r7, 24, 56
+; CHECK-P8-NEXT:    xscvsxdsp f2, f2
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    mtvsrwa v2, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    mtvsrwa v3, r5
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    xscvsxdsp f6, f6
+; CHECK-P8-NEXT:    xscvsxdsp f7, f7
+; CHECK-P8-NEXT:    xscvsxdsp f8, f8
+; CHECK-P8-NEXT:    xscvsxdsp f9, f9
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs5
+; CHECK-P8-NEXT:    xscvsxdsp f10, f10
+; CHECK-P8-NEXT:    xscvsxdsp f11, f11
+; CHECK-P8-NEXT:    xscvsxdsp f12, f12
+; CHECK-P8-NEXT:    xscvsxdsp f13, f13
+; CHECK-P8-NEXT:    xxmrghd vs5, vs7, vs6
+; CHECK-P8-NEXT:    xscvsxdsp f1, v2
+; CHECK-P8-NEXT:    xscvsxdsp f4, v3
+; CHECK-P8-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P8-NEXT:    xxmrghd vs0, vs9, vs8
+; CHECK-P8-NEXT:    xvcvdpsp v3, vs3
+; CHECK-P8-NEXT:    xxmrghd vs3, vs11, vs10
+; CHECK-P8-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P8-NEXT:    xxmrghd vs2, vs13, vs12
+; CHECK-P8-NEXT:    xvcvdpsp v5, vs5
+; CHECK-P8-NEXT:    xvcvdpsp v0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs4, vs1
+; CHECK-P8-NEXT:    xvcvdpsp v1, vs3
+; CHECK-P8-NEXT:    xvcvdpsp v6, vs2
+; CHECK-P8-NEXT:    vmrgew v2, v3, v2
+; CHECK-P8-NEXT:    xvcvdpsp v7, vs1
+; CHECK-P8-NEXT:    vmrgew v3, v5, v4
+; CHECK-P8-NEXT:    vmrgew v4, v1, v0
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    vmrgew v5, v7, v6
+; CHECK-P8-NEXT:    stvx v3, r3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    stvx v4, 0, r3
+; CHECK-P8-NEXT:    stvx v5, r3, r4
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r5, 2
+; CHECK-P9-NEXT:    li r6, 1
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    li r8, 4
+; CHECK-P9-NEXT:    li r9, 6
+; CHECK-P9-NEXT:    li r10, 5
+; CHECK-P9-NEXT:    li r11, 7
+; CHECK-P9-NEXT:    li r12, 8
+; CHECK-P9-NEXT:    li r0, 10
+; CHECK-P9-NEXT:    li r30, 9
+; CHECK-P9-NEXT:    li r29, 11
+; CHECK-P9-NEXT:    li r28, 12
+; CHECK-P9-NEXT:    li r27, 14
+; CHECK-P9-NEXT:    li r26, 13
+; CHECK-P9-NEXT:    li r25, 15
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    vextubrx r8, r8, v2
+; CHECK-P9-NEXT:    vextubrx r9, r9, v2
+; CHECK-P9-NEXT:    vextubrx r10, r10, v2
+; CHECK-P9-NEXT:    vextubrx r11, r11, v2
+; CHECK-P9-NEXT:    vextubrx r12, r12, v2
+; CHECK-P9-NEXT:    vextubrx r0, r0, v2
+; CHECK-P9-NEXT:    vextubrx r30, r30, v2
+; CHECK-P9-NEXT:    vextubrx r29, r29, v2
+; CHECK-P9-NEXT:    vextubrx r28, r28, v2
+; CHECK-P9-NEXT:    vextubrx r27, r27, v2
+; CHECK-P9-NEXT:    vextubrx r26, r26, v2
+; CHECK-P9-NEXT:    vextubrx r25, r25, v2
+; CHECK-P9-NEXT:    extsb r4, r4
+; CHECK-P9-NEXT:    extsb r5, r5
+; CHECK-P9-NEXT:    extsb r6, r6
+; CHECK-P9-NEXT:    extsb r7, r7
+; CHECK-P9-NEXT:    extsb r8, r8
+; CHECK-P9-NEXT:    extsb r9, r9
+; CHECK-P9-NEXT:    extsb r10, r10
+; CHECK-P9-NEXT:    extsb r11, r11
+; CHECK-P9-NEXT:    extsb r12, r12
+; CHECK-P9-NEXT:    extsb r0, r0
+; CHECK-P9-NEXT:    extsb r30, r30
+; CHECK-P9-NEXT:    extsb r29, r29
+; CHECK-P9-NEXT:    extsb r28, r28
+; CHECK-P9-NEXT:    extsb r27, r27
+; CHECK-P9-NEXT:    extsb r26, r26
+; CHECK-P9-NEXT:    extsb r25, r25
+; CHECK-P9-NEXT:    mtvsrwa f0, r4
+; CHECK-P9-NEXT:    mtvsrwa f1, r5
+; CHECK-P9-NEXT:    mtvsrwa f2, r6
+; CHECK-P9-NEXT:    mtvsrwa f3, r7
+; CHECK-P9-NEXT:    mtvsrwa f4, r8
+; CHECK-P9-NEXT:    mtvsrwa f5, r9
+; CHECK-P9-NEXT:    mtvsrwa f6, r10
+; CHECK-P9-NEXT:    mtvsrwa f7, r11
+; CHECK-P9-NEXT:    mtvsrwa f8, r12
+; CHECK-P9-NEXT:    mtvsrwa f9, r0
+; CHECK-P9-NEXT:    mtvsrwa f10, r30
+; CHECK-P9-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f11, r29
+; CHECK-P9-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f12, r28
+; CHECK-P9-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f13, r27
+; CHECK-P9-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa v2, r26
+; CHECK-P9-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa v3, r25
+; CHECK-P9-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xscvsxdsp f0, f0
+; CHECK-P9-NEXT:    xscvsxdsp f1, f1
+; CHECK-P9-NEXT:    xscvsxdsp f2, f2
+; CHECK-P9-NEXT:    xscvsxdsp f3, f3
+; CHECK-P9-NEXT:    xscvsxdsp f4, f4
+; CHECK-P9-NEXT:    xscvsxdsp f5, f5
+; CHECK-P9-NEXT:    xscvsxdsp f6, f6
+; CHECK-P9-NEXT:    xscvsxdsp f7, f7
+; CHECK-P9-NEXT:    xscvsxdsp f8, f8
+; CHECK-P9-NEXT:    xscvsxdsp f9, f9
+; CHECK-P9-NEXT:    xscvsxdsp f10, f10
+; CHECK-P9-NEXT:    xscvsxdsp f11, f11
+; CHECK-P9-NEXT:    xscvsxdsp f12, f12
+; CHECK-P9-NEXT:    xscvsxdsp f13, f13
+; CHECK-P9-NEXT:    xscvsxdsp f31, v2
+; CHECK-P9-NEXT:    xscvsxdsp f30, v3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-P9-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-P9-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xvcvdpsp v2, vs0
+; CHECK-P9-NEXT:    xvcvdpsp v3, vs1
+; CHECK-P9-NEXT:    xvcvdpsp v4, vs2
+; CHECK-P9-NEXT:    xvcvdpsp v5, vs3
+; CHECK-P9-NEXT:    xvcvdpsp v0, vs4
+; CHECK-P9-NEXT:    xvcvdpsp v1, vs5
+; CHECK-P9-NEXT:    xvcvdpsp v6, vs6
+; CHECK-P9-NEXT:    xvcvdpsp v7, vs7
+; CHECK-P9-NEXT:    vmrgew v2, v3, v2
+; CHECK-P9-NEXT:    vmrgew v3, v5, v4
+; CHECK-P9-NEXT:    vmrgew v4, v1, v0
+; CHECK-P9-NEXT:    vmrgew v5, v7, v6
+; CHECK-P9-NEXT:    stxv v3, 16(r3)
+; CHECK-P9-NEXT:    stxv v2, 0(r3)
+; CHECK-P9-NEXT:    stxv v5, 48(r3)
+; CHECK-P9-NEXT:    stxv v4, 32(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r4, 3
+; CHECK-BE-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r5, 1
+; CHECK-BE-NEXT:    li r6, 2
+; CHECK-BE-NEXT:    li r7, 0
+; CHECK-BE-NEXT:    li r8, 7
+; CHECK-BE-NEXT:    li r9, 5
+; CHECK-BE-NEXT:    li r10, 6
+; CHECK-BE-NEXT:    li r11, 4
+; CHECK-BE-NEXT:    li r12, 11
+; CHECK-BE-NEXT:    li r0, 9
+; CHECK-BE-NEXT:    li r30, 10
+; CHECK-BE-NEXT:    li r29, 8
+; CHECK-BE-NEXT:    li r28, 15
+; CHECK-BE-NEXT:    li r27, 13
+; CHECK-BE-NEXT:    li r26, 14
+; CHECK-BE-NEXT:    li r25, 12
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    vextublx r8, r8, v2
+; CHECK-BE-NEXT:    vextublx r9, r9, v2
+; CHECK-BE-NEXT:    vextublx r10, r10, v2
+; CHECK-BE-NEXT:    vextublx r11, r11, v2
+; CHECK-BE-NEXT:    vextublx r12, r12, v2
+; CHECK-BE-NEXT:    vextublx r0, r0, v2
+; CHECK-BE-NEXT:    vextublx r30, r30, v2
+; CHECK-BE-NEXT:    vextublx r29, r29, v2
+; CHECK-BE-NEXT:    vextublx r28, r28, v2
+; CHECK-BE-NEXT:    vextublx r27, r27, v2
+; CHECK-BE-NEXT:    vextublx r26, r26, v2
+; CHECK-BE-NEXT:    vextublx r25, r25, v2
+; CHECK-BE-NEXT:    extsb r4, r4
+; CHECK-BE-NEXT:    extsb r5, r5
+; CHECK-BE-NEXT:    extsb r6, r6
+; CHECK-BE-NEXT:    extsb r7, r7
+; CHECK-BE-NEXT:    extsb r8, r8
+; CHECK-BE-NEXT:    extsb r9, r9
+; CHECK-BE-NEXT:    extsb r10, r10
+; CHECK-BE-NEXT:    extsb r11, r11
+; CHECK-BE-NEXT:    extsb r12, r12
+; CHECK-BE-NEXT:    extsb r0, r0
+; CHECK-BE-NEXT:    extsb r30, r30
+; CHECK-BE-NEXT:    extsb r29, r29
+; CHECK-BE-NEXT:    extsb r28, r28
+; CHECK-BE-NEXT:    extsb r27, r27
+; CHECK-BE-NEXT:    extsb r26, r26
+; CHECK-BE-NEXT:    extsb r25, r25
+; CHECK-BE-NEXT:    mtvsrwa f0, r4
+; CHECK-BE-NEXT:    mtvsrwa f1, r5
+; CHECK-BE-NEXT:    mtvsrwa f2, r6
+; CHECK-BE-NEXT:    mtvsrwa f3, r7
+; CHECK-BE-NEXT:    mtvsrwa f4, r8
+; CHECK-BE-NEXT:    mtvsrwa f5, r9
+; CHECK-BE-NEXT:    mtvsrwa f6, r10
+; CHECK-BE-NEXT:    mtvsrwa f7, r11
+; CHECK-BE-NEXT:    mtvsrwa f8, r12
+; CHECK-BE-NEXT:    mtvsrwa f9, r0
+; CHECK-BE-NEXT:    mtvsrwa f10, r30
+; CHECK-BE-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f11, r29
+; CHECK-BE-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f12, r28
+; CHECK-BE-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f13, r27
+; CHECK-BE-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa v2, r26
+; CHECK-BE-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa v3, r25
+; CHECK-BE-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xscvsxdsp f0, f0
+; CHECK-BE-NEXT:    xscvsxdsp f1, f1
+; CHECK-BE-NEXT:    xscvsxdsp f2, f2
+; CHECK-BE-NEXT:    xscvsxdsp f3, f3
+; CHECK-BE-NEXT:    xscvsxdsp f4, f4
+; CHECK-BE-NEXT:    xscvsxdsp f5, f5
+; CHECK-BE-NEXT:    xscvsxdsp f6, f6
+; CHECK-BE-NEXT:    xscvsxdsp f7, f7
+; CHECK-BE-NEXT:    xscvsxdsp f8, f8
+; CHECK-BE-NEXT:    xscvsxdsp f9, f9
+; CHECK-BE-NEXT:    xscvsxdsp f10, f10
+; CHECK-BE-NEXT:    xscvsxdsp f11, f11
+; CHECK-BE-NEXT:    xscvsxdsp f12, f12
+; CHECK-BE-NEXT:    xscvsxdsp f13, f13
+; CHECK-BE-NEXT:    xscvsxdsp f31, v2
+; CHECK-BE-NEXT:    xscvsxdsp f30, v3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-BE-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xvcvdpsp v2, vs0
+; CHECK-BE-NEXT:    xvcvdpsp v3, vs1
+; CHECK-BE-NEXT:    xvcvdpsp v4, vs2
+; CHECK-BE-NEXT:    xvcvdpsp v5, vs3
+; CHECK-BE-NEXT:    xvcvdpsp v0, vs4
+; CHECK-BE-NEXT:    xvcvdpsp v1, vs5
+; CHECK-BE-NEXT:    xvcvdpsp v6, vs6
+; CHECK-BE-NEXT:    xvcvdpsp v7, vs7
+; CHECK-BE-NEXT:    vmrgew v2, v3, v2
+; CHECK-BE-NEXT:    vmrgew v3, v5, v4
+; CHECK-BE-NEXT:    vmrgew v4, v1, v0
+; CHECK-BE-NEXT:    vmrgew v5, v7, v6
+; CHECK-BE-NEXT:    stxv v3, 16(r3)
+; CHECK-BE-NEXT:    stxv v2, 0(r3)
+; CHECK-BE-NEXT:    stxv v5, 48(r3)
+; CHECK-BE-NEXT:    stxv v4, 32(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = sitofp <16 x i8> %a to <16 x float>
+  store <16 x float> %0, <16 x float>* %agg.result, align 64
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll b/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bafc18bece2140fdd882256c8117d23d94f9d499
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
@@ -0,0 +1,1322 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define <2 x double> @test2elt(i16 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 56
+; CHECK-P8-NEXT:    rldicl r3, r3, 56, 56
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r4
+; CHECK-P8-NEXT:    mtvsrwz f1, r3
+; CHECK-P8-NEXT:    xscvuxddp f0, f0
+; CHECK-P8-NEXT:    xscvuxddp f1, f1
+; CHECK-P8-NEXT:    xxmrghd v2, vs1, vs0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 1
+; CHECK-P9-NEXT:    vextubrx r3, r3, v2
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r3
+; CHECK-P9-NEXT:    mtvsrwz f1, r4
+; CHECK-P9-NEXT:    xscvuxddp f0, f0
+; CHECK-P9-NEXT:    xscvuxddp f1, f1
+; CHECK-P9-NEXT:    xxmrghd v2, vs1, vs0
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    li r3, 1
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    vextublx r3, r3, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    rlwinm r3, r3, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r3
+; CHECK-BE-NEXT:    mtvsrwz f1, r4
+; CHECK-BE-NEXT:    xscvuxddp f0, f0
+; CHECK-BE-NEXT:    xscvuxddp f1, f1
+; CHECK-BE-NEXT:    xxmrghd v2, vs1, vs0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i16 %a.coerce to <2 x i8>
+  %1 = uitofp <2 x i8> %0 to <2 x double>
+  ret <2 x double> %1
+}
+
+define void @test4elt(<4 x double>* noalias nocapture sret %agg.result, i32 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    mfvsrd r4, f0
+; CHECK-P8-NEXT:    clrldi r5, r4, 56
+; CHECK-P8-NEXT:    rldicl r6, r4, 56, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 48, 56
+; CHECK-P8-NEXT:    rldicl r4, r4, 40, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f1, r6
+; CHECK-P8-NEXT:    mtvsrwz f2, r5
+; CHECK-P8-NEXT:    mtvsrwz f3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xscvuxddp f0, f0
+; CHECK-P8-NEXT:    xscvuxddp f1, f1
+; CHECK-P8-NEXT:    xscvuxddp f2, f2
+; CHECK-P8-NEXT:    xscvuxddp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r4
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    li r5, 1
+; CHECK-P9-NEXT:    li r6, 2
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r4
+; CHECK-P9-NEXT:    mtvsrwz f1, r5
+; CHECK-P9-NEXT:    mtvsrwz f2, r6
+; CHECK-P9-NEXT:    mtvsrwz f3, r7
+; CHECK-P9-NEXT:    xscvuxddp f0, f0
+; CHECK-P9-NEXT:    xscvuxddp f1, f1
+; CHECK-P9-NEXT:    xscvuxddp f2, f2
+; CHECK-P9-NEXT:    xscvuxddp f3, f3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r4
+; CHECK-BE-NEXT:    li r4, 1
+; CHECK-BE-NEXT:    li r5, 0
+; CHECK-BE-NEXT:    li r6, 3
+; CHECK-BE-NEXT:    li r7, 2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r4
+; CHECK-BE-NEXT:    mtvsrwz f1, r5
+; CHECK-BE-NEXT:    mtvsrwz f2, r6
+; CHECK-BE-NEXT:    mtvsrwz f3, r7
+; CHECK-BE-NEXT:    xscvuxddp f0, f0
+; CHECK-BE-NEXT:    xscvuxddp f1, f1
+; CHECK-BE-NEXT:    xscvuxddp f2, f2
+; CHECK-BE-NEXT:    xscvuxddp f3, f3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i32 %a.coerce to <4 x i8>
+  %1 = uitofp <4 x i8> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt(<8 x double>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    mfvsrd r4, f0
+; CHECK-P8-NEXT:    clrldi r5, r4, 56
+; CHECK-P8-NEXT:    rldicl r6, r4, 56, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 48, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f1, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 40, 56
+; CHECK-P8-NEXT:    mtvsrwz f2, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 32, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f3, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 24, 56
+; CHECK-P8-NEXT:    mtvsrwz f4, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 16, 56
+; CHECK-P8-NEXT:    rldicl r4, r4, 8, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f5, r6
+; CHECK-P8-NEXT:    mtvsrwz f6, r5
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    mtvsrwz f7, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xscvuxddp f4, f4
+; CHECK-P8-NEXT:    xscvuxddp f5, f5
+; CHECK-P8-NEXT:    xscvuxddp f6, f6
+; CHECK-P8-NEXT:    xscvuxddp f7, f7
+; CHECK-P8-NEXT:    xscvuxddp f0, f0
+; CHECK-P8-NEXT:    xscvuxddp f1, f1
+; CHECK-P8-NEXT:    xscvuxddp f2, f2
+; CHECK-P8-NEXT:    xscvuxddp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-P8-NEXT:    xxmrghd vs5, vs7, vs6
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xxswapd vs2, vs5
+; CHECK-P8-NEXT:    xxswapd vs3, vs4
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    li r5, 1
+; CHECK-P9-NEXT:    li r6, 2
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    li r8, 4
+; CHECK-P9-NEXT:    li r9, 5
+; CHECK-P9-NEXT:    li r10, 6
+; CHECK-P9-NEXT:    li r11, 7
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    vextubrx r8, r8, v2
+; CHECK-P9-NEXT:    vextubrx r9, r9, v2
+; CHECK-P9-NEXT:    vextubrx r10, r10, v2
+; CHECK-P9-NEXT:    vextubrx r11, r11, v2
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r9, r9, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r10, r10, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r11, r11, 0, 24, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r4
+; CHECK-P9-NEXT:    mtvsrwz f1, r5
+; CHECK-P9-NEXT:    mtvsrwz f2, r6
+; CHECK-P9-NEXT:    mtvsrwz f3, r7
+; CHECK-P9-NEXT:    mtvsrwz f4, r8
+; CHECK-P9-NEXT:    mtvsrwz f5, r9
+; CHECK-P9-NEXT:    mtvsrwz f6, r10
+; CHECK-P9-NEXT:    mtvsrwz f7, r11
+; CHECK-P9-NEXT:    xscvuxddp f0, f0
+; CHECK-P9-NEXT:    xscvuxddp f1, f1
+; CHECK-P9-NEXT:    xscvuxddp f2, f2
+; CHECK-P9-NEXT:    xscvuxddp f3, f3
+; CHECK-P9-NEXT:    xscvuxddp f4, f4
+; CHECK-P9-NEXT:    xscvuxddp f5, f5
+; CHECK-P9-NEXT:    xscvuxddp f6, f6
+; CHECK-P9-NEXT:    xscvuxddp f7, f7
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r5, 1
+; CHECK-BE-NEXT:    mtvsrd v2, r4
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    li r6, 3
+; CHECK-BE-NEXT:    li r7, 2
+; CHECK-BE-NEXT:    li r8, 5
+; CHECK-BE-NEXT:    li r9, 4
+; CHECK-BE-NEXT:    li r10, 7
+; CHECK-BE-NEXT:    li r11, 6
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    vextublx r8, r8, v2
+; CHECK-BE-NEXT:    vextublx r9, r9, v2
+; CHECK-BE-NEXT:    vextublx r10, r10, v2
+; CHECK-BE-NEXT:    vextublx r11, r11, v2
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r9, r9, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r10, r10, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r11, r11, 0, 24, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r5
+; CHECK-BE-NEXT:    mtvsrwz f1, r4
+; CHECK-BE-NEXT:    mtvsrwz f2, r6
+; CHECK-BE-NEXT:    mtvsrwz f3, r7
+; CHECK-BE-NEXT:    mtvsrwz f4, r8
+; CHECK-BE-NEXT:    mtvsrwz f5, r9
+; CHECK-BE-NEXT:    mtvsrwz f6, r10
+; CHECK-BE-NEXT:    mtvsrwz f7, r11
+; CHECK-BE-NEXT:    xscvuxddp f0, f0
+; CHECK-BE-NEXT:    xscvuxddp f1, f1
+; CHECK-BE-NEXT:    xscvuxddp f2, f2
+; CHECK-BE-NEXT:    xscvuxddp f3, f3
+; CHECK-BE-NEXT:    xscvuxddp f4, f4
+; CHECK-BE-NEXT:    xscvuxddp f5, f5
+; CHECK-BE-NEXT:    xscvuxddp f6, f6
+; CHECK-BE-NEXT:    xscvuxddp f7, f7
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <8 x i8>
+  %1 = uitofp <8 x i8> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt(<16 x double>* noalias nocapture sret %agg.result, <16 x i8> %a) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mfvsrd r5, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    clrldi r6, r5, 56
+; CHECK-P8-NEXT:    rldicl r7, r5, 56, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f0, r6
+; CHECK-P8-NEXT:    rldicl r6, r5, 40, 56
+; CHECK-P8-NEXT:    rldicl r8, r5, 48, 56
+; CHECK-P8-NEXT:    mtvsrwz f1, r7
+; CHECK-P8-NEXT:    rldicl r7, r5, 32, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-P8-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r5, 24, 56
+; CHECK-P8-NEXT:    mtvsrwz f3, r8
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f5, r7
+; CHECK-P8-NEXT:    rldicl r7, r5, 16, 56
+; CHECK-P8-NEXT:    rldicl r5, r5, 8, 56
+; CHECK-P8-NEXT:    mfvsrd r8, f2
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f2, r6
+; CHECK-P8-NEXT:    rlwinm r6, r7, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f6, r6
+; CHECK-P8-NEXT:    clrldi r6, r8, 56
+; CHECK-P8-NEXT:    mtvsrwz f7, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 56, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f8, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 48, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f9, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 40, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f10, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 32, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f11, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 24, 56
+; CHECK-P8-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz f12, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 16, 56
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    xscvuxddp f6, f6
+; CHECK-P8-NEXT:    xscvuxddp f7, f7
+; CHECK-P8-NEXT:    mtvsrwz f13, r5
+; CHECK-P8-NEXT:    rlwinm r5, r6, 0, 24, 31
+; CHECK-P8-NEXT:    mtvsrwz v2, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 8, 56
+; CHECK-P8-NEXT:    xscvuxddp f5, f5
+; CHECK-P8-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P8-NEXT:    xscvuxddp f2, f2
+; CHECK-P8-NEXT:    xscvuxddp f0, f0
+; CHECK-P8-NEXT:    xscvuxddp f1, f1
+; CHECK-P8-NEXT:    xxmrghd vs6, vs7, vs6
+; CHECK-P8-NEXT:    mtvsrwz v3, r5
+; CHECK-P8-NEXT:    li r5, 64
+; CHECK-P8-NEXT:    xscvuxddp f3, f3
+; CHECK-P8-NEXT:    xscvuxddp f4, f4
+; CHECK-P8-NEXT:    xscvuxddp f31, v2
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs5
+; CHECK-P8-NEXT:    xscvuxddp f7, v3
+; CHECK-P8-NEXT:    xscvuxddp f8, f8
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xscvuxddp f9, f9
+; CHECK-P8-NEXT:    xxswapd vs1, vs6
+; CHECK-P8-NEXT:    xscvuxddp f10, f10
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvuxddp f12, f12
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    xscvuxddp f13, f13
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xscvuxddp f11, f11
+; CHECK-P8-NEXT:    xxmrghd vs6, vs7, vs31
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs2, vs6
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    xxmrghd vs5, vs13, vs12
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xxmrghd vs1, vs11, vs10
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs5, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs4, 0, r3
+; CHECK-P8-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r5, 1
+; CHECK-P9-NEXT:    li r6, 2
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    li r8, 4
+; CHECK-P9-NEXT:    li r9, 5
+; CHECK-P9-NEXT:    li r10, 6
+; CHECK-P9-NEXT:    li r11, 7
+; CHECK-P9-NEXT:    li r12, 8
+; CHECK-P9-NEXT:    li r0, 9
+; CHECK-P9-NEXT:    li r30, 10
+; CHECK-P9-NEXT:    li r29, 11
+; CHECK-P9-NEXT:    li r28, 12
+; CHECK-P9-NEXT:    li r27, 13
+; CHECK-P9-NEXT:    li r26, 14
+; CHECK-P9-NEXT:    li r25, 15
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    vextubrx r8, r8, v2
+; CHECK-P9-NEXT:    vextubrx r9, r9, v2
+; CHECK-P9-NEXT:    vextubrx r10, r10, v2
+; CHECK-P9-NEXT:    vextubrx r11, r11, v2
+; CHECK-P9-NEXT:    vextubrx r12, r12, v2
+; CHECK-P9-NEXT:    vextubrx r0, r0, v2
+; CHECK-P9-NEXT:    vextubrx r30, r30, v2
+; CHECK-P9-NEXT:    vextubrx r29, r29, v2
+; CHECK-P9-NEXT:    vextubrx r28, r28, v2
+; CHECK-P9-NEXT:    vextubrx r27, r27, v2
+; CHECK-P9-NEXT:    vextubrx r26, r26, v2
+; CHECK-P9-NEXT:    vextubrx r25, r25, v2
+; CHECK-P9-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r9, r9, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r10, r10, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r11, r11, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r12, r12, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r0, r0, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r30, r30, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r29, r29, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r28, r28, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r27, r27, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r26, r26, 0, 24, 31
+; CHECK-P9-NEXT:    rlwinm r25, r25, 0, 24, 31
+; CHECK-P9-NEXT:    mtvsrwz f0, r4
+; CHECK-P9-NEXT:    mtvsrwz f1, r5
+; CHECK-P9-NEXT:    mtvsrwz f2, r6
+; CHECK-P9-NEXT:    mtvsrwz f3, r7
+; CHECK-P9-NEXT:    mtvsrwz f4, r8
+; CHECK-P9-NEXT:    mtvsrwz f5, r9
+; CHECK-P9-NEXT:    mtvsrwz f6, r10
+; CHECK-P9-NEXT:    mtvsrwz f7, r11
+; CHECK-P9-NEXT:    mtvsrwz f8, r12
+; CHECK-P9-NEXT:    mtvsrwz f9, r0
+; CHECK-P9-NEXT:    mtvsrwz f10, r30
+; CHECK-P9-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f11, r29
+; CHECK-P9-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f12, r28
+; CHECK-P9-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz f13, r27
+; CHECK-P9-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz v2, r26
+; CHECK-P9-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwz v3, r25
+; CHECK-P9-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xscvuxddp f0, f0
+; CHECK-P9-NEXT:    xscvuxddp f1, f1
+; CHECK-P9-NEXT:    xscvuxddp f2, f2
+; CHECK-P9-NEXT:    xscvuxddp f3, f3
+; CHECK-P9-NEXT:    xscvuxddp f4, f4
+; CHECK-P9-NEXT:    xscvuxddp f5, f5
+; CHECK-P9-NEXT:    xscvuxddp f6, f6
+; CHECK-P9-NEXT:    xscvuxddp f7, f7
+; CHECK-P9-NEXT:    xscvuxddp f8, f8
+; CHECK-P9-NEXT:    xscvuxddp f9, f9
+; CHECK-P9-NEXT:    xscvuxddp f10, f10
+; CHECK-P9-NEXT:    xscvuxddp f11, f11
+; CHECK-P9-NEXT:    xscvuxddp f12, f12
+; CHECK-P9-NEXT:    xscvuxddp f13, f13
+; CHECK-P9-NEXT:    xscvuxddp f31, v2
+; CHECK-P9-NEXT:    xscvuxddp f30, v3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-P9-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-P9-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs4, 64(r3)
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs7, 112(r3)
+; CHECK-P9-NEXT:    stxv vs6, 96(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r4, 1
+; CHECK-BE-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r5, 0
+; CHECK-BE-NEXT:    li r6, 3
+; CHECK-BE-NEXT:    li r7, 2
+; CHECK-BE-NEXT:    li r8, 5
+; CHECK-BE-NEXT:    li r9, 4
+; CHECK-BE-NEXT:    li r10, 7
+; CHECK-BE-NEXT:    li r11, 6
+; CHECK-BE-NEXT:    li r12, 9
+; CHECK-BE-NEXT:    li r0, 8
+; CHECK-BE-NEXT:    li r30, 11
+; CHECK-BE-NEXT:    li r29, 10
+; CHECK-BE-NEXT:    li r28, 13
+; CHECK-BE-NEXT:    li r27, 12
+; CHECK-BE-NEXT:    li r26, 15
+; CHECK-BE-NEXT:    li r25, 14
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    vextublx r8, r8, v2
+; CHECK-BE-NEXT:    vextublx r9, r9, v2
+; CHECK-BE-NEXT:    vextublx r10, r10, v2
+; CHECK-BE-NEXT:    vextublx r11, r11, v2
+; CHECK-BE-NEXT:    vextublx r12, r12, v2
+; CHECK-BE-NEXT:    vextublx r0, r0, v2
+; CHECK-BE-NEXT:    vextublx r30, r30, v2
+; CHECK-BE-NEXT:    vextublx r29, r29, v2
+; CHECK-BE-NEXT:    vextublx r28, r28, v2
+; CHECK-BE-NEXT:    vextublx r27, r27, v2
+; CHECK-BE-NEXT:    vextublx r26, r26, v2
+; CHECK-BE-NEXT:    vextublx r25, r25, v2
+; CHECK-BE-NEXT:    rlwinm r4, r4, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r5, r5, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r6, r6, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r7, r7, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r8, r8, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r9, r9, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r10, r10, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r11, r11, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r12, r12, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r0, r0, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r30, r30, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r29, r29, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r28, r28, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r27, r27, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r26, r26, 0, 24, 31
+; CHECK-BE-NEXT:    rlwinm r25, r25, 0, 24, 31
+; CHECK-BE-NEXT:    mtvsrwz f0, r4
+; CHECK-BE-NEXT:    mtvsrwz f1, r5
+; CHECK-BE-NEXT:    mtvsrwz f2, r6
+; CHECK-BE-NEXT:    mtvsrwz f3, r7
+; CHECK-BE-NEXT:    mtvsrwz f4, r8
+; CHECK-BE-NEXT:    mtvsrwz f5, r9
+; CHECK-BE-NEXT:    mtvsrwz f6, r10
+; CHECK-BE-NEXT:    mtvsrwz f7, r11
+; CHECK-BE-NEXT:    mtvsrwz f8, r12
+; CHECK-BE-NEXT:    mtvsrwz f9, r0
+; CHECK-BE-NEXT:    mtvsrwz f10, r30
+; CHECK-BE-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f11, r29
+; CHECK-BE-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f12, r28
+; CHECK-BE-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz f13, r27
+; CHECK-BE-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz v2, r26
+; CHECK-BE-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwz v3, r25
+; CHECK-BE-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xscvuxddp f0, f0
+; CHECK-BE-NEXT:    xscvuxddp f1, f1
+; CHECK-BE-NEXT:    xscvuxddp f2, f2
+; CHECK-BE-NEXT:    xscvuxddp f3, f3
+; CHECK-BE-NEXT:    xscvuxddp f4, f4
+; CHECK-BE-NEXT:    xscvuxddp f5, f5
+; CHECK-BE-NEXT:    xscvuxddp f6, f6
+; CHECK-BE-NEXT:    xscvuxddp f7, f7
+; CHECK-BE-NEXT:    xscvuxddp f8, f8
+; CHECK-BE-NEXT:    xscvuxddp f9, f9
+; CHECK-BE-NEXT:    xscvuxddp f10, f10
+; CHECK-BE-NEXT:    xscvuxddp f11, f11
+; CHECK-BE-NEXT:    xscvuxddp f12, f12
+; CHECK-BE-NEXT:    xscvuxddp f13, f13
+; CHECK-BE-NEXT:    xscvuxddp f31, v2
+; CHECK-BE-NEXT:    xscvuxddp f30, v3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-BE-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = uitofp <16 x i8> %a to <16 x double>
+  store <16 x double> %0, <16 x double>* %agg.result, align 128
+  ret void
+}
+
+define <2 x double> @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    clrldi r4, r3, 56
+; CHECK-P8-NEXT:    rldicl r3, r3, 56, 56
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    extsb r3, r3
+; CHECK-P8-NEXT:    mtvsrwa f0, r4
+; CHECK-P8-NEXT:    mtvsrwa f1, r3
+; CHECK-P8-NEXT:    xscvsxddp f0, f0
+; CHECK-P8-NEXT:    xscvsxddp f1, f1
+; CHECK-P8-NEXT:    xxmrghd v2, vs1, vs0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    li r3, 0
+; CHECK-P9-NEXT:    li r4, 1
+; CHECK-P9-NEXT:    vextubrx r3, r3, v2
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    extsb r3, r3
+; CHECK-P9-NEXT:    extsb r4, r4
+; CHECK-P9-NEXT:    mtvsrwa f0, r3
+; CHECK-P9-NEXT:    mtvsrwa f1, r4
+; CHECK-P9-NEXT:    xscvsxddp f0, f0
+; CHECK-P9-NEXT:    xscvsxddp f1, f1
+; CHECK-P9-NEXT:    xxmrghd v2, vs1, vs0
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    li r3, 1
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    vextublx r3, r3, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    extsb r3, r3
+; CHECK-BE-NEXT:    extsb r4, r4
+; CHECK-BE-NEXT:    mtvsrwa f0, r3
+; CHECK-BE-NEXT:    mtvsrwa f1, r4
+; CHECK-BE-NEXT:    xscvsxddp f0, f0
+; CHECK-BE-NEXT:    xscvsxddp f1, f1
+; CHECK-BE-NEXT:    xxmrghd v2, vs1, vs0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i16 %a.coerce to <2 x i8>
+  %1 = sitofp <2 x i8> %0 to <2 x double>
+  ret <2 x double> %1
+}
+
+define void @test4elt_signed(<4 x double>* noalias nocapture sret %agg.result, i32 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    mfvsrd r4, f0
+; CHECK-P8-NEXT:    clrldi r5, r4, 56
+; CHECK-P8-NEXT:    rldicl r6, r4, 56, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f0, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 48, 56
+; CHECK-P8-NEXT:    rldicl r4, r4, 40, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f1, r6
+; CHECK-P8-NEXT:    mtvsrwa f2, r5
+; CHECK-P8-NEXT:    mtvsrwa f3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xscvsxddp f0, f0
+; CHECK-P8-NEXT:    xscvsxddp f1, f1
+; CHECK-P8-NEXT:    xscvsxddp f2, f2
+; CHECK-P8-NEXT:    xscvsxddp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrws v2, r4
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    li r5, 1
+; CHECK-P9-NEXT:    li r6, 2
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    extsb r4, r4
+; CHECK-P9-NEXT:    extsb r5, r5
+; CHECK-P9-NEXT:    extsb r6, r6
+; CHECK-P9-NEXT:    extsb r7, r7
+; CHECK-P9-NEXT:    mtvsrwa f0, r4
+; CHECK-P9-NEXT:    mtvsrwa f1, r5
+; CHECK-P9-NEXT:    mtvsrwa f2, r6
+; CHECK-P9-NEXT:    mtvsrwa f3, r7
+; CHECK-P9-NEXT:    xscvsxddp f0, f0
+; CHECK-P9-NEXT:    xscvsxddp f1, f1
+; CHECK-P9-NEXT:    xscvsxddp f2, f2
+; CHECK-P9-NEXT:    xscvsxddp f3, f3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrws v2, r4
+; CHECK-BE-NEXT:    li r4, 1
+; CHECK-BE-NEXT:    li r5, 0
+; CHECK-BE-NEXT:    li r6, 3
+; CHECK-BE-NEXT:    li r7, 2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    extsb r4, r4
+; CHECK-BE-NEXT:    extsb r5, r5
+; CHECK-BE-NEXT:    extsb r6, r6
+; CHECK-BE-NEXT:    extsb r7, r7
+; CHECK-BE-NEXT:    mtvsrwa f0, r4
+; CHECK-BE-NEXT:    mtvsrwa f1, r5
+; CHECK-BE-NEXT:    mtvsrwa f2, r6
+; CHECK-BE-NEXT:    mtvsrwa f3, r7
+; CHECK-BE-NEXT:    xscvsxddp f0, f0
+; CHECK-BE-NEXT:    xscvsxddp f1, f1
+; CHECK-BE-NEXT:    xscvsxddp f2, f2
+; CHECK-BE-NEXT:    xscvsxddp f3, f3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i32 %a.coerce to <4 x i8>
+  %1 = sitofp <4 x i8> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt_signed(<8 x double>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r4
+; CHECK-P8-NEXT:    mfvsrd r4, f0
+; CHECK-P8-NEXT:    clrldi r5, r4, 56
+; CHECK-P8-NEXT:    rldicl r6, r4, 56, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f0, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 48, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f1, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 40, 56
+; CHECK-P8-NEXT:    mtvsrwa f2, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 32, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f3, r6
+; CHECK-P8-NEXT:    rldicl r6, r4, 24, 56
+; CHECK-P8-NEXT:    mtvsrwa f4, r5
+; CHECK-P8-NEXT:    rldicl r5, r4, 16, 56
+; CHECK-P8-NEXT:    rldicl r4, r4, 8, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    extsb r4, r4
+; CHECK-P8-NEXT:    mtvsrwa f5, r6
+; CHECK-P8-NEXT:    mtvsrwa f6, r5
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    mtvsrwa f7, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    xscvsxddp f4, f4
+; CHECK-P8-NEXT:    xscvsxddp f5, f5
+; CHECK-P8-NEXT:    xscvsxddp f6, f6
+; CHECK-P8-NEXT:    xscvsxddp f7, f7
+; CHECK-P8-NEXT:    xscvsxddp f0, f0
+; CHECK-P8-NEXT:    xscvsxddp f1, f1
+; CHECK-P8-NEXT:    xscvsxddp f2, f2
+; CHECK-P8-NEXT:    xscvsxddp f3, f3
+; CHECK-P8-NEXT:    xxmrghd vs4, vs5, vs4
+; CHECK-P8-NEXT:    xxmrghd vs5, vs7, vs6
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P8-NEXT:    xxswapd vs2, vs5
+; CHECK-P8-NEXT:    xxswapd vs3, vs4
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r4
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    li r5, 1
+; CHECK-P9-NEXT:    li r6, 2
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    li r8, 4
+; CHECK-P9-NEXT:    li r9, 5
+; CHECK-P9-NEXT:    li r10, 6
+; CHECK-P9-NEXT:    li r11, 7
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    vextubrx r8, r8, v2
+; CHECK-P9-NEXT:    vextubrx r9, r9, v2
+; CHECK-P9-NEXT:    vextubrx r10, r10, v2
+; CHECK-P9-NEXT:    vextubrx r11, r11, v2
+; CHECK-P9-NEXT:    extsb r4, r4
+; CHECK-P9-NEXT:    extsb r5, r5
+; CHECK-P9-NEXT:    extsb r6, r6
+; CHECK-P9-NEXT:    extsb r7, r7
+; CHECK-P9-NEXT:    extsb r8, r8
+; CHECK-P9-NEXT:    extsb r9, r9
+; CHECK-P9-NEXT:    extsb r10, r10
+; CHECK-P9-NEXT:    extsb r11, r11
+; CHECK-P9-NEXT:    mtvsrwa f0, r4
+; CHECK-P9-NEXT:    mtvsrwa f1, r5
+; CHECK-P9-NEXT:    mtvsrwa f2, r6
+; CHECK-P9-NEXT:    mtvsrwa f3, r7
+; CHECK-P9-NEXT:    mtvsrwa f4, r8
+; CHECK-P9-NEXT:    mtvsrwa f5, r9
+; CHECK-P9-NEXT:    mtvsrwa f6, r10
+; CHECK-P9-NEXT:    mtvsrwa f7, r11
+; CHECK-P9-NEXT:    xscvsxddp f0, f0
+; CHECK-P9-NEXT:    xscvsxddp f1, f1
+; CHECK-P9-NEXT:    xscvsxddp f2, f2
+; CHECK-P9-NEXT:    xscvsxddp f3, f3
+; CHECK-P9-NEXT:    xscvsxddp f4, f4
+; CHECK-P9-NEXT:    xscvsxddp f5, f5
+; CHECK-P9-NEXT:    xscvsxddp f6, f6
+; CHECK-P9-NEXT:    xscvsxddp f7, f7
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r5, 1
+; CHECK-BE-NEXT:    mtvsrd v2, r4
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    li r6, 3
+; CHECK-BE-NEXT:    li r7, 2
+; CHECK-BE-NEXT:    li r8, 5
+; CHECK-BE-NEXT:    li r9, 4
+; CHECK-BE-NEXT:    li r10, 7
+; CHECK-BE-NEXT:    li r11, 6
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    vextublx r8, r8, v2
+; CHECK-BE-NEXT:    vextublx r9, r9, v2
+; CHECK-BE-NEXT:    vextublx r10, r10, v2
+; CHECK-BE-NEXT:    vextublx r11, r11, v2
+; CHECK-BE-NEXT:    extsb r5, r5
+; CHECK-BE-NEXT:    extsb r4, r4
+; CHECK-BE-NEXT:    extsb r6, r6
+; CHECK-BE-NEXT:    extsb r7, r7
+; CHECK-BE-NEXT:    extsb r8, r8
+; CHECK-BE-NEXT:    extsb r9, r9
+; CHECK-BE-NEXT:    extsb r10, r10
+; CHECK-BE-NEXT:    extsb r11, r11
+; CHECK-BE-NEXT:    mtvsrwa f0, r5
+; CHECK-BE-NEXT:    mtvsrwa f1, r4
+; CHECK-BE-NEXT:    mtvsrwa f2, r6
+; CHECK-BE-NEXT:    mtvsrwa f3, r7
+; CHECK-BE-NEXT:    mtvsrwa f4, r8
+; CHECK-BE-NEXT:    mtvsrwa f5, r9
+; CHECK-BE-NEXT:    mtvsrwa f6, r10
+; CHECK-BE-NEXT:    mtvsrwa f7, r11
+; CHECK-BE-NEXT:    xscvsxddp f0, f0
+; CHECK-BE-NEXT:    xscvsxddp f1, f1
+; CHECK-BE-NEXT:    xscvsxddp f2, f2
+; CHECK-BE-NEXT:    xscvsxddp f3, f3
+; CHECK-BE-NEXT:    xscvsxddp f4, f4
+; CHECK-BE-NEXT:    xscvsxddp f5, f5
+; CHECK-BE-NEXT:    xscvsxddp f6, f6
+; CHECK-BE-NEXT:    xscvsxddp f7, f7
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <8 x i8>
+  %1 = sitofp <8 x i8> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt_signed(<16 x double>* noalias nocapture sret %agg.result, <16 x i8> %a) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mfvsrd r5, v2
+; CHECK-P8-NEXT:    xxswapd vs2, v2
+; CHECK-P8-NEXT:    li r4, 112
+; CHECK-P8-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    clrldi r6, r5, 56
+; CHECK-P8-NEXT:    rldicl r7, r5, 56, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    extsb r7, r7
+; CHECK-P8-NEXT:    mtvsrwa f0, r6
+; CHECK-P8-NEXT:    rldicl r6, r5, 40, 56
+; CHECK-P8-NEXT:    rldicl r8, r5, 48, 56
+; CHECK-P8-NEXT:    mtvsrwa f1, r7
+; CHECK-P8-NEXT:    rldicl r7, r5, 32, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    extsb r8, r8
+; CHECK-P8-NEXT:    extsb r7, r7
+; CHECK-P8-NEXT:    mtvsrwa f4, r6
+; CHECK-P8-NEXT:    rldicl r6, r5, 24, 56
+; CHECK-P8-NEXT:    mtvsrwa f3, r8
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f5, r7
+; CHECK-P8-NEXT:    rldicl r7, r5, 16, 56
+; CHECK-P8-NEXT:    rldicl r5, r5, 8, 56
+; CHECK-P8-NEXT:    mfvsrd r8, f2
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f2, r6
+; CHECK-P8-NEXT:    extsb r6, r7
+; CHECK-P8-NEXT:    mtvsrwa f6, r6
+; CHECK-P8-NEXT:    clrldi r6, r8, 56
+; CHECK-P8-NEXT:    mtvsrwa f7, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 56, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f8, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 48, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f9, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 40, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f10, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 32, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    mtvsrwa f11, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 24, 56
+; CHECK-P8-NEXT:    extsb r6, r6
+; CHECK-P8-NEXT:    mtvsrwa f12, r6
+; CHECK-P8-NEXT:    rldicl r6, r8, 16, 56
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    xscvsxddp f6, f6
+; CHECK-P8-NEXT:    xscvsxddp f7, f7
+; CHECK-P8-NEXT:    mtvsrwa f13, r5
+; CHECK-P8-NEXT:    extsb r5, r6
+; CHECK-P8-NEXT:    mtvsrwa v2, r5
+; CHECK-P8-NEXT:    rldicl r5, r8, 8, 56
+; CHECK-P8-NEXT:    xscvsxddp f5, f5
+; CHECK-P8-NEXT:    extsb r5, r5
+; CHECK-P8-NEXT:    xscvsxddp f2, f2
+; CHECK-P8-NEXT:    xscvsxddp f0, f0
+; CHECK-P8-NEXT:    xscvsxddp f1, f1
+; CHECK-P8-NEXT:    xxmrghd vs6, vs7, vs6
+; CHECK-P8-NEXT:    mtvsrwa v3, r5
+; CHECK-P8-NEXT:    li r5, 64
+; CHECK-P8-NEXT:    xscvsxddp f3, f3
+; CHECK-P8-NEXT:    xscvsxddp f4, f4
+; CHECK-P8-NEXT:    xscvsxddp f31, v2
+; CHECK-P8-NEXT:    xxmrghd vs2, vs2, vs5
+; CHECK-P8-NEXT:    xscvsxddp f7, v3
+; CHECK-P8-NEXT:    xscvsxddp f8, f8
+; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P8-NEXT:    xscvsxddp f9, f9
+; CHECK-P8-NEXT:    xxswapd vs1, vs6
+; CHECK-P8-NEXT:    xscvsxddp f10, f10
+; CHECK-P8-NEXT:    xxswapd vs2, vs2
+; CHECK-P8-NEXT:    xscvsxddp f12, f12
+; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs3
+; CHECK-P8-NEXT:    xscvsxddp f13, f13
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xscvsxddp f11, f11
+; CHECK-P8-NEXT:    xxmrghd vs6, vs7, vs31
+; CHECK-P8-NEXT:    xxswapd vs3, vs3
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs2, vs6
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r4
+; CHECK-P8-NEXT:    li r4, 48
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    li r5, 32
+; CHECK-P8-NEXT:    xxmrghd vs5, vs13, vs12
+; CHECK-P8-NEXT:    xxswapd vs4, vs4
+; CHECK-P8-NEXT:    xxmrghd vs1, vs11, vs10
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
+; CHECK-P8-NEXT:    li r4, 16
+; CHECK-P8-NEXT:    xxswapd vs5, vs5
+; CHECK-P8-NEXT:    xxswapd vs1, vs1
+; CHECK-P8-NEXT:    stxvd2x vs5, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
+; CHECK-P8-NEXT:    stxvd2x vs4, 0, r3
+; CHECK-P8-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r4, 0
+; CHECK-P9-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    li r5, 1
+; CHECK-P9-NEXT:    li r6, 2
+; CHECK-P9-NEXT:    li r7, 3
+; CHECK-P9-NEXT:    li r8, 4
+; CHECK-P9-NEXT:    li r9, 5
+; CHECK-P9-NEXT:    li r10, 6
+; CHECK-P9-NEXT:    li r11, 7
+; CHECK-P9-NEXT:    li r12, 8
+; CHECK-P9-NEXT:    li r0, 9
+; CHECK-P9-NEXT:    li r30, 10
+; CHECK-P9-NEXT:    li r29, 11
+; CHECK-P9-NEXT:    li r28, 12
+; CHECK-P9-NEXT:    li r27, 13
+; CHECK-P9-NEXT:    li r26, 14
+; CHECK-P9-NEXT:    li r25, 15
+; CHECK-P9-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-P9-NEXT:    vextubrx r4, r4, v2
+; CHECK-P9-NEXT:    vextubrx r5, r5, v2
+; CHECK-P9-NEXT:    vextubrx r6, r6, v2
+; CHECK-P9-NEXT:    vextubrx r7, r7, v2
+; CHECK-P9-NEXT:    vextubrx r8, r8, v2
+; CHECK-P9-NEXT:    vextubrx r9, r9, v2
+; CHECK-P9-NEXT:    vextubrx r10, r10, v2
+; CHECK-P9-NEXT:    vextubrx r11, r11, v2
+; CHECK-P9-NEXT:    vextubrx r12, r12, v2
+; CHECK-P9-NEXT:    vextubrx r0, r0, v2
+; CHECK-P9-NEXT:    vextubrx r30, r30, v2
+; CHECK-P9-NEXT:    vextubrx r29, r29, v2
+; CHECK-P9-NEXT:    vextubrx r28, r28, v2
+; CHECK-P9-NEXT:    vextubrx r27, r27, v2
+; CHECK-P9-NEXT:    vextubrx r26, r26, v2
+; CHECK-P9-NEXT:    vextubrx r25, r25, v2
+; CHECK-P9-NEXT:    extsb r4, r4
+; CHECK-P9-NEXT:    extsb r5, r5
+; CHECK-P9-NEXT:    extsb r6, r6
+; CHECK-P9-NEXT:    extsb r7, r7
+; CHECK-P9-NEXT:    extsb r8, r8
+; CHECK-P9-NEXT:    extsb r9, r9
+; CHECK-P9-NEXT:    extsb r10, r10
+; CHECK-P9-NEXT:    extsb r11, r11
+; CHECK-P9-NEXT:    extsb r12, r12
+; CHECK-P9-NEXT:    extsb r0, r0
+; CHECK-P9-NEXT:    extsb r30, r30
+; CHECK-P9-NEXT:    extsb r29, r29
+; CHECK-P9-NEXT:    extsb r28, r28
+; CHECK-P9-NEXT:    extsb r27, r27
+; CHECK-P9-NEXT:    extsb r26, r26
+; CHECK-P9-NEXT:    extsb r25, r25
+; CHECK-P9-NEXT:    mtvsrwa f0, r4
+; CHECK-P9-NEXT:    mtvsrwa f1, r5
+; CHECK-P9-NEXT:    mtvsrwa f2, r6
+; CHECK-P9-NEXT:    mtvsrwa f3, r7
+; CHECK-P9-NEXT:    mtvsrwa f4, r8
+; CHECK-P9-NEXT:    mtvsrwa f5, r9
+; CHECK-P9-NEXT:    mtvsrwa f6, r10
+; CHECK-P9-NEXT:    mtvsrwa f7, r11
+; CHECK-P9-NEXT:    mtvsrwa f8, r12
+; CHECK-P9-NEXT:    mtvsrwa f9, r0
+; CHECK-P9-NEXT:    mtvsrwa f10, r30
+; CHECK-P9-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f11, r29
+; CHECK-P9-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f12, r28
+; CHECK-P9-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa f13, r27
+; CHECK-P9-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa v2, r26
+; CHECK-P9-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    mtvsrwa v3, r25
+; CHECK-P9-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    xscvsxddp f0, f0
+; CHECK-P9-NEXT:    xscvsxddp f1, f1
+; CHECK-P9-NEXT:    xscvsxddp f2, f2
+; CHECK-P9-NEXT:    xscvsxddp f3, f3
+; CHECK-P9-NEXT:    xscvsxddp f4, f4
+; CHECK-P9-NEXT:    xscvsxddp f5, f5
+; CHECK-P9-NEXT:    xscvsxddp f6, f6
+; CHECK-P9-NEXT:    xscvsxddp f7, f7
+; CHECK-P9-NEXT:    xscvsxddp f8, f8
+; CHECK-P9-NEXT:    xscvsxddp f9, f9
+; CHECK-P9-NEXT:    xscvsxddp f10, f10
+; CHECK-P9-NEXT:    xscvsxddp f11, f11
+; CHECK-P9-NEXT:    xscvsxddp f12, f12
+; CHECK-P9-NEXT:    xscvsxddp f13, f13
+; CHECK-P9-NEXT:    xscvsxddp f31, v2
+; CHECK-P9-NEXT:    xscvsxddp f30, v3
+; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-P9-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-P9-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-P9-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-P9-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-P9-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-P9-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-P9-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs4, 64(r3)
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs7, 112(r3)
+; CHECK-P9-NEXT:    stxv vs6, 96(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    std r25, -72(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r26, -64(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r27, -56(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r28, -48(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    std r29, -40(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r4, 1
+; CHECK-BE-NEXT:    std r30, -32(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    li r5, 0
+; CHECK-BE-NEXT:    li r6, 3
+; CHECK-BE-NEXT:    li r7, 2
+; CHECK-BE-NEXT:    li r8, 5
+; CHECK-BE-NEXT:    li r9, 4
+; CHECK-BE-NEXT:    li r10, 7
+; CHECK-BE-NEXT:    li r11, 6
+; CHECK-BE-NEXT:    li r12, 9
+; CHECK-BE-NEXT:    li r0, 8
+; CHECK-BE-NEXT:    li r30, 11
+; CHECK-BE-NEXT:    li r29, 10
+; CHECK-BE-NEXT:    li r28, 13
+; CHECK-BE-NEXT:    li r27, 12
+; CHECK-BE-NEXT:    li r26, 15
+; CHECK-BE-NEXT:    li r25, 14
+; CHECK-BE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; CHECK-BE-NEXT:    vextublx r4, r4, v2
+; CHECK-BE-NEXT:    vextublx r5, r5, v2
+; CHECK-BE-NEXT:    vextublx r6, r6, v2
+; CHECK-BE-NEXT:    vextublx r7, r7, v2
+; CHECK-BE-NEXT:    vextublx r8, r8, v2
+; CHECK-BE-NEXT:    vextublx r9, r9, v2
+; CHECK-BE-NEXT:    vextublx r10, r10, v2
+; CHECK-BE-NEXT:    vextublx r11, r11, v2
+; CHECK-BE-NEXT:    vextublx r12, r12, v2
+; CHECK-BE-NEXT:    vextublx r0, r0, v2
+; CHECK-BE-NEXT:    vextublx r30, r30, v2
+; CHECK-BE-NEXT:    vextublx r29, r29, v2
+; CHECK-BE-NEXT:    vextublx r28, r28, v2
+; CHECK-BE-NEXT:    vextublx r27, r27, v2
+; CHECK-BE-NEXT:    vextublx r26, r26, v2
+; CHECK-BE-NEXT:    vextublx r25, r25, v2
+; CHECK-BE-NEXT:    extsb r4, r4
+; CHECK-BE-NEXT:    extsb r5, r5
+; CHECK-BE-NEXT:    extsb r6, r6
+; CHECK-BE-NEXT:    extsb r7, r7
+; CHECK-BE-NEXT:    extsb r8, r8
+; CHECK-BE-NEXT:    extsb r9, r9
+; CHECK-BE-NEXT:    extsb r10, r10
+; CHECK-BE-NEXT:    extsb r11, r11
+; CHECK-BE-NEXT:    extsb r12, r12
+; CHECK-BE-NEXT:    extsb r0, r0
+; CHECK-BE-NEXT:    extsb r30, r30
+; CHECK-BE-NEXT:    extsb r29, r29
+; CHECK-BE-NEXT:    extsb r28, r28
+; CHECK-BE-NEXT:    extsb r27, r27
+; CHECK-BE-NEXT:    extsb r26, r26
+; CHECK-BE-NEXT:    extsb r25, r25
+; CHECK-BE-NEXT:    mtvsrwa f0, r4
+; CHECK-BE-NEXT:    mtvsrwa f1, r5
+; CHECK-BE-NEXT:    mtvsrwa f2, r6
+; CHECK-BE-NEXT:    mtvsrwa f3, r7
+; CHECK-BE-NEXT:    mtvsrwa f4, r8
+; CHECK-BE-NEXT:    mtvsrwa f5, r9
+; CHECK-BE-NEXT:    mtvsrwa f6, r10
+; CHECK-BE-NEXT:    mtvsrwa f7, r11
+; CHECK-BE-NEXT:    mtvsrwa f8, r12
+; CHECK-BE-NEXT:    mtvsrwa f9, r0
+; CHECK-BE-NEXT:    mtvsrwa f10, r30
+; CHECK-BE-NEXT:    ld r30, -32(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f11, r29
+; CHECK-BE-NEXT:    ld r29, -40(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f12, r28
+; CHECK-BE-NEXT:    ld r28, -48(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa f13, r27
+; CHECK-BE-NEXT:    ld r27, -56(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa v2, r26
+; CHECK-BE-NEXT:    ld r26, -64(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    mtvsrwa v3, r25
+; CHECK-BE-NEXT:    ld r25, -72(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    xscvsxddp f0, f0
+; CHECK-BE-NEXT:    xscvsxddp f1, f1
+; CHECK-BE-NEXT:    xscvsxddp f2, f2
+; CHECK-BE-NEXT:    xscvsxddp f3, f3
+; CHECK-BE-NEXT:    xscvsxddp f4, f4
+; CHECK-BE-NEXT:    xscvsxddp f5, f5
+; CHECK-BE-NEXT:    xscvsxddp f6, f6
+; CHECK-BE-NEXT:    xscvsxddp f7, f7
+; CHECK-BE-NEXT:    xscvsxddp f8, f8
+; CHECK-BE-NEXT:    xscvsxddp f9, f9
+; CHECK-BE-NEXT:    xscvsxddp f10, f10
+; CHECK-BE-NEXT:    xscvsxddp f11, f11
+; CHECK-BE-NEXT:    xscvsxddp f12, f12
+; CHECK-BE-NEXT:    xscvsxddp f13, f13
+; CHECK-BE-NEXT:    xscvsxddp f31, v2
+; CHECK-BE-NEXT:    xscvsxddp f30, v3
+; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghd vs1, vs3, vs2
+; CHECK-BE-NEXT:    xxmrghd vs2, vs5, vs4
+; CHECK-BE-NEXT:    xxmrghd vs3, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd vs4, vs9, vs8
+; CHECK-BE-NEXT:    xxmrghd vs5, vs11, vs10
+; CHECK-BE-NEXT:    xxmrghd vs6, vs13, vs12
+; CHECK-BE-NEXT:    xxmrghd vs7, vs30, vs31
+; CHECK-BE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = sitofp <16 x i8> %a to <16 x double>
+  store <16 x double> %0, <16 x double>* %agg.result, align 128
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_i_to_fp_4byte_elts.ll b/test/CodeGen/PowerPC/vec_conv_i_to_fp_4byte_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..384bcb69937825e5e0f40ee79ce66de165cb6937
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_i_to_fp_4byte_elts.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define i64 @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xvcvuxwsp vs0, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xvcvuxwsp vs0, v2
+; CHECK-P9-NEXT:    mfvsrld r3, vs0
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xvcvuxwsp vs0, vs0
+; CHECK-BE-NEXT:    mfvsrd r3, f0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x i32>
+  %1 = uitofp <2 x i32> %0 to <2 x float>
+  %2 = bitcast <2 x float> %1 to i64
+  ret i64 %2
+}
+
+define <4 x float> @test4elt(<4 x i32> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xvcvuxwsp v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xvcvuxwsp v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xvcvuxwsp v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = uitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %0
+}
+
+define void @test8elt(<8 x float>* noalias nocapture sret %agg.result, <8 x i32>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    xvcvuxwsp v3, v3
+; CHECK-P8-NEXT:    xvcvuxwsp v2, v2
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xvcvuxwsp vs1, vs1
+; CHECK-P9-NEXT:    xvcvuxwsp vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 16(r3)
+; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xvcvuxwsp vs1, vs1
+; CHECK-BE-NEXT:    xvcvuxwsp vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x i32>, <8 x i32>* %0, align 32
+  %1 = uitofp <8 x i32> %a to <8 x float>
+  store <8 x float> %1, <8 x float>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt(<16 x float>* noalias nocapture sret %agg.result, <16 x i32>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    lvx v5, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r6
+; CHECK-P8-NEXT:    lvx v4, r4, r7
+; CHECK-P8-NEXT:    xvcvuxwsp v5, v5
+; CHECK-P8-NEXT:    xvcvuxwsp v2, v2
+; CHECK-P8-NEXT:    xvcvuxwsp v3, v3
+; CHECK-P8-NEXT:    xvcvuxwsp v4, v4
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    stvx v3, r3, r6
+; CHECK-P8-NEXT:    stvx v4, r3, r7
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    xvcvuxwsp vs3, vs3
+; CHECK-P9-NEXT:    xvcvuxwsp vs2, vs2
+; CHECK-P9-NEXT:    xvcvuxwsp vs1, vs1
+; CHECK-P9-NEXT:    xvcvuxwsp vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs1, 32(r3)
+; CHECK-P9-NEXT:    stxv vs2, 16(r3)
+; CHECK-P9-NEXT:    stxv vs3, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    xvcvuxwsp vs3, vs3
+; CHECK-BE-NEXT:    xvcvuxwsp vs2, vs2
+; CHECK-BE-NEXT:    xvcvuxwsp vs1, vs1
+; CHECK-BE-NEXT:    xvcvuxwsp vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs3, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i32>, <16 x i32>* %0, align 64
+  %1 = uitofp <16 x i32> %a to <16 x float>
+  store <16 x float> %1, <16 x float>* %agg.result, align 64
+  ret void
+}
+
+define i64 @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    mtvsrd f0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    xvcvsxwsp vs0, v2
+; CHECK-P8-NEXT:    xxswapd vs0, vs0
+; CHECK-P8-NEXT:    mfvsrd r3, f0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    mtvsrd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
+; CHECK-P9-NEXT:    xvcvsxwsp vs0, v2
+; CHECK-P9-NEXT:    mfvsrld r3, vs0
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrd f0, r3
+; CHECK-BE-NEXT:    xvcvsxwsp vs0, vs0
+; CHECK-BE-NEXT:    mfvsrd r3, f0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i64 %a.coerce to <2 x i32>
+  %1 = sitofp <2 x i32> %0 to <2 x float>
+  %2 = bitcast <2 x float> %1 to i64
+  ret i64 %2
+}
+
+define <4 x float> @test4elt_signed(<4 x i32> %a) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xvcvsxwsp v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xvcvsxwsp v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xvcvsxwsp v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = sitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %0
+}
+
+define void @test8elt_signed(<8 x float>* noalias nocapture sret %agg.result, <8 x i32>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lvx v3, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    xvcvsxwsp v3, v3
+; CHECK-P8-NEXT:    xvcvsxwsp v2, v2
+; CHECK-P8-NEXT:    stvx v3, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xvcvsxwsp vs1, vs1
+; CHECK-P9-NEXT:    xvcvsxwsp vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 16(r3)
+; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 16(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xvcvsxwsp vs1, vs1
+; CHECK-BE-NEXT:    xvcvsxwsp vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x i32>, <8 x i32>* %0, align 32
+  %1 = sitofp <8 x i32> %a to <8 x float>
+  store <8 x float> %1, <8 x float>* %agg.result, align 32
+  ret void
+}
+
+define void @test16elt_signed(<16 x float>* noalias nocapture sret %agg.result, <16 x i32>* nocapture readonly) local_unnamed_addr #2 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    lvx v5, 0, r4
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r6
+; CHECK-P8-NEXT:    lvx v4, r4, r7
+; CHECK-P8-NEXT:    xvcvsxwsp v5, v5
+; CHECK-P8-NEXT:    xvcvsxwsp v2, v2
+; CHECK-P8-NEXT:    xvcvsxwsp v3, v3
+; CHECK-P8-NEXT:    xvcvsxwsp v4, v4
+; CHECK-P8-NEXT:    stvx v5, 0, r3
+; CHECK-P8-NEXT:    stvx v2, r3, r5
+; CHECK-P8-NEXT:    stvx v3, r3, r6
+; CHECK-P8-NEXT:    stvx v4, r3, r7
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 32(r4)
+; CHECK-P9-NEXT:    lxv vs2, 16(r4)
+; CHECK-P9-NEXT:    lxv vs3, 0(r4)
+; CHECK-P9-NEXT:    xvcvsxwsp vs3, vs3
+; CHECK-P9-NEXT:    xvcvsxwsp vs2, vs2
+; CHECK-P9-NEXT:    xvcvsxwsp vs1, vs1
+; CHECK-P9-NEXT:    xvcvsxwsp vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
+; CHECK-P9-NEXT:    stxv vs1, 32(r3)
+; CHECK-P9-NEXT:    stxv vs2, 16(r3)
+; CHECK-P9-NEXT:    stxv vs3, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 32(r4)
+; CHECK-BE-NEXT:    lxv vs2, 16(r4)
+; CHECK-BE-NEXT:    lxv vs3, 0(r4)
+; CHECK-BE-NEXT:    xvcvsxwsp vs3, vs3
+; CHECK-BE-NEXT:    xvcvsxwsp vs2, vs2
+; CHECK-BE-NEXT:    xvcvsxwsp vs1, vs1
+; CHECK-BE-NEXT:    xvcvsxwsp vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 32(r3)
+; CHECK-BE-NEXT:    stxv vs2, 16(r3)
+; CHECK-BE-NEXT:    stxv vs3, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i32>, <16 x i32>* %0, align 64
+  %1 = sitofp <16 x i32> %a to <16 x float>
+  store <16 x float> %1, <16 x float>* %agg.result, align 64
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_conv_i_to_fp_8byte_elts.ll b/test/CodeGen/PowerPC/vec_conv_i_to_fp_8byte_elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5bf15e3ae4f22bcbdcbf97b7670e73be9da9e99f
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv_i_to_fp_8byte_elts.ll
@@ -0,0 +1,438 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define <2 x double> @test2elt(<2 x i64> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xvcvuxddp v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xvcvuxddp v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xvcvuxddp v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = uitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %0
+}
+
+define void @test4elt(<4 x double>* noalias nocapture sret %agg.result, <4 x i64>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    xvcvuxddp vs1, vs1
+; CHECK-P8-NEXT:    xvcvuxddp vs0, vs0
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv v2, 16(r4)
+; CHECK-P9-NEXT:    lxv v3, 0(r4)
+; CHECK-P9-NEXT:    xvcvuxddp vs0, v3
+; CHECK-P9-NEXT:    xvcvuxddp vs1, v2
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 16(r4)
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
+; CHECK-BE-NEXT:    xvcvuxddp vs0, v3
+; CHECK-BE-NEXT:    xvcvuxddp vs1, v2
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x i64>, <4 x i64>* %0, align 32
+  %1 = uitofp <4 x i64> %a to <4 x double>
+  store <4 x double> %1, <4 x double>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt(<8 x double>* noalias nocapture sret %agg.result, <8 x i64>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test8elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    xvcvuxddp vs3, vs3
+; CHECK-P8-NEXT:    xvcvuxddp vs0, vs0
+; CHECK-P8-NEXT:    xvcvuxddp vs1, vs1
+; CHECK-P8-NEXT:    xvcvuxddp vs2, vs2
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv v2, 48(r4)
+; CHECK-P9-NEXT:    lxv v3, 32(r4)
+; CHECK-P9-NEXT:    lxv v4, 16(r4)
+; CHECK-P9-NEXT:    lxv v5, 0(r4)
+; CHECK-P9-NEXT:    xvcvuxddp vs0, v5
+; CHECK-P9-NEXT:    xvcvuxddp vs1, v4
+; CHECK-P9-NEXT:    xvcvuxddp vs2, v3
+; CHECK-P9-NEXT:    xvcvuxddp vs3, v2
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 48(r4)
+; CHECK-BE-NEXT:    lxv v3, 32(r4)
+; CHECK-BE-NEXT:    lxv v4, 16(r4)
+; CHECK-BE-NEXT:    lxv v5, 0(r4)
+; CHECK-BE-NEXT:    xvcvuxddp vs0, v5
+; CHECK-BE-NEXT:    xvcvuxddp vs1, v4
+; CHECK-BE-NEXT:    xvcvuxddp vs2, v3
+; CHECK-BE-NEXT:    xvcvuxddp vs3, v2
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x i64>, <8 x i64>* %0, align 64
+  %1 = uitofp <8 x i64> %a to <8 x double>
+  store <8 x double> %1, <8 x double>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt(<16 x double>* noalias nocapture sret %agg.result, <16 x i64>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test16elt:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 64
+; CHECK-P8-NEXT:    li r8, 96
+; CHECK-P8-NEXT:    li r9, 112
+; CHECK-P8-NEXT:    li r10, 80
+; CHECK-P8-NEXT:    li r11, 48
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r8
+; CHECK-P8-NEXT:    lxvd2x vs4, r4, r9
+; CHECK-P8-NEXT:    lxvd2x vs5, r4, r10
+; CHECK-P8-NEXT:    lxvd2x vs6, r4, r11
+; CHECK-P8-NEXT:    lxvd2x vs7, 0, r4
+; CHECK-P8-NEXT:    xvcvuxddp vs0, vs0
+; CHECK-P8-NEXT:    xvcvuxddp vs1, vs1
+; CHECK-P8-NEXT:    xvcvuxddp vs2, vs2
+; CHECK-P8-NEXT:    xvcvuxddp vs3, vs3
+; CHECK-P8-NEXT:    xvcvuxddp vs4, vs4
+; CHECK-P8-NEXT:    xvcvuxddp vs5, vs5
+; CHECK-P8-NEXT:    xvcvuxddp vs6, vs6
+; CHECK-P8-NEXT:    xvcvuxddp vs7, vs7
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r9
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r8
+; CHECK-P8-NEXT:    stxvd2x vs5, r3, r10
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs6, r3, r11
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs7, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv v2, 48(r4)
+; CHECK-P9-NEXT:    lxv v3, 32(r4)
+; CHECK-P9-NEXT:    lxv v4, 16(r4)
+; CHECK-P9-NEXT:    lxv v5, 0(r4)
+; CHECK-P9-NEXT:    lxv v0, 112(r4)
+; CHECK-P9-NEXT:    lxv v1, 96(r4)
+; CHECK-P9-NEXT:    lxv v6, 80(r4)
+; CHECK-P9-NEXT:    lxv v7, 64(r4)
+; CHECK-P9-NEXT:    xvcvuxddp vs0, v5
+; CHECK-P9-NEXT:    xvcvuxddp vs1, v4
+; CHECK-P9-NEXT:    xvcvuxddp vs2, v3
+; CHECK-P9-NEXT:    xvcvuxddp vs3, v2
+; CHECK-P9-NEXT:    xvcvuxddp vs4, v7
+; CHECK-P9-NEXT:    xvcvuxddp vs5, v6
+; CHECK-P9-NEXT:    xvcvuxddp vs6, v1
+; CHECK-P9-NEXT:    xvcvuxddp vs7, v0
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs7, 112(r3)
+; CHECK-P9-NEXT:    stxv vs6, 96(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    stxv vs4, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 48(r4)
+; CHECK-BE-NEXT:    lxv v3, 32(r4)
+; CHECK-BE-NEXT:    lxv v4, 16(r4)
+; CHECK-BE-NEXT:    lxv v5, 0(r4)
+; CHECK-BE-NEXT:    lxv v0, 112(r4)
+; CHECK-BE-NEXT:    lxv v1, 96(r4)
+; CHECK-BE-NEXT:    lxv v6, 80(r4)
+; CHECK-BE-NEXT:    lxv v7, 64(r4)
+; CHECK-BE-NEXT:    xvcvuxddp vs0, v5
+; CHECK-BE-NEXT:    xvcvuxddp vs1, v4
+; CHECK-BE-NEXT:    xvcvuxddp vs2, v3
+; CHECK-BE-NEXT:    xvcvuxddp vs3, v2
+; CHECK-BE-NEXT:    xvcvuxddp vs4, v7
+; CHECK-BE-NEXT:    xvcvuxddp vs5, v6
+; CHECK-BE-NEXT:    xvcvuxddp vs6, v1
+; CHECK-BE-NEXT:    xvcvuxddp vs7, v0
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i64>, <16 x i64>* %0, align 128
+  %1 = uitofp <16 x i64> %a to <16 x double>
+  store <16 x double> %1, <16 x double>* %agg.result, align 128
+  ret void
+}
+
+define <2 x double> @test2elt_signed(<2 x i64> %a) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: test2elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    xvcvsxddp v2, v2
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    xvcvsxddp v2, v2
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test2elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xvcvsxddp v2, v2
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = sitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %0
+}
+
+define void @test4elt_signed(<4 x double>* noalias nocapture sret %agg.result, <4 x i64>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test4elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    lxvd2x vs1, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    xvcvsxddp vs1, vs1
+; CHECK-P8-NEXT:    xvcvsxddp vs0, vs0
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test4elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv v2, 16(r4)
+; CHECK-P9-NEXT:    lxv v3, 0(r4)
+; CHECK-P9-NEXT:    xvcvsxddp vs0, v3
+; CHECK-P9-NEXT:    xvcvsxddp vs1, v2
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test4elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 16(r4)
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
+; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs1, v2
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <4 x i64>, <4 x i64>* %0, align 32
+  %1 = sitofp <4 x i64> %a to <4 x double>
+  store <4 x double> %1, <4 x double>* %agg.result, align 32
+  ret void
+}
+
+define void @test8elt_signed(<8 x double>* noalias nocapture sret %agg.result, <8 x i64>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test8elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    lxvd2x vs3, 0, r4
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    xvcvsxddp vs3, vs3
+; CHECK-P8-NEXT:    xvcvsxddp vs0, vs0
+; CHECK-P8-NEXT:    xvcvsxddp vs1, vs1
+; CHECK-P8-NEXT:    xvcvsxddp vs2, vs2
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs3, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test8elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv v2, 48(r4)
+; CHECK-P9-NEXT:    lxv v3, 32(r4)
+; CHECK-P9-NEXT:    lxv v4, 16(r4)
+; CHECK-P9-NEXT:    lxv v5, 0(r4)
+; CHECK-P9-NEXT:    xvcvsxddp vs0, v5
+; CHECK-P9-NEXT:    xvcvsxddp vs1, v4
+; CHECK-P9-NEXT:    xvcvsxddp vs2, v3
+; CHECK-P9-NEXT:    xvcvsxddp vs3, v2
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test8elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 48(r4)
+; CHECK-BE-NEXT:    lxv v3, 32(r4)
+; CHECK-BE-NEXT:    lxv v4, 16(r4)
+; CHECK-BE-NEXT:    lxv v5, 0(r4)
+; CHECK-BE-NEXT:    xvcvsxddp vs0, v5
+; CHECK-BE-NEXT:    xvcvsxddp vs1, v4
+; CHECK-BE-NEXT:    xvcvsxddp vs2, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs3, v2
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <8 x i64>, <8 x i64>* %0, align 64
+  %1 = sitofp <8 x i64> %a to <8 x double>
+  store <8 x double> %1, <8 x double>* %agg.result, align 64
+  ret void
+}
+
+define void @test16elt_signed(<16 x double>* noalias nocapture sret %agg.result, <16 x i64>* nocapture readonly) local_unnamed_addr #1 {
+; CHECK-P8-LABEL: test16elt_signed:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    li r5, 16
+; CHECK-P8-NEXT:    li r6, 32
+; CHECK-P8-NEXT:    li r7, 64
+; CHECK-P8-NEXT:    li r8, 96
+; CHECK-P8-NEXT:    li r9, 112
+; CHECK-P8-NEXT:    li r10, 80
+; CHECK-P8-NEXT:    li r11, 48
+; CHECK-P8-NEXT:    lxvd2x vs0, r4, r5
+; CHECK-P8-NEXT:    lxvd2x vs1, r4, r6
+; CHECK-P8-NEXT:    lxvd2x vs2, r4, r7
+; CHECK-P8-NEXT:    lxvd2x vs3, r4, r8
+; CHECK-P8-NEXT:    lxvd2x vs4, r4, r9
+; CHECK-P8-NEXT:    lxvd2x vs5, r4, r10
+; CHECK-P8-NEXT:    lxvd2x vs6, r4, r11
+; CHECK-P8-NEXT:    lxvd2x vs7, 0, r4
+; CHECK-P8-NEXT:    xvcvsxddp vs0, vs0
+; CHECK-P8-NEXT:    xvcvsxddp vs1, vs1
+; CHECK-P8-NEXT:    xvcvsxddp vs2, vs2
+; CHECK-P8-NEXT:    xvcvsxddp vs3, vs3
+; CHECK-P8-NEXT:    xvcvsxddp vs4, vs4
+; CHECK-P8-NEXT:    xvcvsxddp vs5, vs5
+; CHECK-P8-NEXT:    xvcvsxddp vs6, vs6
+; CHECK-P8-NEXT:    xvcvsxddp vs7, vs7
+; CHECK-P8-NEXT:    stxvd2x vs4, r3, r9
+; CHECK-P8-NEXT:    stxvd2x vs3, r3, r8
+; CHECK-P8-NEXT:    stxvd2x vs5, r3, r10
+; CHECK-P8-NEXT:    stxvd2x vs2, r3, r7
+; CHECK-P8-NEXT:    stxvd2x vs6, r3, r11
+; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
+; CHECK-P8-NEXT:    stxvd2x vs0, r3, r5
+; CHECK-P8-NEXT:    stxvd2x vs7, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: test16elt_signed:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxv v2, 48(r4)
+; CHECK-P9-NEXT:    lxv v3, 32(r4)
+; CHECK-P9-NEXT:    lxv v4, 16(r4)
+; CHECK-P9-NEXT:    lxv v5, 0(r4)
+; CHECK-P9-NEXT:    lxv v0, 112(r4)
+; CHECK-P9-NEXT:    lxv v1, 96(r4)
+; CHECK-P9-NEXT:    lxv v6, 80(r4)
+; CHECK-P9-NEXT:    lxv v7, 64(r4)
+; CHECK-P9-NEXT:    xvcvsxddp vs0, v5
+; CHECK-P9-NEXT:    xvcvsxddp vs1, v4
+; CHECK-P9-NEXT:    xvcvsxddp vs2, v3
+; CHECK-P9-NEXT:    xvcvsxddp vs3, v2
+; CHECK-P9-NEXT:    xvcvsxddp vs4, v7
+; CHECK-P9-NEXT:    xvcvsxddp vs5, v6
+; CHECK-P9-NEXT:    xvcvsxddp vs6, v1
+; CHECK-P9-NEXT:    xvcvsxddp vs7, v0
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs2, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
+; CHECK-P9-NEXT:    stxv vs7, 112(r3)
+; CHECK-P9-NEXT:    stxv vs6, 96(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    stxv vs4, 64(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-BE-LABEL: test16elt_signed:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 48(r4)
+; CHECK-BE-NEXT:    lxv v3, 32(r4)
+; CHECK-BE-NEXT:    lxv v4, 16(r4)
+; CHECK-BE-NEXT:    lxv v5, 0(r4)
+; CHECK-BE-NEXT:    lxv v0, 112(r4)
+; CHECK-BE-NEXT:    lxv v1, 96(r4)
+; CHECK-BE-NEXT:    lxv v6, 80(r4)
+; CHECK-BE-NEXT:    lxv v7, 64(r4)
+; CHECK-BE-NEXT:    xvcvsxddp vs0, v5
+; CHECK-BE-NEXT:    xvcvsxddp vs1, v4
+; CHECK-BE-NEXT:    xvcvsxddp vs2, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs3, v2
+; CHECK-BE-NEXT:    xvcvsxddp vs4, v7
+; CHECK-BE-NEXT:    xvcvsxddp vs5, v6
+; CHECK-BE-NEXT:    xvcvsxddp vs6, v1
+; CHECK-BE-NEXT:    xvcvsxddp vs7, v0
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
+; CHECK-BE-NEXT:    blr
+entry:
+  %a = load <16 x i64>, <16 x i64>* %0, align 128
+  %1 = sitofp <16 x i64> %a to <16 x double>
+  store <16 x double> %1, <16 x double>* %agg.result, align 128
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_select.ll b/test/CodeGen/PowerPC/vec_select.ll
index cb2a12eab7d7a9b723d3e80de03e19701825a54e..3b4f390611d49a26286cd935a93bfc9f4c445692 100644
--- a/test/CodeGen/PowerPC/vec_select.ll
+++ b/test/CodeGen/PowerPC/vec_select.ll
@@ -1,7 +1,100 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s -check-prefix=CHECK-VSX
+; RUN: llc -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=-vsx | FileCheck %s -check-prefix=CHECK-NOVSX
+; RUN: llc -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s -mtriple=powerpc64le-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s -check-prefix=CHECK-VSX
+; RUN: llc -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s -mtriple=powerpc64le-linux-gnu -mcpu=pwr8 -mattr=-vsx | FileCheck %s -check-prefix=CHECK-NOVSX
 
-; CHECK: vsel_float
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
-  ret <4 x float> %vsel
+define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
+entry:
+  %m = fcmp oeq <4 x float> %c, %d
+  %v = select <4 x i1> %m, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %v
 }
+; CHECK-VSX-LABLE: test1
+; CHECK-VSX: xvcmpeqsp [[REG1:(vs|v)[0-9]+]], v4, v5
+; CHECK-VSX: xxsel v2, v3, v2, [[REG1]]
+; CHECK-VSX: blr
+
+; CHECK-NOVSX-LABLE: test1
+; CHECK-NOVSX: vcmpeqfp v[[REG1:[0-9]+]], v4, v5
+; CHECK-NOVSX: vsel v2, v3, v2, v[[REG1]]
+; CHECK-NOVSX: blr
+
+define <2 x double> @test2(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
+entry:
+  %m = fcmp oeq <2 x double> %c, %d
+  %v = select <2 x i1> %m, <2 x double> %a, <2 x double> %b
+  ret <2 x double> %v
+}
+; CHECK-VSX-LABLE: test2
+; CHECK-VSX: xvcmpeqdp [[REG1:(vs|v)[0-9]+]], v4, v5
+; CHECK-VSX: xxsel v2, v3, v2, [[REG1]]
+; CHECK-VSX: blr
+
+; CHECK-NOVSX-LABLE: test2
+; CHECK-NOVSX: fcmp
+; CHECK-NOVSX: fcmp
+; CHECK-NOVSX: blr
+
+define <16 x i8> @test3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+entry:
+  %m = icmp eq <16 x i8> %c, %d
+  %v = select <16 x i1> %m, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %v
+}
+; CHECK-VSX-LABLE: test3
+; CHECK-VSX: vcmpequb v[[REG1:[0-9]+]], v4, v5
+; CHECK-VSX: xxsel v2, v3, v2, v[[REG1]]
+; CHECK-VSX: blr
+
+; CHECK-NOVSX-LABLE: test3
+; CHECK-NOVSX: vcmpequb v[[REG1:[0-9]+]], v4, v5
+; CHECK-NOVSX: vsel v2, v3, v2, v[[REG1]]
+; CHECK-NOVSX: blr
+
+define <8 x i16> @test4(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
+entry:
+  %m = icmp eq <8 x i16> %c, %d
+  %v = select <8 x i1> %m, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %v
+}
+; CHECK-VSX-LABLE: test4
+; CHECK-VSX: vcmpequh v[[REG1:[0-9]+]], v4, v5
+; CHECK-VSX: xxsel v2, v3, v2, v[[REG1]]
+; CHECK-VSX: blr
+
+; CHECK-NOVSX-LABLE: test4
+; CHECK-NOVSX: vcmpequh v[[REG1:[0-9]+]], v4, v5
+; CHECK-NOVSX: vsel v2, v3, v2, v[[REG1]]
+; CHECK-NOVSX: blr
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
+entry:
+  %m = icmp eq <4 x i32> %c, %d
+  %v = select <4 x i1> %m, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %v
+}
+; CHECK-VSX-LABLE: test5
+; CHECK-VSX: vcmpequw v[[REG1:[0-9]+]], v4, v5
+; CHECK-VSX: xxsel v2, v3, v2, v[[REG1]]
+; CHECK-VSX: blr
+
+; CHECK-NOVSX-LABLE: test5
+; CHECK-NOVSX: vcmpequw v[[REG1:[0-9]+]], v4, v5
+; CHECK-NOVSX: vsel v2, v3, v2, v[[REG1]]
+; CHECK-NOVSX: blr
+
+define <2 x i64> @test6(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
+entry:
+  %m = icmp eq <2 x i64> %c, %d
+  %v = select <2 x i1> %m, <2 x i64> %a, <2 x i64> %b
+  ret <2 x i64> %v
+}
+; CHECK-VSX-LABLE: test6
+; CHECK-VSX: vcmpequd v[[REG1:[0-9]+]], v4, v5
+; CHECK-VSX: xxsel v2, v3, v2, v[[REG1]]
+; CHECK-VSX: blr
+
+; CHECK-NOVSX-LABLE: test6
+; CHECK-NOVSX: vcmpequd v[[REG1:[0-9]+]], v4, v5
+; CHECK-NOVSX: vsel v2, v3, v2, v[[REG1]]
+; CHECK-NOVSX: blr
diff --git a/test/CodeGen/PowerPC/vsel-prom.ll b/test/CodeGen/PowerPC/vsel-prom.ll
index dd219ec0da6f69c7bde8626dde1f69a0c25a3d17..79d1d83209cf7dab8c66cca6e85e40f9f95dd6b7 100644
--- a/test/CodeGen/PowerPC/vsel-prom.ll
+++ b/test/CodeGen/PowerPC/vsel-prom.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 < %s -verify-machineinstrs | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll b/test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll
index 06636f24f97ca0d5425352854a32f5d21c0c2671..ed6b49dd7f8b86eb754124ee98cbe733f14b75e9 100644
--- a/test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/vsx_scalar_ld_st.ll b/test/CodeGen/PowerPC/vsx_scalar_ld_st.ll
index 4d45e6e42eb0aef38b75e53b835f3973784bb832..d09b2146e7269a012010e4bcd4f54e099298c2e4 100644
--- a/test/CodeGen/PowerPC/vsx_scalar_ld_st.ll
+++ b/test/CodeGen/PowerPC/vsx_scalar_ld_st.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr8 -mattr=-direct-move | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: llc -relocation-model=pic -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr8 -mattr=-direct-move | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr9 -mattr=-direct-move | FileCheck %s -check-prefix=CHECK-P9
diff --git a/test/CodeGen/PowerPC/xray-ret-is-terminator.ll b/test/CodeGen/PowerPC/xray-ret-is-terminator.ll
index 7e63530912aa2154a7bbf75b2b1ac57485efd59d..7828f228c8d8194efa79c35d4b2d025d4f8a2d26 100644
--- a/test/CodeGen/PowerPC/xray-ret-is-terminator.ll
+++ b/test/CodeGen/PowerPC/xray-ret-is-terminator.ll
@@ -1,4 +1,4 @@
-; RUN: llc -o - -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
 
 define void @ILLBeBack() #0 {
 ; CHECK-LABEL @ILLBeBack
diff --git a/test/CodeGen/PowerPC/xray-tail-call-sled.ll b/test/CodeGen/PowerPC/xray-tail-call-sled.ll
index e8fe9dbaa45f639c4e4faf83cc42596aa2831973..90a928a90d07b341fcba3fd410750c4bae0d2b10 100644
--- a/test/CodeGen/PowerPC/xray-tail-call-sled.ll
+++ b/test/CodeGen/PowerPC/xray-tail-call-sled.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=asm -o - -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -filetype=asm -relocation-model=pic -o - -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
 
 define i32 @callee() nounwind noinline uwtable "function-instrument"="xray-always" {
 ; CHECK-LABEL: .Ltmp0:
diff --git a/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll b/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll
index fd8adff5a1d795decbb604f8be1554c1d155a493..263a7590cf99c8fdbfbb448199b2993af53bc90b 100644
--- a/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll
+++ b/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
 
diff --git a/test/CodeGen/RISCV/alu16.ll b/test/CodeGen/RISCV/alu16.ll
index 79e74ffc8a5c21df6c6fd8107a4be2f39d440170..15bf44e526e76fe55e687df49d7a6cc8671fe9de 100644
--- a/test/CodeGen/RISCV/alu16.ll
+++ b/test/CodeGen/RISCV/alu16.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
 
 ; These tests are identical to those in alu32.ll but operate on i16. They check
 ; that legalisation of these non-native types doesn't introduce unnecessary
@@ -11,6 +13,11 @@ define i16 @addi(i16 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: addi:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    ret
   %1 = add i16 %a, 1
   ret i16 %1
 }
@@ -22,6 +29,13 @@ define i16 @slti(i16 %a) nounwind {
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slti a0, a0, 2
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slti:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a0, a0, 48
+; RV64I-NEXT:    slti a0, a0, 2
+; RV64I-NEXT:    ret
   %1 = icmp slt i16 %a, 2
   %2 = zext i1 %1 to i16
   ret i16 %2
@@ -35,6 +49,14 @@ define i16 @sltiu(i16 %a) nounwind {
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    sltiu a0, a0, 3
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sltiu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a1, 16
+; RV64I-NEXT:    addiw a1, a1, -1
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    sltiu a0, a0, 3
+; RV64I-NEXT:    ret
   %1 = icmp ult i16 %a, 3
   %2 = zext i1 %1 to i16
   ret i16 %2
@@ -45,6 +67,11 @@ define i16 @xori(i16 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xori a0, a0, 4
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: xori:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    xori a0, a0, 4
+; RV64I-NEXT:    ret
   %1 = xor i16 %a, 4
   ret i16 %1
 }
@@ -54,6 +81,11 @@ define i16 @ori(i16 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ori a0, a0, 5
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ori:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ori a0, a0, 5
+; RV64I-NEXT:    ret
   %1 = or i16 %a, 5
   ret i16 %1
 }
@@ -63,6 +95,11 @@ define i16 @andi(i16 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a0, a0, 6
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: andi:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 6
+; RV64I-NEXT:    ret
   %1 = and i16 %a, 6
   ret i16 %1
 }
@@ -72,6 +109,11 @@ define i16 @slli(i16 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 7
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slli:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 7
+; RV64I-NEXT:    ret
   %1 = shl i16 %a, 7
   ret i16 %1
 }
@@ -84,6 +126,14 @@ define i16 @srli(i16 %a) nounwind {
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    srli a0, a0, 6
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srli:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a1, 16
+; RV64I-NEXT:    addiw a1, a1, -64
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a0, a0, 6
+; RV64I-NEXT:    ret
   %1 = lshr i16 %a, 6
   ret i16 %1
 }
@@ -94,6 +144,12 @@ define i16 @srai(i16 %a) nounwind {
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    srai a0, a0, 25
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srai:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a0, a0, 57
+; RV64I-NEXT:    ret
   %1 = ashr i16 %a, 9
   ret i16 %1
 }
@@ -104,6 +160,11 @@ define i16 @add(i16 %a, i16 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = add i16 %a, %b
   ret i16 %1
 }
@@ -113,6 +174,11 @@ define i16 @sub(i16 %a, i16 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sub:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = sub i16 %a, %b
   ret i16 %1
 }
@@ -122,6 +188,11 @@ define i16 @sll(i16 %a, i16 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sll:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = shl i16 %a, %b
   ret i16 %1
 }
@@ -135,6 +206,15 @@ define i16 @slt(i16 %a, i16 %b) nounwind {
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slt a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slt:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a0, a0, 48
+; RV64I-NEXT:    slt a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = icmp slt i16 %a, %b
   %2 = zext i1 %1 to i16
   ret i16 %2
@@ -149,6 +229,15 @@ define i16 @sltu(i16 %a, i16 %b) nounwind {
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    sltu a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sltu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addiw a2, a2, -1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = icmp ult i16 %a, %b
   %2 = zext i1 %1 to i16
   ret i16 %2
@@ -159,6 +248,11 @@ define i16 @xor(i16 %a, i16 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: xor:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    xor a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = xor i16 %a, %b
   ret i16 %1
 }
@@ -171,6 +265,14 @@ define i16 @srl(i16 %a, i16 %b) nounwind {
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srl:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addiw a2, a2, -1
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = lshr i16 %a, %b
   ret i16 %1
 }
@@ -182,6 +284,13 @@ define i16 @sra(i16 %a, i16 %b) nounwind {
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sra:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a0, a0, 48
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = ashr i16 %a, %b
   ret i16 %1
 }
@@ -191,6 +300,11 @@ define i16 @or(i16 %a, i16 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: or:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = or i16 %a, %b
   ret i16 %1
 }
@@ -200,6 +314,11 @@ define i16 @and(i16 %a, i16 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: and:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = and i16 %a, %b
   ret i16 %1
 }
diff --git a/test/CodeGen/RISCV/alu32.ll b/test/CodeGen/RISCV/alu32.ll
index 6ecd08878dd6cb25d0fe50cf6688404fcc4c45f7..3776e53c30692af3363c5d9ae27b6f496be103e5 100644
--- a/test/CodeGen/RISCV/alu32.ll
+++ b/test/CodeGen/RISCV/alu32.ll
@@ -1,18 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
 
-; These tests are each targeted at a particular RISC-V ALU instruction. Other
-; files in this folder exercise LLVM IR instructions that don't directly match a
-; RISC-V instruction
+; These tests are each targeted at a particular RISC-V ALU instruction. Most
+; other files in this folder exercise LLVM IR instructions that don't directly
+; match a RISC-V instruction.
 
-; Register-immediate instructions
+; Register-immediate instructions.
+
+; TODO: Sign-extension would also work when promoting the operands of
+; sltu/sltiu on RV64 and is cheaper than zero-extension (1 instruction vs 2).
 
 define i32 @addi(i32 %a) nounwind {
 ; RV32I-LABEL: addi:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: addi:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    ret
   %1 = add i32 %a, 1
   ret i32 %1
 }
@@ -22,6 +32,12 @@ define i32 @slti(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slti a0, a0, 2
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slti:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    slti a0, a0, 2
+; RV64I-NEXT:    ret
   %1 = icmp slt i32 %a, 2
   %2 = zext i1 %1 to i32
   ret i32 %2
@@ -32,6 +48,12 @@ define i32 @sltiu(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltiu a0, a0, 3
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sltiu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sltiu a0, a0, 3
+; RV64I-NEXT:    ret
   %1 = icmp ult i32 %a, 3
   %2 = zext i1 %1 to i32
   ret i32 %2
@@ -42,6 +64,11 @@ define i32 @xori(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xori a0, a0, 4
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: xori:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    xori a0, a0, 4
+; RV64I-NEXT:    ret
   %1 = xor i32 %a, 4
   ret i32 %1
 }
@@ -51,6 +78,11 @@ define i32 @ori(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ori a0, a0, 5
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ori:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ori a0, a0, 5
+; RV64I-NEXT:    ret
   %1 = or i32 %a, 5
   ret i32 %1
 }
@@ -60,6 +92,11 @@ define i32 @andi(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a0, a0, 6
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: andi:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 6
+; RV64I-NEXT:    ret
   %1 = and i32 %a, 6
   ret i32 %1
 }
@@ -69,6 +106,11 @@ define i32 @slli(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 7
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slli:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 7
+; RV64I-NEXT:    ret
   %1 = shl i32 %a, 7
   ret i32 %1
 }
@@ -78,6 +120,11 @@ define i32 @srli(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srli:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a0, a0, 8
+; RV64I-NEXT:    ret
   %1 = lshr i32 %a, 8
   ret i32 %1
 }
@@ -87,6 +134,11 @@ define i32 @srai(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a0, a0, 9
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srai:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sraiw a0, a0, 9
+; RV64I-NEXT:    ret
   %1 = ashr i32 %a, 9
   ret i32 %1
 }
@@ -98,6 +150,11 @@ define i32 @add(i32 %a, i32 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = add i32 %a, %b
   ret i32 %1
 }
@@ -107,6 +164,11 @@ define i32 @sub(i32 %a, i32 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sub:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = sub i32 %a, %b
   ret i32 %1
 }
@@ -116,6 +178,11 @@ define i32 @sll(i32 %a, i32 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sll:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = shl i32 %a, %b
   ret i32 %1
 }
@@ -125,6 +192,13 @@ define i32 @slt(i32 %a, i32 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slt a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slt:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    slt a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = icmp slt i32 %a, %b
   %2 = zext i1 %1 to i32
   ret i32 %2
@@ -135,6 +209,13 @@ define i32 @sltu(i32 %a, i32 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sltu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = icmp ult i32 %a, %b
   %2 = zext i1 %1 to i32
   ret i32 %2
@@ -145,24 +226,46 @@ define i32 @xor(i32 %a, i32 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: xor:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    xor a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = xor i32 %a, %b
   ret i32 %1
 }
 
+; TODO: should select srlw for RV64.
+
 define i32 @srl(i32 %a, i32 %b) nounwind {
 ; RV32I-LABEL: srl:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srl:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = lshr i32 %a, %b
   ret i32 %1
 }
 
+; TODO: should select sraw for RV64.
+
 define i32 @sra(i32 %a, i32 %b) nounwind {
 ; RV32I-LABEL: sra:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sra:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = ashr i32 %a, %b
   ret i32 %1
 }
@@ -172,6 +275,11 @@ define i32 @or(i32 %a, i32 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: or:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = or i32 %a, %b
   ret i32 %1
 }
@@ -181,6 +289,11 @@ define i32 @and(i32 %a, i32 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: and:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = and i32 %a, %b
   ret i32 %1
 }
diff --git a/test/CodeGen/RISCV/alu64.ll b/test/CodeGen/RISCV/alu64.ll
new file mode 100644
index 0000000000000000000000000000000000000000..021211b6f365d3301afd95adcc0c5cbeffe1ecbb
--- /dev/null
+++ b/test/CodeGen/RISCV/alu64.ll
@@ -0,0 +1,499 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32I
+
+; These tests are each targeted at a particular RISC-V ALU instruction. Other
+; files in this folder exercise LLVM IR instructions that don't directly match a
+; RISC-V instruction. This file contains tests for the instructions common
+; between RV32I and RV64I as well as the *W instructions introduced in RV64I.
+
+; Register-immediate instructions
+
+define i64 @addi(i64 %a) nounwind {
+; RV64I-LABEL: addi:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: addi:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a2, a0, 1
+; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    add a1, a1, a0
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+  %1 = add i64 %a, 1
+  ret i64 %1
+}
+
+define i64 @slti(i64 %a) nounwind {
+; RV64I-LABEL: slti:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slti a0, a0, 2
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: slti:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beqz a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slti a0, a1, 0
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB1_2:
+; RV32I-NEXT:    sltiu a0, a0, 2
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+  %1 = icmp slt i64 %a, 2
+  %2 = zext i1 %1 to i64
+  ret i64 %2
+}
+
+define i64 @sltiu(i64 %a) nounwind {
+; RV64I-LABEL: sltiu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sltiu a0, a0, 3
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sltiu:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beqz a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a0, zero
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB2_2:
+; RV32I-NEXT:    sltiu a0, a0, 3
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+  %1 = icmp ult i64 %a, 3
+  %2 = zext i1 %1 to i64
+  ret i64 %2
+}
+
+define i64 @xori(i64 %a) nounwind {
+; RV64I-LABEL: xori:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    xori a0, a0, 4
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: xori:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xori a0, a0, 4
+; RV32I-NEXT:    ret
+  %1 = xor i64 %a, 4
+  ret i64 %1
+}
+
+define i64 @ori(i64 %a) nounwind {
+; RV64I-LABEL: ori:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ori a0, a0, 5
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: ori:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ori a0, a0, 5
+; RV32I-NEXT:    ret
+  %1 = or i64 %a, 5
+  ret i64 %1
+}
+
+define i64 @andi(i64 %a) nounwind {
+; RV64I-LABEL: andi:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 6
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: andi:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a0, a0, 6
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+  %1 = and i64 %a, 6
+  ret i64 %1
+}
+
+define i64 @slli(i64 %a) nounwind {
+; RV64I-LABEL: slli:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 7
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: slli:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a1, 7
+; RV32I-NEXT:    srli a2, a0, 25
+; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    slli a0, a0, 7
+; RV32I-NEXT:    ret
+  %1 = shl i64 %a, 7
+  ret i64 %1
+}
+
+define i64 @srli(i64 %a) nounwind {
+; RV64I-LABEL: srli:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: srli:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    slli a2, a1, 24
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    ret
+  %1 = lshr i64 %a, 8
+  ret i64 %1
+}
+
+define i64 @srai(i64 %a) nounwind {
+; RV64I-LABEL: srai:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srai a0, a0, 9
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: srai:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a0, a0, 9
+; RV32I-NEXT:    slli a2, a1, 23
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srai a1, a1, 9
+; RV32I-NEXT:    ret
+  %1 = ashr i64 %a, 9
+  ret i64 %1
+}
+
+; Register-register instructions
+
+define i64 @add(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    add a2, a0, a2
+; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    add a1, a1, a0
+; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    ret
+  %1 = add i64 %a, %b
+  ret i64 %1
+}
+
+define i64 @sub(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: sub:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sub:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sltu a3, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+  %1 = sub i64 %a, %b
+  ret i64 %1
+}
+
+define i64 @sll(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: sll:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sll:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp)
+; RV32I-NEXT:    call __ashldi3
+; RV32I-NEXT:    lw ra, 12(sp)
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+  %1 = shl i64 %a, %b
+  ret i64 %1
+}
+
+define i64 @slt(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: slt:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slt a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: slt:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beq a1, a3, .LBB12_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a0, a1, a3
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB12_2:
+; RV32I-NEXT:    sltu a0, a0, a2
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+  %1 = icmp slt i64 %a, %b
+  %2 = zext i1 %1 to i64
+  ret i64 %2
+}
+
+define i64 @sltu(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: sltu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sltu:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beq a1, a3, .LBB13_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a0, a1, a3
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB13_2:
+; RV32I-NEXT:    sltu a0, a0, a2
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+  %1 = icmp ult i64 %a, %b
+  %2 = zext i1 %1 to i64
+  ret i64 %2
+}
+
+define i64 @xor(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: xor:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    xor a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: xor:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xor a0, a0, a2
+; RV32I-NEXT:    xor a1, a1, a3
+; RV32I-NEXT:    ret
+  %1 = xor i64 %a, %b
+  ret i64 %1
+}
+
+define i64 @srl(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: srl:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: srl:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp)
+; RV32I-NEXT:    call __lshrdi3
+; RV32I-NEXT:    lw ra, 12(sp)
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+  %1 = lshr i64 %a, %b
+  ret i64 %1
+}
+
+define i64 @sra(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: sra:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sra:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp)
+; RV32I-NEXT:    call __ashrdi3
+; RV32I-NEXT:    lw ra, 12(sp)
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+  %1 = ashr i64 %a, %b
+  ret i64 %1
+}
+
+define i64 @or(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: or:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: or:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    ret
+  %1 = or i64 %a, %b
+  ret i64 %1
+}
+
+define i64 @and(i64 %a, i64 %b) nounwind {
+; RV64I-LABEL: and:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: and:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+  %1 = and i64 %a, %b
+  ret i64 %1
+}
+
+; RV64I-only instructions
+
+define signext i32 @addiw(i32 signext %a) {
+; RV64I-LABEL: addiw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addiw a0, a0, 123
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: addiw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a0, a0, 123
+; RV32I-NEXT:    ret
+  %1 = add i32 %a, 123
+  ret i32 %1
+}
+
+define signext i32 @slliw(i32 signext %a) {
+; RV64I-LABEL: slliw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slliw a0, a0, 17
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: slliw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a0, a0, 17
+; RV32I-NEXT:    ret
+  %1 = shl i32 %a, 17
+  ret i32 %1
+}
+
+define signext i32 @srliw(i32 %a) {
+; RV64I-LABEL: srliw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a0, a0, 8
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: srliw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    ret
+  %1 = lshr i32 %a, 8
+  ret i32 %1
+}
+
+define signext i32 @sraiw(i32 %a) {
+; RV64I-LABEL: sraiw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sraiw a0, a0, 9
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sraiw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srai a0, a0, 9
+; RV32I-NEXT:    ret
+  %1 = ashr i32 %a, 9
+  ret i32 %1
+}
+
+define signext i32 @sextw(i32 zeroext %a) {
+; RV64I-LABEL: sextw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sextw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+  ret i32 %a
+}
+
+define signext i32 @addw(i32 signext %a, i32 signext %b) {
+; RV64I-LABEL: addw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: addw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @subw(i32 signext %a, i32 signext %b) {
+; RV64I-LABEL: subw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: subw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+; TODO: should select sllw for RV64.
+
+define signext i32 @sllw(i32 signext %a, i32 zeroext %b) {
+; RV64I-LABEL: sllw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sllw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+; TODO: should select srlw for RV64.
+
+define signext i32 @srlw(i32 signext %a, i32 zeroext %b) {
+; RV64I-LABEL: srlw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: srlw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+; TODO: should select sraw for RV64.
+
+define signext i32 @sraw(i64 %a, i32 zeroext %b) {
+; RV64I-LABEL: sraw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: sraw:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sra a0, a0, a2
+; RV32I-NEXT:    ret
+  %1 = trunc i64 %a to i32
+  %2 = ashr i32 %1, %b
+  ret i32 %2
+}
diff --git a/test/CodeGen/RISCV/alu8.ll b/test/CodeGen/RISCV/alu8.ll
index ad97e6203196a8865b2dbea2dc45f12cb2a41ba3..ed09174745b88357496d465ad17569bb70fd67a2 100644
--- a/test/CodeGen/RISCV/alu8.ll
+++ b/test/CodeGen/RISCV/alu8.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
 
 ; These tests are identical to those in alu32.ll but operate on i8. They check
 ; that legalisation of these non-native types doesn't introduce unnecessary
@@ -11,6 +13,11 @@ define i8 @addi(i8 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: addi:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    ret
   %1 = add i8 %a, 1
   ret i8 %1
 }
@@ -22,6 +29,13 @@ define i8 @slti(i8 %a) nounwind {
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slti a0, a0, 2
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slti:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a0, a0, 56
+; RV64I-NEXT:    slti a0, a0, 2
+; RV64I-NEXT:    ret
   %1 = icmp slt i8 %a, 2
   %2 = zext i1 %1 to i8
   ret i8 %2
@@ -33,6 +47,12 @@ define i8 @sltiu(i8 %a) nounwind {
 ; RV32I-NEXT:    andi a0, a0, 255
 ; RV32I-NEXT:    sltiu a0, a0, 3
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sltiu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    sltiu a0, a0, 3
+; RV64I-NEXT:    ret
   %1 = icmp ult i8 %a, 3
   %2 = zext i1 %1 to i8
   ret i8 %2
@@ -43,6 +63,11 @@ define i8 @xori(i8 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xori a0, a0, 4
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: xori:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    xori a0, a0, 4
+; RV64I-NEXT:    ret
   %1 = xor i8 %a, 4
   ret i8 %1
 }
@@ -52,6 +77,11 @@ define i8 @ori(i8 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ori a0, a0, 5
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ori:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ori a0, a0, 5
+; RV64I-NEXT:    ret
   %1 = or i8 %a, 5
   ret i8 %1
 }
@@ -61,6 +91,11 @@ define i8 @andi(i8 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a0, a0, 6
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: andi:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 6
+; RV64I-NEXT:    ret
   %1 = and i8 %a, 6
   ret i8 %1
 }
@@ -70,6 +105,11 @@ define i8 @slli(i8 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 7
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slli:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 7
+; RV64I-NEXT:    ret
   %1 = shl i8 %a, 7
   ret i8 %1
 }
@@ -80,6 +120,12 @@ define i8 @srli(i8 %a) nounwind {
 ; RV32I-NEXT:    andi a0, a0, 192
 ; RV32I-NEXT:    srli a0, a0, 6
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srli:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 192
+; RV64I-NEXT:    srli a0, a0, 6
+; RV64I-NEXT:    ret
   %1 = lshr i8 %a, 6
   ret i8 %1
 }
@@ -90,6 +136,12 @@ define i8 @srai(i8 %a) nounwind {
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    srai a0, a0, 29
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srai:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a0, a0, 61
+; RV64I-NEXT:    ret
   %1 = ashr i8 %a, 5
   ret i8 %1
 }
@@ -100,6 +152,11 @@ define i8 @add(i8 %a, i8 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = add i8 %a, %b
   ret i8 %1
 }
@@ -109,6 +166,11 @@ define i8 @sub(i8 %a, i8 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sub:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = sub i8 %a, %b
   ret i8 %1
 }
@@ -118,6 +180,11 @@ define i8 @sll(i8 %a, i8 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sll:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = shl i8 %a, %b
   ret i8 %1
 }
@@ -131,6 +198,15 @@ define i8 @slt(i8 %a, i8 %b) nounwind {
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slt a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: slt:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a0, a0, 56
+; RV64I-NEXT:    slt a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = icmp slt i8 %a, %b
   %2 = zext i1 %1 to i8
   ret i8 %2
@@ -143,6 +219,13 @@ define i8 @sltu(i8 %a, i8 %b) nounwind {
 ; RV32I-NEXT:    andi a0, a0, 255
 ; RV32I-NEXT:    sltu a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sltu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a1, a1, 255
+; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = icmp ult i8 %a, %b
   %2 = zext i1 %1 to i8
   ret i8 %2
@@ -153,6 +236,11 @@ define i8 @xor(i8 %a, i8 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: xor:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    xor a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = xor i8 %a, %b
   ret i8 %1
 }
@@ -163,6 +251,12 @@ define i8 @srl(i8 %a, i8 %b) nounwind {
 ; RV32I-NEXT:    andi a0, a0, 255
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: srl:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = lshr i8 %a, %b
   ret i8 %1
 }
@@ -174,6 +268,13 @@ define i8 @sra(i8 %a, i8 %b) nounwind {
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sra:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a0, a0, 56
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = ashr i8 %a, %b
   ret i8 %1
 }
@@ -183,6 +284,11 @@ define i8 @or(i8 %a, i8 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: or:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = or i8 %a, %b
   ret i8 %1
 }
@@ -192,6 +298,11 @@ define i8 @and(i8 %a, i8 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: and:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = and i8 %a, %b
   ret i8 %1
 }
diff --git a/test/CodeGen/RISCV/atomic-cmpxchg.ll b/test/CodeGen/RISCV/atomic-cmpxchg.ll
index fafc8f7755f56f603201847bce17aa4ccb547b1a..85cd1691366401f77b38420eafae92c27f345118 100644
--- a/test/CodeGen/RISCV/atomic-cmpxchg.ll
+++ b/test/CodeGen/RISCV/atomic-cmpxchg.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32I %s
+; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32IA %s
 
 define void @cmpxchg_i8_monotonic_monotonic(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-LABEL: cmpxchg_i8_monotonic_monotonic:
@@ -15,6 +17,30 @@ define void @cmpxchg_i8_monotonic_monotonic(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_monotonic_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB0_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB0_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB0_1
+; RV32IA-NEXT:  .LBB0_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val monotonic monotonic
   ret void
 }
@@ -32,6 +58,30 @@ define void @cmpxchg_i8_acquire_monotonic(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_acquire_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB1_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB1_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB1_1
+; RV32IA-NEXT:  .LBB1_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acquire monotonic
   ret void
 }
@@ -49,6 +99,30 @@ define void @cmpxchg_i8_acquire_acquire(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_acquire_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB2_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB2_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB2_1
+; RV32IA-NEXT:  .LBB2_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acquire acquire
   ret void
 }
@@ -66,6 +140,30 @@ define void @cmpxchg_i8_release_monotonic(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_release_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB3_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w.rl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB3_1
+; RV32IA-NEXT:  .LBB3_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release monotonic
   ret void
 }
@@ -83,6 +181,30 @@ define void @cmpxchg_i8_release_acquire(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_release_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB4_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB4_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w.rl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB4_1
+; RV32IA-NEXT:  .LBB4_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release acquire
   ret void
 }
@@ -100,6 +222,30 @@ define void @cmpxchg_i8_acq_rel_monotonic(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_acq_rel_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB5_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB5_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w.rl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB5_1
+; RV32IA-NEXT:  .LBB5_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acq_rel monotonic
   ret void
 }
@@ -117,6 +263,30 @@ define void @cmpxchg_i8_acq_rel_acquire(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_acq_rel_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB6_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB6_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w.rl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB6_1
+; RV32IA-NEXT:  .LBB6_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acq_rel acquire
   ret void
 }
@@ -134,6 +304,30 @@ define void @cmpxchg_i8_seq_cst_monotonic(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_seq_cst_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB7_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB7_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w.aqrl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB7_1
+; RV32IA-NEXT:  .LBB7_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst monotonic
   ret void
 }
@@ -151,6 +345,30 @@ define void @cmpxchg_i8_seq_cst_acquire(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_seq_cst_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB8_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB8_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w.aqrl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB8_1
+; RV32IA-NEXT:  .LBB8_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst acquire
   ret void
 }
@@ -168,6 +386,30 @@ define void @cmpxchg_i8_seq_cst_seq_cst(i8* %ptr, i8 %cmp, i8 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i8_seq_cst_seq_cst:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    andi a3, a3, 24
+; RV32IA-NEXT:    addi a4, zero, 255
+; RV32IA-NEXT:    sll a4, a4, a3
+; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a2, a2, a3
+; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a1, a1, a3
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a3, (a0)
+; RV32IA-NEXT:    and a5, a3, a4
+; RV32IA-NEXT:    bne a5, a1, .LBB9_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB9_1 Depth=1
+; RV32IA-NEXT:    xor a5, a3, a2
+; RV32IA-NEXT:    and a5, a5, a4
+; RV32IA-NEXT:    xor a5, a3, a5
+; RV32IA-NEXT:    sc.w.aqrl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB9_1
+; RV32IA-NEXT:  .LBB9_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst seq_cst
   ret void
 }
@@ -185,6 +427,31 @@ define void @cmpxchg_i16_monotonic_monotonic(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_monotonic_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB10_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB10_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB10_1
+; RV32IA-NEXT:  .LBB10_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val monotonic monotonic
   ret void
 }
@@ -202,6 +469,31 @@ define void @cmpxchg_i16_acquire_monotonic(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_acquire_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB11_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB11_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB11_1
+; RV32IA-NEXT:  .LBB11_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acquire monotonic
   ret void
 }
@@ -219,6 +511,31 @@ define void @cmpxchg_i16_acquire_acquire(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_acquire_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB12_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB12_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB12_1
+; RV32IA-NEXT:  .LBB12_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acquire acquire
   ret void
 }
@@ -236,6 +553,31 @@ define void @cmpxchg_i16_release_monotonic(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_release_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB13_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB13_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w.rl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB13_1
+; RV32IA-NEXT:  .LBB13_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release monotonic
   ret void
 }
@@ -253,6 +595,31 @@ define void @cmpxchg_i16_release_acquire(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_release_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB14_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB14_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w.rl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB14_1
+; RV32IA-NEXT:  .LBB14_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release acquire
   ret void
 }
@@ -270,6 +637,31 @@ define void @cmpxchg_i16_acq_rel_monotonic(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_acq_rel_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB15_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB15_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w.rl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB15_1
+; RV32IA-NEXT:  .LBB15_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acq_rel monotonic
   ret void
 }
@@ -287,6 +679,31 @@ define void @cmpxchg_i16_acq_rel_acquire(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_acq_rel_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB16_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB16_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w.rl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB16_1
+; RV32IA-NEXT:  .LBB16_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acq_rel acquire
   ret void
 }
@@ -304,6 +721,31 @@ define void @cmpxchg_i16_seq_cst_monotonic(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_seq_cst_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB17_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB17_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w.aqrl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB17_1
+; RV32IA-NEXT:  .LBB17_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst monotonic
   ret void
 }
@@ -321,6 +763,31 @@ define void @cmpxchg_i16_seq_cst_acquire(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_seq_cst_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB18_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB18_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w.aqrl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB18_1
+; RV32IA-NEXT:  .LBB18_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst acquire
   ret void
 }
@@ -338,6 +805,31 @@ define void @cmpxchg_i16_seq_cst_seq_cst(i16* %ptr, i16 %cmp, i16 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i16_seq_cst_seq_cst:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    addi a3, a3, -1
+; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    and a2, a2, a3
+; RV32IA-NEXT:    slli a4, a0, 3
+; RV32IA-NEXT:    andi a4, a4, 24
+; RV32IA-NEXT:    sll a3, a3, a4
+; RV32IA-NEXT:    sll a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a4
+; RV32IA-NEXT:    andi a0, a0, -4
+; RV32IA-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a4, (a0)
+; RV32IA-NEXT:    and a5, a4, a3
+; RV32IA-NEXT:    bne a5, a1, .LBB19_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB19_1 Depth=1
+; RV32IA-NEXT:    xor a5, a4, a2
+; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    xor a5, a4, a5
+; RV32IA-NEXT:    sc.w.aqrl a5, a5, (a0)
+; RV32IA-NEXT:    bnez a5, .LBB19_1
+; RV32IA-NEXT:  .LBB19_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst seq_cst
   ret void
 }
@@ -355,6 +847,17 @@ define void @cmpxchg_i32_monotonic_monotonic(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_monotonic_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB20_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB20_1 Depth=1
+; RV32IA-NEXT:    sc.w a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB20_1
+; RV32IA-NEXT:  .LBB20_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val monotonic monotonic
   ret void
 }
@@ -372,6 +875,17 @@ define void @cmpxchg_i32_acquire_monotonic(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_acquire_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB21_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB21_1 Depth=1
+; RV32IA-NEXT:    sc.w a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB21_1
+; RV32IA-NEXT:  .LBB21_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acquire monotonic
   ret void
 }
@@ -389,6 +903,17 @@ define void @cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_acquire_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB22_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB22_1 Depth=1
+; RV32IA-NEXT:    sc.w a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB22_1
+; RV32IA-NEXT:  .LBB22_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acquire acquire
   ret void
 }
@@ -406,6 +931,17 @@ define void @cmpxchg_i32_release_monotonic(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_release_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB23_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB23_1 Depth=1
+; RV32IA-NEXT:    sc.w.rl a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB23_1
+; RV32IA-NEXT:  .LBB23_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release monotonic
   ret void
 }
@@ -423,6 +959,17 @@ define void @cmpxchg_i32_release_acquire(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_release_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB24_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB24_1 Depth=1
+; RV32IA-NEXT:    sc.w.rl a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB24_1
+; RV32IA-NEXT:  .LBB24_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release acquire
   ret void
 }
@@ -440,6 +987,17 @@ define void @cmpxchg_i32_acq_rel_monotonic(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_acq_rel_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB25_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB25_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB25_1 Depth=1
+; RV32IA-NEXT:    sc.w.rl a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB25_1
+; RV32IA-NEXT:  .LBB25_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acq_rel monotonic
   ret void
 }
@@ -457,6 +1015,17 @@ define void @cmpxchg_i32_acq_rel_acquire(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_acq_rel_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB26_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aq a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB26_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB26_1 Depth=1
+; RV32IA-NEXT:    sc.w.rl a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB26_1
+; RV32IA-NEXT:  .LBB26_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acq_rel acquire
   ret void
 }
@@ -474,6 +1043,17 @@ define void @cmpxchg_i32_seq_cst_monotonic(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_seq_cst_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB27_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB27_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB27_1 Depth=1
+; RV32IA-NEXT:    sc.w.aqrl a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB27_1
+; RV32IA-NEXT:  .LBB27_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst monotonic
   ret void
 }
@@ -491,6 +1071,17 @@ define void @cmpxchg_i32_seq_cst_acquire(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_seq_cst_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB28_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB28_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB28_1 Depth=1
+; RV32IA-NEXT:    sc.w.aqrl a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB28_1
+; RV32IA-NEXT:  .LBB28_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst acquire
   ret void
 }
@@ -508,6 +1099,17 @@ define void @cmpxchg_i32_seq_cst_seq_cst(i32* %ptr, i32 %cmp, i32 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i32_seq_cst_seq_cst:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:  .LBB29_1: # =>This Inner Loop Header: Depth=1
+; RV32IA-NEXT:    lr.w.aqrl a3, (a0)
+; RV32IA-NEXT:    bne a3, a1, .LBB29_3
+; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB29_1 Depth=1
+; RV32IA-NEXT:    sc.w.aqrl a4, a2, (a0)
+; RV32IA-NEXT:    bnez a4, .LBB29_1
+; RV32IA-NEXT:  .LBB29_3:
+; RV32IA-NEXT:    ret
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst seq_cst
   ret void
 }
@@ -528,6 +1130,22 @@ define void @cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_monotonic_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a4, zero
+; RV32IA-NEXT:    mv a5, zero
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val monotonic monotonic
   ret void
 }
@@ -549,6 +1167,23 @@ define void @cmpxchg_i64_acquire_monotonic(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_acquire_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    mv a5, a4
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a4, zero, 2
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a5, zero
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acquire monotonic
   ret void
 }
@@ -569,6 +1204,22 @@ define void @cmpxchg_i64_acquire_acquire(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_acquire_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a5, zero, 2
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a4, a5
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acquire acquire
   ret void
 }
@@ -590,6 +1241,23 @@ define void @cmpxchg_i64_release_monotonic(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_release_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    mv a5, a4
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a4, zero, 3
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a5, zero
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release monotonic
   ret void
 }
@@ -611,6 +1279,23 @@ define void @cmpxchg_i64_release_acquire(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_release_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    mv a6, a4
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a4, zero, 3
+; RV32IA-NEXT:    addi a5, zero, 2
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a6
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release acquire
   ret void
 }
@@ -632,6 +1317,23 @@ define void @cmpxchg_i64_acq_rel_monotonic(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_acq_rel_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    mv a5, a4
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a4, zero, 4
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a5, zero
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acq_rel monotonic
   ret void
 }
@@ -653,6 +1355,23 @@ define void @cmpxchg_i64_acq_rel_acquire(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_acq_rel_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    mv a6, a4
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a4, zero, 4
+; RV32IA-NEXT:    addi a5, zero, 2
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a6
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acq_rel acquire
   ret void
 }
@@ -674,6 +1393,23 @@ define void @cmpxchg_i64_seq_cst_monotonic(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_seq_cst_monotonic:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    mv a5, a4
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a4, zero, 5
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a5
+; RV32IA-NEXT:    mv a5, zero
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst monotonic
   ret void
 }
@@ -695,6 +1431,23 @@ define void @cmpxchg_i64_seq_cst_acquire(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_seq_cst_acquire:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    mv a6, a4
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a4, zero, 5
+; RV32IA-NEXT:    addi a5, zero, 2
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a6
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst acquire
   ret void
 }
@@ -715,6 +1468,22 @@ define void @cmpxchg_i64_seq_cst_seq_cst(i64* %ptr, i64 %cmp, i64 %val) {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV32IA-LABEL: cmpxchg_i64_seq_cst_seq_cst:
+; RV32IA:       # %bb.0:
+; RV32IA-NEXT:    addi sp, sp, -16
+; RV32IA-NEXT:    sw ra, 12(sp)
+; RV32IA-NEXT:    sw a2, 4(sp)
+; RV32IA-NEXT:    sw a1, 0(sp)
+; RV32IA-NEXT:    mv a1, sp
+; RV32IA-NEXT:    addi a5, zero, 5
+; RV32IA-NEXT:    mv a2, a3
+; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a4, a5
+; RV32IA-NEXT:    call __atomic_compare_exchange_8
+; RV32IA-NEXT:    lw ra, 12(sp)
+; RV32IA-NEXT:    addi sp, sp, 16
+; RV32IA-NEXT:    ret
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst seq_cst
   ret void
 }
diff --git a/test/CodeGen/RISCV/calling-conv.ll b/test/CodeGen/RISCV/calling-conv.ll
index 8440045fd91a1c25fd18980752d99ccf7b65ea5e..536859372831dae4674d3a0278de726fd742ea43 100644
--- a/test/CodeGen/RISCV/calling-conv.ll
+++ b/test/CodeGen/RISCV/calling-conv.ll
@@ -135,7 +135,6 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-NEXT:    xor a0, a0, a1
 ; RV32I-FPELIM-NEXT:    or a0, a0, a3
 ; RV32I-FPELIM-NEXT:    or a0, a0, a2
-; RV32I-FPELIM-NEXT:    xor a0, a0, zero
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -160,7 +159,6 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    xor a0, a0, a1
 ; RV32I-WITHFP-NEXT:    or a0, a0, a3
 ; RV32I-WITHFP-NEXT:    or a0, a0, a2
-; RV32I-WITHFP-NEXT:    xor a0, a0, zero
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
@@ -243,7 +241,6 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-NEXT:    xor a0, a3, a0
 ; RV32I-FPELIM-NEXT:    or a0, a0, a2
 ; RV32I-FPELIM-NEXT:    or a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a0, a0, zero
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -269,7 +266,6 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    xor a0, a3, a0
 ; RV32I-WITHFP-NEXT:    or a0, a0, a2
 ; RV32I-WITHFP-NEXT:    or a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a0, a0, zero
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
@@ -398,7 +394,6 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; RV32I-FPELIM-NEXT:    xor a4, a4, t0
 ; RV32I-FPELIM-NEXT:    xor a3, a3, a7
 ; RV32I-FPELIM-NEXT:    or a3, a3, a4
-; RV32I-FPELIM-NEXT:    xor a3, a3, zero
 ; RV32I-FPELIM-NEXT:    lui a4, 16
 ; RV32I-FPELIM-NEXT:    addi a4, a4, -1
 ; RV32I-FPELIM-NEXT:    and a1, a1, a4
@@ -423,7 +418,6 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; RV32I-WITHFP-NEXT:    xor a4, a4, t0
 ; RV32I-WITHFP-NEXT:    xor a3, a3, a7
 ; RV32I-WITHFP-NEXT:    or a3, a3, a4
-; RV32I-WITHFP-NEXT:    xor a3, a3, zero
 ; RV32I-WITHFP-NEXT:    lui a4, 16
 ; RV32I-WITHFP-NEXT:    addi a4, a4, -1
 ; RV32I-WITHFP-NEXT:    and a1, a1, a4
@@ -845,7 +839,6 @@ define i32 @caller_small_scalar_ret() nounwind {
 ; RV32I-FPELIM-NEXT:    addi a2, a2, 647
 ; RV32I-FPELIM-NEXT:    xor a0, a0, a2
 ; RV32I-FPELIM-NEXT:    or a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a0, a0, zero
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    lw s1, 8(sp)
 ; RV32I-FPELIM-NEXT:    lw ra, 12(sp)
@@ -867,7 +860,6 @@ define i32 @caller_small_scalar_ret() nounwind {
 ; RV32I-WITHFP-NEXT:    addi a2, a2, 647
 ; RV32I-WITHFP-NEXT:    xor a0, a0, a2
 ; RV32I-WITHFP-NEXT:    or a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a0, a0, zero
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw s1, 4(sp)
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
@@ -997,14 +989,14 @@ define void @caller_large_scalar_ret() nounwind {
 define void @callee_large_struct_ret(%struct.large* noalias sret %agg.result) nounwind {
 ; RV32I-FPELIM-LABEL: callee_large_struct_ret:
 ; RV32I-FPELIM:       # %bb.0:
+; RV32I-FPELIM-NEXT:    addi a1, zero, 4
+; RV32I-FPELIM-NEXT:    sw a1, 12(a0)
+; RV32I-FPELIM-NEXT:    addi a1, zero, 3
+; RV32I-FPELIM-NEXT:    sw a1, 8(a0)
 ; RV32I-FPELIM-NEXT:    addi a1, zero, 2
 ; RV32I-FPELIM-NEXT:    sw a1, 4(a0)
 ; RV32I-FPELIM-NEXT:    addi a1, zero, 1
 ; RV32I-FPELIM-NEXT:    sw a1, 0(a0)
-; RV32I-FPELIM-NEXT:    addi a1, zero, 3
-; RV32I-FPELIM-NEXT:    sw a1, 8(a0)
-; RV32I-FPELIM-NEXT:    addi a1, zero, 4
-; RV32I-FPELIM-NEXT:    sw a1, 12(a0)
 ; RV32I-FPELIM-NEXT:    ret
 ;
 ; RV32I-WITHFP-LABEL: callee_large_struct_ret:
@@ -1013,14 +1005,14 @@ define void @callee_large_struct_ret(%struct.large* noalias sret %agg.result) no
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
+; RV32I-WITHFP-NEXT:    addi a1, zero, 4
+; RV32I-WITHFP-NEXT:    sw a1, 12(a0)
+; RV32I-WITHFP-NEXT:    addi a1, zero, 3
+; RV32I-WITHFP-NEXT:    sw a1, 8(a0)
 ; RV32I-WITHFP-NEXT:    addi a1, zero, 2
 ; RV32I-WITHFP-NEXT:    sw a1, 4(a0)
 ; RV32I-WITHFP-NEXT:    addi a1, zero, 1
 ; RV32I-WITHFP-NEXT:    sw a1, 0(a0)
-; RV32I-WITHFP-NEXT:    addi a1, zero, 3
-; RV32I-WITHFP-NEXT:    sw a1, 8(a0)
-; RV32I-WITHFP-NEXT:    addi a1, zero, 4
-; RV32I-WITHFP-NEXT:    sw a1, 12(a0)
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 16
diff --git a/test/CodeGen/RISCV/double-arith.ll b/test/CodeGen/RISCV/double-arith.ll
index a9bdf68e94d5a43c64cb46947212962dc1e23b90..cd3a1d963783236718f29b951806ee289ea4725a 100644
--- a/test/CodeGen/RISCV/double-arith.ll
+++ b/test/CodeGen/RISCV/double-arith.ll
@@ -2,6 +2,10 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+d -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32IFD %s
 
+; These tests are each targeted at a particular RISC-V FPU instruction. Most
+; other files in this folder exercise LLVM IR instructions that don't directly
+; match a RISC-V instruction.
+
 define double @fadd_d(double %a, double %b) nounwind {
 ; RV32IFD-LABEL: fadd_d:
 ; RV32IFD:       # %bb.0:
@@ -277,3 +281,118 @@ define i32 @fle_d(double %a, double %b) nounwind {
   %2 = zext i1 %1 to i32
   ret i32 %2
 }
+
+declare double @llvm.fma.f64(double, double, double)
+
+define double @fmadd_d(double %a, double %b, double %c) nounwind {
+; RV32IFD-LABEL: fmadd_d:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a4, 8(sp)
+; RV32IFD-NEXT:    sw a5, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft2, 8(sp)
+; RV32IFD-NEXT:    fmadd.d ft0, ft2, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.fma.f64(double %a, double %b, double %c)
+  ret double %1
+}
+
+define double @fmsub_d(double %a, double %b, double %c) nounwind {
+; RV32IFD-LABEL: fmsub_d:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    sw a4, 8(sp)
+; RV32IFD-NEXT:    sw a5, 12(sp)
+; RV32IFD-NEXT:    fld ft2, 8(sp)
+; RV32IFD-NEXT:    lui a0, %hi(.LCPI15_0)
+; RV32IFD-NEXT:    addi a0, a0, %lo(.LCPI15_0)
+; RV32IFD-NEXT:    fld ft3, 0(a0)
+; RV32IFD-NEXT:    fadd.d ft2, ft2, ft3
+; RV32IFD-NEXT:    fmsub.d ft0, ft1, ft0, ft2
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %c_ = fadd double 0.0, %c ; avoid negation using xor
+  %negc = fsub double -0.0, %c_
+  %1 = call double @llvm.fma.f64(double %a, double %b, double %negc)
+  ret double %1
+}
+
+define double @fnmadd_d(double %a, double %b, double %c) nounwind {
+; RV32IFD-LABEL: fnmadd_d:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    sw a4, 8(sp)
+; RV32IFD-NEXT:    sw a5, 12(sp)
+; RV32IFD-NEXT:    fld ft2, 8(sp)
+; RV32IFD-NEXT:    lui a0, %hi(.LCPI16_0)
+; RV32IFD-NEXT:    addi a0, a0, %lo(.LCPI16_0)
+; RV32IFD-NEXT:    fld ft3, 0(a0)
+; RV32IFD-NEXT:    fadd.d ft2, ft2, ft3
+; RV32IFD-NEXT:    fadd.d ft1, ft1, ft3
+; RV32IFD-NEXT:    fnmadd.d ft0, ft1, ft0, ft2
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %a_ = fadd double 0.0, %a
+  %c_ = fadd double 0.0, %c
+  %nega = fsub double -0.0, %a_
+  %negc = fsub double -0.0, %c_
+  %1 = call double @llvm.fma.f64(double %nega, double %b, double %negc)
+  ret double %1
+}
+
+define double @fnmsub_d(double %a, double %b, double %c) nounwind {
+; RV32IFD-LABEL: fnmsub_d:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a4, 8(sp)
+; RV32IFD-NEXT:    sw a5, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft2, 8(sp)
+; RV32IFD-NEXT:    lui a0, %hi(.LCPI17_0)
+; RV32IFD-NEXT:    addi a0, a0, %lo(.LCPI17_0)
+; RV32IFD-NEXT:    fld ft3, 0(a0)
+; RV32IFD-NEXT:    fadd.d ft2, ft2, ft3
+; RV32IFD-NEXT:    fnmsub.d ft0, ft2, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %a_ = fadd double 0.0, %a
+  %nega = fsub double -0.0, %a_
+  %1 = call double @llvm.fma.f64(double %nega, double %b, double %c)
+  ret double %1
+}
diff --git a/test/CodeGen/RISCV/double-frem.ll b/test/CodeGen/RISCV/double-frem.ll
new file mode 100644
index 0000000000000000000000000000000000000000..07f84ac11ce2207bc3bc1ad752ae50e3841434d9
--- /dev/null
+++ b/test/CodeGen/RISCV/double-frem.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32ID %s
+
+define double @frem_f64(double %a, double %b) nounwind {
+; RV32ID-LABEL: frem_f64:
+; RV32ID:       # %bb.0:
+; RV32ID-NEXT:    addi sp, sp, -16
+; RV32ID-NEXT:    sw ra, 12(sp)
+; RV32ID-NEXT:    call fmod
+; RV32ID-NEXT:    lw ra, 12(sp)
+; RV32ID-NEXT:    addi sp, sp, 16
+; RV32ID-NEXT:    ret
+  %1 = frem double %a, %b
+  ret double %1
+}
diff --git a/test/CodeGen/RISCV/double-intrinsics.ll b/test/CodeGen/RISCV/double-intrinsics.ll
index 4a5239f4f01e2a46f91207b6a36ace931a00e753..cd14c1932d4712726db2bdec96cd07a1dd45a485 100644
--- a/test/CodeGen/RISCV/double-intrinsics.ll
+++ b/test/CodeGen/RISCV/double-intrinsics.ll
@@ -4,7 +4,7 @@
 
 declare double @llvm.sqrt.f64(double)
 
-define double @sqrt_f64(double %a) {
+define double @sqrt_f64(double %a) nounwind {
 ; RV32IFD-LABEL: sqrt_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -18,12 +18,12 @@ define double @sqrt_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.sqrt.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.powi.f64(double, i32)
 
-define double @powi_f64(double %a, i32 %b) {
+define double @powi_f64(double %a, i32 %b) nounwind {
 ; RV32IFD-LABEL: powi_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -33,12 +33,12 @@ define double @powi_f64(double %a, i32 %b) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.powi.f64(double %a, i32 %b)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.sin.f64(double)
 
-define double @sin_f64(double %a) {
+define double @sin_f64(double %a) nounwind {
 ; RV32IFD-LABEL: sin_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -48,12 +48,12 @@ define double @sin_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.sin.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.cos.f64(double)
 
-define double @cos_f64(double %a) {
+define double @cos_f64(double %a) nounwind {
 ; RV32IFD-LABEL: cos_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -63,11 +63,11 @@ define double @cos_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.cos.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 ; The sin+cos combination results in an FSINCOS SelectionDAG node.
-define double @sincos_f64(double %a) {
+define double @sincos_f64(double %a) nounwind {
 ; RV32IFD-LABEL: sincos_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -32
@@ -104,12 +104,12 @@ define double @sincos_f64(double %a) {
   %1 = call double @llvm.sin.f64(double %a)
   %2 = call double @llvm.cos.f64(double %a)
   %3 = fadd double %1, %2
-	ret double %3
+  ret double %3
 }
 
 declare double @llvm.pow.f64(double, double)
 
-define double @pow_f64(double %a, double %b) {
+define double @pow_f64(double %a, double %b) nounwind {
 ; RV32IFD-LABEL: pow_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -119,12 +119,12 @@ define double @pow_f64(double %a, double %b) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.pow.f64(double %a, double %b)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.exp.f64(double)
 
-define double @exp_f64(double %a) {
+define double @exp_f64(double %a) nounwind {
 ; RV32IFD-LABEL: exp_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -134,12 +134,12 @@ define double @exp_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.exp.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.exp2.f64(double)
 
-define double @exp2_f64(double %a) {
+define double @exp2_f64(double %a) nounwind {
 ; RV32IFD-LABEL: exp2_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -149,12 +149,12 @@ define double @exp2_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.exp2.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.log.f64(double)
 
-define double @log_f64(double %a) {
+define double @log_f64(double %a) nounwind {
 ; RV32IFD-LABEL: log_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -164,12 +164,12 @@ define double @log_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.log.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.log10.f64(double)
 
-define double @log10_f64(double %a) {
+define double @log10_f64(double %a) nounwind {
 ; RV32IFD-LABEL: log10_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -179,12 +179,12 @@ define double @log10_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.log10.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.log2.f64(double)
 
-define double @log2_f64(double %a) {
+define double @log2_f64(double %a) nounwind {
 ; RV32IFD-LABEL: log2_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -194,28 +194,64 @@ define double @log2_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.log2.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.fma.f64(double, double, double)
 
-; TODO: Select RISC-V FMA instruction.
-define double @fma_f64(double %a, double %b, double %c) {
+define double @fma_f64(double %a, double %b, double %c) nounwind {
 ; RV32IFD-LABEL: fma_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
-; RV32IFD-NEXT:    sw ra, 12(sp)
-; RV32IFD-NEXT:    call fma
-; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    sw a4, 8(sp)
+; RV32IFD-NEXT:    sw a5, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft2, 8(sp)
+; RV32IFD-NEXT:    fmadd.d ft0, ft2, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.fma.f64(double %a, double %b, double %c)
-	ret double %1
+  ret double %1
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+define double @fmuladd_f64(double %a, double %b, double %c) nounwind {
+; Use of fmadd depends on TargetLowering::isFMAFasterthanFMulAndFAdd
+; RV32IFD-LABEL: fmuladd_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    fmul.d ft0, ft1, ft0
+; RV32IFD-NEXT:    sw a4, 8(sp)
+; RV32IFD-NEXT:    sw a5, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    fadd.d ft0, ft0, ft1
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
+  ret double %1
 }
 
 declare double @llvm.fabs.f64(double)
 
-define double @fabs_f64(double %a) {
+define double @fabs_f64(double %a) nounwind {
 ; RV32IFD-LABEL: fabs_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -229,7 +265,7 @@ define double @fabs_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.fabs.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.minnum.f64(double, double)
@@ -317,7 +353,7 @@ define double @copysign_f64(double %a, double %b) nounwind {
 
 declare double @llvm.floor.f64(double)
 
-define double @floor_f64(double %a) {
+define double @floor_f64(double %a) nounwind {
 ; RV32IFD-LABEL: floor_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -327,12 +363,12 @@ define double @floor_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.floor.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.ceil.f64(double)
 
-define double @ceil_f64(double %a) {
+define double @ceil_f64(double %a) nounwind {
 ; RV32IFD-LABEL: ceil_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -342,12 +378,12 @@ define double @ceil_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.ceil.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.trunc.f64(double)
 
-define double @trunc_f64(double %a) {
+define double @trunc_f64(double %a) nounwind {
 ; RV32IFD-LABEL: trunc_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -357,12 +393,12 @@ define double @trunc_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.trunc.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.rint.f64(double)
 
-define double @rint_f64(double %a) {
+define double @rint_f64(double %a) nounwind {
 ; RV32IFD-LABEL: rint_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -372,12 +408,12 @@ define double @rint_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.rint.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.nearbyint.f64(double)
 
-define double @nearbyint_f64(double %a) {
+define double @nearbyint_f64(double %a) nounwind {
 ; RV32IFD-LABEL: nearbyint_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -387,12 +423,12 @@ define double @nearbyint_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.nearbyint.f64(double %a)
-	ret double %1
+  ret double %1
 }
 
 declare double @llvm.round.f64(double)
 
-define double @round_f64(double %a) {
+define double @round_f64(double %a) nounwind {
 ; RV32IFD-LABEL: round_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
@@ -402,5 +438,5 @@ define double @round_f64(double %a) {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.round.f64(double %a)
-	ret double %1
+  ret double %1
 }
diff --git a/test/CodeGen/RISCV/fixups-relax-diff.ll b/test/CodeGen/RISCV/fixups-relax-diff.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d574a8af3388b4027903e9506028f26d8f2df3ba
--- /dev/null
+++ b/test/CodeGen/RISCV/fixups-relax-diff.ll
@@ -0,0 +1,20 @@
+; RUN: llc -filetype=obj -mtriple=riscv32 -mattr=+relax %s -o - \
+; RUN:     | llvm-readobj -r | FileCheck -check-prefix=RELAX %s
+; RUN: llc -filetype=obj -mtriple=riscv32 -mattr=-relax %s -o - \
+; RUN:     | llvm-readobj -r | FileCheck -check-prefix=NORELAX %s
+
+; This test checks that a diff inserted via inline assembly only causes
+; relocations when relaxation is enabled. This isn't an assembly test
+; as the assembler takes a different path through LLVM, which is
+; already covered by the fixups-expr.s test.
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  ; RELAX: R_RISCV_ADD64 b
+  ; RELAX: R_RISCV_SUB64 a
+  ; NORELAX-NOT: R_RISCV
+  call void asm sideeffect "a:\0Ab:\0A.dword b-a", ""()
+  ret i32 0
+}
diff --git a/test/CodeGen/RISCV/float-arith.ll b/test/CodeGen/RISCV/float-arith.ll
index f3ec61b435748d75fce1186bb6dfa63dae42faef..ab874476541ec69d98e3090d095268296b7e21fe 100644
--- a/test/CodeGen/RISCV/float-arith.ll
+++ b/test/CodeGen/RISCV/float-arith.ll
@@ -2,6 +2,10 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+f -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32IF %s
 
+; These tests are each targeted at a particular RISC-V FPU instruction. Most
+; other files in this folder exercise LLVM IR instructions that don't directly
+; match a RISC-V instruction.
+
 define float @fadd_s(float %a, float %b) nounwind {
 ; RV32IF-LABEL: fadd_s:
 ; RV32IF:       # %bb.0:
@@ -186,3 +190,78 @@ define i32 @fle_s(float %a, float %b) nounwind {
   %2 = zext i1 %1 to i32
   ret i32 %2
 }
+
+declare float @llvm.fma.f32(float, float, float)
+
+define float @fmadd_s(float %a, float %b, float %c) nounwind {
+; RV32IF-LABEL: fmadd_s:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a2
+; RV32IF-NEXT:    fmv.w.x ft1, a1
+; RV32IF-NEXT:    fmv.w.x ft2, a0
+; RV32IF-NEXT:    fmadd.s ft0, ft2, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.fma.f32(float %a, float %b, float %c)
+  ret float %1
+}
+
+define float @fmsub_s(float %a, float %b, float %c) nounwind {
+; RV32IF-LABEL: fmsub_s:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a2
+; RV32IF-NEXT:    lui a2, %hi(.LCPI15_0)
+; RV32IF-NEXT:    addi a2, a2, %lo(.LCPI15_0)
+; RV32IF-NEXT:    flw ft1, 0(a2)
+; RV32IF-NEXT:    fadd.s ft0, ft0, ft1
+; RV32IF-NEXT:    fmv.w.x ft1, a1
+; RV32IF-NEXT:    fmv.w.x ft2, a0
+; RV32IF-NEXT:    fmsub.s ft0, ft2, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %c_ = fadd float 0.0, %c ; avoid negation using xor
+  %negc = fsub float -0.0, %c_
+  %1 = call float @llvm.fma.f32(float %a, float %b, float %negc)
+  ret float %1
+}
+
+define float @fnmadd_s(float %a, float %b, float %c) nounwind {
+; RV32IF-LABEL: fnmadd_s:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a2
+; RV32IF-NEXT:    lui a2, %hi(.LCPI16_0)
+; RV32IF-NEXT:    addi a2, a2, %lo(.LCPI16_0)
+; RV32IF-NEXT:    flw ft1, 0(a2)
+; RV32IF-NEXT:    fadd.s ft0, ft0, ft1
+; RV32IF-NEXT:    fmv.w.x ft2, a0
+; RV32IF-NEXT:    fadd.s ft1, ft2, ft1
+; RV32IF-NEXT:    fmv.w.x ft2, a1
+; RV32IF-NEXT:    fnmadd.s ft0, ft1, ft2, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %a_ = fadd float 0.0, %a
+  %c_ = fadd float 0.0, %c
+  %nega = fsub float -0.0, %a_
+  %negc = fsub float -0.0, %c_
+  %1 = call float @llvm.fma.f32(float %nega, float %b, float %negc)
+  ret float %1
+}
+
+define float @fnmsub_s(float %a, float %b, float %c) nounwind {
+; RV32IF-LABEL: fnmsub_s:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a0
+; RV32IF-NEXT:    lui a0, %hi(.LCPI17_0)
+; RV32IF-NEXT:    addi a0, a0, %lo(.LCPI17_0)
+; RV32IF-NEXT:    flw ft1, 0(a0)
+; RV32IF-NEXT:    fadd.s ft0, ft0, ft1
+; RV32IF-NEXT:    fmv.w.x ft1, a2
+; RV32IF-NEXT:    fmv.w.x ft2, a1
+; RV32IF-NEXT:    fnmsub.s ft0, ft0, ft2, ft1
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %a_ = fadd float 0.0, %a
+  %nega = fsub float -0.0, %a_
+  %1 = call float @llvm.fma.f32(float %nega, float %b, float %c)
+  ret float %1
+}
diff --git a/test/CodeGen/RISCV/float-frem.ll b/test/CodeGen/RISCV/float-frem.ll
new file mode 100644
index 0000000000000000000000000000000000000000..95042c5fde6e648fb9acdc7fc6acbe40a43af4f2
--- /dev/null
+++ b/test/CodeGen/RISCV/float-frem.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+f -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32IF %s
+
+define float @frem_f32(float %a, float %b) nounwind {
+; RV32IF-LABEL: frem_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call fmodf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = frem float %a, %b
+  ret float %1
+}
diff --git a/test/CodeGen/RISCV/float-intrinsics.ll b/test/CodeGen/RISCV/float-intrinsics.ll
index 1da644f5f9e1508fa43d376d31d9cf826c128edb..57f3a28ee60e6b237e8ac56d007964b1ac2d32ae 100644
--- a/test/CodeGen/RISCV/float-intrinsics.ll
+++ b/test/CodeGen/RISCV/float-intrinsics.ll
@@ -6,7 +6,7 @@
 
 declare float @llvm.sqrt.f32(float)
 
-define float @sqrt_f32(float %a) {
+define float @sqrt_f32(float %a) nounwind {
 ; RV32IF-LABEL: sqrt_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    fmv.w.x ft0, a0
@@ -14,12 +14,12 @@ define float @sqrt_f32(float %a) {
 ; RV32IF-NEXT:    fmv.x.w a0, ft0
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.sqrt.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.powi.f32(float, i32)
 
-define float @powi_f32(float %a, i32 %b) {
+define float @powi_f32(float %a, i32 %b) nounwind {
 ; RV32IF-LABEL: powi_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -29,12 +29,12 @@ define float @powi_f32(float %a, i32 %b) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.powi.f32(float %a, i32 %b)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.sin.f32(float)
 
-define float @sin_f32(float %a) {
+define float @sin_f32(float %a) nounwind {
 ; RV32IF-LABEL: sin_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -44,12 +44,12 @@ define float @sin_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.sin.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.cos.f32(float)
 
-define float @cos_f32(float %a) {
+define float @cos_f32(float %a) nounwind {
 ; RV32IF-LABEL: cos_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -59,11 +59,11 @@ define float @cos_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.cos.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 ; The sin+cos combination results in an FSINCOS SelectionDAG node.
-define float @sincos_f32(float %a) {
+define float @sincos_f32(float %a) nounwind {
 ; RV32IF-LABEL: sincos_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -87,12 +87,12 @@ define float @sincos_f32(float %a) {
   %1 = call float @llvm.sin.f32(float %a)
   %2 = call float @llvm.cos.f32(float %a)
   %3 = fadd float %1, %2
-	ret float %3
+  ret float %3
 }
 
 declare float @llvm.pow.f32(float, float)
 
-define float @pow_f32(float %a, float %b) {
+define float @pow_f32(float %a, float %b) nounwind {
 ; RV32IF-LABEL: pow_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -102,12 +102,12 @@ define float @pow_f32(float %a, float %b) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.pow.f32(float %a, float %b)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.exp.f32(float)
 
-define float @exp_f32(float %a) {
+define float @exp_f32(float %a) nounwind {
 ; RV32IF-LABEL: exp_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -117,12 +117,12 @@ define float @exp_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.exp.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.exp2.f32(float)
 
-define float @exp2_f32(float %a) {
+define float @exp2_f32(float %a) nounwind {
 ; RV32IF-LABEL: exp2_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -132,12 +132,12 @@ define float @exp2_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.exp2.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.log.f32(float)
 
-define float @log_f32(float %a) {
+define float @log_f32(float %a) nounwind {
 ; RV32IF-LABEL: log_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -147,12 +147,12 @@ define float @log_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.log.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.log10.f32(float)
 
-define float @log10_f32(float %a) {
+define float @log10_f32(float %a) nounwind {
 ; RV32IF-LABEL: log10_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -162,12 +162,12 @@ define float @log10_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.log10.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.log2.f32(float)
 
-define float @log2_f32(float %a) {
+define float @log2_f32(float %a) nounwind {
 ; RV32IF-LABEL: log2_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -177,28 +177,44 @@ define float @log2_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.log2.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.fma.f32(float, float, float)
 
-; TODO: Select RISC-V FMA instruction.
-define float @fma_f32(float %a, float %b, float %c) {
+define float @fma_f32(float %a, float %b, float %c) nounwind {
 ; RV32IF-LABEL: fma_f32:
 ; RV32IF:       # %bb.0:
-; RV32IF-NEXT:    addi sp, sp, -16
-; RV32IF-NEXT:    sw ra, 12(sp)
-; RV32IF-NEXT:    call fmaf
-; RV32IF-NEXT:    lw ra, 12(sp)
-; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    fmv.w.x ft0, a2
+; RV32IF-NEXT:    fmv.w.x ft1, a1
+; RV32IF-NEXT:    fmv.w.x ft2, a0
+; RV32IF-NEXT:    fmadd.s ft0, ft2, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.fma.f32(float %a, float %b, float %c)
-	ret float %1
+  ret float %1
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+define float @fmuladd_f32(float %a, float %b, float %c) nounwind {
+; Use of fmadd depends on TargetLowering::isFMAFasterthanFMulAndFAdd
+; RV32IF-LABEL: fmuladd_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a1
+; RV32IF-NEXT:    fmv.w.x ft1, a0
+; RV32IF-NEXT:    fmul.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.w.x ft1, a2
+; RV32IF-NEXT:    fadd.s ft0, ft0, ft1
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+  ret float %1
 }
 
 declare float @llvm.fabs.f32(float)
 
-define float @fabs_f32(float %a) {
+define float @fabs_f32(float %a) nounwind {
 ; RV32IF-LABEL: fabs_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    lui a1, 524288
@@ -206,7 +222,7 @@ define float @fabs_f32(float %a) {
 ; RV32IF-NEXT:    and a0, a0, a1
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.fabs.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.minnum.f32(float, float)
@@ -270,7 +286,7 @@ define float @copysign_f32(float %a, float %b) nounwind {
 
 declare float @llvm.floor.f32(float)
 
-define float @floor_f32(float %a) {
+define float @floor_f32(float %a) nounwind {
 ; RV32IF-LABEL: floor_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -280,12 +296,12 @@ define float @floor_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.floor.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.ceil.f32(float)
 
-define float @ceil_f32(float %a) {
+define float @ceil_f32(float %a) nounwind {
 ; RV32IF-LABEL: ceil_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -295,12 +311,12 @@ define float @ceil_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.ceil.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.trunc.f32(float)
 
-define float @trunc_f32(float %a) {
+define float @trunc_f32(float %a) nounwind {
 ; RV32IF-LABEL: trunc_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -310,12 +326,12 @@ define float @trunc_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.trunc.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.rint.f32(float)
 
-define float @rint_f32(float %a) {
+define float @rint_f32(float %a) nounwind {
 ; RV32IF-LABEL: rint_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -325,12 +341,12 @@ define float @rint_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.rint.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.nearbyint.f32(float)
 
-define float @nearbyint_f32(float %a) {
+define float @nearbyint_f32(float %a) nounwind {
 ; RV32IF-LABEL: nearbyint_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -340,12 +356,12 @@ define float @nearbyint_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.nearbyint.f32(float %a)
-	ret float %1
+  ret float %1
 }
 
 declare float @llvm.round.f32(float)
 
-define float @round_f32(float %a) {
+define float @round_f32(float %a) nounwind {
 ; RV32IF-LABEL: round_f32:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
@@ -355,5 +371,5 @@ define float @round_f32(float %a) {
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
   %1 = call float @llvm.round.f32(float %a)
-	ret float %1
+  ret float %1
 }
diff --git a/test/CodeGen/RISCV/flt-rounds.ll b/test/CodeGen/RISCV/flt-rounds.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cb6d166de4ff5badd21be7eda3b349f89bebfb7c
--- /dev/null
+++ b/test/CodeGen/RISCV/flt-rounds.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32I %s
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV64I %s
+
+declare i32 @llvm.flt.rounds()
+
+define i32 @test_flt_rounds() nounwind {
+; RV32I-LABEL: test_flt_rounds:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a0, zero, 1
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_flt_rounds:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, 1
+; RV64I-NEXT:    ret
+  %1 = call i32 @llvm.flt.rounds()
+  ret i32 %1
+}
diff --git a/test/CodeGen/RISCV/fp128.ll b/test/CodeGen/RISCV/fp128.ll
index 504817e85e4f2867e16344382a292227687f660c..a928d69fe9ed8cde63b5be4f70f913a678ba6f1a 100644
--- a/test/CodeGen/RISCV/fp128.ll
+++ b/test/CodeGen/RISCV/fp128.ll
@@ -36,7 +36,6 @@ define i32 @test_load_and_cmp() nounwind {
 ; RV32I-NEXT:    addi a0, sp, 24
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    call __netf2
-; RV32I-NEXT:    xor a0, a0, zero
 ; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    lw ra, 44(sp)
 ; RV32I-NEXT:    addi sp, sp, 48
diff --git a/test/CodeGen/RISCV/frameaddr-returnaddr.ll b/test/CodeGen/RISCV/frameaddr-returnaddr.ll
index 967f2246e56085cf3cdd3e3f47404a66b40c140b..255bf71553926c69e1b12f674479ccb37aafa36a 100644
--- a/test/CodeGen/RISCV/frameaddr-returnaddr.ll
+++ b/test/CodeGen/RISCV/frameaddr-returnaddr.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32I %s
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV64I %s
 
 declare void @notdead(i8*)
 declare i8* @llvm.frameaddress(i32)
@@ -18,6 +20,18 @@ define i8* @test_frameaddress_0() nounwind {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_frameaddress_0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp)
+; RV64I-NEXT:    sd s0, 0(sp)
+; RV64I-NEXT:    addi s0, sp, 16
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    ld s0, 0(sp)
+; RV64I-NEXT:    ld ra, 8(sp)
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
   %1 = call i8* @llvm.frameaddress(i32 0)
   ret i8* %1
 }
@@ -35,6 +49,19 @@ define i8* @test_frameaddress_2() nounwind {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_frameaddress_2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp)
+; RV64I-NEXT:    sd s0, 0(sp)
+; RV64I-NEXT:    addi s0, sp, 16
+; RV64I-NEXT:    ld a0, -16(s0)
+; RV64I-NEXT:    ld a0, -16(a0)
+; RV64I-NEXT:    ld s0, 0(sp)
+; RV64I-NEXT:    ld ra, 8(sp)
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
   %1 = call i8* @llvm.frameaddress(i32 2)
   ret i8* %1
 }
@@ -55,6 +82,22 @@ define i8* @test_frameaddress_3_alloca() nounwind {
 ; RV32I-NEXT:    lw ra, 108(sp)
 ; RV32I-NEXT:    addi sp, sp, 112
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_frameaddress_3_alloca:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -128
+; RV64I-NEXT:    sd ra, 120(sp)
+; RV64I-NEXT:    sd s0, 112(sp)
+; RV64I-NEXT:    addi s0, sp, 128
+; RV64I-NEXT:    addi a0, s0, -116
+; RV64I-NEXT:    call notdead
+; RV64I-NEXT:    ld a0, -16(s0)
+; RV64I-NEXT:    ld a0, -16(a0)
+; RV64I-NEXT:    ld a0, -16(a0)
+; RV64I-NEXT:    ld s0, 112(sp)
+; RV64I-NEXT:    ld ra, 120(sp)
+; RV64I-NEXT:    addi sp, sp, 128
+; RV64I-NEXT:    ret
   %1 = alloca [100 x i8]
   %2 = bitcast [100 x i8]* %1 to i8*
   call void @notdead(i8* %2)
@@ -67,6 +110,11 @@ define i8* @test_returnaddress_0() nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a0, ra
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_returnaddress_0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    mv a0, ra
+; RV64I-NEXT:    ret
   %1 = call i8* @llvm.returnaddress(i32 0)
   ret i8* %1
 }
@@ -85,6 +133,20 @@ define i8* @test_returnaddress_2() nounwind {
 ; RV32I-NEXT:    lw ra, 12(sp)
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_returnaddress_2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp)
+; RV64I-NEXT:    sd s0, 0(sp)
+; RV64I-NEXT:    addi s0, sp, 16
+; RV64I-NEXT:    ld a0, -16(s0)
+; RV64I-NEXT:    ld a0, -16(a0)
+; RV64I-NEXT:    ld a0, -8(a0)
+; RV64I-NEXT:    ld s0, 0(sp)
+; RV64I-NEXT:    ld ra, 8(sp)
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
   %1 = call i8* @llvm.returnaddress(i32 2)
   ret i8* %1
 }
diff --git a/test/CodeGen/RISCV/get-setcc-result-type.ll b/test/CodeGen/RISCV/get-setcc-result-type.ll
index 003f3b367cf4bfb3dd64a9edf7000c7a9e247f53..507f04822b8658eb9017cd28296bde1ff6724282 100644
--- a/test/CodeGen/RISCV/get-setcc-result-type.ll
+++ b/test/CodeGen/RISCV/get-setcc-result-type.ll
@@ -6,22 +6,18 @@ define void @getSetCCResultType(<4 x i32>* %p, <4 x i32>* %q) {
 ; RV32I-LABEL: getSetCCResultType:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a1, 12(a0)
-; RV32I-NEXT:    xor a1, a1, zero
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw a1, 8(a0)
-; RV32I-NEXT:    xor a1, a1, zero
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    lw a1, 4(a0)
-; RV32I-NEXT:    xor a1, a1, zero
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    sw a1, 4(a0)
 ; RV32I-NEXT:    lw a1, 0(a0)
-; RV32I-NEXT:    xor a1, a1, zero
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    sw a1, 0(a0)
diff --git a/test/CodeGen/RISCV/i32-icmp.ll b/test/CodeGen/RISCV/i32-icmp.ll
index e1154948812a824b86ab2bab7ea42cd2ff2a1e9d..3b89504bbc02effb2539fb2e19224ffe6c7e92b4 100644
--- a/test/CodeGen/RISCV/i32-icmp.ll
+++ b/test/CodeGen/RISCV/i32-icmp.ll
@@ -16,6 +16,16 @@ define i32 @icmp_eq(i32 %a, i32 %b) nounwind {
   ret i32 %2
 }
 
+define i32 @icmp_eqz(i32 %a) nounwind {
+; RV32I-LABEL: icmp_eqz:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    seqz a0, a0
+; RV32I-NEXT:    ret
+  %1 = icmp eq i32 %a, 0
+  %2 = zext i1 %1 to i32
+  ret i32 %2
+}
+
 define i32 @icmp_ne(i32 %a, i32 %b) nounwind {
 ; RV32I-LABEL: icmp_ne:
 ; RV32I:       # %bb.0:
@@ -27,6 +37,16 @@ define i32 @icmp_ne(i32 %a, i32 %b) nounwind {
   ret i32 %2
 }
 
+define i32 @icmp_nez(i32 %a) nounwind {
+; RV32I-LABEL: icmp_nez:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    ret
+  %1 = icmp ne i32 %a, 0
+  %2 = zext i1 %1 to i32
+  ret i32 %2
+}
+
 define i32 @icmp_ugt(i32 %a, i32 %b) nounwind {
 ; RV32I-LABEL: icmp_ugt:
 ; RV32I:       # %bb.0:
diff --git a/test/CodeGen/RISCV/imm.ll b/test/CodeGen/RISCV/imm.ll
index b9e2f8a40c6c93c488db4a478cb48808890c85c4..872b6d5a9864b1b42c81e4573681b63f7198e2d6 100644
--- a/test/CodeGen/RISCV/imm.ll
+++ b/test/CodeGen/RISCV/imm.ll
@@ -1,63 +1,260 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
 
 ; Materializing constants
 
-define i32 @zero() nounwind {
+; TODO: It would be preferable if anyext constant returns were sign rather
+; than zero extended. See PR39092. For now, mark returns as explicitly signext
+; (this matches what Clang would generate for equivalent C/C++ anyway).
+
+define signext i32 @zero() nounwind {
 ; RV32I-LABEL: zero:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a0, zero
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zero:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    mv a0, zero
+; RV64I-NEXT:    ret
   ret i32 0
 }
 
-define i32 @pos_small() nounwind {
+define signext i32 @pos_small() nounwind {
 ; RV32I-LABEL: pos_small:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, zero, 2047
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: pos_small:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, 2047
+; RV64I-NEXT:    ret
   ret i32 2047
 }
 
-define i32 @neg_small() nounwind {
+define signext i32 @neg_small() nounwind {
 ; RV32I-LABEL: neg_small:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, zero, -2048
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: neg_small:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, -2048
+; RV64I-NEXT:    ret
   ret i32 -2048
 }
 
-define i32 @pos_i32() nounwind {
+define signext i32 @pos_i32() nounwind {
 ; RV32I-LABEL: pos_i32:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 423811
 ; RV32I-NEXT:    addi a0, a0, -1297
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: pos_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 423811
+; RV64I-NEXT:    addiw a0, a0, -1297
+; RV64I-NEXT:    ret
   ret i32 1735928559
 }
 
-define i32 @neg_i32() nounwind {
+define signext i32 @neg_i32() nounwind {
 ; RV32I-LABEL: neg_i32:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 912092
 ; RV32I-NEXT:    addi a0, a0, -273
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: neg_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 912092
+; RV64I-NEXT:    addiw a0, a0, -273
+; RV64I-NEXT:    ret
   ret i32 -559038737
 }
 
-define i32 @pos_i32_hi20_only() nounwind {
+define signext i32 @pos_i32_hi20_only() nounwind {
 ; RV32I-LABEL: pos_i32_hi20_only:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 16
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: pos_i32_hi20_only:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 16
+; RV64I-NEXT:    ret
   ret i32 65536
 }
 
-define i32 @neg_i32_hi20_only() nounwind {
+define signext i32 @neg_i32_hi20_only() nounwind {
 ; RV32I-LABEL: neg_i32_hi20_only:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 1048560
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: neg_i32_hi20_only:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 1048560
+; RV64I-NEXT:    ret
   ret i32 -65536
 }
+
+define i64 @imm64_1() nounwind {
+; RV32I-LABEL: imm64_1:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a0, 524288
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, 1
+; RV64I-NEXT:    slli a0, a0, 31
+; RV64I-NEXT:    ret
+  ret i64 2147483648
+}
+
+; TODO: This and similar constants with all 0s in the upper bits and all 1s in
+; the lower bits could be lowered to addi a0, zero, -1 followed by a logical
+; right shift.
+define i64 @imm64_2() nounwind {
+; RV32I-LABEL: imm64_2:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a0, zero, -1
+; RV32I-NEXT:    mv a1, zero
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, 1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    ret
+  ret i64 4294967295
+}
+
+define i64 @imm64_3() nounwind {
+; RV32I-LABEL: imm64_3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a1, zero, 1
+; RV32I-NEXT:    mv a0, zero
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, 1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    ret
+  ret i64 4294967296
+}
+
+define i64 @imm64_4() nounwind {
+; RV32I-LABEL: imm64_4:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    mv a0, zero
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_4:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, -1
+; RV64I-NEXT:    slli a0, a0, 63
+; RV64I-NEXT:    ret
+  ret i64 9223372036854775808
+}
+
+define i64 @imm64_5() nounwind {
+; RV32I-LABEL: imm64_5:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    mv a0, zero
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_5:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, -1
+; RV64I-NEXT:    slli a0, a0, 63
+; RV64I-NEXT:    ret
+  ret i64 -9223372036854775808
+}
+
+define i64 @imm64_6() nounwind {
+; RV32I-LABEL: imm64_6:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a0, 74565
+; RV32I-NEXT:    addi a1, a0, 1656
+; RV32I-NEXT:    mv a0, zero
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_6:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 9321
+; RV64I-NEXT:    addiw a0, a0, -1329
+; RV64I-NEXT:    slli a0, a0, 35
+; RV64I-NEXT:    ret
+  ret i64 1311768464867721216
+}
+
+define i64 @imm64_7() nounwind {
+; RV32I-LABEL: imm64_7:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a0, 45056
+; RV32I-NEXT:    addi a0, a0, 15
+; RV32I-NEXT:    lui a1, 458752
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_7:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, 7
+; RV64I-NEXT:    slli a0, a0, 36
+; RV64I-NEXT:    addi a0, a0, 11
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    addi a0, a0, 15
+; RV64I-NEXT:    ret
+  ret i64 8070450532432478223
+}
+
+; TODO: it can be preferable to put constants that are expensive to materialise
+; into the constant pool, especially for -Os.
+define i64 @imm64_8() nounwind {
+; RV32I-LABEL: imm64_8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a0, 633806
+; RV32I-NEXT:    addi a0, a0, -272
+; RV32I-NEXT:    lui a1, 74565
+; RV32I-NEXT:    addi a1, a1, 1656
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 583
+; RV64I-NEXT:    addiw a0, a0, -1875
+; RV64I-NEXT:    slli a0, a0, 14
+; RV64I-NEXT:    addi a0, a0, -947
+; RV64I-NEXT:    slli a0, a0, 12
+; RV64I-NEXT:    addi a0, a0, 1511
+; RV64I-NEXT:    slli a0, a0, 13
+; RV64I-NEXT:    addi a0, a0, -272
+; RV64I-NEXT:    ret
+  ret i64 1311768467463790320
+}
+
+define i64 @imm64_9() nounwind {
+; RV32I-LABEL: imm64_9:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a0, zero, -1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_9:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, zero, -1
+; RV64I-NEXT:    ret
+  ret i64 -1
+}
diff --git a/test/CodeGen/RISCV/mem64.ll b/test/CodeGen/RISCV/mem64.ll
new file mode 100644
index 0000000000000000000000000000000000000000..32669e8aef36bfe56876288c365fd8e07e11c41a
--- /dev/null
+++ b/test/CodeGen/RISCV/mem64.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV64I %s
+
+; Check indexed and unindexed, sext, zext and anyext loads
+
+define i64 @lb(i8 *%a) nounwind {
+; RV64I-LABEL: lb:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lb a1, 0(a0)
+; RV64I-NEXT:    lb a0, 1(a0)
+; RV64I-NEXT:    ret
+  %1 = getelementptr i8, i8* %a, i32 1
+  %2 = load i8, i8* %1
+  %3 = sext i8 %2 to i64
+  ; the unused load will produce an anyext for selection
+  %4 = load volatile i8, i8* %a
+  ret i64 %3
+}
+
+define i64 @lh(i16 *%a) nounwind {
+; RV64I-LABEL: lh:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lh a1, 0(a0)
+; RV64I-NEXT:    lh a0, 4(a0)
+; RV64I-NEXT:    ret
+  %1 = getelementptr i16, i16* %a, i32 2
+  %2 = load i16, i16* %1
+  %3 = sext i16 %2 to i64
+  ; the unused load will produce an anyext for selection
+  %4 = load volatile i16, i16* %a
+  ret i64 %3
+}
+
+define i64 @lw(i32 *%a) nounwind {
+; RV64I-LABEL: lw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lw a1, 0(a0)
+; RV64I-NEXT:    lw a0, 12(a0)
+; RV64I-NEXT:    ret
+  %1 = getelementptr i32, i32* %a, i32 3
+  %2 = load i32, i32* %1
+  %3 = sext i32 %2 to i64
+  ; the unused load will produce an anyext for selection
+  %4 = load volatile i32, i32* %a
+  ret i64 %3
+}
+
+define i64 @lbu(i8 *%a) nounwind {
+; RV64I-LABEL: lbu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    lbu a0, 4(a0)
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = getelementptr i8, i8* %a, i32 4
+  %2 = load i8, i8* %1
+  %3 = zext i8 %2 to i64
+  %4 = load volatile i8, i8* %a
+  %5 = zext i8 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+define i64 @lhu(i16 *%a) nounwind {
+; RV64I-LABEL: lhu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 10(a0)
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = getelementptr i16, i16* %a, i32 5
+  %2 = load i16, i16* %1
+  %3 = zext i16 %2 to i64
+  %4 = load volatile i16, i16* %a
+  %5 = zext i16 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+define i64 @lwu(i32 *%a) nounwind {
+; RV64I-LABEL: lwu:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lwu a1, 0(a0)
+; RV64I-NEXT:    lwu a0, 24(a0)
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = getelementptr i32, i32* %a, i32 6
+  %2 = load i32, i32* %1
+  %3 = zext i32 %2 to i64
+  %4 = load volatile i32, i32* %a
+  %5 = zext i32 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+; Check indexed and unindexed stores
+
+define void @sb(i8 *%a, i8 %b) nounwind {
+; RV64I-LABEL: sb:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sb a1, 7(a0)
+; RV64I-NEXT:    sb a1, 0(a0)
+; RV64I-NEXT:    ret
+  store i8 %b, i8* %a
+  %1 = getelementptr i8, i8* %a, i32 7
+  store i8 %b, i8* %1
+  ret void
+}
+
+define void @sh(i16 *%a, i16 %b) nounwind {
+; RV64I-LABEL: sh:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sh a1, 16(a0)
+; RV64I-NEXT:    sh a1, 0(a0)
+; RV64I-NEXT:    ret
+  store i16 %b, i16* %a
+  %1 = getelementptr i16, i16* %a, i32 8
+  store i16 %b, i16* %1
+  ret void
+}
+
+define void @sw(i32 *%a, i32 %b) nounwind {
+; RV64I-LABEL: sw:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sw a1, 36(a0)
+; RV64I-NEXT:    sw a1, 0(a0)
+; RV64I-NEXT:    ret
+  store i32 %b, i32* %a
+  %1 = getelementptr i32, i32* %a, i32 9
+  store i32 %b, i32* %1
+  ret void
+}
+
+; 64-bit loads and stores
+
+define i64 @ld(i64 *%a) nounwind {
+; RV64I-LABEL: ld:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ld a1, 0(a0)
+; RV64I-NEXT:    ld a0, 80(a0)
+; RV64I-NEXT:    ret
+  %1 = getelementptr i64, i64* %a, i32 10
+  %2 = load i64, i64* %1
+  %3 = load volatile i64, i64* %a
+  ret i64 %2
+}
+
+define void @sd(i64 *%a, i64 %b) nounwind {
+; RV64I-LABEL: sd:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sd a1, 88(a0)
+; RV64I-NEXT:    sd a1, 0(a0)
+; RV64I-NEXT:    ret
+  store i64 %b, i64* %a
+  %1 = getelementptr i64, i64* %a, i32 11
+  store i64 %b, i64* %1
+  ret void
+}
+
+; Check load and store to an i1 location
+define i64 @load_sext_zext_anyext_i1(i1 *%a) nounwind {
+; RV64I-LABEL: load_sext_zext_anyext_i1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lb a1, 0(a0)
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    lbu a0, 2(a0)
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  ; sextload i1
+  %1 = getelementptr i1, i1* %a, i32 1
+  %2 = load i1, i1* %1
+  %3 = sext i1 %2 to i64
+  ; zextload i1
+  %4 = getelementptr i1, i1* %a, i32 2
+  %5 = load i1, i1* %4
+  %6 = zext i1 %5 to i64
+  %7 = add i64 %3, %6
+  ; extload i1 (anyext). Produced as the load is unused.
+  %8 = load volatile i1, i1* %a
+  ret i64 %7
+}
+
+define i16 @load_sext_zext_anyext_i1_i16(i1 *%a) nounwind {
+; RV64I-LABEL: load_sext_zext_anyext_i1_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lb a1, 0(a0)
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    lbu a0, 2(a0)
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  ; sextload i1
+  %1 = getelementptr i1, i1* %a, i32 1
+  %2 = load i1, i1* %1
+  %3 = sext i1 %2 to i16
+  ; zextload i1
+  %4 = getelementptr i1, i1* %a, i32 2
+  %5 = load i1, i1* %4
+  %6 = zext i1 %5 to i16
+  %7 = add i16 %3, %6
+  ; extload i1 (anyext). Produced as the load is unused.
+  %8 = load volatile i1, i1* %a
+  ret i16 %7
+}
+
+; Check load and store to a global
+@G = global i64 0
+
+define i64 @ld_sd_global(i64 %a) nounwind {
+; RV64I-LABEL: ld_sd_global:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a2, %hi(G)
+; RV64I-NEXT:    ld a1, %lo(G)(a2)
+; RV64I-NEXT:    sd a0, %lo(G)(a2)
+; RV64I-NEXT:    addi a2, a2, %lo(G)
+; RV64I-NEXT:    ld a3, 72(a2)
+; RV64I-NEXT:    sd a0, 72(a2)
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    ret
+  %1 = load volatile i64, i64* @G
+  store i64 %a, i64* @G
+  %2 = getelementptr i64, i64* @G, i64 9
+  %3 = load volatile i64, i64* %2
+  store i64 %a, i64* %2
+  ret i64 %1
+}
diff --git a/test/CodeGen/RISCV/option-norelax.ll b/test/CodeGen/RISCV/option-norelax.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ad2b324937902edba2f4f9bce920676369975e96
--- /dev/null
+++ b/test/CodeGen/RISCV/option-norelax.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=riscv32 -mattr=+relax -filetype=obj < %s \
+; RUN:     | llvm-objdump -d -r - | FileCheck %s
+
+; This test demonstrates that .option norelax has no effect on codegen
+; when emitting an ELF directly.
+
+declare i32 @foo(i32)
+
+define i32 @bar(i32 %a) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: R_RISCV_CALL
+; CHECK: R_RISCV_RELAX
+  tail call void asm sideeffect ".option norelax", ""()
+  %1 = call i32 @foo(i32 %a)
+  ret i32 %1
+}
diff --git a/test/CodeGen/RISCV/option-relax.ll b/test/CodeGen/RISCV/option-relax.ll
new file mode 100644
index 0000000000000000000000000000000000000000..262969905611d8c98fc22b2c4bd00f4742a35692
--- /dev/null
+++ b/test/CodeGen/RISCV/option-relax.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=riscv32 -mattr=-relax -filetype=obj < %s \
+; RUN:     | llvm-objdump -d -r - | FileCheck %s
+
+; This test demonstrates that .option relax has no effect on codegen
+; when emitting an ELF directly.
+
+declare i32 @foo(i32)
+
+define i32 @bar(i32 %a) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: R_RISCV_CALL
+; CHECK-NOT: R_RISCV_RELAX
+  tail call void asm sideeffect ".option relax", ""()
+  %1 = call i32 @foo(i32 %a)
+  ret i32 %1
+}
diff --git a/test/CodeGen/RISCV/prefetch.ll b/test/CodeGen/RISCV/prefetch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7891b2e8c7498de697bd384b60a6f3d046bad6aa
--- /dev/null
+++ b/test/CodeGen/RISCV/prefetch.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32I %s
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV64I %s
+
+declare void @llvm.prefetch(i8*, i32, i32, i32)
+
+define void @test_prefetch(i8* %a) nounwind {
+; RV32I-LABEL: test_prefetch:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: test_prefetch:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
+  call void @llvm.prefetch(i8* %a, i32 0, i32 1, i32 2)
+  ret void
+}
diff --git a/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll b/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..52a59c04a3253cba4973c8508266d12e478449bd
--- /dev/null
+++ b/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll
@@ -0,0 +1,1769 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
+
+; The patterns for the 'W' suffixed RV64I instructions have the potential of
+; missing cases. This file checks all the variants of
+; sign-extended/zero-extended/any-extended inputs and outputs.
+
+; The 64-bit add instruction can safely be used when the result is anyext.
+
+define i32 @aext_addw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_addw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_addw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_addw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_addw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_addw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_addw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_addw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_addw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_addw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_addw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_addw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_addw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_addw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_addw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_addw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_addw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_addw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+; Always select addw when a signext result is required.
+
+define signext i32 @sext_addw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_addw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_addw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_addw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_addw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_addw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_addw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_addw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_addw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_addw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_addw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_addw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_addw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_addw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_addw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_addw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_addw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_addw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+; 64-bit add followed by zero-extension is a safe option when a zeroext result
+; is required.
+
+define zeroext i32 @zext_addw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_addw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_addw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_addw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_addw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_addw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_addw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_addw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_addw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_addw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+; 64-bit sub is safe for an anyext result.
+
+define i32 @aext_subw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_subw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_subw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_subw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_subw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_subw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_subw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_subw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_subw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_subw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_subw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_subw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+; Always select subw for a signext result.
+
+define signext i32 @sext_subw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_subw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_subw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_subw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_subw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_subw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_subw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_subw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_subw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_subw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_subw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_subw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+; 64-bit sub followed by zero-extension is safe for a zeroext result.
+
+define zeroext i32 @zext_subw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_subw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_subw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_subw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_subw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_subw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_subw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_subw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_subw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_subw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_subw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_subw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+; 64-bit sll is a safe choice for an anyext result.
+
+define i32 @aext_sllw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_sllw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sllw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_sllw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sllw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_sllw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sllw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_sllw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sllw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_sllw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sllw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_sllw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sllw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_sllw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sllw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_sllw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sllw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_sllw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+; TODO: Select sllw for all cases witha signext result.
+
+define signext i32 @sext_sllw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_sllw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sllw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_sllw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sllw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_sllw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sllw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_sllw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sllw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_sllw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sllw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_sllw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sllw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_sllw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sllw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_sllw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sllw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_sllw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+; 64-bit sll followed by zero-extension for a zeroext result.
+
+define zeroext i32 @zext_sllw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_sllw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sllw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_sllw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sllw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_sllw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sllw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_sllw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sllw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_sllw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sllw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_sllw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sllw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_sllw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sllw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_sllw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sllw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_sllw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+; TODO: srlw should be selected for 32-bit lshr with variable arguments.
+
+define i32 @aext_srlw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_srlw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_srlw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_srlw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_srlw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_srlw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_srlw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_srlw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_srlw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_srlw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_srlw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_srlw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_srlw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_srlw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_srlw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_srlw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_srlw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_srlw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_srlw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_srlw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_srlw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_srlw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_srlw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_srlw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_srlw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_srlw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_srlw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_srlw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_srlw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_srlw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_srlw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_srlw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_srlw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_srlw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_srlw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_srlw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srlw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_srlw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+; TODO: sraw should be selected if the first operand is not sign-extended. If the
+; first operand is sign-extended, sra is equivalent for the test cases below.
+
+define i32 @aext_sraw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_sraw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sraw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_sraw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sraw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_sraw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sraw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_sraw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sraw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_sraw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sraw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_sraw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sraw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: aext_sraw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sraw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: aext_sraw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define i32 @aext_sraw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: aext_sraw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_sraw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_sraw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_sraw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_sraw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_sraw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_sraw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: sext_sraw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: sext_sraw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define signext i32 @sext_sraw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: sext_sraw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_aext_aext(i32 %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_sraw_aext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_aext_sext(i32 %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_sraw_aext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_sraw_aext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_sext_aext(i32 signext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_sraw_sext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_sraw_sext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_sraw_sext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
+; RV64I-LABEL: zext_sraw_zext_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
+; RV64I-LABEL: zext_sraw_zext_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
+; RV64I-LABEL: zext_sraw_zext_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+; addiw should be selected when there is a signext result.
+
+define i32 @aext_addiw_aext(i32 %a) nounwind {
+; RV64I-LABEL: aext_addiw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 1
+  ret i32 %1
+}
+
+define i32 @aext_addiw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: aext_addiw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 2
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 2
+  ret i32 %1
+}
+
+define i32 @aext_addiw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: aext_addiw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 3
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 3
+  ret i32 %1
+}
+
+define signext i32 @sext_addiw_aext(i32 %a) nounwind {
+; RV64I-LABEL: sext_addiw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addiw a0, a0, 4
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 4
+  ret i32 %1
+}
+
+define signext i32 @sext_addiw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: sext_addiw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addiw a0, a0, 5
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 5
+  ret i32 %1
+}
+
+define signext i32 @sext_addiw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: sext_addiw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addiw a0, a0, 6
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 6
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addiw_aext(i32 %a) nounwind {
+; RV64I-LABEL: zext_addiw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 7
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addiw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: zext_addiw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 8
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 8
+  ret i32 %1
+}
+
+define zeroext i32 @zext_addiw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: zext_addiw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a0, a0, 9
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = add i32 %a, 9
+  ret i32 %1
+}
+
+; slliw should be selected whenever the return is signext.
+
+define i32 @aext_slliw_aext(i32 %a) nounwind {
+; RV64I-LABEL: aext_slliw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 1
+  ret i32 %1
+}
+
+define i32 @aext_slliw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: aext_slliw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 2
+  ret i32 %1
+}
+
+define i32 @aext_slliw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: aext_slliw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 3
+  ret i32 %1
+}
+
+define signext i32 @sext_slliw_aext(i32 %a) nounwind {
+; RV64I-LABEL: sext_slliw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slliw a0, a0, 4
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 4
+  ret i32 %1
+}
+
+define signext i32 @sext_slliw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: sext_slliw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slliw a0, a0, 5
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 5
+  ret i32 %1
+}
+
+define signext i32 @sext_slliw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: sext_slliw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slliw a0, a0, 6
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 6
+  ret i32 %1
+}
+
+; TODO: the constant shifts could be combined.
+
+define zeroext i32 @zext_slliw_aext(i32 %a) nounwind {
+; RV64I-LABEL: zext_slliw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 7
+  ret i32 %1
+}
+
+define zeroext i32 @zext_slliw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: zext_slliw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 8
+  ret i32 %1
+}
+
+define zeroext i32 @zext_slliw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: zext_slliw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 9
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = shl i32 %a, 9
+  ret i32 %1
+}
+
+; srliw should be selected unless the first operand is zeroext, when srli is
+; equivalent.
+
+define i32 @aext_srliw_aext(i32 %a) nounwind {
+; RV64I-LABEL: aext_srliw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a0, a0, 1
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 1
+  ret i32 %1
+}
+
+define i32 @aext_srliw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: aext_srliw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a0, a0, 2
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 2
+  ret i32 %1
+}
+
+define i32 @aext_srliw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: aext_srliw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a0, a0, 3
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 3
+  ret i32 %1
+}
+
+define signext i32 @sext_srliw_aext(i32 %a) nounwind {
+; RV64I-LABEL: sext_srliw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a0, a0, 4
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 4
+  ret i32 %1
+}
+
+define signext i32 @sext_srliw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: sext_srliw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a0, a0, 5
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 5
+  ret i32 %1
+}
+
+define signext i32 @sext_srliw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: sext_srliw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a0, a0, 6
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 6
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srliw_aext(i32 %a) nounwind {
+; RV64I-LABEL: zext_srliw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a0, a0, 7
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 7
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srliw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: zext_srliw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a0, a0, 8
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 8
+  ret i32 %1
+}
+
+define zeroext i32 @zext_srliw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: zext_srliw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a0, a0, 9
+; RV64I-NEXT:    ret
+  %1 = lshr i32 %a, 9
+  ret i32 %1
+}
+
+; srai is equivalent to sraiw if the first operand is sign-extended.
+
+define i32 @aext_sraiw_aext(i32 %a) nounwind {
+; RV64I-LABEL: aext_sraiw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sraiw a0, a0, 1
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 1
+  ret i32 %1
+}
+
+define i32 @aext_sraiw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: aext_sraiw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srai a0, a0, 2
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 2
+  ret i32 %1
+}
+
+define i32 @aext_sraiw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: aext_sraiw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sraiw a0, a0, 3
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 3
+  ret i32 %1
+}
+
+define signext i32 @sext_sraiw_aext(i32 %a) nounwind {
+; RV64I-LABEL: sext_sraiw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sraiw a0, a0, 4
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 4
+  ret i32 %1
+}
+
+define signext i32 @sext_sraiw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: sext_sraiw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srai a0, a0, 5
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 5
+  ret i32 %1
+}
+
+define signext i32 @sext_sraiw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: sext_sraiw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sraiw a0, a0, 6
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 6
+  ret i32 %1
+}
+
+; TODO: sraiw could be selected rather than sext.w and srli. Alternatively,
+; the srli could be merged in to the shifts used for zero-extension.
+
+define zeroext i32 @zext_sraiw_aext(i32 %a) nounwind {
+; RV64I-LABEL: zext_sraiw_aext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    srli a0, a0, 7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 7
+  ret i32 %1
+}
+
+define zeroext i32 @zext_sraiw_sext(i32 signext %a) nounwind {
+; RV64I-LABEL: zext_sraiw_sext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 8
+  ret i32 %1
+}
+
+; TODO: sraiw could be selected rather than sext.w and srli. Alternatively,
+; the srli could be merged in to the shifts used for zero-extension.
+
+define zeroext i32 @zext_sraiw_zext(i32 zeroext %a) nounwind {
+; RV64I-LABEL: zext_sraiw_zext:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    srli a0, a0, 9
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
+  %1 = ashr i32 %a, 9
+  ret i32 %1
+}
diff --git a/test/CodeGen/RISCV/rv64i-tricky-shifts.ll b/test/CodeGen/RISCV/rv64i-tricky-shifts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..73eeed7553cce9141f88bcf1aef22f215e1f5465
--- /dev/null
+++ b/test/CodeGen/RISCV/rv64i-tricky-shifts.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
+
+; These tests must not compile to sllw/srlw/sraw, as this would be semantically
+; incorrect in the case that %b holds a value between 32 and 63. Selection
+; patterns might make the mistake of assuming that a (sext_inreg foo, i32) can
+; only be produced when sign-extending an i32 type.
+
+define i64 @tricky_shl(i64 %a, i64 %b) {
+; RV64I-LABEL: tricky_shl:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
+  %1 = shl i64 %a, %b
+  %2 = shl i64 %1, 32
+  %3 = ashr i64 %2, 32
+  ret i64 %3
+}
+
+define i64 @tricky_lshr(i64 %a, i64 %b) {
+; RV64I-LABEL: tricky_lshr:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = and i64 %a, 4294967295
+  %2 = lshr i64 %1, %b
+  ret i64 %2
+}
+
+define i64 @tricky_ashr(i64 %a, i64 %b) {
+; RV64I-LABEL: tricky_ashr:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %1 = shl i64 %a, 32
+  %2 = ashr i64 %1, 32
+  %3 = ashr i64 %2, %b
+  ret i64 %3
+}
diff --git a/test/CodeGen/RISCV/sext-zext-trunc.ll b/test/CodeGen/RISCV/sext-zext-trunc.ll
index 88d0fedea980351c705897c7aae57a47fcbbabc1..280c68e2ab496f9eab8bd3df78c51e4d1c33c3f9 100644
--- a/test/CodeGen/RISCV/sext-zext-trunc.ll
+++ b/test/CodeGen/RISCV/sext-zext-trunc.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
 
 define i8 @sext_i1_to_i8(i1 %a) {
 ; RV32I-LABEL: sext_i1_to_i8:
@@ -8,6 +10,12 @@ define i8 @sext_i1_to_i8(i1 %a) {
 ; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i1_to_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    neg a0, a0
+; RV64I-NEXT:    ret
   %1 = sext i1 %a to i8
   ret i8 %1
 }
@@ -18,6 +26,12 @@ define i16 @sext_i1_to_i16(i1 %a) {
 ; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i1_to_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    neg a0, a0
+; RV64I-NEXT:    ret
   %1 = sext i1 %a to i16
   ret i16 %1
 }
@@ -28,6 +42,12 @@ define i32 @sext_i1_to_i32(i1 %a) {
 ; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i1_to_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    neg a0, a0
+; RV64I-NEXT:    ret
   %1 = sext i1 %a to i32
   ret i32 %1
 }
@@ -39,6 +59,12 @@ define i64 @sext_i1_to_i64(i1 %a) {
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i1_to_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    neg a0, a0
+; RV64I-NEXT:    ret
   %1 = sext i1 %a to i64
   ret i64 %1
 }
@@ -49,6 +75,12 @@ define i16 @sext_i8_to_i16(i8 %a) {
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i8_to_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a0, a0, 56
+; RV64I-NEXT:    ret
   %1 = sext i8 %a to i16
   ret i16 %1
 }
@@ -59,6 +91,12 @@ define i32 @sext_i8_to_i32(i8 %a) {
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i8_to_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a0, a0, 56
+; RV64I-NEXT:    ret
   %1 = sext i8 %a to i32
   ret i32 %1
 }
@@ -70,6 +108,12 @@ define i64 @sext_i8_to_i64(i8 %a) {
 ; RV32I-NEXT:    srai a0, a1, 24
 ; RV32I-NEXT:    srai a1, a1, 31
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i8_to_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a0, a0, 56
+; RV64I-NEXT:    ret
   %1 = sext i8 %a to i64
   ret i64 %1
 }
@@ -80,6 +124,12 @@ define i32 @sext_i16_to_i32(i16 %a) {
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i16_to_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a0, a0, 48
+; RV64I-NEXT:    ret
   %1 = sext i16 %a to i32
   ret i32 %1
 }
@@ -91,6 +141,12 @@ define i64 @sext_i16_to_i64(i16 %a) {
 ; RV32I-NEXT:    srai a0, a1, 16
 ; RV32I-NEXT:    srai a1, a1, 31
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i16_to_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a0, a0, 48
+; RV64I-NEXT:    ret
   %1 = sext i16 %a to i64
   ret i64 %1
 }
@@ -100,6 +156,11 @@ define i64 @sext_i32_to_i64(i32 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: sext_i32_to_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    ret
   %1 = sext i32 %a to i64
   ret i64 %1
 }
@@ -109,6 +170,11 @@ define i8 @zext_i1_to_i8(i1 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i1_to_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    ret
   %1 = zext i1 %a to i8
   ret i8 %1
 }
@@ -118,6 +184,11 @@ define i16 @zext_i1_to_i16(i1 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i1_to_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    ret
   %1 = zext i1 %a to i16
   ret i16 %1
 }
@@ -127,6 +198,11 @@ define i32 @zext_i1_to_i32(i1 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i1_to_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    ret
   %1 = zext i1 %a to i32
   ret i32 %1
 }
@@ -137,6 +213,11 @@ define i64 @zext_i1_to_i64(i1 %a) {
 ; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    mv a1, zero
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i1_to_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    ret
   %1 = zext i1 %a to i64
   ret i64 %1
 }
@@ -146,6 +227,11 @@ define i16 @zext_i8_to_i16(i8 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a0, a0, 255
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i8_to_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    ret
   %1 = zext i8 %a to i16
   ret i16 %1
 }
@@ -155,6 +241,11 @@ define i32 @zext_i8_to_i32(i8 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a0, a0, 255
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i8_to_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    ret
   %1 = zext i8 %a to i32
   ret i32 %1
 }
@@ -165,6 +256,11 @@ define i64 @zext_i8_to_i64(i8 %a) {
 ; RV32I-NEXT:    andi a0, a0, 255
 ; RV32I-NEXT:    mv a1, zero
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i8_to_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    ret
   %1 = zext i8 %a to i64
   ret i64 %1
 }
@@ -176,6 +272,13 @@ define i32 @zext_i16_to_i32(i16 %a) {
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i16_to_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a1, 16
+; RV64I-NEXT:    addiw a1, a1, -1
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = zext i16 %a to i32
   ret i32 %1
 }
@@ -188,6 +291,13 @@ define i64 @zext_i16_to_i64(i16 %a) {
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    mv a1, zero
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i16_to_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a1, 16
+; RV64I-NEXT:    addiw a1, a1, -1
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
   %1 = zext i16 %a to i64
   ret i64 %1
 }
@@ -197,17 +307,24 @@ define i64 @zext_i32_to_i64(i32 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a1, zero
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: zext_i32_to_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    ret
   %1 = zext i32 %a to i64
   ret i64 %1
 }
 
-; TODO: should the trunc tests explicitly ensure no code is generated before
-; jalr?
-
 define i1 @trunc_i8_to_i1(i8 %a) {
 ; RV32I-LABEL: trunc_i8_to_i1:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i8_to_i1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i8 %a to i1
   ret i1 %1
 }
@@ -216,6 +333,10 @@ define i1 @trunc_i16_to_i1(i16 %a) {
 ; RV32I-LABEL: trunc_i16_to_i1:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i16_to_i1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i16 %a to i1
   ret i1 %1
 }
@@ -224,6 +345,10 @@ define i1 @trunc_i32_to_i1(i32 %a) {
 ; RV32I-LABEL: trunc_i32_to_i1:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i32_to_i1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i32 %a to i1
   ret i1 %1
 }
@@ -232,6 +357,10 @@ define i1 @trunc_i64_to_i1(i64 %a) {
 ; RV32I-LABEL: trunc_i64_to_i1:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i64_to_i1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i64 %a to i1
   ret i1 %1
 }
@@ -240,6 +369,10 @@ define i8 @trunc_i16_to_i8(i16 %a) {
 ; RV32I-LABEL: trunc_i16_to_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i16_to_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i16 %a to i8
   ret i8 %1
 }
@@ -248,6 +381,10 @@ define i8 @trunc_i32_to_i8(i32 %a) {
 ; RV32I-LABEL: trunc_i32_to_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i32_to_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i32 %a to i8
   ret i8 %1
 }
@@ -256,6 +393,10 @@ define i8 @trunc_i64_to_i8(i64 %a) {
 ; RV32I-LABEL: trunc_i64_to_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i64_to_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i64 %a to i8
   ret i8 %1
 }
@@ -264,6 +405,10 @@ define i16 @trunc_i32_to_i16(i32 %a) {
 ; RV32I-LABEL: trunc_i32_to_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i32_to_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i32 %a to i16
   ret i16 %1
 }
@@ -272,6 +417,10 @@ define i16 @trunc_i64_to_i16(i64 %a) {
 ; RV32I-LABEL: trunc_i64_to_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i64_to_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i64 %a to i16
   ret i16 %1
 }
@@ -280,6 +429,10 @@ define i32 @trunc_i64_to_i32(i64 %a) {
 ; RV32I-LABEL: trunc_i64_to_i32:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: trunc_i64_to_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    ret
   %1 = trunc i64 %a to i32
   ret i32 %1
 }
diff --git a/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 9f48a2018ce714c5e6ce21218e379161db251b54..d00208ef6803b00df9850c58f1b35a4f9900a4db 100644
--- a/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -4,129 +4,121 @@
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; RISCV32-LABEL: muloti_test:
 ; RISCV32:       # %bb.0: # %start
-; RISCV32-NEXT:    addi sp, sp, -80
-; RISCV32-NEXT:    sw ra, 76(sp)
-; RISCV32-NEXT:    sw s1, 72(sp)
-; RISCV32-NEXT:    sw s2, 68(sp)
-; RISCV32-NEXT:    sw s3, 64(sp)
-; RISCV32-NEXT:    sw s4, 60(sp)
-; RISCV32-NEXT:    sw s5, 56(sp)
-; RISCV32-NEXT:    sw s6, 52(sp)
-; RISCV32-NEXT:    sw s7, 48(sp)
+; RISCV32-NEXT:    addi sp, sp, -96
+; RISCV32-NEXT:    sw ra, 92(sp)
+; RISCV32-NEXT:    sw s1, 88(sp)
+; RISCV32-NEXT:    sw s2, 84(sp)
+; RISCV32-NEXT:    sw s3, 80(sp)
+; RISCV32-NEXT:    sw s4, 76(sp)
+; RISCV32-NEXT:    sw s5, 72(sp)
+; RISCV32-NEXT:    sw s6, 68(sp)
+; RISCV32-NEXT:    sw s7, 64(sp)
+; RISCV32-NEXT:    sw s8, 60(sp)
 ; RISCV32-NEXT:    mv s3, a2
 ; RISCV32-NEXT:    mv s1, a1
 ; RISCV32-NEXT:    mv s2, a0
-; RISCV32-NEXT:    sw zero, 12(sp)
-; RISCV32-NEXT:    sw zero, 8(sp)
-; RISCV32-NEXT:    sw zero, 28(sp)
-; RISCV32-NEXT:    sw zero, 24(sp)
+; RISCV32-NEXT:    mv s4, zero
+; RISCV32-NEXT:    sw zero, 20(sp)
+; RISCV32-NEXT:    sw zero, 16(sp)
+; RISCV32-NEXT:    sw zero, 36(sp)
+; RISCV32-NEXT:    sw zero, 32(sp)
 ; RISCV32-NEXT:    lw s5, 4(a2)
-; RISCV32-NEXT:    sw s5, 4(sp)
-; RISCV32-NEXT:    lw s6, 0(a2)
-; RISCV32-NEXT:    sw s6, 0(sp)
-; RISCV32-NEXT:    lw s4, 4(a1)
-; RISCV32-NEXT:    sw s4, 20(sp)
-; RISCV32-NEXT:    lw s7, 0(a1)
-; RISCV32-NEXT:    sw s7, 16(sp)
-; RISCV32-NEXT:    addi a0, sp, 32
-; RISCV32-NEXT:    addi a1, sp, 16
-; RISCV32-NEXT:    mv a2, sp
+; RISCV32-NEXT:    sw s5, 12(sp)
+; RISCV32-NEXT:    lw s7, 0(a2)
+; RISCV32-NEXT:    sw s7, 8(sp)
+; RISCV32-NEXT:    lw s6, 4(a1)
+; RISCV32-NEXT:    sw s6, 28(sp)
+; RISCV32-NEXT:    lw s8, 0(a1)
+; RISCV32-NEXT:    sw s8, 24(sp)
+; RISCV32-NEXT:    addi a0, sp, 40
+; RISCV32-NEXT:    addi a1, sp, 24
+; RISCV32-NEXT:    addi a2, sp, 8
 ; RISCV32-NEXT:    call __multi3
-; RISCV32-NEXT:    lw t1, 12(s1)
+; RISCV32-NEXT:    lw t2, 12(s1)
 ; RISCV32-NEXT:    lw a1, 8(s1)
 ; RISCV32-NEXT:    mul a0, s5, a1
-; RISCV32-NEXT:    mul a2, t1, s6
+; RISCV32-NEXT:    mul a2, t2, s7
 ; RISCV32-NEXT:    add a0, a2, a0
-; RISCV32-NEXT:    lw t5, 12(s3)
+; RISCV32-NEXT:    lw t3, 12(s3)
 ; RISCV32-NEXT:    lw a3, 8(s3)
-; RISCV32-NEXT:    mul a2, s4, a3
-; RISCV32-NEXT:    mul a4, t5, s7
+; RISCV32-NEXT:    mul a2, s6, a3
+; RISCV32-NEXT:    mul a4, t3, s8
 ; RISCV32-NEXT:    add a2, a4, a2
-; RISCV32-NEXT:    mul a4, a3, s7
-; RISCV32-NEXT:    mul a5, a1, s6
-; RISCV32-NEXT:    add s1, a5, a4
-; RISCV32-NEXT:    sltu a4, s1, a5
-; RISCV32-NEXT:    mulhu a6, a3, s7
+; RISCV32-NEXT:    mul a4, a3, s8
+; RISCV32-NEXT:    mul a5, a1, s7
+; RISCV32-NEXT:    add a4, a5, a4
+; RISCV32-NEXT:    sltu a5, a4, a5
+; RISCV32-NEXT:    mulhu a6, a3, s8
 ; RISCV32-NEXT:    add a7, a6, a2
-; RISCV32-NEXT:    mulhu t2, a1, s6
-; RISCV32-NEXT:    add t4, t2, a0
-; RISCV32-NEXT:    add a0, t4, a7
-; RISCV32-NEXT:    add a0, a0, a4
-; RISCV32-NEXT:    xor a2, s5, zero
-; RISCV32-NEXT:    snez a2, a2
-; RISCV32-NEXT:    xor a4, t1, zero
-; RISCV32-NEXT:    snez a4, a4
-; RISCV32-NEXT:    and a2, a4, a2
-; RISCV32-NEXT:    xor a4, s4, zero
-; RISCV32-NEXT:    snez a4, a4
-; RISCV32-NEXT:    xor a5, t5, zero
-; RISCV32-NEXT:    snez a5, a5
-; RISCV32-NEXT:    and a4, a5, a4
-; RISCV32-NEXT:    mulhu a5, t5, s7
-; RISCV32-NEXT:    xor a5, a5, zero
-; RISCV32-NEXT:    snez a5, a5
-; RISCV32-NEXT:    or t0, a4, a5
-; RISCV32-NEXT:    mulhu a4, t1, s6
-; RISCV32-NEXT:    xor a4, a4, zero
-; RISCV32-NEXT:    snez a4, a4
-; RISCV32-NEXT:    or t3, a2, a4
-; RISCV32-NEXT:    lw a4, 44(sp)
-; RISCV32-NEXT:    add a5, a4, a0
-; RISCV32-NEXT:    lw a2, 40(sp)
-; RISCV32-NEXT:    add a0, a2, s1
-; RISCV32-NEXT:    sltu t6, a0, a2
-; RISCV32-NEXT:    add s1, a5, t6
-; RISCV32-NEXT:    beq s1, a4, .LBB0_2
+; RISCV32-NEXT:    mulhu t0, a1, s7
+; RISCV32-NEXT:    add t1, t0, a0
+; RISCV32-NEXT:    add a0, t1, a7
+; RISCV32-NEXT:    add a2, a0, a5
+; RISCV32-NEXT:    lw a0, 52(sp)
+; RISCV32-NEXT:    add a5, a0, a2
+; RISCV32-NEXT:    lw a2, 48(sp)
+; RISCV32-NEXT:    add t4, a2, a4
+; RISCV32-NEXT:    sltu s1, t4, a2
+; RISCV32-NEXT:    add a4, a5, s1
+; RISCV32-NEXT:    beq a4, a0, .LBB0_2
 ; RISCV32-NEXT:  # %bb.1: # %start
-; RISCV32-NEXT:    sltu t6, s1, a4
+; RISCV32-NEXT:    sltu s1, a4, a0
 ; RISCV32-NEXT:  .LBB0_2: # %start
-; RISCV32-NEXT:    xor a4, s1, a4
-; RISCV32-NEXT:    xor a2, a0, a2
-; RISCV32-NEXT:    or a2, a2, a4
-; RISCV32-NEXT:    sltu t2, t4, t2
-; RISCV32-NEXT:    mulhu a4, s5, a1
-; RISCV32-NEXT:    xor a4, a4, zero
-; RISCV32-NEXT:    snez a4, a4
-; RISCV32-NEXT:    or t3, t3, a4
-; RISCV32-NEXT:    sltu a6, a7, a6
-; RISCV32-NEXT:    mulhu a4, s4, a3
-; RISCV32-NEXT:    xor a4, a4, zero
-; RISCV32-NEXT:    snez a4, a4
-; RISCV32-NEXT:    or a4, t0, a4
-; RISCV32-NEXT:    lw a5, 36(sp)
-; RISCV32-NEXT:    sw a5, 4(s2)
-; RISCV32-NEXT:    lw a5, 32(sp)
-; RISCV32-NEXT:    sw a5, 0(s2)
-; RISCV32-NEXT:    sw a0, 8(s2)
-; RISCV32-NEXT:    sw s1, 12(s2)
-; RISCV32-NEXT:    mv a0, zero
-; RISCV32-NEXT:    beqz a2, .LBB0_4
+; RISCV32-NEXT:    xor a0, a4, a0
+; RISCV32-NEXT:    xor a2, t4, a2
+; RISCV32-NEXT:    or a0, a2, a0
+; RISCV32-NEXT:    beq a0, s4, .LBB0_4
 ; RISCV32-NEXT:  # %bb.3: # %start
-; RISCV32-NEXT:    mv a0, t6
+; RISCV32-NEXT:    mv s4, s1
 ; RISCV32-NEXT:  .LBB0_4: # %start
-; RISCV32-NEXT:    or a2, a4, a6
-; RISCV32-NEXT:    or a4, t3, t2
-; RISCV32-NEXT:    or a3, a3, t5
-; RISCV32-NEXT:    or a1, a1, t1
-; RISCV32-NEXT:    xor a1, a1, zero
-; RISCV32-NEXT:    xor a3, a3, zero
+; RISCV32-NEXT:    snez a0, s5
+; RISCV32-NEXT:    snez a2, t2
+; RISCV32-NEXT:    and a0, a2, a0
+; RISCV32-NEXT:    snez a2, s6
+; RISCV32-NEXT:    snez a5, t3
+; RISCV32-NEXT:    and a2, a5, a2
+; RISCV32-NEXT:    mulhu a5, t3, s8
+; RISCV32-NEXT:    snez a5, a5
+; RISCV32-NEXT:    or a2, a2, a5
+; RISCV32-NEXT:    mulhu a5, t2, s7
+; RISCV32-NEXT:    snez a5, a5
+; RISCV32-NEXT:    or a0, a0, a5
+; RISCV32-NEXT:    sltu t0, t1, t0
+; RISCV32-NEXT:    mulhu s1, s5, a1
+; RISCV32-NEXT:    snez s1, s1
+; RISCV32-NEXT:    or a0, a0, s1
+; RISCV32-NEXT:    sltu s1, a7, a6
+; RISCV32-NEXT:    mulhu a5, s6, a3
+; RISCV32-NEXT:    snez a5, a5
+; RISCV32-NEXT:    or a2, a2, a5
+; RISCV32-NEXT:    lw a5, 44(sp)
+; RISCV32-NEXT:    sw a5, 4(s2)
+; RISCV32-NEXT:    lw a5, 40(sp)
+; RISCV32-NEXT:    sw a5, 0(s2)
+; RISCV32-NEXT:    sw t4, 8(s2)
+; RISCV32-NEXT:    sw a4, 12(s2)
+; RISCV32-NEXT:    or a2, a2, s1
+; RISCV32-NEXT:    or a0, a0, t0
+; RISCV32-NEXT:    or a1, a1, t2
+; RISCV32-NEXT:    or a3, a3, t3
 ; RISCV32-NEXT:    snez a3, a3
 ; RISCV32-NEXT:    snez a1, a1
 ; RISCV32-NEXT:    and a1, a1, a3
-; RISCV32-NEXT:    or a1, a1, a4
-; RISCV32-NEXT:    or a1, a1, a2
 ; RISCV32-NEXT:    or a0, a1, a0
+; RISCV32-NEXT:    or a0, a0, a2
+; RISCV32-NEXT:    or a0, a0, s4
 ; RISCV32-NEXT:    andi a0, a0, 1
 ; RISCV32-NEXT:    sb a0, 16(s2)
-; RISCV32-NEXT:    lw s7, 48(sp)
-; RISCV32-NEXT:    lw s6, 52(sp)
-; RISCV32-NEXT:    lw s5, 56(sp)
-; RISCV32-NEXT:    lw s4, 60(sp)
-; RISCV32-NEXT:    lw s3, 64(sp)
-; RISCV32-NEXT:    lw s2, 68(sp)
-; RISCV32-NEXT:    lw s1, 72(sp)
-; RISCV32-NEXT:    lw ra, 76(sp)
-; RISCV32-NEXT:    addi sp, sp, 80
+; RISCV32-NEXT:    lw s8, 60(sp)
+; RISCV32-NEXT:    lw s7, 64(sp)
+; RISCV32-NEXT:    lw s6, 68(sp)
+; RISCV32-NEXT:    lw s5, 72(sp)
+; RISCV32-NEXT:    lw s4, 76(sp)
+; RISCV32-NEXT:    lw s3, 80(sp)
+; RISCV32-NEXT:    lw s2, 84(sp)
+; RISCV32-NEXT:    lw s1, 88(sp)
+; RISCV32-NEXT:    lw ra, 92(sp)
+; RISCV32-NEXT:    addi sp, sp, 96
 ; RISCV32-NEXT:    ret
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
diff --git a/test/CodeGen/SPARC/codemodel.ll b/test/CodeGen/SPARC/codemodel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..68da48a0e950641256125ff25b48d5062dafe704
--- /dev/null
+++ b/test/CodeGen/SPARC/codemodel.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -verify-machineinstrs -o - -mtriple=sparc64-unknown-linux -code-model=tiny < %s 2>&1 | FileCheck %s --check-prefix=TINY
+; RUN: not llc -verify-machineinstrs -o - -mtriple=sparc64-unknown-linux -code-model=kernel < %s 2>&1 | FileCheck %s --check-prefix=KERNEL
+
+; TINY:    Target does not support the tiny CodeModel
+; KERNEL:    Target does not support the kernel CodeModel
+
+define void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll
index a67a45e6b1d1a5d0979ef6168b6dc9cecb796a5c..814845459de78ab725b25016fda62172d673ed27 100644
--- a/test/CodeGen/SPARC/inlineasm.ll
+++ b/test/CodeGen/SPARC/inlineasm.ll
@@ -130,3 +130,16 @@ entry:
   tail call void asm sideeffect "faddd $0,$1,$2", "{f20},{f20},{f20}"(double 9.0, double 10.0, double 11.0)
   ret void
 }
+
+; CHECK-LABEL: test_constraint_f_e_i32_i64:
+; CHECK: ld [%o0+%lo(.LCPI13_0)], %f0
+; CHECK: ldd [%o0+%lo(.LCPI13_1)], %f2
+; CHECK: fadds %f0, %f0, %f0
+; CHECK: faddd %f2, %f2, %f0
+
+define void @test_constraint_f_e_i32_i64() {
+entry:
+  %0 = call float asm sideeffect "fadds $1, $2, $0", "=f,f,e"(i32 0, i32 0)
+  %1 = call double asm sideeffect "faddd $1, $2, $0", "=f,f,e"(i64 0, i64 0)
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/codemodel.ll b/test/CodeGen/SystemZ/codemodel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4375366cfd79975da59ed8f93c6c35fdc2bb7312
--- /dev/null
+++ b/test/CodeGen/SystemZ/codemodel.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -verify-machineinstrs -o - -mtriple=s390x-linux-gnu -code-model=tiny < %s 2>&1 | FileCheck %s --check-prefix=TINY
+; RUN: not llc -verify-machineinstrs -o - -mtriple=s390x-linux-gnu -code-model=kernel < %s 2>&1 | FileCheck %s --check-prefix=KERNEL
+
+; TINY:    Target does not support the tiny CodeModel
+; KERNEL:    Target does not support the kernel CodeModel
+
+define void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir b/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
new file mode 100644
index 0000000000000000000000000000000000000000..196763aaedbbe4e4b1b0e09030b68cb502e5c576
--- /dev/null
+++ b/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
@@ -0,0 +1,280 @@
+# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -start-before=greedy %s -o - \
+# RUN:   | FileCheck %s
+#
+# Test that the reg alloc hints are given in a good order that gives no more
+# than 5 LGRs in output.
+
+--- |
+  ; ModuleID = 'tc.ll'
+  source_filename = "tc.ll"
+  target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+  target triple = "s390x-unknown-linux-gnu"
+  
+  %0 = type { i32, i32, i32, float, float, float, i8*, i32 }
+  
+  define void @fun(i32 signext %arg, i32 zeroext %arg1) #0 {
+  bb:
+    %tmp = sext i32 %arg to i64
+    %tmp2 = tail call i8* @sre_malloc()
+    %tmp4 = tail call i8* @sre_malloc()
+    %tmp6 = add i32 %arg, -1
+    tail call void @malloc()
+    %0 = trunc i32 %arg to i2
+    %1 = add i2 %0, -1
+    br label %bb8
+  
+  bb8:                                              ; preds = %bb54, %bb
+    %lsr.iv6 = phi i2 [ %lsr.iv.next7, %bb54 ], [ %1, %bb ]
+    %tmp9 = phi i64 [ %tmp57, %bb54 ], [ 0, %bb ]
+    %tmp10 = phi i64 [ 0, %bb54 ], [ %tmp, %bb ]
+    %tmp12 = sub i64 %tmp, %tmp9
+    br label %bb14
+  
+  bb14:                                             ; preds = %bb39, %bb8
+    %lsr.iv8 = phi i2 [ %lsr.iv.next9, %bb39 ], [ %lsr.iv6, %bb8 ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %bb39 ], [ 1, %bb8 ]
+    %tmp15 = phi i64 [ 0, %bb8 ], [ %tmp19, %bb39 ]
+    %tmp17 = phi i32 [ %tmp6, %bb8 ], [ %tmp35, %bb39 ]
+    %tmp18 = phi i32 [ 0, %bb8 ], [ %tmp34, %bb39 ]
+    %2 = bitcast i8* %tmp2 to float**
+    %tmp19 = add nuw nsw i64 %tmp15, 1
+    %3 = zext i2 %lsr.iv8 to i64
+    %4 = mul i64 %3, -1
+    %tmp21 = getelementptr inbounds float*, float** %2, i64 %tmp15
+    %tmp22 = load float*, float** %tmp21
+    %tmp23 = trunc i64 %tmp15 to i32
+    %scevgep = getelementptr float, float* %tmp22, i64 %tmp15
+    br label %bb25
+  
+  bb25:                                             ; preds = %bb25, %bb14
+    %lsr.iv10 = phi i64 [ %lsr.iv.next11, %bb25 ], [ %4, %bb14 ]
+    %lsr.iv3 = phi float* [ %scevgep4, %bb25 ], [ %scevgep, %bb14 ]
+    %lsr.iv1 = phi i32 [ %lsr.iv.next2, %bb25 ], [ %lsr.iv, %bb14 ]
+    %tmp27 = phi i32 [ %tmp35, %bb25 ], [ %tmp17, %bb14 ]
+    %tmp28 = phi i32 [ %tmp34, %bb25 ], [ %tmp18, %bb14 ]
+    %scevgep5 = getelementptr float, float* %lsr.iv3, i64 1
+    %tmp31 = load float, float* %scevgep5
+    %tmp32 = fcmp olt float %tmp31, undef
+    %tmp34 = select i1 %tmp32, i32 %lsr.iv1, i32 %tmp28
+    %tmp35 = select i1 %tmp32, i32 %tmp23, i32 %tmp27
+    %lsr.iv.next2 = add i32 %lsr.iv1, 1
+    %scevgep4 = getelementptr float, float* %lsr.iv3, i64 1
+    %lsr.iv.next11 = add i64 %lsr.iv10, 1
+    %tmp38 = icmp eq i64 %lsr.iv.next11, 0
+    br i1 %tmp38, label %bb39, label %bb25
+  
+  bb39:                                             ; preds = %bb25
+    %lsr.iv.next = add i32 %lsr.iv, 1
+    %lsr.iv.next9 = add i2 %lsr.iv8, -1
+    %tmp41 = icmp eq i64 %tmp19, %tmp10
+    br i1 %tmp41, label %bb42, label %bb14
+  
+  bb42:                                             ; preds = %bb39
+    %5 = bitcast i8* %tmp4 to i32*
+    %tmp43 = getelementptr inbounds i32, i32* %5, i64 undef
+    %tmp44 = load i32, i32* %tmp43
+    %tmp45 = sub nsw i32 %tmp44, %arg
+    %tmp46 = sext i32 %tmp45 to i64
+    %tmp47 = getelementptr inbounds %0, %0* null, i64 %tmp46, i32 7
+    %tmp48 = load i32, i32* %tmp47
+    %tmp49 = add nsw i32 0, %tmp48
+    store i32 %tmp49, i32* undef
+    %cond = icmp eq i32 %arg1, 0
+    br i1 %cond, label %bb52, label %bb54
+  
+  bb52:                                             ; preds = %bb42
+    %tmp5312 = bitcast float* undef to float*
+    br label %bb54
+  
+  bb54:                                             ; preds = %bb42, %bb52
+    %6 = bitcast i8* %tmp4 to i32*
+    %tmp55 = add i32 0, %arg
+    %tmp56 = getelementptr inbounds i32, i32* %6, i64 undef
+    store i32 %tmp55, i32* %tmp56
+    %tmp57 = add i64 %tmp9, 1
+    %lsr.iv.next7 = add i2 %lsr.iv6, -1
+    br label %bb8
+  }
+  
+  declare i8* @sre_malloc() #0
+  
+  declare void @malloc() #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #1
+  
+  attributes #0 = { "target-cpu"="z13" }
+  attributes #1 = { nounwind }
+
+...
+
+# CHECK: lgr
+# CHECK: lgr
+# CHECK: lgr
+# CHECK: lgr
+# CHECK: lgr
+# CHECK-NOT: lgr
+
+---
+name:            fun
+alignment:       4
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: gr64bit }
+  - { id: 1, class: addr64bit }
+  - { id: 2, class: addr64bit }
+  - { id: 3, class: gr32bit }
+  - { id: 4, class: gr32bit }
+  - { id: 5, class: grx32bit }
+  - { id: 6, class: addr64bit }
+  - { id: 7, class: gr64bit }
+  - { id: 8, class: grx32bit }
+  - { id: 9, class: grx32bit }
+  - { id: 10, class: addr64bit }
+  - { id: 11, class: grx32bit }
+  - { id: 12, class: grx32bit }
+  - { id: 13, class: gr64bit }
+  - { id: 14, class: gr64bit }
+  - { id: 15, class: grx32bit }
+  - { id: 16, class: gr64bit }
+  - { id: 17, class: addr64bit }
+  - { id: 18, class: addr64bit }
+  - { id: 19, class: grx32bit }
+  - { id: 20, class: grx32bit }
+  - { id: 21, class: grx32bit }
+  - { id: 22, class: grx32bit }
+  - { id: 23, class: grx32bit }
+  - { id: 24, class: grx32bit }
+  - { id: 25, class: gr64bit }
+  - { id: 26, class: gr64bit }
+  - { id: 27, class: grx32bit }
+  - { id: 28, class: grx32bit }
+  - { id: 29, class: gr64bit }
+  - { id: 30, class: grx32bit }
+  - { id: 31, class: gr64bit }
+  - { id: 32, class: gr64bit }
+  - { id: 33, class: gr32bit }
+  - { id: 34, class: grx32bit }
+  - { id: 35, class: gr64bit }
+  - { id: 36, class: gr64bit }
+  - { id: 37, class: gr64bit }
+  - { id: 38, class: grx32bit }
+  - { id: 39, class: gr64bit }
+  - { id: 40, class: grx32bit }
+  - { id: 41, class: gr64bit }
+  - { id: 42, class: gr64bit }
+  - { id: 43, class: gr64bit }
+  - { id: 44, class: gr64bit }
+  - { id: 45, class: addr64bit }
+  - { id: 46, class: gr64bit }
+  - { id: 47, class: fp32bit }
+  - { id: 48, class: fp32bit }
+  - { id: 49, class: gr32bit }
+  - { id: 50, class: gr32bit }
+  - { id: 51, class: gr64bit }
+  - { id: 52, class: addr64bit }
+  - { id: 53, class: addr64bit }
+  - { id: 54, class: gr64bit }
+  - { id: 55, class: gr32bit }
+  - { id: 56, class: addr64bit }
+  - { id: 57, class: gr64bit }
+  - { id: 58, class: grx32bit }
+  - { id: 59, class: grx32bit }
+  - { id: 60, class: addr64bit }
+  - { id: 61, class: grx32bit }
+  - { id: 62, class: grx32bit }
+  - { id: 63, class: addr64bit }
+  - { id: 64, class: addr64bit }
+  - { id: 65, class: grx32bit }
+  - { id: 66, class: grx32bit }
+  - { id: 67, class: grx32bit }
+liveins:         
+  - { reg: '$r2d', virtual-reg: '%31' }
+  - { reg: '$r3d', virtual-reg: '%32' }
+frameInfo:       
+  hasCalls:        true
+body:             |
+  bb.0.bb:
+    liveins: $r2d, $r3d
+  
+    %32:gr64bit = COPY $r3d
+    %0:gr64bit = COPY $r2d
+    ADJCALLSTACKDOWN 0, 0
+    CallBRASL @sre_malloc, csr_systemz, implicit-def dead $r14d, implicit-def dead $cc, implicit-def $r2d
+    %1:addr64bit = COPY $r2d
+    ADJCALLSTACKUP 0, 0
+    ADJCALLSTACKDOWN 0, 0
+    CallBRASL @sre_malloc, csr_systemz, implicit-def dead $r14d, implicit-def dead $cc, implicit-def $r2d
+    %2:addr64bit = COPY $r2d
+    ADJCALLSTACKUP 0, 0
+    %3:gr32bit = AHIMuxK %0.subreg_l32, -1, implicit-def dead $cc
+    ADJCALLSTACKDOWN 0, 0
+    CallBRASL @malloc, csr_systemz, implicit-def dead $r14d, implicit-def dead $cc
+    ADJCALLSTACKUP 0, 0
+    %55:gr32bit = AHIMuxK %0.subreg_l32, 3, implicit-def dead $cc
+    %56:addr64bit = LGHI 0
+    %57:gr64bit = COPY %0
+  
+  bb.1.bb8:
+    %62:grx32bit = LHIMux 0
+    %59:grx32bit = LHIMux 1
+    undef %41.subreg_l32:gr64bit = COPY %55
+    %60:addr64bit = LGHI 0
+    %61:grx32bit = COPY %3
+  
+  bb.2.bb14:
+    %10:addr64bit = COPY %60
+    %60:addr64bit = nuw nsw LA %10, 1, $noreg
+    %43:gr64bit = RISBGN undef %43, %41, 62, 191, 0
+    %63:addr64bit = LCGR %43, implicit-def dead $cc
+    %45:addr64bit = SLLG %10, $noreg, 3
+    %64:addr64bit = SLLG %10, $noreg, 2
+    %64:addr64bit = AG %64, %1, 0, %45, implicit-def dead $cc :: (load 8 from %ir.tmp21)
+    %65:grx32bit = COPY %59
+  
+  bb.3.bb25:
+    successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+  
+    %47:fp32bit = VL32 %64, 4, $noreg :: (load 4 from %ir.scevgep5)
+    %25:gr64bit = LA %64, 4, $noreg
+    CEBR %47, undef %48:fp32bit, implicit-def $cc
+    %62:grx32bit = LOCRMux %62, %65, 15, 4, implicit $cc
+    %61:grx32bit = LOCRMux %61, %10.subreg_l32, 15, 4, implicit killed $cc
+    %65:grx32bit = AHIMux %65, 1, implicit-def dead $cc
+    %63:addr64bit = LA %63, 1, $noreg
+    CGHI %63, 0, implicit-def $cc
+    %64:addr64bit = COPY %25
+    BRC 14, 6, %bb.3, implicit killed $cc
+    J %bb.4
+  
+  bb.4.bb39:
+    successors: %bb.5(0x04000000), %bb.2(0x7c000000)
+  
+    %59:grx32bit = AHIMux %59, 1, implicit-def dead $cc
+    %41.subreg_l32:gr64bit = AHIMux %41.subreg_l32, 3, implicit-def dead $cc
+    CGR %60, %57, implicit-def $cc
+    BRC 14, 6, %bb.2, implicit killed $cc
+    J %bb.5
+  
+  bb.5.bb42:
+    successors: %bb.6(0x30000000), %bb.7(0x50000000)
+  
+    %50:gr32bit = LMux %2, 0, $noreg :: (load 4 from %ir.tmp43)
+    %50:gr32bit = nsw SR %50, %0.subreg_l32, implicit-def dead $cc
+    %52:addr64bit = LGFR %50
+    %52:addr64bit = MGHI %52, 40
+    MVC undef %53:addr64bit, 0, 4, %52, 32 :: (store 4 into `i32* undef`), (load 4 from %ir.tmp47)
+    CHIMux %32.subreg_l32, 0, implicit-def $cc
+    BRC 14, 6, %bb.7, implicit killed $cc
+    J %bb.6
+  
+  bb.6.bb52:
+  
+  bb.7.bb54:
+    STMux %0.subreg_l32, %2, 0, $noreg :: (store 4 into %ir.tmp56)
+    %56:addr64bit = LA %56, 1, $noreg
+    %55:gr32bit = AHIMux %55, 3, implicit-def dead $cc
+    %57:gr64bit = LGHI 0
+    J %bb.1
+
+...
diff --git a/test/CodeGen/SystemZ/dag-combine-03.ll b/test/CodeGen/SystemZ/dag-combine-03.ll
index bcb75281d0a245dbd5dde74a7877e12314e80e76..c197c9a73a36dd445c9cfbb83e5b1e0fa9bbfe41 100644
--- a/test/CodeGen/SystemZ/dag-combine-03.ll
+++ b/test/CodeGen/SystemZ/dag-combine-03.ll
@@ -10,9 +10,12 @@ entry:
 lab0:
   %phi = phi i64 [ %sel, %lab0 ], [ 0, %entry ]
   %add = add nuw nsw i64 %phi, 1
+  %add2 = add nuw nsw i64 %phi, 2
   %cmp = icmp eq i64 %add, undef
+  %cmp2 = icmp eq i64 %add2, undef
   %ins = insertelement <2 x i1> undef, i1 %cmp, i32 0
-  %xor = xor <2 x i1> %ins, <i1 true, i1 true>
+  %ins2 = insertelement <2 x i1> undef, i1 %cmp2, i32 0
+  %xor = xor <2 x i1> %ins, %ins2
   %extr = extractelement <2 x i1> %xor, i32 0
 ; The EXTRACT_VECTOR_ELT is done first into an i32, and then AND:ed with
 ; 1. The AND is not actually necessary since the element contains a CC (i1)
diff --git a/test/CodeGen/SystemZ/fp-round-01.ll b/test/CodeGen/SystemZ/fp-round-01.ll
index b68a15bccdd50f6dbdebc42c8f8cb50e4a15ebdd..186fb3c2d0d4785361ddac753f1b0637eef17346 100644
--- a/test/CodeGen/SystemZ/fp-round-01.ll
+++ b/test/CodeGen/SystemZ/fp-round-01.ll
@@ -54,8 +54,17 @@ define double @f5(double %f) {
   ret double %res
 }
 
-; Test nearbyint for f128: omitted for now because we cannot handle
-; indirect arguments.
+; Test nearbyint for f128.
+declare fp128 @llvm.nearbyint.f128(fp128 %f)
+define void @f6(fp128 *%ptr) {
+; CHECK-LABEL: f6:
+; CHECK: brasl %r14, nearbyintl@PLT
+; CHECK: br %r14
+  %src = load fp128, fp128 *%ptr
+  %res = call fp128 @llvm.nearbyint.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
 
 ; Test floor for f32.
 declare float @llvm.floor.f32(float %f)
@@ -77,8 +86,17 @@ define double @f8(double %f) {
   ret double %res
 }
 
-; Test floor for f128: omitted for now because we cannot handle
-; indirect arguments.
+; Test floor for f128.
+declare fp128 @llvm.floor.f128(fp128 %f)
+define void @f9(fp128 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: brasl %r14, floorl@PLT
+; CHECK: br %r14
+  %src = load fp128, fp128 *%ptr
+  %res = call fp128 @llvm.floor.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
 
 ; Test ceil for f32.
 declare float @llvm.ceil.f32(float %f)
@@ -100,8 +118,17 @@ define double @f11(double %f) {
   ret double %res
 }
 
-; Test ceil for f128: omitted for now because we cannot handle
-; indirect arguments.
+; Test ceil for f128.
+declare fp128 @llvm.ceil.f128(fp128 %f)
+define void @f12(fp128 *%ptr) {
+; CHECK-LABEL: f12:
+; CHECK: brasl %r14, ceill@PLT
+; CHECK: br %r14
+  %src = load fp128, fp128 *%ptr
+  %res = call fp128 @llvm.ceil.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
 
 ; Test trunc for f32.
 declare float @llvm.trunc.f32(float %f)
@@ -123,8 +150,17 @@ define double @f14(double %f) {
   ret double %res
 }
 
-; Test trunc for f128: omitted for now because we cannot handle
-; indirect arguments.
+; Test trunc for f128.
+declare fp128 @llvm.trunc.f128(fp128 %f)
+define void @f15(fp128 *%ptr) {
+; CHECK-LABEL: f15:
+; CHECK: brasl %r14, truncl@PLT
+; CHECK: br %r14
+  %src = load fp128, fp128 *%ptr
+  %res = call fp128 @llvm.trunc.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
 
 ; Test round for f32.
 declare float @llvm.round.f32(float %f)
@@ -146,5 +182,14 @@ define double @f17(double %f) {
   ret double %res
 }
 
-; Test round for f128: omitted for now because we cannot handle
-; indirect arguments.
+; Test round for f128.
+declare fp128 @llvm.round.f128(fp128 %f)
+define void @f18(fp128 *%ptr) {
+; CHECK-LABEL: f18:
+; CHECK: brasl %r14, roundl@PLT
+; CHECK: br %r14
+  %src = load fp128, fp128 *%ptr
+  %res = call fp128 @llvm.round.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/knownbits.ll b/test/CodeGen/SystemZ/knownbits.ll
index 703c0bf947992958455407919e6168f7ac9f7122..f23ffc59ad59fa41b82563e2a5d37742afbd5100 100644
--- a/test/CodeGen/SystemZ/knownbits.ll
+++ b/test/CodeGen/SystemZ/knownbits.ll
@@ -1,16 +1,19 @@
-; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode().
-;
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 < %s  | FileCheck %s
 
+; Test that DAGCombiner gets helped by computeKnownBitsForTargetNode().
+
 ; SystemZISD::REPLICATE
 define i32 @f0() {
 ; CHECK-LABEL: f0:
-; CHECK-LABEL: # %bb.0:
-; CHECK:       vlgvf
-; CHECK-NOT:   lhi %r2, 0
-; CHECK-NOT:   chi %r0, 0
-; CHECK-NOT:   lochilh %r2, 1
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vgbm %v0, 0
+; CHECK-NEXT:    vceqf %v0, %v0, %v0
+; CHECK-NEXT:    vrepif %v1, 1
+; CHECK-NEXT:    vnc %v0, %v1, %v0
+; CHECK-NEXT:    vlgvf %r2, %v0, 3
+; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT:    br %r14
   %cmp0 = icmp ne <4 x i32> undef, zeroinitializer
   %zxt0 = zext <4 x i1> %cmp0 to <4 x i32>
   %ext0 = extractelement <4 x i32> %zxt0, i32 3
@@ -25,23 +28,35 @@ exit:
 }
 
 ; SystemZISD::JOIN_DWORDS (and REPLICATE)
-define void @f1() {
 ; The DAG XOR has JOIN_DWORDS and REPLICATE operands. With KnownBits properly set
 ; for both these nodes, ICMP is used instead of TM during lowering because
 ; adjustForRedundantAnd() succeeds.
+define void @f1() {
 ; CHECK-LABEL: f1:
-; CHECK-LABEL: # %bb.0:
-; CHECK-NOT:   tmll
-; CHECK-NOT:   jne
-; CHECK:       cijlh
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clhhsi 0, 0
+; CHECK-NEXT:    lhi %r0, 0
+; CHECK-NEXT:    lochie %r0, 1
+; CHECK-NEXT:    lghi %r1, 1
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vrepig %v1, 1
+; CHECK-NEXT:    vx %v0, %v0, %v1
+; CHECK-NEXT:    vlgvf %r0, %v0, 1
+; CHECK-NEXT:    cijlh %r0, 0, .LBB1_3
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vlgvf %r0, %v0, 3
+; CHECK-NEXT:    cijlh %r0, 0, .LBB1_3
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:  .LBB1_3:
   %1 = load i16, i16* null, align 2
   %2 = icmp eq i16 %1, 0
   %3 = insertelement <2 x i1> undef, i1 %2, i32 0
   %4 = insertelement <2 x i1> %3, i1 true, i32 1
   %5 = xor <2 x i1> %4, <i1 true, i1 true>
   %6 = extractelement <2 x i1> %5, i32 0
-  %7 = or i1 %6, undef
-  br i1 %7, label %9, label %8
+  %7 = extractelement <2 x i1> %5, i32 1
+  %8 = or i1 %6, %7
+  br i1 %8, label %10, label %9
 
 ; <label>:8:                                      ; preds = %0
   unreachable
diff --git a/test/CodeGen/SystemZ/pr36164.ll b/test/CodeGen/SystemZ/pr36164.ll
index 312961fa4b9aaeb2b0838fcb0df7714117598889..2ed6fa901db082247eef72587afe1943d6b590d5 100644
--- a/test/CodeGen/SystemZ/pr36164.ll
+++ b/test/CodeGen/SystemZ/pr36164.ll
@@ -15,54 +15,39 @@
 define void @main() local_unnamed_addr #0 {
 ; CHECK-LABEL: main:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    stmg %r12, %r15, 96(%r15)
-; CHECK-NEXT:    .cfi_offset %r12, -64
-; CHECK-NEXT:    .cfi_offset %r13, -56
-; CHECK-NEXT:    .cfi_offset %r14, -48
-; CHECK-NEXT:    .cfi_offset %r15, -40
 ; CHECK-NEXT:    lhi %r0, 1
 ; CHECK-NEXT:    larl %r1, g_938
-; CHECK-NEXT:    lhi %r2, 2
-; CHECK-NEXT:    lhi %r3, 3
-; CHECK-NEXT:    lhi %r4, 0
-; CHECK-NEXT:    lhi %r5, 4
-; CHECK-NEXT:    larl %r14, g_11
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lhi %r3, 4
+; CHECK-NEXT:    larl %r4, g_11
 ; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    strl %r0, g_73
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    strl %r0, g_69
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-DAG:     lghi %r13, 24
-; CHECK-DAG:     strl %r2, g_69
-; CHECK-DAG:     ag %r13, 0(%r1)
-; CHECK-NEXT:    lrl %r12, g_832
-; CHECK-NEXT:    strl %r3, g_69
-; CHECK-NEXT:    lrl %r12, g_832
-; CHECK-NEXT:    strl %r4, g_69
-; CHECK-NEXT:    lrl %r12, g_832
-; CHECK-NEXT:    strl %r0, g_69
-; CHECK-NEXT:    lrl %r12, g_832
 ; CHECK-NEXT:    strl %r2, g_69
-; CHECK-NEXT:    lrl %r12, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    lrl %r5, g_832
+; CHECK-NEXT:    agsi 0(%r1), 24
+; CHECK-NEXT:    lrl %r5, g_832
 ; CHECK-NEXT:    strl %r3, g_69
-; CHECK-NEXT:    stgrl %r13, g_938
-; CHECK-NEXT:    lrl %r13, g_832
-; CHECK-NEXT:    strl %r5, g_69
-; CHECK-NEXT:    mvi 0(%r14), 1
+; CHECK-NEXT:    mvi 0(%r4), 1
 ; CHECK-NEXT:    j .LBB0_1
   br label %1
 
diff --git a/test/CodeGen/SystemZ/regalloc-GR128-02.mir b/test/CodeGen/SystemZ/regalloc-GR128-02.mir
new file mode 100644
index 0000000000000000000000000000000000000000..65758bea4fdb3eb60f97da3143453aee2fd17150
--- /dev/null
+++ b/test/CodeGen/SystemZ/regalloc-GR128-02.mir
@@ -0,0 +1,68 @@
+# RUN: llc %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+# RUN:   -start-before=simple-register-coalescing -o - 2>&1 > /dev/null
+
+# Test that the SystemZ shouldCoalesce() implementation does not crash in
+# case of an undef use in another MBB. This was discovered in testing with
+# -systemz-subreg-liveness.
+
+--- |
+  @g_74 = external dso_local unnamed_addr global i32, align 4
+  @g_193 = external dso_local unnamed_addr global i32, align 4
+  
+  define dso_local void @main() local_unnamed_addr {
+    %1 = load i32, i32* @g_193
+    %2 = or i32 %1, -1395153718
+    %3 = sdiv i32 -1395153718, %2
+    br i1 undef, label %5, label %4
+  
+  ; <label>:4:                                      ; preds = %0
+    store i32 %3, i32* @g_74
+    store i32 -9, i32* @g_74
+    ret void
+  
+  ; <label>:5:                                      ; preds = %0
+    unreachable
+  }
+
+...
+---
+name:            main
+alignment:       4
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: grx32bit }
+  - { id: 1, class: addr64bit }
+  - { id: 2, class: gr32bit }
+  - { id: 3, class: gr32bit }
+  - { id: 4, class: gr64bit }
+  - { id: 5, class: gr128bit }
+  - { id: 6, class: gr128bit }
+  - { id: 7, class: grx32bit }
+  - { id: 8, class: gr32bit }
+  - { id: 9, class: gr128bit }
+body:             |
+  bb.0 (%ir-block.0):
+    successors: %bb.2(0x00000001), %bb.1(0x7fffffff)
+  
+    %1:addr64bit = LARL @g_193
+    %2:gr32bit = IIFMux 2899813578
+    %3:gr32bit = COPY killed %2
+    %3:gr32bit = O %3, killed %1, 0, $noreg, implicit-def dead $cc :: (dereferenceable load 4 from @g_193)
+    %4:gr64bit = LGFI -1395153718
+    undef %5.subreg_l64:gr128bit = COPY killed %4
+    %6:gr128bit = COPY killed %5
+    dead %6:gr128bit = DSGFR %6, killed %3
+    %7:grx32bit = LHIMux 0
+    CHIMux killed %7, 0, implicit-def $cc
+    BRC 14, 6, %bb.2, implicit killed $cc
+    J %bb.1
+  
+  bb.1 (%ir-block.4):
+    %8:gr32bit = LHIMux -9
+    STRL killed %8, @g_74 :: (store 4 into @g_74)
+    Return
+  
+  bb.2 (%ir-block.5):
+    dead %0:grx32bit = COPY undef %6.subreg_l32
+
+...
diff --git a/test/CodeGen/SystemZ/subregliveness-04.ll b/test/CodeGen/SystemZ/subregliveness-04.ll
index 2727a67f67a482861937df7b465f42b2dda6c0e3..11ecc9bd9c7c439f0385058fba21481b594bb0f1 100644
--- a/test/CodeGen/SystemZ/subregliveness-04.ll
+++ b/test/CodeGen/SystemZ/subregliveness-04.ll
@@ -20,7 +20,7 @@ bb3:                                              ; preds = %bb15, %bb
 
 bb5:                                              ; preds = %bb3
   %tmp6 = or i1 %tmp2, false
-  %tmp7 = select i1 %tmp6, i32 0, i32 undef
+  %tmp7 = select i1 %tmp6, i32 0, i32 100
   %tmp8 = ashr i32 %tmp1, %tmp7
   %tmp9 = zext i32 %tmp8 to i64
   %tmp10 = shl i64 %tmp9, 48
diff --git a/test/CodeGen/SystemZ/subregliveness-05.ll b/test/CodeGen/SystemZ/subregliveness-05.ll
index 9aba18f3342f90d866685d9370ead292330aa9da..9b470cda1a913e43201e8aa39325b5476b72b743 100644
--- a/test/CodeGen/SystemZ/subregliveness-05.ll
+++ b/test/CodeGen/SystemZ/subregliveness-05.ll
@@ -9,7 +9,7 @@ target triple = "s390x-ibm-linux"
 @g_65 = external global i32, align 4
 
 ; Function Attrs: nounwind
-define void @main() #0 {
+define void @main(i1 %x) #0 {
 bb:
   br label %bb1
 
@@ -25,7 +25,7 @@ bb1:                                              ; preds = %bb
   br i1 undef, label %bb12, label %bb9
 
 bb9:                                              ; preds = %bb1
-  %tmp10 = select i1 undef, i64 0, i64 %tmp2
+  %tmp10 = select i1 %x, i64 0, i64 %tmp2
   %tmp11 = add nsw i64 %tmp10, %tmp8
   br label %bb12
 
diff --git a/test/CodeGen/SystemZ/vec-move-19.ll b/test/CodeGen/SystemZ/vec-move-19.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1bf1d0e67cf6559b2b5f1a87f04c6c734726862f
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-move-19.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that a loaded value which is replicated is not inserted also in any
+; elements.
+
+; CHECK:      vlvgp   %v0, %r0, %r0
+; CHECK-NEXT: vrepf   %v24, %v0, 1
+; CHECK-NOT:  vlvgf   %v24, %r0, 1
+; CHECK-NOT:  vlvgf   %v24, %r0, 2
+
+define <4 x i32> @fun(i32 %arg, i32* %dst) {
+  %tmp = load i32, i32* undef
+  %tmp8 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+  %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp, i32 1
+  %tmp10 = insertelement <4 x i32> %tmp9, i32 %tmp, i32 2
+  %tmp11 = insertelement <4 x i32> %tmp10, i32 %arg, i32 3
+  store i32 %tmp, i32* %dst
+  ret <4 x i32> %tmp11
+}
+
diff --git a/test/CodeGen/SystemZ/vec-move-20.ll b/test/CodeGen/SystemZ/vec-move-20.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ebf1b7db52058851523e0131ea36cce99e187a36
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-move-20.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a vector which is built with elements from two loads replicates the
+; load with most elements having its value.
+
+; CHECK:      vlef
+; CHECK-NOT:  vlvgf
+
+define void @update(i32* %src1, i32* %src2, <4 x i32>* %dst) {
+bb:
+  %tmp = load i32, i32* %src1
+  %tmp1 = load i32, i32* %src2
+  %tmp2 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+  %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+  %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+  %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+  store <4 x i32> %tmp5, <4 x i32>* %dst
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/vec-move-21.ll b/test/CodeGen/SystemZ/vec-move-21.ll
new file mode 100644
index 0000000000000000000000000000000000000000..47ad03717433d71a5391a5e89b5ccf14b7702c82
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-move-21.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that a replicate of a load gets folded to vlrep also in cases where
+; the load has multiple users.
+
+; CHECK-NOT: vrep
+
+
+define double @fun(double* %Vsrc, <2 x double> %T) {
+entry:
+  %Vgep1 = getelementptr double, double* %Vsrc, i64 0
+  %Vld1 = load double, double* %Vgep1
+  %Vgep2 = getelementptr double, double* %Vsrc, i64 1
+  %Vld2 = load double, double* %Vgep2
+  %Vgep3 = getelementptr double, double* %Vsrc, i64 2
+  %Vld3 = load double, double* %Vgep3
+  %Vgep4 = getelementptr double, double* %Vsrc, i64 3
+  %Vld4 = load double, double* %Vgep4
+  %Vgep5 = getelementptr double, double* %Vsrc, i64 4
+  %Vld5 = load double, double* %Vgep5
+  %Vgep6 = getelementptr double, double* %Vsrc, i64 5
+  %Vld6 = load double, double* %Vgep6
+
+  %V19 = insertelement <2 x double> undef, double %Vld1, i32 0
+  %V20 = shufflevector <2 x double> %V19, <2 x double> undef, <2 x i32> zeroinitializer
+  %V21 = insertelement <2 x double> undef, double %Vld4, i32 0
+  %V22 = insertelement <2 x double> %V21, double %Vld5, i32 1
+  %V23 = fmul <2 x double> %V20, %V22
+  %V24 = fadd <2 x double> %T, %V23
+  %V25 = insertelement <2 x double> %V19, double %Vld2, i32 1
+  %V26 = insertelement <2 x double> undef, double %Vld6, i32 0
+  %V27 = insertelement <2 x double> %V26, double %Vld6, i32 1
+  %V28 = fmul <2 x double> %V25, %V27
+  %V29 = fadd <2 x double> %T, %V28
+  %V30 = insertelement <2 x double> undef, double %Vld2, i32 0
+  %V31 = shufflevector <2 x double> %V30, <2 x double> undef, <2 x i32> zeroinitializer
+  %V32 = insertelement <2 x double> undef, double %Vld5, i32 0
+  %V33 = insertelement <2 x double> %V32, double %Vld6, i32 1
+  %V34 = fmul <2 x double> %V31, %V33
+  %V35 = fadd <2 x double> %T, %V34
+  %V36 = insertelement <2 x double> undef, double %Vld3, i32 0
+  %V37 = shufflevector <2 x double> %V36, <2 x double> undef, <2 x i32> zeroinitializer
+  %V38 = fmul <2 x double> %V37, %V22
+  %V39 = fadd <2 x double> %T, %V38
+  %Vmul37 = fmul double %Vld3, %Vld6
+  %Vadd38 = fadd double %Vmul37, %Vmul37
+
+  %VA0 = fadd <2 x double> %V24, %V29
+  %VA1 = fadd <2 x double> %VA0, %V35
+  %VA2 = fadd <2 x double> %VA1, %V39
+
+  %VE0 = extractelement <2 x double> %VA2, i32 0
+  %VS1 = fadd double %VE0, %Vadd38
+
+  ret double %VS1
+}
diff --git a/test/CodeGen/SystemZ/vec-move-22.ll b/test/CodeGen/SystemZ/vec-move-22.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2508a9e6da225e08d6a981644175f29d5fc801c3
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-move-22.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that a loaded value which is used both in a vector and scalar context
+; is not transformed to a vlrep + vlgvg.
+
+; CHECK-NOT: vlrep
+
+define void @fun(i64 %arg, i64** %Addr, <2 x i64*>* %Dst) {
+  %tmp10 = load i64*, i64** %Addr
+  store i64 %arg, i64* %tmp10
+  %tmp12 = insertelement <2 x i64*> undef, i64* %tmp10, i32 0
+  %tmp13 = insertelement <2 x i64*> %tmp12, i64* %tmp10, i32 1
+  store <2 x i64*> %tmp13, <2 x i64*>* %Dst
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/vec-trunc-to-i1.ll b/test/CodeGen/SystemZ/vec-trunc-to-i1.ll
index 77c93c6cda5bf43d6d6489e95481a19b66575c52..278f0bf2a30f6ef3ce0e5b228a7f52bf9edead4e 100644
--- a/test/CodeGen/SystemZ/vec-trunc-to-i1.ll
+++ b/test/CodeGen/SystemZ/vec-trunc-to-i1.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 ;
 ; Check that a widening truncate to a vector of i1 elements can be handled.
@@ -5,19 +6,14 @@
 define void @pr32275(<4 x i8> %B15) {
 ; CHECK-LABEL: pr32275:
 ; CHECK:       # %bb.0: # %BB
-; CHECK:         vlgvb %r0, %v24, 3
-; CHECK-NEXT:    vlgvb %r1, %v24, 1
-; CHECK-NEXT:    vlvgp [[REG1:%v[0-9]]], %r1, %r0
-; CHECK-NEXT:    vlgvb %r0, %v24, 0
-; CHECK-NEXT:    vlgvb [[REG3:%r[0-9]]], %v24, 2
-; CHECK-NEXT:    vrepif [[REG0:%v[0-9]]], 1
-; CHECK:       .LBB0_1:
-; CHECK-DAG:     vlr [[REG2:%v[0-9]]], [[REG1]]
-; CHECK-DAG:     vlvgf [[REG2]], %r0, 0
-; CHECK-NEXT:    vlvgf [[REG2]], [[REG3]], 2
-; CHECK-NEXT:    vn [[REG2]], [[REG2]], [[REG0]]
-; CHECK-NEXT:    vlgvf [[REG4:%r[0-9]]], [[REG2]], 3
-; CHECK-NEXT:    cijlh [[REG4]], 0, .LBB0_1
+; CHECK-NEXT:    vlgvb %r0, %v24, 3
+; CHECK-NEXT:    vlvgp %v0, %r0, %r0
+; CHECK-NEXT:    vrepif %v1, 1
+; CHECK-NEXT:    vn %v0, %v0, %v1
+; CHECK-NEXT:    vlgvf %r0, %v0, 3
+; CHECK-NEXT:  .LBB0_1: # %CF34
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cijlh %r0, 0, .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %CF36
 ; CHECK-NEXT:    br %r14
 BB:
diff --git a/test/CodeGen/Thumb/branchless-cmp.ll b/test/CodeGen/Thumb/branchless-cmp.ll
index ed34d630733c7f367f9f0b74f3cf0b1bf958899a..40c5b8853da9c3f397797da8109d56671434323e 100644
--- a/test/CodeGen/Thumb/branchless-cmp.ll
+++ b/test/CodeGen/Thumb/branchless-cmp.ll
@@ -74,23 +74,17 @@ entry:
 ; CHECK-NEXT: lsls	r0, r1, #2
 }
 
-; FIXME: This one hasn't changed actually
-; but could look like test3b
 define i32 @test4a(i32 %a, i32 %b) {
 entry:
   %cmp = icmp ne i32 %a, %b
   %cond = select i1 %cmp, i32 0, i32 4
   ret i32 %cond
 ; CHECK-LABEL: test4a:
-; CHECK: bb.0:
-; CHECK-NEXT:  cmp     r0, r1
-; CHECK-NEXT:  bne     .LBB6_2
-; CHECK-NEXT: bb.1:
-; CHECK-NEXT:  movs    r0, #4
-; CHECK-NEXT:  bx      lr
-; CHECK-NEXT: .LBB6_2:
-; CHECK-NEXT:  movs    r0, #0
-; CHECK-NEXT:  bx      lr
+; CHECK-NOT: b{{(ne)|(eq)}}
+; CHECK:      subs	r0, r0, r1
+; CHECK-NEXT: rsbs	r1, r0, #0
+; CHECK-NEXT: adcs	r1, r0
+; CHECK-NEXT: lsls	r0, r1, #2
 }
 
 define i32 @test4b(i32 %a, i32 %b) {
diff --git a/test/CodeGen/Thumb/ragreedy-implicit-def.ll b/test/CodeGen/Thumb/ragreedy-implicit-def.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d16b25e1bf74041ca6cb999849c4277b1e6cbf48
--- /dev/null
+++ b/test/CodeGen/Thumb/ragreedy-implicit-def.ll
@@ -0,0 +1,99 @@
+; REQUIRES: asserts
+; RUN: llc -mtriple=thumbv6m -regalloc=greedy -stats < %s 2>&1 | FileCheck %s
+
+; Undef incoming values to phis end up creating IMPLICIT_DEF values. If we don't
+; prefer them to be in a register then we get fewer spilled live ranges (6
+; compared to 7).
+; CHECK: 6 regalloc - Number of spilled live ranges
+
+declare i32 @otherfn(i32)
+define void @fn(i32 %val, i32* %ptr) {
+entry:
+  %gep1 = getelementptr i32, i32* %ptr, i32 0
+  %gep2 = getelementptr i32, i32* %ptr, i32 1
+  %gep3 = getelementptr i32, i32* %ptr, i32 2
+  %gep4 = getelementptr i32, i32* %ptr, i32 3
+  %gep5 = getelementptr i32, i32* %ptr, i32 4
+  %gep6 = getelementptr i32, i32* %ptr, i32 5
+  %gep7 = getelementptr i32, i32* %ptr, i32 6
+  %gep8 = getelementptr i32, i32* %ptr, i32 7
+  %cmp1 = icmp uge i32 %val, 3
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %val1 = load i32, i32* %gep1, align 4
+  %val2 = load i32, i32* %gep2, align 4
+  %val3 = load i32, i32* %gep3, align 4
+  %val4 = load i32, i32* %gep4, align 4
+  %val5 = load i32, i32* %gep5, align 4
+  %val6 = load i32, i32* %gep6, align 4
+  %val7 = load i32, i32* %gep7, align 4
+  %val8 = load i32, i32* %gep8, align 4
+  br label %then
+
+then:
+  %phi1a = phi i32 [ %val1, %if ], [ undef, %entry ]
+  %phi2a = phi i32 [ %val2, %if ], [ undef, %entry ]
+  %phi3a = phi i32 [ %val3, %if ], [ undef, %entry ]
+  %phi4a = phi i32 [ %val4, %if ], [ undef, %entry ]
+  %phi5a = phi i32 [ %val5, %if ], [ undef, %entry ]
+  %phi6a = phi i32 [ %val6, %if ], [ undef, %entry ]
+  %phi7a = phi i32 [ %val7, %if ], [ undef, %entry ]
+  %phi8a = phi i32 [ %val8, %if ], [ undef, %entry ]
+  %switchval = call i32 @otherfn(i32 %val)
+  switch i32 %switchval, label %default [
+    i32 0, label %case0
+    i32 1, label %case1
+    i32 5, label %case5
+    i32 6, label %case6
+    i32 7, label %case7
+    i32 8, label %case8
+  ]
+
+default:
+  br label %switchend
+
+case0:
+  br label %switchend
+
+case1:
+  br label %switchend
+
+case5:
+  br label %switchend
+
+case6:
+  br label %switchend
+
+case7:
+  br label %switchend
+
+case8:
+  br label %switchend
+
+switchend:
+  %phi1b = phi i32 [ 0, %default ], [ undef, %case0 ], [ undef, %case1 ], [ %phi1a, %case5 ], [ 1, %case6 ], [ 2, %case7 ], [ 1, %case8 ]
+  %phi2b = phi i32 [ 0, %default ], [ undef, %case0 ], [ undef, %case1 ], [ %phi2a, %case5 ], [ 2, %case6 ], [ 2, %case7 ], [ 1, %case8 ]
+  %phi3b = phi i32 [ 0, %default ], [ undef, %case0 ], [ undef, %case1 ], [ %phi3a, %case5 ], [ 3, %case6 ], [ 2, %case7 ], [ 1, %case8 ]
+  %phi4b = phi i32 [ 0, %default ], [ undef, %case0 ], [ undef, %case1 ], [ %phi4a, %case5 ], [ 4, %case6 ], [ 2, %case7 ], [ 1, %case8 ]
+  %phi5b = phi i32 [ 0, %default ], [ undef, %case0 ], [ undef, %case1 ], [ %phi5a, %case5 ], [ 5, %case6 ], [ 2, %case7 ], [ 1, %case8 ]
+  %phi6b = phi i32 [ 0, %default ], [ undef, %case0 ], [ undef, %case1 ], [ %phi6a, %case5 ], [ 6, %case6 ], [ 2, %case7 ], [ 1, %case8 ]
+  %phi7b = phi i32 [ 0, %default ], [ undef, %case0 ], [ undef, %case1 ], [ %phi7a, %case5 ], [ 7, %case6 ], [ 2, %case7 ], [ 1, %case8 ]
+  %phi8b = phi i32 [ 0, %default ], [ undef, %case0 ], [ undef, %case1 ], [ %phi8a, %case5 ], [ 8, %case6 ], [ 2, %case7 ], [ 1, %case8 ]
+  %cmp2 = icmp uge i32 %val, 4
+  br i1 %cmp2, label %if2, label %end
+
+if2:
+  store i32 %phi1b, i32* %gep1, align 4
+  store i32 %phi2b, i32* %gep2, align 4
+  store i32 %phi3b, i32* %gep3, align 4
+  store i32 %phi4b, i32* %gep4, align 4
+  store i32 %phi5b, i32* %gep5, align 4
+  store i32 %phi6b, i32* %gep6, align 4
+  store i32 %phi7b, i32* %gep7, align 4
+  store i32 %phi8b, i32* %gep8, align 4
+  br label %end
+
+end:
+  ret void
+}
diff --git a/test/CodeGen/Thumb2/high-reg-spill.mir b/test/CodeGen/Thumb2/high-reg-spill.mir
new file mode 100644
index 0000000000000000000000000000000000000000..d9bfdcafa382b31228e599eafbe7fe0ac64b0591
--- /dev/null
+++ b/test/CodeGen/Thumb2/high-reg-spill.mir
@@ -0,0 +1,50 @@
+# RUN: llc -run-pass regallocfast %s -o - | FileCheck %s
+
+# This test examines register allocation and spilling with Fast Register
+# Allocator. The test uses inline assembler that requests an input variable to
+# be loaded in a high register but at the same time has r12 marked as clobbered.
+# The allocator initially satisfies the load request by selecting r12 but then
+# needs to spill this register when it reaches the INLINEASM instruction and
+# notices its clobber definition.
+#
+# The test checks that the compiler is able to spill a register from the hGPR
+# class in Thumb2 by inserting the t2STRi12/t2LDRi12 instructions.
+
+--- |
+  ; ModuleID = 'test.ll'
+  source_filename = "test.c"
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv7m-none-unknown-eabi"
+
+  define dso_local void @constraint_h() {
+  entry:
+    %i = alloca i32, align 4
+    %0 = load i32, i32* %i, align 4
+    call void asm sideeffect "@ $0", "h,~{r12}"(i32 %0)
+    ret void
+  }
+
+...
+---
+name:            constraint_h
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: hgpr }
+  - { id: 1, class: tgpr }
+stack:
+  - { id: 0, name: i, size: 4, alignment: 4, stack-id: 0, local-offset: -4 }
+body:             |
+  bb.0.entry:
+    %1:tgpr = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i)
+    %0:hgpr = COPY %1
+    INLINEASM &"@ $0", 1, 589833, %0, 12, implicit-def early-clobber $r12
+    tBX_RET 14, $noreg
+
+...
+# CHECK: bb.0.entry:
+# CHECK-NEXT: renamable $r0 = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i)
+# CHECK-NEXT: renamable $r12 = COPY killed renamable $r0
+# CHECK-NEXT: t2STRi12 killed $r12, %stack.1, 0, 14, $noreg :: (store 4 into %stack.1)
+# CHECK-NEXT: $r8 = t2LDRi12 %stack.1, 0, 14, $noreg :: (load 4 from %stack.1)
+# CHECK-NEXT: INLINEASM &"@ $0", 1, 589833, killed renamable $r8, 12, implicit-def early-clobber $r12
+# CHECK-NEXT: tBX_RET 14, $noreg
diff --git a/test/CodeGen/WebAssembly/add-prototypes.ll b/test/CodeGen/WebAssembly/add-prototypes.ll
index 2b227678b0775e13f87d6c808c8f4a9530b40de2..583cadea03b7ff293338e4c1ac41f617908232a5 100644
--- a/test/CodeGen/WebAssembly/add-prototypes.ll
+++ b/test/CodeGen/WebAssembly/add-prototypes.ll
@@ -3,18 +3,42 @@
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
+; CHECK: @foo_addr = global i64 (i32)* @foo, align 8
 @foo_addr = global i64 (i32)* bitcast (i64 (...)* @foo to i64 (i32)*), align 8
 
-define void @bar(i32 %a) {
-entry:
+; CHECK: @foo_addr_i8 = global i8* bitcast (i64 (i32)* @foo to i8*), align 8
+@foo_addr_i8 = global i8* bitcast (i64 (...)* @foo to i8*), align 8
+
+; CHECK-LABEL: @call_foo
+; CHECK: %call = call i64 @foo(i32 42)
+define void @call_foo(i32 %a) {
   %call = call i64 bitcast (i64 (...)* @foo to i64 (i32)*)(i32 42)
   ret void
 }
 
-declare i64 @foo(...) #1
+; CHECK-LABEL: @call_foo_ptr
+; CHECK: %call = call i64 @foo(i32 43)
+define i64 @call_foo_ptr(i32 %a) {
+  %1 = bitcast i64 (...)* @foo to i64 (i32)*
+  %call = call i64 (i32) %1(i32 43)
+  ret i64 %call
+}
 
-attributes #1 = { "no-prototype" }
+; CHECK-LABEL: @to_intptr_inst
+; CHECK: ret i8* bitcast (i64 (i32)* @foo to i8*)
+define i8* @to_intptr_inst() {
+  %1 = bitcast i64 (...)* @foo to i8*
+  ret i8* %1
+}
+
+; CHECK-LABEL: @to_intptr_constexpr
+; CHECK: ret i8* bitcast (i64 (i32)* @foo to i8*)
+define i8* @to_intptr_constexpr() {
+  ret i8* bitcast (i64 (...)* @foo to i8*)
+}
 
-; CHECK: %call = call i64 @foo(i32 42)
 ; CHECK: declare i64 @foo(i32)
+declare i64 @foo(...) #1
+
 ; CHECK-NOT: attributes {{.*}} = { {{.*}}"no-prototype"{{.*}} }
+attributes #1 = { "no-prototype" }
diff --git a/test/CodeGen/WebAssembly/address-offsets.ll b/test/CodeGen/WebAssembly/address-offsets.ll
index bee3c28e60e2b27fc66ec44560e08c3b293b6e2b..86d6f15e2a8f4bd9ff3a926f1db5b876fbd84cbf 100644
--- a/test/CodeGen/WebAssembly/address-offsets.ll
+++ b/test/CodeGen/WebAssembly/address-offsets.ll
@@ -9,7 +9,7 @@ target triple = "wasm32-unknown-unknown"
 @g = external global [0 x i32], align 4
 
 ; CHECK-LABEL: load_test0:
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test0 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: i32.load  $push1=, g+40($pop0){{$}}
 ; CHECK-NEXT: return    $pop1{{$}}
@@ -19,7 +19,7 @@ define i32 @load_test0() {
 }
 
 ; CHECK-LABEL: load_test0_noinbounds:
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test0_noinbounds () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: i32.load  $push1=, g+40($pop0){{$}}
 ; CHECK-NEXT: return    $pop1{{$}}
@@ -33,8 +33,7 @@ define i32 @load_test0_noinbounds() {
 ; Likewise for stores.
 
 ; CHECK-LABEL: load_test1:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test1 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEX T: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.load  $push2=, g+40($pop1){{$}}
@@ -47,8 +46,7 @@ define i32 @load_test1(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test2:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test2 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEX T: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.load  $push2=, g+40($pop1){{$}}
@@ -61,8 +59,7 @@ define i32 @load_test2(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test3:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test3 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEX T: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.load  $push2=, g+40($pop1){{$}}
@@ -75,8 +72,7 @@ define i32 @load_test3(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test4:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test4 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEX T: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.load  $push2=, g+40($pop1){{$}}
@@ -88,8 +84,7 @@ define i32 @load_test4(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test5:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test5 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEX T: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.load  $push2=, g+40($pop1){{$}}
@@ -101,8 +96,7 @@ define i32 @load_test5(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test6:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test6 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEX T: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.load  $push2=, g+40($pop1){{$}}
@@ -115,8 +109,7 @@ define i32 @load_test6(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test7:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test7 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEX T: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.load  $push2=, g+40($pop1){{$}}
@@ -129,8 +122,7 @@ define i32 @load_test7(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test8:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test8 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEX T: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.load  $push2=, g+40($pop1){{$}}
@@ -143,7 +135,7 @@ define i32 @load_test8(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test9:
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test9 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: i32.load  $push1=, g-40($pop0){{$}}
 ; CHECK-NEXT: return    $pop1{{$}}
@@ -153,8 +145,7 @@ define i32 @load_test9() {
 }
 
 ; CHECK-LABEL: load_test10:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test10 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEXT: i32.const $push2=, g-40{{$}}
@@ -169,8 +160,7 @@ define i32 @load_test10(i32 %n) {
 }
 
 ; CHECK-LABEL: load_test11:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test11 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load  $push0=, 40($0){{$}}
 ; CHECK-NEXT: return    $pop0{{$}}
 define i32 @load_test11(i32* %p) {
@@ -180,8 +170,7 @@ define i32 @load_test11(i32* %p) {
 }
 
 ; CHECK-LABEL: load_test11_noinbounds:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test11_noinbounds (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 40{{$}}
 ; CHECK-NEXT: i32.add   $push1=, $0, $pop0{{$}}
 ; CHECK-NEXT: i32.load  $push2=, 0($pop1){{$}}
@@ -193,8 +182,7 @@ define i32 @load_test11_noinbounds(i32* %p) {
 }
 
 ; CHECK-LABEL: load_test12:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test12 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -210,8 +198,7 @@ define i32 @load_test12(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: load_test13:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test13 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -227,8 +214,7 @@ define i32 @load_test13(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: load_test14:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test14 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -242,8 +228,7 @@ define i32 @load_test14(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: load_test15:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test15 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -259,8 +244,7 @@ define i32 @load_test15(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: load_test16:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test16 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -276,8 +260,7 @@ define i32 @load_test16(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: load_test17:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test17 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -293,8 +276,7 @@ define i32 @load_test17(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: load_test18:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test18 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -308,8 +290,7 @@ define i32 @load_test18(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: load_test19:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test19 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -325,8 +306,7 @@ define i32 @load_test19(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: load_test20:
-; CHECK-NEXT: param     i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test20 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, -40{{$}}
 ; CHECK-NEXT: i32.add   $push1=, $0, $pop0{{$}}
 ; CHECK-NEXT: i32.load  $push2=, 0($pop1){{$}}
@@ -338,8 +318,7 @@ define i32 @load_test20(i32* %p) {
 }
 
 ; CHECK-LABEL: load_test21:
-; CHECK-NEXT: param     i32, i32{{$}}
-; CHECK-NEXT: result    i32{{$}}
+; CHECK-NEXT: .functype load_test21 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -355,7 +334,7 @@ define i32 @load_test21(i32* %p, i32 %n) {
 }
 
 ; CHECK-LABEL: store_test0:
-; CHECK-NEXT: param     i32{{$}}
+; CHECK-NEXT: .functype store_test0 (i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: i32.store g+40($pop0), $0{{$}}
 ; CHECK-NEXT: return{{$}}
@@ -365,7 +344,7 @@ define void @store_test0(i32 %i) {
 }
 
 ; CHECK-LABEL: store_test0_noinbounds:
-; CHECK-NEXT: param     i32{{$}}
+; CHECK-NEXT: .functype store_test0_noinbounds (i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: i32.store g+40($pop0), $0{{$}}
 ; CHECK-NEXT: return{{$}}
@@ -375,7 +354,7 @@ define void @store_test0_noinbounds(i32 %i) {
 }
 
 ; CHECK-LABEL: store_test1:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test1 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.store g+40($pop1), $1{{$}}
@@ -388,7 +367,7 @@ define void @store_test1(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test2:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test2 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.store g+40($pop1), $1{{$}}
@@ -401,7 +380,7 @@ define void @store_test2(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test3:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test3 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.store g+40($pop1), $1{{$}}
@@ -414,7 +393,7 @@ define void @store_test3(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test4:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test4 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.store g+40($pop1), $1{{$}}
@@ -426,7 +405,7 @@ define void @store_test4(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test5:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test5 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.store g+40($pop1), $1{{$}}
@@ -438,7 +417,7 @@ define void @store_test5(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test6:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test6 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.store g+40($pop1), $1{{$}}
@@ -451,7 +430,7 @@ define void @store_test6(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test7:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test7 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.store g+40($pop1), $1{{$}}
@@ -464,7 +443,7 @@ define void @store_test7(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test8:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test8 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEX T: i32.store g+40($pop1), $1{{$}}
@@ -477,7 +456,7 @@ define void @store_test8(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test9:
-; CHECK-NEXT: param     i32{{$}}
+; CHECK-NEXT: .functype store_test9 (i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: i32.store g-40($pop0), $0{{$}}
 ; CHECK-NEXT: return{{$}}
@@ -487,7 +466,7 @@ define void @store_test9(i32 %i) {
 }
 
 ; CHECK-LABEL: store_test10:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test10 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $0, $pop0{{$}}
 ; CHECK-NEXT: i32.const $push2=, g-40{{$}}
@@ -502,7 +481,7 @@ define void @store_test10(i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test11:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test11 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store 40($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_test11(i32* %p, i32 %i) {
@@ -512,7 +491,7 @@ define void @store_test11(i32* %p, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test11_noinbounds:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test11_noinbounds (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 40{{$}}
 ; CHECK-NEXT: i32.add   $push1=, $0, $pop0{{$}}
 ; CHECK-NEXT: i32.store 0($pop1), $1{{$}}
@@ -524,7 +503,7 @@ define void @store_test11_noinbounds(i32* %p, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test12:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test12 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -540,7 +519,7 @@ define void @store_test12(i32* %p, i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test13:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test13 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -556,7 +535,7 @@ define void @store_test13(i32* %p, i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test14:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test14 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -570,7 +549,7 @@ define void @store_test14(i32* %p, i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test15:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test15 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -586,7 +565,7 @@ define void @store_test15(i32* %p, i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test16:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test16 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -602,7 +581,7 @@ define void @store_test16(i32* %p, i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test17:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test17 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -618,7 +597,7 @@ define void @store_test17(i32* %p, i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test18:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test18 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -632,7 +611,7 @@ define void @store_test18(i32* %p, i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test19:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test19 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
@@ -648,7 +627,7 @@ define void @store_test19(i32* %p, i32 %n, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test20:
-; CHECK-NEXT: param     i32, i32{{$}}
+; CHECK-NEXT: .functype store_test20 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, -40{{$}}
 ; CHECK-NEXT: i32.add   $push1=, $0, $pop0{{$}}
 ; CHECK-NEXT: i32.store 0($pop1), $1{{$}}
@@ -660,7 +639,7 @@ define void @store_test20(i32* %p, i32 %i) {
 }
 
 ; CHECK-LABEL: store_test21:
-; CHECK-NEXT: param     i32, i32, i32{{$}}
+; CHECK-NEXT: .functype store_test21 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl   $push1=, $1, $pop0{{$}}
 ; CHECK-NEXT: i32.add   $push2=, $0, $pop1{{$}}
diff --git a/test/CodeGen/WebAssembly/atomic-mem-consistency.ll b/test/CodeGen/WebAssembly/atomic-mem-consistency.ll
index f15296dac2197d88df7f5d54638fd3a719302939..3a714370d84cf019eb047712d0208eeeffda6b97 100644
--- a/test/CodeGen/WebAssembly/atomic-mem-consistency.ll
+++ b/test/CodeGen/WebAssembly/atomic-mem-consistency.ll
@@ -53,7 +53,7 @@ define i32 @load_i32_seq_cst(i32 *%p) {
 ; The 'acquire' and 'acq_rel' orderings aren’t valid on store instructions.
 
 ; CHECK-LABEL: store_i32_unordered:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype store_i32_unordered (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_i32_unordered(i32 *%p, i32 %v) {
@@ -62,7 +62,7 @@ define void @store_i32_unordered(i32 *%p, i32 %v) {
 }
 
 ; CHECK-LABEL: store_i32_monotonic:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype store_i32_monotonic (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_i32_monotonic(i32 *%p, i32 %v) {
@@ -71,7 +71,7 @@ define void @store_i32_monotonic(i32 *%p, i32 %v) {
 }
 
 ; CHECK-LABEL: store_i32_release:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype store_i32_release (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_i32_release(i32 *%p, i32 %v) {
@@ -80,7 +80,7 @@ define void @store_i32_release(i32 *%p, i32 %v) {
 }
 
 ; CHECK-LABEL: store_i32_seq_cst:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype store_i32_seq_cst (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_i32_seq_cst(i32 *%p, i32 %v) {
@@ -96,7 +96,7 @@ define void @store_i32_seq_cst(i32 *%p, i32 %v) {
 ; The 'unordered' ordering is not valid on atomicrmw instructions.
 
 ; CHECK-LABEL: add_i32_monotonic:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_i32_monotonic (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add_i32_monotonic(i32* %p, i32 %v) {
@@ -105,7 +105,7 @@ define i32 @add_i32_monotonic(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: add_i32_acquire:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_i32_acquire (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add_i32_acquire(i32* %p, i32 %v) {
@@ -114,7 +114,7 @@ define i32 @add_i32_acquire(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: add_i32_release:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_i32_release (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add_i32_release(i32* %p, i32 %v) {
@@ -123,7 +123,7 @@ define i32 @add_i32_release(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: add_i32_acq_rel:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_i32_acq_rel (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add_i32_acq_rel(i32* %p, i32 %v) {
@@ -132,7 +132,7 @@ define i32 @add_i32_acq_rel(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: add_i32_seq_cst:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_i32_seq_cst (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add_i32_seq_cst(i32* %p, i32 %v) {
@@ -147,7 +147,7 @@ define i32 @add_i32_seq_cst(i32* %p, i32 %v) {
 ; that on success, and the failure ordering cannot be either release or acq_rel.
 
 ; CHECK-LABEL: cmpxchg_i32_monotonic_monotonic:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_monotonic_monotonic (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_monotonic_monotonic(i32* %p, i32 %exp, i32 %new) {
@@ -157,7 +157,7 @@ define i32 @cmpxchg_i32_monotonic_monotonic(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_acquire_monotonic:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_acquire_monotonic (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_acquire_monotonic(i32* %p, i32 %exp, i32 %new) {
@@ -167,7 +167,7 @@ define i32 @cmpxchg_i32_acquire_monotonic(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_release_monotonic:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_release_monotonic (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_release_monotonic(i32* %p, i32 %exp, i32 %new) {
@@ -177,7 +177,7 @@ define i32 @cmpxchg_i32_release_monotonic(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_acq_rel_monotonic:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_acq_rel_monotonic (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_acq_rel_monotonic(i32* %p, i32 %exp, i32 %new) {
@@ -187,7 +187,7 @@ define i32 @cmpxchg_i32_acq_rel_monotonic(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_seq_cst_monotonic:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_seq_cst_monotonic (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_seq_cst_monotonic(i32* %p, i32 %exp, i32 %new) {
@@ -197,7 +197,7 @@ define i32 @cmpxchg_i32_seq_cst_monotonic(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_acquire_acquire:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_acquire_acquire (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_acquire_acquire(i32* %p, i32 %exp, i32 %new) {
@@ -207,7 +207,7 @@ define i32 @cmpxchg_i32_acquire_acquire(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_release_acquire:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_release_acquire (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_release_acquire(i32* %p, i32 %exp, i32 %new) {
@@ -217,7 +217,7 @@ define i32 @cmpxchg_i32_release_acquire(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_acq_rel_acquire:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_acq_rel_acquire (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_acq_rel_acquire(i32* %p, i32 %exp, i32 %new) {
@@ -227,7 +227,7 @@ define i32 @cmpxchg_i32_acq_rel_acquire(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_seq_cst_acquire:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_seq_cst_acquire (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_seq_cst_acquire(i32* %p, i32 %exp, i32 %new) {
@@ -237,7 +237,7 @@ define i32 @cmpxchg_i32_seq_cst_acquire(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_seq_cst_seq_cst:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_seq_cst_seq_cst (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_seq_cst_seq_cst(i32* %p, i32 %exp, i32 %new) {
diff --git a/test/CodeGen/WebAssembly/atomic-rmw.ll b/test/CodeGen/WebAssembly/atomic-rmw.ll
index f0983e370d942c0737805020bdedd79a1cc3673a..d4e66801347d2e53a95416e030400ce08a2e12cf 100644
--- a/test/CodeGen/WebAssembly/atomic-rmw.ll
+++ b/test/CodeGen/WebAssembly/atomic-rmw.ll
@@ -11,7 +11,7 @@ target triple = "wasm32-unknown-unknown"
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: add_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add_i32(i32* %p, i32 %v) {
@@ -20,7 +20,7 @@ define i32 @add_i32(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: sub_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sub_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @sub_i32(i32* %p, i32 %v) {
@@ -29,7 +29,7 @@ define i32 @sub_i32(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: and_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype and_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @and_i32(i32* %p, i32 %v) {
@@ -38,7 +38,7 @@ define i32 @and_i32(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: or_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype or_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @or_i32(i32* %p, i32 %v) {
@@ -47,7 +47,7 @@ define i32 @or_i32(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xor_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xor_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @xor_i32(i32* %p, i32 %v) {
@@ -56,7 +56,7 @@ define i32 @xor_i32(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xchg_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xchg_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @xchg_i32(i32* %p, i32 %v) {
@@ -65,7 +65,7 @@ define i32 @xchg_i32(i32* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_loaded_value:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_loaded_value (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_loaded_value(i32* %p, i32 %exp, i32 %new) {
@@ -75,7 +75,7 @@ define i32 @cmpxchg_i32_loaded_value(i32* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i32_success:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_success (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: i32.eq $push1=, $pop0, $1{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -142,7 +142,7 @@ define i32 @umin_i32(i32* %p, i32 %v) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: add_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype add_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @add_i64(i64* %p, i64 %v) {
@@ -151,7 +151,7 @@ define i64 @add_i64(i64* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: sub_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sub_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @sub_i64(i64* %p, i64 %v) {
@@ -160,7 +160,7 @@ define i64 @sub_i64(i64* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: and_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype and_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @and_i64(i64* %p, i64 %v) {
@@ -169,7 +169,7 @@ define i64 @and_i64(i64* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: or_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype or_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @or_i64(i64* %p, i64 %v) {
@@ -178,7 +178,7 @@ define i64 @or_i64(i64* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: xor_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xor_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xor_i64(i64* %p, i64 %v) {
@@ -187,7 +187,7 @@ define i64 @xor_i64(i64* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: xchg_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xchg_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xchg_i64(i64* %p, i64 %v) {
@@ -196,7 +196,7 @@ define i64 @xchg_i64(i64* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: cmpxchg_i64_loaded_value:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_i64_loaded_value (i32, i64, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @cmpxchg_i64_loaded_value(i64* %p, i64 %exp, i64 %new) {
@@ -206,7 +206,7 @@ define i64 @cmpxchg_i64_loaded_value(i64* %p, i64 %exp, i64 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_i64_success:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_i64_success (i32, i64, i64) -> (i32){{$}}
 ; CHECK: i64.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: i64.eq $push1=, $pop0, $1{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -275,7 +275,7 @@ define i64 @umin_i64(i64* %p, i64 %v) {
 ; add
 
 ; CHECK-LABEL: add_sext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_sext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -287,7 +287,7 @@ define i32 @add_sext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: add_sext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_sext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -299,7 +299,7 @@ define i32 @add_sext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: add_sext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype add_sext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -311,7 +311,7 @@ define i64 @add_sext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: add_sext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype add_sext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -324,7 +324,7 @@ define i64 @add_sext_i16_i64(i16* %p, i64 %v) {
 
 ; 32->64 sext rmw gets selected as i32.atomic.rmw.add, i64_extend_s/i32
 ; CHECK-LABEL: add_sext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype add_sext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i32.wrap/i64 $push0=, $1{{$}}
 ; CHECK: i32.atomic.rmw.add $push1=, 0($0), $pop0{{$}}
 ; CHECK-NEXT: i64.extend_s/i32 $push2=, $pop1{{$}}
@@ -339,7 +339,7 @@ define i64 @add_sext_i32_i64(i32* %p, i64 %v) {
 ; sub
 
 ; CHECK-LABEL: sub_sext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sub_sext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -351,7 +351,7 @@ define i32 @sub_sext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: sub_sext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sub_sext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -363,7 +363,7 @@ define i32 @sub_sext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: sub_sext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sub_sext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -375,7 +375,7 @@ define i64 @sub_sext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: sub_sext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sub_sext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -388,7 +388,7 @@ define i64 @sub_sext_i16_i64(i16* %p, i64 %v) {
 
 ; 32->64 sext rmw gets selected as i32.atomic.rmw.sub, i64_extend_s/i32
 ; CHECK-LABEL: sub_sext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sub_sext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i32.wrap/i64 $push0=, $1
 ; CHECK: i32.atomic.rmw.sub $push1=, 0($0), $pop0{{$}}
 ; CHECK-NEXT: i64.extend_s/i32 $push2=, $pop1{{$}}
@@ -403,7 +403,7 @@ define i64 @sub_sext_i32_i64(i32* %p, i64 %v) {
 ; and
 
 ; CHECK-LABEL: and_sext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype and_sext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -415,7 +415,7 @@ define i32 @and_sext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: and_sext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype and_sext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -427,7 +427,7 @@ define i32 @and_sext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: and_sext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype and_sext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -439,7 +439,7 @@ define i64 @and_sext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: and_sext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype and_sext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -452,7 +452,7 @@ define i64 @and_sext_i16_i64(i16* %p, i64 %v) {
 
 ; 32->64 sext rmw gets selected as i32.atomic.rmw.and, i64_extend_s/i32
 ; CHECK-LABEL: and_sext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype and_sext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i32.wrap/i64 $push0=, $1{{$}}
 ; CHECK: i32.atomic.rmw.and $push1=, 0($0), $pop0{{$}}
 ; CHECK-NEXT: i64.extend_s/i32 $push2=, $pop1{{$}}
@@ -467,7 +467,7 @@ define i64 @and_sext_i32_i64(i32* %p, i64 %v) {
 ; or
 
 ; CHECK-LABEL: or_sext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype or_sext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -479,7 +479,7 @@ define i32 @or_sext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: or_sext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype or_sext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -491,7 +491,7 @@ define i32 @or_sext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: or_sext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype or_sext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -503,7 +503,7 @@ define i64 @or_sext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: or_sext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype or_sext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -516,7 +516,7 @@ define i64 @or_sext_i16_i64(i16* %p, i64 %v) {
 
 ; 32->64 sext rmw gets selected as i32.atomic.rmw.or, i64_extend_s/i32
 ; CHECK-LABEL: or_sext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype or_sext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i32.wrap/i64 $push0=, $1{{$}}
 ; CHECK: i32.atomic.rmw.or $push1=, 0($0), $pop0{{$}}
 ; CHECK-NEXT: i64.extend_s/i32 $push2=, $pop1{{$}}
@@ -531,7 +531,7 @@ define i64 @or_sext_i32_i64(i32* %p, i64 %v) {
 ; xor
 
 ; CHECK-LABEL: xor_sext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xor_sext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -543,7 +543,7 @@ define i32 @xor_sext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xor_sext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xor_sext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -555,7 +555,7 @@ define i32 @xor_sext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xor_sext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xor_sext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -567,7 +567,7 @@ define i64 @xor_sext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: xor_sext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xor_sext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -580,7 +580,7 @@ define i64 @xor_sext_i16_i64(i16* %p, i64 %v) {
 
 ; 32->64 sext rmw gets selected as i32.atomic.rmw.xor, i64_extend_s/i32
 ; CHECK-LABEL: xor_sext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xor_sext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i32.wrap/i64 $push0=, $1{{$}}
 ; CHECK: i32.atomic.rmw.xor $push1=, 0($0), $pop0{{$}}
 ; CHECK-NEXT: i64.extend_s/i32 $push2=, $pop1{{$}}
@@ -595,7 +595,7 @@ define i64 @xor_sext_i32_i64(i32* %p, i64 %v) {
 ; xchg
 
 ; CHECK-LABEL: xchg_sext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xchg_sext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -607,7 +607,7 @@ define i32 @xchg_sext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xchg_sext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xchg_sext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i32.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -619,7 +619,7 @@ define i32 @xchg_sext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xchg_sext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xchg_sext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -631,7 +631,7 @@ define i64 @xchg_sext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: xchg_sext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xchg_sext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: i64.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -644,7 +644,7 @@ define i64 @xchg_sext_i16_i64(i16* %p, i64 %v) {
 
 ; 32->64 sext rmw gets selected as i32.atomic.rmw.xchg, i64_extend_s/i32
 ; CHECK-LABEL: xchg_sext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xchg_sext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i32.wrap/i64 $push0=, $1{{$}}
 ; CHECK: i32.atomic.rmw.xchg $push1=, 0($0), $pop0{{$}}
 ; CHECK-NEXT: i64.extend_s/i32 $push2=, $pop1{{$}}
@@ -659,7 +659,7 @@ define i64 @xchg_sext_i32_i64(i32* %p, i64 %v) {
 ; cmpxchg
 
 ; CHECK-LABEL: cmpxchg_sext_i8_i32:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_sext_i8_i32 (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: i32.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -673,7 +673,7 @@ define i32 @cmpxchg_sext_i8_i32(i8* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_sext_i16_i32:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_sext_i16_i32 (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: i32.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -687,7 +687,7 @@ define i32 @cmpxchg_sext_i16_i32(i16* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_sext_i8_i64:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_sext_i8_i64 (i32, i64, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: i64.extend8_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -701,7 +701,7 @@ define i64 @cmpxchg_sext_i8_i64(i8* %p, i64 %exp, i64 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_sext_i16_i64:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_sext_i16_i64 (i32, i64, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: i64.extend16_s $push1=, $pop0{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -716,7 +716,7 @@ define i64 @cmpxchg_sext_i16_i64(i16* %p, i64 %exp, i64 %new) {
 
 ; 32->64 sext rmw gets selected as i32.atomic.rmw.cmpxchg, i64_extend_s/i32
 ; CHECK-LABEL: cmpxchg_sext_i32_i64:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_sext_i32_i64 (i32, i64, i64) -> (i64){{$}}
 ; CHECK: i32.wrap/i64 $push1=, $1{{$}}
 ; CHECK-NEXT: i32.wrap/i64 $push0=, $2{{$}}
 ; CHECK-NEXT: i32.atomic.rmw.cmpxchg $push2=, 0($0), $pop1, $pop0{{$}}
@@ -737,7 +737,7 @@ define i64 @cmpxchg_sext_i32_i64(i32* %p, i64 %exp, i64 %new) {
 ; nand
 
 ; CHECK-LABEL: nand_sext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype nand_sext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw8_u.cmpxchg
 ; CHECK: i32.extend8_s
@@ -749,7 +749,7 @@ define i32 @nand_sext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: nand_sext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype nand_sext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw16_u.cmpxchg
 ; CHECK: i32.extend16_s
@@ -762,7 +762,7 @@ define i32 @nand_sext_i16_i32(i16* %p, i32 %v) {
 
 ; FIXME Currently this cannot make use of i64.atomic.rmw8_u.cmpxchg
 ; CHECK-LABEL: nand_sext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype nand_sext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw8_u.cmpxchg
 ; CHECK: i64.extend_u/i32
@@ -776,7 +776,7 @@ define i64 @nand_sext_i8_i64(i8* %p, i64 %v) {
 
 ; FIXME Currently this cannot make use of i64.atomic.rmw16_u.cmpxchg
 ; CHECK-LABEL: nand_sext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype nand_sext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw16_u.cmpxchg
 ; CHECK: i64.extend_u/i32
@@ -790,7 +790,7 @@ define i64 @nand_sext_i16_i64(i16* %p, i64 %v) {
 
 ; 32->64 sext rmw gets selected as i32.atomic.rmw.nand, i64_extend_s/i32
 ; CHECK-LABEL: nand_sext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype nand_sext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw.cmpxchg
 ; CHECK: i64.extend_s/i32
@@ -808,7 +808,7 @@ define i64 @nand_sext_i32_i64(i32* %p, i64 %v) {
 ; add
 
 ; CHECK-LABEL: add_zext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_zext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add_zext_i8_i32(i8* %p, i32 %v) {
@@ -819,7 +819,7 @@ define i32 @add_zext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: add_zext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype add_zext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add_zext_i16_i32(i16* %p, i32 %v) {
@@ -830,7 +830,7 @@ define i32 @add_zext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: add_zext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype add_zext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @add_zext_i8_i64(i8* %p, i64 %v) {
@@ -841,7 +841,7 @@ define i64 @add_zext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: add_zext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype add_zext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @add_zext_i16_i64(i16* %p, i64 %v) {
@@ -852,7 +852,7 @@ define i64 @add_zext_i16_i64(i16* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: add_zext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype add_zext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw32_u.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @add_zext_i32_i64(i32* %p, i64 %v) {
@@ -865,7 +865,7 @@ define i64 @add_zext_i32_i64(i32* %p, i64 %v) {
 ; sub
 
 ; CHECK-LABEL: sub_zext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sub_zext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @sub_zext_i8_i32(i8* %p, i32 %v) {
@@ -876,7 +876,7 @@ define i32 @sub_zext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: sub_zext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sub_zext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @sub_zext_i16_i32(i16* %p, i32 %v) {
@@ -887,7 +887,7 @@ define i32 @sub_zext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: sub_zext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sub_zext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @sub_zext_i8_i64(i8* %p, i64 %v) {
@@ -898,7 +898,7 @@ define i64 @sub_zext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: sub_zext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sub_zext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @sub_zext_i16_i64(i16* %p, i64 %v) {
@@ -909,7 +909,7 @@ define i64 @sub_zext_i16_i64(i16* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: sub_zext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sub_zext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw32_u.sub $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @sub_zext_i32_i64(i32* %p, i64 %v) {
@@ -922,7 +922,7 @@ define i64 @sub_zext_i32_i64(i32* %p, i64 %v) {
 ; and
 
 ; CHECK-LABEL: and_zext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype and_zext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @and_zext_i8_i32(i8* %p, i32 %v) {
@@ -933,7 +933,7 @@ define i32 @and_zext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: and_zext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype and_zext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @and_zext_i16_i32(i16* %p, i32 %v) {
@@ -944,7 +944,7 @@ define i32 @and_zext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: and_zext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype and_zext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @and_zext_i8_i64(i8* %p, i64 %v) {
@@ -955,7 +955,7 @@ define i64 @and_zext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: and_zext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype and_zext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @and_zext_i16_i64(i16* %p, i64 %v) {
@@ -966,7 +966,7 @@ define i64 @and_zext_i16_i64(i16* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: and_zext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype and_zext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw32_u.and $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @and_zext_i32_i64(i32* %p, i64 %v) {
@@ -979,7 +979,7 @@ define i64 @and_zext_i32_i64(i32* %p, i64 %v) {
 ; or
 
 ; CHECK-LABEL: or_zext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype or_zext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @or_zext_i8_i32(i8* %p, i32 %v) {
@@ -990,7 +990,7 @@ define i32 @or_zext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: or_zext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype or_zext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @or_zext_i16_i32(i16* %p, i32 %v) {
@@ -1001,7 +1001,7 @@ define i32 @or_zext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: or_zext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype or_zext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @or_zext_i8_i64(i8* %p, i64 %v) {
@@ -1012,7 +1012,7 @@ define i64 @or_zext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: or_zext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype or_zext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @or_zext_i16_i64(i16* %p, i64 %v) {
@@ -1023,7 +1023,7 @@ define i64 @or_zext_i16_i64(i16* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: or_zext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype or_zext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw32_u.or $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @or_zext_i32_i64(i32* %p, i64 %v) {
@@ -1036,7 +1036,7 @@ define i64 @or_zext_i32_i64(i32* %p, i64 %v) {
 ; xor
 
 ; CHECK-LABEL: xor_zext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xor_zext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @xor_zext_i8_i32(i8* %p, i32 %v) {
@@ -1047,7 +1047,7 @@ define i32 @xor_zext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xor_zext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xor_zext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @xor_zext_i16_i32(i16* %p, i32 %v) {
@@ -1058,7 +1058,7 @@ define i32 @xor_zext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xor_zext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xor_zext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xor_zext_i8_i64(i8* %p, i64 %v) {
@@ -1069,7 +1069,7 @@ define i64 @xor_zext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: xor_zext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xor_zext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xor_zext_i16_i64(i16* %p, i64 %v) {
@@ -1080,7 +1080,7 @@ define i64 @xor_zext_i16_i64(i16* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: xor_zext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xor_zext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw32_u.xor $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xor_zext_i32_i64(i32* %p, i64 %v) {
@@ -1093,7 +1093,7 @@ define i64 @xor_zext_i32_i64(i32* %p, i64 %v) {
 ; xchg
 
 ; CHECK-LABEL: xchg_zext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xchg_zext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @xchg_zext_i8_i32(i8* %p, i32 %v) {
@@ -1104,7 +1104,7 @@ define i32 @xchg_zext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xchg_zext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype xchg_zext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @xchg_zext_i16_i32(i16* %p, i32 %v) {
@@ -1115,7 +1115,7 @@ define i32 @xchg_zext_i16_i32(i16* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: xchg_zext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xchg_zext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xchg_zext_i8_i64(i8* %p, i64 %v) {
@@ -1126,7 +1126,7 @@ define i64 @xchg_zext_i8_i64(i8* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: xchg_zext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xchg_zext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xchg_zext_i16_i64(i16* %p, i64 %v) {
@@ -1137,7 +1137,7 @@ define i64 @xchg_zext_i16_i64(i16* %p, i64 %v) {
 }
 
 ; CHECK-LABEL: xchg_zext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype xchg_zext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw32_u.xchg $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xchg_zext_i32_i64(i32* %p, i64 %v) {
@@ -1150,7 +1150,7 @@ define i64 @xchg_zext_i32_i64(i32* %p, i64 %v) {
 ; cmpxchg
 
 ; CHECK-LABEL: cmpxchg_zext_i8_i32:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_zext_i8_i32 (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw8_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_zext_i8_i32(i8* %p, i32 %exp, i32 %new) {
@@ -1163,7 +1163,7 @@ define i32 @cmpxchg_zext_i8_i32(i8* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_zext_i16_i32:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_zext_i16_i32 (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw16_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_zext_i16_i32(i16* %p, i32 %exp, i32 %new) {
@@ -1176,7 +1176,7 @@ define i32 @cmpxchg_zext_i16_i32(i16* %p, i32 %exp, i32 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_zext_i8_i64:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_zext_i8_i64 (i32, i64, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw8_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @cmpxchg_zext_i8_i64(i8* %p, i64 %exp, i64 %new) {
@@ -1189,7 +1189,7 @@ define i64 @cmpxchg_zext_i8_i64(i8* %p, i64 %exp, i64 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_zext_i16_i64:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_zext_i16_i64 (i32, i64, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw16_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @cmpxchg_zext_i16_i64(i16* %p, i64 %exp, i64 %new) {
@@ -1202,7 +1202,7 @@ define i64 @cmpxchg_zext_i16_i64(i16* %p, i64 %exp, i64 %new) {
 }
 
 ; CHECK-LABEL: cmpxchg_zext_i32_i64:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_zext_i32_i64 (i32, i64, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw32_u.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @cmpxchg_zext_i32_i64(i32* %p, i64 %exp, i64 %new) {
@@ -1220,7 +1220,7 @@ define i64 @cmpxchg_zext_i32_i64(i32* %p, i64 %exp, i64 %new) {
 ; nand
 
 ; CHECK-LABEL: nand_zext_i8_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype nand_zext_i8_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw8_u.cmpxchg
 define i32 @nand_zext_i8_i32(i8* %p, i32 %v) {
@@ -1231,7 +1231,7 @@ define i32 @nand_zext_i8_i32(i8* %p, i32 %v) {
 }
 
 ; CHECK-LABEL: nand_zext_i16_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype nand_zext_i16_i32 (i32, i32) -> (i32){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw16_u.cmpxchg
 define i32 @nand_zext_i16_i32(i16* %p, i32 %v) {
@@ -1243,7 +1243,7 @@ define i32 @nand_zext_i16_i32(i16* %p, i32 %v) {
 
 ; FIXME Currently this cannot make use of i64.atomic.rmw8_u.cmpxchg
 ; CHECK-LABEL: nand_zext_i8_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype nand_zext_i8_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw8_u.cmpxchg
 ; CHECK: i64.extend_u/i32
@@ -1256,7 +1256,7 @@ define i64 @nand_zext_i8_i64(i8* %p, i64 %v) {
 
 ; FIXME Currently this cannot make use of i64.atomic.rmw16_u.cmpxchg
 ; CHECK-LABEL: nand_zext_i16_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype nand_zext_i16_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw16_u.cmpxchg
 ; CHECK: i64.extend_u/i32
@@ -1269,7 +1269,7 @@ define i64 @nand_zext_i16_i64(i16* %p, i64 %v) {
 
 ; FIXME Currently this cannot make use of i64.atomic.rmw32_u.cmpxchg
 ; CHECK-LABEL: nand_zext_i32_i64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype nand_zext_i32_i64 (i32, i64) -> (i64){{$}}
 ; CHECK: loop
 ; CHECK: i32.atomic.rmw.cmpxchg
 ; CHECK: i64.extend_u/i32
diff --git a/test/CodeGen/WebAssembly/byval.ll b/test/CodeGen/WebAssembly/byval.ll
index c4f4035931496e305ab345770f3c1115f5daa965..0caa72015d7878da604077a0fc1fedba12bf6328 100644
--- a/test/CodeGen/WebAssembly/byval.ll
+++ b/test/CodeGen/WebAssembly/byval.ll
@@ -10,19 +10,16 @@ target triple = "wasm32-unknown-unknown"
 %BigStruct = type { double, double, double, double, double, double, double, double, double, double, double, i8, i8, i8 }
 %EmptyStruct = type { }
 
-%BigArray = type { [33 x i8] }
-
 declare void @ext_func(%SmallStruct*)
 declare void @ext_func_empty(%EmptyStruct* byval)
 declare void @ext_byval_func(%SmallStruct* byval)
 declare void @ext_byval_func_align8(%SmallStruct* byval align 8)
 declare void @ext_byval_func_alignedstruct(%AlignedStruct* byval)
-declare void @ext_byval_func_bigarray(%BigArray* byval)
 declare void @ext_byval_func_empty(%EmptyStruct* byval)
 
 ; CHECK-LABEL: byval_arg
 define void @byval_arg(%SmallStruct* %ptr) {
- ; CHECK: .param i32
+ ; CHECK: .functype byval_arg (i32) -> ()
  ; Subtract 16 from SP (SP is 16-byte aligned)
  ; CHECK-NEXT: get_global $push[[L2:.+]]=, __stack_pointer@GLOBAL
  ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
@@ -48,7 +45,7 @@ define void @byval_arg(%SmallStruct* %ptr) {
 
 ; CHECK-LABEL: byval_arg_align8
 define void @byval_arg_align8(%SmallStruct* %ptr) {
- ; CHECK: .param i32
+ ; CHECK: .functype byval_arg_align8 (i32) -> ()
  ; Don't check the entire SP sequence, just enough to get the alignment.
  ; CHECK: i32.const $push[[L1:.+]]=, 16
  ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, {{.+}}, $pop[[L1]]
@@ -67,7 +64,7 @@ define void @byval_arg_align8(%SmallStruct* %ptr) {
 
 ; CHECK-LABEL: byval_arg_double
 define void @byval_arg_double(%AlignedStruct* %ptr) {
- ; CHECK: .param i32
+ ; CHECK: .functype byval_arg_double (i32) -> ()
  ; Subtract 16 from SP (SP is 16-byte aligned)
  ; CHECK: i32.const $push[[L1:.+]]=, 16
  ; CHECK-NEXT: i32.sub $push[[L14:.+]]=, {{.+}}, $pop[[L1]]
@@ -85,7 +82,7 @@ define void @byval_arg_double(%AlignedStruct* %ptr) {
 
 ; CHECK-LABEL: byval_param
 define void @byval_param(%SmallStruct* byval align 32 %ptr) {
- ; CHECK: .param i32
+ ; CHECK: .functype byval_param (i32) -> ()
  ; %ptr is just a pointer to a struct, so pass it directly through
  ; CHECK: call ext_func@FUNCTION, $0
  call void @ext_func(%SmallStruct* %ptr)
@@ -94,7 +91,7 @@ define void @byval_param(%SmallStruct* byval align 32 %ptr) {
 
 ; CHECK-LABEL: byval_empty_caller
 define void @byval_empty_caller(%EmptyStruct* %ptr) {
- ; CHECK: .param i32
+ ; CHECK: .functype byval_empty_caller (i32) -> ()
  ; CHECK: call ext_byval_func_empty@FUNCTION, $0
  call void @ext_byval_func_empty(%EmptyStruct* byval %ptr)
  ret void
@@ -102,7 +99,7 @@ define void @byval_empty_caller(%EmptyStruct* %ptr) {
 
 ; CHECK-LABEL: byval_empty_callee
 define void @byval_empty_callee(%EmptyStruct* byval %ptr) {
- ; CHECK: .param i32
+ ; CHECK: .functype byval_empty_callee (i32) -> ()
  ; CHECK: call ext_func_empty@FUNCTION, $0
  call void @ext_func_empty(%EmptyStruct* %ptr)
  ret void
diff --git a/test/CodeGen/WebAssembly/call.ll b/test/CodeGen/WebAssembly/call.ll
index eaa583f8a02e7d71ea58579e6407a3d5779cb3a1..3a849739e76ec6b18ed8443371c786b9c208711f 100644
--- a/test/CodeGen/WebAssembly/call.ll
+++ b/test/CodeGen/WebAssembly/call.ll
@@ -16,7 +16,7 @@ declare <16 x i8> @v128_nullary()
 declare void @void_nullary()
 
 ; CHECK-LABEL: call_i32_nullary:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype call_i32_nullary () -> (i32){{$}}
 ; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @call_i32_nullary() {
@@ -25,7 +25,7 @@ define i32 @call_i32_nullary() {
 }
 
 ; CHECK-LABEL: call_i64_nullary:
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype call_i64_nullary () -> (i64){{$}}
 ; CHECK-NEXT: {{^}} i64.call $push[[NUM:[0-9]+]]=, i64_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @call_i64_nullary() {
@@ -34,7 +34,7 @@ define i64 @call_i64_nullary() {
 }
 
 ; CHECK-LABEL: call_float_nullary:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype call_float_nullary () -> (f32){{$}}
 ; CHECK-NEXT: {{^}} f32.call $push[[NUM:[0-9]+]]=, float_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @call_float_nullary() {
@@ -43,7 +43,7 @@ define float @call_float_nullary() {
 }
 
 ; CHECK-LABEL: call_double_nullary:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype call_double_nullary () -> (f64){{$}}
 ; CHECK-NEXT: {{^}} f64.call $push[[NUM:[0-9]+]]=, double_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @call_double_nullary() {
@@ -52,7 +52,7 @@ define double @call_double_nullary() {
 }
 
 ; CHECK-LABEL: call_v128_nullary:
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype call_v128_nullary () -> (v128){{$}}
 ; CHECK-NEXT: {{^}} v128.call $push[[NUM:[0-9]+]]=, v128_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define <16 x i8> @call_v128_nullary() {
@@ -61,6 +61,7 @@ define <16 x i8> @call_v128_nullary() {
 }
 
 ; CHECK-LABEL: call_void_nullary:
+; CHECK-NEXT: .functype call_void_nullary () -> (){{$}}
 ; CHECK-NEXT: {{^}} call void_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @call_void_nullary() {
@@ -69,8 +70,7 @@ define void @call_void_nullary() {
 }
 
 ; CHECK-LABEL: call_i32_unary:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype call_i32_unary (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_unary@FUNCTION, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -80,8 +80,7 @@ define i32 @call_i32_unary(i32 %a) {
 }
 
 ; CHECK-LABEL: call_i32_binary:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype call_i32_binary (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_binary@FUNCTION, $pop[[L0]], $pop[[L1]]{{$}}
@@ -92,7 +91,7 @@ define i32 @call_i32_binary(i32 %a, i32 %b) {
 }
 
 ; CHECK-LABEL: call_indirect_void:
-; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .functype call_indirect_void (i32) -> (){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: {{^}} call_indirect $pop[[L0]]{{$}}
 ; CHECK-NEXT: return{{$}}
@@ -102,8 +101,7 @@ define void @call_indirect_void(void ()* %callee) {
 }
 
 ; CHECK-LABEL: call_indirect_i32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype call_indirect_i32 (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: {{^}} i32.call_indirect $push[[NUM:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -113,8 +111,7 @@ define i32 @call_indirect_i32(i32 ()* %callee) {
 }
 
 ; CHECK-LABEL: call_indirect_i64:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype call_indirect_i64 (i32) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: {{^}} i64.call_indirect $push[[NUM:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -124,8 +121,7 @@ define i64 @call_indirect_i64(i64 ()* %callee) {
 }
 
 ; CHECK-LABEL: call_indirect_float:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype call_indirect_float (i32) -> (f32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: {{^}} f32.call_indirect $push[[NUM:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -135,8 +131,7 @@ define float @call_indirect_float(float ()* %callee) {
 }
 
 ; CHECK-LABEL: call_indirect_double:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype call_indirect_double (i32) -> (f64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: {{^}} f64.call_indirect $push[[NUM:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -146,8 +141,7 @@ define double @call_indirect_double(double ()* %callee) {
 }
 
 ; CHECK-LABEL: call_indirect_v128:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype call_indirect_v128 (i32) -> (v128){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: {{^}} v128.call_indirect $push[[NUM:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -157,7 +151,7 @@ define <16 x i8> @call_indirect_v128(<16 x i8> ()* %callee) {
 }
 
 ; CHECK-LABEL: call_indirect_arg:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype call_indirect_arg (i32, i32) -> (){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: {{^}} call_indirect $pop[[L0]], $pop[[L1]]{{$}}
@@ -168,7 +162,7 @@ define void @call_indirect_arg(void (i32)* %callee, i32 %arg) {
 }
 
 ; CHECK-LABEL: call_indirect_arg_2:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype call_indirect_arg_2 (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 2{{$}}
 ; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
@@ -181,6 +175,7 @@ define void @call_indirect_arg_2(i32 (i32, i32)* %callee, i32 %arg, i32 %arg2) {
 }
 
 ; CHECK-LABEL: tail_call_void_nullary:
+; CHECK-NEXT: .functype tail_call_void_nullary () -> (){{$}}
 ; CHECK-NEXT: {{^}} call void_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @tail_call_void_nullary() {
@@ -189,6 +184,7 @@ define void @tail_call_void_nullary() {
 }
 
 ; CHECK-LABEL: fastcc_tail_call_void_nullary:
+; CHECK-NEXT: .functype fastcc_tail_call_void_nullary () -> (){{$}}
 ; CHECK-NEXT: {{^}} call void_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @fastcc_tail_call_void_nullary() {
@@ -197,6 +193,7 @@ define void @fastcc_tail_call_void_nullary() {
 }
 
 ; CHECK-LABEL: coldcc_tail_call_void_nullary:
+; CHECK-NEXT: .functype coldcc_tail_call_void_nullary () -> (){{$}}
 ; CHECK-NEXT: {{^}} call void_nullary@FUNCTION{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @coldcc_tail_call_void_nullary() {
@@ -205,6 +202,7 @@ define void @coldcc_tail_call_void_nullary() {
 }
 
 ; CHECK-LABEL: call_constexpr:
+; CHECK-NEXT: .functype call_constexpr () -> (){{$}}
 ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}}
 ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 3{{$}}
 ; CHECK-NEXT: call .Lvararg_func_bitcast@FUNCTION, $pop[[L0]], $pop[[L1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll
index 82cfa4cd60f1ab55c79aacdcd2870732005b7716..1ca5295bd936b45a900f9381ce6f7186e93f5fd5 100644
--- a/test/CodeGen/WebAssembly/cfg-stackify.ll
+++ b/test/CodeGen/WebAssembly/cfg-stackify.ll
@@ -514,7 +514,7 @@ if.end:
 ; Test switch lowering and block placement.
 
 ; CHECK-LABEL: test4:
-; CHECK-NEXT: .param       i32{{$}}
+; CHECK-NEXT: .functype test4 (i32) -> (){{$}}
 ; CHECK:      block   {{$}}
 ; CHECK-NEXT: block   {{$}}
 ; CHECK:      br_if       0, $pop{{[0-9]+}}{{$}}
@@ -532,7 +532,7 @@ if.end:
 ; CHECK-NEXT: end_block{{$}}
 ; CHECK-NEXT: return{{$}}
 ; OPT-LABEL: test4:
-; OPT-NEXT: .param       i32{{$}}
+; OPT-NEXT: .functype test4 (i32) -> (){{$}}
 ; OPT:      block   {{$}}
 ; OPT-NEXT: block   {{$}}
 ; OPT:      br_if       0, $pop{{[0-9]+}}{{$}}
@@ -1146,7 +1146,7 @@ bb7:
 ; optnone to disable optimizations to test this case.
 
 ; CHECK-LABEL: test13:
-; CHECK-NEXT:  block   {{$}}
+; CHECK:       block   {{$}}
 ; CHECK-NEXT:  block   {{$}}
 ; CHECK:       br_if 0, $pop0{{$}}
 ; CHECK:       block   {{$}}
@@ -1162,7 +1162,7 @@ bb7:
 ; CHECK-NEXT:  end_block{{$}}
 ; CHECK-NEXT:  unreachable{{$}}
 ; OPT-LABEL: test13:
-; OPT-NEXT:  block   {{$}}
+; OPT:       block   {{$}}
 ; OPT-NEXT:  block   {{$}}
 ; OPT:       br_if 0, $pop0{{$}}
 ; OPT:       block   {{$}}
@@ -1198,7 +1198,7 @@ bb5:
 ; before the loop for the second.
 
 ; CHECK-LABEL: test14:
-; CHECK-NEXT: .LBB23_1:{{$}}
+; CHECK:      .LBB23_1:{{$}}
 ; CHECK-NEXT:     loop    {{$}}
 ; CHECK-NEXT:     i32.const   $push0=, 0{{$}}
 ; CHECK-NEXT:     br_if       0, $pop0{{$}}
diff --git a/test/CodeGen/WebAssembly/comparisons-f32.ll b/test/CodeGen/WebAssembly/comparisons-f32.ll
index 29a952660a720acdbe26a24fd2d12de5b00b73fd..7ccfe5e8a97aaf8ff4fd0766934ad8c87552ec7e 100644
--- a/test/CodeGen/WebAssembly/comparisons-f32.ll
+++ b/test/CodeGen/WebAssembly/comparisons-f32.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: ord_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ord_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: f32.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -24,8 +23,7 @@ define i32 @ord_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: uno_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype uno_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: f32.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -41,8 +39,7 @@ define i32 @uno_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: oeq_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype oeq_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -102,8 +99,7 @@ define i32 @oge_f32(float %x, float %y) {
 ; These simply rely on SDAG's Expand cond code action.
 
 ; CHECK-LABEL: ueq_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ueq_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -123,8 +119,7 @@ define i32 @ueq_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: one_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype one_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -144,8 +139,7 @@ define i32 @one_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: ult_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ult_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.ge $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -159,8 +153,7 @@ define i32 @ult_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: ule_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ule_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -174,8 +167,7 @@ define i32 @ule_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: ugt_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ugt_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.le $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -189,8 +181,7 @@ define i32 @ugt_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: uge_f32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype uge_f32 (f32, f32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.lt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/comparisons-f64.ll b/test/CodeGen/WebAssembly/comparisons-f64.ll
index d4c6b2b4716c4a6b7b48b3f3891d3cea30625360..46b24db67c1c43ca7a39c8add80400ec394f4927 100644
--- a/test/CodeGen/WebAssembly/comparisons-f64.ll
+++ b/test/CodeGen/WebAssembly/comparisons-f64.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: ord_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ord_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: f64.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -24,8 +23,7 @@ define i32 @ord_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: uno_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype uno_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: f64.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -41,8 +39,7 @@ define i32 @uno_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: oeq_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype oeq_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -101,8 +98,7 @@ define i32 @oge_f64(double %x, double %y) {
 ; Expanded comparisons, which also check for NaN.
 
 ; CHECK-LABEL: ueq_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ueq_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -122,8 +118,7 @@ define i32 @ueq_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: one_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype one_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -143,8 +138,7 @@ define i32 @one_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: ult_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ult_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.ge $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -158,8 +152,7 @@ define i32 @ult_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: ule_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ule_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -173,8 +166,7 @@ define i32 @ule_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: ugt_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ugt_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.le $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -188,8 +180,7 @@ define i32 @ugt_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: uge_f64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype uge_f64 (f64, f64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.lt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/comparisons-i32.ll b/test/CodeGen/WebAssembly/comparisons-i32.ll
index 6760b3621c614905d6e60b44a50c3920226fb940..e7d4e1b04a4e1d159ff6a33d7238e7dc54db38ae 100644
--- a/test/CodeGen/WebAssembly/comparisons-i32.ll
+++ b/test/CodeGen/WebAssembly/comparisons-i32.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: eq_i32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype eq_i32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/comparisons-i64.ll b/test/CodeGen/WebAssembly/comparisons-i64.ll
index 5bef748788642723e56a590ccb9975f401aef394..ffd460fe9abd14100ae479e2a134ac17ae3592e8 100644
--- a/test/CodeGen/WebAssembly/comparisons-i64.ll
+++ b/test/CodeGen/WebAssembly/comparisons-i64.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: eq_i64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype eq_i64 (i64, i64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/conv-trap.ll b/test/CodeGen/WebAssembly/conv-trap.ll
index 3ca742b1df47d9993f2dfab52ba6da68956a2b19..ca87f6b8dd94bd7522c13d72e908f6ee527092f1 100644
--- a/test/CodeGen/WebAssembly/conv-trap.ll
+++ b/test/CodeGen/WebAssembly/conv-trap.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: i32_trunc_s_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_s_f32 (f32) -> (i32){{$}}
 ; CHECK-NEXT: block
 ; CHECK-NEXT: f32.abs $push[[ABS:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: f32.const $push[[LIMIT:[0-9]+]]=, 0x1p31{{$}}
@@ -26,8 +25,7 @@ define i32 @i32_trunc_s_f32(float %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_u_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_u_f32 (f32) -> (i32){{$}}
 ; CHECK-NEXT: block
 ; CHECK-NEXT: f32.const $push[[LIMIT:[0-9]+]]=, 0x1p32{{$}}
 ; CHECK-NEXT: f32.lt $push[[LT:[0-9]+]]=, $0, $pop[[LIMIT]]{{$}}
@@ -47,8 +45,7 @@ define i32 @i32_trunc_u_f32(float %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_s_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_s_f64 (f64) -> (i32){{$}}
 ; CHECK-NEXT: block
 ; CHECK-NEXT: f64.abs $push[[ABS:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: f64.const $push[[LIMIT:[0-9]+]]=, 0x1p31{{$}}
@@ -66,8 +63,7 @@ define i32 @i32_trunc_s_f64(double %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_u_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_u_f64 (f64) -> (i32){{$}}
 ; CHECK-NEXT: block
 ; CHECK-NEXT: f64.const $push[[LIMIT:[0-9]+]]=, 0x1p32{{$}}
 ; CHECK-NEXT: f64.lt $push[[LT:[0-9]+]]=, $0, $pop[[LIMIT]]{{$}}
@@ -87,8 +83,7 @@ define i32 @i32_trunc_u_f64(double %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_s_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_s_f32 (f32) -> (i64){{$}}
 ; CHECK-NEXT: block
 ; CHECK-NEXT: f32.abs $push[[ABS:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: f32.const $push[[LIMIT:[0-9]+]]=, 0x1p63{{$}}
@@ -106,8 +101,7 @@ define i64 @i64_trunc_s_f32(float %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_u_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_u_f32 (f32) -> (i64){{$}}
 ; CHECK-NEXT: block
 ; CHECK-NEXT: f32.const $push[[LIMIT:[0-9]+]]=, 0x1p64{{$}}
 ; CHECK-NEXT: f32.lt $push[[LT:[0-9]+]]=, $0, $pop[[LIMIT]]{{$}}
@@ -127,8 +121,7 @@ define i64 @i64_trunc_u_f32(float %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_s_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_s_f64 (f64) -> (i64){{$}}
 ; CHECK-NEXT: block
 ; CHECK-NEXT: f64.abs $push[[ABS:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: f64.const $push[[LIMIT:[0-9]+]]=, 0x1p63{{$}}
@@ -146,8 +139,7 @@ define i64 @i64_trunc_s_f64(double %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_u_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_u_f64 (f64) -> (i64){{$}}
 ; CHECK-NEXT: block
 ; CHECK-NEXT: f64.const $push[[LIMIT:[0-9]+]]=, 0x1p64{{$}}
 ; CHECK-NEXT: f64.lt $push[[LT:[0-9]+]]=, $0, $pop[[LIMIT]]{{$}}
diff --git a/test/CodeGen/WebAssembly/conv.ll b/test/CodeGen/WebAssembly/conv.ll
index ea1ef9737c0c050eb0e22ba68d5ba4ae6579a135..8b729a67194bd5e54287a563679cc1a493f7f1b6 100644
--- a/test/CodeGen/WebAssembly/conv.ll
+++ b/test/CodeGen/WebAssembly/conv.ll
@@ -6,8 +6,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: i32_wrap_i64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_wrap_i64 (i64) -> (i32){{$}}
 ; CHECK-NEXT: i32.wrap/i64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @i32_wrap_i64(i64 %x) {
@@ -16,8 +15,7 @@ define i32 @i32_wrap_i64(i64 %x) {
 }
 
 ; CHECK-LABEL: i64_extend_s_i32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_extend_s_i32 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.extend_s/i32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @i64_extend_s_i32(i32 %x) {
@@ -26,8 +24,7 @@ define i64 @i64_extend_s_i32(i32 %x) {
 }
 
 ; CHECK-LABEL: i64_extend_u_i32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_extend_u_i32 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.extend_u/i32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @i64_extend_u_i32(i32 %x) {
@@ -36,8 +33,7 @@ define i64 @i64_extend_u_i32(i32 %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_s_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_s_f32 (f32) -> (i32){{$}}
 ; CHECK-NEXT: i32.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @i32_trunc_s_f32(float %x) {
@@ -46,8 +42,7 @@ define i32 @i32_trunc_s_f32(float %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_sat_s_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_sat_s_f32 (f32) -> (i32){{$}}
 ; CHECK-NEXT: i32.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 declare i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float)
@@ -57,8 +52,7 @@ define i32 @i32_trunc_sat_s_f32(float %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_u_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_u_f32 (f32) -> (i32){{$}}
 ; CHECK-NEXT: i32.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @i32_trunc_u_f32(float %x) {
@@ -67,8 +61,7 @@ define i32 @i32_trunc_u_f32(float %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_sat_u_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_sat_u_f32 (f32) -> (i32){{$}}
 ; CHECK-NEXT: i32.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float)
@@ -78,8 +71,7 @@ define i32 @i32_trunc_sat_u_f32(float %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_s_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_s_f64 (f64) -> (i32){{$}}
 ; CHECK-NEXT: i32.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @i32_trunc_s_f64(double %x) {
@@ -88,8 +80,7 @@ define i32 @i32_trunc_s_f64(double %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_sat_s_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_sat_s_f64 (f64) -> (i32){{$}}
 ; CHECK-NEXT: i32.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 declare i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double)
@@ -99,8 +90,7 @@ define i32 @i32_trunc_sat_s_f64(double %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_u_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_u_f64 (f64) -> (i32){{$}}
 ; CHECK-NEXT: i32.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @i32_trunc_u_f64(double %x) {
@@ -109,8 +99,7 @@ define i32 @i32_trunc_u_f64(double %x) {
 }
 
 ; CHECK-LABEL: i32_trunc_sat_u_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_trunc_sat_u_f64 (f64) -> (i32){{$}}
 ; CHECK-NEXT: i32.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double)
@@ -120,8 +109,7 @@ define i32 @i32_trunc_sat_u_f64(double %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_s_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_s_f32 (f32) -> (i64){{$}}
 ; CHECK-NEXT: i64.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @i64_trunc_s_f32(float %x) {
@@ -130,8 +118,7 @@ define i64 @i64_trunc_s_f32(float %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_sat_s_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_sat_s_f32 (f32) -> (i64){{$}}
 ; CHECK-NEXT: i64.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 declare i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float)
@@ -141,8 +128,7 @@ define i64 @i64_trunc_sat_s_f32(float %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_u_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_u_f32 (f32) -> (i64){{$}}
 ; CHECK-NEXT: i64.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @i64_trunc_u_f32(float %x) {
@@ -151,8 +137,7 @@ define i64 @i64_trunc_u_f32(float %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_sat_u_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_sat_u_f32 (f32) -> (i64){{$}}
 ; CHECK-NEXT: i64.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float)
@@ -162,8 +147,7 @@ define i64 @i64_trunc_sat_u_f32(float %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_s_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_s_f64 (f64) -> (i64){{$}}
 ; CHECK-NEXT: i64.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @i64_trunc_s_f64(double %x) {
@@ -172,8 +156,7 @@ define i64 @i64_trunc_s_f64(double %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_sat_s_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_sat_s_f64 (f64) -> (i64){{$}}
 ; CHECK-NEXT: i64.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 declare i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double)
@@ -183,8 +166,7 @@ define i64 @i64_trunc_sat_s_f64(double %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_u_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_u_f64 (f64) -> (i64){{$}}
 ; CHECK-NEXT: i64.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @i64_trunc_u_f64(double %x) {
@@ -193,8 +175,7 @@ define i64 @i64_trunc_u_f64(double %x) {
 }
 
 ; CHECK-LABEL: i64_trunc_sat_u_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_trunc_sat_u_f64 (f64) -> (i64){{$}}
 ; CHECK-NEXT: i64.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double)
@@ -204,8 +185,7 @@ define i64 @i64_trunc_sat_u_f64(double %x) {
 }
 
 ; CHECK-LABEL: f32_convert_s_i32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype f32_convert_s_i32 (i32) -> (f32){{$}}
 ; CHECK-NEXT: f32.convert_s/i32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @f32_convert_s_i32(i32 %x) {
@@ -214,8 +194,7 @@ define float @f32_convert_s_i32(i32 %x) {
 }
 
 ; CHECK-LABEL: f32_convert_u_i32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype f32_convert_u_i32 (i32) -> (f32){{$}}
 ; CHECK-NEXT: f32.convert_u/i32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @f32_convert_u_i32(i32 %x) {
@@ -224,8 +203,7 @@ define float @f32_convert_u_i32(i32 %x) {
 }
 
 ; CHECK-LABEL: f64_convert_s_i32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype f64_convert_s_i32 (i32) -> (f64){{$}}
 ; CHECK-NEXT: f64.convert_s/i32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @f64_convert_s_i32(i32 %x) {
@@ -234,8 +212,7 @@ define double @f64_convert_s_i32(i32 %x) {
 }
 
 ; CHECK-LABEL: f64_convert_u_i32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype f64_convert_u_i32 (i32) -> (f64){{$}}
 ; CHECK-NEXT: f64.convert_u/i32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @f64_convert_u_i32(i32 %x) {
@@ -244,8 +221,7 @@ define double @f64_convert_u_i32(i32 %x) {
 }
 
 ; CHECK-LABEL: f32_convert_s_i64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype f32_convert_s_i64 (i64) -> (f32){{$}}
 ; CHECK-NEXT: f32.convert_s/i64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @f32_convert_s_i64(i64 %x) {
@@ -254,8 +230,7 @@ define float @f32_convert_s_i64(i64 %x) {
 }
 
 ; CHECK-LABEL: f32_convert_u_i64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype f32_convert_u_i64 (i64) -> (f32){{$}}
 ; CHECK-NEXT: f32.convert_u/i64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @f32_convert_u_i64(i64 %x) {
@@ -264,8 +239,7 @@ define float @f32_convert_u_i64(i64 %x) {
 }
 
 ; CHECK-LABEL: f64_convert_s_i64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype f64_convert_s_i64 (i64) -> (f64){{$}}
 ; CHECK-NEXT: f64.convert_s/i64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @f64_convert_s_i64(i64 %x) {
@@ -274,8 +248,7 @@ define double @f64_convert_s_i64(i64 %x) {
 }
 
 ; CHECK-LABEL: f64_convert_u_i64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype f64_convert_u_i64 (i64) -> (f64){{$}}
 ; CHECK-NEXT: f64.convert_u/i64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @f64_convert_u_i64(i64 %x) {
@@ -284,8 +257,7 @@ define double @f64_convert_u_i64(i64 %x) {
 }
 
 ; CHECK-LABEL: f64_promote_f32:
-; CHECK-NEXT: .param f32{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype f64_promote_f32 (f32) -> (f64){{$}}
 ; CHECK-NEXT: f64.promote/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @f64_promote_f32(float %x) {
@@ -294,8 +266,7 @@ define double @f64_promote_f32(float %x) {
 }
 
 ; CHECK-LABEL: f32_demote_f64:
-; CHECK-NEXT: .param f64{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype f32_demote_f64 (f64) -> (f32){{$}}
 ; CHECK-NEXT: f32.demote/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @f32_demote_f64(double %x) {
diff --git a/test/CodeGen/WebAssembly/dead-vreg.ll b/test/CodeGen/WebAssembly/dead-vreg.ll
index 190a0856400183bb43857cc6b869316ff5200e39..c1b9ad6700d42d5aeab97678ab769c2a2e54d9af 100644
--- a/test/CodeGen/WebAssembly/dead-vreg.ll
+++ b/test/CodeGen/WebAssembly/dead-vreg.ll
@@ -7,7 +7,7 @@ target triple = "wasm32-unknown-unknown"
 
 define void @foo(i32* nocapture %a, i32 %w, i32 %h) {
 ; CHECK-LABEL: foo:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype foo (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: .local i32, i32, i32, i32, i32, i32{{$}}
 entry:
   %cmp.19 = icmp sgt i32 %h, 0
diff --git a/test/CodeGen/WebAssembly/exception.ll b/test/CodeGen/WebAssembly/exception.ll
index bd7935c3684c14c6e3c032edffc1e728dc591da5..b0365fc9d43c9766fe903bb3b3add4602d030953 100644
--- a/test/CodeGen/WebAssembly/exception.ll
+++ b/test/CodeGen/WebAssembly/exception.ll
@@ -1,5 +1,5 @@
 ; RUN: not llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
@@ -12,10 +12,11 @@ target triple = "wasm32-unknown-unknown"
 declare void @llvm.wasm.throw(i32, i8*)
 
 ; CHECK-LABEL: test_throw:
-; CHECK-NEXT: i32.const $push0=, 0
-; CHECK-NEXT: throw 0, $pop0
-define void @test_throw() {
-  call void @llvm.wasm.throw(i32 0, i8* null)
+; CHECK:      get_local $push0=, 0
+; CHECK-NEXT: throw __cpp_exception@EVENT, $pop0
+; CHECK-NOT:  unreachable
+define void @test_throw(i8* %p) {
+  call void @llvm.wasm.throw(i32 0, i8* %p)
   ret void
 }
 
@@ -259,3 +260,6 @@ declare void @__cxa_rethrow()
 declare void @__clang_call_terminate(i8*)
 declare void @_ZSt9terminatev()
 declare %struct.Cleanup* @_ZN7CleanupD1Ev(%struct.Cleanup* returned)
+
+; CHECK: __cpp_exception:
+; CHECK: .eventtype  __cpp_exception i32
diff --git a/test/CodeGen/WebAssembly/f16.ll b/test/CodeGen/WebAssembly/f16.ll
index 4daa44c37a3175f54543a0b8eed901b7b9283e17..6ffedd3f310a5970427e0ccc5dd6f98fdaf47d1c 100644
--- a/test/CodeGen/WebAssembly/f16.ll
+++ b/test/CodeGen/WebAssembly/f16.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: demote:
-; CHECK-NEXT: .param  	f32{{$}}
-; CHECK-NEXT: .result 	f32{{$}}
+; CHECK-NEXT: .functype demote (f32) -> (f32){{$}}
 ; CHECK-NEXT: get_local	$push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.call	$push[[L1:[0-9]+]]=, __gnu_f2h_ieee@FUNCTION, $pop[[L0]]{{$}}
 ; CHECK-NEXT: f32.call	$push[[L2:[0-9]+]]=, __gnu_h2f_ieee@FUNCTION, $pop[[L1]]{{$}}
@@ -19,8 +18,7 @@ define half @demote(float %f) {
 }
 
 ; CHECK-LABEL: promote:
-; CHECK-NEXT: .param  	f32{{$}}
-; CHECK-NEXT: .result 	f32{{$}}
+; CHECK-NEXT: .functype promote (f32) -> (f32){{$}}
 ; CHECK-NEXT: get_local	$push0=, 0{{$}}
 ; CHECK-NEXT: return  	$pop0{{$}}
 define float @promote(half %f) {
diff --git a/test/CodeGen/WebAssembly/f32.ll b/test/CodeGen/WebAssembly/f32.ll
index 27520d035c9ebfa8e8c858669783ed0d40530de1..fd89261f1727a5f1585b2f66b17f9a95e450cd40 100644
--- a/test/CodeGen/WebAssembly/f32.ll
+++ b/test/CodeGen/WebAssembly/f32.ll
@@ -16,8 +16,7 @@ declare float @llvm.rint.f32(float)
 declare float @llvm.fma.f32(float, float, float)
 
 ; CHECK-LABEL: fadd32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype fadd32 (f32, f32) -> (f32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.add $push[[LR:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/f64.ll b/test/CodeGen/WebAssembly/f64.ll
index d02767fa3a15cfc6128bec58e7da1f80b926f626..98a23b193359c13f4ecd1b9a675f450e1cad7cc1 100644
--- a/test/CodeGen/WebAssembly/f64.ll
+++ b/test/CodeGen/WebAssembly/f64.ll
@@ -16,8 +16,7 @@ declare double @llvm.rint.f64(double)
 declare double @llvm.fma.f64(double, double, double)
 
 ; CHECK-LABEL: fadd64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype fadd64 (f64, f64) -> (f64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.add $push[[LR:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/fast-isel-i24.ll b/test/CodeGen/WebAssembly/fast-isel-i24.ll
index ef2494bdb08bfe7e6eeb3b6541ac7e2cf9ea0856..b0ca27674a1acce053c15f502fafa53be514c251 100644
--- a/test/CodeGen/WebAssembly/fast-isel-i24.ll
+++ b/test/CodeGen/WebAssembly/fast-isel-i24.ll
@@ -8,8 +8,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: add:
-; CHECK-NEXT: .param  	i32, i32{{$}}
-; CHECK-NEXT: .result 	i32{{$}}
+; CHECK-NEXT: .functype add (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local	$push2=, 0{{$}}
 ; CHECK-NEXT: get_local	$push1=, 1{{$}}
 ; CHECK-NEXT: i32.add 	$push0=, $pop2, $pop1{{$}}
@@ -20,7 +19,7 @@ define i24 @add(i24 %x, i24 %y) {
 }
 
 ; CHECK-LABEL: return_zero:
-; CHECK-NEXT: .result 	i32{{$}}
+; CHECK-NEXT: .functype return_zero () -> (i32){{$}}
 ; CHECK-NEXT: i32.const	$push0=, 0{{$}}
 ; CHECK-NEXT: end_function
 define i24 @return_zero() {
diff --git a/test/CodeGen/WebAssembly/fast-isel-i256.ll b/test/CodeGen/WebAssembly/fast-isel-i256.ll
index b4cb73596b1e81f814e16b2f9068aaf07800439d..a916a87d346d5b3c6b1b67c3fb083e640678b357 100644
--- a/test/CodeGen/WebAssembly/fast-isel-i256.ll
+++ b/test/CodeGen/WebAssembly/fast-isel-i256.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: add:
-; CHECK-NEXT: .param      i32, i64, i64, i64, i64, i64, i64, i64, i64{{$}}
+; CHECK-NEXT: .functype add (i32, i64, i64, i64, i64, i64, i64, i64, i64) -> (){{$}}
 ; CHECK-NOT:  .result
 ; CHECK: end_function
 define i256 @add(i256 %x, i256 %y) {
@@ -17,16 +17,14 @@ define i256 @add(i256 %x, i256 %y) {
 }
 
 ; CHECK-LABEL: return_zero:
-; CHECK-NEXT: .param      i32{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype return_zero (i32) -> (){{$}}
 ; CHECK: end_function
 define i256 @return_zero() {
     ret i256 0
 }
 
 ; CHECK-LABEL: return_zero_with_params:
-; CHECK-NEXT: .param      i32, f32{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype return_zero_with_params (i32, f32) -> (){{$}}
 ; CHECK: end_function
 define i256 @return_zero_with_params(float %x) {
     ret i256 0
diff --git a/test/CodeGen/WebAssembly/frem.ll b/test/CodeGen/WebAssembly/frem.ll
index 6b89aee29a9a3227a38527bf2f6412b6ef5c8b3d..a9c021d7177ffcc1547736d962fb53189d0640ff 100644
--- a/test/CodeGen/WebAssembly/frem.ll
+++ b/test/CodeGen/WebAssembly/frem.ll
@@ -6,8 +6,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: frem32:
-; CHECK-NEXT: .param f32, f32{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype frem32 (f32, f32) -> (f32){{$}}
 ; CHECK-NEXT: {{^}} f32.call $push0=, fmodf@FUNCTION, $0, $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define float @frem32(float %x, float %y) {
@@ -16,8 +15,7 @@ define float @frem32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: frem64:
-; CHECK-NEXT: .param f64, f64{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype frem64 (f64, f64) -> (f64){{$}}
 ; CHECK-NEXT: {{^}} f64.call $push0=, fmod@FUNCTION, $0, $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define double @frem64(double %x, double %y) {
diff --git a/test/CodeGen/WebAssembly/func.ll b/test/CodeGen/WebAssembly/func.ll
index 078f8fda3b900fdd37bf35519e031ed55c9787f4..13a1cf2d513ab4b5444928acf79808e092b45c62 100644
--- a/test/CodeGen/WebAssembly/func.ll
+++ b/test/CodeGen/WebAssembly/func.ll
@@ -14,7 +14,7 @@ define void @f0() {
 }
 
 ; CHECK-LABEL: f1:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype f1 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 ; CHECK: .size f1,
@@ -23,8 +23,7 @@ define i32 @f1() {
 }
 
 ; CHECK-LABEL: f2:
-; CHECK-NEXT: .param i32, f32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype f2 (i32, f32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 ; CHECK: .size f2,
@@ -33,7 +32,7 @@ define i32 @f2(i32 %p1, float %p2) {
 }
 
 ; CHECK-LABEL: f3:
-; CHECK-NEXT: .param i32, f32{{$}}
+; CHECK-NEXT: .functype f3 (i32, f32) -> (){{$}}
 ; CHECK-NOT: local
 ; CHECK-NEXT: return{{$}}
 ; CHECK: .size f3,
@@ -42,8 +41,7 @@ define void @f3(i32 %p1, float %p2) {
 }
 
 ; CHECK-LABEL: f4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype f4 (i32) -> (i32){{$}}
 ; CHECK-NOT: local
 ; CHECK: .size f4,
 define i32 @f4(i32 %x) {
@@ -57,7 +55,7 @@ false:
 }
 
 ; CHECK-LABEL: f5:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype f5 () -> (f32){{$}}
 ; CHECK-NEXT: unreachable
 define float @f5()  {
  unreachable
diff --git a/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll b/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
index 633871a599b3550e5a84deec9d9082c8e9ab25ae..515c5703d86c06c64249c83e5f6e7210826a60d6 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
@@ -23,9 +23,9 @@ declare void @specified(i32, i32)
 ; CHECK: call    	.Lspecified_bitcast@FUNCTION, $pop{{[0-9]+$}}
 
 ; CHECK: .Lunderspecified_bitcast:
-; CHECK-NEXT: .param  	i32, i32{{$}}
+; CHECK-NEXT: .functype .Lunderspecified_bitcast (i32, i32) -> (){{$}}
 ; CHECK: call    	underspecified@FUNCTION, $pop{{[0-9]+$}}
 
 ; CHECK: .Lspecified_bitcast:
-; CHECK-NEXT: .param  	i32{{$}}
+; CHECK-NEXT: .functype .Lspecified_bitcast (i32) -> (){{$}}
 ; CHECK: call    	specified@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+$}}
diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll
index 0e7fcd5d5701c36cdb267786dbfd0a27d3b3cce1..93c09cc5dd2d418f9e75578355e861ebe4c2d0fe 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts.ll
@@ -17,7 +17,7 @@ declare void @foo2()
 declare void @foo3()
 
 ; CHECK-LABEL: test:
-; CHECK-NEXT: call        .Lhas_i32_arg_bitcast.2@FUNCTION{{$}}
+; CHECK:      call        .Lhas_i32_arg_bitcast.2@FUNCTION{{$}}
 ; CHECK-NEXT: call        .Lhas_i32_arg_bitcast.2@FUNCTION{{$}}
 ; CHECK-NEXT: call        .Lhas_i32_ret_bitcast@FUNCTION{{$}}
 ; CHECK-NEXT: i32.call     $drop=, has_i32_ret@FUNCTION
@@ -96,7 +96,7 @@ define void @test_varargs() {
 @global_func = hidden local_unnamed_addr global void ()* null
 
 ; CHECK-LABEL: test_store:
-; CHECK-NEXT: i32.const   $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK:      i32.const   $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.const   $push[[L1:[0-9]+]]=, has_i32_ret@FUNCTION{{$}}
 ; CHECK-NEXT: i32.store   global_func($pop[[L0]]), $pop[[L1]]{{$}}
 define void @test_store() {
@@ -106,7 +106,7 @@ define void @test_store() {
 }
 
 ; CHECK-LABEL: test_load:
-; CHECK-NEXT: result      i32{{$}}
+; CHECK-NEXT: .functype test_load () -> (i32){{$}}
 ; CHECK-NEXT: i32.const   $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.load    $push[[L1:[0-9]+]]=, global_func($pop[[L0]]){{$}}
 ; CHECK-NEXT: i32.call_indirect $push{{[0-9]+}}=, $pop[[L1]]{{$}}
@@ -121,7 +121,7 @@ define i32 @test_load() {
 declare void @call_func(i32 ()*)
 
 ; CHECK-LABEL: test_argument:
-; CHECK-NEXT: i32.const   $push[[L0:[0-9]+]]=, has_i32_ret@FUNCTION{{$}}
+; CHECK:      i32.const   $push[[L0:[0-9]+]]=, has_i32_ret@FUNCTION{{$}}
 ; CHECK-NEXT: call        call_func@FUNCTION, $pop[[L0]]{{$}}
 ; CHECK-NEXT: i32.const   $push[[L1:[0-9]+]]=, has_i32_arg@FUNCTION{{$}}
 ; CHECK-NEXT: call        call_func@FUNCTION, $pop[[L1]]{{$}}
@@ -166,21 +166,21 @@ end:
 }
 
 ; CHECK-LABEL: .Lhas_i32_arg_bitcast:
-; CHECK-NEXT: .param      i32, i32
+; CHECK-NEXT: .functype .Lhas_i32_arg_bitcast (i32, i32) -> ()
 ; CHECK-NEXT: call        has_i32_arg@FUNCTION, $1{{$}}
 ; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: .Lhas_i32_arg_bitcast.1:
-; CHECK-NEXT: .param      i32, i32
+; CHECK-NEXT: .functype .Lhas_i32_arg_bitcast.1 (i32, i32) -> ()
 ; CHECK-NEXT: call        has_i32_arg@FUNCTION, $0{{$}}
 ; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: .Lhas_i32_arg_bitcast.2:
-; CHECK-NEXT: call        has_i32_arg@FUNCTION, $0{{$}}
+; CHECK:      call        has_i32_arg@FUNCTION, $0{{$}}
 ; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: .Lhas_i32_ret_bitcast:
-; CHECK-NEXT: call        $drop=, has_i32_ret@FUNCTION{{$}}
+; CHECK:      call        $drop=, has_i32_ret@FUNCTION{{$}}
 ; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: .Lvararg_bitcast:
@@ -192,12 +192,12 @@ end:
 ; CHECK: end_function
 
 ; CHECK-LABEL: .Lfoo0_bitcast:
-; CHECK-NEXT: .param      i32
+; CHECK-NEXT: .functype .Lfoo0_bitcast (i32) -> ()
 ; CHECK-NEXT: call        foo0@FUNCTION{{$}}
 ; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: .Lfoo1_bitcast:
-; CHECK-NEXT: .result     i32
+; CHECK-NEXT: .functype .Lfoo1_bitcast () -> (i32)
 ; CHECK-NEXT: call        foo1@FUNCTION{{$}}
 ; CHECK-NEXT: copy_local  $push0=, $0
 ; CHECK-NEXT: end_function
diff --git a/test/CodeGen/WebAssembly/global.ll b/test/CodeGen/WebAssembly/global.ll
index e4211670541b0c5e7555d7df406bcc6eb759037c..4e8f75e8d32d273aeae51a14f22ee1863e574bcc 100644
--- a/test/CodeGen/WebAssembly/global.ll
+++ b/test/CodeGen/WebAssembly/global.ll
@@ -19,8 +19,7 @@ define i32 @foo() {
 }
 
 ; CHECK-LABEL: call_memcpy:
-; CHECK-NEXT: .param          i32, i32, i32{{$}}
-; CHECK-NEXT: .result         i32{{$}}
+; CHECK-NEXT: .functype call_memcpy (i32, i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.call        $push0=, memcpy@FUNCTION, $0, $1, $2{{$}}
 ; CHECK-NEXT: return          $pop0{{$}}
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1)
@@ -205,7 +204,7 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
 ; CHECK-NEXT: .skip       8
 ; CHECK-NEXT: .size       array, 8
 ; CHECK: .type       pointer_to_array,@object
-; CHECK-NEXT: .section    .data.rel.ro.pointer_to_array,
+; CHECK-NEXT: .section    .rodata.pointer_to_array,
 ; CHECK-NEXT: .globl      pointer_to_array
 ; CHECK-NEXT: .p2align      2
 ; CHECK-NEXT: pointer_to_array:
diff --git a/test/CodeGen/WebAssembly/i128-returned.ll b/test/CodeGen/WebAssembly/i128-returned.ll
index f1e1a70711ba2ec4edc39a89c8f423bbb4c0fcba..a4a136589eabca8311815c1a9bf35b5061d708ee 100644
--- a/test/CodeGen/WebAssembly/i128-returned.ll
+++ b/test/CodeGen/WebAssembly/i128-returned.ll
@@ -14,7 +14,6 @@ define i128 @foo(i128) {
 }
 
 ; CHECK-LABEL: foo:
-; CHECK-NEXT: .param  	i32, i64, i64
-; CHECK-NOT:  .result
+; CHECK-NEXT: .functype foo (i32, i64, i64) -> ()
 
-; CHECK: .functype	bar, void, i32, i64, i64
+; CHECK: .functype bar (i32, i64, i64) -> ()
diff --git a/test/CodeGen/WebAssembly/i128.ll b/test/CodeGen/WebAssembly/i128.ll
index 753c79f7ad4e9a334a2dd8fa9621753c3907843d..24276a76cf807ca0fdc2fd3479e41074abe9772c 100644
--- a/test/CodeGen/WebAssembly/i128.ll
+++ b/test/CodeGen/WebAssembly/i128.ll
@@ -10,7 +10,7 @@ declare i128 @llvm.cttz.i128(i128, i1)
 declare i128 @llvm.ctpop.i128(i128)
 
 ; CHECK-LABEL: add128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NEXT: .functype add128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK-NOT:  .result
 ; CHECK:      i64.add
 ; CHECK:      i64.store
@@ -23,8 +23,7 @@ define i128 @add128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: sub128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype sub128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK:      i64.sub
 ; CHECK:      i64.store
 ; CHECK:      i64.sub
@@ -36,8 +35,7 @@ define i128 @sub128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: mul128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype mul128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __multi3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
 define i128 @mul128(i128 %x, i128 %y) {
@@ -46,8 +44,7 @@ define i128 @mul128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: sdiv128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype sdiv128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __divti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
 define i128 @sdiv128(i128 %x, i128 %y) {
@@ -56,8 +53,7 @@ define i128 @sdiv128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: udiv128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype udiv128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __udivti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
 define i128 @udiv128(i128 %x, i128 %y) {
@@ -66,8 +62,7 @@ define i128 @udiv128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: srem128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype srem128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __modti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
 define i128 @srem128(i128 %x, i128 %y) {
@@ -76,8 +71,7 @@ define i128 @srem128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: urem128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype urem128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __umodti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
 define i128 @urem128(i128 %x, i128 %y) {
@@ -86,7 +80,7 @@ define i128 @urem128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: and128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NEXT: .functype and128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK-NOT:  .result
 ; CHECK:      i64.and
 ; CHECK:      i64.store
@@ -99,8 +93,7 @@ define i128 @and128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: or128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype or128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK:      i64.or
 ; CHECK:      i64.store
 ; CHECK:      i64.or
@@ -112,8 +105,7 @@ define i128 @or128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: xor128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype xor128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK:      i64.xor
 ; CHECK:      i64.store
 ; CHECK:      i64.xor
@@ -125,8 +117,7 @@ define i128 @xor128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: shl128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype shl128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
 define i128 @shl128(i128 %x, i128 %y) {
@@ -135,8 +126,7 @@ define i128 @shl128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: shr128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype shr128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
 define i128 @shr128(i128 %x, i128 %y) {
@@ -145,8 +135,7 @@ define i128 @shr128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: sar128:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype sar128 (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __ashrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
 define i128 @sar128(i128 %x, i128 %y) {
@@ -155,7 +144,7 @@ define i128 @sar128(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: clz128:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype clz128 (i32, i64, i64) -> (){{$}}
 ; CHECK-NOT:  .result
 ; CHECK:      i64.clz
 ; CHECK:      i64.clz
@@ -166,8 +155,7 @@ define i128 @clz128(i128 %x) {
 }
 
 ; CHECK-LABEL: clz128_zero_undef:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype clz128_zero_undef (i32, i64, i64) -> (){{$}}
 ; CHECK:      i64.clz
 ; CHECK:      i64.clz
 ; CHECK:      return{{$}}
@@ -177,8 +165,7 @@ define i128 @clz128_zero_undef(i128 %x) {
 }
 
 ; CHECK-LABEL: ctz128:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype ctz128 (i32, i64, i64) -> (){{$}}
 ; CHECK:      i64.ctz
 ; CHECK:      i64.ctz
 ; CHECK:      return{{$}}
@@ -188,8 +175,7 @@ define i128 @ctz128(i128 %x) {
 }
 
 ; CHECK-LABEL: ctz128_zero_undef:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype ctz128_zero_undef (i32, i64, i64) -> (){{$}}
 ; CHECK:      i64.ctz
 ; CHECK:      i64.ctz
 ; CHECK:      return{{$}}
@@ -199,8 +185,7 @@ define i128 @ctz128_zero_undef(i128 %x) {
 }
 
 ; CHECK-LABEL: popcnt128:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype popcnt128 (i32, i64, i64) -> (){{$}}
 ; CHECK:      i64.popcnt
 ; CHECK:      i64.popcnt
 ; CHECK:      return{{$}}
@@ -210,8 +195,7 @@ define i128 @popcnt128(i128 %x) {
 }
 
 ; CHECK-LABEL: eqz128:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype eqz128 (i64, i64) -> (i32){{$}}
 ; CHECK:     i64.or
 ; CHECK:     i64.eqz
 ; CHECK:     return $
@@ -222,8 +206,7 @@ define i32 @eqz128(i128 %x) {
 }
 
 ; CHECK-LABEL: rotl:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype rotl (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
@@ -236,8 +219,7 @@ define i128 @rotl(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: masked_rotl:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype masked_rotl (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
@@ -251,8 +233,7 @@ define i128 @masked_rotl(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: rotr:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype rotr (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
@@ -265,8 +246,7 @@ define i128 @rotr(i128 %x, i128 %y) {
 }
 
 ; CHECK-LABEL: masked_rotr:
-; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
-; CHECK-NOT: .result
+; CHECK-NEXT: .functype masked_rotr (i32, i64, i64, i64, i64) -> (){{$}}
 ; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
 ; CHECK: return{{$}}
diff --git a/test/CodeGen/WebAssembly/i32-load-store-alignment.ll b/test/CodeGen/WebAssembly/i32-load-store-alignment.ll
index 27e84821020f92416da6fe1e6bc4578d0467566a..4fcc4254b69536e44522ea8ffd48b397d3de6f84 100644
--- a/test/CodeGen/WebAssembly/i32-load-store-alignment.ll
+++ b/test/CodeGen/WebAssembly/i32-load-store-alignment.ll
@@ -10,8 +10,7 @@ target triple = "wasm32-unknown-unknown"
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: ldi32_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi32_a1 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ldi32_a1(i32 *%p) {
@@ -20,8 +19,7 @@ define i32 @ldi32_a1(i32 *%p) {
 }
 
 ; CHECK-LABEL: ldi32_a2:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi32_a2 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0):p2align=1{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ldi32_a2(i32 *%p) {
@@ -32,8 +30,7 @@ define i32 @ldi32_a2(i32 *%p) {
 ; 4 is the default alignment for i32 so no attribute is needed.
 
 ; CHECK-LABEL: ldi32_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi32_a4 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ldi32_a4(i32 *%p) {
@@ -44,8 +41,7 @@ define i32 @ldi32_a4(i32 *%p) {
 ; The default alignment in LLVM is the same as the defualt alignment in wasm.
 
 ; CHECK-LABEL: ldi32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi32 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ldi32(i32 *%p) {
@@ -56,8 +52,7 @@ define i32 @ldi32(i32 *%p) {
 ; 8 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: ldi32_a8:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi32_a8 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ldi32_a8(i32 *%p) {
@@ -70,8 +65,7 @@ define i32 @ldi32_a8(i32 *%p) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: ldi8_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi8_a1 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load8_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i8 @ldi8_a1(i8 *%p) {
@@ -80,8 +74,7 @@ define i8 @ldi8_a1(i8 *%p) {
 }
 
 ; CHECK-LABEL: ldi8_a2:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi8_a2 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load8_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i8 @ldi8_a2(i8 *%p) {
@@ -90,8 +83,7 @@ define i8 @ldi8_a2(i8 *%p) {
 }
 
 ; CHECK-LABEL: ldi16_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi16_a1 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load16_u $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i16 @ldi16_a1(i16 *%p) {
@@ -100,8 +92,7 @@ define i16 @ldi16_a1(i16 *%p) {
 }
 
 ; CHECK-LABEL: ldi16_a2:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi16_a2 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load16_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i16 @ldi16_a2(i16 *%p) {
@@ -110,8 +101,7 @@ define i16 @ldi16_a2(i16 *%p) {
 }
 
 ; CHECK-LABEL: ldi16_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi16_a4 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load16_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i16 @ldi16_a4(i16 *%p) {
@@ -124,7 +114,7 @@ define i16 @ldi16_a4(i16 *%p) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: sti32_a1:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti32_a1 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_a1(i32 *%p, i32 %v) {
@@ -133,7 +123,7 @@ define void @sti32_a1(i32 *%p, i32 %v) {
 }
 
 ; CHECK-LABEL: sti32_a2:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti32_a2 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store 0($0):p2align=1, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_a2(i32 *%p, i32 %v) {
@@ -144,7 +134,7 @@ define void @sti32_a2(i32 *%p, i32 %v) {
 ; 4 is the default alignment for i32 so no attribute is needed.
 
 ; CHECK-LABEL: sti32_a4:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti32_a4 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_a4(i32 *%p, i32 %v) {
@@ -155,7 +145,7 @@ define void @sti32_a4(i32 *%p, i32 %v) {
 ; The default alignment in LLVM is the same as the defualt alignment in wasm.
 
 ; CHECK-LABEL: sti32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti32 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32(i32 *%p, i32 %v) {
@@ -164,7 +154,7 @@ define void @sti32(i32 *%p, i32 %v) {
 }
 
 ; CHECK-LABEL: sti32_a8:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti32_a8 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_a8(i32 *%p, i32 %v) {
@@ -177,7 +167,7 @@ define void @sti32_a8(i32 *%p, i32 %v) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: sti8_a1:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti8_a1 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store8 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti8_a1(i8 *%p, i8 %v) {
@@ -186,7 +176,7 @@ define void @sti8_a1(i8 *%p, i8 %v) {
 }
 
 ; CHECK-LABEL: sti8_a2:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti8_a2 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store8 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti8_a2(i8 *%p, i8 %v) {
@@ -195,7 +185,7 @@ define void @sti8_a2(i8 *%p, i8 %v) {
 }
 
 ; CHECK-LABEL: sti16_a1:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti16_a1 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store16 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti16_a1(i16 *%p, i16 %v) {
@@ -204,7 +194,7 @@ define void @sti16_a1(i16 *%p, i16 %v) {
 }
 
 ; CHECK-LABEL: sti16_a2:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti16_a2 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store16 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti16_a2(i16 *%p, i16 %v) {
@@ -213,7 +203,7 @@ define void @sti16_a2(i16 *%p, i16 %v) {
 }
 
 ; CHECK-LABEL: sti16_a4:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti16_a4 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store16 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti16_a4(i16 *%p, i16 %v) {
@@ -229,8 +219,7 @@ define void @sti16_a4(i16 *%p, i16 %v) {
 ; natural alignment.
 
 ; CHECK-LABEL: ldi32_atomic_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi32_atomic_a4 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.atomic.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ldi32_atomic_a4(i32 *%p) {
@@ -241,8 +230,7 @@ define i32 @ldi32_atomic_a4(i32 *%p) {
 ; 8 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: ldi32_atomic_a8:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi32_atomic_a8 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.atomic.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ldi32_atomic_a8(i32 *%p) {
@@ -255,7 +243,7 @@ define i32 @ldi32_atomic_a8(i32 *%p) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: sti32_atomic_a4:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti32_atomic_a4 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_atomic_a4(i32 *%p, i32 %v) {
@@ -266,7 +254,7 @@ define void @sti32_atomic_a4(i32 *%p, i32 %v) {
 ; 8 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: sti32_atomic_a8:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti32_atomic_a8 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_atomic_a8(i32 *%p, i32 %v) {
diff --git a/test/CodeGen/WebAssembly/i32.ll b/test/CodeGen/WebAssembly/i32.ll
index 45c9a94bee7bff60f9be14add47da50991d246ea..352713779640be9c8f05065cd5c587a5d5fc52c9 100644
--- a/test/CodeGen/WebAssembly/i32.ll
+++ b/test/CodeGen/WebAssembly/i32.ll
@@ -10,8 +10,7 @@ declare i32 @llvm.cttz.i32(i32, i1)
 declare i32 @llvm.ctpop.i32(i32)
 
 ; CHECK-LABEL: add32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype add32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.add $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -22,8 +21,7 @@ define i32 @add32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: sub32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype sub32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.sub $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -34,8 +32,7 @@ define i32 @sub32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: mul32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype mul32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.mul $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -46,8 +43,7 @@ define i32 @mul32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: sdiv32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype sdiv32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.div_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -58,8 +54,7 @@ define i32 @sdiv32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: udiv32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype udiv32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.div_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -70,8 +65,7 @@ define i32 @udiv32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: srem32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype srem32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.rem_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -82,8 +76,7 @@ define i32 @srem32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: urem32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype urem32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.rem_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -94,8 +87,7 @@ define i32 @urem32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: and32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype and32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.and $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -106,8 +98,7 @@ define i32 @and32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: or32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype or32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.or $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -118,8 +109,7 @@ define i32 @or32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: xor32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype xor32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.xor $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -130,8 +120,7 @@ define i32 @xor32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: shl32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype shl32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.shl $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -142,8 +131,7 @@ define i32 @shl32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: shr32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype shr32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.shr_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -154,8 +142,7 @@ define i32 @shr32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: sar32:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype sar32 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.shr_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -166,8 +153,7 @@ define i32 @sar32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: clz32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype clz32 (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.clz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -177,8 +163,7 @@ define i32 @clz32(i32 %x) {
 }
 
 ; CHECK-LABEL: clz32_zero_undef:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype clz32_zero_undef (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.clz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -188,8 +173,7 @@ define i32 @clz32_zero_undef(i32 %x) {
 }
 
 ; CHECK-LABEL: ctz32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ctz32 (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.ctz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -199,8 +183,7 @@ define i32 @ctz32(i32 %x) {
 }
 
 ; CHECK-LABEL: ctz32_zero_undef:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ctz32_zero_undef (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.ctz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -210,8 +193,7 @@ define i32 @ctz32_zero_undef(i32 %x) {
 }
 
 ; CHECK-LABEL: popcnt32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype popcnt32 (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.popcnt $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -221,8 +203,7 @@ define i32 @popcnt32(i32 %x) {
 }
 
 ; CHECK-LABEL: eqz32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype eqz32 (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.eqz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -233,8 +214,7 @@ define i32 @eqz32(i32 %x) {
 }
 
 ; CHECK-LABEL: rotl:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype rotl (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.rotl $push0=, $pop[[L0]], $pop[[L1]]
@@ -248,8 +228,7 @@ define i32 @rotl(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: masked_rotl:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype masked_rotl (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.rotl $push0=, $pop[[L0]], $pop[[L1]]
@@ -264,8 +243,7 @@ define i32 @masked_rotl(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: rotr:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype rotr (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.rotr $push0=, $pop[[L0]], $pop[[L1]]
@@ -279,8 +257,7 @@ define i32 @rotr(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: masked_rotr:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype masked_rotr (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.rotr $push0=, $pop[[L0]], $pop[[L1]]
diff --git a/test/CodeGen/WebAssembly/i64-load-store-alignment.ll b/test/CodeGen/WebAssembly/i64-load-store-alignment.ll
index 472bc7a382d5182998a87af42606bbdfa8501f8b..32933c19fe34d93d826fa20247c3afda700f1cb6 100644
--- a/test/CodeGen/WebAssembly/i64-load-store-alignment.ll
+++ b/test/CodeGen/WebAssembly/i64-load-store-alignment.ll
@@ -10,8 +10,7 @@ target triple = "wasm32-unknown-unknown"
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: ldi64_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64_a1 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64_a1(i64 *%p) {
@@ -20,8 +19,7 @@ define i64 @ldi64_a1(i64 *%p) {
 }
 
 ; CHECK-LABEL: ldi64_a2:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64_a2 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0):p2align=1{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64_a2(i64 *%p) {
@@ -30,8 +28,7 @@ define i64 @ldi64_a2(i64 *%p) {
 }
 
 ; CHECK-LABEL: ldi64_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64_a4 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0):p2align=2{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64_a4(i64 *%p) {
@@ -42,8 +39,7 @@ define i64 @ldi64_a4(i64 *%p) {
 ; 8 is the default alignment for i64 so no attribute is needed.
 
 ; CHECK-LABEL: ldi64_a8:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64_a8 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64_a8(i64 *%p) {
@@ -54,8 +50,7 @@ define i64 @ldi64_a8(i64 *%p) {
 ; The default alignment in LLVM is the same as the defualt alignment in wasm.
 
 ; CHECK-LABEL: ldi64:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64(i64 *%p) {
@@ -66,8 +61,7 @@ define i64 @ldi64(i64 *%p) {
 ; 16 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: ldi64_a16:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64_a16 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64_a16(i64 *%p) {
@@ -80,8 +74,7 @@ define i64 @ldi64_a16(i64 *%p) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: ldi8_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi8_a1 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load8_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi8_a1(i8 *%p) {
@@ -91,8 +84,7 @@ define i64 @ldi8_a1(i8 *%p) {
 }
 
 ; CHECK-LABEL: ldi8_a2:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi8_a2 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load8_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi8_a2(i8 *%p) {
@@ -102,8 +94,7 @@ define i64 @ldi8_a2(i8 *%p) {
 }
 
 ; CHECK-LABEL: ldi16_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi16_a1 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load16_u $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi16_a1(i16 *%p) {
@@ -113,8 +104,7 @@ define i64 @ldi16_a1(i16 *%p) {
 }
 
 ; CHECK-LABEL: ldi16_a2:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi16_a2 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load16_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi16_a2(i16 *%p) {
@@ -124,8 +114,7 @@ define i64 @ldi16_a2(i16 *%p) {
 }
 
 ; CHECK-LABEL: ldi16_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi16_a4 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load16_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi16_a4(i16 *%p) {
@@ -135,8 +124,7 @@ define i64 @ldi16_a4(i16 *%p) {
 }
 
 ; CHECK-LABEL: ldi32_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi32_a1 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load32_u $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi32_a1(i32 *%p) {
@@ -146,8 +134,7 @@ define i64 @ldi32_a1(i32 *%p) {
 }
 
 ; CHECK-LABEL: ldi32_a2:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi32_a2 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load32_u $push[[NUM:[0-9]+]]=, 0($0):p2align=1{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi32_a2(i32 *%p) {
@@ -157,8 +144,7 @@ define i64 @ldi32_a2(i32 *%p) {
 }
 
 ; CHECK-LABEL: ldi32_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi32_a4 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load32_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi32_a4(i32 *%p) {
@@ -168,8 +154,7 @@ define i64 @ldi32_a4(i32 *%p) {
 }
 
 ; CHECK-LABEL: ldi32_a8:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi32_a8 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.load32_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi32_a8(i32 *%p) {
@@ -183,7 +168,7 @@ define i64 @ldi32_a8(i32 *%p) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: sti64_a1:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64_a1 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64_a1(i64 *%p, i64 %v) {
@@ -192,7 +177,7 @@ define void @sti64_a1(i64 *%p, i64 %v) {
 }
 
 ; CHECK-LABEL: sti64_a2:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64_a2 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store 0($0):p2align=1, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64_a2(i64 *%p, i64 %v) {
@@ -201,7 +186,7 @@ define void @sti64_a2(i64 *%p, i64 %v) {
 }
 
 ; CHECK-LABEL: sti64_a4:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64_a4 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store 0($0):p2align=2, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64_a4(i64 *%p, i64 %v) {
@@ -212,7 +197,7 @@ define void @sti64_a4(i64 *%p, i64 %v) {
 ; 8 is the default alignment for i32 so no attribute is needed.
 
 ; CHECK-LABEL: sti64_a8:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64_a8 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64_a8(i64 *%p, i64 %v) {
@@ -223,7 +208,7 @@ define void @sti64_a8(i64 *%p, i64 %v) {
 ; The default alignment in LLVM is the same as the defualt alignment in wasm.
 
 ; CHECK-LABEL: sti64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64(i64 *%p, i64 %v) {
@@ -232,7 +217,7 @@ define void @sti64(i64 *%p, i64 %v) {
 }
 
 ; CHECK-LABEL: sti64_a16:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64_a16 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64_a16(i64 *%p, i64 %v) {
@@ -245,7 +230,7 @@ define void @sti64_a16(i64 *%p, i64 %v) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: sti8_a1:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti8_a1 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store8 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti8_a1(i8 *%p, i64 %w) {
@@ -255,7 +240,7 @@ define void @sti8_a1(i8 *%p, i64 %w) {
 }
 
 ; CHECK-LABEL: sti8_a2:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti8_a2 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store8 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti8_a2(i8 *%p, i64 %w) {
@@ -265,7 +250,7 @@ define void @sti8_a2(i8 *%p, i64 %w) {
 }
 
 ; CHECK-LABEL: sti16_a1:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti16_a1 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store16 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti16_a1(i16 *%p, i64 %w) {
@@ -275,7 +260,7 @@ define void @sti16_a1(i16 *%p, i64 %w) {
 }
 
 ; CHECK-LABEL: sti16_a2:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti16_a2 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store16 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti16_a2(i16 *%p, i64 %w) {
@@ -285,7 +270,7 @@ define void @sti16_a2(i16 *%p, i64 %w) {
 }
 
 ; CHECK-LABEL: sti16_a4:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti16_a4 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store16 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti16_a4(i16 *%p, i64 %w) {
@@ -295,7 +280,7 @@ define void @sti16_a4(i16 *%p, i64 %w) {
 }
 
 ; CHECK-LABEL: sti32_a1:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti32_a1 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store32 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_a1(i32 *%p, i64 %w) {
@@ -305,7 +290,7 @@ define void @sti32_a1(i32 *%p, i64 %w) {
 }
 
 ; CHECK-LABEL: sti32_a2:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti32_a2 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store32 0($0):p2align=1, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_a2(i32 *%p, i64 %w) {
@@ -315,7 +300,7 @@ define void @sti32_a2(i32 *%p, i64 %w) {
 }
 
 ; CHECK-LABEL: sti32_a4:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti32_a4 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store32 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_a4(i32 *%p, i64 %w) {
@@ -325,7 +310,7 @@ define void @sti32_a4(i32 *%p, i64 %w) {
 }
 
 ; CHECK-LABEL: sti32_a8:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti32_a8 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.store32 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32_a8(i32 *%p, i64 %w) {
@@ -342,8 +327,7 @@ define void @sti32_a8(i32 *%p, i64 %w) {
 ; natural alignment.
 
 ; CHECK-LABEL: ldi64_atomic_a8:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64_atomic_a8 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.atomic.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64_atomic_a8(i64 *%p) {
@@ -354,8 +338,7 @@ define i64 @ldi64_atomic_a8(i64 *%p) {
 ; 16 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: ldi64_atomic_a16:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64_atomic_a16 (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.atomic.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64_atomic_a16(i64 *%p) {
@@ -368,7 +351,7 @@ define i64 @ldi64_atomic_a16(i64 *%p) {
 ;===----------------------------------------------------------------------------
 
 ; CHECK-LABEL: sti64_atomic_a4:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64_atomic_a4 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64_atomic_a4(i64 *%p, i64 %v) {
@@ -379,7 +362,7 @@ define void @sti64_atomic_a4(i64 *%p, i64 %v) {
 ; 16 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: sti64_atomic_a8:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64_atomic_a8 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64_atomic_a8(i64 *%p, i64 %v) {
diff --git a/test/CodeGen/WebAssembly/i64.ll b/test/CodeGen/WebAssembly/i64.ll
index 695f31cfa414fc94a1b097e5dfaaf4101e796c84..72adf7a277183ecfce87fe08a4811819341dfd4c 100644
--- a/test/CodeGen/WebAssembly/i64.ll
+++ b/test/CodeGen/WebAssembly/i64.ll
@@ -10,8 +10,7 @@ declare i64 @llvm.cttz.i64(i64, i1)
 declare i64 @llvm.ctpop.i64(i64)
 
 ; CHECK-LABEL: add64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype add64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.add $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -22,8 +21,7 @@ define i64 @add64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: sub64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype sub64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.sub $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -34,8 +32,7 @@ define i64 @sub64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: mul64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype mul64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.mul $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -46,8 +43,7 @@ define i64 @mul64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: sdiv64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype sdiv64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.div_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -58,8 +54,7 @@ define i64 @sdiv64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: udiv64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype udiv64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.div_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -70,8 +65,7 @@ define i64 @udiv64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: srem64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype srem64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.rem_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -82,8 +76,7 @@ define i64 @srem64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: urem64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype urem64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.rem_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -94,8 +87,7 @@ define i64 @urem64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: and64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype and64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.and $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -106,8 +98,7 @@ define i64 @and64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: or64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype or64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.or $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -118,8 +109,7 @@ define i64 @or64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: xor64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype xor64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.xor $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -130,8 +120,7 @@ define i64 @xor64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: shl64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype shl64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.shl $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -142,8 +131,7 @@ define i64 @shl64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: shr64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype shr64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.shr_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -154,8 +142,7 @@ define i64 @shr64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: sar64:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype sar64 (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.shr_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -166,8 +153,7 @@ define i64 @sar64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: clz64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype clz64 (i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i64.clz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -177,8 +163,7 @@ define i64 @clz64(i64 %x) {
 }
 
 ; CHECK-LABEL: clz64_zero_undef:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype clz64_zero_undef (i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i64.clz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -188,8 +173,7 @@ define i64 @clz64_zero_undef(i64 %x) {
 }
 
 ; CHECK-LABEL: ctz64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ctz64 (i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i64.ctz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -199,8 +183,7 @@ define i64 @ctz64(i64 %x) {
 }
 
 ; CHECK-LABEL: ctz64_zero_undef:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ctz64_zero_undef (i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i64.ctz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -210,8 +193,7 @@ define i64 @ctz64_zero_undef(i64 %x) {
 }
 
 ; CHECK-LABEL: popcnt64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype popcnt64 (i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i64.popcnt $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -221,8 +203,7 @@ define i64 @popcnt64(i64 %x) {
 }
 
 ; CHECK-LABEL: eqz64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype eqz64 (i64) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i64.eqz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
@@ -233,8 +214,7 @@ define i32 @eqz64(i64 %x) {
 }
 
 ; CHECK-LABEL: rotl:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype rotl (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.rotl $push0=, $pop[[L0]], $pop[[L1]]
@@ -248,8 +228,7 @@ define i64 @rotl(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: masked_rotl:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype masked_rotl (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.rotl $push0=, $pop[[L0]], $pop[[L1]]
@@ -264,8 +243,7 @@ define i64 @masked_rotl(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: rotr:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype rotr (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.rotr $push0=, $pop[[L0]], $pop[[L1]]
@@ -279,8 +257,7 @@ define i64 @rotr(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: masked_rotr:
-; CHECK-NEXT: .param i64, i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype masked_rotr (i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.rotr $push0=, $pop[[L0]], $pop[[L1]]
diff --git a/test/CodeGen/WebAssembly/immediates.ll b/test/CodeGen/WebAssembly/immediates.ll
index fdb7df7d21cbd747f7d5c9f263fd554537496582..b10b32915f239dd4d077ca2264f4792b9d9b22e9 100644
--- a/test/CodeGen/WebAssembly/immediates.ll
+++ b/test/CodeGen/WebAssembly/immediates.ll
@@ -1,12 +1,17 @@
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
 
+; Usually MIPS hosts uses a legacy (non IEEE 754-2008) encoding for NaNs.
+; Tests like `nan_f32` failed in attempt to compare hard-coded IEEE 754-2008
+; NaN value and a legacy NaN value provided by a system.
+; XFAIL: mips-, mipsel-, mips64-, mips64el-
+
 ; Test that basic immediates assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: zero_i32:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype zero_i32 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @zero_i32() {
@@ -14,7 +19,7 @@ define i32 @zero_i32() {
 }
 
 ; CHECK-LABEL: one_i32:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype one_i32 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @one_i32() {
@@ -22,7 +27,7 @@ define i32 @one_i32() {
 }
 
 ; CHECK-LABEL: max_i32:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype max_i32 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM:[0-9]+]]=, 2147483647{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @max_i32() {
@@ -30,7 +35,7 @@ define i32 @max_i32() {
 }
 
 ; CHECK-LABEL: min_i32:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype min_i32 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM:[0-9]+]]=, -2147483648{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @min_i32() {
@@ -38,7 +43,7 @@ define i32 @min_i32() {
 }
 
 ; CHECK-LABEL: zero_i64:
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype zero_i64 () -> (i64){{$}}
 ; CHECK-NEXT: i64.const $push[[NUM:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @zero_i64() {
@@ -46,7 +51,7 @@ define i64 @zero_i64() {
 }
 
 ; CHECK-LABEL: one_i64:
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype one_i64 () -> (i64){{$}}
 ; CHECK-NEXT: i64.const $push[[NUM:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @one_i64() {
@@ -54,7 +59,7 @@ define i64 @one_i64() {
 }
 
 ; CHECK-LABEL: max_i64:
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype max_i64 () -> (i64){{$}}
 ; CHECK-NEXT: i64.const $push[[NUM:[0-9]+]]=, 9223372036854775807{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @max_i64() {
@@ -62,7 +67,7 @@ define i64 @max_i64() {
 }
 
 ; CHECK-LABEL: min_i64:
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype min_i64 () -> (i64){{$}}
 ; CHECK-NEXT: i64.const $push[[NUM:[0-9]+]]=, -9223372036854775808{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @min_i64() {
@@ -70,7 +75,7 @@ define i64 @min_i64() {
 }
 
 ; CHECK-LABEL: negzero_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype negzero_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, -0x0p0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @negzero_f32() {
@@ -78,7 +83,7 @@ define float @negzero_f32() {
 }
 
 ; CHECK-LABEL: zero_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype zero_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, 0x0p0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @zero_f32() {
@@ -86,7 +91,7 @@ define float @zero_f32() {
 }
 
 ; CHECK-LABEL: one_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype one_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, 0x1p0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @one_f32() {
@@ -94,7 +99,7 @@ define float @one_f32() {
 }
 
 ; CHECK-LABEL: two_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype two_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, 0x1p1{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @two_f32() {
@@ -102,7 +107,7 @@ define float @two_f32() {
 }
 
 ; CHECK-LABEL: nan_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype nan_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, nan{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @nan_f32() {
@@ -110,7 +115,7 @@ define float @nan_f32() {
 }
 
 ; CHECK-LABEL: negnan_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype negnan_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, -nan{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @negnan_f32() {
@@ -118,7 +123,7 @@ define float @negnan_f32() {
 }
 
 ; CHECK-LABEL: inf_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype inf_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, infinity{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @inf_f32() {
@@ -126,7 +131,7 @@ define float @inf_f32() {
 }
 
 ; CHECK-LABEL: neginf_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype neginf_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, -infinity{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @neginf_f32() {
@@ -134,7 +139,7 @@ define float @neginf_f32() {
 }
 
 ; CHECK-LABEL: custom_nan_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype custom_nan_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, -nan:0x6bcdef{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @custom_nan_f32() {
@@ -145,7 +150,7 @@ define float @custom_nan_f32() {
 ; conversion, so the bits of the NaN are not fully preserved.
 
 ; CHECK-LABEL: custom_nans_f32:
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype custom_nans_f32 () -> (f32){{$}}
 ; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, -nan:0x6bcdef{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @custom_nans_f32() {
@@ -153,7 +158,7 @@ define float @custom_nans_f32() {
 }
 
 ; CHECK-LABEL: negzero_f64:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype negzero_f64 () -> (f64){{$}}
 ; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -0x0p0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @negzero_f64() {
@@ -161,7 +166,7 @@ define double @negzero_f64() {
 }
 
 ; CHECK-LABEL: zero_f64:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype zero_f64 () -> (f64){{$}}
 ; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, 0x0p0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @zero_f64() {
@@ -169,7 +174,7 @@ define double @zero_f64() {
 }
 
 ; CHECK-LABEL: one_f64:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype one_f64 () -> (f64){{$}}
 ; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, 0x1p0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @one_f64() {
@@ -177,7 +182,7 @@ define double @one_f64() {
 }
 
 ; CHECK-LABEL: two_f64:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype two_f64 () -> (f64){{$}}
 ; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, 0x1p1{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @two_f64() {
@@ -185,7 +190,7 @@ define double @two_f64() {
 }
 
 ; CHECK-LABEL: nan_f64:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype nan_f64 () -> (f64){{$}}
 ; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, nan{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @nan_f64() {
@@ -193,7 +198,7 @@ define double @nan_f64() {
 }
 
 ; CHECK-LABEL: negnan_f64:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype negnan_f64 () -> (f64){{$}}
 ; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -nan{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @negnan_f64() {
@@ -201,7 +206,7 @@ define double @negnan_f64() {
 }
 
 ; CHECK-LABEL: inf_f64:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype inf_f64 () -> (f64){{$}}
 ; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, infinity{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @inf_f64() {
@@ -209,25 +214,29 @@ define double @inf_f64() {
 }
 
 ; CHECK-LABEL: neginf_f64:
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype neginf_f64 () -> (f64){{$}}
 ; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -infinity{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @neginf_f64() {
   ret double 0xFFF0000000000000
 }
 
-; CHECK-LABEL: custom_nan_f64:
-; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -nan:0xabcdef0123456{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
-define double @custom_nan_f64() {
-  ret double 0xFFFABCDEF0123456
-}
-
-; CHECK-LABEL: custom_nans_f64:
-; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -nan:0x2bcdef0123456{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
-define double @custom_nans_f64() {
-  ret double 0xFFF2BCDEF0123456
-}
+;; Custom NaN playloads are currently not always preserved because of the use of
+;; native doubles in the MC layer. TODO: fix this problem or decide we don't
+;; care about preserving NaN payloads.
+
+; XXX-CHECK-LABEL: custom_nan_f64:
+; XXX-CHECK-NEXT: .functype custom_nan_f64 () -> (f64){{$}}
+; XXX-CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -nan:0xabcdef0123456{{$}}
+; XXX-CHECK-NEXT: return $pop[[NUM]]{{$}}
+; define double @custom_nan_f64() {
+;   ret double 0xFFFABCDEF0123456
+; }
+
+; XXX-CHECK-LABEL: custom_nans_f64:
+; XXX-CHECK-NEXT: .functype custom_nans_f64 () -> (f64){{$}}
+; XXX-CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -nan:0x2bcdef0123456{{$}}
+; XXX-CHECK-NEXT: return $pop[[NUM]]{{$}}
+; define double @custom_nans_f64() {
+;   ret double 0xFFF2BCDEF0123456
+; }
diff --git a/test/CodeGen/WebAssembly/indirect-import.ll b/test/CodeGen/WebAssembly/indirect-import.ll
index 2e7d81a20c78ef8dc306efcbca42db0c58edfffa..59c4194da7791dbc048ee828d0a0cebb67821e69 100644
--- a/test/CodeGen/WebAssembly/indirect-import.ll
+++ b/test/CodeGen/WebAssembly/indirect-import.ll
@@ -9,7 +9,7 @@ target triple = "wasm32"
 %struct.big = type { float, double, i32 }
 
 ; Function Attrs: nounwind
-; CHECK: bar:
+; CHECK-LABEL: bar:
 define hidden i32 @bar() #0 {
 entry:
   %fd = alloca float (double)*, align 4
@@ -18,6 +18,7 @@ entry:
   %ijidf = alloca i32 (i64, i32, double, float)*, align 4
   %vs = alloca void (%struct.big*)*, align 4
   %s = alloca void (%struct.big*)*, align 4
+  %i128ret = alloca i128 (i64)*, align 8
 
 ; CHECK-DAG: i32.const       {{.+}}=, extern_fd@FUNCTION
 ; CHECK-DAG: i32.const       {{.+}}=, extern_vj@FUNCTION
@@ -42,6 +43,12 @@ entry:
   store void (%struct.big*)* @extern_sret, void (%struct.big*)** %s, align 4
   %3 = load float (double)*, float (double)** %fd, align 4
   %4 = ptrtoint float (double)* %3 to i32
+
+; CHECK: i32.const       {{.+}}=, extern_i128ret@FUNCTION
+  store i128 (i64)* @extern_i128ret, i128 (i64)** %i128ret, align 8
+  %5 = load i128 (i64)*, i128 (i64)** %i128ret, align 8
+  %6 = call i128 %5(i64 1)
+
   ret i32 %4
 }
 
@@ -63,10 +70,10 @@ attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="fa
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 
-; CHECK: .functype       extern_fd, f32, f64
-; CHECK: .functype       extern_vj, void, i64
-; CHECK: .functype       extern_v, void
-; CHECK: .functype       extern_ijidf, i32, i64, i32, f64, f32
-; CHECK: .functype       extern_struct, void, i32
-; CHECK: .functype       extern_sret, void, i32
-; CHECK: .functype       extern_i128ret, void, i32, i64
+; CHECK: .functype extern_fd (f64) -> (f32)
+; CHECK: .functype extern_vj (i64) -> ()
+; CHECK: .functype extern_v () -> ()
+; CHECK: .functype extern_ijidf  (i64, i32, f64, f32) -> (i32)
+; CHECK: .functype extern_struct (i32) -> ()
+; CHECK: .functype extern_sret (i32) -> ()
+; CHECK: .functype extern_i128ret (i32, i64) -> ()
diff --git a/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll b/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll
index 7fcc3cf276a1f6e9884def468a08275dbff374a3..037d26eb2c4f8a12e9169e7aa2e76f4f6e2fe132 100644
--- a/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll
+++ b/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: main:
-; CHECK-NEXT:	.param  	i32, i32
+; CHECK-NEXT:	.functype main (i32, i32) -> (i32)
 ; CHECK-NEXT:	.local  	i32
 ; CHECK-NEXT:	i32.const	1
 ; CHECK-NEXT:	set_local	[[SRC:[0-9]+]]
diff --git a/test/CodeGen/WebAssembly/inline-asm.ll b/test/CodeGen/WebAssembly/inline-asm.ll
index 5f674f1b4deb8eec201daebd900fb97a48c716de..657f9a6e399f05d7b6b55697901f4fea46d7e488 100644
--- a/test/CodeGen/WebAssembly/inline-asm.ll
+++ b/test/CodeGen/WebAssembly/inline-asm.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: foo:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype foo (i32) -> (i32){{$}}
 ; CHECK-NEXT: #APP{{$}}
 ; CHECK-NEXT: # 0 = aaa(0){{$}}
 ; CHECK-NEXT: #NO_APP{{$}}
@@ -21,7 +20,7 @@ entry:
 }
 
 ; CHECK-LABEL: imm:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype imm () -> (i32){{$}}
 ; CHECK-NEXT: .local i32{{$}}
 ; CHECK-NEXT: #APP{{$}}
 ; CHECK-NEXT: # 0 = ccc(42){{$}}
@@ -35,8 +34,7 @@ entry:
 }
 
 ; CHECK-LABEL: foo_i64:
-; CHECK-NEXT: .param i64{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype foo_i64 (i64) -> (i64){{$}}
 ; CHECK-NEXT: #APP{{$}}
 ; CHECK-NEXT: # 0 = aaa(0){{$}}
 ; CHECK-NEXT: #NO_APP{{$}}
diff --git a/test/CodeGen/WebAssembly/load.ll b/test/CodeGen/WebAssembly/load.ll
index a581affad8a370b084c87dc01abd549e7849ba61..1878c1cae7e0c84f8b939d2d7fd777b0fcc039a6 100644
--- a/test/CodeGen/WebAssembly/load.ll
+++ b/test/CodeGen/WebAssembly/load.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: ldi32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype ldi32 (i32) -> (i32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($pop[[L0]]){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -18,8 +17,7 @@ define i32 @ldi32(i32 *%p) {
 }
 
 ; CHECK-LABEL: ldi64:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype ldi64 (i32) -> (i64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($pop[[L0]]){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -29,8 +27,7 @@ define i64 @ldi64(i64 *%p) {
 }
 
 ; CHECK-LABEL: ldf32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: .functype ldf32 (i32) -> (f32){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: f32.load $push[[NUM:[0-9]+]]=, 0($pop[[L0]]){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
@@ -40,8 +37,7 @@ define float @ldf32(float *%p) {
 }
 
 ; CHECK-LABEL: ldf64:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: .functype ldf64 (i32) -> (f64){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: f64.load $push[[NUM:[0-9]+]]=, 0($pop[[L0]]){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
diff --git a/test/CodeGen/WebAssembly/lower-em-exceptions.ll b/test/CodeGen/WebAssembly/lower-em-exceptions.ll
index 575f4f9317b82d098efc74f6af72b0cc7e7b3ae6..e54cb2fd46bedfd9d81f1aff9fed656e239a06f9 100644
--- a/test/CodeGen/WebAssembly/lower-em-exceptions.ll
+++ b/test/CodeGen/WebAssembly/lower-em-exceptions.ll
@@ -7,7 +7,6 @@ target triple = "wasm32-unknown-unknown"
 @_ZTIc = external constant i8*
 ; CHECK-DAG: __THREW__ = external global i32
 ; CHECK-DAG: __threwValue = external global i32
-; CHECK-DAG: __tempRet0 = external global i32
 
 ; Test invoke instruction with clauses (try-catch block)
 define void @clause() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
@@ -36,7 +35,7 @@ lpad:                                             ; preds = %entry
 ; CHECK: lpad:
 ; CHECK-NEXT: %[[FMC:.*]] = call i8* @__cxa_find_matching_catch_4(i8* bitcast (i8** @_ZTIi to i8*), i8* null)
 ; CHECK-NEXT: %[[IVI1:.*]] = insertvalue { i8*, i32 } undef, i8* %[[FMC]], 0
-; CHECK-NEXT: %[[TEMPRET0_VAL:.*]] = load i32, i32* @__tempRet0
+; CHECK-NEXT: %[[TEMPRET0_VAL:.*]] = call i32 @getTempRet0()
 ; CHECK-NEXT: %[[IVI2:.*]] = insertvalue { i8*, i32 } %[[IVI1]], i32 %[[TEMPRET0_VAL]], 1
 ; CHECK-NEXT: extractvalue { i8*, i32 } %[[IVI2]], 0
 ; CHECK-NEXT: %[[CDR:.*]] = extractvalue { i8*, i32 } %[[IVI2]], 1
@@ -91,7 +90,7 @@ lpad:                                             ; preds = %entry
 ; CHECK: lpad:
 ; CHECK-NEXT: %[[FMC:.*]] = call i8* @__cxa_find_matching_catch_4(i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIc to i8*))
 ; CHECK-NEXT: %[[IVI1:.*]] = insertvalue { i8*, i32 } undef, i8* %[[FMC]], 0
-; CHECK-NEXT: %[[TEMPRET0_VAL:.*]] = load i32, i32* @__tempRet0
+; CHECK-NEXT: %[[TEMPRET0_VAL:.*]] = call i32 @getTempRet0()
 ; CHECK-NEXT: %[[IVI2:.*]] = insertvalue { i8*, i32 } %[[IVI1]], i32 %[[TEMPRET0_VAL]], 1
 ; CHECK-NEXT: extractvalue { i8*, i32 } %[[IVI2]], 0
 ; CHECK-NEXT: extractvalue { i8*, i32 } %[[IVI2]], 1
@@ -168,6 +167,8 @@ declare void @__cxa_end_catch()
 declare void @__cxa_call_unexpected(i8*)
 
 ; JS glue functions and invoke wrappers declaration
+; CHECK-DAG: declare i32 @getTempRet0()
+; CHECK-DAG: declare void @setTempRet0(i32)
 ; CHECK-DAG: declare void @__resumeException(i8*)
 ; CHECK-DAG: declare void @__invoke_void_i32(void (i32)*, i32)
 ; CHECK-DAG: declare i8* @__cxa_find_matching_catch_4(i8*, i8*)
diff --git a/test/CodeGen/WebAssembly/lower-em-sjlj-longjmp-only.ll b/test/CodeGen/WebAssembly/lower-em-sjlj-longjmp-only.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7d31c8ced3f3ab118bae59e3a95fff23d1eed977
--- /dev/null
+++ b/test/CodeGen/WebAssembly/lower-em-sjlj-longjmp-only.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -wasm-lower-em-ehsjlj -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+%struct.__jmp_buf_tag = type { [6 x i32], i32, %struct.__sigset_t }
+%struct.__sigset_t = type { [32 x i32] }
+
+@buffer = global [1 x %struct.__jmp_buf_tag] zeroinitializer, align 16
+
+; Tests if program does not crash when there's no setjmp function calls in the
+; module.
+
+; CHECK: call void @emscripten_longjmp_jmpbuf
+define void @longjmp_only() {
+entry:
+  call void @longjmp(%struct.__jmp_buf_tag* getelementptr inbounds ([1 x %struct.__jmp_buf_tag], [1 x %struct.__jmp_buf_tag]* @buffer, i32 0, i32 0), i32 1) #1
+  unreachable
+}
+
+; Function Attrs: noreturn nounwind
+declare void @longjmp(%struct.__jmp_buf_tag*, i32) #1
+
+attributes #1 = { noreturn nounwind }
diff --git a/test/CodeGen/WebAssembly/lower-em-sjlj.ll b/test/CodeGen/WebAssembly/lower-em-sjlj.ll
index 45c547339feeb9a251624782146b97db70c08539..bc851a4c06d5808413798d08a8ddacdd980d940a 100644
--- a/test/CodeGen/WebAssembly/lower-em-sjlj.ll
+++ b/test/CodeGen/WebAssembly/lower-em-sjlj.ll
@@ -8,7 +8,6 @@ target triple = "wasm32-unknown-unknown"
 @global_var = hidden global i32 0, align 4
 ; CHECK-DAG: __THREW__ = external global i32
 ; CHECK-DAG: __threwValue = external global i32
-; CHECK-DAG: __tempRet0 = external global i32
 
 ; Test a simple setjmp - longjmp sequence
 define hidden void @setjmp_longjmp() {
@@ -28,7 +27,7 @@ entry:
 ; CHECK-NEXT: %[[BUF:.*]] = alloca [1 x %struct.__jmp_buf_tag]
 ; CHECK-NEXT: %[[ARRAYDECAY:.*]] = getelementptr inbounds [1 x %struct.__jmp_buf_tag], [1 x %struct.__jmp_buf_tag]* %[[BUF]], i32 0, i32 0
 ; CHECK-NEXT: %[[SETJMP_TABLE1:.*]] = call i32* @saveSetjmp(%struct.__jmp_buf_tag* %[[ARRAYDECAY]], i32 1, i32* %[[SETJMP_TABLE]], i32 %[[SETJMP_TABLE_SIZE]])
-; CHECK-NEXT: %[[SETJMP_TABLE_SIZE1:.*]] = load i32, i32* @__tempRet0
+; CHECK-NEXT: %[[SETJMP_TABLE_SIZE1:.*]] = call i32 @getTempRet0()
 ; CHECK-NEXT: br label %entry.split
 
 ; CHECK: entry.split:
@@ -59,7 +58,7 @@ entry:
 
 ; CHECK: if.end:
 ; CHECK-NEXT: %[[LABEL_PHI:.*]] = phi i32 [ %[[LABEL:.*]], %if.end2 ], [ -1, %if.else1 ]
-; CHECK-NEXT: %[[LONGJMP_RESULT]] = load i32, i32* @__tempRet0
+; CHECK-NEXT: %[[LONGJMP_RESULT]] = call i32 @getTempRet0()
 ; CHECK-NEXT: switch i32 %[[LABEL_PHI]], label %entry.split.split [
 ; CHECK-NEXT:   i32 1, label %entry.split
 ; CHECK-NEXT: ]
@@ -69,7 +68,7 @@ entry:
 ; CHECK-NEXT: unreachable
 
 ; CHECK: if.end2:
-; CHECK-NEXT: store i32 %[[THREWVALUE_VAL]], i32* @__tempRet0
+; CHECK-NEXT: call void  @setTempRet0(i32 %[[THREWVALUE_VAL]])
 ; CHECK-NEXT: br label %if.end
 }
 
@@ -152,7 +151,7 @@ if.then:                                          ; preds = %entry
 ; CHECK: if.then:
 ; CHECK: %[[VAR0:.*]] = load i32, i32* @global_var, align 4
 ; CHECK: %[[SETJMP_TABLE1:.*]] = call i32* @saveSetjmp(
-; CHECK-NEXT: %[[SETJMP_TABLE_SIZE1:.*]] = load i32, i32* @__tempRet0
+; CHECK-NEXT: %[[SETJMP_TABLE_SIZE1:.*]] = call i32 @getTempRet0()
 
 ; CHECK: if.then.split:
 ; CHECK: %[[VAR1:.*]] = phi i32 [ %[[VAR0]], %if.then ], [ %[[VAR2:.*]], %if.end3 ]
@@ -201,6 +200,8 @@ declare i8* @malloc(i32)
 declare void @free(i8*)
 
 ; JS glue functions and invoke wrappers declaration
+; CHECK-DAG: declare i32 @getTempRet0()
+; CHECK-DAG: declare void @setTempRet0(i32)
 ; CHECK-DAG: declare i32* @saveSetjmp(%struct.__jmp_buf_tag*, i32, i32*, i32)
 ; CHECK-DAG: declare i32 @testSetjmp(i32, i32*, i32)
 ; CHECK-DAG: declare void @emscripten_longjmp_jmpbuf(%struct.__jmp_buf_tag*, i32)
diff --git a/test/CodeGen/WebAssembly/lower-global-dtors.ll b/test/CodeGen/WebAssembly/lower-global-dtors.ll
index b25fc05d5b89402da1e62fdc58b8bff8a908ad67..fd5d38623137fa5d3025eef71d5931798750f4fb 100644
--- a/test/CodeGen/WebAssembly/lower-global-dtors.ll
+++ b/test/CodeGen/WebAssembly/lower-global-dtors.ll
@@ -40,11 +40,11 @@ declare void @after_the_null()
 ]
 
 ; CHECK-LABEL: .Lcall_dtors.0:
-; CHECK-NEXT: .param          i32{{$}}
+; CHECK-NEXT: .functype .Lcall_dtors.0 (i32) -> (){{$}}
 ; CHECK-NEXT: call            orig_dtor0@FUNCTION{{$}}
 
 ; CHECK-LABEL: .Lregister_call_dtors.0:
-; CHECK-NEXT: block
+; CHECK:      block
 ; CHECK-NEXT: i32.const       $push2=, .Lcall_dtors.0@FUNCTION{{$}}
 ; CHECK-NEXT: i32.const       $push1=, 0
 ; CHECK-NEXT: i32.const       $push0=, __dso_handle
@@ -55,12 +55,12 @@ declare void @after_the_null()
 ; CHECK-NEXT: unreachable
 
 ; CHECK-LABEL: .Lcall_dtors.1:
-; CHECK-NEXT: .param          i32{{$}}
+; CHECK-NEXT: .functype .Lcall_dtors.1 (i32) -> (){{$}}
 ; CHECK-NEXT: call            orig_dtor1a@FUNCTION{{$}}
 ; CHECK-NEXT: call            orig_dtor1b@FUNCTION{{$}}
 
 ; CHECK-LABEL: .Lregister_call_dtors.1:
-; CHECK-NEXT: block
+; CHECK:      block
 ; CHECK-NEXT: i32.const       $push2=, .Lcall_dtors.1@FUNCTION{{$}}
 ; CHECK-NEXT: i32.const       $push1=, 0
 ; CHECK-NEXT: i32.const       $push0=, __dso_handle
@@ -71,11 +71,11 @@ declare void @after_the_null()
 ; CHECK-NEXT: unreachable
 
 ; CHECK-LABEL: .Lcall_dtors.1.associated1c0:
-; CHECK-NEXT: .param          i32{{$}}
+; CHECK-NEXT: .functype .Lcall_dtors.1.associated1c0 (i32) -> (){{$}}
 ; CHECK-NEXT: call            orig_dtor1c0@FUNCTION{{$}}
 
 ; CHECK-LABEL: .Lregister_call_dtors.1.associated1c0:
-; CHECK-NEXT: block
+; CHECK:      block
 ; CHECK-NEXT: i32.const       $push2=, .Lcall_dtors.1.associated1c0@FUNCTION{{$}}
 ; CHECK-NEXT: i32.const       $push1=, 0
 ; CHECK-NEXT: i32.const       $push0=, __dso_handle
@@ -86,12 +86,12 @@ declare void @after_the_null()
 ; CHECK-NEXT: unreachable
 
 ; CHECK-LABEL: .Lcall_dtors.1.associated1c1:
-; CHECK-NEXT: .param          i32{{$}}
+; CHECK-NEXT: .functype .Lcall_dtors.1.associated1c1 (i32) -> (){{$}}
 ; CHECK-NEXT: call            orig_dtor1c1a@FUNCTION{{$}}
 ; CHECK-NEXT: call            orig_dtor1c1b@FUNCTION{{$}}
 
 ; CHECK-LABEL: .Lregister_call_dtors.1.associated1c1:
-; CHECK-NEXT: block
+; CHECK:      block
 ; CHECK-NEXT: i32.const       $push2=, .Lcall_dtors.1.associated1c1@FUNCTION{{$}}
 ; CHECK-NEXT: i32.const       $push1=, 0
 ; CHECK-NEXT: i32.const       $push0=, __dso_handle
@@ -102,11 +102,11 @@ declare void @after_the_null()
 ; CHECK-NEXT: unreachable
 
 ; CHECK-LABEL: .Lcall_dtors:
-; CHECK-NEXT: .param          i32{{$}}
+; CHECK-NEXT: .functype .Lcall_dtors (i32) -> (){{$}}
 ; CHECK-NEXT: call            orig_dtor65536@FUNCTION{{$}}
 
 ; CHECK-LABEL: .Lregister_call_dtors:
-; CHECK-NEXT: block
+; CHECK:      block
 ; CHECK-NEXT: i32.const       $push2=, .Lcall_dtors@FUNCTION{{$}}
 ; CHECK-NEXT: i32.const       $push1=, 0
 ; CHECK-NEXT: i32.const       $push0=, __dso_handle
@@ -127,7 +127,7 @@ declare void @after_the_null()
 
 ; CHECK-LABEL: .weak __dso_handle
 
-; CHECK-LABEL: .functype __cxa_atexit, i32, i32, i32, i32{{$}}
+; CHECK-LABEL: .functype __cxa_atexit (i32, i32, i32) -> (i32){{$}}
 
 ; We shouldn't make use of a .fini_array section.
 
diff --git a/test/CodeGen/WebAssembly/main-declaration.ll b/test/CodeGen/WebAssembly/main-declaration.ll
index 8d9414f326e4dd6fa10bf73beee89b3fa1c3d138..f9d68db2bae8e7add4ed03917b1bb97ce15eccfe 100644
--- a/test/CodeGen/WebAssembly/main-declaration.ll
+++ b/test/CodeGen/WebAssembly/main-declaration.ll
@@ -14,6 +14,7 @@ define void @foo() {
 
 ; CHECK-NOT:   __original_main
 ; CHECK-LABEL: foo:
+; CHECK-NEXT:    .functype foo () -> ()
 ; CHECK-NEXT:    call main@FUNCTION
 ; CHECK-NEXT:    end_function
 ; CHECK-NOT:   __original_main
diff --git a/test/CodeGen/WebAssembly/main-no-args.ll b/test/CodeGen/WebAssembly/main-no-args.ll
index 09a4feaed14eab3cb882f924aeaa67fc5c7854ca..0bc46717d97be315d4102a38eeddde8141dfd739 100644
--- a/test/CodeGen/WebAssembly/main-no-args.ll
+++ b/test/CodeGen/WebAssembly/main-no-args.ll
@@ -10,9 +10,9 @@ define void @main() {
 }
 
 ; CHECK-LABEL: .L__original_main:
+; CHECK-NEXT: .functype .L__original_main () -> ()
 ; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: main:
-; CHECK-NEXT: .param i32, i32
-; CHECK-NEXT: .result i32
+; CHECK-NEXT: .functype main (i32, i32) -> (i32)
 ; CHECK:      call .L__original_main@FUNCTION
diff --git a/test/CodeGen/WebAssembly/main-with-args.ll b/test/CodeGen/WebAssembly/main-with-args.ll
index aa085409756b0cf8739c0d1fa2700c88c77c5f3d..d4a11ef14d46e51c9e8e04efda458b9fbd29b5ae 100644
--- a/test/CodeGen/WebAssembly/main-with-args.ll
+++ b/test/CodeGen/WebAssembly/main-with-args.ll
@@ -10,7 +10,6 @@ define i32 @main(i32 %a, i8** %b) {
 }
 
 ; CHECK-LABEL: main:
-; CHECK-NEXT: .param i32, i32
-; CHECK-NEXT: .result i32
+; CHECK-NEXT: .functype main (i32, i32) -> (i32)
 
 ; CHECK-NOT: __original_main:
diff --git a/test/CodeGen/WebAssembly/memory-addr32.ll b/test/CodeGen/WebAssembly/memory-addr32.ll
index ed385d9141c69e2049fdfe8d71d11e9418db69b7..0eb886fb0796fb23257e2e656ca1a6a904c52d96 100644
--- a/test/CodeGen/WebAssembly/memory-addr32.ll
+++ b/test/CodeGen/WebAssembly/memory-addr32.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.wasm.current.memory.i32() nounwind readonly
 declare i32 @llvm.wasm.grow.memory.i32(i32) nounwind
 
 ; CHECK-LABEL: memory_size:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype memory_size () -> (i32){{$}}
 ; CHECK-NEXT: memory.size $push0=, 0{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @memory_size() {
@@ -22,8 +22,7 @@ define i32 @memory_size() {
 }
 
 ; CHECK-LABEL: memory_grow:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype memory_grow (i32) -> (i32){{$}}
 ; CHECK: memory.grow $push0=, 0, $0{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @memory_grow(i32 %n) {
@@ -32,7 +31,7 @@ define i32 @memory_grow(i32 %n) {
 }
 
 ; CHECK-LABEL: mem_size:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype mem_size () -> (i32){{$}}
 ; CHECK-NEXT: mem.size $push0=, 0{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @mem_size() {
@@ -41,8 +40,7 @@ define i32 @mem_size() {
 }
 
 ; CHECK-LABEL: mem_grow:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype mem_grow (i32) -> (i32){{$}}
 ; CHECK: mem.grow $push0=, 0, $0{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @mem_grow(i32 %n) {
@@ -51,7 +49,7 @@ define i32 @mem_grow(i32 %n) {
 }
 
 ; CHECK-LABEL: current_memory:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype current_memory () -> (i32){{$}}
 ; CHECK-NEXT: current_memory $push0={{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @current_memory() {
@@ -60,8 +58,7 @@ define i32 @current_memory() {
 }
 
 ; CHECK-LABEL: grow_memory:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype grow_memory (i32) -> (i32){{$}}
 ; CHECK: grow_memory $push0=, $0{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @grow_memory(i32 %n) {
diff --git a/test/CodeGen/WebAssembly/null-streamer.ll b/test/CodeGen/WebAssembly/null-streamer.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eb86075466289aa710f0daf94d9775261c45ffae
--- /dev/null
+++ b/test/CodeGen/WebAssembly/null-streamer.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -O0 -filetype=null -exception-model=wasm -mattr=+exception-handling
+; RUN: llc < %s -O0 -filetype=asm -asm-verbose=false -exception-model=wasm -mattr=+exception-handling | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare void @llvm.wasm.throw(i32, i8*)
+declare void @g()
+
+define i32 @test(i8* %p)  {
+  %n = alloca i32
+  call void @llvm.wasm.throw(i32 0, i8* %p)
+  call void @g()
+  ret i32 0
+}
+
+; CHECK-DAG: .globaltype
+; CHECK-DAG: .eventtype
+; CHECK-DAG: .functype
diff --git a/test/CodeGen/WebAssembly/offset-atomics.ll b/test/CodeGen/WebAssembly/offset-atomics.ll
index be87878fa14db74207a28619f4b5ca658fe7757b..6406a51e0f8fef483747c543fe6ab1798c8da3a9 100644
--- a/test/CodeGen/WebAssembly/offset-atomics.ll
+++ b/test/CodeGen/WebAssembly/offset-atomics.ll
@@ -181,7 +181,7 @@ define i64 @load_i64_with_unfolded_gep_offset(i64* %p) {
 ; Basic store.
 
 ; CHECK-LABEL: store_i32_no_offset:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype store_i32_no_offset (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_i32_no_offset(i32 *%p, i32 %v) {
@@ -252,7 +252,7 @@ define void @store_i32_with_unfolded_gep_offset(i32* %p) {
 ; When storing from a fixed address, materialize a zero.
 
 ; CHECK-LABEL: store_i32_to_numeric_address:
-; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK:      i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: i32.const $push1=, 0{{$}}
 ; CHECK-NEXT: i32.atomic.store 42($pop0), $pop1{{$}}
 define void @store_i32_to_numeric_address() {
@@ -277,7 +277,7 @@ define void @store_i32_to_global_address() {
 ; Basic store.
 
 ; CHECK-LABEL: store_i64_no_offset:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype store_i64_no_offset (i32, i64) -> (){{$}}
 ; CHECK-NEXT: i64.atomic.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_i64_no_offset(i64 *%p, i64 %v) {
@@ -663,7 +663,7 @@ define void @store_i8_i64_with_folded_or_offset(i32 %x, i64 %v) {
 ; Basic RMW.
 
 ; CHECK-LABEL: rmw_add_i32_no_offset:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype rmw_add_i32_no_offset (i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @rmw_add_i32_no_offset(i32* %p, i32 %v) {
@@ -757,7 +757,7 @@ define i32 @rmw_add_i32_from_global_address(i32 %v) {
 ; Basic RMW.
 
 ; CHECK-LABEL: rmw_add_i64_no_offset:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype rmw_add_i64_no_offset (i32, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.add $push0=, 0($0), $1{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @rmw_add_i64_no_offset(i64* %p, i64 %v) {
@@ -1078,7 +1078,7 @@ define i8 @rmw_add_i8_i32_retvalue(i8 *%p, i32 %v) {
 ; Basic RMW.
 
 ; CHECK-LABEL: cmpxchg_i32_no_offset:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .functype cmpxchg_i32_no_offset (i32, i32, i32) -> (i32){{$}}
 ; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @cmpxchg_i32_no_offset(i32* %p, i32 %exp, i32 %new) {
@@ -1180,7 +1180,7 @@ define i32 @cmpxchg_i32_from_global_address(i32 %exp, i32 %new) {
 ; Basic RMW.
 
 ; CHECK-LABEL: cmpxchg_i64_no_offset:
-; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NEXT: .functype cmpxchg_i64_no_offset (i32, i64, i64) -> (i64){{$}}
 ; CHECK: i64.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @cmpxchg_i64_no_offset(i64* %p, i64 %exp, i64 %new) {
diff --git a/test/CodeGen/WebAssembly/offset-folding.ll b/test/CodeGen/WebAssembly/offset-folding.ll
index 335275d1b4d9f6d1bc157b2eeb35897d3904ca6a..54a8636b53e223217923e9058ec34fdcda205fe5 100644
--- a/test/CodeGen/WebAssembly/offset-folding.ll
+++ b/test/CodeGen/WebAssembly/offset-folding.ll
@@ -11,7 +11,7 @@ target triple = "wasm32-unknown-unknown"
 ; Test basic constant offsets of both defined and external symbols.
 
 ; CHECK-LABEL: test0:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype test0 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, x+188{{$}}
 ; CHECK=NEXT: return $pop0{{$}}
 define i32* @test0() {
@@ -19,7 +19,7 @@ define i32* @test0() {
 }
 
 ; CHECK-LABEL: test1:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype test1 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, y+188{{$}}
 ; CHECK=NEXT: return $pop0{{$}}
 define i32* @test1() {
@@ -29,7 +29,7 @@ define i32* @test1() {
 ; Test zero offsets.
 
 ; CHECK-LABEL: test2:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype test2 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, x{{$}}
 ; CHECK=NEXT: return $pop0{{$}}
 define i32* @test2() {
@@ -37,7 +37,7 @@ define i32* @test2() {
 }
 
 ; CHECK-LABEL: test3:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype test3 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, y{{$}}
 ; CHECK=NEXT: return $pop0{{$}}
 define i32* @test3() {
@@ -47,7 +47,7 @@ define i32* @test3() {
 ; Test negative offsets.
 
 ; CHECK-LABEL: test4:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype test4 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, x-188{{$}}
 ; CHECK=NEXT: return $pop0{{$}}
 define i32* @test4() {
@@ -55,7 +55,7 @@ define i32* @test4() {
 }
 
 ; CHECK-LABEL: test5:
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype test5 () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, y-188{{$}}
 ; CHECK=NEXT: return $pop0{{$}}
 define i32* @test5() {
diff --git a/test/CodeGen/WebAssembly/offset.ll b/test/CodeGen/WebAssembly/offset.ll
index 6b783bc8518e2523c9a4cba6eead4c73ef627aac..d0283386afdd8a89bd4cbe5ac05b7a2b866c6e12 100644
--- a/test/CodeGen/WebAssembly/offset.ll
+++ b/test/CodeGen/WebAssembly/offset.ll
@@ -180,7 +180,7 @@ define i64 @load_i64_with_unfolded_gep_offset(i64* %p) {
 ; Basic store.
 
 ; CHECK-LABEL: store_i32_no_offset:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype store_i32_no_offset (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_i32_no_offset(i32 *%p, i32 %v) {
@@ -251,7 +251,7 @@ define void @store_i32_with_unfolded_gep_offset(i32* %p) {
 ; When storing from a fixed address, materialize a zero.
 
 ; CHECK-LABEL: store_i32_to_numeric_address:
-; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK:      i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: i32.const $push1=, 0{{$}}
 ; CHECK-NEXT: i32.store 42($pop0), $pop1{{$}}
 define void @store_i32_to_numeric_address() {
diff --git a/test/CodeGen/WebAssembly/reg-stackify.ll b/test/CodeGen/WebAssembly/reg-stackify.ll
index af933dc64e43018fc7d84798df69a69ba6b4ab79..53a52a5f30a824a9d30288884e13d2657ef63015 100644
--- a/test/CodeGen/WebAssembly/reg-stackify.ll
+++ b/test/CodeGen/WebAssembly/reg-stackify.ll
@@ -100,8 +100,7 @@ define i32 @no_sink_readonly_call(i32 %x, i32 %y, i32* %p) {
 ; rearranged to make the stack contiguous.
 
 ; CHECK-LABEL: stack_uses:
-; CHECK: .param i32, i32, i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK: .functype stack_uses (i32, i32, i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: block   {{$}}
 ; CHECK-NEXT: i32.const   $push[[L13:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.lt_s    $push[[L0:[0-9]+]]=, $0, $pop[[L13]]{{$}}
@@ -124,8 +123,7 @@ define i32 @no_sink_readonly_call(i32 %x, i32 %y, i32* %p) {
 ; CHECK-NEXT: i32.const   $push14=, 1{{$}}
 ; CHECK-NEXT: return      $pop14{{$}}
 ; NOREGS-LABEL: stack_uses:
-; NOREGS: .param i32, i32, i32, i32{{$}}
-; NOREGS-NEXT: .result i32{{$}}
+; NOREGS: .functype stack_uses (i32, i32, i32, i32) -> (i32){{$}}
 ; NOREGS-NEXT: block {{$}}
 ; NOREGS-NEXT: get_local 0{{$}}
 ; NOREGS-NEXT: i32.const   1{{$}}
@@ -171,7 +169,7 @@ false:
 ; be trivially stackified. However, it can be stackified with a tee_local.
 
 ; CHECK-LABEL: multiple_uses:
-; CHECK: .param       i32, i32, i32{{$}}
+; CHECK: .functype multiple_uses (i32, i32, i32) -> (){{$}}
 ; CHECK-NEXT: block   {{$}}
 ; CHECK-NEXT: i32.load    $push[[NUM0:[0-9]+]]=, 0($2){{$}}
 ; CHECK-NEXT: tee_local   $push[[NUM1:[0-9]+]]=, $3=, $pop[[NUM0]]{{$}}
@@ -184,7 +182,7 @@ false:
 ; CHECK-NEXT: end_block{{$}}
 ; CHECK-NEXT: return{{$}}
 ; NOREGS-LABEL: multiple_uses:
-; NOREGS: .param       i32, i32, i32{{$}}
+; NOREGS: .functype multiple_uses (i32, i32, i32) -> (){{$}}
 ; NOREGS: .local i32{{$}}
 ; NOREGS-NEXT: block {{$}}
 ; NOREGS-NEXT: get_local   2{{$}}
@@ -230,12 +228,12 @@ return:
 
 ; CHECK:      side_effects:
 ; CHECK:      store
-; CHECK-NEXT: call
+; CHECK:      call
 ; CHECK:      store
 ; CHECK-NEXT: call
 ; NOREGS:      side_effects:
 ; NOREGS:      store
-; NOREGS-NEXT: call
+; NOREGS:      call
 ; NOREGS:      store
 ; NOREGS-NEXT: call
 declare void @evoke_side_effects()
@@ -253,8 +251,7 @@ entry:
 ; tree order.
 
 ; CHECK-LABEL: div_tree:
-; CHECK: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}}
-; CHECK-NEXT: .result     i32{{$}}
+; CHECK: .functype div_tree (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.div_s   $push[[L0:[0-9]+]]=, $0, $1{{$}}
 ; CHECK-NEXT: i32.div_s   $push[[L1:[0-9]+]]=, $2, $3{{$}}
 ; CHECK-NEXT: i32.div_s   $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -272,8 +269,7 @@ entry:
 ; CHECK-NEXT: i32.div_s   $push[[L14:[0-9]+]]=, $pop[[L6]], $pop[[L13]]{{$}}
 ; CHECK-NEXT: return      $pop[[L14]]{{$}}
 ; NOREGS-LABEL: div_tree:
-; NOREGS: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}}
-; NOREGS-NEXT: .result     i32{{$}}
+; NOREGS: .functype div_tree (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32){{$}}
 ; NOREGS-NEXT: get_local 0{{$}}
 ; NOREGS-NEXT: get_local 1{{$}}
 ; NOREGS-NEXT: i32.div_s{{$}}
@@ -329,14 +325,14 @@ entry:
 ; A simple multiple-use case.
 
 ; CHECK-LABEL: simple_multiple_use:
-; CHECK:  .param      i32, i32{{$}}
+; CHECK:       .functype simple_multiple_use (i32, i32) -> (){{$}}
 ; CHECK-NEXT:  i32.mul     $push[[NUM0:[0-9]+]]=, $1, $0{{$}}
 ; CHECK-NEXT:  tee_local   $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT:  call        use_a@FUNCTION, $pop[[NUM1]]{{$}}
 ; CHECK-NEXT:  call        use_b@FUNCTION, $[[NUM2]]{{$}}
 ; CHECK-NEXT:  return{{$}}
 ; NOREGS-LABEL: simple_multiple_use:
-; NOREGS:  .param      i32, i32{{$}}
+; NOREGS:       .functype simple_multiple_use (i32, i32) -> (){{$}}
 ; NOREGS-NEXT:  get_local 1{{$}}
 ; NOREGS-NEXT:  get_local 0{{$}}
 ; NOREGS-NEXT:  i32.mul
@@ -357,13 +353,13 @@ define void @simple_multiple_use(i32 %x, i32 %y) {
 ; Multiple uses of the same value in one instruction.
 
 ; CHECK-LABEL: multiple_uses_in_same_insn:
-; CHECK:  .param      i32, i32{{$}}
+; CHECK:       .functype multiple_uses_in_same_insn (i32, i32) -> (){{$}}
 ; CHECK-NEXT:  i32.mul     $push[[NUM0:[0-9]+]]=, $1, $0{{$}}
 ; CHECK-NEXT:  tee_local   $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT:  call        use_2@FUNCTION, $pop[[NUM1]], $[[NUM2]]{{$}}
 ; CHECK-NEXT:  return{{$}}
 ; NOREGS-LABEL: multiple_uses_in_same_insn:
-; NOREGS:  .param      i32, i32{{$}}
+; NOREGS:       .functype multiple_uses_in_same_insn (i32, i32) -> (){{$}}
 ; NOREGS-NEXT:  get_local 1{{$}}
 ; NOREGS-NEXT:  get_local 0{{$}}
 ; NOREGS-NEXT:  i32.mul
@@ -381,8 +377,7 @@ define void @multiple_uses_in_same_insn(i32 %x, i32 %y) {
 ; Commute operands to achieve better stackifying.
 
 ; CHECK-LABEL: commute:
-; CHECK-NOT: param
-; CHECK:  .result     i32{{$}}
+; CHECK:  .functype commute () -> (i32){{$}}
 ; CHECK-NEXT:  i32.call    $push0=, red@FUNCTION{{$}}
 ; CHECK-NEXT:  i32.call    $push1=, green@FUNCTION{{$}}
 ; CHECK-NEXT:  i32.add     $push2=, $pop0, $pop1{{$}}
@@ -390,8 +385,7 @@ define void @multiple_uses_in_same_insn(i32 %x, i32 %y) {
 ; CHECK-NEXT:  i32.add     $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT:  return      $pop4{{$}}
 ; NOREGS-LABEL: commute:
-; NOREGS-NOT: param
-; NOREGS:  .result     i32{{$}}
+; NOREGS:  .functype commute () -> (i32){{$}}
 ; NOREGS-NEXT:  i32.call    red@FUNCTION{{$}}
 ; NOREGS-NEXT:  i32.call    green@FUNCTION{{$}}
 ; NOREGS-NEXT:  i32.add {{$}}
@@ -578,11 +572,11 @@ define i32 @store_past_invar_load(i32 %a, i32* %p1, i32* dereferenceable(4) %p2)
 }
 
 ; CHECK-LABEL: ignore_dbg_value:
-; CHECK-NEXT: .Lfunc_begin
-; CHECK-NEXT: unreachable
+; CHECK:      .Lfunc_begin
+; CHECK:       unreachable
 ; NOREGS-LABEL: ignore_dbg_value:
-; NOREGS-NEXT: .Lfunc_begin
-; NOREGS-NEXT: unreachable
+; NOREGS:      .Lfunc_begin
+; NOREGS:       unreachable
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 define void @ignore_dbg_value() {
   call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !7, metadata !9), !dbg !10
diff --git a/test/CodeGen/WebAssembly/return-int32.ll b/test/CodeGen/WebAssembly/return-int32.ll
index ebca4bdbd560e87da97604441322704d6cfae6cf..066006ebf29aab74c458f41c447c10eeccda321b 100644
--- a/test/CodeGen/WebAssembly/return-int32.ll
+++ b/test/CodeGen/WebAssembly/return-int32.ll
@@ -5,8 +5,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: return_i32:
-; CHECK-NEXT:  .param i32{{$}}
-; CHECK-NEXT:  .result i32{{$}}
+; CHECK-NEXT:  .functype return_i32 (i32) -> (i32){{$}}
 ; CHECK-NEXT:  get_local  $push0=, 0
 ; CHECK-NEXT:  end_function{{$}}
 define i32 @return_i32(i32 %p) {
diff --git a/test/CodeGen/WebAssembly/return-void.ll b/test/CodeGen/WebAssembly/return-void.ll
index 51530eec89d698ee0e0d86bc289bf6df7bd07383..8986e1707ab95c2ee9e989d03dfa14f2dc9093b7 100644
--- a/test/CodeGen/WebAssembly/return-void.ll
+++ b/test/CodeGen/WebAssembly/return-void.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: return_void:
-; CHECK-NEXT: end_function{{$}}
+; CHECK: end_function{{$}}
 define void @return_void() {
   ret void
 }
diff --git a/test/CodeGen/WebAssembly/returned.ll b/test/CodeGen/WebAssembly/returned.ll
index 5f7dfc1176831f877f805499195a6db8833bdd5a..5861d94d389504deccc7f9a7a3ff44075160f420 100644
--- a/test/CodeGen/WebAssembly/returned.ll
+++ b/test/CodeGen/WebAssembly/returned.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: _Z3foov:
-; CHECK-NEXT: .result   i32{{$}}
+; CHECK-NEXT: .functype _Z3foov () -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 1{{$}}
 ; CHECK-NEXT: {{^}} i32.call      $push1=, _Znwm@FUNCTION, $pop0{{$}}
 ; CHECK-NEXT: {{^}} i32.call      $push2=, _ZN5AppleC1Ev@FUNCTION, $pop1{{$}}
@@ -23,8 +23,7 @@ entry:
 }
 
 ; CHECK-LABEL: _Z3barPvS_l:
-; CHECK-NEXT: .param   i32, i32, i32{{$}}
-; CHECK-NEXT: .result  i32{{$}}
+; CHECK-NEXT: .functype _Z3barPvS_l (i32, i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: {{^}} i32.call     $push0=, memcpy@FUNCTION, $0, $1, $2{{$}}
 ; CHECK-NEXT: return   $pop0{{$}}
 declare i8* @memcpy(i8* returned, i8*, i32)
@@ -37,7 +36,7 @@ entry:
 ; Test that the optimization isn't performed on constant arguments.
 
 ; CHECK-LABEL: test_constant_arg:
-; CHECK-NEXT: i32.const   $push0=, global{{$}}
+; CHECK:      i32.const   $push0=, global{{$}}
 ; CHECK-NEXT: {{^}} i32.call        $drop=, returns_arg@FUNCTION, $pop0{{$}}
 ; CHECK-NEXT: return{{$}}
 @global = external global i32
@@ -52,7 +51,7 @@ declare i32* @returns_arg(i32* returned)
 ; "returned" attribute.
 
 ; CHECK-LABEL: test_other_skipped:
-; CHECK-NEXT: .param   i32, i32, f64{{$}}
+; CHECK-NEXT: .functype test_other_skipped (i32, i32, f64) -> (){{$}}
 ; CHECK-NEXT: {{^}} i32.call     $drop=, do_something@FUNCTION, $0, $1, $2{{$}}
 ; CHECK-NEXT: {{^}} call     do_something_with_i32@FUNCTION, $1{{$}}
 ; CHECK-NEXT: {{^}} call     do_something_with_double@FUNCTION, $2{{$}}
@@ -69,8 +68,7 @@ define void @test_other_skipped(i32 %a, i32 %b, double %c) {
 ; Test that the optimization is performed on arguments other than the first.
 
 ; CHECK-LABEL: test_second_arg:
-; CHECK-NEXT: .param   i32, i32{{$}}
-; CHECK-NEXT: .result  i32{{$}}
+; CHECK-NEXT: .functype test_second_arg (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: {{^}} i32.call     $push0=, do_something_else@FUNCTION, $0, $1{{$}}
 ; CHECK-NEXT: return   $pop0{{$}}
 declare i32 @do_something_else(i32, i32 returned)
diff --git a/test/CodeGen/WebAssembly/select.ll b/test/CodeGen/WebAssembly/select.ll
index 99b8d45d8e2e31de288f7751f4d65a7d000d254e..daa934f448448edd1bce96e42cf3fa1352921d41 100644
--- a/test/CodeGen/WebAssembly/select.ll
+++ b/test/CodeGen/WebAssembly/select.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: select_i32_bool:
-; CHECK-NEXT: .param     i32, i32, i32{{$}}
-; CHECK-NEXT: .result    i32{{$}}
+; CHECK-NEXT: .functype select_i32_bool (i32, i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.select $push0=, $1, $2, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) {
@@ -17,8 +16,7 @@ define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) {
 }
 
 ; CHECK-LABEL: select_i32_bool_nozext:
-; CHECK-NEXT: .param     i32, i32, i32{{$}}
-; CHECK-NEXT: .result    i32{{$}}
+; CHECK-NEXT: .functype select_i32_bool_nozext (i32, i32, i32) -> (i32){{$}}
 ; SLOW-NEXT: i32.select $push0=, $1, $2, $0{{$}}
 ; SLOW-NEXT: return     $pop0{{$}}
 define i32 @select_i32_bool_nozext(i1 %a, i32 %b, i32 %c) {
@@ -27,8 +25,7 @@ define i32 @select_i32_bool_nozext(i1 %a, i32 %b, i32 %c) {
 }
 
 ; CHECK-LABEL: select_i32_eq:
-; CHECK-NEXT: .param     i32, i32, i32{{$}}
-; CHECK-NEXT: .result    i32{{$}}
+; CHECK-NEXT: .functype select_i32_eq (i32, i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.select $push0=, $2, $1, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define i32 @select_i32_eq(i32 %a, i32 %b, i32 %c) {
@@ -38,8 +35,7 @@ define i32 @select_i32_eq(i32 %a, i32 %b, i32 %c) {
 }
 
 ; CHECK-LABEL: select_i32_ne:
-; CHECK-NEXT: .param     i32, i32, i32{{$}}
-; CHECK-NEXT: .result    i32{{$}}
+; CHECK-NEXT: .functype select_i32_ne (i32, i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.select $push0=, $1, $2, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define i32 @select_i32_ne(i32 %a, i32 %b, i32 %c) {
@@ -49,8 +45,7 @@ define i32 @select_i32_ne(i32 %a, i32 %b, i32 %c) {
 }
 
 ; CHECK-LABEL: select_i64_bool:
-; CHECK-NEXT: .param     i32, i64, i64{{$}}
-; CHECK-NEXT: .result    i64{{$}}
+; CHECK-NEXT: .functype select_i64_bool (i32, i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: i64.select $push0=, $1, $2, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) {
@@ -59,8 +54,7 @@ define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) {
 }
 
 ; CHECK-LABEL: select_i64_bool_nozext:
-; CHECK-NEXT: .param     i32, i64, i64{{$}}
-; CHECK-NEXT: .result    i64{{$}}
+; CHECK-NEXT: .functype select_i64_bool_nozext (i32, i64, i64) -> (i64){{$}}
 ; SLOW-NEXT: i64.select $push0=, $1, $2, $0{{$}}
 ; SLOW-NEXT: return     $pop0{{$}}
 define i64 @select_i64_bool_nozext(i1 %a, i64 %b, i64 %c) {
@@ -69,8 +63,7 @@ define i64 @select_i64_bool_nozext(i1 %a, i64 %b, i64 %c) {
 }
 
 ; CHECK-LABEL: select_i64_eq:
-; CHECK-NEXT: .param     i32, i64, i64{{$}}
-; CHECK-NEXT: .result    i64{{$}}
+; CHECK-NEXT: .functype select_i64_eq (i32, i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: i64.select $push0=, $2, $1, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define i64 @select_i64_eq(i32 %a, i64 %b, i64 %c) {
@@ -80,8 +73,7 @@ define i64 @select_i64_eq(i32 %a, i64 %b, i64 %c) {
 }
 
 ; CHECK-LABEL: select_i64_ne:
-; CHECK-NEXT: .param     i32, i64, i64{{$}}
-; CHECK-NEXT: .result    i64{{$}}
+; CHECK-NEXT: .functype select_i64_ne (i32, i64, i64) -> (i64){{$}}
 ; CHECK-NEXT: i64.select $push0=, $1, $2, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define i64 @select_i64_ne(i32 %a, i64 %b, i64 %c) {
@@ -91,8 +83,7 @@ define i64 @select_i64_ne(i32 %a, i64 %b, i64 %c) {
 }
 
 ; CHECK-LABEL: select_f32_bool:
-; CHECK-NEXT: .param     i32, f32, f32{{$}}
-; CHECK-NEXT: .result    f32{{$}}
+; CHECK-NEXT: .functype select_f32_bool (i32, f32, f32) -> (f32){{$}}
 ; CHECK-NEXT: f32.select $push0=, $1, $2, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define float @select_f32_bool(i1 zeroext %a, float %b, float %c) {
@@ -101,8 +92,7 @@ define float @select_f32_bool(i1 zeroext %a, float %b, float %c) {
 }
 
 ; CHECK-LABEL: select_f32_bool_nozext:
-; CHECK-NEXT: .param     i32, f32, f32{{$}}
-; CHECK-NEXT: .result    f32{{$}}
+; CHECK-NEXT: .functype select_f32_bool_nozext (i32, f32, f32) -> (f32){{$}}
 ; SLOW-NEXT: f32.select $push0=, $1, $2, $0{{$}}
 ; SLOW-NEXT: return     $pop0{{$}}
 define float @select_f32_bool_nozext(i1 %a, float %b, float %c) {
@@ -111,8 +101,7 @@ define float @select_f32_bool_nozext(i1 %a, float %b, float %c) {
 }
 
 ; CHECK-LABEL: select_f32_eq:
-; CHECK-NEXT: .param     i32, f32, f32{{$}}
-; CHECK-NEXT: .result    f32{{$}}
+; CHECK-NEXT: .functype select_f32_eq (i32, f32, f32) -> (f32){{$}}
 ; CHECK-NEXT: f32.select $push0=, $2, $1, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define float @select_f32_eq(i32 %a, float %b, float %c) {
@@ -122,8 +111,7 @@ define float @select_f32_eq(i32 %a, float %b, float %c) {
 }
 
 ; CHECK-LABEL: select_f32_ne:
-; CHECK-NEXT: .param     i32, f32, f32{{$}}
-; CHECK-NEXT: .result    f32{{$}}
+; CHECK-NEXT: .functype select_f32_ne (i32, f32, f32) -> (f32){{$}}
 ; CHECK-NEXT: f32.select $push0=, $1, $2, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define float @select_f32_ne(i32 %a, float %b, float %c) {
@@ -133,8 +121,7 @@ define float @select_f32_ne(i32 %a, float %b, float %c) {
 }
 
 ; CHECK-LABEL: select_f64_bool:
-; CHECK-NEXT: .param     i32, f64, f64{{$}}
-; CHECK-NEXT: .result    f64{{$}}
+; CHECK-NEXT: .functype select_f64_bool (i32, f64, f64) -> (f64){{$}}
 ; CHECK-NEXT: f64.select $push0=, $1, $2, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define double @select_f64_bool(i1 zeroext %a, double %b, double %c) {
@@ -143,8 +130,7 @@ define double @select_f64_bool(i1 zeroext %a, double %b, double %c) {
 }
 
 ; CHECK-LABEL: select_f64_bool_nozext:
-; CHECK-NEXT: .param     i32, f64, f64{{$}}
-; CHECK-NEXT: .result    f64{{$}}
+; CHECK-NEXT: .functype select_f64_bool_nozext (i32, f64, f64) -> (f64){{$}}
 ; SLOW-NEXT: f64.select $push0=, $1, $2, $0{{$}}
 ; SLOW-NEXT: return     $pop0{{$}}
 define double @select_f64_bool_nozext(i1 %a, double %b, double %c) {
@@ -153,8 +139,7 @@ define double @select_f64_bool_nozext(i1 %a, double %b, double %c) {
 }
 
 ; CHECK-LABEL: select_f64_eq:
-; CHECK-NEXT: .param     i32, f64, f64{{$}}
-; CHECK-NEXT: .result    f64{{$}}
+; CHECK-NEXT: .functype select_f64_eq (i32, f64, f64) -> (f64){{$}}
 ; CHECK-NEXT: f64.select $push0=, $2, $1, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define double @select_f64_eq(i32 %a, double %b, double %c) {
@@ -164,8 +149,7 @@ define double @select_f64_eq(i32 %a, double %b, double %c) {
 }
 
 ; CHECK-LABEL: select_f64_ne:
-; CHECK-NEXT: .param     i32, f64, f64{{$}}
-; CHECK-NEXT: .result    f64{{$}}
+; CHECK-NEXT: .functype select_f64_ne (i32, f64, f64) -> (f64){{$}}
 ; CHECK-NEXT: f64.select $push0=, $1, $2, $0{{$}}
 ; CHECK-NEXT: return     $pop0{{$}}
 define double @select_f64_ne(i32 %a, double %b, double %c) {
diff --git a/test/CodeGen/WebAssembly/signext-inreg.ll b/test/CodeGen/WebAssembly/signext-inreg.ll
index 2cc53eb91bdae802c25fd35c140248272bc699af..b6e2c763740931c4123006f37f6c7294949bfece 100644
--- a/test/CodeGen/WebAssembly/signext-inreg.ll
+++ b/test/CodeGen/WebAssembly/signext-inreg.ll
@@ -5,8 +5,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: i32_extend8_s:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_extend8_s (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.extend8_s $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 
@@ -18,8 +17,7 @@ define i32 @i32_extend8_s(i8 %x) {
 }
 
 ; CHECK-LABEL: i32_extend16_s:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype i32_extend16_s (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.extend16_s $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 
@@ -31,8 +29,7 @@ define i32 @i32_extend16_s(i16 %x) {
 }
 
 ; CHECK-LABEL: i64_extend8_s:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_extend8_s (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.extend_u/i32 $push[[NUM1:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: i64.extend8_s $push[[NUM2:[0-9]+]]=, $pop[[NUM1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM2]]{{$}}
@@ -45,8 +42,7 @@ define i64 @i64_extend8_s(i8 %x) {
 }
 
 ; CHECK-LABEL: i64_extend16_s:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_extend16_s (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.extend_u/i32 $push[[NUM1:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: i64.extend16_s $push[[NUM2:[0-9]+]]=, $pop[[NUM1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM2]]{{$}}
@@ -60,8 +56,7 @@ define i64 @i64_extend16_s(i16 %x) {
 
 ; No SIGN_EXTEND_INREG is needed for 32->64 extension.
 ; CHECK-LABEL: i64_extend32_s:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: .functype i64_extend32_s (i32) -> (i64){{$}}
 ; CHECK-NEXT: i64.extend_s/i32 $push[[NUM:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @i64_extend32_s(i32 %x) {
diff --git a/test/CodeGen/WebAssembly/signext-zeroext.ll b/test/CodeGen/WebAssembly/signext-zeroext.ll
index 0c13a0f433eeb37259da254db5328ce58b7b604f..e5e69dfd01d1c7a397971a6375efc8c1c8adc91f 100644
--- a/test/CodeGen/WebAssembly/signext-zeroext.ll
+++ b/test/CodeGen/WebAssembly/signext-zeroext.ll
@@ -6,8 +6,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: z2s_func:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype z2s_func (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM0:[0-9]+]]=, 24{{$}}
 ; CHECK-NEXT: i32.shl $push[[NUM2:[0-9]+]]=, $0, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT: i32.const $push[[NUM1:[0-9]+]]=, 24{{$}}
@@ -18,8 +17,7 @@ define signext i8 @z2s_func(i8 zeroext %t) {
 }
 
 ; CHECK-LABEL: s2z_func:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype s2z_func (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM0:[0-9]+]]=, 255{{$}}
 ; CHECK-NEXT: i32.and $push[[NUM1:[0-9]+]]=, $0, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM1]]{{$}}
@@ -28,8 +26,7 @@ define zeroext i8 @s2z_func(i8 signext %t) {
 }
 
 ; CHECK-LABEL: z2s_call:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype z2s_call (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM0:[0-9]+]]=, 255{{$}}
 ; CHECK-NEXT: i32.and $push[[NUM1:[0-9]+]]=, $0, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT: call $push[[NUM2:[0-9]+]]=, z2s_func@FUNCTION, $pop[[NUM1]]{{$}}
@@ -42,8 +39,7 @@ define i32 @z2s_call(i32 %t) {
 }
 
 ; CHECK-LABEL: s2z_call:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype s2z_call (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push[[NUM0:[0-9]+]]=, 24{{$}}
 ; CHECK-NEXT: i32.shl $push[[NUM1:[0-9]+]]=, $0, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT: i32.const $push[[NUM6:[0-9]+]]=, 24{{$}}
diff --git a/test/CodeGen/WebAssembly/simd-arith.ll b/test/CodeGen/WebAssembly/simd-arith.ll
index e092cd98ecb7575bb98c4e9c8b4991343b18a145..6d3d04c9e494db9597a2fdb5ea02971df0e47032 100644
--- a/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/test/CodeGen/WebAssembly/simd-arith.ll
@@ -19,8 +19,7 @@ target triple = "wasm32-unknown-unknown"
 ; ==============================================================================
 ; CHECK-LABEL: add_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.add $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
@@ -30,8 +29,7 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: sub_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
@@ -41,8 +39,7 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: mul_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype mul_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
@@ -52,8 +49,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: neg_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype neg_v16i8 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.neg $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @neg_v16i8(<16 x i8> %x) {
@@ -65,8 +61,7 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) {
 
 ; CHECK-LABEL: shl_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_v16i8 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) {
@@ -80,8 +75,7 @@ define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) {
 
 ; CHECK-LABEL: shl_const_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_const_v16i8 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5
 ; SIMD128-NEXT: i8x16.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -94,8 +88,7 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
 
 ; CHECK-LABEL: shl_vec_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_vec_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -113,8 +106,7 @@ define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
 
 ; CHECK-LABEL: shr_s_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_v16i8 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
@@ -128,8 +120,7 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
 
 ; CHECK-LABEL: shr_s_vec_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_vec_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 24{{$}}
 ; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -155,8 +146,7 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
 
 ; CHECK-LABEL: shr_u_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_v16i8 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
@@ -170,8 +160,7 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
 
 ; CHECK-LABEL: shr_u_vec_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_vec_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -189,8 +178,7 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
 
 ; CHECK-LABEL: and_v16i8:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype and_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
@@ -200,8 +188,7 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: or_v16i8:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype or_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
@@ -211,8 +198,7 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: xor_v16i8:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype xor_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.xor $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
@@ -222,8 +208,7 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: not_v16i8:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype not_v16i8 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @not_v16i8(<16 x i8> %x) {
@@ -236,8 +221,7 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) {
 
 ; CHECK-LABEL: bitselect_v16i8:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v16i8 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 ; SIMD128-FAST-NEXT: v128.and
@@ -260,8 +244,7 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
 ; ==============================================================================
 ; CHECK-LABEL: add_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.add $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
@@ -271,8 +254,7 @@ define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: sub_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
@@ -282,8 +264,7 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: mul_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype mul_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
@@ -293,8 +274,7 @@ define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: neg_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype neg_v8i16 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.neg $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @neg_v8i16(<8 x i16> %x) {
@@ -305,8 +285,7 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) {
 
 ; CHECK-LABEL: shl_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_v8i16 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) {
@@ -319,8 +298,7 @@ define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) {
 
 ; CHECK-LABEL: shl_const_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_const_v8i16 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5
 ; SIMD128-NEXT: i16x8.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -332,8 +310,7 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
 
 ; CHECK-LABEL: shl_vec_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_vec_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -351,8 +328,7 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
 
 ; CHECK-LABEL: shr_s_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_v8i16 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
@@ -365,8 +341,7 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
 
 ; CHECK-LABEL: shr_s_vec_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_vec_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -392,8 +367,7 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
 
 ; CHECK-LABEL: shr_u_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_v8i16 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
@@ -406,8 +380,7 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
 
 ; CHECK-LABEL: shr_u_vec_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_vec_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -425,8 +398,7 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
 
 ; CHECK-LABEL: and_v8i16:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype and_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
@@ -436,8 +408,7 @@ define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: or_v8i16:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype or_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
@@ -447,8 +418,7 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: xor_v8i16:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype xor_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.xor $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
@@ -458,8 +428,7 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: not_v8i16:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype not_v8i16 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @not_v8i16(<8 x i16> %x) {
@@ -470,8 +439,7 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) {
 
 ; CHECK-LABEL: bitselect_v8i16:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v8i16 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 ; SIMD128-FAST-NEXT: v128.and
@@ -494,8 +462,7 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
 ; ==============================================================================
 ; CHECK-LABEL: add_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.add $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
@@ -505,8 +472,7 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: sub_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
@@ -516,8 +482,7 @@ define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: mul_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype mul_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
@@ -527,8 +492,7 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: neg_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype neg_v4i32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.neg $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @neg_v4i32(<4 x i32> %x) {
@@ -538,8 +502,7 @@ define <4 x i32> @neg_v4i32(<4 x i32> %x) {
 
 ; CHECK-LABEL: shl_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_v4i32 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
@@ -552,8 +515,7 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
 
 ; CHECK-LABEL: shl_const_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_const_v4i32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5
 ; SIMD128-NEXT: i32x4.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -564,8 +526,7 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
 
 ; CHECK-LABEL: shl_vec_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_vec_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -583,8 +544,7 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
 
 ; CHECK-LABEL: shr_s_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_v4i32 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
@@ -597,8 +557,7 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
 
 ; CHECK-LABEL: shr_s_vec_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_vec_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i32.shr_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -616,8 +575,7 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
 
 ; CHECK-LABEL: shr_u_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_v4i32 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
@@ -630,8 +588,7 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
 
 ; CHECK-LABEL: shr_u_vec_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_vec_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -649,8 +606,7 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
 
 ; CHECK-LABEL: and_v4i32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype and_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
@@ -660,8 +616,7 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: or_v4i32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype or_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
@@ -671,8 +626,7 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: xor_v4i32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype xor_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.xor $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
@@ -682,8 +636,7 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: not_v4i32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype not_v4i32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @not_v4i32(<4 x i32> %x) {
@@ -693,8 +646,7 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) {
 
 ; CHECK-LABEL: bitselect_v4i32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v4i32 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 ; SIMD128-FAST-NEXT: v128.not
@@ -716,8 +668,7 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
 ; CHECK-LABEL: add_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.add $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @add_v2i64(<2 x i64> %x, <2 x i64> %y) {
@@ -728,8 +679,7 @@ define <2 x i64> @add_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: sub_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @sub_v2i64(<2 x i64> %x, <2 x i64> %y) {
@@ -751,8 +701,7 @@ define <2 x i64> @mul_v2i64(<2 x i64> %x, <2 x i64> %y) {
 
 ; CHECK-LABEL: neg_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype neg_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.neg $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @neg_v2i64(<2 x i64> %x) {
@@ -762,8 +711,7 @@ define <2 x i64> @neg_v2i64(<2 x i64> %x) {
 
 ; CHECK-LABEL: shl_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_v2i64 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shl_v2i64(<2 x i64> %v, i32 %x) {
@@ -776,8 +724,7 @@ define <2 x i64> @shl_v2i64(<2 x i64> %v, i32 %x) {
 
 ; CHECK-LABEL: shl_nozext_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, i64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_nozext_v2i64 (v128, i64) -> (v128){{$}}
 ; SIMD128-NEXT: i32.wrap/i64 $push[[L0:[0-9]+]]=, $1{{$}}
 ; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -790,8 +737,7 @@ define <2 x i64> @shl_nozext_v2i64(<2 x i64> %v, i64 %x) {
 
 ; CHECK-LABEL: shl_const_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_const_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
 ; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -802,8 +748,7 @@ define <2 x i64> @shl_const_v2i64(<2 x i64> %v) {
 
 ; CHECK-LABEL: shl_vec_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shl_vec_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i64.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -820,8 +765,7 @@ define <2 x i64> @shl_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
 
 ; CHECK-LABEL: shr_s_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_v2i64 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_s_v2i64(<2 x i64> %v, i32 %x) {
@@ -834,8 +778,7 @@ define <2 x i64> @shr_s_v2i64(<2 x i64> %v, i32 %x) {
 
 ; CHECK-LABEL: shr_s_nozext_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, i64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_nozext_v2i64 (v128, i64) -> (v128){{$}}
 ; SIMD128-NEXT: i32.wrap/i64 $push[[L0:[0-9]+]]=, $1{{$}}
 ; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -848,8 +791,7 @@ define <2 x i64> @shr_s_nozext_v2i64(<2 x i64> %v, i64 %x) {
 
 ; CHECK-LABEL: shr_s_const_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_const_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
 ; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -860,8 +802,7 @@ define <2 x i64> @shr_s_const_v2i64(<2 x i64> %v) {
 
 ; CHECK-LABEL: shr_s_vec_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_s_vec_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i64.shr_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -878,8 +819,7 @@ define <2 x i64> @shr_s_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
 
 ; CHECK-LABEL: shr_u_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_v2i64 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_u_v2i64(<2 x i64> %v, i32 %x) {
@@ -892,8 +832,7 @@ define <2 x i64> @shr_u_v2i64(<2 x i64> %v, i32 %x) {
 
 ; CHECK-LABEL: shr_u_nozext_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, i64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_nozext_v2i64 (v128, i64) -> (v128){{$}}
 ; SIMD128-NEXT: i32.wrap/i64 $push[[L0:[0-9]+]]=, $1{{$}}
 ; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -906,8 +845,7 @@ define <2 x i64> @shr_u_nozext_v2i64(<2 x i64> %v, i64 %x) {
 
 ; CHECK-LABEL: shr_u_const_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_const_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
 ; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -918,8 +856,7 @@ define <2 x i64> @shr_u_const_v2i64(<2 x i64> %v) {
 
 ; CHECK-LABEL: shr_u_vec_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shr_u_vec_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
 ; SIMD128-NEXT: i64.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -937,8 +874,7 @@ define <2 x i64> @shr_u_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
 ; CHECK-LABEL: and_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype and_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @and_v2i64(<2 x i64> %x, <2 x i64> %y) {
@@ -949,8 +885,7 @@ define <2 x i64> @and_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: or_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype or_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @or_v2i64(<2 x i64> %x, <2 x i64> %y) {
@@ -961,8 +896,7 @@ define <2 x i64> @or_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: xor_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype xor_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.xor $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @xor_v2i64(<2 x i64> %x, <2 x i64> %y) {
@@ -973,8 +907,7 @@ define <2 x i64> @xor_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: not_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype not_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @not_v2i64(<2 x i64> %x) {
@@ -985,8 +918,7 @@ define <2 x i64> @not_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: bitselect_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v2i64 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 ; SIMD128-FAST-NEXT: v128.not
@@ -1007,8 +939,7 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
 ; ==============================================================================
 ; CHECK-LABEL: neg_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype neg_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.neg $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @neg_v4f32(<4 x float> %x) {
@@ -1019,8 +950,7 @@ define <4 x float> @neg_v4f32(<4 x float> %x) {
 
 ; CHECK-LABEL: abs_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype abs_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.abs $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
@@ -1031,8 +961,7 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
 
 ; CHECK-LABEL: min_unordered_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype min_unordered_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
 ; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
@@ -1046,8 +975,7 @@ define <4 x float> @min_unordered_v4f32(<4 x float> %x) {
 
 ; CHECK-LABEL: max_unordered_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype max_unordered_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
 ; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
@@ -1061,8 +989,7 @@ define <4 x float> @max_unordered_v4f32(<4 x float> %x) {
 
 ; CHECK-LABEL: min_ordered_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype min_ordered_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
 ; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
@@ -1076,8 +1003,7 @@ define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
 
 ; CHECK-LABEL: max_ordered_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype max_ordered_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
 ; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
@@ -1091,8 +1017,7 @@ define <4 x float> @max_ordered_v4f32(<4 x float> %x) {
 
 ; CHECK-LABEL: min_intrinsic_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype min_intrinsic_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
@@ -1103,8 +1028,7 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: max_intrinsic_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype max_intrinsic_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
@@ -1115,7 +1039,7 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: min_const_intrinsic_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype min_const_intrinsic_v4f32 () -> (v128){{$}}
 ; SIMD128-NEXT: f32.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}}
 ; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1129,7 +1053,7 @@ define <4 x float> @min_const_intrinsic_v4f32() {
 
 ; CHECK-LABEL: max_const_intrinsic_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype max_const_intrinsic_v4f32 () -> (v128){{$}}
 ; SIMD128-NEXT: f32.const $push[[L:[0-9]+]]=, 0x1.5p5{{$}}
 ; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1143,8 +1067,7 @@ define <4 x float> @max_const_intrinsic_v4f32() {
 
 ; CHECK-LABEL: add_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.add $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
@@ -1154,8 +1077,7 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: sub_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
@@ -1165,8 +1087,7 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: div_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype div_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.div $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
@@ -1176,8 +1097,7 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: mul_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype mul_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
@@ -1187,8 +1107,7 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: sqrt_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sqrt_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.sqrt $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
@@ -1202,8 +1121,7 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) {
 ; ==============================================================================
 ; CHECK-LABEL: neg_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype neg_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.neg $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @neg_v2f64(<2 x double> %x) {
@@ -1214,8 +1132,7 @@ define <2 x double> @neg_v2f64(<2 x double> %x) {
 
 ; CHECK-LABEL: abs_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype abs_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.abs $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
@@ -1226,8 +1143,7 @@ define <2 x double> @abs_v2f64(<2 x double> %x) {
 
 ; CHECK-LABEL: min_unordered_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype min_unordered_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
 ; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
@@ -1241,8 +1157,7 @@ define <2 x double> @min_unordered_v2f64(<2 x double> %x) {
 
 ; CHECK-LABEL: max_unordered_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype max_unordered_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
 ; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
@@ -1256,8 +1171,7 @@ define <2 x double> @max_unordered_v2f64(<2 x double> %x) {
 
 ; CHECK-LABEL: min_ordered_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype min_ordered_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
 ; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
@@ -1271,8 +1185,7 @@ define <2 x double> @min_ordered_v2f64(<2 x double> %x) {
 
 ; CHECK-LABEL: max_ordered_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype max_ordered_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
 ; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
@@ -1286,8 +1199,7 @@ define <2 x double> @max_ordered_v2f64(<2 x double> %x) {
 
 ; CHECK-LABEL: min_intrinsic_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype min_intrinsic_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
@@ -1298,8 +1210,7 @@ define <2 x double> @min_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
 
 ; CHECK-LABEL: max_intrinsic_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype max_intrinsic_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
@@ -1310,7 +1221,7 @@ define <2 x double> @max_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
 
 ; CHECK-LABEL: min_const_intrinsic_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype min_const_intrinsic_v2f64 () -> (v128){{$}}
 ; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}}
 ; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1324,7 +1235,7 @@ define <2 x double> @min_const_intrinsic_v2f64() {
 
 ; CHECK-LABEL: max_const_intrinsic_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype max_const_intrinsic_v2f64 () -> (v128){{$}}
 ; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.5p5{{$}}
 ; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1339,8 +1250,7 @@ define <2 x double> @max_const_intrinsic_v2f64() {
 ; CHECK-LABEL: add_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f62x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.add $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @add_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -1351,8 +1261,7 @@ define <2 x double> @add_v2f64(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: sub_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f62x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @sub_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -1363,8 +1272,7 @@ define <2 x double> @sub_v2f64(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: div_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f62x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype div_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.div $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @div_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -1375,8 +1283,7 @@ define <2 x double> @div_v2f64(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: mul_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f62x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype mul_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @mul_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -1386,8 +1293,7 @@ define <2 x double> @mul_v2f64(<2 x double> %x, <2 x double> %y) {
 
 ; CHECK-LABEL: sqrt_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sqrt_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.sqrt $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
diff --git a/test/CodeGen/WebAssembly/simd-comparisons.ll b/test/CodeGen/WebAssembly/simd-comparisons.ll
index 5f0a1e93b45bf04de735eb3a61be8a8bfb5572a6..5e4c51ab6aa9b4984ead2c3346deade1799537db 100644
--- a/test/CodeGen/WebAssembly/simd-comparisons.ll
+++ b/test/CodeGen/WebAssembly/simd-comparisons.ll
@@ -9,8 +9,7 @@ target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: compare_eq_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_eq_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_eq_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -20,8 +19,7 @@ define <16 x i1> @compare_eq_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_eq_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_eq_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_eq_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -32,8 +30,7 @@ define <16 x i8> @compare_sext_eq_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_ne_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ne_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_ne_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -43,8 +40,7 @@ define <16 x i1> @compare_ne_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_ne_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ne_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_ne_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -55,8 +51,7 @@ define <16 x i8> @compare_sext_ne_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_slt_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_slt_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_slt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -66,8 +61,7 @@ define <16 x i1> @compare_slt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_slt_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_slt_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_slt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -78,8 +72,7 @@ define <16 x i8> @compare_sext_slt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_ult_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ult_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_ult_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -89,8 +82,7 @@ define <16 x i1> @compare_ult_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_ult_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ult_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_ult_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -101,8 +93,7 @@ define <16 x i8> @compare_sext_ult_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sle_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sle_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_sle_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -112,8 +103,7 @@ define <16 x i1> @compare_sle_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_sle_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sle_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_sle_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -124,8 +114,7 @@ define <16 x i8> @compare_sext_sle_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_ule_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ule_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_ule_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -135,8 +124,7 @@ define <16 x i1> @compare_ule_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_ule_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ule_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_ule_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -147,8 +135,7 @@ define <16 x i8> @compare_sext_ule_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sgt_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sgt_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_sgt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -158,8 +145,7 @@ define <16 x i1> @compare_sgt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_sgt_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sgt_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_sgt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -170,8 +156,7 @@ define <16 x i8> @compare_sext_sgt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_ugt_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ugt_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_ugt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -181,8 +166,7 @@ define <16 x i1> @compare_ugt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_ugt_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ugt_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_ugt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -193,8 +177,7 @@ define <16 x i8> @compare_sext_ugt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sge_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sge_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_sge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -204,8 +187,7 @@ define <16 x i1> @compare_sge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_sge_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sge_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_sge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -216,8 +198,7 @@ define <16 x i8> @compare_sext_sge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_uge_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_uge_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i1> @compare_uge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -227,8 +208,7 @@ define <16 x i1> @compare_uge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_sext_uge_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_uge_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @compare_sext_uge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
@@ -239,8 +219,7 @@ define <16 x i8> @compare_sext_uge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: compare_eq_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_eq_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_eq_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -250,8 +229,7 @@ define <8 x i1> @compare_eq_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_eq_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_eq_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_eq_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -262,8 +240,7 @@ define <8 x i16> @compare_sext_eq_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_ne_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ne_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_ne_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -273,8 +250,7 @@ define <8 x i1> @compare_ne_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_ne_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ne_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_ne_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -285,8 +261,7 @@ define <8 x i16> @compare_sext_ne_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_slt_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_slt_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_slt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -296,8 +271,7 @@ define <8 x i1> @compare_slt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_slt_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_slt_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_slt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -308,8 +282,7 @@ define <8 x i16> @compare_sext_slt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_ult_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ult_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_ult_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -319,8 +292,7 @@ define <8 x i1> @compare_ult_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_ult_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ult_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_ult_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -331,8 +303,7 @@ define <8 x i16> @compare_sext_ult_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sle_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sle_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_sle_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -342,8 +313,7 @@ define <8 x i1> @compare_sle_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_sle_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sle_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_sle_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -354,8 +324,7 @@ define <8 x i16> @compare_sext_sle_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_ule_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ule_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_ule_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -365,8 +334,7 @@ define <8 x i1> @compare_ule_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_ule_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ule_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_ule_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -377,8 +345,7 @@ define <8 x i16> @compare_sext_ule_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sgt_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sgt_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_sgt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -388,8 +355,7 @@ define <8 x i1> @compare_sgt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_sgt_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sgt_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_sgt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -400,8 +366,7 @@ define <8 x i16> @compare_sext_sgt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_ugt_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ugt_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_ugt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -411,8 +376,7 @@ define <8 x i1> @compare_ugt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_ugt_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ugt_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_ugt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -423,8 +387,7 @@ define <8 x i16> @compare_sext_ugt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sge_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sge_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_sge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -434,8 +397,7 @@ define <8 x i1> @compare_sge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_sge_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sge_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_sge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -446,8 +408,7 @@ define <8 x i16> @compare_sext_sge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_uge_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_uge_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i1> @compare_uge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -457,8 +418,7 @@ define <8 x i1> @compare_uge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_sext_uge_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_uge_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @compare_sext_uge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
@@ -469,8 +429,7 @@ define <8 x i16> @compare_sext_uge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: compare_eq_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_eq_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_eq_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -480,8 +439,7 @@ define <4 x i1> @compare_eq_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_eq_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_eq_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_eq_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -492,8 +450,7 @@ define <4 x i32> @compare_sext_eq_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_ne_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ne_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ne_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -503,8 +460,7 @@ define <4 x i1> @compare_ne_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_ne_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ne_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_ne_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -515,8 +471,7 @@ define <4 x i32> @compare_sext_ne_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_slt_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_slt_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_slt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -526,8 +481,7 @@ define <4 x i1> @compare_slt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_slt_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_slt_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_slt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -538,8 +492,7 @@ define <4 x i32> @compare_sext_slt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_ult_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ult_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ult_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -549,8 +502,7 @@ define <4 x i1> @compare_ult_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_ult_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ult_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_ult_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -561,8 +513,7 @@ define <4 x i32> @compare_sext_ult_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sle_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sle_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_sle_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -572,8 +523,7 @@ define <4 x i1> @compare_sle_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_sle_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sle_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_sle_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -584,8 +534,7 @@ define <4 x i32> @compare_sext_sle_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_ule_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ule_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ule_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -595,8 +544,7 @@ define <4 x i1> @compare_ule_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_ule_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ule_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_ule_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -607,8 +555,7 @@ define <4 x i32> @compare_sext_ule_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sgt_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sgt_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_sgt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -618,8 +565,7 @@ define <4 x i1> @compare_sgt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_sgt_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sgt_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_sgt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -630,8 +576,7 @@ define <4 x i32> @compare_sext_sgt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_ugt_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ugt_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ugt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -641,8 +586,7 @@ define <4 x i1> @compare_ugt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_ugt_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ugt_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_ugt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -653,8 +597,7 @@ define <4 x i32> @compare_sext_ugt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sge_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sge_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_sge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -664,8 +607,7 @@ define <4 x i1> @compare_sge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_sge_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_sge_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_sge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -676,8 +618,7 @@ define <4 x i32> @compare_sext_sge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_uge_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_uge_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_uge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -687,8 +628,7 @@ define <4 x i1> @compare_uge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_sext_uge_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_uge_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_uge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
@@ -699,8 +639,7 @@ define <4 x i32> @compare_sext_uge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: compare_oeq_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_oeq_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_oeq_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -710,8 +649,7 @@ define <4 x i1> @compare_oeq_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_oeq_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_oeq_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_oeq_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -722,8 +660,7 @@ define <4 x i32> @compare_sext_oeq_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_ogt_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ogt_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.gt $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ogt_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -733,8 +670,7 @@ define <4 x i1> @compare_ogt_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_ogt_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ogt_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.gt $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_ogt_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -745,8 +681,7 @@ define <4 x i32> @compare_sext_ogt_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_oge_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_oge_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ge $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_oge_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -756,8 +691,7 @@ define <4 x i1> @compare_oge_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_oge_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_oge_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ge $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_oge_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -768,8 +702,7 @@ define <4 x i32> @compare_sext_oge_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_olt_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_olt_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.lt $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_olt_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -779,8 +712,7 @@ define <4 x i1> @compare_olt_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_olt_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_olt_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.lt $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_olt_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -791,8 +723,7 @@ define <4 x i32> @compare_sext_olt_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_ole_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ole_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.le $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ole_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -802,8 +733,7 @@ define <4 x i1> @compare_ole_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_ole_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ole_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.le $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_ole_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -814,8 +744,7 @@ define <4 x i32> @compare_sext_ole_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_one_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_one_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
@@ -829,8 +758,7 @@ define <4 x i1> @compare_one_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_one_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_one_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
@@ -845,8 +773,7 @@ define <4 x i32> @compare_sext_one_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_ord_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ord_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
 ; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
@@ -858,8 +785,7 @@ define <4 x i1> @compare_ord_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_ord_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ord_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
 ; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
@@ -872,8 +798,7 @@ define <4 x i32> @compare_sext_ord_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_ueq_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ueq_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
@@ -887,8 +812,7 @@ define <4 x i1> @compare_ueq_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_ueq_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ueq_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
@@ -903,8 +827,7 @@ define <4 x i32> @compare_sext_ueq_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_ugt_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ugt_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -915,8 +838,7 @@ define <4 x i1> @compare_ugt_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_ugt_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ugt_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -928,8 +850,7 @@ define <4 x i32> @compare_sext_ugt_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_uge_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_uge_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -940,8 +861,7 @@ define <4 x i1> @compare_uge_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_uge_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_uge_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -953,8 +873,7 @@ define <4 x i32> @compare_sext_uge_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_ult_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ult_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -965,8 +884,7 @@ define <4 x i1> @compare_ult_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_ult_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ult_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -978,8 +896,7 @@ define <4 x i32> @compare_sext_ult_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_ule_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ule_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -990,8 +907,7 @@ define <4 x i1> @compare_ule_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_ule_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ule_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1003,8 +919,7 @@ define <4 x i32> @compare_sext_ule_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_une_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_une_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_une_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -1014,8 +929,7 @@ define <4 x i1> @compare_une_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_une_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_une_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @compare_sext_une_v4f32 (<4 x float> %x, <4 x float> %y) {
@@ -1026,8 +940,7 @@ define <4 x i32> @compare_sext_une_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_uno_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_uno_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
 ; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
@@ -1039,8 +952,7 @@ define <4 x i1> @compare_uno_v4f32 (<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: compare_sext_uno_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_uno_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
 ; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
@@ -1054,8 +966,7 @@ define <4 x i32> @compare_sext_uno_v4f32 (<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: compare_oeq_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_oeq_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_oeq_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1066,8 +977,7 @@ define <2 x i1> @compare_oeq_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_oeq_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_oeq_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @compare_sext_oeq_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1079,8 +989,7 @@ define <2 x i64> @compare_sext_oeq_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_ogt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ogt_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.gt $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ogt_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1091,8 +1000,7 @@ define <2 x i1> @compare_ogt_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_ogt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ogt_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.gt $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @compare_sext_ogt_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1104,8 +1012,7 @@ define <2 x i64> @compare_sext_ogt_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_oge_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_oge_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ge $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_oge_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1116,8 +1023,7 @@ define <2 x i1> @compare_oge_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_oge_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_oge_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ge $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @compare_sext_oge_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1129,8 +1035,7 @@ define <2 x i64> @compare_sext_oge_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_olt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_olt_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.lt $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_olt_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1141,8 +1046,7 @@ define <2 x i1> @compare_olt_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_olt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_olt_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.lt $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @compare_sext_olt_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1154,8 +1058,7 @@ define <2 x i64> @compare_sext_olt_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_ole_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ole_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.le $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ole_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1166,8 +1069,7 @@ define <2 x i1> @compare_ole_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_ole_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ole_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.le $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @compare_sext_ole_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1179,8 +1081,7 @@ define <2 x i64> @compare_sext_ole_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_one_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_one_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
@@ -1195,8 +1096,7 @@ define <2 x i1> @compare_one_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_one_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_one_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
@@ -1212,8 +1112,7 @@ define <2 x i64> @compare_sext_one_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_ord_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ord_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
 ; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
@@ -1226,8 +1125,7 @@ define <2 x i1> @compare_ord_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_ord_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ord_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
 ; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
@@ -1241,8 +1139,7 @@ define <2 x i64> @compare_sext_ord_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_ueq_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ueq_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
@@ -1257,8 +1154,7 @@ define <2 x i1> @compare_ueq_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_ueq_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ueq_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
@@ -1274,8 +1170,7 @@ define <2 x i64> @compare_sext_ueq_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_ugt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ugt_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1287,8 +1182,7 @@ define <2 x i1> @compare_ugt_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_ugt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ugt_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1301,8 +1195,7 @@ define <2 x i64> @compare_sext_ugt_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_uge_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_uge_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1314,8 +1207,7 @@ define <2 x i1> @compare_uge_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_uge_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_uge_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1328,8 +1220,7 @@ define <2 x i64> @compare_sext_uge_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_ult_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ult_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1341,8 +1232,7 @@ define <2 x i1> @compare_ult_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_ult_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ult_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1355,8 +1245,7 @@ define <2 x i64> @compare_sext_ult_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_ule_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_ule_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1368,8 +1257,7 @@ define <2 x i1> @compare_ule_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_ule_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_ule_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1382,8 +1270,7 @@ define <2 x i64> @compare_sext_ule_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_une_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_une_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_une_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1394,8 +1281,7 @@ define <2 x i1> @compare_une_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_une_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_une_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @compare_sext_une_v2f64 (<2 x double> %x, <2 x double> %y) {
@@ -1407,8 +1293,7 @@ define <2 x i64> @compare_sext_une_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_uno_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_uno_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
 ; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
@@ -1421,8 +1306,7 @@ define <2 x i1> @compare_uno_v2f64 (<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: compare_sext_uno_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype compare_sext_uno_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
 ; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
 ; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/simd-conversions.ll b/test/CodeGen/WebAssembly/simd-conversions.ll
index 532e3e626b2c63052907a345ba7f76d510b0ed5c..9c3f80fa16ad3651c7a7dc5b677f132786680c5e 100644
--- a/test/CodeGen/WebAssembly/simd-conversions.ll
+++ b/test/CodeGen/WebAssembly/simd-conversions.ll
@@ -9,8 +9,7 @@ target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: convert_s_v4f32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype convert_s_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.convert_s/i32x4 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 define <4 x float> @convert_s_v4f32(<4 x i32> %x) {
@@ -20,8 +19,7 @@ define <4 x float> @convert_s_v4f32(<4 x i32> %x) {
 
 ; CHECK-LABEL: convert_u_v4f32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype convert_u_v4f32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.convert_u/i32x4 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 define <4 x float> @convert_u_v4f32(<4 x i32> %x) {
@@ -32,8 +30,7 @@ define <4 x float> @convert_u_v4f32(<4 x i32> %x) {
 ; CHECK-LABEL: convert_s_v2f64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: f64x2.convert_s/i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype convert_s_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.convert_s/i64x2 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 define <2 x double> @convert_s_v2f64(<2 x i64> %x) {
@@ -44,8 +41,7 @@ define <2 x double> @convert_s_v2f64(<2 x i64> %x) {
 ; CHECK-LABEL: convert_u_v2f64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: f64x2.convert_u/i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype convert_u_v2f64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.convert_u/i64x2 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 define <2 x double> @convert_u_v2f64(<2 x i64> %x) {
@@ -55,8 +51,7 @@ define <2 x double> @convert_u_v2f64(<2 x i64> %x) {
 
 ; CHECK-LABEL: trunc_sat_s_v4i32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype trunc_sat_s_v4i32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.trunc_sat_s/f32x4 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) {
@@ -66,8 +61,7 @@ define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) {
 
 ; CHECK-LABEL: trunc_sat_u_v4i32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype trunc_sat_u_v4i32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.trunc_sat_u/f32x4 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
@@ -78,8 +72,7 @@ define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
 ; CHECK-LABEL: trunc_sat_s_v2i64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: i64x2.trunc_sat_s/f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype trunc_sat_s_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.trunc_sat_s/f64x2 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 define <2 x i64> @trunc_sat_s_v2i64(<2 x double> %x) {
@@ -90,8 +83,7 @@ define <2 x i64> @trunc_sat_s_v2i64(<2 x double> %x) {
 ; CHECK-LABEL: trunc_sat_u_v2i64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: i64x2.trunc_sat_u/f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype trunc_sat_u_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.trunc_sat_u/f64x2 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 define <2 x i64> @trunc_sat_u_v2i64(<2 x double> %x) {
diff --git a/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll b/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll
index f128483cb9add23c16e95bc87cf9a25d3135af27..cabb96ab9cd604a8a2546284216962f2d5c902d1 100644
--- a/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll
+++ b/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll
@@ -7,8 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: load_ext_2xi32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_ext_2xi32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: i64.load32_u $push[[L0:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: i64.load32_u $push[[L2:[0-9]+]]=, 4($0){{$}}
@@ -20,8 +19,7 @@ define <2 x i32> @load_ext_2xi32(<2 x i32>* %p) {
 }
 
 ; CHECK-LABEL: load_zext_2xi32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_zext_2xi32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: i64.load32_u $push[[L0:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: i64.load32_u $push[[L2:[0-9]+]]=, 4($0){{$}}
@@ -34,8 +32,7 @@ define <2 x i64> @load_zext_2xi32(<2 x i32>* %p) {
 }
 
 ; CHECK-LABEL: load_sext_2xi32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_sext_2xi32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: i64.load32_s $push[[L0:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: i64.load32_s $push[[L2:[0-9]+]]=, 4($0){{$}}
@@ -48,7 +45,7 @@ define <2 x i64> @load_sext_2xi32(<2 x i32>* %p) {
 }
 
 ; CHECK-LABEL: store_trunc_2xi32:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_trunc_2xi32 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $1, 1
 ; CHECK-NEXT: i64.store32 4($0), $pop[[L0]]
 ; CHECK-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0
diff --git a/test/CodeGen/WebAssembly/simd-intrinsics.ll b/test/CodeGen/WebAssembly/simd-intrinsics.ll
index 1cf990d11d45554394b2c127f05cd8dfe96c17ab..6245efd0a0b68a04fe5b1acdadaa7ebb41a6bb03 100644
--- a/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -12,8 +12,7 @@ target triple = "wasm32-unknown-unknown"
 ; 16 x i8
 ; ==============================================================================
 ; CHECK-LABEL: add_sat_s_v16i8:
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_sat_s_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>)
@@ -23,8 +22,7 @@ define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: add_sat_u_v16i8:
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_sat_u_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>)
@@ -34,8 +32,7 @@ define <16 x i8> @add_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: sub_sat_s_v16i8:
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_sat_s_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.sub_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.sub.saturate.signed.v16i8(<16 x i8>, <16 x i8>)
@@ -47,8 +44,7 @@ define <16 x i8> @sub_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: sub_sat_u_v16i8:
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_sat_u_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.sub_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.sub.saturate.unsigned.v16i8(<16 x i8>, <16 x i8>)
@@ -60,8 +56,7 @@ define <16 x i8> @sub_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: any_v16i8:
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype any_v16i8 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v16i8(<16 x i8>)
@@ -71,8 +66,7 @@ define i32 @any_v16i8(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: all_v16i8:
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype all_v16i8 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.alltrue.v16i8(<16 x i8>)
@@ -82,8 +76,7 @@ define i32 @all_v16i8(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: bitselect_v16i8:
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v16i8 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.bitselect.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
@@ -98,8 +91,7 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
 ; 8 x i16
 ; ==============================================================================
 ; CHECK-LABEL: add_sat_s_v8i16:
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_sat_s_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
@@ -109,8 +101,7 @@ define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: add_sat_u_v8i16:
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype add_sat_u_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>)
@@ -120,8 +111,7 @@ define <8 x i16> @add_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: sub_sat_s_v8i16:
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_sat_s_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.sub_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.sub.saturate.signed.v8i16(<8 x i16>, <8 x i16>)
@@ -133,8 +123,7 @@ define <8 x i16> @sub_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: sub_sat_u_v8i16:
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sub_sat_u_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.sub_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.sub.saturate.unsigned.v8i16(<8 x i16>, <8 x i16>)
@@ -146,8 +135,7 @@ define <8 x i16> @sub_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: any_v8i16:
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype any_v8i16 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v8i16(<8 x i16>)
@@ -157,8 +145,7 @@ define i32 @any_v8i16(<8 x i16> %x) {
 }
 
 ; CHECK-LABEL: all_v8i16:
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype all_v8i16 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.alltrue.v8i16(<8 x i16>)
@@ -168,8 +155,7 @@ define i32 @all_v8i16(<8 x i16> %x) {
 }
 
 ; CHECK-LABEL: bitselect_v8i16:
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v8i16 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.bitselect.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
@@ -184,8 +170,7 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
 ; 4 x i32
 ; ==============================================================================
 ; CHECK-LABEL: any_v4i32:
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype any_v4i32 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v4i32(<4 x i32>)
@@ -195,8 +180,7 @@ define i32 @any_v4i32(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: all_v4i32:
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype all_v4i32 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.alltrue.v4i32(<4 x i32>)
@@ -206,8 +190,7 @@ define i32 @all_v4i32(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: bitselect_v4i32:
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v4i32 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
@@ -220,8 +203,7 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
 
 ; CHECK-LABEL: trunc_sat_s_v4i32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype trunc_sat_s_v4i32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.trunc_sat_s/f32x4 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 declare <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float>)
@@ -232,8 +214,7 @@ define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) {
 
 ; CHECK-LABEL: trunc_sat_u_v4i32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype trunc_sat_u_v4i32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.trunc_sat_u/f32x4 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 declare <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float>)
@@ -246,8 +227,7 @@ define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
 ; 2 x i64
 ; ==============================================================================
 ; CHECK-LABEL: any_v2i64:
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype any_v2i64 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v2i64(<2 x i64>)
@@ -257,8 +237,7 @@ define i32 @any_v2i64(<2 x i64> %x) {
 }
 
 ; CHECK-LABEL: all_v2i64:
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype all_v2i64 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.alltrue.v2i64(<2 x i64>)
@@ -268,8 +247,7 @@ define i32 @all_v2i64(<2 x i64> %x) {
 }
 
 ; CHECK-LABEL: bitselect_v2i64:
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v2i64 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.bitselect.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
@@ -282,8 +260,7 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
 
 ; CHECK-LABEL: trunc_sat_s_v2i64:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype trunc_sat_s_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.trunc_sat_s/f64x2 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 declare <2 x i64> @llvm.wasm.trunc.saturate.signed.v2i64.v2f64(<2 x double>)
@@ -294,8 +271,7 @@ define <2 x i64> @trunc_sat_s_v2i64(<2 x double> %x) {
 
 ; CHECK-LABEL: trunc_sat_u_v2i64:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype trunc_sat_u_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.trunc_sat_u/f64x2 $push[[R:[0-9]+]]=, $0
 ; SIMD128-NEXT: return $pop[[R]]
 declare <2 x i64> @llvm.wasm.trunc.saturate.unsigned.v2i64.v2f64(<2 x double>)
@@ -308,8 +284,7 @@ define <2 x i64> @trunc_sat_u_v2i64(<2 x double> %x) {
 ; 4 x f32
 ; ==============================================================================
 ; CHECK-LABEL: bitselect_v4f32:
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v4f32 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.bitselect.v4f32(<4 x float>, <4 x float>, <4 x float>)
@@ -324,8 +299,7 @@ define <4 x float> @bitselect_v4f32(<4 x float> %c, <4 x float> %v1, <4 x float>
 ; 2 x f64
 ; ==============================================================================
 ; CHECK-LABEL: bitselect_v2f64:
-; SIMD128-NEXT: .param v128, v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype bitselect_v2f64 (v128, v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.bitselect.v2f64(<2 x double>, <2 x double>, <2 x double>)
diff --git a/test/CodeGen/WebAssembly/simd-load-store-alignment.ll b/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
index f113840c0499719fc066228102eab5fc5ffb6fce..50935b610985fac7f8a4330c85caa31ff09e4341 100644
--- a/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
+++ b/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
@@ -10,8 +10,7 @@ target triple = "wasm32-unknown-unknown"
 ; ==============================================================================
 
 ; CHECK-LABEL: load_v16i8_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v16i8_a1 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_a1(<16 x i8> *%p) {
@@ -20,8 +19,7 @@ define <16 x i8> @load_v16i8_a1(<16 x i8> *%p) {
 }
 
 ; CHECK-LABEL: load_v16i8_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v16i8_a4 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_a4(<16 x i8> *%p) {
@@ -32,8 +30,7 @@ define <16 x i8> @load_v16i8_a4(<16 x i8> *%p) {
 ; 16 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: load_v16i8_a16:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v16i8_a16 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_a16(<16 x i8> *%p) {
@@ -44,8 +41,7 @@ define <16 x i8> @load_v16i8_a16(<16 x i8> *%p) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: load_v16i8_a32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v16i8_a32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_a32(<16 x i8> *%p) {
@@ -54,7 +50,7 @@ define <16 x i8> @load_v16i8_a32(<16 x i8> *%p) {
 }
 
 ; CHECK-LABEL: store_v16i8_a1:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v16i8_a1 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v16i8_a1(<16 x i8> *%p, <16 x i8> %v) {
@@ -63,7 +59,7 @@ define void @store_v16i8_a1(<16 x i8> *%p, <16 x i8> %v) {
 }
 
 ; CHECK-LABEL: store_v16i8_a4:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v16i8_a4 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v16i8_a4(<16 x i8> *%p, <16 x i8> %v) {
@@ -74,7 +70,7 @@ define void @store_v16i8_a4(<16 x i8> *%p, <16 x i8> %v) {
 ; 16 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: store_v16i8_a16:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v16i8_a16 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v16i8_a16(<16 x i8> *%p, <16 x i8> %v) {
@@ -85,7 +81,7 @@ define void @store_v16i8_a16(<16 x i8> *%p, <16 x i8> %v) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: store_v16i8_a32:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v16i8_a32 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v16i8_a32(<16 x i8> *%p, <16 x i8> %v) {
@@ -98,8 +94,7 @@ define void @store_v16i8_a32(<16 x i8> *%p, <16 x i8> %v) {
 ; ==============================================================================
 
 ; CHECK-LABEL: load_v8i16_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v8i16_a1 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_a1(<8 x i16> *%p) {
@@ -108,8 +103,7 @@ define <8 x i16> @load_v8i16_a1(<8 x i16> *%p) {
 }
 
 ; CHECK-LABEL: load_v8i16_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v8i16_a4 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_a4(<8 x i16> *%p) {
@@ -120,8 +114,7 @@ define <8 x i16> @load_v8i16_a4(<8 x i16> *%p) {
 ; 8 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: load_v8i16_a16:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v8i16_a16 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_a16(<8 x i16> *%p) {
@@ -132,8 +125,7 @@ define <8 x i16> @load_v8i16_a16(<8 x i16> *%p) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: load_v8i16_a32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v8i16_a32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_a32(<8 x i16> *%p) {
@@ -142,7 +134,7 @@ define <8 x i16> @load_v8i16_a32(<8 x i16> *%p) {
 }
 
 ; CHECK-LABEL: store_v8i16_a1:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v8i16_a1 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v8i16_a1(<8 x i16> *%p, <8 x i16> %v) {
@@ -151,7 +143,7 @@ define void @store_v8i16_a1(<8 x i16> *%p, <8 x i16> %v) {
 }
 
 ; CHECK-LABEL: store_v8i16_a4:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v8i16_a4 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v8i16_a4(<8 x i16> *%p, <8 x i16> %v) {
@@ -162,7 +154,7 @@ define void @store_v8i16_a4(<8 x i16> *%p, <8 x i16> %v) {
 ; 16 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: store_v8i16_a16:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v8i16_a16 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v8i16_a16(<8 x i16> *%p, <8 x i16> %v) {
@@ -173,7 +165,7 @@ define void @store_v8i16_a16(<8 x i16> *%p, <8 x i16> %v) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: store_v8i16_a32:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v8i16_a32 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v8i16_a32(<8 x i16> *%p, <8 x i16> %v) {
@@ -186,8 +178,7 @@ define void @store_v8i16_a32(<8 x i16> *%p, <8 x i16> %v) {
 ; ==============================================================================
 
 ; CHECK-LABEL: load_v4i32_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v4i32_a1 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_a1(<4 x i32> *%p) {
@@ -196,8 +187,7 @@ define <4 x i32> @load_v4i32_a1(<4 x i32> *%p) {
 }
 
 ; CHECK-LABEL: load_v4i32_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v4i32_a4 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_a4(<4 x i32> *%p) {
@@ -208,8 +198,7 @@ define <4 x i32> @load_v4i32_a4(<4 x i32> *%p) {
 ; 4 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: load_v4i32_a16:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v4i32_a16 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_a16(<4 x i32> *%p) {
@@ -220,8 +209,7 @@ define <4 x i32> @load_v4i32_a16(<4 x i32> *%p) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: load_v4i32_a32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v4i32_a32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_a32(<4 x i32> *%p) {
@@ -230,7 +218,7 @@ define <4 x i32> @load_v4i32_a32(<4 x i32> *%p) {
 }
 
 ; CHECK-LABEL: store_v4i32_a1:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v4i32_a1 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v4i32_a1(<4 x i32> *%p, <4 x i32> %v) {
@@ -239,7 +227,7 @@ define void @store_v4i32_a1(<4 x i32> *%p, <4 x i32> %v) {
 }
 
 ; CHECK-LABEL: store_v4i32_a4:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v4i32_a4 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v4i32_a4(<4 x i32> *%p, <4 x i32> %v) {
@@ -250,7 +238,7 @@ define void @store_v4i32_a4(<4 x i32> *%p, <4 x i32> %v) {
 ; 16 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: store_v4i32_a16:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v4i32_a16 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v4i32_a16(<4 x i32> *%p, <4 x i32> %v) {
@@ -261,7 +249,7 @@ define void @store_v4i32_a16(<4 x i32> *%p, <4 x i32> %v) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: store_v4i32_a32:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v4i32_a32 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v4i32_a32(<4 x i32> *%p, <4 x i32> %v) {
@@ -274,8 +262,7 @@ define void @store_v4i32_a32(<4 x i32> *%p, <4 x i32> %v) {
 ; ==============================================================================
 
 ; CHECK-LABEL: load_v2i64_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v2i64_a1 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_a1(<2 x i64> *%p) {
@@ -284,8 +271,7 @@ define <2 x i64> @load_v2i64_a1(<2 x i64> *%p) {
 }
 
 ; CHECK-LABEL: load_v2i64_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v2i64_a4 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_a4(<2 x i64> *%p) {
@@ -296,8 +282,7 @@ define <2 x i64> @load_v2i64_a4(<2 x i64> *%p) {
 ; 2 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: load_v2i64_a16:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v2i64_a16 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_a16(<2 x i64> *%p) {
@@ -308,8 +293,7 @@ define <2 x i64> @load_v2i64_a16(<2 x i64> *%p) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: load_v2i64_a32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v2i64_a32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_a32(<2 x i64> *%p) {
@@ -318,7 +302,7 @@ define <2 x i64> @load_v2i64_a32(<2 x i64> *%p) {
 }
 
 ; CHECK-LABEL: store_v2i64_a1:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v2i64_a1 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v2i64_a1(<2 x i64> *%p, <2 x i64> %v) {
@@ -327,7 +311,7 @@ define void @store_v2i64_a1(<2 x i64> *%p, <2 x i64> %v) {
 }
 
 ; CHECK-LABEL: store_v2i64_a4:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v2i64_a4 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v2i64_a4(<2 x i64> *%p, <2 x i64> %v) {
@@ -338,7 +322,7 @@ define void @store_v2i64_a4(<2 x i64> *%p, <2 x i64> %v) {
 ; 16 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: store_v2i64_a16:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v2i64_a16 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v2i64_a16(<2 x i64> *%p, <2 x i64> %v) {
@@ -349,7 +333,7 @@ define void @store_v2i64_a16(<2 x i64> *%p, <2 x i64> %v) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: store_v2i64_a32:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v2i64_a32 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v2i64_a32(<2 x i64> *%p, <2 x i64> %v) {
@@ -362,8 +346,7 @@ define void @store_v2i64_a32(<2 x i64> *%p, <2 x i64> %v) {
 ; ==============================================================================
 
 ; CHECK-LABEL: load_v4f32_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v4f32_a1 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_a1(<4 x float> *%p) {
@@ -372,8 +355,7 @@ define <4 x float> @load_v4f32_a1(<4 x float> *%p) {
 }
 
 ; CHECK-LABEL: load_v4f32_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v4f32_a4 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_a4(<4 x float> *%p) {
@@ -384,8 +366,7 @@ define <4 x float> @load_v4f32_a4(<4 x float> *%p) {
 ; 4 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: load_v4f32_a16:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v4f32_a16 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_a16(<4 x float> *%p) {
@@ -396,8 +377,7 @@ define <4 x float> @load_v4f32_a16(<4 x float> *%p) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: load_v4f32_a32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v4f32_a32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_a32(<4 x float> *%p) {
@@ -406,7 +386,7 @@ define <4 x float> @load_v4f32_a32(<4 x float> *%p) {
 }
 
 ; CHECK-LABEL: store_v4f32_a1:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v4f32_a1 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v4f32_a1(<4 x float> *%p, <4 x float> %v) {
@@ -415,7 +395,7 @@ define void @store_v4f32_a1(<4 x float> *%p, <4 x float> %v) {
 }
 
 ; CHECK-LABEL: store_v4f32_a4:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v4f32_a4 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v4f32_a4(<4 x float> *%p, <4 x float> %v) {
@@ -426,7 +406,7 @@ define void @store_v4f32_a4(<4 x float> *%p, <4 x float> %v) {
 ; 16 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: store_v4f32_a16:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v4f32_a16 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v4f32_a16(<4 x float> *%p, <4 x float> %v) {
@@ -437,7 +417,7 @@ define void @store_v4f32_a16(<4 x float> *%p, <4 x float> %v) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: store_v4f32_a32:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v4f32_a32 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v4f32_a32(<4 x float> *%p, <4 x float> %v) {
@@ -450,8 +430,7 @@ define void @store_v4f32_a32(<4 x float> *%p, <4 x float> %v) {
 ; ==============================================================================
 
 ; CHECK-LABEL: load_v2f64_a1:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v2f64_a1 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_a1(<2 x double> *%p) {
@@ -460,8 +439,7 @@ define <2 x double> @load_v2f64_a1(<2 x double> *%p) {
 }
 
 ; CHECK-LABEL: load_v2f64_a4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v2f64_a4 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_a4(<2 x double> *%p) {
@@ -472,8 +450,7 @@ define <2 x double> @load_v2f64_a4(<2 x double> *%p) {
 ; 2 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: load_v2f64_a16:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v2f64_a16 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_a16(<2 x double> *%p) {
@@ -484,8 +461,7 @@ define <2 x double> @load_v2f64_a16(<2 x double> *%p) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: load_v2f64_a32:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: .functype load_v2f64_a32 (i32) -> (v128){{$}}
 ; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_a32(<2 x double> *%p) {
@@ -494,7 +470,7 @@ define <2 x double> @load_v2f64_a32(<2 x double> *%p) {
 }
 
 ; CHECK-LABEL: store_v2f64_a1:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v2f64_a1 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v2f64_a1(<2 x double> *%p, <2 x double> %v) {
@@ -503,7 +479,7 @@ define void @store_v2f64_a1(<2 x double> *%p, <2 x double> %v) {
 }
 
 ; CHECK-LABEL: store_v2f64_a4:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v2f64_a4 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v2f64_a4(<2 x double> *%p, <2 x double> %v) {
@@ -514,7 +490,7 @@ define void @store_v2f64_a4(<2 x double> *%p, <2 x double> %v) {
 ; 16 is the default alignment for v128 so no attribute is needed.
 
 ; CHECK-LABEL: store_v2f64_a16:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v2f64_a16 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v2f64_a16(<2 x double> *%p, <2 x double> %v) {
@@ -525,7 +501,7 @@ define void @store_v2f64_a16(<2 x double> *%p, <2 x double> %v) {
 ; 32 is greater than the default alignment so it is ignored.
 
 ; CHECK-LABEL: store_v2f64_a32:
-; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: .functype store_v2f64_a32 (i32, v128) -> (){{$}}
 ; CHECK-NEXT: v128.store 0($0), $1{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @store_v2f64_a32(<2 x double> *%p, <2 x double> %v) {
diff --git a/test/CodeGen/WebAssembly/simd-offset.ll b/test/CodeGen/WebAssembly/simd-offset.ll
index ed20225f02199fdb67c5f6da36dbf04dcacdcd8d..6bc165f28c1cd2423b015c37ed064a7880c8df63 100644
--- a/test/CodeGen/WebAssembly/simd-offset.ll
+++ b/test/CodeGen/WebAssembly/simd-offset.ll
@@ -12,8 +12,7 @@ target triple = "wasm32-unknown-unknown"
 ; ==============================================================================
 ; CHECK-LABEL: load_v16i8:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v16i8 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8(<16 x i8>* %p) {
@@ -23,8 +22,7 @@ define <16 x i8> @load_v16i8(<16 x i8>* %p) {
 
 ; CHECK-LABEL: load_v16i8_with_folded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v16i8_with_folded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_folded_offset(<16 x i8>* %p) {
@@ -37,8 +35,7 @@ define <16 x i8> @load_v16i8_with_folded_offset(<16 x i8>* %p) {
 
 ; CHECK-LABEL: load_v16i8_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v16i8_with_folded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_folded_gep_offset(<16 x i8>* %p) {
@@ -49,8 +46,7 @@ define <16 x i8> @load_v16i8_with_folded_gep_offset(<16 x i8>* %p) {
 
 ; CHECK-LABEL: load_v16i8_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v16i8_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -63,8 +59,7 @@ define <16 x i8> @load_v16i8_with_unfolded_gep_negative_offset(<16 x i8>* %p) {
 
 ; CHECK-LABEL: load_v16i8_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v16i8_with_unfolded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -79,8 +74,7 @@ define <16 x i8> @load_v16i8_with_unfolded_offset(<16 x i8>* %p) {
 
 ; CHECK-LABEL: load_v16i8_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v16i8_with_unfolded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -93,7 +87,7 @@ define <16 x i8> @load_v16i8_with_unfolded_gep_offset(<16 x i8>* %p) {
 
 ; CHECK-LABEL: load_v16i8_from_numeric_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v16i8_from_numeric_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -105,7 +99,7 @@ define <16 x i8> @load_v16i8_from_numeric_address() {
 
 ; CHECK-LABEL: load_v16i8_from_global_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v16i8_from_global_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v16i8($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -117,7 +111,7 @@ define <16 x i8> @load_v16i8_from_global_address() {
 
 ; CHECK-LABEL: store_v16i8:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v16i8 (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v16i8(<16 x i8> %v, <16 x i8>* %p) {
   store <16 x i8> %v , <16 x i8>* %p
@@ -126,7 +120,7 @@ define void @store_v16i8(<16 x i8> %v, <16 x i8>* %p) {
 
 ; CHECK-LABEL: store_v16i8_with_folded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v16i8_with_folded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v16i8_with_folded_offset(<16 x i8> %v, <16 x i8>* %p) {
   %q = ptrtoint <16 x i8>* %p to i32
@@ -138,7 +132,7 @@ define void @store_v16i8_with_folded_offset(<16 x i8> %v, <16 x i8>* %p) {
 
 ; CHECK-LABEL: store_v16i8_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v16i8_with_folded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v16i8_with_folded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 1
@@ -148,7 +142,7 @@ define void @store_v16i8_with_folded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
 
 ; CHECK-LABEL: store_v16i8_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v16i8_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -160,7 +154,7 @@ define void @store_v16i8_with_unfolded_gep_negative_offset(<16 x i8> %v, <16 x i
 
 ; CHECK-LABEL: store_v16i8_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v16i8_with_unfolded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -172,7 +166,7 @@ define void @store_v16i8_with_unfolded_offset(<16 x i8> %v, <16 x i8>* %p) {
 
 ; CHECK-LABEL: store_v16i8_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v16i8_with_unfolded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -184,7 +178,7 @@ define void @store_v16i8_with_unfolded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
 
 ; CHECK-LABEL: store_v16i8_to_numeric_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v16i8_to_numeric_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store 32($pop[[R]]), $0{{$}}
 define void @store_v16i8_to_numeric_address(<16 x i8> %v) {
@@ -195,7 +189,7 @@ define void @store_v16i8_to_numeric_address(<16 x i8> %v) {
 
 ; CHECK-LABEL: store_v16i8_to_global_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v16i8_to_global_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store gv_v16i8($pop[[R]]), $0{{$}}
 define void @store_v16i8_to_global_address(<16 x i8> %v) {
@@ -208,8 +202,7 @@ define void @store_v16i8_to_global_address(<16 x i8> %v) {
 ; ==============================================================================
 ; CHECK-LABEL: load_v8i16:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v8i16 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16(<8 x i16>* %p) {
@@ -219,8 +212,7 @@ define <8 x i16> @load_v8i16(<8 x i16>* %p) {
 
 ; CHECK-LABEL: load_v8i16_with_folded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v8i16_with_folded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_folded_offset(<8 x i16>* %p) {
@@ -233,8 +225,7 @@ define <8 x i16> @load_v8i16_with_folded_offset(<8 x i16>* %p) {
 
 ; CHECK-LABEL: load_v8i16_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v8i16_with_folded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_folded_gep_offset(<8 x i16>* %p) {
@@ -245,8 +236,7 @@ define <8 x i16> @load_v8i16_with_folded_gep_offset(<8 x i16>* %p) {
 
 ; CHECK-LABEL: load_v8i16_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -259,8 +249,7 @@ define <8 x i16> @load_v8i16_with_unfolded_gep_negative_offset(<8 x i16>* %p) {
 
 ; CHECK-LABEL: load_v8i16_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v8i16_with_unfolded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[L0:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -275,8 +264,7 @@ define <8 x i16> @load_v8i16_with_unfolded_offset(<8 x i16>* %p) {
 
 ; CHECK-LABEL: load_v8i16_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v8i16_with_unfolded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -289,7 +277,7 @@ define <8 x i16> @load_v8i16_with_unfolded_gep_offset(<8 x i16>* %p) {
 
 ; CHECK-LABEL: load_v8i16_from_numeric_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v8i16_from_numeric_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -301,7 +289,7 @@ define <8 x i16> @load_v8i16_from_numeric_address() {
 
 ; CHECK-LABEL: load_v8i16_from_global_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v8i16_from_global_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v8i16($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -313,7 +301,7 @@ define <8 x i16> @load_v8i16_from_global_address() {
 
 ; CHECK-LABEL: store_v8i16:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v8i16 (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
   store <8 x i16> %v , <8 x i16>* %p
@@ -322,7 +310,7 @@ define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
 
 ; CHECK-LABEL: store_v8i16_with_folded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v8i16_with_folded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
   %q = ptrtoint <8 x i16>* %p to i32
@@ -334,7 +322,7 @@ define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
 
 ; CHECK-LABEL: store_v8i16_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v8i16_with_folded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 1
@@ -344,7 +332,7 @@ define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
 
 ; CHECK-LABEL: store_v8i16_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -356,7 +344,7 @@ define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i1
 
 ; CHECK-LABEL: store_v8i16_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v8i16_with_unfolded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -368,7 +356,7 @@ define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
 
 ; CHECK-LABEL: store_v8i16_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v8i16_with_unfolded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -380,7 +368,7 @@ define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
 
 ; CHECK-LABEL: store_v8i16_to_numeric_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v8i16_to_numeric_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
@@ -391,7 +379,7 @@ define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
 
 ; CHECK-LABEL: store_v8i16_to_global_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v8i16_to_global_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store gv_v8i16($pop[[R]]), $0{{$}}
 define void @store_v8i16_to_global_address(<8 x i16> %v) {
@@ -404,8 +392,7 @@ define void @store_v8i16_to_global_address(<8 x i16> %v) {
 ; ==============================================================================
 ; CHECK-LABEL: load_v4i32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4i32 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32(<4 x i32>* %p) {
@@ -415,8 +402,7 @@ define <4 x i32> @load_v4i32(<4 x i32>* %p) {
 
 ; CHECK-LABEL: load_v4i32_with_folded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4i32_with_folded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_folded_offset(<4 x i32>* %p) {
@@ -429,8 +415,7 @@ define <4 x i32> @load_v4i32_with_folded_offset(<4 x i32>* %p) {
 
 ; CHECK-LABEL: load_v4i32_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4i32_with_folded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_folded_gep_offset(<4 x i32>* %p) {
@@ -441,8 +426,7 @@ define <4 x i32> @load_v4i32_with_folded_gep_offset(<4 x i32>* %p) {
 
 ; CHECK-LABEL: load_v4i32_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -455,8 +439,7 @@ define <4 x i32> @load_v4i32_with_unfolded_gep_negative_offset(<4 x i32>* %p) {
 
 ; CHECK-LABEL: load_v4i32_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4i32_with_unfolded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -471,8 +454,7 @@ define <4 x i32> @load_v4i32_with_unfolded_offset(<4 x i32>* %p) {
 
 ; CHECK-LABEL: load_v4i32_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4i32_with_unfolded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -485,7 +467,7 @@ define <4 x i32> @load_v4i32_with_unfolded_gep_offset(<4 x i32>* %p) {
 
 ; CHECK-LABEL: load_v4i32_from_numeric_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4i32_from_numeric_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -497,7 +479,7 @@ define <4 x i32> @load_v4i32_from_numeric_address() {
 
 ; CHECK-LABEL: load_v4i32_from_global_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4i32_from_global_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4i32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -509,7 +491,7 @@ define <4 x i32> @load_v4i32_from_global_address() {
 
 ; CHECK-LABEL: store_v4i32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4i32 (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
   store <4 x i32> %v , <4 x i32>* %p
@@ -518,7 +500,7 @@ define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
 
 ; CHECK-LABEL: store_v4i32_with_folded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4i32_with_folded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
   %q = ptrtoint <4 x i32>* %p to i32
@@ -530,7 +512,7 @@ define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
 
 ; CHECK-LABEL: store_v4i32_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4i32_with_folded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 1
@@ -540,7 +522,7 @@ define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
 
 ; CHECK-LABEL: store_v4i32_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -552,7 +534,7 @@ define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i3
 
 ; CHECK-LABEL: store_v4i32_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4i32_with_unfolded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -564,7 +546,7 @@ define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
 
 ; CHECK-LABEL: store_v4i32_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4i32_with_unfolded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -576,7 +558,7 @@ define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
 
 ; CHECK-LABEL: store_v4i32_to_numeric_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v4i32_to_numeric_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
@@ -587,7 +569,7 @@ define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
 
 ; CHECK-LABEL: store_v4i32_to_global_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v4i32_to_global_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store gv_v4i32($pop[[R]]), $0{{$}}
 define void @store_v4i32_to_global_address(<4 x i32> %v) {
@@ -601,8 +583,7 @@ define void @store_v4i32_to_global_address(<4 x i32> %v) {
 ; CHECK-LABEL: load_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2i64 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64(<2 x i64>* %p) {
@@ -613,8 +594,7 @@ define <2 x i64> @load_v2i64(<2 x i64>* %p) {
 ; CHECK-LABEL: load_v2i64_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2i64_with_folded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_folded_offset(<2 x i64>* %p) {
@@ -628,8 +608,7 @@ define <2 x i64> @load_v2i64_with_folded_offset(<2 x i64>* %p) {
 ; CHECK-LABEL: load_v2i64_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2i64_with_folded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_folded_gep_offset(<2 x i64>* %p) {
@@ -641,8 +620,7 @@ define <2 x i64> @load_v2i64_with_folded_gep_offset(<2 x i64>* %p) {
 ; CHECK-LABEL: load_v2i64_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -656,8 +634,7 @@ define <2 x i64> @load_v2i64_with_unfolded_gep_negative_offset(<2 x i64>* %p) {
 ; CHECK-LABEL: load_v2i64_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2i64_with_unfolded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -673,8 +650,7 @@ define <2 x i64> @load_v2i64_with_unfolded_offset(<2 x i64>* %p) {
 ; CHECK-LABEL: load_v2i64_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2i64_with_unfolded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -688,7 +664,7 @@ define <2 x i64> @load_v2i64_with_unfolded_gep_offset(<2 x i64>* %p) {
 ; CHECK-LABEL: load_v2i64_from_numeric_address:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2i64_from_numeric_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -701,7 +677,7 @@ define <2 x i64> @load_v2i64_from_numeric_address() {
 ; CHECK-LABEL: load_v2i64_from_global_address:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2i64_from_global_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2i64($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -714,7 +690,7 @@ define <2 x i64> @load_v2i64_from_global_address() {
 ; CHECK-LABEL: store_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2i64 (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v2i64(<2 x i64> %v, <2 x i64>* %p) {
   store <2 x i64> %v , <2 x i64>* %p
@@ -724,7 +700,7 @@ define void @store_v2i64(<2 x i64> %v, <2 x i64>* %p) {
 ; CHECK-LABEL: store_v2i64_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2i64_with_folded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2i64_with_folded_offset(<2 x i64> %v, <2 x i64>* %p) {
   %q = ptrtoint <2 x i64>* %p to i32
@@ -737,7 +713,7 @@ define void @store_v2i64_with_folded_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; CHECK-LABEL: store_v2i64_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2i64_with_folded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2i64_with_folded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 1
@@ -748,7 +724,7 @@ define void @store_v2i64_with_folded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; CHECK-LABEL: store_v2i64_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2i64_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -761,7 +737,7 @@ define void @store_v2i64_with_unfolded_gep_negative_offset(<2 x i64> %v, <2 x i6
 ; CHECK-LABEL: store_v2i64_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2i64_with_unfolded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -774,7 +750,7 @@ define void @store_v2i64_with_unfolded_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; CHECK-LABEL: store_v2i64_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2i64_with_unfolded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -787,7 +763,7 @@ define void @store_v2i64_with_unfolded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; CHECK-LABEL: store_v2i64_to_numeric_address:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v2i64_to_numeric_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v2i64_to_numeric_address(<2 x i64> %v) {
@@ -799,7 +775,7 @@ define void @store_v2i64_to_numeric_address(<2 x i64> %v) {
 ; CHECK-LABEL: store_v2i64_to_global_address:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v2i64_to_global_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store gv_v2i64($pop[[R]]), $0{{$}}
 define void @store_v2i64_to_global_address(<2 x i64> %v) {
@@ -812,8 +788,7 @@ define void @store_v2i64_to_global_address(<2 x i64> %v) {
 ; ==============================================================================
 ; CHECK-LABEL: load_v4f32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4f32 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32(<4 x float>* %p) {
@@ -823,8 +798,7 @@ define <4 x float> @load_v4f32(<4 x float>* %p) {
 
 ; CHECK-LABEL: load_v4f32_with_folded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4f32_with_folded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_folded_offset(<4 x float>* %p) {
@@ -837,8 +811,7 @@ define <4 x float> @load_v4f32_with_folded_offset(<4 x float>* %p) {
 
 ; CHECK-LABEL: load_v4f32_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4f32_with_folded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_folded_gep_offset(<4 x float>* %p) {
@@ -849,8 +822,7 @@ define <4 x float> @load_v4f32_with_folded_gep_offset(<4 x float>* %p) {
 
 ; CHECK-LABEL: load_v4f32_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4f32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -863,8 +835,7 @@ define <4 x float> @load_v4f32_with_unfolded_gep_negative_offset(<4 x float>* %p
 
 ; CHECK-LABEL: load_v4f32_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4f32_with_unfolded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -879,8 +850,7 @@ define <4 x float> @load_v4f32_with_unfolded_offset(<4 x float>* %p) {
 
 ; CHECK-LABEL: load_v4f32_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4f32_with_unfolded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -893,7 +863,7 @@ define <4 x float> @load_v4f32_with_unfolded_gep_offset(<4 x float>* %p) {
 
 ; CHECK-LABEL: load_v4f32_from_numeric_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4f32_from_numeric_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -905,7 +875,7 @@ define <4 x float> @load_v4f32_from_numeric_address() {
 
 ; CHECK-LABEL: load_v4f32_from_global_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v4f32_from_global_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4f32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -917,7 +887,7 @@ define <4 x float> @load_v4f32_from_global_address() {
 
 ; CHECK-LABEL: store_v4f32:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4f32 (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v4f32(<4 x float> %v, <4 x float>* %p) {
   store <4 x float> %v , <4 x float>* %p
@@ -926,7 +896,7 @@ define void @store_v4f32(<4 x float> %v, <4 x float>* %p) {
 
 ; CHECK-LABEL: store_v4f32_with_folded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4f32_with_folded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4f32_with_folded_offset(<4 x float> %v, <4 x float>* %p) {
   %q = ptrtoint <4 x float>* %p to i32
@@ -938,7 +908,7 @@ define void @store_v4f32_with_folded_offset(<4 x float> %v, <4 x float>* %p) {
 
 ; CHECK-LABEL: store_v4f32_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4f32_with_folded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4f32_with_folded_gep_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 1
@@ -948,7 +918,7 @@ define void @store_v4f32_with_folded_gep_offset(<4 x float> %v, <4 x float>* %p)
 
 ; CHECK-LABEL: store_v4f32_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4f32_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -960,7 +930,7 @@ define void @store_v4f32_with_unfolded_gep_negative_offset(<4 x float> %v, <4 x
 
 ; CHECK-LABEL: store_v4f32_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4f32_with_unfolded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -972,7 +942,7 @@ define void @store_v4f32_with_unfolded_offset(<4 x float> %v, <4 x float>* %p) {
 
 ; CHECK-LABEL: store_v4f32_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v4f32_with_unfolded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -984,7 +954,7 @@ define void @store_v4f32_with_unfolded_gep_offset(<4 x float> %v, <4 x float>* %
 
 ; CHECK-LABEL: store_v4f32_to_numeric_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v4f32_to_numeric_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v4f32_to_numeric_address(<4 x float> %v) {
@@ -995,7 +965,7 @@ define void @store_v4f32_to_numeric_address(<4 x float> %v) {
 
 ; CHECK-LABEL: store_v4f32_to_global_address:
 ; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v4f32_to_global_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store gv_v4f32($pop[[R]]), $0{{$}}
 define void @store_v4f32_to_global_address(<4 x float> %v) {
@@ -1009,8 +979,7 @@ define void @store_v4f32_to_global_address(<4 x float> %v) {
 ; CHECK-LABEL: load_v2f64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2f64 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64(<2 x double>* %p) {
@@ -1021,8 +990,7 @@ define <2 x double> @load_v2f64(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2f64_with_folded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
@@ -1036,8 +1004,7 @@ define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2f64_with_folded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_folded_gep_offset(<2 x double>* %p) {
@@ -1049,8 +1016,7 @@ define <2 x double> @load_v2f64_with_folded_gep_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2f64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -1064,8 +1030,7 @@ define <2 x double> @load_v2f64_with_unfolded_gep_negative_offset(<2 x double>*
 ; CHECK-LABEL: load_v2f64_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2f64_with_unfolded_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -1081,8 +1046,7 @@ define <2 x double> @load_v2f64_with_unfolded_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2f64_with_unfolded_gep_offset (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
@@ -1096,7 +1060,7 @@ define <2 x double> @load_v2f64_with_unfolded_gep_offset(<2 x double>* %p) {
 ; CHECK-LABEL: load_v2f64_from_numeric_address:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2f64_from_numeric_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1109,7 +1073,7 @@ define <2 x double> @load_v2f64_from_numeric_address() {
 ; CHECK-LABEL: load_v2f64_from_global_address:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype load_v2f64_from_global_address () -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2f64($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1122,7 +1086,7 @@ define <2 x double> @load_v2f64_from_global_address() {
 ; CHECK-LABEL: store_v2f64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2f64 (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v2f64(<2 x double> %v, <2 x double>* %p) {
   store <2 x double> %v , <2 x double>* %p
@@ -1132,7 +1096,7 @@ define void @store_v2f64(<2 x double> %v, <2 x double>* %p) {
 ; CHECK-LABEL: store_v2f64_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2f64_with_folded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2f64_with_folded_offset(<2 x double> %v, <2 x double>* %p) {
   %q = ptrtoint <2 x double>* %p to i32
@@ -1145,7 +1109,7 @@ define void @store_v2f64_with_folded_offset(<2 x double> %v, <2 x double>* %p) {
 ; CHECK-LABEL: store_v2f64_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2f64_with_folded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2f64_with_folded_gep_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 1
@@ -1156,7 +1120,7 @@ define void @store_v2f64_with_folded_gep_offset(<2 x double> %v, <2 x double>* %
 ; CHECK-LABEL: store_v2f64_with_unfolded_gep_negative_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2f64_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -1169,7 +1133,7 @@ define void @store_v2f64_with_unfolded_gep_negative_offset(<2 x double> %v, <2 x
 ; CHECK-LABEL: store_v2f64_with_unfolded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2f64_with_unfolded_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -1182,7 +1146,7 @@ define void @store_v2f64_with_unfolded_offset(<2 x double> %v, <2 x double>* %p)
 ; CHECK-LABEL: store_v2f64_with_unfolded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .functype store_v2f64_with_unfolded_gep_offset (v128, i32) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
@@ -1195,7 +1159,7 @@ define void @store_v2f64_with_unfolded_gep_offset(<2 x double> %v, <2 x double>*
 ; CHECK-LABEL: store_v2f64_to_numeric_address:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v2f64_to_numeric_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v2f64_to_numeric_address(<2 x double> %v) {
@@ -1207,7 +1171,7 @@ define void @store_v2f64_to_numeric_address(<2 x double> %v) {
 ; CHECK-LABEL: store_v2f64_to_global_address:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .functype store_v2f64_to_global_address (v128) -> (){{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
 ; SIMD128-NEXT: v128.store gv_v2f64($pop[[R]]), $0{{$}}
 define void @store_v2f64_to_global_address(<2 x double> %v) {
diff --git a/test/CodeGen/WebAssembly/simd-select.ll b/test/CodeGen/WebAssembly/simd-select.ll
new file mode 100644
index 0000000000000000000000000000000000000000..70a9149d49c4462508494c6a3c7861e0713bd023
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-select.ll
@@ -0,0 +1,433 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext | FileCheck %s
+
+; Test that vector selects of various varieties lower correctly to bitselects.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; ==============================================================================
+; 16 x i8
+; ==============================================================================
+; CHECK-LABEL: vselect_v16i8:
+; CHECK-NEXT: .functype vselect_v16i8 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 7{{$}}
+; CHECK-NEXT: i8x16.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 7{{$}}
+; CHECK-NEXT: i8x16.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @vselect_v16i8(<16 x i1> %c, <16 x i8> %x, <16 x i8> %y) {
+  %res = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: select_v16i8:
+; CHECK-NEXT: .functype select_v16i8 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @select_v16i8(i1 %c, <16 x i8> %x, <16 x i8> %y) {
+  %res = select i1 %c, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: select_cmp_v16i8:
+; CHECK-NEXT: .functype select_cmp_v16i8 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31
+; CHECK-NEXT: i32.shr_s $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i8x16.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @select_cmp_v16i8(i32 %i, <16 x i8> %x, <16 x i8> %y) {
+  %c = icmp slt i32 %i, 0
+  %res = select i1 %c, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: select_ne_v16i8:
+; CHECK-NEXT: .functype select_ne_v16i8 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @select_ne_v16i8(i32 %i, <16 x i8> %x, <16 x i8> %y) {
+  %c = icmp ne i32 %i, 0
+  %res = select i1 %c, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: select_eq_v16i8:
+; CHECK-NEXT: .functype select_eq_v16i8 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @select_eq_v16i8(i32 %i, <16 x i8> %x, <16 x i8> %y) {
+  %c = icmp eq i32 %i, 0
+  %res = select i1 %c, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %res
+}
+
+; ==============================================================================
+; 8 x i16
+; ==============================================================================
+; CHECK-LABEL: vselect_v8i16:
+; CHECK-NEXT: .functype vselect_v8i16 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 15{{$}}
+; CHECK-NEXT: i16x8.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 15{{$}}
+; CHECK-NEXT: i16x8.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @vselect_v8i16(<8 x i1> %c, <8 x i16> %x, <8 x i16> %y) {
+  %res = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: select_v8i16:
+; CHECK-NEXT: .functype select_v8i16 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @select_v8i16(i1 %c, <8 x i16> %x, <8 x i16> %y) {
+  %res = select i1 %c, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: select_cmp_v8i16:
+; CHECK-NEXT: .functype select_cmp_v8i16 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32.shr_s $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i16x8.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @select_cmp_v8i16(i32 %i, <8 x i16> %x, <8 x i16> %y) {
+  %c = icmp slt i32 %i, 0
+  %res = select i1 %c, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: select_ne_v8i16:
+; CHECK-NEXT: .functype select_ne_v8i16 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @select_ne_v8i16(i32 %i, <8 x i16> %x, <8 x i16> %y) {
+  %c = icmp ne i32 %i, 0
+  %res = select i1 %c, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: select_eq_v8i16:
+; CHECK-NEXT: .functype select_eq_v8i16 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @select_eq_v8i16(i32 %i, <8 x i16> %x, <8 x i16> %y) {
+  %c = icmp eq i32 %i, 0
+  %res = select i1 %c, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %res
+}
+
+; ==============================================================================
+; 4 x i32
+; ==============================================================================
+; CHECK-LABEL: vselect_v4i32:
+; CHECK-NEXT: .functype vselect_v4i32 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @vselect_v4i32(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y) {
+  %res = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %res
+}
+
+
+; CHECK-LABEL: select_v4i32:
+; CHECK-NEXT: .functype select_v4i32 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @select_v4i32(i1 %c, <4 x i32> %x, <4 x i32> %y) {
+  %res = select i1 %c, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: select_cmp_v4i32:
+; CHECK-NEXT: .functype select_cmp_v4i32 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32.shr_s $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32x4.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @select_cmp_v4i32(i32 %i, <4 x i32> %x, <4 x i32> %y) {
+  %c = icmp slt i32 %i, 0
+  %res = select i1 %c, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: select_ne_v4i32:
+; CHECK-NEXT: .functype select_ne_v4i32 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @select_ne_v4i32(i32 %i, <4 x i32> %x, <4 x i32> %y) {
+  %c = icmp ne i32 %i, 0
+  %res = select i1 %c, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: select_eq_v4i32:
+; CHECK-NEXT: .functype select_eq_v4i32 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @select_eq_v4i32(i32 %i, <4 x i32> %x, <4 x i32> %y) {
+  %c = icmp eq i32 %i, 0
+  %res = select i1 %c, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %res
+}
+
+; ==============================================================================
+; 2 x i64
+; ==============================================================================
+; CHECK-LABEL: vselect_v2i64:
+; CHECK-NEXT: .functype vselect_v2i64 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @vselect_v2i64(<2 x i1> %c, <2 x i64> %x, <2 x i64> %y) {
+  %res = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %res
+}
+
+; CHECK-LABEL: select_v2i64:
+; CHECK-NEXT: .functype select_v2i64 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @select_v2i64(i1 %c, <2 x i64> %x, <2 x i64> %y) {
+  %res = select i1 %c, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %res
+}
+
+; CHECK-LABEL: select_cmp_v2i64:
+; CHECK-NEXT: .functype select_cmp_v2i64 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.lt_s $push[[L3:[0-9]+]]=, $0, $pop[[L2]]{{$}}
+; CHECK-NEXT: i64.select $push[[L4:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $pop[[L3]]{{$}}
+; CHECK-NEXT: i64x2.splat $push[[L5:[0-9]+]]=, $pop[[L4]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L5]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @select_cmp_v2i64(i32 %i, <2 x i64> %x, <2 x i64> %y) {
+  %c = icmp slt i32 %i, 0
+  %res = select i1 %c, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %res
+}
+
+; CHECK-LABEL: select_ne_v2i64:
+; CHECK-NEXT: .functype select_ne_v2i64 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @select_ne_v2i64(i32 %i, <2 x i64> %x, <2 x i64> %y) {
+  %c = icmp ne i32 %i, 0
+  %res = select i1 %c, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %res
+}
+
+; CHECK-LABEL: select_eq_v2i64:
+; CHECK-NEXT: .functype select_eq_v2i64 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @select_eq_v2i64(i32 %i, <2 x i64> %x, <2 x i64> %y) {
+  %c = icmp eq i32 %i, 0
+  %res = select i1 %c, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %res
+}
+
+; ==============================================================================
+; 4 x float
+; ==============================================================================
+; CHECK-LABEL: vselect_v4f32:
+; CHECK-NEXT: .functype vselect_v4f32 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @vselect_v4f32(<4 x i1> %c, <4 x float> %x, <4 x float> %y) {
+  %res = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: select_v4f32:
+; CHECK-NEXT: .functype select_v4f32 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @select_v4f32(i1 %c, <4 x float> %x, <4 x float> %y) {
+  %res = select i1 %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: select_cmp_v4f32:
+; CHECK-NEXT: .functype select_cmp_v4f32 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32.shr_s $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32x4.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @select_cmp_v4f32(i32 %i, <4 x float> %x, <4 x float> %y) {
+  %c = icmp slt i32 %i, 0
+  %res = select i1 %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: select_ne_v4f32:
+; CHECK-NEXT: .functype select_ne_v4f32 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @select_ne_v4f32(i32 %i, <4 x float> %x, <4 x float> %y) {
+  %c = icmp ne i32 %i, 0
+  %res = select i1 %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: select_eq_v4f32:
+; CHECK-NEXT: .functype select_eq_v4f32 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @select_eq_v4f32(i32 %i, <4 x float> %x, <4 x float> %y) {
+  %c = icmp eq i32 %i, 0
+  %res = select i1 %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %res
+}
+
+; ==============================================================================
+; 2 x double
+; ==============================================================================
+; CHECK-LABEL: vselect_v2f64:
+; CHECK-NEXT: .functype vselect_v2f64 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @vselect_v2f64(<2 x i1> %c, <2 x double> %x, <2 x double> %y) {
+  %res = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: select_v2f64:
+; CHECK-NEXT: .functype select_v2f64 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @select_v2f64(i1 %c, <2 x double> %x, <2 x double> %y) {
+  %res = select i1 %c, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: select_cmp_v2f64:
+; CHECK-NEXT: .functype select_cmp_v2f64 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.lt_s $push[[L3:[0-9]+]]=, $0, $pop[[L2]]{{$}}
+; CHECK-NEXT: i64.select $push[[L4:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $pop[[L3]]{{$}}
+; CHECK-NEXT: i64x2.splat $push[[L5:[0-9]+]]=, $pop[[L4]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L5]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @select_cmp_v2f64(i32 %i, <2 x double> %x, <2 x double> %y) {
+  %c = icmp slt i32 %i, 0
+  %res = select i1 %c, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: select_ne_v2f64:
+; CHECK-NEXT: .functype select_ne_v2f64 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @select_ne_v2f64(i32 %i, <2 x double> %x, <2 x double> %y) {
+  %c = icmp ne i32 %i, 0
+  %res = select i1 %c, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: select_eq_v2f64:
+; CHECK-NEXT: .functype select_eq_v2f64 (i32, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, -1{{$}}
+; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}}
+; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @select_eq_v2f64(i32 %i, <2 x double> %x, <2 x double> %y) {
+  %c = icmp eq i32 %i, 0
+  %res = select i1 %c, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %res
+}
diff --git a/test/CodeGen/WebAssembly/simd-sext-inreg.ll b/test/CodeGen/WebAssembly/simd-sext-inreg.ll
index 1001d0db16822d82a4011f292e00c113c4324fb3..5fecb4d0bac44e4034de13acaa959972d298697c 100644
--- a/test/CodeGen/WebAssembly/simd-sext-inreg.ll
+++ b/test/CodeGen/WebAssembly/simd-sext-inreg.ll
@@ -9,8 +9,7 @@ target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: sext_inreg_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sext_inreg_v16i8 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 7{{$}}
 ; SIMD128-NEXT: i8x16.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 7{{$}}
@@ -23,8 +22,7 @@ define <16 x i8> @sext_inreg_v16i8(<16 x i1> %x) {
 
 ; CHECK-LABEL: sext_inreg_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sext_inreg_v8i16 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 15{{$}}
 ; SIMD128-NEXT: i16x8.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 15{{$}}
@@ -37,8 +35,7 @@ define <8 x i16> @sext_inreg_v8i16(<8 x i1> %x) {
 
 ; CHECK-LABEL: sext_inreg_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sext_inreg_v4i32 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 31{{$}}
 ; SIMD128-NEXT: i32x4.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 31{{$}}
@@ -52,8 +49,7 @@ define <4 x i32> @sext_inreg_v4i32(<4 x i1> %x) {
 ; CHECK-LABEL: sext_inreg_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SDIM128-VM-NOT: i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype sext_inreg_v2i64 (v128) -> (v128){{$}}
 ; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 63{{$}}
 ; SIMD128-NEXT: i64x2.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
 ; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 63{{$}}
diff --git a/test/CodeGen/WebAssembly/simd-vselect.ll b/test/CodeGen/WebAssembly/simd-vselect.ll
deleted file mode 100644
index fd020511cb19b210193ae578a3335997d692179c..0000000000000000000000000000000000000000
--- a/test/CodeGen/WebAssembly/simd-vselect.ll
+++ /dev/null
@@ -1,90 +0,0 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext | FileCheck %s
-
-; Test that lanewise vector selects lower correctly to bitselects
-
-target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
-
-; CHECK-LABEL: vselect_v16i8:
-; CHECK-NEXT: .param v128, v128, v128{{$}}
-; CHECK-NEXT: .result v128{{$}}
-; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 7{{$}}
-; CHECK-NEXT: i8x16.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 7{{$}}
-; CHECK-NEXT: i8x16.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
-; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-define <16 x i8> @vselect_v16i8(<16 x i1> %c, <16 x i8> %x, <16 x i8> %y) {
-  %res = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
-  ret <16 x i8> %res
-}
-
-; CHECK-LABEL: vselect_v8i16:
-; CHECK-NEXT: .param v128, v128, v128{{$}}
-; CHECK-NEXT: .result v128{{$}}
-; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 15{{$}}
-; CHECK-NEXT: i16x8.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 15{{$}}
-; CHECK-NEXT: i16x8.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
-; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-define <8 x i16> @vselect_v8i16(<8 x i1> %c, <8 x i16> %x, <8 x i16> %y) {
-  %res = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
-  ret <8 x i16> %res
-}
-
-; CHECK-LABEL: vselect_v4i32:
-; CHECK-NEXT: .param v128, v128, v128{{$}}
-; CHECK-NEXT: .result v128{{$}}
-; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
-; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}}
-; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
-; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-define <4 x i32> @vselect_v4i32(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y) {
-  %res = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
-  ret <4 x i32> %res
-}
-
-; CHECK-LABEL: vselect_v2i64:
-; CHECK-NEXT: .param v128, v128, v128{{$}}
-; CHECK-NEXT: .result v128{{$}}
-; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}}
-; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}}
-; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
-; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-define <2 x i64> @vselect_v2i64(<2 x i1> %c, <2 x i64> %x, <2 x i64> %y) {
-  %res = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y
-  ret <2 x i64> %res
-}
-
-; CHECK-LABEL: vselect_v4f32:
-; CHECK-NEXT: .param v128, v128, v128{{$}}
-; CHECK-NEXT: .result v128{{$}}
-; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
-; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}}
-; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
-; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-define <4 x float> @vselect_v4f32(<4 x i1> %c, <4 x float> %x, <4 x float> %y) {
-  %res = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
-  ret <4 x float> %res
-}
-
-; CHECK-LABEL: vselect_v2f64:
-; CHECK-NEXT: .param v128, v128, v128{{$}}
-; CHECK-NEXT: .result v128{{$}}
-; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}}
-; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}}
-; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
-; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-define <2 x double> @vselect_v2f64(<2 x i1> %c, <2 x double> %x, <2 x double> %y) {
-  %res = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %res
-}
diff --git a/test/CodeGen/WebAssembly/simd.ll b/test/CodeGen/WebAssembly/simd.ll
index 55a325b939c643905038bff6fb0312c794808320..29b51f16694b00d0108c13a3f66cf75e5dadc8b8 100644
--- a/test/CodeGen/WebAssembly/simd.ll
+++ b/test/CodeGen/WebAssembly/simd.ll
@@ -12,7 +12,7 @@ target triple = "wasm32-unknown-unknown"
 ; ==============================================================================
 ; CHECK-LABEL: const_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype const_v16i8 () -> (v128){{$}}
 ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=,
 ; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -23,8 +23,7 @@ define <16 x i8> @const_v16i8() {
 
 ; CHECK-LABEL: splat_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype splat_v16i8 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.splat $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @splat_v16i8(i8 %x) {
@@ -44,8 +43,7 @@ define <16 x i8> @const_splat_v16i8() {
 
 ; CHECK-LABEL: extract_v16i8_s:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_v16i8_s (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_s $push[[R:[0-9]+]]=, $0, 13{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v16i8_s(<16 x i8> %v) {
@@ -56,8 +54,7 @@ define i32 @extract_v16i8_s(<16 x i8> %v) {
 
 ; CHECK-LABEL: extract_var_v16i8_s:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_var_v16i8_s (v128, i32) -> (i32){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
@@ -76,8 +73,7 @@ define i32 @extract_var_v16i8_s(<16 x i8> %v, i32 %i) {
 
 ; CHECK-LABEL: extract_undef_v16i8_s:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_undef_v16i8_s (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_s $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v16i8_s(<16 x i8> %v) {
@@ -88,8 +84,7 @@ define i32 @extract_undef_v16i8_s(<16 x i8> %v) {
 
 ; CHECK-LABEL: extract_v16i8_u:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_v16i8_u (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 13{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v16i8_u(<16 x i8> %v) {
@@ -100,8 +95,7 @@ define i32 @extract_v16i8_u(<16 x i8> %v) {
 
 ; CHECK-LABEL: extract_var_v16i8_u:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_var_v16i8_u (v128, i32) -> (i32){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -120,8 +114,7 @@ define i32 @extract_var_v16i8_u(<16 x i8> %v, i32 %i) {
 
 ; CHECK-LABEL: extract_undef_v16i8_u:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_undef_v16i8_u (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v16i8_u(<16 x i8> %v) {
@@ -132,8 +125,7 @@ define i32 @extract_undef_v16i8_u(<16 x i8> %v) {
 
 ; CHECK-LABEL: extract_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_v16i8 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 13{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i8 @extract_v16i8(<16 x i8> %v) {
@@ -143,8 +135,7 @@ define i8 @extract_v16i8(<16 x i8> %v) {
 
 ; CHECK-LABEL: extract_var_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_var_v16i8 (v128, i32) -> (i32){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -162,8 +153,7 @@ define i8 @extract_var_v16i8(<16 x i8> %v, i32 %i) {
 
 ; CHECK-LABEL: extract_undef_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_undef_v16i8 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i8 @extract_undef_v16i8(<16 x i8> %v) {
@@ -173,8 +163,7 @@ define i8 @extract_undef_v16i8(<16 x i8> %v) {
 
 ; CHECK-LABEL: replace_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_v16i8 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $0, 11, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
@@ -184,8 +173,7 @@ define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
 
 ; CHECK-LABEL: replace_var_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_var_v16i8 (v128, i32, i32) -> (v128){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -204,8 +192,7 @@ define <16 x i8> @replace_var_v16i8(<16 x i8> %v, i32 %i, i8 %x) {
 
 ; CHECK-LABEL: replace_undef_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_undef_v16i8 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @replace_undef_v16i8(<16 x i8> %v, i8 %x) {
@@ -215,8 +202,7 @@ define <16 x i8> @replace_undef_v16i8(<16 x i8> %v, i8 %x) {
 
 ; CHECK-LABEL: shuffle_v16i8:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
 ; SIMD128-SAME: 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -229,8 +215,7 @@ define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: shuffle_undef_v16i8:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
 ; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -245,8 +230,7 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
 
 ; CHECK-LABEL: build_v16i8:
 ; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype build_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i8x16.splat $push[[L0:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: i8x16.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
 ; SIMD128-NEXT: i8x16.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $2{{$}}
@@ -292,7 +276,7 @@ define <16 x i8> @build_v16i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3,
 ; ==============================================================================
 ; CHECK-LABEL: const_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype const_v8i16 () -> (v128){{$}}
 ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 256, 770, 1284, 1798, 2312, 2826, 3340, 3854{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @const_v8i16() {
@@ -302,8 +286,7 @@ define <8 x i16> @const_v8i16() {
 
 ; CHECK-LABEL: splat_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype splat_v8i16 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.splat $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @splat_v8i16(i16 %x) {
@@ -321,8 +304,7 @@ define <8 x i16> @const_splat_v8i16() {
 
 ; CHECK-LABEL: extract_v8i16_s:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_v8i16_s (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_s $push[[R:[0-9]+]]=, $0, 5{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v8i16_s(<8 x i16> %v) {
@@ -333,8 +315,7 @@ define i32 @extract_v8i16_s(<8 x i16> %v) {
 
 ; CHECK-LABEL: extract_var_v8i16_s:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_var_v8i16_s (v128, i32) -> (i32){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -355,8 +336,7 @@ define i32 @extract_var_v8i16_s(<8 x i16> %v, i32 %i) {
 
 ; CHECK-LABEL: extract_undef_v8i16_s:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_undef_v8i16_s (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_s $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v8i16_s(<8 x i16> %v) {
@@ -367,8 +347,7 @@ define i32 @extract_undef_v8i16_s(<8 x i16> %v) {
 
 ; CHECK-LABEL: extract_v8i16_u:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_v8i16_u (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 5{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v8i16_u(<8 x i16> %v) {
@@ -379,8 +358,7 @@ define i32 @extract_v8i16_u(<8 x i16> %v) {
 
 ; CHECK-LABEL: extract_var_v8i16_u:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_var_v8i16_u (v128, i32) -> (i32){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -401,8 +379,7 @@ define i32 @extract_var_v8i16_u(<8 x i16> %v, i32 %i) {
 
 ; CHECK-LABEL: extract_undef_v8i16_u:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_undef_v8i16_u (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v8i16_u(<8 x i16> %v) {
@@ -413,8 +390,7 @@ define i32 @extract_undef_v8i16_u(<8 x i16> %v) {
 
 ; CHECK-LABEL: extract_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_v8i16 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 5{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i16 @extract_v8i16(<8 x i16> %v) {
@@ -424,8 +400,7 @@ define i16 @extract_v8i16(<8 x i16> %v) {
 
 ; CHECK-LABEL: extract_var_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_var_v8i16 (v128, i32) -> (i32){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -445,8 +420,7 @@ define i16 @extract_var_v8i16(<8 x i16> %v, i32 %i) {
 
 ; CHECK-LABEL: extract_undef_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_undef_v8i16 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i16 @extract_undef_v8i16(<8 x i16> %v) {
@@ -456,8 +430,7 @@ define i16 @extract_undef_v8i16(<8 x i16> %v) {
 
 ; CHECK-LABEL: replace_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_v8i16 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $0, 7, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
@@ -467,8 +440,7 @@ define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
 
 ; CHECK-LABEL: replace_var_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_var_v8i16 (v128, i32, i32) -> (v128){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -489,8 +461,7 @@ define <8 x i16> @replace_var_v8i16(<8 x i16> %v, i32 %i, i16 %x) {
 
 ; CHECK-LABEL: replace_undef_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_undef_v8i16 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @replace_undef_v8i16(<8 x i16> %v, i16 %x) {
@@ -500,8 +471,7 @@ define <8 x i16> @replace_undef_v8i16(<8 x i16> %v, i16 %x) {
 
 ; CHECK-LABEL: shuffle_v8i16:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
 ; SIMD128-SAME: 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -513,8 +483,7 @@ define <8 x i16> @shuffle_v8i16(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: shuffle_undef_v8i16:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_undef_v8i16 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
 ; SIMD128-SAME: 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -527,8 +496,7 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: build_v8i16:
 ; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype build_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i16x8.splat $push[[L0:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
 ; SIMD128-NEXT: i16x8.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $2{{$}}
@@ -556,7 +524,7 @@ define <8 x i16> @build_v8i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3,
 ; ==============================================================================
 ; CHECK-LABEL: const_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype const_v4i32 () -> (v128){{$}}
 ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 50462976, 117835012, 185207048, 252579084{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @const_v4i32() {
@@ -565,8 +533,7 @@ define <4 x i32> @const_v4i32() {
 
 ; CHECK-LABEL: splat_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype splat_v4i32 (i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.splat $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @splat_v4i32(i32 %x) {
@@ -584,8 +551,7 @@ define <4 x i32> @const_splat_v4i32() {
 
 ; CHECK-LABEL: extract_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_v4i32 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i32x4.extract_lane $push[[R:[0-9]+]]=, $0, 3{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v4i32(<4 x i32> %v) {
@@ -595,8 +561,7 @@ define i32 @extract_v4i32(<4 x i32> %v) {
 
 ; CHECK-LABEL: extract_var_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_var_v4i32 (v128, i32) -> (i32){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -616,8 +581,7 @@ define i32 @extract_var_v4i32(<4 x i32> %v, i32 %i) {
 
 ; CHECK-LABEL: extract_undef_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: .functype extract_undef_v4i32 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i32x4.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v4i32(<4 x i32> %v) {
@@ -627,8 +591,7 @@ define i32 @extract_undef_v4i32(<4 x i32> %v) {
 
 ; CHECK-LABEL: replace_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_v4i32 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $0, 2, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
@@ -638,8 +601,7 @@ define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
 
 ; CHECK-LABEL: replace_var_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, i32, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_var_v4i32 (v128, i32, i32) -> (v128){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -660,8 +622,7 @@ define <4 x i32> @replace_var_v4i32(<4 x i32> %v, i32 %i, i32 %x) {
 
 ; CHECK-LABEL: replace_undef_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_undef_v4i32 (v128, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @replace_undef_v4i32(<4 x i32> %v, i32 %x) {
@@ -671,8 +632,7 @@ define <4 x i32> @replace_undef_v4i32(<4 x i32> %v, i32 %x) {
 
 ; CHECK-LABEL: shuffle_v4i32:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
 ; SIMD128-SAME: 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -684,8 +644,7 @@ define <4 x i32> @shuffle_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: shuffle_undef_v4i32:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_undef_v4i32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
 ; SIMD128-SAME: 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -697,8 +656,7 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 ; CHECK-LABEL: build_v4i32:
 ; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .param i32, i32, i32, i32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype build_v4i32 (i32, i32, i32, i32) -> (v128){{$}}
 ; SIMD128-NEXT: i32x4.splat $push[[L0:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: i32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
 ; SIMD128-NEXT: i32x4.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $2{{$}}
@@ -718,7 +676,7 @@ define <4 x i32> @build_v4i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
 ; CHECK-LABEL: const_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype const_v2i64 () -> (v128){{$}}
 ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 506097522914230528, 1084818905618843912{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @const_v2i64() {
@@ -728,8 +686,7 @@ define <2 x i64> @const_v2i64() {
 ; CHECK-LABEL: splat_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param i64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype splat_v2i64 (i64) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.splat $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @splat_v2i64(i64 %x) {
@@ -747,8 +704,7 @@ define <2 x i64> @const_splat_v2i64() {
 ; CHECK-LABEL: extract_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i64{{$}}
+; SIMD128-NEXT: .functype extract_v2i64 (v128) -> (i64){{$}}
 ; SIMD128-NEXT: i64x2.extract_lane $push[[R:[0-9]+]]=, $0, 1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i64 @extract_v2i64(<2 x i64> %v) {
@@ -758,8 +714,7 @@ define i64 @extract_v2i64(<2 x i64> %v) {
 
 ; CHECK-LABEL: extract_var_v2i64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result i64{{$}}
+; SIMD128-NEXT: .functype extract_var_v2i64 (v128, i32) -> (i64){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -780,8 +735,7 @@ define i64 @extract_var_v2i64(<2 x i64> %v, i32 %i) {
 ; CHECK-LABEL: extract_undef_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result i64{{$}}
+; SIMD128-NEXT: .functype extract_undef_v2i64 (v128) -> (i64){{$}}
 ; SIMD128-NEXT: i64x2.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i64 @extract_undef_v2i64(<2 x i64> %v) {
@@ -792,8 +746,7 @@ define i64 @extract_undef_v2i64(<2 x i64> %v) {
 ; CHECK-LABEL: replace_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param v128, i64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_v2i64 (v128, i64) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @replace_v2i64(<2 x i64> %v, i64 %x) {
@@ -804,8 +757,7 @@ define <2 x i64> @replace_v2i64(<2 x i64> %v, i64 %x) {
 ; CHECK-LABEL: replace_var_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param v128, i32, i64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_var_v2i64 (v128, i32, i64) -> (v128){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -827,8 +779,7 @@ define <2 x i64> @replace_var_v2i64(<2 x i64> %v, i32 %i, i64 %x) {
 ; CHECK-LABEL: replace_undef_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param v128, i64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_undef_v2i64 (v128, i64) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @replace_undef_v2i64(<2 x i64> %v, i64 %x) {
@@ -838,8 +789,7 @@ define <2 x i64> @replace_undef_v2i64(<2 x i64> %v, i64 %x) {
 
 ; CHECK-LABEL: shuffle_v2i64:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
 ; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -850,8 +800,7 @@ define <2 x i64> @shuffle_v2i64(<2 x i64> %x, <2 x i64> %y) {
 
 ; CHECK-LABEL: shuffle_undef_v2i64:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_undef_v2i64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
 ; SIMD128-SAME: 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -864,8 +813,7 @@ define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: build_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
-; SIMD128-NEXT: .param i64, i64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype build_v2i64 (i64, i64) -> (v128){{$}}
 ; SIMD128-NEXT: i64x2.splat $push[[L0:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -880,7 +828,7 @@ define <2 x i64> @build_v2i64(i64 %x0, i64 %x1) {
 ; ==============================================================================
 ; CHECK-LABEL: const_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype const_v4f32 () -> (v128){{$}}
 ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=,
 ; SIMD128-SAME: 0x1.0402p-121, 0x1.0c0a08p-113, 0x1.14121p-105, 0x1.1c1a18p-97{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -891,8 +839,7 @@ define <4 x float> @const_v4f32() {
 
 ; CHECK-LABEL: splat_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param f32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype splat_v4f32 (f32) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @splat_v4f32(float %x) {
@@ -910,8 +857,7 @@ define <4 x float> @const_splat_v4f32() {
 
 ; CHECK-LABEL: extract_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result f32{{$}}
+; SIMD128-NEXT: .functype extract_v4f32 (v128) -> (f32){{$}}
 ; SIMD128-NEXT: f32x4.extract_lane $push[[R:[0-9]+]]=, $0, 3{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define float @extract_v4f32(<4 x float> %v) {
@@ -921,8 +867,7 @@ define float @extract_v4f32(<4 x float> %v) {
 
 ; CHECK-LABEL: extract_var_v4f32:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result f32{{$}}
+; SIMD128-NEXT: .functype extract_var_v4f32 (v128, i32) -> (f32){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -942,8 +887,7 @@ define float @extract_var_v4f32(<4 x float> %v, i32 %i) {
 
 ; CHECK-LABEL: extract_undef_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result f32{{$}}
+; SIMD128-NEXT: .functype extract_undef_v4f32 (v128) -> (f32){{$}}
 ; SIMD128-NEXT: f32x4.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define float @extract_undef_v4f32(<4 x float> %v) {
@@ -953,8 +897,7 @@ define float @extract_undef_v4f32(<4 x float> %v) {
 
 ; CHECK-LABEL: replace_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, f32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_v4f32 (v128, f32) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.replace_lane $push[[R:[0-9]+]]=, $0, 2, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
@@ -964,8 +907,7 @@ define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
 
 ; CHECK-LABEL: replace_var_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, i32, f32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_var_v4f32 (v128, i32, f32) -> (v128){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -986,8 +928,7 @@ define <4 x float> @replace_var_v4f32(<4 x float> %v, i32 %i, float %x) {
 
 ; CHECK-LABEL: replace_undef_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param v128, f32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_undef_v4f32 (v128, f32) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @replace_undef_v4f32(<4 x float> %v, float %x) {
@@ -997,8 +938,7 @@ define <4 x float> @replace_undef_v4f32(<4 x float> %v, float %x) {
 
 ; CHECK-LABEL: shuffle_v4f32:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
 ; SIMD128-SAME: 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1010,8 +950,7 @@ define <4 x float> @shuffle_v4f32(<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: shuffle_undef_v4f32:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_undef_v4f32 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
 ; SIMD128-SAME: 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1023,8 +962,7 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
 
 ; CHECK-LABEL: build_v4f32:
 ; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .param f32, f32, f32, f32{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype build_v4f32 (f32, f32, f32, f32) -> (v128){{$}}
 ; SIMD128-NEXT: f32x4.splat $push[[L0:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: f32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
 ; SIMD128-NEXT: f32x4.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $2{{$}}
@@ -1043,7 +981,7 @@ define <4 x float> @build_v4f32(float %x0, float %x1, float %x2, float %x3) {
 ; ==============================================================================
 ; CHECK-LABEL: const_v2f64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype const_v2f64 () -> (v128){{$}}
 ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.60504030201p-911, 0x1.e0d0c0b0a0908p-783{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @const_v2f64() {
@@ -1053,8 +991,7 @@ define <2 x double> @const_v2f64() {
 ; CHECK-LABEL: splat_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param f64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype splat_v2f64 (f64) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @splat_v2f64(double %x) {
@@ -1072,8 +1009,7 @@ define <2 x double> @const_splat_v2f64() {
 ; CHECK-LABEL: extract_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result f64{{$}}
+; SIMD128-NEXT: .functype extract_v2f64 (v128) -> (f64){{$}}
 ; SIMD128-NEXT: f64x2.extract_lane $push[[R:[0-9]+]]=, $0, 1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define double @extract_v2f64(<2 x double> %v) {
@@ -1083,8 +1019,7 @@ define double @extract_v2f64(<2 x double> %v) {
 
 ; CHECK-LABEL: extract_var_v2f64:
 ; NO-SIMD128-NOT: i62x2
-; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: .result f64{{$}}
+; SIMD128-NEXT: .functype extract_var_v2f64 (v128, i32) -> (f64){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -1105,8 +1040,7 @@ define double @extract_var_v2f64(<2 x double> %v, i32 %i) {
 ; CHECK-LABEL: extract_undef_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128{{$}}
-; SIMD128-NEXT: .result f64{{$}}
+; SIMD128-NEXT: .functype extract_undef_v2f64 (v128) -> (f64){{$}}
 ; SIMD128-NEXT: f64x2.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define double @extract_undef_v2f64(<2 x double> %v) {
@@ -1117,8 +1051,7 @@ define double @extract_undef_v2f64(<2 x double> %v) {
 ; CHECK-LABEL: replace_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, f64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_v2f64 (v128, f64) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @replace_v2f64(<2 x double> %v, double %x) {
@@ -1129,8 +1062,7 @@ define <2 x double> @replace_v2f64(<2 x double> %v, double %x) {
 ; CHECK-LABEL: replace_var_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, i32, f64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_var_v2f64 (v128, i32, f64) -> (v128){{$}}
 ; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
 ; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
@@ -1152,8 +1084,7 @@ define <2 x double> @replace_var_v2f64(<2 x double> %v, i32 %i, double %x) {
 ; CHECK-LABEL: replace_undef_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param v128, f64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype replace_undef_v2f64 (v128, f64) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @replace_undef_v2f64(<2 x double> %v, double %x) {
@@ -1163,8 +1094,7 @@ define <2 x double> @replace_undef_v2f64(<2 x double> %v, double %x) {
 
 ; CHECK-LABEL: shuffle_v2f64:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
 ; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1176,8 +1106,7 @@ define <2 x double> @shuffle_v2f64(<2 x double> %x, <2 x double> %y) {
 
 ; CHECK-LABEL: shuffle_undef_v2f64:
 ; NO-SIMD128-NOT: v8x16
-; SIMD128-NEXT: .param v128, v128{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype shuffle_undef_v2f64 (v128, v128) -> (v128){{$}}
 ; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
 ; SIMD128-SAME: 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
@@ -1190,8 +1119,7 @@ define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: build_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-; SIMD128-NEXT: .param f64, f64{{$}}
-; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: .functype build_v2f64 (f64, f64) -> (v128){{$}}
 ; SIMD128-NEXT: f64x2.splat $push[[L0:[0-9]+]]=, $0{{$}}
 ; SIMD128-NEXT: f64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
diff --git a/test/CodeGen/WebAssembly/stack-insts.ll b/test/CodeGen/WebAssembly/stack-insts.ll
index 0876b4a42797d6c6fcb38ff0ee8e118e3f0e824d..c4ccdddf406fa7ba2a60ba9bbfffea35dd0fdf9d 100644
--- a/test/CodeGen/WebAssembly/stack-insts.ll
+++ b/test/CodeGen/WebAssembly/stack-insts.ll
@@ -8,8 +8,7 @@ declare void @foo1()
 
 ; Tests if br_table is printed correctly with a tab.
 ; CHECK-LABEL: test0:
-; CHECK-NOT: br_table0, 1, 0, 1, 0
-; CHECK: br_table 0, 1, 0, 1, 0
+; CHECK: br_table {0, 1, 0, 1, 0}
 define void @test0(i32 %n) {
 entry:
   switch i32 %n, label %sw.epilog [
diff --git a/test/CodeGen/WebAssembly/store.ll b/test/CodeGen/WebAssembly/store.ll
index fc5c96540f98d0dda6035f33ef3c43d3a8a0ef85..9e528bdf6b934e1bdb3286ee5e971a00bc2d0f0a 100644
--- a/test/CodeGen/WebAssembly/store.ll
+++ b/test/CodeGen/WebAssembly/store.ll
@@ -7,7 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: sti32:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype sti32 (i32, i32) -> (){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i32.store 0($pop[[L0]]), $pop[[L1]]{{$}}
@@ -18,7 +18,7 @@ define void @sti32(i32 *%p, i32 %v) {
 }
 
 ; CHECK-LABEL: sti64:
-; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: .functype sti64 (i32, i64) -> (){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: i64.store 0($pop[[L0]]), $pop[[L1]]{{$}}
@@ -29,7 +29,7 @@ define void @sti64(i64 *%p, i64 %v) {
 }
 
 ; CHECK-LABEL: stf32:
-; CHECK-NEXT: .param i32, f32{{$}}
+; CHECK-NEXT: .functype stf32 (i32, f32) -> (){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f32.store 0($pop[[L0]]), $pop[[L1]]{{$}}
@@ -40,7 +40,7 @@ define void @stf32(float *%p, float %v) {
 }
 
 ; CHECK-LABEL: stf64:
-; CHECK-NEXT: .param i32, f64{{$}}
+; CHECK-NEXT: .functype stf64 (i32, f64) -> (){{$}}
 ; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
 ; CHECK-NEXT: f64.store 0($pop[[L0]]), $pop[[L1]]{{$}}
diff --git a/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
index b8642b97501236421e1c5f65716b32b297831a95..61678d123c7ee2e7cf72665c56dfcf58ed97125f 100644
--- a/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
+++ b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
@@ -10,7 +10,7 @@ declare i32 @has_i64_arg(i64)
 declare i32 @has_ptr_arg(i8*)
 
 ; CHECK-LABEL: test_invalid_rtn:
-; CHECK-NEXT: i32.const   $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK:      i32.const   $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: i32.call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2@FUNCTION, $pop[[L0]]{{$}}
 ; CHECK-NEXT: drop $pop[[L1]]{{$}}
 ; CHECK-NEXT: i64.const   $push[[L0:[0-9]+]]=, 0{{$}}
@@ -32,7 +32,7 @@ define void @test_struct_rtn() {
 }
 
 ; CHECK-LABEL: test_invalid_arg:
-; CHECK-NEXT: 	i32.const	$push[[L0:[0-9]+]]=, 2{{$}}
+; CHECK:      	i32.const	$push[[L0:[0-9]+]]=, 2{{$}}
 ; CHECK-NEXT: 	i32.call	$push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4@FUNCTION, $pop[[L0]]{{$}}
 ; CHECK-NEXT: 	drop	$pop[[L1]]{{$}}
 ; CHECK-NEXT: 	i32.const	$push[[L0:[0-9]+]]=, 2{{$}}
@@ -51,25 +51,21 @@ entry:
 }
 
 ; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid:
-; CHECK-NEXT:  .param  	i64
-; CHECK-NEXT:  .result 	i64
+; CHECK-NEXT:  .functype .Lhas_i64_arg_bitcast_invalid (i64) -> (i64)
 ; CHECK-NEXT:  unreachable
 ; CHECK-NEXT:  end_function
 
 ; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2:
-; CHECK-NEXT:  .param  	i32
-; CHECK-NEXT:  .result 	i32
+; CHECK-NEXT:  .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32)
 ; CHECK-NEXT:  unreachable
 ; CHECK-NEXT:  end_function
 
 ; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid:
-; CHECK-NEXT: 	.param  	i64
-; CHECK-NEXT: 	.result 	i32
+; CHECK-NEXT: 	.functype .Lhas_ptr_arg_bitcast_invalid (i64) -> (i32)
 ; CHECK-NEXT: 	unreachable
 ; CHECK-NEXT: 	end_function
 
 ; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4:
-; CHECK-NEXT: 	.param  	i32
-; CHECK-NEXT: 	.result 	i32
+; CHECK-NEXT: 	.functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32)
 ; CHECK-NEXT: 	unreachable
 ; CHECK-NEXT: 	end_function
diff --git a/test/CodeGen/WebAssembly/unused-argument.ll b/test/CodeGen/WebAssembly/unused-argument.ll
index 526765cb1652475e8d750d2eaf12c78b9eaf568d..09a9708c09f3c31bce756f3533d3208aec51e2d3 100644
--- a/test/CodeGen/WebAssembly/unused-argument.ll
+++ b/test/CodeGen/WebAssembly/unused-argument.ll
@@ -6,23 +6,21 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; CHECK-LABEL: unused_first:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype unused_first (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: return $1{{$}}
 define i32 @unused_first(i32 %x, i32 %y) {
   ret i32 %y
 }
 
 ; CHECK-LABEL: unused_second:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .functype unused_second (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: return $0{{$}}
 define i32 @unused_second(i32 %x, i32 %y) {
   ret i32 %x
 }
 
 ; CHECK-LABEL: call_something:
-; CHECK-NEXT: {{^}} i32.call $drop=, return_something@FUNCTION{{$}}
+; CHECK:      {{^}} i32.call $drop=, return_something@FUNCTION{{$}}
 ; CHECK-NEXT: return{{$}}
 declare i32 @return_something()
 define void @call_something() {
diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll
index aa4acae5e0791eae896c631ac7871aea7c4bff3e..9b4f28324c11d79efaef7f3c5209b226b2d993fd 100644
--- a/test/CodeGen/WebAssembly/userstack.ll
+++ b/test/CodeGen/WebAssembly/userstack.ll
@@ -309,7 +309,7 @@ define void @frameaddress_0() {
 ; Test __builtin_frame_address(1).
 
 ; CHECK-LABEL: frameaddress_1:
-; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK:      i32.const $push0=, 0{{$}}
 ; CHECK-NEXT: call use_i8_star@FUNCTION, $pop0{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @frameaddress_1() {
diff --git a/test/CodeGen/WebAssembly/varargs.ll b/test/CodeGen/WebAssembly/varargs.ll
index 8516c0743cf660c230833e5d9fef208daf8e8e4d..0ea29762f11c5b19f1206ff917c2ddcf1af3ec0c 100644
--- a/test/CodeGen/WebAssembly/varargs.ll
+++ b/test/CodeGen/WebAssembly/varargs.ll
@@ -9,7 +9,7 @@ target triple = "wasm32-unknown-unknown"
 
 ; TODO: Test va_start.
 ; CHECK-LABEL: start:
-; CHECK-NEXT: .param i32, i32
+; CHECK-NEXT: .functype start (i32, i32) -> ()
 ; CHECK-NOT: __stack_pointer
 define void @start(i8** %ap, ...) {
 entry:
@@ -23,7 +23,7 @@ entry:
 ; Test va_end.
 
 ; CHECK-LABEL: end:
-; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .functype end (i32) -> (){{$}}
 ; CHECK-NEXT: return{{$}}
 define void @end(i8** %ap) {
 entry:
@@ -35,7 +35,7 @@ entry:
 ; Test va_copy.
 
 ; CHECK-LABEL: copy:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype copy (i32, i32) -> (){{$}}
 ; CHECK-NEXT: i32.load  $push0=, 0($1){{$}}
 ; CHECK-NEXT: i32.store 0($0), $pop0{{$}}
 ; CHECK-NEXT: return{{$}}
@@ -50,8 +50,7 @@ entry:
 ; Test va_arg with an i8 argument.
 
 ; CHECK-LABEL: arg_i8:
-; CHECK-NEXT: .param     i32{{$}}
-; CHECK-NEXT: .result    i32{{$}}
+; CHECK-NEXT: .functype arg_i8 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load   $push[[NUM0:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: tee_local  $push[[NUM1:[0-9]+]]=, $1=, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT: i32.const  $push[[NUM2:[0-9]+]]=, 4{{$}}
@@ -68,8 +67,7 @@ entry:
 ; Test va_arg with an i32 argument.
 
 ; CHECK-LABEL: arg_i32:
-; CHECK-NEXT: .param     i32{{$}}
-; CHECK-NEXT: .result    i32{{$}}
+; CHECK-NEXT: .functype arg_i32 (i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.load   $push[[NUM0:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: i32.const  $push[[NUM1:[0-9]+]]=, 3{{$}}
 ; CHECK-NEXT: i32.add    $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
@@ -90,7 +88,7 @@ entry:
 ; Test va_arg with an i128 argument.
 
 ; CHECK-LABEL: arg_i128:
-; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .functype arg_i128 (i32, i32) -> (){{$}}
 ; CHECK: i32.and
 ; CHECK: i64.load
 ; CHECK: i64.load
@@ -106,7 +104,7 @@ entry:
 declare void @callee(...)
 
 ; CHECK-LABEL: caller_none:
-; CHECK-NEXT: i32.const $push0=, 0
+; CHECK:      i32.const $push0=, 0
 ; CHECK-NEXT: call callee@FUNCTION, $pop0
 ; CHECK-NEXT: return{{$}}
 define void @caller_none() {
@@ -129,7 +127,7 @@ define void @caller_some() {
 
 ; Test a va_start call in a non-entry block
 ; CHECK-LABEL: startbb:
-; CHECK: .param i32, i32, i32
+; CHECK: .functype startbb (i32, i32, i32) -> ()
 define void @startbb(i1 %cond, i8** %ap, ...) {
 entry:
   br i1 %cond, label %bb0, label %bb1
@@ -160,7 +158,7 @@ define void @call_nonlegal_fixed() nounwind {
 ; Test a definition a varargs function with a non-legal fixed argument.
 
 ; CHECK-LABEL: nonlegal_fixed:
-; CHECK-NEXT: .param          i64, i64, i32{{$}}
+; CHECK-NEXT: .functype nonlegal_fixed (i64, i64, i32) -> (){{$}}
 define void @nonlegal_fixed(fp128 %x, ...) nounwind {
   ret void
 }
diff --git a/test/CodeGen/WebAssembly/vtable.ll b/test/CodeGen/WebAssembly/vtable.ll
index 0d0b53cfcb92b741e0940f0f0b3180e55741df0c..6a0d902254dd6b19eb1cec1945cbe585ef533789 100644
--- a/test/CodeGen/WebAssembly/vtable.ll
+++ b/test/CodeGen/WebAssembly/vtable.ll
@@ -36,7 +36,7 @@ target triple = "wasm32-unknown-unknown"
 @_ZTS1D = constant [3 x i8] c"1D\00"
 
 ; VTABLE:       .type _ZTV1A,@object
-; VTABLE-NEXT:  .section .data.rel.ro._ZTV1A,
+; VTABLE-NEXT:  .section .rodata._ZTV1A,
 ; VTABLE-NEXT:  .globl _ZTV1A
 ; VTABLE-LABEL: _ZTV1A:
 ; VTABLE-NEXT:  .int32 0
@@ -47,7 +47,7 @@ target triple = "wasm32-unknown-unknown"
 ; VTABLE-NEXT:  .size _ZTV1A, 20
 @_ZTV1A = constant [5 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI1A to i8*), i8* bitcast (%struct.A* (%struct.A*)* @_ZN1AD2Ev to i8*), i8* bitcast (void (%struct.A*)* @_ZN1AD0Ev to i8*), i8* bitcast (void (%struct.A*)* @_ZN1A3fooEv to i8*)], align 4
 ; VTABLE:       .type _ZTV1B,@object
-; VTABLE-NEXT:  .section .data.rel.ro._ZTV1B,
+; VTABLE-NEXT:  .section .rodata._ZTV1B,
 ; VTABLE-NEXT:  .globl _ZTV1B
 ; VTABLE-LABEL: _ZTV1B:
 ; VTABLE-NEXT:  .int32 0
@@ -58,7 +58,7 @@ target triple = "wasm32-unknown-unknown"
 ; VTABLE-NEXT:  .size _ZTV1B, 20
 @_ZTV1B = constant [5 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI1B to i8*), i8* bitcast (%struct.A* (%struct.A*)* @_ZN1AD2Ev to i8*), i8* bitcast (void (%struct.B*)* @_ZN1BD0Ev to i8*), i8* bitcast (void (%struct.B*)* @_ZN1B3fooEv to i8*)], align 4
 ; VTABLE:       .type _ZTV1C,@object
-; VTABLE-NEXT:  .section .data.rel.ro._ZTV1C,
+; VTABLE-NEXT:  .section .rodata._ZTV1C,
 ; VTABLE-NEXT:  .globl _ZTV1C
 ; VTABLE-LABEL: _ZTV1C:
 ; VTABLE-NEXT:  .int32 0
@@ -69,7 +69,7 @@ target triple = "wasm32-unknown-unknown"
 ; VTABLE-NEXT:  .size _ZTV1C, 20
 @_ZTV1C = constant [5 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI1C to i8*), i8* bitcast (%struct.A* (%struct.A*)* @_ZN1AD2Ev to i8*), i8* bitcast (void (%struct.C*)* @_ZN1CD0Ev to i8*), i8* bitcast (void (%struct.C*)* @_ZN1C3fooEv to i8*)], align 4
 ; VTABLE:       .type _ZTV1D,@object
-; VTABLE-NEXT:  .section .data.rel.ro._ZTV1D,
+; VTABLE-NEXT:  .section .rodata._ZTV1D,
 ; VTABLE-NEXT:  .globl _ZTV1D
 ; VTABLE-LABEL: _ZTV1D:
 ; VTABLE-NEXT:  .int32 0
diff --git a/test/CodeGen/WebAssembly/wasmehprepare.ll b/test/CodeGen/WebAssembly/wasmehprepare.ll
index 67e198eb05882e4f3b70b7140f9fa21c089a0ab0..6df7175b1ca7bd47709c9f47dc72bb7c5190a2da 100644
--- a/test/CodeGen/WebAssembly/wasmehprepare.ll
+++ b/test/CodeGen/WebAssembly/wasmehprepare.ll
@@ -298,6 +298,34 @@ try.cont10:                                       ; preds = %invoke.cont3, %catc
   ret void
 }
 
+; Tests if instructions after a call to @llvm.wasm.throw are deleted and the
+; BB's dead children are deleted.
+
+; CHECK-LABEL: @test6
+define i32 @test6(i1 %b, i8* %p) {
+entry:
+  br i1 %b, label %bb.true, label %bb.false
+
+; CHECK:      bb.true:
+; CHECK-NEXT:   call void @llvm.wasm.throw(i32 0, i8* %p)
+; CHECK-NEXT:   unreachable
+bb.true:                                          ; preds = %entry
+  call void @llvm.wasm.throw(i32 0, i8* %p)
+  br label %bb.true.0
+
+; CHECK-NOT:  bb.true.0
+bb.true.0:                                        ; preds = %bb.true
+  br label %merge
+
+; CHECK:      bb.false
+bb.false:                                         ; preds = %entry
+  br label %merge
+
+; CHECK:      merge
+merge:                                            ; preds = %bb.true.0, %bb.false
+  ret i32 0
+}
+
 declare void @foo()
 declare void @func(i32)
 declare %struct.Cleanup* @_ZN7CleanupD1Ev(%struct.Cleanup* returned)
@@ -305,6 +333,7 @@ declare i32 @__gxx_wasm_personality_v0(...)
 declare i8* @llvm.wasm.get.exception(token)
 declare i32 @llvm.wasm.get.ehselector(token)
 declare i32 @llvm.eh.typeid.for(i8*)
+declare void @llvm.wasm.throw(i32, i8*)
 declare i8* @__cxa_begin_catch(i8*)
 declare void @__cxa_end_catch()
 declare void @__cxa_rethrow()
@@ -314,4 +343,3 @@ declare void @__clang_call_terminate(i8*)
 ; CHECK-DAG: declare void @llvm.wasm.landingpad.index(token, i32)
 ; CHECK-DAG: declare i8* @llvm.wasm.lsda()
 ; CHECK-DAG: declare i32 @_Unwind_CallPersonality(i8*)
-
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
index 4d35d65431ab3f8dbad2c901d6c24cb031b34f96..cbccc90df13ca1435809f43b11f6376f2f7e9d1a 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
@@ -1,44 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686--
 ; RUN: llc -pre-RA-sched=source < %s -mtriple=i686-unknown-linux -mcpu=corei7 | FileCheck %s --check-prefix=SOURCE-SCHED
 ; PR2748
 
-@g_73 = external global i32		; <i32*> [#uses=1]
-@g_5 = external global i32		; <i32*> [#uses=1]
+@g_73 = external global i32
+@g_5 = external global i32
 
 define i32 @func_44(i16 signext %p_46) nounwind {
+; SOURCE-SCHED-LABEL: func_44:
+; SOURCE-SCHED:       # %bb.0: # %entry
+; SOURCE-SCHED-NEXT:    subl $12, %esp
+; SOURCE-SCHED-NEXT:    movl g_5, %eax
+; SOURCE-SCHED-NEXT:    sarl %eax
+; SOURCE-SCHED-NEXT:    xorl %ecx, %ecx
+; SOURCE-SCHED-NEXT:    cmpl $1, %eax
+; SOURCE-SCHED-NEXT:    setg %cl
+; SOURCE-SCHED-NEXT:    movb g_73, %dl
+; SOURCE-SCHED-NEXT:    xorl %eax, %eax
+; SOURCE-SCHED-NEXT:    subb {{[0-9]+}}(%esp), %al
+; SOURCE-SCHED-NEXT:    testb %dl, %dl
+; SOURCE-SCHED-NEXT:    jne .LBB0_2
+; SOURCE-SCHED-NEXT:  # %bb.1: # %bb11
+; SOURCE-SCHED-NEXT:    movzbl %al, %eax
+; SOURCE-SCHED-NEXT:    # kill: def $eax killed $eax def $ax
+; SOURCE-SCHED-NEXT:    divb %dl
+; SOURCE-SCHED-NEXT:    movzbl %ah, %eax
+; SOURCE-SCHED-NEXT:  .LBB0_2: # %bb12
+; SOURCE-SCHED-NEXT:    xorl %edx, %edx
+; SOURCE-SCHED-NEXT:    testb %al, %al
+; SOURCE-SCHED-NEXT:    setne %dl
+; SOURCE-SCHED-NEXT:    subl $4, %esp
+; SOURCE-SCHED-NEXT:    pushl $0
+; SOURCE-SCHED-NEXT:    pushl %ecx
+; SOURCE-SCHED-NEXT:    pushl %edx
+; SOURCE-SCHED-NEXT:    calll func_48
+; SOURCE-SCHED-NEXT:    addl $28, %esp
+; SOURCE-SCHED-NEXT:    retl
 entry:
-; SOURCE-SCHED: subl
-; SOURCE-SCHED: movl
-; SOURCE-SCHED: sarl
-; SOURCE-SCHED: xorl
-; SOURCE-SCHED: cmpl
-; SOURCE-SCHED: setg
-; SOURCE-SCHED: movb
-; SOURCE-SCHED: xorl
-; SOURCE-SCHED: subl
-; SOURCE-SCHED: testb
-; SOURCE-SCHED: jne
-	%0 = load i32, i32* @g_5, align 4		; <i32> [#uses=1]
-	%1 = ashr i32 %0, 1		; <i32> [#uses=1]
-	%2 = icmp sgt i32 %1, 1		; <i1> [#uses=1]
-	%3 = zext i1 %2 to i32		; <i32> [#uses=1]
-	%4 = load i32, i32* @g_73, align 4		; <i32> [#uses=1]
-	%5 = zext i16 %p_46 to i64		; <i64> [#uses=1]
-	%6 = sub i64 0, %5		; <i64> [#uses=1]
-	%7 = trunc i64 %6 to i8		; <i8> [#uses=2]
-	%8 = trunc i32 %4 to i8		; <i8> [#uses=2]
-	%9 = icmp eq i8 %8, 0		; <i1> [#uses=1]
+	%0 = load i32, i32* @g_5, align 4
+	%1 = ashr i32 %0, 1
+	%2 = icmp sgt i32 %1, 1
+	%3 = zext i1 %2 to i32
+	%4 = load i32, i32* @g_73, align 4
+	%5 = zext i16 %p_46 to i64
+	%6 = sub i64 0, %5
+	%7 = trunc i64 %6 to i8
+	%8 = trunc i32 %4 to i8
+	%9 = icmp eq i8 %8, 0
 	br i1 %9, label %bb11, label %bb12
 
-bb11:		; preds = %entry
-	%10 = urem i8 %7, %8		; <i8> [#uses=1]
+bb11:
+	%10 = urem i8 %7, %8
 	br label %bb12
 
-bb12:		; preds = %bb11, %entry
-	%.014.in = phi i8 [ %10, %bb11 ], [ %7, %entry ]		; <i8> [#uses=1]
-	%11 = icmp ne i8 %.014.in, 0		; <i1> [#uses=1]
-	%12 = zext i1 %11 to i32		; <i32> [#uses=1]
-	%13 = tail call i32 (...) @func_48( i32 %12, i32 %3, i32 0 ) nounwind		; <i32> [#uses=0]
+bb12:
+	%.014.in = phi i8 [ %10, %bb11 ], [ %7, %entry ]
+	%11 = icmp ne i8 %.014.in, 0
+	%12 = zext i1 %11 to i32
+	%13 = tail call i32 (...) @func_48( i32 %12, i32 %3, i32 0 ) nounwind
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 0cce34fb7bd65fa1984b6d2a1c0e441802dd12fd..25a7d057f66a805dd39ccb18965659e63dc7c68b 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -40,9 +40,9 @@ define %struct.__vv* @t(%struct.Key* %desc, i64 %p) nounwind ssp {
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    callq _xxGetOffsetForCode
+; CHECK-NEXT:    movq %rbx, %rdi
 ; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movq %rbx, %rdi
 ; CHECK-NEXT:    callq _xxCalculateMidType
 ; CHECK-NEXT:    cmpl $1, %eax
 ; CHECK-NEXT:    jne LBB0_1
diff --git a/test/CodeGen/X86/2009-03-07-FPConstSelect.ll b/test/CodeGen/X86/2009-03-07-FPConstSelect.ll
deleted file mode 100644
index 298b81b90e46e30046c0cbc2fbe884d764a54c38..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/2009-03-07-FPConstSelect.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -mcpu=yonah | not grep xmm
-; This should do a single load into the fp stack for the return, not diddle with xmm registers.
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin7"
-
-define float @f(i32 %x) nounwind readnone {
-entry:
-	%0 = icmp eq i32 %x, 0		; <i1> [#uses=1]
-	%iftmp.0.0 = select i1 %0, float 4.200000e+01, float 2.300000e+01
-	ret float %iftmp.0.0
-}
diff --git a/test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll b/test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll
index 363053fe341e612ca43ca97b6b8ecec50872cb0f..38a414a2d8f78d19ff3c2321cff94f276c733be1 100644
--- a/test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll
+++ b/test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll
@@ -12,12 +12,10 @@ define fastcc i32 @test() nounwind {
 entry:
 ; CHECK-LABEL: test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    addl $0, %eax
-; CHECK-NEXT:    seto %cl
-; CHECK-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT:    movb %cl, -{{[0-9]+}}(%rsp) ## 1-byte Spill
-; CHECK-NEXT:    jo LBB0_2
+; CHECK-NEXT:    movl $1, [[REG:%e[a-z]+]]
+; CHECK-NEXT:    addl $0, [[REG]]
+; CHECK-NEXT:    seto {{%[a-z]+l}}
+; CHECK:         jo LBB0_2
 	%tmp1 = call %0 @llvm.sadd.with.overflow.i32(i32 1, i32 0)
 	%tmp2 = extractvalue %0 %tmp1, 1
 	br i1 %tmp2, label %.backedge, label %BB3
diff --git a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
index 4f6f0c9201bbec5d7f28f8506ce071ae8530e449..0abe3cba0d1e02d52677f355933afca25c4b9c87 100644
--- a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
+++ b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
@@ -11,7 +11,7 @@ define i32 @main() nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpq {{.*}}(%rip), %rax
-; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    sbbb %al, %al
 ; CHECK-NEXT:    testb $-106, %al
 ; CHECK-NEXT:    jle .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %if.then
diff --git a/test/CodeGen/X86/2011-05-09-loaduse.ll b/test/CodeGen/X86/2011-05-09-loaduse.ll
index 2212c5dc86e893ad2a8a243fb2b5bd6cc76d7bde..3dc77cb002e0f1eeed864b245824505167150471 100644
--- a/test/CodeGen/X86/2011-05-09-loaduse.ll
+++ b/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -1,9 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-- -mcpu=corei7 | FileCheck %s
 
-;CHECK-LABEL: test:
-;CHECK-NOT: pshufd
-;CHECK: ret
 define float @test(<4 x float>* %A) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    flds 12(%eax)
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
 entry:
   %T = load <4 x float>, <4 x float>* %A
   %R = extractelement <4 x float> %T, i32 3
diff --git a/test/CodeGen/X86/2011-12-28-vselecti8.ll b/test/CodeGen/X86/2011-12-28-vselecti8.ll
index c91646640b8f009a157e8ba9364fb7ae5557db45..d564e9f6ae26f3eb8c2b5216ca85fd5b187e6111 100644
--- a/test/CodeGen/X86/2011-12-28-vselecti8.ll
+++ b/test/CodeGen/X86/2011-12-28-vselecti8.ll
@@ -1,9 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin  -mcpu=corei7 | FileCheck %s
 ; ModuleID = '<stdin>'
+
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-darwin11.2.0"
 
-; During legalization, the vselect mask is 'type legalized' into a 
+; During legalization, the vselect mask is 'type legalized' into a
 ; wider BUILD_VECTOR. This causes the introduction of a new
 ; sign_extend_inreg in the DAG.
 ;
@@ -13,12 +15,16 @@ target triple = "x86_64-apple-darwin11.2.0"
 ; Make sure that the sign_extend_inreg is simplified and that we
 ; don't generate psll, psraw and pblendvb from the vselect.
 
-; CHECK-LABEL: foo8
-; CHECK-NOT: psll
-; CHECK-NOT: psraw
-; CHECK-NOT: pblendvb
-; CHECK: ret
 define void @foo8(float* nocapture %RET) nounwind {
+; CHECK-LABEL: foo8:
+; CHECK:       ## %bb.0: ## %allocas
+; CHECK-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT:    cvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT:    cvtdq2ps %xmm1, %xmm1
+; CHECK-NEXT:    movups %xmm1, 16(%rdi)
+; CHECK-NEXT:    movups %xmm0, (%rdi)
+; CHECK-NEXT:    retq
 allocas:
   %resultvec.i = select <8 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, <8 x i8> <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
   %uint2float = uitofp <8 x i8> %resultvec.i to <8 x float>
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index ed4afb7c50546f04d208c7a95dc9b6a4eae47574..0be9043579fc840c45f0edbc27070843a2acd3d3 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -4,8 +4,7 @@
 define void @endless_loop() {
 ; CHECK-LABEL: endless_loop:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps (%eax), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps (%eax), %xmm0
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
diff --git a/test/CodeGen/X86/2012-08-07-CmpISelBug.ll b/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
index eba970e711ae068bb698d3f1228a2cfd4b44c49d..53e51261706836a037cba3469b2c0ba7fab64cd7 100644
--- a/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
+++ b/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
@@ -26,7 +26,7 @@ bb:
   %tmp113 = sub i8 %tmp106, 0
   %tmp114 = add i8 %tmp113, -72
   %tmp141 = icmp ne i32 %tmp67, -1263900958
-  %tmp142 = select i1 %tmp141, i8 %tmp114, i8 undef
+  %tmp142 = select i1 %tmp141, i8 %tmp114, i8 1
   %tmp143 = xor i8 %tmp142, 81
   %tmp144 = zext i8 %tmp143 to i32
   %tmp145 = add i32 %tmp144, 2062143348
diff --git a/test/CodeGen/X86/3dnow-intrinsics.ll b/test/CodeGen/X86/3dnow-intrinsics.ll
index 88dc7bec59c944bd726d173baa79db3c23af2fbe..109b9fc02f4e829408313ae8786d838d5f0c8184 100644
--- a/test/CodeGen/X86/3dnow-intrinsics.ll
+++ b/test/CodeGen/X86/3dnow-intrinsics.ll
@@ -897,8 +897,8 @@ define <2 x i32> @test_pswapdsi(<2 x i32> %a) nounwind readnone {
 ; X64-LABEL: test_pswapdsi:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    pswapd -{{[0-9]+}}(%rsp), %mm0 # mm0 = mem[1,0]
+; X64-NEXT:    movdq2q %xmm0, %mm0
+; X64-NEXT:    pswapd %mm0, %mm0 # mm0 = mm0[1,0]
 ; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
index 62cc06bd8b5e017740bdcbd54a8b43c5792ec271..e7b04ce221fbdd799aa3abbaa4670dccc7cdcf22 100644
--- a/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -57,8 +57,9 @@ define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
 define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
 ; X64-LABEL: test_add_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal (%rsi,%rdi), %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
index 706a63ec142e078baf2db4e3c29eba29025ab509..5f08877b4a24c38d9322af66625baa60ff49ce53 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
@@ -285,7 +285,7 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) {
   ; X32:   liveins: $xmm0, $xmm1
   ; X32:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $xmm0
   ; X32:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $xmm1
-  ; X32:   [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
+  ; X32:   [[MV:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
   ; X32:   [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[MV]](<8 x s32>)
   ; X32:   $xmm0 = COPY [[UV]](<4 x s32>)
   ; X32:   $xmm1 = COPY [[UV1]](<4 x s32>)
@@ -295,7 +295,7 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) {
   ; X64:   liveins: $xmm0, $xmm1
   ; X64:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $xmm0
   ; X64:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $xmm1
-  ; X64:   [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
+  ; X64:   [[MV:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
   ; X64:   [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[MV]](<8 x s32>)
   ; X64:   $xmm0 = COPY [[UV]](<4 x s32>)
   ; X64:   $xmm1 = COPY [[UV1]](<4 x s32>)
@@ -494,8 +494,8 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
   ; X32:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $xmm2
   ; X32:   [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
   ; X32:   [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 16 from %fixed-stack.0, align 0)
-  ; X32:   [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
-  ; X32:   [[MV1:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY2]](<4 x s32>), [[LOAD]](<4 x s32>)
+  ; X32:   [[MV:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
+  ; X32:   [[MV1:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY2]](<4 x s32>), [[LOAD]](<4 x s32>)
   ; X32:   ADJCALLSTACKDOWN32 0, 0, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp
   ; X32:   [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[MV1]](<8 x s32>)
   ; X32:   $xmm0 = COPY [[UV]](<4 x s32>)
@@ -503,7 +503,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
   ; X32:   CALLpcrel32 @split_return_callee, csr_32, implicit $esp, implicit $ssp, implicit $xmm0, implicit $xmm1, implicit-def $xmm0, implicit-def $xmm1
   ; X32:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $xmm0
   ; X32:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $xmm1
-  ; X32:   [[MV2:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY3]](<4 x s32>), [[COPY4]](<4 x s32>)
+  ; X32:   [[MV2:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY3]](<4 x s32>), [[COPY4]](<4 x s32>)
   ; X32:   ADJCALLSTACKUP32 0, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp
   ; X32:   [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[MV]], [[MV2]]
   ; X32:   [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
@@ -517,8 +517,8 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
   ; X64:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $xmm1
   ; X64:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $xmm2
   ; X64:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $xmm3
-  ; X64:   [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
-  ; X64:   [[MV1:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY2]](<4 x s32>), [[COPY3]](<4 x s32>)
+  ; X64:   [[MV:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY]](<4 x s32>), [[COPY1]](<4 x s32>)
+  ; X64:   [[MV1:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY2]](<4 x s32>), [[COPY3]](<4 x s32>)
   ; X64:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
   ; X64:   [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[MV1]](<8 x s32>)
   ; X64:   $xmm0 = COPY [[UV]](<4 x s32>)
@@ -526,7 +526,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
   ; X64:   CALL64pcrel32 @split_return_callee, csr_64, implicit $rsp, implicit $ssp, implicit $xmm0, implicit $xmm1, implicit-def $xmm0, implicit-def $xmm1
   ; X64:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $xmm0
   ; X64:   [[COPY5:%[0-9]+]]:_(<4 x s32>) = COPY $xmm1
-  ; X64:   [[MV2:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY4]](<4 x s32>), [[COPY5]](<4 x s32>)
+  ; X64:   [[MV2:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[COPY4]](<4 x s32>), [[COPY5]](<4 x s32>)
   ; X64:   ADJCALLSTACKUP64 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
   ; X64:   [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[MV]], [[MV2]]
   ; X64:   [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir b/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir
index 0a4790d282900f4e24e3e49dd788eae061209dca..659d6a0cdf33942c82b21041414abd8215443139 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir
@@ -49,8 +49,8 @@ body:             |
     ; SSE2: [[ADD1:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV1]], [[UV3]]
     ; AVX1: [[ADD:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV]], [[UV2]]
     ; AVX1: [[ADD1:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV1]], [[UV3]]
-    ; SSE2: [[MV:%[0-9]+]]:_(<32 x s8>) = G_MERGE_VALUES [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
-    ; AVX1: [[MV:%[0-9]+]]:_(<32 x s8>) = G_MERGE_VALUES [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
+    ; SSE2: [[MV:%[0-9]+]]:_(<32 x s8>) = G_CONCAT_VECTORS [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<32 x s8>) = G_CONCAT_VECTORS [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
     ; SSE2: $ymm0 = COPY [[MV]](<32 x s8>)
     ; AVX1: $ymm0 = COPY [[MV]](<32 x s8>)
     ; AVX2: [[ADD:%[0-9]+]]:_(<32 x s8>) = G_ADD [[DEF]], [[DEF1]]
@@ -83,12 +83,12 @@ body:             |
     ; SSE2: [[UV2:%[0-9]+]]:_(<8 x s16>), [[UV3:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF1]](<16 x s16>)
     ; SSE2: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV]], [[UV2]]
     ; SSE2: [[ADD1:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV1]], [[UV3]]
-    ; SSE2: [[MV:%[0-9]+]]:_(<16 x s16>) = G_MERGE_VALUES [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>)
+    ; SSE2: [[MV:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>)
     ; AVX1: [[UV:%[0-9]+]]:_(<8 x s16>), [[UV1:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF]](<16 x s16>)
     ; AVX1: [[UV2:%[0-9]+]]:_(<8 x s16>), [[UV3:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF1]](<16 x s16>)
     ; AVX1: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV]], [[UV2]]
     ; AVX1: [[ADD1:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV1]], [[UV3]]
-    ; AVX1: [[MV:%[0-9]+]]:_(<16 x s16>) = G_MERGE_VALUES [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>)
     ; SSE2: $ymm0 = COPY [[MV]](<16 x s16>)
     ; AVX1: $ymm0 = COPY [[MV]](<16 x s16>)
     ; AVX2: [[ADD:%[0-9]+]]:_(<16 x s16>) = G_ADD [[DEF]], [[DEF1]]
@@ -121,13 +121,13 @@ body:             |
     ; SSE2: [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF1]](<8 x s32>)
     ; SSE2: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV]], [[UV2]]
     ; SSE2: [[ADD1:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV1]], [[UV3]]
-    ; SSE2: [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>)
+    ; SSE2: [[MV:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>)
     ; SSE2: $ymm0 = COPY [[MV]](<8 x s32>)
     ; AVX1: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<8 x s32>)
     ; AVX1: [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF1]](<8 x s32>)
     ; AVX1: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV]], [[UV2]]
     ; AVX1: [[ADD1:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV1]], [[UV3]]
-    ; AVX1: [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>)
     ; AVX1: $ymm0 = COPY [[MV]](<8 x s32>)
     ; AVX2: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[DEF]], [[DEF1]]
     ; AVX2: $ymm0 = COPY [[ADD]](<8 x s32>)
@@ -159,12 +159,12 @@ body:             |
     ; SSE2: [[UV2:%[0-9]+]]:_(<2 x s64>), [[UV3:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF1]](<4 x s64>)
     ; SSE2: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV]], [[UV2]]
     ; SSE2: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV1]], [[UV3]]
-    ; SSE2: [[MV:%[0-9]+]]:_(<4 x s64>) = G_MERGE_VALUES [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>)
+    ; SSE2: [[MV:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>)
     ; AVX1: [[UV:%[0-9]+]]:_(<2 x s64>), [[UV1:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
     ; AVX1: [[UV2:%[0-9]+]]:_(<2 x s64>), [[UV3:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF1]](<4 x s64>)
     ; AVX1: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV]], [[UV2]]
     ; AVX1: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV1]], [[UV3]]
-    ; AVX1: [[MV:%[0-9]+]]:_(<4 x s64>) = G_MERGE_VALUES [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>)
     ; SSE2: $ymm0 = COPY [[MV]](<4 x s64>)
     ; AVX1: $ymm0 = COPY [[MV]](<4 x s64>)
     ; AVX2: [[ADD:%[0-9]+]]:_(<4 x s64>) = G_ADD [[DEF]], [[DEF1]]
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir b/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir
index 22b747f853d8e15af948d15c17069058a39447b2..deb2b6d3809865ada318b55f3412a554c39bf16c 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir
@@ -51,13 +51,13 @@ body:             |
     ; AVX1: [[ADD1:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV1]], [[UV5]]
     ; AVX1: [[ADD2:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV2]], [[UV6]]
     ; AVX1: [[ADD3:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV3]], [[UV7]]
-    ; AVX1: [[MV:%[0-9]+]]:_(<64 x s8>) = G_MERGE_VALUES [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>), [[ADD2]](<16 x s8>), [[ADD3]](<16 x s8>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>), [[ADD2]](<16 x s8>), [[ADD3]](<16 x s8>)
     ; AVX1: $zmm0 = COPY [[MV]](<64 x s8>)
     ; AVX512F: [[UV:%[0-9]+]]:_(<32 x s8>), [[UV1:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[DEF]](<64 x s8>)
     ; AVX512F: [[UV2:%[0-9]+]]:_(<32 x s8>), [[UV3:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[DEF1]](<64 x s8>)
     ; AVX512F: [[ADD:%[0-9]+]]:_(<32 x s8>) = G_ADD [[UV]], [[UV2]]
     ; AVX512F: [[ADD1:%[0-9]+]]:_(<32 x s8>) = G_ADD [[UV1]], [[UV3]]
-    ; AVX512F: [[MV:%[0-9]+]]:_(<64 x s8>) = G_MERGE_VALUES [[ADD]](<32 x s8>), [[ADD1]](<32 x s8>)
+    ; AVX512F: [[MV:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[ADD]](<32 x s8>), [[ADD1]](<32 x s8>)
     ; AVX512F: $zmm0 = COPY [[MV]](<64 x s8>)
     ; AVX512BW: [[ADD:%[0-9]+]]:_(<64 x s8>) = G_ADD [[DEF]], [[DEF1]]
     ; AVX512BW: $zmm0 = COPY [[ADD]](<64 x s8>)
@@ -91,13 +91,13 @@ body:             |
     ; AVX1: [[ADD1:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV1]], [[UV5]]
     ; AVX1: [[ADD2:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV2]], [[UV6]]
     ; AVX1: [[ADD3:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV3]], [[UV7]]
-    ; AVX1: [[MV:%[0-9]+]]:_(<32 x s16>) = G_MERGE_VALUES [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>), [[ADD2]](<8 x s16>), [[ADD3]](<8 x s16>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>), [[ADD2]](<8 x s16>), [[ADD3]](<8 x s16>)
     ; AVX1: $zmm0 = COPY [[MV]](<32 x s16>)
     ; AVX512F: [[UV:%[0-9]+]]:_(<16 x s16>), [[UV1:%[0-9]+]]:_(<16 x s16>) = G_UNMERGE_VALUES [[DEF]](<32 x s16>)
     ; AVX512F: [[UV2:%[0-9]+]]:_(<16 x s16>), [[UV3:%[0-9]+]]:_(<16 x s16>) = G_UNMERGE_VALUES [[DEF1]](<32 x s16>)
     ; AVX512F: [[ADD:%[0-9]+]]:_(<16 x s16>) = G_ADD [[UV]], [[UV2]]
     ; AVX512F: [[ADD1:%[0-9]+]]:_(<16 x s16>) = G_ADD [[UV1]], [[UV3]]
-    ; AVX512F: [[MV:%[0-9]+]]:_(<32 x s16>) = G_MERGE_VALUES [[ADD]](<16 x s16>), [[ADD1]](<16 x s16>)
+    ; AVX512F: [[MV:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[ADD]](<16 x s16>), [[ADD1]](<16 x s16>)
     ; AVX512F: $zmm0 = COPY [[MV]](<32 x s16>)
     ; AVX512BW: [[ADD:%[0-9]+]]:_(<32 x s16>) = G_ADD [[DEF]], [[DEF1]]
     ; AVX512BW: $zmm0 = COPY [[ADD]](<32 x s16>)
@@ -131,7 +131,7 @@ body:             |
     ; AVX1: [[ADD1:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV1]], [[UV5]]
     ; AVX1: [[ADD2:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV2]], [[UV6]]
     ; AVX1: [[ADD3:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV3]], [[UV7]]
-    ; AVX1: [[MV:%[0-9]+]]:_(<16 x s32>) = G_MERGE_VALUES [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>), [[ADD2]](<4 x s32>), [[ADD3]](<4 x s32>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>), [[ADD2]](<4 x s32>), [[ADD3]](<4 x s32>)
     ; AVX1: $zmm0 = COPY [[MV]](<16 x s32>)
     ; AVX512F: [[ADD:%[0-9]+]]:_(<16 x s32>) = G_ADD [[DEF]], [[DEF1]]
     ; AVX512F: $zmm0 = COPY [[ADD]](<16 x s32>)
@@ -167,7 +167,7 @@ body:             |
     ; AVX1: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV1]], [[UV5]]
     ; AVX1: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV2]], [[UV6]]
     ; AVX1: [[ADD3:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV3]], [[UV7]]
-    ; AVX1: [[MV:%[0-9]+]]:_(<8 x s64>) = G_MERGE_VALUES [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>), [[ADD2]](<2 x s64>), [[ADD3]](<2 x s64>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<8 x s64>) = G_CONCAT_VECTORS [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>), [[ADD2]](<2 x s64>), [[ADD3]](<2 x s64>)
     ; AVX1: $zmm0 = COPY [[MV]](<8 x s64>)
     ; AVX512F: [[ADD:%[0-9]+]]:_(<8 x s64>) = G_ADD [[DEF]], [[DEF1]]
     ; AVX512F: $zmm0 = COPY [[ADD]](<8 x s64>)
@@ -215,16 +215,16 @@ body:             |
     ; AVX1: [[ADD1:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV1]], [[UV5]]
     ; AVX1: [[ADD2:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV2]], [[UV6]]
     ; AVX1: [[ADD3:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV3]], [[UV7]]
-    ; AVX1: [[MV:%[0-9]+]]:_(<32 x s8>) = G_MERGE_VALUES [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
-    ; AVX1: [[MV1:%[0-9]+]]:_(<32 x s8>) = G_MERGE_VALUES [[ADD2]](<16 x s8>), [[ADD3]](<16 x s8>)
+    ; AVX1: [[MV:%[0-9]+]]:_(<32 x s8>) = G_CONCAT_VECTORS [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
+    ; AVX1: [[MV1:%[0-9]+]]:_(<32 x s8>) = G_CONCAT_VECTORS [[ADD2]](<16 x s8>), [[ADD3]](<16 x s8>)
     ; AVX1: $ymm0 = COPY [[MV]](<32 x s8>)
     ; AVX1: $ymm1 = COPY [[MV1]](<32 x s8>)
     ; AVX512F: [[ADD:%[0-9]+]]:_(<32 x s8>) = G_ADD [[COPY]], [[COPY2]]
     ; AVX512F: [[ADD1:%[0-9]+]]:_(<32 x s8>) = G_ADD [[COPY1]], [[COPY3]]
     ; AVX512F: $ymm0 = COPY [[ADD]](<32 x s8>)
     ; AVX512F: $ymm1 = COPY [[ADD1]](<32 x s8>)
-    ; AVX512BW: [[MV:%[0-9]+]]:_(<64 x s8>) = G_MERGE_VALUES [[COPY]](<32 x s8>), [[COPY1]](<32 x s8>)
-    ; AVX512BW: [[MV1:%[0-9]+]]:_(<64 x s8>) = G_MERGE_VALUES [[COPY2]](<32 x s8>), [[COPY3]](<32 x s8>)
+    ; AVX512BW: [[MV:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[COPY]](<32 x s8>), [[COPY1]](<32 x s8>)
+    ; AVX512BW: [[MV1:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[COPY2]](<32 x s8>), [[COPY3]](<32 x s8>)
     ; AVX512BW: [[ADD:%[0-9]+]]:_(<64 x s8>) = G_ADD [[MV]], [[MV1]]
     ; AVX512BW: [[UV:%[0-9]+]]:_(<32 x s8>), [[UV1:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[ADD]](<64 x s8>)
     ; AVX512BW: $ymm0 = COPY [[UV]](<32 x s8>)
@@ -234,8 +234,8 @@ body:             |
     %3(<32 x s8>) = COPY $ymm1
     %4(<32 x s8>) = COPY $ymm2
     %5(<32 x s8>) = COPY $ymm3
-    %0(<64 x s8>) = G_MERGE_VALUES %2(<32 x s8>), %3(<32 x s8>)
-    %1(<64 x s8>) = G_MERGE_VALUES %4(<32 x s8>), %5(<32 x s8>)
+    %0(<64 x s8>) = G_CONCAT_VECTORS %2(<32 x s8>), %3(<32 x s8>)
+    %1(<64 x s8>) = G_CONCAT_VECTORS %4(<32 x s8>), %5(<32 x s8>)
     %6(<64 x s8>) = G_ADD %0, %1
     %7(<32 x s8>), %8(<32 x s8>) = G_UNMERGE_VALUES %6(<64 x s8>)
     $ymm0 = COPY %7(<32 x s8>)
diff --git a/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir b/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
index 9cd4495fa515e483052f9c7e17c5c52974a2327c..ab7532f0e0a5c6d0afe0df052482d12d6b85bcd5 100644
--- a/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
+++ b/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
@@ -33,7 +33,7 @@ body:             |
     ; AVX512VL: $ymm0 = COPY [[VINSERTF32x4Z256rr]]
     ; AVX512VL: RET 0, implicit $ymm0
     %0(<4 x s32>) = IMPLICIT_DEF
-    %1(<8 x s32>) = G_MERGE_VALUES %0(<4 x s32>), %0(<4 x s32>)
+    %1(<8 x s32>) = G_CONCAT_VECTORS %0(<4 x s32>), %0(<4 x s32>)
     $ymm0 = COPY %1(<8 x s32>)
     RET 0, implicit $ymm0
 
diff --git a/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir b/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
index 61f412d7649795521fe93364bc20a0a8cdd3c007..d188d3ef9adb22436264db360876d2d7b692bb1e 100644
--- a/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
+++ b/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
@@ -30,7 +30,7 @@ body:             |
     ; ALL: $zmm0 = COPY [[VINSERTF32x4Zrr2]]
     ; ALL: RET 0, implicit $zmm0
     %0(<4 x s32>) = IMPLICIT_DEF
-    %1(<16 x s32>) = G_MERGE_VALUES %0(<4 x s32>), %0(<4 x s32>), %0(<4 x s32>), %0(<4 x s32>)
+    %1(<16 x s32>) = G_CONCAT_VECTORS %0(<4 x s32>), %0(<4 x s32>), %0(<4 x s32>), %0(<4 x s32>)
     $zmm0 = COPY %1(<16 x s32>)
     RET 0, implicit $zmm0
 
@@ -53,7 +53,7 @@ body:             |
     ; ALL: $zmm0 = COPY [[VINSERTF64x4Zrr]]
     ; ALL: RET 0, implicit $zmm0
     %0(<8 x s32>) = IMPLICIT_DEF
-    %1(<16 x s32>) = G_MERGE_VALUES %0(<8 x s32>), %0(<8 x s32>)
+    %1(<16 x s32>) = G_CONCAT_VECTORS %0(<8 x s32>), %0(<8 x s32>)
     $zmm0 = COPY %1(<16 x s32>)
     RET 0, implicit $zmm0
 
diff --git a/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll b/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll
index 96056c1638748df569fd321750c44d6c90c310a1..31057ef2ce0d58d9fc1a618dc550b04671c913c2 100644
--- a/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll
+++ b/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll
@@ -11,8 +11,9 @@ define i16 @test_shl_i4(i16 %v, i16 %a, i16 %b) {
 ; X64-LABEL: test_shl_i4:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    addb %sil, %cl
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-NEXT:    leal (%rdx,%rsi), %ecx
 ; X64-NEXT:    andb $15, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shlb %cl, %al
diff --git a/test/CodeGen/X86/GlobalISel/shl-scalar.ll b/test/CodeGen/X86/GlobalISel/shl-scalar.ll
index 54345fb4038751aa4a96340ee91543f825da6657..4f2de3450f191d5cb154fbdb0317d4a172d01072 100644
--- a/test/CodeGen/X86/GlobalISel/shl-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/shl-scalar.ll
@@ -147,8 +147,8 @@ define i8 @test_shl_i8_imm(i32 %arg1) {
 define i8 @test_shl_i8_imm1(i32 %arg1) {
 ; X64-LABEL: test_shl_i8_imm1:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addb %al, %al
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi), %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %a = trunc i32 %arg1 to i8
diff --git a/test/CodeGen/X86/GlobalISel/undef.ll b/test/CodeGen/X86/GlobalISel/undef.ll
index 106c2673a2ee98cea53f25f14d8d95e431fc1d00..b81dab9b39b457a10f487cc98d00dd6e0c87e030 100644
--- a/test/CodeGen/X86/GlobalISel/undef.ll
+++ b/test/CodeGen/X86/GlobalISel/undef.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu            -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
 
 define i8 @test() {
 ; ALL-LABEL: test:
@@ -8,8 +8,8 @@ define i8 @test() {
   ret i8 undef
 }
 
-define i8 @test2(i8 %a) {
-; ALL-LABEL: test2:
+define i8 @add_undef_i8(i8 %a) {
+; ALL-LABEL: add_undef_i8:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    movl %edi, %eax
 ; ALL-NEXT:    addb %al, %al
@@ -19,6 +19,35 @@ define i8 @test2(i8 %a) {
   ret i8 %r
 }
 
+define i16 @add_undef_i16(i16 %a) {
+; ALL-LABEL: add_undef_i16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    addw %ax, %ax
+; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
+; ALL-NEXT:    retq
+  %r = add i16 %a, undef
+  ret i16 %r
+}
+
+define i16 @add_undef_i16_commute(i16 %a) {
+; ALL-LABEL: add_undef_i16_commute:
+; ALL:       # %bb.0:
+; ALL-NEXT:    addw %di, %ax
+; ALL-NEXT:    retq
+  %r = add i16 undef, %a
+  ret i16 %r
+}
+
+define i32 @add_undef_i32(i32 %a) {
+; ALL-LABEL: add_undef_i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    addl %eax, %eax
+; ALL-NEXT:    retq
+  %r = add i32 %a, undef
+  ret i32 %r
+}
 
 define float @test3() {
 ; ALL-LABEL: test3:
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
index d9a093b8c59d51523a54bb497bb49123dfc89bba..67b1aceecf67e0a8db8b18c472d2bb3de9d4f26e 100644
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -58,6 +58,8 @@
 ; CHECK-NEXT:       Shadow Call Stack
 ; CHECK-NEXT:       X86 Indirect Branch Tracking
 ; CHECK-NEXT:       X86 vzeroupper inserter
+; CHECK-NEXT:       X86 Discriminate Memory Operands
+; CHECK-NEXT:       X86 Insert Cache Prefetches
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
 ; CHECK-NEXT:       StackMap Liveness Analysis
 ; CHECK-NEXT:       Live DEBUG_VALUE analysis
diff --git a/test/CodeGen/X86/O3-pipeline.ll b/test/CodeGen/X86/O3-pipeline.ll
index 9828d1eeab104fd2ff6f237c6701ad45aacb2711..224c435ebd1da17aa0e5116a9a4d6a602b94939f 100644
--- a/test/CodeGen/X86/O3-pipeline.ll
+++ b/test/CodeGen/X86/O3-pipeline.ll
@@ -75,7 +75,6 @@
 ; CHECK-NEXT:       Merge disjoint stack slots
 ; CHECK-NEXT:       Local Stack Slot Allocation
 ; CHECK-NEXT:       Remove dead machine instructions
-; CHECK-NEXT:       X86 CondBr Folding
 ; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       Machine Natural Loop Construction
 ; CHECK-NEXT:       Machine Trace Metrics
@@ -160,6 +159,8 @@
 ; CHECK-NEXT:       X86 Atom pad short functions
 ; CHECK-NEXT:       X86 LEA Fixup
 ; CHECK-NEXT:       Compressing EVEX instrs to VEX encoding when possible
+; CHECK-NEXT:       X86 Discriminate Memory Operands
+; CHECK-NEXT:       X86 Insert Cache Prefetches
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
 ; CHECK-NEXT:       StackMap Liveness Analysis
 ; CHECK-NEXT:       Live DEBUG_VALUE analysis
diff --git a/test/CodeGen/X86/add-sub-nsw-nuw.ll b/test/CodeGen/X86/add-sub-nsw-nuw.ll
index be6d20ee02dd89e38f7449b6fe0c11520a47a2cb..6ff37de594ad9a7f51d2e160b79f6904bd8e6777 100644
--- a/test/CodeGen/X86/add-sub-nsw-nuw.ll
+++ b/test/CodeGen/X86/add-sub-nsw-nuw.ll
@@ -9,7 +9,7 @@ define i8 @PR30841(i64 %argc) {
 ; CHECK-LABEL: PR30841:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    negb %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retl
 entry:
diff --git a/test/CodeGen/X86/adx-commute.mir b/test/CodeGen/X86/adx-commute.mir
index 11fb1e641849d9305a539ff12205b437bf95e973..1e204e339134e55986f65b1e0dde81becf5770ba 100644
--- a/test/CodeGen/X86/adx-commute.mir
+++ b/test/CodeGen/X86/adx-commute.mir
@@ -7,7 +7,7 @@
   target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
   define void @adcx32_commute(i8 %cf, i32 %a, i32 %b, i32* %res) #0 {
-    %ret = call { i8, i32 } @llvm.x86.addcarryx.u32(i8 %cf, i32 %a, i32 %b)
+    %ret = call { i8, i32 } @llvm.x86.addcarry.32(i8 %cf, i32 %a, i32 %b)
     %1 = extractvalue { i8, i32 } %ret, 1
     %2 = mul i32 %a, %1
     store i32 %2, i32* %res
@@ -15,7 +15,7 @@
   }
 
   define void @adcx64_commute(i8 %cf, i64 %a, i64 %b, i64* %res) #0 {
-    %ret = call { i8, i64 } @llvm.x86.addcarryx.u64(i8 %cf, i64 %a, i64 %b)
+    %ret = call { i8, i64 } @llvm.x86.addcarry.64(i8 %cf, i64 %a, i64 %b)
     %1 = extractvalue { i8, i64 } %ret, 1
     %2 = mul i64 %a, %1
     store i64 %2, i64* %res
@@ -23,7 +23,7 @@
   }
 
   define void @adox32_commute(i8 %cf, i32 %a, i32 %b, i32* %res) #0 {
-    %ret = call { i8, i32 } @llvm.x86.addcarryx.u32(i8 %cf, i32 %a, i32 %b)
+    %ret = call { i8, i32 } @llvm.x86.addcarry.32(i8 %cf, i32 %a, i32 %b)
     %1 = extractvalue { i8, i32 } %ret, 1
     %2 = mul i32 %a, %1
     store i32 %2, i32* %res
@@ -31,7 +31,7 @@
   }
 
   define void @adox64_commute(i8 %cf, i64 %a, i64 %b, i64* %res) #0 {
-    %ret = call { i8, i64 } @llvm.x86.addcarryx.u64(i8 %cf, i64 %a, i64 %b)
+    %ret = call { i8, i64 } @llvm.x86.addcarry.64(i8 %cf, i64 %a, i64 %b)
     %1 = extractvalue { i8, i64 } %ret, 1
     %2 = mul i64 %a, %1
     store i64 %2, i64* %res
@@ -39,10 +39,10 @@
   }
 
   ; Function Attrs: nounwind readnone
-  declare { i8, i32 } @llvm.x86.addcarryx.u32(i8, i32, i32) #1
+  declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32) #1
 
   ; Function Attrs: nounwind readnone
-  declare { i8, i64 } @llvm.x86.addcarryx.u64(i8, i64, i64) #1
+  declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64) #1
 
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(i8*, i8**) #2
diff --git a/test/CodeGen/X86/adx-intrinsics-upgrade.ll b/test/CodeGen/X86/adx-intrinsics-upgrade.ll
index fcb7165fb617c29aacef0781aa5d820ec3064b6f..34f8ff8725958e0fe2cf20d84eac2e3c2dec7bc1 100644
--- a/test/CodeGen/X86/adx-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/adx-intrinsics-upgrade.ll
@@ -90,13 +90,11 @@ define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
 define i32 @load_crash(i64* nocapture readonly %a, i64* nocapture readonly %b, i64* %res)  {
 ; CHECK-LABEL: load_crash:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movq (%rdi), %rax ## encoding: [0x48,0x8b,0x07]
-; CHECK-NEXT:    xorl %ecx, %ecx ## encoding: [0x31,0xc9]
-; CHECK-NEXT:    addb $-1, %cl ## encoding: [0x80,0xc1,0xff]
-; CHECK-NEXT:    adcq (%rsi), %rax ## encoding: [0x48,0x13,0x06]
-; CHECK-NEXT:    setb %cl ## encoding: [0x0f,0x92,0xc1]
-; CHECK-NEXT:    movq %rax, (%rdx) ## encoding: [0x48,0x89,0x02]
-; CHECK-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
+; CHECK-NEXT:    movq (%rdi), %rcx ## encoding: [0x48,0x8b,0x0f]
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    addq (%rsi), %rcx ## encoding: [0x48,0x03,0x0e]
+; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT:    movq %rcx, (%rdx) ## encoding: [0x48,0x89,0x0a]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %1 = load i64, i64* %a, align 8
   %2 = load i64, i64* %b, align 8
@@ -111,9 +109,7 @@ define void @allzeros() {
 ; CHECK-LABEL: allzeros:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    addb $-1, %al ## encoding: [0x04,0xff]
-; CHECK-NEXT:    sbbq %rax, %rax ## encoding: [0x48,0x19,0xc0]
-; CHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT:    addq $0, %rax ## encoding: [0x48,0x83,0xc0,0x00]
 ; CHECK-NEXT:    movq %rax, 0 ## encoding: [0x48,0x89,0x04,0x25,0x00,0x00,0x00,0x00]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
 entry:
diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll
index ba820d01f50094b1469976c70208a7c83da3e92a..d37be0653289bbd36773f4941d48cc6b61771495 100644
--- a/test/CodeGen/X86/adx-intrinsics.ll
+++ b/test/CodeGen/X86/adx-intrinsics.ll
@@ -2,26 +2,8 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=NOADX
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=ADX
 
-declare { i8, i32 } @llvm.x86.addcarryx.u32(i8, i32, i32)
-
-define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarryx_u32:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
-; CHECK-NEXT:    adcl %edx, %esi ## encoding: [0x11,0xd6]
-; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT:    movl %esi, (%rcx) ## encoding: [0x89,0x31]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %ret = call { i8, i32 } @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b)
-  %1 = extractvalue { i8, i32 } %ret, 1
-  %2 = bitcast i8* %ptr to i32*
-  store i32 %1, i32* %2, align 1
-  %3 = extractvalue { i8, i32 } %ret, 0
-  ret i8 %3
-}
-
-define i8 @test_addcarryx_u32_load(i8 %c, i32* %aptr, i32 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarryx_u32_load:
+define i8 @test_addcarry_32_load(i8 %c, i32* %aptr, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_32_load:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
 ; CHECK-NEXT:    adcl (%rsi), %edx ## encoding: [0x13,0x16]
@@ -29,7 +11,7 @@ define i8 @test_addcarryx_u32_load(i8 %c, i32* %aptr, i32 %b, i8* %ptr) {
 ; CHECK-NEXT:    movl %edx, (%rcx) ## encoding: [0x89,0x11]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %a = load i32, i32* %aptr
-  %ret = call { i8, i32 } @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b)
+  %ret = call { i8, i32 } @llvm.x86.addcarry.32(i8 %c, i32 %a, i32 %b)
   %1 = extractvalue { i8, i32 } %ret, 1
   %2 = bitcast i8* %ptr to i32*
   store i32 %1, i32* %2, align 1
@@ -37,8 +19,8 @@ define i8 @test_addcarryx_u32_load(i8 %c, i32* %aptr, i32 %b, i8* %ptr) {
   ret i8 %3
 }
 
-define i8 @test_addcarryx_u32_load2(i8 %c, i32 %a, i32* %bptr, i8* %ptr) {
-; CHECK-LABEL: test_addcarryx_u32_load2:
+define i8 @test_addcarry_32_load2(i8 %c, i32 %a, i32* %bptr, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_32_load2:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
 ; CHECK-NEXT:    adcl (%rdx), %esi ## encoding: [0x13,0x32]
@@ -46,7 +28,7 @@ define i8 @test_addcarryx_u32_load2(i8 %c, i32 %a, i32* %bptr, i8* %ptr) {
 ; CHECK-NEXT:    movl %esi, (%rcx) ## encoding: [0x89,0x31]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load i32, i32* %bptr
-  %ret = call { i8, i32 } @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b)
+  %ret = call { i8, i32 } @llvm.x86.addcarry.32(i8 %c, i32 %a, i32 %b)
   %1 = extractvalue { i8, i32 } %ret, 1
   %2 = bitcast i8* %ptr to i32*
   store i32 %1, i32* %2, align 1
@@ -54,35 +36,17 @@ define i8 @test_addcarryx_u32_load2(i8 %c, i32 %a, i32* %bptr, i8* %ptr) {
   ret i8 %3
 }
 
-declare { i8, i64 } @llvm.x86.addcarryx.u64(i8, i64, i64)
-
-define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarryx_u64:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
-; CHECK-NEXT:    adcq %rdx, %rsi ## encoding: [0x48,0x11,0xd6]
-; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT:    movq %rsi, (%rcx) ## encoding: [0x48,0x89,0x31]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %ret = call { i8, i64 } @llvm.x86.addcarryx.u64(i8 %c, i64 %a, i64 %b)
-  %1 = extractvalue { i8, i64 } %ret, 1
-  %2 = bitcast i8* %ptr to i64*
-  store i64 %1, i64* %2, align 1
-  %3 = extractvalue { i8, i64 } %ret, 0
-  ret i8 %3
-}
-
-declare { i8, i32 } @llvm.x86.addcarry.u32(i8, i32, i32)
+declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
 
-define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarry_u32:
+define i8 @test_addcarry_32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
 ; CHECK-NEXT:    adcl %edx, %esi ## encoding: [0x11,0xd6]
 ; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; CHECK-NEXT:    movl %esi, (%rcx) ## encoding: [0x89,0x31]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %ret = call { i8, i32 } @llvm.x86.addcarry.u32(i8 %c, i32 %a, i32 %b)
+  %ret = call { i8, i32 } @llvm.x86.addcarry.32(i8 %c, i32 %a, i32 %b)
   %1 = extractvalue { i8, i32 } %ret, 1
   %2 = bitcast i8* %ptr to i32*
   store i32 %1, i32* %2, align 1
@@ -90,17 +54,17 @@ define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
   ret i8 %3
 }
 
-declare { i8, i64 } @llvm.x86.addcarry.u64(i8, i64, i64)
+declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64)
 
-define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarry_u64:
+define i8 @test_addcarry_64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_64:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
 ; CHECK-NEXT:    adcq %rdx, %rsi ## encoding: [0x48,0x11,0xd6]
 ; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; CHECK-NEXT:    movq %rsi, (%rcx) ## encoding: [0x48,0x89,0x31]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %ret = call { i8, i64 } @llvm.x86.addcarry.u64(i8 %c, i64 %a, i64 %b)
+  %ret = call { i8, i64 } @llvm.x86.addcarry.64(i8 %c, i64 %a, i64 %b)
   %1 = extractvalue { i8, i64 } %ret, 1
   %2 = bitcast i8* %ptr to i64*
   store i64 %1, i64* %2, align 1
@@ -108,17 +72,17 @@ define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
   ret i8 %3
 }
 
-declare { i8, i32 } @llvm.x86.subborrow.u32(i8, i32, i32)
+declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)
 
-define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
-; CHECK-LABEL: test_subborrow_u32:
+define i8 @test_subborrow_32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
 ; CHECK-NEXT:    sbbl %edx, %esi ## encoding: [0x19,0xd6]
 ; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; CHECK-NEXT:    movl %esi, (%rcx) ## encoding: [0x89,0x31]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %ret = call { i8, i32 } @llvm.x86.subborrow.u32(i8 %c, i32 %a, i32 %b)
+  %ret = call { i8, i32 } @llvm.x86.subborrow.32(i8 %c, i32 %a, i32 %b)
   %1 = extractvalue { i8, i32 } %ret, 1
   %2 = bitcast i8* %ptr to i32*
   store i32 %1, i32* %2, align 1
@@ -126,17 +90,17 @@ define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
   ret i8 %3
 }
 
-declare { i8, i64 } @llvm.x86.subborrow.u64(i8, i64, i64)
+declare { i8, i64 } @llvm.x86.subborrow.64(i8, i64, i64)
 
-define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
-; CHECK-LABEL: test_subborrow_u64:
+define i8 @test_subborrow_64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_64:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
 ; CHECK-NEXT:    sbbq %rdx, %rsi ## encoding: [0x48,0x19,0xd6]
 ; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; CHECK-NEXT:    movq %rsi, (%rcx) ## encoding: [0x48,0x89,0x31]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %ret = call { i8, i64 } @llvm.x86.subborrow.u64(i8 %c, i64 %a, i64 %b)
+  %ret = call { i8, i64 } @llvm.x86.subborrow.64(i8 %c, i64 %a, i64 %b)
   %1 = extractvalue { i8, i64 } %ret, 1
   %2 = bitcast i8* %ptr to i64*
   store i64 %1, i64* %2, align 1
@@ -148,18 +112,16 @@ define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
 define i32 @load_crash(i64* nocapture readonly %a, i64* nocapture readonly %b, i64* %res)  {
 ; CHECK-LABEL: load_crash:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movq (%rdi), %rax ## encoding: [0x48,0x8b,0x07]
-; CHECK-NEXT:    xorl %ecx, %ecx ## encoding: [0x31,0xc9]
-; CHECK-NEXT:    addb $-1, %cl ## encoding: [0x80,0xc1,0xff]
-; CHECK-NEXT:    adcq (%rsi), %rax ## encoding: [0x48,0x13,0x06]
-; CHECK-NEXT:    setb %cl ## encoding: [0x0f,0x92,0xc1]
-; CHECK-NEXT:    movq %rax, (%rdx) ## encoding: [0x48,0x89,0x02]
-; CHECK-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
+; CHECK-NEXT:    movq (%rdi), %rcx ## encoding: [0x48,0x8b,0x0f]
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    addq (%rsi), %rcx ## encoding: [0x48,0x03,0x0e]
+; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT:    movq %rcx, (%rdx) ## encoding: [0x48,0x89,0x0a]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %1 = load i64, i64* %a, align 8
   %2 = load i64, i64* %b, align 8
   %3 = bitcast i64* %res to i8*
-  %4 = call { i8, i64 } @llvm.x86.addcarryx.u64(i8 0, i64 %1, i64 %2)
+  %4 = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %1, i64 %2)
   %5 = extractvalue { i8, i64 } %4, 1
   %6 = bitcast i8* %3 to i64*
   store i64 %5, i64* %6, align 1
@@ -173,13 +135,11 @@ define void @allzeros() {
 ; CHECK-LABEL: allzeros:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    addb $-1, %al ## encoding: [0x04,0xff]
-; CHECK-NEXT:    sbbq %rax, %rax ## encoding: [0x48,0x19,0xc0]
-; CHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT:    addq $0, %rax ## encoding: [0x48,0x83,0xc0,0x00]
 ; CHECK-NEXT:    movq %rax, 0 ## encoding: [0x48,0x89,0x04,0x25,0x00,0x00,0x00,0x00]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
 entry:
-  %0 = call { i8, i64 } @llvm.x86.addcarryx.u64(i8 0, i64 0, i64 0)
+  %0 = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 0, i64 0)
   %1 = extractvalue { i8, i64 } %0, 1
   store i64 %1, i64* null, align 1
   %2 = extractvalue { i8, i64 } %0, 0
diff --git a/test/CodeGen/X86/and-load-fold.ll b/test/CodeGen/X86/and-load-fold.ll
index 29ab3242ce266779ab1fa508dc8199263bfcb6f4..4c49df1c5bc15cbf6f08f7088951d0d13befccca 100644
--- a/test/CodeGen/X86/and-load-fold.ll
+++ b/test/CodeGen/X86/and-load-fold.ll
@@ -1,15 +1,22 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-- -mcpu=generic < %s | FileCheck %s
 
 ; Verify that the DAGCombiner doesn't wrongly remove the 'and' from the dag.
 
 define i8 @foo(<4 x i8>* %V) {
 ; CHECK-LABEL: foo:
-; CHECK: pand
-; CHECK: ret
-entry:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pextrw $4, %xmm0, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %Vp = bitcast <4 x i8>* %V to <3 x i8>*
   %V3i8 = load <3 x i8>, <3 x i8>* %Vp, align 4
-  %0 = and <3 x i8> %V3i8, <i8 undef, i8 undef, i8 95>
-  %1 = extractelement <3 x i8> %0, i64 2
-  ret i8 %1
+  %t0 = and <3 x i8> %V3i8, <i8 undef, i8 undef, i8 95>
+  %t1 = extractelement <3 x i8> %t0, i64 2
+  ret i8 %t1
 }
+
diff --git a/test/CodeGen/X86/anyext.ll b/test/CodeGen/X86/anyext.ll
index 66be5216eb45b7aef4d21541d384dabbbd5d6b1a..93206737e795022f36b281cb4bb3afe38c665c6b 100644
--- a/test/CodeGen/X86/anyext.ll
+++ b/test/CodeGen/X86/anyext.ll
@@ -42,8 +42,8 @@ define i32 @bar(i32 %p, i16 zeroext %x) nounwind {
 ; X64-LABEL: bar:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divw %si
 ; X64-NEXT:    # kill: def $ax killed $ax def $eax
 ; X64-NEXT:    andl $1, %eax
diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll
index f0d2c1596f84bd217c28f26aa7f2ed0aff976089..353580bbf7e4b156a61bf104e0968a27a721f2fe 100644
--- a/test/CodeGen/X86/atomic_mi.ll
+++ b/test/CodeGen/X86/atomic_mi.ll
@@ -93,11 +93,11 @@ define void @store_atomic_imm_64(i64* %p) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl (%esi), %eax
 ; X32-NEXT:    movl 4(%esi), %edx
+; X32-NEXT:    xorl %ecx, %ecx
+; X32-NEXT:    movl $42, %ebx
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB3_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    xorl %ecx, %ecx
-; X32-NEXT:    movl $42, %ebx
 ; X32-NEXT:    lock cmpxchg8b (%esi)
 ; X32-NEXT:    jne .LBB3_1
 ; X32-NEXT:  # %bb.2: # %atomicrmw.end
@@ -132,11 +132,11 @@ define void @store_atomic_imm_64_big(i64* %p) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl (%esi), %eax
 ; X32-NEXT:    movl 4(%esi), %edx
+; X32-NEXT:    movl $23, %ecx
+; X32-NEXT:    movl $1215752192, %ebx # imm = 0x4876E800
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl $23, %ecx
-; X32-NEXT:    movl $1215752192, %ebx # imm = 0x4876E800
 ; X32-NEXT:    lock cmpxchg8b (%esi)
 ; X32-NEXT:    jne .LBB4_1
 ; X32-NEXT:  # %bb.2: # %atomicrmw.end
@@ -753,10 +753,10 @@ define void @and_64i(i64* %p) {
 ; X32-NEXT:    andl $2, %ebx
 ; X32-NEXT:    movl (%esi), %eax
 ; X32-NEXT:    movl 4(%esi), %edx
+; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB31_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    lock cmpxchg8b (%esi)
 ; X32-NEXT:    jne .LBB31_1
 ; X32-NEXT:  # %bb.2: # %atomicrmw.end
@@ -2245,11 +2245,11 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) {
 ; X32-NEXT:    .cfi_offset %edi, -16
 ; X32-NEXT:    .cfi_offset %ebx, -12
 ; X32-NEXT:    movl 20(%ebp), %esi
+; X32-NEXT:    movl 8(%ebp), %edi
 ; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    xorl %ebx, %ebx
-; X32-NEXT:    movl 8(%ebp), %edi
 ; X32-NEXT:    lock cmpxchg8b (%edi,%esi,8)
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index f090585951bfa90ce4cf4e131fb323a7ea4fd602..93b7c3350e0998d2bf1eb63ff6a115a5ad73adff 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -100,13 +100,11 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT:    vpavgb 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgb (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -268,94 +266,97 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v48i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[3,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[3,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[3,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm14 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm12 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm15 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[3,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm8
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm4
+; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm2, %xmm11, %xmm11
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[3,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm2, %xmm7, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm12, %xmm12
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm5, %xmm13, %xmm13
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm8
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm1, %xmm15, %xmm15
+; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm4, %xmm14, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm6, %xmm10, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vpaddd %xmm4, %xmm14, %xmm14
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm7, %xmm7
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT:    vpsubd %xmm7, %xmm1, %xmm10
-; AVX1-NEXT:    vpsubd %xmm7, %xmm9, %xmm9
-; AVX1-NEXT:    vpsubd %xmm7, %xmm8, %xmm8
-; AVX1-NEXT:    vpsubd %xmm7, %xmm11, %xmm11
-; AVX1-NEXT:    vpsubd %xmm7, %xmm12, %xmm12
-; AVX1-NEXT:    vpsubd %xmm7, %xmm13, %xmm5
-; AVX1-NEXT:    vpsubd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm7, %xmm15, %xmm1
-; AVX1-NEXT:    vpsubd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vpsubd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm13, %xmm13
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm5, %xmm12, %xmm12
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm7, %xmm11, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm2, %xmm15, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm0, %xmm10, %xmm10
+; AVX1-NEXT:    vpsubd %xmm0, %xmm9, %xmm9
+; AVX1-NEXT:    vpsubd %xmm0, %xmm8, %xmm8
+; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm11
+; AVX1-NEXT:    vpsubd %xmm0, %xmm14, %xmm14
+; AVX1-NEXT:    vpsubd %xmm0, %xmm13, %xmm5
+; AVX1-NEXT:    vpsubd %xmm0, %xmm12, %xmm1
+; AVX1-NEXT:    vpsubd %xmm0, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm3
 ; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm2
-; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm7, %xmm3
 ; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm3
-; AVX1-NEXT:    vpsrld $1, %xmm12, %xmm4
+; AVX1-NEXT:    vpsrld $1, %xmm14, %xmm4
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrld $1, %xmm11, %xmm4
 ; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm5
@@ -363,17 +364,17 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm5
 ; AVX1-NEXT:    vpsrld $1, %xmm10, %xmm6
 ; AVX1-NEXT:    vpackusdw %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -381,72 +382,66 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ;
 ; AVX2-LABEL: avg_v48i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[3,1,2,3]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastq 24(%rdi), %ymm1
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[3,1,2,3]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm5, %ymm6, %ymm5
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm2, %ymm7, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = mem[2,3,0,1]
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm3, %ymm8, %ymm3
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = mem[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    vpaddd %ymm6, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq 24(%rsi), %ymm6
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
+; AVX2-NEXT:    vpaddd %ymm6, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    vpaddd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = mem[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
+; AVX2-NEXT:    vpaddd %ymm6, %ymm3, %ymm3
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    vpaddd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = mem[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
+; AVX2-NEXT:    vpaddd %ymm6, %ymm5, %ymm5
 ; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT:    vpsubd %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vpsubd %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubd %ymm6, %ymm5, %ymm5
 ; AVX2-NEXT:    vpsubd %ymm6, %ymm0, %ymm7
+; AVX2-NEXT:    vpsubd %ymm6, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsubd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubd %ymm6, %ymm3, %ymm0
+; AVX2-NEXT:    vpsubd %ymm6, %ymm3, %ymm3
+; AVX2-NEXT:    vpsubd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpsubd %ymm6, %ymm5, %ymm0
 ; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT:    vpsrld $1, %ymm7, %ymm3
-; AVX2-NEXT:    vpsrld $1, %ymm5, %ymm5
 ; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm4
+; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX2-NEXT:    vpackusdw %xmm6, %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm7
-; AVX2-NEXT:    vpackusdw %xmm7, %xmm4, %xmm4
-; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm4
-; AVX2-NEXT:    vpackusdw %xmm4, %xmm5, %xmm4
-; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT:    vpsrld $1, %ymm7, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT:    vpackusdw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
+; AVX2-NEXT:    vpackusdw %xmm7, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm5, %xmm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
+; AVX2-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
+; AVX2-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm5
 ; AVX2-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm2
+; AVX2-NEXT:    vpackusdw %xmm2, %xmm4, %xmm2
+; AVX2-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
 ; AVX2-NEXT:    vzeroupper
@@ -454,16 +449,13 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ;
 ; AVX512F-LABEL: avg_v48i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm5
-; AVX512F-NEXT:    vpavgb %xmm5, %xmm4, %xmm4
-; AVX512F-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm1
 ; AVX512F-NEXT:    vmovdqu %xmm1, (%rax)
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
@@ -471,37 +463,16 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ;
 ; AVX512BW-LABEL: avg_v48i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512BW-NEXT:    vpaddd %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
-; AVX512BW-NEXT:    vpaddd %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpsubd %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpsubd %zmm1, %zmm3, %zmm3
-; AVX512BW-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrld $1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrld $1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpsrld $1, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpmovdw %zmm2, %ymm2
-; AVX512BW-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu %ymm1, (%rax)
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, (%rax)
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <48 x i8>, <48 x i8>* %a
@@ -535,20 +506,16 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v64i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpavgb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX1-NEXT:    vmovdqa 48(%rsi), %xmm3
+; AVX1-NEXT:    vpavgb 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgb (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpavgb 48(%rdi), %xmm3, %xmm1
+; AVX1-NEXT:    vpavgb 32(%rdi), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -662,13 +629,11 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpavgw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT:    vpavgw 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -719,20 +684,16 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v32i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpavgw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpavgw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpavgw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpavgw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX1-NEXT:    vmovdqa 48(%rsi), %xmm3
+; AVX1-NEXT:    vpavgw 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpavgw 48(%rdi), %xmm3, %xmm1
+; AVX1-NEXT:    vpavgw 32(%rdi), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -874,13 +835,11 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v32i8_2:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -931,16 +890,16 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v64i8_2:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX1-NEXT:    vpavgb %xmm0, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX1-NEXT:    vmovdqa 48(%rsi), %xmm3
 ; AVX1-NEXT:    vpavgb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vpavgb %xmm1, %xmm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpavgb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpavgb %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vpavgb %xmm3, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -1055,13 +1014,11 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v16i16_2:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpavgw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vpavgw 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1112,20 +1069,16 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
 ;
 ; AVX1-LABEL: avg_v32i16_2:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpavgw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpavgw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpavgw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpavgw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT:    vpavgw 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpavgw 48(%rsi), %xmm3, %xmm1
+; AVX1-NEXT:    vpavgw 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -1254,12 +1207,10 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
 ;
 ; AVX1-LABEL: avg_v32i8_const:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275]
-; AVX1-NEXT:    # xmm2 = mem[0,0]
-; AVX1-NEXT:    vpavgb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX1-NEXT:    # xmm0 = mem[0,0]
+; AVX1-NEXT:    vpavgb 16(%rdi), %xmm0, %xmm1
+; AVX1-NEXT:    vpavgb (%rdi), %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -1308,20 +1259,16 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
 ;
 ; AVX1-LABEL: avg_v64i8_const:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275]
-; AVX1-NEXT:    # xmm3 = mem[0,0]
-; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpavgb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX1-NEXT:    # xmm0 = mem[0,0]
+; AVX1-NEXT:    vpavgb 16(%rdi), %xmm0, %xmm1
+; AVX1-NEXT:    vpavgb (%rdi), %xmm0, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpavgb 48(%rdi), %xmm0, %xmm2
+; AVX1-NEXT:    vpavgb 32(%rdi), %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
+; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1420,11 +1367,9 @@ define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
 ;
 ; AVX1-LABEL: avg_v16i16_const:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpavgw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpavgw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpavgw 16(%rdi), %xmm0, %xmm1
+; AVX1-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -1473,19 +1418,15 @@ define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
 ;
 ; AVX1-LABEL: avg_v32i16_const:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpavgw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpavgw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpavgw 16(%rdi), %xmm0, %xmm1
+; AVX1-NEXT:    vpavgw (%rdi), %xmm0, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpavgw 48(%rdi), %xmm0, %xmm2
+; AVX1-NEXT:    vpavgw 32(%rdi), %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpavgw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpavgw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
+; AVX1-NEXT:    vmovups %ymm1, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1727,131 +1668,97 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
 ; AVX1-NEXT:    pushq %rbp
 ; AVX1-NEXT:    movq %rsp, %rbp
 ; AVX1-NEXT:    andq $-32, %rsp
-; AVX1-NEXT:    subq $128, %rsp
+; AVX1-NEXT:    subq $96, %rsp
 ; AVX1-NEXT:    movq %rdi, %rax
-; AVX1-NEXT:    vmovdqa 144(%rbp), %ymm8
-; AVX1-NEXT:    vmovdqa 112(%rbp), %ymm9
-; AVX1-NEXT:    vmovdqa 80(%rbp), %ymm10
-; AVX1-NEXT:    vmovdqa 48(%rbp), %ymm11
-; AVX1-NEXT:    vmovdqa 16(%rbp), %ymm12
-; AVX1-NEXT:    vmovdqa 272(%rbp), %ymm13
-; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm14
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm15
-; AVX1-NEXT:    vpavgb %xmm14, %xmm15, %xmm14
-; AVX1-NEXT:    vmovdqa 304(%rbp), %ymm15
-; AVX1-NEXT:    vpavgb %xmm13, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm14
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vpavgb %xmm14, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa 336(%rbp), %ymm14
-; AVX1-NEXT:    vpavgb %xmm15, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpavgb 272(%rbp), %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpavgb 288(%rbp), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm0
 ; AVX1-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa 368(%rbp), %ymm1
-; AVX1-NEXT:    vpavgb %xmm14, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vpavgb 304(%rbp), %xmm1, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpavgb 320(%rbp), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm8, %ymm0
 ; AVX1-NEXT:    vmovaps %ymm0, (%rsp) # 32-byte Spill
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa 400(%rbp), %ymm2
-; AVX1-NEXT:    vpavgb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm1
-; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa 432(%rbp), %ymm1
-; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm2
-; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa 464(%rbp), %ymm2
-; AVX1-NEXT:    vpavgb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
-; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa 496(%rbp), %ymm1
-; AVX1-NEXT:    vpavgb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm6
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm2
-; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa 528(%rbp), %ymm2
-; AVX1-NEXT:    vpavgb %xmm1, %xmm7, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm7
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm1
-; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa 560(%rbp), %ymm1
-; AVX1-NEXT:    vpavgb %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm12
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm11, %xmm2
-; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa 592(%rbp), %ymm2
-; AVX1-NEXT:    vpavgb %xmm1, %xmm11, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm11
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm1
-; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa 624(%rbp), %ymm1
-; AVX1-NEXT:    vpavgb %xmm2, %xmm10, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm2
-; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa 656(%rbp), %ymm2
-; AVX1-NEXT:    vpavgb %xmm1, %xmm9, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
-; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa 176(%rbp), %ymm1
-; AVX1-NEXT:    vpavgb %xmm2, %xmm8, %xmm2
-; AVX1-NEXT:    vmovdqa 688(%rbp), %ymm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm13
-; AVX1-NEXT:    vpavgb %xmm2, %xmm13, %xmm2
-; AVX1-NEXT:    vpavgb %xmm8, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa 208(%rbp), %ymm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm13
-; AVX1-NEXT:    vmovdqa 720(%rbp), %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm15
-; AVX1-NEXT:    vpavgb %xmm1, %xmm15, %xmm1
-; AVX1-NEXT:    vpavgb %xmm2, %xmm8, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vmovdqa 240(%rbp), %ymm15
-; AVX1-NEXT:    vmovdqa 752(%rbp), %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm14
-; AVX1-NEXT:    vpavgb %xmm2, %xmm14, %xmm2
-; AVX1-NEXT:    vpavgb %xmm8, %xmm15, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm8, %ymm2
-; AVX1-NEXT:    vmovaps %ymm2, 480(%rdi)
-; AVX1-NEXT:    vmovaps %ymm1, 448(%rdi)
-; AVX1-NEXT:    vmovaps %ymm13, 416(%rdi)
-; AVX1-NEXT:    vmovaps %ymm0, 384(%rdi)
-; AVX1-NEXT:    vmovaps %ymm9, 352(%rdi)
+; AVX1-NEXT:    vpavgb 336(%rbp), %xmm2, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpavgb 352(%rbp), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm8, %ymm13
+; AVX1-NEXT:    vpavgb 368(%rbp), %xmm3, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vpavgb 384(%rbp), %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm8, %ymm14
+; AVX1-NEXT:    vpavgb 400(%rbp), %xmm4, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT:    vpavgb 416(%rbp), %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm15
+; AVX1-NEXT:    vpavgb 432(%rbp), %xmm5, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT:    vpavgb 448(%rbp), %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm8, %ymm12
+; AVX1-NEXT:    vpavgb 464(%rbp), %xmm6, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT:    vpavgb 480(%rbp), %xmm6, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm6
+; AVX1-NEXT:    vpavgb 496(%rbp), %xmm7, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT:    vpavgb 512(%rbp), %xmm7, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT:    vmovdqa 16(%rbp), %xmm0
+; AVX1-NEXT:    vmovdqa 32(%rbp), %xmm1
+; AVX1-NEXT:    vpavgb 528(%rbp), %xmm0, %xmm0
+; AVX1-NEXT:    vpavgb 544(%rbp), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm8
+; AVX1-NEXT:    vmovdqa 48(%rbp), %xmm0
+; AVX1-NEXT:    vmovdqa 64(%rbp), %xmm1
+; AVX1-NEXT:    vpavgb 560(%rbp), %xmm0, %xmm0
+; AVX1-NEXT:    vpavgb 576(%rbp), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm9
+; AVX1-NEXT:    vmovdqa 80(%rbp), %xmm0
+; AVX1-NEXT:    vmovdqa 96(%rbp), %xmm1
+; AVX1-NEXT:    vpavgb 592(%rbp), %xmm0, %xmm0
+; AVX1-NEXT:    vpavgb 608(%rbp), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm10
+; AVX1-NEXT:    vmovdqa 112(%rbp), %xmm0
+; AVX1-NEXT:    vmovdqa 128(%rbp), %xmm1
+; AVX1-NEXT:    vpavgb 624(%rbp), %xmm0, %xmm0
+; AVX1-NEXT:    vpavgb 640(%rbp), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa 144(%rbp), %xmm1
+; AVX1-NEXT:    vmovdqa 160(%rbp), %xmm2
+; AVX1-NEXT:    vpavgb 656(%rbp), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgb 672(%rbp), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovdqa 176(%rbp), %xmm2
+; AVX1-NEXT:    vmovdqa 192(%rbp), %xmm3
+; AVX1-NEXT:    vpavgb 688(%rbp), %xmm2, %xmm2
+; AVX1-NEXT:    vpavgb 704(%rbp), %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vmovdqa 208(%rbp), %xmm3
+; AVX1-NEXT:    vmovdqa 224(%rbp), %xmm4
+; AVX1-NEXT:    vpavgb 720(%rbp), %xmm3, %xmm3
+; AVX1-NEXT:    vpavgb 736(%rbp), %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vmovdqa 240(%rbp), %xmm4
+; AVX1-NEXT:    vpavgb 752(%rbp), %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa 256(%rbp), %xmm11
+; AVX1-NEXT:    vpavgb 768(%rbp), %xmm11, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vmovaps %ymm4, 480(%rdi)
+; AVX1-NEXT:    vmovaps %ymm3, 448(%rdi)
+; AVX1-NEXT:    vmovaps %ymm2, 416(%rdi)
+; AVX1-NEXT:    vmovaps %ymm1, 384(%rdi)
+; AVX1-NEXT:    vmovaps %ymm0, 352(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm10, 320(%rdi)
-; AVX1-NEXT:    vmovaps %ymm11, 288(%rdi)
-; AVX1-NEXT:    vmovaps %ymm12, 256(%rdi)
+; AVX1-NEXT:    vmovaps %ymm9, 288(%rdi)
+; AVX1-NEXT:    vmovaps %ymm8, 256(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm7, 224(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm6, 192(%rdi)
-; AVX1-NEXT:    vmovaps %ymm5, 160(%rdi)
-; AVX1-NEXT:    vmovaps %ymm4, 128(%rdi)
-; AVX1-NEXT:    vmovaps %ymm3, 96(%rdi)
+; AVX1-NEXT:    vmovaps %ymm12, 160(%rdi)
+; AVX1-NEXT:    vmovaps %ymm15, 128(%rdi)
+; AVX1-NEXT:    vmovaps %ymm14, 96(%rdi)
+; AVX1-NEXT:    vmovaps %ymm13, 64(%rdi)
 ; AVX1-NEXT:    vmovaps (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT:    vmovaps %ymm0, 64(%rdi)
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vmovaps %ymm0, 32(%rdi)
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
@@ -2011,13 +1918,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; SSE2-NEXT:    pushq %r13
 ; SSE2-NEXT:    pushq %r12
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    movaps (%rdi), %xmm1
-; SSE2-NEXT:    movaps (%rsi), %xmm0
-; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movaps (%rdi), %xmm0
+; SSE2-NEXT:    movaps (%rsi), %xmm1
+; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r13d
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
@@ -2025,34 +1931,37 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r14d
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r15d
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r12d
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r13d
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT:    leal -1(%rdx,%rsi), %edx
+; SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT:    leal -1(%rbx,%rdx), %edx
+; SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    leal -1(%rbp,%rdx), %edx
+; SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    leal -1(%rdi,%rdx), %r8d
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    leal -1(%rax,%rdx), %edi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSE2-NEXT:    leaq -1(%rax,%r9), %rax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    leaq -1(%rbp,%rbx), %rbp
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    leaq -1(%rdx,%rbx), %rdx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    leaq -1(%rcx,%rbx), %rcx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    leaq -1(%rsi,%rbx), %rsi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    leaq -1(%rdi,%rbx), %r8
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    leaq -1(%r11,%rbx), %r9
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    leaq -1(%r10,%rbx), %r11
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    leaq -1(%r13,%rbx), %r13
+; SSE2-NEXT:    leal -1(%rcx,%rax), %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    leal -1(%r9,%rax), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    leal -1(%r10,%rsi), %eax
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    leaq -1(%r11,%rsi), %rsi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
 ; SSE2-NEXT:    leaq -1(%r12,%rbx), %r12
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
@@ -2060,71 +1969,67 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
 ; SSE2-NEXT:    leaq -1(%r14,%rbx), %r14
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE2-NEXT:    leaq -1(%rdi,%rbx), %rdi
-; SSE2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE2-NEXT:    leaq -1(%rbp,%rbx), %r11
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE2-NEXT:    leaq -1(%rdi,%rbx), %rbx
-; SSE2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE2-NEXT:    leaq -1(%rbp,%rbx), %r10
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE2-NEXT:    leaq -1(%rdi,%rbx), %rbx
-; SSE2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    leaq -1(%r13,%rbx), %r9
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    leaq -1(%r10,%rbx), %rbx
-; SSE2-NEXT:    shrq %rax
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; SSE2-NEXT:    leaq -1(%r13,%rbx), %rbx
+; SSE2-NEXT:    shrl %eax
 ; SSE2-NEXT:    movd %eax, %xmm8
-; SSE2-NEXT:    shrq %rbp
-; SSE2-NEXT:    movd %ebp, %xmm15
-; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    shrl %ecx
+; SSE2-NEXT:    movd %ecx, %xmm15
+; SSE2-NEXT:    shrl %edx
 ; SSE2-NEXT:    movd %edx, %xmm9
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    shrl %edi
+; SSE2-NEXT:    movd %edi, %xmm2
+; SSE2-NEXT:    shrl %r8d
+; SSE2-NEXT:    movd %r8d, %xmm10
+; SSE2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SSE2-NEXT:    shrl %eax
+; SSE2-NEXT:    movd %eax, %xmm6
+; SSE2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SSE2-NEXT:    shrl %eax
+; SSE2-NEXT:    movd %eax, %xmm11
+; SSE2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SSE2-NEXT:    shrl %eax
+; SSE2-NEXT:    movd %eax, %xmm4
 ; SSE2-NEXT:    shrq %rsi
-; SSE2-NEXT:    movd %esi, %xmm10
-; SSE2-NEXT:    shrq %r8
-; SSE2-NEXT:    movd %r8d, %xmm6
-; SSE2-NEXT:    shrq %r9
-; SSE2-NEXT:    movd %r9d, %xmm11
-; SSE2-NEXT:    shrq %r11
-; SSE2-NEXT:    movd %r11d, %xmm5
-; SSE2-NEXT:    shrq %r13
-; SSE2-NEXT:    movd %r13d, %xmm12
+; SSE2-NEXT:    movd %esi, %xmm12
 ; SSE2-NEXT:    shrq %r12
 ; SSE2-NEXT:    movd %r12d, %xmm3
 ; SSE2-NEXT:    shrq %r15
 ; SSE2-NEXT:    movd %r15d, %xmm13
 ; SSE2-NEXT:    shrq %r14
 ; SSE2-NEXT:    movd %r14d, %xmm7
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    shrq %rax
-; SSE2-NEXT:    movd %eax, %xmm14
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    shrq %rax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    shrq %rax
-; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    shrq %r11
+; SSE2-NEXT:    movd %r11d, %xmm14
+; SSE2-NEXT:    shrq %r10
+; SSE2-NEXT:    movd %r10d, %xmm5
+; SSE2-NEXT:    shrq %r9
+; SSE2-NEXT:    movd %r9d, %xmm0
 ; SSE2-NEXT:    shrq %rbx
 ; SSE2-NEXT:    movd %ebx, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; SSE2-NEXT:    movdqu %xmm1, (%rax)
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT:    movdqu %xmm4, (%rax)
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    popq %r12
 ; SSE2-NEXT:    popq %r13
@@ -2141,236 +2046,115 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    pushq %r13
 ; AVX1-NEXT:    pushq %r12
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    subq $24, %rsp
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm6, %rdi
-; AVX1-NEXT:    vmovq %xmm6, %rbp
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm5, %rbx
-; AVX1-NEXT:    vmovq %xmm5, %rsi
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX1-NEXT:    vmovq %xmm5, %rcx
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm6, %r13
-; AVX1-NEXT:    vmovq %xmm6, %r12
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm5, %r11
-; AVX1-NEXT:    vmovq %xmm5, %r14
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm5, %r9
-; AVX1-NEXT:    vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %r15
+; AVX1-NEXT:    vmovq %xmm5, %r12
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm5, %rax
-; AVX1-NEXT:    addq %rdi, %rax
-; AVX1-NEXT:    movq %rax, %rdi
-; AVX1-NEXT:    vmovq %xmm5, %rax
-; AVX1-NEXT:    addq %rbp, %rax
-; AVX1-NEXT:    movq %rax, %rbp
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %r15
-; AVX1-NEXT:    addq %rbx, %r15
-; AVX1-NEXT:    vmovq %xmm4, %r10
-; AVX1-NEXT:    addq %rsi, %r10
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
-; AVX1-NEXT:    addq %rdx, %rax
-; AVX1-NEXT:    movq %rax, %rdx
-; AVX1-NEXT:    vmovq %xmm4, %r8
-; AVX1-NEXT:    addq %rcx, %r8
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm5, %rcx
-; AVX1-NEXT:    addq %r13, %rcx
-; AVX1-NEXT:    vmovq %xmm5, %rax
-; AVX1-NEXT:    addq %r12, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
-; AVX1-NEXT:    addq %r11, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vmovq %xmm4, %rax
-; AVX1-NEXT:    addq %r14, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX1-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    addq %r9, %rax
-; AVX1-NEXT:    movq %rax, %r13
-; AVX1-NEXT:    vmovq %xmm1, %rbx
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r13
+; AVX1-NEXT:    vmovq %xmm2, %r14
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX1-NEXT:    addq %rax, %rsi
-; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    vmovq %xmm0, %rsi
-; AVX1-NEXT:    addq %rax, %rsi
-; AVX1-NEXT:    addq $-1, %rdi
-; AVX1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rbp
-; AVX1-NEXT:    movq %rbp, (%rsp) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r15
-; AVX1-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r10
-; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rdx
-; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r8
-; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %r12d
-; AVX1-NEXT:    adcq $-1, %r12
-; AVX1-NEXT:    addq $-1, %rcx
-; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    addq $-1, %rax
-; AVX1-NEXT:    movl $0, %ecx
-; AVX1-NEXT:    adcq $-1, %rcx
-; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %edx
-; AVX1-NEXT:    adcq $-1, %rdx
-; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %r15d
-; AVX1-NEXT:    adcq $-1, %r15
-; AVX1-NEXT:    addq $-1, %r13
-; AVX1-NEXT:    movl $0, %r14d
-; AVX1-NEXT:    adcq $-1, %r14
-; AVX1-NEXT:    addq $-1, %rbx
-; AVX1-NEXT:    movl $0, %r11d
-; AVX1-NEXT:    adcq $-1, %r11
-; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %r8d
-; AVX1-NEXT:    adcq $-1, %r8
-; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %edi
-; AVX1-NEXT:    adcq $-1, %rdi
-; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %r10d
-; AVX1-NEXT:    adcq $-1, %r10
-; AVX1-NEXT:    movq %rsi, %rbp
-; AVX1-NEXT:    addq $-1, %rbp
-; AVX1-NEXT:    movl $0, %r9d
-; AVX1-NEXT:    adcq $-1, %r9
-; AVX1-NEXT:    shldq $63, %rbx, %r11
-; AVX1-NEXT:    shldq $63, %r13, %r14
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rbx, %r15
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rbx, %rdx
-; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    shldq $63, %rax, %rcx
-; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r12
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rsi
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rdx
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rcx
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %rbx
+; AVX1-NEXT:    vmovq %xmm5, %rdx
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX1-NEXT:    vmovq %xmm1, %r10
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; AVX1-NEXT:    vmovd %xmm6, %esi
+; AVX1-NEXT:    vpextrd $1, %xmm6, %edi
+; AVX1-NEXT:    vpextrd $2, %xmm6, %eax
+; AVX1-NEXT:    vpextrd $3, %xmm6, %ebp
+; AVX1-NEXT:    vpextrd $3, %xmm5, %ecx
+; AVX1-NEXT:    leal -1(%rbp,%rcx), %ecx
+; AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX1-NEXT:    vpextrd $2, %xmm5, %ecx
+; AVX1-NEXT:    leal -1(%rax,%rcx), %eax
+; AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX1-NEXT:    vpextrd $1, %xmm5, %ecx
+; AVX1-NEXT:    leal -1(%rdi,%rcx), %eax
+; AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX1-NEXT:    vmovd %xmm5, %ecx
+; AVX1-NEXT:    leal -1(%rsi,%rcx), %r8d
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rcx
+; AVX1-NEXT:    leal -1(%r15,%rbx), %r15d
+; AVX1-NEXT:    vmovq %xmm4, %rsi
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    leal -1(%r12,%rdx), %edx
+; AVX1-NEXT:    vmovd %xmm2, %r12d
+; AVX1-NEXT:    leal -1(%r11,%r9), %r11d
+; AVX1-NEXT:    vpextrd $1, %xmm2, %edi
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    movq (%rsp), %rbx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rbx, %rax
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %r13, %rbx
-; AVX1-NEXT:    shldq $63, %rbp, %r9
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rbp, %r10
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rbp, %rdi
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rbp, %r8
-; AVX1-NEXT:    vmovq %rbx, %xmm8
-; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vmovq %rcx, %xmm1
-; AVX1-NEXT:    vmovq %rdx, %xmm11
-; AVX1-NEXT:    vmovq %rsi, %xmm2
-; AVX1-NEXT:    vmovq %r12, %xmm13
-; AVX1-NEXT:    vmovq %r8, %xmm14
-; AVX1-NEXT:    vmovq %rdi, %xmm15
-; AVX1-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Reload
-; AVX1-NEXT:    # xmm9 = mem[0],zero
-; AVX1-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 8-byte Reload
-; AVX1-NEXT:    # xmm10 = mem[0],zero
-; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 8-byte Folded Reload
-; AVX1-NEXT:    # xmm12 = mem[0],zero
-; AVX1-NEXT:    vmovq %r15, %xmm3
-; AVX1-NEXT:    vmovq %r14, %xmm4
-; AVX1-NEXT:    vmovq %r11, %xmm5
-; AVX1-NEXT:    vmovq %r10, %xmm6
-; AVX1-NEXT:    vmovq %r9, %xmm7
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm11[0],xmm1[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm0[0,2],xmm8[0,2]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm13[0],xmm2[0]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm15[0],xmm14[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm11 = xmm1[0,2],xmm0[0,2]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm11, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm9[0]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm2[0,2]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm3[0,2]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    leal -1(%rax,%r10), %r10d
+; AVX1-NEXT:    vpextrd $2, %xmm2, %ebx
+; AVX1-NEXT:    leal -1(%r13,%rcx), %r9d
+; AVX1-NEXT:    vpextrd $3, %xmm2, %ecx
+; AVX1-NEXT:    leal -1(%r14,%rsi), %esi
+; AVX1-NEXT:    vpextrd $3, %xmm3, %eax
+; AVX1-NEXT:    leal -1(%rcx,%rax), %ecx
+; AVX1-NEXT:    vpextrd $2, %xmm3, %eax
+; AVX1-NEXT:    leal -1(%rbx,%rax), %ebx
+; AVX1-NEXT:    vpextrd $1, %xmm3, %eax
+; AVX1-NEXT:    leal -1(%rdi,%rax), %eax
+; AVX1-NEXT:    vmovd %xmm3, %edi
+; AVX1-NEXT:    leal -1(%r12,%rdi), %edi
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r13
+; AVX1-NEXT:    leal -1(%r12,%r13), %r12d
+; AVX1-NEXT:    vmovq %xmm0, %r13
+; AVX1-NEXT:    vmovq %xmm1, %r14
+; AVX1-NEXT:    leal -1(%r13,%r14), %ebp
+; AVX1-NEXT:    shrl %ebp
+; AVX1-NEXT:    vmovd %ebp, %xmm0
+; AVX1-NEXT:    shrl %r12d
+; AVX1-NEXT:    vpinsrb $1, %r12d, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %esi
+; AVX1-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %r9d
+; AVX1-NEXT:    vpinsrb $3, %r9d, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %r10d
+; AVX1-NEXT:    vpinsrb $4, %r10d, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %r11d
+; AVX1-NEXT:    vpinsrb $5, %r11d, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %edx
+; AVX1-NEXT:    vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %r15d
+; AVX1-NEXT:    vpinsrb $7, %r15d, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %edi
+; AVX1-NEXT:    vpinsrb $8, %edi, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %eax
+; AVX1-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %ebx
+; AVX1-NEXT:    vpinsrb $10, %ebx, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %ecx
+; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %r8d
+; AVX1-NEXT:    vpinsrb $12, %r8d, %xmm0, %xmm0
+; AVX1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX1-NEXT:    shrl %eax
+; AVX1-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX1-NEXT:    shrl %eax
+; AVX1-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX1-NEXT:    shrl %eax
+; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
-; AVX1-NEXT:    addq $24, %rsp
 ; AVX1-NEXT:    popq %rbx
 ; AVX1-NEXT:    popq %r12
 ; AVX1-NEXT:    popq %r13
@@ -2387,237 +2171,123 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    pushq %r13
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    subq $16, %rsp
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpextrq $1, %xmm4, %rbx
-; AVX2-NEXT:    vmovq %xmm4, %rbp
-; AVX2-NEXT:    vpextrq $1, %xmm3, %rdi
-; AVX2-NEXT:    vmovq %xmm3, %rcx
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm4, %r14
+; AVX2-NEXT:    vmovq %xmm4, %r13
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX2-NEXT:    vmovq %xmm3, %r9
-; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
-; AVX2-NEXT:    vmovq %xmm2, %r12
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpextrq $1, %xmm3, %r15
-; AVX2-NEXT:    vmovq %xmm3, %rsi
-; AVX2-NEXT:    vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT:    vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX2-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
-; AVX2-NEXT:    addq %rbx, %rax
-; AVX2-NEXT:    movq %rax, %rbx
-; AVX2-NEXT:    vmovq %xmm4, %r13
-; AVX2-NEXT:    addq %rbp, %r13
-; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
-; AVX2-NEXT:    addq %rdi, %r10
-; AVX2-NEXT:    vmovq %xmm3, %r14
-; AVX2-NEXT:    addq %rcx, %r14
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm5
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm7
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX2-NEXT:    addq %rdx, %rax
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    vmovq %xmm3, %r8
-; AVX2-NEXT:    addq %r9, %r8
-; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX2-NEXT:    addq %r11, %rax
-; AVX2-NEXT:    movq %rax, %r11
-; AVX2-NEXT:    vmovq %xmm2, %rax
-; AVX2-NEXT:    addq %r12, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX2-NEXT:    addq %r15, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    vmovq %xmm3, %rax
-; AVX2-NEXT:    addq %rsi, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    vmovq %xmm2, %rax
-; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpextrq $1, %xmm2, %rbp
-; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; AVX2-NEXT:    vmovq %xmm2, %r9
-; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
-; AVX2-NEXT:    addq %rax, %rdi
-; AVX2-NEXT:    vmovq %xmm1, %rdx
-; AVX2-NEXT:    vmovq %xmm0, %rsi
-; AVX2-NEXT:    addq %rdx, %rsi
-; AVX2-NEXT:    addq $-1, %rbx
-; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %eax
-; AVX2-NEXT:    adcq $-1, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r13
-; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %eax
-; AVX2-NEXT:    adcq $-1, %rax
-; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r10
-; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %eax
-; AVX2-NEXT:    adcq $-1, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r14
-; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %r13d
-; AVX2-NEXT:    adcq $-1, %r13
-; AVX2-NEXT:    addq $-1, %rcx
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %eax
-; AVX2-NEXT:    adcq $-1, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r8
-; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %r15d
-; AVX2-NEXT:    adcq $-1, %r15
-; AVX2-NEXT:    addq $-1, %r11
-; AVX2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %ebx
-; AVX2-NEXT:    adcq $-1, %rbx
-; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    movl $0, %r8d
-; AVX2-NEXT:    adcq $-1, %r8
-; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    movl $0, %eax
-; AVX2-NEXT:    adcq $-1, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    movl $0, %eax
-; AVX2-NEXT:    adcq $-1, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    movl $0, %r12d
-; AVX2-NEXT:    adcq $-1, %r12
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    addq $-1, %rcx
-; AVX2-NEXT:    movl $0, %r11d
-; AVX2-NEXT:    adcq $-1, %r11
-; AVX2-NEXT:    addq $-1, %rbp
-; AVX2-NEXT:    movl $0, %r14d
-; AVX2-NEXT:    adcq $-1, %r14
-; AVX2-NEXT:    addq $-1, %r9
-; AVX2-NEXT:    movl $0, %r10d
-; AVX2-NEXT:    adcq $-1, %r10
-; AVX2-NEXT:    addq $-1, %rdi
-; AVX2-NEXT:    movl $0, %edx
-; AVX2-NEXT:    adcq $-1, %rdx
-; AVX2-NEXT:    addq $-1, %rsi
-; AVX2-NEXT:    movl $0, %eax
-; AVX2-NEXT:    adcq $-1, %rax
-; AVX2-NEXT:    shldq $63, %rsi, %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    shldq $63, %rdi, %rdx
-; AVX2-NEXT:    shldq $63, %r9, %r10
-; AVX2-NEXT:    shldq $63, %rbp, %r14
-; AVX2-NEXT:    shldq $63, %rcx, %r11
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %r12
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %r9
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %r8
+; AVX2-NEXT:    vmovd %xmm4, %r12d
+; AVX2-NEXT:    vpextrd $2, %xmm4, %r15d
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vmovd %xmm7, %ecx
+; AVX2-NEXT:    vpextrd $2, %xmm7, %edi
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; AVX2-NEXT:    vmovd %xmm6, %ebx
+; AVX2-NEXT:    vpextrd $2, %xmm6, %esi
+; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm6
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vmovd %xmm5, %edx
+; AVX2-NEXT:    vpextrd $2, %xmm5, %ebp
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT:    vpextrd $2, %xmm6, %eax
+; AVX2-NEXT:    leal -1(%rbp,%rax), %eax
+; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    vmovd %xmm6, %eax
+; AVX2-NEXT:    leal -1(%rdx,%rax), %eax
+; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    vpextrd $2, %xmm7, %eax
+; AVX2-NEXT:    leal -1(%rsi,%rax), %r11d
+; AVX2-NEXT:    vmovd %xmm7, %eax
+; AVX2-NEXT:    leal -1(%rbx,%rax), %r10d
+; AVX2-NEXT:    vpextrd $2, %xmm5, %eax
+; AVX2-NEXT:    leal -1(%rdi,%rax), %r9d
+; AVX2-NEXT:    vmovd %xmm5, %eax
+; AVX2-NEXT:    leal -1(%rcx,%rax), %r8d
+; AVX2-NEXT:    vpextrd $2, %xmm3, %eax
+; AVX2-NEXT:    leal -1(%r15,%rax), %r15d
+; AVX2-NEXT:    vmovd %xmm3, %ecx
+; AVX2-NEXT:    leal -1(%r12,%rcx), %r12d
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rdx
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rax, %rbx
+; AVX2-NEXT:    leaq -1(%rax,%rdx), %rdx
+; AVX2-NEXT:    vmovq %xmm2, %rsi
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rax, %r15
+; AVX2-NEXT:    leaq -1(%rax,%rsi), %rsi
+; AVX2-NEXT:    vmovq %xmm4, %rbx
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %r13
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %rbp
-; AVX2-NEXT:    movq (%rsp), %rdi # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %rdi
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %rsi
-; AVX2-NEXT:    vmovq %rsi, %xmm8
-; AVX2-NEXT:    vmovq %rdi, %xmm9
-; AVX2-NEXT:    vmovq %rbp, %xmm10
-; AVX2-NEXT:    vmovq %r13, %xmm11
-; AVX2-NEXT:    vmovq %rax, %xmm12
-; AVX2-NEXT:    vmovq %r15, %xmm13
-; AVX2-NEXT:    vmovq %rbx, %xmm14
-; AVX2-NEXT:    vmovq %r8, %xmm15
-; AVX2-NEXT:    vmovq %r9, %xmm0
-; AVX2-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
-; AVX2-NEXT:    # xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovq %r12, %xmm2
-; AVX2-NEXT:    vmovq %r11, %xmm3
-; AVX2-NEXT:    vmovq %r14, %xmm4
-; AVX2-NEXT:    vmovq %r10, %xmm5
-; AVX2-NEXT:    vmovq %rdx, %xmm6
-; AVX2-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
-; AVX2-NEXT:    # xmm7 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm10[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm13[0],xmm12[0]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm15[0],xmm14[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm10, %ymm9
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm8, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    leaq -1(%rax,%rbx), %rbx
+; AVX2-NEXT:    vpextrq $1, %xmm4, %rbp
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    leaq -1(%rax,%rbp), %rbp
+; AVX2-NEXT:    vmovq %xmm1, %rdi
+; AVX2-NEXT:    leaq -1(%r13,%rdi), %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    leaq -1(%r14,%rax), %rax
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %r13
+; AVX2-NEXT:    leaq -1(%rcx,%r13), %r13
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX2-NEXT:    leaq -1(%rcx,%r14), %rcx
+; AVX2-NEXT:    shrq %rsi
+; AVX2-NEXT:    vmovd %esi, %xmm0
+; AVX2-NEXT:    shrq %rdx
+; AVX2-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    shrq %rbx
+; AVX2-NEXT:    vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX2-NEXT:    shrq %rbp
+; AVX2-NEXT:    vpinsrb $3, %ebp, %xmm0, %xmm0
+; AVX2-NEXT:    shrq %rdi
+; AVX2-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; AVX2-NEXT:    shrq %rax
+; AVX2-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    shrq %r13
+; AVX2-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    shrl %r12d
+; AVX2-NEXT:    vpinsrb $8, %r12d, %xmm0, %xmm0
+; AVX2-NEXT:    shrl %r15d
+; AVX2-NEXT:    vpinsrb $9, %r15d, %xmm0, %xmm0
+; AVX2-NEXT:    shrl %r8d
+; AVX2-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
+; AVX2-NEXT:    shrl %r9d
+; AVX2-NEXT:    vpinsrb $11, %r9d, %xmm0, %xmm0
+; AVX2-NEXT:    shrl %r10d
+; AVX2-NEXT:    vpinsrb $12, %r10d, %xmm0, %xmm0
+; AVX2-NEXT:    shrl %r11d
+; AVX2-NEXT:    vpinsrb $13, %r11d, %xmm0, %xmm0
+; AVX2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX2-NEXT:    shrl %eax
+; AVX2-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX2-NEXT:    shrl %eax
+; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
-; AVX2-NEXT:    addq $16, %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r12
 ; AVX2-NEXT:    popq %r13
@@ -2635,229 +2305,123 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    pushq %r13
 ; AVX512-NEXT:    pushq %r12
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    subq $24, %rsp
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT:    vpextrq $1, %xmm4, %rbx
-; AVX512-NEXT:    vmovq %xmm4, %rbp
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rdi
-; AVX512-NEXT:    vmovq %xmm3, %rsi
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512-NEXT:    vmovq %xmm3, %r8
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
-; AVX512-NEXT:    vmovq %xmm2, %r12
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm0
+; AVX512-NEXT:    vpextrq $1, %xmm4, %r14
+; AVX512-NEXT:    vmovq %xmm4, %r13
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %r15
-; AVX512-NEXT:    vmovq %xmm3, %r14
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r9
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512-NEXT:    vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX512-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT:    vpextrq $1, %xmm4, %rax
-; AVX512-NEXT:    addq %rbx, %rax
-; AVX512-NEXT:    movq %rax, %rbx
-; AVX512-NEXT:    vmovq %xmm4, %rax
-; AVX512-NEXT:    addq %rbp, %rax
-; AVX512-NEXT:    movq %rax, %rbp
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512-NEXT:    addq %rdi, %rax
-; AVX512-NEXT:    movq %rax, %rdi
-; AVX512-NEXT:    vmovq %xmm3, %r10
-; AVX512-NEXT:    addq %rsi, %r10
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rcx
-; AVX512-NEXT:    addq %rdx, %rcx
-; AVX512-NEXT:    vmovq %xmm3, %rax
-; AVX512-NEXT:    addq %r8, %rax
-; AVX512-NEXT:    movq %rax, %r8
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rsi
-; AVX512-NEXT:    addq %r13, %rsi
-; AVX512-NEXT:    vmovq %xmm2, %r11
-; AVX512-NEXT:    addq %r12, %r11
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512-NEXT:    addq %r15, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vmovq %xmm3, %rax
-; AVX512-NEXT:    addq %r14, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512-NEXT:    addq %r9, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vmovq %xmm2, %rax
-; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm5
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm7
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm1
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vmovq %xmm2, %r14
-; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
-; AVX512-NEXT:    addq %rax, %r9
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vmovq %xmm1, %rdx
-; AVX512-NEXT:    addq %rax, %rdx
-; AVX512-NEXT:    addq $-1, %rbx
-; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %rbp
-; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %rdi
-; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %r10
-; AVX512-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %rcx
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %r8
-; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %rsi
-; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %r13d
-; AVX512-NEXT:    adcq $-1, %r13
-; AVX512-NEXT:    addq $-1, %r11
-; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %r15d
-; AVX512-NEXT:    adcq $-1, %r15
-; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, %rsi
-; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    movl $0, %r12d
-; AVX512-NEXT:    adcq $-1, %r12
-; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    movl $0, %ebx
-; AVX512-NEXT:    adcq $-1, %rbx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT:    addq $-1, %rbp
-; AVX512-NEXT:    movl $0, %r11d
-; AVX512-NEXT:    adcq $-1, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    addq $-1, %rax
-; AVX512-NEXT:    movl $0, %r10d
-; AVX512-NEXT:    adcq $-1, %r10
-; AVX512-NEXT:    addq $-1, %r14
-; AVX512-NEXT:    movl $0, %r8d
-; AVX512-NEXT:    adcq $-1, %r8
-; AVX512-NEXT:    addq $-1, %r9
-; AVX512-NEXT:    movl $0, %edi
-; AVX512-NEXT:    adcq $-1, %rdi
-; AVX512-NEXT:    addq $-1, %rdx
-; AVX512-NEXT:    movl $0, %ecx
-; AVX512-NEXT:    adcq $-1, %rcx
-; AVX512-NEXT:    shldq $63, %rdx, %rcx
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    shldq $63, %r9, %rdi
-; AVX512-NEXT:    shldq $63, %r14, %r8
-; AVX512-NEXT:    shldq $63, %rax, %r10
-; AVX512-NEXT:    shldq $63, %rbp, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %rbx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %r12
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %rsi
-; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %r15
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-NEXT:    vmovd %xmm4, %r12d
+; AVX512-NEXT:    vpextrd $2, %xmm4, %r15d
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512-NEXT:    vmovd %xmm7, %ecx
+; AVX512-NEXT:    vpextrd $2, %xmm7, %edi
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; AVX512-NEXT:    vmovd %xmm6, %ebx
+; AVX512-NEXT:    vpextrd $2, %xmm6, %esi
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm6
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX512-NEXT:    vmovd %xmm5, %edx
+; AVX512-NEXT:    vpextrd $2, %xmm5, %ebp
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX512-NEXT:    vpextrd $2, %xmm6, %eax
+; AVX512-NEXT:    leal -1(%rbp,%rax), %eax
+; AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512-NEXT:    vmovd %xmm6, %eax
+; AVX512-NEXT:    leal -1(%rdx,%rax), %eax
+; AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512-NEXT:    vpextrd $2, %xmm7, %eax
+; AVX512-NEXT:    leal -1(%rsi,%rax), %r11d
+; AVX512-NEXT:    vmovd %xmm7, %eax
+; AVX512-NEXT:    leal -1(%rbx,%rax), %r10d
+; AVX512-NEXT:    vpextrd $2, %xmm5, %eax
+; AVX512-NEXT:    leal -1(%rdi,%rax), %r9d
+; AVX512-NEXT:    vmovd %xmm5, %eax
+; AVX512-NEXT:    leal -1(%rcx,%rax), %r8d
+; AVX512-NEXT:    vpextrd $2, %xmm3, %eax
+; AVX512-NEXT:    leal -1(%r15,%rax), %r15d
+; AVX512-NEXT:    vmovd %xmm3, %ecx
+; AVX512-NEXT:    leal -1(%r12,%rcx), %r12d
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %r13
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT:    leaq -1(%rax,%rdx), %rdx
+; AVX512-NEXT:    vmovq %xmm2, %rsi
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %rsi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT:    leaq -1(%rax,%rsi), %rsi
+; AVX512-NEXT:    vmovq %xmm4, %rbx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %rcx
+; AVX512-NEXT:    leaq -1(%rax,%rbx), %rbx
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rbp
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %rax
-; AVX512-NEXT:    movq (%rsp), %r14 # 8-byte Reload
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %r14
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %r9
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %rbp
-; AVX512-NEXT:    vmovq %rbp, %xmm8
-; AVX512-NEXT:    vmovq %r9, %xmm9
-; AVX512-NEXT:    vmovq %r14, %xmm10
-; AVX512-NEXT:    vmovq %rax, %xmm11
-; AVX512-NEXT:    vmovq %rcx, %xmm12
-; AVX512-NEXT:    vmovq %rsi, %xmm13
-; AVX512-NEXT:    vmovq %r13, %xmm14
-; AVX512-NEXT:    vmovq %r15, %xmm15
-; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[0],zero
-; AVX512-NEXT:    vmovq %r12, %xmm1
-; AVX512-NEXT:    vmovq %rbx, %xmm2
-; AVX512-NEXT:    vmovq %r11, %xmm3
-; AVX512-NEXT:    vmovq %r10, %xmm4
-; AVX512-NEXT:    vmovq %r8, %xmm5
-; AVX512-NEXT:    vmovq %rdi, %xmm6
-; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
-; AVX512-NEXT:    # xmm7 = mem[0],zero
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm10[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm13[0],xmm12[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm15[0],xmm14[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm10, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
-; AVX512-NEXT:    vpmovqd %zmm8, %ymm2
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm7[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
-; AVX512-NEXT:    addq $24, %rsp
+; AVX512-NEXT:    leaq -1(%rax,%rbp), %rbp
+; AVX512-NEXT:    vmovq %xmm1, %rdi
+; AVX512-NEXT:    leaq -1(%r13,%rdi), %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512-NEXT:    leaq -1(%r14,%rax), %rax
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %r13
+; AVX512-NEXT:    leaq -1(%rcx,%r13), %r13
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX512-NEXT:    leaq -1(%rcx,%r14), %rcx
+; AVX512-NEXT:    shrq %rsi
+; AVX512-NEXT:    vmovd %esi, %xmm0
+; AVX512-NEXT:    shrq %rdx
+; AVX512-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    shrq %rbx
+; AVX512-NEXT:    vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX512-NEXT:    shrq %rbp
+; AVX512-NEXT:    vpinsrb $3, %ebp, %xmm0, %xmm0
+; AVX512-NEXT:    shrq %rdi
+; AVX512-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; AVX512-NEXT:    shrq %rax
+; AVX512-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    shrq %r13
+; AVX512-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX512-NEXT:    shrq %rcx
+; AVX512-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX512-NEXT:    shrl %r12d
+; AVX512-NEXT:    vpinsrb $8, %r12d, %xmm0, %xmm0
+; AVX512-NEXT:    shrl %r15d
+; AVX512-NEXT:    vpinsrb $9, %r15d, %xmm0, %xmm0
+; AVX512-NEXT:    shrl %r8d
+; AVX512-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
+; AVX512-NEXT:    shrl %r9d
+; AVX512-NEXT:    vpinsrb $11, %r9d, %xmm0, %xmm0
+; AVX512-NEXT:    shrl %r10d
+; AVX512-NEXT:    vpinsrb $12, %r10d, %xmm0, %xmm0
+; AVX512-NEXT:    shrl %r11d
+; AVX512-NEXT:    vpinsrb $13, %r11d, %xmm0, %xmm0
+; AVX512-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX512-NEXT:    shrl %eax
+; AVX512-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX512-NEXT:    shrl %eax
+; AVX512-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    popq %r12
 ; AVX512-NEXT:    popq %r13
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll
index ea42aa34d8ce4d11858b24ef2b3cf37e05980b54..f4d0921ae499d60dfc670eb157bdf63f99037ae1 100644
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -255,8 +255,8 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups (%rsi), %xmm0
 ; CHECK-NEXT:    vmovups 16(%rsi), %xmm1
-; CHECK-NEXT:    vmovups %xmm1, 16(%rdi)
 ; CHECK-NEXT:    vmovups %xmm0, (%rdi)
+; CHECK-NEXT:    vmovups %xmm1, 16(%rdi)
 ; CHECK-NEXT:    retq
 ;
 ; CHECK_O0-LABEL: add8i32:
@@ -300,8 +300,8 @@ define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps (%rsi), %xmm0
 ; CHECK-NEXT:    vmovaps 16(%rsi), %xmm1
-; CHECK-NEXT:    vmovaps %xmm1, 16(%rdi)
 ; CHECK-NEXT:    vmovaps %xmm0, (%rdi)
+; CHECK-NEXT:    vmovaps %xmm1, 16(%rdi)
 ; CHECK-NEXT:    retq
 ;
 ; CHECK_O0-LABEL: add4i64a16:
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index c9481ccdbf9c0667d04768e89ad8b6a12a87c9f1..e1db9bab294cd1db1ab5d52519bd99915a4d7adc 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -1843,42 +1843,42 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_extractf128:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
 ; SANDY-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
+; SANDY-NEXT:    vzeroupper # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_extractf128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_extractf128:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
 ; BROADWELL-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_extractf128:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
 ; SKYLAKE-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_extractf128:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
 ; SKX-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BDVER2-LABEL: test_extractf128:
@@ -3008,37 +3008,37 @@ define i32 @test_movmskpd(<4 x double> %a0) {
 ; GENERIC-LABEL: test_movmskpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movmskpd:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
+; SANDY-NEXT:    vzeroupper # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movmskpd:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vmovmskpd %ymm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_movmskpd:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vmovmskpd %ymm0, %eax # sched: [3:1.00]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_movmskpd:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_movmskpd:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BDVER2-LABEL: test_movmskpd:
@@ -3066,37 +3066,37 @@ define i32 @test_movmskps(<8 x float> %a0) {
 ; GENERIC-LABEL: test_movmskps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovmskps %ymm0, %eax # sched: [2:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movmskps:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovmskps %ymm0, %eax # sched: [2:1.00]
-; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
+; SANDY-NEXT:    vzeroupper # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movmskps:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vmovmskps %ymm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_movmskps:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vmovmskps %ymm0, %eax # sched: [3:1.00]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_movmskps:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmovmskps %ymm0, %eax # sched: [2:1.00]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_movmskps:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovmskps %ymm0, %eax # sched: [2:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BDVER2-LABEL: test_movmskps:
@@ -3126,7 +3126,7 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; GENERIC-NEXT:    #APP
 ; GENERIC-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movntdq:
@@ -3134,7 +3134,7 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; SANDY-NEXT:    #APP
 ; SANDY-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
 ; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
+; SANDY-NEXT:    vzeroupper # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntdq:
@@ -3142,7 +3142,7 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; HASWELL-NEXT:    #APP
 ; HASWELL-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_movntdq:
@@ -3150,7 +3150,7 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; BROADWELL-NEXT:    #APP
 ; BROADWELL-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_movntdq:
@@ -3158,7 +3158,7 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; SKYLAKE-NEXT:    #APP
 ; SKYLAKE-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_movntdq:
@@ -3166,7 +3166,7 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; SKX-NEXT:    #APP
 ; SKX-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BDVER2-LABEL: test_movntdq:
@@ -5240,7 +5240,7 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    vtestpd (%rdi), %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    adcl $0, %eax # sched: [2:0.67]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_testpd_ymm:
@@ -5250,7 +5250,7 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; SANDY-NEXT:    setb %al # sched: [1:0.50]
 ; SANDY-NEXT:    vtestpd (%rdi), %ymm0 # sched: [8:1.00]
 ; SANDY-NEXT:    adcl $0, %eax # sched: [2:0.67]
-; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
+; SANDY-NEXT:    vzeroupper # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_testpd_ymm:
@@ -5260,7 +5260,7 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; HASWELL-NEXT:    setb %al # sched: [1:0.50]
 ; HASWELL-NEXT:    vtestpd (%rdi), %ymm0 # sched: [8:1.00]
 ; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_testpd_ymm:
@@ -5270,7 +5270,7 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; BROADWELL-NEXT:    setb %al # sched: [1:0.50]
 ; BROADWELL-NEXT:    vtestpd (%rdi), %ymm0 # sched: [7:1.00]
 ; BROADWELL-NEXT:    adcl $0, %eax # sched: [1:0.50]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_testpd_ymm:
@@ -5280,7 +5280,7 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; SKYLAKE-NEXT:    setb %al # sched: [1:0.50]
 ; SKYLAKE-NEXT:    vtestpd (%rdi), %ymm0 # sched: [9:1.00]
 ; SKYLAKE-NEXT:    adcl $0, %eax # sched: [1:0.50]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_testpd_ymm:
@@ -5290,7 +5290,7 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; SKX-NEXT:    setb %al # sched: [1:0.50]
 ; SKX-NEXT:    vtestpd (%rdi), %ymm0 # sched: [9:1.00]
 ; SKX-NEXT:    adcl $0, %eax # sched: [1:0.50]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BDVER2-LABEL: test_testpd_ymm:
@@ -5426,7 +5426,7 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    vtestps (%rdi), %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    adcl $0, %eax # sched: [2:0.67]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_testps_ymm:
@@ -5436,7 +5436,7 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; SANDY-NEXT:    setb %al # sched: [1:0.50]
 ; SANDY-NEXT:    vtestps (%rdi), %ymm0 # sched: [8:1.00]
 ; SANDY-NEXT:    adcl $0, %eax # sched: [2:0.67]
-; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
+; SANDY-NEXT:    vzeroupper # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_testps_ymm:
@@ -5446,7 +5446,7 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; HASWELL-NEXT:    setb %al # sched: [1:0.50]
 ; HASWELL-NEXT:    vtestps (%rdi), %ymm0 # sched: [8:1.00]
 ; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_testps_ymm:
@@ -5456,7 +5456,7 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; BROADWELL-NEXT:    setb %al # sched: [1:0.50]
 ; BROADWELL-NEXT:    vtestps (%rdi), %ymm0 # sched: [7:1.00]
 ; BROADWELL-NEXT:    adcl $0, %eax # sched: [1:0.50]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_testps_ymm:
@@ -5466,7 +5466,7 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; SKYLAKE-NEXT:    setb %al # sched: [1:0.50]
 ; SKYLAKE-NEXT:    vtestps (%rdi), %ymm0 # sched: [9:1.00]
 ; SKYLAKE-NEXT:    adcl $0, %eax # sched: [1:0.50]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_testps_ymm:
@@ -5476,7 +5476,7 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; SKX-NEXT:    setb %al # sched: [1:0.50]
 ; SKX-NEXT:    vtestps (%rdi), %ymm0 # sched: [9:1.00]
 ; SKX-NEXT:    adcl $0, %eax # sched: [1:0.50]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BDVER2-LABEL: test_testps_ymm:
@@ -5976,32 +5976,32 @@ declare void @llvm.x86.avx.vzeroall() nounwind
 define void @test_zeroupper() {
 ; GENERIC-LABEL: test_zeroupper:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_zeroupper:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
+; SANDY-NEXT:    vzeroupper # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_zeroupper:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_zeroupper:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_zeroupper:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_zeroupper:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BDVER2-LABEL: test_zeroupper:
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index f1af384ce473d80f81eb23ddeb1d71002869f6b5..49f57fac70dce897a6501164c63c2e88871413ef 100644
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -29,11 +29,9 @@ define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
 define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
 ; CHECK-LABEL: trunc_16_8:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %B = trunc <16 x i16> %A to <16 x i8>
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index f470c97e4726d188ea86cf7983716b2af09c4908..91e161f15035347b86d72878afaf7220de05c67f 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -210,8 +210,8 @@ define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uw
 ;
 ; AVX2-LABEL: shuffle_v16i16_4501:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index 3662e39a641b088c2a45579d11bfcfdd0a26542d..1abeb76de0f4ac1fe5a3df4a08e9578cbf879b6b 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -145,9 +145,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
 ; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
 ; VZ-NEXT:    callq do_sse
 ; VZ-NEXT:    callq do_sse
-; VZ-NEXT:    vmovaps {{.*}}(%rip), %ymm0
-; VZ-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; VZ-NEXT:    vzeroupper
+; VZ-NEXT:    vmovaps g+{{.*}}(%rip), %xmm0
 ; VZ-NEXT:    callq do_sse
 ; VZ-NEXT:    decl %ebx
 ; VZ-NEXT:    jne .LBB3_3
@@ -176,8 +174,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
 ; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
 ; FAST-ymm-zmm-NEXT:    callq do_sse
 ; FAST-ymm-zmm-NEXT:    callq do_sse
-; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %ymm0
-; FAST-ymm-zmm-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; FAST-ymm-zmm-NEXT:    vmovaps g+{{.*}}(%rip), %xmm0
 ; FAST-ymm-zmm-NEXT:    callq do_sse
 ; FAST-ymm-zmm-NEXT:    decl %ebx
 ; FAST-ymm-zmm-NEXT:    jne .LBB3_3
@@ -206,9 +203,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
 ; BDVER2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; BDVER2-NEXT:    callq do_sse
 ; BDVER2-NEXT:    callq do_sse
-; BDVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
-; BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    vmovaps g+{{.*}}(%rip), %xmm0
 ; BDVER2-NEXT:    callq do_sse
 ; BDVER2-NEXT:    decl %ebx
 ; BDVER2-NEXT:    jne .LBB3_3
@@ -237,8 +232,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
 ; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; BTVER2-NEXT:    callq do_sse
 ; BTVER2-NEXT:    callq do_sse
-; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
-; BTVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; BTVER2-NEXT:    vmovaps g+{{.*}}(%rip), %xmm0
 ; BTVER2-NEXT:    callq do_sse
 ; BTVER2-NEXT:    decl %ebx
 ; BTVER2-NEXT:    jne .LBB3_3
diff --git a/test/CodeGen/X86/avx1-logical-load-folding.ll b/test/CodeGen/X86/avx1-logical-load-folding.ll
index 88521dedc1c93bee0f921350daddf837986e963c..c1ee3182fb1b0e9d8d309a5b60e222ddb26b79b3 100644
--- a/test/CodeGen/X86/avx1-logical-load-folding.ll
+++ b/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -8,18 +8,16 @@ define void @test1(float* %A, float* %C) #0 {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vandps LCPI0_0, %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vandps LCPI0_0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
-; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test1:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
-; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -37,18 +35,16 @@ define void @test2(float* %A, float* %C) #0 {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vorps LCPI1_0, %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vorps LCPI1_0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
-; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test2:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
-; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -66,18 +62,16 @@ define void @test3(float* %A, float* %C) #0 {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vxorps LCPI2_0, %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vxorps LCPI2_0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
-; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
-; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -94,18 +88,16 @@ define void @test4(float* %A, float* %C) #0 {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vandnps LCPI3_0, %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vandnps LCPI3_0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
-; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test4:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vandnps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vandnps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
-; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll
index aa625be4ded588d20c5c44b68f25e9a7224bba99..a1e22d02f4f05d0c59d84e9baae66396ce3cd14c 100644
--- a/test/CodeGen/X86/avx2-arith.ll
+++ b/test/CodeGen/X86/avx2-arith.ll
@@ -145,27 +145,23 @@ define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone
 define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
 ; X32-LABEL: mul_v16i8:
 ; X32:       # %bb.0:
-; X32-NEXT:    vpmovsxbw %xmm1, %ymm1
-; X32-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X32-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; X32-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; X32-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X32-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; X32-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; X32-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpmovsxbw %xmm1, %ymm1
-; X64-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X64-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; X64-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; X64-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X64-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; X64-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %x = mul <16 x i8> %i, %j
@@ -175,46 +171,30 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
 define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
 ; X32-LABEL: mul_v32i8:
 ; X32:       # %bb.0:
-; X32-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; X32-NEXT:    vpmovsxbw %xmm2, %ymm2
-; X32-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; X32-NEXT:    vpmovsxbw %xmm3, %ymm3
+; X32-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; X32-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X32-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; X32-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; X32-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X32-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; X32-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; X32-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-NEXT:    vpmovsxbw %xmm1, %ymm1
-; X32-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; X32-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X32-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; X32-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; X32-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X32-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; X32-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: mul_v32i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; X64-NEXT:    vpmovsxbw %xmm2, %ymm2
-; X64-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; X64-NEXT:    vpmovsxbw %xmm3, %ymm3
+; X64-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; X64-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X64-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; X64-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; X64-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X64-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; X64-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-NEXT:    vpmovsxbw %xmm1, %ymm1
-; X64-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X64-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; X64-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; X64-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X64-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %x = mul <32 x i8> %i, %j
   ret <32 x i8> %x
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index d4c2330431b181e42b6b76f89358518817585009..1060d70d0d0652927ad0cfdc25b3018d5c834ab3 100644
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -163,21 +163,17 @@ define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) {
 define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) {
 ; X32-LABEL: trunc_16i16_16i8:
 ; X32:       # %bb.0:
+; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X32-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; X32-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; X32-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: trunc_16i16_16i8:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X64-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; X64-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %t = trunc <16 x i16> %z to <16 x i8>
diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll
index 4cec0508f33e484aa9c34c70c2e3b5a958234c5e..2901827f401cf65f6098f275a26b55bf675c1e03 100644
--- a/test/CodeGen/X86/avx2-schedule.ll
+++ b/test/CodeGen/X86/avx2-schedule.ll
@@ -177,7 +177,7 @@ define <4 x i32> @test_extracti128(<8 x i16> %a0, <4 x i32> *%a1) {
 ; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
 ; GENERIC-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_extracti128:
@@ -185,7 +185,7 @@ define <4 x i32> @test_extracti128(<8 x i16> %a0, <4 x i32> *%a1) {
 ; HASWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
 ; HASWELL-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_extracti128:
@@ -193,7 +193,7 @@ define <4 x i32> @test_extracti128(<8 x i16> %a0, <4 x i32> *%a1) {
 ; BROADWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
 ; BROADWELL-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
 ; BROADWELL-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_extracti128:
@@ -201,7 +201,7 @@ define <4 x i32> @test_extracti128(<8 x i16> %a0, <4 x i32> *%a1) {
 ; SKYLAKE-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
 ; SKYLAKE-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
 ; SKYLAKE-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_extracti128:
@@ -209,7 +209,7 @@ define <4 x i32> @test_extracti128(<8 x i16> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
 ; SKX-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
 ; SKX-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_extracti128:
@@ -474,31 +474,31 @@ define <4 x float> @test_gatherqps_ymm(<4 x float> %a0, i8* %a1, <4 x i64> %a2,
 ; GENERIC-LABEL: test_gatherqps_ymm:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [5:0.50]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_gatherqps_ymm:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [28:3.67]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_gatherqps_ymm:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [24:5.00]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_gatherqps_ymm:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [25:1.00]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_gatherqps_ymm:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [25:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_gatherqps_ymm:
@@ -2920,31 +2920,31 @@ define <4 x i32> @test_pgatherqd_ymm(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x
 ; GENERIC-LABEL: test_pgatherqd_ymm:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [5:0.50]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pgatherqd_ymm:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [28:5.00]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pgatherqd_ymm:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [5:0.50]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pgatherqd_ymm:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [25:1.00]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pgatherqd_ymm:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [25:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_pgatherqd_ymm:
@@ -4089,31 +4089,31 @@ define i32 @test_pmovmskb(<32 x i8> %a0) {
 ; GENERIC-LABEL: test_pmovmskb:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovmskb:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmovmskb %ymm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmovmskb:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmovmskb %ymm0, %eax # sched: [3:1.00]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmovmskb:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmovmskb:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_pmovmskb:
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index 89d811f868165ffc6f5b97207eba81df876dc53b..ecdca99bea31289d66ba5ad76c9139ffad8a46cc 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -69,10 +69,10 @@ define float @test5(float %p) #0 {
 ; ALL-NEXT:  ## %bb.2: ## %return
 ; ALL-NEXT:    retq
 ; ALL-NEXT:  LBB3_1: ## %if.end
-; ALL-NEXT:    seta %al
-; ALL-NEXT:    movzbl %al, %eax
-; ALL-NEXT:    leaq {{.*}}(%rip), %rcx
+; ALL-NEXT:    vcmpltss %xmm0, %xmm1, %k1
+; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; ALL-NEXT:    retq
 entry:
   %cmp = fcmp oeq float %p, 0.000000e+00
diff --git a/test/CodeGen/X86/avx512-cvt-widen.ll b/test/CodeGen/X86/avx512-cvt-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..849a814e640dac1e014a959b49ce6eec56fc2ec1
--- /dev/null
+++ b/test/CodeGen/X86/avx512-cvt-widen.ll
@@ -0,0 +1,2516 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl  | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQNOVL --check-prefix=AVX512DQ
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
+
+
+define <16 x float> @sitof32(<16 x i32> %a) nounwind {
+; ALL-LABEL: sitof32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = sitofp <16 x i32> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @sltof864(<8 x i64> %a) {
+; NODQ-LABEL: sltof864:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: sltof864:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: sltof864:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; DQNOVL-NEXT:    retq
+  %b = sitofp <8 x i64> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+define <4 x double> @slto4f64(<4 x i64> %a) {
+; NODQ-LABEL: slto4f64:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: slto4f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2pd %ymm0, %ymm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: slto4f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT:    retq
+  %b = sitofp <4 x i64> %a to <4 x double>
+  ret <4 x double> %b
+}
+
+define <2 x double> @slto2f64(<2 x i64> %a) {
+; NODQ-LABEL: slto2f64:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm0
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: slto2f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2pd %xmm0, %xmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: slto2f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %b = sitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %b
+}
+
+define <2 x float> @sltof2f32(<2 x i64> %a) {
+; NODQ-LABEL: sltof2f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; NODQ-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: sltof2f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2ps %xmm0, %xmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: sltof2f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %b = sitofp <2 x i64> %a to <2 x float>
+  ret <2 x float>%b
+}
+
+define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
+; NODQ-LABEL: slto4f32_mem:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vmovdqu (%rdi), %xmm0
+; NODQ-NEXT:    vmovdqu 16(%rdi), %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: slto4f32_mem:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2psy (%rdi), %xmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: slto4f32_mem:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vmovups (%rdi), %ymm0
+; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %a1 = load <4 x i64>, <4 x i64>* %a, align 8
+  %b = sitofp <4 x i64> %a1 to <4 x float>
+  ret <4 x float>%b
+}
+
+define <4 x i64> @f64to4sl(<4 x double> %a) {
+; NODQ-LABEL: f64to4sl:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; NODQ-NEXT:    vcvttsd2si %xmm1, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm2
+; NODQ-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; NODQ-NEXT:    vcvttsd2si %xmm1, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm1
+; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NODQ-NEXT:    vcvttsd2si %xmm0, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm2
+; NODQ-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; NODQ-NEXT:    vcvttsd2si %xmm0, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm0
+; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NODQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: f64to4sl:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttpd2qq %ymm0, %ymm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: f64to4sl:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; DQNOVL-NEXT:    vcvttpd2qq %zmm0, %zmm0
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT:    retq
+  %b = fptosi <4 x double> %a to <4 x i64>
+  ret <4 x i64> %b
+}
+
+define <4 x i64> @f32to4sl(<4 x float> %a) {
+; NODQ-LABEL: f32to4sl:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; NODQ-NEXT:    vcvttss2si %xmm1, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm1
+; NODQ-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; NODQ-NEXT:    vcvttss2si %xmm2, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm2
+; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NODQ-NEXT:    vcvttss2si %xmm0, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm2
+; NODQ-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NODQ-NEXT:    vcvttss2si %xmm0, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm0
+; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NODQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: f32to4sl:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2qq %xmm0, %ymm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: f32to4sl:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; DQNOVL-NEXT:    vcvttps2qq %ymm0, %zmm0
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT:    retq
+  %b = fptosi <4 x float> %a to <4 x i64>
+  ret <4 x i64> %b
+}
+
+define <4 x float> @slto4f32(<4 x i64> %a) {
+; NODQ-LABEL: slto4f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT:    vzeroupper
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: slto4f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
+; VLDQ-NEXT:    vzeroupper
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: slto4f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %b = sitofp <4 x i64> %a to <4 x float>
+  ret <4 x float> %b
+}
+
+define <4 x float> @ulto4f32(<4 x i64> %a) {
+; NODQ-LABEL: ulto4f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT:    vzeroupper
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: ulto4f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
+; VLDQ-NEXT:    vzeroupper
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: ulto4f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; DQNOVL-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %b = uitofp <4 x i64> %a to <4 x float>
+  ret <4 x float> %b
+}
+
+define <8 x double> @ulto8f64(<8 x i64> %a) {
+; NODQ-LABEL: ulto8f64:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; NODQ-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; NODQ-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; NODQ-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; NODQ-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: ulto8f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: ulto8f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; DQNOVL-NEXT:    retq
+  %b = uitofp <8 x i64> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+define <16 x double> @ulto16f64(<16 x i64> %a) {
+; NODQ-LABEL: ulto16f64:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
+; NODQ-NEXT:    vpandq %zmm2, %zmm0, %zmm3
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; NODQ-NEXT:    vporq %zmm4, %zmm3, %zmm3
+; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; NODQ-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; NODQ-NEXT:    vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; NODQ-NEXT:    vsubpd %zmm6, %zmm0, %zmm0
+; NODQ-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
+; NODQ-NEXT:    vpandq %zmm2, %zmm1, %zmm2
+; NODQ-NEXT:    vporq %zmm4, %zmm2, %zmm2
+; NODQ-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; NODQ-NEXT:    vporq %zmm5, %zmm1, %zmm1
+; NODQ-NEXT:    vsubpd %zmm6, %zmm1, %zmm1
+; NODQ-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: ulto16f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; VLDQ-NEXT:    vcvtuqq2pd %zmm1, %zmm1
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: ulto16f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; DQNOVL-NEXT:    vcvtuqq2pd %zmm1, %zmm1
+; DQNOVL-NEXT:    retq
+  %b = uitofp <16 x i64> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+define <16 x i32> @f64to16si(<16 x float> %a) nounwind {
+; ALL-LABEL: f64to16si:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = fptosi <16 x float> %a to <16 x i32>
+  ret <16 x i32> %b
+}
+
+define <16 x i8> @f32to16sc(<16 x float> %f) {
+; ALL-LABEL: f32to16sc:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT:    vpmovdb %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %res = fptosi <16 x float> %f to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <16 x i16> @f32to16ss(<16 x float> %f) {
+; ALL-LABEL: f32to16ss:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT:    vpmovdw %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %res = fptosi <16 x float> %f to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <16 x i32> @f32to16ui(<16 x float> %a) nounwind {
+; ALL-LABEL: f32to16ui:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttps2udq %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = fptoui <16 x float> %a to <16 x i32>
+  ret <16 x i32> %b
+}
+
+define <16 x i8> @f32to16uc(<16 x float> %f) {
+; ALL-LABEL: f32to16uc:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT:    vpmovdb %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %res = fptoui <16 x float> %f to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <16 x i16> @f32to16us(<16 x float> %f) {
+; ALL-LABEL: f32to16us:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT:    vpmovdw %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %res = fptoui <16 x float> %f to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @f32to8ui(<8 x float> %a) nounwind {
+; NOVL-LABEL: f32to8ui:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; NOVL-NEXT:    vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f32to8ui:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttps2udq %ymm0, %ymm0
+; VL-NEXT:    retq
+  %b = fptoui <8 x float> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
+; NOVL-LABEL: f32to4ui:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; NOVL-NEXT:    vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f32to4ui:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttps2udq %xmm0, %xmm0
+; VL-NEXT:    retq
+  %b = fptoui <4 x float> %a to <4 x i32>
+  ret <4 x i32> %b
+}
+
+define <8 x i32> @f64to8ui(<8 x double> %a) nounwind {
+; ALL-LABEL: f64to8ui:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %b = fptoui <8 x double> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+define <8 x i16> @f64to8us(<8 x double> %f) {
+; NOVL-LABEL: f64to8us:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f64to8us:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT:    vpmovdw %ymm0, %xmm0
+; VL-NEXT:    vzeroupper
+; VL-NEXT:    retq
+  %res = fptoui <8 x double> %f to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <8 x i8> @f64to8uc(<8 x double> %f) {
+; NOVL-LABEL: f64to8uc:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT:    vpmovdb %zmm0, %xmm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f64to8uc:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT:    vpmovdb %ymm0, %xmm0
+; VL-NEXT:    vzeroupper
+; VL-NEXT:    retq
+  %res = fptoui <8 x double> %f to <8 x i8>
+  ret <8 x i8> %res
+}
+
+define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
+; NOVL-LABEL: f64to4ui:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; NOVL-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f64to4ui:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; VL-NEXT:    vzeroupper
+; VL-NEXT:    retq
+  %b = fptoui <4 x double> %a to <4 x i32>
+  ret <4 x i32> %b
+}
+
+define <8 x double> @sito8f64(<8 x i32> %a) {
+; ALL-LABEL: sito8f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %b = sitofp <8 x i32> %a to <8 x double>
+  ret <8 x double> %b
+}
+define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
+; KNL-LABEL: i32to8f64_mask:
+; KNL:       # %bb.0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; VLBW-LABEL: i32to8f64_mask:
+; VLBW:       # %bb.0:
+; VLBW-NEXT:    kmovd %edi, %k1
+; VLBW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; VLBW-NEXT:    retq
+;
+; VLNOBW-LABEL: i32to8f64_mask:
+; VLNOBW:       # %bb.0:
+; VLNOBW-NEXT:    kmovw %edi, %k1
+; VLNOBW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; VLNOBW-NEXT:    retq
+;
+; DQNOVL-LABEL: i32to8f64_mask:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    kmovw %edi, %k1
+; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; DQNOVL-NEXT:    retq
+;
+; AVX512BW-LABEL: i32to8f64_mask:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+  %1 = bitcast i8 %c to <8 x i1>
+  %2 = sitofp <8 x i32> %b to <8 x double>
+  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
+  ret <8 x double> %3
+}
+define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
+; KNL-LABEL: sito8f64_maskz:
+; KNL:       # %bb.0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+;
+; VLBW-LABEL: sito8f64_maskz:
+; VLBW:       # %bb.0:
+; VLBW-NEXT:    kmovd %edi, %k1
+; VLBW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; VLBW-NEXT:    retq
+;
+; VLNOBW-LABEL: sito8f64_maskz:
+; VLNOBW:       # %bb.0:
+; VLNOBW-NEXT:    kmovw %edi, %k1
+; VLNOBW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; VLNOBW-NEXT:    retq
+;
+; DQNOVL-LABEL: sito8f64_maskz:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    kmovw %edi, %k1
+; DQNOVL-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    retq
+;
+; AVX512BW-LABEL: sito8f64_maskz:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+  %1 = bitcast i8 %b to <8 x i1>
+  %2 = sitofp <8 x i32> %a to <8 x double>
+  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %3
+}
+
+define <8 x i32> @f64to8si(<8 x double> %a) {
+; ALL-LABEL: f64to8si:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %b = fptosi <8 x double> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+define <8 x i16> @f64to8ss(<8 x double> %f) {
+; NOVL-LABEL: f64to8ss:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f64to8ss:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT:    vpmovdw %ymm0, %xmm0
+; VL-NEXT:    vzeroupper
+; VL-NEXT:    retq
+  %res = fptosi <8 x double> %f to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <8 x i8> @f64to8sc(<8 x double> %f) {
+; NOVL-LABEL: f64to8sc:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT:    vpmovdb %zmm0, %xmm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f64to8sc:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT:    vpmovdb %ymm0, %xmm0
+; VL-NEXT:    vzeroupper
+; VL-NEXT:    retq
+  %res = fptosi <8 x double> %f to <8 x i8>
+  ret <8 x i8> %res
+}
+
+define <4 x i32> @f64to4si(<4 x double> %a) {
+; ALL-LABEL: f64to4si:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %b = fptosi <4 x double> %a to <4 x i32>
+  ret <4 x i32> %b
+}
+
+define <16 x float> @f64to16f32(<16 x double> %b) nounwind {
+; ALL-LABEL: f64to16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtpd2ps %zmm0, %ymm0
+; ALL-NEXT:    vcvtpd2ps %zmm1, %ymm1
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %a = fptrunc <16 x double> %b to <16 x float>
+  ret <16 x float> %a
+}
+
+define <4 x float> @f64to4f32(<4 x double> %b) {
+; ALL-LABEL: f64to4f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %a = fptrunc <4 x double> %b to <4 x float>
+  ret <4 x float> %a
+}
+
+define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
+; NOVLDQ-LABEL: f64to4f32_mask:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    vpslld $31, %xmm1, %xmm1
+; NOVLDQ-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NOVLDQ-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; NOVLDQ-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT:    vzeroupper
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: f64to4f32_mask:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vpslld $31, %xmm1, %xmm1
+; VLDQ-NEXT:    vpmovd2m %xmm1, %k1
+; VLDQ-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VLDQ-NEXT:    vzeroupper
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: f64to4f32_mask:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vpslld $31, %xmm1, %xmm1
+; VLNODQ-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; VLNODQ-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT:    vzeroupper
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: f64to4f32_mask:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vpslld $31, %xmm1, %xmm1
+; DQNOVL-NEXT:    vpmovd2m %zmm1, %k1
+; DQNOVL-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; DQNOVL-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %a = fptrunc <4 x double> %b to <4 x float>
+  %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
+  ret <4 x float> %c
+}
+
+define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
+; ALL-LABEL: f64tof32_inreg:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    retq
+  %ext = extractelement <2 x double> %a0, i32 0
+  %cvt = fptrunc double %ext to float
+  %res = insertelement <4 x float> %a1, float %cvt, i32 0
+  ret <4 x float> %res
+}
+
+define <8 x double> @f32to8f64(<8 x float> %b) nounwind {
+; ALL-LABEL: f32to8f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtps2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %a = fpext <8 x float> %b to <8 x double>
+  ret <8 x double> %a
+}
+
+define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) {
+; NOVL-LABEL: f32to4f64_mask:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; NOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVL-NEXT:    vcvtps2pd %xmm0, %ymm0
+; NOVL-NEXT:    vcmpltpd %zmm2, %zmm1, %k1
+; NOVL-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
+; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f32to4f64_mask:
+; VL:       # %bb.0:
+; VL-NEXT:    vcmpltpd %ymm2, %ymm1, %k1
+; VL-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
+; VL-NEXT:    retq
+  %a = fpext <4 x float> %b to <4 x double>
+  %mask = fcmp ogt <4 x double> %a1, %b1
+  %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
+  ret <4 x double> %c
+}
+
+define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
+; ALL-LABEL: f32tof64_inreg:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    retq
+  %ext = extractelement <4 x float> %a1, i32 0
+  %cvt = fpext float %ext to double
+  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  ret <2 x double> %res
+}
+
+define double @sltof64_load(i64* nocapture %e) {
+; ALL-LABEL: sltof64_load:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    retq
+entry:
+  %tmp1 = load i64, i64* %e, align 8
+  %conv = sitofp i64 %tmp1 to double
+  ret double %conv
+}
+
+define double @sitof64_load(i32* %e) {
+; ALL-LABEL: sitof64_load:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    retq
+entry:
+  %tmp1 = load i32, i32* %e, align 4
+  %conv = sitofp i32 %tmp1 to double
+  ret double %conv
+}
+
+define float @sitof32_load(i32* %e) {
+; ALL-LABEL: sitof32_load:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    retq
+entry:
+  %tmp1 = load i32, i32* %e, align 4
+  %conv = sitofp i32 %tmp1 to float
+  ret float %conv
+}
+
+define float @sltof32_load(i64* %e) {
+; ALL-LABEL: sltof32_load:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    retq
+entry:
+  %tmp1 = load i64, i64* %e, align 8
+  %conv = sitofp i64 %tmp1 to float
+  ret float %conv
+}
+
+define void @f32tof64_loadstore() {
+; ALL-LABEL: f32tof64_loadstore:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT:    retq
+entry:
+  %f = alloca float, align 4
+  %d = alloca double, align 8
+  %tmp = load float, float* %f, align 4
+  %conv = fpext float %tmp to double
+  store double %conv, double* %d, align 8
+  ret void
+}
+
+define void @f64tof32_loadstore() nounwind uwtable {
+; ALL-LABEL: f64tof32_loadstore:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; ALL-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT:    retq
+entry:
+  %f = alloca float, align 4
+  %d = alloca double, align 8
+  %tmp = load double, double* %d, align 8
+  %conv = fptrunc double %tmp to float
+  store float %conv, float* %f, align 4
+  ret void
+}
+
+define double @long_to_double(i64 %x) {
+; ALL-LABEL: long_to_double:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovq %rdi, %xmm0
+; ALL-NEXT:    retq
+   %res = bitcast i64 %x to double
+   ret double %res
+}
+
+define i64 @double_to_long(double %x) {
+; ALL-LABEL: double_to_long:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovq %xmm0, %rax
+; ALL-NEXT:    retq
+   %res = bitcast double %x to i64
+   ret i64 %res
+}
+
+define float @int_to_float(i32 %x) {
+; ALL-LABEL: int_to_float:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovd %edi, %xmm0
+; ALL-NEXT:    retq
+   %res = bitcast i32 %x to float
+   ret float %res
+}
+
+define i32 @float_to_int(float %x) {
+; ALL-LABEL: float_to_int:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovd %xmm0, %eax
+; ALL-NEXT:    retq
+   %res = bitcast float %x to i32
+   ret i32 %res
+}
+
+define <16 x double> @uito16f64(<16 x i32> %a) nounwind {
+; ALL-LABEL: uito16f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm1
+; ALL-NEXT:    vmovaps %zmm2, %zmm0
+; ALL-NEXT:    retq
+  %b = uitofp <16 x i32> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+define <8 x float> @slto8f32(<8 x i64> %a) {
+; NODQ-LABEL: slto8f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: slto8f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: slto8f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; DQNOVL-NEXT:    retq
+  %b = sitofp <8 x i64> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+define <16 x float> @slto16f32(<16 x i64> %a) {
+; NODQ-LABEL: slto16f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: slto16f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; VLDQ-NEXT:    vcvtqq2ps %zmm1, %ymm1
+; VLDQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: slto16f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; DQNOVL-NEXT:    vcvtqq2ps %zmm1, %ymm1
+; DQNOVL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; DQNOVL-NEXT:    retq
+  %b = sitofp <16 x i64> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @slto8f64(<8 x i64> %a) {
+; NODQ-LABEL: slto8f64:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: slto8f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: slto8f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; DQNOVL-NEXT:    retq
+  %b = sitofp <8 x i64> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+define <16 x double> @slto16f64(<16 x i64> %a) {
+; NODQ-LABEL: slto16f64:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm0
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm1
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: slto16f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; VLDQ-NEXT:    vcvtqq2pd %zmm1, %zmm1
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: slto16f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; DQNOVL-NEXT:    vcvtqq2pd %zmm1, %zmm1
+; DQNOVL-NEXT:    retq
+  %b = sitofp <16 x i64> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+define <8 x float> @ulto8f32(<8 x i64> %a) {
+; NODQ-LABEL: ulto8f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: ulto8f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: ulto8f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; DQNOVL-NEXT:    retq
+  %b = uitofp <8 x i64> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+define <16 x float> @ulto16f32(<16 x i64> %a) {
+; NODQ-LABEL: ulto16f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: ulto16f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; VLDQ-NEXT:    vcvtuqq2ps %zmm1, %ymm1
+; VLDQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: ulto16f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; DQNOVL-NEXT:    vcvtuqq2ps %zmm1, %ymm1
+; DQNOVL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; DQNOVL-NEXT:    retq
+  %b = uitofp <16 x i64> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
+; KNL-LABEL: uito8f64_mask:
+; KNL:       # %bb.0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; VLBW-LABEL: uito8f64_mask:
+; VLBW:       # %bb.0:
+; VLBW-NEXT:    kmovd %edi, %k1
+; VLBW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; VLBW-NEXT:    retq
+;
+; VLNOBW-LABEL: uito8f64_mask:
+; VLNOBW:       # %bb.0:
+; VLNOBW-NEXT:    kmovw %edi, %k1
+; VLNOBW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; VLNOBW-NEXT:    retq
+;
+; DQNOVL-LABEL: uito8f64_mask:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    kmovw %edi, %k1
+; DQNOVL-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; DQNOVL-NEXT:    retq
+;
+; AVX512BW-LABEL: uito8f64_mask:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+  %1 = bitcast i8 %c to <8 x i1>
+  %2 = uitofp <8 x i32> %b to <8 x double>
+  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
+  ret <8 x double> %3
+}
+define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
+; KNL-LABEL: uito8f64_maskz:
+; KNL:       # %bb.0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+;
+; VLBW-LABEL: uito8f64_maskz:
+; VLBW:       # %bb.0:
+; VLBW-NEXT:    kmovd %edi, %k1
+; VLBW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; VLBW-NEXT:    retq
+;
+; VLNOBW-LABEL: uito8f64_maskz:
+; VLNOBW:       # %bb.0:
+; VLNOBW-NEXT:    kmovw %edi, %k1
+; VLNOBW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; VLNOBW-NEXT:    retq
+;
+; DQNOVL-LABEL: uito8f64_maskz:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    kmovw %edi, %k1
+; DQNOVL-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    retq
+;
+; AVX512BW-LABEL: uito8f64_maskz:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+  %1 = bitcast i8 %b to <8 x i1>
+  %2 = uitofp <8 x i32> %a to <8 x double>
+  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %3
+}
+
+define <4 x double> @uito4f64(<4 x i32> %a) nounwind {
+; NOVL-LABEL: uito4f64:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; NOVL-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: uito4f64:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; VL-NEXT:    retq
+  %b = uitofp <4 x i32> %a to <4 x double>
+  ret <4 x double> %b
+}
+
+define <16 x float> @uito16f32(<16 x i32> %a) nounwind {
+; ALL-LABEL: uito16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = uitofp <16 x i32> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @uito8f64(<8 x i32> %a) {
+; ALL-LABEL: uito8f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %b = uitofp <8 x i32> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+define <8 x float> @uito8f32(<8 x i32> %a) nounwind {
+; NOVL-LABEL: uito8f32:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; NOVL-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: uito8f32:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; VL-NEXT:    retq
+  %b = uitofp <8 x i32> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
+; NOVL-LABEL: uito4f32:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; NOVL-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: uito4f32:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT:    retq
+  %b = uitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %b
+}
+
+define i32 @fptosi(float %a) nounwind {
+; ALL-LABEL: fptosi:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttss2si %xmm0, %eax
+; ALL-NEXT:    retq
+  %b = fptosi float %a to i32
+  ret i32 %b
+}
+
+define i32 @fptoui(float %a) nounwind {
+; ALL-LABEL: fptoui:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvttss2usi %xmm0, %eax
+; ALL-NEXT:    retq
+  %b = fptoui float %a to i32
+  ret i32 %b
+}
+
+define float @uitof32(i32 %a) nounwind {
+; ALL-LABEL: uitof32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
+; ALL-NEXT:    retq
+  %b = uitofp i32 %a to float
+  ret float %b
+}
+
+define double @uitof64(i32 %a) nounwind {
+; ALL-LABEL: uitof64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
+; ALL-NEXT:    retq
+  %b = uitofp i32 %a to double
+  ret double %b
+}
+
+define <16 x float> @sbto16f32(<16 x i32> %a) {
+; NODQ-LABEL: sbto16f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; NODQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; NODQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NODQ-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: sbto16f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; VLDQ-NEXT:    vpmovm2d %k0, %zmm0
+; VLDQ-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: sbto16f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k0
+; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
+; DQNOVL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; DQNOVL-NEXT:    retq
+  %mask = icmp slt <16 x i32> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %mask to <16 x float>
+  ret <16 x float> %1
+}
+
+define <16 x float> @scto16f32(<16 x i8> %a) {
+; ALL-LABEL: scto16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %1 = sitofp <16 x i8> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+define <16 x float> @ssto16f32(<16 x i16> %a) {
+; ALL-LABEL: ssto16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %1 = sitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+define <8 x double> @ssto16f64(<8 x i16> %a) {
+; ALL-LABEL: ssto16f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %1 = sitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+define <8 x double> @scto8f64(<8 x i8> %a) {
+; ALL-LABEL: scto8f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxbd %xmm0, %ymm0
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %1 = sitofp <8 x i8> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+define <16 x double> @scto16f64(<16 x i8> %a) {
+; ALL-LABEL: scto16f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; ALL-NEXT:    retq
+  %b = sitofp <16 x i8> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+define <16 x double> @sbto16f64(<16 x double> %a) {
+; NODQ-LABEL: sbto16f64:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; NODQ-NEXT:    vcmpltpd %zmm0, %zmm2, %k0
+; NODQ-NEXT:    vcmpltpd %zmm1, %zmm2, %k1
+; NODQ-NEXT:    kunpckbw %k0, %k1, %k1
+; NODQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; NODQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: sbto16f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; VLDQ-NEXT:    vcmpltpd %zmm0, %zmm2, %k0
+; VLDQ-NEXT:    vcmpltpd %zmm1, %zmm2, %k1
+; VLDQ-NEXT:    kunpckbw %k0, %k1, %k0
+; VLDQ-NEXT:    vpmovm2d %k0, %zmm1
+; VLDQ-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; VLDQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; VLDQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: sbto16f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; DQNOVL-NEXT:    vcmpltpd %zmm0, %zmm2, %k0
+; DQNOVL-NEXT:    vcmpltpd %zmm1, %zmm2, %k1
+; DQNOVL-NEXT:    kunpckbw %k0, %k1, %k0
+; DQNOVL-NEXT:    vpmovm2d %k0, %zmm1
+; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; DQNOVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; DQNOVL-NEXT:    retq
+  %cmpres = fcmp ogt <16 x double> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %cmpres to <16 x double>
+  ret <16 x double> %1
+}
+
+define <8 x double> @sbto8f64(<8 x double> %a) {
+; NOVLDQ-LABEL: sbto8f64:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; NOVLDQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
+; NOVLDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: sbto8f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
+; VLDQ-NEXT:    vpmovm2d %k0, %ymm0
+; VLDQ-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: sbto8f64:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
+; VLNODQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; VLNODQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; VLNODQ-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: sbto8f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; DQNOVL-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
+; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
+; DQNOVL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; DQNOVL-NEXT:    retq
+  %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
+  %1 = sitofp <8 x i1> %cmpres to <8 x double>
+  ret <8 x double> %1
+}
+
+define <8 x float> @sbto8f32(<8 x float> %a) {
+; ALL-LABEL: sbto8f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
+; ALL-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
+  %1 = sitofp <8 x i1> %cmpres to <8 x float>
+  ret <8 x float> %1
+}
+
+define <4 x float> @sbto4f32(<4 x float> %a) {
+; ALL-LABEL: sbto4f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; ALL-NEXT:    retq
+  %cmpres = fcmp ogt <4 x float> %a, zeroinitializer
+  %1 = sitofp <4 x i1> %cmpres to <4 x float>
+  ret <4 x float> %1
+}
+
+define <4 x double> @sbto4f64(<4 x double> %a) {
+; NOVL-LABEL: sbto4f64:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; NOVL-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
+; NOVL-NEXT:    vpmovqd %zmm0, %ymm0
+; NOVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; NOVL-NEXT:    retq
+;
+; VLDQ-LABEL: sbto4f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT:    vcmpltpd %ymm0, %ymm1, %k0
+; VLDQ-NEXT:    vpmovm2d %k0, %xmm0
+; VLDQ-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: sbto4f64:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT:    vcmpltpd %ymm0, %ymm1, %k1
+; VLNODQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; VLNODQ-NEXT:    retq
+  %cmpres = fcmp ogt <4 x double> %a, zeroinitializer
+  %1 = sitofp <4 x i1> %cmpres to <4 x double>
+  ret <4 x double> %1
+}
+
+define <2 x float> @sbto2f32(<2 x float> %a) {
+; ALL-LABEL: sbto2f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; ALL-NEXT:    retq
+  %cmpres = fcmp ogt <2 x float> %a, zeroinitializer
+  %1 = sitofp <2 x i1> %cmpres to <2 x float>
+  ret <2 x float> %1
+}
+
+define <2 x double> @sbto2f64(<2 x double> %a) {
+; NOVL-LABEL: sbto2f64:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; NOVL-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOVL-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; NOVL-NEXT:    retq
+;
+; VLDQ-LABEL: sbto2f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT:    vcmpltpd %xmm0, %xmm1, %k0
+; VLDQ-NEXT:    vpmovm2d %k0, %xmm0
+; VLDQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: sbto2f64:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT:    vcmpltpd %xmm0, %xmm1, %k1
+; VLNODQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VLNODQ-NEXT:    retq
+  %cmpres = fcmp ogt <2 x double> %a, zeroinitializer
+  %1 = sitofp <2 x i1> %cmpres to <2 x double>
+  ret <2 x double> %1
+}
+
+define <16 x float> @ucto16f32(<16 x i8> %a) {
+; ALL-LABEL: ucto16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = uitofp <16 x i8> %a to <16 x float>
+  ret <16 x float>%b
+}
+
+define <8 x double> @ucto8f64(<8 x i8> %a) {
+; ALL-LABEL: ucto8f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %b = uitofp <8 x i8> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+define <16 x float> @swto16f32(<16 x i16> %a) {
+; ALL-LABEL: swto16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = sitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @swto8f64(<8 x i16> %a) {
+; ALL-LABEL: swto8f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %b = sitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+define <16 x double> @swto16f64(<16 x i16> %a) {
+; ALL-LABEL: swto16f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxwd %ymm0, %zmm1
+; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; ALL-NEXT:    retq
+  %b = sitofp <16 x i16> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+define <16 x double> @ucto16f64(<16 x i8> %a) {
+; ALL-LABEL: ucto16f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; ALL-NEXT:    retq
+  %b = uitofp <16 x i8> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+define <16 x float> @uwto16f32(<16 x i16> %a) {
+; ALL-LABEL: uwto16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = uitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @uwto8f64(<8 x i16> %a) {
+; ALL-LABEL: uwto8f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %b = uitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+define <16 x double> @uwto16f64(<16 x i16> %a) {
+; ALL-LABEL: uwto16f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; ALL-NEXT:    retq
+  %b = uitofp <16 x i16> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+define <16 x float> @sito16f32(<16 x i32> %a) {
+; ALL-LABEL: sito16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = sitofp <16 x i32> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <16 x double> @sito16f64(<16 x i32> %a) {
+; ALL-LABEL: sito16f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm1
+; ALL-NEXT:    vmovaps %zmm2, %zmm0
+; ALL-NEXT:    retq
+  %b = sitofp <16 x i32> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+define <16 x float> @usto16f32(<16 x i16> %a) {
+; ALL-LABEL: usto16f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %b = uitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <16 x float> @ubto16f32(<16 x i32> %a) {
+; NODQ-LABEL: ubto16f32:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; NODQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; NODQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NODQ-NEXT:    vpsrld $31, %zmm0, %zmm0
+; NODQ-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: ubto16f32:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; VLDQ-NEXT:    vpmovm2d %k0, %zmm0
+; VLDQ-NEXT:    vpsrld $31, %zmm0, %zmm0
+; VLDQ-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: ubto16f32:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k0
+; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
+; DQNOVL-NEXT:    vpsrld $31, %zmm0, %zmm0
+; DQNOVL-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; DQNOVL-NEXT:    retq
+  %mask = icmp slt <16 x i32> %a, zeroinitializer
+  %1 = uitofp <16 x i1> %mask to <16 x float>
+  ret <16 x float> %1
+}
+
+define <16 x double> @ubto16f64(<16 x i32> %a) {
+; NODQ-LABEL: ubto16f64:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; NODQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; NODQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NODQ-NEXT:    vpsrld $31, %zmm0, %zmm1
+; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; NODQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: ubto16f64:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; VLDQ-NEXT:    vpmovm2d %k0, %zmm0
+; VLDQ-NEXT:    vpsrld $31, %zmm0, %zmm1
+; VLDQ-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; VLDQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; VLDQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: ubto16f64:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k0
+; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
+; DQNOVL-NEXT:    vpsrld $31, %zmm0, %zmm1
+; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; DQNOVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; DQNOVL-NEXT:    retq
+  %mask = icmp slt <16 x i32> %a, zeroinitializer
+  %1 = uitofp <16 x i1> %mask to <16 x double>
+  ret <16 x double> %1
+}
+
+define <8 x float> @ubto8f32(<8 x i32> %a) {
+; NOVL-LABEL: ubto8f32:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
+; NOVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216]
+; NOVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: ubto8f32:
+; VL:       # %bb.0:
+; VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
+; VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; VL-NEXT:    retq
+  %mask = icmp slt <8 x i32> %a, zeroinitializer
+  %1 = uitofp <8 x i1> %mask to <8 x float>
+  ret <8 x float> %1
+}
+
+define <8 x double> @ubto8f64(<8 x i32> %a) {
+; ALL-LABEL: ubto8f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
+; ALL-NEXT:    vpsrld $31, %ymm0, %ymm0
+; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT:    retq
+  %mask = icmp slt <8 x i32> %a, zeroinitializer
+  %1 = uitofp <8 x i1> %mask to <8 x double>
+  ret <8 x double> %1
+}
+
+define <4 x float> @ubto4f32(<4 x i32> %a) {
+; NOVL-LABEL: ubto4f32:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
+; NOVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: ubto4f32:
+; VL:       # %bb.0:
+; VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; VL-NEXT:    retq
+  %mask = icmp slt <4 x i32> %a, zeroinitializer
+  %1 = uitofp <4 x i1> %mask to <4 x float>
+  ret <4 x float> %1
+}
+
+define <4 x double> @ubto4f64(<4 x i32> %a) {
+; ALL-LABEL: ubto4f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    vpsrld $31, %xmm0, %xmm0
+; ALL-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; ALL-NEXT:    retq
+  %mask = icmp slt <4 x i32> %a, zeroinitializer
+  %1 = uitofp <4 x i1> %mask to <4 x double>
+  ret <4 x double> %1
+}
+
+define <2 x float> @ubto2f32(<2 x i32> %a) {
+; NOVL-LABEL: ubto2f32:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NOVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
+; NOVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: ubto2f32:
+; VL:       # %bb.0:
+; VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; VL-NEXT:    vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; VL-NEXT:    retq
+  %mask = icmp ne <2 x i32> %a, zeroinitializer
+  %1 = uitofp <2 x i1> %mask to <2 x float>
+  ret <2 x float> %1
+}
+
+define <2 x double> @ubto2f64(<2 x i32> %a) {
+; NOVL-LABEL: ubto2f64:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NOVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; NOVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NOVL-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: ubto2f64:
+; VL:       # %bb.0:
+; VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; VL-NEXT:    vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; VL-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VL-NEXT:    retq
+  %mask = icmp ne <2 x i32> %a, zeroinitializer
+  %1 = uitofp <2 x i1> %mask to <2 x double>
+  ret <2 x double> %1
+}
+
+define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f64toub:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; NOVLDQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; NOVLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT:    vzeroupper
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_2f64toub:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
+; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_2f64toub:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
+; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_2f64toub:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; DQNOVL-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; DQNOVL-NEXT:    vpslld $31, %ymm0, %ymm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %mask = fptoui <2 x double> %a to <2 x i1>
+  %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+  ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) {
+; NOVLDQ-LABEL: test_4f64toub:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVLDQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; NOVLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_4f64toub:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_4f64toub:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_4f64toub:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; DQNOVL-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT:    retq
+  %mask = fptoui <4 x double> %a to <4 x i1>
+  %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+  ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) {
+; NOVLDQ-LABEL: test_8f64toub:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_8f64toub:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_8f64toub:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VLNODQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_8f64toub:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; DQNOVL-NEXT:    vpslld $31, %ymm0, %ymm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    retq
+  %mask = fptoui <8 x double> %a to <8 x i1>
+  %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+  ret <8 x i64> %select
+}
+
+define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f32toub:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT:    vzeroupper
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_2f32toub:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_2f32toub:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_2f32toub:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %mask = fptoui <2 x float> %a to <2 x i1>
+  %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+  ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) {
+; NOVLDQ-LABEL: test_4f32toub:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_4f32toub:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_4f32toub:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_4f32toub:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT:    retq
+  %mask = fptoui <4 x float> %a to <4 x i1>
+  %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+  ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) {
+; NOVLDQ-LABEL: test_8f32toub:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; NOVLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_8f32toub:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_8f32toub:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; VLNODQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_8f32toub:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; DQNOVL-NEXT:    vpslld $31, %ymm0, %ymm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    retq
+  %mask = fptoui <8 x float> %a to <8 x i1>
+  %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+  ret <8 x i64> %select
+}
+
+define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) {
+; NODQ-LABEL: test_16f32toub:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vcvttps2dq %zmm0, %zmm0
+; NODQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; NODQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NODQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_16f32toub:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2dq %zmm0, %zmm0
+; VLDQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; VLDQ-NEXT:    vpmovd2m %zmm0, %k1
+; VLDQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_16f32toub:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvttps2dq %zmm0, %zmm0
+; DQNOVL-NEXT:    vpslld $31, %zmm0, %zmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    retq
+  %mask = fptoui <16 x float> %a to <16 x i1>
+  %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
+  ret <16 x i32> %select
+}
+
+define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f64tosb:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVLDQ-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; NOVLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT:    vzeroupper
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_2f64tosb:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_2f64tosb:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_2f64tosb:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; DQNOVL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %mask = fptosi <2 x double> %a to <2 x i1>
+  %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+  ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) {
+; NOVLDQ-LABEL: test_4f64tosb:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVLDQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_4f64tosb:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_4f64tosb:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_4f64tosb:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; DQNOVL-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT:    retq
+  %mask = fptosi <4 x double> %a to <4 x i1>
+  %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+  ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) {
+; NOVLDQ-LABEL: test_8f64tosb:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_8f64tosb:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_8f64tosb:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_8f64tosb:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    retq
+  %mask = fptosi <8 x double> %a to <8 x i1>
+  %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+  ret <8 x i64> %select
+}
+
+define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f32tosb:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT:    vzeroupper
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_2f32tosb:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_2f32tosb:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_2f32tosb:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT:    vzeroupper
+; DQNOVL-NEXT:    retq
+  %mask = fptosi <2 x float> %a to <2 x i1>
+  %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+  ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) {
+; NOVLDQ-LABEL: test_4f32tosb:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_4f32tosb:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_4f32tosb:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_4f32tosb:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT:    retq
+  %mask = fptosi <4 x float> %a to <4 x i1>
+  %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+  ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) {
+; NOVLDQ-LABEL: test_8f32tosb:
+; NOVLDQ:       # %bb.0:
+; NOVLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_8f32tosb:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; VLNODQ-LABEL: test_8f32tosb:
+; VLNODQ:       # %bb.0:
+; VLNODQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLNODQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_8f32tosb:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    retq
+  %mask = fptosi <8 x float> %a to <8 x i1>
+  %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+  ret <8 x i64> %select
+}
+
+define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) {
+; NODQ-LABEL: test_16f32tosb:
+; NODQ:       # %bb.0:
+; NODQ-NEXT:    vcvttps2dq %zmm0, %zmm0
+; NODQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NODQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; NODQ-NEXT:    retq
+;
+; VLDQ-LABEL: test_16f32tosb:
+; VLDQ:       # %bb.0:
+; VLDQ-NEXT:    vcvttps2dq %zmm0, %zmm0
+; VLDQ-NEXT:    vpmovd2m %zmm0, %k1
+; VLDQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT:    retq
+;
+; DQNOVL-LABEL: test_16f32tosb:
+; DQNOVL:       # %bb.0:
+; DQNOVL-NEXT:    vcvttps2dq %zmm0, %zmm0
+; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT:    retq
+  %mask = fptosi <16 x float> %a to <16 x i1>
+  %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
+  ret <16 x i32> %select
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index e99cdaf1ce92d16b3d7b5870f6061469dcd5b500..264b1ea841fd3105f2b94ed84b306e5edbff2c52 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -6,7 +6,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=KNL_WIDEN
 
 
 define <16 x float> @sitof32(<16 x i32> %a) nounwind {
@@ -14,11 +13,6 @@ define <16 x float> @sitof32(<16 x i32> %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sitof32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <16 x i32> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -63,36 +57,6 @@ define <8 x double> @sltof864(<8 x i64> %a) {
 ; DQNOVL:       # %bb.0:
 ; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sltof864:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <8 x i64> %a to <8 x double>
   ret <8 x double> %b
 }
@@ -125,22 +89,6 @@ define <4 x double> @slto4f64(<4 x i64> %a) {
 ; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
 ; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: slto4f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <4 x i64> %a to <4 x double>
   ret <4 x double> %b
 }
@@ -167,15 +115,6 @@ define <2 x double> @slto2f64(<2 x i64> %a) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: slto2f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <2 x i64> %a to <2 x double>
   ret <2 x double> %b
 }
@@ -204,17 +143,6 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sltof2f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
-; KNL_WIDEN-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <2 x i64> %a to <2 x float>
   ret <2 x float>%b
 }
@@ -222,20 +150,19 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
 define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
 ; NODQ-LABEL: slto4f32_mem:
 ; NODQ:       # %bb.0:
-; NODQ-NEXT:    vmovdqu (%rdi), %ymm0
+; NODQ-NEXT:    vmovdqu (%rdi), %xmm0
+; NODQ-NEXT:    vmovdqu 16(%rdi), %xmm1
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm0, %rax
 ; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
 ; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT:    vzeroupper
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; NODQ-NEXT:    retq
 ;
 ; VLDQ-LABEL: slto4f32_mem:
@@ -250,24 +177,6 @@ define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: slto4f32_mem:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vmovdqu (%rdi), %ymm0
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %a1 = load <4 x i64>, <4 x i64>* %a, align 8
   %b = sitofp <4 x i64> %a1 to <4 x float>
   ret <4 x float>%b
@@ -303,24 +212,6 @@ define <4 x i64> @f64to4sl(<4 x double> %a) {
 ; DQNOVL-NEXT:    vcvttpd2qq %zmm0, %zmm0
 ; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to4sl:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm1, %rax
-; KNL_WIDEN-NEXT:    vmovq %rax, %xmm2
-; KNL_WIDEN-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm1, %rax
-; KNL_WIDEN-NEXT:    vmovq %rax, %xmm1
-; KNL_WIDEN-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm0, %rax
-; KNL_WIDEN-NEXT:    vmovq %rax, %xmm2
-; KNL_WIDEN-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm0, %rax
-; KNL_WIDEN-NEXT:    vmovq %rax, %xmm0
-; KNL_WIDEN-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; KNL_WIDEN-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %b = fptosi <4 x double> %a to <4 x i64>
   ret <4 x i64> %b
 }
@@ -355,24 +246,6 @@ define <4 x i64> @f32to4sl(<4 x float> %a) {
 ; DQNOVL-NEXT:    vcvttps2qq %ymm0, %zmm0
 ; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to4sl:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; KNL_WIDEN-NEXT:    vcvttss2si %xmm1, %rax
-; KNL_WIDEN-NEXT:    vmovq %rax, %xmm1
-; KNL_WIDEN-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; KNL_WIDEN-NEXT:    vcvttss2si %xmm2, %rax
-; KNL_WIDEN-NEXT:    vmovq %rax, %xmm2
-; KNL_WIDEN-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; KNL_WIDEN-NEXT:    vcvttss2si %xmm0, %rax
-; KNL_WIDEN-NEXT:    vmovq %rax, %xmm2
-; KNL_WIDEN-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; KNL_WIDEN-NEXT:    vcvttss2si %xmm0, %rax
-; KNL_WIDEN-NEXT:    vmovq %rax, %xmm0
-; KNL_WIDEN-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; KNL_WIDEN-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %b = fptosi <4 x float> %a to <4 x i64>
   ret <4 x i64> %b
 }
@@ -408,23 +281,6 @@ define <4 x float> @slto4f32(<4 x i64> %a) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: slto4f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -460,23 +316,6 @@ define <4 x float> @ulto4f32(<4 x i64> %a) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ulto4f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -501,16 +340,6 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
 ; DQNOVL:       # %bb.0:
 ; DQNOVL-NEXT:    vcvtuqq2pd %zmm0, %zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ulto8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; KNL_WIDEN-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <8 x i64> %a to <8 x double>
   ret <8 x double> %b
 }
@@ -547,26 +376,6 @@ define <16 x double> @ulto16f64(<16 x i64> %a) {
 ; DQNOVL-NEXT:    vcvtuqq2pd %zmm0, %zmm0
 ; DQNOVL-NEXT:    vcvtuqq2pd %zmm1, %zmm1
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ulto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
-; KNL_WIDEN-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; KNL_WIDEN-NEXT:    vporq %zmm4, %zmm3, %zmm3
-; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; KNL_WIDEN-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; KNL_WIDEN-NEXT:    vsubpd %zmm6, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
-; KNL_WIDEN-NEXT:    vpandq %zmm2, %zmm1, %zmm2
-; KNL_WIDEN-NEXT:    vporq %zmm4, %zmm2, %zmm2
-; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm1, %zmm1
-; KNL_WIDEN-NEXT:    vporq %zmm5, %zmm1, %zmm1
-; KNL_WIDEN-NEXT:    vsubpd %zmm6, %zmm1, %zmm1
-; KNL_WIDEN-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i64> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -576,11 +385,6 @@ define <16 x i32> @f64to16si(<16 x float> %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to16si:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = fptosi <16 x float> %a to <16 x i32>
   ret <16 x i32> %b
 }
@@ -592,13 +396,6 @@ define <16 x i8> @f32to16sc(<16 x float> %f) {
 ; ALL-NEXT:    vpmovdb %zmm0, %xmm0
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to16sc:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %res = fptosi <16 x float> %f to <16 x i8>
   ret <16 x i8> %res
 }
@@ -609,12 +406,6 @@ define <16 x i16> @f32to16ss(<16 x float> %f) {
 ; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
 ; ALL-NEXT:    vpmovdw %zmm0, %ymm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to16ss:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %res = fptosi <16 x float> %f to <16 x i16>
   ret <16 x i16> %res
 }
@@ -624,11 +415,6 @@ define <16 x i32> @f32to16ui(<16 x float> %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to16ui:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2udq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = fptoui <16 x float> %a to <16 x i32>
   ret <16 x i32> %b
 }
@@ -640,13 +426,6 @@ define <16 x i8> @f32to16uc(<16 x float> %f) {
 ; ALL-NEXT:    vpmovdb %zmm0, %xmm0
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to16uc:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %res = fptoui <16 x float> %f to <16 x i8>
   ret <16 x i8> %res
 }
@@ -657,12 +436,6 @@ define <16 x i16> @f32to16us(<16 x float> %f) {
 ; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
 ; ALL-NEXT:    vpmovdw %zmm0, %ymm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to16us:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %res = fptoui <16 x float> %f to <16 x i16>
   ret <16 x i16> %res
 }
@@ -679,13 +452,6 @@ define <8 x i32> @f32to8ui(<8 x float> %a) nounwind {
 ; VL:       # %bb.0:
 ; VL-NEXT:    vcvttps2udq %ymm0, %ymm0
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to8ui:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL_WIDEN-NEXT:    vcvttps2udq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = fptoui <8 x float> %a to <8 x i32>
   ret <8 x i32> %b
 }
@@ -703,14 +469,6 @@ define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
 ; VL:       # %bb.0:
 ; VL-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to4ui:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL_WIDEN-NEXT:    vcvttps2udq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %b = fptoui <4 x float> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -720,11 +478,6 @@ define <8 x i32> @f64to8ui(<8 x double> %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to8ui:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %b = fptoui <8 x double> %a to <8 x i32>
   ret <8 x i32> %b
 }
@@ -744,14 +497,6 @@ define <8 x i16> @f64to8us(<8 x double> %f) {
 ; VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to8us:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %res = fptoui <8 x double> %f to <8 x i16>
   ret <8 x i16> %res
 }
@@ -771,34 +516,6 @@ define <8 x i8> @f64to8uc(<8 x double> %f) {
 ; VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to8uc:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm1, %eax
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm0, %ecx
-; KNL_WIDEN-NEXT:    vmovd %ecx, %xmm1
-; KNL_WIDEN-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm2, %eax
-; KNL_WIDEN-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm2, %eax
-; KNL_WIDEN-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm2, %eax
-; KNL_WIDEN-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm2, %eax
-; KNL_WIDEN-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm0, %eax
-; KNL_WIDEN-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; KNL_WIDEN-NEXT:    vcvttsd2si %xmm0, %eax
-; KNL_WIDEN-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %res = fptoui <8 x double> %f to <8 x i8>
   ret <8 x i8> %res
 }
@@ -817,14 +534,6 @@ define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
 ; VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
 ; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to4ui:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL_WIDEN-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %b = fptoui <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -834,11 +543,6 @@ define <8 x double> @sito8f64(<8 x i32> %a) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sito8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <8 x i32> %a to <8 x double>
   ret <8 x double> %b
 }
@@ -872,12 +576,6 @@ define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwi
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: i32to8f64_mask:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    kmovw %edi, %k1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
-; KNL_WIDEN-NEXT:    retq
   %1 = bitcast i8 %c to <8 x i1>
   %2 = sitofp <8 x i32> %b to <8 x double>
   %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
@@ -913,12 +611,6 @@ define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sito8f64_maskz:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    kmovw %edi, %k1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    retq
   %1 = bitcast i8 %b to <8 x i1>
   %2 = sitofp <8 x i32> %a to <8 x double>
   %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
@@ -930,27 +622,54 @@ define <8 x i32> @f64to8si(<8 x double> %a) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvttpd2dq %zmm0, %ymm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to8si:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %b = fptosi <8 x double> %a to <8 x i32>
   ret <8 x i32> %b
 }
 
+define <8 x i16> @f64to8ss(<8 x double> %f) {
+; NOVL-LABEL: f64to8ss:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f64to8ss:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT:    vpmovdw %ymm0, %xmm0
+; VL-NEXT:    vzeroupper
+; VL-NEXT:    retq
+  %res = fptosi <8 x double> %f to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <8 x i8> @f64to8sc(<8 x double> %f) {
+; NOVL-LABEL: f64to8sc:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
+; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
+;
+; VL-LABEL: f64to8sc:
+; VL:       # %bb.0:
+; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT:    vpmovdw %ymm0, %xmm0
+; VL-NEXT:    vzeroupper
+; VL-NEXT:    retq
+  %res = fptosi <8 x double> %f to <8 x i8>
+  ret <8 x i8> %res
+}
+
 define <4 x i32> @f64to4si(<4 x double> %a) {
 ; ALL-LABEL: f64to4si:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvttpd2dq %ymm0, %xmm0
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to4si:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %b = fptosi <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -962,13 +681,6 @@ define <16 x float> @f64to16f32(<16 x double> %b) nounwind {
 ; ALL-NEXT:    vcvtpd2ps %zmm1, %ymm1
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtpd2ps %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtpd2ps %zmm1, %ymm1
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %a = fptrunc <16 x double> %b to <16 x float>
   ret <16 x float> %a
 }
@@ -979,12 +691,6 @@ define <4 x float> @f64to4f32(<4 x double> %b) {
 ; ALL-NEXT:    vcvtpd2ps %ymm0, %xmm0
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to4f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %a = fptrunc <4 x double> %b to <4 x float>
   ret <4 x float> %a
 }
@@ -1025,16 +731,6 @@ define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64to4f32_mask:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpslld $31, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; KNL_WIDEN-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %a = fptrunc <4 x double> %b to <4 x float>
   %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
   ret <4 x float> %c
@@ -1045,11 +741,6 @@ define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm1, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64tof32_inreg:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtsd2ss %xmm0, %xmm1, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %ext = extractelement <2 x double> %a0, i32 0
   %cvt = fptrunc double %ext to float
   %res = insertelement <4 x float> %a1, float %cvt, i32 0
@@ -1061,11 +752,6 @@ define <8 x double> @f32to8f64(<8 x float> %b) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtps2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtps2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %a = fpext <8 x float> %b to <8 x double>
   ret <8 x double> %a
 }
@@ -1086,16 +772,6 @@ define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x doubl
 ; VL-NEXT:    vcmpltpd %ymm2, %ymm1, %k1
 ; VL-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32to4f64_mask:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
-; KNL_WIDEN-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; KNL_WIDEN-NEXT:    vcvtps2pd %xmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcmpltpd %zmm2, %zmm1, %k1
-; KNL_WIDEN-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_WIDEN-NEXT:    retq
   %a = fpext <4 x float> %b to <4 x double>
   %mask = fcmp ogt <4 x double> %a1, %b1
   %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
@@ -1107,11 +783,6 @@ define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32tof64_inreg:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %ext = extractelement <4 x float> %a1, i32 0
   %cvt = fpext float %ext to double
   %res = insertelement <2 x double> %a0, double %cvt, i32 0
@@ -1123,11 +794,6 @@ define double @sltof64_load(i64* nocapture %e) {
 ; ALL:       # %bb.0: # %entry
 ; ALL-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sltof64_load:
-; KNL_WIDEN:       # %bb.0: # %entry
-; KNL_WIDEN-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
 entry:
   %tmp1 = load i64, i64* %e, align 8
   %conv = sitofp i64 %tmp1 to double
@@ -1139,11 +805,6 @@ define double @sitof64_load(i32* %e) {
 ; ALL:       # %bb.0: # %entry
 ; ALL-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sitof64_load:
-; KNL_WIDEN:       # %bb.0: # %entry
-; KNL_WIDEN-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
 entry:
   %tmp1 = load i32, i32* %e, align 4
   %conv = sitofp i32 %tmp1 to double
@@ -1155,11 +816,6 @@ define float @sitof32_load(i32* %e) {
 ; ALL:       # %bb.0: # %entry
 ; ALL-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sitof32_load:
-; KNL_WIDEN:       # %bb.0: # %entry
-; KNL_WIDEN-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
 entry:
   %tmp1 = load i32, i32* %e, align 4
   %conv = sitofp i32 %tmp1 to float
@@ -1171,11 +827,6 @@ define float @sltof32_load(i64* %e) {
 ; ALL:       # %bb.0: # %entry
 ; ALL-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sltof32_load:
-; KNL_WIDEN:       # %bb.0: # %entry
-; KNL_WIDEN-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
 entry:
   %tmp1 = load i64, i64* %e, align 8
   %conv = sitofp i64 %tmp1 to float
@@ -1189,13 +840,6 @@ define void @f32tof64_loadstore() {
 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; ALL-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp)
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f32tof64_loadstore:
-; KNL_WIDEN:       # %bb.0: # %entry
-; KNL_WIDEN-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL_WIDEN-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp)
-; KNL_WIDEN-NEXT:    retq
 entry:
   %f = alloca float, align 4
   %d = alloca double, align 8
@@ -1212,13 +856,6 @@ define void @f64tof32_loadstore() nounwind uwtable {
 ; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
 ; ALL-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: f64tof32_loadstore:
-; KNL_WIDEN:       # %bb.0: # %entry
-; KNL_WIDEN-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; KNL_WIDEN-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
-; KNL_WIDEN-NEXT:    retq
 entry:
   %f = alloca float, align 4
   %d = alloca double, align 8
@@ -1233,11 +870,6 @@ define double @long_to_double(i64 %x) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vmovq %rdi, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: long_to_double:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vmovq %rdi, %xmm0
-; KNL_WIDEN-NEXT:    retq
    %res = bitcast i64 %x to double
    ret double %res
 }
@@ -1247,11 +879,6 @@ define i64 @double_to_long(double %x) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vmovq %xmm0, %rax
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: double_to_long:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    retq
    %res = bitcast double %x to i64
    ret i64 %res
 }
@@ -1261,11 +888,6 @@ define float @int_to_float(i32 %x) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vmovd %edi, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: int_to_float:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vmovd %edi, %xmm0
-; KNL_WIDEN-NEXT:    retq
    %res = bitcast i32 %x to float
    ret float %res
 }
@@ -1275,11 +897,6 @@ define i32 @float_to_int(float %x) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vmovd %xmm0, %eax
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: float_to_int:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vmovd %xmm0, %eax
-; KNL_WIDEN-NEXT:    retq
    %res = bitcast float %x to i32
    ret i32 %res
 }
@@ -1292,14 +909,6 @@ define <16 x double> @uito16f64(<16 x i32> %a) nounwind {
 ; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm1
 ; ALL-NEXT:    vmovaps %zmm2, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uito16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtudq2pd %ymm0, %zmm2
-; KNL_WIDEN-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtudq2pd %ymm0, %zmm1
-; KNL_WIDEN-NEXT:    vmovaps %zmm2, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i32> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -1344,36 +953,6 @@ define <8 x float> @slto8f32(<8 x i64> %a) {
 ; DQNOVL:       # %bb.0:
 ; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: slto8f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %b
 }
@@ -1449,63 +1028,6 @@ define <16 x float> @slto16f32(<16 x i64> %a) {
 ; DQNOVL-NEXT:    vcvtqq2ps %zmm1, %ymm1
 ; DQNOVL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: slto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm1
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm0
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <16 x i64> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -1550,36 +1072,6 @@ define <8 x double> @slto8f64(<8 x i64> %a) {
 ; DQNOVL:       # %bb.0:
 ; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: slto8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <8 x i64> %a to <8 x double>
   ret <8 x double> %b
 }
@@ -1652,62 +1144,6 @@ define <16 x double> @slto16f64(<16 x i64> %a) {
 ; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
 ; DQNOVL-NEXT:    vcvtqq2pd %zmm1, %zmm1
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: slto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm1
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <16 x i64> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -1752,36 +1188,6 @@ define <8 x float> @ulto8f32(<8 x i64> %a) {
 ; DQNOVL:       # %bb.0:
 ; DQNOVL-NEXT:    vcvtuqq2ps %zmm0, %ymm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ulto8f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm0
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %b
 }
@@ -1857,63 +1263,6 @@ define <16 x float> @ulto16f32(<16 x i64> %a) {
 ; DQNOVL-NEXT:    vcvtuqq2ps %zmm1, %ymm1
 ; DQNOVL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ulto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm1
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm2
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm0
-; KNL_WIDEN-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i64> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -1948,12 +1297,6 @@ define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwin
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uito8f64_mask:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    kmovw %edi, %k1
-; KNL_WIDEN-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
-; KNL_WIDEN-NEXT:    retq
   %1 = bitcast i8 %c to <8 x i1>
   %2 = uitofp <8 x i32> %b to <8 x double>
   %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
@@ -1989,12 +1332,6 @@ define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uito8f64_maskz:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    kmovw %edi, %k1
-; KNL_WIDEN-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    retq
   %1 = bitcast i8 %b to <8 x i1>
   %2 = uitofp <8 x i32> %a to <8 x double>
   %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
@@ -2013,13 +1350,6 @@ define <4 x double> @uito4f64(<4 x i32> %a) nounwind {
 ; VL:       # %bb.0:
 ; VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uito4f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; KNL_WIDEN-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <4 x i32> %a to <4 x double>
   ret <4 x double> %b
 }
@@ -2029,11 +1359,6 @@ define <16 x float> @uito16f32(<16 x i32> %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtudq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uito16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i32> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -2043,11 +1368,6 @@ define <8 x double> @uito8f64(<8 x i32> %a) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uito8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <8 x i32> %a to <8 x double>
   ret <8 x double> %b
 }
@@ -2064,13 +1384,6 @@ define <8 x float> @uito8f32(<8 x i32> %a) nounwind {
 ; VL:       # %bb.0:
 ; VL-NEXT:    vcvtudq2ps %ymm0, %ymm0
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uito8f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL_WIDEN-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <8 x i32> %a to <8 x float>
   ret <8 x float> %b
 }
@@ -2088,14 +1401,6 @@ define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
 ; VL:       # %bb.0:
 ; VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uito4f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL_WIDEN-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <4 x i32> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -2105,11 +1410,6 @@ define i32 @fptosi(float %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvttss2si %xmm0, %eax
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: fptosi:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttss2si %xmm0, %eax
-; KNL_WIDEN-NEXT:    retq
   %b = fptosi float %a to i32
   ret i32 %b
 }
@@ -2119,11 +1419,6 @@ define i32 @fptoui(float %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvttss2usi %xmm0, %eax
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: fptoui:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttss2usi %xmm0, %eax
-; KNL_WIDEN-NEXT:    retq
   %b = fptoui float %a to i32
   ret i32 %b
 }
@@ -2133,11 +1428,6 @@ define float @uitof32(i32 %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uitof32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp i32 %a to float
   ret float %b
 }
@@ -2147,11 +1437,6 @@ define double @uitof64(i32 %a) nounwind {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uitof64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp i32 %a to double
   ret double %b
 }
@@ -2178,14 +1463,6 @@ define <16 x float> @sbto16f32(<16 x i32> %a) {
 ; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
 ; DQNOVL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sbto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; KNL_WIDEN-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp slt <16 x i32> %a, zeroinitializer
   %1 = sitofp <16 x i1> %mask to <16 x float>
   ret <16 x float> %1
@@ -2197,12 +1474,6 @@ define <16 x float> @scto16f32(<16 x i8> %a) {
 ; ALL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: scto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %1 = sitofp <16 x i8> %a to <16 x float>
   ret <16 x float> %1
 }
@@ -2213,12 +1484,6 @@ define <16 x float> @ssto16f32(<16 x i16> %a) {
 ; ALL-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ssto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovsxwd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %1 = sitofp <16 x i16> %a to <16 x float>
   ret <16 x float> %1
 }
@@ -2229,12 +1494,6 @@ define <8 x double> @ssto16f64(<8 x i16> %a) {
 ; ALL-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ssto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovsxwd %xmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %1 = sitofp <8 x i16> %a to <8 x double>
   ret <8 x double> %1
 }
@@ -2247,12 +1506,6 @@ define <8 x double> @scto8f64(<8 x i8> %a) {
 ; ALL-NEXT:    vpsrad $24, %ymm0, %ymm0
 ; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: scto8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovsxbd %xmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %1 = sitofp <8 x i8> %a to <8 x double>
   ret <8 x double> %1
 }
@@ -2265,14 +1518,6 @@ define <16 x double> @scto16f64(<16 x i8> %a) {
 ; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: scto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovsxbd %xmm0, %zmm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; KNL_WIDEN-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <16 x i8> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -2313,18 +1558,6 @@ define <16 x double> @sbto16f64(<16 x double> %a) {
 ; DQNOVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm1
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sbto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vcmpltpd %zmm0, %zmm2, %k0
-; KNL_WIDEN-NEXT:    vcmpltpd %zmm1, %zmm2, %k1
-; KNL_WIDEN-NEXT:    kunpckbw %k0, %k1, %k1
-; KNL_WIDEN-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; KNL_WIDEN-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; KNL_WIDEN-NEXT:    retq
   %cmpres = fcmp ogt <16 x double> %a, zeroinitializer
   %1 = sitofp <16 x i1> %cmpres to <16 x double>
   ret <16 x double> %1
@@ -2363,14 +1596,6 @@ define <8 x double> @sbto8f64(<8 x double> %a) {
 ; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
 ; DQNOVL-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sbto8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
-; KNL_WIDEN-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
   %1 = sitofp <8 x i1> %cmpres to <8 x double>
   ret <8 x double> %1
@@ -2383,13 +1608,6 @@ define <8 x float> @sbto8f32(<8 x float> %a) {
 ; ALL-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
 ; ALL-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sbto8f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; KNL_WIDEN-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
   %1 = sitofp <8 x i1> %cmpres to <8 x float>
   ret <8 x float> %1
@@ -2402,13 +1620,6 @@ define <4 x float> @sbto4f32(<4 x float> %a) {
 ; ALL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
 ; ALL-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sbto4f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
-; KNL_WIDEN-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %cmpres = fcmp ogt <4 x float> %a, zeroinitializer
   %1 = sitofp <4 x i1> %cmpres to <4 x float>
   ret <4 x float> %1
@@ -2439,14 +1650,6 @@ define <4 x double> @sbto4f64(<4 x double> %a) {
 ; VLNODQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; VLNODQ-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; VLNODQ-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sbto4f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
-; KNL_WIDEN-NEXT:    vpmovqd %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %cmpres = fcmp ogt <4 x double> %a, zeroinitializer
   %1 = sitofp <4 x i1> %cmpres to <4 x double>
   ret <4 x double> %1
@@ -2459,13 +1662,6 @@ define <2 x float> @sbto2f32(<2 x float> %a) {
 ; ALL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
 ; ALL-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sbto2f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
-; KNL_WIDEN-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %cmpres = fcmp ogt <2 x float> %a, zeroinitializer
   %1 = sitofp <2 x i1> %cmpres to <2 x float>
   ret <2 x float> %1
@@ -2479,15 +1675,6 @@ define <2 x double> @sbto2f64(<2 x double> %a) {
 ; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; ALL-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sbto2f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; KNL_WIDEN-NEXT:    vpmovqd %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %cmpres = fcmp ogt <2 x double> %a, zeroinitializer
   %1 = sitofp <2 x i1> %cmpres to <2 x double>
   ret <2 x double> %1
@@ -2499,12 +1686,6 @@ define <16 x float> @ucto16f32(<16 x i8> %a) {
 ; ALL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ucto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i8> %a to <16 x float>
   ret <16 x float>%b
 }
@@ -2516,12 +1697,6 @@ define <8 x double> @ucto8f64(<8 x i8> %a) {
 ; ALL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ucto8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <8 x i8> %a to <8 x double>
   ret <8 x double> %b
 }
@@ -2532,12 +1707,6 @@ define <16 x float> @swto16f32(<16 x i16> %a) {
 ; ALL-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: swto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovsxwd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <16 x i16> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -2548,12 +1717,6 @@ define <8 x double> @swto8f64(<8 x i16> %a) {
 ; ALL-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: swto8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovsxwd %xmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <8 x i16> %a to <8 x double>
   ret <8 x double> %b
 }
@@ -2566,14 +1729,6 @@ define <16 x double> @swto16f64(<16 x i16> %a) {
 ; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: swto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovsxwd %ymm0, %zmm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; KNL_WIDEN-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <16 x i16> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -2586,14 +1741,6 @@ define <16 x double> @ucto16f64(<16 x i8> %a) {
 ; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ucto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; KNL_WIDEN-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i8> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -2604,12 +1751,6 @@ define <16 x float> @uwto16f32(<16 x i16> %a) {
 ; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uwto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i16> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -2620,12 +1761,6 @@ define <8 x double> @uwto8f64(<8 x i16> %a) {
 ; ALL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uwto8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <8 x i16> %a to <8 x double>
   ret <8 x double> %b
 }
@@ -2638,14 +1773,6 @@ define <16 x double> @uwto16f64(<16 x i16> %a) {
 ; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: uwto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; KNL_WIDEN-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i16> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -2655,11 +1782,6 @@ define <16 x float> @sito16f32(<16 x i32> %a) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sito16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <16 x i32> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -2672,14 +1794,6 @@ define <16 x double> @sito16f64(<16 x i32> %a) {
 ; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm1
 ; ALL-NEXT:    vmovaps %zmm2, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: sito16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm2
-; KNL_WIDEN-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm1
-; KNL_WIDEN-NEXT:    vmovaps %zmm2, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = sitofp <16 x i32> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -2690,12 +1804,6 @@ define <16 x float> @usto16f32(<16 x i16> %a) {
 ; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: usto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i16> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -2725,15 +1833,6 @@ define <16 x float> @ubto16f32(<16 x i32> %a) {
 ; DQNOVL-NEXT:    vpsrld $31, %zmm0, %zmm0
 ; DQNOVL-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ubto16f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; KNL_WIDEN-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    vpsrld $31, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp slt <16 x i32> %a, zeroinitializer
   %1 = uitofp <16 x i1> %mask to <16 x float>
   ret <16 x float> %1
@@ -2770,17 +1869,6 @@ define <16 x double> @ubto16f64(<16 x i32> %a) {
 ; DQNOVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm1
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ubto16f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; KNL_WIDEN-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    vpsrld $31, %zmm0, %zmm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; KNL_WIDEN-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp slt <16 x i32> %a, zeroinitializer
   %1 = uitofp <16 x i1> %mask to <16 x double>
   ret <16 x double> %1
@@ -2801,14 +1889,6 @@ define <8 x float> @ubto8f32(<8 x i32> %a) {
 ; VL-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
 ; VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ubto8f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
-; KNL_WIDEN-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216]
-; KNL_WIDEN-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp slt <8 x i32> %a, zeroinitializer
   %1 = uitofp <8 x i1> %mask to <8 x float>
   ret <8 x float> %1
@@ -2822,14 +1902,6 @@ define <8 x double> @ubto8f64(<8 x i32> %a) {
 ; ALL-NEXT:    vpsrld $31, %ymm0, %ymm0
 ; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ubto8f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
-; KNL_WIDEN-NEXT:    vpsrld $31, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp slt <8 x i32> %a, zeroinitializer
   %1 = uitofp <8 x i1> %mask to <8 x double>
   ret <8 x double> %1
@@ -2850,14 +1922,6 @@ define <4 x float> @ubto4f32(<4 x i32> %a) {
 ; VL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
 ; VL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ubto4f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL_WIDEN-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
-; KNL_WIDEN-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp slt <4 x i32> %a, zeroinitializer
   %1 = uitofp <4 x i1> %mask to <4 x float>
   ret <4 x float> %1
@@ -2871,14 +1935,6 @@ define <4 x double> @ubto4f64(<4 x i32> %a) {
 ; ALL-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; ALL-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ubto4f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL_WIDEN-NEXT:    vpsrld $31, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp slt <4 x i32> %a, zeroinitializer
   %1 = uitofp <4 x i1> %mask to <4 x double>
   ret <4 x double> %1
@@ -2893,14 +1949,6 @@ define <2 x float> @ubto2f32(<2 x i32> %a) {
 ; ALL-NEXT:    vpandn {{.*}}(%rip), %xmm0, %xmm0
 ; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ubto2f32:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
-; KNL_WIDEN-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp ne <2 x i32> %a, zeroinitializer
   %1 = uitofp <2 x i1> %mask to <2 x float>
   ret <2 x float> %1
@@ -2916,15 +1964,6 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
 ; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; ALL-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; ALL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: ubto2f64:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL_WIDEN-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; KNL_WIDEN-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = icmp ne <2 x i32> %a, zeroinitializer
   %1 = uitofp <2 x i1> %mask to <2 x double>
   ret <2 x double> %1
@@ -2970,18 +2009,6 @@ define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_2f64toub:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL_WIDEN-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vpslld $31, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %mask = fptoui <2 x double> %a to <2 x i1>
   %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
   ret <2 x i64> %select
@@ -3023,16 +2050,6 @@ define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) {
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_4f64toub:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; KNL_WIDEN-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vpslld $31, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = fptoui <4 x double> %a to <4 x i1>
   %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
   ret <4 x i64> %select
@@ -3070,14 +2087,6 @@ define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) {
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_8f64toub:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vpslld $31, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    retq
   %mask = fptoui <8 x double> %a to <8 x i1>
   %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
   ret <8 x i64> %select
@@ -3121,17 +2130,6 @@ define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_2f32toub:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_WIDEN-NEXT:    vcvttps2dq %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vpslld $31, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %mask = fptoui <2 x float> %a to <2 x i1>
   %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
   ret <2 x i64> %select
@@ -3173,16 +2171,6 @@ define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) {
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_4f32toub:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; KNL_WIDEN-NEXT:    vcvttps2dq %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vpslld $31, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = fptoui <4 x float> %a to <4 x i1>
   %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
   ret <4 x i64> %select
@@ -3220,14 +2208,6 @@ define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) {
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_8f32toub:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vpslld $31, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    retq
   %mask = fptoui <8 x float> %a to <8 x i1>
   %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
   ret <8 x i64> %select
@@ -3257,14 +2237,6 @@ define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) {
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_16f32toub:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    retq
   %mask = fptoui <16 x float> %a to <16 x i1>
   %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
   ret <16 x i32> %select
@@ -3308,17 +2280,6 @@ define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_2f64tosb:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_WIDEN-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vpslld $31, %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %mask = fptosi <2 x double> %a to <2 x i1>
   %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
   ret <2 x i64> %select
@@ -3356,15 +2317,6 @@ define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) {
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_4f64tosb:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; KNL_WIDEN-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = fptosi <4 x double> %a to <4 x i1>
   %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
   ret <4 x i64> %select
@@ -3398,13 +2350,6 @@ define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) {
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_8f64tosb:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    retq
   %mask = fptosi <8 x double> %a to <8 x i1>
   %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
   ret <8 x i64> %select
@@ -3444,16 +2389,6 @@ define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) {
 ; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; DQNOVL-NEXT:    vzeroupper
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_2f32tosb:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_WIDEN-NEXT:    vcvttps2dq %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; KNL_WIDEN-NEXT:    vzeroupper
-; KNL_WIDEN-NEXT:    retq
   %mask = fptosi <2 x float> %a to <2 x i1>
   %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
   ret <2 x i64> %select
@@ -3491,15 +2426,6 @@ define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) {
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_4f32tosb:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; KNL_WIDEN-NEXT:    vcvttps2dq %xmm0, %xmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; KNL_WIDEN-NEXT:    retq
   %mask = fptosi <4 x float> %a to <4 x i1>
   %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
   ret <4 x i64> %select
@@ -3533,13 +2459,6 @@ define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) {
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_8f32tosb:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    retq
   %mask = fptosi <8 x float> %a to <8 x i1>
   %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
   ret <8 x i64> %select
@@ -3566,13 +2485,6 @@ define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) {
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    retq
-;
-; KNL_WIDEN-LABEL: test_16f32tosb:
-; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL_WIDEN-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; KNL_WIDEN-NEXT:    retq
   %mask = fptosi <16 x float> %a to <16 x i1>
   %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
   ret <16 x i32> %select
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index d56cf0fe09ee9c8999a62f68f818c4bffeac8355..2381180af763a234b18745a907758110f6126037 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -475,7 +475,7 @@ define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
 define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x8mem_to_8x32:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -492,7 +492,7 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ;
 ; AVX512DQNOBW-LABEL: zext_8x8mem_to_8x32:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -506,7 +506,7 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_8x8mem_to_8x32:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovsxbd (%rdi), %ymm0
@@ -523,7 +523,7 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ;
 ; AVX512DQNOBW-LABEL: sext_8x8mem_to_8x32:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovsxbd (%rdi), %ymm0 {%k1} {z}
@@ -537,7 +537,7 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_16x8mem_to_16x32:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
@@ -552,7 +552,7 @@ define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
 ;
 ; AVX512DQNOBW-LABEL: zext_16x8mem_to_16x32:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %zmm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
@@ -566,7 +566,7 @@ define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
 define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_16x8mem_to_16x32:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovsxbd (%rdi), %zmm0 {%k1} {z}
@@ -581,7 +581,7 @@ define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
 ;
 ; AVX512DQNOBW-LABEL: sext_16x8mem_to_16x32:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %zmm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovsxbd (%rdi), %zmm0 {%k1} {z}
@@ -595,7 +595,7 @@ define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
 define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_16x8_to_16x32_mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -610,7 +610,7 @@ define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
 ;
 ; AVX512DQNOBW-LABEL: zext_16x8_to_16x32_mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %zmm1, %zmm1
 ; AVX512DQNOBW-NEXT:    vpmovd2m %zmm1, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -623,7 +623,7 @@ define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
 define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_16x8_to_16x32_mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0 {%k1} {z}
@@ -638,7 +638,7 @@ define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
 ;
 ; AVX512DQNOBW-LABEL: sext_16x8_to_16x32_mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %zmm1, %zmm1
 ; AVX512DQNOBW-NEXT:    vpmovd2m %zmm1, %k1
 ; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm0, %zmm0 {%k1} {z}
@@ -777,7 +777,7 @@ define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone {
 define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x8mem_to_8x64:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
@@ -792,7 +792,7 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ;
 ; AVX512DQNOBW-LABEL: zext_8x8mem_to_8x64:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
@@ -806,7 +806,7 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_8x8mem_to_8x64mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovsxbq (%rdi), %zmm0 {%k1} {z}
@@ -821,7 +821,7 @@ define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwin
 ;
 ; AVX512DQNOBW-LABEL: sext_8x8mem_to_8x64mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovsxbq (%rdi), %zmm0 {%k1} {z}
@@ -902,7 +902,7 @@ define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone {
 define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x16mem_to_8x32:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -919,7 +919,7 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
 ;
 ; AVX512DQNOBW-LABEL: zext_8x16mem_to_8x32:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -933,7 +933,7 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
 define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_8x16mem_to_8x32mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovsxwd (%rdi), %ymm0
@@ -950,7 +950,7 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw
 ;
 ; AVX512DQNOBW-LABEL: sext_8x16mem_to_8x32mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovsxwd (%rdi), %ymm0 {%k1} {z}
@@ -974,7 +974,7 @@ define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone {
 define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x16_to_8x32mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -991,7 +991,7 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
 ;
 ; AVX512DQNOBW-LABEL: zext_8x16_to_8x32mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm1, %ymm1
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm1, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1013,7 +1013,7 @@ define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone {
 define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_16x16mem_to_16x32:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
@@ -1028,7 +1028,7 @@ define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) noun
 ;
 ; AVX512DQNOBW-LABEL: zext_16x16mem_to_16x32:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %zmm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
@@ -1042,7 +1042,7 @@ define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) noun
 define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_16x16mem_to_16x32mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovsxwd (%rdi), %zmm0 {%k1} {z}
@@ -1057,7 +1057,7 @@ define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask)
 ;
 ; AVX512DQNOBW-LABEL: sext_16x16mem_to_16x32mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %zmm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovsxwd (%rdi), %zmm0 {%k1} {z}
@@ -1080,7 +1080,7 @@ define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone {
 define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_16x16_to_16x32mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
@@ -1095,7 +1095,7 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
 ;
 ; AVX512DQNOBW-LABEL: zext_16x16_to_16x32mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %zmm1, %zmm1
 ; AVX512DQNOBW-NEXT:    vpmovd2m %zmm1, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
@@ -1227,7 +1227,7 @@ define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone {
 define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x16mem_to_8x64:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -1242,7 +1242,7 @@ define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind
 ;
 ; AVX512DQNOBW-LABEL: zext_8x16mem_to_8x64:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -1256,7 +1256,7 @@ define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind
 define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_8x16mem_to_8x64mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovsxwq (%rdi), %zmm0 {%k1} {z}
@@ -1271,7 +1271,7 @@ define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounw
 ;
 ; AVX512DQNOBW-LABEL: sext_8x16mem_to_8x64mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovsxwq (%rdi), %zmm0 {%k1} {z}
@@ -1295,7 +1295,7 @@ define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone {
 define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x16_to_8x64mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -1310,7 +1310,7 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
 ;
 ; AVX512DQNOBW-LABEL: zext_8x16_to_8x64mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm1, %ymm1
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm1, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -1472,7 +1472,7 @@ define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind
 define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x32mem_to_8x64:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -1487,7 +1487,7 @@ define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind
 ;
 ; AVX512DQNOBW-LABEL: zext_8x32mem_to_8x64:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -1501,7 +1501,7 @@ define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind
 define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_8x32mem_to_8x64mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpmovsxdq (%rdi), %zmm0 {%k1} {z}
@@ -1516,7 +1516,7 @@ define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounw
 ;
 ; AVX512DQNOBW-LABEL: sext_8x32mem_to_8x64mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k1
 ; AVX512DQNOBW-NEXT:    vpmovsxdq (%rdi), %zmm0 {%k1} {z}
@@ -1549,7 +1549,7 @@ define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone {
 define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x32_to_8x64mask:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
@@ -1564,7 +1564,7 @@ define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind
 ;
 ; AVX512DQNOBW-LABEL: zext_8x32_to_8x64mask:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm1, %ymm1
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm1, %k1
 ; AVX512DQNOBW-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
@@ -1646,7 +1646,7 @@ define   <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
 define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
 ; KNL-LABEL: trunc_16i8_to_16i1:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1664,7 +1664,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
 ;
 ; AVX512DQNOBW-LABEL: trunc_16i8_to_16i1:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQNOBW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %zmm0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw %k0, %eax
@@ -1726,7 +1726,7 @@ define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
 define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
 ; KNL-LABEL: trunc_8i16_to_8i1:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1744,7 +1744,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
 ;
 ; AVX512DQNOBW-LABEL: trunc_8i16_to_8i1:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k0
 ; AVX512DQNOBW-NEXT:    kmovw %k0, %eax
@@ -2123,10 +2123,10 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
 ; KNL-LABEL: zext_32xi1_to_32xi8:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
-; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; KNL-NEXT:    vpmovdb %zmm1, %xmm1
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -2141,10 +2141,10 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
 ; AVX512DQNOBW-LABEL: zext_32xi1_to_32xi8:
 ; AVX512DQNOBW:       # %bb.0:
 ; AVX512DQNOBW-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQNOBW-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQNOBW-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512DQNOBW-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512DQNOBW-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQNOBW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index 00fad6fbf59bf020bec21add8249fa380fe8be33..e38ee562d578b4eb8b81fd05159524b36658cd4c 100644
--- a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -5,8 +5,7 @@
 define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $4, %k0, %k0
+; AVX512-NEXT:    kmovb 4(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm2
 ; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovq2m %xmm2, %k1
@@ -16,9 +15,8 @@ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 4(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
@@ -35,8 +33,7 @@ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
 define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $6, %k0, %k0
+; AVX512-NEXT:    kmovb 6(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT:    vpmovq2m %xmm2, %k1
@@ -46,9 +43,8 @@ define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 6(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -65,8 +61,7 @@ define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
 define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    kmovb 8(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm2
 ; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovq2m %xmm2, %k1
@@ -76,8 +71,8 @@ define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x do
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 8(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
@@ -94,8 +89,7 @@ define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x do
 define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    kmovb 8(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm2
 ; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovd2m %xmm2, %k1
@@ -105,8 +99,8 @@ define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x flo
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 8(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
@@ -123,8 +117,7 @@ define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x flo
 define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    kmovb 14(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT:    vpmovq2m %xmm2, %k1
@@ -134,8 +127,8 @@ define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x d
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 14(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -152,8 +145,7 @@ define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x d
 define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    kmovb 12(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; AVX512-NEXT:    vpmovd2m %xmm2, %k1
@@ -163,8 +155,8 @@ define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 12(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
@@ -181,8 +173,7 @@ define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x fl
 define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kmovb 16(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm2
 ; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovq2m %xmm2, %k1
@@ -192,8 +183,8 @@ define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
@@ -210,8 +201,7 @@ define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d
 define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kmovb 16(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm2
 ; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovd2m %xmm2, %k1
@@ -221,8 +211,8 @@ define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
@@ -239,8 +229,7 @@ define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl
 define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kmovb 16(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %ymm2
 ; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512-NEXT:    vpmovd2m %ymm2, %k1
@@ -251,8 +240,8 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
@@ -270,8 +259,7 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
 define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $30, %k0, %k0
+; AVX512-NEXT:    kmovb 30(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT:    vpmovq2m %xmm2, %k1
@@ -281,8 +269,8 @@ define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 30(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -299,8 +287,7 @@ define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d
 define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $28, %k0, %k0
+; AVX512-NEXT:    kmovb 28(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; AVX512-NEXT:    vpmovd2m %xmm2, %k1
@@ -310,8 +297,8 @@ define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 28(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
@@ -328,8 +315,7 @@ define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl
 define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $24, %k0, %k0
+; AVX512-NEXT:    kmovb 24(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %ymm2
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
 ; AVX512-NEXT:    vpermd %ymm2, %ymm3, %ymm2
@@ -341,8 +327,8 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $24, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 24(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
@@ -361,8 +347,7 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
 define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovb 32(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm2
 ; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovq2m %xmm2, %k1
@@ -372,8 +357,8 @@ define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
@@ -390,8 +375,7 @@ define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d
 define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovb 32(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm2
 ; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovd2m %xmm2, %k1
@@ -401,8 +385,8 @@ define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
@@ -419,8 +403,7 @@ define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl
 define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovb 32(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %ymm2
 ; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512-NEXT:    vpmovd2m %ymm2, %k1
@@ -431,8 +414,8 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
@@ -450,8 +433,7 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
 define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovw 32(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %zmm2
 ; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512-NEXT:    vpmovd2m %zmm2, %k1
@@ -462,8 +444,7 @@ define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT:    kmovw 32(%rdi), %k1
 ; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
@@ -480,8 +461,7 @@ define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x
 define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $62, %k0, %k0
+; AVX512-NEXT:    kmovb 62(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT:    vpmovq2m %xmm2, %k1
@@ -491,8 +471,8 @@ define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 62(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -509,8 +489,7 @@ define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d
 define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $60, %k0, %k0
+; AVX512-NEXT:    kmovb 60(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; AVX512-NEXT:    vpmovd2m %xmm2, %k1
@@ -520,8 +499,8 @@ define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 60(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
@@ -538,8 +517,7 @@ define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl
 define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $56, %k0, %k0
+; AVX512-NEXT:    kmovb 56(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %ymm2
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
 ; AVX512-NEXT:    vpermd %ymm2, %ymm3, %ymm2
@@ -551,8 +529,8 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $56, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 56(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
@@ -571,8 +549,7 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
 define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $48, %k0, %k0
+; AVX512-NEXT:    kmovw 48(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %zmm2
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512-NEXT:    vpermd %zmm2, %zmm3, %zmm2
@@ -584,8 +561,7 @@ define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $48, %k0, %k1
+; AVX512NOTDQ-NEXT:    kmovw 48(%rdi), %k1
 ; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-NEXT:    vpermd %zmm2, %zmm3, %zmm2
@@ -603,17 +579,13 @@ define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x
 define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512-NEXT:    kmovb 1(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 1(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <2 x i1>, <2 x i1>* %a0
@@ -624,17 +596,13 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512-NEXT:    kmovb 1(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 1(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <3 x i1>, <3 x i1>* %a0
@@ -645,17 +613,13 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $2, %k0, %k0
+; AVX512-NEXT:    kmovb 2(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 2(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <3 x i1>, <3 x i1>* %a0
@@ -666,17 +630,13 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $2, %k0, %k0
+; AVX512-NEXT:    kmovb 2(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 2(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <4 x i1>, <4 x i1>* %a0
@@ -687,17 +647,13 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $3, %k0, %k0
+; AVX512-NEXT:    kmovb 3(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $3, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 3(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <4 x i1>, <4 x i1>* %a0
@@ -708,17 +664,13 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $4, %k0, %k0
+; AVX512-NEXT:    kmovb 4(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 4(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <8 x i1>, <8 x i1>* %a0
@@ -729,8 +681,7 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $4, %k0, %k0
+; AVX512-NEXT:    kmovb 4(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT:    vpmovq2m %xmm0, %k0
@@ -739,9 +690,8 @@ define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 4(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
@@ -757,17 +707,13 @@ define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512-NEXT:    kmovb 7(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $7, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 7(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <8 x i1>, <8 x i1>* %a0
@@ -778,8 +724,7 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovb (%rdi), %k0
-; AVX512-NEXT:    kshiftrb $6, %k0, %k0
+; AVX512-NEXT:    kmovb 6(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT:    vpmovq2m %xmm0, %k0
@@ -788,9 +733,8 @@ define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 6(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
@@ -806,16 +750,13 @@ define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    kmovb 8(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 8(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <16 x i1>, <16 x i1>* %a0
@@ -826,8 +767,7 @@ define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    kmovb 8(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT:    vpmovq2m %xmm0, %k0
@@ -836,8 +776,8 @@ define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 8(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
@@ -853,8 +793,7 @@ define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512-NEXT:    kmovb 8(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
 ; AVX512-NEXT:    vpmovd2m %xmm0, %k0
@@ -863,8 +802,8 @@ define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 8(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
@@ -880,16 +819,13 @@ define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512-NEXT:    kmovb 15(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 15(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <16 x i1>, <16 x i1>* %a0
@@ -900,8 +836,7 @@ define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512-NEXT:    kmovb 14(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT:    vpmovq2m %xmm0, %k0
@@ -910,8 +845,8 @@ define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 14(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
@@ -927,8 +862,7 @@ define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovw (%rdi), %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    kmovb 12(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT:    vpmovd2m %xmm0, %k0
@@ -937,8 +871,8 @@ define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 12(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
@@ -954,16 +888,13 @@ define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kmovb 16(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 16(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <32 x i1>, <32 x i1>* %a0
@@ -974,8 +905,7 @@ define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kmovb 16(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT:    vpmovq2m %xmm0, %k0
@@ -984,8 +914,8 @@ define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
@@ -1001,8 +931,7 @@ define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kmovb 16(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
 ; AVX512-NEXT:    vpmovd2m %xmm0, %k0
@@ -1011,8 +940,8 @@ define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
@@ -1028,8 +957,7 @@ define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $16, %k0, %k0
+; AVX512-NEXT:    kmovb 16(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %ymm0
 ; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
 ; AVX512-NEXT:    vpmovd2m %ymm0, %k0
@@ -1039,8 +967,8 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
@@ -1057,16 +985,13 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $31, %k0, %k0
+; AVX512-NEXT:    kmovb 31(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $31, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 31(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <32 x i1>, <32 x i1>* %a0
@@ -1077,8 +1002,7 @@ define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $30, %k0, %k0
+; AVX512-NEXT:    kmovb 30(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT:    vpmovq2m %xmm0, %k0
@@ -1087,8 +1011,8 @@ define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 30(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
@@ -1104,8 +1028,7 @@ define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $28, %k0, %k0
+; AVX512-NEXT:    kmovb 28(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT:    vpmovd2m %xmm0, %k0
@@ -1114,8 +1037,8 @@ define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 28(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
@@ -1131,8 +1054,7 @@ define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovd (%rdi), %k0
-; AVX512-NEXT:    kshiftrd $24, %k0, %k0
+; AVX512-NEXT:    kmovb 24(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %ymm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
 ; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
@@ -1143,8 +1065,8 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrd $24, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 24(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
@@ -1162,16 +1084,13 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovb 32(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 32(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <64 x i1>, <64 x i1>* %a0
@@ -1182,8 +1101,7 @@ define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovb 32(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT:    vpmovq2m %xmm0, %k0
@@ -1192,8 +1110,8 @@ define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
@@ -1209,8 +1127,7 @@ define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovb 32(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
 ; AVX512-NEXT:    vpmovd2m %xmm0, %k0
@@ -1219,8 +1136,8 @@ define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
@@ -1236,8 +1153,7 @@ define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovb 32(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %ymm0
 ; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
 ; AVX512-NEXT:    vpmovd2m %ymm0, %k0
@@ -1247,8 +1163,8 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
@@ -1265,8 +1181,7 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $32, %k0, %k0
+; AVX512-NEXT:    kmovw 32(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; AVX512-NEXT:    vpmovd2m %zmm0, %k0
@@ -1276,8 +1191,7 @@ define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1)
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT:    kmovw 32(%rdi), %k1
 ; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; AVX512NOTDQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1292,16 +1206,13 @@ define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1)
 define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $63, %k0, %k0
+; AVX512-NEXT:    kmovb 63(%rdi), %k0
 ; AVX512-NEXT:    kmovb %k0, (%rsi)
 ; AVX512-NEXT:    retq
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $63, %k0, %k0
-; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
+; AVX512NOTDQ-NEXT:    movb 63(%rdi), %al
 ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT:    retq
     %d0 = load <64 x i1>, <64 x i1>* %a0
@@ -1312,8 +1223,7 @@ define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $62, %k0, %k0
+; AVX512-NEXT:    kmovb 62(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT:    vpmovq2m %xmm0, %k0
@@ -1322,8 +1232,8 @@ define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 62(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
@@ -1339,8 +1249,7 @@ define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $60, %k0, %k0
+; AVX512-NEXT:    kmovb 60(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT:    vpmovd2m %xmm0, %k0
@@ -1349,8 +1258,8 @@ define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 60(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
@@ -1366,8 +1275,7 @@ define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $56, %k0, %k0
+; AVX512-NEXT:    kmovb 56(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %ymm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
 ; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
@@ -1378,8 +1286,8 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $56, %k0, %k1
+; AVX512NOTDQ-NEXT:    movzbl 56(%rdi), %eax
+; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
@@ -1397,8 +1305,7 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    kmovq (%rdi), %k0
-; AVX512-NEXT:    kshiftrq $48, %k0, %k0
+; AVX512-NEXT:    kmovw 48(%rdi), %k0
 ; AVX512-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
@@ -1409,8 +1316,7 @@ define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1)
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
 ; AVX512NOTDQ:       # %bb.0:
-; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT:    kshiftrq $48, %k0, %k1
+; AVX512NOTDQ-NEXT:    kmovw 48(%rdi), %k1
 ; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll
index 29ab76d4d37255cac8385e29a25e5696f63ad53a..e9a70f3bfe23c08d552da4fac9417c3503630844 100644
--- a/test/CodeGen/X86/avx512-fma.ll
+++ b/test/CodeGen/X86/avx512-fma.ll
@@ -5,7 +5,7 @@
 define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; ALL-LABEL: test_x86_fmadd_ps_z:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
 ; ALL-NEXT:    retq
   %x = fmul <16 x float> %a0, %a1
   %res = fadd <16 x float> %x, %a2
@@ -15,7 +15,7 @@ define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16
 define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; ALL-LABEL: test_x86_fmsub_ps_z:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
 ; ALL-NEXT:    retq
   %x = fmul <16 x float> %a0, %a1
   %res = fsub <16 x float> %x, %a2
@@ -25,7 +25,7 @@ define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16
 define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; ALL-LABEL: test_x86_fnmadd_ps_z:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
 ; ALL-NEXT:    retq
   %x = fmul <16 x float> %a0, %a1
   %res = fsub <16 x float> %a2, %x
@@ -35,7 +35,7 @@ define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <1
 define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; ALL-LABEL: test_x86_fnmsub_ps_z:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
 ; ALL-NEXT:    retq
   %x = fmul <16 x float> %a0, %a1
   %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
@@ -49,7 +49,7 @@ define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <1
 define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; ALL-LABEL: test_x86_fmadd_pd_z:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
 ; ALL-NEXT:    retq
   %x = fmul <8 x double> %a0, %a1
   %res = fadd <8 x double> %x, %a2
@@ -59,7 +59,7 @@ define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8
 define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; ALL-LABEL: test_x86_fmsub_pd_z:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmsub213pd %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
 ; ALL-NEXT:    retq
   %x = fmul <8 x double> %a0, %a1
   %res = fsub <8 x double> %x, %a2
@@ -69,7 +69,7 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8
 define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
 ; ALL-LABEL: test_x86_fmsub_213:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0
+; ALL-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; ALL-NEXT:    retq
   %x = fmul double %a0, %a1
   %res = fsub double %x, %a2
@@ -79,7 +79,7 @@ define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
 define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
 ; ALL-LABEL: test_x86_fmsub_213_m:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0
+; ALL-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
 ; ALL-NEXT:    retq
   %a2 = load double , double *%a2_ptr
   %x = fmul double %a0, %a1
@@ -90,7 +90,7 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
 define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
 ; ALL-LABEL: test_x86_fmsub_231_m:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmsub132sd (%rdi), %xmm1, %xmm0
+; ALL-NEXT:    vfmsub132sd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1
 ; ALL-NEXT:    retq
   %a2 = load double , double *%a2_ptr
   %x = fmul double %a0, %a2
@@ -101,7 +101,7 @@ define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
 define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
 ; ALL-LABEL: test231_br:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmadd132ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
+; ALL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1
 ; ALL-NEXT:    retq
   %b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
   %b2 = fadd <16 x float> %b1, %a2
@@ -111,7 +111,7 @@ define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
 define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
 ; ALL-LABEL: test213_br:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
+; ALL-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
 ; ALL-NEXT:    retq
   %b1 = fmul <16 x float> %a1, %a2
   %b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
@@ -122,17 +122,17 @@ define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
 define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
 ; KNL-LABEL: test_x86_fmadd132_ps:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; KNL-NEXT:    vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
+; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_x86_fmadd132_ps:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpsllw $7, %xmm2, %xmm2
 ; SKX-NEXT:    vpmovb2m %xmm2, %k1
-; SKX-NEXT:    vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1
 ; SKX-NEXT:    retq
   %a2   = load <16 x float>,<16 x float> *%a2_ptrt,align 1
   %x = fmul <16 x float> %a0, %a2
@@ -145,10 +145,10 @@ define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <1
 define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
 ; KNL-LABEL: test_x86_fmadd231_ps:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; KNL-NEXT:    vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
+; KNL-NEXT:    vfmadd231ps {{.*#+}} zmm1 = (zmm0 * mem) + zmm1
 ; KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL-NEXT:    retq
 ;
@@ -156,7 +156,7 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <1
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpsllw $7, %xmm2, %xmm2
 ; SKX-NEXT:    vpmovb2m %xmm2, %k1
-; SKX-NEXT:    vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
+; SKX-NEXT:    vfmadd231ps {{.*#+}} zmm1 = (zmm0 * mem) + zmm1
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
   %a2   = load <16 x float>,<16 x float> *%a2_ptrt,align 1
@@ -170,10 +170,10 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <1
 define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
 ; KNL-LABEL: test_x86_fmadd213_ps:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; KNL-NEXT:    vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
+; KNL-NEXT:    vfmadd213ps {{.*#+}} zmm1 = (zmm0 * zmm1) + mem
 ; KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL-NEXT:    retq
 ;
@@ -181,7 +181,7 @@ define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <1
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpsllw $7, %xmm2, %xmm2
 ; SKX-NEXT:    vpmovb2m %xmm2, %k1
-; SKX-NEXT:    vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
+; SKX-NEXT:    vfmadd213ps {{.*#+}} zmm1 = (zmm0 * zmm1) + mem
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
   %a2   = load <16 x float>,<16 x float> *%a2_ptrt,align 1
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 6944d3ea27b64d566a50b1b7bcce6a7219f9c3ef..05542ac1c97d29e343d72d17e8d4caf6d41f021c 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1710,11 +1710,11 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
 ; KNL-NEXT:    testb %dil, %dil
 ; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
 ; KNL-NEXT:    setne (%rsp,%rsi)
-; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1776,25 +1776,21 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
 ; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
 ; KNL-NEXT:    setne (%rsp,%rsi)
-; KNL-NEXT:    vmovdqa (%rsp), %ymm0
-; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %edx
-; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1961,51 +1957,43 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
 ; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vmovdqa %ymm2, (%rsp)
 ; KNL-NEXT:    setne (%rsp,%rax)
-; KNL-NEXT:    vmovdqa (%rsp), %ymm2
-; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm3
-; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm4
-; KNL-NEXT:    vpslld $31, %zmm4, %zmm4
-; KNL-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
-; KNL-NEXT:    vpmovsxbd %xmm3, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %edx
-; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm2
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    shll $16, %eax
 ; KNL-NEXT:    orl %edx, %eax
 ; KNL-NEXT:    shlq $32, %rax
 ; KNL-NEXT:    orq %rcx, %rax
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %esi
 ; KNL-NEXT:    shll $16, %esi
 ; KNL-NEXT:    orl %ecx, %esi
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
-; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %edx
@@ -2181,51 +2169,43 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
 ; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
 ; KNL-NEXT:    setne (%rsp,%rsi)
-; KNL-NEXT:    vmovdqa (%rsp), %ymm2
-; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm3
-; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm4
-; KNL-NEXT:    vpslld $31, %zmm4, %zmm4
-; KNL-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
-; KNL-NEXT:    vpmovsxbd %xmm3, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %edx
-; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm2
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    shll $16, %eax
 ; KNL-NEXT:    orl %edx, %eax
 ; KNL-NEXT:    shlq $32, %rax
 ; KNL-NEXT:    orq %rcx, %rax
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %esi
 ; KNL-NEXT:    shll $16, %esi
 ; KNL-NEXT:    orl %ecx, %esi
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
-; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %edx
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 8494bf2924bca0ab9c9ad72cc4577a26786fd7c5..53a076087fe3cc723c272dd13e7d2e32a8c5ed56 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -3354,8 +3354,8 @@ declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>,
 define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
 ; CHECK-LABEL: test_getexp_sd:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm3
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm4
 ; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z}
@@ -3393,11 +3393,11 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8
 define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmplesd %xmm1, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %edx
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %esi
 ; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
@@ -3438,11 +3438,11 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %
 define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpless %xmm1, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %edx
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpneqss %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %esi
 ; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
@@ -3500,8 +3500,8 @@ declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>
 define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm3
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm4
 ; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm5 {%k1} {z}
@@ -3525,8 +3525,8 @@ declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i
 define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm3
 ; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vaddps %xmm4, %xmm2, %xmm2
@@ -4220,9 +4220,9 @@ declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>,
 define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm3
 ; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm4
 ; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
@@ -4299,9 +4299,9 @@ declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double
 define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovapd %xmm0, %xmm3
 ; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovapd %xmm0, %xmm4
 ; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 13ffb9f65bf76f9d809d70987b22342bfcb13039..5996de5455d75a4c67c48da309584f9ac1ffc721 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -152,8 +152,8 @@ define i16 @mand16(i16 %x, i16 %y) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    xorl %esi, %ecx
-; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    andl %esi, %ecx
+; CHECK-NEXT:    xorl %esi, %eax
 ; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
@@ -613,7 +613,7 @@ false:
 define void @test7(<8 x i1> %mask)  {
 ; KNL-LABEL: test7:
 ; KNL:       ## %bb.0: ## %allocas
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -640,7 +640,7 @@ define void @test7(<8 x i1> %mask)  {
 ;
 ; AVX512DQ-LABEL: test7:
 ; AVX512DQ:       ## %bb.0: ## %allocas
-; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovq2m %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, %eax
@@ -763,10 +763,10 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
 ; KNL-NEXT:    cmpl %esi, %edi
 ; KNL-NEXT:    jg LBB18_1
 ; KNL-NEXT:  ## %bb.2:
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL-NEXT:    jmp LBB18_3
 ; KNL-NEXT:  LBB18_1:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:  LBB18_3:
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
@@ -810,10 +810,10 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
 ; AVX512DQ-NEXT:    cmpl %esi, %edi
 ; AVX512DQ-NEXT:    jg LBB18_1
 ; AVX512DQ-NEXT:  ## %bb.2:
-; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512DQ-NEXT:    jmp LBB18_3
 ; AVX512DQ-NEXT:  LBB18_1:
-; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:  LBB18_3:
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
@@ -1671,7 +1671,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
 ; KNL-LABEL: store_v8i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1699,7 +1699,7 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
 ;
 ; AVX512DQ-LABEL: store_v8i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovq2m %zmm0, %k0
 ; AVX512DQ-NEXT:    knotb %k0, %k0
@@ -1723,7 +1723,7 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
 define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
 ; KNL-LABEL: store_v16i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
@@ -1749,7 +1749,7 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
 ;
 ; AVX512DQ-LABEL: store_v16i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT:    knotw %k0, %k0
@@ -2463,7 +2463,7 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) {
 define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
 ; KNL-LABEL: store_8i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -2489,7 +2489,7 @@ define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
 ;
 ; AVX512DQ-LABEL: store_8i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovq2m %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
@@ -2510,7 +2510,7 @@ define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
 define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
 ; KNL-LABEL: store_8i1_1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -2536,7 +2536,7 @@ define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
 ;
 ; AVX512DQ-LABEL: store_8i1_1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovq2m %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
@@ -2558,7 +2558,7 @@ define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
 define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
 ; KNL-LABEL: store_16i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
@@ -2582,7 +2582,7 @@ define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
 ;
 ; AVX512DQ-LABEL: store_16i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
@@ -2603,11 +2603,11 @@ define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
 define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
 ; KNL-LABEL: store_32i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    kmovw %k1, 2(%rdi)
@@ -2633,11 +2633,11 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
 ;
 ; AVX512DQ-LABEL: store_32i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
 ; AVX512DQ-NEXT:    kmovw %k1, 2(%rdi)
@@ -2660,10 +2660,10 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
 define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
 ; KNL-LABEL: store_32i1_1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    vpmovsxwd %ymm1, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    kmovw %k1, 2(%rdi)
@@ -2689,10 +2689,10 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
 ;
 ; AVX512DQ-LABEL: store_32i1_1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
 ; AVX512DQ-NEXT:    kmovw %k1, 2(%rdi)
@@ -2718,16 +2718,16 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ;
 ; KNL-LABEL: store_64i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT:    vpmovsxbd %xmm3, %zmm0
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k3
 ; KNL-NEXT:    kmovw %k3, 6(%rdi)
@@ -2755,16 +2755,16 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ;
 ; AVX512DQ-LABEL: store_64i1:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT:    vpmovsxbd %xmm2, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
-; AVX512DQ-NEXT:    vpmovsxbd %xmm3, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
 ; AVX512DQ-NEXT:    kmovw %k3, 6(%rdi)
diff --git a/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index f199cb097aa472949876bf4c1f43bc3998ef7d59..89dfc69fcd5558b6fce1c7b68b71962835e028c7 100644
--- a/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -111,7 +111,7 @@ declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x
 define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
 ; AVX512F-LABEL: test24:
 ; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
@@ -134,7 +134,7 @@ define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
 define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
 ; AVX512F-LABEL: test_store_16i64:
 ; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
@@ -160,7 +160,7 @@ declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %p
 define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
 ; AVX512F-LABEL: test_store_16f64:
 ; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
@@ -186,7 +186,7 @@ declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x doubl
 define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
 ; AVX512F-LABEL: test_load_16i64:
 ; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
@@ -210,7 +210,7 @@ declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16
 define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
 ; AVX512F-LABEL: test_load_16f64:
 ; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
@@ -235,10 +235,10 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
 ; AVX512F-LABEL: test_load_32f64:
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX512F-NEXT:    vpmovsxbd %xmm5, %zmm5
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero,xmm5[8],zero,zero,zero,xmm5[9],zero,zero,zero,xmm5[10],zero,zero,zero,xmm5[11],zero,zero,zero,xmm5[12],zero,zero,zero,xmm5[13],zero,zero,zero,xmm5[14],zero,zero,zero,xmm5[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm5, %zmm5
 ; AVX512F-NEXT:    vptestmd %zmm5, %zmm5, %k1
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
 ; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k2}
diff --git a/test/CodeGen/X86/avx512-memfold.ll b/test/CodeGen/X86/avx512-memfold.ll
index 4e129505668d9f21d0a0264075c97f319ee5d558..d80891868d0b1f3c22ac381beb408b391ac60fce 100644
--- a/test/CodeGen/X86/avx512-memfold.ll
+++ b/test/CodeGen/X86/avx512-memfold.ll
@@ -72,8 +72,8 @@ define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x do
 define <4 x float> @test_mask_add_ss_double_use(<4 x float> %a, float* %b, i8 %mask, <4 x float> %c) {
 ; CHECK-LABEL: test_mask_add_ss_double_use:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vaddss %xmm2, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vaddss %xmm2, %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vmulps %xmm0, %xmm1, %xmm0
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 5ce1705e37740f9fdcca63774543fdb024449aa4..1136a3a50693e8b6a9f695cca549f8d0a91828ec 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -635,38 +635,49 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a)  {
 }
 
 ; Test regcall when receiving/returning 128 bit vector
-define x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %b)  {
+define x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i1> %x, <4 x i32> %a, <4 x i32> %b)  {
 ; X32-LABEL: test_argRet128Vector:
 ; X32:       # %bb.0:
-; X32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; X32-NEXT:    vpslld $31, %xmm0, %xmm0
+; X32-NEXT:    vpmovd2m %xmm0, %k1
+; X32-NEXT:    vpblendmd %xmm1, %xmm2, %xmm0 {%k1}
 ; X32-NEXT:    retl
 ;
 ; WIN64-LABEL: test_argRet128Vector:
 ; WIN64:       # %bb.0:
-; WIN64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; WIN64-NEXT:    vpslld $31, %xmm0, %xmm0
+; WIN64-NEXT:    vpmovd2m %xmm0, %k1
+; WIN64-NEXT:    vpblendmd %xmm1, %xmm2, %xmm0 {%k1}
 ; WIN64-NEXT:    retq
 ;
 ; LINUXOSX64-LABEL: test_argRet128Vector:
 ; LINUXOSX64:       # %bb.0:
-; LINUXOSX64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; LINUXOSX64-NEXT:    vpslld $31, %xmm0, %xmm0
+; LINUXOSX64-NEXT:    vpmovd2m %xmm0, %k1
+; LINUXOSX64-NEXT:    vpblendmd %xmm1, %xmm2, %xmm0 {%k1}
 ; LINUXOSX64-NEXT:    retq
-  %d = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b
+  %d = select <4 x i1> %x, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %d
 }
 
 ; Test regcall when passing/retrieving 128 bit vector
-define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a)  {
+define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i1> %x, <4 x i32> %a)  {
 ; X32-LABEL: test_CallargRet128Vector:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %esp
-; X32-NEXT:    subl $24, %esp
-; X32-NEXT:    vmovups %xmm4, (%esp) # 16-byte Spill
-; X32-NEXT:    vmovdqa %xmm0, %xmm4
-; X32-NEXT:    vmovdqa %xmm0, %xmm1
+; X32-NEXT:    subl $40, %esp
+; X32-NEXT:    vmovups %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32-NEXT:    vmovdqa %xmm1, %xmm4
+; X32-NEXT:    vpslld $31, %xmm0, %xmm1
+; X32-NEXT:    vpmovd2m %xmm1, %k1
+; X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; X32-NEXT:    vmovdqa %xmm4, %xmm1
+; X32-NEXT:    vmovdqa %xmm4, %xmm2
 ; X32-NEXT:    calll _test_argRet128Vector
+; X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
 ; X32-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
-; X32-NEXT:    vmovups (%esp), %xmm4 # 16-byte Reload
-; X32-NEXT:    addl $24, %esp
+; X32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload
+; X32-NEXT:    addl $40, %esp
 ; X32-NEXT:    popl %esp
 ; X32-NEXT:    retl
 ;
@@ -674,17 +685,22 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a)  {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    pushq %rsp
 ; WIN64-NEXT:    .seh_pushreg 4
-; WIN64-NEXT:    subq $16, %rsp
-; WIN64-NEXT:    .seh_stackalloc 16
-; WIN64-NEXT:    vmovaps %xmm8, (%rsp) # 16-byte Spill
-; WIN64-NEXT:    .seh_savexmm 8, 0
+; WIN64-NEXT:    subq $32, %rsp
+; WIN64-NEXT:    .seh_stackalloc 32
+; WIN64-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    .seh_savexmm 8, 16
 ; WIN64-NEXT:    .seh_endprologue
-; WIN64-NEXT:    vmovdqa %xmm0, %xmm8
-; WIN64-NEXT:    vmovdqa %xmm0, %xmm1
+; WIN64-NEXT:    vmovdqa %xmm1, %xmm8
+; WIN64-NEXT:    vpslld $31, %xmm0, %xmm1
+; WIN64-NEXT:    vpmovd2m %xmm1, %k1
+; WIN64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-NEXT:    vmovdqa %xmm8, %xmm1
+; WIN64-NEXT:    vmovdqa %xmm8, %xmm2
 ; WIN64-NEXT:    callq test_argRet128Vector
+; WIN64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; WIN64-NEXT:    vmovdqa32 %xmm8, %xmm0 {%k1}
-; WIN64-NEXT:    vmovaps (%rsp), %xmm8 # 16-byte Reload
-; WIN64-NEXT:    addq $16, %rsp
+; WIN64-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; WIN64-NEXT:    addq $32, %rsp
 ; WIN64-NEXT:    popq %rsp
 ; WIN64-NEXT:    retq
 ; WIN64-NEXT:    .seh_handlerdata
@@ -695,58 +711,69 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a)  {
 ; LINUXOSX64:       # %bb.0:
 ; LINUXOSX64-NEXT:    pushq %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 16
-; LINUXOSX64-NEXT:    subq $16, %rsp
-; LINUXOSX64-NEXT:    vmovaps %xmm8, (%rsp) # 16-byte Spill
-; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 32
+; LINUXOSX64-NEXT:    subq $32, %rsp
+; LINUXOSX64-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 48
 ; LINUXOSX64-NEXT:    .cfi_offset %rsp, -16
 ; LINUXOSX64-NEXT:    .cfi_offset %xmm8, -32
-; LINUXOSX64-NEXT:    vmovdqa %xmm0, %xmm8
-; LINUXOSX64-NEXT:    vmovdqa %xmm0, %xmm1
+; LINUXOSX64-NEXT:    vmovdqa %xmm1, %xmm8
+; LINUXOSX64-NEXT:    vpslld $31, %xmm0, %xmm1
+; LINUXOSX64-NEXT:    vpmovd2m %xmm1, %k1
+; LINUXOSX64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; LINUXOSX64-NEXT:    vmovdqa %xmm8, %xmm1
+; LINUXOSX64-NEXT:    vmovdqa %xmm8, %xmm2
 ; LINUXOSX64-NEXT:    callq test_argRet128Vector
+; LINUXOSX64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; LINUXOSX64-NEXT:    vmovdqa32 %xmm8, %xmm0 {%k1}
-; LINUXOSX64-NEXT:    vmovaps (%rsp), %xmm8 # 16-byte Reload
-; LINUXOSX64-NEXT:    addq $16, %rsp
+; LINUXOSX64-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; LINUXOSX64-NEXT:    addq $32, %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 16
 ; LINUXOSX64-NEXT:    popq %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 8
 ; LINUXOSX64-NEXT:    retq
-  %b = call x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %a)
-  %c = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b
+  %b = call x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i1> %x, <4 x i32> %a, <4 x i32> %a)
+  %c = select <4 x i1> %x, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %c
 }
 
 ; Test regcall when receiving/returning 256 bit vector
-define x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %b)  {
+define x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i1> %x, <8 x i32> %a, <8 x i32> %b)  {
 ; X32-LABEL: test_argRet256Vector:
 ; X32:       # %bb.0:
+; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; X32-NEXT:    retl
 ;
 ; WIN64-LABEL: test_argRet256Vector:
 ; WIN64:       # %bb.0:
+; WIN64-NEXT:    kmovd %eax, %k1
 ; WIN64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; WIN64-NEXT:    retq
 ;
 ; LINUXOSX64-LABEL: test_argRet256Vector:
 ; LINUXOSX64:       # %bb.0:
+; LINUXOSX64-NEXT:    kmovd %eax, %k1
 ; LINUXOSX64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; LINUXOSX64-NEXT:    retq
-  %d = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b
+  %d = select <8 x i1> %x, <8 x i32> %a, <8 x i32> %b
   ret <8 x i32> %d
 }
 
 ; Test regcall when passing/retrieving 256 bit vector
-define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a)  {
+define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i1> %x, <8 x i32> %a)  {
 ; X32-LABEL: test_CallargRet256Vector:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %esp
-; X32-NEXT:    subl $56, %esp
-; X32-NEXT:    vmovdqu %ymm0, (%esp) # 32-byte Spill
+; X32-NEXT:    subl $88, %esp
+; X32-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
+; X32-NEXT:    kmovd %eax, %k1
+; X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
 ; X32-NEXT:    vmovdqa %ymm0, %ymm1
 ; X32-NEXT:    calll _test_argRet256Vector
-; X32-NEXT:    vmovdqu (%esp), %ymm1 # 32-byte Reload
+; X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
+; X32-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %ymm1 # 32-byte Reload
 ; X32-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
-; X32-NEXT:    addl $56, %esp
+; X32-NEXT:    addl $88, %esp
 ; X32-NEXT:    popl %esp
 ; X32-NEXT:    retl
 ;
@@ -754,15 +781,18 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a)  {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    pushq %rsp
 ; WIN64-NEXT:    .seh_pushreg 4
-; WIN64-NEXT:    subq $48, %rsp
-; WIN64-NEXT:    .seh_stackalloc 48
+; WIN64-NEXT:    subq $80, %rsp
+; WIN64-NEXT:    .seh_stackalloc 80
 ; WIN64-NEXT:    .seh_endprologue
-; WIN64-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; WIN64-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; WIN64-NEXT:    kmovd %eax, %k1
+; WIN64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; WIN64-NEXT:    vmovdqa %ymm0, %ymm1
 ; WIN64-NEXT:    callq test_argRet256Vector
-; WIN64-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; WIN64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; WIN64-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; WIN64-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
-; WIN64-NEXT:    addq $48, %rsp
+; WIN64-NEXT:    addq $80, %rsp
 ; WIN64-NEXT:    popq %rsp
 ; WIN64-NEXT:    retq
 ; WIN64-NEXT:    .seh_handlerdata
@@ -773,56 +803,65 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a)  {
 ; LINUXOSX64:       # %bb.0:
 ; LINUXOSX64-NEXT:    pushq %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 16
-; LINUXOSX64-NEXT:    subq $48, %rsp
-; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 64
+; LINUXOSX64-NEXT:    subq $80, %rsp
+; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 96
 ; LINUXOSX64-NEXT:    .cfi_offset %rsp, -16
-; LINUXOSX64-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; LINUXOSX64-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; LINUXOSX64-NEXT:    kmovd %eax, %k1
+; LINUXOSX64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; LINUXOSX64-NEXT:    vmovdqa %ymm0, %ymm1
 ; LINUXOSX64-NEXT:    callq test_argRet256Vector
-; LINUXOSX64-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; LINUXOSX64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; LINUXOSX64-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; LINUXOSX64-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
-; LINUXOSX64-NEXT:    addq $48, %rsp
+; LINUXOSX64-NEXT:    addq $80, %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 16
 ; LINUXOSX64-NEXT:    popq %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 8
 ; LINUXOSX64-NEXT:    retq
-  %b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %a)
-  %c = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b
+  %b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i1> %x, <8 x i32> %a, <8 x i32> %a)
+  %c = select <8 x i1> %x, <8 x i32> %a, <8 x i32> %b
   ret <8 x i32> %c
 }
 
 ; Test regcall when receiving/returning 512 bit vector
-define x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %b)  {
+define x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i1> %x, <16 x i32> %a, <16 x i32> %b)  {
 ; X32-LABEL: test_argRet512Vector:
 ; X32:       # %bb.0:
+; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; X32-NEXT:    retl
 ;
 ; WIN64-LABEL: test_argRet512Vector:
 ; WIN64:       # %bb.0:
+; WIN64-NEXT:    kmovd %eax, %k1
 ; WIN64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; WIN64-NEXT:    retq
 ;
 ; LINUXOSX64-LABEL: test_argRet512Vector:
 ; LINUXOSX64:       # %bb.0:
+; LINUXOSX64-NEXT:    kmovd %eax, %k1
 ; LINUXOSX64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; LINUXOSX64-NEXT:    retq
-  %d = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b
+  %d = select <16 x i1> %x, <16 x i32> %a, <16 x i32> %b
   ret <16 x i32> %d
 }
 
 ; Test regcall when passing/retrieving 512 bit vector
-define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a)  {
+define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i1> %x, <16 x i32> %a)  {
 ; X32-LABEL: test_CallargRet512Vector:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %esp
-; X32-NEXT:    subl $120, %esp
-; X32-NEXT:    vmovdqu64 %zmm0, (%esp) # 64-byte Spill
+; X32-NEXT:    subl $184, %esp
+; X32-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
+; X32-NEXT:    kmovd %eax, %k1
+; X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
 ; X32-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; X32-NEXT:    calll _test_argRet512Vector
-; X32-NEXT:    vmovdqu64 (%esp), %zmm1 # 64-byte Reload
+; X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
+; X32-NEXT:    vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm1 # 64-byte Reload
 ; X32-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X32-NEXT:    addl $120, %esp
+; X32-NEXT:    addl $184, %esp
 ; X32-NEXT:    popl %esp
 ; X32-NEXT:    retl
 ;
@@ -830,15 +869,18 @@ define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a)  {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    pushq %rsp
 ; WIN64-NEXT:    .seh_pushreg 4
-; WIN64-NEXT:    subq $112, %rsp
-; WIN64-NEXT:    .seh_stackalloc 112
+; WIN64-NEXT:    subq $176, %rsp
+; WIN64-NEXT:    .seh_stackalloc 176
 ; WIN64-NEXT:    .seh_endprologue
-; WIN64-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; WIN64-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-NEXT:    kmovd %eax, %k1
+; WIN64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; WIN64-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; WIN64-NEXT:    callq test_argRet512Vector
-; WIN64-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; WIN64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; WIN64-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; WIN64-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; WIN64-NEXT:    addq $112, %rsp
+; WIN64-NEXT:    addq $176, %rsp
 ; WIN64-NEXT:    popq %rsp
 ; WIN64-NEXT:    retq
 ; WIN64-NEXT:    .seh_handlerdata
@@ -849,21 +891,24 @@ define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a)  {
 ; LINUXOSX64:       # %bb.0:
 ; LINUXOSX64-NEXT:    pushq %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 16
-; LINUXOSX64-NEXT:    subq $112, %rsp
-; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 128
+; LINUXOSX64-NEXT:    subq $176, %rsp
+; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 192
 ; LINUXOSX64-NEXT:    .cfi_offset %rsp, -16
-; LINUXOSX64-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; LINUXOSX64-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; LINUXOSX64-NEXT:    kmovd %eax, %k1
+; LINUXOSX64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; LINUXOSX64-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; LINUXOSX64-NEXT:    callq test_argRet512Vector
-; LINUXOSX64-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; LINUXOSX64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; LINUXOSX64-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; LINUXOSX64-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; LINUXOSX64-NEXT:    addq $112, %rsp
+; LINUXOSX64-NEXT:    addq $176, %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 16
 ; LINUXOSX64-NEXT:    popq %rsp
 ; LINUXOSX64-NEXT:    .cfi_def_cfa_offset 8
 ; LINUXOSX64-NEXT:    retq
-  %b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %a)
-  %c = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b
+  %b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i1> %x, <16 x i32> %a, <16 x i32> %a)
+  %c = select <16 x i1> %x, <16 x i32> %a, <16 x i32> %b
   ret <16 x i32> %c
 }
 
@@ -1252,17 +1297,17 @@ define x86_regcallcc %struct.complex @test_argMultiRet(float, double, i32, i8, i
 ; WIN64-LABEL: test_argMultiRet:
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    vaddsd __real@{{.*}}(%rip), %xmm1, %xmm1
+; WIN64-NEXT:    movl $999, %edx # imm = 0x3E7
 ; WIN64-NEXT:    movl $4, %eax
 ; WIN64-NEXT:    movb $7, %cl
-; WIN64-NEXT:    movl $999, %edx # imm = 0x3E7
 ; WIN64-NEXT:    retq
 ;
 ; LINUXOSX64-LABEL: test_argMultiRet:
 ; LINUXOSX64:       # %bb.0:
 ; LINUXOSX64-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
+; LINUXOSX64-NEXT:    movl $999, %edx # imm = 0x3E7
 ; LINUXOSX64-NEXT:    movl $4, %eax
 ; LINUXOSX64-NEXT:    movb $7, %cl
-; LINUXOSX64-NEXT:    movl $999, %edx # imm = 0x3E7
 ; LINUXOSX64-NEXT:    retq
   %6 = fadd double %1, 5.000000e+00
   %7 = insertvalue %struct.complex undef, float %0, 0
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 7ea2aef6ba4c77e58ee68acebbfaa11a09964435..5c44d8679b99c19b3a1f0a3225451ed149c6f69d 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -1148,9 +1148,10 @@ define float @test5(float %p) #0 {
 ; GENERIC-NEXT:  # %bb.2: # %return
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ; GENERIC-NEXT:  .LBB67_1: # %if.end
-; GENERIC-NEXT:    seta %al # sched: [2:1.00]
-; GENERIC-NEXT:    movzbl %al, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    vcmpltss %xmm0, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
 ; GENERIC-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test5:
@@ -1162,9 +1163,10 @@ define float @test5(float %p) #0 {
 ; SKX-NEXT:  # %bb.2: # %return
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ; SKX-NEXT:  .LBB67_1: # %if.end
-; SKX-NEXT:    seta %al # sched: [2:1.00]
-; SKX-NEXT:    movzbl %al, %eax # sched: [1:0.25]
+; SKX-NEXT:    vcmpltss %xmm0, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   %cmp = fcmp oeq float %p, 0.000000e+00
@@ -1447,13 +1449,13 @@ define <4 x float> @slto4f32(<4 x i64> %a) {
 ; GENERIC-LABEL: slto4f32:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvtqq2ps %ymm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: slto4f32:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcvtqq2ps %ymm0, %xmm0 # sched: [7:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %b = sitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %b
@@ -1463,13 +1465,13 @@ define <4 x float> @ulto4f32(<4 x i64> %a) {
 ; GENERIC-LABEL: ulto4f32:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvtuqq2ps %ymm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: ulto4f32:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcvtuqq2ps %ymm0, %xmm0 # sched: [7:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %b = uitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %b
@@ -1538,14 +1540,14 @@ define <16 x i8> @f32to16uc(<16 x float> %f) {
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpmovdb %zmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f32to16uc:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcvttps2dq %zmm0, %zmm0 # sched: [4:0.50]
 ; SKX-NEXT:    vpmovdb %zmm0, %xmm0 # sched: [4:2.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %res = fptoui <16 x float> %f to <16 x i8>
   ret <16 x i8> %res
@@ -1614,14 +1616,14 @@ define <8 x i16> @f64to8us(<8 x double> %f) {
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvttpd2dq %zmm0, %ymm0 # sched: [4:1.00]
 ; GENERIC-NEXT:    vpmovdw %ymm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f64to8us:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00]
 ; SKX-NEXT:    vpmovdw %ymm0, %xmm0 # sched: [4:2.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %res = fptoui <8 x double> %f to <8 x i16>
   ret <8 x i16> %res
@@ -1632,14 +1634,14 @@ define <8 x i8> @f64to8uc(<8 x double> %f) {
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvttpd2dq %zmm0, %ymm0 # sched: [4:1.00]
 ; GENERIC-NEXT:    vpmovdw %ymm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f64to8uc:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00]
 ; SKX-NEXT:    vpmovdw %ymm0, %xmm0 # sched: [4:2.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %res = fptoui <8 x double> %f to <8 x i8>
   ret <8 x i8> %res
@@ -1649,13 +1651,13 @@ define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
 ; GENERIC-LABEL: f64to4ui:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvttpd2udq %ymm0, %xmm0 # sched: [4:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f64to4ui:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcvttpd2udq %ymm0, %xmm0 # sched: [7:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %b = fptoui <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
@@ -1737,13 +1739,13 @@ define <4 x i32> @f64to4si(<4 x double> %a) {
 ; GENERIC-LABEL: f64to4si:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f64to4si:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %b = fptosi <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
@@ -1771,13 +1773,13 @@ define <4 x float> @f64to4f32(<4 x double> %b) {
 ; GENERIC-LABEL: f64to4f32:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f64to4f32:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %a = fptrunc <4 x double> %b to <4 x float>
   ret <4 x float> %a
@@ -1789,7 +1791,7 @@ define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
 ; GENERIC-NEXT:    vpslld $31, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm1, %k1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [4:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f64to4f32_mask:
@@ -1797,7 +1799,7 @@ define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1 # sched: [1:0.50]
 ; SKX-NEXT:    vpmovd2m %xmm1, %k1 # sched: [1:1.00]
 ; SKX-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [7:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %a = fptrunc <4 x double> %b to <4 x float>
   %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
@@ -4307,7 +4309,7 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
 ; GENERIC-NEXT:    vpmovd2m %zmm0, %k0 # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    # kill: def $ax killed $ax killed $eax
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: trunc_16i32_to_16i1:
@@ -4316,7 +4318,7 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
 ; SKX-NEXT:    vpmovd2m %zmm0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask_b = trunc <16 x i32>%a to <16 x i1>
   %mask = bitcast <16 x i1> %mask_b to i16
@@ -4422,14 +4424,14 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0 # sched: [1:0.50]
 ; GENERIC-NEXT:    vpmovm2w %k0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8i1_8i16:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0 # sched: [1:0.25]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %x = icmp slt <8 x i32> %a1, %a2
   %y = sext <8 x i1> %x to <8 x i16>
@@ -4475,14 +4477,14 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpmovsxbq (%rdi), %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: extload_v8i64:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %sign_load = load <8 x i8>, <8 x i8>* %a
   %c = sext <8 x i8> %sign_load to <8 x i64>
@@ -6124,13 +6126,13 @@ define void @mov_test18(i8 * %addr, <8 x i64> %data) {
 ; GENERIC-LABEL: mov_test18:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test18:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vaddr = bitcast i8* %addr to <8 x i64>*
   store <8 x i64>%data, <8 x i64>* %vaddr, align 64
@@ -6141,13 +6143,13 @@ define void @mov_test19(i8 * %addr, <16 x i32> %data) {
 ; GENERIC-LABEL: mov_test19:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test19:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vaddr = bitcast i8* %addr to <16 x i32>*
   store <16 x i32>%data, <16 x i32>* %vaddr, align 1
@@ -6158,13 +6160,13 @@ define void @mov_test20(i8 * %addr, <16 x i32> %data) {
 ; GENERIC-LABEL: mov_test20:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test20:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vaddr = bitcast i8* %addr to <16 x i32>*
   store <16 x i32>%data, <16 x i32>* %vaddr, align 64
@@ -6190,13 +6192,13 @@ define void @mov_test22(i8 * %addr, <8 x i64> %data) {
 ; GENERIC-LABEL: mov_test22:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test22:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vaddr = bitcast i8* %addr to <8 x i64>*
   store <8 x i64>%data, <8 x i64>* %vaddr, align 1
@@ -6222,13 +6224,13 @@ define void @mov_test24(i8 * %addr, <8 x double> %data) {
 ; GENERIC-LABEL: mov_test24:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test24:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vaddr = bitcast i8* %addr to <8 x double>*
   store <8 x double>%data, <8 x double>* %vaddr, align 64
@@ -6254,13 +6256,13 @@ define void @mov_test26(i8 * %addr, <16 x float> %data) {
 ; GENERIC-LABEL: mov_test26:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test26:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vaddr = bitcast i8* %addr to <16 x float>*
   store <16 x float>%data, <16 x float>* %vaddr, align 64
@@ -6286,13 +6288,13 @@ define void @mov_test28(i8 * %addr, <8 x double> %data) {
 ; GENERIC-LABEL: mov_test28:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test28:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vaddr = bitcast i8* %addr to <8 x double>*
   store <8 x double>%data, <8 x double>* %vaddr, align 1
@@ -6318,13 +6320,13 @@ define void @mov_test30(i8 * %addr, <16 x float> %data) {
 ; GENERIC-LABEL: mov_test30:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test30:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vaddr = bitcast i8* %addr to <16 x float>*
   store <16 x float>%data, <16 x float>* %vaddr, align 1
@@ -6793,8 +6795,8 @@ define i16 @mand16(i16 %x, i16 %y) {
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    movl %edi, %ecx # sched: [1:0.33]
-; GENERIC-NEXT:    xorl %esi, %ecx # sched: [1:0.33]
-; GENERIC-NEXT:    andl %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    andl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT:    xorl %esi, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    orl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    # kill: def $ax killed $ax killed $eax
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -6803,8 +6805,8 @@ define i16 @mand16(i16 %x, i16 %y) {
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
 ; SKX-NEXT:    movl %edi, %ecx # sched: [1:0.25]
-; SKX-NEXT:    xorl %esi, %ecx # sched: [1:0.25]
-; SKX-NEXT:    andl %esi, %eax # sched: [1:0.25]
+; SKX-NEXT:    andl %esi, %ecx # sched: [1:0.25]
+; SKX-NEXT:    xorl %esi, %eax # sched: [1:0.25]
 ; SKX-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6877,7 +6879,7 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-NEXT:    kshiftrw $5, %k0, %k0 # sched: [1:1.00]
 ; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    andl $1, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_test1:
@@ -6886,7 +6888,7 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
 ; SKX-NEXT:    kshiftrw $5, %k0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    andl $1, %eax # sched: [1:0.25]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -6902,7 +6904,7 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    andl $1, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    # kill: def $ax killed $ax killed $eax
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_test2:
@@ -6912,7 +6914,7 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
 ; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    andl $1, %eax # sched: [1:0.25]
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -6928,7 +6930,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    andb $1, %al # sched: [1:0.33]
 ; GENERIC-NEXT:    # kill: def $al killed $al killed $eax
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_test3:
@@ -6938,7 +6940,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
 ; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    andb $1, %al # sched: [1:0.25]
 ; SKX-NEXT:    # kill: def $al killed $al killed $eax
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -6976,7 +6978,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1
 ; GENERIC-NEXT:    vpcmpleq %ymm1, %ymm0, %k1 # sched: [1:0.50]
 ; GENERIC-NEXT:    vpcmpgtq %ymm3, %ymm2, %k0 {%k1} # sched: [1:0.50]
 ; GENERIC-NEXT:    vpmovm2d %k0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test4:
@@ -6984,7 +6986,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1
 ; SKX-NEXT:    vpcmpleq %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpcmpgtq %ymm3, %ymm2, %k0 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vpmovm2d %k0, %xmm0 # sched: [1:0.25]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %x_gt_y = icmp sgt <4 x i64> %x, %y
   %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
@@ -7061,13 +7063,13 @@ define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ; GENERIC-NEXT:  # %bb.2:
 ; GENERIC-NEXT:    kxorw %k0, %k0, %k0 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpmovm2b %k0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ; GENERIC-NEXT:  .LBB386_1:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [0:0.25]
 ; GENERIC-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 # sched: [1:0.50]
 ; GENERIC-NEXT:    vpmovm2b %k0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vcmp_test8:
@@ -7077,13 +7079,13 @@ define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ; SKX-NEXT:  # %bb.2:
 ; SKX-NEXT:    kxorw %k0, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0 # sched: [1:0.25]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ; SKX-NEXT:  .LBB386_1:
 ; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0 # sched: [1:0.25]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %cond = icmp sgt i32 %a1, %b1
   %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
@@ -7617,11 +7619,11 @@ define void @ktest_1(<8 x double> %in, double * %base) {
 ; GENERIC-NEXT:    je .LBB410_2 # sched: [1:1.00]
 ; GENERIC-NEXT:  # %bb.1: # %L1
 ; GENERIC-NEXT:    vmovapd %zmm0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ; GENERIC-NEXT:  .LBB410_2: # %L2
 ; GENERIC-NEXT:    vmovapd %zmm0, 8(%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: ktest_1:
@@ -7634,11 +7636,11 @@ define void @ktest_1(<8 x double> %in, double * %base) {
 ; SKX-NEXT:    je .LBB410_2 # sched: [1:0.50]
 ; SKX-NEXT:  # %bb.1: # %L1
 ; SKX-NEXT:    vmovapd %zmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ; SKX-NEXT:  .LBB410_2: # %L2
 ; SKX-NEXT:    vmovapd %zmm0, 8(%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %addr1 = getelementptr double, double * %base, i64 0
   %addr2 = getelementptr double, double * %base, i64 1
@@ -7686,12 +7688,12 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; GENERIC-NEXT:  # %bb.1: # %L1
 ; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, 64(%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ; GENERIC-NEXT:  .LBB411_2: # %L2
 ; GENERIC-NEXT:    vmovaps %zmm0, 4(%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, 68(%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: ktest_2:
@@ -7711,12 +7713,12 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; SKX-NEXT:  # %bb.1: # %L1
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, 64(%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ; SKX-NEXT:  .LBB411_2: # %L2
 ; SKX-NEXT:    vmovaps %zmm0, 4(%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, 68(%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %addr1 = getelementptr float, float * %base, i64 0
   %addr2 = getelementptr float, float * %base, i64 1
@@ -7908,7 +7910,7 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
 ; GENERIC-NEXT:    vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %ymm0, %k0 # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovd %k0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: store_32i1:
@@ -7916,7 +7918,7 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
 ; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50]
 ; SKX-NEXT:    vpmovb2m %ymm0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    kmovd %k0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   store <32 x i1> %v, <32 x i1>* %a
   ret void
@@ -7928,7 +7930,7 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
 ; GENERIC-NEXT:    vpsllw $15, %zmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %zmm0, %k0 # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovd %k0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: store_32i1_1:
@@ -7936,7 +7938,7 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
 ; SKX-NEXT:    vpsllw $15, %zmm0, %zmm0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovw2m %zmm0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    kmovd %k0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %v1 = trunc <32 x i16> %v to <32 x i1>
   store <32 x i1> %v1, <32 x i1>* %a
@@ -7951,7 +7953,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ; GENERIC-NEXT:    vpsllw $7, %zmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %zmm0, %k0 # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovq %k0, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: store_64i1:
@@ -7959,7 +7961,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ; SKX-NEXT:    vpsllw $7, %zmm0, %zmm0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    kmovq %k0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   store <64 x i1> %v, <64 x i1>* %a
   ret void
@@ -7971,7 +7973,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k0 # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovb %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    addl %eax, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_bitcast_v8i1_zext:
@@ -7979,7 +7981,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    kmovb %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    addl %eax, %eax # sched: [1:0.25]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
    %v1 = icmp eq <16 x i32> %a, zeroinitializer
    %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -7995,7 +7997,7 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k0 # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovw %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    addl %eax, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_bitcast_v16i1_zext:
@@ -8003,7 +8005,7 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    kmovw %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    addl %eax, %eax # sched: [1:0.25]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
    %v1 = icmp eq <16 x i32> %a, zeroinitializer
    %mask1 = bitcast <16 x i1> %v1 to i16
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 90e533c09b7f6c31db8f611ac0b7ea696d546a81..1a5f78db17febcc445ca1f15603bc2fe79effee1 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -265,13 +265,13 @@ define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
 define i64 @pr30249() {
 ; X86-LABEL: pr30249:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl $2, %eax
+; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pr30249:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $2, %eax
+; X64-NEXT:    movl $1, %eax
 ; X64-NEXT:    retq
   %v = select i1 undef , i64 1, i64 2
   ret i64 %v
@@ -318,27 +318,19 @@ define float @pr30561_f32(float %b, float %a, i1 %c) {
 define <16 x i16> @pr31515(<16 x i1> %a, <16 x i1> %b, <16 x i16> %c) nounwind {
 ; X86-LABEL: pr31515:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpmovsxbd %xmm1, %zmm1
-; X86-NEXT:    vpslld $31, %zmm1, %zmm1
-; X86-NEXT:    vpmovsxbd %xmm0, %zmm0
-; X86-NEXT:    vpslld $31, %zmm0, %zmm0
-; X86-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; X86-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
-; X86-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; X86-NEXT:    vpmovdw %zmm0, %ymm0
+; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X86-NEXT:    vpsllw $15, %ymm0, %ymm0
+; X86-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; X86-NEXT:    vpandn %ymm2, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pr31515:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpmovsxbd %xmm1, %zmm1
-; X64-NEXT:    vpslld $31, %zmm1, %zmm1
-; X64-NEXT:    vpmovsxbd %xmm0, %zmm0
-; X64-NEXT:    vpslld $31, %zmm0, %zmm0
-; X64-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; X64-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
-; X64-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; X64-NEXT:    vpmovdw %zmm0, %ymm0
+; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X64-NEXT:    vpsllw $15, %ymm0, %ymm0
+; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; X64-NEXT:    vpandn %ymm2, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %mask = and <16 x i1> %a, %b
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 0768508cca977d6b3655716d5af21ea8d1f9a7b9..50dec563d575e205970539a1fbc48bf8cca48936 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -162,12 +162,11 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -176,14 +175,13 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -195,14 +193,13 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -214,14 +211,13 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp,
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
@@ -233,14 +229,13 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
@@ -252,14 +247,12 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp,
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vpsrld $16, %xmm2, %xmm3
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; CHECK-NEXT:    vpbroadcastw 2(%rdi), %xmm2
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm3
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
@@ -271,14 +264,12 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vpsrld $16, %xmm1, %xmm2
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
+; CHECK-NEXT:    vpbroadcastw 2(%rdi), %xmm1
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
@@ -290,11 +281,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp,
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -303,13 +292,11 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = mem[0],xmm2[1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -321,13 +308,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -625,10 +610,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
@@ -637,12 +621,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
@@ -654,11 +637,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp,
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -671,12 +653,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
@@ -688,11 +669,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp,
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -705,12 +685,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
+; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
@@ -722,11 +701,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp,
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -739,10 +717,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
@@ -751,12 +728,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
@@ -768,11 +744,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp,
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -785,10 +760,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp
 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    vpermi2w (%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -799,12 +773,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -817,12 +790,11 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -835,12 +807,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp,
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -853,12 +824,11 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -871,12 +841,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp,
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -889,12 +858,11 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -907,10 +875,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp,
 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -921,12 +888,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -939,12 +905,11 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -1102,10 +1067,8 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i
 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,1],xmm0[0,0]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm0
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1],mem[0,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
@@ -1114,12 +1077,10 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0]
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,1],mem[0,0]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
@@ -1131,12 +1092,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,1],mem[0,0]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
@@ -1148,12 +1107,10 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1165,12 +1122,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1182,12 +1137,10 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
@@ -1199,12 +1152,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
@@ -1216,12 +1167,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4
 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,2,3]
+; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1230,14 +1178,11 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = mem[1,1,2,3]
+; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1249,14 +1194,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = mem[1,1,2,3]
+; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1592,12 +1534,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp,
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,3,6,11,0,1,5,15]
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
+; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
@@ -1609,11 +1550,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1626,12 +1566,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp,
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,14,1,5,4,2,8,10]
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
+; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
@@ -1643,11 +1582,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1660,10 +1598,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp,
 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
@@ -1672,12 +1609,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
@@ -1689,11 +1625,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1706,10 +1641,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp,
 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = <13,0,0,6,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1720,12 +1654,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <13,0,0,6,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1738,12 +1671,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <13,0,0,6,u,u,u,u>
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1756,12 +1688,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp,
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6]
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
+; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1774,12 +1705,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,5,3,2,15,5,7,6]
+; CHECK-NEXT:    vpermi2d (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1792,12 +1722,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp,
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <2,15,6,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1810,12 +1739,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,15,6,9,u,u,u,u>
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1828,16 +1756,15 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp,
 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; CHECK-NEXT:    vmovd %xmm0, %eax
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
 ; CHECK-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
 ; CHECK-NEXT:    vpextrd $3, %xmm1, %eax
 ; CHECK-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm1
 ; CHECK-NEXT:    vpextrd $2, %xmm0, %eax
 ; CHECK-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
@@ -1846,9 +1773,9 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; CHECK-NEXT:    vmovd %xmm2, %eax
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
 ; CHECK-NEXT:    vpinsrd $1, %eax, %xmm4, %xmm4
 ; CHECK-NEXT:    vpextrd $3, %xmm3, %eax
@@ -1857,7 +1784,6 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4
 ; CHECK-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm2
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
@@ -1869,9 +1795,9 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; CHECK-NEXT:    vmovd %xmm1, %eax
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
 ; CHECK-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
 ; CHECK-NEXT:    vpextrd $3, %xmm2, %eax
@@ -1880,7 +1806,6 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp,
 ; CHECK-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
@@ -1959,10 +1884,8 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i
 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovaps (%rdi), %xmm0
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -1971,11 +1894,9 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[1]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -1987,11 +1908,9 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x
 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[1]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -2003,12 +1922,10 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
@@ -2020,12 +1937,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x
 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
@@ -2402,12 +2317,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,4]
-; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,4]
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
@@ -2419,11 +2333,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2436,12 +2349,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,5,5,1]
-; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,5,5,1]
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
@@ -2453,11 +2365,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2470,10 +2381,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [7,0,0,2]
-; CHECK-NEXT:    vpermi2q %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
@@ -2482,12 +2392,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,0,0,2]
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,0,0,2]
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
@@ -2499,11 +2408,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2516,12 +2424,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,4,6,1]
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,4,6,1]
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
@@ -2533,11 +2440,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2550,12 +2456,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,7,1]
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,7,1]
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
@@ -2567,11 +2472,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2584,10 +2488,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
-; CHECK-NEXT:    vpermi2q %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2596,12 +2499,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,2,3,2]
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,2,3,2]
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2613,11 +2515,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2630,12 +2531,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,3,1,5]
-; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,3,1,5]
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
@@ -2647,11 +2547,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2664,11 +2563,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4
 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovaps 32(%rdi), %xmm1
 ; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2677,12 +2574,10 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti32x4 $2, %zmm2, %xmm3
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; CHECK-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovdqa 32(%rdi), %xmm3
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2694,12 +2589,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovdqa 32(%rdi), %xmm2
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2711,9 +2604,8 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
@@ -2729,9 +2621,8 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
@@ -2894,11 +2785,10 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovaps (%rdi), %xmm0
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2907,13 +2797,12 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm3
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0]
 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2925,13 +2814,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2943,13 +2831,11 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>*
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
+; CHECK-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -2961,13 +2847,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
+; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -2979,13 +2863,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>*
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm3
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0]
 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -2997,13 +2880,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -3015,11 +2897,10 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>*
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovaps (%rdi), %xmm0
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3028,13 +2909,12 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm3
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0]
 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3046,13 +2926,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2]
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3372,10 +3251,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %v
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
@@ -3384,13 +3262,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
+; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
@@ -3402,12 +3279,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>*
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqps %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3420,13 +3296,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [11,0,9,0,7,14,0,8]
-; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
+; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
@@ -3438,12 +3313,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>*
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
-; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqps %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3456,13 +3330,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
@@ -3474,12 +3347,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>*
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
-; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqps %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3492,10 +3364,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
@@ -3504,13 +3375,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
@@ -3522,12 +3392,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>*
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqps %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3540,12 +3409,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,3,3]
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,3,3]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = mem[3,1,2,3]
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3555,12 +3421,9 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm3 = xmm3[0,2,3,3]
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[3,1,2,3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm2 = mem[0,2,3,3]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm3 = mem[3,1,2,3]
+; CHECK-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1}
@@ -3576,12 +3439,9 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>*
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,3]
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,2,3,3]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
+; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1} {z}
@@ -3597,13 +3457,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [0,10,6,15,4,14,6,15]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3616,13 +3475,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>*
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
-; CHECK-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm3
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3635,13 +3493,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [4,14,4,14,4,14,6,7]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3654,13 +3511,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>*
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
-; CHECK-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm3
+; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT:    vpermi2ps (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3673,10 +3529,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -3687,13 +3542,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3706,13 +3560,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>*
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3795,10 +3648,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double>
 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovaps (%rdi), %xmm0
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3807,13 +3658,11 @@ define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp)
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vblendpd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; CHECK-NEXT:    vmovapd (%rdi), %xmm2
+; CHECK-NEXT:    vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1]
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3825,13 +3674,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-NEXT:    vmovapd (%rdi), %xmm1
+; CHECK-NEXT:    vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1]
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3843,12 +3690,10 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x doub
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovapd 16(%rdi), %xmm2
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],mem[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -3860,12 +3705,10 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovapd 16(%rdi), %xmm1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],mem[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -4231,10 +4074,9 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double>
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovapd (%rdi), %ymm1
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [1,6,7,2]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
@@ -4243,13 +4085,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp)
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [1,6,7,2]
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,6,7,2]
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
@@ -4261,12 +4102,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [1,6,7,2]
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4279,13 +4119,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x doub
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [3,4,2,4]
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,4]
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
@@ -4297,12 +4136,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,4]
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4315,13 +4153,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [1,2,3,4]
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,2,3,4]
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -4333,12 +4170,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [1,2,3,4]
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4351,10 +4187,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x doub
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovapd (%rdi), %ymm1
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4363,13 +4198,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp)
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [4,2,1,0]
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [4,2,1,0]
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4381,12 +4215,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [4,2,1,0]
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4399,13 +4232,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x doub
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [2,4,1,5]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [2,4,1,5]
+; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
@@ -4417,12 +4249,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [2,4,1,5]
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4435,13 +4266,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x doub
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [6,1,1,1]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [6,1,1,1]
+; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
@@ -4453,12 +4283,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [6,1,1,1]
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4471,9 +4300,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x doub
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
+; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [0,2,6,1]
 ; CHECK-NEXT:    vpermi2pd %ymm1, %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -4484,11 +4312,10 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp)
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm3
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm2
+; CHECK-NEXT:    vmovapd 32(%rdi), %ymm3
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [0,2,6,1]
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpermi2pd %ymm2, %ymm3, %ymm4
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
@@ -4503,9 +4330,8 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; CHECK-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm2
+; CHECK-NEXT:    vmovapd 32(%rdi), %ymm3
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,6,1]
 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
@@ -4522,13 +4348,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x doub
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [0,5,2,5]
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [0,5,2,5]
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
@@ -4540,12 +4365,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [0,5,2,5]
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4558,10 +4382,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x doub
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovapd (%rdi), %ymm1
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [1,6,3,6]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -4572,13 +4395,12 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp)
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [1,6,3,6]
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,6,3,6]
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovapd %xmm4, %xmm0 {%k1}
+; CHECK-NEXT:    vmovapd %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4591,13 +4413,12 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,6,3,6]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovapd (%rdi), %ymm1
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [1,6,3,6]
+; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm2
 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vmovapd %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4610,12 +4431,10 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x doub
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm2
-; CHECK-NEXT:    vextractf32x4 $2, %zmm2, %xmm3
-; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[0]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovapd (%rdi), %xmm2
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
@@ -4627,12 +4446,10 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[0]
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovapd (%rdi), %xmm1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
diff --git a/test/CodeGen/X86/avx512-trunc-widen.ll b/test/CodeGen/X86/avx512-trunc-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ba451973faa04d4e6a0bc31db4d800b6dc82373d
--- /dev/null
+++ b/test/CodeGen/X86/avx512-trunc-widen.ll
@@ -0,0 +1,1037 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX
+
+ attributes #0 = { nounwind }
+
+define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 {
+; ALL-LABEL: trunc_16x32_to_16x8:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovdb %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x = trunc <16 x i32> %i to <16 x i8>
+  ret <16 x i8> %x
+}
+
+define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 {
+; ALL-LABEL: trunc_8x64_to_8x16:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovqw %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x = trunc <8 x i64> %i to <8 x i16>
+  ret <8 x i16> %x
+}
+
+define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 {
+; ALL-LABEL: trunc_v16i32_to_v16i16:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovdw %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %1 = trunc <16 x i32> %x to <16 x i16>
+  ret <16 x i16> %1
+}
+
+define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 {
+; ALL-LABEL: trunc_qb_512:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovqb %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x = trunc <8 x i64> %i to <8 x i8>
+  ret <8 x i8> %x
+}
+
+define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 {
+; ALL-LABEL: trunc_qb_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovqb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+    %x = trunc <8 x i64> %i to <8 x i8>
+    store <8 x i8> %x, <8 x i8>* %res
+    ret void
+}
+
+define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
+; KNL-LABEL: trunc_qb_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovqb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qb_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x = trunc <4 x i64> %i to <4 x i8>
+  ret <4 x i8> %x
+}
+
+define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
+; KNL-LABEL: trunc_qb_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovqb %zmm0, %xmm0
+; KNL-NEXT:    vmovd %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qb_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+    %x = trunc <4 x i64> %i to <4 x i8>
+    store <4 x i8> %x, <4 x i8>* %res
+    ret void
+}
+
+define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 {
+; ALL-LABEL: trunc_qb_128:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %x = trunc <2 x i64> %i to <2 x i8>
+  ret <2 x i8> %x
+}
+
+define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 {
+; KNL-LABEL: trunc_qb_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qb_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqb %xmm0, (%rdi)
+; SKX-NEXT:    retq
+    %x = trunc <2 x i64> %i to <2 x i8>
+    store <2 x i8> %x, <2 x i8>* %res
+    ret void
+}
+
+define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 {
+; ALL-LABEL: trunc_qw_512:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovqw %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x = trunc <8 x i64> %i to <8 x i16>
+  ret <8 x i16> %x
+}
+
+define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 {
+; ALL-LABEL: trunc_qw_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovqw %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+    %x = trunc <8 x i64> %i to <8 x i16>
+    store <8 x i16> %x, <8 x i16>* %res
+    ret void
+}
+
+define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
+; KNL-LABEL: trunc_qw_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovqw %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qw_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqw %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x = trunc <4 x i64> %i to <4 x i16>
+  ret <4 x i16> %x
+}
+
+define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
+; KNL-LABEL: trunc_qw_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovqw %zmm0, %xmm0
+; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qw_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqw %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+    %x = trunc <4 x i64> %i to <4 x i16>
+    store <4 x i16> %x, <4 x i16>* %res
+    ret void
+}
+
+define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
+; KNL-LABEL: trunc_qw_128:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qw_128:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT:    retq
+  %x = trunc <2 x i64> %i to <2 x i16>
+  ret <2 x i16> %x
+}
+
+define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 {
+; KNL-LABEL: trunc_qw_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; KNL-NEXT:    vmovd %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qw_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqw %xmm0, (%rdi)
+; SKX-NEXT:    retq
+    %x = trunc <2 x i64> %i to <2 x i16>
+    store <2 x i16> %x, <2 x i16>* %res
+    ret void
+}
+
+define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 {
+; ALL-LABEL: trunc_qd_512:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovqd %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %x = trunc <8 x i64> %i to <8 x i32>
+  ret <8 x i32> %x
+}
+
+define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 {
+; ALL-LABEL: trunc_qd_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovqd %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+    %x = trunc <8 x i64> %i to <8 x i32>
+    store <8 x i32> %x, <8 x i32>* %res
+    ret void
+}
+
+define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
+; KNL-LABEL: trunc_qd_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qd_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqd %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x = trunc <4 x i64> %i to <4 x i32>
+  ret <4 x i32> %x
+}
+
+define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 {
+; KNL-LABEL: trunc_qd_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    vmovdqa %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qd_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqd %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+    %x = trunc <4 x i64> %i to <4 x i32>
+    store <4 x i32> %x, <4 x i32>* %res
+    ret void
+}
+
+define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 {
+; ALL-LABEL: trunc_qd_128:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; ALL-NEXT:    retq
+  %x = trunc <2 x i64> %i to <2 x i32>
+  ret <2 x i32> %x
+}
+
+define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 {
+; KNL-LABEL: trunc_qd_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT:    vmovlps %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_qd_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovqd %xmm0, (%rdi)
+; SKX-NEXT:    retq
+    %x = trunc <2 x i64> %i to <2 x i32>
+    store <2 x i32> %x, <2 x i32>* %res
+    ret void
+}
+
+define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 {
+; ALL-LABEL: trunc_db_512:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovdb %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x = trunc <16 x i32> %i to <16 x i8>
+  ret <16 x i8> %x
+}
+
+define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 {
+; ALL-LABEL: trunc_db_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovdb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+    %x = trunc <16 x i32> %i to <16 x i8>
+    store <16 x i8> %x, <16 x i8>* %res
+    ret void
+}
+
+define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
+; KNL-LABEL: trunc_db_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_db_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovdb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x = trunc <8 x i32> %i to <8 x i8>
+  ret <8 x i8> %x
+}
+
+define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
+; KNL-LABEL: trunc_db_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_db_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovdb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+    %x = trunc <8 x i32> %i to <8 x i8>
+    store <8 x i8> %x, <8 x i8>* %res
+    ret void
+}
+
+define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 {
+; ALL-LABEL: trunc_db_128:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %x = trunc <4 x i32> %i to <4 x i8>
+  ret <4 x i8> %x
+}
+
+define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 {
+; KNL-LABEL: trunc_db_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT:    vmovd %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_db_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovdb %xmm0, (%rdi)
+; SKX-NEXT:    retq
+    %x = trunc <4 x i32> %i to <4 x i8>
+    store <4 x i8> %x, <4 x i8>* %res
+    ret void
+}
+
+define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 {
+; ALL-LABEL: trunc_dw_512:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovdw %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %x = trunc <16 x i32> %i to <16 x i16>
+  ret <16 x i16> %x
+}
+
+define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 {
+; ALL-LABEL: trunc_dw_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovdw %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+    %x = trunc <16 x i32> %i to <16 x i16>
+    store <16 x i16> %x, <16 x i16>* %res
+    ret void
+}
+
+define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
+; KNL-LABEL: trunc_dw_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_dw_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x = trunc <8 x i32> %i to <8 x i16>
+  ret <8 x i16> %x
+}
+
+define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 {
+; KNL-LABEL: trunc_dw_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; KNL-NEXT:    vmovdqa %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_dw_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovdw %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+    %x = trunc <8 x i32> %i to <8 x i16>
+    store <8 x i16> %x, <8 x i16>* %res
+    ret void
+}
+
+define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 {
+; KNL-LABEL: trunc_dw_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_dw_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovdw %xmm0, (%rdi)
+; SKX-NEXT:    retq
+    %x = trunc <4 x i32> %i to <4 x i16>
+    store <4 x i16> %x, <4 x i16>* %res
+    ret void
+}
+
+define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
+; KNL-LABEL: trunc_wb_512:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; KNL-NEXT:    vpmovdb %zmm1, %xmm1
+; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_wb_512:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovwb %zmm0, %ymm0
+; SKX-NEXT:    retq
+  %x = trunc <32 x i16> %i to <32 x i8>
+  ret <32 x i8> %x
+}
+
+define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
+; KNL-LABEL: trunc_wb_512_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; KNL-NEXT:    vpmovdb %zmm1, %xmm1
+; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vmovdqa %ymm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_wb_512_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovwb %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+    %x = trunc <32 x i16> %i to <32 x i8>
+    store <32 x i8> %x, <32 x i8>* %res
+    ret void
+}
+
+define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 {
+; KNL-LABEL: trunc_wb_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_wb_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovwb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x = trunc <16 x i16> %i to <16 x i8>
+  ret <16 x i8> %x
+}
+
+define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 {
+; KNL-LABEL: trunc_wb_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_wb_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+    %x = trunc <16 x i16> %i to <16 x i8>
+    store <16 x i8> %x, <16 x i8>* %res
+    ret void
+}
+
+define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 {
+; ALL-LABEL: trunc_wb_128:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; ALL-NEXT:    retq
+  %x = trunc <8 x i16> %i to <8 x i8>
+  ret <8 x i8> %x
+}
+
+define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 {
+; KNL-LABEL: trunc_wb_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_wb_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovwb %xmm0, (%rdi)
+; SKX-NEXT:    retq
+    %x = trunc <8 x i16> %i to <8 x i8>
+    store <8 x i8> %x, <8 x i8>* %res
+    ret void
+}
+
+
+define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: usat_trunc_wb_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: usat_trunc_wb_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
+define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
+; KNL-LABEL: usat_trunc_wb_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: usat_trunc_wb_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  ret <16 x i8> %x6
+}
+
+define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
+; KNL-LABEL: usat_trunc_wb_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: usat_trunc_wb_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
+; SKX-NEXT:    retq
+  %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <8 x i16> %x5 to <8 x i8>
+  store <8 x i8> %x6, <8 x i8>* %res, align 1
+  ret void
+}
+
+define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
+; ALL-LABEL: usat_trunc_db_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusdb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <16 x i32> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
+define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
+; ALL-LABEL: usat_trunc_qb_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusqb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %x6 = trunc <8 x i64> %x5 to <8 x i8>
+  store <8 x i8> %x6, <8 x i8>* %res, align 1
+  ret void
+}
+
+define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
+; ALL-LABEL: usat_trunc_qd_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusqd %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %x6 = trunc <8 x i64> %x5 to <8 x i32>
+  store <8 x i32> %x6, <8 x i32>* %res, align 1
+  ret void
+}
+
+define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
+; ALL-LABEL: usat_trunc_qw_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusqw %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %x6 = trunc <8 x i64> %x5 to <8 x i16>
+  store <8 x i16> %x6, <8 x i16>* %res, align 1
+  ret void
+}
+
+define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
+; ALL-LABEL: usat_trunc_db_1024:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
+; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <32 x i32> %x5 to <32 x i8>
+  ret <32 x i8> %x6
+}
+
+define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
+; ALL-LABEL: usat_trunc_db_1024_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
+; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vmovdqu %ymm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <32 x i32> %x5 to <32 x i8>
+  store <32 x i8>%x6, <32 x i8>* %p, align 1
+  ret void
+}
+
+define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) {
+; ALL-LABEL: usat_trunc_dw_512:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusdw %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %x6 = trunc <16 x i32> %x5 to <16 x i16>
+  ret <16 x i16> %x6
+}
+
+define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
+; ALL-LABEL: usat_trunc_wb_128:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; ALL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; ALL-NEXT:    retq
+  %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <8 x i16> %x5 to <8 x i8>
+  ret <8 x i8>%x6
+}
+
+define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
+; ALL-LABEL: usat_trunc_qw_1024:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusqw %zmm0, %xmm0
+; ALL-NEXT:    vpmovusqw %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %x6 = trunc <16 x i64> %x5 to <16 x i16>
+  ret <16 x i16> %x6
+}
+
+define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) {
+; KNL-LABEL: usat_trunc_db_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; KNL-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: usat_trunc_db_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; SKX-NEXT:    vpmovdb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %tmp3 = trunc <8 x i32> %tmp2 to <8 x i8>
+  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %tmp4
+}
+
+
+
+; Tests for the following unsigned saturation pattern:
+
+; %a = icmp sgt %x, C1
+; %b = select %a, %x, C2
+; %c = icmp slt %b, C2
+; %d = select %c, %b, C2
+; %res = trunc %d
+
+
+define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: smax_usat_trunc_wb_256_mem1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_wb_256_mem1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
+; Test for smax(smin(x, C2), C1).
+define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: smax_usat_trunc_wb_256_mem2:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_wb_256_mem2:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp slt <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x3 = icmp sgt <16 x i16> %x2, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
+define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) {
+; KNL-LABEL: smax_usat_trunc_wb_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_wb_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  ret <16 x i8> %x6
+  }
+
+define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
+; KNL-LABEL: smax_usat_trunc_wb_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_wb_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <8 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x3 = icmp slt <8 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <8 x i1> %x3, <8 x i16> %x2, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <8 x i16> %x5 to <8 x i8>
+  store <8 x i8> %x6, <8 x i8>* %res, align 1
+  ret void
+}
+
+define void @smax_usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
+; ALL-LABEL: smax_usat_trunc_db_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusdb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x3 = icmp slt <16 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <16 x i32> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
+define void @smax_usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
+; ALL-LABEL: smax_usat_trunc_qb_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusqb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x3 = icmp slt <8 x i64> %x2, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %x6 = trunc <8 x i64> %x5 to <8 x i8>
+  store <8 x i8> %x6, <8 x i8>* %res, align 1
+  ret void
+}
+
+define void @smax_usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
+; ALL-LABEL: smax_usat_trunc_qd_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusqd %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x3 = icmp slt <8 x i64> %x2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %x6 = trunc <8 x i64> %x5 to <8 x i32>
+  store <8 x i32> %x6, <8 x i32>* %res, align 1
+  ret void
+}
+
+define void @smax_usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
+; ALL-LABEL: smax_usat_trunc_qw_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusqw %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x3 = icmp slt <8 x i64> %x2, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %x6 = trunc <8 x i64> %x5 to <8 x i16>
+  store <8 x i16> %x6, <8 x i16>* %res, align 1
+  ret void
+}
+
+define <32 x i8> @smax_usat_trunc_db_1024(<32 x i32> %i) {
+; ALL-LABEL: smax_usat_trunc_db_1024:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; ALL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; ALL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
+; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <32 x i32> %x5 to <32 x i8>
+  ret <32 x i8> %x6
+}
+
+define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
+; ALL-LABEL: smax_usat_trunc_db_1024_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; ALL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; ALL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
+; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vmovdqu %ymm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <32 x i32> %x5 to <32 x i8>
+  store <32 x i8>%x6, <32 x i8>* %p, align 1
+  ret void
+}
+
+define <16 x i16> @smax_usat_trunc_dw_512(<16 x i32> %i) {
+; ALL-LABEL: smax_usat_trunc_dw_512:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusdw %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x3 = icmp slt <16 x i32> %x2, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %x6 = trunc <16 x i32> %x5 to <16 x i16>
+  ret <16 x i16> %x6
+}
+
+define void @negative_test1_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x3 = icmp slt <16 x i16> %x2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
+define void @negative_test2_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <16 x i16> %i, <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10>
+  %x3 = icmp slt <16 x i16> %x2, <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index ca7e22316fb887dba160a74748d58b6ed1460ea9..c15d33222ca0e4f3e4b32d8918805f3b33b17844 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -440,9 +440,9 @@ define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 {
 define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
 ; KNL-LABEL: trunc_wb_512:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; KNL-NEXT:    vpmovdb %zmm1, %xmm1
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    retq
@@ -458,9 +458,9 @@ define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
 define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
 ; KNL-LABEL: trunc_wb_512_mem:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; KNL-NEXT:    vpmovdb %zmm1, %xmm1
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    vmovdqa %ymm0, (%rdi)
@@ -480,7 +480,7 @@ define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
 define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 {
 ; KNL-LABEL: trunc_wb_256:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -497,7 +497,7 @@ define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 {
 define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 {
 ; KNL-LABEL: trunc_wb_256_mem:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -541,7 +541,7 @@ define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-LABEL: usat_trunc_wb_256_mem:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -562,7 +562,7 @@ define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
 ; KNL-LABEL: usat_trunc_wb_256:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -650,23 +650,12 @@ define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
 }
 
 define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
-; KNL-LABEL: usat_trunc_db_1024:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_db_1024:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SKX-NEXT:    vpminud %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpminud %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmovdw %zmm0, %ymm0
-; SKX-NEXT:    vpmovdw %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT:    vpmovwb %zmm0, %ymm0
-; SKX-NEXT:    retq
+; ALL-LABEL: usat_trunc_db_1024:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
+; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
   %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %x6 = trunc <32 x i32> %x5 to <32 x i8>
@@ -674,26 +663,14 @@ define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
 }
 
 define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
-; KNL-LABEL: usat_trunc_db_1024_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    vmovdqu %ymm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_db_1024_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SKX-NEXT:    vpminud %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpminud %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmovdw %zmm0, %ymm0
-; SKX-NEXT:    vpmovdw %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT:    vpmovwb %zmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; ALL-LABEL: usat_trunc_db_1024_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
+; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vmovdqu %ymm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
   %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %x6 = trunc <32 x i32> %x5 to <32 x i8>
@@ -726,13 +703,9 @@ define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
 define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
 ; ALL-LABEL: usat_trunc_qw_1024:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; ALL-NEXT:    vpminuq %zmm2, %zmm1, %zmm1
-; ALL-NEXT:    vpminuq %zmm2, %zmm0, %zmm0
-; ALL-NEXT:    vpmovqd %zmm0, %ymm0
-; ALL-NEXT:    vpmovqd %zmm1, %ymm1
-; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT:    vpmovdw %zmm0, %ymm0
+; ALL-NEXT:    vpmovusqw %zmm0, %xmm0
+; ALL-NEXT:    vpmovusqw %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
   %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
@@ -781,7 +754,7 @@ define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -809,7 +782,7 @@ define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
 ; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -836,7 +809,7 @@ define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) {
 ; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -950,29 +923,15 @@ define void @smax_usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
 }
 
 define <32 x i8> @smax_usat_trunc_db_1024(<32 x i32> %i) {
-; KNL-LABEL: smax_usat_trunc_db_1024:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
-; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: smax_usat_trunc_db_1024:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SKX-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpminsd %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmovdw %zmm0, %ymm0
-; SKX-NEXT:    vpmovdw %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT:    vpmovwb %zmm0, %ymm0
-; SKX-NEXT:    retq
+; ALL-LABEL: smax_usat_trunc_db_1024:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; ALL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; ALL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
+; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
   %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -982,32 +941,17 @@ define <32 x i8> @smax_usat_trunc_db_1024(<32 x i32> %i) {
 }
 
 define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
-; KNL-LABEL: smax_usat_trunc_db_1024_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
-; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    vmovdqu %ymm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: smax_usat_trunc_db_1024_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SKX-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpminsd %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmovdw %zmm0, %ymm0
-; SKX-NEXT:    vpmovdw %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT:    vpmovwb %zmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; ALL-LABEL: smax_usat_trunc_db_1024_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; ALL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; ALL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
+; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vmovdqu %ymm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
   %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -1039,7 +983,7 @@ define void @negative_test1_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>*
 ; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; KNL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -1067,7 +1011,7 @@ define void @negative_test2_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>*
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
 ; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 42647da95273d87769ba551e608e091f56ec5b89..cae7d418f91f0d0ef82a2be2cb2ab4991973895d 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -850,7 +850,8 @@ define <4  x float> @test42(<4  x float> %x, <4  x float> %x1, float* %ptr) noun
 define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x i1> %mask_in) nounwind {
 ; KNL-LABEL: test43:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xd2]
+; KNL-NEXT:    vpmovzxwq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd2]
+; KNL-NEXT:    ## zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xf2,0x3f]
 ; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x48,0x27,0xca]
 ; KNL-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xc2,0x0f,0x01]
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 47470fd8f1623a22862ff2e65dad0cfe043cf985..0e18fa4ef26dcb16f9b5554915fe371e3215a74c 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -668,8 +668,8 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou
 ; X86-AVX512DQ-LABEL: test_int_x86_avx512_mask_range_sd:
 ; X86-AVX512DQ:       # %bb.0:
 ; X86-AVX512DQ-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-AVX512DQ-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x51,0xd9,0x04]
 ; X86-AVX512DQ-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x51,0xd1,0x04]
+; X86-AVX512DQ-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x51,0xd9,0x04]
 ; X86-AVX512DQ-NEXT:    vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x18,0x51,0xc1,0x04]
 ; X86-AVX512DQ-NEXT:    vaddpd %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0x58,0xc0]
 ; X86-AVX512DQ-NEXT:    vaddpd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0x58,0xc0]
@@ -678,8 +678,8 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou
 ; X86-AVX512DQVL-LABEL: test_int_x86_avx512_mask_range_sd:
 ; X86-AVX512DQVL:       # %bb.0:
 ; X86-AVX512DQVL-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-AVX512DQVL-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x51,0xd9,0x04]
 ; X86-AVX512DQVL-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x51,0xd1,0x04]
+; X86-AVX512DQVL-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x51,0xd9,0x04]
 ; X86-AVX512DQVL-NEXT:    vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x18,0x51,0xc1,0x04]
 ; X86-AVX512DQVL-NEXT:    vaddpd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0]
 ; X86-AVX512DQVL-NEXT:    vaddpd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
@@ -687,8 +687,8 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou
 ;
 ; X64-AVX512DQ-LABEL: test_int_x86_avx512_mask_range_sd:
 ; X64-AVX512DQ:       # %bb.0:
-; X64-AVX512DQ-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x51,0xd9,0x04]
 ; X64-AVX512DQ-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-AVX512DQ-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x51,0xd9,0x04]
 ; X64-AVX512DQ-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x51,0xd1,0x04]
 ; X64-AVX512DQ-NEXT:    vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x18,0x51,0xc1,0x04]
 ; X64-AVX512DQ-NEXT:    vaddpd %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0x58,0xc0]
@@ -697,8 +697,8 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou
 ;
 ; X64-AVX512DQVL-LABEL: test_int_x86_avx512_mask_range_sd:
 ; X64-AVX512DQVL:       # %bb.0:
-; X64-AVX512DQVL-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x51,0xd9,0x04]
 ; X64-AVX512DQVL-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-AVX512DQVL-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x51,0xd9,0x04]
 ; X64-AVX512DQVL-NEXT:    vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x51,0xd1,0x04]
 ; X64-AVX512DQVL-NEXT:    vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x18,0x51,0xc1,0x04]
 ; X64-AVX512DQVL-NEXT:    vaddpd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0]
@@ -752,8 +752,8 @@ declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8)
 define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfpclasssd $2, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x67,0xc8,0x02]
-; CHECK-NEXT:    vfpclasssd $4, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x67,0xc0,0x04]
+; CHECK-NEXT:    vfpclasssd $4, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x67,0xc8,0x04]
+; CHECK-NEXT:    vfpclasssd $2, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x67,0xc0,0x02]
 ; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
@@ -787,8 +787,8 @@ declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8)
 define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfpclassss $2, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x67,0xc8,0x02]
-; CHECK-NEXT:    vfpclassss $4, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x67,0xc0,0x04]
+; CHECK-NEXT:    vfpclassss $4, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x67,0xc8,0x04]
+; CHECK-NEXT:    vfpclassss $2, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x67,0xc0,0x02]
 ; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
index 06bbeef08113ed565b793bc626bfc5340e6d83dc..3d6f53da493c3fc31e3e6abcb851b4af5967cf41 100644
--- a/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -33,10 +33,10 @@ define i8 @mand8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mand8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    xorl %esi, %ecx
-; CHECK-NEXT:    andl %esi, %eax
-; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    andb %sil, %cl
+; CHECK-NEXT:    xorb %sil, %al
+; CHECK-NEXT:    orb %cl, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %ma = bitcast i8 %x to <8 x i1>
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index 43df0b9d0eaa6ff2618be36577c2fbb6569578e1..d01f8b9e23d5c495cd7edd7d75d9b62c5eafe990 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -153,7 +153,7 @@ define <4 x float> @test_rsqrt28_ss_load(<4 x float> %a0, <4 x float>* %a1ptr) {
 define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
 ; X86-LABEL: test_rsqrt28_ss_maskz:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -170,7 +170,7 @@ define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
 define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) {
 ; X86-LABEL: test_rsqrt28_ss_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
 ; X86-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
@@ -189,7 +189,7 @@ define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x
 define <2 x double> @test_rcp28_sd_mask_load(<2 x double> %a0, <2 x double>* %a1ptr, <2 x double> %a2, i8 %mask) {
 ; X86-LABEL: test_rcp28_sd_mask_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x08]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vrcp28sd %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xcb,0xc8]
 ; X86-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
@@ -210,7 +210,7 @@ declare <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double>, <2 x double>, <2 x
 define <2 x double> @test_rsqrt28_sd_maskz_load(<2 x double> %a0, <2 x double>* %a1ptr, i8 %mask) {
 ; X86-LABEL: test_rsqrt28_sd_maskz_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x08]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -228,7 +228,7 @@ define <2 x double> @test_rsqrt28_sd_maskz_load(<2 x double> %a0, <2 x double>*
 define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
 ; X86-LABEL: test_rsqrt28_sd_maskz:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -245,7 +245,7 @@ define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
 define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) {
 ; X86-LABEL: test_rsqrt28_sd_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
 ; X86-NEXT:    vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
@@ -266,9 +266,9 @@ declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2
 define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i8 %mask) {
 ; X86-LABEL: test_rsqrt28_sd_maskz_mem:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl # encoding: [0x8a,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vrsqrt28sd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -286,9 +286,9 @@ define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i
 define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr, i8 %mask) {
 ; X86-LABEL: test_rsqrt28_sd_maskz_mem_offset:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl # encoding: [0x8a,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vrsqrt28sd 144(%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x40,0x12]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll
index 2f0943f4af325a6f583b30c47893f91ab832b9bd..a09cf095aef70b1edcb727f00784541472e329ce 100644
--- a/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll
@@ -41,8 +41,8 @@ define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x
 ; X86-NEXT:    vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x75,0xe2]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe1]
+; X86-NEXT:    vpermi2b %zmm2, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xe2]
 ; X86-NEXT:    vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -53,13 +53,13 @@ define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x
 ; X64-NEXT:    vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda]
 ; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x75,0xe2]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe1]
+; X64-NEXT:    vpermi2b %zmm2, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xe2]
 ; X64-NEXT:    vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
-  %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3)
+  %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %res, <64 x i8> %x2, i64 %x3)
   %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res3 = add <64 x i8> %res, %res1
   %res4 = add <64 x i8> %res3, %res2
diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll
index fd512c7badec5f6bb8822b345485604c4b1e44f5..7572e40f0b2ad0165185b78bf63e711b8db54c1a 100644
--- a/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll
@@ -71,8 +71,8 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x
 ; X86-NEXT:    vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2]
+; X86-NEXT:    vmovdqa %xmm1, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe1]
+; X86-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xe2]
 ; X86-NEXT:    vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -83,13 +83,13 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x
 ; X64-NEXT:    vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2]
+; X64-NEXT:    vmovdqa %xmm1, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe1]
+; X64-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xe2]
 ; X64-NEXT:    vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
-  %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %res, <16 x i8> %x2, i16 %x3)
   %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
   %res3 = add <16 x i8> %res, %res1
   %res4 = add <16 x i8> %res3, %res2
@@ -105,8 +105,8 @@ define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x
 ; X86-NEXT:    vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2]
+; X86-NEXT:    vmovdqa %ymm1, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe1]
+; X86-NEXT:    vpermi2b %ymm2, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xe2]
 ; X86-NEXT:    vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -117,13 +117,13 @@ define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x
 ; X64-NEXT:    vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2]
+; X64-NEXT:    vmovdqa %ymm1, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe1]
+; X64-NEXT:    vpermi2b %ymm2, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xe2]
 ; X64-NEXT:    vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
-  %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3)
+  %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %res, <32 x i8> %x2, i32 %x3)
   %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   %res3 = add <32 x i8> %res, %res1
   %res4 = add <32 x i8> %res3, %res2
diff --git a/test/CodeGen/X86/avx512vpopcntdq-schedule.ll b/test/CodeGen/X86/avx512vpopcntdq-schedule.ll
index 53df4bfaba14ebdb71918e28fed550771f4232ae..f44374e802ded6a43f9473540f51b5285031e0dd 100644
--- a/test/CodeGen/X86/avx512vpopcntdq-schedule.ll
+++ b/test/CodeGen/X86/avx512vpopcntdq-schedule.ll
@@ -18,7 +18,7 @@ define void @test_vpopcntd(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> *%a2, i16
 ; GENERIC-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [8:0.50]
 ; GENERIC-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [8:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ICELAKE-LABEL: test_vpopcntd:
@@ -35,7 +35,7 @@ define void @test_vpopcntd(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> *%a2, i16
 ; ICELAKE-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [8:1.00]
 ; ICELAKE-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; ICELAKE-NEXT:    #NO_APP
-; ICELAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; ICELAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; ICELAKE-NEXT:    retq # sched: [7:1.00]
   tail call void asm "vpopcntd $1, $0 \0A\09 vpopcntd $1, $0 {$3} \0A\09 vpopcntd $1, $0 {$3} {z} \0A\09 vpopcntd $2, $0 \0A\09 vpopcntd $2, $0 {$3} \0A\09 vpopcntd $2, $0 {$3} {z} \0A\09 vpopcntd $2{1to16}, $0 \0A\09 vpopcntd $2{1to16}, $0 {$3} \0A\09 vpopcntd $2{1to16}, $0 {$3} {z}", "v,v,*m,^Yk"(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> *%a2, i16 %a3) nounwind
   ret void
@@ -56,7 +56,7 @@ define void @test_vpopcntq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> *%a2, i8 %a3)
 ; GENERIC-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [8:0.50]
 ; GENERIC-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [8:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ICELAKE-LABEL: test_vpopcntq:
@@ -73,7 +73,7 @@ define void @test_vpopcntq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> *%a2, i8 %a3)
 ; ICELAKE-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [8:1.00]
 ; ICELAKE-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; ICELAKE-NEXT:    #NO_APP
-; ICELAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; ICELAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; ICELAKE-NEXT:    retq # sched: [7:1.00]
   tail call void asm "vpopcntq $1, $0 \0A\09 vpopcntq $1, $0 {$3} \0A\09 vpopcntq $1, $0 {$3} {z} \0A\09 vpopcntq $2, $0 \0A\09 vpopcntq $2, $0 {$3} \0A\09 vpopcntq $2, $0 {$3} {z} \0A\09 vpopcntq $2{1to8}, $0 \0A\09 vpopcntq $2{1to8}, $0 {$3} \0A\09 vpopcntq $2{1to8}, $0 {$3} {z}", "v,v,*m,^Yk"(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> *%a2, i8 %a3) nounwind
   ret void
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 364e4ce083dc848405dbc0329ce26e0062445d2b..0a2e154f5b93f8666b0e44fe9d7d3470b7ac444c 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -30,11 +30,10 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
 ; AVX512F-LABEL: v8i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpcmpgtw %xmm3, %xmm2, %xmm1
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k1
-; AVX512F-NEXT:    vpcmpgtw %xmm3, %xmm2, %xmm0
-; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0 {%k1}
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512F-NEXT:    vzeroupper
@@ -158,11 +157,10 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; AVX512F-LABEL: v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm1
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm0
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512F-NEXT:    vzeroupper
@@ -217,25 +215,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm5, %xmm1
 ; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm3
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    movmskpd %xmm3, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-SSSE3-NEXT:    retq
 ;
@@ -370,25 +364,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm5, %xmm1
 ; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm3
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    movmskpd %xmm3, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-SSSE3-NEXT:    retq
 ;
@@ -515,25 +505,21 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm0, %xmm3
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSE2-SSSE3-NEXT:    por %xmm3, %xmm0
 ; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm4
-; SSE2-SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
 ; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    por %xmm4, %xmm2
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    movmskpd %xmm2, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-SSSE3-NEXT:    retq
 ;
@@ -624,25 +610,21 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm5, %xmm1
 ; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm3
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    movmskpd %xmm3, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-SSSE3-NEXT:    retq
 ;
@@ -913,10 +895,9 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; AVX512F-NEXT:    vpsllw $8, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpsraw $8, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k1
-; AVX512F-NEXT:    vpmovsxwd %xmm2, %ymm0
-; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0 {%k1}
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512F-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index f2a6c66ceee720e320a8cff3a7cafbe887b05c10..0f3b8c945404ec983a600e84dc2bcc022d368daf 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -14,20 +14,18 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm8, %xmm1
 ; SSE2-SSSE3-NEXT:    movdqa %xmm1, %xmm9
 ; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm9
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
 ; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm1
 ; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm10, %xmm1
+; SSE2-SSSE3-NEXT:    pand %xmm9, %xmm1
 ; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
 ; SSE2-SSSE3-NEXT:    por %xmm1, %xmm3
 ; SSE2-SSSE3-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-SSSE3-NEXT:    pxor %xmm8, %xmm0
 ; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2]
 ; SSE2-SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm9, %xmm2
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
@@ -35,22 +33,20 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm8, %xmm5
 ; SSE2-SSSE3-NEXT:    movdqa %xmm5, %xmm1
 ; SSE2-SSSE3-NEXT:    pcmpgtd %xmm7, %xmm1
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
 ; SSE2-SSSE3-NEXT:    pcmpeqd %xmm7, %xmm5
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm3, %xmm1
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm8, %xmm6
 ; SSE2-SSSE3-NEXT:    pxor %xmm8, %xmm4
 ; SSE2-SSSE3-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE2-SSSE3-NEXT:    pcmpeqd %xmm6, %xmm4
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm4, %xmm2
+; SSE2-SSSE3-NEXT:    por %xmm3, %xmm2
 ; SSE2-SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
 ; SSE2-SSSE3-NEXT:    andps %xmm0, %xmm2
 ; SSE2-SSSE3-NEXT:    movmskps %xmm2, %eax
@@ -59,17 +55,17 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
 ;
 ; AVX1-LABEL: v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vmovmskps %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -78,12 +74,10 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
 ; AVX2-LABEL: v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovmskps %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -130,12 +124,10 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
 ; AVX12-LABEL: v4f64:
 ; AVX12:       # %bb.0:
 ; AVX12-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX12-NEXT:    vcmpltpd %ymm2, %ymm3, %ymm1
+; AVX12-NEXT:    vandpd %ymm1, %ymm0, %ymm0
 ; AVX12-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX12-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX12-NEXT:    vcmpltpd %ymm2, %ymm3, %ymm1
-; AVX12-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX12-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX12-NEXT:    vmovmskps %xmm0, %eax
 ; AVX12-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX12-NEXT:    vzeroupper
@@ -181,17 +173,17 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
 ;
 ; AVX1-LABEL: v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpcmpgtw %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpacksswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -200,12 +192,10 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
 ; AVX2-LABEL: v16i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -214,11 +204,10 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
 ; AVX512F-LABEL: v16i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm1
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512F-NEXT:    vzeroupper
@@ -256,17 +245,17 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
 ;
 ; AVX1-LABEL: v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
@@ -276,12 +265,10 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
 ; AVX2-LABEL: v8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm2, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
@@ -330,12 +317,10 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
 ; AVX12-LABEL: v8f32:
 ; AVX12:       # %bb.0:
 ; AVX12-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
+; AVX12-NEXT:    vcmpltps %ymm2, %ymm3, %ymm1
+; AVX12-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX12-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX12-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX12-NEXT:    vcmpltps %ymm2, %ymm3, %ymm1
-; AVX12-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX12-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX12-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX12-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX12-NEXT:    # kill: def $al killed $al killed $eax
@@ -412,18 +397,14 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
 ; AVX512F-LABEL: v32i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
-; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k2}
+; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm1
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; AVX512F-NEXT:    kmovw %k0, %ecx
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k1}
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    shll $16, %eax
 ; AVX512F-NEXT:    orl %ecx, %eax
diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll
index 183e32a518b8c75a989ede6a97d77ddef50cc4ff..bab37c34888a846e6b01c0fc35a558ae66b5e7fe 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -304,18 +304,16 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
 ; AVX512F-LABEL: v32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpcmpgtw %ymm7, %ymm5, %ymm2
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpcmpgtw %ymm6, %ymm4, %ymm2
+; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
-; AVX512F-NEXT:    vpcmpgtw %ymm7, %ymm5, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vpcmpgtw %ymm6, %ymm4, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %ecx
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    shll $16, %eax
 ; AVX512F-NEXT:    orl %ecx, %eax
@@ -615,33 +613,29 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k2
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k3
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k4
-; AVX512F-NEXT:    vpcmpgtb %ymm7, %ymm5, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vpcmpgtb %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX512F-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpgtb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vextracti128 $1, %ymm4, %xmm6
+; AVX512F-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; AVX512F-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vpcmpgtb %ymm6, %ymm4, %ymm2
-; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0 {%k4}
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
-; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k0 {%k3}
+; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %ecx
 ; AVX512F-NEXT:    shll $16, %ecx
 ; AVX512F-NEXT:    orl %eax, %ecx
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k2}
+; AVX512F-NEXT:    vpand %xmm5, %xmm1, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %edx
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k1}
+; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    shll $16, %eax
 ; AVX512F-NEXT:    orl %edx, %eax
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index 3f78d0c9c5ccfaef6148b7b5a0cd68d314626749..f803901c0e78a49cb5807664ddd5007fc3877666 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -167,12 +167,10 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm1
 ; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    movmskpd %xmm1, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
@@ -259,12 +257,10 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm1
 ; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    movmskpd %xmm1, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
@@ -347,14 +343,12 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT:    movmskpd %xmm1, %eax
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    movmskpd %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-SSSE3-NEXT:    retq
 ;
@@ -417,12 +411,10 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm1
 ; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    movmskpd %xmm1, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
@@ -741,3 +733,147 @@ entry:
   %e = bitcast <64 x i1> %d to i64
   ret i64 %e
 }
+
+define void @bitcast_16i8_store(i16* %p, <16 x i8> %a0) {
+; SSE2-SSSE3-LABEL: bitcast_16i8_store:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT:    movw %ax, (%rdi)
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: bitcast_16i8_store:
+; AVX12:       # %bb.0:
+; AVX12-NEXT:    vpmovmskb %xmm0, %eax
+; AVX12-NEXT:    movw %ax, (%rdi)
+; AVX12-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_16i8_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_16i8_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovb2m %xmm0, %k0
+; AVX512BW-NEXT:    kmovw %k0, (%rdi)
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <16 x i8> %a0, zeroinitializer
+  %a2 = bitcast <16 x i1> %a1 to i16
+  store i16 %a2, i16* %p
+  ret void
+}
+
+define void @bitcast_8i16_store(i8* %p, <8 x i16> %a0) {
+; SSE2-SSSE3-LABEL: bitcast_8i16_store:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE2-SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    packsswb %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    pmovmskb %xmm1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, (%rdi)
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: bitcast_8i16_store:
+; AVX12:       # %bb.0:
+; AVX12-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX12-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; AVX12-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX12-NEXT:    vpmovmskb %xmm0, %eax
+; AVX12-NEXT:    movb %al, (%rdi)
+; AVX12-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_8i16_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    movb %al, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_8i16_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovw2m %xmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <8 x i16> %a0, zeroinitializer
+  %a2 = bitcast <8 x i1> %a1 to i8
+  store i8 %a2, i8* %p
+  ret void
+}
+
+define void @bitcast_4i32_store(i4* %p, <4 x i32> %a0) {
+; SSE2-SSSE3-LABEL: bitcast_4i32_store:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT:    movb %al, (%rdi)
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: bitcast_4i32_store:
+; AVX12:       # %bb.0:
+; AVX12-NEXT:    vmovmskps %xmm0, %eax
+; AVX12-NEXT:    movb %al, (%rdi)
+; AVX12-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_4i32_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    movb %al, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_4i32_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <4 x i32> %a0, zeroinitializer
+  %a2 = bitcast <4 x i1> %a1 to i4
+  store i4 %a2, i4* %p
+  ret void
+}
+
+define void @bitcast_2i64_store(i2* %p, <2 x i64> %a0) {
+; SSE2-SSSE3-LABEL: bitcast_2i64_store:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    movb %al, (%rdi)
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: bitcast_2i64_store:
+; AVX12:       # %bb.0:
+; AVX12-NEXT:    vmovmskpd %xmm0, %eax
+; AVX12-NEXT:    movb %al, (%rdi)
+; AVX12-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_2i64_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtq %xmm0, %xmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    movb %al, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_2i64_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtq %xmm0, %xmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <2 x i64> %a0, zeroinitializer
+  %a2 = bitcast <2 x i1> %a1 to i2
+  store i2 %a2, i2* %p
+  ret void
+}
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index 5f468f3b0d1f86c589c93be8070e6d2096313b5a..d349ae3bc40dcd598f5e20957b00c820e0d53853 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -214,25 +214,21 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-SSSE3-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm1
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm1
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm5, %xmm3
 ; SSE2-SSSE3-NEXT:    por %xmm1, %xmm3
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT:    packssdw %xmm3, %xmm1
-; SSE2-SSSE3-NEXT:    movmskps %xmm1, %eax
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSE2-SSSE3-NEXT:    por %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    packssdw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    movmskps %xmm2, %eax
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-SSSE3-NEXT:    retq
 ;
@@ -313,3 +309,214 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) {
   %res = bitcast <4 x i1> %x to i4
   ret i4 %res
 }
+
+define void @bitcast_32i8_store(i32* %p, <32 x i8> %a0) {
+; SSE2-SSSE3-LABEL: bitcast_32i8_store:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT:    pmovmskb %xmm1, %ecx
+; SSE2-SSSE3-NEXT:    shll $16, %ecx
+; SSE2-SSSE3-NEXT:    orl %eax, %ecx
+; SSE2-SSSE3-NEXT:    movl %ecx, (%rdi)
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX1-LABEL: bitcast_32i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vpmovmskb %xmm2, %eax
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    shll $16, %ecx
+; AVX1-NEXT:    orl %eax, %ecx
+; AVX1-NEXT:    movl %ecx, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: bitcast_32i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    movl %eax, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_32i8_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    kmovw %k1, 2(%rdi)
+; AVX512F-NEXT:    kmovw %k0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_32i8_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovb2m %ymm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <32 x i8> %a0, zeroinitializer
+  %a2 = bitcast <32 x i1> %a1 to i32
+  store i32 %a2, i32* %p
+  ret void
+}
+
+define void @bitcast_16i16_store(i16* %p, <16 x i16> %a0) {
+; SSE2-SSSE3-LABEL: bitcast_16i16_store:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSE2-SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSE2-SSSE3-NEXT:    pcmpgtw %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    pcmpgtw %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    packsswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT:    movw %ax, (%rdi)
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX1-LABEL: bitcast_16i16_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    movw %ax, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: bitcast_16i16_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    movw %ax, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_16i16_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_16i16_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovw2m %ymm0, %k0
+; AVX512BW-NEXT:    kmovw %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <16 x i16> %a0, zeroinitializer
+  %a2 = bitcast <16 x i1> %a1 to i16
+  store i16 %a2, i16* %p
+  ret void
+}
+
+define void @bitcast_8i32_store(i8* %p, <8 x i32> %a0) {
+; SSE2-SSSE3-LABEL: bitcast_8i32_store:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSE2-SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    packssdw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    packsswb %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT:    movb %al, (%rdi)
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: bitcast_8i32_store:
+; AVX12:       # %bb.0:
+; AVX12-NEXT:    vmovmskps %ymm0, %eax
+; AVX12-NEXT:    movb %al, (%rdi)
+; AVX12-NEXT:    vzeroupper
+; AVX12-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_8i32_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    movb %al, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_8i32_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <8 x i32> %a0, zeroinitializer
+  %a2 = bitcast <8 x i1> %a1 to i8
+  store i8 %a2, i8* %p
+  ret void
+}
+
+define void @bitcast_4i64_store(i4* %p, <4 x i64> %a0) {
+; SSE2-SSSE3-LABEL: bitcast_4i64_store:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT:    por %xmm4, %xmm1
+; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT:    movb %al, (%rdi)
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: bitcast_4i64_store:
+; AVX12:       # %bb.0:
+; AVX12-NEXT:    vmovmskpd %ymm0, %eax
+; AVX12-NEXT:    movb %al, (%rdi)
+; AVX12-NEXT:    vzeroupper
+; AVX12-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_4i64_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtq %ymm0, %ymm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    movb %al, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_4i64_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtq %ymm0, %ymm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <4 x i64> %a0, zeroinitializer
+  %a2 = bitcast <4 x i1> %a1 to i4
+  store i4 %a2, i4* %p
+  ret void
+}
diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll
index 743172a6f47fb16df7573fc4575267f56b46272c..1911aed297384bb3adc1770df458b500ff9ea832 100644
--- a/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -415,3 +415,296 @@ define i8 @v8f64(<8 x double> %a, <8 x double> %b) {
   %res = bitcast <8 x i1> %x to i8
   ret i8 %res
 }
+
+define void @bitcast_64i8_store(i64* %p, <64 x i8> %a0) {
+; SSE-LABEL: bitcast_64i8_store:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    pmovmskb %xmm1, %ecx
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    orl %eax, %ecx
+; SSE-NEXT:    pmovmskb %xmm2, %eax
+; SSE-NEXT:    pmovmskb %xmm3, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %eax, %edx
+; SSE-NEXT:    shlq $32, %rdx
+; SSE-NEXT:    orq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: bitcast_64i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpmovmskb %xmm3, %eax
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    shll $16, %ecx
+; AVX1-NEXT:    orl %eax, %ecx
+; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %eax, %edx
+; AVX1-NEXT:    shlq $32, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    movq %rdx, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: bitcast_64i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovmskb %ymm1, %eax
+; AVX2-NEXT:    shlq $32, %rax
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    orq %rax, %rcx
+; AVX2-NEXT:    movq %rcx, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_64i8_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
+; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-NEXT:    kmovw %k3, 6(%rdi)
+; AVX512F-NEXT:    kmovw %k2, 4(%rdi)
+; AVX512F-NEXT:    kmovw %k1, 2(%rdi)
+; AVX512F-NEXT:    kmovw %k0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_64i8_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <64 x i8> %a0, zeroinitializer
+  %a2 = bitcast <64 x i1> %a1 to i64
+  store i64 %a2, i64* %p
+  ret void
+}
+
+define void @bitcast_32i16_store(i32* %p, <32 x i16> %a0) {
+; SSE-LABEL: bitcast_32i16_store:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtw %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE-NEXT:    packsswb %xmm5, %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %eax
+; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    pcmpgtw %xmm3, %xmm0
+; SSE-NEXT:    pcmpgtw %xmm2, %xmm4
+; SSE-NEXT:    packsswb %xmm0, %xmm4
+; SSE-NEXT:    pmovmskb %xmm4, %ecx
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    orl %eax, %ecx
+; SSE-NEXT:    movl %ecx, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: bitcast_32i16_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    shll $16, %ecx
+; AVX1-NEXT:    orl %eax, %ecx
+; AVX1-NEXT:    movl %ecx, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: bitcast_32i16_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    movl %eax, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_32i16_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm0
+; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    kmovw %k1, 2(%rdi)
+; AVX512F-NEXT:    kmovw %k0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_32i16_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <32 x i16> %a0, zeroinitializer
+  %a2 = bitcast <32 x i1> %a1 to i32
+  store i32 %a2, i32* %p
+  ret void
+}
+
+define void @bitcast_16i32_store(i16* %p, <16 x i32> %a0) {
+; SSE-LABEL: bitcast_16i32_store:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE-NEXT:    packssdw %xmm5, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE-NEXT:    packssdw %xmm2, %xmm4
+; SSE-NEXT:    packsswb %xmm3, %xmm4
+; SSE-NEXT:    pmovmskb %xmm4, %eax
+; SSE-NEXT:    movw %ax, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: bitcast_16i32_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    movw %ax, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: bitcast_16i32_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    movw %ax, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: bitcast_16i32_store:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; AVX512-NEXT:    kmovw %k0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %a1 = icmp slt <16 x i32> %a0, zeroinitializer
+  %a2 = bitcast <16 x i1> %a1 to i16
+  store i16 %a2, i16* %p
+  ret void
+}
+
+define void @bitcast_8i64_store(i8* %p, <8 x i64> %a0) {
+; SSE-LABEL: bitcast_8i64_store:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtq %xmm3, %xmm5
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pcmpgtq %xmm2, %xmm3
+; SSE-NEXT:    packssdw %xmm5, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpgtq %xmm1, %xmm2
+; SSE-NEXT:    pcmpgtq %xmm0, %xmm4
+; SSE-NEXT:    packssdw %xmm2, %xmm4
+; SSE-NEXT:    packssdw %xmm3, %xmm4
+; SSE-NEXT:    packsswb %xmm0, %xmm4
+; SSE-NEXT:    pmovmskb %xmm4, %eax
+; SSE-NEXT:    movb %al, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: bitcast_8i64_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
+; AVX1-NEXT:    movb %al, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: bitcast_8i64_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
+; AVX2-NEXT:    movb %al, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitcast_8i64_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    movb %al, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: bitcast_8i64_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %a1 = icmp slt <8 x i64> %a0, zeroinitializer
+  %a2 = bitcast <8 x i1> %a1 to i8
+  store i8 %a2, i8* %p
+  ret void
+}
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index be6f193414a84a82375214ccb54bf9a3469e28a2..5642f7c6e435b4ab1811c442380fc3b406af7d57 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW-BEXTR
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW-BEXTR
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+bmi | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW-BEXTR
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW-BEXTR
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW-BEXTR
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW-BEXTR
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+fast-bextr | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-BEXTR
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+bmi,+fast-bextr | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-BEXTR
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+fast-bextr | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-BEXTR
 
 define i32 @andn32(i32 %x, i32 %y)   {
@@ -494,6 +494,56 @@ define i32 @blsi32_load(i32* %x)   {
   ret i32 %tmp2
 }
 
+define i32 @blsi32_z(i32 %a, i32 %b) nounwind {
+; X86-LABEL: blsi32_z:
+; X86:       # %bb.0:
+; X86-NEXT:    blsil {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    jne .LBB24_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB24_2:
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsi32_z:
+; X64:       # %bb.0:
+; X64-NEXT:    blsil %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %t0 = sub i32 0, %a
+  %t1 = and i32 %t0, %a
+  %t2 = icmp eq i32 %t1, 0
+  %t3 = select i1 %t2, i32 %b, i32 %t1
+  ret i32 %t3
+}
+
+define i32 @blsi32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; X86-LABEL: blsi32_z2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    testl %eax, %ecx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmovel %eax, %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsi32_z2:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    testl %edi, %ecx
+; X64-NEXT:    cmovnel %edx, %eax
+; X64-NEXT:    retq
+  %t0 = sub i32 0, %a
+  %t1 = and i32 %t0, %a
+  %t2 = icmp eq i32 %t1, 0
+  %t3 = select i1 %t2, i32 %b, i32 %c
+  ret i32 %t3
+}
+
 define i64 @blsi64(i64 %x)   {
 ; X86-LABEL: blsi64:
 ; X86:       # %bb.0:
@@ -521,6 +571,76 @@ define i64 @blsi64(i64 %x)   {
   ret i64 %tmp2
 }
 
+define i64 @blsi64_z(i64 %a, i64 %b) nounwind {
+; X86-LABEL: blsi64_z:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    jne .LBB27_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB27_2:
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsi64_z:
+; X64:       # %bb.0:
+; X64-NEXT:    blsiq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %t0 = sub i64 0, %a
+  %t1 = and i64 %t0, %a
+  %t2 = icmp eq i64 %t1, 0
+  %t3 = select i1 %t2, i64 %b, i64 %t1
+  ret i64 %t3
+}
+
+define i64 @blsi64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; X86-LABEL: blsi64_z2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    andl %eax, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmovel %eax, %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsi64_z2:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    testq %rdi, %rcx
+; X64-NEXT:    cmovneq %rdx, %rax
+; X64-NEXT:    retq
+  %t0 = sub i64 0, %a
+  %t1 = and i64 %t0, %a
+  %t2 = icmp eq i64 %t1, 0
+  %t3 = select i1 %t2, i64 %b, i64 %c
+  ret i64 %t3
+}
+
 define i32 @blsmsk32(i32 %x)   {
 ; X86-LABEL: blsmsk32:
 ; X86:       # %bb.0:
@@ -553,6 +673,55 @@ define i32 @blsmsk32_load(i32* %x)   {
   ret i32 %tmp2
 }
 
+define i32 @blsmsk32_z(i32 %a, i32 %b) nounwind {
+; X86-LABEL: blsmsk32_z:
+; X86:       # %bb.0:
+; X86-NEXT:    blsmskl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    jne .LBB31_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB31_2:
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsmsk32_z:
+; X64:       # %bb.0:
+; X64-NEXT:    blsmskl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %t0 = sub i32 %a, 1
+  %t1 = xor i32 %t0, %a
+  %t2 = icmp eq i32 %t1, 0
+  %t3 = select i1 %t2, i32 %b, i32 %t1
+  ret i32 %t3
+}
+
+define i32 @blsmsk32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; X86-LABEL: blsmsk32_z2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal -1(%eax), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmovel %eax, %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsmsk32_z2:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal -1(%rdi), %ecx
+; X64-NEXT:    xorl %edi, %ecx
+; X64-NEXT:    cmovnel %edx, %eax
+; X64-NEXT:    retq
+  %t0 = sub i32 %a, 1
+  %t1 = xor i32 %t0, %a
+  %t2 = icmp eq i32 %t1, 0
+  %t3 = select i1 %t2, i32 %b, i32 %c
+  ret i32 %t3
+}
+
 define i64 @blsmsk64(i64 %x)   {
 ; X86-LABEL: blsmsk64:
 ; X86:       # %bb.0:
@@ -580,6 +749,75 @@ define i64 @blsmsk64(i64 %x)   {
   ret i64 %tmp2
 }
 
+define i64 @blsmsk64_z(i64 %a, i64 %b) nounwind {
+; X86-LABEL: blsmsk64_z:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    jne .LBB34_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:  .LBB34_2:
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsmsk64_z:
+; X64:       # %bb.0:
+; X64-NEXT:    blsmskq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %t0 = sub i64 %a, 1
+  %t1 = xor i64 %t0, %a
+  %t2 = icmp eq i64 %t1, 0
+  %t3 = select i1 %t2, i64 %b, i64 %t1
+  ret i64 %t3
+}
+
+define i64 @blsmsk64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; X86-LABEL: blsmsk64_z2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmovel %eax, %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsmsk64_z2:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    leaq -1(%rdi), %rcx
+; X64-NEXT:    xorq %rdi, %rcx
+; X64-NEXT:    cmovneq %rdx, %rax
+; X64-NEXT:    retq
+  %t0 = sub i64 %a, 1
+  %t1 = xor i64 %t0, %a
+  %t2 = icmp eq i64 %t1, 0
+  %t3 = select i1 %t2, i64 %b, i64 %c
+  ret i64 %t3
+}
+
 define i32 @blsr32(i32 %x)   {
 ; X86-LABEL: blsr32:
 ; X86:       # %bb.0:
@@ -612,6 +850,55 @@ define i32 @blsr32_load(i32* %x)   {
   ret i32 %tmp2
 }
 
+define i32 @blsr32_z(i32 %a, i32 %b) nounwind {
+; X86-LABEL: blsr32_z:
+; X86:       # %bb.0:
+; X86-NEXT:    blsrl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    jne .LBB38_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB38_2:
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr32_z:
+; X64:       # %bb.0:
+; X64-NEXT:    blsrl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %t0 = sub i32 %a, 1
+  %t1 = and i32 %t0, %a
+  %t2 = icmp eq i32 %t1, 0
+  %t3 = select i1 %t2, i32 %b, i32 %t1
+  ret i32 %t3
+}
+
+define i32 @blsr32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; X86-LABEL: blsr32_z2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal -1(%eax), %ecx
+; X86-NEXT:    testl %eax, %ecx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmovel %eax, %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr32_z2:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal -1(%rdi), %ecx
+; X64-NEXT:    testl %edi, %ecx
+; X64-NEXT:    cmovnel %edx, %eax
+; X64-NEXT:    retq
+  %t0 = sub i32 %a, 1
+  %t1 = and i32 %t0, %a
+  %t2 = icmp eq i32 %t1, 0
+  %t3 = select i1 %t2, i32 %b, i32 %c
+  ret i32 %t3
+}
+
 define i64 @blsr64(i64 %x)   {
 ; X86-LABEL: blsr64:
 ; X86:       # %bb.0:
@@ -639,6 +926,75 @@ define i64 @blsr64(i64 %x)   {
   ret i64 %tmp2
 }
 
+define i64 @blsr64_z(i64 %a, i64 %b) nounwind {
+; X86-LABEL: blsr64_z:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    jne .LBB41_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:  .LBB41_2:
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr64_z:
+; X64:       # %bb.0:
+; X64-NEXT:    blsrq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %t0 = sub i64 %a, 1
+  %t1 = and i64 %t0, %a
+  %t2 = icmp eq i64 %t1, 0
+  %t3 = select i1 %t2, i64 %b, i64 %t1
+  ret i64 %t3
+}
+
+define i64 @blsr64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; X86-LABEL: blsr64_z2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    andl %ecx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmovel %eax, %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr64_z2:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    leaq -1(%rdi), %rcx
+; X64-NEXT:    testq %rdi, %rcx
+; X64-NEXT:    cmovneq %rdx, %rax
+; X64-NEXT:    retq
+  %t0 = sub i64 %a, 1
+  %t1 = and i64 %t0, %a
+  %t2 = icmp eq i64 %t1, 0
+  %t3 = select i1 %t2, i64 %b, i64 %c
+  ret i64 %t3
+}
+
 ; PR35792 - https://bugs.llvm.org/show_bug.cgi?id=35792
 
 define i64 @blsr_disguised_constant(i64 %x) {
@@ -681,3 +1037,35 @@ define i64 @blsr_disguised_shrunk_add(i64 %x) {
   %c = and i64 %b, %a
   ret i64 %c
 }
+
+; FIXME: We should not be using the S flag from BEXTR.
+define void @pr40060(i32, i32) {
+; X86-LABEL: pr40060:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    js .LBB45_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    jmp bar # TAILCALL
+; X86-NEXT:  .LBB45_1:
+; X86-NEXT:    retl
+;
+; X64-LABEL: pr40060:
+; X64:       # %bb.0:
+; X64-NEXT:    bextrl %esi, %edi, %eax
+; X64-NEXT:    js .LBB45_1
+; X64-NEXT:  # %bb.2:
+; X64-NEXT:    jmp bar # TAILCALL
+; X64-NEXT:  .LBB45_1:
+; X64-NEXT:    retq
+  %3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %0, i32 %1)
+  %4 = icmp sgt i32 %3, -1
+  br i1 %4, label %5, label %6
+
+  tail call void @bar()
+  br label %6
+
+  ret void
+}
+
+declare void @bar()
diff --git a/test/CodeGen/X86/bmi2-schedule.ll b/test/CodeGen/X86/bmi2-schedule.ll
index 5232e51ac73b3fe440de6622ef30a9f07f1917ae..f235e793ab9fefef67388d0d11e8df95948dd458 100644
--- a/test/CodeGen/X86/bmi2-schedule.ll
+++ b/test/CodeGen/X86/bmi2-schedule.ll
@@ -158,72 +158,56 @@ define void @test_mulx_i32(i32 %a0, i32 %a1, i32* %a2) optsize {
   ret void
 }
 
-define i64 @test_mulx_i64(i64 %a0, i64 %a1, i64 *%a2) {
+define void @test_mulx_i64(i64 %a0, i64 %a1, i64 *%a2) {
 ; GENERIC-LABEL: test_mulx_i64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movq %rdx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    movq %rdi, %rdx # sched: [1:0.33]
-; GENERIC-NEXT:    mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
-; GENERIC-NEXT:    mulxq (%rax), %rdx, %rax # sched: [9:1.00]
-; GENERIC-NEXT:    orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
+; GENERIC-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_mulx_i64:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movq %rdx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    movq %rdi, %rdx # sched: [1:0.25]
-; HASWELL-NEXT:    mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    mulxq (%rax), %rdx, %rax # sched: [9:1.00]
-; HASWELL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
+; HASWELL-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_mulx_i64:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movq %rdx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    movq %rdi, %rdx # sched: [1:0.25]
-; BROADWELL-NEXT:    mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
-; BROADWELL-NEXT:    mulxq (%rax), %rdx, %rax # sched: [9:1.00]
-; BROADWELL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
+; BROADWELL-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_mulx_i64:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movq %rdx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    movq %rdi, %rdx # sched: [1:0.25]
-; SKYLAKE-NEXT:    mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
-; SKYLAKE-NEXT:    mulxq (%rax), %rdx, %rax # sched: [9:1.00]
-; SKYLAKE-NEXT:    orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
+; SKYLAKE-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_mulx_i64:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    movq %rdx, %rax # sched: [1:0.25]
-; KNL-NEXT:    movq %rdi, %rdx # sched: [1:0.25]
-; KNL-NEXT:    mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
-; KNL-NEXT:    mulxq (%rax), %rdx, %rax # sched: [9:1.00]
-; KNL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT:    #APP
+; KNL-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
+; KNL-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_mulx_i64:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movq %rdx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    movq %rdi, %rdx # sched: [1:0.25]
-; ZNVER1-NEXT:    mulxq %rsi, %rsi, %rcx # sched: [3:1.00]
-; ZNVER1-NEXT:    mulxq (%rax), %rdx, %rax # sched: [8:1.00]
-; ZNVER1-NEXT:    orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [3:1.00]
+; ZNVER1-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [8:1.00]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1  = load i64, i64 *%a2
-  %2  = zext i64 %a0 to i128
-  %3  = zext i64 %a1 to i128
-  %4  = zext i64 %1 to i128
-  %5  = mul i128 %2, %3
-  %6  = mul i128 %2, %4
-  %7  = lshr i128 %5, 64
-  %8  = lshr i128 %6, 64
-  %9  = trunc i128 %7 to i64
-  %10 = trunc i128 %8 to i64
-  %11 = or i64 %9, %10
-  ret i64 %11
+  tail call void asm "mulx $1, $1, $0 \0A\09 mulx $2, $1, $0 ", "r,r,*m"(i64 %a0, i64 %a1, i64* %a2) nounwind
+  ret void
 }
 
 define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) {
diff --git a/test/CodeGen/X86/bmi2-x86_64.ll b/test/CodeGen/X86/bmi2-x86_64.ll
index bb03138ccf76345c85381c4fe158a9a4c63265db..6333732ae0f263f32011da0482fdbdd5f7e57b51 100644
--- a/test/CodeGen/X86/bmi2-x86_64.ll
+++ b/test/CodeGen/X86/bmi2-x86_64.ll
@@ -68,8 +68,8 @@ define i64 @mulx64(i64 %x, i64 %y, i64* %p)   {
 ; CHECK-LABEL: mulx64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    movq %rdi, %rdx
-; CHECK-NEXT:    mulxq %rsi, %rax, %rdx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    mulq %rsi
 ; CHECK-NEXT:    movq %rdx, (%rcx)
 ; CHECK-NEXT:    retq
   %x1 = zext i64 %x to i128
@@ -86,8 +86,8 @@ define i64 @mulx64_load(i64 %x, i64* %y, i64* %p)   {
 ; CHECK-LABEL: mulx64_load:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    movq %rdi, %rdx
-; CHECK-NEXT:    mulxq (%rsi), %rax, %rdx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    mulq (%rsi)
 ; CHECK-NEXT:    movq %rdx, (%rcx)
 ; CHECK-NEXT:    retq
   %y1 = load i64, i64* %y
diff --git a/test/CodeGen/X86/bmi2.ll b/test/CodeGen/X86/bmi2.ll
index bf78cb4f72efb50f7eb0b07d06662e9101c8df2e..114f9ac5479af1eaff2a4b949eee98ba40d107cf 100644
--- a/test/CodeGen/X86/bmi2.ll
+++ b/test/CodeGen/X86/bmi2.ll
@@ -120,11 +120,11 @@ define i32 @mulx32(i32 %x, i32 %y, i32* %p)   {
 ; X86-LABEL: mulx32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    mulxl %eax, %eax, %edx
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
@@ -156,10 +156,10 @@ define i32 @mulx32_load(i32 %x, i32* %y, i32* %p)   {
 ; X86-LABEL: mulx32_load:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    mulxl (%eax), %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    mull (%edx)
 ; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
diff --git a/test/CodeGen/X86/bool-math.ll b/test/CodeGen/X86/bool-math.ll
index 86cda144518e967b6f6e54678e2510dbbbb63806..3a7193bd631014dbe292c390a418d6ddef29962f 100644
--- a/test/CodeGen/X86/bool-math.ll
+++ b/test/CodeGen/X86/bool-math.ll
@@ -1,13 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686--   | FileCheck %s --check-prefix=X32
 
 define i32 @sub_zext_cmp_mask_same_size_result(i32 %x) {
-; CHECK-LABEL: sub_zext_cmp_mask_same_size_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    leal -28(%rdi), %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: sub_zext_cmp_mask_same_size_result:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal -28(%rdi), %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: sub_zext_cmp_mask_same_size_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    orl $-28, %eax
+; X32-NEXT:    retl
   %a = and i32 %x, 1
   %c = icmp eq i32 %a, 0
   %z = zext i1 %c to i32
@@ -16,12 +24,19 @@ define i32 @sub_zext_cmp_mask_same_size_result(i32 %x) {
 }
 
 define i32 @sub_zext_cmp_mask_wider_result(i8 %x) {
-; CHECK-LABEL: sub_zext_cmp_mask_wider_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    leal 26(%rdi), %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: sub_zext_cmp_mask_wider_result:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal 26(%rdi), %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: sub_zext_cmp_mask_wider_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    orl $26, %eax
+; X32-NEXT:    retl
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
   %z = zext i1 %c to i32
@@ -30,13 +45,20 @@ define i32 @sub_zext_cmp_mask_wider_result(i8 %x) {
 }
 
 define i8 @sub_zext_cmp_mask_narrower_result(i32 %x) {
-; CHECK-LABEL: sub_zext_cmp_mask_narrower_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    orb $46, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; X64-LABEL: sub_zext_cmp_mask_narrower_result:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    orb $46, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: sub_zext_cmp_mask_narrower_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    orb $46, %al
+; X32-NEXT:    retl
   %a = and i32 %x, 1
   %c = icmp eq i32 %a, 0
   %z = zext i1 %c to i8
@@ -45,13 +67,20 @@ define i8 @sub_zext_cmp_mask_narrower_result(i32 %x) {
 }
 
 define i8 @add_zext_cmp_mask_same_size_result(i8 %x) {
-; CHECK-LABEL: add_zext_cmp_mask_same_size_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb $1, %al
-; CHECK-NEXT:    xorb $27, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; X64-LABEL: add_zext_cmp_mask_same_size_result:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    xorb $27, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: add_zext_cmp_mask_same_size_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    xorb $27, %al
+; X32-NEXT:    retl
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
   %z = zext i1 %c to i8
@@ -60,12 +89,19 @@ define i8 @add_zext_cmp_mask_same_size_result(i8 %x) {
 }
 
 define i32 @add_zext_cmp_mask_wider_result(i8 %x) {
-; CHECK-LABEL: add_zext_cmp_mask_wider_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    xorl $27, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: add_zext_cmp_mask_wider_result:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    xorl $27, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: add_zext_cmp_mask_wider_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    xorl $27, %eax
+; X32-NEXT:    retl
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
   %z = zext i1 %c to i32
@@ -74,13 +110,20 @@ define i32 @add_zext_cmp_mask_wider_result(i8 %x) {
 }
 
 define i8 @add_zext_cmp_mask_narrower_result(i32 %x) {
-; CHECK-LABEL: add_zext_cmp_mask_narrower_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    xorb $43, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; X64-LABEL: add_zext_cmp_mask_narrower_result:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    xorb $43, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: add_zext_cmp_mask_narrower_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    xorb $43, %al
+; X32-NEXT:    retl
   %a = and i32 %x, 1
   %c = icmp eq i32 %a, 0
   %z = zext i1 %c to i8
@@ -89,12 +132,19 @@ define i8 @add_zext_cmp_mask_narrower_result(i32 %x) {
 }
 
 define i32 @low_bit_select_constants_bigger_false_same_size_result(i32 %x) {
-; CHECK-LABEL: low_bit_select_constants_bigger_false_same_size_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    leal 42(%rdi), %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: low_bit_select_constants_bigger_false_same_size_result:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal 42(%rdi), %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: low_bit_select_constants_bigger_false_same_size_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    orl $42, %eax
+; X32-NEXT:    retl
   %a = and i32 %x, 1
   %c = icmp eq i32 %a, 0
   %r = select i1 %c, i32 42, i32 43
@@ -102,12 +152,20 @@ define i32 @low_bit_select_constants_bigger_false_same_size_result(i32 %x) {
 }
 
 define i64 @low_bit_select_constants_bigger_false_wider_result(i32 %x) {
-; CHECK-LABEL: low_bit_select_constants_bigger_false_wider_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    leaq 26(%rdi), %rax
-; CHECK-NEXT:    retq
+; X64-LABEL: low_bit_select_constants_bigger_false_wider_result:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leaq 26(%rdi), %rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: low_bit_select_constants_bigger_false_wider_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    orl $26, %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    retl
   %a = and i32 %x, 1
   %c = icmp eq i32 %a, 0
   %r = select i1 %c, i64 26, i64 27
@@ -115,13 +173,21 @@ define i64 @low_bit_select_constants_bigger_false_wider_result(i32 %x) {
 }
 
 define i16 @low_bit_select_constants_bigger_false_narrower_result(i32 %x) {
-; CHECK-LABEL: low_bit_select_constants_bigger_false_narrower_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    leal 36(%rdi), %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; X64-LABEL: low_bit_select_constants_bigger_false_narrower_result:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal 36(%rdi), %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: low_bit_select_constants_bigger_false_narrower_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    orl $36, %eax
+; X32-NEXT:    # kill: def $ax killed $ax killed $eax
+; X32-NEXT:    retl
   %a = and i32 %x, 1
   %c = icmp eq i32 %a, 0
   %r = select i1 %c, i16 36, i16 37
@@ -129,13 +195,20 @@ define i16 @low_bit_select_constants_bigger_false_narrower_result(i32 %x) {
 }
 
 define i8 @low_bit_select_constants_bigger_true_same_size_result(i8 %x) {
-; CHECK-LABEL: low_bit_select_constants_bigger_true_same_size_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb $1, %al
-; CHECK-NEXT:    xorb $-29, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; X64-LABEL: low_bit_select_constants_bigger_true_same_size_result:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    xorb $-29, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: low_bit_select_constants_bigger_true_same_size_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    xorb $-29, %al
+; X32-NEXT:    retl
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
   %r = select i1 %c, i8 227, i8 226
@@ -143,12 +216,19 @@ define i8 @low_bit_select_constants_bigger_true_same_size_result(i8 %x) {
 }
 
 define i32 @low_bit_select_constants_bigger_true_wider_result(i8 %x) {
-; CHECK-LABEL: low_bit_select_constants_bigger_true_wider_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    xorl $227, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: low_bit_select_constants_bigger_true_wider_result:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    xorl $227, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: low_bit_select_constants_bigger_true_wider_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    xorl $227, %eax
+; X32-NEXT:    retl
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
   %r = select i1 %c, i32 227, i32 226
@@ -156,16 +236,52 @@ define i32 @low_bit_select_constants_bigger_true_wider_result(i8 %x) {
 }
 
 define i8 @low_bit_select_constants_bigger_true_narrower_result(i16 %x) {
-; CHECK-LABEL: low_bit_select_constants_bigger_true_narrower_result:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    xorb $41, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; X64-LABEL: low_bit_select_constants_bigger_true_narrower_result:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    xorb $41, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: low_bit_select_constants_bigger_true_narrower_result:
+; X32:       # %bb.0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    xorb $41, %al
+; X32-NEXT:    retl
   %a = and i16 %x, 1
   %c = icmp eq i16 %a, 0
   %r = select i1 %c, i8 41, i8 40
   ret i8 %r
 }
 
+; Truncation hoisting must not occur with opaque constants
+; because that can induce infinite looping.
+
+define i1 @opaque_constant(i48 %x, i48 %y) {
+; X64-LABEL: opaque_constant:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    shrq $32, %rdi
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    xorl %edi, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: opaque_constant:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    # kill: def $al killed $al killed $eax
+; X32-NEXT:    retl
+  %andx = and i48 %x, 4294967296
+  %andy = and i48 %y, 4294967296
+  %cmp1 = icmp ne i48 %andx, 0
+  %cmp2 = icmp ne i48 %andy, 0
+  %r = xor i1 %cmp1, %cmp2
+  ret i1 %r
+}
+
diff --git a/test/CodeGen/X86/bss_pagealigned.ll b/test/CodeGen/X86/bss_pagealigned.ll
index 0a3bd014937c5ea78cfa50f0998013c3c96ffbcc..3635e67f22c0daa7e3bcdca8eed40c69bbf26dc7 100644
--- a/test/CodeGen/X86/bss_pagealigned.ll
+++ b/test/CodeGen/X86/bss_pagealigned.ll
@@ -7,9 +7,9 @@ declare i8* @memset(i8*, i32, i64)
 define void @unxlate_dev_mem_ptr(i64 %phis, i8* %addr) nounwind {
   %pte.addr.i = alloca %struct.kmem_cache_order_objects*
   %call8 = call i8* @memset(i8* bitcast ([512 x %struct.kmem_cache_order_objects]* @bm_pte to i8*), i32 0, i64 4096)
-; CHECK: movq    $bm_pte, %rdi
+; CHECK:      movl    $4096, %edx
+; CHECK-NEXT: movq    $bm_pte, %rdi
 ; CHECK-NEXT: xorl    %esi, %esi
-; CHECK-NEXT: movl    $4096, %edx
 ; CHECK-NEXT: callq   memset
   ret void
 }
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index e7cb91a42f8b7854355c898238b0cc6c59404d0e..cfbb5ecc544722efb33cf9a9e33931985fd475a6 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -43,7 +43,7 @@ entry:
 
 define <4 x i32> @test2(<4 x i32> %v) {
 ; CHECK-NOSSSE3-LABEL: test2:
-; CHECK-NOSSSE3:       # %bb.0: # %entry
+; CHECK-NOSSSE3:       # %bb.0:
 ; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
 ; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
 ; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -56,24 +56,61 @@ define <4 x i32> @test2(<4 x i32> %v) {
 ; CHECK-NOSSSE3-NEXT:    retq
 ;
 ; CHECK-SSSE3-LABEL: test2:
-; CHECK-SSSE3:       # %bb.0: # %entry
+; CHECK-SSSE3:       # %bb.0:
 ; CHECK-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-SSSE3-NEXT:    retq
 ;
 ; CHECK-AVX-LABEL: test2:
-; CHECK-AVX:       # %bb.0: # %entry
+; CHECK-AVX:       # %bb.0:
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK-WIDE-AVX-LABEL: test2:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
+; CHECK-WIDE-AVX:       # %bb.0:
 ; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-WIDE-AVX-NEXT:    retq
-entry:
   %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
   ret <4 x i32> %r
 }
 
+define <4 x i32> @or_bswap(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %p1, <4 x i32>* %p2) {
+; CHECK-NOSSSE3-LABEL: or_bswap:
+; CHECK-NOSSSE3:       # %bb.0:
+; CHECK-NOSSSE3-NEXT:    por %xmm1, %xmm0
+; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm2, %xmm0
+; CHECK-NOSSSE3-NEXT:    retq
+;
+; CHECK-SSSE3-LABEL: or_bswap:
+; CHECK-SSSE3:       # %bb.0:
+; CHECK-SSSE3-NEXT:    por %xmm1, %xmm0
+; CHECK-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-SSSE3-NEXT:    retq
+;
+; CHECK-AVX-LABEL: or_bswap:
+; CHECK-AVX:       # %bb.0:
+; CHECK-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX-NEXT:    retq
+;
+; CHECK-WIDE-AVX-LABEL: or_bswap:
+; CHECK-WIDE-AVX:       # %bb.0:
+; CHECK-WIDE-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-WIDE-AVX-NEXT:    retq
+  %xt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x)
+  %yt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %y)
+  %r = or <4 x i32> %xt, %yt
+  ret <4 x i32> %r
+}
+
 define <2 x i64> @test3(<2 x i64> %v) {
 ; CHECK-NOSSSE3-LABEL: test3:
 ; CHECK-NOSSSE3:       # %bb.0: # %entry
diff --git a/test/CodeGen/X86/bswap.ll b/test/CodeGen/X86/bswap.ll
index 4753fc27cc01cc453739557ba398f3c919923654..89e5b3c861912009b1035c28974c5d9d898ae46a 100644
--- a/test/CodeGen/X86/bswap.ll
+++ b/test/CodeGen/X86/bswap.ll
@@ -5,9 +5,7 @@
 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=CHECK64
 
 declare i16 @llvm.bswap.i16(i16)
-
 declare i32 @llvm.bswap.i32(i32)
-
 declare i64 @llvm.bswap.i64(i64)
 
 define i16 @W(i16 %A) {
@@ -61,23 +59,61 @@ define i64 @Y(i64 %A) {
         ret i64 %Z
 }
 
+; This isn't really a bswap test, but the potential probem is
+; easier to see with bswap vs. other ops. The transform in
+; question starts with a bitwise logic op and tries to hoist
+; those ahead of other ops. But that's not generally profitable
+; when the other ops have other uses (and it might not be safe
+; either due to unconstrained instruction count growth).
+
+define i32 @bswap_multiuse(i32 %x, i32 %y, i32* %p1, i32* %p2) nounwind {
+; CHECK-LABEL: bswap_multiuse:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    bswapl %esi
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    movl %esi, (%edx)
+; CHECK-NEXT:    movl %eax, (%ecx)
+; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: bswap_multiuse:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movl %esi, %eax
+; CHECK64-NEXT:    bswapl %edi
+; CHECK64-NEXT:    bswapl %eax
+; CHECK64-NEXT:    movl %edi, (%rdx)
+; CHECK64-NEXT:    movl %eax, (%rcx)
+; CHECK64-NEXT:    orl %edi, %eax
+; CHECK64-NEXT:    retq
+  %xt = call i32 @llvm.bswap.i32(i32 %x)
+  %yt = call i32 @llvm.bswap.i32(i32 %y)
+  store i32 %xt, i32* %p1
+  store i32 %yt, i32* %p2
+  %r = or i32 %xt, %yt
+  ret i32 %r
+}
+
 ; rdar://9164521
 define i32 @test1(i32 %a) nounwind readnone {
 ; CHECK-LABEL: test1:
-; CHECK:       # %bb.0: # %entry
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    bswapl %eax
 ; CHECK-NEXT:    shrl $16, %eax
 ; CHECK-NEXT:    retl
 ;
 ; CHECK64-LABEL: test1:
-; CHECK64:       # %bb.0: # %entry
+; CHECK64:       # %bb.0:
 ; CHECK64-NEXT:    movl %edi, %eax
 ; CHECK64-NEXT:    bswapl %eax
 ; CHECK64-NEXT:    shrl $16, %eax
 ; CHECK64-NEXT:    retq
-entry:
-
   %and = lshr i32 %a, 8
   %shr3 = and i32 %and, 255
   %and2 = shl i32 %a, 8
@@ -88,20 +124,18 @@ entry:
 
 define i32 @test2(i32 %a) nounwind readnone {
 ; CHECK-LABEL: test2:
-; CHECK:       # %bb.0: # %entry
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    bswapl %eax
 ; CHECK-NEXT:    sarl $16, %eax
 ; CHECK-NEXT:    retl
 ;
 ; CHECK64-LABEL: test2:
-; CHECK64:       # %bb.0: # %entry
+; CHECK64:       # %bb.0:
 ; CHECK64-NEXT:    movl %edi, %eax
 ; CHECK64-NEXT:    bswapl %eax
 ; CHECK64-NEXT:    sarl $16, %eax
 ; CHECK64-NEXT:    retq
-entry:
-
   %and = lshr i32 %a, 8
   %shr4 = and i32 %and, 255
   %and2 = shl i32 %a, 8
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index 83c751f349e910e65249abd5771c1b056967ee58..7f6fbba9e3bbaf506764892fe4b8c89b1cebdc0e 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1067,7 +1067,7 @@ define zeroext i1 @extend(i32 %bit, i64 %bits) {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %eax, %ecx
+; X86-NEXT:    btl %ecx, %eax
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
diff --git a/test/CodeGen/X86/bypass-slow-division-32.ll b/test/CodeGen/X86/bypass-slow-division-32.ll
index 6ab62223a53e316d3dce3c0bbc9adb6173089da4..1533a393cfbffdf616d37c0e92770888eea51816 100644
--- a/test/CodeGen/X86/bypass-slow-division-32.ll
+++ b/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -97,8 +97,8 @@ define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
 ; CHECK-NEXT:    testl $-256, %edi
 ; CHECK-NEXT:    je .LBB3_4
 ; CHECK-NEXT:  .LBB3_5:
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %ebx
 ; CHECK-NEXT:    jmp .LBB3_6
 ; CHECK-NEXT:  .LBB3_1:
diff --git a/test/CodeGen/X86/bypass-slow-division-64.ll b/test/CodeGen/X86/bypass-slow-division-64.ll
index 24d73f5dbe02e97c73330d839ab3766aa776ae1a..11fc0df2443ced125e1ec86fb134a3410ed3cace 100644
--- a/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -17,8 +17,8 @@ define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB0_1:
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %esi
 ; CHECK-NEXT:    # kill: def $eax killed $eax def $rax
 ; CHECK-NEXT:    retq
@@ -40,8 +40,8 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB1_1:
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %esi
 ; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    retq
@@ -63,8 +63,8 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB2_1:
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %esi
 ; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    # kill: def $eax killed $eax def $rax
diff --git a/test/CodeGen/X86/clear-lowbits.ll b/test/CodeGen/X86/clear-lowbits.ll
index f29717e630ff45710c8bdc9b812be326fc8cbb1a..be251eb99f2ef2f88643f0ff2db416b4de2aee2f 100644
--- a/test/CodeGen/X86/clear-lowbits.ll
+++ b/test/CodeGen/X86/clear-lowbits.ll
@@ -866,10 +866,9 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits16_ic0:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    movw $16, %cx
-; X86-NOBMI2-NEXT:    subw {{[0-9]+}}(%esp), %cx
+; X86-NOBMI2-NEXT:    movb $16, %cl
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
-; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $cx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NOBMI2-NEXT:    retl
@@ -877,8 +876,8 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind {
 ; X86-BMI2-LABEL: clear_lowbits16_ic0:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movw $16, %cx
-; X86-BMI2-NEXT:    subw {{[0-9]+}}(%esp), %cx
+; X86-BMI2-NEXT:    movb $16, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -887,10 +886,9 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind {
 ; X64-NOBMI2-LABEL: clear_lowbits16_ic0:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movzwl %di, %eax
-; X64-NOBMI2-NEXT:    movl $16, %ecx
-; X64-NOBMI2-NEXT:    subl %esi, %ecx
+; X64-NOBMI2-NEXT:    movb $16, %cl
+; X64-NOBMI2-NEXT:    subb %sil, %cl
 ; X64-NOBMI2-NEXT:    shrl %cl, %eax
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NOBMI2-NEXT:    retq
@@ -898,8 +896,8 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind {
 ; X64-BMI2-LABEL: clear_lowbits16_ic0:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl %di, %eax
-; X64-BMI2-NEXT:    movl $16, %ecx
-; X64-BMI2-NEXT:    subl %esi, %ecx
+; X64-BMI2-NEXT:    movb $16, %cl
+; X64-BMI2-NEXT:    subb %sil, %cl
 ; X64-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X64-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X64-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -962,10 +960,9 @@ define i16 @clear_lowbits16_ic2_load(i16* %w, i16 %numlowbits) nounwind {
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
-; X86-NOBMI2-NEXT:    movw $16, %cx
-; X86-NOBMI2-NEXT:    subw {{[0-9]+}}(%esp), %cx
+; X86-NOBMI2-NEXT:    movb $16, %cl
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
-; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $cx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NOBMI2-NEXT:    retl
@@ -974,8 +971,8 @@ define i16 @clear_lowbits16_ic2_load(i16* %w, i16 %numlowbits) nounwind {
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzwl (%eax), %eax
-; X86-BMI2-NEXT:    movw $16, %cx
-; X86-BMI2-NEXT:    subw {{[0-9]+}}(%esp), %cx
+; X86-BMI2-NEXT:    movb $16, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -984,10 +981,9 @@ define i16 @clear_lowbits16_ic2_load(i16* %w, i16 %numlowbits) nounwind {
 ; X64-NOBMI2-LABEL: clear_lowbits16_ic2_load:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movzwl (%rdi), %eax
-; X64-NOBMI2-NEXT:    movl $16, %ecx
-; X64-NOBMI2-NEXT:    subl %esi, %ecx
+; X64-NOBMI2-NEXT:    movb $16, %cl
+; X64-NOBMI2-NEXT:    subb %sil, %cl
 ; X64-NOBMI2-NEXT:    shrl %cl, %eax
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NOBMI2-NEXT:    retq
@@ -995,8 +991,8 @@ define i16 @clear_lowbits16_ic2_load(i16* %w, i16 %numlowbits) nounwind {
 ; X64-BMI2-LABEL: clear_lowbits16_ic2_load:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-BMI2-NEXT:    movl $16, %ecx
-; X64-BMI2-NEXT:    subl %esi, %ecx
+; X64-BMI2-NEXT:    movb $16, %cl
+; X64-BMI2-NEXT:    subb %sil, %cl
 ; X64-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X64-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X64-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1062,10 +1058,9 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind
 ; X86-NOBMI2-LABEL: clear_lowbits16_ic4_commutative:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    movw $16, %cx
-; X86-NOBMI2-NEXT:    subw {{[0-9]+}}(%esp), %cx
+; X86-NOBMI2-NEXT:    movb $16, %cl
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
-; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $cx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NOBMI2-NEXT:    retl
@@ -1073,8 +1068,8 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind
 ; X86-BMI2-LABEL: clear_lowbits16_ic4_commutative:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movw $16, %cx
-; X86-BMI2-NEXT:    subw {{[0-9]+}}(%esp), %cx
+; X86-BMI2-NEXT:    movb $16, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1083,10 +1078,9 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind
 ; X64-NOBMI2-LABEL: clear_lowbits16_ic4_commutative:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movzwl %di, %eax
-; X64-NOBMI2-NEXT:    movl $16, %ecx
-; X64-NOBMI2-NEXT:    subl %esi, %ecx
+; X64-NOBMI2-NEXT:    movb $16, %cl
+; X64-NOBMI2-NEXT:    subb %sil, %cl
 ; X64-NOBMI2-NEXT:    shrl %cl, %eax
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NOBMI2-NEXT:    retq
@@ -1094,8 +1088,8 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind
 ; X64-BMI2-LABEL: clear_lowbits16_ic4_commutative:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl %di, %eax
-; X64-BMI2-NEXT:    movl $16, %ecx
-; X64-BMI2-NEXT:    subl %esi, %ecx
+; X64-BMI2-NEXT:    movb $16, %cl
+; X64-BMI2-NEXT:    subb %sil, %cl
 ; X64-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X64-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X64-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1113,7 +1107,7 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1122,7 +1116,7 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI2-LABEL: clear_lowbits32_ic0:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
@@ -1131,7 +1125,7 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
-; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    negb %cl
 ; X64-NOBMI2-NEXT:    shrl %cl, %eax
 ; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI2-NEXT:    shll %cl, %eax
@@ -1139,7 +1133,7 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_ic0:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    negl %esi
+; X64-BMI2-NEXT:    negb %sil
 ; X64-BMI2-NEXT:    shrxl %esi, %edi, %eax
 ; X64-BMI2-NEXT:    shlxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    retq
@@ -1197,7 +1191,7 @@ define i32 @clear_lowbits32_ic2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movl (%eax), %eax
 ; X86-NOBMI2-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1207,7 +1201,7 @@ define i32 @clear_lowbits32_ic2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    xorl %ecx, %ecx
-; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    shrxl %ecx, (%eax), %eax
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
@@ -1216,7 +1210,7 @@ define i32 @clear_lowbits32_ic2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
 ; X64-NOBMI2-NEXT:    movl (%rdi), %eax
-; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    negb %cl
 ; X64-NOBMI2-NEXT:    shrl %cl, %eax
 ; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI2-NEXT:    shll %cl, %eax
@@ -1224,7 +1218,7 @@ define i32 @clear_lowbits32_ic2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_ic2_load:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    negl %esi
+; X64-BMI2-NEXT:    negb %sil
 ; X64-BMI2-NEXT:    shrxl %esi, (%rdi), %eax
 ; X64-BMI2-NEXT:    shlxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    retq
@@ -1285,7 +1279,7 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1294,7 +1288,7 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind
 ; X86-BMI2-LABEL: clear_lowbits32_ic4_commutative:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
@@ -1303,7 +1297,7 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
-; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    negb %cl
 ; X64-NOBMI2-NEXT:    shrl %cl, %eax
 ; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI2-NEXT:    shll %cl, %eax
@@ -1311,7 +1305,7 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_ic4_commutative:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    negl %esi
+; X64-BMI2-NEXT:    negb %sil
 ; X64-BMI2-NEXT:    shrxl %esi, %edi, %eax
 ; X64-BMI2-NEXT:    shlxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    retq
@@ -1326,8 +1320,8 @@ define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind
 define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits64_ic0:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl $64, %ecx
-; X86-NOBMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    movb $64, %cl
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1344,8 +1338,8 @@ define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl $64, %ecx
-; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
@@ -1363,7 +1357,7 @@ define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
-; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    negb %cl
 ; X64-NOBMI2-NEXT:    shrq %cl, %rax
 ; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI2-NEXT:    shlq %cl, %rax
@@ -1371,7 +1365,7 @@ define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_ic0:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    negl %esi
+; X64-BMI2-NEXT:    negb %sil
 ; X64-BMI2-NEXT:    shrxq %rsi, %rdi, %rax
 ; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    retq
@@ -1446,8 +1440,8 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl $64, %ecx
-; X86-NOBMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    movb $64, %cl
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1467,8 +1461,8 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl $64, %ecx
-; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
@@ -1487,7 +1481,7 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI2-NEXT:    movq (%rdi), %rax
-; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    negb %cl
 ; X64-NOBMI2-NEXT:    shrq %cl, %rax
 ; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI2-NEXT:    shlq %cl, %rax
@@ -1495,7 +1489,7 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_ic2_load:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    negl %esi
+; X64-BMI2-NEXT:    negb %sil
 ; X64-BMI2-NEXT:    shrxq %rsi, (%rdi), %rax
 ; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    retq
@@ -1576,8 +1570,8 @@ define i64 @clear_lowbits64_ic3_load_indexzext(i64* %w, i8 %numlowbits) nounwind
 define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits64_ic4_commutative:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl $64, %ecx
-; X86-NOBMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    movb $64, %cl
+; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1594,8 +1588,8 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic4_commutative:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl $64, %ecx
-; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
@@ -1613,7 +1607,7 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
-; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    negb %cl
 ; X64-NOBMI2-NEXT:    shrq %cl, %rax
 ; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI2-NEXT:    shlq %cl, %rax
@@ -1621,7 +1615,7 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_ic4_commutative:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    negl %esi
+; X64-BMI2-NEXT:    negb %sil
 ; X64-BMI2-NEXT:    shrxq %rsi, %rdi, %rax
 ; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    retq
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 2e92d8e1ad1a7906a81215e43fb2492ef2688f04..a504538b29dba8e59dc34964717c764c61292cde 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -81,7 +81,7 @@ define i1 @test4() nounwind {
 ; CHECK-NEXT:    movsbl {{.*}}(%rip), %edx
 ; CHECK-NEXT:    movzbl %dl, %ecx
 ; CHECK-NEXT:    shrl $7, %ecx
-; CHECK-NEXT:    xorl $1, %ecx
+; CHECK-NEXT:    xorb $1, %cl
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    sarl %cl, %edx
 ; CHECK-NEXT:    movb {{.*}}(%rip), %al
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index b6ecda0208c81352d22ea55bb8f237a3779117f6..2747eca6b9aa9626a8cc6f95b198736559ccba17 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -465,3 +465,21 @@ entry:
   ret i32 %ret
 
 }
+
+define { i64, i64 } @pr39968(i64, i64, i32) {
+; CHECK-LABEL: pr39968:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    testb $64, %dl # encoding: [0xf6,0xc2,0x40]
+; CHECK-NEXT:    cmovneq %rdi, %rsi # encoding: [0x48,0x0f,0x45,0xf7]
+; CHECK-NEXT:    cmovneq %rdi, %rax # encoding: [0x48,0x0f,0x45,0xc7]
+; CHECK-NEXT:    movq %rsi, %rdx # encoding: [0x48,0x89,0xf2]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %4 = and i32 %2, 64
+  %5 = icmp ne i32 %4, 0
+  %6 = select i1 %5, i64 %0, i64 %1
+  %7 = select i1 %5, i64 %0, i64 0
+  %8 = insertvalue { i64, i64 } undef, i64 %7, 0
+  %9 = insertvalue { i64, i64 } %8, i64 %6, 1
+  ret { i64, i64 } %9
+}
diff --git a/test/CodeGen/X86/cmpxchg-i128-i1.ll b/test/CodeGen/X86/cmpxchg-i128-i1.ll
index 42f29f3a1d2d4e97177077b34d18a682678ea5e8..d9c83756c345466f8722551b0e1f51450662419f 100644
--- a/test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ b/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -91,8 +91,8 @@ define i128 @cmpxchg_zext(i128* %addr, i128 %desired, i128 %new) {
 ; CHECK-NEXT:    movq %r8, %rcx
 ; CHECK-NEXT:    lock cmpxchg16b (%rdi)
 ; CHECK-NEXT:    sete %sil
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/cmpxchg16b.ll b/test/CodeGen/X86/cmpxchg16b.ll
index aa29cc5da139ee8cb9a6ac7aa9c996d252394717..805f5d022e2864d96c37c5507250d8fd4c195268 100644
--- a/test/CodeGen/X86/cmpxchg16b.ll
+++ b/test/CodeGen/X86/cmpxchg16b.ll
@@ -6,10 +6,10 @@ define void @t1(i128* nocapture %p) nounwind ssp {
 ; CHECK-LABEL: t1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl $1, %ebx
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    movl $1, %ebx
 ; CHECK-NEXT:    lock cmpxchg16b (%rdi)
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/code-model-elf-memset.ll b/test/CodeGen/X86/code-model-elf-memset.ll
index 2f429f32eab067301d97c7de4a7b639992dc9b99..f349b6412850f2f15a71d4c1151ac47921a349aa 100644
--- a/test/CodeGen/X86/code-model-elf-memset.ll
+++ b/test/CodeGen/X86/code-model-elf-memset.ll
@@ -30,8 +30,8 @@ define i32 @main() #0 {
 ; SMALL-PIC-NEXT:    .cfi_def_cfa_offset 432
 ; SMALL-PIC-NEXT:    movl $0, {{[0-9]+}}(%rsp)
 ; SMALL-PIC-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SMALL-PIC-NEXT:    xorl %esi, %esi
 ; SMALL-PIC-NEXT:    movl $400, %edx # imm = 0x190
+; SMALL-PIC-NEXT:    xorl %esi, %esi
 ; SMALL-PIC-NEXT:    callq memset@PLT
 ; SMALL-PIC-NEXT:    xorl %eax, %eax
 ; SMALL-PIC-NEXT:    addq $424, %rsp # imm = 0x1A8
@@ -44,8 +44,8 @@ define i32 @main() #0 {
 ; MEDIUM-PIC-NEXT:    .cfi_def_cfa_offset 432
 ; MEDIUM-PIC-NEXT:    movl $0, {{[0-9]+}}(%rsp)
 ; MEDIUM-PIC-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; MEDIUM-PIC-NEXT:    xorl %esi, %esi
 ; MEDIUM-PIC-NEXT:    movl $400, %edx # imm = 0x190
+; MEDIUM-PIC-NEXT:    xorl %esi, %esi
 ; MEDIUM-PIC-NEXT:    callq memset@PLT
 ; MEDIUM-PIC-NEXT:    xorl %eax, %eax
 ; MEDIUM-PIC-NEXT:    addq $424, %rsp # imm = 0x1A8
@@ -57,14 +57,14 @@ define i32 @main() #0 {
 ; LARGE-PIC-NEXT:    subq $424, %rsp # imm = 0x1A8
 ; LARGE-PIC-NEXT:    .cfi_def_cfa_offset 432
 ; LARGE-PIC-NEXT:  .L0$pb:
-; LARGE-PIC-NEXT:    leaq .L0$pb(%rip), %rax
+; LARGE-PIC-NEXT:    leaq .L0${{.*}}(%rip), %rax
 ; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movl $0, {{[0-9]+}}(%rsp)
 ; LARGE-PIC-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; LARGE-PIC-NEXT:    movabsq $memset@GOT, %rax
-; LARGE-PIC-NEXT:    xorl %esi, %esi
 ; LARGE-PIC-NEXT:    movl $400, %edx # imm = 0x190
+; LARGE-PIC-NEXT:    xorl %esi, %esi
 ; LARGE-PIC-NEXT:    callq *(%rcx,%rax)
 ; LARGE-PIC-NEXT:    xorl %eax, %eax
 ; LARGE-PIC-NEXT:    addq $424, %rsp # imm = 0x1A8
diff --git a/test/CodeGen/X86/codemodel.ll b/test/CodeGen/X86/codemodel.ll
index edea6326027083117932309f7ff14df022eb2951..8febb3f0b0a5dc29de8ee7de4f45b06533ac5c60 100644
--- a/test/CodeGen/X86/codemodel.ll
+++ b/test/CodeGen/X86/codemodel.ll
@@ -1,10 +1,13 @@
 ; RUN: llc < %s -code-model=small  | FileCheck -check-prefix CHECK-SMALL %s
 ; RUN: llc < %s -code-model=kernel | FileCheck -check-prefix CHECK-KERNEL %s
+; RUN: not llc < %s -code-model=tiny 2>&1 | FileCheck -check-prefix CHECK-TINY %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 @data = external global [0 x i32]		; <[0 x i32]*> [#uses=5]
 
+; CHECK-TINY:    Target does not support the tiny CodeModel
+
 define i32 @foo() nounwind readonly {
 entry:
 ; CHECK-SMALL-LABEL:  foo:
diff --git a/test/CodeGen/X86/combine-concatvectors.ll b/test/CodeGen/X86/combine-concatvectors.ll
new file mode 100644
index 0000000000000000000000000000000000000000..40568d2388949d4f96795d5dc3e57876d9d3c275
--- /dev/null
+++ b/test/CodeGen/X86/combine-concatvectors.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
+
+define void @PR32957(<2 x float>* %in, <8 x float>* %out) {
+; CHECK-LABEL: PR32957:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovaps %ymm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %ld = load <2 x float>, <2 x float>* %in, align 8
+  %ext = extractelement <2 x float> %ld, i64 0
+  %ext2 = extractelement <2 x float> %ld, i64 1
+  %ins = insertelement <8 x float> <float undef, float undef, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, float %ext, i64 0
+  %ins2 = insertelement <8 x float> %ins, float %ext2, i64 1
+  store <8 x float> %ins2, <8 x float>* %out, align 32
+  ret void
+}
+
+declare { i8, double } @fun()
+
+; Check that this does not fail to combine concat_vectors of a value from
+; merge_values through a bitcast.
+define void @d(i1 %cmp) {
+; CHECK-LABEL: d:
+; CHECK:       # %bb.0: # %bar
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq fun
+bar:
+  %val = call { i8, double } @fun()
+  %extr = extractvalue { i8, double } %val, 1
+  %bc = bitcast double %extr to <2 x float>
+  br label %baz
+
+baz:
+  %extr1 = extractelement <2 x float> %bc, i64 0
+  unreachable
+}
diff --git a/test/CodeGen/X86/combine-mul.ll b/test/CodeGen/X86/combine-mul.ll
index 78278409ebd023c131da73d167ab5112498bea1f..f05bbbb885e49141c32e70b690c732ea8d2fc623 100644
--- a/test/CodeGen/X86/combine-mul.ll
+++ b/test/CodeGen/X86/combine-mul.ll
@@ -289,11 +289,9 @@ define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
 define <16 x i8> @PR35579(<16 x i8> %x) {
 ; SSE-LABEL: PR35579:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT:    pmullw %xmm2, %xmm0
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
@@ -304,13 +302,11 @@ define <16 x i8> @PR35579(<16 x i8> %x) {
 ;
 ; AVX-LABEL: PR35579:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
   %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll
index a78ecd27c99057693f5d3bad8cea85b41655f5ba..6fff298dbb3e3011bb8a20227f1b66a1f806d522 100644
--- a/test/CodeGen/X86/combine-sdiv.ll
+++ b/test/CodeGen/X86/combine-sdiv.ll
@@ -1015,8 +1015,7 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
 ; SSE2-NEXT:    psrld $29, %xmm3
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
 ; SSE2-NEXT:    psrld $30, %xmm1
-; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    psrad $4, %xmm2
@@ -1039,18 +1038,17 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
 ; SSE41-NEXT:    psrld $30, %xmm3
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT:    psrld $29, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT:    paddd %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psrad $4, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrad $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psrad $2, %xmm3
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    psrad $3, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    psrad $3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
@@ -1060,8 +1058,6 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
 ; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
@@ -1104,8 +1100,7 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
 ; SSE2-NEXT:    psrld $29, %xmm4
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
 ; SSE2-NEXT:    psrld $30, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    psrad $4, %xmm3
@@ -1123,8 +1118,7 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
 ; SSE2-NEXT:    psrld $29, %xmm4
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
 ; SSE2-NEXT:    psrld $30, %xmm2
-; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    psrad $4, %xmm3
@@ -1147,28 +1141,25 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
 ; SSE41-NEXT:    psrld $30, %xmm4
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; SSE41-NEXT:    psrld $29, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
 ; SSE41-NEXT:    paddd %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad $4, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    psrad $4, %xmm4
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    psrad $2, %xmm5
-; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    psrad $2, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; SSE41-NEXT:    psrad $3, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrld $28, %xmm3
 ; SSE41-NEXT:    movdqa %xmm2, %xmm4
-; SSE41-NEXT:    psrld $28, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
-; SSE41-NEXT:    psrld $30, %xmm5
-; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    psrld $30, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; SSE41-NEXT:    psrld $29, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
 ; SSE41-NEXT:    paddd %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    psrad $4, %xmm3
@@ -1189,8 +1180,6 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
 ; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm4
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
@@ -1201,10 +1190,9 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
 ; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
-; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
@@ -1258,8 +1246,7 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 ; SSE2-NEXT:    psrld $29, %xmm6
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
 ; SSE2-NEXT:    psrld $30, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; SSE2-NEXT:    psrad $4, %xmm5
@@ -1277,8 +1264,7 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 ; SSE2-NEXT:    psrld $29, %xmm6
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
 ; SSE2-NEXT:    psrld $30, %xmm1
-; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
 ; SSE2-NEXT:    paddd %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
 ; SSE2-NEXT:    psrad $4, %xmm5
@@ -1296,8 +1282,7 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 ; SSE2-NEXT:    psrld $29, %xmm6
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
 ; SSE2-NEXT:    psrld $30, %xmm4
-; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm4
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    psrad $4, %xmm5
@@ -1315,8 +1300,7 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 ; SSE2-NEXT:    psrld $29, %xmm6
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
 ; SSE2-NEXT:    psrld $30, %xmm5
-; SSE2-NEXT:    pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
 ; SSE2-NEXT:    paddd %xmm3, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, %xmm2
 ; SSE2-NEXT:    psrad $4, %xmm2
@@ -1337,70 +1321,65 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 ; SSE41-NEXT:    psrad $31, %xmm0
 ; SSE41-NEXT:    movdqa %xmm0, %xmm5
 ; SSE41-NEXT:    psrld $28, %xmm5
-; SSE41-NEXT:    movdqa %xmm0, %xmm7
-; SSE41-NEXT:    psrld $30, %xmm7
-; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm6
+; SSE41-NEXT:    psrld $30, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT:    psrld $29, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm6
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
 ; SSE41-NEXT:    paddd %xmm1, %xmm0
 ; SSE41-NEXT:    movdqa %xmm0, %xmm5
 ; SSE41-NEXT:    psrad $4, %xmm5
-; SSE41-NEXT:    movdqa %xmm0, %xmm7
-; SSE41-NEXT:    psrad $2, %xmm7
-; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm6
+; SSE41-NEXT:    psrad $2, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT:    psrad $3, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
 ; SSE41-NEXT:    movdqa %xmm4, %xmm1
 ; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm5
 ; SSE41-NEXT:    psrld $28, %xmm5
-; SSE41-NEXT:    movdqa %xmm1, %xmm7
-; SSE41-NEXT:    psrld $30, %xmm7
-; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm6
+; SSE41-NEXT:    psrld $30, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT:    psrld $29, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
 ; SSE41-NEXT:    paddd %xmm4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm5
 ; SSE41-NEXT:    psrad $4, %xmm5
-; SSE41-NEXT:    movdqa %xmm1, %xmm7
-; SSE41-NEXT:    psrad $2, %xmm7
-; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm6
+; SSE41-NEXT:    psrad $2, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT:    psrad $3, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm4
 ; SSE41-NEXT:    psrad $31, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    psrld $28, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm7
-; SSE41-NEXT:    psrld $30, %xmm7
-; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    psrld $30, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT:    psrld $29, %xmm4
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
 ; SSE41-NEXT:    paddd %xmm2, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    psrad $4, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm7
-; SSE41-NEXT:    psrad $2, %xmm7
-; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    psrad $2, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT:    psrad $3, %xmm4
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm5
 ; SSE41-NEXT:    psrad $31, %xmm5
 ; SSE41-NEXT:    movdqa %xmm5, %xmm2
 ; SSE41-NEXT:    psrld $28, %xmm2
-; SSE41-NEXT:    movdqa %xmm5, %xmm7
-; SSE41-NEXT:    psrld $30, %xmm7
-; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm5, %xmm6
+; SSE41-NEXT:    psrld $30, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT:    psrld $29, %xmm5
-; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
 ; SSE41-NEXT:    paddd %xmm3, %xmm5
 ; SSE41-NEXT:    movdqa %xmm5, %xmm2
 ; SSE41-NEXT:    psrad $4, %xmm2
@@ -1416,68 +1395,63 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm2
-; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm4
-; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm5
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpsrad $4, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrad $2, %xmm3, %xmm5
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
 ; AVX1-NEXT:    vpsrad $3, %xmm3, %xmm5
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT:    vpsrld $28, %xmm4, %xmm5
-; AVX1-NEXT:    vpsrld $30, %xmm4, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT:    vpsrld $29, %xmm4, %xmm4
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
-; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm4
-; AVX1-NEXT:    vpsrad $4, %xmm4, %xmm5
-; AVX1-NEXT:    vpsrad $2, %xmm4, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT:    vpsrad $3, %xmm4, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm4
-; AVX1-NEXT:    vpsrld $28, %xmm4, %xmm5
-; AVX1-NEXT:    vpsrld $30, %xmm4, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT:    vpsrld $29, %xmm4, %xmm4
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
-; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpsrad $4, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrad $2, %xmm3, %xmm5
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
 ; AVX1-NEXT:    vpsrad $3, %xmm3, %xmm5
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT:    vpsrld $28, %xmm4, %xmm5
-; AVX1-NEXT:    vpsrld $30, %xmm4, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT:    vpsrld $29, %xmm4, %xmm4
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm4
-; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
 ; AVX1-NEXT:    retq
 ;
@@ -1559,16 +1533,11 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    psrlq $62, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT:    paddq %xmm0, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    paddq %xmm0, %xmm1
 ; SSE2-NEXT:    psrlq $2, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
-; SSE2-NEXT:    xorpd %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psubq %xmm2, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
@@ -1578,29 +1547,23 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    psrlq $62, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    paddq %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    paddq %xmm0, %xmm1
 ; SSE41-NEXT:    psrlq $2, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
 ; SSE41-NEXT:    pxor %xmm2, %xmm1
 ; SSE41-NEXT:    psubq %xmm2, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpsrlq $62, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrlq $2, %xmm1, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrlq $2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
@@ -1684,16 +1647,11 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; SSE2-NEXT:    psubq %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    psrlq $62, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT:    paddq %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    paddq %xmm0, %xmm2
 ; SSE2-NEXT:    psrlq $2, %xmm2
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
-; SSE2-NEXT:    xorpd %xmm3, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
+; SSE2-NEXT:    pxor %xmm3, %xmm2
 ; SSE2-NEXT:    psubq %xmm3, %xmm2
 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
 ; SSE2-NEXT:    movapd %xmm2, %xmm0
@@ -1718,18 +1676,14 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; SSE41-NEXT:    psubq %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psrad $31, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    psrlq $62, %xmm2
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT:    paddq %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    paddq %xmm0, %xmm2
 ; SSE41-NEXT:    psrlq $2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
 ; SSE41-NEXT:    pxor %xmm3, %xmm2
 ; SSE41-NEXT:    psubq %xmm3, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
@@ -1747,9 +1701,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
 ; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vpsrlq $62, %xmm3, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $62, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
@@ -1850,30 +1803,20 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; SSE2-NEXT:    psubq %xmm5, %xmm3
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    psrlq $62, %xmm4
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1]
-; SSE2-NEXT:    paddq %xmm0, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    paddq %xmm0, %xmm4
 ; SSE2-NEXT:    psrlq $2, %xmm4
-; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm7 = [9223372036854775808,2305843009213693952]
-; SSE2-NEXT:    xorpd %xmm7, %xmm4
-; SSE2-NEXT:    psubq %xmm7, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952]
+; SSE2-NEXT:    pxor %xmm6, %xmm4
+; SSE2-NEXT:    psubq %xmm6, %xmm4
 ; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    psrlq $62, %xmm0
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm0[1]
-; SSE2-NEXT:    paddq %xmm2, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrad $31, %xmm5
+; SSE2-NEXT:    psrlq $62, %xmm5
+; SSE2-NEXT:    paddq %xmm2, %xmm5
 ; SSE2-NEXT:    psrlq $2, %xmm5
-; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
-; SSE2-NEXT:    xorpd %xmm7, %xmm5
-; SSE2-NEXT:    psubq %xmm7, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm5
+; SSE2-NEXT:    psubq %xmm6, %xmm5
 ; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
 ; SSE2-NEXT:    movapd %xmm4, %xmm0
 ; SSE2-NEXT:    movapd %xmm5, %xmm2
@@ -1913,30 +1856,23 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; SSE41-NEXT:    psubq %xmm5, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    psrad $31, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    psrlq $62, %xmm4
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
 ; SSE41-NEXT:    paddq %xmm0, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
-; SSE41-NEXT:    psrlq $2, %xmm6
-; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,2305843009213693952]
-; SSE41-NEXT:    pxor %xmm4, %xmm6
-; SSE41-NEXT:    psubq %xmm4, %xmm6
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT:    movdqa %xmm2, %xmm6
-; SSE41-NEXT:    psrad $31, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    psrlq $62, %xmm6
-; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT:    paddq %xmm2, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    psrlq $2, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952]
+; SSE41-NEXT:    pxor %xmm6, %xmm4
+; SSE41-NEXT:    psubq %xmm6, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    psrad $31, %xmm5
+; SSE41-NEXT:    psrlq $62, %xmm5
+; SSE41-NEXT:    paddq %xmm2, %xmm5
 ; SSE41-NEXT:    psrlq $2, %xmm5
-; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT:    pxor %xmm4, %xmm5
-; SSE41-NEXT:    psubq %xmm4, %xmm5
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm6, %xmm5
+; SSE41-NEXT:    psubq %xmm6, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
@@ -1956,7 +1892,6 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
 ; AVX1-NEXT:    vpsrlq $62, %xmm5, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
 ; AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpsrlq $2, %xmm5, %xmm6
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
@@ -1976,9 +1911,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm4
-; AVX1-NEXT:    vpsrlq $62, %xmm4, %xmm4
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $62, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm4
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
@@ -2081,8 +2015,7 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 ; SSE2-NEXT:    psrld $29, %xmm3
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
 ; SSE2-NEXT:    psrld $30, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    psrad $4, %xmm2
@@ -2102,27 +2035,27 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrld $28, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psrld $30, %xmm3
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT:    psrld $29, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; SSE41-NEXT:    paddd %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psrad $4, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    psrad $2, %xmm4
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT:    psrad $3, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT:    psubd %xmm0, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    psrld $29, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrad $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    psrad $2, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    psrad $3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
@@ -2132,17 +2065,16 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 ; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
-; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm4
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; AVX1-NEXT:    retq
 ;
@@ -2450,42 +2382,38 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
 ; SSE2-LABEL: non_splat_minus_one_divisor_2:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrld $31, %xmm2
-; SSE2-NEXT:    xorpd %xmm1, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    psrad $1, %xmm2
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT:    psubd %xmm2, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $31, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    psrad $1, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    psubd %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: non_splat_minus_one_divisor_2:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrld $31, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    paddd %xmm0, %xmm1
 ; SSE41-NEXT:    psrad $1, %xmm1
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    psubd %xmm1, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: non_splat_minus_one_divisor_2:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
 ; AVX1-NEXT:    retq
 ;
@@ -3054,68 +2982,51 @@ define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
 define <16 x i8> @pr38658(<16 x i8> %x) {
 ; SSE2-LABEL: pr38658:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    psrlw $8, %xmm2
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT:    psraw $8, %xmm3
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pmullw %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
-; SSE2-NEXT:    paddb %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT:    psraw $8, %xmm1
-; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    psllw $8, %xmm2
-; SSE2-NEXT:    psrlw $8, %xmm2
-; SSE2-NEXT:    packuswb %xmm1, %xmm2
-; SSE2-NEXT:    psrlw $7, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $7, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: pr38658:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
 ; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    packuswb %xmm2, %xmm0
-; SSE41-NEXT:    paddb %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT:    psraw $8, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psllw $6, %xmm2
-; SSE41-NEXT:    psllw $8, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
-; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    packuswb %xmm2, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT:    psraw $8, %xmm0
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE41-NEXT:    psraw $8, %xmm2
-; SSE41-NEXT:    psllw $8, %xmm2
-; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $7, %xmm0
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    paddb %xmm2, %xmm0
+; SSE41-NEXT:    psllw $6, %xmm2
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $7, %xmm1
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: pr38658:
@@ -3133,14 +3044,10 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -3159,7 +3066,6 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -3169,12 +3075,11 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
 ; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm1
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
@@ -3191,7 +3096,6 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll
index 8e3b03e9a4b0683836b375a2014c0fa9cec5f60a..3cb1dd65e9ce925dced13c6789851dd027d10e53 100644
--- a/test/CodeGen/X86/combine-shl.ll
+++ b/test/CodeGen/X86/combine-shl.ll
@@ -45,6 +45,20 @@ define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
   ret <4 x i32> %2
 }
 
+define <4 x i32> @combine_vec_shl_outofrange3(<4 x i32> %a0) {
+; SSE-LABEL: combine_vec_shl_outofrange3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_vec_shl_outofrange3:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = shl <4 x i32> %a0, <i32 33, i32 34, i32 35, i32 undef>
+  ret <4 x i32> %1
+}
+
 ; fold (shl x, 0) -> x
 define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
 ; CHECK-LABEL: combine_vec_shl_by_zero:
diff --git a/test/CodeGen/X86/combine-sra.ll b/test/CodeGen/X86/combine-sra.ll
index 9bce1a7c3b654313648ca13033988382c6fc8660..f8b536ac553ae8569167be8f942d100c3c96fd75 100644
--- a/test/CodeGen/X86/combine-sra.ll
+++ b/test/CodeGen/X86/combine-sra.ll
@@ -50,6 +50,19 @@ define <4 x i32> @combine_vec_ashr_outofrange1(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
+define <4 x i32> @combine_vec_ashr_outofrange2(<4 x i32> %x) {
+; SSE-LABEL: combine_vec_ashr_outofrange2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_vec_ashr_outofrange2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 undef>
+  ret <4 x i32> %1
+}
+
 ; fold (sra x, 0) -> x
 define <4 x i32> @combine_vec_ashr_by_zero(<4 x i32> %x) {
 ; CHECK-LABEL: combine_vec_ashr_by_zero:
diff --git a/test/CodeGen/X86/combine-srem.ll b/test/CodeGen/X86/combine-srem.ll
index dab3bdcedb264653e1b36037e2fc7f3cc152bb89..4878d708e48155355e5fbfd6cb5b836b04cbaa9d 100644
--- a/test/CodeGen/X86/combine-srem.ll
+++ b/test/CodeGen/X86/combine-srem.ll
@@ -226,7 +226,7 @@ define <4 x i32> @combine_vec_srem_by_pow2a_neg(<4 x i32> %x) {
 ; SSE-NEXT:    psrad $31, %xmm1
 ; SSE-NEXT:    psrld $30, %xmm1
 ; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    psrad $2, %xmm1
+; SSE-NEXT:    psrld $2, %xmm1
 ; SSE-NEXT:    pxor %xmm2, %xmm2
 ; SSE-NEXT:    psubd %xmm1, %xmm2
 ; SSE-NEXT:    pslld $2, %xmm2
@@ -238,7 +238,7 @@ define <4 x i32> @combine_vec_srem_by_pow2a_neg(<4 x i32> %x) {
 ; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
 ; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpsrad $2, %xmm1, %xmm1
+; AVX-NEXT:    vpsrld $2, %xmm1, %xmm1
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpslld $2, %xmm1, %xmm1
@@ -252,15 +252,13 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_srem_by_pow2b:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $31, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrad $31, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    psrld $29, %xmm3
-; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; SSE-NEXT:    psrld $30, %xmm2
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrld $29, %xmm2
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    psrld $31, %xmm3
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    psrld $30, %xmm1
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
 ; SSE-NEXT:    paddd %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
@@ -277,14 +275,12 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) {
 ;
 ; AVX1-LABEL: combine_vec_srem_by_pow2b:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
@@ -321,10 +317,10 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
 ; SSE-NEXT:    movdqa %xmm1, %xmm3
 ; SSE-NEXT:    psrld $30, %xmm3
 ; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrld $31, %xmm2
-; SSE-NEXT:    psrld $29, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrld $29, %xmm2
+; SSE-NEXT:    psrld $31, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
 ; SSE-NEXT:    paddd %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
@@ -348,9 +344,9 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
 ; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm3
-; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
@@ -399,8 +395,8 @@ define i32 @ossfuzz6883() {
 ; CHECK-NEXT:    cltd
 ; CHECK-NEXT:    idivl %esi
 ; CHECK-NEXT:    movl %edx, %edi
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %esi
 ; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 80dcb29209b431545752d03bfdb40d12505b85d9..2dd1fa74871af05e2952b0e098cccb1550149795 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -35,6 +35,19 @@ define <4 x i32> @combine_vec_lshr_outofrange1(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
+define <4 x i32> @combine_vec_lshr_outofrange2(<4 x i32> %x) {
+; SSE-LABEL: combine_vec_lshr_outofrange2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_vec_lshr_outofrange2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 undef>
+  ret <4 x i32> %1
+}
+
 ; fold (srl x, 0) -> x
 define <4 x i32> @combine_vec_lshr_by_zero(<4 x i32> %x) {
 ; CHECK-LABEL: combine_vec_lshr_by_zero:
@@ -237,25 +250,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
 define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
 ; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    psrlq $51, %xmm2
-; SSE-NEXT:    psrlq $50, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrlq $49, %xmm2
-; SSE-NEXT:    psrlq $48, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    packusdw %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $27, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrld $25, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $26, %xmm1
-; SSE-NEXT:    psrld $24, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
@@ -347,50 +342,48 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    pshufb %xmm0, %xmm3
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    psrlw $4, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    movdqa %xmm3, %xmm4
-; SSE-NEXT:    pshufb %xmm1, %xmm3
-; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE-NEXT:    pand %xmm0, %xmm5
-; SSE-NEXT:    pshufb %xmm5, %xmm4
-; SSE-NEXT:    pand %xmm1, %xmm4
-; SSE-NEXT:    paddb %xmm4, %xmm3
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pshufb %xmm1, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm1
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm3
-; SSE-NEXT:    paddw %xmm1, %xmm3
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE-NEXT:    paddb %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE-NEXT:    psrlw $8, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    psrld $16, %xmm3
-; SSE-NEXT:    paddd %xmm3, %xmm0
-; SSE-NEXT:    psrld $5, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    psrld $5, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm3
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm4
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
-; AVX-NEXT:    vpand %xmm3, %xmm4, %xmm3
-; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX-NEXT:    vpand %xmm3, %xmm1, %xmm3
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
+; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll
index 346e54476273ddca8193293715d0cb57c76005d4..bfd3095ab966f5f3e28e94b8c18c654937cd5b44 100644
--- a/test/CodeGen/X86/combine-udiv.ll
+++ b/test/CodeGen/X86/combine-udiv.ll
@@ -647,18 +647,17 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movl $255, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    movl $171, %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    pmullw %xmm2, %xmm3
-; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pmullw %xmm2, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    movl $255, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_udiv_nonuniform4:
@@ -670,14 +669,11 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE41-NEXT:    pmullw %xmm0, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    psllw $8, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    packuswb %xmm3, %xmm2
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT:    psllw $1, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; SSE41-NEXT:    psllw $1, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -690,14 +686,11 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
@@ -714,8 +707,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -726,13 +718,9 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; XOP-NEXT:    movl $171, %eax
 ; XOP-NEXT:    vmovd %eax, %xmm1
 ; XOP-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; XOP-NEXT:    vpmullw %xmm1, %xmm2, %xmm2
-; XOP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; XOP-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; XOP-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; XOP-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
-; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
+; XOP-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
 ; XOP-NEXT:    movl $249, %eax
 ; XOP-NEXT:    vmovd %eax, %xmm2
 ; XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/commandline-metadata.ll b/test/CodeGen/X86/commandline-metadata.ll
new file mode 100644
index 0000000000000000000000000000000000000000..de77dd27b160b4a617bddcbe9d9b2e62e5ca3fc9
--- /dev/null
+++ b/test/CodeGen/X86/commandline-metadata.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+; Verify that llvm.commandline metadata is emitted to a section named
+; .GCC.command.line with each line separated with null bytes.
+
+; CHECK: .section .GCC.command.line,"MS",@progbits,1
+; CHECK-NEXT: .zero 1
+; CHECK-NEXT: .ascii "clang -command -line"
+; CHECK-NEXT: .zero 1
+; CHECK-NEXT: .ascii "something else"
+; CHECK-NEXT: .zero 1
+!llvm.commandline = !{!0, !1}
+!0 = !{!"clang -command -line"}
+!1 = !{!"something else"}
diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll
index 209991506da4d17a592fd268981b785fe24fdedb..c0e78348668b11f062208c4124f79fcc4378dda9 100644
--- a/test/CodeGen/X86/compress_expand.ll
+++ b/test/CodeGen/X86/compress_expand.ll
@@ -5,17 +5,15 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-
-
-define <16 x float> @test1(float* %base) {
-; SKX-LABEL: test1:
+define <16 x float> @expandload_v16f32_const_undef(float* %base) {
+; SKX-LABEL: expandload_v16f32_const_undef:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    movw $-2049, %ax # imm = 0xF7FF
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test1:
+; KNL-LABEL: expandload_v16f32_const_undef:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -25,15 +23,15 @@ define <16 x float> @test1(float* %base) {
   ret <16 x float>%res
 }
 
-define <16 x float> @test2(float* %base, <16 x float> %src0) {
-; SKX-LABEL: test2:
+define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
+; SKX-LABEL: expandload_v16f32_const:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    movw $30719, %ax # imm = 0x77FF
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test2:
+; KNL-LABEL: expandload_v16f32_const:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    movw $30719, %ax # imm = 0x77FF
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -43,17 +41,17 @@ define <16 x float> @test2(float* %base, <16 x float> %src0) {
   ret <16 x float>%res
 }
 
-define <8 x double> @test3(double* %base, <8 x double> %src0, <8 x i1> %mask) {
-; SKX-LABEL: test3:
+define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 x i1> %mask) {
+; SKX-LABEL: expandload_v8f64_v8i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovw2m %xmm1, %k1
 ; SKX-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test3:
+; KNL-LABEL: expandload_v8f64_v8i1:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
@@ -62,15 +60,15 @@ define <8 x double> @test3(double* %base, <8 x double> %src0, <8 x i1> %mask) {
   ret <8 x double>%res
 }
 
-define <4 x float> @test4(float* %base, <4 x float> %src0) {
-; SKX-LABEL: test4:
+define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
+; SKX-LABEL: expandload_v4f32_const:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    movb $7, %al
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test4:
+; KNL-LABEL: expandload_v4f32_const:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    movw $7, %ax
@@ -82,15 +80,15 @@ define <4 x float> @test4(float* %base, <4 x float> %src0) {
   ret <4 x float>%res
 }
 
-define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
-; SKX-LABEL: test5:
+define <2 x i64> @expandload_v2i64_const(i64* %base, <2 x i64> %src0) {
+; SKX-LABEL: expandload_v2i64_const:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    movb $2, %al
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vpexpandq (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test5:
+; KNL-LABEL: expandload_v2i64_const:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    movb $2, %al
@@ -107,8 +105,8 @@ declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x doubl
 declare <4 x float>  @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
 declare <2 x i64>    @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
 
-define void @test6(float* %base, <16 x float> %V) {
-; SKX-LABEL: test6:
+define void @compressstore_v16f32_const(float* %base, <16 x float> %V) {
+; SKX-LABEL: compressstore_v16f32_const:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    movw $-2049, %ax # imm = 0xF7FF
 ; SKX-NEXT:    kmovd %eax, %k1
@@ -116,7 +114,7 @@ define void @test6(float* %base, <16 x float> %V) {
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test6:
+; KNL-LABEL: compressstore_v16f32_const:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
 ; KNL-NEXT:    kmovw %eax, %k1
@@ -126,8 +124,8 @@ define void @test6(float* %base, <16 x float> %V) {
   ret void
 }
 
-define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
-; SKX-LABEL: test7:
+define void @compressstore_v8f32_v8i1(float* %base, <8 x float> %V, <8 x i1> %mask) {
+; SKX-LABEL: compressstore_v8f32_v8i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovw2m %xmm1, %k1
@@ -135,10 +133,10 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test7:
+; KNL-LABEL: compressstore_v8f32_v8i1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
@@ -147,8 +145,8 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
   ret void
 }
 
-define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
-; SKX-LABEL: test8:
+define void @compressstore_v8f64_v8i1(double* %base, <8 x double> %V, <8 x i1> %mask) {
+; SKX-LABEL: compressstore_v8f64_v8i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovw2m %xmm1, %k1
@@ -156,9 +154,9 @@ define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test8:
+; KNL-LABEL: compressstore_v8f64_v8i1:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
@@ -167,8 +165,8 @@ define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
   ret void
 }
 
-define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
-; SKX-LABEL: test9:
+define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
+; SKX-LABEL: compressstore_v8i64_v8i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovw2m %xmm1, %k1
@@ -176,9 +174,9 @@ define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test9:
+; KNL-LABEL: compressstore_v8i64_v8i1:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
@@ -187,8 +185,8 @@ define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
   ret void
 }
 
-define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
-; SKX-LABEL: test10:
+define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
+; SKX-LABEL: compressstore_v4i64_v4i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovd2m %xmm1, %k1
@@ -196,7 +194,7 @@ define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test10:
+; KNL-LABEL: compressstore_v4i64_v4i1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; KNL-NEXT:    vpslld $31, %xmm1, %xmm1
@@ -209,15 +207,15 @@ define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
   ret void
 }
 
-define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
-; SKX-LABEL: test11:
+define void @compressstore_v2i64_v2i1(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
+; SKX-LABEL: compressstore_v2i64_v2i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovq2m %xmm1, %k1
 ; SKX-NEXT:    vpcompressq %xmm0, (%rdi) {%k1}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test11:
+; KNL-LABEL: compressstore_v2i64_v2i1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
@@ -230,15 +228,15 @@ define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
   ret void
 }
 
-define void @test12(float* %base, <4 x float> %V, <4 x i1> %mask) {
-; SKX-LABEL: test12:
+define void @compressstore_v4f32_v4i1(float* %base, <4 x float> %V, <4 x i1> %mask) {
+; SKX-LABEL: compressstore_v4f32_v4i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovd2m %xmm1, %k1
 ; SKX-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test12:
+; KNL-LABEL: compressstore_v4f32_v4i1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vpslld $31, %xmm1, %xmm1
@@ -251,8 +249,8 @@ define void @test12(float* %base, <4 x float> %V, <4 x i1> %mask) {
   ret void
 }
 
-define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger) {
-; SKX-LABEL: test13:
+define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x i32> %trigger) {
+; SKX-LABEL: expandload_v2f32_v2i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
@@ -260,7 +258,7 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger)
 ; SKX-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test13:
+; KNL-LABEL: expandload_v2f32_v2i1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -276,8 +274,8 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger)
   ret <2 x float> %res
 }
 
-define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
-; SKX-LABEL: test14:
+define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> %trigger) {
+; SKX-LABEL: compressstore_v2f32_v2i32:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
@@ -285,7 +283,7 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
 ; SKX-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test14:
+; KNL-LABEL: compressstore_v2f32_v2i32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -300,8 +298,8 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
   ret void
 }
 
-define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigger) {
-; ALL-LABEL: test15:
+define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0, <32 x i32> %trigger) {
+; ALL-LABEL: expandload_v32f32_v32i32:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vptestnmd %zmm3, %zmm3, %k1
 ; ALL-NEXT:    vptestnmd %zmm2, %zmm2, %k2
@@ -315,8 +313,8 @@ define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigge
   ret <32 x float> %res
 }
 
-define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %trigger) {
-; SKX-LABEL: test16:
+define <16 x double> @compressstore_v16f64_v16i32(double* %base, <16 x double> %src0, <16 x i32> %trigger) {
+; SKX-LABEL: compressstore_v16f64_v16i32:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1
@@ -327,7 +325,7 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri
 ; SKX-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test16:
+; KNL-LABEL: compressstore_v16f64_v16i32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; KNL-NEXT:    vptestnmd %zmm3, %zmm3, %k1
@@ -343,8 +341,8 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri
   ret <16 x double> %res
 }
 
-define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
-; SKX-LABEL: test17:
+define void @compressstore_v32f32_v32i32(float* %base, <32 x float> %V, <32 x i32> %trigger) {
+; SKX-LABEL: compressstore_v32f32_v32i32:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1
 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k2
@@ -355,7 +353,7 @@ define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test17:
+; KNL-LABEL: compressstore_v32f32_v32i32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vptestnmd %zmm3, %zmm3, %k1
 ; KNL-NEXT:    vptestnmd %zmm2, %zmm2, %k2
@@ -369,8 +367,8 @@ define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
   ret void
 }
 
-define void @test18(double* %base, <16 x double> %V, <16 x i1> %mask) {
-; SKX-LABEL: test18:
+define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i1> %mask) {
+; SKX-LABEL: compressstore_v16f64_v16i1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $7, %xmm2, %xmm2
 ; SKX-NEXT:    vpmovb2m %xmm2, %k1
@@ -382,9 +380,9 @@ define void @test18(double* %base, <16 x double> %V, <16 x i1> %mask) {
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
-; KNL-LABEL: test18:
+; KNL-LABEL: compressstore_v16f64_v16i1:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL-NEXT:    kshiftrw $8, %k1, %k2
diff --git a/test/CodeGen/X86/condbr_if.ll b/test/CodeGen/X86/condbr_if.ll
index 80f188f98bb7e958623d46dfb7f9b9ba1a74a419..7b92f712be6dd27bba1526605b5d848fc102e11b 100644
--- a/test/CodeGen/X86/condbr_if.ll
+++ b/test/CodeGen/X86/condbr_if.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=sandybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=ivybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=haswell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=broadwell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skylake %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=NOTMERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=sandybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=ivybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=haswell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=broadwell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=skylake %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=skx %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=NOTMERGE
 
 define i32 @length2_1(i32) {
   %2 = icmp slt i32 %0, 3
diff --git a/test/CodeGen/X86/condbr_switch.ll b/test/CodeGen/X86/condbr_switch.ll
index ce1a21e171752407f153a99c68504b3b046f126e..079c0e2ab2026f1f07d95c22d400a3dc8956a5c0 100644
--- a/test/CodeGen/X86/condbr_switch.ll
+++ b/test/CodeGen/X86/condbr_switch.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=sandybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=ivybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=haswell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=broadwell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skylake %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=NOTMERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=sandybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=ivybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=haswell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=broadwell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=skylake %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=skx %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
+; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=NOTMERGE
 
 @v1 = common dso_local local_unnamed_addr global i32 0, align 4
 @v2 = common dso_local local_unnamed_addr global i32 0, align 4
diff --git a/test/CodeGen/X86/consecutive-load-shuffle.ll b/test/CodeGen/X86/consecutive-load-shuffle.ll
new file mode 100644
index 0000000000000000000000000000000000000000..69739b98d22401044a8d5b343b6f0ed184d1701e
--- /dev/null
+++ b/test/CodeGen/X86/consecutive-load-shuffle.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-windows | FileCheck %s
+
+; We should be able to prodcue a single 128-bit load for these two 64-bit loads.
+; But we previously weren't because we weren't consistently looking through
+; WrapperRIP.
+
+@f = local_unnamed_addr global [4 x float] zeroinitializer, align 16
+@ms = common local_unnamed_addr global <4 x float> zeroinitializer, align 16
+
+define void @foo2() {
+; CHECK-LABEL: foo2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movaps %xmm0, {{.*}}(%rip)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @f, i64 0, i64 2) to <2 x float>*), align 8
+  %shuffle.i10 = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %1 = load <2 x float>, <2 x float>* bitcast ([4 x float]* @f to <2 x float>*), align 16
+  %shuffle.i7 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle.i = shufflevector <4 x float> %shuffle.i7, <4 x float> %shuffle.i10, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  store <4 x float> %shuffle.i, <4 x float>* @ms, align 16
+  ret void
+}
diff --git a/test/CodeGen/X86/copysign-constant-magnitude.ll b/test/CodeGen/X86/copysign-constant-magnitude.ll
index 61cb6d0960d1a3e5e40ed44becdd13597e51340d..aea7b3580879142c3efee3271c8f69cbbfd5ab61 100644
--- a/test/CodeGen/X86/copysign-constant-magnitude.ll
+++ b/test/CodeGen/X86/copysign-constant-magnitude.ll
@@ -26,7 +26,6 @@ define double @mag_neg0_double(double %x) nounwind {
 ; CHECK-LABEL: mag_neg0_double:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movsd [[SIGNMASK2]](%rip), %xmm1
-; CHECK-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
 ; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
@@ -34,18 +33,17 @@ define double @mag_neg0_double(double %x) nounwind {
   ret double %y
 }
 
+; CHECK:        [[ONE3:L.+]]:
+; CHECK-NEXT:   .quad 4607182418800017408     ## double 1
 ; CHECK:        [[SIGNMASK3:L.+]]:
 ; CHECK-NEXT:   .quad -9223372036854775808    ## double -0
 ; CHECK-NEXT:   .quad -9223372036854775808    ## double -0
-; CHECK:        [[ONE3:L.+]]:
-; CHECK-NEXT:   .quad 4607182418800017408     ## double 1
 
 define double @mag_pos1_double(double %x) nounwind {
 ; CHECK-LABEL: mag_pos1_double:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps [[SIGNMASK3]](%rip), %xmm0
 ; CHECK-NEXT:    movsd [[ONE3]](%rip), %xmm1
-; CHECK-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
+; CHECK-NEXT:    andps [[SIGNMASK3]](%rip), %xmm0
 ; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
@@ -74,6 +72,8 @@ define double @mag_neg1_double(double %x) nounwind {
 ; CHECK:       [[SIGNMASK5:L.+]]:
 ; CHECK-NEXT:  .long 2147483648              ## float -0
 ; CHECK-NEXT:  .long 2147483648              ## float -0
+; CHECK-NEXT:  .long 2147483648              ## float -0
+; CHECK-NEXT:  .long 2147483648              ## float -0
 
 define float @mag_pos0_float(float %x) nounwind {
 ; CHECK-LABEL: mag_pos0_float:
@@ -92,7 +92,6 @@ define float @mag_neg0_float(float %x) nounwind {
 ; CHECK-LABEL: mag_neg0_float:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movss [[SIGNMASK6]](%rip), %xmm1
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
@@ -100,18 +99,19 @@ define float @mag_neg0_float(float %x) nounwind {
   ret float %y
 }
 
+; CHECK:        [[ONE7:L.+]]:
+; CHECK-NEXT:  .long 1065353216              ## float 1
 ; CHECK:       [[SIGNMASK7:L.+]]:
 ; CHECK-NEXT:  .long 2147483648              ## float -0
 ; CHECK-NEXT:  .long 2147483648              ## float -0
-; CHECK:        [[ONE7:L.+]]:
-; CHECK-NEXT:  .long 1065353216              ## float 1
+; CHECK-NEXT:  .long 2147483648              ## float -0
+; CHECK-NEXT:  .long 2147483648              ## float -0
 
 define float @mag_pos1_float(float %x) nounwind {
 ; CHECK-LABEL: mag_pos1_float:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps [[SIGNMASK7]](%rip), %xmm0
 ; CHECK-NEXT:    movss [[ONE7]](%rip), %xmm1
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT:    andps [[SIGNMASK7]](%rip), %xmm0
 ; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
@@ -122,6 +122,8 @@ define float @mag_pos1_float(float %x) nounwind {
 ; CHECK:       [[SIGNMASK8:L.+]]:
 ; CHECK-NEXT:  .long 2147483648              ## float -0
 ; CHECK-NEXT:  .long 2147483648              ## float -0
+; CHECK-NEXT:  .long 2147483648              ## float -0
+; CHECK-NEXT:  .long 2147483648              ## float -0
 ; CHECK:        [[ONE8:L.+]]:
 ; CHECK-NEXT:  .long 1065353216              ## float 1
 ; CHECK-NEXT:  .long 1065353216              ## float 1
diff --git a/test/CodeGen/X86/cpus-intel.ll b/test/CodeGen/X86/cpus-intel.ll
index 654c9661e183da12c8983d5497329b9607435716..6eb2138444bcd3cb2c9e34b42faf7140364e2684 100644
--- a/test/CodeGen/X86/cpus-intel.ll
+++ b/test/CodeGen/X86/cpus-intel.ll
@@ -34,6 +34,7 @@
 ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=skylake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
 ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
 ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=skx 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=cascadelake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
 ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=cannonlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
 ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=icelake-client 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
 ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=icelake-server 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 5616b063008dc1c5efdea1a0cbb32ce0c3e06b5b..d3116f1941cd68c350a137d6b64d2bedeb0b1484 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -7,9 +7,9 @@
 ; CHECK:      callq   _Z3fooPcjPKc
 ; CHECK:      callq   _Z3fooPcjPKc
 ; CHECK:      movq    %rsp, %rdi
-; CHECK:      movl    $4, %esi
 ; CHECK:      testl   {{%[a-z]+}}, {{%[a-z]+}}
 ; CHECK:      je     .LBB0_4
+; CHECK:      movl    $4, %esi
 
 ; Regenerate test with this command: 
 ;   clang++ -emit-llvm -S -O2 -g
diff --git a/test/CodeGen/X86/discriminate-mem-ops.ll b/test/CodeGen/X86/discriminate-mem-ops.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b77a91fafd2cd8681ed64d8882f4b4544f77f209
--- /dev/null
+++ b/test/CodeGen/X86/discriminate-mem-ops.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s | FileCheck %s
+;
+; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling:
+; int sum(int* arr, int pos1, int pos2) {
+;   return arr[pos1] + arr[pos2];
+; }
+;
+; ModuleID = 'test.cc'
+source_filename = "test.cc"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @sum(i32* %arr, i32 %pos1, i32 %pos2) !dbg !7 {
+entry:
+  %idxprom = sext i32 %pos1 to i64, !dbg !9
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom, !dbg !9
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !9, !tbaa !10
+  %idxprom1 = sext i32 %pos2 to i64, !dbg !14
+  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1, !dbg !14
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !14, !tbaa !10
+  %add = add nsw i32 %1, %0, !dbg !15
+  ret i32 %add, !dbg !16
+}
+
+attributes #0 = { "target-cpu"="x86-64" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
+!1 = !DIFile(filename: "test.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"}
+!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 2, column: 10, scope: !7)
+!10 = !{!11, !11, i64 0}
+!11 = !{!"int", !12, i64 0}
+!12 = !{!"omnipotent char", !13, i64 0}
+!13 = !{!"Simple C++ TBAA"}
+!14 = !DILocation(line: 2, column: 22, scope: !7)
+!15 = !DILocation(line: 2, column: 20, scope: !7)
+!16 = !DILocation(line: 2, column: 3, scope: !7)
+
+;CHECK-LABEL: sum:
+;CHECK:       # %bb.0:
+;CHECK:       movl (%rdi,%rax,4), %eax
+;CHECK-NEXT:  .loc 1 2 20 discriminator 2  # test.cc:2:20
+;CHECK-NEXT:  addl (%rdi,%rcx,4), %eax
+;CHECK-NEXT:  .loc 1 2 3                   # test.cc:2:3
diff --git a/test/CodeGen/X86/divrem.ll b/test/CodeGen/X86/divrem.ll
index 9cc19c6a417750d90048031be0eb5a275db788fb..0592dc9b35ca8fb42d5904b2e687a110303d9bd2 100644
--- a/test/CodeGen/X86/divrem.ll
+++ b/test/CodeGen/X86/divrem.ll
@@ -245,8 +245,8 @@ define void @ui16(i16 %x, i16 %y, i16* %p, i16* %q) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divw %si
 ; X64-NEXT:    movw %ax, (%r8)
 ; X64-NEXT:    movw %dx, (%rcx)
diff --git a/test/CodeGen/X86/dwarf-headers.ll b/test/CodeGen/X86/dwarf-headers.ll
index fa2080d1e1d31c743573ca52bfb62f1a00f88551..ef626ad7003bb5d1bc7ad9a9f7db7a57903041af 100644
--- a/test/CodeGen/X86/dwarf-headers.ll
+++ b/test/CodeGen/X86/dwarf-headers.ll
@@ -59,16 +59,16 @@
 ; DWO-4: 0x00000000: Type Unit: {{.*}} version = 0x0004 abbr_offset
 ; DWO-4: 0x00000017: DW_TAG_type_unit
 
-; Verify the v5 non-split headers.
+; Verify the v5 non-split headers. Type units come first.
+; All .debug_info sections are reported in one go, but the offset resets for
+; each new section.
 ;
 ; SINGLE-5: .debug_info contents:
-; SINGLE-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_compile abbr_offset
-; SINGLE-5: 0x0000000c: DW_TAG_compile_unit
-;
-; FIXME: V5 wants type units in .debug_info not .debug_types.
-; SINGLE-5: .debug_types contents:
 ; SINGLE-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_type abbr_offset
 ; SINGLE-5: 0x00000018: DW_TAG_type_unit
+; SINGLE-5-NOT: contents:
+; SINGLE-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_compile abbr_offset
+; SINGLE-5: 0x0000000c: DW_TAG_compile_unit
 
 ; Verify the v5 split headers.
 ;
@@ -78,14 +78,11 @@
 ; O-5: 0x00000014: DW_TAG_compile_unit
 ;
 ; DWO-5: .debug_info.dwo contents:
-; DWO-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_compile abbr_offset
-; DWO-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
-; DWO-5: 0x00000014: DW_TAG_compile_unit
-;
-; FIXME: V5 wants type units in .debug_info.dwo not .debug_types.dwo.
-; DWO-5: .debug_types.dwo contents:
 ; DWO-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_type abbr_offset
 ; DWO-5: 0x00000018: DW_TAG_type_unit
+; DWO-5: 0x00000033: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_compile abbr_offset
+; DWO-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
+; DWO-5: 0x00000047: DW_TAG_compile_unit
 
 
 ; ModuleID = 't.cpp'
diff --git a/test/CodeGen/X86/dwarf-split-line-1.ll b/test/CodeGen/X86/dwarf-split-line-1.ll
index 6a403668d73ce61c43946cf85a243bbe092ff097..cdc3e205b59ef690a35b2c07f44665324a3bf3fb 100644
--- a/test/CodeGen/X86/dwarf-split-line-1.ll
+++ b/test/CodeGen/X86/dwarf-split-line-1.ll
@@ -6,13 +6,13 @@
 ; RUN:     -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s
 ; RUN: llvm-dwarfdump -v %t.dwo | FileCheck %s
 
-; FIXME: V5 wants type units in .debug_info.dwo not .debug_types.dwo.
 ; CHECK-NOT: .debug_line.dwo
-; CHECK: .debug_types.dwo contents:
+; CHECK: .debug_info.dwo contents:
 ; CHECK: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_type abbr_offset
 ; CHECK: 0x00000018: DW_TAG_type_unit
 ; CHECK-NOT: DW_AT_stmt_list
 ; CHECK-NOT: DW_AT_decl_file
+; CHECK: Compile Unit:
 ; CHECK-NOT: .debug_line.dwo
 
 ; ModuleID = 't.cpp'
diff --git a/test/CodeGen/X86/dwarf-split-line-2.ll b/test/CodeGen/X86/dwarf-split-line-2.ll
index 3cdad4ddac834cc7f2586f78e233b0c3148e675b..d8a2d9b4bc75cc364e86206f983eff74a7d01e3b 100644
--- a/test/CodeGen/X86/dwarf-split-line-2.ll
+++ b/test/CodeGen/X86/dwarf-split-line-2.ll
@@ -8,8 +8,7 @@
 ; RUN: llvm-dwarfdump -v %t.dwo | FileCheck %s
 
 ; Currently the no-source-location type comes out first.
-; FIXME: V5 wants type units in .debug_info.dwo not .debug_types.dwo.
-; CHECK: .debug_types.dwo contents:
+; CHECK: .debug_info.dwo contents:
 ; CHECK: 0x00000000: Type Unit: {{.*}} name = 'S'
 ; CHECK-SAME: (next unit at [[TU2:0x[0-9a-f]+]])
 ; CHECK: DW_TAG_type_unit
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index cfe6ba571df8f1c2609925b365a2e1ca1132e973..b69c0c1f48b7572e2a46b77430768bed2c9c04ba 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -49,11 +49,10 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bextr32_a0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    orl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a0:
@@ -78,11 +77,10 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_a0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a0:
@@ -183,11 +181,10 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1NOTBM-LABEL: bextr32_a1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    orl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a1_indexzext:
@@ -212,11 +209,10 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_a1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a1_indexzext:
@@ -252,13 +248,12 @@ define i32 @bextr32_a2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a2_load:
@@ -285,12 +280,10 @@ define i32 @bextr32_a2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a2_load:
@@ -325,13 +318,12 @@ define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a3_load_indexzext:
@@ -358,12 +350,10 @@ define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a3_load_indexzext:
@@ -400,11 +390,10 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X86-BMI1NOTBM-LABEL: bextr32_a4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    orl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a4_commutative:
@@ -429,11 +418,10 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_a4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a4_commutative:
@@ -476,13 +464,13 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
-; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl %al, %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -523,10 +511,10 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_a5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -677,11 +665,10 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_a0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a0:
@@ -967,11 +954,10 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X64-BMI1NOTBM-LABEL: bextr64_a1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a1_indexzext:
@@ -1118,12 +1104,10 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a2_load:
@@ -1268,12 +1252,10 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X64-BMI1NOTBM-LABEL: bextr64_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a3_load_indexzext:
@@ -1417,11 +1399,10 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_a4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a4_commutative:
@@ -1605,10 +1586,10 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_a5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
@@ -1656,11 +1637,10 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bextr32_b0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    orl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b0:
@@ -1685,11 +1665,10 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_b0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b0:
@@ -1723,11 +1702,10 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1NOTBM-LABEL: bextr32_b1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    orl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b1_indexzext:
@@ -1752,11 +1730,10 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_b1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b1_indexzext:
@@ -1792,13 +1769,12 @@ define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b2_load:
@@ -1825,12 +1801,10 @@ define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_b2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b2_load:
@@ -1865,13 +1839,12 @@ define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b3_load_indexzext:
@@ -1898,12 +1871,10 @@ define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_b3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b3_load_indexzext:
@@ -1940,11 +1911,10 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X86-BMI1NOTBM-LABEL: bextr32_b4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    orl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b4_commutative:
@@ -1969,11 +1939,10 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_b4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b4_commutative:
@@ -2016,13 +1985,13 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
-; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl %al, %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -2063,10 +2032,10 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_b5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -2214,11 +2183,10 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_b0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b0:
@@ -2355,11 +2323,10 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X64-BMI1NOTBM-LABEL: bextr64_b1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b1_indexzext:
@@ -2503,12 +2470,10 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_b2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b2_load:
@@ -2650,12 +2615,10 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X64-BMI1NOTBM-LABEL: bextr64_b3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b3_load_indexzext:
@@ -2796,11 +2759,10 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_b4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b4_commutative:
@@ -2979,10 +2941,10 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_b5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
@@ -3021,7 +2983,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
@@ -3043,7 +3005,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
@@ -3058,22 +3020,22 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %edi
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
-; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, %esi, %eax
 ; X86-BMI1BMI2-NEXT:    addl $4, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
-; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c0:
@@ -3085,7 +3047,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebx
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebp
@@ -3107,7 +3069,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    negb %dl
 ; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
@@ -3127,8 +3089,8 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT:    pushq %rax
 ; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
 ; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %ebp
-; X64-BMI1BMI2-NEXT:    movl %edx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
@@ -3292,7 +3254,7 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    movl (%eax), %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
@@ -3315,7 +3277,7 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    movl (%eax), %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
@@ -3330,23 +3292,23 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, (%eax), %edi
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, (%eax), %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
-; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, %esi, %eax
 ; X86-BMI1BMI2-NEXT:    addl $4, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
-; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c2_load:
@@ -3358,7 +3320,7 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-NOBMI-NEXT:    movl (%rdi), %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebp
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl $-1, %ebx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebx
@@ -3380,7 +3342,7 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
-; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    negb %dl
 ; X64-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
@@ -3400,8 +3362,8 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1BMI2-NEXT:    pushq %rax
 ; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
 ; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %ebp
-; X64-BMI1BMI2-NEXT:    movl %edx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
@@ -3569,7 +3531,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
@@ -3591,7 +3553,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
@@ -3606,22 +3568,22 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %edi
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
-; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, %esi, %eax
 ; X86-BMI1BMI2-NEXT:    addl $4, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
-; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c4_commutative:
@@ -3633,7 +3595,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebx
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebp
@@ -3655,7 +3617,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    negb %dl
 ; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
@@ -3675,8 +3637,8 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1BMI2-NEXT:    pushq %rax
 ; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
 ; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %ebp
-; X64-BMI1BMI2-NEXT:    movl %edx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
@@ -3705,7 +3667,7 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI-NEXT:    movl %ebx, %ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
@@ -3732,7 +3694,7 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
@@ -3754,16 +3716,16 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    subl $16, %esp
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1BMI2-NEXT:    shrxl %edi, {{[0-9]+}}(%esp), %ebx
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    shrxl %edi, {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
-; X86-BMI1BMI2-NEXT:    bzhil %esi, %ebx, %esi
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    movl %edi, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
@@ -3782,7 +3744,7 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-NOBMI-NEXT:    movl %edi, %ebp
 ; X64-NOBMI-NEXT:    movl %r14d, %ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebp
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl $-1, %ebx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebx
@@ -3806,7 +3768,7 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-NEXT:    movl %edi, %ebp
 ; X64-BMI1NOTBM-NEXT:    movl %r14d, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
-; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    negb %dl
 ; X64-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
@@ -3829,8 +3791,8 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
 ; X64-BMI1BMI2-NEXT:    movl %esi, %ebp
 ; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %r14d
-; X64-BMI1BMI2-NEXT:    movl %edx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
@@ -3873,8 +3835,8 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB32_2:
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
@@ -3920,8 +3882,8 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB32_2:
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
@@ -3966,8 +3928,8 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB32_2:
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
@@ -4002,7 +3964,7 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %r14
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
@@ -4024,7 +3986,7 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
-; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    negb %dl
 ; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
@@ -4045,7 +4007,7 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
 ; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r14
 ; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
 ; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
@@ -4295,8 +4257,8 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB34_2:
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
@@ -4343,8 +4305,8 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB34_2:
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
@@ -4390,8 +4352,8 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB34_2:
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
@@ -4426,7 +4388,7 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-NOBMI-NEXT:    movq (%rdi), %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %r14
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
@@ -4448,7 +4410,7 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
-; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    negb %dl
 ; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
@@ -4469,7 +4431,7 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
 ; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %r14
 ; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
 ; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
@@ -4723,8 +4685,8 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB36_2:
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
@@ -4770,8 +4732,8 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB36_2:
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
@@ -4816,8 +4778,8 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB36_2:
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
@@ -4852,7 +4814,7 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %r14
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
@@ -4874,7 +4836,7 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
-; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    negb %dl
 ; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
@@ -4895,7 +4857,7 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
 ; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r14
 ; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
 ; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
@@ -4932,8 +4894,8 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB37_2:
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
@@ -4984,8 +4946,8 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB37_2:
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebp
@@ -5035,8 +4997,8 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB37_2:
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebp, %ebp
@@ -5076,7 +5038,7 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-NOBMI-NEXT:    movq %rdi, %r15
 ; X64-NOBMI-NEXT:    movl %r14d, %ecx
 ; X64-NOBMI-NEXT:    shrq %cl, %r15
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
@@ -5100,7 +5062,7 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-NEXT:    movq %rdi, %r15
 ; X64-BMI1NOTBM-NEXT:    movl %r14d, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %r15
-; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    negb %dl
 ; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
@@ -5124,7 +5086,7 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1BMI2-NEXT:    movq %rsi, %r14
 ; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r15
 ; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
 ; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
@@ -5156,7 +5118,7 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -5164,17 +5126,16 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    orl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
@@ -5186,7 +5147,7 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT:    movl %edi, %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
@@ -5194,11 +5155,10 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d0:
@@ -5229,11 +5189,10 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    orl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -5258,11 +5217,10 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -5287,7 +5245,7 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    movl (%eax), %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -5297,16 +5255,15 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d2_load:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-BMI1BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
@@ -5319,7 +5276,7 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-NOBMI-NEXT:    movl (%rdi), %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
@@ -5327,12 +5284,10 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -5364,13 +5319,12 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -5396,12 +5350,10 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -5429,7 +5381,7 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI-NEXT:    movl %eax, %ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
@@ -5444,13 +5396,13 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
-; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    movzbl %al, %edx
+; X86-BMI1NOTBM-NEXT:    orl %ecx, %edx
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -5461,7 +5413,7 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %edx
 ; X86-BMI1BMI2-NEXT:    bzhil %eax, %edx, %esi
@@ -5478,7 +5430,7 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebx
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shll %cl, %ebx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebx
@@ -5491,10 +5443,10 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orl %edx, %eax
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -5540,8 +5492,8 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:  .LBB43_2:
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
@@ -5588,8 +5540,8 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB43_2:
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
@@ -5634,8 +5586,8 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:  .LBB43_2:
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
@@ -5665,7 +5617,7 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT:    movq %rdi, %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
@@ -5673,11 +5625,10 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d0:
@@ -5845,11 +5796,10 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X64-BMI1NOTBM-LABEL: bextr64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d1_indexzext:
@@ -5888,8 +5838,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:  .LBB45_2:
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
@@ -5937,8 +5887,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB45_2:
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
@@ -5984,8 +5934,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:  .LBB45_2:
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
@@ -6015,7 +5965,7 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-NOBMI-NEXT:    movq (%rdi), %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
@@ -6023,12 +5973,10 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d2_load:
@@ -6200,12 +6148,10 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X64-BMI1NOTBM-LABEL: bextr64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d3_load_indexzext:
@@ -6247,8 +6193,8 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    movl %esi, %ebx
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB47_2:
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
@@ -6308,8 +6254,8 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:  .LBB47_2:
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
@@ -6366,8 +6312,8 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    movl %edx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB47_2:
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
@@ -6406,7 +6352,7 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-NOBMI-NEXT:    movq %rdi, %rbx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
-; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    negb %dl
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shlq %cl, %rbx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
@@ -6419,10 +6365,10 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
 ; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    movzbl %sil, %eax
+; X64-BMI1NOTBM-NEXT:    orq %rdx, %rax
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
diff --git a/test/CodeGen/X86/extract-fp.ll b/test/CodeGen/X86/extract-fp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f72764c044c10b3ceb3cb4870290b536b69e71ea
--- /dev/null
+++ b/test/CodeGen/X86/extract-fp.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2  | FileCheck %s
+
+define float @ext_fadd_v4f32(<4 x float> %x) {
+; CHECK-LABEL: ext_fadd_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT:    retq
+  %bo = fadd <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 42.0>
+  %ext = extractelement <4 x float> %bo, i32 2
+  ret float %ext
+}
+
+define float @ext_fsub_v4f32(<4 x float> %x) {
+; CHECK-LABEL: ext_fsub_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = <u,2.0E+0,u,u>
+; CHECK-NEXT:    subps %xmm0, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %bo = fsub <4 x float> <float 1.0, float 2.0, float 3.0, float 42.0>, %x
+  %ext = extractelement <4 x float> %bo, i32 1
+  ret float %ext
+}
+
+define float @ext_fmul_v4f32(<4 x float> %x) {
+; CHECK-LABEL: ext_fmul_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-NEXT:    retq
+  %bo = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 42.0>
+  %ext = extractelement <4 x float> %bo, i32 3
+  ret float %ext
+}
+
+define float @ext_fdiv_v4f32(<4 x float> %x) {
+; CHECK-LABEL: ext_fdiv_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %bo = fdiv <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 42.0>
+  %ext = extractelement <4 x float> %bo, i32 0
+  ret float %ext
+}
+
+define float @ext_fdiv_v4f32_constant_op0(<4 x float> %x) {
+; CHECK-LABEL: ext_fdiv_v4f32_constant_op0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = <u,2.0E+0,u,u>
+; CHECK-NEXT:    divps %xmm0, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %bo = fdiv <4 x float> <float 1.0, float 2.0, float 3.0, float 42.0>, %x
+  %ext = extractelement <4 x float> %bo, i32 1
+  ret float %ext
+}
+
+define float @ext_frem_v4f32(<4 x float> %x) {
+; CHECK-LABEL: ext_frem_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    jmp fmodf # TAILCALL
+  %bo = frem <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 42.0>
+  %ext = extractelement <4 x float> %bo, i32 2
+  ret float %ext
+}
+
+define float @ext_frem_v4f32_constant_op0(<4 x float> %x) {
+; CHECK-LABEL: ext_frem_v4f32_constant_op0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    jmp fmodf # TAILCALL
+  %bo = frem <4 x float> <float 1.0, float 2.0, float 3.0, float 42.0>, %x
+  %ext = extractelement <4 x float> %bo, i32 1
+  ret float %ext
+}
+
diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll
index 8d18f29d332ece2083506d2f1fd482f7d1b4edf0..ac85edd16f2ac306de2cff31a25f3cf75f85e05d 100644
--- a/test/CodeGen/X86/extract-lowbits.ll
+++ b/test/CodeGen/X86/extract-lowbits.ll
@@ -1436,7 +1436,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
@@ -1453,7 +1453,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
@@ -1467,18 +1467,18 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
-; X86-BMI1BMI2-NEXT:    bzhil %esi, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    addl $8, %esp
-; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c0:
@@ -1488,7 +1488,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
 ; X64-NOBMI-NEXT:    movl %edi, %ebx
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebp
@@ -1508,7 +1508,7 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    negb %cl
 ; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
@@ -1528,8 +1528,8 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT:    pushq %rax
 ; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
 ; X64-BMI1BMI2-NEXT:    movl %edi, %ebp
-; X64-BMI1BMI2-NEXT:    movl %esi, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
@@ -1668,7 +1668,7 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %edx
@@ -1687,7 +1687,7 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
@@ -1705,9 +1705,10 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %esi
-; X86-BMI1BMI2-NEXT:    negl %ecx
+; X86-BMI1BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-BMI1BMI2-NEXT:    negb %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
@@ -1721,7 +1722,7 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
@@ -1737,7 +1738,7 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    negb %cl
 ; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
@@ -1753,7 +1754,8 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X64-BMI1BMI2:       # %bb.0:
 ; X64-BMI1BMI2-NEXT:    pushq %rbx
 ; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %ebx
-; X64-BMI1BMI2-NEXT:    negl %esi
+; X64-BMI1BMI2-NEXT:    # kill: def $sil killed $sil killed $esi def $esi
+; X64-BMI1BMI2-NEXT:    negb %sil
 ; X64-BMI1BMI2-NEXT:    movl $-1, %eax
 ; X64-BMI1BMI2-NEXT:    shrxl %esi, %eax, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
@@ -1884,7 +1886,7 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
@@ -1901,7 +1903,7 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
@@ -1915,18 +1917,18 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
-; X86-BMI1BMI2-NEXT:    bzhil %esi, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    addl $8, %esp
-; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c4_commutative:
@@ -1936,7 +1938,7 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
 ; X64-NOBMI-NEXT:    movl %edi, %ebx
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebp
@@ -1956,7 +1958,7 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    negb %cl
 ; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
@@ -1976,8 +1978,8 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT:    pushq %rax
 ; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
 ; X64-BMI1BMI2-NEXT:    movl %edi, %ebp
-; X64-BMI1BMI2-NEXT:    movl %esi, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
 ; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
@@ -2003,8 +2005,8 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
@@ -2034,8 +2036,8 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    pushl %eax
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
@@ -2065,8 +2067,8 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
@@ -2097,7 +2099,7 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movq %rdi, %r14
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
@@ -2117,7 +2119,7 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
-; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    negb %cl
 ; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
@@ -2138,7 +2140,7 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT:    movq %rsi, %rbx
 ; X64-BMI1BMI2-NEXT:    movq %rdi, %r14
 ; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
 ; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
@@ -2318,26 +2320,26 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB27_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB27_2:
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    andl %edx, %esi
-; X86-NOBMI-NEXT:    movl 4(%eax), %edi
+; X86-NOBMI-NEXT:    movl (%edx), %esi
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    movl 4(%edx), %edi
 ; X86-NOBMI-NEXT:    andl %ebx, %edi
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebx
-; X86-NOBMI-NEXT:    pushl %edx
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    calll use64
 ; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    movl %esi, %eax
@@ -2352,26 +2354,26 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB27_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %ebx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB27_2:
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
-; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
-; X86-BMI1NOTBM-NEXT:    movl 4(%eax), %edi
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %esi
+; X86-BMI1NOTBM-NEXT:    andl %eax, %esi
+; X86-BMI1NOTBM-NEXT:    movl 4(%edx), %edi
 ; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
-; X86-BMI1NOTBM-NEXT:    pushl %edx
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    calll use64
 ; X86-BMI1NOTBM-NEXT:    addl $16, %esp
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
@@ -2386,25 +2388,25 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %edx
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %ebx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    movl $-1, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %ebx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB27_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB27_2:
-; X86-BMI1BMI2-NEXT:    movl (%eax), %esi
-; X86-BMI1BMI2-NEXT:    andl %edx, %esi
-; X86-BMI1BMI2-NEXT:    movl 4(%eax), %edi
+; X86-BMI1BMI2-NEXT:    movl (%edx), %esi
+; X86-BMI1BMI2-NEXT:    andl %eax, %esi
+; X86-BMI1BMI2-NEXT:    movl 4(%edx), %edi
 ; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
-; X86-BMI1BMI2-NEXT:    pushl %edx
+; X86-BMI1BMI2-NEXT:    pushl %eax
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
@@ -2418,7 +2420,7 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    movq $-1, %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
@@ -2434,7 +2436,7 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    negb %cl
 ; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
@@ -2450,8 +2452,8 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-BMI1BMI2:       # %bb.0:
 ; X64-BMI1BMI2-NEXT:    pushq %rbx
 ; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rbx
-; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
-; X64-BMI1BMI2-NEXT:    negl %esi
+; X64-BMI1BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI1BMI2-NEXT:    negb %sil
 ; X64-BMI1BMI2-NEXT:    movq $-1, %rax
 ; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rax, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
@@ -2628,8 +2630,8 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
@@ -2659,8 +2661,8 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    pushl %eax
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
@@ -2690,8 +2692,8 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
@@ -2722,7 +2724,7 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movq %rdi, %r14
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
@@ -2742,7 +2744,7 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
-; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    negb %cl
 ; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
@@ -2763,7 +2765,7 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X64-BMI1BMI2-NEXT:    movq %rsi, %rbx
 ; X64-BMI1BMI2-NEXT:    movq %rdi, %r14
 ; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
-; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    negb %al
 ; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
 ; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
@@ -2788,7 +2790,7 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -2796,14 +2798,14 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    shll $8, %eax
 ; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    retl
 ;
@@ -2811,7 +2813,7 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
 ; X64-NOBMI-NEXT:    movl %edi, %eax
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
@@ -2890,7 +2892,7 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -2899,7 +2901,7 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
 ; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
@@ -2907,15 +2909,15 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1BMI2-LABEL: bzhi32_d2_load:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, (%ecx), %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %eax
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_d2_load:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
 ; X64-NOBMI-NEXT:    movl (%rdi), %eax
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
@@ -3003,8 +3005,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl %edx, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %eax
@@ -3042,8 +3044,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edx, %eax
@@ -3080,8 +3082,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
@@ -3110,7 +3112,7 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movq %rdi, %rax
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
@@ -3281,8 +3283,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %edx
 ; X86-NOBMI-NEXT:    movl 4(%eax), %eax
-; X86-NOBMI-NEXT:    movl $64, %ecx
-; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl %edx, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %eax
@@ -3321,8 +3323,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl (%eax), %edx
 ; X86-BMI1NOTBM-NEXT:    movl 4(%eax), %eax
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movb $64, %cl
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edx, %eax
@@ -3360,8 +3362,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl (%eax), %edx
 ; X86-BMI1BMI2-NEXT:    movl 4(%eax), %esi
-; X86-BMI1BMI2-NEXT:    movl $64, %ecx
-; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb $64, %cl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %edx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
@@ -3390,7 +3392,7 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movq (%rdi), %rax
-; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    negb %cl
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index 9eb161f58042dac0a9acd3bfaf0cb4f2b25ef2e2..c4a597b90ed6b299ab256e8d9efb9ef368517a98 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -93,8 +93,7 @@ define i64 @t4(<2 x double>* %a) {
 ;
 ; X64-SSSE3-LABEL: t4:
 ; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSSE3-NEXT:    movq %xmm0, %rax
+; X64-SSSE3-NEXT:    movq (%rdi), %rax
 ; X64-SSSE3-NEXT:    retq
 ;
 ; X64-AVX-LABEL: t4:
diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll
index 534c63f708c09889317ec23cc1eebbe05eaf8d26..b4d73ce9e5abca40cbb3c095106b3d47d1b0a6d3 100644
--- a/test/CodeGen/X86/f16c-schedule.ll
+++ b/test/CodeGen/X86/f16c-schedule.ll
@@ -197,35 +197,35 @@ define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16>
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:1.00]
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; IVY-LABEL: test_vcvtps2ph_256:
 ; IVY:       # %bb.0:
 ; IVY-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00]
 ; IVY-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:1.00]
-; IVY-NEXT:    vzeroupper # sched: [100:0.33]
+; IVY-NEXT:    vzeroupper # sched: [1:1.00]
 ; IVY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_vcvtps2ph_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00]
 ; HASWELL-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vcvtps2ph_256:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00]
 ; BROADWELL-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00]
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vcvtps2ph_256:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [7:1.00]
 ; SKYLAKE-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00]
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BDVER2-LABEL: test_vcvtps2ph_256:
diff --git a/test/CodeGen/X86/fixup-bw-copy.ll b/test/CodeGen/X86/fixup-bw-copy.ll
index 46cb1b19e1521a5f5c05fafba9ee99b43c9c6d24..9e434ef7333bbd4e2796892c1bf903b48d29b0e3 100644
--- a/test/CodeGen/X86/fixup-bw-copy.ll
+++ b/test/CodeGen/X86/fixup-bw-copy.ll
@@ -43,9 +43,10 @@ define i16 @test_movw(i16 %a0) {
 define i8 @test_movb_hreg(i16 %a0) {
 ; X64-LABEL: test_movb_hreg:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shrl $8, %eax
-; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    leal (%rax,%rdi), %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/fma-schedule.ll b/test/CodeGen/X86/fma-schedule.ll
index 6cdc615b231a86567ebb3fae9d9dcc85087c2fc7..ce8e00e5bc00df00e5e08212bbcd7102a218d78f 100644
--- a/test/CodeGen/X86/fma-schedule.ll
+++ b/test/CodeGen/X86/fma-schedule.ll
@@ -123,7 +123,7 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfmaddpd_256:
@@ -149,7 +149,7 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; HASWELL-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfmaddpd_256:
@@ -162,7 +162,7 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BROADWELL-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfmaddpd_256:
@@ -175,7 +175,7 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKYLAKE-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfmaddpd_256:
@@ -200,7 +200,7 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [11:0.50]
 ; SKX-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmaddpd_256:
@@ -330,7 +330,7 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfmaddps_256:
@@ -356,7 +356,7 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; HASWELL-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfmaddps_256:
@@ -369,7 +369,7 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; BROADWELL-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfmaddps_256:
@@ -382,7 +382,7 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; SKYLAKE-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfmaddps_256:
@@ -407,7 +407,7 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; SKX-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [11:0.50]
 ; SKX-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmaddps_256:
@@ -741,7 +741,7 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfmaddsubpd_256:
@@ -767,7 +767,7 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; HASWELL-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfmaddsubpd_256:
@@ -780,7 +780,7 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; BROADWELL-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfmaddsubpd_256:
@@ -793,7 +793,7 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; SKYLAKE-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfmaddsubpd_256:
@@ -818,7 +818,7 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; SKX-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [11:0.50]
 ; SKX-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmaddsubpd_256:
@@ -948,7 +948,7 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfmaddsubps_256:
@@ -974,7 +974,7 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; HASWELL-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfmaddsubps_256:
@@ -987,7 +987,7 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; BROADWELL-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfmaddsubps_256:
@@ -1000,7 +1000,7 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKYLAKE-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfmaddsubps_256:
@@ -1025,7 +1025,7 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [11:0.50]
 ; SKX-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmaddsubps_256:
@@ -1159,7 +1159,7 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfmsubaddpd_256:
@@ -1185,7 +1185,7 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; HASWELL-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfmsubaddpd_256:
@@ -1198,7 +1198,7 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; BROADWELL-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfmsubaddpd_256:
@@ -1211,7 +1211,7 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; SKYLAKE-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfmsubaddpd_256:
@@ -1236,7 +1236,7 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; SKX-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [11:0.50]
 ; SKX-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsubaddpd_256:
@@ -1366,7 +1366,7 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfmsubaddps_256:
@@ -1392,7 +1392,7 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; HASWELL-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfmsubaddps_256:
@@ -1405,7 +1405,7 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; BROADWELL-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfmsubaddps_256:
@@ -1418,7 +1418,7 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKYLAKE-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfmsubaddps_256:
@@ -1443,7 +1443,7 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [11:0.50]
 ; SKX-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsubaddps_256:
@@ -1577,7 +1577,7 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfmsubpd_256:
@@ -1603,7 +1603,7 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; HASWELL-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfmsubpd_256:
@@ -1616,7 +1616,7 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BROADWELL-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfmsubpd_256:
@@ -1629,7 +1629,7 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKYLAKE-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfmsubpd_256:
@@ -1654,7 +1654,7 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [11:0.50]
 ; SKX-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsubpd_256:
@@ -1784,7 +1784,7 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfmsubps_256:
@@ -1810,7 +1810,7 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfmsubps_256:
@@ -1823,7 +1823,7 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; BROADWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfmsubps_256:
@@ -1836,7 +1836,7 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; SKYLAKE-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfmsubps_256:
@@ -1861,7 +1861,7 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; SKX-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [11:0.50]
 ; SKX-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsubps_256:
@@ -2195,7 +2195,7 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfnmaddpd_256:
@@ -2221,7 +2221,7 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; HASWELL-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfnmaddpd_256:
@@ -2234,7 +2234,7 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BROADWELL-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfnmaddpd_256:
@@ -2247,7 +2247,7 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKYLAKE-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfnmaddpd_256:
@@ -2272,7 +2272,7 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
 ; SKX-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmaddpd_256:
@@ -2402,7 +2402,7 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfnmaddps_256:
@@ -2428,7 +2428,7 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfnmaddps_256:
@@ -2441,7 +2441,7 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; BROADWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfnmaddps_256:
@@ -2454,7 +2454,7 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; SKYLAKE-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfnmaddps_256:
@@ -2479,7 +2479,7 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
 ; SKX-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmaddps_256:
@@ -2813,7 +2813,7 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfnmsubpd_256:
@@ -2839,7 +2839,7 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; HASWELL-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfnmsubpd_256:
@@ -2852,7 +2852,7 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BROADWELL-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfnmsubpd_256:
@@ -2865,7 +2865,7 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKYLAKE-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfnmsubpd_256:
@@ -2890,7 +2890,7 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [11:0.50]
 ; SKX-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmsubpd_256:
@@ -3020,7 +3020,7 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:0.50]
 ; GENERIC-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER2-LABEL: test_vfnmsubps_256:
@@ -3046,7 +3046,7 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; HASWELL-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [12:0.50]
 ; HASWELL-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [12:0.50]
 ; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_vfnmsubps_256:
@@ -3059,7 +3059,7 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; BROADWELL-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [11:0.50]
 ; BROADWELL-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [11:0.50]
 ; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT:    vzeroupper # sched: [0:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_vfnmsubps_256:
@@ -3072,7 +3072,7 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; SKYLAKE-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [11:0.50]
 ; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; KNL-LABEL: test_vfnmsubps_256:
@@ -3097,7 +3097,7 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; SKX-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [11:0.50]
 ; SKX-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [11:0.50]
 ; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
+; SKX-NEXT:    vzeroupper # sched: [0:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmsubps_256:
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index c894a9f3d40cc77c5163fce80e5083b245024c86..fa4dd3db1891c68a3d19a64cfa6c94cc30dcbe81 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1071,9 +1071,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
 ; FMACALL32_BDVER2-NEXT:    subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    ## imm = 0x1C0
-; FMACALL32_BDVER2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps 56(%ebp), %xmm4 ## encoding: [0xc5,0xf8,0x28,0x65,0x38]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
@@ -1082,46 +1080,43 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
-; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm4, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x64,0x24,0x08,0x02]
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
 ; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
 ; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
@@ -1134,7 +1129,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
@@ -1147,7 +1142,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
@@ -1160,7 +1155,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
@@ -1173,36 +1168,32 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
 ; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
 ; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
@@ -1210,31 +1201,29 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
 ; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
 ; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
@@ -1247,7 +1236,7 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
@@ -1260,9 +1249,9 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
@@ -1273,9 +1262,9 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xd0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
@@ -1286,93 +1275,92 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
 ; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
 ; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xd0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x50]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48]
 ; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44]
 ; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
-; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c]
 ; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero,zero,zero
-; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10]
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10]
 ; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c]
 ; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10]
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10]
 ; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c]
 ; FMACALL32_BDVER2-NEXT:    ## xmm2 = mem[0],zero,zero,zero
-; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10]
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10]
 ; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20]
 ; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20]
 ; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x44,0x20]
 ; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30]
 ; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
-; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30]
 ; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
-; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30]
 ; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0,1,2],mem[0]
 ; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c]
 ; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10]
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x58,0x10]
 ; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x54,0x20]
 ; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x50,0x30]
 ; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
 ; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
 ; FMACALL32_BDVER2-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
@@ -1844,11 +1832,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMACALL32_BDVER2-NEXT:    pushl %ebp ## encoding: [0x55]
 ; FMACALL32_BDVER2-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
 ; FMACALL32_BDVER2-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
-; FMACALL32_BDVER2-NEXT:    subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    ## imm = 0x180
+; FMACALL32_BDVER2-NEXT:    subl $352, %esp ## encoding: [0x81,0xec,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x160
 ; FMACALL32_BDVER2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
@@ -1857,21 +1845,18 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMACALL32_BDVER2-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x40,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x30,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
 ; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x40,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2]
 ; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],xmm2[0]
 ; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovapd 40(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
@@ -1884,7 +1869,7 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
@@ -1897,12 +1882,9 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovapd 24(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32_BDVER2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30]
 ; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
@@ -1910,30 +1892,29 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x20]
 ; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
 ; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x30]
 ; FMACALL32_BDVER2-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1]
 ; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],xmm1[1]
 ; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
 ; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
 ; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x44,0x24,0x20]
 ; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
 ; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovapd 8(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x30]
 ; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
@@ -1946,7 +1927,7 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x20]
 ; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
@@ -1959,13 +1940,12 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovapd 56(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x38]
 ; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
 ; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x48,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
 ; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60]
diff --git a/test/CodeGen/X86/fma4-schedule.ll b/test/CodeGen/X86/fma4-schedule.ll
index c8b5debd3fbb61c54c97165a21a43c2471607939..118ad2b10be09413ec7b08e43252fb6795b79d7f 100644
--- a/test/CodeGen/X86/fma4-schedule.ll
+++ b/test/CodeGen/X86/fma4-schedule.ll
@@ -48,7 +48,7 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfmaddpd_256:
@@ -113,7 +113,7 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfmaddps_256:
@@ -244,7 +244,7 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfmaddsubpd_256:
@@ -309,7 +309,7 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfmaddsubps_256:
@@ -378,7 +378,7 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfmsubaddpd_256:
@@ -443,7 +443,7 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfmsubaddps_256:
@@ -512,7 +512,7 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfmsubpd_256:
@@ -577,7 +577,7 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfmsubps_256:
@@ -708,7 +708,7 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfnmaddpd_256:
@@ -773,7 +773,7 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfnmaddps_256:
@@ -904,7 +904,7 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfnmsubpd_256:
@@ -969,7 +969,7 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfnmsubps_256:
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index 5ae46e2c2e2a27a2cfb09e08b409421c9e821867..263ad72ebc14c8bd497078d12cbd347397e7b5c4 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-- | FileCheck %s
 	%struct._obstack_chunk = type { i8*, %struct._obstack_chunk*, [4 x i8] }
 	%struct.obstack = type { i32, %struct._obstack_chunk*, i8*, i8*, i8*, i32, i32, %struct._obstack_chunk* (...)*, void (...)*, i8*, i8 }
@@ -5,6 +6,17 @@
 
 ; This should just not crash.
 define void @test1() nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testb $1, stmt_obstack+40
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %cond_false30.i
+; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    calll 0
+; CHECK-NEXT:    addl $4, %esp
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB0_1: # %cond_true23.i
+; CHECK-NEXT:    retl
 entry:
 	br i1 true, label %cond_true, label %cond_next
 
@@ -30,6 +42,14 @@ cond_next:		; preds = %entry
 
 
 define i32 @test2(i16* %P, i16* %Q) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzwl (%eax), %edx
+; CHECK-NEXT:    movzbl %dl, %eax
+; CHECK-NEXT:    movw %dx, (%ecx)
+; CHECK-NEXT:    retl
   %A = load i16, i16* %P, align 4                      ; <i16> [#uses=11]
   %C = zext i16 %A to i32                         ; <i32> [#uses=1]
   %D = and i32 %C, 255                            ; <i32> [#uses=1]
@@ -39,9 +59,6 @@ L:
   store i16 %A, i16* %Q
   ret i32 %D
 
-; CHECK-LABEL: test2:
-; CHECK: 	movl	4(%esp), %eax
-; CHECK-NEXT:	movzwl	(%eax), %e{{..}}
 
 }
 
@@ -49,10 +66,22 @@ L:
 ; xor in exit block will be CSE'ed and load will be folded to xor in entry.
 define i1 @test3(i32* %P, i32* %Q) nounwind {
 ; CHECK-LABEL: test3:
-; CHECK: movl 8(%esp), %e
-; CHECK: movl 4(%esp), %e
-; CHECK: xorl (%e
-; CHECK: j
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    xorl (%ecx), %eax
+; CHECK-NEXT:    testl $89947, %eax # imm = 0x15F5B
+; CHECK-NEXT:    je .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB2_2: # %exit
+; CHECK-NEXT:    testl $-838178173, %eax # imm = 0xCE0A6A83
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retl
 entry:
   %0 = load i32, i32* %P, align 4
   %1 = load i32, i32* %Q, align 4
diff --git a/test/CodeGen/X86/fold-vector-sext-zext.ll b/test/CodeGen/X86/fold-vector-sext-zext.ll
index 16274a0d8191eaa6c50a2cde50cd19b8851a875c..a8e78cc5565fe91aea42d879234ee5bb233fe3ce 100644
--- a/test/CodeGen/X86/fold-vector-sext-zext.ll
+++ b/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -119,12 +119,12 @@ define <4 x i64> @test_sext_4i8_4i64_undef() {
 define <8 x i16> @test_sext_8i8_8i16() {
 ; X32-LABEL: test_sext_8i8_8i16:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [0,65535,2,65533,4,65531,6,65529]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_sext_8i8_8i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [0,65535,2,65533,4,65531,6,65529]
 ; X64-NEXT:    retq
   %1 = insertelement <8 x i8> undef, i8 0, i32 0
   %2 = insertelement <8 x i8> %1, i8 -1, i32 1
@@ -134,19 +134,19 @@ define <8 x i16> @test_sext_8i8_8i16() {
   %6 = insertelement <8 x i8> %5, i8 -5, i32 5
   %7 = insertelement <8 x i8> %6, i8 6, i32 6
   %8 = insertelement <8 x i8> %7, i8 -7, i32 7
-  %9 = sext <8 x i8> %4 to <8 x i16>
+  %9 = sext <8 x i8> %8 to <8 x i16>
   ret <8 x i16> %9
 }
 
 define <8 x i32> @test_sext_8i8_8i32() {
 ; X32-LABEL: test_sext_8i8_8i32:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = <0,4294967295,2,4294967293,u,u,u,u>
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,2,4294967293,4,4294967291,6,4294967289]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_sext_8i8_8i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = <0,4294967295,2,4294967293,u,u,u,u>
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,2,4294967293,4,4294967291,6,4294967289]
 ; X64-NEXT:    retq
   %1 = insertelement <8 x i8> undef, i8 0, i32 0
   %2 = insertelement <8 x i8> %1, i8 -1, i32 1
@@ -156,19 +156,19 @@ define <8 x i32> @test_sext_8i8_8i32() {
   %6 = insertelement <8 x i8> %5, i8 -5, i32 5
   %7 = insertelement <8 x i8> %6, i8 6, i32 6
   %8 = insertelement <8 x i8> %7, i8 -7, i32 7
-  %9 = sext <8 x i8> %4 to <8 x i32>
+  %9 = sext <8 x i8> %8 to <8 x i32>
   ret <8 x i32> %9
 }
 
 define <8 x i16> @test_sext_8i8_8i16_undef() {
 ; X32-LABEL: test_sext_8i8_8i16_undef:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,u,u,u>
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,65531,u,65529>
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_sext_8i8_8i16_undef:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,u,u,u>
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,65531,u,65529>
 ; X64-NEXT:    retq
   %1 = insertelement <8 x i8> undef, i8 undef, i32 0
   %2 = insertelement <8 x i8> %1, i8 -1, i32 1
@@ -178,19 +178,19 @@ define <8 x i16> @test_sext_8i8_8i16_undef() {
   %6 = insertelement <8 x i8> %5, i8 -5, i32 5
   %7 = insertelement <8 x i8> %6, i8 undef, i32 6
   %8 = insertelement <8 x i8> %7, i8 -7, i32 7
-  %9 = sext <8 x i8> %4 to <8 x i16>
+  %9 = sext <8 x i8> %8 to <8 x i16>
   ret <8 x i16> %9
 }
 
 define <8 x i32> @test_sext_8i8_8i32_undef() {
 ; X32-LABEL: test_sext_8i8_8i32_undef:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = <0,u,2,u,u,u,u,u>
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = <0,u,2,u,4,u,6,u>
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_sext_8i8_8i32_undef:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = <0,u,2,u,u,u,u,u>
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = <0,u,2,u,4,u,6,u>
 ; X64-NEXT:    retq
   %1 = insertelement <8 x i8> undef, i8 0, i32 0
   %2 = insertelement <8 x i8> %1, i8 undef, i32 1
@@ -200,7 +200,7 @@ define <8 x i32> @test_sext_8i8_8i32_undef() {
   %6 = insertelement <8 x i8> %5, i8 undef, i32 5
   %7 = insertelement <8 x i8> %6, i8 6, i32 6
   %8 = insertelement <8 x i8> %7, i8 undef, i32 7
-  %9 = sext <8 x i8> %4 to <8 x i32>
+  %9 = sext <8 x i8> %8 to <8 x i32>
   ret <8 x i32> %9
 }
 
diff --git a/test/CodeGen/X86/fp-cvt.ll b/test/CodeGen/X86/fp-cvt.ll
index 7a4d303163a699b778694701da7aae2ff9e27513..ab3d40ddcaa527bb038672b22918f396d5a8cbae 100644
--- a/test/CodeGen/X86/fp-cvt.ll
+++ b/test/CodeGen/X86/fp-cvt.ll
@@ -483,29 +483,20 @@ define i64 @fptoui_i64_fp80(x86_fp80 %a0) nounwind {
 ; X64-X87-NEXT:    flds {{.*}}(%rip)
 ; X64-X87-NEXT:    fld %st(1)
 ; X64-X87-NEXT:    fsub %st(1)
+; X64-X87-NEXT:    xorl %eax, %eax
+; X64-X87-NEXT:    fxch %st(1)
+; X64-X87-NEXT:    fucompi %st(2)
+; X64-X87-NEXT:    fcmovnbe %st(1), %st(0)
+; X64-X87-NEXT:    fstp %st(1)
 ; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-X87-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
-; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fistpll -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
 ; X64-X87-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
 ; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fld %st(1)
+; X64-X87-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
 ; X64-X87-NEXT:    fistpll -{{[0-9]+}}(%rsp)
 ; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fucompi %st(1)
-; X64-X87-NEXT:    fstp %st(0)
-; X64-X87-NEXT:    jbe .LBB10_1
-; X64-X87-NEXT:  # %bb.2:
-; X64-X87-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; X64-X87-NEXT:    retq
-; X64-X87-NEXT:  .LBB10_1:
-; X64-X87-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-X87-NEXT:    setbe %al
+; X64-X87-NEXT:    shlq $63, %rax
 ; X64-X87-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; X64-X87-NEXT:    retq
 ;
@@ -515,17 +506,14 @@ define i64 @fptoui_i64_fp80(x86_fp80 %a0) nounwind {
 ; X64-SSSE3-NEXT:    flds {{.*}}(%rip)
 ; X64-SSSE3-NEXT:    fld %st(1)
 ; X64-SSSE3-NEXT:    fsub %st(1)
+; X64-SSSE3-NEXT:    xorl %eax, %eax
+; X64-SSSE3-NEXT:    fxch %st(1)
+; X64-SSSE3-NEXT:    fucompi %st(2)
+; X64-SSSE3-NEXT:    fcmovnbe %st(1), %st(0)
+; X64-SSSE3-NEXT:    fstp %st(1)
 ; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    fld %st(1)
-; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    fucompi %st(1)
-; X64-SSSE3-NEXT:    fstp %st(0)
-; X64-SSSE3-NEXT:    jbe .LBB10_1
-; X64-SSSE3-NEXT:  # %bb.2:
-; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; X64-SSSE3-NEXT:    retq
-; X64-SSSE3-NEXT:  .LBB10_1:
-; X64-SSSE3-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-SSSE3-NEXT:    setbe %al
+; X64-SSSE3-NEXT:    shlq $63, %rax
 ; X64-SSSE3-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; X64-SSSE3-NEXT:    retq
   %1 = fptoui x86_fp80 %a0 to i64
@@ -577,29 +565,20 @@ define i64 @fptoui_i64_fp80_ld(x86_fp80 *%a0) nounwind {
 ; X64-X87-NEXT:    flds {{.*}}(%rip)
 ; X64-X87-NEXT:    fld %st(1)
 ; X64-X87-NEXT:    fsub %st(1)
+; X64-X87-NEXT:    xorl %eax, %eax
+; X64-X87-NEXT:    fxch %st(1)
+; X64-X87-NEXT:    fucompi %st(2)
+; X64-X87-NEXT:    fcmovnbe %st(1), %st(0)
+; X64-X87-NEXT:    fstp %st(1)
 ; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-X87-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
-; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fistpll -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
 ; X64-X87-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
 ; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fld %st(1)
+; X64-X87-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
 ; X64-X87-NEXT:    fistpll -{{[0-9]+}}(%rsp)
 ; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fucompi %st(1)
-; X64-X87-NEXT:    fstp %st(0)
-; X64-X87-NEXT:    jbe .LBB11_1
-; X64-X87-NEXT:  # %bb.2:
-; X64-X87-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; X64-X87-NEXT:    retq
-; X64-X87-NEXT:  .LBB11_1:
-; X64-X87-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-X87-NEXT:    setbe %al
+; X64-X87-NEXT:    shlq $63, %rax
 ; X64-X87-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; X64-X87-NEXT:    retq
 ;
@@ -609,17 +588,14 @@ define i64 @fptoui_i64_fp80_ld(x86_fp80 *%a0) nounwind {
 ; X64-SSSE3-NEXT:    flds {{.*}}(%rip)
 ; X64-SSSE3-NEXT:    fld %st(1)
 ; X64-SSSE3-NEXT:    fsub %st(1)
+; X64-SSSE3-NEXT:    xorl %eax, %eax
+; X64-SSSE3-NEXT:    fxch %st(1)
+; X64-SSSE3-NEXT:    fucompi %st(2)
+; X64-SSSE3-NEXT:    fcmovnbe %st(1), %st(0)
+; X64-SSSE3-NEXT:    fstp %st(1)
 ; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    fld %st(1)
-; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    fucompi %st(1)
-; X64-SSSE3-NEXT:    fstp %st(0)
-; X64-SSSE3-NEXT:    jbe .LBB11_1
-; X64-SSSE3-NEXT:  # %bb.2:
-; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; X64-SSSE3-NEXT:    retq
-; X64-SSSE3-NEXT:  .LBB11_1:
-; X64-SSSE3-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-SSSE3-NEXT:    setbe %al
+; X64-SSSE3-NEXT:    shlq $63, %rax
 ; X64-SSSE3-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; X64-SSSE3-NEXT:    retq
   %1 = load x86_fp80, x86_fp80 *%a0
diff --git a/test/CodeGen/X86/fp-undef.ll b/test/CodeGen/X86/fp-undef.ll
index 00dca39d7c1fa36149121971c2c9ec41159a0ddb..d46bea703fdf0bfe2b1960ad08043f04f8161fe9 100644
--- a/test/CodeGen/X86/fp-undef.ll
+++ b/test/CodeGen/X86/fp-undef.ll
@@ -192,7 +192,6 @@ define float @frem_undef_op1_fast(float %x) {
 define double @fadd_undef_undef(double %x) {
 ; ANY-LABEL: fadd_undef_undef:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; ANY-NEXT:    retq
   %r = fadd double undef, undef
   ret double %r
@@ -201,7 +200,6 @@ define double @fadd_undef_undef(double %x) {
 define double @fsub_undef_undef(double %x) {
 ; ANY-LABEL: fsub_undef_undef:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; ANY-NEXT:    retq
   %r = fsub double undef, undef
   ret double %r
@@ -210,7 +208,6 @@ define double @fsub_undef_undef(double %x) {
 define double @fmul_undef_undef(double %x) {
 ; ANY-LABEL: fmul_undef_undef:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; ANY-NEXT:    retq
   %r = fmul double undef, undef
   ret double %r
@@ -219,7 +216,6 @@ define double @fmul_undef_undef(double %x) {
 define double @fdiv_undef_undef(double %x) {
 ; ANY-LABEL: fdiv_undef_undef:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; ANY-NEXT:    retq
   %r = fdiv double undef, undef
   ret double %r
@@ -228,7 +224,6 @@ define double @fdiv_undef_undef(double %x) {
 define double @frem_undef_undef(double %x) {
 ; ANY-LABEL: frem_undef_undef:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; ANY-NEXT:    retq
   %r = frem double undef, undef
   ret double %r
@@ -510,3 +505,93 @@ define double @frem_undef_op1_fast_constant_inf(double %x) {
   ret double %r
 }
 
+define <2 x double> @fadd_undef_op1_constant_vec(<2 x double> %x) {
+; ANY-LABEL: fadd_undef_op1_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT:    retq
+  %r = fadd <2 x double> <double 42.0, double undef>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @fadd_undef_op0_constant_vec(<2 x double> %x) {
+; ANY-LABEL: fadd_undef_op0_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movaps {{.*#+}} xmm0 = <u,NaN>
+; ANY-NEXT:    retq
+  %r = fadd <2 x double> undef, <double undef, double 42.0>
+  ret <2 x double> %r
+}
+
+define <2 x double> @fsub_undef_op1_constant_vec(<2 x double> %x) {
+; ANY-LABEL: fsub_undef_op1_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movaps {{.*#+}} xmm0 = <u,NaN>
+; ANY-NEXT:    retq
+  %r = fsub <2 x double> <double undef, double 42.0>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @fsub_undef_op0_constant_vec(<2 x double> %x) {
+; ANY-LABEL: fsub_undef_op0_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT:    retq
+  %r = fsub <2 x double> undef, <double 42.0, double undef>
+  ret <2 x double> %r
+}
+
+define <2 x double> @fmul_undef_op1_constant_vec(<2 x double> %x) {
+; ANY-LABEL: fmul_undef_op1_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT:    retq
+  %r = fmul <2 x double> <double 42.0, double undef>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @fmul_undef_op0_constant_vec(<2 x double> %x) {
+; ANY-LABEL: fmul_undef_op0_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movaps {{.*#+}} xmm0 = <u,NaN>
+; ANY-NEXT:    retq
+  %r = fmul <2 x double> undef, <double undef, double 42.0>
+  ret <2 x double> %r
+}
+
+define <2 x double> @fdiv_undef_op1_constant_vec(<2 x double> %x) {
+; ANY-LABEL: fdiv_undef_op1_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT:    retq
+  %r = fdiv <2 x double> <double 42.0, double undef>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @fdiv_undef_op0_constant_vec(<2 x double> %x) {
+; ANY-LABEL: fdiv_undef_op0_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movaps {{.*#+}} xmm0 = <u,NaN>
+; ANY-NEXT:    retq
+  %r = fdiv <2 x double> undef, <double undef, double 42.0>
+  ret <2 x double> %r
+}
+
+define <2 x double> @frem_undef_op1_constant_vec(<2 x double> %x) {
+; ANY-LABEL: frem_undef_op1_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movaps {{.*#+}} xmm0 = <u,NaN>
+; ANY-NEXT:    retq
+  %r = frem <2 x double> <double undef, double 42.0>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @frem_undef_op0_constant_vec(<2 x double> %x) {
+; ANY-LABEL: frem_undef_op0_constant_vec:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT:    retq
+  %r = frem <2 x double> undef, <double 42.0, double undef>
+  ret <2 x double> %r
+}
+
diff --git a/test/CodeGen/X86/fp128-cast.ll b/test/CodeGen/X86/fp128-cast.ll
index 0e76da74e09658f6acb6fd8de4a2fc2b239ce009..71b9c3f7f7bbd9e7ea4203f4234a756cff8f16f6 100644
--- a/test/CodeGen/X86/fp128-cast.ll
+++ b/test/CodeGen/X86/fp128-cast.ll
@@ -845,7 +845,6 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind {
 ; X64-NEXT:    pushq %rax
 ; X64-NEXT:    callq __trunctfdf2
 ; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
 ; X64-NEXT:    andps {{.*}}(%rip), %xmm0
 ; X64-NEXT:    orps %xmm1, %xmm0
 ; X64-NEXT:    callq __extenddftf2
@@ -867,7 +866,6 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind {
 ; X64_NO_MMX-NEXT:    callq __trunctfdf2
 ; X64_NO_MMX-NEXT:    andps {{.*}}(%rip), %xmm0
 ; X64_NO_MMX-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64_NO_MMX-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
 ; X64_NO_MMX-NEXT:    orps %xmm1, %xmm0
 ; X64_NO_MMX-NEXT:    callq __extenddftf2
 ; X64_NO_MMX-NEXT:    addq $8, %rsp
diff --git a/test/CodeGen/X86/fshl.ll b/test/CodeGen/X86/fshl.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ccf451e045147eaf38b70e9f070f90cf7232116c
--- /dev/null
+++ b/test/CodeGen/X86/fshl.ll
@@ -0,0 +1,506 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+slow-shld | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+slow-shld | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW
+
+declare i8 @llvm.fshl.i8(i8, i8, i8) nounwind readnone
+declare i16 @llvm.fshl.i16(i16, i16, i16) nounwind readnone
+declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
+declare i64 @llvm.fshl.i64(i64, i64, i64) nounwind readnone
+
+;
+; Variable Funnel Shift
+;
+
+define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
+; X86-LABEL: var_shift_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    andb $7, %dl
+; X86-NEXT:    movb %al, %ch
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movb $8, %cl
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    shrb %cl, %ah
+; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    je .LBB0_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    orb %ah, %ch
+; X86-NEXT:    movb %ch, %al
+; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    retl
+;
+; X64-LABEL: var_shift_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $7, %dl
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    shlb %cl, %dil
+; X64-NEXT:    movb $8, %cl
+; X64-NEXT:    subb %dl, %cl
+; X64-NEXT:    shrb %cl, %sil
+; X64-NEXT:    testb %dl, %dl
+; X64-NEXT:    je .LBB0_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    orb %sil, %dil
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:  .LBB0_2:
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %tmp = tail call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %z)
+  ret i8 %tmp
+}
+
+define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
+; X86-FAST-LABEL: var_shift_i16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-FAST-NEXT:    andb $15, %cl
+; X86-FAST-NEXT:    shldw %cl, %dx, %ax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: var_shift_i16:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %edi
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-SLOW-NEXT:    andb $15, %dl
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    movl %edx, %ecx
+; X86-SLOW-NEXT:    shll %cl, %edi
+; X86-SLOW-NEXT:    movb $16, %cl
+; X86-SLOW-NEXT:    subb %dl, %cl
+; X86-SLOW-NEXT:    shrl %cl, %esi
+; X86-SLOW-NEXT:    testb %dl, %dl
+; X86-SLOW-NEXT:    je .LBB1_2
+; X86-SLOW-NEXT:  # %bb.1:
+; X86-SLOW-NEXT:    orl %esi, %edi
+; X86-SLOW-NEXT:    movl %edi, %eax
+; X86-SLOW-NEXT:  .LBB1_2:
+; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    popl %edi
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: var_shift_i16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movl %edx, %ecx
+; X64-FAST-NEXT:    movl %edi, %eax
+; X64-FAST-NEXT:    andb $15, %cl
+; X64-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-FAST-NEXT:    shldw %cl, %si, %ax
+; X64-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: var_shift_i16:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movzwl %si, %eax
+; X64-SLOW-NEXT:    andb $15, %dl
+; X64-SLOW-NEXT:    movl %edi, %esi
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    shll %cl, %esi
+; X64-SLOW-NEXT:    movb $16, %cl
+; X64-SLOW-NEXT:    subb %dl, %cl
+; X64-SLOW-NEXT:    shrl %cl, %eax
+; X64-SLOW-NEXT:    orl %esi, %eax
+; X64-SLOW-NEXT:    testb %dl, %dl
+; X64-SLOW-NEXT:    cmovel %edi, %eax
+; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
+  ret i16 %tmp
+}
+
+define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
+; X86-FAST-LABEL: var_shift_i32:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    shldl %cl, %edx, %eax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: var_shift_i32:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %edi
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    movl %edx, %ecx
+; X86-SLOW-NEXT:    shll %cl, %edi
+; X86-SLOW-NEXT:    andb $31, %dl
+; X86-SLOW-NEXT:    movl %edx, %ecx
+; X86-SLOW-NEXT:    negb %cl
+; X86-SLOW-NEXT:    shrl %cl, %esi
+; X86-SLOW-NEXT:    testb %dl, %dl
+; X86-SLOW-NEXT:    je .LBB2_2
+; X86-SLOW-NEXT:  # %bb.1:
+; X86-SLOW-NEXT:    orl %esi, %edi
+; X86-SLOW-NEXT:    movl %edi, %eax
+; X86-SLOW-NEXT:  .LBB2_2:
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    popl %edi
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: var_shift_i32:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movl %edx, %ecx
+; X64-FAST-NEXT:    movl %edi, %eax
+; X64-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-FAST-NEXT:    shldl %cl, %esi, %eax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: var_shift_i32:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movl %esi, %eax
+; X64-SLOW-NEXT:    movl %edi, %esi
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    shll %cl, %esi
+; X64-SLOW-NEXT:    andb $31, %dl
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    negb %cl
+; X64-SLOW-NEXT:    shrl %cl, %eax
+; X64-SLOW-NEXT:    orl %esi, %eax
+; X64-SLOW-NEXT:    testb %dl, %dl
+; X64-SLOW-NEXT:    cmovel %edi, %eax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
+  ret i32 %tmp
+}
+
+define i32 @var_shift_i32_optsize(i32 %x, i32 %y, i32 %z) nounwind optsize {
+; X86-LABEL: var_shift_i32_optsize:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: var_shift_i32_optsize:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldl %cl, %esi, %eax
+; X64-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
+  ret i32 %tmp
+}
+
+define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
+; X86-FAST-LABEL: var_shift_i64:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    pushl %ebp
+; X86-FAST-NEXT:    pushl %ebx
+; X86-FAST-NEXT:    pushl %edi
+; X86-FAST-NEXT:    pushl %esi
+; X86-FAST-NEXT:    pushl %eax
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-FAST-NEXT:    andl $63, %ebx
+; X86-FAST-NEXT:    movl %eax, %edi
+; X86-FAST-NEXT:    movl %ebx, %ecx
+; X86-FAST-NEXT:    shll %cl, %edi
+; X86-FAST-NEXT:    shldl %cl, %eax, %ebp
+; X86-FAST-NEXT:    testb $32, %bl
+; X86-FAST-NEXT:    je .LBB4_2
+; X86-FAST-NEXT:  # %bb.1:
+; X86-FAST-NEXT:    movl %edi, %ebp
+; X86-FAST-NEXT:    xorl %edi, %edi
+; X86-FAST-NEXT:  .LBB4_2:
+; X86-FAST-NEXT:    movb $64, %cl
+; X86-FAST-NEXT:    subb %bl, %cl
+; X86-FAST-NEXT:    movl %edx, %esi
+; X86-FAST-NEXT:    shrl %cl, %esi
+; X86-FAST-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
+; X86-FAST-NEXT:    testb $32, %cl
+; X86-FAST-NEXT:    jne .LBB4_3
+; X86-FAST-NEXT:  # %bb.4:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-FAST-NEXT:    testl %ebx, %ebx
+; X86-FAST-NEXT:    jne .LBB4_6
+; X86-FAST-NEXT:    jmp .LBB4_7
+; X86-FAST-NEXT:  .LBB4_3:
+; X86-FAST-NEXT:    movl %esi, %ecx
+; X86-FAST-NEXT:    xorl %esi, %esi
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    testl %ebx, %ebx
+; X86-FAST-NEXT:    je .LBB4_7
+; X86-FAST-NEXT:  .LBB4_6:
+; X86-FAST-NEXT:    orl %esi, %ebp
+; X86-FAST-NEXT:    orl %ecx, %edi
+; X86-FAST-NEXT:    movl %edi, %eax
+; X86-FAST-NEXT:    movl %ebp, %edx
+; X86-FAST-NEXT:  .LBB4_7:
+; X86-FAST-NEXT:    addl $4, %esp
+; X86-FAST-NEXT:    popl %esi
+; X86-FAST-NEXT:    popl %edi
+; X86-FAST-NEXT:    popl %ebx
+; X86-FAST-NEXT:    popl %ebp
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: var_shift_i64:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %ebp
+; X86-SLOW-NEXT:    pushl %ebx
+; X86-SLOW-NEXT:    pushl %edi
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    subl $8, %esp
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT:    andl $63, %ebx
+; X86-SLOW-NEXT:    movb $64, %dh
+; X86-SLOW-NEXT:    subb %bl, %dh
+; X86-SLOW-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    movb %dh, %cl
+; X86-SLOW-NEXT:    shrl %cl, %eax
+; X86-SLOW-NEXT:    movb %dh, %dl
+; X86-SLOW-NEXT:    andb $31, %dl
+; X86-SLOW-NEXT:    movl %edx, %ecx
+; X86-SLOW-NEXT:    negb %cl
+; X86-SLOW-NEXT:    movl %esi, %ebp
+; X86-SLOW-NEXT:    shll %cl, %ebp
+; X86-SLOW-NEXT:    testb %dl, %dl
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    je .LBB4_2
+; X86-SLOW-NEXT:  # %bb.1:
+; X86-SLOW-NEXT:    orl %eax, %ebp
+; X86-SLOW-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:  .LBB4_2:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT:    movl %ebp, %eax
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    movb %bl, %ch
+; X86-SLOW-NEXT:    andb $31, %ch
+; X86-SLOW-NEXT:    movb %ch, %cl
+; X86-SLOW-NEXT:    negb %cl
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    testb %ch, %ch
+; X86-SLOW-NEXT:    je .LBB4_4
+; X86-SLOW-NEXT:  # %bb.3:
+; X86-SLOW-NEXT:    orl %edi, %eax
+; X86-SLOW-NEXT:    movl %eax, %ebp
+; X86-SLOW-NEXT:  .LBB4_4:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shll %cl, %edi
+; X86-SLOW-NEXT:    testb $32, %bl
+; X86-SLOW-NEXT:    je .LBB4_6
+; X86-SLOW-NEXT:  # %bb.5:
+; X86-SLOW-NEXT:    movl %edi, %ebp
+; X86-SLOW-NEXT:    xorl %edi, %edi
+; X86-SLOW-NEXT:  .LBB4_6:
+; X86-SLOW-NEXT:    movb %dh, %cl
+; X86-SLOW-NEXT:    shrl %cl, %esi
+; X86-SLOW-NEXT:    testb $32, %dh
+; X86-SLOW-NEXT:    jne .LBB4_7
+; X86-SLOW-NEXT:  # %bb.8:
+; X86-SLOW-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SLOW-NEXT:    testl %ebx, %ebx
+; X86-SLOW-NEXT:    jne .LBB4_10
+; X86-SLOW-NEXT:    jmp .LBB4_11
+; X86-SLOW-NEXT:  .LBB4_7:
+; X86-SLOW-NEXT:    movl %esi, %ecx
+; X86-SLOW-NEXT:    xorl %esi, %esi
+; X86-SLOW-NEXT:    testl %ebx, %ebx
+; X86-SLOW-NEXT:    je .LBB4_11
+; X86-SLOW-NEXT:  .LBB4_10:
+; X86-SLOW-NEXT:    orl %esi, %ebp
+; X86-SLOW-NEXT:    orl %ecx, %edi
+; X86-SLOW-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edi, %eax
+; X86-SLOW-NEXT:  .LBB4_11:
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SLOW-NEXT:    addl $8, %esp
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    popl %edi
+; X86-SLOW-NEXT:    popl %ebx
+; X86-SLOW-NEXT:    popl %ebp
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: var_shift_i64:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movq %rdx, %rcx
+; X64-FAST-NEXT:    movq %rdi, %rax
+; X64-FAST-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-FAST-NEXT:    shldq %cl, %rsi, %rax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: var_shift_i64:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movq %rsi, %rax
+; X64-SLOW-NEXT:    movq %rdi, %rsi
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    shlq %cl, %rsi
+; X64-SLOW-NEXT:    andb $63, %dl
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    negb %cl
+; X64-SLOW-NEXT:    shrq %cl, %rax
+; X64-SLOW-NEXT:    orq %rsi, %rax
+; X64-SLOW-NEXT:    testb %dl, %dl
+; X64-SLOW-NEXT:    cmoveq %rdi, %rax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
+  ret i64 %tmp
+}
+
+;
+; Const Funnel Shift
+;
+
+define i8 @const_shift_i8(i8 %x, i8 %y) nounwind {
+; X86-LABEL: const_shift_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shrb %cl
+; X86-NEXT:    shlb $7, %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: const_shift_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrb %sil
+; X64-NEXT:    shlb $7, %al
+; X64-NEXT:    orb %sil, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %tmp = tail call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 7)
+  ret i8 %tmp
+}
+
+define i16 @const_shift_i16(i16 %x, i16 %y) nounwind {
+; X86-FAST-LABEL: const_shift_i16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    shldw $7, %cx, %ax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: const_shift_i16:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shrl $9, %ecx
+; X86-SLOW-NEXT:    shll $7, %eax
+; X86-SLOW-NEXT:    orl %ecx, %eax
+; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: const_shift_i16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movl %edi, %eax
+; X64-FAST-NEXT:    shldw $7, %si, %ax
+; X64-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: const_shift_i16:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movzwl %si, %eax
+; X64-SLOW-NEXT:    shll $7, %edi
+; X64-SLOW-NEXT:    shrl $9, %eax
+; X64-SLOW-NEXT:    orl %edi, %eax
+; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 7)
+  ret i16 %tmp
+}
+
+define i32 @const_shift_i32(i32 %x, i32 %y) nounwind {
+; X86-FAST-LABEL: const_shift_i32:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    shldl $7, %ecx, %eax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: const_shift_i32:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shrl $25, %ecx
+; X86-SLOW-NEXT:    shll $7, %eax
+; X86-SLOW-NEXT:    orl %ecx, %eax
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: const_shift_i32:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movl %edi, %eax
+; X64-FAST-NEXT:    shldl $7, %esi, %eax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: const_shift_i32:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-SLOW-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-SLOW-NEXT:    shrl $25, %esi
+; X64-SLOW-NEXT:    shll $7, %edi
+; X64-SLOW-NEXT:    leal (%rdi,%rsi), %eax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
+  ret i32 %tmp
+}
+
+define i64 @const_shift_i64(i64 %x, i64 %y) nounwind {
+; X86-FAST-LABEL: const_shift_i64:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    shrdl $25, %ecx, %eax
+; X86-FAST-NEXT:    shldl $7, %ecx, %edx
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: const_shift_i64:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT:    shrl $25, %esi
+; X86-SLOW-NEXT:    movl %ecx, %eax
+; X86-SLOW-NEXT:    shll $7, %eax
+; X86-SLOW-NEXT:    orl %esi, %eax
+; X86-SLOW-NEXT:    shrl $25, %ecx
+; X86-SLOW-NEXT:    shll $7, %edx
+; X86-SLOW-NEXT:    orl %ecx, %edx
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: const_shift_i64:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movq %rdi, %rax
+; X64-FAST-NEXT:    shldq $7, %rsi, %rax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: const_shift_i64:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    shrq $57, %rsi
+; X64-SLOW-NEXT:    shlq $7, %rdi
+; X64-SLOW-NEXT:    leaq (%rdi,%rsi), %rax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 7)
+  ret i64 %tmp
+}
diff --git a/test/CodeGen/X86/fshr.ll b/test/CodeGen/X86/fshr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..09d63b665342968f3d84ab5234534e90e612d3f1
--- /dev/null
+++ b/test/CodeGen/X86/fshr.ll
@@ -0,0 +1,503 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+slow-shld | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+slow-shld | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW
+
+declare i8 @llvm.fshr.i8(i8, i8, i8) nounwind readnone
+declare i16 @llvm.fshr.i16(i16, i16, i16) nounwind readnone
+declare i32 @llvm.fshr.i32(i32, i32, i32) nounwind readnone
+declare i64 @llvm.fshr.i64(i64, i64, i64) nounwind readnone
+
+;
+; Variable Funnel Shift
+;
+
+define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
+; X86-LABEL: var_shift_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    andb $7, %dl
+; X86-NEXT:    movb %al, %ch
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    movb $8, %cl
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    shlb %cl, %ah
+; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    je .LBB0_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    orb %ch, %ah
+; X86-NEXT:    movb %ah, %al
+; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    retl
+;
+; X64-LABEL: var_shift_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andb $7, %dl
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    shrb %cl, %sil
+; X64-NEXT:    movb $8, %cl
+; X64-NEXT:    subb %dl, %cl
+; X64-NEXT:    shlb %cl, %dil
+; X64-NEXT:    testb %dl, %dl
+; X64-NEXT:    je .LBB0_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    orb %sil, %dil
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:  .LBB0_2:
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %tmp = tail call i8 @llvm.fshr.i8(i8 %x, i8 %y, i8 %z)
+  ret i8 %tmp
+}
+
+define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
+; X86-FAST-LABEL: var_shift_i16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-FAST-NEXT:    andb $15, %cl
+; X86-FAST-NEXT:    shrdw %cl, %dx, %ax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: var_shift_i16:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %edi
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-SLOW-NEXT:    andb $15, %dl
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    movl %edx, %ecx
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    movb $16, %cl
+; X86-SLOW-NEXT:    subb %dl, %cl
+; X86-SLOW-NEXT:    shll %cl, %esi
+; X86-SLOW-NEXT:    testb %dl, %dl
+; X86-SLOW-NEXT:    je .LBB1_2
+; X86-SLOW-NEXT:  # %bb.1:
+; X86-SLOW-NEXT:    orl %edi, %esi
+; X86-SLOW-NEXT:    movl %esi, %eax
+; X86-SLOW-NEXT:  .LBB1_2:
+; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    popl %edi
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: var_shift_i16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movl %edx, %ecx
+; X64-FAST-NEXT:    movl %esi, %eax
+; X64-FAST-NEXT:    andb $15, %cl
+; X64-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-FAST-NEXT:    shrdw %cl, %di, %ax
+; X64-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: var_shift_i16:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movzwl %si, %eax
+; X64-SLOW-NEXT:    andb $15, %dl
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    shrl %cl, %eax
+; X64-SLOW-NEXT:    movb $16, %cl
+; X64-SLOW-NEXT:    subb %dl, %cl
+; X64-SLOW-NEXT:    shll %cl, %edi
+; X64-SLOW-NEXT:    orl %edi, %eax
+; X64-SLOW-NEXT:    testb %dl, %dl
+; X64-SLOW-NEXT:    cmovel %esi, %eax
+; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
+  ret i16 %tmp
+}
+
+define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
+; X86-FAST-LABEL: var_shift_i32:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    shrdl %cl, %edx, %eax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: var_shift_i32:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %edi
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    movl %edx, %ecx
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    andb $31, %dl
+; X86-SLOW-NEXT:    movl %edx, %ecx
+; X86-SLOW-NEXT:    negb %cl
+; X86-SLOW-NEXT:    shll %cl, %esi
+; X86-SLOW-NEXT:    testb %dl, %dl
+; X86-SLOW-NEXT:    je .LBB2_2
+; X86-SLOW-NEXT:  # %bb.1:
+; X86-SLOW-NEXT:    orl %edi, %esi
+; X86-SLOW-NEXT:    movl %esi, %eax
+; X86-SLOW-NEXT:  .LBB2_2:
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    popl %edi
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: var_shift_i32:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movl %edx, %ecx
+; X64-FAST-NEXT:    movl %esi, %eax
+; X64-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-FAST-NEXT:    shrdl %cl, %edi, %eax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: var_shift_i32:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movl %edi, %eax
+; X64-SLOW-NEXT:    movl %esi, %edi
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    shrl %cl, %edi
+; X64-SLOW-NEXT:    andb $31, %dl
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    negb %cl
+; X64-SLOW-NEXT:    shll %cl, %eax
+; X64-SLOW-NEXT:    orl %edi, %eax
+; X64-SLOW-NEXT:    testb %dl, %dl
+; X64-SLOW-NEXT:    cmovel %esi, %eax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
+  ret i32 %tmp
+}
+
+define i32 @var_shift_i32_optsize(i32 %x, i32 %y, i32 %z) nounwind optsize {
+; X86-LABEL: var_shift_i32_optsize:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: var_shift_i32_optsize:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %edi, %eax
+; X64-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
+  ret i32 %tmp
+}
+
+define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
+; X86-FAST-LABEL: var_shift_i64:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    pushl %ebp
+; X86-FAST-NEXT:    pushl %ebx
+; X86-FAST-NEXT:    pushl %edi
+; X86-FAST-NEXT:    pushl %esi
+; X86-FAST-NEXT:    pushl %eax
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-FAST-NEXT:    andl $63, %ebx
+; X86-FAST-NEXT:    movb $64, %cl
+; X86-FAST-NEXT:    subb %bl, %cl
+; X86-FAST-NEXT:    movl %eax, %edi
+; X86-FAST-NEXT:    shll %cl, %edi
+; X86-FAST-NEXT:    shldl %cl, %eax, %esi
+; X86-FAST-NEXT:    testb $32, %cl
+; X86-FAST-NEXT:    je .LBB4_2
+; X86-FAST-NEXT:  # %bb.1:
+; X86-FAST-NEXT:    movl %edi, %esi
+; X86-FAST-NEXT:    xorl %edi, %edi
+; X86-FAST-NEXT:  .LBB4_2:
+; X86-FAST-NEXT:    movl %edx, %ebp
+; X86-FAST-NEXT:    movl %ebx, %ecx
+; X86-FAST-NEXT:    shrl %cl, %ebp
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    shrdl %cl, %edx, %eax
+; X86-FAST-NEXT:    testb $32, %bl
+; X86-FAST-NEXT:    je .LBB4_4
+; X86-FAST-NEXT:  # %bb.3:
+; X86-FAST-NEXT:    movl %ebp, %eax
+; X86-FAST-NEXT:    xorl %ebp, %ebp
+; X86-FAST-NEXT:  .LBB4_4:
+; X86-FAST-NEXT:    testl %ebx, %ebx
+; X86-FAST-NEXT:    je .LBB4_6
+; X86-FAST-NEXT:  # %bb.5:
+; X86-FAST-NEXT:    orl %ebp, %esi
+; X86-FAST-NEXT:    orl %eax, %edi
+; X86-FAST-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl %esi, %edx
+; X86-FAST-NEXT:  .LBB4_6:
+; X86-FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-FAST-NEXT:    addl $4, %esp
+; X86-FAST-NEXT:    popl %esi
+; X86-FAST-NEXT:    popl %edi
+; X86-FAST-NEXT:    popl %ebx
+; X86-FAST-NEXT:    popl %ebp
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: var_shift_i64:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %ebp
+; X86-SLOW-NEXT:    pushl %ebx
+; X86-SLOW-NEXT:    pushl %edi
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    subl $8, %esp
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT:    andl $63, %ebx
+; X86-SLOW-NEXT:    movb $64, %al
+; X86-SLOW-NEXT:    subb %bl, %al
+; X86-SLOW-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %eax, %ecx
+; X86-SLOW-NEXT:    shll %cl, %edx
+; X86-SLOW-NEXT:    movb %al, %ch
+; X86-SLOW-NEXT:    andb $31, %ch
+; X86-SLOW-NEXT:    movb %ch, %cl
+; X86-SLOW-NEXT:    negb %cl
+; X86-SLOW-NEXT:    movl %esi, %edi
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    testb %ch, %ch
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT:    je .LBB4_2
+; X86-SLOW-NEXT:  # %bb.1:
+; X86-SLOW-NEXT:    orl %edi, %edx
+; X86-SLOW-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:  .LBB4_2:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    movl %ecx, %edx
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shrl %cl, %edx
+; X86-SLOW-NEXT:    movb %bl, %ah
+; X86-SLOW-NEXT:    andb $31, %ah
+; X86-SLOW-NEXT:    movb %ah, %cl
+; X86-SLOW-NEXT:    negb %cl
+; X86-SLOW-NEXT:    movl %ebp, %edi
+; X86-SLOW-NEXT:    shll %cl, %edi
+; X86-SLOW-NEXT:    testb %ah, %ah
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT:    je .LBB4_4
+; X86-SLOW-NEXT:  # %bb.3:
+; X86-SLOW-NEXT:    orl %edx, %edi
+; X86-SLOW-NEXT:    movl %edi, %ebp
+; X86-SLOW-NEXT:  .LBB4_4:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    testb $32, %bl
+; X86-SLOW-NEXT:    je .LBB4_6
+; X86-SLOW-NEXT:  # %bb.5:
+; X86-SLOW-NEXT:    movl %edi, %ebp
+; X86-SLOW-NEXT:    xorl %edi, %edi
+; X86-SLOW-NEXT:  .LBB4_6:
+; X86-SLOW-NEXT:    movl %eax, %ecx
+; X86-SLOW-NEXT:    shll %cl, %esi
+; X86-SLOW-NEXT:    testb $32, %al
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    jne .LBB4_7
+; X86-SLOW-NEXT:  # %bb.8:
+; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-SLOW-NEXT:    testl %ebx, %ebx
+; X86-SLOW-NEXT:    jne .LBB4_10
+; X86-SLOW-NEXT:    jmp .LBB4_11
+; X86-SLOW-NEXT:  .LBB4_7:
+; X86-SLOW-NEXT:    movl %esi, %eax
+; X86-SLOW-NEXT:    xorl %esi, %esi
+; X86-SLOW-NEXT:    testl %ebx, %ebx
+; X86-SLOW-NEXT:    je .LBB4_11
+; X86-SLOW-NEXT:  .LBB4_10:
+; X86-SLOW-NEXT:    orl %ebp, %esi
+; X86-SLOW-NEXT:    orl %edi, %eax
+; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %eax, %edx
+; X86-SLOW-NEXT:  .LBB4_11:
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT:    addl $8, %esp
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    popl %edi
+; X86-SLOW-NEXT:    popl %ebx
+; X86-SLOW-NEXT:    popl %ebp
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: var_shift_i64:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movq %rdx, %rcx
+; X64-FAST-NEXT:    movq %rsi, %rax
+; X64-FAST-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-FAST-NEXT:    shrdq %cl, %rdi, %rax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: var_shift_i64:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movq %rdi, %rax
+; X64-SLOW-NEXT:    movq %rsi, %rdi
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    shrq %cl, %rdi
+; X64-SLOW-NEXT:    andb $63, %dl
+; X64-SLOW-NEXT:    movl %edx, %ecx
+; X64-SLOW-NEXT:    negb %cl
+; X64-SLOW-NEXT:    shlq %cl, %rax
+; X64-SLOW-NEXT:    orq %rdi, %rax
+; X64-SLOW-NEXT:    testb %dl, %dl
+; X64-SLOW-NEXT:    cmoveq %rsi, %rax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
+  ret i64 %tmp
+}
+
+;
+; Const Funnel Shift
+;
+
+define i8 @const_shift_i8(i8 %x, i8 %y) nounwind {
+; X86-LABEL: const_shift_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shrb $7, %cl
+; X86-NEXT:    addb %al, %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: const_shift_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shrb $7, %sil
+; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    orb %sil, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %tmp = tail call i8 @llvm.fshr.i8(i8 %x, i8 %y, i8 7)
+  ret i8 %tmp
+}
+
+define i16 @const_shift_i16(i16 %x, i16 %y) nounwind {
+; X86-FAST-LABEL: const_shift_i16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    shrdw $7, %cx, %ax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: const_shift_i16:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shrl $7, %ecx
+; X86-SLOW-NEXT:    shll $9, %eax
+; X86-SLOW-NEXT:    orl %ecx, %eax
+; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: const_shift_i16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movl %esi, %eax
+; X64-FAST-NEXT:    shrdw $7, %di, %ax
+; X64-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: const_shift_i16:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movzwl %si, %eax
+; X64-SLOW-NEXT:    shll $9, %edi
+; X64-SLOW-NEXT:    shrl $7, %eax
+; X64-SLOW-NEXT:    orl %edi, %eax
+; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 7)
+  ret i16 %tmp
+}
+
+define i32 @const_shift_i32(i32 %x, i32 %y) nounwind {
+; X86-FAST-LABEL: const_shift_i32:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    shrdl $7, %ecx, %eax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: const_shift_i32:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shrl $7, %ecx
+; X86-SLOW-NEXT:    shll $25, %eax
+; X86-SLOW-NEXT:    orl %ecx, %eax
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: const_shift_i32:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movl %edi, %eax
+; X64-FAST-NEXT:    shldl $25, %esi, %eax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: const_shift_i32:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-SLOW-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-SLOW-NEXT:    shrl $7, %esi
+; X64-SLOW-NEXT:    shll $25, %edi
+; X64-SLOW-NEXT:    leal (%rdi,%rsi), %eax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
+  ret i32 %tmp
+}
+
+define i64 @const_shift_i64(i64 %x, i64 %y) nounwind {
+; X86-FAST-LABEL: const_shift_i64:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    shldl $25, %ecx, %edx
+; X86-FAST-NEXT:    shrdl $7, %ecx, %eax
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: const_shift_i64:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    movl %ecx, %esi
+; X86-SLOW-NEXT:    shll $25, %esi
+; X86-SLOW-NEXT:    shrl $7, %eax
+; X86-SLOW-NEXT:    orl %esi, %eax
+; X86-SLOW-NEXT:    shrl $7, %ecx
+; X86-SLOW-NEXT:    shll $25, %edx
+; X86-SLOW-NEXT:    orl %ecx, %edx
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: const_shift_i64:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movq %rdi, %rax
+; X64-FAST-NEXT:    shldq $57, %rsi, %rax
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: const_shift_i64:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    shrq $7, %rsi
+; X64-SLOW-NEXT:    shlq $57, %rdi
+; X64-SLOW-NEXT:    leaq (%rdi,%rsi), %rax
+; X64-SLOW-NEXT:    retq
+  %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 7)
+  ret i64 %tmp
+}
diff --git a/test/CodeGen/X86/funnel-shift-rot.ll b/test/CodeGen/X86/funnel-shift-rot.ll
index 19e75aba3bb9d4649fe76bf0a21071cc3fa93ace..e07d34f9e6421654d9ec2aaf90974f5355b604de 100644
--- a/test/CodeGen/X86/funnel-shift-rot.ll
+++ b/test/CodeGen/X86/funnel-shift-rot.ll
@@ -231,30 +231,25 @@ define i64 @rotr_i64(i64 %x, i64 %z) nounwind {
 ; X32-SSE2-NEXT:    pushl %ebx
 ; X32-SSE2-NEXT:    pushl %edi
 ; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-SSE2-NEXT:    movl %ebx, %ecx
-; X32-SSE2-NEXT:    andl $63, %ecx
 ; X32-SSE2-NEXT:    movl %edx, %edi
 ; X32-SSE2-NEXT:    shrl %cl, %edi
-; X32-SSE2-NEXT:    movl %esi, %ebp
-; X32-SSE2-NEXT:    shrdl %cl, %edx, %ebp
-; X32-SSE2-NEXT:    xorl %eax, %eax
+; X32-SSE2-NEXT:    movl %esi, %ebx
+; X32-SSE2-NEXT:    shrdl %cl, %edx, %ebx
+; X32-SSE2-NEXT:    xorl %ebp, %ebp
 ; X32-SSE2-NEXT:    testb $32, %cl
-; X32-SSE2-NEXT:    cmovnel %edi, %ebp
-; X32-SSE2-NEXT:    cmovnel %eax, %edi
-; X32-SSE2-NEXT:    negl %ebx
-; X32-SSE2-NEXT:    andl $63, %ebx
+; X32-SSE2-NEXT:    cmovnel %edi, %ebx
+; X32-SSE2-NEXT:    cmovnel %ebp, %edi
+; X32-SSE2-NEXT:    negb %cl
 ; X32-SSE2-NEXT:    movl %esi, %eax
-; X32-SSE2-NEXT:    movl %ebx, %ecx
 ; X32-SSE2-NEXT:    shll %cl, %eax
 ; X32-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X32-SSE2-NEXT:    testb $32, %bl
+; X32-SSE2-NEXT:    testb $32, %cl
 ; X32-SSE2-NEXT:    cmovnel %eax, %edx
-; X32-SSE2-NEXT:    movl $0, %ecx
-; X32-SSE2-NEXT:    cmovnel %ecx, %eax
-; X32-SSE2-NEXT:    orl %ebp, %eax
+; X32-SSE2-NEXT:    cmovnel %ebp, %eax
+; X32-SSE2-NEXT:    orl %ebx, %eax
 ; X32-SSE2-NEXT:    orl %edi, %edx
 ; X32-SSE2-NEXT:    popl %esi
 ; X32-SSE2-NEXT:    popl %edi
diff --git a/test/CodeGen/X86/funnel-shift.ll b/test/CodeGen/X86/funnel-shift.ll
index 678e447f97dc995e34cf5a1380080ef86ece0c02..0969d6d190c244ea3a5ec0578797fd632c5069b9 100644
--- a/test/CodeGen/X86/funnel-shift.ll
+++ b/test/CodeGen/X86/funnel-shift.ll
@@ -14,31 +14,23 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
 declare i64 @llvm.fshr.i64(i64, i64, i64)
 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 
-; General case - all operands can be variables - x86 has shld, but the mask and cmov are not needed?
+; General case - all operands can be variables
 
 define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X32-SSE2-LABEL: fshl_i32:
 ; X32-SSE2:       # %bb.0:
-; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT:    andl $31, %ecx
-; X32-SSE2-NEXT:    movl %esi, %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE2-NEXT:    shldl %cl, %edx, %eax
-; X32-SSE2-NEXT:    testl %ecx, %ecx
-; X32-SSE2-NEXT:    cmovel %esi, %eax
-; X32-SSE2-NEXT:    popl %esi
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: fshl_i32:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    movl %edx, %ecx
-; X64-AVX2-NEXT:    andl $31, %ecx
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-AVX2-NEXT:    shldl %cl, %esi, %eax
-; X64-AVX2-NEXT:    testl %ecx, %ecx
-; X64-AVX2-NEXT:    cmovel %edi, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
   ret i32 %f
@@ -78,8 +70,8 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X32-SSE2-NEXT:    cmovnel %ebp, %eax
 ; X32-SSE2-NEXT:    cmovnel %ecx, %ebp
 ; X32-SSE2-NEXT:    xorl %edx, %edx
-; X32-SSE2-NEXT:    movl $37, %ecx
-; X32-SSE2-NEXT:    subl %ebx, %ecx
+; X32-SSE2-NEXT:    movb $37, %cl
+; X32-SSE2-NEXT:    subb %bl, %cl
 ; X32-SSE2-NEXT:    shrdl %cl, %esi, %edi
 ; X32-SSE2-NEXT:    shrl %cl, %esi
 ; X32-SSE2-NEXT:    testb $32, %cl
@@ -212,31 +204,23 @@ define i8 @fshl_i8_const_fold() nounwind {
 
 ; Repeat everything for funnel shift right.
 
-; General case - all operands can be variables - x86 has 'shrd', but the mask and cmov are not needed?
+; General case - all operands can be variables
 
 define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X32-SSE2-LABEL: fshr_i32:
 ; X32-SSE2:       # %bb.0:
-; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT:    andl $31, %ecx
-; X32-SSE2-NEXT:    movl %esi, %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE2-NEXT:    shrdl %cl, %edx, %eax
-; X32-SSE2-NEXT:    testl %ecx, %ecx
-; X32-SSE2-NEXT:    cmovel %esi, %eax
-; X32-SSE2-NEXT:    popl %esi
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: fshr_i32:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    movl %edx, %ecx
-; X64-AVX2-NEXT:    andl $31, %ecx
 ; X64-AVX2-NEXT:    movl %esi, %eax
+; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-AVX2-NEXT:    shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT:    testl %ecx, %ecx
-; X64-AVX2-NEXT:    cmovel %esi, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
   ret i32 %f
@@ -264,8 +248,8 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X32-SSE2-NEXT:    calll __umoddi3
 ; X32-SSE2-NEXT:    addl $16, %esp
 ; X32-SSE2-NEXT:    movl %eax, %ebx
-; X32-SSE2-NEXT:    movl $37, %ecx
-; X32-SSE2-NEXT:    subl %eax, %ecx
+; X32-SSE2-NEXT:    movb $37, %cl
+; X32-SSE2-NEXT:    subb %bl, %cl
 ; X32-SSE2-NEXT:    movl %ebp, %eax
 ; X32-SSE2-NEXT:    shll %cl, %ebp
 ; X32-SSE2-NEXT:    shldl %cl, %eax, %edi
@@ -341,7 +325,7 @@ define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    shldl $23, %ecx, %eax
+; X32-SSE2-NEXT:    shrdl $9, %ecx, %eax
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: fshr_i32_const_shift:
@@ -353,14 +337,14 @@ define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
   ret i32 %f
 }
 
-; Check modulo math on shift amount. 41-32=9, but right-shift became left, so 32-9=23.
+; Check modulo math on shift amount. 41-32=9, but right-shift may became left, so 32-9=23.
 
 define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) nounwind {
 ; X32-SSE2-LABEL: fshr_i32_const_overshift:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    shldl $23, %ecx, %eax
+; X32-SSE2-NEXT:    shrdl $9, %ecx, %eax
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: fshr_i32_const_overshift:
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
index e0590a766159d13f71640132c266485a90b9bd06..166528b08ef66b298b3b4233f869f758a10475c8 100644
--- a/test/CodeGen/X86/haddsub-undef.ll
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -696,7 +696,7 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) {
 ;
 ; AVX2-SLOW-LABEL: add_ps_007_2:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vbroadcastss %xmm0, %xmm1
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    retq
@@ -826,7 +826,7 @@ define <4 x float> @add_ps_018(<4 x float> %x) {
 ;
 ; AVX2-SLOW-LABEL: add_ps_018:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vbroadcastss %xmm0, %xmm1
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/test/CodeGen/X86/hoist-spill.ll b/test/CodeGen/X86/hoist-spill.ll
index 6a3f5ca01e8699a1adc9cbec20ea80eff085993b..3df4cdbf2173b28076e0702ce37b6ad4d8ebe08c 100644
--- a/test/CodeGen/X86/hoist-spill.ll
+++ b/test/CodeGen/X86/hoist-spill.ll
@@ -27,10 +27,10 @@ for.cond:                                         ; preds = %for.inc14, %entry
   %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ]
   %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ]
   %tmp3 = icmp sgt i32 undef, 0
-  %smax52 = select i1 %tmp3, i32 undef, i32 0
+  %smax52 = select i1 %tmp3, i32 %c.0, i32 0
   %tmp4 = zext i32 %smax52 to i64
   %tmp5 = icmp sgt i64 undef, %tmp4
-  %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4
+  %smax53 = select i1 %tmp5, i64 %tmp2, i64 %tmp4
   %tmp6 = add nsw i64 %smax53, 1
   %tmp7 = sub nsw i64 %tmp6, %tmp4
   %tmp8 = add nsw i64 %tmp7, -8
diff --git a/test/CodeGen/X86/i128-mul.ll b/test/CodeGen/X86/i128-mul.ll
index 8069eab327c37fd05a79be602f81188080437741..3b74e041f98d94a0807793dc737ad3cf60903ebb 100644
--- a/test/CodeGen/X86/i128-mul.ll
+++ b/test/CodeGen/X86/i128-mul.ll
@@ -7,108 +7,61 @@
 ; PR1198
 
 define i64 @foo(i64 %x, i64 %y) nounwind {
-; X86-NOBMI-LABEL: foo:
-; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebp
-; X86-NOBMI-NEXT:    pushl %ebx
-; X86-NOBMI-NEXT:    pushl %edi
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %ecx, %eax
-; X86-NOBMI-NEXT:    mull %ebp
-; X86-NOBMI-NEXT:    movl %edx, %ebx
-; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    mull %ebp
-; X86-NOBMI-NEXT:    movl %edx, %ebp
-; X86-NOBMI-NEXT:    movl %eax, %esi
-; X86-NOBMI-NEXT:    addl %ebx, %esi
-; X86-NOBMI-NEXT:    adcl $0, %ebp
-; X86-NOBMI-NEXT:    movl %ecx, %eax
-; X86-NOBMI-NEXT:    mull %edi
-; X86-NOBMI-NEXT:    movl %edx, %ebx
-; X86-NOBMI-NEXT:    addl %esi, %eax
-; X86-NOBMI-NEXT:    adcl %ebp, %ebx
-; X86-NOBMI-NEXT:    setb %al
-; X86-NOBMI-NEXT:    movzbl %al, %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    mull %edi
-; X86-NOBMI-NEXT:    movl %edx, %esi
-; X86-NOBMI-NEXT:    movl %eax, %ebp
-; X86-NOBMI-NEXT:    addl %ebx, %ebp
-; X86-NOBMI-NEXT:    adcl %ecx, %esi
-; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    mull %ecx
-; X86-NOBMI-NEXT:    movl %edx, %edi
-; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    mull %ecx
-; X86-NOBMI-NEXT:    addl %ebx, %eax
-; X86-NOBMI-NEXT:    adcl %edi, %edx
-; X86-NOBMI-NEXT:    addl %ebp, %eax
-; X86-NOBMI-NEXT:    adcl %esi, %edx
-; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
-; X86-NOBMI-NEXT:    retl
+; X86-LABEL: foo:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    setb %al
+; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
 ;
-; X86-BMI-LABEL: foo:
-; X86-BMI:       # %bb.0:
-; X86-BMI-NEXT:    pushl %ebp
-; X86-BMI-NEXT:    pushl %ebx
-; X86-BMI-NEXT:    pushl %edi
-; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    mulxl %esi, %edx, %ebx
-; X86-BMI-NEXT:    movl %eax, %edx
-; X86-BMI-NEXT:    mulxl %esi, %ebp, %eax
-; X86-BMI-NEXT:    addl %ebx, %ebp
-; X86-BMI-NEXT:    adcl $0, %eax
-; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    mulxl %edi, %edx, %ebx
-; X86-BMI-NEXT:    addl %ebp, %edx
-; X86-BMI-NEXT:    adcl %eax, %ebx
-; X86-BMI-NEXT:    setb %al
-; X86-BMI-NEXT:    movzbl %al, %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    mulxl %edi, %edi, %ebp
-; X86-BMI-NEXT:    addl %ebx, %edi
-; X86-BMI-NEXT:    adcl %eax, %ebp
-; X86-BMI-NEXT:    xorl %eax, %eax
-; X86-BMI-NEXT:    movl %esi, %edx
-; X86-BMI-NEXT:    mulxl %eax, %ebx, %esi
-; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    mulxl %eax, %eax, %edx
-; X86-BMI-NEXT:    addl %ebx, %eax
-; X86-BMI-NEXT:    adcl %esi, %edx
-; X86-BMI-NEXT:    addl %edi, %eax
-; X86-BMI-NEXT:    adcl %ebp, %edx
-; X86-BMI-NEXT:    popl %esi
-; X86-BMI-NEXT:    popl %edi
-; X86-BMI-NEXT:    popl %ebx
-; X86-BMI-NEXT:    popl %ebp
-; X86-BMI-NEXT:    retl
-;
-; X64-NOBMI-LABEL: foo:
-; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movq %rdi, %rax
-; X64-NOBMI-NEXT:    mulq %rsi
-; X64-NOBMI-NEXT:    movq %rdx, %rax
-; X64-NOBMI-NEXT:    retq
-;
-; X64-BMI-LABEL: foo:
-; X64-BMI:       # %bb.0:
-; X64-BMI-NEXT:    movq %rdi, %rdx
-; X64-BMI-NEXT:    mulxq %rsi, %rcx, %rax
-; X64-BMI-NEXT:    retq
+; X64-LABEL: foo:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    retq
   %tmp0 = zext i64 %x to i128
   %tmp1 = zext i64 %y to i128
   %tmp2 = mul i128 %tmp0, %tmp1
@@ -122,236 +75,125 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; zero-extended value.
 
 define i64 @mul1(i64 %n, i64* nocapture %z, i64* nocapture %x, i64 %y) nounwind {
-; X86-NOBMI-LABEL: mul1:
-; X86-NOBMI:       # %bb.0: # %entry
-; X86-NOBMI-NEXT:    pushl %ebp
-; X86-NOBMI-NEXT:    pushl %ebx
-; X86-NOBMI-NEXT:    pushl %edi
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $28, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    orl %ecx, %eax
-; X86-NOBMI-NEXT:    je .LBB1_3
-; X86-NOBMI-NEXT:  # %bb.1: # %for.body.preheader
-; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:    movl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X86-NOBMI-NEXT:    .p2align 4, 0x90
-; X86-NOBMI-NEXT:  .LBB1_2: # %for.body
-; X86-NOBMI-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOBMI-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    movl (%eax,%ebx,8), %ebp
-; X86-NOBMI-NEXT:    movl 4(%eax,%ebx,8), %esi
-; X86-NOBMI-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl %ebp, %eax
-; X86-NOBMI-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    mull %ecx
-; X86-NOBMI-NEXT:    movl %edx, %edi
-; X86-NOBMI-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    mull %ecx
-; X86-NOBMI-NEXT:    movl %edx, %ecx
-; X86-NOBMI-NEXT:    movl %eax, %esi
-; X86-NOBMI-NEXT:    addl %edi, %esi
-; X86-NOBMI-NEXT:    adcl $0, %ecx
-; X86-NOBMI-NEXT:    movl %ebp, %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    mull %edx
-; X86-NOBMI-NEXT:    movl %edx, %ebp
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    addl %esi, %edi
-; X86-NOBMI-NEXT:    adcl %ecx, %ebp
-; X86-NOBMI-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOBMI-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT:    movl %edx, %ecx
-; X86-NOBMI-NEXT:    movl %eax, %esi
-; X86-NOBMI-NEXT:    addl %ebp, %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl %eax, %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:    mull %edx
-; X86-NOBMI-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl %eax, %ebp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:    mull %edx
-; X86-NOBMI-NEXT:    addl %ebp, %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOBMI-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    addl %esi, %eax
-; X86-NOBMI-NEXT:    adcl %ecx, %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X86-NOBMI-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl $0, %eax
-; X86-NOBMI-NEXT:    adcl $0, %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl %esi, (%ecx,%ebx,8)
-; X86-NOBMI-NEXT:    movl %edi, 4(%ecx,%ebx,8)
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl %ecx, %edi
-; X86-NOBMI-NEXT:    addl $1, %ebx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X86-NOBMI-NEXT:    adcl $0, %esi
-; X86-NOBMI-NEXT:    movl %ebx, %ecx
-; X86-NOBMI-NEXT:    xorl %ebp, %ecx
-; X86-NOBMI-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    xorl %edi, %esi
-; X86-NOBMI-NEXT:    orl %ecx, %esi
-; X86-NOBMI-NEXT:    jne .LBB1_2
-; X86-NOBMI-NEXT:  .LBB1_3: # %for.end
-; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:    addl $28, %esp
-; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
-; X86-NOBMI-NEXT:    retl
-;
-; X86-BMI-LABEL: mul1:
-; X86-BMI:       # %bb.0: # %entry
-; X86-BMI-NEXT:    pushl %ebp
-; X86-BMI-NEXT:    pushl %ebx
-; X86-BMI-NEXT:    pushl %edi
-; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    subl $20, %esp
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    orl %ecx, %eax
-; X86-BMI-NEXT:    je .LBB1_3
-; X86-BMI-NEXT:  # %bb.1: # %for.body.preheader
-; X86-BMI-NEXT:    xorl %ecx, %ecx
-; X86-BMI-NEXT:    xorl %edx, %edx
-; X86-BMI-NEXT:    xorl %edi, %edi
-; X86-BMI-NEXT:    movl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X86-BMI-NEXT:    .p2align 4, 0x90
-; X86-BMI-NEXT:  .LBB1_2: # %for.body
-; X86-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-BMI-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl (%eax,%edi,8), %ecx
-; X86-BMI-NEXT:    movl 4(%eax,%edi,8), %ebx
-; X86-BMI-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl %eax, %esi
-; X86-BMI-NEXT:    mulxl %eax, %eax, %ebp
-; X86-BMI-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl %ebx, %edx
-; X86-BMI-NEXT:    mulxl %esi, %eax, %esi
-; X86-BMI-NEXT:    addl %ebp, %eax
-; X86-BMI-NEXT:    adcl $0, %esi
-; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %ebp, %ebx
-; X86-BMI-NEXT:    addl %eax, %ebp
-; X86-BMI-NEXT:    adcl %esi, %ebx
-; X86-BMI-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %eax, %esi
-; X86-BMI-NEXT:    setb %dl
-; X86-BMI-NEXT:    addl %ebx, %eax
-; X86-BMI-NEXT:    movzbl %dl, %edx
-; X86-BMI-NEXT:    adcl %edx, %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    xorl %ebx, %ebx
-; X86-BMI-NEXT:    mulxl %ebx, %ebx, %edx
-; X86-BMI-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    xorl %ecx, %ecx
-; X86-BMI-NEXT:    mulxl %ecx, %ecx, %edx
-; X86-BMI-NEXT:    addl %ebx, %ecx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X86-BMI-NEXT:    addl %eax, %ecx
-; X86-BMI-NEXT:    adcl %esi, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X86-BMI-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X86-BMI-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X86-BMI-NEXT:    adcl $0, %ecx
-; X86-BMI-NEXT:    adcl $0, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl %esi, (%eax,%edi,8)
-; X86-BMI-NEXT:    movl %ebp, 4(%eax,%edi,8)
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl %eax, %esi
-; X86-BMI-NEXT:    addl $1, %edi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
-; X86-BMI-NEXT:    adcl $0, %ebp
-; X86-BMI-NEXT:    movl %edi, %eax
-; X86-BMI-NEXT:    xorl %esi, %eax
-; X86-BMI-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl %ebp, %esi
-; X86-BMI-NEXT:    xorl %ebx, %esi
-; X86-BMI-NEXT:    orl %eax, %esi
-; X86-BMI-NEXT:    jne .LBB1_2
-; X86-BMI-NEXT:  .LBB1_3: # %for.end
-; X86-BMI-NEXT:    xorl %eax, %eax
-; X86-BMI-NEXT:    xorl %edx, %edx
-; X86-BMI-NEXT:    addl $20, %esp
-; X86-BMI-NEXT:    popl %esi
-; X86-BMI-NEXT:    popl %edi
-; X86-BMI-NEXT:    popl %ebx
-; X86-BMI-NEXT:    popl %ebp
-; X86-BMI-NEXT:    retl
-;
-; X64-NOBMI-LABEL: mul1:
-; X64-NOBMI:       # %bb.0: # %entry
-; X64-NOBMI-NEXT:    testq %rdi, %rdi
-; X64-NOBMI-NEXT:    je .LBB1_3
-; X64-NOBMI-NEXT:  # %bb.1: # %for.body.preheader
-; X64-NOBMI-NEXT:    movq %rcx, %r8
-; X64-NOBMI-NEXT:    movq %rdx, %r9
-; X64-NOBMI-NEXT:    xorl %r10d, %r10d
-; X64-NOBMI-NEXT:    xorl %ecx, %ecx
-; X64-NOBMI-NEXT:    .p2align 4, 0x90
-; X64-NOBMI-NEXT:  .LBB1_2: # %for.body
-; X64-NOBMI-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NOBMI-NEXT:    movq %r8, %rax
-; X64-NOBMI-NEXT:    mulq (%r9,%rcx,8)
-; X64-NOBMI-NEXT:    addq %r10, %rax
-; X64-NOBMI-NEXT:    adcq $0, %rdx
-; X64-NOBMI-NEXT:    movq %rax, (%rsi,%rcx,8)
-; X64-NOBMI-NEXT:    incq %rcx
-; X64-NOBMI-NEXT:    cmpq %rcx, %rdi
-; X64-NOBMI-NEXT:    movq %rdx, %r10
-; X64-NOBMI-NEXT:    jne .LBB1_2
-; X64-NOBMI-NEXT:  .LBB1_3: # %for.end
-; X64-NOBMI-NEXT:    xorl %eax, %eax
-; X64-NOBMI-NEXT:    retq
+; X86-LABEL: mul1:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $28, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    je .LBB1_3
+; X86-NEXT:  # %bb.1: # %for.body.preheader
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB1_2: # %for.body
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl (%eax,%ebx,8), %ebp
+; X86-NEXT:    movl 4(%eax,%ebx,8), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, (%ecx,%ebx,8)
+; X86-NEXT:    movl %edi, 4(%ecx,%ebx,8)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorl %ebp, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    jne .LBB1_2
+; X86-NEXT:  .LBB1_3: # %for.end
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
 ;
-; X64-BMI-LABEL: mul1:
-; X64-BMI:       # %bb.0: # %entry
-; X64-BMI-NEXT:    testq %rdi, %rdi
-; X64-BMI-NEXT:    je .LBB1_3
-; X64-BMI-NEXT:  # %bb.1: # %for.body.preheader
-; X64-BMI-NEXT:    movq %rcx, %r8
-; X64-BMI-NEXT:    movq %rdx, %r9
-; X64-BMI-NEXT:    xorl %r10d, %r10d
-; X64-BMI-NEXT:    xorl %ecx, %ecx
-; X64-BMI-NEXT:    .p2align 4, 0x90
-; X64-BMI-NEXT:  .LBB1_2: # %for.body
-; X64-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-BMI-NEXT:    movq %r8, %rdx
-; X64-BMI-NEXT:    mulxq (%r9,%rcx,8), %rax, %rdx
-; X64-BMI-NEXT:    addq %r10, %rax
-; X64-BMI-NEXT:    adcq $0, %rdx
-; X64-BMI-NEXT:    movq %rax, (%rsi,%rcx,8)
-; X64-BMI-NEXT:    incq %rcx
-; X64-BMI-NEXT:    cmpq %rcx, %rdi
-; X64-BMI-NEXT:    movq %rdx, %r10
-; X64-BMI-NEXT:    jne .LBB1_2
-; X64-BMI-NEXT:  .LBB1_3: # %for.end
-; X64-BMI-NEXT:    xorl %eax, %eax
-; X64-BMI-NEXT:    retq
+; X64-LABEL: mul1:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    je .LBB1_3
+; X64-NEXT:  # %bb.1: # %for.body.preheader
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    xorl %r10d, %r10d
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB1_2: # %for.body
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq (%r9,%rcx,8)
+; X64-NEXT:    addq %r10, %rax
+; X64-NEXT:    adcq $0, %rdx
+; X64-NEXT:    movq %rax, (%rsi,%rcx,8)
+; X64-NEXT:    incq %rcx
+; X64-NEXT:    cmpq %rcx, %rdi
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    jne .LBB1_2
+; X64-NEXT:  .LBB1_3: # %for.end
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
 entry:
   %conv = zext i64 %y to i128
   %cmp11 = icmp eq i64 %n, 0
diff --git a/test/CodeGen/X86/i64-mem-copy.ll b/test/CodeGen/X86/i64-mem-copy.ll
index e14293797e86ae449e6d11ca7c8b74be4946ea77..d31f5f7780704938899edfaa5cc75b0c77d4acbd 100644
--- a/test/CodeGen/X86/i64-mem-copy.ll
+++ b/test/CodeGen/X86/i64-mem-copy.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=X32
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X32AVX
@@ -19,6 +20,14 @@ define void @foo(i64* %x, i64* %y) {
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    movsd %xmm0, (%eax)
 ; X32-NEXT:    retl
+;
+; X32AVX-LABEL: foo:
+; X32AVX:       # %bb.0:
+; X32AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32AVX-NEXT:    vmovsd %xmm0, (%eax)
+; X32AVX-NEXT:    retl
   %tmp1 = load i64, i64* %y, align 8
   store i64 %tmp1, i64* %x, align 8
   ret void
@@ -40,6 +49,13 @@ define void @store_i64_from_vector(<8 x i16> %x, <8 x i16> %y, i64* %i) {
 ; X32-NEXT:    paddw %xmm1, %xmm0
 ; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    retl
+;
+; X32AVX-LABEL: store_i64_from_vector:
+; X32AVX:       # %bb.0:
+; X32AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; X32AVX-NEXT:    vmovq %xmm0, (%eax)
+; X32AVX-NEXT:    retl
   %z = add <8 x i16> %x, %y                          ; force execution domain
   %bc = bitcast <8 x i16> %z to <2 x i64>
   %vecext = extractelement <2 x i64> %bc, i32 0
@@ -48,11 +64,35 @@ define void @store_i64_from_vector(<8 x i16> %x, <8 x i16> %y, i64* %i) {
 }
 
 define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) {
+; X64-LABEL: store_i64_from_vector256:
+; X64:       # %bb.0:
+; X64-NEXT:    paddw %xmm3, %xmm1
+; X64-NEXT:    movq %xmm1, (%rdi)
+; X64-NEXT:    retq
+;
+; X32-LABEL: store_i64_from_vector256:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    .cfi_def_cfa_register %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $16, %esp
+; X32-NEXT:    movl 24(%ebp), %eax
+; X32-NEXT:    paddw 8(%ebp), %xmm1
+; X32-NEXT:    movq %xmm1, (%eax)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa %esp, 4
+; X32-NEXT:    retl
+;
 ; X32AVX-LABEL: store_i64_from_vector256:
 ; X32AVX:       # %bb.0:
 ; X32AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X32AVX-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; X32AVX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; X32AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; X32AVX-NEXT:    vmovq %xmm0, (%eax)
 ; X32AVX-NEXT:    vzeroupper
 ; X32AVX-NEXT:    retl
@@ -66,13 +106,73 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) {
 ; PR23476
 ; Handle extraction from a non-simple / pre-legalization type.
 
-define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) {
+define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) nounwind {
+; X64-LABEL: PR23476:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    andq $-64, %rsp
+; X64-NEXT:    subq $128, %rsp
+; X64-NEXT:    movq %rsi, %xmm0
+; X64-NEXT:    movq %rdi, %xmm1
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT:    movq %rcx, %xmm0
+; X64-NEXT:    movq %rdx, %xmm2
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; X64-NEXT:    movl 16(%rbp), %eax
+; X64-NEXT:    andl $7, %eax
+; X64-NEXT:    movq %r8, %xmm0
+; X64-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movdqa %xmm1, (%rsp)
+; X64-NEXT:    movq (%rsp,%rax,8), %rax
+; X64-NEXT:    movq %rax, (%r9)
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+;
 ; X32-LABEL: PR23476:
-; X32: andl $7, %eax
-; X32:         movsd {{.*#+}} xmm0 = mem[0],zero
-; X32:         movsd {{.*#+}} xmm0 = mem[0],zero
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-64, %esp
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl 52(%ebp), %eax
+; X32-NEXT:    andl $7, %eax
+; X32-NEXT:    movl 48(%ebp), %ecx
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    movups 8(%ebp), %xmm1
+; X32-NEXT:    movups 24(%ebp), %xmm2
+; X32-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movaps %xmm1, (%esp)
+; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    movsd %xmm0, (%ecx)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X32AVX-LABEL: PR23476:
+; X32AVX:       # %bb.0:
+; X32AVX-NEXT:    pushl %ebp
+; X32AVX-NEXT:    movl %esp, %ebp
+; X32AVX-NEXT:    andl $-64, %esp
+; X32AVX-NEXT:    subl $128, %esp
+; X32AVX-NEXT:    movl 52(%ebp), %eax
+; X32AVX-NEXT:    andl $7, %eax
+; X32AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32AVX-NEXT:    movl 48(%ebp), %ecx
+; X32AVX-NEXT:    vmovups 8(%ebp), %ymm1
+; X32AVX-NEXT:    vmovaps %ymm1, (%esp)
+; X32AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32AVX-NEXT:    vmovsd %xmm0, (%ecx)
+; X32AVX-NEXT:    movl %ebp, %esp
+; X32AVX-NEXT:    popl %ebp
+; X32AVX-NEXT:    vzeroupper
+; X32AVX-NEXT:    retl
   %ext = extractelement <5 x i64> %in, i32 %index
   store i64 %ext, i64* %out, align 8
   ret void
 }
+
diff --git a/test/CodeGen/X86/iabs.ll b/test/CodeGen/X86/iabs.ll
index a731e53b2a06b288cd674f76f7a38bd43196ff74..99495700f5adb65e8ab9ac72bfe8e3f3559ad8f0 100644
--- a/test/CodeGen/X86/iabs.ll
+++ b/test/CodeGen/X86/iabs.ll
@@ -21,10 +21,10 @@ define i8 @test_i8(i8 %a) nounwind {
 ;
 ; X64-LABEL: test_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    sarb $7, %cl
-; X64-NEXT:    addb %cl, %al
+; X64-NEXT:    leal (%rdi,%rcx), %eax
 ; X64-NEXT:    xorb %cl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/test/CodeGen/X86/insert-into-constant-vector.ll b/test/CodeGen/X86/insert-into-constant-vector.ll
index 9a70bc8fffd30235570b11c46c966ea53d61a0fd..9d95f98a57b6f40a6ea34837e0a6f9511915256b 100644
--- a/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -3,6 +3,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=X64SSE --check-prefix=X64SSE2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=+sse4.1   | FileCheck %s --check-prefix=X32SSE --check-prefix=X32SSE4
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1   | FileCheck %s --check-prefix=X64SSE --check-prefix=X64SSE4
+; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=+avx      | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX1
 ; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=+avx2     | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2     | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=+avx512f  | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX512F
@@ -269,21 +271,51 @@ define <8 x i32> @elt7_v8i32(i32 %x) {
 ; X64SSE4-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
 ; X64SSE4-NEXT:    retq
 ;
-; X32AVX-LABEL: elt7_v8i32:
-; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
-; X32AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X32AVX-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32AVX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X32AVX-NEXT:    retl
+; X32AVX1-LABEL: elt7_v8i32:
+; X32AVX1:       # %bb.0:
+; X32AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X32AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32AVX1-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
+; X32AVX1-NEXT:    retl
+;
+; X64AVX1-LABEL: elt7_v8i32:
+; X64AVX1:       # %bb.0:
+; X64AVX1-NEXT:    vmovd %edi, %xmm0
+; X64AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X64AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64AVX1-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
+; X64AVX1-NEXT:    retq
+;
+; X32AVX2-LABEL: elt7_v8i32:
+; X32AVX2:       # %bb.0:
+; X32AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
+; X32AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32AVX2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
+; X32AVX2-NEXT:    retl
 ;
-; X64AVX-LABEL: elt7_v8i32:
-; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
-; X64AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64AVX-NEXT:    vpinsrd $3, %edi, %xmm1, %xmm1
-; X64AVX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64AVX-NEXT:    retq
+; X64AVX2-LABEL: elt7_v8i32:
+; X64AVX2:       # %bb.0:
+; X64AVX2-NEXT:    vmovd %edi, %xmm0
+; X64AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
+; X64AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
+; X64AVX2-NEXT:    retq
+;
+; X32AVX512F-LABEL: elt7_v8i32:
+; X32AVX512F:       # %bb.0:
+; X32AVX512F-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
+; X32AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32AVX512F-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
+; X32AVX512F-NEXT:    retl
+;
+; X64AVX512F-LABEL: elt7_v8i32:
+; X64AVX512F:       # %bb.0:
+; X64AVX512F-NEXT:    vmovd %edi, %xmm0
+; X64AVX512F-NEXT:    vpbroadcastd %xmm0, %xmm0
+; X64AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
+; X64AVX512F-NEXT:    retq
    %ins = insertelement <8 x i32> <i32 42, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i32 %x, i32 7
    ret <8 x i32> %ins
 }
@@ -322,19 +354,28 @@ define <8 x float> @elt6_v8f32(float %x) {
 ;
 ; X32AVX-LABEL: elt6_v8f32:
 ; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    vmovaps {{.*#+}} ymm0 = <4.2E+1,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,u,7.0E+0>
-; X32AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X32AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; X32AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32AVX-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm0
+; X32AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
 ; X32AVX-NEXT:    retl
 ;
-; X64AVX-LABEL: elt6_v8f32:
-; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vmovaps {{.*#+}} ymm1 = <4.2E+1,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,u,7.0E+0>
-; X64AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; X64AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
-; X64AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64AVX-NEXT:    retq
+; X64AVX1-LABEL: elt6_v8f32:
+; X64AVX1:       # %bb.0:
+; X64AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64AVX1-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X64AVX1-NEXT:    retq
+;
+; X64AVX2-LABEL: elt6_v8f32:
+; X64AVX2:       # %bb.0:
+; X64AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; X64AVX2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X64AVX2-NEXT:    retq
+;
+; X64AVX512F-LABEL: elt6_v8f32:
+; X64AVX512F:       # %bb.0:
+; X64AVX512F-NEXT:    vbroadcastss %xmm0, %ymm0
+; X64AVX512F-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X64AVX512F-NEXT:    retq
    %ins = insertelement <8 x float> <float 42.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, float %x, i32 6
    ret <8 x float> %ins
 }
@@ -370,6 +411,24 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ; X64SSE4-NEXT:    movaps {{.*#+}} xmm3 = [6,7]
 ; X64SSE4-NEXT:    retq
 ;
+; X32AVX1-LABEL: elt5_v8i64:
+; X32AVX1:       # %bb.0:
+; X32AVX1-NEXT:    movl $4, %eax
+; X32AVX1-NEXT:    vmovd %eax, %xmm0
+; X32AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32AVX1-NEXT:    vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm1
+; X32AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
+; X32AVX1-NEXT:    retl
+;
+; X64AVX1-LABEL: elt5_v8i64:
+; X64AVX1:       # %bb.0:
+; X64AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,u,6,7>
+; X64AVX1-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
+; X64AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; X64AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,1,2,3]
+; X64AVX1-NEXT:    retq
+;
 ; X32AVX2-LABEL: elt5_v8i64:
 ; X32AVX2:       # %bb.0:
 ; X32AVX2-NEXT:    movl $4, %eax
@@ -382,9 +441,9 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ;
 ; X64AVX2-LABEL: elt5_v8i64:
 ; X64AVX2:       # %bb.0:
-; X64AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = <4,u,6,7>
-; X64AVX2-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; X64AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,u,6,7>
+; X64AVX2-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
+; X64AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,1,2,3]
 ; X64AVX2-NEXT:    retq
 ;
@@ -401,10 +460,10 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ;
 ; X64AVX512F-LABEL: elt5_v8i64:
 ; X64AVX512F:       # %bb.0:
+; X64AVX512F-NEXT:    vmovq %rdi, %xmm1
+; X64AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,8,6,7]
 ; X64AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <42,1,2,3,4,u,6,7>
-; X64AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; X64AVX512F-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
-; X64AVX512F-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; X64AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
 ; X64AVX512F-NEXT:    retq
    %ins = insertelement <8 x i64> <i64 42, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, i64 %x, i32 5
    ret <8 x i64> %ins
@@ -430,33 +489,51 @@ define <8 x double> @elt1_v8f64(double %x) {
 ; X64SSE-NEXT:    movaps %xmm4, %xmm0
 ; X64SSE-NEXT:    retq
 ;
+; X32AVX1-LABEL: elt1_v8f64:
+; X32AVX1:       # %bb.0:
+; X32AVX1-NEXT:    vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
+; X32AVX1-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3]
+; X32AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X32AVX1-NEXT:    retl
+;
+; X64AVX1-LABEL: elt1_v8f64:
+; X64AVX1:       # %bb.0:
+; X64AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
+; X64AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; X64AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X64AVX1-NEXT:    retq
+;
 ; X32AVX2-LABEL: elt1_v8f64:
 ; X32AVX2:       # %bb.0:
-; X32AVX2-NEXT:    vmovapd {{.*#+}} ymm0 = <4.2E+1,u,2.0E+0,3.0E+0>
-; X32AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; X32AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X32AVX2-NEXT:    vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
+; X32AVX2-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3]
 ; X32AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X32AVX2-NEXT:    retl
 ;
 ; X64AVX2-LABEL: elt1_v8f64:
 ; X64AVX2:       # %bb.0:
-; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <4.2E+1,u,2.0E+0,3.0E+0>
+; X64AVX2-NEXT:    vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64AVX2-NEXT:    retq
 ;
 ; X32AVX512F-LABEL: elt1_v8f64:
 ; X32AVX512F:       # %bb.0:
-; X32AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
-; X32AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; X32AVX512F-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; X32AVX512F-NEXT:    vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
+; X32AVX512F-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
+; X32AVX512F-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X32AVX512F-NEXT:    retl
 ;
 ; X64AVX512F-LABEL: elt1_v8f64:
 ; X64AVX512F:       # %bb.0:
-; X64AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
+; X64AVX512F-NEXT:    vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
 ; X64AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
 ; X64AVX512F-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X64AVX512F-NEXT:    retq
    %ins = insertelement <8 x double> <double 42.0, double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, double %x, i32 1
diff --git a/test/CodeGen/X86/insert-prefetch-inline.afdo b/test/CodeGen/X86/insert-prefetch-inline.afdo
new file mode 100644
index 0000000000000000000000000000000000000000..83b30f6e210ea007f79f95ecee1564056bcd2a5e
--- /dev/null
+++ b/test/CodeGen/X86/insert-prefetch-inline.afdo
@@ -0,0 +1,4 @@
+caller:0:0
+ 2:sum:0
+  3: 0 __prefetch_nta_0:23456
+  3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64
\ No newline at end of file
diff --git a/test/CodeGen/X86/insert-prefetch-inline.ll b/test/CodeGen/X86/insert-prefetch-inline.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5f8373f9480c9624f0d0222b301bd8c1ada51ebe
--- /dev/null
+++ b/test/CodeGen/X86/insert-prefetch-inline.ll
@@ -0,0 +1,76 @@
+; RUN: llc < %s -prefetch-hints-file=%S/insert-prefetch-inline.afdo | FileCheck %s
+;
+; Verify we can insert prefetch instructions in code belonging to inlined
+; functions.
+;
+; ModuleID = 'test.cc'
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define dso_local i32 @sum(i32* nocapture readonly %arr, i32 %pos1, i32 %pos2) local_unnamed_addr #0 !dbg !7 {
+entry:
+  %idxprom = sext i32 %pos1 to i64, !dbg !10
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom, !dbg !10
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !10, !tbaa !11
+  %idxprom1 = sext i32 %pos2 to i64, !dbg !15
+  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1, !dbg !15
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !15, !tbaa !11
+  %add = add nsw i32 %1, %0, !dbg !16
+  ret i32 %add, !dbg !17
+}
+
+; "caller" inlines "sum". The associated .afdo file references instructions
+; in "caller" that came from "sum"'s inlining.
+;
+; Function Attrs: norecurse nounwind readonly uwtable
+define dso_local i32 @caller(i32* nocapture readonly %arr) local_unnamed_addr #0 !dbg !18 {
+entry:
+  %0 = load i32, i32* %arr, align 4, !dbg !19, !tbaa !11
+  %arrayidx2.i = getelementptr inbounds i32, i32* %arr, i64 2, !dbg !21
+  %1 = load i32, i32* %arrayidx2.i, align 4, !dbg !21, !tbaa !11
+  %add.i = add nsw i32 %1, %0, !dbg !22
+  ret i32 %add.i, !dbg !23
+}
+
+attributes #0 = { "target-cpu"="x86-64" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
+!1 = !DIFile(filename: "test.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)"}
+!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !8, file: !8, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!8 = !DIFile(filename: "./test.h", directory: "/tmp")
+!9 = !DISubroutineType(types: !2)
+!10 = !DILocation(line: 6, column: 10, scope: !7)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C++ TBAA"}
+!15 = !DILocation(line: 6, column: 22, scope: !7)
+!16 = !DILocation(line: 6, column: 20, scope: !7)
+!17 = !DILocation(line: 6, column: 3, scope: !7)
+!18 = distinct !DISubprogram(name: "caller", linkageName: "caller", scope: !1, file: !1, line: 4, type: !9, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!19 = !DILocation(line: 6, column: 10, scope: !7, inlinedAt: !20)
+!20 = distinct !DILocation(line: 6, column: 10, scope: !18)
+!21 = !DILocation(line: 6, column: 22, scope: !7, inlinedAt: !20)
+!22 = !DILocation(line: 6, column: 20, scope: !7, inlinedAt: !20)
+!23 = !DILocation(line: 6, column: 3, scope: !18)
+
+; CHECK-LABEL: caller:
+; CHECK-LABEL: # %bb.0:
+; CHECK-NEXT: .loc 1 6 22 prologue_end
+; CHECK-NEXT: prefetchnta 23464(%rdi)
+; CHECK-NEXT: movl 8(%rdi), %eax
+; CHECK-NEXT: .loc 1 6 20 is_stmt 0 discriminator 2
+; CHECK-NEXT: prefetchnta 8764(%rdi)
+; CHECK-NEXT: prefetchnta 64(%rdi)
+; CHECK-NEXT: addl (%rdi), %eax
diff --git a/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo b/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo
new file mode 100644
index 0000000000000000000000000000000000000000..6385a498b8f925b2169ba4a6cfab57996dfada35
--- /dev/null
+++ b/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo
@@ -0,0 +1,2 @@
+main:0:0
+ 6: 0 __prefetch_nta_0:42
\ No newline at end of file
diff --git a/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..004fb56a56eb8a289705cc1ff2911b3f22e0617d
--- /dev/null
+++ b/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -prefetch-hints-file=%S/insert-prefetch-invalid-instr.afdo | FileCheck %s
+; ModuleID = 'prefetch.cc'
+source_filename = "prefetch.cc"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
+entry:
+  tail call void @llvm.prefetch(i8* inttoptr (i64 291 to i8*), i32 0, i32 0, i32 1), !dbg !9
+  tail call void @llvm.x86.avx512.gatherpf.dpd.512(i8 97, <8 x i32> undef, i8* null, i32 1, i32 2), !dbg !10
+  ret i32 291, !dbg !11
+}
+
+; Function Attrs: inaccessiblemem_or_argmemonly nounwind
+declare void @llvm.prefetch(i8* nocapture readonly, i32, i32, i32) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.x86.avx512.gatherpf.dpd.512(i8, <8 x i32>, i8*, i32, i32) #2
+
+attributes #0 = {"target-cpu"="x86-64" "target-features"="+avx512pf,+sse4.2,+ssse3"}
+attributes #1 = { inaccessiblemem_or_argmemonly nounwind }
+attributes #2 = { argmemonly nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
+!1 = !DIFile(filename: "prefetch.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 7.0.0 (trunk 327078) (llvm/trunk 327086)"}
+!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 12, column: 3, scope: !7)
+!10 = !DILocation(line: 14, column: 3, scope: !7)
+!11 = !DILocation(line: 15, column: 3, scope: !7)
+
+;CHECK-LABEL: main:
+;CHECK:       # %bb.0:
+;CHECK:       prefetchnta 291
+;CHECK-NOT:   prefetchnta 42(%rax,%ymm0)
+;CHECK:       vgatherpf1dpd (%rax,%ymm0) {%k1}
diff --git a/test/CodeGen/X86/insert-prefetch-other.afdo b/test/CodeGen/X86/insert-prefetch-other.afdo
new file mode 100644
index 0000000000000000000000000000000000000000..783da34f7f84c10b5ceb4342cb60cdbe302d2508
--- /dev/null
+++ b/test/CodeGen/X86/insert-prefetch-other.afdo
@@ -0,0 +1,3 @@
+sum:0:0
+ 1: 0 __prefetch_t0_1:0 __prefetch_t2_0:42
+ 1.1: 0 __prefetch_t1_0:18446744073709551615
diff --git a/test/CodeGen/X86/insert-prefetch.afdo b/test/CodeGen/X86/insert-prefetch.afdo
new file mode 100644
index 0000000000000000000000000000000000000000..96487e85eaaf271391b336a28fb15e4102339292
--- /dev/null
+++ b/test/CodeGen/X86/insert-prefetch.afdo
@@ -0,0 +1,3 @@
+sum:0:0
+ 1: 0 __prefetch_nta_1:0 __prefetch_nta_0:42
+ 1.1: 0 __prefetch_nta_0:18446744073709551615
diff --git a/test/CodeGen/X86/insert-prefetch.ll b/test/CodeGen/X86/insert-prefetch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9e77772df7746b96c42aefa3172da73d54fc986d
--- /dev/null
+++ b/test/CodeGen/X86/insert-prefetch.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -prefetch-hints-file=%S/insert-prefetch.afdo | FileCheck %s
+; RUN: llc < %s -prefetch-hints-file=%S/insert-prefetch-other.afdo | FileCheck %s -check-prefix=OTHERS
+;
+; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling:
+; int sum(int* arr, int pos1, int pos2) {
+;   return arr[pos1] + arr[pos2];
+; }
+;
+; NOTE: debug line numbers were adjusted such that the function would start
+; at line 15 (an arbitrary number). The sample profile file format uses
+; offsets from the start of the symbol instead of file-relative line numbers.
+; The .afdo file reflects that - the instructions are offset '1'.
+;
+; ModuleID = 'test.cc'
+source_filename = "test.cc"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @sum(i32* %arr, i32 %pos1, i32 %pos2) !dbg !35 !prof !37 {
+entry:
+  %idxprom = sext i32 %pos1 to i64, !dbg !38
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom, !dbg !38
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !38, !tbaa !39
+  %idxprom1 = sext i32 %pos2 to i64, !dbg !43
+  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1, !dbg !43
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !43, !tbaa !39
+  %add = add nsw i32 %1, %0, !dbg !44
+  ret i32 %add, !dbg !45
+}
+
+attributes #0 = { "target-cpu"="x86-64" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!33}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
+!1 = !DIFile(filename: "test.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 1, !"ProfileSummary", !7}
+!7 = !{!8, !9, !10, !11, !12, !13, !14, !15}
+!8 = !{!"ProfileFormat", !"SampleProfile"}
+!9 = !{!"TotalCount", i64 0}
+!10 = !{!"MaxCount", i64 0}
+!11 = !{!"MaxInternalCount", i64 0}
+!12 = !{!"MaxFunctionCount", i64 0}
+!13 = !{!"NumCounts", i64 2}
+!14 = !{!"NumFunctions", i64 1}
+!15 = !{!"DetailedSummary", !16}
+!16 = !{!17, !18, !19, !20, !21, !22, !22, !23, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32}
+!17 = !{i32 10000, i64 0, i32 0}
+!18 = !{i32 100000, i64 0, i32 0}
+!19 = !{i32 200000, i64 0, i32 0}
+!20 = !{i32 300000, i64 0, i32 0}
+!21 = !{i32 400000, i64 0, i32 0}
+!22 = !{i32 500000, i64 0, i32 0}
+!23 = !{i32 600000, i64 0, i32 0}
+!24 = !{i32 700000, i64 0, i32 0}
+!25 = !{i32 800000, i64 0, i32 0}
+!26 = !{i32 900000, i64 0, i32 0}
+!27 = !{i32 950000, i64 0, i32 0}
+!28 = !{i32 990000, i64 0, i32 0}
+!29 = !{i32 999000, i64 0, i32 0}
+!30 = !{i32 999900, i64 0, i32 0}
+!31 = !{i32 999990, i64 0, i32 0}
+!32 = !{i32 999999, i64 0, i32 0}
+!33 = !{!"clang version 7.0.0 (trunk 322593) (llvm/trunk 322526)"}
+!35 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 15, type: !36, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!36 = !DISubroutineType(types: !2)
+!37 = !{!"function_entry_count", i64 -1}
+!38 = !DILocation(line: 16, column: 10, scope: !35)
+!39 = !{!40, !40, i64 0}
+!40 = !{!"int", !41, i64 0}
+!41 = !{!"omnipotent char", !42, i64 0}
+!42 = !{!"Simple C++ TBAA"}
+!43 = !DILocation(line: 16, column: 22, scope: !35)
+!44 = !DILocation(line: 16, column: 20, scope: !35)
+!45 = !DILocation(line: 16, column: 3, scope: !35)
+
+;CHECK-LABEL: sum:
+;CHECK:       # %bb.0:
+;CHECK:       prefetchnta 42(%rdi,%rax,4)
+;CHECK-NEXT:  prefetchnta (%rdi,%rax,4)
+;CHECK-NEXT:  movl (%rdi,%rax,4), %eax
+;CHECK-NEXT:  .loc 1 16 20 discriminator 2  # test.cc:16:20
+;CHECK-NEXT:  prefetchnta -1(%rdi,%rcx,4)
+;CHECK-NEXT:  addl (%rdi,%rcx,4), %eax
+;CHECK-NEXT:  .loc 1 16 3                   # test.cc:16:3
+
+;OTHERS-LABEL: sum:
+;OTHERS:       # %bb.0:
+;OTHERS:       prefetcht2 42(%rdi,%rax,4)
+;OTHERS-NEXT:  prefetcht0 (%rdi,%rax,4)
+;OTHERS-NEXT:  movl (%rdi,%rax,4), %eax
+;OTHERS-NEXT:  .loc 1 16 20 discriminator 2  # test.cc:16:20
+;OTHERS-NEXT:  prefetcht1 -1(%rdi,%rcx,4)
+;OTHERS-NEXT:  addl (%rdi,%rcx,4), %eax
+;OTHERS-NEXT:  .loc 1 16 3                   # test.cc:16:3
diff --git a/test/CodeGen/X86/insertelement-ones.ll b/test/CodeGen/X86/insertelement-ones.ll
index 61f3c9673f6dd59b92f103ccee482a07291997c2..1d64053e9f6afe4f73cac13fc30472e0b1182784 100644
--- a/test/CodeGen/X86/insertelement-ones.ll
+++ b/test/CodeGen/X86/insertelement-ones.ll
@@ -344,15 +344,14 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
 ;
 ; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; SSSE3-NEXT:    movl $255, %eax
 ; SSSE3-NEXT:    movd %eax, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm2, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
+; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
 ; SSSE3-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_v16i8_x123456789ABCDEx:
@@ -420,22 +419,20 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
 ;
 ; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; SSSE3-NEXT:    movl $255, %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm3, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    movd %eax, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSSE3-NEXT:    por %xmm0, %xmm2
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[15]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero
-; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    pshufb %xmm3, %xmm1
-; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
+; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
diff --git a/test/CodeGen/X86/insertps-combine.ll b/test/CodeGen/X86/insertps-combine.ll
index 6807048a05e5fc5761ea366a50d5d08317e772e5..b99605ad50be00354f9a087b9c841aaeab15cebb 100644
--- a/test/CodeGen/X86/insertps-combine.ll
+++ b/test/CodeGen/X86/insertps-combine.ll
@@ -284,13 +284,12 @@ define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
 define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
 ; SSE-LABEL: extract_lane_insertps_6123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_6123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, <4 x float> *%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
diff --git a/test/CodeGen/X86/ipra-reg-alias.ll b/test/CodeGen/X86/ipra-reg-alias.ll
index 92bf1d1761005169c53d5f848c31821c65e31d3e..29b2111bf34ac876b4ddeb66eed2a410f020d2b4 100644
--- a/test/CodeGen/X86/ipra-reg-alias.ll
+++ b/test/CodeGen/X86/ipra-reg-alias.ll
@@ -1,12 +1,16 @@
-; RUN: llc -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
-target triple = "x86_64--"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-- -enable-ipra -o - < %s | FileCheck %s
 
 define i8 @main(i8 %X) {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal (%rdi,%rdi,4), %eax
+; CHECK-NEXT:    addb $5, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %inc = add i8 %X, 1
   %inc2 = mul i8 %inc, 5
-; Here only CL is clobbred so CH should not be clobbred, but CX, ECX and RCX
-; should be clobbered.
-; CHECK: main Clobbered Registers: $ah $al $ax $cl $cx $eax $ecx $eflags $hax $rax $rcx
   ret i8 %inc2
 }
 
diff --git a/test/CodeGen/X86/known-bits-vector.ll b/test/CodeGen/X86/known-bits-vector.ll
index 3da72f0b2f61b867e31f117cbd8114953e1bdd4b..d3e4843f4abed947be28bd6ef1e27d9bf92f754d 100644
--- a/test/CodeGen/X86/known-bits-vector.ll
+++ b/test/CodeGen/X86/known-bits-vector.ll
@@ -5,13 +5,17 @@
 define i32 @knownbits_mask_extract_sext(<8 x i16> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_extract_sext:
 ; X32:       # %bb.0:
-; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    movl $15, %eax
+; X32-NEXT:    vmovd %eax, %xmm1
+; X32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpextrw $0, %xmm0, %eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_extract_sext:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    movl $15, %eax
+; X64-NEXT:    vmovd %eax, %xmm1
+; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpextrw $0, %xmm0, %eax
 ; X64-NEXT:    retq
   %1 = and <8 x i16> %a0, <i16 15, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -158,19 +162,17 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
 define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_or_shuffle_uitofp:
 ; X32:       # %bb.0:
-; X32-NEXT:    vpor {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; X32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT:    vandps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vorps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; X64-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X64-NEXT:    retq
   %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -497,14 +499,12 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
 define <4 x i32> @knownbits_umax_shuffle_ashr(<4 x i32> %a0) {
 ; X32-LABEL: knownbits_umax_shuffle_ashr:
 ; X32:       # %bb.0:
-; X32-NEXT:    vpmaxud {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; X32-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_umax_shuffle_ashr:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; X64-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    retq
   %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> <i32 65535, i32 -1, i32 -1, i32 262143>)
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
diff --git a/test/CodeGen/X86/known-bits.ll b/test/CodeGen/X86/known-bits.ll
index 5066e4777cc6947198454f6922262ed12340030f..8f3b983251d10f2b8e557a21feb3e93cca6135c4 100644
--- a/test/CodeGen/X86/known-bits.ll
+++ b/test/CodeGen/X86/known-bits.ll
@@ -37,17 +37,17 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
 ; X32-NEXT:  .LBB0_1: # %CF
 ; X32-NEXT:    # =>This Loop Header: Depth=1
 ; X32-NEXT:    # Child Loop BB0_2 Depth 2
-; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    divl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    divl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    divl (%esp) # 4-byte Folded Reload
 ; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    divl (%esp) # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    divl %esi
 ; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    divl %esi
 ; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    divl %ebp
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB0_2: # %CF237
@@ -87,17 +87,17 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
 ; X64-NEXT:  .LBB0_1: # %CF
 ; X64-NEXT:    # =>This Loop Header: Depth=1
 ; X64-NEXT:    # Child Loop BB0_2 Depth 2
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    movl %r8d, %eax
-; X64-NEXT:    divl %r9d
 ; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divl %r9d
 ; X64-NEXT:    movl %r10d, %eax
-; X64-NEXT:    divl %r11d
 ; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divl %r11d
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    divl %ebx
 ; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    divl %ebx
 ; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divl %ebp
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB0_2: # %CF237
@@ -298,3 +298,36 @@ declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
 declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
 declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+
+define i32 @knownbits_fshl(i32 %a0) nounwind {
+; X32-LABEL: knownbits_fshl:
+; X32:       # %bb.0:
+; X32-NEXT:    movl $3, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_fshl:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $3, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.fshl.i32(i32 %a0, i32 -1, i32 5)
+  %2 = and i32 %1, 3
+  ret i32 %2
+}
+
+define i32 @knownbits_fshr(i32 %a0) nounwind {
+; X32-LABEL: knownbits_fshr:
+; X32:       # %bb.0:
+; X32-NEXT:    movl $3, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_fshr:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $3, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.fshr.i32(i32 %a0, i32 -1, i32 5)
+  %2 = and i32 %1, 3
+  ret i32 %2
+}
+
+declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
+declare i32 @llvm.fshr.i32(i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll
index 64bca7330689598cb0857000d7e5b38ec6aaca32..0e27c71362b8a76f5048a211b34d675ddb9ab445 100644
--- a/test/CodeGen/X86/known-signbits-vector.ll
+++ b/test/CodeGen/X86/known-signbits-vector.ll
@@ -28,10 +28,9 @@ define <4 x float> @signbits_sext_v4i64_sitofp_v4f32(i8 signext %a0, i16 signext
 ; X32-NEXT:    movswl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vmovd %ecx, %xmm0
-; X32-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
@@ -75,11 +74,9 @@ define float @signbits_ashr_extract_sitofp_0(<2 x i64> %a0) nounwind {
 ;
 ; X64-LABEL: signbits_ashr_extract_sitofp_0:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrad $31, %xmm0, %xmm1
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X64-NEXT:    vpsrlq $32, %xmm0, %xmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
-; X64-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X64-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
 ; X64-NEXT:    retq
   %1 = ashr <2 x i64> %a0, <i64 32, i64 32>
   %2 = extractelement <2 x i64> %1, i32 0
@@ -91,9 +88,7 @@ define float @signbits_ashr_extract_sitofp_1(<2 x i64> %a0) nounwind {
 ; X32-LABEL: signbits_ashr_extract_sitofp_1:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vpsrlq $63, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
@@ -106,9 +101,7 @@ define float @signbits_ashr_extract_sitofp_1(<2 x i64> %a0) nounwind {
 ;
 ; X64-LABEL: signbits_ashr_extract_sitofp_1:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrlq $63, %xmm0, %xmm1
 ; X64-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483648,1]
 ; X64-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
@@ -125,9 +118,7 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
 ; X32-LABEL: signbits_ashr_shl_extract_sitofp:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vpsrlq $60, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
@@ -141,9 +132,7 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
 ;
 ; X64-LABEL: signbits_ashr_shl_extract_sitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrlq $60, %xmm0, %xmm1
 ; X64-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,8]
 ; X64-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
@@ -168,8 +157,6 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
 ; X32-NEXT:    sarl $30, %ecx
 ; X32-NEXT:    vmovd %eax, %xmm0
 ; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X32-NEXT:    vpsrlq $3, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
 ; X32-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
@@ -181,14 +168,10 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
 ; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
 ; X64:       # %bb.0:
 ; X64-NEXT:    sarq $30, %rdi
-; X64-NEXT:    vmovq %rsi, %xmm0
-; X64-NEXT:    vmovq %rdi, %xmm1
-; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-NEXT:    vpsrad $3, %xmm0, %xmm1
+; X64-NEXT:    vmovq %rdi, %xmm0
 ; X64-NEXT:    vpsrlq $3, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; X64-NEXT:    vmovq %xmm0, %rax
-; X64-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X64-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
 ; X64-NEXT:    retq
   %1 = ashr i64 %a0, 30
   %2 = insertelement <2 x i64> undef, i64 %1, i32 0
@@ -231,6 +214,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
   ret <4 x double> %3
 }
 
+; TODO: Fix vpshufd+vpsrlq -> vpshufd/vpermilps
 define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
 ; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X32:       # %bb.0:
@@ -240,7 +224,8 @@ define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4
 ;
 ; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; X64-NEXT:    retq
   %1 = ashr <2 x i64> %a0, <i64 16, i64 16>
@@ -256,23 +241,14 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ; X32-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vpsrlq $60, %xmm0, %xmm2
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
-; X32-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; X32-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
-; X32-NEXT:    sarl $31, %eax
-; X32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-NEXT:    vpsllq $20, %xmm1, %xmm1
-; X32-NEXT:    vpsrad $20, %xmm1, %xmm2
-; X32-NEXT:    vpsrlq $20, %xmm1, %xmm1
-; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
+; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -280,21 +256,15 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ;
 ; X64-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrlq $60, %xmm0, %xmm2
 ; X64-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,8]
-; X64-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; X64-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,8]
+; X64-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    movslq %edi, %rax
-; X64-NEXT:    vpinsrq $0, %rax, %xmm1, %xmm1
-; X64-NEXT:    vpsllq $20, %xmm1, %xmm1
-; X64-NEXT:    vpsrad $20, %xmm1, %xmm2
-; X64-NEXT:    vpsrlq $20, %xmm1, %xmm1
-; X64-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X64-NEXT:    vmovq %rax, %xmm1
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
-; X64-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
+; X64-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
 ; X64-NEXT:    retq
   %1 = ashr <2 x i64> %a0, <i64 61, i64 60>
   %2 = sext i32 %a2 to i64
@@ -364,15 +334,11 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X32-NEXT:    vpmovsxdq 16(%ebp), %xmm3
 ; X32-NEXT:    vpmovsxdq 8(%ebp), %xmm4
 ; X32-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; X32-NEXT:    vpsrlq $63, %xmm5, %xmm6
 ; X32-NEXT:    vpsrlq $33, %xmm5, %xmm5
-; X32-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,16384,0,0,1,0,0,0]
 ; X32-NEXT:    vpxor %xmm6, %xmm5, %xmm5
 ; X32-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
-; X32-NEXT:    vpsrlq $63, %xmm2, %xmm7
 ; X32-NEXT:    vpsrlq $33, %xmm2, %xmm2
-; X32-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7]
 ; X32-NEXT:    vpxor %xmm6, %xmm2, %xmm2
 ; X32-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
 ; X32-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
@@ -395,15 +361,11 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X64-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; X64-NEXT:    vpsrlq $63, %xmm4, %xmm5
 ; X64-NEXT:    vpsrlq $33, %xmm4, %xmm4
-; X64-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm5 = [1073741824,1]
 ; X64-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; X64-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; X64-NEXT:    vpsrlq $63, %xmm2, %xmm6
 ; X64-NEXT:    vpsrlq $33, %xmm2, %xmm2
-; X64-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
 ; X64-NEXT:    vpxor %xmm5, %xmm2, %xmm2
 ; X64-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
 ; X64-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
diff --git a/test/CodeGen/X86/legalize-shl-vec.ll b/test/CodeGen/X86/legalize-shl-vec.ll
index dcce43715039576c6cafea9926c4b03a1c8f3b1a..ff8b3332075f9c5db690e1b52e6becab8d03f706 100644
--- a/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/test/CodeGen/X86/legalize-shl-vec.ll
@@ -30,7 +30,9 @@ define <2 x i256> @test_shl(<2 x i256> %In) {
 ; X32-NEXT:    movl %ecx, 36(%eax)
 ; X32-NEXT:    shll $2, %edx
 ; X32-NEXT:    movl %edx, 32(%eax)
-; X32-NEXT:    movl $0, 28(%eax)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    shll $31, %ecx
+; X32-NEXT:    movl %ecx, 28(%eax)
 ; X32-NEXT:    movl $0, 24(%eax)
 ; X32-NEXT:    movl $0, 20(%eax)
 ; X32-NEXT:    movl $0, 16(%eax)
@@ -45,20 +47,22 @@ define <2 x i256> @test_shl(<2 x i256> %In) {
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT:    shldq $2, %rcx, %rdx
-; X64-NEXT:    shldq $2, %rsi, %rcx
-; X64-NEXT:    shldq $2, %r9, %rsi
+; X64-NEXT:    shldq $2, %rdi, %rcx
+; X64-NEXT:    shldq $2, %r9, %rdi
+; X64-NEXT:    shlq $63, %rsi
 ; X64-NEXT:    shlq $2, %r9
-; X64-NEXT:    movq %rdx, 56(%rdi)
-; X64-NEXT:    movq %rcx, 48(%rdi)
-; X64-NEXT:    movq %rsi, 40(%rdi)
-; X64-NEXT:    movq %r9, 32(%rdi)
+; X64-NEXT:    movq %rdx, 56(%rax)
+; X64-NEXT:    movq %rcx, 48(%rax)
+; X64-NEXT:    movq %rdi, 40(%rax)
+; X64-NEXT:    movq %r9, 32(%rax)
+; X64-NEXT:    movq %rsi, 24(%rax)
 ; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    movaps %xmm0, 16(%rdi)
-; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    movaps %xmm0, (%rax)
+; X64-NEXT:    movq $0, 16(%rax)
 ; X64-NEXT:    retq
-  %Amt = insertelement <2 x i256> <i256 1, i256 2>, i256 -1, i32 0
+  %Amt = insertelement <2 x i256> <i256 1, i256 2>, i256 255, i32 0
   %Out = shl <2 x i256> %In, %Amt
   ret <2 x i256> %Out
 }
@@ -110,6 +114,9 @@ define <2 x i256> @test_srl(<2 x i256> %In) {
 ; X32-NEXT:    movl %ebx, 40(%eax)
 ; X32-NEXT:    movl %ebp, 36(%eax)
 ; X32-NEXT:    movl %ecx, 32(%eax)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    shrl $31, %ecx
+; X32-NEXT:    movl %ecx, (%eax)
 ; X32-NEXT:    movl $0, 28(%eax)
 ; X32-NEXT:    movl $0, 24(%eax)
 ; X32-NEXT:    movl $0, 20(%eax)
@@ -117,7 +124,6 @@ define <2 x i256> @test_srl(<2 x i256> %In) {
 ; X32-NEXT:    movl $0, 12(%eax)
 ; X32-NEXT:    movl $0, 8(%eax)
 ; X32-NEXT:    movl $0, 4(%eax)
-; X32-NEXT:    movl $0, (%eax)
 ; X32-NEXT:    addl $8, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 20
 ; X32-NEXT:    popl %esi
@@ -138,17 +144,19 @@ define <2 x i256> @test_srl(<2 x i256> %In) {
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; X64-NEXT:    shrdq $4, %rsi, %r9
 ; X64-NEXT:    shrdq $4, %rcx, %rsi
+; X64-NEXT:    shrq $63, %r8
 ; X64-NEXT:    shrdq $4, %rdx, %rcx
 ; X64-NEXT:    shrq $4, %rdx
 ; X64-NEXT:    movq %rdx, 56(%rdi)
 ; X64-NEXT:    movq %rcx, 48(%rdi)
 ; X64-NEXT:    movq %rsi, 40(%rdi)
 ; X64-NEXT:    movq %r9, 32(%rdi)
+; X64-NEXT:    movq %r8, (%rdi)
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, 16(%rdi)
-; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    movq $0, 8(%rdi)
 ; X64-NEXT:    retq
-  %Amt = insertelement <2 x i256> <i256 3, i256 4>, i256 -1, i32 0
+  %Amt = insertelement <2 x i256> <i256 3, i256 4>, i256 255, i32 0
   %Out = lshr <2 x i256> %In, %Amt
   ret <2 x i256> %Out
 }
@@ -242,7 +250,7 @@ define <2 x i256> @test_sra(<2 x i256> %In) {
 ; X64-NEXT:    movq %r8, 8(%rdi)
 ; X64-NEXT:    movq %r8, (%rdi)
 ; X64-NEXT:    retq
-  %Amt = insertelement <2 x i256> <i256 5, i256 6>, i256 -1, i32 0
+  %Amt = insertelement <2 x i256> <i256 5, i256 6>, i256 255, i32 0
   %Out = ashr <2 x i256> %In, %Amt
   ret <2 x i256> %Out
 }
diff --git a/test/CodeGen/X86/llc-start-stop-instance.ll b/test/CodeGen/X86/llc-start-stop-instance.ll
new file mode 100644
index 0000000000000000000000000000000000000000..89a77fc44f3e9573661eb9b7de1c0b224f29d79a
--- /dev/null
+++ b/test/CodeGen/X86/llc-start-stop-instance.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mtriple=x86_64-- -debug-pass=Structure -stop-after=dead-mi-elimination,1 %s -o /dev/null 2>&1 \
+; RUN:   | FileCheck -check-prefix=STOP-AFTER-DEAD1 %s
+
+; RUN: llc -mtriple=x86_64-- -debug-pass=Structure -stop-after=dead-mi-elimination,0 %s -o /dev/null 2>&1 \
+; RUN:   | FileCheck -check-prefix=STOP-AFTER-DEAD0 %s
+
+
+; RUN: llc -mtriple=x86_64-- -debug-pass=Structure -stop-before=dead-mi-elimination,1 %s -o /dev/null 2>&1 \
+; RUN:   | FileCheck -check-prefix=STOP-BEFORE-DEAD1 %s
+
+
+; RUN: llc -mtriple=x86_64-- -debug-pass=Structure -start-before=dead-mi-elimination,1 %s -o /dev/null 2>&1 \
+; RUN:   | FileCheck -check-prefix=START-BEFORE-DEAD1 %s
+
+; RUN: llc -mtriple=x86_64-- -debug-pass=Structure -start-after=dead-mi-elimination,1 %s -o /dev/null 2>&1 \
+; RUN:   | FileCheck -check-prefix=START-AFTER-DEAD1 %s
+
+
+
+; STOP-AFTER-DEAD1: -dead-mi-elimination
+; STOP-AFTER-DEAD1-SAME: -dead-mi-elimination
+
+; STOP-AFTER-DEAD1: Remove dead machine instructions
+; STOP-AFTER-DEAD1: Remove dead machine instructions
+
+
+
+; STOP-AFTER-DEAD0:     -dead-mi-elimination
+
+; STOP-AFTER-DEAD0-NOT: Remove dead machine instructions
+; STOP-AFTER-DEAD0: Remove dead machine instructions
+; STOP-AFTER-DEAD0-NOT: Remove dead machine instructions
+
+
+
+; STOP-BEFORE-DEAD1:     -dead-mi-elimination
+; STOP-BEFORE-DEAD1: Remove dead machine instructions
+; STOP-BEFORE-DEAD1-NOT: Remove dead machine instructions
+
+
+
+; START-BEFORE-DEAD1:     -dead-mi-elimination
+; START-BEFORE-DEAD1-NOT: Remove dead machine instructions
+; START-BEFORE-DEAD1: Remove dead machine instructions
+; START-BEFORE-DEAD1-NOT: Remove dead machine instructions
+
+
+
+; START-AFTER-DEAD1-NOT: -dead-mi-elimination
+; START-AFTER-DEAD1-NOT: Remove dead machine instructions
diff --git a/test/CodeGen/X86/load-scalar-as-vector.ll b/test/CodeGen/X86/load-scalar-as-vector.ll
index 6a7bcf658e4a6789d7d9a7fbedea2b4ba2a1114c..7bc9ee0fd6b75a9d8a36242311eef6ee63d72700 100644
--- a/test/CodeGen/X86/load-scalar-as-vector.ll
+++ b/test/CodeGen/X86/load-scalar-as-vector.ll
@@ -518,33 +518,29 @@ define <2 x i64> @urem_op0_constant(i64* %p) nounwind {
 define <16 x i8> @urem_op1_constant(i8* %p) nounwind {
 ; SSE-LABEL: urem_op1_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movb (%rdi), %cl
-; SSE-NEXT:    movl %ecx, %eax
-; SSE-NEXT:    shrb %al
+; SSE-NEXT:    movb (%rdi), %al
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    shrb %cl
+; SSE-NEXT:    movzbl %cl, %ecx
+; SSE-NEXT:    imull $49, %ecx, %ecx
+; SSE-NEXT:    shrl $10, %ecx
+; SSE-NEXT:    imull $42, %ecx, %ecx
+; SSE-NEXT:    subb %cl, %al
 ; SSE-NEXT:    movzbl %al, %eax
-; SSE-NEXT:    imull $49, %eax, %eax
-; SSE-NEXT:    shrl $10, %eax
-; SSE-NEXT:    movb $42, %dl
-; SSE-NEXT:    # kill: def $al killed $al killed $eax
-; SSE-NEXT:    mulb %dl
-; SSE-NEXT:    subb %al, %cl
-; SSE-NEXT:    movzbl %cl, %eax
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: urem_op1_constant:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movb (%rdi), %cl
-; AVX-NEXT:    movl %ecx, %eax
-; AVX-NEXT:    shrb %al
+; AVX-NEXT:    movb (%rdi), %al
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    shrb %cl
+; AVX-NEXT:    movzbl %cl, %ecx
+; AVX-NEXT:    imull $49, %ecx, %ecx
+; AVX-NEXT:    shrl $10, %ecx
+; AVX-NEXT:    imull $42, %ecx, %ecx
+; AVX-NEXT:    subb %cl, %al
 ; AVX-NEXT:    movzbl %al, %eax
-; AVX-NEXT:    imull $49, %eax, %eax
-; AVX-NEXT:    shrl $10, %eax
-; AVX-NEXT:    movb $42, %dl
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    mulb %dl
-; AVX-NEXT:    subb %al, %cl
-; AVX-NEXT:    movzbl %cl, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    retq
   %x = load i8, i8* %p
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index b55b43fafa5e37b30e0374637b688966330886b0..281aaca7c8d1fca69394060065d60b799b868bbb 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -63,8 +63,8 @@ define void @commute(i32 %test_case, i32 %scale) nounwind ssp {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    imull %edi, %esi
 ; CHECK-NEXT:    leal (%rsi,%rsi,2), %esi
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    # kill: def $edi killed $edi killed $rdi
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    callq printf
 ; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    .p2align 4, 0x90
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index bf46887b0748243f4e2d71040cf84aaa06cd6b44..78c6dbe4af0d00bdae85cab7662c99cce8015d58 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -234,16 +234,14 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB2_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vmovdqu (%rdi,%rcx,2), %ymm2
-; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm2
+; AVX1-NEXT:    vmovdqu 16(%rsi,%rcx,2), %xmm3
+; AVX1-NEXT:    vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $16, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB2_1
@@ -398,46 +396,41 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-LABEL: _Z10test_shortPsS_i_1024:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    movl %edx, %eax
-; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    xorl %ecx, %ecx
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB3_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vmovdqu (%rdi,%rcx,2), %ymm3
-; AVX1-NEXT:    vmovdqu 32(%rdi,%rcx,2), %ymm4
-; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm5
-; AVX1-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm6
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
-; AVX1-NEXT:    vpmaddwd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm3
+; AVX1-NEXT:    vmovdqu 16(%rsi,%rcx,2), %xmm4
+; AVX1-NEXT:    vmovdqu 32(%rsi,%rcx,2), %xmm5
+; AVX1-NEXT:    vmovdqu 48(%rsi,%rcx,2), %xmm6
+; AVX1-NEXT:    vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaddwd %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT:    vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $16, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB3_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
-; AVX1-NEXT:    vpaddd %xmm8, %xmm2, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm4
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -954,38 +947,38 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB7_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    movq {{.*#+}} xmm6 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm6
 ; SSE2-NEXT:    movq {{.*#+}} xmm7 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm7
 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    movq {{.*#+}} xmm6 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm6
-; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm5
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmaddwd %xmm7, %xmm2
+; SSE2-NEXT:    pmaddwd %xmm5, %xmm2
 ; SSE2-NEXT:    paddd %xmm2, %xmm9
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
+; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
 ; SSE2-NEXT:    paddd %xmm2, %xmm4
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    pmaddwd %xmm6, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm3
+; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    pmaddwd %xmm7, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm3
 ; SSE2-NEXT:    addq $32, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB7_1
@@ -1042,8 +1035,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
@@ -2355,13 +2347,11 @@ define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) {
 ;
 ; AVX1-LABEL: pmaddwd_256:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vpmaddwd 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX256-LABEL: pmaddwd_256:
@@ -2400,20 +2390,16 @@ define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) {
 ;
 ; AVX1-LABEL: pmaddwd_512:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT:    vpmaddwd 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpmaddwd 48(%rsi), %xmm3, %xmm1
+; AVX1-NEXT:    vpmaddwd 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: pmaddwd_512:
@@ -2458,61 +2444,53 @@ define <32 x i32> @pmaddwd_1024(<64 x i16>* %Aptr, <64 x i16>* %Bptr) {
 ; SSE2-LABEL: pmaddwd_1024:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    movdqa 112(%rsi), %xmm0
-; SSE2-NEXT:    movdqa 96(%rsi), %xmm1
-; SSE2-NEXT:    movdqa 80(%rsi), %xmm2
-; SSE2-NEXT:    movdqa 64(%rsi), %xmm3
-; SSE2-NEXT:    movdqa (%rsi), %xmm4
-; SSE2-NEXT:    movdqa 16(%rsi), %xmm5
-; SSE2-NEXT:    movdqa 32(%rsi), %xmm6
-; SSE2-NEXT:    movdqa 48(%rsi), %xmm7
-; SSE2-NEXT:    pmaddwd (%rdx), %xmm4
-; SSE2-NEXT:    pmaddwd 16(%rdx), %xmm5
-; SSE2-NEXT:    pmaddwd 32(%rdx), %xmm6
-; SSE2-NEXT:    pmaddwd 48(%rdx), %xmm7
-; SSE2-NEXT:    pmaddwd 64(%rdx), %xmm3
-; SSE2-NEXT:    pmaddwd 80(%rdx), %xmm2
-; SSE2-NEXT:    pmaddwd 96(%rdx), %xmm1
-; SSE2-NEXT:    pmaddwd 112(%rdx), %xmm0
-; SSE2-NEXT:    movdqa %xmm0, 112(%rdi)
-; SSE2-NEXT:    movdqa %xmm1, 96(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
-; SSE2-NEXT:    movdqa %xmm3, 64(%rdi)
-; SSE2-NEXT:    movdqa %xmm7, 48(%rdi)
-; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
-; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
-; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    movdqa (%rsi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
+; SSE2-NEXT:    movdqa 32(%rsi), %xmm2
+; SSE2-NEXT:    movdqa 48(%rsi), %xmm3
+; SSE2-NEXT:    pmaddwd (%rdx), %xmm0
+; SSE2-NEXT:    pmaddwd 16(%rdx), %xmm1
+; SSE2-NEXT:    pmaddwd 32(%rdx), %xmm2
+; SSE2-NEXT:    pmaddwd 48(%rdx), %xmm3
+; SSE2-NEXT:    movdqa 64(%rsi), %xmm4
+; SSE2-NEXT:    pmaddwd 64(%rdx), %xmm4
+; SSE2-NEXT:    movdqa 80(%rsi), %xmm5
+; SSE2-NEXT:    pmaddwd 80(%rdx), %xmm5
+; SSE2-NEXT:    movdqa 96(%rsi), %xmm6
+; SSE2-NEXT:    pmaddwd 96(%rdx), %xmm6
+; SSE2-NEXT:    movdqa 112(%rsi), %xmm7
+; SSE2-NEXT:    pmaddwd 112(%rdx), %xmm7
+; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
+; SSE2-NEXT:    movdqa %xmm6, 96(%rdi)
+; SSE2-NEXT:    movdqa %xmm5, 80(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
+; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: pmaddwd_1024:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm8
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX1-NEXT:    vmovdqa 64(%rsi), %ymm6
-; AVX1-NEXT:    vmovdqa 96(%rsi), %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmaddwd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmaddwd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmaddwd %xmm9, %xmm8, %xmm4
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT:    vpmaddwd 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpmaddwd 48(%rsi), %xmm3, %xmm1
+; AVX1-NEXT:    vpmaddwd 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmovdqa 80(%rdi), %xmm2
+; AVX1-NEXT:    vpmaddwd 80(%rsi), %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX1-NEXT:    vpmaddwd 64(%rsi), %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vmovdqa 112(%rdi), %xmm3
+; AVX1-NEXT:    vpmaddwd 112(%rsi), %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX1-NEXT:    vpmaddwd 96(%rsi), %xmm4, %xmm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/masked_gather.ll b/test/CodeGen/X86/masked_gather.ll
new file mode 100644
index 0000000000000000000000000000000000000000..356ec1f00b5393075fa22812a710d40208082e44
--- /dev/null
+++ b/test/CodeGen/X86/masked_gather.ll
@@ -0,0 +1,1156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+sse4.2 < %s | FileCheck %s --check-prefixes=ALL,SSE,SSE42
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL,AVX,AVX1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx2 < %s | FileCheck %s --check-prefixes=ALL,AVX,AVX2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL,AVX512
+
+;
+; vXf32
+;
+
+define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger, <4 x float> %passthru) {
+; SSE-LABEL: gather_v4f32_ptr_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE-NEXT:    pextrb $0, %xmm4, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB0_2
+; SSE-NEXT:  # %bb.1: # %cond.load
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE-NEXT:    blendps {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3]
+; SSE-NEXT:  .LBB0_2: # %else
+; SSE-NEXT:    pextrb $4, %xmm4, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB0_4
+; SSE-NEXT:  # %bb.3: # %cond.load1
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE-NEXT:  .LBB0_4: # %else2
+; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE-NEXT:    pextrb $8, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB0_6
+; SSE-NEXT:  # %bb.5: # %cond.load4
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE-NEXT:  .LBB0_6: # %else5
+; SSE-NEXT:    pextrb $12, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB0_8
+; SSE-NEXT:  # %bb.7: # %cond.load7
+; SSE-NEXT:    pextrq $1, %xmm1, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; SSE-NEXT:  .LBB0_8: # %else8
+; SSE-NEXT:    movaps %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: gather_v4f32_ptr_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB0_2
+; AVX1-NEXT:  # %bb.1: # %cond.load
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX1-NEXT:  .LBB0_2: # %else
+; AVX1-NEXT:    vpextrb $4, %xmm3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB0_4
+; AVX1-NEXT:  # %bb.3: # %cond.load1
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT:  .LBB0_4: # %else2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpextrb $8, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB0_6
+; AVX1-NEXT:  # %bb.5: # %cond.load4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT:  .LBB0_6: # %else5
+; AVX1-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB0_8
+; AVX1-NEXT:  # %bb.7: # %cond.load7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT:  .LBB0_8: # %else8
+; AVX1-NEXT:    vmovaps %xmm2, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: gather_v4f32_ptr_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX2-NEXT:    vpextrb $0, %xmm3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB0_2
+; AVX2-NEXT:  # %bb.1: # %cond.load
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX2-NEXT:  .LBB0_2: # %else
+; AVX2-NEXT:    vpextrb $4, %xmm3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB0_4
+; AVX2-NEXT:  # %bb.3: # %cond.load1
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT:  .LBB0_4: # %else2
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpextrb $8, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB0_6
+; AVX2-NEXT:  # %bb.5: # %cond.load4
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT:  .LBB0_6: # %else5
+; AVX2-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB0_8
+; AVX2-NEXT:  # %bb.7: # %cond.load7
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT:  .LBB0_8: # %else8
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: gather_v4f32_ptr_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vptestnmd %zmm1, %zmm1, %k0
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
+; AVX512-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptr, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  ret <4 x float> %res
+}
+
+define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x i32> %trigger, <4 x float> %passthru) {
+; SSE-LABEL: gather_v4f32_v4i32_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; SSE-NEXT:    pmovsxdq %xmm0, %xmm4
+; SSE-NEXT:    psllq $2, %xmm4
+; SSE-NEXT:    paddq %xmm3, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmovsxdq %xmm0, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE-NEXT:    pextrb $0, %xmm5, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB1_2
+; SSE-NEXT:  # %bb.1: # %cond.load
+; SSE-NEXT:    movq %xmm4, %rax
+; SSE-NEXT:    movss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE-NEXT:    blendps {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3]
+; SSE-NEXT:  .LBB1_2: # %else
+; SSE-NEXT:    psllq $2, %xmm0
+; SSE-NEXT:    pextrb $4, %xmm5, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB1_4
+; SSE-NEXT:  # %bb.3: # %cond.load1
+; SSE-NEXT:    pextrq $1, %xmm4, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; SSE-NEXT:  .LBB1_4: # %else2
+; SSE-NEXT:    paddq %xmm0, %xmm3
+; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE-NEXT:    pextrb $8, %xmm1, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB1_6
+; SSE-NEXT:  # %bb.5: # %cond.load4
+; SSE-NEXT:    movq %xmm3, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE-NEXT:  .LBB1_6: # %else5
+; SSE-NEXT:    pextrb $12, %xmm1, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB1_8
+; SSE-NEXT:  # %bb.7: # %cond.load7
+; SSE-NEXT:    pextrq $1, %xmm3, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; SSE-NEXT:  .LBB1_8: # %else8
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: gather_v4f32_v4i32_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %rdi, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm4, %xmm4
+; AVX1-NEXT:    vpsllq $2, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB1_2
+; AVX1-NEXT:  # %bb.1: # %cond.load
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
+; AVX1-NEXT:  .LBB1_2: # %else
+; AVX1-NEXT:    vpextrb $4, %xmm3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB1_4
+; AVX1-NEXT:  # %bb.3: # %cond.load1
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT:  .LBB1_4: # %else2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpextrb $8, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB1_6
+; AVX1-NEXT:  # %bb.5: # %cond.load4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT:  .LBB1_6: # %else5
+; AVX1-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB1_8
+; AVX1-NEXT:  # %bb.7: # %cond.load7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT:  .LBB1_8: # %else8
+; AVX1-NEXT:    vmovaps %xmm2, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: gather_v4f32_v4i32_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %rdi, %xmm3
+; AVX2-NEXT:    vpbroadcastq %xmm3, %ymm3
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpsllq $2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX2-NEXT:    vpextrb $0, %xmm3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB1_2
+; AVX2-NEXT:  # %bb.1: # %cond.load
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX2-NEXT:  .LBB1_2: # %else
+; AVX2-NEXT:    vpextrb $4, %xmm3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB1_4
+; AVX2-NEXT:  # %bb.3: # %cond.load1
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT:  .LBB1_4: # %else2
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpextrb $8, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB1_6
+; AVX2-NEXT:  # %bb.5: # %cond.load4
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT:  .LBB1_6: # %else5
+; AVX2-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB1_8
+; AVX2-NEXT:  # %bb.7: # %cond.load7
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT:  .LBB1_8: # %else8
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: gather_v4f32_v4i32_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512-NEXT:    vptestnmd %zmm1, %zmm1, %k0
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
+; AVX512-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vptr0 = insertelement <4 x float*> undef, float* %base, i32 0
+  %vptr1 = shufflevector <4 x float*> %vptr0, <4 x float*> undef, <4 x i32> zeroinitializer
+  %vptr2 = getelementptr float, <4 x float*> %vptr1, <4 x i32> %idx
+
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %vptr2, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  ret <4 x float> %res
+}
+
+define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x i32> %trigger, <4 x float> %passthru) {
+; SSE-LABEL: gather_v4f32_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT:    psllq $2, %xmm0
+; SSE-NEXT:    paddq %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSE-NEXT:    pextrb $0, %xmm5, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB2_2
+; SSE-NEXT:  # %bb.1: # %cond.load
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE-NEXT:    blendps {{.*#+}} xmm3 = xmm6[0],xmm3[1,2,3]
+; SSE-NEXT:  .LBB2_2: # %else
+; SSE-NEXT:    psllq $2, %xmm1
+; SSE-NEXT:    pextrb $4, %xmm5, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB2_4
+; SSE-NEXT:  # %bb.3: # %cond.load1
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE-NEXT:  .LBB2_4: # %else2
+; SSE-NEXT:    paddq %xmm1, %xmm4
+; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE-NEXT:    pextrb $8, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB2_6
+; SSE-NEXT:  # %bb.5: # %cond.load4
+; SSE-NEXT:    movq %xmm4, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE-NEXT:  .LBB2_6: # %else5
+; SSE-NEXT:    pextrb $12, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB2_8
+; SSE-NEXT:  # %bb.7: # %cond.load7
+; SSE-NEXT:    pextrq $1, %xmm4, %rax
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; SSE-NEXT:  .LBB2_8: # %else8
+; SSE-NEXT:    movaps %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: gather_v4f32_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsllq $2, %xmm3, %xmm3
+; AVX1-NEXT:    vmovq %rdi, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsllq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB2_2
+; AVX1-NEXT:  # %bb.1: # %cond.load
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
+; AVX1-NEXT:  .LBB2_2: # %else
+; AVX1-NEXT:    vpextrb $4, %xmm3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB2_4
+; AVX1-NEXT:  # %bb.3: # %cond.load1
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT:  .LBB2_4: # %else2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpextrb $8, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB2_6
+; AVX1-NEXT:  # %bb.5: # %cond.load4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT:  .LBB2_6: # %else5
+; AVX1-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB2_8
+; AVX1-NEXT:  # %bb.7: # %cond.load7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT:  .LBB2_8: # %else8
+; AVX1-NEXT:    vmovaps %xmm2, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: gather_v4f32_v4i64_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %rdi, %xmm3
+; AVX2-NEXT:    vpbroadcastq %xmm3, %ymm3
+; AVX2-NEXT:    vpsllq $2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX2-NEXT:    vpextrb $0, %xmm3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB2_2
+; AVX2-NEXT:  # %bb.1: # %cond.load
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX2-NEXT:  .LBB2_2: # %else
+; AVX2-NEXT:    vpextrb $4, %xmm3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB2_4
+; AVX2-NEXT:  # %bb.3: # %cond.load1
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT:  .LBB2_4: # %else2
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpextrb $8, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB2_6
+; AVX2-NEXT:  # %bb.5: # %cond.load4
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT:  .LBB2_6: # %else5
+; AVX2-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB2_8
+; AVX2-NEXT:  # %bb.7: # %cond.load7
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT:  .LBB2_8: # %else8
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: gather_v4f32_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vptestnmd %zmm1, %zmm1, %k0
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
+; AVX512-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vptr0 = insertelement <4 x float*> undef, float* %base, i32 0
+  %vptr1 = shufflevector <4 x float*> %vptr0, <4 x float*> undef, <4 x i32> zeroinitializer
+  %vptr2 = getelementptr float, <4 x float*> %vptr1, <4 x i64> %idx
+
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %vptr2, i32 4, <4 x i1> %mask, <4 x float> %passthru)
+  ret <4 x float> %res
+}
+
+;
+; vXi8
+;
+
+define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8> %trigger, <16 x i8> %passthru) {
+; SSE-LABEL: gather_v16i8_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %xmm6
+; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmovsxdq %xmm0, %xmm0
+; SSE-NEXT:    paddq %xmm8, %xmm0
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm6
+; SSE-NEXT:    pextrb $0, %xmm6, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_2
+; SSE-NEXT:  # %bb.1: # %cond.load
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pinsrb $0, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_2: # %else
+; SSE-NEXT:    pmovsxdq %xmm7, %xmm7
+; SSE-NEXT:    pextrb $1, %xmm6, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_4
+; SSE-NEXT:  # %bb.3: # %cond.load1
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    pinsrb $1, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_4: # %else2
+; SSE-NEXT:    paddq %xmm8, %xmm7
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm6
+; SSE-NEXT:    pextrb $2, %xmm6, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_6
+; SSE-NEXT:  # %bb.5: # %cond.load4
+; SSE-NEXT:    movq %xmm7, %rax
+; SSE-NEXT:    pinsrb $2, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_6: # %else5
+; SSE-NEXT:    pmovsxdq %xmm1, %xmm0
+; SSE-NEXT:    pextrb $3, %xmm6, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_8
+; SSE-NEXT:  # %bb.7: # %cond.load7
+; SSE-NEXT:    pextrq $1, %xmm7, %rax
+; SSE-NEXT:    pinsrb $3, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_8: # %else8
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT:    paddq %xmm8, %xmm0
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm6
+; SSE-NEXT:    pextrb $4, %xmm6, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_10
+; SSE-NEXT:  # %bb.9: # %cond.load10
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pinsrb $4, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_10: # %else11
+; SSE-NEXT:    pmovsxdq %xmm1, %xmm1
+; SSE-NEXT:    pextrb $5, %xmm6, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_12
+; SSE-NEXT:  # %bb.11: # %cond.load13
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    pinsrb $5, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_12: # %else14
+; SSE-NEXT:    paddq %xmm8, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm6
+; SSE-NEXT:    pextrb $6, %xmm6, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_14
+; SSE-NEXT:  # %bb.13: # %cond.load16
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    pinsrb $6, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_14: # %else17
+; SSE-NEXT:    pmovsxdq %xmm2, %xmm0
+; SSE-NEXT:    pextrb $7, %xmm6, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_16
+; SSE-NEXT:  # %bb.15: # %cond.load19
+; SSE-NEXT:    pextrq $1, %xmm1, %rax
+; SSE-NEXT:    pinsrb $7, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_16: # %else20
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE-NEXT:    paddq %xmm8, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE-NEXT:    pextrb $8, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_18
+; SSE-NEXT:  # %bb.17: # %cond.load22
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pinsrb $8, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_18: # %else23
+; SSE-NEXT:    pmovsxdq %xmm1, %xmm1
+; SSE-NEXT:    pextrb $9, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_20
+; SSE-NEXT:  # %bb.19: # %cond.load25
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    pinsrb $9, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_20: # %else26
+; SSE-NEXT:    paddq %xmm8, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE-NEXT:    pextrb $10, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_22
+; SSE-NEXT:  # %bb.21: # %cond.load28
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    pinsrb $10, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_22: # %else29
+; SSE-NEXT:    pmovsxdq %xmm3, %xmm0
+; SSE-NEXT:    pextrb $11, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_24
+; SSE-NEXT:  # %bb.23: # %cond.load31
+; SSE-NEXT:    pextrq $1, %xmm1, %rax
+; SSE-NEXT:    pinsrb $11, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_24: # %else32
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE-NEXT:    paddq %xmm8, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE-NEXT:    pextrb $12, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_26
+; SSE-NEXT:  # %bb.25: # %cond.load34
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pinsrb $12, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_26: # %else35
+; SSE-NEXT:    pmovsxdq %xmm1, %xmm1
+; SSE-NEXT:    pextrb $13, %xmm2, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_28
+; SSE-NEXT:  # %bb.27: # %cond.load37
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    pinsrb $13, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_28: # %else38
+; SSE-NEXT:    paddq %xmm1, %xmm8
+; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    pcmpeqb %xmm0, %xmm4
+; SSE-NEXT:    pextrb $14, %xmm4, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_30
+; SSE-NEXT:  # %bb.29: # %cond.load40
+; SSE-NEXT:    movq %xmm8, %rax
+; SSE-NEXT:    pinsrb $14, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_30: # %else41
+; SSE-NEXT:    pextrb $15, %xmm4, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    je .LBB3_32
+; SSE-NEXT:  # %bb.31: # %cond.load43
+; SSE-NEXT:    pextrq $1, %xmm8, %rax
+; SSE-NEXT:    pinsrb $15, (%rax), %xmm5
+; SSE-NEXT:  .LBB3_32: # %else44
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: gather_v16i8_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %rdi, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm6
+; AVX1-NEXT:    vpextrb $0, %xmm6, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_2
+; AVX1-NEXT:  # %bb.1: # %cond.load
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $0, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_2: # %else
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vpextrb $1, %xmm6, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_4
+; AVX1-NEXT:  # %bb.3: # %cond.load1
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $1, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_4: # %else2
+; AVX1-NEXT:    vpmovsxdq %xmm7, %xmm6
+; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm8
+; AVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm7, %xmm2, %xmm7
+; AVX1-NEXT:    vpextrb $2, %xmm7, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_6
+; AVX1-NEXT:  # %bb.5: # %cond.load4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vmovq %xmm5, %rax
+; AVX1-NEXT:    vpinsrb $2, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_6: # %else5
+; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpextrb $3, %xmm7, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_8
+; AVX1-NEXT:  # %bb.7: # %cond.load7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $3, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_8: # %else8
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpextrb $4, %xmm5, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_10
+; AVX1-NEXT:  # %bb.9: # %cond.load10
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $4, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_10: # %else11
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm7
+; AVX1-NEXT:    vpextrb $5, %xmm5, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_12
+; AVX1-NEXT:  # %bb.11: # %cond.load13
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $5, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_12: # %else14
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpaddq %xmm7, %xmm4, %xmm8
+; AVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm7, %xmm2, %xmm7
+; AVX1-NEXT:    vpextrb $6, %xmm7, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_14
+; AVX1-NEXT:  # %bb.13: # %cond.load16
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vmovq %xmm5, %rax
+; AVX1-NEXT:    vpinsrb $6, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_14: # %else17
+; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpextrb $7, %xmm7, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_16
+; AVX1-NEXT:  # %bb.15: # %cond.load19
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $7, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_16: # %else20
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpextrb $8, %xmm5, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_18
+; AVX1-NEXT:  # %bb.17: # %cond.load22
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $8, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_18: # %else23
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpextrb $9, %xmm5, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_20
+; AVX1-NEXT:  # %bb.19: # %cond.load25
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $9, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_20: # %else26
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm5
+; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm6
+; AVX1-NEXT:    vpextrb $10, %xmm6, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_22
+; AVX1-NEXT:  # %bb.21: # %cond.load28
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vmovq %xmm7, %rax
+; AVX1-NEXT:    vpinsrb $10, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_22: # %else29
+; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpextrb $11, %xmm6, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_24
+; AVX1-NEXT:  # %bb.23: # %cond.load31
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $11, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_24: # %else32
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_26
+; AVX1-NEXT:  # %bb.25: # %cond.load34
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $12, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_26: # %else35
+; AVX1-NEXT:    vpextrb $13, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_28
+; AVX1-NEXT:  # %bb.27: # %cond.load37
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $13, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_28: # %else38
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpextrb $14, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_30
+; AVX1-NEXT:  # %bb.29: # %cond.load40
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovq %xmm2, %rax
+; AVX1-NEXT:    vpinsrb $14, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_30: # %else41
+; AVX1-NEXT:    vpextrb $15, %xmm1, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je .LBB3_32
+; AVX1-NEXT:  # %bb.31: # %cond.load43
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpinsrb $15, (%rax), %xmm3, %xmm3
+; AVX1-NEXT:  .LBB3_32: # %else44
+; AVX1-NEXT:    vmovdqa %xmm3, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: gather_v16i8_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %rdi, %xmm4
+; AVX2-NEXT:    vpbroadcastq %xmm4, %ymm4
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm5
+; AVX2-NEXT:    vpaddq %ymm5, %ymm4, %ymm5
+; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm6
+; AVX2-NEXT:    vpextrb $0, %xmm6, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_2
+; AVX2-NEXT:  # %bb.1: # %cond.load
+; AVX2-NEXT:    vmovq %xmm5, %rax
+; AVX2-NEXT:    vpinsrb $0, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_2: # %else
+; AVX2-NEXT:    vpextrb $1, %xmm6, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_4
+; AVX2-NEXT:  # %bb.3: # %cond.load1
+; AVX2-NEXT:    vpextrq $1, %xmm5, %rax
+; AVX2-NEXT:    vpinsrb $1, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_4: # %else2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm6
+; AVX2-NEXT:    vpextrb $2, %xmm6, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_6
+; AVX2-NEXT:  # %bb.5: # %cond.load4
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX2-NEXT:    vmovq %xmm7, %rax
+; AVX2-NEXT:    vpinsrb $2, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_6: # %else5
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpextrb $3, %xmm6, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_8
+; AVX2-NEXT:  # %bb.7: # %cond.load7
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vpextrq $1, %xmm5, %rax
+; AVX2-NEXT:    vpinsrb $3, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_8: # %else8
+; AVX2-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT:    vpextrb $4, %xmm5, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_10
+; AVX2-NEXT:  # %bb.9: # %cond.load10
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $4, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_10: # %else11
+; AVX2-NEXT:    vpextrb $5, %xmm5, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_12
+; AVX2-NEXT:  # %bb.11: # %cond.load13
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $5, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_12: # %else14
+; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT:    vpextrb $6, %xmm5, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_14
+; AVX2-NEXT:  # %bb.13: # %cond.load16
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; AVX2-NEXT:    vmovq %xmm6, %rax
+; AVX2-NEXT:    vpinsrb $6, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_14: # %else17
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm6
+; AVX2-NEXT:    vpextrb $7, %xmm5, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_16
+; AVX2-NEXT:  # %bb.15: # %cond.load19
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $7, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_16: # %else20
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm0
+; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT:    vpextrb $8, %xmm5, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_18
+; AVX2-NEXT:  # %bb.17: # %cond.load22
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $8, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_18: # %else23
+; AVX2-NEXT:    vpextrb $9, %xmm5, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_20
+; AVX2-NEXT:  # %bb.19: # %cond.load25
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $9, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_20: # %else26
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT:    vpextrb $10, %xmm5, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_22
+; AVX2-NEXT:  # %bb.21: # %cond.load28
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; AVX2-NEXT:    vmovq %xmm6, %rax
+; AVX2-NEXT:    vpinsrb $10, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_22: # %else29
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpextrb $11, %xmm5, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_24
+; AVX2-NEXT:  # %bb.23: # %cond.load31
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $11, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_24: # %else32
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_26
+; AVX2-NEXT:  # %bb.25: # %cond.load34
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $12, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_26: # %else35
+; AVX2-NEXT:    vpextrb $13, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_28
+; AVX2-NEXT:  # %bb.27: # %cond.load37
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $13, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_28: # %else38
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpextrb $14, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_30
+; AVX2-NEXT:  # %bb.29: # %cond.load40
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, %rax
+; AVX2-NEXT:    vpinsrb $14, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_30: # %else41
+; AVX2-NEXT:    vpextrb $15, %xmm1, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    je .LBB3_32
+; AVX2-NEXT:  # %bb.31: # %cond.load43
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vpinsrb $15, (%rax), %xmm3, %xmm3
+; AVX2-NEXT:  .LBB3_32: # %else44
+; AVX2-NEXT:    vmovdqa %xmm3, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: gather_v16i8_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %rdi, %zmm3
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm4
+; AVX512-NEXT:    vpaddq %zmm4, %zmm3, %zmm4
+; AVX512-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT:    vpmovsxbd %xmm5, %zmm5
+; AVX512-NEXT:    vptestmd %zmm5, %zmm5, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_2
+; AVX512-NEXT:  # %bb.1: # %cond.load
+; AVX512-NEXT:    vmovq %xmm4, %rax
+; AVX512-NEXT:    vpinsrb $0, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_2: # %else
+; AVX512-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_4
+; AVX512-NEXT:  # %bb.3: # %cond.load1
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX512-NEXT:    vpinsrb $1, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_4: # %else2
+; AVX512-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT:    vpmovsxbd %xmm5, %zmm5
+; AVX512-NEXT:    vptestmd %zmm5, %zmm5, %k0
+; AVX512-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512-NEXT:    kmovw %k1, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_6
+; AVX512-NEXT:  # %bb.5: # %cond.load4
+; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512-NEXT:    vmovq %xmm5, %rax
+; AVX512-NEXT:    vpinsrb $2, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_6: # %else5
+; AVX512-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_8
+; AVX512-NEXT:  # %bb.7: # %cond.load7
+; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rax
+; AVX512-NEXT:    vpinsrb $3, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_8: # %else8
+; AVX512-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT:    vpmovsxbd %xmm5, %zmm5
+; AVX512-NEXT:    vptestmd %zmm5, %zmm5, %k0
+; AVX512-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512-NEXT:    kmovw %k1, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_10
+; AVX512-NEXT:  # %bb.9: # %cond.load10
+; AVX512-NEXT:    vextracti32x4 $2, %zmm4, %xmm5
+; AVX512-NEXT:    vmovq %xmm5, %rax
+; AVX512-NEXT:    vpinsrb $4, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_10: # %else11
+; AVX512-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_12
+; AVX512-NEXT:  # %bb.11: # %cond.load13
+; AVX512-NEXT:    vextracti32x4 $2, %zmm4, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rax
+; AVX512-NEXT:    vpinsrb $5, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_12: # %else14
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT:    vpmovsxbd %xmm5, %zmm5
+; AVX512-NEXT:    vptestmd %zmm5, %zmm5, %k0
+; AVX512-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512-NEXT:    kmovw %k1, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_14
+; AVX512-NEXT:  # %bb.13: # %cond.load16
+; AVX512-NEXT:    vextracti32x4 $3, %zmm4, %xmm5
+; AVX512-NEXT:    vmovq %xmm5, %rax
+; AVX512-NEXT:    vpinsrb $6, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_14: # %else17
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_16
+; AVX512-NEXT:  # %bb.15: # %cond.load19
+; AVX512-NEXT:    vextracti32x4 $3, %zmm4, %xmm4
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX512-NEXT:    vpinsrb $7, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_16: # %else20
+; AVX512-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512-NEXT:    kmovw %k1, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_18
+; AVX512-NEXT:  # %bb.17: # %cond.load22
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpinsrb $8, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_18: # %else23
+; AVX512-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_20
+; AVX512-NEXT:  # %bb.19: # %cond.load25
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vpinsrb $9, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_20: # %else26
+; AVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512-NEXT:    kmovw %k1, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_22
+; AVX512-NEXT:  # %bb.21: # %cond.load28
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-NEXT:    vmovq %xmm3, %rax
+; AVX512-NEXT:    vpinsrb $10, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_22: # %else29
+; AVX512-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_24
+; AVX512-NEXT:  # %bb.23: # %cond.load31
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512-NEXT:    vpinsrb $11, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_24: # %else32
+; AVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512-NEXT:    kmovw %k1, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_26
+; AVX512-NEXT:  # %bb.25: # %cond.load34
+; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; AVX512-NEXT:    vmovq %xmm3, %rax
+; AVX512-NEXT:    vpinsrb $12, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_26: # %else35
+; AVX512-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_28
+; AVX512-NEXT:  # %bb.27: # %cond.load37
+; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512-NEXT:    vpinsrb $13, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_28: # %else38
+; AVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512-NEXT:    kmovw %k1, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_30
+; AVX512-NEXT:  # %bb.29: # %cond.load40
+; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    vpinsrb $14, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_30: # %else41
+; AVX512-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512-NEXT:    kmovw %k0, %eax
+; AVX512-NEXT:    testb $1, %al
+; AVX512-NEXT:    je .LBB3_32
+; AVX512-NEXT:  # %bb.31: # %cond.load43
+; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vpinsrb $15, (%rax), %xmm2, %xmm2
+; AVX512-NEXT:  .LBB3_32: # %else44
+; AVX512-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vptr0 = insertelement <16 x i8*> undef, i8* %base, i32 0
+  %vptr1 = shufflevector <16 x i8*> %vptr0, <16 x i8*> undef, <16 x i32> zeroinitializer
+  %vptr2 = getelementptr i8, <16 x i8*> %vptr1, <16 x i32> %idx
+
+  %mask = icmp eq <16 x i8> %trigger, zeroinitializer
+  %res = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %vptr2, i32 4, <16 x i1> %mask, <16 x i8> %passthru)
+  ret <16 x i8> %res
+}
+
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
+
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
+
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 0aef996e54e15b11456b2a88f6782d2afeec2021..82120537f913d530c2f3ba832d63c8a81f467978 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1850,7 +1850,7 @@ define <16 x float*> @test31(<16 x float**> %ptrs) {
 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
 ; KNL_64-LABEL: test_gather_16i32:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
@@ -1862,7 +1862,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
 ;
 ; KNL_32-LABEL: test_gather_16i32:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
@@ -1871,7 +1871,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
 ;
 ; SKX-LABEL: test_gather_16i32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k1
 ; SKX-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
@@ -1883,7 +1883,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
 ;
 ; SKX_32-LABEL: test_gather_16i32:
 ; SKX_32:       # %bb.0:
-; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
 ; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
@@ -1895,7 +1895,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
 ; KNL_64-LABEL: test_gather_16i64:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
@@ -1914,7 +1914,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
-; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
@@ -1930,7 +1930,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
 ;
 ; SKX-LABEL: test_gather_16i64:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
@@ -1949,7 +1949,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
 ; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
 ; SKX_32-NEXT:    andl $-64, %esp
 ; SKX_32-NEXT:    subl $64, %esp
-; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
 ; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
@@ -1969,7 +1969,7 @@ declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <
 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
 ; KNL_64-LABEL: test_gather_16f32:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
@@ -1981,7 +1981,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
 ;
 ; KNL_32-LABEL: test_gather_16f32:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
@@ -1990,7 +1990,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
 ;
 ; SKX-LABEL: test_gather_16f32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k1
 ; SKX-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
@@ -2002,7 +2002,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
 ;
 ; SKX_32-LABEL: test_gather_16f32:
 ; SKX_32:       # %bb.0:
-; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
 ; SKX_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
@@ -2014,7 +2014,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
 ; KNL_64-LABEL: test_gather_16f64:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
@@ -2033,7 +2033,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
-; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
@@ -2049,7 +2049,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
 ;
 ; SKX-LABEL: test_gather_16f64:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
@@ -2068,7 +2068,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
 ; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
 ; SKX_32-NEXT:    andl $-64, %esp
 ; SKX_32-NEXT:    subl $64, %esp
-; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
 ; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
@@ -2088,7 +2088,7 @@ declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs,
 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
 ; KNL_64-LABEL: test_scatter_16i32:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
@@ -2100,7 +2100,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ;
 ; KNL_32-LABEL: test_scatter_16i32:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
@@ -2109,7 +2109,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ;
 ; SKX-LABEL: test_scatter_16i32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
@@ -2121,7 +2121,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ;
 ; SKX_32-LABEL: test_scatter_16i32:
 ; SKX_32:       # %bb.0:
-; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
 ; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
@@ -2133,7 +2133,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
 ; KNL_64-LABEL: test_scatter_16i64:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
@@ -2151,7 +2151,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
-; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
@@ -2167,7 +2167,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ;
 ; SKX-LABEL: test_scatter_16i64:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
@@ -2185,7 +2185,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
 ; SKX_32-NEXT:    andl $-64, %esp
 ; SKX_32-NEXT:    subl $64, %esp
-; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
 ; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
@@ -2205,7 +2205,7 @@ declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*>
 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
 ; KNL_64-LABEL: test_scatter_16f32:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
@@ -2217,7 +2217,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ;
 ; KNL_32-LABEL: test_scatter_16f32:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
@@ -2226,7 +2226,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ;
 ; SKX-LABEL: test_scatter_16f32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
@@ -2238,7 +2238,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ;
 ; SKX_32-LABEL: test_scatter_16f32:
 ; SKX_32:       # %bb.0:
-; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
 ; SKX_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
@@ -2251,7 +2251,7 @@ declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x floa
 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
 ; KNL_64-LABEL: test_scatter_16f64:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
@@ -2269,7 +2269,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
-; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
@@ -2285,7 +2285,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ;
 ; SKX-LABEL: test_scatter_16f64:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
@@ -2303,7 +2303,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
 ; SKX_32-NEXT:    andl $-64, %esp
 ; SKX_32-NEXT:    subl $64, %esp
-; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
 ; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
diff --git a/test/CodeGen/X86/masked_load.ll b/test/CodeGen/X86/masked_load.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5a456bd2d718ed7b205a21dab44deacb4d2f6c3a
--- /dev/null
+++ b/test/CodeGen/X86/masked_load.ll
@@ -0,0 +1,2132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse2    | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse4.2  | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx     | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2    | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
+
+define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
+; SSE-LABEL: load_v1f64_v1i64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    testq %rdi, %rdi
+; SSE-NEXT:    jne LBB0_2
+; SSE-NEXT:  ## %bb.1: ## %cond.load
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:  LBB0_2: ## %else
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_v1f64_v1i64:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    testq %rdi, %rdi
+; AVX-NEXT:    jne LBB0_2
+; AVX-NEXT:  ## %bb.1: ## %cond.load
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:  LBB0_2: ## %else
+; AVX-NEXT:    retq
+  %mask = icmp eq <1 x i64> %trigger, zeroinitializer
+  %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
+  ret <1 x double> %res
+}
+
+define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
+; SSE2-LABEL: load_v2f64_v2i64:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB1_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT:  LBB1_2: ## %else
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB1_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:  LBB1_4: ## %else2
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_v2f64_v2i64:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pcmpeqq %xmm0, %xmm2
+; SSE42-NEXT:    pextrb $0, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB1_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT:  LBB1_2: ## %else
+; SSE42-NEXT:    pextrb $8, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB1_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT:  LBB1_4: ## %else2
+; SSE42-NEXT:    movapd %xmm1, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1OR2-LABEL: load_v2f64_v2i64:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
+; AVX1OR2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: load_v2f64_v2i64:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_v2f64_v2i64:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+  ret <2 x double> %res
+}
+
+define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
+; SSE2-LABEL: load_v4f32_v4i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
+; SSE2-NEXT:  LBB2_2: ## %else
+; SSE2-NEXT:    pextrw $2, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:  LBB2_4: ## %else2
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.load4
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT:  LBB2_6: ## %else5
+; SSE2-NEXT:    pextrw $6, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.load7
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:  LBB2_8: ## %else8
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_v4f32_v4i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE42-NEXT:    pextrb $0, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB2_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
+; SSE42-NEXT:  LBB2_2: ## %else
+; SSE42-NEXT:    pextrb $4, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB2_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE42-NEXT:  LBB2_4: ## %else2
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB2_6
+; SSE42-NEXT:  ## %bb.5: ## %cond.load4
+; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE42-NEXT:  LBB2_6: ## %else5
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB2_8
+; SSE42-NEXT:  ## %bb.7: ## %cond.load7
+; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; SSE42-NEXT:  LBB2_8: ## %else8
+; SSE42-NEXT:    movaps %xmm1, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1OR2-LABEL: load_v4f32_v4i32:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1OR2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: load_v4f32_v4i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_v4f32_v4i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
+  ret <4 x float> %res
+}
+
+define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
+; SSE2-LABEL: load_v4i32_v4i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB3_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
+; SSE2-NEXT:  LBB3_2: ## %else
+; SSE2-NEXT:    pextrw $2, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB3_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:  LBB3_4: ## %else2
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB3_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.load4
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT:  LBB3_6: ## %else5
+; SSE2-NEXT:    pextrw $6, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB3_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.load7
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:  LBB3_8: ## %else8
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_v4i32_v4i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE42-NEXT:    pextrb $0, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB3_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm1
+; SSE42-NEXT:  LBB3_2: ## %else
+; SSE42-NEXT:    pextrb $4, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB3_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm1
+; SSE42-NEXT:  LBB3_4: ## %else2
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB3_6
+; SSE42-NEXT:  ## %bb.5: ## %cond.load4
+; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm1
+; SSE42-NEXT:  LBB3_6: ## %else5
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB3_8
+; SSE42-NEXT:  ## %bb.7: ## %cond.load7
+; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm1
+; SSE42-NEXT:  LBB3_8: ## %else8
+; SSE42-NEXT:    movdqa %xmm1, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_v4i32_v4i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_v4i32_v4i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_v4i32_v4i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_v4i32_v4i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+  ret <4 x i32> %res
+}
+
+define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
+; SSE2-LABEL: load_v4f64_v4i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB4_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT:  LBB4_2: ## %else
+; SSE2-NEXT:    pextrw $2, %xmm3, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB4_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:  LBB4_4: ## %else2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB4_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.load4
+; SSE2-NEXT:    movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE2-NEXT:  LBB4_6: ## %else5
+; SSE2-NEXT:    pextrw $6, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB4_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.load7
+; SSE2-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE2-NEXT:  LBB4_8: ## %else8
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movapd %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_v4f64_v4i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE42-NEXT:    pextrb $0, %xmm3, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB4_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT:  LBB4_2: ## %else
+; SSE42-NEXT:    pextrb $4, %xmm3, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB4_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT:  LBB4_4: ## %else2
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB4_6
+; SSE42-NEXT:  ## %bb.5: ## %cond.load4
+; SSE42-NEXT:    movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE42-NEXT:  LBB4_6: ## %else5
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB4_8
+; SSE42-NEXT:  ## %bb.7: ## %cond.load7
+; SSE42-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE42-NEXT:  LBB4_8: ## %else8
+; SSE42-NEXT:    movapd %xmm1, %xmm0
+; SSE42-NEXT:    movapd %xmm2, %xmm1
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_v4f64_v4i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_v4f64_v4i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_v4f64_v4i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_v4f64_v4i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
+  ret <4 x double> %res
+}
+
+define <4 x double> @load_zero_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
+; SSE2-LABEL: load_zero_v4f64_v4i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    je LBB5_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:  LBB5_2: ## %else
+; SSE2-NEXT:    pextrw $2, %xmm3, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB5_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE2-NEXT:  LBB5_4: ## %else2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pextrw $4, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB5_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.load4
+; SSE2-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT:  LBB5_6: ## %else5
+; SSE2-NEXT:    pextrw $6, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB5_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.load7
+; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:  LBB5_8: ## %else8
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_zero_v4f64_v4i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pxor %xmm0, %xmm0
+; SSE42-NEXT:    movdqa %xmm2, %xmm3
+; SSE42-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE42-NEXT:    pextrb $0, %xmm3, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    xorpd %xmm1, %xmm1
+; SSE42-NEXT:    je LBB5_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE42-NEXT:    xorpd %xmm1, %xmm1
+; SSE42-NEXT:  LBB5_2: ## %else
+; SSE42-NEXT:    pextrb $4, %xmm3, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB5_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE42-NEXT:  LBB5_4: ## %else2
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE42-NEXT:    pextrb $8, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB5_6
+; SSE42-NEXT:  ## %bb.5: ## %cond.load4
+; SSE42-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT:  LBB5_6: ## %else5
+; SSE42-NEXT:    pextrb $12, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB5_8
+; SSE42-NEXT:  ## %bb.7: ## %cond.load7
+; SSE42-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT:  LBB5_8: ## %else8
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_zero_v4f64_v4i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zero_v4f64_v4i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_zero_v4f64_v4i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_zero_v4f64_v4i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
+; SSE2-LABEL: load_v8f32_v8i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    packssdw %xmm0, %xmm5
+; SSE2-NEXT:    movd %xmm5, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB6_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3]
+; SSE2-NEXT:  LBB6_2: ## %else
+; SSE2-NEXT:    psrlq $16, %xmm4
+; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB6_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm2[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3]
+; SSE2-NEXT:    movaps %xmm4, %xmm2
+; SSE2-NEXT:  LBB6_4: ## %else2
+; SSE2-NEXT:    xorps %xmm4, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB6_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.load4
+; SSE2-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm2[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2]
+; SSE2-NEXT:  LBB6_6: ## %else5
+; SSE2-NEXT:    pextrw $6, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB6_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.load7
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
+; SSE2-NEXT:  LBB6_8: ## %else8
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB6_10
+; SSE2-NEXT:  ## %bb.9: ## %cond.load10
+; SSE2-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
+; SSE2-NEXT:  LBB6_10: ## %else11
+; SSE2-NEXT:    pextrw $2, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB6_12
+; SSE2-NEXT:  ## %bb.11: ## %cond.load13
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:  LBB6_12: ## %else14
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pextrw $4, %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB6_14
+; SSE2-NEXT:  ## %bb.13: ## %cond.load16
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
+; SSE2-NEXT:  LBB6_14: ## %else17
+; SSE2-NEXT:    pextrw $6, %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB6_16
+; SSE2-NEXT:  ## %bb.15: ## %cond.load19
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
+; SSE2-NEXT:  LBB6_16: ## %else20
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_v8f32_v8i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pxor %xmm4, %xmm4
+; SSE42-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE42-NEXT:    pextrb $0, %xmm4, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB6_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE42-NEXT:    blendps {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3]
+; SSE42-NEXT:  LBB6_2: ## %else
+; SSE42-NEXT:    pextrb $4, %xmm4, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB6_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; SSE42-NEXT:  LBB6_4: ## %else2
+; SSE42-NEXT:    pxor %xmm4, %xmm4
+; SSE42-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB6_6
+; SSE42-NEXT:  ## %bb.5: ## %cond.load4
+; SSE42-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE42-NEXT:  LBB6_6: ## %else5
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB6_8
+; SSE42-NEXT:  ## %bb.7: ## %cond.load7
+; SSE42-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; SSE42-NEXT:  LBB6_8: ## %else8
+; SSE42-NEXT:    pxor %xmm0, %xmm0
+; SSE42-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB6_10
+; SSE42-NEXT:  ## %bb.9: ## %cond.load10
+; SSE42-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7]
+; SSE42-NEXT:  LBB6_10: ## %else11
+; SSE42-NEXT:    pextrb $4, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB6_12
+; SSE42-NEXT:  ## %bb.11: ## %cond.load13
+; SSE42-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE42-NEXT:  LBB6_12: ## %else14
+; SSE42-NEXT:    pxor %xmm0, %xmm0
+; SSE42-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE42-NEXT:    pextrb $8, %xmm1, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB6_14
+; SSE42-NEXT:  ## %bb.13: ## %cond.load16
+; SSE42-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE42-NEXT:  LBB6_14: ## %else17
+; SSE42-NEXT:    pextrb $12, %xmm1, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB6_16
+; SSE42-NEXT:  ## %bb.15: ## %cond.load19
+; SSE42-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; SSE42-NEXT:  LBB6_16: ## %else20
+; SSE42-NEXT:    movaps %xmm2, %xmm0
+; SSE42-NEXT:    movaps %xmm3, %xmm1
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_v8f32_v8i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_v8f32_v8i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_v8f32_v8i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_v8f32_v8i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX512VLBW-NEXT:    vblendmps (%rdi), %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
+  ret <8 x float> %res
+}
+
+define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
+; SSE2-LABEL: load_v8i32_v8i1:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB7_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
+; SSE2-NEXT:  LBB7_2: ## %else
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB7_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:  LBB7_4: ## %else2
+; SSE2-NEXT:    pextrw $2, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB7_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.load4
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2]
+; SSE2-NEXT:  LBB7_6: ## %else5
+; SSE2-NEXT:    pextrw $3, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB7_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.load7
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSE2-NEXT:  LBB7_8: ## %else8
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB7_10
+; SSE2-NEXT:  ## %bb.9: ## %cond.load10
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; SSE2-NEXT:  LBB7_10: ## %else11
+; SSE2-NEXT:    pextrw $5, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB7_12
+; SSE2-NEXT:  ## %bb.11: ## %cond.load13
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:  LBB7_12: ## %else14
+; SSE2-NEXT:    pextrw $6, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB7_14
+; SSE2-NEXT:  ## %bb.13: ## %cond.load16
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2]
+; SSE2-NEXT:  LBB7_14: ## %else17
+; SSE2-NEXT:    pextrw $7, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB7_16
+; SSE2-NEXT:  ## %bb.15: ## %cond.load19
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
+; SSE2-NEXT:  LBB7_16: ## %else20
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_v8i32_v8i1:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB7_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm1
+; SSE42-NEXT:  LBB7_2: ## %else
+; SSE42-NEXT:    pextrb $2, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB7_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm1
+; SSE42-NEXT:  LBB7_4: ## %else2
+; SSE42-NEXT:    pextrb $4, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB7_6
+; SSE42-NEXT:  ## %bb.5: ## %cond.load4
+; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm1
+; SSE42-NEXT:  LBB7_6: ## %else5
+; SSE42-NEXT:    pextrb $6, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB7_8
+; SSE42-NEXT:  ## %bb.7: ## %cond.load7
+; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm1
+; SSE42-NEXT:  LBB7_8: ## %else8
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB7_10
+; SSE42-NEXT:  ## %bb.9: ## %cond.load10
+; SSE42-NEXT:    pinsrd $0, 16(%rdi), %xmm2
+; SSE42-NEXT:  LBB7_10: ## %else11
+; SSE42-NEXT:    pextrb $10, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB7_12
+; SSE42-NEXT:  ## %bb.11: ## %cond.load13
+; SSE42-NEXT:    pinsrd $1, 20(%rdi), %xmm2
+; SSE42-NEXT:  LBB7_12: ## %else14
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB7_14
+; SSE42-NEXT:  ## %bb.13: ## %cond.load16
+; SSE42-NEXT:    pinsrd $2, 24(%rdi), %xmm2
+; SSE42-NEXT:  LBB7_14: ## %else17
+; SSE42-NEXT:    pextrb $14, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB7_16
+; SSE42-NEXT:  ## %bb.15: ## %cond.load19
+; SSE42-NEXT:    pinsrd $3, 28(%rdi), %xmm2
+; SSE42-NEXT:  LBB7_16: ## %else20
+; SSE42-NEXT:    movdqa %xmm1, %xmm0
+; SSE42-NEXT:    movdqa %xmm2, %xmm1
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_v8i32_v8i1:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_v8i32_v8i1:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2
+; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_v8i32_v8i1:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_v8i32_v8i1:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k1
+; AVX512VLBW-NEXT:    vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
+  ret <8 x i32> %res
+}
+
+define <8 x float> @load_zero_v8f32_v8i1(<8 x i1> %mask, <8 x float>* %addr) {
+; SSE2-LABEL: load_zero_v8f32_v8i1:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    je LBB8_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:  LBB8_2: ## %else
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB8_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
+; SSE2-NEXT:    movaps %xmm3, %xmm0
+; SSE2-NEXT:  LBB8_4: ## %else2
+; SSE2-NEXT:    pextrw $2, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB8_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.load4
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2]
+; SSE2-NEXT:  LBB8_6: ## %else5
+; SSE2-NEXT:    pextrw $3, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB8_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.load7
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSE2-NEXT:  LBB8_8: ## %else8
+; SSE2-NEXT:    pextrw $4, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB8_10
+; SSE2-NEXT:  ## %bb.9: ## %cond.load10
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
+; SSE2-NEXT:  LBB8_10: ## %else11
+; SSE2-NEXT:    pextrw $5, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB8_12
+; SSE2-NEXT:  ## %bb.11: ## %cond.load13
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:  LBB8_12: ## %else14
+; SSE2-NEXT:    pextrw $6, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB8_14
+; SSE2-NEXT:  ## %bb.13: ## %cond.load16
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2]
+; SSE2-NEXT:  LBB8_14: ## %else17
+; SSE2-NEXT:    pextrw $7, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB8_16
+; SSE2-NEXT:  ## %bb.15: ## %cond.load19
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSE2-NEXT:  LBB8_16: ## %else20
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_zero_v8f32_v8i1:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; SSE42-NEXT:    pxor %xmm0, %xmm0
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    xorps %xmm1, %xmm1
+; SSE42-NEXT:    je LBB8_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE42-NEXT:    xorps %xmm1, %xmm1
+; SSE42-NEXT:  LBB8_2: ## %else
+; SSE42-NEXT:    pextrb $2, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB8_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE42-NEXT:  LBB8_4: ## %else2
+; SSE42-NEXT:    pextrb $4, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB8_6
+; SSE42-NEXT:  ## %bb.5: ## %cond.load4
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT:  LBB8_6: ## %else5
+; SSE42-NEXT:    pextrb $6, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB8_8
+; SSE42-NEXT:  ## %bb.7: ## %cond.load7
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE42-NEXT:  LBB8_8: ## %else8
+; SSE42-NEXT:    pextrb $8, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB8_10
+; SSE42-NEXT:  ## %bb.9: ## %cond.load10
+; SSE42-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
+; SSE42-NEXT:  LBB8_10: ## %else11
+; SSE42-NEXT:    pextrb $10, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB8_12
+; SSE42-NEXT:  ## %bb.11: ## %cond.load13
+; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE42-NEXT:  LBB8_12: ## %else14
+; SSE42-NEXT:    pextrb $12, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB8_14
+; SSE42-NEXT:  ## %bb.13: ## %cond.load16
+; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE42-NEXT:  LBB8_14: ## %else17
+; SSE42-NEXT:    pextrb $14, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB8_16
+; SSE42-NEXT:  ## %bb.15: ## %cond.load19
+; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; SSE42-NEXT:  LBB8_16: ## %else20
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_zero_v8f32_v8i1:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zero_v8f32_v8i1:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_zero_v8f32_v8i1:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_zero_v8f32_v8i1:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k1
+; AVX512VLBW-NEXT:    vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT:    retq
+  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
+  ret <8 x float> %res
+}
+
+define <8 x i32> @load_zero_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr) {
+; SSE2-LABEL: load_zero_v8i32_v8i1:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    je LBB9_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:  LBB9_2: ## %else
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB9_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
+; SSE2-NEXT:    movaps %xmm3, %xmm0
+; SSE2-NEXT:  LBB9_4: ## %else2
+; SSE2-NEXT:    pextrw $2, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB9_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.load4
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2]
+; SSE2-NEXT:  LBB9_6: ## %else5
+; SSE2-NEXT:    pextrw $3, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB9_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.load7
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSE2-NEXT:  LBB9_8: ## %else8
+; SSE2-NEXT:    pextrw $4, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB9_10
+; SSE2-NEXT:  ## %bb.9: ## %cond.load10
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
+; SSE2-NEXT:  LBB9_10: ## %else11
+; SSE2-NEXT:    pextrw $5, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB9_12
+; SSE2-NEXT:  ## %bb.11: ## %cond.load13
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:  LBB9_12: ## %else14
+; SSE2-NEXT:    pextrw $6, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB9_14
+; SSE2-NEXT:  ## %bb.13: ## %cond.load16
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2]
+; SSE2-NEXT:  LBB9_14: ## %else17
+; SSE2-NEXT:    pextrw $7, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB9_16
+; SSE2-NEXT:  ## %bb.15: ## %cond.load19
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSE2-NEXT:  LBB9_16: ## %else20
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_zero_v8i32_v8i1:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; SSE42-NEXT:    pxor %xmm0, %xmm0
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    je LBB9_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:  LBB9_2: ## %else
+; SSE42-NEXT:    pextrb $2, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB9_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
+; SSE42-NEXT:  LBB9_4: ## %else2
+; SSE42-NEXT:    pextrb $4, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB9_6
+; SSE42-NEXT:  ## %bb.5: ## %cond.load4
+; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
+; SSE42-NEXT:  LBB9_6: ## %else5
+; SSE42-NEXT:    pextrb $6, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB9_8
+; SSE42-NEXT:  ## %bb.7: ## %cond.load7
+; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm0
+; SSE42-NEXT:  LBB9_8: ## %else8
+; SSE42-NEXT:    pextrb $8, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB9_10
+; SSE42-NEXT:  ## %bb.9: ## %cond.load10
+; SSE42-NEXT:    pinsrd $0, 16(%rdi), %xmm1
+; SSE42-NEXT:  LBB9_10: ## %else11
+; SSE42-NEXT:    pextrb $10, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB9_12
+; SSE42-NEXT:  ## %bb.11: ## %cond.load13
+; SSE42-NEXT:    pinsrd $1, 20(%rdi), %xmm1
+; SSE42-NEXT:  LBB9_12: ## %else14
+; SSE42-NEXT:    pextrb $12, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB9_14
+; SSE42-NEXT:  ## %bb.13: ## %cond.load16
+; SSE42-NEXT:    pinsrd $2, 24(%rdi), %xmm1
+; SSE42-NEXT:  LBB9_14: ## %else17
+; SSE42-NEXT:    pextrb $14, %xmm2, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB9_16
+; SSE42-NEXT:  ## %bb.15: ## %cond.load19
+; SSE42-NEXT:    pinsrd $3, 28(%rdi), %xmm1
+; SSE42-NEXT:  LBB9_16: ## %else20
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_zero_v8i32_v8i1:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zero_v8i32_v8i1:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_zero_v8i32_v8i1:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_zero_v8i32_v8i1:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k1
+; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT:    retq
+  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
+  ret <8 x i32> %res
+}
+
+define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
+; SSE2-LABEL: load_v2f32_v2i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB10_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE2-NEXT:  LBB10_2: ## %else
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB10_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:  LBB10_4: ## %else2
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_v2f32_v2i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    movdqa %xmm0, %xmm3
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE42-NEXT:    pcmpeqq %xmm2, %xmm3
+; SSE42-NEXT:    pextrb $0, %xmm3, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB10_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:  LBB10_2: ## %else
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT:    pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB10_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE42-NEXT:  LBB10_4: ## %else2
+; SSE42-NEXT:    movaps %xmm1, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_v2f32_v2i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_v2f32_v2i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_v2f32_v2i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_v2f32_v2i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+  ret <2 x float> %res
+}
+
+define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
+; SSE2-LABEL: load_v2i32_v2i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB11_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movl (%rdi), %eax
+; SSE2-NEXT:    movq %rax, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:  LBB11_2: ## %else
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB11_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movl 4(%rdi), %eax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:  LBB11_4: ## %else2
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_v2i32_v2i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    movdqa %xmm0, %xmm3
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE42-NEXT:    pcmpeqq %xmm2, %xmm3
+; SSE42-NEXT:    pextrb $0, %xmm3, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB11_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movl (%rdi), %eax
+; SSE42-NEXT:    pinsrq $0, %rax, %xmm1
+; SSE42-NEXT:  LBB11_2: ## %else
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT:    pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB11_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    movl 4(%rdi), %eax
+; SSE42-NEXT:    pinsrq $1, %rax, %xmm1
+; SSE42-NEXT:  LBB11_4: ## %else2
+; SSE42-NEXT:    movdqa %xmm1, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_v2i32_v2i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_v2i32_v2i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_v2i32_v2i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_v2i32_v2i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
+; AVX512VLBW-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+  ret <2 x i32> %res
+}
+
+define <2 x float> @load_undef_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr) {
+; SSE2-LABEL: load_undef_v2f32_v2i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    ## implicit-def: $xmm0
+; SSE2-NEXT:    je LBB12_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.load
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:  LBB12_2: ## %else
+; SSE2-NEXT:    pextrw $4, %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB12_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.load1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:  LBB12_4: ## %else2
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_undef_v2f32_v2i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    movdqa %xmm0, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT:    pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    ## implicit-def: $xmm0
+; SSE42-NEXT:    je LBB12_2
+; SSE42-NEXT:  ## %bb.1: ## %cond.load
+; SSE42-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE42-NEXT:  LBB12_2: ## %else
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT:    pcmpeqq %xmm2, %xmm1
+; SSE42-NEXT:    pextrb $8, %xmm1, %eax
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    je LBB12_4
+; SSE42-NEXT:  ## %bb.3: ## %cond.load1
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE42-NEXT:  LBB12_4: ## %else2
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_undef_v2f32_v2i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_undef_v2f32_v2i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_undef_v2f32_v2i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_undef_v2f32_v2i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
+  ret <2 x float> %res
+}
+
+define <4 x float> @load_all_v4f32_v4i32(<4 x i32> %trigger, <4 x float>* %addr) {
+; SSE-LABEL: load_all_v4f32_v4i32:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: load_all_v4f32_v4i32:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vmovups (%rdi), %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: load_all_v4f32_v4i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    movw $15, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: load_all_v4f32_v4i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512VLBW-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
+  ret <4 x float> %res
+}
+
+;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
+
+; 128-bit FP vectors are supported with AVX.
+
+define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
+; SSE2-LABEL: mload_constmask_v4f32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: mload_constmask_v4f32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE42-NEXT:    retq
+;
+; AVX1OR2-LABEL: mload_constmask_v4f32:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v4f32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    movw $13, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v4f32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $13, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovups (%rdi), %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
+  ret <4 x float> %res
+}
+
+define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) {
+; SSE-LABEL: mload_constmask_v2f64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mload_constmask_v2f64:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    retq
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
+  ret <2 x double> %res
+}
+
+; 128-bit integer vectors are supported with AVX2.
+
+define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
+; SSE2-LABEL: mload_constmask_v4i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,2]
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: mload_constmask_v4i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
+; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
+; SSE42-NEXT:    pinsrd $3, 12(%rdi), %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: mload_constmask_v4i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: mload_constmask_v4i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
+; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v4i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    movw $14, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v4i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $14, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
+; SSE2-LABEL: mload_constmask_v2i64:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: mload_constmask_v2i64:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pinsrq $1, 8(%rdi), %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: mload_constmask_v2i64:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
+  ret <2 x i64> %res
+}
+
+; 256-bit FP vectors are supported with AVX.
+
+define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
+; SSE2-LABEL: mload_constmask_v8f32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: mload_constmask_v8f32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT:    retq
+;
+; AVX1OR2-LABEL: mload_constmask_v8f32:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
+; AVX1OR2-NEXT:    vmaskmovps (%rdi), %ymm1, %ymm1
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v8f32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    movw $7, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v8f32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $7, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovups (%rdi), %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
+  ret <8 x float> %res
+}
+
+define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
+; SSE-LABEL: mload_constmask_v4f64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: mload_constmask_v4f64:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX1OR2-NEXT:    vmaskmovpd (%rdi), %ymm1, %ymm1
+; AVX1OR2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v4f64:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    movb $7, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v4f64:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $7, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
+  ret <4 x double> %res
+}
+
+; 256-bit integer vectors are supported with AVX2.
+
+define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
+; SSE2-LABEL: mload_constmask_v8i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: mload_constmask_v8i32:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm0
+; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
+; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
+; SSE42-NEXT:    pinsrd $3, 28(%rdi), %xmm1
+; SSE42-NEXT:    retq
+;
+; AVX1OR2-LABEL: mload_constmask_v8i32:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v8i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    movw $135, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v8i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $-121, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
+; SSE2-LABEL: mload_constmask_v4i64:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: mload_constmask_v4i64:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pinsrq $0, (%rdi), %xmm0
+; SSE42-NEXT:    pinsrq $1, 24(%rdi), %xmm1
+; SSE42-NEXT:    retq
+;
+; AVX1OR2-LABEL: mload_constmask_v4i64:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v4i64:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    movb $9, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v4i64:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $9, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
+  ret <4 x i64> %res
+}
+
+; 512-bit FP vectors are supported with AVX512.
+
+define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
+; SSE-LABEL: mload_constmask_v8f64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE-NEXT:    movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: mload_constmask_v8f64:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v8f64:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    movb $-121, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v8f64:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $-121, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
+  ret <8 x double> %res
+}
+
+; If the pass-through operand is undef, no blend is needed.
+
+define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
+; SSE-LABEL: mload_constmask_v4f64_undef_passthrough:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX1OR2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    movb $7, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $7, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT:    retq
+  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
+  ret <4 x double> %res
+}
+
+define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
+; SSE-LABEL: mload_constmask_v4i64_undef_passthrough:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    movb $6, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    movb $6, %al
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT:    retq
+  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
+  ret <4 x i64> %res
+}
+
+;  When only one element of the mask is set, reduce to a scalar load.
+
+define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+; SSE2-LABEL: load_one_mask_bit_set1:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_one_mask_bit_set1:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: load_one_mask_bit_set1:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
+  ret <4 x i32> %res
+}
+
+; Choose a different element to show that the correct address offset is produced.
+
+define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+; SSE2-LABEL: load_one_mask_bit_set2:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_one_mask_bit_set2:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: load_one_mask_bit_set2:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT:    retq
+  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
+  ret <4 x float> %res
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
+; SSE2-LABEL: load_one_mask_bit_set3:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: load_one_mask_bit_set3:
+; SSE42:       ## %bb.0:
+; SSE42-NEXT:    pinsrq $0, 16(%rdi), %xmm1
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: load_one_mask_bit_set3:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_one_mask_bit_set3:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_one_mask_bit_set3:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
+  ret <4 x i64> %res
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
+; SSE-LABEL: load_one_mask_bit_set4:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_one_mask_bit_set4:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
+  ret <4 x double> %res
+}
+
+; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
+
+define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
+; SSE-LABEL: load_one_mask_bit_set5:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: load_one_mask_bit_set5:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1OR2-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1OR2-NEXT:    retq
+;
+; AVX512-LABEL: load_one_mask_bit_set5:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
+  ret <8 x double> %res
+}
+
+define i32 @pr38986(i1 %c, i32* %p) {
+; SSE-LABEL: pr38986:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    testb $1, %dil
+; SSE-NEXT:    ## implicit-def: $eax
+; SSE-NEXT:    je LBB30_2
+; SSE-NEXT:  ## %bb.1: ## %cond.load
+; SSE-NEXT:    movl (%rsi), %eax
+; SSE-NEXT:  LBB30_2: ## %else
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: pr38986:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    ## implicit-def: $eax
+; AVX-NEXT:    je LBB30_2
+; AVX-NEXT:  ## %bb.1: ## %cond.load
+; AVX-NEXT:    movl (%rsi), %eax
+; AVX-NEXT:  LBB30_2: ## %else
+; AVX-NEXT:    retq
+ %vc = insertelement <1 x i1> undef, i1 %c, i32 0
+ %vp = bitcast i32* %p to <1 x i32>*
+ %L = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32 (<1 x i32>* %vp, i32 4, <1 x i1> %vc, <1 x i32> undef)
+ %ret = bitcast <1 x i32> %L to i32
+ ret i32 %ret
+}
+declare <1 x i32>  @llvm.masked.load.v1i32.p0v1i32 (<1 x i32>*, i32, <1 x i1>, <1 x i32>)
+
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
+declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
+declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
deleted file mode 100644
index f1ab2aeefa818f5030a4ef87638793d88f2656b6..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/masked_memop.ll
+++ /dev/null
@@ -1,1434 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX  --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX
-
-; To test for the case where masked load/store is not legal, we should add a run with a target
-; that does not have AVX, but that case should probably be a separate test file using less tests
-; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov.
-
-define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
-; AVX-LABEL: loadv1:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    testq %rdi, %rdi
-; AVX-NEXT:    jne LBB0_2
-; AVX-NEXT:  ## %bb.1: ## %cond.load
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:  LBB0_2: ## %else
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: loadv1:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    testq %rdi, %rdi
-; AVX512-NEXT:    jne LBB0_2
-; AVX512-NEXT:  ## %bb.1: ## %cond.load
-; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:  LBB0_2: ## %else
-; AVX512-NEXT:    retq
-  %mask = icmp eq <1 x i64> %trigger, zeroinitializer
-  %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
-  ret <1 x double> %res
-}
-declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
-
-define void @storev1(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) {
-; AVX-LABEL: storev1:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    testl %edi, %edi
-; AVX-NEXT:    jne LBB1_2
-; AVX-NEXT:  ## %bb.1: ## %cond.store
-; AVX-NEXT:    movl %edx, (%rsi)
-; AVX-NEXT:  LBB1_2: ## %else
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: storev1:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    testl %edi, %edi
-; AVX512-NEXT:    jne LBB1_2
-; AVX512-NEXT:  ## %bb.1: ## %cond.store
-; AVX512-NEXT:    movl %edx, (%rsi)
-; AVX512-NEXT:  LBB1_2: ## %else
-; AVX512-NEXT:    retq
-  %mask = icmp eq <1 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask)
-  ret void
-}
-declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>)
-
-define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
-; AVX-LABEL: test6:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
-; AVX-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: test6:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test6:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; SKX-NEXT:    vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
-  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
-  ret <2 x double> %res
-}
-
-define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
-; AVX-LABEL: test7:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: test7:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test7:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; SKX-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
-  ret <4 x float> %res
-}
-
-define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
-; AVX1-LABEL: test8:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test8:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
-; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test8:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test8:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; SKX-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
-  ret <4 x i32> %res
-}
-
-define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
-; AVX1-LABEL: test9:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test9:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test9:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test9:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; SKX-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
-  ret void
-}
-
-define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
-; AVX1-LABEL: test10:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test10:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test10:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test10:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; SKX-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
-  ret <4 x double> %res
-}
-
-define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
-; AVX1-LABEL: test10b:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test10b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test10b:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test10b:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; SKX-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z}
-; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
-  ret <4 x double> %res
-}
-
-define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
-; AVX1-LABEL: test11a:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
-; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test11a:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test11a:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test11a:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; SKX-NEXT:    vblendmps (%rdi), %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
-  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
-  ret <8 x float> %res
-}
-
-define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
-; AVX1-LABEL: test11b:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
-; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test11b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
-; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test11b:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test11b:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
-; SKX-NEXT:    vpmovw2m %xmm0, %k1
-; SKX-NEXT:    vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    retq
-  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
-  ret <8 x i32> %res
-}
-
-define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
-; AVX1-LABEL: test11c:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test11c:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
-; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test11c:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test11c:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
-; SKX-NEXT:    vpmovw2m %xmm0, %k1
-; SKX-NEXT:    vmovaps (%rdi), %ymm0 {%k1} {z}
-; SKX-NEXT:    retq
-  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
-  ret <8 x float> %res
-}
-
-define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
-; AVX1-LABEL: test11d:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test11d:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
-; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test11d:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test11d:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
-; SKX-NEXT:    vpmovw2m %xmm0, %k1
-; SKX-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1} {z}
-; SKX-NEXT:    retq
-  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
-  ret <8 x i32> %res
-}
-
-define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
-; AVX1-LABEL: test12:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test12:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test12:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test12:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; SKX-NEXT:    vmovdqu32 %ymm1, (%rdi) {%k1}
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
-  ret void
-}
-
-define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
-; AVX1-LABEL: test14:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test14:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test14:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-NEXT:    vmovups %zmm1, (%rdi) {%k1}
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test14:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; SKX-NEXT:    vmovups %xmm1, (%rdi) {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
-  ret void
-}
-
-define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
-; AVX1-LABEL: test15:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test15:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test15:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test15:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; SKX-NEXT:    vpmovqd %xmm1, (%rdi) {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
-  ret void
-}
-
-define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
-; AVX1-LABEL: test16:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test16:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test16:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test16:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; SKX-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
-  ret <2 x float> %res
-}
-
-define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
-; AVX1-LABEL: test17:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test17:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test17:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test17:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SKX-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
-; SKX-NEXT:    vpmovsxdq %xmm0, %xmm0
-; SKX-NEXT:    retq
-  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-  %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
-  ret <2 x i32> %res
-}
-
-define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
-; AVX1-LABEL: test18:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test18:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test18:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test18:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
-; SKX-NEXT:    retq
-  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
-  ret <2 x float> %res
-}
-
-define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
-; AVX-LABEL: load_all:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovups (%rdi), %xmm0
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: load_all:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    movw $15, %ax
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: load_all:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
-; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
-  ret <4 x float> %res
-}
-
-;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
-
-; 128-bit FP vectors are supported with AVX.
-
-define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
-; AVX-LABEL: mload_constmask_v4f32:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v4f32:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    movw $13, %ax
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v4f32:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $13, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1}
-; SKX-NEXT:    retq
-  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
-  ret <4 x float> %res
-}
-
-define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) {
-; AVX-LABEL: mload_constmask_v2f64:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: mload_constmask_v2f64:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512-NEXT:    retq
-  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
-  ret <2 x double> %res
-}
-
-; 128-bit integer vectors are supported with AVX2.
-
-define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
-; AVX1-LABEL: mload_constmask_v4i32:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
-; AVX1-NEXT:    vmaskmovps (%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: mload_constmask_v4i32:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
-; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v4i32:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    movw $14, %ax
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v4i32:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $14, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
-; SKX-NEXT:    retq
-  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
-  ret <4 x i32> %res
-}
-
-define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
-; AVX-LABEL: mload_constmask_v2i64:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: mload_constmask_v2i64:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm0
-; AVX512-NEXT:    retq
-  %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
-  ret <2 x i64> %res
-}
-
-; 256-bit FP vectors are supported with AVX.
-
-define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
-; AVX-LABEL: mload_constmask_v8f32:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
-; AVX-NEXT:    vmaskmovps (%rdi), %ymm1, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v8f32:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    movw $7, %ax
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v8f32:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $7, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovups (%rdi), %ymm0 {%k1}
-; SKX-NEXT:    retq
-  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
-  ret <8 x float> %res
-}
-
-define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
-; AVX-LABEL: mload_constmask_v4f64:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
-; AVX-NEXT:    vmaskmovpd (%rdi), %ymm1, %ymm1
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v4f64:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    movb $7, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v4f64:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $7, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
-; SKX-NEXT:    retq
-  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
-  ret <4 x double> %res
-}
-
-; 256-bit integer vectors are supported with AVX2.
-
-define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
-; AVX-LABEL: mload_constmask_v8i32:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v8i32:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    movw $135, %ax
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v8i32:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $-121, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
-; SKX-NEXT:    retq
-  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
-  ret <8 x i32> %res
-}
-
-define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
-; AVX-LABEL: mload_constmask_v4i64:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v4i64:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    movb $9, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v4i64:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $9, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
-; SKX-NEXT:    retq
-  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
-  ret <4 x i64> %res
-}
-
-; 512-bit FP vectors are supported with AVX512.
-
-define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
-; AVX-LABEL: mload_constmask_v8f64:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v8f64:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    movb $-121, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v8f64:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $-121, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
-; SKX-NEXT:    retq
-  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
-  ret <8 x double> %res
-}
-
-; If the pass-through operand is undef, no blend is needed.
-
-define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
-; AVX-LABEL: mload_constmask_v4f64_undef_passthrough:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
-; AVX-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    movb $7, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $7, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
-; SKX-NEXT:    retq
-  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
-  ret <4 x double> %res
-}
-
-define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
-; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
-; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
-; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    movb $6, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    movb $6, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
-; SKX-NEXT:    retq
-  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
-  ret <4 x i64> %res
-}
-
-define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
-; AVX1-LABEL: test21:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test21:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test21:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    movw $15, %ax
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: test21:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
-; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
-  ret void
-}
-
-;  When only one element of the mask is set, reduce to a scalar store.
-
-define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
-; AVX-LABEL: one_mask_bit_set1:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovss %xmm0, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: one_mask_bit_set1:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovss %xmm0, (%rdi)
-; AVX512-NEXT:    retq
-  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
-  ret void
-}
-
-; Choose a different element to show that the correct address offset is produced.
-
-define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
-; AVX-LABEL: one_mask_bit_set2:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vextractps $2, %xmm0, 8(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: one_mask_bit_set2:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vextractps $2, %xmm0, 8(%rdi)
-; AVX512-NEXT:    retq
-  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
-  ret void
-}
-
-; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
-
-define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
-; AVX-LABEL: one_mask_bit_set3:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovlps %xmm0, 16(%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: one_mask_bit_set3:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovlps %xmm0, 16(%rdi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
-  ret void
-}
-
-; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
-
-define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
-; AVX-LABEL: one_mask_bit_set4:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovhpd %xmm0, 24(%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: one_mask_bit_set4:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovhpd %xmm0, 24(%rdi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
-  ret void
-}
-
-; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
-
-define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
-; AVX-LABEL: one_mask_bit_set5:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX-NEXT:    vmovlps %xmm0, 48(%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: one_mask_bit_set5:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
-; AVX512-NEXT:    vmovlps %xmm0, 48(%rdi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
-  ret void
-}
-
-;  When only one element of the mask is set, reduce to a scalar load.
-
-define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
-; AVX-LABEL: load_one_mask_bit_set1:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: load_one_mask_bit_set1:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
-; AVX512-NEXT:    retq
-  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
-  ret <4 x i32> %res
-}
-
-; Choose a different element to show that the correct address offset is produced.
-
-define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
-; AVX-LABEL: load_one_mask_bit_set2:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: load_one_mask_bit_set2:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX512-NEXT:    retq
-  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
-  ret <4 x float> %res
-}
-
-; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
-
-define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
-; AVX1-LABEL: load_one_mask_bit_set3:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_one_mask_bit_set3:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_one_mask_bit_set3:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
-  ret <4 x i64> %res
-}
-
-; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
-
-define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
-; AVX-LABEL: load_one_mask_bit_set4:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: load_one_mask_bit_set4:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
-  ret <4 x double> %res
-}
-
-; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
-
-define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
-; AVX-LABEL: load_one_mask_bit_set5:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: load_one_mask_bit_set5:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT:    retq
-  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
-  ret <8 x double> %res
-}
-
-; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
-; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
-
-define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
-; AVX-LABEL: trunc_mask:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmaskmovps %xmm0, %xmm2, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_mask:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm2 killed $xmm2 def $zmm2
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
-; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vmovups %zmm0, (%rdi) {%k1}
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: trunc_mask:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpcmpgtd %xmm2, %xmm1, %k1
-; SKX-NEXT:    vmovups %xmm0, (%rdi) {%k1}
-; SKX-NEXT:    retq
-  %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
-  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
-  ret void
-}
-
-; SimplifyDemandedBits eliminates an ashr here.
-
-define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x double>* %p, <4 x i32> %masksrc) {
-; AVX1-LABEL: masked_store_bool_mask_demand_trunc_sext:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vmaskmovpd %ymm0, %ymm1, (%rdi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: masked_store_bool_mask_demand_trunc_sext:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT:    vmaskmovpd %ymm0, %ymm1, (%rdi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: masked_store_bool_mask_demand_trunc_sext:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: masked_store_bool_mask_demand_trunc_sext:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
-; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT:    vmovupd %ymm0, (%rdi) {%k1}
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %sext = sext <4 x i32> %masksrc to <4 x i64>
-  %boolmask = trunc <4 x i64> %sext to <4 x i1>
-  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %x, <4 x double>* %p, i32 4, <4 x i1> %boolmask)
-  ret void
-}
-
-; This needs to be widened to v4i32.
-; This used to assert in type legalization. PR38436
-; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask.
-define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
-; AVX1-LABEL: widen_masked_store:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vmovd %edx, %xmm1
-; AVX1-NEXT:    vmovd %esi, %xmm2
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vmaskmovps %xmm0, %xmm1, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: widen_masked_store:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovd %edx, %xmm1
-; AVX2-NEXT:    vmovd %esi, %xmm2
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpmaskmovd %xmm0, %xmm1, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: widen_masked_store:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: widen_masked_store:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
-; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vmovdqa32 %xmm1, %xmm1 {%k1} {z}
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
-; SKX-NEXT:    retq
-  call void @llvm.masked.store.v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)
-  ret void
-}
-declare void @llvm.masked.store.v3i32(<3 x i32>, <3 x i32>*, i32, <3 x i1>)
-
-define i32 @pr38986(i1 %c, i32* %p) {
-; AVX-LABEL: pr38986:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    testb $1, %dil
-; AVX-NEXT:    ## implicit-def: $eax
-; AVX-NEXT:    je LBB44_2
-; AVX-NEXT:  ## %bb.1: ## %cond.load
-; AVX-NEXT:    movl (%rsi), %eax
-; AVX-NEXT:  LBB44_2: ## %else
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: pr38986:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    testb $1, %dil
-; AVX512-NEXT:    ## implicit-def: $eax
-; AVX512-NEXT:    je LBB44_2
-; AVX512-NEXT:  ## %bb.1: ## %cond.load
-; AVX512-NEXT:    movl (%rsi), %eax
-; AVX512-NEXT:  LBB44_2: ## %else
-; AVX512-NEXT:    retq
- %vc = insertelement <1 x i1> undef, i1 %c, i32 0
- %vp = bitcast i32* %p to <1 x i32>*
- %L = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32 (<1 x i32>* %vp, i32 4, <1 x i1> %vc, <1 x i32> undef)
- %ret = bitcast <1 x i32> %L to i32
- ret i32 %ret
-}
-declare <1 x i32>  @llvm.masked.load.v1i32.p0v1i32 (<1 x i32>*, i32, <1 x i1>, <1 x i32>)
-
-
-declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
-declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
-declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
-declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
-declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
-declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
-declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
-declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
-declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
-declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
-declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
-declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
-declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
-declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
-declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
-declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
-
diff --git a/test/CodeGen/X86/masked_store.ll b/test/CodeGen/X86/masked_store.ll
new file mode 100644
index 0000000000000000000000000000000000000000..61edacdba9586edb8e27613078d3af8277e5b613
--- /dev/null
+++ b/test/CodeGen/X86/masked_store.ll
@@ -0,0 +1,940 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse2    | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse4.2  | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx     | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2    | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
+
+define void @store_v1i32_v1i32(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) {
+; SSE-LABEL: store_v1i32_v1i32:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    testl %edi, %edi
+; SSE-NEXT:    jne LBB0_2
+; SSE-NEXT:  ## %bb.1: ## %cond.store
+; SSE-NEXT:    movl %edx, (%rsi)
+; SSE-NEXT:  LBB0_2: ## %else
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: store_v1i32_v1i32:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    testl %edi, %edi
+; AVX-NEXT:    jne LBB0_2
+; AVX-NEXT:  ## %bb.1: ## %cond.store
+; AVX-NEXT:    movl %edx, (%rsi)
+; AVX-NEXT:  LBB0_2: ## %else
+; AVX-NEXT:    retq
+  %mask = icmp eq <1 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask)
+  ret void
+}
+
+define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+; SSE2-LABEL: store_v4i32_v4i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB1_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.store
+; SSE2-NEXT:    movd %xmm1, (%rdi)
+; SSE2-NEXT:  LBB1_2: ## %else
+; SSE2-NEXT:    pextrw $2, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB1_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.store1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, 4(%rdi)
+; SSE2-NEXT:  LBB1_4: ## %else2
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB1_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.store3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, 8(%rdi)
+; SSE2-NEXT:  LBB1_6: ## %else4
+; SSE2-NEXT:    pextrw $6, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB1_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.store5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm0, 12(%rdi)
+; SSE2-NEXT:  LBB1_8: ## %else6
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: store_v4i32_v4i32:
+; SSE4:       ## %bb.0:
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE4-NEXT:    pextrb $0, %xmm2, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB1_2
+; SSE4-NEXT:  ## %bb.1: ## %cond.store
+; SSE4-NEXT:    movss %xmm1, (%rdi)
+; SSE4-NEXT:  LBB1_2: ## %else
+; SSE4-NEXT:    pextrb $4, %xmm2, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB1_4
+; SSE4-NEXT:  ## %bb.3: ## %cond.store1
+; SSE4-NEXT:    extractps $1, %xmm1, 4(%rdi)
+; SSE4-NEXT:  LBB1_4: ## %else2
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE4-NEXT:    pextrb $8, %xmm0, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB1_6
+; SSE4-NEXT:  ## %bb.5: ## %cond.store3
+; SSE4-NEXT:    extractps $2, %xmm1, 8(%rdi)
+; SSE4-NEXT:  LBB1_6: ## %else4
+; SSE4-NEXT:    pextrb $12, %xmm0, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB1_8
+; SSE4-NEXT:  ## %bb.7: ## %cond.store5
+; SSE4-NEXT:    extractps $3, %xmm1, 12(%rdi)
+; SSE4-NEXT:  LBB1_8: ## %else6
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: store_v4i32_v4i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: store_v4i32_v4i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: store_v4i32_v4i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: store_v4i32_v4i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
+  ret void
+}
+
+define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
+; SSE2-LABEL: store_v8i32_v8i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    packssdw %xmm0, %xmm5
+; SSE2-NEXT:    movd %xmm5, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.store
+; SSE2-NEXT:    movd %xmm2, (%rdi)
+; SSE2-NEXT:  LBB2_2: ## %else
+; SSE2-NEXT:    psrlq $16, %xmm4
+; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.store1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movd %xmm4, 4(%rdi)
+; SSE2-NEXT:  LBB2_4: ## %else2
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.store3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm4, 8(%rdi)
+; SSE2-NEXT:  LBB2_6: ## %else4
+; SSE2-NEXT:    pextrw $6, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.store5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE2-NEXT:    movd %xmm0, 12(%rdi)
+; SSE2-NEXT:  LBB2_8: ## %else6
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_10
+; SSE2-NEXT:  ## %bb.9: ## %cond.store7
+; SSE2-NEXT:    movd %xmm3, 16(%rdi)
+; SSE2-NEXT:  LBB2_10: ## %else8
+; SSE2-NEXT:    pextrw $2, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_12
+; SSE2-NEXT:  ## %bb.11: ## %cond.store9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    movd %xmm0, 20(%rdi)
+; SSE2-NEXT:  LBB2_12: ## %else10
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pextrw $4, %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_14
+; SSE2-NEXT:  ## %bb.13: ## %cond.store11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, 24(%rdi)
+; SSE2-NEXT:  LBB2_14: ## %else12
+; SSE2-NEXT:    pextrw $6, %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB2_16
+; SSE2-NEXT:  ## %bb.15: ## %cond.store13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
+; SSE2-NEXT:    movd %xmm0, 28(%rdi)
+; SSE2-NEXT:  LBB2_16: ## %else14
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: store_v8i32_v8i32:
+; SSE4:       ## %bb.0:
+; SSE4-NEXT:    pxor %xmm4, %xmm4
+; SSE4-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE4-NEXT:    pextrb $0, %xmm4, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB2_2
+; SSE4-NEXT:  ## %bb.1: ## %cond.store
+; SSE4-NEXT:    movss %xmm2, (%rdi)
+; SSE4-NEXT:  LBB2_2: ## %else
+; SSE4-NEXT:    pextrb $4, %xmm4, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB2_4
+; SSE4-NEXT:  ## %bb.3: ## %cond.store1
+; SSE4-NEXT:    extractps $1, %xmm2, 4(%rdi)
+; SSE4-NEXT:  LBB2_4: ## %else2
+; SSE4-NEXT:    pxor %xmm4, %xmm4
+; SSE4-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE4-NEXT:    pextrb $8, %xmm0, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB2_6
+; SSE4-NEXT:  ## %bb.5: ## %cond.store3
+; SSE4-NEXT:    extractps $2, %xmm2, 8(%rdi)
+; SSE4-NEXT:  LBB2_6: ## %else4
+; SSE4-NEXT:    pextrb $12, %xmm0, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB2_8
+; SSE4-NEXT:  ## %bb.7: ## %cond.store5
+; SSE4-NEXT:    extractps $3, %xmm2, 12(%rdi)
+; SSE4-NEXT:  LBB2_8: ## %else6
+; SSE4-NEXT:    pxor %xmm0, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE4-NEXT:    pextrb $0, %xmm0, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB2_10
+; SSE4-NEXT:  ## %bb.9: ## %cond.store7
+; SSE4-NEXT:    movss %xmm3, 16(%rdi)
+; SSE4-NEXT:  LBB2_10: ## %else8
+; SSE4-NEXT:    pextrb $4, %xmm0, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB2_12
+; SSE4-NEXT:  ## %bb.11: ## %cond.store9
+; SSE4-NEXT:    extractps $1, %xmm3, 20(%rdi)
+; SSE4-NEXT:  LBB2_12: ## %else10
+; SSE4-NEXT:    pxor %xmm0, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE4-NEXT:    pextrb $8, %xmm1, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB2_14
+; SSE4-NEXT:  ## %bb.13: ## %cond.store11
+; SSE4-NEXT:    extractps $2, %xmm3, 24(%rdi)
+; SSE4-NEXT:  LBB2_14: ## %else12
+; SSE4-NEXT:    pextrb $12, %xmm1, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB2_16
+; SSE4-NEXT:  ## %bb.15: ## %cond.store13
+; SSE4-NEXT:    extractps $3, %xmm3, 28(%rdi)
+; SSE4-NEXT:  LBB2_16: ## %else14
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: store_v8i32_v8i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: store_v8i32_v8i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: store_v8i32_v8i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: store_v8i32_v8i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX512VLBW-NEXT:    vmovdqu32 %ymm1, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
+  ret void
+}
+
+define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
+; SSE2-LABEL: store_v2f32_v2i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB3_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.store
+; SSE2-NEXT:    movss %xmm1, (%rdi)
+; SSE2-NEXT:  LBB3_2: ## %else
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB3_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.store1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movss %xmm1, 4(%rdi)
+; SSE2-NEXT:  LBB3_4: ## %else2
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: store_v2f32_v2i32:
+; SSE4:       ## %bb.0:
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    movdqa %xmm0, %xmm3
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE4-NEXT:    pcmpeqq %xmm2, %xmm3
+; SSE4-NEXT:    pextrb $0, %xmm3, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB3_2
+; SSE4-NEXT:  ## %bb.1: ## %cond.store
+; SSE4-NEXT:    movss %xmm1, (%rdi)
+; SSE4-NEXT:  LBB3_2: ## %else
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE4-NEXT:    pcmpeqq %xmm2, %xmm0
+; SSE4-NEXT:    pextrb $8, %xmm0, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB3_4
+; SSE4-NEXT:  ## %bb.3: ## %cond.store1
+; SSE4-NEXT:    extractps $1, %xmm1, 4(%rdi)
+; SSE4-NEXT:  LBB3_4: ## %else2
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: store_v2f32_v2i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: store_v2f32_v2i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX2-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: store_v2f32_v2i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-NEXT:    vmovups %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: store_v2f32_v2i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vmovups %xmm1, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
+; SSE2-LABEL: store_v2i32_v2i32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB4_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.store
+; SSE2-NEXT:    movd %xmm1, (%rdi)
+; SSE2-NEXT:  LBB4_2: ## %else
+; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB4_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.store1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, 4(%rdi)
+; SSE2-NEXT:  LBB4_4: ## %else2
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: store_v2i32_v2i32:
+; SSE4:       ## %bb.0:
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    movdqa %xmm0, %xmm3
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE4-NEXT:    pcmpeqq %xmm2, %xmm3
+; SSE4-NEXT:    pextrb $0, %xmm3, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB4_2
+; SSE4-NEXT:  ## %bb.1: ## %cond.store
+; SSE4-NEXT:    movss %xmm1, (%rdi)
+; SSE4-NEXT:  LBB4_2: ## %else
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE4-NEXT:    pcmpeqq %xmm2, %xmm0
+; SSE4-NEXT:    pextrb $8, %xmm0, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB4_4
+; SSE4-NEXT:  ## %bb.3: ## %cond.store1
+; SSE4-NEXT:    extractps $2, %xmm1, 4(%rdi)
+; SSE4-NEXT:  LBB4_4: ## %else2
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: store_v2i32_v2i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: store_v2i32_v2i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: store_v2i32_v2i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: store_v2i32_v2i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VLBW-NEXT:    vpmovqd %xmm1, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+define void @const_store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+; SSE-LABEL: const_store_v4i32_v4i32:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movups %xmm1, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: const_store_v4i32_v4i32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: const_store_v4i32_v4i32:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: const_store_v4i32_v4i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    movw $15, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: const_store_v4i32_v4i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512VLBW-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+;  When only one element of the mask is set, reduce to a scalar store.
+
+define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+; SSE-LABEL: one_mask_bit_set1:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movss %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: one_mask_bit_set1:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vmovss %xmm0, (%rdi)
+; AVX-NEXT:    retq
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+  ret void
+}
+
+; Choose a different element to show that the correct address offset is produced.
+
+define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+; SSE2-LABEL: one_mask_bit_set2:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movss %xmm0, 8(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: one_mask_bit_set2:
+; SSE4:       ## %bb.0:
+; SSE4-NEXT:    extractps $2, %xmm0, 8(%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: one_mask_bit_set2:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; AVX-NEXT:    retq
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
+  ret void
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
+; SSE-LABEL: one_mask_bit_set3:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movlps %xmm1, 16(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: one_mask_bit_set3:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vmovlps %xmm0, 16(%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
+  ret void
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
+; SSE-LABEL: one_mask_bit_set4:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movhpd %xmm1, 24(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: one_mask_bit_set4:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vmovhpd %xmm0, 24(%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
+  ret void
+}
+
+; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
+
+define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
+; SSE-LABEL: one_mask_bit_set5:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movlps %xmm3, 48(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: one_mask_bit_set5:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1OR2-NEXT:    vmovlps %xmm0, 48(%rdi)
+; AVX1OR2-NEXT:    vzeroupper
+; AVX1OR2-NEXT:    retq
+;
+; AVX512-LABEL: one_mask_bit_set5:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT:    vmovlps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
+  ret void
+}
+
+; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
+; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
+
+define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
+; SSE2-LABEL: trunc_mask:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB11_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.store
+; SSE2-NEXT:    movss %xmm0, (%rdi)
+; SSE2-NEXT:  LBB11_2: ## %else
+; SSE2-NEXT:    pextrw $2, %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB11_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.store1
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; SSE2-NEXT:    movss %xmm1, 4(%rdi)
+; SSE2-NEXT:  LBB11_4: ## %else2
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pextrw $4, %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB11_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.store3
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT:    movss %xmm2, 8(%rdi)
+; SSE2-NEXT:  LBB11_6: ## %else4
+; SSE2-NEXT:    pextrw $6, %xmm1, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB11_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.store5
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movss %xmm0, 12(%rdi)
+; SSE2-NEXT:  LBB11_8: ## %else6
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: trunc_mask:
+; SSE4:       ## %bb.0:
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE4-NEXT:    pextrb $0, %xmm1, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB11_2
+; SSE4-NEXT:  ## %bb.1: ## %cond.store
+; SSE4-NEXT:    movss %xmm0, (%rdi)
+; SSE4-NEXT:  LBB11_2: ## %else
+; SSE4-NEXT:    pextrb $4, %xmm1, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB11_4
+; SSE4-NEXT:  ## %bb.3: ## %cond.store1
+; SSE4-NEXT:    extractps $1, %xmm0, 4(%rdi)
+; SSE4-NEXT:  LBB11_4: ## %else2
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE4-NEXT:    pextrb $8, %xmm1, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB11_6
+; SSE4-NEXT:  ## %bb.5: ## %cond.store3
+; SSE4-NEXT:    extractps $2, %xmm0, 8(%rdi)
+; SSE4-NEXT:  LBB11_6: ## %else4
+; SSE4-NEXT:    pextrb $12, %xmm1, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB11_8
+; SSE4-NEXT:  ## %bb.7: ## %cond.store5
+; SSE4-NEXT:    extractps $3, %xmm0, 12(%rdi)
+; SSE4-NEXT:  LBB11_8: ## %else6
+; SSE4-NEXT:    retq
+;
+; AVX1OR2-LABEL: trunc_mask:
+; AVX1OR2:       ## %bb.0:
+; AVX1OR2-NEXT:    vmaskmovps %xmm0, %xmm2, (%rdi)
+; AVX1OR2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_mask:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vmovups %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: trunc_mask:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpcmpgtd %xmm2, %xmm1, %k1
+; AVX512VLBW-NEXT:    vmovups %xmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    retq
+  %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
+  ret void
+}
+
+; SimplifyDemandedBits eliminates an ashr here.
+
+define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x double>* %p, <4 x i32> %masksrc) {
+; SSE2-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB12_2
+; SSE2-NEXT:  ## %bb.1: ## %cond.store
+; SSE2-NEXT:    movlpd %xmm0, (%rdi)
+; SSE2-NEXT:  LBB12_2: ## %else
+; SSE2-NEXT:    pextrw $2, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB12_4
+; SSE2-NEXT:  ## %bb.3: ## %cond.store1
+; SSE2-NEXT:    movhpd %xmm0, 8(%rdi)
+; SSE2-NEXT:  LBB12_4: ## %else2
+; SSE2-NEXT:    pextrw $4, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB12_6
+; SSE2-NEXT:  ## %bb.5: ## %cond.store3
+; SSE2-NEXT:    movlpd %xmm1, 16(%rdi)
+; SSE2-NEXT:  LBB12_6: ## %else4
+; SSE2-NEXT:    pextrw $6, %xmm2, %eax
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    je LBB12_8
+; SSE2-NEXT:  ## %bb.7: ## %cond.store5
+; SSE2-NEXT:    movhpd %xmm1, 24(%rdi)
+; SSE2-NEXT:  LBB12_8: ## %else6
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; SSE4:       ## %bb.0:
+; SSE4-NEXT:    pextrb $0, %xmm2, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB12_2
+; SSE4-NEXT:  ## %bb.1: ## %cond.store
+; SSE4-NEXT:    movlpd %xmm0, (%rdi)
+; SSE4-NEXT:  LBB12_2: ## %else
+; SSE4-NEXT:    pextrb $4, %xmm2, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB12_4
+; SSE4-NEXT:  ## %bb.3: ## %cond.store1
+; SSE4-NEXT:    movhpd %xmm0, 8(%rdi)
+; SSE4-NEXT:  LBB12_4: ## %else2
+; SSE4-NEXT:    pextrb $8, %xmm2, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB12_6
+; SSE4-NEXT:  ## %bb.5: ## %cond.store3
+; SSE4-NEXT:    movlpd %xmm1, 16(%rdi)
+; SSE4-NEXT:  LBB12_6: ## %else4
+; SSE4-NEXT:    pextrb $12, %xmm2, %eax
+; SSE4-NEXT:    testb $1, %al
+; SSE4-NEXT:    je LBB12_8
+; SSE4-NEXT:  ## %bb.7: ## %cond.store5
+; SSE4-NEXT:    movhpd %xmm1, 24(%rdi)
+; SSE4-NEXT:  LBB12_8: ## %else6
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmaskmovpd %ymm0, %ymm1, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vmaskmovpd %ymm0, %ymm1, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512VLBW-NEXT:    vmovupd %ymm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+  %sext = sext <4 x i32> %masksrc to <4 x i64>
+  %boolmask = trunc <4 x i64> %sext to <4 x i1>
+  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %x, <4 x double>* %p, i32 4, <4 x i1> %boolmask)
+  ret void
+}
+
+; This needs to be widened to v4i32.
+; This used to assert in type legalization. PR38436
+; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask.
+define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
+; SSE2-LABEL: widen_masked_store:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    testb $1, %sil
+; SSE2-NEXT:    jne LBB13_1
+; SSE2-NEXT:  ## %bb.2: ## %else
+; SSE2-NEXT:    testb $1, %dl
+; SSE2-NEXT:    jne LBB13_3
+; SSE2-NEXT:  LBB13_4: ## %else2
+; SSE2-NEXT:    testb $1, %cl
+; SSE2-NEXT:    jne LBB13_5
+; SSE2-NEXT:  LBB13_6: ## %else4
+; SSE2-NEXT:    retq
+; SSE2-NEXT:  LBB13_1: ## %cond.store
+; SSE2-NEXT:    movd %xmm0, (%rdi)
+; SSE2-NEXT:    testb $1, %dl
+; SSE2-NEXT:    je LBB13_4
+; SSE2-NEXT:  LBB13_3: ## %cond.store1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, 4(%rdi)
+; SSE2-NEXT:    testb $1, %cl
+; SSE2-NEXT:    je LBB13_6
+; SSE2-NEXT:  LBB13_5: ## %cond.store3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, 8(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: widen_masked_store:
+; SSE4:       ## %bb.0:
+; SSE4-NEXT:    testb $1, %sil
+; SSE4-NEXT:    jne LBB13_1
+; SSE4-NEXT:  ## %bb.2: ## %else
+; SSE4-NEXT:    testb $1, %dl
+; SSE4-NEXT:    jne LBB13_3
+; SSE4-NEXT:  LBB13_4: ## %else2
+; SSE4-NEXT:    testb $1, %cl
+; SSE4-NEXT:    jne LBB13_5
+; SSE4-NEXT:  LBB13_6: ## %else4
+; SSE4-NEXT:    retq
+; SSE4-NEXT:  LBB13_1: ## %cond.store
+; SSE4-NEXT:    movss %xmm0, (%rdi)
+; SSE4-NEXT:    testb $1, %dl
+; SSE4-NEXT:    je LBB13_4
+; SSE4-NEXT:  LBB13_3: ## %cond.store1
+; SSE4-NEXT:    extractps $1, %xmm0, 4(%rdi)
+; SSE4-NEXT:    testb $1, %cl
+; SSE4-NEXT:    je LBB13_6
+; SSE4-NEXT:  LBB13_5: ## %cond.store3
+; SSE4-NEXT:    extractps $2, %xmm0, 8(%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: widen_masked_store:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vmovd %edx, %xmm1
+; AVX1-NEXT:    vmovd %esi, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT:    vmovd %ecx, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vmaskmovps %xmm0, %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: widen_masked_store:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vmovd %edx, %xmm1
+; AVX2-NEXT:    vmovd %esi, %xmm2
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-NEXT:    vmovd %ecx, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpmaskmovd %xmm0, %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: widen_masked_store:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLBW-LABEL: widen_masked_store:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512VLBW-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vmovdqa32 %xmm1, %xmm1 {%k1} {z}
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512VLBW-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    retq
+  call void @llvm.masked.store.v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.store.v3i32(<3 x i32>, <3 x i32>*, i32, <3 x i1>)
+
+declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
+
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 1b4dd630244c8104d930068b7727fbb1492b6d8e..0bb46eeeac92bdb7c4fa13f7f03a604365d30447 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -359,6 +359,33 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
   ret i1 %c
 }
 
+define i1 @length7_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length7_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $7
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length7_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $7, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
 define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length8:
 ; X86:       # %bb.0:
@@ -370,7 +397,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB13_2
+; X86-NEXT:    jne .LBB14_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 4(%esi), %ecx
 ; X86-NEXT:    movl 4(%eax), %edx
@@ -378,13 +405,13 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB13_3
-; X86-NEXT:  .LBB13_2: # %res_block
+; X86-NEXT:    je .LBB14_3
+; X86-NEXT:  .LBB14_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
-; X86-NEXT:  .LBB13_3: # %endblock
+; X86-NEXT:  .LBB14_3: # %endblock
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -450,6 +477,89 @@ define i1 @length8_eq_const(i8* %X) nounwind {
   ret i1 %c
 }
 
+define i1 @length9_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length9_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $9
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length9_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    xorq (%rsi), %rax
+; X64-NEXT:    movb 8(%rdi), %cl
+; X64-NEXT:    xorb 8(%rsi), %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length10_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length10_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $10
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length10_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    xorq (%rsi), %rax
+; X64-NEXT:    movzwl 8(%rdi), %ecx
+; X64-NEXT:    xorw 8(%rsi), %cx
+; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length11_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length11_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $11
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length11_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $11, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
 define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length12_eq:
 ; X86:       # %bb.0:
@@ -495,7 +605,7 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB17_2
+; X64-NEXT:    jne .LBB21_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
@@ -503,18 +613,99 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB17_3
-; X64-NEXT:  .LBB17_2: # %res_block
+; X64-NEXT:    je .LBB21_3
+; X64-NEXT:  .LBB21_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    leal -1(%rax,%rax), %eax
-; X64-NEXT:  .LBB17_3: # %endblock
+; X64-NEXT:  .LBB21_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
   ret i32 %m
 }
 
+define i1 @length13_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length13_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $13
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length13_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $13, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length14_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $14
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length14_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $14, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length15_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $15
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length15_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $15, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
 ; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
 
 define i32 @length16(i8* %X, i8* %Y) nounwind {
@@ -535,7 +726,7 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB18_2
+; X64-NEXT:    jne .LBB25_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rcx
 ; X64-NEXT:    movq 8(%rsi), %rdx
@@ -543,13 +734,13 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB18_3
-; X64-NEXT:  .LBB18_2: # %res_block
+; X64-NEXT:    je .LBB25_3
+; X64-NEXT:  .LBB25_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    leal -1(%rax,%rax), %eax
-; X64-NEXT:  .LBB18_3: # %endblock
+; X64-NEXT:  .LBB25_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
   ret i32 %m
@@ -1100,5 +1291,3 @@ define i32 @huge_length(i8* %X, i8* %Y) nounwind {
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
   ret i32 %m
 }
-
-
diff --git a/test/CodeGen/X86/memcpy-from-string.ll b/test/CodeGen/X86/memcpy-from-string.ll
index 8e2444ebe0e9eacbbb58f4f75f2daf8e274b5e08..af88ffa843dfd0406655947b64fc6cb75f2e8ab5 100644
--- a/test/CodeGen/X86/memcpy-from-string.ll
+++ b/test/CodeGen/X86/memcpy-from-string.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s --check-prefix=X86
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -12,11 +13,12 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Memcpy lowering should emit stores of immediates containing string data from
 ; the correct offsets.
 
-; CHECK-LABEL: foo:
-; CHECK: movb  $0, 6(%rdi)
-; CHECK: movw  $15212, 4(%rdi)
-; CHECK: movl  $1802117222, (%rdi)
 define void @foo(i8* %tmp2) {
+; X86-LABEL: foo:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $3894379, 3(%rdi) # imm = 0x3B6C6B
+; X86-NEXT:    movl $1802117222, (%rdi) # imm = 0x6B6A2066
+; X86-NEXT:    retq
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* getelementptr inbounds ([10 x i8], [10 x i8]* @0, i64 0, i64 3), i64 7, i1 false)
   ret void
 }
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index a0511f2804a34b41c22021816db13dc43e4eeb5e..0b91628ef3e95555f7d5d6565e13a0cde22b0e59 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -10,6 +10,7 @@ define fastcc void @t1() nounwind {
 ; CHECK-NEXT:    pushl $0
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $16, %esp
+; CHECK-NEXT:    ud2
 entry:
   call void @llvm.memset.p0i8.i32(i8* null, i8 0, i32 188, i1 false)
   unreachable
@@ -22,6 +23,7 @@ define fastcc void @t2(i8 signext %c) nounwind {
 ; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $76, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    calll _memset
+; CHECK-NEXT:    ud2
 entry:
   call void @llvm.memset.p0i8.i32(i8* undef, i8 %c, i32 76, i1 false)
   unreachable
@@ -49,11 +51,10 @@ define void @t4(i8* nocapture %s, i8 %a) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    imull $16843009, %ecx, %ecx ## imm = 0x1010101
+; CHECK-NEXT:    movl %ecx, 11(%eax)
 ; CHECK-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-NEXT:    movl %ecx, 4(%eax)
 ; CHECK-NEXT:    movl %ecx, (%eax)
-; CHECK-NEXT:    movw %cx, 12(%eax)
-; CHECK-NEXT:    movb %cl, 14(%eax)
 ; CHECK-NEXT:    retl
 entry:
   tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 15, i1 false)
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index 199bf75a00e9d1ece452b80d80b55fcdf7c071e0..c5b701b84d46c275756bc55760834ccfc7579961 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -144,8 +144,8 @@ define void @memset_256_nonzero_bytes(i8* %x) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 16
-; SSE-NEXT:    movl $42, %esi
 ; SSE-NEXT:    movl $256, %edx # imm = 0x100
+; SSE-NEXT:    movl $42, %esi
 ; SSE-NEXT:    callq memset
 ; SSE-NEXT:    popq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 8
diff --git a/test/CodeGen/X86/memset-zero.ll b/test/CodeGen/X86/memset-zero.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f2f0d0730f56490adbf772c474830ab2675db9ab
--- /dev/null
+++ b/test/CodeGen/X86/memset-zero.ll
@@ -0,0 +1,344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=core2 | FileCheck %s --check-prefix=CORE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
+define void @memset_0(i8* %a) nounwind  {
+; X86-LABEL: memset_0:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_0:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_0:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 0, i1 false)
+	ret void
+}
+
+define void @memset_4(i8* %a) nounwind  {
+; X86-LABEL: memset_4:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_4:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movl $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_4:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    movl $0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 4, i1 false)
+	ret void
+}
+
+define void @memset_5(i8* %a) nounwind  {
+; X86-LABEL: memset_5:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_5:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movb $0, 4(%rdi)
+; CORE2-NEXT:    movl $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_5:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    movb $0, 4(%rdi)
+; NEHALEM-NEXT:    movl $0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 5, i1 false)
+	ret void
+}
+
+define void @memset_7(i8* %a) nounwind  {
+; X86-LABEL: memset_7:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, 3(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_7:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movl $0, 3(%rdi)
+; CORE2-NEXT:    movl $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_7:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    movl $0, 3(%rdi)
+; NEHALEM-NEXT:    movl $0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 7, i1 false)
+	ret void
+}
+
+define void @memset_8(i8* %a) nounwind  {
+; X86-LABEL: memset_8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_8:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_8:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    movq $0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 8, i1 false)
+	ret void
+}
+
+define void @memset_11(i8* %a) nounwind  {
+; X86-LABEL: memset_11:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, 7(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_11:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movl $0, 7(%rdi)
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_11:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    movl $0, 7(%rdi)
+; NEHALEM-NEXT:    movq $0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 11, i1 false)
+	ret void
+}
+
+define void @memset_13(i8* %a) nounwind  {
+; X86-LABEL: memset_13:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb $0, 12(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_13:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movq $0, 5(%rdi)
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_13:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    movq $0, 5(%rdi)
+; NEHALEM-NEXT:    movq $0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 13, i1 false)
+	ret void
+}
+
+define void @memset_15(i8* %a) nounwind  {
+; X86-LABEL: memset_15:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, 11(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_15:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movq $0, 7(%rdi)
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_15:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    movq $0, 7(%rdi)
+; NEHALEM-NEXT:    movq $0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 15, i1 false)
+	ret void
+}
+
+define void @memset_16(i8* %a) nounwind  {
+; X86-LABEL: memset_16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, 12(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_16:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movq $0, 8(%rdi)
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_16:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    xorps %xmm0, %xmm0
+; NEHALEM-NEXT:    movups %xmm0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 16, i1 false)
+	ret void
+}
+
+define void @memset_17(i8* %a) nounwind  {
+; X86-LABEL: memset_17:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb $0, 16(%eax)
+; X86-NEXT:    movl $0, 12(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_17:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movb $0, 16(%rdi)
+; CORE2-NEXT:    movq $0, 8(%rdi)
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_17:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    xorps %xmm0, %xmm0
+; NEHALEM-NEXT:    movups %xmm0, (%rdi)
+; NEHALEM-NEXT:    movb $0, 16(%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 17, i1 false)
+	ret void
+}
+
+define void @memset_19(i8* %a) nounwind  {
+; X86-LABEL: memset_19:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, 15(%eax)
+; X86-NEXT:    movl $0, 12(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_19:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movl $0, 15(%rdi)
+; CORE2-NEXT:    movq $0, 8(%rdi)
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_19:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    xorps %xmm0, %xmm0
+; NEHALEM-NEXT:    movups %xmm0, (%rdi)
+; NEHALEM-NEXT:    movl $0, 15(%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 19, i1 false)
+	ret void
+}
+
+define void @memset_31(i8* %a) nounwind  {
+; X86-LABEL: memset_31:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, 27(%eax)
+; X86-NEXT:    movl $0, 24(%eax)
+; X86-NEXT:    movl $0, 20(%eax)
+; X86-NEXT:    movl $0, 16(%eax)
+; X86-NEXT:    movl $0, 12(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_31:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movq $0, 23(%rdi)
+; CORE2-NEXT:    movq $0, 16(%rdi)
+; CORE2-NEXT:    movq $0, 8(%rdi)
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_31:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    xorps %xmm0, %xmm0
+; NEHALEM-NEXT:    movups %xmm0, 15(%rdi)
+; NEHALEM-NEXT:    movups %xmm0, (%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 31, i1 false)
+	ret void
+}
+
+define void @memset_35(i8* %a) nounwind  {
+; X86-LABEL: memset_35:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, 31(%eax)
+; X86-NEXT:    movl $0, 28(%eax)
+; X86-NEXT:    movl $0, 24(%eax)
+; X86-NEXT:    movl $0, 20(%eax)
+; X86-NEXT:    movl $0, 16(%eax)
+; X86-NEXT:    movl $0, 12(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; CORE2-LABEL: memset_35:
+; CORE2:       # %bb.0: # %entry
+; CORE2-NEXT:    movl $0, 31(%rdi)
+; CORE2-NEXT:    movq $0, 24(%rdi)
+; CORE2-NEXT:    movq $0, 16(%rdi)
+; CORE2-NEXT:    movq $0, 8(%rdi)
+; CORE2-NEXT:    movq $0, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: memset_35:
+; NEHALEM:       # %bb.0: # %entry
+; NEHALEM-NEXT:    xorps %xmm0, %xmm0
+; NEHALEM-NEXT:    movups %xmm0, 16(%rdi)
+; NEHALEM-NEXT:    movups %xmm0, (%rdi)
+; NEHALEM-NEXT:    movl $0, 31(%rdi)
+; NEHALEM-NEXT:    retq
+entry:
+	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 35, i1 false)
+	ret void
+}
diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll
index 9fc12e6a094035c5649cba592a0cdb782b7f717b..70ce1809967da6a84cb9c056051a40d54d1b1539 100644
--- a/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/test/CodeGen/X86/min-legal-vector-width.ll
@@ -588,12 +588,12 @@ define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru)
 ; CHECK-LABEL: test_16f32toub_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
-; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
-; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovdw %ymm2, %xmm2
-; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT:    vpsllw $15, %ymm1, %ymm1
-; CHECK-NEXT:    vpmovw2m %ymm1, %k1
+; CHECK-NEXT:    vpslld $31, %ymm1, %ymm1
+; CHECK-NEXT:    vpmovd2m %ymm1, %k0
+; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm1
+; CHECK-NEXT:    vpslld $31, %ymm1, %ymm1
+; CHECK-NEXT:    vpmovd2m %ymm1, %k1
+; CHECK-NEXT:    kunpckbw %k0, %k1, %k1
 ; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %a = load <16 x float>, <16 x float>* %ptr
@@ -620,12 +620,10 @@ define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru)
 ; CHECK-LABEL: test_16f32tosb_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
-; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
-; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovdw %ymm2, %xmm2
-; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT:    vpsllw $15, %ymm1, %ymm1
-; CHECK-NEXT:    vpmovw2m %ymm1, %k1
+; CHECK-NEXT:    vpmovd2m %ymm1, %k0
+; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm1
+; CHECK-NEXT:    vpmovd2m %ymm1, %k1
+; CHECK-NEXT:    kunpckbw %k0, %k1, %k1
 ; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %a = load <16 x float>, <16 x float>* %ptr
@@ -646,3 +644,65 @@ define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru)
   %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
   ret <16 x i16> %select
 }
+
+define void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="256" {
+; CHECK-LABEL: mul256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
+; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; CHECK-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; CHECK-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
+; CHECK-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; CHECK-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; CHECK-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; CHECK-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
+; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %d = load <64 x i8>, <64 x i8>* %a
+  %e = load <64 x i8>, <64 x i8>* %b
+  %f = mul <64 x i8> %d, %e
+  store <64 x i8> %f, <64 x i8>* %c
+  ret void
+}
+
+define void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="512" {
+; CHECK-LABEL: mul512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm1
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; CHECK-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; CHECK-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; CHECK-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; CHECK-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %d = load <64 x i8>, <64 x i8>* %a
+  %e = load <64 x i8>, <64 x i8>* %b
+  %f = mul <64 x i8> %d, %e
+  store <64 x i8> %f, <64 x i8>* %c
+  ret void
+}
diff --git a/test/CodeGen/X86/mingw-comdats.ll b/test/CodeGen/X86/mingw-comdats.ll
index 2e9ebd8c9fc4c4bab3a5e0dc92bfe877eb1bffca..35f4fd12670c219f9d6bbf18f160b5e390f12fd9 100644
--- a/test/CodeGen/X86/mingw-comdats.ll
+++ b/test/CodeGen/X86/mingw-comdats.ll
@@ -1,13 +1,14 @@
-; RUN: llc -mtriple=x86_64-windows-itanium < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU
-; RUN: llc -mtriple=i686-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU32
-; RUN: llc -mtriple=x86_64-w64-windows-gnu < %s -filetype=obj | llvm-objdump - -headers | FileCheck %s --check-prefix=GNUOBJ
+; RUN: llc -function-sections -mtriple=x86_64-windows-itanium < %s | FileCheck %s
+; RUN: llc -function-sections -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+; RUN: llc -function-sections -mtriple=x86_64-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU
+; RUN: llc -function-sections -mtriple=i686-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU32
+; RUN: llc -function-sections -mtriple=x86_64-w64-windows-gnu < %s -filetype=obj | llvm-objdump - -headers | FileCheck %s --check-prefix=GNUOBJ
 
 ; GCC and MSVC handle comdats completely differently. Make sure we do the right
 ; thing for each.
 
-; Generated with this C++ source:
+; Modeled on this C++ source, with additional modifications for
+; -ffunction-sections:
 ; int bar(int);
 ; __declspec(selectany) int gv = 42;
 ; inline int foo(int x) { return bar(x) + gv; }
@@ -26,8 +27,24 @@ entry:
   ret i32 %call
 }
 
+; CHECK: .section        .text,"xr",one_only,main
 ; CHECK: main:
+; GNU: .section        .text$main,"xr",one_only,main
 ; GNU: main:
+; GNU32: .section        .text$main,"xr",one_only,_main
+; GNU32: _main:
+
+define dso_local x86_fastcallcc i32 @fastcall(i32 %x, i32 %y) {
+  %rv = add i32 %x, %y
+  ret i32 %rv
+}
+
+; CHECK: .section        .text,"xr",one_only,fastcall
+; CHECK: fastcall:
+; GNU: .section        .text$fastcall,"xr",one_only,fastcall
+; GNU: fastcall:
+; GNU32: .section        .text$fastcall,"xr",one_only,@fastcall@8
+; GNU32: @fastcall@8:
 
 ; Function Attrs: inlinehint uwtable
 define linkonce_odr dso_local i32 @_Z3fooi(i32 %x) #1 comdat {
@@ -50,9 +67,9 @@ entry:
 ; GNU: gv:
 ; GNU: .long 42
 
-; GNU32: .section        .text$__Z3fooi,"xr",discard,__Z3fooi
+; GNU32: .section        .text$_Z3fooi,"xr",discard,__Z3fooi
 ; GNU32: __Z3fooi:
-; GNU32: .section        .data$_gv,"dw",discard,_gv
+; GNU32: .section        .data$gv,"dw",discard,_gv
 ; GNU32: _gv:
 ; GNU32: .long 42
 
diff --git a/test/CodeGen/X86/misched-code-difference-with-debug.ll b/test/CodeGen/X86/misched-code-difference-with-debug.ll
index 53cf0157059b6f61173626536560179264fdb5fc..ad7c1b800ff196bd51c33ef721376ed4d971eb5f 100644
--- a/test/CodeGen/X86/misched-code-difference-with-debug.ll
+++ b/test/CodeGen/X86/misched-code-difference-with-debug.ll
@@ -28,7 +28,7 @@ source_filename = "test/CodeGen/X86/misched-code-difference-with-debug.ll"
 declare i32 @test_function(%class.C*, i8 signext, i8 signext, i8 signext, ...)
 ; CHECK-LABEL: test_without_debug
 ; CHECK: movl [[A:%[a-z]+]], [[B:%[a-z]+]]
-; CHECK-NEXT: movl [[A]], [[C:%[a-z]+]]
+; CHECK: movl [[A]], [[C:%[a-z]+]]
 
 define void @test_without_debug() {
 entry:
@@ -42,7 +42,7 @@ entry:
 }
 ; CHECK-LABEL: test_with_debug
 ; CHECK: movl [[A]], [[B]]
-; CHECK-NEXT: movl [[A]], [[C]]
+; CHECK: movl [[A]], [[C]]
 
 define void @test_with_debug() !dbg !17 {
 entry:
diff --git a/test/CodeGen/X86/misched_phys_reg_assign_order.ll b/test/CodeGen/X86/misched_phys_reg_assign_order.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d9548b98839087d08a84368da8119e22c9c5a7d9
--- /dev/null
+++ b/test/CodeGen/X86/misched_phys_reg_assign_order.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O2 -mtriple=i686-unknown-linux-gnu -o - | FileCheck %s
+
+@f = global i8* zeroinitializer
+
+; PR39391 - The load of %v1 should be scheduled before the zeroing of the A-D registers.
+
+define void @g() #0 {
+; CHECK-LABEL: g:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    .cfi_def_cfa_register %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    .cfi_offset %esi, -16
+; CHECK-NEXT:    .cfi_offset %ebx, -12
+; CHECK-NEXT:    movl f, %esi
+; CHECK-NEXT:    movb (%esi), %al
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    lock cmpxchg8b (%esi)
+; CHECK-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %k.end
+; CHECK-NEXT:  .LBB0_1: # %.
+; CHECK-NEXT:    calll m
+entry:
+  %p = load i8*, i8** @f
+  %v1 = load atomic i8, i8* %p monotonic, align 1
+  %d.h.h.h.h.h = bitcast i8* %p to i64*
+  %v2 = load atomic i64, i64* %d.h.h.h.h.h monotonic, align 8
+  %j.h = icmp eq i8 %v1, 0
+  br i1 %j.h, label %k.end, label %.
+
+.:                                                ; preds = %entry
+  %v3 = call i32 @m()
+  unreachable
+
+k.end:                                            ; preds = %entry
+  unreachable
+}
+
+declare i32 @m()
+
+attributes #0 = { "no-frame-pointer-elim-non-leaf" }
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index 60735fba4cd71f63f754cc096a19be4ce06945d4..5df75efb1ea7ea4fd93f988fcfb9a092f049fbd0 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -14,69 +14,59 @@ define void @test0(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    .cfi_def_cfa_register %ebp
 ; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    subl $16, %esp
 ; X32-NEXT:    movl 12(%ebp), %ecx
 ; X32-NEXT:    movl 8(%ebp), %eax
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT:    paddw %xmm0, %xmm1
-; X32-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; X32-NEXT:    pand %xmm0, %xmm1
-; X32-NEXT:    packuswb %xmm1, %xmm1
-; X32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; X32-NEXT:    paddb %xmm0, %xmm1
+; X32-NEXT:    movdq2q %xmm1, %mm0
 ; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    paddsb (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    paddusb (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    movq %mm0, (%eax)
-; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-NEXT:    psubw %xmm2, %xmm1
-; X32-NEXT:    pand %xmm0, %xmm1
-; X32-NEXT:    packuswb %xmm1, %xmm1
-; X32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
-; X32-NEXT:    movq %xmm1, (%eax)
+; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:    psubb %xmm1, %xmm0
+; X32-NEXT:    movdq2q %xmm0, %mm0
+; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    psubsb (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    psubusb (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%esp)
+; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT:    movq %mm0, (%eax)
+; X32-NEXT:    pmullw %xmm0, %xmm1
+; X32-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; X32-NEXT:    movdqa %xmm1, %xmm2
+; X32-NEXT:    pand %xmm0, %xmm2
+; X32-NEXT:    packuswb %xmm2, %xmm2
+; X32-NEXT:    movq %xmm2, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-NEXT:    pmullw %xmm1, %xmm2
+; X32-NEXT:    pand %xmm1, %xmm2
 ; X32-NEXT:    movdqa %xmm2, %xmm1
 ; X32-NEXT:    pand %xmm0, %xmm1
 ; X32-NEXT:    packuswb %xmm1, %xmm1
 ; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT:    pand %xmm2, %xmm1
+; X32-NEXT:    por %xmm2, %xmm1
 ; X32-NEXT:    movdqa %xmm1, %xmm2
 ; X32-NEXT:    pand %xmm0, %xmm2
 ; X32-NEXT:    packuswb %xmm2, %xmm2
 ; X32-NEXT:    movq %xmm2, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-NEXT:    por %xmm1, %xmm2
-; X32-NEXT:    movdqa %xmm2, %xmm1
-; X32-NEXT:    pand %xmm0, %xmm1
-; X32-NEXT:    packuswb %xmm1, %xmm1
-; X32-NEXT:    movq %xmm1, (%eax)
-; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT:    pxor %xmm2, %xmm1
-; X32-NEXT:    pand %xmm0, %xmm1
-; X32-NEXT:    packuswb %xmm1, %xmm1
-; X32-NEXT:    movq %xmm1, (%eax)
+; X32-NEXT:    pxor %xmm1, %xmm2
+; X32-NEXT:    pand %xmm0, %xmm2
+; X32-NEXT:    packuswb %xmm2, %xmm2
+; X32-NEXT:    movq %xmm2, (%eax)
 ; X32-NEXT:    emms
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
@@ -86,65 +76,55 @@ define void @test0(x86_mmx* %A, x86_mmx* %B) {
 ; X64-LABEL: test0:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT:    paddw %xmm0, %xmm1
-; X64-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; X64-NEXT:    pand %xmm0, %xmm1
-; X64-NEXT:    packuswb %xmm1, %xmm1
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
+; X64-NEXT:    paddb %xmm0, %xmm1
+; X64-NEXT:    movdq2q %xmm1, %mm0
 ; X64-NEXT:    movq %xmm1, (%rdi)
 ; X64-NEXT:    paddsb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    paddusb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    movq %mm0, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT:    psubw %xmm2, %xmm1
-; X64-NEXT:    pand %xmm0, %xmm1
-; X64-NEXT:    packuswb %xmm1, %xmm1
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
-; X64-NEXT:    movq %xmm1, (%rdi)
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    psubb %xmm1, %xmm0
+; X64-NEXT:    movdq2q %xmm0, %mm0
+; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    psubsb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    psubusb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    pmullw %xmm0, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; X64-NEXT:    movdqa %xmm1, %xmm2
+; X64-NEXT:    pand %xmm0, %xmm2
+; X64-NEXT:    packuswb %xmm2, %xmm2
+; X64-NEXT:    movq %xmm2, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT:    pmullw %xmm1, %xmm2
+; X64-NEXT:    pand %xmm1, %xmm2
 ; X64-NEXT:    movdqa %xmm2, %xmm1
 ; X64-NEXT:    pand %xmm0, %xmm1
 ; X64-NEXT:    packuswb %xmm1, %xmm1
 ; X64-NEXT:    movq %xmm1, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT:    pand %xmm2, %xmm1
+; X64-NEXT:    por %xmm2, %xmm1
 ; X64-NEXT:    movdqa %xmm1, %xmm2
 ; X64-NEXT:    pand %xmm0, %xmm2
 ; X64-NEXT:    packuswb %xmm2, %xmm2
 ; X64-NEXT:    movq %xmm2, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT:    por %xmm1, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
-; X64-NEXT:    pand %xmm0, %xmm1
-; X64-NEXT:    packuswb %xmm1, %xmm1
-; X64-NEXT:    movq %xmm1, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT:    pxor %xmm2, %xmm1
-; X64-NEXT:    pand %xmm0, %xmm1
-; X64-NEXT:    packuswb %xmm1, %xmm1
-; X64-NEXT:    movq %xmm1, (%rdi)
+; X64-NEXT:    pxor %xmm1, %xmm2
+; X64-NEXT:    pand %xmm0, %xmm2
+; X64-NEXT:    packuswb %xmm2, %xmm2
+; X64-NEXT:    movq %xmm2, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
@@ -315,58 +295,34 @@ define void @test2(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    .cfi_def_cfa_register %ebp
 ; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    subl $24, %esp
 ; X32-NEXT:    movl 12(%ebp), %ecx
 ; X32-NEXT:    movl 8(%ebp), %eax
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-NEXT:    paddd %xmm0, %xmm1
-; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
-; X32-NEXT:    movq %xmm0, (%eax)
+; X32-NEXT:    paddw %xmm0, %xmm1
+; X32-NEXT:    movdq2q %xmm1, %mm0
+; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    paddsw (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    paddusw (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-NEXT:    psubd %xmm1, %xmm0
-; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; X32-NEXT:    psubw %xmm1, %xmm0
+; X32-NEXT:    movdq2q %xmm0, %mm0
 ; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    psubsw (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    psubusw (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-NEXT:    pmuludq %xmm1, %xmm0
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-NEXT:    pmuludq %xmm2, %xmm1
-; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
-; X32-NEXT:    movq %xmm0, (%eax)
+; X32-NEXT:    pmullw %xmm0, %xmm1
+; X32-NEXT:    movdq2q %xmm1, %mm0
+; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    pmulhw (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    pmaddwd (%ecx), %mm0
@@ -404,54 +360,30 @@ define void @test2(x86_mmx* %A, x86_mmx* %B) {
 ; X64-LABEL: test2:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT:    paddd %xmm0, %xmm1
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
-; X64-NEXT:    movq %xmm0, (%rdi)
+; X64-NEXT:    paddw %xmm0, %xmm1
+; X64-NEXT:    movdq2q %xmm1, %mm0
+; X64-NEXT:    movq %xmm1, (%rdi)
 ; X64-NEXT:    paddsw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    paddusw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT:    psubd %xmm1, %xmm0
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
+; X64-NEXT:    psubw %xmm1, %xmm0
+; X64-NEXT:    movdq2q %xmm0, %mm0
 ; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    psubsw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    psubusw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-NEXT:    pmuludq %xmm1, %xmm0
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-NEXT:    pmuludq %xmm2, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
-; X64-NEXT:    movq %xmm0, (%rdi)
+; X64-NEXT:    pmullw %xmm0, %xmm1
+; X64-NEXT:    movdq2q %xmm1, %mm0
+; X64-NEXT:    movq %xmm1, (%rdi)
 ; X64-NEXT:    pmulhw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    pmaddwd (%rsi), %mm0
diff --git a/test/CodeGen/X86/mmx-schedule.ll b/test/CodeGen/X86/mmx-schedule.ll
index 6a8a487d7c1e35df9aa9d51674c1e2914d55a560..51dc5e102ff1578aa59144b24d37e4623ba23cf3 100644
--- a/test/CodeGen/X86/mmx-schedule.ll
+++ b/test/CodeGen/X86/mmx-schedule.ll
@@ -787,7 +787,7 @@ define i32 @test_movd(x86_mmx %a0, i32 %a1, i32 *%a2) {
 ; BDVER2-NEXT:    paddd %mm2, %mm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    movd %mm2, %ecx # sched: [10:1.00]
 ; BDVER2-NEXT:    movd %mm0, %eax # sched: [10:1.00]
-; BDVER2-NEXT:    movl %ecx, (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    movl %ecx, (%rsi) # sched: [1:1.00]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movd:
diff --git a/test/CodeGen/X86/movmsk-cmp.ll b/test/CodeGen/X86/movmsk-cmp.ll
index 452676e19db4ffc522bc5bda8308fc18631b16dd..718ade024356687b2a7ac2c927ecea56f7be6073 100644
--- a/test/CodeGen/X86/movmsk-cmp.ll
+++ b/test/CodeGen/X86/movmsk-cmp.ll
@@ -1101,24 +1101,21 @@ define i1 @allones_v4i64_sign(<4 x i64> %arg) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    packssdw %xmm3, %xmm1
-; SSE2-NEXT:    movmskps %xmm1, %eax
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    cmpb $15, %al
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -1164,24 +1161,21 @@ define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    packssdw %xmm3, %xmm1
-; SSE2-NEXT:    movmskps %xmm1, %eax
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    testb %al, %al
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -4830,24 +4824,21 @@ define i32 @movmskpd256(<4 x double> %x) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    packssdw %xmm3, %xmm1
-; SSE2-NEXT:    movmskps %xmm1, %eax
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: movmskpd256:
diff --git a/test/CodeGen/X86/movmsk.ll b/test/CodeGen/X86/movmsk.ll
index 4a724f15c4ab62de6f583e9de99a1a0f8edd50f7..eebcf1811965f7cb961d7fda412f7f06523e0b66 100644
--- a/test/CodeGen/X86/movmsk.ll
+++ b/test/CodeGen/X86/movmsk.ll
@@ -112,6 +112,40 @@ entry:
 }
 declare void @float_call_signbit_callee(i1 zeroext)
 
+; Known zeros
+define i32 @knownbits_v2f64(<2 x double> %x) {
+; CHECK-LABEL: knownbits_v2f64:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movmskpd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %1 = tail call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %x)
+  %2 = and i32 %1, 3
+  ret i32 %2
+}
+
+; Don't demand any movmsk signbits -> zero
+define i32 @demandedbits_v16i8(<16 x i8> %x) {
+; CHECK-LABEL: demandedbits_v16i8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+  %1 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %x)
+  %2 = and i32 %1, 65536
+  ret i32 %2
+}
+
+; Simplify demanded vector elts
+define i32 @demandedelts_v4f32(<4 x float> %x) {
+; CHECK-LABEL: demandedelts_v4f32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movmskps %xmm0, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> zeroinitializer
+  %2 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %1)
+  %3 = and i32 %2, 1
+  ret i32 %3
+}
 
 ; rdar://10247336
 ; movmskp{s|d} only set low 4/2 bits, high bits are known zero
@@ -145,5 +179,6 @@ entry:
   ret i32 %2
 }
 
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/mul-constant-i8.ll b/test/CodeGen/X86/mul-constant-i8.ll
new file mode 100644
index 0000000000000000000000000000000000000000..43afd2462a40878e8cce8b7c9524d43b84c621ba
--- /dev/null
+++ b/test/CodeGen/X86/mul-constant-i8.ll
@@ -0,0 +1,485 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
+
+define i8 @test_mul_by_1(i8 %x) {
+; X64-LABEL: test_mul_by_1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 1
+  ret i8 %m
+}
+
+define i8 @test_mul_by_2(i8 %x) {
+; X64-LABEL: test_mul_by_2:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 2
+  ret i8 %m
+}
+
+define i8 @test_mul_by_3(i8 %x) {
+; X64-LABEL: test_mul_by_3:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 3
+  ret i8 %m
+}
+
+define i8 @test_mul_by_4(i8 %x) {
+; X64-LABEL: test_mul_by_4:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb $2, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 4
+  ret i8 %m
+}
+
+define i8 @test_mul_by_5(i8 %x) {
+; X64-LABEL: test_mul_by_5:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 5
+  ret i8 %m
+}
+
+define i8 @test_mul_by_6(i8 %x) {
+; X64-LABEL: test_mul_by_6:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 6
+  ret i8 %m
+}
+
+define i8 @test_mul_by_7(i8 %x) {
+; X64-LABEL: test_mul_by_7:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (,%rdi,8), %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 7
+  ret i8 %m
+}
+
+define i8 @test_mul_by_8(i8 %x) {
+; X64-LABEL: test_mul_by_8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb $3, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 8
+  ret i8 %m
+}
+
+define i8 @test_mul_by_9(i8 %x) {
+; X64-LABEL: test_mul_by_9:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 9
+  ret i8 %m
+}
+
+define i8 @test_mul_by_10(i8 %x) {
+; X64-LABEL: test_mul_by_10:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 10
+  ret i8 %m
+}
+
+define i8 @test_mul_by_11(i8 %x) {
+; X64-LABEL: test_mul_by_11:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rdi,%rax,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 11
+  ret i8 %m
+}
+
+define i8 @test_mul_by_12(i8 %x) {
+; X64-LABEL: test_mul_by_12:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 12
+  ret i8 %m
+}
+
+define i8 @test_mul_by_13(i8 %x) {
+; X64-LABEL: test_mul_by_13:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 13
+  ret i8 %m
+}
+
+define i8 @test_mul_by_14(i8 %x) {
+; X64-LABEL: test_mul_by_14:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 14
+  ret i8 %m
+}
+
+define i8 @test_mul_by_15(i8 %x) {
+; X64-LABEL: test_mul_by_15:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 15
+  ret i8 %m
+}
+
+define i8 @test_mul_by_16(i8 %x) {
+; X64-LABEL: test_mul_by_16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb $4, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 16
+  ret i8 %m
+}
+
+define i8 @test_mul_by_17(i8 %x) {
+; X64-LABEL: test_mul_by_17:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    leal (%rax,%rdi), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 17
+  ret i8 %m
+}
+
+define i8 @test_mul_by_18(i8 %x) {
+; X64-LABEL: test_mul_by_18:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 18
+  ret i8 %m
+}
+
+define i8 @test_mul_by_19(i8 %x) {
+; X64-LABEL: test_mul_by_19:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rdi,%rax,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 19
+  ret i8 %m
+}
+
+define i8 @test_mul_by_20(i8 %x) {
+; X64-LABEL: test_mul_by_20:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 20
+  ret i8 %m
+}
+
+define i8 @test_mul_by_21(i8 %x) {
+; X64-LABEL: test_mul_by_21:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 21
+  ret i8 %m
+}
+
+define i8 @test_mul_by_22(i8 %x) {
+; X64-LABEL: test_mul_by_22:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 22
+  ret i8 %m
+}
+
+define i8 @test_mul_by_23(i8 %x) {
+; X64-LABEL: test_mul_by_23:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    shll $3, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 23
+  ret i8 %m
+}
+
+define i8 @test_mul_by_24(i8 %x) {
+; X64-LABEL: test_mul_by_24:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shll $3, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 24
+  ret i8 %m
+}
+
+define i8 @test_mul_by_25(i8 %x) {
+; X64-LABEL: test_mul_by_25:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rax,%rax,4), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 25
+  ret i8 %m
+}
+
+define i8 @test_mul_by_26(i8 %x) {
+; X64-LABEL: test_mul_by_26:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rax,%rax,4), %eax
+; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 26
+  ret i8 %m
+}
+
+define i8 @test_mul_by_27(i8 %x) {
+; X64-LABEL: test_mul_by_27:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 27
+  ret i8 %m
+}
+
+define i8 @test_mul_by_28(i8 %x) {
+; X64-LABEL: test_mul_by_28:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 28
+  ret i8 %m
+}
+
+define i8 @test_mul_by_29(i8 %x) {
+; X64-LABEL: test_mul_by_29:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 29
+  ret i8 %m
+}
+
+define i8 @test_mul_by_30(i8 %x) {
+; X64-LABEL: test_mul_by_30:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $5, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 30
+  ret i8 %m
+}
+
+define i8 @test_mul_by_31(i8 %x) {
+; X64-LABEL: test_mul_by_31:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $5, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 31
+  ret i8 %m
+}
+
+define i8 @test_mul_by_32(i8 %x) {
+; X64-LABEL: test_mul_by_32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb $5, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 32
+  ret i8 %m
+}
+
+define i8 @test_mul_by_37(i8 %x) {
+; X64-LABEL: test_mul_by_37:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 37
+  ret i8 %m
+}
+
+define i8 @test_mul_by_41(i8 %x) {
+; X64-LABEL: test_mul_by_41:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rdi,%rax,8), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 41
+  ret i8 %m
+}
+
+define i8 @test_mul_by_62(i8 %x) {
+; X64-LABEL: test_mul_by_62:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $6, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 62
+  ret i8 %m
+}
+
+define i8 @test_mul_by_66(i8 %x) {
+; X64-LABEL: test_mul_by_66:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $6, %eax
+; X64-NEXT:    leal (%rax,%rdi,2), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 66
+  ret i8 %m
+}
+
+define i8 @test_mul_by_73(i8 %x) {
+; X64-LABEL: test_mul_by_73:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rdi,%rax,8), %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 73
+  ret i8 %m
+}
+
+define i8 @test_mul_by_520(i8 %x) {
+; X64-LABEL: test_mul_by_520:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb $3, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, 520
+  ret i8 %m
+}
+
+define i8 @test_mul_by_neg10(i8 %x) {
+; X64-LABEL: test_mul_by_neg10:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, -10
+  ret i8 %m
+}
+
+define i8 @test_mul_by_neg36(i8 %x) {
+; X64-LABEL: test_mul_by_neg36:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %m = mul i8 %x, -36
+  ret i8 %m
+}
+
diff --git a/test/CodeGen/X86/mulx32.ll b/test/CodeGen/X86/mulx32.ll
index 872e72d503aa369f57d338b6fc64d3198bdbe3b8..faf299f3a2dfaa85d2c3d3feac90f8ef8041918f 100644
--- a/test/CodeGen/X86/mulx32.ll
+++ b/test/CodeGen/X86/mulx32.ll
@@ -5,8 +5,8 @@
 define i64 @f1(i32 %a, i32 %b) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    mulxl {{[0-9]+}}(%esp), %eax, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    mull {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    retl
   %x = zext i32 %a to i64
   %y = zext i32 %b to i64
@@ -17,9 +17,9 @@ define i64 @f1(i32 %a, i32 %b) {
 define i64 @f2(i32 %a, i32* %p) {
 ; CHECK-LABEL: f2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    mulxl (%eax), %eax, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    mull (%ecx)
 ; CHECK-NEXT:    retl
   %b = load i32, i32* %p
   %x = zext i32 %a to i64
diff --git a/test/CodeGen/X86/mulx64.ll b/test/CodeGen/X86/mulx64.ll
index e038f33000937f84a11be09f2edc45b7c2e8c588..38f1d3ea5ab32fc21a2260689b202245b066e417 100644
--- a/test/CodeGen/X86/mulx64.ll
+++ b/test/CodeGen/X86/mulx64.ll
@@ -5,8 +5,8 @@
 define i128 @f1(i64 %a, i64 %b) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rdx
-; CHECK-NEXT:    mulxq %rsi, %rax, %rdx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    mulq %rsi
 ; CHECK-NEXT:    retq
   %x = zext i64 %a to i128
   %y = zext i64 %b to i128
@@ -17,8 +17,8 @@ define i128 @f1(i64 %a, i64 %b) {
 define i128 @f2(i64 %a, i64* %p) {
 ; CHECK-LABEL: f2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rdx
-; CHECK-NEXT:    mulxq (%rsi), %rax, %rdx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    mulq (%rsi)
 ; CHECK-NEXT:    retq
   %b = load i64, i64* %p
   %x = zext i64 %a to i128
diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll
index efc08ca1718d7e92a6bb74511db3befad212d83d..4b9e39e573139771dee33ee462e34c8b5466beea 100644
--- a/test/CodeGen/X86/nontemporal-loads.ll
+++ b/test/CodeGen/X86/nontemporal-loads.ll
@@ -852,12 +852,12 @@ define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovntdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vmovntdqa (%rdi), %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_arg_v8i32:
@@ -932,12 +932,12 @@ define <4 x i64> @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovntdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vmovntdqa (%rdi), %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_arg_v4i64:
@@ -973,12 +973,12 @@ define <16 x i16> @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovntdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vmovntdqa (%rdi), %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_arg_v16i16:
@@ -1014,12 +1014,12 @@ define <32 x i8> @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovntdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vmovntdqa (%rdi), %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_arg_v32i8:
@@ -1114,18 +1114,18 @@ define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v16i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm3
-; AVX1-NEXT:    vmovntdqa (%rdi), %xmm4
-; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpaddd %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovntdqa (%rdi), %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_arg_v16i32:
@@ -1220,18 +1220,18 @@ define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v8i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm3
-; AVX1-NEXT:    vmovntdqa (%rdi), %xmm4
-; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovntdqa (%rdi), %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_arg_v8i64:
@@ -1275,18 +1275,18 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v32i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm3
-; AVX1-NEXT:    vmovntdqa (%rdi), %xmm4
-; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpaddw %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovntdqa (%rdi), %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_arg_v32i16:
@@ -1346,18 +1346,18 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v64i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm3
-; AVX1-NEXT:    vmovntdqa (%rdi), %xmm4
-; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpaddb %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovntdqa (%rdi), %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_arg_v64i8:
diff --git a/test/CodeGen/X86/not-and-simplify.ll b/test/CodeGen/X86/not-and-simplify.ll
index dfd22df1e23eccc8b19b9ada5eea13ccfbe965b9..0e36a75441c71d93caf0ade45ef9b150a2e38d1b 100644
--- a/test/CodeGen/X86/not-and-simplify.ll
+++ b/test/CodeGen/X86/not-and-simplify.ll
@@ -8,8 +8,8 @@ define i32 @shrink_xor_constant1(i32 %x) {
 ; ALL-LABEL: shrink_xor_constant1:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    notl %eax
 ; ALL-NEXT:    shrl $31, %eax
-; ALL-NEXT:    xorl $1, %eax
 ; ALL-NEXT:    retq
   %sh = lshr i32 %x, 31
   %not = xor i32 %sh, -1
@@ -35,8 +35,8 @@ define i8 @shrink_xor_constant2(i8 %x) {
 ; ALL-LABEL: shrink_xor_constant2:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    notb %al
 ; ALL-NEXT:    shlb $5, %al
-; ALL-NEXT:    xorb $-32, %al
 ; ALL-NEXT:    # kill: def $al killed $al killed $eax
 ; ALL-NEXT:    retq
   %sh = shl i8 %x, 5
diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index 8528be3ca59319c653312c81d69fa46b6506c823..28a457f64d4a85995eeaca4471c8b85b0e03d67a 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll
@@ -14,7 +14,7 @@ define void @f1() {
 }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!7, !8}
+!llvm.module.flags = !{!7, !8, !9}
 
 !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: " ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !3, imports: !2)
 !1 = !DIFile(filename: "file.c", directory: "")
@@ -25,4 +25,4 @@ define void @f1() {
 !6 = !DIBasicType(size: 32, align: 32, encoding: DW_ATE_signed)
 !7 = !{i32 2, !"Dwarf Version", i32 3}
 !8 = !{i32 1, !"Debug Info Version", i32 3}
-
+!9 = !{i32 2, !"CodeView", i32 1}
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 9216cad5882782db55ecc752783503cd9564dd95..0d501fe5233c03ca69bad093bc49408541ed82d6 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -1055,26 +1055,25 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ;
 ; AVX1-LABEL: interleave_24i16_out:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm0
-; AVX1-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
-; AVX1-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT:    vmovdqu %xmm3, (%rsi)
 ; AVX1-NEXT:    vmovdqu %xmm4, (%rdx)
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rcx)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: interleave_24i16_out:
@@ -1101,19 +1100,18 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ;
 ; XOP-LABEL: interleave_24i16_out:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqu 32(%rdi), %xmm0
-; XOP-NEXT:    vmovdqu (%rdi), %ymm1
-; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; XOP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; XOP-NEXT:    vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm0[4,5,10,11]
-; XOP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; XOP-NEXT:    vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm0[0,1,6,7,12,13]
-; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[4,5,10,11],xmm2[0,1,6,7,12,13,14,15,0,1,2,3]
-; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7,8,9],xmm0[2,3,8,9,14,15]
+; XOP-NEXT:    vmovdqu (%rdi), %xmm0
+; XOP-NEXT:    vmovdqu 16(%rdi), %xmm1
+; XOP-NEXT:    vmovdqu 32(%rdi), %xmm2
+; XOP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; XOP-NEXT:    vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm2[4,5,10,11]
+; XOP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; XOP-NEXT:    vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm2[0,1,6,7,12,13]
+; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11],xmm1[0,1,6,7,12,13,14,15,0,1,2,3]
+; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9],xmm2[2,3,8,9,14,15]
 ; XOP-NEXT:    vmovdqu %xmm3, (%rsi)
 ; XOP-NEXT:    vmovdqu %xmm4, (%rdx)
 ; XOP-NEXT:    vmovdqu %xmm0, (%rcx)
-; XOP-NEXT:    vzeroupper
 ; XOP-NEXT:    retq
   %wide.vec = load <24 x i16>, <24 x i16>* %p, align 4
   %s1 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -1353,9 +1351,9 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
+; AVX1-NEXT:    vmovups 80(%rdi), %xmm2
+; AVX1-NEXT:    vmovups 64(%rdi), %xmm3
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
@@ -1364,7 +1362,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
+; AVX1-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm2[2],xmm3[3]
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
@@ -1374,7 +1372,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -1451,9 +1449,9 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vmovups (%rdi), %ymm0
 ; XOP-NEXT:    vmovups 32(%rdi), %ymm1
-; XOP-NEXT:    vmovups 64(%rdi), %ymm2
-; XOP-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; XOP-NEXT:    vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
+; XOP-NEXT:    vmovups 80(%rdi), %xmm2
+; XOP-NEXT:    vmovups 64(%rdi), %xmm3
+; XOP-NEXT:    vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[1]
 ; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; XOP-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
 ; XOP-NEXT:    vextractf128 $1, %ymm5, %xmm6
@@ -1462,7 +1460,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; XOP-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
 ; XOP-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
 ; XOP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; XOP-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
+; XOP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm2[2],xmm3[3]
 ; XOP-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
 ; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
 ; XOP-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
@@ -1472,7 +1470,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ; XOP-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
 ; XOP-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
 ; XOP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
-; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
+; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3]
 ; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -1583,35 +1581,36 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ;
 ; AVX1-LABEL: interleave_24i32_in:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovups (%rsi), %ymm0
-; AVX1-NEXT:    vmovups (%rdx), %ymm1
-; AVX1-NEXT:    vmovupd (%rcx), %ymm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = xmm2[0,0]
+; AVX1-NEXT:    vmovupd (%rsi), %ymm0
+; AVX1-NEXT:    vmovupd (%rcx), %ymm1
+; AVX1-NEXT:    vmovups 16(%rcx), %xmm2
+; AVX1-NEXT:    vmovups (%rdx), %xmm3
+; AVX1-NEXT:    vmovups 16(%rdx), %xmm4
+; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; AVX1-NEXT:    vmovups (%rsi), %xmm4
+; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = xmm1[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT:    vmovups %ymm4, 64(%rdi)
 ; AVX1-NEXT:    vmovups %ymm3, (%rdi)
+; AVX1-NEXT:    vmovups %ymm2, 64(%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1620,17 +1619,17 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-SLOW-NEXT:    vmovups (%rsi), %ymm0
 ; AVX2-SLOW-NEXT:    vmovups (%rdx), %ymm1
 ; AVX2-SLOW-NEXT:    vmovups (%rcx), %ymm2
-; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm3 = ymm2[2,1,3,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX2-SLOW-NEXT:    vbroadcastsd %xmm2, %ymm4
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
-; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm4 = mem[1,0,2,2]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
+; AVX2-SLOW-NEXT:    vbroadcastsd (%rcx), %ymm5
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@@ -1638,8 +1637,8 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX2-SLOW-NEXT:    vmovups %ymm4, 64(%rdi)
-; AVX2-SLOW-NEXT:    vmovups %ymm3, (%rdi)
+; AVX2-SLOW-NEXT:    vmovups %ymm4, (%rdi)
+; AVX2-SLOW-NEXT:    vmovups %ymm3, 64(%rdi)
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -1648,58 +1647,59 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-FAST-NEXT:    vmovups (%rsi), %ymm0
 ; AVX2-FAST-NEXT:    vmovups (%rdx), %ymm1
 ; AVX2-FAST-NEXT:    vmovups (%rcx), %ymm2
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2]
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7]
 ; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm3, %ymm3
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX2-FAST-NEXT:    vbroadcastsd %xmm2, %ymm4
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2]
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm4, %ymm4
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT:    vbroadcastsd (%rcx), %ymm5
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7]
-; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm5, %ymm1
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,3,3]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-NEXT:    vmovups %ymm0, 64(%rdi)
-; AVX2-FAST-NEXT:    vmovups %ymm4, 32(%rdi)
-; AVX2-FAST-NEXT:    vmovups %ymm3, (%rdi)
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm4, (%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm3, 64(%rdi)
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
 ; XOP-LABEL: interleave_24i32_in:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovups (%rsi), %ymm0
-; XOP-NEXT:    vmovups (%rdx), %ymm1
-; XOP-NEXT:    vmovupd (%rcx), %ymm2
-; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
-; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
-; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
-; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
-; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOP-NEXT:    vmovddup {{.*#+}} xmm4 = xmm2[0,0]
+; XOP-NEXT:    vmovupd (%rsi), %ymm0
+; XOP-NEXT:    vmovupd (%rcx), %ymm1
+; XOP-NEXT:    vmovups 16(%rcx), %xmm2
+; XOP-NEXT:    vmovups (%rdx), %xmm3
+; XOP-NEXT:    vmovups 16(%rdx), %xmm4
+; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
+; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
+; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
+; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
+; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; XOP-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; XOP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; XOP-NEXT:    vmovups (%rsi), %xmm4
+; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
+; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
+; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
+; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
+; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOP-NEXT:    vmovddup {{.*#+}} xmm4 = xmm1[0,0]
 ; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
 ; XOP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; XOP-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; XOP-NEXT:    vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
-; XOP-NEXT:    vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
-; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
-; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
-; XOP-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; XOP-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
-; XOP-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; XOP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; XOP-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm2[2],ymm0[3],ymm2[2,3],ymm0[4],ymm2[5,4],ymm0[5]
-; XOP-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; XOP-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
+; XOP-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
 ; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; XOP-NEXT:    vmovups %ymm0, 32(%rdi)
-; XOP-NEXT:    vmovups %ymm4, 64(%rdi)
 ; XOP-NEXT:    vmovups %ymm3, (%rdi)
+; XOP-NEXT:    vmovups %ymm2, 64(%rdi)
 ; XOP-NEXT:    vzeroupper
 ; XOP-NEXT:    retq
   %s1 = load <8 x i32>, <8 x i32>* %q1, align 4
diff --git a/test/CodeGen/X86/opt_phis2.mir b/test/CodeGen/X86/opt_phis2.mir
new file mode 100644
index 0000000000000000000000000000000000000000..55523f9ea5ead4b6a473d8abcab23dbb4a59c0e0
--- /dev/null
+++ b/test/CodeGen/X86/opt_phis2.mir
@@ -0,0 +1,72 @@
+# RUN: llc -run-pass opt-phis -march=x86-64 -o - %s | FileCheck %s
+# All PHIs should be removed since they can be securely replaced
+# by %8 register.
+# CHECK-NOT: PHI
+--- |
+  define void @test() {
+    ret void
+  }
+...
+---
+name:            test
+alignment:       4
+tracksRegLiveness: true
+jumpTable:
+  kind:            block-address
+  entries:
+    - id:              0
+      blocks:          [ '%bb.3', '%bb.2', '%bb.1', '%bb.4' ]
+body:             |
+  bb.0:
+    liveins: $edi, $ymm0, $rsi
+
+    %9:gr64 = COPY $rsi
+    %8:vr256 = COPY $ymm0
+    %7:gr32 = COPY $edi
+    %11:gr32 = SAR32ri %7, 31, implicit-def dead $eflags
+    %12:gr32 = SHR32ri %11, 30, implicit-def dead $eflags
+    %13:gr32 = ADD32rr %7, killed %12, implicit-def dead $eflags
+    %14:gr32 = AND32ri8 %13, -4, implicit-def dead $eflags
+    %15:gr32 = SUB32rr %7, %14, implicit-def dead $eflags
+    %10:gr64_nosp = SUBREG_TO_REG 0, %15, %subreg.sub_32bit
+    %16:gr32 = SUB32ri8 %15, 3, implicit-def $eflags
+    JA_1 %bb.8, implicit $eflags
+
+  bb.9:
+    JMP64m $noreg, 8, %10, %jump-table.0, $noreg :: (load 8 from jump-table)
+
+  bb.1:
+    %0:vr256 = COPY %8
+    JMP_1 %bb.5
+
+  bb.2:
+    %1:vr256 = COPY %8
+    JMP_1 %bb.6
+
+  bb.3:
+    %2:vr256 = COPY %8
+    JMP_1 %bb.7
+
+  bb.4:
+    %3:vr256 = COPY %8
+    %17:vr128 = VEXTRACTF128rr %8, 1
+    VPEXTRDmr %9, 1, $noreg, 12, $noreg, killed %17, 2
+
+  bb.5:
+    %4:vr256 = PHI %0, %bb.1, %3, %bb.4
+    %18:vr128 = VEXTRACTF128rr %4, 1
+    VPEXTRDmr %9, 1, $noreg, 8, $noreg, killed %18, 1
+
+  bb.6:
+    %5:vr256 = PHI %1, %bb.2, %4, %bb.5
+    %19:vr128 = VEXTRACTF128rr %5, 1
+    VMOVPDI2DImr %9, 1, $noreg, 4, $noreg, killed %19
+
+  bb.7:
+    %6:vr256 = PHI %2, %bb.3, %5, %bb.6
+    %20:vr128 = COPY %6.sub_xmm
+    VPEXTRDmr %9, 1, $noreg, 0, $noreg, killed %20, 3
+
+  bb.8:
+    RET 0
+...
diff --git a/test/CodeGen/X86/packss.ll b/test/CodeGen/X86/packss.ll
index 3feb0d04f04d530d7979aa73f1c88e6d16492d02..eecfab2c999a639a6403d3df24718c97f2c1e1df 100644
--- a/test/CodeGen/X86/packss.ll
+++ b/test/CodeGen/X86/packss.ll
@@ -156,22 +156,14 @@ define <8 x i16> @trunc_ashr_v4i32_icmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwi
 define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ; X86-SSE-LABEL: trunc_ashr_v4i64_demandedelts:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE-NEXT:    psllq $63, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    psllq $63, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
-; X86-SSE-NEXT:    psrlq $63, %xmm4
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; X86-SSE-NEXT:    movapd {{.*#+}} xmm2 = [4.9406564584124654E-324,-0.0E+0]
-; X86-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X86-SSE-NEXT:    psllq $63, %xmm1
+; X86-SSE-NEXT:    psllq $63, %xmm0
+; X86-SSE-NEXT:    psrlq $63, %xmm0
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4.9406564584124654E-324,-0.0E+0]
+; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    psubq %xmm2, %xmm0
-; X86-SSE-NEXT:    psrlq $63, %xmm3
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; X86-SSE-NEXT:    xorpd %xmm2, %xmm1
+; X86-SSE-NEXT:    psrlq $63, %xmm1
+; X86-SSE-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE-NEXT:    psubq %xmm2, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
@@ -181,10 +173,8 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ; X86-AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    vpsllq $63, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; X86-AVX1-NEXT:    vpsllq $63, %xmm2, %xmm3
-; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; X86-AVX1-NEXT:    vpsrlq $63, %xmm3, %xmm3
 ; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,0,0,0,0,0,32768]
@@ -206,10 +196,10 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0]
 ; X86-AVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
-; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm3
-; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsubq %ymm3, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
@@ -218,22 +208,14 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ;
 ; X64-SSE-LABEL: trunc_ashr_v4i64_demandedelts:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X64-SSE-NEXT:    psllq $63, %xmm2
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm3
-; X64-SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X64-SSE-NEXT:    psllq $63, %xmm2
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm4
-; X64-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
-; X64-SSE-NEXT:    psrlq $63, %xmm4
-; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; X64-SSE-NEXT:    movapd {{.*#+}} xmm2 = [1,9223372036854775808]
-; X64-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X64-SSE-NEXT:    psllq $63, %xmm1
+; X64-SSE-NEXT:    psllq $63, %xmm0
+; X64-SSE-NEXT:    psrlq $63, %xmm0
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,9223372036854775808]
+; X64-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE-NEXT:    psubq %xmm2, %xmm0
-; X64-SSE-NEXT:    psrlq $63, %xmm3
-; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; X64-SSE-NEXT:    xorpd %xmm2, %xmm1
+; X64-SSE-NEXT:    psrlq $63, %xmm1
+; X64-SSE-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE-NEXT:    psubq %xmm2, %xmm1
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
@@ -243,10 +225,8 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ; X64-AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    vpsllq $63, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; X64-AVX1-NEXT:    vpsllq $63, %xmm2, %xmm3
-; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; X64-AVX1-NEXT:    vpsrlq $63, %xmm3, %xmm3
 ; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,9223372036854775808]
@@ -265,7 +245,8 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ;
 ; X64-AVX2-LABEL: trunc_ashr_v4i64_demandedelts:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
+; X64-AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9223372036854775808,1,9223372036854775808]
 ; X64-AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X64-AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
index e8b75ba0d501c2dc3a4822b03b8d0d680762cc6b..abdf37549fd35377f5d154e75fd2235f4977978b 100644
--- a/test/CodeGen/X86/patchpoint.ll
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -103,8 +103,8 @@ define i64 @test_patchpoint_with_attributes() {
 entry:
 ; CHECK-LABEL: test_patchpoint_with_attributes:
 ; CHECK: movl $42, %edi
-; CHECK: xorl %r10d, %r10d
 ; CHECK: movl $17, %esi
+; CHECK: xorl %r10d, %r10d
 ; CHECK: movabsq $_consume_attributes, %r11
 ; CHECK-NEXT: callq *%r11
 ; CHECK-NEXT: xchgw %ax, %ax
diff --git a/test/CodeGen/X86/pmaddubsw.ll b/test/CodeGen/X86/pmaddubsw.ll
index 51dff75981546bffb0e705d67652772fa7319b15..3422429009c8d72735d0bbadcb3d73568e76466a 100644
--- a/test/CodeGen/X86/pmaddubsw.ll
+++ b/test/CodeGen/X86/pmaddubsw.ll
@@ -51,13 +51,11 @@ define <16 x i16> @pmaddubsw_256(<32 x i8>* %Aptr, <32 x i8>* %Bptr) {
 ;
 ; AVX1-LABEL: pmaddubsw_256:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaddubsw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT:    vpmaddubsw 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX256-LABEL: pmaddubsw_256:
@@ -90,61 +88,53 @@ define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) {
 ; SSE-LABEL: pmaddubsw_512:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq %rdi, %rax
-; SSE-NEXT:    movdqa 112(%rdx), %xmm0
-; SSE-NEXT:    movdqa 96(%rdx), %xmm1
-; SSE-NEXT:    movdqa 80(%rdx), %xmm2
-; SSE-NEXT:    movdqa 64(%rdx), %xmm3
-; SSE-NEXT:    movdqa (%rdx), %xmm4
-; SSE-NEXT:    movdqa 16(%rdx), %xmm5
-; SSE-NEXT:    movdqa 32(%rdx), %xmm6
-; SSE-NEXT:    movdqa 48(%rdx), %xmm7
-; SSE-NEXT:    pmaddubsw (%rsi), %xmm4
-; SSE-NEXT:    pmaddubsw 16(%rsi), %xmm5
-; SSE-NEXT:    pmaddubsw 32(%rsi), %xmm6
-; SSE-NEXT:    pmaddubsw 48(%rsi), %xmm7
-; SSE-NEXT:    pmaddubsw 64(%rsi), %xmm3
-; SSE-NEXT:    pmaddubsw 80(%rsi), %xmm2
-; SSE-NEXT:    pmaddubsw 96(%rsi), %xmm1
-; SSE-NEXT:    pmaddubsw 112(%rsi), %xmm0
-; SSE-NEXT:    movdqa %xmm0, 112(%rdi)
-; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
-; SSE-NEXT:    movdqa %xmm2, 80(%rdi)
-; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
-; SSE-NEXT:    movdqa %xmm7, 48(%rdi)
-; SSE-NEXT:    movdqa %xmm6, 32(%rdi)
-; SSE-NEXT:    movdqa %xmm5, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm4, (%rdi)
+; SSE-NEXT:    movdqa (%rdx), %xmm0
+; SSE-NEXT:    movdqa 16(%rdx), %xmm1
+; SSE-NEXT:    movdqa 32(%rdx), %xmm2
+; SSE-NEXT:    movdqa 48(%rdx), %xmm3
+; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
+; SSE-NEXT:    pmaddubsw 16(%rsi), %xmm1
+; SSE-NEXT:    pmaddubsw 32(%rsi), %xmm2
+; SSE-NEXT:    pmaddubsw 48(%rsi), %xmm3
+; SSE-NEXT:    movdqa 64(%rdx), %xmm4
+; SSE-NEXT:    pmaddubsw 64(%rsi), %xmm4
+; SSE-NEXT:    movdqa 80(%rdx), %xmm5
+; SSE-NEXT:    pmaddubsw 80(%rsi), %xmm5
+; SSE-NEXT:    movdqa 96(%rdx), %xmm6
+; SSE-NEXT:    pmaddubsw 96(%rsi), %xmm6
+; SSE-NEXT:    movdqa 112(%rdx), %xmm7
+; SSE-NEXT:    pmaddubsw 112(%rsi), %xmm7
+; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
+; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
+; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
+; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
+; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: pmaddubsw_512:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm8
-; AVX1-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX1-NEXT:    vmovdqa 64(%rsi), %ymm6
-; AVX1-NEXT:    vmovdqa 96(%rsi), %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm7
-; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vpmaddubsw %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
-; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm4
-; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm4
-; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmaddubsw %xmm8, %xmm9, %xmm4
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX1-NEXT:    vmovdqa 48(%rsi), %xmm3
+; AVX1-NEXT:    vpmaddubsw 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpmaddubsw 48(%rdi), %xmm3, %xmm1
+; AVX1-NEXT:    vpmaddubsw 32(%rdi), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmovdqa 80(%rsi), %xmm2
+; AVX1-NEXT:    vpmaddubsw 80(%rdi), %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa 64(%rsi), %xmm3
+; AVX1-NEXT:    vpmaddubsw 64(%rdi), %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vmovdqa 112(%rsi), %xmm3
+; AVX1-NEXT:    vpmaddubsw 112(%rdi), %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa 96(%rsi), %xmm4
+; AVX1-NEXT:    vpmaddubsw 96(%rdi), %xmm4, %xmm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 36f06a64a65400f0e4df4697c5f1fb232f4c1240..2d09877155093ad339f6d4cb2b41a3dc266cb70e 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -10,7 +10,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind  {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
 ; SSE2-NEXT:    pmullw %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; SSE2-NEXT:    pand %xmm3, %xmm1
@@ -24,7 +24,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind  {
 ; SSE41:       # %bb.0: # %entry
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pand %xmm3, %xmm0
@@ -36,28 +36,26 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind  {
 ;
 ; AVX2-LABEL: mul_v16i8c:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: mul_v16i8c:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v16i8c:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -156,7 +154,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind  {
 ; SSE41-LABEL: mul_v16i8:
 ; SSE41:       # %bb.0: # %entry
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm1, %xmm0
@@ -170,31 +168,29 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind  {
 ;
 ; AVX2-LABEL: mul_v16i8:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: mul_v16i8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v16i8:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -378,7 +374,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; SSE2-NEXT:    pand %xmm4, %xmm2
@@ -400,7 +396,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
 ; SSE41:       # %bb.0: # %entry
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
 ; SSE41-NEXT:    pmullw %xmm4, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pand %xmm5, %xmm0
@@ -408,7 +404,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
 ; SSE41-NEXT:    pand %xmm5, %xmm2
 ; SSE41-NEXT:    packuswb %xmm0, %xmm2
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm4, %xmm1
 ; SSE41-NEXT:    pand %xmm5, %xmm1
 ; SSE41-NEXT:    pmullw %xmm4, %xmm3
@@ -420,42 +416,33 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
 ;
 ; AVX2-LABEL: mul_v32i8c:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: mul_v32i8c:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v32i8c:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -579,7 +566,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind  {
 ; SSE41-LABEL: mul_v32i8:
 ; SSE41:       # %bb.0: # %entry
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
@@ -589,9 +576,9 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind  {
 ; SSE41-NEXT:    pand %xmm6, %xmm4
 ; SSE41-NEXT:    packuswb %xmm0, %xmm4
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm3, %xmm1
 ; SSE41-NEXT:    pand %xmm6, %xmm1
 ; SSE41-NEXT:    pmullw %xmm0, %xmm2
@@ -603,47 +590,36 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind  {
 ;
 ; AVX2-LABEL: mul_v32i8:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: mul_v32i8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm2
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm3
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v32i8:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -748,7 +724,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
 ; SSE2-NEXT:    pmullw %xmm4, %xmm6
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
 ; SSE2-NEXT:    pand %xmm5, %xmm6
@@ -787,8 +763,8 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117]
 ; SSE41-NEXT:    pmullw %xmm6, %xmm1
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pand %xmm7, %xmm1
@@ -796,21 +772,21 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; SSE41-NEXT:    pand %xmm7, %xmm0
 ; SSE41-NEXT:    packuswb %xmm1, %xmm0
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm6, %xmm4
 ; SSE41-NEXT:    pand %xmm7, %xmm4
 ; SSE41-NEXT:    pmullw %xmm6, %xmm1
 ; SSE41-NEXT:    pand %xmm7, %xmm1
 ; SSE41-NEXT:    packuswb %xmm4, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm6, %xmm2
 ; SSE41-NEXT:    pand %xmm7, %xmm2
 ; SSE41-NEXT:    pmullw %xmm6, %xmm4
 ; SSE41-NEXT:    pand %xmm7, %xmm4
 ; SSE41-NEXT:    packuswb %xmm2, %xmm4
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm6, %xmm3
 ; SSE41-NEXT:    pand %xmm7, %xmm3
 ; SSE41-NEXT:    pmullw %xmm6, %xmm5
@@ -822,74 +798,55 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ;
 ; AVX2-LABEL: mul_v64i8c:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: mul_v64i8c:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v64i8c:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 entry:
   %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
@@ -951,9 +908,9 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm8
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm4, %xmm1
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pand %xmm9, %xmm1
@@ -961,27 +918,27 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
 ; SSE41-NEXT:    pand %xmm9, %xmm0
 ; SSE41-NEXT:    packuswb %xmm1, %xmm0
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm5, %xmm8
 ; SSE41-NEXT:    pand %xmm9, %xmm8
 ; SSE41-NEXT:    pmullw %xmm4, %xmm1
 ; SSE41-NEXT:    pand %xmm9, %xmm1
 ; SSE41-NEXT:    packuswb %xmm8, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm6, %xmm2
 ; SSE41-NEXT:    pand %xmm9, %xmm2
 ; SSE41-NEXT:    pmullw %xmm5, %xmm4
 ; SSE41-NEXT:    pand %xmm9, %xmm4
 ; SSE41-NEXT:    packuswb %xmm2, %xmm4
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm7, %xmm3
 ; SSE41-NEXT:    pand %xmm9, %xmm3
 ; SSE41-NEXT:    pmullw %xmm2, %xmm5
@@ -993,86 +950,62 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
 ;
 ; AVX2-LABEL: mul_v64i8:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vpmovsxbw %xmm4, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
-; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
-; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
-; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
-; AVX2-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
-; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: mul_v64i8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm4
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
-; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
-; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512F-NEXT:    vpmovsxbw %xmm3, %ymm2
-; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm4
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512F-NEXT:    vpmovsxbw %xmm3, %ymm3
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v64i8:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm2
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm3
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 entry:
   %A = mul <64 x i8> %i, %j
@@ -1318,76 +1251,76 @@ entry:
 define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
 ; SSE2-LABEL: mul_v8i64_sext:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm0, %xmm8
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE2-NEXT:    psrad $16, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
-; SSE2-NEXT:    pmuludq %xmm5, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; SSE2-NEXT:    pmuludq %xmm0, %xmm4
-; SSE2-NEXT:    paddq %xmm3, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pmuludq %xmm5, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    psrad $31, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE2-NEXT:    pmuludq %xmm4, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm6
+; SSE2-NEXT:    paddq %xmm3, %xmm6
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pmuludq %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE2-NEXT:    psrad $16, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE2-NEXT:    psllq $32, %xmm4
-; SSE2-NEXT:    paddq %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm5
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; SSE2-NEXT:    pmuludq %xmm3, %xmm4
-; SSE2-NEXT:    paddq %xmm5, %xmm4
-; SSE2-NEXT:    movdqa %xmm6, %xmm5
-; SSE2-NEXT:    psrad $31, %xmm5
-; SSE2-NEXT:    psrad $16, %xmm6
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; SSE2-NEXT:    psllq $32, %xmm4
-; SSE2-NEXT:    paddq %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT:    pmuludq %xmm2, %xmm5
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; SSE2-NEXT:    pmuludq %xmm6, %xmm4
-; SSE2-NEXT:    paddq %xmm5, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pmuludq %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    psrad $31, %xmm6
-; SSE2-NEXT:    psrad $16, %xmm5
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE2-NEXT:    psllq $32, %xmm4
-; SSE2-NEXT:    paddq %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT:    psllq $32, %xmm6
+; SSE2-NEXT:    paddq %xmm6, %xmm0
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
 ; SSE2-NEXT:    pmuludq %xmm3, %xmm6
-; SSE2-NEXT:    pmuludq %xmm5, %xmm4
-; SSE2-NEXT:    paddq %xmm6, %xmm4
-; SSE2-NEXT:    pmuludq %xmm5, %xmm3
-; SSE2-NEXT:    psllq $32, %xmm4
-; SSE2-NEXT:    paddq %xmm4, %xmm3
+; SSE2-NEXT:    paddq %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT:    psllq $32, %xmm6
+; SSE2-NEXT:    paddq %xmm6, %xmm1
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSE2-NEXT:    pmuludq %xmm7, %xmm6
+; SSE2-NEXT:    paddq %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pmuludq %xmm7, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    psllq $32, %xmm6
+; SSE2-NEXT:    paddq %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSE2-NEXT:    pmuludq %xmm3, %xmm5
+; SSE2-NEXT:    pmuludq %xmm4, %xmm6
+; SSE2-NEXT:    paddq %xmm5, %xmm6
+; SSE2-NEXT:    pmuludq %xmm4, %xmm3
+; SSE2-NEXT:    psllq $32, %xmm6
+; SSE2-NEXT:    paddq %xmm6, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: mul_v8i64_sext:
@@ -1418,9 +1351,9 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
 ; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
 ; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; AVX2-NEXT:    vpmuldq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm2, %ymm1
 ; AVX2-NEXT:    retq
diff --git a/test/CodeGen/X86/pmulh.ll b/test/CodeGen/X86/pmulh.ll
index 9fe8a66b98dd726e35fc970893256593936e4e3f..7068650110e98bef74f75be14d89ce9404a0db57 100644
--- a/test/CodeGen/X86/pmulh.ll
+++ b/test/CodeGen/X86/pmulh.ll
@@ -1,32 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-WIDEN
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-WIDEN
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
 
 define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
-; SSE2-LABEL: mulhuw_v4i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pmulhuw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    retq
+; SSE2-PROMOTE-LABEL: mulhuw_v4i16:
+; SSE2-PROMOTE:       # %bb.0:
+; SSE2-PROMOTE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-PROMOTE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-PROMOTE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-PROMOTE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-PROMOTE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-PROMOTE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-PROMOTE-NEXT:    pmulhuw %xmm1, %xmm0
+; SSE2-PROMOTE-NEXT:    pxor %xmm1, %xmm1
+; SSE2-PROMOTE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-PROMOTE-NEXT:    retq
 ;
-; SSE41-LABEL: mulhuw_v4i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT:    pmulld %xmm1, %xmm0
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    retq
+; SSE2-WIDEN-LABEL: mulhuw_v4i16:
+; SSE2-WIDEN:       # %bb.0:
+; SSE2-WIDEN-NEXT:    pmulhuw %xmm1, %xmm0
+; SSE2-WIDEN-NEXT:    retq
+;
+; SSE41-PROMOTE-LABEL: mulhuw_v4i16:
+; SSE41-PROMOTE:       # %bb.0:
+; SSE41-PROMOTE-NEXT:    pxor %xmm2, %xmm2
+; SSE41-PROMOTE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-PROMOTE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-PROMOTE-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-PROMOTE-NEXT:    psrld $16, %xmm0
+; SSE41-PROMOTE-NEXT:    retq
+;
+; SSE41-WIDEN-LABEL: mulhuw_v4i16:
+; SSE41-WIDEN:       # %bb.0:
+; SSE41-WIDEN-NEXT:    pmulhuw %xmm1, %xmm0
+; SSE41-WIDEN-NEXT:    retq
 ;
 ; AVX-LABEL: mulhuw_v4i16:
 ; AVX:       # %bb.0:
@@ -45,28 +57,38 @@ define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
 }
 
 define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
-; SSE2-LABEL: mulhw_v4i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pmulhw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    retq
+; SSE2-PROMOTE-LABEL: mulhw_v4i16:
+; SSE2-PROMOTE:       # %bb.0:
+; SSE2-PROMOTE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-PROMOTE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-PROMOTE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-PROMOTE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-PROMOTE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-PROMOTE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-PROMOTE-NEXT:    pmulhw %xmm1, %xmm0
+; SSE2-PROMOTE-NEXT:    pxor %xmm1, %xmm1
+; SSE2-PROMOTE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-PROMOTE-NEXT:    retq
+;
+; SSE2-WIDEN-LABEL: mulhw_v4i16:
+; SSE2-WIDEN:       # %bb.0:
+; SSE2-WIDEN-NEXT:    pmulhw %xmm1, %xmm0
+; SSE2-WIDEN-NEXT:    retq
+;
+; SSE41-PROMOTE-LABEL: mulhw_v4i16:
+; SSE41-PROMOTE:       # %bb.0:
+; SSE41-PROMOTE-NEXT:    pslld $16, %xmm0
+; SSE41-PROMOTE-NEXT:    psrad $16, %xmm0
+; SSE41-PROMOTE-NEXT:    pslld $16, %xmm1
+; SSE41-PROMOTE-NEXT:    psrad $16, %xmm1
+; SSE41-PROMOTE-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-PROMOTE-NEXT:    psrld $16, %xmm0
+; SSE41-PROMOTE-NEXT:    retq
 ;
-; SSE41-LABEL: mulhw_v4i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pslld $16, %xmm0
-; SSE41-NEXT:    psrad $16, %xmm0
-; SSE41-NEXT:    pslld $16, %xmm1
-; SSE41-NEXT:    psrad $16, %xmm1
-; SSE41-NEXT:    pmulld %xmm1, %xmm0
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    retq
+; SSE41-WIDEN-LABEL: mulhw_v4i16:
+; SSE41-WIDEN:       # %bb.0:
+; SSE41-WIDEN-NEXT:    pmulhw %xmm1, %xmm0
+; SSE41-WIDEN-NEXT:    retq
 ;
 ; AVX-LABEL: mulhw_v4i16:
 ; AVX:       # %bb.0:
diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll
index 5f70ce26d22f5fb34ef6e9d7e787b0628cdc6c6a..08e6ee0fe85af181158fb6e72894c5aafcf6af5e 100644
--- a/test/CodeGen/X86/popcnt.ll
+++ b/test/CodeGen/X86/popcnt.ll
@@ -25,6 +25,7 @@ define i8 @cnt8(i8 %x) nounwind readnone {
 ;
 ; X64-LABEL: cnt8:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shrb %al
 ; X64-NEXT:    andb $85, %al
@@ -36,8 +37,9 @@ define i8 @cnt8(i8 %x) nounwind readnone {
 ; X64-NEXT:    addb %al, %dil
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    leal (%rax,%rdi), %eax
 ; X64-NEXT:    andb $15, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-POPCNT-LABEL: cnt8:
diff --git a/test/CodeGen/X86/pr22774.ll b/test/CodeGen/X86/pr22774.ll
index acd394a4b43bcb68493f151050f435cccbdaa86f..00fb4539aff55f69273d06fc55dad3d593455487 100644
--- a/test/CodeGen/X86/pr22774.ll
+++ b/test/CodeGen/X86/pr22774.ll
@@ -7,11 +7,9 @@
 define i32 @_Z3foov() {
 ; CHECK-LABEL: _Z3foov:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovdqa {{.*}}(%rip), %ymm0
-; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; CHECK-NEXT:    vmovdqa %xmm0, {{.*}}(%rip)
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
   %0 = load <4 x i64>, <4 x i64>* @in, align 32
diff --git a/test/CodeGen/X86/pr23664.ll b/test/CodeGen/X86/pr23664.ll
index 4b9720ff772ce60d3e148112e2db2567dd8aed30..453e5db2bed61c861482c68b7b967df382dac1cd 100644
--- a/test/CodeGen/X86/pr23664.ll
+++ b/test/CodeGen/X86/pr23664.ll
@@ -1,15 +1,17 @@
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
 
 define i2 @f(i32 %arg) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal (%rdi,%rdi), %eax
+; CHECK-NEXT:    orb $1, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %trunc = trunc i32 %arg to i1
   %sext = sext i1 %trunc to i2
   %or = or i2 %sext, 1
   ret i2 %or
-
-; CHECK-LABEL: f:
-; CHECK:      movl    %edi, %eax
-; CHECK-NEXT: addb    %al, %al
-; CHECK-NEXT: orb     $1, %al
-; CHECK-NEXT: # kill
-; CHECK-NEXT: retq
 }
+
diff --git a/test/CodeGen/X86/pr28444.ll b/test/CodeGen/X86/pr28444.ll
index 452f01c166b7b06f68d7544e41e098d5ac36a84f..23383209e38c15b982f27c5616753eb09634ca90 100644
--- a/test/CodeGen/X86/pr28444.ll
+++ b/test/CodeGen/X86/pr28444.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 < %s | FileCheck %s
 ; https://llvm.org/bugs/show_bug.cgi?id=28444
 
@@ -7,16 +8,18 @@
 ;  i8 = extract_vector_elt v1i1, Constant:i64<0>
 ;  i1 = extract_vector_elt v1i1, Constant:i64<0>
 
-
-; CHECK-LABEL: {{^}}extractelt_mismatch_vector_element_type:
-; CHECK: movb $1, %al
-; CHECK: movb %al
-; CHECK: movb %al
-define void @extractelt_mismatch_vector_element_type(i32 %arg) {
+define void @extractelt_mismatch_vector_element_type(i32 %arg, i1 %x) {
+; CHECK-LABEL: extractelt_mismatch_vector_element_type:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    movb %al, (%rax)
+; CHECK-NEXT:    movb %al, (%rax)
+; CHECK-NEXT:    retq
 bb:
   %tmp = icmp ult i32 %arg, 0
   %tmp2 = insertelement <1 x i1> undef, i1 true, i32 0
-  %tmp3 = select i1 %tmp, <1 x i1> undef, <1 x i1> %tmp2
+  %f = insertelement <1 x i1> undef, i1 %x, i32 0
+  %tmp3 = select i1 %tmp, <1 x i1> %f, <1 x i1> %tmp2
   %tmp6 = extractelement <1 x i1> %tmp3, i32 0
   br label %bb1
 
@@ -25,3 +28,4 @@ bb1:
   store volatile i1 %tmp6, i1* undef
   ret void
 }
+
diff --git a/test/CodeGen/X86/pr30284.ll b/test/CodeGen/X86/pr30284.ll
index c6a688ebdc41b146636de129aebcdc0475f824a9..a9005377213fcad5325b45b858d29d6fde37285a 100644
--- a/test/CodeGen/X86/pr30284.ll
+++ b/test/CodeGen/X86/pr30284.ll
@@ -1,22 +1,38 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=avx512dq | FileCheck %s
 
-define void @f_f___un_3C_unf_3E_un_3C_unf_3E_() {
+define void @undef_cond() {
+; CHECK-LABEL: undef_cond:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retl
+  %a_load22 = load <16 x i64>, <16 x i64>* null, align 1
+  %bitop = or <16 x i64> %a_load22, <i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736>
+  %v.i = load <16 x i64>, <16 x i64>* null
+  %v1.i41 = select <16 x i1> undef, <16 x i64> %bitop, <16 x i64> %v.i
+  store <16 x i64> %v1.i41, <16 x i64>* null
+  ret void
+}
+
+define void @f_f___un_3C_unf_3E_un_3C_unf_3E_(<16 x i1> %x) {
 ; CHECK-LABEL: f_f___un_3C_unf_3E_un_3C_unf_3E_:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT:    vpslld $31, %zmm0, %zmm0
+; CHECK-NEXT:    vpmovd2m %zmm0, %k1
 ; CHECK-NEXT:    vmovapd 0, %zmm0
 ; CHECK-NEXT:    vmovapd 64, %zmm1
 ; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
+; CHECK-NEXT:    kshiftrw $8, %k1, %k2
+; CHECK-NEXT:    vorpd %zmm2, %zmm1, %zmm1 {%k2}
 ; CHECK-NEXT:    vorpd %zmm2, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT:    vorpd %zmm2, %zmm1, %zmm1 {%k1}
-; CHECK-NEXT:    vmovapd %zmm1, 64
 ; CHECK-NEXT:    vmovapd %zmm0, 0
+; CHECK-NEXT:    vmovapd %zmm1, 64
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
   %a_load22 = load <16 x i64>, <16 x i64>* null, align 1
   %bitop = or <16 x i64> %a_load22, <i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736>
   %v.i = load <16 x i64>, <16 x i64>* null
-  %v1.i41 = select <16 x i1> undef, <16 x i64> %bitop, <16 x i64> %v.i
+  %v1.i41 = select <16 x i1> %x, <16 x i64> %bitop, <16 x i64> %v.i
   store <16 x i64> %v1.i41, <16 x i64>* null
   ret void
 }
diff --git a/test/CodeGen/X86/pr30511.ll b/test/CodeGen/X86/pr30511.ll
index 7372980b41e4540b66451e70110727a5f7eda002..69d0a94cc11caef8a35e2305e13f9ee0bb0d03bc 100644
--- a/test/CodeGen/X86/pr30511.ll
+++ b/test/CodeGen/X86/pr30511.ll
@@ -8,7 +8,6 @@ define i64 @PR30511(<2 x double> %a) {
 ; CHECK-LABEL: PR30511:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addpd {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; CHECK-NEXT:    mulpd {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    movq %xmm0, %rax
diff --git a/test/CodeGen/X86/pr31045.ll b/test/CodeGen/X86/pr31045.ll
index c87b4b39a4fa2854cd6350852658aa64cc8a14b6..d706d583f34ad5d227966d42ef1ef020d6d65cf8 100644
--- a/test/CodeGen/X86/pr31045.ll
+++ b/test/CodeGen/X86/pr31045.ll
@@ -19,28 +19,16 @@ define void @_Z1av() local_unnamed_addr #0 {
 ; CHECK-LABEL: _Z1av:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl struct_obj_3+{{.*}}(%rip), %eax
-; CHECK-NEXT:    movsbl {{.*}}(%rip), %ecx
+; CHECK-NEXT:    movzbl {{.*}}(%rip), %ecx
 ; CHECK-NEXT:    movzbl {{.*}}(%rip), %edx
 ; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    leal (%rax,%rax), %esi
-; CHECK-NEXT:    subl %ecx, %esi
-; CHECK-NEXT:    subl %edx, %esi
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    cmovel %eax, %ecx
-; CHECK-NEXT:    movzbl {{.*}}(%rip), %edx
-; CHECK-NEXT:    andl struct_obj_8+{{.*}}(%rip), %ecx
-; CHECK-NEXT:    andl $1, %ecx
-; CHECK-NEXT:    negl %ecx
-; CHECK-NEXT:    andl %edx, %ecx
-; CHECK-NEXT:    negl %ecx
-; CHECK-NEXT:    andl %eax, %ecx
-; CHECK-NEXT:    negl %ecx
-; CHECK-NEXT:    testl %ecx, %esi
-; CHECK-NEXT:    notl %esi
-; CHECK-NEXT:    movzbl %sil, %eax
+; CHECK-NEXT:    addl %eax, %eax
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    subl %edx, %eax
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    movw %ax, struct_obj_12+{{.*}}(%rip)
-; CHECK-NEXT:    setne {{.*}}(%rip)
+; CHECK-NEXT:    movb $0, {{.*}}(%rip)
 ; CHECK-NEXT:    retq
 entry:
   %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363, %struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363* @struct_obj_3, i64 0, i32 0, i32 2) to i32*), align 2
diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll
index 6b08e2fa4139c99e23dd1137d616b056d191bedd..19afc7ea86b37a61e029ad025f4b24a714a0ad9e 100644
--- a/test/CodeGen/X86/pr32282.ll
+++ b/test/CodeGen/X86/pr32282.ll
@@ -52,8 +52,8 @@ define void @foo(i64 %x) nounwind {
 ; X64-NEXT:    idivq %rcx
 ; X64-NEXT:    jmp .LBB0_3
 ; X64-NEXT:  .LBB0_1:
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divl %ecx
 ; X64-NEXT:    # kill: def $eax killed $eax def $rax
 ; X64-NEXT:  .LBB0_3:
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index 3998fcec9c7816dfe40fa3c4a6772166255bf7fe..acbdf0e084b0b8a182b2312005914904370bd61f 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -81,9 +81,8 @@ define void @foo() {
 ; 686-NEXT:    movzbl c, %eax
 ; 686-NEXT:    xorl %ecx, %ecx
 ; 686-NEXT:    testl %eax, %eax
-; 686-NEXT:    setne %cl
-; 686-NEXT:    testb %al, %al
 ; 686-NEXT:    setne {{[0-9]+}}(%esp)
+; 686-NEXT:    setne %cl
 ; 686-NEXT:    xorl %edx, %edx
 ; 686-NEXT:    cmpl %eax, %ecx
 ; 686-NEXT:    setle %dl
diff --git a/test/CodeGen/X86/pr32588.ll b/test/CodeGen/X86/pr32588.ll
index eee1d651c3e8283d8564e34b6494d9a4f85dfbc3..5b875cdc991f284fd676de38a0187928048986db 100644
--- a/test/CodeGen/X86/pr32588.ll
+++ b/test/CodeGen/X86/pr32588.ll
@@ -1,25 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
 
 @c = external local_unnamed_addr global i32, align 4
 @b = external local_unnamed_addr global i32, align 4
 @d = external local_unnamed_addr global i32, align 4
 
-; CHECK: cmpl    $1, c(%rip)
-; CHECK-NEXT: sbbl    %eax, %eax
-; CHECK-NEXT: andl    $1, %eax
-; CHECK-NEXT: movl    %eax, d(%rip)
-; CHECK-NEXT: retq
-
 define void @fn1() {
-entry:
-  %0 = load i32, i32* @c, align 4
-  %tobool1 = icmp eq i32 %0, 0
+; CHECK-LABEL: fn1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpl $1, {{.*}}(%rip)
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    movl %eax, {{.*}}(%rip)
+; CHECK-NEXT:    retq
+  %t0 = load i32, i32* @c, align 4
+  %tobool1 = icmp eq i32 %t0, 0
   %xor = zext i1 %tobool1 to i32
-  %1 = load i32, i32* @b, align 4
-  %tobool2 = icmp ne i32 %1, 0
+  %t1 = load i32, i32* @b, align 4
+  %tobool2 = icmp ne i32 %t1, 0
   %tobool4 = icmp ne i32 undef, 0
-  %2 = and i1 %tobool4, %tobool2
-  %sub = sext i1 %2 to i32
+  %t2 = and i1 %tobool4, %tobool2
+  %sub = sext i1 %t2 to i32
   %div = sdiv i32 %sub, 2
   %add = add nsw i32 %div, %xor
   store i32 %add, i32* @d, align 4
diff --git a/test/CodeGen/X86/pr32610.ll b/test/CodeGen/X86/pr32610.ll
index 1116cf6f1b29ac4c2146063fcf1f0707dea5acde..8585b88139236590de1dc9c68bde74f3c27c6d4d 100644
--- a/test/CodeGen/X86/pr32610.ll
+++ b/test/CodeGen/X86/pr32610.ll
@@ -20,11 +20,11 @@ entry:
   %conv = zext i1 %cmp to i32
   %tobool1.i = icmp ne i32 undef, 0
   %or.cond.i = and i1 %cmp, %tobool1.i
-  %cond.i = select i1 %or.cond.i, i32 %conv, i32 undef
+  %cond.i = select i1 %or.cond.i, i32 %conv, i32 1
   store i32 %cond.i, i32* @c, align 4, !tbaa !1
   %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i32 0, i32 0), align 4
   %tobool = icmp ne i32 %1, 0
-  %2 = select i1 %tobool, i32 %1, i32 undef
+  %2 = select i1 %tobool, i32 %1, i32 2
   store i32 %2, i32* @d, align 4, !tbaa !1
   ret void
 }
diff --git a/test/CodeGen/X86/pr34137.ll b/test/CodeGen/X86/pr34137.ll
index 4e81cb82b5ba0e1cef332e61bff34252d005d1f5..1a85a667b3d8af96188520539078386f397ab48f 100644
--- a/test/CodeGen/X86/pr34137.ll
+++ b/test/CodeGen/X86/pr34137.ll
@@ -11,11 +11,12 @@ define void @pr34127() {
 ; CHECK-NEXT:    movzwl {{.*}}(%rip), %eax
 ; CHECK-NEXT:    movzwl {{.*}}(%rip), %ecx
 ; CHECK-NEXT:    andl %eax, %ecx
-; CHECK-NEXT:    andl %eax, %ecx
-; CHECK-NEXT:    movzwl %cx, %ecx
-; CHECK-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    andl %ecx, %edx
+; CHECK-NEXT:    movzwl %dx, %edx
+; CHECK-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    testw %cx, %cx
+; CHECK-NEXT:    testw %cx, %ax
 ; CHECK-NEXT:    sete %dl
 ; CHECK-NEXT:    andl %eax, %edx
 ; CHECK-NEXT:    movq %rdx, {{.*}}(%rip)
diff --git a/test/CodeGen/X86/pr34605.ll b/test/CodeGen/X86/pr34605.ll
index 95459ce960a01e7a8dd9ea21231421be03172340..4c1a3d7781e25137797f37b3c8ea4083e2bfab20 100644
--- a/test/CodeGen/X86/pr34605.ll
+++ b/test/CodeGen/X86/pr34605.ll
@@ -18,15 +18,15 @@ define void @pr34605(i8* nocapture %s, i32 %p) {
 ; CHECK-NEXT:    kmovd %k1, %k1
 ; CHECK-NEXT:    kandq %k1, %k0, %k1
 ; CHECK-NEXT:    vmovdqu8 {{\.LCPI.*}}, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vmovdqu64 %zmm0, (%eax)
-; CHECK-NEXT:    vmovups %zmm1, 64(%eax)
-; CHECK-NEXT:    vmovups %zmm1, 128(%eax)
-; CHECK-NEXT:    vmovups %zmm1, 192(%eax)
-; CHECK-NEXT:    vmovups %zmm1, 256(%eax)
-; CHECK-NEXT:    vmovups %zmm1, 320(%eax)
-; CHECK-NEXT:    vmovups %zmm1, 384(%eax)
-; CHECK-NEXT:    vmovups %zmm1, 448(%eax)
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, 64(%eax)
+; CHECK-NEXT:    vmovdqu64 %zmm0, 128(%eax)
+; CHECK-NEXT:    vmovdqu64 %zmm0, 192(%eax)
+; CHECK-NEXT:    vmovdqu64 %zmm0, 256(%eax)
+; CHECK-NEXT:    vmovdqu64 %zmm0, 320(%eax)
+; CHECK-NEXT:    vmovdqu64 %zmm0, 384(%eax)
+; CHECK-NEXT:    vmovdqu64 %zmm0, 448(%eax)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
 entry:
diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
index 3578806596f5987cfeb50e28e86d6665136812d6..733249770beca3fbfac005f7e3fbc98f33b42feb 100644
--- a/test/CodeGen/X86/pr34653.ll
+++ b/test/CodeGen/X86/pr34653.ll
@@ -12,68 +12,74 @@ define void @pr34653() {
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    andq $-512, %rsp # imm = 0xFE00
-; CHECK-NEXT:    subq $1536, %rsp # imm = 0x600
+; CHECK-NEXT:    subq $2048, %rsp # imm = 0x800
 ; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; CHECK-NEXT:    callq test
 ; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm1
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vmovaps %xmm3, %xmm4
-; CHECK-NEXT:    vmovaps %xmm2, %xmm5
-; CHECK-NEXT:    vmovaps %xmm5, %xmm6
-; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm7
-; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm8
-; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm9
-; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm10
-; CHECK-NEXT:    vextractf32x4 $3, %zmm10, %xmm11
-; CHECK-NEXT:    vmovaps %xmm11, %xmm12
-; CHECK-NEXT:    vextractf32x4 $2, %zmm10, %xmm13
-; CHECK-NEXT:    vmovaps %xmm13, %xmm14
-; CHECK-NEXT:    vmovaps %xmm10, %xmm15
-; CHECK-NEXT:    vmovaps %zmm15, %zmm16
-; CHECK-NEXT:    vextractf32x4 $3, %zmm9, %xmm2
-; CHECK-NEXT:    vmovaps %zmm2, %zmm17
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vextractf32x4 $2, %zmm9, %xmm0
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm4
+; CHECK-NEXT:    vmovaps %xmm4, %xmm5
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm6
+; CHECK-NEXT:    vmovaps %xmm6, %xmm7
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm8
+; CHECK-NEXT:    vmovaps %xmm8, %xmm9
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm10
+; CHECK-NEXT:    vmovaps %xmm10, %xmm11
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm12
+; CHECK-NEXT:    vmovaps %xmm12, %xmm13
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm14
+; CHECK-NEXT:    vmovaps %xmm14, %xmm15
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm16
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm17
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm18
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps %xmm9, %xmm0
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm19
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vextractf32x4 $3, %zmm8, %xmm0
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm20
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vextractf32x4 $2, %zmm8, %xmm0
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm21
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps %xmm8, %xmm0
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm22
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vextractf32x4 $3, %zmm7, %xmm0
-; CHECK-NEXT:    vmovaps %zmm0, %zmm23
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vextractf32x4 $2, %zmm7, %xmm0
-; CHECK-NEXT:    vmovaps %zmm0, %zmm24
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps %xmm7, %xmm0
-; CHECK-NEXT:    vmovaps %zmm0, %zmm25
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
-; CHECK-NEXT:    # kill: def $ymm10 killed $ymm10 killed $zmm10
-; CHECK-NEXT:    vextractf128 $1, %ymm10, %xmm10
-; CHECK-NEXT:    vmovaps %zmm10, %zmm26
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm23
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm12 = xmm12[1,0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm14 = xmm14[1,0]
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    # kill: def $ymm9 killed $ymm9 killed $zmm9
-; CHECK-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; CHECK-NEXT:    vmovaps %zmm9, %zmm27
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -83,9 +89,6 @@ define void @pr34653() {
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    # kill: def $ymm8 killed $ymm8 killed $zmm8
-; CHECK-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; CHECK-NEXT:    vmovaps %zmm8, %zmm28
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -95,16 +98,30 @@ define void @pr34653() {
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    # kill: def $ymm7 killed $ymm7 killed $zmm7
-; CHECK-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; CHECK-NEXT:    vmovaps %zmm7, %zmm29
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 8-byte Reload
+; CHECK-NEXT:    # xmm24 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload
+; CHECK-NEXT:    # xmm25 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload
+; CHECK-NEXT:    # xmm26 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 8-byte Reload
+; CHECK-NEXT:    # xmm27 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 8-byte Reload
+; CHECK-NEXT:    # xmm28 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 8-byte Reload
+; CHECK-NEXT:    # xmm29 = mem[0],zero
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 8-byte Reload
 ; CHECK-NEXT:    # xmm30 = mem[0],zero
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 8-byte Reload
@@ -131,11 +148,13 @@ define void @pr34653() {
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -146,22 +165,20 @@ define void @pr34653() {
 ; CHECK-NEXT:    vmovsd %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm7, (%rsp) # 8-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
diff --git a/test/CodeGen/X86/pr35636.ll b/test/CodeGen/X86/pr35636.ll
index 07fb37f4b62a8e9b4c082de81cad7db974fc0dde..a97af6a1ac67fceb027f2541ffe79a6b89f8fe12 100644
--- a/test/CodeGen/X86/pr35636.ll
+++ b/test/CodeGen/X86/pr35636.ll
@@ -5,11 +5,11 @@
 define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ; HSW-LABEL: _Z15uint64_to_asciimPc:
 ; HSW:       # %bb.0: # %bb
-; HSW-NEXT:    movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
-; HSW-NEXT:    movq %rdi, %rdx
-; HSW-NEXT:    mulxq %rax, %rax, %rcx
-; HSW-NEXT:    shrq $42, %rcx
-; HSW-NEXT:    imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; HSW-NEXT:    movq %rdi, %rax
+; HSW-NEXT:    movabsq $811296384146066817, %rcx # imm = 0xB424DC35095CD81
+; HSW-NEXT:    mulq %rcx
+; HSW-NEXT:    shrq $42, %rdx
+; HSW-NEXT:    imulq $281474977, %rdx, %rax # imm = 0x10C6F7A1
 ; HSW-NEXT:    shrq $20, %rax
 ; HSW-NEXT:    leal (%rax,%rax,4), %eax
 ; HSW-NEXT:    addl $5, %eax
@@ -22,11 +22,11 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ;
 ; ZN-LABEL: _Z15uint64_to_asciimPc:
 ; ZN:       # %bb.0: # %bb
-; ZN-NEXT:    movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
-; ZN-NEXT:    movq %rdi, %rdx
-; ZN-NEXT:    mulxq %rax, %rax, %rcx
-; ZN-NEXT:    shrq $42, %rcx
-; ZN-NEXT:    imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; ZN-NEXT:    movq %rdi, %rax
+; ZN-NEXT:    movabsq $811296384146066817, %rcx # imm = 0xB424DC35095CD81
+; ZN-NEXT:    mulq %rcx
+; ZN-NEXT:    shrq $42, %rdx
+; ZN-NEXT:    imulq $281474977, %rdx, %rax # imm = 0x10C6F7A1
 ; ZN-NEXT:    shrq $20, %rax
 ; ZN-NEXT:    leal 5(%rax,%rax,4), %eax
 ; ZN-NEXT:    andl $134217727, %eax # imm = 0x7FFFFFF
diff --git a/test/CodeGen/X86/pr35918.ll b/test/CodeGen/X86/pr35918.ll
index 5c84bd946fddccffbc6ed71c034d38822752566b..6ce97061a8fa625b1472d52324d3fa490997721d 100644
--- a/test/CodeGen/X86/pr35918.ll
+++ b/test/CodeGen/X86/pr35918.ll
@@ -15,9 +15,8 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X86-SKYLAKE-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X86-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-SKYLAKE-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-SKYLAKE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; X86-SKYLAKE-NEXT:    vpsrld $7, %xmm0, %xmm0
-; X86-SKYLAKE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-SKYLAKE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X86-SKYLAKE-NEXT:    vmovd %xmm0, %ecx
 ; X86-SKYLAKE-NEXT:    orl $-16777216, %ecx # imm = 0xFF000000
 ; X86-SKYLAKE-NEXT:    movl %ecx, (%eax)
@@ -54,9 +53,8 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X64-SKYLAKE-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X64-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-SKYLAKE-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-SKYLAKE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; X64-SKYLAKE-NEXT:    vpsrld $7, %xmm0, %xmm0
-; X64-SKYLAKE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-SKYLAKE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X64-SKYLAKE-NEXT:    vmovd %xmm0, %eax
 ; X64-SKYLAKE-NEXT:    orl $-16777216, %eax # imm = 0xFF000000
 ; X64-SKYLAKE-NEXT:    movl %eax, (%rdi)
diff --git a/test/CodeGen/X86/pr35972.ll b/test/CodeGen/X86/pr35972.ll
index 09363fbc89bba44f807b856f8f6143de24d66d61..e7e60666d5bcf5196e01fd924ab2dd4ee421a7bc 100644
--- a/test/CodeGen/X86/pr35972.ll
+++ b/test/CodeGen/X86/pr35972.ll
@@ -5,6 +5,7 @@ define void @test3(i32 %c, <64 x i1>* %ptr) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    sbbl %ecx, %ecx
 ; CHECK-NEXT:    kmovd %ecx, %k0
diff --git a/test/CodeGen/X86/pr36199.ll b/test/CodeGen/X86/pr36199.ll
index 9df409aa11bae4b391c1b75993e744a207427e3f..1dc9a3ab94b36e4150d5ea180631eac9e0d3bad9 100644
--- a/test/CodeGen/X86/pr36199.ll
+++ b/test/CodeGen/X86/pr36199.ll
@@ -4,7 +4,7 @@
 define void @foo(<16 x float> %x) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vaddps %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
 ; CHECK-NEXT:    vmovups %zmm0, (%rax)
diff --git a/test/CodeGen/X86/pr36865.ll b/test/CodeGen/X86/pr36865.ll
index 397b09f10de524cf91efd862687e8e5a13f0782e..82b2e86695eac0e9e703191bc6e8a6bf3eb49331 100644
--- a/test/CodeGen/X86/pr36865.ll
+++ b/test/CodeGen/X86/pr36865.ll
@@ -7,8 +7,8 @@ define void @main() {
 ; CHECK-NEXT:    subq $424, %rsp # imm = 0x1A8
 ; CHECK-NEXT:    .cfi_def_cfa_offset 432
 ; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    movl $400, %edx # imm = 0x190
+; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    callq memset
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movl (%rax), %ecx
diff --git a/test/CodeGen/X86/pr37499.ll b/test/CodeGen/X86/pr37499.ll
index c2d73b21546d5d7bdff6c7a51576006468ce1cb3..3fd2ca6b1d84438a30b5390c1059d161e6109b51 100644
--- a/test/CodeGen/X86/pr37499.ll
+++ b/test/CodeGen/X86/pr37499.ll
@@ -1,37 +1,45 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s
 
-define <2 x i64> @foo() {
+define <2 x i64> @undef_tval() {
+; CHECK-LABEL: undef_tval:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    retq
+  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> undef, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i8 1) #3
+  %2 = bitcast <8 x i16> %1 to <2 x i64>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @foo(<8 x i64> %x) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vmovdqa32 %ymm1, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
-; CHECK-NEXT:    vpblendvb %xmm1, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> undef, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i8 1) #3
+  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i8 1) #3
   %2 = bitcast <8 x i16> %1 to <2 x i64>
   ret <2 x i64> %2
 }
 
-define <4 x i64> @goo() {
+define <4 x i64> @goo(<16 x i32> %x) {
 ; CHECK-LABEL: goo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    movw $1, %ax
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vpmovdw %zmm1, %ymm1
-; CHECK-NEXT:    vpblendvb %ymm1, %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
-  %1 = tail call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> undef, <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i16 1) #3
+  %1 = tail call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x, <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i16 1) #3
   %2 = bitcast <16 x i16> %1 to <4 x i64>
   ret <4 x i64> %2
 }
 
+
 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
 declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
diff --git a/test/CodeGen/X86/pr37879.ll b/test/CodeGen/X86/pr37879.ll
index 1dc5e325066054027efda0badbeab1ba61ea0f93..64cf3eaf9ece78b5113a9b1a5d2d2b78217456d6 100644
--- a/test/CodeGen/X86/pr37879.ll
+++ b/test/CodeGen/X86/pr37879.ll
@@ -6,8 +6,6 @@ define double @foo(i32** nocapture readonly) #0 {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movq (%rax), %rax
 ; CHECK-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
-; CHECK-NEXT:    ## kill: def $eax killed $eax killed $rax
-; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
diff --git a/test/CodeGen/X86/pr38795.ll b/test/CodeGen/X86/pr38795.ll
index 6cb2a0859e34817e06f95538af6767dc87b18837..11534a920c2705a459691cd7764a71dfafa98752 100644
--- a/test/CodeGen/X86/pr38795.ll
+++ b/test/CodeGen/X86/pr38795.ll
@@ -24,22 +24,23 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-NEXT:    .cfi_offset %ebp, -8
 ; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    # implicit-def: $esi
+; CHECK-NEXT:    # implicit-def: $ecx
 ; CHECK-NEXT:    # implicit-def: $edi
-; CHECK-NEXT:    # implicit-def: $ah
 ; CHECK-NEXT:    # implicit-def: $al
-; CHECK-NEXT:    # implicit-def: $edx
+; CHECK-NEXT:    # kill: killed $al
+; CHECK-NEXT:    # implicit-def: $dl
+; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_16: # %for.inc
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; CHECK-NEXT:    movb %cl, %al
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, %dl
 ; CHECK-NEXT:  .LBB0_1: # %for.cond
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB0_20 Depth 2
-; CHECK-NEXT:    cmpb $8, %al
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    cmpb $8, %dl
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    ja .LBB0_3
 ; CHECK-NEXT:  # %bb.2: # %for.cond
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
@@ -47,28 +48,27 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    je .LBB0_3
 ; CHECK-NEXT:  # %bb.4: # %if.end
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %edx, %ebp
-; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    cltd
 ; CHECK-NEXT:    idivl a
-; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
+; CHECK-NEXT:    movb %cl, %dh
 ; CHECK-NEXT:    movl $0, h
-; CHECK-NEXT:    cmpb $8, %al
+; CHECK-NEXT:    cmpb $8, %dl
 ; CHECK-NEXT:    jg .LBB0_8
 ; CHECK-NEXT:  # %bb.5: # %if.then13
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movl %eax, %esi
 ; CHECK-NEXT:    movl $.str, (%esp)
-; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    calll printf
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
 ; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    # implicit-def: $eax
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT:    movl %ebp, %edx
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, %dl
 ; CHECK-NEXT:    jne .LBB0_16
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  # %bb.6: # %for.cond35
@@ -91,19 +91,16 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    calll printf
 ; CHECK-NEXT:  .LBB0_19: # %for.end46
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $ch
-; CHECK-NEXT:    # implicit-def: $cl
-; CHECK-NEXT:    # implicit-def: $edx
+; CHECK-NEXT:    # implicit-def: $dl
+; CHECK-NEXT:    # implicit-def: $dh
+; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    jmp .LBB0_20
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_3: # %if.then
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl $.str, (%esp)
-; CHECK-NEXT:    movl %edx, %esi
-; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    calll printf
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; CHECK-NEXT:    movl %esi, %edx
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
 ; CHECK-NEXT:    # implicit-def: $eax
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    jne .LBB0_11
@@ -111,25 +108,15 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_8: # %if.end21
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $edx
+; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    je .LBB0_13
+; CHECK-NEXT:    jmp .LBB0_10
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_10: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $eax
-; CHECK-NEXT:    testb %bl, %bl
-; CHECK-NEXT:    je .LBB0_17
-; CHECK-NEXT:  .LBB0_12: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $edi
-; CHECK-NEXT:    # implicit-def: $ch
-; CHECK-NEXT:    # implicit-def: $cl
-; CHECK-NEXT:    # kill: killed $cl
-; CHECK-NEXT:    # implicit-def: $edx
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    jne .LBB0_11
 ; CHECK-NEXT:  .LBB0_7: # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    xorl %edi, %edi
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-NEXT:    movb %dl, %dh
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_20: # %for.cond47
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
@@ -140,25 +127,38 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    # in Loop: Header=BB0_20 Depth=2
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne .LBB0_20
-; CHECK-NEXT:  # %bb.22: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb %ch, %al
+; CHECK-NEXT:  # %bb.9: # %ae
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne .LBB0_10
 ; CHECK-NEXT:  .LBB0_13: # %if.end26
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testb %dl, %dl
 ; CHECK-NEXT:    je .LBB0_16
 ; CHECK-NEXT:  # %bb.14: # %if.end26
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    testl %ebp, %ebp
 ; CHECK-NEXT:    jne .LBB0_16
 ; CHECK-NEXT:  # %bb.15: # %if.then31
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:    jmp .LBB0_16
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_10: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    # implicit-def: $eax
+; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    je .LBB0_17
+; CHECK-NEXT:  .LBB0_12: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    # implicit-def: $edi
+; CHECK-NEXT:    # implicit-def: $cl
+; CHECK-NEXT:    # kill: killed $cl
+; CHECK-NEXT:    # implicit-def: $dl
+; CHECK-NEXT:    # implicit-def: $ebp
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jne .LBB0_11
+; CHECK-NEXT:    jmp .LBB0_7
 entry:
   br label %for.cond
 
diff --git a/test/CodeGen/X86/pr38865.ll b/test/CodeGen/X86/pr38865.ll
index 5c8689203c72885ab1e1a5712b1b9d642686b6ac..1fa59aeadec32a4fa3f0353c5f82b28062f7b155 100644
--- a/test/CodeGen/X86/pr38865.ll
+++ b/test/CodeGen/X86/pr38865.ll
@@ -15,11 +15,11 @@ define void @e() nounwind {
 ; CHECK-NEXT:    subl $528, %esp # encoding: [0x81,0xec,0x10,0x02,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0x210
 ; CHECK-NEXT:    leal {{[0-9]+}}(%rsp), %ebx # encoding: [0x8d,0x9c,0x24,0x08,0x01,0x00,0x00]
+; CHECK-NEXT:    movl %ebx, %edi # encoding: [0x89,0xdf]
 ; CHECK-NEXT:    movl $c, %esi # encoding: [0xbe,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 1, value: c, kind: FK_Data_4
 ; CHECK-NEXT:    movl $260, %edx # encoding: [0xba,0x04,0x01,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0x104
-; CHECK-NEXT:    movl %ebx, %edi # encoding: [0x89,0xdf]
 ; CHECK-NEXT:    callq memcpy # encoding: [0xe8,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 1, value: memcpy-4, kind: FK_PCRel_4
 ; CHECK-NEXT:    movl $32, %ecx # encoding: [0xb9,0x20,0x00,0x00,0x00]
diff --git a/test/CodeGen/X86/pr39187-g.ll b/test/CodeGen/X86/pr39187-g.ll
new file mode 100644
index 0000000000000000000000000000000000000000..46da64111224411654c93500a5f6e663138cf663
--- /dev/null
+++ b/test/CodeGen/X86/pr39187-g.ll
@@ -0,0 +1,121 @@
+; RUN: opt < %s -S -simplifycfg | FileCheck %s
+
+; SimplifyCFG can hoist any common code in the 'then' and 'else' blocks to
+; the 'if' basic block.
+;
+; For the special case, when hoisting the terminator instruction, its debug
+; location keep references to its basic block, causing the debug information
+; to become ambiguous. It causes the debugger to display unreached lines.
+
+; Change the debug location associated with the hoisted instruction, to
+; the debug location from the insertion point in the 'if' block.
+
+; The insertion point is the previous non-debug instruction before the
+; terminator in the parent basic block of the hoisted instruction.
+
+; IR with '-g':
+;
+;  [...]
+;  %frombool = zext i1 %cmp to i8, !dbg !26
+;  call void @llvm.dbg.value(metadata i8 %frombool, metadata !15, metadata !DIExpression()), !dbg !26
+;  call void @llvm.dbg.value(metadata i32 0, metadata !17, metadata !DIExpression()), !dbg !27
+;  br i1 %cmp, label %if.then, label %if.else
+;  [...]
+;
+; Insertion point is: %frombool = zext i1 %cmp to i8, !dbg !26
+
+; IR generated with:
+; clang -S -g -gno-column-info -O2 -emit-llvm pr39187.cpp -o pr39187-g.ll -mllvm -opt-bisect-limit=10
+
+; // pr39187.cpp
+; int main() {
+;   volatile int foo = 0;
+;
+;   int beards = 0;
+;   bool cond = foo == 4;
+;   int bar = 0;
+;   if (cond)
+;     beards = 8;
+;   else
+;     beards = 4;
+;
+;   volatile bool face = cond;
+;
+;   return face ? beards : 0;
+; }
+
+; CHECK-LABEL: entry
+; CHECK:  %foo = alloca i32, align 4
+; CHECK:  %face = alloca i8, align 1
+; CHECK:  %foo.0..sroa_cast = bitcast i32* %foo to i8*
+; CHECK:  store volatile i32 0, i32* %foo, align 4
+; CHECK:  %foo.0. = load volatile i32, i32* %foo, align 4, !dbg !16
+; CHECK:  %cmp = icmp eq i32 %foo.0., 4, !dbg !16
+; CHECK:  %frombool = zext i1 %cmp to i8, !dbg !16
+; CHECK:  call void @llvm.dbg.value(metadata i8 %frombool, metadata !13, metadata !DIExpression()), !dbg !16
+; CHECK:  call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !DIExpression()), !dbg !17
+; CHECK:  %. = select i1 %cmp, i32 8, i32 4, !dbg !16
+
+; ModuleID = 'pr39187.cpp'
+source_filename = "pr39187.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
+entry:
+  %foo = alloca i32, align 4
+  %face = alloca i8, align 1
+  %foo.0..sroa_cast = bitcast i32* %foo to i8*
+  store volatile i32 0, i32* %foo, align 4
+  %foo.0. = load volatile i32, i32* %foo, align 4, !dbg !26
+  %cmp = icmp eq i32 %foo.0., 4, !dbg !26
+  %frombool = zext i1 %cmp to i8, !dbg !26
+  call void @llvm.dbg.value(metadata i8 %frombool, metadata !15, metadata !DIExpression()), !dbg !26
+  call void @llvm.dbg.value(metadata i32 0, metadata !17, metadata !DIExpression()), !dbg !27
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  call void @llvm.dbg.value(metadata i32 8, metadata !14, metadata !DIExpression()), !dbg !25
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  call void @llvm.dbg.value(metadata i32 4, metadata !14, metadata !DIExpression()), !dbg !25
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %beards.0 = phi i32 [ 8, %if.then ], [ 4, %if.else ]
+  store volatile i8 %frombool, i8* %face, align 1
+  %face.0. = load volatile i8, i8* %face, align 1
+  %0 = and i8 %face.0., 1
+  %tobool3 = icmp eq i8 %0, 0
+  %cond4 = select i1 %tobool3, i32 0, i32 %beards.0
+  ret i32 %cond4
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 346301)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "pr39187.cpp", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 346301)"}
+!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!14, !15, !17}
+!14 = !DILocalVariable(name: "beards", scope: !7, file: !1, line: 4, type: !10)
+!15 = !DILocalVariable(name: "cond", scope: !7, file: !1, line: 5, type: !16)
+!16 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
+!17 = !DILocalVariable(name: "bar", scope: !7, file: !1, line: 6, type: !10)
+!25 = !DILocation(line: 4, scope: !7)
+!26 = !DILocation(line: 5, scope: !7)
+!27 = !DILocation(line: 6, scope: !7)
diff --git a/test/CodeGen/X86/pr39733.ll b/test/CodeGen/X86/pr39733.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4c7153852d22cd38e8d299cd33571ad1b25e14ad
--- /dev/null
+++ b/test/CodeGen/X86/pr39733.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx -O0 | FileCheck %s
+
+; We should not be emitting a sign extend using a %ymm register.
+
+define void @test55() {
+; CHECK-LABEL: test55:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    andq $-32, %rsp
+; CHECK-NEXT:    subq $96, %rsp
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [26680,34632,63774,2423,35015,60307,6240,1951]
+; CHECK-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm1
+; CHECK-NEXT:    # implicit-def: $ymm2
+; CHECK-NEXT:    vmovaps %xmm1, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
+; CHECK-NEXT:    vmovdqa %ymm2, (%rsp)
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %id11762 = alloca <8 x i16>, align 16
+  %.compoundliteral = alloca <8 x i16>, align 16
+  %id11761 = alloca <8 x i32>, align 32
+  store <8 x i16> <i16 26680, i16 -30904, i16 -1762, i16 2423, i16 -30521, i16 -5229, i16 6240, i16 1951>, <8 x i16>* %.compoundliteral, align 16
+  %0 = load <8 x i16>, <8 x i16>* %.compoundliteral, align 16
+  store <8 x i16> %0, <8 x i16>* %id11762, align 16
+  %1 = load <8 x i16>, <8 x i16>* %id11762, align 16
+  %conv = sext <8 x i16> %1 to <8 x i32>
+  store <8 x i32> %conv, <8 x i32>* %id11761, align 32
+  ret void
+}
diff --git a/test/CodeGen/X86/pr39896.ll b/test/CodeGen/X86/pr39896.ll
new file mode 100644
index 0000000000000000000000000000000000000000..210230e3c8584e9541d3438318ab721624c96d26
--- /dev/null
+++ b/test/CodeGen/X86/pr39896.ll
@@ -0,0 +1,54 @@
+; RUN: llc %s -start-after=codegenprepare -stop-after=expand-isel-pseudos -o - | FileCheck %s
+
+; PR39896: When code such as %conv below is dropped by SelectionDAG for having
+; no users, don't just drop the dbg.value record associated with it. Instead,
+; the corresponding variable should receive a "DBG_VALUE" undef to terminate
+; earlier variable live ranges.
+
+; ModuleID = 'run.c'
+source_filename = "run.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+@a = global i8 25, align 1, !dbg !0
+
+define signext i16 @b() !dbg !12 {
+entry:
+; CHECK: DBG_VALUE 23680, $noreg, ![[VARNUM:[0-9]+]],
+  call void @llvm.dbg.value(metadata i16 23680, metadata !17, metadata !DIExpression()), !dbg !18
+  %0 = load i8, i8* @a, align 1, !dbg !18
+  %conv = sext i8 %0 to i16, !dbg !18
+; CHECK: DBG_VALUE $noreg, $noreg, ![[VARNUM]],
+  call void @llvm.dbg.value(metadata i16 %conv, metadata !17, metadata !DIExpression()), !dbg !18
+  %call = call i32 (...) @optimize_me_not(), !dbg !18
+  %1 = load i8, i8* @a, align 1, !dbg !18
+  %conv1 = sext i8 %1 to i16, !dbg !18
+  ret i16 %conv1, !dbg !18
+}
+declare i32 @optimize_me_not(...)
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9, !10}
+!llvm.ident = !{!11}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 8.0.0 (trunk 348101) (llvm/trunk 348109)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: GNU)
+!3 = !DIFile(filename: "run.c", directory: "/Users/dcci/work/llvm/build-debug/bin")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{i32 7, !"PIC Level", i32 2}
+!11 = !{!"clang version something"}
+!12 = distinct !DISubprogram(name: "b", scope: !3, file: !3, line: 2, type: !13, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !16)
+!13 = !DISubroutineType(types: !14)
+!14 = !{!15}
+!15 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+!16 = !{!17}
+!17 = !DILocalVariable(name: "i", scope: !12, file: !3, line: 3, type: !15)
+!18 = !DILocation(line: 3, scope: !12)
diff --git a/test/CodeGen/X86/pr39926.ll b/test/CodeGen/X86/pr39926.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c22e4f2f9a8b400af1597ec9c626812d202f4f5d
--- /dev/null
+++ b/test/CodeGen/X86/pr39926.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx | FileCheck %s
+define i8 @test_offset(i8* %base) {
+; CHECK-LABEL: test_offset:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movb $0, 7(%rdi)
+; CHECK-NEXT:    movw $0, 5(%rdi)
+; CHECK-NEXT:    movl $0, 1(%rdi)
+; CHECK-NEXT:    movl -4(%rdi), %eax
+; CHECK-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb (%rdi), %al
+; CHECK-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl 1(%rdi), %eax
+; CHECK-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movzwl 5(%rdi), %eax
+; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb 7(%rdi), %al
+; CHECK-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl 8(%rdi), %eax
+; CHECK-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %z = alloca [128 x i8], align 16
+  %gep0 = getelementptr inbounds i8, i8* %base, i64 7
+  store volatile i8 0, i8* %gep0
+  %gep1 = getelementptr inbounds i8, i8* %base, i64 5
+  %bc1 = bitcast i8* %gep1 to i16*
+  store volatile i16 0, i16* %bc1
+  %gep2 = getelementptr inbounds i8, i8* %base, i64 1
+  %bc2 = bitcast i8* %gep2 to i32*
+  store volatile i32 0, i32* %bc2
+
+  %y1 = getelementptr inbounds i8, i8* %base, i64 -4
+  %y2 = bitcast [128 x i8]* %z to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %y2, i8* %y1, i64 16, i1 false)
+
+  %gep4 = getelementptr inbounds [128 x i8], [128 x i8]* %z, i64 0, i64 4
+  %ret = load i8, i8* %gep4
+  ret i8 %ret
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
diff --git a/test/CodeGen/X86/pr40090.ll b/test/CodeGen/X86/pr40090.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d1c38e4104d798fa327825f4190417dde46cf570
--- /dev/null
+++ b/test/CodeGen/X86/pr40090.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define i64 @foo(i64 %x, i64 %y) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bsrq %rdi, %rax
+; CHECK-NEXT:    xorq $64, %rax
+; CHECK-NEXT:    bsrq %rsi, %rcx
+; CHECK-NEXT:    cmoveq %rax, %rcx
+; CHECK-NEXT:    movl $63, %eax
+; CHECK-NEXT:    subq %rcx, %rax
+; CHECK-NEXT:    retq
+  %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %2 = xor i64 %1, 127
+  %3 = tail call i64 @llvm.ctlz.i64(i64 %y, i1 true)
+  %4 = xor i64 %3, 63
+  %5 = icmp eq i64 %y, 0
+  %6 = select i1 %5, i64 %2, i64 %4
+  %7 = sub nsw i64 63, %6
+  ret i64 %7
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
diff --git a/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/test/CodeGen/X86/prefer-avx256-lzcnt.ll
index dfec51484153979743f5b2400bce6e9d4763e98a..6cd5cd8f036e2b87a428a8ec2c79b06544916ac2 100644
--- a/test/CodeGen/X86/prefer-avx256-lzcnt.ll
+++ b/test/CodeGen/X86/prefer-avx256-lzcnt.ll
@@ -38,17 +38,15 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
 define <16 x i8> @testv16i8(<16 x i8> %in) {
 ; AVX256-LABEL: testv16i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX256-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX256-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX256-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX256-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX256-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX256-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX256-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX256-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX256-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX256-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX256-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX256-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX256-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX256-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX256-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX256-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX256-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i8:
@@ -93,17 +91,15 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
 define <32 x i8> @testv32i8(<32 x i8> %in) {
 ; AVX256-LABEL: testv32i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX256-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX256-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX256-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX256-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX256-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX256-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX256-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX256-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX256-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX256-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX256-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX256-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX256-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX256-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX256-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX256-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX256-NEXT:    retq
 ;
 ; AVX512-LABEL: testv32i8:
diff --git a/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index 829dd42b8bc23d92ed960ff938b3c7580016e327..b4f8e5b0b6c83ff10f1d5fd4018c7bf6bc9b938d 100644
--- a/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -48,11 +48,11 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
-; AVX256-NEXT:    vpacksswb %xmm0, %xmm1, %xmm1
+; AVX256-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX256-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX256-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX256-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 8bd0f1c98ae96074621a7915368113d28965edc5..92f6e27112660226e6a155cc55aa7d924bc63ddc 100644
--- a/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -21,24 +21,24 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
 ; AVX256VL-NEXT:    vpmovdw %ymm2, %xmm2
 ; AVX256VL-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
 ; AVX256VL-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1]
-; AVX256VL-NEXT:    vpmovsxwd %xmm3, %ymm3
+; AVX256VL-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 ; AVX256VL-NEXT:    vpslld $31, %ymm3, %ymm3
 ; AVX256VL-NEXT:    vptestmd %ymm3, %ymm3, %k1
 ; AVX256VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
 ; AVX256VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,12,13,2,3,14,15,6,7,6,7,14,15,0,1]
 ; AVX256VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7]
-; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX256VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX256VL-NEXT:    vpslld $31, %ymm1, %ymm1
 ; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k0
 ; AVX256VL-NEXT:    kunpckbw %k1, %k0, %k0
 ; AVX256VL-NEXT:    kshiftrw $8, %k0, %k2
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
-; AVX256VL-NEXT:    vpacksswb %xmm0, %xmm1, %xmm1
+; AVX256VL-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX256VL-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX256VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX256VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX256VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX256VL-NEXT:    vzeroupper
 ; AVX256VL-NEXT:    retq
 ;
@@ -158,22 +158,22 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
 ; AVX256VL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,2,1]
 ; AVX256VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
 ; AVX256VL-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm2
+; AVX256VL-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX256VL-NEXT:    vpslld $31, %ymm2, %ymm2
 ; AVX256VL-NEXT:    vptestmd %ymm2, %ymm2, %k1
 ; AVX256VL-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX256VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX256VL-NEXT:    vpslld $31, %ymm1, %ymm1
 ; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k0
 ; AVX256VL-NEXT:    kunpckbw %k1, %k0, %k0
 ; AVX256VL-NEXT:    kshiftrw $8, %k0, %k2
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
-; AVX256VL-NEXT:    vpacksswb %xmm0, %xmm1, %xmm1
+; AVX256VL-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX256VL-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX256VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX256VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX256VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX256VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX256VL-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/prefer-avx256-trunc.ll b/test/CodeGen/X86/prefer-avx256-trunc.ll
index 447e6d1455d4309df0153ba15543edd42d3c356a..3b1ff44d7234a12572347869d8a7a1ad87fe9d96 100644
--- a/test/CodeGen/X86/prefer-avx256-trunc.ll
+++ b/test/CodeGen/X86/prefer-avx256-trunc.ll
@@ -11,17 +11,15 @@
 define <16 x i8> @testv16i16_trunc_v16i8(<16 x i16> %x) {
 ; AVX256NOBW-LABEL: testv16i16_trunc_v16i8:
 ; AVX256NOBW:       # %bb.0:
+; AVX256NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX256NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX256NOBW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX256NOBW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX256NOBW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX256NOBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX256NOBW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX256NOBW-NEXT:    vzeroupper
 ; AVX256NOBW-NEXT:    retq
 ;
 ; AVX512NOBW-LABEL: testv16i16_trunc_v16i8:
 ; AVX512NOBW:       # %bb.0:
-; AVX512NOBW-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512NOBW-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512NOBW-NEXT:    vzeroupper
 ; AVX512NOBW-NEXT:    retq
diff --git a/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index 7930467604b3500baa0bfa291988fc49e55d4c7c..443cce87f702fc4d88931cad51ba7d6937f7af27 100644
--- a/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -7,16 +7,15 @@
 define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
 ; AVX256BW-LABEL: test_div7_32i8:
 ; AVX256BW:       # %bb.0:
-; AVX256BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX256BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX256BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX256BW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX256BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX256BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX256BW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX256BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX256BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX256BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX256BW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX256BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX256BW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
-; AVX256BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX256BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX256BW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX256BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX256BW-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX256BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX256BW-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX256BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -41,3 +40,29 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
   %res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
   ret <32 x i8> %res
 }
+
+define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) {
+; AVX256BW-LABEL: test_mul_32i8:
+; AVX256BW:       # %bb.0:
+; AVX256BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX256BW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX256BW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX256BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX256BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX256BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX256BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX256BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX256BW-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX256BW-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX256BW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_mul_32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %res = mul <32 x i8> %a, %b
+  ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/promote-vec3.ll b/test/CodeGen/X86/promote-vec3.ll
index 29832a23104be64c57cfe279715a8b01927c0039..db337b2f55723a6c0d7edcc1a6d56106a9636b5c 100644
--- a/test/CodeGen/X86/promote-vec3.ll
+++ b/test/CodeGen/X86/promote-vec3.ll
@@ -78,11 +78,9 @@ define <3 x i16> @sext_i8(<3 x i8>) {
 ; SSE3-NEXT:    pinsrw $2, %eax, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
 ; SSE3-NEXT:    psraw $8, %xmm0
-; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE3-NEXT:    psrad $16, %xmm0
-; SSE3-NEXT:    movd %xmm0, %eax
-; SSE3-NEXT:    pextrw $2, %xmm0, %edx
-; SSE3-NEXT:    pextrw $4, %xmm0, %ecx
+; SSE3-NEXT:    pextrw $0, %xmm0, %eax
+; SSE3-NEXT:    pextrw $1, %xmm0, %edx
+; SSE3-NEXT:    pextrw $2, %xmm0, %ecx
 ; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE3-NEXT:    # kill: def $dx killed $dx killed $edx
 ; SSE3-NEXT:    # kill: def $cx killed $cx killed $ecx
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index 6e2e97980c7bd5c987e8c5a4798853b98698cc13..2fc70291b0a090479fb3e954786adddb92a324c0 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -824,12 +824,12 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpandn %xmm0, %xmm4, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -936,19 +936,16 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -957,14 +954,11 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -1067,22 +1061,19 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm1, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm5, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1091,14 +1082,11 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -1681,64 +1669,57 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ;
 ; SSE41-LABEL: psubus_8i64_max:
 ; SSE41:       # %bb.0: # %vector.ph
-; SSE41-NEXT:    movdqa %xmm0, %xmm10
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT:    movdqa %xmm8, %xmm7
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT:    movdqa %xmm9, %xmm7
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm9, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [65535,65535]
-; SSE41-NEXT:    movapd %xmm7, %xmm11
-; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm11
+; SSE41-NEXT:    movapd %xmm7, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm5
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm8, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm9, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
-; SSE41-NEXT:    packusdw %xmm11, %xmm4
+; SSE41-NEXT:    packusdw %xmm5, %xmm4
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm8, %xmm3
+; SSE41-NEXT:    movdqa %xmm9, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm9, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    pxor %xmm1, %xmm6
-; SSE41-NEXT:    movdqa %xmm8, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm2, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT:    packusdw %xmm3, %xmm7
 ; SSE41-NEXT:    packusdw %xmm4, %xmm7
-; SSE41-NEXT:    psubusw %xmm7, %xmm10
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
-; SSE41-NEXT:    packusdw %xmm10, %xmm0
+; SSE41-NEXT:    psubusw %xmm7, %xmm8
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: psubus_8i64_max:
@@ -2430,3 +2411,317 @@ define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) {
   ret void
 }
 
+define <16 x i8> @test19(<16 x i8> %x) {
+; SSE-LABEL: test19:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test19:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %0 = icmp ugt <16 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
+  %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
+  %2 = add <16 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @test20(<16 x i8> %x) {
+; SSE-LABEL: test20:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test20:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %0 = icmp ugt <16 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
+  %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
+  %2 = add <16 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test21(<8 x i16> %x) {
+; SSE-LABEL: test21:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test21:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %0 = icmp ugt <8 x i16> %x, <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
+  %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
+  %2 = add <8 x i16> %1, <i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @test22(<8 x i16> %x) {
+; SSE-LABEL: test22:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test22:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
+  %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
+  %2 = add <8 x i16> %1, <i16 -1, i16 22000, i16 770, i16 -98, i16 -19, i16 -1000, i16 -3456, i16 -70>
+  ret <8 x i16> %2
+}
+
+define <32 x i8> @test23(<32 x i8> %x) {
+; SSE-LABEL: test23:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
+; SSE-NEXT:    psubusb %xmm2, %xmm0
+; SSE-NEXT:    psubusb %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test23:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
+; AVX1-NEXT:    vpsubusb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test23:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test23:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = icmp ugt <32 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
+  %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
+  %2 = add <32 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
+  ret <32 x i8> %2
+}
+
+define <32 x i8> @test24(<32 x i8> %x) {
+; SSE-LABEL: test24:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test24:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test24:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test24:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = icmp ugt <32 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
+  %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
+  %2 = add <32 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
+  ret <32 x i8> %2
+}
+
+define <16 x i16> @test25(<16 x i16> %x) {
+; SSE-LABEL: test25:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
+; SSE-NEXT:    psubusw %xmm2, %xmm0
+; SSE-NEXT:    psubusw %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test25:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test25:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test25:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = icmp ugt <16 x i16> %x, <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
+  %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
+  %2 = add <16 x i16> %1, <i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000>
+  ret <16 x i16> %2
+}
+
+define <16 x i16> @test26(<16 x i16> %x) {
+; SSE-LABEL: test26:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test26:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test26:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test26:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = icmp ugt <16 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
+  %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
+  %2 = add <16 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70>
+  ret <16 x i16> %2
+}
+
+define <64 x i8> @test27(<64 x i8> %x) {
+; SSE-LABEL: test27:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
+; SSE-NEXT:    psubusb %xmm4, %xmm0
+; SSE-NEXT:    psubusb %xmm4, %xmm1
+; SSE-NEXT:    psubusb %xmm4, %xmm2
+; SSE-NEXT:    psubusb %xmm4, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test27:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
+; AVX1-NEXT:    vpsubusb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubusb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsubusb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubusb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test27:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
+; AVX2-NEXT:    vpsubusb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubusb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test27:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = icmp ugt <64 x i8> %x, <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
+  %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
+  %2 = add <64 x i8> %1, <i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154>
+  ret <64 x i8> %2
+}
+
+define <64 x i8> @test28(<64 x i8> %x) {
+; SSE-LABEL: test28:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
+; SSE-NEXT:    psubusb %xmm4, %xmm0
+; SSE-NEXT:    psubusb %xmm4, %xmm2
+; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test28:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
+; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpsubusb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsubusb {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test28:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test28:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = icmp ugt <64 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
+  %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
+  %2 = add <64 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70, i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 116, i8 77, i8 70, i8 -123, i8 -98, i8 -67, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
+  ret <64 x i8> %2
+}
+
+define <32 x i16> @test29(<32 x i16> %x) {
+; SSE-LABEL: test29:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm2
+; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test29:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test29:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test29:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = icmp ugt <32 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
+  %1 = select <32 x i1> %0, <32 x i16> %x, <32 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
+  %2 = add <32 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70, i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9805, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -346, i16 -55, i16 -70>
+  ret <32 x i16> %2
+}
diff --git a/test/CodeGen/X86/reduce-trunc-shl.ll b/test/CodeGen/X86/reduce-trunc-shl.ll
index 4f2babe09109c0d5532b6d492bbba5db37a3a278..c14acc5e35c196ff06ee6692096883e683726ae4 100644
--- a/test/CodeGen/X86/reduce-trunc-shl.ll
+++ b/test/CodeGen/X86/reduce-trunc-shl.ll
@@ -72,11 +72,7 @@ define <8 x i16> @trunc_shl_17_v8i16_v8i32(<8 x i32> %a) {
 ;
 ; AVX2-LABEL: trunc_shl_17_v8i16_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpslld $17, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %shl = shl <8 x i32> %a, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
   %conv = trunc <8 x i32> %shl to <8 x i16>
diff --git a/test/CodeGen/X86/rot16.ll b/test/CodeGen/X86/rot16.ll
index 3b2a01b72bab4c4e354dcbad6313e63eba434d89..69ea7013cc78ef90d0fa6ffdc8bd3aa25b760d2a 100644
--- a/test/CodeGen/X86/rot16.ll
+++ b/test/CodeGen/X86/rot16.ll
@@ -15,7 +15,7 @@ define i16 @foo(i16 %x, i16 %y, i16 %z) nounwind {
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shldw %cl, %ax, %ax
+; X64-NEXT:    rolw %cl, %ax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = shl i16 %x, %z
@@ -62,7 +62,7 @@ define i16 @un(i16 %x, i16 %y, i16 %z) nounwind {
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrdw %cl, %ax, %ax
+; X64-NEXT:    rorw %cl, %ax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = lshr i16 %x, %z
diff --git a/test/CodeGen/X86/rotate-extract-vector.ll b/test/CodeGen/X86/rotate-extract-vector.ll
index e2679dded8b5c4c4d0ea46504a7125285e91aebe..6301f3bf747c7c63c2e29436fff3ca6a13898bf0 100644
--- a/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/test/CodeGen/X86/rotate-extract-vector.ll
@@ -221,12 +221,10 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovd %eax, %xmm0
-; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 7f287eb4de9f92abfaee68a5fdba1a6e28fee5c6..0d92e267f98f33e76a521fec5b2bd478757205d7 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -653,29 +653,29 @@ define i64 @truncated_rot(i64 %x, i32 %amt) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    jne .LBB28_2
 ; X86-NEXT:  # %bb.1: # %entry
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:  .LBB28_2: # %entry
-; X86-NEXT:    movl $64, %edx
-; X86-NEXT:    subl %ecx, %edx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movb $64, %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NEXT:    testb $32, %dl
 ; X86-NEXT:    jne .LBB28_4
 ; X86-NEXT:  # %bb.3: # %entry
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:  .LBB28_4: # %entry
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/test/CodeGen/X86/rotate4.ll b/test/CodeGen/X86/rotate4.ll
index 5347fecfee1f1df4e1bd8519b5307ad6509020f5..91edff1a73982555d238a4869cf33e0503c1ac97 100644
--- a/test/CodeGen/X86/rotate4.ll
+++ b/test/CodeGen/X86/rotate4.ll
@@ -65,9 +65,9 @@ define i64 @rotate_left_64(i64 %a, i64 %b) {
 ; X86-NEXT:    .cfi_offset %esi, -16
 ; X86-NEXT:    .cfi_offset %edi, -12
 ; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    movl %edi, %edx
@@ -78,7 +78,7 @@ define i64 @rotate_left_64(i64 %a, i64 %b) {
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB2_2:
-; X86-NEXT:    negl %ecx
+; X86-NEXT:    negb %cl
 ; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    shrl %cl, %ebx
 ; X86-NEXT:    shrdl %cl, %edi, %esi
@@ -126,9 +126,9 @@ define i64 @rotate_right_64(i64 %a, i64 %b) {
 ; X86-NEXT:    .cfi_offset %esi, -16
 ; X86-NEXT:    .cfi_offset %edi, -12
 ; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    movl %edi, %eax
@@ -139,7 +139,7 @@ define i64 @rotate_right_64(i64 %a, i64 %b) {
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:  .LBB3_2:
-; X86-NEXT:    negl %ecx
+; X86-NEXT:    negb %cl
 ; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    shll %cl, %ebx
 ; X86-NEXT:    shldl %cl, %edi, %esi
@@ -242,7 +242,7 @@ define void @rotate_left_m64(i64 *%pa, i64 %b) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    movl 4(%eax), %ebx
@@ -256,7 +256,7 @@ define void @rotate_left_m64(i64 *%pa, i64 %b) {
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:  .LBB6_2:
-; X86-NEXT:    negl %ecx
+; X86-NEXT:    negb %cl
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    shrl %cl, %ebp
 ; X86-NEXT:    shrdl %cl, %ebx, %edx
@@ -312,33 +312,33 @@ define void @rotate_right_m64(i64 *%pa, i64 %b) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %ebx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    shrdl %cl, %esi, %edi
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    je .LBB7_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:  .LBB7_2:
-; X86-NEXT:    negl %ecx
+; X86-NEXT:    negb %cl
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    shldl %cl, %ebx, %edx
+; X86-NEXT:    shldl %cl, %ebx, %esi
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    je .LBB7_4
 ; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    xorl %ebp, %ebp
 ; X86-NEXT:  .LBB7_4:
-; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %ebp, %edi
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
@@ -642,9 +642,9 @@ define i32 @rotate_demanded_bits_3(i32, i32) {
 ;
 ; X64-LABEL: rotate_demanded_bits_3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    leal (%rsi,%rsi), %ecx
 ; X64-NEXT:    andb $30, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    roll %cl, %eax
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 51ac1d5caeac74919d917831c39b739b0b453c77..f64364340658cf0a3ca19fb4b54964f9c90748a2 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -293,25 +293,21 @@ define i32 @sad_32i8() nounwind {
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB1_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vmovdqa a+1024(%rax), %ymm2
-; AVX1-NEXT:    vmovdqa b+1024(%rax), %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpsadbw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vmovdqa a+1040(%rax), %xmm2
+; AVX1-NEXT:    vpsadbw b+1040(%rax), %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa a+1024(%rax), %xmm3
+; AVX1-NEXT:    vpsadbw b+1024(%rax), %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $4, %rax
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm4, %xmm4, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
@@ -1344,17 +1340,14 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
 ;
 ; AVX1-LABEL: sad_nonloop_32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqu (%rdx), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vpsadbw 16(%rdx), %xmm1, %xmm1
+; AVX1-NEXT:    vpsadbw (%rdx), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: sad_nonloop_32i8:
@@ -1364,7 +1357,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1376,7 +1369,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/test/CodeGen/X86/sadd_sat_vec.ll b/test/CodeGen/X86/sadd_sat_vec.ll
new file mode 100644
index 0000000000000000000000000000000000000000..421dc6c29f52046591799bfc4d9e3b5abc45efe6
--- /dev/null
+++ b/test/CodeGen/X86/sadd_sat_vec.ll
@@ -0,0 +1,1380 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8>, <12 x i8>)
+declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>)
+declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
+
+declare <1 x i16> @llvm.sadd.sat.v1i16(<1 x i16>, <1 x i16>)
+declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
+declare <12 x i16> @llvm.sadd.sat.v12i16(<12 x i16>, <12 x i16>)
+declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)
+
+declare <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1>, <16 x i1>)
+declare <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4>, <16 x i4>)
+
+declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i24> @llvm.sadd.sat.v4i24(<4 x i24>, <4 x i24>)
+declare <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128>, <2 x i128>)
+
+; Legal types, depending on architecture.
+
+define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE-LABEL: v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddsb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
+  ret <16 x i8> %z
+}
+
+define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
+; SSE-LABEL: v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddsb %xmm2, %xmm0
+; SSE-NEXT:    paddsb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
+  ret <32 x i8> %z
+}
+
+define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
+; SSE-LABEL: v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddsb %xmm4, %xmm0
+; SSE-NEXT:    paddsb %xmm5, %xmm1
+; SSE-NEXT:    paddsb %xmm6, %xmm2
+; SSE-NEXT:    paddsb %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddsb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddsb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddsb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddsb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddsb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddsb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddsb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
+  ret <64 x i8> %z
+}
+
+define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE-LABEL: v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddsw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
+  ret <8 x i16> %z
+}
+
+define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
+; SSE-LABEL: v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddsw %xmm2, %xmm0
+; SSE-NEXT:    paddsw %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
+  ret <16 x i16> %z
+}
+
+define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
+; SSE-LABEL: v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddsw %xmm4, %xmm0
+; SSE-NEXT:    paddsw %xmm5, %xmm1
+; SSE-NEXT:    paddsw %xmm6, %xmm2
+; SSE-NEXT:    paddsw %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddsw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddsw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddsw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddsw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
+  ret <32 x i16> %z
+}
+
+; Too narrow vectors, legalized by widening.
+
+define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
+; SSE-LABEL: v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    paddsb %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmovwb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <8 x i8>, <8 x i8>* %px
+  %y = load <8 x i8>, <8 x i8>* %py
+  %z = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y)
+  store <8 x i8> %z, <8 x i8>* %pz
+  ret void
+}
+
+define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
+; SSE-LABEL: v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    paddsb %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT:    vpmovdb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <4 x i8>, <4 x i8>* %px
+  %y = load <4 x i8>, <4 x i8>* %py
+  %z = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
+  store <4 x i8> %z, <4 x i8>* %pz
+  ret void
+}
+
+define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
+; SSE2-LABEL: v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl (%rsi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    paddsb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movw %ax, (%rdx)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v2i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movzwl (%rsi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    paddsb %xmm0, %xmm1
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    movw %ax, (%rdx)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    movzwl (%rsi), %eax
+; SSE41-NEXT:    movd %eax, %xmm1
+; SSE41-NEXT:    paddsb %xmm0, %xmm1
+; SSE41-NEXT:    pextrw $0, %xmm1, (%rdx)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    movzwl (%rsi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl (%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    movzwl (%rsi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movzwl (%rdi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    movzwl (%rsi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm1
+; AVX512-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmovqb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <2 x i8>, <2 x i8>* %px
+  %y = load <2 x i8>, <2 x i8>* %py
+  %z = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %x, <2 x i8> %y)
+  store <2 x i8> %z, <2 x i8>* %pz
+  ret void
+}
+
+define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
+; SSE-LABEL: v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    paddsw %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpmovdw %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <4 x i16>, <4 x i16>* %px
+  %y = load <4 x i16>, <4 x i16>* %py
+  %z = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %x, <4 x i16> %y)
+  store <4 x i16> %z, <4 x i16>* %pz
+  ret void
+}
+
+define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
+; SSE-LABEL: v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    paddsw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vpmovqw %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <2 x i16>, <2 x i16>* %px
+  %y = load <2 x i16>, <2 x i16>* %py
+  %z = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
+  store <2 x i16> %z, <2 x i16>* %pz
+  ret void
+}
+
+define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
+; SSE-LABEL: v12i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddsb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v12i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
+  ret <12 x i8> %z
+}
+
+define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind {
+; SSE-LABEL: v12i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm0
+; SSE-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE-NEXT:    paddsw (%rsi), %xmm0
+; SSE-NEXT:    paddsw 16(%rsi), %xmm1
+; SSE-NEXT:    movq %xmm1, 16(%rdx)
+; SSE-NEXT:    movdqa %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v12i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vpaddsw (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddsw 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v12i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpaddsw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v12i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-NEXT:    vpaddsw (%rsi), %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x = load <12 x i16>, <12 x i16>* %px
+  %y = load <12 x i16>, <12 x i16>* %py
+  %z = call <12 x i16> @llvm.sadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
+  store <12 x i16> %z, <12 x i16>* %pz
+  ret void
+}
+
+; Scalarization
+
+define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
+; SSE-LABEL: v1i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movb (%rdi), %cl
+; SSE-NEXT:    movb (%rsi), %dil
+; SSE-NEXT:    movl %ecx, %eax
+; SSE-NEXT:    addb %dil, %al
+; SSE-NEXT:    setns %sil
+; SSE-NEXT:    addb %dil, %cl
+; SSE-NEXT:    jno .LBB13_2
+; SSE-NEXT:  # %bb.1:
+; SSE-NEXT:    addb $127, %sil
+; SSE-NEXT:    movl %esi, %ecx
+; SSE-NEXT:  .LBB13_2:
+; SSE-NEXT:    movb %cl, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v1i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movb (%rdi), %cl
+; AVX-NEXT:    movb (%rsi), %dil
+; AVX-NEXT:    movl %ecx, %eax
+; AVX-NEXT:    addb %dil, %al
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addb %dil, %cl
+; AVX-NEXT:    jno .LBB13_2
+; AVX-NEXT:  # %bb.1:
+; AVX-NEXT:    addb $127, %sil
+; AVX-NEXT:    movl %esi, %ecx
+; AVX-NEXT:  .LBB13_2:
+; AVX-NEXT:    movb %cl, (%rdx)
+; AVX-NEXT:    retq
+  %x = load <1 x i8>, <1 x i8>* %px
+  %y = load <1 x i8>, <1 x i8>* %py
+  %z = call <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8> %x, <1 x i8> %y)
+  store <1 x i8> %z, <1 x i8>* %pz
+  ret void
+}
+
+define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
+; SSE-LABEL: v1i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movzwl (%rdi), %eax
+; SSE-NEXT:    movzwl (%rsi), %ecx
+; SSE-NEXT:    xorl %esi, %esi
+; SSE-NEXT:    movl %eax, %edi
+; SSE-NEXT:    addw %cx, %di
+; SSE-NEXT:    setns %sil
+; SSE-NEXT:    addl $32767, %esi # imm = 0x7FFF
+; SSE-NEXT:    addw %cx, %ax
+; SSE-NEXT:    cmovol %esi, %eax
+; SSE-NEXT:    movw %ax, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v1i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    movzwl (%rsi), %ecx
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %eax, %edi
+; AVX-NEXT:    addw %cx, %di
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $32767, %esi # imm = 0x7FFF
+; AVX-NEXT:    addw %cx, %ax
+; AVX-NEXT:    cmovol %esi, %eax
+; AVX-NEXT:    movw %ax, (%rdx)
+; AVX-NEXT:    retq
+  %x = load <1 x i16>, <1 x i16>* %px
+  %y = load <1 x i16>, <1 x i16>* %py
+  %z = call <1 x i16> @llvm.sadd.sat.v1i16(<1 x i16> %x, <1 x i16> %y)
+  store <1 x i16> %z, <1 x i16>* %pz
+  ret void
+}
+
+; Promotion
+
+define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
+; SSE-LABEL: v16i4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $4, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psllw $4, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    paddsb %xmm1, %xmm0
+; SSE-NEXT:    psrlw $4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v16i4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
+  ret <16 x i4> %z
+}
+
+define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
+; SSE-LABEL: v16i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $7, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    paddsb %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v16i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovb2m %xmm1, %k0
+; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovb2m %xmm0, %k1
+; AVX512-NEXT:    korw %k0, %k1, %k0
+; AVX512-NEXT:    vpmovm2b %k0, %xmm0
+; AVX512-NEXT:    retq
+  %z = call <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
+  ret <16 x i1> %z
+}
+
+; Expanded
+
+define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; SSE2-LABEL: v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %r8d
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    movl %r8d, %esi
+; SSE2-NEXT:    addl %ecx, %esi
+; SSE2-NEXT:    setns %dl
+; SSE2-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSE2-NEXT:    addl %ecx, %r8d
+; SSE2-NEXT:    cmovol %edx, %r8d
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    xorl %esi, %esi
+; SSE2-NEXT:    movl %ecx, %edi
+; SSE2-NEXT:    addl %edx, %edi
+; SSE2-NEXT:    setns %sil
+; SSE2-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    addl %edx, %ecx
+; SSE2-NEXT:    cmovol %esi, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %edx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movl %eax, %esi
+; SSE2-NEXT:    addl %edx, %esi
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    addl %edx, %eax
+; SSE2-NEXT:    cmovol %edi, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %r9d
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movd %xmm0, %edx
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movl %edx, %esi
+; SSE2-NEXT:    addl %r9d, %esi
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    addl %r9d, %edx
+; SSE2-NEXT:    cmovol %edi, %edx
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movd %r8d, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %r8d
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    movl %r8d, %esi
+; SSSE3-NEXT:    addl %ecx, %esi
+; SSSE3-NEXT:    setns %dl
+; SSSE3-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    addl %ecx, %r8d
+; SSSE3-NEXT:    cmovol %edx, %r8d
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movd %xmm0, %ecx
+; SSSE3-NEXT:    xorl %esi, %esi
+; SSSE3-NEXT:    movl %ecx, %edi
+; SSSE3-NEXT:    addl %edx, %edi
+; SSSE3-NEXT:    setns %sil
+; SSSE3-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    addl %edx, %ecx
+; SSSE3-NEXT:    cmovol %esi, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %edx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movl %eax, %esi
+; SSSE3-NEXT:    addl %edx, %esi
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    addl %edx, %eax
+; SSSE3-NEXT:    cmovol %edi, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %r9d
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm0, %edx
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movl %edx, %esi
+; SSSE3-NEXT:    addl %r9d, %esi
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    addl %r9d, %edx
+; SSSE3-NEXT:    cmovol %edi, %edx
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movd %r8d, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
+; SSE41-NEXT:    pextrd $3, %xmm0, %r8d
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    movl %r8d, %esi
+; SSE41-NEXT:    addl %ecx, %esi
+; SSE41-NEXT:    setns %dl
+; SSE41-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSE41-NEXT:    addl %ecx, %r8d
+; SSE41-NEXT:    cmovol %edx, %r8d
+; SSE41-NEXT:    pextrd $2, %xmm1, %edx
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    xorl %esi, %esi
+; SSE41-NEXT:    movl %ecx, %edi
+; SSE41-NEXT:    addl %edx, %edi
+; SSE41-NEXT:    setns %sil
+; SSE41-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    addl %edx, %ecx
+; SSE41-NEXT:    cmovol %esi, %ecx
+; SSE41-NEXT:    movd %xmm1, %edx
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movl %eax, %esi
+; SSE41-NEXT:    addl %edx, %esi
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    addl %edx, %eax
+; SSE41-NEXT:    cmovol %edi, %eax
+; SSE41-NEXT:    pextrd $1, %xmm1, %r9d
+; SSE41-NEXT:    pextrd $1, %xmm0, %edx
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movl %edx, %esi
+; SSE41-NEXT:    addl %r9d, %esi
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    addl %r9d, %edx
+; SSE41-NEXT:    cmovol %edi, %edx
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX-NEXT:    vpextrd $3, %xmm0, %r9d
+; AVX-NEXT:    xorl %edx, %edx
+; AVX-NEXT:    movl %r9d, %esi
+; AVX-NEXT:    addl %ecx, %esi
+; AVX-NEXT:    setns %dl
+; AVX-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; AVX-NEXT:    addl %ecx, %r9d
+; AVX-NEXT:    cmovol %edx, %r9d
+; AVX-NEXT:    vpextrd $2, %xmm1, %edx
+; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %ecx, %edi
+; AVX-NEXT:    addl %edx, %edi
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; AVX-NEXT:    addl %edx, %ecx
+; AVX-NEXT:    cmovol %esi, %ecx
+; AVX-NEXT:    vmovd %xmm1, %r8d
+; AVX-NEXT:    vmovd %xmm0, %edx
+; AVX-NEXT:    xorl %edi, %edi
+; AVX-NEXT:    movl %edx, %esi
+; AVX-NEXT:    addl %r8d, %esi
+; AVX-NEXT:    setns %dil
+; AVX-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; AVX-NEXT:    addl %r8d, %edx
+; AVX-NEXT:    cmovol %edi, %edx
+; AVX-NEXT:    vpextrd $1, %xmm1, %r8d
+; AVX-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %eax, %edi
+; AVX-NEXT:    addl %r8d, %edi
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; AVX-NEXT:    addl %r8d, %eax
+; AVX-NEXT:    cmovol %esi, %eax
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $3, %r9d, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
+  ret <4 x i32> %z
+}
+
+define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
+; SSE2-LABEL: v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm2, %rcx
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    addq %rax, %rsi
+; SSE2-NEXT:    setns %dl
+; SSE2-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE2-NEXT:    addq %r8, %rdx
+; SSE2-NEXT:    addq %rax, %rcx
+; SSE2-NEXT:    cmovoq %rdx, %rcx
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movq %rsi, %rdx
+; SSE2-NEXT:    addq %rax, %rdx
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addq %r8, %rdi
+; SSE2-NEXT:    addq %rax, %rsi
+; SSE2-NEXT:    cmovoq %rdi, %rsi
+; SSE2-NEXT:    movq %rsi, %xmm1
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    psllq $32, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movq %xmm2, %rax
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movq %xmm2, %rcx
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    movq %rcx, %rsi
+; SSSE3-NEXT:    addq %rax, %rsi
+; SSSE3-NEXT:    setns %dl
+; SSSE3-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; SSSE3-NEXT:    addq %r8, %rdx
+; SSSE3-NEXT:    addq %rax, %rcx
+; SSSE3-NEXT:    cmovoq %rdx, %rcx
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    movq %xmm0, %rsi
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movq %rsi, %rdx
+; SSSE3-NEXT:    addq %rax, %rdx
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addq %r8, %rdi
+; SSSE3-NEXT:    addq %rax, %rsi
+; SSSE3-NEXT:    cmovoq %rdi, %rsi
+; SSSE3-NEXT:    movq %rsi, %xmm1
+; SSSE3-NEXT:    movq %rcx, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    movq %xmm0, %rcx
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    movq %rcx, %rsi
+; SSE41-NEXT:    addq %rax, %rsi
+; SSE41-NEXT:    setns %dl
+; SSE41-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE41-NEXT:    addq %r8, %rdx
+; SSE41-NEXT:    addq %rax, %rcx
+; SSE41-NEXT:    cmovoq %rdx, %rcx
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    pextrq $1, %xmm0, %rsi
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movq %rsi, %rdx
+; SSE41-NEXT:    addq %rax, %rdx
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addq %r8, %rdi
+; SSE41-NEXT:    addq %rax, %rsi
+; SSE41-NEXT:    cmovoq %rdi, %rsi
+; SSE41-NEXT:    movq %rsi, %xmm1
+; SSE41-NEXT:    movq %rcx, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    xorl %edx, %edx
+; AVX1-NEXT:    movq %rcx, %rsi
+; AVX1-NEXT:    addq %rax, %rsi
+; AVX1-NEXT:    setns %dl
+; AVX1-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX1-NEXT:    addq %r8, %rdx
+; AVX1-NEXT:    addq %rax, %rcx
+; AVX1-NEXT:    cmovoq %rdx, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT:    xorl %edi, %edi
+; AVX1-NEXT:    movq %rsi, %rdx
+; AVX1-NEXT:    addq %rax, %rdx
+; AVX1-NEXT:    setns %dil
+; AVX1-NEXT:    addq %r8, %rdi
+; AVX1-NEXT:    addq %rax, %rsi
+; AVX1-NEXT:    cmovoq %rdi, %rsi
+; AVX1-NEXT:    vmovq %rsi, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    movq %rcx, %rsi
+; AVX2-NEXT:    addq %rax, %rsi
+; AVX2-NEXT:    setns %dl
+; AVX2-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX2-NEXT:    addq %r8, %rdx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    cmovoq %rdx, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    movq %rsi, %rdx
+; AVX2-NEXT:    addq %rax, %rdx
+; AVX2-NEXT:    setns %dil
+; AVX2-NEXT:    addq %r8, %rdi
+; AVX2-NEXT:    addq %rax, %rsi
+; AVX2-NEXT:    cmovoq %rdi, %rsi
+; AVX2-NEXT:    vmovq %rsi, %xmm0
+; AVX2-NEXT:    vmovq %rcx, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    xorl %edx, %edx
+; AVX512-NEXT:    movq %rcx, %rsi
+; AVX512-NEXT:    addq %rax, %rsi
+; AVX512-NEXT:    setns %dl
+; AVX512-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX512-NEXT:    addq %r8, %rdx
+; AVX512-NEXT:    addq %rax, %rcx
+; AVX512-NEXT:    cmovoq %rdx, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT:    xorl %edi, %edi
+; AVX512-NEXT:    movq %rsi, %rdx
+; AVX512-NEXT:    addq %rax, %rdx
+; AVX512-NEXT:    setns %dil
+; AVX512-NEXT:    addq %r8, %rdi
+; AVX512-NEXT:    addq %rax, %rsi
+; AVX512-NEXT:    cmovoq %rdi, %rsi
+; AVX512-NEXT:    vmovq %rsi, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
+  ret <2 x i32> %z
+}
+
+define <4 x i24> @v4i24(<4 x i24> %x, <4 x i24> %y) nounwind {
+; SSE2-LABEL: v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %ecx
+; SSE2-NEXT:    pslld $8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %r8d
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    movl %r8d, %esi
+; SSE2-NEXT:    addl %ecx, %esi
+; SSE2-NEXT:    setns %dl
+; SSE2-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSE2-NEXT:    addl %ecx, %r8d
+; SSE2-NEXT:    cmovol %edx, %r8d
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    xorl %esi, %esi
+; SSE2-NEXT:    movl %ecx, %edi
+; SSE2-NEXT:    addl %edx, %edi
+; SSE2-NEXT:    setns %sil
+; SSE2-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    addl %edx, %ecx
+; SSE2-NEXT:    cmovol %esi, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %edx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movl %eax, %esi
+; SSE2-NEXT:    addl %edx, %esi
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    addl %edx, %eax
+; SSE2-NEXT:    cmovol %edi, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %r9d
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movd %xmm0, %edx
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movl %edx, %esi
+; SSE2-NEXT:    addl %r9d, %esi
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    addl %r9d, %edx
+; SSE2-NEXT:    cmovol %edi, %edx
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movd %r8d, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    psrad $8, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pslld $8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %ecx
+; SSSE3-NEXT:    pslld $8, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %r8d
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    movl %r8d, %esi
+; SSSE3-NEXT:    addl %ecx, %esi
+; SSSE3-NEXT:    setns %dl
+; SSSE3-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    addl %ecx, %r8d
+; SSSE3-NEXT:    cmovol %edx, %r8d
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movd %xmm0, %ecx
+; SSSE3-NEXT:    xorl %esi, %esi
+; SSSE3-NEXT:    movl %ecx, %edi
+; SSSE3-NEXT:    addl %edx, %edi
+; SSSE3-NEXT:    setns %sil
+; SSSE3-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    addl %edx, %ecx
+; SSSE3-NEXT:    cmovol %esi, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %edx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movl %eax, %esi
+; SSSE3-NEXT:    addl %edx, %esi
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    addl %edx, %eax
+; SSSE3-NEXT:    cmovol %edi, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %r9d
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm0, %edx
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movl %edx, %esi
+; SSSE3-NEXT:    addl %r9d, %esi
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    addl %r9d, %edx
+; SSSE3-NEXT:    cmovol %edi, %edx
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movd %r8d, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    psrad $8, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $8, %xmm1
+; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
+; SSE41-NEXT:    pslld $8, %xmm0
+; SSE41-NEXT:    pextrd $3, %xmm0, %r8d
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    movl %r8d, %esi
+; SSE41-NEXT:    addl %ecx, %esi
+; SSE41-NEXT:    setns %dl
+; SSE41-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSE41-NEXT:    addl %ecx, %r8d
+; SSE41-NEXT:    cmovol %edx, %r8d
+; SSE41-NEXT:    pextrd $2, %xmm1, %edx
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    xorl %esi, %esi
+; SSE41-NEXT:    movl %ecx, %edi
+; SSE41-NEXT:    addl %edx, %edi
+; SSE41-NEXT:    setns %sil
+; SSE41-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    addl %edx, %ecx
+; SSE41-NEXT:    cmovol %esi, %ecx
+; SSE41-NEXT:    movd %xmm1, %edx
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movl %eax, %esi
+; SSE41-NEXT:    addl %edx, %esi
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    addl %edx, %eax
+; SSE41-NEXT:    cmovol %edi, %eax
+; SSE41-NEXT:    pextrd $1, %xmm1, %r9d
+; SSE41-NEXT:    pextrd $1, %xmm0, %edx
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movl %edx, %esi
+; SSE41-NEXT:    addl %r9d, %esi
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    addl %r9d, %edx
+; SSE41-NEXT:    cmovol %edi, %edx
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT:    psrad $8, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v4i24:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX-NEXT:    vpextrd $3, %xmm0, %r9d
+; AVX-NEXT:    xorl %edx, %edx
+; AVX-NEXT:    movl %r9d, %esi
+; AVX-NEXT:    addl %ecx, %esi
+; AVX-NEXT:    setns %dl
+; AVX-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; AVX-NEXT:    addl %ecx, %r9d
+; AVX-NEXT:    cmovol %edx, %r9d
+; AVX-NEXT:    vpextrd $2, %xmm1, %edx
+; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %ecx, %edi
+; AVX-NEXT:    addl %edx, %edi
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; AVX-NEXT:    addl %edx, %ecx
+; AVX-NEXT:    cmovol %esi, %ecx
+; AVX-NEXT:    vmovd %xmm1, %r8d
+; AVX-NEXT:    vmovd %xmm0, %edx
+; AVX-NEXT:    xorl %edi, %edi
+; AVX-NEXT:    movl %edx, %esi
+; AVX-NEXT:    addl %r8d, %esi
+; AVX-NEXT:    setns %dil
+; AVX-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; AVX-NEXT:    addl %r8d, %edx
+; AVX-NEXT:    cmovol %edi, %edx
+; AVX-NEXT:    vpextrd $1, %xmm1, %r8d
+; AVX-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %eax, %edi
+; AVX-NEXT:    addl %r8d, %edi
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; AVX-NEXT:    addl %r8d, %eax
+; AVX-NEXT:    cmovol %esi, %eax
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $3, %r9d, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <4 x i24> @llvm.sadd.sat.v4i24(<4 x i24> %x, <4 x i24> %y)
+  ret <4 x i24> %z
+}
+
+define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
+; SSE-LABEL: v2i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r15
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %r13
+; SSE-NEXT:    pushq %r12
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movq %r8, %r13
+; SSE-NEXT:    adcq %r14, %r13
+; SSE-NEXT:    movq %r13, %r10
+; SSE-NEXT:    sarq $63, %r10
+; SSE-NEXT:    xorl %edi, %edi
+; SSE-NEXT:    testq %r13, %r13
+; SSE-NEXT:    setns %dil
+; SSE-NEXT:    movabsq $9223372036854775807, %r12 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE-NEXT:    leaq (%rdi,%r12), %r15
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    setns %r8b
+; SSE-NEXT:    cmpb %dil, %r8b
+; SSE-NEXT:    setne %dil
+; SSE-NEXT:    testq %r14, %r14
+; SSE-NEXT:    setns %bl
+; SSE-NEXT:    cmpb %bl, %r8b
+; SSE-NEXT:    sete %bl
+; SSE-NEXT:    testb %dil, %bl
+; SSE-NEXT:    cmoveq %r13, %r15
+; SSE-NEXT:    cmoveq %rcx, %r10
+; SSE-NEXT:    addq %r9, %rsi
+; SSE-NEXT:    movq %rdx, %rdi
+; SSE-NEXT:    adcq %r11, %rdi
+; SSE-NEXT:    setns %bl
+; SSE-NEXT:    movzbl %bl, %ebx
+; SSE-NEXT:    addq %rbx, %r12
+; SSE-NEXT:    movq %rdi, %rcx
+; SSE-NEXT:    sarq $63, %rcx
+; SSE-NEXT:    testq %r11, %r11
+; SSE-NEXT:    setns %r8b
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    setns %dl
+; SSE-NEXT:    cmpb %r8b, %dl
+; SSE-NEXT:    sete %r8b
+; SSE-NEXT:    cmpb %bl, %dl
+; SSE-NEXT:    setne %dl
+; SSE-NEXT:    testb %dl, %r8b
+; SSE-NEXT:    cmoveq %rsi, %rcx
+; SSE-NEXT:    cmoveq %rdi, %r12
+; SSE-NEXT:    movq %r15, 24(%rax)
+; SSE-NEXT:    movq %r10, 16(%rax)
+; SSE-NEXT:    movq %r12, 8(%rax)
+; SSE-NEXT:    movq %rcx, (%rax)
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r12
+; SSE-NEXT:    popq %r13
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %r15
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v2i128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %r15
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    pushq %r13
+; AVX-NEXT:    pushq %r12
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movq %r8, %r13
+; AVX-NEXT:    adcq %r14, %r13
+; AVX-NEXT:    movq %r13, %r10
+; AVX-NEXT:    sarq $63, %r10
+; AVX-NEXT:    xorl %edi, %edi
+; AVX-NEXT:    testq %r13, %r13
+; AVX-NEXT:    setns %dil
+; AVX-NEXT:    movabsq $9223372036854775807, %r12 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX-NEXT:    leaq (%rdi,%r12), %r15
+; AVX-NEXT:    testq %r8, %r8
+; AVX-NEXT:    setns %r8b
+; AVX-NEXT:    cmpb %dil, %r8b
+; AVX-NEXT:    setne %dil
+; AVX-NEXT:    testq %r14, %r14
+; AVX-NEXT:    setns %bl
+; AVX-NEXT:    cmpb %bl, %r8b
+; AVX-NEXT:    sete %bl
+; AVX-NEXT:    testb %dil, %bl
+; AVX-NEXT:    cmoveq %r13, %r15
+; AVX-NEXT:    cmoveq %rcx, %r10
+; AVX-NEXT:    addq %r9, %rsi
+; AVX-NEXT:    movq %rdx, %rdi
+; AVX-NEXT:    adcq %r11, %rdi
+; AVX-NEXT:    setns %bl
+; AVX-NEXT:    movzbl %bl, %ebx
+; AVX-NEXT:    addq %rbx, %r12
+; AVX-NEXT:    movq %rdi, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    testq %r11, %r11
+; AVX-NEXT:    setns %r8b
+; AVX-NEXT:    testq %rdx, %rdx
+; AVX-NEXT:    setns %dl
+; AVX-NEXT:    cmpb %r8b, %dl
+; AVX-NEXT:    sete %r8b
+; AVX-NEXT:    cmpb %bl, %dl
+; AVX-NEXT:    setne %dl
+; AVX-NEXT:    testb %dl, %r8b
+; AVX-NEXT:    cmoveq %rsi, %rcx
+; AVX-NEXT:    cmoveq %rdi, %r12
+; AVX-NEXT:    movq %r15, 24(%rax)
+; AVX-NEXT:    movq %r10, 16(%rax)
+; AVX-NEXT:    movq %r12, 8(%rax)
+; AVX-NEXT:    movq %rcx, (%rax)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    popq %r12
+; AVX-NEXT:    popq %r13
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    popq %r15
+; AVX-NEXT:    retq
+  %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
+  ret <2 x i128> %z
+}
diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll
index 7e6272998f358138e57f004eef60670becc5dd8d..239ab1d8a3c31093479f0e205ee10bd460f2c2a1 100644
--- a/test/CodeGen/X86/sandybridge-loads.ll
+++ b/test/CodeGen/X86/sandybridge-loads.ll
@@ -30,9 +30,10 @@ define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounw
 ; CHECK-LABEL: widestores:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vmovaps (%rsi), %ymm1
+; CHECK-NEXT:    vmovaps (%rsi), %xmm1
+; CHECK-NEXT:    vmovaps 16(%rsi), %xmm2
 ; CHECK-NEXT:    vmovaps %ymm0, (%rsi)
-; CHECK-NEXT:    vextractf128 $1, %ymm1, 16(%rdi)
+; CHECK-NEXT:    vmovaps %xmm2, 16(%rdi)
 ; CHECK-NEXT:    vmovaps %xmm1, (%rdi)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/sat-add.ll b/test/CodeGen/X86/sat-add.ll
index f0989e8b081af7d8e23b541c1fc4ab53ef3c5bdd..e09c2416d4896c4b55fe418adacc75dcab724f48 100644
--- a/test/CodeGen/X86/sat-add.ll
+++ b/test/CodeGen/X86/sat-add.ll
@@ -526,11 +526,9 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42]
 ; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4294967253,4294967253,4294967253,4294967253]
-; SSE41-NEXT:    pminud %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4294967254,4294967254,4294967254,4294967254]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm2
 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm2, %xmm0
 ; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
   %a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
@@ -570,10 +568,9 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    paddq {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -867,10 +864,9 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64>
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    paddq %xmm1, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
diff --git a/test/CodeGen/X86/scalar-fp-to-i64.ll b/test/CodeGen/X86/scalar-fp-to-i64.ll
index 77305f4303461eea4b2a0e3dc20c9e549b3b0020..7ed61f8fdc7d24b73da97f075e5605ad13b588f8 100644
--- a/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -1147,25 +1147,21 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ;
 ; SSE3_64_WIN-LABEL: x_to_u64:
 ; SSE3_64_WIN:       # %bb.0:
-; SSE3_64_WIN-NEXT:    subq $16, %rsp
+; SSE3_64_WIN-NEXT:    pushq %rax
 ; SSE3_64_WIN-NEXT:    fldt (%rcx)
 ; SSE3_64_WIN-NEXT:    flds __real@{{.*}}(%rip)
 ; SSE3_64_WIN-NEXT:    fld %st(1)
 ; SSE3_64_WIN-NEXT:    fsub %st(1)
-; SSE3_64_WIN-NEXT:    fisttpll {{[0-9]+}}(%rsp)
-; SSE3_64_WIN-NEXT:    fld %st(1)
+; SSE3_64_WIN-NEXT:    xorl %eax, %eax
+; SSE3_64_WIN-NEXT:    fxch %st(1)
+; SSE3_64_WIN-NEXT:    fucompi %st(2)
+; SSE3_64_WIN-NEXT:    fcmovnbe %st(1), %st(0)
+; SSE3_64_WIN-NEXT:    fstp %st(1)
 ; SSE3_64_WIN-NEXT:    fisttpll (%rsp)
-; SSE3_64_WIN-NEXT:    fucompi %st(1)
-; SSE3_64_WIN-NEXT:    fstp %st(0)
-; SSE3_64_WIN-NEXT:    jbe .LBB4_1
-; SSE3_64_WIN-NEXT:  # %bb.2:
-; SSE3_64_WIN-NEXT:    movq (%rsp), %rax
-; SSE3_64_WIN-NEXT:    addq $16, %rsp
-; SSE3_64_WIN-NEXT:    retq
-; SSE3_64_WIN-NEXT:  .LBB4_1:
-; SSE3_64_WIN-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE3_64_WIN-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
-; SSE3_64_WIN-NEXT:    addq $16, %rsp
+; SSE3_64_WIN-NEXT:    setbe %al
+; SSE3_64_WIN-NEXT:    shlq $63, %rax
+; SSE3_64_WIN-NEXT:    xorq (%rsp), %rax
+; SSE3_64_WIN-NEXT:    popq %rcx
 ; SSE3_64_WIN-NEXT:    retq
 ;
 ; SSE3_64_LIN-LABEL: x_to_u64:
@@ -1174,17 +1170,14 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; SSE3_64_LIN-NEXT:    flds {{.*}}(%rip)
 ; SSE3_64_LIN-NEXT:    fld %st(1)
 ; SSE3_64_LIN-NEXT:    fsub %st(1)
+; SSE3_64_LIN-NEXT:    xorl %eax, %eax
+; SSE3_64_LIN-NEXT:    fxch %st(1)
+; SSE3_64_LIN-NEXT:    fucompi %st(2)
+; SSE3_64_LIN-NEXT:    fcmovnbe %st(1), %st(0)
+; SSE3_64_LIN-NEXT:    fstp %st(1)
 ; SSE3_64_LIN-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; SSE3_64_LIN-NEXT:    fld %st(1)
-; SSE3_64_LIN-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; SSE3_64_LIN-NEXT:    fucompi %st(1)
-; SSE3_64_LIN-NEXT:    fstp %st(0)
-; SSE3_64_LIN-NEXT:    jbe .LBB4_1
-; SSE3_64_LIN-NEXT:  # %bb.2:
-; SSE3_64_LIN-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; SSE3_64_LIN-NEXT:    retq
-; SSE3_64_LIN-NEXT:  .LBB4_1:
-; SSE3_64_LIN-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE3_64_LIN-NEXT:    setbe %al
+; SSE3_64_LIN-NEXT:    shlq $63, %rax
 ; SSE3_64_LIN-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; SSE3_64_LIN-NEXT:    retq
 ;
@@ -1246,37 +1239,27 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ;
 ; SSE2_64_WIN-LABEL: x_to_u64:
 ; SSE2_64_WIN:       # %bb.0:
-; SSE2_64_WIN-NEXT:    subq $24, %rsp
+; SSE2_64_WIN-NEXT:    subq $16, %rsp
 ; SSE2_64_WIN-NEXT:    fldt (%rcx)
 ; SSE2_64_WIN-NEXT:    flds __real@{{.*}}(%rip)
 ; SSE2_64_WIN-NEXT:    fld %st(1)
 ; SSE2_64_WIN-NEXT:    fsub %st(1)
+; SSE2_64_WIN-NEXT:    xorl %eax, %eax
+; SSE2_64_WIN-NEXT:    fxch %st(1)
+; SSE2_64_WIN-NEXT:    fucompi %st(2)
+; SSE2_64_WIN-NEXT:    fcmovnbe %st(1), %st(0)
+; SSE2_64_WIN-NEXT:    fstp %st(1)
 ; SSE2_64_WIN-NEXT:    fnstcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; SSE2_64_WIN-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
 ; SSE2_64_WIN-NEXT:    movw $3199, {{[0-9]+}}(%rsp) # imm = 0xC7F
 ; SSE2_64_WIN-NEXT:    fldcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; SSE2_64_WIN-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
 ; SSE2_64_WIN-NEXT:    fistpll {{[0-9]+}}(%rsp)
 ; SSE2_64_WIN-NEXT:    fldcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    fnstcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
-; SSE2_64_WIN-NEXT:    movw $3199, {{[0-9]+}}(%rsp) # imm = 0xC7F
-; SSE2_64_WIN-NEXT:    fldcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    fld %st(1)
-; SSE2_64_WIN-NEXT:    fistpll {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    fldcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    fucompi %st(1)
-; SSE2_64_WIN-NEXT:    fstp %st(0)
-; SSE2_64_WIN-NEXT:    jbe .LBB4_1
-; SSE2_64_WIN-NEXT:  # %bb.2:
-; SSE2_64_WIN-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2_64_WIN-NEXT:    addq $24, %rsp
-; SSE2_64_WIN-NEXT:    retq
-; SSE2_64_WIN-NEXT:  .LBB4_1:
-; SSE2_64_WIN-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE2_64_WIN-NEXT:    setbe %al
+; SSE2_64_WIN-NEXT:    shlq $63, %rax
 ; SSE2_64_WIN-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
-; SSE2_64_WIN-NEXT:    addq $24, %rsp
+; SSE2_64_WIN-NEXT:    addq $16, %rsp
 ; SSE2_64_WIN-NEXT:    retq
 ;
 ; SSE2_64_LIN-LABEL: x_to_u64:
@@ -1285,29 +1268,20 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; SSE2_64_LIN-NEXT:    flds {{.*}}(%rip)
 ; SSE2_64_LIN-NEXT:    fld %st(1)
 ; SSE2_64_LIN-NEXT:    fsub %st(1)
+; SSE2_64_LIN-NEXT:    xorl %eax, %eax
+; SSE2_64_LIN-NEXT:    fxch %st(1)
+; SSE2_64_LIN-NEXT:    fucompi %st(2)
+; SSE2_64_LIN-NEXT:    fcmovnbe %st(1), %st(0)
+; SSE2_64_LIN-NEXT:    fstp %st(1)
 ; SSE2_64_LIN-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2_64_LIN-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2_64_LIN-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
 ; SSE2_64_LIN-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2_64_LIN-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
 ; SSE2_64_LIN-NEXT:    fistpll -{{[0-9]+}}(%rsp)
 ; SSE2_64_LIN-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2_64_LIN-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
-; SSE2_64_LIN-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    fld %st(1)
-; SSE2_64_LIN-NEXT:    fistpll -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    fucompi %st(1)
-; SSE2_64_LIN-NEXT:    fstp %st(0)
-; SSE2_64_LIN-NEXT:    jbe .LBB4_1
-; SSE2_64_LIN-NEXT:  # %bb.2:
-; SSE2_64_LIN-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; SSE2_64_LIN-NEXT:    retq
-; SSE2_64_LIN-NEXT:  .LBB4_1:
-; SSE2_64_LIN-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE2_64_LIN-NEXT:    setbe %al
+; SSE2_64_LIN-NEXT:    shlq $63, %rax
 ; SSE2_64_LIN-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; SSE2_64_LIN-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/scalar_widen_div.ll b/test/CodeGen/X86/scalar_widen_div.ll
index 46781649cecd4df9a8039c28f766046fd6e3c1af..11dda4bc77a1eba04887a5399deb5816df0ae442 100644
--- a/test/CodeGen/X86/scalar_widen_div.ll
+++ b/test/CodeGen/X86/scalar_widen_div.ll
@@ -237,12 +237,12 @@ define <3 x i64> @test_ulong_div(<3 x i64> %num, <3 x i64> %div) {
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq %rcx
 ; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq %r8
 ; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movq %r10, %rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq %r9
 ; CHECK-NEXT:    movq %rax, %rdi
 ; CHECK-NEXT:    movq %rcx, %rax
@@ -372,22 +372,22 @@ define <5 x i64> @test_ulong_rem(<5 x i64> %num, <5 x i64> %rem) {
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rdx, %xmm0
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rdx, %xmm1
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rdx, %xmm0
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rdx, %xmm2
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movq %r9, %rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rdx, 32(%rdi)
 ; CHECK-NEXT:    movdqa %xmm2, 16(%rdi)
diff --git a/test/CodeGen/X86/scalarize-fp.ll b/test/CodeGen/X86/scalarize-fp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d9606b8735254ac3aff05e2555fcf888e0f3d1dd
--- /dev/null
+++ b/test/CodeGen/X86/scalarize-fp.ll
@@ -0,0 +1,383 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=ALL,AVX
+
+define <4 x float> @fadd_op1_constant_v4f32(float %x) nounwind {
+; SSE-LABEL: fadd_op1_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fadd_op1_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fadd <4 x float> %v, <float 42.0, float undef, float undef, float undef>
+  ret <4 x float> %b
+}
+
+define <4 x float> @load_fadd_op1_constant_v4f32(float* %p) nounwind {
+; SSE-LABEL: load_fadd_op1_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fadd_op1_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x = load float, float* %p
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fadd <4 x float> %v, <float 42.0, float undef, float undef, float undef>
+  ret <4 x float> %b
+}
+
+define <4 x float> @fsub_op0_constant_v4f32(float %x) nounwind {
+; SSE-LABEL: fsub_op0_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    subps %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fsub_op0_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fsub <4 x float> <float 42.0, float undef, float undef, float undef>, %v
+  ret <4 x float> %b
+}
+
+define <4 x float> @load_fsub_op0_constant_v4f32(float* %p) nounwind {
+; SSE-LABEL: load_fsub_op0_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    subps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fsub_op0_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %x = load float, float* %p
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fsub <4 x float> <float 42.0, float undef, float undef, float undef>, %v
+  ret <4 x float> %b
+}
+
+define <4 x float> @fmul_op1_constant_v4f32(float %x) nounwind {
+; SSE-LABEL: fmul_op1_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    mulps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fmul_op1_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fmul <4 x float> %v, <float 42.0, float undef, float undef, float undef>
+  ret <4 x float> %b
+}
+
+define <4 x float> @load_fmul_op1_constant_v4f32(float* %p) nounwind {
+; SSE-LABEL: load_fmul_op1_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    mulps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fmul_op1_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x = load float, float* %p
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fmul <4 x float> %v, <float 42.0, float undef, float undef, float undef>
+  ret <4 x float> %b
+}
+
+define <4 x float> @fdiv_op1_constant_v4f32(float %x) nounwind {
+; SSE-LABEL: fdiv_op1_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    divps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fdiv_op1_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fdiv <4 x float> %v, <float 42.0, float undef, float undef, float undef>
+  ret <4 x float> %b
+}
+
+define <4 x float> @load_fdiv_op1_constant_v4f32(float* %p) nounwind {
+; SSE-LABEL: load_fdiv_op1_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    divps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fdiv_op1_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x = load float, float* %p
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fdiv <4 x float> %v, <float 42.0, float undef, float undef, float undef>
+  ret <4 x float> %b
+}
+
+define <4 x float> @fdiv_op0_constant_v4f32(float %x) nounwind {
+; SSE-LABEL: fdiv_op0_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    divps %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fdiv_op0_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fdiv <4 x float> <float 42.0, float undef, float undef, float undef>, %v
+  ret <4 x float> %b
+}
+
+define <4 x float> @load_fdiv_op0_constant_v4f32(float* %p) nounwind {
+; SSE-LABEL: load_fdiv_op0_constant_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    divps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fdiv_op0_constant_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %x = load float, float* %p
+  %v = insertelement <4 x float> undef, float %x, i32 0
+  %b = fdiv <4 x float> <float 42.0, float undef, float undef, float undef>, %v
+  ret <4 x float> %b
+}
+
+define <4 x double> @fadd_op1_constant_v4f64(double %x) nounwind {
+; SSE-LABEL: fadd_op1_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fadd_op1_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fadd <4 x double> %v, <double 42.0, double undef, double undef, double undef>
+  ret <4 x double> %b
+}
+
+define <4 x double> @load_fadd_op1_constant_v4f64(double* %p) nounwind {
+; SSE-LABEL: load_fadd_op1_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fadd_op1_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %x = load double, double* %p
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fadd <4 x double> %v, <double 42.0, double undef, double undef, double undef>
+  ret <4 x double> %b
+}
+
+define <4 x double> @fsub_op0_constant_v4f64(double %x) nounwind {
+; SSE-LABEL: fsub_op0_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    subpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fsub_op0_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fsub <4 x double> <double 42.0, double undef, double undef, double undef>, %v
+  ret <4 x double> %b
+}
+
+define <4 x double> @load_fsub_op0_constant_v4f64(double* %p) nounwind {
+; SSE-LABEL: load_fsub_op0_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    subpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fsub_op0_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    retq
+  %x = load double, double* %p
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fsub <4 x double> <double 42.0, double undef, double undef, double undef>, %v
+  ret <4 x double> %b
+}
+
+define <4 x double> @fmul_op1_constant_v4f64(double %x) nounwind {
+; SSE-LABEL: fmul_op1_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    mulpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fmul_op1_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fmul <4 x double> %v, <double 42.0, double undef, double undef, double undef>
+  ret <4 x double> %b
+}
+
+define <4 x double> @load_fmul_op1_constant_v4f64(double* %p) nounwind {
+; SSE-LABEL: load_fmul_op1_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    mulpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fmul_op1_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %x = load double, double* %p
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fmul <4 x double> %v, <double 42.0, double undef, double undef, double undef>
+  ret <4 x double> %b
+}
+
+define <4 x double> @fdiv_op1_constant_v4f64(double %x) nounwind {
+; SSE-LABEL: fdiv_op1_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    divpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fdiv_op1_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fdiv <4 x double> %v, <double 42.0, double undef, double undef, double undef>
+  ret <4 x double> %b
+}
+
+define <4 x double> @load_fdiv_op1_constant_v4f64(double* %p) nounwind {
+; SSE-LABEL: load_fdiv_op1_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    divpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fdiv_op1_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %x = load double, double* %p
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fdiv <4 x double> %v, <double 42.0, double undef, double undef, double undef>
+  ret <4 x double> %b
+}
+
+define <4 x double> @fdiv_op0_constant_v4f64(double %x) nounwind {
+; SSE-LABEL: fdiv_op0_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    divpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fdiv_op0_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fdiv <4 x double> <double 42.0, double undef, double undef, double undef>, %v
+  ret <4 x double> %b
+}
+
+define <4 x double> @load_fdiv_op0_constant_v4f64(double* %p) nounwind {
+; SSE-LABEL: load_fdiv_op0_constant_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    divpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_fdiv_op0_constant_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    retq
+  %x = load double, double* %p
+  %v = insertelement <4 x double> undef, double %x, i32 0
+  %b = fdiv <4 x double> <double 42.0, double undef, double undef, double undef>, %v
+  ret <4 x double> %b
+}
+
diff --git a/test/CodeGen/X86/schedule-x86-64-shld.ll b/test/CodeGen/X86/schedule-x86-64-shld.ll
index 0e66329f7b457895d9567708739892a9e8068685..b0faf4cc5fa68f853fec6c7246b757cf10d76021 100644
--- a/test/CodeGen/X86/schedule-x86-64-shld.ll
+++ b/test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -170,7 +170,7 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT:    negb %cl # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BDVER12-NEXT:    shrq %cl, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
@@ -181,7 +181,7 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    negl %ecx # sched: [1:0.50]
+; BTVER2-NEXT:    negb %cl # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shrq %cl, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
@@ -246,7 +246,7 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT:    negb %cl # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BDVER12-NEXT:    shlq %cl, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
@@ -257,7 +257,7 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    negl %ecx # sched: [1:0.50]
+; BTVER2-NEXT:    negb %cl # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shlq %cl, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
@@ -321,11 +321,11 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
 ; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT:    shlq %cl, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT:    negb %cl # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BDVER12-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
 ; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
 ; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_cl:
@@ -333,7 +333,7 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
 ; BTVER2-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    shlq %cl, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    negl %ecx # sched: [1:0.50]
+; BTVER2-NEXT:    negb %cl # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
@@ -361,7 +361,7 @@ define void @lshift_mem(i64 %a) nounwind readnone {
 ; BDVER12-NEXT:    shrq $54, %rdi # sched: [1:0.50]
 ; BDVER12-NEXT:    shlq $10, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
 ; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem:
@@ -419,7 +419,7 @@ define void @lshift_mem_b(i64 %b) nounwind readnone {
 ; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
 ; BDVER12-NEXT:    shrq $54, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_b:
@@ -451,7 +451,7 @@ define void @lshift_mem_b_optsize(i64 %b) nounwind readnone optsize {
 ; BDVER12:       # %bb.0: # %entry
 ; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
 ; BDVER12-NEXT:    shrdq $54, %rdi, %rax # sched: [4:3.00]
-; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_b_optsize:
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
index 6b8ad906fec6788c6bf70778aa1e847cdde96631..917f1a7bfa3228adbfff00a95adfe0b0670e2a53 100644
--- a/test/CodeGen/X86/schedule-x86_32.ll
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -644,7 +644,7 @@ define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
 ;
 ; BDVER2-LABEL: test_bound:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pushl %esi # sched: [1:0.50]
+; BDVER2-NEXT:    pushl %esi # sched: [1:1.00]
 ; BDVER2-NEXT:    .cfi_def_cfa_offset 8
 ; BDVER2-NEXT:    .cfi_offset %esi, -8
 ; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
@@ -1924,11 +1924,11 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    popw %ax # sched: [5:0.50]
 ; BDVER2-NEXT:    popw (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    pushw %ax # sched: [1:0.50]
+; BDVER2-NEXT:    pushw %ax # sched: [1:1.00]
 ; BDVER2-NEXT:    pushw (%ecx) # sched: [6:1.00]
 ; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:0.50]
-; BDVER2-NEXT:    pushw $7 # sched: [1:0.50]
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    pushw $7 # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
@@ -2091,11 +2091,11 @@ define i32 @test_pop_push_32(i32 %a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    popl %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    popl (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    pushl %eax # sched: [1:0.50]
+; BDVER2-NEXT:    pushl %eax # sched: [1:1.00]
 ; BDVER2-NEXT:    pushl (%ecx) # sched: [6:1.00]
 ; BDVER2-NEXT:    pushl $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:0.50]
-; BDVER2-NEXT:    pushl $7 # sched: [1:0.50]
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    pushl $7 # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
@@ -2218,8 +2218,8 @@ define void @test_popa_popf_pusha_pushf() optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    popal # sched: [5:0.50]
 ; BDVER2-NEXT:    popfl # sched: [5:0.50]
-; BDVER2-NEXT:    pushal # sched: [1:0.50]
-; BDVER2-NEXT:    pushfl # sched: [1:0.50]
+; BDVER2-NEXT:    pushal # sched: [1:1.00]
+; BDVER2-NEXT:    pushfl # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll
index 18541184eb8486afed092feb1258b8b33505796f..c2772cc23183c80d9a76570fb4124f80e9bd11e7 100644
--- a/test/CodeGen/X86/schedule-x86_64.ll
+++ b/test/CodeGen/X86/schedule-x86_64.ll
@@ -8475,8 +8475,8 @@ define void @test_movnti(i32 %a0, i32 *%a1, i64 %a2, i64 *%a3) optsize {
 ; BDVER2-LABEL: test_movnti:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    movntil %edi, (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    movntiq %rdx, (%rcx) # sched: [1:0.50]
+; BDVER2-NEXT:    movntil %edi, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    movntiq %rdx, (%rcx) # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -10658,11 +10658,11 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    popw %ax # sched: [5:0.50]
 ; BDVER2-NEXT:    popw (%rsi) # sched: [6:1.00]
-; BDVER2-NEXT:    pushw %di # sched: [1:0.50]
+; BDVER2-NEXT:    pushw %di # sched: [1:1.00]
 ; BDVER2-NEXT:    pushw (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:0.50]
-; BDVER2-NEXT:    pushw $7 # sched: [1:0.50]
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    pushw $7 # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -10804,11 +10804,11 @@ define i64 @test_pop_push_64(i64 %a0, i64 *%a1) optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    popq %rax # sched: [5:0.50]
 ; BDVER2-NEXT:    popq (%rsi) # sched: [6:1.00]
-; BDVER2-NEXT:    pushq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    pushq %rdi # sched: [1:1.00]
 ; BDVER2-NEXT:    pushq (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    pushq $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:0.50]
-; BDVER2-NEXT:    pushq $7 # sched: [1:0.50]
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    pushq $7 # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -10910,7 +10910,7 @@ define void @test_popf_pushf() optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    popfq # sched: [5:0.50]
-; BDVER2-NEXT:    pushfq # sched: [1:0.50]
+; BDVER2-NEXT:    pushfq # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -15157,18 +15157,18 @@ define void @test_setcc(i8 %a0, i8 *%a1) optsize {
 ; BDVER2-NEXT:    setge %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    setle %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    setg %dil # sched: [1:0.50]
-; BDVER2-NEXT:    seto (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    setno (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    setb (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    setae (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    sete (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    setne (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    setbe (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    seta (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    sets (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    setns (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    setp (%rsi) # sched: [1:0.50]
-; BDVER2-NEXT:    setnp (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    seto (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setno (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setb (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setae (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    sete (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setne (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setbe (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    seta (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    sets (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setns (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setp (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setnp (%rsi) # sched: [1:1.00]
 ; BDVER2-NEXT:    setl (%rsi) # sched: [1:1.00]
 ; BDVER2-NEXT:    setge (%rsi) # sched: [1:1.00]
 ; BDVER2-NEXT:    setle (%rsi) # sched: [1:1.00]
diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
index 811bd9bd031c2fc850cf4c7bab03bc7e50e5a03b..cb8571bf61a18225d22a738425c83bba42c69eca 100644
--- a/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -12,450 +12,230 @@
 define i256 @test1(i256 %a) nounwind {
 ; ILP-LABEL: test1:
 ; ILP:       # %bb.0:
-; ILP-NEXT:    pushq %rbp
-; ILP-NEXT:    pushq %r15
 ; ILP-NEXT:    pushq %r14
-; ILP-NEXT:    pushq %r13
-; ILP-NEXT:    pushq %r12
 ; ILP-NEXT:    pushq %rbx
-; ILP-NEXT:    movq %rcx, %r9
-; ILP-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; ILP-NEXT:    xorl %eax, %eax
-; ILP-NEXT:    addq $1, %rsi
-; ILP-NEXT:    adcq $0, %rdx
-; ILP-NEXT:    adcq $0, %r9
-; ILP-NEXT:    adcq $0, %r8
-; ILP-NEXT:    leal 1(%rsi,%rsi), %edi
-; ILP-NEXT:    movl $1, %ebp
+; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorl %r8d, %r8d
+; ILP-NEXT:    incl %esi
+; ILP-NEXT:    addb %sil, %sil
+; ILP-NEXT:    orb $1, %sil
+; ILP-NEXT:    movl $1, %r10d
 ; ILP-NEXT:    xorl %r14d, %r14d
-; ILP-NEXT:    movl %edi, %ecx
-; ILP-NEXT:    shldq %cl, %rbp, %r14
-; ILP-NEXT:    movl $1, %r11d
-; ILP-NEXT:    shlq %cl, %r11
-; ILP-NEXT:    movb $-128, %r10b
-; ILP-NEXT:    subb %dil, %r10b
-; ILP-NEXT:    movq %r9, %r13
-; ILP-NEXT:    movl %r10d, %ecx
-; ILP-NEXT:    shlq %cl, %r13
-; ILP-NEXT:    movl $1, %r12d
-; ILP-NEXT:    shrdq %cl, %rax, %r12
-; ILP-NEXT:    xorl %r15d, %r15d
-; ILP-NEXT:    movl %edi, %ecx
-; ILP-NEXT:    shldq %cl, %r15, %r15
-; ILP-NEXT:    movq %rsi, %rbx
-; ILP-NEXT:    shrdq %cl, %rdx, %rbx
-; ILP-NEXT:    shrq %cl, %rdx
-; ILP-NEXT:    addb $-128, %cl
-; ILP-NEXT:    shrdq %cl, %r8, %r9
-; ILP-NEXT:    testb $64, %dil
-; ILP-NEXT:    cmovneq %r11, %r14
-; ILP-NEXT:    cmoveq %rbx, %rdx
-; ILP-NEXT:    cmovneq %rax, %r15
-; ILP-NEXT:    cmovneq %rax, %r11
-; ILP-NEXT:    testb $64, %r10b
-; ILP-NEXT:    cmovneq %rax, %r12
-; ILP-NEXT:    cmovneq %rax, %r13
-; ILP-NEXT:    movl $1, %ebx
-; ILP-NEXT:    shlq %cl, %rbx
-; ILP-NEXT:    orl %edx, %r13d
-; ILP-NEXT:    xorl %edx, %edx
-; ILP-NEXT:    movl $1, %ebp
-; ILP-NEXT:    shldq %cl, %rbp, %rdx
-; ILP-NEXT:    shrq %cl, %r8
-; ILP-NEXT:    testb $64, %cl
-; ILP-NEXT:    cmoveq %r9, %r8
-; ILP-NEXT:    cmovneq %rbx, %rdx
-; ILP-NEXT:    cmovneq %rax, %rbx
-; ILP-NEXT:    testb %dil, %dil
-; ILP-NEXT:    cmovsq %rax, %r14
-; ILP-NEXT:    cmovsq %rax, %r11
-; ILP-NEXT:    jns .LBB0_2
-; ILP-NEXT:  # %bb.1:
-; ILP-NEXT:    movl %r8d, %r13d
-; ILP-NEXT:  .LBB0_2:
-; ILP-NEXT:    je .LBB0_4
-; ILP-NEXT:  # %bb.3:
-; ILP-NEXT:    movl %r13d, %esi
-; ILP-NEXT:  .LBB0_4:
-; ILP-NEXT:    cmovnsq %r12, %rbx
-; ILP-NEXT:    cmoveq %rax, %rbx
-; ILP-NEXT:    cmovnsq %r15, %rdx
-; ILP-NEXT:    cmoveq %rax, %rdx
-; ILP-NEXT:    testb $1, %sil
-; ILP-NEXT:    cmovneq %rax, %rdx
-; ILP-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; ILP-NEXT:    movq %rdx, 24(%rax)
-; ILP-NEXT:    cmovneq %rax, %rbx
-; ILP-NEXT:    movq %rbx, 16(%rax)
-; ILP-NEXT:    cmovneq %rax, %r14
+; ILP-NEXT:    movl %esi, %ecx
+; ILP-NEXT:    shldq %cl, %r10, %r14
+; ILP-NEXT:    movl $1, %edx
+; ILP-NEXT:    shlq %cl, %rdx
+; ILP-NEXT:    leal -128(%rsi), %r9d
+; ILP-NEXT:    movb $-128, %r11b
+; ILP-NEXT:    xorl %ebx, %ebx
+; ILP-NEXT:    movl %r9d, %ecx
+; ILP-NEXT:    shldq %cl, %r10, %rbx
+; ILP-NEXT:    testb $64, %sil
+; ILP-NEXT:    cmovneq %rdx, %r14
+; ILP-NEXT:    cmovneq %r8, %rdx
+; ILP-NEXT:    movl $1, %edi
+; ILP-NEXT:    shlq %cl, %rdi
+; ILP-NEXT:    subb %sil, %r11b
+; ILP-NEXT:    movl %r11d, %ecx
+; ILP-NEXT:    shrdq %cl, %r8, %r10
+; ILP-NEXT:    testb $64, %r11b
+; ILP-NEXT:    cmovneq %r8, %r10
+; ILP-NEXT:    testb $64, %r9b
+; ILP-NEXT:    cmovneq %rdi, %rbx
+; ILP-NEXT:    cmovneq %r8, %rdi
+; ILP-NEXT:    testb %sil, %sil
+; ILP-NEXT:    cmovsq %r8, %r14
+; ILP-NEXT:    cmovsq %r8, %rdx
 ; ILP-NEXT:    movq %r14, 8(%rax)
-; ILP-NEXT:    cmovneq %rax, %r11
-; ILP-NEXT:    movq %r11, (%rax)
+; ILP-NEXT:    movq %rdx, (%rax)
+; ILP-NEXT:    cmovnsq %r8, %rbx
+; ILP-NEXT:    cmoveq %r8, %rbx
+; ILP-NEXT:    movq %rbx, 24(%rax)
+; ILP-NEXT:    cmovnsq %r10, %rdi
+; ILP-NEXT:    cmoveq %r8, %rdi
+; ILP-NEXT:    movq %rdi, 16(%rax)
 ; ILP-NEXT:    popq %rbx
-; ILP-NEXT:    popq %r12
-; ILP-NEXT:    popq %r13
 ; ILP-NEXT:    popq %r14
-; ILP-NEXT:    popq %r15
-; ILP-NEXT:    popq %rbp
 ; ILP-NEXT:    retq
 ;
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
-; HYBRID-NEXT:    pushq %rbp
-; HYBRID-NEXT:    pushq %r15
-; HYBRID-NEXT:    pushq %r14
-; HYBRID-NEXT:    pushq %r13
-; HYBRID-NEXT:    pushq %r12
-; HYBRID-NEXT:    pushq %rbx
-; HYBRID-NEXT:    movq %rcx, %r9
 ; HYBRID-NEXT:    movq %rdi, %rax
-; HYBRID-NEXT:    addq $1, %rsi
-; HYBRID-NEXT:    adcq $0, %rdx
-; HYBRID-NEXT:    adcq $0, %r9
-; HYBRID-NEXT:    adcq $0, %r8
-; HYBRID-NEXT:    xorl %r10d, %r10d
-; HYBRID-NEXT:    leal 1(%rsi,%rsi), %edi
-; HYBRID-NEXT:    xorl %r14d, %r14d
-; HYBRID-NEXT:    movl %edi, %ecx
-; HYBRID-NEXT:    shldq %cl, %r14, %r14
-; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    cmovneq %r10, %r14
-; HYBRID-NEXT:    movl $1, %ebp
-; HYBRID-NEXT:    movl $1, %r12d
-; HYBRID-NEXT:    shlq %cl, %r12
-; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    movq %r12, %r11
-; HYBRID-NEXT:    cmovneq %r10, %r11
-; HYBRID-NEXT:    movq %rsi, %rbx
-; HYBRID-NEXT:    shrdq %cl, %rdx, %rbx
-; HYBRID-NEXT:    shrq %cl, %rdx
-; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    cmoveq %rbx, %rdx
-; HYBRID-NEXT:    xorl %r15d, %r15d
-; HYBRID-NEXT:    shldq %cl, %rbp, %r15
-; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    cmovneq %r12, %r15
+; HYBRID-NEXT:    incl %esi
+; HYBRID-NEXT:    addb %sil, %sil
+; HYBRID-NEXT:    orb $1, %sil
 ; HYBRID-NEXT:    movb $-128, %cl
-; HYBRID-NEXT:    subb %dil, %cl
-; HYBRID-NEXT:    movq %r9, %r13
-; HYBRID-NEXT:    shlq %cl, %r13
-; HYBRID-NEXT:    movl $1, %r12d
-; HYBRID-NEXT:    shrdq %cl, %r10, %r12
-; HYBRID-NEXT:    testb $64, %cl
-; HYBRID-NEXT:    cmovneq %r10, %r12
-; HYBRID-NEXT:    cmovneq %r10, %r13
-; HYBRID-NEXT:    orl %edx, %r13d
-; HYBRID-NEXT:    movl %edi, %ecx
-; HYBRID-NEXT:    addb $-128, %cl
+; HYBRID-NEXT:    subb %sil, %cl
+; HYBRID-NEXT:    xorl %r8d, %r8d
+; HYBRID-NEXT:    movl $1, %r11d
+; HYBRID-NEXT:    movl $1, %r9d
 ; HYBRID-NEXT:    shrdq %cl, %r8, %r9
-; HYBRID-NEXT:    shrq %cl, %r8
-; HYBRID-NEXT:    xorl %edx, %edx
-; HYBRID-NEXT:    shldq %cl, %rbp, %rdx
-; HYBRID-NEXT:    shlq %cl, %rbp
 ; HYBRID-NEXT:    testb $64, %cl
-; HYBRID-NEXT:    cmovneq %rbp, %rdx
-; HYBRID-NEXT:    cmoveq %r9, %r8
-; HYBRID-NEXT:    cmovneq %r10, %rbp
-; HYBRID-NEXT:    testb %dil, %dil
-; HYBRID-NEXT:    jns .LBB0_2
-; HYBRID-NEXT:  # %bb.1:
-; HYBRID-NEXT:    movl %r8d, %r13d
-; HYBRID-NEXT:  .LBB0_2:
-; HYBRID-NEXT:    je .LBB0_4
-; HYBRID-NEXT:  # %bb.3:
-; HYBRID-NEXT:    movl %r13d, %esi
-; HYBRID-NEXT:  .LBB0_4:
-; HYBRID-NEXT:    cmovsq %r10, %r15
-; HYBRID-NEXT:    cmovnsq %r12, %rbp
-; HYBRID-NEXT:    cmoveq %r10, %rbp
-; HYBRID-NEXT:    cmovnsq %r14, %rdx
-; HYBRID-NEXT:    cmoveq %r10, %rdx
-; HYBRID-NEXT:    cmovsq %r10, %r11
-; HYBRID-NEXT:    testb $1, %sil
-; HYBRID-NEXT:    cmovneq %rax, %rdx
-; HYBRID-NEXT:    movq %rdx, 24(%rax)
-; HYBRID-NEXT:    cmovneq %rax, %rbp
-; HYBRID-NEXT:    movq %rbp, 16(%rax)
-; HYBRID-NEXT:    cmovneq %rax, %r15
-; HYBRID-NEXT:    movq %r15, 8(%rax)
-; HYBRID-NEXT:    cmovneq %rax, %r11
+; HYBRID-NEXT:    cmovneq %r8, %r9
+; HYBRID-NEXT:    xorl %r10d, %r10d
+; HYBRID-NEXT:    movl %esi, %ecx
+; HYBRID-NEXT:    shldq %cl, %r11, %r10
+; HYBRID-NEXT:    leal -128(%rsi), %ecx
+; HYBRID-NEXT:    xorl %edi, %edi
+; HYBRID-NEXT:    shldq %cl, %r11, %rdi
+; HYBRID-NEXT:    movl $1, %edx
+; HYBRID-NEXT:    shlq %cl, %rdx
+; HYBRID-NEXT:    testb $64, %cl
+; HYBRID-NEXT:    cmovneq %rdx, %rdi
+; HYBRID-NEXT:    cmovneq %r8, %rdx
+; HYBRID-NEXT:    movl %esi, %ecx
+; HYBRID-NEXT:    shlq %cl, %r11
+; HYBRID-NEXT:    testb $64, %sil
+; HYBRID-NEXT:    cmovneq %r11, %r10
+; HYBRID-NEXT:    cmovneq %r8, %r11
+; HYBRID-NEXT:    testb %sil, %sil
+; HYBRID-NEXT:    cmovsq %r8, %r10
+; HYBRID-NEXT:    movq %r10, 8(%rax)
+; HYBRID-NEXT:    cmovsq %r8, %r11
 ; HYBRID-NEXT:    movq %r11, (%rax)
-; HYBRID-NEXT:    popq %rbx
-; HYBRID-NEXT:    popq %r12
-; HYBRID-NEXT:    popq %r13
-; HYBRID-NEXT:    popq %r14
-; HYBRID-NEXT:    popq %r15
-; HYBRID-NEXT:    popq %rbp
+; HYBRID-NEXT:    cmovnsq %r8, %rdi
+; HYBRID-NEXT:    cmoveq %r8, %rdi
+; HYBRID-NEXT:    movq %rdi, 24(%rax)
+; HYBRID-NEXT:    cmovnsq %r9, %rdx
+; HYBRID-NEXT:    cmoveq %r8, %rdx
+; HYBRID-NEXT:    movq %rdx, 16(%rax)
 ; HYBRID-NEXT:    retq
 ;
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
-; BURR-NEXT:    pushq %rbp
-; BURR-NEXT:    pushq %r15
-; BURR-NEXT:    pushq %r14
-; BURR-NEXT:    pushq %r13
-; BURR-NEXT:    pushq %r12
-; BURR-NEXT:    pushq %rbx
-; BURR-NEXT:    movq %rcx, %r9
 ; BURR-NEXT:    movq %rdi, %rax
-; BURR-NEXT:    addq $1, %rsi
-; BURR-NEXT:    adcq $0, %rdx
-; BURR-NEXT:    adcq $0, %r9
-; BURR-NEXT:    adcq $0, %r8
-; BURR-NEXT:    xorl %r10d, %r10d
-; BURR-NEXT:    leal 1(%rsi,%rsi), %edi
-; BURR-NEXT:    xorl %r14d, %r14d
-; BURR-NEXT:    movl %edi, %ecx
-; BURR-NEXT:    shldq %cl, %r14, %r14
-; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    cmovneq %r10, %r14
-; BURR-NEXT:    movl $1, %ebp
-; BURR-NEXT:    movl $1, %r12d
-; BURR-NEXT:    shlq %cl, %r12
-; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    movq %r12, %r11
-; BURR-NEXT:    cmovneq %r10, %r11
-; BURR-NEXT:    movq %rsi, %rbx
-; BURR-NEXT:    shrdq %cl, %rdx, %rbx
-; BURR-NEXT:    shrq %cl, %rdx
-; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    cmoveq %rbx, %rdx
-; BURR-NEXT:    xorl %r15d, %r15d
-; BURR-NEXT:    shldq %cl, %rbp, %r15
-; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    cmovneq %r12, %r15
+; BURR-NEXT:    incl %esi
+; BURR-NEXT:    addb %sil, %sil
+; BURR-NEXT:    orb $1, %sil
 ; BURR-NEXT:    movb $-128, %cl
-; BURR-NEXT:    subb %dil, %cl
-; BURR-NEXT:    movq %r9, %r13
-; BURR-NEXT:    shlq %cl, %r13
-; BURR-NEXT:    movl $1, %r12d
-; BURR-NEXT:    shrdq %cl, %r10, %r12
-; BURR-NEXT:    testb $64, %cl
-; BURR-NEXT:    cmovneq %r10, %r12
-; BURR-NEXT:    cmovneq %r10, %r13
-; BURR-NEXT:    orl %edx, %r13d
-; BURR-NEXT:    movl %edi, %ecx
-; BURR-NEXT:    addb $-128, %cl
+; BURR-NEXT:    subb %sil, %cl
+; BURR-NEXT:    xorl %r8d, %r8d
+; BURR-NEXT:    movl $1, %r11d
+; BURR-NEXT:    movl $1, %r9d
 ; BURR-NEXT:    shrdq %cl, %r8, %r9
-; BURR-NEXT:    xorl %edx, %edx
-; BURR-NEXT:    shldq %cl, %rbp, %rdx
-; BURR-NEXT:    shrq %cl, %r8
-; BURR-NEXT:    shlq %cl, %rbp
 ; BURR-NEXT:    testb $64, %cl
-; BURR-NEXT:    cmovneq %rbp, %rdx
-; BURR-NEXT:    cmoveq %r9, %r8
-; BURR-NEXT:    cmovneq %r10, %rbp
-; BURR-NEXT:    testb %dil, %dil
-; BURR-NEXT:    jns .LBB0_2
-; BURR-NEXT:  # %bb.1:
-; BURR-NEXT:    movl %r8d, %r13d
-; BURR-NEXT:  .LBB0_2:
-; BURR-NEXT:    je .LBB0_4
-; BURR-NEXT:  # %bb.3:
-; BURR-NEXT:    movl %r13d, %esi
-; BURR-NEXT:  .LBB0_4:
-; BURR-NEXT:    cmovsq %r10, %r15
-; BURR-NEXT:    cmovnsq %r12, %rbp
-; BURR-NEXT:    cmoveq %r10, %rbp
-; BURR-NEXT:    cmovnsq %r14, %rdx
-; BURR-NEXT:    cmoveq %r10, %rdx
-; BURR-NEXT:    cmovsq %r10, %r11
-; BURR-NEXT:    testb $1, %sil
-; BURR-NEXT:    cmovneq %rax, %rdx
-; BURR-NEXT:    movq %rdx, 24(%rax)
-; BURR-NEXT:    cmovneq %rax, %rbp
-; BURR-NEXT:    movq %rbp, 16(%rax)
-; BURR-NEXT:    cmovneq %rax, %r15
-; BURR-NEXT:    movq %r15, 8(%rax)
-; BURR-NEXT:    cmovneq %rax, %r11
+; BURR-NEXT:    cmovneq %r8, %r9
+; BURR-NEXT:    xorl %r10d, %r10d
+; BURR-NEXT:    movl %esi, %ecx
+; BURR-NEXT:    shldq %cl, %r11, %r10
+; BURR-NEXT:    leal -128(%rsi), %ecx
+; BURR-NEXT:    xorl %edi, %edi
+; BURR-NEXT:    shldq %cl, %r11, %rdi
+; BURR-NEXT:    movl $1, %edx
+; BURR-NEXT:    shlq %cl, %rdx
+; BURR-NEXT:    testb $64, %cl
+; BURR-NEXT:    cmovneq %rdx, %rdi
+; BURR-NEXT:    cmovneq %r8, %rdx
+; BURR-NEXT:    movl %esi, %ecx
+; BURR-NEXT:    shlq %cl, %r11
+; BURR-NEXT:    testb $64, %sil
+; BURR-NEXT:    cmovneq %r11, %r10
+; BURR-NEXT:    cmovneq %r8, %r11
+; BURR-NEXT:    testb %sil, %sil
+; BURR-NEXT:    cmovsq %r8, %r10
+; BURR-NEXT:    movq %r10, 8(%rax)
+; BURR-NEXT:    cmovsq %r8, %r11
 ; BURR-NEXT:    movq %r11, (%rax)
-; BURR-NEXT:    popq %rbx
-; BURR-NEXT:    popq %r12
-; BURR-NEXT:    popq %r13
-; BURR-NEXT:    popq %r14
-; BURR-NEXT:    popq %r15
-; BURR-NEXT:    popq %rbp
+; BURR-NEXT:    cmovnsq %r8, %rdi
+; BURR-NEXT:    cmoveq %r8, %rdi
+; BURR-NEXT:    movq %rdi, 24(%rax)
+; BURR-NEXT:    cmovnsq %r9, %rdx
+; BURR-NEXT:    cmoveq %r8, %rdx
+; BURR-NEXT:    movq %rdx, 16(%rax)
 ; BURR-NEXT:    retq
 ;
 ; SRC-LABEL: test1:
 ; SRC:       # %bb.0:
-; SRC-NEXT:    pushq %rbp
-; SRC-NEXT:    pushq %r15
-; SRC-NEXT:    pushq %r14
-; SRC-NEXT:    pushq %r13
-; SRC-NEXT:    pushq %r12
 ; SRC-NEXT:    pushq %rbx
-; SRC-NEXT:    movq %rcx, %r9
 ; SRC-NEXT:    movq %rdi, %rax
-; SRC-NEXT:    addq $1, %rsi
-; SRC-NEXT:    adcq $0, %rdx
-; SRC-NEXT:    adcq $0, %r9
-; SRC-NEXT:    adcq $0, %r8
-; SRC-NEXT:    leal 1(%rsi,%rsi), %r11d
-; SRC-NEXT:    movb $-128, %r10b
-; SRC-NEXT:    subb %r11b, %r10b
-; SRC-NEXT:    movq %r9, %r12
-; SRC-NEXT:    movl %r10d, %ecx
-; SRC-NEXT:    shlq %cl, %r12
-; SRC-NEXT:    movq %rsi, %rbp
-; SRC-NEXT:    movl %r11d, %ecx
-; SRC-NEXT:    shrdq %cl, %rdx, %rbp
-; SRC-NEXT:    shrq %cl, %rdx
-; SRC-NEXT:    xorl %r15d, %r15d
+; SRC-NEXT:    incl %esi
+; SRC-NEXT:    addb %sil, %sil
+; SRC-NEXT:    orb $1, %sil
+; SRC-NEXT:    movb $-128, %cl
+; SRC-NEXT:    subb %sil, %cl
+; SRC-NEXT:    xorl %r8d, %r8d
 ; SRC-NEXT:    movl $1, %edi
-; SRC-NEXT:    xorl %r14d, %r14d
-; SRC-NEXT:    shldq %cl, %rdi, %r14
-; SRC-NEXT:    xorl %r13d, %r13d
-; SRC-NEXT:    shldq %cl, %r13, %r13
-; SRC-NEXT:    movl $1, %ebx
-; SRC-NEXT:    shlq %cl, %rbx
-; SRC-NEXT:    testb $64, %r11b
-; SRC-NEXT:    cmoveq %rbp, %rdx
-; SRC-NEXT:    cmovneq %rbx, %r14
-; SRC-NEXT:    cmovneq %r15, %rbx
-; SRC-NEXT:    cmovneq %r15, %r13
-; SRC-NEXT:    movl $1, %ebp
-; SRC-NEXT:    movl %r10d, %ecx
-; SRC-NEXT:    shrdq %cl, %r15, %rbp
-; SRC-NEXT:    testb $64, %r10b
-; SRC-NEXT:    cmovneq %r15, %r12
-; SRC-NEXT:    cmovneq %r15, %rbp
-; SRC-NEXT:    orl %edx, %r12d
-; SRC-NEXT:    movl %r11d, %ecx
-; SRC-NEXT:    addb $-128, %cl
-; SRC-NEXT:    shrdq %cl, %r8, %r9
-; SRC-NEXT:    shrq %cl, %r8
+; SRC-NEXT:    movl $1, %r10d
+; SRC-NEXT:    shrdq %cl, %r8, %r10
+; SRC-NEXT:    testb $64, %cl
+; SRC-NEXT:    cmovneq %r8, %r10
+; SRC-NEXT:    leal -128(%rsi), %r9d
 ; SRC-NEXT:    xorl %edx, %edx
+; SRC-NEXT:    movl %r9d, %ecx
 ; SRC-NEXT:    shldq %cl, %rdi, %rdx
+; SRC-NEXT:    xorl %r11d, %r11d
+; SRC-NEXT:    movl %esi, %ecx
+; SRC-NEXT:    shldq %cl, %rdi, %r11
+; SRC-NEXT:    movl $1, %ebx
+; SRC-NEXT:    shlq %cl, %rbx
+; SRC-NEXT:    testb $64, %sil
+; SRC-NEXT:    cmovneq %rbx, %r11
+; SRC-NEXT:    cmovneq %r8, %rbx
+; SRC-NEXT:    movl %r9d, %ecx
 ; SRC-NEXT:    shlq %cl, %rdi
-; SRC-NEXT:    testb $64, %cl
-; SRC-NEXT:    cmoveq %r9, %r8
+; SRC-NEXT:    testb $64, %r9b
 ; SRC-NEXT:    cmovneq %rdi, %rdx
-; SRC-NEXT:    cmovneq %r15, %rdi
-; SRC-NEXT:    testb %r11b, %r11b
-; SRC-NEXT:    jns .LBB0_2
-; SRC-NEXT:  # %bb.1:
-; SRC-NEXT:    movl %r8d, %r12d
-; SRC-NEXT:  .LBB0_2:
-; SRC-NEXT:    je .LBB0_4
-; SRC-NEXT:  # %bb.3:
-; SRC-NEXT:    movl %r12d, %esi
-; SRC-NEXT:  .LBB0_4:
-; SRC-NEXT:    cmovnsq %r13, %rdx
-; SRC-NEXT:    cmoveq %r15, %rdx
-; SRC-NEXT:    cmovnsq %rbp, %rdi
-; SRC-NEXT:    cmoveq %r15, %rdi
-; SRC-NEXT:    cmovsq %r15, %r14
-; SRC-NEXT:    cmovsq %r15, %rbx
-; SRC-NEXT:    testb $1, %sil
-; SRC-NEXT:    cmovneq %rax, %rbx
-; SRC-NEXT:    cmovneq %rax, %r14
-; SRC-NEXT:    cmovneq %rax, %rdi
-; SRC-NEXT:    cmovneq %rax, %rdx
+; SRC-NEXT:    cmovneq %r8, %rdi
+; SRC-NEXT:    testb %sil, %sil
+; SRC-NEXT:    cmovnsq %r10, %rdi
+; SRC-NEXT:    cmoveq %r8, %rdi
+; SRC-NEXT:    cmovnsq %r8, %rdx
+; SRC-NEXT:    cmoveq %r8, %rdx
+; SRC-NEXT:    cmovsq %r8, %r11
+; SRC-NEXT:    cmovsq %r8, %rbx
+; SRC-NEXT:    movq %r11, 8(%rax)
+; SRC-NEXT:    movq %rbx, (%rax)
 ; SRC-NEXT:    movq %rdx, 24(%rax)
 ; SRC-NEXT:    movq %rdi, 16(%rax)
-; SRC-NEXT:    movq %r14, 8(%rax)
-; SRC-NEXT:    movq %rbx, (%rax)
 ; SRC-NEXT:    popq %rbx
-; SRC-NEXT:    popq %r12
-; SRC-NEXT:    popq %r13
-; SRC-NEXT:    popq %r14
-; SRC-NEXT:    popq %r15
-; SRC-NEXT:    popq %rbp
 ; SRC-NEXT:    retq
 ;
 ; LIN-LABEL: test1:
 ; LIN:       # %bb.0:
-; LIN-NEXT:    pushq %rbp
-; LIN-NEXT:    pushq %r15
-; LIN-NEXT:    pushq %r14
-; LIN-NEXT:    pushq %r12
-; LIN-NEXT:    pushq %rbx
-; LIN-NEXT:    movq %rcx, %r9
 ; LIN-NEXT:    movq %rdi, %rax
-; LIN-NEXT:    xorl %r15d, %r15d
-; LIN-NEXT:    movl $1, %r14d
-; LIN-NEXT:    addq $1, %rsi
-; LIN-NEXT:    leal 1(%rsi,%rsi), %ebp
-; LIN-NEXT:    movl $1, %r12d
-; LIN-NEXT:    movl %ebp, %ecx
-; LIN-NEXT:    shlq %cl, %r12
-; LIN-NEXT:    testb $64, %bpl
-; LIN-NEXT:    movq %r12, %rbx
-; LIN-NEXT:    cmovneq %r15, %rbx
-; LIN-NEXT:    testb %bpl, %bpl
-; LIN-NEXT:    cmovsq %r15, %rbx
-; LIN-NEXT:    adcq $0, %rdx
-; LIN-NEXT:    adcq $0, %r9
-; LIN-NEXT:    adcq $0, %r8
-; LIN-NEXT:    movl %ebp, %r10d
-; LIN-NEXT:    addb $-128, %r10b
-; LIN-NEXT:    movq %r9, %rdi
-; LIN-NEXT:    movl %r10d, %ecx
-; LIN-NEXT:    shrdq %cl, %r8, %rdi
-; LIN-NEXT:    shrq %cl, %r8
-; LIN-NEXT:    testb $64, %r10b
-; LIN-NEXT:    cmoveq %rdi, %r8
-; LIN-NEXT:    movq %rsi, %rdi
-; LIN-NEXT:    movl %ebp, %ecx
-; LIN-NEXT:    shrdq %cl, %rdx, %rdi
-; LIN-NEXT:    shrq %cl, %rdx
-; LIN-NEXT:    cmoveq %rdi, %rdx
-; LIN-NEXT:    movb $-128, %r11b
-; LIN-NEXT:    subb %bpl, %r11b
-; LIN-NEXT:    movl %r11d, %ecx
-; LIN-NEXT:    shlq %cl, %r9
-; LIN-NEXT:    testb $64, %r11b
-; LIN-NEXT:    cmovneq %r15, %r9
-; LIN-NEXT:    orl %edx, %r9d
-; LIN-NEXT:    jns .LBB0_2
-; LIN-NEXT:  # %bb.1:
-; LIN-NEXT:    movl %r8d, %r9d
-; LIN-NEXT:  .LBB0_2:
-; LIN-NEXT:    je .LBB0_4
-; LIN-NEXT:  # %bb.3:
-; LIN-NEXT:    movl %r9d, %esi
-; LIN-NEXT:  .LBB0_4:
-; LIN-NEXT:    testb $1, %sil
-; LIN-NEXT:    cmovneq %rax, %rbx
-; LIN-NEXT:    movq %rbx, (%rax)
-; LIN-NEXT:    xorl %edx, %edx
-; LIN-NEXT:    movl %ebp, %ecx
-; LIN-NEXT:    shldq %cl, %r14, %rdx
-; LIN-NEXT:    cmovneq %r12, %rdx
-; LIN-NEXT:    cmovsq %r15, %rdx
-; LIN-NEXT:    cmovneq %rax, %rdx
-; LIN-NEXT:    movq %rdx, 8(%rax)
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    movl $1, %r8d
+; LIN-NEXT:    incl %esi
+; LIN-NEXT:    addb %sil, %sil
+; LIN-NEXT:    orb $1, %sil
+; LIN-NEXT:    movl $1, %edx
+; LIN-NEXT:    movl %esi, %ecx
+; LIN-NEXT:    shlq %cl, %rdx
+; LIN-NEXT:    testb $64, %sil
+; LIN-NEXT:    movq %rdx, %rcx
+; LIN-NEXT:    cmovneq %r9, %rcx
+; LIN-NEXT:    testb %sil, %sil
+; LIN-NEXT:    cmovsq %r9, %rcx
+; LIN-NEXT:    movq %rcx, (%rdi)
+; LIN-NEXT:    xorl %edi, %edi
+; LIN-NEXT:    movl %esi, %ecx
+; LIN-NEXT:    shldq %cl, %r8, %rdi
+; LIN-NEXT:    cmovneq %rdx, %rdi
+; LIN-NEXT:    cmovsq %r9, %rdi
+; LIN-NEXT:    movq %rdi, 8(%rax)
+; LIN-NEXT:    leal -128(%rsi), %r10d
 ; LIN-NEXT:    movl $1, %edx
 ; LIN-NEXT:    movl %r10d, %ecx
 ; LIN-NEXT:    shlq %cl, %rdx
-; LIN-NEXT:    movq %rdx, %rsi
-; LIN-NEXT:    cmovneq %r15, %rsi
-; LIN-NEXT:    movl $1, %edi
-; LIN-NEXT:    movl %r11d, %ecx
-; LIN-NEXT:    shrdq %cl, %r15, %rdi
-; LIN-NEXT:    cmovneq %r15, %rdi
-; LIN-NEXT:    cmovsq %rsi, %rdi
-; LIN-NEXT:    cmoveq %r15, %rdi
-; LIN-NEXT:    cmovneq %rax, %rdi
-; LIN-NEXT:    movq %rdi, 16(%rax)
+; LIN-NEXT:    testb $64, %r10b
+; LIN-NEXT:    movq %rdx, %rdi
+; LIN-NEXT:    cmovneq %r9, %rdi
+; LIN-NEXT:    movb $-128, %cl
+; LIN-NEXT:    subb %sil, %cl
+; LIN-NEXT:    movl $1, %esi
+; LIN-NEXT:    shrdq %cl, %r9, %rsi
+; LIN-NEXT:    testb $64, %cl
+; LIN-NEXT:    cmovneq %r9, %rsi
+; LIN-NEXT:    cmovsq %rdi, %rsi
+; LIN-NEXT:    cmoveq %r9, %rsi
+; LIN-NEXT:    movq %rsi, 16(%rax)
 ; LIN-NEXT:    xorl %esi, %esi
 ; LIN-NEXT:    movl %r10d, %ecx
-; LIN-NEXT:    shldq %cl, %r14, %rsi
+; LIN-NEXT:    shldq %cl, %r8, %rsi
 ; LIN-NEXT:    cmovneq %rdx, %rsi
-; LIN-NEXT:    xorl %edx, %edx
-; LIN-NEXT:    movl %ebp, %ecx
-; LIN-NEXT:    shldq %cl, %rdx, %rdx
-; LIN-NEXT:    cmovneq %r15, %rdx
-; LIN-NEXT:    cmovsq %rsi, %rdx
-; LIN-NEXT:    cmoveq %r15, %rdx
-; LIN-NEXT:    cmovneq %rax, %rdx
-; LIN-NEXT:    movq %rdx, 24(%rax)
-; LIN-NEXT:    popq %rbx
-; LIN-NEXT:    popq %r12
-; LIN-NEXT:    popq %r14
-; LIN-NEXT:    popq %r15
-; LIN-NEXT:    popq %rbp
+; LIN-NEXT:    cmovnsq %r9, %rsi
+; LIN-NEXT:    cmoveq %r9, %rsi
+; LIN-NEXT:    movq %rsi, 24(%rax)
 ; LIN-NEXT:    retq
   %b = add i256 %a, 1
   %m = shl i256 %b, 1
@@ -917,8 +697,8 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; ILP-NEXT:    cmpq %rdi, %rsi
 ; ILP-NEXT:    sbbq $0, %rdx
 ; ILP-NEXT:    movl $0, %edx
-; ILP-NEXT:    sbbq $0, %rdx
-; ILP-NEXT:    sbbq $0, %rcx
+; ILP-NEXT:    sbbq %rdx, %rdx
+; ILP-NEXT:    sbbq %rcx, %rcx
 ; ILP-NEXT:    setae %cl
 ; ILP-NEXT:    movzbl %cl, %ecx
 ; ILP-NEXT:    subq %rcx, %rax
@@ -933,8 +713,8 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; HYBRID-NEXT:    cmpq %rdi, %rsi
 ; HYBRID-NEXT:    sbbq $0, %rcx
 ; HYBRID-NEXT:    movl $0, %ecx
-; HYBRID-NEXT:    sbbq $0, %rcx
-; HYBRID-NEXT:    sbbq $0, %rax
+; HYBRID-NEXT:    sbbq %rcx, %rcx
+; HYBRID-NEXT:    sbbq %rax, %rax
 ; HYBRID-NEXT:    setae %al
 ; HYBRID-NEXT:    movzbl %al, %ecx
 ; HYBRID-NEXT:    movl $2, %eax
@@ -950,8 +730,8 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; BURR-NEXT:    cmpq %rdi, %rsi
 ; BURR-NEXT:    sbbq $0, %rcx
 ; BURR-NEXT:    movl $0, %ecx
-; BURR-NEXT:    sbbq $0, %rcx
-; BURR-NEXT:    sbbq $0, %rax
+; BURR-NEXT:    sbbq %rcx, %rcx
+; BURR-NEXT:    sbbq %rax, %rax
 ; BURR-NEXT:    setae %al
 ; BURR-NEXT:    movzbl %al, %ecx
 ; BURR-NEXT:    movl $2, %eax
@@ -967,8 +747,8 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; SRC-NEXT:    cmpq %rdi, %rsi
 ; SRC-NEXT:    sbbq $0, %rax
 ; SRC-NEXT:    movl $0, %eax
-; SRC-NEXT:    sbbq $0, %rax
-; SRC-NEXT:    sbbq $0, %rcx
+; SRC-NEXT:    sbbq %rax, %rax
+; SRC-NEXT:    sbbq %rcx, %rcx
 ; SRC-NEXT:    setae %al
 ; SRC-NEXT:    movzbl %al, %ecx
 ; SRC-NEXT:    movl $2, %eax
@@ -985,8 +765,8 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; LIN-NEXT:    cmpq %rdi, %rsi
 ; LIN-NEXT:    sbbq $0, %rdx
 ; LIN-NEXT:    movl $0, %edx
-; LIN-NEXT:    sbbq $0, %rdx
-; LIN-NEXT:    sbbq $0, %rcx
+; LIN-NEXT:    sbbq %rdx, %rdx
+; LIN-NEXT:    sbbq %rcx, %rcx
 ; LIN-NEXT:    setae %cl
 ; LIN-NEXT:    movzbl %cl, %ecx
 ; LIN-NEXT:    subq %rcx, %rax
diff --git a/test/CodeGen/X86/segmented-stacks-standalone.ll b/test/CodeGen/X86/segmented-stacks-standalone.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a7205e6b33e3c7aabccb5b0483be1c7632f8c1a3
--- /dev/null
+++ b/test/CodeGen/X86/segmented-stacks-standalone.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux  -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+
+; This test is standalone because segmented-stacks.ll generates
+; object-files with both .note.GNU-split-stack (for the split-stack
+; functions) and .note.GNU-no-split-stack sections (for the
+; non-split-stack functions). But a split-stack function without a
+; stack frame should have a .note.GNU-split-stack section regardless
+; of any other contents of the compilation unit.
+
+define void @test_nostack() #0 {
+	ret void
+}
+
+attributes #0 = { "split-stack" }
+
+; X32-Linux: .section ".note.GNU-split-stack","",@progbits
+; X32-Linux: .section ".note.GNU-no-split-stack","",@progbits
+
+; X64-Linux: .section ".note.GNU-split-stack","",@progbits
+; X64-Linux: .section ".note.GNU-no-split-stack","",@progbits
+
+; X64-FreeBSD: .section ".note.GNU-split-stack","",@progbits
+; X64-FreeBSD: .section ".note.GNU-no-split-stack","",@progbits
+
+; X32-DFlyBSD: .section ".note.GNU-split-stack","",@progbits
+; X32-DFlyBSD: .section ".note.GNU-no-split-stack","",@progbits
+
+; X64-DFlyBSD: .section ".note.GNU-split-stack","",@progbits
+; X64-DFlyBSD: .section ".note.GNU-no-split-stack","",@progbits
diff --git a/test/CodeGen/X86/select-of-fp-constants.ll b/test/CodeGen/X86/select-of-fp-constants.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9ab12bc89b1a907e932a668531fb11c1481c8d7b
--- /dev/null
+++ b/test/CodeGen/X86/select-of-fp-constants.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386--   -mattr=sse2     | FileCheck %s --check-prefixes=X32,X32_SSE,X32_SSE2
+; RUN: llc < %s -mtriple=i386--   -mattr=sse4.1   | FileCheck %s --check-prefixes=X32,X32_SSE,X32_SSE4
+; RUN: llc < %s -mtriple=i386--   -mattr=avx2     | FileCheck %s --check-prefixes=X32,X32_AVX,X32_AVX2
+; RUN: llc < %s -mtriple=i386--   -mattr=avx512f  | FileCheck %s --check-prefixes=X32,X32_AVX,X32_AVX512F
+; RUN: llc < %s -mtriple=x86_64-- -mattr=sse2     | FileCheck %s --check-prefixes=X64,X64_SSE,X64_SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=sse4.1   | FileCheck %s --check-prefixes=X64,X64_SSE,X64_SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2     | FileCheck %s --check-prefixes=X64,X64_AVX,X64_AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f  | FileCheck %s --check-prefixes=X64,X64_AVX,X64_AVX512F
+
+; This should do a single load into the fp stack for the return, not diddle with xmm registers.
+
+define float @icmp_select_fp_constants(i32 %x) nounwind readnone {
+; X32-LABEL: icmp_select_fp_constants:
+; X32:       # %bb.0:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    sete %al
+; X32-NEXT:    flds {{\.LCPI.*}}(,%eax,4)
+; X32-NEXT:    retl
+;
+; X64_SSE-LABEL: icmp_select_fp_constants:
+; X64_SSE:       # %bb.0:
+; X64_SSE-NEXT:    xorl %eax, %eax
+; X64_SSE-NEXT:    testl %edi, %edi
+; X64_SSE-NEXT:    sete %al
+; X64_SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64_SSE-NEXT:    retq
+;
+; X64_AVX-LABEL: icmp_select_fp_constants:
+; X64_AVX:       # %bb.0:
+; X64_AVX-NEXT:    xorl %eax, %eax
+; X64_AVX-NEXT:    testl %edi, %edi
+; X64_AVX-NEXT:    sete %al
+; X64_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64_AVX-NEXT:    retq
+	%c = icmp eq i32 %x, 0
+	%r = select i1 %c, float 42.0, float 23.0
+	ret float %r
+}
+
+define float @fcmp_select_fp_constants(float %x) nounwind readnone {
+; X32_SSE-LABEL: fcmp_select_fp_constants:
+; X32_SSE:       # %bb.0:
+; X32_SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32_SSE-NEXT:    cmpneqss {{[0-9]+}}(%esp), %xmm0
+; X32_SSE-NEXT:    movd %xmm0, %eax
+; X32_SSE-NEXT:    andl $1, %eax
+; X32_SSE-NEXT:    flds {{\.LCPI.*}}(,%eax,4)
+; X32_SSE-NEXT:    retl
+;
+; X32_AVX2-LABEL: fcmp_select_fp_constants:
+; X32_AVX2:       # %bb.0:
+; X32_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32_AVX2-NEXT:    vcmpneqss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32_AVX2-NEXT:    vmovd %xmm0, %eax
+; X32_AVX2-NEXT:    andl $1, %eax
+; X32_AVX2-NEXT:    flds {{\.LCPI.*}}(,%eax,4)
+; X32_AVX2-NEXT:    retl
+;
+; X32_AVX512F-LABEL: fcmp_select_fp_constants:
+; X32_AVX512F:       # %bb.0:
+; X32_AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32_AVX512F-NEXT:    vcmpneqss {{[0-9]+}}(%esp), %xmm0, %k0
+; X32_AVX512F-NEXT:    kmovw %k0, %eax
+; X32_AVX512F-NEXT:    flds {{\.LCPI.*}}(,%eax,4)
+; X32_AVX512F-NEXT:    retl
+;
+; X64_SSE-LABEL: fcmp_select_fp_constants:
+; X64_SSE:       # %bb.0:
+; X64_SSE-NEXT:    cmpneqss {{.*}}(%rip), %xmm0
+; X64_SSE-NEXT:    movd %xmm0, %eax
+; X64_SSE-NEXT:    andl $1, %eax
+; X64_SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64_SSE-NEXT:    retq
+;
+; X64_AVX2-LABEL: fcmp_select_fp_constants:
+; X64_AVX2:       # %bb.0:
+; X64_AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64_AVX2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64_AVX2-NEXT:    vcmpneqss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; X64_AVX2-NEXT:    retq
+;
+; X64_AVX512F-LABEL: fcmp_select_fp_constants:
+; X64_AVX512F:       # %bb.0:
+; X64_AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64_AVX512F-NEXT:    vcmpneqss {{.*}}(%rip), %xmm0, %k1
+; X64_AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64_AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X64_AVX512F-NEXT:    retq
+ %c = fcmp une float %x, -4.0
+ %r = select i1 %c, float 42.0, float 23.0
+ ret float %r
+}
+
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index f35da3b8b2c507b727eafd99bacd9f964b296a93..947c95137206de701d5c01474b269cde991adb1a 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -624,21 +624,13 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
 ;; Test integer select between values and constants.
 
 define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
-; GENERIC-LABEL: test9:
-; GENERIC:       ## %bb.0:
-; GENERIC-NEXT:    cmpq $1, %rdi
-; GENERIC-NEXT:    sbbq %rax, %rax
-; GENERIC-NEXT:    orq %rsi, %rax
-; GENERIC-NEXT:    retq
-;
-; ATOM-LABEL: test9:
-; ATOM:       ## %bb.0:
-; ATOM-NEXT:    cmpq $1, %rdi
-; ATOM-NEXT:    sbbq %rax, %rax
-; ATOM-NEXT:    orq %rsi, %rax
-; ATOM-NEXT:    nop
-; ATOM-NEXT:    nop
-; ATOM-NEXT:    retq
+; CHECK-LABEL: test9:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpq $1, %rdi
+; CHECK-NEXT:    sbbq %rax, %rax
+; CHECK-NEXT:    orq %rsi, %rax
+; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: test9:
 ; ATHLON:       ## %bb.0:
@@ -672,21 +664,13 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 
 ;; Same as test9
 define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
-; GENERIC-LABEL: test9a:
-; GENERIC:       ## %bb.0:
-; GENERIC-NEXT:    cmpq $1, %rdi
-; GENERIC-NEXT:    sbbq %rax, %rax
-; GENERIC-NEXT:    orq %rsi, %rax
-; GENERIC-NEXT:    retq
-;
-; ATOM-LABEL: test9a:
-; ATOM:       ## %bb.0:
-; ATOM-NEXT:    cmpq $1, %rdi
-; ATOM-NEXT:    sbbq %rax, %rax
-; ATOM-NEXT:    orq %rsi, %rax
-; ATOM-NEXT:    nop
-; ATOM-NEXT:    nop
-; ATOM-NEXT:    retq
+; CHECK-LABEL: test9a:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpq $1, %rdi
+; CHECK-NEXT:    sbbq %rax, %rax
+; CHECK-NEXT:    orq %rsi, %rax
+; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: test9a:
 ; ATHLON:       ## %bb.0:
@@ -803,6 +787,7 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK-LABEL: test11:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpq $1, %rdi
 ; CHECK-NEXT:    sbbq %rax, %rax
 ; CHECK-NEXT:    notq %rax
@@ -842,6 +827,7 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK-LABEL: test11a:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpq $1, %rdi
 ; CHECK-NEXT:    sbbq %rax, %rax
 ; CHECK-NEXT:    notq %rax
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
index 68c83391e60d26466eacdd48abc6493679c91a38..0f252c8261f465f4348bf5b736d628a149c5d5a6 100644
--- a/test/CodeGen/X86/select_const.ll
+++ b/test/CodeGen/X86/select_const.ll
@@ -503,3 +503,12 @@ define i64 @opaque_constant(i1 %cond, i64 %x) {
   ret i64 %add
 }
 
+define float @select_undef_fp(float %x) {
+; CHECK-LABEL: select_undef_fp:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    retq
+  %f = select i1 undef, float 4.0, float %x
+  ret float %f
+}
+
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 100461d22c995ffccb3229071ae7a8c888b1a9b8..9bcb4fd1e63ea4289b30df76c470bb0d3c16e8be 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -66,7 +66,7 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
 ; KNL-32-NEXT:    pushl %esi
 ; KNL-32-NEXT:    .cfi_def_cfa_offset 8
 ; KNL-32-NEXT:    .cfi_offset %esi, -8
-; KNL-32-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL-32-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-32-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll
index d8cc50cb01d36682bbee0ea1cc436de792d22375..539649b7cd4765349a23338096ffd5378a511f29 100644
--- a/test/CodeGen/X86/shift-folding.ll
+++ b/test/CodeGen/X86/shift-folding.ll
@@ -71,3 +71,15 @@ define i64 @test5(i16 %i, i32* %arr) {
   %sum = add i64 %val.zext, %index.zext
   ret i64 %sum
 }
+
+; We should not crash because an undef shift was created.
+
+define i32 @overshift(i32 %a) {
+; CHECK-LABEL: overshift:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retl
+  %shr = lshr i32 %a, 33
+  %xor = xor i32 1, %shr
+  ret i32 %xor
+}
+
diff --git a/test/CodeGen/X86/shl-crash-on-legalize.ll b/test/CodeGen/X86/shl-crash-on-legalize.ll
index 22735f07b0a1ba88d6fdedf779d8e07ec292bd80..d9de001c5a86f018f643497a8c320b93f49f6c8c 100644
--- a/test/CodeGen/X86/shl-crash-on-legalize.ll
+++ b/test/CodeGen/X86/shl-crash-on-legalize.ll
@@ -8,26 +8,34 @@ target triple = "x86_64-unknown-linux-gnu"
 
 @structMember = external local_unnamed_addr global i64, align 8
 
-; Function Attrs: norecurse nounwind uwtable
-define i32 @_Z3foov() local_unnamed_addr #0 {
-; CHECK-LABEL: _Z3foov:
+define i32 @PR29058(i8 %x, i32 %y) {
+; CHECK-LABEL: PR29058:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    testb %dil, %dil
+; CHECK-NEXT:    movl $2147483646, %eax # imm = 0x7FFFFFFE
+; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    cmpb $1, %dil
+; CHECK-NEXT:    sbbb %dl, %dl
+; CHECK-NEXT:    orb %dl, %cl
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    retq
 entry:
-  %bool_1 = icmp ne i8 undef, 0
-  %bool_2 = icmp eq i8 undef, 0
-  %0 = select i1 %bool_2, i32 2147483646, i32 undef
-  %or_1 = select i1 %bool_1, i32 undef, i32 -1
+  %bool_1 = icmp ne i8 %x, 0
+  %bool_2 = icmp eq i8 %x, 0
+  %0 = select i1 %bool_2, i32 2147483646, i32 %y
+  %or_1 = select i1 %bool_1, i32 %y, i32 -1
   %shl_1 = shl i32 %0, %or_1
   %conv = zext i32 %shl_1 to i64
   store i64 %conv, i64* @structMember, align 8
-  %tmp = select i1 %bool_2, i32 2147483646, i32 undef
-  %lnot = icmp eq i8 undef, 0
-  %or_2 = select i1 %lnot, i32 -1, i32 undef
+  %tmp = select i1 %bool_2, i32 2147483646, i32 %y
+  %lnot = icmp eq i8 %x, 0
+  %or_2 = select i1 %lnot, i32 -1, i32 %y
   %shl_2 = shl i32 %tmp, %or_2
   ret i32 %shl_2
 }
 
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
diff --git a/test/CodeGen/X86/shrink_vmul-widen.ll b/test/CodeGen/X86/shrink_vmul-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7a24743d5620b0602158d1f1c89d5a7f69f91ca5
--- /dev/null
+++ b/test/CodeGen/X86/shrink_vmul-widen.ll
@@ -0,0 +1,2575 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+
+@c = external global i32*, align 8
+
+; %val1 = load <2 x i8>
+; %op1 = zext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_2xi8:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
+; X86-SSE-NEXT:    movd %eax, %xmm1
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %esi
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    movl c, %esi
+; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
+; X86-AVX-NEXT:    vmovd %edx, %xmm0
+; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
+; X86-AVX-NEXT:    vmovd %eax, %xmm1
+; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm1
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm1
+; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <4 x i8>
+; %op1 = zext<4 x i32> %val1
+; %val2 = load <4 x i8>
+; %op2 = zext<4 x i32> %val2
+; %rst = mul <4 x i32> %op1, %op2
+;
+define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_4xi8:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_4xi8:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %esi
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    movl c, %esi
+; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_4xi8:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_4xi8:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
+  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
+  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
+  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
+  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
+  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <8 x i8>
+; %op1 = zext<8 x i32> %val1
+; %val2 = load <8 x i8>
+; %op2 = zext<8 x i32> %val2
+; %rst = mul <8 x i32> %op1, %op2
+;
+define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_8xi8:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: mul_8xi8:
+; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    pushl %esi
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    movl c, %esi
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
+; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: mul_8xi8:
+; X86-AVX2:       # %bb.0: # %entry
+; X86-AVX2-NEXT:    pushl %esi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT:    movl c, %esi
+; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
+; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_8xi8:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: mul_8xi8:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovups %ymm0, (%rax,%rdx,4)
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: mul_8xi8:
+; X64-AVX2:       # %bb.0: # %entry
+; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
+  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
+  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
+  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
+  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
+  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <16 x i8>
+; %op1 = zext<16 x i32> %val1
+; %val2 = load <16 x i8>
+; %op2 = zext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_16xi8:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X86-SSE-NEXT:    pmullw %xmm3, %xmm4
+; X86-SSE-NEXT:    movdqa %xmm4, %xmm3
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm0, 32(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm4, 16(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm3, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: mul_16xi8:
+; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    pushl %esi
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    movl c, %esi
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
+; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
+; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: mul_16xi8:
+; X86-AVX2:       # %bb.0: # %entry
+; X86-AVX2-NEXT:    pushl %esi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT:    movl c, %esi
+; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
+; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
+; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_16xi8:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X64-SSE-NEXT:    pmullw %xmm3, %xmm4
+; X64-SSE-NEXT:    movdqa %xmm4, %xmm3
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: mul_16xi8:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
+; X64-AVX1-NEXT:    vmovups %ymm2, (%rax,%rdx,4)
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: mul_16xi8:
+; X64-AVX2:       # %bb.0: # %entry
+; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
+; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
+  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
+  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
+  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
+  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
+  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = zext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_2xi16:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi16:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %esi
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    movl c, %esi
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi16:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi16:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <4 x i16>
+; %op1 = zext<4 x i32> %val1
+; %val2 = load <4 x i16>
+; %op2 = zext<4 x i32> %val2
+; %rst = mul <4 x i32> %op1, %op2
+;
+define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_4xi16:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_4xi16:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %esi
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    movl c, %esi
+; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_4xi16:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_4xi16:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
+  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
+  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
+  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
+  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <8 x i16>
+; %op1 = zext<8 x i32> %val1
+; %val2 = load <8 x i16>
+; %op2 = zext<8 x i32> %val2
+; %rst = mul <8 x i32> %op1, %op2
+;
+define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_8xi16:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: mul_8xi16:
+; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    pushl %esi
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    movl c, %esi
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
+; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: mul_8xi16:
+; X86-AVX2:       # %bb.0: # %entry
+; X86-AVX2-NEXT:    pushl %esi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT:    movl c, %esi
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
+; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_8xi16:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: mul_8xi16:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovups %ymm0, (%rax,%rdx,4)
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: mul_8xi16:
+; X64-AVX2:       # %bb.0: # %entry
+; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
+  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
+  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
+  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
+  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <16 x i16>
+; %op1 = zext<16 x i32> %val1
+; %val2 = load <16 x i16>
+; %op2 = zext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_16xi16:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
+; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
+; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm4
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm4
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-SSE-NEXT:    movdqu %xmm3, 48(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm1, 32(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: mul_16xi16:
+; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    pushl %esi
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    movl c, %esi
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
+; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
+; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: mul_16xi16:
+; X86-AVX2:       # %bb.0: # %entry
+; X86-AVX2-NEXT:    pushl %esi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT:    movl c, %esi
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
+; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
+; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_16xi16:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
+; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
+; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
+; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm4
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm4
+; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
+; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X64-SSE-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: mul_16xi16:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
+; X64-AVX1-NEXT:    vmovups %ymm2, (%rax,%rdx,4)
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: mul_16xi16:
+; X64-AVX2:       # %bb.0: # %entry
+; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
+; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
+  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
+  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
+  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
+  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
+  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i8>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = sext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_2xi8_sext:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
+; X86-SSE-NEXT:    movd %eax, %xmm1
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT:    psraw $8, %xmm0
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT:    psraw $8, %xmm1
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    psrad $16, %xmm0
+; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8_sext:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %esi
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    movl c, %esi
+; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
+; X86-AVX-NEXT:    vmovd %edx, %xmm0
+; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
+; X86-AVX-NEXT:    vmovd %eax, %xmm1
+; X86-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
+; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8_sext:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm1
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT:    psraw $8, %xmm0
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT:    psraw $8, %xmm1
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT:    psrad $16, %xmm0
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8_sext:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm1
+; X64-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
+; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i8>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_2xi8_sext_zext:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
+; X86-SSE-NEXT:    movd %eax, %xmm1
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT:    psraw $8, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE-NEXT:    pmulhw %xmm0, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8_sext_zext:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %esi
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    movl c, %esi
+; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
+; X86-AVX-NEXT:    vmovd %edx, %xmm0
+; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
+; X86-AVX-NEXT:    vmovd %eax, %xmm1
+; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8_sext_zext:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm1
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT:    psraw $8, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE-NEXT:    pmulhw %xmm0, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8_sext_zext:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm1
+; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = sext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_2xi16_sext:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE-NEXT:    pmulhw %xmm0, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi16_sext:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %esi
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    movl c, %esi
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
+; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi16_sext:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE-NEXT:    pmulhw %xmm0, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi16_sext:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
+; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_2xi16_sext_zext:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X86-SSE-NEXT:    psrad $16, %xmm0
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi16_sext_zext:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %esi
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    movl c, %esi
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi16_sext_zext:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X64-SSE-NEXT:    psrad $16, %xmm0
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-SSE-NEXT:    pmuludq %xmm2, %xmm0
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi16_sext_zext:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <16 x i16>
+; %op1 = sext<16 x i32> %val1
+; %val2 = load <16 x i16>
+; %op2 = sext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; X86-SSE-LABEL: mul_16xi16_sext:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
+; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
+; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE-NEXT:    pmulhw %xmm0, %xmm4
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE-NEXT:    pmulhw %xmm1, %xmm4
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-SSE-NEXT:    movdqu %xmm3, 48(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm1, 32(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: mul_16xi16_sext:
+; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    pushl %esi
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    movl c, %esi
+; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%ecx), %xmm0
+; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%ecx), %xmm1
+; X86-AVX1-NEXT:    vpmovsxwd (%edx,%ecx), %xmm2
+; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%ecx), %xmm3
+; X86-AVX1-NEXT:    vpmovsxwd 16(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT:    vpmovsxwd 24(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT:    vpmovsxwd (%eax,%ecx), %xmm4
+; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpmovsxwd 8(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
+; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
+; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: mul_16xi16_sext:
+; X86-AVX2:       # %bb.0: # %entry
+; X86-AVX2-NEXT:    pushl %esi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT:    movl c, %esi
+; X86-AVX2-NEXT:    vpmovsxwd 16(%edx,%ecx), %ymm0
+; X86-AVX2-NEXT:    vpmovsxwd (%edx,%ecx), %ymm1
+; X86-AVX2-NEXT:    vpmovsxwd 16(%eax,%ecx), %ymm2
+; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vpmovsxwd (%eax,%ecx), %ymm2
+; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
+; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
+; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_16xi16_sext:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
+; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
+; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
+; X64-SSE-NEXT:    pmulhw %xmm0, %xmm4
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE-NEXT:    pmulhw %xmm1, %xmm4
+; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
+; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X64-SSE-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
+; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: mul_16xi16_sext:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT:    vpmovsxwd 16(%rdi,%rdx), %xmm0
+; X64-AVX1-NEXT:    vpmovsxwd 24(%rdi,%rdx), %xmm1
+; X64-AVX1-NEXT:    vpmovsxwd (%rdi,%rdx), %xmm2
+; X64-AVX1-NEXT:    vpmovsxwd 8(%rdi,%rdx), %xmm3
+; X64-AVX1-NEXT:    vpmovsxwd 16(%rsi,%rdx), %xmm4
+; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT:    vpmovsxwd 24(%rsi,%rdx), %xmm4
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT:    vpmovsxwd (%rsi,%rdx), %xmm4
+; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpmovsxwd 8(%rsi,%rdx), %xmm4
+; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
+; X64-AVX1-NEXT:    vmovups %ymm2, (%rax,%rdx,4)
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: mul_16xi16_sext:
+; X64-AVX2:       # %bb.0: # %entry
+; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT:    vpmovsxwd 16(%rdi,%rdx), %ymm0
+; X64-AVX2-NEXT:    vpmovsxwd (%rdi,%rdx), %ymm1
+; X64-AVX2-NEXT:    vpmovsxwd 16(%rsi,%rdx), %ymm2
+; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vpmovsxwd (%rsi,%rdx), %ymm2
+; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
+; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
+  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
+  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
+  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
+  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
+  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi8_varconst1:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm0
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst1:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
+; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst1:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst1:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi8_varconst2:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm0
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT:    psraw $8, %xmm0
+; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE-NEXT:    psrad $16, %xmm0
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst2:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
+; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst2:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT:    psraw $8, %xmm0
+; X64-SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE-NEXT:    psrad $16, %xmm0
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst2:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi8_varconst3:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm0
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst3:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
+; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst3:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst3:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi8_varconst4:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm0
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst4:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
+; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst4:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst4:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi8_varconst5:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm0
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT:    psraw $8, %xmm0
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst5:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
+; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst5:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT:    psraw $8, %xmm0
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst5:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi8_varconst6:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm0
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT:    psraw $8, %xmm0
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi8_varconst6:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
+; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi8_varconst6:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-SSE-NEXT:    movd %ecx, %xmm0
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT:    psraw $8, %xmm0
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi8_varconst6:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT:    vmovd %ecx, %xmm0
+; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi16_varconst1:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi16_varconst1:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi16_varconst1:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi16_varconst1:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi16_varconst2:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi16_varconst2:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi16_varconst2:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi16_varconst2:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi16_varconst3:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65536,u,u>
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi16_varconst3:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi16_varconst3:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65536,u,u>
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi16_varconst3:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
+; X86-SSE-LABEL: mul_2xi16_varconst4:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl c, %edx
+; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X86-SSE-NEXT:    psrad $16, %xmm0
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,32768,u,u>
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: mul_2xi16_varconst4:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl c, %edx
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: mul_2xi16_varconst4:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X64-SSE-NEXT:    psrad $16, %xmm0
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,32768,u,u>
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_2xi16_varconst4:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
+; X64-AVX-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+;
+; Illegal Types
+;
+
+define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
+; X86-SSE-LABEL: PR34947:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movdqa (%eax), %xmm5
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movdqa (%ecx), %xmm3
+; X86-SSE-NEXT:    movdqa 16(%ecx), %xmm6
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
+; X86-SSE-NEXT:    movd %xmm0, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
+; X86-SSE-NEXT:    movd %xmm0, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
+; X86-SSE-NEXT:    movd %xmm2, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]
+; X86-SSE-NEXT:    movd %xmm2, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm7
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; X86-SSE-NEXT:    movd %xmm5, %eax
+; X86-SSE-NEXT:    movd %xmm6, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm2
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; X86-SSE-NEXT:    movd %xmm5, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; X86-SSE-NEXT:    movd %xmm5, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm5
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; X86-SSE-NEXT:    movd %xmm6, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3]
+; X86-SSE-NEXT:    movd %xmm6, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm6
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
+; X86-SSE-NEXT:    movd %xmm7, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
+; X86-SSE-NEXT:    movd %xmm7, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm7
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; X86-SSE-NEXT:    movd %xmm4, %eax
+; X86-SSE-NEXT:    movd %xmm3, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; X86-SSE-NEXT:    movd %xmm4, %eax
+; X86-SSE-NEXT:    movd %edx, %xmm4
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; X86-SSE-NEXT:    movd %xmm3, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm3
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm6[0,0]
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm4
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm3
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0]
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl 32(%ecx)
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm5
+; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT:    movl $8199, %eax # imm = 0x2007
+; X86-SSE-NEXT:    movd %eax, %xmm2
+; X86-SSE-NEXT:    pmuludq %xmm0, %xmm2
+; X86-SSE-NEXT:    movd %xmm2, (%eax)
+; X86-SSE-NEXT:    movdqa %xmm1, (%eax)
+; X86-SSE-NEXT:    movdqa %xmm4, (%eax)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: PR34947:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    pushl %ebp
+; X86-AVX1-NEXT:    pushl %ebx
+; X86-AVX1-NEXT:    pushl %edi
+; X86-AVX1-NEXT:    pushl %esi
+; X86-AVX1-NEXT:    subl $16, %esp
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vmovd %xmm1, %eax
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl 32(%ecx)
+; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
+; X86-AVX1-NEXT:    vmovdqa (%ecx), %xmm3
+; X86-AVX1-NEXT:    vmovdqa 16(%ecx), %xmm1
+; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-AVX1-NEXT:    vmovd %xmm2, %eax
+; X86-AVX1-NEXT:    vmovd %xmm3, %ecx
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    movl %edx, %ebp
+; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    movl %edx, %ebx
+; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %esi
+; X86-AVX1-NEXT:    movl %edx, %esi
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %edi
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %edi
+; X86-AVX1-NEXT:    movl %edx, %edi
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    vmovd %edx, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [8199,8199,8199,8199]
+; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
+; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
+; X86-AVX1-NEXT:    addl $16, %esp
+; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    popl %edi
+; X86-AVX1-NEXT:    popl %ebx
+; X86-AVX1-NEXT:    popl %ebp
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: PR34947:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    pushl %edi
+; X86-AVX2-NEXT:    pushl %esi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm2
+; X86-AVX2-NEXT:    vmovdqa 16(%esi), %xmm3
+; X86-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; X86-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    movl %edx, %ecx
+; X86-AVX2-NEXT:    vmovd %xmm3, %edi
+; X86-AVX2-NEXT:    vmovd %xmm4, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl %edi
+; X86-AVX2-NEXT:    vmovd %edx, %xmm5
+; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
+; X86-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
+; X86-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
+; X86-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
+; X86-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
+; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
+; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    movl %edx, %ecx
+; X86-AVX2-NEXT:    vmovd %xmm2, %edi
+; X86-AVX2-NEXT:    vmovd %xmm1, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl %edi
+; X86-AVX2-NEXT:    vmovd %edx, %xmm4
+; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
+; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
+; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
+; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
+; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
+; X86-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    xorl %edx, %edx
+; X86-AVX2-NEXT:    divl 32(%esi)
+; X86-AVX2-NEXT:    vmovd %edx, %xmm0
+; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
+; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovd %xmm0, (%eax)
+; X86-AVX2-NEXT:    vmovdqa %ymm1, (%eax)
+; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    popl %edi
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: PR34947:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movdqa (%rdi), %xmm5
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movdqa (%rsi), %xmm2
+; X64-SSE-NEXT:    movdqa 16(%rsi), %xmm6
+; X64-SSE-NEXT:    pxor %xmm0, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-SSE-NEXT:    movdqa %xmm5, %xmm4
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
+; X64-SSE-NEXT:    movd %xmm0, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
+; X64-SSE-NEXT:    movd %xmm0, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm8
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
+; X64-SSE-NEXT:    movd %xmm3, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
+; X64-SSE-NEXT:    movd %xmm3, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm7
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; X64-SSE-NEXT:    movd %xmm5, %eax
+; X64-SSE-NEXT:    movd %xmm6, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm3
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; X64-SSE-NEXT:    movd %xmm5, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; X64-SSE-NEXT:    movd %xmm5, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm5
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; X64-SSE-NEXT:    movd %xmm6, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
+; X64-SSE-NEXT:    movd %xmm6, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm6
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
+; X64-SSE-NEXT:    movd %xmm7, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; X64-SSE-NEXT:    movd %xmm7, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm7
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; X64-SSE-NEXT:    movd %xmm4, %eax
+; X64-SSE-NEXT:    movd %xmm2, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; X64-SSE-NEXT:    movd %xmm4, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X64-SSE-NEXT:    movd %xmm2, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm2
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl 32(%rsi)
+; X64-SSE-NEXT:    movd %edx, %xmm1
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
+; X64-SSE-NEXT:    pmuludq %xmm4, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
+; X64-SSE-NEXT:    pmuludq %xmm4, %xmm2
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-SSE-NEXT:    pmuludq %xmm4, %xmm3
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; X64-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0]
+; X64-SSE-NEXT:    pmuludq %xmm4, %xmm5
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-SSE-NEXT:    movl $8199, %eax # imm = 0x2007
+; X64-SSE-NEXT:    movd %eax, %xmm3
+; X64-SSE-NEXT:    pmuludq %xmm1, %xmm3
+; X64-SSE-NEXT:    movd %xmm3, (%rax)
+; X64-SSE-NEXT:    movdqa %xmm2, (%rax)
+; X64-SSE-NEXT:    movdqa %xmm0, (%rax)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: PR34947:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rbp
+; X64-AVX1-NEXT:    pushq %rbx
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vmovd %xmm1, %eax
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl 32(%rsi)
+; X64-AVX1-NEXT:    movl %edx, %r8d
+; X64-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
+; X64-AVX1-NEXT:    vmovdqa (%rsi), %xmm3
+; X64-AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; X64-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %r9d
+; X64-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %r10d
+; X64-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %r11d
+; X64-AVX1-NEXT:    vmovd %xmm2, %eax
+; X64-AVX1-NEXT:    vmovd %xmm3, %ecx
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %esi
+; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %edi
+; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %ecx
+; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ebx
+; X64-AVX1-NEXT:    movl %edx, %ebx
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ebp
+; X64-AVX1-NEXT:    vmovd %edx, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %esi, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; X64-AVX1-NEXT:    vmovd %r8d, %xmm2
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT:    vmovd %xmm1, (%rax)
+; X64-AVX1-NEXT:    vmovaps %ymm0, (%rax)
+; X64-AVX1-NEXT:    popq %rbx
+; X64-AVX1-NEXT:    popq %rbp
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: PR34947:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vmovdqa (%rsi), %xmm2
+; X64-AVX2-NEXT:    vmovdqa 16(%rsi), %xmm3
+; X64-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; X64-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    movl %edx, %ecx
+; X64-AVX2-NEXT:    vmovd %xmm3, %edi
+; X64-AVX2-NEXT:    vmovd %xmm4, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %edi
+; X64-AVX2-NEXT:    vmovd %edx, %xmm5
+; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
+; X64-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
+; X64-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
+; X64-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
+; X64-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
+; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
+; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    movl %edx, %ecx
+; X64-AVX2-NEXT:    vmovd %xmm2, %edi
+; X64-AVX2-NEXT:    vmovd %xmm1, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %edi
+; X64-AVX2-NEXT:    vmovd %edx, %xmm4
+; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
+; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
+; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
+; X64-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl 32(%rsi)
+; X64-AVX2-NEXT:    vmovd %edx, %xmm0
+; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
+; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovd %xmm0, (%rax)
+; X64-AVX2-NEXT:    vmovdqa %ymm1, (%rax)
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
+  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
+  %ext0 = zext <9 x i16> %a0 to <9 x i32>
+  %rem = urem <9 x i32> %ext0, %a1
+  %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
+  store <9 x i32> %mul, <9 x i32>* undef, align 64
+  ret void
+}
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index f190a4174197ba24fc4a444231ac54aeaea3e240..97290730d90f7d89613a348b942a38e2a1cdb159 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -106,14 +106,13 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl c, %esi
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm2
-; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
@@ -135,14 +134,13 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X64-SSE:       # %bb.0: # %entry
 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-SSE-NEXT:    pmaddwd %xmm0, %xmm2
-; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: mul_4xi8:
@@ -1156,10 +1154,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT:    pmuludq %xmm0, %xmm2
-; X86-SSE-NEXT:    psllq $32, %xmm2
-; X86-SSE-NEXT:    paddq %xmm1, %xmm2
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
@@ -1193,10 +1188,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
-; X64-SSE-NEXT:    psllq $32, %xmm2
-; X64-SSE-NEXT:    paddq %xmm1, %xmm2
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
 ; X64-SSE-NEXT:    retq
 ;
@@ -1954,15 +1946,7 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <0,u,65536,u>
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE-NEXT:    psrlq $32, %xmm3
-; X86-SSE-NEXT:    pmuludq %xmm0, %xmm3
-; X86-SSE-NEXT:    paddq %xmm1, %xmm3
-; X86-SSE-NEXT:    psllq $32, %xmm3
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT:    paddq %xmm3, %xmm0
+; X86-SSE-NEXT:    pmuludq {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
@@ -1988,13 +1972,10 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; X64-SSE-NEXT:    movl $65536, %ecx # imm = 0x10000
-; X64-SSE-NEXT:    movq %rcx, %xmm2
-; X64-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT:    pmuludq %xmm2, %xmm0
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT:    psllq $32, %xmm2
-; X64-SSE-NEXT:    paddq %xmm0, %xmm2
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT:    movq %rcx, %xmm1
+; X64-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT:    retq
 ;
@@ -2039,16 +2020,7 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X86-SSE-NEXT:    psrad $16, %xmm0
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,u,32768,u>
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE-NEXT:    psrlq $32, %xmm3
-; X86-SSE-NEXT:    pmuludq %xmm0, %xmm3
-; X86-SSE-NEXT:    paddq %xmm2, %xmm3
-; X86-SSE-NEXT:    psllq $32, %xmm3
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT:    paddq %xmm3, %xmm0
+; X86-SSE-NEXT:    pmuludq {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
@@ -2074,12 +2046,8 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X64-SSE-NEXT:    movq %rcx, %xmm1
 ; X64-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT:    pxor %xmm2, %xmm2
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT:    psllq $32, %xmm2
-; X64-SSE-NEXT:    paddq %xmm0, %xmm2
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT:    retq
 ;
@@ -2216,80 +2184,75 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-AVX1-NEXT:    pushl %ebx
 ; X86-AVX1-NEXT:    pushl %edi
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    subl $8, %esp
+; X86-AVX1-NEXT:    subl $16, %esp
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    vmovdqa (%eax), %ymm2
-; X86-AVX1-NEXT:    vmovdqa (%ecx), %ymm1
-; X86-AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; X86-AVX1-NEXT:    vmovd %xmm2, %eax
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vmovd %xmm1, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl 32(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %eax
+; X86-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
+; X86-AVX1-NEXT:    vmovdqa (%ecx), %xmm3
+; X86-AVX1-NEXT:    vmovdqa 16(%ecx), %xmm1
+; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %eax
+; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %edi
-; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %eax
+; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %ebx
-; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
-; X86-AVX1-NEXT:    vmovd %xmm3, %eax
+; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-AVX1-NEXT:    vmovd %xmm2, %eax
+; X86-AVX1-NEXT:    vmovd %xmm3, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, %ebp
 ; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    movl %edx, %ebx
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
+; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %esi
 ; X86-AVX1-NEXT:    movl %edx, %esi
-; X86-AVX1-NEXT:    vmovd %ebp, %xmm2
-; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %ebp
-; X86-AVX1-NEXT:    divl %ebp
-; X86-AVX1-NEXT:    movl %edx, %ebp
-; X86-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vmovd %xmm0, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %edi, %xmm2, %xmm0
-; X86-AVX1-NEXT:    vmovd %xmm1, %edi
-; X86-AVX1-NEXT:    vpinsrd $3, (%esp), %xmm0, %xmm0 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %edi
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %edi
-; X86-AVX1-NEXT:    vmovd %edx, %xmm1
-; X86-AVX1-NEXT:    vpinsrd $1, %ebp, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
+; X86-AVX1-NEXT:    movl %edx, %edi
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    vmovd %edx, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
 ; X86-AVX1-NEXT:    vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Folded Reload
 ; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
-; X86-AVX1-NEXT:    vmovd %eax, %xmm3
-; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
-; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [8199,8199,8199,8199]
+; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm1
 ; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
 ; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
-; X86-AVX1-NEXT:    addl $8, %esp
+; X86-AVX1-NEXT:    addl $16, %esp
 ; X86-AVX1-NEXT:    popl %esi
 ; X86-AVX1-NEXT:    popl %edi
 ; X86-AVX1-NEXT:    popl %ebx
@@ -2303,13 +2266,12 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-AVX2-NEXT:    pushl %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    vmovdqa (%eax), %ymm2
-; X86-AVX2-NEXT:    vmovdqa (%esi), %ymm1
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm2
+; X86-AVX2-NEXT:    vmovdqa 16(%esi), %xmm3
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
@@ -2330,37 +2292,34 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
 ; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
-; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
+; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
 ; X86-AVX2-NEXT:    movl %edx, %ecx
-; X86-AVX2-NEXT:    vmovd %xmm1, %edi
-; X86-AVX2-NEXT:    vmovd %xmm2, %eax
+; X86-AVX2-NEXT:    vmovd %xmm2, %edi
+; X86-AVX2-NEXT:    vmovd %xmm1, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %edi
 ; X86-AVX2-NEXT:    vmovd %edx, %xmm4
 ; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
-; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
+; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
+; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
 ; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
-; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
+; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
+; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
 ; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
 ; X86-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl 32(%esi)
 ; X86-AVX2-NEXT:    vmovd %edx, %xmm0
 ; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
 ; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
-; X86-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
-; X86-AVX2-NEXT:    vmovd %eax, %xmm2
 ; X86-AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, (%eax)
 ; X86-AVX2-NEXT:    vmovdqa %ymm1, (%eax)
@@ -2467,39 +2426,36 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    pushq %rbp
 ; X64-AVX1-NEXT:    pushq %rbx
-; X64-AVX1-NEXT:    vmovdqa (%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; X64-AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; X64-AVX1-NEXT:    vmovd %xmm2, %eax
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vmovd %xmm1, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl 32(%rsi)
 ; X64-AVX1-NEXT:    movl %edx, %r8d
-; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; X64-AVX1-NEXT:    vpextrd $3, %xmm3, %eax
+; X64-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
+; X64-AVX1-NEXT:    vmovdqa (%rsi), %xmm3
+; X64-AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
+; X64-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r9d
-; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; X64-AVX1-NEXT:    vpextrd $2, %xmm3, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r10d
-; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; X64-AVX1-NEXT:    vpextrd $1, %xmm3, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r11d
-; X64-AVX1-NEXT:    vmovd %xmm1, %ecx
-; X64-AVX1-NEXT:    vmovd %xmm3, %eax
+; X64-AVX1-NEXT:    vmovd %xmm2, %eax
+; X64-AVX1-NEXT:    vmovd %xmm3, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %esi
 ; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
@@ -2518,22 +2474,20 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ebp
-; X64-AVX1-NEXT:    vmovd %esi, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %edx, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
 ; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vmovd %edx, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vmovd %esi, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; X64-AVX1-NEXT:    vmovd %r8d, %xmm2
 ; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vmovd %r8d, %xmm1
-; X64-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
-; X64-AVX1-NEXT:    vmovd %eax, %xmm2
-; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vmovd %xmm1, (%rax)
 ; X64-AVX1-NEXT:    vmovaps %ymm0, (%rax)
 ; X64-AVX1-NEXT:    popq %rbx
@@ -2543,13 +2497,12 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ;
 ; X64-AVX2-LABEL: PR34947:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqa (%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqa (%rsi), %ymm1
-; X64-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT:    vmovdqa (%rsi), %xmm2
+; X64-AVX2-NEXT:    vmovdqa 16(%rsi), %xmm3
 ; X64-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
-; X64-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
 ; X64-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
@@ -2570,37 +2523,34 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
 ; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
-; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
+; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
+; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
 ; X64-AVX2-NEXT:    movl %edx, %ecx
-; X64-AVX2-NEXT:    vmovd %xmm1, %edi
-; X64-AVX2-NEXT:    vmovd %xmm2, %eax
+; X64-AVX2-NEXT:    vmovd %xmm2, %edi
+; X64-AVX2-NEXT:    vmovd %xmm1, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %edi
 ; X64-AVX2-NEXT:    vmovd %edx, %xmm4
 ; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
-; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
+; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
+; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
 ; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
-; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
+; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
+; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
 ; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
 ; X64-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl 32(%rsi)
 ; X64-AVX2-NEXT:    vmovd %edx, %xmm0
 ; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
 ; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
-; X64-AVX2-NEXT:    vmovd %eax, %xmm2
 ; X64-AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, (%rax)
 ; X64-AVX2-NEXT:    vmovdqa %ymm1, (%rax)
diff --git a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
index ba6c994a522e72c162e7db59b94eb7aea07f900d..6d9ba45903e785596bc1b1d8c0d555080ce72c5b 100644
--- a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
+++ b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
@@ -94,10 +94,8 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind {
 define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind {
 ; AVX2-LABEL: undef_splatmask5:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm1
-; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
+; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
   %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>
   %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3>
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index 86c83a782ff02ce8e420fbb5a5d5ac61cd3137eb..51b8c686e66c0401c5a6b708d81e4799f6e6df74 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -8,40 +8,26 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
 
 define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -50,40 +36,26 @@ define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -94,20 +66,16 @@ define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %ymm0
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
 ; AVX-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
 ; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %L
   %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -116,76 +84,59 @@ define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
@@ -194,76 +145,59 @@ define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
@@ -272,76 +206,59 @@ define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
@@ -352,89 +269,76 @@ define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -445,83 +349,68 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512BWVL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
@@ -532,89 +421,76 @@ define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
@@ -623,76 +499,59 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
@@ -701,76 +560,59 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
@@ -779,76 +621,59 @@ define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
@@ -857,70 +682,51 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512BWVL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
@@ -929,76 +735,59 @@ define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
@@ -1007,76 +796,59 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
@@ -1085,76 +857,59 @@ define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
index 7d1b7da48014b1203b81dd8bb28eb320d99f275b..1209bf86e327c691e6b15329058dbd9fde310e2c 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -31,8 +31,8 @@ define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
@@ -43,8 +43,8 @@ define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
 ; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
@@ -85,8 +85,8 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
@@ -97,11 +97,10 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
@@ -113,9 +112,8 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
 define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
 ; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
 ; AVX512F-NEXT:    vzeroupper
@@ -123,19 +121,17 @@ define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ;
 ; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
-; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512VL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512VL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512BW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512BW-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
 ; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -143,11 +139,10 @@ define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
-; AVX512BWVL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512BWVL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %L
@@ -157,81 +152,23 @@ define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8_1:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_1:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_1:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_1:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: shuffle_v64i8_to_v16i8_1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
   store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -239,81 +176,23 @@ define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8_2:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_2:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_2:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_2:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: shuffle_v64i8_to_v16i8_2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
   store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -321,81 +200,23 @@ define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8_3:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_3:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_3:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_3:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: shuffle_v64i8_to_v16i8_3:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
   store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -405,70 +226,62 @@ define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
@@ -480,70 +293,62 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[3,1,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
@@ -555,70 +360,62 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[3,1,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
@@ -630,78 +427,74 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
@@ -712,68 +505,64 @@ define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vpmovwb %xmm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
@@ -785,78 +574,74 @@ define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
@@ -867,68 +652,64 @@ define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vpmovwb %xmm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
@@ -940,78 +721,74 @@ define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
@@ -1022,68 +799,64 @@ define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vpmovwb %xmm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
@@ -1095,78 +868,74 @@ define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll b/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fae69ea5d80e583331d2cc997dc86b06d66f0d87
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll
@@ -0,0 +1,574 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+
+; PR31551
+; Pairs of shufflevector:trunc functions with functional equivalence.
+; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
+
+define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movq %xmm0, (%rsi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v8i8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    movq %xmm0, (%rsi)
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <16 x i8>, <16 x i8>* %L
+  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  store <8 x i8> %strided.vec, <8 x i8>* %S
+  ret void
+}
+
+define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
+; SSE2-LABEL: trunc_v8i16_to_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movq %xmm0, (%rsi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: trunc_v8i16_to_v8i8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    movq %xmm0, (%rsi)
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: trunc_v8i16_to_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v8i16_to_v8i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+  %vec = load <16 x i8>, <16 x i8>* %L
+  %bc = bitcast <16 x i8> %vec to <8 x i16>
+  %strided.vec = trunc <8 x i16> %bc to <8 x i8>
+  store <8 x i8> %strided.vec, <8 x i8>* %S
+  ret void
+}
+
+define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
+; SSE2-LABEL: shuffle_v8i16_to_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movq %xmm0, (%rsi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: shuffle_v8i16_to_v4i16:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT:    movq %xmm0, (%rsi)
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_to_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v8i16_to_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <8 x i16>, <8 x i16>* %L
+  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  store <4 x i16> %strided.vec, <4 x i16>* %S
+  ret void
+}
+
+define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
+; SSE2-LABEL: trunc_v4i32_to_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movq %xmm0, (%rsi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: trunc_v4i32_to_v4i16:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT:    movq %xmm0, (%rsi)
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: trunc_v4i32_to_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i32_to_v4i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+  %vec = load <8 x i16>, <8 x i16>* %L
+  %bc = bitcast <8 x i16> %vec to <4 x i32>
+  %strided.vec = trunc <4 x i32> %bc to <4 x i16>
+  store <4 x i16> %strided.vec, <4 x i16>* %S
+  ret void
+}
+
+define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
+; SSE-LABEL: shuffle_v4i32_to_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; SSE-NEXT:    movq %xmm0, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_to_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v4i32_to_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <4 x i32>, <4 x i32>* %L
+  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+  store <2 x i32> %strided.vec, <2 x i32>* %S
+  ret void
+}
+
+define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
+; SSE-LABEL: trunc_v2i64_to_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; SSE-NEXT:    movq %xmm0, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc_v2i64_to_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v2i64_to_v2i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+  %vec = load <4 x i32>, <4 x i32>* %L
+  %bc = bitcast <4 x i32> %vec to <2 x i64>
+  %strided.vec = trunc <2 x i64> %bc to <2 x i32>
+  store <2 x i32> %strided.vec, <2 x i32>* %S
+  ret void
+}
+
+define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movd %xmm0, (%rsi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v4i8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    movd %xmm0, (%rsi)
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <16 x i8>, <16 x i8>* %L
+  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  store <4 x i8> %strided.vec, <4 x i8>* %S
+  ret void
+}
+
+define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
+; SSE2-LABEL: trunc_v4i32_to_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movd %xmm0, (%rsi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: trunc_v4i32_to_v4i8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    movd %xmm0, (%rsi)
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: trunc_v4i32_to_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i32_to_v4i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+  %vec = load <16 x i8>, <16 x i8>* %L
+  %bc = bitcast <16 x i8> %vec to <4 x i32>
+  %strided.vec = trunc <4 x i32> %bc to <4 x i8>
+  store <4 x i8> %strided.vec, <4 x i8>* %S
+  ret void
+}
+
+define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
+; SSE-LABEL: shuffle_v8i16_to_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    movd %xmm0, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_to_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+  %vec = load <8 x i16>, <8 x i16>* %L
+  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+  store <2 x i16> %strided.vec, <2 x i16>* %S
+  ret void
+}
+
+define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
+; SSE-LABEL: trunc_v2i64_to_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    movd %xmm0, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_v2i64_to_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v2i64_to_v2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+  %vec = load <8 x i16>, <8 x i16>* %L
+  %bc = bitcast <8 x i16> %vec to <2 x i64>
+  %strided.vec = trunc <2 x i64> %bc to <2 x i16>
+  store <2 x i16> %strided.vec, <2 x i16>* %S
+  ret void
+}
+
+define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rsi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v2i8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <16 x i8>, <16 x i8>* %L
+  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
+  store <2 x i8> %strided.vec, <2 x i8>* %S
+  ret void
+}
+
+define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: trunc_v2i64_to_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rsi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: trunc_v2i64_to_v2i8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: trunc_v2i64_to_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v2i64_to_v2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v2i64_to_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v2i64_to_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+  %vec = load <16 x i8>, <16 x i8>* %L
+  %bc = bitcast <16 x i8> %vec to <2 x i64>
+  %strided.vec = trunc <2 x i64> %bc to <2 x i8>
+  store <2 x i8> %strided.vec, <2 x i8>* %S
+  ret void
+}
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll b/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..629d37bf81fea72ad45bd9ca383a24823618d4b3
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
@@ -0,0 +1,1300 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
+
+; PR31551
+; Pairs of shufflevector:trunc functions with functional equivalence.
+; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
+
+define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
+; AVX-LABEL: shuffle_v32i8_to_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <32 x i8>, <32 x i8>* %L
+  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  store <16 x i8> %strided.vec, <16 x i8>* %S
+  ret void
+}
+
+define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
+; AVX1-LABEL: trunc_v16i16_to_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps (%rdi), %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v16i16_to_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v16i16_to_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vpmovwb %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <32 x i8>, <32 x i8>* %L
+  %bc = bitcast <32 x i8> %vec to <16 x i16>
+  %strided.vec = trunc <16 x i16> %bc to <16 x i8>
+  store <16 x i8> %strided.vec, <16 x i8>* %S
+  ret void
+}
+
+define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
+; AVX-LABEL: shuffle_v16i16_to_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <16 x i16>, <16 x i16>* %L
+  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  store <8 x i16> %strided.vec, <8 x i16>* %S
+  ret void
+}
+
+define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
+; AVX1-LABEL: trunc_v8i32_to_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v8i32_to_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v8i32_to_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vpmovdw %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <16 x i16>, <16 x i16>* %L
+  %bc = bitcast <16 x i16> %vec to <8 x i32>
+  %strided.vec = trunc <8 x i32> %bc to <8 x i16>
+  store <8 x i16> %strided.vec, <8 x i16>* %S
+  ret void
+}
+
+define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
+; AVX-LABEL: shuffle_v8i32_to_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v8i32_to_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <8 x i32>, <8 x i32>* %L
+  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  store <4 x i32> %strided.vec, <4 x i32>* %S
+  ret void
+}
+
+define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
+; AVX1-LABEL: trunc_v4i64_to_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps (%rdi), %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i64_to_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vpmovqd %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <8 x i32>, <8 x i32>* %L
+  %bc = bitcast <8 x i32> %vec to <4 x i64>
+  %strided.vec = trunc <4 x i64> %bc to <4 x i32>
+  store <4 x i32> %strided.vec, <4 x i32>* %S
+  ret void
+}
+
+define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX-LABEL: shuffle_v32i8_to_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v32i8_to_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <32 x i8>, <32 x i8>* %L
+  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  store <8 x i8> %strided.vec, <8 x i8>* %S
+  ret void
+}
+
+define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX-LABEL: trunc_v8i32_to_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v8i32_to_v8i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <32 x i8>, <32 x i8>* %L
+  %bc = bitcast <32 x i8> %vec to <8 x i32>
+  %strided.vec = trunc <8 x i32> %bc to <8 x i8>
+  store <8 x i8> %strided.vec, <8 x i8>* %S
+  ret void
+}
+
+define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
+; IR generated from:
+; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
+; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
+  %bc = bitcast <8 x i8> %truncated.vec to i64
+  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
+  ret <2 x i64> %result
+}
+
+define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
+; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated = trunc <8 x i32> %vec to <8 x i8>
+  %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
+  %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
+  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
+; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated = trunc <8 x i32> %vec to <8 x i16>
+  %bc = bitcast <8 x i16> %truncated to <16 x i8>
+  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
+; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated = trunc <8 x i32> %vec to <8 x i8>
+  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %result
+}
+
+define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
+; IR generated from:
+; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
+; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated = trunc <4 x i64> %vec to <4 x i16>
+  %bc = bitcast <4 x i16> %truncated to i64
+  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
+  ret <2 x i64> %result
+}
+
+define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
+; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated = trunc <4 x i64> %vec to <4 x i16>
+  %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
+  %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
+  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  ret <8 x i16> %result
+}
+
+define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
+; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated = trunc <4 x i64> %vec to <4 x i32>
+  %bc = bitcast <4 x i32> %truncated to <8 x i16>
+  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
+  ret <8 x i16> %result
+}
+
+define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
+; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated = trunc <4 x i64> %vec to <4 x i16>
+  %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %result
+}
+
+define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
+; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMIVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %truncated = trunc <4 x i64> %vec to <4 x i8>
+  %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
+  ret <16 x i8> %result
+}
+
+define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
+; AVX1-LABEL: shuffle_v16i16_to_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VBMIVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VBMIVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VBMIVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VBMIVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <16 x i16>, <16 x i16>* %L
+  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  store <4 x i16> %strided.vec, <4 x i16>* %S
+  ret void
+}
+
+define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
+; AVX1-LABEL: trunc_v4i64_to_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i64_to_v4i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <16 x i16>, <16 x i16>* %L
+  %bc = bitcast <16 x i16> %vec to <4 x i64>
+  %strided.vec = trunc <4 x i64> %bc to <4 x i16>
+  store <4 x i16> %strided.vec, <4 x i16>* %S
+  ret void
+}
+
+define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX-LABEL: shuffle_v32i8_to_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: shuffle_v32i8_to_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512-NEXT:    retq
+  %vec = load <32 x i8>, <32 x i8>* %L
+  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+  store <4 x i8> %strided.vec, <4 x i8>* %S
+  ret void
+}
+
+define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX-LABEL: trunc_v4i64_to_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_v4i64_to_v4i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <32 x i8>, <32 x i8>* %L
+  %bc = bitcast <32 x i8> %vec to <4 x i64>
+  %strided.vec = trunc <4 x i64> %bc to <4 x i8>
+  store <4 x i8> %strided.vec, <4 x i8>* %S
+  ret void
+}
+
+; In this case not all elements are collected from the same source vector, so
+; the resulting BUILD_VECTOR should not be combined to a truncate.
+define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
+; AVX1-LABEL: negative:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: negative:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: negative:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: negative:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: negative:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: negative:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512BWVL-NEXT:    movl $65537, %eax # imm = 0x10001
+; AVX512BWVL-NEXT:    kmovd %eax, %k1
+; AVX512BWVL-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: negative:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512VBMIVL-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %w0 = extractelement <32 x i8> %w, i32 0
+  %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
+  ret <16 x i8> %merged
+}
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 152742c80163e1c914cc079e54c5a9f28aee6cd0..fc6a2cd6441ba38115510a7abab809ba4a3aed56 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -13,40 +13,26 @@
 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
 
 define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -57,12 +43,10 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX1-LABEL: trunc_v16i16_to_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX1-NEXT:    vmovaps (%rdi), %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -70,25 +54,23 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX2-LABEL: trunc_v16i16_to_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_v16i16_to_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwd (%rdi), %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxwd (%rdi), %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -122,40 +104,26 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX1-LABEL: shuffle_v16i16_to_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v16i16_to_v8i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v16i16_to_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: shuffle_v16i16_to_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -166,14 +134,13 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX1-LABEL: trunc_v8i32_to_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_v8i32_to_v8i16:
@@ -231,20 +198,16 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX-LABEL: shuffle_v8i32_to_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %ymm0
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: shuffle_v8i32_to_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %L
   %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -255,11 +218,9 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX1-LABEL: trunc_v4i64_to_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vmovaps (%rdi), %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
@@ -322,88 +283,70 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v8i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v8i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX512VBMIVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VBMIVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VBMIVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VBMIVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -414,14 +357,13 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX1-LABEL: trunc_v8i32_to_v8i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_v8i32_to_v8i8:
@@ -1041,92 +983,75 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX1-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BWVL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VBMIVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512VBMIVL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512VBMIVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX512VBMIVL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1137,12 +1062,10 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX1-LABEL: trunc_v4i64_to_v4i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vmovaps (%rdi), %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
@@ -1209,79 +1132,58 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vmovd %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BWVL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VBMIVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512VBMIVL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512VBMIVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX512VBMIVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
@@ -1292,12 +1194,10 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX1-LABEL: trunc_v4i64_to_v4i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vmovaps (%rdi), %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll b/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e78d0e050a85388df5384805c2c9deb95904458a
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
@@ -0,0 +1,924 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
+
+; PR31551
+; Pairs of shufflevector:trunc functions with functional equivalence.
+; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
+
+define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
+; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
+; AVX512BWVL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512VBMI-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VBMI-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
+; AVX512VBMIVL-NEXT:    vpermi2b 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <64 x i8>, <64 x i8>* %L
+  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+  store <32 x i8> %strided.vec, <32 x i8>* %S
+  ret void
+}
+
+define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
+; AVX512F-LABEL: trunc_v32i16_to_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512VBMIVL-NEXT:    vpmovwb %zmm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <64 x i8>, <64 x i8>* %L
+  %bc = bitcast <64 x i8> %vec to <32 x i16>
+  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
+  store <32 x i8> %strided.vec, <32 x i8>* %S
+  ret void
+}
+
+define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
+; AVX512VL-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
+; AVX512BW-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
+; AVX512VBMI-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VBMI-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512VBMI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512VBMI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <32 x i16>, <32 x i16>* %L
+  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  store <16 x i16> %strided.vec, <16 x i16>* %S
+  ret void
+}
+
+define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
+; AVX512-LABEL: trunc_v16i32_to_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = load <32 x i16>, <32 x i16>* %L
+  %bc = bitcast <32 x i16> %vec to <16 x i32>
+  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
+  store <16 x i16> %strided.vec, <16 x i16>* %S
+  ret void
+}
+
+define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
+; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
+; AVX512VL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512VL-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
+; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
+; AVX512VBMI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
+; AVX512VBMIVL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <16 x i32>, <16 x i32>* %L
+  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  store <8 x i32> %strided.vec, <8 x i32>* %S
+  ret void
+}
+
+define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
+; AVX512-LABEL: trunc_v8i64_to_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = load <16 x i32>, <16 x i32>* %L
+  %bc = bitcast <16 x i32> %vec to <8 x i64>
+  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
+  store <8 x i32> %strided.vec, <8 x i32>* %S
+  ret void
+}
+
+define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512VBMI-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2b 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <64 x i8>, <64 x i8>* %L
+  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  store <16 x i8> %strided.vec, <16 x i8>* %S
+  ret void
+}
+
+define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
+; AVX512-LABEL: trunc_v16i32_to_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = load <64 x i8>, <64 x i8>* %L
+  %bc = bitcast <64 x i8> %vec to <16 x i32>
+  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
+  store <16 x i8> %strided.vec, <16 x i8>* %S
+  ret void
+}
+
+define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512VBMI-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <32 x i16>, <32 x i16>* %L
+  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  store <8 x i16> %strided.vec, <8 x i16>* %S
+  ret void
+}
+
+define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
+; AVX512-LABEL: trunc_v8i64_to_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = load <32 x i16>, <32 x i16>* %L
+  %bc = bitcast <32 x i16> %vec to <8 x i64>
+  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
+  store <8 x i16> %strided.vec, <8 x i16>* %S
+  ret void
+}
+
+define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VBMI-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,8,16,24,32,40,48,56,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2b 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovq %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %vec = load <64 x i8>, <64 x i8>* %L
+  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
+  store <8 x i8> %strided.vec, <8 x i8>* %S
+  ret void
+}
+
+define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512-LABEL: trunc_v8i64_to_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = load <64 x i8>, <64 x i8>* %L
+  %bc = bitcast <64 x i8> %vec to <8 x i64>
+  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
+  store <8 x i8> %strided.vec, <8 x i8>* %S
+  ret void
+}
+
+define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
+; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2b %ymm2, %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
+; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2b %ymm2, %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VBMIVL-NEXT:    vzeroupper
+; AVX512VBMIVL-NEXT:    retq
+  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
+  ret <16 x i8> %res
+}
+
+define <4 x double> @PR34175(<32 x i16>* %p) {
+; AVX512F-LABEL: PR34175:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512F-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: PR34175:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: PR34175:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512BW-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: PR34175:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: PR34175:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512VBMI-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512VBMI-NEXT:    retq
+;
+; AVX512VBMIVL-LABEL: PR34175:
+; AVX512VBMIVL:       # %bb.0:
+; AVX512VBMIVL-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512VBMIVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512VBMIVL-NEXT:    retq
+  %v = load <32 x i16>, <32 x i16>* %p, align 2
+  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+  %tofp = uitofp <4 x i16> %shuf to <4 x double>
+  ret <4 x double> %tofp
+}
+
+define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
+; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %truncated = trunc <8 x i64> %vec to <8 x i8>
+  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %result
+}
+
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 7c100cb165e3bfce84b429b4493d2c7aac564b2f..0fe1887884fd9f4a521a0a4c7a8c1594ff0f36a2 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -37,8 +37,8 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
@@ -49,8 +49,8 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
 ; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
@@ -61,8 +61,8 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ;
 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512VBMI-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
 ; AVX512VBMI-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
 ; AVX512VBMI-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
@@ -73,11 +73,10 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
-; AVX512VBMIVL-NEXT:    vpermi2b %ymm1, %ymm0, %ymm2
-; AVX512VBMIVL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
+; AVX512VBMIVL-NEXT:    vpermi2b 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
@@ -89,9 +88,9 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512F-LABEL: trunc_v32i16_to_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwd (%rdi), %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwd 32(%rdi), %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
@@ -100,9 +99,9 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ;
 ; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxwd (%rdi), %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vpmovsxwd 32(%rdi), %zmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vmovdqa %ymm0, (%rsi)
@@ -171,8 +170,8 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
 ; AVX512BW-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
@@ -184,18 +183,17 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
 ; AVX512VBMI-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512VBMI-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
@@ -207,11 +205,10 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512VBMIVL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
@@ -237,9 +234,8 @@ define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
 ; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
 ; AVX512F-NEXT:    vzeroupper
@@ -247,19 +243,17 @@ define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ;
 ; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
-; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
+; AVX512VL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512VL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512BW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512BW-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
 ; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -267,19 +261,17 @@ define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512VBMI-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; AVX512VBMI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512VBMI-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512VBMI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
 ; AVX512VBMI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsi)
 ; AVX512VBMI-NEXT:    vzeroupper
@@ -287,11 +279,10 @@ define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
-; AVX512VBMIVL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VBMIVL-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
+; AVX512VBMIVL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %L
@@ -317,106 +308,100 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512VBMI-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT:    vpermi2b %ymm1, %ymm0, %ymm2
-; AVX512VBMIVL-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2b 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
@@ -442,99 +427,89 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512VBMI-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512VBMIVL-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
@@ -560,97 +535,91 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vpmovwb %xmm1, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
 ; AVX512VBMI-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512VBMIVL-NEXT:    vpmovwb %xmm2, (%rsi)
+; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vpmovwb %xmm1, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
@@ -863,12 +832,10 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 define <4 x double> @PR34175(<32 x i16>* %p) {
 ; AVX512F-LABEL: PR34175:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu 32(%rdi), %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
 ; AVX512F-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -877,12 +844,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 ;
 ; AVX512VL-LABEL: PR34175:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqu 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
 ; AVX512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -891,49 +856,43 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 ;
 ; AVX512BW-LABEL: PR34175:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512BW-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512BW-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: PR34175:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqu64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: PR34175:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqu64 (%rdi), %zmm0
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VBMI-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512VBMI-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512VBMI-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512VBMI-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: PR34175:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqu64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512VBMIVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512VBMIVL-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX512VBMIVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512VBMIVL-NEXT:    retq
   %v = load <32 x i16>, <32 x i16>* %p, align 2
diff --git a/test/CodeGen/X86/smul_fix.ll b/test/CodeGen/X86/smul_fix.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2e69a2666f9ab9e85ff2fc7dfcc8e54f0004c7e4
--- /dev/null
+++ b/test/CodeGen/X86/smul_fix.ll
@@ -0,0 +1,458 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare  i4  @llvm.smul.fix.i4   (i4,  i4, i32)
+declare  i32 @llvm.smul.fix.i32  (i32, i32, i32)
+declare  i64 @llvm.smul.fix.i64  (i64, i64, i32)
+declare  <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func:
+; X64:       # %bb.0:
+; X64-NEXT:    movslq %esi, %rax
+; X64-NEXT:    movslq %edi, %rcx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    shldl $30, %ecx, %eax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    shrdl $2, %edx, %eax
+; X86-NEXT:    retl
+  %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 2);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; X64-LABEL: func2:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi
+; X64-NEXT:    shrdq $2, %rdx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    .cfi_offset %esi, -20
+; X86-NEXT:    .cfi_offset %edi, -16
+; X86-NEXT:    .cfi_offset %ebx, -12
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    subl %ecx, %ebp
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    cmovnsl %esi, %ebp
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %ebp, %edx
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    shldl $30, %ebx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func3:
+; X64:       # %bb.0:
+; X64-NEXT:    shlb $4, %dil
+; X64-NEXT:    sarb $4, %dil
+; X64-NEXT:    shlb $4, %sil
+; X64-NEXT:    sarb $4, %sil
+; X64-NEXT:    movsbl %sil, %ecx
+; X64-NEXT:    movsbl %dil, %eax
+; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    shrb $2, %cl
+; X64-NEXT:    shrl $8, %eax
+; X64-NEXT:    shlb $6, %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func3:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shlb $4, %cl
+; X86-NEXT:    sarb $4, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    shlb $6, %ah
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    orb %ah, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+  %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 2);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    movslq %ecx, %rcx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    shldl $30, %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    movslq %ecx, %rcx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    shldl $30, %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm3
+; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    movslq %ecx, %rcx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    shldl $30, %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    movslq %ecx, %rcx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    shldl $30, %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    retq
+;
+; X86-LABEL: vec:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    shldl $30, %eax, %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shldl $30, %eax, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shldl $30, %eax, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl %edx, 12(%ecx)
+; X86-NEXT:    movl %edi, 8(%ecx)
+; X86-NEXT:    movl %ebx, 4(%ecx)
+; X86-NEXT:    movl %ebp, (%ecx)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2);
+  ret <4 x i32> %tmp;
+}
+
+; These result in regular integer multiplication
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func4:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    imull %esi, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func4:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+  %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 0);
+  ret i32 %tmp;
+}
+
+define i64 @func5(i64 %x, i64 %y) {
+; X64-LABEL: func5:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func5:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 0);
+  ret i64 %tmp;
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func6:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb $4, %al
+; X64-NEXT:    sarb $4, %al
+; X64-NEXT:    shlb $4, %sil
+; X64-NEXT:    sarb $4, %sil
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    mulb %sil
+; X64-NEXT:    retq
+;
+; X86-LABEL: func6:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shlb $4, %cl
+; X86-NEXT:    sarb $4, %cl
+; X86-NEXT:    mulb %cl
+; X86-NEXT:    retl
+  %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 0);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec2:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    imull %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    imull %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm3
+; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    imull %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    imull %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
+; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    retq
+;
+; X86-LABEL: vec2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0);
+  ret <4 x i32> %tmp;
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func7:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi
+; X64-NEXT:    shrdq $32, %rdx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func7:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    subl %ebp, %ebx
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmovnsl %edi, %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %ebx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 32);
+  ret i64 %tmp;
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func8:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi
+; X64-NEXT:    shrdq $63, %rdx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func8:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %ebx, %esi
+; X86-NEXT:    cmovnsl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %esi, %edx
+; X86-NEXT:    cmovnsl %ecx, %edi
+; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    shrdl $31, %edi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 63);
+  ret i64 %tmp;
+}
diff --git a/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll b/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
index 5f4bbb0a4344267ffedda593547a41f54aa747ee..65bf1cb65848a6787e354fb469d58e108980e7a2 100644
--- a/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
+++ b/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
@@ -302,9 +302,9 @@ define i32 @test_call_setjmp(i32 *%ptr) nounwind {
 ; X64-NOPIC-NEXT:    cmpq $.Lslh_ret_addr4, %rbp
 ; X64-NOPIC-NEXT:    cmovneq %r15, %rax
 ; X64-NOPIC-NEXT:    movl (%rbx), %ebp
-; X64-NOPIC-NEXT:    movl $42, %esi
 ; X64-NOPIC-NEXT:    shlq $47, %rax
 ; X64-NOPIC-NEXT:    movq %r14, %rdi
+; X64-NOPIC-NEXT:    movl $42, %esi
 ; X64-NOPIC-NEXT:    orq %rax, %rsp
 ; X64-NOPIC-NEXT:    movq $.Lslh_ret_addr5, %r12
 ; X64-NOPIC-NEXT:    callq sigsetjmp
@@ -314,10 +314,10 @@ define i32 @test_call_setjmp(i32 *%ptr) nounwind {
 ; X64-NOPIC-NEXT:    cmpq $.Lslh_ret_addr5, %r12
 ; X64-NOPIC-NEXT:    cmovneq %r15, %rax
 ; X64-NOPIC-NEXT:    addl (%rbx), %ebp
-; X64-NOPIC-NEXT:    movl $42, %edx
 ; X64-NOPIC-NEXT:    shlq $47, %rax
 ; X64-NOPIC-NEXT:    movq %r14, %rdi
 ; X64-NOPIC-NEXT:    movq %r14, %rsi
+; X64-NOPIC-NEXT:    movl $42, %edx
 ; X64-NOPIC-NEXT:    orq %rax, %rsp
 ; X64-NOPIC-NEXT:    movq $.Lslh_ret_addr6, %r14
 ; X64-NOPIC-NEXT:    callq __sigsetjmp
@@ -364,9 +364,9 @@ define i32 @test_call_setjmp(i32 *%ptr) nounwind {
 ; X64-NOPIC-MCM-NEXT:    cmpq %rcx, %rbp
 ; X64-NOPIC-MCM-NEXT:    cmovneq %r15, %rax
 ; X64-NOPIC-MCM-NEXT:    movl (%rbx), %ebp
-; X64-NOPIC-MCM-NEXT:    movl $42, %esi
 ; X64-NOPIC-MCM-NEXT:    shlq $47, %rax
 ; X64-NOPIC-MCM-NEXT:    movq %r14, %rdi
+; X64-NOPIC-MCM-NEXT:    movl $42, %esi
 ; X64-NOPIC-MCM-NEXT:    orq %rax, %rsp
 ; X64-NOPIC-MCM-NEXT:    leaq .Lslh_ret_addr5(%rip), %r12
 ; X64-NOPIC-MCM-NEXT:    callq sigsetjmp
@@ -377,10 +377,10 @@ define i32 @test_call_setjmp(i32 *%ptr) nounwind {
 ; X64-NOPIC-MCM-NEXT:    cmpq %rcx, %r12
 ; X64-NOPIC-MCM-NEXT:    cmovneq %r15, %rax
 ; X64-NOPIC-MCM-NEXT:    addl (%rbx), %ebp
-; X64-NOPIC-MCM-NEXT:    movl $42, %edx
 ; X64-NOPIC-MCM-NEXT:    shlq $47, %rax
 ; X64-NOPIC-MCM-NEXT:    movq %r14, %rdi
 ; X64-NOPIC-MCM-NEXT:    movq %r14, %rsi
+; X64-NOPIC-MCM-NEXT:    movl $42, %edx
 ; X64-NOPIC-MCM-NEXT:    orq %rax, %rsp
 ; X64-NOPIC-MCM-NEXT:    leaq .Lslh_ret_addr6(%rip), %r14
 ; X64-NOPIC-MCM-NEXT:    callq __sigsetjmp
@@ -428,9 +428,9 @@ define i32 @test_call_setjmp(i32 *%ptr) nounwind {
 ; X64-PIC-NEXT:    cmpq %rcx, %rbp
 ; X64-PIC-NEXT:    cmovneq %r15, %rax
 ; X64-PIC-NEXT:    movl (%rbx), %ebp
-; X64-PIC-NEXT:    movl $42, %esi
 ; X64-PIC-NEXT:    shlq $47, %rax
 ; X64-PIC-NEXT:    movq %r14, %rdi
+; X64-PIC-NEXT:    movl $42, %esi
 ; X64-PIC-NEXT:    orq %rax, %rsp
 ; X64-PIC-NEXT:    leaq .Lslh_ret_addr5(%rip), %r12
 ; X64-PIC-NEXT:    callq sigsetjmp@PLT
@@ -441,10 +441,10 @@ define i32 @test_call_setjmp(i32 *%ptr) nounwind {
 ; X64-PIC-NEXT:    cmpq %rcx, %r12
 ; X64-PIC-NEXT:    cmovneq %r15, %rax
 ; X64-PIC-NEXT:    addl (%rbx), %ebp
-; X64-PIC-NEXT:    movl $42, %edx
 ; X64-PIC-NEXT:    shlq $47, %rax
 ; X64-PIC-NEXT:    movq %r14, %rdi
 ; X64-PIC-NEXT:    movq %r14, %rsi
+; X64-PIC-NEXT:    movl $42, %edx
 ; X64-PIC-NEXT:    orq %rax, %rsp
 ; X64-PIC-NEXT:    leaq .Lslh_ret_addr6(%rip), %r14
 ; X64-PIC-NEXT:    callq __sigsetjmp@PLT
diff --git a/test/CodeGen/X86/speculative-load-hardening-indirect.ll b/test/CodeGen/X86/speculative-load-hardening-indirect.ll
index 0d04a85d36790f78611a0ecd52fd2ec5087d5adc..0e6852125fe5c66785b794e0a976837c78559827 100644
--- a/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ b/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections | FileCheck %s --check-prefix=X64
 ; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39451.
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -relocation-model pic -data-sections -verify-machineinstrs=0 | FileCheck %s --check-prefix=X64-PIC
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections -mattr=+retpoline -verify-machineinstrs=0 | FileCheck %s --check-prefix=X64-RETPOLINE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections -mattr=+retpoline | FileCheck %s --check-prefix=X64-RETPOLINE
 ;
 ; FIXME: Add support for 32-bit.
 
@@ -248,27 +248,36 @@ define i32 @test_indirectbr(i8** %ptr) nounwind {
 ; X64-NEXT:    movq (%rdi), %rdx
 ; X64-NEXT:    orq %rcx, %rdx
 ; X64-NEXT:    jmpq *%rdx
-; X64-NEXT:  .LBB4_1: # %bb0
+; X64-NEXT:  .LBB4_1: # Block address taken
+; X64-NEXT:    # %bb0
 ; X64-NEXT:    cmpq $.LBB4_1, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $2, %eax
-; X64-NEXT:    jmp .LBB4_2
-; X64-NEXT:  .LBB4_4: # %bb2
-; X64-NEXT:    cmpq $.LBB4_4, %rdx
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB4_3: # Block address taken
+; X64-NEXT:    # %bb2
+; X64-NEXT:    cmpq $.LBB4_3, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $13, %eax
-; X64-NEXT:    jmp .LBB4_2
-; X64-NEXT:  .LBB4_5: # %bb3
-; X64-NEXT:    cmpq $.LBB4_5, %rdx
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB4_4: # Block address taken
+; X64-NEXT:    # %bb3
+; X64-NEXT:    cmpq $.LBB4_4, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $42, %eax
-; X64-NEXT:    jmp .LBB4_2
-; X64-NEXT:  .LBB4_3: # %bb1
-; X64-NEXT:    cmpq $.LBB4_3, %rdx
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB4_2: # Block address taken
+; X64-NEXT:    # %bb1
+; X64-NEXT:    cmpq $.LBB4_2, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
-; X64-NEXT:    movl $7, %eax
-; X64-NEXT:  .LBB4_2: # %bb0
 ; X64-NEXT:    shlq $47, %rcx
+; X64-NEXT:    movl $7, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ;
@@ -280,31 +289,40 @@ define i32 @test_indirectbr(i8** %ptr) nounwind {
 ; X64-PIC-NEXT:    movq (%rdi), %rdx
 ; X64-PIC-NEXT:    orq %rcx, %rdx
 ; X64-PIC-NEXT:    jmpq *%rdx
-; X64-PIC-NEXT:  .LBB4_1: # %bb0
+; X64-PIC-NEXT:  .LBB4_1: # Block address taken
+; X64-PIC-NEXT:    # %bb0
 ; X64-PIC-NEXT:    leaq .LBB4_1(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $2, %eax
-; X64-PIC-NEXT:    jmp .LBB4_2
-; X64-PIC-NEXT:  .LBB4_4: # %bb2
-; X64-PIC-NEXT:    leaq .LBB4_4(%rip), %rsi
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB4_3: # Block address taken
+; X64-PIC-NEXT:    # %bb2
+; X64-PIC-NEXT:    leaq .LBB4_3(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $13, %eax
-; X64-PIC-NEXT:    jmp .LBB4_2
-; X64-PIC-NEXT:  .LBB4_5: # %bb3
-; X64-PIC-NEXT:    leaq .LBB4_5(%rip), %rsi
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB4_4: # Block address taken
+; X64-PIC-NEXT:    # %bb3
+; X64-PIC-NEXT:    leaq .LBB4_4(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $42, %eax
-; X64-PIC-NEXT:    jmp .LBB4_2
-; X64-PIC-NEXT:  .LBB4_3: # %bb1
-; X64-PIC-NEXT:    leaq .LBB4_3(%rip), %rsi
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB4_2: # Block address taken
+; X64-PIC-NEXT:    # %bb1
+; X64-PIC-NEXT:    leaq .LBB4_2(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
-; X64-PIC-NEXT:    movl $7, %eax
-; X64-PIC-NEXT:  .LBB4_2: # %bb0
 ; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl $7, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ;
@@ -341,27 +359,32 @@ define i32 @test_indirectbr_global(i32 %idx) nounwind {
 ; X64-NEXT:  .LBB5_1: # %bb0
 ; X64-NEXT:    cmpq $.LBB5_1, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $2, %eax
-; X64-NEXT:    jmp .LBB5_2
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp1: # Block address taken
-; X64-NEXT:  .LBB5_4: # %bb2
-; X64-NEXT:    cmpq $.LBB5_4, %rdx
+; X64-NEXT:  .LBB5_3: # %bb2
+; X64-NEXT:    cmpq $.LBB5_3, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $13, %eax
-; X64-NEXT:    jmp .LBB5_2
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp2: # Block address taken
-; X64-NEXT:  .LBB5_5: # %bb3
-; X64-NEXT:    cmpq $.LBB5_5, %rdx
+; X64-NEXT:  .LBB5_4: # %bb3
+; X64-NEXT:    cmpq $.LBB5_4, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $42, %eax
-; X64-NEXT:    jmp .LBB5_2
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp3: # Block address taken
-; X64-NEXT:  .LBB5_3: # %bb1
-; X64-NEXT:    cmpq $.LBB5_3, %rdx
+; X64-NEXT:  .LBB5_2: # %bb1
+; X64-NEXT:    cmpq $.LBB5_2, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
-; X64-NEXT:    movl $7, %eax
-; X64-NEXT:  .LBB5_2: # %bb0
 ; X64-NEXT:    shlq $47, %rcx
+; X64-NEXT:    movl $7, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ;
@@ -380,30 +403,35 @@ define i32 @test_indirectbr_global(i32 %idx) nounwind {
 ; X64-PIC-NEXT:    leaq .LBB5_1(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $2, %eax
-; X64-PIC-NEXT:    jmp .LBB5_2
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp1: # Block address taken
-; X64-PIC-NEXT:  .LBB5_4: # %bb2
-; X64-PIC-NEXT:    leaq .LBB5_4(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_3: # %bb2
+; X64-PIC-NEXT:    leaq .LBB5_3(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $13, %eax
-; X64-PIC-NEXT:    jmp .LBB5_2
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp2: # Block address taken
-; X64-PIC-NEXT:  .LBB5_5: # %bb3
-; X64-PIC-NEXT:    leaq .LBB5_5(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_4: # %bb3
+; X64-PIC-NEXT:    leaq .LBB5_4(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $42, %eax
-; X64-PIC-NEXT:    jmp .LBB5_2
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp3: # Block address taken
-; X64-PIC-NEXT:  .LBB5_3: # %bb1
-; X64-PIC-NEXT:    leaq .LBB5_3(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_2: # %bb1
+; X64-PIC-NEXT:    leaq .LBB5_2(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
-; X64-PIC-NEXT:    movl $7, %eax
-; X64-PIC-NEXT:  .LBB5_2: # %bb0
 ; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl $7, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ;
@@ -416,36 +444,41 @@ define i32 @test_indirectbr_global(i32 %idx) nounwind {
 ; X64-RETPOLINE-NEXT:    movq global_blockaddrs(,%rdx,8), %rdx
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rdx
 ; X64-RETPOLINE-NEXT:    cmpq $2, %rdx
-; X64-RETPOLINE-NEXT:    je .LBB6_5
+; X64-RETPOLINE-NEXT:    je .LBB6_4
 ; X64-RETPOLINE-NEXT:  # %bb.1: # %entry
 ; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    cmpq $3, %rdx
-; X64-RETPOLINE-NEXT:    je .LBB6_6
+; X64-RETPOLINE-NEXT:    je .LBB6_5
 ; X64-RETPOLINE-NEXT:  # %bb.2: # %entry
 ; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    cmpq $4, %rdx
 ; X64-RETPOLINE-NEXT:    jne .LBB6_3
 ; X64-RETPOLINE-NEXT:  .Ltmp0: # Block address taken
-; X64-RETPOLINE-NEXT:  # %bb.7: # %bb3
+; X64-RETPOLINE-NEXT:  # %bb.6: # %bb3
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    movl $42, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB6_4
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
 ; X64-RETPOLINE-NEXT:  .Ltmp1: # Block address taken
-; X64-RETPOLINE-NEXT:  .LBB6_5: # %bb1
+; X64-RETPOLINE-NEXT:  .LBB6_4: # %bb1
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    movl $7, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB6_4
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
 ; X64-RETPOLINE-NEXT:  .Ltmp2: # Block address taken
-; X64-RETPOLINE-NEXT:  .LBB6_6: # %bb2
+; X64-RETPOLINE-NEXT:  .LBB6_5: # %bb2
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    movl $13, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB6_4
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
 ; X64-RETPOLINE-NEXT:  .Ltmp3: # Block address taken
 ; X64-RETPOLINE-NEXT:  .LBB6_3: # %bb0
 ; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
-; X64-RETPOLINE-NEXT:    movl $2, %eax
-; X64-RETPOLINE-NEXT:  .LBB6_4: # %bb0
 ; X64-RETPOLINE-NEXT:    shlq $47, %rcx
+; X64-RETPOLINE-NEXT:    movl $2, %eax
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
 ; X64-RETPOLINE-NEXT:    retq
 entry:
@@ -482,31 +515,42 @@ define i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-NEXT:    movq .LJTI6_0(,%rdx,8), %rdx
 ; X64-NEXT:    orq %rcx, %rdx
 ; X64-NEXT:    jmpq *%rdx
-; X64-NEXT:  .LBB6_4: # %bb1
-; X64-NEXT:    cmpq $.LBB6_4, %rdx
+; X64-NEXT:  .LBB6_3: # Block address taken
+; X64-NEXT:    # %bb1
+; X64-NEXT:    cmpq $.LBB6_3, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $7, %eax
-; X64-NEXT:    jmp .LBB6_3
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
 ; X64-NEXT:  .LBB6_2: # %bb0
 ; X64-NEXT:    cmovbeq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $2, %eax
-; X64-NEXT:    jmp .LBB6_3
-; X64-NEXT:  .LBB6_5: # %bb2
-; X64-NEXT:    cmpq $.LBB6_5, %rdx
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB6_4: # Block address taken
+; X64-NEXT:    # %bb2
+; X64-NEXT:    cmpq $.LBB6_4, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $13, %eax
-; X64-NEXT:    jmp .LBB6_3
-; X64-NEXT:  .LBB6_6: # %bb3
-; X64-NEXT:    cmpq $.LBB6_6, %rdx
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB6_5: # Block address taken
+; X64-NEXT:    # %bb3
+; X64-NEXT:    cmpq $.LBB6_5, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movl $42, %eax
-; X64-NEXT:    jmp .LBB6_3
-; X64-NEXT:  .LBB6_7: # %bb5
-; X64-NEXT:    cmpq $.LBB6_7, %rdx
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB6_6: # Block address taken
+; X64-NEXT:    # %bb5
+; X64-NEXT:    cmpq $.LBB6_6, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
-; X64-NEXT:    movl $11, %eax
-; X64-NEXT:  .LBB6_3: # %bb0
 ; X64-NEXT:    shlq $47, %rcx
+; X64-NEXT:    movl $11, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ;
@@ -525,35 +569,46 @@ define i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-PIC-NEXT:    addq %rsi, %rdx
 ; X64-PIC-NEXT:    orq %rcx, %rdx
 ; X64-PIC-NEXT:    jmpq *%rdx
-; X64-PIC-NEXT:  .LBB6_4: # %bb1
-; X64-PIC-NEXT:    leaq .LBB6_4(%rip), %rsi
+; X64-PIC-NEXT:  .LBB6_3: # Block address taken
+; X64-PIC-NEXT:    # %bb1
+; X64-PIC-NEXT:    leaq .LBB6_3(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $7, %eax
-; X64-PIC-NEXT:    jmp .LBB6_3
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .LBB6_2: # %bb0
 ; X64-PIC-NEXT:    cmovbeq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $2, %eax
-; X64-PIC-NEXT:    jmp .LBB6_3
-; X64-PIC-NEXT:  .LBB6_5: # %bb2
-; X64-PIC-NEXT:    leaq .LBB6_5(%rip), %rsi
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB6_4: # Block address taken
+; X64-PIC-NEXT:    # %bb2
+; X64-PIC-NEXT:    leaq .LBB6_4(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $13, %eax
-; X64-PIC-NEXT:    jmp .LBB6_3
-; X64-PIC-NEXT:  .LBB6_6: # %bb3
-; X64-PIC-NEXT:    leaq .LBB6_6(%rip), %rsi
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB6_5: # Block address taken
+; X64-PIC-NEXT:    # %bb3
+; X64-PIC-NEXT:    leaq .LBB6_5(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
 ; X64-PIC-NEXT:    movl $42, %eax
-; X64-PIC-NEXT:    jmp .LBB6_3
-; X64-PIC-NEXT:  .LBB6_7: # %bb5
-; X64-PIC-NEXT:    leaq .LBB6_7(%rip), %rsi
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB6_6: # Block address taken
+; X64-PIC-NEXT:    # %bb5
+; X64-PIC-NEXT:    leaq .LBB6_6(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
-; X64-PIC-NEXT:    movl $11, %eax
-; X64-PIC-NEXT:  .LBB6_3: # %bb0
 ; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl $11, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ;
@@ -567,40 +622,47 @@ define i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-RETPOLINE-NEXT:  # %bb.1: # %entry
 ; X64-RETPOLINE-NEXT:    cmovgq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    testl %edi, %edi
-; X64-RETPOLINE-NEXT:    je .LBB7_8
+; X64-RETPOLINE-NEXT:    je .LBB7_7
 ; X64-RETPOLINE-NEXT:  # %bb.2: # %entry
 ; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    cmpl $1, %edi
 ; X64-RETPOLINE-NEXT:    jne .LBB7_6
 ; X64-RETPOLINE-NEXT:  # %bb.3: # %bb2
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    movl $13, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB7_7
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
 ; X64-RETPOLINE-NEXT:  .LBB7_4: # %entry
 ; X64-RETPOLINE-NEXT:    cmovleq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    cmpl $2, %edi
-; X64-RETPOLINE-NEXT:    je .LBB7_9
+; X64-RETPOLINE-NEXT:    je .LBB7_8
 ; X64-RETPOLINE-NEXT:  # %bb.5: # %entry
 ; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    cmpl $3, %edi
 ; X64-RETPOLINE-NEXT:    jne .LBB7_6
-; X64-RETPOLINE-NEXT:  # %bb.10: # %bb5
+; X64-RETPOLINE-NEXT:  # %bb.9: # %bb5
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    movl $11, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB7_7
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
 ; X64-RETPOLINE-NEXT:  .LBB7_6:
 ; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    movl $2, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB7_7
-; X64-RETPOLINE-NEXT:  .LBB7_8: # %bb1
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
+; X64-RETPOLINE-NEXT:  .LBB7_7: # %bb1
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    movl $7, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB7_7
-; X64-RETPOLINE-NEXT:  .LBB7_9: # %bb3
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
+; X64-RETPOLINE-NEXT:  .LBB7_8: # %bb3
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
-; X64-RETPOLINE-NEXT:    movl $42, %eax
-; X64-RETPOLINE-NEXT:  .LBB7_7: # %bb0
 ; X64-RETPOLINE-NEXT:    shlq $47, %rcx
+; X64-RETPOLINE-NEXT:    movl $42, %eax
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
 ; X64-RETPOLINE-NEXT:    retq
 entry:
@@ -651,25 +713,29 @@ define i32 @test_switch_jumptable_fallthrough(i32 %idx, i32* %a.ptr, i32* %b.ptr
 ; X64-NEXT:    movl (%rsi), %eax
 ; X64-NEXT:    orl %r9d, %eax
 ; X64-NEXT:    movq $.LBB7_3, %rsi
-; X64-NEXT:  .LBB7_3: # %bb1
+; X64-NEXT:  .LBB7_3: # Block address taken
+; X64-NEXT:    # %bb1
 ; X64-NEXT:    cmpq $.LBB7_3, %rsi
 ; X64-NEXT:    cmovneq %r10, %r9
 ; X64-NEXT:    addl (%rdx), %eax
 ; X64-NEXT:    orl %r9d, %eax
 ; X64-NEXT:    movq $.LBB7_4, %rsi
-; X64-NEXT:  .LBB7_4: # %bb2
+; X64-NEXT:  .LBB7_4: # Block address taken
+; X64-NEXT:    # %bb2
 ; X64-NEXT:    cmpq $.LBB7_4, %rsi
 ; X64-NEXT:    cmovneq %r10, %r9
 ; X64-NEXT:    addl (%rcx), %eax
 ; X64-NEXT:    orl %r9d, %eax
 ; X64-NEXT:    movq $.LBB7_5, %rsi
-; X64-NEXT:  .LBB7_5: # %bb3
+; X64-NEXT:  .LBB7_5: # Block address taken
+; X64-NEXT:    # %bb3
 ; X64-NEXT:    cmpq $.LBB7_5, %rsi
 ; X64-NEXT:    cmovneq %r10, %r9
 ; X64-NEXT:    addl (%r8), %eax
 ; X64-NEXT:    orl %r9d, %eax
 ; X64-NEXT:    movq $.LBB7_6, %rsi
-; X64-NEXT:  .LBB7_6: # %bb4
+; X64-NEXT:  .LBB7_6: # Block address taken
+; X64-NEXT:    # %bb4
 ; X64-NEXT:    cmpq $.LBB7_6, %rsi
 ; X64-NEXT:    cmovneq %r10, %r9
 ; X64-NEXT:    shlq $47, %r9
@@ -697,28 +763,32 @@ define i32 @test_switch_jumptable_fallthrough(i32 %idx, i32* %a.ptr, i32* %b.ptr
 ; X64-PIC-NEXT:    movl (%rsi), %eax
 ; X64-PIC-NEXT:    orl %r9d, %eax
 ; X64-PIC-NEXT:    leaq .LBB7_3(%rip), %rsi
-; X64-PIC-NEXT:  .LBB7_3: # %bb1
+; X64-PIC-NEXT:  .LBB7_3: # Block address taken
+; X64-PIC-NEXT:    # %bb1
 ; X64-PIC-NEXT:    leaq .LBB7_3(%rip), %rdi
 ; X64-PIC-NEXT:    cmpq %rdi, %rsi
 ; X64-PIC-NEXT:    cmovneq %r10, %r9
 ; X64-PIC-NEXT:    addl (%rdx), %eax
 ; X64-PIC-NEXT:    orl %r9d, %eax
 ; X64-PIC-NEXT:    leaq .LBB7_4(%rip), %rsi
-; X64-PIC-NEXT:  .LBB7_4: # %bb2
+; X64-PIC-NEXT:  .LBB7_4: # Block address taken
+; X64-PIC-NEXT:    # %bb2
 ; X64-PIC-NEXT:    leaq .LBB7_4(%rip), %rdx
 ; X64-PIC-NEXT:    cmpq %rdx, %rsi
 ; X64-PIC-NEXT:    cmovneq %r10, %r9
 ; X64-PIC-NEXT:    addl (%rcx), %eax
 ; X64-PIC-NEXT:    orl %r9d, %eax
 ; X64-PIC-NEXT:    leaq .LBB7_5(%rip), %rsi
-; X64-PIC-NEXT:  .LBB7_5: # %bb3
+; X64-PIC-NEXT:  .LBB7_5: # Block address taken
+; X64-PIC-NEXT:    # %bb3
 ; X64-PIC-NEXT:    leaq .LBB7_5(%rip), %rcx
 ; X64-PIC-NEXT:    cmpq %rcx, %rsi
 ; X64-PIC-NEXT:    cmovneq %r10, %r9
 ; X64-PIC-NEXT:    addl (%r8), %eax
 ; X64-PIC-NEXT:    orl %r9d, %eax
 ; X64-PIC-NEXT:    leaq .LBB7_6(%rip), %rsi
-; X64-PIC-NEXT:  .LBB7_6: # %bb4
+; X64-PIC-NEXT:  .LBB7_6: # Block address taken
+; X64-PIC-NEXT:    # %bb4
 ; X64-PIC-NEXT:    leaq .LBB7_6(%rip), %rcx
 ; X64-PIC-NEXT:    cmpq %rcx, %rsi
 ; X64-PIC-NEXT:    cmovneq %r10, %r9
diff --git a/test/CodeGen/X86/speculative-load-hardening.ll b/test/CodeGen/X86/speculative-load-hardening.ll
index f5949dbf467f669ae4b16cd959e2e3de177c769a..32ad43634fc740b985d3517bf098c1215dd6b8f9 100644
--- a/test/CodeGen/X86/speculative-load-hardening.ll
+++ b/test/CodeGen/X86/speculative-load-hardening.ll
@@ -538,10 +538,10 @@ define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) speculative_load_hard
 ; X64-NEXT:    cmovneq %r15, %rcx
 ; X64-NEXT:    movl %ebp, (%rax)
 ; X64-NEXT:  .Ltmp0:
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    callq __cxa_throw
 ; X64-NEXT:  .Lslh_ret_addr5:
@@ -612,9 +612,9 @@ define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) speculative_load_hard
 ; X64-LFENCE-NEXT:    callq __cxa_allocate_exception
 ; X64-LFENCE-NEXT:    movl %ebp, (%rax)
 ; X64-LFENCE-NEXT:  .Ltmp0:
+; X64-LFENCE-NEXT:    movq %rax, %rdi
 ; X64-LFENCE-NEXT:    xorl %esi, %esi
 ; X64-LFENCE-NEXT:    xorl %edx, %edx
-; X64-LFENCE-NEXT:    movq %rax, %rdi
 ; X64-LFENCE-NEXT:    callq __cxa_throw
 ; X64-LFENCE-NEXT:  .Ltmp1:
 ; X64-LFENCE-NEXT:  .LBB4_2: # %exit
diff --git a/test/CodeGen/X86/split-extend-vector-inreg.ll b/test/CodeGen/X86/split-extend-vector-inreg.ll
index 26f3e38cd297aac34be77df66a5c9f1f75adbdc3..7f0e2cfa44b02ee0998822a1bec650d3a045fdaa 100644
--- a/test/CodeGen/X86/split-extend-vector-inreg.ll
+++ b/test/CodeGen/X86/split-extend-vector-inreg.ll
@@ -1,37 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64
 
 define <4 x i64> @autogen_SD88863() {
-; X32-LABEL: autogen_SD88863:
-; X32:       # %bb.0: # %BB
-; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; X32-NEXT:    movb $1, %al
-; X32-NEXT:    .p2align 4, 0x90
-; X32-NEXT:  .LBB0_1: # %CF
-; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    testb %al, %al
-; X32-NEXT:    jne .LBB0_1
-; X32-NEXT:  # %bb.2: # %CF240
-; X32-NEXT:    retl
-;
-; X64-LABEL: autogen_SD88863:
-; X64:       # %bb.0: # %BB
-; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X64-NEXT:    movb $1, %al
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB0_1: # %CF
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    jne .LBB0_1
-; X64-NEXT:  # %bb.2: # %CF240
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; X64-NEXT:    retq
+; CHECK-LABEL: autogen_SD88863:
+; CHECK:       # %bb.0: # %BB
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %CF
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %CF240
+; CHECK-NEXT:    ret{{[l|q]}}
 BB:
   %I26 = insertelement <4 x i64> undef, i64 undef, i32 2
   br label %CF
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index aca2ec8c5f59e2d10a01ee9414c596e78a4d3639..40cf517de4f87a7c683e4aa0d0a3742f5e5b2e62 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -2230,13 +2230,13 @@ define void @test_ldmxcsr(i32 %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_ldmxcsr:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:0.50]
 ; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_ldmxcsr:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
 ; BDVER2-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -5290,12 +5290,12 @@ define void @test_sfence() {
 ;
 ; BDVER2-SSE-LABEL: test_sfence:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    sfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sfence # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_sfence:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    sfence # sched: [1:0.50]
+; BDVER2-NEXT:    sfence # sched: [1:1.00]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_sfence:
@@ -5847,13 +5847,13 @@ define i32 @test_stmxcsr() {
 ;
 ; BDVER2-SSE-LABEL: test_stmxcsr:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    stmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    stmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
 ; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_stmxcsr:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
 ; BDVER2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index a833dcf0735058d04c1ed32d68c7998b4a3a2b46..660ba8ebb791d0f511cef4f68c31ec2cfb5d7437 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -3821,12 +3821,12 @@ define void @test_lfence() {
 ;
 ; BDVER2-SSE-LABEL: test_lfence:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    lfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    lfence # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_lfence:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    lfence # sched: [1:0.50]
+; BDVER2-NEXT:    lfence # sched: [1:1.00]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_lfence:
@@ -3927,12 +3927,12 @@ define void @test_mfence() {
 ;
 ; BDVER2-SSE-LABEL: test_mfence:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    mfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    mfence # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_mfence:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    mfence # sched: [1:0.50]
+; BDVER2-NEXT:    mfence # sched: [1:1.00]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mfence:
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index be019aff5142fb419780d5ff1de4453013ae33c6..943e898d56997354c9bfa71c9860dc1a2f1f1625 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -574,10 +574,8 @@ define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocap
 ; X86-AVX-LABEL: test16:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovaps 96(%eax), %ymm0
-; X86-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX-NEXT:    vzeroupper
+; X86-AVX-NEXT:    vmovaps 96(%eax), %xmm0
+; X86-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: test16:
@@ -588,10 +586,8 @@ define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocap
 ;
 ; X64-AVX-LABEL: test16:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovaps 96(%rdi), %ymm0
-; X64-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    vmovaps 96(%rdi), %xmm0
+; X64-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X64-AVX-NEXT:    retq
   %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
   %i6 = load <4 x double>, <4 x double>* %i5, align 32
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86.ll b/test/CodeGen/X86/sse42-intrinsics-x86.ll
index 120e3c0d1ef9f43794e8d44eaedd026bfca165a4..2d8d1fb75179ab135ec27e86d091a13c8357ab55 100644
--- a/test/CodeGen/X86/sse42-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -99,9 +99,9 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-SSE-LABEL: test_x86_sse42_pcmpestria128:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X86-SSE-NEXT:    seta %bl ## encoding: [0x0f,0x97,0xc3]
 ; X86-SSE-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -111,9 +111,9 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-AVX-LABEL: test_x86_sse42_pcmpestria128:
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X86-AVX-NEXT:    seta %bl ## encoding: [0x0f,0x97,0xc3]
 ; X86-AVX-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -122,9 +122,9 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-SSE-LABEL: test_x86_sse42_pcmpestria128:
 ; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X64-SSE-NEXT:    seta %sil ## encoding: [0x40,0x0f,0x97,0xc6]
 ; X64-SSE-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -132,9 +132,9 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-AVX-LABEL: test_x86_sse42_pcmpestria128:
 ; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X64-AVX-NEXT:    seta %sil ## encoding: [0x40,0x0f,0x97,0xc6]
 ; X64-AVX-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -149,9 +149,9 @@ define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-SSE-LABEL: test_x86_sse42_pcmpestric128:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X86-SSE-NEXT:    setb %bl ## encoding: [0x0f,0x92,0xc3]
 ; X86-SSE-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -161,9 +161,9 @@ define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-AVX-LABEL: test_x86_sse42_pcmpestric128:
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X86-AVX-NEXT:    setb %bl ## encoding: [0x0f,0x92,0xc3]
 ; X86-AVX-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -172,9 +172,9 @@ define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-SSE-LABEL: test_x86_sse42_pcmpestric128:
 ; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X64-SSE-NEXT:    setb %sil ## encoding: [0x40,0x0f,0x92,0xc6]
 ; X64-SSE-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -182,9 +182,9 @@ define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-AVX-LABEL: test_x86_sse42_pcmpestric128:
 ; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X64-AVX-NEXT:    setb %sil ## encoding: [0x40,0x0f,0x92,0xc6]
 ; X64-AVX-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -199,9 +199,9 @@ define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-SSE-LABEL: test_x86_sse42_pcmpestrio128:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X86-SSE-NEXT:    seto %bl ## encoding: [0x0f,0x90,0xc3]
 ; X86-SSE-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -211,9 +211,9 @@ define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-AVX-LABEL: test_x86_sse42_pcmpestrio128:
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X86-AVX-NEXT:    seto %bl ## encoding: [0x0f,0x90,0xc3]
 ; X86-AVX-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -222,9 +222,9 @@ define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-SSE-LABEL: test_x86_sse42_pcmpestrio128:
 ; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X64-SSE-NEXT:    seto %sil ## encoding: [0x40,0x0f,0x90,0xc6]
 ; X64-SSE-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -232,9 +232,9 @@ define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-AVX-LABEL: test_x86_sse42_pcmpestrio128:
 ; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X64-AVX-NEXT:    seto %sil ## encoding: [0x40,0x0f,0x90,0xc6]
 ; X64-AVX-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -249,9 +249,9 @@ define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-SSE-LABEL: test_x86_sse42_pcmpestris128:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X86-SSE-NEXT:    sets %bl ## encoding: [0x0f,0x98,0xc3]
 ; X86-SSE-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -261,9 +261,9 @@ define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-AVX-LABEL: test_x86_sse42_pcmpestris128:
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X86-AVX-NEXT:    sets %bl ## encoding: [0x0f,0x98,0xc3]
 ; X86-AVX-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -272,9 +272,9 @@ define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-SSE-LABEL: test_x86_sse42_pcmpestris128:
 ; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X64-SSE-NEXT:    sets %sil ## encoding: [0x40,0x0f,0x98,0xc6]
 ; X64-SSE-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -282,9 +282,9 @@ define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-AVX-LABEL: test_x86_sse42_pcmpestris128:
 ; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X64-AVX-NEXT:    sets %sil ## encoding: [0x40,0x0f,0x98,0xc6]
 ; X64-AVX-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -299,9 +299,9 @@ define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-SSE-LABEL: test_x86_sse42_pcmpestriz128:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-SSE-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X86-SSE-NEXT:    sete %bl ## encoding: [0x0f,0x94,0xc3]
 ; X86-SSE-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -311,9 +311,9 @@ define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ; X86-AVX-LABEL: test_x86_sse42_pcmpestriz128:
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    pushl %ebx ## encoding: [0x53]
+; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X86-AVX-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; X86-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X86-AVX-NEXT:    sete %bl ## encoding: [0x0f,0x94,0xc3]
 ; X86-AVX-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
@@ -322,9 +322,9 @@ define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-SSE-LABEL: test_x86_sse42_pcmpestriz128:
 ; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-SSE-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
 ; X64-SSE-NEXT:    sete %sil ## encoding: [0x40,0x0f,0x94,0xc6]
 ; X64-SSE-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
@@ -332,9 +332,9 @@ define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 ;
 ; X64-AVX-LABEL: test_x86_sse42_pcmpestriz128:
 ; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X64-AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; X64-AVX-NEXT:    xorl %esi, %esi ## encoding: [0x31,0xf6]
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
 ; X64-AVX-NEXT:    sete %sil ## encoding: [0x40,0x0f,0x94,0xc6]
 ; X64-AVX-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
diff --git a/test/CodeGen/X86/ssub_sat_vec.ll b/test/CodeGen/X86/ssub_sat_vec.ll
new file mode 100644
index 0000000000000000000000000000000000000000..28a8ee27a456b246239934dbcfadb2a5912d16c8
--- /dev/null
+++ b/test/CodeGen/X86/ssub_sat_vec.ll
@@ -0,0 +1,1380 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8>, <12 x i8>)
+declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>)
+declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
+
+declare <1 x i16> @llvm.ssub.sat.v1i16(<1 x i16>, <1 x i16>)
+declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>)
+declare <12 x i16> @llvm.ssub.sat.v12i16(<12 x i16>, <12 x i16>)
+declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>)
+
+declare <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1>, <16 x i1>)
+declare <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4>, <16 x i4>)
+
+declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i24> @llvm.ssub.sat.v4i24(<4 x i24>, <4 x i24>)
+declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>)
+
+; Legal types, depending on architecture.
+
+define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE-LABEL: v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubsb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
+  ret <16 x i8> %z
+}
+
+define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
+; SSE-LABEL: v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubsb %xmm2, %xmm0
+; SSE-NEXT:    psubsb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsubsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
+  ret <32 x i8> %z
+}
+
+define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
+; SSE-LABEL: v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubsb %xmm4, %xmm0
+; SSE-NEXT:    psubsb %xmm5, %xmm1
+; SSE-NEXT:    psubsb %xmm6, %xmm2
+; SSE-NEXT:    psubsb %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsubsb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsubsb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsubsb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsubsb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubsb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubsb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubsb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
+  ret <64 x i8> %z
+}
+
+define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE-LABEL: v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubsw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
+  ret <8 x i16> %z
+}
+
+define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
+; SSE-LABEL: v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubsw %xmm2, %xmm0
+; SSE-NEXT:    psubsw %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsubsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
+  ret <16 x i16> %z
+}
+
+define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
+; SSE-LABEL: v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubsw %xmm4, %xmm0
+; SSE-NEXT:    psubsw %xmm5, %xmm1
+; SSE-NEXT:    psubsw %xmm6, %xmm2
+; SSE-NEXT:    psubsw %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsubsw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsubsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsubsw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsubsw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubsw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
+  ret <32 x i16> %z
+}
+
+; Too narrow vectors, legalized by widening.
+
+define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
+; SSE-LABEL: v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    psubsb %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmovwb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <8 x i8>, <8 x i8>* %px
+  %y = load <8 x i8>, <8 x i8>* %py
+  %z = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %x, <8 x i8> %y)
+  store <8 x i8> %z, <8 x i8>* %pz
+  ret void
+}
+
+define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
+; SSE-LABEL: v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    psubsb %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT:    vpmovdb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <4 x i8>, <4 x i8>* %px
+  %y = load <4 x i8>, <4 x i8>* %py
+  %z = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
+  store <4 x i8> %z, <4 x i8>* %pz
+  ret void
+}
+
+define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
+; SSE2-LABEL: v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl (%rsi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    psubsb %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rdx)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v2i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movzwl (%rsi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    psubsb %xmm1, %xmm0
+; SSSE3-NEXT:    movd %xmm0, %eax
+; SSSE3-NEXT:    movw %ax, (%rdx)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    movzwl (%rsi), %eax
+; SSE41-NEXT:    movd %eax, %xmm1
+; SSE41-NEXT:    psubsb %xmm1, %xmm0
+; SSE41-NEXT:    pextrw $0, %xmm0, (%rdx)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    movzwl (%rsi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl (%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    movzwl (%rsi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movzwl (%rdi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    movzwl (%rsi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm1
+; AVX512-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmovqb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <2 x i8>, <2 x i8>* %px
+  %y = load <2 x i8>, <2 x i8>* %py
+  %z = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %x, <2 x i8> %y)
+  store <2 x i8> %z, <2 x i8>* %pz
+  ret void
+}
+
+define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
+; SSE-LABEL: v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    psubsw %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpmovdw %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <4 x i16>, <4 x i16>* %px
+  %y = load <4 x i16>, <4 x i16>* %py
+  %z = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %x, <4 x i16> %y)
+  store <4 x i16> %z, <4 x i16>* %pz
+  ret void
+}
+
+define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
+; SSE-LABEL: v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    psubsw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vpmovqw %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <2 x i16>, <2 x i16>* %px
+  %y = load <2 x i16>, <2 x i16>* %py
+  %z = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
+  store <2 x i16> %z, <2 x i16>* %pz
+  ret void
+}
+
+define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
+; SSE-LABEL: v12i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubsb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v12i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
+  ret <12 x i8> %z
+}
+
+define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind {
+; SSE-LABEL: v12i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm0
+; SSE-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE-NEXT:    psubsw (%rsi), %xmm0
+; SSE-NEXT:    psubsw 16(%rsi), %xmm1
+; SSE-NEXT:    movq %xmm1, 16(%rdx)
+; SSE-NEXT:    movdqa %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v12i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vpsubsw (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vpsubsw 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v12i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpsubsw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v12i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-NEXT:    vpsubsw (%rsi), %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x = load <12 x i16>, <12 x i16>* %px
+  %y = load <12 x i16>, <12 x i16>* %py
+  %z = call <12 x i16> @llvm.ssub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
+  store <12 x i16> %z, <12 x i16>* %pz
+  ret void
+}
+
+; Scalarization
+
+define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
+; SSE-LABEL: v1i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movb (%rdi), %cl
+; SSE-NEXT:    movb (%rsi), %dil
+; SSE-NEXT:    movl %ecx, %eax
+; SSE-NEXT:    subb %dil, %al
+; SSE-NEXT:    setns %sil
+; SSE-NEXT:    subb %dil, %cl
+; SSE-NEXT:    jno .LBB13_2
+; SSE-NEXT:  # %bb.1:
+; SSE-NEXT:    addb $127, %sil
+; SSE-NEXT:    movl %esi, %ecx
+; SSE-NEXT:  .LBB13_2:
+; SSE-NEXT:    movb %cl, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v1i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movb (%rdi), %cl
+; AVX-NEXT:    movb (%rsi), %dil
+; AVX-NEXT:    movl %ecx, %eax
+; AVX-NEXT:    subb %dil, %al
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    subb %dil, %cl
+; AVX-NEXT:    jno .LBB13_2
+; AVX-NEXT:  # %bb.1:
+; AVX-NEXT:    addb $127, %sil
+; AVX-NEXT:    movl %esi, %ecx
+; AVX-NEXT:  .LBB13_2:
+; AVX-NEXT:    movb %cl, (%rdx)
+; AVX-NEXT:    retq
+  %x = load <1 x i8>, <1 x i8>* %px
+  %y = load <1 x i8>, <1 x i8>* %py
+  %z = call <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8> %x, <1 x i8> %y)
+  store <1 x i8> %z, <1 x i8>* %pz
+  ret void
+}
+
+define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
+; SSE-LABEL: v1i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movzwl (%rdi), %eax
+; SSE-NEXT:    movzwl (%rsi), %ecx
+; SSE-NEXT:    xorl %esi, %esi
+; SSE-NEXT:    movl %eax, %edi
+; SSE-NEXT:    subw %cx, %di
+; SSE-NEXT:    setns %sil
+; SSE-NEXT:    addl $32767, %esi # imm = 0x7FFF
+; SSE-NEXT:    subw %cx, %ax
+; SSE-NEXT:    cmovol %esi, %eax
+; SSE-NEXT:    movw %ax, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v1i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    movzwl (%rsi), %ecx
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %eax, %edi
+; AVX-NEXT:    subw %cx, %di
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $32767, %esi # imm = 0x7FFF
+; AVX-NEXT:    subw %cx, %ax
+; AVX-NEXT:    cmovol %esi, %eax
+; AVX-NEXT:    movw %ax, (%rdx)
+; AVX-NEXT:    retq
+  %x = load <1 x i16>, <1 x i16>* %px
+  %y = load <1 x i16>, <1 x i16>* %py
+  %z = call <1 x i16> @llvm.ssub.sat.v1i16(<1 x i16> %x, <1 x i16> %y)
+  store <1 x i16> %z, <1 x i16>* %pz
+  ret void
+}
+
+; Promotion
+
+define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
+; SSE-LABEL: v16i4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $4, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psllw $4, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psubsb %xmm1, %xmm0
+; SSE-NEXT:    psrlw $4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v16i4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
+  ret <16 x i4> %z
+}
+
+define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
+; SSE-LABEL: v16i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $7, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psubsb %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v16i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovb2m %xmm0, %k0
+; AVX512-NEXT:    vpsllw $7, %xmm1, %xmm0
+; AVX512-NEXT:    vpmovb2m %xmm0, %k1
+; AVX512-NEXT:    kandnw %k0, %k1, %k0
+; AVX512-NEXT:    vpmovm2b %k0, %xmm0
+; AVX512-NEXT:    retq
+  %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
+  ret <16 x i1> %z
+}
+
+; Expanded
+
+define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; SSE2-LABEL: v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %r8d
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    movl %r8d, %esi
+; SSE2-NEXT:    subl %ecx, %esi
+; SSE2-NEXT:    setns %dl
+; SSE2-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSE2-NEXT:    subl %ecx, %r8d
+; SSE2-NEXT:    cmovol %edx, %r8d
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    xorl %esi, %esi
+; SSE2-NEXT:    movl %ecx, %edi
+; SSE2-NEXT:    subl %edx, %edi
+; SSE2-NEXT:    setns %sil
+; SSE2-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    subl %edx, %ecx
+; SSE2-NEXT:    cmovol %esi, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %edx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movl %eax, %esi
+; SSE2-NEXT:    subl %edx, %esi
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    subl %edx, %eax
+; SSE2-NEXT:    cmovol %edi, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %r9d
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movd %xmm0, %edx
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movl %edx, %esi
+; SSE2-NEXT:    subl %r9d, %esi
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    subl %r9d, %edx
+; SSE2-NEXT:    cmovol %edi, %edx
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movd %r8d, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %r8d
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    movl %r8d, %esi
+; SSSE3-NEXT:    subl %ecx, %esi
+; SSSE3-NEXT:    setns %dl
+; SSSE3-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    subl %ecx, %r8d
+; SSSE3-NEXT:    cmovol %edx, %r8d
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movd %xmm0, %ecx
+; SSSE3-NEXT:    xorl %esi, %esi
+; SSSE3-NEXT:    movl %ecx, %edi
+; SSSE3-NEXT:    subl %edx, %edi
+; SSSE3-NEXT:    setns %sil
+; SSSE3-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    subl %edx, %ecx
+; SSSE3-NEXT:    cmovol %esi, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %edx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movl %eax, %esi
+; SSSE3-NEXT:    subl %edx, %esi
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    subl %edx, %eax
+; SSSE3-NEXT:    cmovol %edi, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %r9d
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm0, %edx
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movl %edx, %esi
+; SSSE3-NEXT:    subl %r9d, %esi
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    subl %r9d, %edx
+; SSSE3-NEXT:    cmovol %edi, %edx
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movd %r8d, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
+; SSE41-NEXT:    pextrd $3, %xmm0, %r8d
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    movl %r8d, %esi
+; SSE41-NEXT:    subl %ecx, %esi
+; SSE41-NEXT:    setns %dl
+; SSE41-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSE41-NEXT:    subl %ecx, %r8d
+; SSE41-NEXT:    cmovol %edx, %r8d
+; SSE41-NEXT:    pextrd $2, %xmm1, %edx
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    xorl %esi, %esi
+; SSE41-NEXT:    movl %ecx, %edi
+; SSE41-NEXT:    subl %edx, %edi
+; SSE41-NEXT:    setns %sil
+; SSE41-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    subl %edx, %ecx
+; SSE41-NEXT:    cmovol %esi, %ecx
+; SSE41-NEXT:    movd %xmm1, %edx
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movl %eax, %esi
+; SSE41-NEXT:    subl %edx, %esi
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    subl %edx, %eax
+; SSE41-NEXT:    cmovol %edi, %eax
+; SSE41-NEXT:    pextrd $1, %xmm1, %r9d
+; SSE41-NEXT:    pextrd $1, %xmm0, %edx
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movl %edx, %esi
+; SSE41-NEXT:    subl %r9d, %esi
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    subl %r9d, %edx
+; SSE41-NEXT:    cmovol %edi, %edx
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX-NEXT:    vpextrd $3, %xmm0, %r9d
+; AVX-NEXT:    xorl %edx, %edx
+; AVX-NEXT:    movl %r9d, %esi
+; AVX-NEXT:    subl %ecx, %esi
+; AVX-NEXT:    setns %dl
+; AVX-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; AVX-NEXT:    subl %ecx, %r9d
+; AVX-NEXT:    cmovol %edx, %r9d
+; AVX-NEXT:    vpextrd $2, %xmm1, %edx
+; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %ecx, %edi
+; AVX-NEXT:    subl %edx, %edi
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; AVX-NEXT:    subl %edx, %ecx
+; AVX-NEXT:    cmovol %esi, %ecx
+; AVX-NEXT:    vmovd %xmm1, %r8d
+; AVX-NEXT:    vmovd %xmm0, %edx
+; AVX-NEXT:    xorl %edi, %edi
+; AVX-NEXT:    movl %edx, %esi
+; AVX-NEXT:    subl %r8d, %esi
+; AVX-NEXT:    setns %dil
+; AVX-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; AVX-NEXT:    subl %r8d, %edx
+; AVX-NEXT:    cmovol %edi, %edx
+; AVX-NEXT:    vpextrd $1, %xmm1, %r8d
+; AVX-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %eax, %edi
+; AVX-NEXT:    subl %r8d, %edi
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; AVX-NEXT:    subl %r8d, %eax
+; AVX-NEXT:    cmovol %esi, %eax
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $3, %r9d, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
+  ret <4 x i32> %z
+}
+
+define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
+; SSE2-LABEL: v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm2, %rcx
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    subq %rax, %rsi
+; SSE2-NEXT:    setns %dl
+; SSE2-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE2-NEXT:    addq %r8, %rdx
+; SSE2-NEXT:    subq %rax, %rcx
+; SSE2-NEXT:    cmovoq %rdx, %rcx
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movq %rsi, %rdx
+; SSE2-NEXT:    subq %rax, %rdx
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addq %r8, %rdi
+; SSE2-NEXT:    subq %rax, %rsi
+; SSE2-NEXT:    cmovoq %rdi, %rsi
+; SSE2-NEXT:    movq %rsi, %xmm1
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    psllq $32, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movq %xmm2, %rax
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movq %xmm2, %rcx
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    movq %rcx, %rsi
+; SSSE3-NEXT:    subq %rax, %rsi
+; SSSE3-NEXT:    setns %dl
+; SSSE3-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; SSSE3-NEXT:    addq %r8, %rdx
+; SSSE3-NEXT:    subq %rax, %rcx
+; SSSE3-NEXT:    cmovoq %rdx, %rcx
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    movq %xmm0, %rsi
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movq %rsi, %rdx
+; SSSE3-NEXT:    subq %rax, %rdx
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addq %r8, %rdi
+; SSSE3-NEXT:    subq %rax, %rsi
+; SSSE3-NEXT:    cmovoq %rdi, %rsi
+; SSSE3-NEXT:    movq %rsi, %xmm1
+; SSSE3-NEXT:    movq %rcx, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    movq %xmm0, %rcx
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    movq %rcx, %rsi
+; SSE41-NEXT:    subq %rax, %rsi
+; SSE41-NEXT:    setns %dl
+; SSE41-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE41-NEXT:    addq %r8, %rdx
+; SSE41-NEXT:    subq %rax, %rcx
+; SSE41-NEXT:    cmovoq %rdx, %rcx
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    pextrq $1, %xmm0, %rsi
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movq %rsi, %rdx
+; SSE41-NEXT:    subq %rax, %rdx
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addq %r8, %rdi
+; SSE41-NEXT:    subq %rax, %rsi
+; SSE41-NEXT:    cmovoq %rdi, %rsi
+; SSE41-NEXT:    movq %rsi, %xmm1
+; SSE41-NEXT:    movq %rcx, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    xorl %edx, %edx
+; AVX1-NEXT:    movq %rcx, %rsi
+; AVX1-NEXT:    subq %rax, %rsi
+; AVX1-NEXT:    setns %dl
+; AVX1-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX1-NEXT:    addq %r8, %rdx
+; AVX1-NEXT:    subq %rax, %rcx
+; AVX1-NEXT:    cmovoq %rdx, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT:    xorl %edi, %edi
+; AVX1-NEXT:    movq %rsi, %rdx
+; AVX1-NEXT:    subq %rax, %rdx
+; AVX1-NEXT:    setns %dil
+; AVX1-NEXT:    addq %r8, %rdi
+; AVX1-NEXT:    subq %rax, %rsi
+; AVX1-NEXT:    cmovoq %rdi, %rsi
+; AVX1-NEXT:    vmovq %rsi, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    movq %rcx, %rsi
+; AVX2-NEXT:    subq %rax, %rsi
+; AVX2-NEXT:    setns %dl
+; AVX2-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX2-NEXT:    addq %r8, %rdx
+; AVX2-NEXT:    subq %rax, %rcx
+; AVX2-NEXT:    cmovoq %rdx, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    movq %rsi, %rdx
+; AVX2-NEXT:    subq %rax, %rdx
+; AVX2-NEXT:    setns %dil
+; AVX2-NEXT:    addq %r8, %rdi
+; AVX2-NEXT:    subq %rax, %rsi
+; AVX2-NEXT:    cmovoq %rdi, %rsi
+; AVX2-NEXT:    vmovq %rsi, %xmm0
+; AVX2-NEXT:    vmovq %rcx, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    xorl %edx, %edx
+; AVX512-NEXT:    movq %rcx, %rsi
+; AVX512-NEXT:    subq %rax, %rsi
+; AVX512-NEXT:    setns %dl
+; AVX512-NEXT:    movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX512-NEXT:    addq %r8, %rdx
+; AVX512-NEXT:    subq %rax, %rcx
+; AVX512-NEXT:    cmovoq %rdx, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT:    xorl %edi, %edi
+; AVX512-NEXT:    movq %rsi, %rdx
+; AVX512-NEXT:    subq %rax, %rdx
+; AVX512-NEXT:    setns %dil
+; AVX512-NEXT:    addq %r8, %rdi
+; AVX512-NEXT:    subq %rax, %rsi
+; AVX512-NEXT:    cmovoq %rdi, %rsi
+; AVX512-NEXT:    vmovq %rsi, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
+  ret <2 x i32> %z
+}
+
+define <4 x i24> @v4i24(<4 x i24> %x, <4 x i24> %y) nounwind {
+; SSE2-LABEL: v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %ecx
+; SSE2-NEXT:    pslld $8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %r8d
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    movl %r8d, %esi
+; SSE2-NEXT:    subl %ecx, %esi
+; SSE2-NEXT:    setns %dl
+; SSE2-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSE2-NEXT:    subl %ecx, %r8d
+; SSE2-NEXT:    cmovol %edx, %r8d
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    xorl %esi, %esi
+; SSE2-NEXT:    movl %ecx, %edi
+; SSE2-NEXT:    subl %edx, %edi
+; SSE2-NEXT:    setns %sil
+; SSE2-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    subl %edx, %ecx
+; SSE2-NEXT:    cmovol %esi, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %edx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movl %eax, %esi
+; SSE2-NEXT:    subl %edx, %esi
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    subl %edx, %eax
+; SSE2-NEXT:    cmovol %edi, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %r9d
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movd %xmm0, %edx
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    movl %edx, %esi
+; SSE2-NEXT:    subl %r9d, %esi
+; SSE2-NEXT:    setns %dil
+; SSE2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE2-NEXT:    subl %r9d, %edx
+; SSE2-NEXT:    cmovol %edi, %edx
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movd %r8d, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    psrad $8, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pslld $8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %ecx
+; SSSE3-NEXT:    pslld $8, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %r8d
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    movl %r8d, %esi
+; SSSE3-NEXT:    subl %ecx, %esi
+; SSSE3-NEXT:    setns %dl
+; SSSE3-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    subl %ecx, %r8d
+; SSSE3-NEXT:    cmovol %edx, %r8d
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movd %xmm0, %ecx
+; SSSE3-NEXT:    xorl %esi, %esi
+; SSSE3-NEXT:    movl %ecx, %edi
+; SSSE3-NEXT:    subl %edx, %edi
+; SSSE3-NEXT:    setns %sil
+; SSSE3-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    subl %edx, %ecx
+; SSSE3-NEXT:    cmovol %esi, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %edx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movl %eax, %esi
+; SSSE3-NEXT:    subl %edx, %esi
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    subl %edx, %eax
+; SSSE3-NEXT:    cmovol %edi, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %r9d
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm0, %edx
+; SSSE3-NEXT:    xorl %edi, %edi
+; SSSE3-NEXT:    movl %edx, %esi
+; SSSE3-NEXT:    subl %r9d, %esi
+; SSSE3-NEXT:    setns %dil
+; SSSE3-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSSE3-NEXT:    subl %r9d, %edx
+; SSSE3-NEXT:    cmovol %edi, %edx
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movd %r8d, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    psrad $8, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $8, %xmm1
+; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
+; SSE41-NEXT:    pslld $8, %xmm0
+; SSE41-NEXT:    pextrd $3, %xmm0, %r8d
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    movl %r8d, %esi
+; SSE41-NEXT:    subl %ecx, %esi
+; SSE41-NEXT:    setns %dl
+; SSE41-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; SSE41-NEXT:    subl %ecx, %r8d
+; SSE41-NEXT:    cmovol %edx, %r8d
+; SSE41-NEXT:    pextrd $2, %xmm1, %edx
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    xorl %esi, %esi
+; SSE41-NEXT:    movl %ecx, %edi
+; SSE41-NEXT:    subl %edx, %edi
+; SSE41-NEXT:    setns %sil
+; SSE41-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    subl %edx, %ecx
+; SSE41-NEXT:    cmovol %esi, %ecx
+; SSE41-NEXT:    movd %xmm1, %edx
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movl %eax, %esi
+; SSE41-NEXT:    subl %edx, %esi
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    subl %edx, %eax
+; SSE41-NEXT:    cmovol %edi, %eax
+; SSE41-NEXT:    pextrd $1, %xmm1, %r9d
+; SSE41-NEXT:    pextrd $1, %xmm0, %edx
+; SSE41-NEXT:    xorl %edi, %edi
+; SSE41-NEXT:    movl %edx, %esi
+; SSE41-NEXT:    subl %r9d, %esi
+; SSE41-NEXT:    setns %dil
+; SSE41-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; SSE41-NEXT:    subl %r9d, %edx
+; SSE41-NEXT:    cmovol %edi, %edx
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT:    psrad $8, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v4i24:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX-NEXT:    vpextrd $3, %xmm0, %r9d
+; AVX-NEXT:    xorl %edx, %edx
+; AVX-NEXT:    movl %r9d, %esi
+; AVX-NEXT:    subl %ecx, %esi
+; AVX-NEXT:    setns %dl
+; AVX-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; AVX-NEXT:    subl %ecx, %r9d
+; AVX-NEXT:    cmovol %edx, %r9d
+; AVX-NEXT:    vpextrd $2, %xmm1, %edx
+; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %ecx, %edi
+; AVX-NEXT:    subl %edx, %edi
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; AVX-NEXT:    subl %edx, %ecx
+; AVX-NEXT:    cmovol %esi, %ecx
+; AVX-NEXT:    vmovd %xmm1, %r8d
+; AVX-NEXT:    vmovd %xmm0, %edx
+; AVX-NEXT:    xorl %edi, %edi
+; AVX-NEXT:    movl %edx, %esi
+; AVX-NEXT:    subl %r8d, %esi
+; AVX-NEXT:    setns %dil
+; AVX-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; AVX-NEXT:    subl %r8d, %edx
+; AVX-NEXT:    cmovol %edi, %edx
+; AVX-NEXT:    vpextrd $1, %xmm1, %r8d
+; AVX-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX-NEXT:    xorl %esi, %esi
+; AVX-NEXT:    movl %eax, %edi
+; AVX-NEXT:    subl %r8d, %edi
+; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; AVX-NEXT:    subl %r8d, %eax
+; AVX-NEXT:    cmovol %esi, %eax
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $3, %r9d, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <4 x i24> @llvm.ssub.sat.v4i24(<4 x i24> %x, <4 x i24> %y)
+  ret <4 x i24> %z
+}
+
+define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
+; SSE-LABEL: v2i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r15
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %r13
+; SSE-NEXT:    pushq %r12
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT:    subq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movq %r8, %r13
+; SSE-NEXT:    sbbq %r14, %r13
+; SSE-NEXT:    movq %r13, %r10
+; SSE-NEXT:    sarq $63, %r10
+; SSE-NEXT:    xorl %edi, %edi
+; SSE-NEXT:    testq %r13, %r13
+; SSE-NEXT:    setns %dil
+; SSE-NEXT:    movabsq $9223372036854775807, %r12 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE-NEXT:    leaq (%rdi,%r12), %r15
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    setns %r8b
+; SSE-NEXT:    cmpb %dil, %r8b
+; SSE-NEXT:    setne %dil
+; SSE-NEXT:    testq %r14, %r14
+; SSE-NEXT:    setns %bl
+; SSE-NEXT:    cmpb %bl, %r8b
+; SSE-NEXT:    setne %bl
+; SSE-NEXT:    testb %dil, %bl
+; SSE-NEXT:    cmoveq %r13, %r15
+; SSE-NEXT:    cmoveq %rcx, %r10
+; SSE-NEXT:    subq %r9, %rsi
+; SSE-NEXT:    movq %rdx, %rdi
+; SSE-NEXT:    sbbq %r11, %rdi
+; SSE-NEXT:    setns %bl
+; SSE-NEXT:    movzbl %bl, %ebx
+; SSE-NEXT:    addq %rbx, %r12
+; SSE-NEXT:    movq %rdi, %rcx
+; SSE-NEXT:    sarq $63, %rcx
+; SSE-NEXT:    testq %r11, %r11
+; SSE-NEXT:    setns %r8b
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    setns %dl
+; SSE-NEXT:    cmpb %r8b, %dl
+; SSE-NEXT:    setne %r8b
+; SSE-NEXT:    cmpb %bl, %dl
+; SSE-NEXT:    setne %dl
+; SSE-NEXT:    testb %dl, %r8b
+; SSE-NEXT:    cmoveq %rsi, %rcx
+; SSE-NEXT:    cmoveq %rdi, %r12
+; SSE-NEXT:    movq %r15, 24(%rax)
+; SSE-NEXT:    movq %r10, 16(%rax)
+; SSE-NEXT:    movq %r12, 8(%rax)
+; SSE-NEXT:    movq %rcx, (%rax)
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r12
+; SSE-NEXT:    popq %r13
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %r15
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v2i128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %r15
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    pushq %r13
+; AVX-NEXT:    pushq %r12
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX-NEXT:    subq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movq %r8, %r13
+; AVX-NEXT:    sbbq %r14, %r13
+; AVX-NEXT:    movq %r13, %r10
+; AVX-NEXT:    sarq $63, %r10
+; AVX-NEXT:    xorl %edi, %edi
+; AVX-NEXT:    testq %r13, %r13
+; AVX-NEXT:    setns %dil
+; AVX-NEXT:    movabsq $9223372036854775807, %r12 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX-NEXT:    leaq (%rdi,%r12), %r15
+; AVX-NEXT:    testq %r8, %r8
+; AVX-NEXT:    setns %r8b
+; AVX-NEXT:    cmpb %dil, %r8b
+; AVX-NEXT:    setne %dil
+; AVX-NEXT:    testq %r14, %r14
+; AVX-NEXT:    setns %bl
+; AVX-NEXT:    cmpb %bl, %r8b
+; AVX-NEXT:    setne %bl
+; AVX-NEXT:    testb %dil, %bl
+; AVX-NEXT:    cmoveq %r13, %r15
+; AVX-NEXT:    cmoveq %rcx, %r10
+; AVX-NEXT:    subq %r9, %rsi
+; AVX-NEXT:    movq %rdx, %rdi
+; AVX-NEXT:    sbbq %r11, %rdi
+; AVX-NEXT:    setns %bl
+; AVX-NEXT:    movzbl %bl, %ebx
+; AVX-NEXT:    addq %rbx, %r12
+; AVX-NEXT:    movq %rdi, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    testq %r11, %r11
+; AVX-NEXT:    setns %r8b
+; AVX-NEXT:    testq %rdx, %rdx
+; AVX-NEXT:    setns %dl
+; AVX-NEXT:    cmpb %r8b, %dl
+; AVX-NEXT:    setne %r8b
+; AVX-NEXT:    cmpb %bl, %dl
+; AVX-NEXT:    setne %dl
+; AVX-NEXT:    testb %dl, %r8b
+; AVX-NEXT:    cmoveq %rsi, %rcx
+; AVX-NEXT:    cmoveq %rdi, %r12
+; AVX-NEXT:    movq %r15, 24(%rax)
+; AVX-NEXT:    movq %r10, 16(%rax)
+; AVX-NEXT:    movq %r12, 8(%rax)
+; AVX-NEXT:    movq %rcx, (%rax)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    popq %r12
+; AVX-NEXT:    popq %r13
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    popq %r15
+; AVX-NEXT:    retq
+  %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
+  ret <2 x i128> %z
+}
diff --git a/test/CodeGen/X86/stack-folding-adx-x86_64.ll b/test/CodeGen/X86/stack-folding-adx-x86_64.ll
index 22d9c39125cc0b0e658ad841ed4695382a2ab944..fa00b7cd7fcc8b7b6b9b17be9cb707c977d9e1d2 100644
--- a/test/CodeGen/X86/stack-folding-adx-x86_64.ll
+++ b/test/CodeGen/X86/stack-folding-adx-x86_64.ll
@@ -13,52 +13,75 @@ define i8 @stack_fold_addcarry_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) {
   ;CHECK-LABEL: stack_fold_addcarry_u32
   ;CHECK:       adcl {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = tail call i8 @llvm.x86.addcarry.u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3)
-  ret i8 %2;
+  %2 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2)
+  %3 = extractvalue { i8, i32 } %2, 1
+  %4 = bitcast i8* %a3 to i32*
+  store i32 %3, i32* %4, align 1
+  %5 = extractvalue { i8, i32 } %2, 0
+  ret i8 %5
 }
-declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*)
 
 define i8 @stack_fold_addcarry_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) {
   ;CHECK-LABEL: stack_fold_addcarry_u64
   ;CHECK:       adcq {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 8-byte Folded Reload
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = tail call i8 @llvm.x86.addcarry.u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3)
-  ret i8 %2;
+  %2 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2)
+  %3 = extractvalue { i8, i64 } %2, 1
+  %4 = bitcast i8* %a3 to i64*
+  store i64 %3, i64* %4, align 1
+  %5 = extractvalue { i8, i64 } %2, 0
+  ret i8 %5
 }
-declare i8 @llvm.x86.addcarry.u64(i8, i64, i64, i8*)
 
 define i8 @stack_fold_addcarryx_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) {
   ;CHECK-LABEL: stack_fold_addcarryx_u32
   ;CHECK:       adcl {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = tail call i8 @llvm.x86.addcarryx.u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3)
-  ret i8 %2;
+  %2 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2)
+  %3 = extractvalue { i8, i32 } %2, 1
+  %4 = bitcast i8* %a3 to i32*
+  store i32 %3, i32* %4, align 1
+  %5 = extractvalue { i8, i32 } %2, 0
+  ret i8 %5
 }
-declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*)
 
 define i8 @stack_fold_addcarryx_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) {
   ;CHECK-LABEL: stack_fold_addcarryx_u64
   ;CHECK:       adcq {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 8-byte Folded Reload
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = tail call i8 @llvm.x86.addcarryx.u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3)
-  ret i8 %2;
+  %2 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2)
+  %3 = extractvalue { i8, i64 } %2, 1
+  %4 = bitcast i8* %a3 to i64*
+  store i64 %3, i64* %4, align 1
+  %5 = extractvalue { i8, i64 } %2, 0
+  ret i8 %5
 }
-declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*)
 
 define i8 @stack_fold_subborrow_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) {
   ;CHECK-LABEL: stack_fold_subborrow_u32
   ;CHECK:       sbbl {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 4-byte Folded Reload
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = tail call i8 @llvm.x86.subborrow.u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3)
-  ret i8 %2;
+  %2 = call { i8, i32 } @llvm.x86.subborrow.32(i8 %a0, i32 %a1, i32 %a2)
+  %3 = extractvalue { i8, i32 } %2, 1
+  %4 = bitcast i8* %a3 to i32*
+  store i32 %3, i32* %4, align 1
+  %5 = extractvalue { i8, i32 } %2, 0
+  ret i8 %5
 }
-declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*)
 
 define i8 @stack_fold_subborrow_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) {
   ;CHECK-LABEL: stack_fold_subborrow_u64
   ;CHECK:       sbbq {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 8-byte Folded Reload
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = tail call i8 @llvm.x86.subborrow.u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3)
-  ret i8 %2;
+  %2 = call { i8, i64 } @llvm.x86.subborrow.64(i8 %a0, i64 %a1, i64 %a2)
+  %3 = extractvalue { i8, i64 } %2, 1
+  %4 = bitcast i8* %a3 to i64*
+  store i64 %3, i64* %4, align 1
+  %5 = extractvalue { i8, i64 } %2, 0
+  ret i8 %5
 }
-declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*)
+
+declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
+declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64)
+declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)
+declare { i8, i64 } @llvm.x86.subborrow.64(i8, i64, i64)
diff --git a/test/CodeGen/X86/stack-folding-adx.mir b/test/CodeGen/X86/stack-folding-adx.mir
index b51d48095c208b355f3f5d0dcd9f783286e882b7..6e977821853df7eb30c582a6fba2cc697f6c8a8d 100644
--- a/test/CodeGen/X86/stack-folding-adx.mir
+++ b/test/CodeGen/X86/stack-folding-adx.mir
@@ -5,7 +5,7 @@
   ; Function Attrs: nounwind
   define i8 @stack_fold_adcx32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) #0 {
     %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-    %2 = call { i8, i32 } @llvm.x86.addcarryx.u32(i8 %a0, i32 %a1, i32 %a2)
+    %2 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2)
     %3 = extractvalue { i8, i32 } %2, 1
     %4 = bitcast i8* %a3 to i32*
     store i32 %3, i32* %4, align 1
@@ -16,7 +16,7 @@
   ; Function Attrs: nounwind
   define i8 @stack_fold_adcx64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) #0 {
     %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-    %2 = call { i8, i64 } @llvm.x86.addcarryx.u64(i8 %a0, i64 %a1, i64 %a2)
+    %2 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2)
     %3 = extractvalue { i8, i64 } %2, 1
     %4 = bitcast i8* %a3 to i64*
     store i64 %3, i64* %4, align 1
@@ -26,7 +26,7 @@
 
   define i8 @stack_fold_adox32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) #0 {
     %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-    %2 = call { i8, i32 } @llvm.x86.addcarryx.u32(i8 %a0, i32 %a1, i32 %a2)
+    %2 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2)
     %3 = extractvalue { i8, i32 } %2, 1
     %4 = bitcast i8* %a3 to i32*
     store i32 %3, i32* %4, align 1
@@ -37,7 +37,7 @@
   ; Function Attrs: nounwind
   define i8 @stack_fold_adox64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) #0 {
     %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-    %2 = call { i8, i64 } @llvm.x86.addcarryx.u64(i8 %a0, i64 %a1, i64 %a2)
+    %2 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2)
     %3 = extractvalue { i8, i64 } %2, 1
     %4 = bitcast i8* %a3 to i64*
     store i64 %3, i64* %4, align 1
@@ -46,10 +46,10 @@
   }
 
   ; Function Attrs: nounwind readnone
-  declare { i8, i32 } @llvm.x86.addcarryx.u32(i8, i32, i32) #1
+  declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32) #1
 
   ; Function Attrs: nounwind readnone
-  declare { i8, i64 } @llvm.x86.addcarryx.u64(i8, i64, i64) #1
+  declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64) #1
 
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(i8*, i8**) #2
diff --git a/test/CodeGen/X86/stack-folding-bmi2.ll b/test/CodeGen/X86/stack-folding-bmi2.ll
index b70f7c668d016560213a4c0c308b8cae9d27445d..0d0be6cd6dd5a455e8de0fcbb6c5008fe4e6d8b8 100644
--- a/test/CodeGen/X86/stack-folding-bmi2.ll
+++ b/test/CodeGen/X86/stack-folding-bmi2.ll
@@ -26,20 +26,6 @@ define i64 @stack_fold_bzhi_u64(i64 %a0, i64 %a1)   {
 }
 declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
 
-define i64 @stack_fold_mulx_u64(i64 %a0, i64 %a1, i64 *%a2)   {
-  ;CHECK-LABEL: stack_fold_mulx_u64
-  ;CHECK:       mulxq {{-?[0-9]*}}(%rsp), %rax, %rcx {{.*#+}} 8-byte Folded Reload
-  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  %2 = zext i64 %a0 to i128
-  %3 = zext i64 %a1 to i128
-  %4 = mul i128 %2, %3
-  %5 = lshr i128 %4, 64
-  %6 = trunc i128 %4 to i64
-  %7 = trunc i128 %5 to i64
-  store i64 %7, i64 *%a2
-  ret i64 %6
-}
-
 define i32 @stack_fold_pdep_u32(i32 %a0, i32 %a1)   {
   ;CHECK-LABEL: stack_fold_pdep_u32
   ;CHECK:       pdepl {{-?[0-9]*}}(%rsp), %eax, %eax {{.*#+}} 4-byte Folded Reload
diff --git a/test/CodeGen/X86/stack-folding-bmi2.mir b/test/CodeGen/X86/stack-folding-bmi2.mir
new file mode 100644
index 0000000000000000000000000000000000000000..ab97872e44320a2cbf7355eb0a4602bee472180e
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-bmi2.mir
@@ -0,0 +1,103 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=x86_64-- -run-pass=greedy %s | FileCheck %s
+# Tests for stack folding MULX. Doing this as a MIR test to ensure MULX is used and not legacy MUL.
+--- |
+  ; Function Attrs: nounwind
+  define i32 @stack_fold_mulx_u32(i32 %a0, i32 %a1) #0 {
+    %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+    %2 = zext i32 %a0 to i128
+    %3 = zext i32 %a1 to i128
+    %4 = mul i128 %2, %3
+    %5 = lshr i128 %4, 32
+    %6 = trunc i128 %5 to i32
+    ret i32 %6
+  }
+
+  ; Function Attrs: nounwind
+  define i64 @stack_fold_mulx_u64(i64 %a0, i64 %a1) #0 {
+    %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+    %2 = zext i64 %a0 to i128
+    %3 = zext i64 %a1 to i128
+    %4 = mul i128 %2, %3
+    %5 = lshr i128 %4, 64
+    %6 = trunc i128 %5 to i64
+    ret i64 %6
+  }
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #1
+
+  attributes #0 = { nounwind "target-features"="+bmi2" }
+  attributes #1 = { nounwind }
+
+...
+---
+name:            stack_fold_mulx_u32
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr32 }
+  - { id: 1, class: gr32 }
+  - { id: 2, class: fr64 }
+  - { id: 3, class: gr32 }
+  - { id: 4, class: gr32 }
+liveins:
+  - { reg: '$edi', virtual-reg: '%0' }
+  - { reg: '$esi', virtual-reg: '%1' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $edi, $esi
+
+    ; CHECK-LABEL: name: stack_fold_mulx_u32
+    ; CHECK: liveins: $edi, $esi
+    ; CHECK: MOV32mr %stack.0, 1, $noreg, 0, $noreg, $esi :: (store 4 into %stack.0)
+    ; CHECK: MOV32mr %stack.1, 1, $noreg, 0, $noreg, $edi :: (store 4 into %stack.1)
+    ; CHECK: INLINEASM &nop, 1, 4063242, def dead %2, 12, implicit-def dead early-clobber $rax, 12, implicit-def dead early-clobber $rbx, 12, implicit-def dead early-clobber $rcx, 12, implicit-def dead early-clobber $rdx, 12, implicit-def dead early-clobber $rsi, 12, implicit-def dead early-clobber $rdi, 12, implicit-def dead early-clobber $rbp, 12, implicit-def dead early-clobber $r8, 12, implicit-def dead early-clobber $r9, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15
+    ; CHECK: $edx = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load 4 from %stack.1)
+    ; CHECK: %3:gr32, dead %4:gr32 = MULX32rm %stack.0, 1, $noreg, 0, $noreg, implicit $edx :: (load 4 from %stack.0)
+    ; CHECK: $eax = COPY %3
+    ; CHECK: RET 0, $eax
+    %1:gr32 = COPY $esi
+    %0:gr32 = COPY $edi
+    INLINEASM &nop, 1, 4063242, def dead %2, 12, implicit-def dead early-clobber $rax, 12, implicit-def dead early-clobber $rbx, 12, implicit-def dead early-clobber $rcx, 12, implicit-def dead early-clobber $rdx, 12, implicit-def dead early-clobber $rsi, 12, implicit-def dead early-clobber $rdi, 12, implicit-def dead early-clobber $rbp, 12, implicit-def dead early-clobber $r8, 12, implicit-def dead early-clobber $r9, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15
+    $edx = COPY %0
+    %3:gr32, dead %4:gr32 = MULX32rr %1, implicit killed $edx
+    $eax = COPY %3
+    RET 0, killed $eax
+
+...
+---
+name:            stack_fold_mulx_u64
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr64 }
+  - { id: 1, class: gr64 }
+  - { id: 2, class: fr64 }
+  - { id: 3, class: gr64 }
+  - { id: 4, class: gr64 }
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $rdi, $rsi
+
+    ; CHECK-LABEL: name: stack_fold_mulx_u64
+    ; CHECK: liveins: $rdi, $rsi
+    ; CHECK: MOV64mr %stack.0, 1, $noreg, 0, $noreg, $rsi :: (store 8 into %stack.0)
+    ; CHECK: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $rdi :: (store 8 into %stack.1)
+    ; CHECK: INLINEASM &nop, 1, 4063242, def dead %2, 12, implicit-def dead early-clobber $rax, 12, implicit-def dead early-clobber $rbx, 12, implicit-def dead early-clobber $rcx, 12, implicit-def dead early-clobber $rdx, 12, implicit-def dead early-clobber $rsi, 12, implicit-def dead early-clobber $rdi, 12, implicit-def dead early-clobber $rbp, 12, implicit-def dead early-clobber $r8, 12, implicit-def dead early-clobber $r9, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15
+    ; CHECK: $rdx = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load 8 from %stack.1)
+    ; CHECK: %3:gr64, dead %4:gr64 = MULX64rm %stack.0, 1, $noreg, 0, $noreg, implicit $rdx :: (load 8 from %stack.0)
+    ; CHECK: $rax = COPY %3
+    ; CHECK: RET 0, $rax
+    %1:gr64 = COPY $rsi
+    %0:gr64 = COPY $rdi
+    INLINEASM &nop, 1, 4063242, def dead %2, 12, implicit-def dead early-clobber $rax, 12, implicit-def dead early-clobber $rbx, 12, implicit-def dead early-clobber $rcx, 12, implicit-def dead early-clobber $rdx, 12, implicit-def dead early-clobber $rsi, 12, implicit-def dead early-clobber $rdi, 12, implicit-def dead early-clobber $rbp, 12, implicit-def dead early-clobber $r8, 12, implicit-def dead early-clobber $r9, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15
+    $rdx = COPY %0
+    %3:gr64, dead %4:gr64 = MULX64rr %1, implicit killed $rdx
+    $rax = COPY %3
+    RET 0, killed $rax
+
+...
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index 8bcebd20dbd9cd16dec7a9fe48c1c9a762a9371c..0d903efdbe2fc2be92fcbe60c31613abefc7027a 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -881,12 +881,12 @@ define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
   ret <4 x float> %1
 }
 
-define i32 @stack_fold_extractps(<4 x float> %a0) {
+define i32 @stack_fold_extractps(<4 x float> %a0, <4 x float> %a1) {
   ;CHECK-LABEL: stack_fold_extractps
   ;CHECK:       vextractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
   ; fadd forces execution domain
-  %1 = fadd <4 x float> %a0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %1 = fadd <4 x float> %a0, %a1
   %2 = extractelement <4 x float> %1, i32 1
   %3 = bitcast float %2 to i32
   %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index a60e85df1ab7831bfa6be967901c5a54c68d63ef..3604aaa35fc7c318515227eb6bd3a7a483fb4234 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -624,12 +624,12 @@ define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
 }
 declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
 
-define i32 @stack_fold_extractps(<4 x float> %a0) {
+define i32 @stack_fold_extractps(<4 x float> %a0, <4 x float> %a1) {
   ;CHECK-LABEL: stack_fold_extractps
   ;CHECK:       extractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
   ; fadd forces execution domain
-  %1 = fadd <4 x float> %a0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %1 = fadd <4 x float> %a0, %a1
   %2 = extractelement <4 x float> %1, i32 1
   %3 = bitcast float %2 to i32
   %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
diff --git a/test/CodeGen/X86/stack-folding-int-avx1.ll b/test/CodeGen/X86/stack-folding-int-avx1.ll
index 874672035736d66d246b7161fdfc6b61c7f61ea6..91bf9e2876387f6b95bdaeef57071d31cb86137b 100644
--- a/test/CodeGen/X86/stack-folding-int-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx1.ll
@@ -72,11 +72,11 @@ define <4 x i32> @stack_fold_movd_load(i32 %a0) {
   ret <4 x i32> %3
 }
 
-define i32 @stack_fold_movd_store(<4 x i32> %a0) {
+define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_movd_store
   ;CHECK:       movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ; add forces execution domain
-  %1 = add <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
+  %1 = add <4 x i32> %a0, %a1
   %2 = extractelement <4 x i32> %1, i32 0
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i32 %2
@@ -92,11 +92,11 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
   ret <2 x i64> %3
 }
 
-define i64 @stack_fold_movq_store(<2 x i64> %a0) {
+define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_movq_store
   ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
   ; add forces execution domain
-  %1 = add <2 x i64> %a0, <i64 1, i64 1>
+  %1 = add <2 x i64> %a0, %a1
   %2 = extractelement <2 x i64> %1, i32 0
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i64 %2
@@ -436,12 +436,12 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwin
 
 ; TODO stack_fold_pextrb
 
-define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_pextrd
   ;CHECK:       vpextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
   ; add forces execution domain
-  %1 = add <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+  %1 = add <4 x i32> %a0, %a1
   %2 = extractelement <4 x i32> %1, i32 1
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i32 %2
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 01ae7ff6d43f0d659cfbda02784191d1258466ed..5464775a43a5ba4d264bbfd8140002d69cb89c6a 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -836,12 +836,12 @@ define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32
   ret <32 x i16> %4
 }
 
-define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_pextrd
   ;CHECK:       vpextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
   ; add forces execution domain
-  %1 = add <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+  %1 = add <4 x i32> %a0, %a1
   %2 = extractelement <4 x i32> %1, i32 1
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i32 %2
diff --git a/test/CodeGen/X86/stack-folding-int-sse42.ll b/test/CodeGen/X86/stack-folding-int-sse42.ll
index cabfa92e330904445a058d096267905f1e01c480..3c5aeb7841fb54953c6bd095fc02b66c12e82500 100644
--- a/test/CodeGen/X86/stack-folding-int-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-int-sse42.ll
@@ -108,11 +108,11 @@ define <4 x i32> @stack_fold_movd_load(i32 %a0) {
   ret <4 x i32> %3
 }
 
-define i32 @stack_fold_movd_store(<4 x i32> %a0) {
+define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_movd_store
   ;CHECK:       movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ; add forces execution domain
-  %1 = add <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
+  %1 = add <4 x i32> %a0, %a1
   %2 = extractelement <4 x i32> %1, i32 0
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i32 %2
@@ -128,11 +128,11 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
   ret <2 x i64> %3
 }
 
-define i64 @stack_fold_movq_store(<2 x i64> %a0) {
+define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_movq_store
   ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
   ; add forces execution domain
-  %1 = add <2 x i64> %a0, <i64 1, i64 1>
+  %1 = add <2 x i64> %a0, %a1
   %2 = extractelement <2 x i64> %1, i32 0
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i64 %2
@@ -487,12 +487,12 @@ entry:
   ret i16 %extract
 }
 
-define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_pextrd
   ;CHECK:       pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
   ; add forces execution domain
-  %1 = add <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+  %1 = add <4 x i32> %a0, %a1
   %2 = extractelement <4 x i32> %1, i32 1
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   ret i32 %2
diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll
index 966f66815f92ea9aa74e630358b32c5c6f6db543..ee71f6e70ef40581f21190702b0eb2fc295606f2 100644
--- a/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ b/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -71,8 +71,58 @@ entry:
   ret i1 %call1
 }
 
+; This test checks that when SP is changed in the function
+; (e.g. passing arguments on stack), the stack map entry
+; takes this adjustment into account.
+declare void @many_arg(i64, i64, i64, i64, i64, i64, i64, i64)
+
+define i32 @test_spadj(i32 addrspace(1)* %p) gc "statepoint-example" {
+  ; CHECK-LABEL: test_spadj
+  ; CHECK: movq %rdi, (%rsp)
+  ; CHECK: xorl %edi, %edi
+  ; CHECK: xorl %esi, %esi
+  ; CHECK: xorl %edx, %edx
+  ; CHECK: xorl %ecx, %ecx
+  ; CHECK: xorl %r8d, %r8d
+  ; CHECK: xorl %r9d, %r9d
+  ; CHECK: pushq $0
+  ; CHECK: pushq $0
+  ; CHECK: callq many_arg
+  ; CHECK: addq $16, %rsp
+  ; CHECK: movq (%rsp)
+  %statepoint_token = call token (i64, i32, void (i64, i64, i64, i64, i64, i64, i64, i64)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi64i64i64i64i64i64i64i64f(i64 0, i32 0, void (i64, i64, i64, i64, i64, i64, i64, i64)* @many_arg, i32 8, i32 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0, i32 0, i32 addrspace(1)* %p)
+  %p.relocated = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %statepoint_token, i32 15, i32 15) ; (%p, %p)
+  %ld = load i32, i32 addrspace(1)* %p.relocated
+  ret i32 %ld
+}
+
+; Test that function arguments at fixed stack offset
+; can be directly encoded in the stack map, without
+; spilling.
+%struct = type { i64, i64, i64 }
+
+declare void @use(%struct*)
+
+define void @test_fixed_arg(%struct* byval %x) gc "statepoint-example" {
+; CHECK-LABEL: test_fixed_arg
+; CHECK: pushq %rax
+; CHECK: leaq 16(%rsp), %rdi
+; Should not spill fixed stack address.
+; CHECK-NOT: movq %rdi, (%rsp)
+; CHECK: callq use
+; CHECK: popq %rax
+; CHECK: retq
+entry:
+  br label %bb
+
+bb:                                               ; preds = %entry
+  %statepoint_token = call token (i64, i32, void (%struct*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp0s_structsf(i64 0, i32 0, void (%struct*)* @use, i32 1, i32 0, %struct* %x, i32 0, i32 1, %struct* %x)
+  ret void
+}
 
 declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...)
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidi64i64i64i64i64i64i64i64f(i64, i32, void (i64, i64, i64, i64, i64, i64, i64, i64)*, i32, i32, ...)
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidp0s_structsf(i64, i32, void (%struct*)*, i32, i32, ...)
 declare i1 @llvm.experimental.gc.result.i1(token)
 declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32) #3
 
@@ -83,11 +133,11 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .short 0
 ; Num Functions
-; CHECK-NEXT:   .long 3
+; CHECK-NEXT:   .long 5
 ; Num LargeConstants
 ; CHECK-NEXT:   .long 0
 ; Num Callsites
-; CHECK-NEXT:   .long 3
+; CHECK-NEXT:   .long 5
 
 ; Functions and stack size
 ; CHECK-NEXT:   .quad test
@@ -99,6 +149,12 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
 ; CHECK-NEXT:   .quad test_id
 ; CHECK-NEXT:   .quad 8
 ; CHECK-NEXT:   .quad 1
+; CHECK-NEXT:   .quad test_spadj
+; CHECK-NEXT:   .quad 8
+; CHECK-NEXT:   .quad 1
+; CHECK-NEXT:   .quad test_fixed_arg
+; CHECK-NEXT:   .quad 8
+; CHECK-NEXT:   .quad 1
 
 ;
 ; test
@@ -327,3 +383,124 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
 ; CHECK: .short	0
 ; CHECK: .short	0
 ; CHECK: .p2align	3
+
+;
+; test_spadj
+
+; Statepoint ID
+; CHECK-NEXT: .quad	0
+
+; Instruction Offset
+; CHECK-NEXT: .long	.Ltmp3-test_spadj
+
+; Reserved:
+; CHECK: .short	0
+
+; NumLocations:
+; CHECK: .short	5
+
+; StkMapRecord[0]:
+; SmallConstant(0):
+; CHECK: .byte	4
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	0
+; CHECK-NEXT:   .short  0
+; CHECK: .long	0
+
+; StkMapRecord[1]:
+; SmallConstant(0):
+; CHECK: .byte	4
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	0
+; CHECK-NEXT:   .short  0
+; CHECK: .long	0
+
+; StkMapRecord[2]:
+; SmallConstant(0):
+; CHECK: .byte	4
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	0
+; CHECK-NEXT:   .short  0
+; CHECK: .long	0
+
+; StkMapRecord[3]:
+; Indirect Spill Slot [RSP+16]
+; CHECK: .byte	3
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	7
+; CHECK-NEXT:   .short  0
+; CHECK: .long	16
+
+; StkMapRecord[4]:
+; Indirect Spill Slot [RSP+16]
+; CHECK: .byte	3
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	7
+; CHECK-NEXT:   .short  0
+; CHECK: .long	16
+
+; No padding or LiveOuts
+; CHECK: .short	0
+; CHECK: .short	0
+; CHECK: .p2align	3
+
+;
+; test_fixed_arg
+
+; Statepoint ID
+; CHECK-NEXT: .quad	0
+
+; Instruction Offset
+; CHECK-NEXT: .long	.Ltmp4-test_fixed_arg
+
+; Reserved:
+; CHECK: .short	0
+
+; NumLocations:
+; CHECK: .short	4
+
+; StkMapRecord[0]:
+; SmallConstant(0):
+; CHECK: .byte	4
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	0
+; CHECK-NEXT:   .short  0
+; CHECK: .long	0
+
+; StkMapRecord[1]:
+; SmallConstant(0):
+; CHECK: .byte	4
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	0
+; CHECK-NEXT:   .short  0
+; CHECK: .long	0
+
+; StkMapRecord[2]:
+; SmallConstant(1):
+; CHECK: .byte	4
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	0
+; CHECK-NEXT:   .short  0
+; CHECK: .long	1
+
+; StkMapRecord[3]:
+; Direct RSP+16
+; CHECK: .byte	2
+; CHECK-NEXT:   .byte   0
+; CHECK: .short 8
+; CHECK: .short	7
+; CHECK-NEXT:   .short  0
+; CHECK: .long	16
+
+; No padding or LiveOuts
+; CHECK: .short	0
+; CHECK: .short	0
+; CHECK: .p2align	3
diff --git a/test/CodeGen/X86/stores-merging.ll b/test/CodeGen/X86/stores-merging.ll
index 5ccb5825934b6685be1f805656ca00229b6c5f51..eda7f3a2abfd4071be2120c6b8dbb1db6eec95e5 100644
--- a/test/CodeGen/X86/stores-merging.ll
+++ b/test/CodeGen/X86/stores-merging.ll
@@ -13,9 +13,8 @@
 define void @redundant_stores_merging() {
 ; CHECK-LABEL: redundant_stores_merging:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movabsq $528280977409, %rax # imm = 0x7B00000001
+; CHECK-NEXT:    movabsq $1958505086977, %rax # imm = 0x1C800000001
 ; CHECK-NEXT:    movq %rax, e+{{.*}}(%rip)
-; CHECK-NEXT:    movl $456, e+{{.*}}(%rip) # imm = 0x1C8
 ; CHECK-NEXT:    retq
   store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
   store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 2b335ea426898302b687c83a8309b1551a720fd2..7ebc5dd0d129aaff17b53a739eb38ee3ea1206ab 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -723,7 +723,6 @@ define i32 @test_x86_tbm_t1mskc_u32_z(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: test_x86_tbm_t1mskc_u32_z:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    t1mskcl %edi, %eax
-; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    cmovel %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
@@ -767,7 +766,6 @@ define i64 @test_x86_tbm_t1mskc_u64_z(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: test_x86_tbm_t1mskc_u64_z:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    t1mskcq %rdi, %rax
-; CHECK-NEXT:    testq %rax, %rax
 ; CHECK-NEXT:    cmoveq %rsi, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
@@ -811,7 +809,6 @@ define i32 @test_x86_tbm_tzmsk_u32_z(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: test_x86_tbm_tzmsk_u32_z:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    tzmskl %edi, %eax
-; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    cmovel %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
@@ -855,7 +852,6 @@ define i64 @test_x86_tbm_tzmsk_u64_z(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: test_x86_tbm_tzmsk_u64_z:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    tzmskq %rdi, %rax
-; CHECK-NEXT:    testq %rax, %rax
 ; CHECK-NEXT:    cmoveq %rsi, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
diff --git a/test/CodeGen/X86/test-shrink-bug.ll b/test/CodeGen/X86/test-shrink-bug.ll
index 4fba792459c018026571c6565b190fd39cf1e8ae..ca2316cd063636e97bdec1525201b3c0d4f13448 100644
--- a/test/CodeGen/X86/test-shrink-bug.ll
+++ b/test/CodeGen/X86/test-shrink-bug.ll
@@ -68,7 +68,7 @@ define void @fail(i16 %a, <2 x i8> %b) {
 ; CHECK-X64:       # %bb.0:
 ; CHECK-X64-NEXT:    pushq %rax
 ; CHECK-X64-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-X64-NEXT:    testw $263, %di # imm = 0x107
+; CHECK-X64-NEXT:    testl $263, %edi # imm = 0x107
 ; CHECK-X64-NEXT:    je .LBB1_2
 ; CHECK-X64-NEXT:  # %bb.1:
 ; CHECK-X64-NEXT:    pand {{.*}}(%rip), %xmm0
diff --git a/test/CodeGen/X86/test-shrink.ll b/test/CodeGen/X86/test-shrink.ll
index 5e9f49571dd8f64a75eac734f73c264fe16b4de7..993b78c3f40eb757058e506a548a5a59ad0e30a0 100644
--- a/test/CodeGen/X86/test-shrink.ll
+++ b/test/CodeGen/X86/test-shrink.ll
@@ -590,7 +590,6 @@ define void @and16_trunc_8_sign(i16 %x) nounwind {
 ; CHECK-WIN32-64-LABEL: and16_trunc_8_sign:
 ; CHECK-WIN32-64:       # %bb.0:
 ; CHECK-WIN32-64-NEXT:    subq $40, %rsp
-; CHECK-WIN32-64-NEXT:    # kill: def $cx killed $cx def $ecx
 ; CHECK-WIN32-64-NEXT:    testb $-128, %cl
 ; CHECK-WIN32-64-NEXT:    jg .LBB13_2
 ; CHECK-WIN32-64-NEXT:  # %bb.1: # %yes
@@ -601,8 +600,7 @@ define void @and16_trunc_8_sign(i16 %x) nounwind {
 ;
 ; CHECK-X86-LABEL: and16_trunc_8_sign:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-X86-NEXT:    testb $-128, %al
+; CHECK-X86-NEXT:    testb $-128, {{[0-9]+}}(%esp)
 ; CHECK-X86-NEXT:    jg .LBB13_2
 ; CHECK-X86-NEXT:  # %bb.1: # %yes
 ; CHECK-X86-NEXT:    calll bar
@@ -733,8 +731,8 @@ define void @and32_trunc_16_sign(i32 %x) nounwind {
 ;
 ; CHECK-X86-LABEL: and32_trunc_16_sign:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    movl $32768, %eax # imm = 0x8000
-; CHECK-X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:    andl $32768, %eax # imm = 0x8000
 ; CHECK-X86-NEXT:    testw %ax, %ax
 ; CHECK-X86-NEXT:    jg .LBB16_2
 ; CHECK-X86-NEXT:  # %bb.1: # %yes
@@ -778,7 +776,8 @@ define void @and32_trunc_16_sign_minsize(i32 %x) minsize nounwind {
 ;
 ; CHECK-X86-LABEL: and32_trunc_16_sign_minsize:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    testw $-32768, {{[0-9]+}}(%esp) # imm = 0x8000
+; CHECK-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT:    testw $-32768, %ax # imm = 0x8000
 ; CHECK-X86-NEXT:    jg .LBB17_2
 ; CHECK-X86-NEXT:  # %bb.1: # %yes
 ; CHECK-X86-NEXT:    calll bar
diff --git a/test/CodeGen/X86/test_x86condbr_globaladdr.mir b/test/CodeGen/X86/test_x86condbr_globaladdr.mir
new file mode 100644
index 0000000000000000000000000000000000000000..054789b7ef60b6cbbaec11e11c96767f8f63db96
--- /dev/null
+++ b/test/CodeGen/X86/test_x86condbr_globaladdr.mir
@@ -0,0 +1,30 @@
+# RUN: llc -o - %s -mtriple=i686-- -mcpu=ivybridge --run-pass X86CondBrFolding | FileCheck %s
+
+# Test wrong assertion when meet SUB32ri with global address
+# in X86CondBrFoldingiPass
+--- |
+  @img2buf_normal = external global i32
+  define void @func() { ret void }
+...
+---
+# CHECK: bb.0:
+# CHECK: %2:gr32 = SUB32ri %1, @img2buf_normal, implicit-def $eflags
+
+name: func
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $edx
+    %1:gr32 = COPY $edx
+    %2:gr32 = MOV32rm %1:gr32, 1, $noreg, 850256, $noreg
+    %3:gr32 = SUB32ri %2:gr32, @img2buf_normal, implicit-def $eflags
+    JE_1 %bb.2, implicit $eflags
+    JMP_1 %bb.3
+
+  bb.2:
+    RET 0, undef $eax
+
+  bb.3:
+    $eax = MOV32rr %3:gr32
+    RET 0, $eax
+...
diff --git a/test/CodeGen/X86/trunc-subvector.ll b/test/CodeGen/X86/trunc-subvector.ll
index 88830ee72881a86a832348030edc438d0aee026f..0355241ae5ffae7d2e6516a73760d67f34728d13 100644
--- a/test/CodeGen/X86/trunc-subvector.ll
+++ b/test/CodeGen/X86/trunc-subvector.ll
@@ -41,7 +41,8 @@ define <2 x i32> @test3(<8 x i32> %v) {
 ; SSE2-LABEL: test3:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
@@ -67,8 +68,8 @@ define <2 x i32> @test3(<8 x i32> %v) {
 define <2 x i32> @test4(<8 x i32> %v) {
 ; SSE2-LABEL: test4:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
@@ -93,12 +94,12 @@ define <2 x i32> @test4(<8 x i32> %v) {
 define <2 x i32> @test5(<8 x i32> %v) {
 ; SSE2-LABEL: test5:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
 ; SSE2-NEXT:    retq
@@ -107,6 +108,7 @@ define <2 x i32> @test5(<8 x i32> %v) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
@@ -226,6 +228,7 @@ define <2 x i32> @test10(<8 x i32> %v) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
diff --git a/test/CodeGen/X86/uadd_sat_vec.ll b/test/CodeGen/X86/uadd_sat_vec.ll
new file mode 100644
index 0000000000000000000000000000000000000000..98f6d0f19237c226192a2e504a1ba8c92134519b
--- /dev/null
+++ b/test/CodeGen/X86/uadd_sat_vec.ll
@@ -0,0 +1,1015 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <12 x i8> @llvm.uadd.sat.v12i8(<12 x i8>, <12 x i8>)
+declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
+declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
+
+declare <1 x i16> @llvm.uadd.sat.v1i16(<1 x i16>, <1 x i16>)
+declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>)
+declare <12 x i16> @llvm.uadd.sat.v12i16(<12 x i16>, <12 x i16>)
+declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>)
+
+declare <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1>, <16 x i1>)
+declare <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4>, <16 x i4>)
+
+declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i24> @llvm.uadd.sat.v4i24(<4 x i24>, <4 x i24>)
+declare <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128>, <2 x i128>)
+
+; Legal types, depending on architecture.
+
+define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE-LABEL: v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddusb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
+  ret <16 x i8> %z
+}
+
+define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
+; SSE-LABEL: v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddusb %xmm2, %xmm0
+; SSE-NEXT:    paddusb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddusb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
+  ret <32 x i8> %z
+}
+
+define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
+; SSE-LABEL: v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddusb %xmm4, %xmm0
+; SSE-NEXT:    paddusb %xmm5, %xmm1
+; SSE-NEXT:    paddusb %xmm6, %xmm2
+; SSE-NEXT:    paddusb %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddusb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddusb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddusb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddusb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddusb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddusb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddusb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
+  ret <64 x i8> %z
+}
+
+define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE-LABEL: v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddusw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
+  ret <8 x i16> %z
+}
+
+define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
+; SSE-LABEL: v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddusw %xmm2, %xmm0
+; SSE-NEXT:    paddusw %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddusw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %z = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
+  ret <16 x i16> %z
+}
+
+define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
+; SSE-LABEL: v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddusw %xmm4, %xmm0
+; SSE-NEXT:    paddusw %xmm5, %xmm1
+; SSE-NEXT:    paddusw %xmm6, %xmm2
+; SSE-NEXT:    paddusw %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddusw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddusw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddusw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddusw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddusw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddusw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
+  ret <32 x i16> %z
+}
+
+; Too narrow vectors, legalized by widening.
+
+define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
+; SSE-LABEL: v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    paddusb %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmovwb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <8 x i8>, <8 x i8>* %px
+  %y = load <8 x i8>, <8 x i8>* %py
+  %z = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y)
+  store <8 x i8> %z, <8 x i8>* %pz
+  ret void
+}
+
+define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
+; SSE-LABEL: v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    paddusb %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT:    vpmovdb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <4 x i8>, <4 x i8>* %px
+  %y = load <4 x i8>, <4 x i8>* %py
+  %z = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
+  store <4 x i8> %z, <4 x i8>* %pz
+  ret void
+}
+
+define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
+; SSE2-LABEL: v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl (%rsi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    paddusb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movw %ax, (%rdx)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v2i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movzwl (%rsi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    paddusb %xmm0, %xmm1
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    movw %ax, (%rdx)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    movzwl (%rsi), %eax
+; SSE41-NEXT:    movd %eax, %xmm1
+; SSE41-NEXT:    paddusb %xmm0, %xmm1
+; SSE41-NEXT:    pextrw $0, %xmm1, (%rdx)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    movzwl (%rsi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl (%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    movzwl (%rsi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movzwl (%rdi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    movzwl (%rsi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm1
+; AVX512-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmovqb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <2 x i8>, <2 x i8>* %px
+  %y = load <2 x i8>, <2 x i8>* %py
+  %z = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %x, <2 x i8> %y)
+  store <2 x i8> %z, <2 x i8>* %pz
+  ret void
+}
+
+define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
+; SSE-LABEL: v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    paddusw %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpmovdw %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <4 x i16>, <4 x i16>* %px
+  %y = load <4 x i16>, <4 x i16>* %py
+  %z = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %x, <4 x i16> %y)
+  store <4 x i16> %z, <4 x i16>* %pz
+  ret void
+}
+
+define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
+; SSE-LABEL: v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    paddusw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vpmovqw %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <2 x i16>, <2 x i16>* %px
+  %y = load <2 x i16>, <2 x i16>* %py
+  %z = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
+  store <2 x i16> %z, <2 x i16>* %pz
+  ret void
+}
+
+define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
+; SSE-LABEL: v12i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddusb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v12i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <12 x i8> @llvm.uadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
+  ret <12 x i8> %z
+}
+
+define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind {
+; SSE-LABEL: v12i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm0
+; SSE-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE-NEXT:    paddusw (%rsi), %xmm0
+; SSE-NEXT:    paddusw 16(%rsi), %xmm1
+; SSE-NEXT:    movq %xmm1, 16(%rdx)
+; SSE-NEXT:    movdqa %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v12i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vpaddusw (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddusw 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v12i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpaddusw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v12i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-NEXT:    vpaddusw (%rsi), %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x = load <12 x i16>, <12 x i16>* %px
+  %y = load <12 x i16>, <12 x i16>* %py
+  %z = call <12 x i16> @llvm.uadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
+  store <12 x i16> %z, <12 x i16>* %pz
+  ret void
+}
+
+; Scalarization
+
+define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
+; SSE-LABEL: v1i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movb (%rdi), %al
+; SSE-NEXT:    addb (%rsi), %al
+; SSE-NEXT:    movb $-1, %cl
+; SSE-NEXT:    jb .LBB13_2
+; SSE-NEXT:  # %bb.1:
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:  .LBB13_2:
+; SSE-NEXT:    movb %cl, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v1i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movb (%rdi), %al
+; AVX-NEXT:    addb (%rsi), %al
+; AVX-NEXT:    movb $-1, %cl
+; AVX-NEXT:    jb .LBB13_2
+; AVX-NEXT:  # %bb.1:
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:  .LBB13_2:
+; AVX-NEXT:    movb %cl, (%rdx)
+; AVX-NEXT:    retq
+  %x = load <1 x i8>, <1 x i8>* %px
+  %y = load <1 x i8>, <1 x i8>* %py
+  %z = call <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8> %x, <1 x i8> %y)
+  store <1 x i8> %z, <1 x i8>* %pz
+  ret void
+}
+
+define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
+; SSE-LABEL: v1i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movzwl (%rdi), %eax
+; SSE-NEXT:    addw (%rsi), %ax
+; SSE-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; SSE-NEXT:    cmovael %eax, %ecx
+; SSE-NEXT:    movw %cx, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v1i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    addw (%rsi), %ax
+; AVX-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; AVX-NEXT:    cmovael %eax, %ecx
+; AVX-NEXT:    movw %cx, (%rdx)
+; AVX-NEXT:    retq
+  %x = load <1 x i16>, <1 x i16>* %px
+  %y = load <1 x i16>, <1 x i16>* %py
+  %z = call <1 x i16> @llvm.uadd.sat.v1i16(<1 x i16> %x, <1 x i16> %y)
+  store <1 x i16> %z, <1 x i16>* %pz
+  ret void
+}
+
+; Promotion
+
+define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
+; SSE-LABEL: v16i4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $4, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psllw $4, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    paddusb %xmm1, %xmm0
+; SSE-NEXT:    psrlw $4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v16i4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
+  ret <16 x i4> %z
+}
+
+define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
+; SSE-LABEL: v16i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $7, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    paddusb %xmm1, %xmm0
+; SSE-NEXT:    psrlw $7, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v16i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovb2m %xmm1, %k0
+; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovb2m %xmm0, %k1
+; AVX512-NEXT:    korw %k0, %k1, %k0
+; AVX512-NEXT:    vpmovm2b %k0, %xmm0
+; AVX512-NEXT:    retq
+  %z = call <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
+  ret <16 x i1> %z
+}
+
+; Expanded
+
+define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; SSE2-LABEL: v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %ecx
+; SSE2-NEXT:    addl %eax, %ecx
+; SSE2-NEXT:    movl $-1, %eax
+; SSE2-NEXT:    cmovbl %eax, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm3, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm3, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    cmovbl %eax, %edx
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    movd %xmm0, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    cmovbl %eax, %edx
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm0, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    cmovbl %eax, %edx
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %ecx
+; SSSE3-NEXT:    addl %eax, %ecx
+; SSSE3-NEXT:    movl $-1, %eax
+; SSSE3-NEXT:    cmovbl %eax, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm3, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm3, %edx
+; SSSE3-NEXT:    addl %ecx, %edx
+; SSSE3-NEXT:    cmovbl %eax, %edx
+; SSSE3-NEXT:    movd %edx, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    movd %xmm0, %edx
+; SSSE3-NEXT:    addl %ecx, %edx
+; SSSE3-NEXT:    cmovbl %eax, %edx
+; SSSE3-NEXT:    movd %edx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm0, %edx
+; SSSE3-NEXT:    addl %ecx, %edx
+; SSSE3-NEXT:    cmovbl %eax, %edx
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrd $1, %xmm1, %eax
+; SSE41-NEXT:    pextrd $1, %xmm0, %ecx
+; SSE41-NEXT:    addl %eax, %ecx
+; SSE41-NEXT:    movl $-1, %eax
+; SSE41-NEXT:    cmovbl %eax, %ecx
+; SSE41-NEXT:    movd %xmm1, %edx
+; SSE41-NEXT:    movd %xmm0, %esi
+; SSE41-NEXT:    addl %edx, %esi
+; SSE41-NEXT:    cmovbl %eax, %esi
+; SSE41-NEXT:    movd %esi, %xmm2
+; SSE41-NEXT:    pinsrd $1, %ecx, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
+; SSE41-NEXT:    pextrd $2, %xmm0, %edx
+; SSE41-NEXT:    addl %ecx, %edx
+; SSE41-NEXT:    cmovbl %eax, %edx
+; SSE41-NEXT:    pinsrd $2, %edx, %xmm2
+; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
+; SSE41-NEXT:    pextrd $3, %xmm0, %edx
+; SSE41-NEXT:    addl %ecx, %edx
+; SSE41-NEXT:    cmovbl %eax, %edx
+; SSE41-NEXT:    pinsrd $3, %edx, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrd $1, %xmm1, %eax
+; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
+; AVX-NEXT:    addl %eax, %ecx
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovbl %eax, %ecx
+; AVX-NEXT:    vmovd %xmm1, %edx
+; AVX-NEXT:    vmovd %xmm0, %esi
+; AVX-NEXT:    addl %edx, %esi
+; AVX-NEXT:    cmovbl %eax, %esi
+; AVX-NEXT:    vmovd %esi, %xmm2
+; AVX-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX-NEXT:    vpextrd $2, %xmm0, %edx
+; AVX-NEXT:    addl %ecx, %edx
+; AVX-NEXT:    cmovbl %eax, %edx
+; AVX-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX-NEXT:    vpextrd $3, %xmm0, %edx
+; AVX-NEXT:    addl %ecx, %edx
+; AVX-NEXT:    cmovbl %eax, %edx
+; AVX-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %z = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
+  ret <4 x i32> %z
+}
+
+define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
+; SSE2-LABEL: v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    addq %rax, %rcx
+; SSE2-NEXT:    movq $-1, %rax
+; SSE2-NEXT:    cmovbq %rax, %rcx
+; SSE2-NEXT:    movq %rcx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rcx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    addq %rcx, %rdx
+; SSE2-NEXT:    cmovbq %rax, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT:    psrlq $32, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    psllq $32, %xmm1
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    movq %xmm0, %rcx
+; SSSE3-NEXT:    addq %rax, %rcx
+; SSSE3-NEXT:    movq $-1, %rax
+; SSSE3-NEXT:    cmovbq %rax, %rcx
+; SSSE3-NEXT:    movq %rcx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movq %xmm1, %rcx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movq %xmm0, %rdx
+; SSSE3-NEXT:    addq %rcx, %rdx
+; SSSE3-NEXT:    cmovbq %rax, %rdx
+; SSSE3-NEXT:    movq %rdx, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSSE3-NEXT:    psrlq $32, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE41-NEXT:    addq %rax, %rcx
+; SSE41-NEXT:    movq $-1, %rax
+; SSE41-NEXT:    cmovbq %rax, %rcx
+; SSE41-NEXT:    movq %rcx, %xmm2
+; SSE41-NEXT:    movq %xmm1, %rcx
+; SSE41-NEXT:    movq %xmm0, %rdx
+; SSE41-NEXT:    addq %rcx, %rdx
+; SSE41-NEXT:    cmovbq %rax, %rdx
+; SSE41-NEXT:    movq %rdx, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX-NEXT:    addq %rax, %rcx
+; AVX-NEXT:    movq $-1, %rax
+; AVX-NEXT:    cmovbq %rax, %rcx
+; AVX-NEXT:    vmovq %rcx, %xmm2
+; AVX-NEXT:    vmovq %xmm1, %rcx
+; AVX-NEXT:    vmovq %xmm0, %rdx
+; AVX-NEXT:    addq %rcx, %rdx
+; AVX-NEXT:    cmovbq %rax, %rdx
+; AVX-NEXT:    vmovq %rdx, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
+  ret <2 x i32> %z
+}
+
+define <4 x i24> @v4i24(<4 x i24> %x, <4 x i24> %y) nounwind {
+; SSE2-LABEL: v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    pslld $8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %ecx
+; SSE2-NEXT:    addl %eax, %ecx
+; SSE2-NEXT:    movl $-1, %eax
+; SSE2-NEXT:    cmovbl %eax, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm3, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm3, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    cmovbl %eax, %edx
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    movd %xmm0, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    cmovbl %eax, %edx
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm0, %edx
+; SSE2-NEXT:    addl %ecx, %edx
+; SSE2-NEXT:    cmovbl %eax, %edx
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    psrld $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pslld $8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    pslld $8, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %ecx
+; SSSE3-NEXT:    addl %eax, %ecx
+; SSSE3-NEXT:    movl $-1, %eax
+; SSSE3-NEXT:    cmovbl %eax, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm3, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm3, %edx
+; SSSE3-NEXT:    addl %ecx, %edx
+; SSSE3-NEXT:    cmovbl %eax, %edx
+; SSSE3-NEXT:    movd %edx, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    movd %xmm0, %edx
+; SSSE3-NEXT:    addl %ecx, %edx
+; SSSE3-NEXT:    cmovbl %eax, %edx
+; SSSE3-NEXT:    movd %edx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm0, %edx
+; SSSE3-NEXT:    addl %ecx, %edx
+; SSSE3-NEXT:    cmovbl %eax, %edx
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT:    psrld $8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $8, %xmm1
+; SSE41-NEXT:    pextrd $1, %xmm1, %eax
+; SSE41-NEXT:    pslld $8, %xmm0
+; SSE41-NEXT:    pextrd $1, %xmm0, %ecx
+; SSE41-NEXT:    addl %eax, %ecx
+; SSE41-NEXT:    movl $-1, %eax
+; SSE41-NEXT:    cmovbl %eax, %ecx
+; SSE41-NEXT:    movd %xmm1, %edx
+; SSE41-NEXT:    movd %xmm0, %esi
+; SSE41-NEXT:    addl %edx, %esi
+; SSE41-NEXT:    cmovbl %eax, %esi
+; SSE41-NEXT:    movd %esi, %xmm2
+; SSE41-NEXT:    pinsrd $1, %ecx, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
+; SSE41-NEXT:    pextrd $2, %xmm0, %edx
+; SSE41-NEXT:    addl %ecx, %edx
+; SSE41-NEXT:    cmovbl %eax, %edx
+; SSE41-NEXT:    pinsrd $2, %edx, %xmm2
+; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
+; SSE41-NEXT:    pextrd $3, %xmm0, %edx
+; SSE41-NEXT:    addl %ecx, %edx
+; SSE41-NEXT:    cmovbl %eax, %edx
+; SSE41-NEXT:    pinsrd $3, %edx, %xmm2
+; SSE41-NEXT:    psrld $8, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v4i24:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX-NEXT:    vpextrd $1, %xmm1, %eax
+; AVX-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
+; AVX-NEXT:    addl %eax, %ecx
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovbl %eax, %ecx
+; AVX-NEXT:    vmovd %xmm1, %edx
+; AVX-NEXT:    vmovd %xmm0, %esi
+; AVX-NEXT:    addl %edx, %esi
+; AVX-NEXT:    cmovbl %eax, %esi
+; AVX-NEXT:    vmovd %esi, %xmm2
+; AVX-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX-NEXT:    vpextrd $2, %xmm0, %edx
+; AVX-NEXT:    addl %ecx, %edx
+; AVX-NEXT:    cmovbl %eax, %edx
+; AVX-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX-NEXT:    vpextrd $3, %xmm0, %edx
+; AVX-NEXT:    addl %ecx, %edx
+; AVX-NEXT:    cmovbl %eax, %edx
+; AVX-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX-NEXT:    vpsrld $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <4 x i24> @llvm.uadd.sat.v4i24(<4 x i24> %x, <4 x i24> %y)
+  ret <4 x i24> %z
+}
+
+define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
+; SSE-LABEL: v2i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    addq %r9, %rsi
+; SSE-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT:    movq $-1, %rdi
+; SSE-NEXT:    cmovbq %rdi, %rsi
+; SSE-NEXT:    cmovbq %rdi, %rdx
+; SSE-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; SSE-NEXT:    cmovbq %rdi, %r8
+; SSE-NEXT:    cmovbq %rdi, %rcx
+; SSE-NEXT:    movq %r8, 24(%rax)
+; SSE-NEXT:    movq %rcx, 16(%rax)
+; SSE-NEXT:    movq %rdx, 8(%rax)
+; SSE-NEXT:    movq %rsi, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v2i128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    addq %r9, %rsi
+; AVX-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT:    movq $-1, %rdi
+; AVX-NEXT:    cmovbq %rdi, %rsi
+; AVX-NEXT:    cmovbq %rdi, %rdx
+; AVX-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; AVX-NEXT:    cmovbq %rdi, %r8
+; AVX-NEXT:    cmovbq %rdi, %rcx
+; AVX-NEXT:    movq %r8, 24(%rax)
+; AVX-NEXT:    movq %rcx, 16(%rax)
+; AVX-NEXT:    movq %rdx, 8(%rax)
+; AVX-NEXT:    movq %rsi, (%rax)
+; AVX-NEXT:    retq
+  %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
+  ret <2 x i128> %z
+}
diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll
index 1ceca2cd06d7e70458b4a00c04206c119f1de226..be8ca0132d9a0098703df72ce0fe5c3b1043a870 100644
--- a/test/CodeGen/X86/unaligned-load.ll
+++ b/test/CodeGen/X86/unaligned-load.ll
@@ -1,11 +1,55 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0   | FileCheck -check-prefix=I386 %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic    | FileCheck -check-prefix=I386 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic  | FileCheck -check-prefix=CORE2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic  | FileCheck -check-prefix=COREI7 %s
 
 @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8
 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8
 
+; This can be improved; see PR39952.
+
 define void @func() nounwind ssp {
+; I386-LABEL: func:
+; I386:       ## %bb.0: ## %entry
+; I386-NEXT:    subl $32, %esp
+; I386-NEXT:    .p2align 4, 0x90
+; I386-NEXT:  LBB0_1: ## %bb
+; I386-NEXT:    ## =>This Inner Loop Header: Depth=1
+; I386-NEXT:    movl $4673097, {{[0-9]+}}(%esp) ## imm = 0x474E49
+; I386-NEXT:    movl $1230132307, {{[0-9]+}}(%esp) ## imm = 0x49525453
+; I386-NEXT:    movl $541347367, {{[0-9]+}}(%esp) ## imm = 0x20444E27
+; I386-NEXT:    movl $840969293, {{[0-9]+}}(%esp) ## imm = 0x32202C4D
+; I386-NEXT:    movl $1095911247, {{[0-9]+}}(%esp) ## imm = 0x4152474F
+; I386-NEXT:    movl $1380982853, {{[0-9]+}}(%esp) ## imm = 0x52502045
+; I386-NEXT:    movl $1313821779, {{[0-9]+}}(%esp) ## imm = 0x4E4F5453
+; I386-NEXT:    movl $1498564676, (%esp) ## imm = 0x59524844
+; I386-NEXT:    jmp LBB0_1
+;
+; CORE2-LABEL: func:
+; CORE2:       ## %bb.0: ## %entry
+; CORE2-NEXT:    movabsq $20070800167293728, %rax ## imm = 0x474E4952545320
+; CORE2-NEXT:    movabsq $2325069237881678925, %rcx ## imm = 0x20444E2732202C4D
+; CORE2-NEXT:    movabsq $4706902966564560965, %rdx ## imm = 0x4152474F52502045
+; CORE2-NEXT:    movabsq $5642821575076104260, %rsi ## imm = 0x4E4F545359524844
+; CORE2-NEXT:    .p2align 4, 0x90
+; CORE2-NEXT:  LBB0_1: ## %bb
+; CORE2-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CORE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CORE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; CORE2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; CORE2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; CORE2-NEXT:    jmp LBB0_1
+;
+; COREI7-LABEL: func:
+; COREI7:       ## %bb.0: ## %entry
+; COREI7-NEXT:    movups _.str3+{{.*}}(%rip), %xmm0
+; COREI7-NEXT:    movups {{.*}}(%rip), %xmm1
+; COREI7-NEXT:    .p2align 4, 0x90
+; COREI7-NEXT:  LBB0_1: ## %bb
+; COREI7-NEXT:    ## =>This Inner Loop Header: Depth=1
+; COREI7-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; COREI7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; COREI7-NEXT:    jmp LBB0_1
 entry:
   %String2Loc = alloca [31 x i8], align 1
   br label %bb
@@ -19,19 +63,57 @@ return:                                           ; No predecessors!
   ret void
 }
 
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
-
-; I386: calll {{_?}}memcpy
+define void @func_aligned() nounwind ssp {
+; I386-LABEL: func_aligned:
+; I386:       ## %bb.0: ## %entry
+; I386-NEXT:    subl $44, %esp
+; I386-NEXT:    movaps {{.*#+}} xmm0 = [1498564676,1313821779,1380982853,1095911247]
+; I386-NEXT:    .p2align 4, 0x90
+; I386-NEXT:  LBB1_1: ## %bb
+; I386-NEXT:    ## =>This Inner Loop Header: Depth=1
+; I386-NEXT:    movaps %xmm0, (%esp)
+; I386-NEXT:    movl $4673097, {{[0-9]+}}(%esp) ## imm = 0x474E49
+; I386-NEXT:    movl $1230132307, {{[0-9]+}}(%esp) ## imm = 0x49525453
+; I386-NEXT:    movl $541347367, {{[0-9]+}}(%esp) ## imm = 0x20444E27
+; I386-NEXT:    movl $840969293, {{[0-9]+}}(%esp) ## imm = 0x32202C4D
+; I386-NEXT:    jmp LBB1_1
+;
+; CORE2-LABEL: func_aligned:
+; CORE2:       ## %bb.0: ## %entry
+; CORE2-NEXT:    movabsq $20070800167293728, %rax ## imm = 0x474E4952545320
+; CORE2-NEXT:    movabsq $2325069237881678925, %rcx ## imm = 0x20444E2732202C4D
+; CORE2-NEXT:    movabsq $4706902966564560965, %rdx ## imm = 0x4152474F52502045
+; CORE2-NEXT:    movabsq $5642821575076104260, %rsi ## imm = 0x4E4F545359524844
+; CORE2-NEXT:    .p2align 4, 0x90
+; CORE2-NEXT:  LBB1_1: ## %bb
+; CORE2-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CORE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CORE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; CORE2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; CORE2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; CORE2-NEXT:    jmp LBB1_1
+;
+; COREI7-LABEL: func_aligned:
+; COREI7:       ## %bb.0: ## %entry
+; COREI7-NEXT:    movups _.str3+{{.*}}(%rip), %xmm0
+; COREI7-NEXT:    movups {{.*}}(%rip), %xmm1
+; COREI7-NEXT:    .p2align 4, 0x90
+; COREI7-NEXT:  LBB1_1: ## %bb
+; COREI7-NEXT:    ## =>This Inner Loop Header: Depth=1
+; COREI7-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; COREI7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; COREI7-NEXT:    jmp LBB1_1
+entry:
+  %String2Loc = alloca [31 x i8], align 16
+  br label %bb
 
-; CORE2: movabsq
-; CORE2: movabsq
-; CORE2: movabsq
+bb:                                               ; preds = %bb, %entry
+  %String2Loc9 = getelementptr inbounds [31 x i8], [31 x i8]* %String2Loc, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str3, i64 0, i64 0), i64 31, i1 false)
+  br label %bb
 
-; COREI7: movups _.str3
+return:                                           ; No predecessors!
+  ret void
+}
 
-; CORE2: .section
-; CORE2: .p2align  3
-; CORE2-NEXT: _.str1:
-; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
-; CORE2: .p2align 3
-; CORE2-NEXT: _.str3:
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
diff --git a/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index 4061f47aa1ff106df65d411ea3cdd3c22c2799c9..8923c393df138b33bb75d0443affce1b509067ac 100644
--- a/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -38,8 +38,8 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movl %r8d, %eax
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
 ; CHECK-BASELINE-NEXT:    notb %al
 ; CHECK-BASELINE-NEXT:    notb %r9b
 ; CHECK-BASELINE-NEXT:    andb %cl, %r9b
@@ -53,8 +53,8 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-SSE1-LABEL: out_v2i8:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movl %r8d, %eax
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
 ; CHECK-SSE1-NEXT:    notb %al
 ; CHECK-SSE1-NEXT:    notb %r9b
 ; CHECK-SSE1-NEXT:    andb %cl, %r9b
@@ -2587,12 +2587,12 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movl %edi, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
-; CHECK-BASELINE-NEXT:    andl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
@@ -2600,12 +2600,12 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-SSE1-LABEL: in_v2i8:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movl %edi, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
-; CHECK-SSE1-NEXT:    andl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
 ; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
@@ -2650,10 +2650,10 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v4i8:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
 ; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
 ; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
@@ -2674,10 +2674,10 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-SSE1-LABEL: in_v4i8:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorb %r11b, %dl
 ; CHECK-SSE1-NEXT:    xorb %r10b, %cl
 ; CHECK-SSE1-NEXT:    xorb %dil, %r8b
@@ -2716,12 +2716,12 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind
 ; CHECK-BASELINE-LABEL: in_v2i16:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movl %edi, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
-; CHECK-BASELINE-NEXT:    andl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
@@ -2729,12 +2729,12 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind
 ; CHECK-SSE1-LABEL: in_v2i16:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movl %edi, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
-; CHECK-SSE1-NEXT:    andl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
 ; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
@@ -2915,12 +2915,12 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r11d, %ecx
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    xorl %edi, %edx
 ; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
+; CHECK-BASELINE-NEXT:    xorl %r11d, %ecx
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
@@ -2939,12 +2939,12 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorl %r11d, %ecx
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    xorl %edi, %edx
 ; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorl %edi, %edx
+; CHECK-SSE1-NEXT:    xorl %r11d, %ecx
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
@@ -3288,15 +3288,11 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind
 ; CHECK-BASELINE-LABEL: in_v8i16:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    pushq %rbp
-; CHECK-BASELINE-NEXT:    pushq %r14
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r11d, %r8d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    xorl %edi, %ecx
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %esi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
@@ -3305,29 +3301,32 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    xorl %ebx, %ecx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT:    xorl %edi, %ecx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    xorl %ebx, %ecx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    xorl %ebx, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
-; CHECK-BASELINE-NEXT:    xorl %r11d, %r8d
+; CHECK-BASELINE-NEXT:    xorl %ebx, %r8d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
-; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
+; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
 ; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT:    xorw %bx, %bp
+; CHECK-BASELINE-NEXT:    xorw %di, %bp
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bp
-; CHECK-BASELINE-NEXT:    xorl %ebx, %ebp
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    xorw %di, %bx
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
-; CHECK-BASELINE-NEXT:    xorl %edi, %ebx
+; CHECK-BASELINE-NEXT:    xorl %edi, %ebp
 ; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    xorw %r14w, %di
+; CHECK-BASELINE-NEXT:    xorw %r11w, %di
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %di
-; CHECK-BASELINE-NEXT:    xorl %r14d, %edi
-; CHECK-BASELINE-NEXT:    movw %di, 14(%rax)
-; CHECK-BASELINE-NEXT:    movw %bx, 12(%rax)
+; CHECK-BASELINE-NEXT:    xorl %r11d, %edi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    xorw %r10w, %bx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
+; CHECK-BASELINE-NEXT:    xorl %r10d, %ebx
+; CHECK-BASELINE-NEXT:    movw %bx, 14(%rax)
+; CHECK-BASELINE-NEXT:    movw %di, 12(%rax)
 ; CHECK-BASELINE-NEXT:    movw %bp, 10(%rax)
 ; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
@@ -3335,22 +3334,17 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
 ; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
-; CHECK-BASELINE-NEXT:    popq %r14
 ; CHECK-BASELINE-NEXT:    popq %rbp
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v8i16:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    pushq %rbp
-; CHECK-SSE1-NEXT:    pushq %r14
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    xorl %r10d, %r9d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorl %r11d, %r8d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    xorl %edi, %ecx
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-SSE1-NEXT:    xorl %ebx, %esi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
@@ -3359,29 +3353,32 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind
 ; CHECK-SSE1-NEXT:    xorl %ebx, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %ebx, %edx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    xorl %ebx, %ecx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT:    xorl %edi, %ecx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    xorl %ebx, %ecx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    xorl %ebx, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
-; CHECK-SSE1-NEXT:    xorl %r11d, %r8d
+; CHECK-SSE1-NEXT:    xorl %ebx, %r8d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
-; CHECK-SSE1-NEXT:    xorl %r10d, %r9d
+; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
 ; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT:    xorw %bx, %bp
+; CHECK-SSE1-NEXT:    xorw %di, %bp
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bp
-; CHECK-SSE1-NEXT:    xorl %ebx, %ebp
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    xorw %di, %bx
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
-; CHECK-SSE1-NEXT:    xorl %edi, %ebx
+; CHECK-SSE1-NEXT:    xorl %edi, %ebp
 ; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    xorw %r14w, %di
+; CHECK-SSE1-NEXT:    xorw %r11w, %di
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %di
-; CHECK-SSE1-NEXT:    xorl %r14d, %edi
-; CHECK-SSE1-NEXT:    movw %di, 14(%rax)
-; CHECK-SSE1-NEXT:    movw %bx, 12(%rax)
+; CHECK-SSE1-NEXT:    xorl %r11d, %edi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    xorw %r10w, %bx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
+; CHECK-SSE1-NEXT:    xorl %r10d, %ebx
+; CHECK-SSE1-NEXT:    movw %bx, 14(%rax)
+; CHECK-SSE1-NEXT:    movw %di, 12(%rax)
 ; CHECK-SSE1-NEXT:    movw %bp, 10(%rax)
 ; CHECK-SSE1-NEXT:    movw %r9w, 8(%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
@@ -3389,7 +3386,6 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
 ; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
-; CHECK-SSE1-NEXT:    popq %r14
 ; CHECK-SSE1-NEXT:    popq %rbp
 ; CHECK-SSE1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/urem-i8-constant.ll b/test/CodeGen/X86/urem-i8-constant.ll
index d4fd92c046475afd34d09e2ea7b19010d300e715..ae218405c0ef0392b6d59f981cf264afae61b111 100644
--- a/test/CodeGen/X86/urem-i8-constant.ll
+++ b/test/CodeGen/X86/urem-i8-constant.ll
@@ -6,14 +6,13 @@
 define i8 @foo(i8 %tmp325) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    imull $111, %ecx, %eax
-; CHECK-NEXT:    shrl $12, %eax
-; CHECK-NEXT:    movb $37, %dl
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    imull $111, %eax, %ecx
+; CHECK-NEXT:    shrl $12, %ecx
+; CHECK-NEXT:    leal (%ecx,%ecx,8), %edx
+; CHECK-NEXT:    leal (%ecx,%edx,4), %ecx
+; CHECK-NEXT:    subb %cl, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    mulb %dl
-; CHECK-NEXT:    subb %al, %cl
-; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    retl
   %t546 = urem i8 %tmp325, 37
   ret i8 %t546
diff --git a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 82385386c88e18061079f0f583d398fc7a39db84..4364d3d9363d9febe22202ae99e602c534508d4e 100644
--- a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -629,12 +629,12 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone {
 ;
 ; CHECK-AVX1-LABEL: test_urem_both:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-9.255967385052751E+61,-9.255967385052751E+61]
-; CHECK-AVX1-NEXT:    # xmm1 = mem[0,0]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
 ; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
 ; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
@@ -645,7 +645,7 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone {
 ;
 ; CHECK-AVX2-LABEL: test_urem_both:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [14757395262689946283,14757395262689946283]
+; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
 ; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
@@ -661,7 +661,7 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone {
 ;
 ; CHECK-AVX512VL-LABEL: test_urem_both:
 ; CHECK-AVX512VL:       # %bb.0:
-; CHECK-AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [14757395262689946283,14757395262689946283]
+; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
 ; CHECK-AVX512VL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; CHECK-AVX512VL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-AVX512VL-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
diff --git a/test/CodeGen/X86/usub_sat_vec.ll b/test/CodeGen/X86/usub_sat_vec.ll
new file mode 100644
index 0000000000000000000000000000000000000000..47a95423ee762cf5fb4bd08bb709cf911a7484aa
--- /dev/null
+++ b/test/CodeGen/X86/usub_sat_vec.ll
@@ -0,0 +1,1013 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <12 x i8> @llvm.usub.sat.v12i8(<12 x i8>, <12 x i8>)
+declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
+declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
+
+declare <1 x i16> @llvm.usub.sat.v1i16(<1 x i16>, <1 x i16>)
+declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>)
+declare <12 x i16> @llvm.usub.sat.v12i16(<12 x i16>, <12 x i16>)
+declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)
+
+declare <16 x i1> @llvm.usub.sat.v16i1(<16 x i1>, <16 x i1>)
+declare <16 x i4> @llvm.usub.sat.v16i4(<16 x i4>, <16 x i4>)
+
+declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i24> @llvm.usub.sat.v4i24(<4 x i24>, <4 x i24>)
+declare <2 x i128> @llvm.usub.sat.v2i128(<2 x i128>, <2 x i128>)
+
+; Legal types, depending on architecture.
+
+define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE-LABEL: v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
+  ret <16 x i8> %z
+}
+
+define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
+; SSE-LABEL: v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusb %xmm2, %xmm0
+; SSE-NEXT:    psubusb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsubusb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
+  ret <32 x i8> %z
+}
+
+define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
+; SSE-LABEL: v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusb %xmm4, %xmm0
+; SSE-NEXT:    psubusb %xmm5, %xmm1
+; SSE-NEXT:    psubusb %xmm6, %xmm2
+; SSE-NEXT:    psubusb %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsubusb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsubusb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsubusb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubusb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubusb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubusb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
+  ret <64 x i8> %z
+}
+
+define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE-LABEL: v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
+  ret <8 x i16> %z
+}
+
+define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
+; SSE-LABEL: v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusw %xmm2, %xmm0
+; SSE-NEXT:    psubusw %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
+  ret <16 x i16> %z
+}
+
+define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
+; SSE-LABEL: v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusw %xmm4, %xmm0
+; SSE-NEXT:    psubusw %xmm5, %xmm1
+; SSE-NEXT:    psubusw %xmm6, %xmm2
+; SSE-NEXT:    psubusw %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsubusw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsubusw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsubusw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubusw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubusw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
+  ret <32 x i16> %z
+}
+
+; Too narrow vectors, legalized by widening.
+
+define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
+; SSE-LABEL: v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    psubusb %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmovwb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <8 x i8>, <8 x i8>* %px
+  %y = load <8 x i8>, <8 x i8>* %py
+  %z = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %x, <8 x i8> %y)
+  store <8 x i8> %z, <8 x i8>* %pz
+  ret void
+}
+
+define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
+; SSE-LABEL: v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    psubusb %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT:    vpmovdb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <4 x i8>, <4 x i8>* %px
+  %y = load <4 x i8>, <4 x i8>* %py
+  %z = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
+  store <4 x i8> %z, <4 x i8>* %pz
+  ret void
+}
+
+define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
+; SSE2-LABEL: v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl (%rsi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    psubusb %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rdx)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v2i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movzwl (%rsi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    psubusb %xmm1, %xmm0
+; SSSE3-NEXT:    movd %xmm0, %eax
+; SSSE3-NEXT:    movw %ax, (%rdx)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    movzwl (%rsi), %eax
+; SSE41-NEXT:    movd %eax, %xmm1
+; SSE41-NEXT:    psubusb %xmm1, %xmm0
+; SSE41-NEXT:    pextrw $0, %xmm0, (%rdx)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    movzwl (%rsi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl (%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    movzwl (%rsi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movzwl (%rdi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    movzwl (%rsi), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm1
+; AVX512-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmovqb %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <2 x i8>, <2 x i8>* %px
+  %y = load <2 x i8>, <2 x i8>* %py
+  %z = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %x, <2 x i8> %y)
+  store <2 x i8> %z, <2 x i8>* %pz
+  ret void
+}
+
+define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
+; SSE-LABEL: v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    psubusw %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpmovdw %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <4 x i16>, <4 x i16>* %px
+  %y = load <4 x i16>, <4 x i16>* %py
+  %z = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %x, <4 x i16> %y)
+  store <4 x i16> %z, <4 x i16>* %pz
+  ret void
+}
+
+define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
+; SSE-LABEL: v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    psubusw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rdx)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vpmovqw %xmm0, (%rdx)
+; AVX512-NEXT:    retq
+  %x = load <2 x i16>, <2 x i16>* %px
+  %y = load <2 x i16>, <2 x i16>* %py
+  %z = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
+  store <2 x i16> %z, <2 x i16>* %pz
+  ret void
+}
+
+define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
+; SSE-LABEL: v12i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubusb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v12i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <12 x i8> @llvm.usub.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
+  ret <12 x i8> %z
+}
+
+define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind {
+; SSE-LABEL: v12i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm0
+; SSE-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE-NEXT:    psubusw (%rsi), %xmm0
+; SSE-NEXT:    psubusw 16(%rsi), %xmm1
+; SSE-NEXT:    movq %xmm1, 16(%rdx)
+; SSE-NEXT:    movdqa %xmm0, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v12i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vpsubusw (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vpsubusw 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v12i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpsubusw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v12i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-NEXT:    vpsubusw (%rsi), %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x = load <12 x i16>, <12 x i16>* %px
+  %y = load <12 x i16>, <12 x i16>* %py
+  %z = call <12 x i16> @llvm.usub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
+  store <12 x i16> %z, <12 x i16>* %pz
+  ret void
+}
+
+; Scalarization
+
+define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
+; SSE-LABEL: v1i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movb (%rdi), %al
+; SSE-NEXT:    subb (%rsi), %al
+; SSE-NEXT:    jae .LBB13_2
+; SSE-NEXT:  # %bb.1:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:  .LBB13_2:
+; SSE-NEXT:    movb %al, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v1i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movb (%rdi), %al
+; AVX-NEXT:    subb (%rsi), %al
+; AVX-NEXT:    jae .LBB13_2
+; AVX-NEXT:  # %bb.1:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:  .LBB13_2:
+; AVX-NEXT:    movb %al, (%rdx)
+; AVX-NEXT:    retq
+  %x = load <1 x i8>, <1 x i8>* %px
+  %y = load <1 x i8>, <1 x i8>* %py
+  %z = call <1 x i8> @llvm.usub.sat.v1i8(<1 x i8> %x, <1 x i8> %y)
+  store <1 x i8> %z, <1 x i8>* %pz
+  ret void
+}
+
+define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
+; SSE-LABEL: v1i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movzwl (%rdi), %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    subw (%rsi), %ax
+; SSE-NEXT:    cmovbl %ecx, %eax
+; SSE-NEXT:    movw %ax, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v1i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    subw (%rsi), %ax
+; AVX-NEXT:    cmovbl %ecx, %eax
+; AVX-NEXT:    movw %ax, (%rdx)
+; AVX-NEXT:    retq
+  %x = load <1 x i16>, <1 x i16>* %px
+  %y = load <1 x i16>, <1 x i16>* %py
+  %z = call <1 x i16> @llvm.usub.sat.v1i16(<1 x i16> %x, <1 x i16> %y)
+  store <1 x i16> %z, <1 x i16>* %pz
+  ret void
+}
+
+; Promotion
+
+define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
+; SSE-LABEL: v16i4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $4, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psllw $4, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psubusb %xmm1, %xmm0
+; SSE-NEXT:    psrlw $4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v16i4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
+  ret <16 x i4> %z
+}
+
+define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
+; SSE-LABEL: v16i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $7, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psubusb %xmm1, %xmm0
+; SSE-NEXT:    psrlw $7, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: v16i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovb2m %xmm0, %k0
+; AVX512-NEXT:    vpsllw $7, %xmm1, %xmm0
+; AVX512-NEXT:    vpmovb2m %xmm0, %k1
+; AVX512-NEXT:    kandnw %k0, %k1, %k0
+; AVX512-NEXT:    vpmovm2b %k0, %xmm0
+; AVX512-NEXT:    retq
+  %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
+  ret <16 x i1> %z
+}
+
+; Expanded
+
+define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; SSE2-LABEL: v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %ecx
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    subl %eax, %ecx
+; SSE2-NEXT:    cmovbl %edx, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm3, %ecx
+; SSE2-NEXT:    subl %eax, %ecx
+; SSE2-NEXT:    cmovbl %edx, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    subl %eax, %ecx
+; SSE2-NEXT:    cmovbl %edx, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    subl %eax, %ecx
+; SSE2-NEXT:    cmovbl %edx, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %ecx
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    subl %eax, %ecx
+; SSSE3-NEXT:    cmovbl %edx, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm3, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm3, %ecx
+; SSSE3-NEXT:    subl %eax, %ecx
+; SSSE3-NEXT:    cmovbl %edx, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    movd %xmm0, %ecx
+; SSSE3-NEXT:    subl %eax, %ecx
+; SSSE3-NEXT:    cmovbl %edx, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm0, %ecx
+; SSSE3-NEXT:    subl %eax, %ecx
+; SSSE3-NEXT:    cmovbl %edx, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrd $1, %xmm1, %eax
+; SSE41-NEXT:    pextrd $1, %xmm0, %ecx
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    subl %eax, %ecx
+; SSE41-NEXT:    cmovbl %edx, %ecx
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    movd %xmm0, %esi
+; SSE41-NEXT:    subl %eax, %esi
+; SSE41-NEXT:    cmovbl %edx, %esi
+; SSE41-NEXT:    movd %esi, %xmm2
+; SSE41-NEXT:    pinsrd $1, %ecx, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm1, %eax
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    subl %eax, %ecx
+; SSE41-NEXT:    cmovbl %edx, %ecx
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm2
+; SSE41-NEXT:    pextrd $3, %xmm1, %eax
+; SSE41-NEXT:    pextrd $3, %xmm0, %ecx
+; SSE41-NEXT:    subl %eax, %ecx
+; SSE41-NEXT:    cmovbl %edx, %ecx
+; SSE41-NEXT:    pinsrd $3, %ecx, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrd $1, %xmm1, %eax
+; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
+; AVX-NEXT:    xorl %edx, %edx
+; AVX-NEXT:    subl %eax, %ecx
+; AVX-NEXT:    cmovbl %edx, %ecx
+; AVX-NEXT:    vmovd %xmm1, %eax
+; AVX-NEXT:    vmovd %xmm0, %esi
+; AVX-NEXT:    subl %eax, %esi
+; AVX-NEXT:    cmovbl %edx, %esi
+; AVX-NEXT:    vmovd %esi, %xmm2
+; AVX-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrd $2, %xmm1, %eax
+; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
+; AVX-NEXT:    subl %eax, %ecx
+; AVX-NEXT:    cmovbl %edx, %ecx
+; AVX-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
+; AVX-NEXT:    subl %eax, %ecx
+; AVX-NEXT:    cmovbl %edx, %ecx
+; AVX-NEXT:    vpinsrd $3, %ecx, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %z = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
+  ret <4 x i32> %z
+}
+
+define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
+; SSE2-LABEL: v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    subq %rax, %rcx
+; SSE2-NEXT:    cmovbq %rdx, %rcx
+; SSE2-NEXT:    movq %rcx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    subq %rax, %rcx
+; SSE2-NEXT:    cmovbq %rdx, %rcx
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT:    psrlq $32, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    psllq $32, %xmm1
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    movq %xmm0, %rcx
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    subq %rax, %rcx
+; SSSE3-NEXT:    cmovbq %rdx, %rcx
+; SSSE3-NEXT:    movq %rcx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movq %xmm0, %rcx
+; SSSE3-NEXT:    subq %rax, %rcx
+; SSSE3-NEXT:    cmovbq %rdx, %rcx
+; SSSE3-NEXT:    movq %rcx, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSSE3-NEXT:    psrlq $32, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    subq %rax, %rcx
+; SSE41-NEXT:    cmovbq %rdx, %rcx
+; SSE41-NEXT:    movq %rcx, %xmm2
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    movq %xmm0, %rcx
+; SSE41-NEXT:    subq %rax, %rcx
+; SSE41-NEXT:    cmovbq %rdx, %rcx
+; SSE41-NEXT:    movq %rcx, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX-NEXT:    xorl %edx, %edx
+; AVX-NEXT:    subq %rax, %rcx
+; AVX-NEXT:    cmovbq %rdx, %rcx
+; AVX-NEXT:    vmovq %rcx, %xmm2
+; AVX-NEXT:    vmovq %xmm1, %rax
+; AVX-NEXT:    vmovq %xmm0, %rcx
+; AVX-NEXT:    subq %rax, %rcx
+; AVX-NEXT:    cmovbq %rdx, %rcx
+; AVX-NEXT:    vmovq %rcx, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
+  ret <2 x i32> %z
+}
+
+define <4 x i24> @v4i24(<4 x i24> %x, <4 x i24> %y) nounwind {
+; SSE2-LABEL: v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    pslld $8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE2-NEXT:    movd %xmm2, %ecx
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    subl %eax, %ecx
+; SSE2-NEXT:    cmovbl %edx, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm3, %ecx
+; SSE2-NEXT:    subl %eax, %ecx
+; SSE2-NEXT:    cmovbl %edx, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    subl %eax, %ecx
+; SSE2-NEXT:    cmovbl %edx, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    subl %eax, %ecx
+; SSE2-NEXT:    cmovbl %edx, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    psrld $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pslld $8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    pslld $8, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm2, %ecx
+; SSSE3-NEXT:    xorl %edx, %edx
+; SSSE3-NEXT:    subl %eax, %ecx
+; SSSE3-NEXT:    cmovbl %edx, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm3, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm3, %ecx
+; SSSE3-NEXT:    subl %eax, %ecx
+; SSSE3-NEXT:    cmovbl %edx, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    movd %xmm0, %ecx
+; SSSE3-NEXT:    subl %eax, %ecx
+; SSSE3-NEXT:    cmovbl %edx, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm0, %ecx
+; SSSE3-NEXT:    subl %eax, %ecx
+; SSSE3-NEXT:    cmovbl %edx, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT:    psrld $8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $8, %xmm1
+; SSE41-NEXT:    pextrd $1, %xmm1, %eax
+; SSE41-NEXT:    pslld $8, %xmm0
+; SSE41-NEXT:    pextrd $1, %xmm0, %ecx
+; SSE41-NEXT:    xorl %edx, %edx
+; SSE41-NEXT:    subl %eax, %ecx
+; SSE41-NEXT:    cmovbl %edx, %ecx
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    movd %xmm0, %esi
+; SSE41-NEXT:    subl %eax, %esi
+; SSE41-NEXT:    cmovbl %edx, %esi
+; SSE41-NEXT:    movd %esi, %xmm2
+; SSE41-NEXT:    pinsrd $1, %ecx, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm1, %eax
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    subl %eax, %ecx
+; SSE41-NEXT:    cmovbl %edx, %ecx
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm2
+; SSE41-NEXT:    pextrd $3, %xmm1, %eax
+; SSE41-NEXT:    pextrd $3, %xmm0, %ecx
+; SSE41-NEXT:    subl %eax, %ecx
+; SSE41-NEXT:    cmovbl %edx, %ecx
+; SSE41-NEXT:    pinsrd $3, %ecx, %xmm2
+; SSE41-NEXT:    psrld $8, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: v4i24:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX-NEXT:    vpextrd $1, %xmm1, %eax
+; AVX-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
+; AVX-NEXT:    xorl %edx, %edx
+; AVX-NEXT:    subl %eax, %ecx
+; AVX-NEXT:    cmovbl %edx, %ecx
+; AVX-NEXT:    vmovd %xmm1, %eax
+; AVX-NEXT:    vmovd %xmm0, %esi
+; AVX-NEXT:    subl %eax, %esi
+; AVX-NEXT:    cmovbl %edx, %esi
+; AVX-NEXT:    vmovd %esi, %xmm2
+; AVX-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrd $2, %xmm1, %eax
+; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
+; AVX-NEXT:    subl %eax, %ecx
+; AVX-NEXT:    cmovbl %edx, %ecx
+; AVX-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
+; AVX-NEXT:    subl %eax, %ecx
+; AVX-NEXT:    cmovbl %edx, %ecx
+; AVX-NEXT:    vpinsrd $3, %ecx, %xmm2, %xmm0
+; AVX-NEXT:    vpsrld $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %z = call <4 x i24> @llvm.usub.sat.v4i24(<4 x i24> %x, <4 x i24> %y)
+  ret <4 x i24> %z
+}
+
+define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
+; SSE-LABEL: v2i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    xorl %edi, %edi
+; SSE-NEXT:    subq %r9, %rsi
+; SSE-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT:    cmovbq %rdi, %rsi
+; SSE-NEXT:    cmovbq %rdi, %rdx
+; SSE-NEXT:    subq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    sbbq {{[0-9]+}}(%rsp), %r8
+; SSE-NEXT:    cmovbq %rdi, %r8
+; SSE-NEXT:    cmovbq %rdi, %rcx
+; SSE-NEXT:    movq %r8, 24(%rax)
+; SSE-NEXT:    movq %rcx, 16(%rax)
+; SSE-NEXT:    movq %rdx, 8(%rax)
+; SSE-NEXT:    movq %rsi, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: v2i128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    xorl %edi, %edi
+; AVX-NEXT:    subq %r9, %rsi
+; AVX-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT:    cmovbq %rdi, %rsi
+; AVX-NEXT:    cmovbq %rdi, %rdx
+; AVX-NEXT:    subq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    sbbq {{[0-9]+}}(%rsp), %r8
+; AVX-NEXT:    cmovbq %rdi, %r8
+; AVX-NEXT:    cmovbq %rdi, %rcx
+; AVX-NEXT:    movq %r8, 24(%rax)
+; AVX-NEXT:    movq %rcx, 16(%rax)
+; AVX-NEXT:    movq %rdx, 8(%rax)
+; AVX-NEXT:    movq %rsi, (%rax)
+; AVX-NEXT:    retq
+  %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
+  ret <2 x i128> %z
+}
diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll
index f2e6f3029263862d4ce3071da15b9625fc2f056d..7e1780449fd19617ed627f30a57f553c1f496d96 100644
--- a/test/CodeGen/X86/var-permute-256.ll
+++ b/test/CodeGen/X86/var-permute-256.ll
@@ -25,20 +25,18 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
 ;
 ; AVX1-LABEL: var_shuffle_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddq %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm4
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: var_shuffle_v4i64:
@@ -90,17 +88,15 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
 ;
 ; AVX1-LABEL: var_shuffle_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vpermilps %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; INT256-LABEL: var_shuffle_v8i32:
@@ -455,20 +451,18 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun
 ;
 ; AVX1-LABEL: var_shuffle_v4f64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddq %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm4
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: var_shuffle_v4f64:
@@ -520,17 +514,15 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
 ;
 ; AVX1-LABEL: var_shuffle_v8f32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vpermilps %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; INT256-LABEL: var_shuffle_v8f32:
@@ -584,20 +576,18 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices)
 ; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddq %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm4
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
@@ -653,17 +643,15 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices)
 ; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vpermilps %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
@@ -888,11 +876,10 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
 ; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
-; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
-; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2 {%k1}
-; AVX512VLBW-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
 ; VLVBMI-LABEL: var_shuffle_v32i8_from_v16i8:
@@ -1015,20 +1002,18 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in
 ; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddq %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm4
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
@@ -1084,17 +1069,15 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indi
 ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vpermilps %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; INT256-LABEL: var_shuffle_v8f32_from_v4f32:
@@ -1148,11 +1131,7 @@ define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices)
 ; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,3,3,3,3,3,3,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm0, %xmm4
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX1-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vec_cast.ll b/test/CodeGen/X86/vec_cast.ll
index dc406c42eccc012242fee1067adfd3e4f515df71..23870bcf5654aa3684eae1c813c8d14241180eb2 100644
--- a/test/CodeGen/X86/vec_cast.ll
+++ b/test/CodeGen/X86/vec_cast.ll
@@ -28,10 +28,10 @@ define <8 x i32> @a(<8 x i16> %a) nounwind {
 define <3 x i32> @b(<3 x i16> %a) nounwind {
 ; CHECK-LIN-LABEL: b:
 ; CHECK-LIN:       # %bb.0:
-; CHECK-LIN-NEXT:    movd %edi, %xmm0
-; CHECK-LIN-NEXT:    pinsrw $1, %esi, %xmm0
-; CHECK-LIN-NEXT:    pinsrw $2, %edx, %xmm0
-; CHECK-LIN-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-LIN-NEXT:    pxor %xmm0, %xmm0
+; CHECK-LIN-NEXT:    pinsrw $1, %edi, %xmm0
+; CHECK-LIN-NEXT:    pinsrw $3, %esi, %xmm0
+; CHECK-LIN-NEXT:    pinsrw $5, %edx, %xmm0
 ; CHECK-LIN-NEXT:    psrad $16, %xmm0
 ; CHECK-LIN-NEXT:    retq
 ;
@@ -40,10 +40,10 @@ define <3 x i32> @b(<3 x i16> %a) nounwind {
 ; CHECK-WIN-NEXT:    # kill: def $r8w killed $r8w def $r8d
 ; CHECK-WIN-NEXT:    # kill: def $dx killed $dx def $edx
 ; CHECK-WIN-NEXT:    # kill: def $cx killed $cx def $ecx
-; CHECK-WIN-NEXT:    movd %ecx, %xmm0
-; CHECK-WIN-NEXT:    pinsrw $1, %edx, %xmm0
-; CHECK-WIN-NEXT:    pinsrw $2, %r8d, %xmm0
-; CHECK-WIN-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-WIN-NEXT:    pxor %xmm0, %xmm0
+; CHECK-WIN-NEXT:    pinsrw $1, %ecx, %xmm0
+; CHECK-WIN-NEXT:    pinsrw $3, %edx, %xmm0
+; CHECK-WIN-NEXT:    pinsrw $5, %r8d, %xmm0
 ; CHECK-WIN-NEXT:    psrad $16, %xmm0
 ; CHECK-WIN-NEXT:    retq
   %c = sext <3 x i16> %a to <3 x i32>
@@ -88,11 +88,10 @@ define <8 x i32> @d(<8 x i16> %a) nounwind {
 define <3 x i32> @e(<3 x i16> %a) nounwind {
 ; CHECK-LIN-LABEL: e:
 ; CHECK-LIN:       # %bb.0:
-; CHECK-LIN-NEXT:    movd %edi, %xmm0
-; CHECK-LIN-NEXT:    pinsrw $1, %esi, %xmm0
-; CHECK-LIN-NEXT:    pinsrw $2, %edx, %xmm0
-; CHECK-LIN-NEXT:    pxor %xmm1, %xmm1
-; CHECK-LIN-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-LIN-NEXT:    pxor %xmm0, %xmm0
+; CHECK-LIN-NEXT:    pinsrw $0, %edi, %xmm0
+; CHECK-LIN-NEXT:    pinsrw $2, %esi, %xmm0
+; CHECK-LIN-NEXT:    pinsrw $4, %edx, %xmm0
 ; CHECK-LIN-NEXT:    retq
 ;
 ; CHECK-WIN-LABEL: e:
@@ -100,11 +99,10 @@ define <3 x i32> @e(<3 x i16> %a) nounwind {
 ; CHECK-WIN-NEXT:    # kill: def $r8w killed $r8w def $r8d
 ; CHECK-WIN-NEXT:    # kill: def $dx killed $dx def $edx
 ; CHECK-WIN-NEXT:    # kill: def $cx killed $cx def $ecx
-; CHECK-WIN-NEXT:    movd %ecx, %xmm0
-; CHECK-WIN-NEXT:    pinsrw $1, %edx, %xmm0
-; CHECK-WIN-NEXT:    pinsrw $2, %r8d, %xmm0
-; CHECK-WIN-NEXT:    pxor %xmm1, %xmm1
-; CHECK-WIN-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-WIN-NEXT:    pxor %xmm0, %xmm0
+; CHECK-WIN-NEXT:    pinsrw $0, %ecx, %xmm0
+; CHECK-WIN-NEXT:    pinsrw $2, %edx, %xmm0
+; CHECK-WIN-NEXT:    pinsrw $4, %r8d, %xmm0
 ; CHECK-WIN-NEXT:    retq
   %c = zext <3 x i16> %a to <3 x i32>
   ret <3 x i32> %c
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 1bc4b690487f5f09bb9700d22673d7178f530e63..8e1ffe6834f5dc62565b034ce0dba8d151775116 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -172,29 +172,10 @@ define <8 x i8> @cvt_v8f32_v8i8(<8 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v8f32_v8i8:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    vmovd %ecx, %xmm1
-; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-WIDE-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    vzeroupper
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <8 x float> %src to <8 x i8>
@@ -229,17 +210,8 @@ define <4 x i8> @cvt_v4f32_v4i8(<4 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v4f32_v4i8:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    vmovd %ecx, %xmm1
-; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
@@ -253,11 +225,8 @@ define <4 x i16> @cvt_v4f32_v4i16(<4 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v4f32_v4i16:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vzeroupper
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i16>
   ret <4 x i16> %res
@@ -274,29 +243,10 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v8f32_v8u8:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    vmovd %ecx, %xmm1
-; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-WIDE-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    vzeroupper
 ; CHECK-WIDE-NEXT:    retl
   %res = fptoui <8 x float> %src to <8 x i8>
@@ -331,17 +281,8 @@ define <4 x i8> @cvt_v4f32_v4u8(<4 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v4f32_v4u8:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    vmovd %ecx, %xmm1
-; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; CHECK-WIDE-NEXT:    retl
   %res = fptoui <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
@@ -355,11 +296,8 @@ define <4 x i16> @cvt_v4f32_v4u16(<4 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v4f32_v4u16:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vzeroupper
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    retl
   %res = fptoui <4 x float> %src to <4 x i16>
   ret <4 x i16> %res
diff --git a/test/CodeGen/X86/vec_cast3.ll b/test/CodeGen/X86/vec_cast3.ll
index e8662b8cc34d9394940fe9fcc3c2ff230c61ec0d..e4e6aa52ff56c85c911acb026f8f0c3b590cd631 100644
--- a/test/CodeGen/X86/vec_cast3.ll
+++ b/test/CodeGen/X86/vec_cast3.ll
@@ -117,11 +117,8 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2i8:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    vmovd %ecx, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <2 x float> %src to <2 x i8>
   ret <2 x i8> %res
@@ -136,11 +133,8 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2i16:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vzeroupper
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <2 x float> %src to <2 x i16>
   ret <2 x i16> %res
@@ -170,11 +164,8 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2u8:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    vmovd %ecx, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; CHECK-WIDE-NEXT:    retl
   %res = fptoui <2 x float> %src to <2 x i8>
   ret <2 x i8> %res
@@ -189,11 +180,8 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2u16:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vzeroupper
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; CHECK-WIDE-NEXT:    retl
   %res = fptoui <2 x float> %src to <2 x i16>
   ret <2 x i16> %res
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index 2933c567fbb8c6ee97db7f46c3d7146134b2c202..ca3d87c4c612c76e08e8650105e31fa8c65e1987 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -25,17 +25,13 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind {
 define float @t2(<4 x float>* %P1) nounwind {
 ; X32-LABEL: t2:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    movss %xmm0, (%esp)
-; X32-NEXT:    flds (%esp)
-; X32-NEXT:    popl %eax
+; X32-NEXT:    flds 8(%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: t2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    retq
   %X = load <4 x float>, <4 x float>* %P1
   %tmp = extractelement <4 x float> %X, i32 2
diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index e84f7163bbef65dbaa5a4ef6070a8fe7f1f07d37..f0c9899068387d2aca83019576f1444ccbe38f58 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll
@@ -127,3 +127,18 @@ define <4 x float> @fsub0_undef_elts_v4f32(<4 x float> %x) {
   ret <4 x float> %r
 }
 
+define <4 x float> @fneg(<4 x float> %Q) nounwind {
+; X32-SSE-LABEL: fneg:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    xorps {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: fneg:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT:    retq
+  %tmp = fneg <4 x float> %Q
+  ret <4 x float> %tmp
+}
+
+
diff --git a/test/CodeGen/X86/vec_fp_to_int-widen.ll b/test/CodeGen/X86/vec_fp_to_int-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6c579f50c1a7e2395531282d5a45e70fc9c41485
--- /dev/null
+++ b/test/CodeGen/X86/vec_fp_to_int-widen.ll
@@ -0,0 +1,2646 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ
+;
+; 32-bit tests to make sure we're not doing anything stupid.
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2
+
+;
+; Double to Signed Integer
+;
+
+define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f64_to_2i64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttsd2si %xmm0, %rax
+; VEX-NEXT:    vmovq %rax, %xmm1
+; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; VEX-NEXT:    vcvttsd2si %xmm0, %rax
+; VEX-NEXT:    vmovq %rax, %xmm0
+; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f64_to_2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i64>
+  ret <2 x i64> %cvt
+}
+
+define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f64_to_4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i32>
+  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %ext
+}
+
+define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f64_to_2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i32>
+  ret <2 x i32> %cvt
+}
+
+define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
+; SSE-LABEL: fptosi_4f64_to_2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm1
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_4f64_to_2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %cvt = fptosi <4 x double> %ext to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
+; SSE-LABEL: fptosi_4f64_to_4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT:    cvttsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvttsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptosi_4f64_to_4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vcvttsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vcvttsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptosi_4f64_to_4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vcvttsd2si %xmm1, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm2
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT:    vcvttsd2si %xmm1, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm2
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_4f64_to_4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vcvttsd2si %xmm1, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512F-NEXT:    vcvttsd2si %xmm1, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_4f64_to_4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vcvttsd2si %xmm1, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512VL-NEXT:    vcvttsd2si %xmm1, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_4f64_to_4i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2qq %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptosi <4 x double> %a to <4 x i64>
+  ret <4 x i64> %cvt
+}
+
+define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
+; SSE-LABEL: fptosi_4f64_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_4f64_to_4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %cvt = fptosi <4 x double> %a to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+;
+; Double to Unsigned Integer
+;
+
+define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    subsd %xmm2, %xmm1
+; SSE-NEXT:    cvttsd2si %xmm1, %rax
+; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttsd2si %xmm0, %rdx
+; SSE-NEXT:    ucomisd %xmm2, %xmm0
+; SSE-NEXT:    cmovaeq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movapd %xmm0, %xmm3
+; SSE-NEXT:    subsd %xmm2, %xmm3
+; SSE-NEXT:    cvttsd2si %xmm3, %rax
+; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttsd2si %xmm0, %rcx
+; SSE-NEXT:    ucomisd %xmm2, %xmm0
+; SSE-NEXT:    cmovaeq %rax, %rcx
+; SSE-NEXT:    movq %rcx, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f64_to_2i64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; VEX-NEXT:    vsubsd %xmm1, %xmm0, %xmm2
+; VEX-NEXT:    vcvttsd2si %xmm2, %rax
+; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vcvttsd2si %xmm0, %rdx
+; VEX-NEXT:    vucomisd %xmm1, %xmm0
+; VEX-NEXT:    cmovaeq %rax, %rdx
+; VEX-NEXT:    vmovq %rdx, %xmm2
+; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; VEX-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
+; VEX-NEXT:    vcvttsd2si %xmm3, %rax
+; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vcvttsd2si %xmm0, %rcx
+; VEX-NEXT:    vucomisd %xmm1, %xmm0
+; VEX-NEXT:    cmovaeq %rax, %rcx
+; VEX-NEXT:    vmovq %rcx, %xmm0
+; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i64>
+  ret <2 x i64> %cvt
+}
+
+define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvttsd2si %xmm0, %rcx
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movd %ecx, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_2f64_to_4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vpackssdw %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm3
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_2f64_to_4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpackssdw %xmm0, %xmm2, %xmm2
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2udq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_4i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i32>
+  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %ext
+}
+
+define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_2f64_to_2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm3
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_2f64_to_2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2udq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i32>
+  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  ret <4 x i32> %ext
+}
+
+define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
+; SSE-LABEL: fptoui_4f64_to_2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_4f64_to_2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd %xmm0, %xmm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f64_to_2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovapd %xmm0, %xmm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_4f64_to_2i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovaps %xmm0, %xmm0
+; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_4f64_to_2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovaps %xmm0, %xmm0
+; AVX512VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_4f64_to_2i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps %xmm0, %xmm0
+; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vmovaps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %ext = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = fptoui <4 x double> %ext to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
+; SSE-LABEL: fptoui_4f64_to_4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd %xmm0, %xmm2
+; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE-NEXT:    subsd %xmm3, %xmm0
+; SSE-NEXT:    cvttsd2si %xmm0, %rcx
+; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttsd2si %xmm2, %rdx
+; SSE-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    movapd %xmm2, %xmm4
+; SSE-NEXT:    subsd %xmm3, %xmm4
+; SSE-NEXT:    cvttsd2si %xmm4, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttsd2si %xmm2, %rdx
+; SSE-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm2
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT:    movapd %xmm1, %xmm2
+; SSE-NEXT:    subsd %xmm3, %xmm2
+; SSE-NEXT:    cvttsd2si %xmm2, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttsd2si %xmm1, %rdx
+; SSE-NEXT:    ucomisd %xmm3, %xmm1
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm2
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    movapd %xmm1, %xmm4
+; SSE-NEXT:    subsd %xmm3, %xmm4
+; SSE-NEXT:    cvttsd2si %xmm4, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttsd2si %xmm1, %rax
+; SSE-NEXT:    ucomisd %xmm3, %xmm1
+; SSE-NEXT:    cmovaeq %rcx, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_4f64_to_4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vsubsd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vcvttsd2si %xmm3, %rax
+; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttsd2si %xmm2, %rdx
+; AVX1-NEXT:    vucomisd %xmm1, %xmm2
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm3
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX1-NEXT:    vsubsd %xmm1, %xmm2, %xmm4
+; AVX1-NEXT:    vcvttsd2si %xmm4, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttsd2si %xmm2, %rdx
+; AVX1-NEXT:    vucomisd %xmm1, %xmm2
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vcvttsd2si %xmm3, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttsd2si %xmm0, %rdx
+; AVX1-NEXT:    vucomisd %xmm1, %xmm0
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm3
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vcvttsd2si %xmm4, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttsd2si %xmm0, %rcx
+; AVX1-NEXT:    vucomisd %xmm1, %xmm0
+; AVX1-NEXT:    cmovaeq %rax, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f64_to_4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vsubsd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vcvttsd2si %xmm3, %rax
+; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttsd2si %xmm2, %rdx
+; AVX2-NEXT:    vucomisd %xmm1, %xmm2
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm3
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX2-NEXT:    vsubsd %xmm1, %xmm2, %xmm4
+; AVX2-NEXT:    vcvttsd2si %xmm4, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttsd2si %xmm2, %rdx
+; AVX2-NEXT:    vucomisd %xmm1, %xmm2
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
+; AVX2-NEXT:    vcvttsd2si %xmm3, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttsd2si %xmm0, %rdx
+; AVX2-NEXT:    vucomisd %xmm1, %xmm0
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm3
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vsubsd %xmm1, %xmm0, %xmm4
+; AVX2-NEXT:    vcvttsd2si %xmm4, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttsd2si %xmm0, %rcx
+; AVX2-NEXT:    vucomisd %xmm1, %xmm0
+; AVX2-NEXT:    cmovaeq %rax, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_4f64_to_4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vcvttsd2usi %xmm1, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512F-NEXT:    vcvttsd2usi %xmm1, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_4f64_to_4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vcvttsd2usi %xmm1, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512VL-NEXT:    vcvttsd2usi %xmm1, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_4f64_to_4i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2uqq %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <4 x double> %a to <4 x i64>
+  ret <4 x i64> %cvt
+}
+
+define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
+; SSE-LABEL: fptoui_4f64_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttsd2si %xmm1, %rax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvttsd2si %xmm1, %rax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvttsd2si %xmm0, %rax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_4f64_to_4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f64_to_4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_4f64_to_4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_4f64_to_4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_4f64_to_4i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <4 x double> %a to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+;
+; Float to Signed Integer
+;
+
+define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) {
+; SSE-LABEL: fptosi_2f32_to_2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f32_to_2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x float> %a to <2 x i32>
+  ret <2 x i32> %cvt
+}
+
+define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) {
+; SSE-LABEL: fptosi_4f32_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_4f32_to_4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = fptosi <4 x float> %a to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
+; SSE-LABEL: fptosi_2f32_to_2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f32_to_2i64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttss2si %xmm0, %rax
+; VEX-NEXT:    vmovq %rax, %xmm1
+; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; VEX-NEXT:    vcvttss2si %xmm0, %rax
+; VEX-NEXT:    vmovq %rax, %xmm0
+; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %cvt = fptosi <2 x float> %shuf to <2 x i64>
+  ret <2 x i64> %cvt
+}
+
+define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
+; SSE-LABEL: fptosi_4f32_to_2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_4f32_to_2i64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; VEX-NEXT:    vcvttss2si %xmm1, %rax
+; VEX-NEXT:    vcvttss2si %xmm0, %rcx
+; VEX-NEXT:    vmovq %rcx, %xmm0
+; VEX-NEXT:    vmovq %rax, %xmm1
+; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_4f32_to_2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2si %xmm1, %rax
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX512F-NEXT:    vmovq %rcx, %xmm0
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_4f32_to_2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2si %xmm1, %rax
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX512VL-NEXT:    vmovq %rcx, %xmm0
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_4f32_to_2i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptosi <4 x float> %a to <4 x i64>
+  %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x i64> %shuf
+}
+
+define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) {
+; SSE-LABEL: fptosi_8f32_to_8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_8f32_to_8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %cvt = fptosi <8 x float> %a to <8 x i32>
+  ret <8 x i32> %cvt
+}
+
+define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
+; SSE-LABEL: fptosi_4f32_to_4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; SSE-NEXT:    cvttss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
+; SSE-NEXT:    cvttss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptosi_4f32_to_4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vcvttss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT:    vcvttss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvttss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vcvttss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptosi_4f32_to_4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT:    vcvttss2si %xmm1, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT:    vcvttss2si %xmm2, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vcvttss2si %xmm0, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm2
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vcvttss2si %xmm0, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_4f32_to_4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vcvttss2si %xmm1, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512F-NEXT:    vcvttss2si %xmm2, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_4f32_to_4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512VL-NEXT:    vcvttss2si %xmm1, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-NEXT:    vcvttss2si %xmm2, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_4f32_to_4i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = fptosi <4 x float> %shuf to <4 x i64>
+  ret <4 x i64> %cvt
+}
+
+define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
+; SSE-LABEL: fptosi_8f32_to_4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; SSE-NEXT:    cvttss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
+; SSE-NEXT:    cvttss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptosi_8f32_to_4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vcvttss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT:    vcvttss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvttss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vcvttss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptosi_8f32_to_4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT:    vcvttss2si %xmm1, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT:    vcvttss2si %xmm2, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vcvttss2si %xmm0, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm2
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vcvttss2si %xmm0, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_8f32_to_4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2si %xmm1, %rax
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F-NEXT:    vcvttss2si %xmm1, %rdx
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rsi
+; AVX512F-NEXT:    vmovq %rsi, %xmm0
+; AVX512F-NEXT:    vmovq %rdx, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    vmovq %rcx, %xmm1
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_8f32_to_4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2si %xmm1, %rax
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT:    vcvttss2si %xmm1, %rdx
+; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rsi
+; AVX512VL-NEXT:    vmovq %rsi, %xmm0
+; AVX512VL-NEXT:    vmovq %rdx, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    vmovq %rcx, %xmm1
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_8f32_to_4i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_8f32_to_4i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512VLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptosi <8 x float> %a to <8 x i64>
+  %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i64> %shuf
+}
+
+;
+; Float to Unsigned Integer
+;
+
+define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
+; SSE-LABEL: fptoui_2f32_to_2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    cmpltps %xmm2, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE-NEXT:    subps %xmm2, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps %xmm1, %xmm3
+; SSE-NEXT:    andnps %xmm0, %xmm1
+; SSE-NEXT:    orps %xmm3, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_2f32_to_2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_2f32_to_2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorps %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2udq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2udq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <2 x float> %a to <2 x i32>
+  ret <2 x i32> %cvt
+}
+
+define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
+; SSE-LABEL: fptoui_4f32_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    cmpltps %xmm2, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE-NEXT:    subps %xmm2, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps %xmm1, %xmm3
+; SSE-NEXT:    andnps %xmm0, %xmm1
+; SSE-NEXT:    orps %xmm3, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_4f32_to_4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f32_to_4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorps %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_4f32_to_4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_4f32_to_4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2udq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_4f32_to_4i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2udq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <4 x float> %a to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
+; SSE-LABEL: fptoui_2f32_to_2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    subss %xmm2, %xmm1
+; SSE-NEXT:    cvttss2si %xmm1, %rax
+; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttss2si %xmm0, %rdx
+; SSE-NEXT:    ucomiss %xmm2, %xmm0
+; SSE-NEXT:    cmovaeq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT:    movaps %xmm0, %xmm3
+; SSE-NEXT:    subss %xmm2, %xmm3
+; SSE-NEXT:    cvttss2si %xmm3, %rax
+; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttss2si %xmm0, %rcx
+; SSE-NEXT:    ucomiss %xmm2, %xmm0
+; SSE-NEXT:    cmovaeq %rax, %rcx
+; SSE-NEXT:    movq %rcx, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f32_to_2i64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
+; VEX-NEXT:    vcvttss2si %xmm2, %rax
+; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vcvttss2si %xmm0, %rdx
+; VEX-NEXT:    vucomiss %xmm1, %xmm0
+; VEX-NEXT:    cmovaeq %rax, %rdx
+; VEX-NEXT:    vmovq %rdx, %xmm2
+; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm3
+; VEX-NEXT:    vcvttss2si %xmm3, %rax
+; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vcvttss2si %xmm0, %rcx
+; VEX-NEXT:    vucomiss %xmm1, %xmm0
+; VEX-NEXT:    cmovaeq %rax, %rcx
+; VEX-NEXT:    vmovq %rcx, %xmm0
+; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %cvt = fptoui <2 x float> %shuf to <2 x i64>
+  ret <2 x i64> %cvt
+}
+
+define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
+; SSE-LABEL: fptoui_4f32_to_2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    subss %xmm2, %xmm1
+; SSE-NEXT:    cvttss2si %xmm1, %rax
+; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttss2si %xmm0, %rdx
+; SSE-NEXT:    ucomiss %xmm2, %xmm0
+; SSE-NEXT:    cmovaeq %rax, %rdx
+; SSE-NEXT:    movq %rdx, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT:    movaps %xmm0, %xmm3
+; SSE-NEXT:    subss %xmm2, %xmm3
+; SSE-NEXT:    cvttss2si %xmm3, %rax
+; SSE-NEXT:    xorq %rcx, %rax
+; SSE-NEXT:    cvttss2si %xmm0, %rcx
+; SSE-NEXT:    ucomiss %xmm2, %xmm0
+; SSE-NEXT:    cmovaeq %rax, %rcx
+; SSE-NEXT:    movq %rcx, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_4f32_to_2i64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; VEX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; VEX-NEXT:    vsubss %xmm2, %xmm1, %xmm3
+; VEX-NEXT:    vcvttss2si %xmm3, %rax
+; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vcvttss2si %xmm1, %rdx
+; VEX-NEXT:    vucomiss %xmm2, %xmm1
+; VEX-NEXT:    cmovaeq %rax, %rdx
+; VEX-NEXT:    vsubss %xmm2, %xmm0, %xmm1
+; VEX-NEXT:    vcvttss2si %xmm1, %rax
+; VEX-NEXT:    xorq %rcx, %rax
+; VEX-NEXT:    vcvttss2si %xmm0, %rcx
+; VEX-NEXT:    vucomiss %xmm2, %xmm0
+; VEX-NEXT:    cmovaeq %rax, %rcx
+; VEX-NEXT:    vmovq %rcx, %xmm0
+; VEX-NEXT:    vmovq %rdx, %xmm1
+; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_4f32_to_2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2usi %xmm1, %rax
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rcx
+; AVX512F-NEXT:    vmovq %rcx, %xmm0
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_4f32_to_2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2usi %xmm1, %rax
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rcx
+; AVX512VL-NEXT:    vmovq %rcx, %xmm0
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_4f32_to_2i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <4 x float> %a to <4 x i64>
+  %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x i64> %shuf
+}
+
+define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
+; SSE-LABEL: fptoui_8f32_to_8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    cmpltps %xmm4, %xmm2
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE-NEXT:    subps %xmm4, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT:    xorps %xmm5, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm3
+; SSE-NEXT:    andnps %xmm0, %xmm2
+; SSE-NEXT:    orps %xmm3, %xmm2
+; SSE-NEXT:    movaps %xmm1, %xmm3
+; SSE-NEXT:    cmpltps %xmm4, %xmm3
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm0
+; SSE-NEXT:    subps %xmm4, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    xorps %xmm5, %xmm1
+; SSE-NEXT:    andps %xmm3, %xmm0
+; SSE-NEXT:    andnps %xmm1, %xmm3
+; SSE-NEXT:    orps %xmm0, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_8f32_to_8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vsubps %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_8f32_to_8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vsubps %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_8f32_to_8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_8f32_to_8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2udq %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_8f32_to_8i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_8f32_to_8i32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2udq %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <8 x float> %a to <8 x i32>
+  ret <8 x i32> %cvt
+}
+
+define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
+; SSE-LABEL: fptoui_4f32_to_4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    subss %xmm1, %xmm2
+; SSE-NEXT:    cvttss2si %xmm2, %rcx
+; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttss2si %xmm0, %rdx
+; SSE-NEXT:    ucomiss %xmm1, %xmm0
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    subss %xmm1, %xmm4
+; SSE-NEXT:    cvttss2si %xmm4, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttss2si %xmm3, %rdx
+; SSE-NEXT:    ucomiss %xmm1, %xmm3
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT:    movaps %xmm0, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    subss %xmm1, %xmm4
+; SSE-NEXT:    cvttss2si %xmm4, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttss2si %xmm3, %rdx
+; SSE-NEXT:    ucomiss %xmm1, %xmm3
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm4
+; SSE-NEXT:    subss %xmm1, %xmm4
+; SSE-NEXT:    cvttss2si %xmm4, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    ucomiss %xmm1, %xmm0
+; SSE-NEXT:    cmovaeq %rcx, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_4f32_to_4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vsubss %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vcvttss2si %xmm3, %rax
+; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttss2si %xmm2, %rdx
+; AVX1-NEXT:    vucomiss %xmm1, %xmm2
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm2
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX1-NEXT:    vsubss %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vcvttss2si %xmm4, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttss2si %xmm3, %rdx
+; AVX1-NEXT:    vucomiss %xmm1, %xmm3
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vcvttss2si %xmm3, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttss2si %xmm0, %rdx
+; AVX1-NEXT:    vucomiss %xmm1, %xmm0
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vcvttss2si %xmm4, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX1-NEXT:    vucomiss %xmm1, %xmm0
+; AVX1-NEXT:    cmovaeq %rax, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f32_to_4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vcvttss2si %xmm3, %rax
+; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttss2si %xmm2, %rdx
+; AVX2-NEXT:    vucomiss %xmm1, %xmm2
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm2
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX2-NEXT:    vsubss %xmm1, %xmm3, %xmm4
+; AVX2-NEXT:    vcvttss2si %xmm4, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttss2si %xmm3, %rdx
+; AVX2-NEXT:    vucomiss %xmm1, %xmm3
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm3
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm3
+; AVX2-NEXT:    vcvttss2si %xmm3, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttss2si %xmm0, %rdx
+; AVX2-NEXT:    vucomiss %xmm1, %xmm0
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm3
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm4
+; AVX2-NEXT:    vcvttss2si %xmm4, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX2-NEXT:    vucomiss %xmm1, %xmm0
+; AVX2-NEXT:    cmovaeq %rax, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_4f32_to_4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vcvttss2usi %xmm1, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512F-NEXT:    vcvttss2usi %xmm2, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_4f32_to_4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512VL-NEXT:    vcvttss2usi %xmm1, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-NEXT:    vcvttss2usi %xmm2, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_4f32_to_4i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = fptoui <4 x float> %shuf to <4 x i64>
+  ret <4 x i64> %cvt
+}
+
+define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
+; SSE-LABEL: fptoui_8f32_to_4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    subss %xmm1, %xmm2
+; SSE-NEXT:    cvttss2si %xmm2, %rcx
+; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttss2si %xmm0, %rdx
+; SSE-NEXT:    ucomiss %xmm1, %xmm0
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    subss %xmm1, %xmm4
+; SSE-NEXT:    cvttss2si %xmm4, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttss2si %xmm3, %rdx
+; SSE-NEXT:    ucomiss %xmm1, %xmm3
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT:    movaps %xmm0, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    subss %xmm1, %xmm4
+; SSE-NEXT:    cvttss2si %xmm4, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttss2si %xmm3, %rdx
+; SSE-NEXT:    ucomiss %xmm1, %xmm3
+; SSE-NEXT:    cmovaeq %rcx, %rdx
+; SSE-NEXT:    movq %rdx, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm4
+; SSE-NEXT:    subss %xmm1, %xmm4
+; SSE-NEXT:    cvttss2si %xmm4, %rcx
+; SSE-NEXT:    xorq %rax, %rcx
+; SSE-NEXT:    cvttss2si %xmm0, %rax
+; SSE-NEXT:    ucomiss %xmm1, %xmm0
+; SSE-NEXT:    cmovaeq %rcx, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_8f32_to_4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vsubss %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vcvttss2si %xmm3, %rax
+; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttss2si %xmm2, %rdx
+; AVX1-NEXT:    vucomiss %xmm1, %xmm2
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm2
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX1-NEXT:    vsubss %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vcvttss2si %xmm4, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttss2si %xmm3, %rdx
+; AVX1-NEXT:    vucomiss %xmm1, %xmm3
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vcvttss2si %xmm3, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttss2si %xmm0, %rdx
+; AVX1-NEXT:    vucomiss %xmm1, %xmm0
+; AVX1-NEXT:    cmovaeq %rax, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vcvttss2si %xmm4, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX1-NEXT:    vucomiss %xmm1, %xmm0
+; AVX1-NEXT:    cmovaeq %rax, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_8f32_to_4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vcvttss2si %xmm3, %rax
+; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttss2si %xmm2, %rdx
+; AVX2-NEXT:    vucomiss %xmm1, %xmm2
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm2
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX2-NEXT:    vsubss %xmm1, %xmm3, %xmm4
+; AVX2-NEXT:    vcvttss2si %xmm4, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttss2si %xmm3, %rdx
+; AVX2-NEXT:    vucomiss %xmm1, %xmm3
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm3
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm3
+; AVX2-NEXT:    vcvttss2si %xmm3, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttss2si %xmm0, %rdx
+; AVX2-NEXT:    vucomiss %xmm1, %xmm0
+; AVX2-NEXT:    cmovaeq %rax, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm3
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm4
+; AVX2-NEXT:    vcvttss2si %xmm4, %rax
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
+; AVX2-NEXT:    vucomiss %xmm1, %xmm0
+; AVX2-NEXT:    cmovaeq %rax, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_8f32_to_4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2usi %xmm1, %rax
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rcx
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F-NEXT:    vcvttss2usi %xmm1, %rdx
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rsi
+; AVX512F-NEXT:    vmovq %rsi, %xmm0
+; AVX512F-NEXT:    vmovq %rdx, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    vmovq %rcx, %xmm1
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_8f32_to_4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2usi %xmm1, %rax
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rcx
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT:    vcvttss2usi %xmm1, %rdx
+; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rsi
+; AVX512VL-NEXT:    vmovq %rsi, %xmm0
+; AVX512VL-NEXT:    vmovq %rdx, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    vmovq %rcx, %xmm1
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_8f32_to_4i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_8f32_to_4i64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512VLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <8 x float> %a to <8 x i64>
+  %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i64> %shuf
+}
+
+;
+; Constant Folding
+;
+
+define <2 x i64> @fptosi_2f64_to_2i64_const() {
+; SSE-LABEL: fptosi_2f64_to_2i64_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,18446744073709551615]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f64_to_2i64_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x double> <double 1.0, double -1.0> to <2 x i64>
+  ret <2 x i64> %cvt
+}
+
+define <4 x i32> @fptosi_2f64_to_2i32_const() {
+; SSE-LABEL: fptosi_2f64_to_2i32_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = <4294967295,1,u,u>
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f64_to_2i32_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
+  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  ret <4 x i32> %ext
+}
+
+define <4 x i64> @fptosi_4f64_to_4i64_const() {
+; SSE-LABEL: fptosi_4f64_to_4i64_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,18446744073709551615]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,18446744073709551613]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_4f64_to_4i64_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
+; AVX-NEXT:    retq
+  %cvt = fptosi <4 x double> <double 1.0, double -1.0, double 2.0, double -3.0> to <4 x i64>
+  ret <4 x i64> %cvt
+}
+
+define <4 x i32> @fptosi_4f64_to_4i32_const() {
+; SSE-LABEL: fptosi_4f64_to_4i32_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_4f64_to_4i32_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
+; AVX-NEXT:    retq
+  %cvt = fptosi <4 x double> <double -1.0, double 1.0, double -2.0, double 3.0> to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+define <2 x i64> @fptoui_2f64_to_2i64_const() {
+; SSE-LABEL: fptoui_2f64_to_2i64_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,4]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_2f64_to_2i64_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,4]
+; AVX-NEXT:    retq
+  %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i64>
+  ret <2 x i64> %cvt
+}
+
+define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i32_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = <2,4,u,u>
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_2f64_to_2i32_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <2,4,u,u>
+; AVX-NEXT:    retq
+  %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
+  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  ret <4 x i32> %ext
+}
+
+define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) {
+; SSE-LABEL: fptoui_4f64_to_4i64_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,4]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [6,8]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_4f64_to_4i64_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [2,4,6,8]
+; AVX-NEXT:    retq
+  %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i64>
+  ret <4 x i64> %cvt
+}
+
+define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) {
+; SSE-LABEL: fptoui_4f64_to_4i32_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,4,6,8]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_4f64_to_4i32_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,4,6,8]
+; AVX-NEXT:    retq
+  %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+define <4 x i32> @fptosi_4f32_to_4i32_const() {
+; SSE-LABEL: fptosi_4f32_to_4i32_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_4f32_to_4i32_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
+; AVX-NEXT:    retq
+  %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+define <4 x i64> @fptosi_4f32_to_4i64_const() {
+; SSE-LABEL: fptosi_4f32_to_4i64_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,18446744073709551615]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_4f32_to_4i64_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
+; AVX-NEXT:    retq
+  %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i64>
+  ret <4 x i64> %cvt
+}
+
+define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) {
+; SSE-LABEL: fptosi_8f32_to_8i32_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_8f32_to_8i32_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
+; AVX-NEXT:    retq
+  %cvt = fptosi <8 x float> <float 1.0, float -1.0, float 2.0, float 3.0, float 6.0, float -8.0, float 2.0, float -1.0> to <8 x i32>
+  ret <8 x i32> %cvt
+}
+
+define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) {
+; SSE-LABEL: fptoui_4f32_to_4i32_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,4,6]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_4f32_to_4i32_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,4,6]
+; AVX-NEXT:    retq
+  %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 6.0> to <4 x i32>
+  ret <4 x i32> %cvt
+}
+
+define <4 x i64> @fptoui_4f32_to_4i64_const() {
+; SSE-LABEL: fptoui_4f32_to_4i64_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4,8]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_4f32_to_4i64_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,4,8]
+; AVX-NEXT:    retq
+  %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 8.0> to <4 x i64>
+  ret <4 x i64> %cvt
+}
+
+define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
+; SSE-LABEL: fptoui_8f32_to_8i32_const:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,4,6]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [8,6,4,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_8f32_to_8i32_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
+; AVX-NEXT:    retq
+  %cvt = fptoui <8 x float> <float 1.0, float 2.0, float 4.0, float 6.0, float 8.0, float 6.0, float 4.0, float 1.0> to <8 x i32>
+  ret <8 x i32> %cvt
+}
+
+;
+; Special Cases
+;
+
+define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
+; SSE-LABEL: fptosi_2f16_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    callq __gnu_f2h_ieee
+; SSE-NEXT:    movzwl %ax, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee
+; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    callq __gnu_f2h_ieee
+; SSE-NEXT:    movzwl %ax, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee
+; SSE-NEXT:    cvttss2si %xmm0, %eax
+; SSE-NEXT:    cvttss2si (%rsp), %ecx # 4-byte Folded Reload
+; SSE-NEXT:    movd %ecx, %xmm0
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT:    popq %rax
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f16_to_4i32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    pushq %rax
+; VEX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; VEX-NEXT:    vmovaps %xmm1, %xmm0
+; VEX-NEXT:    callq __gnu_f2h_ieee
+; VEX-NEXT:    movzwl %ax, %edi
+; VEX-NEXT:    callq __gnu_h2f_ieee
+; VEX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; VEX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; VEX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; VEX-NEXT:    callq __gnu_f2h_ieee
+; VEX-NEXT:    movzwl %ax, %edi
+; VEX-NEXT:    callq __gnu_h2f_ieee
+; VEX-NEXT:    vcvttss2si %xmm0, %eax
+; VEX-NEXT:    vcvttss2si (%rsp), %ecx # 4-byte Folded Reload
+; VEX-NEXT:    vmovd %ecx, %xmm0
+; VEX-NEXT:    vmovd %eax, %xmm1
+; VEX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; VEX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; VEX-NEXT:    popq %rax
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: fptosi_2f16_to_4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vcvttss2si %xmm0, %eax
+; AVX512-NEXT:    vcvttss2si %xmm1, %ecx
+; AVX512-NEXT:    vmovd %ecx, %xmm0
+; AVX512-NEXT:    vmovd %eax, %xmm1
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512-NEXT:    retq
+  %cvt = fptosi <2 x half> %a to <2 x i32>
+  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %ext
+}
+
+define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind {
+; SSE-LABEL: fptosi_2f80_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
+; SSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    fistpl -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
+; SSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    fistpl -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f80_to_4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fisttpl -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    fisttpl -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x x86_fp80> %a to <2 x i32>
+  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %ext
+}
+
+define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
+; SSE-LABEL: fptosi_2f128_to_4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq %rcx, %r14
+; SSE-NEXT:    movq %rdx, %rbx
+; SSE-NEXT:    callq __fixtfsi
+; SSE-NEXT:    movl %eax, %ebp
+; SSE-NEXT:    movq %rbx, %rdi
+; SSE-NEXT:    movq %r14, %rsi
+; SSE-NEXT:    callq __fixtfsi
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movd %ebp, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f128_to_4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    movq %rcx, %r14
+; AVX-NEXT:    movq %rdx, %rbx
+; AVX-NEXT:    callq __fixtfsi
+; AVX-NEXT:    movl %eax, %ebp
+; AVX-NEXT:    movq %rbx, %rdi
+; AVX-NEXT:    movq %r14, %rsi
+; AVX-NEXT:    callq __fixtfsi
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vmovd %ebp, %xmm1
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x fp128> %a to <2 x i32>
+  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %ext
+}
+
+define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
+; SSE-LABEL: fptosi_2f32_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f32_to_2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x float> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
+; SSE-LABEL: fptosi_2f32_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f32_to_2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x float> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
+; SSE-LABEL: fptoui_2f32_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_2f32_to_2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %cvt = fptoui <2 x float> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) {
+; SSE-LABEL: fptoui_2f32_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_2f32_to_2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %cvt = fptoui <2 x float> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    andpd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f64_to_2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptosi_2f64_to_2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    andpd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_2f64_to_2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fptoui_2f64_to_2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
+; SSE-LABEL: fptosi_8f64_to_8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm3, %xmm3
+; SSE-NEXT:    cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    packssdw %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_8f64_to_8i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    vzeroupper
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_8f64_to_8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_8f64_to_8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_8f64_to_8i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_8f64_to_8i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptosi <8 x double> %a to <8 x i16>
+  ret <8 x i16> %cvt
+}
+
+define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
+; SSE-LABEL: fptoui_8f64_to_8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm3, %xmm3
+; SSE-NEXT:    cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_8f64_to_8i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    vzeroupper
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_8f64_to_8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_8f64_to_8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_8f64_to_8i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_8f64_to_8i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <8 x double> %a to <8 x i16>
+  ret <8 x i16> %cvt
+}
+
+define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
+; SSE-LABEL: fptosi_16f32_to_16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; SSE-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptosi_16f32_to_16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptosi_16f32_to_16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: fptosi_16f32_to_16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvttps2dq %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = fptosi <16 x float> %a to <16 x i8>
+  ret <16 x i8> %cvt
+}
+
+define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
+; SSE-LABEL: fptoui_16f32_to_16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; SSE-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_16f32_to_16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_16f32_to_16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: fptoui_16f32_to_16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvttps2dq %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = fptoui <16 x float> %a to <16 x i8>
+  ret <16 x i8> %cvt
+}
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 651c0e65aa054e3065b05ba22138656c5a117483..6ef7f20fddec9699464a88585ab237b29701668b 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -6,8 +6,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=WIDEN --check-prefix=WIDEN_SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=WIDEN --check-prefix=WIDEN_KNL
 ;
 ; 32-bit tests to make sure we're not doing anything stupid.
 ; RUN: llc < %s -mtriple=i686-unknown-unknown
@@ -72,21 +70,6 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptosi_2f64_to_2i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttpd2qq %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptosi_2f64_to_2i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vcvttsd2si %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; WIDEN_KNL-NEXT:    vcvttsd2si %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm0
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptosi <2 x double> %a to <2 x i64>
   ret <2 x i64> %cvt
 }
@@ -101,11 +84,6 @@ define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f64_to_4i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x double> %a to <2 x i32>
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %ext
@@ -123,11 +101,6 @@ define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
 ; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
 ; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f64_to_2i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x double> %a to <2 x i32>
   ret <2 x i32> %cvt
 }
@@ -146,13 +119,6 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
 ; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_4f64_to_2i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; WIDEN-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; WIDEN-NEXT:    vzeroupper
-; WIDEN-NEXT:    retq
   %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %cvt = fptosi <4 x double> %ext to <4 x i32>
   ret <4 x i32> %cvt
@@ -260,29 +226,6 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2qq %ymm0, %ymm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptosi_4f64_to_4i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttpd2qq %ymm0, %ymm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptosi_4f64_to_4i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; WIDEN_KNL-NEXT:    vcvttsd2si %xmm1, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; WIDEN_KNL-NEXT:    vcvttsd2si %xmm1, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; WIDEN_KNL-NEXT:    vcvttsd2si %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; WIDEN_KNL-NEXT:    vcvttsd2si %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm0
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptosi <4 x double> %a to <4 x i64>
   ret <4 x i64> %cvt
 }
@@ -300,12 +243,6 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
 ; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_4f64_to_4i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; WIDEN-NEXT:    vzeroupper
-; WIDEN-NEXT:    retq
   %cvt = fptosi <4 x double> %a to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -394,21 +331,6 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_2f64_to_2i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttpd2uqq %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_2f64_to_2i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vcvttsd2usi %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; WIDEN_KNL-NEXT:    vcvttsd2usi %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm0
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <2 x double> %a to <2 x i64>
   ret <2 x i64> %cvt
 }
@@ -489,19 +411,6 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_2f64_to_4i32:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttpd2udq %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_2f64_to_4i32:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN_KNL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <2 x double> %a to <2 x i32>
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %ext
@@ -581,19 +490,6 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_2f64_to_2i32:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttpd2udq %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_2f64_to_2i32:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN_KNL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <2 x double> %a to <2 x i32>
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   ret <4 x i32> %ext
@@ -687,21 +583,6 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; AVX512VLDQ-NEXT:    vcvttpd2udq %ymm0, %xmm0
 ; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_4f64_to_2i32:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vmovaps %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    vcvttpd2udq %ymm0, %xmm0
-; WIDEN_SKX-NEXT:    vzeroupper
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_4f64_to_2i32:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vmovaps %xmm0, %xmm0
-; WIDEN_KNL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %ext = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cvt = fptoui <4 x double> %ext to <4 x i32>
   ret <4 x i32> %cvt
@@ -878,29 +759,6 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2uqq %ymm0, %ymm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_4f64_to_4i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttpd2uqq %ymm0, %ymm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_4f64_to_4i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; WIDEN_KNL-NEXT:    vcvttsd2usi %xmm1, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; WIDEN_KNL-NEXT:    vcvttsd2usi %xmm1, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; WIDEN_KNL-NEXT:    vcvttsd2usi %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; WIDEN_KNL-NEXT:    vcvttsd2usi %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm0
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <4 x double> %a to <4 x i64>
   ret <4 x i64> %cvt
 }
@@ -1006,20 +864,6 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ; AVX512VLDQ-NEXT:    vcvttpd2udq %ymm0, %xmm0
 ; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_4f64_to_4i32:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttpd2udq %ymm0, %xmm0
-; WIDEN_SKX-NEXT:    vzeroupper
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_4f64_to_4i32:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; WIDEN_KNL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <4 x double> %a to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -1040,11 +884,6 @@ define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) {
 ; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f32_to_2i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vcvttps2dq %xmm0, %xmm0
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x float> %a to <2 x i32>
   ret <2 x i32> %cvt
 }
@@ -1059,11 +898,6 @@ define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_4f32_to_4i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vcvttps2dq %xmm0, %xmm0
-; WIDEN-NEXT:    retq
   %cvt = fptosi <4 x float> %a to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -1122,21 +956,6 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptosi_2f32_to_2i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2qq %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptosi_2f32_to_2i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm0
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; WIDEN_KNL-NEXT:    retq
   %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
   %cvt = fptosi <2 x float> %shuf to <2 x i64>
   ret <2 x i64> %cvt
@@ -1198,23 +1017,6 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
 ; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptosi_4f32_to_2i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2qq %xmm0, %ymm0
-; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_SKX-NEXT:    vzeroupper
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptosi_4f32_to_2i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm1, %rax
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm0, %rcx
-; WIDEN_KNL-NEXT:    vmovq %rcx, %xmm0
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptosi <4 x float> %a to <4 x i64>
   %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %shuf
@@ -1231,11 +1033,6 @@ define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttps2dq %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_8f32_to_8i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vcvttps2dq %ymm0, %ymm0
-; WIDEN-NEXT:    retq
   %cvt = fptosi <8 x float> %a to <8 x i32>
   ret <8 x i32> %cvt
 }
@@ -1343,29 +1140,6 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %ymm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptosi_4f32_to_4i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2qq %xmm0, %ymm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptosi_4f32_to_4i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm1, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm2, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm0
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; WIDEN_KNL-NEXT:    retq
   %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cvt = fptosi <4 x float> %shuf to <4 x i64>
   ret <4 x i64> %cvt
@@ -1475,30 +1249,6 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
 ; AVX512VLDQ-NEXT:    vcvttps2qq %ymm0, %zmm0
 ; AVX512VLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptosi_8f32_to_4i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2qq %ymm0, %zmm0
-; WIDEN_SKX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptosi_8f32_to_4i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm1, %rax
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm0, %rcx
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm1, %rdx
-; WIDEN_KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; WIDEN_KNL-NEXT:    vcvttss2si %xmm0, %rsi
-; WIDEN_KNL-NEXT:    vmovq %rsi, %xmm0
-; WIDEN_KNL-NEXT:    vmovq %rdx, %xmm1
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; WIDEN_KNL-NEXT:    vmovq %rcx, %xmm1
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptosi <8 x float> %a to <8 x i64>
   %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i64> %shuf
@@ -1583,19 +1333,6 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
 ; AVX512VLDQ-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_2f32_to_2i32:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2udq %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_2f32_to_2i32:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN_KNL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <2 x float> %a to <2 x i32>
   ret <2 x i32> %cvt
 }
@@ -1664,19 +1401,6 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_4f32_to_4i32:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2udq %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_4f32_to_4i32:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN_KNL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <4 x float> %a to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -1761,21 +1485,6 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_2f32_to_2i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2uqq %xmm0, %xmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_2f32_to_2i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm0
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; WIDEN_KNL-NEXT:    retq
   %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
   %cvt = fptoui <2 x float> %shuf to <2 x i64>
   ret <2 x i64> %cvt
@@ -1863,23 +1572,6 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 ; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_4f32_to_2i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2uqq %xmm0, %ymm0
-; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_SKX-NEXT:    vzeroupper
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_4f32_to_2i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm1, %rax
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm0, %rcx
-; WIDEN_KNL-NEXT:    vmovq %rcx, %xmm0
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <4 x float> %a to <4 x i64>
   %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %shuf
@@ -1958,18 +1650,6 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2udq %ymm0, %ymm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_8f32_to_8i32:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2udq %ymm0, %ymm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_8f32_to_8i32:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; WIDEN_KNL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; WIDEN_KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <8 x float> %a to <8 x i32>
   ret <8 x i32> %cvt
 }
@@ -2147,29 +1827,6 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %ymm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_4f32_to_4i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2uqq %xmm0, %ymm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_4f32_to_4i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm1, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm1
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm2, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm0, %rax
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm0
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; WIDEN_KNL-NEXT:    retq
   %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cvt = fptoui <4 x float> %shuf to <4 x i64>
   ret <4 x i64> %cvt
@@ -2349,30 +2006,6 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
 ; AVX512VLDQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
 ; AVX512VLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_8f32_to_4i64:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    vcvttps2uqq %ymm0, %zmm0
-; WIDEN_SKX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_8f32_to_4i64:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm1, %rax
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm0, %rcx
-; WIDEN_KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm1, %rdx
-; WIDEN_KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; WIDEN_KNL-NEXT:    vcvttss2usi %xmm0, %rsi
-; WIDEN_KNL-NEXT:    vmovq %rsi, %xmm0
-; WIDEN_KNL-NEXT:    vmovq %rdx, %xmm1
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; WIDEN_KNL-NEXT:    vmovq %rcx, %xmm1
-; WIDEN_KNL-NEXT:    vmovq %rax, %xmm2
-; WIDEN_KNL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <8 x float> %a to <8 x i64>
   %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i64> %shuf
@@ -2392,11 +2025,6 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f64_to_2i64_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x double> <double 1.0, double -1.0> to <2 x i64>
   ret <2 x i64> %cvt
 }
@@ -2411,11 +2039,6 @@ define <4 x i32> @fptosi_2f64_to_2i32_const() {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f64_to_2i32_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   ret <4 x i32> %ext
@@ -2432,11 +2055,6 @@ define <4 x i64> @fptosi_4f64_to_4i64_const() {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_4f64_to_4i64_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
-; WIDEN-NEXT:    retq
   %cvt = fptosi <4 x double> <double 1.0, double -1.0, double 2.0, double -3.0> to <4 x i64>
   ret <4 x i64> %cvt
 }
@@ -2451,11 +2069,6 @@ define <4 x i32> @fptosi_4f64_to_4i32_const() {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_4f64_to_4i32_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
-; WIDEN-NEXT:    retq
   %cvt = fptosi <4 x double> <double -1.0, double 1.0, double -2.0, double 3.0> to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -2470,11 +2083,6 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,4]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_2f64_to_2i64_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} xmm0 = [2,4]
-; WIDEN-NEXT:    retq
   %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i64>
   ret <2 x i64> %cvt
 }
@@ -2489,11 +2097,6 @@ define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <2,4,u,u>
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_2f64_to_2i32_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} xmm0 = <2,4,u,u>
-; WIDEN-NEXT:    retq
   %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   ret <4 x i32> %ext
@@ -2510,11 +2113,6 @@ define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [2,4,6,8]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_4f64_to_4i64_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} ymm0 = [2,4,6,8]
-; WIDEN-NEXT:    retq
   %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i64>
   ret <4 x i64> %cvt
 }
@@ -2529,11 +2127,6 @@ define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,4,6,8]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_4f64_to_4i32_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} xmm0 = [2,4,6,8]
-; WIDEN-NEXT:    retq
   %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -2548,11 +2141,6 @@ define <4 x i32> @fptosi_4f32_to_4i32_const() {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_4f32_to_4i32_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
-; WIDEN-NEXT:    retq
   %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -2568,11 +2156,6 @@ define <4 x i64> @fptosi_4f32_to_4i64_const() {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_4f32_to_4i64_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
-; WIDEN-NEXT:    retq
   %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i64>
   ret <4 x i64> %cvt
 }
@@ -2588,11 +2171,6 @@ define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_8f32_to_8i32_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
-; WIDEN-NEXT:    retq
   %cvt = fptosi <8 x float> <float 1.0, float -1.0, float 2.0, float 3.0, float 6.0, float -8.0, float 2.0, float -1.0> to <8 x i32>
   ret <8 x i32> %cvt
 }
@@ -2607,11 +2185,6 @@ define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,4,6]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_4f32_to_4i32_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,4,6]
-; WIDEN-NEXT:    retq
   %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 6.0> to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -2627,11 +2200,6 @@ define <4 x i64> @fptoui_4f32_to_4i64_const() {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,4,8]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_4f32_to_4i64_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,4,8]
-; WIDEN-NEXT:    retq
   %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 8.0> to <4 x i64>
   ret <4 x i64> %cvt
 }
@@ -2647,11 +2215,6 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_8f32_to_8i32_const:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
-; WIDEN-NEXT:    retq
   %cvt = fptoui <8 x float> <float 1.0, float 2.0, float 4.0, float 6.0, float 8.0, float 6.0, float 4.0, float 1.0> to <8 x i32>
   ret <8 x i32> %cvt
 }
@@ -2719,20 +2282,6 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; AVX512-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f16_to_4i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; WIDEN-NEXT:    vcvtph2ps %xmm1, %xmm1
-; WIDEN-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; WIDEN-NEXT:    vcvtph2ps %xmm0, %xmm0
-; WIDEN-NEXT:    vcvttss2si %xmm0, %eax
-; WIDEN-NEXT:    vcvttss2si %xmm1, %ecx
-; WIDEN-NEXT:    vmovd %ecx, %xmm0
-; WIDEN-NEXT:    vmovd %eax, %xmm1
-; WIDEN-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; WIDEN-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x half> %a to <2 x i32>
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %ext
@@ -2775,18 +2324,6 @@ define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind {
 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f80_to_4i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    fldt {{[0-9]+}}(%rsp)
-; WIDEN-NEXT:    fldt {{[0-9]+}}(%rsp)
-; WIDEN-NEXT:    fisttpl -{{[0-9]+}}(%rsp)
-; WIDEN-NEXT:    fisttpl -{{[0-9]+}}(%rsp)
-; WIDEN-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; WIDEN-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; WIDEN-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; WIDEN-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x x86_fp80> %a to <2 x i32>
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %ext
@@ -2841,27 +2378,6 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
 ; AVX-NEXT:    popq %rbx
 ; AVX-NEXT:    popq %r14
 ; AVX-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f128_to_4i32:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    pushq %rbp
-; WIDEN-NEXT:    pushq %r14
-; WIDEN-NEXT:    pushq %rbx
-; WIDEN-NEXT:    movq %rcx, %r14
-; WIDEN-NEXT:    movq %rdx, %rbx
-; WIDEN-NEXT:    callq __fixtfsi
-; WIDEN-NEXT:    movl %eax, %ebp
-; WIDEN-NEXT:    movq %rbx, %rdi
-; WIDEN-NEXT:    movq %r14, %rsi
-; WIDEN-NEXT:    callq __fixtfsi
-; WIDEN-NEXT:    vmovd %eax, %xmm0
-; WIDEN-NEXT:    vmovd %ebp, %xmm1
-; WIDEN-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; WIDEN-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; WIDEN-NEXT:    popq %rbx
-; WIDEN-NEXT:    popq %r14
-; WIDEN-NEXT:    popq %rbp
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x fp128> %a to <2 x i32>
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %ext
@@ -2871,8 +2387,8 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
 ; SSE-LABEL: fptosi_2f32_to_2i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE-NEXT:    retq
 ;
@@ -2906,14 +2422,6 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f32_to_2i8:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; WIDEN-NEXT:    vpmovdb %zmm0, %xmm0
-; WIDEN-NEXT:    vzeroupper
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x float> %a to <2 x i8>
   ret <2 x i8> %cvt
 }
@@ -2922,8 +2430,8 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
 ; SSE-LABEL: fptosi_2f32_to_2i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE-NEXT:    retq
 ;
@@ -2957,23 +2465,6 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptosi_2f32_to_2i16:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; WIDEN_SKX-NEXT:    vcvttps2dq %ymm0, %ymm0
-; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
-; WIDEN_SKX-NEXT:    vzeroupper
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptosi_2f32_to_2i16:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; WIDEN_KNL-NEXT:    vcvttps2dq %ymm0, %ymm0
-; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptosi <2 x float> %a to <2 x i16>
   ret <2 x i16> %cvt
 }
@@ -3016,14 +2507,6 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_2f32_to_2i8:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
-; WIDEN-NEXT:    vpmovdb %zmm0, %xmm0
-; WIDEN-NEXT:    vzeroupper
-; WIDEN-NEXT:    retq
   %cvt = fptoui <2 x float> %a to <2 x i8>
   ret <2 x i8> %cvt
 }
@@ -3066,23 +2549,6 @@ define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_2f32_to_2i16:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; WIDEN_SKX-NEXT:    vcvttps2dq %ymm0, %ymm0
-; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
-; WIDEN_SKX-NEXT:    vzeroupper
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_2f32_to_2i16:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; WIDEN_KNL-NEXT:    vcvttps2dq %ymm0, %ymm0
-; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <2 x float> %a to <2 x i16>
   ret <2 x i16> %cvt
 }
@@ -3091,9 +2557,9 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
 ; SSE-LABEL: fptosi_2f64_to_2i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    movapd %xmm0, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE-NEXT:    retq
 ;
 ; VEX-LABEL: fptosi_2f64_to_2i8:
@@ -3126,15 +2592,6 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN-LABEL: fptosi_2f64_to_2i8:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; WIDEN-NEXT:    vcvttsd2si %xmm1, %eax
-; WIDEN-NEXT:    vcvttsd2si %xmm0, %ecx
-; WIDEN-NEXT:    vmovd %ecx, %xmm0
-; WIDEN-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; WIDEN-NEXT:    retq
   %cvt = fptosi <2 x double> %a to <2 x i8>
   ret <2 x i8> %cvt
 }
@@ -3143,9 +2600,9 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
 ; SSE-LABEL: fptosi_2f64_to_2i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    movapd %xmm0, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE-NEXT:    retq
 ;
 ; VEX-LABEL: fptosi_2f64_to_2i16:
@@ -3178,23 +2635,6 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptosi_2f64_to_2i16:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN_SKX-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
-; WIDEN_SKX-NEXT:    vzeroupper
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptosi_2f64_to_2i16:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN_KNL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptosi <2 x double> %a to <2 x i16>
   ret <2 x i16> %cvt
 }
@@ -3237,15 +2677,6 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN-LABEL: fptoui_2f64_to_2i8:
-; WIDEN:       # %bb.0:
-; WIDEN-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; WIDEN-NEXT:    vcvttsd2si %xmm1, %eax
-; WIDEN-NEXT:    vcvttsd2si %xmm0, %ecx
-; WIDEN-NEXT:    vmovd %ecx, %xmm0
-; WIDEN-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; WIDEN-NEXT:    retq
   %cvt = fptoui <2 x double> %a to <2 x i8>
   ret <2 x i8> %cvt
 }
@@ -3288,23 +2719,210 @@ define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
 ; AVX512VLDQ:       # %bb.0:
 ; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
-;
-; WIDEN_SKX-LABEL: fptoui_2f64_to_2i16:
-; WIDEN_SKX:       # %bb.0:
-; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN_SKX-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
-; WIDEN_SKX-NEXT:    vzeroupper
-; WIDEN_SKX-NEXT:    retq
-;
-; WIDEN_KNL-LABEL: fptoui_2f64_to_2i16:
-; WIDEN_KNL:       # %bb.0:
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; WIDEN_KNL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; WIDEN_KNL-NEXT:    vzeroupper
-; WIDEN_KNL-NEXT:    retq
   %cvt = fptoui <2 x double> %a to <2 x i16>
   ret <2 x i16> %cvt
 }
+
+define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
+; SSE-LABEL: fptosi_8f64_to_8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm3, %xmm3
+; SSE-NEXT:    cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    packssdw %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_8f64_to_8i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    vzeroupper
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_8f64_to_8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_8f64_to_8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_8f64_to_8i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_8f64_to_8i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptosi <8 x double> %a to <8 x i16>
+  ret <8 x i16> %cvt
+}
+
+define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
+; SSE-LABEL: fptoui_8f64_to_8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm3, %xmm3
+; SSE-NEXT:    cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_8f64_to_8i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    vzeroupper
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_8f64_to_8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_8f64_to_8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_8f64_to_8i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_8f64_to_8i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = fptoui <8 x double> %a to <8 x i16>
+  ret <8 x i16> %cvt
+}
+
+define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
+; SSE-LABEL: fptosi_16f32_to_16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; SSE-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptosi_16f32_to_16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptosi_16f32_to_16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: fptosi_16f32_to_16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvttps2dq %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = fptosi <16 x float> %a to <16 x i8>
+  ret <16 x i8> %cvt
+}
+
+define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
+; SSE-LABEL: fptoui_16f32_to_16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; SSE-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fptoui_16f32_to_16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_16f32_to_16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: fptoui_16f32_to_16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvttps2dq %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = fptoui <16 x float> %a to <16 x i8>
+  ret <16 x i8> %cvt
+}
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index e883f975b0c94abf5a803f11dff34cc426a4103b..e0dd070bf4e87ac4c1bbdba65e2d6e409248afbc 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -8,18 +8,17 @@
 define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
 ; X32-LABEL: mmx_movzl:
 ; X32:       ## %bb.0:
-; X32-NEXT:    subl $20, %esp
+; X32-NEXT:    subl $12, %esp
 ; X32-NEXT:    movl $32, %eax
 ; X32-NEXT:    movd %eax, %xmm0
-; X32-NEXT:    movq %xmm0, (%esp)
-; X32-NEXT:    movq (%esp), %mm0
-; X32-NEXT:    addl $20, %esp
+; X32-NEXT:    movdq2q %xmm0, %mm0
+; X32-NEXT:    addl $12, %esp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: mmx_movzl:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    movl $32, %eax
-; X64-NEXT:    movq %rax, %xmm0
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    retq
   %tmp = bitcast x86_mmx %x to <2 x i32>
   %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
index 8b807d9cb257e16e60844509a57905073833b1c7..07fdbcae4986f5911c2d8a2a5cfe778f4ae96b5e 100644
--- a/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -13,10 +13,8 @@ define x86_mmx @t0(i32 %A) nounwind {
 ;
 ; X64-LABEL: t0:
 ; X64:       ## %bb.0:
-; X64-NEXT:    ## kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movq %rdi, %xmm0
-; X64-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; X64-NEXT:    retq
   %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
   %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
diff --git a/test/CodeGen/X86/vec_int_to_fp-widen.ll b/test/CodeGen/X86/vec_int_to_fp-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7441ce580271ecdcd7e0aba29ad7d35f532328f2
--- /dev/null
+++ b/test/CodeGen/X86/vec_int_to_fp-widen.ll
@@ -0,0 +1,5554 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ
+;
+; 32-bit tests to make sure we're not doing anything stupid.
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1
+
+;
+; Signed Integer to Double
+;
+
+define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
+; SSE2-LABEL: sitofp_2i64_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_2i64_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm1
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_2i64_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_2i64_to_2f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_2i64_to_2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_2i64_to_2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2pd %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = sitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
+; SSE-LABEL: sitofp_2i32_to_2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_2i32_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %cvt = sitofp <2 x i32> %shuf to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
+; SSE-LABEL: sitofp_4i32_to_2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_4i32_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = sitofp <4 x i32> %a to <4 x double>
+  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
+; SSE2-LABEL: sitofp_2i16_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_2i16_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_2i16_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %cvt = sitofp <2 x i16> %shuf to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
+; SSE2-LABEL: sitofp_8i16_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_8i16_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_8i16_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_8i16_to_2f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = sitofp <8 x i16> %a to <8 x double>
+  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
+; SSE2-LABEL: sitofp_2i8_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_2i8_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_2i8_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %cvt = sitofp <2 x i8> %shuf to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
+; SSE2-LABEL: sitofp_16i8_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_16i8_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_16i8_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_16i8_to_2f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = sitofp <16 x i8> %a to <16 x double>
+  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %shuf
+}
+
+define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
+; SSE2-LABEL: sitofp_4i64_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_4i64_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm2
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm2
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm1
+; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_4i64_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_4i64_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_4i64_to_4f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_4i64_to_4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_4i64_to_4f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2pd %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = sitofp <4 x i64> %a to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
+; SSE-LABEL: sitofp_4i32_to_4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_4i32_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %cvt = sitofp <4 x i32> %a to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
+; SSE2-LABEL: sitofp_4i16_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_4i16_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_4i16_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = sitofp <4 x i16> %shuf to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
+; SSE2-LABEL: sitofp_8i16_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_8i16_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_8i16_to_4f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_8i16_to_4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+  %cvt = sitofp <8 x i16> %a to <8 x double>
+  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %shuf
+}
+
+define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
+; SSE2-LABEL: sitofp_4i8_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_4i8_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_4i8_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = sitofp <4 x i8> %shuf to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
+; SSE2-LABEL: sitofp_16i8_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_16i8_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_16i8_to_4f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_16i8_to_4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+  %cvt = sitofp <16 x i8> %a to <16 x double>
+  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %shuf
+}
+
+;
+; Unsigned Integer to Double
+;
+
+define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
+; SSE2-LABEL: uitofp_2i64_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_2i64_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_2i64_to_2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_2i64_to_2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_2i64_to_2f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_2i64_to_2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2pd %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
+; SSE2-LABEL: uitofp_2i32_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_2i32_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_2i32_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
+; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_2i32_to_2f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_2i32_to_2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_2i32_to_2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %cvt = uitofp <2 x i32> %shuf to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
+; SSE2-LABEL: uitofp_4i32_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i32_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_4i32_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_4i32_to_2f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_4i32_to_2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <4 x i32> %a to <4 x double>
+  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
+; SSE2-LABEL: uitofp_2i16_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_2i16_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_2i16_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %cvt = uitofp <2 x i16> %shuf to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
+; SSE2-LABEL: uitofp_8i16_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_8i16_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_8i16_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_8i16_to_2f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = uitofp <8 x i16> %a to <8 x double>
+  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_2i8_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_2i8_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_2i8_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %cvt = uitofp <2 x i8> %shuf to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_16i8_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_16i8_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_16i8_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_16i8_to_2f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = uitofp <16 x i8> %a to <16 x double>
+  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %shuf
+}
+
+define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
+; SSE2-LABEL: uitofp_4i64_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT:    subpd %xmm6, %xmm0
+; SSE2-NEXT:    addpd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    subpd %xmm6, %xmm1
+; SSE2-NEXT:    addpd %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i64_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE41-NEXT:    subpd %xmm6, %xmm0
+; SSE41-NEXT:    addpd %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    por %xmm4, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    subpd %xmm6, %xmm1
+; SSE41-NEXT:    addpd %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_4i64_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_4i64_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_4i64_to_4f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_4i64_to_4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2pd %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <4 x i64> %a to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
+; SSE2-LABEL: uitofp_4i32_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
+; SSE2-NEXT:    mulpd %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE2-NEXT:    mulpd %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE2-NEXT:    addpd %xmm5, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i32_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
+; SSE41-NEXT:    mulpd %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE41-NEXT:    mulpd %xmm2, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE41-NEXT:    addpd %xmm5, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_4i32_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_4i32_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_4i32_to_4f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_4i32_to_4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_4i32_to_4f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <4 x i32> %a to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
+; SSE2-LABEL: uitofp_4i16_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i16_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_4i16_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = uitofp <4 x i16> %shuf to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
+; SSE2-LABEL: uitofp_8i16_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_8i16_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_8i16_to_4f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_8i16_to_4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+  %cvt = uitofp <8 x i16> %a to <8 x double>
+  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %shuf
+}
+
+define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_4i8_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i8_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_4i8_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = uitofp <4 x i8> %shuf to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_16i8_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_16i8_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_16i8_to_4f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; VEX-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_16i8_to_4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+  %cvt = uitofp <16 x i8> %a to <16 x double>
+  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %shuf
+}
+
+;
+; Signed Integer to Float
+;
+
+define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
+; SSE2-LABEL: sitofp_2i64_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_2i64_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_2i64_to_4f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; VEX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_2i64_to_4f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_2i64_to_4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = sitofp <2 x i64> %a to <2 x float>
+  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  ret <4 x float> %ext
+}
+
+define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
+; SSE2-LABEL: sitofp_2i64_to_4f32_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_2i64_to_4f32_zero:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_2i64_to_4f32_zero:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_2i64_to_4f32_zero:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = sitofp <2 x i64> %a to <2 x float>
+  %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %ext
+}
+
+define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
+; SSE2-LABEL: sitofp_4i64_to_4f32_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_4i64_to_4f32_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_4i64_to_4f32_undef:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; VEX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %cvt = sitofp <4 x i64> %ext to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
+; SSE-LABEL: sitofp_4i32_to_4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_4i32_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = sitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
+; SSE2-LABEL: sitofp_4i16_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_4i16_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_4i16_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = sitofp <4 x i16> %shuf to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
+; SSE2-LABEL: sitofp_8i16_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_8i16_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_8i16_to_4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_8i16_to_4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_8i16_to_4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = sitofp <8 x i16> %a to <8 x float>
+  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
+; SSE2-LABEL: sitofp_4i8_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_4i8_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_4i8_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = sitofp <4 x i8> %shuf to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
+; SSE2-LABEL: sitofp_16i8_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_16i8_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_16i8_to_4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_16i8_to_4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_16i8_to_4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = sitofp <16 x i8> %a to <16 x float>
+  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
+; SSE2-LABEL: sitofp_4i64_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_4i64_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_4i64_to_4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_4i64_to_4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_4i64_to_4f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_4i64_to_4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = sitofp <4 x i64> %a to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
+; SSE-LABEL: sitofp_8i32_to_8f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_8i32_to_8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %cvt = sitofp <8 x i32> %a to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
+; SSE2-LABEL: sitofp_8i16_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_8i16_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_8i16_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_8i16_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_8i16_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %cvt = sitofp <8 x i16> %a to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
+; SSE2-LABEL: sitofp_8i8_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_8i8_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_8i8_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_8i8_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_8i8_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %cvt = sitofp <8 x i8> %shuf to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
+; SSE2-LABEL: sitofp_16i8_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_16i8_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_16i8_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_16i8_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_16i8_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+  %cvt = sitofp <16 x i8> %a to <16 x float>
+  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuf
+}
+
+;
+; Unsigned Integer to Float
+;
+
+define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
+; SSE2-LABEL: uitofp_2i64_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB39_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    jmp .LBB39_3
+; SSE2-NEXT:  .LBB39_1:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:  .LBB39_3:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB39_4
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+; SSE2-NEXT:  .LBB39_4:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_2i64_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB39_1
+; SSE41-NEXT:  # %bb.2:
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    jmp .LBB39_3
+; SSE41-NEXT:  .LBB39_1:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    addss %xmm1, %xmm1
+; SSE41-NEXT:  .LBB39_3:
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB39_4
+; SSE41-NEXT:  # %bb.5:
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT:    retq
+; SSE41-NEXT:  .LBB39_4:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    addss %xmm0, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_2i64_to_4f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB39_1
+; VEX-NEXT:  # %bb.2:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    jmp .LBB39_3
+; VEX-NEXT:  .LBB39_1:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; VEX-NEXT:  .LBB39_3:
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB39_4
+; VEX-NEXT:  # %bb.5:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    jmp .LBB39_6
+; VEX-NEXT:  .LBB39_4:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; VEX-NEXT:  .LBB39_6:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    js .LBB39_8
+; VEX-NEXT:  # %bb.7:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; VEX-NEXT:  .LBB39_8:
+; VEX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_2i64_to_4f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm1
+; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_2i64_to_4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm1
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <2 x i64> %a to <2 x float>
+  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  ret <4 x float> %ext
+}
+
+define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
+; SSE2-LABEL: uitofp_2i64_to_2f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB40_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    jmp .LBB40_3
+; SSE2-NEXT:  .LBB40_1:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:  .LBB40_3:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB40_4
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    jmp .LBB40_6
+; SSE2-NEXT:  .LBB40_4:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:  .LBB40_6:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_2i64_to_2f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB40_1
+; SSE41-NEXT:  # %bb.2:
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    jmp .LBB40_3
+; SSE41-NEXT:  .LBB40_1:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    addss %xmm0, %xmm0
+; SSE41-NEXT:  .LBB40_3:
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB40_4
+; SSE41-NEXT:  # %bb.5:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; SSE41-NEXT:    retq
+; SSE41-NEXT:  .LBB40_4:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    addss %xmm1, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_2i64_to_2f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB40_1
+; VEX-NEXT:  # %bb.2:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    jmp .LBB40_3
+; VEX-NEXT:  .LBB40_1:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; VEX-NEXT:  .LBB40_3:
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB40_4
+; VEX-NEXT:  # %bb.5:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; VEX-NEXT:    retq
+; VEX-NEXT:  .LBB40_4:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_2i64_to_2f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_2i64_to_2f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_2i64_to_2f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <2 x i64> %a to <2 x float>
+  %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %ext
+}
+
+define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
+; SSE2-LABEL: uitofp_4i64_to_4f32_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB41_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    jmp .LBB41_3
+; SSE2-NEXT:  .LBB41_1:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:  .LBB41_3:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB41_4
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    jmp .LBB41_6
+; SSE2-NEXT:  .LBB41_4:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:  .LBB41_6:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    js .LBB41_8
+; SSE2-NEXT:  # %bb.7:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:  .LBB41_8:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i64_to_4f32_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB41_1
+; SSE41-NEXT:  # %bb.2:
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    jmp .LBB41_3
+; SSE41-NEXT:  .LBB41_1:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    addss %xmm1, %xmm1
+; SSE41-NEXT:  .LBB41_3:
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB41_4
+; SSE41-NEXT:  # %bb.5:
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    jmp .LBB41_6
+; SSE41-NEXT:  .LBB41_4:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    addss %xmm0, %xmm0
+; SSE41-NEXT:  .LBB41_6:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    js .LBB41_8
+; SSE41-NEXT:  # %bb.7:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:  .LBB41_8:
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_4i64_to_4f32_undef:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB41_1
+; VEX-NEXT:  # %bb.2:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    jmp .LBB41_3
+; VEX-NEXT:  .LBB41_1:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; VEX-NEXT:  .LBB41_3:
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB41_4
+; VEX-NEXT:  # %bb.5:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    jmp .LBB41_6
+; VEX-NEXT:  .LBB41_4:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; VEX-NEXT:  .LBB41_6:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    js .LBB41_8
+; VEX-NEXT:  # %bb.7:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm1
+; VEX-NEXT:  .LBB41_8:
+; VEX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm1
+; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm1
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %cvt = uitofp <4 x i64> %ext to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
+; SSE2-LABEL: uitofp_4i32_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i32_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; SSE41-NEXT:    addps {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_4i32_to_4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_4i32_to_4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
+; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_4i32_to_4f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_4i32_to_4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_4i32_to_4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
+; SSE2-LABEL: uitofp_4i16_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i16_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_4i16_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = uitofp <4 x i16> %shuf to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
+; SSE2-LABEL: uitofp_8i16_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_8i16_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_8i16_to_4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_8i16_to_4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_8i16_to_4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = uitofp <8 x i16> %a to <8 x float>
+  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_4i8_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i8_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_4i8_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %cvt = uitofp <4 x i8> %shuf to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_16i8_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_16i8_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_16i8_to_4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_16i8_to_4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_16i8_to_4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %cvt = uitofp <16 x i8> %a to <16 x float>
+  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
+; SSE2-LABEL: uitofp_4i64_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB47_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    jmp .LBB47_3
+; SSE2-NEXT:  .LBB47_1:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    addss %xmm2, %xmm2
+; SSE2-NEXT:  .LBB47_3:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB47_4
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE2-NEXT:    jmp .LBB47_6
+; SSE2-NEXT:  .LBB47_4:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE2-NEXT:    addss %xmm3, %xmm3
+; SSE2-NEXT:  .LBB47_6:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB47_7
+; SSE2-NEXT:  # %bb.8:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    jmp .LBB47_9
+; SSE2-NEXT:  .LBB47_7:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:  .LBB47_9:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB47_10
+; SSE2-NEXT:  # %bb.11:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    jmp .LBB47_12
+; SSE2-NEXT:  .LBB47_10:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:  .LBB47_12:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_4i64_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB47_1
+; SSE41-NEXT:  # %bb.2:
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    jmp .LBB47_3
+; SSE41-NEXT:  .LBB47_1:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    addss %xmm2, %xmm2
+; SSE41-NEXT:  .LBB47_3:
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB47_4
+; SSE41-NEXT:  # %bb.5:
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    jmp .LBB47_6
+; SSE41-NEXT:  .LBB47_4:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    addss %xmm0, %xmm0
+; SSE41-NEXT:  .LBB47_6:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB47_7
+; SSE41-NEXT:  # %bb.8:
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    jmp .LBB47_9
+; SSE41-NEXT:  .LBB47_7:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    addss %xmm2, %xmm2
+; SSE41-NEXT:  .LBB47_9:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB47_10
+; SSE41-NEXT:  # %bb.11:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+; SSE41-NEXT:  .LBB47_10:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    addss %xmm1, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_4i64_to_4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    testq %rax, %rax
+; AVX1-NEXT:    js .LBB47_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX1-NEXT:    jmp .LBB47_3
+; AVX1-NEXT:  .LBB47_1:
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq %rcx
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    orq %rcx, %rax
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:  .LBB47_3:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    testq %rax, %rax
+; AVX1-NEXT:    js .LBB47_4
+; AVX1-NEXT:  # %bb.5:
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX1-NEXT:    jmp .LBB47_6
+; AVX1-NEXT:  .LBB47_4:
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq %rcx
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    orq %rcx, %rax
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:  .LBB47_6:
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    testq %rax, %rax
+; AVX1-NEXT:    js .LBB47_7
+; AVX1-NEXT:  # %bb.8:
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX1-NEXT:    jmp .LBB47_9
+; AVX1-NEXT:  .LBB47_7:
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq %rcx
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    orq %rcx, %rax
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:  .LBB47_9:
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    testq %rax, %rax
+; AVX1-NEXT:    js .LBB47_10
+; AVX1-NEXT:  # %bb.11:
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB47_10:
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq %rcx
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    orq %rcx, %rax
+; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_4i64_to_4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    js .LBB47_1
+; AVX2-NEXT:  # %bb.2:
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX2-NEXT:    jmp .LBB47_3
+; AVX2-NEXT:  .LBB47_1:
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    orq %rcx, %rax
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:  .LBB47_3:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    js .LBB47_4
+; AVX2-NEXT:  # %bb.5:
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX2-NEXT:    jmp .LBB47_6
+; AVX2-NEXT:  .LBB47_4:
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    orq %rcx, %rax
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:  .LBB47_6:
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    js .LBB47_7
+; AVX2-NEXT:  # %bb.8:
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX2-NEXT:    jmp .LBB47_9
+; AVX2-NEXT:  .LBB47_7:
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    orq %rcx, %rax
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:  .LBB47_9:
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    js .LBB47_10
+; AVX2-NEXT:  # %bb.11:
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+; AVX2-NEXT:  .LBB47_10:
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    orq %rcx, %rax
+; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_4i64_to_4f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_4i64_to_4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <4 x i64> %a to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
+; SSE2-LABEL: uitofp_8i32_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
+; SSE2-NEXT:    addps %xmm6, %xmm0
+; SSE2-NEXT:    addps %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    addps %xmm6, %xmm1
+; SSE2-NEXT:    addps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_8i32_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE41-NEXT:    movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
+; SSE41-NEXT:    addps %xmm5, %xmm0
+; SSE41-NEXT:    addps %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE41-NEXT:    addps %xmm5, %xmm1
+; SSE41-NEXT:    addps %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_8i32_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_8i32_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
+; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_8i32_to_8f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_8i32_to_8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_8i32_to_8f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %cvt = uitofp <8 x i32> %a to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
+; SSE2-LABEL: uitofp_8i16_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_8i16_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_8i16_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_8i16_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_8i16_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %cvt = uitofp <8 x i16> %a to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_8i8_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_8i8_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_8i8_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_8i8_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_8i8_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %cvt = uitofp <8 x i8> %shuf to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_16i8_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_16i8_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_16i8_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_16i8_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_16i8_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+  %cvt = uitofp <16 x i8> %a to <16 x float>
+  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuf
+}
+
+;
+; Load Signed Integer to Double
+;
+
+define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
+; SSE2-LABEL: sitofp_load_2i64_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm1
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_2i64_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm1
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_load_2i64_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm0
+; VEX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2pd (%rdi), %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <2 x i64>, <2 x i64> *%a
+  %cvt = sitofp <2 x i64> %ld to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
+; SSE-LABEL: sitofp_load_2i32_to_2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_2i32_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %ld = load <2 x i32>, <2 x i32> *%a
+  %cvt = sitofp <2 x i32> %ld to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
+; SSE2-LABEL: sitofp_load_2i16_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_2i16_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_2i16_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %ld = load <2 x i16>, <2 x i16> *%a
+  %cvt = sitofp <2 x i16> %ld to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
+; SSE2-LABEL: sitofp_load_2i8_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_2i8_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_2i8_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %ld = load <2 x i8>, <2 x i8> *%a
+  %cvt = sitofp <2 x i8> %ld to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
+; SSE2-LABEL: sitofp_load_4i64_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm1
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2sdq %rax, %xmm2
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_4i64_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm2
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm2
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2sdq %rax, %xmm1
+; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_load_4i64_to_4f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; VEX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; VEX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2pd (%rdi), %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <4 x i64>, <4 x i64> *%a
+  %cvt = sitofp <4 x i64> %ld to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
+; SSE-LABEL: sitofp_load_4i32_to_4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_4i32_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2pd (%rdi), %ymm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i32>, <4 x i32> *%a
+  %cvt = sitofp <4 x i32> %ld to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
+; SSE2-LABEL: sitofp_load_4i16_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_4i16_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd (%rdi), %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_4i16_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i16>, <4 x i16> *%a
+  %cvt = sitofp <4 x i16> %ld to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
+; SSE2-LABEL: sitofp_load_4i8_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_4i8_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd (%rdi), %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_4i8_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i8>, <4 x i8> *%a
+  %cvt = sitofp <4 x i8> %ld to <4 x double>
+  ret <4 x double> %cvt
+}
+
+;
+; Load Unsigned Integer to Double
+;
+
+define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
+; SSE2-LABEL: uitofp_load_2i64_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_2i64_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_load_2i64_to_2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_2i64_to_2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2pd (%rdi), %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <2 x i64>, <2 x i64> *%a
+  %cvt = uitofp <2 x i64> %ld to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
+; SSE2-LABEL: uitofp_load_2i32_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_2i32_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_load_2i32_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
+; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <2 x i32>, <2 x i32> *%a
+  %cvt = uitofp <2 x i32> %ld to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
+; SSE2-LABEL: uitofp_load_2i16_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_2i16_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_load_2i16_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %ld = load <2 x i16>, <2 x i16> *%a
+  %cvt = uitofp <2 x i16> %ld to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
+; SSE2-LABEL: uitofp_load_2i8_to_2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_2i8_to_2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_load_2i8_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %ld = load <2 x i8>, <2 x i8> *%a
+  %cvt = uitofp <2 x i8> %ld to <2 x double>
+  ret <2 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
+; SSE2-LABEL: uitofp_load_4i64_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT:    subpd %xmm6, %xmm0
+; SSE2-NEXT:    addpd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    subpd %xmm6, %xmm1
+; SSE2-NEXT:    addpd %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_4i64_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE41-NEXT:    subpd %xmm6, %xmm0
+; SSE41-NEXT:    addpd %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    por %xmm4, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    subpd %xmm6, %xmm1
+; SSE41-NEXT:    addpd %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_load_4i64_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
+; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_4i64_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2pd (%rdi), %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <4 x i64>, <4 x i64> *%a
+  %cvt = uitofp <4 x i64> %ld to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
+; SSE2-LABEL: uitofp_load_4i32_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
+; SSE2-NEXT:    mulpd %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE2-NEXT:    mulpd %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE2-NEXT:    addpd %xmm5, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_4i32_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
+; SSE41-NEXT:    mulpd %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE41-NEXT:    mulpd %xmm2, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE41-NEXT:    addpd %xmm5, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_load_4i32_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_4i32_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_load_4i32_to_4f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_load_4i32_to_4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <4 x i32>, <4 x i32> *%a
+  %cvt = uitofp <4 x i32> %ld to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
+; SSE2-LABEL: uitofp_load_4i16_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_4i16_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_load_4i16_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i16>, <4 x i16> *%a
+  %cvt = uitofp <4 x i16> %ld to <4 x double>
+  ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
+; SSE2-LABEL: uitofp_load_4i8_to_4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_4i8_to_4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_load_4i8_to_4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i8>, <4 x i8> *%a
+  %cvt = uitofp <4 x i8> %ld to <4 x double>
+  ret <4 x double> %cvt
+}
+
+;
+; Load Signed Integer to Float
+;
+
+define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
+; SSE2-LABEL: sitofp_load_4i64_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_4i64_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_load_4i64_to_4f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2psy (%rdi), %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <4 x i64>, <4 x i64> *%a
+  %cvt = sitofp <4 x i64> %ld to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
+; SSE-LABEL: sitofp_load_4i32_to_4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_4i32_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2ps (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i32>, <4 x i32> *%a
+  %cvt = sitofp <4 x i32> %ld to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
+; SSE2-LABEL: sitofp_load_4i16_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_4i16_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_4i16_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i16>, <4 x i16> *%a
+  %cvt = sitofp <4 x i16> %ld to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
+; SSE2-LABEL: sitofp_load_4i8_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_4i8_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_4i8_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i8>, <4 x i8> *%a
+  %cvt = sitofp <4 x i8> %ld to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
+; SSE2-LABEL: sitofp_load_8i64_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
+; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    xorps %xmm4, %xmm4
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_8i64_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    movdqa 32(%rdi), %xmm2
+; SSE41-NEXT:    movdqa 48(%rdi), %xmm3
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm4, %xmm4
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    pextrq $1, %xmm2, %rax
+; SSE41-NEXT:    xorps %xmm4, %xmm4
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
+; SSE41-NEXT:    movq %xmm3, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; SSE41-NEXT:    pextrq $1, %xmm3, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: sitofp_load_8i64_to_8f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vmovdqa 32(%rdi), %xmm2
+; VEX-NEXT:    vmovdqa 48(%rdi), %xmm3
+; VEX-NEXT:    vpextrq $1, %xmm2, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm4
+; VEX-NEXT:    vmovq %xmm2, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; VEX-NEXT:    vmovq %xmm3, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; VEX-NEXT:    vpextrq $1, %xmm3, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm1
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX512F-NEXT:    vmovq %xmm2, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512F-NEXT:    vmovq %xmm3, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm1
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX512VL-NEXT:    vmovq %xmm2, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512VL-NEXT:    vmovq %xmm3, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm1
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtqq2ps (%rdi), %ymm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtqq2ps (%rdi), %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <8 x i64>, <8 x i64> *%a
+  %cvt = sitofp <8 x i64> %ld to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
+; SSE-LABEL: sitofp_load_8i32_to_8f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
+; SSE-NEXT:    cvtdq2ps 16(%rdi), %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_load_8i32_to_8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2ps (%rdi), %ymm0
+; AVX-NEXT:    retq
+  %ld = load <8 x i32>, <8 x i32> *%a
+  %cvt = sitofp <8 x i32> %ld to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
+; SSE2-LABEL: sitofp_load_8i16_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_8i16_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
+; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_load_8i16_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_load_8i16_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_load_8i16_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %ld = load <8 x i16>, <8 x i16> *%a
+  %cvt = sitofp <8 x i16> %ld to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
+; SSE2-LABEL: sitofp_load_8i8_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sitofp_load_8i8_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
+; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sitofp_load_8i8_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sitofp_load_8i8_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sitofp_load_8i8_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %ld = load <8 x i8>, <8 x i8> *%a
+  %cvt = sitofp <8 x i8> %ld to <8 x float>
+  ret <8 x float> %cvt
+}
+
+;
+; Load Unsigned Integer to Float
+;
+
+define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
+; SSE2-LABEL: uitofp_load_4i64_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm2
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB76_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    jmp .LBB76_3
+; SSE2-NEXT:  .LBB76_1:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:  .LBB76_3:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB76_4
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE2-NEXT:    jmp .LBB76_6
+; SSE2-NEXT:  .LBB76_4:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE2-NEXT:    addss %xmm3, %xmm3
+; SSE2-NEXT:  .LBB76_6:
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB76_7
+; SSE2-NEXT:  # %bb.8:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    jmp .LBB76_9
+; SSE2-NEXT:  .LBB76_7:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:  .LBB76_9:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB76_10
+; SSE2-NEXT:  # %bb.11:
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    jmp .LBB76_12
+; SSE2-NEXT:  .LBB76_10:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    addss %xmm2, %xmm2
+; SSE2-NEXT:  .LBB76_12:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_4i64_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB76_1
+; SSE41-NEXT:  # %bb.2:
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    jmp .LBB76_3
+; SSE41-NEXT:  .LBB76_1:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    addss %xmm2, %xmm2
+; SSE41-NEXT:  .LBB76_3:
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB76_4
+; SSE41-NEXT:  # %bb.5:
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    jmp .LBB76_6
+; SSE41-NEXT:  .LBB76_4:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    addss %xmm0, %xmm0
+; SSE41-NEXT:  .LBB76_6:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB76_7
+; SSE41-NEXT:  # %bb.8:
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    jmp .LBB76_9
+; SSE41-NEXT:  .LBB76_7:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    addss %xmm2, %xmm2
+; SSE41-NEXT:  .LBB76_9:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB76_10
+; SSE41-NEXT:  # %bb.11:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+; SSE41-NEXT:  .LBB76_10:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    addss %xmm1, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_load_4i64_to_4f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm2
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm0
+; VEX-NEXT:    vpextrq $1, %xmm2, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB76_1
+; VEX-NEXT:  # %bb.2:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    jmp .LBB76_3
+; VEX-NEXT:  .LBB76_1:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; VEX-NEXT:  .LBB76_3:
+; VEX-NEXT:    vmovq %xmm2, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB76_4
+; VEX-NEXT:  # %bb.5:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    jmp .LBB76_6
+; VEX-NEXT:  .LBB76_4:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; VEX-NEXT:  .LBB76_6:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB76_7
+; VEX-NEXT:  # %bb.8:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    jmp .LBB76_9
+; VEX-NEXT:  .LBB76_7:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; VEX-NEXT:  .LBB76_9:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB76_10
+; VEX-NEXT:  # %bb.11:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VEX-NEXT:    retq
+; VEX-NEXT:  .LBB76_10:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2psy (%rdi), %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <4 x i64>, <4 x i64> *%a
+  %cvt = uitofp <4 x i64> %ld to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
+; SSE2-LABEL: uitofp_load_4i32_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_4i32_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; SSE41-NEXT:    addps {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_load_4i32_to_4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_4i32_to_4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
+; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_load_4i32_to_4f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2ps (%rdi), %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2ps (%rdi), %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <4 x i32>, <4 x i32> *%a
+  %cvt = uitofp <4 x i32> %ld to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
+; SSE2-LABEL: uitofp_load_4i16_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_4i16_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_load_4i16_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i16>, <4 x i16> *%a
+  %cvt = uitofp <4 x i16> %ld to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
+; SSE2-LABEL: uitofp_load_4i8_to_4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_4i8_to_4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: uitofp_load_4i8_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %ld = load <4 x i8>, <4 x i8> *%a
+  %cvt = uitofp <4 x i8> %ld to <4 x float>
+  ret <4 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
+; SSE2-LABEL: uitofp_load_8i64_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm5
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
+; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB80_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE2-NEXT:    jmp .LBB80_3
+; SSE2-NEXT:  .LBB80_1:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE2-NEXT:    addss %xmm3, %xmm3
+; SSE2-NEXT:  .LBB80_3:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB80_4
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE2-NEXT:    jmp .LBB80_6
+; SSE2-NEXT:  .LBB80_4:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE2-NEXT:    addss %xmm4, %xmm4
+; SSE2-NEXT:  .LBB80_6:
+; SSE2-NEXT:    movq %xmm5, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB80_7
+; SSE2-NEXT:  # %bb.8:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    jmp .LBB80_9
+; SSE2-NEXT:  .LBB80_7:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:  .LBB80_9:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT:    movq %xmm5, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB80_10
+; SSE2-NEXT:  # %bb.11:
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm6
+; SSE2-NEXT:    jmp .LBB80_12
+; SSE2-NEXT:  .LBB80_10:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm6
+; SSE2-NEXT:    addss %xmm6, %xmm6
+; SSE2-NEXT:  .LBB80_12:
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB80_13
+; SSE2-NEXT:  # %bb.14:
+; SSE2-NEXT:    xorps %xmm5, %xmm5
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm5
+; SSE2-NEXT:    jmp .LBB80_15
+; SSE2-NEXT:  .LBB80_13:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm5, %xmm5
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm5
+; SSE2-NEXT:    addss %xmm5, %xmm5
+; SSE2-NEXT:  .LBB80_15:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB80_16
+; SSE2-NEXT:  # %bb.17:
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm7
+; SSE2-NEXT:    jmp .LBB80_18
+; SSE2-NEXT:  .LBB80_16:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm7
+; SSE2-NEXT:    addss %xmm7, %xmm7
+; SSE2-NEXT:  .LBB80_18:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB80_19
+; SSE2-NEXT:  # %bb.20:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    jmp .LBB80_21
+; SSE2-NEXT:  .LBB80_19:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:  .LBB80_21:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB80_22
+; SSE2-NEXT:  # %bb.23:
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    jmp .LBB80_24
+; SSE2-NEXT:  .LBB80_22:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT:    addss %xmm2, %xmm2
+; SSE2-NEXT:  .LBB80_24:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_8i64_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm4
+; SSE41-NEXT:    movdqa 32(%rdi), %xmm1
+; SSE41-NEXT:    movdqa 48(%rdi), %xmm2
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB80_1
+; SSE41-NEXT:  # %bb.2:
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE41-NEXT:    jmp .LBB80_3
+; SSE41-NEXT:  .LBB80_1:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE41-NEXT:    addss %xmm3, %xmm3
+; SSE41-NEXT:  .LBB80_3:
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB80_4
+; SSE41-NEXT:  # %bb.5:
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    jmp .LBB80_6
+; SSE41-NEXT:  .LBB80_4:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT:    addss %xmm0, %xmm0
+; SSE41-NEXT:  .LBB80_6:
+; SSE41-NEXT:    movq %xmm4, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB80_7
+; SSE41-NEXT:  # %bb.8:
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm5
+; SSE41-NEXT:    jmp .LBB80_9
+; SSE41-NEXT:  .LBB80_7:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm5
+; SSE41-NEXT:    addss %xmm5, %xmm5
+; SSE41-NEXT:  .LBB80_9:
+; SSE41-NEXT:    pextrq $1, %xmm4, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB80_10
+; SSE41-NEXT:  # %bb.11:
+; SSE41-NEXT:    xorps %xmm4, %xmm4
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE41-NEXT:    jmp .LBB80_12
+; SSE41-NEXT:  .LBB80_10:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm4, %xmm4
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm4
+; SSE41-NEXT:    addss %xmm4, %xmm4
+; SSE41-NEXT:  .LBB80_12:
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB80_13
+; SSE41-NEXT:  # %bb.14:
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm6
+; SSE41-NEXT:    jmp .LBB80_15
+; SSE41-NEXT:  .LBB80_13:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm6
+; SSE41-NEXT:    addss %xmm6, %xmm6
+; SSE41-NEXT:  .LBB80_15:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB80_16
+; SSE41-NEXT:  # %bb.17:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    jmp .LBB80_18
+; SSE41-NEXT:  .LBB80_16:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT:    addss %xmm1, %xmm1
+; SSE41-NEXT:  .LBB80_18:
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3]
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB80_19
+; SSE41-NEXT:  # %bb.20:
+; SSE41-NEXT:    xorps %xmm3, %xmm3
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE41-NEXT:    jmp .LBB80_21
+; SSE41-NEXT:  .LBB80_19:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm3, %xmm3
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm3
+; SSE41-NEXT:    addss %xmm3, %xmm3
+; SSE41-NEXT:  .LBB80_21:
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; SSE41-NEXT:    pextrq $1, %xmm2, %rax
+; SSE41-NEXT:    testq %rax, %rax
+; SSE41-NEXT:    js .LBB80_22
+; SSE41-NEXT:  # %bb.23:
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; SSE41-NEXT:    retq
+; SSE41-NEXT:  .LBB80_22:
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shrq %rcx
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    orq %rcx, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT:    addss %xmm2, %xmm2
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; VEX-LABEL: uitofp_load_8i64_to_8f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm1
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 32(%rdi), %xmm4
+; VEX-NEXT:    vmovdqa 48(%rdi), %xmm3
+; VEX-NEXT:    vpextrq $1, %xmm4, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_1
+; VEX-NEXT:  # %bb.2:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; VEX-NEXT:    jmp .LBB80_3
+; VEX-NEXT:  .LBB80_1:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; VEX-NEXT:  .LBB80_3:
+; VEX-NEXT:    vmovq %xmm4, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_4
+; VEX-NEXT:  # %bb.5:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm5
+; VEX-NEXT:    jmp .LBB80_6
+; VEX-NEXT:  .LBB80_4:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; VEX-NEXT:    vaddss %xmm4, %xmm4, %xmm5
+; VEX-NEXT:  .LBB80_6:
+; VEX-NEXT:    vmovq %xmm3, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_7
+; VEX-NEXT:  # %bb.8:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm4
+; VEX-NEXT:    jmp .LBB80_9
+; VEX-NEXT:  .LBB80_7:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm4
+; VEX-NEXT:    vaddss %xmm4, %xmm4, %xmm4
+; VEX-NEXT:  .LBB80_9:
+; VEX-NEXT:    vpextrq $1, %xmm3, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_10
+; VEX-NEXT:  # %bb.11:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm3
+; VEX-NEXT:    jmp .LBB80_12
+; VEX-NEXT:  .LBB80_10:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm3
+; VEX-NEXT:    vaddss %xmm3, %xmm3, %xmm3
+; VEX-NEXT:  .LBB80_12:
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_13
+; VEX-NEXT:  # %bb.14:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm6
+; VEX-NEXT:    jmp .LBB80_15
+; VEX-NEXT:  .LBB80_13:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm6
+; VEX-NEXT:    vaddss %xmm6, %xmm6, %xmm6
+; VEX-NEXT:  .LBB80_15:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3]
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_16
+; VEX-NEXT:  # %bb.17:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm1
+; VEX-NEXT:    jmp .LBB80_18
+; VEX-NEXT:  .LBB80_16:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm1
+; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; VEX-NEXT:  .LBB80_18:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3]
+; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3]
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_19
+; VEX-NEXT:  # %bb.20:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm2
+; VEX-NEXT:    jmp .LBB80_21
+; VEX-NEXT:  .LBB80_19:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm2
+; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; VEX-NEXT:  .LBB80_21:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3]
+; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_22
+; VEX-NEXT:  # %bb.23:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm0
+; VEX-NEXT:    jmp .LBB80_24
+; VEX-NEXT:  .LBB80_22:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm0
+; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; VEX-NEXT:  .LBB80_24:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm4
+; AVX512F-NEXT:    vmovq %xmm2, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512F-NEXT:    vmovq %xmm3, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm1
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm4
+; AVX512VL-NEXT:    vmovq %xmm2, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512VL-NEXT:    vmovq %xmm3, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm1
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtuqq2ps (%rdi), %ymm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtuqq2ps (%rdi), %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <8 x i64>, <8 x i64> *%a
+  %cvt = uitofp <8 x i64> %ld to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
+; SSE2-LABEL: uitofp_load_8i32_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
+; SSE2-NEXT:    addps %xmm6, %xmm0
+; SSE2-NEXT:    addps %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    addps %xmm6, %xmm1
+; SSE2-NEXT:    addps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_8i32_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE41-NEXT:    movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
+; SSE41-NEXT:    addps %xmm5, %xmm0
+; SSE41-NEXT:    addps %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE41-NEXT:    addps %xmm5, %xmm1
+; SSE41-NEXT:    addps %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_load_8i32_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_8i32_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
+; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: uitofp_load_8i32_to_8f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: uitofp_load_8i32_to_8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2ps (%rdi), %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2ps (%rdi), %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %ld = load <8 x i32>, <8 x i32> *%a
+  %cvt = uitofp <8 x i32> %ld to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
+; SSE2-LABEL: uitofp_load_8i16_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_8i16_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_load_8i16_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_8i16_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_load_8i16_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %ld = load <8 x i16>, <8 x i16> *%a
+  %cvt = uitofp <8 x i16> %ld to <8 x float>
+  ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
+; SSE2-LABEL: uitofp_load_8i8_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uitofp_load_8i8_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uitofp_load_8i8_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_8i8_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_load_8i8_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %ld = load <8 x i8>, <8 x i8> *%a
+  %cvt = uitofp <8 x i8> %ld to <8 x float>
+  ret <8 x float> %cvt
+}
+
+;
+; Aggregates
+;
+
+%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
+define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
+; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq 24(%rdi), %rax
+; SSE2-NEXT:    movdqu 8(%rdi), %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT:    movaps %xmm0, 16(%rax)
+; SSE2-NEXT:    movaps %xmm1, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq 24(%rdi), %rax
+; SSE41-NEXT:    pmovsxwd 16(%rdi), %xmm0
+; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
+; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
+; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT:    movaps %xmm0, 16(%rax)
+; SSE41-NEXT:    movaps %xmm1, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq 24(%rdi), %rax
+; AVX1-NEXT:    vpmovsxwd 16(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps %ymm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq 24(%rdi), %rax
+; AVX2-NEXT:    vpmovsxwd 8(%rdi), %ymm0
+; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT:    vmovaps %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq 24(%rdi), %rax
+; AVX512-NEXT:    vpmovsxwd 8(%rdi), %ymm0
+; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT:    vmovaps %ymm0, (%rax)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+ %1 = load %Arguments, %Arguments* %a0, align 1
+ %2 = extractvalue %Arguments %1, 1
+ %3 = extractvalue %Arguments %1, 2
+ %4 = sitofp <8 x i16> %2 to <8 x float>
+ store <8 x float> %4, <8 x float>* %3, align 32
+ ret void
+}
+
+define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
+; SSE-LABEL: sitofp_i32_to_2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsi2sdl %edi, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i32_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = sitofp i32 %a1 to double
+  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  ret <2 x double> %res
+}
+
+define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
+; SSE-LABEL: sitofp_i32_to_4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsi2ssl %edi, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i32_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = sitofp i32 %a1 to float
+  %res = insertelement <4 x float> %a0, float %cvt, i32 0
+  ret <4 x float> %res
+}
+
+define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
+; SSE-LABEL: sitofp_i64_to_2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsi2sdq %rdi, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i64_to_2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = sitofp i64 %a1 to double
+  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  ret <2 x double> %res
+}
+
+define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
+; SSE-LABEL: sitofp_i64_to_4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsi2ssq %rdi, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i64_to_4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = sitofp i64 %a1 to float
+  %res = insertelement <4 x float> %a0, float %cvt, i32 0
+  ret <4 x float> %res
+}
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 9ea75e493517729b5b3366859332129fbb3fc6da..1ccd636a7c172eafe444f37a6f27f97214a63869 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -579,7 +579,7 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
 define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; SSE2-LABEL: uitofp_2i32_to_2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
 ; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm0
@@ -591,7 +591,7 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; SSE41-LABEL: uitofp_2i32_to_2f64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; SSE41-NEXT:    psrld $16, %xmm0
 ; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
@@ -602,7 +602,7 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; VEX-LABEL: uitofp_2i32_to_2f64:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
 ; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
 ; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
@@ -643,7 +643,7 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; SSE2-LABEL: uitofp_4i32_to_2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
 ; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm0
@@ -655,7 +655,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; SSE41-LABEL: uitofp_4i32_to_2f64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; SSE41-NEXT:    psrld $16, %xmm0
 ; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
@@ -940,17 +940,18 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE2-NEXT:    mulpd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm4
-; SSE2-NEXT:    mulpd %xmm2, %xmm4
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    cvtdq2pd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm4, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE2-NEXT:    mulpd %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE2-NEXT:    addpd %xmm5, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_4i32_to_4f64:
@@ -962,15 +963,16 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; SSE41-NEXT:    mulpd %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3,4,5,6,7]
-; SSE41-NEXT:    psrld $16, %xmm4
-; SSE41-NEXT:    cvtdq2pd %xmm4, %xmm4
-; SSE41-NEXT:    mulpd %xmm2, %xmm4
-; SSE41-NEXT:    cvtdq2pd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE41-NEXT:    mulpd %xmm2, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE41-NEXT:    addpd %xmm5, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: uitofp_4i32_to_4f64:
@@ -1259,29 +1261,29 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
 ;
 ; VEX-LABEL: sitofp_2i64_to_4f32_zero:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
 ; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_2i64_to_4f32_zero:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX512VL-NEXT:    retq
 ;
@@ -2058,7 +2060,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
 ;
 ; VEX-LABEL: uitofp_2i64_to_2f32:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vmovq %xmm0, %rax
 ; VEX-NEXT:    testq %rax, %rax
 ; VEX-NEXT:    js .LBB40_1
 ; VEX-NEXT:  # %bb.2:
@@ -2072,12 +2074,12 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
 ; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
 ; VEX-NEXT:  .LBB40_3:
-; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
 ; VEX-NEXT:    testq %rax, %rax
 ; VEX-NEXT:    js .LBB40_4
 ; VEX-NEXT:  # %bb.5:
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; VEX-NEXT:    retq
 ; VEX-NEXT:  .LBB40_4:
 ; VEX-NEXT:    movq %rax, %rcx
@@ -2086,25 +2088,25 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
 ; VEX-NEXT:    orq %rcx, %rax
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
 ; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_2i64_to_2f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_2i64_to_2f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX512VL-NEXT:    retq
 ;
@@ -3254,44 +3256,27 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: sitofp_load_4i64_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
-; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_load_4i64_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; VEX-LABEL: sitofp_load_4i64_to_4f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm1
+; VEX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm2
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm0
+; VEX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
 ; AVX512F-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
 ; AVX512F-NEXT:    vmovq %xmm1, %rax
@@ -3307,8 +3292,8 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ;
 ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
 ; AVX512VL-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vmovq %xmm1, %rax
@@ -3525,7 +3510,7 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; SSE41-NEXT:    psrld $16, %xmm0
 ; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
@@ -3537,7 +3522,7 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
 ; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
 ; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
@@ -3680,17 +3665,17 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ;
 ; AVX1-LABEL: uitofp_load_4i64_to_4f64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
+; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: uitofp_load_4i64_to_4f64:
@@ -3759,17 +3744,18 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE2-NEXT:    mulpd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm4
-; SSE2-NEXT:    mulpd %xmm2, %xmm4
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    cvtdq2pd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm4, %xmm1
+; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE2-NEXT:    mulpd %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE2-NEXT:    addpd %xmm5, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_load_4i32_to_4f64:
@@ -3782,15 +3768,16 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; SSE41-NEXT:    mulpd %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3,4,5,6,7]
-; SSE41-NEXT:    psrld $16, %xmm4
-; SSE41-NEXT:    cvtdq2pd %xmm4, %xmm4
-; SSE41-NEXT:    mulpd %xmm2, %xmm4
-; SSE41-NEXT:    cvtdq2pd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE41-NEXT:    mulpd %xmm2, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
+; SSE41-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE41-NEXT:    addpd %xmm5, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: uitofp_load_4i32_to_4f64:
@@ -3952,76 +3939,55 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: sitofp_load_4i64_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_load_4i64_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; VEX-LABEL: sitofp_load_4i64_to_4f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
 ; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
@@ -4186,128 +4152,97 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: sitofp_load_8i64_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm1
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_load_8i64_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm1
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; VEX-LABEL: sitofp_load_8i64_to_8f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vmovdqa 32(%rdi), %xmm2
+; VEX-NEXT:    vmovdqa 48(%rdi), %xmm3
+; VEX-NEXT:    vpextrq $1, %xmm2, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm4
+; VEX-NEXT:    vmovq %xmm2, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; VEX-NEXT:    vmovq %xmm3, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; VEX-NEXT:    vpextrq $1, %xmm3, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm1
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
 ; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX512F-NEXT:    vmovq %xmm2, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512F-NEXT:    vmovq %xmm3, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm1
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
 ; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX512VL-NEXT:    vmovq %xmm2, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512VL-NEXT:    vmovq %xmm3, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm1
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
@@ -4581,174 +4516,103 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: uitofp_load_4i64_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB76_1
-; AVX1-NEXT:  # %bb.2:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX1-NEXT:    jmp .LBB76_3
-; AVX1-NEXT:  .LBB76_1:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:  .LBB76_3:
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB76_4
-; AVX1-NEXT:  # %bb.5:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX1-NEXT:    jmp .LBB76_6
-; AVX1-NEXT:  .LBB76_4:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:  .LBB76_6:
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB76_7
-; AVX1-NEXT:  # %bb.8:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX1-NEXT:    jmp .LBB76_9
-; AVX1-NEXT:  .LBB76_7:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:  .LBB76_9:
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB76_10
-; AVX1-NEXT:  # %bb.11:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-; AVX1-NEXT:  .LBB76_10:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_4i64_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB76_1
-; AVX2-NEXT:  # %bb.2:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX2-NEXT:    jmp .LBB76_3
-; AVX2-NEXT:  .LBB76_1:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:  .LBB76_3:
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB76_4
-; AVX2-NEXT:  # %bb.5:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX2-NEXT:    jmp .LBB76_6
-; AVX2-NEXT:  .LBB76_4:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:  .LBB76_6:
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB76_7
-; AVX2-NEXT:  # %bb.8:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX2-NEXT:    jmp .LBB76_9
-; AVX2-NEXT:  .LBB76_7:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:  .LBB76_9:
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB76_10
-; AVX2-NEXT:  # %bb.11:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-; AVX2-NEXT:  .LBB76_10:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; VEX-LABEL: uitofp_load_4i64_to_4f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm2
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm0
+; VEX-NEXT:    vpextrq $1, %xmm2, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB76_1
+; VEX-NEXT:  # %bb.2:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    jmp .LBB76_3
+; VEX-NEXT:  .LBB76_1:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; VEX-NEXT:  .LBB76_3:
+; VEX-NEXT:    vmovq %xmm2, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB76_4
+; VEX-NEXT:  # %bb.5:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    jmp .LBB76_6
+; VEX-NEXT:  .LBB76_4:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; VEX-NEXT:  .LBB76_6:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB76_7
+; VEX-NEXT:  # %bb.8:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    jmp .LBB76_9
+; VEX-NEXT:  .LBB76_7:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; VEX-NEXT:  .LBB76_9:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB76_10
+; VEX-NEXT:  # %bb.11:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VEX-NEXT:    retq
+; VEX-NEXT:  .LBB76_10:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
 ; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
@@ -5171,320 +5035,193 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: uitofp_load_8i64_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB80_1
-; AVX1-NEXT:  # %bb.2:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX1-NEXT:    jmp .LBB80_3
-; AVX1-NEXT:  .LBB80_1:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:  .LBB80_3:
-; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB80_4
-; AVX1-NEXT:  # %bb.5:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm4
-; AVX1-NEXT:    jmp .LBB80_6
-; AVX1-NEXT:  .LBB80_4:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX1-NEXT:    vaddss %xmm3, %xmm3, %xmm4
-; AVX1-NEXT:  .LBB80_6:
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB80_7
-; AVX1-NEXT:  # %bb.8:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; AVX1-NEXT:    jmp .LBB80_9
-; AVX1-NEXT:  .LBB80_7:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; AVX1-NEXT:    vaddss %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:  .LBB80_9:
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB80_10
-; AVX1-NEXT:  # %bb.11:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
-; AVX1-NEXT:    jmp .LBB80_12
-; AVX1-NEXT:  .LBB80_10:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
-; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:  .LBB80_12:
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB80_13
-; AVX1-NEXT:  # %bb.14:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm5
-; AVX1-NEXT:    jmp .LBB80_15
-; AVX1-NEXT:  .LBB80_13:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm5
-; AVX1-NEXT:    vaddss %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:  .LBB80_15:
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB80_16
-; AVX1-NEXT:  # %bb.17:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm4
-; AVX1-NEXT:    jmp .LBB80_18
-; AVX1-NEXT:  .LBB80_16:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm4
-; AVX1-NEXT:    vaddss %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:  .LBB80_18:
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB80_19
-; AVX1-NEXT:  # %bb.20:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX1-NEXT:    jmp .LBB80_21
-; AVX1-NEXT:  .LBB80_19:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:  .LBB80_21:
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB80_22
-; AVX1-NEXT:  # %bb.23:
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm1
-; AVX1-NEXT:    jmp .LBB80_24
-; AVX1-NEXT:  .LBB80_22:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm1
-; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:  .LBB80_24:
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_8i64_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB80_1
-; AVX2-NEXT:  # %bb.2:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX2-NEXT:    jmp .LBB80_3
-; AVX2-NEXT:  .LBB80_1:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:  .LBB80_3:
-; AVX2-NEXT:    vmovq %xmm2, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB80_4
-; AVX2-NEXT:  # %bb.5:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm4
-; AVX2-NEXT:    jmp .LBB80_6
-; AVX2-NEXT:  .LBB80_4:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX2-NEXT:    vaddss %xmm3, %xmm3, %xmm4
-; AVX2-NEXT:  .LBB80_6:
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vmovq %xmm2, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB80_7
-; AVX2-NEXT:  # %bb.8:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; AVX2-NEXT:    jmp .LBB80_9
-; AVX2-NEXT:  .LBB80_7:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm3
-; AVX2-NEXT:    vaddss %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:  .LBB80_9:
-; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB80_10
-; AVX2-NEXT:  # %bb.11:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
-; AVX2-NEXT:    jmp .LBB80_12
-; AVX2-NEXT:  .LBB80_10:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm2
-; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:  .LBB80_12:
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB80_13
-; AVX2-NEXT:  # %bb.14:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm5
-; AVX2-NEXT:    jmp .LBB80_15
-; AVX2-NEXT:  .LBB80_13:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm5
-; AVX2-NEXT:    vaddss %xmm5, %xmm5, %xmm5
-; AVX2-NEXT:  .LBB80_15:
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB80_16
-; AVX2-NEXT:  # %bb.17:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm4
-; AVX2-NEXT:    jmp .LBB80_18
-; AVX2-NEXT:  .LBB80_16:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm4
-; AVX2-NEXT:    vaddss %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:  .LBB80_18:
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vmovq %xmm3, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB80_19
-; AVX2-NEXT:  # %bb.20:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX2-NEXT:    jmp .LBB80_21
-; AVX2-NEXT:  .LBB80_19:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:  .LBB80_21:
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB80_22
-; AVX2-NEXT:  # %bb.23:
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm1
-; AVX2-NEXT:    jmp .LBB80_24
-; AVX2-NEXT:  .LBB80_22:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm1
-; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:  .LBB80_24:
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
+; VEX-LABEL: uitofp_load_8i64_to_8f32:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vmovdqa (%rdi), %xmm1
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 32(%rdi), %xmm4
+; VEX-NEXT:    vmovdqa 48(%rdi), %xmm3
+; VEX-NEXT:    vpextrq $1, %xmm4, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_1
+; VEX-NEXT:  # %bb.2:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; VEX-NEXT:    jmp .LBB80_3
+; VEX-NEXT:  .LBB80_1:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; VEX-NEXT:  .LBB80_3:
+; VEX-NEXT:    vmovq %xmm4, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_4
+; VEX-NEXT:  # %bb.5:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm5
+; VEX-NEXT:    jmp .LBB80_6
+; VEX-NEXT:  .LBB80_4:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm5, %xmm4
+; VEX-NEXT:    vaddss %xmm4, %xmm4, %xmm5
+; VEX-NEXT:  .LBB80_6:
+; VEX-NEXT:    vmovq %xmm3, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_7
+; VEX-NEXT:  # %bb.8:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm4
+; VEX-NEXT:    jmp .LBB80_9
+; VEX-NEXT:  .LBB80_7:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm4
+; VEX-NEXT:    vaddss %xmm4, %xmm4, %xmm4
+; VEX-NEXT:  .LBB80_9:
+; VEX-NEXT:    vpextrq $1, %xmm3, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_10
+; VEX-NEXT:  # %bb.11:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm3
+; VEX-NEXT:    jmp .LBB80_12
+; VEX-NEXT:  .LBB80_10:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm3
+; VEX-NEXT:    vaddss %xmm3, %xmm3, %xmm3
+; VEX-NEXT:  .LBB80_12:
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_13
+; VEX-NEXT:  # %bb.14:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm6
+; VEX-NEXT:    jmp .LBB80_15
+; VEX-NEXT:  .LBB80_13:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm6, %xmm6
+; VEX-NEXT:    vaddss %xmm6, %xmm6, %xmm6
+; VEX-NEXT:  .LBB80_15:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3]
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_16
+; VEX-NEXT:  # %bb.17:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm1
+; VEX-NEXT:    jmp .LBB80_18
+; VEX-NEXT:  .LBB80_16:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm1
+; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
+; VEX-NEXT:  .LBB80_18:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3]
+; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3]
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_19
+; VEX-NEXT:  # %bb.20:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm2
+; VEX-NEXT:    jmp .LBB80_21
+; VEX-NEXT:  .LBB80_19:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm2
+; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
+; VEX-NEXT:  .LBB80_21:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3]
+; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    testq %rax, %rax
+; VEX-NEXT:    js .LBB80_22
+; VEX-NEXT:  # %bb.23:
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm0
+; VEX-NEXT:    jmp .LBB80_24
+; VEX-NEXT:  .LBB80_22:
+; VEX-NEXT:    movq %rax, %rcx
+; VEX-NEXT:    shrq %rcx
+; VEX-NEXT:    andl $1, %eax
+; VEX-NEXT:    orq %rcx, %rax
+; VEX-NEXT:    vcvtsi2ssq %rax, %xmm7, %xmm0
+; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; VEX-NEXT:  .LBB80_24:
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
 ; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm4
+; AVX512F-NEXT:    vmovq %xmm2, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512F-NEXT:    vmovq %xmm3, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm1
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
 ; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm4
+; AVX512VL-NEXT:    vmovq %xmm2, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512VL-NEXT:    vmovq %xmm3, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm4
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm5, %xmm1
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
@@ -5547,9 +5284,10 @@ define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
 ;
 ; AVX1-LABEL: uitofp_load_8i32_to_8f32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovaps (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
diff --git a/test/CodeGen/X86/vec_minmax_match.ll b/test/CodeGen/X86/vec_minmax_match.ll
index c3652f3a84ddc41f7aab6469704b6f9e8bb24f86..4d6bb799fe3e35628f2abb6e7c600c28f8a8f715 100644
--- a/test/CodeGen/X86/vec_minmax_match.ll
+++ b/test/CodeGen/X86/vec_minmax_match.ll
@@ -223,12 +223,11 @@ define <4 x i32> @wrong_pred_for_smin_with_not(<4 x i32> %x) {
 ; CHECK-LABEL: wrong_pred_for_smin_with_not:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm2
-; CHECK-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [4294967291,4294967291,4294967291,4294967291]
-; CHECK-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; CHECK-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [4294967291,4294967291,4294967291,4294967291]
+; CHECK-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %not_x = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
   %cmp = icmp ugt <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
diff --git a/test/CodeGen/X86/vec_minmax_sint.ll b/test/CodeGen/X86/vec_minmax_sint.ll
index a0b620aef653a3094a1a614d0f151887966294be..a6afe94f0b78c54891dcfcb05d626c459b7cca0b 100644
--- a/test/CodeGen/X86/vec_minmax_sint.ll
+++ b/test/CodeGen/X86/vec_minmax_sint.ll
@@ -42,10 +42,9 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -122,17 +121,16 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
 ; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -142,10 +140,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
@@ -435,10 +432,9 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -515,17 +511,16 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
 ; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -535,10 +530,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
@@ -828,10 +822,9 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -916,10 +909,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
@@ -928,10 +920,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
@@ -1216,10 +1207,9 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -1304,10 +1294,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
@@ -1316,10 +1305,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
diff --git a/test/CodeGen/X86/vec_minmax_uint.ll b/test/CodeGen/X86/vec_minmax_uint.ll
index 3f1b8ac94e561e7c55cfde3d837916b5d5882111..4f8477a73c468c75e6e2b4387b983e56d5bdbc1c 100644
--- a/test/CodeGen/X86/vec_minmax_uint.ll
+++ b/test/CodeGen/X86/vec_minmax_uint.ll
@@ -42,10 +42,9 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -132,17 +131,16 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
 ; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -152,10 +150,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
@@ -463,10 +460,9 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -553,17 +549,16 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
 ; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -573,10 +568,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
@@ -884,10 +878,9 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -981,10 +974,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
@@ -993,10 +985,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
@@ -1304,10 +1295,9 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -1401,10 +1391,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
@@ -1413,10 +1402,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
diff --git a/test/CodeGen/X86/vec_setcc-2.ll b/test/CodeGen/X86/vec_setcc-2.ll
index ba9ff7722b470285284cfad3a2dbe286c8a891a6..946c9fcdaa32f9e164fa28bc2e5e250e76099568 100644
--- a/test/CodeGen/X86/vec_setcc-2.ll
+++ b/test/CodeGen/X86/vec_setcc-2.ll
@@ -32,17 +32,15 @@ define void @loop_no_const_reload(<2 x i64>*  %in, <2 x i64>* %out, i32 %n) {
 ; SSE41-NEXT:    je LBB0_3
 ; SSE41-NEXT:  ## %bb.1: ## %for.body.preheader
 ; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [26,26,26,26,26,26,26,26]
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25]
 ; SSE41-NEXT:    .p2align 4, 0x90
 ; SSE41-NEXT:  LBB0_2: ## %for.body
 ; SSE41-NEXT:    ## =>This Inner Loop Header: Depth=1
-; SSE41-NEXT:    movdqa (%rdi,%rax), %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pmaxuw %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSE41-NEXT:    pxor %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, (%rsi,%rax)
+; SSE41-NEXT:    movdqa (%rdi,%rax), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pminuw %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, (%rsi,%rax)
 ; SSE41-NEXT:    addq $16, %rax
 ; SSE41-NEXT:    decl %edx
 ; SSE41-NEXT:    jne LBB0_2
@@ -146,11 +144,9 @@ for.end:                                          ; preds = %for.body, %entry
 define <16 x i8> @test_ult_byte(<16 x i8> %a) {
 ; CHECK-LABEL: test_ult_byte:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
-; CHECK-NEXT:    pmaxub %xmm0, %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; CHECK-NEXT:    pminub %xmm0, %xmm1
 ; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
-; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %icmp = icmp ult <16 x i8> %a, <i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11>
@@ -183,3 +179,390 @@ entry:
   %sext = sext <8 x i1> %icmp to <8 x i16>
   ret <8 x i16> %sext
 }
+
+define <16 x i1> @ugt_v16i8_splat(<16 x i8> %x) {
+; CHECK-LABEL: ugt_v16i8_splat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [43,43,43,43,43,43,43,43,43,43,43,43,43,43,43,43]
+; CHECK-NEXT:    pmaxub %xmm0, %xmm1
+; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = icmp ugt <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @ugt_v8i16_splat(<8 x i16> %x) {
+; SSE2-LABEL: ugt_v8i16_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ugt_v8i16_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]
+; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ugt <8 x i16> %x, <i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242>
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @ugt_v4i32_splat(<4 x i32> %x) {
+; SSE2-LABEL: ugt_v4i32_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ugt_v4i32_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967255,4294967255,4294967255,4294967255]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> %x, <i32 -42, i32 -42, i32 -42, i32 -42>
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @ugt_v2i64_splat(<2 x i64> %x) {
+; SSE2-LABEL: ugt_v2i64_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ugt_v2i64_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtq {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ugt <2 x i64> %x, <i64 442, i64 442>
+  ret <2 x i1> %cmp
+}
+
+define <16 x i1> @uge_v16i8_splat(<16 x i8> %x) {
+; CHECK-LABEL: uge_v16i8_splat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; CHECK-NEXT:    pmaxub %xmm0, %xmm1
+; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = icmp uge <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @uge_v8i16_splat(<8 x i16> %x) {
+; SSE2-LABEL: uge_v8i16_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242]
+; SSE2-NEXT:    psubusw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uge_v8i16_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242]
+; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp uge <8 x i16> %x, <i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242>
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @uge_v4i32_splat(<4 x i32> %x) {
+; SSE2-LABEL: uge_v4i32_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483606,2147483606,2147483606,2147483606]
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uge_v4i32_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp uge <4 x i32> %x, <i32 -42, i32 -42, i32 -42, i32 -42>
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @uge_v2i64_splat(<2 x i64> %x) {
+; SSE2-LABEL: uge_v2i64_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: uge_v2i64_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854776250,9223372036854776250]
+; SSE41-NEXT:    pcmpgtq %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp uge <2 x i64> %x, <i64 442, i64 442>
+  ret <2 x i1> %cmp
+}
+
+define <16 x i1> @ult_v16i8_splat(<16 x i8> %x) {
+; CHECK-LABEL: ult_v16i8_splat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41]
+; CHECK-NEXT:    pminub %xmm0, %xmm1
+; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = icmp ult <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @ult_v8i16_splat(<8 x i16> %x) {
+; SSE2-LABEL: ult_v8i16_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    psubusw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ult_v8i16_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [241,241,241,241,241,241,241,241]
+; SSE41-NEXT:    pminuw %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ult <8 x i16> %x, <i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242>
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @ult_v4i32_splat(<4 x i32> %x) {
+; SSE2-LABEL: ult_v4i32_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483606,2147483606,2147483606,2147483606]
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ult_v4i32_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ult <4 x i32> %x, <i32 -42, i32 -42, i32 -42, i32 -42>
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @ult_v2i64_splat(<2 x i64> %x) {
+; SSE2-LABEL: ult_v2i64_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ult_v2i64_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854776250,9223372036854776250]
+; SSE41-NEXT:    pcmpgtq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ult <2 x i64> %x, <i64 442, i64 442>
+  ret <2 x i1> %cmp
+}
+
+define <16 x i1> @ule_v16i8_splat(<16 x i8> %x) {
+; CHECK-LABEL: ule_v16i8_splat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; CHECK-NEXT:    pminub %xmm0, %xmm1
+; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = icmp ule <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @ule_v8i16_splat(<8 x i16> %x) {
+; SSE2-LABEL: ule_v8i16_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    psubusw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ule_v8i16_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242]
+; SSE41-NEXT:    pminuw %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ule <8 x i16> %x, <i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242, i16 242>
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @ule_v4i32_splat(<4 x i32> %x) {
+; SSE2-LABEL: ule_v4i32_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ule_v4i32_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ule <4 x i32> %x, <i32 -42, i32 -42, i32 -42, i32 -42>
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @ule_v2i64_splat(<2 x i64> %x) {
+; SSE2-LABEL: ule_v2i64_splat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ule_v2i64_splat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpgtq {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ule <2 x i64> %x, <i64 442, i64 442>
+  ret <2 x i1> %cmp
+}
+
+; This should be simplified before we reach lowering, but
+; make sure that we are not getting it wrong by underflowing.
+
+define <4 x i1> @ult_v4i32_splat_0_simplify(<4 x i32> %x) {
+; CHECK-LABEL: ult_v4i32_splat_0_simplify:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = icmp ult <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i1> %cmp
+}
+
+; This should be simplified before we reach lowering, but
+; make sure that we are not getting it wrong by overflowing.
+
+define <4 x i1> @ugt_v4i32_splat_maxval_simplify(<4 x i32> %x) {
+; CHECK-LABEL: ugt_v4i32_splat_maxval_simplify:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @ugt_v4i32_nonsplat(<4 x i32> %x) {
+; SSE2-LABEL: ugt_v4i32_nonsplat:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ugt_v4i32_nonsplat:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967253,4294967254,4294967255,4294967256]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> %x, <i32 -43, i32 -42, i32 -41, i32 -40>
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @ugt_v4i32_splat_commute(<4 x i32> %x) {
+; SSE2-LABEL: ugt_v4i32_splat_commute:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483652,2147483652,2147483652,2147483652]
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ugt_v4i32_splat_commute:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [3,3,3,3]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %x
+  ret <4 x i1> %cmp
+}
+
+define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) {
+; SSE2-LABEL: PR39859:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: PR39859:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [43,43,43,43,43,43,43,43]
+; SSE41-NEXT:    pmaxuw %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+  %cmp = icmp ugt <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
+  %sel = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x
+  ret <8 x i16> %sel
+}
+
diff --git a/test/CodeGen/X86/vector-compare-any_of.ll b/test/CodeGen/X86/vector-compare-any_of.ll
index b7fa5cb64dfd756109d94355a232bf498a6ac8d3..4b2f98b369841fc4dbc2179b7b664779cb7e6045 100644
--- a/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/test/CodeGen/X86/vector-compare-any_of.ll
@@ -50,8 +50,9 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; AVX-LABEL: test_v4f64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovmskpd %ymm0, %eax
-; AVX-NEXT:    negl %eax
+; AVX-NEXT:    vmovmskpd %ymm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl %ecx, %eax
 ; AVX-NEXT:    sbbq %rax, %rax
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -83,9 +84,10 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    packssdw %xmm3, %xmm2
 ; SSE-NEXT:    movmskps %xmm2, %eax
-; SSE-NEXT:    negl %eax
-; SSE-NEXT:    sbbl %eax, %eax
-; SSE-NEXT:    cltq
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl %eax, %ecx
+; SSE-NEXT:    sbbl %ecx, %ecx
+; SSE-NEXT:    movslq %ecx, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_legal_sext:
@@ -94,9 +96,10 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovmskps %xmm0, %eax
-; AVX-NEXT:    negl %eax
-; AVX-NEXT:    sbbl %eax, %eax
-; AVX-NEXT:    cltq
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl %eax, %ecx
+; AVX-NEXT:    sbbl %ecx, %ecx
+; AVX-NEXT:    movslq %ecx, %rax
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -128,16 +131,18 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: test_v4f32_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cmpltps %xmm0, %xmm1
-; SSE-NEXT:    movmskps %xmm1, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    movmskps %xmm1, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f32_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovmskps %xmm0, %eax
-; AVX-NEXT:    negl %eax
+; AVX-NEXT:    vmovmskps %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl %ecx, %eax
 ; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    retq
 ;
@@ -166,16 +171,18 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
 ; SSE-NEXT:    cmpltps %xmm1, %xmm3
 ; SSE-NEXT:    cmpltps %xmm0, %xmm2
 ; SSE-NEXT:    orps %xmm3, %xmm2
-; SSE-NEXT:    movmskps %xmm2, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    movmskps %xmm2, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f32_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovmskps %ymm0, %eax
-; AVX-NEXT:    negl %eax
+; AVX-NEXT:    vmovmskps %ymm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl %ecx, %eax
 ; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -210,8 +217,9 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
 ; SSE-NEXT:    cmpltps %xmm1, %xmm3
 ; SSE-NEXT:    cmpltps %xmm0, %xmm2
 ; SSE-NEXT:    packssdw %xmm3, %xmm2
-; SSE-NEXT:    pmovmskb %xmm2, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    pmovmskb %xmm2, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
@@ -220,8 +228,9 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpmovmskb %xmm0, %eax
-; AVX-NEXT:    negl %eax
+; AVX-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl %ecx, %eax
 ; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -303,8 +312,9 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovmskpd %ymm0, %eax
-; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    vmovmskpd %ymm0, %ecx
+; AVX1-NEXT:    xorl %eax, %eax
+; AVX1-NEXT:    cmpl %ecx, %eax
 ; AVX1-NEXT:    sbbq %rax, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -312,8 +322,9 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX2-LABEL: test_v4i64_sext:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovmskpd %ymm0, %eax
-; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    vmovmskpd %ymm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl %ecx, %eax
 ; AVX2-NEXT:    sbbq %rax, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -345,9 +356,10 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
 ; SSE-NEXT:    movmskps %xmm0, %eax
-; SSE-NEXT:    negl %eax
-; SSE-NEXT:    sbbl %eax, %eax
-; SSE-NEXT:    cltq
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl %eax, %ecx
+; SSE-NEXT:    sbbl %ecx, %ecx
+; SSE-NEXT:    movslq %ecx, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i64_legal_sext:
@@ -358,9 +370,10 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovmskps %xmm0, %eax
-; AVX1-NEXT:    negl %eax
-; AVX1-NEXT:    sbbl %eax, %eax
-; AVX1-NEXT:    cltq
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    cmpl %eax, %ecx
+; AVX1-NEXT:    sbbl %ecx, %ecx
+; AVX1-NEXT:    movslq %ecx, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -370,9 +383,10 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovmskps %xmm0, %eax
-; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    sbbl %eax, %eax
-; AVX2-NEXT:    cltq
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    cmpl %eax, %ecx
+; AVX2-NEXT:    sbbl %ecx, %ecx
+; AVX2-NEXT:    movslq %ecx, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -404,16 +418,18 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: test_v4i32_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE-NEXT:    movmskps %xmm0, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    movmskps %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4i32_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovmskps %xmm0, %eax
-; AVX-NEXT:    negl %eax
+; AVX-NEXT:    vmovmskps %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl %ecx, %eax
 ; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    retq
 ;
@@ -442,8 +458,9 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movmskps %xmm0, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    movmskps %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
@@ -454,8 +471,9 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovmskps %ymm0, %eax
-; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    vmovmskps %ymm0, %ecx
+; AVX1-NEXT:    xorl %eax, %eax
+; AVX1-NEXT:    cmpl %ecx, %eax
 ; AVX1-NEXT:    sbbl %eax, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -463,8 +481,9 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX2-LABEL: test_v8i32_sext:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovmskps %ymm0, %eax
-; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    vmovmskps %ymm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl %ecx, %eax
 ; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -499,8 +518,9 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
@@ -511,8 +531,9 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
-; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    xorl %eax, %eax
+; AVX1-NEXT:    cmpl %ecx, %eax
 ; AVX1-NEXT:    sbbl %eax, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -522,8 +543,9 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %eax
-; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl %ecx, %eax
 ; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -559,8 +581,9 @@ define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: test_v8i16_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtw %xmm1, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
@@ -568,8 +591,9 @@ define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
 ; AVX-LABEL: test_v8i16_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpmovmskb %xmm0, %eax
-; AVX-NEXT:    negl %eax
+; AVX-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl %ecx, %eax
 ; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
@@ -604,8 +628,9 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; SSE-NEXT:    pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
@@ -632,8 +657,9 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX2-LABEL: test_v16i16_sext:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpmovmskb %ymm0, %eax
-; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl %ecx, %eax
 ; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -674,8 +700,9 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; SSE-NEXT:    pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
@@ -687,8 +714,9 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX1-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
-; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    xorl %eax, %eax
+; AVX1-NEXT:    cmpl %ecx, %eax
 ; AVX1-NEXT:    sbbl %eax, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -699,8 +727,9 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %eax
-; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl %ecx, %eax
 ; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -742,8 +771,9 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-LABEL: test_v16i8_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
@@ -751,8 +781,9 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
 ; AVX-LABEL: test_v16i8_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpmovmskb %xmm0, %eax
-; AVX-NEXT:    negl %eax
+; AVX-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl %ecx, %eax
 ; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
@@ -791,8 +822,9 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; SSE-NEXT:    pcmpgtb %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtb %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
-; SSE-NEXT:    negl %eax
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl %ecx, %eax
 ; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
@@ -821,8 +853,9 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX2-LABEL: test_v32i8_sext:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpmovmskb %ymm0, %eax
-; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl %ecx, %eax
 ; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
index 21f5a9b3752683d73b577df8e66d67b25ca6ef08..a01249d64b3be5b4f5f3dc38dc74b23c7c340f3f 100644
--- a/test/CodeGen/X86/vector-compare-results.ll
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -317,7 +317,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; AVX512F-LABEL: test_cmp_v16i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -325,7 +325,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; AVX512DQ-LABEL: test_cmp_v16i16:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -342,257 +342,17 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 }
 
 define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v32i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
-; SSE2-NEXT:    pcmpgtb %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v32i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movq %rdi, %rax
-; SSE42-NEXT:    pcmpgtb %xmm2, %xmm0
-; SSE42-NEXT:    pcmpgtb %xmm3, %xmm1
-; SSE42-NEXT:    pextrb $1, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm1, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm0, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, (%rdi)
-; SSE42-NEXT:    retq
+; SSE-LABEL: test_cmp_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    pcmpgtb %xmm2, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    pcmpgtb %xmm3, %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %ecx, %edx
+; SSE-NEXT:    movl %edx, (%rdi)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32i8:
 ; AVX1:       # %bb.0:
@@ -931,263 +691,21 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 }
 
 define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v32i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    pcmpgtw %xmm5, %xmm1
-; SSE2-NEXT:    pcmpgtw %xmm4, %xmm0
-; SSE2-NEXT:    packsswb %xmm1, %xmm0
-; SSE2-NEXT:    pcmpgtw %xmm7, %xmm3
-; SSE2-NEXT:    pcmpgtw %xmm6, %xmm2
-; SSE2-NEXT:    packsswb %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v32i16:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movq %rdi, %rax
-; SSE42-NEXT:    pcmpgtw %xmm5, %xmm1
-; SSE42-NEXT:    pcmpgtw %xmm4, %xmm0
-; SSE42-NEXT:    pcmpgtw %xmm7, %xmm3
-; SSE42-NEXT:    pcmpgtw %xmm6, %xmm2
-; SSE42-NEXT:    pextrb $2, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $6, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $14, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $2, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $4, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $6, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $10, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $14, %xmm3, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $6, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $14, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $2, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $4, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $6, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $10, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $14, %xmm1, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, (%rdi)
-; SSE42-NEXT:    retq
+; SSE-LABEL: test_cmp_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    pcmpgtw %xmm5, %xmm1
+; SSE-NEXT:    pcmpgtw %xmm4, %xmm0
+; SSE-NEXT:    packsswb %xmm1, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    pcmpgtw %xmm7, %xmm3
+; SSE-NEXT:    pcmpgtw %xmm6, %xmm2
+; SSE-NEXT:    packsswb %xmm3, %xmm2
+; SSE-NEXT:    pmovmskb %xmm2, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %ecx, %edx
+; SSE-NEXT:    movl %edx, (%rdi)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32i16:
 ; AVX1:       # %bb.0:
@@ -1215,10 +733,10 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; AVX512F-LABEL: test_cmp_v32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -1226,10 +744,10 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; AVX512DQ-LABEL: test_cmp_v32i16:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    retq
@@ -1245,757 +763,48 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 }
 
 define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v64i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    pcmpgtb %xmm4, %xmm0
-; SSE2-NEXT:    pcmpgtb %xmm5, %xmm1
-; SSE2-NEXT:    pcmpgtb %xmm6, %xmm2
-; SSE2-NEXT:    pcmpgtb %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 6(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 4(%rdi)
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v64i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movq %rdi, %rax
-; SSE42-NEXT:    pcmpgtb %xmm4, %xmm0
-; SSE42-NEXT:    pcmpgtb %xmm5, %xmm1
-; SSE42-NEXT:    pcmpgtb %xmm6, %xmm2
-; SSE42-NEXT:    pcmpgtb %xmm7, %xmm3
-; SSE42-NEXT:    pextrb $1, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm3, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 6(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm2, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 4(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm1, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm0, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, (%rdi)
-; SSE42-NEXT:    retq
+; SSE-LABEL: test_cmp_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    pcmpgtb %xmm4, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    pcmpgtb %xmm5, %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %ecx, %edx
+; SSE-NEXT:    pcmpgtb %xmm6, %xmm2
+; SSE-NEXT:    pmovmskb %xmm2, %ecx
+; SSE-NEXT:    pcmpgtb %xmm7, %xmm3
+; SSE-NEXT:    pmovmskb %xmm3, %esi
+; SSE-NEXT:    shll $16, %esi
+; SSE-NEXT:    orl %ecx, %esi
+; SSE-NEXT:    shlq $32, %rsi
+; SSE-NEXT:    orq %rdx, %rsi
+; SSE-NEXT:    movq %rsi, (%rdi)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v64i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    movq %rdi, %rax
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vpmovmskb %xmm4, %ecx
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX1-NEXT:    vpextrb $4, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $4, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $5, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $6, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $7, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $8, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $9, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $10, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $11, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $12, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $13, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $14, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $15, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
 ; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $1, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $17, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $2, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $18, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $3, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $19, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $20, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $21, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $6, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $22, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $23, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $24, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $25, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $26, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $27, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $28, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $29, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $30, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm2, %esi
-; AVX1-NEXT:    shll $31, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    movl %esi, 4(%rdi)
-; AVX1-NEXT:    vpextrb $1, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX1-NEXT:    vpextrb $4, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $5, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $6, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm0, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $7, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $8, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm0, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $9, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $10, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm0, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $11, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $12, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm0, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $13, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $14, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm0, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $15, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $1, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $17, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $2, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $18, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $3, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $19, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $20, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $21, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $6, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $22, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $23, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $24, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $25, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $26, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $27, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $28, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $29, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $30, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm4, %esi
-; AVX1-NEXT:    shll $31, %esi
-; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %esi
+; AVX1-NEXT:    shll $16, %esi
 ; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    movl %esi, (%rdi)
+; AVX1-NEXT:    shlq $32, %rsi
+; AVX1-NEXT:    orq %rdx, %rsi
+; AVX1-NEXT:    movq %rsi, (%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -2003,255 +812,12 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $4, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $5, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $6, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $7, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $8, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $9, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $10, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $11, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $13, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $14, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $15, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $1, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $17, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $3, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $19, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $20, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $21, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $23, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $24, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $25, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $26, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $27, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $28, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $29, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $30, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %esi
-; AVX2-NEXT:    shll $31, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    movl %esi, 4(%rdi)
-; AVX2-NEXT:    vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $4, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $5, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $6, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $7, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $8, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $9, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $10, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $11, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $13, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $14, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $15, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $1, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $17, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $3, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $19, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $20, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $21, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $23, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $24, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $25, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $26, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $27, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $28, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $29, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $30, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %esi
-; AVX2-NEXT:    shll $31, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    movl %esi, (%rdi)
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %edx
+; AVX2-NEXT:    shlq $32, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    movq %rdx, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -2392,300 +958,37 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
 }
 
 define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v32f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT:    cmpltps %xmm3, %xmm15
-; SSE2-NEXT:    movaps {{.*#+}} xmm3 = [255,255,255,255]
-; SSE2-NEXT:    andps %xmm3, %xmm15
-; SSE2-NEXT:    cmpltps %xmm2, %xmm14
-; SSE2-NEXT:    andps %xmm3, %xmm14
-; SSE2-NEXT:    packuswb %xmm15, %xmm14
-; SSE2-NEXT:    cmpltps %xmm1, %xmm13
-; SSE2-NEXT:    andps %xmm3, %xmm13
-; SSE2-NEXT:    cmpltps %xmm0, %xmm8
-; SSE2-NEXT:    andps %xmm3, %xmm8
-; SSE2-NEXT:    packuswb %xmm13, %xmm8
-; SSE2-NEXT:    packuswb %xmm14, %xmm8
-; SSE2-NEXT:    cmpltps %xmm7, %xmm12
-; SSE2-NEXT:    andps %xmm3, %xmm12
-; SSE2-NEXT:    cmpltps %xmm6, %xmm10
-; SSE2-NEXT:    andps %xmm3, %xmm10
-; SSE2-NEXT:    packuswb %xmm12, %xmm10
-; SSE2-NEXT:    cmpltps %xmm5, %xmm11
-; SSE2-NEXT:    andps %xmm3, %xmm11
-; SSE2-NEXT:    cmpltps %xmm4, %xmm9
-; SSE2-NEXT:    andps %xmm3, %xmm9
-; SSE2-NEXT:    packuswb %xmm11, %xmm9
-; SSE2-NEXT:    packuswb %xmm10, %xmm9
-; SSE2-NEXT:    movdqa %xmm9, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm8, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v32f32:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movq %rdi, %rax
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE42-NEXT:    cmpltps %xmm3, %xmm8
-; SSE42-NEXT:    cmpltps %xmm2, %xmm9
-; SSE42-NEXT:    cmpltps %xmm1, %xmm10
-; SSE42-NEXT:    cmpltps %xmm0, %xmm11
-; SSE42-NEXT:    cmpltps %xmm7, %xmm12
-; SSE42-NEXT:    cmpltps %xmm6, %xmm13
-; SSE42-NEXT:    cmpltps %xmm5, %xmm14
-; SSE42-NEXT:    cmpltps %xmm4, %xmm15
-; SSE42-NEXT:    pextrb $4, %xmm15, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm15, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm15, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $12, %xmm15, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $0, %xmm14, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm14, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm14, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $12, %xmm14, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm13, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $4, %xmm13, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm13, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $12, %xmm13, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm12, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $4, %xmm12, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm12, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $12, %xmm12, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm11, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm11, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm11, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $12, %xmm11, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $0, %xmm10, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm10, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm10, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $12, %xmm10, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm9, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $4, %xmm9, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm9, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $12, %xmm9, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm8, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $4, %xmm8, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm8, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $12, %xmm8, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, (%rdi)
-; SSE42-NEXT:    retq
+; SSE-LABEL: test_cmp_v32f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE-NEXT:    cmpltps %xmm3, %xmm15
+; SSE-NEXT:    cmpltps %xmm2, %xmm14
+; SSE-NEXT:    packssdw %xmm15, %xmm14
+; SSE-NEXT:    cmpltps %xmm1, %xmm13
+; SSE-NEXT:    cmpltps %xmm0, %xmm12
+; SSE-NEXT:    packssdw %xmm13, %xmm12
+; SSE-NEXT:    packsswb %xmm14, %xmm12
+; SSE-NEXT:    pmovmskb %xmm12, %ecx
+; SSE-NEXT:    cmpltps %xmm7, %xmm11
+; SSE-NEXT:    cmpltps %xmm6, %xmm9
+; SSE-NEXT:    packssdw %xmm11, %xmm9
+; SSE-NEXT:    cmpltps %xmm5, %xmm10
+; SSE-NEXT:    cmpltps %xmm4, %xmm8
+; SSE-NEXT:    packssdw %xmm10, %xmm8
+; SSE-NEXT:    packsswb %xmm9, %xmm8
+; SSE-NEXT:    pmovmskb %xmm8, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %ecx, %edx
+; SSE-NEXT:    movl %edx, (%rdi)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32f32:
 ; AVX1:       # %bb.0:
@@ -2952,284 +1255,29 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
 }
 
 define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v32i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255]
-; SSE2-NEXT:    pand %xmm8, %xmm3
-; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
-; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    pand %xmm8, %xmm7
-; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT:    pand %xmm8, %xmm6
-; SSE2-NEXT:    packuswb %xmm7, %xmm6
-; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    pand %xmm8, %xmm5
-; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    pand %xmm8, %xmm4
-; SSE2-NEXT:    packuswb %xmm5, %xmm4
-; SSE2-NEXT:    packuswb %xmm6, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v32i32:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movq %rdi, %rax
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm1
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT:    pextrb $4, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $12, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $0, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $12, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $4, %xmm6, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $12, %xmm6, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $4, %xmm7, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $12, %xmm7, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $12, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $0, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $12, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $4, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $12, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $4, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $12, %xmm3, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, (%rdi)
-; SSE42-NEXT:    retq
+; SSE-LABEL: test_cmp_v32i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm2, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    packssdw %xmm7, %xmm6
+; SSE-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    packssdw %xmm5, %xmm4
+; SSE-NEXT:    packsswb %xmm6, %xmm4
+; SSE-NEXT:    pmovmskb %xmm4, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %ecx, %edx
+; SSE-NEXT:    movl %edx, (%rdi)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32i32:
 ; AVX1:       # %bb.0:
@@ -3307,1037 +1355,93 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
 }
 
 define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v64i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    packsswb %xmm1, %xmm0
-; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    packsswb %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    packsswb %xmm5, %xmm4
-; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT:    packsswb %xmm7, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 6(%rdi)
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 4(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v64i16:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movq %rdi, %rax
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm1
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    pextrb $2, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $6, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $14, %xmm6, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $2, %xmm7, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $4, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $6, %xmm7, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $10, %xmm7, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $14, %xmm7, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 6(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $6, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $14, %xmm4, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $2, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $4, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $6, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $10, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $14, %xmm5, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 4(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $6, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $14, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $2, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $4, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $6, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $10, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $14, %xmm3, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $6, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $14, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $2, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $4, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $6, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $10, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $14, %xmm1, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, (%rdi)
-; SSE42-NEXT:    retq
+; SSE-LABEL: test_cmp_v64i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    packsswb %xmm1, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    packsswb %xmm3, %xmm2
+; SSE-NEXT:    pmovmskb %xmm2, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %ecx, %edx
+; SSE-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    packsswb %xmm5, %xmm4
+; SSE-NEXT:    pmovmskb %xmm4, %ecx
+; SSE-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    packsswb %xmm7, %xmm6
+; SSE-NEXT:    pmovmskb %xmm6, %esi
+; SSE-NEXT:    shll $16, %esi
+; SSE-NEXT:    orl %ecx, %esi
+; SSE-NEXT:    shlq $32, %rsi
+; SSE-NEXT:    orq %rdx, %rsi
+; SSE-NEXT:    movq %rsi, (%rdi)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v64i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    movq %rdi, %rax
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
 ; AVX1-NEXT:    vpcmpgtw %xmm8, %xmm9, %xmm8
-; AVX1-NEXT:    vpcmpgtw %xmm5, %xmm1, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpcmpgtw %xmm5, %xmm1, %xmm5
 ; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm4, %xmm4
-; AVX1-NEXT:    vpcmpgtw %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpgtw %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $2, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX1-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX1-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $4, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $5, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $6, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $7, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm7, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $8, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm7, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $9, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm7, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $10, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm7, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $11, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm7, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $12, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm7, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $13, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm7, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $14, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm7, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $15, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    vpacksswb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpcmpgtw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
 ; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $17, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $18, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $19, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $20, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $21, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $22, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $23, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $24, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $25, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $26, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $27, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $28, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $29, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $30, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm4, %esi
-; AVX1-NEXT:    shll $31, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    movl %esi, 4(%rdi)
-; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX1-NEXT:    vpextrb $4, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX1-NEXT:    vpextrb $8, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $5, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $12, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $6, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm0, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $7, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $8, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $9, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $10, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $11, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $12, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $13, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $14, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $15, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $17, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $18, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $19, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $20, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $21, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $22, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $23, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $24, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $25, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $26, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $27, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $28, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $29, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $30, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm8, %esi
-; AVX1-NEXT:    shll $31, %esi
-; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpgtw %xmm6, %xmm2, %xmm1
+; AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpgtw %xmm7, %xmm3, %xmm1
+; AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %esi
+; AVX1-NEXT:    shll $16, %esi
 ; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    movl %esi, (%rdi)
+; AVX1-NEXT:    shlq $32, %rsi
+; AVX1-NEXT:    orq %rdx, %rsi
+; AVX1-NEXT:    movq %rsi, (%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_cmp_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    movq %rdi, %rax
-; AVX2-NEXT:    vpcmpgtw %ymm5, %ymm1, %ymm5
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm1
-; AVX2-NEXT:    vpcmpgtw %ymm4, %ymm0, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm0
-; AVX2-NEXT:    vpcmpgtw %ymm7, %ymm3, %ymm7
-; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm3
-; AVX2-NEXT:    vpcmpgtw %ymm6, %ymm2, %ymm6
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm2
-; AVX2-NEXT:    vpextrb $2, %xmm6, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm6, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm6, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm6, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm6, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $4, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm6, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $5, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm6, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $6, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm6, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $7, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $8, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $9, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $10, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $11, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $13, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $14, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $15, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $0, %xmm7, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm7, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $17, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm7, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm7, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $19, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm7, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $20, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm7, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $21, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm7, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm7, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $23, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $24, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $25, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $26, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $27, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $28, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $29, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $30, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm3, %esi
-; AVX2-NEXT:    shll $31, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    movl %esi, 4(%rdi)
-; AVX2-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm4, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm4, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm4, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $4, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $5, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm4, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $6, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm4, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $7, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $8, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $9, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $10, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $11, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $13, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $14, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $15, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $0, %xmm5, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm5, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $17, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm5, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm5, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $19, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm5, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $20, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm5, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $21, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm5, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm5, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $23, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $24, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $25, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $26, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $27, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $28, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $29, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $30, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %esi
-; AVX2-NEXT:    shll $31, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    movl %esi, (%rdi)
+; AVX2-NEXT:    vpcmpgtw %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpacksswb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpacksswb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    vpcmpgtw %ymm7, %ymm3, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtw %ymm6, %ymm2, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %edx
+; AVX2-NEXT:    shlq $32, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    movq %rdx, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -4347,13 +1451,13 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
 ; AVX512F-NEXT:    vpcmpgtw %ymm6, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpcmpgtw %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpcmpgtw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vpmovsxwd %ymm3, %zmm3
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -4364,13 +1468,13 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
 ; AVX512DQ-NEXT:    vpcmpgtw %ymm6, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpcmpgtw %ymm5, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpcmpgtw %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512DQ-NEXT:    vpmovsxwd %ymm3, %zmm3
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm3, %xmm3
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -4387,1497 +1491,82 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
 }
 
 define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v128i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pushq %rax
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 14(%rdi)
-; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 12(%rdi)
-; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 10(%rdi)
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 8(%rdi)
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 6(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 4(%rdi)
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
-; SSE2-NEXT:    popq %rcx
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v128i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movq %rdi, %rax
-; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm1
-; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    pextrb $1, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm7, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm7, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm7, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm7, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm7, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 14(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm6, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm6, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm6, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm6, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm6, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 12(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm5, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 10(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm4, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm4, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm4, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm4, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm4, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 8(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm3, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 6(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm2, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm2, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 4(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm1, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm1, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $2, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $3, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $4, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $5, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $7, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $8, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $9, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $10, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $11, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $12, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $13, %xmm0, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $14, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $15, %xmm0, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, (%rdi)
-; SSE42-NEXT:    retq
+; SSE-LABEL: test_cmp_v128i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %ecx, %edx
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    pmovmskb %xmm2, %esi
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pmovmskb %xmm3, %ecx
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    orl %esi, %ecx
+; SSE-NEXT:    shlq $32, %rcx
+; SSE-NEXT:    orq %rdx, %rcx
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    pmovmskb %xmm4, %edx
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    pmovmskb %xmm5, %esi
+; SSE-NEXT:    shll $16, %esi
+; SSE-NEXT:    orl %edx, %esi
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    pmovmskb %xmm6, %edx
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    pmovmskb %xmm7, %edi
+; SSE-NEXT:    shll $16, %edi
+; SSE-NEXT:    orl %edx, %edi
+; SSE-NEXT:    shlq $32, %rdi
+; SSE-NEXT:    orq %rsi, %rdi
+; SSE-NEXT:    movq %rdi, 8(%rax)
+; SSE-NEXT:    movq %rcx, (%rax)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v128i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    movq %rdi, %rax
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
-; AVX1-NEXT:    vpcmpgtb %xmm8, %xmm9, %xmm8
-; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm0, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm0, %xmm4
-; AVX1-NEXT:    vpcmpgtb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpgtb %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX1-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $4, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $5, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $6, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $7, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $8, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $9, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $10, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $11, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $12, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $13, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm3, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $14, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm3, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $15, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm6, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $1, %xmm6, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $17, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $2, %xmm6, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $18, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $3, %xmm6, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $19, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm6, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $20, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm6, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $21, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $6, %xmm6, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $22, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm6, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $23, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm6, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $24, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm6, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $25, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm6, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $26, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm6, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $27, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm6, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $28, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm6, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $29, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm6, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $30, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm6, %esi
-; AVX1-NEXT:    shll $31, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    movl %esi, 12(%rdi)
-; AVX1-NEXT:    vpextrb $1, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX1-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $4, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $5, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $6, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $7, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $8, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $9, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $10, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $11, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $12, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $13, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm2, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $14, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm2, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $15, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm0, %xmm8
+; AVX1-NEXT:    vpmovmskb %xmm8, %ecx
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
 ; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $1, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $17, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $2, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $18, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $3, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $19, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $20, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $21, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $6, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $22, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $23, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $24, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $25, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $26, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $27, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $28, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm5, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $29, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm5, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $30, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm5, %esi
-; AVX1-NEXT:    shll $31, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    movl %esi, 8(%rdi)
-; AVX1-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX1-NEXT:    vpextrb $4, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $5, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $6, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $7, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $8, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $9, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $10, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $11, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $12, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $13, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $14, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm1, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $15, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $1, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $17, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $2, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $18, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $3, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $19, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $20, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $21, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $6, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $22, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $23, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $24, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $25, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $26, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $27, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $28, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm4, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $29, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $30, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm4, %esi
-; AVX1-NEXT:    shll $31, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    movl %esi, 4(%rdi)
-; AVX1-NEXT:    vpextrb $1, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX1-NEXT:    vpextrb $4, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $4, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $5, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $6, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $7, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $8, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $9, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $10, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $11, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $12, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $13, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm9, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $14, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm9, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $15, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $0, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $16, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $1, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $17, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $2, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $18, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $3, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $19, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $4, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $20, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $5, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $21, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $6, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $22, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $7, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $23, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $8, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $24, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $9, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $25, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $10, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $26, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $11, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $27, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $12, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $28, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $13, %xmm8, %esi
-; AVX1-NEXT:    andl $1, %esi
-; AVX1-NEXT:    shll $29, %esi
-; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    vpextrb $14, %xmm8, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $30, %edx
-; AVX1-NEXT:    orl %esi, %edx
-; AVX1-NEXT:    vpextrb $15, %xmm8, %esi
-; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    vpcmpgtb %xmm5, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %esi
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    shll $16, %ecx
+; AVX1-NEXT:    orl %esi, %ecx
+; AVX1-NEXT:    shlq $32, %rcx
+; AVX1-NEXT:    orq %rdx, %rcx
+; AVX1-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %esi
+; AVX1-NEXT:    shll $16, %esi
 ; AVX1-NEXT:    orl %edx, %esi
-; AVX1-NEXT:    orl %ecx, %esi
-; AVX1-NEXT:    movl %esi, (%rdi)
+; AVX1-NEXT:    vpcmpgtb %xmm7, %xmm3, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edi
+; AVX1-NEXT:    shll $16, %edi
+; AVX1-NEXT:    orl %edx, %edi
+; AVX1-NEXT:    shlq $32, %rdi
+; AVX1-NEXT:    orq %rsi, %rdi
+; AVX1-NEXT:    movq %rdi, 8(%rax)
+; AVX1-NEXT:    movq %rcx, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -5885,505 +1574,19 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpgtb %ymm5, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpgtb %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpgtb %ymm7, %ymm3, %ymm3
-; AVX2-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $4, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $5, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $6, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $7, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $8, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $9, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $10, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $11, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $13, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $14, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $15, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $1, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $17, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $2, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $3, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $19, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $20, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $21, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $6, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $23, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $24, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $25, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $26, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $27, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $28, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm3, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $29, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $30, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm3, %esi
-; AVX2-NEXT:    shll $31, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    movl %esi, 12(%rdi)
-; AVX2-NEXT:    vpextrb $1, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $4, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $5, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $6, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $7, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $8, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $9, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $10, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $11, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $13, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $14, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $15, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $1, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $17, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $2, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $3, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $19, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $20, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $21, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $6, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $23, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $24, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $25, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $26, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $27, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $28, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm2, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $29, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm2, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $30, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm2, %esi
-; AVX2-NEXT:    shll $31, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    movl %esi, 8(%rdi)
-; AVX2-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $4, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $5, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $6, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $7, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $8, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $9, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $10, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $11, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $13, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $14, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $15, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $1, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $17, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $3, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $19, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $20, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $21, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $23, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $24, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $25, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $26, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $27, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $28, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $29, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $30, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %esi
-; AVX2-NEXT:    shll $31, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    movl %esi, 4(%rdi)
-; AVX2-NEXT:    vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $4, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $5, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $6, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $7, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $8, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $9, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $10, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $11, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $13, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $14, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $15, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $16, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $1, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $17, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $3, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $19, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $20, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $21, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $23, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $24, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $25, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $26, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $27, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $28, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    shll $29, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $30, %edx
-; AVX2-NEXT:    orl %esi, %edx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %esi
-; AVX2-NEXT:    shll $31, %esi
-; AVX2-NEXT:    orl %edx, %esi
-; AVX2-NEXT:    orl %ecx, %esi
-; AVX2-NEXT:    movl %esi, (%rdi)
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    vpcmpgtb %ymm5, %ymm1, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %edx
+; AVX2-NEXT:    shlq $32, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpcmpgtb %ymm6, %ymm2, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    vpcmpgtb %ymm7, %ymm3, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %esi
+; AVX2-NEXT:    shlq $32, %rsi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    movq %rsi, 8(%rdi)
+; AVX2-NEXT:    movq %rdx, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -6479,386 +1682,61 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
 ;
 
 define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v32f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT:    cmpltpd %xmm1, %xmm8
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd {{.*#+}} xmm9 = [255,255]
-; SSE2-NEXT:    andpd %xmm9, %xmm8
-; SSE2-NEXT:    andpd %xmm9, %xmm1
-; SSE2-NEXT:    packuswb %xmm8, %xmm1
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    cmpltpd %xmm3, %xmm0
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    cmpltpd %xmm2, %xmm3
-; SSE2-NEXT:    andpd %xmm9, %xmm0
-; SSE2-NEXT:    andpd %xmm9, %xmm3
-; SSE2-NEXT:    packuswb %xmm0, %xmm3
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm3
-; SSE2-NEXT:    packuswb %xmm3, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    cmpltpd %xmm7, %xmm0
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    cmpltpd %xmm6, %xmm3
-; SSE2-NEXT:    andpd %xmm9, %xmm0
-; SSE2-NEXT:    andpd %xmm9, %xmm3
-; SSE2-NEXT:    packuswb %xmm0, %xmm3
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    cmpltpd %xmm5, %xmm0
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT:    cmpltpd %xmm4, %xmm6
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    andpd %xmm9, %xmm0
-; SSE2-NEXT:    andpd %xmm9, %xmm6
-; SSE2-NEXT:    packuswb %xmm0, %xmm6
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    packuswb %xmm3, %xmm3
-; SSE2-NEXT:    packuswb %xmm3, %xmm3
-; SSE2-NEXT:    packuswb %xmm6, %xmm6
-; SSE2-NEXT:    packuswb %xmm6, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,1,2,2]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    andpd %xmm9, %xmm0
-; SSE2-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    andpd %xmm9, %xmm3
-; SSE2-NEXT:    packuswb %xmm0, %xmm3
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    andpd %xmm9, %xmm0
-; SSE2-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    andpd %xmm9, %xmm5
-; SSE2-NEXT:    packuswb %xmm0, %xmm5
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm3
-; SSE2-NEXT:    packuswb %xmm3, %xmm3
-; SSE2-NEXT:    packuswb %xmm5, %xmm5
-; SSE2-NEXT:    packuswb %xmm5, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    andpd %xmm9, %xmm0
-; SSE2-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    andpd %xmm9, %xmm1
-; SSE2-NEXT:    packuswb %xmm0, %xmm1
-; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    andpd %xmm9, %xmm0
-; SSE2-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    andpd %xmm9, %xmm2
-; SSE2-NEXT:    packuswb %xmm0, %xmm2
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,2,2]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT:    movapd %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movapd %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v32f64:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    pushq %rbp
-; SSE42-NEXT:    pushq %r15
-; SSE42-NEXT:    pushq %r14
-; SSE42-NEXT:    pushq %r13
-; SSE42-NEXT:    pushq %r12
-; SSE42-NEXT:    pushq %rbx
-; SSE42-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
-; SSE42-NEXT:    cmpltpd %xmm7, %xmm8
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    cmpltpd %xmm6, %xmm7
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    cmpltpd %xmm5, %xmm6
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    cmpltpd %xmm4, %xmm5
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT:    cmpltpd %xmm3, %xmm4
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT:    cmpltpd %xmm2, %xmm3
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    cmpltpd %xmm1, %xmm2
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
-; SSE42-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $8, %xmm0, %r8d
-; SSE42-NEXT:    pextrb $0, %xmm0, %r10d
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %ebp
-; SSE42-NEXT:    pextrb $8, %xmm0, %edi
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %r15d
-; SSE42-NEXT:    pextrb $8, %xmm0, %r11d
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %r14d
-; SSE42-NEXT:    pextrb $8, %xmm0, %r9d
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
-; SSE42-NEXT:    pextrb $8, %xmm0, %r12d
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %esi
-; SSE42-NEXT:    pextrb $8, %xmm0, %ebx
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm0, %r13d
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    andl $1, %r8d
-; SSE42-NEXT:    andl $1, %r10d
-; SSE42-NEXT:    leal (%r10,%r8,2), %eax
-; SSE42-NEXT:    andl $1, %ebp
-; SSE42-NEXT:    leal (%rax,%rbp,4), %r8d
-; SSE42-NEXT:    pextrb $0, %xmm0, %eax
-; SSE42-NEXT:    pextrb $8, %xmm0, %ebp
-; SSE42-NEXT:    andl $1, %edi
-; SSE42-NEXT:    leal (%r8,%rdi,8), %r8d
-; SSE42-NEXT:    andl $1, %r15d
-; SSE42-NEXT:    shll $4, %r15d
-; SSE42-NEXT:    orl %r8d, %r15d
-; SSE42-NEXT:    pextrb $8, %xmm1, %edi
-; SSE42-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE42-NEXT:    pextrb $0, %xmm1, %r8d
-; SSE42-NEXT:    andl $1, %r11d
-; SSE42-NEXT:    shll $5, %r11d
-; SSE42-NEXT:    orl %r15d, %r11d
-; SSE42-NEXT:    andl $1, %r14d
-; SSE42-NEXT:    shll $6, %r14d
-; SSE42-NEXT:    andl $1, %r9d
-; SSE42-NEXT:    shll $7, %r9d
-; SSE42-NEXT:    orl %r14d, %r9d
-; SSE42-NEXT:    pextrb $0, %xmm2, %r10d
-; SSE42-NEXT:    pextrb $8, %xmm2, %edi
-; SSE42-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %r9d, %edx
-; SSE42-NEXT:    andl $1, %r12d
-; SSE42-NEXT:    shll $9, %r12d
-; SSE42-NEXT:    orl %edx, %r12d
-; SSE42-NEXT:    pextrb $0, %xmm3, %edi
-; SSE42-NEXT:    pextrb $8, %xmm3, %r9d
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $10, %esi
-; SSE42-NEXT:    orl %r12d, %esi
-; SSE42-NEXT:    andl $1, %ebx
-; SSE42-NEXT:    shll $11, %ebx
-; SSE42-NEXT:    orl %esi, %ebx
-; SSE42-NEXT:    pextrb $0, %xmm4, %r15d
-; SSE42-NEXT:    pextrb $8, %xmm4, %r12d
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %ebx, %ecx
-; SSE42-NEXT:    andl $1, %r13d
-; SSE42-NEXT:    shll $13, %r13d
-; SSE42-NEXT:    orl %ecx, %r13d
-; SSE42-NEXT:    pextrb $0, %xmm5, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm5, %ebx
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $14, %eax
-; SSE42-NEXT:    orl %r13d, %eax
-; SSE42-NEXT:    shll $15, %ebp
-; SSE42-NEXT:    orl %eax, %ebp
-; SSE42-NEXT:    pextrb $0, %xmm6, %r13d
-; SSE42-NEXT:    pextrb $8, %xmm6, %esi
-; SSE42-NEXT:    orl %r11d, %ebp
-; SSE42-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE42-NEXT:    movw %bp, 2(%r14)
-; SSE42-NEXT:    pextrb $0, %xmm7, %r11d
-; SSE42-NEXT:    pextrb $8, %xmm7, %eax
-; SSE42-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    andl $1, %r8d
-; SSE42-NEXT:    leal (%r8,%rdx,2), %r8d
-; SSE42-NEXT:    andl $1, %r10d
-; SSE42-NEXT:    leal (%r8,%r10,4), %r8d
-; SSE42-NEXT:    pextrb $0, %xmm8, %r10d
-; SSE42-NEXT:    pextrb $8, %xmm8, %ebp
-; SSE42-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%r8,%rdx,8), %r8d
-; SSE42-NEXT:    andl $1, %edi
-; SSE42-NEXT:    shll $4, %edi
-; SSE42-NEXT:    orl %r8d, %edi
-; SSE42-NEXT:    andl $1, %r9d
-; SSE42-NEXT:    shll $5, %r9d
-; SSE42-NEXT:    orl %edi, %r9d
-; SSE42-NEXT:    andl $1, %r15d
-; SSE42-NEXT:    shll $6, %r15d
-; SSE42-NEXT:    andl $1, %r12d
-; SSE42-NEXT:    shll $7, %r12d
-; SSE42-NEXT:    orl %r15d, %r12d
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %r12d, %ecx
-; SSE42-NEXT:    andl $1, %ebx
-; SSE42-NEXT:    shll $9, %ebx
-; SSE42-NEXT:    orl %ecx, %ebx
-; SSE42-NEXT:    andl $1, %r13d
-; SSE42-NEXT:    shll $10, %r13d
-; SSE42-NEXT:    orl %ebx, %r13d
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %r13d, %esi
-; SSE42-NEXT:    andl $1, %r11d
-; SSE42-NEXT:    shll $12, %r11d
-; SSE42-NEXT:    orl %esi, %r11d
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $13, %eax
-; SSE42-NEXT:    orl %r11d, %eax
-; SSE42-NEXT:    andl $1, %r10d
-; SSE42-NEXT:    shll $14, %r10d
-; SSE42-NEXT:    orl %eax, %r10d
-; SSE42-NEXT:    shll $15, %ebp
-; SSE42-NEXT:    orl %r10d, %ebp
-; SSE42-NEXT:    orl %r9d, %ebp
-; SSE42-NEXT:    movw %bp, (%r14)
-; SSE42-NEXT:    movq %r14, %rax
-; SSE42-NEXT:    popq %rbx
-; SSE42-NEXT:    popq %r12
-; SSE42-NEXT:    popq %r13
-; SSE42-NEXT:    popq %r14
-; SSE42-NEXT:    popq %r15
-; SSE42-NEXT:    popq %rbp
-; SSE42-NEXT:    retq
+; SSE-LABEL: test_cmp_v32f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT:    cmpltpd %xmm7, %xmm8
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    cmpltpd %xmm6, %xmm7
+; SSE-NEXT:    packssdw %xmm8, %xmm7
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    cmpltpd %xmm5, %xmm6
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    cmpltpd %xmm4, %xmm5
+; SSE-NEXT:    packssdw %xmm6, %xmm5
+; SSE-NEXT:    packssdw %xmm7, %xmm5
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    cmpltpd %xmm2, %xmm3
+; SSE-NEXT:    packssdw %xmm4, %xmm3
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    cmpltpd %xmm1, %xmm2
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    cmpltpd %xmm0, %xmm1
+; SSE-NEXT:    packssdw %xmm2, %xmm1
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    packssdw %xmm3, %xmm1
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    packsswb %xmm5, %xmm1
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pmovmskb %xmm1, %ecx
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    packssdw %xmm1, %xmm3
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    packssdw %xmm1, %xmm2
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    packssdw %xmm3, %xmm1
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    packssdw %xmm3, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm2, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %edx
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    orl %ecx, %edx
+; SSE-NEXT:    movl %edx, (%rdi)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32f64:
 ; AVX1:       # %bb.0:
@@ -7000,507 +1878,256 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm7
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
 ; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    movdqa %xmm7, %xmm10
 ; SSE2-NEXT:    pcmpgtd %xmm9, %xmm10
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3]
-; SSE2-NEXT:    pand %xmm11, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm7
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm9
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [255,255]
-; SSE2-NEXT:    pand %xmm10, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm11
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm12, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm11
-; SSE2-NEXT:    pand %xmm10, %xmm11
-; SSE2-NEXT:    packuswb %xmm9, %xmm11
-; SSE2-NEXT:    packuswb %xmm11, %xmm11
-; SSE2-NEXT:    packuswb %xmm11, %xmm11
+; SSE2-NEXT:    por %xmm7, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    movdqa %xmm6, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm10
+; SSE2-NEXT:    packssdw %xmm9, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pand %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    packssdw %xmm6, %xmm4
+; SSE2-NEXT:    packssdw %xmm10, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm9, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm10, %xmm0
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm5
 ; SSE2-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    packssdw %xmm5, %xmm2
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm9, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm10, %xmm2
-; SSE2-NEXT:    packuswb %xmm0, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm7, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm10, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    packssdw %xmm3, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    packsswb %xmm4, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %ecx
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm10, %xmm2
-; SSE2-NEXT:    packuswb %xmm1, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm10, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm10, %xmm3
-; SSE2-NEXT:    packuswb %xmm1, %xmm3
-; SSE2-NEXT:    packuswb %xmm3, %xmm3
-; SSE2-NEXT:    packuswb %xmm3, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm11[0],xmm3[1]
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm10, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2
 ; SSE2-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm10, %xmm2
-; SSE2-NEXT:    packuswb %xmm1, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm10, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm10, %xmm4
-; SSE2-NEXT:    packuswb %xmm1, %xmm4
-; SSE2-NEXT:    packuswb %xmm4, %xmm4
-; SSE2-NEXT:    packuswb %xmm4, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    packssdw %xmm2, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm10, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    pand %xmm10, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm4
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm10, %xmm4
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT:    movdqa %xmm8, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm5
-; SSE2-NEXT:    pand %xmm10, %xmm5
-; SSE2-NEXT:    packuswb %xmm4, %xmm5
-; SSE2-NEXT:    packuswb %xmm5, %xmm5
-; SSE2-NEXT:    packuswb %xmm5, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,1,2,2]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE2-NEXT:    movapd %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, 2(%rdi)
-; SSE2-NEXT:    movapd %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $4, %edx
+; SSE2-NEXT:    movdqa %xmm8, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    packssdw %xmm3, %xmm4
+; SSE2-NEXT:    packssdw %xmm1, %xmm4
+; SSE2-NEXT:    packsswb %xmm0, %xmm4
+; SSE2-NEXT:    pmovmskb %xmm4, %edx
+; SSE2-NEXT:    shll $16, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $5, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $6, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $7, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $8, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $9, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $11, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $12, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    andl $1, %esi
-; SSE2-NEXT:    shll $13, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT:    shll $15, %esi
-; SSE2-NEXT:    orl %edx, %esi
-; SSE2-NEXT:    orl %ecx, %esi
-; SSE2-NEXT:    movw %si, (%rdi)
+; SSE2-NEXT:    movl %edx, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v32i64:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movq %rdi, %rax
-; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm15
-; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm14
-; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm13
-; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm12
+; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm12
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm14
+; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm13
+; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm15
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm7
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT:    packssdw %xmm7, %xmm6
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm5
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT:    packssdw %xmm5, %xmm4
+; SSE42-NEXT:    packssdw %xmm6, %xmm4
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm3
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT:    packssdw %xmm3, %xmm2
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm1
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT:    packssdw %xmm1, %xmm0
+; SSE42-NEXT:    packssdw %xmm2, %xmm0
+; SSE42-NEXT:    packsswb %xmm4, %xmm0
+; SSE42-NEXT:    pmovmskb %xmm0, %ecx
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm15
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm13
+; SSE42-NEXT:    packssdw %xmm15, %xmm13
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm14
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT:    packssdw %xmm14, %xmm9
+; SSE42-NEXT:    packssdw %xmm13, %xmm9
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm12
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm10
+; SSE42-NEXT:    packssdw %xmm12, %xmm10
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm11
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm12
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm13
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm14
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm15
-; SSE42-NEXT:    pextrb $8, %xmm15, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm15, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $0, %xmm14, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm14, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $0, %xmm13, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm13, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm12, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $8, %xmm12, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm11, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $8, %xmm11, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm10, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $8, %xmm10, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm9, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $8, %xmm9, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm8, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $8, %xmm8, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, 2(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
-; SSE42-NEXT:    pextrb $0, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
-; SSE42-NEXT:    pextrb $8, %xmm1, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
-; SSE42-NEXT:    pextrb $0, %xmm2, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $4, %edx
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT:    packssdw %xmm11, %xmm8
+; SSE42-NEXT:    packssdw %xmm10, %xmm8
+; SSE42-NEXT:    packsswb %xmm9, %xmm8
+; SSE42-NEXT:    pmovmskb %xmm8, %edx
+; SSE42-NEXT:    shll $16, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $5, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $0, %xmm3, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $6, %edx
-; SSE42-NEXT:    pextrb $8, %xmm3, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $7, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm4, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $8, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $8, %xmm4, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $9, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm5, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $8, %xmm5, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $11, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm6, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $12, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $8, %xmm6, %esi
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $13, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    pextrb $0, %xmm7, %edx
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $14, %edx
-; SSE42-NEXT:    orl %esi, %edx
-; SSE42-NEXT:    pextrb $8, %xmm7, %esi
-; SSE42-NEXT:    shll $15, %esi
-; SSE42-NEXT:    orl %edx, %esi
-; SSE42-NEXT:    orl %ecx, %esi
-; SSE42-NEXT:    movw %si, (%rdi)
+; SSE42-NEXT:    movl %edx, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32i64:
@@ -7509,57 +2136,41 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
 ; AVX1-NEXT:    movq %rsp, %rbp
 ; AVX1-NEXT:    andq $-32, %rsp
 ; AVX1-NEXT:    subq $32, %rsp
-; AVX1-NEXT:    vmovdqa 240(%rbp), %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm10
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm10, %xmm9
-; AVX1-NEXT:    vmovdqa 208(%rbp), %ymm10
-; AVX1-NEXT:    vpcmpgtq %xmm8, %xmm7, %xmm7
-; AVX1-NEXT:    vpackssdw %xmm9, %xmm7, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm9
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT:    vpcmpgtq 256(%rbp), %xmm8, %xmm8
+; AVX1-NEXT:    vpcmpgtq 240(%rbp), %xmm7, %xmm7
+; AVX1-NEXT:    vpackssdw %xmm8, %xmm7, %xmm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm7, %xmm7
-; AVX1-NEXT:    vpcmpgtq %xmm10, %xmm6, %xmm6
-; AVX1-NEXT:    vmovdqa 176(%rbp), %ymm9
+; AVX1-NEXT:    vpcmpgtq 224(%rbp), %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpgtq 208(%rbp), %xmm6, %xmm6
 ; AVX1-NEXT:    vpackssdw %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpackssdw %xmm8, %xmm6, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vmovdqa 144(%rbp), %ymm10
-; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm5, %xmm5
-; AVX1-NEXT:    vpackssdw %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm6
+; AVX1-NEXT:    vpackssdw %xmm8, %xmm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm7
+; AVX1-NEXT:    vpcmpgtq 192(%rbp), %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpgtq 176(%rbp), %xmm5, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm7
-; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm10, %xmm4, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vmovdqa 112(%rbp), %ymm6
+; AVX1-NEXT:    vpcmpgtq 160(%rbp), %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpgtq 144(%rbp), %xmm4, %xmm4
+; AVX1-NEXT:    vpackssdw %xmm7, %xmm4, %xmm4
 ; AVX1-NEXT:    vpackssdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpacksswb %xmm8, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT:    vmovdqa 80(%rbp), %ymm7
-; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpacksswb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vpcmpgtq 128(%rbp), %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq 112(%rbp), %xmm3, %xmm3
 ; AVX1-NEXT:    vpackssdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa 48(%rbp), %ymm6
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpcmpgtq 96(%rbp), %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq 80(%rbp), %xmm2, %xmm2
 ; AVX1-NEXT:    vpackssdw %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vmovdqa 16(%rbp), %ymm5
-; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq 64(%rbp), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq 48(%rbp), %xmm1, %xmm1
 ; AVX1-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq 32(%rbp), %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq 16(%rbp), %xmm0, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-fshl-128.ll b/test/CodeGen/X86/vector-fshl-128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eb3dadf0c27561859c17f907babf61a7aa6a34fe
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshl-128.ll
@@ -0,0 +1,2609 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
+
+; Just one 32-bit run to make sure we do reasonable things for i64 cases.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
+
+declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psllq %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psllq %xmm4, %xmm5
+; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [64,64]
+; SSE2-NEXT:    psubq %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psrlq %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm3, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; SSE2-NEXT:    orpd %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    psllq %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psllq %xmm5, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [64,64]
+; SSE41-NEXT:    psubq %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrlq %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm0, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    por %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqq %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpsllvq %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512F-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v2i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VLBW-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsllvq %xmm2, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psllq %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psllq %xmm4, %xmm5
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
+; X32-SSE-NEXT:    psubq %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; X32-SSE-NEXT:    orpd %xmm5, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; X32-SSE-NEXT:    pand %xmm3, %xmm2
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32,32,32,32]
+; SSE2-NEXT:    psubd %xmm2, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrld %xmm3, %xmm5
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrld %xmm6, %xmm3
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    psrld %xmm5, %xmm6
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm4, %xmm1
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT:    pslld $23, %xmm2
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [32,32,32,32]
+; SSE41-NEXT:    psubd %xmm2, %xmm0
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm7
+; SSE41-NEXT:    psrld %xmm6, %xmm7
+; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrld %xmm0, %xmm5
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm0, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pslld $23, %xmm2
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT:    pmulld %xmm3, %xmm2
+; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm3
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT:    vpmulld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512F-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v4i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VLBW-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshld %xmm2, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpshld %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [32,32,32,32]
+; X32-SSE-NEXT:    psubd %xmm2, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm4[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrld %xmm3, %xmm5
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    psrld %xmm6, %xmm3
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm6
+; X32-SSE-NEXT:    psrld %xmm5, %xmm6
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrld %xmm4, %xmm1
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm1
+; X32-SSE-NEXT:    pslld $23, %xmm2
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    cvttps2dq %xmm2, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm5, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; X32-SSE-NEXT:    por %xmm3, %xmm4
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pandn %xmm4, %xmm1
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT:    psubw %xmm2, %xmm3
+; SSE2-NEXT:    psllw $12, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psraw $15, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    paddw %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psraw $15, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    paddw %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    psraw $15, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    paddw %xmm3, %xmm3
+; SSE2-NEXT:    psraw $15, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSE2-NEXT:    pslld $23, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm6, %xmm5
+; SSE2-NEXT:    cvttps2dq %xmm5, %xmm5
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; SSE2-NEXT:    pslld $23, %xmm7
+; SSE2-NEXT:    paddd %xmm6, %xmm7
+; SSE2-NEXT:    cvttps2dq %xmm7, %xmm6
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE2-NEXT:    pmullw %xmm0, %xmm6
+; SSE2-NEXT:    por %xmm4, %xmm6
+; SSE2-NEXT:    por %xmm1, %xmm6
+; SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm6, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
+; SSE41-NEXT:    psubw %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psllw $12, %xmm4
+; SSE41-NEXT:    psllw $4, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    paddw %xmm0, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrlw $8, %xmm5
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrlw $4, %xmm5
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrlw $2, %xmm5
+; SSE41-NEXT:    paddw %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrlw $1, %xmm5
+; SSE41-NEXT:    paddw %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE41-NEXT:    pslld $23, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm5, %xmm4
+; SSE41-NEXT:    cvttps2dq %xmm4, %xmm6
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT:    pslld $23, %xmm4
+; SSE41-NEXT:    paddd %xmm5, %xmm4
+; SSE41-NEXT:    cvttps2dq %xmm4, %xmm4
+; SSE41-NEXT:    packusdw %xmm6, %xmm4
+; SSE41-NEXT:    pmullw %xmm3, %xmm4
+; SSE41-NEXT:    por %xmm1, %xmm4
+; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllw $12, %xmm3, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm3
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpsllvd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpsrlvd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512F-NEXT:    vpsllvd %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512VL-NEXT:    vpsllvd %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpsllvw %xmm4, %xmm0, %xmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsrlvw %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VLBW-NEXT:    vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: var_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOP-NEXT:    vpshlw %xmm2, %xmm0, %xmm3
+; XOP-NEXT:    vpsubw {{.*}}(%rip), %xmm2, %xmm4
+; XOP-NEXT:    vpshlw %xmm4, %xmm1, %xmm1
+; XOP-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOP-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT:    psubw %xmm2, %xmm3
+; X32-SSE-NEXT:    psllw $12, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    psraw $15, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlw $8, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    por %xmm5, %xmm1
+; X32-SSE-NEXT:    paddw %xmm3, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    psraw $15, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlw $4, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    por %xmm5, %xmm1
+; X32-SSE-NEXT:    paddw %xmm3, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    psraw $15, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlw $2, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    por %xmm5, %xmm1
+; X32-SSE-NEXT:    paddw %xmm3, %xmm3
+; X32-SSE-NEXT:    psraw $15, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm1, %xmm4
+; X32-SSE-NEXT:    psrlw $1, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
+; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; X32-SSE-NEXT:    pslld $23, %xmm5
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT:    paddd %xmm6, %xmm5
+; X32-SSE-NEXT:    cvttps2dq %xmm5, %xmm5
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm7
+; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; X32-SSE-NEXT:    pslld $23, %xmm7
+; X32-SSE-NEXT:    paddd %xmm6, %xmm7
+; X32-SSE-NEXT:    cvttps2dq %xmm7, %xmm6
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; X32-SSE-NEXT:    pmullw %xmm0, %xmm6
+; X32-SSE-NEXT:    por %xmm4, %xmm6
+; X32-SSE-NEXT:    por %xmm1, %xmm6
+; X32-SSE-NEXT:    pcmpeqw %xmm3, %xmm2
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm6, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psllw $5, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psllw $4, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm6
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pandn %xmm3, %xmm7
+; SSE2-NEXT:    psllw $2, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm8
+; SSE2-NEXT:    pandn %xmm3, %xmm8
+; SSE2-NEXT:    paddb %xmm3, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT:    psubb %xmm2, %xmm6
+; SSE2-NEXT:    psllw $5, %xmm6
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtb %xmm6, %xmm7
+; SSE2-NEXT:    movdqa %xmm7, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    paddb %xmm6, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtb %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    pandn %xmm1, %xmm7
+; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    paddb %xmm6, %xmm6
+; SSE2-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    psubb %xmm2, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpeqb %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $5, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    psllw $4, %xmm6
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm6
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    psllw $2, %xmm6
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm6
+; SSE41-NEXT:    paddb %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    paddb %xmm2, %xmm6
+; SSE41-NEXT:    paddb %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    paddb %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm1, %xmm7
+; SSE41-NEXT:    psrlw $4, %xmm7
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm7
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrlw $2, %xmm4
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrlw $1, %xmm4
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    paddb %xmm6, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_funnnel_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpsllw $5, %xmm2, %xmm3
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm4
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT:    vpblendvb %xmm3, %xmm4, %xmm0, %xmm4
+; AVX-NEXT:    vpsllw $2, %xmm4, %xmm5
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm5, %xmm5
+; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
+; AVX-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX-NEXT:    vpsllw $5, %xmm4, %xmm4
+; AVX-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; AVX-NEXT:    vpsrlw $4, %xmm1, %xmm6
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm6, %xmm6
+; AVX-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX-NEXT:    vpsrlw $2, %xmm1, %xmm4
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm4
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm5, %ymm6, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vpmovwb %ymm1, %xmm1
+; AVX512VLBW-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: var_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm3
+; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm2, %xmm4
+; XOP-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
+; XOP-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOP-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
+; X32-SSE-NEXT:    psllw $5, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psllw $4, %xmm3
+; X32-SSE-NEXT:    pand %xmm5, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    por %xmm5, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm3, %xmm6
+; X32-SSE-NEXT:    psllw $2, %xmm3
+; X32-SSE-NEXT:    pand %xmm5, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    por %xmm6, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
+; X32-SSE-NEXT:    pandn %xmm3, %xmm4
+; X32-SSE-NEXT:    paddb %xmm3, %xmm3
+; X32-SSE-NEXT:    pand %xmm5, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X32-SSE-NEXT:    psubb %xmm2, %xmm5
+; X32-SSE-NEXT:    psllw $5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm7
+; X32-SSE-NEXT:    pandn %xmm1, %xmm7
+; X32-SSE-NEXT:    psrlw $4, %xmm1
+; X32-SSE-NEXT:    pand %xmm6, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    por %xmm7, %xmm1
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm7
+; X32-SSE-NEXT:    pandn %xmm1, %xmm7
+; X32-SSE-NEXT:    psrlw $2, %xmm1
+; X32-SSE-NEXT:    pand %xmm6, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    por %xmm7, %xmm1
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpeqb %xmm6, %xmm2
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm5
+; X32-SSE-NEXT:    pandn %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlw $1, %xmm1
+; X32-SSE-NEXT:    pand %xmm6, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    por %xmm5, %xmm1
+; X32-SSE-NEXT:    por %xmm4, %xmm1
+; X32-SSE-NEXT:    por %xmm3, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
+  ret <16 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psllq %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [64,64]
+; SSE2-NEXT:    psubq %xmm2, %xmm4
+; SSE2-NEXT:    psrlq %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllq %xmm4, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [64,64]
+; SSE41-NEXT:    psubq %xmm4, %xmm0
+; SSE41-NEXT:    psrlq %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqq %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX512F-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VLBW-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psllq %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psllq %xmm4, %xmm5
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
+; X32-SSE-NEXT:    psubq %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; X32-SSE-NEXT:    orpd %xmm5, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; X32-SSE-NEXT:    pand %xmm3, %xmm2
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
+  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    xorps %xmm4, %xmm4
+; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pslld %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32,32,32,32]
+; SSE2-NEXT:    psubd %xmm2, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
+; SSE2-NEXT:    psrld %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,0,0]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    pslld %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [32,32,32,32]
+; SSE41-NEXT:    psubd %xmm4, %xmm0
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    psrld %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX2-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512F-NEXT:    vpslld %xmm5, %xmm0, %xmm5
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512F-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; AVX512F-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512VL-NEXT:    vpslld %xmm5, %xmm0, %xmm5
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512VL-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512BW-NEXT:    vpslld %xmm5, %xmm0, %xmm5
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512BW-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512VLBW-NEXT:    vpslld %xmm5, %xmm0, %xmm5
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512VLBW-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VLBW-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX2-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    xorps %xmm4, %xmm4
+; X32-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    pslld %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [32,32,32,32]
+; X32-SSE-NEXT:    psubd %xmm2, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm3, %xmm2
+; X32-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
+; X32-SSE-NEXT:    psrld %xmm3, %xmm1
+; X32-SSE-NEXT:    por %xmm5, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT:    psubw %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpeqw %xmm2, %xmm4
+; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psllw %xmm2, %xmm5
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    psllw %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
+; SSE41-NEXT:    psubw %xmm4, %xmm0
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm5, %xmm0, %xmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm5, %xmm0, %xmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX512VLBW-NEXT:    vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT:    psubw %xmm2, %xmm3
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm4
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psllw %xmm2, %xmm5
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm1
+; X32-SSE-NEXT:    por %xmm5, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm4
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT:    psubb %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpeqb %xmm2, %xmm4
+; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psllw %xmm2, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT:    psllw %xmm2, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm3, %xmm1
+; SSE2-NEXT:    psrlw %xmm3, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psllw %xmm5, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT:    psllw %xmm5, %xmm7
+; SSE41-NEXT:    pshufb %xmm0, %xmm7
+; SSE41-NEXT:    pand %xmm7, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    psubb %xmm2, %xmm5
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm5, %xmm1
+; SSE41-NEXT:    psrlw %xmm5, %xmm6
+; SSE41-NEXT:    pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm1, %xmm6
+; SSE41-NEXT:    por %xmm6, %xmm4
+; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsllw %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm4
+; AVX2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpsllw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpbroadcastb %xmm3, %xmm3
+; AVX2-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlw %xmm4, %xmm5, %xmm4
+; AVX2-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %xmm4
+; AVX2-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm5, %ymm6, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vpmovwb %ymm1, %xmm1
+; AVX512VLBW-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm4
+; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm2, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm2, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X32-SSE-NEXT:    psubb %xmm2, %xmm3
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm4
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psllw %xmm2, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm6, %xmm6
+; X32-SSE-NEXT:    psllw %xmm2, %xmm6
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm5, %xmm6
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm1
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    pand %xmm4, %xmm0
+; X32-SSE-NEXT:    pandn %xmm2, %xmm4
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
+  ret <16 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
+; SSE2-LABEL: constant_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlq $60, %xmm2
+; SSE2-NEXT:    psrlq $50, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllq $4, %xmm2
+; SSE2-NEXT:    psllq $14, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    orpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlq $50, %xmm2
+; SSE41-NEXT:    psrlq $60, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllq $14, %xmm2
+; SSE41-NEXT:    psllq $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $50, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $60, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_funnnel_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psrlq $60, %xmm2
+; X32-SSE-NEXT:    psrlq $50, %xmm1
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq $4, %xmm2
+; X32-SSE-NEXT:    psllq $14, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; SSE2-LABEL: constant_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrld $25, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrld $26, %xmm3
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrld $27, %xmm2
+; SSE2-NEXT:    psrld $28, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrld $25, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    psrld $27, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrld $26, %xmm2
+; SSE41-NEXT:    psrld $28, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $25, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrld $27, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrld $26, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_funnnel_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psrld $25, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    psrld $26, %xmm3
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psrld $27, %xmm2
+; X32-SSE-NEXT:    psrld $28, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm3, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE2-LABEL: constant_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
+; SSE2-NEXT:    pmulhuw %xmm2, %xmm1
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
+; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
+; SSE41-NEXT:    pmullw %xmm0, %xmm2
+; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
+; AVX-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
+; AVX512F-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
+; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
+; AVX512VL-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
+; AVX512VL-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm2
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512VLBW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
+; X32-SSE-NEXT:    pmulhuw %xmm2, %xmm1
+; X32-SSE-NEXT:    pmullw %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    por %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE2-LABEL: constant_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2]
+; SSE2-NEXT:    pmullw %xmm4, %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; SSE2-NEXT:    pmullw %xmm5, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    packuswb %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm4, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [1,128,64,32,16,8,4,2]
+; SSE41-NEXT:    pmullw %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; SSE41-NEXT:    pmullw %xmm4, %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    packuswb %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    pmullw %xmm4, %xmm5
+; SSE41-NEXT:    pand %xmm0, %xmm5
+; SSE41-NEXT:    packuswb %xmm1, %xmm5
+; SSE41-NEXT:    por %xmm3, %xmm5
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,128,64,32,16,8,4,2]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT:    vpord %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT:    vpord %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VLBW-NEXT:    vpmovwb %ymm1, %xmm1
+; AVX512VLBW-NEXT:    movw $257, %ax # imm = 0x101
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2]
+; X32-SSE-NEXT:    pmullw %xmm4, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; X32-SSE-NEXT:    pmullw %xmm5, %xmm1
+; X32-SSE-NEXT:    psrlw $8, %xmm1
+; X32-SSE-NEXT:    packuswb %xmm3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; X32-SSE-NEXT:    pmullw %xmm4, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X32-SSE-NEXT:    pand %xmm4, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT:    pmullw %xmm5, %xmm2
+; X32-SSE-NEXT:    pand %xmm4, %xmm2
+; X32-SSE-NEXT:    packuswb %xmm3, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <16 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlq $50, %xmm1
+; SSE-NEXT:    psllq $14, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlq $50, %xmm1, %xmm1
+; AVX-NEXT:    vpsllq $14, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $50, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $14, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v2i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrlq $50, %xmm1, %xmm1
+; XOP-NEXT:    vpsllq $14, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlq $50, %xmm1
+; X32-SSE-NEXT:    psllq $14, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrld $28, %xmm1
+; SSE-NEXT:    pslld $4, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $28, %xmm1, %xmm1
+; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $28, %xmm1, %xmm1
+; AVX512-NEXT:    vpslld $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v4i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrld $28, %xmm1, %xmm1
+; XOP-NEXT:    vpslld $4, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrld $28, %xmm1
+; X32-SSE-NEXT:    pslld $4, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $9, %xmm1
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $9, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $9, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrlw $9, %xmm1, %xmm1
+; XOP-NEXT:    vpsllw $7, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $9, %xmm1
+; X32-SSE-NEXT:    psllw $7, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $4, %xmm1
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psllw $4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $4, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshl-256.ll b/test/CodeGen/X86/vector-fshl-256.ll
new file mode 100644
index 0000000000000000000000000000000000000000..aac4662cff0bf82630135ff00afa4fb54c2569c3
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshl-256.ll
@@ -0,0 +1,2140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
+
+declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
+declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpsllq %xmm4, %xmm3, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm4, %xmm8, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm7, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vpsubq %xmm2, %xmm8, %xmm6
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm1, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX2-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
+; AVX2-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
+; AVX512F-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
+; AVX512VL-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VL-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
+; AVX512BW-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v4i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
+; AVX512VLBW-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm6
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; XOPAVX1-NEXT:    vpshlq %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm5
+; XOPAVX1-NEXT:    vpshlq %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; XOPAVX2-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm3, %xmm8, %xmm6
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm6[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpsrld %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrld %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX1-NEXT:    vpsrld %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vpsrld %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; AVX1-NEXT:    vpsubd %xmm2, %xmm8, %xmm5
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; AVX1-NEXT:    vpsrld %xmm7, %xmm1, %xmm7
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
+; AVX1-NEXT:    vpsrld %xmm7, %xmm1, %xmm7
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vpslld $23, %xmm3, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmulld %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm6
+; AVX1-NEXT:    vpaddd %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT:    vpmulld %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vorps %ymm1, %ymm4, %ymm1
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX2-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
+; AVX2-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VL-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v8i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vpshld %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshld %xmm2, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm5, %xmm4, %xmm6
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; XOPAVX1-NEXT:    vpshld %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm5
+; XOPAVX1-NEXT:    vpshld %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; XOPAVX2-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm3, %xmm8, %xmm5
+; AVX1-NEXT:    vpsllw $12, %xmm5, %xmm6
+; AVX1-NEXT:    vpsllw $4, %xmm5, %xmm5
+; AVX1-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $2, %xmm4, %xmm5
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm5
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubw %xmm2, %xmm8, %xmm5
+; AVX1-NEXT:    vpsllw $12, %xmm5, %xmm6
+; AVX1-NEXT:    vpsllw $4, %xmm5, %xmm5
+; AVX1-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm5
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm5
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm8
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX1-NEXT:    vpslld $23, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddd %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpmullw %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX1-NEXT:    vpslld $23, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddd %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm8, %ymm1, %ymm1
+; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+; AVX2-NEXT:    vpsllvd %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX2-NEXT:    vpsllvd %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpackusdw %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %ymm2, %ymm6, %ymm6
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
+; AVX2-NEXT:    vpsrlvd %ymm7, %ymm5, %ymm5
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
+; AVX2-NEXT:    vpsrlvd %ymm6, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpackusdw %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm0, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; XOPAVX1-NEXT:    vpsubw %xmm5, %xmm4, %xmm6
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; XOPAVX1-NEXT:    vpshlw %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubw %xmm5, %xmm2, %xmm5
+; XOPAVX1-NEXT:    vpshlw %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm0, %xmm4
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; XOPAVX2-NEXT:    vpsubw %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
+; XOPAVX2-NEXT:    vpshlw %xmm5, %xmm7, %xmm5
+; XOPAVX2-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm6
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm3, %xmm9, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $2, %xmm4, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT:    vpand %xmm10, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT:    vpand %xmm8, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubb %xmm2, %xmm9, %xmm6
+; AVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm5
+; AVX1-NEXT:    vpand %xmm10, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm5
+; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm3, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $2, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT:    vpand %xmm1, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm2, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpsllw $2, %xmm5, %xmm7
+; AVX1-NEXT:    vpand %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm5
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm8, %ymm1, %ymm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpsllw $5, %ymm2, %ymm3
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm4, %ymm0, %ymm4
+; AVX2-NEXT:    vpsllw $2, %ymm4, %ymm5
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; AVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
+; AVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX2-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $2, %ymm1, %ymm4
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm4
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm3
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm4, %ymm0, %ymm4
+; AVX512F-NEXT:    vpsllw $2, %ymm4, %ymm5
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw $5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $2, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512VLBW-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX1-NEXT:    vpsubb %xmm5, %xmm4, %xmm6
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubb %xmm5, %xmm2, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm4
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; XOPAVX2-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm7, %xmm5
+; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
+  ret <32 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VL-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm4
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm6
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; XOPAVX1-NEXT:    vpsrlq %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsrlq %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
+  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %splat)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpslld %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpsrld %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX2-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512F-NEXT:    vpslld %xmm5, %ymm0, %ymm5
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512F-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512VL-NEXT:    vpslld %xmm5, %ymm0, %ymm5
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512VL-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VL-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512BW-NEXT:    vpslld %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512BW-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512VLBW-NEXT:    vpslld %xmm5, %ymm0, %ymm5
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512VLBW-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpslld %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; XOPAVX1-NEXT:    vpsrld %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; XOPAVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX2-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %splat)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpsrlw %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm5, %ymm0, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; XOPAVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; XOPAVX1-NEXT:    vpsrlw %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsrlw %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm9, %xmm9
+; AVX1-NEXT:    vpsllw %xmm4, %xmm9, %xmm7
+; AVX1-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm5, %xmm10, %xmm3
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpsrlw %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw %xmm3, %xmm9, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpsubb %xmm2, %xmm10, %xmm6
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw %xmm6, %xmm9, %xmm6
+; AVX1-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm4, %ymm1
+; AVX1-NEXT:    vpcmpeqb %xmm8, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm8, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT:    vpsllw %xmm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw %xmm4, %ymm5, %ymm4
+; AVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm3
+; AVX512VL-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX512VL-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm5, %ymm4
+; AVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512VLBW-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; XOPAVX1-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm6
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm4, %xmm7
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpshlb %xmm7, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm5, %ymm1
+; XOPAVX1-NEXT:    vpcomeqb %xmm8, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vpcomeqb %xmm8, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm4
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; XOPAVX2-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm7, %xmm5
+; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat)
+  ret <32 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
+; AVX1-LABEL: constant_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrlq $4, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlq $14, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrlq $50, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq $60, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsllq $60, %xmm2, %xmm3
+; AVX1-NEXT:    vpsllq $50, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm3
+; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_funnnel_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
+; AVX1-LABEL: constant_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrld $21, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrld $23, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrld $22, %xmm2, %xmm4
+; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpsrld $25, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld $27, %xmm1, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrld $26, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_funnnel_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
+; AVX1-LABEL: constant_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT:    vpmulhuw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmulhuw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm2
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VLBW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm3, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; XOPAVX2-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm2
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
+; AVX1-LABEL: constant_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpsllw $2, %ymm2, %ymm4
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm4
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vporq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm2, %zmm1
+; AVX512VLBW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512VLBW-NEXT:    movl $16843009, %eax # imm = 0x1010101
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [248,249,250,251,252,253,254,255,248,255,254,253,252,251,250,249]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [248,249,250,251,252,253,254,255,248,255,254,253,252,251,250,249]
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <32 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $50, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlq $50, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlq $50, %ymm1, %ymm1
+; AVX2-NEXT:    vpsllq $14, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $50, %ymm1, %ymm1
+; AVX512-NEXT:    vpsllq $14, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpsrlq $50, %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq $50, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpsllq $14, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpsllq $14, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlq $50, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllq $14, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpslld $4, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $28, %ymm1, %ymm1
+; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $28, %ymm1, %ymm1
+; AVX512-NEXT:    vpslld $4, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpsrld $28, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpslld $4, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpslld $4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrld $28, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpslld $4, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $9, %ymm1, %ymm1
+; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $9, %ymm1, %ymm1
+; AVX512-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlw $9, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpsllw $7, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlw $9, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshl-512.ll b/test/CodeGen/X86/vector-fshl-512.ll
new file mode 100644
index 0000000000000000000000000000000000000000..58b471c1f5b74ffe74cd5a0678c7b2f56a2aca04
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshl-512.ll
@@ -0,0 +1,971 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+
+declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
+declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
+; AVX512-LABEL: var_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpsllvq %zmm4, %zmm0, %zmm5
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
+; AVX512-LABEL: var_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpsllvd %zmm4, %zmm0, %zmm5
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; AVX512-NEXT:    vpord %zmm1, %zmm5, %zmm1
+; AVX512-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
+; AVX512F-LABEL: var_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm8 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm7, %zmm8, %zmm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %ymm4, %ymm8, %ymm9
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm9, %zmm2, %zmm2
+; AVX512F-NEXT:    vpord %zmm2, %zmm7, %zmm2
+; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT:    vpcmpeqw %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm8, %ymm5
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm5, %zmm3, %zmm3
+; AVX512F-NEXT:    vpord %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT:    vpcmpeqw %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm8 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm7, %zmm8, %zmm7
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm4, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm9, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpord %zmm2, %zmm7, %zmm2
+; AVX512VL-NEXT:    vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX512VL-NEXT:    vpcmpeqw %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm5, %zmm4
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm8, %ymm5
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm5, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpord %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT:    vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT:    vpcmpeqw %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
+; AVX512F-LABEL: var_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm6, %ymm7, %ymm8
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm9
+; AVX512F-NEXT:    vpsllw $5, %ymm9, %ymm10
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm8, %ymm0, %ymm8
+; AVX512F-NEXT:    vpsllw $2, %ymm8, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm4, %ymm11, %ymm11
+; AVX512F-NEXT:    vpaddb %ymm10, %ymm10, %ymm10
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm11, %ymm8, %ymm8
+; AVX512F-NEXT:    vpaddb %ymm8, %ymm8, %ymm11
+; AVX512F-NEXT:    vpaddb %ymm10, %ymm10, %ymm10
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm11, %ymm8, %ymm10
+; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm8, %ymm11, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %ymm9, %ymm12, %ymm13
+; AVX512F-NEXT:    vpsllw $5, %ymm13, %ymm13
+; AVX512F-NEXT:    vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm14, %ymm11, %ymm11
+; AVX512F-NEXT:    vpaddb %ymm13, %ymm13, %ymm13
+; AVX512F-NEXT:    vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm15, %ymm11, %ymm11
+; AVX512F-NEXT:    vpaddb %ymm13, %ymm13, %ymm13
+; AVX512F-NEXT:    vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
+; AVX512F-NEXT:    vpor %ymm2, %ymm10, %ymm2
+; AVX512F-NEXT:    vpxor %xmm10, %xmm10, %xmm10
+; AVX512F-NEXT:    vpcmpeqb %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpblendvb %ymm9, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT:    vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw $5, %ymm5, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm7
+; AVX512F-NEXT:    vpand %ymm4, %ymm7, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm4
+; AVX512F-NEXT:    vpand %ymm8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm12, %ymm6
+; AVX512F-NEXT:    vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpand %ymm14, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm4
+; AVX512F-NEXT:    vpand %ymm15, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpcmpeqb %ymm10, %ymm5, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %ymm8, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm6, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpsllw $2, %ymm6, %ymm10
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm11, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm10
+; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm9
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm10, %ymm9, %ymm9
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %ymm4, %ymm12, %ymm13
+; AVX512VL-NEXT:    vpsllw $5, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $2, %ymm2, %ymm9
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT:    vpand %ymm14, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpaddb %ymm13, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $1, %ymm2, %ymm9
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT:    vpand %ymm15, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpaddb %ymm13, %ymm13, %ymm13
+; AVX512VL-NEXT:    vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpand %ymm8, %ymm5, %ymm4
+; AVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm7
+; AVX512VL-NEXT:    vpand %ymm11, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm7
+; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm3, %ymm5
+; AVX512VL-NEXT:    vpand %ymm10, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsubb %ymm4, %ymm12, %ymm7
+; AVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $2, %ymm3, %ymm5
+; AVX512VL-NEXT:    vpand %ymm14, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $1, %ymm3, %ymm5
+; AVX512VL-NEXT:    vpand %ymm15, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %zmm4, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpsllw $5, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
+; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm5, %k2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k2}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
+; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpsllw $5, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
+; AVX512BW-NEXT:    vpmovb2m %zmm5, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k2
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT:    vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
+; AVX512BW-NEXT:    vpsllw $2, %zmm4, %zmm6
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm6, %zmm6
+; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpmovb2m %zmm5, %k1
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4 {%k1}
+; AVX512BW-NEXT:    vporq %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %zmm4, %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
+; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k2
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k2}
+; AVX512VLBW-NEXT:    vpsrlw $2, %zmm1, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vpsrlw $1, %zmm1, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
+; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k2
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm4, %zmm6
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm6, %zmm6
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm6, %zmm4 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4 {%k1}
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm4, %zmm1
+; AVX512VLBW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
+  ret <64 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
+; AVX512-LABEL: splatvar_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm2, %zmm2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpsllq %xmm4, %zmm0, %zmm5
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %splat)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
+; AVX512-LABEL: splatvar_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512-NEXT:    vpslld %xmm5, %zmm0, %zmm5
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
+; AVX512-NEXT:    vpord %zmm1, %zmm5, %zmm1
+; AVX512-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %splat)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
+; AVX512F-LABEL: splatvar_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastw %xmm4, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm5, %ymm0, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm4, %xmm7, %xmm7
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm5, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlw %xmm7, %ymm3, %ymm3
+; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw %xmm4, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm5, %ymm0, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm4, %xmm7, %xmm7
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT:    vpcmpeqw %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm5, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm5, %zmm0, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %zmm2
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm5, %zmm0, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %splat)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
+; AVX512F-LABEL: splatvar_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm5, %ymm0, %ymm6
+; AVX512F-NEXT:    vpcmpeqd %ymm9, %ymm9, %ymm9
+; AVX512F-NEXT:    vpsllw %xmm5, %ymm9, %ymm8
+; AVX512F-NEXT:    vpbroadcastb %xmm8, %ymm8
+; AVX512F-NEXT:    vpand %ymm8, %ymm6, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm4, %xmm7, %xmm7
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw %xmm7, %ymm9, %ymm9
+; AVX512F-NEXT:    vpsrlw $8, %ymm9, %ymm9
+; AVX512F-NEXT:    vpbroadcastb %xmm9, %ymm9
+; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
+; AVX512F-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm5, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw %xmm7, %ymm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm9, %ymm3, %ymm3
+; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm5, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpcmpeqd %ymm9, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpsllw %xmm5, %ymm9, %ymm8
+; AVX512VL-NEXT:    vpbroadcastb %xmm8, %ymm8
+; AVX512VL-NEXT:    vpand %ymm8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm4, %xmm7, %xmm7
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpsrlw $8, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpbroadcastb %xmm9, %ymm9
+; AVX512VL-NEXT:    vpand %ymm9, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm5, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpand %ymm9, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm5, %zmm0, %zmm6
+; AVX512BW-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
+; AVX512BW-NEXT:    vpsllw %xmm5, %zmm7, %zmm5
+; AVX512BW-NEXT:    vpbroadcastb %xmm5, %zmm5
+; AVX512BW-NEXT:    vpandq %zmm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT:    vpsrlw $8, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpbroadcastb %xmm4, %zmm4
+; AVX512BW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %zmm2
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm5, %zmm0, %zmm6
+; AVX512VLBW-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
+; AVX512VLBW-NEXT:    vpsllw %xmm5, %zmm7, %zmm5
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm5, %zmm5
+; AVX512VLBW-NEXT:    vpandq %zmm5, %zmm6, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm4, %zmm4
+; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat)
+  ret <64 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
+; AVX512-LABEL: constant_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
+; AVX512-LABEL: constant_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
+; AVX512F-LABEL: constant_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm5
+; AVX512F-NEXT:    vpor %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm3, %ymm2
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm3
+; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX512VL-NEXT:    vpmulhuw %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm0, %ymm5
+; AVX512VL-NEXT:    vpor %ymm2, %ymm5, %ymm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT:    vpmulhuw %ymm4, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm2
+; AVX512BW-NEXT:    vporq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    movl $65537, %eax # imm = 0x10001
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm2
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm2, %zmm1
+; AVX512VLBW-NEXT:    movl $65537, %eax # imm = 0x10001
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
+; AVX512F-LABEL: constant_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm0, %ymm4
+; AVX512F-NEXT:    vpsllw $2, %ymm4, %ymm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm8, %ymm7, %ymm7
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm9
+; AVX512F-NEXT:    vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
+; AVX512F-NEXT:    vpaddb %ymm9, %ymm9, %ymm10
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; AVX512F-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm12, %ymm11, %ymm11
+; AVX512F-NEXT:    vpsrlw $8, %ymm11, %ymm11
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX512F-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm13, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm11, %ymm2, %ymm2
+; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm5
+; AVX512F-NEXT:    vpand %ymm8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpblendvb %ymm9, %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15],ymm3[24],ymm7[24],ymm3[25],ymm7[25],ymm3[26],ymm7[26],ymm3[27],ymm7[27],ymm3[28],ymm7[28],ymm3[29],ymm7[29],ymm3[30],ymm7[30],ymm3[31],ymm7[31]
+; AVX512F-NEXT:    vpmullw %ymm12, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[16],ymm7[16],ymm3[17],ymm7[17],ymm3[18],ymm7[18],ymm3[19],ymm7[19],ymm3[20],ymm7[20],ymm3[21],ymm7[21],ymm3[22],ymm7[22],ymm3[23],ymm7[23]
+; AVX512F-NEXT:    vpmullw %ymm13, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm7
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm8, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
+; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm10
+; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm7, %ymm7
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; AVX512VL-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm11, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpsrlw $8, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX512VL-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm12, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpackuswb %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm5
+; AVX512VL-NEXT:    vpand %ymm8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpmullw %ymm11, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpmullw %ymm12, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpsllw $2, %zmm3, %zmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; AVX512BW-NEXT:    kmovq %rax, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm3, %zmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; AVX512VLBW-NEXT:    kmovq %rax, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <64 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
+; AVX512-LABEL: splatconstant_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $50, %zmm1, %zmm1
+; AVX512-NEXT:    vpsllq $14, %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
+; AVX512-LABEL: splatconstant_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $28, %zmm1, %zmm1
+; AVX512-NEXT:    vpslld $4, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
+; AVX512F-LABEL: splatconstant_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $9, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $9, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsllw $7, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $9, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $9, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpsllw $7, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $9, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlw $9, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $7, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
+; AVX512F-LABEL: splatconstant_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm2
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $4, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshl-rot-128.ll b/test/CodeGen/X86/vector-fshl-rot-128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a358df441cee7e6aa80e7f9330a707e92e3bc069
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -0,0 +1,2219 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
+
+; Just one 32-bit run to make sure we do reasonable things for i64 cases.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
+
+declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [63,63]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubq %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psllq %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psllq %xmm1, %xmm5
+; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    orpd %xmm5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [63,63]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubq %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psllq %xmm1, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psllq %xmm1, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlq %xmm3, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsllvq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_funnnel_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsllvq %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpshlq %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpsllvq %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,0,63,0]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubq %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psllq %xmm1, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psllq %xmm1, %xmm5
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm2, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm5, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrld %xmm2, %xmm5
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm6, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    psrld %xmm5, %xmm6
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrld %xmm3, %xmm5
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    orps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [31,31,31,31]
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    psrld %xmm6, %xmm7
+; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm2, %xmm5
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psrld %xmm2, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT:    vpsrld %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsllvd %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_funnnel_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsllvd %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpshld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpsllvd %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubd %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; X32-SSE-NEXT:    pand %xmm4, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psrld %xmm2, %xmm5
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrld %xmm6, %xmm2
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm6
+; X32-SSE-NEXT:    psrld %xmm5, %xmm6
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psrld %xmm3, %xmm5
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,3]
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm3, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    orps %xmm0, %xmm2
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %amt)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    psubw %xmm1, %xmm4
+; SSE2-NEXT:    psllw $12, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrlw $8, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    psraw $15, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    psrlw $4, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    psraw $15, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    psrlw $2, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    psraw $15, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT:    pslld $23, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm6, %xmm4
+; SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd %xmm6, %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    psubw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psllw $12, %xmm3
+; SSE41-NEXT:    psllw $4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    paddw %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm7
+; SSE41-NEXT:    psrlw $8, %xmm7
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm7
+; SSE41-NEXT:    psrlw $4, %xmm7
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm7
+; SSE41-NEXT:    psrlw $2, %xmm7
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm7
+; SSE41-NEXT:    psrlw $1, %xmm7
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm5
+; SSE41-NEXT:    pand %xmm6, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm3, %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pslld $23, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    pmullw %xmm0, %xmm2
+; SSE41-NEXT:    por %xmm5, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllw $12, %xmm3, %xmm5
+; AVX1-NEXT:    vpsllw $4, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm6
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm6
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsllvd %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT:    vpsllvd %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT:    vpsllvd %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpsllvw %xmm3, %xmm0, %xmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: var_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOP-NEXT:    vpshlw %xmm3, %xmm0, %xmm3
+; XOP-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOP-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOP-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    psubw %xmm1, %xmm4
+; X32-SSE-NEXT:    psllw $12, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psrlw $8, %xmm5
+; X32-SSE-NEXT:    pand %xmm2, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm5, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    psraw $15, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm2, %xmm6
+; X32-SSE-NEXT:    psrlw $4, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    psraw $15, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm2, %xmm6
+; X32-SSE-NEXT:    psrlw $2, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    psraw $15, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm2, %xmm5
+; X32-SSE-NEXT:    psrlw $1, %xmm2
+; X32-SSE-NEXT:    pand %xmm4, %xmm2
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; X32-SSE-NEXT:    pslld $23, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT:    paddd %xmm6, %xmm4
+; X32-SSE-NEXT:    cvttps2dq %xmm4, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd %xmm6, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; X32-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X32-SSE-NEXT:    por %xmm5, %xmm1
+; X32-SSE-NEXT:    por %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %amt)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    psubb %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    psllw $5, %xmm5
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $4, %xmm1
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm6
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm6, %xmm1
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pandn %xmm1, %xmm7
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    psllw $5, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pandn %xmm0, %xmm6
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    paddb %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pandn %xmm0, %xmm6
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    paddb %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubb %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    psllw $5, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    psllw $4, %xmm5
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    psllw $2, %xmm5
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    paddb %xmm6, %xmm5
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    psllw $5, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    psrlw $4, %xmm4
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $2, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    por %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_funnnel_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpsllw $5, %xmm3, %xmm3
+; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm4
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm5
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm5, %xmm5
+; AVX-NEXT:    vpblendvb %xmm3, %xmm5, %xmm0, %xmm3
+; AVX-NEXT:    vpsllw $2, %xmm3, %xmm5
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm5, %xmm5
+; AVX-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm5
+; AVX-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: var_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOP-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOP-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOP-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOP-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    psubb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    pand %xmm2, %xmm5
+; X32-SSE-NEXT:    psllw $5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllw $4, %xmm1
+; X32-SSE-NEXT:    pand %xmm6, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm6
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    por %xmm6, %xmm1
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm7
+; X32-SSE-NEXT:    pandn %xmm1, %xmm7
+; X32-SSE-NEXT:    psllw $2, %xmm1
+; X32-SSE-NEXT:    pand %xmm6, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    por %xmm7, %xmm1
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm5
+; X32-SSE-NEXT:    pandn %xmm1, %xmm5
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pand %xmm6, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm4
+; X32-SSE-NEXT:    psllw $5, %xmm4
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm6
+; X32-SSE-NEXT:    pandn %xmm0, %xmm6
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm6, %xmm0
+; X32-SSE-NEXT:    paddb %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm6
+; X32-SSE-NEXT:    pandn %xmm0, %xmm6
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm6, %xmm0
+; X32-SSE-NEXT:    paddb %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %amt)
+  ret <16 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
+; SSE-LABEL: splatvar_funnnel_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,63]
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    psubq %xmm1, %xmm3
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm4
+; SSE-NEXT:    psllq %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm2, %xmm3
+; SSE-NEXT:    psrlq %xmm3, %xmm0
+; SSE-NEXT:    por %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,0,63,0]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubq %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psllq %xmm1, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psllq %xmm1, %xmm5
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm2, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm5, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
+  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    xorps %xmm4, %xmm4
+; SSE2-NEXT:    xorps %xmm5, %xmm5
+; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pslld %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
+; SSE2-NEXT:    psrld %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubd %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pslld %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX512-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; XOPAVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; XOPAVX2-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOPAVX2-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubd %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    xorps %xmm4, %xmm4
+; X32-SSE-NEXT:    xorps %xmm5, %xmm5
+; X32-SSE-NEXT:    movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    pslld %xmm5, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
+; X32-SSE-NEXT:    psrld %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubw %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psllw %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubw %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psllw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubw %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psllw %xmm1, %xmm4
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %splat)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw %xmm4, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT:    psllw %xmm4, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm6[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm3, %xmm0
+; SSE2-NEXT:    psrlw %xmm3, %xmm5
+; SSE2-NEXT:    psrlw $8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufb %xmm3, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw %xmm5, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT:    psllw %xmm5, %xmm7
+; SSE41-NEXT:    pshufb %xmm3, %xmm7
+; SSE41-NEXT:    pand %xmm7, %xmm2
+; SSE41-NEXT:    psubb %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    psrlw %xmm1, %xmm6
+; SSE41-NEXT:    pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm0, %xmm6
+; SSE41-NEXT:    por %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsllw %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm4
+; AVX2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpsllw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpbroadcastb %xmm3, %xmm3
+; AVX2-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm5, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm0, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    pand %xmm2, %xmm4
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllw %xmm4, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm6, %xmm6
+; X32-SSE-NEXT:    psllw %xmm4, %xmm6
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm6[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm0
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm5
+; X32-SSE-NEXT:    psrlw $8, %xmm5
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)
+  ret <16 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind {
+; SSE2-LABEL: constant_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $60, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq $50, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllq $4, %xmm1
+; SSE2-NEXT:    psllq $14, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlq $50, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq $60, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $14, %xmm1
+; SSE41-NEXT:    psllq $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
+; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
+; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v2i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v2i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq $60, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq $50, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllq $4, %xmm1
+; X32-SSE-NEXT:    psllq $14, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
+; SSE2-LABEL: constant_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,32,64,128]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
+; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
+; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v4i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v4i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind {
+; SSE2-LABEL: constant_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
+; SSE2-NEXT:    pmulhuw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <u,2,4,8,16,32,64,128>
+; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,15,14,13,12,11,10,9]
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
+; X32-SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
+; SSE2-LABEL: constant_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; SSE41-NEXT:    pmullw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm3, %xmm4
+; SSE41-NEXT:    packuswb %xmm2, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512F-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm1
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <16 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlq $50, %xmm1
+; SSE-NEXT:    psllq $14, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlq $50, %xmm0, %xmm1
+; AVX-NEXT:    vpsllq $14, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: splatconstant_funnnel_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vprolq $14, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolq $14, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vprolq $14, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolq $14, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v2i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotq $14, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq $50, %xmm1
+; X32-SSE-NEXT:    psllq $14, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $28, %xmm1
+; SSE-NEXT:    pslld $4, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
+; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: splatconstant_funnnel_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprold $4, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprold $4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v4i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld $28, %xmm1
+; X32-SSE-NEXT:    pslld $4, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $9, %xmm1
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $9, %xmm0, %xmm1
+; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $9, %xmm0, %xmm1
+; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotw $7, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $9, %xmm1
+; X32-SSE-NEXT:    psllw $7, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $4, %xmm1
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psllw $4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $4, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshl-rot-256.ll b/test/CodeGen/X86/vector-fshl-rot-256.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6f72c14f7436d4c5036ca0c4388415083e1e1342
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -0,0 +1,1808 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
+
+declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
+declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vpsllq %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63]
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpsllvq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_funnnel_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX512-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512-NEXT:    vpsllvq %ymm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; AVX512-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpshlq %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63]
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vpsllvq %ymm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %amt)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
+; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; AVX1-NEXT:    vpsrld %xmm7, %xmm2, %xmm7
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX1-NEXT:    vpsrld %xmm7, %xmm2, %xmm7
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
+; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm6
+; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm7
+; AVX1-NEXT:    vpsrld %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpsllvd %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_funnnel_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
+; AVX512-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512-NEXT:    vpsllvd %ymm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; AVX512-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; XOPAVX1-NEXT:    vpshld %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpshld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [31,31,31,31]
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpshld %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vpsllvd %ymm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %amt)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllw $12, %xmm3, %xmm5
+; AVX1-NEXT:    vpsllw $4, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm3, %xmm5
+; AVX1-NEXT:    vpsrlw $4, %xmm5, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $2, %xmm5, %xmm7
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm7
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm6
+; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpsllw $12, %xmm4, %xmm6
+; AVX1-NEXT:    vpsllw $4, %xmm4, %xmm4
+; AVX1-NEXT:    vpor %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm7, %xmm0, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $2, %xmm4, %xmm7
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm7
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
+; AVX2-NEXT:    vpsllvd %ymm6, %ymm3, %ymm6
+; AVX2-NEXT:    vpsrld $16, %ymm6, %ymm6
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
+; AVX2-NEXT:    vpsllvd %ymm5, %ymm0, %ymm5
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpackusdw %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpsubw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX2-NEXT:    vpsrlvd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpsllvw %ymm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpshlw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vpshlw %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm5, %xmm2
+; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %amt)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT:    vpsubb %xmm5, %xmm9, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT:    vpand %xmm10, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT:    vpand %xmm11, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm8, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubb %xmm1, %xmm9, %xmm5
+; AVX1-NEXT:    vpand %xmm10, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpsrlw $2, %xmm4, %xmm6
+; AVX1-NEXT:    vpand %xmm11, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm6
+; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
+; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm5
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm5, %ymm0, %ymm3
+; AVX2-NEXT:    vpsllw $2, %ymm3, %ymm5
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX2-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm5
+; AVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm5
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsllw $2, %ymm3, %ymm5
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm5, %xmm2
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
+  ret <32 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm4, %xmm7
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq %xmm1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsllq %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsllq %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT:    vmovapd {{.*#+}} xmm2 = [63,63]
+; XOPAVX1-NEXT:    vandpd %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpsllq %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpsllq %xmm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
+  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %splat)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT:    vpsubd %xmm10, %xmm9, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpsrld %xmm5, %xmm6, %xmm7
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrld %xmm4, %xmm6, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm11 = xmm2[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX1-NEXT:    vpsrld %xmm7, %xmm6, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm6, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5],xmm11[6,7]
+; AVX1-NEXT:    vpsrld %xmm5, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrld %xmm7, %xmm0, %xmm4
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpand %xmm8, %xmm10, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpslld %xmm2, %xmm6, %xmm3
+; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm1, %ymm1
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX512-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpslld %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; XOPAVX2-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %splat)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm9, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
+; AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpsrlw $4, %xmm6, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw $2, %xmm6, %xmm7
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw $1, %xmm6, %xmm7
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm7, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vpand %xmm8, %xmm9, %xmm2
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm2, %xmm5, %xmm3
+; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastw %xmm1, %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %splat)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT:    vpshufb %xmm9, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm10, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubb %xmm3, %xmm9, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT:    vpand %xmm8, %xmm6, %xmm6
+; AVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrlw $2, %xmm4, %xmm7
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT:    vpand %xmm11, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm7
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm12, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm10, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm0, %xmm6
+; AVX1-NEXT:    vpsrlw $2, %xmm6, %xmm7
+; AVX1-NEXT:    vpand %xmm11, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm6
+; AVX1-NEXT:    vpand %xmm12, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm6, %xmm5, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT:    vpsllw %xmm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw %xmm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw %xmm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm3
+; AVX512VL-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX512VL-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm5, %ymm1
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm3
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm5, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm5, %xmm2
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
+  ret <32 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
+; AVX1-LABEL: constant_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vpsllq $60, %xmm1, %xmm3
+; AVX1-NEXT:    vpsllq $50, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm3
+; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
+; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
+; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v4i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
+; AVX1-LABEL: constant_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
+; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
+; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v8i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind {
+; AVX1-LABEL: constant_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
+; AVX1-LABEL: constant_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm7, %xmm4, %xmm3
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllw $2, %ymm1, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
+; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,249,250,251,252,253,254,255,0,255,254,253,252,251,250,249]
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,249,250,251,252,253,254,255,0,255,254,253,252,251,250,249]
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <32 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrlq $50, %xmm2, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $14, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlq $50, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllq $14, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatconstant_funnnel_v4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vprolq $14, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolq $14, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vprolq $14, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolq $14, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
+; AVX1-NEXT:    vpslld $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm2
+; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm1
+; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatconstant_funnnel_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprold $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprold $4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $9, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $9, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $9, %ymm0, %ymm1
+; AVX512-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshl-rot-512.ll b/test/CodeGen/X86/vector-fshl-rot-512.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3510a22e13f23accd77c3326ffbc6eed863118ab
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -0,0 +1,857 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+
+declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
+declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
+; AVX512-LABEL: var_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [63,63,63,63,63,63,63,63]
+; AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512-NEXT:    vpsllvq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %zmm1, %zmm4, %zmm1
+; AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
+; AVX512-LABEL: var_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT:    vpandd %zmm2, %zmm1, %zmm3
+; AVX512-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %zmm1, %zmm4, %zmm1
+; AVX512-NEXT:    vpandd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm0, %zmm3, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
+; AVX512F-LABEL: var_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm5, %zmm0, %zmm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm5, %zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm5, %zmm0, %zmm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm0, %zmm5, %zmm0
+; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubw %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm4, %zmm1
+; AVX512VLBW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
+; AVX512F-LABEL: var_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT:    vpand %ymm8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm9
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpor %ymm4, %ymm9, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpand %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm11
+; AVX512F-NEXT:    vpor %ymm4, %ymm11, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $6, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand %ymm10, %ymm4, %ymm4
+; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
+; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VL-NEXT:    vpand %ymm8, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm9
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm10, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpor %ymm4, %ymm9, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpand %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm11
+; AVX512VL-NEXT:    vpor %ymm4, %ymm11, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $6, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand %ymm10, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm9, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsubb %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllw $5, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm2, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm2, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k2
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsubb %zmm1, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k2
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm2
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
+; AVX512VLBW-NEXT:    vpsrlw $2, %zmm2, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512VLBW-NEXT:    vpsrlw $1, %zmm2, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpmovb2m %zmm3, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k2
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm3, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
+  ret <64 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
+; AVX512-LABEL: splatvar_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm1, %zmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsllq %xmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT:    retq
+  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
+; AVX512-LABEL: splatvar_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm1, %zmm1
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX512-NEXT:    vpslld %xmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm0, %zmm3, %zmm0
+; AVX512-NEXT:    retq
+  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
+; AVX512F-LABEL: splatvar_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm5, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm5, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastw %xmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %zmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
+; AVX512F-LABEL: splatvar_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm5, %ymm6
+; AVX512F-NEXT:    vpbroadcastb %xmm6, %ymm6
+; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm7, %xmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpbroadcastb %xmm6, %ymm6
+; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm7, %xmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512VL-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpsllw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VLBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm1
+; AVX512VLBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
+  ret <64 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind {
+; AVX512-LABEL: constant_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vprolvq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind {
+; AVX512-LABEL: constant_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vprolvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
+; AVX512F-LABEL: constant_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
+; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
+; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
+; AVX512F-LABEL: constant_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm8
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpsrlw $8, %ymm9, %ymm9
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm9, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm3
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm8
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm9, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpmullw %ymm9, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsllw $2, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm2
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm2, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <64 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind {
+; AVX512-LABEL: splatconstant_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vprolq $14, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
+; AVX512-LABEL: splatconstant_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vprold $4, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
+; AVX512F-LABEL: splatconstant_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $9, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $9, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsllw $7, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $9, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $9, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsllw $7, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $9, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlw $9, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $7, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
+; AVX512F-LABEL: splatconstant_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshr-128.ll b/test/CodeGen/X86/vector-fshr-128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..616a3780e346f031affdd11cf2430c18b2f1081d
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshr-128.ll
@@ -0,0 +1,2622 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
+
+; Just one 32-bit run to make sure we do reasonable things for i64 cases.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
+
+declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlq %xmm4, %xmm5
+; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [64,64]
+; SSE2-NEXT:    psubq %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psllq %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT:    psllq %xmm3, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE2-NEXT:    orpd %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrlq %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [64,64]
+; SSE41-NEXT:    psubq %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psllq %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    psllq %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    por %xmm5, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqq %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VL-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v2i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpshlq %xmm5, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsrlvq %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm5
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
+; X32-SSE-NEXT:    psubq %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psllq %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm3, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm5, %xmm0
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; X32-SSE-NEXT:    pand %xmm3, %xmm2
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrld %xmm5, %xmm3
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    psrld %xmm5, %xmm6
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrld %xmm4, %xmm5
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32,32,32,32]
+; SSE2-NEXT:    psubd %xmm2, %xmm4
+; SSE2-NEXT:    pslld $23, %xmm4
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm4
+; SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; SSE2-NEXT:    por %xmm3, %xmm6
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm6, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrld %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm6
+; SSE41-NEXT:    psrld %xmm5, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrld %xmm0, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [32,32,32,32]
+; SSE41-NEXT:    psubd %xmm2, %xmm0
+; SSE41-NEXT:    pslld $23, %xmm0
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE41-NEXT:    pmulld %xmm0, %xmm3
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movaps %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
+; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpslld $23, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT:    vpmulld %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VL-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v4i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpshld %xmm4, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpshld %xmm5, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsrlvd %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    psrld %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    psrld %xmm5, %xmm3
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm6
+; X32-SSE-NEXT:    psrld %xmm5, %xmm6
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrld %xmm4, %xmm5
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [32,32,32,32]
+; X32-SSE-NEXT:    psubd %xmm2, %xmm4
+; X32-SSE-NEXT:    pslld $23, %xmm4
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT:    cvttps2dq %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm4, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm5, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; X32-SSE-NEXT:    por %xmm3, %xmm6
+; X32-SSE-NEXT:    pxor %xmm0, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm6, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    psllw $12, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    psraw $15, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw $8, %xmm5
+; SSE2-NEXT:    pand %xmm3, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    psraw $15, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    psrlw $4, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    psraw $15, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    psrlw $2, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    psraw $15, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT:    psubw %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    movdqa %xmm4, %xmm7
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; SSE2-NEXT:    pslld $23, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm6, %xmm7
+; SSE2-NEXT:    cvttps2dq %xmm7, %xmm7
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; SSE2-NEXT:    pslld $23, %xmm4
+; SSE2-NEXT:    paddd %xmm6, %xmm4
+; SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
+; SSE2-NEXT:    pmullw %xmm0, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pcmpeqw %xmm8, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; SSE41-NEXT:    psubw %xmm2, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE41-NEXT:    pcmpeqw %xmm2, %xmm4
+; SSE41-NEXT:    psllw $12, %xmm2
+; SSE41-NEXT:    psllw $4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    paddw %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm7
+; SSE41-NEXT:    psrlw $8, %xmm7
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm7
+; SSE41-NEXT:    psrlw $4, %xmm7
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm7
+; SSE41-NEXT:    psrlw $2, %xmm7
+; SSE41-NEXT:    paddw %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm7
+; SSE41-NEXT:    psrlw $1, %xmm7
+; SSE41-NEXT:    paddw %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm3
+; SSE41-NEXT:    pslld $23, %xmm5
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm0, %xmm5
+; SSE41-NEXT:    cvttps2dq %xmm5, %xmm2
+; SSE41-NEXT:    pslld $23, %xmm6
+; SSE41-NEXT:    paddd %xmm0, %xmm6
+; SSE41-NEXT:    cvttps2dq %xmm6, %xmm0
+; SSE41-NEXT:    packusdw %xmm2, %xmm0
+; SSE41-NEXT:    pmullw %xmm0, %xmm8
+; SSE41-NEXT:    por %xmm3, %xmm8
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm8
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
+; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm4
+; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm5
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm5
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vpackusdw %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpsrlvd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsllvd %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512F-NEXT:    vpsrlvd %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512VL-NEXT:    vpsrlvd %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpsrlvw %xmm4, %xmm1, %xmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsllvw %xmm4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: var_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOP-NEXT:    vpsubw %xmm2, %xmm3, %xmm4
+; XOP-NEXT:    vpshlw %xmm4, %xmm1, %xmm4
+; XOP-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; XOP-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; XOP-NEXT:    vpshlw %xmm5, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; XOP-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    subl $28, %esp
+; X32-SSE-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
+; X32-SSE-NEXT:    psllw $12, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm3
+; X32-SSE-NEXT:    psraw $15, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlw $8, %xmm5
+; X32-SSE-NEXT:    pand %xmm3, %xmm5
+; X32-SSE-NEXT:    pandn %xmm1, %xmm3
+; X32-SSE-NEXT:    por %xmm5, %xmm3
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    psraw $15, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm3, %xmm6
+; X32-SSE-NEXT:    psrlw $4, %xmm3
+; X32-SSE-NEXT:    pand %xmm5, %xmm3
+; X32-SSE-NEXT:    por %xmm6, %xmm3
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    psraw $15, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm3, %xmm6
+; X32-SSE-NEXT:    psrlw $2, %xmm3
+; X32-SSE-NEXT:    pand %xmm5, %xmm3
+; X32-SSE-NEXT:    por %xmm6, %xmm3
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    psraw $15, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm3, %xmm5
+; X32-SSE-NEXT:    psrlw $1, %xmm3
+; X32-SSE-NEXT:    pand %xmm4, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT:    psubw %xmm2, %xmm4
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm7
+; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; X32-SSE-NEXT:    pslld $23, %xmm7
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT:    paddd %xmm0, %xmm7
+; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; X32-SSE-NEXT:    pslld $23, %xmm4
+; X32-SSE-NEXT:    paddd %xmm0, %xmm4
+; X32-SSE-NEXT:    cvttps2dq %xmm7, %xmm0
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    cvttps2dq %xmm4, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; X32-SSE-NEXT:    movdqu (%esp), %xmm0 # 16-byte Reload
+; X32-SSE-NEXT:    pmullw %xmm0, %xmm4
+; X32-SSE-NEXT:    por %xmm5, %xmm4
+; X32-SSE-NEXT:    por %xmm3, %xmm4
+; X32-SSE-NEXT:    pcmpeqw %xmm6, %xmm2
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm4, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    addl $28, %esp
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psllw $5, %xmm5
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psrlw $4, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm6
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pandn %xmm4, %xmm7
+; SSE2-NEXT:    psrlw $2, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT:    por %xmm7, %xmm4
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm4, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT:    psubb %xmm2, %xmm5
+; SSE2-NEXT:    psllw $5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pandn %xmm0, %xmm7
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pandn %xmm0, %xmm7
+; SSE2-NEXT:    psllw $2, %xmm0
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    paddb %xmm0, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    psubb %xmm2, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpeqb %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $5, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $4, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm6
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
+; SSE41-NEXT:    psrlw $2, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    paddb %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
+; SSE41-NEXT:    psrlw $1, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    paddb %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    psllw $5, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm2
+; SSE41-NEXT:    paddb %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm7
+; SSE41-NEXT:    psllw $4, %xmm7
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm7
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm7, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psllw $2, %xmm4
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    paddb %xmm3, %xmm4
+; SSE41-NEXT:    paddb %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    por %xmm6, %xmm3
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_funnnel_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpsllw $5, %xmm2, %xmm3
+; AVX-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm4
+; AVX-NEXT:    vpsrlw $2, %xmm4, %xmm5
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm5, %xmm5
+; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
+; AVX-NEXT:    vpsrlw $1, %xmm4, %xmm5
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm5, %xmm5
+; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX-NEXT:    vpsllw $5, %xmm4, %xmm4
+; AVX-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm6
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm6, %xmm6
+; AVX-NEXT:    vpblendvb %xmm4, %xmm6, %xmm0, %xmm0
+; AVX-NEXT:    vpsllw $2, %xmm0, %xmm4
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT:    vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
+; AVX-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX-NEXT:    vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm5, %ymm6, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: var_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOP-NEXT:    vpsubb %xmm2, %xmm3, %xmm4
+; XOP-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
+; XOP-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOP-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
+; XOP-NEXT:    vpshlb %xmm5, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; XOP-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
+; X32-SSE-NEXT:    psllw $5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm4
+; X32-SSE-NEXT:    pand %xmm6, %xmm4
+; X32-SSE-NEXT:    pandn %xmm1, %xmm6
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT:    por %xmm6, %xmm4
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm7
+; X32-SSE-NEXT:    pandn %xmm4, %xmm7
+; X32-SSE-NEXT:    psrlw $2, %xmm4
+; X32-SSE-NEXT:    pand %xmm6, %xmm4
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT:    por %xmm7, %xmm4
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm5
+; X32-SSE-NEXT:    pandn %xmm4, %xmm5
+; X32-SSE-NEXT:    psrlw $1, %xmm4
+; X32-SSE-NEXT:    pand %xmm6, %xmm4
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT:    por %xmm5, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X32-SSE-NEXT:    psubb %xmm2, %xmm5
+; X32-SSE-NEXT:    psllw $5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm7
+; X32-SSE-NEXT:    pandn %xmm0, %xmm7
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm6, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm7, %xmm0
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm7
+; X32-SSE-NEXT:    pandn %xmm0, %xmm7
+; X32-SSE-NEXT:    psllw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm6, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm7, %xmm0
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpeqb %xmm3, %xmm2
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    por %xmm4, %xmm5
+; X32-SSE-NEXT:    paddb %xmm0, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
+  ret <16 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [64,64]
+; SSE2-NEXT:    psubq %xmm2, %xmm4
+; SSE2-NEXT:    psllq %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [64,64]
+; SSE41-NEXT:    psubq %xmm2, %xmm4
+; SSE41-NEXT:    psllq %xmm4, %xmm3
+; SSE41-NEXT:    por %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqq %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VL-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm5
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
+; X32-SSE-NEXT:    psubq %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psllq %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm3, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm5, %xmm0
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; X32-SSE-NEXT:    pand %xmm3, %xmm2
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
+  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    xorps %xmm4, %xmm4
+; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrld %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32,32,32,32]
+; SSE2-NEXT:    psubd %xmm2, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
+; SSE2-NEXT:    pslld %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrld %xmm0, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [32,32,32,32]
+; SSE41-NEXT:    psubd %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pslld %xmm0, %xmm3
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movaps %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX2-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512F-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512F-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512VL-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512VL-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VL-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512BW-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512BW-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512VLBW-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512VLBW-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX1-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX2-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    xorps %xmm4, %xmm4
+; X32-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrld %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [32,32,32,32]
+; X32-SSE-NEXT:    psubd %xmm2, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm3, %xmm2
+; X32-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
+; X32-SSE-NEXT:    pslld %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT:    psubw %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw %xmm3, %xmm5
+; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrlw %xmm0, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
+; SSE41-NEXT:    psubw %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    psllw %xmm0, %xmm3
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm5, %xmm1, %xmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm5, %xmm1, %xmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT:    psubw %xmm3, %xmm4
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpeqw %xmm3, %xmm2
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm5
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT:    psubb %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw %xmm3, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT:    psrlw %xmm3, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm4, %xmm0
+; SSE2-NEXT:    psllw %xmm4, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrlw %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT:    psrlw %xmm4, %xmm7
+; SSE41-NEXT:    pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm5, %xmm7
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    psubb %xmm2, %xmm4
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psllw %xmm4, %xmm3
+; SSE41-NEXT:    psllw %xmm4, %xmm6
+; SSE41-NEXT:    pshufb %xmm0, %xmm6
+; SSE41-NEXT:    pand %xmm6, %xmm3
+; SSE41-NEXT:    por %xmm7, %xmm3
+; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm3, %xmm1, %xmm4
+; AVX2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpsrlw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX2-NEXT:    vpbroadcastb %xmm3, %xmm3
+; AVX2-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllw %xmm4, %xmm5, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %xmm4
+; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm5, %ymm6, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X32-SSE-NEXT:    psubb %xmm3, %xmm4
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm3, %xmm2
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm6, %xmm6
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm6
+; X32-SSE-NEXT:    pcmpeqd %xmm3, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm6
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm5, %xmm6
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm4, %xmm0
+; X32-SSE-NEXT:    psllw %xmm4, %xmm3
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm0, %xmm3
+; X32-SSE-NEXT:    por %xmm6, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm3, %xmm2
+; X32-SSE-NEXT:    por %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
+  ret <16 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
+; SSE2-LABEL: constant_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlq $4, %xmm2
+; SSE2-NEXT:    psrlq $14, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllq $60, %xmm2
+; SSE2-NEXT:    psllq $50, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    orpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlq $14, %xmm2
+; SSE41-NEXT:    psrlq $4, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllq $50, %xmm2
+; SSE41-NEXT:    psllq $60, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsllq $50, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllq $60, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_funnnel_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psrlq $4, %xmm2
+; X32-SSE-NEXT:    psrlq $14, %xmm1
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq $60, %xmm2
+; X32-SSE-NEXT:    psllq $50, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; SSE2-LABEL: constant_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrld $7, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrld $6, %xmm3
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrld $5, %xmm2
+; SSE2-NEXT:    psrld $4, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrld $7, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    psrld $5, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrld $6, %xmm2
+; SSE41-NEXT:    psrld $4, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $7, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrld $5, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrld $6, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_funnnel_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psrld $7, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    psrld $6, %xmm3
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psrld $5, %xmm2
+; X32-SSE-NEXT:    psrld $4, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm3, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE2-LABEL: constant_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
+; SSE2-NEXT:    pmulhuw %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pmullw %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pmulhuw %xmm2, %xmm3
+; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
+; AVX-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
+; AVX512F-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
+; AVX512VL-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,15,14,13,12,11,10,9]
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm1, %xmm2
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
+; X32-SSE-NEXT:    pmulhuw %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pmullw %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE2-LABEL: constant_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm4
+; SSE2-NEXT:    psrlw $8, %xmm4
+; SSE2-NEXT:    packuswb %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    psrlw $8, %xmm4
+; SSE41-NEXT:    packuswb %xmm3, %xmm4
+; SSE41-NEXT:    por %xmm2, %xmm4
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    movw $257, %ax # imm = 0x101
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; XOP-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT:    psrlw $8, %xmm4
+; X32-SSE-NEXT:    packuswb %xmm3, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X32-SSE-NEXT:    pand %xmm3, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <16 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlq $14, %xmm1
+; SSE-NEXT:    psllq $50, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlq $14, %xmm1, %xmm1
+; AVX-NEXT:    vpsllq $50, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $14, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $50, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v2i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrlq $14, %xmm1, %xmm1
+; XOP-NEXT:    vpsllq $50, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlq $14, %xmm1
+; X32-SSE-NEXT:    psllq $50, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrld $4, %xmm1
+; SSE-NEXT:    pslld $28, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $4, %xmm1, %xmm1
+; AVX-NEXT:    vpslld $28, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $4, %xmm1, %xmm1
+; AVX512-NEXT:    vpslld $28, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v4i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrld $4, %xmm1, %xmm1
+; XOP-NEXT:    vpslld $28, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrld $4, %xmm1
+; X32-SSE-NEXT:    pslld $28, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $7, %xmm1
+; SSE-NEXT:    psllw $9, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $7, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $9, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $7, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $9, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrlw $7, %xmm1, %xmm1
+; XOP-NEXT:    vpsllw $9, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $7, %xmm1
+; X32-SSE-NEXT:    psllw $9, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $4, %xmm1
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psllw $4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $4, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshr-256.ll b/test/CodeGen/X86/vector-fshr-256.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9672e8d4b53a901faa34a646d8e1d70c98977d9f
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshr-256.ll
@@ -0,0 +1,2147 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
+
+declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
+declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm3, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm6, %xmm1, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm4, %xmm8, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpsllq %xmm6, %xmm7, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vpsubq %xmm2, %xmm8, %xmm6
+; AVX1-NEXT:    vpsllq %xmm6, %xmm0, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX2-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
+; AVX2-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
+; AVX512F-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
+; AVX512VL-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
+; AVX512BW-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v4i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
+; AVX512VLBW-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubq %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; XOPAVX1-NEXT:    vpshlq %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm6
+; XOPAVX1-NEXT:    vpshlq %xmm6, %xmm1, %xmm6
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm3, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT:    vpshlq %xmm7, %xmm6, %xmm6
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vpshlq %xmm7, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorpd %ymm5, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vpcomeqq %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqq %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpsrlvq %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; XOPAVX2-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT:    vpsrld %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX1-NEXT:    vpsrld %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpsrld %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
+; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpsrld %xmm7, %xmm1, %xmm7
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm3, %xmm9, %xmm6
+; AVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpmulld %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubd %xmm2, %xmm9, %xmm6
+; AVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT:    vpmulld %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX2-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
+; AVX2-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v8i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; XOPAVX1-NEXT:    vpshld %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm6
+; XOPAVX1-NEXT:    vpshld %xmm6, %xmm1, %xmm6
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT:    vpshld %xmm7, %xmm6, %xmm6
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vpshld %xmm7, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm5, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vpcomeqd %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqd %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpsrlvd %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; XOPAVX2-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsllw $12, %xmm3, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm3, %xmm5
+; AVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm7, %xmm6, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $2, %xmm4, %xmm6
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm6
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm5
+; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm6
+; AVX1-NEXT:    vpor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm1, %xmm5
+; AVX1-NEXT:    vpsrlw $4, %xmm5, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $2, %xmm5, %xmm7
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm7
+; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm8
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm3, %xmm9, %xmm6
+; AVX1-NEXT:    vpxor %xmm10, %xmm10, %xmm10
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
+; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmullw %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpsubw %xmm2, %xmm9, %xmm6
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
+; AVX1-NEXT:    vpslld $23, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddd %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm8, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpeqw %xmm10, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm10, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+; AVX2-NEXT:    vpsrlvd %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX2-NEXT:    vpsrlvd %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpackusdw %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %ymm2, %ymm6, %ymm6
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
+; AVX2-NEXT:    vpsllvd %ymm7, %ymm5, %ymm5
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
+; AVX2-NEXT:    vpsllvd %ymm6, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubw %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; XOPAVX1-NEXT:    vpshlw %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm6
+; XOPAVX1-NEXT:    vpshlw %xmm6, %xmm1, %xmm6
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16]
+; XOPAVX1-NEXT:    vpsubw %xmm3, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT:    vpshlw %xmm7, %xmm6, %xmm6
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vpshlw %xmm7, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm5, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vpcomeqw %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqw %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT:    vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm1, %xmm4
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT:    vpshlw %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm4, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm6
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm3, %xmm9, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $2, %xmm4, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm8, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubb %xmm2, %xmm9, %xmm6
+; AVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm5
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm8
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm9, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm3, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $2, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT:    vpand %xmm10, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT:    vpand %xmm9, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm2, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm1, %xmm5
+; AVX1-NEXT:    vpsrlw $2, %xmm5, %xmm0
+; AVX1-NEXT:    vpand %xmm10, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm7, %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpsllw $5, %ymm2, %ymm3
+; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm4
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm4
+; AVX2-NEXT:    vpsrlw $2, %ymm4, %ymm5
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpsrlw $1, %ymm4, %ymm5
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; AVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
+; AVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm6
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX2-NEXT:    vpblendvb %ymm4, %ymm6, %ymm0, %ymm0
+; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm4
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
+; AVX2-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm3
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpsrlw $2, %ymm4, %ymm5
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm5
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw $5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpsrlw $2, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsrlw $1, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm6, %ymm6
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm6
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm7, %xmm6, %xmm6
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vpshlb %xmm7, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm5, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vpcomeqb %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqb %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
+  ret <32 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpsllq %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllq %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm6
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; XOPAVX1-NEXT:    vpsllq %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsllq %xmm5, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
+  %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %splat)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsrld %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpslld %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX1-NEXT:    vpslld %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX2-NEXT:    vpslld %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512F-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512F-NEXT:    vpslld %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512VL-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512VL-NEXT:    vpslld %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512BW-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512BW-NEXT:    vpslld %xmm4, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512VLBW-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512VLBW-NEXT:    vpslld %xmm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; XOPAVX1-NEXT:    vpslld %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; XOPAVX1-NEXT:    vpslld %xmm5, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX2-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX2-NEXT:    vpslld %xmm4, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %splat)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsrlw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpsllw %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm5, %ymm1, %ymm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm5, %ymm1, %ymm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; XOPAVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; XOPAVX1-NEXT:    vpsllw %xmm6, %xmm7, %xmm6
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsllw %xmm5, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm8, %xmm7
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm9
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm5, %xmm7, %xmm6
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsllw %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw %xmm6, %xmm8, %xmm6
+; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubb %xmm2, %xmm7, %xmm6
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw %xmm6, %xmm8, %xmm6
+; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm9, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT:    vpsrlw %xmm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsllw %xmm4, %ymm5, %ymm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm4
+; AVX512F-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw %xmm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm5, %ymm3
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX512VL-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm5, %ymm4
+; AVX512VL-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX512VL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm3, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm6
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm7, %xmm6, %xmm6
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vpshlb %xmm7, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm5, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat)
+  ret <32 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
+; AVX1-LABEL: constant_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrlq $60, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlq $50, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsllq $4, %xmm2, %xmm3
+; AVX1-NEXT:    vpsllq $14, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsllq $50, %xmm0, %xmm3
+; AVX1-NEXT:    vpsllq $60, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_funnnel_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
+; AVX1-LABEL: constant_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrld $11, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrld $9, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrld $10, %xmm2, %xmm4
+; AVX1-NEXT:    vpsrld $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpsrld $7, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld $5, %xmm1, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsrld $6, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_funnnel_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
+; AVX1-LABEL: constant_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [256,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
+; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpmulhuw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmulhuw %xmm4, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX2-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512VL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm2
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm3, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vpcmov {{.*}}(%rip), %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; XOPAVX2-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; XOPAVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
+; AVX1-LABEL: constant_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [256,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [256,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,2,4,8,16,32,64,128>
+; AVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,128,64,32,16,8,4,2>
+; AVX1-NEXT:    vpmullw %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    movl $16843009, %eax # imm = 0x1010101
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,255,254,253,252,251,250,249,0,249,250,251,252,253,254,255]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,255,254,253,252,251,250,249,0,249,250,251,252,253,254,255]
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <32 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpsllq $50, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsllq $50, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlq $14, %ymm1, %ymm1
+; AVX2-NEXT:    vpsllq $50, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $14, %ymm1, %ymm1
+; AVX512-NEXT:    vpsllq $50, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpsrlq $14, %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq $14, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpsllq $50, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpsllq $50, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlq $14, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllq $50, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpslld $28, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslld $28, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $4, %ymm1, %ymm1
+; AVX2-NEXT:    vpslld $28, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $4, %ymm1, %ymm1
+; AVX512-NEXT:    vpslld $28, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpsrld $4, %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpsrld $4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpslld $28, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpslld $28, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrld $4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpslld $28, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpsllw $9, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsllw $9, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $7, %ymm1, %ymm1
+; AVX2-NEXT:    vpsllw $9, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $7, %ymm1, %ymm1
+; AVX512-NEXT:    vpsllw $9, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpsrlw $7, %xmm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; XOPAVX1-NEXT:    vpsllw $9, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vpsllw $9, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlw $7, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllw $9, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshr-512.ll b/test/CodeGen/X86/vector-fshr-512.ll
new file mode 100644
index 0000000000000000000000000000000000000000..50a4803b6bf9705df2beafe121a1053f9d46389c
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshr-512.ll
@@ -0,0 +1,963 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+
+declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
+declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
+; AVX512-LABEL: var_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm5
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512-NEXT:    vpsllvq %zmm4, %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
+; AVX512-LABEL: var_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm5
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm5, %zmm0, %zmm0
+; AVX512-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
+; AVX512F-LABEL: var_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm8 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm7, %zmm8, %zmm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %ymm4, %ymm8, %ymm9
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm9, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm7, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT:    vpcmpeqw %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm8, %ymm5
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm5, %zmm1, %zmm1
+; AVX512F-NEXT:    vpord %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT:    vpcmpeqw %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm8 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm7, %zmm8, %zmm7
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm4, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm9, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm7, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX512VL-NEXT:    vpcmpeqw %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm5, %zmm4
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm8, %ymm5
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm5, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpord %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT:    vpcmpeqw %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
+; AVX512F-LABEL: var_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm6, %ymm7, %ymm8
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm9
+; AVX512F-NEXT:    vpsllw $5, %ymm9, %ymm10
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm8, %ymm2, %ymm8
+; AVX512F-NEXT:    vpsrlw $2, %ymm8, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm4, %ymm11, %ymm11
+; AVX512F-NEXT:    vpaddb %ymm10, %ymm10, %ymm10
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm11, %ymm8, %ymm11
+; AVX512F-NEXT:    vpsrlw $1, %ymm11, %ymm12
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm8, %ymm12, %ymm12
+; AVX512F-NEXT:    vpaddb %ymm10, %ymm10, %ymm10
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm12, %ymm11, %ymm10
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm12, %ymm11, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %ymm9, %ymm13, %ymm14
+; AVX512F-NEXT:    vpsllw $5, %ymm14, %ymm14
+; AVX512F-NEXT:    vpblendvb %ymm14, %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm11
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm15, %ymm11, %ymm11
+; AVX512F-NEXT:    vpaddb %ymm14, %ymm14, %ymm14
+; AVX512F-NEXT:    vpblendvb %ymm14, %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm11
+; AVX512F-NEXT:    vpaddb %ymm14, %ymm14, %ymm14
+; AVX512F-NEXT:    vpblendvb %ymm14, %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm10, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm10, %xmm10, %xmm10
+; AVX512F-NEXT:    vpcmpeqb %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpblendvb %ymm9, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm2
+; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT:    vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw $5, %ymm5, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm7
+; AVX512F-NEXT:    vpand %ymm4, %ymm7, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm4
+; AVX512F-NEXT:    vpand %ymm8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand %ymm12, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm13, %ymm6
+; AVX512F-NEXT:    vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand %ymm15, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpcmpeqb %ymm10, %ymm5, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %ymm8, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm6, %ymm2, %ymm6
+; AVX512VL-NEXT:    vpsrlw $2, %ymm6, %ymm10
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT:    vpand %ymm11, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpsrlw $1, %ymm6, %ymm10
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT:    vpand %ymm12, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm9
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm10 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm10, %ymm9, %ymm9
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %ymm4, %ymm13, %ymm14
+; AVX512VL-NEXT:    vpsllw $5, %ymm14, %ymm14
+; AVX512VL-NEXT:    vpblendvb %ymm14, %ymm9, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm9
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm15, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpaddb %ymm14, %ymm14, %ymm14
+; AVX512VL-NEXT:    vpblendvb %ymm14, %ymm9, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm9
+; AVX512VL-NEXT:    vpaddb %ymm14, %ymm14, %ymm14
+; AVX512VL-NEXT:    vpblendvb %ymm14, %ymm9, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm6, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $4, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpand %ymm8, %ymm5, %ymm4
+; AVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpsrlw $2, %ymm2, %ymm7
+; AVX512VL-NEXT:    vpand %ymm11, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $1, %ymm2, %ymm7
+; AVX512VL-NEXT:    vpand %ymm12, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpand %ymm10, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsubb %ymm4, %ymm13, %ymm7
+; AVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpand %ymm15, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpsllw $5, %zmm4, %zmm5
+; AVX512BW-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
+; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm5, %k2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm5, %zmm7
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
+; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm5, %zmm7
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
+; AVX512BW-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
+; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpsllw $5, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm6
+; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k2
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm6, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm4, %zmm5
+; AVX512VLBW-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
+; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k2
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
+; AVX512VLBW-NEXT:    vpsrlw $2, %zmm5, %zmm7
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
+; AVX512VLBW-NEXT:    vpsrlw $1, %zmm5, %zmm7
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
+; AVX512VLBW-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
+; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm6
+; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k2
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k2}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm0, %zmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
+  ret <64 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
+; AVX512-LABEL: splatvar_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm2, %zmm2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpsrlq %xmm4, %zmm1, %zmm5
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
+; AVX512-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    retq
+  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %splat)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
+; AVX512-LABEL: splatvar_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX512-NEXT:    vpsrld %xmm5, %zmm1, %zmm5
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
+; AVX512-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX512-NEXT:    vpslld %xmm4, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm5, %zmm0, %zmm0
+; AVX512-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    retq
+  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %splat)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
+; AVX512F-LABEL: splatvar_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastw %xmm4, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm5, %ymm2, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm4, %xmm7, %xmm7
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm7, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw %xmm5, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsllw %xmm7, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw %xmm4, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm5, %ymm2, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm4, %xmm7, %xmm7
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm7, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm6, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT:    vpcmpeqw %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm5, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpsllw %xmm7, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm5, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %zmm2
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm5, %zmm1, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %splat)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
+; AVX512F-LABEL: splatvar_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm5, %ymm2, %ymm6
+; AVX512F-NEXT:    vpcmpeqd %ymm9, %ymm9, %ymm9
+; AVX512F-NEXT:    vpsrlw %xmm5, %ymm9, %ymm8
+; AVX512F-NEXT:    vpsrlw $8, %ymm8, %ymm8
+; AVX512F-NEXT:    vpbroadcastb %xmm8, %ymm8
+; AVX512F-NEXT:    vpand %ymm8, %ymm6, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm4, %xmm7, %xmm7
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm7, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm7, %ymm9, %ymm9
+; AVX512F-NEXT:    vpbroadcastb %xmm9, %ymm9
+; AVX512F-NEXT:    vpand %ymm9, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw %xmm5, %ymm3, %ymm2
+; AVX512F-NEXT:    vpand %ymm8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw %xmm7, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm9, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm5, %ymm2, %ymm6
+; AVX512VL-NEXT:    vpcmpeqd %ymm9, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpsrlw %xmm5, %ymm9, %ymm8
+; AVX512VL-NEXT:    vpsrlw $8, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpbroadcastb %xmm8, %ymm8
+; AVX512VL-NEXT:    vpand %ymm8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm4, %xmm7, %xmm7
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm7, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm7, %ymm9, %ymm9
+; AVX512VL-NEXT:    vpbroadcastb %xmm9, %ymm9
+; AVX512VL-NEXT:    vpand %ymm9, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm6, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm5, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpand %ymm8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw %xmm7, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm9, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm5, %zmm1, %zmm6
+; AVX512BW-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
+; AVX512BW-NEXT:    vpsrlw %xmm5, %zmm7, %zmm5
+; AVX512BW-NEXT:    vpsrlw $8, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpbroadcastb %xmm5, %zmm5
+; AVX512BW-NEXT:    vpandq %zmm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT:    vpbroadcastb %xmm4, %zmm4
+; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %zmm2
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm5, %zmm1, %zmm6
+; AVX512VLBW-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
+; AVX512VLBW-NEXT:    vpsrlw %xmm5, %zmm7, %zmm5
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm5, %zmm5
+; AVX512VLBW-NEXT:    vpandq %zmm5, %zmm6, %zmm5
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm4, %zmm4
+; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat)
+  ret <64 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
+; AVX512-LABEL: constant_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
+; AVX512-LABEL: constant_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
+; AVX512F-LABEL: constant_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm2, %ymm5
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm3, %ymm2
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm5 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX512VL-NEXT:    vpmulhuw %ymm4, %ymm2, %ymm5
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-NEXT:    vpmulhuw %ymm4, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm5 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    movl $65537, %eax # imm = 0x10001
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm2
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    movl $65537, %eax # imm = 0x10001
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
+; AVX512F-LABEL: constant_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm8
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm8, %ymm8, %ymm9
+; AVX512F-NEXT:    vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpsrlw $8, %ymm10, %ymm10
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm13, %ymm12, %ymm12
+; AVX512F-NEXT:    vpsrlw $8, %ymm12, %ymm12
+; AVX512F-NEXT:    vpackuswb %ymm10, %ymm12, %ymm10
+; AVX512F-NEXT:    vpor %ymm10, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm9, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
+; AVX512F-NEXT:    vpmullw %ymm13, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm10, %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm8
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm11 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm11, %ymm11
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm12, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpsrlw $8, %ymm11, %ymm11
+; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm11, %ymm4
+; AVX512VL-NEXT:    vpor %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpmullw %ymm12, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm5, %ymm2
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; AVX512BW-NEXT:    kmovq %rax, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpackuswb %zmm2, %zmm3, %zmm2
+; AVX512VLBW-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; AVX512VLBW-NEXT:    kmovq %rax, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <64 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
+; AVX512-LABEL: splatconstant_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $14, %zmm1, %zmm1
+; AVX512-NEXT:    vpsllq $50, %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
+; AVX512-LABEL: splatconstant_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $4, %zmm1, %zmm1
+; AVX512-NEXT:    vpslld $28, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
+; AVX512F-LABEL: splatconstant_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $7, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $9, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $7, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsllw $9, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $7, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $9, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $7, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpsllw $9, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $9, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlw $7, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $9, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
+; AVX512F-LABEL: splatconstant_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm2
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $4, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshr-rot-128.ll b/test/CodeGen/X86/vector-fshr-rot-128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..17246fb1db9b4c3f37bb3039d390bc3ec3b137f9
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -0,0 +1,2217 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
+
+; Just one 32-bit run to make sure we do reasonable things for i64 cases.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
+
+declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [63,63]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubq %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrlq %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrlq %xmm1, %xmm5
+; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllq %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE2-NEXT:    psllq %xmm2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    orpd %xmm5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [63,63]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubq %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psrlq %xmm1, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrlq %xmm1, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq %xmm3, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT:    psllq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsrlvq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_funnnel_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsrlvq %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubq %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vpshlq %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpsrlvq %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,0,63,0]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubq %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm5
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllq %xmm3, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm2, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm5, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrld %xmm1, %xmm5
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld %xmm6, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    psrld %xmm5, %xmm6
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrld %xmm4, %xmm5
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pslld $23, %xmm3
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    cvttps2dq %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    orps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubd %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    psrld %xmm6, %xmm7
+; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm1, %xmm5
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psrld %xmm1, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    pslld $23, %xmm3
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    cvttps2dq %xmm3, %xmm1
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT:    vpsrld %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT:    vpsrld %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsrlvd %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_funnnel_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsrlvd %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vpshld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpsrlvd %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubd %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    pand %xmm2, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psrld %xmm1, %xmm5
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld %xmm6, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm6
+; X32-SSE-NEXT:    psrld %xmm5, %xmm6
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psrld %xmm4, %xmm5
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3]
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    pslld $23, %xmm3
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    cvttps2dq %xmm3, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm3, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    orps %xmm0, %xmm1
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %amt)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    psubw %xmm1, %xmm2
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psraw $15, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $8, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psraw $15, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psraw $15, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT:    pslld $23, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm6, %xmm5
+; SSE2-NEXT:    cvttps2dq %xmm5, %xmm5
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    pslld $23, %xmm2
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    cvttps2dq %xmm2, %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    psubw %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllw $12, %xmm1
+; SSE41-NEXT:    psllw $4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    psrlw $8, %xmm5
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    psrlw $4, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    psrlw $2, %xmm5
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    psrlw $1, %xmm5
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pand %xmm3, %xmm4
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE41-NEXT:    pslld $23, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm0, %xmm4
+; SSE41-NEXT:    cvttps2dq %xmm4, %xmm3
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm0
+; SSE41-NEXT:    packusdw %xmm3, %xmm0
+; SSE41-NEXT:    pmullw %xmm0, %xmm2
+; SSE41-NEXT:    por %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_funnnel_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpsllw $12, %xmm3, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm5
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm5
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsrlvd %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT:    vpsrlvd %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT:    vpsrlvd %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpsrlvw %xmm3, %xmm0, %xmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: var_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOP-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOP-NEXT:    vpsubw %xmm3, %xmm4, %xmm3
+; XOP-NEXT:    vpshlw %xmm3, %xmm0, %xmm3
+; XOP-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    psubw %xmm1, %xmm2
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    psraw $15, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw $8, %xmm4
+; X32-SSE-NEXT:    pand %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    por %xmm4, %xmm3
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    psraw $15, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm3, %xmm5
+; X32-SSE-NEXT:    psrlw $4, %xmm3
+; X32-SSE-NEXT:    pand %xmm4, %xmm3
+; X32-SSE-NEXT:    por %xmm5, %xmm3
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    psraw $15, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm3, %xmm5
+; X32-SSE-NEXT:    psrlw $2, %xmm3
+; X32-SSE-NEXT:    pand %xmm4, %xmm3
+; X32-SSE-NEXT:    por %xmm5, %xmm3
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    pandn %xmm3, %xmm4
+; X32-SSE-NEXT:    psrlw $1, %xmm3
+; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
+; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; X32-SSE-NEXT:    pslld $23, %xmm5
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT:    paddd %xmm6, %xmm5
+; X32-SSE-NEXT:    cvttps2dq %xmm5, %xmm5
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE-NEXT:    pslld $23, %xmm2
+; X32-SSE-NEXT:    paddd %xmm6, %xmm2
+; X32-SSE-NEXT:    cvttps2dq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X32-SSE-NEXT:    pmullw %xmm0, %xmm1
+; X32-SSE-NEXT:    por %xmm4, %xmm1
+; X32-SSE-NEXT:    por %xmm3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %amt)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
+; SSE2-LABEL: var_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    psubb %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    psllw $5, %xmm5
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $4, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm6
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pandn %xmm3, %xmm7
+; SSE2-NEXT:    psrlw $2, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    paddb %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    psllw $5, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    paddb %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    psllw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    paddb %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    paddb %xmm0, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubb %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    psllw $5, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    psrlw $4, %xmm5
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    psrlw $2, %xmm5
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    psrlw $1, %xmm5
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    psllw $5, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    psllw $4, %xmm4
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psllw $2, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddb %xmm2, %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    por %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_funnnel_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpsllw $5, %xmm3, %xmm3
+; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm4
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm5
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm5, %xmm5
+; AVX-NEXT:    vpblendvb %xmm3, %xmm5, %xmm0, %xmm3
+; AVX-NEXT:    vpsrlw $2, %xmm3, %xmm5
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm5, %xmm5
+; AVX-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX-NEXT:    vpsrlw $1, %xmm3, %xmm5
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm5, %xmm5
+; AVX-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm4
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpsllw $2, %xmm0, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: var_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOP-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOP-NEXT:    vpsubb %xmm3, %xmm4, %xmm3
+; XOP-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOP-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: var_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    psubb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    pand %xmm2, %xmm5
+; X32-SSE-NEXT:    psllw $5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $4, %xmm3
+; X32-SSE-NEXT:    pand %xmm6, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm6
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    por %xmm6, %xmm3
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm7
+; X32-SSE-NEXT:    pandn %xmm3, %xmm7
+; X32-SSE-NEXT:    psrlw $2, %xmm3
+; X32-SSE-NEXT:    pand %xmm6, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    por %xmm7, %xmm3
+; X32-SSE-NEXT:    paddb %xmm5, %xmm5
+; X32-SSE-NEXT:    pxor %xmm6, %xmm6
+; X32-SSE-NEXT:    pcmpgtb %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm6, %xmm5
+; X32-SSE-NEXT:    pandn %xmm3, %xmm5
+; X32-SSE-NEXT:    psrlw $1, %xmm3
+; X32-SSE-NEXT:    pand %xmm6, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    por %xmm5, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm4
+; X32-SSE-NEXT:    psllw $5, %xmm4
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    paddb %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    psllw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    paddb %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm3, %xmm2
+; X32-SSE-NEXT:    paddb %xmm0, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %amt)
+  ret <16 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
+; SSE-LABEL: splatvar_funnnel_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,63]
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    psubq %xmm1, %xmm3
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm4
+; SSE-NEXT:    psrlq %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm2, %xmm3
+; SSE-NEXT:    psllq %xmm3, %xmm0
+; SSE-NEXT:    por %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,0,63,0]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubq %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm5
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllq %xmm3, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm2, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm5, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
+  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    xorps %xmm4, %xmm4
+; SSE2-NEXT:    xorps %xmm5, %xmm5
+; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
+; SSE2-NEXT:    pslld %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubd %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psrld %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    pslld %xmm1, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX512-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOPAVX1-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; XOPAVX2-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOPAVX2-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [31,31,31,31]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubd %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    xorps %xmm4, %xmm4
+; X32-SSE-NEXT:    xorps %xmm5, %xmm5
+; X32-SSE-NEXT:    movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld %xmm5, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
+; X32-SSE-NEXT:    pslld %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubw %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrlw %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    psubw %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psrlw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; SSE41-NEXT:    psllw %xmm1, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512-NEXT:    vpsrlw %xmm3, %xmm0, %xmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlw %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubw %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm4
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %splat)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
+; SSE2-LABEL: splatvar_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    psubb %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrlw %xmm1, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT:    psrlw %xmm1, %xmm6
+; SSE2-NEXT:    psrlw $8, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm6[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm3, %xmm0
+; SSE2-NEXT:    psllw %xmm3, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    psubb %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrlw %xmm1, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT:    psrlw %xmm1, %xmm7
+; SSE41-NEXT:    pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm5, %xmm7
+; SSE41-NEXT:    pand %xmm3, %xmm4
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psllw %xmm1, %xmm0
+; SSE41-NEXT:    psllw %xmm1, %xmm6
+; SSE41-NEXT:    pshufb %xmm2, %xmm6
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm3, %xmm0, %xmm4
+; AVX2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpsrlw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX2-NEXT:    vpbroadcastb %xmm3, %xmm3
+; AVX2-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllw %xmm1, %xmm5, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm0, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    psubb %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm6, %xmm6
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm6
+; X32-SSE-NEXT:    psrlw $8, %xmm6
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm6[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm3, %xmm0
+; X32-SSE-NEXT:    psllw %xmm3, %xmm5
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)
+  ret <16 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind {
+; SSE2-LABEL: constant_funnnel_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $4, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq $14, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllq $60, %xmm1
+; SSE2-NEXT:    psllq $50, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlq $14, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq $4, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $50, %xmm1
+; SSE41-NEXT:    psllq $60, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $14, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $4, %xmm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpsllq $50, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllq $60, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [60,50]
+; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [60,50]
+; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v2i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v2i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq $4, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq $14, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllq $60, %xmm1
+; X32-SSE-NEXT:    psllq $50, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    orpd %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
+; SSE2-LABEL: constant_funnnel_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [28,27,26,25]
+; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [28,27,26,25]
+; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v4i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v4i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind {
+; SSE2-LABEL: constant_funnnel_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
+; SSE2-NEXT:    pmulhuw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512>
+; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9]
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v8i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
+; X32-SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
+; SSE2-LABEL: constant_funnnel_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_funnnel_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2]
+; SSE41-NEXT:    pmullw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm3, %xmm4
+; SSE41-NEXT:    packuswb %xmm2, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512F-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: constant_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm1
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <16 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlq $14, %xmm1
+; SSE-NEXT:    psllq $50, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlq $14, %xmm0, %xmm1
+; AVX-NEXT:    vpsllq $50, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: splatconstant_funnnel_v2i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vprolq $50, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolq $50, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vprolq $50, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolq $50, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v2i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotq $50, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq $14, %xmm1
+; X32-SSE-NEXT:    psllq $50, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $4, %xmm1
+; SSE-NEXT:    pslld $28, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $4, %xmm0, %xmm1
+; AVX-NEXT:    vpslld $28, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: splatconstant_funnnel_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vprold $28, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprold $28, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vprold $28, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprold $28, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v4i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotd $28, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld $4, %xmm1
+; X32-SSE-NEXT:    pslld $28, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $7, %xmm1
+; SSE-NEXT:    psllw $9, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm1
+; AVX-NEXT:    vpsllw $9, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $7, %xmm0, %xmm1
+; AVX512-NEXT:    vpsllw $9, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotw $9, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $7, %xmm1
+; X32-SSE-NEXT:    psllw $9, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
+; SSE-LABEL: splatconstant_funnnel_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $4, %xmm1
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psllw $4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_funnnel_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_funnnel_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $4, %xmm1
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshr-rot-256.ll b/test/CodeGen/X86/vector-fshr-rot-256.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d42d2e54fc34744b73d41389d6e34c34cf4fb572
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -0,0 +1,1780 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
+
+declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
+declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63]
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllq %xmm4, %xmm2, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpsrlvq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_funnnel_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; AVX512-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512-NEXT:    vpsrlvq %ymm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; AVX512-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubq %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpshlq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpsubq %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63]
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshlq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vpsrlvq %ymm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %amt)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX1-NEXT:    vpsrld %xmm7, %xmm2, %xmm7
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm7
+; AVX1-NEXT:    vpsrld %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX1-NEXT:    vpsrld %xmm7, %xmm0, %xmm7
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpsrlvd %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_funnnel_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
+; AVX512-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512-NEXT:    vpsrlvd %ymm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; AVX512-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpshld %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshld %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [31,31,31,31]
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshld %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vpsrlvd %ymm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %amt)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
+; AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm2, %xmm4
+; AVX1-NEXT:    vpsrlw $4, %xmm4, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $2, %xmm4, %xmm6
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm6
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $12, %xmm3, %xmm5
+; AVX1-NEXT:    vpsllw $4, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm6
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm6
+; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm8
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX1-NEXT:    vpslld $23, %xmm7, %xmm7
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vpackusdw %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm8, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
+; AVX2-NEXT:    vpsrlvd %ymm6, %ymm3, %ymm6
+; AVX2-NEXT:    vpsrld $16, %ymm6, %ymm6
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
+; AVX2-NEXT:    vpsrlvd %ymm5, %ymm0, %ymm5
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpackusdw %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpsubw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX2-NEXT:    vpsllvd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpsrlvw %ymm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT:    vpshlw %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm6, %xmm2
+; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %amt)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
+; AVX1-LABEL: var_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT:    vpsubb %xmm5, %xmm9, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpsllw $2, %xmm3, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm4
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm8, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubb %xmm1, %xmm9, %xmm5
+; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpsllw $2, %xmm4, %xmm7
+; AVX1-NEXT:    vpand %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm8
+; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm5
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm5, %ymm0, %ymm3
+; AVX2-NEXT:    vpsrlw $2, %ymm3, %ymm5
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX2-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrlw $1, %ymm3, %ymm5
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
+; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: var_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm5
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsrlw $2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm5
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm5
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsrlw $2, %ymm3, %ymm5
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $1, %ymm3, %ymm5
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; XOPAVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm6, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
+  ret <32 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsllq %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm6, %xmm4, %xmm7
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllq %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsrlq %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsrlq %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; XOPAVX1-NEXT:    vmovapd {{.*#+}} xmm2 = [63,63]
+; XOPAVX1-NEXT:    vandpd %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpsrlq %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpsrlq %xmm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
+  %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %splat)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsrld %xmm3, %xmm4, %xmm5
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm2
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpsrld %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT:    vpslld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm1, %ymm1
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX512-NEXT:    vpsrld %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpslld %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; XOPAVX2-NEXT:    vpsrld %xmm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOPAVX2-NEXT:    vpslld %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %splat)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm5
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_funnnel_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastw %xmm1, %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512-NEXT:    vpsrlw %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm4, %xmm2
+; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlw %xmm3, %ymm0, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %splat)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
+; AVX1-LABEL: splatvar_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsllw $4, %xmm3, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm9, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpsllw $2, %xmm4, %xmm7
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT:    vpand %xmm10, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm7
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm9, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm7, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm7
+; AVX1-NEXT:    vpand %xmm10, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vpand %xmm8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm3, %ymm0, %ymm4
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT:    vpsrlw %xmm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsllw %xmm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw %xmm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm5, %ymm3
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpbroadcastb %xmm3, %ymm3
+; AVX512VL-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm1, %ymm5, %ymm1
+; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm3
+; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm3, %ymm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm3
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm5, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; XOPAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm6, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
+  ret <32 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
+; AVX1-LABEL: constant_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlq $60, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $50, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrlq $14, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vpsllq $4, %xmm1, %xmm3
+; AVX1-NEXT:    vpsllq $14, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpsllq $50, %xmm0, %xmm3
+; AVX1-NEXT:    vpsllq $60, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [60,50,14,4]
+; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [60,50,14,4]
+; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v4i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
+; AVX1-LABEL: constant_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16777216,8388608,4194304,2097152]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [28,27,26,25,24,23,22,21]
+; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [28,27,26,25,24,23,22,21]
+; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v8i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind {
+; AVX1-LABEL: constant_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
+; AVX1-LABEL: constant_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [256,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm7, %xmm4, %xmm3
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllw $2, %ymm1, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
+; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_funnnel_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,0,249,250,251,252,253,254,255]
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm3
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,0,249,250,251,252,253,254,255]
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <32 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $14, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrlq $14, %xmm2, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsllq $50, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $50, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlq $14, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllq $50, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatconstant_funnnel_v4i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vprolq $50, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprolq $50, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vprolq $50, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprolq $50, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotq $50, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotq $50, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotq $50, %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotq $50, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm2
+; AVX1-NEXT:    vpslld $28, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm2
+; AVX1-NEXT:    vpslld $28, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $4, %ymm0, %ymm1
+; AVX2-NEXT:    vpslld $28, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: splatconstant_funnnel_v8i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vprold $28, %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vprold $28, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vprold $28, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vprold $28, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotd $28, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotd $28, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotd $28, %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotd $28, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
+  ret <8 x i32> %res
+}
+
+define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $9, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm2
+; AVX1-NEXT:    vpsllw $9, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllw $9, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $7, %ymm0, %ymm1
+; AVX512-NEXT:    vpsllw $9, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotw $9, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotw $9, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotw $9, %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotw $9, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <16 x i16> %res
+}
+
+define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
+; AVX1-LABEL: splatconstant_funnnel_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_funnnel_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-fshr-rot-512.ll b/test/CodeGen/X86/vector-fshr-rot-512.ll
new file mode 100644
index 0000000000000000000000000000000000000000..792f172e4d169b94014281349b1dfe45bd7040f4
--- /dev/null
+++ b/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -0,0 +1,877 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+
+declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
+declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
+
+;
+; Variable Shifts
+;
+
+define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
+; AVX512-LABEL: var_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [63,63,63,63,63,63,63,63]
+; AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512-NEXT:    vpsrlvq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %zmm1, %zmm4, %zmm1
+; AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
+; AVX512-LABEL: var_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT:    vpandd %zmm2, %zmm1, %zmm3
+; AVX512-NEXT:    vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %zmm1, %zmm4, %zmm1
+; AVX512-NEXT:    vpandd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
+; AVX512F-LABEL: var_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm6, %zmm0, %zmm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm7, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm6, %zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpsubw %ymm3, %ymm4, %ymm2
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm7, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm6, %zmm0, %zmm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm7, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm0, %zmm6, %zmm0
+; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT:    vpsubw %ymm3, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm7, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubw %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm4, %zmm1
+; AVX512VLBW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
+; AVX512F-LABEL: var_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %ymm8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT:    vpand %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm10
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm11, %ymm10, %ymm10
+; AVX512F-NEXT:    vpor %ymm4, %ymm10, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm10 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpand %ymm10, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm12
+; AVX512F-NEXT:    vpor %ymm4, %ymm12, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT:    vpand %ymm8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $6, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand %ymm11, %ymm4, %ymm4
+; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm10, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
+; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: var_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpand %ymm8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm9 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VL-NEXT:    vpand %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm10
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm11, %ymm10, %ymm10
+; AVX512VL-NEXT:    vpor %ymm4, %ymm10, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm10 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpand %ymm10, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm12
+; AVX512VL-NEXT:    vpor %ymm4, %ymm12, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpsubb %ymm3, %ymm6, %ymm3
+; AVX512VL-NEXT:    vpand %ymm8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $6, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm9, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand %ymm11, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm10, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: var_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpsllw $5, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm3, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm3, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm3, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k2
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm3, %k2
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
+; AVX512VLBW-NEXT:    vpsrlw $2, %zmm3, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpsrlw $1, %zmm3, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %zmm1, %zmm4, %zmm1
+; AVX512VLBW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k2
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
+  ret <64 x i8> %res
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
+; AVX512-LABEL: splatvar_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm1, %zmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
+  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
+; AVX512-LABEL: splatvar_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm1, %zmm1
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX512-NEXT:    vpsrld %xmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpslld %xmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
+; AVX512F-LABEL: splatvar_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm5, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm5, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastw %xmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %zmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
+; AVX512F-LABEL: splatvar_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm5, %ymm6
+; AVX512F-NEXT:    vpbroadcastb %xmm6, %ymm6
+; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm7, %xmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpbroadcastb %xmm6, %ymm6
+; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm7, %xmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512VL-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllw %xmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm0, %zmm4
+; AVX512VLBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm1
+; AVX512VLBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
+  ret <64 x i8> %res
+}
+
+;
+; Constant Shifts
+;
+
+define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind {
+; AVX512-LABEL: constant_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vprolvq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind {
+; AVX512-LABEL: constant_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vprolvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
+; AVX512F-LABEL: constant_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
+; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
+; AVX512F-LABEL: constant_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm8
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpsrlw $8, %ymm9, %ymm9
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm9, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm3
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm8
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm9, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpmullw %ymm9, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsllw $2, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: constant_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm2
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm2, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <64 x i8> %res
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind {
+; AVX512-LABEL: splatconstant_funnnel_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vprolq $50, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
+; AVX512-LABEL: splatconstant_funnnel_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vprold $28, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
+  ret <16 x i32> %res
+}
+
+define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
+; AVX512F-LABEL: splatconstant_funnnel_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsllw $9, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsllw $9, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsllw $9, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsllw $9, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsllw $9, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlw $7, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $9, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
+; AVX512F-LABEL: splatconstant_funnnel_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+  ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index 4c5c348302b778d8b456dc005bbbfe17a3b86c37..6b3e709e40ee89b9d004caa446701a27ac3847b4 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -1,13 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7-avx | FileCheck %s
 ; RUN: opt -instsimplify -disable-output < %s
 
-;CHECK-LABEL: AGEP0:
 define <4 x i32*> @AGEP0(i32* %ptr) nounwind {
-entry:
-;CHECK-LABEL: AGEP0
-;CHECK: vbroadcast
-;CHECK-NEXT: vpaddd
-;CHECK-NEXT: ret
+; CHECK-LABEL: AGEP0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    vpaddd {{\.LCPI.*}}, %xmm0, %xmm0
+; CHECK-NEXT:    retl
   %vecinit.i = insertelement <4 x i32*> undef, i32* %ptr, i32 0
   %vecinit2.i = insertelement <4 x i32*> %vecinit.i, i32* %ptr, i32 1
   %vecinit4.i = insertelement <4 x i32*> %vecinit2.i, i32* %ptr, i32 2
@@ -17,109 +17,185 @@ entry:
   ret <4 x i32*> %A3
 }
 
-;CHECK-LABEL: AGEP1:
 define i32 @AGEP1(<4 x i32*> %param) nounwind {
-entry:
-;CHECK-LABEL: AGEP1
-;CHECK: vpaddd
-;CHECK-NEXT: vpextrd
-;CHECK-NEXT: movl
+; CHECK-LABEL: AGEP1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddd {{\.LCPI.*}}, %xmm0, %xmm0
+; CHECK-NEXT:    vpextrd $3, %xmm0, %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    retl
   %A2 = getelementptr i32, <4 x i32*> %param, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   %k = extractelement <4 x i32*> %A2, i32 3
   %v = load i32, i32* %k
   ret i32 %v
-;CHECK: ret
 }
 
-;CHECK-LABEL: AGEP2:
 define i32 @AGEP2(<4 x i32*> %param, <4 x i32> %off) nounwind {
-entry:
-;CHECK-LABEL: AGEP2
-;CHECK: vpslld $2
-;CHECK-NEXT: vpadd
+; CHECK-LABEL: AGEP2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpextrd $3, %xmm0, %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    retl
   %A2 = getelementptr i32, <4 x i32*> %param, <4 x i32> %off
   %k = extractelement <4 x i32*> %A2, i32 3
   %v = load i32, i32* %k
   ret i32 %v
-;CHECK: ret
 }
 
-;CHECK-LABEL: AGEP3:
 define <4 x i32*> @AGEP3(<4 x i32*> %param, <4 x i32> %off) nounwind {
-entry:
-;CHECK-LABEL: AGEP3
-;CHECK: vpslld $2
-;CHECK-NEXT: vpadd
+; CHECK-LABEL: AGEP3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    vpslld $2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
   %A2 = getelementptr i32, <4 x i32*> %param, <4 x i32> %off
   %v = alloca i32
   %k = insertelement <4 x i32*> %A2, i32* %v, i32 3
   ret <4 x i32*> %k
-;CHECK: ret
 }
 
-;CHECK-LABEL: AGEP4:
 define <4 x i16*> @AGEP4(<4 x i16*> %param, <4 x i32> %off) nounwind {
-entry:
-;CHECK-LABEL: AGEP4
 ; Multiply offset by two (add it to itself).
-;CHECK: vpadd
 ; add the base to the offset
-;CHECK-NEXT: vpadd
+; CHECK-LABEL: AGEP4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retl
   %A = getelementptr i16, <4 x i16*> %param, <4 x i32> %off
   ret <4 x i16*> %A
-;CHECK: ret
 }
 
-;CHECK-LABEL: AGEP5:
 define <4 x i8*> @AGEP5(<4 x i8*> %param, <4 x i8> %off) nounwind {
-entry:
-;CHECK-LABEL: AGEP5
-;CHECK: vpaddd
+; CHECK-LABEL: AGEP5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $24, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrad $24, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retl
   %A = getelementptr i8, <4 x i8*> %param, <4 x i8> %off
   ret <4 x i8*> %A
-;CHECK: ret
 }
 
 
 ; The size of each element is 1 byte. No need to multiply by element size.
-;CHECK-LABEL: AGEP6:
 define <4 x i8*> @AGEP6(<4 x i8*> %param, <4 x i32> %off) nounwind {
-entry:
-;CHECK-LABEL: AGEP6
-;CHECK-NOT: pslld
+; CHECK-LABEL: AGEP6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retl
   %A = getelementptr i8, <4 x i8*> %param, <4 x i32> %off
   ret <4 x i8*> %A
-;CHECK: ret
 }
 
-;CHECK-LABEL: AGEP7:
 define <4 x i8*> @AGEP7(<4 x i8*> %param, i32 %off) nounwind {
-entry:
-;CHECK: vbroadcastss
-;CHECK: vpadd
+; CHECK-LABEL: AGEP7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retl
   %A = getelementptr i8, <4 x i8*> %param, i32 %off
   ret <4 x i8*> %A
-;CHECK: ret
 }
 
-;CHECK-LABEL: AGEP8:
 define <4 x i16*> @AGEP8(i16* %param, <4 x i32> %off) nounwind {
-entry:
 ; Multiply offset by two (add it to itself).
-;CHECK: vpadd
 ; add the base to the offset
-;CHECK: vbroadcastss
-;CHECK-NEXT: vpadd
+; CHECK-LABEL: AGEP8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retl
   %A = getelementptr i16, i16* %param, <4 x i32> %off
   ret <4 x i16*> %A
-;CHECK: ret
 }
 
-;CHECK-LABEL: AGEP9:
 define <64 x i16*> @AGEP9(i16* %param, <64 x i32> %off) nounwind {
-entry:
-;CHECK: vbroadcastss
+; CHECK-LABEL: AGEP9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    andl $-32, %esp
+; CHECK-NEXT:    subl $96, %esp
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpaddd %xmm3, %xmm3, %xmm4
+; CHECK-NEXT:    vbroadcastss 12(%ebp), %xmm3
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; CHECK-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm0
+; CHECK-NEXT:    vmovaps %ymm0, (%esp) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vpaddd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; CHECK-NEXT:    vmovdqa 40(%ebp), %xmm4
+; CHECK-NEXT:    vmovdqa 56(%ebp), %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm5, %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm3, %xmm5
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; CHECK-NEXT:    vmovdqa 72(%ebp), %xmm5
+; CHECK-NEXT:    vmovdqa 88(%ebp), %xmm6
+; CHECK-NEXT:    vpaddd %xmm6, %xmm6, %xmm6
+; CHECK-NEXT:    vpaddd %xmm6, %xmm3, %xmm6
+; CHECK-NEXT:    vpaddd %xmm5, %xmm5, %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm3, %xmm5
+; CHECK-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; CHECK-NEXT:    vmovdqa 104(%ebp), %xmm6
+; CHECK-NEXT:    vmovdqa 120(%ebp), %xmm7
+; CHECK-NEXT:    vpaddd %xmm7, %xmm7, %xmm7
+; CHECK-NEXT:    vpaddd %xmm7, %xmm3, %xmm7
+; CHECK-NEXT:    vpaddd %xmm6, %xmm6, %xmm6
+; CHECK-NEXT:    vpaddd %xmm6, %xmm3, %xmm6
+; CHECK-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
+; CHECK-NEXT:    vmovdqa 152(%ebp), %xmm7
+; CHECK-NEXT:    vpaddd %xmm7, %xmm7, %xmm7
+; CHECK-NEXT:    vpaddd %xmm7, %xmm3, %xmm7
+; CHECK-NEXT:    vmovdqa 136(%ebp), %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; CHECK-NEXT:    vmovdqa 184(%ebp), %xmm7
+; CHECK-NEXT:    vpaddd %xmm7, %xmm7, %xmm7
+; CHECK-NEXT:    vpaddd %xmm7, %xmm3, %xmm7
+; CHECK-NEXT:    vmovdqa 168(%ebp), %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
+; CHECK-NEXT:    movl 8(%ebp), %eax
+; CHECK-NEXT:    vmovaps %ymm1, 224(%eax)
+; CHECK-NEXT:    vmovaps %ymm0, 192(%eax)
+; CHECK-NEXT:    vmovaps %ymm6, 160(%eax)
+; CHECK-NEXT:    vmovaps %ymm5, 128(%eax)
+; CHECK-NEXT:    vmovaps %ymm4, 96(%eax)
+; CHECK-NEXT:    vmovaps %ymm2, 64(%eax)
+; CHECK-NEXT:    vmovaps (%esp), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovaps %ymm0, 32(%eax)
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovaps %ymm0, (%eax)
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl $4
   %A = getelementptr i16, i16* %param, <64 x i32> %off
   ret <64 x i16*> %A
-;CHECK: ret
 }
+
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index e4874cce7e60eecafeb0604c3222d69eb4183798..53274d60b9e97af355a26fa59247701eb0ac76a6 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -22,224 +22,63 @@ define float @cvt_i16_to_f32(i16 %a0) nounwind {
 }
 
 define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_4i16_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    movq %rax, %rdx
-; AVX1-NEXT:    movswl %ax, %esi
-; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX1-NEXT:    shrl $16, %eax
-; AVX1-NEXT:    shrq $32, %rcx
-; AVX1-NEXT:    shrq $48, %rdx
-; AVX1-NEXT:    movswl %dx, %edx
-; AVX1-NEXT:    vmovd %edx, %xmm0
-; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT:    movswl %cx, %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm1
-; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT:    cwtl
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT:    vmovd %esi, %xmm3
-; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: cvt_4i16_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    movq %rax, %rdx
-; AVX2-NEXT:    movswl %ax, %esi
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT:    shrl $16, %eax
-; AVX2-NEXT:    shrq $32, %rcx
-; AVX2-NEXT:    shrq $48, %rdx
-; AVX2-NEXT:    movswl %dx, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm0
-; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT:    movswl %cx, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm1
-; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT:    cwtl
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %esi, %xmm3
-; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: cvt_4i16_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    movq %rax, %rdx
-; AVX512F-NEXT:    movswl %ax, %esi
-; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512F-NEXT:    shrl $16, %eax
-; AVX512F-NEXT:    shrq $32, %rcx
-; AVX512F-NEXT:    shrq $48, %rdx
-; AVX512F-NEXT:    movswl %dx, %edx
-; AVX512F-NEXT:    vmovd %edx, %xmm0
-; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT:    movswl %cx, %ecx
-; AVX512F-NEXT:    vmovd %ecx, %xmm1
-; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT:    cwtl
-; AVX512F-NEXT:    vmovd %eax, %xmm2
-; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT:    vmovd %esi, %xmm3
-; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: cvt_4i16_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    movq %rax, %rdx
-; AVX512VL-NEXT:    movswl %ax, %esi
-; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512VL-NEXT:    shrl $16, %eax
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    shrq $48, %rdx
-; AVX512VL-NEXT:    movswl %dx, %edx
-; AVX512VL-NEXT:    vmovd %edx, %xmm0
-; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    movswl %cx, %ecx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm1
-; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT:    cwtl
-; AVX512VL-NEXT:    vmovd %eax, %xmm2
-; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovd %esi, %xmm3
-; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: cvt_4i16_to_4f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT:    vmovq %xmm0, %rax
+; ALL-NEXT:    movq %rax, %rcx
+; ALL-NEXT:    movq %rax, %rdx
+; ALL-NEXT:    movswl %ax, %esi
+; ALL-NEXT:    # kill: def $eax killed $eax killed $rax
+; ALL-NEXT:    shrl $16, %eax
+; ALL-NEXT:    shrq $32, %rcx
+; ALL-NEXT:    shrq $48, %rdx
+; ALL-NEXT:    movswl %dx, %edx
+; ALL-NEXT:    vmovd %edx, %xmm0
+; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT:    movswl %cx, %ecx
+; ALL-NEXT:    vmovd %ecx, %xmm1
+; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT:    cwtl
+; ALL-NEXT:    vmovd %eax, %xmm2
+; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT:    vmovd %esi, %xmm3
+; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT:    retq
   %1 = bitcast <4 x i16> %a0 to <4 x half>
   %2 = fpext <4 x half> %1 to <4 x float>
   ret <4 x float> %2
 }
 
 define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_8i16_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    movq %rax, %rdx
-; AVX1-NEXT:    movswl %ax, %esi
-; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX1-NEXT:    shrl $16, %eax
-; AVX1-NEXT:    shrq $32, %rcx
-; AVX1-NEXT:    shrq $48, %rdx
-; AVX1-NEXT:    movswl %dx, %edx
-; AVX1-NEXT:    vmovd %edx, %xmm0
-; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT:    movswl %cx, %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm1
-; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT:    cwtl
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT:    vmovd %esi, %xmm3
-; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: cvt_8i16_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    movq %rax, %rdx
-; AVX2-NEXT:    movswl %ax, %esi
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT:    shrl $16, %eax
-; AVX2-NEXT:    shrq $32, %rcx
-; AVX2-NEXT:    shrq $48, %rdx
-; AVX2-NEXT:    movswl %dx, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm0
-; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT:    movswl %cx, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm1
-; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT:    cwtl
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %esi, %xmm3
-; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: cvt_8i16_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    movq %rax, %rdx
-; AVX512F-NEXT:    movswl %ax, %esi
-; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512F-NEXT:    shrl $16, %eax
-; AVX512F-NEXT:    shrq $32, %rcx
-; AVX512F-NEXT:    shrq $48, %rdx
-; AVX512F-NEXT:    movswl %dx, %edx
-; AVX512F-NEXT:    vmovd %edx, %xmm0
-; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT:    movswl %cx, %ecx
-; AVX512F-NEXT:    vmovd %ecx, %xmm1
-; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT:    cwtl
-; AVX512F-NEXT:    vmovd %eax, %xmm2
-; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT:    vmovd %esi, %xmm3
-; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    movq %rax, %rdx
-; AVX512VL-NEXT:    movswl %ax, %esi
-; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512VL-NEXT:    shrl $16, %eax
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    shrq $48, %rdx
-; AVX512VL-NEXT:    movswl %dx, %edx
-; AVX512VL-NEXT:    vmovd %edx, %xmm0
-; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    movswl %cx, %ecx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm1
-; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT:    cwtl
-; AVX512VL-NEXT:    vmovd %eax, %xmm2
-; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovd %esi, %xmm3
-; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: cvt_8i16_to_4f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovq %xmm0, %rax
+; ALL-NEXT:    movq %rax, %rcx
+; ALL-NEXT:    movq %rax, %rdx
+; ALL-NEXT:    movswl %ax, %esi
+; ALL-NEXT:    # kill: def $eax killed $eax killed $rax
+; ALL-NEXT:    shrl $16, %eax
+; ALL-NEXT:    shrq $32, %rcx
+; ALL-NEXT:    shrq $48, %rdx
+; ALL-NEXT:    movswl %dx, %edx
+; ALL-NEXT:    vmovd %edx, %xmm0
+; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT:    movswl %cx, %ecx
+; ALL-NEXT:    vmovd %ecx, %xmm1
+; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT:    cwtl
+; ALL-NEXT:    vmovd %eax, %xmm2
+; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT:    vmovd %esi, %xmm3
+; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT:    retq
   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %2 = bitcast <4 x i16> %1 to <4 x half>
   %3 = fpext <4 x half> %2 to <4 x float>
@@ -730,111 +569,31 @@ define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
 }
 
 define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_8i16_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    movq (%rdi), %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    movq %rax, %rdx
-; AVX1-NEXT:    movswl %ax, %esi
-; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX1-NEXT:    shrl $16, %eax
-; AVX1-NEXT:    shrq $32, %rcx
-; AVX1-NEXT:    shrq $48, %rdx
-; AVX1-NEXT:    movswl %dx, %edx
-; AVX1-NEXT:    vmovd %edx, %xmm0
-; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT:    movswl %cx, %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm1
-; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT:    cwtl
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT:    vmovd %esi, %xmm3
-; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_cvt_8i16_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movq (%rdi), %rax
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    movq %rax, %rdx
-; AVX2-NEXT:    movswl %ax, %esi
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT:    shrl $16, %eax
-; AVX2-NEXT:    shrq $32, %rcx
-; AVX2-NEXT:    shrq $48, %rdx
-; AVX2-NEXT:    movswl %dx, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm0
-; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT:    movswl %cx, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm1
-; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT:    cwtl
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %esi, %xmm3
-; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: load_cvt_8i16_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq (%rdi), %rax
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    movq %rax, %rdx
-; AVX512F-NEXT:    movswl %ax, %esi
-; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512F-NEXT:    shrl $16, %eax
-; AVX512F-NEXT:    shrq $32, %rcx
-; AVX512F-NEXT:    shrq $48, %rdx
-; AVX512F-NEXT:    movswl %dx, %edx
-; AVX512F-NEXT:    vmovd %edx, %xmm0
-; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT:    movswl %cx, %ecx
-; AVX512F-NEXT:    vmovd %ecx, %xmm1
-; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT:    cwtl
-; AVX512F-NEXT:    vmovd %eax, %xmm2
-; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT:    vmovd %esi, %xmm3
-; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    movq %rax, %rdx
-; AVX512VL-NEXT:    movswl %ax, %esi
-; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512VL-NEXT:    shrl $16, %eax
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    shrq $48, %rdx
-; AVX512VL-NEXT:    movswl %dx, %edx
-; AVX512VL-NEXT:    vmovd %edx, %xmm0
-; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    movswl %cx, %ecx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm1
-; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT:    cwtl
-; AVX512VL-NEXT:    vmovd %eax, %xmm2
-; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovd %esi, %xmm3
-; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: load_cvt_8i16_to_4f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq (%rdi), %rax
+; ALL-NEXT:    movq %rax, %rcx
+; ALL-NEXT:    movq %rax, %rdx
+; ALL-NEXT:    movswl %ax, %esi
+; ALL-NEXT:    # kill: def $eax killed $eax killed $rax
+; ALL-NEXT:    shrl $16, %eax
+; ALL-NEXT:    shrq $32, %rcx
+; ALL-NEXT:    shrq $48, %rdx
+; ALL-NEXT:    movswl %dx, %edx
+; ALL-NEXT:    vmovd %edx, %xmm0
+; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT:    movswl %cx, %ecx
+; ALL-NEXT:    vmovd %ecx, %xmm1
+; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT:    cwtl
+; ALL-NEXT:    vmovd %eax, %xmm2
+; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT:    vmovd %esi, %xmm3
+; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = bitcast <4 x i16> %2 to <4 x half>
@@ -1261,125 +1020,35 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
 }
 
 define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_4i16_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    movl %eax, %edx
-; AVX1-NEXT:    movswl %ax, %esi
-; AVX1-NEXT:    shrq $48, %rax
-; AVX1-NEXT:    shrq $32, %rcx
-; AVX1-NEXT:    shrl $16, %edx
-; AVX1-NEXT:    movswl %dx, %edx
-; AVX1-NEXT:    vmovd %edx, %xmm0
-; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %esi, %xmm1
-; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT:    movswl %cx, %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT:    cwtl
-; AVX1-NEXT:    vmovd %eax, %xmm3
-; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: cvt_4i16_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    movl %eax, %edx
-; AVX2-NEXT:    movswl %ax, %esi
-; AVX2-NEXT:    shrq $48, %rax
-; AVX2-NEXT:    shrq $32, %rcx
-; AVX2-NEXT:    shrl $16, %edx
-; AVX2-NEXT:    movswl %dx, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm0
-; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %esi, %xmm1
-; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT:    movswl %cx, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT:    cwtl
-; AVX2-NEXT:    vmovd %eax, %xmm3
-; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: cvt_4i16_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    movl %eax, %edx
-; AVX512F-NEXT:    movswl %ax, %esi
-; AVX512F-NEXT:    shrq $48, %rax
-; AVX512F-NEXT:    shrq $32, %rcx
-; AVX512F-NEXT:    shrl $16, %edx
-; AVX512F-NEXT:    movswl %dx, %edx
-; AVX512F-NEXT:    vmovd %edx, %xmm0
-; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT:    vmovd %esi, %xmm1
-; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT:    movswl %cx, %ecx
-; AVX512F-NEXT:    vmovd %ecx, %xmm2
-; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT:    cwtl
-; AVX512F-NEXT:    vmovd %eax, %xmm3
-; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: cvt_4i16_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    movl %eax, %edx
-; AVX512VL-NEXT:    movswl %ax, %esi
-; AVX512VL-NEXT:    shrq $48, %rax
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    shrl $16, %edx
-; AVX512VL-NEXT:    movswl %dx, %edx
-; AVX512VL-NEXT:    vmovd %edx, %xmm0
-; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovd %esi, %xmm1
-; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT:    movswl %cx, %ecx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm2
-; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT:    cwtl
-; AVX512VL-NEXT:    vmovd %eax, %xmm3
-; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: cvt_4i16_to_4f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT:    vmovq %xmm0, %rax
+; ALL-NEXT:    movq %rax, %rcx
+; ALL-NEXT:    movl %eax, %edx
+; ALL-NEXT:    movswl %ax, %esi
+; ALL-NEXT:    shrq $48, %rax
+; ALL-NEXT:    shrq $32, %rcx
+; ALL-NEXT:    shrl $16, %edx
+; ALL-NEXT:    movswl %dx, %edx
+; ALL-NEXT:    vmovd %edx, %xmm0
+; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT:    vmovd %esi, %xmm1
+; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT:    movswl %cx, %ecx
+; ALL-NEXT:    vmovd %ecx, %xmm2
+; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT:    cwtl
+; ALL-NEXT:    vmovd %eax, %xmm3
+; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT:    retq
   %1 = bitcast <4 x i16> %a0 to <4 x half>
   %2 = fpext <4 x half> %1 to <4 x double>
   ret <4 x double> %2
@@ -1454,123 +1123,34 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
 }
 
 define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_8i16_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    movl %eax, %edx
-; AVX1-NEXT:    movswl %ax, %esi
-; AVX1-NEXT:    shrq $48, %rax
-; AVX1-NEXT:    shrq $32, %rcx
-; AVX1-NEXT:    shrl $16, %edx
-; AVX1-NEXT:    movswl %dx, %edx
-; AVX1-NEXT:    vmovd %edx, %xmm0
-; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %esi, %xmm1
-; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT:    movswl %cx, %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT:    cwtl
-; AVX1-NEXT:    vmovd %eax, %xmm3
-; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: cvt_8i16_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    movl %eax, %edx
-; AVX2-NEXT:    movswl %ax, %esi
-; AVX2-NEXT:    shrq $48, %rax
-; AVX2-NEXT:    shrq $32, %rcx
-; AVX2-NEXT:    shrl $16, %edx
-; AVX2-NEXT:    movswl %dx, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm0
-; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %esi, %xmm1
-; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT:    movswl %cx, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT:    cwtl
-; AVX2-NEXT:    vmovd %eax, %xmm3
-; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: cvt_8i16_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    movl %eax, %edx
-; AVX512F-NEXT:    movswl %ax, %esi
-; AVX512F-NEXT:    shrq $48, %rax
-; AVX512F-NEXT:    shrq $32, %rcx
-; AVX512F-NEXT:    shrl $16, %edx
-; AVX512F-NEXT:    movswl %dx, %edx
-; AVX512F-NEXT:    vmovd %edx, %xmm0
-; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT:    vmovd %esi, %xmm1
-; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT:    movswl %cx, %ecx
-; AVX512F-NEXT:    vmovd %ecx, %xmm2
-; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT:    cwtl
-; AVX512F-NEXT:    vmovd %eax, %xmm3
-; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    movl %eax, %edx
-; AVX512VL-NEXT:    movswl %ax, %esi
-; AVX512VL-NEXT:    shrq $48, %rax
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    shrl $16, %edx
-; AVX512VL-NEXT:    movswl %dx, %edx
-; AVX512VL-NEXT:    vmovd %edx, %xmm0
-; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovd %esi, %xmm1
-; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT:    movswl %cx, %ecx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm2
-; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT:    cwtl
-; AVX512VL-NEXT:    vmovd %eax, %xmm3
-; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: cvt_8i16_to_4f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovq %xmm0, %rax
+; ALL-NEXT:    movq %rax, %rcx
+; ALL-NEXT:    movl %eax, %edx
+; ALL-NEXT:    movswl %ax, %esi
+; ALL-NEXT:    shrq $48, %rax
+; ALL-NEXT:    shrq $32, %rcx
+; ALL-NEXT:    shrl $16, %edx
+; ALL-NEXT:    movswl %dx, %edx
+; ALL-NEXT:    vmovd %edx, %xmm0
+; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT:    vmovd %esi, %xmm1
+; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT:    movswl %cx, %ecx
+; ALL-NEXT:    vmovd %ecx, %xmm2
+; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT:    cwtl
+; ALL-NEXT:    vmovd %eax, %xmm3
+; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT:    retq
   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %2 = bitcast <4 x i16> %1 to <4 x half>
   %3 = fpext <4 x half> %2 to <4 x double>
@@ -1812,123 +1392,34 @@ define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
 }
 
 define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_8i16_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    movq (%rdi), %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    movl %eax, %edx
-; AVX1-NEXT:    movswl %ax, %esi
-; AVX1-NEXT:    shrq $48, %rax
-; AVX1-NEXT:    shrq $32, %rcx
-; AVX1-NEXT:    shrl $16, %edx
-; AVX1-NEXT:    movswl %dx, %edx
-; AVX1-NEXT:    vmovd %edx, %xmm0
-; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %esi, %xmm1
-; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT:    movswl %cx, %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT:    cwtl
-; AVX1-NEXT:    vmovd %eax, %xmm3
-; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_cvt_8i16_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movq (%rdi), %rax
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    movl %eax, %edx
-; AVX2-NEXT:    movswl %ax, %esi
-; AVX2-NEXT:    shrq $48, %rax
-; AVX2-NEXT:    shrq $32, %rcx
-; AVX2-NEXT:    shrl $16, %edx
-; AVX2-NEXT:    movswl %dx, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm0
-; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %esi, %xmm1
-; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT:    movswl %cx, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT:    cwtl
-; AVX2-NEXT:    vmovd %eax, %xmm3
-; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: load_cvt_8i16_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq (%rdi), %rax
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    movl %eax, %edx
-; AVX512F-NEXT:    movswl %ax, %esi
-; AVX512F-NEXT:    shrq $48, %rax
-; AVX512F-NEXT:    shrq $32, %rcx
-; AVX512F-NEXT:    shrl $16, %edx
-; AVX512F-NEXT:    movswl %dx, %edx
-; AVX512F-NEXT:    vmovd %edx, %xmm0
-; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT:    vmovd %esi, %xmm1
-; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT:    movswl %cx, %ecx
-; AVX512F-NEXT:    vmovd %ecx, %xmm2
-; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT:    cwtl
-; AVX512F-NEXT:    vmovd %eax, %xmm3
-; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    movl %eax, %edx
-; AVX512VL-NEXT:    movswl %ax, %esi
-; AVX512VL-NEXT:    shrq $48, %rax
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    shrl $16, %edx
-; AVX512VL-NEXT:    movswl %dx, %edx
-; AVX512VL-NEXT:    vmovd %edx, %xmm0
-; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovd %esi, %xmm1
-; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT:    movswl %cx, %ecx
-; AVX512VL-NEXT:    vmovd %ecx, %xmm2
-; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT:    cwtl
-; AVX512VL-NEXT:    vmovd %eax, %xmm3
-; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: load_cvt_8i16_to_4f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq (%rdi), %rax
+; ALL-NEXT:    movq %rax, %rcx
+; ALL-NEXT:    movl %eax, %edx
+; ALL-NEXT:    movswl %ax, %esi
+; ALL-NEXT:    shrq $48, %rax
+; ALL-NEXT:    shrq $32, %rcx
+; ALL-NEXT:    shrl $16, %edx
+; ALL-NEXT:    movswl %dx, %edx
+; ALL-NEXT:    vmovd %edx, %xmm0
+; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT:    vmovd %esi, %xmm1
+; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT:    movswl %cx, %ecx
+; ALL-NEXT:    vmovd %ecx, %xmm2
+; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT:    cwtl
+; ALL-NEXT:    vmovd %eax, %xmm3
+; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = bitcast <4 x i16> %2 to <4 x half>
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index f428a55bf513895c7578ffdb64bb319f9ee01588..2c41ee31a101d7f54e71f8bea7ab171443ac60aa 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -86,8 +86,8 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
 ; SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    paddd %xmm0, %xmm3
 ; SSE2-NEXT:    psubd %xmm3, %xmm1
@@ -170,13 +170,11 @@ define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
 define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: test_div7_16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm2
 ; SSE2-NEXT:    psrlw $8, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    psraw $8, %xmm1
 ; SSE2-NEXT:    pmullw %xmm3, %xmm1
@@ -281,6 +279,165 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
   ret <16 x i8> %res
 }
 
+;
+; sdiv by non-splat constant
+;
+
+define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_divconstant_16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm3, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $7, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_divconstant_16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm4, %xmm1
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    packuswb %xmm1, %xmm2
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmovsxbw %xmm3, %xmm1
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm1, %xmm0
+; SSE41-NEXT:    paddb %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE41-NEXT:    psraw $8, %xmm2
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    packuswb %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $7, %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    paddb %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_divconstant_16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2NOBW-LABEL: test_divconstant_16i8:
+; AVX2NOBW:       # %bb.0:
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT:    vzeroupper
+; AVX2NOBW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_divconstant_16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
+; AVX512BW-NEXT:    vpsravw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
+  ret <16 x i8> %res
+}
+
 ;
 ; srem by 7
 ;
@@ -386,8 +543,8 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE2-NEXT:    pand %xmm1, %xmm3
 ; SSE2-NEXT:    paddd %xmm0, %xmm3
 ; SSE2-NEXT:    psubd %xmm3, %xmm2
@@ -484,13 +641,11 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
 define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: test_rem7_16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm2
 ; SSE2-NEXT:    psrlw $8, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    psraw $8, %xmm1
 ; SSE2-NEXT:    pmullw %xmm3, %xmm1
@@ -510,8 +665,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; SSE2-NEXT:    psllw $3, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    psubb %xmm2, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test_rem7_16i8:
@@ -615,3 +769,538 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
   %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
   ret <16 x i8> %res
 }
+
+;
+; srem by non-splat constant
+;
+
+define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_remconstant_16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm4
+; SSE2-NEXT:    psrlw $8, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    packuswb %xmm4, %xmm2
+; SSE2-NEXT:    paddb %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm4
+; SSE2-NEXT:    psrlw $8, %xmm4
+; SSE2-NEXT:    packuswb %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $7, %xmm2
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    paddb %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    psubb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_remconstant_16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    packuswb %xmm2, %xmm3
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxbw %xmm4, %xmm4
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    psrlw $8, %xmm4
+; SSE41-NEXT:    packuswb %xmm4, %xmm2
+; SSE41-NEXT:    paddb %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT:    psraw $8, %xmm3
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE41-NEXT:    psraw $8, %xmm4
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    psrlw $8, %xmm4
+; SSE41-NEXT:    packuswb %xmm3, %xmm4
+; SSE41-NEXT:    psrlw $7, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    paddb %xmm4, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    packuswb %xmm2, %xmm3
+; SSE41-NEXT:    psubb %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_remconstant_16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpsraw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2NOBW-LABEL: test_remconstant_16i8:
+; AVX2NOBW:       # %bb.0:
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2NOBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2NOBW-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm3
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2NOBW-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm3
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2NOBW-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vzeroupper
+; AVX2NOBW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_remconstant_16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT:    vpsrlw $7, %xmm1, %xmm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
+; AVX512BW-NEXT:    vpsravw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %res = srem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
+  ret <16 x i8> %res
+}
+
+; This test is just to show what an scalarized v16i8 division looks like.
+define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; SSE2-LABEL: test_rem_variable_16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    movl %ecx, %eax
+; SSE2-NEXT:    cbtw
+; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsbl %ah, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_rem_variable_16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pextrb $1, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pextrb $1, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pextrb $0, %xmm1, %edx
+; SSE41-NEXT:    idivb %dl
+; SSE41-NEXT:    movsbl %ah, %eax
+; SSE41-NEXT:    movd %eax, %xmm2
+; SSE41-NEXT:    pextrb $2, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $1, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $2, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $3, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $2, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $3, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $4, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $3, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $4, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $5, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $4, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $5, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $6, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $5, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $6, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $7, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $6, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $7, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $8, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $7, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $8, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $9, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $8, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $9, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $10, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $9, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $10, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $11, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $10, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $11, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $12, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $11, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $12, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $13, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $12, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $13, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $14, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $13, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $14, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %ecx
+; SSE41-NEXT:    pextrb $15, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    cbtw
+; SSE41-NEXT:    pinsrb $14, %ecx, %xmm2
+; SSE41-NEXT:    pextrb $15, %xmm1, %ecx
+; SSE41-NEXT:    idivb %cl
+; SSE41-NEXT:    movsbl %ah, %eax
+; SSE41-NEXT:    pinsrb $15, %eax, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_rem_variable_16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrb $1, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpextrb $1, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpextrb $0, %xmm1, %edx
+; AVX-NEXT:    idivb %dl
+; AVX-NEXT:    movsbl %ah, %eax
+; AVX-NEXT:    vmovd %eax, %xmm2
+; AVX-NEXT:    vpextrb $2, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $2, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $3, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $3, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $4, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $4, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $5, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $5, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $6, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $6, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $7, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $7, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $8, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $9, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $9, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $10, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $10, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $11, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $11, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $12, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $12, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $13, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $13, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $14, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $14, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %ecx
+; AVX-NEXT:    vpextrb $15, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    cbtw
+; AVX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm0
+; AVX-NEXT:    vpextrb $15, %xmm1, %ecx
+; AVX-NEXT:    idivb %cl
+; AVX-NEXT:    movsbl %ah, %eax
+; AVX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %res = srem <16 x i8> %a, %b
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index eb59603a35ac265d14b545dda8af4e22acbe0437..d612d73448754d430e6d49b87d9be88cb5a7fe14 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -241,6 +241,130 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
   ret <32 x i8> %res
 }
 
+;
+; sdiv by non-splat constant
+;
+
+define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_divconstant_32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpsraw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $7, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2NOBW-LABEL: test_divconstant_32i8:
+; AVX2NOBW:       # %bb.0:
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2NOBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpand %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2NOBW-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2NOBW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2NOBW-NEXT:    vpsraw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2NOBW-NEXT:    vpsraw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $7, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX2NOBW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_divconstant_32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpsrlw $7, %ymm0, %ymm1
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %res = sdiv <32 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
+  ret <32 x i8> %res
+}
+
 ;
 ; srem by 7
 ;
@@ -544,3 +668,155 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
   %res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
   ret <32 x i8> %res
 }
+
+;
+; srem by non-splat constant
+;
+
+define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_remconstant_32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm5
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpsraw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsrlw $7, %xmm3, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm3, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm1, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm1, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm5
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm6, %xmm6
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpackuswb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpsraw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrlw $7, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2NOBW-LABEL: test_remconstant_32i8:
+; AVX2NOBW:       # %bb.0:
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm2
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2NOBW-NEXT:    vpand %ymm1, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpand %ymm1, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2NOBW-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm4
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
+; AVX2NOBW-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2NOBW-NEXT:    vpsraw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2NOBW-NEXT:    vpsraw $8, %ymm4, %ymm4
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $7, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpand %ymm1, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_remconstant_32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm2
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT:    vpsrlw $7, %ymm1, %ymm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %res = srem <32 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
+  ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index f21dc96aa0319e96d5e3be3e5dc001df06dcc12e..86c706c03a70c08ab6bb6a4a71e49f37e2e4700a 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -194,6 +194,113 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
   ret <64 x i8> %res
 }
 
+;
+; sdiv by non-splat constant
+;
+
+define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_divconstant_64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT:    vpmovsxbw %xmm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT:    vpsraw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT:    vpsraw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT:    vpsraw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT:    vpsraw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_divconstant_64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm2
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsraw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsraw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+  %res = sdiv <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
+  ret <64 x i8> %res
+}
+
 ;
 ; srem by 7
 ;
@@ -439,3 +546,134 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
   %res = srem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
   ret <64 x i8> %res
 }
+
+;
+; srem by non-splat constant
+;
+
+define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_remconstant_64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT:    vpmovsxbw %xmm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT:    vpsraw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT:    vpsraw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpsrlw $7, %ymm3, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpand %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsubb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX512F-NEXT:    vpmovsxbw %xmm5, %ymm5
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm6
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm6, %ymm6
+; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpackuswb %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3]
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT:    vpsraw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT:    vpsraw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm6, %ymm6
+; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpackuswb %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT:    vpsrlw $7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_remconstant_64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm3
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpmovwb %zmm3, %ymm3
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512BW-NEXT:    vpmovsxbw %ymm4, %zmm4
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT:    vpsrlw $8, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovwb %zmm4, %ymm4
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsraw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm4 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsraw $8, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT:    vpsrlw $8, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+  %res = srem <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
+  ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
index ee13b3c06f86baeaa79c628f9862f17a1fb910ac..ac6fe217e38e2a48579824617e69f331b2e8773c 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -172,7 +172,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm2
 ; SSE2-NEXT:    psrlw $8, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
@@ -190,34 +190,35 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ;
 ; SSE41-LABEL: test_div7_16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
-; SSE41-NEXT:    pmullw %xmm2, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    pmullw %xmm2, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
+; SSE41-NEXT:    pmullw %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmullw %xmm1, %xmm3
 ; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    packuswb %xmm3, %xmm1
-; SSE41-NEXT:    psubb %xmm1, %xmm0
+; SSE41-NEXT:    packuswb %xmm2, %xmm3
+; SSE41-NEXT:    psubb %xmm3, %xmm0
 ; SSE41-NEXT:    psrlw $1, %xmm0
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    paddb %xmm3, %xmm0
 ; SSE41-NEXT:    psrlw $2, %xmm0
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test_div7_16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -260,6 +261,180 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
   ret <16 x i8> %res
 }
 
+;
+; sdiv by non-splat constant
+;
+
+define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_divconstant_16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    psubb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_divconstant_16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE41-NEXT:    psllw $7, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [147,79,171,117,205,57,57,37]
+; SSE41-NEXT:    pmullw %xmm3, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    psllw $7, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,6],xmm5[7]
+; SSE41-NEXT:    psrlw $8, %xmm5
+; SSE41-NEXT:    packuswb %xmm3, %xmm5
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    packuswb %xmm2, %xmm3
+; SSE41-NEXT:    psubb %xmm3, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    paddb %xmm3, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_divconstant_16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpsllw $7, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2NOBW-LABEL: test_divconstant_16i8:
+; AVX2NOBW:       # %bb.0:
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vzeroupper
+; AVX2NOBW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_divconstant_16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2]
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %res = udiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
+  ret <16 x i8> %res
+}
+
 ;
 ; urem by 7
 ;
@@ -470,7 +645,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm2
 ; SSE2-NEXT:    psrlw $8, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
@@ -495,20 +670,21 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ;
 ; SSE41-LABEL: test_rem7_16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
 ; SSE41-NEXT:    pmullw %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    pmullw %xmm1, %xmm3
 ; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psubb %xmm2, %xmm1
+; SSE41-NEXT:    psubb %xmm3, %xmm1
 ; SSE41-NEXT:    psrlw $1, %xmm1
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
 ; SSE41-NEXT:    psrlw $2, %xmm1
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
@@ -521,15 +697,15 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ;
 ; AVX1-LABEL: test_rem7_16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
@@ -583,3 +759,208 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
   %res = urem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
   ret <16 x i8> %res
 }
+
+;
+; srem by non-splat constant
+;
+
+define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_remconstant_16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psubb %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm4
+; SSE2-NEXT:    psrlw $8, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    packuswb %xmm4, %xmm2
+; SSE2-NEXT:    paddb %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    psubb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_remconstant_16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE41-NEXT:    psllw $7, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [147,79,171,117,205,57,57,37]
+; SSE41-NEXT:    pmullw %xmm3, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    psllw $7, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,6],xmm5[7]
+; SSE41-NEXT:    psrlw $8, %xmm5
+; SSE41-NEXT:    packuswb %xmm3, %xmm5
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    packuswb %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psubb %xmm3, %xmm4
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm4
+; SSE41-NEXT:    psrlw $8, %xmm4
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    packuswb %xmm4, %xmm2
+; SSE41-NEXT:    paddb %xmm3, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    packuswb %xmm2, %xmm3
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm1
+; SSE41-NEXT:    psubb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_remconstant_16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpsllw $7, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2NOBW-LABEL: test_remconstant_16i8:
+; AVX2NOBW:       # %bb.0:
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2NOBW-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT:    vzeroupper
+; AVX2NOBW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_remconstant_16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2]
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %res = urem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll
index cc90feadb3f074e95cdde4e5f8e5658c49028b01..32b49a7bb3e86d33a82f30c50c67e6ff646a6330 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -170,52 +170,50 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX1-LABEL: test_div7_32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
-; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
-; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmullw %xmm3, %xmm5, %xmm5
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; AVX1-NEXT:    vpmullw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2NOBW-LABEL: test_div7_32i8:
 ; AVX2NOBW:       # %bb.0:
-; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX2NOBW-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -241,6 +239,139 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
   ret <32 x i8> %res
 }
 
+;
+; udiv by non-splat constant
+;
+
+define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_divconstant_32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2NOBW-LABEL: test_divconstant_32i8:
+; AVX2NOBW:       # %bb.0:
+; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_divconstant_32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %res = udiv <32 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
+  ret <32 x i8> %res
+}
+
 ;
 ; urem by 7
 ;
@@ -457,61 +588,59 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX1-LABEL: test_rem7_32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
-; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllw $3, %xmm3, %xmm7
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX1-NEXT:    vpand %xmm2, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm4, %xmm7, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm4
 ; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $3, %xmm2, %xmm6
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; AVX1-NEXT:    vpmullw %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $3, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $3, %xmm3, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2NOBW-LABEL: test_rem7_32i8:
 ; AVX2NOBW:       # %bb.0:
-; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -544,3 +673,159 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
   %res = urem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
   ret <32 x i8> %res
 }
+
+;
+; urem by non-splat constant
+;
+
+define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_remconstant_32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsubb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm7, %xmm7
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubb %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2NOBW-LABEL: test_remconstant_32i8:
+; AVX2NOBW:       # %bb.0:
+; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpsubb %ymm2, %ymm0, %ymm3
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpackuswb %ymm4, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2NOBW-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX2NOBW-NEXT:    retq
+;
+; AVX512BW-LABEL: test_remconstant_32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %res = urem <32 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
+  ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-512.ll b/test/CodeGen/X86/vector-idiv-udiv-512.ll
index 29f24654a3e83d7ad830a466bd88e9f10e11a8f2..18ecac073dfa8b22f8e20e8f4a81864e2c1f40bc 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -138,54 +138,49 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
 define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-LABEL: test_div7_64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT:    vpackuswb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX512F-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512F-NEXT:    vpmullw %ymm3, %ymm5, %ymm5
-; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512F-NEXT:    vpmullw %ymm3, %ymm6, %ymm3
-; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
-; AVX512F-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm6, %ymm6
+; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_div7_64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
-; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
@@ -197,6 +192,112 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
   ret <64 x i8> %res
 }
 
+;
+; udiv by non-splat constant
+;
+
+define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_divconstant_64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_divconstant_64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+  %res = udiv <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
+  ret <64 x i8> %res
+}
+
 ;
 ; urem by 7
 ;
@@ -374,63 +475,58 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
 define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-LABEL: test_rem7_64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT:    vpackuswb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX512F-NEXT:    vpsubb %ymm2, %ymm0, %ymm4
-; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT:    vpaddb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $3, %ymm2, %ymm6
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX512F-NEXT:    vpand %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT:    vpsubb %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512F-NEXT:    vpmullw %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm0, %ymm5
+; AVX512F-NEXT:    vpsrlw $1, %ymm5, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpsrlw $2, %ymm3, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsllw $3, %ymm3, %ymm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512F-NEXT:    vpand %ymm8, %ymm7, %ymm7
+; AVX512F-NEXT:    vpsubb %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT:    vpackuswb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsubb %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsllw $3, %ymm2, %ymm3
-; AVX512F-NEXT:    vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT:    vpand %ymm8, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsubb %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_rem7_64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
-; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $1, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -445,3 +541,126 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
   %res = urem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
   ret <64 x i8> %res
 }
+
+;
+; urem by non-splat constant
+;
+
+define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_remconstant_64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT:    vpackuswb %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsubb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpsubb %ymm4, %ymm1, %ymm5
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15],ymm5[24],ymm2[24],ymm5[25],ymm2[25],ymm5[26],ymm2[26],ymm5[27],ymm2[27],ymm5[28],ymm2[28],ymm5[29],ymm2[29],ymm5[30],ymm2[30],ymm5[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm6, %ymm6
+; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpackuswb %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_remconstant_64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm3[8],zmm2[8],zmm3[9],zmm2[9],zmm3[10],zmm2[10],zmm3[11],zmm2[11],zmm3[12],zmm2[12],zmm3[13],zmm2[13],zmm3[14],zmm2[14],zmm3[15],zmm2[15],zmm3[24],zmm2[24],zmm3[25],zmm2[25],zmm3[26],zmm2[26],zmm3[27],zmm2[27],zmm3[28],zmm2[28],zmm3[29],zmm2[29],zmm3[30],zmm2[30],zmm3[31],zmm2[31],zmm3[40],zmm2[40],zmm3[41],zmm2[41],zmm3[42],zmm2[42],zmm3[43],zmm2[43],zmm3[44],zmm2[44],zmm3[45],zmm2[45],zmm3[46],zmm2[46],zmm3[47],zmm2[47],zmm3[56],zmm2[56],zmm3[57],zmm2[57],zmm3[58],zmm2[58],zmm3[59],zmm2[59],zmm3[60],zmm2[60],zmm3[61],zmm2[61],zmm3[62],zmm2[62],zmm3[63],zmm2[63]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm4, %zmm4
+; AVX512BW-NEXT:    vpsrlw $8, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm3[0],zmm2[0],zmm3[1],zmm2[1],zmm3[2],zmm2[2],zmm3[3],zmm2[3],zmm3[4],zmm2[4],zmm3[5],zmm2[5],zmm3[6],zmm2[6],zmm3[7],zmm2[7],zmm3[16],zmm2[16],zmm3[17],zmm2[17],zmm3[18],zmm2[18],zmm3[19],zmm2[19],zmm3[20],zmm2[20],zmm3[21],zmm2[21],zmm3[22],zmm2[22],zmm3[23],zmm2[23],zmm3[32],zmm2[32],zmm3[33],zmm2[33],zmm3[34],zmm2[34],zmm3[35],zmm2[35],zmm3[36],zmm2[36],zmm3[37],zmm2[37],zmm3[38],zmm2[38],zmm3[39],zmm2[39],zmm3[48],zmm2[48],zmm3[49],zmm2[49],zmm3[50],zmm2[50],zmm3[51],zmm2[51],zmm3[52],zmm2[52],zmm3[53],zmm2[53],zmm3[54],zmm2[54],zmm3[55],zmm2[55]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpackuswb %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[1,u,3,u,5,u,7,u,9,u,11,u,13,u,15,u,17,u,19,u,21,u,23,u,25,u,27,u,29,u,31,u,33,u,35,u,37,u,39,u,41,u,43,u,45,u,47,u,49,u,51,u,53,u,55,u,57,u,59,u,61,u,63,u]
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+  %res = urem <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
+  ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-v2i32.ll b/test/CodeGen/X86/vector-idiv-v2i32.ll
index 00126d675322aa2001e5a9a7a95de1826f2dbcf4..333f5518f3700f1df25194b9b8e297ef6aa4bf32 100644
--- a/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -65,54 +65,40 @@ define void @test_udiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X64_WIDEN-LABEL: test_udiv7_v2i32:
 ; X64_WIDEN:       # %bb.0:
 ; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    movd %xmm0, %eax
-; X64_WIDEN-NEXT:    imulq $613566757, %rax, %rcx # imm = 0x24924925
-; X64_WIDEN-NEXT:    shrq $32, %rcx
-; X64_WIDEN-NEXT:    subl %ecx, %eax
-; X64_WIDEN-NEXT:    shrl %eax
-; X64_WIDEN-NEXT:    addl %ecx, %eax
-; X64_WIDEN-NEXT:    shrl $2, %eax
-; X64_WIDEN-NEXT:    movd %eax, %xmm1
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64_WIDEN-NEXT:    movd %xmm0, %eax
-; X64_WIDEN-NEXT:    imulq $613566757, %rax, %rcx # imm = 0x24924925
-; X64_WIDEN-NEXT:    shrq $32, %rcx
-; X64_WIDEN-NEXT:    subl %ecx, %eax
-; X64_WIDEN-NEXT:    shrl %eax
-; X64_WIDEN-NEXT:    addl %ecx, %eax
-; X64_WIDEN-NEXT:    shrl $2, %eax
-; X64_WIDEN-NEXT:    movd %eax, %xmm0
-; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64_WIDEN-NEXT:    movq %xmm1, (%rsi)
+; X64_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm2
+; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64_WIDEN-NEXT:    psubd %xmm2, %xmm0
+; X64_WIDEN-NEXT:    psrld $1, %xmm0
+; X64_WIDEN-NEXT:    paddd %xmm2, %xmm0
+; X64_WIDEN-NEXT:    psrld $2, %xmm0
+; X64_WIDEN-NEXT:    movq %xmm0, (%rsi)
 ; X64_WIDEN-NEXT:    retq
 ;
 ; X86_WIDEN-LABEL: test_udiv7_v2i32:
 ; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    pushl %ebx
-; X86_WIDEN-NEXT:    pushl %edi
-; X86_WIDEN-NEXT:    pushl %esi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl (%eax), %ecx
-; X86_WIDEN-NEXT:    movl 4(%eax), %esi
-; X86_WIDEN-NEXT:    movl $613566757, %ebx # imm = 0x24924925
-; X86_WIDEN-NEXT:    movl %ecx, %eax
-; X86_WIDEN-NEXT:    mull %ebx
-; X86_WIDEN-NEXT:    subl %edx, %ecx
-; X86_WIDEN-NEXT:    shrl %ecx
-; X86_WIDEN-NEXT:    addl %edx, %ecx
-; X86_WIDEN-NEXT:    shrl $2, %ecx
-; X86_WIDEN-NEXT:    movl %esi, %eax
-; X86_WIDEN-NEXT:    mull %ebx
-; X86_WIDEN-NEXT:    subl %edx, %esi
-; X86_WIDEN-NEXT:    shrl %esi
-; X86_WIDEN-NEXT:    addl %edx, %esi
-; X86_WIDEN-NEXT:    shrl $2, %esi
-; X86_WIDEN-NEXT:    movl %esi, 4(%edi)
-; X86_WIDEN-NEXT:    movl %ecx, (%edi)
-; X86_WIDEN-NEXT:    popl %esi
-; X86_WIDEN-NEXT:    popl %edi
-; X86_WIDEN-NEXT:    popl %ebx
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm2
+; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
+; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm3
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
+; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86_WIDEN-NEXT:    psubd %xmm2, %xmm0
+; X86_WIDEN-NEXT:    psrld $1, %xmm0
+; X86_WIDEN-NEXT:    paddd %xmm2, %xmm0
+; X86_WIDEN-NEXT:    psrld $2, %xmm0
+; X86_WIDEN-NEXT:    movq %xmm0, (%eax)
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = udiv <2 x i32> %a, <i32 7, i32 7>
@@ -199,72 +185,50 @@ define void @test_urem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X64_WIDEN-LABEL: test_urem7_v2i32:
 ; X64_WIDEN:       # %bb.0:
 ; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    movd %xmm0, %eax
-; X64_WIDEN-NEXT:    imulq $613566757, %rax, %rcx # imm = 0x24924925
-; X64_WIDEN-NEXT:    shrq $32, %rcx
-; X64_WIDEN-NEXT:    movl %eax, %edx
-; X64_WIDEN-NEXT:    subl %ecx, %edx
-; X64_WIDEN-NEXT:    shrl %edx
-; X64_WIDEN-NEXT:    addl %ecx, %edx
-; X64_WIDEN-NEXT:    shrl $2, %edx
-; X64_WIDEN-NEXT:    leal (,%rdx,8), %ecx
-; X64_WIDEN-NEXT:    subl %ecx, %edx
-; X64_WIDEN-NEXT:    addl %eax, %edx
-; X64_WIDEN-NEXT:    movd %edx, %xmm1
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64_WIDEN-NEXT:    movd %xmm0, %eax
-; X64_WIDEN-NEXT:    imulq $613566757, %rax, %rcx # imm = 0x24924925
-; X64_WIDEN-NEXT:    shrq $32, %rcx
-; X64_WIDEN-NEXT:    movl %eax, %edx
-; X64_WIDEN-NEXT:    subl %ecx, %edx
-; X64_WIDEN-NEXT:    shrl %edx
-; X64_WIDEN-NEXT:    addl %ecx, %edx
-; X64_WIDEN-NEXT:    shrl $2, %edx
-; X64_WIDEN-NEXT:    leal (,%rdx,8), %ecx
-; X64_WIDEN-NEXT:    subl %ecx, %edx
-; X64_WIDEN-NEXT:    addl %eax, %edx
-; X64_WIDEN-NEXT:    movd %edx, %xmm0
-; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm2
+; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm1
+; X64_WIDEN-NEXT:    psubd %xmm2, %xmm1
+; X64_WIDEN-NEXT:    psrld $1, %xmm1
+; X64_WIDEN-NEXT:    paddd %xmm2, %xmm1
+; X64_WIDEN-NEXT:    psrld $2, %xmm1
+; X64_WIDEN-NEXT:    movdqa %xmm1, %xmm2
+; X64_WIDEN-NEXT:    pslld $3, %xmm2
+; X64_WIDEN-NEXT:    psubd %xmm2, %xmm1
+; X64_WIDEN-NEXT:    paddd %xmm0, %xmm1
 ; X64_WIDEN-NEXT:    movq %xmm1, (%rsi)
 ; X64_WIDEN-NEXT:    retq
 ;
 ; X86_WIDEN-LABEL: test_urem7_v2i32:
 ; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    pushl %ebp
-; X86_WIDEN-NEXT:    pushl %ebx
-; X86_WIDEN-NEXT:    pushl %edi
-; X86_WIDEN-NEXT:    pushl %esi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl (%eax), %esi
-; X86_WIDEN-NEXT:    movl 4(%eax), %ecx
-; X86_WIDEN-NEXT:    movl $613566757, %ebx # imm = 0x24924925
-; X86_WIDEN-NEXT:    movl %esi, %eax
-; X86_WIDEN-NEXT:    mull %ebx
-; X86_WIDEN-NEXT:    movl %esi, %ebp
-; X86_WIDEN-NEXT:    subl %edx, %ebp
-; X86_WIDEN-NEXT:    shrl %ebp
-; X86_WIDEN-NEXT:    addl %edx, %ebp
-; X86_WIDEN-NEXT:    shrl $2, %ebp
-; X86_WIDEN-NEXT:    leal (,%ebp,8), %eax
-; X86_WIDEN-NEXT:    subl %eax, %ebp
-; X86_WIDEN-NEXT:    addl %esi, %ebp
-; X86_WIDEN-NEXT:    movl %ecx, %eax
-; X86_WIDEN-NEXT:    mull %ebx
-; X86_WIDEN-NEXT:    movl %ecx, %eax
-; X86_WIDEN-NEXT:    subl %edx, %eax
-; X86_WIDEN-NEXT:    shrl %eax
-; X86_WIDEN-NEXT:    addl %edx, %eax
-; X86_WIDEN-NEXT:    shrl $2, %eax
-; X86_WIDEN-NEXT:    leal (,%eax,8), %edx
-; X86_WIDEN-NEXT:    subl %edx, %eax
-; X86_WIDEN-NEXT:    addl %ecx, %eax
-; X86_WIDEN-NEXT:    movl %eax, 4(%edi)
-; X86_WIDEN-NEXT:    movl %ebp, (%edi)
-; X86_WIDEN-NEXT:    popl %esi
-; X86_WIDEN-NEXT:    popl %edi
-; X86_WIDEN-NEXT:    popl %ebx
-; X86_WIDEN-NEXT:    popl %ebp
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm2
+; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
+; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm3
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
+; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm1
+; X86_WIDEN-NEXT:    psubd %xmm2, %xmm1
+; X86_WIDEN-NEXT:    psrld $1, %xmm1
+; X86_WIDEN-NEXT:    paddd %xmm2, %xmm1
+; X86_WIDEN-NEXT:    psrld $2, %xmm1
+; X86_WIDEN-NEXT:    movdqa %xmm1, %xmm2
+; X86_WIDEN-NEXT:    pslld $3, %xmm2
+; X86_WIDEN-NEXT:    psubd %xmm2, %xmm1
+; X86_WIDEN-NEXT:    paddd %xmm0, %xmm1
+; X86_WIDEN-NEXT:    movq %xmm1, (%eax)
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = urem <2 x i32> %a, <i32 7, i32 7>
@@ -342,63 +306,52 @@ define void @test_sdiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X64_WIDEN-LABEL: test_sdiv7_v2i32:
 ; X64_WIDEN:       # %bb.0:
 ; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    movd %xmm0, %eax
-; X64_WIDEN-NEXT:    cltq
-; X64_WIDEN-NEXT:    imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; X64_WIDEN-NEXT:    shrq $32, %rcx
-; X64_WIDEN-NEXT:    addl %ecx, %eax
-; X64_WIDEN-NEXT:    movl %eax, %ecx
-; X64_WIDEN-NEXT:    shrl $31, %ecx
-; X64_WIDEN-NEXT:    sarl $2, %eax
-; X64_WIDEN-NEXT:    addl %ecx, %eax
-; X64_WIDEN-NEXT:    movd %eax, %xmm1
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64_WIDEN-NEXT:    movd %xmm0, %eax
-; X64_WIDEN-NEXT:    cltq
-; X64_WIDEN-NEXT:    imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; X64_WIDEN-NEXT:    shrq $32, %rcx
-; X64_WIDEN-NEXT:    addl %ecx, %eax
-; X64_WIDEN-NEXT:    movl %eax, %ecx
-; X64_WIDEN-NEXT:    shrl $31, %ecx
-; X64_WIDEN-NEXT:    sarl $2, %eax
-; X64_WIDEN-NEXT:    addl %ecx, %eax
-; X64_WIDEN-NEXT:    movd %eax, %xmm0
-; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64_WIDEN-NEXT:    movq %xmm1, (%rsi)
+; X64_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm2
+; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64_WIDEN-NEXT:    pxor %xmm3, %xmm3
+; X64_WIDEN-NEXT:    pcmpgtd %xmm0, %xmm3
+; X64_WIDEN-NEXT:    pand %xmm1, %xmm3
+; X64_WIDEN-NEXT:    paddd %xmm0, %xmm3
+; X64_WIDEN-NEXT:    psubd %xmm3, %xmm2
+; X64_WIDEN-NEXT:    paddd %xmm0, %xmm2
+; X64_WIDEN-NEXT:    movdqa %xmm2, %xmm0
+; X64_WIDEN-NEXT:    psrld $31, %xmm0
+; X64_WIDEN-NEXT:    psrad $2, %xmm2
+; X64_WIDEN-NEXT:    paddd %xmm0, %xmm2
+; X64_WIDEN-NEXT:    movq %xmm2, (%rsi)
 ; X64_WIDEN-NEXT:    retq
 ;
 ; X86_WIDEN-LABEL: test_sdiv7_v2i32:
 ; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    pushl %ebp
-; X86_WIDEN-NEXT:    pushl %ebx
-; X86_WIDEN-NEXT:    pushl %edi
-; X86_WIDEN-NEXT:    pushl %esi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl (%eax), %ecx
-; X86_WIDEN-NEXT:    movl 4(%eax), %esi
-; X86_WIDEN-NEXT:    movl $-1840700269, %ebp # imm = 0x92492493
-; X86_WIDEN-NEXT:    movl %ecx, %eax
-; X86_WIDEN-NEXT:    imull %ebp
-; X86_WIDEN-NEXT:    movl %edx, %edi
-; X86_WIDEN-NEXT:    addl %ecx, %edi
-; X86_WIDEN-NEXT:    movl %edi, %eax
-; X86_WIDEN-NEXT:    shrl $31, %eax
-; X86_WIDEN-NEXT:    sarl $2, %edi
-; X86_WIDEN-NEXT:    addl %eax, %edi
-; X86_WIDEN-NEXT:    movl %esi, %eax
-; X86_WIDEN-NEXT:    imull %ebp
-; X86_WIDEN-NEXT:    addl %esi, %edx
-; X86_WIDEN-NEXT:    movl %edx, %eax
-; X86_WIDEN-NEXT:    shrl $31, %eax
-; X86_WIDEN-NEXT:    sarl $2, %edx
-; X86_WIDEN-NEXT:    addl %eax, %edx
-; X86_WIDEN-NEXT:    movl %edx, 4(%ebx)
-; X86_WIDEN-NEXT:    movl %edi, (%ebx)
-; X86_WIDEN-NEXT:    popl %esi
-; X86_WIDEN-NEXT:    popl %edi
-; X86_WIDEN-NEXT:    popl %ebx
-; X86_WIDEN-NEXT:    popl %ebp
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm2
+; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
+; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm3
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
+; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86_WIDEN-NEXT:    pxor %xmm3, %xmm3
+; X86_WIDEN-NEXT:    pcmpgtd %xmm0, %xmm3
+; X86_WIDEN-NEXT:    pand %xmm1, %xmm3
+; X86_WIDEN-NEXT:    paddd %xmm0, %xmm3
+; X86_WIDEN-NEXT:    psubd %xmm3, %xmm2
+; X86_WIDEN-NEXT:    paddd %xmm0, %xmm2
+; X86_WIDEN-NEXT:    movdqa %xmm2, %xmm0
+; X86_WIDEN-NEXT:    psrld $31, %xmm0
+; X86_WIDEN-NEXT:    psrad $2, %xmm2
+; X86_WIDEN-NEXT:    paddd %xmm0, %xmm2
+; X86_WIDEN-NEXT:    movq %xmm2, (%eax)
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = sdiv <2 x i32> %a, <i32 7, i32 7>
@@ -488,75 +441,60 @@ define void @test_srem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X64_WIDEN-LABEL: test_srem7_v2i32:
 ; X64_WIDEN:       # %bb.0:
 ; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    movd %xmm0, %eax
-; X64_WIDEN-NEXT:    cltq
-; X64_WIDEN-NEXT:    imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; X64_WIDEN-NEXT:    shrq $32, %rcx
-; X64_WIDEN-NEXT:    addl %eax, %ecx
-; X64_WIDEN-NEXT:    movl %ecx, %edx
-; X64_WIDEN-NEXT:    shrl $31, %edx
-; X64_WIDEN-NEXT:    sarl $2, %ecx
-; X64_WIDEN-NEXT:    addl %edx, %ecx
-; X64_WIDEN-NEXT:    leal (,%rcx,8), %edx
-; X64_WIDEN-NEXT:    subl %edx, %ecx
-; X64_WIDEN-NEXT:    addl %eax, %ecx
-; X64_WIDEN-NEXT:    movd %ecx, %xmm1
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64_WIDEN-NEXT:    movd %xmm0, %eax
-; X64_WIDEN-NEXT:    cltq
-; X64_WIDEN-NEXT:    imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; X64_WIDEN-NEXT:    shrq $32, %rcx
-; X64_WIDEN-NEXT:    addl %eax, %ecx
-; X64_WIDEN-NEXT:    movl %ecx, %edx
-; X64_WIDEN-NEXT:    shrl $31, %edx
-; X64_WIDEN-NEXT:    sarl $2, %ecx
-; X64_WIDEN-NEXT:    addl %edx, %ecx
-; X64_WIDEN-NEXT:    leal (,%rcx,8), %edx
-; X64_WIDEN-NEXT:    subl %edx, %ecx
-; X64_WIDEN-NEXT:    addl %eax, %ecx
-; X64_WIDEN-NEXT:    movd %ecx, %xmm0
-; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64_WIDEN-NEXT:    movq %xmm1, (%rsi)
+; X64_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm2
+; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
+; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64_WIDEN-NEXT:    pxor %xmm3, %xmm3
+; X64_WIDEN-NEXT:    pcmpgtd %xmm0, %xmm3
+; X64_WIDEN-NEXT:    pand %xmm1, %xmm3
+; X64_WIDEN-NEXT:    paddd %xmm0, %xmm3
+; X64_WIDEN-NEXT:    psubd %xmm3, %xmm2
+; X64_WIDEN-NEXT:    paddd %xmm0, %xmm2
+; X64_WIDEN-NEXT:    movdqa %xmm2, %xmm1
+; X64_WIDEN-NEXT:    psrld $31, %xmm1
+; X64_WIDEN-NEXT:    psrad $2, %xmm2
+; X64_WIDEN-NEXT:    paddd %xmm1, %xmm2
+; X64_WIDEN-NEXT:    movdqa %xmm2, %xmm1
+; X64_WIDEN-NEXT:    pslld $3, %xmm1
+; X64_WIDEN-NEXT:    psubd %xmm1, %xmm2
+; X64_WIDEN-NEXT:    paddd %xmm0, %xmm2
+; X64_WIDEN-NEXT:    movq %xmm2, (%rsi)
 ; X64_WIDEN-NEXT:    retq
 ;
 ; X86_WIDEN-LABEL: test_srem7_v2i32:
 ; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    pushl %ebp
-; X86_WIDEN-NEXT:    pushl %ebx
-; X86_WIDEN-NEXT:    pushl %edi
-; X86_WIDEN-NEXT:    pushl %esi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl (%eax), %edi
-; X86_WIDEN-NEXT:    movl 4(%eax), %ecx
-; X86_WIDEN-NEXT:    movl $-1840700269, %ebp # imm = 0x92492493
-; X86_WIDEN-NEXT:    movl %edi, %eax
-; X86_WIDEN-NEXT:    imull %ebp
-; X86_WIDEN-NEXT:    movl %edx, %esi
-; X86_WIDEN-NEXT:    addl %edi, %esi
-; X86_WIDEN-NEXT:    movl %esi, %eax
-; X86_WIDEN-NEXT:    shrl $31, %eax
-; X86_WIDEN-NEXT:    sarl $2, %esi
-; X86_WIDEN-NEXT:    addl %eax, %esi
-; X86_WIDEN-NEXT:    leal (,%esi,8), %eax
-; X86_WIDEN-NEXT:    subl %eax, %esi
-; X86_WIDEN-NEXT:    addl %edi, %esi
-; X86_WIDEN-NEXT:    movl %ecx, %eax
-; X86_WIDEN-NEXT:    imull %ebp
-; X86_WIDEN-NEXT:    addl %ecx, %edx
-; X86_WIDEN-NEXT:    movl %edx, %eax
-; X86_WIDEN-NEXT:    shrl $31, %eax
-; X86_WIDEN-NEXT:    sarl $2, %edx
-; X86_WIDEN-NEXT:    addl %eax, %edx
-; X86_WIDEN-NEXT:    leal (,%edx,8), %eax
-; X86_WIDEN-NEXT:    subl %eax, %edx
-; X86_WIDEN-NEXT:    addl %ecx, %edx
-; X86_WIDEN-NEXT:    movl %edx, 4(%ebx)
-; X86_WIDEN-NEXT:    movl %esi, (%ebx)
-; X86_WIDEN-NEXT:    popl %esi
-; X86_WIDEN-NEXT:    popl %edi
-; X86_WIDEN-NEXT:    popl %ebx
-; X86_WIDEN-NEXT:    popl %ebp
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm2
+; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
+; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm3
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
+; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86_WIDEN-NEXT:    pxor %xmm3, %xmm3
+; X86_WIDEN-NEXT:    pcmpgtd %xmm0, %xmm3
+; X86_WIDEN-NEXT:    pand %xmm1, %xmm3
+; X86_WIDEN-NEXT:    paddd %xmm0, %xmm3
+; X86_WIDEN-NEXT:    psubd %xmm3, %xmm2
+; X86_WIDEN-NEXT:    paddd %xmm0, %xmm2
+; X86_WIDEN-NEXT:    movdqa %xmm2, %xmm1
+; X86_WIDEN-NEXT:    psrld $31, %xmm1
+; X86_WIDEN-NEXT:    psrad $2, %xmm2
+; X86_WIDEN-NEXT:    paddd %xmm1, %xmm2
+; X86_WIDEN-NEXT:    movdqa %xmm2, %xmm1
+; X86_WIDEN-NEXT:    pslld $3, %xmm1
+; X86_WIDEN-NEXT:    psubd %xmm1, %xmm2
+; X86_WIDEN-NEXT:    paddd %xmm0, %xmm2
+; X86_WIDEN-NEXT:    movq %xmm2, (%eax)
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = srem <2 x i32> %a, <i32 7, i32 7>
@@ -600,9 +538,7 @@ define void @test_udiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86_WIDEN-NEXT:    psrld $3, %xmm0
-; X86_WIDEN-NEXT:    movd %xmm0, (%eax)
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86_WIDEN-NEXT:    movd %xmm0, 4(%eax)
+; X86_WIDEN-NEXT:    movq %xmm0, (%eax)
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = udiv <2 x i32> %a, <i32 8, i32 8>
@@ -645,11 +581,9 @@ define void @test_urem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86_WIDEN:       # %bb.0:
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT:    pand {{\.LCPI.*}}, %xmm0
-; X86_WIDEN-NEXT:    movd %xmm0, (%eax)
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86_WIDEN-NEXT:    movd %xmm0, 4(%eax)
+; X86_WIDEN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X86_WIDEN-NEXT:    movlps %xmm0, (%eax)
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = urem <2 x i32> %a, <i32 8, i32 8>
@@ -661,8 +595,7 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X64-LABEL: test_sdiv_pow2_v2i32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pxor %xmm1, %xmm1
-; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,1]
 ; X64-NEXT:    psrad $31, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
@@ -685,14 +618,13 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    pxor %xmm1, %xmm1
-; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT:    psrad $31, %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; X86-NEXT:    movdqa %xmm0, %xmm1
-; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    psrad $31, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; X86-NEXT:    movdqa %xmm1, %xmm0
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; X86-NEXT:    movdqa {{.*#+}} xmm2 = [31,0,31,0]
 ; X86-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
 ; X86-NEXT:    movdqa %xmm3, %xmm4
@@ -701,22 +633,22 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86-NEXT:    movd %ecx, %xmm5
 ; X86-NEXT:    psrlq %xmm5, %xmm3
 ; X86-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; X86-NEXT:    movdqa %xmm1, %xmm4
+; X86-NEXT:    movdqa %xmm0, %xmm4
 ; X86-NEXT:    psrlq %xmm2, %xmm4
-; X86-NEXT:    psrlq %xmm5, %xmm1
-; X86-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; X86-NEXT:    xorpd %xmm3, %xmm1
-; X86-NEXT:    psubq %xmm3, %xmm1
-; X86-NEXT:    pand {{\.LCPI.*}}, %xmm1
-; X86-NEXT:    psrlq $29, %xmm1
-; X86-NEXT:    paddq %xmm0, %xmm1
-; X86-NEXT:    psllq $32, %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X86-NEXT:    psrad $31, %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT:    psrlq $3, %xmm0
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT:    psrlq %xmm5, %xmm0
+; X86-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; X86-NEXT:    xorpd %xmm3, %xmm0
+; X86-NEXT:    psubq %xmm3, %xmm0
+; X86-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    psrlq $29, %xmm0
+; X86-NEXT:    paddq %xmm1, %xmm0
+; X86-NEXT:    psllq $32, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; X86-NEXT:    psrad $31, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT:    psrlq $3, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -741,9 +673,7 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86_WIDEN-NEXT:    psrld $29, %xmm1
 ; X86_WIDEN-NEXT:    paddd %xmm0, %xmm1
 ; X86_WIDEN-NEXT:    psrad $3, %xmm1
-; X86_WIDEN-NEXT:    movd %xmm1, (%eax)
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86_WIDEN-NEXT:    movd %xmm0, 4(%eax)
+; X86_WIDEN-NEXT:    movq %xmm1, (%eax)
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = sdiv <2 x i32> %a, <i32 8, i32 8>
@@ -787,9 +717,7 @@ define void @test_srem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86_WIDEN-NEXT:    psrld $3, %xmm0
-; X86_WIDEN-NEXT:    movd %xmm0, (%eax)
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86_WIDEN-NEXT:    movd %xmm0, 4(%eax)
+; X86_WIDEN-NEXT:    movq %xmm0, (%eax)
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = udiv <2 x i32> %a, <i32 8, i32 8>
@@ -839,8 +767,8 @@ define void @test_udiv_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %esi
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ebx
 ; X86-NEXT:    movd %eax, %xmm0
 ; X86-NEXT:    movd %esi, %xmm1
@@ -874,25 +802,27 @@ define void @test_udiv_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
 ;
 ; X86_WIDEN-LABEL: test_udiv_v2i32:
 ; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    pushl %ebx
-; X86_WIDEN-NEXT:    pushl %edi
 ; X86_WIDEN-NEXT:    pushl %esi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movl (%ecx), %eax
-; X86_WIDEN-NEXT:    movl 4(%ecx), %ecx
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86_WIDEN-NEXT:    movd %xmm0, %eax
+; X86_WIDEN-NEXT:    movd %xmm1, %esi
 ; X86_WIDEN-NEXT:    xorl %edx, %edx
-; X86_WIDEN-NEXT:    divl (%ebx)
-; X86_WIDEN-NEXT:    movl %eax, %esi
+; X86_WIDEN-NEXT:    divl %esi
+; X86_WIDEN-NEXT:    movd %eax, %xmm2
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86_WIDEN-NEXT:    movd %xmm0, %eax
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86_WIDEN-NEXT:    movd %xmm1, %esi
 ; X86_WIDEN-NEXT:    xorl %edx, %edx
-; X86_WIDEN-NEXT:    movl %ecx, %eax
-; X86_WIDEN-NEXT:    divl 4(%ebx)
-; X86_WIDEN-NEXT:    movl %eax, 4(%edi)
-; X86_WIDEN-NEXT:    movl %esi, (%edi)
+; X86_WIDEN-NEXT:    divl %esi
+; X86_WIDEN-NEXT:    movd %eax, %xmm0
+; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86_WIDEN-NEXT:    movq %xmm2, (%ecx)
 ; X86_WIDEN-NEXT:    popl %esi
-; X86_WIDEN-NEXT:    popl %edi
-; X86_WIDEN-NEXT:    popl %ebx
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = load <2 x i32>, <2 x i32>* %y
@@ -943,8 +873,8 @@ define void @test_urem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ebx
 ; X86-NEXT:    movd %edx, %xmm0
 ; X86-NEXT:    movd %esi, %xmm1
@@ -978,25 +908,27 @@ define void @test_urem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
 ;
 ; X86_WIDEN-LABEL: test_urem_v2i32:
 ; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    pushl %ebx
-; X86_WIDEN-NEXT:    pushl %edi
 ; X86_WIDEN-NEXT:    pushl %esi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movl (%ecx), %eax
-; X86_WIDEN-NEXT:    movl 4(%ecx), %ecx
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86_WIDEN-NEXT:    movd %xmm0, %eax
+; X86_WIDEN-NEXT:    movd %xmm1, %esi
 ; X86_WIDEN-NEXT:    xorl %edx, %edx
-; X86_WIDEN-NEXT:    divl (%ebx)
-; X86_WIDEN-NEXT:    movl %edx, %esi
+; X86_WIDEN-NEXT:    divl %esi
+; X86_WIDEN-NEXT:    movd %edx, %xmm2
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86_WIDEN-NEXT:    movd %xmm0, %eax
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86_WIDEN-NEXT:    movd %xmm1, %esi
 ; X86_WIDEN-NEXT:    xorl %edx, %edx
-; X86_WIDEN-NEXT:    movl %ecx, %eax
-; X86_WIDEN-NEXT:    divl 4(%ebx)
-; X86_WIDEN-NEXT:    movl %edx, 4(%edi)
-; X86_WIDEN-NEXT:    movl %esi, (%edi)
+; X86_WIDEN-NEXT:    divl %esi
+; X86_WIDEN-NEXT:    movd %edx, %xmm0
+; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86_WIDEN-NEXT:    movq %xmm2, (%ecx)
 ; X86_WIDEN-NEXT:    popl %esi
-; X86_WIDEN-NEXT:    popl %edi
-; X86_WIDEN-NEXT:    popl %ebx
 ; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = load <2 x i32>, <2 x i32>* %y
@@ -1085,19 +1017,26 @@ define void @test_sdiv_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
 ; X86_WIDEN-NEXT:    pushl %ebx
 ; X86_WIDEN-NEXT:    pushl %edi
 ; X86_WIDEN-NEXT:    pushl %esi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movl (%ecx), %eax
-; X86_WIDEN-NEXT:    movl 4(%ecx), %ecx
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86_WIDEN-NEXT:    movd %xmm0, %ecx
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86_WIDEN-NEXT:    movd %xmm0, %eax
+; X86_WIDEN-NEXT:    movd %xmm1, %edi
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86_WIDEN-NEXT:    movd %xmm1, %ebx
 ; X86_WIDEN-NEXT:    cltd
-; X86_WIDEN-NEXT:    idivl (%ebx)
-; X86_WIDEN-NEXT:    movl %eax, %esi
+; X86_WIDEN-NEXT:    idivl %ebx
+; X86_WIDEN-NEXT:    movd %eax, %xmm0
 ; X86_WIDEN-NEXT:    movl %ecx, %eax
 ; X86_WIDEN-NEXT:    cltd
-; X86_WIDEN-NEXT:    idivl 4(%ebx)
-; X86_WIDEN-NEXT:    movl %eax, 4(%edi)
-; X86_WIDEN-NEXT:    movl %esi, (%edi)
+; X86_WIDEN-NEXT:    idivl %edi
+; X86_WIDEN-NEXT:    movd %eax, %xmm1
+; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86_WIDEN-NEXT:    movq %xmm1, (%esi)
 ; X86_WIDEN-NEXT:    popl %esi
 ; X86_WIDEN-NEXT:    popl %edi
 ; X86_WIDEN-NEXT:    popl %ebx
@@ -1189,19 +1128,26 @@ define void @test_srem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
 ; X86_WIDEN-NEXT:    pushl %ebx
 ; X86_WIDEN-NEXT:    pushl %edi
 ; X86_WIDEN-NEXT:    pushl %esi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movl (%ecx), %eax
-; X86_WIDEN-NEXT:    movl 4(%ecx), %ecx
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86_WIDEN-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86_WIDEN-NEXT:    movd %xmm0, %ecx
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86_WIDEN-NEXT:    movd %xmm0, %eax
+; X86_WIDEN-NEXT:    movd %xmm1, %edi
+; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86_WIDEN-NEXT:    movd %xmm1, %ebx
 ; X86_WIDEN-NEXT:    cltd
-; X86_WIDEN-NEXT:    idivl (%ebx)
-; X86_WIDEN-NEXT:    movl %eax, %esi
+; X86_WIDEN-NEXT:    idivl %ebx
+; X86_WIDEN-NEXT:    movd %eax, %xmm0
 ; X86_WIDEN-NEXT:    movl %ecx, %eax
 ; X86_WIDEN-NEXT:    cltd
-; X86_WIDEN-NEXT:    idivl 4(%ebx)
-; X86_WIDEN-NEXT:    movl %eax, 4(%edi)
-; X86_WIDEN-NEXT:    movl %esi, (%edi)
+; X86_WIDEN-NEXT:    idivl %edi
+; X86_WIDEN-NEXT:    movd %eax, %xmm1
+; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86_WIDEN-NEXT:    movq %xmm1, (%esi)
 ; X86_WIDEN-NEXT:    popl %esi
 ; X86_WIDEN-NEXT:    popl %edi
 ; X86_WIDEN-NEXT:    popl %ebx
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index 50090b4f8191162c032484a35cfda03c76bba678..33779a9cc78868053e1c16322ec16d802fb5fea8 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -24,19 +24,19 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
 ; SSE2-LABEL: PR20355:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    psubd %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    psrld $31, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    paddd %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: PR20355:
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index 34ea33d576c68ff54e3ab2543273cbb2e863f4ed..4d4326f08ea4d0617134236dde1fac49f3f3188a 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -96,33 +96,30 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv2i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSSE3-NEXT:    psrlw $8, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSSE3-NEXT:    psrlw $8, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrlw $8, %xmm1
-; SSSE3-NEXT:    paddw %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSSE3-NEXT:    psrld $16, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    paddw %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm2
+; SSSE3-NEXT:    psrld $16, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrld $16, %xmm1
-; SSSE3-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    paddd %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSSE3-NEXT:    psrlq $32, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrlq $32, %xmm1
@@ -132,33 +129,30 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; SSE41-LABEL: testv2i64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSE41-NEXT:    psrld $16, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm1
@@ -168,16 +162,14 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; AVX-LABEL: testv2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -198,16 +190,14 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv2i64:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -241,33 +231,30 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm3
-; X32-SSE-NEXT:    psrld $16, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    paddw %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm2
+; X32-SSE-NEXT:    psrld $16, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrld $16, %xmm1
-; X32-SSE-NEXT:    paddd %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; X32-SSE-NEXT:    paddd %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm1
@@ -362,33 +349,30 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv2i64u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSSE3-NEXT:    psrlw $8, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSSE3-NEXT:    psrlw $8, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrlw $8, %xmm1
-; SSSE3-NEXT:    paddw %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSSE3-NEXT:    psrld $16, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    paddw %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm2
+; SSSE3-NEXT:    psrld $16, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrld $16, %xmm1
-; SSSE3-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    paddd %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSSE3-NEXT:    psrlq $32, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrlq $32, %xmm1
@@ -398,33 +382,30 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; SSE41-LABEL: testv2i64u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSE41-NEXT:    psrld $16, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm1
@@ -434,16 +415,14 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; AVX-LABEL: testv2i64u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -464,16 +443,14 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv2i64u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -507,33 +484,30 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv2i64u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm3
-; X32-SSE-NEXT:    psrld $16, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    paddw %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm2
+; X32-SSE-NEXT:    psrld $16, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrld $16, %xmm1
-; X32-SSE-NEXT:    paddd %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; X32-SSE-NEXT:    paddd %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm1
@@ -632,27 +606,24 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSSE3-NEXT:    psrlw $8, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSSE3-NEXT:    psrlw $8, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrlw $8, %xmm1
-; SSSE3-NEXT:    paddw %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSSE3-NEXT:    paddw %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSSE3-NEXT:    psrld $16, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrld $16, %xmm1
@@ -662,27 +633,24 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; SSE41-LABEL: testv4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSE41-NEXT:    psrld $16, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrld $16, %xmm1
@@ -692,16 +660,14 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX-LABEL: testv4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -717,16 +683,14 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv4i32:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -755,27 +719,24 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv4i32:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X32-SSE-NEXT:    paddw %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm1
@@ -874,27 +835,24 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv4i32u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSSE3-NEXT:    psrlw $8, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSSE3-NEXT:    psrlw $8, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrlw $8, %xmm1
-; SSSE3-NEXT:    paddw %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSSE3-NEXT:    paddw %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSSE3-NEXT:    psrld $16, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrld $16, %xmm1
@@ -904,27 +862,24 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; SSE41-LABEL: testv4i32u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSE41-NEXT:    psrld $16, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrld $16, %xmm1
@@ -934,16 +889,14 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX-LABEL: testv4i32u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -959,16 +912,14 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv4i32u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -997,27 +948,24 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv4i32u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X32-SSE-NEXT:    paddw %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm1
@@ -1104,21 +1052,18 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSSE3-NEXT:    psrlw $8, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrlw $8, %xmm1
@@ -1128,21 +1073,18 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; SSE41-LABEL: testv8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSE41-NEXT:    psrlw $8, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrlw $8, %xmm1
@@ -1152,16 +1094,14 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; AVX-LABEL: testv8i16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -1172,16 +1112,14 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv8i16:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -1210,21 +1148,18 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrlw $8, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
@@ -1310,21 +1245,18 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv8i16u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSSE3-NEXT:    psrlw $8, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrlw $8, %xmm1
@@ -1334,21 +1266,18 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; SSE41-LABEL: testv8i16u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSE41-NEXT:    psrlw $8, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrlw $8, %xmm1
@@ -1358,16 +1287,14 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; AVX-LABEL: testv8i16u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -1378,16 +1305,14 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv8i16u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -1416,21 +1341,18 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv8i16u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrlw $8, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
@@ -1510,68 +1432,58 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pshufb %xmm0, %xmm2
 ; SSSE3-NEXT:    psrlw $4, %xmm0
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
-; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    paddb %xmm3, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    pshufb %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
 ; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    pshufb %xmm0, %xmm1
-; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: testv16i8:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512VLBWDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i8:
@@ -1585,19 +1497,16 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm2
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm4, %xmm2
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
 ; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
-; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
@@ -1673,68 +1582,58 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv16i8u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pshufb %xmm0, %xmm2
 ; SSSE3-NEXT:    psrlw $4, %xmm0
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
-; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    paddb %xmm3, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    pshufb %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
 ; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    pshufb %xmm0, %xmm1
-; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: testv16i8u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512VLBWDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i8u:
@@ -1748,19 +1647,16 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv16i8u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm2
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm4, %xmm2
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
 ; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
-; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll
index e943d73c769d25ffda34eedbea9d7d05625a9ad6..306917265f185af61b1e80610a17d66daeafa293 100644
--- a/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -13,40 +13,38 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm6, %xmm7
-; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vpaddb %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddw %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsrld $16, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrld $16, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm5
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm6
-; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm3, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm5, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
@@ -67,16 +65,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -97,16 +93,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv4i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -127,16 +121,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv4i64:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -169,16 +161,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv4i64:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -205,40 +195,38 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm6, %xmm7
-; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vpaddb %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddw %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsrld $16, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrld $16, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm5
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm6
-; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm3, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm5, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
@@ -259,16 +247,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i64u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -289,16 +275,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv4i64u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -319,16 +303,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv4i64u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -361,16 +343,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv4i64u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -397,34 +377,32 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm5
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqw %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm5
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
@@ -441,16 +419,14 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -466,16 +442,14 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv8i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -491,16 +465,14 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv8i32:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -528,16 +500,14 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv8i32:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -559,34 +529,32 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm5
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqw %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm5
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
@@ -603,16 +571,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv8i32u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -628,16 +594,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv8i32u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -653,16 +617,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv8i32u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -690,16 +652,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv8i32u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -721,29 +681,27 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm5
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -755,16 +713,14 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -775,16 +731,14 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -795,16 +749,14 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv16i16:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -823,16 +775,14 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv16i16:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -848,29 +798,27 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm5
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -882,16 +830,14 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -902,16 +848,14 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv16i16u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -922,16 +866,14 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv16i16u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -950,16 +892,14 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv16i16u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -975,71 +915,63 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: testv32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: testv32i8:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VLBWDQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; AVX512-LABEL: testv32i8:
@@ -1059,17 +991,15 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv32i8:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; X32-AVX-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; X32-AVX-NEXT:    retl
   %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0)
   ret <32 x i8> %out
@@ -1079,71 +1009,63 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: testv32i8u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: testv32i8u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VLBWDQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; AVX512-LABEL: testv32i8u:
@@ -1163,17 +1085,15 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv32i8u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; X32-AVX-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; X32-AVX-NEXT:    retl
   %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1)
   ret <32 x i8> %out
diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll
index 71a9ba19396864dc50c93e5e17067bf12ed85c32..f70971332681387580bd3382d3a6ccf153ac3ebc 100644
--- a/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -359,16 +359,15 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm4
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm3, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm5
 ; AVX512BW-NEXT:    vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm4, %zmm1
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
@@ -380,29 +379,27 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512DQ-LABEL: testv32i16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; AVX512DQ-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm5, %ymm7
+; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm7
 ; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vpaddb %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpand %ymm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpaddw %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm2, %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm5
 ; AVX512DQ-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
@@ -445,16 +442,15 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm4
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm3, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm5
 ; AVX512BW-NEXT:    vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm4, %zmm1
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
@@ -466,29 +462,27 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512DQ-LABEL: testv32i16u:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; AVX512DQ-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm5, %ymm7
+; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm7
 ; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vpaddb %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpand %ymm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpaddw %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm2, %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm5
 ; AVX512DQ-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
@@ -555,40 +549,37 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm4
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv64i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm0, %ymm6
 ; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
 ; AVX512DQ-NEXT:    retq
   %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
   ret <64 x i8> %out
@@ -649,40 +640,37 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm4
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv64i8u:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm0, %ymm6
 ; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
 ; AVX512DQ-NEXT:    retq
   %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
   ret <64 x i8> %out
diff --git a/test/CodeGen/X86/vector-mul.ll b/test/CodeGen/X86/vector-mul.ll
index f039fa15d3b31c4ac1a2ec6ecf368d77c163e3d6..d2f7b45615d18edfd44ccf4ddc4bc4423cfa9fbd 100644
--- a/test/CodeGen/X86/vector-mul.ll
+++ b/test/CodeGen/X86/vector-mul.ll
@@ -177,30 +177,28 @@ define <8 x i16> @mul_v8i16_1_2_4_8_16_32_64_128(<8 x i16> %a0) nounwind {
 define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounwind {
 ; X86-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
-; X86-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; X86-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X86-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
 ; X86-NEXT:    pmullw %xmm2, %xmm0
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X86-NEXT:    pand %xmm2, %xmm0
-; X86-NEXT:    pmullw {{\.LCPI.*}}, %xmm1
-; X86-NEXT:    pand %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-NEXT:    pand %xmm3, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
+; X86-NEXT:    pand %xmm3, %xmm1
 ; X86-NEXT:    packuswb %xmm0, %xmm1
 ; X86-NEXT:    movdqa %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
-; X64-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; X64-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
 ; X64-NEXT:    pmullw %xmm2, %xmm0
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X64-NEXT:    pand %xmm2, %xmm0
-; X64-NEXT:    pmullw {{.*}}(%rip), %xmm1
-; X64-NEXT:    pand %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X64-NEXT:    pand %xmm3, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
+; X64-NEXT:    pand %xmm3, %xmm1
 ; X64-NEXT:    packuswb %xmm0, %xmm1
 ; X64-NEXT:    movdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -212,13 +210,11 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
 ;
 ; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; X64-AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X64-AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
   %1 = mul <16 x i8> %a0, <i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8>
@@ -534,11 +530,9 @@ define <8 x i16> @mul_v8i16_2_3_9_17_33_65_129_257(<8 x i16> %a0) nounwind {
 define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> %a0) nounwind {
 ; X86-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
-; X86-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; X86-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X86-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; X86-NEXT:    pand %xmm2, %xmm0
 ; X86-NEXT:    pmullw {{\.LCPI.*}}, %xmm1
@@ -549,11 +543,9 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
 ;
 ; X64-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
-; X64-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; X64-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw {{.*}}(%rip), %xmm0
 ; X64-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; X64-NEXT:    pand %xmm2, %xmm0
 ; X64-NEXT:    pmullw {{.*}}(%rip), %xmm1
@@ -564,10 +556,8 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
 ;
 ; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
 ; X64-XOP:       # %bb.0:
-; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
-; X64-XOP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-XOP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-XOP-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; X64-XOP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X64-XOP-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
 ; X64-XOP-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-XOP-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
 ; X64-XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
@@ -575,13 +565,11 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
 ;
 ; X64-AVX2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; X64-AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X64-AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
   %1 = mul <16 x i8> %a0, <i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3>
@@ -1115,54 +1103,49 @@ define <8 x i16> @mul_v8i16_0_1_7_15_31_63_127_255(<8 x i16> %a0) nounwind {
 define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> %a0) nounwind {
 ; X86-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
-; X86-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; X86-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X86-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
 ; X86-NEXT:    pmullw %xmm2, %xmm0
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X86-NEXT:    pand %xmm2, %xmm0
-; X86-NEXT:    pmullw {{\.LCPI.*}}, %xmm1
-; X86-NEXT:    pand %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-NEXT:    pand %xmm3, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
+; X86-NEXT:    pand %xmm3, %xmm1
 ; X86-NEXT:    packuswb %xmm0, %xmm1
 ; X86-NEXT:    movdqa %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
-; X64-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; X64-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
 ; X64-NEXT:    pmullw %xmm2, %xmm0
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X64-NEXT:    pand %xmm2, %xmm0
-; X64-NEXT:    pmullw {{.*}}(%rip), %xmm1
-; X64-NEXT:    pand %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X64-NEXT:    pand %xmm3, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
+; X64-NEXT:    pand %xmm3, %xmm1
 ; X64-NEXT:    packuswb %xmm0, %xmm1
 ; X64-NEXT:    movdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
 ; X64-XOP:       # %bb.0:
-; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
-; X64-XOP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-XOP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-XOP-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; X64-XOP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
+; X64-XOP-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; X64-XOP-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X64-XOP-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; X64-XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
 ; X64-XOP-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; X64-AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; X64-AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
   %1 = mul <16 x i8> %a0, <i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127, i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127>
@@ -1240,3 +1223,95 @@ define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
   %mul = mul <2 x i64> %x, <i64 60, i64 124>
   ret <2 x i64> %mul
 }
+
+; We unfortunately can't see the zext that lives in the other basic block so we
+; don't know that we only need one pmuludq to compute the full 64 bits. This
+; sort of issue is more likely to occur when there is a loop and one of the
+; multiply inputs is loop invariant.
+; FIXME: We should be able to insert an AssertZExt for this.
+define <2 x i64> @mul_v2i64_zext_cross_bb(<2 x i32>* %in, <2 x i32>* %y) {
+; X86-LABEL: mul_v2i64_zext_cross_bb:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X86-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X86-NEXT:    pmuludq %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: mul_v2i64_zext_cross_bb:
+; X64:       # %bb.0:
+; X64-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X64-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X64-NEXT:    pmuludq %xmm1, %xmm0
+; X64-NEXT:    retq
+;
+; X64-AVX-LABEL: mul_v2i64_zext_cross_bb:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X64-AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    retq
+  %a = load <2 x i32>, <2 x i32>* %in
+  %b = zext <2 x i32> %a to <2 x i64>
+  br label %foo
+
+foo:
+  %c = load <2 x i32>, <2 x i32>* %y
+  %d = zext <2 x i32> %c to <2 x i64>
+  %e = mul <2 x i64> %b, %d
+  ret <2 x i64> %e
+}
+
+define <4 x i64> @mul_v4i64_zext_cross_bb(<4 x i32>* %in, <4 x i32>* %y) {
+; X86-LABEL: mul_v4i64_zext_cross_bb:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X86-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X86-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X86-NEXT:    pmuludq %xmm2, %xmm1
+; X86-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X86-NEXT:    pmuludq %xmm2, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: mul_v4i64_zext_cross_bb:
+; X64:       # %bb.0:
+; X64-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X64-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X64-NEXT:    pmuludq %xmm2, %xmm1
+; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X64-NEXT:    pmuludq %xmm2, %xmm0
+; X64-NEXT:    retq
+;
+; X64-XOP-LABEL: mul_v4i64_zext_cross_bb:
+; X64-XOP:       # %bb.0:
+; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X64-XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-XOP-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; X64-XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-XOP-NEXT:    retq
+;
+; X64-AVX2-LABEL: mul_v4i64_zext_cross_bb:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    retq
+  %a = load <4 x i32>, <4 x i32>* %in
+  %b = zext <4 x i32> %a to <4 x i64>
+  br label %foo
+
+foo:
+  %c = load <4 x i32>, <4 x i32>* %y
+  %d = zext <4 x i32> %c to <4 x i64>
+  %e = mul <4 x i64> %b, %d
+  ret <4 x i64> %e
+}
diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll
index c20dc09a6b2999d6508139cca3f99ae6bc3d23e4..4836b490c7d6b5bbafbeebe48516810df30b3728 100644
--- a/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/test/CodeGen/X86/vector-narrow-binop.ll
@@ -98,3 +98,72 @@ define <3 x float> @PR39511(<4 x float> %t0, <3 x float>* %b) {
   ret <3 x float> %ext
 }
 
+; When extracting from a vector binop, we need to be extracting
+; by a width of at least 1 of the original vector elements.
+; https://bugs.llvm.org/show_bug.cgi?id=39893
+
+define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) {
+; SSE-LABEL: PR39893:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    psubd %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: PR39893:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR39893:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: PR39893:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT:    retq
+  %sub = sub <2 x i32> <i32 0, i32 undef>, %x
+  %bc = bitcast <2 x i32> %sub to <8 x i8>
+  %shuffle = shufflevector <8 x i8> %y, <8 x i8> %bc, <2 x i32> <i32 10, i32 4>
+  ret <2 x i8> %shuffle
+}
+
+define <2 x i8> @PR39893_2(<2 x float> %x) {
+; SSE-LABEL: PR39893_2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    subps %xmm0, %xmm1
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR39893_2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %fsub = fsub <2 x float> zeroinitializer, %x
+  %bc = bitcast <2 x float> %fsub to <8 x i8>
+  %shuffle = shufflevector <8 x i8> %bc, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x i8> %shuffle
+}
+
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index 16539f1b2d464f5bddd0e1484030ee013e3df15a..c91b2e111b66519b3cb42996acbdb198ec584c8c 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -308,7 +308,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BITALG-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-reduce-add-widen.ll b/test/CodeGen/X86/vector-reduce-add-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..397c533e2b2712077520cc2701a695cae92475cc
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-add-widen.ll
@@ -0,0 +1,1297 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE-LABEL: test_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE-LABEL: test_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddq %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE-LABEL: test_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddq %xmm3, %xmm1
+; SSE-NEXT:    paddq %xmm2, %xmm1
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    paddq %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE-LABEL: test_v16i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddq %xmm6, %xmm2
+; SSE-NEXT:    paddq %xmm7, %xmm3
+; SSE-NEXT:    paddq %xmm5, %xmm3
+; SSE-NEXT:    paddq %xmm1, %xmm3
+; SSE-NEXT:    paddq %xmm4, %xmm2
+; SSE-NEXT:    paddq %xmm3, %xmm2
+; SSE-NEXT:    paddq %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE-LABEL: test_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE-LABEL: test_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE-LABEL: test_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE-LABEL: test_v32i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm6, %xmm2
+; SSE-NEXT:    paddd %xmm7, %xmm3
+; SSE-NEXT:    paddd %xmm5, %xmm3
+; SSE-NEXT:    paddd %xmm1, %xmm3
+; SSE-NEXT:    paddd %xmm4, %xmm2
+; SSE-NEXT:    paddd %xmm3, %xmm2
+; SSE-NEXT:    paddd %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE-LABEL: test_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE-LABEL: test_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE-LABEL: test_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm3, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE-LABEL: test_v64i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm6, %xmm2
+; SSE-NEXT:    paddw %xmm7, %xmm3
+; SSE-NEXT:    paddw %xmm5, %xmm3
+; SSE-NEXT:    paddw %xmm1, %xmm3
+; SSE-NEXT:    paddw %xmm4, %xmm2
+; SSE-NEXT:    paddw %xmm3, %xmm2
+; SSE-NEXT:    paddw %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    paddb %xmm3, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddb %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    paddb %xmm6, %xmm2
+; SSE2-NEXT:    paddb %xmm7, %xmm3
+; SSE2-NEXT:    paddb %xmm5, %xmm3
+; SSE2-NEXT:    paddb %xmm1, %xmm3
+; SSE2-NEXT:    paddb %xmm4, %xmm2
+; SSE2-NEXT:    paddb %xmm3, %xmm2
+; SSE2-NEXT:    paddb %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddb %xmm6, %xmm2
+; SSE41-NEXT:    paddb %xmm7, %xmm3
+; SSE41-NEXT:    paddb %xmm5, %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm2
+; SSE41-NEXT:    paddb %xmm3, %xmm2
+; SSE41-NEXT:    paddb %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT:    paddb %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v128i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-add.ll b/test/CodeGen/X86/vector-reduce-add.ll
index e0f6f194f50220b013a06c8d21164fabe1cf67af..5d92c9a62d29486d1b015eb3dc1042e93c4b35e2 100644
--- a/test/CodeGen/X86/vector-reduce-add.ll
+++ b/test/CodeGen/X86/vector-reduce-add.ll
@@ -194,6 +194,31 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE-LABEL: test_v4i32:
 ; SSE:       # %bb.0:
@@ -408,6 +433,68 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE:       # %bb.0:
@@ -671,6 +758,140 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    paddq %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i8:
 ; SSE2:       # %bb.0:
@@ -1044,16 +1265,22 @@ declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-and-widen.ll b/test/CodeGen/X86/vector-reduce-and-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..13415561cf757ced0702e4ddc0eb4e01a5cad860
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-and-widen.ll
@@ -0,0 +1,1265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE-LABEL: test_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE-LABEL: test_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE-LABEL: test_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE-LABEL: test_v16i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm6, %xmm2
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    pand %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE-LABEL: test_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE-LABEL: test_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE-LABEL: test_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE-LABEL: test_v32i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm6, %xmm2
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    pand %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE-LABEL: test_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE-LABEL: test_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE-LABEL: test_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE-LABEL: test_v64i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm6, %xmm2
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    pand %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand %xmm6, %xmm2
+; SSE41-NEXT:    pand %xmm7, %xmm3
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    pand %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v128i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-and.ll b/test/CodeGen/X86/vector-reduce-and.ll
index 305464e3707141ba67775ff9af33bfe1e6e18a69..a3448f545ff06d9cbbe31184f0eb7dafa8021f07 100644
--- a/test/CodeGen/X86/vector-reduce-and.ll
+++ b/test/CodeGen/X86/vector-reduce-and.ll
@@ -186,6 +186,31 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE-LABEL: test_v4i32:
 ; SSE:       # %bb.0:
@@ -392,6 +417,68 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE:       # %bb.0:
@@ -647,6 +734,140 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i8:
 ; SSE2:       # %bb.0:
@@ -1012,16 +1233,22 @@ declare i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-mul-widen.ll b/test/CodeGen/X86/vector-reduce-mul-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7fbd7245c410714f9b5a55db99b1c0bf60d9168a
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-mul-widen.ll
@@ -0,0 +1,3377 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE-LABEL: test_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrlq $32, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT:    pmuludq %xmm0, %xmm3
+; SSE-NEXT:    paddq %xmm2, %xmm3
+; SSE-NEXT:    psllq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    paddq %xmm3, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v2i64:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v2i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovq %xmm0, %rax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v2i64:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
+; AVX512DQVL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE-LABEL: test_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrlq $32, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    psrlq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm0, %xmm3
+; SSE-NEXT:    paddq %xmm2, %xmm3
+; SSE-NEXT:    psllq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    paddq %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrlq $32, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT:    pmuludq %xmm0, %xmm3
+; SSE-NEXT:    paddq %xmm2, %xmm3
+; SSE-NEXT:    psllq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    paddq %xmm3, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmuludq %ymm2, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm3, %ymm3
+; AVX512BW-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v4i64:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpsrlq $32, %ymm0, %ymm2
+; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX512BWVL-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BWVL-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpsrlq $32, %ymm0, %ymm2
+; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX512BWVL-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BWVL-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v4i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovq %xmm0, %rax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v4i64:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE-LABEL: test_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    psrlq $32, %xmm4
+; SSE-NEXT:    pmuludq %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm3, %xmm5
+; SSE-NEXT:    psrlq $32, %xmm5
+; SSE-NEXT:    pmuludq %xmm1, %xmm5
+; SSE-NEXT:    paddq %xmm4, %xmm5
+; SSE-NEXT:    psllq $32, %xmm5
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    paddq %xmm5, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrlq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm4
+; SSE-NEXT:    psrlq $32, %xmm4
+; SSE-NEXT:    pmuludq %xmm0, %xmm4
+; SSE-NEXT:    paddq %xmm3, %xmm4
+; SSE-NEXT:    psllq $32, %xmm4
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    paddq %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrlq $32, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    psrlq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm0, %xmm3
+; SSE-NEXT:    paddq %xmm2, %xmm3
+; SSE-NEXT:    psllq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    paddq %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrlq $32, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT:    pmuludq %xmm0, %xmm3
+; SSE-NEXT:    paddq %xmm2, %xmm3
+; SSE-NEXT:    psllq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    paddq %xmm3, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v8i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v8i64:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v8i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovq %xmm0, %rax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v8i64:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE-LABEL: test_v16i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm2, %xmm8
+; SSE-NEXT:    psrlq $32, %xmm8
+; SSE-NEXT:    pmuludq %xmm6, %xmm8
+; SSE-NEXT:    movdqa %xmm6, %xmm9
+; SSE-NEXT:    psrlq $32, %xmm9
+; SSE-NEXT:    pmuludq %xmm2, %xmm9
+; SSE-NEXT:    paddq %xmm8, %xmm9
+; SSE-NEXT:    psllq $32, %xmm9
+; SSE-NEXT:    pmuludq %xmm6, %xmm2
+; SSE-NEXT:    paddq %xmm9, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm8
+; SSE-NEXT:    psrlq $32, %xmm8
+; SSE-NEXT:    pmuludq %xmm4, %xmm8
+; SSE-NEXT:    movdqa %xmm4, %xmm6
+; SSE-NEXT:    psrlq $32, %xmm6
+; SSE-NEXT:    pmuludq %xmm0, %xmm6
+; SSE-NEXT:    paddq %xmm8, %xmm6
+; SSE-NEXT:    psllq $32, %xmm6
+; SSE-NEXT:    pmuludq %xmm4, %xmm0
+; SSE-NEXT:    paddq %xmm6, %xmm0
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    psrlq $32, %xmm4
+; SSE-NEXT:    pmuludq %xmm7, %xmm4
+; SSE-NEXT:    movdqa %xmm7, %xmm6
+; SSE-NEXT:    psrlq $32, %xmm6
+; SSE-NEXT:    pmuludq %xmm3, %xmm6
+; SSE-NEXT:    paddq %xmm4, %xmm6
+; SSE-NEXT:    psllq $32, %xmm6
+; SSE-NEXT:    pmuludq %xmm7, %xmm3
+; SSE-NEXT:    paddq %xmm6, %xmm3
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    psrlq $32, %xmm4
+; SSE-NEXT:    pmuludq %xmm5, %xmm4
+; SSE-NEXT:    movdqa %xmm5, %xmm6
+; SSE-NEXT:    psrlq $32, %xmm6
+; SSE-NEXT:    pmuludq %xmm1, %xmm6
+; SSE-NEXT:    paddq %xmm4, %xmm6
+; SSE-NEXT:    psllq $32, %xmm6
+; SSE-NEXT:    pmuludq %xmm5, %xmm1
+; SSE-NEXT:    paddq %xmm6, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    psrlq $32, %xmm4
+; SSE-NEXT:    pmuludq %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm3, %xmm5
+; SSE-NEXT:    psrlq $32, %xmm5
+; SSE-NEXT:    pmuludq %xmm1, %xmm5
+; SSE-NEXT:    paddq %xmm4, %xmm5
+; SSE-NEXT:    psllq $32, %xmm5
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    paddq %xmm5, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrlq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm4
+; SSE-NEXT:    psrlq $32, %xmm4
+; SSE-NEXT:    pmuludq %xmm0, %xmm4
+; SSE-NEXT:    paddq %xmm3, %xmm4
+; SSE-NEXT:    psllq $32, %xmm4
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    paddq %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrlq $32, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    psrlq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm0, %xmm3
+; SSE-NEXT:    paddq %xmm2, %xmm3
+; SSE-NEXT:    psllq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    paddq %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrlq $32, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT:    pmuludq %xmm0, %xmm3
+; SSE-NEXT:    paddq %xmm2, %xmm3
+; SSE-NEXT:    psllq $32, %xmm3
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    paddq %xmm3, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm5
+; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm6
+; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm6
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm7
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
+; AVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpaddq %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm5, %xmm2
+; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm4
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
+; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm5
+; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm5
+; AVX2-NEXT:    vpaddq %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vpsllq $32, %ymm4, %ymm4
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm4
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
+; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v16i64:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
+; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v16i64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovq %xmm0, %rax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v16i64:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE2-LABEL: test_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE2-LABEL: test_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,0,0]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE2-LABEL: test_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm4, %xmm6
+; SSE2-NEXT:    pmuludq %xmm5, %xmm6
+; SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,2,0,0]
+; SSE2-NEXT:    pmuludq %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmulld %xmm3, %xmm1
+; SSE41-NEXT:    pmulld %xmm2, %xmm0
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE2-LABEL: test_v32i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm4, %xmm0
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm9, %xmm3
+; SSE2-NEXT:    pmuludq %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm10, %xmm2
+; SSE2-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,2,0,0]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmulld %xmm6, %xmm2
+; SSE41-NEXT:    pmulld %xmm4, %xmm0
+; SSE41-NEXT:    pmulld %xmm2, %xmm0
+; SSE41-NEXT:    pmulld %xmm7, %xmm3
+; SSE41-NEXT:    pmulld %xmm5, %xmm1
+; SSE41-NEXT:    pmulld %xmm3, %xmm1
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE-LABEL: test_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE-LABEL: test_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE-LABEL: test_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw %xmm3, %xmm1
+; SSE-NEXT:    pmullw %xmm2, %xmm0
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v32i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v32i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovd %xmm0, %eax
+; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v32i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
+; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE-LABEL: test_v64i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw %xmm6, %xmm2
+; SSE-NEXT:    pmullw %xmm4, %xmm0
+; SSE-NEXT:    pmullw %xmm2, %xmm0
+; SSE-NEXT:    pmullw %xmm7, %xmm3
+; SSE-NEXT:    pmullw %xmm5, %xmm1
+; SSE-NEXT:    pmullw %xmm3, %xmm1
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v64i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v64i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v64i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovd %xmm0, %eax
+; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v64i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
+; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmullw %xmm3, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmullw %xmm3, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v16i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQVL-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pmullw %xmm2, %xmm3
+; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    packuswb %xmm0, %xmm3
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    packuswb %xmm0, %xmm3
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pmullw %xmm2, %xmm3
+; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    packuswb %xmm0, %xmm3
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE41-NEXT:    pmullw %xmm2, %xmm3
+; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    packuswb %xmm0, %xmm3
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    pmullw %xmm0, %xmm3
+; SSE41-NEXT:    pextrb $0, %xmm3, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v32i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v32i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v32i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    packuswb %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pmullw %xmm5, %xmm4
+; SSE41-NEXT:    pand %xmm2, %xmm4
+; SSE41-NEXT:    packuswb %xmm0, %xmm4
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm3, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pmullw %xmm0, %xmm5
+; SSE41-NEXT:    pand %xmm2, %xmm5
+; SSE41-NEXT:    packuswb %xmm1, %xmm5
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm5, %xmm4
+; SSE41-NEXT:    pand %xmm2, %xmm4
+; SSE41-NEXT:    pmullw %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    packuswb %xmm4, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmullw %xmm3, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmullw %xmm3, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pmullw %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v64i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm2, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm2, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v64i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v64i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm5, %xmm8
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm8, %xmm9
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm8, %xmm9
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    packuswb %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm7, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm9, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm8, %xmm3
+; SSE2-NEXT:    packuswb %xmm5, %xmm3
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm7
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    packuswb %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    packuswb %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    packuswb %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    pmullw %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm8, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    packuswb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm5, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm5, %xmm1
+; SSE41-NEXT:    pmullw %xmm9, %xmm8
+; SSE41-NEXT:    pand %xmm5, %xmm8
+; SSE41-NEXT:    packuswb %xmm1, %xmm8
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm7, %xmm3
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    pmullw %xmm9, %xmm1
+; SSE41-NEXT:    pand %xmm5, %xmm1
+; SSE41-NEXT:    packuswb %xmm3, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm4, %xmm0
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    pmullw %xmm7, %xmm3
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    packuswb %xmm0, %xmm3
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm6, %xmm2
+; SSE41-NEXT:    pand %xmm5, %xmm2
+; SSE41-NEXT:    pmullw %xmm0, %xmm4
+; SSE41-NEXT:    pand %xmm5, %xmm4
+; SSE41-NEXT:    packuswb %xmm2, %xmm4
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm4, %xmm3
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    packuswb %xmm3, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm1, %xmm8
+; SSE41-NEXT:    pand %xmm5, %xmm8
+; SSE41-NEXT:    pmullw %xmm2, %xmm3
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    packuswb %xmm8, %xmm3
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm3, %xmm0
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    pmullw %xmm1, %xmm2
+; SSE41-NEXT:    pand %xmm5, %xmm2
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    pmullw %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm5, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT:    pmullw %xmm1, %xmm2
+; SSE41-NEXT:    pand %xmm5, %xmm2
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE41-NEXT:    pmullw %xmm1, %xmm2
+; SSE41-NEXT:    pand %xmm5, %xmm2
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pmullw %xmm0, %xmm2
+; SSE41-NEXT:    pextrb $0, %xmm2, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpmullw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; AVX1-NEXT:    vpmullw %xmm9, %xmm7, %xmm7
+; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm6
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpmullw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v128i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpmullw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v128i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpmullw %zmm4, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpmullw %zmm4, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm3, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm3, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpmullw %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v128i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v128i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
+; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm5, %ymm5
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-mul.ll b/test/CodeGen/X86/vector-reduce-mul.ll
index 58d712c35aacecaa16c3fede79828c30694339f5..ea80a5be2a6b9949faca34f5d87fe0ca5857a1d1 100644
--- a/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/test/CodeGen/X86/vector-reduce-mul.ll
@@ -775,6 +775,54 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmuludq %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v2i32:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v2i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovd %xmm0, %eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v2i32:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
+; AVX512DQVL-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE2-LABEL: test_v4i32:
 ; SSE2:       # %bb.0:
@@ -1091,6 +1139,110 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmuludq %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovd %xmm0, %eax
+; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v2i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
+; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512DQVL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE2-LABEL: test_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pextrw $0, %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE:       # %bb.0:
@@ -1457,215 +1609,365 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pmullw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    packuswb %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT:    pmullw %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
-; SSE41-LABEL: test_v16i8:
+; SSE41-LABEL: test_v2i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    pmullw %xmm1, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    packuswb %xmm2, %xmm0
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmullw %xmm3, %xmm0
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    packuswb %xmm2, %xmm0
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pmullw %xmm3, %xmm0
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    packuswb %xmm2, %xmm0
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    pmullw %xmm1, %xmm0
-; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmuludq %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: test_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    # kill: def $al killed $al killed $eax
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
 ;
-; AVX512BW-LABEL: test_v16i8:
+; AVX512BW-LABEL: test_v2i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
-; AVX512BWVL-LABEL: test_v16i8:
+; AVX512BWVL-LABEL: test_v2i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
-; AVX512DQ-LABEL: test_v16i8:
+; AVX512DQ-LABEL: test_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512DQVL-LABEL: test_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512DQVL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmullw %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pmullw %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmullw %xmm3, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmullw %xmm3, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: test_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v16i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
@@ -1675,28 +1977,25 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX512DQVL-LABEL: test_v16i8:
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQVL-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
@@ -1726,39 +2025,35 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    packuswb %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,2,3,3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSE2-NEXT:    pmullw %xmm0, %xmm3
 ; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test_v32i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm1, %xmm0
@@ -1768,7 +2063,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; SSE41-NEXT:    pand %xmm1, %xmm3
 ; SSE41-NEXT:    packuswb %xmm0, %xmm3
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm0, %xmm3
 ; SSE41-NEXT:    pand %xmm1, %xmm3
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
@@ -1794,56 +2089,42 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ;
 ; AVX1-LABEL: test_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -1851,43 +2132,40 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ;
 ; AVX2-LABEL: test_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1896,28 +2174,24 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512BW-LABEL: test_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
@@ -1928,28 +2202,24 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512BWVL-LABEL: test_v32i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
@@ -1959,36 +2229,40 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ;
 ; AVX512DQ-LABEL: test_v32i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1997,35 +2271,39 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512DQVL-LABEL: test_v32i8:
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -2076,39 +2354,35 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSE2-NEXT:    pmullw %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test_v64i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
@@ -2118,25 +2392,25 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE41-NEXT:    pand %xmm2, %xmm4
 ; SSE41-NEXT:    packuswb %xmm0, %xmm4
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm3, %xmm1
 ; SSE41-NEXT:    pand %xmm2, %xmm1
 ; SSE41-NEXT:    pmullw %xmm0, %xmm5
 ; SSE41-NEXT:    pand %xmm2, %xmm5
 ; SSE41-NEXT:    packuswb %xmm1, %xmm5
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm5, %xmm4
 ; SSE41-NEXT:    pand %xmm2, %xmm4
 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
 ; SSE41-NEXT:    pand %xmm2, %xmm1
 ; SSE41-NEXT:    packuswb %xmm4, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
 ; SSE41-NEXT:    pand %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
@@ -2162,7 +2436,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ;
 ; AVX1-LABEL: test_v64i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
@@ -2173,7 +2447,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
@@ -2184,7 +2458,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -2193,44 +2467,30 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm3
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -2238,58 +2498,49 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ;
 ; AVX2-LABEL: test_v64i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm2
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm3
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm1
-; AVX2-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -2298,35 +2549,49 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512BW-LABEL: test_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512BW-NEXT:    vzeroupper
@@ -2335,35 +2600,49 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512BWVL-LABEL: test_v64i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpmullw %zmm4, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm2, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm2, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512BWVL-NEXT:    vzeroupper
@@ -2371,47 +2650,49 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ;
 ; AVX512DQ-LABEL: test_v64i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm2
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm3
-; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vpmovsxwd %ymm2, %zmm2
-; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm2, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2419,47 +2700,49 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ;
 ; AVX512DQVL-LABEL: test_v64i8:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm2
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm3
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm2, %zmm2
-; AVX512DQVL-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512DQVL-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm2, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -2554,41 +2837,37 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSE2-NEXT:    pmullw %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm8, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test_v128i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm5, %xmm1
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pand %xmm5, %xmm1
@@ -2596,16 +2875,16 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE41-NEXT:    pand %xmm5, %xmm8
 ; SSE41-NEXT:    packuswb %xmm1, %xmm8
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm7, %xmm3
 ; SSE41-NEXT:    pand %xmm5, %xmm3
 ; SSE41-NEXT:    pmullw %xmm9, %xmm1
 ; SSE41-NEXT:    pand %xmm5, %xmm1
 ; SSE41-NEXT:    packuswb %xmm3, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm4, %xmm0
@@ -2614,34 +2893,34 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE41-NEXT:    pand %xmm5, %xmm3
 ; SSE41-NEXT:    packuswb %xmm0, %xmm3
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm6, %xmm2
 ; SSE41-NEXT:    pand %xmm5, %xmm2
 ; SSE41-NEXT:    pmullw %xmm0, %xmm4
 ; SSE41-NEXT:    pand %xmm5, %xmm4
 ; SSE41-NEXT:    packuswb %xmm2, %xmm4
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm4, %xmm3
 ; SSE41-NEXT:    pand %xmm5, %xmm3
 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
 ; SSE41-NEXT:    pand %xmm5, %xmm0
 ; SSE41-NEXT:    packuswb %xmm3, %xmm0
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm1, %xmm8
 ; SSE41-NEXT:    pand %xmm5, %xmm8
 ; SSE41-NEXT:    pmullw %xmm2, %xmm3
 ; SSE41-NEXT:    pand %xmm5, %xmm3
 ; SSE41-NEXT:    packuswb %xmm8, %xmm3
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm3, %xmm0
@@ -2650,7 +2929,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE41-NEXT:    pand %xmm5, %xmm2
 ; SSE41-NEXT:    packuswb %xmm0, %xmm2
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE41-NEXT:    pmullw %xmm0, %xmm2
 ; SSE41-NEXT:    pand %xmm5, %xmm2
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
@@ -2677,9 +2956,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-LABEL: test_v128i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
 ; AVX1-NEXT:    vpmullw %xmm4, %xmm7, %xmm7
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
@@ -2689,9 +2968,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
 ; AVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
 ; AVX1-NEXT:    vpmullw %xmm9, %xmm7, %xmm7
 ; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
@@ -2699,7 +2978,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
 ; AVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm6
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpmullw %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
@@ -2708,8 +2987,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
@@ -2717,7 +2996,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
@@ -2726,8 +3005,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
@@ -2735,7 +3014,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
@@ -2745,44 +3024,30 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -2790,88 +3055,67 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ;
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vpmovsxbw %xmm4, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
-; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX2-NEXT:    vpmovsxbw %xmm6, %ymm6
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
-; AVX2-NEXT:    vpmovsxbw %xmm7, %ymm7
-; AVX2-NEXT:    vpmullw %ymm6, %ymm7, %ymm6
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT:    vpshufb %xmm4, %xmm7, %xmm7
-; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm2
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpmovsxbw %xmm6, %ymm1
-; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm2
-; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -2879,45 +3123,59 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ;
 ; AVX512BW-LABEL: test_v128i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm2
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm3
-; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm2, %zmm1
-; AVX512BW-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BW-NEXT:    vpmullw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512BW-NEXT:    vpmullw %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpmullw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512BW-NEXT:    vzeroupper
@@ -2925,45 +3183,59 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ;
 ; AVX512BWVL-LABEL: test_v128i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm2
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm3
-; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm2, %zmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpmullw %zmm4, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; AVX512BWVL-NEXT:    vpmullw %zmm4, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm3, %zmm3
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm3, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm3, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpmullw %zmm3, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512BWVL-NEXT:    vzeroupper
@@ -2971,69 +3243,67 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ;
 ; AVX512DQ-LABEL: test_v128i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512DQ-NEXT:    vpmovsxbw %xmm4, %ymm4
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm5
-; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT:    vpmovsxwd %ymm4, %zmm4
-; AVX512DQ-NEXT:    vpmovdb %zmm4, %xmm4
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm5
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-NEXT:    vpmovsxbw %xmm6, %ymm6
-; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT:    vpmovsxwd %ymm5, %zmm5
-; AVX512DQ-NEXT:    vpmovdb %zmm5, %xmm5
-; AVX512DQ-NEXT:    vpmovsxbw %xmm2, %ymm2
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm3, %ymm2
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm4, %ymm2
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -3041,69 +3311,67 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ;
 ; AVX512DQVL-LABEL: test_v128i8:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm4, %ymm4
-; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm5
-; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm4, %zmm4
-; AVX512DQVL-NEXT:    vpmovdb %zmm4, %xmm4
-; AVX512DQVL-NEXT:    vextracti128 $1, %ymm3, %xmm5
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm5
-; AVX512DQVL-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm6, %ymm6
-; AVX512DQVL-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm5, %zmm5
-; AVX512DQVL-NEXT:    vpmovdb %zmm5, %xmm5
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm2, %ymm2
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
+; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm5, %ymm5
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm3, %ymm2
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm4, %ymm2
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm2, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -3117,16 +3385,22 @@ declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-or-widen.ll b/test/CodeGen/X86/vector-reduce-or-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4de56b0b805f5ee0e68b96888a76bef7922169c6
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-or-widen.ll
@@ -0,0 +1,1265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE-LABEL: test_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE-LABEL: test_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE-LABEL: test_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE-LABEL: test_v16i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm6, %xmm2
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm5, %xmm3
+; SSE-NEXT:    por %xmm1, %xmm3
+; SSE-NEXT:    por %xmm4, %xmm2
+; SSE-NEXT:    por %xmm3, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE-LABEL: test_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE-LABEL: test_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE-LABEL: test_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE-LABEL: test_v32i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm6, %xmm2
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm5, %xmm3
+; SSE-NEXT:    por %xmm1, %xmm3
+; SSE-NEXT:    por %xmm4, %xmm2
+; SSE-NEXT:    por %xmm3, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE-LABEL: test_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE-LABEL: test_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE-LABEL: test_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE-LABEL: test_v64i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm6, %xmm2
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm5, %xmm3
+; SSE-NEXT:    por %xmm1, %xmm3
+; SSE-NEXT:    por %xmm4, %xmm2
+; SSE-NEXT:    por %xmm3, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    por %xmm3, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    por %xmm6, %xmm2
+; SSE41-NEXT:    por %xmm7, %xmm3
+; SSE41-NEXT:    por %xmm5, %xmm3
+; SSE41-NEXT:    por %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm4, %xmm2
+; SSE41-NEXT:    por %xmm3, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v128i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-or.ll b/test/CodeGen/X86/vector-reduce-or.ll
index 1b67c94e4ec5b07073b4a50b5f9ab6e81281edc9..1cb6af66bf4651f8075c274d8c4a5f10df242cb6 100644
--- a/test/CodeGen/X86/vector-reduce-or.ll
+++ b/test/CodeGen/X86/vector-reduce-or.ll
@@ -186,6 +186,31 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE-LABEL: test_v4i32:
 ; SSE:       # %bb.0:
@@ -392,6 +417,68 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE:       # %bb.0:
@@ -647,6 +734,140 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i8:
 ; SSE2:       # %bb.0:
@@ -1012,16 +1233,22 @@ declare i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-smax-widen.ll b/test/CodeGen/X86/vector-reduce-smax-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b31c2cf5dbb3ec0073ee0c8ce69f5851eae1a4d4
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-smax-widen.ll
@@ -0,0 +1,2031 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE2-LABEL: test_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE2-LABEL: test_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm4
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE2-LABEL: test_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm6
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm1
+; SSE41-NEXT:    xorpd %xmm5, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE2-LABEL: test_v16i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm10, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm9
+; SSE2-NEXT:    por %xmm1, %xmm9
+; SSE2-NEXT:    movdqa %xmm7, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm9
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm9, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm8
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm5, %xmm10
+; SSE41-NEXT:    pxor %xmm9, %xmm10
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm11
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm0
+; SSE41-NEXT:    por %xmm11, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm10
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
+; SSE41-NEXT:    pxor %xmm9, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    movapd %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm5, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm6, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE2-LABEL: test_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE2-LABEL: test_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE2-LABEL: test_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsd %xmm3, %xmm1
+; SSE41-NEXT:    pmaxsd %xmm2, %xmm0
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE2-LABEL: test_v32i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm2, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm8
+; SSE2-NEXT:    por %xmm2, %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsd %xmm6, %xmm2
+; SSE41-NEXT:    pmaxsd %xmm4, %xmm0
+; SSE41-NEXT:    pmaxsd %xmm2, %xmm0
+; SSE41-NEXT:    pmaxsd %xmm7, %xmm3
+; SSE41-NEXT:    pmaxsd %xmm5, %xmm1
+; SSE41-NEXT:    pmaxsd %xmm3, %xmm1
+; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxsd %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpmaxsd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpmaxsd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsd %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxsd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmaxsd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE2-LABEL: test_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE2-LABEL: test_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE2-LABEL: test_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsw %xmm3, %xmm1
+; SSE41-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE41-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE2-LABEL: test_v64i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pmaxsw %xmm6, %xmm2
+; SSE2-NEXT:    pmaxsw %xmm4, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm7, %xmm3
+; SSE2-NEXT:    pmaxsw %xmm5, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsw %xmm7, %xmm3
+; SSE41-NEXT:    pmaxsw %xmm5, %xmm1
+; SSE41-NEXT:    pmaxsw %xmm3, %xmm1
+; SSE41-NEXT:    pmaxsw %xmm6, %xmm2
+; SSE41-NEXT:    pmaxsw %xmm4, %xmm0
+; SSE41-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE41-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmaxsw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmaxsw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpmaxsw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmaxsw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxsw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pmaxsb %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pmaxsb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmaxsb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pmaxsb %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtb %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsb %xmm3, %xmm1
+; SSE41-NEXT:    pmaxsb %xmm2, %xmm0
+; SSE41-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm2, %xmm8
+; SSE2-NEXT:    pcmpgtb %xmm6, %xmm8
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm8
+; SSE2-NEXT:    por %xmm2, %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxsb %xmm7, %xmm3
+; SSE41-NEXT:    pmaxsb %xmm5, %xmm1
+; SSE41-NEXT:    pmaxsb %xmm3, %xmm1
+; SSE41-NEXT:    pmaxsb %xmm6, %xmm2
+; SSE41-NEXT:    pmaxsb %xmm4, %xmm0
+; SSE41-NEXT:    pmaxsb %xmm2, %xmm0
+; SSE41-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmaxsb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmaxsb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpmaxsb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmaxsb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxsb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmaxsb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v128i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-smax.ll b/test/CodeGen/X86/vector-reduce-smax.ll
index 107b6f8f9530d0d31d02c5f250476ba380f8b693..01f453bd982d59413cb2dd904285e8be678a6e24 100644
--- a/test/CodeGen/X86/vector-reduce-smax.ll
+++ b/test/CodeGen/X86/vector-reduce-smax.ll
@@ -36,17 +36,16 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movq %xmm2, %rax
@@ -120,17 +119,16 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    pxor %xmm3, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
 ; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
@@ -141,10 +139,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movq %xmm2, %rax
 ; SSE41-NEXT:    retq
@@ -270,17 +267,16 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
 ; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -291,10 +287,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm3, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
@@ -304,10 +299,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -317,10 +311,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    retq
@@ -513,10 +506,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm11
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm12, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm0
+; SSE41-NEXT:    por %xmm11, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
@@ -526,22 +518,20 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm11, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm8, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
 ; SSE41-NEXT:    pxor %xmm9, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
+; SSE41-NEXT:    pxor %xmm9, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm10, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE41-NEXT:    movdqa %xmm6, %xmm0
@@ -552,10 +542,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE41-NEXT:    movapd %xmm6, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -565,10 +554,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -578,10 +566,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -591,10 +578,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
@@ -604,10 +590,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm9, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    retq
@@ -682,6 +667,116 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movd %xmm3, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpsraq $32, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT:    vpsraq $32, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmaxsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE2-LABEL: test_v4i32:
 ; SSE2:       # %bb.0:
@@ -1019,6 +1114,216 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE2-LABEL: test_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psllq $48, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    psllq $48, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    psllq $48, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    psllq $48, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpsraq $48, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsraq $48, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpsraq $48, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmaxsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE2-LABEL: test_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pslld $16, %xmm1
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpslld $16, %xmm1, %xmm1
+; AVX-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpslld $16, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE2-LABEL: test_v8i16:
 ; SSE2:       # %bb.0:
@@ -1311,6 +1616,302 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psllq $56, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    psllq $56, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    psllq $56, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    psllq $56, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $56, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpsraq $56, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsraq $56, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $56, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpsraq $56, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmaxsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pslld $24, %xmm1
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pslld $24, %xmm1
+; SSE41-NEXT:    psrad $24, %xmm1
+; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm1
+; AVX-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpslld $24, %xmm1, %xmm1
+; AVX-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpslld $24, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    psraw $8, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    psraw $8, %xmm0
+; SSE41-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    psllw $8, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i8:
 ; SSE2:       # %bb.0:
@@ -1758,16 +2359,22 @@ declare i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-smin-widen.ll b/test/CodeGen/X86/vector-reduce-smin-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..140df76e8ec2d9f2a37f7c54c228eb053f6fff75
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-smin-widen.ll
@@ -0,0 +1,2030 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE2-LABEL: test_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpminsq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE2-LABEL: test_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpminsq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpminsq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE2-LABEL: test_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm1
+; SSE41-NEXT:    xorpd %xmm5, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE2-LABEL: test_v16i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa %xmm6, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm10, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm9
+; SSE2-NEXT:    por %xmm2, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm7, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm9, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm2, %xmm10
+; SSE41-NEXT:    pxor %xmm9, %xmm10
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm11
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm0
+; SSE41-NEXT:    por %xmm11, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm2
+; SSE41-NEXT:    pxor %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm10
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm7, %xmm2
+; SSE41-NEXT:    pxor %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm8
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm0
+; SSE41-NEXT:    por %xmm8, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
+; SSE41-NEXT:    pxor %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm6, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
+; SSE41-NEXT:    movapd %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm9, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm4
+; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pminsd %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE2-LABEL: test_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pminsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pminsd %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE2-LABEL: test_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pminsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pminsd %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE2-LABEL: test_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsd %xmm3, %xmm1
+; SSE41-NEXT:    pminsd %xmm2, %xmm0
+; SSE41-NEXT:    pminsd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pminsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pminsd %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE2-LABEL: test_v32i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm5, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm8
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm8
+; SSE2-NEXT:    por %xmm1, %xmm8
+; SSE2-NEXT:    movdqa %xmm7, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm8
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsd %xmm6, %xmm2
+; SSE41-NEXT:    pminsd %xmm4, %xmm0
+; SSE41-NEXT:    pminsd %xmm2, %xmm0
+; SSE41-NEXT:    pminsd %xmm7, %xmm3
+; SSE41-NEXT:    pminsd %xmm5, %xmm1
+; SSE41-NEXT:    pminsd %xmm3, %xmm1
+; SSE41-NEXT:    pminsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pminsd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pminsd %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpminsd %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpminsd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpminsd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsd %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminsd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpminsd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pminsw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pminsw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pminsw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE2-LABEL: test_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE2-LABEL: test_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE2-LABEL: test_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pminsw %xmm3, %xmm1
+; SSE2-NEXT:    pminsw %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsw %xmm3, %xmm1
+; SSE41-NEXT:    pminsw %xmm2, %xmm0
+; SSE41-NEXT:    pminsw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE2-LABEL: test_v64i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pminsw %xmm6, %xmm2
+; SSE2-NEXT:    pminsw %xmm4, %xmm0
+; SSE2-NEXT:    pminsw %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm7, %xmm3
+; SSE2-NEXT:    pminsw %xmm5, %xmm1
+; SSE2-NEXT:    pminsw %xmm3, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsw %xmm7, %xmm3
+; SSE41-NEXT:    pminsw %xmm5, %xmm1
+; SSE41-NEXT:    pminsw %xmm3, %xmm1
+; SSE41-NEXT:    pminsw %xmm6, %xmm2
+; SSE41-NEXT:    pminsw %xmm4, %xmm0
+; SSE41-NEXT:    pminsw %xmm2, %xmm0
+; SSE41-NEXT:    pminsw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpminsw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpminsw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpminsw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpminsw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminsw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpminsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pminsb %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pminsb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pminsb %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pminsb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pminsb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pminsb %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsb %xmm3, %xmm1
+; SSE41-NEXT:    pminsb %xmm2, %xmm0
+; SSE41-NEXT:    pminsb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm5, %xmm8
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm8
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm8
+; SSE2-NEXT:    por %xmm1, %xmm8
+; SSE2-NEXT:    movdqa %xmm7, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm8
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminsb %xmm7, %xmm3
+; SSE41-NEXT:    pminsb %xmm5, %xmm1
+; SSE41-NEXT:    pminsb %xmm3, %xmm1
+; SSE41-NEXT:    pminsb %xmm6, %xmm2
+; SSE41-NEXT:    pminsb %xmm4, %xmm0
+; SSE41-NEXT:    pminsb %xmm2, %xmm0
+; SSE41-NEXT:    pminsb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpminsb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpminsb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpminsb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpminsb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminsb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpminsb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v128i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-smin.ll b/test/CodeGen/X86/vector-reduce-smin.ll
index 7edcc16688fbd01125a76ad9e4d0477129127bbf..e8b6a242b8bf83387d3043ad21b273e2063e808b 100644
--- a/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/test/CodeGen/X86/vector-reduce-smin.ll
@@ -44,10 +44,9 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movq %xmm2, %rax
 ; SSE41-NEXT:    retq
@@ -127,10 +126,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -140,10 +138,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movq %xmm2, %rax
 ; SSE41-NEXT:    retq
@@ -277,10 +274,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
@@ -290,10 +286,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
@@ -303,10 +298,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -316,10 +310,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    retq
@@ -512,10 +505,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm11
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm12, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm0
+; SSE41-NEXT:    por %xmm11, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
@@ -525,10 +517,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pand %xmm11, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
@@ -538,10 +529,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm0
+; SSE41-NEXT:    por %xmm8, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
@@ -551,10 +541,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE41-NEXT:    movapd %xmm5, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -564,10 +553,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -577,10 +565,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
 ; SSE41-NEXT:    movapd %xmm6, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -590,10 +577,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
@@ -603,10 +589,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    retq
@@ -681,6 +666,116 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movd %xmm3, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpsraq $32, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT:    vpsraq $32, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpminsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE2-LABEL: test_v4i32:
 ; SSE2:       # %bb.0:
@@ -1018,6 +1113,216 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE2-LABEL: test_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psllq $48, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    psllq $48, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    psllq $48, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    psllq $48, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpsraq $48, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsraq $48, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpsraq $48, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpminsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE2-LABEL: test_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pslld $16, %xmm1
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    pminsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pminsd %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX-NEXT:    vpminsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpslld $16, %xmm1, %xmm1
+; AVX-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpminsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpslld $16, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX512-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE2-LABEL: test_v8i16:
 ; SSE2:       # %bb.0:
@@ -1310,6 +1615,302 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psllq $56, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    psllq $56, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    psllq $56, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    psllq $56, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllq $56, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpsraq $56, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsraq $56, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $56, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpsraq $56, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpminsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pslld $24, %xmm1
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pslld $24, %xmm1
+; SSE41-NEXT:    psrad $24, %xmm1
+; SSE41-NEXT:    pminsd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pminsd %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm1
+; AVX-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX-NEXT:    vpminsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpslld $24, %xmm1, %xmm1
+; AVX-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpminsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpslld $24, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX512-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    psraw $8, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    pminsw %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    psraw $8, %xmm0
+; SSE41-NEXT:    pminsw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    psllw $8, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    pminsw %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpminsw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpminsw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i8:
 ; SSE2:       # %bb.0:
@@ -1757,16 +2358,22 @@ declare i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-umax-widen.ll b/test/CodeGen/X86/vector-reduce-umax-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3b6cdad368277c17d9bfb1d73dece02273942e4d
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-umax-widen.ll
@@ -0,0 +1,2203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE2-LABEL: test_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE2-LABEL: test_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm4
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE2-LABEL: test_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm6
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm1
+; SSE41-NEXT:    xorpd %xmm5, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm2
+; AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vxorpd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE2-LABEL: test_v16i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm10, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm9
+; SSE2-NEXT:    por %xmm1, %xmm9
+; SSE2-NEXT:    movdqa %xmm7, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm9
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm9, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm8
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm5, %xmm10
+; SSE41-NEXT:    pxor %xmm9, %xmm10
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm11
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm0
+; SSE41-NEXT:    por %xmm11, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm10
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
+; SSE41-NEXT:    pxor %xmm9, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    movapd %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm5, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm6, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vxorpd %xmm4, %xmm1, %xmm3
+; AVX1-NEXT:    vxorpd %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm2
+; AVX1-NEXT:    vxorpd %xmm4, %xmm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorpd %xmm4, %xmm0, %xmm2
+; AVX1-NEXT:    vxorpd %xmm4, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm4, %ymm2, %ymm5
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm6
+; AVX2-NEXT:    vpcmpgtq %ymm5, %ymm6, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm2
+; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vxorpd %ymm4, %ymm1, %ymm2
+; AVX2-NEXT:    vxorpd %ymm4, %ymm0, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vxorpd %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vxorpd %ymm4, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vxorpd %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vxorpd %ymm4, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE2-LABEL: test_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE2-LABEL: test_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE2-LABEL: test_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxud %xmm3, %xmm1
+; SSE41-NEXT:    pmaxud %xmm2, %xmm0
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE2-LABEL: test_v32i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm5, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm9
+; SSE2-NEXT:    por %xmm1, %xmm9
+; SSE2-NEXT:    movdqa %xmm7, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm9
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm9, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxud %xmm6, %xmm2
+; SSE41-NEXT:    pmaxud %xmm4, %xmm0
+; SSE41-NEXT:    pmaxud %xmm2, %xmm0
+; SSE41-NEXT:    pmaxud %xmm7, %xmm3
+; SSE41-NEXT:    pmaxud %xmm5, %xmm1
+; SSE41-NEXT:    pmaxud %xmm3, %xmm1
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpmaxud %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxud %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmaxud %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE2-LABEL: test_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE2-LABEL: test_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE2-LABEL: test_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE2-LABEL: test_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v16i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE2-LABEL: test_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxuw %xmm3, %xmm1
+; SSE41-NEXT:    pmaxuw %xmm2, %xmm0
+; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v32i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE2-LABEL: test_v64i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pmaxsw %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm4, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pmaxsw %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm5, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxuw %xmm7, %xmm3
+; SSE41-NEXT:    pmaxuw %xmm5, %xmm1
+; SSE41-NEXT:    pmaxuw %xmm3, %xmm1
+; SSE41-NEXT:    pmaxuw %xmm6, %xmm2
+; SSE41-NEXT:    pmaxuw %xmm4, %xmm0
+; SSE41-NEXT:    pmaxuw %xmm2, %xmm0
+; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmaxuw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmaxuw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpmaxuw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmaxuw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxuw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxuw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmaxuw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v64i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v64i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pmaxub %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pmaxub %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmaxub %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmaxub %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmaxub %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pmaxub %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxub %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pmaxub %xmm3, %xmm1
+; SSE2-NEXT:    pmaxub %xmm2, %xmm0
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxub %xmm3, %xmm1
+; SSE41-NEXT:    pmaxub %xmm2, %xmm0
+; SSE41-NEXT:    pmaxub %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v64i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pmaxub %xmm6, %xmm2
+; SSE2-NEXT:    pmaxub %xmm4, %xmm0
+; SSE2-NEXT:    pmaxub %xmm2, %xmm0
+; SSE2-NEXT:    pmaxub %xmm7, %xmm3
+; SSE2-NEXT:    pmaxub %xmm5, %xmm1
+; SSE2-NEXT:    pmaxub %xmm3, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmaxub %xmm7, %xmm3
+; SSE41-NEXT:    pmaxub %xmm5, %xmm1
+; SSE41-NEXT:    pmaxub %xmm3, %xmm1
+; SSE41-NEXT:    pmaxub %xmm6, %xmm2
+; SSE41-NEXT:    pmaxub %xmm4, %xmm0
+; SSE41-NEXT:    pmaxub %xmm2, %xmm0
+; SSE41-NEXT:    pmaxub %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pminub %xmm0, %xmm2
+; SSE41-NEXT:    phminposuw %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmaxub %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmaxub %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpmaxub %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmaxub %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxub %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxub %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmaxub %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v128i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v128i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-umax.ll b/test/CodeGen/X86/vector-reduce-umax.ll
index 52b42ce9bcbf9dd6e6c4ce7bc164b93df9f3ac60..ffc98bb289d762543ff792ef1a6a020620cd2f8e 100644
--- a/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/test/CodeGen/X86/vector-reduce-umax.ll
@@ -36,17 +36,16 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movq %xmm2, %rax
@@ -123,17 +122,16 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    pxor %xmm3, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
 ; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
@@ -144,10 +142,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movq %xmm2, %rax
 ; SSE41-NEXT:    retq
@@ -283,17 +280,16 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
 ; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm7
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm8, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -304,10 +300,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm3, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
@@ -317,10 +312,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -330,10 +324,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    retq
@@ -542,10 +535,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm11
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm12, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm0
+; SSE41-NEXT:    por %xmm11, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
@@ -555,22 +547,20 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm11, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    movdqa %xmm8, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
 ; SSE41-NEXT:    pxor %xmm9, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
+; SSE41-NEXT:    pxor %xmm9, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm10, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
 ; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE41-NEXT:    movdqa %xmm6, %xmm0
@@ -581,10 +571,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE41-NEXT:    movapd %xmm6, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -594,10 +583,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -607,10 +595,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -620,10 +607,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
@@ -633,10 +619,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm9, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    retq
@@ -739,6 +724,98 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movd %xmm2, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512BW-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    vpmaxuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE2-LABEL: test_v4i32:
 ; SSE2:       # %bb.0:
@@ -1133,6 +1210,167 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE2-LABEL: test_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm0, %xmm3
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movd %xmm2, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512VL-NEXT:    vpmaxuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE2-LABEL: test_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7]
+; SSE41-NEXT:    pmaxud %xmm0, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX512-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE2-LABEL: test_v8i16:
 ; SSE2:       # %bb.0:
@@ -1526,6 +1764,274 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    por %xmm3, %xmm4
+; SSE41-NEXT:    por %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    pextrb $0, %xmm2, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpmaxuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmaxud %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmaxuw %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmaxuw %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i8:
 ; SSE2:       # %bb.0:
@@ -1921,16 +2427,22 @@ declare i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-umin-widen.ll b/test/CodeGen/X86/vector-reduce-umin-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..51d2a25130d0a39cf0f69b797800d55317baef0c
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-umin-widen.ll
@@ -0,0 +1,2008 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE2-LABEL: test_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpminuq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE2-LABEL: test_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v4i64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpminuq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpminuq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE2-LABEL: test_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    movdqa %xmm6, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm1
+; SSE41-NEXT:    xorpd %xmm5, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vxorpd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE2-LABEL: test_v16i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa %xmm6, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm10, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm9
+; SSE2-NEXT:    por %xmm2, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm7, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm9, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm2, %xmm10
+; SSE41-NEXT:    pxor %xmm9, %xmm10
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm11
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm11
+; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm0
+; SSE41-NEXT:    por %xmm11, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm2
+; SSE41-NEXT:    pxor %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm10
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm7, %xmm2
+; SSE41-NEXT:    pxor %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm8
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm0
+; SSE41-NEXT:    por %xmm8, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
+; SSE41-NEXT:    pxor %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm6, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
+; SSE41-NEXT:    movapd %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm1
+; SSE41-NEXT:    xorpd %xmm9, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm9, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vxorpd %xmm4, %xmm0, %xmm3
+; AVX1-NEXT:    vxorpd %xmm4, %xmm1, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorpd %xmm4, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorpd %xmm4, %xmm0, %xmm2
+; AVX1-NEXT:    vxorpd %xmm4, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm6
+; AVX2-NEXT:    vpcmpgtq %ymm5, %ymm6, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %ymm4, %ymm2, %ymm5
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vxorpd %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vxorpd %ymm4, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vxorpd %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vxorpd %ymm4, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vxorpd %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vxorpd %ymm4, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE2-LABEL: test_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pminud %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE2-LABEL: test_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminud %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pminud %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE2-LABEL: test_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pxor %xmm4, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm1, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm6, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminud %xmm3, %xmm1
+; SSE41-NEXT:    pminud %xmm2, %xmm0
+; SSE41-NEXT:    pminud %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pminud %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE2-LABEL: test_v32i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm10
+; SSE2-NEXT:    movdqa %xmm6, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm9
+; SSE2-NEXT:    por %xmm2, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm7, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm7, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm5, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm9, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm8
+; SSE2-NEXT:    pand %xmm8, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm8
+; SSE2-NEXT:    por %xmm3, %xmm8
+; SSE2-NEXT:    movd %xmm8, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminud %xmm6, %xmm2
+; SSE41-NEXT:    pminud %xmm4, %xmm0
+; SSE41-NEXT:    pminud %xmm2, %xmm0
+; SSE41-NEXT:    pminud %xmm7, %xmm3
+; SSE41-NEXT:    pminud %xmm5, %xmm1
+; SSE41-NEXT:    pminud %xmm3, %xmm1
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pminud %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pminud %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpminud %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminud %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpminud %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE2-LABEL: test_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pminuw %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE2-LABEL: test_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pminuw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pminuw %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE2-LABEL: test_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE2-LABEL: test_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminuw %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE2-LABEL: test_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pminsw %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pminsw %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminuw %xmm3, %xmm1
+; SSE41-NEXT:    pminuw %xmm2, %xmm0
+; SSE41-NEXT:    pminuw %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE2-LABEL: test_v64i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pminsw %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pminsw %xmm4, %xmm0
+; SSE2-NEXT:    pminsw %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pminsw %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pminsw %xmm5, %xmm1
+; SSE2-NEXT:    pminsw %xmm3, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminuw %xmm7, %xmm3
+; SSE41-NEXT:    pminuw %xmm5, %xmm1
+; SSE41-NEXT:    pminuw %xmm3, %xmm1
+; SSE41-NEXT:    pminuw %xmm6, %xmm2
+; SSE41-NEXT:    pminuw %xmm4, %xmm0
+; SSE41-NEXT:    pminuw %xmm2, %xmm0
+; SSE41-NEXT:    pminuw %xmm1, %xmm0
+; SSE41-NEXT:    phminposuw %xmm0, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpminuw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpminuw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpminuw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpminuw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminuw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminuw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpminuw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pminub %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pminub %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    phminposuw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminub %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    phminposuw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pminub %xmm3, %xmm1
+; SSE2-NEXT:    pminub %xmm2, %xmm0
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminub %xmm3, %xmm1
+; SSE41-NEXT:    pminub %xmm2, %xmm0
+; SSE41-NEXT:    pminub %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    phminposuw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pminub %xmm6, %xmm2
+; SSE2-NEXT:    pminub %xmm4, %xmm0
+; SSE2-NEXT:    pminub %xmm2, %xmm0
+; SSE2-NEXT:    pminub %xmm7, %xmm3
+; SSE2-NEXT:    pminub %xmm5, %xmm1
+; SSE2-NEXT:    pminub %xmm3, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminub %xmm7, %xmm3
+; SSE41-NEXT:    pminub %xmm5, %xmm1
+; SSE41-NEXT:    pminub %xmm3, %xmm1
+; SSE41-NEXT:    pminub %xmm6, %xmm2
+; SSE41-NEXT:    pminub %xmm4, %xmm0
+; SSE41-NEXT:    pminub %xmm2, %xmm0
+; SSE41-NEXT:    pminub %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    phminposuw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpminub %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpminub %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpminub %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpminub %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminub %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminub %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpminub %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v128i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-umin.ll b/test/CodeGen/X86/vector-reduce-umin.ll
index 32a1cdf0f171ccfef29229db76a269acc3b69f43..a2116377563fb9a5f4007ae834e495f93c56c09f 100644
--- a/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/test/CodeGen/X86/vector-reduce-umin.ll
@@ -44,10 +44,9 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movq %xmm2, %rax
 ; SSE41-NEXT:    retq
@@ -130,10 +129,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -143,10 +141,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movq %xmm2, %rax
 ; SSE41-NEXT:    retq
@@ -290,10 +287,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
@@ -303,10 +299,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
@@ -316,10 +311,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -329,10 +323,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    retq
@@ -541,10 +534,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm11
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm12, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm12, %xmm0
+; SSE41-NEXT:    por %xmm11, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
@@ -554,10 +546,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pand %xmm11, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm11, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
@@ -567,10 +558,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm8
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm10, %xmm0
+; SSE41-NEXT:    por %xmm8, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
@@ -580,10 +570,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE41-NEXT:    movapd %xmm5, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -593,10 +582,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm7
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -606,10 +594,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
 ; SSE41-NEXT:    movapd %xmm6, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -619,10 +606,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
@@ -632,10 +618,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    retq
@@ -738,6 +723,98 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE2-LABEL: test_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movd %xmm2, %eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512BW-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    vpminuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE2-LABEL: test_v4i32:
 ; SSE2:       # %bb.0:
@@ -1132,6 +1209,167 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE2-LABEL: test_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm0, %xmm3
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movd %xmm2, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512VL-NEXT:    vpminuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE2-LABEL: test_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7]
+; SSE41-NEXT:    pminud %xmm0, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    pminud %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX-NEXT:    vpminud %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX512-NEXT:    vpminud %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE2-LABEL: test_v8i16:
 ; SSE2:       # %bb.0:
@@ -1432,6 +1670,274 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm0, %xmm3
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    pextrb $0, %xmm2, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpminuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pminud %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pminud %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpminud %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpminud %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpminud %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pminsw %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pminsw %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pminuw %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pminuw %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pminuw %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpminuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpminuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpminuw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpminuw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpminuw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpminuw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i8:
 ; SSE2:       # %bb.0:
@@ -1726,16 +2232,22 @@ declare i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-xor-widen.ll b/test/CodeGen/X86/vector-reduce-xor-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4e17200b3332eb3ed196cc5f8d875f49b3a057ad
--- /dev/null
+++ b/test/CodeGen/X86/vector-reduce-xor-widen.ll
@@ -0,0 +1,1265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; vXi64
+;
+
+define i64 @test_v2i64(<2 x i64> %a0) {
+; SSE-LABEL: test_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v4i64(<4 x i64> %a0) {
+; SSE-LABEL: test_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v8i64(<8 x i64> %a0) {
+; SSE-LABEL: test_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> %a0)
+  ret i64 %1
+}
+
+define i64 @test_v16i64(<16 x i64> %a0) {
+; SSE-LABEL: test_v16i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm3
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    pxor %xmm3, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> %a0)
+  ret i64 %1
+}
+
+;
+; vXi32
+;
+
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v4i32(<4 x i32> %a0) {
+; SSE-LABEL: test_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v8i32(<8 x i32> %a0) {
+; SSE-LABEL: test_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v16i32(<16 x i32> %a0) {
+; SSE-LABEL: test_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> %a0)
+  ret i32 %1
+}
+
+define i32 @test_v32i32(<32 x i32> %a0) {
+; SSE-LABEL: test_v32i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm3
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    pxor %xmm3, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> %a0)
+  ret i32 %1
+}
+
+;
+; vXi16
+;
+
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v8i16(<8 x i16> %a0) {
+; SSE-LABEL: test_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v16i16(<16 x i16> %a0) {
+; SSE-LABEL: test_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v32i16(<32 x i16> %a0) {
+; SSE-LABEL: test_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v64i16(<64 x i16> %a0) {
+; SSE-LABEL: test_v64i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm3
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    pxor %xmm3, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> %a0)
+  ret i16 %1
+}
+
+;
+; vXi8
+;
+
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v16i8(<16 x i8> %a0) {
+; SSE2-LABEL: test_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v32i8(<32 x i8> %a0) {
+; SSE2-LABEL: test_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v64i8(<64 x i8> %a0) {
+; SSE2-LABEL: test_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v128i8(<128 x i8> %a0) {
+; SSE2-LABEL: test_v128i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v128i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm6, %xmm2
+; SSE41-NEXT:    pxor %xmm7, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v128i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v128i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_v128i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> %a0)
+  ret i8 %1
+}
+
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8>)
diff --git a/test/CodeGen/X86/vector-reduce-xor.ll b/test/CodeGen/X86/vector-reduce-xor.ll
index 0192ff3c923a5d0b243ea83df5fa49cc9ee4eec3..d98ee3882152584412d7c86d07043117d1d2796d 100644
--- a/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/test/CodeGen/X86/vector-reduce-xor.ll
@@ -186,6 +186,31 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; vXi32
 ;
 
+define i32 @test_v2i32(<2 x i32> %a0) {
+; SSE-LABEL: test_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
+  %1 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> %a0)
+  ret i32 %1
+}
+
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE-LABEL: test_v4i32:
 ; SSE:       # %bb.0:
@@ -392,6 +417,68 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; vXi16
 ;
 
+define i16 @test_v2i16(<2 x i16> %a0) {
+; SSE-LABEL: test_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> %a0)
+  ret i16 %1
+}
+
+define i16 @test_v4i16(<4 x i16> %a0) {
+; SSE-LABEL: test_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> %a0)
+  ret i16 %1
+}
+
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE:       # %bb.0:
@@ -647,6 +734,140 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; vXi8
 ;
 
+define i8 @test_v2i8(<2 x i8> %a0) {
+; SSE2-LABEL: test_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v4i8(<4 x i8> %a0) {
+; SSE2-LABEL: test_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrb $0, %xmm0, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> %a0)
+  ret i8 %1
+}
+
+define i8 @test_v8i8(<8 x i8> %a0) {
+; SSE2-LABEL: test_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pextrb $0, %xmm1, %eax
+; SSE41-NEXT:    # kill: def $al killed $al killed $eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+  %1 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> %a0)
+  ret i8 %1
+}
+
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i8:
 ; SSE2:       # %bb.0:
@@ -1012,16 +1233,22 @@ declare i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64>)
 declare i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64>)
 declare i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64>)
 
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32>)
 declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32>)
 declare i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32>)
 declare i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32>)
 declare i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32>)
 
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16>)
 declare i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16>)
 
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8>)
 declare i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8>)
 declare i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8>)
 declare i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8>)
diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll
index 9d0e65143d085565f864549b5bb84d668ad19a03..6258229392c4e266f917982061656871e3b79dbd 100644
--- a/test/CodeGen/X86/vector-rotate-128.ll
+++ b/test/CodeGen/X86/vector-rotate-128.ll
@@ -138,6 +138,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-LABEL: var_rotate_v4i32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    pslld $23, %xmm1
 ; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
@@ -157,6 +158,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE41-LABEL: var_rotate_v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    pslld $23, %xmm1
 ; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
@@ -173,6 +175,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX1-LABEL: var_rotate_v4i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
@@ -188,6 +191,8 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX2-LABEL: var_rotate_v4i32:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
@@ -230,6 +235,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; X32-SSE-LABEL: var_rotate_v4i32:
 ; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    pslld $23, %xmm1
 ; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
@@ -255,6 +261,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-LABEL: var_rotate_v8i16:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -281,16 +288,16 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; SSE41-LABEL: var_rotate_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE41-NEXT:    pslld $23, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm3, %xmm1
 ; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT:    pslld $23, %xmm3
-; SSE41-NEXT:    paddd %xmm2, %xmm3
-; SSE41-NEXT:    cvttps2dq %xmm3, %xmm2
+; SSE41-NEXT:    pslld $23, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
 ; SSE41-NEXT:    packusdw %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
@@ -300,8 +307,8 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; AVX1-LABEL: var_rotate_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
@@ -318,8 +325,9 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; AVX2-LABEL: var_rotate_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
@@ -336,38 +344,39 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; AVX512F-LABEL: var_rotate_v8i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512F-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512F-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_rotate_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512VL-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpmovdw %ymm2, %xmm2
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_rotate_v8i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
@@ -378,6 +387,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; AVX512VLBW-LABEL: var_rotate_v8i16:
 ; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
@@ -392,6 +402,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; X32-SSE-LABEL: var_rotate_v8i16:
 ; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    pxor %xmm2, %xmm2
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
 ; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -575,31 +586,30 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_rotate_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_rotate_v16i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT:    vpsllvw %ymm2, %ymm0, %ymm2
-; AVX512VLBW-NEXT:    vpmovwb %ymm2, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512VLBW-NEXT:    vzeroupper
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -664,34 +674,22 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-LABEL: splatvar_rotate_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,64]
-; SSE-NEXT:    psubq %xmm2, %xmm3
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psllq %xmm1, %xmm2
-; SSE-NEXT:    psrlq %xmm3, %xmm0
-; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
+; SSE-NEXT:    psubq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psllq %xmm1, %xmm3
+; SSE-NEXT:    psrlq %xmm2, %xmm0
+; SSE-NEXT:    por %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: splatvar_rotate_v2i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
-; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: splatvar_rotate_v2i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
-; AVX2-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: splatvar_rotate_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_rotate_v2i64:
 ; AVX512F:       # %bb.0:
@@ -760,6 +758,8 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-LABEL: splatvar_rotate_v4i32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    xorps %xmm2, %xmm2
 ; SSE2-NEXT:    xorps %xmm3, %xmm3
 ; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
@@ -774,8 +774,9 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; SSE41-LABEL: splatvar_rotate_v4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
 ; SSE41-NEXT:    pslld %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
@@ -787,8 +788,9 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX1-LABEL: splatvar_rotate_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
@@ -799,12 +801,15 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX2-LABEL: splatvar_rotate_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; AVX2-NEXT:    vpslld %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
-; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -852,6 +857,8 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; X32-SSE-LABEL: splatvar_rotate_v4i32:
 ; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    xorps %xmm2, %xmm2
 ; X32-SSE-NEXT:    xorps %xmm3, %xmm3
 ; X32-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
@@ -874,6 +881,9 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-LABEL: splatvar_rotate_v8i16:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
 ; SSE2-NEXT:    psubw %xmm1, %xmm2
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
@@ -888,9 +898,10 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; SSE41-LABEL: splatvar_rotate_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
 ; SSE41-NEXT:    psllw %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
@@ -902,9 +913,10 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; AVX1-LABEL: splatvar_rotate_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
@@ -915,8 +927,9 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; AVX2-LABEL: splatvar_rotate_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
@@ -925,53 +938,18 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: splatvar_rotate_v8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX512F-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: splatvar_rotate_v8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: splatvar_rotate_v8i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512VLBW-LABEL: splatvar_rotate_v8i16:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512VLBW-NEXT:    retq
+; AVX512-LABEL: splatvar_rotate_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_rotate_v8i16:
 ; XOPAVX1:       # %bb.0:
@@ -988,6 +966,9 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; X32-SSE-LABEL: splatvar_rotate_v8i16:
 ; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
 ; X32-SSE-NEXT:    psubw %xmm1, %xmm2
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
@@ -1010,218 +991,163 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-LABEL: splatvar_rotate_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    psllw $4, %xmm3
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    psllw $5, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtb %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    psrlw $6, %xmm2
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    psllw $2, %xmm3
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    por %xmm2, %xmm3
-; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pandn %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    paddb %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psrlw $7, %xmm4
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    paddb %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm0, %xmm4
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT:    psubb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw %xmm3, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    psllw %xmm3, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm2, %xmm0
+; SSE2-NEXT:    psrlw %xmm2, %xmm4
+; SSE2-NEXT:    psrlw $8, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: splatvar_rotate_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pshufb %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    psllw $4, %xmm3
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT:    por %xmm0, %xmm3
-; SSE41-NEXT:    psllw $5, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrlw $6, %xmm0
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    psllw $2, %xmm3
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT:    por %xmm0, %xmm3
-; SSE41-NEXT:    paddb %xmm2, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    paddb %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    psrlw $7, %xmm3
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT:    por %xmm0, %xmm3
-; SSE41-NEXT:    paddb %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufb %xmm3, %xmm1
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw %xmm4, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT:    psllw %xmm4, %xmm6
+; SSE41-NEXT:    pshufb %xmm3, %xmm6
+; SSE41-NEXT:    pand %xmm6, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    psubb %xmm1, %xmm3
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    psrlw %xmm1, %xmm5
+; SSE41-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm0, %xmm5
+; SSE41-NEXT:    por %xmm5, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: splatvar_rotate_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm3
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm2
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
-; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_rotate_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $6, %xmm0, %xmm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
-; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
+; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT:    vpand %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_rotate_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpsllw $5, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpsrlw $6, %xmm0, %xmm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
-; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512F-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_rotate_v16i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT:    vpsllw $5, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpsrlw $6, %xmm0, %xmm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
-; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512VL-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm0, %zmm2, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v16i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT:    vpsllvw %ymm2, %ymm0, %ymm2
-; AVX512VLBW-NEXT:    vpmovwb %ymm2, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512VLBW-NEXT:    vzeroupper
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -1240,48 +1166,35 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; X32-SSE-LABEL: splatvar_rotate_v16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    psllw $4, %xmm3
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT:    por %xmm0, %xmm3
-; X32-SSE-NEXT:    psllw $5, %xmm1
-; X32-SSE-NEXT:    pxor %xmm0, %xmm0
-; X32-SSE-NEXT:    pxor %xmm4, %xmm4
-; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm4
-; X32-SSE-NEXT:    pand %xmm4, %xmm3
-; X32-SSE-NEXT:    pandn %xmm2, %xmm4
-; X32-SSE-NEXT:    por %xmm3, %xmm4
-; X32-SSE-NEXT:    movdqa %xmm4, %xmm2
-; X32-SSE-NEXT:    psrlw $6, %xmm2
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa %xmm4, %xmm3
-; X32-SSE-NEXT:    psllw $2, %xmm3
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT:    por %xmm2, %xmm3
-; X32-SSE-NEXT:    paddb %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
-; X32-SSE-NEXT:    pandn %xmm4, %xmm2
-; X32-SSE-NEXT:    por %xmm3, %xmm2
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    paddb %xmm2, %xmm3
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X32-SSE-NEXT:    psrlw $7, %xmm4
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT:    por %xmm3, %xmm4
-; X32-SSE-NEXT:    paddb %xmm1, %xmm1
-; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm0
-; X32-SSE-NEXT:    pand %xmm0, %xmm4
-; X32-SSE-NEXT:    pandn %xmm2, %xmm0
-; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X32-SSE-NEXT:    psubb %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllw %xmm3, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm5, %xmm5
+; X32-SSE-NEXT:    psllw %xmm3, %xmm5
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm2, %xmm0
+; X32-SSE-NEXT:    psrlw %xmm2, %xmm4
+; X32-SSE-NEXT:    psrlw $8, %xmm4
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm0, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
   %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
@@ -1574,171 +1487,126 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
 define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_rotate_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    psrlw $4, %xmm4
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    psllw $4, %xmm5
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm5
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm3, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    psrlw $6, %xmm1
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psllw $2, %xmm4
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    paddb %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pandn %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    paddb %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    psrlw $7, %xmm4
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    paddb %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm0, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_rotate_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psllw $4, %xmm2
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,57600,41152,24704,8256]
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $6, %xmm2
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    psllw $2, %xmm3
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT:    por %xmm2, %xmm3
-; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    paddb %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    psrlw $7, %xmm3
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT:    por %xmm2, %xmm3
-; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; SSE41-NEXT:    pmullw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm3, %xmm4
+; SSE41-NEXT:    packuswb %xmm2, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: constant_rotate_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $6, %xmm0, %xmm1
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: constant_rotate_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_rotate_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: constant_rotate_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm1
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpsrlw $6, %xmm0, %xmm1
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
-; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512F-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512F-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: constant_rotate_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm1
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpsrlw $6, %xmm0, %xmm1
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX512VL-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
-; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX512VL-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: constant_rotate_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
-; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: constant_rotate_v16i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
-; AVX512VLBW-NEXT:    vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT:    vzeroupper
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -1749,45 +1617,26 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
 ;
 ; X32-SSE-LABEL: constant_rotate_v16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
-; X32-SSE-NEXT:    pxor %xmm0, %xmm0
-; X32-SSE-NEXT:    pxor %xmm3, %xmm3
-; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X32-SSE-NEXT:    psrlw $4, %xmm4
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
-; X32-SSE-NEXT:    psllw $4, %xmm5
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm5
-; X32-SSE-NEXT:    por %xmm4, %xmm5
-; X32-SSE-NEXT:    pand %xmm3, %xmm5
-; X32-SSE-NEXT:    pandn %xmm1, %xmm3
-; X32-SSE-NEXT:    por %xmm5, %xmm3
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X32-SSE-NEXT:    psrlw $6, %xmm1
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    psllw $2, %xmm4
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT:    por %xmm1, %xmm4
-; X32-SSE-NEXT:    paddb %xmm2, %xmm2
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm1, %xmm4
-; X32-SSE-NEXT:    pandn %xmm3, %xmm1
-; X32-SSE-NEXT:    por %xmm4, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
-; X32-SSE-NEXT:    paddb %xmm1, %xmm3
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X32-SSE-NEXT:    psrlw $7, %xmm4
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT:    por %xmm3, %xmm4
-; X32-SSE-NEXT:    paddb %xmm2, %xmm2
-; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm0
-; X32-SSE-NEXT:    pand %xmm0, %xmm4
-; X32-SSE-NEXT:    pandn %xmm1, %xmm0
-; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
   %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
@@ -1980,14 +1829,41 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: splatconstant_rotate_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
-; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
-; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: splatconstant_rotate_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_rotate_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_rotate_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512VLBW-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_rotate_v16i8:
 ; XOP:       # %bb.0:
@@ -2211,15 +2087,43 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: splatconstant_rotate_mask_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
-; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
-; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
+; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
+; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512VLBW-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_rotate_mask_v16i8:
 ; XOP:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll
index 75fe0b322ca12e3b626e1a7bba0ed541545cd81b..b0d9cd9a3cb6456b99151f649c6806c756e13db5 100644
--- a/test/CodeGen/X86/vector-rotate-256.ll
+++ b/test/CodeGen/X86/vector-rotate-256.ll
@@ -104,22 +104,25 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX1-LABEL: var_rotate_v8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; AVX1-NEXT:    vpor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
+; AVX1-NEXT:    vpor %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
@@ -135,6 +138,8 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ;
 ; AVX2-LABEL: var_rotate_v8i32:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm3, %ymm1
@@ -196,28 +201,31 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: var_rotate_v16i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX1-NEXT:    vpslld $23, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
 ; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpmulhuw %xmm2, %xmm4, %xmm6
-; AVX1-NEXT:    vpmullw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpor %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpmulhuw %xmm2, %xmm5, %xmm7
+; AVX1-NEXT:    vpmullw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpor %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
 ; AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm3
@@ -229,59 +237,60 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX2-LABEL: var_rotate_v16i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; AVX2-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
-; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX2-NEXT:    vpsllvd %ymm4, %ymm3, %ymm4
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; AVX2-NEXT:    vpsllvd %ymm5, %ymm0, %ymm5
 ; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
-; AVX2-NEXT:    vpackusdw %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpackusdw %ymm4, %ymm5, %ymm4
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    vpsubw %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT:    vpsrlvd %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
+; AVX2-NEXT:    vpsrlvd %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: var_rotate_v16i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
-; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512F-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm2, %zmm0
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_rotate_v16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
-; AVX512VL-NEXT:    vpmovdw %zmm2, %ymm2
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm0, %zmm2, %zmm0
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_rotate_v16i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
@@ -291,6 +300,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ;
 ; AVX512VLBW-LABEL: var_rotate_v16i16:
 ; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
@@ -449,30 +459,28 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_rotate_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_rotate_v32i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512VLBW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: var_rotate_v32i8:
@@ -521,12 +529,11 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: splatvar_rotate_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm2
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [64,64,64,64]
-; AVX2-NEXT:    vpsubq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpsrlvq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
+; AVX2-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_rotate_v4i64:
@@ -586,30 +593,34 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX1-LABEL: splatvar_rotate_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpslld %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpslld %xmm3, %xmm2, %xmm4
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
-; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpslld %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_rotate_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm2
+; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT:    vpslld %xmm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
+; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT:    vpslld %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
-; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_rotate_v8i32:
@@ -669,80 +680,48 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: splatvar_rotate_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm3, %xmm2, %xmm4
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_rotate_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm2
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX2-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: splatvar_rotate_v16i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastw %xmm1, %ymm2
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: splatvar_rotate_v16i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastw %xmm1, %ymm2
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: splatvar_rotate_v16i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastw %xmm1, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
-; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512VLBW-LABEL: splatvar_rotate_v16i16:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %ymm2
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
-; AVX512VLBW-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512VLBW-NEXT:    retq
+; AVX512-LABEL: splatvar_rotate_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastw %xmm1, %ymm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_rotate_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -776,155 +755,119 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
-; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX1-NEXT:    vpand %xmm9, %xmm5, %xmm5
-; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $6, %xmm2, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX1-NEXT:    vpand %xmm10, %xmm3, %xmm3
-; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm7
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX1-NEXT:    vpand %xmm11, %xmm7, %xmm7
-; AVX1-NEXT:    vpor %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm7
-; AVX1-NEXT:    vpblendvb %xmm7, %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm5
-; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm5
-; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
-; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
-; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
-; AVX1-NEXT:    vpand %xmm9, %xmm4, %xmm4
-; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm10, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX1-NEXT:    vpand %xmm11, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpblendvb %xmm7, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
-; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpblendvb %xmm5, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsllw %xmm3, %xmm6, %xmm7
+; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm6, %xmm6
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_rotate_v32i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrlw $6, %ymm0, %ymm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
-; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm3
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpsllw %xmm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw %xmm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_rotate_v32i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
-; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
+; AVX512F-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw %xmm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_rotate_v32i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw %xmm2, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT:    vpand %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
-; AVX512VLBW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_rotate_v32i8:
@@ -1195,141 +1138,132 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX1-LABEL: constant_rotate_v32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX1-NEXT:    vpand %xmm9, %xmm4, %xmm4
-; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $6, %xmm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX1-NEXT:    vpand %xmm10, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $2, %xmm1, %xmm7
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX1-NEXT:    vpand %xmm11, %xmm7, %xmm7
-; AVX1-NEXT:    vpor %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm7
-; AVX1-NEXT:    vpblendvb %xmm7, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm6
-; AVX1-NEXT:    vpor %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm6
-; AVX1-NEXT:    vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm3
-; AVX1-NEXT:    vpand %xmm9, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm2
-; AVX1-NEXT:    vpand %xmm10, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
-; AVX1-NEXT:    vpand %xmm11, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpblendvb %xmm7, %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm2
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
-; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm9, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm7
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
+; AVX1-NEXT:    vpmullw %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; AVX1-NEXT:    vpmullw %xmm9, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: constant_rotate_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrlw $6, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpsllw $2, %ymm1, %ymm3
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
-; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm3
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: constant_rotate_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm1
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm1
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
-; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: constant_rotate_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm1
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm3
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
-; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
 ; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: constant_rotate_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
-; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: constant_rotate_v32i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
-; AVX512VLBW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: constant_rotate_v32i8:
@@ -1565,14 +1499,41 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: splatconstant_rotate_v32i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm1
-; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm0
-; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: splatconstant_rotate_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_rotate_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_rotate_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
 ; XOPAVX1:       # %bb.0:
@@ -1824,15 +1785,43 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: splatconstant_rotate_mask_v32i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm1
-; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm0
-; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm1
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
 ; XOPAVX1:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll
index f838f1b54dbf8424ecb59d2210a11bb13c1b7899..896d6c07387ef926b24d46eaeecc361c9df39c5b 100644
--- a/test/CodeGen/X86/vector-rotate-512.ll
+++ b/test/CodeGen/X86/vector-rotate-512.ll
@@ -35,48 +35,50 @@ define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
 define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512F-LABEL: var_rotate_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vpsllvd %zmm4, %zmm0, %zmm4
-; AVX512F-NEXT:    vpmovdw %zmm4, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT:    vpsllvd %zmm5, %zmm0, %zmm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm5, %zmm0
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT:    vpsllvd %zmm2, %zmm1, %zmm2
-; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT:    vpsrlvd %zmm3, %zmm1, %zmm1
+; AVX512F-NEXT:    vpsllvd %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
 ; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_rotate_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm4
-; AVX512VL-NEXT:    vpmovdw %zmm4, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %ymm2, %ymm5, %ymm2
+; AVX512VL-NEXT:    vpsllvd %zmm5, %zmm0, %zmm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm0, %zmm5, %zmm0
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm1, %zmm2
-; AVX512VL-NEXT:    vpmovdw %zmm2, %ymm2
-; AVX512VL-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT:    vpsrlvd %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsllvd %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
 ; AVX512VL-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_rotate_v32i16:
@@ -313,54 +315,56 @@ define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
 define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512F-LABEL: splatvar_rotate_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm3
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm4
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm5, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT:    vpsllw %xmm2, %ymm1, %ymm2
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_rotate_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm3
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm5, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastw %xmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %zmm2, %zmm3, %zmm2
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm2, %zmm0, %zmm2
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm1
-; AVX512VLBW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
+; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
 ; AVX512VLBW-NEXT:    retq
   %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
   %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
@@ -374,95 +378,53 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512F-LABEL: splatvar_rotate_v64i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT:    vpand %ymm8, %ymm7, %ymm7
-; AVX512F-NEXT:    vpor %ymm3, %ymm7, %ymm3
-; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm7
-; AVX512F-NEXT:    vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT:    vpand %ymm9, %ymm3, %ymm3
-; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
-; AVX512F-NEXT:    vpor %ymm3, %ymm10, %ymm3
-; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm10
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm3
-; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm5, %ymm6
+; AVX512F-NEXT:    vpbroadcastb %xmm6, %ymm6
 ; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $6, %ymm1, %ymm2
-; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
-; AVX512F-NEXT:    vpand %ymm8, %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
-; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
-; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm7, %xmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_rotate_v64i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm7
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT:    vpand %ymm8, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpor %ymm3, %ymm7, %ymm3
-; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm7
-; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-NEXT:    vpand %ymm9, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
-; AVX512VL-NEXT:    vpor %ymm3, %ymm10, %ymm3
-; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm10
-; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm3
-; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm6
+; AVX512VL-NEXT:    vpbroadcastb %xmm6, %ymm6
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlw $6, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm3
-; AVX512VL-NEXT:    vpand %ymm8, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpand %ymm9, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
-; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm7, %xmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512VL-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
@@ -583,94 +545,97 @@ define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
 define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
 ; AVX512F-LABEL: constant_rotate_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT:    vpand %ymm8, %ymm7, %ymm7
-; AVX512F-NEXT:    vpor %ymm2, %ymm7, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
-; AVX512F-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
-; AVX512F-NEXT:    vpor %ymm2, %ymm10, %ymm2
-; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm10
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm8
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT:    vpsrlw $8, %ymm9, %ymm9
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpackuswb %ymm9, %ymm0, %ymm0
+; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm2
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm3
-; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $6, %ymm1, %ymm2
-; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
-; AVX512F-NEXT:    vpand %ymm8, %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
-; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
-; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm3
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
+; AVX512F-NEXT:    vpmullw %ymm10, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
+; AVX512F-NEXT:    vpmullw %ymm11, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: constant_rotate_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT:    vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
-; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; AVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm7
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT:    vpand %ymm8, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpor %ymm2, %ymm7, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
-; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-NEXT:    vpand %ymm9, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
-; AVX512VL-NEXT:    vpor %ymm2, %ymm10, %ymm2
-; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm10
-; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm8
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm9, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm3
-; AVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlw $6, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm3
-; AVX512VL-NEXT:    vpand %ymm8, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpand %ymm9, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
-; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpmullw %ymm9, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm10, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: constant_rotate_v64i8:
diff --git a/test/CodeGen/X86/vector-sext-widen.ll b/test/CodeGen/X86/vector-sext-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..895d0e57bbe9ebaa1dfab4d4af27958b8902945a
--- /dev/null
+++ b/test/CodeGen/X86/vector-sext-widen.ll
@@ -0,0 +1,5945 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+;
+; Just two 32-bit runs to make sure we do reasonable things there.
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
+
+define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sext_16i8_to_8i16:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i8_to_8i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psraw $8, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i8_to_8i16:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %C = sext <8 x i8> %B to <8 x i16>
+  ret <8 x i16> %C
+}
+
+define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_16i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_16i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    psraw $8, %xmm2
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSSE3-NEXT:    psraw $8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_16i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_16i8_to_16i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_16i8_to_16i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_16i8_to_16i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i8_to_16i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    psraw $8, %xmm2
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; X32-SSE2-NEXT:    psraw $8, %xmm1
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i8_to_16i16:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <16 x i8> %A to <16 x i16>
+  ret <16 x i16> %B
+}
+
+define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_32i8_to_32i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_32i8_to_32i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT:    psraw $8, %xmm4
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSSE3-NEXT:    psraw $8, %xmm5
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT:    psraw $8, %xmm2
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSSE3-NEXT:    psraw $8, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_32i8_to_32i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
+; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_32i8_to_32i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_32i8_to_32i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sext_32i8_to_32i16:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm2
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
+; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: sext_32i8_to_32i16:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_32i8_to_32i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; X32-SSE2-NEXT:    psraw $8, %xmm4
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; X32-SSE2-NEXT:    psraw $8, %xmm5
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-SSE2-NEXT:    psraw $8, %xmm2
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; X32-SSE2-NEXT:    psraw $8, %xmm3
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_32i8_to_32i16:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
+; X32-SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
+; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
+; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <32 x i8> %A to <32 x i16>
+  ret <32 x i16> %B
+}
+
+define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sext_16i8_to_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i8_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i8_to_4i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %C = sext <4 x i8> %B to <4 x i32>
+  ret <4 x i32> %C
+}
+
+define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_16i8_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_16i8_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_16i8_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i8_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm2
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i8_to_8i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %C = sext <8 x i8> %B to <8 x i32>
+  ret <8 x i32> %C
+}
+
+define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_16i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE2-NEXT:    psrad $24, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    psrad $24, %xmm3
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_16i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSSE3-NEXT:    psrad $24, %xmm4
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    psrad $24, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_16i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_16i8_to_16i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_16i8_to_16i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_16i8_to_16i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i8_to_16i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm4
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm2
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $24, %xmm3
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i8_to_16i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X32-SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
+; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <16 x i8> %A to <16 x i32>
+  ret <16 x i32> %B
+}
+
+define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sext_16i8_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i8_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i8_to_2i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %C = sext <2 x i8> %B to <2 x i64>
+  ret <2 x i64> %C
+}
+
+define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_16i8_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_16i8_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_16i8_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i8_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i8_to_4i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
+; X32-SSE41-NEXT:    psrld $16, %xmm0
+; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %C = sext <4 x i8> %B to <4 x i64>
+  ret <4 x i64> %C
+}
+
+define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_8i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    psrad $24, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_8i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    psrad $24, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_8i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
+; SSE41-NEXT:    psrlq $48, %xmm0
+; SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_16i8_to_8i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_16i8_to_8i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_16i8_to_8i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i8_to_8i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $24, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X32-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i8_to_8i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
+; X32-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE41-NEXT:    psrld $16, %xmm1
+; X32-SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; X32-SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
+; X32-SSE41-NEXT:    psrlq $48, %xmm0
+; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
+; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %C = sext <8 x i8> %B to <8 x i64>
+  ret <8 x i64> %C
+}
+
+define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i16_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_8i16_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_8i16_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sext_8i16_to_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_8i16_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_8i16_to_4i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %C = sext <4 x i16> %B to <4 x i32>
+  ret <4 x i32> %C
+}
+
+define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i16_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_8i16_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psrad $16, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_8i16_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_8i16_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_8i16_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_8i16_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_8i16_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm2
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_8i16_to_8i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <8 x i16> %A to <8 x i32>
+  ret <8 x i32> %B
+}
+
+define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i16_to_16i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i16_to_16i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSSE3-NEXT:    psrad $16, %xmm4
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSSE3-NEXT:    psrad $16, %xmm5
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT:    psrad $16, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSSE3-NEXT:    psrad $16, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i16_to_16i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
+; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_16i16_to_16i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_16i16_to_16i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_16i16_to_16i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_16i16_to_16i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm4
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $16, %xmm5
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm2
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X32-SSE2-NEXT:    psrad $16, %xmm3
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_16i16_to_16i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
+; X32-SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
+; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
+; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <16 x i16> %A to <16 x i32>
+  ret <16 x i32> %B
+}
+
+define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i16_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_8i16_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_8i16_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sext_8i16_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_8i16_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_8i16_to_2i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %C = sext <2 x i16> %B to <2 x i64>
+  ret <2 x i64> %C
+}
+
+define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i16_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_8i16_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_8i16_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_8i16_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_8i16_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_8i16_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_8i16_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_8i16_to_4i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %C = sext <4 x i16> %B to <4 x i64>
+  ret <4 x i64> %C
+}
+
+define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i16_to_8i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_8i16_to_8i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    psrad $16, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_8i16_to_8i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_8i16_to_8i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_8i16_to_8i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_8i16_to_8i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_8i16_to_8i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $16, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X32-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_8i16_to_8i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X32-SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
+; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <8 x i16> %A to <8 x i64>
+  ret <8 x i64> %B
+}
+
+define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_4i32_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i32_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i32_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sext_4i32_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_4i32_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_4i32_to_2i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %C = sext <2 x i32> %B to <2 x i64>
+  ret <2 x i64> %C
+}
+
+define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_4i32_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i32_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i32_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i32_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i32_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_4i32_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_4i32_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_4i32_to_4i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <4 x i32> %A to <4 x i64>
+  ret <4 x i64> %B
+}
+
+define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i32_to_8i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_8i32_to_8i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_8i32_to_8i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
+; SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_8i32_to_8i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_8i32_to_8i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_8i32_to_8i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_8i32_to_8i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_8i32_to_8i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
+; X32-SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
+; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
+; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <8 x i32> %A to <8 x i64>
+  ret <8 x i64> %B
+}
+
+define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
+; SSE-LABEL: load_sext_2i1_to_2i64:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movzbl (%rdi), %eax
+; SSE-NEXT:    movq %rax, %rcx
+; SSE-NEXT:    shlq $62, %rcx
+; SSE-NEXT:    sarq $63, %rcx
+; SSE-NEXT:    movq %rcx, %xmm1
+; SSE-NEXT:    shlq $63, %rax
+; SSE-NEXT:    sarq $63, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_2i1_to_2i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $62, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    shlq $63, %rax
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_2i1_to_2i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movzbl (%rdi), %eax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $62, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    shlq $63, %rax
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_sext_2i1_to_2i64:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movzbl (%rdi), %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_2i1_to_2i64:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_2i1_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $30, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; X32-SSE2-NEXT:    shll $31, %eax
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movzbl (%eax), %eax
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $31, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    movd %ecx, %xmm0
+; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
+; X32-SSE41-NEXT:    shll $30, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i1>, <2 x i1>* %ptr
+ %Y = sext <2 x i1> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_2i8_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_2i8_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_2i8_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_2i8_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_2i8_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzwl (%eax), %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i8>, <2 x i8>* %ptr
+ %Y = sext <2 x i8> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
+; SSE2-LABEL: load_sext_4i1_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movzbl (%rdi), %eax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $60, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $61, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $62, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    shlq $63, %rax
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i1_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movzbl (%rdi), %eax
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $60, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $61, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $62, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    shlq $63, %rax
+; SSSE3-NEXT:    sarq $63, %rax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i1_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movzbl (%rdi), %eax
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $62, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    movq %rax, %rdx
+; SSE41-NEXT:    shlq $63, %rdx
+; SSE41-NEXT:    sarq $63, %rdx
+; SSE41-NEXT:    movd %edx, %xmm0
+; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $61, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    shlq $60, %rax
+; SSE41-NEXT:    sarq $63, %rax
+; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i1_to_4i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $62, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    shlq $63, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm0
+; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $61, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    shlq $60, %rax
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i1_to_4i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movzbl (%rdi), %eax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $62, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    shlq $63, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vmovd %edx, %xmm0
+; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $61, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    shlq $60, %rax
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_sext_4i1_to_4i32:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movzbl (%rdi), %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_4i1_to_4i32:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_4i1_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $28, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $29, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $30, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    shll $31, %eax
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movl (%eax), %eax
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $30, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    movl %eax, %edx
+; X32-SSE41-NEXT:    shll $31, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    movd %edx, %xmm0
+; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $29, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; X32-SSE41-NEXT:    shll $28, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i1>, <4 x i1>* %ptr
+ %Y = sext <4 x i1> %X to <4 x i32>
+ ret <4 x i32> %Y
+}
+
+define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_4i8_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i8_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i8_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_4i8_to_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_4i8_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i8>, <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i32>
+ ret <4 x i32> %Y
+}
+
+define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
+; SSE2-LABEL: load_sext_4i1_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movl (%rdi), %eax
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $3, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $2, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    shrl %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; SSE2-NEXT:    psllq $63, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; SSE2-NEXT:    psllq $63, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i1_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movl (%rdi), %eax
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $3, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $2, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %eax, %xmm2
+; SSSE3-NEXT:    shrl %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; SSSE3-NEXT:    psllq $63, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; SSSE3-NEXT:    psllq $63, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i1_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movl (%rdi), %eax
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl %ecx
+; SSE41-NEXT:    movd %eax, %xmm1
+; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $2, %ecx
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
+; SSE41-NEXT:    shrl $3, %eax
+; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    psllq $63, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE41-NEXT:    psllq $63, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i1_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $62, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    shlq $63, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm0
+; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $61, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    shlq $60, %rax
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i1_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movzbl (%rdi), %eax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $61, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $62, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm1
+; AVX2-NEXT:    shlq $63, %rax
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_sext_4i1_to_4i64:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movzbl (%rdi), %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_4i1_to_4i64:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_4i1_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $3, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $2, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    movd %eax, %xmm2
+; X32-SSE2-NEXT:    shrl %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X32-SSE2-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; X32-SSE2-NEXT:    psllq $63, %xmm0
+; X32-SSE2-NEXT:    psrad $31, %xmm0
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; X32-SSE2-NEXT:    psllq $63, %xmm1
+; X32-SSE2-NEXT:    psrad $31, %xmm1
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movzbl (%eax), %eax
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl %ecx
+; X32-SSE41-NEXT:    movd %eax, %xmm1
+; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $2, %ecx
+; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
+; X32-SSE41-NEXT:    shrl $3, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X32-SSE41-NEXT:    psllq $63, %xmm0
+; X32-SSE41-NEXT:    psrad $31, %xmm0
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; X32-SSE41-NEXT:    psllq $63, %xmm1
+; X32-SSE41-NEXT:    psrad $31, %xmm1
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i1>, <4 x i1>* %ptr
+ %Y = sext <4 x i1> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
+
+define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_4i8_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movsbq 1(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    movsbq (%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movsbq 3(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm2
+; SSE2-NEXT:    movsbq 2(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i8_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movsbq 1(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm1
+; SSSE3-NEXT:    movsbq (%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movsbq 3(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm2
+; SSSE3-NEXT:    movsbq 2(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i8_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i8_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxbq (%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i8_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_4i8_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_4i8_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i8>, <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
+
+define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_4i8_to_4i64_extract:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsbq 3(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    movsbq 2(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i8_to_4i64_extract:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movsbq 3(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm1
+; SSSE3-NEXT:    movsbq 2(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i8_to_4i64_extract:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i8_to_4i64_extract:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i8_to_4i64_extract:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_4i8_to_4i64_extract:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_4i8_to_4i64_extract:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
+; X32-SSE41:       # %bb.0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+ %ld = load <4 x i8>, <4 x i8>* %ptr
+ %sext = sext <4 x i8> %ld to <4 x i64>
+ %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ ret <2 x i64> %extract
+}
+
+define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
+; SSE2-LABEL: load_sext_8i1_to_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movsbq (%rdi), %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq $7, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $57, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $58, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $59, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $60, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $61, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shlq $62, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    shlq $63, %rax
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_8i1_to_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movsbq (%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shrq $7, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $57, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $58, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $59, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $60, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $61, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    shlq $62, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    shlq $63, %rax
+; SSSE3-NEXT:    sarq $63, %rax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_8i1_to_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movsbq (%rdi), %rax
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $62, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    movq %rax, %rdx
+; SSE41-NEXT:    shlq $63, %rdx
+; SSE41-NEXT:    sarq $63, %rdx
+; SSE41-NEXT:    movd %edx, %xmm0
+; SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $61, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $60, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $59, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $58, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $57, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
+; SSE41-NEXT:    shrq $7, %rax
+; SSE41-NEXT:    pinsrw $7, %eax, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_8i1_to_8i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    movsbq (%rdi), %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $62, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    shlq $63, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm0
+; AVX1-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $61, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $60, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $59, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $58, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $57, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    shrq $7, %rax
+; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_8i1_to_8i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movsbq (%rdi), %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $62, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    shlq $63, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vmovd %edx, %xmm0
+; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $61, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $59, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $58, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $57, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    shrq $7, %rax
+; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_sext_8i1_to_8i16:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movzbl (%rdi), %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_8i1_to_8i16:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_8i1_to_8i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movsbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $7, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $25, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $26, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $27, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $28, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $29, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $30, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    shll $31, %eax
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsbl (%eax), %eax
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $30, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    movl %eax, %edx
+; X32-SSE41-NEXT:    shll $31, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    movd %edx, %xmm0
+; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $29, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $28, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $27, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $26, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $25, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
+; X32-SSE41-NEXT:    shrl $7, %eax
+; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <8 x i1>, <8 x i1>* %ptr
+ %Y = sext <8 x i1> %X to <8 x i16>
+ ret <8 x i16> %Y
+}
+
+define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_8i8_to_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_8i8_to_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_8i8_to_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_8i8_to_8i16:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_8i8_to_8i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psraw $8, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i16>
+ ret <8 x i16> %Y
+}
+
+define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_8i8_to_8i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movsbq 1(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    movsbq (%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movsbq 3(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm2
+; SSE2-NEXT:    movsbq 2(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movsbq 5(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm3
+; SSE2-NEXT:    movsbq 4(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    movsbq 7(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm4
+; SSE2-NEXT:    movsbq 6(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm3
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_8i8_to_8i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movsbq 1(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm1
+; SSSE3-NEXT:    movsbq (%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movsbq 3(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm2
+; SSSE3-NEXT:    movsbq 2(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    movsbq 5(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm3
+; SSSE3-NEXT:    movsbq 4(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT:    movsbq 7(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm4
+; SSSE3-NEXT:    movsbq 6(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm3
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_8i8_to_8i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
+; SSE41-NEXT:    pmovsxbq 4(%rdi), %xmm2
+; SSE41-NEXT:    pmovsxbq 6(%rdi), %xmm3
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_8i8_to_8i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbq 6(%rdi), %xmm1
+; AVX1-NEXT:    vpmovsxbq 4(%rdi), %xmm2
+; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxbq (%rdi), %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_8i8_to_8i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT:    vpmovsxbq 4(%rdi), %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_8i8_to_8i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbq (%rdi), %zmm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_8i8_to_8i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE2-NEXT:    psrad $24, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X32-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
+; X32-SSE41-NEXT:    pmovsxbq 4(%eax), %xmm2
+; X32-SSE41-NEXT:    pmovsxbq 6(%eax), %xmm3
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i64>
+ ret <8 x i64> %Y
+}
+
+define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
+; SSE2-LABEL: load_sext_8i1_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movzbl (%rdi), %eax
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $7, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $6, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $5, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $4, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $3, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $2, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    shrl %eax
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pslld $31, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_8i1_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movzbl (%rdi), %eax
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $7, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $6, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $5, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $4, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $3, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $2, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    shrl %eax
+; SSSE3-NEXT:    andl $1, %eax
+; SSSE3-NEXT:    movd %eax, %xmm3
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pslld $31, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_8i1_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movzbl (%rdi), %eax
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    movl %eax, %edx
+; SSE41-NEXT:    andl $1, %edx
+; SSE41-NEXT:    movd %edx, %xmm1
+; SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $2, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $3, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $4, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $5, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $6, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
+; SSE41-NEXT:    shrl $7, %eax
+; SSE41-NEXT:    pinsrw $7, %eax, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_8i1_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    movsbq (%rdi), %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $58, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    shlq $59, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm0
+; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $57, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $7, %rcx
+; AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $62, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    shlq $63, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm1
+; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $61, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $60, %rax
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_8i1_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movsbq (%rdi), %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $58, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    shlq $59, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vmovd %edx, %xmm0
+; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $57, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $7, %rcx
+; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $62, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    shlq $63, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vmovd %edx, %xmm1
+; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $61, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $60, %rax
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_sext_8i1_to_8i32:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movzbl (%rdi), %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_8i1_to_8i32:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_8i1_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $7, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $6, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $5, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $4, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $3, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $2, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    shrl %eax
+; X32-SSE2-NEXT:    andl $1, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm3
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    pslld $31, %xmm0
+; X32-SSE2-NEXT:    psrad $31, %xmm0
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    pslld $31, %xmm1
+; X32-SSE2-NEXT:    psrad $31, %xmm1
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movzbl (%eax), %eax
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    movl %eax, %edx
+; X32-SSE41-NEXT:    andl $1, %edx
+; X32-SSE41-NEXT:    movd %edx, %xmm1
+; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $2, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $3, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $4, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $5, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $6, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
+; X32-SSE41-NEXT:    shrl $7, %eax
+; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm1
+; X32-SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X32-SSE41-NEXT:    pslld $31, %xmm0
+; X32-SSE41-NEXT:    psrad $31, %xmm0
+; X32-SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE41-NEXT:    pslld $31, %xmm1
+; X32-SSE41-NEXT:    psrad $31, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <8 x i1>, <8 x i1>* %ptr
+ %Y = sext <8 x i1> %X to <8 x i32>
+ ret <8 x i32> %Y
+}
+
+define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_8i8_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_8i8_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_8i8_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_8i8_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_8i8_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_8i8_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_8i8_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxbd 4(%eax), %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i32>
+ ret <8 x i32> %Y
+}
+
+define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
+; SSE2-LABEL: load_sext_16i1_to_16i8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movswq (%rdi), %rax
+; SSE2-NEXT:    movq %rax, %r8
+; SSE2-NEXT:    movq %rax, %r9
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    movq %rax, %r11
+; SSE2-NEXT:    movq %rax, %r14
+; SSE2-NEXT:    movq %rax, %r15
+; SSE2-NEXT:    movq %rax, %r12
+; SSE2-NEXT:    movq %rax, %r13
+; SSE2-NEXT:    movq %rax, %rbx
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    movq %rax, %rdx
+; SSE2-NEXT:    movq %rax, %rsi
+; SSE2-NEXT:    movq %rax, %rdi
+; SSE2-NEXT:    movq %rax, %rbp
+; SSE2-NEXT:    shrq $15, %rbp
+; SSE2-NEXT:    movd %ebp, %xmm0
+; SSE2-NEXT:    movq %rax, %rbp
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    shlq $49, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    movd %r8d, %xmm1
+; SSE2-NEXT:    shlq $50, %r9
+; SSE2-NEXT:    sarq $63, %r9
+; SSE2-NEXT:    movd %r9d, %xmm2
+; SSE2-NEXT:    shlq $51, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    movd %r10d, %xmm3
+; SSE2-NEXT:    shlq $52, %r11
+; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movd %r11d, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    shlq $53, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    movd %r14d, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    shlq $54, %r15
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    movd %r15d, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT:    shlq $55, %r12
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    movd %r12d, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    shlq $60, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movd %r13d, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    shlq $61, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movd %ebx, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    shlq $62, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    shlq $63, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    shlq $58, %rsi
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    movd %esi, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT:    shlq $59, %rdi
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    movd %edi, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT:    shlq $57, %rbp
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    movd %ebp, %xmm2
+; SSE2-NEXT:    shrq $7, %rax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_16i1_to_16i8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pushq %rbp
+; SSSE3-NEXT:    pushq %r15
+; SSSE3-NEXT:    pushq %r14
+; SSSE3-NEXT:    pushq %r13
+; SSSE3-NEXT:    pushq %r12
+; SSSE3-NEXT:    pushq %rbx
+; SSSE3-NEXT:    movswq (%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %r8
+; SSSE3-NEXT:    movq %rax, %r9
+; SSSE3-NEXT:    movq %rax, %r10
+; SSSE3-NEXT:    movq %rax, %r11
+; SSSE3-NEXT:    movq %rax, %r14
+; SSSE3-NEXT:    movq %rax, %r15
+; SSSE3-NEXT:    movq %rax, %r12
+; SSSE3-NEXT:    movq %rax, %r13
+; SSSE3-NEXT:    movq %rax, %rbx
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    movq %rax, %rdx
+; SSSE3-NEXT:    movq %rax, %rsi
+; SSSE3-NEXT:    movq %rax, %rdi
+; SSSE3-NEXT:    movq %rax, %rbp
+; SSSE3-NEXT:    shrq $15, %rbp
+; SSSE3-NEXT:    movd %ebp, %xmm0
+; SSSE3-NEXT:    movq %rax, %rbp
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    shlq $49, %r8
+; SSSE3-NEXT:    sarq $63, %r8
+; SSSE3-NEXT:    movd %r8d, %xmm1
+; SSSE3-NEXT:    shlq $50, %r9
+; SSSE3-NEXT:    sarq $63, %r9
+; SSSE3-NEXT:    movd %r9d, %xmm2
+; SSSE3-NEXT:    shlq $51, %r10
+; SSSE3-NEXT:    sarq $63, %r10
+; SSSE3-NEXT:    movd %r10d, %xmm3
+; SSSE3-NEXT:    shlq $52, %r11
+; SSSE3-NEXT:    sarq $63, %r11
+; SSSE3-NEXT:    movd %r11d, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    shlq $53, %r14
+; SSSE3-NEXT:    sarq $63, %r14
+; SSSE3-NEXT:    movd %r14d, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSSE3-NEXT:    shlq $54, %r15
+; SSSE3-NEXT:    sarq $63, %r15
+; SSSE3-NEXT:    movd %r15d, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSSE3-NEXT:    shlq $55, %r12
+; SSSE3-NEXT:    sarq $63, %r12
+; SSSE3-NEXT:    movd %r12d, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT:    shlq $60, %r13
+; SSSE3-NEXT:    sarq $63, %r13
+; SSSE3-NEXT:    movd %r13d, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    shlq $61, %rbx
+; SSSE3-NEXT:    sarq $63, %rbx
+; SSSE3-NEXT:    movd %ebx, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    shlq $62, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm5
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT:    shlq $63, %rdx
+; SSSE3-NEXT:    sarq $63, %rdx
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSSE3-NEXT:    shlq $58, %rsi
+; SSSE3-NEXT:    sarq $63, %rsi
+; SSSE3-NEXT:    movd %esi, %xmm3
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSSE3-NEXT:    shlq $59, %rdi
+; SSSE3-NEXT:    sarq $63, %rdi
+; SSSE3-NEXT:    movd %edi, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSSE3-NEXT:    shlq $57, %rbp
+; SSSE3-NEXT:    sarq $63, %rbp
+; SSSE3-NEXT:    movd %ebp, %xmm2
+; SSSE3-NEXT:    shrq $7, %rax
+; SSSE3-NEXT:    movd %eax, %xmm3
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    popq %rbx
+; SSSE3-NEXT:    popq %r12
+; SSSE3-NEXT:    popq %r13
+; SSSE3-NEXT:    popq %r14
+; SSSE3-NEXT:    popq %r15
+; SSSE3-NEXT:    popq %rbp
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_16i1_to_16i8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movswq (%rdi), %rax
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $62, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    movq %rax, %rdx
+; SSE41-NEXT:    shlq $63, %rdx
+; SSE41-NEXT:    sarq $63, %rdx
+; SSE41-NEXT:    movd %edx, %xmm0
+; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $61, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $60, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $59, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $58, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $57, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
+; SSE41-NEXT:    movsbq %al, %rcx
+; SSE41-NEXT:    shrq $7, %rcx
+; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $55, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $54, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $53, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $52, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $51, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $50, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $49, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
+; SSE41-NEXT:    shrq $15, %rax
+; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_16i1_to_16i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    movswq (%rdi), %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $62, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    shlq $63, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm0
+; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $61, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $60, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $59, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $58, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $57, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movsbq %al, %rcx
+; AVX1-NEXT:    shrq $7, %rcx
+; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $55, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $54, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $53, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $52, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $51, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $50, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $49, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    shrq $15, %rax
+; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_16i1_to_16i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movswq (%rdi), %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $62, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    shlq $63, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vmovd %edx, %xmm0
+; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $61, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $59, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $58, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $57, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movsbq %al, %rcx
+; AVX2-NEXT:    shrq $7, %rcx
+; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $55, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $54, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $53, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $52, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $51, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $50, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $49, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    shrq $15, %rax
+; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_sext_16i1_to_16i8:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    kmovw (%rdi), %k1
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_16i1_to_16i8:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_16i1_to_16i8:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movswl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    movl %eax, %esi
+; X32-SSE2-NEXT:    movl %eax, %edi
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shrl $15, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm2
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shll $17, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    shll $18, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm1
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    shll $19, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $20, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm4
+; X32-SSE2-NEXT:    movl %eax, %esi
+; X32-SSE2-NEXT:    shll $21, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm6
+; X32-SSE2-NEXT:    movl %eax, %edi
+; X32-SSE2-NEXT:    shll $22, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm7
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shll $23, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm5
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $28, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm2
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    movsbl %al, %eax
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; X32-SSE2-NEXT:    shll $29, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; X32-SSE2-NEXT:    shll $30, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X32-SSE2-NEXT:    shll $31, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $26, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-SSE2-NEXT:    shll $27, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm3
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $25, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm1
+; X32-SSE2-NEXT:    shrl $7, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movswl (%eax), %eax
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $30, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    movl %eax, %edx
+; X32-SSE41-NEXT:    shll $31, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    movd %edx, %xmm0
+; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $29, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $28, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $27, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $26, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $25, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
+; X32-SSE41-NEXT:    movsbl %al, %ecx
+; X32-SSE41-NEXT:    shrl $7, %ecx
+; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $23, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $22, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $21, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $20, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $19, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $18, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $17, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
+; X32-SSE41-NEXT:    shrl $15, %eax
+; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <16 x i1>, <16 x i1>* %ptr
+ %Y = sext <16 x i1> %X to <16 x i8>
+ ret <16 x i8> %Y
+}
+
+define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
+; SSE2-LABEL: load_sext_16i1_to_16i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $15, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $14, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $13, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $12, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $11, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $10, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $9, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $8, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $7, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $6, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $5, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $4, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $3, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $2, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    shrl %eax
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psllw $15, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    psllw $15, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_16i1_to_16i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $15, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $14, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $13, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $12, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $11, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $10, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $9, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $8, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $7, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $6, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $5, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $4, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $3, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    shrl $2, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    movl %eax, %ecx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    shrl %eax
+; SSSE3-NEXT:    andl $1, %eax
+; SSSE3-NEXT:    movd %eax, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psllw $15, %xmm0
+; SSSE3-NEXT:    psraw $15, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    psllw $15, %xmm1
+; SSSE3-NEXT:    psraw $15, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_16i1_to_16i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    movl %eax, %edx
+; SSE41-NEXT:    andl $1, %edx
+; SSE41-NEXT:    movd %edx, %xmm1
+; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $2, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $3, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $4, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $5, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $6, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $7, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $9, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $10, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $11, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $12, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $13, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
+; SSE41-NEXT:    movl %eax, %ecx
+; SSE41-NEXT:    shrl $14, %ecx
+; SSE41-NEXT:    andl $1, %ecx
+; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
+; SSE41-NEXT:    shrl $15, %eax
+; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    psllw $15, %xmm0
+; SSE41-NEXT:    psraw $15, %xmm0
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    psllw $15, %xmm1
+; SSE41-NEXT:    psraw $15, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_16i1_to_16i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    movswq (%rdi), %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $55, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vmovd %ecx, %xmm0
+; AVX1-NEXT:    movq %rax, %r8
+; AVX1-NEXT:    movq %rax, %r10
+; AVX1-NEXT:    movq %rax, %r11
+; AVX1-NEXT:    movq %rax, %r14
+; AVX1-NEXT:    movq %rax, %r15
+; AVX1-NEXT:    movq %rax, %r9
+; AVX1-NEXT:    movq %rax, %r12
+; AVX1-NEXT:    movq %rax, %r13
+; AVX1-NEXT:    movq %rax, %rbx
+; AVX1-NEXT:    movq %rax, %rdi
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    movq %rax, %rsi
+; AVX1-NEXT:    movsbq %al, %rbp
+; AVX1-NEXT:    shlq $54, %rax
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    shlq $53, %r8
+; AVX1-NEXT:    sarq $63, %r8
+; AVX1-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
+; AVX1-NEXT:    shlq $52, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
+; AVX1-NEXT:    shlq $51, %r11
+; AVX1-NEXT:    sarq $63, %r11
+; AVX1-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
+; AVX1-NEXT:    shlq $50, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVX1-NEXT:    shlq $49, %r15
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
+; AVX1-NEXT:    shrq $15, %r9
+; AVX1-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
+; AVX1-NEXT:    shlq $63, %r13
+; AVX1-NEXT:    sarq $63, %r13
+; AVX1-NEXT:    vmovd %r13d, %xmm1
+; AVX1-NEXT:    shlq $62, %r12
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $61, %rbx
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $60, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $59, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $58, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $57, %rsi
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
+; AVX1-NEXT:    shrq $7, %rbp
+; AVX1-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_16i1_to_16i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    movswq (%rdi), %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $55, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vmovd %ecx, %xmm0
+; AVX2-NEXT:    movq %rax, %r8
+; AVX2-NEXT:    movq %rax, %r10
+; AVX2-NEXT:    movq %rax, %r11
+; AVX2-NEXT:    movq %rax, %r14
+; AVX2-NEXT:    movq %rax, %r15
+; AVX2-NEXT:    movq %rax, %r9
+; AVX2-NEXT:    movq %rax, %r12
+; AVX2-NEXT:    movq %rax, %r13
+; AVX2-NEXT:    movq %rax, %rbx
+; AVX2-NEXT:    movq %rax, %rdi
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    movq %rax, %rsi
+; AVX2-NEXT:    movsbq %al, %rbp
+; AVX2-NEXT:    shlq $54, %rax
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    shlq $53, %r8
+; AVX2-NEXT:    sarq $63, %r8
+; AVX2-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
+; AVX2-NEXT:    shlq $52, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
+; AVX2-NEXT:    shlq $51, %r11
+; AVX2-NEXT:    sarq $63, %r11
+; AVX2-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
+; AVX2-NEXT:    shlq $50, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVX2-NEXT:    shlq $49, %r15
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
+; AVX2-NEXT:    shrq $15, %r9
+; AVX2-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
+; AVX2-NEXT:    shlq $63, %r13
+; AVX2-NEXT:    sarq $63, %r13
+; AVX2-NEXT:    vmovd %r13d, %xmm1
+; AVX2-NEXT:    shlq $62, %r12
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $61, %rbx
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $60, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $59, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $58, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $57, %rsi
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
+; AVX2-NEXT:    shrq $7, %rbp
+; AVX2-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_sext_16i1_to_16i16:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    kmovw (%rdi), %k1
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_16i1_to_16i16:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_16i1_to_16i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzwl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $15, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $14, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $13, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $12, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $11, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $10, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $9, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $8, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $7, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $6, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $5, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $4, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $3, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $2, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    shrl %eax
+; X32-SSE2-NEXT:    andl $1, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm4
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psllw $15, %xmm0
+; X32-SSE2-NEXT:    psraw $15, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    psllw $15, %xmm1
+; X32-SSE2-NEXT:    psraw $15, %xmm1
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movzwl (%eax), %eax
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    movl %eax, %edx
+; X32-SSE41-NEXT:    andl $1, %edx
+; X32-SSE41-NEXT:    movd %edx, %xmm1
+; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $2, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $3, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $4, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $5, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $6, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $7, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $8, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $9, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $10, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $11, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $12, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $13, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shrl $14, %ecx
+; X32-SSE41-NEXT:    andl $1, %ecx
+; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
+; X32-SSE41-NEXT:    shrl $15, %eax
+; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
+; X32-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X32-SSE41-NEXT:    psllw $15, %xmm0
+; X32-SSE41-NEXT:    psraw $15, %xmm0
+; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; X32-SSE41-NEXT:    psllw $15, %xmm1
+; X32-SSE41-NEXT:    psraw $15, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <16 x i1>, <16 x i1>* %ptr
+ %Y = sext <16 x i1> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
+define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
+; SSE2-LABEL: load_sext_32i1_to_32i8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movswq (%rdi), %rax
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    movq %rax, %r8
+; SSE2-NEXT:    movq %rax, %r9
+; SSE2-NEXT:    movq %rax, %r11
+; SSE2-NEXT:    movq %rax, %r14
+; SSE2-NEXT:    movq %rax, %r15
+; SSE2-NEXT:    movq %rax, %r12
+; SSE2-NEXT:    movq %rax, %r13
+; SSE2-NEXT:    movq %rax, %rdx
+; SSE2-NEXT:    movq %rax, %rsi
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    movq %rax, %rbp
+; SSE2-NEXT:    movq %rax, %rbx
+; SSE2-NEXT:    shrq $15, %rbx
+; SSE2-NEXT:    movd %ebx, %xmm0
+; SSE2-NEXT:    movq %rax, %rbx
+; SSE2-NEXT:    shlq $49, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    movd %r10d, %xmm15
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT:    shlq $50, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    movd %r8d, %xmm8
+; SSE2-NEXT:    shlq $51, %r9
+; SSE2-NEXT:    sarq $63, %r9
+; SSE2-NEXT:    movd %r9d, %xmm3
+; SSE2-NEXT:    shlq $52, %r11
+; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movd %r11d, %xmm9
+; SSE2-NEXT:    shlq $53, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    movd %r14d, %xmm6
+; SSE2-NEXT:    shlq $54, %r15
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    movd %r15d, %xmm10
+; SSE2-NEXT:    shlq $55, %r12
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    movd %r12d, %xmm2
+; SSE2-NEXT:    shlq $60, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movd %r13d, %xmm11
+; SSE2-NEXT:    shlq $61, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    movd %edx, %xmm5
+; SSE2-NEXT:    shlq $62, %rsi
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    movd %esi, %xmm12
+; SSE2-NEXT:    shlq $63, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    shlq $58, %rbp
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    movd %ebp, %xmm13
+; SSE2-NEXT:    shlq $59, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movd %ebx, %xmm7
+; SSE2-NEXT:    shlq $57, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    movd %r10d, %xmm4
+; SSE2-NEXT:    shrq $7, %rax
+; SSE2-NEXT:    movd %eax, %xmm14
+; SSE2-NEXT:    movswq 2(%rdi), %rsi
+; SSE2-NEXT:    movq %rsi, %r8
+; SSE2-NEXT:    movq %rsi, %r9
+; SSE2-NEXT:    movq %rsi, %r10
+; SSE2-NEXT:    movq %rsi, %r11
+; SSE2-NEXT:    movq %rsi, %r14
+; SSE2-NEXT:    movq %rsi, %r15
+; SSE2-NEXT:    movq %rsi, %r12
+; SSE2-NEXT:    movq %rsi, %r13
+; SSE2-NEXT:    movq %rsi, %rbx
+; SSE2-NEXT:    movq %rsi, %rax
+; SSE2-NEXT:    movq %rsi, %rcx
+; SSE2-NEXT:    movq %rsi, %rdx
+; SSE2-NEXT:    movq %rsi, %rdi
+; SSE2-NEXT:    movq %rsi, %rbp
+; SSE2-NEXT:    shrq $15, %rbp
+; SSE2-NEXT:    movd %ebp, %xmm1
+; SSE2-NEXT:    movq %rsi, %rbp
+; SSE2-NEXT:    movsbq %sil, %rsi
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-NEXT:    shlq $49, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    movd %r8d, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT:    shlq $50, %r9
+; SSE2-NEXT:    sarq $63, %r9
+; SSE2-NEXT:    movd %r9d, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE2-NEXT:    shlq $51, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    movd %r10d, %xmm5
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    shlq $52, %r11
+; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movd %r11d, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    shlq $53, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    movd %r14d, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT:    shlq $54, %r15
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    movd %r15d, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE2-NEXT:    shlq $55, %r12
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    movd %r12d, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    shlq $60, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movd %r13d, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    shlq $61, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movd %ebx, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT:    shlq $62, %rax
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    movd %eax, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT:    shlq $63, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT:    shlq $58, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    shlq $59, %rdi
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    movd %edi, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT:    shlq $57, %rbp
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    movd %ebp, %xmm2
+; SSE2-NEXT:    shrq $7, %rsi
+; SSE2-NEXT:    movd %esi, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_32i1_to_32i8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pushq %rbp
+; SSSE3-NEXT:    pushq %r15
+; SSSE3-NEXT:    pushq %r14
+; SSSE3-NEXT:    pushq %r13
+; SSSE3-NEXT:    pushq %r12
+; SSSE3-NEXT:    pushq %rbx
+; SSSE3-NEXT:    movswq (%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %r10
+; SSSE3-NEXT:    movq %rax, %r8
+; SSSE3-NEXT:    movq %rax, %r9
+; SSSE3-NEXT:    movq %rax, %r11
+; SSSE3-NEXT:    movq %rax, %r14
+; SSSE3-NEXT:    movq %rax, %r15
+; SSSE3-NEXT:    movq %rax, %r12
+; SSSE3-NEXT:    movq %rax, %r13
+; SSSE3-NEXT:    movq %rax, %rdx
+; SSSE3-NEXT:    movq %rax, %rsi
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    movq %rax, %rbp
+; SSSE3-NEXT:    movq %rax, %rbx
+; SSSE3-NEXT:    shrq $15, %rbx
+; SSSE3-NEXT:    movd %ebx, %xmm0
+; SSSE3-NEXT:    movq %rax, %rbx
+; SSSE3-NEXT:    shlq $49, %r10
+; SSSE3-NEXT:    sarq $63, %r10
+; SSSE3-NEXT:    movd %r10d, %xmm15
+; SSSE3-NEXT:    movq %rax, %r10
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSSE3-NEXT:    shlq $50, %r8
+; SSSE3-NEXT:    sarq $63, %r8
+; SSSE3-NEXT:    movd %r8d, %xmm8
+; SSSE3-NEXT:    shlq $51, %r9
+; SSSE3-NEXT:    sarq $63, %r9
+; SSSE3-NEXT:    movd %r9d, %xmm3
+; SSSE3-NEXT:    shlq $52, %r11
+; SSSE3-NEXT:    sarq $63, %r11
+; SSSE3-NEXT:    movd %r11d, %xmm9
+; SSSE3-NEXT:    shlq $53, %r14
+; SSSE3-NEXT:    sarq $63, %r14
+; SSSE3-NEXT:    movd %r14d, %xmm6
+; SSSE3-NEXT:    shlq $54, %r15
+; SSSE3-NEXT:    sarq $63, %r15
+; SSSE3-NEXT:    movd %r15d, %xmm10
+; SSSE3-NEXT:    shlq $55, %r12
+; SSSE3-NEXT:    sarq $63, %r12
+; SSSE3-NEXT:    movd %r12d, %xmm2
+; SSSE3-NEXT:    shlq $60, %r13
+; SSSE3-NEXT:    sarq $63, %r13
+; SSSE3-NEXT:    movd %r13d, %xmm11
+; SSSE3-NEXT:    shlq $61, %rdx
+; SSSE3-NEXT:    sarq $63, %rdx
+; SSSE3-NEXT:    movd %edx, %xmm5
+; SSSE3-NEXT:    shlq $62, %rsi
+; SSSE3-NEXT:    sarq $63, %rsi
+; SSSE3-NEXT:    movd %esi, %xmm12
+; SSSE3-NEXT:    shlq $63, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    shlq $58, %rbp
+; SSSE3-NEXT:    sarq $63, %rbp
+; SSSE3-NEXT:    movd %ebp, %xmm13
+; SSSE3-NEXT:    shlq $59, %rbx
+; SSSE3-NEXT:    sarq $63, %rbx
+; SSSE3-NEXT:    movd %ebx, %xmm7
+; SSSE3-NEXT:    shlq $57, %r10
+; SSSE3-NEXT:    sarq $63, %r10
+; SSSE3-NEXT:    movd %r10d, %xmm4
+; SSSE3-NEXT:    shrq $7, %rax
+; SSSE3-NEXT:    movd %eax, %xmm14
+; SSSE3-NEXT:    movswq 2(%rdi), %rsi
+; SSSE3-NEXT:    movq %rsi, %r8
+; SSSE3-NEXT:    movq %rsi, %r9
+; SSSE3-NEXT:    movq %rsi, %r10
+; SSSE3-NEXT:    movq %rsi, %r11
+; SSSE3-NEXT:    movq %rsi, %r14
+; SSSE3-NEXT:    movq %rsi, %r15
+; SSSE3-NEXT:    movq %rsi, %r12
+; SSSE3-NEXT:    movq %rsi, %r13
+; SSSE3-NEXT:    movq %rsi, %rbx
+; SSSE3-NEXT:    movq %rsi, %rax
+; SSSE3-NEXT:    movq %rsi, %rcx
+; SSSE3-NEXT:    movq %rsi, %rdx
+; SSSE3-NEXT:    movq %rsi, %rdi
+; SSSE3-NEXT:    movq %rsi, %rbp
+; SSSE3-NEXT:    shrq $15, %rbp
+; SSSE3-NEXT:    movd %ebp, %xmm1
+; SSSE3-NEXT:    movq %rsi, %rbp
+; SSSE3-NEXT:    movsbq %sil, %rsi
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSSE3-NEXT:    shlq $49, %r8
+; SSSE3-NEXT:    sarq $63, %r8
+; SSSE3-NEXT:    movd %r8d, %xmm3
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSSE3-NEXT:    shlq $50, %r9
+; SSSE3-NEXT:    sarq $63, %r9
+; SSSE3-NEXT:    movd %r9d, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSSE3-NEXT:    shlq $51, %r10
+; SSSE3-NEXT:    sarq $63, %r10
+; SSSE3-NEXT:    movd %r10d, %xmm5
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT:    shlq $52, %r11
+; SSSE3-NEXT:    sarq $63, %r11
+; SSSE3-NEXT:    movd %r11d, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSSE3-NEXT:    shlq $53, %r14
+; SSSE3-NEXT:    sarq $63, %r14
+; SSSE3-NEXT:    movd %r14d, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSSE3-NEXT:    shlq $54, %r15
+; SSSE3-NEXT:    sarq $63, %r15
+; SSSE3-NEXT:    movd %r15d, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSSE3-NEXT:    shlq $55, %r12
+; SSSE3-NEXT:    sarq $63, %r12
+; SSSE3-NEXT:    movd %r12d, %xmm3
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    shlq $60, %r13
+; SSSE3-NEXT:    sarq $63, %r13
+; SSSE3-NEXT:    movd %r13d, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT:    shlq $61, %rbx
+; SSSE3-NEXT:    sarq $63, %rbx
+; SSSE3-NEXT:    movd %ebx, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSSE3-NEXT:    shlq $62, %rax
+; SSSE3-NEXT:    sarq $63, %rax
+; SSSE3-NEXT:    movd %eax, %xmm6
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSSE3-NEXT:    shlq $63, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSSE3-NEXT:    shlq $58, %rdx
+; SSSE3-NEXT:    sarq $63, %rdx
+; SSSE3-NEXT:    movd %edx, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSSE3-NEXT:    shlq $59, %rdi
+; SSSE3-NEXT:    sarq $63, %rdi
+; SSSE3-NEXT:    movd %edi, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSSE3-NEXT:    shlq $57, %rbp
+; SSSE3-NEXT:    sarq $63, %rbp
+; SSSE3-NEXT:    movd %ebp, %xmm2
+; SSSE3-NEXT:    shrq $7, %rsi
+; SSSE3-NEXT:    movd %esi, %xmm5
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSSE3-NEXT:    popq %rbx
+; SSSE3-NEXT:    popq %r12
+; SSSE3-NEXT:    popq %r13
+; SSSE3-NEXT:    popq %r14
+; SSSE3-NEXT:    popq %r15
+; SSSE3-NEXT:    popq %rbp
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_32i1_to_32i8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movswq (%rdi), %rax
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $62, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    movq %rax, %rdx
+; SSE41-NEXT:    shlq $63, %rdx
+; SSE41-NEXT:    sarq $63, %rdx
+; SSE41-NEXT:    movd %edx, %xmm0
+; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $61, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $60, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $59, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $58, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $57, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
+; SSE41-NEXT:    movsbq %al, %rcx
+; SSE41-NEXT:    shrq $7, %rcx
+; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $55, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $54, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $53, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $52, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $51, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $50, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $49, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
+; SSE41-NEXT:    shrq $15, %rax
+; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
+; SSE41-NEXT:    movswq 2(%rdi), %rax
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $62, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    movq %rax, %rdx
+; SSE41-NEXT:    shlq $63, %rdx
+; SSE41-NEXT:    sarq $63, %rdx
+; SSE41-NEXT:    movd %edx, %xmm1
+; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $61, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $60, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $59, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $58, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $57, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
+; SSE41-NEXT:    movsbq %al, %rcx
+; SSE41-NEXT:    shrq $7, %rcx
+; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $55, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $54, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $53, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $52, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $51, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $50, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    shlq $49, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
+; SSE41-NEXT:    shrq $15, %rax
+; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_32i1_to_32i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    movslq (%rdi), %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $47, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vmovd %ecx, %xmm0
+; AVX1-NEXT:    movq %rax, %r8
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    movq %rax, %rdi
+; AVX1-NEXT:    movq %rax, %r13
+; AVX1-NEXT:    movq %rax, %rsi
+; AVX1-NEXT:    movq %rax, %r10
+; AVX1-NEXT:    movq %rax, %r11
+; AVX1-NEXT:    movq %rax, %r9
+; AVX1-NEXT:    movq %rax, %rbx
+; AVX1-NEXT:    movq %rax, %r14
+; AVX1-NEXT:    movq %rax, %r15
+; AVX1-NEXT:    movq %rax, %r12
+; AVX1-NEXT:    movq %rax, %rbp
+; AVX1-NEXT:    shlq $46, %rbp
+; AVX1-NEXT:    sarq $63, %rbp
+; AVX1-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rbp
+; AVX1-NEXT:    shlq $45, %r8
+; AVX1-NEXT:    sarq $63, %r8
+; AVX1-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %r8
+; AVX1-NEXT:    shlq $44, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    shlq $43, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shlq $42, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rdi
+; AVX1-NEXT:    shlq $41, %r13
+; AVX1-NEXT:    sarq $63, %r13
+; AVX1-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %r13
+; AVX1-NEXT:    shlq $40, %rsi
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rsi
+; AVX1-NEXT:    shlq $39, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %r10
+; AVX1-NEXT:    shlq $38, %r11
+; AVX1-NEXT:    sarq $63, %r11
+; AVX1-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX1-NEXT:    movsbq %al, %r11
+; AVX1-NEXT:    shlq $37, %r9
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %r9
+; AVX1-NEXT:    shlq $36, %rbx
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rbx
+; AVX1-NEXT:    shlq $35, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %r14
+; AVX1-NEXT:    shlq $34, %r15
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %r15
+; AVX1-NEXT:    shlq $33, %r12
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %r12
+; AVX1-NEXT:    shrq $31, %rbp
+; AVX1-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rbp
+; AVX1-NEXT:    shlq $63, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm1
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    movswq %ax, %rax
+; AVX1-NEXT:    shlq $62, %r8
+; AVX1-NEXT:    sarq $63, %r8
+; AVX1-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $61, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $60, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $59, %r13
+; AVX1-NEXT:    sarq $63, %r13
+; AVX1-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $58, %rsi
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $57, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX1-NEXT:    shrq $7, %r11
+; AVX1-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $55, %r9
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $54, %rbx
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $53, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $52, %r15
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $51, %r12
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $50, %rbp
+; AVX1-NEXT:    sarq $63, %rbp
+; AVX1-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
+; AVX1-NEXT:    shlq $49, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
+; AVX1-NEXT:    shrq $15, %rax
+; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_32i1_to_32i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    movslq (%rdi), %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $47, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vmovd %ecx, %xmm0
+; AVX2-NEXT:    movq %rax, %r8
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    movq %rax, %rdi
+; AVX2-NEXT:    movq %rax, %r13
+; AVX2-NEXT:    movq %rax, %rsi
+; AVX2-NEXT:    movq %rax, %r10
+; AVX2-NEXT:    movq %rax, %r11
+; AVX2-NEXT:    movq %rax, %r9
+; AVX2-NEXT:    movq %rax, %rbx
+; AVX2-NEXT:    movq %rax, %r14
+; AVX2-NEXT:    movq %rax, %r15
+; AVX2-NEXT:    movq %rax, %r12
+; AVX2-NEXT:    movq %rax, %rbp
+; AVX2-NEXT:    shlq $46, %rbp
+; AVX2-NEXT:    sarq $63, %rbp
+; AVX2-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rbp
+; AVX2-NEXT:    shlq $45, %r8
+; AVX2-NEXT:    sarq $63, %r8
+; AVX2-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %r8
+; AVX2-NEXT:    shlq $44, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    shlq $43, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shlq $42, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rdi
+; AVX2-NEXT:    shlq $41, %r13
+; AVX2-NEXT:    sarq $63, %r13
+; AVX2-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %r13
+; AVX2-NEXT:    shlq $40, %rsi
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rsi
+; AVX2-NEXT:    shlq $39, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %r10
+; AVX2-NEXT:    shlq $38, %r11
+; AVX2-NEXT:    sarq $63, %r11
+; AVX2-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX2-NEXT:    movsbq %al, %r11
+; AVX2-NEXT:    shlq $37, %r9
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %r9
+; AVX2-NEXT:    shlq $36, %rbx
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rbx
+; AVX2-NEXT:    shlq $35, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %r14
+; AVX2-NEXT:    shlq $34, %r15
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %r15
+; AVX2-NEXT:    shlq $33, %r12
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %r12
+; AVX2-NEXT:    shrq $31, %rbp
+; AVX2-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rbp
+; AVX2-NEXT:    shlq $63, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vmovd %edx, %xmm1
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    movswq %ax, %rax
+; AVX2-NEXT:    shlq $62, %r8
+; AVX2-NEXT:    sarq $63, %r8
+; AVX2-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $61, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $60, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $59, %r13
+; AVX2-NEXT:    sarq $63, %r13
+; AVX2-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $58, %rsi
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $57, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX2-NEXT:    shrq $7, %r11
+; AVX2-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $55, %r9
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $54, %rbx
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $53, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $52, %r15
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $51, %r12
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $50, %rbp
+; AVX2-NEXT:    sarq $63, %rbp
+; AVX2-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
+; AVX2-NEXT:    shlq $49, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
+; AVX2-NEXT:    shrq $15, %rax
+; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: load_sext_32i1_to_32i8:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    kmovw (%rdi), %k1
+; AVX512F-NEXT:    kmovw 2(%rdi), %k2
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_32i1_to_32i8:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_32i1_to_32i8:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $28, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movswl (%eax), %edx
+; X32-SSE2-NEXT:    movl %edx, %ebp
+; X32-SSE2-NEXT:    movl %edx, %esi
+; X32-SSE2-NEXT:    movl %edx, %edi
+; X32-SSE2-NEXT:    movl %edx, %ebx
+; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    shrl $15, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    shll $17, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm4
+; X32-SSE2-NEXT:    movl %edx, %ebp
+; X32-SSE2-NEXT:    shll $18, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm1
+; X32-SSE2-NEXT:    movl %edx, %esi
+; X32-SSE2-NEXT:    shll $19, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm2
+; X32-SSE2-NEXT:    movl %edx, %edi
+; X32-SSE2-NEXT:    shll $20, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm5
+; X32-SSE2-NEXT:    movl %edx, %ebx
+; X32-SSE2-NEXT:    shll $21, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm6
+; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    shll $22, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm7
+; X32-SSE2-NEXT:    movl %edx, %ebp
+; X32-SSE2-NEXT:    shll $23, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm3
+; X32-SSE2-NEXT:    movl %edx, %esi
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; X32-SSE2-NEXT:    shll $28, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm0
+; X32-SSE2-NEXT:    movl %edx, %edi
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-SSE2-NEXT:    shll $29, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm1
+; X32-SSE2-NEXT:    movl %edx, %ebx
+; X32-SSE2-NEXT:    movsbl %dl, %edx
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    shll $30, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    shll $31, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $26, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm7
+; X32-SSE2-NEXT:    shll $27, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm2
+; X32-SSE2-NEXT:    shll $25, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm6
+; X32-SSE2-NEXT:    shrl $7, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm5
+; X32-SSE2-NEXT:    movswl 2(%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    movl %eax, %esi
+; X32-SSE2-NEXT:    movl %eax, %edi
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shrl $15, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm4
+; X32-SSE2-NEXT:    movdqu %xmm4, (%esp) # 16-byte Spill
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-SSE2-NEXT:    shll $17, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm4
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; X32-SSE2-NEXT:    shll $18, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm7
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; X32-SSE2-NEXT:    shll $19, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm5
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; X32-SSE2-NEXT:    shll $20, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm6
+; X32-SSE2-NEXT:    movl %eax, %esi
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    shll $21, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm1
+; X32-SSE2-NEXT:    movl %eax, %edi
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; X32-SSE2-NEXT:    shll $22, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm3
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    movdqu (%esp), %xmm2 # 16-byte Reload
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $23, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm2
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; X32-SSE2-NEXT:    shll $28, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm7
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    movsbl %al, %eax
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; X32-SSE2-NEXT:    shll $29, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE2-NEXT:    shll $30, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; X32-SSE2-NEXT:    shll $31, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; X32-SSE2-NEXT:    shll $26, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm5
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-SSE2-NEXT:    shll $27, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm3
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; X32-SSE2-NEXT:    shll $25, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm4
+; X32-SSE2-NEXT:    shrl $7, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm5
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE2-NEXT:    addl $28, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pushl %esi
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movswl (%eax), %ecx
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $30, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    movl %ecx, %esi
+; X32-SSE41-NEXT:    shll $31, %esi
+; X32-SSE41-NEXT:    sarl $31, %esi
+; X32-SSE41-NEXT:    movd %esi, %xmm0
+; X32-SSE41-NEXT:    pinsrb $1, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $29, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $2, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $28, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $3, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $27, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $4, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $26, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $5, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $25, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $6, %edx, %xmm0
+; X32-SSE41-NEXT:    movsbl %cl, %edx
+; X32-SSE41-NEXT:    shrl $7, %edx
+; X32-SSE41-NEXT:    pinsrb $7, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $23, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $8, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $22, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $9, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $21, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $10, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $20, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $11, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $19, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $12, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $18, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $13, %edx, %xmm0
+; X32-SSE41-NEXT:    movl %ecx, %edx
+; X32-SSE41-NEXT:    shll $17, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    pinsrb $14, %edx, %xmm0
+; X32-SSE41-NEXT:    shrl $15, %ecx
+; X32-SSE41-NEXT:    pinsrb $15, %ecx, %xmm0
+; X32-SSE41-NEXT:    movswl 2(%eax), %eax
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $30, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    movl %eax, %edx
+; X32-SSE41-NEXT:    shll $31, %edx
+; X32-SSE41-NEXT:    sarl $31, %edx
+; X32-SSE41-NEXT:    movd %edx, %xmm1
+; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $29, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $28, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $27, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $26, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $25, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
+; X32-SSE41-NEXT:    movsbl %al, %ecx
+; X32-SSE41-NEXT:    shrl $7, %ecx
+; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $23, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $22, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $21, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $20, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $19, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $18, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
+; X32-SSE41-NEXT:    movl %eax, %ecx
+; X32-SSE41-NEXT:    shll $17, %ecx
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
+; X32-SSE41-NEXT:    shrl $15, %eax
+; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
+; X32-SSE41-NEXT:    popl %esi
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <32 x i1>, <32 x i1>* %ptr
+ %Y = sext <32 x i1> %X to <32 x i8>
+ ret <32 x i8> %Y
+}
+
+define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_16i8_to_16i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_16i8_to_16i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    psraw $8, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_16i8_to_16i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_16i8_to_16i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_16i8_to_16i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_16i8_to_16i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxbw (%rdi), %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_16i8_to_16i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movdqa (%eax), %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE2-NEXT:    psraw $8, %xmm0
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE2-NEXT:    psraw $8, %xmm1
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <16 x i8>, <16 x i8>* %ptr
+ %Y = sext <16 x i8> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
+define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_2i16_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_2i16_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_2i16_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_2i16_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_2i16_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i16>, <2 x i16>* %ptr
+ %Y = sext <2 x i16> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_4i16_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i16_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i16_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_4i16_to_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_4i16_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i16>, <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i32>
+ ret <4 x i32> %Y
+}
+
+define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_4i16_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i16_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i16_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i16_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxwq 4(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxwq (%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i16_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_4i16_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxwq (%rdi), %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_4i16_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i16>, <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
+
+define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_8i16_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_8i16_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_8i16_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_8i16_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_8i16_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_8i16_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_8i16_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movdqa (%eax), %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxwd 8(%eax), %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <8 x i16>, <8 x i16>* %ptr
+ %Y = sext <8 x i16> %X to <8 x i32>
+ ret <8 x i32> %Y
+}
+
+define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
+; SSE2-LABEL: load_sext_2i32_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_2i32_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_2i32_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_2i32_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_2i32_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i32>, <2 x i32>* %ptr
+ %Y = sext <2 x i32> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
+; SSE2-LABEL: load_sext_4i32_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i32_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i32_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxdq 8(%rdi), %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i32_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovsxdq 8(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxdq (%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i32_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovsxdq (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_4i32_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovsxdq (%rdi), %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: load_sext_4i32_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movdqa (%eax), %xmm0
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxdq 8(%eax), %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i32>, <4 x i32>* %ptr
+ %Y = sext <4 x i32> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
+
+define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_2i8_to_i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_2i8_to_i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    movd %xmm0, %eax
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_2i8_to_i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sext_2i8_to_i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_2i8_to_i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psraw $8, %xmm0
+; X32-SSE2-NEXT:    movd %xmm0, %eax
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_2i8_to_i32:
+; X32-SSE41:       # %bb.0: # %entry
+; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
+; X32-SSE41-NEXT:    movd %xmm0, %eax
+; X32-SSE41-NEXT:    retl
+entry:
+  %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %Ex = sext <2 x i8> %Shuf to <2 x i16>
+  %Bc = bitcast <2 x i16> %Ex to i32
+  ret i32 %Bc
+}
+
+define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
+; SSE2-LABEL: sext_4i1_to_4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $31, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i1_to_4i64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pslld $31, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i1_to_4i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i1_to_4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i1_to_4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_4i1_to_4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_4i1_to_4i64:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pslld $31, %xmm0
+; X32-SSE2-NEXT:    psrad $31, %xmm0
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_4i1_to_4i64:
+; X32-SSE41:       # %bb.0:
+; X32-SSE41-NEXT:    pslld $31, %xmm0
+; X32-SSE41-NEXT:    psrad $31, %xmm0
+; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+  %extmask = sext <4 x i1> %mask to <4 x i64>
+  ret <4 x i64> %extmask
+}
+
+define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
+; SSE2-LABEL: sext_4i8_to_4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i8_to_4i64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i8_to_4i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i8_to_4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i8_to_4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sext_4i8_to_4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_4i8_to_4i64:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_4i8_to_4i64:
+; X32-SSE41:       # %bb.0:
+; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
+; X32-SSE41-NEXT:    psrld $16, %xmm0
+; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+  %extmask = sext <4 x i8> %mask to <4 x i64>
+  ret <4 x i64> %extmask
+}
+
+define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
+; SSE-LABEL: sext_32xi1_to_32xi8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pcmpeqw %xmm5, %xmm1
+; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
+; SSE-NEXT:    packsswb %xmm1, %xmm0
+; SSE-NEXT:    pcmpeqw %xmm7, %xmm3
+; SSE-NEXT:    pcmpeqw %xmm6, %xmm2
+; SSE-NEXT:    packsswb %xmm3, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: sext_32xi1_to_32xi8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_32xi1_to_32xi8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sext_32xi1_to_32xi8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: sext_32xi1_to_32xi8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_32xi1_to_32xi8:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    movl %esp, %ebp
+; X32-SSE2-NEXT:    andl $-16, %esp
+; X32-SSE2-NEXT:    subl $16, %esp
+; X32-SSE2-NEXT:    movdqa 8(%ebp), %xmm3
+; X32-SSE2-NEXT:    pcmpeqw 40(%ebp), %xmm1
+; X32-SSE2-NEXT:    pcmpeqw 24(%ebp), %xmm0
+; X32-SSE2-NEXT:    packsswb %xmm1, %xmm0
+; X32-SSE2-NEXT:    pcmpeqw 72(%ebp), %xmm3
+; X32-SSE2-NEXT:    pcmpeqw 56(%ebp), %xmm2
+; X32-SSE2-NEXT:    packsswb %xmm3, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE2-NEXT:    movl %ebp, %esp
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_32xi1_to_32xi8:
+; X32-SSE41:       # %bb.0:
+; X32-SSE41-NEXT:    pushl %ebp
+; X32-SSE41-NEXT:    movl %esp, %ebp
+; X32-SSE41-NEXT:    andl $-16, %esp
+; X32-SSE41-NEXT:    subl $16, %esp
+; X32-SSE41-NEXT:    movdqa 8(%ebp), %xmm3
+; X32-SSE41-NEXT:    pcmpeqw 40(%ebp), %xmm1
+; X32-SSE41-NEXT:    pcmpeqw 24(%ebp), %xmm0
+; X32-SSE41-NEXT:    packsswb %xmm1, %xmm0
+; X32-SSE41-NEXT:    pcmpeqw 72(%ebp), %xmm3
+; X32-SSE41-NEXT:    pcmpeqw 56(%ebp), %xmm2
+; X32-SSE41-NEXT:    packsswb %xmm3, %xmm2
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE41-NEXT:    movl %ebp, %esp
+; X32-SSE41-NEXT:    popl %ebp
+; X32-SSE41-NEXT:    retl
+  %a = icmp eq <32 x i16> %c1, %c2
+  %b = sext <32 x i1> %a to <32 x i8>
+  ret <32 x i8> %b
+}
+
+define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
+; SSE2-LABEL: sext_2i8_to_2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_2i8_to_2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    paddd %xmm0, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_2i8_to_2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT:    paddd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: sext_2i8_to_2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: sext_2i8_to_2i32:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzwl (%eax), %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    paddd %xmm0, %xmm0
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE41-LABEL: sext_2i8_to_2i32:
+; X32-SSE41:       # %bb.0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movzwl (%eax), %eax
+; X32-SSE41-NEXT:    movd %eax, %xmm0
+; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
+; X32-SSE41-NEXT:    paddd %xmm0, %xmm0
+; X32-SSE41-NEXT:    retl
+  %x = load <2 x i8>, <2 x i8>* %addr, align 1
+  %y = sext <2 x i8> %x to <2 x i32>
+  %z = add <2 x i32>%y, %y
+  ret <2 x i32>%z
+}
+
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index 9576f6482fd7a71dfba192109ab274261f592abc..42fd2d1c74827dd5842281781b76c1327954370d 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -7,7 +7,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
 ;
-; Just one 32-bit run to make sure we do reasonable things there.
+; Just two 32-bit runs to make sure we do reasonable things there.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
 
 define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
@@ -33,6 +34,12 @@ define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp
 ; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i8_to_8i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psraw $8, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i8_to_8i16:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
@@ -46,20 +53,20 @@ entry:
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_16i16:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_16i16:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    psraw $8, %xmm0
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    psraw $8, %xmm2
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSSE3-NEXT:    psraw $8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_16i16:
@@ -88,6 +95,15 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
 ; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i8_to_16i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    psraw $8, %xmm2
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; X32-SSE2-NEXT:    psraw $8, %xmm1
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i8_to_16i16:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
@@ -103,32 +119,30 @@ entry:
 define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_32i8_to_32i16:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    psraw $8, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
 ; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_32i8_to_32i16:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    psraw $8, %xmm0
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT:    psraw $8, %xmm1
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT:    psraw $8, %xmm4
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSSE3-NEXT:    psraw $8, %xmm5
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSSE3-NEXT:    psraw $8, %xmm2
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
 ; SSSE3-NEXT:    psraw $8, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_32i8_to_32i16:
@@ -178,6 +192,20 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_32i8_to_32i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; X32-SSE2-NEXT:    psraw $8, %xmm4
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; X32-SSE2-NEXT:    psraw $8, %xmm5
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-SSE2-NEXT:    psraw $8, %xmm2
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; X32-SSE2-NEXT:    psraw $8, %xmm3
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_32i8_to_32i16:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
@@ -219,6 +247,13 @@ define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp
 ; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i8_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i8_to_4i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
@@ -276,6 +311,16 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
 ; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i8_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm2
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i8_to_8i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
@@ -361,6 +406,22 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
 ; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i8_to_16i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm4
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $24, %xmm2
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $24, %xmm3
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i8_to_16i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
@@ -382,8 +443,8 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $24, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
@@ -392,8 +453,8 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSSE3-NEXT:    psrad $24, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
@@ -408,6 +469,16 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i8_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i8_to_2i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
@@ -423,16 +494,15 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $24, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -440,16 +510,15 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSSE3-NEXT:    psrad $24, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSSE3-NEXT:    psrad $24, %xmm1
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -479,6 +548,22 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i8_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X32-SSE2-NEXT:    psrad $24, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i8_to_4i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
@@ -498,28 +583,26 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
 ; SSE2-NEXT:    psrad $24, %xmm4
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $24, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
 ; SSE2-NEXT:    psrad $24, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -528,28 +611,26 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
 ; SSSE3-NEXT:    psrad $24, %xmm4
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    psrad $24, %xmm1
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSSE3-NEXT:    psrad $24, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
 ; SSSE3-NEXT:    psrad $24, %xmm3
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -593,6 +674,34 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i8_to_8i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X32-SSE2-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; X32-SSE2-NEXT:    psrad $24, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X32-SSE2-NEXT:    psrad $24, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; X32-SSE2-NEXT:    pxor %xmm0, %xmm0
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; X32-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X32-SSE2-NEXT:    psrad $24, %xmm3
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i8_to_8i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
@@ -634,6 +743,12 @@ define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_8i16_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_8i16_to_4i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
@@ -689,6 +804,15 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_8i16_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm2
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_8i16_to_8i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
@@ -769,6 +893,20 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
 ; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_16i16_to_16i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm4
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; X32-SSE2-NEXT:    psrad $16, %xmm5
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE2-NEXT:    psrad $16, %xmm2
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X32-SSE2-NEXT:    psrad $16, %xmm3
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_16i16_to_16i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
@@ -789,8 +927,8 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; SSE2-LABEL: sext_8i16_to_2i64:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $16, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
@@ -798,8 +936,8 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; SSSE3-LABEL: sext_8i16_to_2i64:
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSSE3-NEXT:    psrad $16, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
@@ -814,6 +952,15 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_8i16_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_8i16_to_2i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
@@ -828,30 +975,30 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; SSE2-LABEL: sext_8i16_to_4i64:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $16, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_8i16_to_4i64:
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSSE3-NEXT:    psrad $16, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSSE3-NEXT:    psrad $16, %xmm1
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -881,6 +1028,21 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_8i16_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X32-SSE2-NEXT:    psrad $16, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_8i16_to_4i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
@@ -898,52 +1060,52 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; SSE2-LABEL: sext_8i16_to_8i64:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
 ; SSE2-NEXT:    psrad $16, %xmm4
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $16, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT:    psrad $16, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
 ; SSE2-NEXT:    psrad $16, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_8i16_to_8i64:
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
 ; SSSE3-NEXT:    psrad $16, %xmm4
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
 ; SSSE3-NEXT:    psrad $16, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSSE3-NEXT:    psrad $16, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
 ; SSSE3-NEXT:    psrad $16, %xmm3
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -986,6 +1148,32 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_8i16_to_8i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X32-SSE2-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; X32-SSE2-NEXT:    psrad $16, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X32-SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X32-SSE2-NEXT:    psrad $16, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
+; X32-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X32-SSE2-NEXT:    psrad $16, %xmm3
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; X32-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_8i16_to_8i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
@@ -1005,15 +1193,15 @@ entry:
 define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_4i32_to_2i64:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_4i32_to_2i64:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -1027,6 +1215,13 @@ define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_4i32_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_4i32_to_2i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
@@ -1040,23 +1235,23 @@ entry:
 define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_4i32_to_4i64:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_4i32_to_4i64:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -1086,6 +1281,17 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_4i32_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_4i32_to_4i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
@@ -1102,38 +1308,38 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; SSE2-LABEL: sext_8i32_to_8i64:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_8i32_to_8i64:
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    psrad $31, %xmm3
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -1176,6 +1382,25 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_8i32_to_8i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_8i32_to_8i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
@@ -1250,6 +1475,22 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_2i1_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $30, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; X32-SSE2-NEXT:    shll $31, %eax
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1277,8 +1518,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
 ; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $24, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
@@ -1289,8 +1530,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
 ; SSSE3-NEXT:    movd %eax, %xmm0
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSSE3-NEXT:    psrad $24, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
@@ -1305,6 +1546,19 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
 ; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_2i8_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzwl (%eax), %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1441,6 +1695,30 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_4i1_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $28, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $29, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $30, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    shll $31, %eax
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1494,6 +1772,15 @@ define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
 ; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_4i8_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1644,6 +1931,33 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_4i1_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $3, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $2, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    movd %eax, %xmm2
+; X32-SSE2-NEXT:    shrl %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X32-SSE2-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; X32-SSE2-NEXT:    psllq $63, %xmm0
+; X32-SSE2-NEXT:    psrad $31, %xmm0
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; X32-SSE2-NEXT:    psllq $63, %xmm1
+; X32-SSE2-NEXT:    psrad $31, %xmm1
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1727,6 +2041,33 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
 ; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_4i8_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movsbl 1(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    movsbl (%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT:    movsbl 3(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    movsbl 2(%eax), %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm1
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm3
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1784,6 +2125,22 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_4i8_to_4i64_extract:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movsbl 3(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    movsbl 2(%eax), %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
 ; X32-SSE41:       # %bb.0:
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -2004,6 +2361,49 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_8i1_to_8i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movsbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $7, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $25, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $26, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $27, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $28, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $29, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $30, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    shll $31, %eax
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -2070,6 +2470,14 @@ define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
 ; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_8i8_to_8i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psraw $8, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -2163,6 +2571,55 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
 ; AVX512-NEXT:    vpmovsxbq (%rdi), %zmm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_8i8_to_8i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movsbl 1(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    movsbl (%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT:    movsbl 3(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    movsbl 2(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE2-NEXT:    movsbl 5(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X32-SSE2-NEXT:    movsbl 4(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-SSE2-NEXT:    movsbl 7(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm4
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X32-SSE2-NEXT:    movsbl 6(%eax), %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm3
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm5
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -2398,6 +2855,53 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_8i1_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $7, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $6, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $5, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $4, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $3, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $2, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    shrl %eax
+; X32-SSE2-NEXT:    andl $1, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm3
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    pslld $31, %xmm0
+; X32-SSE2-NEXT:    psrad $31, %xmm0
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT:    pslld $31, %xmm1
+; X32-SSE2-NEXT:    psrad $31, %xmm1
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -2494,6 +2998,19 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
 ; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_8i8_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm1
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -2909,6 +3426,97 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_16i1_to_16i8:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movswl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    movl %eax, %esi
+; X32-SSE2-NEXT:    movl %eax, %edi
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shrl $15, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm2
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shll $17, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    shll $18, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm1
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    shll $19, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shll $20, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm4
+; X32-SSE2-NEXT:    movl %eax, %esi
+; X32-SSE2-NEXT:    shll $21, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm6
+; X32-SSE2-NEXT:    movl %eax, %edi
+; X32-SSE2-NEXT:    shll $22, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm7
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shll $23, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm5
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $28, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm2
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    movsbl %al, %eax
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; X32-SSE2-NEXT:    shll $29, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; X32-SSE2-NEXT:    shll $30, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X32-SSE2-NEXT:    shll $31, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $26, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-SSE2-NEXT:    shll $27, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm3
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $25, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm1
+; X32-SSE2-NEXT:    shrl $7, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3221,7 +3829,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT:    psllw $15, %xmm0
 ; SSE41-NEXT:    psraw $15, %xmm0
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; SSE41-NEXT:    psllw $15, %xmm1
 ; SSE41-NEXT:    psraw $15, %xmm1
 ; SSE41-NEXT:    retq
@@ -3434,6 +4042,93 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_16i1_to_16i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzwl (%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $15, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $14, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $13, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $12, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $11, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $10, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $9, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $8, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $7, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $6, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $5, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $4, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $3, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    shrl $2, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    andl $1, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    shrl %eax
+; X32-SSE2-NEXT:    andl $1, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm4
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psllw $15, %xmm0
+; X32-SSE2-NEXT:    psraw $15, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    psllw $15, %xmm1
+; X32-SSE2-NEXT:    psraw $15, %xmm1
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3502,7 +4197,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
 ; X32-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-SSE41-NEXT:    psllw $15, %xmm0
 ; X32-SSE41-NEXT:    psraw $15, %xmm0
-; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; X32-SSE41-NEXT:    psllw $15, %xmm1
 ; X32-SSE41-NEXT:    psraw $15, %xmm1
 ; X32-SSE41-NEXT:    retl
@@ -4289,6 +4984,179 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_32i1_to_32i8:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $28, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movswl (%eax), %edx
+; X32-SSE2-NEXT:    movl %edx, %ebp
+; X32-SSE2-NEXT:    movl %edx, %esi
+; X32-SSE2-NEXT:    movl %edx, %edi
+; X32-SSE2-NEXT:    movl %edx, %ebx
+; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    shrl $15, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    shll $17, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm4
+; X32-SSE2-NEXT:    movl %edx, %ebp
+; X32-SSE2-NEXT:    shll $18, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm1
+; X32-SSE2-NEXT:    movl %edx, %esi
+; X32-SSE2-NEXT:    shll $19, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm2
+; X32-SSE2-NEXT:    movl %edx, %edi
+; X32-SSE2-NEXT:    shll $20, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm5
+; X32-SSE2-NEXT:    movl %edx, %ebx
+; X32-SSE2-NEXT:    shll $21, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm6
+; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    shll $22, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm7
+; X32-SSE2-NEXT:    movl %edx, %ebp
+; X32-SSE2-NEXT:    shll $23, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm3
+; X32-SSE2-NEXT:    movl %edx, %esi
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; X32-SSE2-NEXT:    shll $28, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm0
+; X32-SSE2-NEXT:    movl %edx, %edi
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-SSE2-NEXT:    shll $29, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm1
+; X32-SSE2-NEXT:    movl %edx, %ebx
+; X32-SSE2-NEXT:    movsbl %dl, %edx
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE2-NEXT:    shll $30, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    shll $31, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $26, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm7
+; X32-SSE2-NEXT:    shll $27, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm2
+; X32-SSE2-NEXT:    shll $25, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm6
+; X32-SSE2-NEXT:    shrl $7, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm5
+; X32-SSE2-NEXT:    movswl 2(%eax), %eax
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    movl %eax, %esi
+; X32-SSE2-NEXT:    movl %eax, %edi
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shrl $15, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm4
+; X32-SSE2-NEXT:    movdqu %xmm4, (%esp) # 16-byte Spill
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-SSE2-NEXT:    shll $17, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm4
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; X32-SSE2-NEXT:    shll $18, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm7
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; X32-SSE2-NEXT:    shll $19, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm5
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; X32-SSE2-NEXT:    shll $20, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm6
+; X32-SSE2-NEXT:    movl %eax, %esi
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    shll $21, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm1
+; X32-SSE2-NEXT:    movl %eax, %edi
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; X32-SSE2-NEXT:    shll $22, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm3
+; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    movdqu (%esp), %xmm2 # 16-byte Reload
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X32-SSE2-NEXT:    shll $23, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm2
+; X32-SSE2-NEXT:    movl %eax, %edx
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; X32-SSE2-NEXT:    shll $28, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm7
+; X32-SSE2-NEXT:    movl %eax, %ebp
+; X32-SSE2-NEXT:    movsbl %al, %eax
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; X32-SSE2-NEXT:    shll $29, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm3
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE2-NEXT:    shll $30, %esi
+; X32-SSE2-NEXT:    sarl $31, %esi
+; X32-SSE2-NEXT:    movd %esi, %xmm4
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; X32-SSE2-NEXT:    shll $31, %edi
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    movd %edi, %xmm1
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; X32-SSE2-NEXT:    shll $26, %ebx
+; X32-SSE2-NEXT:    sarl $31, %ebx
+; X32-SSE2-NEXT:    movd %ebx, %xmm5
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-SSE2-NEXT:    shll $27, %edx
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movd %edx, %xmm3
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; X32-SSE2-NEXT:    shll $25, %ebp
+; X32-SSE2-NEXT:    sarl $31, %ebp
+; X32-SSE2-NEXT:    movd %ebp, %xmm4
+; X32-SSE2-NEXT:    shrl $7, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm5
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE2-NEXT:    addl $28, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pushl %esi
@@ -4469,6 +5337,17 @@ define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
 ; AVX512-NEXT:    vpmovsxbw (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_16i8_to_16i16:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psraw $8, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psraw $8, %xmm1
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -4486,8 +5365,8 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $16, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
@@ -4496,8 +5375,8 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSSE3-NEXT:    psrad $16, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
@@ -4512,6 +5391,17 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
 ; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_2i16_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -4548,6 +5438,14 @@ define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
 ; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_4i16_to_4i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -4613,6 +5511,33 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
 ; AVX512-NEXT:    vpmovsxwq (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_4i16_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movswl 2(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE2-NEXT:    movswl (%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm0
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT:    movswl 6(%eax), %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm2
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movd %ecx, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE2-NEXT:    movswl 4(%eax), %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm1
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm3
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -4669,6 +5594,17 @@ define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
 ; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_8i16_to_8i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $16, %xmm0
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $16, %xmm1
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -4685,16 +5621,16 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
 ; SSE2-LABEL: load_sext_2i32_to_2i64:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_sext_2i32_to_2i64:
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -4708,6 +5644,15 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
 ; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_2i32_to_2i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -4723,24 +5668,24 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
 ; SSE2-LABEL: load_sext_4i32_to_4i64:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_sext_4i32_to_4i64:
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movdqa (%rdi), %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -4767,6 +5712,19 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
 ; AVX512-NEXT:    vpmovsxdq (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: load_sext_4i32_to_4i64:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movdqa (%eax), %xmm0
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -4806,6 +5764,17 @@ define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_2i8_to_i32:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    pushl %eax
+; X32-SSE2-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    psraw $8, %xmm0
+; X32-SSE2-NEXT:    movd %xmm0, %eax
+; X32-SSE2-NEXT:    popl %ecx
+; X32-SSE2-NEXT:    .cfi_def_cfa_offset 4
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_2i8_to_i32:
 ; X32-SSE41:       # %bb.0: # %entry
 ; X32-SSE41-NEXT:    pushl %eax
@@ -4827,12 +5796,12 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pslld $31, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    retq
 ;
@@ -4840,12 +5809,12 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    pslld $31, %xmm0
 ; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -4883,6 +5852,19 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_4i1_to_4i64:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pslld $31, %xmm0
+; X32-SSE2-NEXT:    psrad $31, %xmm0
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_4i1_to_4i64:
 ; X32-SSE41:       # %bb.0:
 ; X32-SSE41-NEXT:    pslld $31, %xmm0
@@ -4901,12 +5883,12 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pslld $24, %xmm0
 ; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    retq
 ;
@@ -4914,12 +5896,12 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    pslld $24, %xmm0
 ; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -4957,6 +5939,19 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
 ; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_4i8_to_4i64:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pslld $24, %xmm0
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X32-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_4i8_to_4i64:
 ; X32-SSE41:       # %bb.0:
 ; X32-SSE41-NEXT:    pslld $24, %xmm0
@@ -5008,10 +6003,10 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
 ; AVX512F-LABEL: sext_32xi1_to_32xi8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -5023,6 +6018,24 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_32xi1_to_32xi8:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    movl %esp, %ebp
+; X32-SSE2-NEXT:    andl $-16, %esp
+; X32-SSE2-NEXT:    subl $16, %esp
+; X32-SSE2-NEXT:    movdqa 8(%ebp), %xmm3
+; X32-SSE2-NEXT:    pcmpeqw 40(%ebp), %xmm1
+; X32-SSE2-NEXT:    pcmpeqw 24(%ebp), %xmm0
+; X32-SSE2-NEXT:    packsswb %xmm1, %xmm0
+; X32-SSE2-NEXT:    pcmpeqw 72(%ebp), %xmm3
+; X32-SSE2-NEXT:    pcmpeqw 56(%ebp), %xmm2
+; X32-SSE2-NEXT:    packsswb %xmm3, %xmm2
+; X32-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE2-NEXT:    movl %ebp, %esp
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_32xi1_to_32xi8:
 ; X32-SSE41:       # %bb.0:
 ; X32-SSE41-NEXT:    pushl %ebp
@@ -5080,6 +6093,18 @@ define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
 ; AVX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; X32-SSE2-LABEL: sext_2i8_to_2i32:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzwl (%eax), %eax
+; X32-SSE2-NEXT:    movd %eax, %xmm0
+; X32-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-SSE2-NEXT:    psrad $24, %xmm0
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-SSE2-NEXT:    paddq %xmm0, %xmm0
+; X32-SSE2-NEXT:    retl
+;
 ; X32-SSE41-LABEL: sext_2i8_to_2i32:
 ; X32-SSE41:       # %bb.0:
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index 584a54e68e8673cc9179f35a00fffa69a0054ddc..8b309827752a3ca17797b972714ebe9c01a1c4e2 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -67,10 +67,10 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX2-LABEL: var_shift_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: var_shift_v2i64:
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 6d79996164f2a7425fe0058a2b24014928df4311..7c3387669301950c14f234e472610cd88980b40d 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -46,10 +46,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX2-LABEL: var_shift_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: var_shift_v4i64:
@@ -67,10 +67,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; XOPAVX2-LABEL: var_shift_v4i64:
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm3
-; XOPAVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
-; XOPAVX2-NEXT:    vpsubq %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: var_shift_v4i64:
@@ -115,10 +115,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; X32-AVX2-LABEL: var_shift_v4i64:
 ; X32-AVX2:       # %bb.0:
 ; X32-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
-; X32-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm3
-; X32-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
 ; X32-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vpsubq %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    retl
   %shift = ashr <4 x i64> %a, %b
   ret <4 x i64> %shift
@@ -1086,10 +1086,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
 ; X32-AVX2:       # %bb.0:
 ; X32-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
 ; X32-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
-; X32-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm3
-; X32-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
 ; X32-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vpsubq %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    retl
   %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
   ret <4 x i64> %shift
diff --git a/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll b/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e910c9c74dac8f4973a0062f03048f1fd57dbbc0
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll
@@ -0,0 +1,2505 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
+
+;
+; Variable Shifts
+;
+
+define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrad %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad %xmm4, %xmm2
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrad %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrad %xmm1, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i32> %a, %b
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psraw $8, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v4i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $8, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psraw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i16> %a, %b
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psraw $8, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v2i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $8, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psraw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i16> %a, %b
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    psraw $4, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    psraw $2, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    psraw $1, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $4, %xmm4
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $2, %xmm4
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $1, %xmm4
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm2, %xmm6
+; X32-SSE-NEXT:    psraw $4, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm2, %xmm6
+; X32-SSE-NEXT:    psraw $2, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
+; X32-SSE-NEXT:    pandn %xmm2, %xmm4
+; X32-SSE-NEXT:    psraw $1, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    psraw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    psraw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <8 x i8> %a, %b
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    psraw $4, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    psraw $2, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    psraw $1, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $4, %xmm4
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $2, %xmm4
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $1, %xmm4
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm2, %xmm6
+; X32-SSE-NEXT:    psraw $4, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm2, %xmm6
+; X32-SSE-NEXT:    psraw $2, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
+; X32-SSE-NEXT:    pandn %xmm2, %xmm4
+; X32-SSE-NEXT:    psraw $1, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    psraw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    psraw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i8> %a, %b
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    psraw $4, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    psraw $2, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    psraw $1, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pandn %xmm0, %xmm5
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $4, %xmm4
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $2, %xmm4
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psraw $1, %xmm4
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm2, %xmm6
+; X32-SSE-NEXT:    psraw $4, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
+; X32-SSE-NEXT:    pandn %xmm2, %xmm6
+; X32-SSE-NEXT:    psraw $2, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm6, %xmm2
+; X32-SSE-NEXT:    paddw %xmm4, %xmm4
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
+; X32-SSE-NEXT:    pandn %xmm2, %xmm4
+; X32-SSE-NEXT:    psraw $1, %xmm2
+; X32-SSE-NEXT:    pand %xmm5, %xmm2
+; X32-SSE-NEXT:    por %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    psraw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
+; X32-SSE-NEXT:    pandn %xmm0, %xmm5
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    psraw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i8> %a, %b
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    psrad %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    xorps %xmm2, %xmm2
+; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X32-SSE-NEXT:    psrad %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
+  %shift = ashr <2 x i32> %a, %splat
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psraw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    psraw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psraw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
+  %shift = ashr <4 x i16> %a, %splat
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psraw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    psraw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psraw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
+  %shift = ashr <2 x i16> %a, %splat
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    psubb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    psubb %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v8i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v8i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    pxor %xmm2, %xmm0
+; X32-SSE-NEXT:    psubb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
+  %shift = ashr <8 x i8> %a, %splat
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    psubb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    psubb %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    pxor %xmm2, %xmm0
+; X32-SSE-NEXT:    psubb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
+  %shift = ashr <4 x i8> %a, %splat
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    psubb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    psubb %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    pxor %xmm2, %xmm0
+; X32-SSE-NEXT:    psubb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
+  %shift = ashr <2 x i8> %a, %splat
+  ret <2 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $4, %xmm1
+; SSE2-NEXT:    psrad $5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $5, %xmm1
+; SSE41-NEXT:    psrad $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshad {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad $4, %xmm1
+; X32-SSE-NEXT:    psrad $5, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i32> %a, <i32 4, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psraw $2, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    psraw $1, %xmm1
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,1,2,3,u,u,u,u>
+; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = <u,32768,16384,8192,u,u,u,u>
+; SSE41-NEXT:    pmulhw %xmm1, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    psraw $1, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
+; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm0, %xmm3, %xmm1
+; AVX1-NEXT:    vpcmpeqw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [844433520132096,844433520132096]
+; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm3
+; AVX2-NEXT:    vpblendvb %xmm1, %xmm0, %xmm3, %xmm1
+; AVX2-NEXT:    vpcmpeqw {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psraw $2, %xmm1
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    andps %xmm2, %xmm0
+; X32-SSE-NEXT:    psraw $1, %xmm1
+; X32-SSE-NEXT:    andnps %xmm1, %xmm2
+; X32-SSE-NEXT:    orps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psraw $3, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psraw $3, %xmm1
+; SSE41-NEXT:    psraw $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
+; AVX-NEXT:    vpsraw $2, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsraw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT:    vpsraw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsraw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT:    vpsraw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psraw $3, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i16> %a, <i16 2, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE-LABEL: constant_shift_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    psraw $8, %xmm0
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    psraw $8, %xmm0
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE-LABEL: constant_shift_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    psraw $8, %xmm0
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    psraw $8, %xmm0
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE-LABEL: constant_shift_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    psraw $8, %xmm0
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    psraw $8, %xmm0
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i8> %a, <i8 2, i8 3>
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrad $5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrad $5, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrad $5, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrad $5, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrad $5, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrad $5, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i32> %a, <i32 5, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psraw $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psraw $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psraw $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psraw $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i16> %a, <i16 3, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT:    pxor %xmm1, %xmm0
+; X32-SSE-NEXT:    psubb %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT:    pxor %xmm1, %xmm0
+; X32-SSE-NEXT:    psubb %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT:    pxor %xmm1, %xmm0
+; X32-SSE-NEXT:    psubb %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i8> %a, <i8 3, i8 3>
+  ret <2 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/test/CodeGen/X86/vector-shift-ashr-sub128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..732b9019049f70119d701252f02963131d2d5ac8
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -0,0 +1,2932 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
+
+;
+; Variable Shifts
+;
+
+define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm4, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT:    xorpd %xmm0, %xmm2
+; SSE2-NEXT:    psubq %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlq %xmm1, %xmm3
+; SSE41-NEXT:    psrlq %xmm0, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psrlq %xmm0, %xmm4
+; SSE41-NEXT:    psrlq %xmm1, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm3, %xmm2
+; SSE41-NEXT:    psubq %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; XOPAVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; XOPAVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $32, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    psrad $31, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    xorps %xmm5, %xmm5
+; X32-SSE-NEXT:    movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3]
+; X32-SSE-NEXT:    psrlq %xmm5, %xmm3
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    psrlq %xmm5, %xmm0
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm3, %xmm0
+; X32-SSE-NEXT:    psubq %xmm3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i32> %a, %b
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm6
+; SSE41-NEXT:    psrad %xmm4, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad %xmm1, %xmm2
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrad %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX1-NEXT:    vpslld $16, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX2-NEXT:    vpslld $16, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512VL-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $16, %xmm0
+; X32-SSE-NEXT:    psrad $16, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrad %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad %xmm4, %xmm2
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrad %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrad %xmm1, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i16> %a, %b
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $48, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm4, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT:    xorpd %xmm2, %xmm0
+; SSE2-NEXT:    psubq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $48, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlq %xmm1, %xmm3
+; SSE41-NEXT:    psrlq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psrlq %xmm2, %xmm4
+; SSE41-NEXT:    psrlq %xmm1, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    psubq %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOP-NEXT:    vpsllq $48, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $48, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512VL-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $48, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad $31, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X32-SSE-NEXT:    psrad $16, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X32-SSE-NEXT:    psubq %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i16> %a, %b
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psllw $8, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    paddw %xmm2, %xmm4
+; SSE41-NEXT:    psraw $15, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllw $8, %xmm0, %xmm0
+; XOP-NEXT:    vpsraw $8, %xmm0, %xmm0
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllw $8, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psraw $8, %xmm3
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    psraw $15, %xmm0
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    pand %xmm0, %xmm2
+; X32-SSE-NEXT:    pandn %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psraw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <8 x i8> %a, %b
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpslld $24, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $24, %xmm0
+; X32-SSE-NEXT:    psrad $24, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrad %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad %xmm4, %xmm2
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrad %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrad %xmm1, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i8> %a, %b
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $56, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm4, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT:    xorpd %xmm2, %xmm0
+; SSE2-NEXT:    psubq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $56, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlq %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm4, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlq %xmm4, %xmm3
+; SSE41-NEXT:    psrlq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    psubq %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $56, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $56, %zmm0, %zmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $56, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad $31, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X32-SSE-NEXT:    psrad $24, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X32-SSE-NEXT:    psubq %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i8> %a, %b
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm4, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psrlq %xmm0, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT:    xorpd %xmm1, %xmm2
+; SSE2-NEXT:    psubq %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlq %xmm0, %xmm3
+; SSE41-NEXT:    psrlq %xmm1, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psrlq %xmm1, %xmm4
+; SSE41-NEXT:    psrlq %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm3, %xmm2
+; SSE41-NEXT:    psubq %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
+; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $32, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    psrad $31, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    xorps %xmm4, %xmm4
+; X32-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm1
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm1, %xmm0
+; X32-SSE-NEXT:    psubq %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
+  %shift = ashr <2 x i32> %a, %splat
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm1, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad %xmm4, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm2, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm6
+; SSE41-NEXT:    psrad %xmm4, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad %xmm1, %xmm2
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrad %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpslld $16, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpslld $16, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $16, %xmm0
+; X32-SSE-NEXT:    psrad $16, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrad %xmm1, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad %xmm4, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrad %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrad %xmm2, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
+  %shift = ashr <4 x i16> %a, %splat
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $48, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm4, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT:    xorpd %xmm2, %xmm0
+; SSE2-NEXT:    psubq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $48, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlq %xmm1, %xmm3
+; SSE41-NEXT:    psrlq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psrlq %xmm2, %xmm4
+; SSE41-NEXT:    psrlq %xmm1, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    psubq %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v2i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v2i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $48, %zmm0, %zmm0
+; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512VL-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $48, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad $31, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X32-SSE-NEXT:    psrad $16, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X32-SSE-NEXT:    psubq %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
+  %shift = ashr <2 x i16> %a, %splat
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psllw $8, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    paddw %xmm2, %xmm4
+; SSE41-NEXT:    psraw $15, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllw $8, %xmm0, %xmm0
+; XOP-NEXT:    vpsraw $8, %xmm0, %xmm0
+; XOP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllw $8, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psraw $8, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    psraw $15, %xmm0
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    pand %xmm0, %xmm2
+; X32-SSE-NEXT:    pandn %xmm3, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psraw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
+  %shift = ashr <8 x i8> %a, %splat
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm1, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad %xmm4, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm2, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpslld $24, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $24, %xmm0
+; X32-SSE-NEXT:    psrad $24, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrad %xmm1, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad %xmm4, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrad %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrad %xmm2, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
+  %shift = ashr <4 x i8> %a, %splat
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $56, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm4, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrlq %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT:    xorpd %xmm2, %xmm0
+; SSE2-NEXT:    psubq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $56, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlq %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm4, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlq %xmm4, %xmm3
+; SSE41-NEXT:    psrlq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    psubq %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $56, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $56, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $56, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad $31, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X32-SSE-NEXT:    psrad $24, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X32-SSE-NEXT:    psubq %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
+  %shift = ashr <2 x i8> %a, %splat
+  ret <2 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlq $4, %xmm0
+; SSE2-NEXT:    psrlq $5, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd {{.*#+}} xmm0 = [576460752303423488,288230376151711744]
+; SSE2-NEXT:    xorpd %xmm0, %xmm1
+; SSE2-NEXT:    psubq %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq $5, %xmm0
+; SSE41-NEXT:    psrlq $4, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [576460752303423488,288230376151711744]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    psubq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrlq $5, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [576460752303423488,288230376151711744]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [576460752303423488,288230376151711744]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5]
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $32, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    psrad $31, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    psrlq $4, %xmm0
+; X32-SSE-NEXT:    psrlq $5, %xmm1
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X32-SSE-NEXT:    movapd {{.*#+}} xmm0 = [3.7857669957336791E-270,2.0522684006491881E-289]
+; X32-SSE-NEXT:    xorpd %xmm0, %xmm1
+; X32-SSE-NEXT:    psubq %xmm0, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i32> %a, <i32 4, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $19, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad $18, %xmm3
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE2-NEXT:    psrad $17, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $19, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad $17, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    psrad $18, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $19, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrad $18, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpslld $16, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshad {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpslld $16, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $16, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad $16, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad $19, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrad $18, %xmm3
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; X32-SSE-NEXT:    psrad $17, %xmm0
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $48, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $2, %xmm1
+; SSE2-NEXT:    psrlq $3, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
+; SSE2-NEXT:    xorpd %xmm1, %xmm0
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $48, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlq $3, %xmm1
+; SSE41-NEXT:    psrlq $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    psubq %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrlq $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $48, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $48, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3]
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $48, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad $31, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X32-SSE-NEXT:    psrad $16, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [3,0,2,0]
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X32-SSE-NEXT:    psubq %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i16> %a, <i16 2, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    psraw $12, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    psraw $1, %xmm1
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
+; SSE41-NEXT:    pmulhw %xmm1, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT:    psraw $9, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm1, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpsraw $9, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllw $8, %xmm0, %xmm0
+; XOP-NEXT:    vpsraw $8, %xmm0, %xmm0
+; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $8, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psraw $8, %xmm1
+; X32-SSE-NEXT:    psraw $12, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    movapd %xmm0, %xmm1
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; X32-SSE-NEXT:    psraw $2, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    andps %xmm2, %xmm0
+; X32-SSE-NEXT:    psraw $1, %xmm1
+; X32-SSE-NEXT:    andnps %xmm1, %xmm2
+; X32-SSE-NEXT:    orps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $27, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad $26, %xmm3
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE2-NEXT:    psrad $25, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $24, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $27, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad $25, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    psrad $26, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $27, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $25, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrad $26, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshad {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpslld $24, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $24, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad $24, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrad $27, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrad $26, %xmm3
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; X32-SSE-NEXT:    psrad $25, %xmm0
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $56, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $2, %xmm1
+; SSE2-NEXT:    psrlq $3, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
+; SSE2-NEXT:    xorpd %xmm1, %xmm0
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $56, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlq $3, %xmm1
+; SSE41-NEXT:    psrlq $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    psubq %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrlq $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $56, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $56, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3]
+; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsravq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $56, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad $31, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X32-SSE-NEXT:    psrad $24, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X32-SSE-NEXT:    psubq %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i8> %a, <i8 2, i8 3>
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE2-LABEL: splatconstant_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrad $5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrlq $5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatconstant_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrad $5, %xmm0
+; SSE41-NEXT:    psrlq $5, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatconstant_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $5, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsrad $5, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlq $5, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $37, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $37, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $32, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    psrad $31, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    psrad $5, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    psrlq $5, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i32> %a, <i32 5, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $19, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $19, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpslld $16, %xmm0, %xmm0
+; XOP-NEXT:    vpsrad $19, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $19, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrad $19, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $16, %xmm0
+; X32-SSE-NEXT:    psrad $19, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE2-LABEL: splatconstant_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $48, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrlq $3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatconstant_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $48, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $3, %xmm1
+; SSE41-NEXT:    psrlq $3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatconstant_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsrad $3, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $48, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $51, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $51, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $48, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad $31, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X32-SSE-NEXT:    psrad $16, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm2[0,1]
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm0[0,1]
+; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X32-SSE-NEXT:    psubq %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i16> %a, <i16 3, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $8, %xmm0
+; SSE-NEXT:    psraw $11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpsraw $11, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllw $8, %xmm0, %xmm0
+; XOP-NEXT:    vpsraw $11, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraw $11, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraw $11, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $8, %xmm0
+; X32-SSE-NEXT:    psraw $11, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $24, %xmm0
+; SSE-NEXT:    psrad $27, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $27, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpslld $24, %xmm0, %xmm0
+; XOP-NEXT:    vpsrad $27, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $27, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrad $27, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $24, %xmm0
+; X32-SSE-NEXT:    psrad $27, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE2-LABEL: splatconstant_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $56, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrlq $3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatconstant_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psllq $56, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $3, %xmm1
+; SSE41-NEXT:    psrlq $3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatconstant_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsrad $3, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $56, %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $59, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $59, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $56, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrad $31, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X32-SSE-NEXT:    psrad $24, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm2
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
+; X32-SSE-NEXT:    psubq %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = ashr <2 x i8> %a, <i8 3, i8 3>
+  ret <2 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll b/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7a59f80f55e7b9173f6565d28d85f28ab39b971f
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll
@@ -0,0 +1,2163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
+
+;
+; Variable Shifts
+;
+
+define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrld %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrld %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrld %xmm4, %xmm2
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrld %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrld %xmm1, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i32> %a, %b
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrlw $8, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v4i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i16> %a, %b
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrlw $8, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v2i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i16> %a, %b
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlw $4, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $2, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <8 x i8> %a, %b
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlw $4, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $2, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i8> %a, %b
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlw $4, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $2, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i8> %a, %b
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    psrld %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    xorps %xmm2, %xmm2
+; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X32-SSE-NEXT:    psrld %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
+  %shift = lshr <2 x i32> %a, %splat
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
+  %shift = lshr <4 x i16> %a, %splat
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
+  %shift = lshr <2 x i16> %a, %splat
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v8i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v8i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
+  %shift = lshr <8 x i8> %a, %splat
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
+  %shift = lshr <4 x i8> %a, %splat
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psrlw %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psrlw %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psrlw %xmm1, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
+  %shift = lshr <2 x i8> %a, %splat
+  ret <2 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $4, %xmm1
+; SSE2-NEXT:    psrld $5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $5, %xmm1
+; SSE41-NEXT:    psrld $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld $4, %xmm1
+; X32-SSE-NEXT:    psrld $5, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i32> %a, <i32 4, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
+; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,u,u,u,u>
+; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
+; X32-SSE-NEXT:    pmulhuw %xmm0, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpeqw {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $3, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $3, %xmm1
+; SSE41-NEXT:    psrlw $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm1
+; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsrlw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $3, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i16> %a, <i16 2, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    packuswb %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i8> %a, <i8 2, i8 3>
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrld $5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrld $5, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $5, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrld $5, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrld $5, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i32> %a, <i32 5, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i16> %a, <i16 3, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i8> %a, <i8 3, i8 3>
+  ret <2 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/test/CodeGen/X86/vector-shift-lshr-sub128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b321ca9af74aacd04870fcee15b128f9add67bfb
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -0,0 +1,2340 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
+
+;
+; Variable Shifts
+;
+
+define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    xorps %xmm3, %xmm3
+; X32-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
+; X32-SSE-NEXT:    psrlq %xmm3, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i32> %a, %b
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrld %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm6
+; SSE41-NEXT:    psrld %xmm4, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrld %xmm1, %xmm2
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrld %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrld %xmm4, %xmm2
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrld %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrld %xmm1, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i16> %a, %b
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i16> %a, %b
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v8i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v8i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; X32-SSE-NEXT:    pand %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    psraw $15, %xmm0
+; X32-SSE-NEXT:    pandn %xmm2, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <8 x i8> %a, %b
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrld %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; XOPAVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrld %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrld %xmm4, %xmm2
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrld %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrld %xmm1, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i8> %a, %b
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; XOPAVX1-NEXT:    # xmm2 = mem[0,0]
+; XOPAVX1-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vandpd %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i8> %a, %b
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm2, %xmm3
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X32-SSE-NEXT:    psrlq %xmm2, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
+  %shift = lshr <2 x i32> %a, %splat
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm1, %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld %xmm4, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm2, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm6
+; SSE41-NEXT:    psrld %xmm4, %xmm6
+; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrld %xmm1, %xmm2
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrld %xmm1, %xmm2
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld %xmm4, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrld %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrld %xmm2, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
+  %shift = lshr <4 x i16> %a, %splat
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v2i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v2i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
+  %shift = lshr <2 x i16> %a, %splat
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v8i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v8i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BWVL-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; X32-SSE-NEXT:    pand %xmm0, %xmm2
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    psllw $12, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    psraw $15, %xmm0
+; X32-SSE-NEXT:    pandn %xmm2, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    psraw $15, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pandn %xmm0, %xmm3
+; X32-SSE-NEXT:    psrlw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm3, %xmm0
+; X32-SSE-NEXT:    paddw %xmm1, %xmm1
+; X32-SSE-NEXT:    psraw $15, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlw $1, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
+  %shift = lshr <8 x i8> %a, %splat
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm1, %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld %xmm4, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm2, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; XOPAVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; XOPAVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrld %xmm1, %xmm2
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld %xmm4, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psrld %xmm3, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; X32-SSE-NEXT:    psrld %xmm2, %xmm0
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
+  %shift = lshr <4 x i8> %a, %splat
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psrlq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psrlq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v2i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; XOPAVX1-NEXT:    # xmm2 = mem[0,0]
+; XOPAVX1-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; XOPAVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v2i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
+  %shift = lshr <2 x i8> %a, %splat
+  ret <2 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $4, %xmm1
+; SSE2-NEXT:    psrlq $5, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq $5, %xmm0
+; SSE41-NEXT:    psrlq $4, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrlq $5, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512VL-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq $4, %xmm1
+; X32-SSE-NEXT:    psrlq $5, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i32> %a, <i32 4, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $3, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld $2, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    psrld $3, %xmm1
+; SSE41-NEXT:    psrld $1, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld $3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrld $2, %xmm2
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld $1, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $2, %xmm1
+; SSE2-NEXT:    psrlq $3, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq $3, %xmm0
+; SSE41-NEXT:    psrlq $2, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX1-NEXT:    vpsrlq $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512VL-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    psrlq {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i16> %a, <i16 2, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    pmulhuw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512>
+; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pandn %xmm0, %xmm2
+; X32-SSE-NEXT:    pmulhuw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $3, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld $2, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $2, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    psrld $3, %xmm0
+; SSE41-NEXT:    psrld $1, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld $3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrld $2, %xmm2
+; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrld $1, %xmm1
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $2, %xmm1
+; SSE2-NEXT:    psrlq $3, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlq $3, %xmm1
+; SSE41-NEXT:    psrlq $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i8> %a, <i8 2, i8 3>
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE2-LABEL: splatconstant_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlq $5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatconstant_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    psrlq $5, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatconstant_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrlq $5, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsrlq $5, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatconstant_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; XOPAVX1-NEXT:    vpsrlq $5, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatconstant_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; XOPAVX2-NEXT:    vpsrlq $5, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512-NEXT:    vpsrlq $5, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512VL-NEXT:    vpsrlq $5, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlq $5, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i32> %a, <i32 5, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE2-LABEL: splatconstant_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrld $3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatconstant_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT:    psrld $3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT:    vpsrld $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; XOP-NEXT:    vpsrld $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT:    vpsrld $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512VL-NEXT:    vpsrld $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrld $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE2-LABEL: splatconstant_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlq $3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatconstant_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT:    psrlq $3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; XOP-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512VL-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlq {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm0[0,1]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i16> %a, <i16 3, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psrlw $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrlw $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psrld $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpsrld $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrld $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    psrld $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    psrlq $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = lshr <2 x i8> %a, <i8 3, i8 3>
+  ret <2 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index a26fccd44c8e144ee429ac41ae40e3e5588cb80b..6088c965c52ff6a87e0b5d4a61ff89c18c409e72 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -180,24 +180,22 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; SSE41-LABEL: var_shift_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE41-NEXT:    pslld $23, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm3, %xmm1
 ; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT:    pslld $23, %xmm3
-; SSE41-NEXT:    paddd %xmm2, %xmm3
-; SSE41-NEXT:    cvttps2dq %xmm3, %xmm2
+; SSE41-NEXT:    pslld $23, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
 ; SSE41-NEXT:    packusdw %xmm1, %xmm2
 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
@@ -861,28 +859,22 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT:    pmullw %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm3, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pand %xmm2, %xmm0
 ; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
@@ -893,10 +885,8 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ;
 ; AVX1-LABEL: constant_shift_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -907,13 +897,11 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -958,19 +946,15 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ;
 ; X32-SSE-LABEL: constant_shift_v16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; X32-SSE-NEXT:    pmullw %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
-; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
 ; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index b50836b56549a9f39e733d3701ef2a2a75e38c75..fe6109d8e86eb248d9b4d09e90bf9b53f49ecd93 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -933,26 +933,25 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
 define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX1-LABEL: constant_shift_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT:    vpmullw %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: constant_shift_v32i8:
@@ -1034,26 +1033,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ;
 ; X32-AVX1-LABEL: constant_shift_v32i8:
 ; X32-AVX1:       # %bb.0:
-; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
-; X32-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; X32-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
+; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1]
+; X32-AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
 ; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; X32-AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; X32-AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
-; X32-AVX1-NEXT:    vpmullw %xmm5, %xmm2, %xmm2
 ; X32-AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; X32-AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
-; X32-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
+; X32-AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; X32-AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
 ; X32-AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; X32-AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X32-AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
 ; X32-AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; X32-AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; X32-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX1-NEXT:    retl
 ;
 ; X32-AVX2-LABEL: constant_shift_v32i8:
diff --git a/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll b/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1323f66bdc44a396637f2670374e1e95d8420408
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll
@@ -0,0 +1,1944 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
+
+;
+; Variable Shifts
+;
+
+define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i32> %a, %b
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    pslld $23, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm4, %xmm3
+; SSE2-NEXT:    cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm3, %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pslld $23, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT:    packusdw %xmm1, %xmm2
+; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v4i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE-NEXT:    pslld $23, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT:    paddd %xmm4, %xmm3
+; X32-SSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd %xmm4, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i16> %a, %b
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    pslld $23, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm4, %xmm3
+; SSE2-NEXT:    cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm3, %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pslld $23, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT:    packusdw %xmm1, %xmm2
+; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v2i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE-NEXT:    pslld $23, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT:    paddd %xmm4, %xmm3
+; X32-SSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd %xmm4, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i16> %a, %b
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psllw $2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psllw $4, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psllw $2, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddb %xmm2, %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psllw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm0, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <8 x i8> %a, %b
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psllw $2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psllw $4, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psllw $2, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddb %xmm2, %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psllw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm0, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i8> %a, %b
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllw $5, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    psllw $2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllw $5, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psllw $4, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psllw $2, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddb %xmm2, %xmm3
+; SSE41-NEXT:    paddb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: var_shift_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $5, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psllw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pandn %xmm0, %xmm4
+; X32-SSE-NEXT:    psllw $2, %xmm0
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    paddb %xmm1, %xmm1
+; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X32-SSE-NEXT:    pandn %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm0, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i8> %a, %b
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    pslld %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    pslld %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    xorps %xmm2, %xmm2
+; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X32-SSE-NEXT:    pslld %xmm2, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
+  %shift = shl <2 x i32> %a, %splat
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    psllw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
+  %shift = shl <4 x i16> %a, %splat
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    psllw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
+  %shift = shl <2 x i16> %a, %splat
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psllw %xmm1, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psllw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psllw %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v8i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v8i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psllw %xmm1, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
+  %shift = shl <8 x i8> %a, %splat
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psllw %xmm1, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psllw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psllw %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psllw %xmm1, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
+  %shift = shl <4 x i8> %a, %splat
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psllw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psllw %xmm1, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psllw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    psllw %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psllw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT:    psllw %xmm1, %xmm2
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
+  %shift = shl <2 x i8> %a, %splat
+  ret <2 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pslld $4, %xmm1
+; SSE2-NEXT:    pslld $5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pslld $5, %xmm1
+; SSE41-NEXT:    pslld $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $5, %xmm0, %xmm1
+; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    pslld $4, %xmm1
+; X32-SSE-NEXT:    pslld $5, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i32> %a, <i32 4, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE-LABEL: constant_shift_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllw $3, %xmm1
+; SSE41-NEXT:    psllw $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $3, %xmm0, %xmm1
+; AVX-NEXT:    vpsllw $2, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsllw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT:    vpsllw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsllw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT:    vpsllw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i16> %a, <i16 2, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    packuswb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    packuswb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    packuswb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i8> %a, <i8 2, i8 3>
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $5, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpslld $5, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $5, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $5, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $5, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i32> %a, <i32 5, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllw $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllw $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i16> %a, <i16 3, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $3, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i8> %a, <i8 3, i8 3>
+  ret <2 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-shl-sub128.ll b/test/CodeGen/X86/vector-shift-shl-sub128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d3f06317e103eefef434615db213af18733feea0
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -0,0 +1,1715 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
+
+;
+; Variable Shifts
+;
+
+define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psllq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT:    psllq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    xorps %xmm3, %xmm3
+; X32-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
+; X32-SSE-NEXT:    psllq %xmm3, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i32> %a, %b
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i16> %a, %b
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psllq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT:    psllq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i16> %a, %b
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    pslld $23, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm4, %xmm3
+; SSE2-NEXT:    cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm3, %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT:    pslld $23, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT:    packusdw %xmm1, %xmm2
+; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE-NEXT:    pslld $23, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT:    paddd %xmm4, %xmm3
+; X32-SSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd %xmm4, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <8 x i8> %a, %b
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i8> %a, %b
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psllq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: var_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psllq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: var_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: var_shift_v2i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_shift_v2i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: var_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i8> %a, %b
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psllq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT:    psllq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psllq %xmm2, %xmm3
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X32-SSE-NEXT:    psllq %xmm2, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
+  %shift = shl <2 x i32> %a, %splat
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
+  %shift = shl <4 x i16> %a, %splat
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psllq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT:    psllq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v2i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v2i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
+  %shift = shl <2 x i16> %a, %splat
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    pslld $23, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm4, %xmm3
+; SSE2-NEXT:    cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT:    paddd %xmm3, %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pslld $23, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT:    packusdw %xmm1, %xmm2
+; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: splatvar_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: splatvar_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: splatvar_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BWVL-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-SSE-NEXT:    pslld $23, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT:    paddd %xmm4, %xmm3
+; X32-SSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd %xmm4, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
+  %shift = shl <8 x i8> %a, %splat
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; SSE41-NEXT:    pslld $23, %xmm1
+; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; XOPAVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pslld $23, %xmm1
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
+  %shift = shl <4 x i8> %a, %splat
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psllq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    psllq %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: splatvar_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psllq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    psllq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: splatvar_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: splatvar_shift_v2i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v2i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: splatvar_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatvar_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatvar_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
+  %shift = shl <2 x i8> %a, %splat
+  ret <2 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllq $4, %xmm1
+; SSE2-NEXT:    psllq $5, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $5, %xmm1
+; SSE41-NEXT:    psllq $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $5, %xmm0, %xmm1
+; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i32:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i32:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllq $4, %xmm1
+; X32-SSE-NEXT:    psllq $5, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i32> %a, <i32 4, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v4i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v4i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllq $2, %xmm1
+; SSE2-NEXT:    psllq $3, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $3, %xmm1
+; SSE41-NEXT:    psllq $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsllq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psllq {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    psllq {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i16> %a, <i16 2, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE-LABEL: constant_shift_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v8i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v4i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v4i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE2-LABEL: constant_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllq $2, %xmm1
+; SSE2-NEXT:    psllq $3, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $3, %xmm1
+; SSE41-NEXT:    psllq $2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsllq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v2i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: constant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: constant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i8> %a, <i8 2, i8 3>
+  ret <2 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllq $5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllq $5, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $5, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $5, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $5, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq $5, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i32> %a, <i32 5, i32 5>
+  ret <2 x i32> %shift
+}
+
+define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpslld $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %shift
+}
+
+define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllq $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllq $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllq {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm0[0,1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i16> %a, <i16 3, i16 3>
+  ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllw $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v8i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllw $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v8i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    psllw $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %shift
+}
+
+define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpslld $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v4i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpslld $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpslld $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v4i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pslld $3, %xmm0
+; X32-SSE-NEXT:    retl
+  %shift = shl <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
+  ret <4 x i8> %shift
+}
+
+define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
+; SSE-LABEL: splatconstant_shift_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllq $3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: splatconstant_shift_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllq $3, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: splatconstant_shift_v2i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsllq $3, %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512-LABEL: splatconstant_shift_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $3, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpsllq $3, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; X32-SSE-LABEL: splatconstant_shift_v2i8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    psllq %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm1, %xmm0
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    retl
+  %shift = shl <2 x i8> %a, <i8 3, i8 3>
+  ret <2 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index bf34c0332dd5fd7a16ef3a60c8309eb36edbeb43..d9790aa9a75b99a86c5efd83f3fa40ea16f6a374 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -579,6 +579,68 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
   ret <16 x i8> %shuffle
 }
 
+; PR39387
+define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,255]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,65535]
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,5,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
+; SSE2-NEXT:    packuswb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX1OR2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
+; AVX512VLVBMI:       # %bb.0:
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,6,7,8,9,10,27,28,29,30,30,1,1,2,3,4]
+; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT:    retq
+  %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 30, i32 1, i32 1, i32 2, i32 3, i32 4>
+  ret <16 x i8> %1
+}
+
 ; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780
 
 define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) {
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 0e4d5dcd3865e3e04143ad519083403e33d4f18c..149fe3d9dcb983fbd1b99d238471f4335bc82459 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -403,9 +403,9 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; AVX2OR512VL-LABEL: shuffle_v4i32_0142:
 ; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
-; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2OR512VL-NEXT:    vbroadcastss %xmm1, %xmm1
+; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX2OR512VL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
 ; AVX2OR512VL-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
   ret <4 x i32> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index c4759ab54f531dc463fa0629fa0174fd3f1d0d61..ba0e2086f4888d8d07e176b9a2d592290180cdc8 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -913,9 +913,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,8,8,8,8,8,8,0,8,8,8,8,8,8,8,8]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -953,9 +952,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,0,7,7,7,7,7,7,7,7,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -993,9 +991,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,0,6,6,6,6,6,6,6,6,6,6]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1033,9 +1030,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,0,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1073,9 +1069,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,0,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1113,9 +1108,8 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1153,9 +1147,8 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1192,12 +1185,9 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    movl $128, %eax
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1700,17 +1690,10 @@ define <32 x i8> @load_fold_pblendvb_commute(<32 x i8>* %px, <32 x i8> %y) {
 }
 
 define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
-; AVX1OR2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
-; AVX512VL-NEXT:    kmovd %eax, %k1
-; AVX512VL-NEXT:    vmovdqu8 %ymm0, %ymm0 {%k1} {z}
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; ALL-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
   ret <32 x i8> %shuffle
 }
@@ -2791,9 +2774,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
 ; AVX512VLBW-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512VLBW-NEXT:    movl $286331153, %eax # imm = 0x11111111
-; AVX512VLBW-NEXT:    kmovd %eax, %k1
-; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u,u,u]
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8],zero,zero,zero,ymm0[9],zero,zero,zero,ymm0[10],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[28],zero,zero,zero,ymm0[29],zero,zero,zero,ymm0[30],zero,zero,zero,ymm0[31],zero,zero,zero
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index e35a0e99efe4a1d8fa676a3c96cb2b4131e519e9..b7ea04d23df96502840550457f88b475cc3a9060 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -582,7 +582,7 @@ define <16 x i32> @test_vshufi32x4_512(<16 x i32> %x, <16 x i32> %x1) nounwind {
 define <16 x float> @test_vshuff32x4_512_mask(<16 x float> %x, <16 x float> %x1, <16 x float> %y, <16 x i1> %mask) nounwind {
 ; AVX512F-LABEL: test_vshuff32x4_512_mask:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
 ; AVX512F-NEXT:    vpmovd2m %zmm3, %k1
 ; AVX512F-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
@@ -604,7 +604,7 @@ define <16 x float> @test_vshuff32x4_512_mask(<16 x float> %x, <16 x float> %x1,
 define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x i32> %y, <16 x i1> %mask) nounwind {
 ; AVX512F-LABEL: test_vshufi32x4_512_mask:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
 ; AVX512F-NEXT:    vpmovd2m %zmm3, %k1
 ; AVX512F-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 0786cabdc0052f044e8a13faa4238c9c5e984e4a..741861bc6c6c53958c3515349731585dc50fd2a3 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -574,3 +574,45 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_
   %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
   ret <64 x i8> %5
 }
+
+define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512VBMI-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512VBMI-NEXT:    retq
+  %1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %2 = lshr <32 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %3 = bitcast <32 x i16> %1 to <64 x i8>
+  %4 = bitcast <32 x i16> %2 to <64 x i8>
+  %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
+  ret <64 x i8> %5
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 97b89a0be37ba9abdaab680fe288493cad321f41..dce7f02605207a63f54fb2d48268802ce50ac3c1 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1845,7 +1845,7 @@ define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) noun
 define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
 ; ALL-LABEL: test_vshuff64x2_512_maskz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxwq %xmm2, %zmm2
+; ALL-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
 ; ALL-NEXT:    vpsllq $63, %zmm2, %zmm2
 ; ALL-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; ALL-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[2,3,0,1]
@@ -1858,7 +1858,7 @@ define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1
 define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
 ; ALL-LABEL: test_vshufi64x2_512_mask:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxwq %xmm2, %zmm2
+; ALL-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
 ; ALL-NEXT:    vpsllq $63, %zmm2, %zmm2
 ; ALL-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; ALL-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],zmm1[2,3,0,1]
@@ -1887,7 +1887,7 @@ define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr
 define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
 ; AVX512F-LABEL: test_vshuff64x2_512_mem_mask:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; AVX512F-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
@@ -1895,7 +1895,7 @@ define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double>
 ;
 ; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask:
 ; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmovsxwq %xmm1, %zmm1
+; AVX512F-32-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX512F-32-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; AVX512F-32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1910,7 +1910,7 @@ define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double>
 define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
 ; AVX512F-LABEL: test_vshuff64x2_512_mem_maskz:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; AVX512F-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
@@ -1918,7 +1918,7 @@ define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double>
 ;
 ; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz:
 ; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmovsxwq %xmm1, %zmm1
+; AVX512F-32-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX512F-32-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; AVX512F-32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 678feb8b3307e93b889f0490d9b5745f35ae51b2..03fe8c49b4ffc017d08ba6649abcf7bf53a3344b 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -440,9 +440,8 @@ define void @PR39483() {
 ; X32-AVX1-LABEL: PR39483:
 ; X32-AVX1:       # %bb.0: # %entry
 ; X32-AVX1-NEXT:    vmovups 32, %ymm0
-; X32-AVX1-NEXT:    vmovups 64, %ymm1
-; X32-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; X32-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3]
+; X32-AVX1-NEXT:    vmovups 64, %xmm1
+; X32-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,3]
 ; X32-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; X32-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
@@ -484,9 +483,8 @@ define void @PR39483() {
 ; X64-AVX1-LABEL: PR39483:
 ; X64-AVX1:       # %bb.0: # %entry
 ; X64-AVX1-NEXT:    vmovups 32, %ymm0
-; X64-AVX1-NEXT:    vmovups 64, %ymm1
-; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3]
+; X64-AVX1-NEXT:    vmovups 64, %xmm1
+; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,3]
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 95d53ace5d3222712f684035dea3ee4d0037296d..2ea0f1ab3e7188e1c7a01358c54c93b61d702b33 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -313,7 +313,7 @@ define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %
 define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
 ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
@@ -324,7 +324,7 @@ define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a
 define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
 ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 64dd50639d3179d94f68fc5b9629d28109f9ec59..b8d3824f1caf56e889b85d4011c8f804bb121a8a 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -124,21 +124,21 @@ define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x dou
 define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
 ; X32-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X32-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X32-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT:    vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X32-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X64-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-NEXT:    vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X64-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
   %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
@@ -205,21 +205,21 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1)
 define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
 ; X32-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X32-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X64-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
@@ -241,20 +241,20 @@ define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x f
 define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X32-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT:    vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X32-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT:    vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X64-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
   %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
@@ -264,14 +264,14 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1
 define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
 ; X32-LABEL: combine_vpermt2var_16f32_vmovddup:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
   ret <16 x float> %res0
@@ -280,18 +280,16 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <
 ; X32-LABEL: combine_vpermt2var_16f32_vmovddup_load:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %zmm2
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
-; X32-NEXT:    vmovaps %zmm1, %zmm0
+; X32-NEXT:    vmovaps (%eax), %zmm1
+; X32-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %zmm2
-; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X64-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
-; X64-NEXT:    vmovaps %zmm1, %zmm0
+; X64-NEXT:    vmovaps (%rdi), %zmm1
+; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    retq
   %x0 = load <16 x float>, <16 x float> *%p0
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
@@ -300,16 +298,16 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <
 define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
+; X32-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
+; X64-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
   ret <16 x float> %res0
@@ -318,20 +316,18 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%
 ; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %zmm2
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT:    vmovaps (%eax), %zmm1
+; X32-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
-; X32-NEXT:    vmovaps %zmm1, %zmm0
+; X32-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %zmm2
-; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT:    vmovaps (%rdi), %zmm1
+; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
-; X64-NEXT:    vmovaps %zmm1, %zmm0
+; X64-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %x0 = load <16 x float>, <16 x float> *%p0
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
@@ -519,20 +515,20 @@ define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32>
 define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT:    vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X32-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT:    vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X64-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
@@ -554,20 +550,20 @@ define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16>
 define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) {
 ; X32-LABEL: combine_vpermt2var_32i16_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X32-NEXT:    vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X32-NEXT:    vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X32-NEXT:    vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
+; X64-NEXT:    vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X64-NEXT:    vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
+; X64-NEXT:    vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
   %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 0d56a24c5ff3cdd4adeba19727cb977f0c3adaff..c4ab922d068ceacd01fc555675f44cb4606a1c1b 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -20,20 +20,20 @@ define <16 x i16> @combine_vpermt2var_16i16_identity(<16 x i16> %x0, <16 x i16>
 define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x i16> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16i16_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
+; X32-NEXT:    vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z}
 ; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT:    vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
+; X32-NEXT:    vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
+; X64-NEXT:    vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z}
 ; X64-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT:    vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
+; X64-NEXT:    vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 %m)
   %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 30, i16 13, i16 28, i16 11, i16 26, i16 9, i16 24, i16 7, i16 22, i16 5, i16 20, i16 3, i16 18, i16 1, i16 16>, <16 x i16> %res0, <16 x i16> %res0, i16 %m)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index a9054102460c0351b7ba20a115fe7a6d204c1002..5350ddab5ef7dc126acf7b7da56275208e92af3b 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -33,20 +33,20 @@ define <16 x i8> @combine_vpermt2var_16i8_identity(<16 x i8> %x0, <16 x i8> %x1)
 define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16i8_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
+; X32-NEXT:    vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z}
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT:    vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
+; X32-NEXT:    vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
+; X64-NEXT:    vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z}
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT:    vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
+; X64-NEXT:    vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 %m)
   %res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 30, i8 13, i8 28, i8 11, i8 26, i8 9, i8 24, i8 7, i8 22, i8 5, i8 20, i8 3, i8 18, i8 1, i8 16>, <16 x i8> %res0, <16 x i8> %res0, i16 %m)
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 9c7163f39da60f44d524bc6f319495e15f5a79e5..199a05eb2b8fcfd50b562ae7de514ea4ee0803e0 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -802,7 +802,7 @@ define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
 ;
 ; AVX2-LABEL: combine_nested_undef_test12:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
@@ -2823,3 +2823,22 @@ define <4 x float> @PR30264(<4 x float> %x) {
   %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
   ret <4 x float> %shuf2
 }
+
+define <8 x i16> @PR39549(<16 x i8> %x) {
+; SSE-LABEL: PR39549:
+; SSE:       # %bb.0:
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT:    psraw $8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR39549:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef>
+  %b = bitcast <16 x i8> %a to <8 x i16>
+  %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  ret <8 x i16> %d
+}
diff --git a/test/CodeGen/X86/vector-shuffle-sse41.ll b/test/CodeGen/X86/vector-shuffle-sse41.ll
index bcf706fc06f169fafb9bcd682c5d4068a6f7cca9..9eeff51970a8c1156b4a99634648de21b11ce47d 100644
--- a/test/CodeGen/X86/vector-shuffle-sse41.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse41.ll
@@ -37,15 +37,15 @@ define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8
 define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
 ; SSE41-LABEL: blend_packusdw_packuswb:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    packusdw %xmm1, %xmm0
-; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    packusdw %xmm0, %xmm0
+; SSE41-NEXT:    packuswb %xmm0, %xmm2
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: blend_packusdw_packuswb:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpackuswb %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm0, %xmm2, %xmm1
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
   %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 19c3deb31e78d7c840b49976a74e7eba22687a09..621058c29e6f7432c2e1cf1820917d04fbcc0e8f 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -214,11 +214,11 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
@@ -233,11 +233,11 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
 ;
 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
@@ -745,7 +745,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -760,7 +760,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
 ;
 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/vector-trunc-math-widen.ll b/test/CodeGen/X86/vector-trunc-math-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1cda401a82dfa7491833e9e5c32c0a433911d77f
--- /dev/null
+++ b/test/CodeGen/X86/vector-trunc-math-widen.ll
@@ -0,0 +1,5388 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
+
+;
+; add
+;
+
+define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddq %xmm3, %xmm1
+; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <4 x i64> %a0, %a1
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddq %xmm6, %xmm2
+; SSE-NEXT:    paddq %xmm7, %xmm3
+; SSE-NEXT:    paddq %xmm4, %xmm0
+; SSE-NEXT:    paddq %xmm5, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <8 x i64> %a0, %a1
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_add_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    paddd %xmm3, %xmm1
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_add_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <8 x i32> %a0, %a1
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm7 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <16 x i64> %a0, %a1
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm4, %xmm0
+; SSE-NEXT:    paddd %xmm5, %xmm1
+; SSE-NEXT:    paddd %xmm6, %xmm2
+; SSE-NEXT:    paddd %xmm7, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_add_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <16 x i32> %a0, %a1
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    paddw %xmm3, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_add_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_add_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = add <16 x i16> %a0, %a1
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
+; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $16, %xmm2
+; SSE-NEXT:    psrad $16, %xmm2
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    packssdw %xmm2, %xmm1
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    psraw $8, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = sext <8 x i8> %1 to <8 x i32>
+  %3 = add <8 x i32> %2, %a1
+  %4 = trunc <8 x i32> %3 to <8 x i16>
+  ret <8 x i16> %4
+}
+
+;
+; add to constant
+;
+
+define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    paddw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    paddw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+;
+; sub
+;
+
+define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubq %xmm3, %xmm1
+; SSE-NEXT:    psubq %xmm2, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <4 x i64> %a0, %a1
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubq %xmm6, %xmm2
+; SSE-NEXT:    psubq %xmm7, %xmm3
+; SSE-NEXT:    psubq %xmm4, %xmm0
+; SSE-NEXT:    psubq %xmm5, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <8 x i64> %a0, %a1
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubd %xmm2, %xmm0
+; SSE-NEXT:    psubd %xmm3, %xmm1
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_sub_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <8 x i32> %a0, %a1
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm7 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <16 x i64> %a0, %a1
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubd %xmm4, %xmm0
+; SSE-NEXT:    psubd %xmm5, %xmm1
+; SSE-NEXT:    psubd %xmm6, %xmm2
+; SSE-NEXT:    psubd %xmm7, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_sub_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <16 x i32> %a0, %a1
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubw %xmm2, %xmm0
+; SSE-NEXT:    psubw %xmm3, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_sub_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = sub <16 x i16> %a0, %a1
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = zext <16 x i8> %x to <16 x i16>
+  %b = zext <16 x i8> %y to <16 x i16>
+  %c = sub <16 x i16> %a, %b
+  %d = trunc <16 x i16> %c to <16 x i8>
+  ret <16 x i8> %d
+}
+
+;
+; sub to constant
+;
+
+define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
+; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = zext <16 x i8> %x to <16 x i16>
+  %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+  %c = trunc <16 x i16> %b to <16 x i8>
+  ret <16 x i8> %c
+}
+
+define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
+; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE-NEXT:    psubb %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a = zext <16 x i8> %x to <16 x i16>
+  %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
+  %c = trunc <16 x i16> %b to <16 x i8>
+  ret <16 x i8> %c
+}
+
+;
+; mul
+;
+
+define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = mul <4 x i64> %a0, %a1
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    pmullw %xmm6, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
+; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = mul <8 x i64> %a0, %a1
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_mul_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_mul_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = mul <8 x i32> %a0, %a1
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm8
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm8, %xmm8
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm9
+; AVX1-NEXT:    vpmuludq %xmm9, %xmm0, %xmm9
+; AVX1-NEXT:    vpaddq %xmm8, %xmm9, %xmm8
+; AVX1-NEXT:    vpsllq $32, %xmm8, %xmm8
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm9
+; AVX1-NEXT:    vpaddq %xmm8, %xmm9, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm9
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm9, %xmm4, %xmm10
+; AVX1-NEXT:    vpsrlq $32, %xmm9, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vpaddq %xmm10, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm9, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm9
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm0
+; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm4
+; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm10
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm0, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm0
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm6, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm6
+; AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm7, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm7
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm7
+; AVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm10, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm9, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld %xmm7, %xmm3, %xmm3
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld %xmm6, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld %xmm5, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm7, %ymm8, %ymm7
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm8, %ymm3
+; AVX2-FAST-NEXT:    vpmulld %xmm7, %xmm3, %xmm3
+; AVX2-FAST-NEXT:    vpermd %ymm6, %ymm8, %ymm6
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm8, %ymm2
+; AVX2-FAST-NEXT:    vpmulld %xmm6, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm5, %ymm8, %ymm5
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm8, %ymm1
+; AVX2-FAST-NEXT:    vpmulld %xmm5, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpermd %ymm4, %ymm8, %ymm4
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm8, %ymm0
+; AVX2-FAST-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovqd %zmm3, %ymm3
+; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmovqd %zmm2, %ymm2
+; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovqd %zmm3, %ymm3
+; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovqd %zmm2, %ymm2
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullq %zmm3, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = mul <16 x i64> %a0, %a1
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm8, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm6, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm7, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_mul_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_mul_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = mul <16 x i32> %a0, %a1
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw %xmm2, %xmm0
+; SSE-NEXT:    pmullw %xmm3, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_mul_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = mul <16 x i16> %a0, %a1
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
+; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE-NEXT:    pslld $16, %xmm2
+; SSE-NEXT:    psrad $16, %xmm2
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    packssdw %xmm2, %xmm1
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = mul <8 x i32> %2, %a1
+  %4 = trunc <8 x i32> %3 to <8 x i16>
+  ret <8 x i16> %4
+}
+
+;
+; mul to constant
+;
+
+define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movl $1, %eax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movl $1, %eax
+; SSE-NEXT:    movq %rax, %xmm8
+; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT:    pmuludq %xmm8, %xmm0
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm2
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm3
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm4
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm5
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm6
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl $1, %eax
+; AVX1-NEXT:    vmovq %rax, %xmm4
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [2,3]
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm6
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm9
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5]
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm6
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm7
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7]
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm7
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9]
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm7
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11]
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm7
+; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [12,13]
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm7
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm0
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm7, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [14,15]
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm7
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm6 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm6, %xmm9, %xmm2
+; AVX1-NEXT:    vpand %xmm6, %xmm8, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,9,10,11]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [12,13,14,15]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmulld {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+;
+; and
+;
+
+define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    andps %xmm3, %xmm1
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <4 x i64> %a0, %a1
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm6, %xmm2
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <8 x i64> %a0, %a1
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_and_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_and_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <8 x i32> %a0, %a1
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vandpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vandpd %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpand %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpand %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <16 x i64> %a0, %a1
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm3, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    pand %xmm2, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm1, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm5, %xmm0
+; SSE-NEXT:    packuswb %xmm6, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_and_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <16 x i32> %a0, %a1
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_and_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_and_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = and <16 x i16> %a0, %a1
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+;
+; and to constant
+;
+
+define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    andpd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+;
+; xor
+;
+
+define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorps %xmm3, %xmm1
+; SSE-NEXT:    xorps %xmm2, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <4 x i64> %a0, %a1
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <8 x i64> %a0, %a1
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_xor_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <8 x i32> %a0, %a1
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vxorpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vxorpd %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpxor %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpxor %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT:    vpxor %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpxor %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpxor %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT:    vpxor %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <16 x i64> %a0, %a1
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_xor_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <16 x i32> %a0, %a1
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_xor_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = xor <16 x i16> %a0, %a1
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+;
+; xor to constant
+;
+
+define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    xorpd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+;
+; or
+;
+
+define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    orps %xmm3, %xmm1
+; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <4 x i64> %a0, %a1
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm6, %xmm2
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm4, %xmm0
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <8 x i64> %a0, %a1
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_or_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_or_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <8 x i32> %a0, %a1
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vorpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vorpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vorpd %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpor %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpor %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpor %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT:    vpor %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpor %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpor %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpor %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT:    vpor %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <16 x i64> %a0, %a1
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm4, %xmm0
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    por %xmm6, %xmm2
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_or_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <16 x i32> %a0, %a1
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_or_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_or_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = or <16 x i16> %a0, %a1
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+;
+; or to constant
+;
+
+define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    orps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+  %2 = trunc <4 x i64> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v8i64_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    orpd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %2 = trunc <8 x i64> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i64_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT:    pand %xmm8, %xmm7
+; SSE-NEXT:    pand %xmm8, %xmm6
+; SSE-NEXT:    packuswb %xmm7, %xmm6
+; SSE-NEXT:    pand %xmm8, %xmm5
+; SSE-NEXT:    pand %xmm8, %xmm4
+; SSE-NEXT:    packuswb %xmm5, %xmm4
+; SSE-NEXT:    packuswb %xmm6, %xmm4
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %2 = trunc <16 x i64> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+;
+; complex patterns - often created by vectorizer
+;
+
+define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
+; SSE-LABEL: mul_add_const_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mul_add_const_v4i64_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = sext <4 x i32> %a0 to <4 x i64>
+  %2 = sext <4 x i32> %a1 to <4 x i64>
+  %3 = mul <4 x i64> %1, %2
+  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
+  %5 = trunc <4 x i64> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
+
+define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
+; SSE-LABEL: mul_add_self_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    paddd %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mul_add_self_v4i64_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = sext <4 x i32> %a0 to <4 x i64>
+  %2 = sext <4 x i32> %a1 to <4 x i64>
+  %3 = mul <4 x i64> %1, %2
+  %4 = add <4 x i64> %3, %3
+  %5 = trunc <4 x i64> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
+
+define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
+; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
+; SSE-NEXT:    paddd %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = sext <4 x i32> %a0 to <4 x i64>
+  %2 = sext <4 x i32> %a1 to <4 x i64>
+  %3 = mul <4 x i64> %1, %2
+  %4 = add <4 x i64> %1, %3
+  %5 = trunc <4 x i64> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index 7dc850391bca4f95b704fc866b26d4883a83d4a0..b06f3294e0a27375a580c2cddba4ec494977c387 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -267,8 +267,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -276,8 +276,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -294,15 +294,15 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -366,12 +366,12 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -403,28 +403,26 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_add_v16i16_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_add_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -440,7 +438,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -715,8 +713,8 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -724,8 +722,8 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
@@ -739,15 +737,15 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
@@ -801,12 +799,12 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -834,29 +832,25 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ;
 ; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -872,7 +866,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ;
 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1142,8 +1136,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -1151,8 +1145,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -1169,15 +1163,15 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -1241,12 +1235,12 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -1278,28 +1272,26 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_sub_v16i16_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1315,7 +1307,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -1348,49 +1340,39 @@ define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl $1, %eax
-; SSE-NEXT:    movq %rax, %xmm2
-; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    psubq %xmm2, %xmm0
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    movl $1, %eax
-; AVX1-NEXT:    vmovq %rax, %xmm1
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
@@ -1401,53 +1383,38 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl $1, %eax
-; SSE-NEXT:    movq %rax, %xmm4
-; SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    psubq %xmm4, %xmm0
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm2
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT:    movapd %xmm2, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    movl $1, %eax
-; AVX1-NEXT:    vmovq %rax, %xmm2
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -1455,28 +1422,26 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
@@ -1487,41 +1452,38 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
-; SSE-NEXT:    psubd {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    pslld $16, %xmm1
 ; SSE-NEXT:    psrad $16, %xmm1
 ; SSE-NEXT:    pslld $16, %xmm0
 ; SSE-NEXT:    psrad $16, %xmm0
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1532,17 +1494,6 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl $1, %eax
-; SSE-NEXT:    movq %rax, %xmm8
-; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    psubq %xmm8, %xmm0
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm2
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm3
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm4
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm5
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm6
-; SSE-NEXT:    psubq {{.*}}(%rip), %xmm7
 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
 ; SSE-NEXT:    pand %xmm8, %xmm7
 ; SSE-NEXT:    pand %xmm8, %xmm6
@@ -1559,51 +1510,38 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
 ; SSE-NEXT:    packuswb %xmm2, %xmm0
 ; SSE-NEXT:    packuswb %xmm4, %xmm0
+; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    movl $1, %eax
-; AVX1-NEXT:    vmovq %rax, %xmm4
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; AVX1-NEXT:    # xmm4 = mem[0,0]
-; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -1612,8 +1550,8 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -1621,17 +1559,14 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
@@ -1639,26 +1574,26 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
@@ -1669,10 +1604,6 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
-; SSE-NEXT:    psubd {{.*}}(%rip), %xmm1
-; SSE-NEXT:    psubd {{.*}}(%rip), %xmm2
-; SSE-NEXT:    psubd {{.*}}(%rip), %xmm3
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE-NEXT:    pand %xmm4, %xmm3
 ; SSE-NEXT:    pand %xmm4, %xmm2
@@ -1681,47 +1612,44 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
 ; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubd {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1732,58 +1660,52 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
-; SSE-NEXT:    psubw {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
@@ -1833,26 +1755,8 @@ define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm1, %xmm4
-; SSE-NEXT:    psrlq $32, %xmm4
-; SSE-NEXT:    pmuludq %xmm3, %xmm4
-; SSE-NEXT:    movdqa %xmm3, %xmm5
-; SSE-NEXT:    psrlq $32, %xmm5
-; SSE-NEXT:    pmuludq %xmm1, %xmm5
-; SSE-NEXT:    paddq %xmm4, %xmm5
-; SSE-NEXT:    psllq $32, %xmm5
 ; SSE-NEXT:    pmuludq %xmm3, %xmm1
-; SSE-NEXT:    paddq %xmm5, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrlq $32, %xmm3
-; SSE-NEXT:    pmuludq %xmm2, %xmm3
-; SSE-NEXT:    movdqa %xmm2, %xmm4
-; SSE-NEXT:    psrlq $32, %xmm4
-; SSE-NEXT:    pmuludq %xmm0, %xmm4
-; SSE-NEXT:    paddq %xmm3, %xmm4
-; SSE-NEXT:    psllq $32, %xmm4
 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
-; SSE-NEXT:    paddq %xmm4, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT:    retq
 ;
@@ -2099,94 +2003,14 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT:    movdqa %xmm0, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    movdqa %xmm8, %xmm10
-; SSE-NEXT:    psrlq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm0, %xmm10
-; SSE-NEXT:    paddq %xmm9, %xmm10
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT:    psllq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm8, %xmm0
-; SSE-NEXT:    paddq %xmm10, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm8
-; SSE-NEXT:    psrlq $32, %xmm8
-; SSE-NEXT:    pmuludq %xmm9, %xmm8
-; SSE-NEXT:    movdqa %xmm9, %xmm10
-; SSE-NEXT:    psrlq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm1, %xmm10
-; SSE-NEXT:    paddq %xmm8, %xmm10
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT:    psllq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm9, %xmm1
-; SSE-NEXT:    paddq %xmm10, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    movdqa %xmm8, %xmm10
-; SSE-NEXT:    psrlq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm2, %xmm10
-; SSE-NEXT:    paddq %xmm9, %xmm10
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT:    psllq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm8, %xmm2
-; SSE-NEXT:    paddq %xmm10, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm8
-; SSE-NEXT:    psrlq $32, %xmm8
-; SSE-NEXT:    pmuludq %xmm9, %xmm8
-; SSE-NEXT:    movdqa %xmm9, %xmm10
-; SSE-NEXT:    psrlq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm3, %xmm10
-; SSE-NEXT:    paddq %xmm8, %xmm10
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT:    psllq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm9, %xmm3
-; SSE-NEXT:    paddq %xmm10, %xmm3
-; SSE-NEXT:    movdqa %xmm4, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    movdqa %xmm8, %xmm10
-; SSE-NEXT:    psrlq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm4, %xmm10
-; SSE-NEXT:    paddq %xmm9, %xmm10
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT:    psllq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm8, %xmm4
-; SSE-NEXT:    paddq %xmm10, %xmm4
-; SSE-NEXT:    movdqa %xmm5, %xmm8
-; SSE-NEXT:    psrlq $32, %xmm8
-; SSE-NEXT:    pmuludq %xmm9, %xmm8
-; SSE-NEXT:    movdqa %xmm9, %xmm10
-; SSE-NEXT:    psrlq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm5, %xmm10
-; SSE-NEXT:    paddq %xmm8, %xmm10
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT:    psllq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm9, %xmm5
-; SSE-NEXT:    paddq %xmm10, %xmm5
-; SSE-NEXT:    movdqa %xmm6, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    movdqa %xmm8, %xmm10
-; SSE-NEXT:    psrlq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm6, %xmm10
-; SSE-NEXT:    paddq %xmm9, %xmm10
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT:    psllq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm8, %xmm6
-; SSE-NEXT:    paddq %xmm10, %xmm6
-; SSE-NEXT:    movdqa %xmm7, %xmm8
-; SSE-NEXT:    psrlq $32, %xmm8
-; SSE-NEXT:    pmuludq %xmm9, %xmm8
-; SSE-NEXT:    movdqa %xmm9, %xmm10
-; SSE-NEXT:    psrlq $32, %xmm10
-; SSE-NEXT:    pmuludq %xmm7, %xmm10
-; SSE-NEXT:    paddq %xmm8, %xmm10
-; SSE-NEXT:    pmuludq %xmm9, %xmm7
-; SSE-NEXT:    psllq $32, %xmm10
-; SSE-NEXT:    paddq %xmm10, %xmm7
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm7
 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
 ; SSE-NEXT:    pand %xmm8, %xmm7
 ; SSE-NEXT:    pand %xmm8, %xmm6
@@ -2315,8 +2139,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -2330,8 +2154,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -2348,8 +2172,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm5, %ymm8, %ymm5
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm8, %ymm1
 ; AVX2-FAST-NEXT:    vpmulld %xmm5, %xmm1, %xmm1
@@ -2359,8 +2183,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -2474,12 +2298,12 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -2511,28 +2335,26 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_mul_v16i16_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2548,7 +2370,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -2613,22 +2435,11 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,3]
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    pmuludq %xmm2, %xmm3
-; SSE-NEXT:    psrlq $32, %xmm1
-; SSE-NEXT:    pmuludq %xmm2, %xmm1
-; SSE-NEXT:    psllq $32, %xmm1
-; SSE-NEXT:    paddq %xmm3, %xmm1
 ; SSE-NEXT:    movl $1, %eax
 ; SSE-NEXT:    movq %rax, %xmm2
 ; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    pmuludq %xmm2, %xmm3
-; SSE-NEXT:    psrlq $32, %xmm0
 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
-; SSE-NEXT:    psllq $32, %xmm0
-; SSE-NEXT:    paddq %xmm3, %xmm0
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT:    retq
 ;
@@ -2785,61 +2596,14 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; SSE-NEXT:    movl $1, %eax
 ; SSE-NEXT:    movq %rax, %xmm8
 ; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm0
 ; SSE-NEXT:    pmuludq %xmm8, %xmm0
-; SSE-NEXT:    psllq $32, %xmm0
-; SSE-NEXT:    paddq %xmm9, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [2,3]
-; SSE-NEXT:    movdqa %xmm1, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm1
-; SSE-NEXT:    pmuludq %xmm8, %xmm1
-; SSE-NEXT:    psllq $32, %xmm1
-; SSE-NEXT:    paddq %xmm9, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [4,5]
-; SSE-NEXT:    movdqa %xmm2, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm2
-; SSE-NEXT:    pmuludq %xmm8, %xmm2
-; SSE-NEXT:    psllq $32, %xmm2
-; SSE-NEXT:    paddq %xmm9, %xmm2
-; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [6,7]
-; SSE-NEXT:    movdqa %xmm3, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm3
-; SSE-NEXT:    pmuludq %xmm8, %xmm3
-; SSE-NEXT:    psllq $32, %xmm3
-; SSE-NEXT:    paddq %xmm9, %xmm3
-; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [8,9]
-; SSE-NEXT:    movdqa %xmm4, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm4
-; SSE-NEXT:    pmuludq %xmm8, %xmm4
-; SSE-NEXT:    psllq $32, %xmm4
-; SSE-NEXT:    paddq %xmm9, %xmm4
-; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [10,11]
-; SSE-NEXT:    movdqa %xmm5, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm5
-; SSE-NEXT:    pmuludq %xmm8, %xmm5
-; SSE-NEXT:    psllq $32, %xmm5
-; SSE-NEXT:    paddq %xmm9, %xmm5
-; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [12,13]
-; SSE-NEXT:    movdqa %xmm6, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm6
-; SSE-NEXT:    pmuludq %xmm8, %xmm6
-; SSE-NEXT:    psllq $32, %xmm6
-; SSE-NEXT:    paddq %xmm9, %xmm6
-; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [14,15]
-; SSE-NEXT:    movdqa %xmm7, %xmm9
-; SSE-NEXT:    pmuludq %xmm8, %xmm9
-; SSE-NEXT:    psrlq $32, %xmm7
-; SSE-NEXT:    pmuludq %xmm8, %xmm7
-; SSE-NEXT:    psllq $32, %xmm7
-; SSE-NEXT:    paddq %xmm9, %xmm7
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm2
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm3
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm4
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm5
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm6
+; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm7
 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
 ; SSE-NEXT:    pand %xmm8, %xmm7
 ; SSE-NEXT:    pand %xmm8, %xmm6
@@ -2946,8 +2710,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
@@ -2957,8 +2721,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -2973,8 +2737,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
@@ -2982,8 +2746,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -3072,13 +2836,13 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -3109,28 +2873,26 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -3146,7 +2908,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -3399,8 +3161,8 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -3408,8 +3170,8 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -3426,15 +3188,15 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -3494,12 +3256,12 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -3528,29 +3290,25 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX1-LABEL: trunc_and_v16i16_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_and_v16i16_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_and_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -3566,7 +3324,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -3792,8 +3550,8 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -3801,8 +3559,8 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
@@ -3816,15 +3574,15 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
@@ -3878,12 +3636,12 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -3911,29 +3669,25 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ;
 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -3949,7 +3703,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ;
 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -4203,8 +3957,8 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -4212,8 +3966,8 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -4230,15 +3984,15 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -4298,12 +4052,12 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -4332,29 +4086,25 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX1-LABEL: trunc_xor_v16i16_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_xor_v16i16_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -4370,7 +4120,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -4596,8 +4346,8 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -4605,8 +4355,8 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
@@ -4620,15 +4370,15 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
@@ -4682,12 +4432,12 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -4715,29 +4465,25 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ;
 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -4753,7 +4499,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ;
 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -5007,8 +4753,8 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -5016,8 +4762,8 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -5034,15 +4780,15 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -5102,12 +4848,12 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -5136,29 +4882,25 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
 ; AVX1-LABEL: trunc_or_v16i16_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_or_v16i16_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_or_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -5174,7 +4916,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -5400,8 +5142,8 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -5409,8 +5151,8 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
@@ -5424,15 +5166,15 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
@@ -5486,12 +5228,12 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -5519,29 +5261,25 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ;
 ; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -5557,7 +5295,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ;
 ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -5576,17 +5314,10 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT:    pmuludq %xmm1, %xmm3
-; SSE-NEXT:    pxor %xmm0, %xmm0
-; SSE-NEXT:    pmuludq %xmm0, %xmm1
-; SSE-NEXT:    psllq $32, %xmm1
-; SSE-NEXT:    paddq %xmm3, %xmm1
-; SSE-NEXT:    pmuludq %xmm4, %xmm2
-; SSE-NEXT:    pmuludq %xmm4, %xmm0
-; SSE-NEXT:    psllq $32, %xmm0
-; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
@@ -5607,40 +5338,14 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    psrad $31, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT:    movdqa %xmm0, %xmm6
-; SSE-NEXT:    psrad $31, %xmm6
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE-NEXT:    movdqa %xmm4, %xmm5
-; SSE-NEXT:    psrad $31, %xmm5
-; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE-NEXT:    movdqa %xmm1, %xmm7
-; SSE-NEXT:    psrad $31, %xmm7
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; SSE-NEXT:    pxor %xmm8, %xmm8
-; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
-; SSE-NEXT:    pmuludq %xmm1, %xmm6
-; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; SSE-NEXT:    pmuludq %xmm0, %xmm7
-; SSE-NEXT:    paddq %xmm6, %xmm7
-; SSE-NEXT:    psllq $32, %xmm7
-; SSE-NEXT:    pmuludq %xmm0, %xmm1
-; SSE-NEXT:    paddq %xmm7, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; SSE-NEXT:    pmuludq %xmm4, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
-; SSE-NEXT:    pmuludq %xmm2, %xmm5
-; SSE-NEXT:    paddq %xmm3, %xmm5
-; SSE-NEXT:    psllq $32, %xmm5
-; SSE-NEXT:    pmuludq %xmm2, %xmm4
-; SSE-NEXT:    paddq %xmm5, %xmm4
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE-NEXT:    paddd %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    paddd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
@@ -5662,18 +5367,11 @@ define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nou
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT:    pmuludq %xmm1, %xmm3
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pmuludq %xmm5, %xmm1
-; SSE-NEXT:    psllq $32, %xmm1
-; SSE-NEXT:    paddq %xmm3, %xmm1
-; SSE-NEXT:    pmuludq %xmm4, %xmm2
-; SSE-NEXT:    pmuludq %xmm4, %xmm5
-; SSE-NEXT:    psllq $32, %xmm5
-; SSE-NEXT:    paddq %xmm2, %xmm5
-; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
-; SSE-NEXT:    paddd %xmm5, %xmm0
+; SSE-NEXT:    pmuludq %xmm3, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
+; SSE-NEXT:    paddd %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
diff --git a/test/CodeGen/X86/vector-trunc-packus-widen.ll b/test/CodeGen/X86/vector-trunc-packus-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..232c15a402698e3be64f4b4620a534fd1e9cfc4d
--- /dev/null
+++ b/test/CodeGen/X86/vector-trunc-packus-widen.ll
@@ -0,0 +1,3169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+
+;
+; PACKUS saturation truncation to vXi32
+;
+
+define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
+; SSE2-LABEL: trunc_packus_v4i64_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_packus_v4i64_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4294967295,4294967295]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    xorpd %xmm1, %xmm1
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT:    movapd %xmm5, %xmm0
+; SSE41-NEXT:    xorpd %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-SLOW-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
+; AVX2-SLOW-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-FAST-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
+; AVX2-FAST-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_packus_v4i64_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovusqd %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_packus_v4i64_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX512BW-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovusqd %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %3 = icmp sgt <4 x i64> %2, zeroinitializer
+  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
+  %5 = trunc <4 x i64> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
+
+
+define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_packus_v8i64_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm6
+; SSE2-NEXT:    por %xmm2, %xmm6
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm10, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_packus_v8i64_v8i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm10, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm0, %xmm5
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm6
+; SSSE3-NEXT:    por %xmm2, %xmm6
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm6, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    pand %xmm6, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm10, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm5, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_packus_v8i64_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [4294967295,4294967295]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [2147483647,2147483647]
+; SSE41-NEXT:    movdqa %xmm11, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE41-NEXT:    movapd %xmm9, %xmm0
+; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm3
+; SSE41-NEXT:    movapd %xmm8, %xmm0
+; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v8i64_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-SLOW-NEXT:    vpand %ymm1, %ymm3, %ymm1
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
+; AVX2-SLOW-NEXT:    vpand %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-FAST-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-FAST-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-FAST-NEXT:    vpand %ymm1, %ymm3, %ymm1
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
+; AVX2-FAST-NEXT:    vpand %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_packus_v8i64_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovusqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %3 = icmp sgt <8 x i64> %2, zeroinitializer
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
+  %5 = trunc <8 x i64> %4 to <8 x i32>
+  ret <8 x i32> %5
+}
+
+;
+; PACKUS saturation truncation to vXi16
+;
+
+define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_packus_v8i64_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm6
+; SSE2-NEXT:    por %xmm3, %xmm6
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm10, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm10, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_packus_v8i64_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm5
+; SSSE3-NEXT:    pxor %xmm10, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm6
+; SSSE3-NEXT:    por %xmm3, %xmm6
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm9, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa %xmm6, %xmm2
+; SSSE3-NEXT:    pxor %xmm10, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pxor %xmm10, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm5, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_packus_v8i64_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm9
+; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [65535,65535]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183]
+; SSE41-NEXT:    movdqa %xmm6, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm8
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm5
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    packusdw %xmm5, %xmm1
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    movapd %xmm8, %xmm0
+; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
+; SSE41-NEXT:    packusdw %xmm4, %xmm3
+; SSE41-NEXT:    packusdw %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65535,65535]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v8i64_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpand %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_packus_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovusqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %3 = icmp sgt <8 x i64> %2, zeroinitializer
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
+  %5 = trunc <8 x i64> %4 to <8 x i16>
+  ret <8 x i16> %5
+}
+
+define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
+; SSE2-LABEL: trunc_packus_v8i32_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm3
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm2, %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_packus_v8i32_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_packus_v8i32_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512F-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovusdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_packus_v8i32_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512BW-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovusdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %3 = icmp sgt <8 x i32> %2, zeroinitializer
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
+  %5 = trunc <8 x i32> %4 to <8 x i16>
+  ret <8 x i16> %5
+}
+
+define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32> %a0) {
+; SSE2-LABEL: trunc_packus_v16i32_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm6, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm6, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm6, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pslld $16, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm5, %xmm0
+; SSE2-NEXT:    pslld $16, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_packus_v16i32_v16i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
+; SSSE3-NEXT:    movdqa %xmm6, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm6, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa %xmm6, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    pandn %xmm6, %xmm5
+; SSSE3-NEXT:    por %xmm0, %xmm5
+; SSSE3-NEXT:    movdqa %xmm6, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSSE3-NEXT:    pandn %xmm6, %xmm0
+; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa %xmm6, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm6, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm5, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT:    pand %xmm4, %xmm5
+; SSSE3-NEXT:    pslld $16, %xmm5
+; SSSE3-NEXT:    psrad $16, %xmm5
+; SSSE3-NEXT:    pslld $16, %xmm0
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    packssdw %xmm5, %xmm0
+; SSSE3-NEXT:    pslld $16, %xmm3
+; SSSE3-NEXT:    psrad $16, %xmm3
+; SSSE3-NEXT:    pslld $16, %xmm1
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    packssdw %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_packus_v16i32_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v16i32_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v16i32_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_packus_v16i32_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovusdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %3 = icmp sgt <16 x i32> %2, zeroinitializer
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  %5 = trunc <16 x i32> %4 to <16 x i16>
+  ret <16 x i16> %5
+}
+
+;
+; PACKUS saturation truncation to v16i8
+;
+
+define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_packus_v8i64_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm11
+; SSE2-NEXT:    pand %xmm11, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm11
+; SSE2-NEXT:    por %xmm3, %xmm11
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm10, %xmm3
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm10, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm7
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    movdqa %xmm11, %xmm5
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pand %xmm11, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    packuswb %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    pand %xmm2, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm7, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm11
+; SSSE3-NEXT:    pand %xmm11, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm11
+; SSSE3-NEXT:    por %xmm3, %xmm11
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm10, %xmm3
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm10, %xmm2
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm7
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm4
+; SSSE3-NEXT:    movdqa %xmm11, %xmm5
+; SSSE3-NEXT:    pxor %xmm10, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pand %xmm8, %xmm6
+; SSSE3-NEXT:    pand %xmm11, %xmm6
+; SSSE3-NEXT:    pand %xmm8, %xmm4
+; SSSE3-NEXT:    pand %xmm3, %xmm4
+; SSSE3-NEXT:    packuswb %xmm6, %xmm4
+; SSSE3-NEXT:    pand %xmm8, %xmm7
+; SSSE3-NEXT:    pand %xmm2, %xmm7
+; SSSE3-NEXT:    pand %xmm8, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    packuswb %xmm7, %xmm0
+; SSSE3-NEXT:    packuswb %xmm4, %xmm0
+; SSSE3-NEXT:    packuswb %xmm0, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_packus_v8i64_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm9
+; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903]
+; SSE41-NEXT:    movdqa %xmm5, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm10
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm10
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm5
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movapd %xmm10, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
+; SSE41-NEXT:    andpd %xmm8, %xmm2
+; SSE41-NEXT:    andpd %xmm8, %xmm4
+; SSE41-NEXT:    packusdw %xmm2, %xmm4
+; SSE41-NEXT:    andpd %xmm8, %xmm5
+; SSE41-NEXT:    andpd %xmm8, %xmm1
+; SSE41-NEXT:    packusdw %xmm5, %xmm1
+; SSE41-NEXT:    packusdw %xmm4, %xmm1
+; SSE41-NEXT:    packuswb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v8i64_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v8i64_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpand %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_packus_v8i64_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %3 = icmp sgt <8 x i64> %2, zeroinitializer
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
+  %5 = trunc <8 x i64> %4 to <8 x i8>
+  ret <8 x i8> %5
+}
+
+define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
+; SSE2-LABEL: trunc_packus_v8i64_v8i8_store:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm11
+; SSE2-NEXT:    pand %xmm11, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm11
+; SSE2-NEXT:    por %xmm3, %xmm11
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm10, %xmm3
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm10, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm10, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm7
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    movdqa %xmm11, %xmm5
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pand %xmm11, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    packuswb %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    pand %xmm2, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm7, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm11
+; SSSE3-NEXT:    pand %xmm11, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm11
+; SSSE3-NEXT:    por %xmm3, %xmm11
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm10, %xmm3
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm10, %xmm2
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm10, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm10, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm7
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm10, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm4
+; SSSE3-NEXT:    movdqa %xmm11, %xmm5
+; SSSE3-NEXT:    pxor %xmm10, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pand %xmm8, %xmm6
+; SSSE3-NEXT:    pand %xmm11, %xmm6
+; SSSE3-NEXT:    pand %xmm8, %xmm4
+; SSSE3-NEXT:    pand %xmm3, %xmm4
+; SSSE3-NEXT:    packuswb %xmm6, %xmm4
+; SSSE3-NEXT:    pand %xmm8, %xmm7
+; SSSE3-NEXT:    pand %xmm2, %xmm7
+; SSSE3-NEXT:    pand %xmm8, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    packuswb %xmm7, %xmm0
+; SSSE3-NEXT:    packuswb %xmm4, %xmm0
+; SSSE3-NEXT:    packuswb %xmm0, %xmm0
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_packus_v8i64_v8i8_store:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm9
+; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903]
+; SSE41-NEXT:    movdqa %xmm5, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm10
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm10
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm9, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm5
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movapd %xmm10, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
+; SSE41-NEXT:    andpd %xmm8, %xmm2
+; SSE41-NEXT:    andpd %xmm8, %xmm4
+; SSE41-NEXT:    packusdw %xmm2, %xmm4
+; SSE41-NEXT:    andpd %xmm8, %xmm5
+; SSE41-NEXT:    andpd %xmm8, %xmm1
+; SSE41-NEXT:    packusdw %xmm5, %xmm1
+; SSE41-NEXT:    packusdw %xmm4, %xmm1
+; SSE41-NEXT:    packuswb %xmm1, %xmm1
+; SSE41-NEXT:    movq %xmm1, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v8i64_v8i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v8i64_v8i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpand %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    vmovq %xmm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_packus_v8i64_v8i8_store:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovusqb %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %3 = icmp sgt <8 x i64> %2, zeroinitializer
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
+  %5 = trunc <8 x i64> %4 to <8 x i8>
+  store <8 x i8> %5, <8 x i8> *%p1
+  ret void
+}
+
+define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
+; SSE2-LABEL: trunc_packus_v16i64_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm6, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
+; SSE2-NEXT:    movdqa %xmm11, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm14, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm6
+; SSE2-NEXT:    pandn %xmm10, %xmm9
+; SSE2-NEXT:    por %xmm6, %xmm9
+; SSE2-NEXT:    movdqa %xmm7, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm11, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm12
+; SSE2-NEXT:    pand %xmm12, %xmm7
+; SSE2-NEXT:    pandn %xmm10, %xmm12
+; SSE2-NEXT:    por %xmm7, %xmm12
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm11, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm13
+; SSE2-NEXT:    pand %xmm13, %xmm4
+; SSE2-NEXT:    pandn %xmm10, %xmm13
+; SSE2-NEXT:    por %xmm4, %xmm13
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm11, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm14
+; SSE2-NEXT:    pand %xmm14, %xmm5
+; SSE2-NEXT:    pandn %xmm10, %xmm14
+; SSE2-NEXT:    por %xmm5, %xmm14
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm11, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm10, %xmm5
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm11, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm10, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm11, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm10, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm11, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm10, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm14, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm14, %xmm2
+; SSE2-NEXT:    movdqa %xmm13, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm13, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm12, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm12, %xmm3
+; SSE2-NEXT:    movdqa %xmm9, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm9, %xmm4
+; SSE2-NEXT:    packuswb %xmm3, %xmm4
+; SSE2-NEXT:    packuswb %xmm4, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_packus_v16i64_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm6, %xmm9
+; SSSE3-NEXT:    pxor %xmm8, %xmm9
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
+; SSSE3-NEXT:    movdqa %xmm11, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm14
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm14, %xmm9
+; SSSE3-NEXT:    pand %xmm9, %xmm6
+; SSSE3-NEXT:    pandn %xmm10, %xmm9
+; SSSE3-NEXT:    por %xmm6, %xmm9
+; SSSE3-NEXT:    movdqa %xmm7, %xmm6
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    movdqa %xmm11, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm12
+; SSSE3-NEXT:    pand %xmm12, %xmm7
+; SSSE3-NEXT:    pandn %xmm10, %xmm12
+; SSSE3-NEXT:    por %xmm7, %xmm12
+; SSSE3-NEXT:    movdqa %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    movdqa %xmm11, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm13
+; SSSE3-NEXT:    pand %xmm13, %xmm4
+; SSSE3-NEXT:    pandn %xmm10, %xmm13
+; SSSE3-NEXT:    por %xmm4, %xmm13
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    movdqa %xmm11, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm14
+; SSSE3-NEXT:    pand %xmm14, %xmm5
+; SSSE3-NEXT:    pandn %xmm10, %xmm14
+; SSSE3-NEXT:    por %xmm5, %xmm14
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    movdqa %xmm11, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm10, %xmm5
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm11, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm10, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm8, %xmm3
+; SSSE3-NEXT:    movdqa %xmm11, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn %xmm10, %xmm3
+; SSSE3-NEXT:    por %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    movdqa %xmm11, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm10, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    packuswb %xmm3, %xmm2
+; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm14, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm14, %xmm2
+; SSSE3-NEXT:    movdqa %xmm13, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pand %xmm13, %xmm1
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm12, %xmm2
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm12, %xmm3
+; SSSE3-NEXT:    movdqa %xmm9, %xmm2
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm4
+; SSSE3-NEXT:    pand %xmm9, %xmm4
+; SSSE3-NEXT:    packuswb %xmm3, %xmm4
+; SSSE3-NEXT:    packuswb %xmm4, %xmm1
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_packus_v16i64_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movapd {{.*#+}} xmm11 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm12 = [2147483903,2147483903]
+; SSE41-NEXT:    movdqa %xmm12, %xmm10
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm10
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm10
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm13
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm13
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm14
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm14
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm15
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm15
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm6
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm11
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movapd %xmm11, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm7
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    packusdw %xmm7, %xmm1
+; SSE41-NEXT:    movapd %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
+; SSE41-NEXT:    movapd %xmm5, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT:    packusdw %xmm3, %xmm4
+; SSE41-NEXT:    packusdw %xmm4, %xmm1
+; SSE41-NEXT:    movapd %xmm15, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm15, %xmm4
+; SSE41-NEXT:    movapd %xmm14, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm14, %xmm3
+; SSE41-NEXT:    packusdw %xmm4, %xmm3
+; SSE41-NEXT:    movapd %xmm13, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm13, %xmm4
+; SSE41-NEXT:    movapd %xmm10, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
+; SSE41-NEXT:    packusdw %xmm4, %xmm2
+; SSE41-NEXT:    packusdw %xmm2, %xmm3
+; SSE41-NEXT:    packuswb %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255]
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm6, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm3, %ymm4, %ymm15
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm6, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm6, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm6, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX1-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm14
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm14, %xmm9
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm10
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm7, %xmm11
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm12
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm13
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm15, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpand %xmm15, %xmm6, %xmm4
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm13, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm12, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm7, %xmm11, %xmm3
+; AVX1-NEXT:    vpand %xmm1, %xmm10, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm14, %xmm9, %xmm3
+; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v16i64_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm4, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm4, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm4, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpand %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT:    vpand %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm1
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm2, %ymm3
+; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_packus_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
+; AVX512-NEXT:    vpminsq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpminsq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %3 = icmp sgt <16 x i64> %2, zeroinitializer
+  %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
+  %5 = trunc <16 x i64> %4 to <16 x i8>
+  ret <16 x i8> %5
+}
+
+define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
+; SSE-LABEL: trunc_packus_v8i32_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v8i32_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v8i32_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_packus_v8i32_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %3 = icmp sgt <8 x i32> %2, zeroinitializer
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
+  %5 = trunc <8 x i32> %4 to <8 x i8>
+  ret <8 x i8> %5
+}
+
+define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
+; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovusdb %ymm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovusdb %ymm0, (%rdi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %3 = icmp sgt <8 x i32> %2, zeroinitializer
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
+  %5 = trunc <8 x i32> %4 to <8 x i8>
+  store <8 x i8> %5, <8 x i8> *%p1
+  ret void
+}
+
+define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32> %a0) {
+; SSE-LABEL: trunc_packus_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_packus_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovusdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %3 = icmp sgt <16 x i32> %2, zeroinitializer
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  %5 = trunc <16 x i32> %4 to <16 x i8>
+  ret <16 x i8> %5
+}
+
+define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
+; SSE-LABEL: trunc_packus_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovuswb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = icmp sgt <16 x i16> %2, zeroinitializer
+  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
+  %5 = trunc <16 x i16> %4 to <16 x i8>
+  ret <16 x i8> %5
+}
+
+define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) {
+; SSE-LABEL: trunc_packus_v32i16_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_packus_v32i16_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v32i16_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpminsw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpminsw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpminsw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpminsw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmaxsw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmovuswb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = icmp sgt <32 x i16> %2, zeroinitializer
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  %5 = trunc <32 x i16> %4 to <32 x i8>
+  ret <32 x i8> %5
+}
diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll
index 61935dce8f820097b46376ec1400776c23d81bcd..f49ee8f8445a56a9a5bf4d80611c5c16a0ee99d9 100644
--- a/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/test/CodeGen/X86/vector-trunc-packus.ll
@@ -131,53 +131,49 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4294967295,4294967295]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE41-NEXT:    pxor %xmm8, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
 ; SSE41-NEXT:    movdqa %xmm6, %xmm5
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm4, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm0
 ; SSE41-NEXT:    movdqa %xmm6, %xmm2
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
 ; SSE41-NEXT:    xorpd %xmm1, %xmm1
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
-; SSE41-NEXT:    xorpd %xmm8, %xmm0
+; SSE41-NEXT:    xorpd %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movapd %xmm5, %xmm0
-; SSE41-NEXT:    xorpd %xmm8, %xmm0
-; SSE41-NEXT:    movapd %xmm0, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
@@ -498,12 +494,11 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [2147483647,2147483647]
 ; SSE41-NEXT:    movdqa %xmm11, %xmm6
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm8
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -512,10 +507,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm9
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm9
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -524,10 +518,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -536,10 +529,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm2, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
@@ -548,10 +540,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
@@ -560,10 +551,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
@@ -573,10 +563,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm3
 ; SSE41-NEXT:    movapd %xmm8, %xmm0
@@ -585,10 +574,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
 ; SSE41-NEXT:    movaps %xmm2, %xmm0
@@ -915,51 +903,47 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [2147549183,2147549183]
-; SSE41-NEXT:    movdqa %xmm11, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183]
+; SSE41-NEXT:    movdqa %xmm6, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm8
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm8
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm2
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm9, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm3
+; SSE41-NEXT:    movdqa %xmm6, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm3
+; SSE41-NEXT:    movdqa %xmm6, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
@@ -968,10 +952,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm5
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
@@ -980,10 +963,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
 ; SSE41-NEXT:    packusdw %xmm5, %xmm1
@@ -993,10 +975,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    pxor %xmm4, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE41-NEXT:    movapd %xmm8, %xmm0
@@ -1005,10 +986,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
 ; SSE41-NEXT:    packusdw %xmm4, %xmm3
 ; SSE41-NEXT:    packusdw %xmm3, %xmm1
@@ -1555,51 +1535,47 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
-; SSE41-NEXT:    movdqa %xmm11, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE41-NEXT:    movdqa %xmm6, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm8
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm8
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm2
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm9, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm3
+; SSE41-NEXT:    movdqa %xmm6, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
-; SSE41-NEXT:    movdqa %xmm11, %xmm3
+; SSE41-NEXT:    movdqa %xmm6, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
@@ -1608,10 +1584,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm5
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
@@ -1620,10 +1595,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
 ; SSE41-NEXT:    packusdw %xmm5, %xmm1
@@ -1633,10 +1607,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    pxor %xmm4, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE41-NEXT:    movapd %xmm8, %xmm0
@@ -1645,10 +1618,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm10, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
 ; SSE41-NEXT:    packusdw %xmm4, %xmm3
 ; SSE41-NEXT:    packusdw %xmm3, %xmm1
@@ -1950,104 +1922,96 @@ define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm9
 ; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [255,255]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903]
 ; SSE41-NEXT:    movdqa %xmm5, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
-; SSE41-NEXT:    movapd %xmm8, %xmm11
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm11
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm10
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm10
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa %xmm5, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm8, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa %xmm5, %xmm2
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm8, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
 ; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa %xmm5, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm2, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm8, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm5
 ; SSE41-NEXT:    movapd %xmm5, %xmm0
-; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm2, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
-; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm5
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm5
 ; SSE41-NEXT:    movapd %xmm3, %xmm0
-; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    pxor %xmm4, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
-; SSE41-NEXT:    movapd %xmm11, %xmm0
-; SSE41-NEXT:    xorpd %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm10, %xmm0
+; SSE41-NEXT:    xorpd %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
 ; SSE41-NEXT:    andpd %xmm8, %xmm2
 ; SSE41-NEXT:    andpd %xmm8, %xmm4
 ; SSE41-NEXT:    packusdw %xmm2, %xmm4
@@ -2564,10 +2528,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm10
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm10
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
@@ -2576,22 +2539,20 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm13
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm13
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    movdqa %xmm12, %xmm6
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm14, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm14
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm14
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
@@ -2600,10 +2561,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm15
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm15
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -2612,10 +2572,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -2624,10 +2583,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm6
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm6
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
@@ -2636,10 +2594,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -2648,113 +2605,104 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm11
-; SSE41-NEXT:    xorpd %xmm8, %xmm8
+; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    movapd %xmm11, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm9, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm7
 ; SSE41-NEXT:    movapd %xmm3, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm9, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT:    packusdw %xmm4, %xmm1
+; SSE41-NEXT:    packusdw %xmm7, %xmm1
 ; SSE41-NEXT:    movapd %xmm6, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm0, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
-; SSE41-NEXT:    movapd %xmm5, %xmm0
-; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm3
-; SSE41-NEXT:    packusdw %xmm2, %xmm3
-; SSE41-NEXT:    packusdw %xmm3, %xmm1
-; SSE41-NEXT:    movapd %xmm15, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
+; SSE41-NEXT:    movapd %xmm5, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm0, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
 ; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    blendvpd %xmm0, %xmm15, %xmm2
-; SSE41-NEXT:    movapd %xmm14, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT:    packusdw %xmm3, %xmm4
+; SSE41-NEXT:    packusdw %xmm4, %xmm1
+; SSE41-NEXT:    movapd %xmm15, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm15, %xmm4
+; SSE41-NEXT:    movapd %xmm14, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm14, %xmm3
-; SSE41-NEXT:    packusdw %xmm2, %xmm3
+; SSE41-NEXT:    packusdw %xmm4, %xmm3
 ; SSE41-NEXT:    movapd %xmm13, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
-; SSE41-NEXT:    movapd %xmm0, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm9, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    blendvpd %xmm0, %xmm13, %xmm2
-; SSE41-NEXT:    movapd %xmm10, %xmm0
-; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm9, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm8
-; SSE41-NEXT:    packusdw %xmm2, %xmm8
-; SSE41-NEXT:    packusdw %xmm8, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm13, %xmm4
+; SSE41-NEXT:    movapd %xmm10, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
+; SSE41-NEXT:    packusdw %xmm4, %xmm2
+; SSE41-NEXT:    packusdw %xmm2, %xmm3
 ; SSE41-NEXT:    packuswb %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -3137,7 +3085,7 @@ define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
 ; AVX512F-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -3147,7 +3095,7 @@ define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
 ; AVX512VL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -3208,9 +3156,9 @@ define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) {
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -3223,9 +3171,9 @@ define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) {
 ; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmaxsw %ymm2, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-trunc-ssat-widen.ll b/test/CodeGen/X86/vector-trunc-ssat-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7ba3dc89aa0576cb68d0240f2de4284451b0b33f
--- /dev/null
+++ b/test/CodeGen/X86/vector-trunc-ssat-widen.ll
@@ -0,0 +1,3109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+
+;
+; Signed saturation truncation to vXi32
+;
+
+define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
+; SSE2-LABEL: trunc_ssat_v4i64_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm6
+; SSE2-NEXT:    por %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_ssat_v4i64_v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm4
+; SSSE3-NEXT:    pandn %xmm1, %xmm6
+; SSSE3-NEXT:    por %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_ssat_v4i64_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [2147483647,2147483647]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm3, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [18446744069414584320,18446744069414584320]
+; SSE41-NEXT:    movapd %xmm0, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm1, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
+; SSE41-NEXT:    xorpd %xmm5, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483647,2147483647]
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_ssat_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-SLOW-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-SLOW-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_ssat_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-FAST-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-FAST-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_ssat_v4i64_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
+; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_ssat_v4i64_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovsqd %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
+; AVX512BW-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i32:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovsqd %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <4 x i64> %a0, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
+  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
+  %3 = icmp sgt <4 x i64> %2, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
+  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
+  %5 = trunc <4 x i64> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
+
+
+define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_ssat_v8i64_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm9, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm2, %xmm7
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm6
+; SSE2-NEXT:    por %xmm3, %xmm6
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm6
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm7, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm7
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_ssat_v8i64_v8i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [4294967295,4294967295]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm0, %xmm5
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm2, %xmm7
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm6
+; SSSE3-NEXT:    por %xmm3, %xmm6
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
+; SSSE3-NEXT:    movdqa %xmm6, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm6
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm7, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm7
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm5
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_ssat_v8i64_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [2147483647,2147483647]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [4294967295,4294967295]
+; SSE41-NEXT:    movdqa %xmm10, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm9
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320]
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm6
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
+; SSE41-NEXT:    movapd %xmm9, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
+; SSE41-NEXT:    xorpd %xmm8, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v8i64_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [2147483647,2147483647]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_ssat_v8i64_v8i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-FAST-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-FAST-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-FAST-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-FAST-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_ssat_v8i64_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %1 = icmp slt <8 x i64> %a0, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
+  %3 = icmp sgt <8 x i64> %2, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
+  %5 = trunc <8 x i64> %4 to <8 x i32>
+  ret <8 x i32> %5
+}
+
+;
+; Signed saturation truncation to vXi16
+;
+
+define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_ssat_v8i64_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147516415,2147516415]
+; SSE2-NEXT:    movdqa %xmm9, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm1, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
+; SSE2-NEXT:    movdqa %xmm7, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm7
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    packssdw %xmm3, %xmm1
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_ssat_v8i64_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147516415,2147516415]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm1, %xmm7
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
+; SSSE3-NEXT:    movdqa %xmm7, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm7
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    packssdw %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm5
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    packssdw %xmm3, %xmm1
+; SSSE3-NEXT:    packssdw %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_ssat_v8i64_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm10
+; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [32767,32767]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [2147516415,2147516415]
+; SSE41-NEXT:    movdqa %xmm11, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm8
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm9
+; SSE41-NEXT:    movdqa %xmm10, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm6
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    packssdw %xmm6, %xmm1
+; SSE41-NEXT:    movapd %xmm9, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm2
+; SSE41-NEXT:    xorpd %xmm8, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
+; SSE41-NEXT:    packssdw %xmm2, %xmm3
+; SSE41-NEXT:    packssdw %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [32767,32767,32767,32767]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32767,32767]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v8i64_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [32767,32767,32767,32767]
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_ssat_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <8 x i64> %a0, <i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767>
+  %3 = icmp sgt <8 x i64> %2, <i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768>
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768>
+  %5 = trunc <8 x i64> %4 to <8 x i16>
+  ret <8 x i16> %5
+}
+
+define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) {
+; SSE-LABEL: trunc_ssat_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_ssat_v8i32_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512F-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX512F-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_ssat_v8i32_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovsdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_ssat_v8i32_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512BW-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX512BW-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovsdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <8 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %3 = icmp sgt <8 x i32> %2, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %5 = trunc <8 x i32> %4 to <8 x i16>
+  ret <8 x i16> %5
+}
+
+define <16 x i16> @trunc_ssat_v16i32_v16i16(<16 x i32> %a0) {
+; SSE-LABEL: trunc_ssat_v16i32_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v16i32_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v16i32_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_ssat_v16i32_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %1 = icmp slt <16 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %3 = icmp sgt <16 x i32> %2, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %5 = trunc <16 x i32> %4 to <16 x i16>
+  ret <16 x i16> %5
+}
+
+;
+; Signed saturation truncation to v16i8
+;
+
+define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_ssat_v8i64_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [127,127]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
+; SSE2-NEXT:    movdqa %xmm9, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm0, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT:    movdqa %xmm7, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm2, %xmm7
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm7
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    packuswb %xmm7, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_ssat_v8i64_v8i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [127,127]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm3, %xmm5
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm0, %xmm7
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT:    movdqa %xmm7, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm7
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm2, %xmm7
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm5
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    packuswb %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm3, %xmm7
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    packuswb %xmm7, %xmm0
+; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    packuswb %xmm0, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_ssat_v8i64_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [127,127]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483775,2147483775]
+; SSE41-NEXT:    movdqa %xmm10, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm9
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm11
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm11
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840]
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    movapd %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT:    movapd %xmm11, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm3
+; SSE41-NEXT:    xorpd %xmm9, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm2
+; SSE41-NEXT:    movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE41-NEXT:    andpd %xmm0, %xmm2
+; SSE41-NEXT:    andpd %xmm0, %xmm3
+; SSE41-NEXT:    packusdw %xmm2, %xmm3
+; SSE41-NEXT:    andpd %xmm0, %xmm7
+; SSE41-NEXT:    andpd %xmm0, %xmm1
+; SSE41-NEXT:    packusdw %xmm7, %xmm1
+; SSE41-NEXT:    packusdw %xmm3, %xmm1
+; SSE41-NEXT:    packuswb %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v8i64_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [127,127,127,127]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v8i64_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127]
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_ssat_v8i64_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
+  %3 = icmp sgt <8 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
+  %5 = trunc <8 x i64> %4 to <8 x i8>
+  ret <8 x i8> %5
+}
+
+define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
+; SSE2-LABEL: trunc_ssat_v8i64_v8i8_store:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [127,127]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
+; SSE2-NEXT:    movdqa %xmm9, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm0, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT:    movdqa %xmm7, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm2, %xmm7
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm7
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    packuswb %xmm7, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [127,127]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm3, %xmm5
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm0, %xmm7
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT:    movdqa %xmm7, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm7
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm2, %xmm7
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm5
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    packuswb %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm3, %xmm7
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    packuswb %xmm7, %xmm0
+; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    packuswb %xmm0, %xmm0
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [127,127]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483775,2147483775]
+; SSE41-NEXT:    movdqa %xmm10, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm9
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm9
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm11
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm11
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm7, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840]
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm2
+; SSE41-NEXT:    movapd %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm1, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT:    movapd %xmm11, %xmm0
+; SSE41-NEXT:    xorpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm3
+; SSE41-NEXT:    xorpd %xmm9, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm1
+; SSE41-NEXT:    movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE41-NEXT:    andpd %xmm0, %xmm1
+; SSE41-NEXT:    andpd %xmm0, %xmm3
+; SSE41-NEXT:    packusdw %xmm1, %xmm3
+; SSE41-NEXT:    andpd %xmm0, %xmm7
+; SSE41-NEXT:    andpd %xmm0, %xmm2
+; SSE41-NEXT:    packusdw %xmm7, %xmm2
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm2
+; SSE41-NEXT:    movq %xmm2, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v8i64_v8i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [127,127,127,127]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v8i64_v8i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127]
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    vmovq %xmm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_ssat_v8i64_v8i8_store:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsqb %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
+  %3 = icmp sgt <8 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
+  %5 = trunc <8 x i64> %4 to <8 x i8>
+  store <8 x i8> %5, <8 x i8> *%p1
+  ret void
+}
+
+define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
+; SSE2-LABEL: trunc_ssat_v16i64_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [127,127]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm6, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [2147483775,2147483775]
+; SSE2-NEXT:    movdqa %xmm11, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm14, %xmm9
+; SSE2-NEXT:    pand %xmm9, %xmm6
+; SSE2-NEXT:    pandn %xmm10, %xmm9
+; SSE2-NEXT:    por %xmm6, %xmm9
+; SSE2-NEXT:    movdqa %xmm7, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm11, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm12
+; SSE2-NEXT:    pand %xmm12, %xmm7
+; SSE2-NEXT:    pandn %xmm10, %xmm12
+; SSE2-NEXT:    por %xmm7, %xmm12
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm11, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm13
+; SSE2-NEXT:    pand %xmm13, %xmm4
+; SSE2-NEXT:    pandn %xmm10, %xmm13
+; SSE2-NEXT:    por %xmm4, %xmm13
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm11, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm14
+; SSE2-NEXT:    pand %xmm14, %xmm5
+; SSE2-NEXT:    pandn %xmm10, %xmm14
+; SSE2-NEXT:    por %xmm5, %xmm14
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm11, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm10, %xmm5
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm11, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pandn %xmm10, %xmm6
+; SSE2-NEXT:    por %xmm3, %xmm6
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm11, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm10, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm11, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm10, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm10, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm10, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm6
+; SSE2-NEXT:    pandn %xmm10, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm5
+; SSE2-NEXT:    pandn %xmm10, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    packssdw %xmm2, %xmm3
+; SSE2-NEXT:    packssdw %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm14, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm14
+; SSE2-NEXT:    pandn %xmm10, %xmm2
+; SSE2-NEXT:    por %xmm14, %xmm2
+; SSE2-NEXT:    movdqa %xmm13, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm13
+; SSE2-NEXT:    pandn %xmm10, %xmm3
+; SSE2-NEXT:    por %xmm13, %xmm3
+; SSE2-NEXT:    packssdw %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm12, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm12
+; SSE2-NEXT:    pandn %xmm10, %xmm2
+; SSE2-NEXT:    por %xmm12, %xmm2
+; SSE2-NEXT:    pxor %xmm9, %xmm8
+; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm11, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm9
+; SSE2-NEXT:    pandn %xmm10, %xmm1
+; SSE2-NEXT:    por %xmm9, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    packssdw %xmm1, %xmm3
+; SSE2-NEXT:    packsswb %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_ssat_v16i64_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [127,127]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm6, %xmm9
+; SSSE3-NEXT:    pxor %xmm8, %xmm9
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [2147483775,2147483775]
+; SSSE3-NEXT:    movdqa %xmm11, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm14
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm14, %xmm9
+; SSSE3-NEXT:    pand %xmm9, %xmm6
+; SSSE3-NEXT:    pandn %xmm10, %xmm9
+; SSSE3-NEXT:    por %xmm6, %xmm9
+; SSSE3-NEXT:    movdqa %xmm7, %xmm6
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    movdqa %xmm11, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm12
+; SSSE3-NEXT:    pand %xmm12, %xmm7
+; SSSE3-NEXT:    pandn %xmm10, %xmm12
+; SSSE3-NEXT:    por %xmm7, %xmm12
+; SSSE3-NEXT:    movdqa %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    movdqa %xmm11, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm13
+; SSSE3-NEXT:    pand %xmm13, %xmm4
+; SSSE3-NEXT:    pandn %xmm10, %xmm13
+; SSSE3-NEXT:    por %xmm4, %xmm13
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    movdqa %xmm11, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm14
+; SSSE3-NEXT:    pand %xmm14, %xmm5
+; SSSE3-NEXT:    pandn %xmm10, %xmm14
+; SSSE3-NEXT:    por %xmm5, %xmm14
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    movdqa %xmm11, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm10, %xmm5
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm11, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pandn %xmm10, %xmm6
+; SSSE3-NEXT:    por %xmm3, %xmm6
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm11, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn %xmm10, %xmm3
+; SSSE3-NEXT:    por %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    movdqa %xmm11, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm10, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    pandn %xmm10, %xmm1
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSSE3-NEXT:    pandn %xmm10, %xmm0
+; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    packssdw %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm6, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm6
+; SSSE3-NEXT:    pandn %xmm10, %xmm2
+; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm5
+; SSSE3-NEXT:    pandn %xmm10, %xmm3
+; SSSE3-NEXT:    por %xmm5, %xmm3
+; SSSE3-NEXT:    packssdw %xmm2, %xmm3
+; SSSE3-NEXT:    packssdw %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa %xmm14, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm14
+; SSSE3-NEXT:    pandn %xmm10, %xmm2
+; SSSE3-NEXT:    por %xmm14, %xmm2
+; SSSE3-NEXT:    movdqa %xmm13, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm13
+; SSSE3-NEXT:    pandn %xmm10, %xmm3
+; SSSE3-NEXT:    por %xmm13, %xmm3
+; SSSE3-NEXT:    packssdw %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm12, %xmm1
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm12
+; SSSE3-NEXT:    pandn %xmm10, %xmm2
+; SSSE3-NEXT:    por %xmm12, %xmm2
+; SSSE3-NEXT:    pxor %xmm9, %xmm8
+; SSSE3-NEXT:    movdqa %xmm8, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm8
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm9
+; SSSE3-NEXT:    pandn %xmm10, %xmm1
+; SSSE3-NEXT:    por %xmm9, %xmm1
+; SSSE3-NEXT:    packssdw %xmm2, %xmm1
+; SSSE3-NEXT:    packssdw %xmm1, %xmm3
+; SSSE3-NEXT:    packsswb %xmm3, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_ssat_v16i64_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movapd {{.*#+}} xmm11 = [127,127]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm6, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm12 = [2147483775,2147483775]
+; SSE41-NEXT:    movdqa %xmm12, %xmm10
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm10
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm10
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm13
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm13
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm14
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm14
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm15
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm15
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm6
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm11, %xmm7
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm12, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm11
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
+; SSE41-NEXT:    movapd %xmm11, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840]
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm4
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    packssdw %xmm4, %xmm1
+; SSE41-NEXT:    movapd %xmm6, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
+; SSE41-NEXT:    movapd %xmm5, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT:    packssdw %xmm3, %xmm4
+; SSE41-NEXT:    packssdw %xmm4, %xmm1
+; SSE41-NEXT:    movapd %xmm15, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm15, %xmm3
+; SSE41-NEXT:    movapd %xmm14, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm14, %xmm4
+; SSE41-NEXT:    packssdw %xmm3, %xmm4
+; SSE41-NEXT:    movapd %xmm13, %xmm0
+; SSE41-NEXT:    xorpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm13, %xmm3
+; SSE41-NEXT:    xorpd %xmm10, %xmm9
+; SSE41-NEXT:    movapd %xmm9, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm9
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
+; SSE41-NEXT:    packssdw %xmm3, %xmm2
+; SSE41-NEXT:    packssdw %xmm2, %xmm4
+; SSE41-NEXT:    packsswb %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [127,127,127,127]
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [127,127]
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm6, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm6, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm6, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm6, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm0, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm1, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm2, %xmm7
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v16i64_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127]
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm4, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm4, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm4, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm1
+; AVX2-NEXT:    vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm2, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_ssat_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127]
+; AVX512-NEXT:    vpminsq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpminsq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX512-NEXT:    vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <16 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
+  %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
+  %3 = icmp sgt <16 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
+  %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
+  %5 = trunc <16 x i64> %4 to <16 x i8>
+  ret <16 x i8> %5
+}
+
+define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) {
+; SSE-LABEL: trunc_ssat_v8i32_v8i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v8i32_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v8i32_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_ssat_v8i32_v8i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
+; AVX512F-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
+; AVX512BW-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
+; AVX512BW-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+  %3 = icmp sgt <8 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
+  %5 = trunc <8 x i32> %4 to <8 x i8>
+  ret <8 x i8> %5
+}
+
+define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
+; SSE-LABEL: trunc_ssat_v8i32_v8i8_store:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm0, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_ssat_v8i32_v8i8_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
+; AVX512F-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8_store:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovsdb %ymm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
+; AVX512BW-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
+; AVX512BW-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8_store:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovsdb %ymm0, (%rdi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+  %3 = icmp sgt <8 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
+  %5 = trunc <8 x i32> %4 to <8 x i8>
+  store <8 x i8> %5, <8 x i8> *%p1
+  ret void
+}
+
+define <16 x i8> @trunc_ssat_v16i32_v16i8(<16 x i32> %a0) {
+; SSE-LABEL: trunc_ssat_v16i32_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_ssat_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp slt <16 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+  %3 = icmp sgt <16 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
+  %5 = trunc <16 x i32> %4 to <16 x i8>
+  ret <16 x i8> %5
+}
+
+define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) {
+; SSE-LABEL: trunc_ssat_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packsswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_ssat_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_ssat_v16i16_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_ssat_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_ssat_v16i16_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovswb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <16 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %3 = icmp sgt <16 x i16> %2, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %5 = trunc <16 x i16> %4 to <16 x i8>
+  ret <16 x i8> %5
+}
+
+define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16> %a0) {
+; SSE-LABEL: trunc_ssat_v32i16_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packsswb %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm3, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc_ssat_v32i16_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v32i16_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_ssat_v32i16_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpminsw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpminsw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
+; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT:    vpminsw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpminsw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
+; AVX512VL-NEXT:    vpmaxsw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_ssat_v32i16_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_ssat_v32i16_v32i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovswb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp slt <32 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %3 = icmp sgt <32 x i16> %2, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %5 = trunc <32 x i16> %4 to <32 x i8>
+  ret <32 x i8> %5
+}
diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll
index 500d8ba1511d8c96a3e1e25d0fc790c67477135b..d690c0f2ef0994452389b40291345ad87551d7f5 100644
--- a/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -143,53 +143,49 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [2147483647,2147483647]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE41-NEXT:    movdqa %xmm8, %xmm5
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
+; SSE41-NEXT:    movdqa %xmm6, %xmm5
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm4, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    movdqa %xmm8, %xmm2
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
 ; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
 ; SSE41-NEXT:    xorpd %xmm3, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [18446744069414584320,18446744069414584320]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [18446744069414584320,18446744069414584320]
 ; SSE41-NEXT:    movapd %xmm0, %xmm6
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm7, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm2, %xmm0
-; SSE41-NEXT:    movapd %xmm1, %xmm2
-; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT:    xorpd %xmm5, %xmm3
-; SSE41-NEXT:    movapd %xmm3, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm8, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm1, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
+; SSE41-NEXT:    xorpd %xmm5, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -517,7 +513,7 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) {
 ;
 ; SSE41-LABEL: trunc_ssat_v8i64_v8i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm9
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [2147483647,2147483647]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
@@ -526,22 +522,20 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm8
-; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm8
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
 ; SSE41-NEXT:    movdqa %xmm10, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm9, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm9
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm9
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -550,10 +544,9 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -562,61 +555,56 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm2, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320]
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm3
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm6
 ; SSE41-NEXT:    movapd %xmm4, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm2, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
 ; SSE41-NEXT:    movapd %xmm9, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
-; SSE41-NEXT:    movapd %xmm0, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm3
-; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm3
-; SSE41-NEXT:    xorpd %xmm8, %xmm5
-; SSE41-NEXT:    movapd %xmm5, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
+; SSE41-NEXT:    xorpd %xmm8, %xmm5
+; SSE41-NEXT:    movapd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
 ; SSE41-NEXT:    movaps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -947,12 +935,11 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [2147516415,2147516415]
 ; SSE41-NEXT:    movdqa %xmm11, %xmm6
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm8
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm8
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -961,10 +948,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm9
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm9
 ; SSE41-NEXT:    movdqa %xmm10, %xmm0
@@ -973,10 +959,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -985,59 +970,54 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [18446744071562035200,18446744071562035200]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm4
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm6
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    packssdw %xmm4, %xmm1
+; SSE41-NEXT:    packssdw %xmm6, %xmm1
 ; SSE41-NEXT:    movapd %xmm9, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm2
 ; SSE41-NEXT:    xorpd %xmm8, %xmm5
-; SSE41-NEXT:    movapd %xmm5, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
 ; SSE41-NEXT:    packssdw %xmm2, %xmm3
 ; SSE41-NEXT:    packssdw %xmm3, %xmm1
@@ -1451,12 +1431,11 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [2147483775,2147483775]
 ; SSE41-NEXT:    movdqa %xmm11, %xmm6
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm8
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm8
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -1465,10 +1444,9 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm9
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm9
 ; SSE41-NEXT:    movdqa %xmm10, %xmm0
@@ -1477,10 +1455,9 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -1489,59 +1466,54 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840]
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    movapd %xmm3, %xmm4
-; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm6
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    packssdw %xmm4, %xmm1
+; SSE41-NEXT:    packssdw %xmm6, %xmm1
 ; SSE41-NEXT:    movapd %xmm9, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm2
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm2
 ; SSE41-NEXT:    xorpd %xmm8, %xmm5
-; SSE41-NEXT:    movapd %xmm5, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
 ; SSE41-NEXT:    packssdw %xmm2, %xmm3
 ; SSE41-NEXT:    packssdw %xmm3, %xmm1
@@ -1871,12 +1843,11 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [2147483775,2147483775]
 ; SSE41-NEXT:    movdqa %xmm10, %xmm6
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm9, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm9
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm9
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -1885,10 +1856,9 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm11
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm11
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -1897,10 +1867,9 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm7, %xmm6
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm6
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
@@ -1909,10 +1878,9 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm2, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
 ; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
@@ -1922,45 +1890,41 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm1, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm2
 ; SSE41-NEXT:    movapd %xmm6, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm1, %xmm7
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm7
 ; SSE41-NEXT:    movapd %xmm11, %xmm0
 ; SSE41-NEXT:    xorpd %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm1, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm3
 ; SSE41-NEXT:    xorpd %xmm9, %xmm5
-; SSE41-NEXT:    movapd %xmm5, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movapd %xmm5, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm1
 ; SSE41-NEXT:    movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
 ; SSE41-NEXT:    andpd %xmm0, %xmm1
@@ -2514,10 +2478,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm10
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm10, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm10
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm10
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
@@ -2526,22 +2489,20 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm13
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm13
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
 ; SSE41-NEXT:    movdqa %xmm12, %xmm6
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm14, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm14
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm14
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
@@ -2550,10 +2511,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm15
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm15
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -2562,10 +2522,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -2574,10 +2533,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm6
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm6
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
@@ -2586,10 +2544,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm11, %xmm7
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -2598,10 +2555,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm12, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm11
 ; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
 ; SSE41-NEXT:    movapd %xmm11, %xmm0
@@ -2611,35 +2567,32 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    movapd %xmm2, %xmm3
-; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm4
 ; SSE41-NEXT:    movapd %xmm7, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm11, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm2, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT:    packssdw %xmm3, %xmm1
+; SSE41-NEXT:    packssdw %xmm4, %xmm1
 ; SSE41-NEXT:    movapd %xmm6, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm2, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
 ; SSE41-NEXT:    movapd %xmm5, %xmm0
@@ -2648,10 +2601,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm2, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
 ; SSE41-NEXT:    packssdw %xmm3, %xmm4
@@ -2662,10 +2614,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm2, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm15, %xmm3
 ; SSE41-NEXT:    movapd %xmm14, %xmm0
@@ -2674,10 +2625,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm2, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm14, %xmm4
 ; SSE41-NEXT:    packssdw %xmm3, %xmm4
@@ -2687,21 +2637,19 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm8, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm2, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm13, %xmm3
 ; SSE41-NEXT:    xorpd %xmm10, %xmm9
-; SSE41-NEXT:    movapd %xmm9, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movapd %xmm9, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm9
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
-; SSE41-NEXT:    pand %xmm5, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
 ; SSE41-NEXT:    packssdw %xmm3, %xmm2
 ; SSE41-NEXT:    packssdw %xmm2, %xmm4
@@ -3091,7 +3039,7 @@ define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -3100,7 +3048,7 @@ define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -3158,9 +3106,9 @@ define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16> %a0) {
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
 ; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -3173,9 +3121,9 @@ define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16> %a0) {
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
 ; AVX512VL-NEXT:    vpmaxsw %ymm2, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-trunc-usat-widen.ll b/test/CodeGen/X86/vector-trunc-usat-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6922fde61b66c0e1bae8d2b2f01c2e11a7baee01
--- /dev/null
+++ b/test/CodeGen/X86/vector-trunc-usat-widen.ll
@@ -0,0 +1,2482 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+
+;
+; Unsigned saturation truncation to vXi32
+;
+
+define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
+; SSE2-LABEL: trunc_usat_v4i64_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn {{.*}}(%rip), %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v4i64_v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm3
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    pandn {{.*}}(%rip), %xmm5
+; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v4i64_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm3
+; SSE41-NEXT:    por %xmm5, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4294967295,4294967295]
+; SSE41-NEXT:    movapd {{.*#+}} xmm5 = [4294967295,429496729]
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v4i64_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-SLOW-NEXT:    vpxor %ymm1, %ymm0, %ymm1
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT:    vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
+; AVX2-SLOW-NEXT:    vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_usat_v4i64_v4i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-FAST-NEXT:    vpxor %ymm1, %ymm0, %ymm1
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
+; AVX2-FAST-NEXT:    vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_usat_v4i64_v4i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX512F-NEXT:    vpcmpltuq %zmm1, %zmm0, %k1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vpmovqd %zmm1, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_usat_v4i64_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
+; AVX512VL-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_usat_v4i64_v4i32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX512BW-NEXT:    vpcmpltuq %zmm1, %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i32:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
+; AVX512BWVL-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
+; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp ult <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 429496729>
+  %3 = trunc <4 x i64> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_usat_v8i64_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm3, %xmm7
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NEXT:    movdqa %xmm9, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm9, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v8i64_v8i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm7
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm3
+; SSSE3-NEXT:    movdqa %xmm9, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm0, %xmm5
+; SSSE3-NEXT:    movdqa %xmm9, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm2, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v8i64_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [4294967295,4294967295]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT:    movdqa %xmm9, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm6, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm6, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm6, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm8, %xmm7
+; SSE41-NEXT:    movdqa %xmm9, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm7, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm6
+; SSE41-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,2]
+; SSE41-NEXT:    movaps %xmm6, %xmm0
+; SSE41-NEXT:    movaps %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v8i64_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc_usat_v8i64_v8i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-SLOW-NEXT:    vpxor %ymm3, %ymm1, %ymm4
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm4, %ymm5, %ymm4
+; AVX2-SLOW-NEXT:    vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT:    vpxor %ymm3, %ymm0, %ymm3
+; AVX2-SLOW-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc_usat_v8i64_v8i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-FAST-NEXT:    vpxor %ymm3, %ymm1, %ymm4
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm4, %ymm5, %ymm4
+; AVX2-FAST-NEXT:    vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vpxor %ymm3, %ymm0, %ymm3
+; AVX2-FAST-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-FAST-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc_usat_v8i64_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovusqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %1 = icmp ult <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %3 = trunc <8 x i64> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+;
+; Unsigned saturation truncation to vXi16
+;
+
+define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_usat_v8i64_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT:    movdqa %xmm9, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm6, %xmm3
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm1, %xmm6
+; SSE2-NEXT:    movdqa %xmm9, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v8i64_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm6, %xmm3
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm1, %xmm6
+; SSSE3-NEXT:    movdqa %xmm9, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v8i64_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movapd {{.*#+}} xmm5 = [65535,65535]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT:    movdqa %xmm9, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    movapd %xmm5, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm5, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm1
+; SSE41-NEXT:    packusdw %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm5, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm6
+; SSE41-NEXT:    movdqa %xmm9, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT:    packusdw %xmm4, %xmm5
+; SSE41-NEXT:    packusdw %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v8i64_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v8i64_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_usat_v8i64_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovusqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp ult <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %3 = trunc <8 x i64> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
+; SSE2-LABEL: trunc_usat_v8i32_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pslld $16, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v8i32_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pandn %xmm2, %xmm6
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm1, %xmm5
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v8i32_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE41-NEXT:    pminud %xmm2, %xmm1
+; SSE41-NEXT:    pminud %xmm2, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; AVX1-NEXT:    vpminud %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_usat_v8i32_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512F-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_usat_v8i32_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovusdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_usat_v8i32_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512BW-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i16:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovusdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp ult <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) {
+; SSE2-LABEL: trunc_usat_v16i32_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    pxor %xmm6, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm7, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm4
+; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm7, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm6, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm7, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm7, %xmm5
+; SSE2-NEXT:    pslld $16, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm5, %xmm0
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v16i32_v16i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm1, %xmm8
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm7
+; SSSE3-NEXT:    pxor %xmm6, %xmm7
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm7, %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm6, %xmm4
+; SSSE3-NEXT:    movdqa %xmm5, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm7, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm6, %xmm3
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm7, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm7, %xmm5
+; SSSE3-NEXT:    pslld $16, %xmm5
+; SSSE3-NEXT:    psrad $16, %xmm5
+; SSSE3-NEXT:    pslld $16, %xmm0
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    packssdw %xmm5, %xmm0
+; SSSE3-NEXT:    pslld $16, %xmm2
+; SSSE3-NEXT:    psrad $16, %xmm2
+; SSSE3-NEXT:    pslld $16, %xmm1
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    packssdw %xmm2, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v16i32_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSE41-NEXT:    pminud %xmm4, %xmm3
+; SSE41-NEXT:    pminud %xmm4, %xmm2
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    pminud %xmm4, %xmm1
+; SSE41-NEXT:    pminud %xmm4, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v16i32_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v16i32_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpminud %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_usat_v16i32_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovusdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %1 = icmp ult <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %3 = trunc <16 x i32> %2 to <16 x i16>
+  ret <16 x i16> %3
+}
+
+;
+; Unsigned saturation truncation to v16i8
+;
+
+define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) {
+; SSE2-LABEL: trunc_usat_v8i64_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    pxor %xmm6, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    movdqa %xmm9, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm4
+; SSE2-NEXT:    movdqa %xmm9, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    movdqa %xmm9, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v8i64_v8i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    pxor %xmm6, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm4
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm6, %xmm4
+; SSSE3-NEXT:    movdqa %xmm9, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm6, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    movdqa %xmm9, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    pand %xmm8, %xmm2
+; SSSE3-NEXT:    pand %xmm8, %xmm1
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm8, %xmm4
+; SSSE3-NEXT:    pand %xmm8, %xmm0
+; SSSE3-NEXT:    packuswb %xmm4, %xmm0
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    packuswb %xmm0, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v8i64_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT:    movdqa %xmm9, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm7
+; SSE41-NEXT:    movdqa %xmm9, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm7, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    andpd %xmm8, %xmm2
+; SSE41-NEXT:    andpd %xmm8, %xmm1
+; SSE41-NEXT:    packusdw %xmm2, %xmm1
+; SSE41-NEXT:    andpd %xmm8, %xmm5
+; SSE41-NEXT:    andpd %xmm8, %xmm4
+; SSE41-NEXT:    packusdw %xmm5, %xmm4
+; SSE41-NEXT:    packusdw %xmm1, %xmm4
+; SSE41-NEXT:    packuswb %xmm4, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v8i64_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v8i64_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm4
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_usat_v8i64_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %3 = trunc <8 x i64> %2 to <8 x i8>
+  ret <8 x i8> %3
+}
+
+define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
+; SSE2-LABEL: trunc_usat_v8i64_v8i8_store:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    movdqa %xmm9, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm6, %xmm0
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm9, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm10, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    movdqa %xmm9, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    packuswb %xmm0, %xmm5
+; SSE2-NEXT:    packuswb %xmm1, %xmm5
+; SSE2-NEXT:    packuswb %xmm5, %xmm5
+; SSE2-NEXT:    movq %xmm5, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v8i64_v8i8_store:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT:    movdqa %xmm9, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm0, %xmm5
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm6, %xmm0
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm6, %xmm1
+; SSSE3-NEXT:    movdqa %xmm9, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm10, %xmm7
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm7, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    movdqa %xmm9, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    pand %xmm8, %xmm2
+; SSSE3-NEXT:    pand %xmm8, %xmm1
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm8, %xmm0
+; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    packuswb %xmm0, %xmm5
+; SSSE3-NEXT:    packuswb %xmm1, %xmm5
+; SSSE3-NEXT:    packuswb %xmm5, %xmm5
+; SSSE3-NEXT:    movq %xmm5, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v8i64_v8i8_store:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movapd {{.*#+}} xmm8 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT:    movdqa %xmm9, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm4
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa %xmm9, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm7
+; SSE41-NEXT:    movdqa %xmm9, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm7, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    andpd %xmm8, %xmm2
+; SSE41-NEXT:    andpd %xmm8, %xmm1
+; SSE41-NEXT:    packusdw %xmm2, %xmm1
+; SSE41-NEXT:    andpd %xmm8, %xmm4
+; SSE41-NEXT:    andpd %xmm8, %xmm6
+; SSE41-NEXT:    packusdw %xmm4, %xmm6
+; SSE41-NEXT:    packusdw %xmm1, %xmm6
+; SSE41-NEXT:    packuswb %xmm6, %xmm6
+; SSE41-NEXT:    movq %xmm6, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v8i64_v8i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v8i64_v8i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm4
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    vmovq %xmm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_usat_v8i64_v8i8_store:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovusqb %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %3 = trunc <8 x i64> %2 to <8 x i8>
+  store <8 x i8> %3, <8 x i8> *%p1
+  ret void
+}
+
+define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) {
+; SSE2-LABEL: trunc_usat_v16i64_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm1, %xmm11
+; SSE2-NEXT:    pxor %xmm9, %xmm11
+; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT:    movdqa %xmm10, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT:    por %xmm11, %xmm12
+; SSE2-NEXT:    pand %xmm12, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm12
+; SSE2-NEXT:    por %xmm1, %xmm12
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm13, %xmm14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm14, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm12, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm13, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm9, %xmm3
+; SSE2-NEXT:    movdqa %xmm10, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm12, %xmm13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3]
+; SSE2-NEXT:    por %xmm13, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    packuswb %xmm1, %xmm3
+; SSE2-NEXT:    packuswb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm10, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm1
+; SSE2-NEXT:    movdqa %xmm10, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm7, %xmm2
+; SSE2-NEXT:    pxor %xmm9, %xmm2
+; SSE2-NEXT:    movdqa %xmm10, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm7
+; SSE2-NEXT:    pandn %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm6, %xmm9
+; SSE2-NEXT:    movdqa %xmm10, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm6
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v16i64_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm11
+; SSSE3-NEXT:    pxor %xmm9, %xmm11
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT:    movdqa %xmm10, %xmm12
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm12
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSSE3-NEXT:    por %xmm11, %xmm12
+; SSSE3-NEXT:    pand %xmm12, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm12
+; SSSE3-NEXT:    por %xmm1, %xmm12
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
+; SSSE3-NEXT:    movdqa %xmm10, %xmm11
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm13, %xmm14
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    por %xmm14, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    packuswb %xmm12, %xmm0
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
+; SSSE3-NEXT:    movdqa %xmm10, %xmm11
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm12, %xmm13
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    por %xmm13, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm9, %xmm3
+; SSSE3-NEXT:    movdqa %xmm10, %xmm11
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm11
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm12, %xmm13
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3]
+; SSSE3-NEXT:    por %xmm13, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    packuswb %xmm1, %xmm3
+; SSSE3-NEXT:    packuswb %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
+; SSSE3-NEXT:    movdqa %xmm10, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm5
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm9, %xmm1
+; SSSE3-NEXT:    movdqa %xmm10, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm11, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm1
+; SSSE3-NEXT:    pand %xmm1, %xmm4
+; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm7, %xmm2
+; SSSE3-NEXT:    pxor %xmm9, %xmm2
+; SSSE3-NEXT:    movdqa %xmm10, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm7
+; SSSE3-NEXT:    pandn %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm7, %xmm3
+; SSSE3-NEXT:    pxor %xmm6, %xmm9
+; SSSE3-NEXT:    movdqa %xmm10, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm9
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm6
+; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    por %xmm6, %xmm2
+; SSSE3-NEXT:    packuswb %xmm3, %xmm2
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v16i64_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    movapd {{.*#+}} xmm9 = [255,255]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm11 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT:    movdqa %xmm11, %xmm12
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm12
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm12, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm12
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm12
+; SSE41-NEXT:    movdqa %xmm8, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm13
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm13
+; SSE41-NEXT:    packusdw %xmm12, %xmm13
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    packusdw %xmm1, %xmm3
+; SSE41-NEXT:    packusdw %xmm3, %xmm13
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT:    packusdw %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm7, %xmm0
+; SSE41-NEXT:    pxor %xmm10, %xmm0
+; SSE41-NEXT:    movdqa %xmm11, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT:    pxor %xmm6, %xmm10
+; SSE41-NEXT:    movdqa %xmm11, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm11, %xmm10
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm9
+; SSE41-NEXT:    packusdw %xmm1, %xmm9
+; SSE41-NEXT:    packusdw %xmm9, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm13
+; SSE41-NEXT:    movdqa %xmm13, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v16i64_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm8 = [255,255,255,255]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm1, %ymm8, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm2, %ymm8, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm3, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm3, %ymm8, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v16i64_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm5, %ymm1, %ymm6
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
+; AVX2-NEXT:    vpcmpgtq %ymm6, %ymm7, %ymm6
+; AVX2-NEXT:    vblendvpd %ymm6, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm5, %ymm0, %ymm6
+; AVX2-NEXT:    vpcmpgtq %ymm6, %ymm7, %ymm6
+; AVX2-NEXT:    vblendvpd %ymm6, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm5, %ymm3, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm7, %ymm1
+; AVX2-NEXT:    vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm5, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm7, %ymm3
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_usat_v16i64_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
+; AVX512-NEXT:    vpminuq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpminuq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp ult <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %3 = trunc <16 x i64> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) {
+; SSE2-LABEL: trunc_usat_v8i32_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm1, %xmm6
+; SSE2-NEXT:    pand %xmm3, %xmm6
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    packuswb %xmm6, %xmm2
+; SSE2-NEXT:    packuswb %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v8i32_v8i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pandn %xmm2, %xmm6
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm1, %xmm5
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v8i32_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255]
+; SSE41-NEXT:    pminud %xmm2, %xmm0
+; SSE41-NEXT:    pminud %xmm2, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v8i32_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v8i32_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_usat_v8i32_v8i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_usat_v8i32_v8i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_usat_v8i32_v8i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %3 = trunc <8 x i32> %2 to <8 x i8>
+  ret <8 x i8> %3
+}
+
+define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
+; SSE2-LABEL: trunc_usat_v8i32_v8i8_store:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm5
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm6
+; SSE2-NEXT:    packuswb %xmm5, %xmm6
+; SSE2-NEXT:    packuswb %xmm6, %xmm6
+; SSE2-NEXT:    movq %xmm6, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v8i32_v8i8_store:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pandn %xmm2, %xmm6
+; SSSE3-NEXT:    por %xmm0, %xmm6
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm2, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm0, %xmm5
+; SSSE3-NEXT:    pshufb %xmm0, %xmm6
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSSE3-NEXT:    movq %xmm6, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v8i32_v8i8_store:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255]
+; SSE41-NEXT:    pminud %xmm2, %xmm0
+; SSE41-NEXT:    pminud %xmm2, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v8i32_v8i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v8i32_v8i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq %xmm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_usat_v8i32_v8i8_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_usat_v8i32_v8i8_store:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovusdb %ymm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_usat_v8i32_v8i8_store:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8_store:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovusdb %ymm0, (%rdi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %3 = trunc <8 x i32> %2 to <8 x i8>
+  store <8 x i8> %3, <8 x i8> *%p1
+  ret void
+}
+
+define <16 x i8> @trunc_usat_v16i32_v16i8(<16 x i32> %a0) {
+; SSE2-LABEL: trunc_usat_v16i32_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm7
+; SSE2-NEXT:    pxor %xmm6, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm7, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    packuswb %xmm4, %xmm5
+; SSE2-NEXT:    packuswb %xmm5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v16i32_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm7
+; SSSE3-NEXT:    pxor %xmm6, %xmm7
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pandn %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm6, %xmm1
+; SSSE3-NEXT:    movdqa %xmm5, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pandn %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm7, %xmm0
+; SSSE3-NEXT:    packuswb %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm6, %xmm1
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pandn %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm2, %xmm5
+; SSSE3-NEXT:    packuswb %xmm4, %xmm5
+; SSSE3-NEXT:    packuswb %xmm5, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v16i32_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255]
+; SSE41-NEXT:    pminud %xmm4, %xmm1
+; SSE41-NEXT:    pminud %xmm4, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    pminud %xmm4, %xmm3
+; SSE41-NEXT:    pminud %xmm4, %xmm2
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v16i32_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v16i32_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpminud %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc_usat_v16i32_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovusdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %1 = icmp ult <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %3 = trunc <16 x i32> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) {
+; SSE2-LABEL: trunc_usat_v16i16_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023]
+; SSE2-NEXT:    pminsw %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v16i16_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023]
+; SSSE3-NEXT:    pminsw %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    pminsw %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v16i16_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pminuw %xmm2, %xmm1
+; SSE41-NEXT:    pminuw %xmm2, %xmm0
+; SSE41-NEXT:    packuswb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v16i16_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpminuw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v16i16_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_usat_v16i16_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_usat_v16i16_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_usat_v16i16_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_usat_v16i16_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovuswb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp ult <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) {
+; SSE2-LABEL: trunc_usat_v32i16_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023]
+; SSE2-NEXT:    pminsw %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pminsw %xmm5, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pminsw %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pminsw %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_usat_v32i16_v32i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023]
+; SSSE3-NEXT:    pminsw %xmm5, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pminsw %xmm5, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    packuswb %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pminsw %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    pminsw %xmm5, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_usat_v32i16_v32i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pminuw %xmm4, %xmm3
+; SSE41-NEXT:    pminuw %xmm4, %xmm2
+; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    pminuw %xmm4, %xmm1
+; SSE41-NEXT:    pminuw %xmm4, %xmm0
+; SSE41-NEXT:    packuswb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc_usat_v32i16_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpminuw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpminuw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpminuw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpminuw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v32i16_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpminuw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpminuw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc_usat_v32i16_v32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpminuw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpminuw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_usat_v32i16_v32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpminuw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpminuw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_usat_v32i16_v32i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc_usat_v32i16_v32i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovuswb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    retq
+  %1 = icmp ult <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = trunc <32 x i16> %2 to <32 x i8>
+  ret <32 x i8> %3
+}
diff --git a/test/CodeGen/X86/vector-trunc-usat.ll b/test/CodeGen/X86/vector-trunc-usat.ll
index 0c3766ac9783849c317e56ac7a608a91831a26b9..c5a0bab332f6f8a6e1d3096d4806f980c27951a6 100644
--- a/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/test/CodeGen/X86/vector-trunc-usat.ll
@@ -90,19 +90,17 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm3
+; SSE41-NEXT:    por %xmm5, %xmm3
 ; SSE41-NEXT:    pxor %xmm1, %xmm0
 ; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm6, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4294967295,4294967295]
 ; SSE41-NEXT:    movapd {{.*#+}} xmm5 = [4294967295,429496729]
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
@@ -337,24 +335,22 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
 ; SSE41-NEXT:    movdqa %xmm9, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm6, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm5
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm7, %xmm0
 ; SSE41-NEXT:    movdqa %xmm9, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    movapd %xmm6, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
@@ -364,21 +360,19 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm6, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    pxor %xmm8, %xmm7
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm9, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm7, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE41-NEXT:    pand %xmm1, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm6
 ; SSE41-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,2]
 ; SSE41-NEXT:    movaps %xmm6, %xmm0
@@ -609,24 +603,22 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
 ; SSE41-NEXT:    movdqa %xmm9, %xmm7
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    movapd %xmm5, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa %xmm9, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm5, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm1
 ; SSE41-NEXT:    packusdw %xmm4, %xmm1
@@ -634,23 +626,21 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) {
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa %xmm9, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm5, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
 ; SSE41-NEXT:    pxor %xmm2, %xmm6
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm9, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT:    packusdw %xmm4, %xmm5
 ; SSE41-NEXT:    packusdw %xmm5, %xmm1
@@ -1094,24 +1084,22 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
 ; SSE41-NEXT:    movdqa %xmm9, %xmm7
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    movapd %xmm5, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa %xmm9, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm5, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm1
 ; SSE41-NEXT:    packusdw %xmm4, %xmm1
@@ -1119,23 +1107,21 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa %xmm9, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm5, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
 ; SSE41-NEXT:    pxor %xmm2, %xmm6
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm9, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT:    pand %xmm7, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT:    packusdw %xmm4, %xmm5
 ; SSE41-NEXT:    packusdw %xmm5, %xmm1
@@ -1343,47 +1329,43 @@ define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
 ; SSE41-NEXT:    movdqa %xmm9, %xmm6
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    movapd %xmm8, %xmm6
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm6
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm7, %xmm0
 ; SSE41-NEXT:    movdqa %xmm9, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    movapd %xmm8, %xmm4
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm7, %xmm0
 ; SSE41-NEXT:    movdqa %xmm9, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm10, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm8, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm7
-; SSE41-NEXT:    movdqa %xmm9, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm9, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm7, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm9, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE41-NEXT:    pand %xmm2, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm8, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    andpd %xmm8, %xmm2
@@ -1715,10 +1697,9 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm12
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm12, %xmm0
 ; SSE41-NEXT:    movapd %xmm9, %xmm12
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm12
 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
@@ -1727,10 +1708,9 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm13, %xmm14
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm14, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm13, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm9, %xmm13
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm13
 ; SSE41-NEXT:    packusdw %xmm12, %xmm13
@@ -1740,36 +1720,33 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm12
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm12, %xmm0
-; SSE41-NEXT:    movapd %xmm9, %xmm12
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm12
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
 ; SSE41-NEXT:    movdqa %xmm11, %xmm3
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm8, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
-; SSE41-NEXT:    movapd %xmm9, %xmm1
-; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    packusdw %xmm12, %xmm1
-; SSE41-NEXT:    packusdw %xmm1, %xmm13
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm9, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    packusdw %xmm1, %xmm3
+; SSE41-NEXT:    packusdw %xmm3, %xmm13
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
 ; SSE41-NEXT:    movdqa %xmm11, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm9, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
@@ -1778,10 +1755,9 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm9, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    packusdw %xmm1, %xmm2
@@ -1791,21 +1767,19 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm9, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm7, %xmm1
 ; SSE41-NEXT:    pxor %xmm6, %xmm10
-; SSE41-NEXT:    movdqa %xmm11, %xmm0
-; SSE41-NEXT:    pcmpgtd %xmm10, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa %xmm11, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE41-NEXT:    pcmpeqd %xmm11, %xmm10
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE41-NEXT:    pand %xmm3, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm9
 ; SSE41-NEXT:    packusdw %xmm1, %xmm9
 ; SSE41-NEXT:    packusdw %xmm9, %xmm2
@@ -2310,7 +2284,7 @@ define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) {
 ; AVX512F-LABEL: trunc_usat_v16i16_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2318,7 +2292,7 @@ define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) {
 ; AVX512VL-LABEL: trunc_usat_v16i16_v16i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -2425,9 +2399,9 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) {
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpminuw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpminuw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -2437,9 +2411,9 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) {
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpminuw %ymm2, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpminuw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-trunc-widen.ll b/test/CodeGen/X86/vector-trunc-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..497393786aa99085e5f84cea1183b1d5eddf2a3b
--- /dev/null
+++ b/test/CodeGen/X86/vector-trunc-widen.ll
@@ -0,0 +1,2012 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+
+define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i32:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT:    movaps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i64_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc8i64_8i32:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i32:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc8i64_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <8 x i64> %a to <8 x i32>
+  ret <8 x i32> %0
+}
+
+define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i32_ashr:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT:    movaps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i64_8i32_ashr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc8i64_8i32_ashr:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %1 = trunc <8 x i64> %0 to <8 x i32>
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i32_lshr:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT:    movaps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i64_8i32_lshr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vpsrlq $32, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc8i64_8i32_lshr:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+entry:
+  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %1 = trunc <8 x i64> %0 to <8 x i32>
+  ret <8 x i32> %1
+}
+
+define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
+; SSE2-LABEL: trunc8i64_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc8i64_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc8i64_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    packusdw %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i64_8i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc8i64_8i16:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i16:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: trunc8i64_8i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <8 x i64> %a to <8 x i16>
+  ret <8 x i16> %0
+}
+
+define void @trunc8i64_8i8(<8 x i64> %a) {
+; SSE2-LABEL: trunc8i64_8i8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movq %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc8i64_8i8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    packuswb %xmm3, %xmm2
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    packuswb %xmm0, %xmm0
+; SSSE3-NEXT:    movq %xmm0, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc8i64_8i8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    pand %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    packusdw %xmm2, %xmm0
+; SSE41-NEXT:    packuswb %xmm0, %xmm0
+; SSE41-NEXT:    movq %xmm0, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i64_8i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc8i64_8i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc8i64_8i8:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovqb %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <8 x i64> %a to <8 x i8>
+  store <8 x i8> %0, <8 x i8>* undef, align 4
+  ret void
+}
+
+define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
+; SSE2-LABEL: trunc8i32_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc8i32_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc8i32_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i32_8i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc8i32_8i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc8i32_8i16:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = trunc <8 x i32> %a to <8 x i16>
+  ret <8 x i16> %0
+}
+
+define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
+; SSE-LABEL: trunc8i32_8i16_ashr:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i32_8i16_ashr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc8i32_8i16_ashr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc8i32_8i16_ashr:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpsrad $16, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16_ashr:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpsrad $16, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16_ashr:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpsrad $16, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpsrad $16, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %1 = trunc <8 x i32> %0 to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
+; SSE2-LABEL: trunc8i32_8i16_lshr:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc8i32_8i16_lshr:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255]
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc8i32_8i16_lshr:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i32_8i16_lshr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc8i32_8i16_lshr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc8i32_8i16_lshr:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16_lshr:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16_lshr:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %1 = trunc <8 x i32> %0 to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define void @trunc8i32_8i8(<8 x i32> %a) {
+; SSE2-LABEL: trunc8i32_8i8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movq %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc8i32_8i8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movq %xmm0, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc8i32_8i8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE41-NEXT:    movq %xmm0, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc8i32_8i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc8i32_8i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc8i32_8i8:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc8i32_8i8:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc8i32_8i8:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i8:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = trunc <8 x i32> %a to <8 x i8>
+  store <8 x i8> %0, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @trunc16i32_16i16(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    pslld $16, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm3, %xmm2
+; SSE2-NEXT:    movdqu %xmm2, (%rax)
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc16i32_16i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pslld $16, %xmm1
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    pslld $16, %xmm0
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    packssdw %xmm1, %xmm0
+; SSSE3-NEXT:    pslld $16, %xmm3
+; SSSE3-NEXT:    psrad $16, %xmm3
+; SSSE3-NEXT:    pslld $16, %xmm2
+; SSSE3-NEXT:    psrad $16, %xmm2
+; SSSE3-NEXT:    packssdw %xmm3, %xmm2
+; SSSE3-NEXT:    movdqu %xmm2, (%rax)
+; SSSE3-NEXT:    movdqu %xmm0, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc16i32_16i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    movdqu %xmm2, (%rax)
+; SSE41-NEXT:    movdqu %xmm0, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i32_16i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovups %ymm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i32_16i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc16i32_16i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <16 x i32> %a to <16 x i16>
+  store <16 x i16> %0, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
+; SSE-LABEL: trunc16i32_16i16_ashr:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psrad $16, %xmm3
+; SSE-NEXT:    psrad $16, %xmm2
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    psrad $16, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    movdqu %xmm2, (%rax)
+; SSE-NEXT:    movdqu %xmm0, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i32_16i16_ashr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovups %ymm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i32_16i16_ashr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrad $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc16i32_16i16_ashr:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+entry:
+  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %1 = trunc <16 x i32> %0 to <16 x i16>
+  store <16 x i16> %1, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i16_lshr:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    psrld $16, %xmm2
+; SSE2-NEXT:    psrld $16, %xmm3
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    pslld $16, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm3, %xmm2
+; SSE2-NEXT:    movdqu %xmm2, (%rax)
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc16i32_16i16_lshr:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    psrld $16, %xmm2
+; SSSE3-NEXT:    psrld $16, %xmm3
+; SSSE3-NEXT:    psrld $16, %xmm0
+; SSSE3-NEXT:    psrld $16, %xmm1
+; SSSE3-NEXT:    pslld $16, %xmm1
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    pslld $16, %xmm0
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    packssdw %xmm1, %xmm0
+; SSSE3-NEXT:    pslld $16, %xmm3
+; SSSE3-NEXT:    psrad $16, %xmm3
+; SSSE3-NEXT:    pslld $16, %xmm2
+; SSSE3-NEXT:    psrad $16, %xmm2
+; SSSE3-NEXT:    packssdw %xmm3, %xmm2
+; SSSE3-NEXT:    movdqu %xmm2, (%rax)
+; SSSE3-NEXT:    movdqu %xmm0, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc16i32_16i16_lshr:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    psrld $16, %xmm3
+; SSE41-NEXT:    psrld $16, %xmm2
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    movdqu %xmm2, (%rax)
+; SSE41-NEXT:    movdqu %xmm0, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i32_16i16_lshr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovups %ymm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i32_16i16_lshr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc16i32_16i16_lshr:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+entry:
+  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %1 = trunc <16 x i32> %0 to <16 x i16>
+  store <16 x i16> %1, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @trunc16i32_16i8(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc16i32_16i8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    packuswb %xmm3, %xmm2
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqu %xmm0, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc16i32_16i8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    pand %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    movdqu %xmm0, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i32_16i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i32_16i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc16i32_16i8:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <16 x i32> %a to <16 x i8>
+  store <16 x i8> %0, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
+; SSE-LABEL: trunc16i32_16i8_ashr:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psrad $24, %xmm1
+; SSE-NEXT:    psrad $24, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    psrad $24, %xmm3
+; SSE-NEXT:    psrad $24, %xmm2
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    packsswb %xmm2, %xmm0
+; SSE-NEXT:    movdqu %xmm0, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i32_16i8_ashr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i32_16i8_ashr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrad $24, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc16i32_16i8_ashr:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+entry:
+  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  %1 = trunc <16 x i32> %0 to <16 x i8>
+  store <16 x i8> %1, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i8_lshr:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    psrld $24, %xmm1
+; SSE2-NEXT:    psrld $24, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    psrld $24, %xmm3
+; SSE2-NEXT:    psrld $24, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc16i32_16i8_lshr:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    psrld $24, %xmm1
+; SSSE3-NEXT:    psrld $24, %xmm0
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
+; SSSE3-NEXT:    psrld $24, %xmm3
+; SSSE3-NEXT:    psrld $24, %xmm2
+; SSSE3-NEXT:    packuswb %xmm3, %xmm2
+; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqu %xmm0, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc16i32_16i8_lshr:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    psrld $24, %xmm1
+; SSE41-NEXT:    psrld $24, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    psrld $24, %xmm3
+; SSE41-NEXT:    psrld $24, %xmm2
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    movdqu %xmm0, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i32_16i8_lshr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i32_16i8_lshr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: trunc16i32_16i8_lshr:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+entry:
+  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  %1 = trunc <16 x i32> %0 to <16 x i8>
+  store <16 x i8> %1, <16 x i8>* undef, align 4
+  ret void
+}
+
+;PR25684
+define void @trunc16i16_16i8(<16 x i16> %a) {
+; SSE2-LABEL: trunc16i16_16i8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc16i16_16i8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movdqu %xmm0, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc16i16_16i8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    movdqu %xmm0, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i16_16i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i16_16i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc16i16_16i8:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = trunc <16 x i16> %a to <16 x i8>
+  store <16 x i8> %0, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
+; SSE-LABEL: trunc16i16_16i8_ashr:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psraw $8, %xmm1
+; SSE-NEXT:    psraw $8, %xmm0
+; SSE-NEXT:    packsswb %xmm1, %xmm0
+; SSE-NEXT:    movdqu %xmm0, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i16_16i8_ashr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i16_16i8_ashr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsraw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc16i16_16i8_ashr:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpsraw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8_ashr:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpsraw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8_ashr:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpsraw $8, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %1 = trunc <16 x i16> %0 to <16 x i8>
+  store <16 x i8> %1, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
+; SSE-LABEL: trunc16i16_16i8_lshr:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    movdqu %xmm0, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc16i16_16i8_lshr:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc16i16_16i8_lshr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc16i16_16i8_lshr:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8_lshr:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8_lshr:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %1 = trunc <16 x i16> %0 to <16 x i8>
+  store <16 x i8> %1, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @trunc32i16_32i8(<32 x i16> %a) {
+; SSE2-LABEL: trunc32i16_32i8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    movdqu %xmm2, (%rax)
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc32i16_32i8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm4, %xmm1
+; SSSE3-NEXT:    pshufb %xmm4, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    pshufb %xmm4, %xmm3
+; SSSE3-NEXT:    pshufb %xmm4, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT:    movdqu %xmm2, (%rax)
+; SSSE3-NEXT:    movdqu %xmm0, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc32i16_32i8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm4, %xmm1
+; SSE41-NEXT:    pshufb %xmm4, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    pshufb %xmm4, %xmm3
+; SSE41-NEXT:    pshufb %xmm4, %xmm2
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT:    movdqu %xmm2, (%rax)
+; SSE41-NEXT:    movdqu %xmm0, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc32i16_32i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovups %ymm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc32i16_32i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: trunc32i16_32i8:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc32i16_32i8:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc32i16_32i8:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc32i16_32i8:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = trunc <32 x i16> %a to <32 x i8>
+  store <32 x i8> %0, <32 x i8>* undef, align 4
+  ret void
+}
+
+define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
+; SSE-LABEL: trunc2x4i64_8i32:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT:    movaps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: trunc2x4i64_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc2x4i64_8i32:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc2x4i64_8i32:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc2x4i64_8i32:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i32:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc2x4i64_8i32:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = trunc <4 x i64> %a to <4 x i32>
+  %1 = trunc <4 x i64> %b to <4 x i32>
+  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %2
+}
+
+define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: trunc2x4i64_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc2x4i64_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc2x4i64_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; SSE41-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: trunc2x4i64_8i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: trunc2x4i64_8i16:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: trunc2x4i64_8i16:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc2x4i64_8i16:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT:    vpmovqw %ymm1, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i16:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: trunc2x4i64_8i16:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovqw %ymm1, %xmm1
+; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+entry:
+  %0 = trunc <4 x i64> %a to <4 x i16>
+  %1 = trunc <4 x i64> %b to <4 x i16>
+  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: trunc2x2i64_4i32:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc2x2i64_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: trunc2x2i64_4i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <2 x i64> %a to <2 x i32>
+  %1 = trunc <2 x i64> %b to <2 x i32>
+  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+define i64 @trunc2i64_i64(<2 x i64> %inval) {
+; SSE-LABEL: trunc2i64_i64:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc2i64_i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: trunc2i64_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <2 x i64> %inval to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to i64
+  ret i64 %1
+}
+
+define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: trunc2x4i32_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc2x4i32_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc2x4i32_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc2x4i32_8i16:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: trunc2x4i32_8i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <4 x i32> %a to <4 x i16>
+  %1 = trunc <4 x i32> %b to <4 x i16>
+  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %2
+}
+
+; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
+define i64 @trunc4i32_i64(<4 x i32> %inval) {
+; SSE2-LABEL: trunc4i32_i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc4i32_i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    movq %xmm0, %rax
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc4i32_i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc4i32_i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: trunc4i32_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <4 x i32> %inval to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to i64
+  ret i64 %1
+}
+
+define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: trunc2x8i16_16i8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc2x8i16_16i8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc2x8i16_16i8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc2x8i16_16i8:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: trunc2x8i16_16i8:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <8 x i16> %a to <8 x i8>
+  %1 = trunc <8 x i16> %b to <8 x i8>
+  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %2
+}
+
+; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
+define i64 @trunc8i16_i64(<8 x i16> %inval) {
+; SSE2-LABEL: trunc8i16_i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc8i16_i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    movq %xmm0, %rax
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc8i16_i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc8i16_i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: trunc8i16_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
+entry:
+  %0 = trunc <8 x i16> %inval to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to i64
+  ret i64 %1
+}
+
+define <16 x i8> @trunc16i64_16i8_const() {
+; SSE-LABEL: trunc16i64_16i8_const:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc16i64_16i8_const:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: trunc16i64_16i8_const:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+
+entry:
+  %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
+  %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @PR32160(<8 x i32> %x) {
+; SSE-LABEL: PR32160:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: PR32160:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: PR32160:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: PR32160:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: PR32160:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: PR32160:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: PR32160:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: PR32160:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %shuf = trunc <8 x i32> %x to <8 x i16>
+  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %trunc
+}
+
+define void @PR34773(i16* %a0, i8* %a1) {
+; SSE-LABEL: PR34773:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqu (%rdi), %xmm0
+; SSE-NEXT:    movdqu 16(%rdi), %xmm1
+; SSE-NEXT:    movdqu 32(%rdi), %xmm2
+; SSE-NEXT:    movdqu 48(%rdi), %xmm3
+; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    psrlw $8, %xmm3
+; SSE-NEXT:    psrlw $8, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    movdqu %xmm0, (%rsi)
+; SSE-NEXT:    movdqu %xmm2, 16(%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: PR34773:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
+; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
+; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR34773:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqu %xmm0, (%rsi)
+; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: PR34773:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqu 32(%rdi), %ymm1
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, 16(%rsi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: PR34773:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqu 32(%rdi), %ymm1
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, 16(%rsi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: PR34773:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm1
+; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi)
+; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512BWVL-LABEL: PR34773:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vpsrlw $8, 32(%rdi), %ymm1
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovwb %ymm1, 16(%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+  %1  = getelementptr i16, i16* %a0, i64 16
+  %2  = getelementptr i8, i8* %a1, i64 16
+  %3  = bitcast i16* %a0 to <16 x i16>*
+  %4  = bitcast i16* %1 to <16 x i16>*
+  %5  = bitcast i8* %a1 to <16 x i8>*
+  %6  = bitcast i8* %2 to <16 x i8>*
+  %7  = load <16 x i16>, <16 x i16>* %3, align 2
+  %8  = load <16 x i16>, <16 x i16>* %4, align 2
+  %9  = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %11 = trunc <16 x i16> %9  to <16 x i8>
+  %12 = trunc <16 x i16> %10 to <16 x i8>
+  store <16 x i8> %11, <16 x i8>* %5, align 1
+  store <16 x i8> %12, <16 x i8>* %6, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index 79cbb8cc924fdefba8bc92ea82de8524af58e439..c17a618f28b0fc25d0a2beaaef10d17f77f18aa2 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -926,12 +926,12 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1108,36 +1108,32 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
 ;
 ; AVX1-LABEL: trunc16i16_16i8:
 ; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc16i16_16i8:
 ; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc16i16_16i8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc16i16_16i8:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -1192,7 +1188,7 @@ define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
 ; AVX512F-LABEL: trunc16i16_16i8_ashr:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vpsraw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1200,7 +1196,7 @@ define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
 ; AVX512VL-LABEL: trunc16i16_16i8_ashr:
 ; AVX512VL:       # %bb.0: # %entry
 ; AVX512VL-NEXT:    vpsraw $8, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -1257,7 +1253,7 @@ define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
 ; AVX512F-LABEL: trunc16i16_16i8_lshr:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1265,7 +1261,7 @@ define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
 ; AVX512VL-LABEL: trunc16i16_16i8_lshr:
 ; AVX512VL:       # %bb.0: # %entry
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -1333,15 +1329,13 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ;
 ; AVX1-LABEL: trunc32i16_32i8:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -1349,15 +1343,13 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ;
 ; AVX2-LABEL: trunc32i16_32i8:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
@@ -1365,9 +1357,9 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ;
 ; AVX512F-LABEL: trunc32i16_32i8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
@@ -1376,9 +1368,9 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ;
 ; AVX512VL-LABEL: trunc32i16_32i8:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vmovdqu %ymm0, (%rax)
@@ -1647,29 +1639,11 @@ define i64 @trunc2i64_i64(<2 x i64> %inval) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc2i64_i64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc2i64_i64:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc2i64_i64:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc2i64_i64:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc2i64_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <2 x i64> %inval to <2 x i32>
   %1 = bitcast <2 x i32> %0 to i64
@@ -1754,29 +1728,11 @@ define i64 @trunc4i32_i64(<4 x i32> %inval) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc4i32_i64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc4i32_i64:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc4i32_i64:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc4i32_i64:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc4i32_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <4 x i32> %inval to <4 x i16>
   %1 = bitcast <4 x i16> %0 to i64
@@ -1857,29 +1813,11 @@ define i64 @trunc8i16_i64(<8 x i16> %inval) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc8i16_i64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc8i16_i64:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc8i16_i64:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc8i16_i64:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    vpmovwb %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc8i16_i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <8 x i16> %inval to <8 x i8>
   %1 = bitcast <8 x i8> %0 to i64
@@ -1988,19 +1926,18 @@ define void @PR34773(i16* %a0, i8* %a1) {
 ;
 ; AVX1-LABEL: PR34773:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqu 32(%rdi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
+; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
 ; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR34773:
@@ -2024,10 +1961,10 @@ define void @PR34773(i16* %a0, i8* %a1) {
 ; AVX512F-NEXT:    vmovdqu 32(%rdi), %ymm1
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
-; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm0, 16(%rsi)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -2037,10 +1974,10 @@ define void @PR34773(i16* %a0, i8* %a1) {
 ; AVX512VL-NEXT:    vmovdqu 32(%rdi), %ymm1
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
-; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpmovdb %zmm0, 16(%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 21142ff3970c412e00bb12e26ae5c550d4b0ce2e..8c32bf86700c749064add68c26179a9212480592 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -633,7 +633,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BITALG-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
@@ -876,7 +876,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BITALG-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-zext-widen.ll b/test/CodeGen/X86/vector-zext-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2eafc7cdaf08bde0044b57213dc909c41f119bb3
--- /dev/null
+++ b/test/CodeGen/X86/vector-zext-widen.ll
@@ -0,0 +1,2283 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
+
+define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: zext_16i8_to_8i16:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %C = zext <8 x i8> %B to <8 x i16>
+  ret <8 x i16> %C
+}
+
+; PR17654
+define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
+; SSE2-LABEL: zext_16i8_to_16i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_16i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_16i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_16i8_to_16i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_16i8_to_16i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_16i8_to_16i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = zext <16 x i8> %A to <16 x i16>
+  ret <16 x i16> %B
+}
+
+define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
+; SSE2-LABEL: zext_32i8_to_32i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_32i8_to_32i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_32i8_to_32i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_32i8_to_32i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_32i8_to_32i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: zext_32i8_to_32i16:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: zext_32i8_to_32i16:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    retq
+entry:
+  %B = zext <32 x i8> %A to <32 x i16>
+  ret <32 x i16> %B
+}
+
+define <4 x i32> @zext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: zext_16i8_to_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %C = zext <4 x i8> %B to <4 x i32>
+  ret <4 x i32> %C
+}
+
+define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_16i8_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_16i8_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_16i8_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %C = zext <8 x i8> %B to <8 x i32>
+  ret <8 x i32> %C
+}
+
+define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_16i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_16i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_16i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_16i8_to_16i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_16i8_to_16i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_16i8_to_16i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = zext <16 x i8> %A to <16 x i32>
+  ret <16 x i32> %B
+}
+
+define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: zext_16i8_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %C = zext <2 x i8> %B to <2 x i64>
+  ret <2 x i64> %C
+}
+
+define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_16i8_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_16i8_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_16i8_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %C = zext <4 x i8> %B to <4 x i64>
+  ret <4 x i64> %C
+}
+
+define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_8i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_8i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_8i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlq $48, %xmm0
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_16i8_to_8i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_16i8_to_8i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_16i8_to_8i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %C = zext <8 x i8> %B to <8 x i64>
+  ret <8 x i64> %C
+}
+
+define <4 x i32> @zext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i16_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i16_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i16_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: zext_8i16_to_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %C = zext <4 x i16> %B to <4 x i32>
+  ret <4 x i32> %C
+}
+
+define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i16_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i16_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i16_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i16_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i16_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_8i16_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = zext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i16_to_16i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i16_to_16i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i16_to_16i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_16i16_to_16i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_16i16_to_16i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_16i16_to_16i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = zext <16 x i16> %A to <16 x i32>
+  ret <16 x i32> %B
+}
+
+define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i16_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i16_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i16_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: zext_8i16_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %C = zext <2 x i16> %B to <2 x i64>
+  ret <2 x i64> %C
+}
+
+define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i16_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i16_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i16_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i16_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i16_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_8i16_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %C = zext <4 x i16> %B to <4 x i64>
+  ret <4 x i64> %C
+}
+
+define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i16_to_8i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i16_to_8i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i16_to_8i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i16_to_8i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i16_to_8i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_8i16_to_8i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = zext <8 x i16> %A to <8 x i64>
+  ret <8 x i64> %B
+}
+
+define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_4i32_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_4i32_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_4i32_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: zext_4i32_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %C = zext <2 x i32> %B to <2 x i64>
+  ret <2 x i64> %C
+}
+
+define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_4i32_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_4i32_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-NEXT:    xorps %xmm2, %xmm2
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_4i32_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_4i32_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_4i32_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_4i32_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = zext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i32_to_8i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    xorps %xmm4, %xmm4
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i32_to_8i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps %xmm1, %xmm3
+; SSSE3-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-NEXT:    xorps %xmm4, %xmm4
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSSE3-NEXT:    movaps %xmm3, %xmm2
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i32_to_8i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i32_to_8i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i32_to_8i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_8i32_to_8i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = zext <8 x i32> %A to <8 x i64>
+  ret <8 x i64>%B
+}
+
+define <2 x i64> @load_zext_2i8_to_2i64(<2 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_2i8_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_2i8_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_2i8_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_zext_2i8_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+entry:
+ %X = load <2 x i8>, <2 x i8>* %ptr
+ %Y = zext <2 x i8> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_4i8_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_4i8_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_4i8_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_zext_4i8_to_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT:    retq
+entry:
+ %X = load <4 x i8>, <4 x i8>* %ptr
+ %Y = zext <4 x i8> %X to <4 x i32>
+ ret <4 x i32> %Y
+}
+
+define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_4i8_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_4i8_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_4i8_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_4i8_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_4i8_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_zext_4i8_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+ %X = load <4 x i8>, <4 x i8>* %ptr
+ %Y = zext <4 x i8> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
+
+define <8 x i16> @load_zext_8i8_to_8i16(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_8i8_to_8i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_8i8_to_8i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_8i8_to_8i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_zext_8i8_to_8i16:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX-NEXT:    retq
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = zext <8 x i8> %X to <8 x i16>
+ ret <8 x i16> %Y
+}
+
+define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_8i8_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_8i8_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_8i8_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_8i8_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_8i8_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_zext_8i8_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = zext <8 x i8> %X to <8 x i32>
+ ret <8 x i32> %Y
+}
+
+define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_16i8_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_16i8_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_16i8_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_16i8_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_16i8_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_zext_16i8_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+ %X = load <16 x i8>, <16 x i8>* %ptr
+ %Y = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %Z = zext <8 x i8> %Y to <8 x i32>
+ ret <8 x i32> %Z
+}
+
+define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_8i8_to_8i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_8i8_to_8i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_8i8_to_8i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_8i8_to_8i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_8i8_to_8i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_zext_8i8_to_8i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = zext <8 x i8> %X to <8 x i64>
+ ret <8 x i64> %Y
+}
+
+define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_16i8_to_16i16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_16i8_to_16i16:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_16i8_to_16i16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_16i8_to_16i16:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_16i8_to_16i16:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_zext_16i8_to_16i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512-NEXT:    retq
+entry:
+ %X = load <16 x i8>, <16 x i8>* %ptr
+ %Y = zext <16 x i8> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
+define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) {
+; SSE2-LABEL: load_zext_2i16_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_2i16_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_2i16_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_zext_2i16_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; AVX-NEXT:    retq
+entry:
+ %X = load <2 x i16>, <2 x i16>* %ptr
+ %Y = zext <2 x i16> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <4 x i32> @load_zext_4i16_to_4i32(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_zext_4i16_to_4i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_4i16_to_4i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_4i16_to_4i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_zext_4i16_to_4i32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX-NEXT:    retq
+entry:
+ %X = load <4 x i16>, <4 x i16>* %ptr
+ %Y = zext <4 x i16> %X to <4 x i32>
+ ret <4 x i32> %Y
+}
+
+define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_zext_4i16_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_4i16_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_4i16_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_4i16_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_4i16_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_zext_4i16_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+ %X = load <4 x i16>, <4 x i16>* %ptr
+ %Y = zext <4 x i16> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
+
+define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
+; SSE2-LABEL: load_zext_8i16_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_8i16_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_8i16_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_8i16_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_8i16_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_zext_8i16_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX512-NEXT:    retq
+entry:
+ %X = load <8 x i16>, <8 x i16>* %ptr
+ %Y = zext <8 x i16> %X to <8 x i32>
+ ret <8 x i32> %Y
+}
+
+define <2 x i64> @load_zext_2i32_to_2i64(<2 x i32> *%ptr) {
+; SSE2-LABEL: load_zext_2i32_to_2i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_2i32_to_2i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_2i32_to_2i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_zext_2i32_to_2i64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX-NEXT:    retq
+entry:
+ %X = load <2 x i32>, <2 x i32>* %ptr
+ %Y = zext <2 x i32> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
+; SSE2-LABEL: load_zext_4i32_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps (%rdi), %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_4i32_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps (%rdi), %xmm1
+; SSSE3-NEXT:    xorps %xmm2, %xmm2
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_4i32_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_4i32_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_4i32_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_zext_4i32_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX512-NEXT:    retq
+entry:
+ %X = load <4 x i32>, <4 x i32>* %ptr
+ %Y = zext <4 x i32> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
+
+define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
+; SSE2-LABEL: zext_8i8_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i8_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i8_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i8_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i8_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_8i8_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %t = zext <8 x i8> %z to <8 x i32>
+  ret <8 x i32> %t
+}
+
+define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_8i16_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_8i16_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_8i16_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_8i16_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_8i16_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
+  %Z = bitcast <16 x i16> %B to <8 x i32>
+  ret <8 x i32> %Z
+}
+
+define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_4i32_to_4i64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_4i32_to_4i64:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-NEXT:    xorps %xmm2, %xmm2
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_4i32_to_4i64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_4i32_to_4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_4i32_to_4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_4i32_to_4i64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
+  %Z = bitcast <8 x i32> %B to <4 x i64>
+  ret <4 x i64> %Z
+}
+
+define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
+; SSE2-LABEL: shuf_zext_8i8_to_8i32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_8i8_to_8i32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_8i8_to_8i32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_8i8_to_8i32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_8i8_to_8i32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
+  %Z = bitcast <32 x i8> %B to <8 x i32>
+  ret <8 x i32> %Z
+}
+
+define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    psrlq $48, %xmm0
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32> <i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %Z = bitcast <16 x i8> %B to <2 x i64>
+  ret <2 x i64> %Z
+}
+
+define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_16i8_to_4i64_offset11:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[13],zero,zero,zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_16i8_to_4i64_offset11:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_16i8_to_4i64_offset11:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_16i8_to_4i64_offset11:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %Z = bitcast <32 x i8> %B to <4 x i64>
+  ret <4 x i64> %Z
+}
+
+define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8>
+  %Z = bitcast <8 x i16> %B to <2 x i64>
+  ret <2 x i64> %Z
+}
+
+define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
+  %Z = bitcast <16 x i16> %B to <4 x i64>
+  ret <4 x i64> %Z
+}
+
+define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX2-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
+; AVX512BW-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>
+  %Z = bitcast <8 x i16> %B to <4 x i32>
+  ret <4 x i32> %Z
+}
+
+define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE41-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8>
+  %Z = bitcast <16 x i16> %B to <8 x i32>
+  ret <8 x i32> %Z
+}
+
+define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16>
+  %Z = bitcast <16 x i16> %B to <8 x i32>
+  ret <8 x i32> %Z
+}
+
+define <2 x i64> @shuf_zext_4i32_to_2i64_offset2(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE-LABEL: shuf_zext_4i32_to_2i64_offset2:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
+entry:
+  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 3, i32 4>
+  %Z = bitcast <4 x i32> %B to <2 x i64>
+  ret <2 x i64> %Z
+}
+
+define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_4i32_to_4i64_offset1:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,4294967295,0]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_4i32_to_4i64_offset1:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,4294967295,0]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_4i32_to_4i64_offset1:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_4i32_to_4i64_offset1:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_4i32_to_4i64_offset1:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    retq
+entry:
+  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4>
+  %Z = bitcast <8 x i32> %B to <4 x i64>
+  ret <4 x i64> %Z
+}
+
+define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
+; SSE2-LABEL: zext_32i8_to_32i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, 112(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, 96(%rdi)
+; SSE2-NEXT:    movdqa %xmm6, 80(%rdi)
+; SSE2-NEXT:    movdqa %xmm7, 64(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
+; SSE2-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm8, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_32i8_to_32i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq %rdi, %rax
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm8
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm6
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm6, %xmm7
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, 112(%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, 96(%rdi)
+; SSSE3-NEXT:    movdqa %xmm6, 80(%rdi)
+; SSSE3-NEXT:    movdqa %xmm7, 64(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, 48(%rdi)
+; SSSE3-NEXT:    movdqa %xmm5, 32(%rdi)
+; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm8, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_32i8_to_32i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq %rdi, %rax
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm1, 112(%rdi)
+; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
+; SSE41-NEXT:    movdqa %xmm6, 80(%rdi)
+; SSE41-NEXT:    movdqa %xmm5, 64(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm4, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_32i8_to_32i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-NEXT:    vmovaps %ymm4, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_32i8_to_32i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_32i8_to_32i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %res = zext <32 x i8>%x to <32 x i32>
+  ret <32 x i32> %res
+}
+
+define <2 x i32> @zext_2i8_to_2i32(<2 x i8>* %addr) {
+; SSE2-LABEL: zext_2i8_to_2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddd %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_2i8_to_2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movzwl (%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    paddd %xmm0, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_2i8_to_2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl (%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    paddd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: zext_2i8_to_2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x = load <2 x i8>, <2 x i8>* %addr, align 1
+  %y = zext <2 x i8> %x to <2 x i32>
+  %z = add <2 x i32>%y, %y
+  ret <2 x i32>%z
+}
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index fa8bbaa6554ab86c971149386b1698e45827c7fa..af7e7964173f71440bf8083eb72f4dccbf5a7bb3 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -643,24 +643,24 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
 ;
 ; AVX1-LABEL: test_abs_le_v8i64_fold:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqu 32(%rdi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm6
-; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm6, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
+; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm7
+; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm7, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm5
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vselect-packss.ll b/test/CodeGen/X86/vselect-packss.ll
index 8d517151181eb1eb66a4c0572ae8a574381d121a..34b336c8820f45a9267932a3cd47565d6127f574 100644
--- a/test/CodeGen/X86/vselect-packss.ll
+++ b/test/CodeGen/X86/vselect-packss.ll
@@ -55,7 +55,7 @@ define <16 x i8> @vselect_packss_v16i16(<16 x i16> %a0, <16 x i16> %a1, <16 x i8
 ; AVX512NOBW-LABEL: vselect_packss_v16i16:
 ; AVX512NOBW:       # %bb.0:
 ; AVX512NOBW-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX512NOBW-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512NOBW-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512NOBW-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
 ; AVX512NOBW-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vselect-zero.ll b/test/CodeGen/X86/vselect-zero.ll
index 8eb137a61ff7d67c1ee6fbe02c9a9e0998959048..70998b92bbbd85c689faba6432d297380d3369a2 100644
--- a/test/CodeGen/X86/vselect-zero.ll
+++ b/test/CodeGen/X86/vselect-zero.ll
@@ -41,19 +41,132 @@ define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %x) {
   ret <4 x i32> %r
 }
 
-define float @fsel(float %a, float %b, float %x) {
-; SSE-LABEL: fsel:
+define float @fsel_zero_false_val(float %a, float %b, float %x) {
+; SSE-LABEL: fsel_zero_false_val:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cmpeqss %xmm1, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fsel_zero_false_val:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cond = fcmp oeq float %a, %b
+  %r = select i1 %cond, float %x, float 0.0
+  ret float %r
+}
+
+define float @fsel_zero_true_val(float %a, float %b, float %x) {
+; SSE-LABEL: fsel_zero_true_val:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cmpeqss %xmm1, %xmm0
 ; SSE-NEXT:    andnps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: fsel:
+; AVX-LABEL: fsel_zero_true_val:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandnps %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %cond = fcmp oeq float %a, %b
-  %sel = select i1 %cond, float 0.0, float %x
-  ret float %sel
+  %r = select i1 %cond, float 0.0, float %x
+  ret float %r
+}
+
+define double @fsel_nonzero_false_val(double %x, double %y, double %z) {
+; SSE-LABEL: fsel_nonzero_false_val:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cmpeqsd %xmm1, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    andnpd %xmm1, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fsel_nonzero_false_val:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %cond = fcmp oeq double %x, %y
+  %r = select i1 %cond, double %z, double 42.0
+  ret double %r
+}
+
+define double @fsel_nonzero_true_val(double %x, double %y, double %z) {
+; SSE-LABEL: fsel_nonzero_true_val:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cmpeqsd %xmm1, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    andpd %xmm0, %xmm1
+; SSE-NEXT:    andnpd %xmm2, %xmm0
+; SSE-NEXT:    orpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fsel_nonzero_true_val:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %cond = fcmp oeq double %x, %y
+  %r = select i1 %cond, double 42.0, double %z
+  ret double %r
+}
+
+define double @fsel_nonzero_constants(double %x, double %y) {
+; SSE-LABEL: fsel_nonzero_constants:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cmpeqsd %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    andl $1, %eax
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fsel_nonzero_constants:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %cond = fcmp oeq double %x, %y
+  %r = select i1 %cond, double 12.0, double 42.0
+  ret double %r
+}
+
+define <2 x double> @vsel_nonzero_constants(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: vsel_nonzero_constants:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    cmplepd %xmm0, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    andnpd %xmm0, %xmm2
+; SSE2-NEXT:    andpd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    orpd %xmm2, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vsel_nonzero_constants:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    cmplepd %xmm0, %xmm1
+; SSE42-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE42-NEXT:    movapd %xmm1, %xmm0
+; SSE42-NEXT:    blendvpd %xmm0, {{.*}}(%rip), %xmm2
+; SSE42-NEXT:    movapd %xmm2, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: vsel_nonzero_constants:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcmplepd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vblendvpd %xmm0, {{.*}}(%rip), %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %cond = fcmp oge <2 x double> %x, %y
+  %r = select <2 x i1> %cond, <2 x double> <double 12.0, double -1.0>, <2 x double> <double 42.0, double 0.0>
+  ret <2 x double> %r
 }
+
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 2d08e21d941b694a9f3e12a83e65234bca63a669..5c19a01db81dd63b3560632eb4a04800e4a23cb6 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -559,3 +559,54 @@ define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i
   ret <2 x i64> %z
 }
 
+; This turns into a SHRUNKBLEND with SSE4 or later, and via
+; late shuffle magic, both sides of the blend are the same
+; value. If that is not simplified before isel, it can fail
+; to match (crash).
+
+define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
+; SSE2-LABEL: simplify_select:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
+; SSE2-NEXT:    psllq $63, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    movq %rdi, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: simplify_select:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
+; SSE41-NEXT:    movq %rdi, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: simplify_select:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT:    vmovq %rdi, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: simplify_select:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a = insertelement <2 x i32> <i32 0, i32 undef>, i32 %x, i32 1
+  %b = insertelement <2 x i32> <i32 undef, i32 0>, i32 %x, i32 0
+  %y = or <2 x i32> %a, %b
+  %p16 = extractelement <2 x i32> %y, i32 1
+  %p17 = insertelement <2 x i32> undef, i32 %p16, i32 0
+  %p18 = insertelement <2 x i32> %p17, i32 %x, i32 1
+  %r = select <2 x i1> %z, <2 x i32> %y, <2 x i32> %p18
+  ret <2 x i32> %r
+}
+
diff --git a/test/CodeGen/X86/vshift-1.ll b/test/CodeGen/X86/vshift-1.ll
index a2e1e7a641c5b7a873a238cd9234961682380cea..a5b2c35418c16a43b2036378452d85859085a821 100644
--- a/test/CodeGen/X86/vshift-1.ll
+++ b/test/CodeGen/X86/vshift-1.ll
@@ -29,7 +29,6 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; X32-NEXT:    psllq %xmm1, %xmm0
 ; X32-NEXT:    movdqa %xmm0, (%eax)
 ; X32-NEXT:    retl
diff --git a/test/CodeGen/X86/vshift-2.ll b/test/CodeGen/X86/vshift-2.ll
index 6b01a8acdf4ec4a521e9761e16956f6490c13d4a..8e57551d1e20e686ff9c58ed220afd666de494ac 100644
--- a/test/CodeGen/X86/vshift-2.ll
+++ b/test/CodeGen/X86/vshift-2.ll
@@ -29,7 +29,6 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; X32-NEXT:    psrlq %xmm1, %xmm0
 ; X32-NEXT:    movdqa %xmm0, (%eax)
 ; X32-NEXT:    retl
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index c3fa2f5454e2ae5093c96185484c07e08e066142..fe8b7b7eee21def6134cc57a21c0f44db9583fbe 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -57,15 +57,13 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
 ; WIDE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIDE-NEXT:    addl {{[0-9]+}}(%esp), %ecx
 ; WIDE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIDE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; WIDE-NEXT:    pinsrd $1, 4(%ecx,%eax,8), %xmm3
+; WIDE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
 ; WIDE-NEXT:    psubb %xmm0, %xmm3
 ; WIDE-NEXT:    psrlw $2, %xmm3
 ; WIDE-NEXT:    pand %xmm1, %xmm3
 ; WIDE-NEXT:    pxor %xmm2, %xmm3
 ; WIDE-NEXT:    psubb %xmm2, %xmm3
-; WIDE-NEXT:    pextrd $1, %xmm3, 4(%edx,%eax,8)
-; WIDE-NEXT:    movd %xmm3, (%edx,%eax,8)
+; WIDE-NEXT:    movq %xmm3, (%edx,%eax,8)
 ; WIDE-NEXT:    incl (%esp)
 ; WIDE-NEXT:  .LBB0_1: # %forcond
 ; WIDE-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index 038c6cb33b6389279cef9d5f9ca6bc3be50ea7c4..33c81886faf5d1f867e34e825727410d164c9e9c 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -47,16 +47,16 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X86-SSE2-NEXT:    movd %edx, %xmm0
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
 ; X86-SSE2-NEXT:    movdqa %xmm0, (%esp)
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl (%esp), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE2-NEXT:    shll $8, %edx
-; X86-SSE2-NEXT:    movzbl (%esp), %esi
-; X86-SSE2-NEXT:    orl %edx, %esi
-; X86-SSE2-NEXT:    movd %esi, %xmm0
-; X86-SSE2-NEXT:    pinsrw $1, %ecx, %xmm0
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT:    pinsrw $1, %edx, %xmm0
+; X86-SSE2-NEXT:    shll $8, %esi
+; X86-SSE2-NEXT:    pinsrw $3, %esi, %xmm0
+; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
+; X86-SSE2-NEXT:    shll $8, %ecx
+; X86-SSE2-NEXT:    pinsrw $5, %ecx, %xmm0
 ; X86-SSE2-NEXT:    psrad $24, %xmm0
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movss %xmm0, (%eax)
@@ -95,16 +95,16 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2-NEXT:    movq %rax, %xmm0
 ; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X64-SSE2-NEXT:    movzbl 2(%rsi), %eax
 ; X64-SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
 ; X64-SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE2-NEXT:    shll $8, %ecx
-; X64-SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; X64-SSE2-NEXT:    orl %ecx, %edx
-; X64-SSE2-NEXT:    movd %edx, %xmm0
+; X64-SSE2-NEXT:    shll $8, %eax
 ; X64-SSE2-NEXT:    pinsrw $1, %eax, %xmm0
-; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT:    shll $8, %ecx
+; X64-SSE2-NEXT:    pinsrw $3, %ecx, %xmm0
+; X64-SSE2-NEXT:    movzbl 2(%rsi), %eax
+; X64-SSE2-NEXT:    shll $8, %eax
+; X64-SSE2-NEXT:    pinsrw $5, %eax, %xmm0
 ; X64-SSE2-NEXT:    psrad $24, %xmm0
 ; X64-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE2-NEXT:    movlps %xmm0, (%rdi)
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index 3f8e203d407676c577882f70880f1d431369b3db..5bdd7562d1e5ec19d311b25612c95e33e3702610 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -77,7 +77,6 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
-; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $32, %esp
 ; X86-SSE2-NEXT:    movl 8(%ebp), %eax
@@ -88,15 +87,11 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
 ; X86-SSE2-NEXT:    movdqa %xmm0, (%esp)
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    shll $8, %edx
-; X86-SSE2-NEXT:    movzbl (%esp), %esi
-; X86-SSE2-NEXT:    orl %edx, %esi
-; X86-SSE2-NEXT:    movd %esi, %xmm0
-; X86-SSE2-NEXT:    pinsrw $1, %ecx, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE2-NEXT:    movzbl (%esp), %edx
+; X86-SSE2-NEXT:    movd %edx, %xmm0
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    pinsrw $2, %edx, %xmm0
+; X86-SSE2-NEXT:    pinsrw $4, %ecx, %xmm0
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movss %xmm0, (%eax)
 ; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
@@ -104,8 +99,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X86-SSE2-NEXT:    movss %xmm1, 8(%eax)
 ; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT:    leal -4(%ebp), %esp
-; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    movl %ebp, %esp
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
@@ -135,15 +129,11 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-SSE2-NEXT:    movzbl 2(%rsi), %eax
 ; X64-SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE2-NEXT:    shll $8, %ecx
-; X64-SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; X64-SSE2-NEXT:    orl %ecx, %edx
-; X64-SSE2-NEXT:    movd %edx, %xmm0
-; X64-SSE2-NEXT:    pinsrw $1, %eax, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-SSE2-NEXT:    movd %ecx, %xmm0
+; X64-SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-SSE2-NEXT:    pinsrw $2, %ecx, %xmm0
+; X64-SSE2-NEXT:    pinsrw $4, %eax, %xmm0
 ; X64-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE2-NEXT:    movlps %xmm0, (%rdi)
 ; X64-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
diff --git a/test/CodeGen/X86/widen_load-3.ll b/test/CodeGen/X86/widen_load-3.ll
index 51e4abf2bbb9fd421dd8431a16972ea56fd1b04c..15c06b2ba082feaf588592b55ed7f2fa506de576 100644
--- a/test/CodeGen/X86/widen_load-3.ll
+++ b/test/CodeGen/X86/widen_load-3.ll
@@ -30,12 +30,12 @@ define <7 x i64> @load7_aligned(<7 x i64>* %x) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vmovaps (%ecx), %ymm0
-; X86-AVX-NEXT:    vmovaps 32(%ecx), %ymm1
-; X86-AVX-NEXT:    vmovaps %ymm0, (%eax)
-; X86-AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; X86-AVX-NEXT:    vextractps $1, %xmm0, 52(%eax)
-; X86-AVX-NEXT:    vmovss %xmm0, 48(%eax)
+; X86-AVX-NEXT:    vmovaps 48(%ecx), %xmm1
+; X86-AVX-NEXT:    vextractps $1, %xmm1, 52(%eax)
+; X86-AVX-NEXT:    vmovss %xmm1, 48(%eax)
+; X86-AVX-NEXT:    vmovaps 32(%ecx), %xmm1
 ; X86-AVX-NEXT:    vmovaps %xmm1, 32(%eax)
+; X86-AVX-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl $4
 ;
@@ -56,11 +56,11 @@ define <7 x i64> @load7_aligned(<7 x i64>* %x) {
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    movq %rdi, %rax
 ; X64-AVX-NEXT:    vmovaps (%rsi), %ymm0
-; X64-AVX-NEXT:    vmovaps 32(%rsi), %ymm1
-; X64-AVX-NEXT:    vmovaps %ymm0, (%rdi)
-; X64-AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; X64-AVX-NEXT:    vmovlps %xmm0, 48(%rdi)
+; X64-AVX-NEXT:    movq 48(%rsi), %rcx
+; X64-AVX-NEXT:    movq %rcx, 48(%rdi)
+; X64-AVX-NEXT:    vmovaps 32(%rsi), %xmm1
 ; X64-AVX-NEXT:    vmovaps %xmm1, 32(%rdi)
+; X64-AVX-NEXT:    vmovaps %ymm0, (%rdi)
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
   %x1 = load <7 x i64>, <7 x i64>* %x
diff --git a/test/CodeGen/X86/widen_mul.ll b/test/CodeGen/X86/widen_mul.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b3ccb961c55061414c0b7dd91ba52b89bd9cbb01
--- /dev/null
+++ b/test/CodeGen/X86/widen_mul.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+
+; Test multiplies of various narrow types.
+
+define <2 x i8> @mul_v2i8(<2 x i8> %x, <2 x i8> %y) {
+; SSE2-LABEL: mul_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: mul_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: mul_v2i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %res = mul <2 x i8> %x, %y
+  ret <2 x i8> %res
+}
+
+define <4 x i8> @mul_v4i8(<4 x i8> %x, <4 x i8> %y) {
+; SSE2-LABEL: mul_v4i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: mul_v4i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: mul_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %res = mul <4 x i8> %x, %y
+  ret <4 x i8> %res
+}
+
+define <8 x i8> @mul_v8i8(<8 x i8> %x, <8 x i8> %y) {
+; SSE2-LABEL: mul_v8i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: mul_v8i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: mul_v8i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %res = mul <8 x i8> %x, %y
+  ret <8 x i8> %res
+}
+
+define <2 x i16> @mul_v2i16(<2 x i16> %x, <2 x i16> %y) {
+; SSE-LABEL: mul_v2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mul_v2i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %res = mul <2 x i16> %x, %y
+  ret <2 x i16> %res
+}
+
+define <4 x i16> @mul_v4i16(<4 x i16> %x, <4 x i16> %y) {
+; SSE-LABEL: mul_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmullw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mul_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %res = mul <4 x i16> %x, %y
+  ret <4 x i16> %res
+}
+
+define <2 x i32> @mul_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; SSE2-LABEL: mul_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: mul_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: mul_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %res = mul <2 x i32> %x, %y
+  ret <2 x i32> %res
+}
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 41d69e544aa48eeee58cae41ec9440ce8a855f0b..261dc3c8379d8d6d6e5215a2bbf6818ef237769d 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1,26 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512
 
 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
-; AVX1-LABEL: load_factorf64_4:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovupd (%rdi), %ymm0
-; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX1-NEXT:    vhaddpd %ymm5, %ymm4, %ymm4
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1-NEXT:    vaddpd %ymm2, %ymm4, %ymm2
-; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    retq
-;
 ; AVX-LABEL: load_factorf64_4:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovupd (%rdi), %ymm0
@@ -49,21 +32,6 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 }
 
 define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
-; AVX1-LABEL: load_factorf64_2:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovupd (%rdi), %ymm0
-; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-NEXT:    vmulpd %ymm0, %ymm4, %ymm0
-; AVX1-NEXT:    retq
-;
 ; AVX-LABEL: load_factorf64_2:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovupd (%rdi), %ymm0
@@ -86,16 +54,6 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
 }
 
 define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
-; AVX1-LABEL: load_factorf64_1:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovupd (%rdi), %ymm0
-; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
 ; AVX-LABEL: load_factorf64_1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovupd (%rdi), %ymm0
@@ -140,24 +98,24 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX-LABEL: load_factori64_4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX-NEXT:    vmovdqu 32(%rdi), %ymm1
-; AVX-NEXT:    vmovdqu 64(%rdi), %ymm2
-; AVX-NEXT:    vmovdqu 96(%rdi), %ymm3
-; AVX-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
-; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    retq
+; AVX2OR512-LABEL: load_factori64_4:
+; AVX2OR512:       # %bb.0:
+; AVX2OR512-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2OR512-NEXT:    vmovdqu 32(%rdi), %ymm1
+; AVX2OR512-NEXT:    vmovdqu 64(%rdi), %ymm2
+; AVX2OR512-NEXT:    vmovdqu 96(%rdi), %ymm3
+; AVX2OR512-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX2OR512-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX2OR512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2OR512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX2OR512-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX2OR512-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2OR512-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2OR512-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
+; AVX2OR512-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2OR512-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
+; AVX2OR512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2OR512-NEXT:    retq
   %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
   %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -305,14 +263,14 @@ define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
@@ -431,60 +389,31 @@ ret void
 }
 
 define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
-; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX1-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
-; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
 ; AVX-LABEL: interleaved_load_vf8_i8_stride4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
-; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm3[0]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
 ; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT:    vpaddw %xmm0, %xmm4, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
 ; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[1,0,3,2,4,5,6,7]
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
 ; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
   %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -501,146 +430,144 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
 define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
 ; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm4
+; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
+; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
+; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
+; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
+; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpxor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm6
-; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
+; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
+; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX2-NEXT:    vpcmpeqb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
+; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
+; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpxor %xmm0, %xmm4, %xmm0
 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
-; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
+; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
+; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
 ; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm3[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
+; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
+; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm7
-; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
+; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm7, %xmm4, %xmm3
+; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512-NEXT:    vpcmpeqb %zmm5, %zmm8, %k0
-; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm3, %k1
+; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm4, %k1
 ; AVX512-NEXT:    kxnorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -662,84 +589,84 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
 define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 ; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm14
-; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm12
-; AVX1-NEXT:    vpshufb %xmm6, %xmm12, %xmm5
-; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm13
-; AVX1-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
-; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vmovdqa 112(%rdi), %xmm11
+; AVX1-NEXT:    vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT:    vmovdqa 96(%rdi), %xmm12
+; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm3
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vmovdqa 80(%rdi), %xmm14
+; AVX1-NEXT:    vpshufb %xmm2, %xmm14, %xmm4
+; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm5
 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm15
-; AVX1-NEXT:    vpshufb %xmm6, %xmm15, %xmm5
-; AVX1-NEXT:    vpshufb %xmm6, %xmm14, %xmm6
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX1-NEXT:    vextractf128 $1, %ymm11, %xmm6
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm4
-; AVX1-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm8
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm13
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm15
+; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
+; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm4
-; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm5
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm13, %xmm1
-; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm7
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm14, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
-; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm4
-; AVX1-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm15, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
-; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm4, %xmm13, %xmm5
-; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm7
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm14, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm5
-; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; AVX1-NEXT:    vpshufb %xmm4, %xmm6, %xmm5
-; AVX1-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm15, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
-; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm13, %xmm4
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm14, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm2
-; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm15, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm1
@@ -758,192 +685,188 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 ;
 ; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm7
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm5
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm9
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm11
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm13
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm6, %xmm9, %xmm3
-; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm4
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm10
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX2-NEXT:    vpshufb %xmm2, %xmm11, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm4[2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm12
-; AVX2-NEXT:    vpshufb %xmm6, %xmm12, %xmm3
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm13
-; AVX2-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm6
-; AVX2-NEXT:    vpshufb %xmm2, %xmm6, %xmm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX2-NEXT:    vpshufb %xmm6, %xmm13, %xmm4
+; AVX2-NEXT:    vpshufb %xmm6, %xmm12, %xmm5
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
+; AVX2-NEXT:    vpshufb %xmm0, %xmm9, %xmm7
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
+; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm14
+; AVX2-NEXT:    vpshufb %xmm6, %xmm14, %xmm7
+; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = mem[2,3,0,1]
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
+; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm6
+; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = mem[2,3,0,1]
 ; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
-; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
-; AVX2-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm1, %xmm13, %xmm0
+; AVX2-NEXT:    vpshufb %xmm1, %xmm12, %xmm2
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
+; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm14, %xmm3
+; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
+; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
-; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpcmpeqb %ymm0, %ymm8, %ymm8
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
-; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
+; AVX2-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
+; AVX2-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
+; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm14, %xmm3
+; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
+; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm1, %xmm13, %xmm2
+; AVX2-NEXT:    vpshufb %xmm1, %xmm12, %xmm3
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
-; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm12, %xmm4
-; AVX2-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm4
+; AVX2-NEXT:    vpshufb %xmm3, %xmm9, %xmm0
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm14, %xmm2
+; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm2
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
-; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
-; AVX2-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
-; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpcmpeqb %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT:    vpxor %ymm0, %ymm8, %ymm0
 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm7
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm10
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm6, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm4
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm5
-; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm3[0,1],xmm4[2,3]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm5
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm12
-; AVX512-NEXT:    vpshufb %xmm6, %xmm12, %xmm3
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm13
-; AVX512-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm14
-; AVX512-NEXT:    vpshufb %xmm2, %xmm14, %xmm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm11
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm13
+; AVX512-NEXT:    vpshufb %xmm6, %xmm13, %xmm4
+; AVX512-NEXT:    vpshufb %xmm6, %xmm12, %xmm5
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm7
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
+; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm14
+; AVX512-NEXT:    vpshufb %xmm6, %xmm14, %xmm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = mem[2,3,0,1]
+; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm9
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm6
+; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm1
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = mem[2,3,0,1]
 ; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
+; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
+; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
+; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm4
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm6
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm5
-; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm3
+; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm5
-; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
+; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
+; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm4
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm6
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm5
-; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm3
+; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm5
-; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
-; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
-; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm2
+; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm3
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm4
+; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm0
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm14, %xmm2
+; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm3
-; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm2
+; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vpcmpeqb %zmm9, %zmm8, %k0
-; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm2, %k1
+; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm15, %k1
 ; AVX512-NEXT:    kxnord %k1, %k0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -981,21 +904,21 @@ define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX-LABEL: interleaved_store_vf8_i8_stride4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
-; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX-NEXT:    vmovdqa %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX2OR512-LABEL: interleaved_store_vf8_i8_stride4:
+; AVX2OR512:       # %bb.0:
+; AVX2OR512-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2OR512-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX2OR512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX2OR512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
+; AVX2OR512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2OR512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX2OR512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2OR512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2OR512-NEXT:    vmovdqa %ymm0, (%rdi)
+; AVX2OR512-NEXT:    vzeroupper
+; AVX2OR512-NEXT:    retq
 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
@@ -1027,52 +950,48 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm2
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
 ; AVX1-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm5
 ; AVX1-NEXT:    vorps %ymm2, %ymm5, %ymm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
-; AVX1-NEXT:    vpor %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpaddb %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX-LABEL: interleaved_load_vf32_i8_stride3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
-; AVX-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX-NEXT:    vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
-; AVX-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
-; AVX-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
-; AVX-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
-; AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
-; AVX-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
-; AVX-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
-; AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
+; AVX2OR512:       # %bb.0:
+; AVX2OR512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2OR512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2OR512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX2OR512-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX2OR512-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX2OR512-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX2OR512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2OR512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX2OR512-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX2OR512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2OR512-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2OR512-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
+; AVX2OR512-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
+; AVX2OR512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2OR512-NEXT:    retq
 	%wide.vec = load <96 x i8>, <96 x i8>* %ptr
 	%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
 	%v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
@@ -1083,28 +1002,6 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
 }
 
 define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
-; AVX1-LABEL: interleaved_load_vf16_i8_stride3:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX1-NEXT:    vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    retq
-;
 ; AVX-LABEL: interleaved_load_vf16_i8_stride3:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
@@ -1121,9 +1018,8 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
 ; AVX-NEXT:    vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
 ; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 	%wide.vec = load <48 x i8>, <48 x i8>* %ptr
@@ -1136,28 +1032,10 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
 }
 
 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
-; AVX1-LABEL: interleaved_load_vf8_i8_stride3:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
-; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
-; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
 ; AVX-LABEL: interleaved_load_vf8_i8_stride3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
 ; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
@@ -1169,7 +1047,6 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
 ; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 	%wide.vec = load <24 x i8>, <24 x i8>* %ptr
 	%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32  9,i32 12,i32 15,i32 18,i32 21>
@@ -1181,23 +1058,6 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
 }
 
 define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
-; AVX1-LABEL: interleaved_store_vf8_i8_stride3:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
-; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, 16(%rdi)
-; AVX1-NEXT:    vmovdqu %xmm2, (%rdi)
-; AVX1-NEXT:    retq
-;
 ; AVX-LABEL: interleaved_store_vf8_i8_stride3:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -1225,23 +1085,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ; AVX1-LABEL: interleaved_store_vf16_i8_stride3:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
-; AVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
-; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX1-NEXT:    vmovdqu %xmm1, 32(%rdi)
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovdqu %xmm2, 32(%rdi)
 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1249,23 +1105,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
-; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-NEXT:    vpor %xmm4, %xmm6, %xmm4
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
-; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm6, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT:    vmovdqu %xmm1, 32(%rdi)
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovdqu %xmm2, 32(%rdi)
 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1273,23 +1125,19 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
-; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
-; AVX512-NEXT:    vpor %xmm4, %xmm6, %xmm4
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
-; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm6, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX512-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
 ; AVX512-NEXT:    vextracti32x4 $2, %zmm1, 32(%rdi)
 ; AVX512-NEXT:    vzeroupper
@@ -1561,71 +1409,64 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm4, %xmm11, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm11, %xmm11
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm4, %xmm10, %xmm11
+; AVX1-NEXT:    vpshufb %xmm4, %xmm10, %xmm10
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm12, %xmm12
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm4, %xmm13, %xmm0
-; AVX1-NEXT:    vpshufb %xmm4, %xmm15, %xmm7
-; AVX1-NEXT:    vpshufb %xmm4, %xmm8, %xmm13
+; AVX1-NEXT:    vpshufb %xmm4, %xmm13, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm15, %xmm0
+; AVX1-NEXT:    vpshufb %xmm4, %xmm8, %xmm7
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm15 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm10 = xmm13[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm0[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm13 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm15 = xmm7[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm0[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm2[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm7
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm1
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm14[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm14
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm14[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm11, %ymm14
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm12
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX1-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX1-NEXT:    vandnps %ymm12, %ymm13, %ymm12
-; AVX1-NEXT:    vandps %ymm13, %ymm14, %ymm14
-; AVX1-NEXT:    vorps %ymm12, %ymm14, %ymm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm14
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vandnps %ymm14, %ymm13, %ymm14
-; AVX1-NEXT:    vandps %ymm13, %ymm7, %ymm7
-; AVX1-NEXT:    vorps %ymm14, %ymm7, %ymm13
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
-; AVX1-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT:    vpshufb %xmm7, %xmm15, %xmm4
-; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm10[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm7, %xmm10, %xmm4
-; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpshufb %xmm14, %xmm5, %xmm4
-; AVX1-NEXT:    vpshufb %xmm7, %xmm9, %xmm5
-; AVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpshufb %xmm14, %xmm6, %xmm5
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm8[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpshufb %xmm7, %xmm8, %xmm0
-; AVX1-NEXT:    vpor %xmm5, %xmm0, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm0
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
-; AVX1-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm10[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm10
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX1-NEXT:    vandnps %ymm10, %ymm12, %ymm10
+; AVX1-NEXT:    vandps %ymm12, %ymm14, %ymm14
+; AVX1-NEXT:    vorps %ymm10, %ymm14, %ymm10
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm14
+; AVX1-NEXT:    vandnps %ymm14, %ymm12, %ymm14
+; AVX1-NEXT:    vandps %ymm12, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm14, %ymm1, %ymm1
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm13[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm12 = xmm15[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm8[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm5[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm0
+; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpaddb %xmm11, %xmm12, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vpaddb %xmm6, %xmm13, %xmm2
-; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpaddb %xmm12, %xmm10, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm7[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpaddb %xmm9, %xmm1, %xmm1
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
@@ -1663,11 +1504,11 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
 ; AVX2-NEXT:    vpaddb %ymm5, %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendvb %ymm8, %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendvb %ymm8, %ymm7, %ymm3, %ymm1
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
 ;
@@ -1696,18 +1537,16 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
 ; AVX512-NEXT:    movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
 ; AVX512-NEXT:    kmovq %rax, %k1
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT:    vpblendvb %ymm4, %ymm3, %ymm0, %ymm5
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
+; AVX512-NEXT:    vpalignr {{.*#+}} ymm4 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 {%k1} = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
 ; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
-; AVX512-NEXT:    vpblendvb %ymm4, %ymm2, %ymm6, %ymm2
-; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm5[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
+; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retq
@@ -1723,13 +1562,13 @@ ret <64 x i8> %add2
 define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c,<64 x i8> %d, <256 x i8>* %p) {
 ; AVX1-LABEL: interleaved_store_vf64_i8_stride4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    subq $24, %rsp
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm11
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm12
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX1-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm13
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm14
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
@@ -1739,73 +1578,66 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
-; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
-; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm1
-; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
 ; AVX1-NEXT:    vmovdqa %xmm8, %xmm2
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm13
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm11, %ymm11
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm8, %ymm5
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
+; AVX1-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm14
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm9, %ymm9
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm9, %ymm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm8, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm13, %ymm8
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm10, %ymm2
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm3, %ymm0
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm7, %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm14, %ymm7
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm9
+; AVX1-NEXT:    vinsertf128 $1, %xmm14, %ymm5, %ymm10
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm4
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm7
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
 ; AVX1-NEXT:    vmovaps %ymm3, 224(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm2, 192(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm7, 160(%rdi)
-; AVX1-NEXT:    vmovaps %ymm0, 128(%rdi)
+; AVX1-NEXT:    vmovaps %ymm8, 128(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm1, 96(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm5, 64(%rdi)
-; AVX1-NEXT:    vmovaps %ymm6, 32(%rdi)
-; AVX1-NEXT:    vmovaps %ymm8, (%rdi)
-; AVX1-NEXT:    addq $24, %rsp
-; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    vmovaps %ymm4, 32(%rdi)
+; AVX1-NEXT:    vmovaps %ymm10, (%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 28e396c8cf422310fa157e990f270d06b5d67fe2..3d65eed6ebba0895312b28d1be2a3de925e4aa17 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -126,7 +126,7 @@ for.preheader:
 for.body:                                         ; preds = %entry, %for.body
   %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
   %sum.04 = phi i32 [ %add, %for.body ], [ 0, %for.preheader ]
-  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
   %add = add nsw i32 %call, %sum.04
   %inc = add nuw nsw i32 %i.05, 1
   %exitcond = icmp eq i32 %inc, 10
@@ -178,7 +178,7 @@ for.preheader:
 for.body:                                         ; preds = %for.body, %entry
   %i.04 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
   %sum.03 = phi i32 [ 0, %for.preheader ], [ %add, %for.body ]
-  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
   %add = add nsw i32 %call, %sum.03
   %inc = add nuw nsw i32 %i.04, 1
   %exitcond = icmp eq i32 %inc, 10
@@ -248,7 +248,7 @@ for.preheader:
 for.body:                                         ; preds = %entry, %for.body
   %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
   %sum.04 = phi i32 [ %add, %for.body ], [ 0, %for.preheader ]
-  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
   %add = add nsw i32 %call, %sum.04
   %inc = add nuw nsw i32 %i.05, 1
   %exitcond = icmp eq i32 %inc, 10
@@ -324,7 +324,7 @@ if.then:                                          ; preds = %entry
 for.body:                                         ; preds = %for.body, %if.then
   %i.05 = phi i32 [ 0, %if.then ], [ %inc, %for.body ]
   %sum.04 = phi i32 [ 0, %if.then ], [ %add, %for.body ]
-  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
   %add = add nsw i32 %call, %sum.04
   %inc = add nuw nsw i32 %i.05, 1
   %exitcond = icmp eq i32 %inc, 10
@@ -427,41 +427,38 @@ if.end:                                           ; preds = %for.body, %if.else
 ; Check that we handle calls to variadic functions correctly.
 ; CHECK-LABEL: callVariadicFunc:
 ;
+; ENABLE: movl %esi, %eax
 ; ENABLE: testl %edi, %edi
 ; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
 ; CHECK: pushq
 ;
+; DISABLE: movl %esi, %eax
 ; DISABLE: testl %edi, %edi
 ; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Setup of the varags.
-; CHECK: movl %esi, (%rsp)
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: %esi, %edi
-; CHECK-NEXT: %esi, %edx
-; CHECK-NEXT: %esi, %ecx
-; CHECK-NEXT: %esi, %r8d
-; CHECK-NEXT: %esi, %r9d
+; CHECK:       movl	%eax, (%rsp)
+; CHECK-NEXT:  movl	%eax, %edi
+; CHECK-NEXT:  movl	%eax, %esi
+; CHECK-NEXT:  movl	%eax, %edx
+; CHECK-NEXT:  movl	%eax, %ecx
+; CHECK-NEXT:  movl	%eax, %r8d
+; CHECK-NEXT:  movl	%eax, %r9d
+; CHECK-NEXT:  xorl	%eax, %eax
 ; CHECK-NEXT: callq _someVariadicFunc
-; CHECK-NEXT: movl %eax, %esi
-; CHECK-NEXT: shll $3, %esi
+; CHECK-NEXT: shll $3, %eax
 ;
 ; ENABLE-NEXT: addq $8, %rsp
-; ENABLE-NEXT: movl %esi, %eax
 ; ENABLE-NEXT: retq
 ;
-; DISABLE: jmp [[IFEND_LABEL:LBB[0-9_]+]]
-;
+
 ; CHECK: [[ELSE_LABEL]]: ## %if.else
 ; Shift second argument by one and store into returned register.
-; CHECK: addl %esi, %esi
-;
-; DISABLE: [[IFEND_LABEL]]: ## %if.end
+; CHECK: addl %eax, %eax
 ;
 ; Epilogue code.
-; CHECK-NEXT: movl %esi, %eax
 ; DISABLE-NEXT: popq
 ; CHECK-NEXT: retq
 define i32 @callVariadicFunc(i32 %cond, i32 %N) {
diff --git a/test/CodeGen/X86/x86-win64-shrink-wrapping.ll b/test/CodeGen/X86/x86-win64-shrink-wrapping.ll
index 5d9b2ba3267a60936ac146a29f7f06ec5e719e38..9142267d157c417229640464ebd6d244cc7c1757 100644
--- a/test/CodeGen/X86/x86-win64-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-win64-shrink-wrapping.ll
@@ -100,7 +100,7 @@ for.preheader:                                    ; preds = %entry
 for.body:                                         ; preds = %for.body, %for.preheader
   %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
   %sum.04 = phi i32 [ %add, %for.body ], [ 0, %for.preheader ]
-  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
   %add = add nsw i32 %call, %sum.04
   %inc = add nuw nsw i32 %i.05, 1
   %exitcond = icmp eq i32 %inc, 10
diff --git a/test/CodeGen/X86/x87-schedule.ll b/test/CodeGen/X86/x87-schedule.ll
index 937a2c4561b64719a1543bc7e851908e4c1f5822..1921f8c75a3d10ce302c3470f5d19f082d70aed2 100644
--- a/test/CodeGen/X86/x87-schedule.ll
+++ b/test/CodeGen/X86/x87-schedule.ll
@@ -2792,14 +2792,14 @@ define void @test_fist_fistp_fisttp(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fists (%edx) # sched: [1:0.50]
-; BDVER2-NEXT:    fistl (%ecx) # sched: [1:0.50]
-; BDVER2-NEXT:    fistps (%edx) # sched: [1:0.50]
-; BDVER2-NEXT:    fistpl (%ecx) # sched: [1:0.50]
-; BDVER2-NEXT:    fistpll (%eax) # sched: [1:0.50]
-; BDVER2-NEXT:    fisttps (%edx) # sched: [1:0.50]
-; BDVER2-NEXT:    fisttpl (%ecx) # sched: [1:0.50]
-; BDVER2-NEXT:    fisttpll (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    fists (%edx) # sched: [1:1.00]
+; BDVER2-NEXT:    fistl (%ecx) # sched: [1:1.00]
+; BDVER2-NEXT:    fistps (%edx) # sched: [1:1.00]
+; BDVER2-NEXT:    fistpl (%ecx) # sched: [1:1.00]
+; BDVER2-NEXT:    fistpll (%eax) # sched: [1:1.00]
+; BDVER2-NEXT:    fisttps (%edx) # sched: [1:1.00]
+; BDVER2-NEXT:    fisttpl (%ecx) # sched: [1:1.00]
+; BDVER2-NEXT:    fisttpll (%eax) # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
@@ -4672,12 +4672,12 @@ define void @test_fst_fstp(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    fst %st(0) # sched: [1:0.50]
-; BDVER2-NEXT:    fsts (%edx) # sched: [1:0.50]
-; BDVER2-NEXT:    fstl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fsts (%edx) # sched: [1:1.00]
+; BDVER2-NEXT:    fstl (%ecx) # sched: [1:1.00]
 ; BDVER2-NEXT:    fstp %st(0) # sched: [1:0.50]
-; BDVER2-NEXT:    fstpl (%edx) # sched: [1:0.50]
-; BDVER2-NEXT:    fstpl (%ecx) # sched: [1:0.50]
-; BDVER2-NEXT:    fstpt (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    fstpl (%edx) # sched: [1:1.00]
+; BDVER2-NEXT:    fstpl (%ecx) # sched: [1:1.00]
+; BDVER2-NEXT:    fstpt (%eax) # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
 ; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
diff --git a/test/CodeGen/X86/xchg-nofold.ll b/test/CodeGen/X86/xchg-nofold.ll
index 2e24f8bd64dc2daeb218b411c278818a6140c8dd..673fa74d02f184edcd0d74fedc810f042428c2bd 100644
--- a/test/CodeGen/X86/xchg-nofold.ll
+++ b/test/CodeGen/X86/xchg-nofold.ll
@@ -17,7 +17,7 @@ define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture derefer
 ; CHECK-NEXT:    je .LBB0_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    andl $7, %edx
+; CHECK-NEXT:    andb $7, %dl
 ; CHECK-NEXT:    cmpb %cl, %dl
 ; CHECK-NEXT:    jge .LBB0_2
 ; CHECK-NEXT:  .LBB0_3:
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
index 911ab945c5d0edddd68cf51b9baf4ace50a99010..f31658319b03fad91189af36271b810abbc618ab 100644
--- a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -5,15 +5,10 @@
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/xop-builtins.c
 
 define <2 x i64> @test_mm_maccs_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maccs_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_maccs_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_maccs_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
@@ -24,15 +19,10 @@ define <2 x i64> @test_mm_maccs_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a
 declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_macc_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_macc_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_macc_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_macc_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
@@ -43,15 +33,10 @@ define <2 x i64> @test_mm_macc_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2
 declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_maccsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maccsd_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_maccsd_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_maccsd_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -62,15 +47,10 @@ define <2 x i64> @test_mm_maccsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %
 declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_maccd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maccd_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_maccd_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_maccd_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -81,15 +61,10 @@ define <2 x i64> @test_mm_maccd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a
 declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_maccs_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maccs_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_maccs_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_maccs_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -100,15 +75,10 @@ define <2 x i64> @test_mm_maccs_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a
 declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_macc_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_macc_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_macc_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_macc_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -119,15 +89,10 @@ define <2 x i64> @test_mm_macc_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2
 declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_maccslo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maccslo_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_maccslo_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_maccslo_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
@@ -136,15 +101,10 @@ define <2 x i64> @test_mm_maccslo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>
 declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @test_mm_macclo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_macclo_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_macclo_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_macclo_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
@@ -153,15 +113,10 @@ define <2 x i64> @test_mm_macclo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %
 declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @test_mm_maccshi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maccshi_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_maccshi_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_maccshi_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
@@ -170,15 +125,10 @@ define <2 x i64> @test_mm_maccshi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>
 declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @test_mm_macchi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_macchi_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_macchi_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_macchi_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
@@ -187,15 +137,10 @@ define <2 x i64> @test_mm_macchi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %
 declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @test_mm_maddsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maddsd_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_maddsd_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_maddsd_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -206,15 +151,10 @@ define <2 x i64> @test_mm_maddsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %
 declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_maddd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maddd_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_maddd_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_maddd_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -225,15 +165,10 @@ define <2 x i64> @test_mm_maddd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a
 declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_haddw_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddw_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddbw %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddw_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddbw %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddw_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddbw %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %arg0)
   %bc = bitcast <8 x i16> %res to <2 x i64>
@@ -242,15 +177,10 @@ define <2 x i64> @test_mm_haddw_epi8(<2 x i64> %a0) {
 declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_haddd_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddd_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddbd %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddd_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddbd %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddd_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddbd %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %arg0)
   %bc = bitcast <4 x i32> %res to <2 x i64>
@@ -259,15 +189,10 @@ define <2 x i64> @test_mm_haddd_epi8(<2 x i64> %a0) {
 declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_haddq_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddq_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddbq %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddq_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddbq %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddq_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddbq %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %arg0)
   ret <2 x i64> %res
@@ -275,15 +200,10 @@ define <2 x i64> @test_mm_haddq_epi8(<2 x i64> %a0) {
 declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_haddd_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddd_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddwd %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddd_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddwd %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddd_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddwd %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %arg0)
   %bc = bitcast <4 x i32> %res to <2 x i64>
@@ -292,15 +212,10 @@ define <2 x i64> @test_mm_haddd_epi16(<2 x i64> %a0) {
 declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_haddq_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddq_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddwq %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddq_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddwq %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddq_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddwq %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %arg0)
   ret <2 x i64> %res
@@ -308,15 +223,10 @@ define <2 x i64> @test_mm_haddq_epi16(<2 x i64> %a0) {
 declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_haddq_epi32(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddq_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vphadddq %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddq_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vphadddq %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddq_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphadddq %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %res = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %arg0)
   ret <2 x i64> %res
@@ -324,15 +234,10 @@ define <2 x i64> @test_mm_haddq_epi32(<2 x i64> %a0) {
 declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_haddw_epu8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddw_epu8:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddubw %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddw_epu8:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddubw %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddw_epu8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddubw %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %arg0)
   %bc = bitcast <8 x i16> %res to <2 x i64>
@@ -341,15 +246,10 @@ define <2 x i64> @test_mm_haddw_epu8(<2 x i64> %a0) {
 declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_haddd_epu8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddd_epu8:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddubd %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddd_epu8:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddubd %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddd_epu8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddubd %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %arg0)
   %bc = bitcast <4 x i32> %res to <2 x i64>
@@ -358,15 +258,10 @@ define <2 x i64> @test_mm_haddd_epu8(<2 x i64> %a0) {
 declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_haddq_epu8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddq_epu8:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddubq %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddq_epu8:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddubq %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddq_epu8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddubq %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %arg0)
   ret <2 x i64> %res
@@ -374,15 +269,10 @@ define <2 x i64> @test_mm_haddq_epu8(<2 x i64> %a0) {
 declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_haddd_epu16(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddd_epu16:
-; X32:       # %bb.0:
-; X32-NEXT:    vphadduwd %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddd_epu16:
-; X64:       # %bb.0:
-; X64-NEXT:    vphadduwd %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddd_epu16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphadduwd %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %arg0)
   %bc = bitcast <4 x i32> %res to <2 x i64>
@@ -392,15 +282,10 @@ declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
 
 
 define <2 x i64> @test_mm_haddq_epu16(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddq_epu16:
-; X32:       # %bb.0:
-; X32-NEXT:    vphadduwq %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddq_epu16:
-; X64:       # %bb.0:
-; X64-NEXT:    vphadduwq %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddq_epu16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphadduwq %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %arg0)
   ret <2 x i64> %res
@@ -408,15 +293,10 @@ define <2 x i64> @test_mm_haddq_epu16(<2 x i64> %a0) {
 declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_haddq_epu32(<2 x i64> %a0) {
-; X32-LABEL: test_mm_haddq_epu32:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddudq %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_haddq_epu32:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddudq %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_haddq_epu32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphaddudq %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %res = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %arg0)
   ret <2 x i64> %res
@@ -424,15 +304,10 @@ define <2 x i64> @test_mm_haddq_epu32(<2 x i64> %a0) {
 declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_hsubw_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_hsubw_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vphsubbw %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_hsubw_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vphsubbw %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_hsubw_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphsubbw %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %arg0)
   %bc = bitcast <8 x i16> %res to <2 x i64>
@@ -441,15 +316,10 @@ define <2 x i64> @test_mm_hsubw_epi8(<2 x i64> %a0) {
 declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_hsubd_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm_hsubd_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vphsubwd %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_hsubd_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vphsubwd %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_hsubd_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphsubwd %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %arg0)
   %bc = bitcast <4 x i32> %res to <2 x i64>
@@ -458,15 +328,10 @@ define <2 x i64> @test_mm_hsubd_epi16(<2 x i64> %a0) {
 declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_hsubq_epi32(<2 x i64> %a0) {
-; X32-LABEL: test_mm_hsubq_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vphsubdq %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_hsubq_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vphsubdq %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_hsubq_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vphsubdq %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %res = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %arg0)
   ret <2 x i64> %res
@@ -474,63 +339,39 @@ define <2 x i64> @test_mm_hsubq_epi32(<2 x i64> %a0) {
 declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_cmov_si128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm_cmov_si128:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; X32-NEXT:    vpxor %xmm3, %xmm2, %xmm3
-; X32-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; X32-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; X32-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_cmov_si128:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; X64-NEXT:    vpxor %xmm3, %xmm2, %xmm3
-; X64-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; X64-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_cmov_si128:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; ALL-NEXT:    vpxor %xmm3, %xmm2, %xmm3
+; ALL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; ALL-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; ALL-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
 
 define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
-; X32-LABEL: test_mm256_cmov_si256:
-; X32:       # %bb.0:
-; X32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X32-NEXT:    vcmptrueps %ymm3, %ymm3, %ymm3
-; X32-NEXT:    vxorps %ymm3, %ymm2, %ymm3
-; X32-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; X32-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; X32-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_cmov_si256:
-; X64:       # %bb.0:
-; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X64-NEXT:    vcmptrueps %ymm3, %ymm3, %ymm3
-; X64-NEXT:    vxorps %ymm3, %ymm2, %ymm3
-; X64-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; X64-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; X64-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm256_cmov_si256:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; ALL-NEXT:    vcmptrueps %ymm3, %ymm3, %ymm3
+; ALL-NEXT:    vxorps %ymm3, %ymm2, %ymm3
+; ALL-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; ALL-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; ALL-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
   ret <4 x i64> %res
 }
 declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
 
 define <2 x i64> @test_mm_perm_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm_perm_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_perm_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_perm_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
   %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
@@ -541,15 +382,10 @@ define <2 x i64> @test_mm_perm_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
 declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_rot_epi8(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_rot_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vprotb %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_rot_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vprotb %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_rot_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vprotb %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
   %res = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %arg0, <16 x i8> %arg1)
@@ -559,15 +395,10 @@ define <2 x i64> @test_mm_rot_epi8(<2 x i64> %a0, <2 x i64> %a1) {
 declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_rot_epi16(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_rot_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vprotw %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_rot_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vprotw %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_rot_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vprotw %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %res = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %arg0, <8 x i16> %arg1)
@@ -577,15 +408,10 @@ define <2 x i64> @test_mm_rot_epi16(<2 x i64> %a0, <2 x i64> %a1) {
 declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_rot_epi32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_rot_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vprotd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_rot_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vprotd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_rot_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vprotd %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %arg0, <4 x i32> %arg1)
@@ -595,30 +421,20 @@ define <2 x i64> @test_mm_rot_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_rot_epi64(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_rot_epi64:
-; X32:       # %bb.0:
-; X32-NEXT:    vprotq %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_rot_epi64:
-; X64:       # %bb.0:
-; X64-NEXT:    vprotq %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_rot_epi64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vprotq %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1)
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @test_mm_roti_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_roti_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vprotb $1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_roti_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vprotb $1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_roti_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vprotb $1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %arg0, i8 1)
   %bc = bitcast <16 x i8> %res to <2 x i64>
@@ -627,15 +443,10 @@ define <2 x i64> @test_mm_roti_epi8(<2 x i64> %a0) {
 declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_roti_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm_roti_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vprotw $50, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_roti_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vprotw $50, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_roti_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vprotw $50, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %arg0, i8 50)
   %bc = bitcast <8 x i16> %res to <2 x i64>
@@ -644,15 +455,10 @@ define <2 x i64> @test_mm_roti_epi16(<2 x i64> %a0) {
 declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_roti_epi32(<2 x i64> %a0) {
-; X32-LABEL: test_mm_roti_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vprotd $226, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_roti_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vprotd $226, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_roti_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vprotd $226, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %res = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %arg0, i8 -30)
   %bc = bitcast <4 x i32> %res to <2 x i64>
@@ -661,30 +467,20 @@ define <2 x i64> @test_mm_roti_epi32(<2 x i64> %a0) {
 declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_roti_epi64(<2 x i64> %a0) {
-; X32-LABEL: test_mm_roti_epi64:
-; X32:       # %bb.0:
-; X32-NEXT:    vprotq $100, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_roti_epi64:
-; X64:       # %bb.0:
-; X64-NEXT:    vprotq $100, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_roti_epi64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vprotq $100, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 100)
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_shl_epi8(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_shl_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_shl_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_shl_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
   %res = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %arg0, <16 x i8> %arg1)
@@ -694,15 +490,10 @@ define <2 x i64> @test_mm_shl_epi8(<2 x i64> %a0, <2 x i64> %a1) {
 declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_shl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_shl_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_shl_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_shl_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %arg0, <8 x i16> %arg1)
@@ -712,15 +503,10 @@ define <2 x i64> @test_mm_shl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
 declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_shl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_shl_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshld %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_shl_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshld %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_shl_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %arg0, <4 x i32> %arg1)
@@ -730,30 +516,20 @@ define <2 x i64> @test_mm_shl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_shl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_shl_epi64:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_shl_epi64:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_shl_epi64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1)
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @test_mm_sha_epi8(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_sha_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshab %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_sha_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshab %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_sha_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
   %res = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %arg0, <16 x i8> %arg1)
@@ -763,15 +539,10 @@ define <2 x i64> @test_mm_sha_epi8(<2 x i64> %a0, <2 x i64> %a1) {
 declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_sha_epi16(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_sha_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_sha_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_sha_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %res = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %arg0, <8 x i16> %arg1)
@@ -781,15 +552,10 @@ define <2 x i64> @test_mm_sha_epi16(<2 x i64> %a0, <2 x i64> %a1) {
 declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_sha_epi32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_sha_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshad %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_sha_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshad %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_sha_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %arg0, <4 x i32> %arg1)
@@ -799,30 +565,20 @@ define <2 x i64> @test_mm_sha_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_mm_sha_epi64(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_sha_epi64:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_sha_epi64:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_sha_epi64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1)
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @test_mm_com_epu8(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_com_epu8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcomltub %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_com_epu8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcomltub %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_com_epu8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcomltub %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
   %res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %arg0, <16 x i8> %arg1, i8 0)
@@ -832,15 +588,10 @@ define <2 x i64> @test_mm_com_epu8(<2 x i64> %a0, <2 x i64> %a1) {
 declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_com_epu16(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_com_epu16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcomltuw %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_com_epu16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcomltuw %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_com_epu16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcomltuw %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %arg0, <8 x i16> %arg1, i8 0)
@@ -850,15 +601,10 @@ define <2 x i64> @test_mm_com_epu16(<2 x i64> %a0, <2 x i64> %a1) {
 declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_com_epu32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_com_epu32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcomltud %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_com_epu32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcomltud %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_com_epu32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcomltud %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %arg0, <4 x i32> %arg1, i8 0)
@@ -868,30 +614,20 @@ define <2 x i64> @test_mm_com_epu32(<2 x i64> %a0, <2 x i64> %a1) {
 declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_com_epu64(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_com_epu64:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcomltuq %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_com_epu64:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcomltuq %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_com_epu64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcomltuq %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_com_epi8(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_com_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcomltb %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_com_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcomltb %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_com_epi8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcomltb %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
   %res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %arg0, <16 x i8> %arg1, i8 0)
@@ -901,15 +637,10 @@ define <2 x i64> @test_mm_com_epi8(<2 x i64> %a0, <2 x i64> %a1) {
 declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_com_epi16(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_com_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcomltw %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_com_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcomltw %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_com_epi16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcomltw %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   %res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %arg0, <8 x i16> %arg1, i8 0)
@@ -919,15 +650,10 @@ define <2 x i64> @test_mm_com_epi16(<2 x i64> %a0, <2 x i64> %a1) {
 declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_com_epi32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_com_epi32:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcomltd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_com_epi32:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcomltd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_com_epi32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcomltd %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   %res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %arg0, <4 x i32> %arg1, i8 0)
@@ -937,60 +663,40 @@ define <2 x i64> @test_mm_com_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
 
 define <2 x i64> @test_mm_com_epi64(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_com_epi64:
-; X32:       # %bb.0:
-; X32-NEXT:    vpcomltq %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_com_epi64:
-; X64:       # %bb.0:
-; X64-NEXT:    vpcomltq %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_com_epi64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpcomltq %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
 
 define <2 x double> @test_mm_permute2_pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm_permute2_pd:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_permute2_pd:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_permute2_pd:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
   ret <2 x double> %res
 }
 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
 
 define <4 x double> @test_mm256_permute2_pd(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
-; X32-LABEL: test_mm256_permute2_pd:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_permute2_pd:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm256_permute2_pd:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
   ret <4 x double> %res
 }
 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
 
 define <4 x float> @test_mm_permute2_ps(<4 x float> %a0, <4 x float> %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm_permute2_ps:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_permute2_ps:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_permute2_ps:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
   %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %arg2, i8 0)
   ret <4 x float> %res
@@ -998,15 +704,10 @@ define <4 x float> @test_mm_permute2_ps(<4 x float> %a0, <4 x float> %a1, <2 x i
 declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
 
 define <8 x float> @test_mm256_permute2_ps(<8 x float> %a0, <8 x float> %a1, <4 x i64> %a2) {
-; X32-LABEL: test_mm256_permute2_ps:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_permute2_ps:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm256_permute2_ps:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    ret{{[l|q]}}
   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
   %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %arg2, i8 0)
   ret <8 x float> %res
@@ -1014,90 +715,60 @@ define <8 x float> @test_mm256_permute2_ps(<8 x float> %a0, <8 x float> %a1, <4
 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
 
 define <4 x float> @test_mm_frcz_ss(<4 x float> %a0) {
-; X32-LABEL: test_mm_frcz_ss:
-; X32:       # %bb.0:
-; X32-NEXT:    vfrczss %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_frcz_ss:
-; X64:       # %bb.0:
-; X64-NEXT:    vfrczss %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_frcz_ss:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vfrczss %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0)
   ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
 
 define <2 x double> @test_mm_frcz_sd(<2 x double> %a0) {
-; X32-LABEL: test_mm_frcz_sd:
-; X32:       # %bb.0:
-; X32-NEXT:    vfrczsd %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_frcz_sd:
-; X64:       # %bb.0:
-; X64-NEXT:    vfrczsd %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_frcz_sd:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vfrczsd %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0)
   ret <2 x double> %res
 }
 declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
 
 define <4 x float> @test_mm_frcz_ps(<4 x float> %a0) {
-; X32-LABEL: test_mm_frcz_ps:
-; X32:       # %bb.0:
-; X32-NEXT:    vfrczps %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_frcz_ps:
-; X64:       # %bb.0:
-; X64-NEXT:    vfrczps %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_frcz_ps:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vfrczps %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0)
   ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
 
 define <2 x double> @test_mm_frcz_pd(<2 x double> %a0) {
-; X32-LABEL: test_mm_frcz_pd:
-; X32:       # %bb.0:
-; X32-NEXT:    vfrczpd %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_frcz_pd:
-; X64:       # %bb.0:
-; X64-NEXT:    vfrczpd %xmm0, %xmm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm_frcz_pd:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vfrczpd %xmm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0)
   ret <2 x double> %res
 }
 declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
 
 define <8 x float> @test_mm256_frcz_ps(<8 x float> %a0) {
-; X32-LABEL: test_mm256_frcz_ps:
-; X32:       # %bb.0:
-; X32-NEXT:    vfrczps %ymm0, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_frcz_ps:
-; X64:       # %bb.0:
-; X64-NEXT:    vfrczps %ymm0, %ymm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm256_frcz_ps:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vfrczps %ymm0, %ymm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0)
   ret <8 x float> %res
 }
 declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
 
 define <4 x double> @test_mm256_frcz_pd(<4 x double> %a0) {
-; X32-LABEL: test_mm256_frcz_pd:
-; X32:       # %bb.0:
-; X32-NEXT:    vfrczpd %ymm0, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_frcz_pd:
-; X64:       # %bb.0:
-; X64-NEXT:    vfrczpd %ymm0, %ymm0
-; X64-NEXT:    retq
+; ALL-LABEL: test_mm256_frcz_pd:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vfrczpd %ymm0, %ymm0
+; ALL-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0)
   ret <4 x double> %res
 }
diff --git a/test/CodeGen/X86/xop-schedule.ll b/test/CodeGen/X86/xop-schedule.ll
index ba0073bc63db9cbfbea82930677e960bc9fa6b92..ed621039d38c5d0f410e88c6ef987de9a3668ce1 100644
--- a/test/CodeGen/X86/xop-schedule.ll
+++ b/test/CodeGen/X86/xop-schedule.ll
@@ -14,7 +14,7 @@ define void @test_vfrczpd(<2 x double> %a0, <4 x double> %a1, <2 x double> *%a2,
 ; GENERIC-NEXT:    vfrczpd (%rdi), %xmm0 # sched: [9:1.00]
 ; GENERIC-NEXT:    vfrczpd (%rsi), %ymm1 # sched: [10:1.00]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfrczpd:
@@ -62,7 +62,7 @@ define void @test_vfrczps(<4 x float> %a0, <4 x double> %a1, <4 x float> *%a2, <
 ; GENERIC-NEXT:    vfrczps (%rdi), %xmm0 # sched: [9:1.00]
 ; GENERIC-NEXT:    vfrczps (%rsi), %ymm1 # sched: [10:1.00]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vfrczps:
@@ -221,7 +221,7 @@ define void @test_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i
 ; GENERIC-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vpcmov_256:
@@ -425,7 +425,7 @@ define void @test_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; GENERIC-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vpermil2pd_256:
@@ -509,7 +509,7 @@ define void @test_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %
 ; GENERIC-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER12-LABEL: test_vpermil2ps_256:
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index 3aef8ddf681b7975786cdf6731466b5a20e93cee..65e5e83d7275dcf395d9ad2adb3f231c4b400972 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -493,18 +493,18 @@ define %struct.ref_s* @test12(%struct.ref_s* %op, i64 %osbot, i64 %intval) {
 ;
 ; X64-LIN-LABEL: test12:
 ; X64-LIN:       # %bb.0:
-; X64-LIN-NEXT:    xorq $-1, %rdx
-; X64-LIN-NEXT:    shlq $32, %rdx
-; X64-LIN-NEXT:    sarq $28, %rdx
-; X64-LIN-NEXT:    leaq (%rdx,%rdi), %rax
+; X64-LIN-NEXT:    notl %edx
+; X64-LIN-NEXT:    movslq %edx, %rax
+; X64-LIN-NEXT:    shlq $4, %rax
+; X64-LIN-NEXT:    addq %rdi, %rax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: test12:
 ; X64-WIN:       # %bb.0:
-; X64-WIN-NEXT:    xorq $-1, %r8
-; X64-WIN-NEXT:    shlq $32, %r8
-; X64-WIN-NEXT:    sarq $28, %r8
-; X64-WIN-NEXT:    leaq (%r8,%rcx), %rax
+; X64-WIN-NEXT:    notl %r8d
+; X64-WIN-NEXT:    movslq %r8d, %rax
+; X64-WIN-NEXT:    shlq $4, %rax
+; X64-WIN-NEXT:    addq %rcx, %rax
 ; X64-WIN-NEXT:    retq
   %neg = shl i64 %intval, 32
   %sext = xor i64 %neg, -4294967296
@@ -512,3 +512,32 @@ define %struct.ref_s* @test12(%struct.ref_s* %op, i64 %osbot, i64 %intval) {
   %add.ptr = getelementptr inbounds %struct.ref_s, %struct.ref_s* %op, i64 %idx.ext
   ret %struct.ref_s* %add.ptr
 }
+
+define i32 @PR39657(i8* %p, i64 %x) {
+; X32-LABEL: PR39657:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    notl %ecx
+; X32-NEXT:    movl (%eax,%ecx,4), %eax
+; X32-NEXT:    retl
+;
+; X64-LIN-LABEL: PR39657:
+; X64-LIN:       # %bb.0:
+; X64-LIN-NEXT:    notq %rsi
+; X64-LIN-NEXT:    movl (%rdi,%rsi,4), %eax
+; X64-LIN-NEXT:    retq
+;
+; X64-WIN-LABEL: PR39657:
+; X64-WIN:       # %bb.0:
+; X64-WIN-NEXT:    notq %rdx
+; X64-WIN-NEXT:    movl (%rcx,%rdx,4), %eax
+; X64-WIN-NEXT:    retq
+  %sh = shl i64 %x, 2
+  %mul = xor i64 %sh, -4
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 %mul
+  %bc = bitcast i8* %add.ptr to i32*
+  %load = load i32, i32* %bc, align 4
+  ret i32 %load
+}
+
diff --git a/test/CodeGen/X86/zext-extract_subreg.ll b/test/CodeGen/X86/zext-extract_subreg.ll
index 9e34abb69b398355b1c70c09b3ef8a5a919b983e..0b924d6ea953624695b29e1fbdd73bf908f651bf 100644
--- a/test/CodeGen/X86/zext-extract_subreg.ll
+++ b/test/CodeGen/X86/zext-extract_subreg.ll
@@ -1,7 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
 
 define void @t() nounwind ssp {
 ; CHECK-LABEL: t:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne LBB0_6
+; CHECK-NEXT:  ## %bb.1: ## %if.end.i
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je LBB0_2
+; CHECK-NEXT:  LBB0_6: ## %return
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  LBB0_2: ## %if.end
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne LBB0_5
+; CHECK-NEXT:  ## %bb.3: ## %cond.true190
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne LBB0_5
+; CHECK-NEXT:  ## %bb.4: ## %cond.true225
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:  LBB0_5: ## %cond.false205
+; CHECK-NEXT:    ud2
 entry:
   br i1 undef, label %return, label %if.end.i
 
@@ -10,11 +36,6 @@ if.end.i:                                         ; preds = %entry
   br i1 undef, label %return, label %if.end
 
 if.end:                                           ; preds = %if.end.i
-; CHECK: %if.end
-; CHECK: movl (%{{.*}}), [[REG:%[a-z]+]]
-; CHECK-NOT: movl [[REG]], [[REG]]
-; CHECK-NEXT: testl [[REG]], [[REG]]
-; CHECK-NEXT: xorl
   %tmp138 = select i1 undef, i32 0, i32 %tmp7.i
   %tmp867 = zext i32 %tmp138 to i64
   br label %while.cond
diff --git a/test/CodeGen/X86/zext-logicop-shift-load.ll b/test/CodeGen/X86/zext-logicop-shift-load.ll
index 26182fe3e4cdba7a2fba77e4cebd070fca412b10..319c177b30a6c9bd3abbb8b8f412831717e6d049 100644
--- a/test/CodeGen/X86/zext-logicop-shift-load.ll
+++ b/test/CodeGen/X86/zext-logicop-shift-load.ll
@@ -1,12 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 
 
 define i64 @test1(i8* %data) {
 ; CHECK-LABEL: test1:
-; CHECK:       movzbl
-; CHECK-NEXT:  shlq
-; CHECK-NEXT:  andl
-; CHECK-NEXT:  retq
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    shlq $2, %rax
+; CHECK-NEXT:    andl $60, %eax
+; CHECK-NEXT:    retq
 entry:
   %bf.load = load i8, i8* %data, align 4
   %bf.clear = shl i8 %bf.load, 2
@@ -17,10 +19,11 @@ entry:
 
 define i8* @test2(i8* %data) {
 ; CHECK-LABEL: test2:
-; CHECK:       movzbl
-; CHECK-NEXT:  andl
-; CHECK-NEXT:  leaq
-; CHECK-NEXT:  retq
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    andl $15, %eax
+; CHECK-NEXT:    leaq (%rdi,%rax,4), %rax
+; CHECK-NEXT:    retq
 entry:
   %bf.load = load i8, i8* %data, align 4
   %bf.clear = shl i8 %bf.load, 2
@@ -33,11 +36,12 @@ entry:
 ; If the shift op is SHL, the logic op can only be AND.
 define i64 @test3(i8* %data) {
 ; CHECK-LABEL: test3:
-; CHECK:       movb
-; CHECK-NEXT:  shlb
-; CHECK-NEXT:  xorb
-; CHECK-NEXT:  movzbl
-; CHECK-NEXT:  retq
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movb (%rdi), %al
+; CHECK-NEXT:    shlb $2, %al
+; CHECK-NEXT:    xorb $60, %al
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    retq
 entry:
   %bf.load = load i8, i8* %data, align 4
   %bf.clear = shl i8 %bf.load, 2
@@ -48,10 +52,11 @@ entry:
 
 define i64 @test4(i8* %data) {
 ; CHECK-LABEL: test4:
-; CHECK:       movzbl
-; CHECK-NEXT:  shrq
-; CHECK-NEXT:  andl
-; CHECK-NEXT:  retq
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    shrq $2, %rax
+; CHECK-NEXT:    andl $60, %eax
+; CHECK-NEXT:    retq
 entry:
   %bf.load = load i8, i8* %data, align 4
   %bf.clear = lshr i8 %bf.load, 2
@@ -62,10 +67,11 @@ entry:
 
 define i64 @test5(i8* %data) {
 ; CHECK-LABEL: test5:
-; CHECK:       movzbl
-; CHECK-NEXT:  shrq
-; CHECK-NEXT:  xorq
-; CHECK-NEXT:  retq
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    shrq $2, %rax
+; CHECK-NEXT:    xorq $60, %rax
+; CHECK-NEXT:    retq
 entry:
   %bf.load = load i8, i8* %data, align 4
   %bf.clear = lshr i8 %bf.load, 2
@@ -76,10 +82,11 @@ entry:
 
 define i64 @test6(i8* %data) {
 ; CHECK-LABEL: test6:
-; CHECK:       movzbl
-; CHECK-NEXT:  shrq
-; CHECK-NEXT:  orq
-; CHECK-NEXT:  retq
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    shrq $2, %rax
+; CHECK-NEXT:    orq $60, %rax
+; CHECK-NEXT:    retq
 entry:
   %bf.load = load i8, i8* %data, align 4
   %bf.clear = lshr i8 %bf.load, 2
@@ -91,10 +98,13 @@ entry:
 ; Load is folded with sext.
 define i64 @test8(i8* %data) {
 ; CHECK-LABEL: test8:
-; CHECK:       movsbl
-; CHECK-NEXT:  movzwl
-; CHECK-NEXT:  shrl
-; CHECK-NEXT:  orl
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movsbl (%rdi), %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    shrl $2, %eax
+; CHECK-NEXT:    orl $60, %eax
+; CHECK-NEXT:    movl %eax, %eax
+; CHECK-NEXT:    retq
 entry:
   %bf.load = load i8, i8* %data, align 4
   %ext = sext i8 %bf.load to i16
diff --git a/test/CodeGen/XCore/codemodel.ll b/test/CodeGen/XCore/codemodel.ll
index d18525f3a9be7168458d696c1300a9435855cfb7..93b9d6d911df24b9a975d0e2c6f06c5ce1868779 100644
--- a/test/CodeGen/XCore/codemodel.ll
+++ b/test/CodeGen/XCore/codemodel.ll
@@ -1,6 +1,7 @@
 
 ; RUN: not llc < %s -march=xcore -code-model=medium 2>&1 | FileCheck %s -check-prefix=BAD_CM
 ; RUN: not llc < %s -march=xcore -code-model=kernel 2>&1 | FileCheck %s -check-prefix=BAD_CM
+; RUN: not llc < %s -march=xcore -code-model=tiny 2>&1 | FileCheck %s -check-prefix=BAD_CM
 ; BAD_CM: Target only supports CodeModel Small or Large
 
 
diff --git a/test/DebugInfo/COFF/defer-complete-type.ll b/test/DebugInfo/COFF/defer-complete-type.ll
index 109046c914d29e8fad7a372d8f5c066421888cac..7a5d552be4f8800e832aa7b850db27bfde61f49f 100644
--- a/test/DebugInfo/COFF/defer-complete-type.ll
+++ b/test/DebugInfo/COFF/defer-complete-type.ll
@@ -30,7 +30,6 @@
 ; CHECK:   Pointer (0x1001) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: A (0x1000)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -72,7 +71,6 @@
 ; CHECK:   Pointer (0x1005) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: B (0x1004)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
diff --git a/test/DebugInfo/COFF/frameproc-flags.ll b/test/DebugInfo/COFF/frameproc-flags.ll
index a9d0fe067da6e77b9019c07a95c84c701504318a..b2a1ef97cfb5a9090c6e3327fc631568b98d74d4 100644
--- a/test/DebugInfo/COFF/frameproc-flags.ll
+++ b/test/DebugInfo/COFF/frameproc-flags.ll
@@ -62,44 +62,44 @@
 ; }
 ; }
 
-; CHECK-LABEL: S_GPROC32_ID [size = 50] `use_alloca`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_GPROC32_ID [size = 52] `use_alloca`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = VFRAME, param fp reg = EBP
 ; CHECK:   flags = has alloca | secure checks | opt speed
-; CHECK-LABEL: S_GPROC32_ID [size = 51] `call_setjmp`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_GPROC32_ID [size = 52] `call_setjmp`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = NONE, param fp reg = NONE
 ; CHECK:   flags = has setjmp | opt speed
-; CHECK-LABEL: S_GPROC32_ID [size = 53] `use_inlineasm`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_GPROC32_ID [size = 56] `use_inlineasm`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = NONE, param fp reg = NONE
 ; CHECK:   flags = has inline asm | opt speed
-; CHECK-LABEL: S_GPROC32_ID [size = 46] `cpp_eh`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_GPROC32_ID [size = 48] `cpp_eh`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = EBP, param fp reg = EBP
 ; CHECK:   flags = has eh | opt speed
-; CHECK-LABEL: S_GPROC32_ID [size = 50] `use_inline`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_GPROC32_ID [size = 52] `use_inline`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = NONE, param fp reg = NONE
 ; CHECK:   flags = opt speed
 ; CHECK-LABEL: S_LPROC32_ID [size = 56] `is_marked_inline`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = NONE, param fp reg = NONE
 ; CHECK:   flags = marked inline | opt speed
-; CHECK-LABEL: S_GPROC32_ID [size = 43] `seh`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_GPROC32_ID [size = 44] `seh`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = EBP, param fp reg = EBP
 ; CHECK:   flags = has seh | opt speed
-; CHECK-LABEL: S_LPROC32_ID [size = 55] `?filt$0@0@seh@@`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_LPROC32_ID [size = 56] `?filt$0@0@seh@@`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = EBP, param fp reg = EBP
 ; CHECK:   flags = opt speed
-; CHECK-LABEL: S_GPROC32_ID [size = 49] `use_naked`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_GPROC32_ID [size = 52] `use_naked`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = NONE, param fp reg = NONE
 ; CHECK:   flags = has inline asm | naked | opt speed
-; CHECK-LABEL: S_GPROC32_ID [size = 51] `stack_guard`
-; CHECK: S_FRAMEPROC [size = 30]
+; CHECK-LABEL: S_GPROC32_ID [size = 52] `stack_guard`
+; CHECK: S_FRAMEPROC [size = 32]
 ; CHECK:   local fp reg = VFRAME, param fp reg = EBP
 ; CHECK:   flags = secure checks | opt speed
 
diff --git a/test/DebugInfo/COFF/function-options.ll b/test/DebugInfo/COFF/function-options.ll
index 0a8c3fba9ecc1947ec8df1d58eace55462b81da7..57d644724e54a73b91378ae31d7eae7a6cb287c9 100644
--- a/test/DebugInfo/COFF/function-options.ll
+++ b/test/DebugInfo/COFF/function-options.ll
@@ -82,7 +82,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: BClass ({{.*}})
-; CHECK:     ThisType: BClass* ({{.*}})
+; CHECK:     ThisType: BClass* const ({{.*}})
 ; CHECK:     CallingConvention: NearC (0x0)
 ; CHECK:     FunctionOptions [ (0x2)
 ; CHECK:       Constructor (0x2)
@@ -119,7 +119,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: C1Class ({{.*}})
-; CHECK:     ThisType: C1Class* ({{.*}})
+; CHECK:     ThisType: C1Class* const ({{.*}})
 ; CHECK:     CallingConvention: NearC (0x0)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
@@ -156,7 +156,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: C2Class ({{.*}})
-; CHECK:     ThisType: C2Class* ({{.*}})
+; CHECK:     ThisType: C2Class* const ({{.*}})
 ; CHECK:     CallingConvention: NearC (0x0)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
@@ -257,7 +257,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: BStruct ({{.*}})
-; CHECK:     ThisType: BStruct* ({{.*}})
+; CHECK:     ThisType: BStruct* const ({{.*}})
 ; CHECK:     CallingConvention: NearC (0x0)
 ; CHECK:     FunctionOptions [ (0x2)
 ; CHECK:       Constructor (0x2)
@@ -309,7 +309,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: BUnion ({{.*}})
-; CHECK:     ThisType: BUnion* ({{.*}})
+; CHECK:     ThisType: BUnion* const ({{.*}})
 ; CHECK:     CallingConvention: NearC (0x0)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
diff --git a/test/DebugInfo/COFF/global-type-hashes.ll b/test/DebugInfo/COFF/global-type-hashes.ll
index 2262a5458c3f91a506b9b815cffc8dca25793657..0bb68d40a7d8317e1bd42dad2312ee3a0d626da3 100644
--- a/test/DebugInfo/COFF/global-type-hashes.ll
+++ b/test/DebugInfo/COFF/global-type-hashes.ll
@@ -1,7 +1,6 @@
-; RUN: llc -filetype=obj -emit-codeview-ghash-section < %s > %t.obj
+; RUN: llc -filetype=obj < %s > %t.obj
 ; RUN: obj2yaml %t.obj | FileCheck %s --check-prefix=YAML
-; RUN: llc -filetype=asm -emit-codeview-ghash-section < %s \
-; RUN:   | FileCheck %s --check-prefix=ASM
+; RUN: llc -filetype=asm < %s | FileCheck %s --check-prefix=ASM
 
 ; C++ source to regenerate:
 ; $ cat t.cpp
@@ -96,14 +95,16 @@ attributes #1 = { nounwind readnone speculatable }
 attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.module.flags = !{!3, !4, !5, !6, !100}
 !llvm.ident = !{!7}
 
+
 !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
 !1 = !DIFile(filename: "<stdin>", directory: "D:\5Csrc\5Cllvmbuild\5Cclang\5CDebug\5Cx86", checksumkind: CSK_MD5, checksum: "6279449503d9075c38e615e8387667c3")
 !2 = !{}
 !3 = !{i32 1, !"NumRegisterParameters", i32 0}
 !4 = !{i32 2, !"CodeView", i32 1}
+!100 = !{i32 2, !"CodeViewGHash", i32 1}
 !5 = !{i32 2, !"Debug Info Version", i32 3}
 !6 = !{i32 1, !"wchar_size", i32 2}
 !7 = !{!"clang version 6.0.0 "}
@@ -186,7 +187,7 @@ attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-ma
 ; YAML:       - Kind:            LF_POINTER
 ; YAML:         Pointer:
 ; YAML:           ReferentType:    4100
-; YAML:           Attrs:           32778
+; YAML:           Attrs:           33802
 ; YAML:       - Kind:            LF_ARGLIST
 ; YAML:         ArgList:
 ; YAML:           ArgIndices:      [ 116, 116 ]
@@ -280,17 +281,21 @@ attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-ma
 ; YAML:         - DF04AA3125BBC50E
 ; YAML:         - 95CEBA304A2C4493
 ; YAML:         - C324F82D24D22283
-; YAML:         - 74698BE366891D3D
+; YAML:         - BB039258F2425BCF
 ; YAML:         - DDE23757322DB7C3
-; YAML:         - 1692305447C3D360
+; YAML:         - C935E8E1F016CC27
 ; YAML:         - D341E2F9BE57A1C7
-; YAML:         - 243F76AED6D8FB79
-; YAML:         - C75E2FD5D79D363D
-; YAML:         - 30FB9353B361B4B8
+; YAML:         - DD327744BE6783A4
+; YAML:         - 479521BB013A4AEC
+; YAML:         - 7820AAA31FC8CC67
 ; YAML:         - 0634944401BCC520
-; YAML:         - DF0B7BE0B5AD5B4F
-; YAML:         - 0916E74F3860DA92
-; YAML:         - 17D3CDC3384402DD
+; YAML:         - F380373F5DE8E9A3
+; YAML:         - 2ADB463E9E726E20
+; YAML:         - 74698BE366891D3D
+; YAML:         - 4470750F2E319329
+; YAML:         - 0FB556FD1FAB66D7
+; YAML:         - 5970EFB4874D0F3F
+; YAML:         - EDB1D74C120CF44A
 ; ...
 
 
diff --git a/test/DebugInfo/COFF/globals.ll b/test/DebugInfo/COFF/globals.ll
index fac53e1805ccc90d1f5be7d932bef912a7bb74b5..f26e11c94e9f42f075daa1a19e9692513000af5f 100644
--- a/test/DebugInfo/COFF/globals.ll
+++ b/test/DebugInfo/COFF/globals.ll
@@ -59,7 +59,6 @@
 ; OBJ:   Pointer (0x1001) {
 ; OBJ:     TypeLeafKind: LF_POINTER (0x1002)
 ; OBJ:     PointeeType: const int (0x1000)
-; OBJ:     PointerAttributes: 0x1000C
 ; OBJ:     PtrType: Near64 (0xC)
 ; OBJ:     PtrMode: Pointer (0x0)
 ; OBJ:     IsFlat: 0
diff --git a/test/DebugInfo/COFF/multifunction.ll b/test/DebugInfo/COFF/multifunction.ll
index c74b332d8b848f7fdf84e8b3454d1e1b90374645..1c19b34ab9386685ebde0c26937b74718a2b5c14 100644
--- a/test/DebugInfo/COFF/multifunction.ll
+++ b/test/DebugInfo/COFF/multifunction.ll
@@ -80,6 +80,7 @@
 ; X86-NEXT: .secidx _x
 ; X86-NEXT: .byte   0
 ; X86-NEXT: .asciz "x"
+; X86-NEXT: .p2align 2
 ; X86-NEXT: [[PROC_SEGMENT_END]]:
 ; X86-NEXT: .short  [[FPROC_END:[^ ]*]]-[[FPROC_BEG:[^ ]*]]           # Record length
 ; X86-NEXT: [[FPROC_BEG]]:
@@ -91,6 +92,7 @@
 ; X86-NEXT: .long   0                       # Exception handler offset
 ; X86-NEXT: .short  0                       # Exception handler section
 ; X86-NEXT: .long   0                       # Flags (defines frame register)
+; X86-NEXT: .p2align 2
 ; X86-NEXT: [[FPROC_END]]:
 ; X86-NEXT: .short  2
 ; X86-NEXT: .short  4431
@@ -117,6 +119,7 @@
 ; X86-NEXT: .secidx _y
 ; X86-NEXT: .byte   0
 ; X86-NEXT: .asciz "y"
+; X86-NEXT: .p2align 2
 ; X86-NEXT: [[PROC_SEGMENT_END]]:
 ; X86-NEXT: .short  [[FPROC_END:[^ ]*]]-[[FPROC_BEG:[^ ]*]]           # Record length
 ; X86-NEXT: [[FPROC_BEG]]:
@@ -128,6 +131,7 @@
 ; X86-NEXT: .long   0                       # Exception handler offset
 ; X86-NEXT: .short  0                       # Exception handler section
 ; X86-NEXT: .long   0                       # Flags (defines frame register)
+; X86-NEXT: .p2align 2
 ; X86-NEXT: [[FPROC_END]]:
 ; X86-NEXT: .short  2
 ; X86-NEXT: .short  4431
@@ -154,6 +158,7 @@
 ; X86-NEXT: .secidx _f
 ; X86-NEXT: .byte   0
 ; X86-NEXT: .asciz "f"
+; X86-NEXT: .p2align 2
 ; X86-NEXT: [[PROC_SEGMENT_END]]:
 ; X86-NEXT: .short  [[FPROC_END:[^ ]*]]-[[FPROC_BEG:[^ ]*]]           # Record length
 ; X86-NEXT: [[FPROC_BEG]]:
@@ -165,6 +170,7 @@
 ; X86-NEXT: .long   0                       # Exception handler offset
 ; X86-NEXT: .short  0                       # Exception handler section
 ; X86-NEXT: .long   0                       # Flags (defines frame register)
+; X86-NEXT: .p2align 2
 ; X86-NEXT: [[FPROC_END]]:
 ; X86-NEXT: .short  2
 ; X86-NEXT: .short  4431
@@ -386,6 +392,7 @@
 ; X64-NEXT: .secidx x
 ; X64-NEXT: .byte   0
 ; X64-NEXT: .asciz "x"
+; X64-NEXT: .p2align 2
 ; X64-NEXT: [[PROC_SEGMENT_END]]:
 ; X64-NEXT: .short  [[FPROC_END:[^ ]*]]-[[FPROC_BEG:[^ ]*]]           # Record length
 ; X64-NEXT: [[FPROC_BEG]]:
@@ -397,6 +404,7 @@
 ; X64-NEXT: .long   0                       # Exception handler offset
 ; X64-NEXT: .short  0                       # Exception handler section
 ; X64-NEXT: .long   81920                       # Flags (defines frame register)
+; X64-NEXT: .p2align 2
 ; X64-NEXT: [[FPROC_END]]:
 ; X64-NEXT: .short  2
 ; X64-NEXT: .short  4431
@@ -422,6 +430,7 @@
 ; X64-NEXT: .secidx y
 ; X64-NEXT: .byte   0
 ; X64-NEXT: .asciz "y"
+; X64-NEXT: .p2align 2
 ; X64-NEXT: [[PROC_SEGMENT_END]]:
 ; X64-NEXT: .short  [[FPROC_END:[^ ]*]]-[[FPROC_BEG:[^ ]*]]           # Record length
 ; X64-NEXT: [[FPROC_BEG]]:
@@ -433,6 +442,7 @@
 ; X64-NEXT: .long   0                       # Exception handler offset
 ; X64-NEXT: .short  0                       # Exception handler section
 ; X64-NEXT: .long   81920                       # Flags (defines frame register)
+; X64-NEXT: .p2align 2
 ; X64-NEXT: [[FPROC_END]]:
 ; X64-NEXT: .short  2
 ; X64-NEXT: .short  4431
@@ -458,6 +468,7 @@
 ; X64-NEXT: .secidx f
 ; X64-NEXT: .byte   0
 ; X64-NEXT: .asciz "f"
+; X64-NEXT: .p2align 2
 ; X64-NEXT: [[PROC_SEGMENT_END]]:
 ; X64-NEXT: .short  [[FPROC_END:[^ ]*]]-[[FPROC_BEG:[^ ]*]]           # Record length
 ; X64-NEXT: [[FPROC_BEG]]:
@@ -469,6 +480,7 @@
 ; X64-NEXT: .long   0                       # Exception handler offset
 ; X64-NEXT: .short  0                       # Exception handler section
 ; X64-NEXT: .long   81920                       # Flags (defines frame register)
+; X64-NEXT: .p2align 2
 ; X64-NEXT: [[FPROC_END]]:
 ; X64-NEXT: .short  2
 ; X64-NEXT: .short  4431
@@ -488,16 +500,16 @@
 ; OBJ64:      Relocations [
 ; OBJ64-NEXT:   0x64 IMAGE_REL_AMD64_SECREL x
 ; OBJ64-NEXT:   0x68 IMAGE_REL_AMD64_SECTION x
-; OBJ64-NEXT:   0x98 IMAGE_REL_AMD64_SECREL x
-; OBJ64-NEXT:   0x9C IMAGE_REL_AMD64_SECTION x
-; OBJ64-NEXT:   0xFC IMAGE_REL_AMD64_SECREL y
-; OBJ64-NEXT:   0x100 IMAGE_REL_AMD64_SECTION y
-; OBJ64-NEXT:   0x130 IMAGE_REL_AMD64_SECREL y
-; OBJ64-NEXT:   0x134 IMAGE_REL_AMD64_SECTION y
-; OBJ64-NEXT:   0x194 IMAGE_REL_AMD64_SECREL f
-; OBJ64-NEXT:   0x198 IMAGE_REL_AMD64_SECTION f
-; OBJ64-NEXT:   0x1C8 IMAGE_REL_AMD64_SECREL f
-; OBJ64-NEXT:   0x1CC IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x9C IMAGE_REL_AMD64_SECREL x
+; OBJ64-NEXT:   0xA0 IMAGE_REL_AMD64_SECTION x
+; OBJ64-NEXT:   0x100 IMAGE_REL_AMD64_SECREL y
+; OBJ64-NEXT:   0x104 IMAGE_REL_AMD64_SECTION y
+; OBJ64-NEXT:   0x138 IMAGE_REL_AMD64_SECREL y
+; OBJ64-NEXT:   0x13C IMAGE_REL_AMD64_SECTION y
+; OBJ64-NEXT:   0x19C IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x1A0 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x1D4 IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x1D8 IMAGE_REL_AMD64_SECTION f
 ; OBJ64-NEXT: ]
 ; OBJ64:      Subsection [
 ; OBJ64-NEXT:   SubSectionType: Symbols (0xF1)
diff --git a/test/DebugInfo/COFF/simple.ll b/test/DebugInfo/COFF/simple.ll
index 3c495a33b8a25770ff38371759d9609c78c7b12b..58d0f42a67ada08da8e28f6a5350df9d2f1f0747 100644
--- a/test/DebugInfo/COFF/simple.ll
+++ b/test/DebugInfo/COFF/simple.ll
@@ -58,6 +58,7 @@
 ; X86-NEXT: .secidx _f
 ; X86-NEXT: .byte   0
 ; X86-NEXT: .asciz "f"
+; X86-NEXT: .p2align 2
 ; X86-NEXT: [[PROC_SEGMENT_END]]:
 ; X86-NEXT: .short  [[FPROC_END:[^ ]*]]-[[FPROC_BEG:[^ ]*]]           # Record length
 ; X86-NEXT: [[FPROC_BEG]]:
@@ -69,6 +70,7 @@
 ; X86-NEXT: .long   0                       # Exception handler offset
 ; X86-NEXT: .short  0                       # Exception handler section
 ; X86-NEXT: .long   0                       # Flags (defines frame register)
+; X86-NEXT: .p2align 2
 ; X86-NEXT: [[FPROC_END]]:
 ; X86-NEXT: .short  2
 ; X86-NEXT: .short  4431
@@ -89,8 +91,8 @@
 ; OBJ32-NEXT:   0x44 IMAGE_REL_I386_DIR32NB _f
 ; OBJ32-NEXT:   0x90 IMAGE_REL_I386_SECREL _f
 ; OBJ32-NEXT:   0x94 IMAGE_REL_I386_SECTION _f
-; OBJ32-NEXT:   0xC4 IMAGE_REL_I386_SECREL _f
-; OBJ32-NEXT:   0xC8 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0xC8 IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0xCC IMAGE_REL_I386_SECTION _f
 ; OBJ32-NEXT: ]
 ; OBJ32:      Subsection [
 ; OBJ32-NEXT:   SubSectionType: Symbols (0xF1)
@@ -184,6 +186,7 @@
 ; X64-NEXT: .secidx f
 ; X64-NEXT: .byte   0
 ; X64-NEXT: .asciz "f"
+; X64-NEXT: .p2align 2
 ; X64-NEXT: [[PROC_SEGMENT_END]]:
 ; X64-NEXT: .short  [[FPROC_END:[^ ]*]]-[[FPROC_BEG:[^ ]*]]           # Record length
 ; X64-NEXT: [[FPROC_BEG]]:
@@ -195,6 +198,7 @@
 ; X64-NEXT: .long   0                       # Exception handler offset
 ; X64-NEXT: .short  0                       # Exception handler section
 ; X64-NEXT: .long   81920                       # Flags (defines frame register)
+; X64-NEXT: .p2align 2
 ; X64-NEXT: [[FPROC_END]]:
 ; X64-NEXT: .short  2
 ; X64-NEXT: .short  4431
@@ -214,8 +218,8 @@
 ; OBJ64:      Relocations [
 ; OBJ64-NEXT:   0x64 IMAGE_REL_AMD64_SECREL f
 ; OBJ64-NEXT:   0x68 IMAGE_REL_AMD64_SECTION f
-; OBJ64-NEXT:   0x98 IMAGE_REL_AMD64_SECREL f
-; OBJ64-NEXT:   0x9C IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x9C IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0xA0 IMAGE_REL_AMD64_SECTION f
 ; OBJ64-NEXT: ]
 ; OBJ64:      Subsection [
 ; OBJ64-NEXT:   SubSectionType: Symbols (0xF1)
diff --git a/test/DebugInfo/COFF/static-methods.ll b/test/DebugInfo/COFF/static-methods.ll
index 1ff2650a3292a79b19d23d1c26b931d9946e474c..19c422ccaf3dc9f4e1c1520f9bdb4960e56dabd9 100644
--- a/test/DebugInfo/COFF/static-methods.ll
+++ b/test/DebugInfo/COFF/static-methods.ll
@@ -28,7 +28,7 @@
 ; CHECK-NEXT:    TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK-NEXT:    ReturnType: void (0x3)
 ; CHECK-NEXT:    ClassType: A ({{.*}})
-; CHECK-NEXT:    ThisType: A* ({{.*}})
+; CHECK-NEXT:    ThisType: A* const ({{.*}})
 ; CHECK-NEXT:    CallingConvention: ThisCall (0xB)
 ; CHECK-NEXT:    FunctionOptions [ (0x0)
 ; CHECK-NEXT:    ]
diff --git a/test/DebugInfo/COFF/thunk.ll b/test/DebugInfo/COFF/thunk.ll
index eac48ba1284f6541b9df1ca6b83c96c360c9a140..c8f5c5ad4367fcaca649aa2ba54b5a465c4ef904 100644
--- a/test/DebugInfo/COFF/thunk.ll
+++ b/test/DebugInfo/COFF/thunk.ll
@@ -70,6 +70,7 @@
 ; ASM-NEXT:   .short  Lfunc_end{{.*}}-"[[NAME1]]" # Code size 
 ; ASM-NEXT:   .byte   0                       # Ordinal 
 ; ASM-NEXT:   .asciz  "[[NAME1]]"             # Function name 
+; ASM-NEXT:   .p2align 2
 ; ASM-NEXT: [[END1]]:
 ; ASM-NEXT:   .short  2                       # Record length 
 ; ASM-NEXT:   .short  4431                    # Record kind: S_PROC_ID_END 
@@ -88,6 +89,7 @@
 ; ASM-NEXT:   .short Lfunc_end{{.*}}-"[[NAME2]]" # Code size
 ; ASM-NEXT:   .byte 0                         # Ordinal
 ; ASM-NEXT:   .asciz "[[NAME2]]"              # Function name
+; ASM-NEXT:   .p2align 2
 ; ASM-NEXT: [[END2]]:
 ; ASM-NEXT:   .short 2                        # Record length
 ; ASM-NEXT:   .short 4431                     # Record kind: S_PROC_ID_END 
diff --git a/test/DebugInfo/COFF/type-quals.ll b/test/DebugInfo/COFF/type-quals.ll
index 9bcc58f3556be58a02c0e8c3f65ac26b278acd4c..c81daa81d0c0efecef8acca0e1a6aa0edeb349fb 100644
--- a/test/DebugInfo/COFF/type-quals.ll
+++ b/test/DebugInfo/COFF/type-quals.ll
@@ -52,7 +52,6 @@
 ; CHECK:   Pointer (0x1001) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: const volatile int (0x1000)
-; CHECK:     PointerAttributes: 0x1100C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -99,7 +98,6 @@
 ; CHECK:   Pointer (0x1006) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x1100C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -112,7 +110,6 @@
 ; CHECK:   Pointer (0x1007) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: float (0x40)
-; CHECK:     PointerAttributes: 0x1100C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -125,7 +122,6 @@
 ; CHECK:   Pointer (0x1008) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x1120C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -170,7 +166,6 @@
 ; CHECK:   Pointer (0x100E) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: const int (0x100D)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -183,7 +178,6 @@
 ; CHECK:   Pointer (0x100F) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x1102C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: LValueReference (0x1)
 ; CHECK:     IsFlat: 0
@@ -252,11 +246,10 @@
 ; CHECK:   Pointer (0x1017) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: h::Foo (0x1016)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
-; CHECK:     IsConst: 0
+; CHECK:     IsConst: 1
 ; CHECK:     IsVolatile: 0
 ; CHECK:     IsUnaligned: 0
 ; CHECK:     IsRestrict: 0
@@ -273,7 +266,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: int (0x74)
 ; CHECK:     ClassType: h::Foo (0x1016)
-; CHECK:     ThisType: h::Foo* (0x1017)
+; CHECK:     ThisType: h::Foo* const (0x1017)
 ; CHECK:     CallingConvention: NearC (0x0)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
@@ -313,7 +306,6 @@
 ; CHECK:   Pointer (0x101D) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x904C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToDataMember (0x2)
 ; CHECK:     IsFlat: 0
@@ -328,7 +320,6 @@
 ; CHECK:   Pointer (0x101E) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int h::Foo::(int) (0x1019)
-; CHECK:     PointerAttributes: 0x1006C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToMemberFunction (0x3)
 ; CHECK:     IsFlat: 0
@@ -346,16 +337,28 @@
 ; CHECK:     FunctionType: int h::Foo::(int) (0x1019)
 ; CHECK:     Name: func
 ; CHECK:   }
-; CHECK:   Modifier (0x1020) {
+; CHECK:   Pointer (0x1020) {
+; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
+; CHECK:     PointeeType: h::Foo (0x1016)
+; CHECK:     PtrType: Near64 (0xC)
+; CHECK:     PtrMode: Pointer (0x0)
+; CHECK:     IsFlat: 0
+; CHECK:     IsConst: 0
+; CHECK:     IsVolatile: 0
+; CHECK:     IsUnaligned: 0
+; CHECK:     IsRestrict: 0
+; CHECK:     SizeOf: 8
+; CHECK:   }
+; CHECK:   Modifier (0x1021) {
 ; CHECK:     TypeLeafKind: LF_MODIFIER (0x1001)
 ; CHECK:     ModifiedType: char (0x70)
 ; CHECK:     Modifiers [ (0x1)
 ; CHECK:       Const (0x1)
 ; CHECK:     ]
 ; CHECK:   }
-; CHECK:   Array (0x1021) {
+; CHECK:   Array (0x1022) {
 ; CHECK:     TypeLeafKind: LF_ARRAY (0x1503)
-; CHECK:     ElementType: const char (0x1020)
+; CHECK:     ElementType: const char (0x1021)
 ; CHECK:     IndexType: unsigned __int64 (0x23)
 ; CHECK:     SizeOf: 4
 ; CHECK:     Name:
diff --git a/test/DebugInfo/COFF/types-array-advanced.ll b/test/DebugInfo/COFF/types-array-advanced.ll
index 7ddca6ca5164bb6f971c0b0923db9e8348feaac5..e1a8c1841bb526ec39dafa84007f344b1c8ddc6a 100644
--- a/test/DebugInfo/COFF/types-array-advanced.ll
+++ b/test/DebugInfo/COFF/types-array-advanced.ll
@@ -53,7 +53,6 @@
 ; CHECK:   Pointer (0x1004) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: 0x1003
-; CHECK:     PointerAttributes: 0x2A
 ; CHECK:     PtrType: Near32 (0xA)
 ; CHECK:     PtrMode: LValueReference (0x1)
 ; CHECK:     IsFlat: 0
@@ -107,7 +106,6 @@
 ; CHECK:   Pointer (0x100A) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: 0x1009
-; CHECK:     PointerAttributes: 0x800A
 ; CHECK:     PtrType: Near32 (0xA)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
diff --git a/test/DebugInfo/COFF/types-basic.ll b/test/DebugInfo/COFF/types-basic.ll
index 57282dad6de23d53a0edd87e161fa6bb0e12ceed..7b1bb8525020ed99aada70a0dbed24e388367d20 100644
--- a/test/DebugInfo/COFF/types-basic.ll
+++ b/test/DebugInfo/COFF/types-basic.ll
@@ -69,7 +69,6 @@
 ; CHECK:   Pointer (0x1004) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: const int (0x1003)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -92,7 +91,6 @@
 ; CHECK:   Pointer (0x1006) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x804C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToDataMember (0x2)
 ; CHECK:     IsFlat: 0
@@ -105,11 +103,10 @@
 ; CHECK:   Pointer (0x1007) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: A (0x1005)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
-; CHECK:     IsConst: 0
+; CHECK:     IsConst: 1
 ; CHECK:     IsVolatile: 0
 ; CHECK:     IsUnaligned: 0
 ; CHECK:   }
@@ -123,7 +120,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: A (0x1005)
-; CHECK:     ThisType: A* (0x1007)
+; CHECK:     ThisType: A* const (0x1007)
 ; CHECK:     CallingConvention: NearC (0x0)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
@@ -170,7 +167,6 @@
 ; CHECK:   Pointer (0x100E) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: void A::() (0x1009)
-; CHECK:     PointerAttributes: 0x1006C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToMemberFunction (0x3)
 ; CHECK:     IsFlat: 0
@@ -190,7 +186,6 @@
 ; CHECK:   Pointer (0x1010) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: const void (0x100F)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
diff --git a/test/DebugInfo/COFF/types-calling-conv.ll b/test/DebugInfo/COFF/types-calling-conv.ll
index 4be18256f8ab406dcc90581dac3ba861f6ae342b..3e0d9f1e36415c7e5aa6905cd328af392c18ad1a 100644
--- a/test/DebugInfo/COFF/types-calling-conv.ll
+++ b/test/DebugInfo/COFF/types-calling-conv.ll
@@ -30,11 +30,10 @@
 ; CHECK:   Pointer (0x1001) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: A (0x1000)
-; CHECK:     PointerAttributes: 0x800A
 ; CHECK:     PtrType: Near32 (0xA)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
-; CHECK:     IsConst: 0
+; CHECK:     IsConst: 1
 ; CHECK:     IsVolatile: 0
 ; CHECK:     IsUnaligned: 0
 ; CHECK:     SizeOf: 4
@@ -49,7 +48,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: A (0x1000)
-; CHECK:     ThisType: A* (0x1001)
+; CHECK:     ThisType: A* const (0x1001)
 ; CHECK:     CallingConvention: ThisCall (0xB)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
@@ -93,7 +92,19 @@
 ; CHECK:     FunctionType: void A::() (0x1003)
 ; CHECK:     Name: A::thiscallcc
 ; CHECK:   }
-; CHECK:   Procedure (0x1009) {
+; CHECK:   Pointer (0x1009) {
+; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
+; CHECK:     PointeeType: A (0x1000)
+; CHECK:     PtrType: Near32 (0xA)
+; CHECK:     PtrMode: Pointer (0x0)
+; CHECK:     IsFlat: 0
+; CHECK:     IsConst: 0
+; CHECK:     IsVolatile: 0
+; CHECK:     IsUnaligned: 0
+; CHECK:     IsRestrict: 0
+; CHECK:     SizeOf: 4
+; CHECK:   }
+; CHECK:   Procedure (0x100A) {
 ; CHECK:     TypeLeafKind: LF_PROCEDURE (0x1008)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     CallingConvention: NearC (0x0)
@@ -102,13 +113,13 @@
 ; CHECK:     NumParameters: 0
 ; CHECK:     ArgListType: () (0x1002)
 ; CHECK:   }
-; CHECK:   FuncId (0x100A) {
+; CHECK:   FuncId (0x100B) {
 ; CHECK:     TypeLeafKind: LF_FUNC_ID (0x1601)
 ; CHECK:     ParentScope: 0x0
-; CHECK:     FunctionType: void () (0x1009)
+; CHECK:     FunctionType: void () (0x100A)
 ; CHECK:     Name: cdeclcc
 ; CHECK:   }
-; CHECK:   Procedure (0x100B) {
+; CHECK:   Procedure (0x100C) {
 ; CHECK:     TypeLeafKind: LF_PROCEDURE (0x1008)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     CallingConvention: NearFast (0x4)
@@ -117,13 +128,13 @@
 ; CHECK:     NumParameters: 0
 ; CHECK:     ArgListType: () (0x1002)
 ; CHECK:   }
-; CHECK:   FuncId (0x100C) {
+; CHECK:   FuncId (0x100D) {
 ; CHECK:     TypeLeafKind: LF_FUNC_ID (0x1601)
 ; CHECK:     ParentScope: 0x0
-; CHECK:     FunctionType: void () (0x100B)
+; CHECK:     FunctionType: void () (0x100C)
 ; CHECK:     Name: fastcallcc
 ; CHECK:   }
-; CHECK:   Procedure (0x100D) {
+; CHECK:   Procedure (0x100E) {
 ; CHECK:     TypeLeafKind: LF_PROCEDURE (0x1008)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     CallingConvention: NearStdCall (0x7)
@@ -132,13 +143,13 @@
 ; CHECK:     NumParameters: 0
 ; CHECK:     ArgListType: () (0x1002)
 ; CHECK:   }
-; CHECK:   FuncId (0x100E) {
+; CHECK:   FuncId (0x100F) {
 ; CHECK:     TypeLeafKind: LF_FUNC_ID (0x1601)
 ; CHECK:     ParentScope: 0x0
-; CHECK:     FunctionType: void () (0x100D)
+; CHECK:     FunctionType: void () (0x100E)
 ; CHECK:     Name: stdcallcc
 ; CHECK:   }
-; CHECK:   Procedure (0x100F) {
+; CHECK:   Procedure (0x1010) {
 ; CHECK:     TypeLeafKind: LF_PROCEDURE (0x1008)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     CallingConvention: NearVector (0x18)
@@ -147,10 +158,10 @@
 ; CHECK:     NumParameters: 0
 ; CHECK:     ArgListType: () (0x1002)
 ; CHECK:   }
-; CHECK:   FuncId (0x1010) {
+; CHECK:   FuncId (0x1011) {
 ; CHECK:     TypeLeafKind: LF_FUNC_ID (0x1601)
 ; CHECK:     ParentScope: 0x0
-; CHECK:     FunctionType: void () (0x100F)
+; CHECK:     FunctionType: void () (0x1010)
 ; CHECK:     Name: vectorcallcc
 ; CHECK:   }
 ; CHECK: ]
diff --git a/test/DebugInfo/COFF/types-cvarargs.ll b/test/DebugInfo/COFF/types-cvarargs.ll
index 5f1df233e447b8e526480042d8d8e7c223a7a9ff..5e366f0e981f97c830ac7de4c65e630c2542898c 100644
--- a/test/DebugInfo/COFF/types-cvarargs.ll
+++ b/test/DebugInfo/COFF/types-cvarargs.ll
@@ -25,7 +25,7 @@
 ; CHECK:  }
 ; CHECK:  Subsection [
 ; CHECK:    SubSectionType: Symbols (0xF1)
-; CHECK:    SubSectionSize: 0x2A
+; CHECK:    SubSectionSize:
 ; CHECK:    UDTSym {
 ; CHECK:      Kind: S_UDT (0x1108)
 ; CHECK:      Type: MemberTest::A (0x1008)
@@ -33,7 +33,7 @@
 ; CHECK:    }
 ; CHECK:    UDTSym {
 ; CHECK:      Kind: S_UDT (0x1108)
-; CHECK:      Type: void (int, float, <no type>)* (0x100E)
+; CHECK:      Type: void (int, float, <no type>)* (0x100F)
 ; CHECK:      UDTName: FuncTypedef
 ; CHECK:    }
 ; CHECK:  ]
diff --git a/test/DebugInfo/COFF/types-data-members.ll b/test/DebugInfo/COFF/types-data-members.ll
index c7d3c65632380783e54fa1e1a7beabe44cb32567..e82a9370f66d11168e50d859bd43cac20fc02006 100644
--- a/test/DebugInfo/COFF/types-data-members.ll
+++ b/test/DebugInfo/COFF/types-data-members.ll
@@ -247,7 +247,6 @@
 ; CHECK:   Pointer (0x1012) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: const int (0x1004)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -263,7 +262,6 @@
 ; CHECK:   Pointer (0x1014) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: <vftable 1 methods> (0x1013)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
@@ -362,11 +360,10 @@
 ; CHECK:   Pointer (0x101C) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: DerivedClass (0x1011)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
-; CHECK:     IsConst: 0
+; CHECK:     IsConst: 1
 ; CHECK:     IsVolatile: 0
 ; CHECK:     IsUnaligned: 0
 ; CHECK:     SizeOf: 8
@@ -375,7 +372,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: DerivedClass (0x1011)
-; CHECK:     ThisType: DerivedClass* (0x101C)
+; CHECK:     ThisType: DerivedClass* const (0x101C)
 ; CHECK:     CallingConvention: NearC (0x0)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
diff --git a/test/DebugInfo/COFF/types-method-ref-qualifiers.ll b/test/DebugInfo/COFF/types-method-ref-qualifiers.ll
new file mode 100644
index 0000000000000000000000000000000000000000..305eeeb713a474847307e144f974da24f23fbc10
--- /dev/null
+++ b/test/DebugInfo/COFF/types-method-ref-qualifiers.ll
@@ -0,0 +1,205 @@
+; RUN: llc < %s -filetype=obj | llvm-readobj - -codeview | FileCheck %s
+
+; C++ source to regenerate:
+; struct A {
+;   int NoRefQual();
+;
+;   int RefQual() &;
+;   int RefQual() &&;
+;
+;   int LValueRef() &;
+;
+;   int RValueRef() &&;
+; };
+;
+; void foo() {
+;   A *GenericPtr = nullptr;
+;   A a;
+; }
+
+
+; ModuleID = 'foo.cpp'
+source_filename = "foo.cpp"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.15.26732"
+
+%struct.A = type { i8 }
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @"?foo@@YAXXZ"() #0 !dbg !10 {
+entry:
+  %GenericPtr = alloca %struct.A*, align 8
+  %a = alloca %struct.A, align 1
+  call void @llvm.dbg.declare(metadata %struct.A** %GenericPtr, metadata !13, metadata !DIExpression()), !dbg !28
+  store %struct.A* null, %struct.A** %GenericPtr, align 8, !dbg !28
+  call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !29, metadata !DIExpression()), !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.linker.options = !{!3, !4}
+!llvm.module.flags = !{!5, !6, !7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "foo.cpp", directory: "D:\5C\5Csrc\5C\5Cllvmbuild\5C\5Cninja-x64", checksumkind: CSK_MD5, checksum: "d1b6ae9dc9ab85ca0a41c8b8c79a0b6a")
+!2 = !{}
+!3 = !{!"/DEFAULTLIB:libcmt.lib"}
+!4 = !{!"/DEFAULTLIB:oldnames.lib"}
+!5 = !{i32 2, !"CodeView", i32 1}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 2}
+!8 = !{i32 7, !"PIC Level", i32 2}
+!9 = !{!"clang version 8.0.0 "}
+!10 = distinct !DISubprogram(name: "foo", linkageName: "?foo@@YAXXZ", scope: !1, file: !1, line: 12, type: !11, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!11 = !DISubroutineType(types: !12)
+!12 = !{null}
+!13 = !DILocalVariable(name: "GenericPtr", scope: !10, file: !1, line: 13, type: !14)
+!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !15, size: 64)
+!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "A", file: !1, line: 1, size: 8, flags: DIFlagTypePassByValue | DIFlagTrivial, elements: !16, identifier: ".?AUA@@")
+!16 = !{!17, !22, !24, !26, !27}
+!17 = !DISubprogram(name: "NoRefQual", linkageName: "?NoRefQual@A@@QEAAHXZ", scope: !15, file: !1, line: 2, type: !18, isLocal: false, isDefinition: false, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false)
+!18 = !DISubroutineType(types: !19)
+!19 = !{!20, !21}
+!20 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !15, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!22 = !DISubprogram(name: "RefQual", linkageName: "?RefQual@A@@QEGAAHXZ", scope: !15, file: !1, line: 4, type: !23, isLocal: false, isDefinition: false, scopeLine: 4, flags: DIFlagPrototyped | DIFlagLValueReference, isOptimized: false)
+!23 = !DISubroutineType(flags: DIFlagLValueReference, types: !19)
+!24 = !DISubprogram(name: "RefQual", linkageName: "?RefQual@A@@QEHAAHXZ", scope: !15, file: !1, line: 5, type: !25, isLocal: false, isDefinition: false, scopeLine: 5, flags: DIFlagPrototyped | DIFlagRValueReference, isOptimized: false)
+!25 = !DISubroutineType(flags: DIFlagRValueReference, types: !19)
+!26 = !DISubprogram(name: "LValueRef", linkageName: "?LValueRef@A@@QEGAAHXZ", scope: !15, file: !1, line: 7, type: !23, isLocal: false, isDefinition: false, scopeLine: 7, flags: DIFlagPrototyped | DIFlagLValueReference, isOptimized: false)
+!27 = !DISubprogram(name: "RValueRef", linkageName: "?RValueRef@A@@QEHAAHXZ", scope: !15, file: !1, line: 9, type: !25, isLocal: false, isDefinition: false, scopeLine: 9, flags: DIFlagPrototyped | DIFlagRValueReference, isOptimized: false)
+!28 = !DILocation(line: 13, scope: !10)
+!29 = !DILocalVariable(name: "a", scope: !10, file: !1, line: 14, type: !15)
+!30 = !DILocation(line: 14, scope: !10)
+!31 = !DILocation(line: 15, scope: !10)
+
+
+
+
+; CHECK: CodeViewTypes [
+; CHECK:   Section: .debug$T (7)
+; CHECK:   Magic: 0x4
+; CHECK:   Pointer (0x1005) {
+; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
+; CHECK:     PointeeType: A (0x1003)
+; CHECK:     PtrType: Near64 (0xC)
+; CHECK:     PtrMode: Pointer (0x0)
+; CHECK:     IsFlat: 0
+; CHECK:     IsConst: 1
+; CHECK:     IsVolatile: 0
+; CHECK:     IsUnaligned: 0
+; CHECK:     IsRestrict: 0
+; CHECK:     IsThisPtr&: 0
+; CHECK:     IsThisPtr&&: 0
+; CHECK:     SizeOf: 8
+; CHECK:   }
+; CHECK:   MemberFunction (0x1006) {
+; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
+; CHECK:     ReturnType: int (0x74)
+; CHECK:     ClassType: A (0x1003)
+; CHECK:     ThisType: A* const (0x1005)
+; CHECK:     CallingConvention: NearC (0x0)
+; CHECK:     FunctionOptions [ (0x0)
+; CHECK:     ]
+; CHECK:     NumParameters: 0
+; CHECK:     ArgListType: () (0x1000)
+; CHECK:     ThisAdjustment: 0
+; CHECK:   }
+; CHECK:   Pointer (0x1007) {
+; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
+; CHECK:     PointeeType: A (0x1003)
+; CHECK:     PtrType: Near64 (0xC)
+; CHECK:     PtrMode: Pointer (0x0)
+; CHECK:     IsFlat: 0
+; CHECK:     IsConst: 1
+; CHECK:     IsVolatile: 0
+; CHECK:     IsUnaligned: 0
+; CHECK:     IsRestrict: 0
+; CHECK:     IsThisPtr&: 1
+; CHECK:     IsThisPtr&&: 0
+; CHECK:     SizeOf: 136
+; CHECK:   }
+; CHECK:   MemberFunction (0x1008) {
+; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
+; CHECK:     ReturnType: int (0x74)
+; CHECK:     ClassType: A (0x1003)
+; CHECK:     ThisType: A* const (0x1007)
+; CHECK:     CallingConvention: NearC (0x0)
+; CHECK:     FunctionOptions [ (0x0)
+; CHECK:     ]
+; CHECK:     NumParameters: 0
+; CHECK:     ArgListType: () (0x1000)
+; CHECK:     ThisAdjustment: 0
+; CHECK:   }
+; CHECK:   Pointer (0x1009) {
+; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
+; CHECK:     PointeeType: A (0x1003)
+; CHECK:     PtrType: Near64 (0xC)
+; CHECK:     PtrMode: Pointer (0x0)
+; CHECK:     IsFlat: 0
+; CHECK:     IsConst: 1
+; CHECK:     IsVolatile: 0
+; CHECK:     IsUnaligned: 0
+; CHECK:     IsRestrict: 0
+; CHECK:     IsThisPtr&: 0
+; CHECK:     IsThisPtr&&: 1
+; CHECK:     SizeOf: 8
+; CHECK:   }
+; CHECK:   MemberFunction (0x100A) {
+; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
+; CHECK:     ReturnType: int (0x74)
+; CHECK:     ClassType: A (0x1003)
+; CHECK:     ThisType: A* const (0x1009)
+; CHECK:     CallingConvention: NearC (0x0)
+; CHECK:     FunctionOptions [ (0x0)
+; CHECK:     ]
+; CHECK:     NumParameters: 0
+; CHECK:     ArgListType: () (0x1000)
+; CHECK:     ThisAdjustment: 0
+; CHECK:   }
+; CHECK:   MethodOverloadList (0x100B) {
+; CHECK:     TypeLeafKind: LF_METHODLIST (0x1206)
+; CHECK:     Method [
+; CHECK:       AccessSpecifier: Public (0x3)
+; CHECK:       Type: int A::() (0x1008)
+; CHECK:     ]
+; CHECK:     Method [
+; CHECK:       AccessSpecifier: Public (0x3)
+; CHECK:       Type: int A::() (0x100A)
+; CHECK:     ]
+; CHECK:   }
+; CHECK:   FieldList (0x100C) {
+; CHECK:     TypeLeafKind: LF_FIELDLIST (0x1203)
+; CHECK:     OneMethod {
+; CHECK:       TypeLeafKind: LF_ONEMETHOD (0x1511)
+; CHECK:       AccessSpecifier: Public (0x3)
+; CHECK:       Type: int A::() (0x1006)
+; CHECK:       Name: NoRefQual
+; CHECK:     }
+; CHECK:     OverloadedMethod {
+; CHECK:       TypeLeafKind: LF_METHOD (0x150F)
+; CHECK:       MethodCount: 0x2
+; CHECK:       MethodListIndex: 0x100B
+; CHECK:       Name: RefQual
+; CHECK:     }
+; CHECK:     OneMethod {
+; CHECK:       TypeLeafKind: LF_ONEMETHOD (0x1511)
+; CHECK:       AccessSpecifier: Public (0x3)
+; CHECK:       Type: int A::() (0x1008)
+; CHECK:       Name: LValueRef
+; CHECK:     }
+; CHECK:     OneMethod {
+; CHECK:       TypeLeafKind: LF_ONEMETHOD (0x1511)
+; CHECK:       AccessSpecifier: Public (0x3)
+; CHECK:       Type: int A::() (0x100A)
+; CHECK:       Name: RValueRef
+; CHECK:     }
+; CHECK:   }
+; CHECK: ]
diff --git a/test/DebugInfo/COFF/types-non-virtual-methods.ll b/test/DebugInfo/COFF/types-non-virtual-methods.ll
index 687d1bdf6f5f58dc6a5f0285289568565c6d15df..a06ffb0d9958691c315f01488e978af981f5616f 100644
--- a/test/DebugInfo/COFF/types-non-virtual-methods.ll
+++ b/test/DebugInfo/COFF/types-non-virtual-methods.ll
@@ -64,11 +64,10 @@
 ; CHECK:   Pointer (0x1004) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: A (0x1003)
-; CHECK:     PointerAttributes: 0x800A
 ; CHECK:     PtrType: Near32 (0xA)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
-; CHECK:     IsConst: 0
+; CHECK:     IsConst: 1
 ; CHECK:     IsVolatile: 0
 ; CHECK:     IsUnaligned: 0
 ; CHECK:   }
@@ -76,7 +75,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: A (0x1003)
-; CHECK:     ThisType: A* (0x1004)
+; CHECK:     ThisType: A* const (0x1004)
 ; CHECK:     CallingConvention: ThisCall (0xB)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
@@ -144,11 +143,10 @@
 ; CHECK:   Pointer (0x100B) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: B (0x100A)
-; CHECK:     PointerAttributes: 0x800A
 ; CHECK:     PtrType: Near32 (0xA)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
-; CHECK:     IsConst: 0
+; CHECK:     IsConst: 1
 ; CHECK:     IsVolatile: 0
 ; CHECK:     IsUnaligned: 0
 ; CHECK:     SizeOf: 4
@@ -157,7 +155,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: B (0x100A)
-; CHECK:     ThisType: B* (0x100B)
+; CHECK:     ThisType: B* const (0x100B)
 ; CHECK:     CallingConvention: ThisCall (0xB)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
@@ -176,7 +174,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: B (0x100A)
-; CHECK:     ThisType: B* (0x100B)
+; CHECK:     ThisType: B* const (0x100B)
 ; CHECK:     CallingConvention: ThisCall (0xB)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
@@ -195,7 +193,7 @@
 ; CHECK:     TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK:     ReturnType: void (0x3)
 ; CHECK:     ClassType: B (0x100A)
-; CHECK:     ThisType: B* (0x100B)
+; CHECK:     ThisType: B* const (0x100B)
 ; CHECK:     CallingConvention: ThisCall (0xB)
 ; CHECK:     FunctionOptions [ (0x0)
 ; CHECK:     ]
diff --git a/test/DebugInfo/COFF/types-ptr-to-member.ll b/test/DebugInfo/COFF/types-ptr-to-member.ll
index 52640ec472a0238a3139650b3071b9f5a296c533..3961be68a4d87cb16a2b067d623c85ebdc8acac8 100644
--- a/test/DebugInfo/COFF/types-ptr-to-member.ll
+++ b/test/DebugInfo/COFF/types-ptr-to-member.ll
@@ -27,7 +27,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x804C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToDataMember (0x2)
 ; CHECK:     IsFlat: 0
@@ -41,7 +40,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x804C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToDataMember (0x2)
 ; CHECK:     IsFlat: 0
@@ -55,7 +53,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x1004C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToDataMember (0x2)
 ; CHECK:     IsFlat: 0
@@ -69,7 +66,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int (0x74)
-; CHECK:     PointerAttributes: 0x1804C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToDataMember (0x2)
 ; CHECK:     IsFlat: 0
@@ -86,7 +82,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: void A::()
-; CHECK:     PointerAttributes: 0x1006C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToMemberFunction (0x3)
 ; CHECK:     IsFlat: 0
@@ -100,7 +95,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: void C::()
-; CHECK:     PointerAttributes: 0x2006C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToMemberFunction (0x3)
 ; CHECK:     IsFlat: 0
@@ -114,7 +108,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: void D::()
-; CHECK:     PointerAttributes: 0x2006C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToMemberFunction (0x3)
 ; CHECK:     IsFlat: 0
@@ -128,7 +121,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: void E::()
-; CHECK:     PointerAttributes: 0x3006C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToMemberFunction (0x3)
 ; CHECK:     IsFlat: 0
@@ -144,7 +136,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: int
-; CHECK:     PointerAttributes: 0x4C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToDataMember (0x2)
 ; CHECK:     IsFlat: 0
@@ -160,7 +151,6 @@
 ; CHECK:   Pointer ({{.*}}) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: void Incomplete::()
-; CHECK:     PointerAttributes: 0x6C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: PointerToMemberFunction (0x3)
 ; CHECK:     IsFlat: 0
diff --git a/test/DebugInfo/COFF/types-recursive-struct.ll b/test/DebugInfo/COFF/types-recursive-struct.ll
index 2a3af295f0bc0706970614ce7eb492f0db273074..3920826523b5ed24349e9fede1df704fd10d239e 100644
--- a/test/DebugInfo/COFF/types-recursive-struct.ll
+++ b/test/DebugInfo/COFF/types-recursive-struct.ll
@@ -69,7 +69,6 @@
 ; CHECK:   Pointer (0x1005) {
 ; CHECK:     TypeLeafKind: LF_POINTER (0x1002)
 ; CHECK:     PointeeType: B (0x1004)
-; CHECK:     PointerAttributes: 0x1000C
 ; CHECK:     PtrType: Near64 (0xC)
 ; CHECK:     PtrMode: Pointer (0x0)
 ; CHECK:     IsFlat: 0
diff --git a/test/DebugInfo/COFF/udts-complete.ll b/test/DebugInfo/COFF/udts-complete.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ace86ee4e668fb73b7688fb67e1a373de44134aa
--- /dev/null
+++ b/test/DebugInfo/COFF/udts-complete.ll
@@ -0,0 +1,72 @@
+; RUN: llc -filetype=obj %s -o %t.obj
+; RUN: llvm-pdbutil dump -symbols -types %t.obj | FileCheck %s
+
+; Make sure that `gv`, `Bar`, and `Baz` all point to the complete type index of
+; Foo (0x1002), and not the forward declaration index (0x1000).
+
+; C++ source:
+; struct Foo {
+;   int x;
+; };
+; typedef Foo Bar;
+; typedef Bar Baz;
+; static_assert(sizeof(Foo) == 4, "");
+; Baz gv;
+
+; CHECK:                       Types (.debug$T)
+; CHECK-NEXT: ============================================================
+; CHECK: 0x1000 | LF_STRUCTURE [size = 36] `Foo`
+; CHECK:          unique name: `.?AUFoo@@`
+; CHECK:          vtable: <no type>, base list: <no type>, field list: <no type>
+; CHECK:          options: forward ref | has unique name, sizeof 0
+; CHECK: 0x1001 | LF_FIELDLIST [size = 16]
+; CHECK:          - LF_MEMBER [name = `x`, Type = 0x0074 (int), offset = 0, attrs = public]
+; CHECK: 0x1002 | LF_STRUCTURE [size = 36] `Foo`
+; CHECK:          unique name: `.?AUFoo@@`
+; CHECK:          vtable: <no type>, base list: <no type>, field list: 0x1001
+; CHECK:          options: has unique name, sizeof 4
+; CHECK: 0x1004 | LF_UDT_SRC_LINE [size = 16]
+; CHECK:          udt = 0x1002, file = 4099, line = 1
+
+; CHECK:                           Symbols
+; CHECK: ============================================================
+; CHECK:   Mod 0000 | `.debug$S`:
+; CHECK:        0 | S_GDATA32 [size = 20] `gv`
+; CHECK:            type = 0x1002 (Foo), addr = 0000:0000
+; CHECK:        0 | S_UDT [size = 12] `Bar`
+; CHECK:            original type = 0x1002
+; CHECK:        0 | S_UDT [size = 12] `Baz`
+; CHECK:            original type = 0x1002
+; CHECK:        0 | S_UDT [size = 12] `Foo`
+; CHECK:            original type = 0x1002
+
+; ModuleID = 't.cpp'
+source_filename = "t.cpp"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.14.26433"
+
+%struct.Foo = type { i32 }
+
+@"?gv@@3UFoo@@A" = dso_local global %struct.Foo zeroinitializer, align 4, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!12, !13, !14, !15}
+!llvm.ident = !{!16}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "gv", linkageName: "?gv@@3UFoo@@A", scope: !2, file: !3, line: 7, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 8.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: None)
+!3 = !DIFile(filename: "t.cpp", directory: "C:\5Csrc\5Cllvm-project\5Cbuild", checksumkind: CSK_MD5, checksum: "e21cd263d43db2ff55050be3e41e789f")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "Baz", file: !3, line: 5, baseType: !7)
+!7 = !DIDerivedType(tag: DW_TAG_typedef, name: "Bar", file: !3, line: 4, baseType: !8)
+!8 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo", file: !3, line: 1, size: 32, flags: DIFlagTypePassByValue | DIFlagTrivial, elements: !9, identifier: ".?AUFoo@@")
+!9 = !{!10}
+!10 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !8, file: !3, line: 2, baseType: !11, size: 32)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{i32 2, !"CodeView", i32 1}
+!13 = !{i32 2, !"Debug Info Version", i32 3}
+!14 = !{i32 1, !"wchar_size", i32 2}
+!15 = !{i32 7, !"PIC Level", i32 2}
+!16 = !{!"clang version 8.0.0 "}
diff --git a/test/DebugInfo/COFF/udts.ll b/test/DebugInfo/COFF/udts.ll
index 114e11a70e8c2d20a0908a25885e25c8dda8ee6d..d17f5cd4921b8485dee6ccfe45eb60dcfde1838f 100644
--- a/test/DebugInfo/COFF/udts.ll
+++ b/test/DebugInfo/COFF/udts.ll
@@ -79,13 +79,13 @@
 ; PDBUTIL:                           Symbols
 ; PDBUTIL-NEXT: ============================================================
 ; PDBUTIL-NOT:   S_UDT {{.*}} `A::C`
-; PDBUTIL:       S_UDT [size = 15] `f::FOO`
-; PDBUTIL:       S_UDT [size = 15] `g::pun`
-; PDBUTIL:       S_UDT [size = 10] `S`
-; PDBUTIL:       S_UDT [size = 10] `A`
-; PDBUTIL:       S_UDT [size = 13] `A::D`
-; PDBUTIL:       S_UDT [size = 10] `U`
-; PDBUTIL:       S_UDT [size = 10] `U`
+; PDBUTIL:       S_UDT [size = 16] `f::FOO`
+; PDBUTIL:       S_UDT [size = 16] `g::pun`
+; PDBUTIL:       S_UDT [size = 12] `S`
+; PDBUTIL:       S_UDT [size = 12] `A`
+; PDBUTIL:       S_UDT [size = 16] `A::D`
+; PDBUTIL:       S_UDT [size = 12] `U`
+; PDBUTIL:       S_UDT [size = 12] `U`
 
 source_filename = "test/DebugInfo/COFF/udts.ll"
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
@@ -190,4 +190,4 @@ attributes #1 = { nounwind readnone speculatable }
 !53 = !DILocation(line: 9, column: 5, scope: !38)
 !54 = !DILocation(line: 9, column: 7, scope: !38)
 !55 = !DILocation(line: 10, column: 12, scope: !38)
-!56 = !DILocation(line: 10, column: 3, scope: !38)
\ No newline at end of file
+!56 = !DILocation(line: 10, column: 3, scope: !38)
diff --git a/test/DebugInfo/COFF/vframe-fpo.ll b/test/DebugInfo/COFF/vframe-fpo.ll
index b3b71530214cf2970ac353d9673c2950eea52f8d..27273cd5c00121517d7a66ca07604629781b3b9a 100644
--- a/test/DebugInfo/COFF/vframe-fpo.ll
+++ b/test/DebugInfo/COFF/vframe-fpo.ll
@@ -64,14 +64,14 @@
 ; CODEVIEW-NEXT:   Magic: 0x4
 ; CODEVIEW-NEXT:   Subsection [
 ; CODEVIEW-NEXT:     SubSectionType: Symbols (0xF1)
-; CODEVIEW-NEXT:     SubSectionSize: 0x2F
+; CODEVIEW-NEXT:     SubSectionSize:
 ; CODEVIEW-NEXT:     Compile3Sym {
 ; CODEVIEW-NEXT:       Kind: S_COMPILE3 (0x113C)
 ; CODEVIEW:          }
 ; CODEVIEW:        ]
 ; CODEVIEW:        Subsection [
 ; CODEVIEW-NEXT:     SubSectionType: FrameData (0xF5)
-; CODEVIEW-NEXT:     SubSectionSize: 0xA4
+; CODEVIEW-NEXT:     SubSectionSize:
 ; CODEVIEW-NEXT:     LinkageName: _main
 ; CODEVIEW:          FrameData {
 ; CODEVIEW:          }
@@ -109,7 +109,7 @@
 
 ; CODEVIEW:      Subsection [
 ; CODEVIEW-NEXT:   SubSectionType: Symbols (0xF1)
-; CODEVIEW-NEXT:   SubSectionSize: 0xA2
+; CODEVIEW-NEXT:   SubSectionSize:
 ; CODEVIEW-NEXT:   GlobalProcIdSym {
 ; CODEVIEW-NEXT:     Kind: S_GPROC32_ID (0x1147)
 ; CODEVIEW:          DisplayName: main
diff --git a/test/DebugInfo/COFF/virtual-methods.ll b/test/DebugInfo/COFF/virtual-methods.ll
index 78a9489d32851da3274383f4c85c506875ff1d08..2691ff6de21cda1f5d6d172ab1d90d611ba280ed 100644
--- a/test/DebugInfo/COFF/virtual-methods.ll
+++ b/test/DebugInfo/COFF/virtual-methods.ll
@@ -25,7 +25,7 @@
 ; CHECK-NEXT:   TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK-NEXT:   ReturnType: int (0x74)
 ; CHECK-NEXT:   ClassType: A ({{.*}})
-; CHECK-NEXT:   ThisType: A* ({{.*}})
+; CHECK-NEXT:   ThisType: A* const ({{.*}})
 ; CHECK-NEXT:   CallingConvention: NearC (0x0)
 ; CHECK-NEXT:   FunctionOptions [ (0x0)
 ; CHECK-NEXT:   ]
@@ -39,7 +39,7 @@
 ; CHECK-NEXT:   TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK-NEXT:   ReturnType: int (0x74)
 ; CHECK-NEXT:   ClassType: B ({{.*}})
-; CHECK-NEXT:   ThisType: B* ({{.*}})
+; CHECK-NEXT:   ThisType: B* const ({{.*}})
 ; CHECK-NEXT:   CallingConvention: NearC (0x0)
 ; CHECK-NEXT:   FunctionOptions [ (0x0)
 ; CHECK-NEXT:   ]
@@ -53,7 +53,7 @@
 ; CHECK-NEXT:   TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK-NEXT:   ReturnType: int (0x74)
 ; CHECK-NEXT:   ClassType: C ({{.*}})
-; CHECK-NEXT:   ThisType: C* ({{.*}})
+; CHECK-NEXT:   ThisType: C* const ({{.*}})
 ; CHECK-NEXT:   CallingConvention: NearC (0x0)
 ; CHECK-NEXT:   FunctionOptions [ (0x0)
 ; CHECK-NEXT:   ]
@@ -77,7 +77,7 @@
 ; CHECK-NEXT:   TypeLeafKind: LF_MFUNCTION (0x1009)
 ; CHECK-NEXT:   ReturnType: int (0x74)
 ; CHECK-NEXT:   ClassType: D ({{.*}})
-; CHECK-NEXT:   ThisType: D* ({{.*}})
+; CHECK-NEXT:   ThisType: D* const ({{.*}})
 ; CHECK-NEXT:   CallingConvention: NearC (0x0)
 ; CHECK-NEXT:   FunctionOptions [ (0x0)
 ; CHECK-NEXT:   ]
diff --git a/test/DebugInfo/Generic/codegenprep-value.ll b/test/DebugInfo/Generic/codegenprep-value.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cf394383b10b0efbdfbaff445baf188d96024735
--- /dev/null
+++ b/test/DebugInfo/Generic/codegenprep-value.ll
@@ -0,0 +1,75 @@
+; RUN: opt -codegenprepare -S %s | FileCheck %s
+;
+; Generated from the following source with:
+; clang -O2 -g -S -emit-llvm -mllvm -stop-after=indirectbr-expand test.cpp
+;
+; extern void use(int);
+; extern int foo(long long);
+; 
+; void test(int p)
+; {
+;   int i = p + 4;
+;   (void)foo(i);  // sign extension of i
+;   if (p)
+;     return;
+;   use(i);        // use of original i
+; }
+;
+; PR39845: Check that CodeGenPrepare does not drop the reference to a local when it is
+;          sign-extended and used later.
+;
+; CHECK: define{{.*}}test
+; CHECK: call{{.*}}dbg.value(metadata i32 %p
+; CHECK: call{{.*}}dbg.value(metadata i32 %add
+;
+define dso_local void @_Z4testi(i32 %p) local_unnamed_addr !dbg !7 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %p, metadata !12, metadata !DIExpression()), !dbg !14
+  %add = add nsw i32 %p, 4, !dbg !15
+  call void @llvm.dbg.value(metadata i32 %add, metadata !13, metadata !DIExpression()), !dbg !16
+  %conv = sext i32 %add to i64, !dbg !17
+  %call = tail call i32 @_Z3foox(i64 %conv), !dbg !18
+  %tobool = icmp eq i32 %p, 0, !dbg !19
+  br i1 %tobool, label %if.end, label %cleanup, !dbg !21
+
+if.end:
+  tail call void @_Z3usei(i32 %add), !dbg !22
+  br label %cleanup, !dbg !23
+
+cleanup:
+  ret void, !dbg !23
+}
+
+declare dso_local i32 @_Z3foox(i64) local_unnamed_addr
+declare dso_local void @_Z3usei(i32) local_unnamed_addr
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 348209)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "test.cpp", directory: "/home/test/src")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 348209)"}
+!7 = distinct !DISubprogram(name: "test", linkageName: "_Z4testi", scope: !1, file: !1, line: 4, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!12, !13}
+!12 = !DILocalVariable(name: "p", arg: 1, scope: !7, file: !1, line: 4, type: !10)
+!13 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 6, type: !10)
+!14 = !DILocation(line: 4, column: 15, scope: !7)
+!15 = !DILocation(line: 6, column: 13, scope: !7)
+!16 = !DILocation(line: 6, column: 7, scope: !7)
+!17 = !DILocation(line: 7, column: 13, scope: !7)
+!18 = !DILocation(line: 7, column: 9, scope: !7)
+!19 = !DILocation(line: 8, column: 7, scope: !20)
+!20 = distinct !DILexicalBlock(scope: !7, file: !1, line: 8, column: 7)
+!21 = !DILocation(line: 8, column: 7, scope: !7)
+!22 = !DILocation(line: 10, column: 3, scope: !7)
+!23 = !DILocation(line: 11, column: 1, scope: !7)
diff --git a/test/DebugInfo/Generic/invalid.ll b/test/DebugInfo/Generic/invalid.ll
index 9bf9e6374fc9b54641662a56c8cea323b5ed5b5a..2ff26c0860e4c6ff3ecec02d96f6baaff8c50851 100644
--- a/test/DebugInfo/Generic/invalid.ll
+++ b/test/DebugInfo/Generic/invalid.ll
@@ -3,9 +3,9 @@
 ; Make sure we emit this diagnostic only once (which means we don't visit the
 ; same DISubprogram twice.
 ; CHECK: subprogram definitions must have a compile unit
-; CHECK-NEXT: !3 = distinct !DISubprogram(name: "patatino", scope: null, isLocal: false, isDefinition: true, isOptimized: false)
+; CHECK-NEXT: !3 = distinct !DISubprogram(name: "patatino", scope: null, spFlags: DISPFlagDefinition)
 ; CHECK-NOT: subprogram definitions must have a compile unit
-; CHECK-NOT: !3 = distinct !DISubprogram(name: "patatino", scope: null, isLocal: false, isDefinition: true, isOptimized: false)
+; CHECK-NOT: !3 = distinct !DISubprogram(name: "patatino", scope: null, spFlags: DISPFlagDefinition)
 ; CHECK: warning: ignoring invalid debug info
 
 define void @tinkywinky() !dbg !3 { ret void }
diff --git a/test/DebugInfo/NVPTX/debug-file-loc-only.ll b/test/DebugInfo/NVPTX/debug-file-loc-only.ll
index 389a7c65781ae19990111331cd1f15a2bc1a7faa..f47e95624a20ff744d4ca0d33c42f59706fa8791 100644
--- a/test/DebugInfo/NVPTX/debug-file-loc-only.ll
+++ b/test/DebugInfo/NVPTX/debug-file-loc-only.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
 
-; // Bitcode int this test case is reduced version of compiled code below:
+; // Bitcode in this test case is reduced version of compiled code below:
 ;extern "C" {
 ;#line 1 "/source/dir/foo.h"
 ;__device__ void foo() {}
diff --git a/test/DebugInfo/NVPTX/debug-info.ll b/test/DebugInfo/NVPTX/debug-info.ll
index 02e6240aa3eea49d4499d768f7db6e5462b73e73..47101b00ee79eebfb3c82385bfcbb5e5c3d70e01 100644
--- a/test/DebugInfo/NVPTX/debug-info.ll
+++ b/test/DebugInfo/NVPTX/debug-info.ll
@@ -687,12 +687,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // }
 ; CHECK: // .section .debug_info
 ; CHECK: // {
-; CHECK: // .b32 10025                           // Length of Unit
+; CHECK: // .b32 10030                           // Length of Unit
 ; CHECK: // .b8 2                                // DWARF version number
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
-; CHECK: // .b8 1                                // Abbrev [1] 0xb:0x2722 DW_TAG_compile_unit
+; CHECK: // .b8 1                                // Abbrev [1] 0xb:0x2727 DW_TAG_compile_unit
 ; CHECK: // .b8 0                                // DW_AT_producer
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
@@ -4719,7 +4719,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 3                                // DW_AT_decl_line
 ; CHECK: // .b32 3345                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK: // .b8 40                               // Abbrev [40] 0x2671:0xbb DW_TAG_subprogram
+; CHECK: // .b8 40                               // Abbrev [40] 0x2671:0xc0 DW_TAG_subprogram
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
@@ -4779,7 +4779,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b64 Ltmp3                           // DW_AT_high_pc
 ; CHECK: // .b8 12                               // DW_AT_call_file
 ; CHECK: // .b8 6                                // DW_AT_call_line
-; CHECK: // .b8 43                               // Abbrev [43] 0x270e:0x1d DW_TAG_inlined_subroutine
+; CHECK: // .b8 43                               // Abbrev [43] 0x270e:0x22 DW_TAG_inlined_subroutine
 ; CHECK: // .b32 9791                            // DW_AT_abstract_origin
 ; CHECK: // .b64 Ltmp8                           // DW_AT_low_pc
 ; CHECK: // .b64 Ltmp9                           // DW_AT_high_pc
diff --git a/test/DebugInfo/PDB/module-bytes.test b/test/DebugInfo/PDB/module-bytes.test
index f325331ac5329e9a7c03f2af74a896aea1e3d11a..2f8cae4e5b9a0c3157687137fad42fa195c836f0 100644
--- a/test/DebugInfo/PDB/module-bytes.test
+++ b/test/DebugInfo/PDB/module-bytes.test
@@ -42,44 +42,45 @@ SYMS:                             Module Symbols
 SYMS-NEXT: ============================================================
 SYMS-NEXT:     Mod 0000 | `d:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj`:
 SYMS-NEXT:       Symbols (
-SYMS-NEXT:         6004: 36000111 00000000 643A5C73 72635C6C 6C766D5C 74657374 5C446562 7567496E  |6.......d:\src\llvm\test\DebugIn|
-SYMS-NEXT:         6024: 666F5C50 44425C49 6E707574 735C656D 7074792E 6F626A00 3A003C11 01200000  |fo\PDB\Inputs\empty.obj.:.<.. ..|
-SYMS-NEXT:         6044: 07001200 00007D79 00001200 00007D79 00004D69 63726F73 6F667420 28522920  |......}y......}y..Microsoft (R) |
-SYMS-NEXT:         6064: 4F707469 6D697A69 6E672043 6F6D7069 6C657200 2A001011 00000000 C4000000  |Optimizing Compiler.*...........|
-SYMS-NEXT:         6084: 00000000 0A000000 03000000 08000000 01100000 10000000 0100016D 61696E00  |...........................main.|
-SYMS-NEXT:         60A4: 1E001210 00000000 00000000 00000000 00000000 00000000 00000082 12000000  |................................|
-SYMS-NEXT:         60C4: 02000600 06004C11 0E100000                                               |......L.....|
+SYMS-NEXT:         6000: 04000000 36000111 00000000 643A5C73 72635C6C 6C766D5C 74657374 5C446562  |....6.......d:\src\llvm\test\Deb|
+SYMS-NEXT:         6020: 7567496E 666F5C50 44425C49 6E707574 735C656D 7074792E 6F626A00 3A003C11  |ugInfo\PDB\Inputs\empty.obj.:.<.|
+SYMS-NEXT:         6040: 01200000 07001200 00007D79 00001200 00007D79 00004D69 63726F73 6F667420  |. ........}y......}y..Microsoft |
+SYMS-NEXT:         6060: 28522920 4F707469 6D697A69 6E672043 6F6D7069 6C657200 2A001011 00000000  |(R) Optimizing Compiler.*.......|
+SYMS-NEXT:         6080: C4000000 00000000 0A000000 03000000 08000000 01100000 10000000 0100016D  |...............................m|
+SYMS-NEXT:         60A0: 61696E00 1E001210 00000000 00000000 00000000 00000000 00000000 00000082  |ain.............................|
+SYMS-NEXT:         60C0: 12000000 02000600 06004C11 0E100000                                      |..........L.....|
 SYMS-NEXT:       )
 SYMS-NEXT:     Mod 0001 | `* Linker *`:
 SYMS-NEXT:       Symbols (
-SYMS-NEXT:         7004: 12000111 00000000 2A204C69 6E6B6572 202A0000 2E003C11 07000000 03000000  |........* Linker *....<.........|
-SYMS-NEXT:         7024: 00000000 00000C00 00007D79 00004D69 63726F73 6F667420 28522920 4C494E4B  |..........}y..Microsoft (R) LINK|
-SYMS-NEXT:         7044: 00000000 AA003D11 00637764 00643A5C 7372635C 6C6C766D 5C746573 745C4465  |......=..cwd.d:\src\llvm\test\De|
-SYMS-NEXT:         7064: 62756749 6E666F5C 5044425C 496E7075 74730065 78650043 3A5C5072 6F677261  |bugInfo\PDB\Inputs.exe.C:\Progra|
-SYMS-NEXT:         7084: 6D204669 6C657320 28783836 295C4D69 63726F73 6F667420 56697375 616C2053  |m Files (x86)\Microsoft Visual S|
-SYMS-NEXT:         70A4: 74756469 6F203132 2E305C56 435C4249 4E5C6C69 6E6B2E65 78650070 64620064  |tudio 12.0\VC\BIN\link.exe.pdb.d|
-SYMS-NEXT:         70C4: 3A5C7372 635C6C6C 766D5C74 6573745C 44656275 67496E66 6F5C5044 425C496E  |:\src\llvm\test\DebugInfo\PDB\In|
-SYMS-NEXT:         70E4: 70757473 5C656D70 74792E70 64620000 12002C11 00000500 05000000 10000000  |puts\empty.pdb....,.............|
-SYMS-NEXT:         7104: 01000100 1A003611 01000C00 00100000 1A100000 20000060 2E746578 74000000  |......6............. ..`.text...|
-SYMS-NEXT:         7124: 1A003711 1A100000 20000060 00000000 01002E74 65787424 6D6E0000 1A003611  |..7..... ..`.......text$mn....6.|
-SYMS-NEXT:         7144: 02000C00 00300000 B2020000 40000040 2E726461 74610000 1A003711 43010000  |.....0......@..@.rdata....7.C...|
-SYMS-NEXT:         7164: 40000040 00000000 02002E72 64617461 00000000 1A003711 00000000 40000040  |@..@.......rdata......7.....@..@|
-SYMS-NEXT:         7184: 43010000 02002E65 64617461 00000000 1E003711 6E010000 40000040 44010000  |C......edata......7.n...@..@D...|
-SYMS-NEXT:         71A4: 02002E72 64617461 24646562 75670000 1A003611 03000C00 00400000 04000000  |...rdata$debug....6......@......|
-SYMS-NEXT:         71C4: 400000C0 2E646174 61000000 16003711 04000000 800000C0 00000000 03002E62  |@....data.....7................b|
-SYMS-NEXT:         71E4: 73730000 1A003611 04000C00 00500000 08000000 40000042 2E72656C 6F630000  |ss....6......P......@..B.reloc..|
+SYMS-NEXT:         7000: 04000000 12000111 00000000 2A204C69 6E6B6572 202A0000 2E003C11 07000000  |............* Linker *....<.....|
+SYMS-NEXT:         7020: 03000000 00000000 00000C00 00007D79 00004D69 63726F73 6F667420 28522920  |..............}y..Microsoft (R) |
+SYMS-NEXT:         7040: 4C494E4B 00000000 AA003D11 00637764 00643A5C 7372635C 6C6C766D 5C746573  |LINK......=..cwd.d:\src\llvm\tes|
+SYMS-NEXT:         7060: 745C4465 62756749 6E666F5C 5044425C 496E7075 74730065 78650043 3A5C5072  |t\DebugInfo\PDB\Inputs.exe.C:\Pr|
+SYMS-NEXT:         7080: 6F677261 6D204669 6C657320 28783836 295C4D69 63726F73 6F667420 56697375  |ogram Files (x86)\Microsoft Visu|
+SYMS-NEXT:         70A0: 616C2053 74756469 6F203132 2E305C56 435C4249 4E5C6C69 6E6B2E65 78650070  |al Studio 12.0\VC\BIN\link.exe.p|
+SYMS-NEXT:         70C0: 64620064 3A5C7372 635C6C6C 766D5C74 6573745C 44656275 67496E66 6F5C5044  |db.d:\src\llvm\test\DebugInfo\PD|
+SYMS-NEXT:         70E0: 425C496E 70757473 5C656D70 74792E70 64620000 12002C11 00000500 05000000  |B\Inputs\empty.pdb....,.........|
+SYMS-NEXT:         7100: 10000000 01000100 1A003611 01000C00 00100000 1A100000 20000060 2E746578  |..........6............. ..`.tex|
+SYMS-NEXT:         7120: 74000000 1A003711 1A100000 20000060 00000000 01002E74 65787424 6D6E0000  |t.....7..... ..`.......text$mn..|
+SYMS-NEXT:         7140: 1A003611 02000C00 00300000 B2020000 40000040 2E726461 74610000 1A003711  |..6......0......@..@.rdata....7.|
+SYMS-NEXT:         7160: 43010000 40000040 00000000 02002E72 64617461 00000000 1A003711 00000000  |C...@..@.......rdata......7.....|
+SYMS-NEXT:         7180: 40000040 43010000 02002E65 64617461 00000000 1E003711 6E010000 40000040  |@..@C......edata......7.n...@..@|
+SYMS-NEXT:         71A0: 44010000 02002E72 64617461 24646562 75670000 1A003611 03000C00 00400000  |D......rdata$debug....6......@..|
+SYMS-NEXT:         71C0: 04000000 400000C0 2E646174 61000000 16003711 04000000 800000C0 00000000  |....@....data.....7.............|
+SYMS-NEXT:         71E0: 03002E62 73730000 1A003611 04000C00 00500000 08000000 40000042 2E72656C  |...bss....6......P......@..B.rel|
+SYMS-NEXT:         7200: 6F630000                                                                 |oc..|
 SYMS-NEXT:       )
 
 FILTERED-SYMS:                             Module Symbols
 FILTERED-SYMS-NEXT: ============================================================
 FILTERED-SYMS-NEXT:     Mod 0000 | `d:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj`:
 FILTERED-SYMS-NEXT:       Symbols (
-FILTERED-SYMS-NEXT:         6004: 36000111 00000000 643A5C73 72635C6C 6C766D5C 74657374 5C446562 7567496E  |6.......d:\src\llvm\test\DebugIn|
-FILTERED-SYMS-NEXT:         6024: 666F5C50 44425C49 6E707574 735C656D 7074792E 6F626A00 3A003C11 01200000  |fo\PDB\Inputs\empty.obj.:.<.. ..|
-FILTERED-SYMS-NEXT:         6044: 07001200 00007D79 00001200 00007D79 00004D69 63726F73 6F667420 28522920  |......}y......}y..Microsoft (R) |
-FILTERED-SYMS-NEXT:         6064: 4F707469 6D697A69 6E672043 6F6D7069 6C657200 2A001011 00000000 C4000000  |Optimizing Compiler.*...........|
-FILTERED-SYMS-NEXT:         6084: 00000000 0A000000 03000000 08000000 01100000 10000000 0100016D 61696E00  |...........................main.|
-FILTERED-SYMS-NEXT:         60A4: 1E001210 00000000 00000000 00000000 00000000 00000000 00000082 12000000  |................................|
-FILTERED-SYMS-NEXT:         60C4: 02000600 06004C11 0E100000                                               |......L.....|
+FILTERED-SYMS-NEXT:         6000: 04000000 36000111 00000000 643A5C73 72635C6C 6C766D5C 74657374 5C446562  |....6.......d:\src\llvm\test\Deb|
+FILTERED-SYMS-NEXT:         6020: 7567496E 666F5C50 44425C49 6E707574 735C656D 7074792E 6F626A00 3A003C11  |ugInfo\PDB\Inputs\empty.obj.:.<.|
+FILTERED-SYMS-NEXT:         6040: 01200000 07001200 00007D79 00001200 00007D79 00004D69 63726F73 6F667420  |. ........}y......}y..Microsoft |
+FILTERED-SYMS-NEXT:         6060: 28522920 4F707469 6D697A69 6E672043 6F6D7069 6C657200 2A001011 00000000  |(R) Optimizing Compiler.*.......|
+FILTERED-SYMS-NEXT:         6080: C4000000 00000000 0A000000 03000000 08000000 01100000 10000000 0100016D  |...............................m|
+FILTERED-SYMS-NEXT:         60A0: 61696E00 1E001210 00000000 00000000 00000000 00000000 00000000 00000082  |ain.............................|
+FILTERED-SYMS-NEXT:         60C0: 12000000 02000600 06004C11 0E100000                                      |..........L.....|
 FILTERED-SYMS-NEXT:       )
 FILTERED-SYMS-NOT:      Mod 0001
diff --git a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
index da3cf9dc781c0c4a3325f0bb50de6a96cc666540..5caa80c2ea4386865d880a8445fecdbd407a9205 100644
--- a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
+++ b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
@@ -32,10 +32,10 @@
 ;CHECK-NEXT: DW_AT_call_line
 
 ;CHECK: DW_TAG_formal_parameter
-;FIXME: Linux shouldn't drop this parameter either...
 ;CHECK-NOT: DW_TAG
-;DARWIN:   DW_AT_abstract_origin {{.*}} "sp"
-;DARWIN: DW_TAG_formal_parameter
+;CHECK:   DW_AT_abstract_origin {{.*}} "sp"
+;CHECK: DW_TAG_formal_parameter
+;CHECK-NOT: DW_TAG
 ;CHECK: DW_AT_abstract_origin {{.*}} "nums"
 ;CHECK-NOT: DW_TAG_formal_parameter
 
diff --git a/test/DebugInfo/X86/fission-cu.ll b/test/DebugInfo/X86/fission-cu.ll
index 1551bedc981e4f7493d6eab45dcfe720c5fb297c..95669b6dd598476dec73f454fdd462f9fb68058e 100644
--- a/test/DebugInfo/X86/fission-cu.ll
+++ b/test/DebugInfo/X86/fission-cu.ll
@@ -25,18 +25,18 @@ source_filename = "test/DebugInfo/X86/fission-cu.ll"
 ; CHECK: Abbrev table for offset: 0x00000000
 ; CHECK: [1] DW_TAG_compile_unit DW_CHILDREN_no
 ; CHECK: DW_AT_stmt_list DW_FORM_sec_offset
-; CHECK: DW_AT_GNU_dwo_name      DW_FORM_strp
 ; CHECK: DW_AT_comp_dir  DW_FORM_strp
+; CHECK: DW_AT_GNU_dwo_name      DW_FORM_strp
 ; CHECK: DW_AT_GNU_dwo_id        DW_FORM_data8
 
 ; Check that we're using the right forms.
 ; CHECK: .debug_abbrev.dwo contents:
 ; CHECK: Abbrev table for offset: 0x00000000
 ; CHECK: [1] DW_TAG_compile_unit DW_CHILDREN_yes
-; CHECK: DW_AT_GNU_dwo_name  DW_FORM_GNU_str_index
 ; CHECK: DW_AT_producer  DW_FORM_GNU_str_index
 ; CHECK: DW_AT_language  DW_FORM_data2
 ; CHECK: DW_AT_name      DW_FORM_GNU_str_index
+; CHECK: DW_AT_GNU_dwo_name  DW_FORM_GNU_str_index
 ; CHECK-NOT: DW_AT_low_pc
 ; CHECK-NOT: DW_AT_stmt_list
 ; CHECK-NOT: DW_AT_comp_dir
@@ -58,48 +58,48 @@ source_filename = "test/DebugInfo/X86/fission-cu.ll"
 ; CHECK: .debug_info contents:
 ; CHECK: DW_TAG_compile_unit
 ; CHECK-NEXT: DW_AT_stmt_list [DW_FORM_sec_offset]   (0x00000000)
-; CHECK-NEXT: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000000] = "baz.dwo")
-; CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp]     ( .debug_str[0x00000008] = "/usr/local/google/home/echristo/tmp")
+; CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp]     ( .debug_str[0x00000000] = "/usr/local/google/home/echristo/tmp")
+; CHECK-NEXT: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000024] = "baz.dwo")
 ; CHECK-NEXT: DW_AT_GNU_dwo_id [DW_FORM_data8]  (0x1f1f859683d49324)
 
 ; Check that the rest of the compile units have information.
 ; CHECK: .debug_info.dwo contents:
 ; CHECK: DW_TAG_compile_unit
-; CHECK: DW_AT_GNU_dwo_name [DW_FORM_GNU_str_index] ( indexed (00000000) string = "baz.dwo")
-; CHECK: DW_AT_producer [DW_FORM_GNU_str_index] ( indexed (00000001) string = "clang version 3.3 (trunk 169021) (llvm/trunk 169020)")
+; CHECK: DW_AT_producer [DW_FORM_GNU_str_index] ( indexed (00000002) string = "clang version 3.3 (trunk 169021) (llvm/trunk 169020)")
 ; CHECK: DW_AT_language [DW_FORM_data2]        (DW_LANG_C99)
-; CHECK: DW_AT_name [DW_FORM_GNU_str_index]    ( indexed (00000002) string = "baz.c")
+; CHECK: DW_AT_name [DW_FORM_GNU_str_index]    ( indexed (00000003) string = "baz.c")
+; CHECK: DW_AT_GNU_dwo_name [DW_FORM_GNU_str_index] ( indexed (00000004) string = "baz.dwo")
 ; CHECK-NOT: DW_AT_low_pc
 ; CHECK-NOT: DW_AT_stmt_list
 ; CHECK-NOT: DW_AT_comp_dir
 ; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8]  (0x1f1f859683d49324)
 ; CHECK: DW_TAG_variable
-; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000003) string = "a")
+; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000000) string = "a")
 ; CHECK: DW_AT_type [DW_FORM_ref4]       (cu + 0x{{[0-9a-f]*}} => {[[TYPE:0x[0-9a-f]*]]}
 ; CHECK: DW_AT_external [DW_FORM_flag_present]   (true)
 ; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01)
 ; CHECK: DW_AT_decl_line [DW_FORM_data1] (1)
 ; CHECK: DW_AT_location [DW_FORM_exprloc] (DW_OP_GNU_addr_index 0x0)
 ; CHECK: [[TYPE]]: DW_TAG_base_type
-; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000004) string = "int")
+; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000001) string = "int")
 
 ; CHECK: .debug_str contents:
-; CHECK: 0x00000000: "baz.dwo"
-; CHECK: 0x00000008: "/usr/local/google/home/echristo/tmp"
+; CHECK: 0x00000000: "/usr/local/google/home/echristo/tmp"
+; CHECK: 0x00000024: "baz.dwo"
 
 ; CHECK: .debug_str.dwo contents:
-; CHECK: 0x00000000: "baz.dwo"
-; CHECK: 0x00000008: "clang version 3.3 (trunk 169021) (llvm/trunk 169020)"
-; CHECK: 0x0000003d: "baz.c"
-; CHECK: 0x00000043: "a"
-; CHECK: 0x00000045: "int"
+; CHECK: 0x00000000: "a"
+; CHECK: 0x00000002: "int"
+; CHECK: 0x00000006: "clang version 3.3 (trunk 169021) (llvm/trunk 169020)"
+; CHECK: 0x0000003b: "baz.c"
+; CHECK: 0x00000041: "baz.dwo"
 
 ; CHECK: .debug_str_offsets.dwo contents:
 ; CHECK: 0x00000000: 00000000
-; CHECK: 0x00000004: 00000008
-; CHECK: 0x00000008: 0000003d
-; CHECK: 0x0000000c: 00000043
-; CHECK: 0x00000010: 00000045
+; CHECK: 0x00000004: 00000002
+; CHECK: 0x00000008: 00000006
+; CHECK: 0x0000000c: 0000003b
+; CHECK: 0x00000010: 00000041
 
 ; Object file checks
 ; For x86-64-linux we should have this set of relocations for the debug info section
diff --git a/test/DebugInfo/X86/fission-local-import.ll b/test/DebugInfo/X86/fission-local-import.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4f43a22b70889a2b3a7fa6b9a44aa69ee56124b0
--- /dev/null
+++ b/test/DebugInfo/X86/fission-local-import.ll
@@ -0,0 +1,33 @@
+; RUN: llc -split-dwarf-file=foo.dwo -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: .debug_info.dwo contents:
+; CHECK: DW_TAG_compile_unit
+; CHECK:   DW_TAG_subprogram
+; CHECK:     DW_TAG_imported_module
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_Z4testv() !dbg !5 {
+entry:
+  ret void, !dbg !13
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10, !11}
+!llvm.ident = !{!12}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 349508) (llvm/trunk 349520)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, imports: !3, nameTableKind: None)
+!1 = !DIFile(filename: "test.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !8, file: !1, line: 2)
+!5 = distinct !DISubprogram(name: "test", linkageName: "_Z4testv", scope: !1, file: !1, line: 2, type: !6, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null}
+!8 = !DINamespace(name: "ns1", scope: null)
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{i32 1, !"wchar_size", i32 4}
+!12 = !{!"clang version 8.0.0 (trunk 349508) (llvm/trunk 349520)"}
+!13 = !DILocation(line: 2, column: 36, scope: !5)
+
diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
index 5883d2b0c4087fb82b673ca180e104ad16fce534..1a1dc22870adfc3fe6639dd18183e45497da8222 100644
--- a/test/DebugInfo/X86/fission-ranges.ll
+++ b/test/DebugInfo/X86/fission-ranges.ll
@@ -8,8 +8,8 @@
 ; CHECK: .debug_info contents:
 ; CHECK: DW_TAG_compile_unit
 ; CHECK-NEXT: DW_AT_stmt_list
-; CHECK-NEXT: DW_AT_GNU_dwo_name
 ; CHECK-NEXT: DW_AT_comp_dir
+; CHECK-NEXT: DW_AT_GNU_dwo_name
 ; CHECK-NEXT: DW_AT_GNU_dwo_id
 ; CHECK-NEXT: DW_AT_GNU_ranges_base
 ; CHECK-NEXT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]                   (0x00000000)
diff --git a/test/DebugInfo/X86/gnu-public-names-multiple-cus-2.s b/test/DebugInfo/X86/gnu-public-names-multiple-cus-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..7bdaefe49b5e0c6459c7f730522f706046b3010f
--- /dev/null
+++ b/test/DebugInfo/X86/gnu-public-names-multiple-cus-2.s
@@ -0,0 +1,35 @@
+# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
+# RUN: llvm-dwarfdump -debug-gnu-pubnames %t.o | FileCheck %s
+
+# CHECK: unit_offset = 0x00000000
+# CHECK: unit_offset = 0x0000000c
+
+	.section .debug_abbrev,"",@progbits
+	.byte	1
+
+	.section .debug_info,"",@progbits
+.Lcu_begin0:
+	.long	8           # Length of Unit
+	.short	4           # DWARF version number
+	.long	0           # Offset Into Abbrev. Section
+	.byte	4           # Address Size
+	.byte	0           # NULL
+.Lcu_begin1:
+	.long	8           # Length of Unit
+	.short	4           # DWARF version number
+	.long	0           # Offset Into Abbrev. Section
+	.byte	4           # Address Size
+	.byte	0           # NULL
+
+	.section .debug_gnu_pubnames,"",@progbits
+	.long	14
+	.short	2           # DWARF Version
+	.long	.Lcu_begin0
+	.long	12          # Compilation Unit Length
+	.long	0
+
+	.long	14
+	.short	2           # DWARF Version
+	.long	.Lcu_begin1
+	.long	12          # Compilation Unit Length
+	.long	0
diff --git a/test/DebugInfo/X86/gnu-public-names.ll b/test/DebugInfo/X86/gnu-public-names.ll
index 3dc639ca696e5e1968c6024e1e07577dbe673345..a7ac9e2cb0b1846458b6611409c4bc363d185ac6 100644
--- a/test/DebugInfo/X86/gnu-public-names.ll
+++ b/test/DebugInfo/X86/gnu-public-names.ll
@@ -65,11 +65,11 @@
 ; }
 
 ; ASM: .section        .debug_gnu_pubnames
-; ASM: .byte   32                      # Kind: VARIABLE, EXTERNAL
+; ASM: .byte   32                      # Attributes: VARIABLE, EXTERNAL
 ; ASM-NEXT: .asciz  "global_variable"       # External Name
 
 ; ASM: .section        .debug_gnu_pubtypes
-; ASM: .byte   16                      # Kind: TYPE, EXTERNAL
+; ASM: .byte   16                      # Attributes: TYPE, EXTERNAL
 ; ASM-NEXT: .asciz  "C"                     # External Name
 
 ; CHECK: .debug_info contents:
diff --git a/test/DebugInfo/X86/length_symbol_difference.ll b/test/DebugInfo/X86/length_symbol_difference.ll
new file mode 100644
index 0000000000000000000000000000000000000000..78684e788cfad8f04055a3ec49dc9a674dc978a6
--- /dev/null
+++ b/test/DebugInfo/X86/length_symbol_difference.ll
@@ -0,0 +1,30 @@
+; RUN: llc -filetype=asm -O0 -mtriple=x86_64-linux-gnu < %s | FileCheck %s
+
+; CHECK:      .long   .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+; CHECK-NEXT: .Ldebug_info_start0:
+; CHECK-NOT:  .byte   0
+; CHECK:      .byte   0                       # End Of Children Mark
+; CHECK-NEXT: .Ldebug_info_end0:
+; CHECK-NEXT: .section
+
+
+define dso_local void @_Z2f1v() !dbg !7 {
+entry:
+  ret void, !dbg !10
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 349394) (llvm/trunk 349377)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "foo.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 349394) (llvm/trunk 349377)"}
+!7 = distinct !DISubprogram(name: "f1", linkageName: "_Z2f1v", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocation(line: 1, column: 12, scope: !7)
diff --git a/test/DebugInfo/X86/live-debug-values.ll b/test/DebugInfo/X86/live-debug-values.ll
index 4884733af3cf4060c1e95f414ca666e692cfd396..fc8517a671cedd25806f15b2bcfcb29a949febdb 100644
--- a/test/DebugInfo/X86/live-debug-values.ll
+++ b/test/DebugInfo/X86/live-debug-values.ll
@@ -33,7 +33,7 @@
 ; CHECK-NEXT:  #DEBUG_VALUE: main:n <- $ebx
 ;   Other register values have been clobbered.
 ; CHECK-NOT:   #DEBUG_VALUE:
-; CHECK:         movl    %esi, m(%rip)
+; CHECK:         movl    %e{{..}}, m(%rip)
 
 ; ModuleID = 'LiveDebugValues.c'
 source_filename = "test/DebugInfo/X86/live-debug-values.ll"
diff --git a/test/DebugInfo/X86/pieces-3.ll b/test/DebugInfo/X86/pieces-3.ll
index e67e51e292b7775588967b9f90a798eccf68cfd5..0f705f14f8f3a45e42b3555d1917b3b808795c46 100644
--- a/test/DebugInfo/X86/pieces-3.ll
+++ b/test/DebugInfo/X86/pieces-3.ll
@@ -40,8 +40,8 @@ define i32 @foo(i64 %outer.coerce0, i64 %outer.coerce1) #0 !dbg !4 {
   call void @llvm.dbg.value(metadata i32 %outer.sroa.1.8.extract.trunc, metadata !34, metadata !35), !dbg !33
   %outer.sroa.1.12.extract.shift = lshr i64 %outer.coerce1, 32, !dbg !33
   %outer.sroa.1.12.extract.trunc = trunc i64 %outer.sroa.1.12.extract.shift to i32, !dbg !33
-  call void @llvm.dbg.value(metadata i64 %outer.sroa.1.12.extract.shift, metadata !34, metadata !35), !dbg !33
-  call void @llvm.dbg.value(metadata i32 %outer.sroa.1.12.extract.trunc, metadata !34, metadata !35), !dbg !33
+  call void @llvm.dbg.value(metadata i64 %outer.sroa.1.12.extract.shift, metadata !34, metadata !37), !dbg !33
+  call void @llvm.dbg.value(metadata i32 %outer.sroa.1.12.extract.trunc, metadata !34, metadata !37), !dbg !33
   call void @llvm.dbg.declare(metadata !{null}, metadata !34, metadata !35), !dbg !33
   ret i32 %outer.sroa.1.8.extract.trunc, !dbg !36
 }
@@ -99,3 +99,4 @@ attributes #2 = { nounwind }
 !34 = !DILocalVariable(name: "i1", line: 11, scope: !4, file: !5, type: !14)
 !35 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 !36 = !DILocation(line: 12, scope: !4)
+!37 = !DIExpression(DW_OP_LLVM_fragment, 32, 32)
diff --git a/test/DebugInfo/X86/range_reloc.ll b/test/DebugInfo/X86/range_reloc.ll
index 5c40cf31944eb15e6826f357431538304011ef37..37ed4a76fd8cdcdf18fb3f13a8124997f53d6a10 100644
--- a/test/DebugInfo/X86/range_reloc.ll
+++ b/test/DebugInfo/X86/range_reloc.ll
@@ -1,5 +1,4 @@
-; RUN: llc -filetype=asm -mtriple=x86_64-pc-linux-gnu %s -o - -use-dwarf-ranges-base-address-specifier | FileCheck --check-prefix=COMMON --check-prefix=BASE %s
-; RUN: llc -filetype=asm -mtriple=x86_64-pc-linux-gnu %s -o - | FileCheck --check-prefix=COMMON --check-prefix=NOBASE %s
+; RUN: llc -filetype=asm -mtriple=x86_64-pc-linux-gnu %s -o - | FileCheck %s
 ; RUN: llc -filetype=asm -mtriple=x86_64-pc-linux-gnu %s -o - -dwarf-version 5 | FileCheck --check-prefix=DWARF5 %s
 
 ; Group ranges in a range list that apply to the same section and use a base
@@ -15,38 +14,51 @@
 ; in the linked executable. Without compression in the objects, the win would be
 ; smaller (the growth of debug_ranges itself would be more significant).
 
-; COMMON: {{^.Ldebug_ranges0}}
-; NOBASE-NEXT:   .quad   .Lfunc_begin0
-; NOBASE-NEXT:   .quad   .Lfunc_end0
-; NOBASE-NEXT:   .quad   .Lfunc_begin1
-; NOBASE-NEXT:   .quad   .Lfunc_end1
-; NOBASE-NEXT:   .quad   .Lfunc_begin3
-; NOBASE-NEXT:   .quad   .Lfunc_end3
-; NOBASE-NEXT:   .quad   .Lfunc_begin4
-; NOBASE-NEXT:   .quad   .Lfunc_end4
-; NOBASE-NEXT:   .quad   .Lfunc_begin5
-; NOBASE-NEXT:   .quad   .Lfunc_end5
-
-; BASE-NEXT:   .quad   -1
-; BASE-NEXT:   .quad   .Lfunc_begin0
-; BASE-NEXT:   .quad   .Lfunc_begin0-.Lfunc_begin0
-; BASE-NEXT:   .quad   .Lfunc_end0-.Lfunc_begin0
-; BASE-NEXT:   .quad   -1
-; BASE-NEXT:   .quad   .Lfunc_begin1
-; BASE-NEXT:   .quad   .Lfunc_begin1-.Lfunc_begin1
-; BASE-NEXT:   .quad   .Lfunc_end1-.Lfunc_begin1
-; BASE-NEXT:   .quad   .Lfunc_begin3-.Lfunc_begin1
-; BASE-NEXT:   .quad   .Lfunc_end3-.Lfunc_begin1
-; BASE-NEXT:   .quad   -1
-; BASE-NEXT:   .quad   .Lfunc_begin4
-; BASE-NEXT:   .quad   .Lfunc_begin4-.Lfunc_begin4
-; BASE-NEXT:   .quad   .Lfunc_end4-.Lfunc_begin4
-; BASE-NEXT:   .quad   -1
-; BASE-NEXT:   .quad   .Lfunc_begin5
-; BASE-NEXT:   .quad   .Lfunc_begin5-.Lfunc_begin5
-; BASE-NEXT:   .quad   .Lfunc_end5-.Lfunc_begin5
-; COMMON-NEXT:   .quad   0
-; COMMON-NEXT:   .quad   0
+; This is a merged module containing two CUs, one that uses range base address
+; specifiers and exercises different cases there, and another that does not
+
+; ranges.cpp
+; Single range entry
+; __attribute__((section("a"))) void f1() {}
+; Single address with two ranges due to the whole caused by f3
+; __attribute__((section("b"))) void f2() {}
+; __attribute__((section("b"))) __attribute__((nodebug)) void f3() {}
+; __attribute__((section("b"))) void f4() {}
+; Reset the base address & emit a couple more single range entries
+; __attribute__((section("c"))) void f5() {}
+; __attribute__((section("d"))) void f6() {}
+; ranges_no_base.cpp:
+; Include enough complexity to cause ranges to be emitted, so it can be checked
+; that those ranges don't use base address specifiers
+; __attribute__((section("e"))) void f7() {}
+; __attribute__((section("f"))) void f8() {}
+
+; CHECK: {{^.Ldebug_ranges0}}
+; CHECK-NEXT:   .quad   -1
+; CHECK-NEXT:   .quad   .Lfunc_begin0
+; CHECK-NEXT:   .quad   .Lfunc_begin0-.Lfunc_begin0
+; CHECK-NEXT:   .quad   .Lfunc_end0-.Lfunc_begin0
+; CHECK-NEXT:   .quad   -1
+; CHECK-NEXT:   .quad   .Lfunc_begin1
+; CHECK-NEXT:   .quad   .Lfunc_begin1-.Lfunc_begin1
+; CHECK-NEXT:   .quad   .Lfunc_end1-.Lfunc_begin1
+; CHECK-NEXT:   .quad   .Lfunc_begin3-.Lfunc_begin1
+; CHECK-NEXT:   .quad   .Lfunc_end3-.Lfunc_begin1
+; CHECK-NEXT:   .quad   -1
+; CHECK-NEXT:   .quad   .Lfunc_begin4
+; CHECK-NEXT:   .quad   .Lfunc_begin4-.Lfunc_begin4
+; CHECK-NEXT:   .quad   .Lfunc_end4-.Lfunc_begin4
+; CHECK-NEXT:   .quad   -1
+; CHECK-NEXT:   .quad   .Lfunc_begin5
+; CHECK-NEXT:   .quad   .Lfunc_begin5-.Lfunc_begin5
+; CHECK-NEXT:   .quad   .Lfunc_end5-.Lfunc_begin5
+; CHECK-NEXT:   .quad   0
+; CHECK-NEXT:   .quad   0
+; CHECK-NEXT: {{^.Ldebug_ranges1}}
+; CHECK-NEXT:   .quad   .Lfunc_begin6
+; CHECK-NEXT:   .quad   .Lfunc_end6
+; CHECK-NEXT:   .quad   .Lfunc_begin7
+; CHECK-NEXT:   .quad   .Lfunc_end7
 
 ; DWARF5: {{^.Ldebug_ranges0}}
 ; DWARF5-NEXT:                                      # DW_RLE_startx_length
@@ -67,65 +79,89 @@
 ; DWARF5-NEXT: .byte 4                              #   start index
 ; DWARF5-NEXT: .uleb128 .Lfunc_end5-.Lfunc_begin5   #   length
 ; DWARF5-NEXT:                                      # DW_RLE_end_of_list
+; DWARF5-NEXT: {{^.Ldebug_ranges1}}
+; DWARF5-NEXT:                                      # DW_RLE_startx_length
+; DWARF5-NEXT: .byte 5                              #   start index
+; DWARF5-NEXT: .uleb128 .Lfunc_end6-.Lfunc_begin6   #   length
+; DWARF5-NEXT:                                      # DW_RLE_startx_length
+; DWARF5-NEXT: .byte 6                              #   start index
+; DWARF5-NEXT: .uleb128 .Lfunc_end7-.Lfunc_begin7   #   length
+; DWARF5-NEXT:                                      # DW_RLE_end_of_list
 
 ; Function Attrs: noinline nounwind optnone uwtable
-define void @_Z2f1v() #0 section "a" !dbg !7 {
+define dso_local void @_Z2f1v() section "a" !dbg !9 {
 entry:
-  ret void, !dbg !10
+  ret void, !dbg !12
 }
 
 ; Function Attrs: noinline nounwind optnone uwtable
-define void @_Z2f2v() #0 section "b" !dbg !11 {
+define dso_local void @_Z2f2v() section "b" !dbg !13 {
 entry:
-  ret void, !dbg !12
+  ret void, !dbg !14
 }
 
 ; Function Attrs: noinline nounwind optnone uwtable
-define void @_Z2f3v() #0 section "b" {
+define dso_local void @_Z2f3v() section "b" {
 entry:
   ret void
 }
 
 ; Function Attrs: noinline nounwind optnone uwtable
-define void @_Z2f4v() #0 section "b" !dbg !13 {
+define dso_local void @_Z2f4v() section "b" !dbg !15 {
 entry:
-  ret void, !dbg !14
+  ret void, !dbg !16
 }
 
 ; Function Attrs: noinline nounwind optnone uwtable
-define void @_Z2f5v() #0 section "e" !dbg !15 {
+define dso_local void @_Z2f5v() section "c" !dbg !17 {
 entry:
-  ret void, !dbg !16
+  ret void, !dbg !18
 }
 
 ; Function Attrs: noinline nounwind optnone uwtable
-define void @_Z2f6v() #0 section "f" !dbg !17 {
+define dso_local void @_Z2f6v() section "d" !dbg !19 {
 entry:
-  ret void, !dbg !18
+  ret void, !dbg !20
+}
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_Z2f7v() section "e" !dbg !21 {
+entry:
+  ret void, !dbg !22
 }
 
-attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_Z2f8v() section "f" !dbg !23 {
+entry:
+  ret void, !dbg !24
+}
 
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
+!llvm.dbg.cu = !{!0, !3}
+!llvm.ident = !{!5, !5}
+!llvm.module.flags = !{!6, !7, !8}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 309523) (llvm/trunk 309526)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "funcs.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 346343) (llvm/trunk 346350)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None, debugBaseAddress: true)
+!1 = !DIFile(filename: "ranges.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
 !2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 6.0.0 (trunk 309523) (llvm/trunk 309526)"}
-!7 = distinct !DISubprogram(name: "f1", linkageName: "_Z2f1v", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
-!8 = !DISubroutineType(types: !9)
-!9 = !{null}
-!10 = !DILocation(line: 1, column: 42, scope: !7)
-!11 = distinct !DISubprogram(name: "f2", linkageName: "_Z2f2v", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
-!12 = !DILocation(line: 2, column: 42, scope: !11)
-!13 = distinct !DISubprogram(name: "f4", linkageName: "_Z2f4v", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
-!14 = !DILocation(line: 4, column: 42, scope: !13)
-!15 = distinct !DISubprogram(name: "f5", linkageName: "_Z2f5v", scope: !1, file: !1, line: 5, type: !8, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
-!16 = !DILocation(line: 5, column: 42, scope: !15)
-!17 = distinct !DISubprogram(name: "f6", linkageName: "_Z2f6v", scope: !1, file: !1, line: 6, type: !8, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
-!18 = !DILocation(line: 6, column: 42, scope: !17)
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 8.0.0 (trunk 346343) (llvm/trunk 346350)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!4 = !DIFile(filename: "ranges_no_base.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!5 = !{!"clang version 8.0.0 (trunk 346343) (llvm/trunk 346350)"}
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = distinct !DISubprogram(name: "f1", linkageName: "_Z2f1v", scope: !1, file: !1, line: 1, type: !10, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!10 = !DISubroutineType(types: !11)
+!11 = !{null}
+!12 = !DILocation(line: 1, column: 42, scope: !9)
+!13 = distinct !DISubprogram(name: "f2", linkageName: "_Z2f2v", scope: !1, file: !1, line: 2, type: !10, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!14 = !DILocation(line: 2, column: 42, scope: !13)
+!15 = distinct !DISubprogram(name: "f4", linkageName: "_Z2f4v", scope: !1, file: !1, line: 4, type: !10, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!16 = !DILocation(line: 4, column: 42, scope: !15)
+!17 = distinct !DISubprogram(name: "f5", linkageName: "_Z2f5v", scope: !1, file: !1, line: 5, type: !10, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!18 = !DILocation(line: 5, column: 42, scope: !17)
+!19 = distinct !DISubprogram(name: "f6", linkageName: "_Z2f6v", scope: !1, file: !1, line: 6, type: !10, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!20 = !DILocation(line: 6, column: 42, scope: !19)
+!21 = distinct !DISubprogram(name: "f7", linkageName: "_Z2f7v", scope: !4, file: !4, line: 1, type: !10, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !3, retainedNodes: !2)
+!22 = !DILocation(line: 1, column: 42, scope: !21)
+!23 = distinct !DISubprogram(name: "f8", linkageName: "_Z2f8v", scope: !4, file: !4, line: 2, type: !10, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !3, retainedNodes: !2)
+!24 = !DILocation(line: 2, column: 42, scope: !23)
diff --git a/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll b/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
index 0266f6108e7a7c648891953bab07a0d3d0af6270..ca7cfb825716cd54dc6d21983a1c81cad2ef717b 100644
--- a/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
+++ b/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
@@ -63,9 +63,9 @@ target triple = "x86_64-apple-macosx10.4.0"
 @S = global %struct.SS { i32 23, i32 -17 }, align 4, !dbg !0
 
 ; Verify that the def comes before the for foo1.
-; TODO: Currently dbg.value for bar1 is dropped(?), is that expected?
 define i32 @test1() local_unnamed_addr #0 !dbg !17 {
 ; CHECK-LABEL: bb.0.entry1
+; CHECK-NEXT:    DBG_VALUE 0, $noreg, ![[BAR1]], !DIExpression()
 ; CHECK-NEXT:    [[REG1:%[0-9]+]]:gr64 =
 ; CHECK-NEXT:    DBG_VALUE [[REG1]], $noreg, ![[FOO1]], !DIExpression()
 entry1:
@@ -99,11 +99,9 @@ entry3:
 }
 
 ; Verify that the def comes before the for bar4.
-; TODO: Currently dbg.value for foo4 is dropped. It is set to null and not
-;       used. Just like in test1 it can be discussed if there should be a
-;       DBG_VALUE for foo4 here.
 define i32 @test4() local_unnamed_addr #0 !dbg !40 {
 ; CHECK-LABEL: bb.0.entry4
+; CHECK-NEXT:    DBG_VALUE 0, $noreg, ![[FOO4]], !DIExpression()
 ; CHECK-NEXT:    [[REG4:%[0-9]+]]:gr64 =
 ; CHECK-NEXT:    DBG_VALUE [[REG4]], $noreg, ![[BAR4]], !DIExpression()
 entry4:
@@ -114,10 +112,9 @@ entry4:
 }
 
 ; Verify that we do not get a DBG_VALUE that maps foo5 to @S here.
-; TODO: foo5 is set to null, and it is not really used. Just like in test1 it
-;       can be discussed if there should be a DBG_VALUE for foo5 here.
 define i32 @test5() local_unnamed_addr #0 !dbg !47 {
 ; CHECK-LABEL: bb.0.entry5:
+; CHECK-NEXT:    DBG_VALUE 0, $noreg, ![[FOO5]], !DIExpression()
 ; CHECK-NEXT:    [[REG5:%[0-9]+]]:gr64 =
 ; CHECK-NEXT:    DBG_VALUE [[REG5]], $noreg, ![[BAR5]], !DIExpression()
 ; CHECK-NOT:     DBG_VALUE [[REG5]], $noreg, ![[FOO5]], !DIExpression()
diff --git a/test/DebugInfo/X86/string-offsets-multiple-cus.ll b/test/DebugInfo/X86/string-offsets-multiple-cus.ll
index 2f479fa5af074fdb581016bda0df44da62a3f975..7851dbfefd5556521dc1fb6456b0db5acf66ae7e 100644
--- a/test/DebugInfo/X86/string-offsets-multiple-cus.ll
+++ b/test/DebugInfo/X86/string-offsets-multiple-cus.ll
@@ -31,39 +31,13 @@
 ; with the correct offsets. Check that strings referenced by compile units 2 and 3
 ; are displayed correctly.
 ;
-; CU 1
 ; BOTH:        .debug_info contents:
-; BOTH-NOT:    .contents:
-; BOTH:        DW_TAG_compile_unit
-; BOTH-NOT:    {{DW_TAG|NULL}}
-; BOTH:        DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF:[0-9a-f]+]])
-;
-; CU 2
-; BOTH-NOT:    contents:
-; BOTH:        DW_TAG_compile_unit
-; BOTH-NOT:    {{DW_TAG|NULL}}
-; BOTH:        DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF]])
-; BOTH-NOT:    NULL
-; BOTH:        DW_TAG_variable
-; BOTH-NOT:    {{DW_TAG|NULL}}
-; BOTH:        DW_AT_name [DW_FORM_strx1] ( indexed (00000009) string = "glob2")
-;
-; CU 3
-; BOTH-NOT:    contents:
-; BOTH:        DW_TAG_compile_unit
-; BOTH-NOT:    {{DW_TAG|NULL}}
-; BOTH:        DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF]])
-; BOTH-NOT:    NULL
-; BOTH:        DW_TAG_variable
-; BOTH-NOT:    {{DW_TAG|NULL}}
-; BOTH:        DW_AT_name [DW_FORM_strx1] ( indexed (0000000f) string = "glob3")
 ;
 ; Verify that all 3 type units have the proper DW_AT_str_offsets_base attribute.
-; TYPEUNITS:      .debug_types contents:
 ; TYPEUNITS-NOT:  contents:
 ; TYPEUNITS:      DW_TAG_type_unit
 ; TYPEUNITS-NOT:  {{DW_TAG|NULL}}
-; TYPEUNITS:      DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF]])
+; TYPEUNITS:      DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF:[0-9a-f]+]])
 ; TYPEUNITS-NOT:  NULL
 ; TYPEUNITS:      DW_TAG_enumerator
 ; TYPEUNITS-NOT:  NULL
@@ -84,6 +58,33 @@
 ; TYPEUNITS-NOT:  NULL
 ; TYPEUNITS:      DW_TAG_enumeration_type
 ; TYPEUNITS:      DW_AT_name [DW_FORM_strx1] ( indexed (00000013) string = "E3")
+
+; CU 1
+; BOTH-NOT:    .contents:
+; BOTH:        DW_TAG_compile_unit
+; BOTH-NOT:    {{DW_TAG|NULL}}
+; DEFAULT:     DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF:[0-9a-f]+]])
+; TYPEUNITS:   DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF]])
+;
+; CU 2
+; BOTH-NOT:    contents:
+; BOTH:        DW_TAG_compile_unit
+; BOTH-NOT:    {{DW_TAG|NULL}}
+; BOTH:        DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF]])
+; BOTH-NOT:    NULL
+; BOTH:        DW_TAG_variable
+; BOTH-NOT:    {{DW_TAG|NULL}}
+; BOTH:        DW_AT_name [DW_FORM_strx1] ( indexed (00000009) string = "glob2")
+;
+; CU 3
+; BOTH-NOT:    contents:
+; BOTH:        DW_TAG_compile_unit
+; BOTH-NOT:    {{DW_TAG|NULL}}
+; BOTH:        DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x[[CU1_STROFF]])
+; BOTH-NOT:    NULL
+; BOTH:        DW_TAG_variable
+; BOTH-NOT:    {{DW_TAG|NULL}}
+; BOTH:        DW_AT_name [DW_FORM_strx1] ( indexed (0000000f) string = "glob3")
 ;
 ; Extract the offset of a string to verify that it is referenced in the string
 ; offsets section.
diff --git a/test/DebugInfo/X86/string-offsets-table-order.ll b/test/DebugInfo/X86/string-offsets-table-order.ll
index 79c0ffcaa3ef7b1ab7d22d15c5fc5a425a53ca31..1378b134659c76c936284bcb00b5552d3567eda3 100644
--- a/test/DebugInfo/X86/string-offsets-table-order.ll
+++ b/test/DebugInfo/X86/string-offsets-table-order.ll
@@ -13,11 +13,11 @@
 
 ; CHECK: .debug_info contents:
 ; CHECK:   DW_TAG_compile_unit
-; CHECK:     DW_AT_comp_dir [DW_FORM_strx1] ( indexed (00000001) string = "X3")
+; CHECK:     DW_AT_comp_dir [DW_FORM_strx1] ( indexed (00000000) string = "X3")
 ; CHECK:   DW_TAG_compile_unit
-; CHECK:     DW_AT_comp_dir [DW_FORM_strx1] ( indexed (00000002) string = "X2")
+; CHECK:     DW_AT_comp_dir [DW_FORM_strx1] ( indexed (00000001) string = "X2")
 ; CHECK:   DW_TAG_compile_unit
-; CHECK:     DW_AT_comp_dir [DW_FORM_strx1] ( indexed (00000003) string = "X1")
+; CHECK:     DW_AT_comp_dir [DW_FORM_strx1] ( indexed (00000002) string = "X1")
 ; CHECK: .debug_info.dwo contents:
 
 ; CHECK: .debug_str contents:
@@ -27,10 +27,10 @@
 
 ; CHECK: .debug_str_offsets contents:
 ; CHECK: Format = DWARF32, Version = 5
-; CHECK-NEXT: 00000000 "foo.dwo"
 ; CHECK-NEXT: [[X3]] "X3"
 ; CHECK-NEXT: [[X2]] "X2"
 ; CHECK-NEXT: [[X1]] "X1"
+; CHECK-NEXT: "foo.dwo"
 ; CHECK-EMPTY:
 
 
diff --git a/test/DebugInfo/X86/string-offsets-table.ll b/test/DebugInfo/X86/string-offsets-table.ll
index b0615106de420a637abde94daec8e58bf5e11506..ae7797a3055625cb12952e1c7810f8250474d2de 100644
--- a/test/DebugInfo/X86/string-offsets-table.ll
+++ b/test/DebugInfo/X86/string-offsets-table.ll
@@ -59,8 +59,8 @@
 ; SPLIT:      DW_TAG_compile_unit
 ; SPLIT-NOT:  {{DW_TAG|contents:}}
 ; SPLIT:      DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
-; SPLIT:      DW_AT_GNU_dwo_name [DW_FORM_strx1] ( indexed (00000000) string = "foo.dwo")
-; SPLIT:      DW_AT_comp_dir [DW_FORM_strx1] ( indexed (00000001) string = "/home/test")
+; SPLIT:      DW_AT_comp_dir [DW_FORM_strx1] ( indexed (00000000) string = "/home/test")
+; SPLIT:      DW_AT_GNU_dwo_name [DW_FORM_strx1] ( indexed (00000001) string = "foo.dwo")
 
 ; Check for the split CU in .debug_info.dwo.
 ; SPLIT:      .debug_info.dwo contents:
@@ -73,18 +73,18 @@
 ; SPLIT-NOT:  contents:
 ; SPLIT:      DW_TAG_enumerator
 ; SPLIT-NOT:  {{DW_TAG|NULL}}
-; SPLIT:      DW_AT_name [DW_FORM_strx1]    ( indexed (00000004) string = "a")
+; SPLIT:      DW_AT_name [DW_FORM_strx1]    ( indexed (00000001) string = "a")
 ; SPLIT-NOT:  contents:
 ; SPLIT:      DW_TAG_enumerator
 ; SPLIT-NOT:  {{DW_TAG|NULL}}
-; SPLIT:      DW_AT_name [DW_FORM_strx1]    ( indexed (00000005) string = "b")
+; SPLIT:      DW_AT_name [DW_FORM_strx1]    ( indexed (00000002) string = "b")
 ;
 ; Extract the string offsets referenced in the main file by the skeleton unit.
 ; SPLIT:      .debug_str contents:
-; SPLIT-NEXT: 0x00000000: "foo.dwo"
-; SPLIT-NEXT: 0x[[STRING2SPLIT:[0-9a-f]*]]: "/home/test"
-; SPLIT-NEXT: 0x[[STRING3SPLIT:[0-9a-f]*]]: "E"
-; SPLIT-NEXT: 0x[[STRING4SPLIT:[0-9a-f]*]]: "glob"
+; SPLIT-NEXT: 0x[[STRHOMETESTSPLIT:[0-9a-f]*]]: "/home/test"
+; SPLIT-NEXT: 0x[[STRESPLIT:[0-9a-f]*]]: "E"
+; SPLIT-NEXT: 0x[[STRGLOBSPLIT:[0-9a-f]*]]: "glob"
+; SPLIT-NEXT: 0x[[STRFOODWOSPLIT:[0-9a-f]*]]: "foo.dwo"
 ;
 ; Extract the string offsets referenced in the .dwo file by the split unit.
 ; SPLIT:      .debug_str.dwo contents:
@@ -98,8 +98,8 @@
 ; referenced by the debug info.
 ; SPLIT:      .debug_str_offsets contents:
 ; SPLIT-NEXT: 0x00000000: Contribution size = 12, Format = DWARF32, Version = 5
-; SPLIT-NEXT: 0x00000008: 00000000 "foo.dwo"
-; SPLIT-NEXT: 0x0000000c: [[STRING2SPLIT]] "/home/test"
+; SPLIT-NEXT: 0x00000008: [[STRHOMETESTSPLIT]] "/home/test"
+; SPLIT-NEXT: 0x0000000c: [[STRFOODWOSPLIT]] "foo.dwo"
 ; SPLIT-EMPTY:
 
 ; SPLIT:      .debug_str_offsets.dwo contents:
diff --git a/test/DebugInfo/debugify.ll b/test/DebugInfo/debugify.ll
index 07c62533fad86cd4e37b45f404f3295741ed8fde..16dd635858116b2a187b3f6441db1b5940fa2bc5 100644
--- a/test/DebugInfo/debugify.ll
+++ b/test/DebugInfo/debugify.ll
@@ -63,8 +63,8 @@ define i32 @boom() {
 
 ; CHECK-DAG: ![[CU]] = distinct !DICompileUnit(language: DW_LANG_C, file: {{.*}}, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: {{.*}})
 ; CHECK-DAG: !DIFile(filename: "<stdin>", directory: "/")
-; CHECK-DAG: distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: {{.*}}, line: 1, type: {{.*}}, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: {{.*}}, retainedNodes: {{.*}})
-; CHECK-DAG: distinct !DISubprogram(name: "bar", linkageName: "bar", scope: null, file: {{.*}}, line: 2, type: {{.*}}, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: true, unit: {{.*}}, retainedNodes: {{.*}})
+; CHECK-DAG: distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: {{.*}}, line: 1, type: {{.*}}, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: {{.*}}, retainedNodes: {{.*}})
+; CHECK-DAG: distinct !DISubprogram(name: "bar", linkageName: "bar", scope: null, file: {{.*}}, line: 2, type: {{.*}}, scopeLine: 2, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: {{.*}}, retainedNodes: {{.*}})
 
 ; --- DILocations
 ; CHECK-DAG: ![[RET1]] = !DILocation(line: 1, column: 1
diff --git a/test/Demangle/invalid-manglings.test b/test/Demangle/invalid-manglings.test
new file mode 100644
index 0000000000000000000000000000000000000000..365b14cd1a557e1dcf03d26446f402701ed506fe
--- /dev/null
+++ b/test/Demangle/invalid-manglings.test
@@ -0,0 +1,11 @@
+; Run llvm-undname with invalid inputs and make sure it doesn't crash.
+; RUN: llvm-undname < %s 2>&1 | FileCheck %s
+
+?ff@@$$J0YAXAU?$AS_@$0A@PEAU?$AS_@$0A@H@__clang@@@__clang@@@Z
+; CHECK: ?ff@@$$J0YAXAU?$AS_@$0A@PEAU?$AS_@$0A@H@__clang@@@__clang@@@Z
+; CHECK-NEXT: error: Invalid mangled name
+
+?f0@@YAXPEU?$AS_@$00$$CAD@__clang@@@Z
+; CHECK-EMPTY:
+; CHECK-NEXT: ?f0@@YAXPEU?$AS_@$00$$CAD@__clang@@@Z
+; CHECK-NEXT: error: Invalid mangled name
\ No newline at end of file
diff --git a/test/Demangle/ms-cxx11.test b/test/Demangle/ms-cxx11.test
index fa756a54bb802433ceec8583dd680eef7082600d..b9815da7f2aa4acba757692f5e06d2cde143d257 100644
--- a/test/Demangle/ms-cxx11.test
+++ b/test/Demangle/ms-cxx11.test
@@ -111,10 +111,10 @@
 ; CHECK: int __cdecl PR18204::f<union PR18204::<unnamed-type-$S1>>(union PR18204::<unnamed-type-$S1> *)
 
 ??R<lambda_0>@?0??PR26105@@YAHXZ@QBE@H@Z
-; CHECK: __thiscall `int __cdecl PR26105(void)'::`1'::<lambda_0>::operator()(int) const
+; CHECK: public: __thiscall `int __cdecl PR26105(void)'::`1'::<lambda_0>::operator()(int) const
 
 ??R<lambda_1>@?0???R<lambda_0>@?0??PR26105@@YAHXZ@QBE@H@Z@QBE@H@Z
-; CHECK: __thiscall `__thiscall `int __cdecl PR26105(void)'::`1'::<lambda_0>::operator()(int) const'::`1'::<lambda_1>::operator()(int) const
+; CHECK: public: __thiscall `public: __thiscall `int __cdecl PR26105(void)'::`1'::<lambda_0>::operator()(int) const'::`1'::<lambda_1>::operator()(int) const
 
 ?unaligned_foo1@@YAPFAHXZ
 ; CHECK: int __unaligned * __cdecl unaligned_foo1(void)
@@ -144,7 +144,7 @@
 ; CHECK: __thiscall PR31197::A::x::<lambda_1>::operator()(void) const
 
 ?white@?1???R<lambda_1>@x@A@PR31197@@QBE@XZ@4HA
-; CHECK: int `__thiscall PR31197::A::x::<lambda_1>::operator()(void) const'::`2'::white
+; CHECK: int `public: __thiscall PR31197::A::x::<lambda_1>::operator()(void) const'::`2'::white
 
 ?f@@YAXW4<unnamed-enum-enumerator>@@@Z
 ; CHECK: void __cdecl f(enum <unnamed-enum-enumerator>)
diff --git a/test/Demangle/ms-cxx14.test b/test/Demangle/ms-cxx14.test
index a630db86296fbeacafeb8704f956cbdf1816284c..8fef3e6b9d1fbccbc3d5e34702140168f2a1be17 100644
--- a/test/Demangle/ms-cxx14.test
+++ b/test/Demangle/ms-cxx14.test
@@ -16,10 +16,10 @@
 ; CHECK: <auto> __thiscall <lambda_0>::operator()(void) const
 
 ?ValueFromLambdaWithLocalType@@3ULocalType@?1???R<lambda_0>@@QBE?A?<auto>@@XZ@A
-; CHECK: struct `<auto> __thiscall <lambda_0>::operator()(void) const'::`2'::LocalType ValueFromLambdaWithLocalType
+; CHECK: struct `public: <auto> __thiscall <lambda_0>::operator()(void) const'::`2'::LocalType ValueFromLambdaWithLocalType
 
 ?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?2???R<lambda_1>@?0???$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A
-; CHECK: struct `<auto> __thiscall `<auto> __cdecl TemplateFuncionWithLocalLambda<int>(int)'::`1'::<lambda_1>::operator()(void) const'::`3'::LocalType ValueFromTemplateFuncionWithLocalLambda
+; CHECK: struct `public: <auto> __thiscall `<auto> __cdecl TemplateFuncionWithLocalLambda<int>(int)'::`1'::<lambda_1>::operator()(void) const'::`3'::LocalType ValueFromTemplateFuncionWithLocalLambda
 
 ??$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z
 ; CHECK: <auto> __cdecl TemplateFuncionWithLocalLambda<int>(int)
diff --git a/test/Demangle/ms-nested-scopes.test b/test/Demangle/ms-nested-scopes.test
index 952b138630ccec9ad4fb0171ebe3b77f35fed140..450ab6662aaf467f1310fcb80476dc7b09de31b6 100644
--- a/test/Demangle/ms-nested-scopes.test
+++ b/test/Demangle/ms-nested-scopes.test
@@ -81,10 +81,10 @@
 ; CHECK: int `int __cdecl M(void)'::`2'::M
 
 ?L@?2??M@0?2??0@YAHXZ@QEAAHXZ@4HA
-; CHECK: int `int __cdecl `int __cdecl L(void)'::`3'::L::M(void)'::`3'::L
+; CHECK: int `public: int __cdecl `int __cdecl L(void)'::`3'::L::M(void)'::`3'::L
 
 ?M@?2??0L@?2??1@YAHXZ@QEAAHXZ@4HA
-; CHECK: int `int __cdecl `int __cdecl L(void)'::`3'::L::M(void)'::`3'::M
+; CHECK: int `public: int __cdecl `int __cdecl L(void)'::`3'::L::M(void)'::`3'::M
 
 ; Function local scopes of template functions
 ?M@?1???$L@H@@YAHXZ@4HA
@@ -95,16 +95,16 @@
 ; CHECK: int __cdecl NS::NS<int>::SN(void)
 
 ?NS@?1??SN@?$NS@H@0@QEAAHXZ@4HA
-; CHECK: int `int __cdecl NS::NS<int>::SN(void)'::`2'::NS
+; CHECK: int `public: int __cdecl NS::NS<int>::SN(void)'::`2'::NS
 
 ?SN@?1??0?$NS@H@NS@@QEAAHXZ@4HA
-; CHECK: int `int __cdecl NS::NS<int>::SN(void)'::`2'::SN
+; CHECK: int `public: int __cdecl NS::NS<int>::SN(void)'::`2'::SN
 
 ?NS@?1??SN@?$NS@H@10@QEAAHXZ@4HA
-; CHECK: int `int __cdecl NS::SN::NS<int>::SN(void)'::`2'::NS
+; CHECK: int `public: int __cdecl NS::SN::NS<int>::SN(void)'::`2'::NS
 
 ?SN@?1??0?$NS@H@0NS@@QEAAHXZ@4HA
-; CHECK: int `int __cdecl NS::SN::NS<int>::SN(void)'::`2'::SN
+; CHECK: int `public: int __cdecl NS::SN::NS<int>::SN(void)'::`2'::SN
 
 ; Make sure instantiated templates participate in back-referencing.
 ; In the next 3 examples there should be 3 back-references:
@@ -137,10 +137,10 @@
 ; } } } } }
 
 ?C@?1??B@?$C@H@0101A@@QEAAHXZ@4U201013@A
-; CHECK: struct A::B::C::B::C::C<int> `int __cdecl A::B::C::B::C::C<int>::B(void)'::`2'::C
+; CHECK: struct A::B::C::B::C::C<int> `public: int __cdecl A::B::C::B::C::C<int>::B(void)'::`2'::C
 
 ?B@?1??0?$C@H@C@020A@@QEAAHXZ@4HA
-; CHECK: int `int __cdecl A::B::C::B::C::C<int>::B(void)'::`2'::B
+; CHECK: int `public: int __cdecl A::B::C::B::C::C<int>::B(void)'::`2'::B
 
 ?A@?1??B@?$C@H@C@1310@QEAAHXZ@4HA
-; CHECK: int `int __cdecl A::B::C::B::C::C<int>::B(void)'::`2'::A
+; CHECK: int `public: int __cdecl A::B::C::B::C::C<int>::B(void)'::`2'::A
diff --git a/test/Demangle/ms-operators.test b/test/Demangle/ms-operators.test
index ad3d87807b4dab90cd515f7fc208931b2fa3da60..d18c2794d2d41ebc4d4d633008d81ec60a2abe0d 100644
--- a/test/Demangle/ms-operators.test
+++ b/test/Demangle/ms-operators.test
@@ -153,7 +153,7 @@
 ; CHECK: `struct S & __cdecl getS(void)'::`2'::`local static guard'{2}
 
 ??_C@_02PCEFGMJL@hi?$AA@
-; CHECK: const char * {"hi"}
+; CHECK: "hi"
 
 ??_DDiamond@@QEAAXXZ
 ; CHECK: void __cdecl Diamond::`vbase dtor'(void)
@@ -162,7 +162,7 @@
 ; CHECK: virtual void * __cdecl Base::`vector deleting dtor'(unsigned int)
 
 ??_EBase@@G3AEPAXI@Z
-; CHECK: [thunk]: void * __thiscall Base::`vector deleting dtor'`adjustor{4}'(unsigned int)
+; CHECK: [thunk]: private: void * __thiscall Base::`vector deleting dtor'`adjustor{4}'(unsigned int)
 
 ??_F?$SomeTemplate@H@@QAEXXZ
 ; CHECK: void __thiscall SomeTemplate<int>::`default ctor closure'(void)
@@ -228,7 +228,7 @@
 ; CHECK: void __cdecl `dynamic atexit destructor for 'Foo''(void)
 
 ??__F_decisionToDFA@XPathLexer@@0V?$vector@VDFA@dfa@antlr4@@V?$allocator@VDFA@dfa@antlr4@@@std@@@std@@A@YAXXZ
-; CHECK: void __cdecl `dynamic atexit destructor for `static class std::vector<class antlr4::dfa::DFA, class std::allocator<class antlr4::dfa::DFA>> XPathLexer::_decisionToDFA''(void)
+; CHECK: void __cdecl `dynamic atexit destructor for `private: static class std::vector<class antlr4::dfa::DFA, class std::allocator<class antlr4::dfa::DFA>> XPathLexer::_decisionToDFA''(void)
 
 ??__K_deg@@YAHO@Z
 ; CHECK: int __cdecl operator ""_deg(long double)
diff --git a/test/Demangle/ms-string-literals.test b/test/Demangle/ms-string-literals.test
index 132637c267d7a554c1ad165143bc61def8f59285..1e634099124b3d46159fbf9ef7affe4f14676458 100644
--- a/test/Demangle/ms-string-literals.test
+++ b/test/Demangle/ms-string-literals.test
@@ -259,268 +259,268 @@
 ??_C@_01KHDNNMEB@?$AB?$AA@
 ??_C@_01LOCGONAA@?$AA?$AA@
 
-; CHECK: const char * {"\xFF"}
-; CHECK: const char * {"\xFE"}
-; CHECK: const char * {"\xFD"}
-; CHECK: const char * {"\xFC"}
-; CHECK: const char * {"\xFB"}
-; CHECK: const char * {"\xFA"}
-; CHECK: const char * {"\xF9"}
-; CHECK: const char * {"\xF8"}
-; CHECK: const char * {"\xF7"}
-; CHECK: const char * {"\xF6"}
-; CHECK: const char * {"\xF5"}
-; CHECK: const char * {"\xF4"}
-; CHECK: const char * {"\xF3"}
-; CHECK: const char * {"\xF2"}
-; CHECK: const char * {"\xF1"}
-; CHECK: const char * {"\xF0"}
-; CHECK: const char * {"\xEF"}
-; CHECK: const char * {"\xEE"}
-; CHECK: const char * {"\xED"}
-; CHECK: const char * {"\xEC"}
-; CHECK: const char * {"\xEB"}
-; CHECK: const char * {"\xEA"}
-; CHECK: const char * {"\xE9"}
-; CHECK: const char * {"\xE8"}
-; CHECK: const char * {"\xE7"}
-; CHECK: const char * {"\xE6"}
-; CHECK: const char * {"\xE5"}
-; CHECK: const char * {"\xE4"}
-; CHECK: const char * {"\xE3"}
-; CHECK: const char * {"\xE2"}
-; CHECK: const char * {"\xE1"}
-; CHECK: const char * {"\xE0"}
-; CHECK: const char * {"\xDF"}
-; CHECK: const char * {"\xDE"}
-; CHECK: const char * {"\xDD"}
-; CHECK: const char * {"\xDC"}
-; CHECK: const char * {"\xDB"}
-; CHECK: const char * {"\xDA"}
-; CHECK: const char * {"\xD9"}
-; CHECK: const char * {"\xD8"}
-; CHECK: const char * {"\xD7"}
-; CHECK: const char * {"\xD6"}
-; CHECK: const char * {"\xD5"}
-; CHECK: const char * {"\xD4"}
-; CHECK: const char * {"\xD3"}
-; CHECK: const char * {"\xD2"}
-; CHECK: const char * {"\xD1"}
-; CHECK: const char * {"\xD0"}
-; CHECK: const char * {"\xCF"}
-; CHECK: const char * {"\xCE"}
-; CHECK: const char * {"\xCD"}
-; CHECK: const char * {"\xCC"}
-; CHECK: const char * {"\xCB"}
-; CHECK: const char * {"\xCA"}
-; CHECK: const char * {"\xC9"}
-; CHECK: const char * {"\xC8"}
-; CHECK: const char * {"\xC7"}
-; CHECK: const char * {"\xC6"}
-; CHECK: const char * {"\xC5"}
-; CHECK: const char * {"\xC4"}
-; CHECK: const char * {"\xC3"}
-; CHECK: const char * {"\xC2"}
-; CHECK: const char * {"\xC1"}
-; CHECK: const char * {"\xC0"}
-; CHECK: const char * {"\xBF"}
-; CHECK: const char * {"\xBE"}
-; CHECK: const char * {"\xBD"}
-; CHECK: const char * {"\xBC"}
-; CHECK: const char * {"\xBB"}
-; CHECK: const char * {"\xBA"}
-; CHECK: const char * {"\xB9"}
-; CHECK: const char * {"\xB8"}
-; CHECK: const char * {"\xB7"}
-; CHECK: const char * {"\xB6"}
-; CHECK: const char * {"\xB5"}
-; CHECK: const char * {"\xB4"}
-; CHECK: const char * {"\xB3"}
-; CHECK: const char * {"\xB2"}
-; CHECK: const char * {"\xB1"}
-; CHECK: const char * {"\xB0"}
-; CHECK: const char * {"\xAF"}
-; CHECK: const char * {"\xAE"}
-; CHECK: const char * {"\xAD"}
-; CHECK: const char * {"\xAC"}
-; CHECK: const char * {"\xAB"}
-; CHECK: const char * {"\xAA"}
-; CHECK: const char * {"\xA9"}
-; CHECK: const char * {"\xA8"}
-; CHECK: const char * {"\xA7"}
-; CHECK: const char * {"\xA6"}
-; CHECK: const char * {"\xA5"}
-; CHECK: const char * {"\xA4"}
-; CHECK: const char * {"\xA3"}
-; CHECK: const char * {"\xA2"}
-; CHECK: const char * {"\xA1"}
-; CHECK: const char * {"\xA0"}
-; CHECK: const char * {"\x9F"}
-; CHECK: const char * {"\x9E"}
-; CHECK: const char * {"\x9D"}
-; CHECK: const char * {"\x9C"}
-; CHECK: const char * {"\x9B"}
-; CHECK: const char * {"\x9A"}
-; CHECK: const char * {"\x99"}
-; CHECK: const char * {"\x98"}
-; CHECK: const char * {"\x97"}
-; CHECK: const char * {"\x96"}
-; CHECK: const char * {"\x95"}
-; CHECK: const char * {"\x94"}
-; CHECK: const char * {"\x93"}
-; CHECK: const char * {"\x92"}
-; CHECK: const char * {"\x91"}
-; CHECK: const char * {"\x90"}
-; CHECK: const char * {"\x8F"}
-; CHECK: const char * {"\x8E"}
-; CHECK: const char * {"\x8D"}
-; CHECK: const char * {"\x8C"}
-; CHECK: const char * {"\x8B"}
-; CHECK: const char * {"\x8A"}
-; CHECK: const char * {"\x89"}
-; CHECK: const char * {"\x88"}
-; CHECK: const char * {"\x87"}
-; CHECK: const char * {"\x86"}
-; CHECK: const char * {"\x85"}
-; CHECK: const char * {"\x84"}
-; CHECK: const char * {"\x83"}
-; CHECK: const char * {"\x82"}
-; CHECK: const char * {"\x81"}
-; CHECK: const char * {"\x80"}
-; CHECK: const char * {"\x7F"}
-; CHECK: const char * {"~"}
-; CHECK: const char * {"}"}
-; CHECK: const char * {"|"}
-; CHECK: const char * {"{"}
-; CHECK: const char * {"z"}
-; CHECK: const char * {"y"}
-; CHECK: const char * {"x"}
-; CHECK: const char * {"w"}
-; CHECK: const char * {"v"}
-; CHECK: const char * {"u"}
-; CHECK: const char * {"t"}
-; CHECK: const char * {"s"}
-; CHECK: const char * {"r"}
-; CHECK: const char * {"q"}
-; CHECK: const char * {"p"}
-; CHECK: const char * {"o"}
-; CHECK: const char * {"n"}
-; CHECK: const char * {"m"}
-; CHECK: const char * {"l"}
-; CHECK: const char * {"k"}
-; CHECK: const char * {"j"}
-; CHECK: const char * {"i"}
-; CHECK: const char * {"h"}
-; CHECK: const char * {"g"}
-; CHECK: const char * {"f"}
-; CHECK: const char * {"e"}
-; CHECK: const char * {"d"}
-; CHECK: const char * {"c"}
-; CHECK: const char * {"b"}
-; CHECK: const char * {"a"}
-; CHECK: const char * {"`"}
-; CHECK: const char * {"_"}
-; CHECK: const char * {"^"}
-; CHECK: const char * {"]"}
-; CHECK: const char * {"\\"}
-; CHECK: const char * {"["}
-; CHECK: const char * {"Z"}
-; CHECK: const char * {"Y"}
-; CHECK: const char * {"X"}
-; CHECK: const char * {"W"}
-; CHECK: const char * {"V"}
-; CHECK: const char * {"U"}
-; CHECK: const char * {"T"}
-; CHECK: const char * {"S"}
-; CHECK: const char * {"R"}
-; CHECK: const char * {"Q"}
-; CHECK: const char * {"P"}
-; CHECK: const char * {"O"}
-; CHECK: const char * {"N"}
-; CHECK: const char * {"M"}
-; CHECK: const char * {"L"}
-; CHECK: const char * {"K"}
-; CHECK: const char * {"J"}
-; CHECK: const char * {"I"}
-; CHECK: const char * {"H"}
-; CHECK: const char * {"G"}
-; CHECK: const char * {"F"}
-; CHECK: const char * {"E"}
-; CHECK: const char * {"D"}
-; CHECK: const char * {"C"}
-; CHECK: const char * {"B"}
-; CHECK: const char * {"A"}
-; CHECK: const char * {"@"}
-; CHECK: const char * {"?"}
-; CHECK: const char * {">"}
-; CHECK: const char * {"="}
-; CHECK: const char * {"<"}
-; CHECK: const char * {";"}
-; CHECK: const char * {":"}
-; CHECK: const char * {"9"}
-; CHECK: const char * {"8"}
-; CHECK: const char * {"7"}
-; CHECK: const char * {"6"}
-; CHECK: const char * {"5"}
-; CHECK: const char * {"4"}
-; CHECK: const char * {"3"}
-; CHECK: const char * {"2"}
-; CHECK: const char * {"1"}
-; CHECK: const char * {"0"}
-; CHECK: const char * {"/"}
-; CHECK: const char * {"."}
-; CHECK: const char * {"-"}
-; CHECK: const char * {","}
-; CHECK: const char * {"+"}
-; CHECK: const char * {"*"}
-; CHECK: const char * {")"}
-; CHECK: const char * {"("}
-; CHECK: const char * {"\'"}
-; CHECK: const char * {"&"}
-; CHECK: const char * {"%"}
-; CHECK: const char * {"$"}
-; CHECK: const char * {"#"}
-; CHECK: const char * {"\""}
-; CHECK: const char * {"!"}
-; CHECK: const char * {" "}
-; CHECK: const char * {"\x1F"}
-; CHECK: const char * {"\x1E"}
-; CHECK: const char * {"\x1D"}
-; CHECK: const char * {"\x1C"}
-; CHECK: const char * {"\x1B"}
-; CHECK: const char * {"\x1A"}
-; CHECK: const char * {"\x19"}
-; CHECK: const char * {"\x18"}
-; CHECK: const char * {"\x17"}
-; CHECK: const char * {"\x16"}
-; CHECK: const char * {"\x15"}
-; CHECK: const char * {"\x14"}
-; CHECK: const char * {"\x13"}
-; CHECK: const char * {"\x12"}
-; CHECK: const char * {"\x11"}
-; CHECK: const char * {"\x10"}
-; CHECK: const char * {"\x0F"}
-; CHECK: const char * {"\x0E"}
-; CHECK: const char * {"\r"}
-; CHECK: const char * {"\f"}
-; CHECK: const char * {"\v"}
-; CHECK: const char * {"\n"}
-; CHECK: const char * {"\t"}
-; CHECK: const char * {"\b"}
-; CHECK: const char * {"\a"}
-; CHECK: const char * {"\x06"}
-; CHECK: const char * {"\x05"}
-; CHECK: const char * {"\x04"}
-; CHECK: const char * {"\x03"}
-; CHECK: const char * {"\x02"}
-; CHECK: const char * {"\x01"}
+; CHECK: "\xFF"
+; CHECK: "\xFE"
+; CHECK: "\xFD"
+; CHECK: "\xFC"
+; CHECK: "\xFB"
+; CHECK: "\xFA"
+; CHECK: "\xF9"
+; CHECK: "\xF8"
+; CHECK: "\xF7"
+; CHECK: "\xF6"
+; CHECK: "\xF5"
+; CHECK: "\xF4"
+; CHECK: "\xF3"
+; CHECK: "\xF2"
+; CHECK: "\xF1"
+; CHECK: "\xF0"
+; CHECK: "\xEF"
+; CHECK: "\xEE"
+; CHECK: "\xED"
+; CHECK: "\xEC"
+; CHECK: "\xEB"
+; CHECK: "\xEA"
+; CHECK: "\xE9"
+; CHECK: "\xE8"
+; CHECK: "\xE7"
+; CHECK: "\xE6"
+; CHECK: "\xE5"
+; CHECK: "\xE4"
+; CHECK: "\xE3"
+; CHECK: "\xE2"
+; CHECK: "\xE1"
+; CHECK: "\xE0"
+; CHECK: "\xDF"
+; CHECK: "\xDE"
+; CHECK: "\xDD"
+; CHECK: "\xDC"
+; CHECK: "\xDB"
+; CHECK: "\xDA"
+; CHECK: "\xD9"
+; CHECK: "\xD8"
+; CHECK: "\xD7"
+; CHECK: "\xD6"
+; CHECK: "\xD5"
+; CHECK: "\xD4"
+; CHECK: "\xD3"
+; CHECK: "\xD2"
+; CHECK: "\xD1"
+; CHECK: "\xD0"
+; CHECK: "\xCF"
+; CHECK: "\xCE"
+; CHECK: "\xCD"
+; CHECK: "\xCC"
+; CHECK: "\xCB"
+; CHECK: "\xCA"
+; CHECK: "\xC9"
+; CHECK: "\xC8"
+; CHECK: "\xC7"
+; CHECK: "\xC6"
+; CHECK: "\xC5"
+; CHECK: "\xC4"
+; CHECK: "\xC3"
+; CHECK: "\xC2"
+; CHECK: "\xC1"
+; CHECK: "\xC0"
+; CHECK: "\xBF"
+; CHECK: "\xBE"
+; CHECK: "\xBD"
+; CHECK: "\xBC"
+; CHECK: "\xBB"
+; CHECK: "\xBA"
+; CHECK: "\xB9"
+; CHECK: "\xB8"
+; CHECK: "\xB7"
+; CHECK: "\xB6"
+; CHECK: "\xB5"
+; CHECK: "\xB4"
+; CHECK: "\xB3"
+; CHECK: "\xB2"
+; CHECK: "\xB1"
+; CHECK: "\xB0"
+; CHECK: "\xAF"
+; CHECK: "\xAE"
+; CHECK: "\xAD"
+; CHECK: "\xAC"
+; CHECK: "\xAB"
+; CHECK: "\xAA"
+; CHECK: "\xA9"
+; CHECK: "\xA8"
+; CHECK: "\xA7"
+; CHECK: "\xA6"
+; CHECK: "\xA5"
+; CHECK: "\xA4"
+; CHECK: "\xA3"
+; CHECK: "\xA2"
+; CHECK: "\xA1"
+; CHECK: "\xA0"
+; CHECK: "\x9F"
+; CHECK: "\x9E"
+; CHECK: "\x9D"
+; CHECK: "\x9C"
+; CHECK: "\x9B"
+; CHECK: "\x9A"
+; CHECK: "\x99"
+; CHECK: "\x98"
+; CHECK: "\x97"
+; CHECK: "\x96"
+; CHECK: "\x95"
+; CHECK: "\x94"
+; CHECK: "\x93"
+; CHECK: "\x92"
+; CHECK: "\x91"
+; CHECK: "\x90"
+; CHECK: "\x8F"
+; CHECK: "\x8E"
+; CHECK: "\x8D"
+; CHECK: "\x8C"
+; CHECK: "\x8B"
+; CHECK: "\x8A"
+; CHECK: "\x89"
+; CHECK: "\x88"
+; CHECK: "\x87"
+; CHECK: "\x86"
+; CHECK: "\x85"
+; CHECK: "\x84"
+; CHECK: "\x83"
+; CHECK: "\x82"
+; CHECK: "\x81"
+; CHECK: "\x80"
+; CHECK: "\x7F"
+; CHECK: "~"
+; CHECK: "}"
+; CHECK: "|"
+; CHECK: "{"
+; CHECK: "z"
+; CHECK: "y"
+; CHECK: "x"
+; CHECK: "w"
+; CHECK: "v"
+; CHECK: "u"
+; CHECK: "t"
+; CHECK: "s"
+; CHECK: "r"
+; CHECK: "q"
+; CHECK: "p"
+; CHECK: "o"
+; CHECK: "n"
+; CHECK: "m"
+; CHECK: "l"
+; CHECK: "k"
+; CHECK: "j"
+; CHECK: "i"
+; CHECK: "h"
+; CHECK: "g"
+; CHECK: "f"
+; CHECK: "e"
+; CHECK: "d"
+; CHECK: "c"
+; CHECK: "b"
+; CHECK: "a"
+; CHECK: "`"
+; CHECK: "_"
+; CHECK: "^"
+; CHECK: "]"
+; CHECK: "\\"
+; CHECK: "["
+; CHECK: "Z"
+; CHECK: "Y"
+; CHECK: "X"
+; CHECK: "W"
+; CHECK: "V"
+; CHECK: "U"
+; CHECK: "T"
+; CHECK: "S"
+; CHECK: "R"
+; CHECK: "Q"
+; CHECK: "P"
+; CHECK: "O"
+; CHECK: "N"
+; CHECK: "M"
+; CHECK: "L"
+; CHECK: "K"
+; CHECK: "J"
+; CHECK: "I"
+; CHECK: "H"
+; CHECK: "G"
+; CHECK: "F"
+; CHECK: "E"
+; CHECK: "D"
+; CHECK: "C"
+; CHECK: "B"
+; CHECK: "A"
+; CHECK: "@"
+; CHECK: "?"
+; CHECK: ">"
+; CHECK: "="
+; CHECK: "<"
+; CHECK: ";"
+; CHECK: ":"
+; CHECK: "9"
+; CHECK: "8"
+; CHECK: "7"
+; CHECK: "6"
+; CHECK: "5"
+; CHECK: "4"
+; CHECK: "3"
+; CHECK: "2"
+; CHECK: "1"
+; CHECK: "0"
+; CHECK: "/"
+; CHECK: "."
+; CHECK: "-"
+; CHECK: ","
+; CHECK: "+"
+; CHECK: "*"
+; CHECK: ")"
+; CHECK: "("
+; CHECK: "\'"
+; CHECK: "&"
+; CHECK: "%"
+; CHECK: "$"
+; CHECK: "#"
+; CHECK: "\""
+; CHECK: "!"
+; CHECK: " "
+; CHECK: "\x1F"
+; CHECK: "\x1E"
+; CHECK: "\x1D"
+; CHECK: "\x1C"
+; CHECK: "\x1B"
+; CHECK: "\x1A"
+; CHECK: "\x19"
+; CHECK: "\x18"
+; CHECK: "\x17"
+; CHECK: "\x16"
+; CHECK: "\x15"
+; CHECK: "\x14"
+; CHECK: "\x13"
+; CHECK: "\x12"
+; CHECK: "\x11"
+; CHECK: "\x10"
+; CHECK: "\x0F"
+; CHECK: "\x0E"
+; CHECK: "\r"
+; CHECK: "\f"
+; CHECK: "\v"
+; CHECK: "\n"
+; CHECK: "\t"
+; CHECK: "\b"
+; CHECK: "\a"
+; CHECK: "\x06"
+; CHECK: "\x05"
+; CHECK: "\x04"
+; CHECK: "\x03"
+; CHECK: "\x02"
+; CHECK: "\x01"
 
 ; The mangling doesn't distinguish between char and char16 types, so even though
-; this was originally written as a char * with one embedded null, it mangles
-; identically to a char16_t * that is empty.  So when demangling, we choose the
+; this was originally written as a char[] with one embedded null, it mangles
+; identically to a char16_t[] that is empty.  So when demangling, we choose the
 ; "smartest" one, which happened to be wrong, but it's still a "better"
 ; demangling.
-; CHECK: const char16_t * {u""}
+; CHECK: u""
 
 
 ??_C@_13KDLDGPGJ@?$AA?7?$AA?$AA@
@@ -622,152 +622,152 @@
 ??_C@_13EHOOFIKC@?$AA?$HN?$AA?$AA@
 ??_C@_13FFFLPHEM@?$AA?$HO?$AA?$AA@
 
-; CHECK: const wchar_t * {L"\t"}
-; CHECK: const wchar_t * {L"\n"}
-; CHECK: const wchar_t * {L"\v"}
-; CHECK: const wchar_t * {L" "}
-; CHECK: const wchar_t * {L"!"}
-; CHECK: const wchar_t * {L"\""}
-; CHECK: const wchar_t * {L"#"}
-; CHECK: const wchar_t * {L"$"}
-; CHECK: const wchar_t * {L"%"}
-; CHECK: const wchar_t * {L"&"}
-; CHECK: const wchar_t * {L"\'"}
-; CHECK: const wchar_t * {L"("}
-; CHECK: const wchar_t * {L")"}
-; CHECK: const wchar_t * {L"*"}
-; CHECK: const wchar_t * {L"+"}
-; CHECK: const wchar_t * {L","}
-; CHECK: const wchar_t * {L"-"}
-; CHECK: const wchar_t * {L"."}
-; CHECK: const wchar_t * {L"/"}
-; CHECK: const wchar_t * {L"0"}
-; CHECK: const wchar_t * {L"1"}
-; CHECK: const wchar_t * {L"2"}
-; CHECK: const wchar_t * {L"3"}
-; CHECK: const wchar_t * {L"4"}
-; CHECK: const wchar_t * {L"5"}
-; CHECK: const wchar_t * {L"6"}
-; CHECK: const wchar_t * {L"7"}
-; CHECK: const wchar_t * {L"8"}
-; CHECK: const wchar_t * {L"9"}
-; CHECK: const wchar_t * {L":"}
-; CHECK: const wchar_t * {L";"}
-; CHECK: const wchar_t * {L"<"}
-; CHECK: const wchar_t * {L"="}
-; CHECK: const wchar_t * {L">"}
-; CHECK: const wchar_t * {L"?"}
-; CHECK: const wchar_t * {L"@"}
-; CHECK: const wchar_t * {L"A"}
-; CHECK: const wchar_t * {L"B"}
-; CHECK: const wchar_t * {L"C"}
-; CHECK: const wchar_t * {L"D"}
-; CHECK: const wchar_t * {L"E"}
-; CHECK: const wchar_t * {L"F"}
-; CHECK: const wchar_t * {L"G"}
-; CHECK: const wchar_t * {L"H"}
-; CHECK: const wchar_t * {L"I"}
-; CHECK: const wchar_t * {L"J"}
-; CHECK: const wchar_t * {L"K"}
-; CHECK: const wchar_t * {L"L"}
-; CHECK: const wchar_t * {L"M"}
-; CHECK: const wchar_t * {L"N"}
-; CHECK: const wchar_t * {L"O"}
-; CHECK: const wchar_t * {L"P"}
-; CHECK: const wchar_t * {L"Q"}
-; CHECK: const wchar_t * {L"R"}
-; CHECK: const wchar_t * {L"S"}
-; CHECK: const wchar_t * {L"T"}
-; CHECK: const wchar_t * {L"U"}
-; CHECK: const wchar_t * {L"V"}
-; CHECK: const wchar_t * {L"W"}
-; CHECK: const wchar_t * {L"X"}
-; CHECK: const wchar_t * {L"Y"}
-; CHECK: const wchar_t * {L"Z"}
-; CHECK: const wchar_t * {L"["}
-; CHECK: const wchar_t * {L"\\"}
-; CHECK: const wchar_t * {L"]"}
-; CHECK: const wchar_t * {L"^"}
-; CHECK: const wchar_t * {L"_"}
-; CHECK: const wchar_t * {L"`"}
-; CHECK: const wchar_t * {L"a"}
-; CHECK: const wchar_t * {L"b"}
-; CHECK: const wchar_t * {L"c"}
-; CHECK: const wchar_t * {L"d"}
-; CHECK: const wchar_t * {L"e"}
-; CHECK: const wchar_t * {L"f"}
-; CHECK: const wchar_t * {L"g"}
-; CHECK: const wchar_t * {L"h"}
-; CHECK: const wchar_t * {L"i"}
-; CHECK: const wchar_t * {L"j"}
-; CHECK: const wchar_t * {L"k"}
-; CHECK: const wchar_t * {L"l"}
-; CHECK: const wchar_t * {L"m"}
-; CHECK: const wchar_t * {L"n"}
-; CHECK: const wchar_t * {L"o"}
-; CHECK: const wchar_t * {L"p"}
-; CHECK: const wchar_t * {L"q"}
-; CHECK: const wchar_t * {L"r"}
-; CHECK: const wchar_t * {L"s"}
-; CHECK: const wchar_t * {L"t"}
-; CHECK: const wchar_t * {L"u"}
-; CHECK: const wchar_t * {L"v"}
-; CHECK: const wchar_t * {L"w"}
-; CHECK: const wchar_t * {L"x"}
-; CHECK: const wchar_t * {L"y"}
-; CHECK: const wchar_t * {L"z"}
-; CHECK: const wchar_t * {L"{"}
-; CHECK: const wchar_t * {L"|"}
-; CHECK: const wchar_t * {L"}"}
-; CHECK: const wchar_t * {L"~"}
+; CHECK: L"\t"
+; CHECK: L"\n"
+; CHECK: L"\v"
+; CHECK: L" "
+; CHECK: L"!"
+; CHECK: L"\""
+; CHECK: L"#"
+; CHECK: L"$"
+; CHECK: L"%"
+; CHECK: L"&"
+; CHECK: L"\'"
+; CHECK: L"("
+; CHECK: L")"
+; CHECK: L"*"
+; CHECK: L"+"
+; CHECK: L","
+; CHECK: L"-"
+; CHECK: L"."
+; CHECK: L"/"
+; CHECK: L"0"
+; CHECK: L"1"
+; CHECK: L"2"
+; CHECK: L"3"
+; CHECK: L"4"
+; CHECK: L"5"
+; CHECK: L"6"
+; CHECK: L"7"
+; CHECK: L"8"
+; CHECK: L"9"
+; CHECK: L":"
+; CHECK: L";"
+; CHECK: L"<"
+; CHECK: L"="
+; CHECK: L">"
+; CHECK: L"?"
+; CHECK: L"@"
+; CHECK: L"A"
+; CHECK: L"B"
+; CHECK: L"C"
+; CHECK: L"D"
+; CHECK: L"E"
+; CHECK: L"F"
+; CHECK: L"G"
+; CHECK: L"H"
+; CHECK: L"I"
+; CHECK: L"J"
+; CHECK: L"K"
+; CHECK: L"L"
+; CHECK: L"M"
+; CHECK: L"N"
+; CHECK: L"O"
+; CHECK: L"P"
+; CHECK: L"Q"
+; CHECK: L"R"
+; CHECK: L"S"
+; CHECK: L"T"
+; CHECK: L"U"
+; CHECK: L"V"
+; CHECK: L"W"
+; CHECK: L"X"
+; CHECK: L"Y"
+; CHECK: L"Z"
+; CHECK: L"["
+; CHECK: L"\\"
+; CHECK: L"]"
+; CHECK: L"^"
+; CHECK: L"_"
+; CHECK: L"`"
+; CHECK: L"a"
+; CHECK: L"b"
+; CHECK: L"c"
+; CHECK: L"d"
+; CHECK: L"e"
+; CHECK: L"f"
+; CHECK: L"g"
+; CHECK: L"h"
+; CHECK: L"i"
+; CHECK: L"j"
+; CHECK: L"k"
+; CHECK: L"l"
+; CHECK: L"m"
+; CHECK: L"n"
+; CHECK: L"o"
+; CHECK: L"p"
+; CHECK: L"q"
+; CHECK: L"r"
+; CHECK: L"s"
+; CHECK: L"t"
+; CHECK: L"u"
+; CHECK: L"v"
+; CHECK: L"w"
+; CHECK: L"x"
+; CHECK: L"y"
+; CHECK: L"z"
+; CHECK: L"{"
+; CHECK: L"|"
+; CHECK: L"}"
+; CHECK: L"~"
 
 
 
 ??_C@_0CF@LABBIIMO@012345678901234567890123456789AB@
-; CHECK: const char * {"012345678901234567890123456789AB"...}
+; CHECK: "012345678901234567890123456789AB"...
 
 ??_C@_1EK@KFPEBLPK@?$AA0?$AA1?$AA2?$AA3?$AA4?$AA5?$AA6?$AA7?$AA8?$AA9?$AA0?$AA1?$AA2?$AA3?$AA4?$AA5?$AA6?$AA7?$AA8?$AA9?$AA0?$AA1?$AA2?$AA3?$AA4?$AA5?$AA6?$AA7?$AA8?$AA9?$AAA?$AAB@
-; CHECK: const wchar_t * {L"012345678901234567890123456789AB"...}
+; CHECK: L"012345678901234567890123456789AB"...
 
 ??_C@_13IIHIAFKH@?W?$PP?$AA?$AA@
-; CHECK: const wchar_t * {L"\xD7\xFF"}
+; CHECK: L"\xD7\xFF"
 
 ??_C@_02PCEFGMJL@hi?$AA@
-; CHECK: const char * {"hi"}
+; CHECK: "hi"
 
 ??_C@_05OMLEGLOC@h?$AAi?$AA?$AA?$AA@
-; CHECK: const char16_t * {u"hi"}
+; CHECK: u"hi"
 
 ??_C@_0EK@FEAOBHPP@o?$AA1?$AA2?$AA3?$AA4?$AA5?$AA6?$AA7?$AA8?$AA9?$AA0?$AA1?$AA2?$AA3?$AA4?$AA5?$AA@
-; CHECK: const char16_t * {u"o123456789012345"...}
+; CHECK: u"o123456789012345"...
 
 ??_C@_0M@GFNAJIPG@h?$AA?$AA?$AAi?$AA?$AA?$AA?$AA?$AA?$AA?$AA@
-; CHECK: const char32_t * {U"hi"}
+; CHECK: U"hi"
 
 ??_C@_0JE@IMHFEDAA@0?$AA?$AA?$AA1?$AA?$AA?$AA2?$AA?$AA?$AA3?$AA?$AA?$AA4?$AA?$AA?$AA5?$AA?$AA?$AA6?$AA?$AA?$AA7?$AA?$AA?$AA@
-; CHECK: const char32_t * {U"01234567"...}
+; CHECK: U"01234567"...
 
 ; These all have just the right length that the trailing 0 just fits.
 ??_C@_0CA@NMANGEKF@012345678901234567890123456789A?$AA@
-; CHECK: const char * {"012345678901234567890123456789A"}
+; CHECK: "012345678901234567890123456789A"
 
 ??_C@_1EA@LJAFPILO@?$AA0?$AA1?$AA2?$AA3?$AA4?$AA5?$AA6?$AA7?$AA8?$AA9?$AA0?$AA1?$AA2?$AA3?$AA4?$AA5?$AA6?$AA7?$AA8?$AA9?$AA0?$AA1?$AA2?$AA3?$AA4?$AA5?$AA6?$AA7?$AA8?$AA9?$AAA?$AA?$AA@
-; CHECK: const wchar_t * {L"012345678901234567890123456789A"}
+; CHECK: L"012345678901234567890123456789A"
 
 ??_C@_0CA@NMANGEKF@012345678901234567890123456789A?$AA@
-; CHECK: const char * {"012345678901234567890123456789A"}
+; CHECK: "012345678901234567890123456789A"
 
 ??_C@_0CA@NFEFHIFO@0?$AA1?$AA2?$AA3?$AA4?$AA5?$AA6?$AA7?$AA8?$AA9?$AA0?$AA1?$AA2?$AA3?$AA4?$AA?$AA?$AA@
-; CHECK: const char16_t * {u"012345678901234"}
+; CHECK: u"012345678901234"
 
 ??_C@_0CA@KFPHPCC@0?$AA?$AA?$AA1?$AA?$AA?$AA2?$AA?$AA?$AA3?$AA?$AA?$AA4?$AA?$AA?$AA5?$AA?$AA?$AA6?$AA?$AA?$AA?$AA?$AA?$AA?$AA@
-; CHECK: const char32_t * {U"0123456"}
+; CHECK: U"0123456"
 
 ; There are too many bytes encoded in this string literal (it should encode a max of 32 bytes)
 ; but some buggy compilers will incorrectly generate this, so we need to be able to demangle
 ; both the correct and incorrect versions.
 ??_C@_0CG@HJGBPLNO@l?$AAo?$AAo?$AAk?$AAA?$AAh?$AAe?$AAa?$AAd?$AAH?$AAa?$AAr?$AAd?$AAB?$AAr?$AAe?$AAa?$AAk?$AA?$AA?$AA@
-; CHECK: const char16_t * {u"lookAheadHardBreak"}
+; CHECK: u"lookAheadHardBreak"
 
 ??_C@_0CG@HJGBPLNO@l?$AAo?$AAo?$AAk?$AAA?$AAh?$AAe?$AAa?$AAd?$AAH?$AAa?$AAr?$AAd?$AAB?$AAr?$AAe?$AA@
-; CHECK: const char16_t * {u"lookAheadHardBre"...}
\ No newline at end of file
+; CHECK: u"lookAheadHardBre"...
diff --git a/test/Demangle/ms-template-callback.test b/test/Demangle/ms-template-callback.test
index 88c4493d2bb6eb90fe55791274ff13a3f1b4e0e7..488cd7fdc132727ecd3528db82815df609d7e67a 100644
--- a/test/Demangle/ms-template-callback.test
+++ b/test/Demangle/ms-template-callback.test
@@ -47,7 +47,7 @@
 ; CHECK: void __cdecl WrapFnRef<&void __cdecl VoidFn(void)>(void)
 
 ??$WrapFnPtr@$1?VoidStaticMethod@Thing@@SAXXZ@@YAXXZ
-; CHECK: void __cdecl WrapFnPtr<&static void __cdecl Thing::VoidStaticMethod(void)>(void)
+; CHECK: void __cdecl WrapFnPtr<&public: static void __cdecl Thing::VoidStaticMethod(void)>(void)
 
 ??$WrapFnRef@$1?VoidStaticMethod@Thing@@SAXXZ@@YAXXZ
-; CHECK: void __cdecl WrapFnRef<&static void __cdecl Thing::VoidStaticMethod(void)>(void)
+; CHECK: void __cdecl WrapFnRef<&public: static void __cdecl Thing::VoidStaticMethod(void)>(void)
diff --git a/test/Demangle/ms-templates-memptrs.test b/test/Demangle/ms-templates-memptrs.test
index f9d1d454508e91c65b474b89a9c1ea5c2b9a0a15..16bb653d46c48864d50ec8041f6b9a4fe8c3765e 100644
--- a/test/Demangle/ms-templates-memptrs.test
+++ b/test/Demangle/ms-templates-memptrs.test
@@ -12,7 +12,7 @@
 ; CHECK: void __cdecl CallMethod<struct M, {[thunk]: __thiscall M::`vcall'{0, {flat}}, 0}>(struct M &)
 
 ??$CallMethod@UM@@$H?f@1@QAEXXZA@@@YAXAAUM@@@Z
-; CHECK: void __cdecl CallMethod<struct M, {void __thiscall M::f(void), 0}>(struct M &)
+; CHECK: void __cdecl CallMethod<struct M, {public: void __thiscall M::f(void), 0}>(struct M &)
 
 ??$CallMethod@UO@@$H??_91@$BA@AE3@@YAXAAUO@@@Z
 ; CHECK: void __cdecl CallMethod<struct O, {[thunk]: __thiscall O::`vcall'{0, {flat}}, 4}>(struct O &)
@@ -24,7 +24,7 @@
 ; CHECK: void __cdecl CallMethod<struct S, &[thunk]: __thiscall S::`vcall'{0, {flat}}>(struct S &)
 
 ??$CallMethod@US@@$1?f@1@QAEXXZ@@YAXAAUS@@@Z
-; CHECK: void __cdecl CallMethod<struct S, &void __thiscall S::f(void)>(struct S &)
+; CHECK: void __cdecl CallMethod<struct S, &public: void __thiscall S::f(void)>(struct S &)
 
 ??$CallMethod@UU@@$0A@@@YAXAAUU@@@Z
 ; CHECK: void __cdecl CallMethod<struct U, 0>(struct U &)
@@ -33,7 +33,7 @@
 ; CHECK: void __cdecl CallMethod<struct U, {[thunk]: __thiscall U::`vcall'{0, {flat}}, 0, 0, 0}>(struct U &)
 
 ??$CallMethod@UU@@$J?f@1@QAEXXZA@A@A@@@YAXAAUU@@@Z
-; CHECK: void __cdecl CallMethod<struct U, {void __thiscall U::f(void), 0, 0, 0}>(struct U &)
+; CHECK: void __cdecl CallMethod<struct U, {public: void __thiscall U::f(void), 0, 0, 0}>(struct U &)
 
 ??$CallMethod@UV@@$0A@@@YAXAAUV@@@Z
 ; CHECK: void __cdecl CallMethod<struct V, 0>(struct V &)
@@ -42,7 +42,7 @@
 ; CHECK: void __cdecl CallMethod<struct V, {[thunk]: __thiscall V::`vcall'{0, {flat}}, 0, 0}>(struct V &)
 
 ??$CallMethod@UV@@$I?f@1@QAEXXZA@A@@@YAXAAUV@@@Z
-; CHECK: void __cdecl CallMethod<struct V, {void __thiscall V::f(void), 0, 0}>(struct V &)
+; CHECK: void __cdecl CallMethod<struct V, {public: void __thiscall V::f(void), 0, 0}>(struct V &)
 
 ??$ReadField@UA@@$0?0@@YAHAAUA@@@Z
 ; CHECK: int __cdecl ReadField<struct A, -1>(struct A &)
diff --git a/test/Demangle/ms-templates.test b/test/Demangle/ms-templates.test
index bddc92ffb504639704ca05ce0c01a238473a9505..91e85f412ef5477c00a77dbc4bf179705b3586cb 100644
--- a/test/Demangle/ms-templates.test
+++ b/test/Demangle/ms-templates.test
@@ -195,7 +195,7 @@
 ; CHECK: void __thiscall UUIDType4<&struct __s_GUID const _GUID_12345678_1234_1234_1234_1234567890ab>::bar(void)
 
 ??$f@US@@$1?g@1@QEAAXXZ@@YAXXZ
-; CHECK: void __cdecl f<struct S, &void __cdecl S::g(void)>(void)
+; CHECK: void __cdecl f<struct S, &public: void __cdecl S::g(void)>(void)
 
 ??$?0N@?$Foo@H@@QEAA@N@Z
-; CHECK: __cdecl Foo<int>::Foo<int><double>(double)
\ No newline at end of file
+; CHECK: __cdecl Foo<int>::Foo<int><double>(double)
diff --git a/test/Demangle/ms-thunks.test b/test/Demangle/ms-thunks.test
index 668daa830474c1aee705f7a51734b32ae5250e4b..bb41299fb41832ed9df17a20b008b670d24ae6f8 100644
--- a/test/Demangle/ms-thunks.test
+++ b/test/Demangle/ms-thunks.test
@@ -3,13 +3,13 @@
 ; CHECK-NOT: Invalid mangled name
 
 ?f@C@@WBA@EAAHXZ
-; CHECK: [thunk]: virtual int __cdecl C::f`adjustor{16}'(void)
+; CHECK: [thunk]: public: virtual int __cdecl C::f`adjustor{16}'(void)
 
 ??_EDerived@@$4PPPPPPPM@A@EAAPEAXI@Z
-; CHECK: [thunk]: virtual void * __cdecl Derived::`vector deleting dtor'`vtordisp{-4, 0}'(unsigned int)
+; CHECK: [thunk]: public: virtual void * __cdecl Derived::`vector deleting dtor'`vtordisp{-4, 0}'(unsigned int)
 
 ?f@A@simple@@$R477PPPPPPPM@7AEXXZ
-; CHECK: [thunk]: virtual void __thiscall simple::A::f`vtordispex{8, 8, -4, 8}'(void)
+; CHECK: [thunk]: public: virtual void __thiscall simple::A::f`vtordispex{8, 8, -4, 8}'(void)
 
 ??_9Base@@$B7AA
 ; CHECK: [thunk]: __cdecl Base::`vcall'{8, {flat}}
diff --git a/test/ExecutionEngine/Interpreter/call-no-args.ll b/test/ExecutionEngine/Interpreter/call-no-args.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bd1f906fb04e64346822d6a8ff34b0cf029b1590
--- /dev/null
+++ b/test/ExecutionEngine/Interpreter/call-no-args.ll
@@ -0,0 +1,10 @@
+; RUN: %lli -force-interpreter %s
+
+declare void @exit(i32)
+declare i32 @rand()
+
+define i32 @main() {
+  %ret = call i32 @rand()
+  call void @exit(i32 0)
+  ret i32 %ret
+}
diff --git a/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s b/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
index ac097c44e5fd8431238bbde4c0375e881d6dad17..3522bee81eae6f354a6fbf1910580946db9e55f6 100644
--- a/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
+++ b/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
@@ -24,3 +24,17 @@ inst1:
 # rtdyld-check: decode_operand(inst2, 3) = section_addr(COFF_x86_64_IMGREL.o, .rdata)+5-40960000000000
 inst2:
     mov %ebx, (__constdata@imgrel+5)
+        .data
+        .space 375
+rel1:
+# rtdyld-check: *{4}rel1 = string - section_addr(COFF_x86_64_IMGREL.o, .data)
+	.secrel32 string
+
+# We explicitly add padding to put string outside of the 16bit address space
+# (absolute and as an offset from .data), so that relocations involving
+# 32bit addresses / offsets are not accidentally truncated to 16 bits.
+        .space 65536
+        .global string
+        .align 1
+string:
+        .asciz "Hello World\n"
diff --git a/test/Feature/intrinsics.ll b/test/Feature/intrinsics.ll
index bbf30d3cc23193b725c6eed55b8a3d6d1d7c1a5c..71bb73cde18c42adc87bf221edf20696aa6d4eb8 100644
--- a/test/Feature/intrinsics.ll
+++ b/test/Feature/intrinsics.ll
@@ -70,4 +70,4 @@ define void @trap() {
 }
 
 ; CHECK: attributes #0 = { nounwind readnone speculatable }
-; CHECK: attributes #1 = { noreturn nounwind }
+; CHECK: attributes #1 = { cold noreturn nounwind }
diff --git a/test/FileCheck/check-count.txt b/test/FileCheck/check-count.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1782728949a2bc70a5e2fde61e984cf464a347df
--- /dev/null
+++ b/test/FileCheck/check-count.txt
@@ -0,0 +1,100 @@
+;
+; Basic error checking.
+;
+
+this is something else
+
+; RUN: not FileCheck %s --input-file %s --check-prefix=CHECK-ERR1 2>&1 | FileCheck %s --check-prefix=ERRCOUNT1
+CHECK-ERR1-COUNT-xx: this
+ERRCOUNT1: [[@LINE-1]]:18: error: invalid count in -COUNT specification on prefix 'CHECK-ERR1'
+
+; RUN: not FileCheck %s --input-file %s --check-prefix=CHECK-ERR2 2>&1 | FileCheck %s --check-prefix=ERRCOUNT2
+CHECK-ERR2-COUNT-0x1: something
+ERRCOUNT2: [[@LINE-1]]:19: error: invalid count in -COUNT specification on prefix 'CHECK-ERR2'
+
+; RUN: not FileCheck %s --input-file %s --check-prefix=CHECK-ERR3 2>&1 | FileCheck %s --check-prefix=ERRCOUNT3
+CHECK-ERR3-COUNT-100x: else
+ERRCOUNT3: [[@LINE-1]]:21: error: invalid count in -COUNT specification on prefix 'CHECK-ERR3'
+
+; RUN: not FileCheck %s --input-file %s --check-prefix=CHECK-ERR4 2>&1 | FileCheck %s --check-prefix=ERRCOUNT4
+CHECK-ERR4-COUNT-0: else
+ERRCOUNT4: [[@LINE-1]]:19: error: invalid count in -COUNT specification on prefix 'CHECK-ERR4'
+
+;
+; Main functionality
+;
+
+this is duplicate
+this is duplicate
+this is not duplicate
+this is duplicate
+this is duplicate
+this is duplicate
+
+; RUN: FileCheck %s --input-file %s --check-prefix=CHECK-CNT1
+CHECK-CNT1-COUNT-1: this is duplicate
+CHECK-CNT1: 	    this is duplicate
+CHECK-CNT1-NEXT:    this is not duplicate
+
+; RUN: FileCheck %s --input-file %s --check-prefix=CHECK-CNT2
+CHECK-CNT2-COUNT-2: this is duplicate
+CHECK-CNT2:         this is not duplicate
+
+; RUN: FileCheck %s --input-file %s --check-prefix=CHECK-CNT3
+CHECK-CNT3-COUNT-2: this is duplicate
+CHECK-CNT3:         this is not duplicate
+CHECK-CNT3-COUNT-3: this is duplicate
+CHECK-CNT3-NOT:     {{^}}this is duplicate
+
+; RUN: FileCheck %s --input-file %s --check-prefix=CHECK-CNT4
+CHECK-CNT4-COUNT-5: this is duplicate
+CHECK-CNT4-EMPTY:
+
+Many-label:
+
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+-many-
+
+; RUN: FileCheck %s --input-file %s --check-prefix=CHECK-CNTMANY
+CHECK-CNTMANY-COUNT-2: this is duplicate
+CHECK-CNTMANY-LABEL: Many-label:
+CHECK-CNTMANY-EMPTY:
+CHECK-CNTMANY-COUNT-16: {{^}}-many-
+CHECK-CNTMANY-EMPTY:
+
+;
+; Tests on mismatches:
+;
+
+; RUN: not FileCheck %s --input-file %s --check-prefix=CHECK-MIS1 2>&1 | FileCheck %s --check-prefix=MISCOUNT1
+CHECK-MIS1-COUNT-3: this is duplicate
+CHECK-MIS1: {{^}}this is not duplicate
+MISCOUNT1: [[@LINE-1]]:13: error: CHECK-MIS1: expected string not found in input
+
+; RUN: not FileCheck %s --input-file %s --check-prefix=CHECK-MIS2 2>&1 | FileCheck %s --check-prefix=MISCOUNT2
+CHECK-MIS2-COUNT-10: {{^this is duplicate}}
+CHECK-MIS2: {{^}}this is not duplicate
+MISCOUNT2: [[@LINE-2]]:22: error: CHECK-MIS2-COUNT: expected string not found in input
+
+; RUN: not FileCheck %s --input-file %s --check-prefix=CHECK-MIS3 2>&1 | FileCheck %s --check-prefix=MISCOUNT3
+CHECK-MIS3-COUNT-5: this is duplicate
+CHECK-MIS3-EMPTY:
+CHECK-MIS3-LABEL: Many-label:
+CHECK-MIS3-EMPTY:
+CHECK-MIS3-COUNT-160: {{^}}-many-
+CHECK-MIS3-EMPTY:
+MISCOUNT3: [[@LINE-2]]:23: error: CHECK-MIS3-COUNT: expected string not found in input (17 out of 160)
diff --git a/test/FileCheck/dump-input-annotations.txt b/test/FileCheck/dump-input-annotations.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eaf1c941e672adbb659147a2f943873d7f4f90fd
--- /dev/null
+++ b/test/FileCheck/dump-input-annotations.txt
@@ -0,0 +1,394 @@
+;--------------------------------------------------
+; Use -strict-whitespace to check marker and note alignment here.
+; (Also check multiline marker where start/end columns vary across lines.)
+;
+; In the remaining checks, don't use -strict-whitespace and thus check just the
+; presence, order, and lengths of markers.  That way, if we ever change padding
+; within line labels, we don't have to adjust so many tests.
+;--------------------------------------------------
+
+; RUN: echo 'hello world' > %t.in
+; RUN: echo 'goodbye' >> %t.in
+; RUN: echo 'world' >> %t.in
+; RUN: echo 'unicorn' >> %t.in
+
+; RUN: echo 'CHECK: hello' > %t.chk
+; RUN: echo 'CHECK: universe' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -strict-whitespace -match-full-lines -check-prefix=ALIGN %s
+
+; ALIGN:Full input was:
+; ALIGN-NEXT:<<<<<<
+; ALIGN-NEXT:           1: hello world
+; ALIGN-NEXT:check:1       ^~~~~
+; ALIGN-NEXT:check:2'0           X~~~~
+; ALIGN-NEXT:           2: goodbye
+; ALIGN-NEXT:check:2'0     ~~~~~~~
+; ALIGN-NEXT:           3: world
+; ALIGN-NEXT:check:2'0     ~~~~~
+; ALIGN-NEXT:           4: unicorn
+; ALIGN-NEXT:check:2'0     ~~~~~~~ error: no match found
+; ALIGN-NEXT:check:2'1     ?       possible intended match
+; ALIGN-NEXT:>>>>>>
+; ALIGN-NOT:{{.}}
+
+;--------------------------------------------------
+; CHECK (also: multi-line search range, fuzzy match)
+;--------------------------------------------------
+
+; Good match and no match.
+
+; RUN: echo 'hello'   > %t.in
+; RUN: echo 'again'   >> %t.in
+; RUN: echo 'whirled' >> %t.in
+
+; RUN: echo 'CHECK: hello' > %t.chk
+; RUN: echo 'CHECK: world' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefix=CHK
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=CHK,CHK-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=CHK,CHK-V
+
+; CHK:        <<<<<<
+; CHK-NEXT:              1: hello
+; CHK-V-NEXT: check:1       ^~~~~
+; CHK-NEXT:              2: again
+; CHK-NEXT:   check:2'0     X~~~~
+; CHK-NEXT:              3: whirled
+; CHK-NEXT:   check:2'0     ~~~~~~~ error: no match found
+; CHK-NEXT:   check:2'1     ?       possible intended match
+; CHK-NEXT:   >>>>>>
+; CHK-NOT:    {{.}}
+
+;--------------------------------------------------
+; CHECK-COUNT-<num>
+;--------------------------------------------------
+
+; Good match and no match.
+
+; RUN: echo 'pete'   > %t.in
+; RUN: echo 'repete' >> %t.in
+; RUN: echo 'repeat' >> %t.in
+
+; RUN: echo 'CHECK-COUNT-3: pete' > %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=CNT,CNT-Q
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=CNT,CNT-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=CNT,CNT-V
+
+; CNT:          <<<<<<
+; CNT-NEXT:                1: pete
+; CNT-V-NEXT:   count:1'0     ^~~~
+; CNT-NEXT:                2: repete
+; CNT-V-NEXT:   count:1'1       ^~~~
+; CNT-NEXT:                3: repeat
+; CNT-Q-NEXT:   count:1       X~~~~~ error: no match found
+; CNT-V-NEXT:   count:1'2     X~~~~~ error: no match found
+; CNT-NEXT:     >>>>>>
+; CNT-NOT:      {{.}}
+
+;--------------------------------------------------
+; CHECK-NEXT (also: EOF search-range, wrong-line match)
+;--------------------------------------------------
+
+; Good match and no match.
+
+; RUN: echo 'hello' > %t.in
+; RUN: echo 'again' >> %t.in
+
+; RUN: echo 'CHECK: hello' > %t.chk
+; RUN: echo 'CHECK-NEXT: again' >> %t.chk
+; RUN: echo 'CHECK-NEXT: world' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefix=NXT
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=NXT,NXT-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=NXT,NXT-V,NXT-VV
+
+; NXT:        <<<<<<
+; NXT-NEXT:            1: hello
+; NXT-V-NEXT: check:1     ^~~~~
+; NXT-NEXT:            2: again
+; NXT-V-NEXT: next:2      ^~~~~
+; NXT-NEXT:            3:
+; NXT-NEXT:   next:3      X error: no match found
+; NXT-NEXT:   >>>>>>
+; NXT-NOT:    {{.}}
+
+; Wrong-line match.
+
+; RUN: echo 'yonder' >> %t.in
+; RUN: echo 'world' >> %t.in
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefix=NXT2
+
+; NXT2:      <<<<<<
+; NXT2-NEXT:         1: hello
+; NXT2-NEXT:         2: again
+; NXT2-NEXT:         3: yonder
+; NXT2-NEXT:         4: world
+; NXT2-NEXT: next:3     !~~~~ error: match on wrong line
+; NXT2-NEXT: >>>>>>
+; NXT2-NOT:  {{.}}
+
+;--------------------------------------------------
+; CHECK-SAME (also: multiple annotations per line, single-char search range,
+; wrong-line match)
+;--------------------------------------------------
+
+; Good match and no match.
+
+; RUN: echo 'hello world!' > %t.in
+
+; RUN: echo 'CHECK: hello' > %t.chk
+; RUN: echo 'CHECK-SAME: world' >> %t.chk
+; RUN: echo 'CHECK-SAME: again' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefix=SAM
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=SAM,SAM-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=SAM,SAM-V,SAM-VV
+
+; SAM:        <<<<<<
+; SAM-NEXT:            1: hello world!
+; SAM-V-NEXT: check:1     ^~~~~
+; SAM-V-NEXT: same:2            ^~~~~
+; SAM-NEXT:   same:3                 X error: no match found
+; SAM-NEXT:   >>>>>>
+; SAM-NOT:    {{.}}
+
+; Wrong-line match.
+
+; RUN: echo 'again' >> %t.in
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=SAM2
+
+; SAM2:      <<<<<<
+; SAM2-NEXT:          1: hello world!
+; SAM2-NEXT: check:1     ^~~~~
+; SAM2-NEXT: same:2            ^~~~~
+; SAM2-NEXT:          2: again
+; SAM2-NEXT: same:3      !~~~~ error: match on wrong line
+; SAM2-NEXT: >>>>>>
+; SAM2-NOT:  {{.}}
+
+;--------------------------------------------------
+; CHECK-EMPTY (also: search range ends at label, single-char match, wrong-line
+; match)
+;--------------------------------------------------
+
+; Good match and no match.
+;
+; CHECK-EMPTY always seems to match an empty line at EOF (illegally when it's
+; not the next line) unless either (1) the last line is non-empty and has no
+; newline or (2) there's a CHECK-LABEL to end the search range before EOF.  We
+; choose scenario 2 to check the case of no match.
+
+; RUN: echo 'hello' > %t.in
+; RUN: echo '' >> %t.in
+; RUN: echo 'world' >> %t.in
+; RUN: echo 'label' >> %t.in
+
+; RUN: echo 'CHECK: hello' > %t.chk
+; RUN: echo 'CHECK-EMPTY:' >> %t.chk
+; RUN: echo 'CHECK-EMPTY:' >> %t.chk
+; RUN: echo 'CHECK-LABEL: label' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefix=EMP
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=EMP,EMP-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=EMP,EMP-V,EMP-VV
+
+; EMP:        <<<<<<
+; EMP-NEXT:            1: hello
+; EMP-V-NEXT: check:1     ^~~~~
+; EMP-NEXT:            2:
+; EMP-V-NEXT: empty:2     ^
+; EMP-NEXT:            3: world
+; EMP-NEXT:   empty:3     X~~~~
+; EMP-NEXT:            4: label
+; EMP-NEXT:   empty:3     ~~~~~ error: no match found
+; EMP-V-NEXT: label:4     ^~~~~
+; EMP-NEXT:   >>>>>>
+; EMP-NOT:    {{.}}
+
+; Wrong-line match.
+
+; RUN: echo 'hello' > %t.in
+; RUN: echo 'world' >> %t.in
+
+; RUN: echo 'CHECK: hello' > %t.chk
+; RUN: echo 'CHECK-EMPTY:' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefix=EMP2
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=EMP2,EMP2-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=EMP2,EMP2-V,EMP2-VV
+
+; EMP2:        <<<<<<
+; EMP2-NEXT:            1: hello
+; EMP2-V-NEXT: check:1     ^~~~~
+; EMP2-NEXT:            2: world
+; EMP2-NEXT:            3:
+; EMP2-NEXT:   empty:2     !     error: match on wrong line
+; EMP2-NEXT:   >>>>>>
+; EMP2-NOT:    {{.}}
+
+;--------------------------------------------------
+; CHECK-NOT (also: EOF pattern, and multiline range that ends before EOL)
+;--------------------------------------------------
+
+; No match (success) and unexpected match (error).
+
+; RUN: echo 'hello' > %t.in
+; RUN: echo 'world' >> %t.in
+; RUN: echo 'again' >> %t.in
+
+; RUN: echo 'CHECK-NOT: goodbye' > %t.chk
+; RUN: echo 'CHECK-NOT: world' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefix=NOT
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=NOT,NOT-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=NOT,NOT-V,NOT-VV
+
+; NOT:         <<<<<<
+; NOT-NEXT:           1: hello
+; NOT-VV-NEXT: not:1     X~~~~
+; NOT-NEXT:           2: world
+; NOT-VV-NEXT: not:1     ~~~~~
+; NOT-NEXT:    not:2     !~~~~ error: no match expected
+; NOT-NEXT:           3: again
+; NOT-VV-NEXT: not:1     ~~~~~
+; NOT-VV-NEXT:        4:
+; NOT-VV-NEXT: eof:2     ^
+; NOT-NEXT:    >>>>>>
+; NOT-NOT:     {{.}}
+
+; Again, but with a CHECK instead of EOF as search range end.
+
+; RUN: echo 'CHECK: ain' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefix=NOT2
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=NOT2,NOT2-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=NOT2,NOT2-V,NOT2-VV
+
+; NOT2:         <<<<<<
+; NOT2-NEXT:             1: hello
+; NOT2-VV-NEXT: not:1       X~~~~
+; NOT2-NEXT:             2: world
+; NOT2-VV-NEXT: not:1       ~~~~~
+; NOT2-NEXT:    not:2       !~~~~ error: no match expected
+; NOT2-NEXT:             3: again
+; NOT2-VV-NEXT: not:1       ~~
+; NOT2-V-NEXT:  check:3       ^~~
+; NOT2-NEXT:    >>>>>>
+; NOT2-NOT:     {{.}}
+
+;--------------------------------------------------
+; CHECK-DAG (also: matches in different order than directives, discarded match)
+;--------------------------------------------------
+
+; Good match, discarded match plus good match, and no match.
+
+; RUN: echo 'abc' > %t.in
+; RUN: echo 'def' >> %t.in
+; RUN: echo 'abc' >> %t.in
+
+; RUN: echo 'CHECK-DAG: def' > %t.chk
+; RUN: echo 'CHECK-DAG: abc' >> %t.chk
+; RUN: echo 'CHECK-DAG: abc' >> %t.chk
+; RUN: echo 'CHECK-DAG: def' >> %t.chk
+
+; Prefixes used here:
+; DAG    = quiet, -v, or -vv
+; DAG-Q  = quiet
+; DAG-V  = -v or -vv (-vv implies -v)
+; DAG-VQ = -v and not -vv
+; DAG-VV = -vv
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=DAG,DAG-Q
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=DAG,DAG-V,DAG-VQ
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=DAG,DAG-V,DAG-VV
+
+; DAG:         <<<<<<
+; DAG-NEXT:             1: abc
+; DAG-V-NEXT:  dag:2       ^~~
+; DAG-VV-NEXT: dag:3'0     !~~ discard: overlaps earlier match
+; DAG-NEXT:             2: def
+; DAG-V-NEXT:  dag:1       ^~~
+; DAG-VV-NEXT: dag:4'0     !~~ discard: overlaps earlier match
+; DAG-NEXT:             3: abc
+; DAG-VQ-NEXT: dag:3       ^~~
+; DAG-VV-NEXT: dag:3'1     ^~~
+; DAG-Q-NEXT:  dag:4       X~~ error: no match found
+; DAG-VQ-NEXT: dag:4       X~~ error: no match found
+; DAG-VV-NEXT: dag:4'1     X~~ error: no match found
+; DAG-NEXT:    >>>>>>
+; DAG-NOT:     {{.}}
+
+;--------------------------------------------------
+; CHECK-LABEL
+;
+; FIXME: Labels sometimes produce redundant diagnostics for good matches.
+; That bug is independent of but affects -dump-input.
+;--------------------------------------------------
+
+; Good match and no match.
+
+; RUN: echo 'lab0' > %t.in
+; RUN: echo 'foo' >> %t.in
+; RUN: echo 'lab1' >> %t.in
+; RUN: echo 'bar' >> %t.in
+
+; RUN: echo 'CHECK-LABEL: lab0' > %t.chk
+; RUN: echo 'CHECK: foo' >> %t.chk
+; RUN: echo 'CHECK-LABEL: lab2' >> %t.chk
+
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=LAB
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -v 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=LAB,LAB-V
+; RUN: not FileCheck -dump-input=always -input-file %t.in %t.chk -vv 2>&1 \
+; RUN: | FileCheck -match-full-lines %s -check-prefixes=LAB,LAB-V,LAB-VV
+
+; LAB:         <<<<<<
+; LAB-NEXT:               1: lab0
+; LAB-V-NEXT:  label:1'0     ^~~~
+; LAB-V-NEXT:  label:1'1     ^~~~
+; LAB-NEXT:               2: foo
+; LAB-NEXT:    label:3'0     X~~
+; LAB-NEXT:               3: lab1
+; LAB-NEXT:    label:3'0     ~~~~
+; LAB-NEXT:    label:3'1     ?    possible intended match
+; LAB-NEXT:               4: bar
+; LAB-NEXT:    label:3'0     ~~~ error: no match found
+; LAB-NEXT:    >>>>>>
+; LAB-NOT:     {{.}}
+
+
diff --git a/test/FileCheck/dump-input-enable.txt b/test/FileCheck/dump-input-enable.txt
new file mode 100644
index 0000000000000000000000000000000000000000..71e3d5dd9d6c3be739e9dffb24eb38b7e46d6f71
--- /dev/null
+++ b/test/FileCheck/dump-input-enable.txt
@@ -0,0 +1,128 @@
+; RUN: echo ciao > %t.good
+; RUN: echo world >> %t.good
+
+; RUN: echo hello > %t.err
+; RUN: echo world >> %t.err
+
+; RUN: echo 'CHECK: ciao' > %t.check
+; RUN: echo 'CHECK-NEXT: world' >> %t.check
+
+;--------------------------------------------------
+; unknown value
+;--------------------------------------------------
+
+; RUN: not FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
+; RUN:               -match-full-lines -dump-input=foobar 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=BADVAL
+
+; No positional arg.
+; RUN: not FileCheck -dump-input=foobar 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=BADVAL
+
+BADVAL: FileCheck{{.*}}: for the -dump-input option: Cannot find option named 'foobar'!
+
+;--------------------------------------------------
+; help
+;--------------------------------------------------
+
+; Appended to normal command line.
+; RUN: FileCheck -input-file %t.err -color %t.check -dump-input=help \
+; RUN: | FileCheck %s -check-prefix=HELP
+
+; No positional arg.
+; RUN: FileCheck -dump-input=help | FileCheck %s -check-prefix=HELP
+
+HELP-NOT: {{.}}
+HELP: The following description was requested by -dump-input=help
+HELP: try{{.*}}-color
+HELP-NOT: {{.}}
+
+;--------------------------------------------------
+; never
+;--------------------------------------------------
+
+; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
+; RUN:           -match-full-lines -dump-input=never 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-NODUMP -allow-empty
+
+; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
+; RUN:               -match-full-lines -dump-input=never 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-NODUMP
+
+;--------------------------------------------------
+; default: never
+;--------------------------------------------------
+
+; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
+; RUN:           -match-full-lines 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-NODUMP -allow-empty
+
+; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
+; RUN:               -match-full-lines 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-NODUMP
+
+;--------------------------------------------------
+; fail
+;--------------------------------------------------
+
+; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
+; RUN:           -match-full-lines -dump-input=fail 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-NODUMP -allow-empty
+
+; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
+; RUN:               -match-full-lines -dump-input=fail 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-ERR
+
+;--------------------------------------------------
+; -dump-input-on-failure
+;--------------------------------------------------
+
+; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
+; RUN:           -match-full-lines -dump-input-on-failure 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-NODUMP -allow-empty
+
+; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
+; RUN:               -match-full-lines -dump-input-on-failure 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-ERR
+
+; RUN: env FILECHECK_DUMP_INPUT_ON_FAILURE=1 \
+; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
+; RUN:           -match-full-lines 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-NODUMP -allow-empty
+
+; RUN: env FILECHECK_DUMP_INPUT_ON_FAILURE=1 \
+; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
+; RUN:               -match-full-lines 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-ERR
+
+;--------------------------------------------------
+; always
+;--------------------------------------------------
+
+; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
+; RUN:           -match-full-lines -dump-input=always -v 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-GOOD
+
+; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
+; RUN:               -match-full-lines -dump-input=always 2>&1 \
+; RUN: | FileCheck %s -match-full-lines -check-prefix=CHECK-ERR
+
+; END.
+
+; CHECK-GOOD: Full input was:
+; CHECK-GOOD-NEXT: <<<<<<
+; CHECK-GOOD-NEXT:          1: ciao
+; CHECK-GOOD-NEXT: check:1     ^~~~
+; CHECK-GOOD-NEXT:          2: world
+; CHECK-GOOD-NEXT: next:2      ^~~~~
+; CHECK-GOOD-NEXT: >>>>>>
+
+; CHECK-ERR: Full input was:
+; CHECK-ERR-NEXT: <<<<<<
+; CHECK-ERR-NEXT:          1: hello
+; CHECK-ERR-NEXT: check:1     X~~~~
+; CHECK-ERR-NEXT:          2: world
+; CHECK-ERR-NEXT: check:1     ~~~~~ error: no match found
+; CHECK-ERR-NEXT: >>>>>>
+
+; CHECK-NODUMP-NOT: <<<<<<
diff --git a/test/FileCheck/no-check-file.txt b/test/FileCheck/no-check-file.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e9325d611af4e1345ba24fc1cea3f070d505d93f
--- /dev/null
+++ b/test/FileCheck/no-check-file.txt
@@ -0,0 +1,3 @@
+; RUN: not FileCheck 2>&1 | FileCheck %s
+
+CHECK: <check-file> not specified
diff --git a/test/FileCheck/verbose_mode.txt b/test/FileCheck/verbose_mode.txt
deleted file mode 100644
index 2146d6a95c46f8d4720cfac038790d88e5d6841c..0000000000000000000000000000000000000000
--- a/test/FileCheck/verbose_mode.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: not FileCheck -input-file %s %s --check-prefix=CHECK1 --match-full-lines --dump-input-on-failure 2>&1 | FileCheck %s --check-prefix=CHECKERROR --match-full-lines
-; RUN: env FILECHECK_DUMP_INPUT_ON_FAILURE=1 not FileCheck -input-file %s %s --check-prefix=CHECK1 --match-full-lines 2>&1 |  FileCheck %s --check-prefix=CHECKERROR --match-full-lines
-; RUN: env FILECHECK_DUMP_INPUT_ON_FAILURE=1 not FileCheck -input-file %s %s --check-prefix=CHECK1 --match-full-lines --dump-input-on-failure=0 2>&1 |  FileCheck %s --check-prefix=CHECKERRORNOVERBOSE --match-full-lines
-
-hello
-world
-
-; CHECK1: ciao
-; CHECK1-NEXT: world
-
-; CHECKERROR: Full input was:
-; CHECKERROR-NEXT: <<<<<<
-; CHECKERROR: hello
-; CHECKERROR: world
-; CHECKERROR: >>>>>>
-
-; CHECKERRORNOVERBOSE-NOT: <<<<<<
diff --git a/test/Instrumentation/AddressSanitizer/do-not-touch-odr-global.ll b/test/Instrumentation/AddressSanitizer/do-not-touch-odr-global.ll
index 97752612297d94e65adcfaf8b6977f43931c7935..bdcd6595a0f61f7e7c57a8f20cc0d1f6e89ad25d 100644
--- a/test/Instrumentation/AddressSanitizer/do-not-touch-odr-global.ll
+++ b/test/Instrumentation/AddressSanitizer/do-not-touch-odr-global.ll
@@ -5,7 +5,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-unknown-linux-gnu"
 ; no action should be taken for these globals
 @global_noinst = linkonce_odr constant [2 x i8] [i8 1, i8 2]
+@global_weak_noinst = weak_odr constant [2 x i8] [i8 1, i8 2]
 @global_inst = private constant [2 x i8] [i8 1, i8 2]
 ; CHECK-NOT: {{asan_gen.*global_noinst}}
+; CHECK-NOT: {{asan_gen.*global_weak_noinst}}
 ; CHECK: {{asan_gen.*global_inst}}
 ; CHECK: @asan.module_ctor
diff --git a/test/Instrumentation/AddressSanitizer/global_metadata.ll b/test/Instrumentation/AddressSanitizer/global_metadata.ll
index ec3d4b680f91996a03c854d56482df4c2bde5633..ee42f1cef97af1568b15adef91267fae6b4531f9 100644
--- a/test/Instrumentation/AddressSanitizer/global_metadata.ll
+++ b/test/Instrumentation/AddressSanitizer/global_metadata.ll
@@ -22,7 +22,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: [[FILENAME:@___asan_gen_.[0-9]+]] = private unnamed_addr constant [22 x i8] c"/tmp/asan-globals.cpp\00", align 1
 ; CHECK: [[LOCDESCR:@___asan_gen_.[0-9]+]] = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* [[FILENAME]], i32 5, i32 5 }
 ; CHECK: @__asan_global_global = {{.*}}i64 ptrtoint ({ i32, [60 x i8] }* @global to i64){{.*}} section "asan_globals"{{.*}}, !associated
-; CHECK: @__asan_global_.str = {{.*}}i64 ptrtoint ({ [14 x i8], [50 x i8] }* @.str to i64){{.*}} section "asan_globals"{{.*}}, !associated
+; CHECK: @__asan_global_.str = {{.*}}i64 ptrtoint ({ [14 x i8], [50 x i8] }* @{{.str|1}} to i64){{.*}} section "asan_globals"{{.*}}, !associated
 
 ; The metadata has to be inserted to llvm.compiler.used to avoid being stripped
 ; during LTO.
diff --git a/test/Instrumentation/AddressSanitizer/global_metadata_bitcasts.ll b/test/Instrumentation/AddressSanitizer/global_metadata_bitcasts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..324a04e3b8321c82928ef14478bed4d769b81fc0
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/global_metadata_bitcasts.ll
@@ -0,0 +1,13 @@
+; Test that the compiler doesn't crash when the llvm.asan.globals containts
+; an entry that points to a BitCast instruction.
+
+; RUN: opt < %s -asan -asan-module -asan-globals-live-support=1 -S
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+@g = global [1 x i32] zeroinitializer, align 4
+
+!llvm.asan.globals = !{!0, !1}
+!0 = !{[1 x i32]* @g, null, !"name", i1 false, i1 false}
+!1 = !{i8* bitcast ([1 x i32]* @g to i8*), null, !"name", i1 false, i1 false}
diff --git a/test/Instrumentation/AddressSanitizer/local_alias.ll b/test/Instrumentation/AddressSanitizer/local_alias.ll
index 3db241d097f459dec0b75140cc45ede783db7e9b..9b95bb3fa6b0408c464f5a361b78bce8f7c0cad0 100644
--- a/test/Instrumentation/AddressSanitizer/local_alias.ll
+++ b/test/Instrumentation/AddressSanitizer/local_alias.ll
@@ -1,13 +1,25 @@
-; RUN: opt < %s -asan -asan-module -asan-use-private-alias=1 -S | FileCheck %s
+; RUN: opt < %s -asan -asan-module -S | FileCheck %s --check-prefixes=CHECK-NOALIAS,CHECK-NOINDICATOR
+; RUN: opt < %s -asan -asan-module -asan-use-private-alias=1 -S | FileCheck %s --check-prefixes=CHECK-ALIAS,CHECK-NOINDICATOR
+; RUN: opt < %s -asan -asan-module -asan-use-odr-indicator=1 -S | FileCheck %s --check-prefixes=CHECK-INDICATOR,CHECK-NOALIAS
+; RUN: opt < %s -asan -asan-module -asan-use-private-alias=1 -asan-use-odr-indicator=1 -S | FileCheck %s --check-prefixes=CHECK-ALIAS,CHECK-INDICATOR
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-@a = internal global [2 x i32] zeroinitializer, align 4
+@a = dso_local global [2 x i32] zeroinitializer, align 4
+@b = private global [2 x i32] zeroinitializer, align 4
+@c = internal global [2 x i32] zeroinitializer, align 4
+@d = unnamed_addr global [2 x i32] zeroinitializer, align 4
 
 ; Check that we generate internal alias and odr indicator symbols for global to be protected.
-; CHECK: @__odr_asan_gen_a = internal global i8 0, align 1
-; CHECK: @"a<stdin>" = internal alias { [2 x i32], [56 x i8] }, { [2 x i32], [56 x i8] }* @a
+; CHECK-NOINDICATOR-NOT: __odr_asan_gen_a
+; CHECK-NOALIAS-NOT: private alias
+; CHECK-INDICATOR: @__odr_asan_gen_a = global i8 0, align 1
+; CHECK-ALIAS: @0 = private alias { [2 x i32], [56 x i8] }, { [2 x i32], [56 x i8] }* @a
+
+; CHECK-ALIAS: @1 = private alias { [2 x i32], [56 x i8] }, { [2 x i32], [56 x i8] }* @b
+; CHECK-ALIAS: @2 = private alias { [2 x i32], [56 x i8] }, { [2 x i32], [56 x i8] }* @c
+; CHECK-ALIAS: @3 = private alias { [2 x i32], [56 x i8] }, { [2 x i32], [56 x i8] }* @d
 
 ; Function Attrs: nounwind sanitize_address uwtable
 define i32 @foo(i32 %M) #0 {
diff --git a/test/Instrumentation/AddressSanitizer/odr-check-ignore.ll b/test/Instrumentation/AddressSanitizer/odr-check-ignore.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cf48d19c16c2dfb45e27ba99dbb97196a20ef0fc
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/odr-check-ignore.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -asan -asan-module -S | FileCheck %s 
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = global [2 x i32] zeroinitializer, align 4
+@b = private global [2 x i32] zeroinitializer, align 4
+@c = internal global [2 x i32] zeroinitializer, align 4
+@d = unnamed_addr global [2 x i32] zeroinitializer, align 4
+
+; CHECK: @__asan_global_a = private global { i64, i64, i64, i64, i64, i64, i64, i64 } { i64 ptrtoint ({ [2 x i32], [56 x i8] }* @a to i64), i64 8, i64 64, i64 ptrtoint ([2 x i8]* @___asan_gen_.1 to i64), i64 ptrtoint ([8 x i8]* @___asan_gen_ to i64), i64 0, i64 0, i64 0 }
+
+; CHECK: @__asan_global_b = private global { i64, i64, i64, i64, i64, i64, i64, i64 } { i64 ptrtoint ({ [2 x i32], [56 x i8] }* @b to i64), i64 8, i64 64, i64 ptrtoint ([2 x i8]* @___asan_gen_.2 to i64), i64 ptrtoint ([8 x i8]* @___asan_gen_ to i64), i64 0, i64 0, i64 -1 }
+
+; CHECK: @__asan_global_c = private global { i64, i64, i64, i64, i64, i64, i64, i64 } { i64 ptrtoint ({ [2 x i32], [56 x i8] }* @c to i64), i64 8, i64 64, i64 ptrtoint ([2 x i8]* @___asan_gen_.3 to i64), i64 ptrtoint ([8 x i8]* @___asan_gen_ to i64), i64 0, i64 0, i64 -1 }
+
+; CHECK: @__asan_global_d = private global { i64, i64, i64, i64, i64, i64, i64, i64 } { i64 ptrtoint ({ [2 x i32], [56 x i8] }* @d to i64), i64 8, i64 64, i64 ptrtoint ([2 x i8]* @___asan_gen_.4 to i64), i64 ptrtoint ([8 x i8]* @___asan_gen_ to i64), i64 0, i64 0, i64 0 }
diff --git a/test/Instrumentation/AddressSanitizer/win-string-literal.ll b/test/Instrumentation/AddressSanitizer/win-string-literal.ll
index 4fef094832aaf65b10777e3330b0b01f4537658b..a86581c1e8e52b68f80b4aa228c3623082cc34ef 100644
--- a/test/Instrumentation/AddressSanitizer/win-string-literal.ll
+++ b/test/Instrumentation/AddressSanitizer/win-string-literal.ll
@@ -15,7 +15,7 @@
 ; CHECK-SAME: { i64 ptrtoint ({ [5 x i8], [59 x i8] }* @"??_C@_04JIHMPGLA@asdf?$AA@" to i64),
 ; CHECK-SAME:   i64 5, i64 64, i64 ptrtoint ([17 x i8]* @___asan_gen_.1 to i64),
 ; CHECK-SAME:   i64 ptrtoint ([8 x i8]* @___asan_gen_ to i64), i64 0,
-; CHECK-SAME:   i64 ptrtoint ({ [6 x i8]*, i32, i32 }* @___asan_gen_.3 to i64), i64 0 },
+; CHECK-SAME:   i64 ptrtoint ({ [6 x i8]*, i32, i32 }* @___asan_gen_.3 to i64), i64 -1 },
 ; CHECK-SAME:   section ".ASAN$GL", comdat($"??_C@_04JIHMPGLA@asdf?$AA@"), align 64
 
 ; ModuleID = 't.cpp'
diff --git a/test/Instrumentation/HWAddressSanitizer/basic.ll b/test/Instrumentation/HWAddressSanitizer/basic.ll
index 8253016d97b4adff5d0c2f7b4c55822045f3a127..e02e5fc283b8406ee03851b48d20827caebb00a3 100644
--- a/test/Instrumentation/HWAddressSanitizer/basic.ll
+++ b/test/Instrumentation/HWAddressSanitizer/basic.ll
@@ -5,6 +5,9 @@
 ; RUN: opt < %s -hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=CHECK,ABORT,ZERO-BASED-SHADOW
 ; RUN: opt < %s -hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=CHECK,RECOVER,ZERO-BASED-SHADOW
 
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @hwasan.module_ctor, i8* bitcast (void ()* @hwasan.module_ctor to i8*) }]
+; CHECK: @__hwasan = private constant [0 x i8] zeroinitializer, section "__hwasan_frames", comdat($hwasan.module_ctor)
+
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-android"
 
@@ -352,7 +355,7 @@ entry:
 
 ; CHECK: declare void @__hwasan_init()
 
-; CHECK:      define internal void @hwasan.module_ctor() {
+; CHECK:      define internal void @hwasan.module_ctor() comdat {
 ; CHECK-NEXT:   call void @__hwasan_init()
 ; CHECK-NEXT:   call void @__hwasan_init_frames(
 ; CHECK-NEXT:   ret void
diff --git a/test/Instrumentation/HWAddressSanitizer/with-calls.ll b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
index 8d6068c343894f890c61d972bb7cfa64d7c09cbf..9f5bc66bd4d3b0e0be006cc3cb7621a87c0713ad 100644
--- a/test/Instrumentation/HWAddressSanitizer/with-calls.ll
+++ b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
@@ -197,7 +197,7 @@ entry:
 
 ; CHECK: declare void @__hwasan_init()
 
-; CHECK:      define internal void @hwasan.module_ctor() {
+; CHECK:      define internal void @hwasan.module_ctor() comdat {
 ; CHECK-NEXT:   call void @__hwasan_init()
 ; CHECK-NEXT:   call void @__hwasan_init_frames(
 ; CHECK-NEXT:   ret void
diff --git a/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll b/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll
index 4dcca26fd0b1c54c75e340639d2976b2704fcc2f..bc6c44c2cb2a4d39d5e0fb70d35f5ca3e44d4fb2 100644
--- a/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll
@@ -1,6 +1,6 @@
 ; Test for handling of asm constraints in MSan instrumentation.
-; RUN: opt < %s -msan -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S | FileCheck -check-prefixes=CHECK,CHECK-NONCONS %s
-; RUN: opt < %s -msan -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S | FileCheck -check-prefixes=CHECK,CHECK-CONS %s
+; RUN: opt < %s -msan -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S | FileCheck -check-prefixes=CHECK,CHECK-NONCONS %s
+; RUN: opt < %s -msan -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S | FileCheck -check-prefixes=CHECK,CHECK-CONS %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -40,9 +40,12 @@ entry:
 
 ; CHECK-LABEL: @f_1i_1o_reg
 ; CHECK: [[IS1_F1:%.*]] = load i32, i32* @is1, align 4
-; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning
 ; CHECK: call i32 asm "",{{.*}}(i32 [[IS1_F1]])
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: [[PACK1_F1:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}})
+; CHECK: [[EXT1_F1:%.*]] = extractvalue { i8*, i32* } [[PACK1_F1]], 0
+; CHECK: [[CAST1_F1:%.*]] = bitcast i8* [[EXT1_F1]] to i32*
+; CHECK: store i32 0, i32* [[CAST1_F1]]
 
 
 ; Two input registers, two output registers:
@@ -62,11 +65,17 @@ entry:
 ; CHECK-LABEL: @f_2i_2o_reg
 ; CHECK: [[IS1_F2:%.*]] = load i32, i32* @is1, align 4
 ; CHECK: [[IS2_F2:%.*]] = load i32, i32* @is2, align 4
-; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning
+; CHECK: call void @__msan_warning
 ; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[IS1_F2]], i32 [[IS2_F2]])
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+; CHECK: [[PACK1_F2:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}})
+; CHECK: [[EXT1_F2:%.*]] = extractvalue { i8*, i32* } [[PACK1_F2]], 0
+; CHECK: [[CAST1_F2:%.*]] = bitcast i8* [[EXT1_F2]] to i32*
+; CHECK: store i32 0, i32* [[CAST1_F2]]
+; CHECK: [[PACK2_F2:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}})
+; CHECK: [[EXT2_F2:%.*]] = extractvalue { i8*, i32* } [[PACK2_F2]], 0
+; CHECK: [[CAST2_F2:%.*]] = bitcast i8* [[EXT2_F2]] to i32*
+; CHECK: store i32 0, i32* [[CAST2_F2]]
 
 ; Input same as output, used twice:
 ;   asm("" : "=r" (id1), "=r" (id2) : "r" (id1), "r" (id2));
@@ -85,11 +94,17 @@ entry:
 ; CHECK-LABEL: @f_2i_2o_reuse2_reg
 ; CHECK: [[ID1_F3:%.*]] = load i32, i32* @id1, align 4
 ; CHECK: [[ID2_F3:%.*]] = load i32, i32* @id2, align 4
-; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning
+; CHECK: call void @__msan_warning
 ; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[ID1_F3]], i32 [[ID2_F3]])
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+; CHECK: [[PACK1_F3:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}})
+; CHECK: [[EXT1_F3:%.*]] = extractvalue { i8*, i32* } [[PACK1_F3]], 0
+; CHECK: [[CAST1_F3:%.*]] = bitcast i8* [[EXT1_F3]] to i32*
+; CHECK: store i32 0, i32* [[CAST1_F3]]
+; CHECK: [[PACK2_F3:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}})
+; CHECK: [[EXT2_F3:%.*]] = extractvalue { i8*, i32* } [[PACK2_F3]], 0
+; CHECK: [[CAST2_F3:%.*]] = bitcast i8* [[EXT2_F3]] to i32*
+; CHECK: store i32 0, i32* [[CAST2_F3]]
 
 
 ; One of the input registers is also an output:
@@ -109,11 +124,17 @@ entry:
 ; CHECK-LABEL: @f_2i_2o_reuse1_reg
 ; CHECK: [[ID1_F4:%.*]] = load i32, i32* @id1, align 4
 ; CHECK: [[IS1_F4:%.*]] = load i32, i32* @is1, align 4
-; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning
+; CHECK: call void @__msan_warning
 ; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[ID1_F4]], i32 [[IS1_F4]])
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+; CHECK: [[PACK1_F4:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}})
+; CHECK: [[EXT1_F4:%.*]] = extractvalue { i8*, i32* } [[PACK1_F4]], 0
+; CHECK: [[CAST1_F4:%.*]] = bitcast i8* [[EXT1_F4]] to i32*
+; CHECK: store i32 0, i32* [[CAST1_F4]]
+; CHECK: [[PACK2_F4:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}})
+; CHECK: [[EXT2_F4:%.*]] = extractvalue { i8*, i32* } [[PACK2_F4]], 0
+; CHECK: [[CAST2_F4:%.*]] = bitcast i8* [[EXT2_F4]] to i32*
+; CHECK: store i32 0, i32* [[CAST2_F4]]
 
 
 ; One input register, three output registers:
@@ -133,11 +154,20 @@ entry:
 
 ; CHECK-LABEL: @f_1i_3o_reg
 ; CHECK: [[IS1_F5:%.*]] = load i32, i32* @is1, align 4
-; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning
 ; CHECK: call { i32, i32, i32 } asm "",{{.*}}(i32 [[IS1_F5]])
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
-; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id3 to i64)
+; CHECK: [[PACK1_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}})
+; CHECK: [[EXT1_F5:%.*]] = extractvalue { i8*, i32* } [[PACK1_F5]], 0
+; CHECK: [[CAST1_F5:%.*]] = bitcast i8* [[EXT1_F5]] to i32*
+; CHECK: store i32 0, i32* [[CAST1_F5]]
+; CHECK: [[PACK2_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}})
+; CHECK: [[EXT2_F5:%.*]] = extractvalue { i8*, i32* } [[PACK2_F5]], 0
+; CHECK: [[CAST2_F5:%.*]] = bitcast i8* [[EXT2_F5]] to i32*
+; CHECK: store i32 0, i32* [[CAST2_F5]]
+; CHECK: [[PACK3_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id3{{.*}})
+; CHECK: [[EXT3_F5:%.*]] = extractvalue { i8*, i32* } [[PACK3_F5]], 0
+; CHECK: [[CAST3_F5:%.*]] = bitcast i8* [[EXT3_F5]] to i32*
+; CHECK: store i32 0, i32* [[CAST3_F5]]
 
 
 ; 2 input memory args, 2 output memory args:
@@ -170,7 +200,7 @@ entry:
 ; CHECK: [[IS1_F7:%.*]] = load i32, i32* @is1, align 4
 ; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@is1{{.*}}, i64 4)
 ; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id1{{.*}}, i64 4)
-; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning
 ; CHECK: call i32 asm "", "=r,=*m,r,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32 [[IS1_F7]], i32* @is1)
 
 
@@ -212,9 +242,9 @@ entry:
 ; CHECK: [[C1_F9:%.*]] = load {{.*}} @c1
 ; CHECK: [[MEMCPY_S1_F9:%.*]] = load {{.*}} @memcpy_s1
 ; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@pair2{{.*}}, i64 8)
-; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning
+; CHECK: call void @__msan_warning
+; CHECK: call void @__msan_warning
 ; CHECK: call { i8, i8* (i8*, i8*, i32)* } asm "", "=*r,=r,=r,r,r,r,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, {{.*}}[[PAIR1_F9]], i8 [[C1_F9]], {{.*}} [[MEMCPY_S1_F9]])
 
 ; Three inputs and three outputs of different types: a pair, a char, a function pointer.
diff --git a/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll b/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
index 7240e1086dae89274d36e09bb9653697e1ff560e..c9e6f1afe21d0be9716bd302cb65380e0ef81ca8 100644
--- a/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
@@ -1,6 +1,6 @@
 ; Test for the conservative assembly handling mode used by KMSAN.
-; RUN: opt < %s -msan -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S | FileCheck -check-prefixes=CHECK,CHECK-NONCONS %s
-; RUN: opt < %s -msan -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S | FileCheck -check-prefixes=CHECK,CHECK-CONS %s
+; RUN: opt < %s -msan -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S | FileCheck -check-prefixes=CHECK,CHECK-NONCONS %s
+; RUN: opt < %s -msan -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S | FileCheck -check-prefixes=CHECK,CHECK-CONS %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -52,9 +52,16 @@ if.else:                                          ; preds = %entry
   ret i32 1
 }
 
+; %nr is first poisoned, then unpoisoned (written to). Need to optimize this in the future.
+; CHECK: [[NRC1:%.*]] = bitcast i64* %nr to i8*
+; CHECK: call void @__msan_poison_alloca(i8* [[NRC1]]{{.*}})
+; CHECK: [[NRC2:%.*]] = bitcast i64* %nr to i8*
+; CHECK: call { i8*, i32* } @__msan_metadata_ptr_for_store_8(i8* [[NRC2]])
+
 ; Hooks for inputs usually go before the assembly statement. But here we have none,
 ; because %nr is passed by value. However we check %nr for being initialized.
-; CHECK-CONS: [[NRC:%.*]] = ptrtoint i64* %nr to i64
+; CHECK-CONS: [[NRC3:%.*]] = bitcast i64* %nr to i8*
+; CHECK-CONS: call { i8*, i32* } @__msan_metadata_ptr_for_load_8(i8* [[NRC3]])
 
 ; In the conservative mode, call the store hooks for %bit and %addr:
 ; CHECK-CONS: call void @__msan_instrument_asm_store(i8* %bit, i64 1)
@@ -62,14 +69,17 @@ if.else:                                          ; preds = %entry
 ; CHECK-CONS: call void @__msan_instrument_asm_store(i8* [[ADDR8S]], i64 8)
 
 ; Landing pad for the %nr check above.
-; CHECK-CONS: call void @__msan_warning_noreturn()
+; CHECK-CONS: call void @__msan_warning
 
 ; CHECK: call void asm "btsq $2, $1; setc $0"
 
 ; Calculating the shadow offset of %bit.
-; CHECK: [[PTR:%.*]] = ptrtoint {{.*}} %bit to i64
-; CHECK: [[SH_NUM:%.*]] = xor i64 [[PTR]]
-; CHECK: [[SHADOW:%.*]] = inttoptr i64 [[SH_NUM]] {{.*}}
+; CHECKz: [[PTR:%.*]] = ptrtoint {{.*}} %bit to i64
+; CHECKz: [[SH_NUM:%.*]] = xor i64 [[PTR]]
+; CHECKz: [[SHADOW:%.*]] = inttoptr i64 [[SH_NUM]] {{.*}}
+
+; CHECK: [[META:%.*]] = call {{.*}} @__msan_metadata_ptr_for_load_1(i8* %bit)
+; CHECK: [[SHADOW:%.*]] = extractvalue { i8*, i32* } [[META]], 0
 
 ; Now load the shadow value for the boolean.
 ; CHECK: [[MSLD:%.*]] = load {{.*}} [[SHADOW]]
@@ -81,5 +91,5 @@ if.else:                                          ; preds = %entry
 
 ; If yes, raise a warning.
 ; CHECK: <label>:[[IFTRUE]]
-; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning
 
diff --git a/test/LTO/Resolution/X86/available-externally.ll b/test/LTO/Resolution/X86/available-externally.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eb17693c0a9419e094ff5ba806d96d090ffd3e5f
--- /dev/null
+++ b/test/LTO/Resolution/X86/available-externally.ll
@@ -0,0 +1,16 @@
+; RUN: opt -module-summary -o %t.bc %s
+; RUN: llvm-lto2 run %t.bc -r %t.bc,foo,px -r %t.bc,bar, -o %t2
+; RUN: llvm-nm %t2.1 | FileCheck %s
+
+; CHECK: U bar
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void ()* @foo() {
+  ret void ()* @bar
+}
+
+define available_externally void @bar() {
+  ret void
+}
diff --git a/test/LTO/Resolution/X86/dead-strip-fulllto.ll b/test/LTO/Resolution/X86/dead-strip-fulllto.ll
index 773b4385378fee19cba849011b3f3a957dd58e93..11a8981aceb0a3c3e3ea3461e21746395cf5eb8e 100644
--- a/test/LTO/Resolution/X86/dead-strip-fulllto.ll
+++ b/test/LTO/Resolution/X86/dead-strip-fulllto.ll
@@ -1,14 +1,14 @@
 ; RUN: opt -module-summary -o %t %s
 ; RUN: opt -module-summary -o %t2 %S/Inputs/dead-strip-fulllto.ll
 
-; RUN: llvm-lto2 run %t -r %t,main,px -r %t,live1,p -r %t,live2,p -r %t,dead2,p \
-; RUN:               %t2 -r %t2,live1, -r %t2,live2, -r %t2,dead1,p -r %t2,dead2, -r %t2,odr, \
+; RUN: llvm-lto2 run %t -r %t,main,px -r %t,live1, -r %t,live2,p -r %t,dead2,p \
+; RUN:               %t2 -r %t2,live1,p -r %t2,live2, -r %t2,dead1,p -r %t2,dead2, -r %t2,odr, \
 ; RUN: -save-temps -o %t3
 ; RUN: llvm-nm %t3.0 | FileCheck --check-prefix=FULL %s
 ; RUN: llvm-nm %t3.1 | FileCheck --check-prefix=THIN %s
 
-; RUN: llvm-lto2 run %t -r %t,main,px -r %t,live1,p -r %t,live2,p -r %t,dead2,p \
-; RUN:               %t2 -r %t2,live1, -r %t2,live2, -r %t2,dead1,p -r %t2,dead2, -r %t2,odr, \
+; RUN: llvm-lto2 run %t -r %t,main,px -r %t,live1, -r %t,live2,p -r %t,dead2,p \
+; RUN:               %t2 -r %t2,live1,p -r %t2,live2, -r %t2,dead1,p -r %t2,dead2, -r %t2,odr, \
 ; RUN: -save-temps -o %t3 -O0
 ; RUN: llvm-nm %t3.0 | FileCheck --check-prefix=FULL %s
 ; RUN: llvm-nm %t3.1 | FileCheck --check-prefix=THIN %s
diff --git a/test/LTO/Resolution/X86/load-sample-prof-lto.ll b/test/LTO/Resolution/X86/load-sample-prof-lto.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bdf4f08dba1781d0e361f034f8789d3d2609fc07
--- /dev/null
+++ b/test/LTO/Resolution/X86/load-sample-prof-lto.ll
@@ -0,0 +1,46 @@
+; Test that LTO pipeline loads profile.
+;
+; RUN: opt < %s -o %t.bc
+
+; Run the old pm LTO pipeline.
+; RUN: llvm-lto2 run -o %t.out %t.bc -save-temps \
+; RUN:   -r %t.bc,foo,px -r %t.bc,bar,x \
+; RUN:   -lto-sample-profile-file=%S/Inputs/load-sample-prof.prof
+; RUN: llvm-dis %t.out.0.4.opt.bc -o - | FileCheck %s
+
+; Run the new pm LTO pipeline.
+; RUN: llvm-lto2 run -o %t.out %t.bc -save-temps -use-new-pm \
+; RUN:   -r %t.bc,foo,px -r %t.bc,bar,x \
+; RUN:   -lto-sample-profile-file=%S/Inputs/load-sample-prof.prof
+; RUN: llvm-dis %t.out.0.4.opt.bc -o - | FileCheck %s
+
+; Make sure profile information is attached.
+; CHECK: !prof
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo() local_unnamed_addr !dbg !7 {
+entry:
+  tail call void @bar(), !dbg !10
+  ret void, !dbg !11
+}
+
+declare void @bar() local_unnamed_addr
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 6.0.0 "}
+!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocation(line: 4, column: 5, scope: !7)
+!11 = !DILocation(line: 5, column: 1, scope: !7)
diff --git a/test/Linker/Inputs/commandline.a.ll b/test/Linker/Inputs/commandline.a.ll
new file mode 100644
index 0000000000000000000000000000000000000000..445ee74301e2f20e685a06bb89c69dd22044d66a
--- /dev/null
+++ b/test/Linker/Inputs/commandline.a.ll
@@ -0,0 +1,3 @@
+!llvm.commandline = !{!0, !1}
+!0 = !{!"compiler -v1"}
+!1 = !{!"compiler -v2"}
diff --git a/test/Linker/Inputs/commandline.b.ll b/test/Linker/Inputs/commandline.b.ll
new file mode 100644
index 0000000000000000000000000000000000000000..16011fc40557a2ae060dbe8aa8c6da6f87839692
--- /dev/null
+++ b/test/Linker/Inputs/commandline.b.ll
@@ -0,0 +1,2 @@
+!llvm.commandline = !{!0}
+!0 = !{!"compiler -v3"}
diff --git a/test/Linker/commandline.ll b/test/Linker/commandline.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ba201c82be91079161de2962f6aaea374f6dc70b
--- /dev/null
+++ b/test/Linker/commandline.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-link %S/Inputs/commandline.a.ll %S/Inputs/commandline.b.ll -S | FileCheck %s
+
+; Verify that multiple input llvm.commandline metadata are linked together.
+
+; CHECK-DAG: !llvm.commandline = !{!0, !1, !2}
+; CHECK-DAG: !{{[0-2]}} = !{!"compiler -v1"}
+; CHECK-DAG: !{{[0-2]}} = !{!"compiler -v2"}
+; CHECK-DAG: !{{[0-2]}} = !{!"compiler -v3"}
diff --git a/test/Linker/replaced-function-matches-first-subprogram.ll b/test/Linker/replaced-function-matches-first-subprogram.ll
index fc2c361a246d2789d37b340aca7d31e41b10810d..7d40ac02f158059f169242997646622bc5d2bce2 100644
--- a/test/Linker/replaced-function-matches-first-subprogram.ll
+++ b/test/Linker/replaced-function-matches-first-subprogram.ll
@@ -54,7 +54,7 @@ entry:
 
 ; We can't use CHECK-NOT/CHECK-SAME with a CHECK-DAG, so rely on field order to
 ; prove that there's no function: here.
-; CHECK-DAG: ![[SP2r:.*]] = {{.*}}!DISubprogram({{.*}} isOptimized: false, unit: ![[CU1]], retainedNodes:
+; CHECK-DAG: ![[SP2r:.*]] = {{.*}}!DISubprogram({{.*}} unit: ![[CU1]], retainedNodes:
 !7 = distinct !DISubprogram(name: "foo", line: 2, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 2, file: !8, scope: !9, type: !6, retainedNodes: !2)
 
 ; The new subprogram should be pointing at the new directory.
diff --git a/test/MC/AArch64/armv8.1a-lse.s b/test/MC/AArch64/armv8.1a-lse.s
index 3da5e3c97b74405b99939d93ecc7f280f2678099..e6424ad01c29574ce503d3403fc8cc6fbca5687d 100644
--- a/test/MC/AArch64/armv8.1a-lse.s
+++ b/test/MC/AArch64/armv8.1a-lse.s
@@ -3,6 +3,8 @@
 // RUN: not llvm-mc -triple aarch64-none-linux-gnu -mcpu=cortex-a55 -show-encoding < %s 2> %t | FileCheck %s
 // RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s
 // RUN: not llvm-mc -triple aarch64-none-linux-gnu -mcpu=cortex-a75 -show-encoding < %s 2> %t | FileCheck %s
+// RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mcpu=tsv110 -show-encoding < %s 2> %t | FileCheck %s
 // RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s
   .text
 
diff --git a/test/MC/AArch64/armv8.2a-at.s b/test/MC/AArch64/armv8.2a-at.s
index 741f6922a9e4e59f53687094c3a3c4225e140259..3c26fb9ea3a1355e6949ab68d1779f5619ab2ea2 100644
--- a/test/MC/AArch64/armv8.2a-at.s
+++ b/test/MC/AArch64/armv8.2a-at.s
@@ -1,9 +1,11 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.2a < %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.2a < %s 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.2a %s -o - | FileCheck %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.2a,+pan-rwv %s -o - | FileCheck %s
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.2a %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.2a,-pan-rwv %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 
   at s1e1rp, x1
   at s1e1wp, x2
 // CHECK: at      s1e1rp, x1              // encoding: [0x01,0x79,0x08,0xd5]
 // CHECK: at      s1e1wp, x2              // encoding: [0x22,0x79,0x08,0xd5]
-// ERROR: error: AT S1E1RP requires ARMv8.2a
-// ERROR: error: AT S1E1WP requires ARMv8.2a
+// ERROR: error: AT S1E1RP requires pan-rwv
+// ERROR: error: AT S1E1WP requires pan-rwv
diff --git a/test/MC/AArch64/armv8.2a-dotprod.s b/test/MC/AArch64/armv8.2a-dotprod.s
index f5ff91289bc388e73d19f5d6f046092fc6f93e4d..0cb7420de23ab3e93f9e524133026a0923822d9d 100644
--- a/test/MC/AArch64/armv8.2a-dotprod.s
+++ b/test/MC/AArch64/armv8.2a-dotprod.s
@@ -1,6 +1,7 @@
 // RUN: llvm-mc -triple aarch64 -mattr=+dotprod -show-encoding < %s | FileCheck %s  --check-prefix=CHECK-DOTPROD
 // RUN: llvm-mc -triple aarch64 -mcpu=cortex-a75 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD
 // RUN: llvm-mc -triple aarch64 -mcpu=cortex-a55 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD
+// RUN: llvm-mc -triple aarch64 -mcpu=tsv110 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD
 // RUN: not llvm-mc -triple aarch64 -mattr=+v8.2a -show-encoding < %s 2> %t
 // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s
 
diff --git a/test/MC/AArch64/armv8.2a-mmfr2.s b/test/MC/AArch64/armv8.2a-mmfr2.s
deleted file mode 100644
index 5a9b1f1f42a04403843f4b405ba6625c125ee247..0000000000000000000000000000000000000000
--- a/test/MC/AArch64/armv8.2a-mmfr2.s
+++ /dev/null
@@ -1,6 +0,0 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.2a < %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.2a < %s 2>&1 | FileCheck %s --check-prefix=ERROR
-
-  mrs x3, id_aa64mmfr2_el1
-// CHECK: mrs x3, ID_AA64MMFR2_EL1       // encoding: [0x43,0x07,0x38,0xd5]
-// ERROR: error: expected readable system register
diff --git a/test/MC/AArch64/armv8.2a-persistent-memory.s b/test/MC/AArch64/armv8.2a-persistent-memory.s
index 8a7600ee904dbb1283f959b812fa77042b452a60..73acd1c6eb6d51bddf995cfa1729861e0ee5d460 100644
--- a/test/MC/AArch64/armv8.2a-persistent-memory.s
+++ b/test/MC/AArch64/armv8.2a-persistent-memory.s
@@ -1,6 +1,7 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.2a < %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.2a < %s 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.2a -o - %s | FileCheck %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+ccpp -o - %s | FileCheck %s
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.2a -o - %s 2>&1 | FileCheck %s --check-prefix=ERROR
 
   dc cvap, x7
 // CHECK: dc cvap, x7   // encoding: [0x27,0x7c,0x0b,0xd5]
-// ERROR: error: DC CVAP requires ARMv8.2a
+// ERROR: error: DC CVAP requires ccpp
diff --git a/test/MC/AArch64/armv8.3a-complex.s b/test/MC/AArch64/armv8.3a-complex.s
index 1a4975b3e892763f8964af88eec2d964ecebad78..dae5d4779510054259f3ea6569c93714206a49e2 100644
--- a/test/MC/AArch64/armv8.3a-complex.s
+++ b/test/MC/AArch64/armv8.3a-complex.s
@@ -1,150 +1,54 @@
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.3a,-fullfp16 < %s 2>%t | FileCheck %s --check-prefix=CHECK --check-prefix=NO-FP16
-// RUN: FileCheck --check-prefix=STDERR --check-prefix=STDERR-NO-FP16 %s < %t
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.3a,+fullfp16 < %s 2>%t | FileCheck %s --check-prefix=CHECK --check-prefix=FP16
-// RUN: FileCheck --check-prefix=STDERR --check-prefix=STDERR-FP16 %s < %t
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.2a,-v8.3a,+fullfp16 < %s 2>&1 | FileCheck %s --check-prefix=NO-V83A
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+complxnum,+fullfp16 -o - %s 2>%t | FileCheck %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.3a,+fullfp16 -o - %s 2>%t | FileCheck %s
+fcmla v0.4h, v1.4h, v2.4h, #0
+fcmla v0.8h, v1.8h, v2.8h, #0
+fcmla v0.2s, v1.2s, v2.2s, #0
+fcmla v0.4s, v1.4s, v2.4s, #0
+fcmla v0.2d, v1.2d, v2.2d, #0
+fcmla v0.2s, v1.2s, v2.2s, #0
+fcmla v0.2s, v1.2s, v2.2s, #90
+fcmla v0.2s, v1.2s, v2.2s, #180
+fcmla v0.2s, v1.2s, v2.2s, #270
+fcadd v0.4h, v1.4h, v2.4h, #90
+fcadd v0.8h, v1.8h, v2.8h, #90
+fcadd v0.2s, v1.2s, v2.2s, #90
+fcadd v0.4s, v1.4s, v2.4s, #90
+fcadd v0.2d, v1.2d, v2.2d, #90
+fcadd v0.2s, v1.2s, v2.2s, #90
+fcadd v0.2s, v1.2s, v2.2s, #270
+fcmla v0.4h, v1.4h, v2.h[0], #0
+fcmla v0.8h, v1.8h, v2.h[0], #0
+fcmla v0.4s, v1.4s, v2.s[0], #0
+fcmla v0.4s, v1.4s, v2.s[0], #90
+fcmla v0.4s, v1.4s, v2.s[0], #180
+fcmla v0.4s, v1.4s, v2.s[0], #270
+fcmla v0.4h, v1.4h, v2.h[1], #0
+fcmla v0.8h, v1.8h, v2.h[3], #0
+fcmla v0.4s, v1.4s, v2.s[1], #0
+//CHECK: 	.text
+//CHECK-NEXT: 	fcmla	v0.4h, v1.4h, v2.4h, #0 // encoding: [0x20,0xc4,0x42,0x2e]
+//CHECK-NEXT: 	fcmla	v0.8h, v1.8h, v2.8h, #0 // encoding: [0x20,0xc4,0x42,0x6e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #0 // encoding: [0x20,0xc4,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.4s, #0 // encoding: [0x20,0xc4,0x82,0x6e]
+//CHECK-NEXT: 	fcmla	v0.2d, v1.2d, v2.2d, #0 // encoding: [0x20,0xc4,0xc2,0x6e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #0 // encoding: [0x20,0xc4,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xcc,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #180 // encoding: [0x20,0xd4,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #270 // encoding: [0x20,0xdc,0x82,0x2e]
+//CHECK-NEXT: 	fcadd	v0.4h, v1.4h, v2.4h, #90 // encoding: [0x20,0xe4,0x42,0x2e]
+//CHECK-NEXT: 	fcadd	v0.8h, v1.8h, v2.8h, #90 // encoding: [0x20,0xe4,0x42,0x6e]
+//CHECK-NEXT: 	fcadd	v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xe4,0x82,0x2e]
+//CHECK-NEXT: 	fcadd	v0.4s, v1.4s, v2.4s, #90 // encoding: [0x20,0xe4,0x82,0x6e]
+//CHECK-NEXT: 	fcadd	v0.2d, v1.2d, v2.2d, #90 // encoding: [0x20,0xe4,0xc2,0x6e]
+//CHECK-NEXT: 	fcadd	v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xe4,0x82,0x2e]
+//CHECK-NEXT: 	fcadd	v0.2s, v1.2s, v2.2s, #270 // encoding: [0x20,0xf4,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.4h, v1.4h, v2.h[0], #0 // encoding: [0x20,0x10,0x42,0x2f]
+//CHECK-NEXT: 	fcmla	v0.8h, v1.8h, v2.h[0], #0 // encoding: [0x20,0x10,0x42,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[0], #0 // encoding: [0x20,0x10,0x82,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[0], #90 // encoding: [0x20,0x30,0x82,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[0], #180 // encoding: [0x20,0x50,0x82,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[0], #270 // encoding: [0x20,0x70,0x82,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4h, v1.4h, v2.h[1], #0 // encoding: [0x20,0x10,0x62,0x2f]
+//CHECK-NEXT: 	fcmla	v0.8h, v1.8h, v2.h[3], #0 // encoding: [0x20,0x18,0x62,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[1], #0 // encoding: [0x20,0x18,0x82,0x6f]
 
-
-// ==== FCMLA vector ====
-// Types
-  fcmla v0.4h, v1.4h, v2.4h, #0
-// FP16: fcmla   v0.4h, v1.4h, v2.4h, #0 // encoding: [0x20,0xc4,0x42,0x2e]
-// STDERR-NO-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: fullfp16
-// NO-V83A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.8h, v1.8h, v2.8h, #0
-// FP16: fcmla   v0.8h, v1.8h, v2.8h, #0 // encoding: [0x20,0xc4,0x42,0x6e]
-// STDERR-NO-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: fullfp16
-// NO-V83A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.2s, v1.2s, v2.2s, #0
-// CHECK: fcmla   v0.2s, v1.2s, v2.2s, #0 // encoding: [0x20,0xc4,0x82,0x2e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.4s, v1.4s, v2.4s, #0
-// CHECK: fcmla   v0.4s, v1.4s, v2.4s, #0 // encoding: [0x20,0xc4,0x82,0x6e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.2d, v1.2d, v2.2d, #0
-// CHECK: fcmla   v0.2d, v1.2d, v2.2d, #0 // encoding: [0x20,0xc4,0xc2,0x6e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-
-// Rotations
-  fcmla v0.2s, v1.2s, v2.2s, #0
-// CHECK: fcmla   v0.2s, v1.2s, v2.2s, #0 // encoding: [0x20,0xc4,0x82,0x2e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.2s, v1.2s, v2.2s, #90
-// CHECK: fcmla   v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xcc,0x82,0x2e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.2s, v1.2s, v2.2s, #180
-// CHECK: fcmla   v0.2s, v1.2s, v2.2s, #180 // encoding: [0x20,0xd4,0x82,0x2e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.2s, v1.2s, v2.2s, #270
-// CHECK: fcmla   v0.2s, v1.2s, v2.2s, #270 // encoding: [0x20,0xdc,0x82,0x2e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-
-// Invalid rotations
-  fcmla v0.2s, v1.2s, v2.2s, #1
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 0, 90, 180 or 270.
-  fcmla v0.2s, v1.2s, v2.2s, #360
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 0, 90, 180 or 270.
-  fcmla v0.2s, v1.2s, v2.2s, #-90
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 0, 90, 180 or 270.
-
-// ==== FCADD vector ====
-// Types
-  fcadd v0.4h, v1.4h, v2.4h, #90
-// FP16: fcadd   v0.4h, v1.4h, v2.4h, #90 // encoding: [0x20,0xe4,0x42,0x2e]
-// STDERR-NO-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: fullfp16
-// NO-V83A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcadd v0.8h, v1.8h, v2.8h, #90
-// FP16: fcadd   v0.8h, v1.8h, v2.8h, #90 // encoding: [0x20,0xe4,0x42,0x6e]
-// STDERR-NO-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: fullfp16
-// NO-V83A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcadd v0.2s, v1.2s, v2.2s, #90
-// CHECK: fcadd   v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xe4,0x82,0x2e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcadd v0.4s, v1.4s, v2.4s, #90
-// CHECK: fcadd   v0.4s, v1.4s, v2.4s, #90 // encoding: [0x20,0xe4,0x82,0x6e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcadd v0.2d, v1.2d, v2.2d, #90
-// CHECK: fcadd   v0.2d, v1.2d, v2.2d, #90 // encoding: [0x20,0xe4,0xc2,0x6e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-
-// Rotations
-  fcadd v0.2s, v1.2s, v2.2s, #90
-// CHECK: fcadd   v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xe4,0x82,0x2e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcadd v0.2s, v1.2s, v2.2s, #270
-// CHECK: fcadd   v0.2s, v1.2s, v2.2s, #270 // encoding: [0x20,0xf4,0x82,0x2e]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-
-// Invalid rotations
-  fcadd v0.2s, v1.2s, v2.2s, #1
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 90 or 270.
-  fcadd v0.2s, v1.2s, v2.2s, #360
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 90 or 270.
-  fcadd v0.2s, v1.2s, v2.2s, #-90
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 90 or 270.
-  fcadd v0.2s, v1.2s, v2.2s, #0
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 90 or 270.
-  fcadd v0.2s, v1.2s, v2.2s, #180
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 90 or 270.
-
-// ==== FCMLA indexed ====
-// Types
-  fcmla v0.4h, v1.4h, v2.h[0], #0
-// FP16: fcmla   v0.4h, v1.4h, v2.h[0], #0 // encoding: [0x20,0x10,0x42,0x2f]
-// STDERR-NO-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: fullfp16
-// NO-V83A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.8h, v1.8h, v2.h[0], #0
-// FP16: fcmla   v0.8h, v1.8h, v2.h[0], #0 // encoding: [0x20,0x10,0x42,0x6f]
-// STDERR-NO-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: fullfp16
-// NO-V83A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.2s, v1.2s, v2.s[0], #0
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: invalid operand for instruction
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: invalid operand for instruction
-  fcmla v0.4s, v1.4s, v2.s[0], #0
-// CHECK: fcmla   v0.4s, v1.4s, v2.s[0], #0 // encoding: [0x20,0x10,0x82,0x6f]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.2d, v1.2d, v2.d[0], #0
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: invalid operand for instruction
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: invalid operand for instruction
-
-// Rotations
-  fcmla v0.4s, v1.4s, v2.s[0], #90
-// CHECK: fcmla   v0.4s, v1.4s, v2.s[0], #90 // encoding: [0x20,0x30,0x82,0x6f]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.4s, v1.4s, v2.s[0], #180
-// CHECK: fcmla   v0.4s, v1.4s, v2.s[0], #180 // encoding: [0x20,0x50,0x82,0x6f]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.4s, v1.4s, v2.s[0], #270
-// CHECK: fcmla   v0.4s, v1.4s, v2.s[0], #270 // encoding: [0x20,0x70,0x82,0x6f]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-
-// Valid indices
-  fcmla v0.4h, v1.4h, v2.h[1], #0
-// FP16: fcmla   v0.4h, v1.4h, v2.h[1], #0 // encoding: [0x20,0x10,0x62,0x2f]
-// STDERR-NO-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: fullfp16
-// NO-V83A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.8h, v1.8h, v2.h[3], #0
-// FP16: fcmla   v0.8h, v1.8h, v2.h[3], #0 // encoding: [0x20,0x18,0x62,0x6f]
-// STDERR-NO-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: fullfp16
-// NO-V83A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-  fcmla v0.4s, v1.4s, v2.s[1], #0
-// CHECK: fcmla   v0.4s, v1.4s, v2.s[1], #0 // encoding: [0x20,0x18,0x82,0x6f]
-// NO-V83A: :[[@LINE-2]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-
-// Invalid indices
-  fcmla v0.4h, v1.4h, v2.h[2], #0
-// STDERR-NO-FP16: :[[@LINE-1]]:{{[0-9]*}}: error: invalid operand for instruction
-// STDERR-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: vector lane must be an integer in range [0, 1].
-  fcmla v0.8h, v1.8h, v2.h[4], #0
-// STDERR-NO-FP16: :[[@LINE-1]]:{{[0-9]*}}: error: invalid operand for instruction
-// STDERR-FP16: :[[@LINE-2]]:{{[0-9]*}}: error: vector lane must be an integer in range [0, 3].
-  fcmla v0.4s, v1.4s, v2.s[2], #0
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: vector lane must be an integer in range [0, 1].
-
-// Invalid rotations
-  fcmla v0.4s, v1.4s, v2.s[0], #1
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 0, 90, 180 or 270.
-  fcmla v0.4s, v1.4s, v2.s[0], #360
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 0, 90, 180 or 270.
-  fcmla v0.4s, v1.4s, v2.s[0], #-90
-// STDERR: :[[@LINE-1]]:{{[0-9]*}}: error: complex rotation must be 0, 90, 180 or 270.
diff --git a/test/MC/AArch64/armv8.3a-complex_bad.s b/test/MC/AArch64/armv8.3a-complex_bad.s
new file mode 100644
index 0000000000000000000000000000000000000000..cdc023445ca077e4bbf9c4fd3b3e5ead6c52ea6a
--- /dev/null
+++ b/test/MC/AArch64/armv8.3a-complex_bad.s
@@ -0,0 +1,58 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+complxnum,+fullfp16 -o - %s 2>&1 | FileCheck %s
+fcmla v0.2s, v1.2s, v2.2s, #1
+fcmla v0.2s, v1.2s, v2.2s, #360
+fcmla v0.2s, v1.2s, v2.2s, #-90
+fcadd v0.2s, v1.2s, v2.2s, #1
+fcadd v0.2s, v1.2s, v2.2s, #360
+fcadd v0.2s, v1.2s, v2.2s, #-90
+fcadd v0.2s, v1.2s, v2.2s, #0
+fcadd v0.2s, v1.2s, v2.2s, #180
+fcmla v0.4h, v1.4h, v2.h[2], #0
+fcmla v0.8h, v1.8h, v2.h[4], #0
+fcmla v0.4s, v1.4s, v2.s[2], #0
+fcmla v0.4s, v1.4s, v2.s[0], #1
+fcmla v0.4s, v1.4s, v2.s[0], #360
+fcmla v0.4s, v1.4s, v2.s[0], #-90
+//CHECK: {{.*}}error: complex rotation must be 0, 90, 180 or 270.
+//CHECK-NEXT: fcmla v0.2s, v1.2s, v2.2s, #1
+//CHECK-NEXT:                            ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 0, 90, 180 or 270.
+//CHECK-NEXT: fcmla v0.2s, v1.2s, v2.2s, #360
+//CHECK-NEXT:                            ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 0, 90, 180 or 270.
+//CHECK-NEXT: fcmla v0.2s, v1.2s, v2.2s, #-90
+//CHECK-NEXT:                            ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 90 or 270.
+//CHECK-NEXT: fcadd v0.2s, v1.2s, v2.2s, #1
+//CHECK-NEXT:                            ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 90 or 270.
+//CHECK-NEXT: fcadd v0.2s, v1.2s, v2.2s, #360
+//CHECK-NEXT:                            ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 90 or 270.
+//CHECK-NEXT: fcadd v0.2s, v1.2s, v2.2s, #-90
+//CHECK-NEXT:                            ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 90 or 270.
+//CHECK-NEXT: fcadd v0.2s, v1.2s, v2.2s, #0
+//CHECK-NEXT:                            ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 90 or 270.
+//CHECK-NEXT: fcadd v0.2s, v1.2s, v2.2s, #180
+//CHECK-NEXT:                            ^
+//CHECK-NEXT: {{.*}}error: vector lane must be an integer in range [0, 1].
+//CHECK-NEXT: fcmla v0.4h, v1.4h, v2.h[2], #0
+//CHECK-NEXT:                         ^
+//CHECK-NEXT: {{.*}}error: vector lane must be an integer in range [0, 3].
+//CHECK-NEXT: fcmla v0.8h, v1.8h, v2.h[4], #0
+//CHECK-NEXT:                         ^
+//CHECK-NEXT: {{.*}}error: vector lane must be an integer in range [0, 1].
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[2], #0
+//CHECK-NEXT:                         ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 0, 90, 180 or 270.
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[0], #1
+//CHECK-NEXT:                              ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 0, 90, 180 or 270.
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[0], #360
+//CHECK-NEXT:                              ^
+//CHECK-NEXT: {{.*}}error: complex rotation must be 0, 90, 180 or 270.
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[0], #-90
+//CHECK-NEXT:                              ^
+
diff --git a/test/MC/AArch64/armv8.3a-complex_missing.s b/test/MC/AArch64/armv8.3a-complex_missing.s
new file mode 100644
index 0000000000000000000000000000000000000000..75008106575e91d1adf92daef4313783be8e2c93
--- /dev/null
+++ b/test/MC/AArch64/armv8.3a-complex_missing.s
@@ -0,0 +1,103 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.3a,-complxnum,+fullfp16 -o - %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+fullfp16 -o - %s 2>&1 | FileCheck %s
+fcmla v0.4h, v1.4h, v2.4h, #0
+fcmla v0.8h, v1.8h, v2.8h, #0
+fcmla v0.2s, v1.2s, v2.2s, #0
+fcmla v0.4s, v1.4s, v2.4s, #0
+fcmla v0.2d, v1.2d, v2.2d, #0
+fcmla v0.2s, v1.2s, v2.2s, #0
+fcmla v0.2s, v1.2s, v2.2s, #90
+fcmla v0.2s, v1.2s, v2.2s, #180
+fcmla v0.2s, v1.2s, v2.2s, #270
+fcadd v0.4h, v1.4h, v2.4h, #90
+fcadd v0.8h, v1.8h, v2.8h, #90
+fcadd v0.2s, v1.2s, v2.2s, #90
+fcadd v0.4s, v1.4s, v2.4s, #90
+fcadd v0.2d, v1.2d, v2.2d, #90
+fcadd v0.2s, v1.2s, v2.2s, #90
+fcadd v0.2s, v1.2s, v2.2s, #270
+fcmla v0.4h, v1.4h, v2.h[0], #0
+fcmla v0.8h, v1.8h, v2.h[0], #0
+fcmla v0.4s, v1.4s, v2.s[0], #0
+fcmla v0.4s, v1.4s, v2.s[0], #90
+fcmla v0.4s, v1.4s, v2.s[0], #180
+fcmla v0.4s, v1.4s, v2.s[0], #270
+fcmla v0.4h, v1.4h, v2.h[1], #0
+fcmla v0.8h, v1.8h, v2.h[3], #0
+fcmla v0.4s, v1.4s, v2.s[1], #0
+//CHECK: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4h, v1.4h, v2.4h, #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.8h, v1.8h, v2.8h, #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.2s, v1.2s, v2.2s, #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.4s, #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.2d, v1.2d, v2.2d, #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.2s, v1.2s, v2.2s, #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.2s, v1.2s, v2.2s, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.2s, v1.2s, v2.2s, #180
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.2s, v1.2s, v2.2s, #270
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcadd v0.4h, v1.4h, v2.4h, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcadd v0.8h, v1.8h, v2.8h, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcadd v0.2s, v1.2s, v2.2s, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcadd v0.4s, v1.4s, v2.4s, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcadd v0.2d, v1.2d, v2.2d, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcadd v0.2s, v1.2s, v2.2s, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcadd v0.2s, v1.2s, v2.2s, #270
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4h, v1.4h, v2.h[0], #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.8h, v1.8h, v2.h[0], #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[0], #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[0], #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[0], #180
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[0], #270
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4h, v1.4h, v2.h[1], #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.8h, v1.8h, v2.h[3], #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}} error: instruction requires: complxnum
+//CHECK-NEXT: fcmla v0.4s, v1.4s, v2.s[1], #0
+//CHECK-NEXT: ^
+
diff --git a/test/MC/AArch64/armv8.3a-complex_nofp16.s b/test/MC/AArch64/armv8.3a-complex_nofp16.s
new file mode 100644
index 0000000000000000000000000000000000000000..63d6d3c52c97d0ae257ea2763f1f264c46c6205e
--- /dev/null
+++ b/test/MC/AArch64/armv8.3a-complex_nofp16.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+complxnum -o - %s 2>%t | FileCheck %s
+fcmla v0.2s, v1.2s, v2.2s, #0
+fcmla v0.4s, v1.4s, v2.4s, #0
+fcmla v0.2d, v1.2d, v2.2d, #0
+fcmla v0.2s, v1.2s, v2.2s, #0
+fcmla v0.2s, v1.2s, v2.2s, #90
+fcmla v0.2s, v1.2s, v2.2s, #180
+fcmla v0.2s, v1.2s, v2.2s, #270
+fcadd v0.2s, v1.2s, v2.2s, #90
+fcadd v0.4s, v1.4s, v2.4s, #90
+fcadd v0.2d, v1.2d, v2.2d, #90
+fcadd v0.2s, v1.2s, v2.2s, #90
+fcadd v0.2s, v1.2s, v2.2s, #270
+fcmla v0.4s, v1.4s, v2.s[0], #0
+fcmla v0.4s, v1.4s, v2.s[0], #90
+fcmla v0.4s, v1.4s, v2.s[0], #180
+fcmla v0.4s, v1.4s, v2.s[0], #270
+fcmla v0.4s, v1.4s, v2.s[1], #0
+//CHECK: 	.text
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #0 // encoding: [0x20,0xc4,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.4s, #0 // encoding: [0x20,0xc4,0x82,0x6e]
+//CHECK-NEXT: 	fcmla	v0.2d, v1.2d, v2.2d, #0 // encoding: [0x20,0xc4,0xc2,0x6e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #0 // encoding: [0x20,0xc4,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xcc,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #180 // encoding: [0x20,0xd4,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.2s, v1.2s, v2.2s, #270 // encoding: [0x20,0xdc,0x82,0x2e]
+//CHECK-NEXT: 	fcadd	v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xe4,0x82,0x2e]
+//CHECK-NEXT: 	fcadd	v0.4s, v1.4s, v2.4s, #90 // encoding: [0x20,0xe4,0x82,0x6e]
+//CHECK-NEXT: 	fcadd	v0.2d, v1.2d, v2.2d, #90 // encoding: [0x20,0xe4,0xc2,0x6e]
+//CHECK-NEXT: 	fcadd	v0.2s, v1.2s, v2.2s, #90 // encoding: [0x20,0xe4,0x82,0x2e]
+//CHECK-NEXT: 	fcadd	v0.2s, v1.2s, v2.2s, #270 // encoding: [0x20,0xf4,0x82,0x2e]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[0], #0 // encoding: [0x20,0x10,0x82,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[0], #90 // encoding: [0x20,0x30,0x82,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[0], #180 // encoding: [0x20,0x50,0x82,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[0], #270 // encoding: [0x20,0x70,0x82,0x6f]
+//CHECK-NEXT: 	fcmla	v0.4s, v1.4s, v2.s[1], #0 // encoding: [0x20,0x18,0x82,0x6f]
+
diff --git a/test/MC/AArch64/armv8.3a-complex_nofp16_bad.s b/test/MC/AArch64/armv8.3a-complex_nofp16_bad.s
new file mode 100644
index 0000000000000000000000000000000000000000..2d4679805549f14712a5335e686aef9232f3af3a
--- /dev/null
+++ b/test/MC/AArch64/armv8.3a-complex_nofp16_bad.s
@@ -0,0 +1,34 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+complxnum -o - %s 2>&1 | FileCheck %s
+fcmla v0.4h, v1.4h, v2.4h, #0
+fcmla v0.8h, v1.8h, v2.8h, #0
+fcadd v0.4h, v1.4h, v2.4h, #90
+fcadd v0.8h, v1.8h, v2.8h, #90
+fcmla v0.4h, v1.4h, v2.h[0], #0
+fcmla v0.8h, v1.8h, v2.h[0], #0
+fcmla v0.4h, v1.4h, v2.h[1], #0
+fcmla v0.8h, v1.8h, v2.h[3], #0
+//CHECK: {{.*}}error: instruction requires: fullfp16
+//CHECK-NEXT: fcmla v0.4h, v1.4h, v2.4h, #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}}error: instruction requires: fullfp16
+//CHECK-NEXT: fcmla v0.8h, v1.8h, v2.8h, #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}}error: instruction requires: fullfp16
+//CHECK-NEXT: fcadd v0.4h, v1.4h, v2.4h, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}}error: instruction requires: fullfp16
+//CHECK-NEXT: fcadd v0.8h, v1.8h, v2.8h, #90
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}}error: instruction requires: fullfp16
+//CHECK-NEXT: fcmla v0.4h, v1.4h, v2.h[0], #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}}error: instruction requires: fullfp16
+//CHECK-NEXT: fcmla v0.8h, v1.8h, v2.h[0], #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}}error: instruction requires: fullfp16
+//CHECK-NEXT: fcmla v0.4h, v1.4h, v2.h[1], #0
+//CHECK-NEXT: ^
+//CHECK-NEXT: {{.*}}error: instruction requires: fullfp16
+//CHECK-NEXT: fcmla v0.8h, v1.8h, v2.h[3], #0
+//CHECK-NEXT: ^
+
diff --git a/test/MC/AArch64/armv8.3a-js.s b/test/MC/AArch64/armv8.3a-js.s
index 23572890338d9d1ce2acf60dd244fe1ed841638a..e3e4f9121acf984348974050cd03ae23d6213d6d 100644
--- a/test/MC/AArch64/armv8.3a-js.s
+++ b/test/MC/AArch64/armv8.3a-js.s
@@ -1,10 +1,20 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.3a < %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2> %t
-// RUN: FileCheck --check-prefix=CHECK-REQ < %t %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.3a,-fp-armv8 < %s 2> %t
-// RUN: FileCheck --check-prefix=CHECK-NOFP < %t %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.3a -o - %s 2>&1 | \
+// RUN: FileCheck %s
+
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+jsconv -o - %s 2>&1 | \
+// RUN: FileCheck %s
+
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu %s 2>&1 | \
+// RUN: FileCheck --check-prefix=CHECK-JS %s
+
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+jsconv,-fp-armv8 -o - %s 2>&1 |\
+// RUN: FileCheck --check-prefix=CHECK-REQ %s
 
   fjcvtzs w0, d0
 // CHECK: fjcvtzs w0, d0    // encoding: [0x00,0x00,0x7e,0x1e]
-// CHECK-REQ: error: instruction requires: armv8.3a
-// CHECK-NOFP: error: instruction requires: fp-armv8
+
+// CHECK-JS: error: instruction requires: jsconv
+
+// NOJS: error: instruction requires: jsconv
+
+// CHECK-REQ: error: instruction requires: fp-armv8 jsconv
diff --git a/test/MC/AArch64/armv8.3a-signed-pointer.s b/test/MC/AArch64/armv8.3a-signed-pointer.s
index 156ab08e36ac344751284e79ba5e6d1fcbd5435b..fe34002680e3c3ed4bf77d3dbcc47c21dca375a9 100644
--- a/test/MC/AArch64/armv8.3a-signed-pointer.s
+++ b/test/MC/AArch64/armv8.3a-signed-pointer.s
@@ -1,7 +1,11 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.3a < %s 2> %t | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2> %t
-// RUN: FileCheck --check-prefix=CHECK-REQ %s < %t
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.3a -o - %s 2>&1 | \
+// RUN: FileCheck --check-prefixes=CHECK,ALL %s
 
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu %s -o - > %t.1 2>%t.2
+// RUN: FileCheck --check-prefixes=ALL,NOENC %s < %t.1
+// RUN: FileCheck --check-prefix=CHECK-REQ %s < %t.2
+
+// ALL: .text
   mrs x0, apiakeylo_el1
   mrs x0, apiakeyhi_el1
   mrs x0, apibkeylo_el1
@@ -12,28 +16,39 @@
   mrs x0, apdbkeyhi_el1
   mrs x0, apgakeylo_el1
   mrs x0, apgakeyhi_el1
-
-// CHECK: mrs x0, APIAKeyLo_EL1     // encoding: [0x00,0x21,0x38,0xd5]
-// CHECK: mrs x0, APIAKeyHi_EL1     // encoding: [0x20,0x21,0x38,0xd5]
-// CHECK: mrs x0, APIBKeyLo_EL1     // encoding: [0x40,0x21,0x38,0xd5]
-// CHECK: mrs x0, APIBKeyHi_EL1     // encoding: [0x60,0x21,0x38,0xd5]
-// CHECK: mrs x0, APDAKeyLo_EL1     // encoding: [0x00,0x22,0x38,0xd5]
-// CHECK: mrs x0, APDAKeyHi_EL1     // encoding: [0x20,0x22,0x38,0xd5]
-// CHECK: mrs x0, APDBKeyLo_EL1     // encoding: [0x40,0x22,0x38,0xd5]
-// CHECK: mrs x0, APDBKeyHi_EL1     // encoding: [0x60,0x22,0x38,0xd5]
-// CHECK: mrs x0, APGAKeyLo_EL1     // encoding: [0x00,0x23,0x38,0xd5]
-// CHECK: mrs x0, APGAKeyHi_EL1     // encoding: [0x20,0x23,0x38,0xd5]
+// ALL-EMPTY:
+// ALL-EMPTY:
+// CHECK-NEXT: mrs x0, APIAKeyLo_EL1     // encoding: [0x00,0x21,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APIAKeyHi_EL1     // encoding: [0x20,0x21,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APIBKeyLo_EL1     // encoding: [0x40,0x21,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APIBKeyHi_EL1     // encoding: [0x60,0x21,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APDAKeyLo_EL1     // encoding: [0x00,0x22,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APDAKeyHi_EL1     // encoding: [0x20,0x22,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APDBKeyLo_EL1     // encoding: [0x40,0x22,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APDBKeyHi_EL1     // encoding: [0x60,0x22,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APGAKeyLo_EL1     // encoding: [0x00,0x23,0x38,0xd5]
+// CHECK-NEXT: mrs x0, APGAKeyHi_EL1     // encoding: [0x20,0x23,0x38,0xd5]
 
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apiakeylo_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apiakeyhi_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apibkeylo_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apibkeyhi_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apdakeylo_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apdakeyhi_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apdbkeylo_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apdbkeyhi_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apgakeylo_el1
 // CHECK-REQ: error: expected readable system register
+// CHECK-REQ-NEXT: mrs x0, apgakeyhi_el1
 
   msr apiakeylo_el1, x0
   msr apiakeyhi_el1, x0
@@ -45,191 +60,249 @@
   msr apdbkeyhi_el1, x0
   msr apgakeylo_el1, x0
   msr apgakeyhi_el1, x0
-
-// CHECK: msr APIAKeyLo_EL1, x0     // encoding: [0x00,0x21,0x18,0xd5]
-// CHECK: msr APIAKeyHi_EL1, x0     // encoding: [0x20,0x21,0x18,0xd5]
-// CHECK: msr APIBKeyLo_EL1, x0     // encoding: [0x40,0x21,0x18,0xd5]
-// CHECK: msr APIBKeyHi_EL1, x0     // encoding: [0x60,0x21,0x18,0xd5]
-// CHECK: msr APDAKeyLo_EL1, x0     // encoding: [0x00,0x22,0x18,0xd5]
-// CHECK: msr APDAKeyHi_EL1, x0     // encoding: [0x20,0x22,0x18,0xd5]
-// CHECK: msr APDBKeyLo_EL1, x0     // encoding: [0x40,0x22,0x18,0xd5]
-// CHECK: msr APDBKeyHi_EL1, x0     // encoding: [0x60,0x22,0x18,0xd5]
-// CHECK: msr APGAKeyLo_EL1, x0     // encoding: [0x00,0x23,0x18,0xd5]
-// CHECK: msr APGAKeyHi_EL1, x0     // encoding: [0x20,0x23,0x18,0xd5]
+// ALL-EMPTY:
+// ALL-EMPTY:
+// CHECK-NEXT: msr APIAKeyLo_EL1, x0     // encoding: [0x00,0x21,0x18,0xd5]
+// CHECK-NEXT: msr APIAKeyHi_EL1, x0     // encoding: [0x20,0x21,0x18,0xd5]
+// CHECK-NEXT: msr APIBKeyLo_EL1, x0     // encoding: [0x40,0x21,0x18,0xd5]
+// CHECK-NEXT: msr APIBKeyHi_EL1, x0     // encoding: [0x60,0x21,0x18,0xd5]
+// CHECK-NEXT: msr APDAKeyLo_EL1, x0     // encoding: [0x00,0x22,0x18,0xd5]
+// CHECK-NEXT: msr APDAKeyHi_EL1, x0     // encoding: [0x20,0x22,0x18,0xd5]
+// CHECK-NEXT: msr APDBKeyLo_EL1, x0     // encoding: [0x40,0x22,0x18,0xd5]
+// CHECK-NEXT: msr APDBKeyHi_EL1, x0     // encoding: [0x60,0x22,0x18,0xd5]
+// CHECK-NEXT: msr APGAKeyLo_EL1, x0     // encoding: [0x00,0x23,0x18,0xd5]
+// CHECK-NEXT: msr APGAKeyHi_EL1, x0     // encoding: [0x20,0x23,0x18,0xd5]
 
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apiakeylo_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apiakeyhi_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apibkeylo_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apibkeyhi_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apdakeylo_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apdakeyhi_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apdbkeylo_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apdbkeyhi_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apgakeylo_el1, x0
 // CHECK-REQ: error: expected writable system register or pstate
+// CHECK-REQ-NEXT:  msr apgakeyhi_el1, x0
 
+// ALL-EMPTY:
+// ALL-EMPTY:
   paciasp
-// CHECK: paciasp        // encoding: [0x3f,0x23,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: paciasp        // encoding: [0x3f,0x23,0x03,0xd5]
+// NOENC-NEXT: paciasp
   autiasp
-// CHECK: autiasp        // encoding: [0xbf,0x23,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: autiasp        // encoding: [0xbf,0x23,0x03,0xd5]
+// NOENC-NEXT: autiasp
   paciaz
-// CHECK: paciaz         // encoding: [0x1f,0x23,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: paciaz         // encoding: [0x1f,0x23,0x03,0xd5]
+// NOENC-NEXT: paciaz
   autiaz
-// CHECK: autiaz         // encoding: [0x9f,0x23,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: autiaz         // encoding: [0x9f,0x23,0x03,0xd5]
+// NOENC-NEXT: autiaz
   pacia1716
-// CHECK: pacia1716      // encoding: [0x1f,0x21,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacia1716      // encoding: [0x1f,0x21,0x03,0xd5]
+// NOENC-NEXT: pacia1716
   autia1716
-// CHECK: autia1716      // encoding: [0x9f,0x21,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: autia1716      // encoding: [0x9f,0x21,0x03,0xd5]
+// NOENC-NEXT: autia1716
   pacibsp
-// CHECK: pacibsp        // encoding: [0x7f,0x23,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacibsp        // encoding: [0x7f,0x23,0x03,0xd5]
+// NOENC-NEXT: pacibsp
   autibsp
-// CHECK: autibsp        // encoding: [0xff,0x23,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: autibsp        // encoding: [0xff,0x23,0x03,0xd5]
+// NOENC-NEXT: autibsp
   pacibz
-// CHECK: pacibz         // encoding: [0x5f,0x23,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacibz         // encoding: [0x5f,0x23,0x03,0xd5]
+// NOENC-NEXT: pacibz
   autibz
-// CHECK: autibz         // encoding: [0xdf,0x23,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: autibz         // encoding: [0xdf,0x23,0x03,0xd5]
+// NOENC-NEXT: autibz
   pacib1716
-// CHECK: pacib1716      // encoding: [0x5f,0x21,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacib1716      // encoding: [0x5f,0x21,0x03,0xd5]
+// NOENC-NEXT: pacib1716
   autib1716
-// CHECK: autib1716      // encoding: [0xdf,0x21,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: autib1716      // encoding: [0xdf,0x21,0x03,0xd5]
+// NOENC-NEXT: autib1716
   xpaclri
-// CHECK: xpaclri           // encoding: [0xff,0x20,0x03,0xd5]
-// CHECK-REQ-NOT: error: instruction requires: armv8.3a
+// CHECK-NEXT: xpaclri           // encoding: [0xff,0x20,0x03,0xd5]
+// NOENC-NEXT: xpaclri
 
+// ALL-EMPTY:
   pacia x0, x1
-// CHECK: pacia x0, x1     // encoding: [0x20,0x00,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacia x0, x1     // encoding: [0x20,0x00,0xc1,0xda]
+// CHECK-REQ-NEXT:      ^
+// CHECK-REQ-NEXT: error: instruction requires: pa
+// CHECK-REQ-NEXT: pacia x0, x1
   autia x0, x1
-// CHECK: autia x0, x1     // encoding: [0x20,0x10,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: autia x0, x1     // encoding: [0x20,0x10,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT: autia x0, x1
   pacda x0, x1
-// CHECK: pacda x0, x1     // encoding: [0x20,0x08,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacda x0, x1     // encoding: [0x20,0x08,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  pacda x0, x1
   autda x0, x1
-// CHECK: autda x0, x1     // encoding: [0x20,0x18,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: autda x0, x1     // encoding: [0x20,0x18,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  autda x0, x1
   pacib x0, x1
-// CHECK: pacib x0, x1     // encoding: [0x20,0x04,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacib x0, x1     // encoding: [0x20,0x04,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  pacib x0, x1
   autib x0, x1
-// CHECK: autib x0, x1     // encoding: [0x20,0x14,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: autib x0, x1     // encoding: [0x20,0x14,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  autib x0, x1
   pacdb x0, x1
-// CHECK: pacdb x0, x1     // encoding: [0x20,0x0c,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacdb x0, x1     // encoding: [0x20,0x0c,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  pacdb x0, x1
   autdb x0, x1
-// CHECK: autdb x0, x1     // encoding: [0x20,0x1c,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: autdb x0, x1     // encoding: [0x20,0x1c,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  autdb x0, x1
   pacga x0, x1, x2
-// CHECK: pacga x0, x1, x2  // encoding: [0x20,0x30,0xc2,0x9a]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacga x0, x1, x2  // encoding: [0x20,0x30,0xc2,0x9a]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  pacga x0, x1, x2
   paciza x0
-// CHECK: paciza x0         // encoding: [0xe0,0x23,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: paciza x0         // encoding: [0xe0,0x23,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  paciza x0
   autiza x0
-// CHECK: autiza x0         // encoding: [0xe0,0x33,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: autiza x0         // encoding: [0xe0,0x33,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  autiza x0
   pacdza x0
-// CHECK: pacdza x0         // encoding: [0xe0,0x2b,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacdza x0         // encoding: [0xe0,0x2b,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  pacdza x0
   autdza x0
-// CHECK: autdza x0         // encoding: [0xe0,0x3b,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: autdza x0         // encoding: [0xe0,0x3b,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  autdza x0
   pacizb x0
-// CHECK: pacizb x0         // encoding: [0xe0,0x27,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacizb x0         // encoding: [0xe0,0x27,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  pacizb x0
   autizb x0
-// CHECK: autizb x0         // encoding: [0xe0,0x37,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: autizb x0         // encoding: [0xe0,0x37,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  autizb x0
   pacdzb x0
-// CHECK: pacdzb x0         // encoding: [0xe0,0x2f,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: pacdzb x0         // encoding: [0xe0,0x2f,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  pacdzb x0
   autdzb x0
-// CHECK: autdzb x0         // encoding: [0xe0,0x3f,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: autdzb x0         // encoding: [0xe0,0x3f,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  autdzb x0
   xpaci x0
-// CHECK: xpaci x0          // encoding: [0xe0,0x43,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: xpaci x0          // encoding: [0xe0,0x43,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  xpaci x0
   xpacd x0
-// CHECK: xpacd x0          // encoding: [0xe0,0x47,0xc1,0xda]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: xpacd x0          // encoding: [0xe0,0x47,0xc1,0xda]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  xpacd x0
 
   braa x0, x1
-// CHECK: braa x0, x1       // encoding: [0x01,0x08,0x1f,0xd7]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-EMPTY:
+// CHECK-NEXT: braa x0, x1       // encoding: [0x01,0x08,0x1f,0xd7]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  braa x0, x1
   brab x0, x1
-// CHECK: brab x0, x1       // encoding: [0x01,0x0c,0x1f,0xd7]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: brab x0, x1       // encoding: [0x01,0x0c,0x1f,0xd7]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  brab x0, x1
   blraa x0, x1
-// CHECK: blraa x0, x1      // encoding: [0x01,0x08,0x3f,0xd7]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: blraa x0, x1      // encoding: [0x01,0x08,0x3f,0xd7]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  blraa x0, x1
   blrab x0, x1
-// CHECK: blrab x0, x1      // encoding: [0x01,0x0c,0x3f,0xd7]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: blrab x0, x1      // encoding: [0x01,0x0c,0x3f,0xd7]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  blrab x0, x1
 
   braaz x0
-// CHECK: braaz x0          // encoding: [0x1f,0x08,0x1f,0xd6]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-EMPTY:
+// CHECK-NEXT: braaz x0          // encoding: [0x1f,0x08,0x1f,0xd6]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  braaz x0
   brabz x0
-// CHECK: brabz x0          // encoding: [0x1f,0x0c,0x1f,0xd6]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: brabz x0          // encoding: [0x1f,0x0c,0x1f,0xd6]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  brabz x0
   blraaz x0
-// CHECK: blraaz x0         // encoding: [0x1f,0x08,0x3f,0xd6]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: blraaz x0         // encoding: [0x1f,0x08,0x3f,0xd6]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  blraaz x0
   blrabz x0
-// CHECK: blrabz x0         // encoding: [0x1f,0x0c,0x3f,0xd6]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: blrabz x0         // encoding: [0x1f,0x0c,0x3f,0xd6]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  blrabz x0
   retaa
-// CHECK: retaa             // encoding: [0xff,0x0b,0x5f,0xd6]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: retaa             // encoding: [0xff,0x0b,0x5f,0xd6]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  retaa
   retab
-// CHECK: retab             // encoding: [0xff,0x0f,0x5f,0xd6]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: retab             // encoding: [0xff,0x0f,0x5f,0xd6]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  retab
   eretaa
-// CHECK: eretaa            // encoding: [0xff,0x0b,0x9f,0xd6]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: eretaa            // encoding: [0xff,0x0b,0x9f,0xd6]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  eretaa
   eretab
-// CHECK: eretab            // encoding: [0xff,0x0f,0x9f,0xd6]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: eretab            // encoding: [0xff,0x0f,0x9f,0xd6]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  eretab
   ldraa x0, [x1, 4088]
-// CHECK: ldraa x0, [x1, #4088]  // encoding: [0x20,0xf4,0x3f,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldraa x0, [x1, #4088]  // encoding: [0x20,0xf4,0x3f,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldraa x0, [x1, 4088]
   ldraa x0, [x1, -4096]
-// CHECK: ldraa x0, [x1, #-4096] // encoding: [0x20,0x04,0x60,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldraa x0, [x1, #-4096] // encoding: [0x20,0x04,0x60,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldraa x0, [x1, -4096]
   ldrab x0, [x1, 4088]
-// CHECK: ldrab x0, [x1, #4088]  // encoding: [0x20,0xf4,0xbf,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldrab x0, [x1, #4088]  // encoding: [0x20,0xf4,0xbf,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldrab x0, [x1, 4088]
   ldrab x0, [x1, -4096]
-// CHECK: ldrab x0, [x1, #-4096] // encoding: [0x20,0x04,0xe0,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldrab x0, [x1, #-4096] // encoding: [0x20,0x04,0xe0,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldrab x0, [x1, -4096]
   ldraa x0, [x1, 4088]!
-// CHECK: ldraa x0, [x1, #4088]!  // encoding: [0x20,0xfc,0x3f,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldraa x0, [x1, #4088]!  // encoding: [0x20,0xfc,0x3f,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldraa x0, [x1, 4088]!
   ldraa x0, [x1, -4096]!
-// CHECK: ldraa x0, [x1, #-4096]! // encoding: [0x20,0x0c,0x60,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldraa x0, [x1, #-4096]! // encoding: [0x20,0x0c,0x60,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldraa x0, [x1, -4096]!
   ldrab x0, [x1, 4088]!
-// CHECK: ldrab x0, [x1, #4088]!  // encoding: [0x20,0xfc,0xbf,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldrab x0, [x1, #4088]!  // encoding: [0x20,0xfc,0xbf,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldrab x0, [x1, 4088]!
   ldrab x0, [x1, -4096]!
-// CHECK: ldrab x0, [x1, #-4096]! // encoding: [0x20,0x0c,0xe0,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldrab x0, [x1, #-4096]! // encoding: [0x20,0x0c,0xe0,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldrab x0, [x1, -4096]!
   ldraa x0, [x1]
-// CHECK: ldraa x0, [x1]  // encoding: [0x20,0x04,0x20,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldraa x0, [x1]  // encoding: [0x20,0x04,0x20,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldraa x0, [x1]
   ldrab x0, [x1]
-// CHECK: ldrab x0, [x1]  // encoding: [0x20,0x04,0xa0,0xf8]
-// CHECK-REQ: error: instruction requires: armv8.3a
+// CHECK-NEXT: ldrab x0, [x1]  // encoding: [0x20,0x04,0xa0,0xf8]
+// CHECK-REQ: error: instruction requires: pa
+// CHECK-REQ-NEXT:  ldrab x0, [x1]
diff --git a/test/MC/AArch64/armv8.4a-flag.s b/test/MC/AArch64/armv8.4a-flag.s
index 3c0d80bbedd513b720aa8c33017c0bc52a02cf3c..154ed425bb606a33cbb968a41eba6913c2ce4205 100644
--- a/test/MC/AArch64/armv8.4a-flag.s
+++ b/test/MC/AArch64/armv8.4a-flag.s
@@ -1,5 +1,14 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a < %s | FileCheck %s --check-prefix=CHECK
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a %s -o - | \
+// RUN: FileCheck %s
+
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+fmi %s -o - 2>&1 | \
+// RUN: FileCheck %s
+
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a %s -o - 2>&1 | \
+// RUN: FileCheck %s --check-prefix=ERROR
+
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a,-fmi %s -o - 2>&1 | \
+// RUN: FileCheck %s --check-prefix=ERROR
 
 //------------------------------------------------------------------------------
 // Armv8.4-A flag manipulation instructions
@@ -21,24 +30,24 @@
 //CHECK-NEXT: rmif x1, #63, #15            // encoding: [0x2f,0x84,0x1f,0xba]
 //CHECK-NEXT: rmif xzr, #63, #15           // encoding: [0xef,0x87,0x1f,0xba]
 
-//CHECK-ERROR:      error: instruction requires: armv8.4a
-//CHECK-ERROR-NEXT: cfinv
-//CHECK-ERROR-NEXT: ^
-//CHECK-ERROR-NEXT: error: instruction requires: armv8.4a
-//CHECK-ERROR-NEXT: setf8 w1
-//CHECK-ERROR-NEXT: ^
-//CHECK-ERROR-NEXT: error: instruction requires: armv8.4a
-//CHECK-ERROR-NEXT: setf8 wzr
-//CHECK-ERROR-NEXT: ^
-//CHECK-ERROR-NEXT: error: instruction requires: armv8.4a
-//CHECK-ERROR-NEXT: setf16 w1
-//CHECK-ERROR-NEXT: ^
-//CHECK-ERROR-NEXT: error: instruction requires: armv8.4a
-//CHECK-ERROR-NEXT: setf16 wzr
-//CHECK-ERROR-NEXT: ^
-//CHECK-ERROR-NEXT: error: instruction requires: armv8.4a
-//CHECK-ERROR-NEXT: rmif x1, #63, #15
-//CHECK-ERROR-NEXT: ^
-//CHECK-ERROR-NEXT: error: instruction requires: armv8.4a
-//CHECK-ERROR-NEXT: rmif xzr, #63, #15
-//CHECK-ERROR-NEXT: ^
+//ERROR:      error: instruction requires: fmi
+//ERROR-NEXT: cfinv
+//ERROR-NEXT: ^
+//ERROR-NEXT: error: instruction requires: fmi
+//ERROR-NEXT: setf8 w1
+//ERROR-NEXT: ^
+//ERROR-NEXT: error: instruction requires: fmi
+//ERROR-NEXT: setf8 wzr
+//ERROR-NEXT: ^
+//ERROR-NEXT: error: instruction requires: fmi
+//ERROR-NEXT: setf16 w1
+//ERROR-NEXT: ^
+//ERROR-NEXT: error: instruction requires: fmi
+//ERROR-NEXT: setf16 wzr
+//ERROR-NEXT: ^
+//ERROR-NEXT: error: instruction requires: fmi
+//ERROR-NEXT: rmif x1, #63, #15
+//ERROR-NEXT: ^
+//ERROR-NEXT: error: instruction requires: fmi
+//ERROR-NEXT: rmif xzr, #63, #15
+//ERROR-NEXT: ^
diff --git a/test/MC/AArch64/armv8.4a-ldst.s b/test/MC/AArch64/armv8.4a-ldst.s
index bf2a45ad7dfb7d6c5ab55213fe9d423b4bb1e197..8d05c6acc5fc601657ce98635c96beff5c5b80b3 100644
--- a/test/MC/AArch64/armv8.4a-ldst.s
+++ b/test/MC/AArch64/armv8.4a-ldst.s
@@ -1,5 +1,8 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a < %s | FileCheck %s --check-prefix=CHECK
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-V84
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a -o - %s | FileCheck %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a,+rcpc-immo -o - %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a,-rcpc-immo -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-V84
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-V84
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-V84
 
 //------------------------------------------------------------------------------
 // Armv8.4-A LDAPR and STLR instructions with immediate offsets
@@ -140,168 +143,168 @@ ldapur   x14, [sp, #9]
 //CHECK-NEXT:  ldapur  x13, [x4, #255]         // encoding: [0x8d,0xf0,0x4f,0xd9]
 //CHECK-NEXT:  ldapur  x14, [sp, #9]           // encoding: [0xee,0x93,0x40,0xd9]
 
-//CHECK-NO-V84:      error: instruction requires: armv8.4a
+//CHECK-NO-V84:      error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLURB   WZR, [X10]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLURB   W1, [X10]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLURB   W1, [X10, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: stlurb   w2, [x11, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLURB   W3, [SP, #-3]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapurb  wzr, [x12]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapurb  w4, [x12]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapurb  w4, [x12, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURB  W5, [X13, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURB  W6, [SP, #-2]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSB W7, [X14]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSB W7, [X14, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursb w8, [x15, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursb w9, [sp, #-1]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSB X0, [X16]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSB X0, [X16, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSB X1, [X17, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursb x2, [sp, #0]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursb x2, [sp]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: stlurh   w10, [x18]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: stlurh   w10, [x18, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLURH   W11, [X19, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLURH   W12, [SP, #1]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURH  W13, [X20]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURH  W13, [X20, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapurh  w14, [x21, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURH  W15, [SP, #2]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSH W16, [X22]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSH W16, [X22, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSH W17, [X23, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursh w18, [sp, #3]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursh x3, [x24]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursh x3, [x24, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSH X4, [X25, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSH X5, [SP, #4]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLUR    W19, [X26]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLUR    W19, [X26, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: stlur    w20, [x27, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLUR    W21, [SP, #5]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPUR   W22, [X28]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPUR   W22, [X28, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPUR   W23, [X29, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapur   w24, [sp, #6]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursw x6, [x30]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapursw x6, [x30, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSW X7, [X0, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPURSW X8, [SP, #7]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLUR    X9, [X1]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLUR    X9, [X1, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: stlur    x10, [x2, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: STLUR    X11, [SP, #8]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPUR   X12, [X3]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPUR   X12, [X3, #-256]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: LDAPUR   X13, [X4, #255]
 //CHECK-NO-V84-NEXT: ^
-//CHECK-NO-V84-NEXT: error: instruction requires: armv8.4a
+//CHECK-NO-V84-NEXT: error: instruction requires: rcpc-immo
 //CHECK-NO-V84-NEXT: ldapur   x14, [sp, #9]
 //CHECK-NO-V84-NEXT: ^
diff --git a/test/MC/AArch64/armv8.4a-tlb.s b/test/MC/AArch64/armv8.4a-tlb.s
index e64be16b6619f2f03c7fb4a7764361e3bdb965ed..39f8d0dc214bf4ebacd00316bcf1b1d881aa1f8c 100644
--- a/test/MC/AArch64/armv8.4a-tlb.s
+++ b/test/MC/AArch64/armv8.4a-tlb.s
@@ -1,6 +1,9 @@
 // RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a < %s 2> %t | FileCheck %s --check-prefix=CHECK
 // RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+tlb-rmi < %s 2> %t | FileCheck %s --check-prefix=CHECK
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
 // RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-V84
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a,-tlb-rmi < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-V84
 
 // Outer shareable TLB maintenance instructions:
 tlbi vmalle1os
@@ -45,55 +48,55 @@ tlbi vae1os, sp
 //CHECK-ERROR-NEXT: tlbi vae1os, sp
 //CHECK-ERROR-NEXT:              ^
 
-//CHECK-NO-V84:      error: TLBI VMALLE1OS requires ARMv8.4a
+//CHECK-NO-V84:      error: TLBI VMALLE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vmalle1os
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VAE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VAE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vae1os, xzr
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VAE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VAE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vae1os, x0
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI ASIDE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI ASIDE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi aside1os, x1
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VAAE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VAAE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vaae1os, x2
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VALE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VALE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vale1os, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VAALE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VAALE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vaale1os, x4
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI IPAS2E1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI IPAS2E1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi ipas2e1os, x5
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI IPAS2LE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI IPAS2LE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi ipas2le1os, x6
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VAE2OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VAE2OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vae2os, x7
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VALE2OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VALE2OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vale2os, x8
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VMALLS12E1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VMALLS12E1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vmalls12e1os
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VAE3OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VAE3OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vae3os, x9
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI VALE3OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI VALE3OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi vale3os, x10
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI ALLE2OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI ALLE2OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi alle2os
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI ALLE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI ALLE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi alle1os
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI ALLE3OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI ALLE3OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi alle3os
 //CHECK-NO-V84-NEXT:      ^
 
@@ -168,96 +171,96 @@ tlbi rvae1, sp
 //CHECK-ERROR-NEXT: tlbi rvae1, sp
 //CHECK-ERROR-NEXT:             ^
 
-//CHECK-NO-V84:      error: TLBI RVAE1 requires ARMv8.4a
+//CHECK-NO-V84:      error: TLBI RVAE1 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae1, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAAE1 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAAE1 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvaae1, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE1 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE1 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale1, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAALE1 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAALE1 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvaale1, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT:  error: TLBI RVAE1IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT:  error: TLBI RVAE1IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae1is, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAAE1IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAAE1IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvaae1is, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE1IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE1IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale1is, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAALE1IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAALE1IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvaale1is, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae1os, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAAE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAAE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvaae1os, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale1os, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAALE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAALE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvaale1os, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RIPAS2E1IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RIPAS2E1IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi ripas2e1is, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RIPAS2LE1IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RIPAS2LE1IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi ripas2le1is, x3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RIPAS2E1 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RIPAS2E1 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi ripas2e1, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RIPAS2LE1 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RIPAS2LE1 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi ripas2le1, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RIPAS2E1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RIPAS2E1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi ripas2e1os, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RIPAS2LE1OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RIPAS2LE1OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi ripas2le1os, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAE2 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAE2 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae2, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE2 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE2 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale2, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAE2IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAE2IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae2is, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE2IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE2IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale2is, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAE2OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAE2OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae2os, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE2OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE2OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale2os, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAE3 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAE3 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae3, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE3 requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE3 requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale3, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAE3IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAE3IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae3is, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE3IS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE3IS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale3is, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVAE3OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVAE3OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvae3os, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE3OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE3OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale3os, X3
 //CHECK-NO-V84-NEXT:      ^
-//CHECK-NO-V84-NEXT: error: TLBI RVALE3OS requires ARMv8.4a
+//CHECK-NO-V84-NEXT: error: TLBI RVALE3OS requires tlb-rmi
 //CHECK-NO-V84-NEXT: tlbi rvale3os, XZR
 //CHECK-NO-V84-NEXT:      ^
diff --git a/test/MC/AArch64/armv8.4a-trace.s b/test/MC/AArch64/armv8.4a-trace.s
index d14b0e0ef7eace8254615e78ef4cdc4cef773dd4..7227b500be164247ffeb21f4c4e198cc57896d83 100644
--- a/test/MC/AArch64/armv8.4a-trace.s
+++ b/test/MC/AArch64/armv8.4a-trace.s
@@ -1,5 +1,14 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a < %s  | FileCheck %s --check-prefix=CHECK
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a -o - 2>&1 %s  | \
+// RUN: FileCheck %s
+
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+tracev8.4 -o - 2>&1 %s  | \
+// RUN: FileCheck %s
+
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a -o - %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CHECK-ERROR
+
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a,-tracev8.4 -o - %s 2>&1 | \
+// RUN: FileCheck %s --check-prefixes NOFEATURE,CHECK-ERROR
 
 //------------------------------------------------------------------------------
 // ARMV8.4-A Debug, Trace and PMU Extensions
@@ -45,4 +54,4 @@ tsb csync
 //CHECK-ERROR: mrs x0, TRFCR_EL12
 //CHECK-ERROR:         ^
 
-//CHECK-ERROR: error: instruction requires: armv8.4a
+//CHECK-ERROR: error: instruction requires: tracev8.4
diff --git a/test/MC/AArch64/armv8.5a-specrestrict.s b/test/MC/AArch64/armv8.5a-specrestrict.s
index e19bbb1b424358970f06cc3d92b06194583ee019..88526b0d6a0c66cf7381c70a31bdc6e89caa1b2c 100644
--- a/test/MC/AArch64/armv8.5a-specrestrict.s
+++ b/test/MC/AArch64/armv8.5a-specrestrict.s
@@ -51,19 +51,3 @@ msr SCXTNUM_EL12, x4
 // NOSPECID-NEXT: {{scxtnum_el3|SCXTNUM_EL3}}
 // NOSPECID:      error: expected writable system register
 // NOSPECID-NEXT: {{scxtnum_el12|SCXTNUM_EL12}}
-
-mrs x2, SSBS
-
-// CHECK:         mrs x2, {{ssbs|SSBS}} // encoding: [0xc2,0x42,0x3b,0xd5]
-// NOSPECID:      error: expected readable system register
-// NOSPECID-NEXT: mrs x2, {{ssbs|SSBS}}
-
-msr SSBS, x3
-msr SSBS, #1
-
-// CHECK:         msr {{ssbs|SSBS}}, x3 // encoding: [0xc3,0x42,0x1b,0xd5]
-// CHECK:         msr {{ssbs|SSBS}}, #1 // encoding: [0x3f,0x41,0x03,0xd5]
-// NOSPECID:      error: expected writable system register or pstate
-// NOSPECID-NEXT: msr {{ssbs|SSBS}}, x3
-// NOSPECID:      error: expected writable system register or pstate
-// NOSPECID-NEXT: msr {{ssbs|SSBS}}, #1
diff --git a/test/MC/AArch64/armv8.5a-specrestrict-error.s b/test/MC/AArch64/armv8.5a-ssbs-error.s
similarity index 53%
rename from test/MC/AArch64/armv8.5a-specrestrict-error.s
rename to test/MC/AArch64/armv8.5a-ssbs-error.s
index ef56860e2d7c36ea7291ed8221fff0b577928bda..59819705e46cee2b23450695d7fd884c8264b5ac 100644
--- a/test/MC/AArch64/armv8.5a-specrestrict-error.s
+++ b/test/MC/AArch64/armv8.5a-ssbs-error.s
@@ -1,6 +1,6 @@
-// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+specrestrict < %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.5a        < %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=-specrestrict < %s 2>&1 | FileCheck %s --check-prefix=NOSPECID
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+ssbs  < %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.5a < %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=-ssbs  < %s 2>&1 | FileCheck %s --check-prefix=NOSPECID
 
 msr SSBS, #2
 
diff --git a/test/MC/AArch64/armv8.5a-ssbs.s b/test/MC/AArch64/armv8.5a-ssbs.s
new file mode 100644
index 0000000000000000000000000000000000000000..36ae98436b94a4d106edfa7873c7c6dd11989a66
--- /dev/null
+++ b/test/MC/AArch64/armv8.5a-ssbs.s
@@ -0,0 +1,19 @@
+// RUN:     llvm-mc -triple aarch64 -show-encoding -mattr=+ssbs  < %s      | FileCheck %s
+// RUN:     llvm-mc -triple aarch64 -show-encoding -mattr=+v8.5a < %s      | FileCheck %s
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=-ssbs  < %s 2>&1 | FileCheck %s --check-prefix=NOSPECID
+
+mrs x2, SSBS
+
+// CHECK:         mrs x2, {{ssbs|SSBS}} // encoding: [0xc2,0x42,0x3b,0xd5]
+// NOSPECID:      error: expected readable system register
+// NOSPECID-NEXT: mrs x2, {{ssbs|SSBS}}
+
+msr SSBS, x3
+msr SSBS, #1
+
+// CHECK:         msr {{ssbs|SSBS}}, x3 // encoding: [0xc3,0x42,0x1b,0xd5]
+// CHECK:         msr {{ssbs|SSBS}}, #1 // encoding: [0x3f,0x41,0x03,0xd5]
+// NOSPECID:      error: expected writable system register or pstate
+// NOSPECID-NEXT: msr {{ssbs|SSBS}}, x3
+// NOSPECID:      error: expected writable system register or pstate
+// NOSPECID-NEXT: msr {{ssbs|SSBS}}, #1
diff --git a/test/MC/AArch64/coff-function-type-info.ll b/test/MC/AArch64/coff-function-type-info.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1f924bca874d9b93a6cfc394e64c1b0897d11c6f
--- /dev/null
+++ b/test/MC/AArch64/coff-function-type-info.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple arm64-windows -filetype asm -o - %s \
+; RUN:    | FileCheck %s -check-prefix CHECK-ASM
+
+; RUN: llc -mtriple arm64-windows -filetype obj -o - %s \
+; RUN:    | llvm-readobj -t | FileCheck %s -check-prefix CHECK-OBJECT
+
+define arm_aapcs_vfpcc void @external() {
+entry:
+  ret void
+}
+
+; CHECK-ASM: .def external
+; CHECK-ASM:   .scl 2
+; CHECK-ASM:   .type 32
+; CHECK-ASM: .endef
+; CHECK-ASM: .globl external
+
+define internal arm_aapcs_vfpcc void @internal() {
+entry:
+  ret void
+}
+
+; CHECK-ASM: .def internal
+; CHECK-ASM:    .scl 3
+; CHECK-ASM:    .type 32
+; CHECK-ASM: .endef
+; CHECK-ASM-NOT: .globl internal
+
+; CHECK-OBJECT: Symbol {
+; CHECK-OBJECT:   Name: external
+; CHECK-OBJECT:   Section: .text
+; CHECK-OBJECT:   BaseType: Null
+; CHECK-OBJECT:   ComplexType: Function
+; CHECK-OBJECT:   StorageClass: External
+; CHECK-OBJECT:   AuxSymbolCount: 0
+; CHECK-OBJECT: }
+; CHECK-OBJECT: Symbol {
+; CHECK-OBJECT:   Name: internal
+; CHECK-OBJECT:   Section: .text
+; CHECK-OBJECT:   BaseType: Null
+; CHECK-OBJECT:   ComplexType: Function
+; CHECK-OBJECT:   StorageClass: Static
+; CHECK-OBJECT:   AuxSymbolCount: 0
+; CHECK-OBJECT: }
+
diff --git a/test/MC/AArch64/crc.s b/test/MC/AArch64/crc.s
index 597ba3c257a9f1d2e6704c2a6bfc4a282dc6a55f..77809ed8fb19ad04763c3211a20b710166b827b9 100644
--- a/test/MC/AArch64/crc.s
+++ b/test/MC/AArch64/crc.s
@@ -5,6 +5,8 @@
 // RUN:   FileCheck %s --check-prefix=CRC
 // RUN: llvm-mc -triple aarch64-- -mcpu=cortex-a75 %s 2>&1 |\
 // RUN:   FileCheck %s --check-prefix=CRC
+// RUN: llvm-mc -triple aarch64-- -mcpu=tsv110 %s 2>&1 |\
+// RUN:   FileCheck %s --check-prefix=CRC
 
 // RUN: not llvm-mc -triple aarch64-- %s 2>&1 |\
 // RUN:   FileCheck %s --check-prefix=NOCRC
diff --git a/test/MC/AArch64/ras-extension.s b/test/MC/AArch64/ras-extension.s
index e8d1f62e6bd1e86e9072dd012277c05c63dbfe2a..ef429edad8d5db6ddcd280241512abf2c23c167e 100644
--- a/test/MC/AArch64/ras-extension.s
+++ b/test/MC/AArch64/ras-extension.s
@@ -1,6 +1,7 @@
 // RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+ras < %s | FileCheck %s
 // RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mcpu=cortex-a55 < %s | FileCheck %s
 // RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mcpu=cortex-a75 < %s | FileCheck %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mcpu=tsv110 < %s | FileCheck %s
 
   esb
 // CHECK: esb                             // encoding: [0x1f,0x22,0x03,0xd5]
diff --git a/test/MC/AArch64/seh.s b/test/MC/AArch64/seh.s
new file mode 100644
index 0000000000000000000000000000000000000000..ca862cb4e549248d098a4b68d0bac7df356aa908
--- /dev/null
+++ b/test/MC/AArch64/seh.s
@@ -0,0 +1,84 @@
+// This test checks that the SEH directives emit the correct unwind data.
+
+// RUN: llvm-mc -triple aarch64-pc-win32 -filetype=obj %s | llvm-readobj -s -r | FileCheck %s
+
+// CHECK:      Sections [
+// CHECK:        Section {
+// CHECK:          Name: .text
+// CHECK:          RelocationCount: 0
+// CHECK:          Characteristics [
+// CHECK-NEXT:       ALIGN_4BYTES
+// CHECK-NEXT:       CNT_CODE
+// CHECK-NEXT:       MEM_EXECUTE
+// CHECK-NEXT:       MEM_READ
+// CHECK-NEXT:     ]
+// CHECK-NEXT:   }
+// CHECK:        Section {
+// CHECK:          Name: .xdata
+// CHECK:          RawDataSize: 24
+// CHECK:          RelocationCount: 1
+// CHECK:          Characteristics [
+// CHECK-NEXT:       ALIGN_4BYTES
+// CHECK-NEXT:       CNT_INITIALIZED_DATA
+// CHECK-NEXT:       MEM_READ
+// CHECK-NEXT:     ]
+// CHECK-NEXT:   }
+// CHECK:        Section {
+// CHECK:          Name: .pdata
+// CHECK:          RelocationCount: 6
+// CHECK:          Characteristics [
+// CHECK-NEXT:       ALIGN_4BYTES
+// CHECK-NEXT:       CNT_INITIALIZED_DATA
+// CHECK-NEXT:       MEM_READ
+// CHECK-NEXT:     ]
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+// CHECK-NEXT: Relocations [
+// CHECK-NEXT:   Section (4) .xdata {
+// CHECK-NEXT:     0x8 IMAGE_REL_ARM64_ADDR32NB __C_specific_handler
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Section (5) .pdata {
+// CHECK-NEXT:     0x0 IMAGE_REL_ARM64_ADDR32NB func
+// CHECK-NEXT:     0x4 IMAGE_REL_ARM64_ADDR32NB .xdata
+// CHECK-NEXT:     0x8 IMAGE_REL_ARM64_ADDR32NB func
+// CHECK-NEXT:     0xC IMAGE_REL_ARM64_ADDR32NB .xdata
+// CHECK-NEXT:     0x10 IMAGE_REL_ARM64_ADDR32NB smallFunc
+// CHECK-NEXT:     0x14 IMAGE_REL_ARM64_ADDR32NB .xdata
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+
+    .text
+    .globl func
+    .def func
+    .scl 2
+    .type 32
+    .endef
+    .seh_proc func
+func:
+    sub sp, sp, #24
+    .seh_stackalloc 24
+    mov x29, sp
+    .seh_endprologue
+    .seh_handler __C_specific_handler, @except
+    .seh_handlerdata
+    .long 0
+    .text
+    .seh_startchained
+    .seh_endprologue
+    .seh_endchained
+    add sp, sp, #24
+    ret
+    .seh_endproc
+
+// Test emission of small functions.
+    .globl smallFunc
+    .def smallFunc
+    .scl 2
+    .type 32
+    .endef
+    .seh_proc smallFunc
+smallFunc:
+    ret
+    .seh_endproc
diff --git a/test/MC/AMDGPU/hsa-diag-v3.s b/test/MC/AMDGPU/hsa-diag-v3.s
index b05b8d197d419c027f35449e7df7f0bddb8b025f..5f2d89da75efb0ff9f0a9d7e38a9198050a39bd0 100644
--- a/test/MC/AMDGPU/hsa-diag-v3.s
+++ b/test/MC/AMDGPU/hsa-diag-v3.s
@@ -21,7 +21,7 @@
 .end_amdhsa_kernel
 
 .amdhsa_kernel foo
-  // NOT-AMDHSA: error: directive only supported for amdhsa OS
+  // NOT-AMDHSA: error: unknown directive
 .end_amdhsa_kernel
 
 .amdhsa_kernel foo
diff --git a/test/MC/AMDGPU/hsa-exp.s b/test/MC/AMDGPU/hsa-exp.s
index b13755a19cce1bac8468932af048e4c6b3565570..8900a0638c90d4ab9e3dc36f1b8e295d69289cd7 100644
--- a/test/MC/AMDGPU/hsa-exp.s
+++ b/test/MC/AMDGPU/hsa-exp.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 // ELF: Section {
 // ELF: Name: .text
diff --git a/test/MC/AMDGPU/hsa-metadata-kernel-args-v3.s b/test/MC/AMDGPU/hsa-metadata-kernel-args-v3.s
new file mode 100644
index 0000000000000000000000000000000000000000..7ca154028f201cd324763be947ea3aff5d870438
--- /dev/null
+++ b/test/MC/AMDGPU/hsa-metadata-kernel-args-v3.s
@@ -0,0 +1,96 @@
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amdgpu_metadata
+// CHECK:    amdhsa.kernels:
+// CHECK-NEXT:      - .max_flat_workgroup_size: 256
+// CHECK-NEXT:        .wavefront_size: 128
+// CHECK-NEXT:        .symbol:      'test_kernel@kd'
+// CHECK-NEXT:        .kernarg_segment_size: 8
+// CHECK-NEXT:        .private_segment_fixed_size: 32
+// CHECK-NEXT:        .name:            test_kernel
+// CHECK-NEXT:        .language:        OpenCL C
+// CHECK-NEXT:        .sgpr_count: 14
+// CHECK-NEXT:        .kernarg_segment_align: 64
+// CHECK-NEXT:        .vgpr_count: 40
+// CHECK-NEXT:        .group_segment_fixed_size: 16
+// CHECK-NEXT:        .language_version:
+// CHECK-NEXT:          - 2
+// CHECK-NEXT:          - 0
+// CHECK-NEXT:        .args:
+// CHECK-NEXT:          - .type_name:      char
+// CHECK-NEXT:            .value_kind:     by_value
+// CHECK-NEXT:            .offset:         1
+// CHECK-NEXT:            .size:          1
+// CHECK-NEXT:            .value_type:     i8
+// CHECK-NEXT:          - .value_kind:     hidden_global_offset_x
+// CHECK-NEXT:            .offset:         8
+// CHECK-NEXT:            .size:          8
+// CHECK-NEXT:            .value_type:     i64
+// CHECK-NEXT:          - .value_kind:     hidden_global_offset_y
+// CHECK-NEXT:            .offset:         8
+// CHECK-NEXT:            .size:          8
+// CHECK-NEXT:            .value_type:     i64
+// CHECK-NEXT:          - .value_kind:     hidden_global_offset_z
+// CHECK-NEXT:            .offset:         8
+// CHECK-NEXT:            .size:          8
+// CHECK-NEXT:            .value_type:     i64
+// CHECK-NEXT:          - .value_kind:     hidden_printf_buffer
+// CHECK-NEXT:            .offset:         8
+// CHECK-NEXT:            .size:          8
+// CHECK-NEXT:            .value_type:     i8
+// CHECK-NEXT:            .address_space: global
+// CHECK:    amdhsa.version:
+// CHECK-NEXT:       - 1
+// CHECK-NEXT:       - 0
+// CHECK:    amdhsa.printf:
+// CHECK-NEXT:      - '1:1:4:%d\n'
+// CHECK-NEXt:      - '2:1:8:%g\n'
+// CHECK:  .end_amdgpu_metadata
+.amdgpu_metadata
+  amdhsa.version:
+    - 1
+    - 0
+  amdhsa.printf:
+    - '1:1:4:%d\n'
+    - '2:1:8:%g\n'
+  amdhsa.kernels:
+    - .name:            test_kernel
+      .symbol:      test_kernel@kd
+      .language:        OpenCL C
+      .language_version:
+        - 2
+        - 0
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+      .args:
+        - .type_name:      char
+          .size:          1
+          .offset:         1
+          .value_kind:     by_value
+          .value_type:     i8
+        - .size:          8
+          .offset:         8
+          .value_kind:     hidden_global_offset_x
+          .value_type:     i64
+        - .size:          8
+          .offset:         8
+          .value_kind:     hidden_global_offset_y
+          .value_type:     i64
+        - .size:          8
+          .offset:         8
+          .value_kind:     hidden_global_offset_z
+          .value_type:     i64
+        - .size:          8
+          .offset:         8
+          .value_kind:     hidden_printf_buffer
+          .value_type:     i8
+          .address_space: global
+.end_amdgpu_metadata
diff --git a/test/MC/AMDGPU/hsa-metadata-kernel-args.s b/test/MC/AMDGPU/hsa-metadata-kernel-args.s
index 2f9960ffaa0f4a59408748ed2fd7565a06996ed5..f0c290822440980701ef27737247804ab611bb63 100644
--- a/test/MC/AMDGPU/hsa-metadata-kernel-args.s
+++ b/test/MC/AMDGPU/hsa-metadata-kernel-args.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
 
 // CHECK:  .amd_amdgpu_hsa_metadata
 // CHECK:    Version: [ 1, 0 ]
diff --git a/test/MC/AMDGPU/hsa-metadata-kernel-attrs-v3.s b/test/MC/AMDGPU/hsa-metadata-kernel-attrs-v3.s
new file mode 100644
index 0000000000000000000000000000000000000000..cb1df7dcf931fed26cfbf3af1e08682650c37eea
--- /dev/null
+++ b/test/MC/AMDGPU/hsa-metadata-kernel-attrs-v3.s
@@ -0,0 +1,67 @@
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amdgpu_metadata
+// CHECK:    amdhsa.kernels:
+// CHECK:      - .max_flat_workgroup_size: 256
+// CHECK:        .wavefront_size: 128
+// CHECK:        .symbol:      'test_kernel@kd'
+// CHECK:        .reqd_workgroup_size:
+// CHECK-NEXT:     - 1
+// CHECK-NEXT:     - 2
+// CHECK-NEXT:     - 4
+// CHECK:        .kernarg_segment_size: 8
+// CHECK:        .private_segment_fixed_size: 32
+// CHECK:        .workgroup_size_hint:
+// CHECK-NEXT:     - 8
+// CHECK-NEXT:     - 16
+// CHECK-NEXT:     - 32
+// CHECK:        .name:            test_kernel
+// CHECK:        .language:        OpenCL C
+// CHECK:        .sgpr_count:      14
+// CHECK:        .kernarg_segment_align: 64
+// CHECK:        .vgpr_count:      40
+// CHECK:        .language_version:
+// CHECK-NEXT:     - 2
+// CHECK-NEXT:     - 0
+// CHECK:        .vec_type_hint:       int
+// CHECK:    amdhsa.version:
+// CHECK-NEXT: - 1
+// CHECK-NEXT: - 0
+// CHECK:    amdhsa.printf:
+// CHECK:      - '1:1:4:%d\n'
+// CHECK:      - '2:1:8:%g\n'
+// CHECK: .end_amdgpu_metadata
+.amdgpu_metadata
+  amdhsa.version:
+    - 1
+    - 0
+  amdhsa.printf:
+    - '1:1:4:%d\n'
+    - '2:1:8:%g\n'
+  amdhsa.kernels:
+    - .name:            test_kernel
+      .symbol:      test_kernel@kd
+      .language:        OpenCL C
+      .language_version:
+        - 2
+        - 0
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+      .reqd_workgroup_size:
+        - 1
+        - 2
+        - 4
+      .workgroup_size_hint:
+        - 8
+        - 16
+        - 32
+      .vec_type_hint:       int
+.end_amdgpu_metadata
diff --git a/test/MC/AMDGPU/hsa-metadata-kernel-attrs.s b/test/MC/AMDGPU/hsa-metadata-kernel-attrs.s
index 5a9fdd22069c72bc97d35c550b114b240d657c40..e19bf445e6af6921a26f5e757e68fb9b9237fd09 100644
--- a/test/MC/AMDGPU/hsa-metadata-kernel-attrs.s
+++ b/test/MC/AMDGPU/hsa-metadata-kernel-attrs.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
 
 // CHECK:  .amd_amdgpu_hsa_metadata
 // CHECK:    Version: [ 1, 0 ]
diff --git a/test/MC/AMDGPU/hsa-metadata-kernel-code-props-v3.s b/test/MC/AMDGPU/hsa-metadata-kernel-code-props-v3.s
new file mode 100644
index 0000000000000000000000000000000000000000..82a3b61c7a8037ae35ce8617efb8cd2dc1796564
--- /dev/null
+++ b/test/MC/AMDGPU/hsa-metadata-kernel-code-props-v3.s
@@ -0,0 +1,42 @@
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amdgpu_metadata
+// CHECK:    amdhsa.kernels:
+// CHECK:      - .sgpr_count: 40
+// CHECK:        .max_flat_workgroup_size:    256
+// CHECK:        .symbol: 'test_kernel@kd'
+// CHECK:        .kernarg_segment_size:      24
+// CHECK:        .group_segment_fixed_size:   24
+// CHECK:        .private_segment_fixed_size: 16
+// CHECK:        .vgpr_count: 14
+// CHECK:        .vgpr_spill_count: 1
+// CHECK:        .kernarg_segment_align:     16
+// CHECK:        .sgpr_spill_count: 1
+// CHECK:        .wavefront_size:           64
+// CHECK:        .name:       test_kernel
+// CHECK:    amdhsa.version:
+// CHECK-NEXT: - 1
+// CHECK-NEXT: - 0
+.amdgpu_metadata
+  amdhsa.version:
+    - 1
+    - 0
+  amdhsa.printf:
+    - '1:1:4:%d\n'
+    - '2:1:8:%g\n'
+  amdhsa.kernels:
+    - .name:            test_kernel
+      .symbol:      test_kernel@kd
+      .kernarg_segment_size:      24
+      .group_segment_fixed_size:   24
+      .private_segment_fixed_size: 16
+      .kernarg_segment_align:     16
+      .wavefront_size:           64
+      .max_flat_workgroup_size:    256
+      .sgpr_count:               40
+      .vgpr_count:               14
+      .sgpr_spill_count:         1
+      .vgpr_spill_count:         1
+.end_amdgpu_metadata
diff --git a/test/MC/AMDGPU/hsa-metadata-kernel-code-props.s b/test/MC/AMDGPU/hsa-metadata-kernel-code-props.s
index 0b0404295cf7989a4ea99e9dea9fd77fe6e46a73..c8cc3cf3fc55235a39d5c7d491808a42d60dca06 100644
--- a/test/MC/AMDGPU/hsa-metadata-kernel-code-props.s
+++ b/test/MC/AMDGPU/hsa-metadata-kernel-code-props.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
 
 // CHECK:  .amd_amdgpu_hsa_metadata
 // CHECK:    Version: [ 1, 0 ]
diff --git a/test/MC/AMDGPU/hsa-metadata-kernel-debug-props.s b/test/MC/AMDGPU/hsa-metadata-kernel-debug-props.s
index 7efb8207ae4917a707fc5e7e8b3c32bc68ff5274..e3bd87faf9875ba149b497084ef5c8653f1cb0f5 100644
--- a/test/MC/AMDGPU/hsa-metadata-kernel-debug-props.s
+++ b/test/MC/AMDGPU/hsa-metadata-kernel-debug-props.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
-// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
 
 // CHECK:  .amd_amdgpu_hsa_metadata
 // CHECK:    Version: [ 1, 0 ]
diff --git a/test/MC/AMDGPU/hsa-metadata-unknown-key.s b/test/MC/AMDGPU/hsa-metadata-unknown-key.s
index f532930c7c651baa2999009f64b704364124330d..2fbbde7cee07be6c99da9b6db0edaf98247f230e 100644
--- a/test/MC/AMDGPU/hsa-metadata-unknown-key.s
+++ b/test/MC/AMDGPU/hsa-metadata-unknown-key.s
@@ -1,9 +1,9 @@
-// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-code-object-v3 %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-code-object-v3 -filetype=obj %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj %s 2>&1 | FileCheck %s
 
 // CHECK: error: unknown key 'UnknownKey'
 .amd_amdgpu_hsa_metadata
diff --git a/test/MC/AMDGPU/hsa-text.s b/test/MC/AMDGPU/hsa-text.s
index afe696af0a2610a203492262ad4e8f1343f2d490..f4463fc5936412c304a515beb254e8907db16f29 100644
--- a/test/MC/AMDGPU/hsa-text.s
+++ b/test/MC/AMDGPU/hsa-text.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF
 
 // For compatibility reasons we treat convert .text sections to .hsatext
 
diff --git a/test/MC/AMDGPU/hsa-v3.s b/test/MC/AMDGPU/hsa-v3.s
index e8bd1a1ba4eba10fdf4ecdc8b1d811b763697098..6adcdb8447854abcd95c58dc1128f912ac03c58d 100644
--- a/test/MC/AMDGPU/hsa-v3.s
+++ b/test/MC/AMDGPU/hsa-v3.s
@@ -213,3 +213,59 @@ v_mov_b32_e32 v16, s3
 // ASM: .byte 17
 .byte .amdgcn.next_free_sgpr
 // ASM: .byte 4
+
+// Metadata
+
+.amdgpu_metadata
+  amdhsa.version:
+    - 3
+    - 0
+  amdhsa.kernels:
+    - .name:       amd_kernel_code_t_test_all
+      .symbol: amd_kernel_code_t_test_all@kd
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+    - .name:       amd_kernel_code_t_minimal
+      .symbol: amd_kernel_code_t_minimal@kd
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+.end_amdgpu_metadata
+
+// ASM: .amdgpu_metadata
+// ASM:    amdhsa.kernels:
+// ASM:  - .sgpr_count:     14
+// ASM:    .max_flat_workgroup_size: 256
+// ASM:    .symbol:         'amd_kernel_code_t_test_all@kd'
+// ASM:    .kernarg_segment_size: 8
+// ASM:    .group_segment_fixed_size: 16
+// ASM:    .private_segment_fixed_size: 32
+// ASM:    .vgpr_count:     40
+// ASM:    .kernarg_segment_align: 64
+// ASM:    .wavefront_size: 128
+// ASM:    .name:           amd_kernel_code_t_test_all
+// ASM:  - .sgpr_count:     14
+// ASM:    .max_flat_workgroup_size: 256
+// ASM:    .symbol:         'amd_kernel_code_t_minimal@kd'
+// ASM:    .kernarg_segment_size: 8
+// ASM:    .group_segment_fixed_size: 16
+// ASM:    .private_segment_fixed_size: 32
+// ASM:    .vgpr_count:     40
+// ASM:    .kernarg_segment_align: 64
+// ASM:    .wavefront_size: 128
+// ASM:    .name:           amd_kernel_code_t_minimal
+// ASM:    amdhsa.version:
+// ASM-NEXT: - 3
+// ASM-NEXT: - 0
+// ASM: .end_amdgpu_metadata
diff --git a/test/MC/AMDGPU/hsa.s b/test/MC/AMDGPU/hsa.s
index 5ebc0a60e0fa5bd60a67df97857f8a0f5398498b..0521c10e1a8069f0dab23f41a19046420080bee9 100644
--- a/test/MC/AMDGPU/hsa.s
+++ b/test/MC/AMDGPU/hsa.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 // ELF: Section {
 // ELF: Name: .text
diff --git a/test/MC/AMDGPU/hsa_code_object_isa_args.s b/test/MC/AMDGPU/hsa_code_object_isa_args.s
index 1c47c83e3e92e3963df5cbb844dd64ebee6b9638..950f32cd19a583e66a608831d6b379cbc01e2ade 100644
--- a/test/MC/AMDGPU/hsa_code_object_isa_args.s
+++ b/test/MC/AMDGPU/hsa_code_object_isa_args.s
@@ -1,9 +1,9 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_700
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=gfx803 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_803
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=stoney -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_810
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_700
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=gfx803 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_803
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=stoney -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_810
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_700
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_803
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=stoney -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_810
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_700
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_803
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=stoney -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_810
 
 // ELF: SHT_NOTE
 // ELF: 0000: 04000000 08000000 01000000 414D4400
diff --git a/test/MC/AMDGPU/hsa_isa_version_attrs.s b/test/MC/AMDGPU/hsa_isa_version_attrs.s
index 631e1a4509749d90ab11212a8813c140bb18255d..ddd76fcf91895885c4cea7bbd01511890007f88b 100644
--- a/test/MC/AMDGPU/hsa_isa_version_attrs.s
+++ b/test/MC/AMDGPU/hsa_isa_version_attrs.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=gfx801 -mattr=-fast-fmaf -show-encoding %s | FileCheck --check-prefix=GFX8 %s
-// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -mattr=-mad-mix-insts -show-encoding %s | FileCheck --check-prefix=GFX9 %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx801 -mattr=-code-object-v3,-fast-fmaf -show-encoding %s | FileCheck --check-prefix=GFX8 %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -mattr=-code-object-v3,-mad-mix-insts -show-encoding %s | FileCheck --check-prefix=GFX9 %s
 
 .hsa_code_object_isa
 // GFX8:  .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
diff --git a/test/MC/AMDGPU/isa-version-hsa.s b/test/MC/AMDGPU/isa-version-hsa.s
index 9004e1c3ac5a65f2651414ab21e948128f176d0f..74e688163bc1f1d5736848c850345bd21819bfd9 100644
--- a/test/MC/AMDGPU/isa-version-hsa.s
+++ b/test/MC/AMDGPU/isa-version-hsa.s
@@ -1,10 +1,10 @@
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
 
 // OSABI-HSA: .amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/isa-version-pal.s b/test/MC/AMDGPU/isa-version-pal.s
index 42051b62c0df2e21e1a5462888528c1cd5221d03..a872ff842585b2d37c3ba0644cb09990cb9f9351 100644
--- a/test/MC/AMDGPU/isa-version-pal.s
+++ b/test/MC/AMDGPU/isa-version-pal.s
@@ -1,10 +1,10 @@
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
-// RUN: llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
+// RUN: llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
 
 // OSABI-PAL: .amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/isa-version-unk.s b/test/MC/AMDGPU/isa-version-unk.s
index 81792ade0830ce7f0635932d204b0c79a988d094..2b20ecb92855f43ddfe56f454532e12c1a63df24 100644
--- a/test/MC/AMDGPU/isa-version-unk.s
+++ b/test/MC/AMDGPU/isa-version-unk.s
@@ -1,10 +1,10 @@
-// RUN: llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
-// RUN: llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
+// RUN: llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
 
 // OSABI-UNK: .amd_amdgpu_isa "amdgcn-amd-unknown--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/sym_option.s b/test/MC/AMDGPU/sym_option.s
index 8bc9495c9ed9145d2803f70d5a02574099547384..98b4067168e2fe7183bc23f79a7bb937bd8e0d77 100644
--- a/test/MC/AMDGPU/sym_option.s
+++ b/test/MC/AMDGPU/sym_option.s
@@ -1,12 +1,12 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti      %s | FileCheck %s --check-prefix=SI
-// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire %s | FileCheck %s --check-prefix=BONAIRE
-// RUN: llvm-mc -arch=amdgcn -mcpu=hawaii %s | FileCheck %s --check-prefix=HAWAII
-// RUN: llvm-mc -arch=amdgcn -mcpu=kabini  %s | FileCheck %s --check-prefix=KABINI
-// RUN: llvm-mc -arch=amdgcn -mcpu=iceland %s | FileCheck %s --check-prefix=ICELAND
-// RUN: llvm-mc -arch=amdgcn -mcpu=carrizo %s | FileCheck %s --check-prefix=CARRIZO
-// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s | FileCheck %s --check-prefix=TONGA
-// RUN: llvm-mc -arch=amdgcn -mcpu=fiji %s | FileCheck %s --check-prefix=FIJI
-// RUN: llvm-mc -arch=amdgcn -mcpu=stoney  %s | FileCheck %s --check-prefix=STONEY
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=tahiti %s | FileCheck %s --check-prefix=SI
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=bonaire %s | FileCheck %s --check-prefix=BONAIRE
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=hawaii %s | FileCheck %s --check-prefix=HAWAII
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=kabini  %s | FileCheck %s --check-prefix=KABINI
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck %s --check-prefix=ICELAND
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=carrizo %s | FileCheck %s --check-prefix=CARRIZO
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=tonga %s | FileCheck %s --check-prefix=TONGA
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=fiji %s | FileCheck %s --check-prefix=FIJI
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=stoney  %s | FileCheck %s --check-prefix=STONEY
 
 .byte .option.machine_version_major
 // SI: .byte 6
diff --git a/test/MC/AMDGPU/vop_dpp.s b/test/MC/AMDGPU/vop_dpp.s
index 87467f1fce8976ddc1bd1b21fe4026a6d26af6fe..eb68abb93f2d37bd5ec14b6d237d8026d578ae63 100644
--- a/test/MC/AMDGPU/vop_dpp.s
+++ b/test/MC/AMDGPU/vop_dpp.s
@@ -116,7 +116,6 @@ v_add_f32 v0, |v0|, -v0 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
 //===----------------------------------------------------------------------===//
 
 // NOSICI: error:
-// VI9: v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x00,0x00,0x7e,0x00,0x01,0x09,0xa1]
 v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
 
 // NOSICI: error:
diff --git a/test/MC/ARM/elf-execute-only-section.ll b/test/MC/ARM/elf-execute-only-section.ll
index 2be3f4c70382532972b0f9c8196f4c29c494aaf0..555294b61a7e52296a0894b379da59be023b33f9 100644
--- a/test/MC/ARM/elf-execute-only-section.ll
+++ b/test/MC/ARM/elf-execute-only-section.ll
@@ -1,9 +1,9 @@
 ; RUN: llc < %s -mtriple=thumbv8m.base-eabi -mattr=+execute-only -filetype=obj %s -o - | \
-; RUN: llvm-readelf -s | FileCheck %s
+; RUN: llvm-readelf -S | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv8m.main-eabi -mattr=+execute-only -filetype=obj %s -o - | \
-; RUN: llvm-readelf -s | FileCheck %s
+; RUN: llvm-readelf -S | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv7m-eabi -mattr=+execute-only -filetype=obj %s -o - | \
-; RUN: llvm-readelf -s | FileCheck %s
+; RUN: llvm-readelf -S | FileCheck %s
 
 ; CHECK-NOT: {{.text[ ]+PROGBITS[ ]+[0-9]+ [0-9]+ [0-9]+ [0-9]+ AX[^p]}}
 ; CHECK: {{.text[ ]+PROGBITS[ ]+[0-9]+ [0-9]+ [0-9]+ [0-9]+ AXp}}
diff --git a/test/MC/AsmParser/floating-literals.s b/test/MC/AsmParser/floating-literals.s
index de0b4ca0640103e29dda0eb30f31236d49e25ec1..9dca77f8ea02aee8181c2e5312899af38807fa96 100644
--- a/test/MC/AsmParser/floating-literals.s
+++ b/test/MC/AsmParser/floating-literals.s
@@ -10,12 +10,21 @@
 # CHECK: .long	2139095040
 .single InFinIty
 
+# CHECK: .quad	9218868437227405312
+.double infinity
+
 # CHECK: .long	4286578688
 .single -iNf
 
+# CHECK: .quad	-4503599627370496
+.double -inf
+
 # CHECK: .long	2147483647
 .single nAN
 
+# CHECK: .quad	9223372036854775807
+.double NaN
+
 # CHECK: .long  1067928519
 .float 1.307
         
diff --git a/test/MC/COFF/cv-def-range-align.s b/test/MC/COFF/cv-def-range-align.s
new file mode 100644
index 0000000000000000000000000000000000000000..57bd3bf2af547c7f5a7eb502132f5163a0b59627
--- /dev/null
+++ b/test/MC/COFF/cv-def-range-align.s
@@ -0,0 +1,161 @@
+# RUN: llvm-mc -triple x86_64-windows-msvc %s -filetype=obj -o %t.o
+# RUN: llvm-pdbutil dump -symbols %t.o | FileCheck %s
+
+# We used to have a label flushing bug down below by the "BUG" comments that
+# would cause the S_DEFRANGE_FRAMEPOINTER_REL records to appear missing. In
+# practice, the label would extend past the def range, so it would appear that
+# every local was optimized out or had no def ranges.
+
+# CHECK: S_GPROC32_ID {{.*}} `max`
+# CHECK: S_LOCAL [size = {{.*}}] `a`
+# CHECK: S_DEFRANGE_FRAMEPOINTER_REL
+# CHECK: S_LOCAL [size = {{.*}}] `b`
+# CHECK: S_DEFRANGE_FRAMEPOINTER_REL
+
+	.text
+	.def	 @feat.00;
+	.scl	3;
+	.type	0;
+	.endef
+	.globl	@feat.00
+.set @feat.00, 0
+	.def	 max;
+	.scl	2;
+	.type	32;
+	.endef
+	.globl	max                     # -- Begin function max
+	.p2align	4, 0x90
+max:                                    # @max
+.Lfunc_begin0:
+	.cv_func_id 0
+	.cv_file	1 "C:\\src\\llvm-project\\build\\t.c" "44649E6EBC4FC8880991A1AF1F2D2990" 1
+	.cv_loc	0 1 1 0                 # t.c:1:0
+.seh_proc max
+# %bb.0:                                # %entry
+	pushq	%rax
+	.seh_stackalloc 8
+	.seh_endprologue
+	movl	%edx, 4(%rsp)
+	movl	%ecx, (%rsp)
+.Ltmp0:
+	.cv_loc	0 1 2 0                 # t.c:2:0
+	movl	(%rsp), %eax
+	cmpl	4(%rsp), %eax
+	jle	.LBB0_2
+# %bb.1:                                # %cond.true
+	movl	(%rsp), %eax
+	jmp	.LBB0_3
+.LBB0_2:                                # %cond.false
+	movl	4(%rsp), %eax
+.LBB0_3:                                # %cond.end
+	popq	%rcx
+	retq
+.Ltmp1:
+.Lfunc_end0:
+	.seh_handlerdata
+	.text
+	.seh_endproc
+                                        # -- End function
+	.section	.debug$S,"dr"
+	.p2align	2
+	.long 4
+	.long	241                     # Symbol subsection for max
+	.long	.Ltmp7-.Ltmp6           # Subsection size
+.Ltmp6:
+	.short	.Ltmp9-.Ltmp8           # Record length
+.Ltmp8:
+	.short	4423                    # Record kind: S_GPROC32_ID
+	.long	0                       # PtrParent
+	.long	0                       # PtrEnd
+	.long	0                       # PtrNext
+	.long	.Lfunc_end0-max         # Code size
+	.long	0                       # Offset after prologue
+	.long	0                       # Offset before epilogue
+	.long	4098                    # Function type index
+	.secrel32	max             # Function section relative address
+	.secidx	max                     # Function section index
+	.byte	0                       # Flags
+	.asciz	"max"                   # Function name
+.Ltmp9:
+	.short	.Ltmp11-.Ltmp10         # Record length
+.Ltmp10:
+	.short	4114                    # Record kind: S_FRAMEPROC
+	.long	8                       # FrameSize
+	.long	0                       # Padding
+	.long	0                       # Offset of padding
+	.long	0                       # Bytes of callee saved registers
+	.long	0                       # Exception handler offset
+	.short	0                       # Exception handler section
+	.long	81920                   # Flags (defines frame register)
+.Ltmp11:
+	.short	.Ltmp13-.Ltmp12         # Record length
+.Ltmp12:
+	.short	4414                    # Record kind: S_LOCAL
+	.long	18                      # TypeIndex
+	.short	1                       # Flags
+	.asciz	"a"
+	# BUG
+	.p2align 2
+.Ltmp13:
+	.cv_def_range	 .Ltmp0 .Ltmp1, "B\021\000\000\000\000"
+	.short	.Ltmp15-.Ltmp14         # Record length
+.Ltmp14:
+	.short	4414                    # Record kind: S_LOCAL
+	.long	18                      # TypeIndex
+	.short	1                       # Flags
+	.asciz	"b"
+	# BUG
+	.p2align 2
+.Ltmp15:
+	.cv_def_range	 .Ltmp0 .Ltmp1, "B\021\004\000\000\000"
+	.short	2                       # Record length
+	.short	4431                    # Record kind: S_PROC_ID_END
+.Ltmp7:
+	.p2align	2
+	.cv_linetable	0, max, .Lfunc_end0
+	.cv_filechecksums               # File index to string table offset subsection
+	.cv_stringtable                 # String table
+	.long	241
+	.long	.Ltmp17-.Ltmp16         # Subsection size
+.Ltmp16:
+.Ltmp17:
+	.p2align	2
+	.section	.debug$T,"dr"
+	.p2align	2
+	.long	4                       # Debug section magic
+	# ArgList (0x1000) {
+	#   TypeLeafKind: LF_ARGLIST (0x1201)
+	#   NumArgs: 2
+	#   Arguments [
+	#     ArgType: long (0x12)
+	#     ArgType: long (0x12)
+	#   ]
+	# }
+	.byte	0x0e, 0x00, 0x01, 0x12
+	.byte	0x02, 0x00, 0x00, 0x00
+	.byte	0x12, 0x00, 0x00, 0x00
+	.byte	0x12, 0x00, 0x00, 0x00
+	# Procedure (0x1001) {
+	#   TypeLeafKind: LF_PROCEDURE (0x1008)
+	#   ReturnType: long (0x12)
+	#   CallingConvention: NearC (0x0)
+	#   FunctionOptions [ (0x0)
+	#   ]
+	#   NumParameters: 2
+	#   ArgListType: (long, long) (0x1000)
+	# }
+	.byte	0x0e, 0x00, 0x08, 0x10
+	.byte	0x12, 0x00, 0x00, 0x00
+	.byte	0x00, 0x00, 0x02, 0x00
+	.byte	0x00, 0x10, 0x00, 0x00
+	# FuncId (0x1002) {
+	#   TypeLeafKind: LF_FUNC_ID (0x1601)
+	#   ParentScope: 0x0
+	#   FunctionType: long (long, long) (0x1001)
+	#   Name: max
+	# }
+	.byte	0x0e, 0x00, 0x01, 0x16
+	.byte	0x00, 0x00, 0x00, 0x00
+	.byte	0x01, 0x10, 0x00, 0x00
+	.byte	0x6d, 0x61, 0x78, 0x00
+
diff --git a/test/MC/COFF/eh-frame.s b/test/MC/COFF/eh-frame.s
index e606b764d64759fc9fa6e720287facba4fed8e46..2c5ff202dc6c657ba171ef8d69cbcdf6637dd8a4 100644
--- a/test/MC/COFF/eh-frame.s
+++ b/test/MC/COFF/eh-frame.s
@@ -12,3 +12,16 @@ _main:
 	.cfi_endproc
 
 // CHECK:    Name: .eh_frame
+// CHECK-NEXT:    VirtualSize:
+// CHECK-NEXT:    VirtualAddress:
+// CHECK-NEXT:    RawDataSize:
+// CHECK-NEXT:    PointerToRawData:
+// CHECK-NEXT:    PointerToRelocations:
+// CHECK-NEXT:    PointerToLineNumbers:
+// CHECK-NEXT:    RelocationCount:
+// CHECK-NEXT:    LineNumberCount:
+// CHECK-NEXT:    Characteristics [
+// CHECK-NEXT:      IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:      IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:      IMAGE_SCN_MEM_READ
+// CHECK-NEXT:    ]
diff --git a/test/MC/Disassembler/AArch64/armv8.2a-dotprod.txt b/test/MC/Disassembler/AArch64/armv8.2a-dotprod.txt
index abcd8bb3a7bdf627e858881db88e390231df3339..90f3df6d10735448b45f49ca8d607d322b596cd7 100644
--- a/test/MC/Disassembler/AArch64/armv8.2a-dotprod.txt
+++ b/test/MC/Disassembler/AArch64/armv8.2a-dotprod.txt
@@ -1,6 +1,7 @@
 # RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+dotprod --disassemble < %s | FileCheck %s
 # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=cortex-a75 --disassemble < %s | FileCheck %s
 # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=cortex-a55 --disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=tsv110 --disassemble < %s | FileCheck %s
 # RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=-dotprod --disassemble < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
 
 0x20,0x94,0x82,0x2e
diff --git a/test/MC/Disassembler/AArch64/armv8.5a-specrestrict.txt b/test/MC/Disassembler/AArch64/armv8.5a-specrestrict.txt
index c26ab94630ae5a360c9d2529705de284490b7044..3a7af1f0822df80b587fec851e31fca355039d71 100644
--- a/test/MC/Disassembler/AArch64/armv8.5a-specrestrict.txt
+++ b/test/MC/Disassembler/AArch64/armv8.5a-specrestrict.txt
@@ -40,13 +40,3 @@
 # NOSPECID: msr S3_4_C13_C0_7, x6
 # NOSPECID: msr S3_6_C13_C0_7, x5
 # NOSPECID: msr S3_5_C13_C0_7, x4
-
-[0x3f 0x41 0x03 0xd5]
-[0xc3 0x42 0x1b 0xd5]
-[0xc2 0x42 0x3b 0xd5]
-# CHECK:    msr SSBS, #1
-# CHECK:    msr SSBS, x3
-# CHECK:    mrs x2, SSBS
-# NOSPECID: msr S0_3_C4_C1_1, xzr
-# NOSPECID: msr S3_3_C4_C2_6, x3
-# NOSPECID: mrs x2, S3_3_C4_C2_6
diff --git a/test/MC/Disassembler/AArch64/armv8.5a-ssbs.txt b/test/MC/Disassembler/AArch64/armv8.5a-ssbs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7f0b24a311759d36a3bfd947f1fe2e18767f1b73
--- /dev/null
+++ b/test/MC/Disassembler/AArch64/armv8.5a-ssbs.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -triple=aarch64 -mattr=+ssbs  -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64 -mattr=+v8.5a -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64 -mattr=-ssbs  -disassemble < %s | FileCheck %s --check-prefix=NOSPECID
+
+[0x3f 0x41 0x03 0xd5]
+[0xc3 0x42 0x1b 0xd5]
+[0xc2 0x42 0x3b 0xd5]
+# CHECK:    msr SSBS, #1
+# CHECK:    msr SSBS, x3
+# CHECK:    mrs x2, SSBS
+# NOSPECID: msr S0_3_C4_C1_1, xzr
+# NOSPECID: msr S3_3_C4_C2_6, x3
+# NOSPECID: mrs x2, S3_3_C4_C2_6
diff --git a/test/MC/Disassembler/RISCV/invalid-instruction.txt b/test/MC/Disassembler/RISCV/invalid-instruction.txt
index 1bf033cbe3e6f94a8062370442f165766860f1af..79e73e243d18bbfec6e429310d64e7bc085e949d 100644
--- a/test/MC/Disassembler/RISCV/invalid-instruction.txt
+++ b/test/MC/Disassembler/RISCV/invalid-instruction.txt
@@ -4,10 +4,6 @@
 # Test generated by a LLVM MC Disassembler Protocol Buffer Fuzzer
 # for the RISC-V assembly language.
 
-# This should not decode as c.addi4spn with 0 imm when compression is enabled.
-[0x00 0x00]
-# CHECK: warning: invalid instruction encoding
-
 # This should not decode as c.addi16sp with 0 imm when compression is enabled.
 [0x01 0x61]
 # CHECK: warning: invalid instruction encoding
diff --git a/test/MC/Disassembler/Sparc/sparc-v9.txt b/test/MC/Disassembler/Sparc/sparc-v9.txt
index 0a81b8df4cdaafaccf9ddd299900a5b362ef6395..8f68513387e09daf1993b652c86a9c2c47a3abc9 100644
--- a/test/MC/Disassembler/Sparc/sparc-v9.txt
+++ b/test/MC/Disassembler/Sparc/sparc-v9.txt
@@ -115,4 +115,19 @@
 0x9f 0xd0 0x30 0x52
 
 # CHECK: tvs %xcc, %g1 + %i2   
-0x8f 0xd0 0x50 0x1a
\ No newline at end of file
+0x8f 0xd0 0x50 0x1a
+
+# CHECK: membar 5000
+0x81 0x43 0xf3 0x88
+
+# CHECK: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore
+0x81 0x43 0xe0 0x0f
+
+# CHECK: membar #LoadLoad
+0x81 0x43 0xe0 0x01
+
+# CHECK: membar #LoadLoad | #StoreStore
+0x81 0x43 0xe0 0x09
+
+# CHECK: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore | #Lookaside | #MemIssue | #Sync
+0x81 0x43 0xe0 0x7f
diff --git a/test/MC/Disassembler/WebAssembly/wasm.txt b/test/MC/Disassembler/WebAssembly/wasm.txt
index 785c57826a2c27c0f6c420f228bcd9e7293b6a14..fd21db9ad471564664ccd7a3a058c37e07e8ba52 100644
--- a/test/MC/Disassembler/WebAssembly/wasm.txt
+++ b/test/MC/Disassembler/WebAssembly/wasm.txt
@@ -33,7 +33,16 @@
 
 # v128.const is arbitrarily disassembled as v16i8
 # CHECK: v128.const 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-0xFD 0x00 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0A 0x0B 0x0C 0x0D 0x0E 0x0F
+0xFD 0x02 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0A 0x0B 0x0C 0x0D 0x0E 0x0F
 
 # CHECK: v8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-0xFD 0x17 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0A 0x0B 0x0C 0x0D 0x0E 0x0F
+0xFD 0x03 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0A 0x0B 0x0C 0x0D 0x0E 0x0F
+
+# Check LEB128 encoding of SIMD instructions
+# CHECK: i64x2.all_true
+0xFD 0x86 0x01
+
+# Including non-canonical LEB128 encodings
+# CHECK: i64x2.any_true
+# CHECK-NOT: i64.div_u
+0xFD 0x85 0x81 0x80 0x80 0x80 0x80 0x00
diff --git a/test/MC/ELF/cfi-restore-extended.s b/test/MC/ELF/cfi-restore-extended.s
new file mode 100644
index 0000000000000000000000000000000000000000..e7371089934d9911751a70416037cde518c814f8
--- /dev/null
+++ b/test/MC/ELF/cfi-restore-extended.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-dwarfdump -debug-frame - | FileCheck %s
+
+// Check that register numbers greater than 63 can be used in .cfi_restore directives
+f:
+  .cfi_startproc
+  nop
+// CHECK: DW_CFA_advance_loc: 1
+  .cfi_restore %rbp
+// CHECK-NEXT: DW_CFA_restore: reg6
+  nop
+// CHECK-NEXT: DW_CFA_advance_loc: 1
+  .cfi_restore 89
+// CHECK-NEXT: DW_CFA_restore_extended: reg89
+// CHECK-NEXT: DW_CFA_nop:
+  nop
+  .cfi_endproc
+
diff --git a/test/MC/ELF/common-redeclare.s b/test/MC/ELF/common-redeclare.s
index f8ee17d84e2ed827c2ba116a7fff32605d488c48..66de9b0ec59ef5cc477a9c846c7cd17ad5a8424a 100644
--- a/test/MC/ELF/common-redeclare.s
+++ b/test/MC/ELF/common-redeclare.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux %s | llvm-objdump -t - | FileCheck %s
 
-# CHECK: 0000000000000004 g       *COM*  00000004 C
+# CHECK: 0000000000000004 g     O *COM*  00000004 C
+        .comm   C,4,4
         .comm   C,4,4
-        .comm   C,4,4
\ No newline at end of file
diff --git a/test/MC/ELF/undefined-debug.s b/test/MC/ELF/undefined-debug.s
new file mode 100644
index 0000000000000000000000000000000000000000..95ead70ef97149feb1f83fe2aa480689bc921b6c
--- /dev/null
+++ b/test/MC/ELF/undefined-debug.s
@@ -0,0 +1,5 @@
+// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t 2>&1 | FileCheck %s
+// CHECK: error: Undefined section reference: .debug_pubnames
+
+.section .foo,"",@progbits
+  .long  .debug_pubnames
diff --git a/test/MC/Hexagon/common-redeclare.s b/test/MC/Hexagon/common-redeclare.s
index 52b77992a871d64fc0159c27377e5cef075e09a0..2b92bf763e344ad78b54fe90a99dfd982e3dacfb 100644
--- a/test/MC/Hexagon/common-redeclare.s
+++ b/test/MC/Hexagon/common-redeclare.s
@@ -1,6 +1,6 @@
 # RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -t - | FileCheck %s
 
-# CHECK: 00000062 g       *COM*  00000008 quartet_table_isqrt
+# CHECK: 00000062 g     O *COM*           00000008 quartet_table_isqrt
 
 .common quartet_table_isqrt, 98, 8
 .common quartet_table_isqrt, 98, 8
diff --git a/test/MC/Hexagon/gprel-shflag.s b/test/MC/Hexagon/gprel-shflag.s
new file mode 100644
index 0000000000000000000000000000000000000000..6a8a9a027b69175cd5a04c3c9b13955dd0f5ab5c
--- /dev/null
+++ b/test/MC/Hexagon/gprel-shflag.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -arch=hexagon -filetype=asm %s | FileCheck %s
+
+# Make sure the assembler can parse and print the "s" flag for Hexaon's
+# small-data section.
+# CHECK: .section .sdata,"aws",@progbits
+
+  .section ".sdata", "aws", @progbits
+var:
+  .word 0
+
diff --git a/test/MC/Hexagon/lcomm.s b/test/MC/Hexagon/lcomm.s
index ba44bc842c7b01594716262f77d2be96d4eeab42..78a7e89a47c26777c62fc2184598922879497fff 100644
--- a/test/MC/Hexagon/lcomm.s
+++ b/test/MC/Hexagon/lcomm.s
@@ -9,11 +9,11 @@ r0 = add(pc, ##dst2@PCREL)
 r0 = add(pc, ##dst4@PCREL)
 r0 = add(pc, ##dst8@PCREL)
 
-# CHECK: 00000000 l       .sbss.1                 00000001 dst1
-# CHECK: 00000000 l       .sbss.2                 00000002 dst2
-# CHECK: 00000000 l       .sbss.4                 00000004 dst4
-# CHECK: 00000000 l       .sbss.8                 00000008 dst8
+# CHECK: 00000000 l     O .sbss.1                 00000001 dst1
+# CHECK: 00000000 l     O .sbss.2                 00000002 dst2
+# CHECK: 00000000 l     O .sbss.4                 00000004 dst4
+# CHECK: 00000000 l     O .sbss.8                 00000008 dst8
 # CHECK: 00000000 l    d  .sbss.1                 00000000 .sbss.1
 # CHECK: 00000000 l    d  .sbss.2                 00000000 .sbss.2
 # CHECK: 00000000 l    d  .sbss.4                 00000000 .sbss.4
-# CHECK: 00000000 l    d  .sbss.8                 00000000 .sbss.8
\ No newline at end of file
+# CHECK: 00000000 l    d  .sbss.8                 00000000 .sbss.8
diff --git a/test/MC/Hexagon/quad_regs.s b/test/MC/Hexagon/quad_regs.s
new file mode 100644
index 0000000000000000000000000000000000000000..6805f3bdb35a401eed9d3a5b94b987795c24cdb5
--- /dev/null
+++ b/test/MC/Hexagon/quad_regs.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv66 -mhvx -filetype=obj %s | llvm-objdump -mcpu=hexagonv66 -mhvx -d - | FileCheck %s
+
+# Test for quad register parsing and printing
+# CHECK: { v3:0.w = vrmpyz(v0.b,r0.b) }
+v3:0.w = vrmpyz(v0.b,r0.b)
diff --git a/test/MC/Hexagon/v66.s b/test/MC/Hexagon/v66.s
new file mode 100644
index 0000000000000000000000000000000000000000..465345fb35e77165fc729c603f40e0b576ed64a9
--- /dev/null
+++ b/test/MC/Hexagon/v66.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv66 -mhvx -filetype=obj %s | llvm-objdump -mcpu=hexagonv66 -mhvx -d - | FileCheck %s
+
+# CHECK: 1d8362e4 { v4.w = vsatdw(v2.w,v3.w)
+{
+  v4.w = vsatdw(v2.w, v3.w)
+  vmem(r16+#0) = v4.new
+}
+
+# CHECK: 1aaae5e0 { v1:0.w = vasrinto(v5.w,v10.w) }
+  v1:0.w = vasrinto(v5.w, v10.w)
+
+# CHECK: 1aaae5e0 { v1:0.w = vasrinto(v5.w,v10.w) }
+  v1:0 = vasrinto(v5, v10)
+
+# CHECK: 1d89ef14 { v20.w = vadd(v15.w,v9.w,q0):carry:sat }
+  v20.w = vadd(v15.w, v9.w, q0):carry:sat
+
diff --git a/test/MC/Hexagon/z-instructions.s b/test/MC/Hexagon/z-instructions.s
new file mode 100644
index 0000000000000000000000000000000000000000..2a2482e17d9e62b43a9280331a8b59a74838fc63
--- /dev/null
+++ b/test/MC/Hexagon/z-instructions.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv66 -mhvx -filetype=obj %s | llvm-objdump -mcpu=hexagonv66 -mhvx -d - | FileCheck --implicit-check-not='{' %s
+
+# CHECK:      2d00c000 { z = vmem(r0++#0) }
+z = vmem(r0++#0)
+
+# CHECK-NEXT: 2c00c000 { z = vmem(r0+#0) }
+z = vmem(r0+#0)
+
+# CHECK-NEXT: 2d00c001 { z = vmem(r0++m0) }
+z = vmem(r0++m0)
+
+# CHECK-NEXT: { v3:0.w += vrmpyz(v13.b,r3.b++)
+# CHECK-NEXT:   v13.tmp = vmem(r2++#1)
+# CHECK-NEXT:   z = vmem(r3+#0) }
+{ v13.tmp = vmem(r2++#1)
+  v3:0.w += vrmpyz(v13.b,r3.b++)
+  z = vmem(r3+#0) }
diff --git a/test/MC/MSP430/refsym.s b/test/MC/MSP430/refsym.s
new file mode 100644
index 0000000000000000000000000000000000000000..684894ca99200c5fa1c0314b63e8ee3fc3ffb875
--- /dev/null
+++ b/test/MC/MSP430/refsym.s
@@ -0,0 +1,14 @@
+# RUN: llvm-mc -filetype=obj -triple=msp430 %s | llvm-readobj -t - | FileCheck %s
+
+foo:
+  .refsym __hook
+
+; CHECK:       Symbol {
+; CHECK:         Name: __hook (30)
+; CHECK-NEXT:    Value: 0x0
+; CHECK-NEXT:    Size: 0
+; CHECK-NEXT:    Binding: Global (0x1)
+; CHECK-NEXT:    Type: None (0x0)
+; CHECK-NEXT:    Other: 0
+; CHECK-NEXT:    Section: Undefined (0x0)
+; CHECK-NEXT:  }
diff --git a/test/MC/MSP430/reloc.s b/test/MC/MSP430/reloc.s
index 42dd64a43c555d850874d39badcb2d885331818a..61a25f494735e30085bbc04947da645020232384 100644
--- a/test/MC/MSP430/reloc.s
+++ b/test/MC/MSP430/reloc.s
@@ -20,3 +20,8 @@
          jmp    foo
 ; CHECK: jmp    foo             ; encoding: [A,0b001111AA]
 ; CHECK:                        ;   fixup A - offset: 0, value: foo, kind: fixup_10_pcrel
+
+; RUN: llvm-mc -filetype=obj -triple msp430 < %s | llvm-readobj -r \
+; RUN:   | FileCheck -check-prefix=RELOC %s
+.short  _ctype+3
+; RELOC: R_MSP430_16_BYTE _ctype 0x3
diff --git a/test/MC/MachO/ARM/bad-darwin-ARM-offset-scattered.s b/test/MC/MachO/ARM/bad-darwin-ARM-offset-scattered.s
new file mode 100644
index 0000000000000000000000000000000000000000..e5b279a8af7eb940cf8deb47a39497b41fa1774d
--- /dev/null
+++ b/test/MC/MachO/ARM/bad-darwin-ARM-offset-scattered.s
@@ -0,0 +1,15 @@
+@ RUN: not llvm-mc -n -triple armv7-apple-darwin10 %s -filetype=obj -o - 2> %t.err > %t
+@ RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s
+
+.text
+.space 0x1029eb8
+
+fn:
+    movw  r0, :lower16:(fn2-L1)
+    andeq r0, r0, r0
+L1:
+    andeq r0, r0, r0
+
+fn2:
+
+@ CHECK-ERROR: error: can not encode offset '0x1029EB8' in resulting scattered relocation.
diff --git a/test/MC/MachO/ARM/build-version-sdk-version-errors.c b/test/MC/MachO/ARM/build-version-sdk-version-errors.c
new file mode 100644
index 0000000000000000000000000000000000000000..bd568d58129c5e68a973adcead7ce7ca50f5031a
--- /dev/null
+++ b/test/MC/MachO/ARM/build-version-sdk-version-errors.c
@@ -0,0 +1,19 @@
+// RUN: not llvm-mc -triple x86_64-apple-macos %s 2>&1 | FileCheck %s
+
+.build_version macos,3,4,5 sdk_version
+// CHECK: invalid SDK major version number, integer expected
+
+.build_version macos,3,4,5 sdk_version 10
+// CHECK: SDK minor version number required, comma expected
+
+.build_version macos,3,4,5 sdk_version 10,
+// CHECK: invalid SDK minor version number, integer expected
+
+.build_version macos,3,4,5 sdk_version 10,1,
+// CHECK: invalid SDK subminor version number, integer expected
+
+.build_version macos,3,4,5 sdk_version 10,10000
+// CHECK: invalid SDK minor version number
+
+.build_version macos,3,4,5 sdk_version 10,255,10000
+// CHECK: invalid SDK subminor version number
diff --git a/test/MC/MachO/ARM/build-version-sdk-version.s b/test/MC/MachO/ARM/build-version-sdk-version.s
new file mode 100644
index 0000000000000000000000000000000000000000..98f6d4d7bd2da69b8701e4404c0181a93d4ab369
--- /dev/null
+++ b/test/MC/MachO/ARM/build-version-sdk-version.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple x86_64-apple-macos %s | FileCheck %s
+
+.build_version macos,3,4,5 sdk_version 10,14
+// CHECK: .build_version macos, 3, 4, 5 sdk_version 10, 14
+
+.build_version ios,6,7 sdk_version 6,1,0
+// CHECK: .build_version ios, 6, 7 sdk_version 6, 1, 0
+
+.build_version tvos,8,9 sdk_version 9,0,10
+// CHECK: .build_version tvos, 8, 9 sdk_version 9, 0, 10
+
+.build_version watchos,10,11 sdk_version 10,11
+// CHECK: .build_version watchos, 10, 11 sdk_version 10, 11
diff --git a/test/MC/MachO/build-version-with-sdk-version.s b/test/MC/MachO/build-version-with-sdk-version.s
new file mode 100644
index 0000000000000000000000000000000000000000..e96c2d520e172045d6defa13c13168e3d6936886
--- /dev/null
+++ b/test/MC/MachO/build-version-with-sdk-version.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple x86_64-apple-macos %s -filetype=obj -o - | llvm-readobj --macho-version-min | FileCheck %s
+
+// Test the formation of the sdk_version component of the version load
+// command in the MachO.
+.build_version macos, 10,13,2 sdk_version 10,14,1
+
+// CHECK: MinVersion {
+// CHECK:   Cmd: LC_BUILD_VERSION
+// CHECK:   Size: 24
+// CHECK:   Platform: macos
+// CHECK:   Version: 10.13.2
+// CHECK:   SDK: 10.14.1
+// CHECK: }
diff --git a/test/MC/MachO/cstexpr-gotpcrel-32.ll b/test/MC/MachO/cstexpr-gotpcrel-32.ll
index 8e00b4ace067397268c7b69e8d4265157bd1053e..9b08ee7b867768bc9d4863404d5be5d1ff3a221c 100644
--- a/test/MC/MachO/cstexpr-gotpcrel-32.ll
+++ b/test/MC/MachO/cstexpr-gotpcrel-32.ll
@@ -72,3 +72,24 @@ define i32 @t0(i32 %a) {
                     i32 ptrtoint (i32 (i32)* @t0 to i32)), %a
   ret i32 %x
 }
+
+; Text indirect local symbols.
+; CHECK-LABEL: _localindirect
+; CHECK: .long 65603
+@localindirect = internal constant i32  65603
+@got.localindirect = private unnamed_addr constant i32* @localindirect
+
+; CHECK-LABEL: _localindirectuser:
+; CHECK: .long   L_localindirect$non_lazy_ptr-_localindirectuser
+@localindirectuser = internal constant
+  i32 sub (i32 ptrtoint (i32** @got.localindirect to i32),
+           i32 ptrtoint (i32* @localindirectuser to i32))
+
+; CHECK-LABEL: L_localindirect$non_lazy_ptr:
+; CHECK: .indirect_symbol _localindirect
+; CHECK-NOT: .long 0
+; CHECK-NEXT: .long _localindirect
+define i8* @testRelativeIndirectSymbol() {
+  %1 = bitcast i32* @localindirectuser to i8*
+  ret i8* %1
+}
diff --git a/test/MC/MachO/darwin-sdk-version.ll b/test/MC/MachO/darwin-sdk-version.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5052e058eadd49b799056fbdf94234ae6675c2af
--- /dev/null
+++ b/test/MC/MachO/darwin-sdk-version.ll
@@ -0,0 +1,18 @@
+; RUN: llc %s -filetype=obj -o - | llvm-objdump -macho -private-headers - | FileCheck %s
+; RUN: llc %s -filetype=asm -o - | FileCheck --check-prefix=ASM %s
+
+target triple = "x86_64-apple-macos10.14";
+!llvm.module.flags = !{!0};
+!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 14, i32 2 ] };
+
+define void @foo() {
+entry:
+  ret void
+}
+
+; CHECK:           cmd LC_VERSION_MIN_MACOSX
+; CHECK-NEXT:  cmdsize 16
+; CHECK-NEXT:  version 10.14
+; CHECK-NEXT:      sdk 10.14.2
+
+; ASM: .macosx_version_min 10, 14 sdk_version 10, 14, 2
diff --git a/test/MC/MachO/osx-version-min-load-command-with-sdk-errors.s b/test/MC/MachO/osx-version-min-load-command-with-sdk-errors.s
new file mode 100644
index 0000000000000000000000000000000000000000..9547d07e3192f5bb7090e0ca2bc8f1dfef8f41b9
--- /dev/null
+++ b/test/MC/MachO/osx-version-min-load-command-with-sdk-errors.s
@@ -0,0 +1,10 @@
+// RUN: not llvm-mc -triple x86_64-apple-macos %s -o - 2>&1 | FileCheck %s
+
+.macosx_version_min 10,13,2 sdk_version 10
+// CHECK: SDK minor version number required, comma expected
+
+.macosx_version_min 10,13,2 sdk_version 10,
+// CHECK: invalid SDK minor version number, integer expected
+
+.macosx_version_min 10,13,2 sdk_version 10
+// CHECK: SDK minor version number required, comma expected
diff --git a/test/MC/MachO/osx-version-min-load-command-with-sdk.s b/test/MC/MachO/osx-version-min-load-command-with-sdk.s
new file mode 100644
index 0000000000000000000000000000000000000000..d2ad463d1d6e565902b99331d0c0de5a10cb2f36
--- /dev/null
+++ b/test/MC/MachO/osx-version-min-load-command-with-sdk.s
@@ -0,0 +1,12 @@
+// RUN: llvm-mc -triple x86_64-apple-macos %s -filetype=obj -o - | llvm-readobj --macho-version-min | FileCheck %s
+
+// Test the formation of the sdk_version component of the version load
+// command in the MachO.
+.macosx_version_min 10,13,2 sdk_version 10,14
+
+// CHECK: MinVersion {
+// CHECK:   Cmd: LC_VERSION_MIN_MACOSX
+// CHECK:   Size: 16
+// CHECK:   Version: 10.13.2
+// CHECK:   SDK: 10.14
+// CHECK: }
diff --git a/test/MC/Mips/cpload.s b/test/MC/Mips/cpload.s
index 3bbad60e578dd9c0b8accec80ce9c8658dae3db8..2c09fa2bbeca09e90b4ad9b37670c50a6e6634df 100644
--- a/test/MC/Mips/cpload.s
+++ b/test/MC/Mips/cpload.s
@@ -1,13 +1,17 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=ASM
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r2 \
+# RUN:   | FileCheck %s -check-prefix=ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -filetype=obj -o -| \
-# RUN:  llvm-objdump -d -r - | FileCheck %s -check-prefix=OBJ-O32
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r2 \
+# RUN:            -filetype=obj -o - \
+# RUN:  | llvm-objdump -d -r - | FileCheck %s -check-prefix=OBJ-O32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -target-abi n32 -filetype=obj -o -| \
-# RUN:  llvm-objdump -d -r - | FileCheck %s -check-prefix=OBJ-N32
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -mcpu=mips64r2 \
+# RUN:            -filetype=obj -o - \
+# RUN:  | llvm-objdump -d -r - | FileCheck %s -check-prefix=OBJ-N32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -filetype=obj -o -| \
-# RUN:  llvm-objdump -d -r - | FileCheck %s -check-prefix=OBJ-N64
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu -mcpu=mips64r2 \
+# RUN:            -filetype=obj -o - \
+# RUN:  | llvm-objdump -d -r - | FileCheck %s -check-prefix=OBJ-N64
 
 # ASM:    .text
 # ASM:    .option pic2
diff --git a/test/MC/Mips/cprestore-noreorder-noat.s b/test/MC/Mips/cprestore-noreorder-noat.s
index 07c4dd2ea358d6837ee9551a3f558ab946be536c..cba299eaf6abe65b997deb3f969f394bd751f7e6 100644
--- a/test/MC/Mips/cprestore-noreorder-noat.s
+++ b/test/MC/Mips/cprestore-noreorder-noat.s
@@ -1,18 +1,20 @@
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips32 --position-independent -filetype=obj \
-# RUN:   -o /dev/null 2>&1 | FileCheck %s -check-prefix=O32
+# RUN: not llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32 \
+# RUN:                --position-independent -filetype=obj -o /dev/null 2>&1 \
+# RUN:  | FileCheck %s -check-prefix=O32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 \
-# RUN:   -filetype=obj -o /dev/null 2>&1 | FileCheck %s -allow-empty -check-prefix=N32
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu -filetype=obj \
+# RUN:            -o /dev/null 2>&1 \
+# RUN:   | FileCheck %s -allow-empty -check-prefix=N64
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi=n32 \
-# RUN:   -filetype=obj -o /dev/null 2>&1 | FileCheck %s -allow-empty -check-prefix=N64
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -filetype=obj \
+# RUN:            -o /dev/null 2>&1 \
+# RUN:   | FileCheck %s -allow-empty -check-prefix=N32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi=n32 \
-# RUN:   -filetype=obj -o - | llvm-objdump -d -r - | \
-# RUN:   FileCheck %s -check-prefix=NO-STORE
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -filetype=obj -o - \
+# RUN:   | llvm-objdump -d -r - | FileCheck %s -check-prefix=NO-STORE
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -filetype=obj -o - | \
-# RUN:   llvm-objdump -d -r - | FileCheck %s -check-prefix=NO-STORE
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu -filetype=obj -o - \
+# RUN:   | llvm-objdump -d -r - | FileCheck %s -check-prefix=NO-STORE
 
   .text
   .ent foo
diff --git a/test/MC/Mips/cprestore-noreorder.s b/test/MC/Mips/cprestore-noreorder.s
index cce43d333fae002faaba121c7945fc7918fb6e53..cd7a8c0b84e5a66cfaa27d1307f07d8c59bc8a23 100644
--- a/test/MC/Mips/cprestore-noreorder.s
+++ b/test/MC/Mips/cprestore-noreorder.s
@@ -1,20 +1,24 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 --position-independent -show-encoding | \
-# RUN:  FileCheck %s
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu --position-independent \
+# RUN:            -show-encoding | FileCheck %s
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 --position-independent -filetype=obj -o -| \
-# RUN:  llvm-objdump -d -r - | FileCheck %s -check-prefix=CHECK-FOR-STORE
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu --position-independent \
+# RUN:            -filetype=obj -o - \
+# RUN:   | llvm-objdump -d -r - | FileCheck %s -check-prefix=CHECK-FOR-STORE
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+micromips --position-independent -show-encoding | \
-# RUN:  FileCheck %s -check-prefix=MICROMIPS
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+micromips \
+# RUN:            --position-independent -show-encoding \
+# RUN:   | FileCheck %s -check-prefix=MICROMIPS
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -show-encoding | \
-# RUN:  FileCheck %s -check-prefix=NO-PIC
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -show-encoding \
+# RUN:   | FileCheck %s -check-prefix=NO-PIC
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 --position-independent -show-encoding | \
-# RUN:  FileCheck %s -check-prefixes=BAD-ABI,BAD-ABI-N32
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 \
+# RUN:            --position-independent -show-encoding \
+# RUN:   | FileCheck %s -check-prefixes=BAD-ABI,BAD-ABI-N32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 --position-independent -show-encoding | \
-# RUN:  FileCheck %s -check-prefixes=BAD-ABI,BAD-ABI-N64
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu --position-independent \
+# RUN:            -show-encoding \
+# RUN:   | FileCheck %s -check-prefixes=BAD-ABI,BAD-ABI-N64
 
   .text
   .ent foo
diff --git a/test/MC/Mips/cprestore-reorder.s b/test/MC/Mips/cprestore-reorder.s
index 885977992978c7f2721e3e64124176656f187c8e..9666969b015942ab421290262ead7fa43fe5ba58 100644
--- a/test/MC/Mips/cprestore-reorder.s
+++ b/test/MC/Mips/cprestore-reorder.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 --position-independent -show-encoding | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu --position-independent -show-encoding | \
 # RUN:  FileCheck %s
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 --position-independent -filetype=obj -o -| \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu --position-independent -filetype=obj -o -| \
 # RUN:  llvm-objdump -d -r - | FileCheck %s -check-prefix=CHECK-FOR-STORE
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+micromips --position-independent -show-encoding | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+micromips --position-independent -show-encoding | \
 # RUN:  FileCheck %s -check-prefix=MICROMIPS
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -show-encoding | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -show-encoding | \
 # RUN:  FileCheck %s -check-prefix=NO-PIC
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 --position-independent -show-encoding | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 --position-independent -show-encoding | \
 # RUN:  FileCheck %s -check-prefixes=BAD-ABI,BAD-ABI-N32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 --position-independent -show-encoding | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu --position-independent -show-encoding | \
 # RUN:  FileCheck %s -check-prefixes=BAD-ABI,BAD-ABI-N64
 
   .text
diff --git a/test/MC/Mips/cprestore-warning-unused.s b/test/MC/Mips/cprestore-warning-unused.s
index e4999b48fb994c44dd82de167da12db68c421580..5a80970836253a01ec7c5b9e4756084b67dd439e 100644
--- a/test/MC/Mips/cprestore-warning-unused.s
+++ b/test/MC/Mips/cprestore-warning-unused.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 --position-independent 2>%t1
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu --position-independent 2>%t1
 # RUN: FileCheck %s < %t1
 
   .text
diff --git a/test/MC/Mips/eh-frame.s b/test/MC/Mips/eh-frame.s
index 91032572fb16068b9d994523031bb33df2eaec02..e03027a20a4fddf3d44f61f8ae4a4eb847a4c9de 100644
--- a/test/MC/Mips/eh-frame.s
+++ b/test/MC/Mips/eh-frame.s
@@ -1,18 +1,18 @@
 // Test the bits of .eh_frame on mips that are already implemented correctly.
 
-// RUN: llvm-mc -filetype=obj %s -o %t.o -arch=mips
+// RUN: llvm-mc -filetype=obj %s -o %t.o -triple mips-unknown-linux-gnu
 // RUN: llvm-objdump -r -section=.rel.eh_frame %t.o | FileCheck --check-prefix=REL32 %s
 // RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefix=DWARF32 %s
 
-// RUN: llvm-mc -filetype=obj %s -o %t.o -arch=mipsel
+// RUN: llvm-mc -filetype=obj %s -o %t.o -triple mipsel-unknown-linux-gnu
 // RUN: llvm-objdump -r -section=.rel.eh_frame %t.o | FileCheck --check-prefix=REL32 %s
 // RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefix=DWARF32 %s
 
-// RUN: llvm-mc -filetype=obj %s -o %t.o -arch=mips64
+// RUN: llvm-mc -filetype=obj %s -o %t.o -triple mips64-unknown-linux-gnu
 // RUN: llvm-objdump -r -section=.rela.eh_frame %t.o | FileCheck --check-prefix=REL64 %s
 // RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefix=DWARF64 %s
 
-// RUN: llvm-mc -filetype=obj %s -o %t.o -arch=mips64el
+// RUN: llvm-mc -filetype=obj %s -o %t.o -triple mips64el-unknown-linux-gnu
 // RUN: llvm-objdump -r -section=.rela.eh_frame %t.o | FileCheck --check-prefix=REL64 %s
 // RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefix=DWARF64 %s
 
diff --git a/test/MC/Mips/expansion-j-sym-pic.s b/test/MC/Mips/expansion-j-sym-pic.s
index 6bb67a87eda62ea296eaa2fe1e3774672fbee4bf..b04355593c66d710cd84af895d9d5b1a88196c5a 100644
--- a/test/MC/Mips/expansion-j-sym-pic.s
+++ b/test/MC/Mips/expansion-j-sym-pic.s
@@ -1,22 +1,22 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -show-encoding |\
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -show-encoding |\
 # RUN:   FileCheck %s -check-prefix=NORMAL
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 -show-encoding |\
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -show-encoding |\
 # RUN:   FileCheck %s -check-prefix=NORMAL
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 -show-encoding |\
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu -show-encoding |\
 # RUN:   FileCheck %s -check-prefix=NORMAL
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=micromips -show-encoding |\
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=micromips -show-encoding |\
 # RUN:   FileCheck %s -check-prefix=MICRO
 
 # Repeat the tests using ELF output.
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj | \
 # RUN:   llvm-objdump -d -r - | FileCheck %s -check-prefixes=ELF-O32
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 -filetype=obj | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -filetype=obj | \
 # RUN:   llvm-objdump -d -r - | FileCheck %s -check-prefixes=ELF-NXX,ELF-N32
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 -filetype=obj | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu -filetype=obj | \
 # RUN:   llvm-objdump -d -r - | FileCheck %s -check-prefixes=ELF-NXX,ELF-N64
 
   .weak weak_label
diff --git a/test/MC/Mips/expansion-jal-sym-pic.s b/test/MC/Mips/expansion-jal-sym-pic.s
index dddd7aa275f340fcde4d1615daa7236cb2215794..e20562a88a5f70f29c9b18c60288315a54827e06 100644
--- a/test/MC/Mips/expansion-jal-sym-pic.s
+++ b/test/MC/Mips/expansion-jal-sym-pic.s
@@ -1,13 +1,13 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -show-encoding |\
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -show-encoding |\
 # RUN:   FileCheck %s -check-prefixes=ALL,MIPS,O32
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 -show-encoding |\
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -show-encoding |\
 # RUN:   FileCheck %s -check-prefixes=ALL,MIPS,N32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 -show-encoding |\
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu -show-encoding |\
 # RUN:   FileCheck %s -check-prefixes=ALL,MIPS,N64
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=micromips -show-encoding |\
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=micromips -show-encoding |\
 # RUN:   FileCheck %s -check-prefixes=ALL,MM,O32-MM
 
 # Repeat the tests but using ELF output. An initial version of this patch did
@@ -15,11 +15,11 @@
 # MCAsmStreamer or MCELFStreamer. This ensures that the assembly expansion and
 # direct objection emission match.
 
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj | \
 # RUN:   llvm-objdump -d -r - | FileCheck %s -check-prefixes=ELF-O32
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 -filetype=obj | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -filetype=obj | \
 # RUN:   llvm-objdump -d -r - | FileCheck %s -check-prefixes=ELF-N32
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 -filetype=obj | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu -filetype=obj | \
 # RUN:   llvm-objdump -d -r - | FileCheck %s -check-prefixes=ELF-N64
 
   .weak weak_label
@@ -40,27 +40,38 @@ local_label:
 # O32:                                      #   fixup A - offset: 0, value: %got(local_label), kind:   fixup_Mips_GOT
 # O32: addiu  $25, $25, %lo(local_label)    # encoding: [0x27,0x39,A,A]
 # O32:                                      #   fixup A - offset: 0, value: %lo(local_label), kind:   fixup_Mips_LO16
+# O32-NEXT: .reloc ($tmp0), R_MIPS_JALR, local_label
+
 # ELF-O32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-O32-NEXT:                 R_MIPS_GOT16 .text
 # ELF-O32-NEXT: 27 39 00 00 addiu $25, $25, 0
 # ELF-O32-NEXT:                 R_MIPS_LO16 .text
+# ELF-O32-NEXT: 03 20 f8 09 jalr $25
+# ELF-O32-NEXT:                 R_MIPS_JALR local_label
 
 # N32: lw  $25, %got_disp(local_label)($gp) # encoding: [0x8f,0x99,A,A]
 # N32:                                      #   fixup A - offset: 0, value: %got_disp(local_label), kind:   fixup_Mips_GOT_DISP
+# N32-NEXT: .reloc .Ltmp0, R_MIPS_JALR, local_label
 
 # ELF-N32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-N32-NEXT:                 R_MIPS_GOT_DISP local_label
+# ELF-N32-NEXT: 03 20 f8 09 jalr $25
+# ELF-N32-NEXT:                 R_MIPS_JALR local_label
 
 # N64: ld  $25, %got_disp(local_label)($gp) # encoding: [0xdf,0x99,A,A]
 # N64:                                      #   fixup A - offset: 0, value: %got_disp(local_label), kind:   fixup_Mips_GOT_DISP
+# N64-NEXT: .reloc .Ltmp0, R_MIPS_JALR, local_label
 
 # ELF-N64:      df 99 00 00 ld $25, 0($gp)
 # ELF-N64-NEXT:                 R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE local_label
+# ELF-N64-NEXT: 03 20 f8 09 jalr $25
+# ELF-N64-NEXT: R_MIPS_JALR/R_MIPS_NONE/R_MIPS_NONE local_label
 
 # O32-MM: lw    $25, %got(local_label)($gp)      # encoding: [0xff,0x3c,A,A]
 # O32-MM:                                        #   fixup A - offset: 0, value: %got(local_label), kind:   fixup_MICROMIPS_GOT16
 # O32-MM: addiu $25, $25, %lo(local_label)       # encoding: [0x33,0x39,A,A]
 # O32-MM:                                        #   fixup A - offset: 0, value: %lo(local_label), kind:   fixup_MICROMIPS_LO16
+# O32-MM-NEXT: .reloc ($tmp0), R_MICROMIPS_JALR, local_label
 
 # MIPS: jalr $25      # encoding: [0x03,0x20,0xf8,0x09]
 # MM:   jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c]
@@ -72,24 +83,34 @@ local_label:
 # Expanding "jal weak_label":
 # O32: lw  $25, %call16(weak_label)($gp) # encoding: [0x8f,0x99,A,A]
 # O32:                                   #   fixup A - offset: 0, value: %call16(weak_label), kind:   fixup_Mips_CALL16
+# O32-NEXT: .reloc ($tmp1), R_MIPS_JALR, weak_label
 
 # ELF-O32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-O32-NEXT:                 R_MIPS_CALL16 weak_label
+# ELF-O32-NEXT: 03 20 f8 09 jalr $25
+# ELF-O32-NEXT:                 R_MIPS_JALR weak_label
 
 # N32: lw  $25, %call16(weak_label)($gp) # encoding: [0x8f,0x99,A,A]
 # N32:                                   #   fixup A - offset: 0, value: %call16(weak_label), kind:   fixup_Mips_CALL16
+# N32-NEXT: .reloc .Ltmp1, R_MIPS_JALR, weak_label
 
 # ELF-N32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-N32-NEXT:                 R_MIPS_CALL16 weak_label
+# ELF-N32-NEXT: 03 20 f8 09 jalr $25
+# ELF-N32-NEXT:                 R_MIPS_JALR weak_label
 
 # N64: ld  $25, %call16(weak_label)($gp) # encoding: [0xdf,0x99,A,A]
 # N64:                                   #   fixup A - offset: 0, value: %call16(weak_label), kind:   fixup_Mips_CALL16
+# N64-NEXT: .reloc .Ltmp1, R_MIPS_JALR, weak_label
 
 # ELF-N64:      df 99 00 00 ld $25, 0($gp)
 # ELF-N64-NEXT:                 R_MIPS_CALL16/R_MIPS_NONE/R_MIPS_NONE weak_label
+# ELF-N64-NEXT: 03 20 f8 09 jalr $25
+# ELF-N64-NEXT:                 R_MIPS_JALR/R_MIPS_NONE/R_MIPS_NONE weak_label
 
 # O32-MM: lw  $25, %call16(weak_label)($gp) # encoding: [0xff,0x3c,A,A]
 # O32-MM:                                   #   fixup A - offset: 0, value: %call16(weak_label), kind:   fixup_MICROMIPS_CALL16
+# O32-MM-NEXT: .reloc ($tmp1), R_MICROMIPS_JALR, weak_label
 
 # MIPS: jalr $25      # encoding: [0x03,0x20,0xf8,0x09]
 # MM:   jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c]
@@ -101,24 +122,34 @@ local_label:
 # Expanding "jal global_label":
 # O32: lw  $25, %call16(global_label)($gp) # encoding: [0x8f,0x99,A,A]
 # O32:                                     #   fixup A - offset: 0, value: %call16(global_label), kind:   fixup_Mips_CALL16
+# O32-NEXT: .reloc ($tmp2), R_MIPS_JALR, global_label
 
 # ELF-O32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-O32-NEXT:                 R_MIPS_CALL16 global_label
+# ELF-O32-NEXT: 03 20 f8 09 jalr $25
+# ELF-O32-NEXT:                 R_MIPS_JALR global_label
 
 # N32: lw  $25, %call16(global_label)($gp) # encoding: [0x8f,0x99,A,A]
 # N32:                                     #   fixup A - offset: 0, value: %call16(global_label), kind:   fixup_Mips_CALL16
+# N32-NEXT: .reloc .Ltmp2, R_MIPS_JALR, global_label
 
 # ELF-N32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-N32-NEXT:                 R_MIPS_CALL16 global_label
+# ELF-N32-NEXT: 03 20 f8 09 jalr $25
+# ELF-N32-NEXT:                 R_MIPS_JALR global_label
 
 # N64: ld  $25, %call16(global_label)($gp) # encoding: [0xdf,0x99,A,A]
 # N64:                                     #   fixup A - offset: 0, value: %call16(global_label), kind:   fixup_Mips_CALL16
+# N64-NEXT: .reloc .Ltmp2, R_MIPS_JALR, global_label
 
 # ELF-N64:      df 99 00 00 ld $25, 0($gp)
 # ELF-N64-NEXT:                 R_MIPS_CALL16/R_MIPS_NONE/R_MIPS_NONE global_label
+# ELF-N64-NEXT: 03 20 f8 09 jalr $25
+# ELF-N64-NEXT:                 R_MIPS_JALR/R_MIPS_NONE/R_MIPS_NONE global_label
 
 # O32-MM: lw  $25, %call16(global_label)($gp) # encoding: [0xff,0x3c,A,A]
 # O32-MM:                                     #   fixup A - offset: 0, value: %call16(global_label), kind: fixup_MICROMIPS_CALL16
+# O32-MM-NEXT: .reloc ($tmp2), R_MICROMIPS_JALR, global_label
 
 # MIPS: jalr $25      # encoding: [0x03,0x20,0xf8,0x09]
 # MM:   jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c]
@@ -150,6 +181,7 @@ local_label:
 # O32-MM-NEXT:                                  #   fixup A - offset: 0, value: %got(.text), kind: fixup_MICROMIPS_GOT16
 # O32-MM-NEXT: addiu $25, $25, %lo(.text)       # encoding: [0x33,0x39,A,A]
 # O32-MM-NEXT:                                  #   fixup A - offset: 0, value: %lo(.text), kind: fixup_MICROMIPS_LO16
+# O32-MM-NEXT: .reloc ($tmp3), R_MICROMIPS_JALR, .text
 
 # MIPS: jalr $25      # encoding: [0x03,0x20,0xf8,0x09]
 # MM:   jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c]
@@ -160,32 +192,36 @@ local_label:
   nop
 
 # Expanding "jal 1f":
-# O32: lw     $25, %got($tmp0)($gp)   # encoding: [0x8f,0x99,A,A]
-# O32:                                #   fixup A - offset: 0, value: %got($tmp0), kind:   fixup_Mips_GOT
-# O32: addiu  $25, $25, %lo($tmp0)    # encoding: [0x27,0x39,A,A]
-# O32:                                #   fixup A - offset: 0, value: %lo($tmp0), kind:   fixup_Mips_LO16
+# O32: lw     $25, %got($tmp4)($gp)   # encoding: [0x8f,0x99,A,A]
+# O32:                                #   fixup A - offset: 0, value: %got($tmp4), kind:   fixup_Mips_GOT
+# O32: addiu  $25, $25, %lo($tmp4)    # encoding: [0x27,0x39,A,A]
+# O32:                                #   fixup A - offset: 0, value: %lo($tmp4), kind:   fixup_Mips_LO16
+# O32-NEXT: .reloc ($tmp5), R_MIPS_JALR, ($tmp4)
 
 # ELF-O32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-O32-NEXT:                 R_MIPS_GOT16 .text
 # ELF-O32-NEXT: 27 39 00 58 	addiu	$25, $25, 88
 # ELF-O32-NEXT:                 R_MIPS_LO16 .text
+# ELF-O32-NEXT: 03 20 f8 09 jalr $25
+# ELF-O32-NEXT:                 R_MIPS_JALR $tmp0
 
-# N32: lw  $25, %got_disp($tmp0)($gp) # encoding: [0x8f,0x99,A,A]
-# N32:                                #   fixup A - offset: 0, value: %got_disp($tmp0), kind:   fixup_Mips_GOT_DISP
+# N32: lw  $25, %got_disp(.Ltmp4)($gp) # encoding: [0x8f,0x99,A,A]
+# N32:                                #   fixup A - offset: 0, value: %got_disp(.Ltmp4), kind:   fixup_Mips_GOT_DISP
 
 # ELF-N32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-N32-NEXT:                 R_MIPS_GOT_DISP .Ltmp0
 
-# N64: ld  $25, %got_disp(.Ltmp0)($gp) # encoding: [0xdf,0x99,A,A]
-# N64:                                 #   fixup A - offset: 0, value: %got_disp(.Ltmp0), kind:   fixup_Mips_GOT_DISP
+# N64: ld  $25, %got_disp(.Ltmp4)($gp) # encoding: [0xdf,0x99,A,A]
+# N64:                                 #   fixup A - offset: 0, value: %got_disp(.Ltmp4), kind:   fixup_Mips_GOT_DISP
 
 # ELF-N64:      df 99 00 00 ld $25, 0($gp)
 # ELF-N64-NEXT:                 R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE .Ltmp0
 
-# O32-MM: lw    $25, %got($tmp0)($gp)    # encoding: [0xff,0x3c,A,A]
-# O32-MM:                                #   fixup A - offset: 0, value: %got($tmp0), kind: fixup_MICROMIPS_GOT16
-# O32-MM: addiu $25, $25, %lo($tmp0)     # encoding: [0x33,0x39,A,A]
-# O32-MM:                                #   fixup A - offset: 0, value: %lo($tmp0), kind: fixup_MICROMIPS_LO16
+# O32-MM: lw    $25, %got($tmp4)($gp)    # encoding: [0xff,0x3c,A,A]
+# O32-MM:                                #   fixup A - offset: 0, value: %got($tmp4), kind: fixup_MICROMIPS_GOT16
+# O32-MM: addiu $25, $25, %lo($tmp4)     # encoding: [0x33,0x39,A,A]
+# O32-MM:                                #   fixup A - offset: 0, value: %lo($tmp4), kind: fixup_MICROMIPS_LO16
+# O32-MM-NEXT: .reloc ($tmp5), R_MICROMIPS_JALR, ($tmp4)
 
 # MIPS: jalr $25      # encoding: [0x03,0x20,0xf8,0x09]
 # MM:   jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c]
@@ -200,11 +236,14 @@ local_label:
 # O32-FIXME:                                                         #   fixup A - offset: 0, value: %got(forward_local), kind:   fixup_Mips_GOT
 # O32-FIXME: addiu  $25, $25, %lo(forward_local)                     # encoding: [0x27,0x39,A,A]
 # O32-FIXME::                                                         #   fixup A - offset: 0, value: %lo(forward_local), kind:   fixup_Mips_LO16
+# O32-FIXME: .reloc ($tmp6), R_MIPS_JALR, forward_local
 
 # ELF-O32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-O32-NEXT:                 R_MIPS_GOT16 .text
 # ELF-O32-NEXT: 27 39 00 64 	addiu	$25, $25, 100
 # ELF-O32-NEXT:                 R_MIPS_LO16 .text
+# ELF-O32-NEXT: 03 20 f8 09 jalr $25
+# ELF-O32-NEXT:                 R_MIPS_JALR forward_local
 
 # N32-FIXME: lw  $25, %got_disp(forward_local)($gp)            # encoding: [0x8f,0x99,A,A]
 # N32-FIXME:                                                   #   fixup A - offset: 0, value: %got_disp(forward_local), kind:   fixup_Mips_GOT_DISP
@@ -222,6 +261,7 @@ local_label:
 # O32-MM-FIXME:                                                #   fixup A - offset: 0, value: %got(forward_local), kind:   fixup_MICROMIPS_GOT16
 # O32-MM-FIXME: addiu $25, $25, %lo(forward_local)             # encoding: [0x33,0x39,A,A]
 # O32-MM-FIXME:                                                #   fixup A - offset: 0, value: %lo(forward_local), kind:   fixup_MICROMIPS_LO16
+# O32-MM-FIXME: .reloc ($tmp6), R_MIPS_JALR, forward_local
 
 # MIPS: jalr $25      # encoding: [0x03,0x20,0xf8,0x09]
 # MM:   jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c]
diff --git a/test/MC/Mips/ginv/invalid.s b/test/MC/Mips/ginv/invalid.s
index 4bc77891c4c60de9e2def1d7664fc43e498b1f39..aa02ac182f4d3c99b8b978502c34bdf5bfc3cef2 100644
--- a/test/MC/Mips/ginv/invalid.s
+++ b/test/MC/Mips/ginv/invalid.s
@@ -1,11 +1,13 @@
 # Instructions that are invalid.
 #
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r6 -mattr=+ginv 2>%t1
+# RUN: not llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r6 \
+# RUN:                -mattr=+ginv 2>%t1
 # RUN: FileCheck %s < %t1
-# RUN: not llvm-mc %s -arch=mips64 -mcpu=mips64r6 -mattr=+ginv 2>%t1
+# RUN: not llvm-mc %s -triple mips64-unknown-linux-gnu -mcpu=mips64r6 \
+# RUN:                -mattr=+ginv 2>%t1
 # RUN: FileCheck %s < %t1
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r6 \
-# RUN:     -mattr=+micromips,+ginv 2>%t1
+# RUN: not llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r6 \
+# RUN:                -mattr=+micromips,+ginv 2>%t1
 # RUN: FileCheck %s < %t1
 
   ginvi            # CHECK: :[[@LINE]]:3: error: too few operands for instruction
diff --git a/test/MC/Mips/invalid-instructions-spellcheck.s b/test/MC/Mips/invalid-instructions-spellcheck.s
index c242fd05b7ae8707ab33be968b85dd73aaaaccfc..459e71c8912a1a0090e1792933b7d348a0f542df 100644
--- a/test/MC/Mips/invalid-instructions-spellcheck.s
+++ b/test/MC/Mips/invalid-instructions-spellcheck.s
@@ -1,8 +1,10 @@
-# RUN: not llvm-mc -arch mips -mcpu=mips32r2 -mattr=+micromips %s 2>&1 \
+# RUN: not llvm-mc -triple mips-unknown-linux-gnu -mcpu=mips32r2 \
+# RUN:             -mattr=+micromips %s 2>&1 \
 # RUN:   | FileCheck --check-prefixes=ALL,MMR2 %s
-# RUN: not llvm-mc -arch mips -mcpu=mips32r6 -mattr=+micromips %s 2>&1 \
+# RUN: not llvm-mc -triple mips-unknown-linux-gnu -mcpu=mips32r6 \
+# RUN:             -mattr=+micromips %s 2>&1 \
 # RUN:   | FileCheck --check-prefixes=ALL,MMR6 %s
-# RUN: not llvm-mc -arch mips -mcpu=mips32r6 %s 2>&1 \
+# RUN: not llvm-mc -triple mips-unknown-linux-gnu -mcpu=mips32r6 %s 2>&1 \
 # RUN:   | FileCheck --check-prefixes=ALL,MIPS32R6 %s
 
 # This tests the mnemonic spell checker.
diff --git a/test/MC/Mips/j-macro-insn.s b/test/MC/Mips/j-macro-insn.s
index cffcc97242f5a061aacf68fd3af8c0e4b3aa4021..555e468626ee86b02eef7ceb1994f0c04c8471c9 100644
--- a/test/MC/Mips/j-macro-insn.s
+++ b/test/MC/Mips/j-macro-insn.s
@@ -1,7 +1,12 @@
-# RUN: llvm-mc -arch=mips -show-inst %s | FileCheck --check-prefix=STATIC %s
-# RUN: llvm-mc -arch=mips -position-independent -show-inst %s | FileCheck --check-prefix=PIC %s
-# RUN: llvm-mc -arch=mips -mattr=+micromips -show-inst %s | FileCheck --check-prefix=STATIC-MM %s
-# RUN: llvm-mc -arch=mips -mattr=+micromips -position-independent -show-inst %s | FileCheck --check-prefix=PIC-MM %s
+# RUN: llvm-mc -triple mips-unknown-linux-gnu -show-inst %s \
+# RUN:   | FileCheck --check-prefix=STATIC %s
+# RUN: llvm-mc -triple mips-unknown-linux-gnu -position-independent \
+# RUN:         -show-inst %s | FileCheck --check-prefix=PIC %s
+# RUN: llvm-mc -triple mips-unknown-linux-gnu -mattr=+micromips -show-inst %s \
+# RUN:   | FileCheck --check-prefix=STATIC-MM %s
+# RUN: llvm-mc -triple mips-unknown-linux-gnu -mattr=+micromips \
+# RUN:         -position-independent -show-inst %s \
+# RUN:   | FileCheck --check-prefix=PIC-MM %s
 
   .text
   j foo
diff --git a/test/MC/Mips/macro-aliases-invalid-wrong-error.s b/test/MC/Mips/macro-aliases-invalid-wrong-error.s
index b87646d0b09a071b2dc64676efb5c3e6e6fef9ed..39c127253362a8118c9afeb23661d502b6ce9cd3 100644
--- a/test/MC/Mips/macro-aliases-invalid-wrong-error.s
+++ b/test/MC/Mips/macro-aliases-invalid-wrong-error.s
@@ -1,7 +1,7 @@
-# RUN: not llvm-mc -arch=mips %s 2>%t1
+# RUN: not llvm-mc -triple mips-unknown-linux-gnu %s 2>%t1
 # RUN: FileCheck --check-prefix=O32 %s < %t1
 
-# RUN: not llvm-mc -arch=mips64 %s 2>%t1
+# RUN: not llvm-mc -triple mips64-unknown-linux-gnu %s 2>%t1
 # RUN: FileCheck --check-prefix=N64 %s < %t1
 
 # Check that subu only rejects any non-constant values.
diff --git a/test/MC/Mips/micromips/valid-fp64.s b/test/MC/Mips/micromips/valid-fp64.s
index d825b48a70f98f016e746f4a310b363bb8a65787..300e3565b45dc59008eb45c074ab4e18ad1826f4 100644
--- a/test/MC/Mips/micromips/valid-fp64.s
+++ b/test/MC/Mips/micromips/valid-fp64.s
@@ -1,4 +1,5 @@
-# RUN: llvm-mc -arch=mips -mcpu=mips32r3 -mattr=+micromips,+fp64 -show-encoding -show-inst %s \
+# RUN: llvm-mc -triple mips-unknown-linux-gnu -mcpu=mips32r3 \
+# RUN:         -mattr=+micromips,+fp64 -show-encoding -show-inst %s \
 # RUN: | FileCheck %s 
 
 abs.d   $f0, $f12     # CHECK: abs.d  $f0, $f12      # encoding: [0x54,0x0c,0x23,0x7b]
diff --git a/test/MC/Mips/mips-cop0-reginfo.s b/test/MC/Mips/mips-cop0-reginfo.s
index baa3681594c5fa5b3e2d4d5e95f4e724898723a3..b75b0171fa3c8a4a36ac0d92f67b90c785e833cb 100644
--- a/test/MC/Mips/mips-cop0-reginfo.s
+++ b/test/MC/Mips/mips-cop0-reginfo.s
@@ -1,6 +1,5 @@
-# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -filetype=obj %s -o - | \
-# RUN:   llvm-readobj -sections -section-data - | \
-# RUN:     FileCheck %s
+# RUN: llvm-mc -triple mips-unknown-linux-gnu -mcpu=mips32r2 -filetype=obj %s -o - \
+# RUN:   | llvm-readobj -sections -section-data - | FileCheck %s
 	mfc0	$16, $15, 1
 	mfc0	$16, $16, 1
 
diff --git a/test/MC/Mips/mips32/abiflags.s b/test/MC/Mips/mips32/abiflags.s
index dd772c0ba5a7bc53aa8f332898766335a6a760f2..c8d74c1b7037b0c1437bcbca52ce6666d03b85a4 100644
--- a/test/MC/Mips/mips32/abiflags.s
+++ b/test/MC/Mips/mips32/abiflags.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
diff --git a/test/MC/Mips/mips32r2/abiflags.s b/test/MC/Mips/mips32r2/abiflags.s
index e3bb15bdddeebddcde687a52a9c5293701dc8c80..200a8025e8d78f71ab9aa566b4efbf8b05979f09 100644
--- a/test/MC/Mips/mips32r2/abiflags.s
+++ b/test/MC/Mips/mips32r2/abiflags.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r2 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r2 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
diff --git a/test/MC/Mips/mips32r3/abiflags.s b/test/MC/Mips/mips32r3/abiflags.s
index a6c7057809f54e87c13e87d5b0d523eb8b3ed569..4d69145326273fbc9beca14eca55f5e24dc14f59 100644
--- a/test/MC/Mips/mips32r3/abiflags.s
+++ b/test/MC/Mips/mips32r3/abiflags.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r3 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r3 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r3 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r3 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
diff --git a/test/MC/Mips/mips32r5/abiflags.s b/test/MC/Mips/mips32r5/abiflags.s
index fa5befeaddb684f2624f76b3f2e47ee0077c5830..ab2a8579043d3cb8d298c994599bf19df9ee309b 100644
--- a/test/MC/Mips/mips32r5/abiflags.s
+++ b/test/MC/Mips/mips32r5/abiflags.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r5 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r5 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r5 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r5 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
diff --git a/test/MC/Mips/mips_abi_flags_xx.s b/test/MC/Mips/mips_abi_flags_xx.s
index 9c8754193e1259ab0485b4df4418d24168edd0b7..fb27192b2a9865f62526468fffcc0a5f69c19fbe 100644
--- a/test/MC/Mips/mips_abi_flags_xx.s
+++ b/test/MC/Mips/mips_abi_flags_xx.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations -mips-abi-flags - | \
 # RUN:   FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R1,CHECK-OBJ-MIPS
 
-# RUN: llvm-mc /dev/null -arch=mips -mcpu=mips32 -mattr=fpxx -filetype=obj -o - | \
+# RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mattr=fpxx -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations -mips-abi-flags - | \
 # RUN:   FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R1,CHECK-OBJ-MIPS
 
-# RUN: llvm-mc /dev/null -arch=mips -mcpu=mips32r6 -mattr=fpxx -filetype=obj -o - | \
+# RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mcpu=mips32r6 -mattr=fpxx -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations -mips-abi-flags - | \
 # RUN:   FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R6,CHECK-OBJ-MIPS
 
-# RUN: llvm-mc /dev/null -arch=mips -mcpu=octeon -target-abi n64 -filetype=obj -o - | \
+# RUN: llvm-mc /dev/null -triple mips64-unknown-linux-gnu -mcpu=octeon -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations -mips-abi-flags - | \
 # RUN:   FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-64R2,CHECK-OBJ-OCTEON
 
diff --git a/test/MC/Mips/mips_abi_flags_xx_set.s b/test/MC/Mips/mips_abi_flags_xx_set.s
index b31e295f6350f90bb88bbdfa2576b5fb02cabfcc..b38d1b9a49d864a771a14a8f68a8c2af79940fb6 100644
--- a/test/MC/Mips/mips_abi_flags_xx_set.s
+++ b/test/MC/Mips/mips_abi_flags_xx_set.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations -mips-abi-flags - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
diff --git a/test/MC/Mips/module-hardfloat.s b/test/MC/Mips/module-hardfloat.s
index 51f72487b3944aa6b3dfff091cd37ddf69bc5b5d..8a54fa69dd6b8a1daebd45a201ffe59657866c6e 100644
--- a/test/MC/Mips/module-hardfloat.s
+++ b/test/MC/Mips/module-hardfloat.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \
 # RUN:   llvm-readobj -mips-abi-flags - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
diff --git a/test/MC/Mips/msa/abiflags.s b/test/MC/Mips/msa/abiflags.s
index 136c035593e41ad169a5e6afb467c008f165b748..d4d8cadb8f24d9a8157b446cb9a4490f3315f150 100644
--- a/test/MC/Mips/msa/abiflags.s
+++ b/test/MC/Mips/msa/abiflags.s
@@ -1,9 +1,10 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -mattr=+msa | \
-# RUN:   FileCheck %s -check-prefix=CHECK-ASM
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r2 -mattr=+msa \
+# RUN:   | FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -mattr=+msa -filetype=obj -o - | \
-# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
-# RUN:     FileCheck %s -check-prefix=CHECK-OBJ
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r2 -mattr=+msa \
+# RUN:            -filetype=obj -o - \
+# RUN:   | llvm-readobj -sections -section-data -section-relocations - \
+# RUN:   | FileCheck %s -check-prefix=CHECK-OBJ
 
 # CHECK-ASM: .module fp=32
 # CHECK-ASM: .set fp=64
diff --git a/test/MC/Mips/nooddspreg-cmdarg.s b/test/MC/Mips/nooddspreg-cmdarg.s
index 28ac7b723343f2172990282198625ca9be5c5883..3bbb8e893e6fdcae1a68a1f4bdecaf561d3bb3f0 100644
--- a/test/MC/Mips/nooddspreg-cmdarg.s
+++ b/test/MC/Mips/nooddspreg-cmdarg.s
@@ -1,14 +1,14 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64,+nooddspreg | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+fp64,+nooddspreg | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64,+nooddspreg -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+fp64,+nooddspreg -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 -mattr=+nooddspreg 2> %t0
+# RUN: not llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -mattr=+nooddspreg 2> %t0
 # RUN: FileCheck %s -check-prefix=INVALID < %t0
 #
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n64 -mattr=+nooddspreg 2> %t0
+# RUN: not llvm-mc %s -triple mips64-unknown-linux-gnu -mattr=+nooddspreg 2> %t0
 # RUN: FileCheck %s -check-prefix=INVALID < %t0
 #
 # CHECK-ASM-NOT: .module nooddspreg
diff --git a/test/MC/Mips/nooddspreg-error.s b/test/MC/Mips/nooddspreg-error.s
index b4aabbef7280625c00d27946ca623a61dc418b1c..9044485812e64ab64d7265ae3de8333634b6da5f 100644
--- a/test/MC/Mips/nooddspreg-error.s
+++ b/test/MC/Mips/nooddspreg-error.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 2> %t0 | \
+# RUN: not llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+fp64 2> %t0 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 # RUN: FileCheck %s -check-prefix=CHECK-ERROR < %t0
 #
diff --git a/test/MC/Mips/nooddspreg.s b/test/MC/Mips/nooddspreg.s
index 2b5154d427b20ec38395c66a70baf1e20b75ac84..a5530ef8f523072252b97e3225aa1afab9d7dfa5 100644
--- a/test/MC/Mips/nooddspreg.s
+++ b/test/MC/Mips/nooddspreg.s
@@ -1,14 +1,14 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+fp64 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+fp64 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 2> %t1
+# RUN: not llvm-mc %s -triple mips64-unknown-linux-gnuabin32 2> %t1
 # RUN: FileCheck %s -check-prefix=INVALID < %t1
 #
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n64 2> %t2
+# RUN: not llvm-mc %s -triple mips64-unknown-linux-gnu 2> %t2
 # RUN: FileCheck %s -check-prefix=INVALID < %t2
 #
 # CHECK-ASM: .module nooddspreg
diff --git a/test/MC/Mips/oddspreg.s b/test/MC/Mips/oddspreg.s
index 3b9aaeecddd01ecd6671bc5014fdf73f3efd00b2..c65a27d08281c91512bf45da60225f15eaf4f478 100644
--- a/test/MC/Mips/oddspreg.s
+++ b/test/MC/Mips/oddspreg.s
@@ -1,35 +1,35 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+fp64 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+fp64 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefixes=CHECK-OBJ-ALL,CHECK-OBJ-O32
 #
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 -filetype=obj -o - | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnuabin32 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefixes=CHECK-OBJ-ALL,CHECK-OBJ-N32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 | \
+# RUN: llvm-mc %s -triple mips64-unknown-linux-gnu | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
 # Repeat the -filetype=obj tests but this time use an empty assembly file. The
 # output should be unchanged.
-# RUN: llvm-mc /dev/null -arch=mips64 -mcpu=mips64 -filetype=obj -o - | \
+# RUN: llvm-mc /dev/null -triple mips64-unknown-linux-gnu -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefixes=CHECK-OBJ-ALL,CHECK-OBJ-N64
 
-# RUN: llvm-mc /dev/null -arch=mips -mcpu=mips32 -mattr=+fp64 -filetype=obj -o - | \
+# RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mattr=+fp64 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefixes=CHECK-OBJ-ALL,CHECK-OBJ-O32
 #
-# RUN: llvm-mc /dev/null -arch=mips64 -mcpu=mips64 -target-abi n32 -filetype=obj -o - | \
+# RUN: llvm-mc /dev/null -triple mips64-unknown-linux-gnuabin32 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefixes=CHECK-OBJ-ALL,CHECK-OBJ-N32
 
-# RUN: llvm-mc /dev/null -arch=mips64 -mcpu=mips64 -filetype=obj -o - | \
+# RUN: llvm-mc /dev/null -triple mips64-unknown-linux-gnu -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefixes=CHECK-OBJ-ALL,CHECK-OBJ-N64
 
diff --git a/test/MC/Mips/reloc-directive-bad-obj.s b/test/MC/Mips/reloc-directive-bad-obj.s
new file mode 100644
index 0000000000000000000000000000000000000000..94317d0830df7086dfb6f33b3118d58f609940bf
--- /dev/null
+++ b/test/MC/Mips/reloc-directive-bad-obj.s
@@ -0,0 +1,9 @@
+# RUN: not llvm-mc -triple mips-unknown-linux %s -show-encoding \
+# RUN:     -target-abi=o32 -filetype=obj 2>&1 | FileCheck %s
+	.text
+	nop
+	.reloc foo, R_MIPS_32, .text  # CHECK: :[[@LINE]]:2: error: unresolved relocation offset
+	nop
+	nop
+	.reloc bar, R_MIPS_32, .text  # CHECK: :[[@LINE]]:2: error: unresolved relocation offset
+	nop
diff --git a/test/MC/Mips/reloc-directive-bad.s b/test/MC/Mips/reloc-directive-bad.s
index 41f21ed36de71a0e2912360b7db0c9b810474c15..929643b914afa152f9e1012f1076e997dd5a3fe9 100644
--- a/test/MC/Mips/reloc-directive-bad.s
+++ b/test/MC/Mips/reloc-directive-bad.s
@@ -1,6 +1,13 @@
-# RUN: not llvm-mc -triple mips-unknown-linux < %s -show-encoding -target-abi=o32 \
-# RUN:     2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple mips-unknown-linux < %s -show-encoding \
+# RUN:     -target-abi=o32 2>&1 | FileCheck %s
 	.text
 foo:
-	.reloc 0, R_MIPS_32, .text+.text # CHECK: :[[@LINE]]:23: error: expression must be relocatable
+	.reloc foo+4, R_MIPS_32, .text    # CHECK: :[[@LINE]]:9: error: expected non-negative number or a label
+	.reloc foo+foo, R_MIPS_32, .text  # CHECK: :[[@LINE]]:9: error: expected non-negative number or a label
+	.reloc 0, R_MIPS_32, .text+.text  # CHECK: :[[@LINE]]:23: error: expression must be relocatable
+	.reloc 0 R_MIPS_32, .text         # CHECK: :[[@LINE]]:11: error: expected comma
+	.reloc 0, 0, R_MIPS_32, .text     # CHECK: :[[@LINE]]:12: error: expected relocation name
+	.reloc -1, R_MIPS_32, .text       # CHECK: :[[@LINE]]:9: error: expression is negative
+	.reloc 1b, R_MIPS_32, .text       # CHECK: :[[@LINE]]:9: error: directional label undefined
+	.reloc 1f, R_MIPS_32, .text       # CHECK: :[[@LINE]]:9: error: directional label undefined
 	nop
diff --git a/test/MC/Mips/reloc-directive-label-offset.s b/test/MC/Mips/reloc-directive-label-offset.s
new file mode 100644
index 0000000000000000000000000000000000000000..05ee4dee0f36aa1d0919b3919d9783dfab5e42b4
--- /dev/null
+++ b/test/MC/Mips/reloc-directive-label-offset.s
@@ -0,0 +1,75 @@
+# RUN: llvm-mc -triple mips-unknown-linux %s -show-encoding -target-abi=o32 \
+# RUN:     | FileCheck --check-prefixes=ASM,ASM-32 %s
+# RUN: llvm-mc -triple mips64-unknown-linux %s -show-encoding -target-abi=n32 \
+# RUN:     | FileCheck --check-prefixes=ASM,ASM-64 %s
+# RUN: llvm-mc -triple mips64-unknown-linux %s -show-encoding -target-abi=n64 \
+# RUN:     | FileCheck --check-prefixes=ASM,ASM-64 %s
+# RUN: llvm-mc -triple mips-unknown-linux %s -show-encoding -target-abi=o32 \
+# RUN:     -filetype=obj | llvm-readobj -r | FileCheck -check-prefix=OBJ-O32 %s
+# RUN: llvm-mc -triple mips64-unknown-linux %s -show-encoding -target-abi=n32 \
+# RUN:     -filetype=obj | llvm-readobj -r | FileCheck -check-prefix=OBJ-N32 %s
+# RUN: llvm-mc -triple mips64-unknown-linux %s -show-encoding -target-abi=n64 \
+# RUN:     -filetype=obj | llvm-readobj -r | FileCheck -check-prefix=OBJ-N64 %s
+
+  .text
+foo: # ASM-LABEL: foo:
+  nop
+1:
+  nop
+  .reloc 1b, R_MIPS_NONE, foo       # ASM-32: .reloc ($tmp0), R_MIPS_NONE, foo
+                                    # ASM-64: .reloc .Ltmp0, R_MIPS_NONE, foo
+  nop
+  .reloc 1f, R_MIPS_32, foo         # ASM-32: .reloc ($tmp1), R_MIPS_32, foo
+                                    # ASM-64: .reloc .Ltmp1, R_MIPS_32, foo
+1:
+  nop
+  .reloc 1f, R_MIPS_CALL16, foo     # ASM-32: .reloc ($tmp2), R_MIPS_CALL16, foo
+                                    # ASM-64: .reloc .Ltmp2, R_MIPS_CALL16, foo
+1:
+  nop
+  .reloc 2f, R_MIPS_GOT_DISP, foo   # ASM-32: .reloc ($tmp3), R_MIPS_GOT_DISP, foo
+                                    # ASM-64: .reloc .Ltmp3, R_MIPS_GOT_DISP, foo
+  nop
+
+  .reloc 3f, R_MIPS_GOT_PAGE, foo   # ASM-32: .reloc ($tmp4), R_MIPS_GOT_PAGE, foo
+                                    # ASM-64: .reloc .Ltmp4, R_MIPS_GOT_PAGE, foo
+  nop
+bar:
+  nop
+2:
+  nop
+3:
+  nop
+  .reloc bar, R_MIPS_GOT_OFST, foo  # ASM: .reloc bar, R_MIPS_GOT_OFST, foo
+  nop
+  .reloc foo, R_MIPS_32, foo        # ASM: .reloc foo, R_MIPS_32, foo
+  nop
+1:
+  nop
+
+# OBJ-O32-LABEL: Relocations [
+# OBJ-O32:           0x0 R_MIPS_32 .text 0x0
+# OBJ-O32-NEXT:      0x4 R_MIPS_NONE .text 0x0
+# OBJ-O32-NEXT:      0xC R_MIPS_32 .text 0x0
+# OBJ-O32-NEXT:      0x10 R_MIPS_CALL16 foo 0x0
+# OBJ-O32-NEXT:      0x1C R_MIPS_GOT_OFST .text 0x0
+# OBJ-O32-NEXT:      0x20 R_MIPS_GOT_DISP foo 0x0
+# OBJ-O32-NEXT:      0x24 R_MIPS_GOT_PAGE .text 0x0
+
+# OBJ-N32-LABEL: Relocations [
+# OBJ-N32:           0x4 R_MIPS_NONE .text 0x0
+# OBJ-N32-NEXT:      0x1C R_MIPS_GOT_OFST .text 0x0
+# OBJ-N32-NEXT:      0x0 R_MIPS_32 .text 0x0
+# OBJ-N32-NEXT:      0xC R_MIPS_32 .text 0x0
+# OBJ-N32-NEXT:      0x10 R_MIPS_CALL16 foo 0x0
+# OBJ-N32-NEXT:      0x20 R_MIPS_GOT_DISP foo 0x0
+# OBJ-N32-NEXT:      0x24 R_MIPS_GOT_PAGE .text 0x0
+
+# OBJ-N64-LABEL: Relocations [
+# OBJ-N64:           0x4 R_MIPS_NONE/R_MIPS_NONE/R_MIPS_NONE .text 0x0
+# OBJ-N64-NEXT:      0x1C R_MIPS_GOT_OFST/R_MIPS_NONE/R_MIPS_NONE .text 0x0
+# OBJ-N64-NEXT:      0x0 R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE .text 0x0
+# OBJ-N64-NEXT:      0xC R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE .text 0x0
+# OBJ-N64-NEXT:      0x10 R_MIPS_CALL16/R_MIPS_NONE/R_MIPS_NONE foo 0x0
+# OBJ-N64-NEXT:      0x20 R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE foo 0x0
+# OBJ-N64-NEXT:      0x24 R_MIPS_GOT_PAGE/R_MIPS_NONE/R_MIPS_NONE .text 0x0
diff --git a/test/MC/Mips/reloc-directive-negative.s b/test/MC/Mips/reloc-directive-negative.s
deleted file mode 100644
index 41dd39a17f62d124ecb6d9765873fcfbb93b048b..0000000000000000000000000000000000000000
--- a/test/MC/Mips/reloc-directive-negative.s
+++ /dev/null
@@ -1,6 +0,0 @@
-# RUN: not llvm-mc -triple mips-unknown-linux < %s -show-encoding -target-abi=o32 \
-# RUN:     2>&1 | FileCheck %s
-	.text
-foo:
-	.reloc -1, R_MIPS_32, .text # CHECK: :[[@LINE]]:9: error: expression is negative
-	nop
diff --git a/test/MC/Mips/reloc-directive.s b/test/MC/Mips/reloc-directive.s
index be5d6934c250627f2a8241a06191d89afbe25841..bf7c91b851f1a26eac6a9c5f3b30885edeee7d2b 100644
--- a/test/MC/Mips/reloc-directive.s
+++ b/test/MC/Mips/reloc-directive.s
@@ -79,6 +79,10 @@ foo:
   nop
   .reloc 124, R_MICROMIPS_TLS_TPREL_LO16, 4     # ASM: .reloc 124, R_MICROMIPS_TLS_TPREL_LO16, 4
   nop
+  .reloc 128, R_MIPS_JALR, 4            # ASM: .reloc 128, R_MIPS_JALR, 4
+  nop
+  .reloc 132, R_MICROMIPS_JALR, 4       # ASM: .reloc 132, R_MICROMIPS_JALR, 4
+  nop
 
 # OBJ-O32-LABEL: Name: .text
 # OBJ-O32:       0000: 00000000 00000000 00000008 00000000
@@ -89,6 +93,7 @@ foo:
 # OBJ-O32-NEXT:  0050: 00000000 00000004 00000004 00000004
 # OBJ-O32-NEXT:  0060: 00000000 00000000 00000000 00000000
 # OBJ-O32-NEXT:  0070: 00000000 00000000 00000000 00000000
+# OBJ-O32-NEXT:  0080: 00000000 00000000
 # OBJ-O32-LABEL: }
 # OBJ-O32-LABEL: Relocations [
 # OBJ-O32:       0x0 R_MIPS_NONE .text 0x0
@@ -121,6 +126,8 @@ foo:
 # OBJ-O32-NEXT:  0x74 R_MICROMIPS_TLS_LDM - 0x0
 # OBJ-O32-NEXT:  0x78 R_MICROMIPS_TLS_TPREL_HI16 - 0x0
 # OBJ-O32-NEXT:  0x7C R_MICROMIPS_TLS_TPREL_LO16 - 0x0
+# OBJ-O32-NEXT:  0x80 R_MIPS_JALR - 0x0
+# OBJ-O32-NEXT:  0x84 R_MICROMIPS_JALR - 0x0
 # OBJ-O32-NEXT:  0x1C R_MIPS_GOT16 - 0x0
 # OBJ-O32-NEXT:  0x60 R_MICROMIPS_GOT16 - 0x0
 
@@ -133,6 +140,7 @@ foo:
 # OBJ-N32-NEXT:  0050: 00000000 00000000 00000000 00000000
 # OBJ-N32-NEXT:  0060: 00000000 00000000 00000000 00000000
 # OBJ-N32-NEXT:  0070: 00000000 00000000 00000000 00000000
+# OBJ-N32-NEXT:  0080: 00000000 00000000
 # OBJ-N32-LABEL: }
 # OBJ-N32-LABEL: Relocations [
 
@@ -168,6 +176,8 @@ foo:
 # OBJ-N32-NEXT:  0x74 R_MICROMIPS_TLS_LDM - 0x4
 # OBJ-N32-NEXT:  0x78 R_MICROMIPS_TLS_TPREL_HI16 - 0x4
 # OBJ-N32-NEXT:  0x7C R_MICROMIPS_TLS_TPREL_LO16 - 0x4
+# OBJ-N32-NEXT:  0x80 R_MIPS_JALR - 0x4
+# OBJ-N32-NEXT:  0x84 R_MICROMIPS_JALR - 0x4
 
 # OBJ-N64-LABEL: Name: .text
 # OBJ-N64:       0000: 00000000 00000000 00000000 00000000
@@ -178,6 +188,7 @@ foo:
 # OBJ-N64-NEXT:  0050: 00000000 00000000 00000000 00000000
 # OBJ-N64-NEXT:  0060: 00000000 00000000 00000000 00000000
 # OBJ-N64-NEXT:  0070: 00000000 00000000 00000000 00000000
+# OBJ-N64-NEXT:  0080: 00000000 00000000
 # OBJ-N64-LABEL: }
 # OBJ-N64-LABEL: Relocations [
 # OBJ-N64:       0x4 R_MIPS_NONE/R_MIPS_NONE/R_MIPS_NONE .text 0x0
@@ -212,3 +223,5 @@ foo:
 # OBJ-N64-NEXT:  0x74 R_MICROMIPS_TLS_LDM/R_MIPS_NONE/R_MIPS_NONE - 0x4
 # OBJ-N64-NEXT:  0x78 R_MICROMIPS_TLS_TPREL_HI16/R_MIPS_NONE/R_MIPS_NONE - 0x4
 # OBJ-N64-NEXT:  0x7C R_MICROMIPS_TLS_TPREL_LO16/R_MIPS_NONE/R_MIPS_NONE - 0x4
+# OBJ-N64-NEXT:  0x80 R_MIPS_JALR/R_MIPS_NONE/R_MIPS_NONE - 0x4
+# OBJ-N64-NEXT:  0x84 R_MICROMIPS_JALR/R_MIPS_NONE/R_MIPS_NONE - 0x4
diff --git a/test/MC/Mips/set-nomacro-micromips.s b/test/MC/Mips/set-nomacro-micromips.s
index c4d8b7c2d3e571278f2f6a8d1e5da4c014cc9c1f..e1f49141e5f3593642c639ac02f18d2f00bcafdd 100644
--- a/test/MC/Mips/set-nomacro-micromips.s
+++ b/test/MC/Mips/set-nomacro-micromips.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=micromips 2>&1 | FileCheck %s
+# RUN: llvm-mc %s -triple mips-unknown-linux-gnu -mattr=micromips 2>&1 | FileCheck %s
 
   .text
   .type main, @function
diff --git a/test/MC/Mips/update-module-level-options.s b/test/MC/Mips/update-module-level-options.s
index 3d6e97cb597db9c638c50d1c56e8007a1151d0b4..f83bf22b5d0c5ccc1dd633082684fa80b8317405 100644
--- a/test/MC/Mips/update-module-level-options.s
+++ b/test/MC/Mips/update-module-level-options.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64,-nooddspreg 2>&1 | \
+# RUN: not llvm-mc %s -triple mips-unknown-linux-gnu -mattr=+fp64,-nooddspreg 2>&1 | \
 # RUN:   FileCheck %s
 
   .module nooddspreg
diff --git a/test/MC/Mips/virt/invalid.s b/test/MC/Mips/virt/invalid.s
index 9cf664e2dd6e5ae8a8429b9527976657f8482dad..cfd3ce030fafdf5510326a1b8daa9c1fae4d5c3f 100644
--- a/test/MC/Mips/virt/invalid.s
+++ b/test/MC/Mips/virt/invalid.s
@@ -1,10 +1,10 @@
 # Instructions that are invalid.
 #
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r5 -mattr=+virt 2>%t1
+# RUN: not llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r5 -mattr=+virt 2>%t1
 # RUN: FileCheck %s < %t1
-# RUN: not llvm-mc %s -arch=mips64 -mcpu=mips64r5 -mattr=+virt 2>%t1
+# RUN: not llvm-mc %s -triple mips64-unknown-linux-gnu -mcpu=mips64r5 -mattr=+virt 2>%t1
 # RUN: FileCheck %s < %t1
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r5 -mattr=+micromips,+virt 2>%t1
+# RUN: not llvm-mc %s -triple mips-unknown-linux-gnu -mcpu=mips32r5 -mattr=+micromips,+virt 2>%t1
 # RUN: FileCheck %s < %t1
 
   mfgc0                   # CHECK: :[[@LINE]]:3: error: too few operands for instruction
diff --git a/test/MC/RISCV/compress-rv32i.s b/test/MC/RISCV/compress-rv32i.s
index 0436a99e5baa49c234875b16fbef9da38158ced6..149279c433044e8a0e0d62877544902e67a7b9a9 100644
--- a/test/MC/RISCV/compress-rv32i.s
+++ b/test/MC/RISCV/compress-rv32i.s
@@ -211,3 +211,9 @@ add s0, s0, a5
 # CHECK-INST: c.swsp zero, 252(sp)
 # CHECK: # encoding: [0x82,0xdf]
 sw zero, 252(sp)
+
+# CHECK-BYTES: 00 00
+# CHECK-ALIAS: unimp
+# CHECK-INST: c.unimp
+# CHECK: # encoding: [0x00,0x00]
+unimp
diff --git a/test/MC/RISCV/option-invalid.s b/test/MC/RISCV/option-invalid.s
index ec9e8e030d765b2a3c8727071e8f5396229459b0..8333f1c19978bf093a4811aeb82811bfd418cc63 100644
--- a/test/MC/RISCV/option-invalid.s
+++ b/test/MC/RISCV/option-invalid.s
@@ -13,5 +13,14 @@
 # CHECK: error: unexpected token, expected end of statement
 .option rvc foo
 
-# CHECK: warning: unknown option, expected 'rvc' or 'norvc'
+# CHECK: warning: unknown option, expected 'push', 'pop', 'rvc', 'norvc', 'relax' or 'norelax'
 .option bar
+
+# CHECK: error: .option pop with no .option push
+.option pop
+
+# CHECK: error: unexpected token, expected end of statement
+.option push 123
+
+# CHECK: error: unexpected token, expected end of statement
+.option pop 123
diff --git a/test/MC/RISCV/option-pushpop.s b/test/MC/RISCV/option-pushpop.s
new file mode 100644
index 0000000000000000000000000000000000000000..9e3ae4156a44ab3907904719241b81c5ea69e081
--- /dev/null
+++ b/test/MC/RISCV/option-pushpop.s
@@ -0,0 +1,74 @@
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
+# RUN:     | llvm-readobj -r | FileCheck -check-prefix=CHECK-RELOC %s
+# RUN: llvm-mc -triple riscv32 -filetype=obj < %s \
+# RUN:     | llvm-objdump  -triple riscv32 -mattr=+c -d - \
+# RUN:     | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
+
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
+# RUN:     | llvm-readobj -r | FileCheck -check-prefix=CHECK-RELOC %s
+# RUN: llvm-mc -triple riscv64 -filetype=obj < %s \
+# RUN:     | llvm-objdump  -triple riscv64 -mattr=+c -d - \
+# RUN:     | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
+
+# Test the operation of the push and pop assembler directives when
+# using .option relax and .option rvc. Checks that using .option pop
+# correctly restores all target features to their state at the point
+# where .option pop was last used.
+
+# CHECK-INST: call foo
+# CHECK-RELOC: R_RISCV_CALL foo 0x0
+# CHECK-RELOC-NOT: R_RISCV_RELAX foo 0x0
+call foo
+
+# CHECK-INST: addi s0, sp, 1020
+# CHECK-BYTES: 13 04 c1 3f
+# CHECK-ALIAS: addi s0, sp, 1020
+addi s0, sp, 1020
+
+.option push    # Push relax=false, rvc=false
+# CHECK-INST: .option push
+
+.option relax
+# CHECK-INST: .option relax
+# CHECK-INST: call bar
+# CHECK-RELOC-NEXT: R_RISCV_CALL bar 0x0
+# CHECK-RELOC-NEXT: R_RISCV_RELAX bar 0x0
+call bar
+
+.option push    # Push relax=true, rvc=false
+# CHECK-INST: .option push
+
+.option rvc
+# CHECK-INST: .option rvc
+# CHECK-INST: c.addi4spn s0, sp, 1020
+# CHECK-BYTES: e0 1f
+# CHECK-ALIAS: addi s0, sp, 1020
+addi s0, sp, 1020
+
+.option pop     # Pop relax=true, rvc=false
+# CHECK-INST: .option pop
+# CHECK-INST: addi s0, sp, 1020
+# CHECK-BYTES: 13 04 c1 3f
+# CHECK-ALIAS: addi s0, sp, 1020
+addi s0, sp, 1020
+
+# CHECK-INST: call bar
+# CHECK-RELOC-NEXT: R_RISCV_CALL bar 0x0
+# CHECK-RELOC-NEXT: R_RISCV_RELAX bar 0x0
+call bar
+
+.option pop     # Pop relax=false, rvc=false
+# CHECK-INST: .option pop
+# CHECK-INST: call baz
+# CHECK-RELOC: R_RISCV_CALL baz 0x0
+# CHECK-RELOC-NOT: R_RISCV_RELAX baz 0x0
+call baz
+
+# CHECK-INST: addi s0, sp, 1020
+# CHECK-BYTES: 13 04 c1 3f
+# CHECK-ALIAS: addi s0, sp, 1020
+addi s0, sp, 1020
diff --git a/test/MC/RISCV/option-relax.s b/test/MC/RISCV/option-relax.s
new file mode 100644
index 0000000000000000000000000000000000000000..82ec24b710c127c5ee50a1b1e4bd282f9565e3b9
--- /dev/null
+++ b/test/MC/RISCV/option-relax.s
@@ -0,0 +1,66 @@
+# RUN: llvm-mc -triple riscv32 < %s \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
+# RUN:     | llvm-readobj -r | FileCheck -check-prefix=CHECK-RELOC %s
+
+# RUN: llvm-mc -triple riscv64 < %s \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
+# RUN:     | llvm-readobj -r | FileCheck -check-prefix=CHECK-RELOC %s
+
+# Check .option relax causes R_RISCV_RELAX to be emitted, and .option
+# norelax suppresses it. Also check that if .option relax was enabled
+# at any point and an instruction may have been relaxed, diff & branch
+# relocations are emitted to ensure correct codegen. See
+# linker-relaxation.s and fixups-expr.s for behaviour of the relax
+# attribute.
+
+.L1:
+.option norelax
+# CHECK-INST: .option norelax
+
+# CHECK-INST: call foo
+# CHECK-RELOC: R_RISCV_CALL foo 0x0
+# CHECK-RELOC-NOT: R_RISCV_RELAX foo 0x0
+call foo
+
+# CHECK-RELOC-NEXT: R_RISCV_ADD64
+# CHECK-RELOC-NEXT: R_RISCV_SUB64
+.dword .L2-.L1
+# CHECK-RELOC-NEXT: R_RISCV_JAL
+jal zero, .L1
+# CHECK-RELOC-NEXT: R_RISCV_BRANCH
+beq s1, s1, .L1
+
+.L2:
+.option relax
+# CHECK-INST: .option relax
+
+# CHECK-INST: call bar
+# CHECK-RELOC-NEXT: R_RISCV_CALL bar 0x0
+# CHECK-RELOC-NEXT: R_RISCV_RELAX bar 0x0
+call bar
+
+# CHECK-RELOC-NEXT: R_RISCV_ADD64
+# CHECK-RELOC-NEXT: R_RISCV_SUB64
+.dword .L2-.L1
+# CHECK-RELOC-NEXT: R_RISCV_JAL
+jal zero, .L1
+# CHECK-RELOC-NEXT: R_RISCV_BRANCH
+beq s1, s1, .L1
+
+.option norelax
+# CHECK-INST: .option norelax
+
+# CHECK-INST: call baz
+# CHECK-RELOC-NEXT: R_RISCV_CALL baz 0x0
+# CHECK-RELOC-NOT: R_RISCV_RELAX baz 0x0
+call baz
+
+# CHECK-RELOC-NEXT: R_RISCV_ADD64
+# CHECK-RELOC-NEXT: R_RISCV_SUB64
+.dword .L2-.L1
+# CHECK-RELOC-NEXT: R_RISCV_JAL
+jal zero, .L1
+# CHECK-RELOC-NEXT: R_RISCV_BRANCH
+beq s1, s1, .L1
diff --git a/test/MC/RISCV/rv32c-valid.s b/test/MC/RISCV/rv32c-valid.s
index c761c712789882b9068fc8ed83d544e662e74c43..b72e4922a5767c52e934be3efacfa64eeda331b1 100644
--- a/test/MC/RISCV/rv32c-valid.s
+++ b/test/MC/RISCV/rv32c-valid.s
@@ -103,3 +103,6 @@ c.lui s0, 0xfffe0
 # CHECK-ASM-AND-OBJ: c.lui s0, 1048575
 # CHECK-ASM: encoding: [0x7d,0x74]
 c.lui s0, 0xfffff
+# CHECK-ASM-AND-OBJ: c.unimp
+# CHECK-ASM: encoding: [0x00,0x00]
+c.unimp
diff --git a/test/MC/RISCV/rv32i-valid.s b/test/MC/RISCV/rv32i-valid.s
index 816aa275fe1e467c103f1c2a957de46020a0b941..0df53e661b90961374efd49c8bffa0c9d5b3f429 100644
--- a/test/MC/RISCV/rv32i-valid.s
+++ b/test/MC/RISCV/rv32i-valid.s
@@ -248,6 +248,9 @@ ecall
 # CHECK-ASM-AND-OBJ: ebreak
 # CHECK-ASM: encoding: [0x73,0x00,0x10,0x00]
 ebreak
+# CHECK-ASM-AND-OBJ: unimp
+# CHECK-ASM: encoding: [0x73,0x10,0x00,0xc0]
+unimp
 
 # CHECK-ASM-AND-OBJ: csrrw t0, 4095, t1
 # CHECK-ASM: encoding: [0xf3,0x12,0xf3,0xff]
diff --git a/test/MC/RISCV/rv64c-aliases-valid.s b/test/MC/RISCV/rv64c-aliases-valid.s
index 4f5cf71426f8c062914278eb9461750ecdebc37c..a645763c7886868e466adc7392022e588ed91830 100644
--- a/test/MC/RISCV/rv64c-aliases-valid.s
+++ b/test/MC/RISCV/rv64c-aliases-valid.s
@@ -14,14 +14,14 @@ li x10, 0
 li x10, 1
 # CHECK-EXPAND: c.li a0, -1
 li x10, -1
-# CHECK-EXPAND: addiw a0, zero, 2047
+# CHECK-EXPAND: addi a0, zero, 2047
 li x10, 2047
-# CHECK-EXPAND: addiw a0, zero, -2047
+# CHECK-EXPAND: addi a0, zero, -2047
 li x10, -2047
 # CHECK-EXPAND: c.lui a1, 1
 # CHECK-EXPAND: addiw a1, a1, -2048
 li x11, 2048
-# CHECK-EXPAND: addiw a1, zero, -2048
+# CHECK-EXPAND: addi a1, zero, -2048
 li x11, -2048
 # CHECK-EXPAND: c.lui a1, 1
 # CHECK-EXPAND: addiw a1, a1, -2047
diff --git a/test/MC/RISCV/rv64i-aliases-valid.s b/test/MC/RISCV/rv64i-aliases-valid.s
index bc0dc527cc42fc9ffa538cb05f02dab800c3262c..64bb36eaad6fd1918ef92edfeae1cf5fdeb20c01 100644
--- a/test/MC/RISCV/rv64i-aliases-valid.s
+++ b/test/MC/RISCV/rv64i-aliases-valid.s
@@ -17,21 +17,21 @@
 # TODO ld
 # TODO sd
 
-# CHECK-INST: addiw a0, zero, 0
-# CHECK-ALIAS: sext.w a0, zero
+# CHECK-INST: addi a0, zero, 0
+# CHECK-ALIAS: mv a0, zero
 li x10, 0
-# CHECK-EXPAND: addiw a0, zero, 1
+# CHECK-EXPAND: addi a0, zero, 1
 li x10, 1
-# CHECK-EXPAND: addiw a0, zero, -1
+# CHECK-EXPAND: addi a0, zero, -1
 li x10, -1
-# CHECK-EXPAND: addiw a0, zero, 2047
+# CHECK-EXPAND: addi a0, zero, 2047
 li x10, 2047
-# CHECK-EXPAND: addiw a0, zero, -2047
+# CHECK-EXPAND: addi a0, zero, -2047
 li x10, -2047
 # CHECK-EXPAND: lui a1, 1
 # CHECK-EXPAND: addiw a1, a1, -2048
 li x11, 2048
-# CHECK-EXPAND: addiw a1, zero, -2048
+# CHECK-EXPAND: addi a1, zero, -2048
 li x11, -2048
 # CHECK-EXPAND: lui a1, 1
 # CHECK-EXPAND: addiw a1, a1, -2047
@@ -66,28 +66,28 @@ li x12, -2147483648
 # CHECK-EXPAND: lui a2, 524288
 li x12, -0x80000000
 
-# CHECK-EXPAND: addiw a2, zero, 1
+# CHECK-EXPAND: addi a2, zero, 1
 # CHECK-EXPAND: slli a2, a2, 31
 li x12, 0x80000000
-# CHECK-EXPAND: addiw a2, zero, 1
+# CHECK-EXPAND: addi a2, zero, 1
 # CHECK-EXPAND: slli a2, a2, 32
 # CHECK-EXPAND: addi a2, a2, -1
 li x12, 0xFFFFFFFF
 
-# CHECK-EXPAND: addiw t0, zero, 1
+# CHECK-EXPAND: addi t0, zero, 1
 # CHECK-EXPAND: slli t0, t0, 32
 li t0, 0x100000000
-# CHECK-EXPAND: addiw t1, zero, -1
+# CHECK-EXPAND: addi t1, zero, -1
 # CHECK-EXPAND: slli t1, t1, 63
 li t1, 0x8000000000000000
-# CHECK-EXPAND: addiw t1, zero, -1
+# CHECK-EXPAND: addi t1, zero, -1
 # CHECK-EXPAND: slli t1, t1, 63
 li t1, -0x8000000000000000
 # CHECK-EXPAND: lui t2, 9321
 # CHECK-EXPAND: addiw t2, t2, -1329
 # CHECK-EXPAND: slli t2, t2, 35
 li t2, 0x1234567800000000
-# CHECK-EXPAND: addiw t3, zero, 7
+# CHECK-EXPAND: addi t3, zero, 7
 # CHECK-EXPAND: slli t3, t3, 36
 # CHECK-EXPAND: addi t3, t3, 11
 # CHECK-EXPAND: slli t3, t3, 24
@@ -102,7 +102,7 @@ li t3, 0x700000000B00000F
 # CHECK-EXPAND: slli t4, t4, 13
 # CHECK-EXPAND: addi t4, t4, -272
 li t4, 0x123456789abcdef0
-# CHECK-EXPAND: addiw t5, zero, -1
+# CHECK-EXPAND: addi t5, zero, -1
 li t5, 0xFFFFFFFFFFFFFFFF
 
 # CHECK-INST: subw t6, zero, ra
diff --git a/test/MC/RISCV/rvi-aliases-valid.s b/test/MC/RISCV/rvi-aliases-valid.s
index 4acabe216a1392ae54a05317c2267656208821a0..a4aef05f8be4665278736e2e8736072b3abe6311 100644
--- a/test/MC/RISCV/rvi-aliases-valid.s
+++ b/test/MC/RISCV/rvi-aliases-valid.s
@@ -181,6 +181,26 @@ csrsi 0xfff, 0x10
 # CHECK-S-OBJ: csrci sscratch, 17
 csrci 0x140, 0x11
 
+# CHECK-S-OBJ-NOALIAS: csrrwi zero, 336, 7
+# CHECK-S-OBJ: csrwi 336, 7
+csrw 0x150, 7
+# CHECK-S-OBJ-NOALIAS: csrrsi zero, 336, 7
+# CHECK-S-OBJ: csrsi 336, 7
+csrs 0x150, 7
+# CHECK-S-OBJ-NOALIAS: csrrci zero, 336, 7
+# CHECK-S-OBJ: csrci 336, 7
+csrc 0x150, 7
+
+# CHECK-S-OBJ-NOALIAS: csrrwi t0, 336, 15
+# CHECK-S-OBJ: csrrwi t0, 336, 15
+csrrw t0, 0x150, 0xf
+# CHECK-S-OBJ-NOALIAS: csrrsi t0, 4095, 16
+# CHECK-S-OBJ: csrrsi t0, 4095, 16
+csrrs t0, 0xfff, 0x10
+# CHECK-S-OBJ-NOALIAS: csrrci t0, sscratch, 17
+# CHECK-S-OBJ: csrrci t0, sscratch, 17
+csrrc t0, 0x140, 0x11
+
 # CHECK-S-OBJ-NOALIAS: sfence.vma zero, zero
 # CHECK-S-OBJ: sfence.vma
 sfence.vma
diff --git a/test/MC/Sparc/sparc-asm-errors.s b/test/MC/Sparc/sparc-asm-errors.s
index 6a4128f683f81212aaa4f03b4bd57d6dbc5039a7..4af723ed788deda699429acd1b3bf0378b36bdbd 100644
--- a/test/MC/Sparc/sparc-asm-errors.s
+++ b/test/MC/Sparc/sparc-asm-errors.s
@@ -1,8 +1,16 @@
-! RUN: not llvm-mc %s -arch=sparc   -show-encoding 2>&1 | FileCheck %s
-! RUN: not llvm-mc %s -arch=sparcv9 -show-encoding 2>&1 | FileCheck %s
+! RUN: not llvm-mc %s -arch=sparc   -show-encoding 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=V8
+! RUN: not llvm-mc %s -arch=sparcv9 -show-encoding 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=V9
 
 ! Test the lower and upper bounds of 'set'
         ! CHECK: argument must be between
         set -2147483649, %o1
         ! CHECK: argument must be between
         set 4294967296, %o1
+
+        ! V8: unexpected token
+        ! V9: unknown membar tag
+        membar #BadTag
+
+        ! V8: instruction requires a CPU feature not currently enabled
+        ! V9: invalid membar mask number
+        membar -127
diff --git a/test/MC/Sparc/sparcv9-atomic-instructions.s b/test/MC/Sparc/sparcv9-atomic-instructions.s
index 40619a75548ecce01a4a3646faca7ce409f6653c..449450f70f62b971b7db3cc230618a27a6925f6c 100644
--- a/test/MC/Sparc/sparcv9-atomic-instructions.s
+++ b/test/MC/Sparc/sparcv9-atomic-instructions.s
@@ -1,8 +1,17 @@
 ! RUN: llvm-mc %s -arch=sparcv9 -show-encoding | FileCheck %s
 
-        ! CHECK: membar 15             ! encoding: [0x81,0x43,0xe0,0x0f]
+        ! CHECK: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore  ! encoding: [0x81,0x43,0xe0,0x0f]
         membar 15
 
+        ! CHECK: membar #LoadLoad  ! encoding: [0x81,0x43,0xe0,0x01]
+        membar #LoadLoad
+
+        ! CHECK: membar #LoadLoad | #StoreStore  ! encoding: [0x81,0x43,0xe0,0x09]
+        membar #LoadLoad | #StoreStore
+
+        ! CHECK: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore | #Lookaside | #MemIssue | #Sync  ! encoding: [0x81,0x43,0xe0,0x7f]
+        membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore | #Lookaside | #MemIssue | #Sync
+
         ! CHECK: cas [%i0], %l6, %o2   ! encoding: [0xd5,0xe6,0x10,0x16]
         cas [%i0], %l6, %o2
 
diff --git a/test/MC/WebAssembly/assembler-binary.ll b/test/MC/WebAssembly/assembler-binary.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3667a522ada51643dce866ecfd9c7c5943ab5a10
--- /dev/null
+++ b/test/MC/WebAssembly/assembler-binary.ll
@@ -0,0 +1,92 @@
+; RUN: llc -filetype=asm -asm-verbose=false %s -o %t.s
+; RUN: FileCheck -check-prefix=ASM -input-file %t.s %s
+; RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=asm %t.s -o - | FileCheck -check-prefix=ASM %s
+; RUN: llc -filetype=obj %s -o - | obj2yaml | FileCheck %s
+; RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj %t.s -o - | obj2yaml | FileCheck %s
+
+; This specifically tests that we can generate a binary from the assembler
+; that produces the same binary as the backend would.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare void @bar()
+
+define void @foo(i32 %n) {
+entry:
+  call void @bar()
+  ret void
+}
+
+; Checking assembly is not the point of this test, but if something breaks
+; it is easier to spot it here than in the yaml output.
+
+; ASM:     	.text
+; ASM:      	.file	"assembler-binary.ll"
+; ASM:      	.globl	foo
+; ASM:      foo:
+; ASM-NEXT: 	.functype	foo (i32) -> ()
+; ASM-NEXT: 	call	bar@FUNCTION
+; ASM-NEXT: 	end_function
+; ASM-NEXT: .Lfunc_end0:
+; ASM-NEXT: 	.size	foo, .Lfunc_end0-foo
+; ASM:       	.functype	bar () -> ()
+
+
+; CHECK:      --- !WASM
+; CHECK-NEXT: FileHeader:
+; CHECK-NEXT:   Version:         0x00000001
+; CHECK-NEXT: Sections:
+; CHECK-NEXT:   - Type:            TYPE
+; CHECK-NEXT:     Signatures:
+; CHECK-NEXT:       - Index:           0
+; CHECK-NEXT:         ReturnType:      NORESULT
+; CHECK-NEXT:         ParamTypes:
+; CHECK-NEXT:           - I32
+; CHECK-NEXT:       - Index:           1
+; CHECK-NEXT:         ReturnType:      NORESULT
+; CHECK-NEXT:         ParamTypes:      []
+; CHECK-NEXT:   - Type:            IMPORT
+; CHECK-NEXT:     Imports:
+; CHECK-NEXT:       - Module:          env
+; CHECK-NEXT:         Field:           __linear_memory
+; CHECK-NEXT:         Kind:            MEMORY
+; CHECK-NEXT:         Memory:
+; CHECK-NEXT:           Initial:         0x00000000
+; CHECK-NEXT:       - Module:          env
+; CHECK-NEXT:         Field:           __indirect_function_table
+; CHECK-NEXT:         Kind:            TABLE
+; CHECK-NEXT:         Table:
+; CHECK-NEXT:           ElemType:        ANYFUNC
+; CHECK-NEXT:           Limits:
+; CHECK-NEXT:             Initial:         0x00000000
+; CHECK-NEXT:       - Module:          env
+; CHECK-NEXT:         Field:           bar
+; CHECK-NEXT:         Kind:            FUNCTION
+; CHECK-NEXT:         SigIndex:        1
+; CHECK-NEXT:   - Type:            FUNCTION
+; CHECK-NEXT:     FunctionTypes:   [ 0 ]
+; CHECK-NEXT:   - Type:            CODE
+; CHECK-NEXT:     Relocations:
+; CHECK-NEXT:       - Type:            R_WEBASSEMBLY_FUNCTION_INDEX_LEB
+; CHECK-NEXT:         Index:           1
+; CHECK-NEXT:         Offset:          0x00000004
+; CHECK-NEXT:     Functions:
+; CHECK-NEXT:       - Index:           1
+; CHECK-NEXT:         Locals:          []
+; CHECK-NEXT:         Body:            1080808080000B
+; CHECK-NEXT:   - Type:            CUSTOM
+; CHECK-NEXT:     Name:            linking
+; CHECK-NEXT:     Version:         1
+; CHECK-NEXT:     SymbolTable:
+; CHECK-NEXT:       - Index:           0
+; CHECK-NEXT:         Kind:            FUNCTION
+; CHECK-NEXT:         Name:            foo
+; CHECK-NEXT:         Flags:           [  ]
+; CHECK-NEXT:         Function:        1
+; CHECK-NEXT:       - Index:           1
+; CHECK-NEXT:         Kind:            FUNCTION
+; CHECK-NEXT:         Name:            bar
+; CHECK-NEXT:         Flags:           [ UNDEFINED ]
+; CHECK-NEXT:         Function:        0
+; CHECK-NEXT: ...
diff --git a/test/MC/WebAssembly/basic-assembly.s b/test/MC/WebAssembly/basic-assembly.s
index c2b316c9243829ec587319af507e47e5b1e2d1e0..6d95db482917262d71f4f00c333390a37fa8b2c5 100644
--- a/test/MC/WebAssembly/basic-assembly.s
+++ b/test/MC/WebAssembly/basic-assembly.s
@@ -3,10 +3,12 @@
 # RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s
 
     .text
+    .section .text.main,"",@
     .type    test0,@function
 test0:
     # Test all types:
-    .param      i32, i64
+    .functype   test0 (i32, i64) -> (i32)
+    .eventtype  __cpp_exception i32
     .local      f32, f64, v128, v128
     # Explicit getlocal/setlocal:
     get_local   2
@@ -43,6 +45,20 @@ test0:
     end_block                       # label0:
     get_local   4
     get_local   5
+    block
+    block
+    block
+    block
+    br_table {0, 1, 2}   # 2 entries, default
+    end_block            # first entry jumps here.
+    i32.const 1
+    br 2
+    end_block            # second entry jumps here.
+    i32.const 2
+    br 1
+    end_block            # default jumps here.
+    i32.const 3
+    end_block            # "switch" exit.
     f32x4.add
     # Test correct parsing of instructions with / and : in them:
     # TODO: enable once instruction has been added.
@@ -64,7 +80,8 @@ test0:
 
 # CHECK:           .text
 # CHECK-LABEL: test0:
-# CHECK-NEXT:      .param      i32, i64
+# CHECK-NEXT:      .functype test0 (i32, i64) -> (i32)
+# CHECK-NEXT:      .eventtype  __cpp_exception i32
 # CHECK-NEXT:      .local      f32, f64
 # CHECK-NEXT:      get_local   2
 # CHECK-NEXT:      set_local   2
@@ -97,6 +114,21 @@ test0:
 # CHECK-NEXT:      end_block                       # label0:
 # CHECK-NEXT:      get_local   4
 # CHECK-NEXT:      get_local   5
+# CHECK-NEXT:      block
+# CHECK-NEXT:      block
+# CHECK-NEXT:      block
+# CHECK-NEXT:      block
+# CHECK-NEXT:      br_table {0, 1, 2}  # 1: down to label4
+# CHECK-NEXT:                          # 2: down to label3
+# CHECK-NEXT:      end_block           # label5:
+# CHECK-NEXT:      i32.const 1
+# CHECK-NEXT:      br 2                # 2: down to label2
+# CHECK-NEXT:      end_block           # label4:
+# CHECK-NEXT:      i32.const 2
+# CHECK-NEXT:      br 1                # 1: down to label2
+# CHECK-NEXT:      end_block           # label3:
+# CHECK-NEXT:      i32.const 3
+# CHECK-NEXT:      end_block           # label2:
 # CHECK-NEXT:      f32x4.add
 # CHECK-NEXT:      i32.trunc_s/f32
 # CHECK-NEXT:      try
diff --git a/test/MC/WebAssembly/dwarfdump.ll b/test/MC/WebAssembly/dwarfdump.ll
index 3a761f6e9a0e6d86fab882a2d8f328f106fd2c30..0873a1a8f2920a22377f341bedbe37e7bf6709f0 100644
--- a/test/MC/WebAssembly/dwarfdump.ll
+++ b/test/MC/WebAssembly/dwarfdump.ll
@@ -31,14 +31,14 @@
 
 ; CHECK: 0x00000043:   DW_TAG_variable
 ; CHECK-NEXT:                DW_AT_name	("ptr2")
-; CHECK-NEXT:                DW_AT_type	(0x00000054 "subroutine *")
+; CHECK-NEXT:                DW_AT_type	(0x00000054 "void()*")
 ; CHECK-NEXT:                DW_AT_external	(true)
 ; CHECK-NEXT:                DW_AT_decl_file	("/usr/local/google/home/sbc/dev/wasm/simple{{[/\\]}}test.c")
 ; CHECK-NEXT:                DW_AT_decl_line	(5)
 ; CHECK-NEXT:                DW_AT_location	(DW_OP_addr 0x4)
 
 ; CHECK: 0x00000054:   DW_TAG_pointer_type
-; CHECK-NEXT:                DW_AT_type	(0x00000059 "subroutine ")
+; CHECK-NEXT:                DW_AT_type	(0x00000059 "void()")
 
 ; CHECK: 0x00000059:   DW_TAG_subroutine_type
 ; CHECK-NEXT:                DW_AT_prototyped	(true)
diff --git a/test/MC/WebAssembly/event-section.ll b/test/MC/WebAssembly/event-section.ll
new file mode 100644
index 0000000000000000000000000000000000000000..213817007f0287fac2ce1f768c36ebda8dab09f5
--- /dev/null
+++ b/test/MC/WebAssembly/event-section.ll
@@ -0,0 +1,58 @@
+; RUN: llc -filetype=obj -exception-model=wasm -mattr=+exception-handling %s -o - | obj2yaml | FileCheck %s
+; RUN: llc -filetype=obj -exception-model=wasm -mattr=+exception-handling %s -o - | llvm-readobj -s | FileCheck -check-prefix=SEC %s
+
+target triple = "wasm32-unknown-unknown"
+
+declare void @llvm.wasm.throw(i32, i8*)
+
+define i32 @test_throw0(i8* %p) {
+  call void @llvm.wasm.throw(i32 0, i8* %p)
+  ret i32 0
+}
+
+define i32 @test_throw1(i8* %p) {
+  call void @llvm.wasm.throw(i32 0, i8* %p)
+  ret i32 1
+}
+
+; CHECK:      Sections:
+; CHECK-NEXT:   - Type:            TYPE
+; CHECK-NEXT:     Signatures:
+; CHECK-NEXT:       - Index:           0
+; CHECK-NEXT:         ReturnType:      I32
+; CHECK-NEXT:         ParamTypes:
+; CHECK-NEXT:           - I32
+; CHECK-NEXT:       - Index:           1
+; CHECK-NEXT:         ReturnType:      NORESULT
+; CHECK-NEXT:         ParamTypes:
+; CHECK-NEXT:           - I32
+
+; CHECK:        - Type:            EVENT
+; CHECK-NEXT:     Events:
+; CHECK-NEXT:       - Index:           0
+; CHECK-NEXT:         Attribute:       0
+; CHECK-NEXT:         SigIndex:        1
+
+; CHECK-NEXT:   - Type:            CODE
+; CHECK-NEXT:     Relocations:
+; CHECK-NEXT:       - Type:            R_WEBASSEMBLY_EVENT_INDEX_LEB
+; CHECK-NEXT:         Index:           1
+; CHECK-NEXT:         Offset:          0x00000006
+; CHECK-NEXT:       - Type:            R_WEBASSEMBLY_EVENT_INDEX_LEB
+; CHECK-NEXT:         Index:           1
+; CHECK-NEXT:         Offset:          0x00000011
+
+; CHECK:        - Type:            CUSTOM
+; CHECK-NEXT:     Name:            linking
+; CHECK-NEXT:     Version:         1
+; CHECK-NEXT:     SymbolTable:
+
+; CHECK:            - Index:           1
+; CHECK-NEXT:         Kind:            EVENT
+; CHECK-NEXT:         Name:            __cpp_exception
+; CHECK-NEXT:         Flags:           [ BINDING_WEAK ]
+; CHECK-NEXT:         Event:           0
+
+; SEC:          Type: EVENT (0xD)
+; SEC-NEXT:     Size: 3
+; SEC-NEXT:     Offset: 97
diff --git a/test/MC/WebAssembly/simd-encodings.s b/test/MC/WebAssembly/simd-encodings.s
index 8cd4bc9cd342b1ef2b92f0fe01f7288c4c6afb1b..abb033ca2150604bb62e0694eb5e934c7e563d75 100644
--- a/test/MC/WebAssembly/simd-encodings.s
+++ b/test/MC/WebAssembly/simd-encodings.s
@@ -1,13 +1,19 @@
 # RUN: llvm-mc -show-encoding -triple=wasm32-unkown-unknown -mattr=+sign-ext,+simd128 < %s | FileCheck %s
 
+    # CHECK: v128.load 48:p2align=0 # encoding: [0xfd,0x00,0x00,0x30]
+    v128.load 48
+
+    # CHECK: v128.store 48:p2align=0 # encoding: [0xfd,0x01,0x00,0x30]
+    v128.store 48
+
     # CHECK: v128.const 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-    # CHECK-SAME: # encoding: [0xfd,0x00,
+    # CHECK-SAME: # encoding: [0xfd,0x02,
     # CHECK-SAME: 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
     # CHECK-SAME: 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f]
     v128.const 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 
     # CHECK: v128.const 256, 770, 1284, 1798, 2312, 2826, 3340, 3854
-    # CHECK-SAME: # encoding: [0xfd,0x00,
+    # CHECK-SAME: # encoding: [0xfd,0x02,
     # CHECK-SAME: 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
     # CHECK-SAME: 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f]
     v128.const 256, 770, 1284, 1798, 2312, 2826, 3340, 3854
@@ -16,436 +22,430 @@
 
     # CHECK: v128.const 0x1.0402p-121, 0x1.0c0a08p-113,
     # CHECK-SAME:       0x1.14121p-105, 0x1.1c1a18p-97
-    # CHECK-SAME: # encoding: [0xfd,0x00,
+    # CHECK-SAME: # encoding: [0xfd,0x02,
     # CHECK-SAME: 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
     # CHECK-SAME: 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f]
     v128.const 0x1.0402p-121, 0x1.0c0a08p-113, 0x1.14121p-105, 0x1.1c1a18p-97
 
     # CHECK: v128.const 0x1.60504030201p-911, 0x1.e0d0c0b0a0908p-783
-    # CHECK-SAME: # encoding: [0xfd,0x00,
+    # CHECK-SAME: # encoding: [0xfd,0x02,
     # CHECK-SAME: 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
     # CHECK-SAME: 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f]
     v128.const 0x1.60504030201p-911, 0x1.e0d0c0b0a0908p-783
 
-    # CHECK: v128.load 48:p2align=0 # encoding: [0xfd,0x01,0x00,0x30]
-    v128.load 48
-
-    # CHECK: v128.store 48:p2align=0 # encoding: [0xfd,0x02,0x00,0x30]
-    v128.store 48
+    # CHECK: v8x16.shuffle 0, 17, 2, 19, 4, 21, 6, 23,
+    # CHECK-SAME:          8, 25, 10, 27, 12, 29, 14, 31
+    # CHECK-SAME: # encoding: [0xfd,0x03,
+    # CHECK-SAME: 0x00,0x11,0x02,0x13,0x04,0x15,0x06,0x17,
+    # CHECK-SAME: 0x08,0x19,0x0a,0x1b,0x0c,0x1d,0x0e,0x1f]
+    v8x16.shuffle 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31
 
-    # CHECK: i8x16.splat # encoding: [0xfd,0x03]
+    # CHECK: i8x16.splat # encoding: [0xfd,0x04]
     i8x16.splat
 
-    # CHECK: i16x8.splat # encoding: [0xfd,0x04]
-    i16x8.splat
-
-    # CHECK: i32x4.splat # encoding: [0xfd,0x05]
-    i32x4.splat
-
-    # CHECK: i64x2.splat # encoding: [0xfd,0x06]
-    i64x2.splat
-
-    # CHECK: f32x4.splat # encoding: [0xfd,0x07]
-    f32x4.splat
-
-    # CHECK: f64x2.splat # encoding: [0xfd,0x08]
-    f64x2.splat
-
-    # CHECK: i8x16.extract_lane_s 15 # encoding: [0xfd,0x09,0x0f]
+    # CHECK: i8x16.extract_lane_s 15 # encoding: [0xfd,0x05,0x0f]
     i8x16.extract_lane_s 15
 
-    # CHECK: i8x16.extract_lane_u 15 # encoding: [0xfd,0x0a,0x0f]
+    # CHECK: i8x16.extract_lane_u 15 # encoding: [0xfd,0x06,0x0f]
     i8x16.extract_lane_u 15
 
-    # CHECK: i16x8.extract_lane_s 7 # encoding: [0xfd,0x0b,0x07]
+    # CHECK: i8x16.replace_lane 15 # encoding: [0xfd,0x07,0x0f]
+    i8x16.replace_lane 15
+
+    # CHECK: i16x8.splat # encoding: [0xfd,0x08]
+    i16x8.splat
+
+    # CHECK: i16x8.extract_lane_s 7 # encoding: [0xfd,0x09,0x07]
     i16x8.extract_lane_s 7
 
-    # CHECK: i16x8.extract_lane_u 7 # encoding: [0xfd,0x0c,0x07]
+    # CHECK: i16x8.extract_lane_u 7 # encoding: [0xfd,0x0a,0x07]
     i16x8.extract_lane_u 7
 
+    # CHECK: i16x8.replace_lane 7 # encoding: [0xfd,0x0b,0x07]
+    i16x8.replace_lane 7
+
+    # CHECK: i32x4.splat # encoding: [0xfd,0x0c]
+    i32x4.splat
+
     # CHECK: i32x4.extract_lane 3 # encoding: [0xfd,0x0d,0x03]
     i32x4.extract_lane 3
 
-    # CHECK: i64x2.extract_lane 1 # encoding: [0xfd,0x0e,0x01]
+    # CHECK: i32x4.replace_lane 3 # encoding: [0xfd,0x0e,0x03]
+    i32x4.replace_lane 3
+
+    # CHECK: i64x2.splat # encoding: [0xfd,0x0f]
+    i64x2.splat
+
+    # CHECK: i64x2.extract_lane 1 # encoding: [0xfd,0x10,0x01]
     i64x2.extract_lane 1
 
-    # CHECK: f32x4.extract_lane 3 # encoding: [0xfd,0x0f,0x03]
+    # CHECK: i64x2.replace_lane 1 # encoding: [0xfd,0x11,0x01]
+    i64x2.replace_lane 1
+
+    # CHECK: f32x4.splat # encoding: [0xfd,0x12]
+    f32x4.splat
+
+    # CHECK: f32x4.extract_lane 3 # encoding: [0xfd,0x13,0x03]
     f32x4.extract_lane 3
 
-    # CHECK: f64x2.extract_lane 1 # encoding: [0xfd,0x10,0x01]
+    # CHECK: f32x4.replace_lane 3 # encoding: [0xfd,0x14,0x03]
+    f32x4.replace_lane 3
+
+    # CHECK: f64x2.splat # encoding: [0xfd,0x15]
+    f64x2.splat
+
+    # CHECK: f64x2.extract_lane 1 # encoding: [0xfd,0x16,0x01]
     f64x2.extract_lane 1
 
-    # CHECK: i8x16.replace_lane 15 # encoding: [0xfd,0x11,0x0f]
-    i8x16.replace_lane 15
+    # CHECK: f64x2.replace_lane 1 # encoding: [0xfd,0x17,0x01]
+    f64x2.replace_lane 1
 
-    # CHECK: i16x8.replace_lane 7 # encoding: [0xfd,0x12,0x07]
-    i16x8.replace_lane 7
+    # CHECK: i8x16.eq # encoding: [0xfd,0x18]
+    i8x16.eq
 
-    # CHECK: i32x4.replace_lane 3 # encoding: [0xfd,0x13,0x03]
-    i32x4.replace_lane 3
+    # CHECK: i8x16.ne # encoding: [0xfd,0x19]
+    i8x16.ne
 
-    # CHECK: i64x2.replace_lane 1 # encoding: [0xfd,0x14,0x01]
-    i64x2.replace_lane 1
+    # CHECK: i8x16.lt_s # encoding: [0xfd,0x1a]
+    i8x16.lt_s
 
-    # CHECK: f32x4.replace_lane 3 # encoding: [0xfd,0x15,0x03]
-    f32x4.replace_lane 3
+    # CHECK: i8x16.lt_u # encoding: [0xfd,0x1b]
+    i8x16.lt_u
 
-    # CHECK: f64x2.replace_lane 1 # encoding: [0xfd,0x16,0x01]
-    f64x2.replace_lane 1
+    # CHECK: i8x16.gt_s # encoding: [0xfd,0x1c]
+    i8x16.gt_s
 
-    # CHECK: v8x16.shuffle 0, 17, 2, 19, 4, 21, 6, 23,
-    # CHECK-SAME:          8, 25, 10, 27, 12, 29, 14, 31
-    # CHECK-SAME: # encoding: [0xfd,0x17,
-    # CHECK-SAME: 0x00,0x11,0x02,0x13,0x04,0x15,0x06,0x17,
-    # CHECK-SAME: 0x08,0x19,0x0a,0x1b,0x0c,0x1d,0x0e,0x1f]
-    v8x16.shuffle 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31
+    # CHECK: i8x16.gt_u # encoding: [0xfd,0x1d]
+    i8x16.gt_u
 
-    # CHECK: i8x16.add # encoding: [0xfd,0x18]
-    i8x16.add
+    # CHECK: i8x16.le_s # encoding: [0xfd,0x1e]
+    i8x16.le_s
 
-    # CHECK: i16x8.add # encoding: [0xfd,0x19]
-    i16x8.add
+    # CHECK: i8x16.le_u # encoding: [0xfd,0x1f]
+    i8x16.le_u
 
-    # CHECK: i32x4.add # encoding: [0xfd,0x1a]
-    i32x4.add
+    # CHECK: i8x16.ge_s # encoding: [0xfd,0x20]
+    i8x16.ge_s
 
-    # CHECK: i64x2.add # encoding: [0xfd,0x1b]
-    i64x2.add
+    # CHECK: i8x16.ge_u # encoding: [0xfd,0x21]
+    i8x16.ge_u
 
-    # CHECK: i8x16.sub # encoding: [0xfd,0x1c]
-    i8x16.sub
+    # CHECK: i16x8.eq # encoding: [0xfd,0x22]
+    i16x8.eq
 
-    # CHECK: i16x8.sub # encoding: [0xfd,0x1d]
-    i16x8.sub
+    # CHECK: i16x8.ne # encoding: [0xfd,0x23]
+    i16x8.ne
 
-    # CHECK: i32x4.sub # encoding: [0xfd,0x1e]
-    i32x4.sub
+    # CHECK: i16x8.lt_s # encoding: [0xfd,0x24]
+    i16x8.lt_s
 
-    # CHECK: i64x2.sub # encoding: [0xfd,0x1f]
-    i64x2.sub
+    # CHECK: i16x8.lt_u # encoding: [0xfd,0x25]
+    i16x8.lt_u
 
-    # CHECK: i8x16.mul # encoding: [0xfd,0x20]
-    i8x16.mul
+    # CHECK: i16x8.gt_s # encoding: [0xfd,0x26]
+    i16x8.gt_s
 
-    # CHECK: i16x8.mul # encoding: [0xfd,0x21]
-    i16x8.mul
+    # CHECK: i16x8.gt_u # encoding: [0xfd,0x27]
+    i16x8.gt_u
 
-    # CHECK: i32x4.mul # encoding: [0xfd,0x22]
-    i32x4.mul
+    # CHECK: i16x8.le_s # encoding: [0xfd,0x28]
+    i16x8.le_s
 
-    # CHECK: i8x16.neg # encoding: [0xfd,0x24]
-    i8x16.neg
+    # CHECK: i16x8.le_u # encoding: [0xfd,0x29]
+    i16x8.le_u
 
-    # CHECK: i16x8.neg # encoding: [0xfd,0x25]
-    i16x8.neg
+    # CHECK: i16x8.ge_s # encoding: [0xfd,0x2a]
+    i16x8.ge_s
 
-    # CHECK: i32x4.neg # encoding: [0xfd,0x26]
-    i32x4.neg
+    # CHECK: i16x8.ge_u # encoding: [0xfd,0x2b]
+    i16x8.ge_u
 
-    # CHECK: i64x2.neg # encoding: [0xfd,0x27]
-    i64x2.neg
+    # CHECK: i32x4.eq # encoding: [0xfd,0x2c]
+    i32x4.eq
 
-    # CHECK: i8x16.add_saturate_s # encoding: [0xfd,0x28]
-    i8x16.add_saturate_s
+    # CHECK: i32x4.ne # encoding: [0xfd,0x2d]
+    i32x4.ne
 
-    # CHECK: i8x16.add_saturate_u # encoding: [0xfd,0x29]
-    i8x16.add_saturate_u
+    # CHECK: i32x4.lt_s # encoding: [0xfd,0x2e]
+    i32x4.lt_s
 
-    # CHECK: i16x8.add_saturate_s # encoding: [0xfd,0x2a]
-    i16x8.add_saturate_s
+    # CHECK: i32x4.lt_u # encoding: [0xfd,0x2f]
+    i32x4.lt_u
 
-    # CHECK: i16x8.add_saturate_u # encoding: [0xfd,0x2b]
-    i16x8.add_saturate_u
+    # CHECK: i32x4.gt_s # encoding: [0xfd,0x30]
+    i32x4.gt_s
 
-    # CHECK: i8x16.sub_saturate_s # encoding: [0xfd,0x2c]
-    i8x16.sub_saturate_s
+    # CHECK: i32x4.gt_u # encoding: [0xfd,0x31]
+    i32x4.gt_u
 
-    # CHECK: i8x16.sub_saturate_u # encoding: [0xfd,0x2d]
-    i8x16.sub_saturate_u
+    # CHECK: i32x4.le_s # encoding: [0xfd,0x32]
+    i32x4.le_s
 
-    # CHECK: i16x8.sub_saturate_s # encoding: [0xfd,0x2e]
-    i16x8.sub_saturate_s
+    # CHECK: i32x4.le_u # encoding: [0xfd,0x33]
+    i32x4.le_u
 
-    # CHECK: i16x8.sub_saturate_u # encoding: [0xfd,0x2f]
-    i16x8.sub_saturate_u
+    # CHECK: i32x4.ge_s # encoding: [0xfd,0x34]
+    i32x4.ge_s
 
-    # CHECK: i8x16.shl # encoding: [0xfd,0x30]
-    i8x16.shl
+    # CHECK: i32x4.ge_u # encoding: [0xfd,0x35]
+    i32x4.ge_u
 
-    # CHECK: i16x8.shl # encoding: [0xfd,0x31]
-    i16x8.shl
+    # CHECK: f32x4.eq # encoding: [0xfd,0x40]
+    f32x4.eq
 
-    # CHECK: i32x4.shl # encoding: [0xfd,0x32]
-    i32x4.shl
+    # CHECK: f32x4.ne # encoding: [0xfd,0x41]
+    f32x4.ne
 
-    # CHECK: i64x2.shl # encoding: [0xfd,0x33]
-    i64x2.shl
+    # CHECK: f32x4.lt # encoding: [0xfd,0x42]
+    f32x4.lt
 
-    # CHECK: i8x16.shr_s # encoding: [0xfd,0x34]
-    i8x16.shr_s
+    # CHECK: f32x4.gt # encoding: [0xfd,0x43]
+    f32x4.gt
 
-    # CHECK: i8x16.shr_u # encoding: [0xfd,0x35]
-    i8x16.shr_u
+    # CHECK: f32x4.le # encoding: [0xfd,0x44]
+    f32x4.le
 
-    # CHECK: i16x8.shr_s # encoding: [0xfd,0x36]
-    i16x8.shr_s
+    # CHECK: f32x4.ge # encoding: [0xfd,0x45]
+    f32x4.ge
 
-    # CHECK: i16x8.shr_u # encoding: [0xfd,0x37]
-    i16x8.shr_u
+    # CHECK: f64x2.eq # encoding: [0xfd,0x46]
+    f64x2.eq
 
-    # CHECK: i32x4.shr_s # encoding: [0xfd,0x38]
-    i32x4.shr_s
+    # CHECK: f64x2.ne # encoding: [0xfd,0x47]
+    f64x2.ne
 
-    # CHECK: i32x4.shr_u # encoding: [0xfd,0x39]
-    i32x4.shr_u
+    # CHECK: f64x2.lt # encoding: [0xfd,0x48]
+    f64x2.lt
 
-    # CHECK: i64x2.shr_s # encoding: [0xfd,0x3a]
-    i64x2.shr_s
+    # CHECK: f64x2.gt # encoding: [0xfd,0x49]
+    f64x2.gt
 
-    # CHECK: i64x2.shr_u # encoding: [0xfd,0x3b]
-    i64x2.shr_u
+    # CHECK: f64x2.le # encoding: [0xfd,0x4a]
+    f64x2.le
+
+    # CHECK: f64x2.ge # encoding: [0xfd,0x4b]
+    f64x2.ge
+
+    # CHECK: v128.not # encoding: [0xfd,0x4c]
+    v128.not
 
-    # CHECK: v128.and # encoding: [0xfd,0x3c]
+    # CHECK: v128.and # encoding: [0xfd,0x4d]
     v128.and
 
-    # CHECK: v128.or # encoding: [0xfd,0x3d]
+    # CHECK: v128.or # encoding: [0xfd,0x4e]
     v128.or
 
-    # CHECK: v128.xor # encoding: [0xfd,0x3e]
+    # CHECK: v128.xor # encoding: [0xfd,0x4f]
     v128.xor
 
-    # CHECK: v128.not # encoding: [0xfd,0x3f]
-    v128.not
-
-    # CHECK: v128.bitselect # encoding: [0xfd,0x40]
+    # CHECK: v128.bitselect # encoding: [0xfd,0x50]
     v128.bitselect
 
-    # CHECK: i8x16.any_true # encoding: [0xfd,0x41]
-    i8x16.any_true
-
-    # CHECK: i16x8.any_true # encoding: [0xfd,0x42]
-    i16x8.any_true
-
-    # CHECK: i32x4.any_true # encoding: [0xfd,0x43]
-    i32x4.any_true
+    # CHECK: i8x16.neg # encoding: [0xfd,0x51]
+    i8x16.neg
 
-    # CHECK: i64x2.any_true # encoding: [0xfd,0x44]
-    i64x2.any_true
+    # CHECK: i8x16.any_true # encoding: [0xfd,0x52]
+    i8x16.any_true
 
-    # CHECK: i8x16.all_true # encoding: [0xfd,0x45]
+    # CHECK: i8x16.all_true # encoding: [0xfd,0x53]
     i8x16.all_true
 
-    # CHECK: i16x8.all_true # encoding: [0xfd,0x46]
-    i16x8.all_true
-
-    # CHECK: i32x4.all_true # encoding: [0xfd,0x47]
-    i32x4.all_true
-
-    # CHECK: i64x2.all_true # encoding: [0xfd,0x48]
-    i64x2.all_true
+    # CHECK: i8x16.shl # encoding: [0xfd,0x54]
+    i8x16.shl
 
-    # CHECK: i8x16.eq # encoding: [0xfd,0x49]
-    i8x16.eq
+    # CHECK: i8x16.shr_s # encoding: [0xfd,0x55]
+    i8x16.shr_s
 
-    # CHECK: i16x8.eq # encoding: [0xfd,0x4a]
-    i16x8.eq
+    # CHECK: i8x16.shr_u # encoding: [0xfd,0x56]
+    i8x16.shr_u
 
-    # CHECK: i32x4.eq # encoding: [0xfd,0x4b]
-    i32x4.eq
+    # CHECK: i8x16.add # encoding: [0xfd,0x57]
+    i8x16.add
 
-    # CHECK: f32x4.eq # encoding: [0xfd,0x4d]
-    f32x4.eq
+    # CHECK: i8x16.add_saturate_s # encoding: [0xfd,0x58]
+    i8x16.add_saturate_s
 
-    # CHECK: f64x2.eq # encoding: [0xfd,0x4e]
-    f64x2.eq
+    # CHECK: i8x16.add_saturate_u # encoding: [0xfd,0x59]
+    i8x16.add_saturate_u
 
-    # CHECK: i8x16.ne # encoding: [0xfd,0x4f]
-    i8x16.ne
+    # CHECK: i8x16.sub # encoding: [0xfd,0x5a]
+    i8x16.sub
 
-    # CHECK: i16x8.ne # encoding: [0xfd,0x50]
-    i16x8.ne
+    # CHECK: i8x16.sub_saturate_s # encoding: [0xfd,0x5b]
+    i8x16.sub_saturate_s
 
-    # CHECK: i32x4.ne # encoding: [0xfd,0x51]
-    i32x4.ne
+    # CHECK: i8x16.sub_saturate_u # encoding: [0xfd,0x5c]
+    i8x16.sub_saturate_u
 
-    # CHECK: f32x4.ne # encoding: [0xfd,0x53]
-    f32x4.ne
+    # CHECK: i8x16.mul # encoding: [0xfd,0x5d]
+    i8x16.mul
 
-    # CHECK: f64x2.ne # encoding: [0xfd,0x54]
-    f64x2.ne
+    # CHECK: i16x8.neg # encoding: [0xfd,0x62]
+    i16x8.neg
 
-    # CHECK: i8x16.lt_s # encoding: [0xfd,0x55]
-    i8x16.lt_s
+    # CHECK: i16x8.any_true # encoding: [0xfd,0x63]
+    i16x8.any_true
 
-    # CHECK: i8x16.lt_u # encoding: [0xfd,0x56]
-    i8x16.lt_u
+    # CHECK: i16x8.all_true # encoding: [0xfd,0x64]
+    i16x8.all_true
 
-    # CHECK: i16x8.lt_s # encoding: [0xfd,0x57]
-    i16x8.lt_s
+    # CHECK: i16x8.shl # encoding: [0xfd,0x65]
+    i16x8.shl
 
-    # CHECK: i16x8.lt_u # encoding: [0xfd,0x58]
-    i16x8.lt_u
+    # CHECK: i16x8.shr_s # encoding: [0xfd,0x66]
+    i16x8.shr_s
 
-    # CHECK: i32x4.lt_s # encoding: [0xfd,0x59]
-    i32x4.lt_s
+    # CHECK: i16x8.shr_u # encoding: [0xfd,0x67]
+    i16x8.shr_u
 
-    # CHECK: i32x4.lt_u # encoding: [0xfd,0x5a]
-    i32x4.lt_u
+    # CHECK: i16x8.add # encoding: [0xfd,0x68]
+    i16x8.add
 
-    # CHECK: f32x4.lt # encoding: [0xfd,0x5d]
-    f32x4.lt
+    # CHECK: i16x8.add_saturate_s # encoding: [0xfd,0x69]
+    i16x8.add_saturate_s
 
-    # CHECK: f64x2.lt # encoding: [0xfd,0x5e]
-    f64x2.lt
+    # CHECK: i16x8.add_saturate_u # encoding: [0xfd,0x6a]
+    i16x8.add_saturate_u
 
-    # CHECK: i8x16.le_s # encoding: [0xfd,0x5f]
-    i8x16.le_s
+    # CHECK: i16x8.sub # encoding: [0xfd,0x6b]
+    i16x8.sub
 
-    # CHECK: i8x16.le_u # encoding: [0xfd,0x60]
-    i8x16.le_u
+    # CHECK: i16x8.sub_saturate_s # encoding: [0xfd,0x6c]
+    i16x8.sub_saturate_s
 
-    # CHECK: i16x8.le_s # encoding: [0xfd,0x61]
-    i16x8.le_s
+    # CHECK: i16x8.sub_saturate_u # encoding: [0xfd,0x6d]
+    i16x8.sub_saturate_u
 
-    # CHECK: i16x8.le_u # encoding: [0xfd,0x62]
-    i16x8.le_u
+    # CHECK: i16x8.mul # encoding: [0xfd,0x6e]
+    i16x8.mul
 
-    # CHECK: i32x4.le_s # encoding: [0xfd,0x63]
-    i32x4.le_s
+    # CHECK: i32x4.neg # encoding: [0xfd,0x73]
+    i32x4.neg
 
-    # CHECK: i32x4.le_u # encoding: [0xfd,0x64]
-    i32x4.le_u
+    # CHECK: i32x4.any_true # encoding: [0xfd,0x74]
+    i32x4.any_true
 
-    # CHECK: f32x4.le # encoding: [0xfd,0x67]
-    f32x4.le
+    # CHECK: i32x4.all_true # encoding: [0xfd,0x75]
+    i32x4.all_true
 
-    # CHECK: f64x2.le # encoding: [0xfd,0x68]
-    f64x2.le
+    # CHECK: i32x4.shl # encoding: [0xfd,0x76]
+    i32x4.shl
 
-    # CHECK: i8x16.gt_s # encoding: [0xfd,0x69]
-    i8x16.gt_s
+    # CHECK: i32x4.shr_s # encoding: [0xfd,0x77]
+    i32x4.shr_s
 
-    # CHECK: i8x16.gt_u # encoding: [0xfd,0x6a]
-    i8x16.gt_u
+    # CHECK: i32x4.shr_u # encoding: [0xfd,0x78]
+    i32x4.shr_u
 
-    # CHECK: i16x8.gt_s # encoding: [0xfd,0x6b]
-    i16x8.gt_s
+    # CHECK: i32x4.add # encoding: [0xfd,0x79]
+    i32x4.add
 
-    # CHECK: i16x8.gt_u # encoding: [0xfd,0x6c]
-    i16x8.gt_u
+    # CHECK: i32x4.sub # encoding: [0xfd,0x7c]
+    i32x4.sub
 
-    # CHECK: i32x4.gt_s # encoding: [0xfd,0x6d]
-    i32x4.gt_s
+    # CHECK: i32x4.mul # encoding: [0xfd,0x7f]
+    i32x4.mul
 
-    # CHECK: i32x4.gt_u # encoding: [0xfd,0x6e]
-    i32x4.gt_u
+    # CHECK: i64x2.neg # encoding: [0xfd,0x84,0x01]
+    i64x2.neg
 
-    # CHECK: f32x4.gt # encoding: [0xfd,0x71]
-    f32x4.gt
+    # CHECK: i64x2.any_true # encoding: [0xfd,0x85,0x01]
+    i64x2.any_true
 
-    # CHECK: f64x2.gt # encoding: [0xfd,0x72]
-    f64x2.gt
+    # CHECK: i64x2.all_true # encoding: [0xfd,0x86,0x01]
+    i64x2.all_true
 
-    # CHECK: i8x16.ge_s # encoding: [0xfd,0x73]
-    i8x16.ge_s
+    # CHECK: i64x2.shl # encoding: [0xfd,0x87,0x01]
+    i64x2.shl
 
-    # CHECK: i8x16.ge_u # encoding: [0xfd,0x74]
-    i8x16.ge_u
+    # CHECK: i64x2.shr_s # encoding: [0xfd,0x88,0x01]
+    i64x2.shr_s
 
-    # CHECK: i16x8.ge_s # encoding: [0xfd,0x75]
-    i16x8.ge_s
+    # CHECK: i64x2.shr_u # encoding: [0xfd,0x89,0x01]
+    i64x2.shr_u
 
-    # CHECK: i16x8.ge_u # encoding: [0xfd,0x76]
-    i16x8.ge_u
+    # CHECK: i64x2.add # encoding: [0xfd,0x8a,0x01]
+    i64x2.add
 
-    # CHECK: i32x4.ge_s # encoding: [0xfd,0x77]
-    i32x4.ge_s
+    # CHECK: i64x2.sub # encoding: [0xfd,0x8d,0x01]
+    i64x2.sub
 
-    # CHECK: i32x4.ge_u # encoding: [0xfd,0x78]
-    i32x4.ge_u
+    # CHECK: f32x4.abs # encoding: [0xfd,0x95,0x01]
+    f32x4.abs
 
-    # CHECK: f32x4.ge # encoding: [0xfd,0x7b]
-    f32x4.ge
+    # CHECK: f32x4.neg # encoding: [0xfd,0x96,0x01]
+    f32x4.neg
 
-    # CHECK: f64x2.ge # encoding: [0xfd,0x7c]
-    f64x2.ge
+    # CHECK: f32x4.sqrt # encoding: [0xfd,0x97,0x01]
+    f32x4.sqrt
 
-    # CHECK: f32x4.neg # encoding: [0xfd,0x7d]
-    f32x4.neg
+    # CHECK: f32x4.add # encoding: [0xfd,0x9a,0x01]
+    f32x4.add
 
-    # CHECK: f64x2.neg # encoding: [0xfd,0x7e]
-    f64x2.neg
+    # CHECK: f32x4.sub # encoding: [0xfd,0x9b,0x01]
+    f32x4.sub
 
-    # CHECK: f32x4.abs # encoding: [0xfd,0x7f]
-    f32x4.abs
+    # CHECK: f32x4.mul # encoding: [0xfd,0x9c,0x01]
+    f32x4.mul
 
-    # CHECK: f64x2.abs # encoding: [0xfd,0x80]
-    f64x2.abs
+    # CHECK: f32x4.div # encoding: [0xfd,0x9d,0x01]
+    f32x4.div
 
-    # CHECK: f32x4.min # encoding: [0xfd,0x81]
+    # CHECK: f32x4.min # encoding: [0xfd,0x9e,0x01]
     f32x4.min
 
-    # CHECK: f64x2.min # encoding: [0xfd,0x82]
-    f64x2.min
-
-    # CHECK: f32x4.max # encoding: [0xfd,0x83]
+    # CHECK: f32x4.max # encoding: [0xfd,0x9f,0x01]
     f32x4.max
 
-    # CHECK: f64x2.max # encoding: [0xfd,0x84]
-    f64x2.max
+    # CHECK: f64x2.abs # encoding: [0xfd,0xa0,0x01]
+    f64x2.abs
 
-    # CHECK: f32x4.add # encoding: [0xfd,0x85]
-    f32x4.add
+    # CHECK: f64x2.neg # encoding: [0xfd,0xa1,0x01]
+    f64x2.neg
 
-    # CHECK: f64x2.add # encoding: [0xfd,0x86]
-    f64x2.add
+    # CHECK: f64x2.sqrt # encoding: [0xfd,0xa2,0x01]
+    f64x2.sqrt
 
-    # CHECK: f32x4.sub # encoding: [0xfd,0x87]
-    f32x4.sub
+    # CHECK: f64x2.add # encoding: [0xfd,0xa5,0x01]
+    f64x2.add
 
-    # CHECK: f64x2.sub # encoding: [0xfd,0x88]
+    # CHECK: f64x2.sub # encoding: [0xfd,0xa6,0x01]
     f64x2.sub
 
-    # CHECK: f32x4.div # encoding: [0xfd,0x89]
-    f32x4.div
+    # CHECK: f64x2.mul # encoding: [0xfd,0xa7,0x01]
+    f64x2.mul
 
-    # CHECK: f64x2.div # encoding: [0xfd,0x8a]
+    # CHECK: f64x2.div # encoding: [0xfd,0xa8,0x01]
     f64x2.div
 
-    # CHECK: f32x4.mul # encoding: [0xfd,0x8b]
-    f32x4.mul
+    # CHECK: f64x2.min # encoding: [0xfd,0xa9,0x01]
+    f64x2.min
 
-    # CHECK: f64x2.mul # encoding: [0xfd,0x8c]
-    f64x2.mul
+    # CHECK: f64x2.max # encoding: [0xfd,0xaa,0x01]
+    f64x2.max
 
-    # CHECK: f32x4.sqrt # encoding: [0xfd,0x8d]
-    f32x4.sqrt
+    # CHECK: i32x4.trunc_sat_s/f32x4 # encoding: [0xfd,0xab,0x01]
+    i32x4.trunc_sat_s/f32x4
 
-    # CHECK: f64x2.sqrt # encoding: [0xfd,0x8e]
-    f64x2.sqrt
+    # CHECK: i32x4.trunc_sat_u/f32x4 # encoding: [0xfd,0xac,0x01]
+    i32x4.trunc_sat_u/f32x4
+
+    # CHECK: i64x2.trunc_sat_s/f64x2 # encoding: [0xfd,0xad,0x01]
+    i64x2.trunc_sat_s/f64x2
+
+    # CHECK: i64x2.trunc_sat_u/f64x2 # encoding: [0xfd,0xae,0x01]
+    i64x2.trunc_sat_u/f64x2
 
-    # CHECK: f32x4.convert_s/i32x4 # encoding: [0xfd,0x8f]
+    # CHECK: f32x4.convert_s/i32x4 # encoding: [0xfd,0xaf,0x01]
     f32x4.convert_s/i32x4
 
-    # CHECK: f32x4.convert_u/i32x4 # encoding: [0xfd,0x90]
+    # CHECK: f32x4.convert_u/i32x4 # encoding: [0xfd,0xb0,0x01]
     f32x4.convert_u/i32x4
 
-    # CHECK: f64x2.convert_s/i64x2 # encoding: [0xfd,0x91]
+    # CHECK: f64x2.convert_s/i64x2 # encoding: [0xfd,0xb1,0x01]
     f64x2.convert_s/i64x2
 
-    # CHECK: f64x2.convert_u/i64x2 # encoding: [0xfd,0x92]
+    # CHECK: f64x2.convert_u/i64x2 # encoding: [0xfd,0xb2,0x01]
     f64x2.convert_u/i64x2
 
-    # CHECK: i32x4.trunc_sat_s/f32x4 # encoding: [0xfd,0x93]
-    i32x4.trunc_sat_s/f32x4
-
-    # CHECK: i32x4.trunc_sat_u/f32x4 # encoding: [0xfd,0x94]
-    i32x4.trunc_sat_u/f32x4
-
-    # CHECK: i64x2.trunc_sat_s/f64x2 # encoding: [0xfd,0x95]
-    i64x2.trunc_sat_s/f64x2
-
-    # CHECK: i64x2.trunc_sat_u/f64x2 # encoding: [0xfd,0x96]
-    i64x2.trunc_sat_u/f64x2
-
     end_function
diff --git a/test/MC/WebAssembly/weak-alias.ll b/test/MC/WebAssembly/weak-alias.ll
index bd70e74d3ac07b001a258bdfadffe1aacad004ba..bf636953ea4ebdb4f08216d91acab292c7d79ec7 100644
--- a/test/MC/WebAssembly/weak-alias.ll
+++ b/test/MC/WebAssembly/weak-alias.ll
@@ -215,8 +215,8 @@ entry:
 ; CHECK-SYMS-NEXT: 00000002 g     F CODE	.hidden call_alias
 ; CHECK-SYMS-NEXT: 00000000 gw    F CODE	.hidden foo_alias
 ; CHECK-SYMS-NEXT: 00000003 g     F CODE	.hidden call_direct_ptr
-; CHECK-SYMS-NEXT: 00000008 g       DATA	direct_address
+; CHECK-SYMS-NEXT: 00000008 g     O DATA	direct_address
 ; CHECK-SYMS-NEXT: 00000004 g     F CODE	.hidden call_alias_ptr
-; CHECK-SYMS-NEXT: 00000010 g       DATA	alias_address
-; CHECK-SYMS-NEXT: 00000000 g       DATA	bar
-; CHECK-SYMS-NEXT: 00000000 gw      DATA	.hidden bar_alias
+; CHECK-SYMS-NEXT: 00000010 g     O DATA	alias_address
+; CHECK-SYMS-NEXT: 00000000 g     O DATA	bar
+; CHECK-SYMS-NEXT: 00000000 gw    O DATA	.hidden bar_alias
diff --git a/test/Object/AMDGPU/objdump.s b/test/Object/AMDGPU/objdump.s
index 31306ee90d822fe18bf9f2d8601f38f118f6672b..3c3f4a11df41e88c363ff2d371a7024f8010e58e 100644
--- a/test/Object/AMDGPU/objdump.s
+++ b/test/Object/AMDGPU/objdump.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s -filetype=obj | llvm-objdump -disassemble -arch-name=amdgcn -mcpu=tonga - | FileCheck %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s -mattr=-code-object-v3 -filetype=obj | llvm-objdump -disassemble -arch-name=amdgcn -mcpu=tonga - | FileCheck %s
 
 	.text
 
diff --git a/test/Object/Inputs/WASM/invalid-section-order.wasm b/test/Object/Inputs/WASM/invalid-section-order.wasm
new file mode 100644
index 0000000000000000000000000000000000000000..f2726ff176f132987228ac526427604fa008dc45
Binary files /dev/null and b/test/Object/Inputs/WASM/invalid-section-order.wasm differ
diff --git a/test/Object/objdump-symbol-table.test b/test/Object/objdump-symbol-table.test
index e66faecf4cc6439b9e7896dd0f381fe26493744d..8faee111638106f6afba6f2c9422375bbc50264e 100644
--- a/test/Object/objdump-symbol-table.test
+++ b/test/Object/objdump-symbol-table.test
@@ -37,7 +37,7 @@ macho-i386: 00000000         *UND*          _puts
 ELF-shared: shared-object-test.elf-i386:     file format
 ELF-shared: SYMBOL TABLE:
 ELF-shared: 00000200 l     F .text 00000003 local_func
-ELF-shared: 000012b8 l       *ABS* 00000000 .hidden _GLOBAL_OFFSET_TABLE_
-ELF-shared: 00001248 l       *ABS* 00000000 .hidden _DYNAMIC
-ELF-shared: 000012c4 g       .data 00000004 defined_sym
+ELF-shared: 000012b8 l     O *ABS*           00000000 .hidden _GLOBAL_OFFSET_TABLE_
+ELF-shared: 00001248 l     O *ABS*           00000000 .hidden _DYNAMIC
+ELF-shared: 000012c4 g     O .data           00000004 defined_sym
 ELF-shared: 000001f0 g     F .text 00000003 global_func
diff --git a/test/Object/wasm-invalid-section-order.test b/test/Object/wasm-invalid-section-order.test
new file mode 100644
index 0000000000000000000000000000000000000000..bb008ffac49ce8465dfc481f1b59fd9fe4a9223a
--- /dev/null
+++ b/test/Object/wasm-invalid-section-order.test
@@ -0,0 +1,16 @@
+# RUN: not obj2yaml %p/Inputs/WASM/invalid-section-order.wasm 2>&1 | FileCheck %s
+# CHECK: {{.*}}: Out of order section type: 10
+
+# Inputs/WASM/invalid-section-order.wasm is generated from this ll file, by
+# modifying WasmObjectWriter to incorrectly write the data section before the
+# code section.
+#
+# target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+# target triple = "wasm32-unknown-unknown"
+#
+# @data = global i32 0, align 4
+#
+# define void @foo() {
+# entry:
+#   ret void
+# }
diff --git a/test/ObjectYAML/wasm/dylink_section.yaml b/test/ObjectYAML/wasm/dylink_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9cc3f7d0a4ed60933210e1a9473d68117a71f129
--- /dev/null
+++ b/test/ObjectYAML/wasm/dylink_section.yaml
@@ -0,0 +1,28 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+
+Sections:
+  - Type:            CUSTOM
+    Name:            dylink
+    MemorySize:      4
+    MemoryAlignment: 2
+    TableSize:       1
+    TableAlignment:  0
+    Needed:          [ libfoo.so, libbar.so ]
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            CUSTOM
+# CHECK:     Name:            dylink
+# CHECK:     MemorySize:      4
+# CHECK:     MemoryAlignment: 2
+# CHECK:     TableSize:       1
+# CHECK:     TableAlignment:  0
+# CHECK:     Needed:
+# CHECK:       - libfoo.so
+# CHECK:       - libbar.so
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/event_section.yaml b/test/ObjectYAML/wasm/event_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..017efda056ce14dfd2ff939dcaf446a81c9ee1c5
--- /dev/null
+++ b/test/ObjectYAML/wasm/event_section.yaml
@@ -0,0 +1,92 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TYPE
+    Signatures:
+      - Index:           0
+        ReturnType:      I32
+        ParamTypes:
+          - I32
+      - Index:           1
+        ReturnType:      NORESULT
+        ParamTypes:
+          - I32
+  - Type:            FUNCTION
+    FunctionTypes:   [ 0 ]
+  - Type:            EVENT
+    Events:
+      - Index:           0
+        Attribute:       0
+        SigIndex:        1
+  - Type:            CODE
+    Relocations:
+      - Type:            R_WEBASSEMBLY_EVENT_INDEX_LEB
+        Index:           1
+        Offset:          0x00000006
+    Functions:
+      - Index:           0
+        Locals:
+        Body:            200008808080800041000B
+  - Type:            CUSTOM
+    Name:            linking
+    Version:         1
+    SymbolTable:
+      - Index:           0
+        Kind:            FUNCTION
+        Name:            test_throw0
+        Flags:           [  ]
+        Function:        0
+      - Index:           1
+        Kind:            EVENT
+        Name:            __cpp_exception
+        Flags:           [ BINDING_WEAK ]
+        Event:           0
+...
+
+# CHECK:      --- !WASM
+# CHECK-NEXT: FileHeader:
+# CHECK-NEXT:   Version:         0x00000001
+# CHECK-NEXT: Sections:
+# CHECK-NEXT:   - Type:            TYPE
+# CHECK-NEXT:     Signatures:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         ReturnType:      I32
+# CHECK-NEXT:         ParamTypes:
+# CHECK-NEXT:           - I32
+# CHECK-NEXT:       - Index:           1
+# CHECK-NEXT:         ReturnType:      NORESULT
+# CHECK-NEXT:         ParamTypes:
+# CHECK-NEXT:           - I32
+# CHECK-NEXT:   - Type:            FUNCTION
+# CHECK-NEXT:     FunctionTypes:   [ 0 ]
+# CHECK-NEXT:   - Type:            EVENT
+# CHECK-NEXT:     Events:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         Attribute:       0
+# CHECK-NEXT:         SigIndex:        1
+# CHECK-NEXT:   - Type:            CODE
+# CHECK-NEXT:     Relocations:
+# CHECK-NEXT:       - Type:            R_WEBASSEMBLY_EVENT_INDEX_LEB
+# CHECK-NEXT:         Index:           1
+# CHECK-NEXT:         Offset:          0x00000006
+# CHECK-NEXT:     Functions:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         Locals:
+# CHECK-NEXT:         Body:            200008808080800041000B
+# CHECK-NEXT:   - Type:            CUSTOM
+# CHECK-NEXT:     Name:            linking
+# CHECK-NEXT:     Version:         1
+# CHECK-NEXT:     SymbolTable:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         Kind:            FUNCTION
+# CHECK-NEXT:         Name:            test_throw0
+# CHECK-NEXT:         Flags:           [  ]
+# CHECK-NEXT:         Function:        0
+# CHECK-NEXT:       - Index:           1
+# CHECK-NEXT:         Kind:            EVENT
+# CHECK-NEXT:         Name:            __cpp_exception
+# CHECK-NEXT:         Flags:           [ BINDING_WEAK ]
+# CHECK-NEXT:         Event:           0
diff --git a/test/ObjectYAML/wasm/invalid_section_order.yaml b/test/ObjectYAML/wasm/invalid_section_order.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..52ad13619622ac1d5d58bd5f02fd7e685048b137
--- /dev/null
+++ b/test/ObjectYAML/wasm/invalid_section_order.yaml
@@ -0,0 +1,20 @@
+# RUN: not yaml2obj %s -o /dev/null 2>&1 | FileCheck %s
+
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TYPE
+    Signatures:
+      - Index:           0
+        ReturnType:      NORESULT
+        ParamTypes:      []
+  - Type:            CODE
+    Functions:
+      - Index:           0
+        Locals:          []
+        Body:            0B
+  # CHECK: Out of order section type: 3
+  - Type:            FUNCTION
+    FunctionTypes:   [ 0 ]
+...
diff --git a/test/Other/2010-05-06-Printer.ll b/test/Other/2010-05-06-Printer.ll
index dcc0e752bb5adbeecbc9e29deab85846a609ed9f..9e7c9cb6ab4a8d7ce5042567588bcaddc934abdf 100644
--- a/test/Other/2010-05-06-Printer.ll
+++ b/test/Other/2010-05-06-Printer.ll
@@ -10,10 +10,12 @@ define void @foo(){
   ret void
 }
 
+;ALL-NOT: IR Dump After {{.*}}; ModuleID =
 ;ALL: define void @tester()
 ;ALL: define void @foo()
 ;ALL: ModuleID =
 
 ;FOO: IR Dump After
+;FOO-EMPTY:
 ;FOO-NEXT: define void @foo()
 ;FOO-NOT: define void @tester
diff --git a/test/Other/loop-deletion-printer.ll b/test/Other/loop-deletion-printer.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d344568506cd7e80bfd7499615c4ee842a00e2f4
--- /dev/null
+++ b/test/Other/loop-deletion-printer.ll
@@ -0,0 +1,24 @@
+; Make sure that Loop which was invalidated by loop-deletion
+; does not lead to problems for -print-after-all and is just skipped.
+;
+; RUN: opt < %s -disable-output \
+; RUN:     -passes=loop-instsimplify -print-after-all  2>&1 | FileCheck %s -check-prefix=SIMPLIFY
+; RUN: opt < %s -disable-output \
+; RUN:     -passes=loop-deletion,loop-instsimplify -print-after-all  2>&1 | FileCheck %s -check-prefix=DELETED
+;
+; SIMPLIFY: IR Dump {{.*}} LoopInstSimplifyPass
+; DELETED-NOT: IR Dump {{.*}}LoopInstSimplifyPass
+; DELETED-NOT: IR Dump {{.*}}LoopDeletionPass
+
+define void @deleteme() {
+entry:
+  br label %loop
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, 1
+  %check = icmp ult i32 %iv.next, 3
+  br i1 %check, label %loop, label %exit
+exit:
+  ret void
+}
+
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index f86257b406af2b8b8c0defaa44636e766cb4416f..3ea52063c464691930d909d7abbf9c7dcad7f59f 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -66,6 +66,11 @@
 ; RUN:     -passes='lto-pre-link<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
 ; RUN:     --check-prefix=CHECK-EP-PIPELINE-START
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN:     -passes-ep-optimizer-last='no-op-function' \
+; RUN:     -passes='default<O3>' -S  %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN:     --check-prefix=CHECK-EP-OPTIMIZER-LAST
 
 ; CHECK-O: Running analysis: PassInstrumentationAnalysis
 ; CHECK-O-NEXT: Starting llvm::Module pass manager run.
@@ -241,6 +246,7 @@
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
+; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
 ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass
@@ -254,6 +260,7 @@
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: SpeculateAroundPHIsPass
+; CHECK-EP-OPTIMIZER-LAST: Running pass: NoOpFunctionPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
 ; CHECK-O-NEXT: Running pass: CGProfilePass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll
index 03f4804a2ba7c29998bf396c568d1f169a3fd4cb..ee2f0414e6285d9965e588a0ca13ad847d6f43ca 100644
--- a/test/Other/new-pm-lto-defaults.ll
+++ b/test/Other/new-pm-lto-defaults.ll
@@ -38,13 +38,13 @@
 ; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo
 ; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo
 ; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis on foo
+; CHECK-O2-NEXT: Running analysis: DominatorTreeAnalysis on foo
 ; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
 ; CHECK-O2-NEXT: PGOIndirectCallPromotion
 ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis
 ; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O2-NEXT: Running pass: IPSCCPPass
-; CHECK-O2-DAG: Running analysis: AssumptionAnalysis on foo
-; CHECK-O2-DAG: Running analysis: DominatorTreeAnalysis on foo
+; CHECK-O2-NEXT: Running analysis: AssumptionAnalysis on foo
 ; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass
 ; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}PostOrderFunctionAttrsPass>
 ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}SCC
diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll
index c68aa1d05aa3024f637c3a7efe41d52669ceb075..3d1c9092c5b4a382cb7fe004d6f91338915ea62e 100644
--- a/test/Other/new-pm-thinlto-defaults.ll
+++ b/test/Other/new-pm-thinlto-defaults.ll
@@ -224,6 +224,7 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
+; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass
diff --git a/test/Other/opt-O2-pipeline.ll b/test/Other/opt-O2-pipeline.ll
index a7f64631b6ea0a0618fdd843560839d9130588e3..1e48c86d16da8e7d572c8d4f669b676bacc79d3c 100644
--- a/test/Other/opt-O2-pipeline.ll
+++ b/test/Other/opt-O2-pipeline.ll
@@ -250,6 +250,10 @@
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Loop Pass Manager
 ; CHECK-NEXT:         Loop Invariant Code Motion
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Warn about non-applied transformations
 ; CHECK-NEXT:       Alignment from assumptions
 ; CHECK-NEXT:     Strip Unused Function Prototypes
 ; CHECK-NEXT:     Dead Global Elimination
diff --git a/test/Other/opt-O3-pipeline.ll b/test/Other/opt-O3-pipeline.ll
index 33033fef1837797b225da037915b15e0fec1613a..1eb79159d0d16c63421418aaf2696b02b41c27af 100644
--- a/test/Other/opt-O3-pipeline.ll
+++ b/test/Other/opt-O3-pipeline.ll
@@ -28,6 +28,7 @@
 ; CHECK-NEXT:     Force set function attributes
 ; CHECK-NEXT:     Infer set function attributes
 ; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Call-site splitting
 ; CHECK-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
@@ -254,6 +255,10 @@
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Loop Pass Manager
 ; CHECK-NEXT:         Loop Invariant Code Motion
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Warn about non-applied transformations
 ; CHECK-NEXT:       Alignment from assumptions
 ; CHECK-NEXT:     Strip Unused Function Prototypes
 ; CHECK-NEXT:     Dead Global Elimination
diff --git a/test/Other/opt-Os-pipeline.ll b/test/Other/opt-Os-pipeline.ll
index d1f874f5d199512c1d87b63dbd5522990087baf9..c44f6506c4ef76b83c9ff908accfd4b97365a2f6 100644
--- a/test/Other/opt-Os-pipeline.ll
+++ b/test/Other/opt-Os-pipeline.ll
@@ -237,6 +237,10 @@
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Loop Pass Manager
 ; CHECK-NEXT:         Loop Invariant Code Motion
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Warn about non-applied transformations
 ; CHECK-NEXT:       Alignment from assumptions
 ; CHECK-NEXT:     Strip Unused Function Prototypes
 ; CHECK-NEXT:     Dead Global Elimination
diff --git a/test/Other/opt-hot-cold-split.ll b/test/Other/opt-hot-cold-split.ll
index a3fbdeffb2fd4db8ce6d7649733bd017d828c0d9..ab08258359ffeeaf66b7210211f1624682b0aea8 100644
--- a/test/Other/opt-hot-cold-split.ll
+++ b/test/Other/opt-hot-cold-split.ll
@@ -236,6 +236,10 @@
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Loop Pass Manager
 ; CHECK-NEXT:         Loop Invariant Code Motion
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Warn about non-applied transformations
 ; CHECK-NEXT:       Alignment from assumptions
 ; CHECK-NEXT:     Strip Unused Function Prototypes
 ; CHECK-NEXT:     Dead Global Elimination
diff --git a/test/Other/scc-deleted-printer.ll b/test/Other/scc-deleted-printer.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e6323f0274a3811b07f1c9793d679706cf2141d5
--- /dev/null
+++ b/test/Other/scc-deleted-printer.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -passes=inline -print-before-all -print-after-all | FileCheck %s -check-prefix=INL
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -passes=inline -print-before-all -print-after-all -print-module-scope | FileCheck %s -check-prefix=INL-MOD
+
+; INL: IR Dump Before {{InlinerPass .*scc: .tester, foo}}
+; INL-NOT: IR Dump After {{InlinerPass}}
+; INL: IR Dump Before {{InlinerPass .*scc: .tester}}
+; INL: IR Dump After {{InlinerPass .*scc: .tester}}
+
+; INL-MOD: IR Dump Before {{InlinerPass .*scc: .tester, foo}}
+; INL-MOD-NOT: IR Dump After {{InlinerPass}}
+; INL-MOD: IR Dump Before {{InlinerPass .*scc: .tester}}
+; INL-MOD: IR Dump After {{InlinerPass .*scc: .tester}}
+
+
+define void @tester() noinline {
+  call void @foo()
+  ret void
+}
+
+define internal void @foo() alwaysinline {
+  call void @tester()
+  ret void
+}
diff --git a/test/Other/scc-pass-printer.ll b/test/Other/scc-pass-printer.ll
index 9d86bf039636a1da179aead92e09f9bf569fbed2..9e12a285f025717dfc0b8dee2c98cfd70f809391 100644
--- a/test/Other/scc-pass-printer.ll
+++ b/test/Other/scc-pass-printer.ll
@@ -18,6 +18,8 @@
 ; INL: IR Dump After
 
 ; INL-MOD: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .bar, foo}}
+; INL-MOD-NEXT: ModuleID =
+; INL-MOD-NEXT: source_filename =
 ; INL-MOD: define void @tester()
 ; INL-MOD-NEXT:  call void @foo()
 ; INL-MOD: define void @foo()
@@ -25,6 +27,8 @@
 ; INL-MOD: define void @bar()
 ; INL-MOD-NEXT:  call void @foo()
 ; INL-MOD: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .tester}}
+; INL-MOD-NEXT: ModuleID =
+; INL-MOD-NEXT: source_filename =
 ; INL-MOD: define void @tester()
 ; INL-MOD-NEXT:  call void @foo()
 ; INL-MOD: define void @foo()
@@ -32,6 +36,9 @@
 ; INL-MOD: define void @bar()
 ; INL-MOD-NEXT:  call void @foo()
 ; INL-MOD: IR Dump After
+; INL-MOD-NEXT: ModuleID =
+; INL-MOD-NEXT: source_filename =
+; INL-MOD-NOT: Printing <null> Function
 
 define void @tester() noinline {
   call void @foo()
diff --git a/test/TableGen/ConstraintChecking.inc b/test/TableGen/ConstraintChecking.inc
new file mode 100644
index 0000000000000000000000000000000000000000..1c5872953be55b21fdf87673417673fd4caf09c2
--- /dev/null
+++ b/test/TableGen/ConstraintChecking.inc
@@ -0,0 +1,33 @@
+include "llvm/Target/Target.td"
+
+def TestTarget : Target;
+
+class Encoding : Instruction {
+  field bits<8> Inst;
+}
+
+class TestReg<string name, bits<1> enc> : Register<name, []> {
+    let HWEncoding{15-1} = 0;
+    let HWEncoding{0} = enc;
+}
+
+def R0 : TestReg<"R0", 0>;
+def R1 : TestReg<"R1", 1>;
+def Reg : RegisterClass<"TestTarget", [i32], 32, (sequence "R%d", 0, 1)>;
+
+class TestInstructionWithConstraints<string cstr> : Encoding {
+  dag OutOperandList = (outs Reg:$dest1, Reg:$dest2);
+  dag InOperandList = (ins Reg:$src1, Reg:$src2);
+  string AsmString = "mnemonic $dest1, $dest2, $src1, $src2";
+  string AsmVariantName = "";
+  let Constraints = cstr;
+  field bits<1> dest1;
+  field bits<1> dest2;
+  field bits<1> src1;
+  field bits<1> src2;
+  let Inst{7-4} = 0b1010;
+  let Inst{3} = dest1{0};
+  let Inst{2} = dest2{0};
+  let Inst{1} = src1{0};
+  let Inst{0} = src2{0};
+}
diff --git a/test/TableGen/ConstraintChecking1.td b/test/TableGen/ConstraintChecking1.td
new file mode 100644
index 0000000000000000000000000000000000000000..2c0c3d9bfabb643895e011ff6fec27729d80b4b8
--- /dev/null
+++ b/test/TableGen/ConstraintChecking1.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -gen-asm-writer -I %p -I %p/../../include %s 2>&1 | FileCheck %s -DFILE=%s
+
+include "ConstraintChecking.inc"
+
+// CHECK: [[FILE]]:[[@LINE+1]]:1: error: Unrecognized constraint '$dest1 ~ $src2' in 'Foo'
+def Foo : TestInstructionWithConstraints<"$dest1 ~ $src2">;
diff --git a/test/TableGen/ConstraintChecking2.td b/test/TableGen/ConstraintChecking2.td
new file mode 100644
index 0000000000000000000000000000000000000000..65ec882b18f821dd2266a17c228ef23f90b93919
--- /dev/null
+++ b/test/TableGen/ConstraintChecking2.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -gen-asm-writer -I %p -I %p/../../include %s 2>&1 | FileCheck %s -DFILE=%s
+
+include "ConstraintChecking.inc"
+
+// CHECK: [[FILE]]:[[@LINE+1]]:1: error: Illegal format for @earlyclobber constraint in 'Foo'
+def Foo : TestInstructionWithConstraints<"@earlyclobber ">;
diff --git a/test/TableGen/ConstraintChecking3.td b/test/TableGen/ConstraintChecking3.td
new file mode 100644
index 0000000000000000000000000000000000000000..835ec2e035f7dff21ffcd4c6a988c4b72f395d22
--- /dev/null
+++ b/test/TableGen/ConstraintChecking3.td
@@ -0,0 +1,8 @@
+// RUN: not llvm-tblgen -gen-asm-writer -I %p -I %p/../../include %s 2>&1 | FileCheck %s -DFILE=%s
+
+include "ConstraintChecking.inc"
+
+// (This is illegal because the '=' has to be surrounded by whitespace)
+
+// CHECK: [[FILE]]:[[@LINE+1]]:1: error: Illegal format for tied-to constraint in 'Foo'
+def Foo : TestInstructionWithConstraints<"$dest1=$dest2">;
diff --git a/test/TableGen/ConstraintChecking4.td b/test/TableGen/ConstraintChecking4.td
new file mode 100644
index 0000000000000000000000000000000000000000..bd511eb1132df87c716048b34c7e17ec96367efc
--- /dev/null
+++ b/test/TableGen/ConstraintChecking4.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -gen-asm-writer -I %p -I %p/../../include %s 2>&1 | FileCheck %s -DFILE=%s
+
+include "ConstraintChecking.inc"
+
+// CHECK: [[FILE]]:[[@LINE+1]]:1: error: Output operands '$dest1' and '$dest2' of 'Foo' cannot be tied!
+def Foo : TestInstructionWithConstraints<"$dest1 = $dest2">;
diff --git a/test/TableGen/ConstraintChecking5.td b/test/TableGen/ConstraintChecking5.td
new file mode 100644
index 0000000000000000000000000000000000000000..7db354637fa1a8082e3218fd7248b9644c129ac0
--- /dev/null
+++ b/test/TableGen/ConstraintChecking5.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -gen-asm-writer -I %p -I %p/../../include %s 2>&1 | FileCheck %s -DFILE=%s
+
+include "ConstraintChecking.inc"
+
+// CHECK: [[FILE]]:[[@LINE+1]]:1: error: Input operands '$src1' and '$src2' of 'Foo' cannot be tied!
+def Foo : TestInstructionWithConstraints<"$src1 = $src2">;
diff --git a/test/TableGen/ConstraintChecking6.td b/test/TableGen/ConstraintChecking6.td
new file mode 100644
index 0000000000000000000000000000000000000000..2d3bdd2adabe7e74011dd37852afd23dbcc9ef72
--- /dev/null
+++ b/test/TableGen/ConstraintChecking6.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -gen-asm-writer -I %p -I %p/../../include %s 2>&1 | FileCheck %s -DFILE=%s
+
+include "ConstraintChecking.inc"
+
+// CHECK: [[FILE]]:[[@LINE+1]]:1: error: Operand '$dest1' of 'Foo' cannot have multiple operands tied to it!
+def Foo : TestInstructionWithConstraints<"$dest1 = $src1, $dest1 = $src2">;
diff --git a/test/TableGen/ConstraintChecking7.td b/test/TableGen/ConstraintChecking7.td
new file mode 100644
index 0000000000000000000000000000000000000000..5a44ee4f1d8ae7bea566930a10686eada03bb58a
--- /dev/null
+++ b/test/TableGen/ConstraintChecking7.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -gen-asm-writer -I %p -I %p/../../include %s 2>&1 | FileCheck %s -DFILE=%s
+
+include "ConstraintChecking.inc"
+
+// CHECK: [[FILE]]:[[@LINE+1]]:1: error: Operand '$src1' of 'Foo' cannot have multiple constraints!
+def Foo : TestInstructionWithConstraints<"$dest1 = $src1, $dest2 = $src1">;
diff --git a/test/TableGen/ambiguous-composition.td b/test/TableGen/ambiguous-composition.td
new file mode 100644
index 0000000000000000000000000000000000000000..da83466b4d0195a1b3c7e0dbd7c0b8a1b7a1ebaf
--- /dev/null
+++ b/test/TableGen/ambiguous-composition.td
@@ -0,0 +1,92 @@
+// RUN: llvm-tblgen -gen-register-info -I %p/../../include %s 2>&1 | FileCheck %s
+//
+// CHECK-NOT: warning: SubRegIndex Test::subreg_h64 and Test::subreg_h32 compose ambiguously as Test::subreg_hh32 or Test::subreg_h32
+// CHECK: warning: SubRegIndex Test::subreg_l64 and Test::subreg_l32 compose ambiguously as Test::subreg_ll32 or Test::subreg_l32
+
+include "llvm/Target/Target.td"
+
+def TestInstrInfo : InstrInfo {
+}
+
+def Test : Target {
+  let InstructionSet = TestInstrInfo;
+}
+
+let Namespace = "Test" in {
+  def subreg_l32  : SubRegIndex<32, 0>;
+  def subreg_h32  : SubRegIndex<32, 32>;
+  def subreg_h64  : SubRegIndex<64, 64>;
+  def subreg_l64  : SubRegIndex<64, 0>;
+  def subreg_hh32 : ComposedSubRegIndex<subreg_h64, subreg_h32>;
+  def subreg_ll32 : ComposedSubRegIndex<subreg_l64, subreg_l32>;
+}
+
+class TestReg<string n, list<Register> s> : RegisterWithSubRegs<n, s> {
+  let Namespace = "Test";
+}
+
+// --------------------------------------------------------------------
+// A situation that previously caused the warning about ambiguous
+// composition.
+//
+// The actual subregister actions are:
+//   subreg_h64:  { F0Q->F0D V0Q->F0D }
+//   subreg_h32:  { F0D->F0S F0Q->F2S V0Q->F0S }
+//   composition: { F0Q->F0S V0Q->F0S }  (this is the same as subreg_hh32)
+// 
+// For the register V0Q, subreg_hh32(V0Q) = subreg_h32(V0Q) = F0S, which
+// would be enough to trigger the warning about ambiguous composition.
+// However, for F0Q, subreg_hh32(F0Q) = F0S, while subreg_h32(F0Q) = F2S,
+// which shows that there two subregister indices are different.
+// Make sure that the warning is not emitted in this case.
+
+class FPR32<string n> : TestReg<n, []> {
+}
+
+class FPR64<string n, FPR32 high> : TestReg<n, [high]> {
+  let SubRegIndices = [subreg_h32];
+}
+
+class FPR128<string n, FPR64 high, FPR32 low> : TestReg<n, [high, low]> {
+  let SubRegIndices = [subreg_h64, subreg_h32];
+}
+
+class VPR128<string n, FPR64 high> : TestReg<n, [high]> {
+  let SubRegIndices = [subreg_h64];
+}
+
+def F0S : FPR32<"f0s">;
+def F1S : FPR32<"f1s">;
+def F2S : FPR32<"f2s">;
+
+def F0D : FPR64<"f0d",  F0S>;
+def F0Q : FPR128<"f0q", F0D, F2S>;
+def V0Q : VPR128<"v0q", F0D>;
+
+def FP32  : RegisterClass<"FP32",  [f32],   32,  (add F0S)>;
+def FP64  : RegisterClass<"FP64",  [f64],   64,  (add F0D)>;
+def FP128 : RegisterClass<"FP128", [v2f64], 128, (add F0Q)>;
+def VP128 : RegisterClass<"VP128", [v2f64], 128, (add V0Q)>;
+
+// --------------------------------------------------------------------
+// A situation where the warning is legitimate.
+// Make sure that the warning is still displayed.
+
+class GPR32<string n> : TestReg<n, []> {
+}
+
+class GPR64<string n, GPR32 low> : TestReg<n, [low]> {
+  let SubRegIndices = [subreg_l32];
+}
+
+class GPR128<string n, GPR64 low> : TestReg<n, [low]> {
+  let SubRegIndices = [subreg_l64];
+}
+
+def G0S : GPR32<"g0s">;
+def G0D : GPR64<"g0d", G0S>;
+def G0Q : GPR128<"g0q", G0D>;
+
+def GP32  : RegisterClass<"GP32",  [i32],   32,  (add G0S)>;
+def GP64  : RegisterClass<"GP64",  [i64],   64,  (add G0D)>;
+def GP128 : RegisterClass<"GP128", [v2i64], 128, (add G0Q)>;
diff --git a/test/TableGen/dag-isel-res-order.td b/test/TableGen/dag-isel-res-order.td
new file mode 100644
index 0000000000000000000000000000000000000000..4da6b6d6867c427012a7868f36c320db2fa56bb2
--- /dev/null
+++ b/test/TableGen/dag-isel-res-order.td
@@ -0,0 +1,22 @@
+// RUN: llvm-tblgen -gen-dag-isel -I %p/../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def TestTargetInstrInfo : InstrInfo;
+
+def TestTarget : Target {
+  let InstructionSet = TestTargetInstrInfo;
+}
+
+def REG : Register<"REG">;
+def GPR : RegisterClass<"TestTarget", [i32], 32, (add REG)>;
+
+// CHECK-LABEL: OPC_CheckOpcode, TARGET_VAL(ISD::UDIVREM)
+// CHECK: OPC_EmitNode2, TARGET_VAL(::INSTR)
+// CHECK: Results = #2 #3
+// CHECK: OPC_CompleteMatch, 2, 3, 2
+def INSTR : Instruction {
+  let OutOperandList = (outs GPR:$r1, GPR:$r0);
+  let InOperandList = (ins GPR:$t0, GPR:$t1);
+  let Pattern = [(set i32:$r0, i32:$r1, (udivrem i32:$t0, i32:$t1))];
+}
diff --git a/test/TableGen/prep-diag1.td b/test/TableGen/prep-diag1.td
new file mode 100644
index 0000000000000000000000000000000000000000..41b7d477c6942e3de820d1e2a3011deb94ffb70a
--- /dev/null
+++ b/test/TableGen/prep-diag1.td
@@ -0,0 +1,26 @@
+// RUN: not llvm-tblgen -DDIAG1 -I %p %s 2>&1 | FileCheck --check-prefixes=DIAG1 %s
+// RUN: not llvm-tblgen -DDIAG4 -I %p %s 2>&1 | FileCheck --check-prefixes=DIAG4 %s
+// RUN: not llvm-tblgen -DDIAG2 -I %p %s 2>&1 | FileCheck --check-prefixes=DIAG2 %s
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck --check-prefixes=DIAG3 %s
+
+#ifdef DIAG1
+// DIAG1: error: Only comments are supported after #define NAME
+#define ENABLED1/*
+*/class C;
+#endif // DIAG1
+
+#ifdef DIAG4
+// DIAG4: warning: Duplicate definition of macro: ENABLED1
+#define ENABLED1
+#define ENABLED1
+#endif // DIAG4
+
+#ifdef DIAG2
+// DIAG2: error: Only comments are supported after #ifdef NAME
+
+// Invalid #ifdef below should be detected even if DIAG2 is not defined.
+// DIAG3: error: Only comments are supported after #ifdef NAME
+#ifdef DIAG2/*
+*/class C;
+#endif
+#endif // DIAG2
diff --git a/test/TableGen/prep-diag10.td b/test/TableGen/prep-diag10.td
new file mode 100644
index 0000000000000000000000000000000000000000..eb387a07b066ca491911fe26e05abd1fc6095885
--- /dev/null
+++ b/test/TableGen/prep-diag10.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Reached EOF without matching #endif
+// CHECK: error: The latest preprocessor control is here
+#ifdef DISABLED
+#else
diff --git a/test/TableGen/prep-diag11-include.inc b/test/TableGen/prep-diag11-include.inc
new file mode 100644
index 0000000000000000000000000000000000000000..9f31d7be41687cf8733d9ac1cf88cbe3f76733dd
--- /dev/null
+++ b/test/TableGen/prep-diag11-include.inc
@@ -0,0 +1 @@
+#ifdef ENABLED
diff --git a/test/TableGen/prep-diag11.td b/test/TableGen/prep-diag11.td
new file mode 100644
index 0000000000000000000000000000000000000000..0042bc04f9e10143bdb90fb3ee25d852b2500cfd
--- /dev/null
+++ b/test/TableGen/prep-diag11.td
@@ -0,0 +1,8 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Reached EOF without matching #endif
+// CHECK: error: The latest preprocessor control is here
+#ifdef DISABLED
+#else
+#define ENABLED
+include "prep-diag11-include.inc"
diff --git a/test/TableGen/prep-diag12-include.inc b/test/TableGen/prep-diag12-include.inc
new file mode 100644
index 0000000000000000000000000000000000000000..995bca60e36e1df6fe11040df0d7be980d407006
--- /dev/null
+++ b/test/TableGen/prep-diag12-include.inc
@@ -0,0 +1,2 @@
+#ifdef ENABLED
+#else
diff --git a/test/TableGen/prep-diag12.td b/test/TableGen/prep-diag12.td
new file mode 100644
index 0000000000000000000000000000000000000000..c26301ee17ac2b18338d40f4675672206eb01482
--- /dev/null
+++ b/test/TableGen/prep-diag12.td
@@ -0,0 +1,8 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Reached EOF without matching #endif
+// CHECK: error: The latest preprocessor control is here
+#ifdef DISABLED
+#else
+#define ENABLED
+include "prep-diag12-include.inc"
diff --git a/test/TableGen/prep-diag13.td b/test/TableGen/prep-diag13.td
new file mode 100644
index 0000000000000000000000000000000000000000..aa3fdab4802d379b155fe0556cda2524b4634a6c
--- /dev/null
+++ b/test/TableGen/prep-diag13.td
@@ -0,0 +1,9 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Reached EOF without matching #endif
+// CHECK: error: The latest preprocessor control is here
+#ifdef DISABLED
+/*
+#else
+#endif
+*/
diff --git a/test/TableGen/prep-diag14.td b/test/TableGen/prep-diag14.td
new file mode 100644
index 0000000000000000000000000000000000000000..cae9bc3b7f5b6c8b2f4d7f8986464acdfde4f298
--- /dev/null
+++ b/test/TableGen/prep-diag14.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Reached EOF without matching #endif
+// CHECK: error: The latest preprocessor control is here
+#ifdef DISABLED
+// #endif
diff --git a/test/TableGen/prep-diag2.td b/test/TableGen/prep-diag2.td
new file mode 100644
index 0000000000000000000000000000000000000000..741026b9c8a2d6415ac0cf6a28fa6c8db25824c1
--- /dev/null
+++ b/test/TableGen/prep-diag2.td
@@ -0,0 +1,14 @@
+// RUN: not llvm-tblgen -DDIAG1 -I %p %s 2>&1 | FileCheck --check-prefixes=DIAG1 %s
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck --check-prefixes=DIAG2 %s
+
+#ifdef DIAG1
+// DIAG1: error: Only comments are supported after #else
+
+// Invalid #else below should be detected even if DIAG1 is not defined.
+// DIAG2: error: Only comments are supported after #else
+#ifdef DIAG2//DIAG2
+#else/*
+*/class C;
+#endif
+#endif // DIAG1
+
diff --git a/test/TableGen/prep-diag3.td b/test/TableGen/prep-diag3.td
new file mode 100644
index 0000000000000000000000000000000000000000..fbedfa290b9947dfcf3ba836159abb5a9c6ea202
--- /dev/null
+++ b/test/TableGen/prep-diag3.td
@@ -0,0 +1,14 @@
+// RUN: not llvm-tblgen -DDIAG1 -I %p %s 2>&1 | FileCheck --check-prefixes=DIAG1 %s
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck --check-prefixes=DIAG2 %s
+
+#ifdef DIAG1
+// DIAG1: error: Only comments are supported after #endif
+
+// Invalid #else below should be detected even if DIAG1 is not defined.
+// DIAG2: error: Only comments are supported after #endif
+#ifdef DIAG2//DIAG2
+#else/*!DIAG2*/
+#endif/* !DIAG2
+*/class C;
+#endif // DIAG1
+
diff --git a/test/TableGen/prep-diag4.td b/test/TableGen/prep-diag4.td
new file mode 100644
index 0000000000000000000000000000000000000000..4661ef8667d23fc7d6c213b637969cc6ea0c729d
--- /dev/null
+++ b/test/TableGen/prep-diag4.td
@@ -0,0 +1,8 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: double #else
+// CHECK: error: Previous #else is here
+#ifdef DIAG1
+#else
+#else
+#endif
diff --git a/test/TableGen/prep-diag5.td b/test/TableGen/prep-diag5.td
new file mode 100644
index 0000000000000000000000000000000000000000..7dc178bef22d1b1a44c83499d2070658d8972b47
--- /dev/null
+++ b/test/TableGen/prep-diag5.td
@@ -0,0 +1,6 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: #else without #ifdef
+#else
+#else
+#endif
diff --git a/test/TableGen/prep-diag6.td b/test/TableGen/prep-diag6.td
new file mode 100644
index 0000000000000000000000000000000000000000..f4202d115da59a463c41dc92ed64b1857d058836
--- /dev/null
+++ b/test/TableGen/prep-diag6.td
@@ -0,0 +1,7 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Expected macro name after #ifdef
+#ifdef
+#else
+#else
+#endif
diff --git a/test/TableGen/prep-diag7.td b/test/TableGen/prep-diag7.td
new file mode 100644
index 0000000000000000000000000000000000000000..1efe5260d1371fc118d362197a4e69968576618e
--- /dev/null
+++ b/test/TableGen/prep-diag7.td
@@ -0,0 +1,4 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: #endif without #ifdef
+#endif
diff --git a/test/TableGen/prep-diag8.td b/test/TableGen/prep-diag8.td
new file mode 100644
index 0000000000000000000000000000000000000000..7a7bde62c79c4e127e560a5dc4a7a53a5467186b
--- /dev/null
+++ b/test/TableGen/prep-diag8.td
@@ -0,0 +1,5 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Expected macro name after #define
+#define
+#endif
diff --git a/test/TableGen/prep-diag9.td b/test/TableGen/prep-diag9.td
new file mode 100644
index 0000000000000000000000000000000000000000..4ecff575cdc7bbdd873ffb858c3efa5f32054cb4
--- /dev/null
+++ b/test/TableGen/prep-diag9.td
@@ -0,0 +1,5 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Reached EOF without matching #endif
+// CHECK: error: The latest preprocessor control is here
+#ifdef DISABLED
diff --git a/test/TableGen/prep-region-include.inc b/test/TableGen/prep-region-include.inc
new file mode 100644
index 0000000000000000000000000000000000000000..7b4e7850b365ad2002aba4d22551e15713705d2f
--- /dev/null
+++ b/test/TableGen/prep-region-include.inc
@@ -0,0 +1,8 @@
+#ifdef ENABLED4
+def ifdef_enabled4 : C;
+#else
+def ifdef_enabled4_else : C;
+#endif
+
+// EOF immediately after ENABLED5
+#define ENABLED5
diff --git a/test/TableGen/prep-region-processing.td b/test/TableGen/prep-region-processing.td
new file mode 100644
index 0000000000000000000000000000000000000000..ce6e19a5ec482a4af038b648e84b2406bb524b59
--- /dev/null
+++ b/test/TableGen/prep-region-processing.td
@@ -0,0 +1,150 @@
+// RUN: llvm-tblgen -I %p %s 2>&1 | FileCheck %s --implicit-check-not warning:
+
+class C;
+
+// TableGen prints records in alpabetical order.
+// CHECK-NOT: def ifdef_disabled1
+// CHECK-NOT: def ifdef_disabled2
+// CHECK:     def ifdef_disabled3
+// CHECK-NOT: def ifdef_disabled4
+// CHECK-NOT: def ifdef_disabled5
+// CHECK:     def ifdef_disabled4_else
+// CHECK-NOT: def ifdef_disabled5_else
+// CHECK:     def ifdef_enabled1
+// CHECK-NOT: def ifdef_enabled2
+// CHECK:     def ifdef_enabled3
+// CHECK:     def ifdef_enabled4
+// CHECK-NOT: def ifdef_enabled4_else
+// CHECK:     def ifdef_enabled5
+// CHECK:     def ifdef_enabled6
+// CHECK-NOT: def ifdef_enabled6_else
+// CHECK-NOT: def ifdef_disabled6
+// CHECK-NOT: def ifdef_disabled6_else
+
+#define ENABLED1
+#define ENABLED2
+
+#ifdef DISABLED1
+//
+def ifdef_disabled1 : C;
+
+#define DISABLED2/*This one is disabled,
+                   because DISABLED1 is.
+*/
+#endif
+
+#ifdef ENABLED1
+def ifdef_enabled1 : C;
+#endif
+
+#ifdef DISABLED2/*
+*/
+def ifdef_disabled2 : C;
+#endif
+
+/*
+#ifdef ENABLED2
+def ifdef_enabled2 : C;
+#endif
+*/
+
+//#ifdef DISABLED3
+def ifdef_disabled3 : C;
+
+//#endif
+
+/* _DISABLED4 */  /* padding */ #ifdef _DISABLED4
+def ifdef_disabled4 : C;
+#else//      /*!_DISABLED4
+def ifdef_disabled4_else : C;
+
+#define ENABLED3
+#endif
+
+#ifdef __DISABLED5
+def ifdef_disabled5 : C;
+/*
+
+*/#else
+#ifdef ENABLED3
+def ifdef_enabled3 : C;
+#else /* //!ENABLED3
+*/
+def ifdef_disabled5_else : C;
+#endif
+#endif
+
+#define ENABLED4
+include "prep-region-include.inc"//ENABLED5 is defined inside
+
+#ifdef ENABLED5
+def ifdef_enabled5 : C;
+#endif // ENABLED5
+
+#ifdef DISABLED6__
+// Double inclusion is an error.
+include "prep-region-include.inc"
+#else
+#endif
+
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+#ifdef DIS
+def ifdef_disabled6 : C;
+#endif
+#endif
+#endif
+#endif
+#else
+def ifdef_disabled6_else : C;
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#else
+#define ENAB//ENAB
+#endif
+
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+#ifdef ENAB
+def ifdef_enabled6 : C;
+#endif
+#endif
+#endif
+#endif
+#else
+def ifdef_enabled6_else : C;
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef DISABLED_7
+include "non-existent-file.inc"
+#endif
+
+#ifdef DISABLED_8
+\\\\\ invalid TD text /////
+#endif // DISABLED_8
diff --git a/test/TableGen/unterminated-c-comment-include.inc b/test/TableGen/unterminated-c-comment-include.inc
new file mode 100644
index 0000000000000000000000000000000000000000..d05d4350c029cbb033adb70756d5b901eb8eddaf
--- /dev/null
+++ b/test/TableGen/unterminated-c-comment-include.inc
@@ -0,0 +1,2 @@
+/* comment starts here and finished in the parent file.
+   TableGen used to allow such usage.
diff --git a/test/TableGen/unterminated-c-comment.td b/test/TableGen/unterminated-c-comment.td
new file mode 100644
index 0000000000000000000000000000000000000000..0f4cd9d633c66d995070375ffa47fa314e26d084
--- /dev/null
+++ b/test/TableGen/unterminated-c-comment.td
@@ -0,0 +1,5 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Unterminated comment!
+
+include "unterminated-c-comment-include.inc" */
diff --git a/test/TableGen/unterminated-code-block-include.inc b/test/TableGen/unterminated-code-block-include.inc
new file mode 100644
index 0000000000000000000000000000000000000000..a1363f810fd30fc8a53fd4636b4f2c3d161038b5
--- /dev/null
+++ b/test/TableGen/unterminated-code-block-include.inc
@@ -0,0 +1,8 @@
+class test<code C> {
+  code Code = C;
+}
+
+// TableGen used to allow code blocks starting in one file
+// and finishing in the parent one.  This test checks
+// that this is reported as an error from now on.
+def foo : test<[{ hello world!
diff --git a/test/TableGen/unterminated-code-block.td b/test/TableGen/unterminated-code-block.td
new file mode 100644
index 0000000000000000000000000000000000000000..86395abd789e8cd7d51ea1db03ace8baec7a7556
--- /dev/null
+++ b/test/TableGen/unterminated-code-block.td
@@ -0,0 +1,5 @@
+// RUN: not llvm-tblgen -I %p %s 2>&1 | FileCheck %s
+
+// CHECK: error: Unterminated Code Block
+
+include "unterminated-code-block-include.inc" }]>;
diff --git a/test/ThinLTO/X86/Inputs/function_entry_count.ll b/test/ThinLTO/X86/Inputs/function_entry_count.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ea2c2eda8d5a0f173f1de1d6918a723551dea283
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/function_entry_count.ll
@@ -0,0 +1,9 @@
+target triple = "x86_64-unknown-linux-gnu"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @h();
+
+define void @g() {
+  call void @h();
+  ret void
+}
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-alias.ll b/test/ThinLTO/X86/Inputs/index-const-prop-alias.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d5aa6ee0b616cd51a6973085894334fbfcaedc66
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-alias.ll
@@ -0,0 +1,5 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = global i32 42, align 4
+@g.alias = weak alias i32, i32* @g
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-cache-foo.ll b/test/ThinLTO/X86/Inputs/index-const-prop-cache-foo.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5f85f1e38c9ef16aede3490859bbbaa32a9f073c
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-cache-foo.ll
@@ -0,0 +1,19 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@gFoo = internal unnamed_addr global i32 1, align 4
+
+; Function Attrs: norecurse nounwind readonly ssp uwtable
+define i32 @foo() local_unnamed_addr {
+  %1 = load i32, i32* @gFoo, align 4
+  ret i32 %1
+}
+
+; Function Attrs: nounwind ssp uwtable
+define void @bar() local_unnamed_addr {
+  %1 = tail call i32 @rand()
+  store i32 %1, i32* @gFoo, align 4
+  ret void
+}
+
+declare i32 @rand() local_unnamed_addr
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-cache-test1.ll b/test/ThinLTO/X86/Inputs/index-const-prop-cache-test1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6bcf3b32d0d661c84adb3f454634f7cff70f5e45
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-cache-test1.ll
@@ -0,0 +1,10 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test() local_unnamed_addr {
+  %1 = tail call i32 (...) @foo()
+  ret i32 %1
+}
+
+declare i32 @foo(...) local_unnamed_addr
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-cache-test2.ll b/test/ThinLTO/X86/Inputs/index-const-prop-cache-test2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..75a8f071f485f6408fbfeb2f037b03b55da10cc3
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-cache-test2.ll
@@ -0,0 +1,14 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test() local_unnamed_addr {
+  %1 = tail call i32 (...) @foo()
+  %2 = tail call i32 (...) @bar()
+  %3 = add nsw i32 %2, %1
+  ret i32 %3
+}
+
+declare i32 @foo(...) local_unnamed_addr
+
+declare i32 @bar(...) local_unnamed_addr
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-comdat.ll b/test/ThinLTO/X86/Inputs/index-const-prop-comdat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..819ba8caa324e80e73506ea2b3472807493574d4
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-comdat.ll
@@ -0,0 +1,5 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$comdat.any = comdat any
+@g = global i32 42, comdat($comdat.any)
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-define-g.ll b/test/ThinLTO/X86/Inputs/index-const-prop-define-g.ll
new file mode 100644
index 0000000000000000000000000000000000000000..555f76f5930124d5eb788b53cef54135e2c46539
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-define-g.ll
@@ -0,0 +1,4 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = global i32 42, align 4
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-full-lto.ll b/test/ThinLTO/X86/Inputs/index-const-prop-full-lto.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e22f9198951ee8498868fbc4bc0a66b35ff600de
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-full-lto.ll
@@ -0,0 +1,12 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external global i32
+
+define i32 @foo() {
+  %v = load i32, i32* @g
+  ret i32 %v
+}
+
+!0 = !{i32 1, !"ThinLTO", i32 0}
+!llvm.module.flags = !{ !0 }
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-gvref.ll b/test/ThinLTO/X86/Inputs/index-const-prop-gvref.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ec0de9c91418e959937caf1c4c47460fb6c5512e
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-gvref.ll
@@ -0,0 +1,5 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = global i32* @a, align 8
+@a = global i32 42, align 4
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop-linkage.ll b/test/ThinLTO/X86/Inputs/index-const-prop-linkage.ll
new file mode 100644
index 0000000000000000000000000000000000000000..24c724743e08e1ec77ef7919257566d51668a2c1
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop-linkage.ll
@@ -0,0 +1,15 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g1 = common global i32 0, align 4
+@g2 = global i32 42, align 4
+@g3 = available_externally global i32 42, align 4
+
+define i32 @foo() {
+  %v1 = load i32, i32* @g1
+  %v2 = load i32, i32* @g2
+  %v3 = load i32, i32* @g3
+  %s1 = add i32 %v1, %v2
+  %s2 = add i32 %s1, %v3
+  ret i32 %s2
+}
diff --git a/test/ThinLTO/X86/Inputs/index-const-prop.ll b/test/ThinLTO/X86/Inputs/index-const-prop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eda62fb4e5bebef17f67ed138006397819622522
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/index-const-prop.ll
@@ -0,0 +1,64 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+@gBar = local_unnamed_addr global i32 2, align 4, !dbg !0
+@gFoo = internal unnamed_addr global i32 1, align 4, !dbg !6
+
+; Function Attrs: norecurse nounwind readonly
+define i32 @foo() local_unnamed_addr #0 !dbg !14 {
+  %1 = load i32, i32* @gFoo, align 4, !dbg !17
+  ret i32 %1, !dbg !18
+}
+
+; Function Attrs: norecurse nounwind readonly
+define i32 @bar() local_unnamed_addr #0 !dbg !19 {
+  %1 = load i32, i32* @gBar, align 4, !dbg !20
+  ret i32 %1, !dbg !21
+}
+
+define void @baz() local_unnamed_addr !dbg !22 {
+  %1 = tail call i32 @rand(), !dbg !25
+  store i32 %1, i32* @gFoo, align 4, !dbg !26
+  %2 = tail call i32 @rand(), !dbg !27
+  store i32 %2, i32* @gBar, align 4, !dbg !28
+  ret void, !dbg !29
+}
+
+declare i32 @rand() local_unnamed_addr
+
+attributes #0 = { norecurse nounwind readonly }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!9, !10, !11, !12}
+!llvm.ident = !{!13}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "gBar", scope: !2, file: !3, line: 4, type: !8, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 7.0.0 (trunk 332246)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "foo.c", directory: "/data/work/lto/roref/test")
+!4 = !{}
+!5 = !{!0, !6}
+!6 = !DIGlobalVariableExpression(var: !7, expr: !DIExpression())
+!7 = distinct !DIGlobalVariable(name: "gFoo", scope: !2, file: !3, line: 3, type: !8, isLocal: true, isDefinition: true)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{i32 1, !"wchar_size", i32 4}
+!12 = !{i32 7, !"PIC Level", i32 2}
+!13 = !{!"clang version 7.0.0 (trunk 332246)"}
+!14 = distinct !DISubprogram(name: "foo", scope: !3, file: !3, line: 6, type: !15, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !2, retainedNodes: !4)
+!15 = !DISubroutineType(types: !16)
+!16 = !{!8}
+!17 = !DILocation(line: 7, column: 10, scope: !14)
+!18 = !DILocation(line: 7, column: 3, scope: !14)
+!19 = distinct !DISubprogram(name: "bar", scope: !3, file: !3, line: 10, type: !15, isLocal: false, isDefinition: true, scopeLine: 10, isOptimized: true, unit: !2, retainedNodes: !4)
+!20 = !DILocation(line: 11, column: 10, scope: !19)
+!21 = !DILocation(line: 11, column: 3, scope: !19)
+!22 = distinct !DISubprogram(name: "baz", scope: !3, file: !3, line: 14, type: !23, isLocal: false, isDefinition: true, scopeLine: 14, isOptimized: true, unit: !2, retainedNodes: !4)
+!23 = !DISubroutineType(types: !24)
+!24 = !{null}
+!25 = !DILocation(line: 15, column: 10, scope: !22)
+!26 = !DILocation(line: 15, column: 8, scope: !22)
+!27 = !DILocation(line: 16, column: 10, scope: !22)
+!28 = !DILocation(line: 16, column: 8, scope: !22)
+!29 = !DILocation(line: 17, column: 1, scope: !22)
diff --git a/test/ThinLTO/X86/Inputs/local_name_conflict1.ll b/test/ThinLTO/X86/Inputs/local_name_conflict1.ll
index 2ef7bdd3eb7bec3f075f6e49f596683d3f50fe9b..d7b7ea61883021f508e6f1d2bf471a63e56d0d5c 100644
--- a/test/ThinLTO/X86/Inputs/local_name_conflict1.ll
+++ b/test/ThinLTO/X86/Inputs/local_name_conflict1.ll
@@ -3,6 +3,8 @@ source_filename = "local_name_conflict.c"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+@baz = internal constant i32 10, align 4
+
 ; Function Attrs: noinline nounwind uwtable
 define i32 @a() {
 entry:
@@ -13,5 +15,6 @@ entry:
 ; Function Attrs: noinline nounwind uwtable
 define internal i32 @foo() {
 entry:
-  ret i32 1
+  %0 = load i32, i32* @baz, align 4
+  ret i32 %0
 }
diff --git a/test/ThinLTO/X86/Inputs/local_name_conflict2.ll b/test/ThinLTO/X86/Inputs/local_name_conflict2.ll
index a8c20a29228a606d3f272f8e311be738d6ea415e..40a4e866599e8fce11c5b0da8ec6b0ba8be63b10 100644
--- a/test/ThinLTO/X86/Inputs/local_name_conflict2.ll
+++ b/test/ThinLTO/X86/Inputs/local_name_conflict2.ll
@@ -3,6 +3,8 @@ source_filename = "local_name_conflict.c"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+@baz = internal constant i32 10, align 4
+
 ; Function Attrs: noinline nounwind uwtable
 define i32 @b() {
 entry:
@@ -13,5 +15,6 @@ entry:
 ; Function Attrs: noinline nounwind uwtable
 define internal i32 @foo() {
 entry:
-  ret i32 2
+  %0 = load i32, i32* @baz, align 4
+  ret i32 %0
 }
diff --git a/test/ThinLTO/X86/Inputs/local_name_conflict_var1.ll b/test/ThinLTO/X86/Inputs/local_name_conflict_var1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fb5306fc337ff6b9a48c188792a9dd32d3ecf71c
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/local_name_conflict_var1.ll
@@ -0,0 +1,13 @@
+; ModuleID = 'local_name_conflict_var.o'
+source_filename = "local_name_conflict_var.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@baz = internal global i32 10, align 4
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @a() {
+entry:
+  %0 = load i32, i32* @baz, align 4
+  ret i32 %0
+}
diff --git a/test/ThinLTO/X86/Inputs/local_name_conflict_var2.ll b/test/ThinLTO/X86/Inputs/local_name_conflict_var2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bf3c262f180bff494965e413cb8c671e30f71359
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/local_name_conflict_var2.ll
@@ -0,0 +1,13 @@
+; ModuleID = 'local_name_conflict_var.o'
+source_filename = "local_name_conflict_var.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@baz = internal global i32 10, align 4
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @b() {
+entry:
+  %0 = load i32, i32* @baz, align 4
+  ret i32 %0
+}
diff --git a/test/ThinLTO/X86/Inputs/pr35472.ll b/test/ThinLTO/X86/Inputs/pr35472.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b9c92b3e3fbfa8b5a7dfd6b6467d56a0b118a7a8
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/pr35472.ll
@@ -0,0 +1,13 @@
+; ModuleID = 'b.cpp'
+source_filename = "b.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline optnone uwtable
+define void @_Z5Alphav() {
+entry:
+  call void @_Z5Bravov()
+  ret void
+}
+
+declare void @_Z5Bravov()
diff --git a/test/ThinLTO/X86/Inputs/strong_non_prevailing.ll b/test/ThinLTO/X86/Inputs/strong_non_prevailing.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5473f817f13750b28bf892448a706b810a93c10d
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/strong_non_prevailing.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_filename = comdat any
+
+@__llvm_profile_filename = constant [19 x i8] c"default_%m.profraw\00", comdat
diff --git a/test/ThinLTO/X86/cache-typeid-resolutions.ll b/test/ThinLTO/X86/cache-typeid-resolutions.ll
index 1609e90b0ae65b61c77a1fdd3fbf3bd7906b4699..e5092dd63ec06f54fe3ffdceabd79662c3ab65f0 100644
--- a/test/ThinLTO/X86/cache-typeid-resolutions.ll
+++ b/test/ThinLTO/X86/cache-typeid-resolutions.ll
@@ -9,23 +9,25 @@
 ; where both t and t-import are sensitive to typeid1's resolution
 ; so 4 distinct objects in total.
 ; RUN: rm -rf %t.cache
-; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
-; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t1.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx -r=%t1.bc,vt1,plx
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f1_actual,plx -r=%t.bc,f2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t1.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f1_actual,plx -r=%t.bc,f2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx -r=%t1.bc,vt1,plx
 ; RUN: ls %t.cache | count 4
 
 ; Three resolutions for typeid2: Indir, SingleImpl, UniqueRetVal
 ; where both t and t-import are sensitive to typeid2's resolution
 ; so 6 distinct objects in total.
 ; RUN: rm -rf %t.cache
-; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
-; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t2.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t2.bc,vt2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
-; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t3.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t3.bc,vt2a,plx -r=%t3.bc,vt2b,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t.bc,f1_actual,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t2.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t.bc,f1_actual,plx -r=%t2.bc,vt2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t3.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t.bc,f1_actual,plx -r=%t3.bc,vt2a,plx -r=%t3.bc,vt2b,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
 ; RUN: ls %t.cache | count 6
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define i1 @f1(i8* %p) {
+@f1 = alias i1(i8*), i1 (i8*)* @f1_actual
+
+define i1 @f1_actual(i8* %p) {
   %x = call i1 @llvm.type.test(i8* %p, metadata !"typeid1")
   ret i1 %x
 }
diff --git a/test/ThinLTO/X86/cache.ll b/test/ThinLTO/X86/cache.ll
index c7eebf6703e7a3d3d4cbd85a12478c9cd45808cd..75e6b7fd6be76c3f31086cd0abb6266b275f5c03 100644
--- a/test/ThinLTO/X86/cache.ll
+++ b/test/ThinLTO/X86/cache.ll
@@ -1,5 +1,7 @@
-; Verify first that *without* hash, we don't use the cache.
+; NetBSD: noatime mounts currently inhibit 'touch -a' updates
+; UNSUPPORTED: system-netbsd
 
+; Verify first that *without* hash, we don't use the cache.
 ; RUN: opt -module-summary %s -o %t.bc
 ; RUN: opt -module-summary %p/Inputs/cache.ll -o %t2.bc
 
diff --git a/test/ThinLTO/X86/dot-dumper.ll b/test/ThinLTO/X86/dot-dumper.ll
index 72175a1ea4d09151cedae850378dba72f76d85e3..7578212ec44489a249d22c22797ef6456876ba94 100644
--- a/test/ThinLTO/X86/dot-dumper.ll
+++ b/test/ThinLTO/X86/dot-dumper.ll
@@ -20,7 +20,7 @@
 ; STRUCTURE-DAG:      subgraph cluster_1
 ; STRUCTURE:          // Cross-module edges:
 ; STRUCTURE-DAG:      M0_{{[0-9]+}} -> M1_{{[0-9]+}} // call
-; STRUCTURE-DAG:      M0_{{[0-9]+}} -> M1_{{[0-9]+}} [{{.*}}]; // ref
+; STRUCTURE-DAG:      M0_{{[0-9]+}} -> M1_{{[0-9]+}} [{{.*}}]; // const-ref
 ; STRUCTURE-NEXT:   }
 
 ; CLUSTER0:         // Module: {{.*}}1.bc
@@ -33,13 +33,13 @@
 
 ; CLUSTER1:         // Module: {{.*}}2.bc
 ; CLUSTER1-NEXT:    subgraph cluster_1 {
-; CLUSTER1-DAG:       M1_[[A:[0-9]+]] [{{.*}}A|extern{{.*}}]; // variable
+; CLUSTER1-DAG:       M1_[[A:[0-9]+]] [{{.*}}A|extern{{.*}}]; // variable, immutable
 ; CLUSTER1-DAG:       M1_[[FOO:[0-9]+]] [{{.*}}foo|extern{{.*}} ffl: 00001{{.*}}]; // function
-; CLUSTER1-DAG:       M1_[[B:[0-9]+]] [{{.*}}B|extern{{.*}}]; // variable
+; CLUSTER1-DAG:       M1_[[B:[0-9]+]] [{{.*}}B|extern{{.*}}]; // variable, immutable
 ; CLUSTER1-DAG:       M1_[[BAR:[0-9]+]] [{{.*}}bar|extern{{.*}}]; // function, dead
 ; CLUSTER1-NEXT:      // Edges:
-; CLUSTER1-DAG:       M1_[[FOO]] -> M1_[[B]] [{{.*}}]; // ref
-; CLUSTER1-DAG:       M1_[[FOO]] -> M1_[[A]] [{{.*}}]; // ref
+; CLUSTER1-DAG:       M1_[[FOO]] -> M1_[[B]] [{{.*}}]; // const-ref
+; CLUSTER1-DAG:       M1_[[FOO]] -> M1_[[A]] [{{.*}}]; // const-ref
 ; CLUSTER1-DAG:     }
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/ThinLTO/X86/funcimport.ll b/test/ThinLTO/X86/funcimport.ll
index 1e8784d7cac86a1cb15a2b3016b8f66b192211a6..fa1bdbf9f1199e22650c12b4905c7b79a4525111 100644
--- a/test/ThinLTO/X86/funcimport.ll
+++ b/test/ThinLTO/X86/funcimport.ll
@@ -40,7 +40,7 @@
 ; CODEGEN: T _main
 
 ; Verify that all run together
-; RUN: llvm-lto -thinlto-action=run %t2.bc  %t.bc
+; RUN: llvm-lto -thinlto-action=run %t2.bc  %t.bc  -exported-symbol=_main
 ; RUN: llvm-nm -o - < %t.bc.thinlto.o | FileCheck %s --check-prefix=ALL
 ; RUN: llvm-nm -o - < %t2.bc.thinlto.o | FileCheck %s --check-prefix=ALL2
 ; ALL: T _callfuncptr
diff --git a/test/ThinLTO/X86/function_entry_count.ll b/test/ThinLTO/X86/function_entry_count.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4ca2ec26a592218cbce3b61d366356b702f9e3ad
--- /dev/null
+++ b/test/ThinLTO/X86/function_entry_count.ll
@@ -0,0 +1,44 @@
+; RUN: opt -thinlto-bc %s -write-relbf-to-summary -thin-link-bitcode-file=%t1.thinlink.bc -o %t1.bc
+; RUN: opt -thinlto-bc %p/Inputs/function_entry_count.ll -write-relbf-to-summary -thin-link-bitcode-file=%t2.thinlink.bc -o %t2.bc
+
+; First perform the thin link on the normal bitcode file.
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps -thinlto-synthesize-entry-counts \
+; RUN:     -r=%t1.bc,g, \
+; RUN:     -r=%t1.bc,f,px \
+; RUN:     -r=%t1.bc,h,px \
+; RUN:     -r=%t2.bc,h, \
+; RUN:     -r=%t2.bc,g,px
+; RUN: llvm-dis -o - %t.o.1.3.import.bc | FileCheck %s
+
+; RUN: llvm-lto -thinlto-action=run -thinlto-synthesize-entry-counts -exported-symbol=f \
+; RUN:     -exported-symbol=g -exported-symbol=h -thinlto-save-temps=%t3. %t1.bc %t2.bc
+; RUN: llvm-dis %t3.0.3.imported.bc -o - | FileCheck %s
+
+; CHECK: define void @h() !prof ![[PROF2:[0-9]+]]
+; CHECK: define void @f(i32{{.*}}) !prof ![[PROF1:[0-9]+]]
+; CHECK: define available_externally void @g() !prof ![[PROF2]]
+; CHECK-DAG: ![[PROF1]] = !{!"synthetic_function_entry_count", i64 10}
+; CHECK-DAG: ![[PROF2]] = !{!"synthetic_function_entry_count", i64 198}
+
+target triple = "x86_64-unknown-linux-gnu"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @g();
+
+define void @h() {
+  ret void
+}
+
+define void @f(i32 %n) {
+entry:
+  %cmp = icmp slt i32 %n, 1
+  br i1 %cmp, label %exit, label %loop
+loop:
+  %n1 = phi i32 [%n, %entry], [%n2, %loop]
+  call void  @g()
+  %n2 = sub i32 %n1, 1
+  %cmp2 = icmp slt i32 %n, 1
+  br i1 %cmp2, label %exit, label %loop
+exit:
+  ret void
+}
diff --git a/test/ThinLTO/X86/globals-import-const-fold.ll b/test/ThinLTO/X86/globals-import-const-fold.ll
index 49e31b79a47b058de5a21663c2d5cee14d8d6442..a250ed2e92baed01c879c9bcc4b66663de7e908e 100644
--- a/test/ThinLTO/X86/globals-import-const-fold.ll
+++ b/test/ThinLTO/X86/globals-import-const-fold.ll
@@ -2,12 +2,12 @@
 ; RUN: opt -module-summary %p/Inputs/globals-import-cf-baz.ll -o %t2.bc
 ; RUN: llvm-lto -thinlto-action=thinlink %t1.bc %t2.bc -o %t3.index.bc
 
-; RUN: llvm-lto -thinlto-action=import %t1.bc %t2.bc -thinlto-index=%t3.index.bc
+; RUN: llvm-lto -thinlto-action=import -exported-symbol=main %t1.bc -thinlto-index=%t3.index.bc
 ; RUN: llvm-dis %t1.bc.thinlto.imported.bc -o - | FileCheck --check-prefix=IMPORT %s
 ; RUN: llvm-lto -thinlto-action=optimize %t1.bc.thinlto.imported.bc -o %t1.bc.thinlto.opt.bc
 ; RUN: llvm-dis %t1.bc.thinlto.opt.bc -o - | FileCheck --check-prefix=OPTIMIZE %s
 
-; IMPORT: @baz = available_externally local_unnamed_addr constant i32 10
+; IMPORT: @baz = internal local_unnamed_addr constant i32 10
 
 ; OPTIMIZE:       define i32 @main()
 ; OPTIMIZE-NEXT:    ret i32 10
diff --git a/test/ThinLTO/X86/index-const-prop-O0.ll b/test/ThinLTO/X86/index-const-prop-O0.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6426d726843fc55bc277422a478499dc32b43b70
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-O0.ll
@@ -0,0 +1,18 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-define-g.ll -o %t2.bc
+; RUN: llvm-lto2 run -O0 -save-temps %t2.bc -r=%t2.bc,g,pl %t1.bc -r=%t1.bc,main,plx -r=%t1.bc,g, -o %t3
+; RUN: llvm-dis %t3.1.3.import.bc -o - | FileCheck %s
+
+; With -O0 import is disabled so we must not internalize
+; read-only globals
+; CHECK: @g = dso_local global i32 42
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external global i32
+
+define i32 @main() {
+  %v = load i32, i32* @g
+  ret i32 %v
+}
diff --git a/test/ThinLTO/X86/index-const-prop-alias.ll b/test/ThinLTO/X86/index-const-prop-alias.ll
new file mode 100644
index 0000000000000000000000000000000000000000..592fc9e081a7c21c7d0653df7103ab0ce8fc3004
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-alias.ll
@@ -0,0 +1,42 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-alias.ll -o %t2.bc
+; RUN: llvm-lto2 run %t1.bc -r=%t1.bc,main,plx -r=%t1.bc,ret_ptr,pl -r=%t1.bc,g.alias,l -r=%t1.bc,g,l \
+; RUN:               %t2.bc -r=%t2.bc,g,pl -r=%t2.bc,g.alias,pl -save-temps -o %t3
+; RUN: llvm-dis %t3.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
+; RUN: llvm-dis %t3.1.5.precodegen.bc -o - | FileCheck %s --check-prefix=CODEGEN
+
+; When ret_ptr is preserved we return pointer to alias, so we can't internalize aliasee
+; RUN: llvm-lto2 run %t1.bc -r=%t1.bc,main,plx -r=%t1.bc,ret_ptr,plx -r=%t1.bc,g.alias,l -r=%t1.bc,g,l \
+; RUN:               %t2.bc -r=%t2.bc,g,pl -r=%t2.bc,g.alias,pl -save-temps -o %t4
+; RUN: llvm-dis %t4.1.3.import.bc -o - | FileCheck %s --check-prefix=PRESERVED
+
+; When g.alias is preserved we can't internalize aliasee either
+; RUN: llvm-lto2 run %t1.bc -r=%t1.bc,main,plx -r=%t1.bc,ret_ptr,pl -r=%t1.bc,g.alias,l -r=%t1.bc,g,l \
+; RUN:               %t2.bc -r=%t2.bc,g,pl -r=%t2.bc,g.alias,plx -save-temps -o %t5
+; RUN: llvm-dis %t5.1.3.import.bc -o - | FileCheck %s --check-prefix=PRESERVED
+
+; We currently don't support importing aliases
+; IMPORT:       @g.alias = external dso_local global i32
+; IMPORT-NEXT:  @g = internal global i32 42, align 4 #0
+; IMPORT:  attributes #0 = { "thinlto-internalize" }
+
+; CODEGEN:      define dso_local i32 @main
+; CODEGEN-NEXT:    ret i32 42
+
+; PRESERVED:      @g.alias = external dso_local global i32
+; PRESERVED-NEXT: @g = available_externally dso_local global i32 42, align 4
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g.alias = external global i32
+@g = external global i32
+
+define i32 @main() {
+  %v = load i32, i32* @g
+  ret i32 %v
+}
+
+define i32* @ret_ptr() {
+  ret i32* @g.alias
+}
diff --git a/test/ThinLTO/X86/index-const-prop-cache.ll b/test/ThinLTO/X86/index-const-prop-cache.ll
new file mode 100644
index 0000000000000000000000000000000000000000..62a22af0b1eba05f147771ba25b83a2506009d53
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-cache.ll
@@ -0,0 +1,41 @@
+; Check that we correctly handle 'ReadOnly' attribute when computing cache key
+; RUN: opt -module-summary -module-hash %s -o %t1.bc
+; RUN: opt -module-summary -module-hash %p/Inputs/index-const-prop-cache-foo.ll -o %t2.bc
+; RUN: opt -module-summary -module-hash %p/Inputs/index-const-prop-cache-test1.ll -o %t3.bc
+; RUN: opt -module-summary -module-hash %p/Inputs/index-const-prop-cache-test2.ll -o %t4.bc
+; RUN: rm -Rf %t.cache && mkdir %t.cache
+
+; Here @gFoo variable is writeable
+; RUN: llvm-lto -thinlto-action=run %t1.bc %t4.bc %t2.bc \
+; RUN:    -exported-symbol=main -exported-symbol=test -thinlto-cache-dir=%t.cache
+; RUN: ls %t.cache/llvmcache-* | count 3
+
+; Now gFoo is read-only and all modules should get different cache keys.
+; RUN: llvm-lto -thinlto-action=run %t1.bc %t3.bc %t2.bc \
+; RUN:    -exported-symbol=main -exported-symbol=test -thinlto-cache-dir=%t.cache
+; RUN: ls %t.cache/llvmcache-* | count 6
+
+; Do the same but with llvm-lto2
+; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: llvm-lto2 run  %t1.bc %t4.bc %t2.bc -cache-dir=%t.cache -o %t5 \
+; RUN:  -r=%t1.bc,main,plx -r=%t1.bc,foo,l \
+; RUN:  -r=%t4.bc,test,plx -r=%t4.bc,foo,l -r=%t4.bc,bar,l \
+; RUN:  -r=%t2.bc,foo,pl -r=%t2.bc,bar,pl -r=%t2.bc,rand,
+; RUN: ls %t.cache/llvmcache-* | count 3
+
+; RUN: llvm-lto2 run %t1.bc %t3.bc %t2.bc -cache-dir %t.cache -o %t6 \
+; RUN:  -r=%t1.bc,main,plx -r=%t1.bc,foo,l \
+; RUN:  -r=%t3.bc,test,plx -r=%t3.bc,foo,l \
+; RUN:  -r=%t2.bc,foo,pl -r=%t2.bc,bar,pl -r=%t2.bc,rand,
+; RUN: ls %t.cache/llvmcache-* | count 6
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @main() local_unnamed_addr {
+  %1 = tail call i32 (...) @foo()
+  ret i32 %1
+}
+
+declare i32 @foo(...) local_unnamed_addr
diff --git a/test/ThinLTO/X86/index-const-prop-comdat.ll b/test/ThinLTO/X86/index-const-prop-comdat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e05aaf98d7f0d69cbb0fd0059f26ccaee429bef1
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-comdat.ll
@@ -0,0 +1,17 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-comdat.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t2.bc -r=%t2.bc,g,pl %t1.bc -r=%t1.bc,main,plx -r=%t1.bc,g, -o %t3
+; RUN: llvm-dis %t3.2.3.import.bc -o - | FileCheck %s
+
+; Comdats are not internalized even if they are read only.
+; CHECK: @g = available_externally dso_local global i32 42 
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external global i32
+
+define i32 @main() {
+  %v = load i32, i32* @g
+  ret i32 %v
+}
diff --git a/test/ThinLTO/X86/index-const-prop-dead.ll b/test/ThinLTO/X86/index-const-prop-dead.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f3ffca8cbdf798b432f97a02a13ec81075e0489c
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-dead.ll
@@ -0,0 +1,26 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-define-g.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t2.bc -r=%t2.bc,g,pl \
+; RUN:               %t1.bc -r=%t1.bc,main,plx -r=%t1.bc,foo,pl -r=%t1.bc,g, -o %t3
+; RUN: llvm-dis %t3.2.3.import.bc -o - | FileCheck %s
+
+; Dead globals are converted to declarations by ThinLTO in dropDeadSymbols
+; If we try to internalize such we'll get a broken module. 
+; CHECK: @g = external dso_local global i32
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external global i32
+
+; We need at least one live symbol to enable dead stripping
+; Otherwise ModuleSummaryIndex::isGlobalValueLive will always
+; return true.
+define i32 @main() {
+  ret i32 42
+}
+
+define i32 @foo() {
+  %v = load i32, i32* @g
+  ret i32 %v
+}
diff --git a/test/ThinLTO/X86/index-const-prop-full-lto.ll b/test/ThinLTO/X86/index-const-prop-full-lto.ll
new file mode 100644
index 0000000000000000000000000000000000000000..26fe4d644b72486750dfdf5d8a22329671338f86
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-full-lto.ll
@@ -0,0 +1,24 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-define-g.ll -o %t2.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-full-lto.ll -o %t3.bc
+; RUN: llvm-lto2 run -save-temps %t2.bc -r=%t2.bc,g,pl \
+; RUN:                 %t1.bc -r=%t1.bc,foo,l -r=%t1.bc,main,plx -r=%t1.bc,g, \
+; RUN:                 %t3.bc -r=%t3.bc,foo,pl -r=%t3.bc,g, -o %t4
+; RUN: llvm-dis %t4.2.3.import.bc -o - | FileCheck %s
+
+; All references from functions in full LTO module are not constant.
+; We cannot internalize @g
+; CHECK: @g = available_externally dso_local global i32 42
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @foo()
+@g = external global i32
+
+define i32 @main() {
+  %v = call i32 @foo()
+  %v2 = load i32, i32* @g
+  %v3 = add i32 %v, %v2
+  ret i32 %v3
+}
diff --git a/test/ThinLTO/X86/index-const-prop-gvref.ll b/test/ThinLTO/X86/index-const-prop-gvref.ll
new file mode 100644
index 0000000000000000000000000000000000000000..87103fee1410ebd5c43ded688da6604f1bc867c0
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-gvref.ll
@@ -0,0 +1,27 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-gvref.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t2.bc -r=%t2.bc,b,pl -r=%t2.bc,a,pl \
+; RUN:               %t1.bc -r=%t1.bc,main,plx -r=%t1.bc,a, -r=%t1.bc,b, -o %t3
+; RUN: llvm-dis %t3.1.3.import.bc -o - | FileCheck %s --check-prefix=SRC
+; RUN: llvm-dis %t3.2.3.import.bc -o - | FileCheck %s --check-prefix=DEST
+
+; No variable in the source module should have been internalized
+; SRC:      @b = dso_local global i32* @a
+; SRC-NEXT: @a = dso_local global i32 42
+
+; We can't internalize globals referenced by other live globals
+; DEST:      @b = external dso_local global i32*
+; DEST-NEXT: @a = available_externally dso_local global i32 42, align 4
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global i32
+@b = external global i32*
+
+define i32 @main() {
+  %p = load i32*, i32** @b, align 8  
+  store i32 33, i32* %p, align 4
+  %v = load i32, i32* @a, align 4
+  ret i32 %v
+}
diff --git a/test/ThinLTO/X86/index-const-prop-ldst.ll b/test/ThinLTO/X86/index-const-prop-ldst.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4646557b6cfc3636421d24892d96f83e98287c2a
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-ldst.ll
@@ -0,0 +1,21 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-define-g.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t2.bc -r=%t2.bc,g,pl %t1.bc -r=%t1.bc,main,plx -r=%t1.bc,g, -o %t3
+; RUN: llvm-dis %t3.2.3.import.bc -o - | FileCheck %s
+
+; The 'store' instruction in @main should prevent internalization
+; even when there is 'load' instruction before it.
+; CHECK: @g = available_externally dso_local global i32 42
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external global i32
+
+define i32 @main() {
+  %v = load i32, i32* @g
+  %q = add i32 %v, 1
+  store i32 %q, i32* @g
+  
+  ret i32 %v
+}
diff --git a/test/ThinLTO/X86/index-const-prop-linkage.ll b/test/ThinLTO/X86/index-const-prop-linkage.ll
new file mode 100644
index 0000000000000000000000000000000000000000..aac917721113ca5d494f768bccfca780fa45f3e4
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop-linkage.ll
@@ -0,0 +1,27 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop-linkage.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t2.bc -r=%t2.bc,foo,pl -r=%t2.bc,g1,pl -r=%t2.bc,g2,pl -r=%t2.bc,g3, \
+; RUN:                           %t1.bc -r=%t1.bc,foo, -r=%t1.bc,main,plx -r=%t1.bc,g2,  -o %t3
+; RUN: llvm-dis %t3.2.3.import.bc -o - | FileCheck %s
+
+; Check that we never internalize anything with:
+; - appending linkage
+; - common linkage
+; - available_externally linkage
+; - reference from @llvm.used
+; CHECK:      @llvm.used = appending global [1 x i32*] [i32* @g2]
+; CHECK-NEXT: @g1 = external dso_local global i32, align 4
+; CHECK-NEXT: @g2 = available_externally dso_local global i32 42, align 4
+; CHECK-NEXT: @g3 = available_externally global i32 42, align 4
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @foo()
+@g2 = external global i32
+@llvm.used = appending global [1 x i32*] [i32* @g2]
+
+define i32 @main() {
+  %v = call i32 @foo()
+  ret i32 %v
+}
diff --git a/test/ThinLTO/X86/index-const-prop.ll b/test/ThinLTO/X86/index-const-prop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9dac8b81b910d97f00b36dc72b98b06fce7d3497
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop.ll
@@ -0,0 +1,49 @@
+; Check constant propagation in thinlto combined summary. This allows us to do 2 things:
+;  1. Internalize global definition which is not used externally if all accesses to it are read-only
+;  2. Make a local copy of internal definition if all accesses to it are readonly. This allows constant
+;     folding it during optimziation phase.
+
+; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop.ll -o %t2.bc
+; RUN: llvm-lto -thinlto-action=thinlink -o %t3.index.bc %t1.bc %t2.bc
+; RUN: llvm-lto -thinlto-action=import -exported-symbol=main  %t1.bc -thinlto-index=%t3.index.bc -o %t1.imported.bc -stats 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: llvm-dis %t1.imported.bc -o - | FileCheck %s --check-prefix=IMPORT
+; RUN: llvm-lto -thinlto-action=optimize %t1.imported.bc -o - | llvm-dis - -o - | FileCheck %s --check-prefix=OPTIMIZE
+
+; STATS: 2 module-summary-index - Number of live global variables marked read only
+
+; Check that we don't internalize gBar when it is exported
+; RUN: llvm-lto -thinlto-action=import -exported-symbol main -exported-symbol gBar  %t1.bc -thinlto-index=%t3.index.bc -o %t1.imported2.bc
+; RUN: llvm-dis %t1.imported2.bc -o - | FileCheck %s --check-prefix=IMPORT2
+
+; IMPORT: @gFoo.llvm.0 = internal unnamed_addr global i32 1, align 4, !dbg !0
+; IMPORT-NEXT: @gBar = internal local_unnamed_addr global i32 2, align 4, !dbg !5
+; IMPORT: !DICompileUnit({{.*}})
+
+; OPTIMIZE:        define i32 @main
+; OPTIMIZE-NEXT:     ret i32 3
+
+; IMPORT2: @gBar = available_externally local_unnamed_addr global i32 2, align 4, !dbg !5
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+@gBar = external global i32
+
+; Should not be counted in the stats of live read only variables since it is
+; dead and will be dropped anyway.
+@gDead = internal unnamed_addr global i32 1, align 4
+
+define i32 @main() local_unnamed_addr {
+  %call = tail call i32 bitcast (i32 (...)* @foo to i32 ()*)()
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)()
+  %add = add nsw i32 %call1, %call
+  ret i32 %add
+}
+
+declare i32 @foo(...) local_unnamed_addr
+
+declare i32 @bar(...) local_unnamed_addr
diff --git a/test/ThinLTO/X86/index-const-prop2.ll b/test/ThinLTO/X86/index-const-prop2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..96b7593dd4fe88b9d015bccd549d1bbc992664a5
--- /dev/null
+++ b/test/ThinLTO/X86/index-const-prop2.ll
@@ -0,0 +1,59 @@
+; Check constant propagation in thinlto combined summary. This allows us to do 2 things:
+;  1. Internalize global definition which is not used externally if all accesses to it are read-only
+;  2. Make a local copy of internal definition if all accesses to it are readonly. This allows constant
+;     folding it during optimziation phase.
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/index-const-prop.ll -o %t2.bc
+; RUN: llvm-lto2 run %t1.bc %t2.bc -save-temps \
+; RUN:  -r=%t2.bc,foo,pl \
+; RUN:  -r=%t2.bc,bar,pl \
+; RUN:  -r=%t2.bc,baz,pl \
+; RUN:  -r=%t2.bc,rand, \
+; RUN:  -r=%t2.bc,gBar,pl \
+; RUN:  -r=%t1.bc,main,plx \
+; RUN:  -r=%t1.bc,foo, \
+; RUN:  -r=%t1.bc,bar, \
+; RUN:  -r=%t1.bc,gBar, \
+; RUN:  -o %t3
+; RUN: llvm-dis %t3.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
+; RUN: llvm-dis %t3.1.5.precodegen.bc -o - | FileCheck %s --check-prefix=CODEGEN
+
+; Now check that we won't internalize global (gBar) if it's externally referenced
+; RUN: llvm-lto2 run %t1.bc %t2.bc -save-temps \
+; RUN:  -r=%t2.bc,foo,pl \
+; RUN:  -r=%t2.bc,bar,pl \
+; RUN:  -r=%t2.bc,baz,pl \
+; RUN:  -r=%t2.bc,rand, \
+; RUN:  -r=%t2.bc,gBar,plx \
+; RUN:  -r=%t1.bc,main,plx \
+; RUN:  -r=%t1.bc,foo, \
+; RUN:  -r=%t1.bc,bar, \
+; RUN:  -r=%t1.bc,gBar, \
+; RUN:  -o %t3
+; RUN: llvm-dis %t3.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT2
+
+; IMPORT:       @gFoo.llvm.0 = internal unnamed_addr global i32 1, align 4
+; IMPORT-NEXT:  @gBar = internal local_unnamed_addr global i32 2, align 4
+; IMPORT:       !DICompileUnit({{.*}})
+
+; CODEGEN:        i32 @main()
+; CODEGEN-NEXT:     ret i32 3
+
+; IMPORT2: @gBar = available_externally dso_local local_unnamed_addr global i32 2, align 4
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; We should be able to link external definition of gBar to its declaration
+@gBar = external global i32
+
+define i32 @main() local_unnamed_addr {
+  %call = tail call i32 bitcast (i32 (...)* @foo to i32 ()*)()
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)()
+  %add = add nsw i32 %call1, %call
+  ret i32 %add
+}
+
+declare i32 @foo(...) local_unnamed_addr
+
+declare i32 @bar(...) local_unnamed_addr
diff --git a/test/ThinLTO/X86/local_name_conflict.ll b/test/ThinLTO/X86/local_name_conflict.ll
index e4eb33e524d724e327cb503eff3c0242d978c739..1912297cedee496f82c2adc2311522326cf2074f 100644
--- a/test/ThinLTO/X86/local_name_conflict.ll
+++ b/test/ThinLTO/X86/local_name_conflict.ll
@@ -1,5 +1,5 @@
 ; Test handling when two files with the same source file name contain
-; static functions with the same name (which will have the same GUID
+; static functions/variables with the same name (which will have the same GUID
 ; in the combined index.
 
 ; Do setup work for all below tests: generate bitcode and combined index
@@ -8,19 +8,22 @@
 ; RUN: opt -module-summary -module-hash %p/Inputs/local_name_conflict2.ll -o %t3.bc
 ; RUN: llvm-lto -thinlto-action=thinlink -o %t4.bc %t.bc %t2.bc %t3.bc
 
-; This module will import b() which should cause the copy of foo from
+; This module will import b() which should cause the copy of foo and baz from
 ; that module (%t3.bc) to be imported. Check that the imported reference's
 ; promoted name matches the imported copy.
 ; RUN: llvm-lto -thinlto-action=import %t.bc -thinlto-index=%t4.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=IMPORT
-; IMPORT: call i32 @foo.llvm.[[HASH:[0-9]+]]
+; IMPORT: @baz.llvm.[[HASH:[0-9]+]] = available_externally hidden constant i32 10, align 4
+; IMPORT: call i32 @foo.llvm.[[HASH]]
 ; IMPORT: define available_externally hidden i32 @foo.llvm.[[HASH]]()
 
 ; The copy in %t2.bc should not be exported/promoted/renamed
 ; RUN: llvm-lto -thinlto-action=promote %t2.bc -thinlto-index=%t4.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=NOEXPORTSTATIC
+; NOEXPORTSTATIC: @baz = internal constant i32 10, align 4
 ; NOEXPORTSTATIC: define internal i32 @foo()
 
 ; Make sure foo is promoted and renamed without complaint in %t3.bc.
 ; RUN: llvm-lto -thinlto-action=promote %t3.bc -thinlto-index=%t4.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=EXPORTSTATIC
+; EXPORTSTATIC: @baz.llvm.{{.*}} = hidden constant i32 10, align 4
 ; EXPORTSTATIC: define hidden i32 @foo.llvm.
 
 ; ModuleID = 'local_name_conflict_main.o'
diff --git a/test/ThinLTO/X86/local_name_conflict_var.ll b/test/ThinLTO/X86/local_name_conflict_var.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d203fe08e2c4c8616080b874853c98370aff6b67
--- /dev/null
+++ b/test/ThinLTO/X86/local_name_conflict_var.ll
@@ -0,0 +1,32 @@
+; Test handling when two files with the same source file name contain
+; static read only variables with the same name (which will have the same GUID
+; in the combined index).
+
+; Do setup work for all below tests: generate bitcode and combined index
+; RUN: opt -module-summary -module-hash %s -o %t.bc
+; RUN: opt -module-summary -module-hash %p/Inputs/local_name_conflict_var1.ll -o %t2.bc
+; RUN: opt -module-summary -module-hash %p/Inputs/local_name_conflict_var2.ll -o %t3.bc
+; RUN: llvm-lto -thinlto-action=thinlink -o %t4.bc %t.bc %t2.bc %t3.bc
+
+; This module will import a() and b() which should cause the read only copy
+; of baz from each of those modules to be imported. Check that the both are
+; imported as local copies.
+; RUN: llvm-lto -thinlto-action=import -exported-symbol=main %t.bc -thinlto-index=%t4.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=IMPORT
+; IMPORT: @baz.llvm.{{.*}} = internal global i32 10
+; IMPORT: @baz.llvm.{{.*}} = internal global i32 10
+
+; ModuleID = 'local_name_conflict_var_main.o'
+source_filename = "local_name_conflict_var_main.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @main() {
+entry:
+  %call1 = call i32 (...) @a()
+  %call2 = call i32 (...) @b()
+  ret i32 0
+}
+
+declare i32 @a(...)
+declare i32 @b(...)
diff --git a/test/ThinLTO/X86/pr35472.ll b/test/ThinLTO/X86/pr35472.ll
new file mode 100644
index 0000000000000000000000000000000000000000..96a95aa463659b7668ecc8d85f8800ff9e1651cb
--- /dev/null
+++ b/test/ThinLTO/X86/pr35472.ll
@@ -0,0 +1,122 @@
+; Test to make sure that lazily loaded debug location scope metadata is
+; handled properly. Note that we need to have the DILexicalScope !34
+; referenced from multiple function's debug locs for this to be in the
+; lazily loaded module level metadata block.
+
+; RUN: opt -module-hash -module-summary %s -o %t1.bc
+; RUN: opt -module-hash -module-summary %p/Inputs/pr35472.ll -o %t2.bc
+; RUN: llvm-lto -thinlto-action=run %t1.bc %t2.bc -exported-symbol=_Z5Alphav
+; RUN: llvm-nm %t1.bc.thinlto.o | FileCheck %s -check-prefix=ThinLTOa
+; RUN: llvm-nm %t2.bc.thinlto.o | FileCheck %s -check-prefix=ThinLTOb
+
+; ThinLTOa-DAG: T _Z5Bravov
+; ThinLTOa-DAG: W _ZN4EchoD2Ev
+; ThinLTOb-DAG: T _Z5Alphav
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.Delta = type { %struct.Charlie }
+%struct.Charlie = type { i32 }
+%struct.Echo = type { %struct.Charlie }
+
+$_ZN4EchoD2Ev = comdat any
+$_ZN5DeltaD2Ev = comdat any
+
+define void @_Z5Bravov() !dbg !7 {
+  %Hotel = alloca %struct.Delta, align 4
+  %India = alloca %struct.Echo, align 4
+  call void @llvm.dbg.declare(metadata %struct.Delta* %Hotel, metadata !10, metadata !DIExpression()), !dbg !22
+  call void @_ZN4EchoD2Ev(%struct.Echo* %India), !dbg !28
+  ret void, !dbg !28
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+define linkonce_odr void @_ZN4EchoD2Ev(%struct.Echo* %this) unnamed_addr comdat align 2 {
+  %this.addr.i = alloca %struct.Charlie*, align 8
+  call void @llvm.dbg.declare(metadata %struct.Charlie** %this.addr.i, metadata !29, metadata !DIExpression()), !dbg !32
+  %this1.i = load %struct.Charlie*, %struct.Charlie** %this.addr.i, align 8
+  %Golf.i = getelementptr inbounds %struct.Charlie, %struct.Charlie* %this1.i, i32 0, i32 0, !dbg !33
+  ret void
+}
+
+define linkonce_odr void @_ZN5DeltaD2Ev(%struct.Delta* %this) unnamed_addr comdat align 2 !dbg !36 {
+  %this.addr.i = alloca %struct.Charlie*, align 8
+  call void @llvm.dbg.declare(metadata %struct.Charlie** %this.addr.i, metadata !29, metadata !DIExpression()), !dbg !41
+  %this1.i = load %struct.Charlie*, %struct.Charlie** %this.addr.i, align 8
+  %Golf.i = getelementptr inbounds %struct.Charlie, %struct.Charlie* %this1.i, i32 0, i32 0, !dbg !48
+  ret void
+}
+
+!llvm.module.flags = !{!3, !4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321056)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "a.cpp", directory: "/home/sunil/185335/302")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!7 = distinct !DISubprogram(name: "Bravo", linkageName: "_Z5Bravov", scope: !1, file: !1, line: 17, type: !8, isLocal: false, isDefinition: true, scopeLine: 17, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocalVariable(name: "Hotel", scope: !7, file: !1, line: 18, type: !11)
+!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Delta", file: !1, line: 6, size: 32, elements: !12, identifier: "_ZTS5Delta")
+!12 = !{!13}
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "Foxtrot", scope: !11, file: !1, line: 7, baseType: !14, size: 32)
+!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Charlie", file: !1, line: 1, size: 32, elements: !15, identifier: "_ZTS7Charlie")
+!15 = !{!16, !18}
+!16 = !DIDerivedType(tag: DW_TAG_member, name: "Golf", scope: !14, file: !1, line: 3, baseType: !17, size: 32)
+!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!18 = !DISubprogram(name: "~Charlie", scope: !14, file: !1, line: 2, type: !19, isLocal: false, isDefinition: false, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null, !21}
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!22 = !DILocation(line: 18, column: 11, scope: !7)
+!24 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Echo", file: !1, line: 10, size: 32, elements: !25, identifier: "_ZTS4Echo")
+!25 = !{!26}
+!26 = !DIDerivedType(tag: DW_TAG_member, name: "Foxtrot", scope: !24, file: !1, line: 11, baseType: !14, size: 32)
+!28 = !DILocation(line: 20, column: 1, scope: !7)
+!29 = !DILocalVariable(name: "this", arg: 1, scope: !30, type: !31, flags: DIFlagArtificial | DIFlagObjectPointer)
+!30 = distinct !DISubprogram(name: "~Charlie", linkageName: "_ZN7CharlieD2Ev", scope: !14, file: !1, line: 2, type: !19, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !18)
+!31 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
+!32 = !DILocation(line: 0, scope: !30)
+!33 = !DILocation(line: 2, column: 53, scope: !34)
+!34 = distinct !DILexicalBlock(scope: !30, file: !1, line: 2, column: 51)
+!36 = distinct !DISubprogram(name: "~Delta", linkageName: "_ZN5DeltaD2Ev", scope: !11, file: !1, line: 6, type: !37, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !40)
+!37 = !DISubroutineType(types: !38)
+!38 = !{null, !39}
+!39 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!40 = !DISubprogram(name: "~Delta", scope: !11, type: !37, isLocal: false, isDefinition: false, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: false)
+!41 = !DILocation(line: 0, scope: !30, inlinedAt: !42)
+!42 = distinct !DILocation(line: 6, column: 8, scope: !43)
+!43 = distinct !DILexicalBlock(scope: !36, file: !1, line: 6, column: 8)
+!48 = !DILocation(line: 2, column: 53, scope: !34, inlinedAt: !42)
+
+;----------------------------------------------------------------------------------------------
+; Compiled from following two source files with 'clang++ -S --std=c++11 -O0 -g -flto=thin' 
+; struct Charlie {
+;     __attribute__((__always_inline__)) ~Charlie() { Golf = 0; }
+;     int Golf;
+; };
+; 
+; struct Delta {
+;     Charlie Foxtrot;
+; };
+; 
+; struct Echo {
+;     Charlie Foxtrot;
+;     __attribute__((nodebug)) ~Echo() = default;
+; };
+; 
+; extern void Bravo();
+; 
+; void Bravo() {
+;     Delta Hotel;
+;     Echo India;
+; }
+; -----------------------------
+; extern void Bravo();
+; extern void Alpha();
+; void Alpha() { Bravo(); }
+
diff --git a/test/ThinLTO/X86/strong_non_prevailing.ll b/test/ThinLTO/X86/strong_non_prevailing.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f96e23adc3d511bc02844084fd5c008097c8fdb4
--- /dev/null
+++ b/test/ThinLTO/X86/strong_non_prevailing.ll
@@ -0,0 +1,16 @@
+; RUN: opt -module-summary %s -o %t.bc
+; RUN: opt -module-summary %p/Inputs/strong_non_prevailing.ll -o %t2.bc
+
+; RUN: llvm-lto -thinlto-action=run %t.bc %t2.bc -exported-symbol=__llvm_profile_filename
+; RUN: llvm-nm -o - < %t.bc.thinlto.o | FileCheck %s --check-prefix=EXPORTED
+; RUN: llvm-nm -o - < %t2.bc.thinlto.o 2>&1 | FileCheck %s --check-prefix=NOT_EXPORTED
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_filename = comdat any
+
+@__llvm_profile_filename = constant [19 x i8] c"default_%m.profraw\00", comdat
+
+; EXPORTED: N __llvm_profile_filename
+; NOT_EXPORTED-NOT: N __llvm_profile_filename
diff --git a/test/Transforms/AggressiveInstCombine/rotate.ll b/test/Transforms/AggressiveInstCombine/rotate.ll
new file mode 100644
index 0000000000000000000000000000000000000000..20499087e35cfb8d3c9ff65e8fc03ea7720faab8
--- /dev/null
+++ b/test/Transforms/AggressiveInstCombine/rotate.ll
@@ -0,0 +1,476 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -aggressive-instcombine -S | FileCheck %s
+
+; https://bugs.llvm.org/show_bug.cgi?id=34924
+
+define i32 @rotl(i32 %a, i32 %b) {
+; CHECK-LABEL: @rotl(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shr = lshr i32 %a, %sub
+  %shl = shl i32 %a, %b
+  %or = or i32 %shr, %shl
+  br label %end
+
+end:
+  %cond = phi i32 [ %or, %rotbb ], [ %a, %entry ]
+  ret i32 %cond
+}
+
+define i32 @rotl_commute_phi(i32 %a, i32 %b) {
+; CHECK-LABEL: @rotl_commute_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shr = lshr i32 %a, %sub
+  %shl = shl i32 %a, %b
+  %or = or i32 %shr, %shl
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+define i32 @rotl_commute_or(i32 %a, i32 %b) {
+; CHECK-LABEL: @rotl_commute_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shr = lshr i32 %a, %sub
+  %shl = shl i32 %a, %b
+  %or = or i32 %shl, %shr
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+; Verify that the intrinsic is inserted into a valid position.
+
+define i32 @rotl_insert_valid_location(i32 %a, i32 %b) {
+; CHECK-LABEL: @rotl_insert_valid_location(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ 1, [[ROTBB]] ], [ 2, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[RES:%.*]] = or i32 [[TMP0]], [[OTHER]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shr = lshr i32 %a, %sub
+  %shl = shl i32 %a, %b
+  %or = or i32 %shr, %shl
+  br label %end
+
+end:
+  %cond = phi i32 [ %or, %rotbb ], [ %a, %entry ]
+  %other = phi i32 [ 1, %rotbb ], [ 2, %entry ]
+  %res = or i32 %cond, %other
+  ret i32 %res
+}
+
+define i32 @rotr(i32 %a, i32 %b) {
+; CHECK-LABEL: @rotr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %a, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shr, %shl
+  br label %end
+
+end:
+  %cond = phi i32 [ %or, %rotbb ], [ %a, %entry ]
+  ret i32 %cond
+}
+
+define i32 @rotr_commute_phi(i32 %a, i32 %b) {
+; CHECK-LABEL: @rotr_commute_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %a, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shr, %shl
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+define i32 @rotr_commute_or(i32 %a, i32 %b) {
+; CHECK-LABEL: @rotr_commute_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %a, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shl, %shr
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+; Negative test - non-power-of-2 might require urem expansion in the backend.
+
+define i12 @could_be_rotr_weird_type(i12 %a, i12 %b) {
+; CHECK-LABEL: @could_be_rotr_weird_type(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i12 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i12 12, [[B]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i12 [[A:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i12 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i12 [[SHL]], [[SHR]]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i12 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
+; CHECK-NEXT:    ret i12 [[COND]]
+;
+entry:
+  %cmp = icmp eq i12 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i12 12, %b
+  %shl = shl i12 %a, %sub
+  %shr = lshr i12 %a, %b
+  %or = or i12 %shl, %shr
+  br label %end
+
+end:
+  %cond = phi i12 [ %a, %entry ], [ %or, %rotbb ]
+  ret i12 %cond
+}
+
+; Negative test - wrong phi ops.
+
+define i32 @not_rotr_1(i32 %a, i32 %b) {
+; CHECK-LABEL: @not_rotr_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[B]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %a, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shl, %shr
+  br label %end
+
+end:
+  %cond = phi i32 [ %b, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+; Negative test - too many phi ops.
+
+define i32 @not_rotr_2(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @not_rotr_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[B]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
+; CHECK-NEXT:    [[CMP42:%.*]] = icmp ugt i32 [[OR]], 42
+; CHECK-NEXT:    br i1 [[CMP42]], label [[END]], label [[BOGUS:%.*]]
+; CHECK:       bogus:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ], [ [[C:%.*]], [[BOGUS]] ]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %a, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shl, %shr
+  %cmp42 = icmp ugt i32 %or, 42
+  br i1 %cmp42, label %end, label %bogus
+
+bogus:
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ], [ %c, %bogus ]
+  ret i32 %cond
+}
+
+; Negative test - wrong cmp (but this should match?).
+
+define i32 @not_rotr_3(i32 %a, i32 %b) {
+; CHECK-LABEL: @not_rotr_3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[B]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %cmp = icmp sle i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %a, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shl, %shr
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+; Negative test - wrong shift.
+
+define i32 @not_rotr_4(i32 %a, i32 %b) {
+; CHECK-LABEL: @not_rotr_4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[B]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %a, %sub
+  %shr = ashr i32 %a, %b
+  %or = or i32 %shl, %shr
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+; Negative test - wrong shift.
+
+define i32 @not_rotr_5(i32 %a, i32 %b) {
+; CHECK-LABEL: @not_rotr_5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[B]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[B]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[A:%.*]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %b, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shl, %shr
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+; Negative test - wrong sub.
+
+define i32 @not_rotr_6(i32 %a, i32 %b) {
+; CHECK-LABEL: @not_rotr_6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 8, [[B]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 8, %b
+  %shl = shl i32 %a, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shl, %shr
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
+; Negative test - extra use. Technically, we could transform this
+; because it doesn't increase the instruction count, but we're
+; being cautious not to cause a potential perf pessimization for
+; targets that do not have a rotate instruction.
+
+define i32 @could_be_rotr(i32 %a, i32 %b, i32* %p) {
+; CHECK-LABEL: @could_be_rotr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
+; CHECK:       rotbb:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[B]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
+; CHECK-NEXT:    store i32 [[OR]], i32* [[P:%.*]]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shl = shl i32 %a, %sub
+  %shr = lshr i32 %a, %b
+  %or = or i32 %shl, %shr
+  store i32 %or, i32* %p
+  br label %end
+
+end:
+  %cond = phi i32 [ %a, %entry ], [ %or, %rotbb ]
+  ret i32 %cond
+}
+
diff --git a/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll b/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2ed362b873f8d343ce9378a4ff67c9eea7b6834e
--- /dev/null
+++ b/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+
+; ArgumentPromotion should preserve the default function address space
+; from the data layout.
+
+target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"
+
+@g = common global i32 0, align 4
+
+define i32 @bar() {
+entry:
+  %call = call i32 @foo(i32* @g)
+; CHECK: %call = call addrspace(1) i32 @foo()
+  ret i32 %call
+}
+
+; CHECK: define internal i32 @foo() addrspace(1)
+define internal i32 @foo(i32*) {
+entry:
+  %retval = alloca i32, align 4
+  call void asm sideeffect "ldr r0, [r0] \0Abx lr        \0A", ""()
+  unreachable
+}
+
diff --git a/test/Transforms/BDCE/dead-uses.ll b/test/Transforms/BDCE/dead-uses.ll
new file mode 100644
index 0000000000000000000000000000000000000000..95c2893871f72411ba977e31e507ef76f9925872
--- /dev/null
+++ b/test/Transforms/BDCE/dead-uses.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -bdce -S < %s | FileCheck %s
+
+; Funnel shift based rotate test cases from PR39771
+
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+
+; First fshr operand is dead.
+define i32 @pr39771_fshr_multi_use_instr(i32 %a) {
+; CHECK-LABEL: @pr39771_fshr_multi_use_instr(
+; CHECK-NEXT:    [[X:%.*]] = or i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[B:%.*]] = tail call i32 @llvm.fshr.i32(i32 [[X]], i32 [[X]], i32 1)
+; CHECK-NEXT:    [[C:%.*]] = lshr i32 [[B]], 23
+; CHECK-NEXT:    [[D:%.*]] = xor i32 [[C]], [[B]]
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[D]], 31
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %x = or i32 %a, 0
+  %b = tail call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 1)
+  %c = lshr i32 %b, 23
+  %d = xor i32 %c, %b
+  %e = and i32 %d, 31
+  ret i32 %e
+}
+
+; First fshr operand is dead (vector variant).
+define <2 x i32> @pr39771_fshr_multi_use_instr_vec(<2 x i32> %a) {
+; CHECK-LABEL: @pr39771_fshr_multi_use_instr_vec(
+; CHECK-NEXT:    [[X:%.*]] = or <2 x i32> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = tail call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[X]], <2 x i32> [[X]], <2 x i32> <i32 1, i32 1>)
+; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i32> [[B]], <i32 23, i32 23>
+; CHECK-NEXT:    [[D:%.*]] = xor <2 x i32> [[C]], [[B]]
+; CHECK-NEXT:    [[E:%.*]] = and <2 x i32> [[D]], <i32 31, i32 31>
+; CHECK-NEXT:    ret <2 x i32> [[E]]
+;
+  %x = or <2 x i32> %a, zeroinitializer
+  %b = tail call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> <i32 1, i32 1>)
+  %c = lshr <2 x i32> %b, <i32 23, i32 23>
+  %d = xor <2 x i32> %c, %b
+  %e = and <2 x i32> %d, <i32 31, i32 31>
+  ret <2 x i32> %e
+}
+
+; First fshr operand is dead, but it comes from an argument, not instruction.
+define i32 @pr39771_fshr_multi_use_arg(i32 %a) {
+; CHECK-LABEL: @pr39771_fshr_multi_use_arg(
+; CHECK-NEXT:    [[B:%.*]] = tail call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[A]], i32 1)
+; CHECK-NEXT:    [[C:%.*]] = lshr i32 [[B]], 23
+; CHECK-NEXT:    [[D:%.*]] = xor i32 [[C]], [[B]]
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[D]], 31
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %b = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 1)
+  %c = lshr i32 %b, 23
+  %d = xor i32 %c, %b
+  %e = and i32 %d, 31
+  ret i32 %e
+}
+
+; Second or operand is dead, but BDCE does not realize this.
+define i32 @pr39771_expanded_fshr_multi_use(i32 %a) {
+; CHECK-LABEL: @pr39771_expanded_fshr_multi_use(
+; CHECK-NEXT:    [[TMP:%.*]] = lshr i32 [[A:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[A]], 32
+; CHECK-NEXT:    [[B:%.*]] = or i32 [[TMP]], [[TMP2]]
+; CHECK-NEXT:    [[C:%.*]] = lshr i32 [[B]], 23
+; CHECK-NEXT:    [[D:%.*]] = xor i32 [[C]], [[B]]
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[D]], 31
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %tmp = lshr i32 %a, 1
+  %tmp2 = shl i32 %a, 32
+  %b = or i32 %tmp, %tmp2
+  %c = lshr i32 %b, 23
+  %d = xor i32 %c, %b
+  %e = and i32 %d, 31
+  ret i32 %e
+}
diff --git a/test/Transforms/BDCE/vectors.ll b/test/Transforms/BDCE/vectors.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fde22fd36b7883b7f9940ec69c784112689d6387
--- /dev/null
+++ b/test/Transforms/BDCE/vectors.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -bdce -instsimplify < %s | FileCheck %s
+; RUN: opt -S -instsimplify < %s | FileCheck %s -check-prefix=CHECK-IO
+; CHECK-IO lines to ensure that transformations are not performed by only instsimplify.
+
+; BDCE applied to integer vectors.
+
+define <2 x i32> @test_basic(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @test_basic(
+; CHECK-NEXT:    [[B2:%.*]] = add <2 x i32> [[B:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    [[B3:%.*]] = and <2 x i32> [[B2]], <i32 8, i32 8>
+; CHECK-NEXT:    [[D:%.*]] = ashr <2 x i32> [[B3]], <i32 3, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[D]]
+;
+; CHECK-IO-LABEL: @test_basic(
+; CHECK-IO-NEXT:    [[A2:%.*]] = add <2 x i32> [[A:%.*]], <i32 1, i32 1>
+; CHECK-IO-NEXT:    [[A3:%.*]] = and <2 x i32> [[A2]], <i32 4, i32 4>
+; CHECK-IO-NEXT:    [[B2:%.*]] = add <2 x i32> [[B:%.*]], <i32 1, i32 1>
+; CHECK-IO-NEXT:    [[B3:%.*]] = and <2 x i32> [[B2]], <i32 8, i32 8>
+; CHECK-IO-NEXT:    [[C:%.*]] = or <2 x i32> [[A3]], [[B3]]
+; CHECK-IO-NEXT:    [[D:%.*]] = ashr <2 x i32> [[C]], <i32 3, i32 3>
+; CHECK-IO-NEXT:    ret <2 x i32> [[D]]
+;
+  %a2 = add <2 x i32> %a, <i32 1, i32 1>
+  %a3 = and <2 x i32> %a2, <i32 4, i32 4>
+  %b2 = add <2 x i32> %b, <i32 1, i32 1>
+  %b3 = and <2 x i32> %b2, <i32 8, i32 8>
+  %c = or <2 x i32> %a3, %b3
+  %d = ashr <2 x i32> %c, <i32 3, i32 3>
+  ret <2 x i32> %d
+}
+
+; Going vector -> scalar
+define i32 @test_extractelement(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @test_extractelement(
+; CHECK-NEXT:    [[B2:%.*]] = add <2 x i32> [[B:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    [[B3:%.*]] = and <2 x i32> [[B2]], <i32 8, i32 8>
+; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i32> [[B3]], i32 0
+; CHECK-NEXT:    [[E:%.*]] = ashr i32 [[D]], 3
+; CHECK-NEXT:    ret i32 [[E]]
+;
+; CHECK-IO-LABEL: @test_extractelement(
+; CHECK-IO-NEXT:    [[A2:%.*]] = add <2 x i32> [[A:%.*]], <i32 1, i32 1>
+; CHECK-IO-NEXT:    [[A3:%.*]] = and <2 x i32> [[A2]], <i32 4, i32 4>
+; CHECK-IO-NEXT:    [[B2:%.*]] = add <2 x i32> [[B:%.*]], <i32 1, i32 1>
+; CHECK-IO-NEXT:    [[B3:%.*]] = and <2 x i32> [[B2]], <i32 8, i32 8>
+; CHECK-IO-NEXT:    [[C:%.*]] = or <2 x i32> [[A3]], [[B3]]
+; CHECK-IO-NEXT:    [[D:%.*]] = extractelement <2 x i32> [[C]], i32 0
+; CHECK-IO-NEXT:    [[E:%.*]] = ashr i32 [[D]], 3
+; CHECK-IO-NEXT:    ret i32 [[E]]
+;
+  %a2 = add <2 x i32> %a, <i32 1, i32 1>
+  %a3 = and <2 x i32> %a2, <i32 4, i32 4>
+  %b2 = add <2 x i32> %b, <i32 1, i32 1>
+  %b3 = and <2 x i32> %b2, <i32 8, i32 8>
+  %c = or <2 x i32> %a3, %b3
+  %d = extractelement <2 x i32> %c, i32 0
+  %e = ashr i32 %d, 3
+  ret i32 %e
+}
+
+; Going scalar -> vector
+define <2 x i32> @test_insertelement(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_insertelement(
+; CHECK-NEXT:    [[Y:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = insertelement <2 x i32> [[Y]], i32 [[A:%.*]], i32 1
+; CHECK-NEXT:    [[Y3:%.*]] = and <2 x i32> [[Y2]], <i32 8, i32 8>
+; CHECK-NEXT:    [[U:%.*]] = ashr <2 x i32> [[Y3]], <i32 3, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[U]]
+;
+; CHECK-IO-LABEL: @test_insertelement(
+; CHECK-IO-NEXT:    [[X:%.*]] = insertelement <2 x i32> undef, i32 [[A:%.*]], i32 0
+; CHECK-IO-NEXT:    [[X2:%.*]] = insertelement <2 x i32> [[X]], i32 [[B:%.*]], i32 1
+; CHECK-IO-NEXT:    [[X3:%.*]] = and <2 x i32> [[X2]], <i32 4, i32 4>
+; CHECK-IO-NEXT:    [[Y:%.*]] = insertelement <2 x i32> undef, i32 [[B]], i32 0
+; CHECK-IO-NEXT:    [[Y2:%.*]] = insertelement <2 x i32> [[Y]], i32 [[A]], i32 1
+; CHECK-IO-NEXT:    [[Y3:%.*]] = and <2 x i32> [[Y2]], <i32 8, i32 8>
+; CHECK-IO-NEXT:    [[Z:%.*]] = or <2 x i32> [[X3]], [[Y3]]
+; CHECK-IO-NEXT:    [[U:%.*]] = ashr <2 x i32> [[Z]], <i32 3, i32 3>
+; CHECK-IO-NEXT:    ret <2 x i32> [[U]]
+;
+  %x = insertelement <2 x i32> undef, i32 %a, i32 0
+  %x2 = insertelement <2 x i32> %x, i32 %b, i32 1
+  %x3 = and <2 x i32> %x2, <i32 4, i32 4>
+  %y = insertelement <2 x i32> undef, i32 %b, i32 0
+  %y2 = insertelement <2 x i32> %y, i32 %a, i32 1
+  %y3 = and <2 x i32> %y2, <i32 8, i32 8>
+  %z = or <2 x i32> %x3, %y3
+  %u = ashr <2 x i32> %z, <i32 3, i32 3>
+  ret <2 x i32> %u
+}
+
+; Some non-int vectors and conversions
+define <2 x i32> @test_conversion(<2 x i32> %a) {
+; CHECK-LABEL: @test_conversion(
+; CHECK-NEXT:    [[A2:%.*]] = add <2 x i32> [[A:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    [[A3:%.*]] = and <2 x i32> [[A2]], <i32 2, i32 2>
+; CHECK-NEXT:    [[X:%.*]] = uitofp <2 x i32> [[A3]] to <2 x double>
+; CHECK-NEXT:    [[Y:%.*]] = fadd <2 x double> [[X]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT:    [[Z:%.*]] = fptoui <2 x double> [[Y]] to <2 x i32>
+; CHECK-NEXT:    [[U:%.*]] = ashr <2 x i32> [[Z]], <i32 3, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[U]]
+;
+; CHECK-IO-LABEL: @test_conversion(
+; CHECK-IO-NEXT:    [[A2:%.*]] = add <2 x i32> [[A:%.*]], <i32 1, i32 1>
+; CHECK-IO-NEXT:    [[A3:%.*]] = and <2 x i32> [[A2]], <i32 2, i32 2>
+; CHECK-IO-NEXT:    [[X:%.*]] = uitofp <2 x i32> [[A3]] to <2 x double>
+; CHECK-IO-NEXT:    [[Y:%.*]] = fadd <2 x double> [[X]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-IO-NEXT:    [[Z:%.*]] = fptoui <2 x double> [[Y]] to <2 x i32>
+; CHECK-IO-NEXT:    [[U:%.*]] = ashr <2 x i32> [[Z]], <i32 3, i32 3>
+; CHECK-IO-NEXT:    ret <2 x i32> [[U]]
+;
+  %a2 = add <2 x i32> %a, <i32 1, i32 1>
+  %a3 = and <2 x i32> %a2, <i32 2, i32 2>
+  %x = uitofp <2 x i32> %a3 to <2 x double>
+  %y = fadd <2 x double> %x, <double 1.0, double 1.0>
+  %z = fptoui <2 x double> %y to <2 x i32>
+  %u = ashr <2 x i32> %z, <i32 3, i32 3>
+  ret <2 x i32> %u
+}
+
+; Assumption invalidation (adapted from invalidate-assumptions.ll)
+define <2 x i1> @test_assumption_invalidation(<2 x i1> %b, <2 x i8> %x) {
+; CHECK-LABEL: @test_assumption_invalidation(
+; CHECK-NEXT:    [[LITTLE_NUMBER:%.*]] = zext <2 x i1> [[B:%.*]] to <2 x i8>
+; CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i8> zeroinitializer, [[LITTLE_NUMBER]]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i8> [[SUB]] to <2 x i1>
+; CHECK-NEXT:    ret <2 x i1> [[TRUNC]]
+;
+; CHECK-IO-LABEL: @test_assumption_invalidation(
+; CHECK-IO-NEXT:    [[SETBIT:%.*]] = or <2 x i8> [[X:%.*]], <i8 64, i8 64>
+; CHECK-IO-NEXT:    [[LITTLE_NUMBER:%.*]] = zext <2 x i1> [[B:%.*]] to <2 x i8>
+; CHECK-IO-NEXT:    [[BIG_NUMBER:%.*]] = shl <2 x i8> [[SETBIT]], <i8 1, i8 1>
+; CHECK-IO-NEXT:    [[SUB:%.*]] = sub nuw <2 x i8> [[BIG_NUMBER]], [[LITTLE_NUMBER]]
+; CHECK-IO-NEXT:    [[TRUNC:%.*]] = trunc <2 x i8> [[SUB]] to <2 x i1>
+; CHECK-IO-NEXT:    ret <2 x i1> [[TRUNC]]
+;
+  %setbit = or <2 x i8> %x, <i8 64, i8 64>
+  %little_number = zext <2 x i1> %b to <2 x i8>
+  %big_number = shl <2 x i8> %setbit, <i8 1, i8 1>
+  %sub = sub nuw <2 x i8> %big_number, %little_number
+  %trunc = trunc <2 x i8> %sub to <2 x i1>
+  ret <2 x i1> %trunc
+}
diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
index d5f31f9ac91c197c2b91e5ae08a058e8b3ca4e19..cd2edb1e33559d1a4db30d99d1dc7281c6c11418 100644
--- a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
+++ b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
@@ -37,9 +37,9 @@ End:
 
 ;CHECK-LABEL: @test_eq_eq_eq
 ;CHECK-LABEL: Header2.split:
-;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 10)
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 10)
 ;CHECK-LABEL: TBB.split:
-;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 %p)
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 %p)
 ;CHECK-LABEL: Tail
 ;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header2.split ], [ %[[CALL2]], %TBB.split ]
 ;CHECK: ret i32 %[[MERGED]]
@@ -123,14 +123,14 @@ End:
 ;CHECK-LABEL: Header2.split:
 ;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10)
 ;CHECK-LABEL: TBB.split:
-;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p)
 ;CHECK-LABEL: Tail
 ;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header2.split ], [ %[[CALL2]], %TBB.split ]
 ;CHECK: ret i32 %[[MERGED]]
 define i32 @test_ne_eq_ne(i32* %a, i32 %v, i32 %p) {
 Header:
   %tobool1 = icmp ne i32* %a, null
-  br i1 %tobool1, label %Header2, label %End
+  br i1 %tobool1, label %Header2, label %TBB
 
 Header2:
   %tobool2 = icmp eq i32 %p, 10
@@ -178,14 +178,14 @@ End:
 ;CHECK-LABEL: Header2.split:
 ;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
 ;CHECK-LABEL: TBB.split:
-;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p)
 ;CHECK-LABEL: Tail
 ;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header2.split ], [ %[[CALL2]], %TBB.split ]
 ;CHECK: ret i32 %[[MERGED]]
 define i32 @test_ne_ne_ne_constrain_same_pointer_arg(i32* %a, i32 %v, i32 %p, i32* %a2, i32* %a3) {
 Header:
   %tobool1 = icmp ne i32* %a, null
-  br i1 %tobool1, label %Header2, label %End
+  br i1 %tobool1, label %Header2, label %TBB
 
 Header2:
   %tobool2 = icmp ne i32* %a, %a2
@@ -235,14 +235,14 @@ End:
 ;CHECK-LABEL: Header2.split:
 ;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10)
 ;CHECK-LABEL: TBB.split:
-;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 %p)
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 %p)
 ;CHECK-LABEL: Tail
 ;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header2.split ], [ %[[CALL2]], %TBB.split ]
 ;CHECK: ret i32 %[[MERGED]]
 define i32 @test_eq_eq_eq_untaken(i32* %a, i32 %v, i32 %p) {
 Header:
   %tobool1 = icmp eq i32* %a, null
-  br i1 %tobool1, label %End, label %Header2
+  br i1 %tobool1, label %TBB, label %Header2
 
 Header2:
   %tobool2 = icmp eq i32 %p, 10
@@ -290,14 +290,14 @@ End:
 ;CHECK-LABEL: Header2.split:
 ;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 10)
 ;CHECK-LABEL: TBB.split:
-;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 %p)
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p)
 ;CHECK-LABEL: Tail
 ;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header2.split ], [ %[[CALL2]], %TBB.split ]
 ;CHECK: ret i32 %[[MERGED]]
 define i32 @test_ne_eq_ne_untaken(i32* %a, i32 %v, i32 %p) {
 Header:
   %tobool1 = icmp ne i32* %a, null
-  br i1 %tobool1, label %End, label %Header2
+  br i1 %tobool1, label %TBB, label %Header2
 
 Header2:
   %tobool2 = icmp eq i32 %p, 10
@@ -489,6 +489,31 @@ End:
   ret i32 %v
 }
 
+;CHECK-LABEL: @test_cond_no_effect
+;CHECK-NOT: Header.split:
+;CHECK-NOT: TBB.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 0)
+;CHECK: ret i32 %r
+define i32 @test_cond_no_effect(i32* %a, i32 %v) {
+Entry:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %Header, label %End
+
+Header:
+  br i1 undef, label %Tail, label %TBB
+
+TBB:
+  br i1 undef, label %Tail, label %End
+
+Tail:
+  %r = call i32 @callee(i32* %a, i32 %v, i32 0)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
 ;CHECK-LABEL: @test_unreachable
 ;CHECK-LABEL: Header.split:
 ;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 10)
diff --git a/test/Transforms/CallSiteSplitting/musttail.ll b/test/Transforms/CallSiteSplitting/musttail.ll
index faa352dd5bffa2d324cc5c61451118f4ed30fb01..d8f76aed3dcb4008b9e3b1576e5bc029e1133009 100644
--- a/test/Transforms/CallSiteSplitting/musttail.ll
+++ b/test/Transforms/CallSiteSplitting/musttail.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s -callsite-splitting -verify-dom-info -S | FileCheck %s
 
 ;CHECK-LABEL: @caller
 ;CHECK-LABEL: Top.split:
@@ -73,3 +73,32 @@ End:
 define void @void_callee(i8* %a, i8* %b) noinline {
   ret void
 }
+
+;   Include a test with a larger CFG that exercises the DomTreeUpdater
+;   machinery a bit more.
+;CHECK-LABEL: @larger_cfg_caller
+;CHECK-LABEL: Top.split:
+;CHECK: %r1 = musttail call i8* @callee(i8* null, i8* %b)
+;CHECK: ret i8* %r1
+;CHECK-LABEL: TBB.split
+;CHECK: %r2 = musttail call i8* @callee(i8* nonnull %a, i8* null)
+;CHECK: ret i8* %r2
+define i8* @larger_cfg_caller(i8* %a, i8* %b) {
+Top:
+  %cond1 = icmp eq i8* %a, null
+  br i1 %cond1, label %Tail, label %ExtraTest
+ExtraTest:
+  %a0 = load i8, i8* %a
+  %cond2 = icmp eq i8 %a0, 0
+  br i1 %cond2, label %TBB_pred, label %End
+TBB_pred:
+  br label %TBB
+TBB:
+  %cond3 = icmp eq i8* %b, null
+  br i1 %cond3, label %Tail, label %End
+Tail:
+  %r = musttail call i8* @callee(i8* %a, i8* %b)
+  ret i8* %r
+End:
+  ret i8* null
+}
diff --git a/test/Transforms/CodeExtractor/PartialInlineEntryPHICost.ll b/test/Transforms/CodeExtractor/PartialInlineEntryPHICost.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a8c2d62710d9521459076e62b03b0631b9ed5a2e
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineEntryPHICost.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+
+; Check that we do not overcompute the outlined region cost, where the PHIs in
+; the outlined region entry (BB4) are moved outside the region by CodeExtractor.
+
+define i32 @bar(i32 %arg) {
+bb:
+  %tmp = icmp slt i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb2
+
+bb1:
+  br i1 undef, label %bb4, label %bb2
+
+bb2:                                              ; preds = %bb, %bb1
+  br i1 undef, label %bb4, label %bb5
+
+bb4:                                              ; preds = %bb1, %bb2
+  %xx1 = phi i32 [ 1, %bb1 ], [ 9, %bb2 ]
+  %xx2 = phi i32 [ 1, %bb1 ], [ 9, %bb2 ]
+  %xx3 = phi i32 [ 1, %bb1 ], [ 9, %bb2 ]
+  tail call void (...) @foo() #2
+  br label %bb5
+
+bb5:                                              ; preds = %bb4, %bb2
+  %tmp6 = phi i32 [ 1, %bb2 ], [ 9, %bb4 ]
+  ret i32 %tmp6
+}
+
+declare void @foo(...)
+
+define i32 @dummy_caller(i32 %arg) {
+bb:
+; CHECK-LABEL: @dummy_caller
+; CHECK: br i1
+; CHECK: br i1
+; CHECK: call void @bar.1.
+  %tmp = tail call i32 @bar(i32 %arg)
+  ret i32 %tmp
+}
diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll
index dce2068a8d5563c0534650c6376d9f2449201e29..68d7390b0e6bccec803ef3d22afc4da97329f111 100644
--- a/test/Transforms/ConstProp/loads.ll
+++ b/test/Transforms/ConstProp/loads.ll
@@ -269,3 +269,42 @@ define i64 @test16.3() {
 ; BE-LABEL: @test16.3(
 ; BE: ret i64 0
 }
+
+@g7 = constant {[0 x i32], [0 x i8], {}*} { [0 x i32] undef, [0 x i8] undef, {}* null }
+
+define i64* @test_leading_zero_size_elems() {
+  %v = load i64*, i64** bitcast ({[0 x i32], [0 x i8], {}*}* @g7 to i64**)
+  ret i64* %v
+
+; LE-LABEL: @test_leading_zero_size_elems(
+; LE: ret i64* null
+
+; BE-LABEL: @test_leading_zero_size_elems(
+; BE: ret i64* null
+}
+
+@g8 = constant {[4294967295 x [0 x i32]], i64} { [4294967295 x [0 x i32]] undef, i64 123 }
+
+define i64 @test_leading_zero_size_elems_big() {
+  %v = load i64, i64* bitcast ({[4294967295 x [0 x i32]], i64}* @g8 to i64*)
+  ret i64 %v
+
+; LE-LABEL: @test_leading_zero_size_elems_big(
+; LE: ret i64 123
+
+; BE-LABEL: @test_leading_zero_size_elems_big(
+; BE: ret i64 123
+}
+
+@g9 = constant [4294967295 x [0 x i32]] zeroinitializer
+
+define i64 @test_array_of_zero_size_array() {
+  %v = load i64, i64* bitcast ([4294967295 x [0 x i32]]* @g9 to i64*)
+  ret i64 %v
+
+; LE-LABEL: @test_array_of_zero_size_array(
+; LE: ret i64 0
+
+; BE-LABEL: @test_array_of_zero_size_array(
+; BE: ret i64 0
+}
diff --git a/test/Transforms/Coroutines/no-suspend.ll b/test/Transforms/Coroutines/no-suspend.ll
index 804b38cc1abeaaaa3b884445da1b0bcbb5b9ab3e..fca096fa48bb39df14dc703a4f11c22df9cc2580 100644
--- a/test/Transforms/Coroutines/no-suspend.ll
+++ b/test/Transforms/Coroutines/no-suspend.ll
@@ -1,14 +1,16 @@
 ; Test no suspend coroutines
-; RUN: opt < %s -O2 -enable-coroutines -S | FileCheck %s
+; RUN: opt < %s -coro-split -S | FileCheck %s
 
 ; Coroutine with no-suspends will turn into:
 ;
 ; CHECK-LABEL: define void @no_suspends(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    alloca
+; CHECK-NEXT:    bitcast
 ; CHECK-NEXT:    call void @print(i32 %n)
 ; CHECK-NEXT:    ret void
 ;
-define void @no_suspends(i32 %n) {
+define void @no_suspends(i32 %n) "coroutine.presplit"="1" {
 entry:
   %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
   %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id)
@@ -37,15 +39,18 @@ suspend:
 }
 
 ; SimplifySuspendPoint will detect that coro.resume resumes itself and will
-; replace suspend with a jump to %resume label turning it into no-suspend 
+; replace suspend with a jump to %resume label turning it into no-suspend
 ; coroutine.
 ;
 ; CHECK-LABEL: define void @simplify_resume(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    alloca
+; CHECK-NEXT:    bitcast
+; CHECK-NEXT:    call void @llvm.memcpy
 ; CHECK-NEXT:    call void @print(i32 0)
 ; CHECK-NEXT:    ret void
 ;
-define void @simplify_resume() {
+define void @simplify_resume(i8* %src, i8* %dst) "coroutine.presplit"="1" {
 entry:
   %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
   %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id)
@@ -60,7 +65,11 @@ coro.begin:
   br label %body
 body:
   %save = call token @llvm.coro.save(i8* %hdl)
-  call void @llvm.coro.resume(i8* %hdl)
+  ; memcpy intrinsics should not prevent simplification.
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 1, i1 false)
+  %subfn = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 0)
+  %bres = bitcast i8* %subfn to void (i8*)*
+  call fastcc void %bres(i8* %hdl)
   %0 = call i8 @llvm.coro.suspend(token %save, i1 false)
   switch i8 %0, label %suspend [i8 0, label %resume
                                 i8 1, label %pre.cleanup]
@@ -82,15 +91,17 @@ suspend:
 }
 
 ; SimplifySuspendPoint will detect that coroutine destroys itself and will
-; replace suspend with a jump to %cleanup label turning it into no-suspend 
+; replace suspend with a jump to %cleanup label turning it into no-suspend
 ; coroutine.
 ;
 ; CHECK-LABEL: define void @simplify_destroy(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    alloca
+; CHECK-NEXT:    bitcast
 ; CHECK-NEXT:    call void @print(i32 1)
 ; CHECK-NEXT:    ret void
 ;
-define void @simplify_destroy() {
+define void @simplify_destroy() "coroutine.presplit"="1" personality i32 0 {
 entry:
   %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
   %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id)
@@ -105,7 +116,11 @@ coro.begin:
   br label %body
 body:
   %save = call token @llvm.coro.save(i8* %hdl)
-  call void @llvm.coro.destroy(i8* %hdl)
+  %subfn = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 1)
+  %bcast = bitcast i8* %subfn to void (i8*)*
+  invoke fastcc void %bcast(i8* %hdl) to label %real_susp unwind label %lpad
+
+real_susp:
   %0 = call i8 @llvm.coro.suspend(token %save, i1 false)
   switch i8 %0, label %suspend [i8 0, label %resume
                                 i8 1, label %pre.cleanup]
@@ -124,17 +139,83 @@ cleanup:
 suspend:
   call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret void
+lpad:
+  %lpval = landingpad { i8*, i32 }
+     cleanup
+
+  call void @print(i32 2)
+  resume { i8*, i32 } %lpval
 }
 
+; SimplifySuspendPoint will detect that coro.resume resumes itself and will
+; replace suspend with a jump to %resume label turning it into no-suspend
+; coroutine.
+;
+; CHECK-LABEL: define void @simplify_resume_with_inlined_if(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    alloca
+; CHECK-NEXT:    bitcast
+; CHECK-NEXT:    br i1
+; CHECK:         call void @print(i32 0)
+; CHECK-NEXT:    ret void
+;
+define void @simplify_resume_with_inlined_if(i8* %src, i8* %dst, i1 %cond) "coroutine.presplit"="1" {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id)
+  br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin
+dyn.alloc:
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  br label %coro.begin
+coro.begin:
+  %phi = phi i8* [ null, %entry ], [ %alloc, %dyn.alloc ]
+  %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %phi)
+  br label %body
+body:
+  %save = call token @llvm.coro.save(i8* %hdl)
+  br i1 %cond, label %if.then, label %if.else
+if.then:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 1, i1 false)
+  br label %if.end
+if.else:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %src, i8* %dst, i64 1, i1 false)
+  br label %if.end
+if.end:
+  %subfn = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 0)
+  %bres = bitcast i8* %subfn to void (i8*)*
+  call fastcc void %bres(i8* %hdl)
+  %0 = call i8 @llvm.coro.suspend(token %save, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume
+                                i8 1, label %pre.cleanup]
+resume:
+  call void @print(i32 0)
+  br label %cleanup
+
+pre.cleanup:
+  call void @print(i32 1)
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
+  ret void
+}
+
+
+
 ; SimplifySuspendPoint won't be able to simplify if it detects that there are
 ; other calls between coro.save and coro.suspend. They potentially can call
 ; resume or destroy, so we should not simplify this suspend point.
 ;
-; CHECK-LABEL: define void @cannot_simplify(
+; CHECK-LABEL: define void @cannot_simplify_other_calls(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call i8* @malloc
+; CHECK-NEXT:     llvm.coro.id
 
-define void @cannot_simplify() {
+define void @cannot_simplify_other_calls() "coroutine.presplit"="1" {
 entry:
   %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
   %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id)
@@ -149,8 +230,117 @@ coro.begin:
   br label %body
 body:
   %save = call token @llvm.coro.save(i8* %hdl)
+  br label %body1
+
+body1:
   call void @foo()
-  call void @llvm.coro.destroy(i8* %hdl)
+  br label %body2
+
+body2:
+  %subfn = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 1)
+  %bcast = bitcast i8* %subfn to void (i8*)*
+  call fastcc void %bcast(i8* %hdl)
+  %0 = call i8 @llvm.coro.suspend(token %save, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume
+                                i8 1, label %pre.cleanup]
+resume:
+  call void @print(i32 0)
+  br label %cleanup
+
+pre.cleanup:
+  call void @print(i32 1)
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
+  ret void
+}
+
+; SimplifySuspendPoint won't be able to simplify if it detects that there are
+; other calls between coro.save and coro.suspend. They potentially can call
+; resume or destroy, so we should not simplify this suspend point.
+;
+; CHECK-LABEL: define void @cannot_simplify_calls_in_terminator(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:     llvm.coro.id
+
+define void @cannot_simplify_calls_in_terminator() "coroutine.presplit"="1" personality i32 0 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id)
+  br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin
+dyn.alloc:
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  br label %coro.begin
+coro.begin:
+  %phi = phi i8* [ null, %entry ], [ %alloc, %dyn.alloc ]
+  %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %phi)
+  br label %body
+body:
+  %save = call token @llvm.coro.save(i8* %hdl)
+  invoke void @foo() to label %resume_cont unwind label %lpad
+resume_cont:
+  %subfn = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 1)
+  %bcast = bitcast i8* %subfn to void (i8*)*
+  call fastcc void %bcast(i8* %hdl)
+  %0 = call i8 @llvm.coro.suspend(token %save, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume
+                                i8 1, label %pre.cleanup]
+resume:
+  call void @print(i32 0)
+  br label %cleanup
+
+pre.cleanup:
+  call void @print(i32 1)
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
+  ret void
+lpad:
+  %lpval = landingpad { i8*, i32 }
+     cleanup
+
+  call void @print(i32 2)
+  resume { i8*, i32 } %lpval
+}
+
+; SimplifySuspendPoint won't be able to simplify if it detects that resume or
+; destroy does not immediately preceed coro.suspend.
+;
+; CHECK-LABEL: define void @cannot_simplify_not_last_instr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:     llvm.coro.id
+
+define void @cannot_simplify_not_last_instr(i8* %dst, i8* %src) "coroutine.presplit"="1" {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id)
+  br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin
+dyn.alloc:
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  br label %coro.begin
+coro.begin:
+  %phi = phi i8* [ null, %entry ], [ %alloc, %dyn.alloc ]
+  %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %phi)
+  br label %body
+body:
+  %save = call token @llvm.coro.save(i8* %hdl)
+  %subfn = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 1)
+  %bcast = bitcast i8* %subfn to void (i8*)*
+  call fastcc void %bcast(i8* %hdl)
+  ; memcpy separates destory from suspend, therefore cannot simplify.
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 1, i1 false)
   %0 = call i8 @llvm.coro.suspend(token %save, i1 false)
   switch i8 %0, label %suspend [i8 0, label %resume
                                 i8 1, label %pre.cleanup]
@@ -185,5 +375,6 @@ declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
 declare i1 @llvm.coro.end(i8*, i1)
 
-declare void @llvm.coro.resume(i8*)
-declare void @llvm.coro.destroy(i8*)
+declare i8* @llvm.coro.subfn.addr(i8*, i8)
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
diff --git a/test/Transforms/CorrelatedValuePropagation/ashr.ll b/test/Transforms/CorrelatedValuePropagation/ashr.ll
index 88b9ed08b015ab5d9116bcf98794c17b6a6a6625..5aa0754cf6a2784d960d23ece2227698aebb8118 100644
--- a/test/Transforms/CorrelatedValuePropagation/ashr.ll
+++ b/test/Transforms/CorrelatedValuePropagation/ashr.ll
@@ -1,5 +1,11 @@
 ; RUN: opt < %s -correlated-propagation -S | FileCheck %s
 
+; Check that debug locations are preserved. For more info see:
+;   https://llvm.org/docs/SourceLevelDebugging.html#fixing-errors
+; RUN: opt < %s -enable-debugify -correlated-propagation -S 2>&1 | \
+; RUN:   FileCheck %s -check-prefix=DEBUG
+; DEBUG: CheckModuleDebugify: PASS
+
 ; CHECK-LABEL: @test1
 define void @test1(i32 %n) {
 entry:
diff --git a/test/Transforms/CorrelatedValuePropagation/basic.ll b/test/Transforms/CorrelatedValuePropagation/basic.ll
index 14b9a1999cc3ceab7308ed8441a6f2b09546829d..520d7c2b66cbf23b7c209f84b62b88b867adf411 100644
--- a/test/Transforms/CorrelatedValuePropagation/basic.ll
+++ b/test/Transforms/CorrelatedValuePropagation/basic.ll
@@ -1,74 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -correlated-propagation -S | FileCheck %s
 ; PR2581
 
+define i32 @test1(i1 %C) {
 ; CHECK-LABEL: @test1(
-define i32 @test1(i1 %C) nounwind  {
-        br i1 %C, label %exit, label %body
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[EXIT:%.*]], label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    ret i32 11
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 10
+;
+  br i1 %C, label %exit, label %body
 
 body:           ; preds = %0
-; CHECK-NOT: select
-        %A = select i1 %C, i32 10, i32 11               ; <i32> [#uses=1]
-; CHECK: ret i32 11
-        ret i32 %A
+  %A = select i1 %C, i32 10, i32 11
+  ret i32 %A
 
 exit:           ; preds = %0
-; CHECK: ret i32 10
-        ret i32 10
+  ret i32 10
 }
 
 ; PR4420
 declare i1 @ext()
-; CHECK-LABEL: @test2(
 define i1 @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = tail call i1 @ext()
+; CHECK-NEXT:    br i1 [[COND]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[COND2:%.*]] = tail call i1 @ext()
+; CHECK-NEXT:    br i1 [[COND2]], label [[BB3:%.*]], label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       bb3:
+; CHECK-NEXT:    [[RES:%.*]] = tail call i1 @ext()
+; CHECK-NEXT:    ret i1 [[RES]]
+;
 entry:
-        %cond = tail call i1 @ext()             ; <i1> [#uses=2]
-        br i1 %cond, label %bb1, label %bb2
-
-bb1:            ; preds = %entry
-        %cond2 = tail call i1 @ext()            ; <i1> [#uses=1]
-        br i1 %cond2, label %bb3, label %bb2
-
-bb2:            ; preds = %bb1, %entry
-; CHECK-NOT: phi i1
-        %cond_merge = phi i1 [ %cond, %entry ], [ false, %bb1 ]         ; <i1> [#uses=1]
-; CHECK: ret i1 false
-        ret i1 %cond_merge
-
-bb3:            ; preds = %bb1
-        %res = tail call i1 @ext()              ; <i1> [#uses=1]
-; CHECK: ret i1 %res
-        ret i1 %res
+  %cond = tail call i1 @ext()
+  br i1 %cond, label %bb1, label %bb2
+
+bb1:
+  %cond2 = tail call i1 @ext()
+  br i1 %cond2, label %bb3, label %bb2
+
+bb2:
+  %cond_merge = phi i1 [ %cond, %entry ], [ false, %bb1 ]
+  ret i1 %cond_merge
+
+bb3:
+  %res = tail call i1 @ext()
+  ret i1 %res
 }
 
 ; PR4855
 @gv = internal constant i8 7
-; CHECK-LABEL: @test3(
 define i8 @test3(i8* %a) nounwind {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8* [[A:%.*]], @gv
+; CHECK-NEXT:    br i1 [[COND]], label [[BB2:%.*]], label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret i8 0
+; CHECK:       bb2:
+; CHECK-NEXT:    [[SHOULD_BE_CONST:%.*]] = load i8, i8* @gv
+; CHECK-NEXT:    ret i8 [[SHOULD_BE_CONST]]
+;
 entry:
-        %cond = icmp eq i8* %a, @gv
-        br i1 %cond, label %bb2, label %bb
+  %cond = icmp eq i8* %a, @gv
+  br i1 %cond, label %bb2, label %bb
 
-bb:             ; preds = %entry
-        ret i8 0
+bb:
+  ret i8 0
 
-bb2:            ; preds = %entry
-; CHECK: %should_be_const = load i8, i8* @gv
-        %should_be_const = load i8, i8* %a
-        ret i8 %should_be_const
+bb2:
+  %should_be_const = load i8, i8* %a
+  ret i8 %should_be_const
 }
 
 ; PR1757
-; CHECK-LABEL: @test4(
 define i32 @test4(i32) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  EntryBlock:
+; CHECK-NEXT:    [[DOTDEMORGAN:%.*]] = icmp sgt i32 [[TMP0:%.*]], 2
+; CHECK-NEXT:    br i1 [[DOTDEMORGAN]], label [[GREATERTHANTWO:%.*]], label [[LESSTHANOREQUALTOTWO:%.*]]
+; CHECK:       GreaterThanTwo:
+; CHECK-NEXT:    br i1 false, label [[IMPOSSIBLE:%.*]], label [[NOTTWOANDGREATERTHANTWO:%.*]]
+; CHECK:       NotTwoAndGreaterThanTwo:
+; CHECK-NEXT:    ret i32 2
+; CHECK:       Impossible:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       LessThanOrEqualToTwo:
+; CHECK-NEXT:    ret i32 0
+;
 EntryBlock:
-; CHECK: icmp sgt i32 %0, 2  
-  %.demorgan = icmp sgt i32 %0, 2    
+  %.demorgan = icmp sgt i32 %0, 2
   br i1 %.demorgan, label %GreaterThanTwo, label %LessThanOrEqualToTwo
 
 GreaterThanTwo:
-; CHECK-NOT: icmp eq i32 %0, 2
   icmp eq i32 %0, 2
-; CHECK: br i1 false
   br i1 %1, label %Impossible, label %NotTwoAndGreaterThanTwo
 
 NotTwoAndGreaterThanTwo:
@@ -84,13 +114,25 @@ LessThanOrEqualToTwo:
 declare i32* @f(i32*)
 define void @test5(i32* %x, i32* %y) {
 ; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PRE:%.*]] = icmp eq i32* [[X:%.*]], null
+; CHECK-NEXT:    br i1 [[PRE]], label [[RETURN:%.*]], label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32* [ [[F:%.*]], [[LOOP]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[F]] = tail call i32* @f(i32* [[PHI]])
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32* [[F]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32* [[F]], i32* null
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32* [[SEL]], null
+; CHECK-NEXT:    br i1 [[CMP2]], label [[RETURN]], label [[LOOP]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
 entry:
   %pre = icmp eq i32* %x, null
   br i1 %pre, label %return, label %loop
 
 loop:
   %phi = phi i32* [ %sel, %loop ], [ %x, %entry ]
-; CHECK: %phi = phi i32* [ %f, %loop ], [ %x, %entry ]
   %f = tail call i32* @f(i32* %phi)
   %cmp1 = icmp ne i32* %f, %y
   %sel = select i1 %cmp1, i32* %f, i32* null
@@ -103,25 +145,32 @@ return:
 
 define i32 @switch1(i32 %s) {
 ; CHECK-LABEL: @switch1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[S:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[NEGATIVE:%.*]], label [[OUT:%.*]]
+; CHECK:       negative:
+; CHECK-NEXT:    switch i32 [[S]], label [[OUT]] [
+; CHECK-NEXT:    i32 -2, label [[NEXT:%.*]]
+; CHECK-NEXT:    i32 -1, label [[NEXT]]
+; CHECK-NEXT:    ]
+; CHECK:       out:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ -1, [[NEGATIVE]] ]
+; CHECK-NEXT:    ret i32 [[P]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %cmp = icmp slt i32 %s, 0
   br i1 %cmp, label %negative, label %out
 
 negative:
   switch i32 %s, label %out [
-; CHECK: switch i32 %s, label %out
-    i32 0, label %out
-; CHECK-NOT: i32 0
-    i32 1, label %out
-; CHECK-NOT: i32 1
-    i32 -1, label %next
-; CHECK-DAG: i32 -1, label %next
-    i32 -2, label %next
-; CHECK-DAG: i32 -2, label %next
-    i32 2, label %out
-; CHECK-NOT: i32 2
-    i32 3, label %out
-; CHECK-NOT: i32 3
+  i32 0, label %out
+  i32 1, label %out
+  i32 -1, label %next
+  i32 -2, label %next
+  i32 2, label %out
+  i32 3, label %out
   ]
 
 out:
@@ -135,17 +184,27 @@ next:
 
 define i32 @switch2(i32 %s) {
 ; CHECK-LABEL: @switch2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[S:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[POSITIVE:%.*]], label [[OUT:%.*]]
+; CHECK:       positive:
+; CHECK-NEXT:    br label [[OUT]]
+; CHECK:       out:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ 1, [[POSITIVE]] ]
+; CHECK-NEXT:    ret i32 [[P]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %cmp = icmp sgt i32 %s, 0
   br i1 %cmp, label %positive, label %out
 
 positive:
   switch i32 %s, label %out [
-    i32 0, label %out
-    i32 -1, label %next
-    i32 -2, label %next
+  i32 0, label %out
+  i32 -1, label %next
+  i32 -2, label %next
   ]
-; CHECK: br label %out
 
 out:
   %p = phi i32 [ -1, %entry ], [ 1, %positive ], [ 1, %positive ]
@@ -158,17 +217,27 @@ next:
 
 define i32 @switch3(i32 %s) {
 ; CHECK-LABEL: @switch3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[S:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[POSITIVE:%.*]], label [[OUT:%.*]]
+; CHECK:       positive:
+; CHECK-NEXT:    br label [[OUT]]
+; CHECK:       out:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ 1, [[POSITIVE]] ]
+; CHECK-NEXT:    ret i32 [[P]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %cmp = icmp sgt i32 %s, 0
   br i1 %cmp, label %positive, label %out
 
 positive:
   switch i32 %s, label %out [
-    i32 -1, label %out
-    i32 -2, label %next
-    i32 -3, label %next
+  i32 -1, label %out
+  i32 -2, label %next
+  i32 -3, label %next
   ]
-; CHECK: br label %out
 
 out:
   %p = phi i32 [ -1, %entry ], [ 1, %positive ], [ 1, %positive ]
@@ -181,17 +250,26 @@ next:
 
 define void @switch4(i32 %s) {
 ; CHECK-LABEL: @switch4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[S:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[ZERO:%.*]], label [[OUT:%.*]]
+; CHECK:       zero:
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       out:
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp = icmp eq i32 %s, 0
   br i1 %cmp, label %zero, label %out
 
 zero:
   switch i32 %s, label %out [
-    i32 0, label %next
-    i32 1, label %out
-    i32 -1, label %out
+  i32 0, label %next
+  i32 1, label %out
+  i32 -1, label %out
   ]
-; CHECK: br label %next
 
 out:
   ret void
@@ -202,7 +280,10 @@ next:
 
 define i1 @arg_attribute(i8* nonnull %a) {
 ; CHECK-LABEL: @arg_attribute(
-; CHECK: ret i1 false
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 false
+;
   %cmp = icmp eq i8* %a, null
   br label %exit
 
@@ -213,7 +294,12 @@ exit:
 declare nonnull i8* @return_nonnull()
 define i1 @call_attribute() {
 ; CHECK-LABEL: @call_attribute(
-; CHECK: ret i1 false
+; CHECK-NEXT:    [[A:%.*]] = call i8* @return_nonnull()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8* [[A]], null
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 false
+;
   %a = call i8* @return_nonnull()
   %cmp = icmp eq i8* %a, null
   br label %exit
@@ -224,6 +310,22 @@ exit:
 
 define i1 @umin(i32 %a, i32 %b) {
 ; CHECK-LABEL: @umin(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[B:%.*]], 20
+; CHECK-NEXT:    br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]]
+; CHECK:       b_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 [[B]]
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[MIN]], 7
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp ult i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -238,8 +340,6 @@ b_guard:
   %res = icmp eq i32 %min, 7
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
@@ -247,6 +347,22 @@ out:
 
 define i1 @smin(i32 %a, i32 %b) {
 ; CHECK-LABEL: @smin(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[B:%.*]], 20
+; CHECK-NEXT:    br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]]
+; CHECK:       b_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp sle i32 [[A]], [[B]]
+; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 [[B]]
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[MIN]], 7
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp ult i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -261,8 +377,6 @@ b_guard:
   %res = icmp eq i32 %min, 7
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
@@ -270,6 +384,22 @@ out:
 
 define i1 @smax(i32 %a, i32 %b) {
 ; CHECK-LABEL: @smax(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[B:%.*]], 20
+; CHECK-NEXT:    br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]]
+; CHECK:       b_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp sge i32 [[A]], [[B]]
+; CHECK-NEXT:    [[MAX:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 [[B]]
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[MAX]], 7
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp sgt i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -284,8 +414,6 @@ b_guard:
   %res = icmp eq i32 %max, 7
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
@@ -293,6 +421,22 @@ out:
 
 define i1 @umax(i32 %a, i32 %b) {
 ; CHECK-LABEL: @umax(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[B:%.*]], 20
+; CHECK-NEXT:    br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]]
+; CHECK:       b_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp uge i32 [[A]], [[B]]
+; CHECK-NEXT:    [[MAX:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 [[B]]
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[MAX]], 7
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp sgt i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -307,8 +451,6 @@ b_guard:
   %res = icmp eq i32 %max, 7
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
@@ -316,6 +458,20 @@ out:
 
 define i1 @clamp_low1(i32 %a) {
 ; CHECK-LABEL: @clamp_low1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp eq i32 [[A]], 5
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[SEL_CMP]], i32 5, i32 [[A]]
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[SEL]], 4
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp sge i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -327,8 +483,6 @@ a_guard:
   %res = icmp eq i32 %sel, 4
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
@@ -336,6 +490,20 @@ out:
 
 define i1 @clamp_low2(i32 %a) {
 ; CHECK-LABEL: @clamp_low2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ne i32 [[A]], 5
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 5
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[SEL]], 4
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp sge i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -347,8 +515,6 @@ a_guard:
   %res = icmp eq i32 %sel, 4
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
@@ -356,6 +522,20 @@ out:
 
 define i1 @clamp_high1(i32 %a) {
 ; CHECK-LABEL: @clamp_high1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp eq i32 [[A]], 5
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[SEL_CMP]], i32 5, i32 [[A]]
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[SEL]], 6
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp sle i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -367,8 +547,6 @@ a_guard:
   %res = icmp eq i32 %sel, 6
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
@@ -376,6 +554,20 @@ out:
 
 define i1 @clamp_high2(i32 %a) {
 ; CHECK-LABEL: @clamp_high2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ne i32 [[A]], 5
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 5
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[SEL]], 6
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp sle i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -387,8 +579,6 @@ a_guard:
   %res = icmp eq i32 %sel, 6
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
@@ -397,6 +587,20 @@ out:
 ; Just showing arbitrary constants work, not really a clamp
 define i1 @clamp_high3(i32 %a) {
 ; CHECK-LABEL: @clamp_high3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]]
+; CHECK:       a_guard:
+; CHECK-NEXT:    [[SEL_CMP:%.*]] = icmp ne i32 [[A]], 5
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], 100
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 5
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[SEL]], 105
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %cmp = icmp sle i32 %a, 5
   br i1 %cmp, label %a_guard, label %out
@@ -408,16 +612,20 @@ a_guard:
   %res = icmp eq i32 %sel, 105
   br label %next
 next:
-; CHECK: next:
-; CHECK: ret i1 false
   ret i1 %res
 out:
   ret i1 false
 }
 
 define i1 @zext_unknown(i8 %a) {
-; CHECK-LABEL: @zext_unknown
-; CHECK: ret i1 true
+; CHECK-LABEL: @zext_unknown(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A32:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A32]], 256
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %a32 = zext i8 %a to i32
   %cmp = icmp sle i32 %a32, 256
@@ -427,8 +635,15 @@ exit:
 }
 
 define i1 @trunc_unknown(i32 %a) {
-; CHECK-LABEL: @trunc_unknown
-; CHECK: ret i1 true
+; CHECK-LABEL: @trunc_unknown(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A8:%.*]] = trunc i32 [[A:%.*]] to i8
+; CHECK-NEXT:    [[A32:%.*]] = sext i8 [[A8]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A32]], 128
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %a8 = trunc i32 %a to i8
   %a32 = sext i8 %a8 to i32
@@ -439,11 +654,16 @@ exit:
 }
 
 ; TODO: missed optimization
-; Make sure we exercise non-integer inputs to unary operators (i.e. crash 
-; check).
+; Make sure we exercise non-integer inputs to unary operators (i.e. crash check).
 define i1 @bitcast_unknown(float %a) {
-; CHECK-LABEL: @bitcast_unknown
-; CHECK: ret i1 %cmp
+; CHECK-LABEL: @bitcast_unknown(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A32:%.*]] = bitcast float [[A:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A32]], 128
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
 entry:
   %a32 = bitcast float %a to i32
   %cmp = icmp sle i32 %a32, 128
@@ -453,8 +673,14 @@ exit:
 }
 
 define i1 @bitcast_unknown2(i8* %p) {
-; CHECK-LABEL: @bitcast_unknown2
-; CHECK: ret i1 %cmp
+; CHECK-LABEL: @bitcast_unknown2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P64:%.*]] = ptrtoint i8* [[P:%.*]] to i64
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i64 [[P64]], 128
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
 entry:
   %p64 = ptrtoint i8* %p to i64
   %cmp = icmp sle i64 %p64, 128
@@ -465,8 +691,14 @@ exit:
 
 
 define i1 @and_unknown(i32 %a) {
-; CHECK-LABEL: @and_unknown
-; CHECK: ret i1 true
+; CHECK-LABEL: @and_unknown(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A:%.*]], 128
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[AND]], 128
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %and = and i32 %a, 128
   %cmp = icmp sle i32 %and, 128
@@ -476,8 +708,14 @@ exit:
 }
 
 define i1 @lshr_unknown(i32 %a) {
-; CHECK-LABEL: @lshr_unknown
-; CHECK: ret i1 true
+; CHECK-LABEL: @lshr_unknown(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = lshr i32 [[A:%.*]], 30
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[AND]], 128
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %and = lshr i32 %a, 30
   %cmp = icmp sle i32 %and, 128
diff --git a/test/Transforms/CorrelatedValuePropagation/conflict.ll b/test/Transforms/CorrelatedValuePropagation/conflict.ll
index ef566856ed7c23d11d2a297d4bb76d906489702b..d9d655328e096854f206a23a1d9cf57e5c32118e 100644
--- a/test/Transforms/CorrelatedValuePropagation/conflict.ll
+++ b/test/Transforms/CorrelatedValuePropagation/conflict.ll
@@ -1,19 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -correlated-propagation -S < %s | FileCheck %s
-; Checks that we don't crash on conflicting facts about a value 
+
+; Checks that we don't crash on conflicting facts about a value
 ; (i.e. unreachable code)
 
 ; Test that we can handle conflict edge facts
+
 define i8 @test(i8 %a) {
-; CHECK-LABEL: @test
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP1]], label [[NEXT:%.*]], label [[EXIT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    br i1 false, label [[DEAD:%.*]], label [[EXIT]]
+; CHECK:       dead:
+; CHECK-NEXT:    ret i8 5
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
   %cmp1 = icmp eq i8 %a, 5
   br i1 %cmp1, label %next, label %exit
 next:
   %cmp2 = icmp eq i8 %a, 3
-; CHECK: br i1 false, label %dead, label %exit
   br i1 %cmp2, label %dead, label %exit
 dead:
-; CHECK-LABEL: dead:
-; CHECK: ret i8 5
 ; NOTE: undef, or 3 would be equal valid
   ret i8 %a
 exit:
@@ -23,12 +32,20 @@ exit:
 declare void @llvm.assume(i1)
 
 ; Test that we can handle conflicting assume vs edge facts
+
 define i8 @test2(i8 %a) {
-; CHECK-LABEL: @test2
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[A:%.*]], 5
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT:    br i1 false, label [[DEAD:%.*]], label [[EXIT:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    ret i8 5
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
   %cmp1 = icmp eq i8 %a, 5
   call void @llvm.assume(i1 %cmp1)
   %cmp2 = icmp eq i8 %a, 3
-; CHECK: br i1 false, label %dead, label %exit
   br i1 %cmp2, label %dead, label %exit
 dead:
   ret i8 %a
@@ -37,14 +54,22 @@ exit:
 }
 
 define i8 @test3(i8 %a) {
-; CHECK-LABEL: @test3
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP1]], label [[DEAD:%.*]], label [[EXIT:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    call void @llvm.assume(i1 false)
+; CHECK-NEXT:    ret i8 5
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
   %cmp1 = icmp eq i8 %a, 5
   br i1 %cmp1, label %dead, label %exit
 dead:
   %cmp2 = icmp eq i8 %a, 3
-; CHECK: call void @llvm.assume(i1 false)
   call void @llvm.assume(i1 %cmp2)
   ret i8 %a
 exit:
   ret i8 0
 }
+
diff --git a/test/Transforms/CorrelatedValuePropagation/deopt.ll b/test/Transforms/CorrelatedValuePropagation/deopt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cb1f72221bc7bfc9ea7b0c11f458c2a74b0b0690
--- /dev/null
+++ b/test/Transforms/CorrelatedValuePropagation/deopt.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -correlated-propagation -S < %s | FileCheck %s
+
+declare void @use()
+; test requires a mix of context sensative refinement, and analysis
+; of the originating IR pattern.  Neither part is enough in isolation.
+define void @test1(i1 %c, i1 %c2) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i64 -1, i64 1
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2:%.*]], i64 [[SEL]], i64 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[SEL2]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[TAKEN:%.*]], label [[UNTAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @use() [ "deopt"(i64 [[SEL2]]) ]
+; CHECK-NEXT:    ret void
+; CHECK:       untaken:
+; CHECK-NEXT:    ret void
+;
+  %sel = select i1 %c, i64 -1, i64 1
+  %sel2 = select i1 %c2, i64 %sel, i64 0
+  %cmp = icmp sgt i64 %sel2, 0
+  br i1 %cmp, label %taken, label %untaken
+taken:
+  call void @use() ["deopt" (i64 %sel2)]
+  ret void
+untaken:
+  ret void
+}
+
+declare void @llvm.assume(i1)
+declare void @llvm.experimental.guard(i1,...)
+
+; Same as test1, but with assume not branch
+define void @test1_assume(i1 %c, i1 %c2) {
+; CHECK-LABEL: @test1_assume(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i64 -1, i64 1
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2:%.*]], i64 [[SEL]], i64 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[SEL2]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    call void @use() [ "deopt"(i64 [[SEL2]]) ]
+; CHECK-NEXT:    ret void
+;
+  %sel = select i1 %c, i64 -1, i64 1
+  %sel2 = select i1 %c2, i64 %sel, i64 0
+  %cmp = icmp sgt i64 %sel2, 0
+  call void @llvm.assume(i1 %cmp)
+  call void @use() ["deopt" (i64 %sel2)]
+  ret void
+}
+
+; Same as test1, but with guard not branch
+define void @test1_guard(i1 %c, i1 %c2) {
+; CHECK-LABEL: @test1_guard(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i64 -1, i64 1
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2:%.*]], i64 [[SEL]], i64 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[SEL2]], 0
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[CMP]]) [ "deopt"(i64 [[SEL2]]) ]
+; CHECK-NEXT:    call void @use() [ "deopt"(i64 [[SEL2]]) ]
+; CHECK-NEXT:    ret void
+;
+  %sel = select i1 %c, i64 -1, i64 1
+  %sel2 = select i1 %c2, i64 %sel, i64 0
+  %cmp = icmp sgt i64 %sel2, 0
+  call void (i1, ...) @llvm.experimental.guard(i1 %cmp) ["deopt" (i64 %sel2)]
+  call void @use() ["deopt" (i64 %sel2)]
+  ret void
+}
+
+;; The rest of these are slight variations on the patterns
+;; producing 1 of several adjacent constants to test generality
+
+define void @test2(i1 %c, i1 %c2) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2:%.*]], i64 [[SEL]], i64 -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[SEL2]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[TAKEN:%.*]], label [[UNTAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @use() [ "deopt"(i64 [[SEL2]]) ]
+; CHECK-NEXT:    ret void
+; CHECK:       untaken:
+; CHECK-NEXT:    ret void
+;
+  %sel = select i1 %c, i64 0, i64 1
+  %sel2 = select i1 %c2, i64 %sel, i64 -1
+  %cmp = icmp sgt i64 %sel2, 0
+  br i1 %cmp, label %taken, label %untaken
+taken:
+  call void @use() ["deopt" (i64 %sel2)]
+  ret void
+untaken:
+  ret void
+}
+define void @test3(i1 %c, i1 %c2) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2:%.*]], i64 [[SEL]], i64 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[SEL2]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[TAKEN:%.*]], label [[UNTAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @use() [ "deopt"(i64 [[SEL2]]) ]
+; CHECK-NEXT:    ret void
+; CHECK:       untaken:
+; CHECK-NEXT:    ret void
+;
+  %sel = select i1 %c, i64 0, i64 1
+  %sel2 = select i1 %c2, i64 %sel, i64 2
+  %cmp = icmp sgt i64 %sel2, 1
+  br i1 %cmp, label %taken, label %untaken
+taken:
+  call void @use() ["deopt" (i64 %sel2)]
+  ret void
+untaken:
+  ret void
+}
+
+define void @test4(i1 %c, i1 %c2) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 0, [[SEL]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[ADD1]], [[SEL2]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[ADD2]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[TAKEN:%.*]], label [[UNTAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @use() [ "deopt"(i64 [[ADD2]]) ]
+; CHECK-NEXT:    ret void
+; CHECK:       untaken:
+; CHECK-NEXT:    ret void
+;
+  %sel = select i1 %c, i64 0, i64 1
+  %sel2 = select i1 %c2, i64 0, i64 1
+  %add1 = add i64 0, %sel
+  %add2 = add i64 %add1, %sel2
+  %cmp = icmp sgt i64 %add2, 0
+  br i1 %cmp, label %taken, label %untaken
+taken:
+  call void @use() ["deopt" (i64 %add2)]
+  ret void
+untaken:
+  ret void
+}
diff --git a/test/Transforms/CorrelatedValuePropagation/icmp.ll b/test/Transforms/CorrelatedValuePropagation/icmp.ll
index 9e525a39dad56c63e4bc51ed4e9c847c630c9183..de97e858b58db81e2b420afee896686f241cef4c 100644
--- a/test/Transforms/CorrelatedValuePropagation/icmp.ll
+++ b/test/Transforms/CorrelatedValuePropagation/icmp.ll
@@ -1,20 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -correlated-propagation -S %s | FileCheck %s
 ; RUN: opt -passes=correlated-propagation -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
 
-; Function Attrs: noreturn
 declare void @check1(i1) #1
-
-; Function Attrs: noreturn
 declare void @check2(i1) #1
 
 ; Make sure we propagate the value of %tmp35 to the true/false cases
-; CHECK-LABEL: @test1
-; CHECK: call void @check1(i1 false)
-; CHECK: call void @check2(i1 true)
+
 define void @test1(i64 %tmp35) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp sgt i64 [[TMP35:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP36]], label [[BB_TRUE:%.*]], label [[BB_FALSE:%.*]]
+; CHECK:       bb_true:
+; CHECK-NEXT:    tail call void @check1(i1 false) #0
+; CHECK-NEXT:    unreachable
+; CHECK:       bb_false:
+; CHECK-NEXT:    tail call void @check2(i1 true) #0
+; CHECK-NEXT:    unreachable
+;
 bb:
   %tmp36 = icmp sgt i64 %tmp35, 0
   br i1 %tmp36, label %bb_true, label %bb_false
@@ -30,13 +37,27 @@ bb_false:
   unreachable
 }
 
-; Function Attrs: noreturn
 ; This is the same as test1 but with a diamond to ensure we
 ; get %tmp36 from both true and false BBs.
-; CHECK-LABEL: @test2
-; CHECK: call void @check1(i1 false)
-; CHECK: call void @check2(i1 true)
+
 define void @test2(i64 %tmp35, i1 %inner_cmp) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp sgt i64 [[TMP35:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP36]], label [[BB_TRUE:%.*]], label [[BB_FALSE:%.*]]
+; CHECK:       bb_true:
+; CHECK-NEXT:    br i1 [[INNER_CMP:%.*]], label [[INNER_TRUE:%.*]], label [[INNER_FALSE:%.*]]
+; CHECK:       inner_true:
+; CHECK-NEXT:    br label [[MERGE:%.*]]
+; CHECK:       inner_false:
+; CHECK-NEXT:    br label [[MERGE]]
+; CHECK:       merge:
+; CHECK-NEXT:    tail call void @check1(i1 false)
+; CHECK-NEXT:    unreachable
+; CHECK:       bb_false:
+; CHECK-NEXT:    tail call void @check2(i1 true) #0
+; CHECK-NEXT:    unreachable
+;
 bb:
   %tmp36 = icmp sgt i64 %tmp35, 0
   br i1 %tmp36, label %bb_true, label %bb_false
@@ -61,4 +82,164 @@ bb_false:
   unreachable
 }
 
+; Make sure binary operator transfer functions are run when RHS is non-constant
+
+define i1 @test3(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X:%.*]], 10
+; CHECK-NEXT:    br i1 [[CMP1]], label [[CONT1:%.*]], label [[OUT:%.*]]
+; CHECK:       cont1:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[Y:%.*]], 10
+; CHECK-NEXT:    br i1 [[CMP2]], label [[CONT2:%.*]], label [[OUT]]
+; CHECK:       cont2:
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[CONT3:%.*]]
+; CHECK:       cont3:
+; CHECK-NEXT:    br label [[OUT]]
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %cmp1 = icmp ult i32 %x, 10
+  br i1 %cmp1, label %cont1, label %out
+
+cont1:
+  %cmp2 = icmp ult i32 %y, 10
+  br i1 %cmp2, label %cont2, label %out
+
+cont2:
+  %add = add i32 %x, %y
+  br label %cont3
+
+cont3:
+  %cmp3 = icmp ult i32 %add, 25
+  br label %out
+
+out:
+  %ret = phi i1 [ true, %entry], [ true, %cont1 ], [ %cmp3, %cont3 ]
+  ret i1 %ret
+}
+
+; Same as previous but make sure nobody gets over-zealous
+
+define i1 @test4(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X:%.*]], 10
+; CHECK-NEXT:    br i1 [[CMP1]], label [[CONT1:%.*]], label [[OUT:%.*]]
+; CHECK:       cont1:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[Y:%.*]], 10
+; CHECK-NEXT:    br i1 [[CMP2]], label [[CONT2:%.*]], label [[OUT]]
+; CHECK:       cont2:
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[CONT3:%.*]]
+; CHECK:       cont3:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i32 [[ADD]], 15
+; CHECK-NEXT:    br label [[OUT]]
+; CHECK:       out:
+; CHECK-NEXT:    [[RET:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ true, [[CONT1]] ], [ [[CMP3]], [[CONT3]] ]
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+entry:
+  %cmp1 = icmp ult i32 %x, 10
+  br i1 %cmp1, label %cont1, label %out
+
+cont1:
+  %cmp2 = icmp ult i32 %y, 10
+  br i1 %cmp2, label %cont2, label %out
+
+cont2:
+  %add = add i32 %x, %y
+  br label %cont3
+
+cont3:
+  %cmp3 = icmp ult i32 %add, 15
+  br label %out
+
+out:
+  %ret = phi i1 [ true, %entry], [ true, %cont1 ], [ %cmp3, %cont3 ]
+  ret i1 %ret
+}
+
+; Make sure binary operator transfer functions are run when RHS is non-constant
+
+define i1 @test5(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP1]], label [[CONT1:%.*]], label [[OUT:%.*]]
+; CHECK:       cont1:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[Y:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP2]], label [[CONT2:%.*]], label [[OUT]]
+; CHECK:       cont2:
+; CHECK-NEXT:    [[SHIFTED:%.*]] = shl i32 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[CONT3:%.*]]
+; CHECK:       cont3:
+; CHECK-NEXT:    br label [[OUT]]
+; CHECK:       out:
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %cmp1 = icmp ult i32 %x, 5
+  br i1 %cmp1, label %cont1, label %out
+
+cont1:
+  %cmp2 = icmp ult i32 %y, 5
+  br i1 %cmp2, label %cont2, label %out
+
+cont2:
+  %shifted = shl i32 %x, %y
+  br label %cont3
+
+cont3:
+  %cmp3 = icmp ult i32 %shifted, 65536
+  br label %out
+
+out:
+  %ret = phi i1 [ true, %entry], [ true, %cont1 ], [ %cmp3, %cont3 ]
+  ret i1 %ret
+}
+
+; Same as previous but make sure nobody gets over-zealous
+
+define i1 @test6(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP1]], label [[CONT1:%.*]], label [[OUT:%.*]]
+; CHECK:       cont1:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[Y:%.*]], 15
+; CHECK-NEXT:    br i1 [[CMP2]], label [[CONT2:%.*]], label [[OUT]]
+; CHECK:       cont2:
+; CHECK-NEXT:    [[SHIFTED:%.*]] = shl i32 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[CONT3:%.*]]
+; CHECK:       cont3:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i32 [[SHIFTED]], 65536
+; CHECK-NEXT:    br label [[OUT]]
+; CHECK:       out:
+; CHECK-NEXT:    [[RET:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ true, [[CONT1]] ], [ [[CMP3]], [[CONT3]] ]
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+entry:
+  %cmp1 = icmp ult i32 %x, 5
+  br i1 %cmp1, label %cont1, label %out
+
+cont1:
+  %cmp2 = icmp ult i32 %y, 15
+  br i1 %cmp2, label %cont2, label %out
+
+cont2:
+  %shifted = shl i32 %x, %y
+  br label %cont3
+
+cont3:
+  %cmp3 = icmp ult i32 %shifted, 65536
+  br label %out
+
+out:
+  %ret = phi i1 [ true, %entry], [ true, %cont1 ], [ %cmp3, %cont3 ]
+  ret i1 %ret
+}
+
 attributes #4 = { noreturn }
diff --git a/test/Transforms/CorrelatedValuePropagation/non-null.ll b/test/Transforms/CorrelatedValuePropagation/non-null.ll
index a5cd90524fa498d519b5799b49d00358d2c26e46..d86031d312fbfe78a81dff06b835f90aa131a1e9 100644
--- a/test/Transforms/CorrelatedValuePropagation/non-null.ll
+++ b/test/Transforms/CorrelatedValuePropagation/non-null.ll
@@ -1,193 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -correlated-propagation -S | FileCheck %s
 
 define void @test1(i8* %ptr) {
-; CHECK: test1
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[PTR:%.*]]
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+;
   %A = load i8, i8* %ptr
   br label %bb
 bb:
   icmp ne i8* %ptr, null
-; CHECK-NOT: icmp
   ret void
 }
 
 define void @test1_no_null_opt(i8* %ptr) #0 {
-; CHECK: test1_no_null_opt
+; CHECK-LABEL: @test1_no_null_opt(
+; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[PTR:%.*]]
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i8* [[PTR]], null
+; CHECK-NEXT:    ret void
+;
   %A = load i8, i8* %ptr
   br label %bb
 bb:
   icmp ne i8* %ptr, null
-; CHECK: icmp ne i8* %ptr, null
   ret void
 }
 
 define void @test2(i8* %ptr) {
-; CHECK: test2
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    store i8 0, i8* [[PTR:%.*]]
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+;
   store i8 0, i8* %ptr
   br label %bb
 bb:
   icmp ne i8* %ptr, null
-; CHECK-NOT: icmp
   ret void
 }
 
 define void @test2_no_null_opt(i8* %ptr) #0 {
-; CHECK: test2_no_null_opt
+; CHECK-LABEL: @test2_no_null_opt(
+; CHECK-NEXT:    store i8 0, i8* [[PTR:%.*]]
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i8* [[PTR]], null
+; CHECK-NEXT:    ret void
+;
   store i8 0, i8* %ptr
   br label %bb
 bb:
   icmp ne i8* %ptr, null
-; CHECK: icmp ne i8* %ptr, null
   ret void
 }
 
 define void @test3() {
-; CHECK: test3
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+;
   %ptr = alloca i8
   br label %bb
 bb:
   icmp ne i8* %ptr, null
-; CHECK-NOT: icmp
   ret void
 }
 
 ;; OK to remove icmp here since ptr is coming from alloca.
+
 define void @test3_no_null_opt() #0 {
-; CHECK: test3
+; CHECK-LABEL: @test3_no_null_opt(
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+;
   %ptr = alloca i8
   br label %bb
 bb:
   icmp ne i8* %ptr, null
-; CHECK-NOT: icmp
   ret void
 }
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i1)
+
 define void @test4(i8* %dest, i8* %src) {
-; CHECK: test4
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DEST:%.*]], i8* [[SRC:%.*]], i32 1, i1 false)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 1, i1 false)
   br label %bb
 bb:
   icmp ne i8* %dest, null
   icmp ne i8* %src, null
-; CHECK-NOT: icmp
   ret void
 }
 
 define void @test4_no_null_opt(i8* %dest, i8* %src) #0 {
-; CHECK: test4_no_null_opt
+; CHECK-LABEL: @test4_no_null_opt(
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DEST:%.*]], i8* [[SRC:%.*]], i32 1, i1 false)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i8* [[DEST]], null
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i8* [[SRC]], null
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 1, i1 false)
   br label %bb
 bb:
   icmp ne i8* %dest, null
   icmp ne i8* %src, null
-; CHECK: icmp ne i8* %dest, null
-; CHECK: icmp ne i8* %src, null
   ret void
 }
 
 declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i1)
 define void @test5(i8* %dest, i8* %src) {
-; CHECK: test5
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i32(i8* [[DEST:%.*]], i8* [[SRC:%.*]], i32 1, i1 false)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 1, i1 false)
   br label %bb
 bb:
   icmp ne i8* %dest, null
   icmp ne i8* %src, null
-; CHECK-NOT: icmp
   ret void
 }
 
 define void @test5_no_null_opt(i8* %dest, i8* %src) #0 {
-; CHECK: test5_no_null_opt
+; CHECK-LABEL: @test5_no_null_opt(
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i32(i8* [[DEST:%.*]], i8* [[SRC:%.*]], i32 1, i1 false)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i8* [[DEST]], null
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i8* [[SRC]], null
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 1, i1 false)
   br label %bb
 bb:
   icmp ne i8* %dest, null
   icmp ne i8* %src, null
-; CHECK: icmp ne i8* %dest, null
-; CHECK: icmp ne i8* %src, null
   ret void
 }
 
 declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i1)
 define void @test6(i8* %dest) {
-; CHECK: test6
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* [[DEST:%.*]], i8 -1, i32 1, i1 false)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 255, i32 1, i1 false)
   br label %bb
 bb:
   icmp ne i8* %dest, null
-; CHECK-NOT: icmp
   ret void
 }
 
 define void @test6_no_null_opt(i8* %dest) #0 {
-; CHECK: test6_no_null_opt
+; CHECK-LABEL: @test6_no_null_opt(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* [[DEST:%.*]], i8 -1, i32 1, i1 false)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i8* [[DEST]], null
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 255, i32 1, i1 false)
   br label %bb
 bb:
   icmp ne i8* %dest, null
-; CHECK: icmp ne i8* %dest, null
   ret void
 }
 
 define void @test7(i8* %dest, i8* %src, i32 %len) {
-; CHECK: test7
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DEST:%.*]], i8* [[SRC:%.*]], i32 [[LEN:%.*]], i1 false)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[KEEP1:%.*]] = icmp ne i8* [[DEST]], null
+; CHECK-NEXT:    [[KEEP2:%.*]] = icmp ne i8* [[SRC]], null
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 false)
   br label %bb
 bb:
   %KEEP1 = icmp ne i8* %dest, null
-; CHECK: KEEP1
   %KEEP2 = icmp ne i8* %src, null
-; CHECK: KEEP2
   ret void
 }
 
 declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1) *, i8 addrspace(1) *, i32, i1)
 define void @test8(i8 addrspace(1) * %dest, i8 addrspace(1) * %src) {
-; CHECK: test8
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    call void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* [[DEST:%.*]], i8 addrspace(1)* [[SRC:%.*]], i32 1, i1 false)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[KEEP1:%.*]] = icmp ne i8 addrspace(1)* [[DEST]], null
+; CHECK-NEXT:    [[KEEP2:%.*]] = icmp ne i8 addrspace(1)* [[SRC]], null
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1) * %dest, i8 addrspace(1) * %src, i32 1, i1 false)
   br label %bb
 bb:
   %KEEP1 = icmp ne i8 addrspace(1) * %dest, null
-; CHECK: KEEP1
   %KEEP2 = icmp ne i8 addrspace(1) * %src, null
-; CHECK: KEEP2
   ret void
 }
 
 define void @test9(i8* %dest, i8* %src) {
-; CHECK: test9
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DEST:%.*]], i8* [[SRC:%.*]], i32 1, i1 true)
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[KEEP1:%.*]] = icmp ne i8* [[DEST]], null
+; CHECK-NEXT:    [[KEEP2:%.*]] = icmp ne i8* [[SRC]], null
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 1, i1 true)
   br label %bb
 bb:
   %KEEP1 = icmp ne i8* %dest, null
-; CHECK: KEEP1
   %KEEP2 = icmp ne i8* %src, null
-; CHECK: KEEP2
   ret void
 }
 
 declare void @test10_helper(i8* %arg1, i8* %arg2, i32 %non-pointer-arg)
+
 define void @test10(i8* %arg1, i8* %arg2, i32 %non-pointer-arg) {
-; CHECK-LABEL: @test10
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IS_NULL:%.*]] = icmp eq i8* [[ARG1:%.*]], null
+; CHECK-NEXT:    br i1 [[IS_NULL]], label [[NULL:%.*]], label [[NON_NULL:%.*]]
+; CHECK:       non_null:
+; CHECK-NEXT:    call void @test10_helper(i8* nonnull [[ARG1]], i8* [[ARG2:%.*]], i32 [[NON_POINTER_ARG:%.*]])
+; CHECK-NEXT:    br label [[NULL]]
+; CHECK:       null:
+; CHECK-NEXT:    call void @test10_helper(i8* [[ARG1]], i8* [[ARG2]], i32 [[NON_POINTER_ARG]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %is_null = icmp eq i8* %arg1, null
   br i1 %is_null, label %null, label %non_null
 
 non_null:
   call void @test10_helper(i8* %arg1, i8* %arg2, i32 %non-pointer-arg)
-  ; CHECK: call void @test10_helper(i8* nonnull %arg1, i8* %arg2, i32 %non-pointer-arg)
   br label %null
 
 null:
   call void @test10_helper(i8* %arg1, i8* %arg2, i32 %non-pointer-arg)
-  ; CHECK: call void @test10_helper(i8* %arg1, i8* %arg2, i32 %non-pointer-arg)
   ret void
 }
 
 declare void @test11_helper(i8* %arg)
+
 define void @test11(i8* %arg1, i8** %arg2) {
-; CHECK-LABEL: @test11
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IS_NULL:%.*]] = icmp eq i8* [[ARG1:%.*]], null
+; CHECK-NEXT:    br i1 [[IS_NULL]], label [[NULL:%.*]], label [[NON_NULL:%.*]]
+; CHECK:       non_null:
+; CHECK-NEXT:    br label [[MERGE:%.*]]
+; CHECK:       null:
+; CHECK-NEXT:    [[ANOTHER_ARG:%.*]] = alloca i8
+; CHECK-NEXT:    br label [[MERGE]]
+; CHECK:       merge:
+; CHECK-NEXT:    [[MERGED_ARG:%.*]] = phi i8* [ [[ANOTHER_ARG]], [[NULL]] ], [ [[ARG1]], [[NON_NULL]] ]
+; CHECK-NEXT:    call void @test11_helper(i8* nonnull [[MERGED_ARG]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %is_null = icmp eq i8* %arg1, null
   br i1 %is_null, label %null, label %non_null
@@ -202,13 +296,26 @@ null:
 merge:
   %merged_arg = phi i8* [%another_arg, %null], [%arg1, %non_null]
   call void @test11_helper(i8* %merged_arg)
-  ; CHECK: call void @test11_helper(i8* nonnull %merged_arg)
   ret void
 }
 
 declare void @test12_helper(i8* %arg)
+
 define void @test12(i8* %arg1, i8** %arg2) {
-; CHECK-LABEL: @test12
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IS_NULL:%.*]] = icmp eq i8* [[ARG1:%.*]], null
+; CHECK-NEXT:    br i1 [[IS_NULL]], label [[NULL:%.*]], label [[NON_NULL:%.*]]
+; CHECK:       non_null:
+; CHECK-NEXT:    br label [[MERGE:%.*]]
+; CHECK:       null:
+; CHECK-NEXT:    [[ANOTHER_ARG:%.*]] = load i8*, i8** [[ARG2:%.*]], !nonnull !0
+; CHECK-NEXT:    br label [[MERGE]]
+; CHECK:       merge:
+; CHECK-NEXT:    [[MERGED_ARG:%.*]] = phi i8* [ [[ANOTHER_ARG]], [[NULL]] ], [ [[ARG1]], [[NON_NULL]] ]
+; CHECK-NEXT:    call void @test12_helper(i8* nonnull [[MERGED_ARG]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %is_null = icmp eq i8* %arg1, null
   br i1 %is_null, label %null, label %non_null
@@ -223,7 +330,6 @@ null:
 merge:
   %merged_arg = phi i8* [%another_arg, %null], [%arg1, %non_null]
   call void @test12_helper(i8* %merged_arg)
-  ; CHECK: call void @test12_helper(i8* nonnull %merged_arg)
   ret void
 }
 
diff --git a/test/Transforms/CorrelatedValuePropagation/overflows.ll b/test/Transforms/CorrelatedValuePropagation/overflows.ll
index a131038b8e0daf1253108b8f3970eb64daae1543..eccc72d45dd7291172a1588074d917329382a089 100644
--- a/test/Transforms/CorrelatedValuePropagation/overflows.ll
+++ b/test/Transforms/CorrelatedValuePropagation/overflows.ll
@@ -1,5 +1,11 @@
 ; RUN: opt -S -correlated-propagation < %s | FileCheck %s
 
+; Check that debug locations are preserved. For more info see:
+;   https://llvm.org/docs/SourceLevelDebugging.html#fixing-errors
+; RUN: opt < %s -enable-debugify -correlated-propagation -S 2>&1 | \
+; RUN:   FileCheck %s -check-prefix=DEBUG
+; DEBUG: CheckModuleDebugify: PASS
+
 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
 
 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32)
diff --git a/test/Transforms/CorrelatedValuePropagation/range.ll b/test/Transforms/CorrelatedValuePropagation/range.ll
index 002dfc5a544764a1aff6cc7803b6d207471702bb..0e6915a289bcb2f5c7b009969958d7843feff0eb 100644
--- a/test/Transforms/CorrelatedValuePropagation/range.ll
+++ b/test/Transforms/CorrelatedValuePropagation/range.ll
@@ -1,8 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -correlated-propagation -S < %s | FileCheck %s
 
 declare i32 @foo()
 
 define i32 @test1(i32 %a) nounwind {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br i1 false, label [[END:%.*]], label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 2
+;
   %a.off = add i32 %a, -8
   %cmp = icmp ult i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -16,13 +28,20 @@ else:
 
 end:
   ret i32 2
-
-; CHECK-LABEL: @test1(
-; CHECK: then:
-; CHECK-NEXT: br i1 false, label %end, label %else
 }
 
 define i32 @test2(i32 %a) nounwind {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br i1 false, label [[END:%.*]], label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 2
+;
   %a.off = add i32 %a, -8
   %cmp = icmp ult i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -36,14 +55,26 @@ else:
 
 end:
   ret i32 2
-
-; CHECK-LABEL: @test2(
-; CHECK: then:
-; CHECK-NEXT: br i1 false, label %end, label %else
 }
 
-; CHECK-LABEL: @test3(
 define i32 @test3(i32 %c) nounwind {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[C:%.*]], 2
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       if.end:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[C]], 3
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END8:%.*]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    br i1 true, label [[IF_THEN4:%.*]], label [[IF_END6:%.*]]
+; CHECK:       if.end6:
+; CHECK-NEXT:    ret i32 2
+; CHECK:       if.then4:
+; CHECK-NEXT:    ret i32 3
+; CHECK:       if.end8:
+; CHECK-NEXT:    ret i32 4
+;
   %cmp = icmp slt i32 %c, 2
   br i1 %cmp, label %if.then, label %if.end
 
@@ -54,13 +85,10 @@ if.end:
   %cmp1 = icmp slt i32 %c, 3
   br i1 %cmp1, label %if.then2, label %if.end8
 
-; CHECK: if.then2
 if.then2:
   %cmp2 = icmp eq i32 %c, 2
-; CHECK: br i1 true
   br i1 %cmp2, label %if.then4, label %if.end6
 
-; CHECK: if.end6
 if.end6:
   ret i32 2
 
@@ -71,18 +99,33 @@ if.end8:
   ret i32 4
 }
 
-; CHECK-LABEL: @test4(
 define i32 @test4(i32 %c) nounwind {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:    i32 1, label [[SW_BB:%.*]]
+; CHECK-NEXT:    i32 2, label [[SW_BB]]
+; CHECK-NEXT:    i32 4, label [[SW_BB]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.bb:
+; CHECK-NEXT:    br i1 true, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[RETURN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       sw.default:
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 42, [[SW_DEFAULT]] ], [ 4, [[IF_THEN]] ], [ 9, [[IF_END]] ]
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+;
   switch i32 %c, label %sw.default [
-    i32 1, label %sw.bb
-    i32 2, label %sw.bb
-    i32 4, label %sw.bb
+  i32 1, label %sw.bb
+  i32 2, label %sw.bb
+  i32 4, label %sw.bb
   ]
 
-; CHECK: sw.bb
 sw.bb:
   %cmp = icmp sge i32 %c, 1
-; CHECK: br i1 true
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:
@@ -99,8 +142,20 @@ return:
   ret i32 %retval.0
 }
 
-; CHECK-LABEL: @test5(
 define i1 @test5(i32 %c) nounwind {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[C:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[C]], 4
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_END]], label [[IF_END8:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       if.end8:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[C]], 3
+; CHECK-NEXT:    [[OR:%.*]] = or i1 false, false
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
   %cmp = icmp slt i32 %c, 5
   br i1 %cmp, label %if.then, label %if.end
 
@@ -115,23 +170,29 @@ if.end8:
   %cmp2 = icmp eq i32 %c, 3
   %cmp3 = icmp eq i32 %c, 4
   %cmp4 = icmp eq i32 %c, 6
-; CHECK: %or = or i1 false, false
   %or = or i1 %cmp3, %cmp4
-; CHECK: ret i1 %cmp2
   ret i1 %cmp2
 }
 
-; CHECK-LABEL: @test6(
 define i1 @test6(i32 %c) nounwind {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ule i32 [[C:%.*]], 7
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[C]], 6
+; CHECK-NEXT:    br i1 [[COND]], label [[SW_BB:%.*]], label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       sw.bb:
+; CHECK-NEXT:    ret i1 true
+;
   %cmp = icmp ule i32 %c, 7
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:
-; CHECK: icmp eq i32 %c, 6
-; CHECK: br i1
   switch i32 %c, label %if.end [
-    i32 6, label %sw.bb
-    i32 8, label %sw.bb
+  i32 6, label %sw.bb
+  i32 8, label %sw.bb
   ]
 
 if.end:
@@ -139,52 +200,72 @@ if.end:
 
 sw.bb:
   %cmp2 = icmp eq i32 %c, 6
-; CHECK: ret i1 true
   ret i1 %cmp2
 }
 
-; CHECK-LABEL: @test7(
 define i1 @test7(i32 %c) nounwind {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:    i32 6, label [[SW_BB:%.*]]
+; CHECK-NEXT:    i32 7, label [[SW_BB]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.bb:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       sw.default:
+; CHECK-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[C]], 5
+; CHECK-NEXT:    [[CMP8:%.*]] = icmp eq i32 [[C]], 8
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP5]], false
+; CHECK-NEXT:    [[OR2:%.*]] = or i1 false, [[CMP8]]
+; CHECK-NEXT:    ret i1 false
+;
 entry:
- switch i32 %c, label %sw.default [
-   i32 6, label %sw.bb
-   i32 7, label %sw.bb
- ]
+  switch i32 %c, label %sw.default [
+  i32 6, label %sw.bb
+  i32 7, label %sw.bb
+  ]
 
 sw.bb:
- ret i1 true
+  ret i1 true
 
 sw.default:
- %cmp5 = icmp eq i32 %c, 5
- %cmp6 = icmp eq i32 %c, 6
- %cmp7 = icmp eq i32 %c, 7
- %cmp8 = icmp eq i32 %c, 8
-; CHECK: %or = or i1 %cmp5, false
- %or = or i1 %cmp5, %cmp6
-; CHECK: %or2 = or i1 false, %cmp8
- %or2 = or i1 %cmp7, %cmp8
- ret i1 false
+  %cmp5 = icmp eq i32 %c, 5
+  %cmp6 = icmp eq i32 %c, 6
+  %cmp7 = icmp eq i32 %c, 7
+  %cmp8 = icmp eq i32 %c, 8
+  %or = or i1 %cmp5, %cmp6
+  %or2 = or i1 %cmp7, %cmp8
+  ret i1 false
 }
 
 define i1 @test8(i64* %p) {
-; CHECK-LABEL: @test8
-; CHECK: ret i1 false
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[A:%.*]] = load i64, i64* [[P:%.*]], !range !0
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i64 [[A]], 0
+; CHECK-NEXT:    ret i1 false
+;
   %a = load i64, i64* %p, !range !{i64 4, i64 255}
   %res = icmp eq i64 %a, 0
   ret i1 %res
 }
 
 define i1 @test9(i64* %p) {
-; CHECK-LABEL: @test9
-; CHECK: ret i1 true
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[A:%.*]] = load i64, i64* [[P:%.*]], !range !1
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i64 [[A]], 0
+; CHECK-NEXT:    ret i1 true
+;
   %a = load i64, i64* %p, !range !{i64 0, i64 1}
   %res = icmp eq i64 %a, 0
   ret i1 %res
 }
 
 define i1 @test10(i64* %p) {
-; CHECK-LABEL: @test10
-; CHECK: ret i1 false
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[A:%.*]] = load i64, i64* [[P:%.*]], !range !2
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i64 [[A]], 0
+; CHECK-NEXT:    ret i1 false
+;
   %a = load i64, i64* %p, !range !{i64 4, i64 8, i64 15, i64 20}
   %res = icmp eq i64 %a, 0
   ret i1 %res
@@ -193,8 +274,14 @@ define i1 @test10(i64* %p) {
 @g = external global i32
 
 define i1 @test11() {
-; CHECK: @test11
-; CHECK: ret i1 true
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[POSITIVE:%.*]] = load i32, i32* @g, !range !3
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[POSITIVE]], 1
+; CHECK-NEXT:    [[TEST:%.*]] = icmp sgt i32 [[ADD]], 0
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    ret i1 true
+;
   %positive = load i32, i32* @g, !range !{i32 1, i32 2048}
   %add = add i32 %positive, 1
   %test = icmp sgt i32 %add, 0
@@ -206,8 +293,15 @@ next:
 
 define i32 @test12(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test12(
-; CHECK: then:
-; CHECK-NEXT: br i1 false, label %end, label %else
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br i1 false, label [[END:%.*]], label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 2
+;
   %cmp = icmp ult i32 %a, %b
   br i1 %cmp, label %then, label %else
 
@@ -224,8 +318,15 @@ end:
 
 define i32 @test12_swap(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test12_swap(
-; CHECK: then:
-; CHECK-NEXT: br i1 false, label %end, label %else
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br i1 false, label [[END:%.*]], label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 2
+;
   %cmp = icmp ugt i32 %b, %a
   br i1 %cmp, label %then, label %else
 
@@ -240,11 +341,20 @@ end:
   ret i32 2
 }
 
-define i32 @test12_neg(i32 %a, i32 %b) {
 ; The same as @test12 but the second check is on the false path
+
+define i32 @test12_neg(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test12_neg(
-; CHECK: else:
-; CHECK-NEXT: %alive = icmp eq i32 %a, -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[ALIVE:%.*]] = icmp eq i32 [[A]], -1
+; CHECK-NEXT:    br i1 [[ALIVE]], label [[END:%.*]], label [[THEN]]
+; CHECK:       then:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 2
+;
   %cmp = icmp ult i32 %a, %b
   br i1 %cmp, label %then, label %else
 
@@ -259,11 +369,19 @@ end:
   ret i32 2
 }
 
-define i32 @test12_signed(i32 %a, i32 %b) {
 ; The same as @test12 but with signed comparison
+
+define i32 @test12_signed(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test12_signed(
-; CHECK: then:
-; CHECK-NEXT: br i1 false, label %end, label %else
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br i1 false, label [[END:%.*]], label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 2
+;
   %cmp = icmp slt i32 %a, %b
   br i1 %cmp, label %then, label %else
 
@@ -280,8 +398,16 @@ end:
 
 define i32 @test13(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test13(
-; CHECK: then:
-; CHECK-NEXT: br i1 false, label %end, label %else
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A_OFF]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br i1 false, label [[END:%.*]], label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 2
+;
   %a.off = add i32 %a, -8
   %cmp = icmp ult i32 %a.off, %b
   br i1 %cmp, label %then, label %else
@@ -299,8 +425,16 @@ end:
 
 define i32 @test13_swap(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test13_swap(
-; CHECK: then:
-; CHECK-NEXT: br i1 false, label %end, label %else
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[B:%.*]], [[A_OFF]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br i1 false, label [[END:%.*]], label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 2
+;
   %a.off = add i32 %a, -8
   %cmp = icmp ugt i32 %b, %a.off
   br i1 %cmp, label %then, label %else
@@ -318,8 +452,15 @@ end:
 
 define i1 @test14_slt(i32 %a) {
 ; CHECK-LABEL: @test14_slt(
-; CHECK: then:
-; CHECK-NEXT: %result = or i1 false, false
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[RESULT:%.*]] = or i1 false, false
+; CHECK-NEXT:    ret i1 [[RESULT]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
   %a.off = add i32 %a, -8
   %cmp = icmp slt i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -336,9 +477,16 @@ else:
 
 define i1 @test14_sle(i32 %a) {
 ; CHECK-LABEL: @test14_sle(
-; CHECK: then:
-; CHECK-NEXT: %alive = icmp eq i32 %a, 16
-; CHECK-NEXT: %result = or i1 false, %alive
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[ALIVE:%.*]] = icmp eq i32 [[A]], 16
+; CHECK-NEXT:    [[RESULT:%.*]] = or i1 false, [[ALIVE]]
+; CHECK-NEXT:    ret i1 [[RESULT]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
   %a.off = add i32 %a, -8
   %cmp = icmp sle i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -355,8 +503,15 @@ else:
 
 define i1 @test14_sgt(i32 %a) {
 ; CHECK-LABEL: @test14_sgt(
-; CHECK: then:
-; CHECK-NEXT: %result = or i1 false, false
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[RESULT:%.*]] = or i1 false, false
+; CHECK-NEXT:    ret i1 [[RESULT]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
   %a.off = add i32 %a, -8
   %cmp = icmp sgt i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -373,9 +528,16 @@ else:
 
 define i1 @test14_sge(i32 %a) {
 ; CHECK-LABEL: @test14_sge(
-; CHECK: then:
-; CHECK-NEXT: %alive = icmp eq i32 %a, 16
-; CHECK-NEXT: %result = or i1 false, %alive
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[ALIVE:%.*]] = icmp eq i32 [[A]], 16
+; CHECK-NEXT:    [[RESULT:%.*]] = or i1 false, [[ALIVE]]
+; CHECK-NEXT:    ret i1 [[RESULT]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
   %a.off = add i32 %a, -8
   %cmp = icmp sge i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -392,9 +554,16 @@ else:
 
 define i1 @test14_ule(i32 %a) {
 ; CHECK-LABEL: @test14_ule(
-; CHECK: then:
-; CHECK-NEXT: %alive = icmp eq i32 %a, 16
-; CHECK-NEXT: %result = or i1 false, %alive
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ule i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[ALIVE:%.*]] = icmp eq i32 [[A]], 16
+; CHECK-NEXT:    [[RESULT:%.*]] = or i1 false, [[ALIVE]]
+; CHECK-NEXT:    ret i1 [[RESULT]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
   %a.off = add i32 %a, -8
   %cmp = icmp ule i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -411,8 +580,15 @@ else:
 
 define i1 @test14_ugt(i32 %a) {
 ; CHECK-LABEL: @test14_ugt(
-; CHECK: then:
-; CHECK-NEXT: %result = or i1 false, false
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[RESULT:%.*]] = or i1 false, false
+; CHECK-NEXT:    ret i1 [[RESULT]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
   %a.off = add i32 %a, -8
   %cmp = icmp ugt i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -429,9 +605,16 @@ else:
 
 define i1 @test14_uge(i32 %a) {
 ; CHECK-LABEL: @test14_uge(
-; CHECK: then:
-; CHECK-NEXT: %alive = icmp eq i32 %a, 16
-; CHECK-NEXT: %result = or i1 false, %alive
+; CHECK-NEXT:    [[A_OFF:%.*]] = add i32 [[A:%.*]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i32 [[A_OFF]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[ALIVE:%.*]] = icmp eq i32 [[A]], 16
+; CHECK-NEXT:    [[RESULT:%.*]] = or i1 false, [[ALIVE]]
+; CHECK-NEXT:    ret i1 [[RESULT]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
   %a.off = add i32 %a, -8
   %cmp = icmp uge i32 %a.off, 8
   br i1 %cmp, label %then, label %else
@@ -449,8 +632,14 @@ else:
 @limit = external global i32
 define i1 @test15(i32 %a) {
 ; CHECK-LABEL: @test15(
-; CHECK: then:
-; CHECK-NEXT: ret i1 false
+; CHECK-NEXT:    [[LIMIT:%.*]] = load i32, i32* @limit, !range !4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[A:%.*]], [[LIMIT]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
   %limit = load i32, i32* @limit, !range !{i32 0, i32 256}
   %cmp = icmp ult i32 %a, %limit
   br i1 %cmp, label %then, label %else
@@ -464,6 +653,16 @@ else:
 }
 
 define i32 @test16(i8 %a) {
+; CHECK-LABEL: @test16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    br label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[A]], 93
+; CHECK-NEXT:    br i1 [[CMP]], label [[TARGET93:%.*]], label [[DISPATCH]]
+; CHECK:       target93:
+; CHECK-NEXT:    ret i32 93
+;
 entry:
   %b = zext i8 %a to i32
   br label %dispatch
@@ -472,14 +671,20 @@ dispatch:
   %cmp = icmp eq i8 %a, 93
   br i1 %cmp, label %target93, label %dispatch
 
-; CHECK-LABEL: @test16(
-; CHECK: target93:
-; CHECK-NEXT: ret i32 93
 target93:
   ret i32 %b
 }
 
 define i32 @test16_i1(i1 %a) {
+; CHECK-LABEL: @test16_i1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = zext i1 [[A:%.*]] to i32
+; CHECK-NEXT:    br label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    br i1 [[A]], label [[TRUE:%.*]], label [[DISPATCH]]
+; CHECK:       true:
+; CHECK-NEXT:    ret i32 1
+;
 entry:
   %b = zext i1 %a to i32
   br label %dispatch
@@ -487,14 +692,21 @@ entry:
 dispatch:
   br i1 %a, label %true, label %dispatch
 
-; CHECK-LABEL: @test16_i1(
-; CHECK: true:
-; CHECK-NEXT: ret i32 1
 true:
   ret i32 %b
 }
 
 define i8 @test17(i8 %a) {
+; CHECK-LABEL: @test17(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = add i8 [[A:%.*]], 3
+; CHECK-NEXT:    br label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[A]], 93
+; CHECK-NEXT:    br i1 [[CMP]], label [[TARGET93:%.*]], label [[DISPATCH]]
+; CHECK:       target93:
+; CHECK-NEXT:    ret i8 96
+;
 entry:
   %c = add i8 %a, 3
   br label %dispatch
@@ -503,14 +715,21 @@ dispatch:
   %cmp = icmp eq i8 %a, 93
   br i1 %cmp, label %target93, label %dispatch
 
-; CHECK-LABEL: @test17(
-; CHECK: target93:
-; CHECK-NEXT: ret i8 96
 target93:
   ret i8 %c
 }
 
 define i8 @test17_2(i8 %a) {
+; CHECK-LABEL: @test17_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = add i8 [[A:%.*]], [[A]]
+; CHECK-NEXT:    br label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[A]], 93
+; CHECK-NEXT:    br i1 [[CMP]], label [[TARGET93:%.*]], label [[DISPATCH]]
+; CHECK:       target93:
+; CHECK-NEXT:    ret i8 -70
+;
 entry:
   %c = add i8 %a, %a
   br label %dispatch
@@ -519,14 +738,20 @@ dispatch:
   %cmp = icmp eq i8 %a, 93
   br i1 %cmp, label %target93, label %dispatch
 
-; CHECK-LABEL: @test17_2(
-; CHECK: target93:
-; CHECK-NEXT: ret i8 -70
 target93:
   ret i8 %c
 }
 
 define i1 @test17_i1(i1 %a) {
+; CHECK-LABEL: @test17_i1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = and i1 [[A:%.*]], true
+; CHECK-NEXT:    br label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    br i1 [[A]], label [[TRUE:%.*]], label [[DISPATCH]]
+; CHECK:       true:
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %c = and i1 %a, true
   br label %dispatch
@@ -534,58 +759,92 @@ entry:
 dispatch:
   br i1 %a, label %true, label %dispatch
 
-; CHECK-LABEL: @test17_i1(
-; CHECK: true:
-; CHECK-NEXT: ret i1 true
 true:
   ret i1 %c
 }
 
 define i32 @test18(i8 %a) {
+; CHECK-LABEL: @test18(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    br label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    switch i8 [[A]], label [[DISPATCH]] [
+; CHECK-NEXT:    i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT:    i8 -111, label [[DISPATCH]]
+; CHECK-NEXT:    ]
+; CHECK:       target93:
+; CHECK-NEXT:    ret i32 93
+;
 entry:
   %b = zext i8 %a to i32
   br label %dispatch
 
 dispatch:
   switch i8 %a, label %dispatch [
-    i8 93, label %target93
-    i8 -111, label %dispatch
+  i8 93, label %target93
+  i8 -111, label %dispatch
   ]
 
-; CHECK-LABEL: @test18(
-; CHECK: target93:
-; CHECK-NEXT: ret i32 93
 target93:
   ret i32 %b
 }
 
 define i8 @test19(i8 %a) {
+; CHECK-LABEL: @test19(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = add i8 [[A:%.*]], 3
+; CHECK-NEXT:    br label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    switch i8 [[A]], label [[DISPATCH]] [
+; CHECK-NEXT:    i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT:    i8 -111, label [[DISPATCH]]
+; CHECK-NEXT:    ]
+; CHECK:       target93:
+; CHECK-NEXT:    ret i8 96
+;
 entry:
   %c = add i8 %a, 3
   br label %dispatch
 
 dispatch:
   switch i8 %a, label %dispatch [
-    i8 93, label %target93
-    i8 -111, label %dispatch
+  i8 93, label %target93
+  i8 -111, label %dispatch
   ]
 
-; CHECK-LABEL: @test19(
-; CHECK: target93:
-; CHECK-NEXT: ret i8 96
 target93:
   ret i8 %c
 }
 
+; Negative test. Shouldn't be incorrectly optimized to "ret i1 false".
+
 define i1 @test20(i64 %a) {
+; CHECK-LABEL: @test20(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = and i64 [[A:%.*]], 7
+; CHECK-NEXT:    br label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    switch i64 [[A]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT:    i64 0, label [[EXIT2:%.*]]
+; CHECK-NEXT:    i64 -2147483647, label [[EXIT2]]
+; CHECK-NEXT:    ]
+; CHECK:       default:
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i64 [[B]], 0
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[C]]
+; CHECK:       exit2:
+; CHECK-NEXT:    ret i1 false
+;
 entry:
   %b = and i64 %a, 7
   br label %dispatch
 
 dispatch:
   switch i64 %a, label %default [
-    i64 0, label %exit2
-    i64 -2147483647, label %exit2
+  i64 0, label %exit2
+  i64 -2147483647, label %exit2
   ]
 
 default:
@@ -593,11 +852,6 @@ default:
   br label %exit
 
 exit:
-; Negative test. Shouldn't be incorrectly optimized to "ret i1 false".
-; CHECK-LABEL: @test20(
-; CHECK: exit:
-; CHECK-NOT: ret i1 false
-; CHECK: exit2:
   ret i1 %c
 
 exit2:
@@ -606,7 +860,11 @@ exit2:
 
 define i1 @slt(i8 %a, i8 %b) {
 ; CHECK-LABEL: @slt(
-; CHECK: ret i1 true
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %cmp = icmp slt i8 %a, %b
   call void @llvm.assume(i1 %cmp)
@@ -616,7 +874,11 @@ entry:
 
 define i1 @sgt(i8 %a, i8 %b) {
 ; CHECK-LABEL: @sgt(
-; CHECK: ret i1 true
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %cmp = icmp sgt i8 %a, %b
   call void @llvm.assume(i1 %cmp)
@@ -626,7 +888,11 @@ entry:
 
 define i1 @ult(i8 %a, i8 %b) {
 ; CHECK-LABEL: @ult(
-; CHECK: ret i1 true
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %cmp = icmp ult i8 %a, %b
   call void @llvm.assume(i1 %cmp)
@@ -636,7 +902,11 @@ entry:
 
 define i1 @ugt(i8 %a, i8 %b) {
 ; CHECK-LABEL: @ugt(
-; CHECK: ret i1 true
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 true
+;
 entry:
   %cmp = icmp ugt i8 %a, %b
   call void @llvm.assume(i1 %cmp)
diff --git a/test/Transforms/CorrelatedValuePropagation/udiv.ll b/test/Transforms/CorrelatedValuePropagation/udiv.ll
index 19078134e6fc23fb5ad5c4ab7f2d333790bae078..cc32840dfd0f33a226249ffedcb9b1e53e12c5c5 100644
--- a/test/Transforms/CorrelatedValuePropagation/udiv.ll
+++ b/test/Transforms/CorrelatedValuePropagation/udiv.ll
@@ -1,5 +1,11 @@
 ; RUN: opt < %s -correlated-propagation -S | FileCheck %s
 
+; Check that debug locations are preserved. For more info see:
+;   https://llvm.org/docs/SourceLevelDebugging.html#fixing-errors
+; RUN: opt < %s -enable-debugify -correlated-propagation -S 2>&1 | \
+; RUN:   FileCheck %s -check-prefix=DEBUG
+; DEBUG: CheckModuleDebugify: PASS
+
 ; CHECK-LABEL: @test_nop
 define void @test_nop(i32 %n) {
 ; CHECK udiv i32 %n, 100
diff --git a/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll b/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1ec7764e72aaa123d655abd281cb989a1f3a69d8
--- /dev/null
+++ b/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll
@@ -0,0 +1,67 @@
+; RUN: opt -deadargelim -S < %s | FileCheck %s
+
+; Verify that the dbg.value intrinsics that use the dead argument and return
+; value are marked as undef to indicate that the values are optimized out.
+
+; Reproducer for PR23260.
+
+; CHECK-LABEL: define internal void @bar()
+; CHECK: call void @llvm.dbg.value(metadata i32 undef, metadata ![[LOCAL1:[0-9]+]]
+; CHECK: call void @sink()
+
+; Function Attrs: alwaysinline nounwind uwtable
+define internal i32 @bar(i32 %deadarg) #1 !dbg !10 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %deadarg, metadata !15, metadata !DIExpression()), !dbg !17
+  call void @sink(), !dbg !17
+  ret i32 123, !dbg !17
+}
+
+; CHECK-LABEL: define void @foo()
+; CHECK: call void @bar()
+; CHECK: call void @llvm.dbg.value(metadata i32 undef, metadata ![[LOCAL2:[0-9]+]]
+; CHECK: call void @bar()
+
+; Function Attrs: nounwind uwtable
+define void @foo() #0 !dbg !6 {
+entry:
+  %deadret = call i32 @bar(i32 0), !dbg !9
+  call void @llvm.dbg.value(metadata i32 %deadret, metadata !16, metadata !DIExpression()), !dbg !9
+  call i32 @bar(i32 1), !dbg !9
+  ret void, !dbg !9
+}
+
+declare void @sink() local_unnamed_addr
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { alwaysinline nounwind uwtable }
+attributes #2 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+; CHECK: ![[LOCAL1]] = !DILocalVariable(name: "local1"
+; CHECK: ![[LOCAL2]] = !DILocalVariable(name: "local2"
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "pr23260.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 8.0.0"}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !7, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null}
+!9 = !DILocation(line: 4, column: 3, scope: !6)
+!10 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14)
+!11 = !DISubroutineType(types: !12)
+!12 = !{!13, !13}
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!14 = !{!15}
+!15 = !DILocalVariable(name: "local1", arg: 1, scope: !10, file: !1, line: 2, type: !13)
+!16 = !DILocalVariable(name: "local2", arg: 1, scope: !6, file: !1, line: 2, type: !13)
+!17 = !DILocation(line: 2, column: 52, scope: !10)
diff --git a/test/Transforms/DeadArgElim/nonzero-address-spaces.ll b/test/Transforms/DeadArgElim/nonzero-address-spaces.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1b2aa06adf24897073142aad3557bc4f9781cbeb
--- /dev/null
+++ b/test/Transforms/DeadArgElim/nonzero-address-spaces.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -deadargelim %s | FileCheck %s
+
+; DeadArgumentElimination should respect the function address space
+; in the data layout.
+
+target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"
+
+; CHECK: define internal i32 @foo() addrspace(1)
+define internal i32 @foo(i32 %x) #0 {
+  tail call void asm sideeffect inteldialect "mov eax, [esp + $$4]\0A\09ret", "~{eax},~{dirflag},~{fpsr},~{flags}"()
+  unreachable
+}
+
+define i32 @f(i32 %x, i32 %y) {
+  ; CHECK: %r = call addrspace(1) i32 @foo()
+  %r = call i32 @foo(i32 %x)
+
+  ret i32 %r
+}
+
diff --git a/test/Transforms/EarlyCSE/debug-info-undef.ll b/test/Transforms/EarlyCSE/debug-info-undef.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4615aa264b6d482295c05ef86865a3fa3676f306
--- /dev/null
+++ b/test/Transforms/EarlyCSE/debug-info-undef.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S %s -early-cse | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@a = global i8 25, align 1, !dbg !0
+
+define signext i16 @b() !dbg !12 {
+entry:
+  call void @llvm.dbg.value(metadata i16 23680, metadata !17, metadata !DIExpression()), !dbg !18
+  %0 = load i8, i8* @a, align 1, !dbg !19, !tbaa !20
+  %conv = sext i8 %0 to i16, !dbg !19
+
+; CHECK: call void @llvm.dbg.value(metadata i16 undef, metadata !17, metadata !DIExpression()), !dbg !18
+; CHECK-NEXT:  call i32 (...) @optimize_me_not()
+
+  call void @llvm.dbg.value(metadata i16 %conv, metadata !17, metadata !DIExpression()), !dbg !18
+  %call = call i32 (...) @optimize_me_not(), !dbg !23
+  %1 = load i8, i8* @a, align 1, !dbg !24, !tbaa !20
+  %conv1 = sext i8 %1 to i16, !dbg !24
+  ret i16 %conv1, !dbg !25
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare i32 @optimize_me_not(...)
+
+define i32 @main() !dbg !26 {
+entry:
+  %call = call signext i16 @b(), !dbg !30
+  ret i32 0, !dbg !31
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9, !10}
+!llvm.ident = !{!11}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 8.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: GNU)
+!3 = !DIFile(filename: "patatino.c", directory: "/Users/davide/llvm-monorepo/llvm-mono/build/bin")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{i32 7, !"PIC Level", i32 2}
+!11 = !{!"clang version 8.0.0 "}
+!12 = distinct !DISubprogram(name: "b", scope: !3, file: !3, line: 2, type: !13, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !16)
+!13 = !DISubroutineType(types: !14)
+!14 = !{!15}
+!15 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+!16 = !{!17}
+!17 = !DILocalVariable(name: "i", scope: !12, file: !3, line: 3, type: !15)
+!18 = !DILocation(line: 3, column: 9, scope: !12)
+!19 = !DILocation(line: 4, column: 7, scope: !12)
+!20 = !{!21, !21, i64 0}
+!21 = !{!"omnipotent char", !22, i64 0}
+!22 = !{!"Simple C/C++ TBAA"}
+!23 = !DILocation(line: 5, column: 3, scope: !12)
+!24 = !DILocation(line: 6, column: 10, scope: !12)
+!25 = !DILocation(line: 6, column: 3, scope: !12)
+!26 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 8, type: !27, scopeLine: 8, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
+!27 = !DISubroutineType(types: !28)
+!28 = !{!29}
+!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!30 = !DILocation(line: 8, column: 14, scope: !26)
+!31 = !DILocation(line: 8, column: 19, scope: !26)
diff --git a/test/Transforms/FunctionImport/Inputs/funcimport.ll b/test/Transforms/FunctionImport/Inputs/funcimport.ll
index c4ef37a168f2fb5d4d8a23b5f4d742e3b223ec8b..201d449267121f664e6af1f0f2d7b014ef10d842 100644
--- a/test/Transforms/FunctionImport/Inputs/funcimport.ll
+++ b/test/Transforms/FunctionImport/Inputs/funcimport.ll
@@ -11,7 +11,7 @@
 define void @globalfunc1() #0 {
 entry:
   call void @funcwithpersonality()
-  call void (...) @variadic()
+  call void (...) @variadic_va_start()
   ret void
 }
 
@@ -147,8 +147,19 @@ entry:
   ret void
 }
 
-; Variadic function should not be imported because inliner doesn't handle it.
-define void @variadic(...) {
+; Variadic function without va_start can be imported because inliner
+; can handle it.
+define void @variadic_no_va_start(...) {
     ret void
 }
 
+; Variadic function with va_start should not be imported because inliner
+; doesn't handle it.
+define void @variadic_va_start(...) {
+    %ap = alloca i8*, align 8
+    %ap.0 = bitcast i8** %ap to i8*
+    call void @llvm.va_start(i8* %ap.0)
+    ret void
+}
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/Transforms/FunctionImport/funcimport.ll b/test/Transforms/FunctionImport/funcimport.ll
index bb974b587e39713ad241c91b683f58f40f95bb74..0d09d94a0fcb58c760ff663bd5114cb188eb6ad2 100644
--- a/test/Transforms/FunctionImport/funcimport.ll
+++ b/test/Transforms/FunctionImport/funcimport.ll
@@ -29,7 +29,8 @@ entry:
   call void (...) @weakfunc()
   call void (...) @linkoncefunc2()
   call void (...) @referencelargelinkonce()
-  call void (...) @variadic()
+  call void (...) @variadic_no_va_start()
+  call void (...) @variadic_va_start()
   ret i32 0
 }
 
@@ -103,11 +104,19 @@ declare void @linkoncefunc2(...) #1
 ; INSTLIMDEF-DAG: define available_externally hidden void @funcwithpersonality.llvm.{{.*}}() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !thinlto_src_module !0 {
 ; INSTLIM5-DAG: declare hidden void @funcwithpersonality.llvm.{{.*}}()
 
-; CHECK-DAG: declare void @variadic(...)
-declare void @variadic(...)
+; We can import variadic functions without a va_start, since the inliner
+; can handle them.
+; INSTLIMDEF-DAG: Import variadic_no_va_start
+; CHECK-DAG: define available_externally void @variadic_no_va_start(...) !thinlto_src_module !0 {
+declare void @variadic_no_va_start(...)
+
+; We can import variadic functions with a va_start, since the inliner
+; can sometimes handle them.
+; CHECK-DAG: define available_externally void @variadic_va_start(...)
+declare void @variadic_va_start(...)
 
 ; INSTLIMDEF-DAG: Import globalfunc2
-; INSTLIMDEF-DAG: 13 function-import - Number of functions imported
+; INSTLIMDEF-DAG: 15 function-import - Number of functions imported
 ; INSTLIMDEF-DAG: 4 function-import - Number of global variables imported
 
 ; CHECK-DAG: !0 = !{!"{{.*}}/Inputs/funcimport.ll"}
@@ -147,7 +156,7 @@ declare void @variadic(...)
 ; GUID-DAG: GUID {{.*}} is linkoncefunc
 
 ; DUMP:       Module [[M1:.*]] imports from 1 module
-; DUMP-NEXT:  13 functions imported from [[M2:.*]]
+; DUMP-NEXT:  15 functions imported from [[M2:.*]]
 ; DUMP-NEXT:  4 vars imported from [[M2]]
-; DUMP:       Imported 13 functions for Module [[M1]]
+; DUMP:       Imported 15 functions for Module [[M1]]
 ; DUMP-NEXT:  Imported 4 global variables for Module [[M1]]
diff --git a/test/Transforms/FunctionImport/funcimport_var.ll b/test/Transforms/FunctionImport/funcimport_var.ll
index a93cabba69a6891c13fe7b61c3d81de3ba535e81..edd874e6297d75a4256727fcdd62930982899722 100644
--- a/test/Transforms/FunctionImport/funcimport_var.ll
+++ b/test/Transforms/FunctionImport/funcimport_var.ll
@@ -4,7 +4,7 @@
 ; RUN: opt -module-summary %p/Inputs/funcimport_var2.ll -o %t2.bc
 ; RUN: llvm-lto -thinlto -thinlto-action=thinlink -o %t3 %t.bc %t2.bc
 ; RUN: llvm-lto -thinlto -thinlto-action=import -thinlto-index=%t3 %t.bc %t2.bc
-; RUN: llvm-lto -thinlto -thinlto-action=run %t.bc %t2.bc
+; RUN: llvm-lto -thinlto -thinlto-action=run %t.bc %t2.bc -exported-symbol=_Z4LinkPKcS0_
 ; RUN: llvm-nm %t.bc.thinlto.o | FileCheck %s
 ; RUN: llvm-lto2 run %t.bc %t2.bc -o %t.out \
 ; RUN:   -r %t.bc,_Z4LinkPKcS0_,plx \
diff --git a/test/Transforms/GCOVProfiling/three-element-mdnode.ll b/test/Transforms/GCOVProfiling/three-element-mdnode.ll
index 3136ff7a4826c4046c2b32358149cfc466b181ff..0af01188986305b84e9bec5f62755d983c08a1c8 100644
--- a/test/Transforms/GCOVProfiling/three-element-mdnode.ll
+++ b/test/Transforms/GCOVProfiling/three-element-mdnode.ll
@@ -19,7 +19,7 @@ entry:
 !1 = !DIFile(filename: "hello.cc", directory: "/home/nlewycky")
 !2 = !DIFile(filename: "hello.cc", directory: "/home/nlewycky")
 !3 = !{}
-!5 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !1, type: !6, retainedNodes: !3)
+!5 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", line: 1, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, scopeLine: 1, file: !1, scope: !1, type: !6, retainedNodes: !3)
 !6 = !DISubroutineType(types: !7)
 !7 = !{null}
 !8 = !DILocation(line: 1, scope: !5)
diff --git a/test/Transforms/GVN/PRE/pre-gep-load.ll b/test/Transforms/GVN/PRE/pre-gep-load.ll
index 1b2b4d20d31da7c2eb3b8c827f7dcc32a847c5e2..fdce79e15bba3fa9c213c835e6882a2739a4d70f 100644
--- a/test/Transforms/GVN/PRE/pre-gep-load.ll
+++ b/test/Transforms/GVN/PRE/pre-gep-load.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -gvn -enable-load-pre -S | FileCheck %s
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes=gvn -enable-load-pre -S | FileCheck %s
 
@@ -6,11 +7,49 @@ target triple = "aarch64--linux-gnu"
 
 define double @foo(i32 %stat, i32 %i, double** %p) {
 ; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[STAT:%.*]], label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[SW_BB:%.*]]
+; CHECK-NEXT:    i32 1, label [[SW_BB]]
+; CHECK-NEXT:    i32 2, label [[ENTRY_SW_BB2_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       entry.sw.bb2_crit_edge:
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load double*, double** [[P:%.*]], align 8
+; CHECK-NEXT:    [[DOTPRE1:%.*]] = sext i32 [[I:%.*]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds double, double* [[DOTPRE]], i64 [[DOTPRE1]]
+; CHECK-NEXT:    [[DOTPRE2:%.*]] = load double, double* [[ARRAYIDX5_PHI_TRANS_INSERT]], align 8
+; CHECK-NEXT:    br label [[SW_BB2:%.*]]
+; CHECK:       sw.bb:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = load double*, double** [[P]], align 8
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[TMP1]], 1.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt double [[SUB]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[RETURN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    br label [[SW_BB2]]
+; CHECK:       sw.bb2:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi double [ [[DOTPRE2]], [[ENTRY_SW_BB2_CRIT_EDGE]] ], [ [[TMP1]], [[IF_END]] ]
+; CHECK-NEXT:    [[IDXPROM3_PRE_PHI:%.*]] = phi i64 [ [[DOTPRE1]], [[ENTRY_SW_BB2_CRIT_EDGE]] ], [ [[IDXPROM]], [[IF_END]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi double* [ [[DOTPRE]], [[ENTRY_SW_BB2_CRIT_EDGE]] ], [ [[TMP0]], [[IF_END]] ]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[TMP3]], i64 [[IDXPROM3_PRE_PHI]]
+; CHECK-NEXT:    [[SUB6:%.*]] = fsub double 3.000000e+00, [[TMP2]]
+; CHECK-NEXT:    store double [[SUB6]], double* [[ARRAYIDX5]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       sw.default:
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi double [ 0.000000e+00, [[SW_DEFAULT]] ], [ [[SUB6]], [[SW_BB2]] ], [ [[SUB]], [[IF_THEN]] ]
+; CHECK-NEXT:    ret double [[RETVAL_0]]
+;
 entry:
   switch i32 %stat, label %sw.default [
-    i32 0, label %sw.bb
-    i32 1, label %sw.bb
-    i32 2, label %sw.bb2
+  i32 0, label %sw.bb
+  i32 1, label %sw.bb
+  i32 2, label %sw.bb2
   ]
 
 sw.bb:                                            ; preds = %entry, %entry
@@ -35,11 +74,8 @@ sw.bb2:                                           ; preds = %if.end, %entry
   %2 = load double*, double** %arrayidx4, align 8
   %arrayidx5 = getelementptr inbounds double, double* %2, i64 %idxprom3
   %3 = load double, double* %arrayidx5, align 8
-; CHECK: sw.bb2:
-; CHECK-NOT: sext
-; CHECK: phi double [
-; CHECK-NOT: load
   %sub6 = fsub double 3.000000e+00, %3
+  store double %sub6, double* %arrayidx5
   br label %return
 
 sw.default:                                       ; preds = %entry
@@ -55,12 +91,24 @@ return:                                           ; preds = %sw.default, %sw.bb2
 ; actually processed. Make sure we can deal with the situation.
 
 define void @test_shortcut_safe(i1 %tst, i32 %p1, i32* %a) {
-; CHECK-LABEL: define void @test_shortcut_safe
-; CHECK: [[SEXT1:%.*]] = sext i32 %p1 to i64
-; CHECK: [[PHI1:%.*]] = phi i64 [ [[SEXT1]], {{%.*}} ], [ [[PHI2:%.*]], {{%.*}} ]
-; CHECK: [[SEXT2:%.*]] = sext i32 %p1 to i64
-; CHECK: [[PHI2]] = phi i64 [ [[SEXT2]], {{.*}} ], [ [[PHI1]], {{%.*}} ]
-; CHECK: getelementptr inbounds i32, i32* %a, i64 [[PHI2]]
+; CHECK-LABEL: @test_shortcut_safe(
+; CHECK-NEXT:    br i1 [[TST:%.*]], label [[SEXT1:%.*]], label [[DOTPRE_DEST_CRIT_EDGE:%.*]]
+; CHECK:       .pre.dest_crit_edge:
+; CHECK-NEXT:    [[DOTPRE1:%.*]] = sext i32 [[P1:%.*]] to i64
+; CHECK-NEXT:    br label [[PRE_DEST:%.*]]
+; CHECK:       pre.dest:
+; CHECK-NEXT:    [[DOTPRE_PRE_PHI:%.*]] = phi i64 [ [[DOTPRE1]], [[DOTPRE_DEST_CRIT_EDGE]] ], [ [[IDXPROM2_PRE_PHI:%.*]], [[SEXT_USE:%.*]] ]
+; CHECK-NEXT:    br label [[SEXT_USE]]
+; CHECK:       sext1:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[P1]] to i64
+; CHECK-NEXT:    br label [[SEXT_USE]]
+; CHECK:       sext.use:
+; CHECK-NEXT:    [[IDXPROM2_PRE_PHI]] = phi i64 [ [[IDXPROM]], [[SEXT1]] ], [ [[DOTPRE_PRE_PHI]], [[PRE_DEST]] ]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM2_PRE_PHI]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    tail call void @g(i32 [[VAL]])
+; CHECK-NEXT:    br label [[PRE_DEST]]
+;
 
   br i1 %tst, label %sext1, label %pre.dest
 
diff --git a/test/Transforms/GlobalDCE/global_ctors.ll b/test/Transforms/GlobalDCE/global_ctors.ll
index 91bb9ab7f6bc428d89750f4a0d55fb35cd17f667..bd1a97e6ec25258a261e672a2778ca206c3967b6 100644
--- a/test/Transforms/GlobalDCE/global_ctors.ll
+++ b/test/Transforms/GlobalDCE/global_ctors.ll
@@ -1,14 +1,37 @@
 ; RUN: opt -S -globaldce < %s | FileCheck %s
 
+; Test that the presence of debug intrinsics isn't affecting GlobalDCE.
 ; CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_notremovable }]
 ; CHECK-NOT: @_GLOBAL__I_a
 
 declare void @_notremovable()
 
-@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }, { i32, void ()* } { i32 65535, void ()* @_notremovable }]
+@llvm.global_ctors = appending global [3 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }, { i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_b }, { i32, void ()* } { i32 65535, void ()* @_notremovable }]
+
+@x = internal unnamed_addr constant i8 undef, align 1
 
 ; Function Attrs: nounwind readnone
 define internal void @_GLOBAL__I_a() #1 section "__TEXT,__StaticInit,regular,pure_instructions" {
 entry:
   ret void
 }
+
+; Function Attrs: nounwind readnone
+define internal void @_GLOBAL__I_b() #1 section "__TEXT,__StaticInit,regular,pure_instructions" {
+entry:
+  tail call void @llvm.dbg.value(metadata i8* @x, metadata !4, metadata !DIExpression(DW_OP_deref, DW_OP_constu, 0, DW_OP_swap, DW_OP_xderef)), !dbg !5
+  ret void
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, emissionKind: FullDebug)
+!2 = !DIFile(filename: "filename", directory: "directory")
+!3 = distinct !DISubprogram(name: "h1", unit: !1)
+!4 = !DILocalVariable(name: "b", arg: 1, scope: !3)
+!5 = !DILocation(scope: !3)
diff --git a/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll b/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
index 17001f95468602954ee04fc92f55d280e0858b10..5b697727e7d36c5f21d88b0798c92cd5459e45f7 100644
--- a/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
+++ b/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
@@ -15,9 +15,9 @@ declare void @sink() cold
 ; CHECK: call {{.*}}@sideeffect(
 ; CHECK: call {{.*}}@realloc(
 ; CHECK-LABEL: codeRepl:
-; CHECK-NEXT: call {{.*}}@realloc2.cold.1(i64 %size, i8* %ptr)
+; CHECK-NEXT: call {{.*}}@realloc2.cold.1(i64 %size, i8* %ptr, i8** %retval.0.ce.loc)
 ; CHECK-LABEL: cleanup:
-; CHECK-NEXT: phi i8* [ null, %if.then ], [ null, %codeRepl ], [ %call, %if.end ]
+; CHECK-NEXT: phi i8* [ null, %if.then ], [ %call, %if.end ], [ %retval.0.ce.reload, %codeRepl ]
 define i8* @realloc2(i8* %ptr, i64 %size) {
 entry:
   %0 = add i64 %size, -1
diff --git a/test/Transforms/HotColdSplit/eh-pads.ll b/test/Transforms/HotColdSplit/eh-pads.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1197a54bc32f9b27a110b2dca1e5275953408485
--- /dev/null
+++ b/test/Transforms/HotColdSplit/eh-pads.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: landingpad
+; CHECK: sideeffect(i32 2)
+define void @foo(i32 %cond) personality i8 0 {
+entry:
+  invoke void @llvm.donothing() to label %normal unwind label %exception
+
+exception:
+  ; Note: EH pads are not candidates for region entry points.
+  %cleanup = landingpad i8 cleanup
+  br label %continue_exception
+
+continue_exception:
+  call void @sideeffect(i32 0)
+  call void @sideeffect(i32 1)
+  call void @sink()
+  ret void
+
+normal:
+  call void @sideeffect(i32 2)
+  ret void
+}
+
+; See llvm.org/PR39917. It's currently not safe to outline landingpad
+; instructions.
+;
+; CHECK-LABEL: define {{.*}}@bar(
+; CHECK: landingpad
+define void @bar(i32 %cond) personality i8 0 {
+entry:
+  br i1 undef, label %exit, label %continue
+
+exit:
+  ret void
+
+continue:
+  invoke void @sink() to label %normal unwind label %exception
+
+exception:
+  ; Note: EH pads are not candidates for region entry points.
+  %cleanup = landingpad i8 cleanup
+  ret void
+
+normal:
+  call void @sideeffect(i32 0)
+  call void @sideeffect(i32 1)
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: sideeffect(i32 0)
+; CHECK: sideeffect(i32 1)
+; CHECK: sink
+
+declare void @sideeffect(i32)
+
+declare void @sink() cold
+
+declare void @llvm.donothing() nounwind readnone
diff --git a/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll b/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll
new file mode 100644
index 0000000000000000000000000000000000000000..11256a443fa70245f78c72d52b946016112eb504
--- /dev/null
+++ b/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: call {{.*}}@foo.cold.1(
+; CHECK: unreachable
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: switch i32 undef, label %sw.epilog.i
+define void @foo(i32 %QMM) {
+entry:
+  switch i32 %QMM, label %entry.if.end16_crit_edge [
+    i32 1, label %if.then
+  ]
+
+entry.if.end16_crit_edge:                         ; preds = %entry
+  br label %if.end16
+
+if.then:                                          ; preds = %entry
+  br i1 undef, label %cond.true.i.i, label %_ZN10StringView8popFrontEv.exit.i
+
+cond.true.i.i:                                    ; preds = %if.then
+  ret void
+
+_ZN10StringView8popFrontEv.exit.i:                ; preds = %if.then
+  switch i32 undef, label %sw.epilog.i [
+    i32 81, label %if.end16
+    i32 82, label %sw.bb4.i
+    i32 83, label %sw.bb8.i
+    i32 84, label %sw.bb12.i
+    i32 65, label %if.end16
+    i32 66, label %sw.bb20.i
+    i32 67, label %sw.bb24.i
+    i32 68, label %sw.bb28.i
+  ]
+
+sw.bb4.i:                                         ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb8.i:                                         ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb12.i:                                        ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb20.i:                                        ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb24.i:                                        ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb28.i:                                        ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.epilog.i:                                      ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+if.end16:                                         ; preds = %sw.epilog.i, %sw.bb28.i, %sw.bb24.i, %sw.bb20.i, %sw.bb12.i, %sw.bb8.i, %sw.bb4.i, %_ZN10StringView8popFrontEv.exit.i, %_ZN10StringView8popFrontEv.exit.i, %entry.if.end16_crit_edge
+  %0 = phi i8 [ 0, %entry.if.end16_crit_edge ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 1, %sw.bb4.i ], [ 2, %sw.bb8.i ], [ 3, %sw.bb12.i ], [ 1, %sw.bb20.i ], [ 2, %sw.bb24.i ], [ 3, %sw.bb28.i ], [ 0, %sw.epilog.i ]
+  unreachable
+}
diff --git a/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll b/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll
new file mode 100644
index 0000000000000000000000000000000000000000..81a8f4231593440ef6a5970b8844d9d1a70fd24f
--- /dev/null
+++ b/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll
@@ -0,0 +1,29 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@fun
+; CHECK: call {{.*}}@fun.cold.1(
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ; This will be marked by the inverse DFS on sink-predecesors.
+  br label %sink
+
+sink:
+  call void @sink()
+
+  ; Do not allow the forward-DFS on sink-successors to mark the block again.
+  br i1 undef, label %if.then, label %if.then.exit
+
+if.then.exit:
+  ret void
+
+if.else:
+  ret void
+}
+
+declare void @sink() cold
diff --git a/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll b/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d8bd55e46dc9cd456ae9c07250d30fd1115c156e
--- /dev/null
+++ b/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+; 
+; extern __attribute__((cold)) void sink();
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+;     if (cond1) {
+;         if (cond2) {
+;             sideeffect(0);
+;         } else {
+;             sideeffect(1);
+;         }
+;         sink();
+;     } else {
+;         sideeffect(2);
+;     }
+;     sink();
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK: define {{.*}}@_Z3fooii{{.*}}#[[outlined_func_attr:[0-9]+]]
+; CHECK-NOT: _Z3fooii.cold
+; CHECK: attributes #[[outlined_func_attr]] = { {{.*}}minsize
+define void @_Z3fooii(i32, i32) {
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  %5 = load i32, i32* %3, align 4
+  %6 = icmp ne i32 %5, 0
+  br i1 %6, label %7, label %13
+
+; <label>:7:                                      ; preds = %2
+  %8 = load i32, i32* %4, align 4
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %10, label %11
+
+; <label>:10:                                     ; preds = %7
+  call void @_Z10sideeffecti(i32 0)
+  br label %12
+
+; <label>:11:                                     ; preds = %7
+  call void @_Z10sideeffecti(i32 1)
+  br label %12
+
+; <label>:12:                                     ; preds = %11, %10
+  call void @_Z4sinkv() #3
+  br label %14
+
+; <label>:13:                                     ; preds = %2
+  call void @_Z10sideeffecti(i32 2)
+  br label %14
+
+; <label>:14:                                     ; preds = %13, %12
+  call void @_Z4sinkv() #3
+  ret void
+}
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinkv() cold
diff --git a/test/Transforms/HotColdSplit/noreturn.ll b/test/Transforms/HotColdSplit/noreturn.ll
new file mode 100644
index 0000000000000000000000000000000000000000..66de93432b87511272531bb41ebc1cd480f133b8
--- /dev/null
+++ b/test/Transforms/HotColdSplit/noreturn.ll
@@ -0,0 +1,73 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+%struct.__jmp_buf_tag = type { [8 x i64], i32, %struct.__sigset_t }
+%struct.__sigset_t = type { [16 x i64] }
+
+; Don't outline noreturn calls which aren't explicitly marked cold.
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK-NOT: foo.cold.1
+define void @foo(i32, %struct.__jmp_buf_tag*) {
+  %3 = icmp eq i32 %0, 0
+  tail call void @_Z10sideeffectv()
+  br i1 %3, label %5, label %4
+
+; <label>:4:                                      ; preds = %2
+  tail call void @longjmp(%struct.__jmp_buf_tag* %1, i32 0)
+  unreachable
+
+; <label>:5:                                      ; preds = %2
+  ret void
+}
+
+; Do outline noreturn calls marked cold.
+
+; CHECK-LABEL: define {{.*}}@bar(
+; CHECK: call {{.*}}@bar.cold.1(
+define void @bar(i32) {
+  %2 = icmp eq i32 %0, 0
+  tail call void @_Z10sideeffectv()
+  br i1 %2, label %sink, label %exit
+
+sink:
+  tail call void @_Z10sideeffectv()
+  tail call void @_Z10sideeffectv()
+  tail call void @_Z10sideeffectv()
+  call void @llvm.trap()
+  unreachable
+
+exit:
+  ret void
+}
+
+; Do outline noreturn calls preceded by a cold call.
+
+; CHECK-LABEL: define {{.*}}@baz(
+; CHECK: call {{.*}}@baz.cold.1(
+define void @baz(i32, %struct.__jmp_buf_tag*) {
+  %3 = icmp eq i32 %0, 0
+  tail call void @_Z10sideeffectv()
+  br i1 %3, label %5, label %4
+
+; <label>:4:                                      ; preds = %2
+  call void @sink()
+  tail call void @longjmp(%struct.__jmp_buf_tag* %1, i32 0)
+  unreachable
+
+; <label>:5:                                      ; preds = %2
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@bar.cold.1(
+; CHECK: call {{.*}}@llvm.trap(
+
+declare void @sink() cold
+
+declare void @llvm.trap() noreturn cold
+
+declare void @_Z10sideeffectv()
+
+declare void @longjmp(%struct.__jmp_buf_tag*, i32) noreturn nounwind
diff --git a/test/Transforms/HotColdSplit/outline-cold-asm.ll b/test/Transforms/HotColdSplit/outline-cold-asm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f3685030c1d8b136eec0f81fcaea3c41d29e05b7
--- /dev/null
+++ b/test/Transforms/HotColdSplit/outline-cold-asm.ll
@@ -0,0 +1,27 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@fun(
+; CHECK: call {{.*}}@fun.cold.1(
+
+; CHECK-LABEL: define {{.*}}@fun.cold.1(
+; CHECK: asm ""
+
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ret void
+
+if.else:
+  call void asm "", ""()
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  ret void
+}
+
+declare void @sink() cold
diff --git a/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d10240ab3795d410cf208fe4ee4be1d7ef490e6c
--- /dev/null
+++ b/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -hotcoldsplit < %s 2>&1 | FileCheck %s
+
+; CHECK-LABEL: define {{.*}}@fun
+; CHECK: call {{.*}}@fun.cold.2(
+; CHECK-NEXT: ret void
+; CHECK: call {{.*}}@fun.cold.1(
+; CHECK-NEXT: ret void
+define void @fun() {
+entry:
+  br i1 undef, label %A.then, label %A.else
+
+A.else:
+  br label %A.then4
+
+A.then4:
+  br i1 undef, label %A.then5, label %A.end
+
+A.then5:
+  br label %A.cleanup
+
+A.end:
+  br label %A.cleanup
+
+A.cleanup:
+  %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ]
+  unreachable
+
+A.then:
+  br i1 undef, label %B.then, label %B.else
+
+B.then:
+  ret void
+
+B.else:
+  br label %B.then4
+
+B.then4:
+  br i1 undef, label %B.then5, label %B.end
+
+B.then5:
+  br label %B.cleanup
+
+B.end:
+  br label %B.cleanup
+
+B.cleanup:
+  %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ]
+  unreachable
+}
+
+; CHECK-LABEL: define {{.*}}@fun.cold.1(
+; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ]
+; CHECK-NEXT: unreachable
+
+; CHECK-LABEL: define {{.*}}@fun.cold.2(
+; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ]
+; CHECK-NEXT: unreachable
diff --git a/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll b/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8fd20c0cee78cc4131bbc26eb328793b1a4dc03d
--- /dev/null
+++ b/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll
@@ -0,0 +1,81 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern __attribute__((cold)) void sink();
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+;     while (true) {
+;         if (cond1) {
+;             sideeffect(0); // This is cold (it reaches sink()).
+;             break;
+;         }
+;         if (cond2) {
+;             sideeffect(1); // This is cold (it reaches sink()).
+;             break;
+;         }
+;         sideeffect(2);
+;         return;
+;     }
+;     sink();
+;     sideeffect(3);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.1
+; CHECK: call void @_Z10sideeffecti(i32 1)
+; CHECK: call void @_Z10sideeffecti(i32 11)
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.2
+; CHECK: call void @_Z10sideeffecti(i32 0)
+; CHECK: call void @_Z10sideeffecti(i32 10)
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.3
+; CHECK: call void @_Z4sinkv
+; CHECK: call void @_Z10sideeffecti(i32 3)
+
+define void @_Z3fooii(i32, i32) {
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  br label %5
+
+; <label>:5:                                      ; preds = %2
+  %6 = load i32, i32* %3, align 4
+  %7 = icmp ne i32 %6, 0
+  br i1 %7, label %8, label %9
+
+; <label>:8:                                      ; preds = %5
+  call void @_Z10sideeffecti(i32 0)
+  call void @_Z10sideeffecti(i32 10)
+  br label %14
+
+; <label>:9:                                      ; preds = %5
+  %10 = load i32, i32* %4, align 4
+  %11 = icmp ne i32 %10, 0
+  br i1 %11, label %12, label %13
+
+; <label>:12:                                     ; preds = %9
+  call void @_Z10sideeffecti(i32 1)
+  call void @_Z10sideeffecti(i32 11)
+  br label %14
+
+; <label>:13:                                     ; preds = %9
+  call void @_Z10sideeffecti(i32 2)
+  br label %15
+
+; <label>:14:                                     ; preds = %12, %8
+  call void @_Z4sinkv() #3
+  call void @_Z10sideeffecti(i32 3)
+  br label %15
+
+; <label>:15:                                     ; preds = %14, %13
+  ret void
+}
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinkv() cold
diff --git a/test/Transforms/HotColdSplit/outline-while-loop.ll b/test/Transforms/HotColdSplit/outline-while-loop.ll
index 2a132bda7f00a29f5a9ea39d7c5785b1eb6107d3..4c4e1f9284e95723217b5af970fe3fadf6823a24 100644
--- a/test/Transforms/HotColdSplit/outline-while-loop.ll
+++ b/test/Transforms/HotColdSplit/outline-while-loop.ll
@@ -55,6 +55,47 @@ if.end:                                           ; preds = %entry
   ret void
 }
 
+; This is the same as @foo, but the while loop comes after the sink block.
+; CHECK-LABEL: define {{.*}}@while_loop_after_sink(
+; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @while_loop_after_sink.cold.1
+; CHECK-LABEL: if.end:
+; CHECK: call void @sideeffect(i32 1)
+define void @while_loop_after_sink(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %sink
+
+sink:
+  tail call void (...) @sink()
+  br label %while.cond.preheader
+
+while.cond.preheader:
+  %cmp3 = icmp sgt i32 %cond, 10
+  br i1 %cmp3, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %while.cond.preheader
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]
+  %dec = add nsw i32 %cond.addr.04, -1
+  tail call void @sideeffect(i32 0) #3
+  %cmp = icmp sgt i32 %dec, 10
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %while.cond.preheader
+  ret void
+
+if.end:                                           ; preds = %entry
+  tail call void @sideeffect(i32 1)
+  ret void
+}
+
 ; CHECK-LABEL: define {{.*}}@foo.cold.1
 ; CHECK: phi i32
 ; CHECK-NEXT: add nsw i32
@@ -62,6 +103,14 @@ if.end:                                           ; preds = %entry
 ; CHECK-NEXT: icmp
 ; CHECK-NEXT: br
 
+; CHECK-LABEL: define {{.*}}@while_loop_after_sink.cold.1
+; CHECK: call {{.*}}@sink
+; CHECK: phi i32
+; CHECK-NEXT: add nsw i32
+; CHECK-NEXT: call {{.*}}@sideeffect
+; CHECK-NEXT: icmp
+; CHECK-NEXT: br
+
 declare void @sideeffect(i32)
 
 declare void @sink(...) cold
diff --git a/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll b/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6e8b13a31223b43fa0e47005cbc9d00302693919
--- /dev/null
+++ b/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: call {{.*}}@sink
+; CHECK: %p.ce = phi i32 [ 1, %coldbb ], [ 3, %coldbb2 ]
+; CHECK-NEXT: store i32 %p.ce, i32* %p.ce.out 
+
+define void @foo(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+  call void @sink()
+  call void @sideeffect()
+  call void @sideeffect()
+  br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+  br label %if.end
+
+if.end:
+  %p = phi i32 [0, %entry], [1, %coldbb], [3, %coldbb2]
+  ret void
+}
+
+declare void @sink() cold
+
+declare void @sideeffect()
diff --git a/test/Transforms/HotColdSplit/region-overlap.ll b/test/Transforms/HotColdSplit/region-overlap.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8064a545200f2ac9a1d715156ef60e58c035d41b
--- /dev/null
+++ b/test/Transforms/HotColdSplit/region-overlap.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; __attribute__((cold)) extern void sink(int);
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+;     if (cond1) {
+;         if (cond2) { // This is the first cold region we visit.
+;             sideeffect(0);
+;             sideeffect(10);
+;             sink(0);
+;         }
+;
+;         // There's a larger, overlapping cold region here. But we ignore it.
+;         // This could be improved.
+;         sideeffect(1);
+;         sideeffect(11);
+;         sink(1);
+;     }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii
+; CHECK: call {{.*}}@_Z3fooii.cold.1
+; CHECK-NOT: _Z3fooii.cold
+define void @_Z3fooii(i32, i32) {
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  %5 = load i32, i32* %3, align 4
+  %6 = icmp ne i32 %5, 0
+  br i1 %6, label %7, label %12
+
+; <label>:7:                                      ; preds = %2
+  %8 = load i32, i32* %4, align 4
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %10, label %11
+
+; <label>:10:                                     ; preds = %7
+  call void @_Z10sideeffecti(i32 0)
+  call void @_Z10sideeffecti(i32 10)
+  call void @_Z4sinki(i32 0) #3
+  br label %11
+
+; <label>:11:                                     ; preds = %10, %7
+  call void @_Z10sideeffecti(i32 1)
+  call void @_Z10sideeffecti(i32 11)
+  call void @_Z4sinki(i32 1) #3
+  br label %12
+
+; <label>:12:                                     ; preds = %11, %2
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.1
+; CHECK: call void @_Z10sideeffecti(i32 0)
+; CHECK: call void @_Z10sideeffecti(i32 10)
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinki(i32) cold
diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll
index ac7d856608cfb33e67e6f1a7e328606e3e6e760d..66c6fbb1b8d3cbf0a8a820472b95416c609b2035 100644
--- a/test/Transforms/HotColdSplit/split-cold-2.ll
+++ b/test/Transforms/HotColdSplit/split-cold-2.ll
@@ -9,6 +9,9 @@
 ; CHECK: codeRepl:
 ; CHECK-NEXT: call void @fun.cold.1
 
+; CHECK: define {{.*}}@fun.cold.1{{.*}} [[cold_attr:#[0-9]+]]
+; CHECK: attributes [[cold_attr]] = { {{.*}}noreturn
+
 define void @fun() {
 entry:
   br i1 undef, label %if.then, label %if.else
diff --git a/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll b/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f8cff9e863e246b4b3101de918ba723d81658dfe
--- /dev/null
+++ b/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@exit_block_with_same_incoming_vals
+; CHECK: call {{.*}}@exit_block_with_same_incoming_vals.cold.1(
+; CHECK-NOT: br i1 undef
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+define void @exit_block_with_same_incoming_vals(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+  call void @sink()
+  call void @sideeffect()
+  call void @sideeffect()
+  br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+  %p2 = phi i32 [0, %coldbb], [1, %coldbb2]
+  br i1 undef, label %if.end, label %coldbb2
+
+if.end:
+  %p = phi i32 [0, %entry], [1, %coldbb], [1, %coldbb2]
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@exit_block_with_distinct_incoming_vals
+; CHECK: call {{.*}}@exit_block_with_distinct_incoming_vals.cold.1(
+; CHECK-NOT: br i1 undef
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+define void @exit_block_with_distinct_incoming_vals(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+  call void @sink()
+  call void @sideeffect()
+  call void @sideeffect()
+  br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+  %p2 = phi i32 [0, %coldbb], [1, %coldbb2]
+  br i1 undef, label %if.end, label %coldbb2
+
+if.end:
+  %p = phi i32 [0, %entry], [1, %coldbb], [2, %coldbb2]
+  ret void
+}
+
+declare void @sink() cold
+
+declare void @sideeffect()
diff --git a/test/Transforms/HotColdSplit/unwind.ll b/test/Transforms/HotColdSplit/unwind.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4715ef2ab8676a852233c8918f433a8de77a9025
--- /dev/null
+++ b/test/Transforms/HotColdSplit/unwind.ll
@@ -0,0 +1,37 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; Do not mark outlined functions which resume exception unwinding as noreturn.
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: resume
+; CHECK-NOT: noreturn
+define i32 @foo(i32 %cond) personality i8 0 {
+entry:
+  invoke void @llvm.donothing() to label %normal unwind label %exception
+
+exception:
+  %cleanup = landingpad i32 cleanup
+  br i1 undef, label %normal, label %continue_exception
+
+continue_exception:
+  call void @sideeffect(i32 0)
+  call void @sideeffect(i32 1)
+  call void @sink()
+  resume i32 undef
+
+normal:
+  br i1 undef, label %continue_exception, label %exit
+
+exit:
+  call void @sideeffect(i32 2)
+  ret i32 0
+}
+
+declare void @sideeffect(i32)
+
+declare void @sink() cold
+
+declare void @llvm.donothing() nounwind readnone
diff --git a/test/Transforms/IndVarSimplify/dont-recompute.ll b/test/Transforms/IndVarSimplify/dont-recompute.ll
index c87cd6596c623b0316ed203db81a665f4ebaa76a..22087710a9ca866baebff009cf2598a33b6b682c 100644
--- a/test/Transforms/IndVarSimplify/dont-recompute.ll
+++ b/test/Transforms/IndVarSimplify/dont-recompute.ll
@@ -123,3 +123,54 @@ for.end:                                          ; preds = %for.body
   tail call void @func(i32 %soft_use)
   ret void
 }
+
+; CHECK-LABEL: @test5(
+define void @test5(i32 %m) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %add = add i32 %a.05, %m
+  %soft_use = add i32 %add, 123
+; CHECK: tail call void @func(i32 %soft_use)
+  tail call void @func(i32 %soft_use)
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 186
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+; CHECK: for.end:
+; CHECK-NOT: mul i32 %m, 186
+; CHECK:%add.lcssa = phi i32 [ %add, %for.body ]
+; CHECK-NEXT: tail call void @func(i32 %add.lcssa)
+  tail call void @func(i32 %add)
+  ret void
+}
+
+; CHECK-LABEL: @test6(
+define void @test6(i32 %m, i32* %p) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %add = add i32 %a.05, %m
+  %soft_use = add i32 %add, 123
+; CHECK: store i32 %soft_use, i32* %pidx
+  %pidx = getelementptr i32, i32* %p, i32 %add
+  store i32 %soft_use, i32* %pidx
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 186
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+; CHECK: for.end:
+; CHECK-NOT: mul i32 %m, 186
+; CHECK:%add.lcssa = phi i32 [ %add, %for.body ]
+; CHECK-NEXT: tail call void @func(i32 %add.lcssa)
+  tail call void @func(i32 %add)
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
index 961c9fd944d9340135d2e9dfe81b1fd96dce89d8..fff76675f1755bf99353aa2b2cdfcfafdf6681cc 100644
--- a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
+++ b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -indvars < %s | FileCheck %s
 
+; Do not rewrite the user outside the loop because we must keep the instruction
+; inside the loop due to store. Rewrite doesn't give us any profit.
 define void @f(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) {
 ; CHECK-LABEL: @f(
 not_zero11.preheader:
@@ -22,6 +24,42 @@ not_zero11:
   %tmp23 = icmp slt i32 %tmp22, %tmp14
   br i1 %tmp23, label %not_zero11, label %main.exit.selector
 
+main.exit.selector:
+; CHECK-LABEL: main.exit.selector:
+; CHECK:   %tmp22.lcssa = phi i32 [ %tmp22, %not_zero11 ]
+; CHECK:   %tmp24 = icmp slt i32 %tmp22.lcssa, %length.
+  %tmp24 = icmp slt i32 %tmp22, %length.i
+  br i1 %tmp24, label %not_zero11.postloop, label %leave
+
+leave:
+  ret void
+
+not_zero11.postloop:
+  ret void
+}
+
+; Rewrite the user outside the loop because there is no hard users inside the loop.
+define void @f1(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) {
+; CHECK-LABEL: @f1(
+not_zero11.preheader:
+  %tmp13 = icmp ugt i32 %length.i, %length.i.88
+  %tmp14 = select i1 %tmp13, i32 %length.i.88, i32 %length.i
+  %tmp15 = icmp sgt i32 %tmp14, 0
+  br i1 %tmp15, label %not_zero11, label %not_zero11.postloop
+
+not_zero11:
+  %v_1 = phi i32 [ %tmp22, %not_zero11 ], [ 0, %not_zero11.preheader ]
+  %tmp16 = zext i32 %v_1 to i64
+  %tmp17 = getelementptr inbounds i8, i8* %tmp8, i64 %tmp16
+  %tmp18 = load i8, i8* %tmp17, align 1
+  %tmp19 = zext i8 %tmp18 to i32
+  %tmp20 = or i32 %tmp19, %tmp10
+  %tmp21 = trunc i32 %tmp20 to i8
+  %addr22 = getelementptr inbounds i8, i8* %tmp12, i64 %tmp16
+  %tmp22 = add nuw nsw i32 %v_1, 1
+  %tmp23 = icmp slt i32 %tmp22, %tmp14
+  br i1 %tmp23, label %not_zero11, label %main.exit.selector
+
 main.exit.selector:
 ; CHECK-LABEL: main.exit.selector:
 ; CHECK: %tmp24 = icmp slt i32 %tmp14, %length.i
diff --git a/test/Transforms/InferFunctionAttrs/norecurse_debug.ll b/test/Transforms/InferFunctionAttrs/norecurse_debug.ll
new file mode 100644
index 0000000000000000000000000000000000000000..575bbac5e5a9a974a83939769242aff0408d28d5
--- /dev/null
+++ b/test/Transforms/InferFunctionAttrs/norecurse_debug.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -O2 -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-none-unknown-eabi"
+
+@foo.coefficient1 = internal unnamed_addr global i32* null, align 4, !dbg !0
+@iirLow1 = external dso_local local_unnamed_addr global i32*, align 4
+
+; Function Attrs: nounwind
+define dso_local void @foo(i32 %i2) local_unnamed_addr #0 !dbg !2 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %i2, metadata !11, metadata !DIExpression()), !dbg !18
+  %0 = load i32*, i32** @iirLow1, align 4, !dbg !19
+  store i32 0, i32* %0, align 4, !dbg !24
+  %1 = ptrtoint i32* %0 to i32, !dbg !27
+  store i32 %1, i32* bitcast (i32** @foo.coefficient1 to i32*), align 4, !dbg !28
+  ret void, !dbg !29
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!7}
+!llvm.module.flags = !{!13, !14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "coefficient1", scope: !2, file: !3, line: 5, type: !12, isLocal: true, isDefinition: true)
+!2 = distinct !DISubprogram(name: "foo", scope: !3, file: !3, line: 3, type: !4, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !7, retainedNodes: !10)
+!3 = !DIFile(filename: "norecurse_debug.c", directory: "/")
+!4 = !DISubroutineType(types: !5)
+!5 = !{null, !6}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !8, globals: !9, nameTableKind: None)
+!8 = !{}
+!9 = !{!0}
+!10 = !{!11}
+!11 = !DILocalVariable(name: "i2", arg: 1, scope: !2, file: !3, line: 3, type: !6)
+!12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 32)
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !{i32 2, !"Debug Info Version", i32 3}
+!15 = !{i32 1, !"wchar_size", i32 4}
+!16 = !{i32 1, !"min_enum_size", i32 4}
+!17 = !{!""}
+!18 = !DILocation(line: 3, column: 14, scope: !2)
+!19 = !DILocation(line: 7, column: 6, scope: !2)
+!24 = !DILocation(line: 7, column: 14, scope: !2)
+!27 = !DILocation(line: 9, column: 20, scope: !2)
+!28 = !DILocation(line: 9, column: 18, scope: !2)
+!29 = !DILocation(line: 10, column: 1, scope: !2)
+
+; CHECK: attributes #0 = { norecurse nounwind }
+; CHECK-NOT foo.coefficient1
diff --git a/test/Transforms/Inline/inline-cold-callsite-samplepgo.ll b/test/Transforms/Inline/inline-cold-callsite-samplepgo.ll
deleted file mode 100644
index cf2e75757e7eaa0fc6e6e777dfedc0e705c2981f..0000000000000000000000000000000000000000
--- a/test/Transforms/Inline/inline-cold-callsite-samplepgo.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; For SamplePGO, if -profile-sample-accurate is specified, cold callsite
-; heuristics should be honored if the caller has no profile.
-
-; RUN: opt < %s -inline -S -inline-cold-callsite-threshold=0 | FileCheck %s
-
-define i32 @callee(i32 %x) {
-  %x1 = add i32 %x, 1
-  %x2 = add i32 %x1, 1
-  %x3 = add i32 %x2, 1
-  call void @extern()
-  call void @extern()
-  ret i32 %x3
-}
-
-define i32 @caller(i32 %y1) {
-; CHECK-LABEL: @caller
-; CHECK-NOT: call i32 @callee
-  %y2 = call i32 @callee(i32 %y1)
-  ret i32 %y2
-}
-
-define i32 @caller_accurate(i32 %y1) #0 {
-; CHECK-LABEL: @caller_accurate
-; CHECK: call i32 @callee
-  %y2 = call i32 @callee(i32 %y1)
-  ret i32 %y2
-}
-
-declare void @extern()
-
-attributes #0 = { "profile-sample-accurate" }
-
-!llvm.module.flags = !{!1}
-!1 = !{i32 1, !"ProfileSummary", !2}
-!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
-!3 = !{!"ProfileFormat", !"SampleProfile"}
-!4 = !{!"TotalCount", i64 10000}
-!5 = !{!"MaxCount", i64 1000}
-!6 = !{!"MaxInternalCount", i64 1}
-!7 = !{!"MaxFunctionCount", i64 1000}
-!8 = !{!"NumCounts", i64 3}
-!9 = !{!"NumFunctions", i64 3}
-!10 = !{!"DetailedSummary", !11}
-!11 = !{!12, !13, !14}
-!12 = !{i32 10000, i64 100, i32 1}
-!13 = !{i32 999000, i64 100, i32 1}
-!14 = !{i32 999999, i64 1, i32 2}
diff --git a/test/Transforms/Inline/inline-min-legal-vector-width.ll b/test/Transforms/Inline/inline-min-legal-vector-width.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ec727419f01b62d27e2674debe49c37910eeba2a
--- /dev/null
+++ b/test/Transforms/Inline/inline-min-legal-vector-width.ll
@@ -0,0 +1,44 @@
+; RUN: opt %s -inline -S | FileCheck %s
+
+define internal void @innerSmall() "min-legal-vector-width"="128" {
+  ret void
+}
+
+define internal void @innerLarge() "min-legal-vector-width"="512" {
+  ret void
+}
+
+define internal void @innerNoAttribute() {
+  ret void
+}
+
+; We should not add an attribute during inlining. No attribute means unknown.
+; Inlining doesn't change the fact that we don't know anything about this
+; function.
+define void @outerNoAttribute() {
+  call void @innerLarge()
+  ret void
+}
+
+define void @outerConflictingAttributeSmall() "min-legal-vector-width"="128" {
+  call void @innerLarge()
+  ret void
+}
+
+define void @outerConflictingAttributeLarge() "min-legal-vector-width"="512" {
+  call void @innerSmall()
+  ret void
+}
+
+; We should remove the attribute after inlining since the callee's
+; vector width requirements are unknown.
+define void @outerAttribute() "min-legal-vector-width"="128" {
+  call void @innerNoAttribute()
+  ret void
+}
+
+; CHECK: define void @outerNoAttribute() {
+; CHECK: define void @outerConflictingAttributeSmall() #0
+; CHECK: define void @outerConflictingAttributeLarge() #0
+; CHECK: define void @outerAttribute() {
+; CHECK: attributes #0 = { "min-legal-vector-width"="512" }
diff --git a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 22f96be694d6f1f197f3eae162145d87b038acad..0a179d1f96d70f9b41819086114aaec8640aba86 100644
--- a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -934,32 +934,23 @@ define i32 @ubfe_offset_33(i32 %src, i32 %width) {
 }
 
 ; CHECK-LABEL: @ubfe_offset_0(
-; CHECK-NEXT: %1 = sub i32 32, %width
-; CHECK-NEXT: %2 = lshr i32 -1, %1
-; CHECK-NEXT: %bfe = and i32 %2, %src
-; CHECK-NEXT: ret i32 %bfe
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width)
 define i32 @ubfe_offset_0(i32 %src, i32 %width) {
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width)
   ret i32 %bfe
 }
 
 ; CHECK-LABEL: @ubfe_offset_32(
-; CHECK-NEXT: %1 = sub i32 32, %width
-; CHECK-NEXT: %2 = lshr i32 -1, %1
-; CHECK-NEXT: %bfe = and i32 %2, %src
-; CHECK-NEXT: ret i32 %bfe
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width)
 define i32 @ubfe_offset_32(i32 %src, i32 %width) {
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width)
   ret i32 %bfe
 }
 
 ; CHECK-LABEL: @ubfe_offset_31(
-; CHECK-NEXT: %1 = sub i32 32, %width
-; CHECK-NEXT: %2 = lshr i32 -1, %1
-; CHECK-NEXT: %bfe = and i32 %2, %src
-; CHECK-NEXT: ret i32 %bfe
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 31, i32 %width)
 define i32 @ubfe_offset_31(i32 %src, i32 %width) {
-  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width)
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 31, i32 %width)
   ret i32 %bfe
 }
 
@@ -1040,11 +1031,7 @@ define i64 @ubfe_offset_33_width_4_i64(i64 %src) {
 }
 
 ; CHECK-LABEL: @ubfe_offset_0_i64(
-; CHECK-NEXT: %1 = sub i32 64, %width
-; CHECK-NEXT: %2 = zext i32 %1 to i64
-; CHECK-NEXT: %3 = lshr i64 -1, %2
-; CHECK-NEXT: %bfe = and i64 %3, %src
-; CHECK-NEXT: ret i64 %bfe
+; CHECK-NEXT: %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 0, i32 %width)
 define i64 @ubfe_offset_0_i64(i64 %src, i32 %width) {
   %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 0, i32 %width)
   ret i64 %bfe
@@ -1066,12 +1053,9 @@ declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) nounwind readnone
 declare i64 @llvm.amdgcn.sbfe.i64(i64, i32, i32) nounwind readnone
 
 ; CHECK-LABEL: @sbfe_offset_31(
-; CHECK-NEXT: %1 = sub i32 32, %width
-; CHECK-NEXT: %2 = shl i32 %src, %1
-; CHECK-NEXT: %bfe = ashr i32 %2, %1
-; CHECK-NEXT: ret i32 %bfe
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 31, i32 %width)
 define i32 @sbfe_offset_31(i32 %src, i32 %width) {
-  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 32, i32 %width)
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 31, i32 %width)
   ret i32 %bfe
 }
 
diff --git a/test/Transforms/InstCombine/X86/x86-movmsk.ll b/test/Transforms/InstCombine/X86/x86-movmsk.ll
index 11acc1dbca845aaf761f6982800f56b89bb2aef0..ff323b0a0e8ab17f89c7e1bc83a44501b9f8a3a5 100644
--- a/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
 ; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
@@ -19,7 +19,7 @@ define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
 
 define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse_movmsk_ps(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[A0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
@@ -29,7 +29,7 @@ define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
 
 define i32 @test_upper_x86_sse2_movmsk_pd(<2 x double> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse2_movmsk_pd(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> [[A0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
@@ -39,7 +39,7 @@ define i32 @test_upper_x86_sse2_movmsk_pd(<2 x double> %a0) {
 
 define i32 @test_upper_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse2_pmovmskb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> [[A0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
@@ -49,7 +49,7 @@ define i32 @test_upper_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
 
 define i32 @test_upper_x86_avx_movmsk_ps_256(<8 x float> %a0) {
 ; CHECK-LABEL: @test_upper_x86_avx_movmsk_ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> [[A0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
@@ -59,7 +59,7 @@ define i32 @test_upper_x86_avx_movmsk_ps_256(<8 x float> %a0) {
 
 define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 ; CHECK-LABEL: @test_upper_x86_avx_movmsk_pd_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> [[A0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
@@ -313,6 +313,121 @@ define i32 @fold_x86_avx2_pmovmskb() {
   ret i32 %1
 }
 
+define i32 @sext_sse_movmsk_ps(<4 x i1> %x) {
+; CHECK-LABEL: @sext_sse_movmsk_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i4 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <4 x i1> %x to <4 x i32>
+  %bc = bitcast <4 x i32> %sext to <4 x float>
+  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
+  ret i32 %r
+}
+
+define i32 @sext_sse2_movmsk_pd(<2 x i1> %x) {
+; CHECK-LABEL: @sext_sse2_movmsk_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i2 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <2 x i1> %x to <2 x i64>
+  %bc = bitcast <2 x i64> %sext to <2 x double>
+  %r = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %bc)
+  ret i32 %r
+}
+
+define i32 @sext_sse2_pmovmskb_128(<16 x i1> %x) {
+; CHECK-LABEL: @sext_sse2_pmovmskb_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i1> [[X:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <16 x i1> %x to <16 x i8>
+  %r = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %sext)
+  ret i32 %r
+}
+
+define i32 @sext_avx_movmsk_ps_256(<8 x i1> %x) {
+; CHECK-LABEL: @sext_avx_movmsk_ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <8 x i1> %x to <8 x i32>
+  %bc = bitcast <8 x i32> %sext to <8 x float>
+  %r = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %bc)
+  ret i32 %r
+}
+
+define i32 @sext_avx_movmsk_pd_256(<4 x i1> %x) {
+; CHECK-LABEL: @sext_avx_movmsk_pd_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i4 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <4 x i1> %x to <4 x i64>
+  %bc = bitcast <4 x i64> %sext to <4 x double>
+  %r = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %bc)
+  ret i32 %r
+}
+
+define i32 @sext_avx2_pmovmskb(<32 x i1> %x) {
+; CHECK-LABEL: @sext_avx2_pmovmskb(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x i1> [[X:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %sext = sext <32 x i1> %x to <32 x i8>
+  %r = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %sext)
+  ret i32 %r
+}
+
+; Negative test - bitcast from scalar.
+
+define i32 @sext_sse_movmsk_ps_scalar_source(i1 %x) {
+; CHECK-LABEL: @sext_sse_movmsk_ps_scalar_source(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 [[X:%.*]] to i128
+; CHECK-NEXT:    [[BC:%.*]] = bitcast i128 [[SEXT]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[BC]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sext = sext i1 %x to i128
+  %bc = bitcast i128 %sext to <4 x float>
+  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
+  ret i32 %r
+}
+
+; Negative test - bitcast from vector type with more elements.
+
+define i32 @sext_sse_movmsk_ps_too_many_elts(<8 x i1> %x) {
+; CHECK-LABEL: @sext_sse_movmsk_ps_too_many_elts(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <8 x i1> [[X:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x i16> [[SEXT]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[BC]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sext = sext <8 x i1> %x to <8 x i16>
+  %bc = bitcast <8 x i16> %sext to <4 x float>
+  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
+  ret i32 %r
+}
+
+; TODO: We could handle this by doing a bitcasted sign-bit test after the sext?
+; But need to make sure the backend handles that correctly.
+
+define i32 @sext_sse_movmsk_ps_must_replicate_bits(<2 x i1> %x) {
+; CHECK-LABEL: @sext_sse_movmsk_ps_must_replicate_bits(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[X:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[BC]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sext = sext <2 x i1> %x to <2 x i64>
+  %bc = bitcast <2 x i64> %sext to <4 x float>
+  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
+  ret i32 %r
+}
+
 declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx)
 
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
diff --git a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll
index 7be784a452fd1f2454d85e3ffee355f51cd7920a..ca1b86c0623ab007cb6f5ff5a66d56a2a8da258a 100644
--- a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll
+++ b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll
@@ -23,18 +23,6 @@ define i1 @p0(i8 %x) {
   ret i1 %ret
 }
 
-define i1 @pv(i8 %x, i8 %y) {
-; CHECK-LABEL: @pv(
-; CHECK-NEXT:    [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sge i8 [[TMP0]], [[X:%.*]]
-; CHECK-NEXT:    ret i1 [[TMP1]]
-;
-  %tmp0 = lshr i8 -1, %y
-  %tmp1 = and i8 %tmp0, %x
-  %ret = icmp sge i8 %tmp1, %x
-  ret i1 %ret
-}
-
 ; ============================================================================ ;
 ; Vector tests
 ; ============================================================================ ;
@@ -120,8 +108,9 @@ define i1 @cv0(i8 %y) {
 ; CHECK-LABEL: @cv0(
 ; CHECK-NEXT:    [[X:%.*]] = call i8 @gen8()
 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sle i8 [[X]], [[TMP0]]
-; CHECK-NEXT:    ret i1 [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X]], [[TMP0]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp sge i8 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %x = call i8 @gen8()
   %tmp0 = lshr i8 -1, %y
@@ -196,3 +185,42 @@ define <2 x i1> @n2(<2 x i8> %x) {
   %ret = icmp sge <2 x i8> %tmp0, %x
   ret <2 x i1> %ret
 }
+
+; ============================================================================ ;
+; Potential miscompiles.
+; ============================================================================ ;
+
+define i1 @nv(i8 %x, i8 %y) {
+; CHECK-LABEL: @nv(
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp sge i8 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %tmp0 = lshr i8 -1, %y
+  %tmp1 = and i8 %tmp0, %x
+  %ret = icmp sge i8 %tmp1, %x
+  ret i1 %ret
+}
+
+define <2 x i1> @n3_vec(<2 x i8> %x) {
+; CHECK-LABEL: @n3_vec(
+; CHECK-NEXT:    [[TMP0:%.*]] = and <2 x i8> [[X:%.*]], <i8 3, i8 -1>
+; CHECK-NEXT:    [[RET:%.*]] = icmp sge <2 x i8> [[TMP0]], [[X]]
+; CHECK-NEXT:    ret <2 x i1> [[RET]]
+;
+  %tmp0 = and <2 x i8> %x, <i8 3, i8 -1>
+  %ret = icmp sge <2 x i8> %tmp0, %x
+  ret <2 x i1> %ret
+}
+
+define <3 x i1> @n4_vec(<3 x i8> %x) {
+; CHECK-LABEL: @n4_vec(
+; CHECK-NEXT:    [[TMP0:%.*]] = and <3 x i8> [[X:%.*]], <i8 3, i8 undef, i8 -1>
+; CHECK-NEXT:    [[RET:%.*]] = icmp sge <3 x i8> [[TMP0]], [[X]]
+; CHECK-NEXT:    ret <3 x i1> [[RET]]
+;
+  %tmp0 = and <3 x i8> %x, <i8 3, i8 undef, i8 -1>
+  %ret = icmp sge <3 x i8> %tmp0, %x
+  ret <3 x i1> %ret
+}
diff --git a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll
index d1792d1e075b8ca8e7f5fd28ed93d09b54116060..2957ad5731c7de3eb3afaa00c6b49db88e4c8793 100644
--- a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll
+++ b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll
@@ -23,18 +23,6 @@ define i1 @p0(i8 %x) {
   ret i1 %ret
 }
 
-define i1 @pv(i8 %x, i8 %y) {
-; CHECK-LABEL: @pv(
-; CHECK-NEXT:    [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i8 [[TMP0]], [[X:%.*]]
-; CHECK-NEXT:    ret i1 [[TMP1]]
-;
-  %tmp0 = lshr i8 -1, %y
-  %tmp1 = and i8 %tmp0, %x
-  %ret = icmp slt i8 %tmp1, %x
-  ret i1 %ret
-}
-
 ; ============================================================================ ;
 ; Vector tests
 ; ============================================================================ ;
@@ -120,8 +108,9 @@ define i1 @cv0(i8 %y) {
 ; CHECK-LABEL: @cv0(
 ; CHECK-NEXT:    [[X:%.*]] = call i8 @gen8()
 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i8 [[X]], [[TMP0]]
-; CHECK-NEXT:    ret i1 [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X]], [[TMP0]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp slt i8 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %x = call i8 @gen8()
   %tmp0 = lshr i8 -1, %y
@@ -196,3 +185,42 @@ define <2 x i1> @n2(<2 x i8> %x) {
   %ret = icmp slt <2 x i8> %tmp0, %x
   ret <2 x i1> %ret
 }
+
+; ============================================================================ ;
+; Potential miscompiles.
+; ============================================================================ ;
+
+define i1 @nv(i8 %x, i8 %y) {
+; CHECK-LABEL: @nv(
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp slt i8 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %tmp0 = lshr i8 -1, %y
+  %tmp1 = and i8 %tmp0, %x
+  %ret = icmp slt i8 %tmp1, %x
+  ret i1 %ret
+}
+
+define <2 x i1> @n3(<2 x i8> %x) {
+; CHECK-LABEL: @n3(
+; CHECK-NEXT:    [[TMP0:%.*]] = and <2 x i8> [[X:%.*]], <i8 3, i8 -1>
+; CHECK-NEXT:    [[RET:%.*]] = icmp slt <2 x i8> [[TMP0]], [[X]]
+; CHECK-NEXT:    ret <2 x i1> [[RET]]
+;
+  %tmp0 = and <2 x i8> %x, <i8 3, i8 -1>
+  %ret = icmp slt <2 x i8> %tmp0, %x
+  ret <2 x i1> %ret
+}
+
+define <3 x i1> @n4(<3 x i8> %x) {
+; CHECK-LABEL: @n4(
+; CHECK-NEXT:    [[TMP0:%.*]] = and <3 x i8> [[X:%.*]], <i8 3, i8 undef, i8 -1>
+; CHECK-NEXT:    [[RET:%.*]] = icmp slt <3 x i8> [[TMP0]], [[X]]
+; CHECK-NEXT:    ret <3 x i1> [[RET]]
+;
+  %tmp0 = and <3 x i8> %x, <i8 3, i8 undef, i8 -1>
+  %ret = icmp slt <3 x i8> %tmp0, %x
+  ret <3 x i1> %ret
+}
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 64a7416184373c1e1c0aae457ebef3f3a1450b6d..b6d1eda0601dd65508f0c23034a80f29f1450f5f 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -3,110 +3,105 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 target datalayout = "E-p:64:64:64-p1:32:32:32-p2:64:64:64-p3:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64"
 
-@inbuf = external global [32832 x i8]           ; <[32832 x i8]*> [#uses=1]
+@inbuf = external global [32832 x i8]
 
 define i32 @test1(i32 %A) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    ret i32 %A
+; CHECK-NEXT:    ret i32 [[A:%.*]]
 ;
-  %c1 = bitcast i32 %A to i32             ; <i32> [#uses=1]
-  %c2 = bitcast i32 %c1 to i32            ; <i32> [#uses=1]
+  %c1 = bitcast i32 %A to i32
+  %c2 = bitcast i32 %c1 to i32
   ret i32 %c2
 }
 
 define i64 @test2(i8 %A) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[RET:%.*]] = zext i8 %A to i64
+; CHECK-NEXT:    [[RET:%.*]] = zext i8 [[A:%.*]] to i64
 ; CHECK-NEXT:    ret i64 [[RET]]
 ;
-  %c1 = zext i8 %A to i16         ; <i16> [#uses=1]
-  %c2 = zext i16 %c1 to i32               ; <i32> [#uses=1]
-  %Ret = zext i32 %c2 to i64              ; <i64> [#uses=1]
+  %c1 = zext i8 %A to i16
+  %c2 = zext i16 %c1 to i32
+  %Ret = zext i32 %c2 to i64
   ret i64 %Ret
 }
 
-; This function should just use bitwise AND
 define i64 @test3(i64 %A) {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[C2:%.*]] = and i64 %A, 255
+; CHECK-NEXT:    [[C2:%.*]] = and i64 [[A:%.*]], 255
 ; CHECK-NEXT:    ret i64 [[C2]]
 ;
-  %c1 = trunc i64 %A to i8                ; <i8> [#uses=1]
-  %c2 = zext i8 %c1 to i64                ; <i64> [#uses=1]
+  %c1 = trunc i64 %A to i8
+  %c2 = zext i8 %c1 to i64
   ret i64 %c2
 }
 
 define i32 @test4(i32 %A, i32 %B) {
 ; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 %A, %B
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[RESULT:%.*]] = zext i1 [[COND]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
-  %COND = icmp slt i32 %A, %B             ; <i1> [#uses=1]
-  ; Booleans are unsigned integrals
-  %c = zext i1 %COND to i8                ; <i8> [#uses=1]
-  ; for the cast elim purpose
-  %result = zext i8 %c to i32             ; <i32> [#uses=1]
+  %COND = icmp slt i32 %A, %B
+  %c = zext i1 %COND to i8
+  %result = zext i8 %c to i32
   ret i32 %result
 }
 
 define i32 @test5(i1 %B) {
-        ; This cast should get folded into
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT:    [[RESULT:%.*]] = zext i1 %B to i32
+; CHECK-NEXT:    [[RESULT:%.*]] = zext i1 [[B:%.*]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
-  %c = zext i1 %B to i8           ; <i8> [#uses=1]
-  ; this cast
-  %result = zext i8 %c to i32             ; <i32> [#uses=1]
+  %c = zext i1 %B to i8
+  %result = zext i8 %c to i32
   ret i32 %result
 }
 
 define i32 @test6(i64 %A) {
 ; CHECK-LABEL: @test6(
-; CHECK-NEXT:    [[C1:%.*]] = trunc i64 %A to i32
+; CHECK-NEXT:    [[C1:%.*]] = trunc i64 [[A:%.*]] to i32
 ; CHECK-NEXT:    ret i32 [[C1]]
 ;
-  %c1 = trunc i64 %A to i32               ; <i32> [#uses=1]
-  %res = bitcast i32 %c1 to i32           ; <i32> [#uses=1]
+  %c1 = trunc i64 %A to i32
+  %res = bitcast i32 %c1 to i32
   ret i32 %res
 }
 
 define i64 @test7(i1 %A) {
 ; CHECK-LABEL: @test7(
-; CHECK-NEXT:    [[RES:%.*]] = zext i1 %A to i64
+; CHECK-NEXT:    [[RES:%.*]] = zext i1 [[A:%.*]] to i64
 ; CHECK-NEXT:    ret i64 [[RES]]
 ;
-  %c1 = zext i1 %A to i32         ; <i32> [#uses=1]
-  %res = sext i32 %c1 to i64              ; <i64> [#uses=1]
+  %c1 = zext i1 %A to i32
+  %res = sext i32 %c1 to i64
   ret i64 %res
 }
 
 define i64 @test8(i8 %A) {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT:    [[C1:%.*]] = sext i8 %A to i64
+; CHECK-NEXT:    [[C1:%.*]] = sext i8 [[A:%.*]] to i64
 ; CHECK-NEXT:    ret i64 [[C1]]
 ;
-  %c1 = sext i8 %A to i64         ; <i64> [#uses=1]
-  %res = bitcast i64 %c1 to i64           ; <i64> [#uses=1]
+  %c1 = sext i8 %A to i64
+  %res = bitcast i64 %c1 to i64
   ret i64 %res
 }
 
 define i16 @test9(i16 %A) {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    ret i16 %A
+; CHECK-NEXT:    ret i16 [[A:%.*]]
 ;
-  %c1 = sext i16 %A to i32                ; <i32> [#uses=1]
-  %c2 = trunc i32 %c1 to i16              ; <i16> [#uses=1]
+  %c1 = sext i16 %A to i32
+  %c2 = trunc i32 %c1 to i16
   ret i16 %c2
 }
 
 define i16 @test10(i16 %A) {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    ret i16 %A
+; CHECK-NEXT:    ret i16 [[A:%.*]]
 ;
-  %c1 = sext i16 %A to i32                ; <i32> [#uses=1]
-  %c2 = trunc i32 %c1 to i16              ; <i16> [#uses=1]
+  %c1 = sext i16 %A to i32
+  %c2 = trunc i32 %c1 to i16
   ret i16 %c2
 }
 
@@ -114,10 +109,10 @@ declare void @varargs(i32, ...)
 
 define void @test11(i32* %P) {
 ; CHECK-LABEL: @test11(
-; CHECK-NEXT:    call void (i32, ...) @varargs(i32 5, i32* %P)
+; CHECK-NEXT:    call void (i32, ...) @varargs(i32 5, i32* [[P:%.*]])
 ; CHECK-NEXT:    ret void
 ;
-  %c = bitcast i32* %P to i16*            ; <i16*> [#uses=1]
+  %c = bitcast i32* %P to i16*
   call void (i32, ...) @varargs( i32 5, i16* %c )
   ret void
 }
@@ -126,12 +121,14 @@ declare i32 @__gxx_personality_v0(...)
 define void @test_invoke_vararg_cast(i32* %a, i32* %b) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 ; CHECK-LABEL: @test_invoke_vararg_cast(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    invoke void (i32, ...) @varargs(i32 1, i32* %b, i32* %a)
-; CHECK-NEXT:    to label %invoke.cont unwind label %lpad
+; CHECK-NEXT:    invoke void (i32, ...) @varargs(i32 1, i32* [[B:%.*]], i32* [[A:%.*]])
+; CHECK-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
 ; CHECK:       invoke.cont:
 ; CHECK-NEXT:    ret void
 ; CHECK:       lpad:
-; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { i8*, i32
+; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    cleanup
+; CHECK-NEXT:    ret void
 ;
 entry:
   %0 = bitcast i32* %b to i8*
@@ -139,10 +136,10 @@ entry:
   invoke void (i32, ...) @varargs(i32 1, i8* %0, i64* %1)
   to label %invoke.cont unwind label %lpad
 
-invoke.cont:                                      ; preds = %entry
+invoke.cont:
   ret void
 
-lpad:                                             ; preds = %entry
+lpad:
   %2 = landingpad { i8*, i32 }
   cleanup
   ret void
@@ -150,20 +147,20 @@ lpad:                                             ; preds = %entry
 
 define i8* @test13(i64 %A) {
 ; CHECK-LABEL: @test13(
-; CHECK-NEXT:    [[C:%.*]] = getelementptr [32832 x i8], [32832 x i8]* @inbuf, i64 0, i64 %A
+; CHECK-NEXT:    [[C:%.*]] = getelementptr [32832 x i8], [32832 x i8]* @inbuf, i64 0, i64 [[A:%.*]]
 ; CHECK-NEXT:    ret i8* [[C]]
 ;
-  %c = getelementptr [0 x i8], [0 x i8]* bitcast ([32832 x i8]* @inbuf to [0 x i8]*), i64 0, i64 %A             ; <i8*> [#uses=1]
+  %c = getelementptr [0 x i8], [0 x i8]* bitcast ([32832 x i8]* @inbuf to [0 x i8]*), i64 0, i64 %A
   ret i8* %c
 }
 
 define i1 @test14(i8 %A) {
 ; CHECK-LABEL: @test14(
-; CHECK-NEXT:    [[X:%.*]] = icmp sgt i8 %A, -1
+; CHECK-NEXT:    [[X:%.*]] = icmp sgt i8 [[A:%.*]], -1
 ; CHECK-NEXT:    ret i1 [[X]]
 ;
-  %c = bitcast i8 %A to i8                ; <i8> [#uses=1]
-  %X = icmp ult i8 %c, -128               ; <i1> [#uses=1]
+  %c = bitcast i8 %A to i8
+  %X = icmp ult i8 %c, -128
   ret i1 %X
 }
 
@@ -177,36 +174,36 @@ define i1 @test14(i8 %A) {
 
 define i1 @test16(i32* %P) {
 ; CHECK-LABEL: @test16(
-; CHECK-NEXT:    [[C:%.*]] = icmp ne i32* %P, null
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i32* [[P:%.*]], null
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
-  %c = icmp ne i32* %P, null              ; <i1> [#uses=1]
+  %c = icmp ne i32* %P, null
   ret i1 %c
 }
 
-define i16 @test17(i1 %tmp3) {
+define i16 @test17(i1 %x) {
 ; CHECK-LABEL: @test17(
-; CHECK-NEXT:    [[T86:%.*]] = zext i1 %tmp3 to i16
+; CHECK-NEXT:    [[T86:%.*]] = zext i1 [[X:%.*]] to i16
 ; CHECK-NEXT:    ret i16 [[T86]]
 ;
-  %c = zext i1 %tmp3 to i32               ; <i32> [#uses=1]
-  %t86 = trunc i32 %c to i16              ; <i16> [#uses=1]
+  %c = zext i1 %x to i32
+  %t86 = trunc i32 %c to i16
   ret i16 %t86
 }
 
-define i16 @test18(i8 %tmp3) {
+define i16 @test18(i8 %x) {
 ; CHECK-LABEL: @test18(
-; CHECK-NEXT:    [[T86:%.*]] = sext i8 %tmp3 to i16
+; CHECK-NEXT:    [[T86:%.*]] = sext i8 [[X:%.*]] to i16
 ; CHECK-NEXT:    ret i16 [[T86]]
 ;
-  %c = sext i8 %tmp3 to i32               ; <i32> [#uses=1]
-  %t86 = trunc i32 %c to i16              ; <i16> [#uses=1]
+  %c = sext i8 %x to i32
+  %t86 = trunc i32 %c to i16
   ret i16 %t86
 }
 
 define i1 @test19(i32 %X) {
 ; CHECK-LABEL: @test19(
-; CHECK-NEXT:    [[Z:%.*]] = icmp slt i32 %X, 12345
+; CHECK-NEXT:    [[Z:%.*]] = icmp slt i32 [[X:%.*]], 12345
 ; CHECK-NEXT:    ret i1 [[Z]]
 ;
   %c = sext i32 %X to i64
@@ -216,7 +213,7 @@ define i1 @test19(i32 %X) {
 
 define <2 x i1> @test19vec(<2 x i32> %X) {
 ; CHECK-LABEL: @test19vec(
-; CHECK-NEXT:    [[Z:%.*]] = icmp slt <2 x i32> %X, <i32 12345, i32 2147483647>
+; CHECK-NEXT:    [[Z:%.*]] = icmp slt <2 x i32> [[X:%.*]], <i32 12345, i32 2147483647>
 ; CHECK-NEXT:    ret <2 x i1> [[Z]]
 ;
   %c = sext <2 x i32> %X to <2 x i64>
@@ -226,7 +223,7 @@ define <2 x i1> @test19vec(<2 x i32> %X) {
 
 define <3 x i1> @test19vec2(<3 x i1> %X) {
 ; CHECK-LABEL: @test19vec2(
-; CHECK-NEXT:    [[CMPEQ:%.*]] = xor <3 x i1> %X, <i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[CMPEQ:%.*]] = xor <3 x i1> [[X:%.*]], <i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    ret <3 x i1> [[CMPEQ]]
 ;
   %sext = sext <3 x i1> %X to <3 x i32>
@@ -238,45 +235,40 @@ define i1 @test20(i1 %B) {
 ; CHECK-LABEL: @test20(
 ; CHECK-NEXT:    ret i1 false
 ;
-  %c = zext i1 %B to i32          ; <i32> [#uses=1]
-  %D = icmp slt i32 %c, -1                ; <i1> [#uses=1]
-  ;; false
+  %c = zext i1 %B to i32
+  %D = icmp slt i32 %c, -1
   ret i1 %D
 }
 
 define i32 @test21(i32 %X) {
 ; CHECK-LABEL: @test21(
-; CHECK-NEXT:    [[C21:%.*]] = and i32 %X, 255
-; CHECK-NEXT:    ret i32 [[C21]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[X:%.*]], 255
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
-  %c1 = trunc i32 %X to i8                ; <i8> [#uses=1]
-  ;; sext -> zext -> and -> nop
-  %c2 = sext i8 %c1 to i32                ; <i32> [#uses=1]
-  %RV = and i32 %c2, 255          ; <i32> [#uses=1]
+  %c1 = trunc i32 %X to i8
+  %c2 = sext i8 %c1 to i32
+  %RV = and i32 %c2, 255
   ret i32 %RV
 }
 
 define i32 @test22(i32 %X) {
 ; CHECK-LABEL: @test22(
-; CHECK-NEXT:    [[SEXT:%.*]] = shl i32 %X, 24
+; CHECK-NEXT:    [[SEXT:%.*]] = shl i32 [[X:%.*]], 24
 ; CHECK-NEXT:    ret i32 [[SEXT]]
 ;
-  %c1 = trunc i32 %X to i8                ; <i8> [#uses=1]
-  ;; sext -> zext -> and -> nop
-  %c2 = sext i8 %c1 to i32                ; <i32> [#uses=1]
-  %RV = shl i32 %c2, 24           ; <i32> [#uses=1]
+  %c1 = trunc i32 %X to i8
+  %c2 = sext i8 %c1 to i32
+  %RV = shl i32 %c2, 24
   ret i32 %RV
 }
 
 define i32 @test23(i32 %X) {
-        ;; Turn into an AND even though X
 ; CHECK-LABEL: @test23(
-; CHECK-NEXT:    [[C2:%.*]] = and i32 %X, 65535
+; CHECK-NEXT:    [[C2:%.*]] = and i32 [[X:%.*]], 65535
 ; CHECK-NEXT:    ret i32 [[C2]]
 ;
-  %c1 = trunc i32 %X to i16               ; <i16> [#uses=1]
-  ;; and Z are signed.
-  %c2 = zext i16 %c1 to i32               ; <i32> [#uses=1]
+  %c1 = trunc i32 %X to i16
+  %c2 = zext i16 %c1 to i32
   ret i32 %c2
 }
 
@@ -284,70 +276,68 @@ define i1 @test24(i1 %C) {
 ; CHECK-LABEL: @test24(
 ; CHECK-NEXT:    ret i1 true
 ;
-  %X = select i1 %C, i32 14, i32 1234             ; <i32> [#uses=1]
-  ;; Fold cast into select
-  %c = icmp ne i32 %X, 0          ; <i1> [#uses=1]
+  %X = select i1 %C, i32 14, i32 1234
+  %c = icmp ne i32 %X, 0
   ret i1 %c
 }
 
 define i32 @test26(float %F) {
-        ;; no need to cast from float->double.
 ; CHECK-LABEL: @test26(
-; CHECK-NEXT:    [[D:%.*]] = fptosi float %F to i32
+; CHECK-NEXT:    [[D:%.*]] = fptosi float [[F:%.*]] to i32
 ; CHECK-NEXT:    ret i32 [[D]]
 ;
-  %c = fpext float %F to double           ; <double> [#uses=1]
-  %D = fptosi double %c to i32            ; <i32> [#uses=1]
+  %c = fpext float %F to double
+  %D = fptosi double %c to i32
   ret i32 %D
 }
 
 define [4 x float]* @test27([9 x [4 x float]]* %A) {
 ; CHECK-LABEL: @test27(
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [9 x [4 x float]], [9 x [4 x float]]* %A, i64 0, i64 0
+; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [9 x [4 x float]], [9 x [4 x float]]* [[A:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    ret [4 x float]* [[C]]
 ;
-  %c = bitcast [9 x [4 x float]]* %A to [4 x float]*              ; <[4 x float]*> [#uses=1]
+  %c = bitcast [9 x [4 x float]]* %A to [4 x float]*
   ret [4 x float]* %c
 }
 
 define float* @test28([4 x float]* %A) {
 ; CHECK-LABEL: @test28(
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [4 x float], [4 x float]* %A, i64 0, i64 0
+; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[A:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    ret float* [[C]]
 ;
-  %c = bitcast [4 x float]* %A to float*          ; <float*> [#uses=1]
+  %c = bitcast [4 x float]* %A to float*
   ret float* %c
 }
 
 define i32 @test29(i32 %c1, i32 %c2) {
 ; CHECK-LABEL: @test29(
-; CHECK-NEXT:    [[TMP2:%.*]] = or i32 %c2, %c1
-; CHECK-NEXT:    [[TMP10:%.*]] = and i32 [[TMP2]], 255
-; CHECK-NEXT:    ret i32 [[TMP10]]
+; CHECK-NEXT:    [[T21:%.*]] = or i32 [[C2:%.*]], [[C1:%.*]]
+; CHECK-NEXT:    [[T10:%.*]] = and i32 [[T21]], 255
+; CHECK-NEXT:    ret i32 [[T10]]
 ;
-  %tmp1 = trunc i32 %c1 to i8             ; <i8> [#uses=1]
-  %tmp4.mask = trunc i32 %c2 to i8                ; <i8> [#uses=1]
-  %tmp = or i8 %tmp4.mask, %tmp1          ; <i8> [#uses=1]
-  %tmp10 = zext i8 %tmp to i32            ; <i32> [#uses=1]
-  ret i32 %tmp10
+  %t1 = trunc i32 %c1 to i8
+  %tmask = trunc i32 %c2 to i8
+  %t2 = or i8 %tmask, %t1
+  %t10 = zext i8 %t2 to i32
+  ret i32 %t10
 }
 
 define i32 @test30(i32 %c1) {
 ; CHECK-LABEL: @test30(
-; CHECK-NEXT:    [[C3:%.*]] = and i32 %c1, 255
+; CHECK-NEXT:    [[C3:%.*]] = and i32 [[C1:%.*]], 255
 ; CHECK-NEXT:    [[C4:%.*]] = xor i32 [[C3]], 1
 ; CHECK-NEXT:    ret i32 [[C4]]
 ;
-  %c2 = trunc i32 %c1 to i8               ; <i8> [#uses=1]
-  %c3 = xor i8 %c2, 1             ; <i8> [#uses=1]
-  %c4 = zext i8 %c3 to i32                ; <i32> [#uses=1]
+  %c2 = trunc i32 %c1 to i8
+  %c3 = xor i8 %c2, 1
+  %c4 = zext i8 %c3 to i32
   ret i32 %c4
 }
 
 define i1 @test31(i64 %A) {
 ; CHECK-LABEL: @test31(
-; CHECK-NEXT:    [[C:%.*]] = and i64 %A, 42
-; CHECK-NEXT:    [[D:%.*]] = icmp eq i64 [[C]], 10
+; CHECK-NEXT:    [[C1:%.*]] = and i64 [[A:%.*]], 42
+; CHECK-NEXT:    [[D:%.*]] = icmp eq i64 [[C1]], 10
 ; CHECK-NEXT:    ret i1 [[D]]
 ;
   %B = trunc i64 %A to i32
@@ -360,7 +350,7 @@ define i1 @test31(i64 %A) {
 ; Does this depend on the whether the source/dest types of the trunc are legal in the data layout?
 define <2 x i1> @test31vec(<2 x i64> %A) {
 ; CHECK-LABEL: @test31vec(
-; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> %A to <2 x i32>
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
 ; CHECK-NEXT:    [[C:%.*]] = and <2 x i32> [[B]], <i32 42, i32 42>
 ; CHECK-NEXT:    [[D:%.*]] = icmp eq <2 x i32> [[C]], <i32 10, i32 10>
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
@@ -376,7 +366,7 @@ define <2 x i1> @test31vec(<2 x i64> %A) {
 
 define <2 x i1> @test32vec(<2 x i8> %A) {
 ; CHECK-LABEL: @test32vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i8> %A, <i8 42, i8 42>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i8> [[A:%.*]], <i8 42, i8 42>
 ; CHECK-NEXT:    [[D:%.*]] = icmp eq <2 x i8> [[TMP1]], <i8 10, i8 10>
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
@@ -388,39 +378,39 @@ define <2 x i1> @test32vec(<2 x i8> %A) {
 
 define i32 @test33(i32 %c1) {
 ; CHECK-LABEL: @test33(
-; CHECK-NEXT:    ret i32 %c1
+; CHECK-NEXT:    ret i32 [[C1:%.*]]
 ;
-  %x = bitcast i32 %c1 to float           ; <float> [#uses=1]
-  %y = bitcast float %x to i32            ; <i32> [#uses=1]
+  %x = bitcast i32 %c1 to float
+  %y = bitcast float %x to i32
   ret i32 %y
 }
 
 define i16 @test34(i16 %a) {
 ; CHECK-LABEL: @test34(
-; CHECK-NEXT:    [[TMP21:%.*]] = lshr i16 %a, 8
-; CHECK-NEXT:    ret i16 [[TMP21]]
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[A:%.*]], 8
+; CHECK-NEXT:    ret i16 [[TMP1]]
 ;
-  %c1 = zext i16 %a to i32                ; <i32> [#uses=1]
-  %tmp21 = lshr i32 %c1, 8                ; <i32> [#uses=1]
-  %c2 = trunc i32 %tmp21 to i16           ; <i16> [#uses=1]
+  %c1 = zext i16 %a to i32
+  %t21 = lshr i32 %c1, 8
+  %c2 = trunc i32 %t21 to i16
   ret i16 %c2
 }
 
 define i16 @test35(i16 %a) {
 ; CHECK-LABEL: @test35(
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i16 %a, 8
-; CHECK-NEXT:    ret i16 [[TMP2]]
+; CHECK-NEXT:    [[T2:%.*]] = lshr i16 [[A:%.*]], 8
+; CHECK-NEXT:    ret i16 [[T2]]
 ;
-  %c1 = bitcast i16 %a to i16             ; <i16> [#uses=1]
-  %tmp2 = lshr i16 %c1, 8         ; <i16> [#uses=1]
-  %c2 = bitcast i16 %tmp2 to i16          ; <i16> [#uses=1]
+  %c1 = bitcast i16 %a to i16
+  %t2 = lshr i16 %c1, 8
+  %c2 = bitcast i16 %t2 to i16
   ret i16 %c2
 }
 
 ; rdar://6480391
 define i1 @test36(i32 %a) {
 ; CHECK-LABEL: @test36(
-; CHECK-NEXT:    [[D:%.*]] = icmp sgt i32 %a, -1
+; CHECK-NEXT:    [[D:%.*]] = icmp sgt i32 [[A:%.*]], -1
 ; CHECK-NEXT:    ret i1 [[D]]
 ;
   %b = lshr i32 %a, 31
@@ -431,7 +421,7 @@ define i1 @test36(i32 %a) {
 
 define <2 x i1> @test36vec(<2 x i32> %a) {
 ; CHECK-LABEL: @test36vec(
-; CHECK-NEXT:    [[D:%.*]] = icmp sgt <2 x i32> %a, <i32 -1, i32 -1>
+; CHECK-NEXT:    [[D:%.*]] = icmp sgt <2 x i32> [[A:%.*]], <i32 -1, i32 -1>
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
   %b = lshr <2 x i32> %a, <i32 31, i32 31>
@@ -453,7 +443,7 @@ define i1 @test37(i32 %a) {
 
 define i64 @test38(i32 %a) {
 ; CHECK-LABEL: @test38(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 %a, -2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 [[A:%.*]], -2
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP2]]
 ;
@@ -466,91 +456,91 @@ define i64 @test38(i32 %a) {
 
 define i16 @test39(i16 %a) {
 ; CHECK-LABEL: @test39(
-; CHECK-NEXT:    [[TMP_UPGRD_32:%.*]] = call i16 @llvm.bswap.i16(i16 %a)
-; CHECK-NEXT:    ret i16 [[TMP_UPGRD_32]]
+; CHECK-NEXT:    [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]])
+; CHECK-NEXT:    ret i16 [[REV]]
 ;
-  %tmp = zext i16 %a to i32
-  %tmp21 = lshr i32 %tmp, 8
-  %tmp5 = shl i32 %tmp, 8
-  %tmp.upgrd.32 = or i32 %tmp21, %tmp5
-  %tmp.upgrd.3 = trunc i32 %tmp.upgrd.32 to i16
-  ret i16 %tmp.upgrd.3
+  %t = zext i16 %a to i32
+  %t21 = lshr i32 %t, 8
+  %t5 = shl i32 %t, 8
+  %t32 = or i32 %t21, %t5
+  %r = trunc i32 %t32 to i16
+  ret i16 %r
 }
 
 define i16 @test40(i16 %a) {
 ; CHECK-LABEL: @test40(
-; CHECK-NEXT:    [[TMP21:%.*]] = lshr i16 %a, 9
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i16 %a, 8
-; CHECK-NEXT:    [[TMP_UPGRD_32:%.*]] = or i16 [[TMP21]], [[TMP5]]
-; CHECK-NEXT:    ret i16 [[TMP_UPGRD_32]]
+; CHECK-NEXT:    [[T21:%.*]] = lshr i16 [[A:%.*]], 9
+; CHECK-NEXT:    [[T5:%.*]] = shl i16 [[A]], 8
+; CHECK-NEXT:    [[T32:%.*]] = or i16 [[T21]], [[T5]]
+; CHECK-NEXT:    ret i16 [[T32]]
 ;
-  %tmp = zext i16 %a to i32
-  %tmp21 = lshr i32 %tmp, 9
-  %tmp5 = shl i32 %tmp, 8
-  %tmp.upgrd.32 = or i32 %tmp21, %tmp5
-  %tmp.upgrd.3 = trunc i32 %tmp.upgrd.32 to i16
-  ret i16 %tmp.upgrd.3
+  %t = zext i16 %a to i32
+  %t21 = lshr i32 %t, 9
+  %t5 = shl i32 %t, 8
+  %t32 = or i32 %t21, %t5
+  %r = trunc i32 %t32 to i16
+  ret i16 %r
 }
 
 define <2 x i16> @test40vec(<2 x i16> %a) {
 ; CHECK-LABEL: @test40vec(
-; CHECK-NEXT:    [[TMP21:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 9, i16 9>
-; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i16> [[A]], <i16 8, i16 8>
-; CHECK-NEXT:    [[TMP_UPGRD_32:%.*]] = or <2 x i16> [[TMP21]], [[TMP5]]
-; CHECK-NEXT:    ret <2 x i16> [[TMP_UPGRD_32]]
+; CHECK-NEXT:    [[T21:%.*]] = lshr <2 x i16> [[A:%.*]], <i16 9, i16 9>
+; CHECK-NEXT:    [[T5:%.*]] = shl <2 x i16> [[A]], <i16 8, i16 8>
+; CHECK-NEXT:    [[T32:%.*]] = or <2 x i16> [[T21]], [[T5]]
+; CHECK-NEXT:    ret <2 x i16> [[T32]]
 ;
-  %tmp = zext <2 x i16> %a to <2 x i32>
-  %tmp21 = lshr <2 x i32> %tmp, <i32 9, i32 9>
-  %tmp5 = shl <2 x i32> %tmp, <i32 8, i32 8>
-  %tmp.upgrd.32 = or <2 x i32> %tmp21, %tmp5
-  %tmp.upgrd.3 = trunc <2 x i32> %tmp.upgrd.32 to <2 x i16>
-  ret <2 x i16> %tmp.upgrd.3
+  %t = zext <2 x i16> %a to <2 x i32>
+  %t21 = lshr <2 x i32> %t, <i32 9, i32 9>
+  %t5 = shl <2 x i32> %t, <i32 8, i32 8>
+  %t32 = or <2 x i32> %t21, %t5
+  %r = trunc <2 x i32> %t32 to <2 x i16>
+  ret <2 x i16> %r
 }
 
 ; PR1263
-define i32* @test41(i32* %tmp1) {
+define i32* @test41(i32* %t1) {
 ; CHECK-LABEL: @test41(
-; CHECK-NEXT:    ret i32* %tmp1
+; CHECK-NEXT:    ret i32* [[T1:%.*]]
 ;
-  %tmp64 = bitcast i32* %tmp1 to { i32 }*
-  %tmp65 = getelementptr { i32 }, { i32 }* %tmp64, i32 0, i32 0
-  ret i32* %tmp65
+  %t64 = bitcast i32* %t1 to { i32 }*
+  %t65 = getelementptr { i32 }, { i32 }* %t64, i32 0, i32 0
+  ret i32* %t65
 }
 
-define i32 addrspace(1)* @test41_addrspacecast_smaller(i32* %tmp1) {
+define i32 addrspace(1)* @test41_addrspacecast_smaller(i32* %t1) {
 ; CHECK-LABEL: @test41_addrspacecast_smaller(
-; CHECK-NEXT:    [[TMP65:%.*]] = addrspacecast i32* %tmp1 to i32 addrspace(1)*
-; CHECK-NEXT:    ret i32 addrspace(1)* [[TMP65]]
+; CHECK-NEXT:    [[T65:%.*]] = addrspacecast i32* [[T1:%.*]] to i32 addrspace(1)*
+; CHECK-NEXT:    ret i32 addrspace(1)* [[T65]]
 ;
-  %tmp64 = addrspacecast i32* %tmp1 to { i32 } addrspace(1)*
-  %tmp65 = getelementptr { i32 }, { i32 } addrspace(1)* %tmp64, i32 0, i32 0
-  ret i32 addrspace(1)* %tmp65
+  %t64 = addrspacecast i32* %t1 to { i32 } addrspace(1)*
+  %t65 = getelementptr { i32 }, { i32 } addrspace(1)* %t64, i32 0, i32 0
+  ret i32 addrspace(1)* %t65
 }
 
-define i32* @test41_addrspacecast_larger(i32 addrspace(1)* %tmp1) {
+define i32* @test41_addrspacecast_larger(i32 addrspace(1)* %t1) {
 ; CHECK-LABEL: @test41_addrspacecast_larger(
-; CHECK-NEXT:    [[TMP65:%.*]] = addrspacecast i32 addrspace(1)* %tmp1 to i32*
-; CHECK-NEXT:    ret i32* [[TMP65]]
+; CHECK-NEXT:    [[T65:%.*]] = addrspacecast i32 addrspace(1)* [[T1:%.*]] to i32*
+; CHECK-NEXT:    ret i32* [[T65]]
 ;
-  %tmp64 = addrspacecast i32 addrspace(1)* %tmp1 to { i32 }*
-  %tmp65 = getelementptr { i32 }, { i32 }* %tmp64, i32 0, i32 0
-  ret i32* %tmp65
+  %t64 = addrspacecast i32 addrspace(1)* %t1 to { i32 }*
+  %t65 = getelementptr { i32 }, { i32 }* %t64, i32 0, i32 0
+  ret i32* %t65
 }
 
 define i32 @test42(i32 %X) {
 ; CHECK-LABEL: @test42(
-; CHECK-NEXT:    [[Z:%.*]] = and i32 %X, 255
+; CHECK-NEXT:    [[Z:%.*]] = and i32 [[X:%.*]], 255
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
-  %Y = trunc i32 %X to i8         ; <i8> [#uses=1]
-  %Z = zext i8 %Y to i32          ; <i32> [#uses=1]
+  %Y = trunc i32 %X to i8
+  %Z = zext i8 %Y to i32
   ret i32 %Z
 }
 
 ; rdar://6598839
-define zeroext i64 @test43(i8 zeroext %on_off) nounwind readonly {
+define zeroext i64 @test43(i8 zeroext %on_off) {
 ; CHECK-LABEL: @test43(
-; CHECK-NEXT:    [[A:%.*]] = zext i8 %on_off to i64
+; CHECK-NEXT:    [[A:%.*]] = zext i8 [[ON_OFF:%.*]] to i64
 ; CHECK-NEXT:    [[B:%.*]] = add nsw i64 [[A]], -1
 ; CHECK-NEXT:    ret i64 [[B]]
 ;
@@ -562,7 +552,7 @@ define zeroext i64 @test43(i8 zeroext %on_off) nounwind readonly {
 
 define i64 @test44(i8 %T) {
 ; CHECK-LABEL: @test44(
-; CHECK-NEXT:    [[A:%.*]] = zext i8 %T to i64
+; CHECK-NEXT:    [[A:%.*]] = zext i8 [[T:%.*]] to i64
 ; CHECK-NEXT:    [[B:%.*]] = or i64 [[A]], 1234
 ; CHECK-NEXT:    ret i64 [[B]]
 ;
@@ -574,8 +564,8 @@ define i64 @test44(i8 %T) {
 
 define i64 @test45(i8 %A, i64 %Q) {
 ; CHECK-LABEL: @test45(
-; CHECK-NEXT:    [[B:%.*]] = sext i8 %A to i64
-; CHECK-NEXT:    [[C:%.*]] = or i64 [[B]], %Q
+; CHECK-NEXT:    [[B:%.*]] = sext i8 [[A:%.*]] to i64
+; CHECK-NEXT:    [[C:%.*]] = or i64 [[B]], [[Q:%.*]]
 ; CHECK-NEXT:    [[E:%.*]] = and i64 [[C]], 4294967295
 ; CHECK-NEXT:    ret i64 [[E]]
 ;
@@ -589,7 +579,7 @@ define i64 @test45(i8 %A, i64 %Q) {
 
 define i64 @test46(i64 %A) {
 ; CHECK-LABEL: @test46(
-; CHECK-NEXT:    [[C:%.*]] = shl i64 %A, 8
+; CHECK-NEXT:    [[C:%.*]] = shl i64 [[A:%.*]], 8
 ; CHECK-NEXT:    [[D:%.*]] = and i64 [[C]], 10752
 ; CHECK-NEXT:    ret i64 [[D]]
 ;
@@ -602,9 +592,11 @@ define i64 @test46(i64 %A) {
 
 define <2 x i64> @test46vec(<2 x i64> %A) {
 ; CHECK-LABEL: @test46vec(
-; CHECK-NEXT:    [[C:%.*]] = shl <2 x i64> [[A:%.*]], <i64 8, i64 8>
-; CHECK-NEXT:    [[D:%.*]] = and <2 x i64> [[C]], <i64 10752, i64 10752>
-; CHECK-NEXT:    ret <2 x i64> [[D]]
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[C:%.*]] = shl <2 x i32> [[B]], <i32 8, i32 8>
+; CHECK-NEXT:    [[D:%.*]] = and <2 x i32> [[C]], <i32 10752, i32 10752>
+; CHECK-NEXT:    [[E:%.*]] = zext <2 x i32> [[D]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[E]]
 ;
   %B = trunc <2 x i64> %A to <2 x i32>
   %C = and <2 x i32> %B, <i32 42, i32 42>
@@ -628,7 +620,7 @@ define i64 @test47(i8 %A) {
 
 define i64 @test48(i8 %A1, i8 %a2) {
 ; CHECK-LABEL: @test48(
-; CHECK-NEXT:    [[Z2:%.*]] = zext i8 %A1 to i32
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[A1:%.*]] to i32
 ; CHECK-NEXT:    [[C:%.*]] = shl nuw nsw i32 [[Z2]], 8
 ; CHECK-NEXT:    [[D:%.*]] = or i32 [[C]], [[Z2]]
 ; CHECK-NEXT:    [[E:%.*]] = zext i32 [[D]] to i64
@@ -644,7 +636,7 @@ define i64 @test48(i8 %A1, i8 %a2) {
 
 define i64 @test49(i64 %A) {
 ; CHECK-LABEL: @test49(
-; CHECK-NEXT:    [[C:%.*]] = shl i64 %A, 32
+; CHECK-NEXT:    [[C:%.*]] = shl i64 [[A:%.*]], 32
 ; CHECK-NEXT:    [[SEXT:%.*]] = ashr exact i64 [[C]], 32
 ; CHECK-NEXT:    [[D:%.*]] = or i64 [[SEXT]], 1
 ; CHECK-NEXT:    ret i64 [[D]]
@@ -655,15 +647,15 @@ define i64 @test49(i64 %A) {
   ret i64 %D
 }
 
-define i64 @test50(i64 %A) {
+define i64 @test50(i64 %x) {
 ; CHECK-LABEL: @test50(
-; CHECK-NEXT:    [[A:%.*]] = lshr i64 %A, 2
+; CHECK-NEXT:    [[A:%.*]] = lshr i64 [[X:%.*]], 2
 ; CHECK-NEXT:    [[D:%.*]] = shl i64 [[A]], 32
 ; CHECK-NEXT:    [[SEXT:%.*]] = add i64 [[D]], -4294967296
 ; CHECK-NEXT:    [[E:%.*]] = ashr exact i64 [[SEXT]], 32
 ; CHECK-NEXT:    ret i64 [[E]]
 ;
-  %a = lshr i64 %A, 2
+  %a = lshr i64 %x, 2
   %B = trunc i64 %a to i32
   %D = add i32 %B, -1
   %E = sext i32 %D to i64
@@ -673,9 +665,9 @@ define i64 @test50(i64 %A) {
 
 define i64 @test51(i64 %A, i1 %cond) {
 ; CHECK-LABEL: @test51(
-; CHECK-NEXT:    [[C:%.*]] = and i64 %A, 4294967294
-; CHECK-NEXT:    [[D:%.*]] = or i64 %A, 1
-; CHECK-NEXT:    [[E:%.*]] = select i1 %cond, i64 [[C]], i64 [[D]]
+; CHECK-NEXT:    [[C:%.*]] = and i64 [[A:%.*]], 4294967294
+; CHECK-NEXT:    [[D:%.*]] = or i64 [[A]], 1
+; CHECK-NEXT:    [[E:%.*]] = select i1 [[COND:%.*]], i64 [[C]], i64 [[D]]
 ; CHECK-NEXT:    [[SEXT:%.*]] = shl i64 [[E]], 32
 ; CHECK-NEXT:    [[F:%.*]] = ashr exact i64 [[SEXT]], 32
 ; CHECK-NEXT:    ret i64 [[F]]
@@ -690,7 +682,7 @@ define i64 @test51(i64 %A, i1 %cond) {
 
 define i32 @test52(i64 %A) {
 ; CHECK-LABEL: @test52(
-; CHECK-NEXT:    [[B:%.*]] = trunc i64 %A to i32
+; CHECK-NEXT:    [[B:%.*]] = trunc i64 [[A:%.*]] to i32
 ; CHECK-NEXT:    [[C:%.*]] = and i32 [[B]], 7224
 ; CHECK-NEXT:    [[D:%.*]] = or i32 [[C]], 32962
 ; CHECK-NEXT:    ret i32 [[D]]
@@ -704,7 +696,7 @@ define i32 @test52(i64 %A) {
 
 define i64 @test53(i32 %A) {
 ; CHECK-LABEL: @test53(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 %A, 7224
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 7224
 ; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 32962
 ; CHECK-NEXT:    [[D:%.*]] = zext i32 [[TMP2]] to i64
 ; CHECK-NEXT:    ret i64 [[D]]
@@ -718,7 +710,7 @@ define i64 @test53(i32 %A) {
 
 define i32 @test54(i64 %A) {
 ; CHECK-LABEL: @test54(
-; CHECK-NEXT:    [[B:%.*]] = trunc i64 %A to i32
+; CHECK-NEXT:    [[B:%.*]] = trunc i64 [[A:%.*]] to i32
 ; CHECK-NEXT:    [[C:%.*]] = and i32 [[B]], 7224
 ; CHECK-NEXT:    [[D:%.*]] = or i32 [[C]], -32574
 ; CHECK-NEXT:    ret i32 [[D]]
@@ -732,7 +724,7 @@ define i32 @test54(i64 %A) {
 
 define i64 @test55(i32 %A) {
 ; CHECK-LABEL: @test55(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 %A, 7224
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 7224
 ; CHECK-NEXT:    [[C:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[D:%.*]] = or i64 [[C]], -32574
 ; CHECK-NEXT:    ret i64 [[D]]
@@ -744,35 +736,35 @@ define i64 @test55(i32 %A) {
   ret i64 %E
 }
 
-define i64 @test56(i16 %A) nounwind {
+define i64 @test56(i16 %A) {
 ; CHECK-LABEL: @test56(
-; CHECK-NEXT:    [[TMP353:%.*]] = sext i16 %A to i64
-; CHECK-NEXT:    [[TMP354:%.*]] = lshr i64 [[TMP353]], 5
-; CHECK-NEXT:    [[TMP355:%.*]] = and i64 [[TMP354]], 134217727
-; CHECK-NEXT:    ret i64 [[TMP355]]
+; CHECK-NEXT:    [[P353:%.*]] = sext i16 [[A:%.*]] to i64
+; CHECK-NEXT:    [[P354:%.*]] = lshr i64 [[P353]], 5
+; CHECK-NEXT:    [[P355:%.*]] = and i64 [[P354]], 134217727
+; CHECK-NEXT:    ret i64 [[P355]]
 ;
-  %tmp353 = sext i16 %A to i32
-  %tmp354 = lshr i32 %tmp353, 5
-  %tmp355 = zext i32 %tmp354 to i64
-  ret i64 %tmp355
+  %p353 = sext i16 %A to i32
+  %p354 = lshr i32 %p353, 5
+  %p355 = zext i32 %p354 to i64
+  ret i64 %p355
 }
 
-define <2 x i64> @test56vec(<2 x i16> %A) nounwind {
+define <2 x i64> @test56vec(<2 x i16> %A) {
 ; CHECK-LABEL: @test56vec(
-; CHECK-NEXT:    [[TMP353:%.*]] = sext <2 x i16> [[A:%.*]] to <2 x i64>
-; CHECK-NEXT:    [[TMP354:%.*]] = lshr <2 x i64> [[TMP353]], <i64 5, i64 5>
-; CHECK-NEXT:    [[TMP355:%.*]] = and <2 x i64> [[TMP354]], <i64 134217727, i64 134217727>
-; CHECK-NEXT:    ret <2 x i64> [[TMP355]]
+; CHECK-NEXT:    [[P353:%.*]] = sext <2 x i16> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[P354:%.*]] = lshr <2 x i32> [[P353]], <i32 5, i32 5>
+; CHECK-NEXT:    [[P355:%.*]] = zext <2 x i32> [[P354]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[P355]]
 ;
-  %tmp353 = sext <2 x i16> %A to <2 x i32>
-  %tmp354 = lshr <2 x i32> %tmp353, <i32 5, i32 5>
-  %tmp355 = zext <2 x i32> %tmp354 to <2 x i64>
-  ret <2 x i64> %tmp355
+  %p353 = sext <2 x i16> %A to <2 x i32>
+  %p354 = lshr <2 x i32> %p353, <i32 5, i32 5>
+  %p355 = zext <2 x i32> %p354 to <2 x i64>
+  ret <2 x i64> %p355
 }
 
-define i64 @test57(i64 %A) nounwind {
+define i64 @test57(i64 %A) {
 ; CHECK-LABEL: @test57(
-; CHECK-NEXT:    [[C:%.*]] = lshr i64 %A, 8
+; CHECK-NEXT:    [[C:%.*]] = lshr i64 [[A:%.*]], 8
 ; CHECK-NEXT:    [[E:%.*]] = and i64 [[C]], 16777215
 ; CHECK-NEXT:    ret i64 [[E]]
 ;
@@ -782,10 +774,11 @@ define i64 @test57(i64 %A) nounwind {
   ret i64 %E
 }
 
-define <2 x i64> @test57vec(<2 x i64> %A) nounwind {
+define <2 x i64> @test57vec(<2 x i64> %A) {
 ; CHECK-LABEL: @test57vec(
-; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i64> [[A:%.*]], <i64 8, i64 8>
-; CHECK-NEXT:    [[E:%.*]] = and <2 x i64> [[C]], <i64 16777215, i64 16777215>
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i32> [[B]], <i32 8, i32 8>
+; CHECK-NEXT:    [[E:%.*]] = zext <2 x i32> [[C]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[E]]
 ;
   %B = trunc <2 x i64> %A to <2 x i32>
@@ -794,9 +787,9 @@ define <2 x i64> @test57vec(<2 x i64> %A) nounwind {
   ret <2 x i64> %E
 }
 
-define i64 @test58(i64 %A) nounwind {
+define i64 @test58(i64 %A) {
 ; CHECK-LABEL: @test58(
-; CHECK-NEXT:    [[C:%.*]] = lshr i64 %A, 8
+; CHECK-NEXT:    [[C:%.*]] = lshr i64 [[A:%.*]], 8
 ; CHECK-NEXT:    [[D:%.*]] = and i64 [[C]], 16777087
 ; CHECK-NEXT:    [[E:%.*]] = or i64 [[D]], 128
 ; CHECK-NEXT:    ret i64 [[E]]
@@ -809,12 +802,12 @@ define i64 @test58(i64 %A) nounwind {
 
 }
 
-define i64 @test59(i8 %A, i8 %B) nounwind {
+define i64 @test59(i8 %A, i8 %B) {
 ; CHECK-LABEL: @test59(
-; CHECK-NEXT:    [[C:%.*]] = zext i8 %A to i64
+; CHECK-NEXT:    [[C:%.*]] = zext i8 [[A:%.*]] to i64
 ; CHECK-NEXT:    [[D:%.*]] = shl nuw nsw i64 [[C]], 4
 ; CHECK-NEXT:    [[E:%.*]] = and i64 [[D]], 48
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 %B, 4
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 [[B:%.*]], 4
 ; CHECK-NEXT:    [[G:%.*]] = zext i8 [[TMP1]] to i64
 ; CHECK-NEXT:    [[H:%.*]] = or i64 [[E]], [[G]]
 ; CHECK-NEXT:    ret i64 [[H]]
@@ -829,76 +822,74 @@ define i64 @test59(i8 %A, i8 %B) nounwind {
   ret i64 %I
 }
 
-define <3 x i32> @test60(<4 x i32> %call4) nounwind {
+define <3 x i32> @test60(<4 x i32> %call4) {
 ; CHECK-LABEL: @test60(
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> %call4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    ret <3 x i32> [[TMP10]]
+; CHECK-NEXT:    [[P10:%.*]] = shufflevector <4 x i32> [[CALL4:%.*]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    ret <3 x i32> [[P10]]
 ;
-  %tmp11 = bitcast <4 x i32> %call4 to i128
-  %tmp9 = trunc i128 %tmp11 to i96
-  %tmp10 = bitcast i96 %tmp9 to <3 x i32>
-  ret <3 x i32> %tmp10
+  %p11 = bitcast <4 x i32> %call4 to i128
+  %p9 = trunc i128 %p11 to i96
+  %p10 = bitcast i96 %p9 to <3 x i32>
+  ret <3 x i32> %p10
 
 }
 
-define <4 x i32> @test61(<3 x i32> %call4) nounwind {
+define <4 x i32> @test61(<3 x i32> %call4) {
 ; CHECK-LABEL: @test61(
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <3 x i32> %call4, <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[CALL4:%.*]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[P10]]
 ;
-  %tmp11 = bitcast <3 x i32> %call4 to i96
-  %tmp9 = zext i96 %tmp11 to i128
-  %tmp10 = bitcast i128 %tmp9 to <4 x i32>
-  ret <4 x i32> %tmp10
+  %p11 = bitcast <3 x i32> %call4 to i96
+  %p9 = zext i96 %p11 to i128
+  %p10 = bitcast i128 %p9 to <4 x i32>
+  ret <4 x i32> %p10
 }
 
-define <4 x i32> @test62(<3 x float> %call4) nounwind {
+define <4 x i32> @test62(<3 x float> %call4) {
 ; CHECK-LABEL: @test62(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float> %call4 to <3 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float> [[CALL4:%.*]] to <3 x i32>
+; CHECK-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[P10]]
 ;
-  %tmp11 = bitcast <3 x float> %call4 to i96
-  %tmp9 = zext i96 %tmp11 to i128
-  %tmp10 = bitcast i128 %tmp9 to <4 x i32>
-  ret <4 x i32> %tmp10
+  %p11 = bitcast <3 x float> %call4 to i96
+  %p9 = zext i96 %p11 to i128
+  %p10 = bitcast i128 %p9 to <4 x i32>
+  ret <4 x i32> %p10
 }
 
 ; PR7311 - Don't create invalid IR on scalar->vector cast.
-define <2 x float> @test63(i64 %tmp8) nounwind {
+define <2 x float> @test63(i64 %t8) {
 ; CHECK-LABEL: @test63(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = bitcast i64 %tmp8 to <2 x i32>
+; CHECK-NEXT:    [[A:%.*]] = bitcast i64 [[T8:%.*]] to <2 x i32>
 ; CHECK-NEXT:    [[VCVT_I:%.*]] = uitofp <2 x i32> [[A]] to <2 x float>
 ; CHECK-NEXT:    ret <2 x float> [[VCVT_I]]
 ;
-entry:
-  %a = bitcast i64 %tmp8 to <2 x i32>
+  %a = bitcast i64 %t8 to <2 x i32>
   %vcvt.i = uitofp <2 x i32> %a to <2 x float>
   ret <2 x float> %vcvt.i
 }
 
-define <4 x float> @test64(<4 x float> %c) nounwind {
+define <4 x float> @test64(<4 x float> %c) {
 ; CHECK-LABEL: @test64(
-; CHECK-NEXT:    ret <4 x float> %c
+; CHECK-NEXT:    ret <4 x float> [[C:%.*]]
 ;
   %t0 = bitcast <4 x float> %c to <4 x i32>
   %t1 = bitcast <4 x i32> %t0 to <4 x float>
   ret <4 x float> %t1
 }
 
-define <4 x float> @test65(<4 x float> %c) nounwind {
+define <4 x float> @test65(<4 x float> %c) {
 ; CHECK-LABEL: @test65(
-; CHECK-NEXT:    ret <4 x float> %c
+; CHECK-NEXT:    ret <4 x float> [[C:%.*]]
 ;
   %t0 = bitcast <4 x float> %c to <2 x double>
   %t1 = bitcast <2 x double> %t0 to <4 x float>
   ret <4 x float> %t1
 }
 
-define <2 x float> @test66(<2 x float> %c) nounwind {
+define <2 x float> @test66(<2 x float> %c) {
 ; CHECK-LABEL: @test66(
-; CHECK-NEXT:    ret <2 x float> %c
+; CHECK-NEXT:    ret <2 x float> [[C:%.*]]
 ;
   %t0 = bitcast <2 x float> %c to double
   %t1 = bitcast double %t0 to <2 x float>
@@ -912,18 +903,18 @@ define float @test2c() {
   ret float extractelement (<2 x float> bitcast (double bitcast (<2 x float> <float -1.000000e+00, float -1.000000e+00> to double) to <2 x float>), i32 0)
 }
 
-define i64 @test_mmx(<2 x i32> %c) nounwind {
+define i64 @test_mmx(<2 x i32> %x) {
 ; CHECK-LABEL: @test_mmx(
-; CHECK-NEXT:    [[C:%.*]] = bitcast <2 x i32> %c to i64
+; CHECK-NEXT:    [[C:%.*]] = bitcast <2 x i32> [[X:%.*]] to i64
 ; CHECK-NEXT:    ret i64 [[C]]
 ;
-  %A = bitcast <2 x i32> %c to x86_mmx
+  %A = bitcast <2 x i32> %x to x86_mmx
   %B = bitcast x86_mmx %A to <2 x i32>
   %C = bitcast <2 x i32> %B to i64
   ret i64 %C
 }
 
-define i64 @test_mmx_const(<2 x i32> %c) nounwind {
+define i64 @test_mmx_const(<2 x i32> %c) {
 ; CHECK-LABEL: @test_mmx_const(
 ; CHECK-NEXT:    ret i64 0
 ;
@@ -938,8 +929,8 @@ define i1 @test67(i1 %a, i32 %b) {
 ; CHECK-LABEL: @test67(
 ; CHECK-NEXT:    ret i1 false
 ;
-  %tmp2 = zext i1 %a to i32
-  %conv6 = xor i32 %tmp2, 1
+  %t2 = zext i1 %a to i32
+  %conv6 = xor i32 %t2, 1
   %and = and i32 %b, %conv6
   %sext = shl nuw nsw i32 %and, 24
   %neg.i = xor i32 %sext, -16777216
@@ -953,9 +944,9 @@ define i1 @test67(i1 %a, i32 %b) {
 
 define %s @test68(%s *%p, i64 %i) {
 ; CHECK-LABEL: @test68(
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr %s, %s* %p, i64 %i
-; CHECK-NEXT:    [[L:%.*]] = load %s, %s* [[PP1]], align 4
-; CHECK-NEXT:    ret %s [[L]]
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [[S:%.*]], %s* [[P:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT:    [[L:%.*]] = load [[S]], %s* [[PP1]], align 4
+; CHECK-NEXT:    ret [[S]] %l
 ;
   %o = mul i64 %i, 12
   %q = bitcast %s* %p to i8*
@@ -968,9 +959,9 @@ define %s @test68(%s *%p, i64 %i) {
 ; addrspacecasts should be eliminated.
 define %s @test68_addrspacecast(%s* %p, i64 %i) {
 ; CHECK-LABEL: @test68_addrspacecast(
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr %s, %s* %p, i64 %i
-; CHECK-NEXT:    [[L:%.*]] = load %s, %s* [[PP1]], align 4
-; CHECK-NEXT:    ret %s [[L]]
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [[S:%.*]], %s* [[P:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT:    [[L:%.*]] = load [[S]], %s* [[PP1]], align 4
+; CHECK-NEXT:    ret [[S]] %l
 ;
   %o = mul i64 %i, 12
   %q = addrspacecast %s* %p to i8 addrspace(2)*
@@ -982,10 +973,10 @@ define %s @test68_addrspacecast(%s* %p, i64 %i) {
 
 define %s @test68_addrspacecast_2(%s* %p, i64 %i) {
 ; CHECK-LABEL: @test68_addrspacecast_2(
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr %s, %s* %p, i64 %i
-; CHECK-NEXT:    [[R:%.*]] = addrspacecast %s* [[PP1]] to %s addrspace(1)*
-; CHECK-NEXT:    [[L:%.*]] = load %s, %s addrspace(1)* [[R]], align 4
-; CHECK-NEXT:    ret %s [[L]]
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [[S:%.*]], %s* [[P:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = addrspacecast %s* [[PP1]] to [[S]] addrspace(1)*
+; CHECK-NEXT:    [[L:%.*]] = load [[S]], [[S]] addrspace(1)* [[R]], align 4
+; CHECK-NEXT:    ret [[S]] %l
 ;
   %o = mul i64 %i, 12
   %q = addrspacecast %s* %p to i8 addrspace(2)*
@@ -997,9 +988,9 @@ define %s @test68_addrspacecast_2(%s* %p, i64 %i) {
 
 define %s @test68_as1(%s addrspace(1)* %p, i32 %i) {
 ; CHECK-LABEL: @test68_as1(
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr %s, %s addrspace(1)* %p, i32 %i
-; CHECK-NEXT:    [[L:%.*]] = load %s, %s addrspace(1)* [[PP1]], align 4
-; CHECK-NEXT:    ret %s [[L]]
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [[S:%.*]], [[S]] addrspace(1)* [[P:%.*]], i32 [[I:%.*]]
+; CHECK-NEXT:    [[L:%.*]] = load [[S]], [[S]] addrspace(1)* [[PP1]], align 4
+; CHECK-NEXT:    ret [[S]] %l
 ;
   %o = mul i32 %i, 12
   %q = bitcast %s addrspace(1)* %p to i8 addrspace(1)*
@@ -1011,7 +1002,7 @@ define %s @test68_as1(%s addrspace(1)* %p, i32 %i) {
 
 define double @test69(double *%p, i64 %i) {
 ; CHECK-LABEL: @test69(
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds double, double* %p, i64 %i
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 [[I:%.*]]
 ; CHECK-NEXT:    [[L:%.*]] = load double, double* [[PP1]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
@@ -1025,10 +1016,10 @@ define double @test69(double *%p, i64 %i) {
 
 define %s @test70(%s *%p, i64 %i) {
 ; CHECK-LABEL: @test70(
-; CHECK-NEXT:    [[O:%.*]] = mul nsw i64 %i, 3
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds %s, %s* %p, i64 [[O]]
-; CHECK-NEXT:    [[L:%.*]] = load %s, %s* [[PP1]], align 4
-; CHECK-NEXT:    ret %s [[L]]
+; CHECK-NEXT:    [[O:%.*]] = mul nsw i64 [[I:%.*]], 3
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds [[S:%.*]], %s* [[P:%.*]], i64 [[O]]
+; CHECK-NEXT:    [[L:%.*]] = load [[S]], %s* [[PP1]], align 4
+; CHECK-NEXT:    ret [[S]] %l
 ;
   %o = mul nsw i64 %i, 36
   %q = bitcast %s* %p to i8*
@@ -1040,8 +1031,8 @@ define %s @test70(%s *%p, i64 %i) {
 
 define double @test71(double *%p, i64 %i) {
 ; CHECK-LABEL: @test71(
-; CHECK-NEXT:    [[O:%.*]] = shl i64 %i, 2
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr double, double* %p, i64 [[O]]
+; CHECK-NEXT:    [[O:%.*]] = shl i64 [[I:%.*]], 2
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr double, double* [[P:%.*]], i64 [[O]]
 ; CHECK-NEXT:    [[L:%.*]] = load double, double* [[PP1]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
@@ -1055,8 +1046,8 @@ define double @test71(double *%p, i64 %i) {
 
 define double @test72(double *%p, i32 %i) {
 ; CHECK-LABEL: @test72(
-; CHECK-NEXT:    [[O:%.*]] = sext i32 %i to i64
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds double, double* %p, i64 [[O]]
+; CHECK-NEXT:    [[O:%.*]] = sext i32 [[I:%.*]] to i64
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 [[O]]
 ; CHECK-NEXT:    [[L:%.*]] = load double, double* [[PP1]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
@@ -1071,8 +1062,8 @@ define double @test72(double *%p, i32 %i) {
 
 define double @test73(double *%p, i128 %i) {
 ; CHECK-LABEL: @test73(
-; CHECK-NEXT:    [[O:%.*]] = trunc i128 %i to i64
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr double, double* %p, i64 [[O]]
+; CHECK-NEXT:    [[I_TR:%.*]] = trunc i128 [[I:%.*]] to i64
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr double, double* [[P:%.*]], i64 [[I_TR]]
 ; CHECK-NEXT:    [[L:%.*]] = load double, double* [[PP1]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
@@ -1087,7 +1078,7 @@ define double @test73(double *%p, i128 %i) {
 
 define double @test74(double *%p, i64 %i) {
 ; CHECK-LABEL: @test74(
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds double, double* %p, i64 %i
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 [[I:%.*]]
 ; CHECK-NEXT:    [[L:%.*]] = load double, double* [[PP1]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
@@ -1100,9 +1091,9 @@ define double @test74(double *%p, i64 %i) {
 
 define i32* @test75(i32* %p, i32 %x) {
 ; CHECK-LABEL: @test75(
-; CHECK-NEXT:    [[Y:%.*]] = shl i32 %x, 3
+; CHECK-NEXT:    [[Y:%.*]] = shl i32 [[X:%.*]], 3
 ; CHECK-NEXT:    [[Z:%.*]] = sext i32 [[Y]] to i64
-; CHECK-NEXT:    [[Q:%.*]] = bitcast i32* %p to i8*
+; CHECK-NEXT:    [[Q:%.*]] = bitcast i32* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[R:%.*]] = getelementptr i8, i8* [[Q]], i64 [[Z]]
 ; CHECK-NEXT:    [[S:%.*]] = bitcast i8* [[R]] to i32*
 ; CHECK-NEXT:    ret i32* [[S]]
@@ -1117,10 +1108,10 @@ define i32* @test75(i32* %p, i32 %x) {
 
 define %s @test76(%s *%p, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test76(
-; CHECK-NEXT:    [[O2:%.*]] = mul i64 %i, %j
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr %s, %s* %p, i64 [[O2]]
-; CHECK-NEXT:    [[L:%.*]] = load %s, %s* [[PP1]], align 4
-; CHECK-NEXT:    ret %s [[L]]
+; CHECK-NEXT:    [[O2:%.*]] = mul i64 [[I:%.*]], [[J:%.*]]
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [[S:%.*]], %s* [[P:%.*]], i64 [[O2]]
+; CHECK-NEXT:    [[L:%.*]] = load [[S]], %s* [[PP1]], align 4
+; CHECK-NEXT:    ret [[S]] %l
 ;
   %o = mul i64 %i, 12
   %o2 = mul nsw i64 %o, %j
@@ -1133,11 +1124,11 @@ define %s @test76(%s *%p, i64 %i, i64 %j) {
 
 define %s @test77(%s *%p, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test77(
-; CHECK-NEXT:    [[O:%.*]] = mul nsw i64 %i, 3
-; CHECK-NEXT:    [[O2:%.*]] = mul nsw i64 [[O]], %j
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds %s, %s* %p, i64 [[O2]]
-; CHECK-NEXT:    [[L:%.*]] = load %s, %s* [[PP1]], align 4
-; CHECK-NEXT:    ret %s [[L]]
+; CHECK-NEXT:    [[O:%.*]] = mul nsw i64 [[I:%.*]], 3
+; CHECK-NEXT:    [[O2:%.*]] = mul nsw i64 [[O]], [[J:%.*]]
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr inbounds [[S:%.*]], %s* [[P:%.*]], i64 [[O2]]
+; CHECK-NEXT:    [[L:%.*]] = load [[S]], %s* [[PP1]], align 4
+; CHECK-NEXT:    ret [[S]] %l
 ;
   %o = mul nsw i64 %i, 36
   %o2 = mul nsw i64 %o, %j
@@ -1150,17 +1141,17 @@ define %s @test77(%s *%p, i64 %i, i64 %j) {
 
 define %s @test78(%s *%p, i64 %i, i64 %j, i32 %k, i32 %l, i128 %m, i128 %n) {
 ; CHECK-LABEL: @test78(
-; CHECK-NEXT:    [[A:%.*]] = mul nsw i32 %k, 3
-; CHECK-NEXT:    [[B:%.*]] = mul nsw i32 [[A]], %l
+; CHECK-NEXT:    [[A:%.*]] = mul nsw i32 [[K:%.*]], 3
+; CHECK-NEXT:    [[B:%.*]] = mul nsw i32 [[A]], [[L:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = sext i32 [[B]] to i128
-; CHECK-NEXT:    [[D:%.*]] = mul nsw i128 [[C]], %m
-; CHECK-NEXT:    [[E:%.*]] = mul i128 [[D]], %n
+; CHECK-NEXT:    [[D:%.*]] = mul nsw i128 [[C]], [[M:%.*]]
+; CHECK-NEXT:    [[E:%.*]] = mul i128 [[D]], [[N:%.*]]
 ; CHECK-NEXT:    [[F:%.*]] = trunc i128 [[E]] to i64
-; CHECK-NEXT:    [[G:%.*]] = mul i64 [[F]], %i
-; CHECK-NEXT:    [[H:%.*]] = mul i64 [[G]], %j
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr %s, %s* %p, i64 [[H]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load %s, %s* [[PP1]], align 4
-; CHECK-NEXT:    ret %s [[LOAD]]
+; CHECK-NEXT:    [[G:%.*]] = mul i64 [[F]], [[I:%.*]]
+; CHECK-NEXT:    [[H:%.*]] = mul i64 [[G]], [[J:%.*]]
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [[S:%.*]], %s* [[P:%.*]], i64 [[H]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load [[S]], %s* [[PP1]], align 4
+; CHECK-NEXT:    ret [[S]] %load
 ;
   %a = mul nsw i32 %k, 36
   %b = mul nsw i32 %a, %l
@@ -1179,15 +1170,15 @@ define %s @test78(%s *%p, i64 %i, i64 %j, i32 %k, i32 %l, i128 %m, i128 %n) {
 
 define %s @test79(%s *%p, i64 %i, i32 %j) {
 ; CHECK-LABEL: @test79(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 %i to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = mul i32 [[TMP1]], 36
-; CHECK-NEXT:    [[C:%.*]] = mul i32 [[B]], %j
-; CHECK-NEXT:    [[Q:%.*]] = bitcast %s* %p to i8*
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[B]], [[J:%.*]]
+; CHECK-NEXT:    [[Q:%.*]] = bitcast %s* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[C]] to i64
 ; CHECK-NEXT:    [[PP:%.*]] = getelementptr inbounds i8, i8* [[Q]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[R:%.*]] = bitcast i8* [[PP]] to %s*
-; CHECK-NEXT:    [[L:%.*]] = load %s, %s* [[R]], align 4
-; CHECK-NEXT:    ret %s [[L]]
+; CHECK-NEXT:    [[L:%.*]] = load [[S:%.*]], %s* [[R]], align 4
+; CHECK-NEXT:    ret [[S]] %l
 ;
   %a = mul nsw i64 %i, 36
   %b = trunc i64 %a to i32
@@ -1201,14 +1192,14 @@ define %s @test79(%s *%p, i64 %i, i32 %j) {
 
 define double @test80([100 x double]* %p, i32 %i) {
 ; CHECK-LABEL: @test80(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 %i to i64
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [100 x double], [100 x double]* %p, i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[I:%.*]] to i64
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [100 x double], [100 x double]* [[P:%.*]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[L:%.*]] = load double, double* [[PP1]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
-  %tmp = shl nsw i32 %i, 3
+  %t = shl nsw i32 %i, 3
   %q = bitcast [100 x double]* %p to i8*
-  %pp = getelementptr i8, i8* %q, i32 %tmp
+  %pp = getelementptr i8, i8* %q, i32 %t
   %r = bitcast i8* %pp to double*
   %l = load double, double* %r
   ret double %l
@@ -1216,13 +1207,13 @@ define double @test80([100 x double]* %p, i32 %i) {
 
 define double @test80_addrspacecast([100 x double] addrspace(1)* %p, i32 %i) {
 ; CHECK-LABEL: @test80_addrspacecast(
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [100 x double], [100 x double] addrspace(1)* %p, i32 0, i32 %i
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [100 x double], [100 x double] addrspace(1)* [[P:%.*]], i32 0, i32 [[I:%.*]]
 ; CHECK-NEXT:    [[L:%.*]] = load double, double addrspace(1)* [[PP1]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
-  %tmp = shl nsw i32 %i, 3
+  %t = shl nsw i32 %i, 3
   %q = addrspacecast [100 x double] addrspace(1)* %p to i8 addrspace(2)*
-  %pp = getelementptr i8, i8 addrspace(2)* %q, i32 %tmp
+  %pp = getelementptr i8, i8 addrspace(2)* %q, i32 %t
   %r = addrspacecast i8 addrspace(2)* %pp to double addrspace(1)*
   %l = load double, double addrspace(1)* %r
   ret double %l
@@ -1230,14 +1221,14 @@ define double @test80_addrspacecast([100 x double] addrspace(1)* %p, i32 %i) {
 
 define double @test80_addrspacecast_2([100 x double] addrspace(1)* %p, i32 %i) {
 ; CHECK-LABEL: @test80_addrspacecast_2(
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [100 x double], [100 x double] addrspace(1)* %p, i32 0, i32 %i
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [100 x double], [100 x double] addrspace(1)* [[P:%.*]], i32 0, i32 [[I:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = addrspacecast double addrspace(1)* [[PP1]] to double addrspace(3)*
 ; CHECK-NEXT:    [[L:%.*]] = load double, double addrspace(3)* [[R]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
-  %tmp = shl nsw i32 %i, 3
+  %t = shl nsw i32 %i, 3
   %q = addrspacecast [100 x double] addrspace(1)* %p to i8 addrspace(2)*
-  %pp = getelementptr i8, i8 addrspace(2)* %q, i32 %tmp
+  %pp = getelementptr i8, i8 addrspace(2)* %q, i32 %t
   %r = addrspacecast i8 addrspace(2)* %pp to double addrspace(3)*
   %l = load double, double addrspace(3)* %r
   ret double %l
@@ -1245,14 +1236,14 @@ define double @test80_addrspacecast_2([100 x double] addrspace(1)* %p, i32 %i) {
 
 define double @test80_as1([100 x double] addrspace(1)* %p, i16 %i) {
 ; CHECK-LABEL: @test80_as1(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 %i to i32
-; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [100 x double], [100 x double] addrspace(1)* %p, i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[I:%.*]] to i32
+; CHECK-NEXT:    [[PP1:%.*]] = getelementptr [100 x double], [100 x double] addrspace(1)* [[P:%.*]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[L:%.*]] = load double, double addrspace(1)* [[PP1]], align 8
 ; CHECK-NEXT:    ret double [[L]]
 ;
-  %tmp = shl nsw i16 %i, 3
+  %t = shl nsw i16 %i, 3
   %q = bitcast [100 x double] addrspace(1)* %p to i8 addrspace(1)*
-  %pp = getelementptr i8, i8 addrspace(1)* %q, i16 %tmp
+  %pp = getelementptr i8, i8 addrspace(1)* %q, i16 %t
   %r = bitcast i8 addrspace(1)* %pp to double addrspace(1)*
   %l = load double, double addrspace(1)* %r
   ret double %l
@@ -1260,8 +1251,8 @@ define double @test80_as1([100 x double] addrspace(1)* %p, i16 %i) {
 
 define double @test81(double *%p, float %f) {
 ; CHECK-LABEL: @test81(
-; CHECK-NEXT:    [[I:%.*]] = fptosi float %f to i64
-; CHECK-NEXT:    [[Q:%.*]] = bitcast double* %p to i8*
+; CHECK-NEXT:    [[I:%.*]] = fptosi float [[F:%.*]] to i64
+; CHECK-NEXT:    [[Q:%.*]] = bitcast double* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[PP:%.*]] = getelementptr i8, i8* [[Q]], i64 [[I]]
 ; CHECK-NEXT:    [[R:%.*]] = bitcast i8* [[PP]] to double*
 ; CHECK-NEXT:    [[L:%.*]] = load double, double* [[R]], align 8
@@ -1275,9 +1266,9 @@ define double @test81(double *%p, float %f) {
   ret double %l
 }
 
-define i64 @test82(i64 %A) nounwind {
+define i64 @test82(i64 %A) {
 ; CHECK-LABEL: @test82(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 %A, 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[A:%.*]], 1
 ; CHECK-NEXT:    [[E:%.*]] = and i64 [[TMP1]], 4294966784
 ; CHECK-NEXT:    ret i64 [[E]]
 ;
@@ -1291,8 +1282,8 @@ define i64 @test82(i64 %A) nounwind {
 ; PR15959
 define i64 @test83(i16 %a, i64 %k) {
 ; CHECK-LABEL: @test83(
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 %a to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 %k to i32
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[K:%.*]] to i32
 ; CHECK-NEXT:    [[SH_PROM:%.*]] = add i32 [[TMP1]], -1
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[CONV]], [[SH_PROM]]
 ; CHECK-NEXT:    [[SH_PROM1:%.*]] = zext i32 [[SHL]] to i64
@@ -1308,7 +1299,7 @@ define i64 @test83(i16 %a, i64 %k) {
 
 define i8 @test84(i32 %a) {
 ; CHECK-LABEL: @test84(
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 %a, 2130706432
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A:%.*]], 2130706432
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr exact i32 [[ADD]], 23
 ; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHR]] to i8
 ; CHECK-NEXT:    ret i8 [[TRUNC]]
@@ -1321,7 +1312,7 @@ define i8 @test84(i32 %a) {
 
 define i8 @test85(i32 %a) {
 ; CHECK-LABEL: @test85(
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 %a, 2130706432
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A:%.*]], 2130706432
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr exact i32 [[ADD]], 23
 ; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHR]] to i8
 ; CHECK-NEXT:    ret i8 [[TRUNC]]
@@ -1334,8 +1325,8 @@ define i8 @test85(i32 %a) {
 
 define i16 @test86(i16 %v) {
 ; CHECK-LABEL: @test86(
-; CHECK-NEXT:    [[S1:%.*]] = ashr i16 %v, 4
-; CHECK-NEXT:    ret i16 [[S1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i16 [[V:%.*]], 4
+; CHECK-NEXT:    ret i16 [[TMP1]]
 ;
   %a = sext i16 %v to i32
   %s = ashr i32 %a, 4
@@ -1345,8 +1336,8 @@ define i16 @test86(i16 %v) {
 
 define i16 @test87(i16 %v) {
 ; CHECK-LABEL: @test87(
-; CHECK-NEXT:    [[A1:%.*]] = ashr i16 %v, 12
-; CHECK-NEXT:    ret i16 [[A1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i16 [[V:%.*]], 12
+; CHECK-NEXT:    ret i16 [[TMP1]]
 ;
   %c = sext i16 %v to i32
   %m = mul nsw i32 %c, 16
@@ -1357,7 +1348,7 @@ define i16 @test87(i16 %v) {
 
 define i16 @test88(i16 %v) {
 ; CHECK-LABEL: @test88(
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i16 %v, 15
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i16 [[V:%.*]], 15
 ; CHECK-NEXT:    ret i16 [[TMP1]]
 ;
   %a = sext i16 %v to i32
@@ -1368,7 +1359,7 @@ define i16 @test88(i16 %v) {
 
 define i32 @PR21388(i32* %v) {
 ; CHECK-LABEL: @PR21388(
-; CHECK-NEXT:    [[ICMP:%.*]] = icmp slt i32* %v, null
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp slt i32* [[V:%.*]], null
 ; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 [[ICMP]] to i32
 ; CHECK-NEXT:    ret i32 [[SEXT]]
 ;
@@ -1379,7 +1370,7 @@ define i32 @PR21388(i32* %v) {
 
 define float @sitofp_zext(i16 %a) {
 ; CHECK-LABEL: @sitofp_zext(
-; CHECK-NEXT:    [[SITOFP:%.*]] = uitofp i16 %a to float
+; CHECK-NEXT:    [[SITOFP:%.*]] = uitofp i16 [[A:%.*]] to float
 ; CHECK-NEXT:    ret float [[SITOFP]]
 ;
   %zext = zext i16 %a to i32
@@ -1389,7 +1380,7 @@ define float @sitofp_zext(i16 %a) {
 
 define i1 @PR23309(i32 %A, i32 %B) {
 ; CHECK-LABEL: @PR23309(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 %A, %B
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[SUB]], 1
 ; CHECK-NEXT:    [[TRUNC:%.*]] = icmp ne i32 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[TRUNC]]
@@ -1402,7 +1393,7 @@ define i1 @PR23309(i32 %A, i32 %B) {
 
 define i1 @PR23309v2(i32 %A, i32 %B) {
 ; CHECK-LABEL: @PR23309v2(
-; CHECK-NEXT:    [[SUB:%.*]] = add i32 %A, %B
+; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[SUB]], 1
 ; CHECK-NEXT:    [[TRUNC:%.*]] = icmp ne i32 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[TRUNC]]
@@ -1415,7 +1406,7 @@ define i1 @PR23309v2(i32 %A, i32 %B) {
 
 define i16 @PR24763(i8 %V) {
 ; CHECK-LABEL: @PR24763(
-; CHECK-NEXT:    [[L:%.*]] = ashr i8 %V, 1
+; CHECK-NEXT:    [[L:%.*]] = ashr i8 [[V:%.*]], 1
 ; CHECK-NEXT:    [[T:%.*]] = sext i8 [[L]] to i16
 ; CHECK-NEXT:    ret i16 [[T]]
 ;
@@ -1428,7 +1419,7 @@ define i16 @PR24763(i8 %V) {
 define i64 @PR28745() {
 ; CHECK-LABEL: @PR28745(
 ; CHECK-NEXT:    ret i64 1
-
+;
   %b = zext i32 extractvalue ({ i32 } select (i1 icmp eq (i16 extractelement (<2 x i16> bitcast (<1 x i32> <i32 1> to <2 x i16>), i32 0), i16 0), { i32 } { i32 1 }, { i32 } zeroinitializer), 0) to i64
   ret i64 %b
 }
@@ -1436,20 +1427,22 @@ define i64 @PR28745() {
 define i32 @test89() {
 ; CHECK-LABEL: @test89(
 ; CHECK-NEXT:    ret i32 393216
+;
   ret i32 bitcast (<2 x i16> <i16 6, i16 undef> to i32)
 }
 
 define <2 x i32> @test90() {
 ; CHECK-LABEL: @test90(
-; CHECK: ret <2 x i32> <i32 0, i32 15360>
-  %tmp6 = bitcast <4 x half> <half undef, half undef, half undef, half 0xH3C00> to <2 x i32>
-  ret <2 x i32> %tmp6
+; CHECK-NEXT:    ret <2 x i32> <i32 0, i32 15360>
+;
+  %t6 = bitcast <4 x half> <half undef, half undef, half undef, half 0xH3C00> to <2 x i32>
+  ret <2 x i32> %t6
 }
 
 ; Do not optimize to ashr i64 (shift by 48 > 96 - 64)
 define i64 @test91(i64 %A) {
 ; CHECK-LABEL: @test91(
-; CHECK-NEXT:    [[B:%.*]] = sext i64 %A to i96
+; CHECK-NEXT:    [[B:%.*]] = sext i64 [[A:%.*]] to i96
 ; CHECK-NEXT:    [[C:%.*]] = lshr i96 [[B]], 48
 ; CHECK-NEXT:    [[D:%.*]] = trunc i96 [[C]] to i64
 ; CHECK-NEXT:    ret i64 [[D]]
@@ -1463,8 +1456,8 @@ define i64 @test91(i64 %A) {
 ; Do optimize to ashr i64 (shift by 32 <= 96 - 64)
 define i64 @test92(i64 %A) {
 ; CHECK-LABEL: @test92(
-; CHECK-NEXT:    [[C:%.*]] = ashr i64 %A, 32
-; CHECK-NEXT:    ret i64 [[C]]
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i64 [[A:%.*]], 32
+; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
   %B = sext i64 %A to i96
   %C = lshr i96 %B, 32
@@ -1475,8 +1468,8 @@ define i64 @test92(i64 %A) {
 ; When optimizing to ashr i32, don't shift by more than 31.
 define i32 @test93(i32 %A) {
 ; CHECK-LABEL: @test93(
-; CHECK-NEXT:    [[C:%.*]] = ashr i32 %A, 31
-; CHECK-NEXT:    ret i32 [[C]]
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[A:%.*]], 31
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %B = sext i32 %A to i96
   %C = lshr i96 %B, 64
@@ -1489,8 +1482,8 @@ define i32 @test93(i32 %A) {
 
 define i8 @pr33078_1(i8 %A) {
 ; CHECK-LABEL: @pr33078_1(
-; CHECK-NEXT:    [[C:%.*]] = ashr i8 [[A:%.*]], 7
-; CHECK-NEXT:    ret i8 [[C]]
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i8 [[A:%.*]], 7
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
   %B = sext i8 %A to i16
   %C = lshr i16 %B, 8
@@ -1526,10 +1519,11 @@ define i4 @pr33078_3(i8 %A) {
 define i8 @pr33078_4(i3 %x) {
 ; Don't turn this in an `ashr`. This was getting miscompiled
 ; CHECK-LABEL: @pr33078_4(
-; CHECK-NEXT:    [[B:%.*]] = sext i3 %x to i16
+; CHECK-NEXT:    [[B:%.*]] = sext i3 [[X:%.*]] to i16
 ; CHECK-NEXT:    [[C:%.*]] = lshr i16 [[B]], 13
 ; CHECK-NEXT:    [[D:%.*]] = trunc i16 [[C]] to i8
 ; CHECK-NEXT:    ret i8 [[D]]
+;
   %B = sext i3 %x to i16
   %C = lshr i16 %B, 13
   %D = trunc i16 %C to i8
diff --git a/test/Transforms/InstCombine/cmp-intrinsic.ll b/test/Transforms/InstCombine/cmp-intrinsic.ll
index 7fc1d12916bf88a765e35cd86699830c533253b3..1ea608d5945b79207aec7ac93a95ca6cc4cf3696 100644
--- a/test/Transforms/InstCombine/cmp-intrinsic.ll
+++ b/test/Transforms/InstCombine/cmp-intrinsic.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 declare i16 @llvm.bswap.i16(i16)
@@ -13,7 +14,7 @@ declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
 
 define i1 @bswap_eq_i16(i16 %x) {
 ; CHECK-LABEL: @bswap_eq_i16(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 %x, 256
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[X:%.*]], 256
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %bs = call i16 @llvm.bswap.i16(i16 %x)
@@ -23,7 +24,7 @@ define i1 @bswap_eq_i16(i16 %x) {
 
 define i1 @bswap_ne_i32(i32 %x) {
 ; CHECK-LABEL: @bswap_ne_i32(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 %x, 33554432
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 33554432
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %bs = tail call i32 @llvm.bswap.i32(i32 %x)
@@ -33,7 +34,7 @@ define i1 @bswap_ne_i32(i32 %x) {
 
 define <2 x i1> @bswap_eq_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: @bswap_eq_v2i64(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i64> %x, <i64 216172782113783808, i64 216172782113783808>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i64> [[X:%.*]], <i64 216172782113783808, i64 216172782113783808>
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %bs = tail call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %x)
@@ -43,7 +44,7 @@ define <2 x i1> @bswap_eq_v2i64(<2 x i64> %x) {
 
 define i1 @ctlz_eq_bitwidth_i32(i32 %x) {
 ; CHECK-LABEL: @ctlz_eq_bitwidth_i32(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 %x, 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %lz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
@@ -51,9 +52,84 @@ define i1 @ctlz_eq_bitwidth_i32(i32 %x) {
   ret i1 %cmp
 }
 
+define i1 @ctlz_eq_zero_i32(i32 %x) {
+; CHECK-LABEL: @ctlz_eq_zero_i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %lz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %cmp = icmp eq i32 %lz, 0
+  ret i1 %cmp
+}
+
+define <2 x i1> @ctlz_ne_zero_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @ctlz_ne_zero_v2i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <2 x i32> [[A:%.*]], <i32 -1, i32 -1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
+  %cmp = icmp ne <2 x i32> %x, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define i1 @ctlz_eq_bw_minus_1_i32(i32 %x) {
+; CHECK-LABEL: @ctlz_eq_bw_minus_1_i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %lz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %cmp = icmp eq i32 %lz, 31
+  ret i1 %cmp
+}
+
+define <2 x i1> @ctlz_ne_bw_minus_1_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @ctlz_ne_bw_minus_1_v2i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[A:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
+  %cmp = icmp ne <2 x i32> %x, <i32 31, i32 31>
+  ret <2 x i1> %cmp
+}
+
+define i1 @ctlz_eq_other_i32(i32 %x) {
+; CHECK-LABEL: @ctlz_eq_other_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[X:%.*]], -128
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], 128
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %lz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %cmp = icmp eq i32 %lz, 24
+  ret i1 %cmp
+}
+
+define <2 x i1> @ctlz_ne_other_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @ctlz_ne_other_v2i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], <i32 -128, i32 -128>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[TMP1]], <i32 128, i32 128>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
+  %cmp = icmp ne <2 x i32> %x, <i32 24, i32 24>
+  ret <2 x i1> %cmp
+}
+
+define i1 @ctlz_eq_other_i32_multiuse(i32 %x, i32* %p) {
+; CHECK-LABEL: @ctlz_eq_other_i32_multiuse(
+; CHECK-NEXT:    [[LZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    store i32 [[LZ]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[LZ]], 24
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %lz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  store i32 %lz, i32* %p
+  %cmp = icmp eq i32 %lz, 24
+  ret i1 %cmp
+}
+
 define <2 x i1> @ctlz_ne_bitwidth_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: @ctlz_ne_bitwidth_v2i32(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> %a, zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
@@ -63,7 +139,7 @@ define <2 x i1> @ctlz_ne_bitwidth_v2i32(<2 x i32> %a) {
 
 define i1 @cttz_ne_bitwidth_i33(i33 %x) {
 ; CHECK-LABEL: @cttz_ne_bitwidth_i33(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i33 %x, 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i33 [[X:%.*]], 0
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %tz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false)
@@ -73,7 +149,7 @@ define i1 @cttz_ne_bitwidth_i33(i33 %x) {
 
 define <2 x i1> @cttz_eq_bitwidth_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: @cttz_eq_bitwidth_v2i32(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i32> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
@@ -81,9 +157,86 @@ define <2 x i1> @cttz_eq_bitwidth_v2i32(<2 x i32> %a) {
   ret <2 x i1> %cmp
 }
 
+define i1 @cttz_eq_zero_i33(i33 %x) {
+; CHECK-LABEL: @cttz_eq_zero_i33(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i33 [[X:%.*]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i33 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %tz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false)
+  %cmp = icmp eq i33 %tz, 0
+  ret i1 %cmp
+}
+
+define <2 x i1> @cttz_ne_zero_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @cttz_ne_zero_v2i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
+  %cmp = icmp ne <2 x i32> %x, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define i1 @cttz_eq_bw_minus_1_i33(i33 %x) {
+; CHECK-LABEL: @cttz_eq_bw_minus_1_i33(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i33 [[X:%.*]], -4294967296
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %tz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false)
+  %cmp = icmp eq i33 %tz, 32
+  ret i1 %cmp
+}
+
+define <2 x i1> @cttz_ne_bw_minus_1_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @cttz_ne_bw_minus_1_v2i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[A:%.*]], <i32 -2147483648, i32 -2147483648>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
+  %cmp = icmp ne <2 x i32> %x, <i32 31, i32 31>
+  ret <2 x i1> %cmp
+}
+
+define i1 @cttz_eq_other_i33(i33 %x) {
+; CHECK-LABEL: @cttz_eq_other_i33(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i33 [[X:%.*]], 31
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i33 [[TMP1]], 16
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %tz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false)
+  %cmp = icmp eq i33 %tz, 4
+  ret i1 %cmp
+}
+
+define <2 x i1> @cttz_ne_other_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @cttz_ne_other_v2i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], <i32 31, i32 31>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[TMP1]], <i32 16, i32 16>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
+  %cmp = icmp ne <2 x i32> %x, <i32 4, i32 4>
+  ret <2 x i1> %cmp
+}
+
+define i1 @cttz_eq_other_i33_multiuse(i33 %x, i33* %p) {
+; CHECK-LABEL: @cttz_eq_other_i33_multiuse(
+; CHECK-NEXT:    [[LZ:%.*]] = tail call i33 @llvm.cttz.i33(i33 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    store i33 [[LZ]], i33* [[P:%.*]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i33 [[LZ]], 4
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %lz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false)
+  store i33 %lz, i33* %p
+  %cmp = icmp eq i33 %lz, 4
+  ret i1 %cmp
+}
+
 define i1 @ctpop_eq_zero_i11(i11 %x) {
 ; CHECK-LABEL: @ctpop_eq_zero_i11(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i11 %x, 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i11 [[X:%.*]], 0
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %pop = tail call i11 @llvm.ctpop.i11(i11 %x)
@@ -93,7 +246,7 @@ define i1 @ctpop_eq_zero_i11(i11 %x) {
 
 define <2 x i1> @ctpop_ne_zero_v2i32(<2 x i32> %x) {
 ; CHECK-LABEL: @ctpop_ne_zero_v2i32(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> %x, zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %pop = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
@@ -103,7 +256,7 @@ define <2 x i1> @ctpop_ne_zero_v2i32(<2 x i32> %x) {
 
 define i1 @ctpop_eq_bitwidth_i8(i8 %x) {
 ; CHECK-LABEL: @ctpop_eq_bitwidth_i8(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 %x, -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[X:%.*]], -1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %pop = tail call i8 @llvm.ctpop.i8(i8 %x)
@@ -113,7 +266,7 @@ define i1 @ctpop_eq_bitwidth_i8(i8 %x) {
 
 define <2 x i1> @ctpop_ne_bitwidth_v2i32(<2 x i32> %x) {
 ; CHECK-LABEL: @ctpop_ne_bitwidth_v2i32(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> %x, <i32 -1, i32 -1>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[X:%.*]], <i32 -1, i32 -1>
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %pop = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
diff --git a/test/Transforms/InstCombine/fsh.ll b/test/Transforms/InstCombine/fsh.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f4c54ad3c9227612681cc1ce6215ac537eccbf99
--- /dev/null
+++ b/test/Transforms/InstCombine/fsh.ll
@@ -0,0 +1,430 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare i33 @llvm.fshr.i33(i33, i33, i33)
+declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+declare <2 x i31> @llvm.fshl.v2i31(<2 x i31>, <2 x i31>, <2 x i31>)
+
+; If the shift mask doesn't include any demanded bits, the funnel shift can be eliminated.
+
+define i32 @fshl_mask_simplify1(i32 %x, i32 %y, i32 %sh) {
+; CHECK-LABEL: @fshl_mask_simplify1(
+; CHECK-NEXT:    ret i32 [[X:%.*]]
+;
+  %maskedsh = and i32 %sh, 32
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %maskedsh)
+  ret i32 %r
+}
+
+define <2 x i32> @fshr_mask_simplify2(<2 x i32> %x, <2 x i32> %y, <2 x i32> %sh) {
+; CHECK-LABEL: @fshr_mask_simplify2(
+; CHECK-NEXT:    ret <2 x i32> [[Y:%.*]]
+;
+  %maskedsh = and <2 x i32> %sh, <i32 64, i32 64>
+  %r = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %maskedsh)
+  ret <2 x i32> %r
+}
+
+; Negative test.
+
+define i32 @fshl_mask_simplify3(i32 %x, i32 %y, i32 %sh) {
+; CHECK-LABEL: @fshl_mask_simplify3(
+; CHECK-NEXT:    [[MASKEDSH:%.*]] = and i32 [[SH:%.*]], 16
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[MASKEDSH]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %maskedsh = and i32 %sh, 16
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %maskedsh)
+  ret i32 %r
+}
+
+; Check again with weird bitwidths - the analysis is invalid with non-power-of-2.
+
+define i33 @fshr_mask_simplify1(i33 %x, i33 %y, i33 %sh) {
+; CHECK-LABEL: @fshr_mask_simplify1(
+; CHECK-NEXT:    [[MASKEDSH:%.*]] = and i33 [[SH:%.*]], 64
+; CHECK-NEXT:    [[R:%.*]] = call i33 @llvm.fshr.i33(i33 [[X:%.*]], i33 [[Y:%.*]], i33 [[MASKEDSH]])
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %maskedsh = and i33 %sh, 64
+  %r = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 %maskedsh)
+  ret i33 %r
+}
+
+; Check again with weird bitwidths - the analysis is invalid with non-power-of-2.
+
+define <2 x i31> @fshl_mask_simplify2(<2 x i31> %x, <2 x i31> %y, <2 x i31> %sh) {
+; CHECK-LABEL: @fshl_mask_simplify2(
+; CHECK-NEXT:    [[MASKEDSH:%.*]] = and <2 x i31> [[SH:%.*]], <i31 32, i31 32>
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> [[X:%.*]], <2 x i31> [[Y:%.*]], <2 x i31> [[MASKEDSH]])
+; CHECK-NEXT:    ret <2 x i31> [[R]]
+;
+  %maskedsh = and <2 x i31> %sh, <i31 32, i31 32>
+  %r = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> %y, <2 x i31> %maskedsh)
+  ret <2 x i31> %r
+}
+
+; Check again with weird bitwidths - the analysis is invalid with non-power-of-2.
+
+define i33 @fshr_mask_simplify3(i33 %x, i33 %y, i33 %sh) {
+; CHECK-LABEL: @fshr_mask_simplify3(
+; CHECK-NEXT:    [[MASKEDSH:%.*]] = and i33 [[SH:%.*]], 32
+; CHECK-NEXT:    [[R:%.*]] = call i33 @llvm.fshr.i33(i33 [[X:%.*]], i33 [[Y:%.*]], i33 [[MASKEDSH]])
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %maskedsh = and i33 %sh, 32
+  %r = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 %maskedsh)
+  ret i33 %r
+}
+
+; This mask op is unnecessary.
+
+define i32 @fshl_mask_not_required(i32 %x, i32 %y, i32 %sh) {
+; CHECK-LABEL: @fshl_mask_not_required(
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[SH:%.*]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %maskedsh = and i32 %sh, 31
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %maskedsh)
+  ret i32 %r
+}
+
+; This mask op can be reduced.
+
+define i32 @fshl_mask_reduce_constant(i32 %x, i32 %y, i32 %sh) {
+; CHECK-LABEL: @fshl_mask_reduce_constant(
+; CHECK-NEXT:    [[MASKEDSH:%.*]] = and i32 [[SH:%.*]], 1
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[MASKEDSH]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %maskedsh = and i32 %sh, 33
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %maskedsh)
+  ret i32 %r
+}
+
+; But this mask op is required.
+
+define i32 @fshl_mask_negative(i32 %x, i32 %y, i32 %sh) {
+; CHECK-LABEL: @fshl_mask_negative(
+; CHECK-NEXT:    [[MASKEDSH:%.*]] = and i32 [[SH:%.*]], 15
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[MASKEDSH]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %maskedsh = and i32 %sh, 15
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %maskedsh)
+  ret i32 %r
+}
+
+; The transform is not limited to mask ops.
+
+define <2 x i32> @fshr_set_but_not_demanded_vec(<2 x i32> %x, <2 x i32> %y, <2 x i32> %sh) {
+; CHECK-LABEL: @fshr_set_but_not_demanded_vec(
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[SH:%.*]])
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %bogusbits = or <2 x i32> %sh, <i32 32, i32 32>
+  %r = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %bogusbits)
+  ret <2 x i32> %r
+}
+
+; Check again with weird bitwidths - the analysis is invalid with non-power-of-2.
+
+define <2 x i31> @fshl_set_but_not_demanded_vec(<2 x i31> %x, <2 x i31> %y, <2 x i31> %sh) {
+; CHECK-LABEL: @fshl_set_but_not_demanded_vec(
+; CHECK-NEXT:    [[BOGUSBITS:%.*]] = or <2 x i31> [[SH:%.*]], <i31 32, i31 32>
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> [[X:%.*]], <2 x i31> [[Y:%.*]], <2 x i31> [[BOGUSBITS]])
+; CHECK-NEXT:    ret <2 x i31> [[R]]
+;
+  %bogusbits = or <2 x i31> %sh, <i31 32, i31 32>
+  %r = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> %y, <2 x i31> %bogusbits)
+  ret <2 x i31> %r
+}
+
+; Simplify one undef or zero operand and constant shift amount.
+
+define i32 @fshl_op0_undef(i32 %x) {
+; CHECK-LABEL: @fshl_op0_undef(
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[X:%.*]], 25
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %r = call i32 @llvm.fshl.i32(i32 undef, i32 %x, i32 7)
+  ret i32 %r
+}
+
+define i32 @fshl_op0_zero(i32 %x) {
+; CHECK-LABEL: @fshl_op0_zero(
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[X:%.*]], 25
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %r = call i32 @llvm.fshl.i32(i32 0, i32 %x, i32 7)
+  ret i32 %r
+}
+
+define i33 @fshr_op0_undef(i33 %x) {
+; CHECK-LABEL: @fshr_op0_undef(
+; CHECK-NEXT:    [[R:%.*]] = lshr i33 [[X:%.*]], 7
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %r = call i33 @llvm.fshr.i33(i33 undef, i33 %x, i33 7)
+  ret i33 %r
+}
+
+define i33 @fshr_op0_zero(i33 %x) {
+; CHECK-LABEL: @fshr_op0_zero(
+; CHECK-NEXT:    [[R:%.*]] = lshr i33 [[X:%.*]], 7
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %r = call i33 @llvm.fshr.i33(i33 0, i33 %x, i33 7)
+  ret i33 %r
+}
+
+define i32 @fshl_op1_undef(i32 %x) {
+; CHECK-LABEL: @fshl_op1_undef(
+; CHECK-NEXT:    [[R:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 undef, i32 7)
+  ret i32 %r
+}
+
+define i32 @fshl_op1_zero(i32 %x) {
+; CHECK-LABEL: @fshl_op1_zero(
+; CHECK-NEXT:    [[R:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 0, i32 7)
+  ret i32 %r
+}
+
+define i33 @fshr_op1_undef(i33 %x) {
+; CHECK-LABEL: @fshr_op1_undef(
+; CHECK-NEXT:    [[R:%.*]] = shl i33 [[X:%.*]], 26
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %r = call i33 @llvm.fshr.i33(i33 %x, i33 undef, i33 7)
+  ret i33 %r
+}
+
+define i33 @fshr_op1_zero(i33 %x) {
+; CHECK-LABEL: @fshr_op1_zero(
+; CHECK-NEXT:    [[R:%.*]] = shl i33 [[X:%.*]], 26
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %r = call i33 @llvm.fshr.i33(i33 %x, i33 0, i33 7)
+  ret i33 %r
+}
+
+define <2 x i31> @fshl_op0_zero_vec(<2 x i31> %x) {
+; CHECK-LABEL: @fshl_op0_zero_vec(
+; CHECK-NEXT:    [[R:%.*]] = lshr <2 x i31> [[X:%.*]], <i31 24, i31 24>
+; CHECK-NEXT:    ret <2 x i31> [[R]]
+;
+  %r = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> zeroinitializer, <2 x i31> %x, <2 x i31> <i31 7, i31 7>)
+  ret <2 x i31> %r
+}
+
+define <2 x i31> @fshl_op1_undef_vec(<2 x i31> %x) {
+; CHECK-LABEL: @fshl_op1_undef_vec(
+; CHECK-NEXT:    [[R:%.*]] = shl <2 x i31> [[X:%.*]], <i31 7, i31 7>
+; CHECK-NEXT:    ret <2 x i31> [[R]]
+;
+  %r = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> undef, <2 x i31> <i31 7, i31 7>)
+  ret <2 x i31> %r
+}
+
+define <2 x i32> @fshr_op0_undef_vec(<2 x i32> %x) {
+; CHECK-LABEL: @fshr_op0_undef_vec(
+; CHECK-NEXT:    [[R:%.*]] = lshr <2 x i32> [[X:%.*]], <i32 7, i32 7>
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %r = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> undef, <2 x i32> %x, <2 x i32> <i32 7, i32 7>)
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @fshr_op1_zero_vec(<2 x i32> %x) {
+; CHECK-LABEL: @fshr_op1_zero_vec(
+; CHECK-NEXT:    [[R:%.*]] = shl <2 x i32> [[X:%.*]], <i32 25, i32 25>
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %r = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> zeroinitializer, <2 x i32> <i32 7, i32 7>)
+  ret <2 x i32> %r
+}
+
+; Only demand bits from one of the operands.
+
+define i32 @fshl_only_op0_demanded(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_only_op0_demanded(
+; CHECK-NEXT:    [[Z:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = and i32 [[Z]], 128
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
+  %r = and i32 %z, 128
+  ret i32 %r
+}
+
+define i32 @fshl_only_op1_demanded(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_only_op1_demanded(
+; CHECK-NEXT:    [[Z:%.*]] = lshr i32 [[Y:%.*]], 25
+; CHECK-NEXT:    [[R:%.*]] = and i32 [[Z]], 63
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
+  %r = and i32 %z, 63
+  ret i32 %r
+}
+
+define i33 @fshr_only_op1_demanded(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_only_op1_demanded(
+; CHECK-NEXT:    [[Z:%.*]] = lshr i33 [[Y:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = and i33 [[Z]], 12392
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %z = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 7)
+  %r = and i33 %z, 12392
+  ret i33 %r
+}
+
+define i33 @fshr_only_op0_demanded(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_only_op0_demanded(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i33 [[X:%.*]], 4
+; CHECK-NEXT:    [[R:%.*]] = and i33 [[TMP1]], 7
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %z = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 7)
+  %r = lshr i33 %z, 30
+  ret i33 %r
+}
+
+define <2 x i31> @fshl_only_op1_demanded_vec_splat(<2 x i31> %x, <2 x i31> %y) {
+; CHECK-LABEL: @fshl_only_op1_demanded_vec_splat(
+; CHECK-NEXT:    [[Z:%.*]] = lshr <2 x i31> [[Y:%.*]], <i31 24, i31 24>
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i31> [[Z]], <i31 63, i31 31>
+; CHECK-NEXT:    ret <2 x i31> [[R]]
+;
+  %z = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> %y, <2 x i31> <i31 7, i31 7>)
+  %r = and <2 x i31> %z, <i31 63, i31 31>
+  ret <2 x i31> %r
+}
+
+; The shift modulo bitwidth is the same for all vector elements, but this is not simplified yet.
+define <2 x i31> @fshl_only_op1_demanded_vec_nonsplat(<2 x i31> %x, <2 x i31> %y) {
+; CHECK-LABEL: @fshl_only_op1_demanded_vec_nonsplat(
+; CHECK-NEXT:    [[Z:%.*]] = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> [[X:%.*]], <2 x i31> [[Y:%.*]], <2 x i31> <i31 7, i31 38>)
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i31> [[Z]], <i31 63, i31 31>
+; CHECK-NEXT:    ret <2 x i31> [[R]]
+;
+  %z = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> %y, <2 x i31> <i31 7, i31 38>)
+  %r = and <2 x i31> %z, <i31 63, i31 31>
+  ret <2 x i31> %r
+}
+
+; Demand bits from both operands -- cannot simplify.
+
+define i32 @fshl_both_ops_demanded(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_both_ops_demanded(
+; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 7)
+; CHECK-NEXT:    [[R:%.*]] = and i32 [[Z]], 192
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
+  %r = and i32 %z, 192
+  ret i32 %r
+}
+
+define i33 @fshr_both_ops_demanded(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_both_ops_demanded(
+; CHECK-NEXT:    [[Z:%.*]] = call i33 @llvm.fshr.i33(i33 [[X:%.*]], i33 [[Y:%.*]], i33 26)
+; CHECK-NEXT:    [[R:%.*]] = and i33 [[Z]], 192
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %z = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 26)
+  %r = and i33 %z, 192
+  ret i33 %r
+}
+
+; Both operands are demanded, but there are known bits.
+
+define i32 @fshl_known_bits(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_known_bits(
+; CHECK-NEXT:    ret i32 128
+;
+  %x2 = or i32 %x, 1   ; lo bit set
+  %y2 = lshr i32 %y, 1 ; hi bit clear
+  %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 7)
+  %r = and i32 %z, 192
+  ret i32 %r
+}
+
+define i33 @fshr_known_bits(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_known_bits(
+; CHECK-NEXT:    ret i33 128
+;
+  %x2 = or i33 %x, 1 ; lo bit set
+  %y2 = lshr i33 %y, 1 ; hi bit set
+  %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 26)
+  %r = and i33 %z, 192
+  ret i33 %r
+}
+
+; This case fails to simplify due to multiple uses.
+
+define i33 @fshr_multi_use(i33 %a) {
+; CHECK-LABEL: @fshr_multi_use(
+; CHECK-NEXT:    [[B:%.*]] = tail call i33 @llvm.fshr.i33(i33 [[A:%.*]], i33 [[A]], i33 1)
+; CHECK-NEXT:    [[C:%.*]] = lshr i33 [[B]], 23
+; CHECK-NEXT:    [[D:%.*]] = xor i33 [[C]], [[B]]
+; CHECK-NEXT:    [[E:%.*]] = and i33 [[D]], 31
+; CHECK-NEXT:    ret i33 [[E]]
+;
+  %b = tail call i33 @llvm.fshr.i33(i33 %a, i33 %a, i33 1)
+  %c = lshr i33 %b, 23
+  %d = xor i33 %c, %b
+  %e = and i33 %d, 31
+  ret i33 %e
+}
+
+; This demonstrates the same simplification working if the fshr intrinsic
+; is expanded into shifts and or.
+
+define i33 @expanded_fshr_multi_use(i33 %a) {
+; CHECK-LABEL: @expanded_fshr_multi_use(
+; CHECK-NEXT:    [[TMP:%.*]] = lshr i33 [[A:%.*]], 1
+; CHECK-NEXT:    [[C:%.*]] = lshr i33 [[A]], 24
+; CHECK-NEXT:    [[D:%.*]] = xor i33 [[C]], [[TMP]]
+; CHECK-NEXT:    [[E:%.*]] = and i33 [[D]], 31
+; CHECK-NEXT:    ret i33 [[E]]
+;
+  %tmp = lshr i33 %a, 1
+  %tmp2 = shl i33 %a, 32
+  %b = or i33 %tmp, %tmp2
+  %c = lshr i33 %b, 23
+  %d = xor i33 %c, %b
+  %e = and i33 %d, 31
+  ret i33 %e
+}
+
+declare i16 @llvm.fshl.i16(i16, i16, i16)
+declare i16 @llvm.fshr.i16(i16, i16, i16)
+
+; Special-case: rotate a 16-bit value left/right by 8-bits is bswap.
+
+define i16 @fshl_bswap(i16 %x) {
+; CHECK-LABEL: @fshl_bswap(
+; CHECK-NEXT:    [[R:%.*]] = call i16 @llvm.fshl.i16(i16 [[X:%.*]], i16 [[X]], i16 8)
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %r = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 8)
+  ret i16 %r
+}
+
+define i16 @fshr_bswap(i16 %x) {
+; CHECK-LABEL: @fshr_bswap(
+; CHECK-NEXT:    [[R:%.*]] = call i16 @llvm.fshr.i16(i16 [[X:%.*]], i16 [[X]], i16 8)
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %r = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 8)
+  ret i16 %r
+}
+
diff --git a/test/Transforms/InstCombine/icmp-dom.ll b/test/Transforms/InstCombine/icmp-dom.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7676eaaf08baca11e3e3a5ffb4e06681246d591d
--- /dev/null
+++ b/test/Transforms/InstCombine/icmp-dom.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @idom_sign_bit_check_edge_dominates(i64 %a) {
+; CHECK-LABEL: @idom_sign_bit_check_edge_dominates(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[LOR_RHS:%.*]]
+; CHECK:       land.lhs.true:
+; CHECK-NEXT:    br label [[LOR_END:%.*]]
+; CHECK:       lor.rhs:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i64 [[A]], 0
+; CHECK-NEXT:    br i1 [[CMP2]], label [[LOR_END]], label [[LAND_RHS:%.*]]
+; CHECK:       land.rhs:
+; CHECK-NEXT:    br label [[LOR_END]]
+; CHECK:       lor.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp slt i64 %a, 0
+  br i1 %cmp, label %land.lhs.true, label %lor.rhs
+
+land.lhs.true:
+  br label %lor.end
+
+lor.rhs:
+  %cmp2 = icmp sgt i64 %a, 0
+  br i1 %cmp2, label %land.rhs, label %lor.end
+
+land.rhs:
+  br label %lor.end
+
+lor.end:
+  ret void
+}
+
+define void @idom_sign_bit_check_edge_not_dominates(i64 %a) {
+; CHECK-LABEL: @idom_sign_bit_check_edge_not_dominates(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[LOR_RHS:%.*]]
+; CHECK:       land.lhs.true:
+; CHECK-NEXT:    br i1 undef, label [[LOR_END:%.*]], label [[LOR_RHS]]
+; CHECK:       lor.rhs:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[A]], 0
+; CHECK-NEXT:    br i1 [[CMP2]], label [[LAND_RHS:%.*]], label [[LOR_END]]
+; CHECK:       land.rhs:
+; CHECK-NEXT:    br label [[LOR_END]]
+; CHECK:       lor.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp slt i64 %a, 0
+  br i1 %cmp, label %land.lhs.true, label %lor.rhs
+
+land.lhs.true:
+  br i1 undef, label %lor.end, label %lor.rhs
+
+lor.rhs:
+  %cmp2 = icmp sgt i64 %a, 0
+  br i1 %cmp2, label %land.rhs, label %lor.end
+
+land.rhs:
+  br label %lor.end
+
+lor.end:
+  ret void
+}
+
+define void @idom_sign_bit_check_edge_dominates_select(i64 %a, i64 %b) {
+; CHECK-LABEL: @idom_sign_bit_check_edge_dominates_select(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[A:%.*]], 5
+; CHECK-NEXT:    br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[LOR_RHS:%.*]]
+; CHECK:       land.lhs.true:
+; CHECK-NEXT:    br label [[LOR_END:%.*]]
+; CHECK:       lor.rhs:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i64 [[A]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOR_END]], label [[LAND_RHS:%.*]]
+; CHECK:       land.rhs:
+; CHECK-NEXT:    br label [[LOR_END]]
+; CHECK:       lor.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp slt i64 %a, 5
+  br i1 %cmp, label %land.lhs.true, label %lor.rhs
+
+land.lhs.true:
+  br label %lor.end
+
+lor.rhs:
+  %cmp2 = icmp sgt i64 %a, 5
+  %select = select i1 %cmp2, i64 %a, i64 5
+  %cmp3 = icmp ne i64 %select, %b
+  br i1 %cmp3, label %land.rhs, label %lor.end
+
+land.rhs:
+  br label %lor.end
+
+lor.end:
+  ret void
+}
+
+define void @idom_zbranch(i64 %a) {
+; CHECK-LABEL: @idom_zbranch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOR_END:%.*]], label [[LOR_RHS:%.*]]
+; CHECK:       lor.rhs:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i64 [[A]], 0
+; CHECK-NEXT:    br i1 [[CMP2]], label [[LAND_RHS:%.*]], label [[LOR_END]]
+; CHECK:       land.rhs:
+; CHECK-NEXT:    br label [[LOR_END]]
+; CHECK:       lor.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i64 %a, 0
+  br i1 %cmp, label %lor.end, label %lor.rhs
+
+lor.rhs:
+  %cmp2 = icmp slt i64 %a, 0
+  br i1 %cmp2, label %land.rhs, label %lor.end
+
+land.rhs:
+  br label %lor.end
+
+lor.end:
+  ret void
+}
+
+define void @idom_not_zbranch(i32 %a, i32 %b) {
+; CHECK-LABEL: @idom_not_zbranch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[A]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP2]], label [[RETURN]], label [[IF_THEN3:%.*]]
+; CHECK:       if.then3:
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i32 %a, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:
+  %cmp1 = icmp slt i32 %a, 0
+  %a. = select i1 %cmp1, i32 %a, i32 0
+  %cmp2 = icmp ne i32 %a., %b
+  br i1 %cmp2, label %if.then3, label %return
+
+if.then3:
+  br label %return
+
+return:
+  ret void
+}
+
+define i1 @trueblock_cmp_is_false(i32 %x, i32 %y) {
+; CHECK-LABEL: @trueblock_cmp_is_false(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       f:
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %cmp = icmp sgt i32 %x, %y
+  br i1 %cmp, label %t, label %f
+t:
+  %cmp2 = icmp slt i32 %x, %y
+  ret i1 %cmp2
+f:
+  ret i1 %cmp
+}
+
+define i1 @trueblock_cmp_is_false_commute(i32 %x, i32 %y) {
+; CHECK-LABEL: @trueblock_cmp_is_false_commute(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       f:
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %cmp = icmp eq i32 %x, %y
+  br i1 %cmp, label %t, label %f
+t:
+  %cmp2 = icmp sgt i32 %y, %x
+  ret i1 %cmp2
+f:
+  ret i1 %cmp
+}
+
+define i1 @trueblock_cmp_is_true(i32 %x, i32 %y) {
+; CHECK-LABEL: @trueblock_cmp_is_true(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       f:
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %cmp = icmp ult i32 %x, %y
+  br i1 %cmp, label %t, label %f
+t:
+  %cmp2 = icmp ne i32 %x, %y
+  ret i1 %cmp2
+f:
+  ret i1 %cmp
+}
+
+define i1 @trueblock_cmp_is_true_commute(i32 %x, i32 %y) {
+; CHECK-LABEL: @trueblock_cmp_is_true_commute(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       f:
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %cmp = icmp ugt i32 %x, %y
+  br i1 %cmp, label %t, label %f
+t:
+  %cmp2 = icmp ne i32 %y, %x
+  ret i1 %cmp2
+f:
+  ret i1 %cmp
+}
+
+define i1 @falseblock_cmp_is_false(i32 %x, i32 %y) {
+; CHECK-LABEL: @falseblock_cmp_is_false(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK:       f:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cmp = icmp sle i32 %x, %y
+  br i1 %cmp, label %t, label %f
+t:
+  ret i1 %cmp
+f:
+  %cmp2 = icmp slt i32 %x, %y
+  ret i1 %cmp2
+}
+
+define i1 @falseblock_cmp_is_false_commute(i32 %x, i32 %y) {
+; CHECK-LABEL: @falseblock_cmp_is_false_commute(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK:       f:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cmp = icmp eq i32 %x, %y
+  br i1 %cmp, label %t, label %f
+t:
+  ret i1 %cmp
+f:
+  %cmp2 = icmp eq i32 %y, %x
+  ret i1 %cmp2
+}
+
+define i1 @falseblock_cmp_is_true(i32 %x, i32 %y) {
+; CHECK-LABEL: @falseblock_cmp_is_true(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK:       f:
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %cmp = icmp ult i32 %x, %y
+  br i1 %cmp, label %t, label %f
+t:
+  ret i1 %cmp
+f:
+  %cmp2 = icmp uge i32 %x, %y
+  ret i1 %cmp2
+}
+
+define i1 @falseblock_cmp_is_true_commute(i32 %x, i32 %y) {
+; CHECK-LABEL: @falseblock_cmp_is_true_commute(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK:       f:
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %cmp = icmp sgt i32 %x, %y
+  br i1 %cmp, label %t, label %f
+t:
+  ret i1 %cmp
+f:
+  %cmp2 = icmp sge i32 %y, %x
+  ret i1 %cmp2
+}
+
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 1f05bb67e9656ca87ba0e28c4e75696eb948ddd8..7bf0cc9de8295e79c2d3d53dc92b4ac22b89eada 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -2875,125 +2875,6 @@ define i1 @cmp_inverse_mask_bits_set_ne(i32 %x) {
   ret i1 %cmp
 }
 
-; CHECK-LABEL: @idom_sign_bit_check_edge_dominates
-define void @idom_sign_bit_check_edge_dominates(i64 %a) {
-entry:
-  %cmp = icmp slt i64 %a, 0
-  br i1 %cmp, label %land.lhs.true, label %lor.rhs
-
-land.lhs.true:                                    ; preds = %entry
-  br label %lor.end
-
-; CHECK-LABEL: lor.rhs:
-; CHECK-NOT: icmp sgt i64 %a, 0
-; CHECK: icmp eq i64 %a, 0
-lor.rhs:                                          ; preds = %entry
-  %cmp2 = icmp sgt i64 %a, 0
-  br i1 %cmp2, label %land.rhs, label %lor.end
-
-land.rhs:                                         ; preds = %lor.rhs
-  br label %lor.end
-
-lor.end:                                          ; preds = %land.rhs, %lor.rhs, %land.lhs.true
-  ret void
-}
-
-; CHECK-LABEL: @idom_sign_bit_check_edge_not_dominates
-define void @idom_sign_bit_check_edge_not_dominates(i64 %a) {
-entry:
-  %cmp = icmp slt i64 %a, 0
-  br i1 %cmp, label %land.lhs.true, label %lor.rhs
-
-land.lhs.true:                                    ; preds = %entry
-  br i1 undef, label %lor.end, label %lor.rhs
-
-; CHECK-LABEL: lor.rhs:
-; CHECK: icmp sgt i64 %a, 0
-; CHECK-NOT: icmp eq i64 %a, 0
-lor.rhs:                                          ; preds = %land.lhs.true, %entry
-  %cmp2 = icmp sgt i64 %a, 0
-  br i1 %cmp2, label %land.rhs, label %lor.end
-
-land.rhs:                                         ; preds = %lor.rhs
-  br label %lor.end
-
-lor.end:                                          ; preds = %land.rhs, %lor.rhs, %land.lhs.true
-  ret void
-}
-
-; CHECK-LABEL: @idom_sign_bit_check_edge_dominates_select
-define void @idom_sign_bit_check_edge_dominates_select(i64 %a, i64 %b) {
-entry:
-  %cmp = icmp slt i64 %a, 5
-  br i1 %cmp, label %land.lhs.true, label %lor.rhs
-
-land.lhs.true:                                    ; preds = %entry
-  br label %lor.end
-
-; CHECK-LABEL: lor.rhs:
-; CHECK-NOT: [[B:%.*]] = icmp sgt i64 %a, 5
-; CHECK: [[C:%.*]] = icmp eq i64 %a, %b
-; CHECK-NOT: [[D:%.*]] = select i1 [[B]], i64 %a, i64 5
-; CHECK-NOT: icmp ne i64 [[D]], %b
-; CHECK-NEXT: br i1 [[C]], label %lor.end, label %land.rhs
-lor.rhs:                                          ; preds = %entry
-  %cmp2 = icmp sgt i64 %a, 5
-  %select = select i1 %cmp2, i64 %a, i64 5
-  %cmp3 = icmp ne i64 %select, %b
-  br i1 %cmp3, label %land.rhs, label %lor.end
-
-land.rhs:                                         ; preds = %lor.rhs
-  br label %lor.end
-
-lor.end:                                          ; preds = %land.rhs, %lor.rhs, %land.lhs.true
-  ret void
-}
-
-; CHECK-LABEL: @idom_zbranch
-define void @idom_zbranch(i64 %a) {
-entry:
-  %cmp = icmp sgt i64 %a, 0
-  br i1 %cmp, label %lor.end, label %lor.rhs
-
-; CHECK-LABEL: lor.rhs:
-; CHECK: icmp slt i64 %a, 0
-; CHECK-NOT: icmp eq i64 %a, 0
-lor.rhs:                                          ; preds = %entry
-  %cmp2 = icmp slt i64 %a, 0
-  br i1 %cmp2, label %land.rhs, label %lor.end
-
-land.rhs:                                         ; preds = %lor.rhs
-  br label %lor.end
-
-lor.end:                                          ; preds = %land.rhs, %lor.rhs
-  ret void
-}
-
-; CHECK-LABEL: @idom_not_zbranch
-define void @idom_not_zbranch(i32 %a, i32 %b) {
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %return, label %if.end
-
-; CHECK-LABEL: if.end:
-; CHECK-NOT: [[B:%.*]] = icmp slt i32 %a, 0
-; CHECK: [[C:%.*]] = icmp eq i32 %a, %b
-; CHECK-NOT: [[D:%.*]] = select i1 [[B]], i32 %a, i32 0
-; CHECK-NOT: icmp ne i32 [[D]], %b
-; CHECK-NEXT: br i1 [[C]], label %return, label %if.then3
-if.end:                                           ; preds = %entry
-  %cmp1 = icmp slt i32 %a, 0
-  %a. = select i1 %cmp1, i32 %a, i32 0
-  %cmp2 = icmp ne i32 %a., %b
-  br i1 %cmp2, label %if.then3, label %return
-
-if.then3:                                         ; preds = %if.end
-  br label %return
-
-return:                                           ; preds = %if.end, %entry, %if.then3
-  ret void
-}
-
 ; When canonicalizing to 'gt/lt', make sure the constant is correct.
 
 define i1 @PR27792(i128 %a) {
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 73f1cd9201641b9a59970e3fb979f1e977db3f0c..8de892fd81b82ee1e215642a741baa6cffaab21b 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -322,30 +322,26 @@ define <2 x i1> @cttz_knownbits_vec(<2 x i32> %arg) {
   ret <2 x i1> %res
 }
 
-define i1 @cttz_knownbits2(i32 %arg) {
+define i32 @cttz_knownbits2(i32 %arg) {
 ; CHECK-LABEL: @cttz_knownbits2(
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[ARG:%.*]], 4
 ; CHECK-NEXT:    [[CNT:%.*]] = call i32 @llvm.cttz.i32(i32 [[OR]], i1 true) #2, !range ![[$CTTZ_RANGE:[0-9]+]]
-; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[CNT]], 2
-; CHECK-NEXT:    ret i1 [[RES]]
+; CHECK-NEXT:    ret i32 [[CNT]]
 ;
   %or = or i32 %arg, 4
   %cnt = call i32 @llvm.cttz.i32(i32 %or, i1 true) nounwind readnone
-  %res = icmp eq i32 %cnt, 2
-  ret i1 %res
+  ret i32 %cnt
 }
 
-define <2 x i1> @cttz_knownbits2_vec(<2 x i32> %arg) {
+define <2 x i32> @cttz_knownbits2_vec(<2 x i32> %arg) {
 ; CHECK-LABEL: @cttz_knownbits2_vec(
 ; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[ARG:%.*]], <i32 4, i32 4>
 ; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[OR]], i1 true)
-; CHECK-NEXT:    [[RES:%.*]] = icmp eq <2 x i32> [[CNT]], <i32 2, i32 2>
-; CHECK-NEXT:    ret <2 x i1> [[RES]]
+; CHECK-NEXT:    ret <2 x i32> [[CNT]]
 ;
   %or = or <2 x i32> %arg, <i32 4, i32 4>
   %cnt = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %or, i1 true) nounwind readnone
-  %res = icmp eq <2 x i32> %cnt, <i32 2, i32 2>
-  ret <2 x i1> %res
+  ret <2 x i32> %cnt
 }
 
 define i1 @cttz_knownbits3(i32 %arg) {
@@ -358,13 +354,9 @@ define i1 @cttz_knownbits3(i32 %arg) {
   ret i1 %res
 }
 
-; TODO: The icmp is unnecessary given the known bits of the input.
 define <2 x i1> @cttz_knownbits3_vec(<2 x i32> %arg) {
 ; CHECK-LABEL: @cttz_knownbits3_vec(
-; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[ARG:%.*]], <i32 4, i32 4>
-; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[OR]], i1 true)
-; CHECK-NEXT:    [[RES:%.*]] = icmp eq <2 x i32> [[CNT]], <i32 3, i32 3>
-; CHECK-NEXT:    ret <2 x i1> [[RES]]
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %or = or <2 x i32> %arg, <i32 4, i32 4>
   %cnt = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %or, i1 true) nounwind readnone
@@ -422,30 +414,26 @@ define <2 x i1> @ctlz_knownbits_vec(<2 x i8> %arg) {
   ret <2 x i1> %res
 }
 
-define i1 @ctlz_knownbits2(i8 %arg) {
+define i8 @ctlz_knownbits2(i8 %arg) {
 ; CHECK-LABEL: @ctlz_knownbits2(
 ; CHECK-NEXT:    [[OR:%.*]] = or i8 [[ARG:%.*]], 32
 ; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctlz.i8(i8 [[OR]], i1 true) #2, !range ![[$CTLZ_RANGE:[0-9]+]]
-; CHECK-NEXT:    [[RES:%.*]] = icmp eq i8 [[CNT]], 2
-; CHECK-NEXT:    ret i1 [[RES]]
+; CHECK-NEXT:    ret i8 [[CNT]]
 ;
   %or = or i8 %arg, 32
   %cnt = call i8 @llvm.ctlz.i8(i8 %or, i1 true) nounwind readnone
-  %res = icmp eq i8 %cnt, 2
-  ret i1 %res
+  ret i8 %cnt
 }
 
-define <2 x i1> @ctlz_knownbits2_vec(<2 x i8> %arg) {
+define <2 x i8> @ctlz_knownbits2_vec(<2 x i8> %arg) {
 ; CHECK-LABEL: @ctlz_knownbits2_vec(
 ; CHECK-NEXT:    [[OR:%.*]] = or <2 x i8> [[ARG:%.*]], <i8 32, i8 32>
 ; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[OR]], i1 true)
-; CHECK-NEXT:    [[RES:%.*]] = icmp eq <2 x i8> [[CNT]], <i8 2, i8 2>
-; CHECK-NEXT:    ret <2 x i1> [[RES]]
+; CHECK-NEXT:    ret <2 x i8> [[CNT]]
 ;
   %or = or <2 x i8> %arg, <i8 32, i8 32>
   %cnt = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %or, i1 true) nounwind readnone
-  %res = icmp eq <2 x i8> %cnt, <i8 2, i8 2>
-  ret <2 x i1> %res
+  ret <2 x i8> %cnt
 }
 
 define i1 @ctlz_knownbits3(i8 %arg) {
@@ -458,13 +446,9 @@ define i1 @ctlz_knownbits3(i8 %arg) {
   ret i1 %res
 }
 
-; TODO: The icmp is unnecessary given the known bits of the input.
 define <2 x i1> @ctlz_knownbits3_vec(<2 x i8> %arg) {
 ; CHECK-LABEL: @ctlz_knownbits3_vec(
-; CHECK-NEXT:    [[OR:%.*]] = or <2 x i8> [[ARG:%.*]], <i8 32, i8 32>
-; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[OR]], i1 true)
-; CHECK-NEXT:    [[RES:%.*]] = icmp eq <2 x i8> [[CNT]], <i8 3, i8 3>
-; CHECK-NEXT:    ret <2 x i1> [[RES]]
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %or = or <2 x i8> %arg, <i8 32, i8 32>
   %cnt = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %or, i1 true) nounwind readnone
diff --git a/test/Transforms/InstCombine/load.ll b/test/Transforms/InstCombine/load.ll
index 028bc4b3fb772a2f30fd03eb01d1e5f4976381c8..5129349b3949f5ab1185337ad5ccb2310ce24ab1 100644
--- a/test/Transforms/InstCombine/load.ll
+++ b/test/Transforms/InstCombine/load.ll
@@ -1,8 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 ; RUN: opt -passes=instcombine -S < %s | FileCheck %s
 
-; This test makes sure that these instructions are properly eliminated.
-
 target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
 @X = constant i32 42		; <i32*> [#uses=2]
@@ -13,99 +12,125 @@ target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128"
 @GLOBAL = internal constant [4 x i32] zeroinitializer
 
 
-; CHECK-LABEL: @test1(
-; CHECK-NOT: load
 define i32 @test1() {
-	%B = load i32, i32* @X		; <i32> [#uses=1]
-	ret i32 %B
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i32 42
+;
+  %B = load i32, i32* @X		; <i32> [#uses=1]
+  ret i32 %B
 }
 
-; CHECK-LABEL: @test2(
-; CHECK-NOT: load
 define float @test2() {
-	%A = getelementptr [2 x { i32, float }], [2 x { i32, float }]* @Y, i64 0, i64 1, i32 1		; <float*> [#uses=1]
-	%B = load float, float* %A		; <float> [#uses=1]
-	ret float %B
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    ret float 0x3FF3B2FEC0000000
+;
+  %A = getelementptr [2 x { i32, float }], [2 x { i32, float }]* @Y, i64 0, i64 1, i32 1		; <float*> [#uses=1]
+  %B = load float, float* %A		; <float> [#uses=1]
+  ret float %B
 }
 
-; CHECK-LABEL: @test3(
-; CHECK-NOT: load
 define i32 @test3() {
-	%A = getelementptr [2 x { i32, float }], [2 x { i32, float }]* @Y, i64 0, i64 0, i32 0		; <i32*> [#uses=1]
-	%B = load i32, i32* %A		; <i32> [#uses=1]
-	ret i32 %B
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    ret i32 12
+;
+  %A = getelementptr [2 x { i32, float }], [2 x { i32, float }]* @Y, i64 0, i64 0, i32 0		; <i32*> [#uses=1]
+  %B = load i32, i32* %A		; <i32> [#uses=1]
+  ret i32 %B
 }
 
-; CHECK-LABEL: @test4(
-; CHECK-NOT: load
 define i32 @test4() {
-	%A = getelementptr [2 x { i32, float }], [2 x { i32, float }]* @Z, i64 0, i64 1, i32 0		; <i32*> [#uses=1]
-	%B = load i32, i32* %A		; <i32> [#uses=1]
-	ret i32 %B
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    ret i32 0
+;
+  %A = getelementptr [2 x { i32, float }], [2 x { i32, float }]* @Z, i64 0, i64 1, i32 0		; <i32*> [#uses=1]
+  %B = load i32, i32* %A		; <i32> [#uses=1]
+  ret i32 %B
 }
 
-; CHECK-LABEL: @test5(
-; CHECK-NOT: load
 define i32 @test5(i1 %C) {
-	%Y = select i1 %C, i32* @X, i32* @X2		; <i32*> [#uses=1]
-	%Z = load i32, i32* %Y		; <i32> [#uses=1]
-	ret i32 %Z
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[Z:%.*]] = select i1 [[C:%.*]], i32 42, i32 47
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %Y = select i1 %C, i32* @X, i32* @X2		; <i32*> [#uses=1]
+  %Z = load i32, i32* %Y		; <i32> [#uses=1]
+  ret i32 %Z
 }
 
-; CHECK-LABEL: @test7(
-; CHECK-NOT: load
 define i32 @test7(i32 %X) {
-	%V = getelementptr i32, i32* null, i32 %X		; <i32*> [#uses=1]
-	%R = load i32, i32* %V		; <i32> [#uses=1]
-	ret i32 %R
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    store i32 undef, i32* null, align 536870912
+; CHECK-NEXT:    ret i32 undef
+;
+  %V = getelementptr i32, i32* null, i32 %X		; <i32*> [#uses=1]
+  %R = load i32, i32* %V		; <i32> [#uses=1]
+  ret i32 %R
 }
 
-; CHECK-LABEL: @test7_no_null_opt(
-; CHECK: %V = getelementptr i32, i32* null
-; CHECK: %R = load i32, i32* %V
 define i32 @test7_no_null_opt(i32 %X) #0 {
-        %V = getelementptr i32, i32* null, i32 %X               ; <i32*> [#uses=1]
-        %R = load i32, i32* %V          ; <i32> [#uses=1]
-        ret i32 %R
+; CHECK-LABEL: @test7_no_null_opt(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[X:%.*]] to i64
+; CHECK-NEXT:    [[V:%.*]] = getelementptr i32, i32* null, i64 [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[V]], align 4
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %V = getelementptr i32, i32* null, i32 %X               ; <i32*> [#uses=1]
+  %R = load i32, i32* %V          ; <i32> [#uses=1]
+  ret i32 %R
 }
 attributes #0 = { "null-pointer-is-valid"="true" }
 
-; CHECK-LABEL: @test8(
-; CHECK-NOT: load
 define i32 @test8(i32* %P) {
-	store i32 1, i32* %P
-	%X = load i32, i32* %P		; <i32> [#uses=1]
-	ret i32 %X
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    store i32 1, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    ret i32 1
+;
+  store i32 1, i32* %P
+  %X = load i32, i32* %P		; <i32> [#uses=1]
+  ret i32 %X
 }
 
-; CHECK-LABEL: @test9(
-; CHECK-NOT: load
 define i32 @test9(i32* %P) {
-	%X = load i32, i32* %P		; <i32> [#uses=1]
-	%Y = load i32, i32* %P		; <i32> [#uses=1]
-	%Z = sub i32 %X, %Y		; <i32> [#uses=1]
-	ret i32 %Z
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    ret i32 0
+;
+  %X = load i32, i32* %P		; <i32> [#uses=1]
+  %Y = load i32, i32* %P		; <i32> [#uses=1]
+  %Z = sub i32 %X, %Y		; <i32> [#uses=1]
+  ret i32 %Z
 }
 
-; CHECK-LABEL: @test10(
-; CHECK-NOT: load
 define i32 @test10(i1 %C.upgrd.1, i32* %P, i32* %Q) {
-	br i1 %C.upgrd.1, label %T, label %F
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    br i1 [[C_UPGRD_1:%.*]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    store i32 1, i32* [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[C:%.*]]
+; CHECK:       F:
+; CHECK-NEXT:    br label [[C]]
+; CHECK:       C:
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    ret i32 0
+;
+  br i1 %C.upgrd.1, label %T, label %F
 T:		; preds = %0
-	store i32 1, i32* %Q
-	store i32 0, i32* %P
-	br label %C
+  store i32 1, i32* %Q
+  store i32 0, i32* %P
+  br label %C
 F:		; preds = %0
-	store i32 0, i32* %P
-	br label %C
+  store i32 0, i32* %P
+  br label %C
 C:		; preds = %F, %T
-	%V = load i32, i32* %P		; <i32> [#uses=1]
-	ret i32 %V
+  %V = load i32, i32* %P		; <i32> [#uses=1]
+  ret i32 %V
 }
 
-; CHECK-LABEL: @test11(
-; CHECK-NOT: load
 define double @test11(double* %p) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[T0:%.*]] = getelementptr double, double* [[P:%.*]], i64 1
+; CHECK-NEXT:    store double 2.000000e+00, double* [[T0]], align 8
+; CHECK-NEXT:    ret double 2.000000e+00
+;
   %t0 = getelementptr double, double* %p, i32 1
   store double 2.0, double* %t0
   %t1 = getelementptr double, double* %p, i32 1
@@ -113,9 +138,10 @@ define double @test11(double* %p) {
   ret double %x
 }
 
-; CHECK-LABEL: @test12(
-; CHECK-NOT: load
 define i32 @test12(i32* %P) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    ret i32 123
+;
   %A = alloca i32
   store i32 123, i32* %A
   ; Cast the result of the load not the source
@@ -124,22 +150,29 @@ define i32 @test12(i32* %P) {
   ret i32 %V
 }
 
-; CHECK-LABEL: @test13(
-; CHECK-NOT: load
 define <16 x i8> @test13(<2 x i64> %x) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
   %tmp = load <16 x i8>, <16 x i8>* bitcast ([4 x i32]* @GLOBAL to <16 x i8>*)
   ret <16 x i8> %tmp
 }
 
-define i8 @test14(i8 %x, i32 %y) {
 ; This test must not have the store of %x forwarded to the load -- there is an
 ; intervening store if %y. However, the intervening store occurs with a different
 ; type and size and to a different pointer value. This is ensuring that none of
 ; those confuse the analysis into thinking that the second store does not alias
 ; the first.
+
+define i8 @test14(i8 %x, i32 %y) {
 ; CHECK-LABEL: @test14(
-; CHECK:         %[[R:.*]] = load i8, i8*
-; CHECK-NEXT:    ret i8 %[[R]]
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A_I8:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    store i8 [[X:%.*]], i8* [[A_I8]], align 4
+; CHECK-NEXT:    store i32 [[Y:%.*]], i32* [[A]], align 4
+; CHECK-NEXT:    [[R:%.*]] = load i8, i8* [[A_I8]], align 4
+; CHECK-NEXT:    ret i8 [[R]]
+;
   %a = alloca i32
   %a.i8 = bitcast i32* %a to i8*
   store i8 %x, i8* %a.i8
@@ -150,11 +183,15 @@ define i8 @test14(i8 %x, i32 %y) {
 
 @test15_global = external global i32
 
-define i8 @test15(i8 %x, i32 %y) {
 ; Same test as @test14 essentially, but using a global instead of an alloca.
+
+define i8 @test15(i8 %x, i32 %y) {
 ; CHECK-LABEL: @test15(
-; CHECK:         %[[R:.*]] = load i8, i8*
-; CHECK-NEXT:    ret i8 %[[R]]
+; CHECK-NEXT:    store i8 [[X:%.*]], i8* bitcast (i32* @test15_global to i8*), align 4
+; CHECK-NEXT:    store i32 [[Y:%.*]], i32* @test15_global, align 4
+; CHECK-NEXT:    [[R:%.*]] = load i8, i8* bitcast (i32* @test15_global to i8*), align 4
+; CHECK-NEXT:    ret i8 [[R]]
+;
   %g.i8 = bitcast i32* @test15_global to i8*
   store i8 %x, i8* %g.i8
   store i32 %y, i32* @test15_global
@@ -162,22 +199,26 @@ define i8 @test15(i8 %x, i32 %y) {
   ret i8 %r
 }
 
-define void @test16(i8* %x, i8* %a, i8* %b, i8* %c) {
 ; Check that we canonicalize loads which are only stored to use integer types
 ; when there is a valid integer type.
-; CHECK-LABEL: @test16(
-; CHECK: %[[L1:.*]] = load i32, i32*
-; CHECK-NOT: load
-; CHECK: store i32 %[[L1]], i32*
-; CHECK: store i32 %[[L1]], i32*
-; CHECK-NOT: store
-; CHECK: %[[L1:.*]] = load i32, i32*
-; CHECK-NOT: load
-; CHECK: store i32 %[[L1]], i32*
-; CHECK: store i32 %[[L1]], i32*
-; CHECK-NOT: store
-; CHECK: ret
 
+define void @test16(i8* %x, i8* %a, i8* %b, i8* %c) {
+; CHECK-LABEL: @test16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C_CAST:%.*]] = bitcast i8* [[C:%.*]] to i32*
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; CHECK-NEXT:    [[X11:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[A:%.*]] to i32*
+; CHECK-NEXT:    store i32 [[X11]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[B:%.*]] to i32*
+; CHECK-NEXT:    store i32 [[X11]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X]] to i32*
+; CHECK-NEXT:    [[X22:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[B]] to i32*
+; CHECK-NEXT:    store i32 [[X22]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    store i32 [[X22]], i32* [[C_CAST]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %x.cast = bitcast i8* %x to float*
   %a.cast = bitcast i8* %a to float*
@@ -196,15 +237,17 @@ entry:
   ret void
 }
 
-define void @test17(i8** %x, i8 %y) {
 ; Check that in cases similar to @test16 we don't try to rewrite a load when
 ; its only use is a store but it is used as the pointer to that store rather
 ; than the value.
-;
-; CHECK-LABEL: @test17(
-; CHECK: %[[L:.*]] = load i8*, i8**
-; CHECK: store i8 %y, i8* %[[L]]
 
+define void @test17(i8** %x, i8 %y) {
+; CHECK-LABEL: @test17(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X_LOAD:%.*]] = load i8*, i8** [[X:%.*]], align 8
+; CHECK-NEXT:    store i8 [[Y:%.*]], i8* [[X_LOAD]], align 1
+; CHECK-NEXT:    ret void
+;
 entry:
   %x.load = load i8*, i8** %x
   store i8 %y, i8* %x.load
@@ -214,13 +257,19 @@ entry:
 
 ; Check that we don't try change the type of the load by inserting a bitcast
 ; generating invalid IR.
-; CHECK-LABEL: @test18(
-; CHECK-NOT: bitcast
-; CHECK: ret
 %swift.error = type opaque
 declare void @useSwiftError(%swift.error** swifterror)
 
 define void @test18(%swift.error** swifterror %err) {
+; CHECK-LABEL: @test18(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWIFTERROR:%.*]] = alloca swifterror %swift.error*, align 8
+; CHECK-NEXT:    store %swift.error* null, %swift.error** [[SWIFTERROR]], align 8
+; CHECK-NEXT:    call void @useSwiftError(%swift.error** nonnull swifterror [[SWIFTERROR]])
+; CHECK-NEXT:    [[ERR_RES:%.*]] = load %swift.error*, %swift.error** [[SWIFTERROR]], align 8
+; CHECK-NEXT:    store %swift.error* [[ERR_RES]], %swift.error** [[ERR:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
 entry:
   %swifterror = alloca swifterror %swift.error*, align 8
   store %swift.error* null, %swift.error** %swifterror, align 8
@@ -231,15 +280,18 @@ entry:
 }
 
 ; Make sure we preseve the type of the store to a swifterror pointer.
-; CHECK-LABEL: @test19(
-; CHECK: [[A:%.*]] = alloca
-; CHECK: call
-; CHECK: [[BC:%.*]] = bitcast i8** [[A]] to
-; CHECK: [[ERRVAL:%.*]] =  load {{.*}}[[BC]]
-; CHECK: store {{.*}}[[ERRVAL]]
-; CHECK: ret
+
 declare void @initi8(i8**)
 define void @test19(%swift.error** swifterror %err) {
+; CHECK-LABEL: @test19(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8*, align 8
+; CHECK-NEXT:    call void @initi8(i8** nonnull [[TMP]])
+; CHECK-NEXT:    [[SWIFTERROR:%.*]] = bitcast i8** [[TMP]] to %swift.error**
+; CHECK-NEXT:    [[ERR_RES:%.*]] = load %swift.error*, %swift.error** [[SWIFTERROR]], align 8
+; CHECK-NEXT:    store %swift.error* [[ERR_RES]], %swift.error** [[ERR:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
 entry:
   %tmp = alloca i8*, align 8
   call void @initi8(i8** %tmp)
diff --git a/test/Transforms/InstCombine/overflow.ll b/test/Transforms/InstCombine/overflow.ll
index 4c0a8a2c0df7d7c49a305a09a8e874a710344154..f5558890d138066f2405826ddee44d09587c1fdf 100644
--- a/test/Transforms/InstCombine/overflow.ll
+++ b/test/Transforms/InstCombine/overflow.ll
@@ -1,17 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 ; <rdar://problem/8558713>
 
 declare void @throwAnExceptionOrWhatever()
 
-; CHECK-LABEL: @test1(
 define i32 @test1(i32 %a, i32 %b) nounwind ssp {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SADD:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[B:%.*]], i32 [[A:%.*]])
+; CHECK-NEXT:    [[SADD_RESULT:%.*]] = extractvalue { i32, i1 } [[SADD]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { i32, i1 } [[SADD]], 1
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #2
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i32 [[SADD_RESULT]]
+;
 entry:
-; CHECK-NOT: sext
   %conv = sext i32 %a to i64
   %conv2 = sext i32 %b to i64
   %add = add nsw i64 %conv2, %conv
   %add.off = add i64 %add, 2147483648
-; CHECK: llvm.sadd.with.overflow.i32
   %0 = icmp ugt i64 %add.off, 4294967295
   br i1 %0, label %if.then, label %if.end
 
@@ -20,26 +30,37 @@ if.then:
   br label %if.end
 
 if.end:
-; CHECK-NOT: trunc
   %conv9 = trunc i64 %add to i32
-; CHECK: ret i32
   ret i32 %conv9
 }
 
-; CHECK-LABEL: @test2(
 ; This form should not be promoted for two reasons: 1) it is unprofitable to
 ; promote it since the add.off instruction has another use, and 2) it is unsafe
 ; because the add-with-off makes the high bits of the original add live.
+
 define i32 @test2(i32 %a, i32 %b, i64* %P) nounwind ssp {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[CONV2]], [[CONV]]
+; CHECK-NEXT:    [[ADD_OFF:%.*]] = add nsw i64 [[ADD]], 2147483648
+; CHECK-NEXT:    store i64 [[ADD_OFF]], i64* [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[ADD_OFF]], 4294967295
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #2
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[CONV9:%.*]] = trunc i64 [[ADD]] to i32
+; CHECK-NEXT:    ret i32 [[CONV9]]
+;
 entry:
   %conv = sext i32 %a to i64
   %conv2 = sext i32 %b to i64
   %add = add nsw i64 %conv2, %conv
   %add.off = add i64 %add, 2147483648
-  
   store i64 %add.off, i64* %P
-  
-; CHECK-NOT: llvm.sadd.with.overflow
   %0 = icmp ugt i64 %add.off, 4294967295
   br i1 %0, label %if.then, label %if.end
 
@@ -49,21 +70,32 @@ if.then:
 
 if.end:
   %conv9 = trunc i64 %add to i32
-; CHECK: ret i32
   ret i32 %conv9
 }
 
-; CHECK: test3
 ; PR8816
 ; This is illegal to transform because the high bits of the original add are
 ; live out.
 define i64 @test3(i32 %a, i32 %b) nounwind ssp {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[CONV2]], [[CONV]]
+; CHECK-NEXT:    [[ADD_OFF:%.*]] = add nsw i64 [[ADD]], 2147483648
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[ADD_OFF]], 4294967295
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #2
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i64 [[ADD]]
+;
 entry:
   %conv = sext i32 %a to i64
   %conv2 = sext i32 %b to i64
   %add = add nsw i64 %conv2, %conv
   %add.off = add i64 %add, 2147483648
-; CHECK-NOT: llvm.sadd.with.overflow
   %0 = icmp ugt i64 %add.off, 4294967295
   br i1 %0, label %if.then, label %if.end
 
@@ -73,12 +105,23 @@ if.then:
 
 if.end:
   ret i64 %add
-; CHECK: ret i64
 }
 
-; CHECK-LABEL: @test4(
 ; Should be able to form an i8 sadd computed in an i32.
+
 define zeroext i8 @test4(i8 signext %a, i8 signext %b) nounwind ssp {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SADD:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[B:%.*]], i8 [[A:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = extractvalue { i8, i1 } [[SADD]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #2
+; CHECK-NEXT:    unreachable
+; CHECK:       if.end:
+; CHECK-NEXT:    [[SADD_RESULT:%.*]] = extractvalue { i8, i1 } [[SADD]], 0
+; CHECK-NEXT:    ret i8 [[SADD_RESULT]]
+;
 entry:
   %conv = sext i8 %a to i32
   %conv2 = sext i8 %b to i32
@@ -86,27 +129,34 @@ entry:
   %add4 = add nsw i32 %add, 128
   %cmp = icmp ugt i32 %add4, 255
   br i1 %cmp, label %if.then, label %if.end
-; CHECK: llvm.sadd.with.overflow.i8
-if.then:                                          ; preds = %entry
+if.then:
   tail call void @throwAnExceptionOrWhatever() nounwind
   unreachable
 
-if.end:                                           ; preds = %entry
+if.end:
   %conv7 = trunc i32 %add to i8
   ret i8 %conv7
-; CHECK: ret i8
 }
 
-; CHECK-LABEL: @test8(
 ; PR11438
 ; This is @test1, but the operands are not sign-extended.  Make sure
 ; we don't transform this case.
+
 define i32 @test8(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD_OFF:%.*]] = add i64 [[ADD]], 2147483648
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[ADD_OFF]], 4294967295
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #2
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[CONV9:%.*]] = trunc i64 [[ADD]] to i32
+; CHECK-NEXT:    ret i32 [[CONV9]]
+;
 entry:
-; CHECK-NOT: llvm.sadd
-; CHECK: add i64 %a, %b
-; CHECK-NOT: llvm.sadd
-; CHECK: ret
   %add = add i64 %a, %b
   %add.off = add i64 %add, 2147483648
   %0 = icmp ugt i64 %add.off, 4294967295
@@ -120,3 +170,4 @@ if.end:
   %conv9 = trunc i64 %add to i32
   ret i32 %conv9
 }
+
diff --git a/test/Transforms/InstCombine/pr39908.ll b/test/Transforms/InstCombine/pr39908.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bd7a82990ad8f988e54840c6177a86edcb722a38
--- /dev/null
+++ b/test/Transforms/InstCombine/pr39908.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "p:32:32"
+
+%S = type { [2 x i32] }
+
+define i1 @test([0 x %S]* %p, i32 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %start.cast = bitcast [0 x %S]* %p to %S*
+  %end = getelementptr inbounds [0 x %S], [0 x %S]* %p, i32 0, i32 %n, i32 0, i32 0
+  %end.cast = bitcast i32* %end to %S*
+  %last = getelementptr inbounds %S, %S* %end.cast, i32 -1
+  %cmp = icmp eq %S* %last, %start.cast
+  ret i1 %cmp
+}
+
+; Same test using 64-bit indices.
+define i1 @test64([0 x %S]* %p, i64 %n) {
+; CHECK-LABEL: @test64(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %start.cast = bitcast [0 x %S]* %p to %S*
+  %end = getelementptr inbounds [0 x %S], [0 x %S]* %p, i64 0, i64 %n, i32 0, i64 0
+  %end.cast = bitcast i32* %end to %S*
+  %last = getelementptr inbounds %S, %S* %end.cast, i64 -1
+  %cmp = icmp eq %S* %last, %start.cast
+  ret i1 %cmp
+}
+
+; Here the offset overflows and is treated modulo 2^32. This is UB.
+define i1 @test64_overflow([0 x %S]* %p, i64 %n) {
+; CHECK-LABEL: @test64_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %start.cast = bitcast [0 x %S]* %p to %S*
+  %end = getelementptr inbounds [0 x %S], [0 x %S]* %p, i64 0, i64 %n, i32 0, i64 8589934592
+  %end.cast = bitcast i32* %end to %S*
+  %last = getelementptr inbounds %S, %S* %end.cast, i64 -1
+  %cmp = icmp eq %S* %last, %start.cast
+  ret i1 %cmp
+}
diff --git a/test/Transforms/InstCombine/rotate.ll b/test/Transforms/InstCombine/rotate.ll
index 0e20c81b3609a1cfa4692e764953f76e17a4ea62..492817e47e22b804f51614be785b57a43b770652 100644
--- a/test/Transforms/InstCombine/rotate.ll
+++ b/test/Transforms/InstCombine/rotate.ll
@@ -1,20 +1,363 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
-; These are UB-free rotate left/right patterns that are narrowed to a smaller bitwidth.
-; See PR34046 and PR16726 for motivating examples:
+; TODO: Canonicalize rotate by constant to funnel shift intrinsics.
+; This should help cost modeling for vectorization, inlining, etc.
+; If a target does not have a rotate instruction, the expansion will
+; be exactly these same 3 basic ops (shl/lshr/or).
+
+define i32 @rotl_i32_constant(i32 %x) {
+; CHECK-LABEL: @rotl_i32_constant(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[X:%.*]], 11
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X]], 21
+; CHECK-NEXT:    [[R:%.*]] = or i32 [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shl = shl i32 %x, 11
+  %shr = lshr i32 %x, 21
+  %r = or i32 %shr, %shl
+  ret i32 %r
+}
+
+define i42 @rotr_i42_constant(i42 %x) {
+; CHECK-LABEL: @rotr_i42_constant(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i42 [[X:%.*]], 31
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i42 [[X]], 11
+; CHECK-NEXT:    [[R:%.*]] = or i42 [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret i42 [[R]]
+;
+  %shl = shl i42 %x, 31
+  %shr = lshr i42 %x, 11
+  %r = or i42 %shr, %shl
+  ret i42 %r
+}
+
+define i8 @rotr_i8_constant_commute(i8 %x) {
+; CHECK-LABEL: @rotr_i8_constant_commute(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[X:%.*]], 5
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X]], 3
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %shl = shl i8 %x, 5
+  %shr = lshr i8 %x, 3
+  %r = or i8 %shl, %shr
+  ret i8 %r
+}
+
+define i88 @rotl_i88_constant_commute(i88 %x) {
+; CHECK-LABEL: @rotl_i88_constant_commute(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i88 [[X:%.*]], 44
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i88 [[X]], 44
+; CHECK-NEXT:    [[R:%.*]] = or i88 [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret i88 [[R]]
+;
+  %shl = shl i88 %x, 44
+  %shr = lshr i88 %x, 44
+  %r = or i88 %shl, %shr
+  ret i88 %r
+}
+
+; Vector types are allowed.
+
+define <2 x i16> @rotl_v2i16_constant_splat(<2 x i16> %x) {
+; CHECK-LABEL: @rotl_v2i16_constant_splat(
+; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i16> [[X:%.*]], <i16 1, i16 1>
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i16> [[X]], <i16 15, i16 15>
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i16> [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret <2 x i16> [[R]]
+;
+  %shl = shl <2 x i16> %x, <i16 1, i16 1>
+  %shr = lshr <2 x i16> %x, <i16 15, i16 15>
+  %r = or <2 x i16> %shl, %shr
+  ret <2 x i16> %r
+}
+
+; Non-power-of-2 vector types are allowed.
+
+define <2 x i17> @rotr_v2i17_constant_splat(<2 x i17> %x) {
+; CHECK-LABEL: @rotr_v2i17_constant_splat(
+; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i17> [[X:%.*]], <i17 12, i17 12>
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i17> [[X]], <i17 5, i17 5>
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i17> [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret <2 x i17> [[R]]
+;
+  %shl = shl <2 x i17> %x, <i17 12, i17 12>
+  %shr = lshr <2 x i17> %x, <i17 5, i17 5>
+  %r = or <2 x i17> %shr, %shl
+  ret <2 x i17> %r
+}
+
+; Allow arbitrary shift constants.
+
+define <2 x i32> @rotr_v2i32_constant_nonsplat(<2 x i32> %x) {
+; CHECK-LABEL: @rotr_v2i32_constant_nonsplat(
+; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i32> [[X:%.*]], <i32 17, i32 19>
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> [[X]], <i32 15, i32 13>
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i32> [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shl = shl <2 x i32> %x, <i32 17, i32 19>
+  %shr = lshr <2 x i32> %x, <i32 15, i32 13>
+  %r = or <2 x i32> %shl, %shr
+  ret <2 x i32> %r
+}
+
+define <2 x i36> @rotl_v2i16_constant_nonsplat(<2 x i36> %x) {
+; CHECK-LABEL: @rotl_v2i16_constant_nonsplat(
+; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i36> [[X:%.*]], <i36 21, i36 11>
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i36> [[X]], <i36 15, i36 25>
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i36> [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret <2 x i36> [[R]]
+;
+  %shl = shl <2 x i36> %x, <i36 21, i36 11>
+  %shr = lshr <2 x i36> %x, <i36 15, i36 25>
+  %r = or <2 x i36> %shl, %shr
+  ret <2 x i36> %r
+}
+
+; The most basic rotate by variable - no guards for UB due to oversized shifts.
+; This cannot be canonicalized to funnel shift target-independently. The safe
+; expansion includes masking for the shift amount that is not included here,
+; so it could be more expensive.
+
+define i32 @rotl_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: @rotl_i32(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[Y:%.*]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X]], [[SUB]]
+; CHECK-NEXT:    [[R:%.*]] = or i32 [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sub = sub i32 32, %y
+  %shl = shl i32 %x, %y
+  %shr = lshr i32 %x, %sub
+  %r = or i32 %shr, %shl
+  ret i32 %r
+}
+
+; Non-power-of-2 types should follow the same reasoning. Left/right is determined by subtract.
+
+define i37 @rotr_i37(i37 %x, i37 %y) {
+; CHECK-LABEL: @rotr_i37(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i37 37, [[Y:%.*]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i37 [[X:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i37 [[X]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = or i37 [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret i37 [[R]]
+;
+  %sub = sub i37 37, %y
+  %shl = shl i37 %x, %sub
+  %shr = lshr i37 %x, %y
+  %r = or i37 %shr, %shl
+  ret i37 %r
+}
+
+; Commute 'or' operands.
+
+define i8 @rotr_i8_commute(i8 %x, i8 %y) {
+; CHECK-LABEL: @rotr_i8_commute(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 8, [[Y:%.*]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[X:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sub = sub i8 8, %y
+  %shl = shl i8 %x, %sub
+  %shr = lshr i8 %x, %y
+  %r = or i8 %shl, %shr
+  ret i8 %r
+}
+
+; Vector types should follow the same rules.
+
+define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @rotl_v4i32(
+; CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, [[Y:%.*]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <4 x i32> [[X]], [[SUB]]
+; CHECK-NEXT:    [[R:%.*]] = or <4 x i32> [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sub = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
+  %shl = shl <4 x i32> %x, %y
+  %shr = lshr <4 x i32> %x, %sub
+  %r = or <4 x i32> %shl, %shr
+  ret <4 x i32> %r
+}
+
+; Non-power-of-2 vector types should follow the same rules.
+
+define <3 x i42> @rotr_v3i42(<3 x i42> %x, <3 x i42> %y) {
+; CHECK-LABEL: @rotr_v3i42(
+; CHECK-NEXT:    [[SUB:%.*]] = sub <3 x i42> <i42 42, i42 42, i42 42>, [[Y:%.*]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl <3 x i42> [[X:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <3 x i42> [[X]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = or <3 x i42> [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret <3 x i42> [[R]]
+;
+  %sub = sub <3 x i42> <i42 42, i42 42, i42 42>, %y
+  %shl = shl <3 x i42> %x, %sub
+  %shr = lshr <3 x i42> %x, %y
+  %r = or <3 x i42> %shr, %shl
+  ret <3 x i42> %r
+}
+
+; TODO:
+; This is the canonical pattern for a UB-safe rotate-by-variable with power-of-2-size scalar type.
+; The backend expansion of funnel shift for targets that don't have a rotate instruction should
+; match the original IR, so it is always good to canonicalize to the intrinsics for this pattern.
+
+define i32 @rotl_safe_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: @rotl_safe_i32(
+; CHECK-NEXT:    [[NEGY:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[YMASK:%.*]] = and i32 [[Y]], 31
+; CHECK-NEXT:    [[NEGYMASK:%.*]] = and i32 [[NEGY]], 31
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[X:%.*]], [[YMASK]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X]], [[NEGYMASK]]
+; CHECK-NEXT:    [[R:%.*]] = or i32 [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %negy = sub i32 0, %y
+  %ymask = and i32 %y, 31
+  %negymask = and i32 %negy, 31
+  %shl = shl i32 %x, %ymask
+  %shr = lshr i32 %x, %negymask
+  %r = or i32 %shr, %shl
+  ret i32 %r
+}
+
+; TODO:
+; Extra uses don't change anything.
+
+define i16 @rotl_safe_i16_commute_extra_use(i16 %x, i16 %y, i16* %p) {
+; CHECK-LABEL: @rotl_safe_i16_commute_extra_use(
+; CHECK-NEXT:    [[NEGY:%.*]] = sub i16 0, [[Y:%.*]]
+; CHECK-NEXT:    [[YMASK:%.*]] = and i16 [[Y]], 15
+; CHECK-NEXT:    [[NEGYMASK:%.*]] = and i16 [[NEGY]], 15
+; CHECK-NEXT:    store i16 [[NEGYMASK]], i16* [[P:%.*]], align 2
+; CHECK-NEXT:    [[SHL:%.*]] = shl i16 [[X:%.*]], [[YMASK]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i16 [[X]], [[NEGYMASK]]
+; CHECK-NEXT:    [[R:%.*]] = or i16 [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %negy = sub i16 0, %y
+  %ymask = and i16 %y, 15
+  %negymask = and i16 %negy, 15
+  store i16 %negymask, i16* %p
+  %shl = shl i16 %x, %ymask
+  %shr = lshr i16 %x, %negymask
+  %r = or i16 %shl, %shr
+  ret i16 %r
+}
+
+; TODO:
+; Left/right is determined by the negation.
+
+define i64 @rotr_safe_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: @rotr_safe_i64(
+; CHECK-NEXT:    [[NEGY:%.*]] = sub i64 0, [[Y:%.*]]
+; CHECK-NEXT:    [[YMASK:%.*]] = and i64 [[Y]], 63
+; CHECK-NEXT:    [[NEGYMASK:%.*]] = and i64 [[NEGY]], 63
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[X:%.*]], [[NEGYMASK]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i64 [[X]], [[YMASK]]
+; CHECK-NEXT:    [[R:%.*]] = or i64 [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret i64 [[R]]
+;
+  %negy = sub i64 0, %y
+  %ymask = and i64 %y, 63
+  %negymask = and i64 %negy, 63
+  %shl = shl i64 %x, %negymask
+  %shr = lshr i64 %x, %ymask
+  %r = or i64 %shr, %shl
+  ret i64 %r
+}
+
+; TODO:
+; Extra uses don't change anything.
+
+define i8 @rotr_safe_i8_commute_extra_use(i8 %x, i8 %y, i8* %p) {
+; CHECK-LABEL: @rotr_safe_i8_commute_extra_use(
+; CHECK-NEXT:    [[NEGY:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    [[YMASK:%.*]] = and i8 [[Y]], 7
+; CHECK-NEXT:    [[NEGYMASK:%.*]] = and i8 [[NEGY]], 7
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[X:%.*]], [[NEGYMASK]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X]], [[YMASK]]
+; CHECK-NEXT:    store i8 [[SHR]], i8* [[P:%.*]], align 1
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[SHL]], [[SHR]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %negy = sub i8 0, %y
+  %ymask = and i8 %y, 7
+  %negymask = and i8 %negy, 7
+  %shl = shl i8 %x, %negymask
+  %shr = lshr i8 %x, %ymask
+  store i8 %shr, i8* %p
+  %r = or i8 %shl, %shr
+  ret i8 %r
+}
+
+; TODO:
+; Vectors follow the same rules.
+
+define <2 x i32> @rotl_safe_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @rotl_safe_v2i32(
+; CHECK-NEXT:    [[NEGY:%.*]] = sub <2 x i32> zeroinitializer, [[Y:%.*]]
+; CHECK-NEXT:    [[YMASK:%.*]] = and <2 x i32> [[Y]], <i32 31, i32 31>
+; CHECK-NEXT:    [[NEGYMASK:%.*]] = and <2 x i32> [[NEGY]], <i32 31, i32 31>
+; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i32> [[X:%.*]], [[YMASK]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> [[X]], [[NEGYMASK]]
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i32> [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %negy = sub <2 x i32> zeroinitializer, %y
+  %ymask = and <2 x i32> %y, <i32 31, i32 31>
+  %negymask = and <2 x i32> %negy, <i32 31, i32 31>
+  %shl = shl <2 x i32> %x, %ymask
+  %shr = lshr <2 x i32> %x, %negymask
+  %r = or <2 x i32> %shr, %shl
+  ret <2 x i32> %r
+}
+
+; TODO:
+; Vectors follow the same rules.
+
+define <3 x i16> @rotr_safe_v3i16(<3 x i16> %x, <3 x i16> %y) {
+; CHECK-LABEL: @rotr_safe_v3i16(
+; CHECK-NEXT:    [[NEGY:%.*]] = sub <3 x i16> zeroinitializer, [[Y:%.*]]
+; CHECK-NEXT:    [[YMASK:%.*]] = and <3 x i16> [[Y]], <i16 15, i16 15, i16 15>
+; CHECK-NEXT:    [[NEGYMASK:%.*]] = and <3 x i16> [[NEGY]], <i16 15, i16 15, i16 15>
+; CHECK-NEXT:    [[SHL:%.*]] = shl <3 x i16> [[X:%.*]], [[NEGYMASK]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <3 x i16> [[X]], [[YMASK]]
+; CHECK-NEXT:    [[R:%.*]] = or <3 x i16> [[SHR]], [[SHL]]
+; CHECK-NEXT:    ret <3 x i16> [[R]]
+;
+  %negy = sub <3 x i16> zeroinitializer, %y
+  %ymask = and <3 x i16> %y, <i16 15, i16 15, i16 15>
+  %negymask = and <3 x i16> %negy, <i16 15, i16 15, i16 15>
+  %shl = shl <3 x i16> %x, %negymask
+  %shr = lshr <3 x i16> %x, %ymask
+  %r = or <3 x i16> %shr, %shl
+  ret <3 x i16> %r
+}
+
+; These are optionally UB-free rotate left/right patterns that are narrowed to a smaller bitwidth.
+; See PR34046, PR16726, and PR39624 for motivating examples:
 ; https://bugs.llvm.org/show_bug.cgi?id=34046
 ; https://bugs.llvm.org/show_bug.cgi?id=16726
+; https://bugs.llvm.org/show_bug.cgi?id=39624
 
 define i16 @rotate_left_16bit(i16 %v, i32 %shift) {
 ; CHECK-LABEL: @rotate_left_16bit(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %shift to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHIFT:%.*]] to i16
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub i16 0, [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i16 [[TMP3]], 15
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i16 %v, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i16 %v, [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i16 [[V:%.*]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i16 [[V]], [[TMP2]]
 ; CHECK-NEXT:    [[CONV2:%.*]] = or i16 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    ret i16 [[CONV2]]
 ;
@@ -32,12 +375,12 @@ define i16 @rotate_left_16bit(i16 %v, i32 %shift) {
 
 define <2 x i16> @rotate_left_commute_16bit_vec(<2 x i16> %v, <2 x i32> %shift) {
 ; CHECK-LABEL: @rotate_left_commute_16bit_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> %shift to <2 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[SHIFT:%.*]] to <2 x i16>
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i16> [[TMP1]], <i16 15, i16 15>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i16> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = and <2 x i16> [[TMP3]], <i16 15, i16 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i16> %v, [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr <2 x i16> %v, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i16> [[V:%.*]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <2 x i16> [[V]], [[TMP4]]
 ; CHECK-NEXT:    [[CONV2:%.*]] = or <2 x i16> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    ret <2 x i16> [[CONV2]]
 ;
@@ -55,11 +398,11 @@ define <2 x i16> @rotate_left_commute_16bit_vec(<2 x i16> %v, <2 x i32> %shift)
 
 define i8 @rotate_right_8bit(i8 %v, i3 %shift) {
 ; CHECK-LABEL: @rotate_right_8bit(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 %shift to i8
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i3 0, %shift
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[SHIFT:%.*]] to i8
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i3 0, [[SHIFT]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i8 %v, [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 %v, [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i8 [[V:%.*]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[V]], [[TMP1]]
 ; CHECK-NEXT:    [[CONV2:%.*]] = or i8 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret i8 [[CONV2]]
 ;
@@ -78,11 +421,11 @@ define i8 @rotate_right_8bit(i8 %v, i3 %shift) {
 
 define i8 @rotate_right_commute_8bit(i32 %v, i32 %shift) {
 ; CHECK-LABEL: @rotate_right_commute_8bit(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %shift to i8
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHIFT:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i8 0, [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP3]], 7
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 %v to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[V:%.*]] to i8
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr i8 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl i8 [[TMP5]], [[TMP4]]
 ; CHECK-NEXT:    [[CONV2:%.*]] = or i8 [[TMP6]], [[TMP7]]
@@ -103,12 +446,12 @@ define i8 @rotate_right_commute_8bit(i32 %v, i32 %shift) {
 
 define i8 @rotate8_not_safe(i8 %v, i32 %shamt) {
 ; CHECK-LABEL: @rotate8_not_safe(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %shamt to i8
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 0, [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP2]], 7
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 %v, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i8 %v, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[V:%.*]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i8 [[V]], [[TMP3]]
 ; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
@@ -121,110 +464,340 @@ define i8 @rotate8_not_safe(i8 %v, i32 %shamt) {
   ret i8 %ret
 }
 
-; The next two tests mask sure we narrower (x << (x & 15)) | (x >> (-x & 15))
-; when types have been promoted.
-; FIXME: We should be able to narrow this.
-
-define i16 @rotate16_neg_mask(i16 %v, i16 %shamt) {
-; CHECK-LABEL: @rotate16_neg_mask(
-; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[V:%.*]] to i32
-; CHECK-NEXT:    [[RSHAMT:%.*]] = and i16 [[SHAMT:%.*]], 15
-; CHECK-NEXT:    [[RSHAMTCONV:%.*]] = zext i16 [[RSHAMT]] to i32
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[CONV]], [[RSHAMTCONV]]
-; CHECK-NEXT:    [[NEG:%.*]] = sub i16 0, [[SHAMT]]
-; CHECK-NEXT:    [[LSHAMT:%.*]] = and i16 [[NEG]], 15
-; CHECK-NEXT:    [[LSHAMTCONV:%.*]] = zext i16 [[LSHAMT]] to i32
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[CONV]], [[LSHAMTCONV]]
+; A non-power-of-2 destination type can't be masked as above.
+
+define i9 @rotate9_not_safe(i9 %v, i32 %shamt) {
+; CHECK-LABEL: @rotate9_not_safe(
+; CHECK-NEXT:    [[CONV:%.*]] = zext i9 [[V:%.*]] to i32
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 9, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[CONV]], [[SUB]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[CONV]], [[SHAMT]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[OR]] to i16
+; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[OR]] to i9
+; CHECK-NEXT:    ret i9 [[RET]]
+;
+  %conv = zext i9 %v to i32
+  %sub = sub i32 9, %shamt
+  %shr = lshr i32 %conv, %sub
+  %shl = shl i32 %conv, %shamt
+  %or = or i32 %shr, %shl
+  %ret = trunc i32 %or to i9
+  ret i9 %ret
+}
+
+; We should narrow (v << (s & 15)) | (v >> (-s & 15))
+; when both v and s have been promoted.
+
+define i16 @rotateleft_16_neg_mask(i16 %v, i16 %shamt) {
+; CHECK-LABEL: @rotateleft_16_neg_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i16 0, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[SHAMT]], 15
+; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i16 [[V:%.*]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i16 [[V]], [[TMP2]]
+; CHECK-NEXT:    [[RET:%.*]] = or i16 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
-  %conv = zext i16 %v to i32
-  %rshamt = and i16 %shamt, 15
+  %neg = sub i16 0, %shamt
+  %lshamt = and i16 %shamt, 15
+  %lshamtconv = zext i16 %lshamt to i32
+  %rshamt = and i16 %neg, 15
   %rshamtconv = zext i16 %rshamt to i32
+  %conv = zext i16 %v to i32
+  %shl = shl i32 %conv, %lshamtconv
   %shr = lshr i32 %conv, %rshamtconv
+  %or = or i32 %shr, %shl
+  %ret = trunc i32 %or to i16
+  ret i16 %ret
+}
+
+define i16 @rotateleft_16_neg_mask_commute(i16 %v, i16 %shamt) {
+; CHECK-LABEL: @rotateleft_16_neg_mask_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i16 0, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[SHAMT]], 15
+; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i16 [[V:%.*]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i16 [[V]], [[TMP3]]
+; CHECK-NEXT:    [[RET:%.*]] = or i16 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret i16 [[RET]]
+;
   %neg = sub i16 0, %shamt
-  %lshamt = and i16 %neg, 15
+  %lshamt = and i16 %shamt, 15
   %lshamtconv = zext i16 %lshamt to i32
+  %rshamt = and i16 %neg, 15
+  %rshamtconv = zext i16 %rshamt to i32
+  %conv = zext i16 %v to i32
   %shl = shl i32 %conv, %lshamtconv
-  %or = or i32 %shr, %shl
+  %shr = lshr i32 %conv, %rshamtconv
+  %or = or i32 %shl, %shr
   %ret = trunc i32 %or to i16
   ret i16 %ret
 }
 
-define i8 @rotate8_neg_mask(i8 %v, i8 %shamt) {
-; CHECK-LABEL: @rotate8_neg_mask(
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[V:%.*]] to i32
-; CHECK-NEXT:    [[RSHAMT:%.*]] = and i8 [[SHAMT:%.*]], 7
-; CHECK-NEXT:    [[RSHAMTCONV:%.*]] = zext i8 [[RSHAMT]] to i32
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[CONV]], [[RSHAMTCONV]]
-; CHECK-NEXT:    [[NEG:%.*]] = sub i8 0, [[SHAMT]]
-; CHECK-NEXT:    [[LSHAMT:%.*]] = and i8 [[NEG]], 7
-; CHECK-NEXT:    [[LSHAMTCONV:%.*]] = zext i8 [[LSHAMT]] to i32
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[CONV]], [[LSHAMTCONV]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[OR]] to i8
+define i8 @rotateright_8_neg_mask(i8 %v, i8 %shamt) {
+; CHECK-LABEL: @rotateright_8_neg_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 0, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[SHAMT]], 7
+; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i8 [[V:%.*]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i8 [[V]], [[TMP3]]
+; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
-  %conv = zext i8 %v to i32
+  %neg = sub i8 0, %shamt
   %rshamt = and i8 %shamt, 7
   %rshamtconv = zext i8 %rshamt to i32
+  %lshamt = and i8 %neg, 7
+  %lshamtconv = zext i8 %lshamt to i32
+  %conv = zext i8 %v to i32
+  %shl = shl i32 %conv, %lshamtconv
   %shr = lshr i32 %conv, %rshamtconv
+  %or = or i32 %shr, %shl
+  %ret = trunc i32 %or to i8
+  ret i8 %ret
+}
+
+define i8 @rotateright_8_neg_mask_commute(i8 %v, i8 %shamt) {
+; CHECK-LABEL: @rotateright_8_neg_mask_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 0, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[SHAMT]], 7
+; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i8 [[V:%.*]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[V]], [[TMP2]]
+; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret i8 [[RET]]
+;
   %neg = sub i8 0, %shamt
+  %rshamt = and i8 %shamt, 7
+  %rshamtconv = zext i8 %rshamt to i32
   %lshamt = and i8 %neg, 7
   %lshamtconv = zext i8 %lshamt to i32
+  %conv = zext i8 %v to i32
   %shl = shl i32 %conv, %lshamtconv
-  %or = or i32 %shr, %shl
+  %shr = lshr i32 %conv, %rshamtconv
+  %or = or i32 %shl, %shr
   %ret = trunc i32 %or to i8
   ret i8 %ret
 }
 
-; The next two types have a shift amount that is already i32 so we would still
-; need a truncate for it going into the rotate pattern.
-; FIXME: We can narrow this, but we would still need a trunc on the shift amt.
+; The shift amount may already be in the wide type,
+; so we need to truncate it going into the rotate pattern.
 
-define i16 @rotate16_neg_mask_wide_amount(i16 %v, i32 %shamt) {
-; CHECK-LABEL: @rotate16_neg_mask_wide_amount(
-; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[V:%.*]] to i32
-; CHECK-NEXT:    [[RSHAMT:%.*]] = and i32 [[SHAMT:%.*]], 15
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[CONV]], [[RSHAMT]]
-; CHECK-NEXT:    [[NEG:%.*]] = sub i32 0, [[SHAMT]]
-; CHECK-NEXT:    [[LSHAMT:%.*]] = and i32 [[NEG]], 15
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[CONV]], [[LSHAMT]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[OR]] to i16
+define i16 @rotateright_16_neg_mask_wide_amount(i16 %v, i32 %shamt) {
+; CHECK-LABEL: @rotateright_16_neg_mask_wide_amount(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i16 0, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT:    [[TMP4:%.*]] = and i16 [[TMP2]], 15
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i16 [[V:%.*]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i16 [[V]], [[TMP4]]
+; CHECK-NEXT:    [[RET:%.*]] = or i16 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
-  %conv = zext i16 %v to i32
+  %neg = sub i32 0, %shamt
   %rshamt = and i32 %shamt, 15
+  %lshamt = and i32 %neg, 15
+  %conv = zext i16 %v to i32
+  %shl = shl i32 %conv, %lshamt
   %shr = lshr i32 %conv, %rshamt
+  %or = or i32 %shr, %shl
+  %ret = trunc i32 %or to i16
+  ret i16 %ret
+}
+
+define i16 @rotateright_16_neg_mask_wide_amount_commute(i16 %v, i32 %shamt) {
+; CHECK-LABEL: @rotateright_16_neg_mask_wide_amount_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i16 0, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT:    [[TMP4:%.*]] = and i16 [[TMP2]], 15
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i16 [[V:%.*]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i16 [[V]], [[TMP3]]
+; CHECK-NEXT:    [[RET:%.*]] = or i16 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i16 [[RET]]
+;
   %neg = sub i32 0, %shamt
+  %rshamt = and i32 %shamt, 15
   %lshamt = and i32 %neg, 15
+  %conv = zext i16 %v to i32
   %shl = shl i32 %conv, %lshamt
-  %or = or i32 %shr, %shl
+  %shr = lshr i32 %conv, %rshamt
+  %or = or i32 %shl, %shr
   %ret = trunc i32 %or to i16
   ret i16 %ret
 }
 
-define i8 @rotate8_neg_mask_wide_amount(i8 %v, i32 %shamt) {
-; CHECK-LABEL: @rotate8_neg_mask_wide_amount(
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[V:%.*]] to i32
-; CHECK-NEXT:    [[RSHAMT:%.*]] = and i32 [[SHAMT:%.*]], 7
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[CONV]], [[RSHAMT]]
-; CHECK-NEXT:    [[NEG:%.*]] = sub i32 0, [[SHAMT]]
-; CHECK-NEXT:    [[LSHAMT:%.*]] = and i32 [[NEG]], 7
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[CONV]], [[LSHAMT]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
-; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[OR]] to i8
+define i8 @rotateleft_8_neg_mask_wide_amount(i8 %v, i32 %shamt) {
+; CHECK-LABEL: @rotateleft_8_neg_mask_wide_amount(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i8
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 0, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
+; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP2]], 7
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[V:%.*]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i8 [[V]], [[TMP3]]
+; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
+  %neg = sub i32 0, %shamt
+  %lshamt = and i32 %shamt, 7
+  %rshamt = and i32 %neg, 7
   %conv = zext i8 %v to i32
-  %rshamt = and i32 %shamt, 7
+  %shl = shl i32 %conv, %lshamt
   %shr = lshr i32 %conv, %rshamt
+  %or = or i32 %shr, %shl
+  %ret = trunc i32 %or to i8
+  ret i8 %ret
+}
+
+define i8 @rotateleft_8_neg_mask_wide_amount_commute(i8 %v, i32 %shamt) {
+; CHECK-LABEL: @rotateleft_8_neg_mask_wide_amount_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i8
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 0, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
+; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP2]], 7
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i8 [[V:%.*]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i8 [[V]], [[TMP4]]
+; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i8 [[RET]]
+;
   %neg = sub i32 0, %shamt
-  %lshamt = and i32 %neg, 7
+  %lshamt = and i32 %shamt, 7
+  %rshamt = and i32 %neg, 7
+  %conv = zext i8 %v to i32
   %shl = shl i32 %conv, %lshamt
-  %or = or i32 %shr, %shl
+  %shr = lshr i32 %conv, %rshamt
+  %or = or i32 %shl, %shr
   %ret = trunc i32 %or to i8
   ret i8 %ret
 }
+
+; Non-power-of-2 types. This could be transformed, but it's not a typical rotate pattern.
+
+define i9 @rotateleft_9_neg_mask_wide_amount_commute(i9 %v, i33 %shamt) {
+; CHECK-LABEL: @rotateleft_9_neg_mask_wide_amount_commute(
+; CHECK-NEXT:    [[NEG:%.*]] = sub i33 0, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[LSHAMT:%.*]] = and i33 [[SHAMT]], 8
+; CHECK-NEXT:    [[RSHAMT:%.*]] = and i33 [[NEG]], 8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i9 [[V:%.*]] to i33
+; CHECK-NEXT:    [[SHL:%.*]] = shl i33 [[CONV]], [[LSHAMT]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i33 [[CONV]], [[RSHAMT]]
+; CHECK-NEXT:    [[OR:%.*]] = or i33 [[SHL]], [[SHR]]
+; CHECK-NEXT:    [[RET:%.*]] = trunc i33 [[OR]] to i9
+; CHECK-NEXT:    ret i9 [[RET]]
+;
+  %neg = sub i33 0, %shamt
+  %lshamt = and i33 %shamt, 8
+  %rshamt = and i33 %neg, 8
+  %conv = zext i9 %v to i33
+  %shl = shl i33 %conv, %lshamt
+  %shr = lshr i33 %conv, %rshamt
+  %or = or i33 %shl, %shr
+  %ret = trunc i33 %or to i9
+  ret i9 %ret
+}
+
+; Convert select pattern to masked shift that ends in 'or'.
+
+define i32 @rotr_select(i32 %x, i32 %shamt) {
+; CHECK-LABEL: @rotr_select(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 0, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[SHAMT]], 31
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP1]], 31
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[X:%.*]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i32 [[X]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = or i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %cmp = icmp eq i32 %shamt, 0
+  %sub = sub i32 32, %shamt
+  %shr = lshr i32 %x, %shamt
+  %shl = shl i32 %x, %sub
+  %or = or i32 %shr, %shl
+  %r = select i1 %cmp, i32 %x, i32 %or
+  ret i32 %r
+}
+
+; Convert select pattern to masked shift that ends in 'or'.
+
+define i8 @rotr_select_commute(i8 %x, i8 %shamt) {
+; CHECK-LABEL: @rotr_select_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 0, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[SHAMT]], 7
+; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i8 [[X:%.*]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[X]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %cmp = icmp eq i8 %shamt, 0
+  %sub = sub i8 8, %shamt
+  %shr = lshr i8 %x, %shamt
+  %shl = shl i8 %x, %sub
+  %or = or i8 %shl, %shr
+  %r = select i1 %cmp, i8 %x, i8 %or
+  ret i8 %r
+}
+
+; Convert select pattern to masked shift that ends in 'or'.
+
+define i16 @rotl_select(i16 %x, i16 %shamt) {
+; CHECK-LABEL: @rotl_select(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i16 0, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[SHAMT]], 15
+; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i16 [[X:%.*]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i16 [[X]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = or i16 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %cmp = icmp eq i16 %shamt, 0
+  %sub = sub i16 16, %shamt
+  %shr = lshr i16 %x, %sub
+  %shl = shl i16 %x, %shamt
+  %or = or i16 %shr, %shl
+  %r = select i1 %cmp, i16 %x, i16 %or
+  ret i16 %r
+}
+
+; Convert select pattern to masked shift that ends in 'or'.
+
+define <2 x i64> @rotl_select_commute(<2 x i64> %x, <2 x i64> %shamt) {
+; CHECK-LABEL: @rotl_select_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <2 x i64> zeroinitializer, [[SHAMT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i64> [[SHAMT]], <i64 63, i64 63>
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP1]], <i64 63, i64 63>
+; CHECK-NEXT:    [[TMP4:%.*]] = shl <2 x i64> [[X:%.*]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <2 x i64> [[X]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %cmp = icmp eq <2 x i64> %shamt, zeroinitializer
+  %sub = sub <2 x i64> <i64 64, i64 64>, %shamt
+  %shr = lshr <2 x i64> %x, %sub
+  %shl = shl <2 x i64> %x, %shamt
+  %or = or <2 x i64> %shl, %shr
+  %r = select <2 x i1> %cmp, <2 x i64> %x, <2 x i64> %or
+  ret <2 x i64> %r
+}
+
+; Negative test - the transform is only valid with power-of-2 types.
+
+define i24 @rotl_select_weird_type(i24 %x, i24 %shamt) {
+; CHECK-LABEL: @rotl_select_weird_type(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i24 [[SHAMT:%.*]], 0
+; CHECK-NEXT:    [[SUB:%.*]] = sub i24 24, [[SHAMT]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i24 [[X:%.*]], [[SUB]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i24 [[X]], [[SHAMT]]
+; CHECK-NEXT:    [[OR:%.*]] = or i24 [[SHL]], [[SHR]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[CMP]], i24 [[X]], i24 [[OR]]
+; CHECK-NEXT:    ret i24 [[R]]
+;
+  %cmp = icmp eq i24 %shamt, 0
+  %sub = sub i24 24, %shamt
+  %shr = lshr i24 %x, %sub
+  %shl = shl i24 %x, %shamt
+  %or = or i24 %shl, %shr
+  %r = select i1 %cmp, i24 %x, i24 %or
+  ret i24 %r
+}
+
diff --git a/test/Transforms/InstCombine/saturating-add-sub.ll b/test/Transforms/InstCombine/saturating-add-sub.ll
new file mode 100644
index 0000000000000000000000000000000000000000..51f2670ec1229d130966075dd117415e98367910
--- /dev/null
+++ b/test/Transforms/InstCombine/saturating-add-sub.ll
@@ -0,0 +1,638 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+;
+; Saturating addition.
+;
+
+declare i8 @llvm.uadd.sat.i8(i8, i8)
+declare i8 @llvm.sadd.sat.i8(i8, i8)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+; Constant uadd argument is canonicalized to the right.
+define i8 @test_scalar_uadd_canonical(i8 %a) {
+; CHECK-LABEL: @test_scalar_uadd_canonical(
+; CHECK-NEXT:    [[X:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[A:%.*]], i8 10)
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %x = call i8 @llvm.uadd.sat.i8(i8 10, i8 %a)
+  ret i8 %x
+}
+
+define <2 x i8> @test_vector_uadd_canonical(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_uadd_canonical(
+; CHECK-NEXT:    [[X:%.*]] = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 10, i8 20>)
+; CHECK-NEXT:    ret <2 x i8> [[X]]
+;
+  %x = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> <i8 10, i8 20>, <2 x i8> %a)
+  ret <2 x i8> %x
+}
+
+; Constant sadd argument is canonicalized to the right.
+define i8 @test_scalar_sadd_canonical(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_canonical(
+; CHECK-NEXT:    [[X:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 -10)
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %x = call i8 @llvm.sadd.sat.i8(i8 -10, i8 %a)
+  ret i8 %x
+}
+
+define <2 x i8> @test_vector_sadd_canonical(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_sadd_canonical(
+; CHECK-NEXT:    [[X:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 10, i8 -20>)
+; CHECK-NEXT:    ret <2 x i8> [[X]]
+;
+  %x = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> <i8 10, i8 -20>, <2 x i8> %a)
+  ret <2 x i8> %x
+}
+
+; Can combine uadds with constant operands.
+define i8 @test_scalar_uadd_combine(i8 %a) {
+; CHECK-LABEL: @test_scalar_uadd_combine(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[A:%.*]], i8 30)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %x1 = call i8 @llvm.uadd.sat.i8(i8 %a, i8 10)
+  %x2 = call i8 @llvm.uadd.sat.i8(i8 %x1, i8 20)
+  ret i8 %x2
+}
+
+define <2 x i8> @test_vector_uadd_combine(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_uadd_combine(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 30, i8 30>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %x1 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 10>)
+  %x2 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %x1, <2 x i8> <i8 20, i8 20>)
+  ret <2 x i8> %x2
+}
+
+; This could simplify, but currently doesn't.
+define <2 x i8> @test_vector_uadd_combine_non_splat(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_uadd_combine_non_splat(
+; CHECK-NEXT:    [[X1:%.*]] = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 10, i8 20>)
+; CHECK-NEXT:    [[X2:%.*]] = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> [[X1]], <2 x i8> <i8 30, i8 40>)
+; CHECK-NEXT:    ret <2 x i8> [[X2]]
+;
+  %x1 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 20>)
+  %x2 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %x1, <2 x i8> <i8 30, i8 40>)
+  ret <2 x i8> %x2
+}
+
+; Can combine uadds even if they overflow.
+define i8 @test_scalar_uadd_overflow(i8 %a) {
+; CHECK-LABEL: @test_scalar_uadd_overflow(
+; CHECK-NEXT:    ret i8 -1
+;
+  %y1 = call i8 @llvm.uadd.sat.i8(i8 %a, i8 100)
+  %y2 = call i8 @llvm.uadd.sat.i8(i8 %y1, i8 200)
+  ret i8 %y2
+}
+
+define <2 x i8> @test_vector_uadd_overflow(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_uadd_overflow(
+; CHECK-NEXT:    ret <2 x i8> <i8 -1, i8 -1>
+;
+  %y1 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 100, i8 100>)
+  %y2 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %y1, <2 x i8> <i8 200, i8 200>)
+  ret <2 x i8> %y2
+}
+
+; Can combine sadds if sign matches.
+define i8 @test_scalar_sadd_both_positive(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_both_positive(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 30)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %z1 = call i8 @llvm.sadd.sat.i8(i8 %a, i8 10)
+  %z2 = call i8 @llvm.sadd.sat.i8(i8 %z1, i8 20)
+  ret i8 %z2
+}
+
+define <2 x i8> @test_vector_sadd_both_positive(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_sadd_both_positive(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 30, i8 30>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %z1 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 10>)
+  %z2 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %z1, <2 x i8> <i8 20, i8 20>)
+  ret <2 x i8> %z2
+}
+
+define i8 @test_scalar_sadd_both_negative(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_both_negative(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 -30)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %u1 = call i8 @llvm.sadd.sat.i8(i8 %a, i8 -10)
+  %u2 = call i8 @llvm.sadd.sat.i8(i8 %u1, i8 -20)
+  ret i8 %u2
+}
+
+define <2 x i8> @test_vector_sadd_both_negative(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_sadd_both_negative(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 -30, i8 -30>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %u1 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 -10, i8 -10>)
+  %u2 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %u1, <2 x i8> <i8 -20, i8 -20>)
+  ret <2 x i8> %u2
+}
+
+; Can't combine sadds if constants have different sign.
+define i8 @test_scalar_sadd_different_sign(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_different_sign(
+; CHECK-NEXT:    [[V1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 10)
+; CHECK-NEXT:    [[V2:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[V1]], i8 -20)
+; CHECK-NEXT:    ret i8 [[V2]]
+;
+  %v1 = call i8 @llvm.sadd.sat.i8(i8 %a, i8 10)
+  %v2 = call i8 @llvm.sadd.sat.i8(i8 %v1, i8 -20)
+  ret i8 %v2
+}
+
+; Can't combine sadds if they overflow.
+define i8 @test_scalar_sadd_overflow(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_overflow(
+; CHECK-NEXT:    [[W1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 100)
+; CHECK-NEXT:    [[W2:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[W1]], i8 100)
+; CHECK-NEXT:    ret i8 [[W2]]
+;
+  %w1 = call i8 @llvm.sadd.sat.i8(i8 %a, i8 100)
+  %w2 = call i8 @llvm.sadd.sat.i8(i8 %w1, i8 100)
+  ret i8 %w2
+}
+
+; neg uadd neg always overflows.
+define i8 @test_scalar_uadd_neg_neg(i8 %a) {
+; CHECK-LABEL: @test_scalar_uadd_neg_neg(
+; CHECK-NEXT:    ret i8 -1
+;
+  %a_neg = or i8 %a, -128
+  %r = call i8 @llvm.uadd.sat.i8(i8 %a_neg, i8 -10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_uadd_neg_neg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_uadd_neg_neg(
+; CHECK-NEXT:    ret <2 x i8> <i8 -1, i8 -1>
+;
+  %a_neg = or <2 x i8> %a, <i8 -128, i8 -128>
+  %r = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %a_neg, <2 x i8> <i8 -10, i8 -20>)
+  ret <2 x i8> %r
+}
+
+; nneg uadd nneg never overflows.
+define i8 @test_scalar_uadd_nneg_nneg(i8 %a) {
+; CHECK-LABEL: @test_scalar_uadd_nneg_nneg(
+; CHECK-NEXT:    [[A_NNEG:%.*]] = and i8 [[A:%.*]], 127
+; CHECK-NEXT:    [[R:%.*]] = add nuw i8 [[A_NNEG]], 10
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_nneg = and i8 %a, 127
+  %r = call i8 @llvm.uadd.sat.i8(i8 %a_nneg, i8 10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_uadd_nneg_nneg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_uadd_nneg_nneg(
+; CHECK-NEXT:    [[A_NNEG:%.*]] = and <2 x i8> [[A:%.*]], <i8 127, i8 127>
+; CHECK-NEXT:    [[R:%.*]] = add nuw <2 x i8> [[A_NNEG]], <i8 10, i8 20>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_nneg = and <2 x i8> %a, <i8 127, i8 127>
+  %r = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %a_nneg, <2 x i8> <i8 10, i8 20>)
+  ret <2 x i8> %r
+}
+
+; neg uadd nneg might overflow.
+define i8 @test_scalar_uadd_neg_nneg(i8 %a) {
+; CHECK-LABEL: @test_scalar_uadd_neg_nneg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or i8 [[A:%.*]], -128
+; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[A_NEG]], i8 10)
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_neg = or i8 %a, -128
+  %r = call i8 @llvm.uadd.sat.i8(i8 %a_neg, i8 10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_uadd_neg_nneg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_uadd_neg_nneg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or <2 x i8> [[A:%.*]], <i8 -128, i8 -128>
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> [[A_NEG]], <2 x i8> <i8 10, i8 20>)
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_neg = or <2 x i8> %a, <i8 -128, i8 -128>
+  %r = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %a_neg, <2 x i8> <i8 10, i8 20>)
+  ret <2 x i8> %r
+}
+
+; neg sadd nneg never overflows.
+define i8 @test_scalar_sadd_neg_nneg(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_neg_nneg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or i8 [[A:%.*]], -128
+; CHECK-NEXT:    [[R:%.*]] = add nsw i8 [[A_NEG]], 10
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_neg = or i8 %a, -128
+  %r = call i8 @llvm.sadd.sat.i8(i8 %a_neg, i8 10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_sadd_neg_nneg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_sadd_neg_nneg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or <2 x i8> [[A:%.*]], <i8 -128, i8 -128>
+; CHECK-NEXT:    [[R:%.*]] = add nsw <2 x i8> [[A_NEG]], <i8 10, i8 20>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_neg = or <2 x i8> %a, <i8 -128, i8 -128>
+  %r = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a_neg, <2 x i8> <i8 10, i8 20>)
+  ret <2 x i8> %r
+}
+
+; nneg sadd neg never overflows.
+define i8 @test_scalar_sadd_nneg_neg(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_nneg_neg(
+; CHECK-NEXT:    [[A_NNEG:%.*]] = and i8 [[A:%.*]], 127
+; CHECK-NEXT:    [[R:%.*]] = add nsw i8 [[A_NNEG]], -10
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_nneg = and i8 %a, 127
+  %r = call i8 @llvm.sadd.sat.i8(i8 %a_nneg, i8 -10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_sadd_nneg_neg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_sadd_nneg_neg(
+; CHECK-NEXT:    [[A_NNEG:%.*]] = and <2 x i8> [[A:%.*]], <i8 127, i8 127>
+; CHECK-NEXT:    [[R:%.*]] = add nsw <2 x i8> [[A_NNEG]], <i8 -10, i8 -20>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_nneg = and <2 x i8> %a, <i8 127, i8 127>
+  %r = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a_nneg, <2 x i8> <i8 -10, i8 -20>)
+  ret <2 x i8> %r
+}
+
+; neg sadd neg might overflow.
+define i8 @test_scalar_sadd_neg_neg(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_neg_neg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or i8 [[A:%.*]], -128
+; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A_NEG]], i8 -10)
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_neg = or i8 %a, -128
+  %r = call i8 @llvm.sadd.sat.i8(i8 %a_neg, i8 -10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_sadd_neg_neg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_sadd_neg_neg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or <2 x i8> [[A:%.*]], <i8 -128, i8 -128>
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A_NEG]], <2 x i8> <i8 -10, i8 -20>)
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_neg = or <2 x i8> %a, <i8 -128, i8 -128>
+  %r = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a_neg, <2 x i8> <i8 -10, i8 -20>)
+  ret <2 x i8> %r
+}
+
+;
+; Saturating subtraction.
+;
+
+declare i8 @llvm.usub.sat.i8(i8, i8)
+declare i8 @llvm.ssub.sat.i8(i8, i8)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+; Cannot canonicalize usub to uadd.
+define i8 @test_scalar_usub_canonical(i8 %a) {
+; CHECK-LABEL: @test_scalar_usub_canonical(
+; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[A:%.*]], i8 10)
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %r = call i8 @llvm.usub.sat.i8(i8 %a, i8 10)
+  ret i8 %r
+}
+
+; Canonicalize ssub to sadd.
+define i8 @test_scalar_ssub_canonical(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_canonical(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 -10)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %r = call i8 @llvm.ssub.sat.i8(i8 %a, i8 10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_ssub_canonical(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_ssub_canonical(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 -10, i8 -10>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %r = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 10>)
+  ret <2 x i8> %r
+}
+
+define <2 x i8> @test_vector_ssub_canonical_min_non_splat(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_ssub_canonical_min_non_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 -10, i8 10>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %r = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 -10>)
+  ret <2 x i8> %r
+}
+
+; Cannot canonicalize signed min.
+define i8 @test_scalar_ssub_canonical_min(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_canonical_min(
+; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.ssub.sat.i8(i8 [[A:%.*]], i8 -128)
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %r = call i8 @llvm.ssub.sat.i8(i8 %a, i8 -128)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_ssub_canonical_min(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_ssub_canonical_min(
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 -128, i8 -10>)
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %r = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 -128, i8 -10>)
+  ret <2 x i8> %r
+}
+
+; Can combine usubs with constant operands.
+define i8 @test_scalar_usub_combine(i8 %a) {
+; CHECK-LABEL: @test_scalar_usub_combine(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[A:%.*]], i8 30)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %x1 = call i8 @llvm.usub.sat.i8(i8 %a, i8 10)
+  %x2 = call i8 @llvm.usub.sat.i8(i8 %x1, i8 20)
+  ret i8 %x2
+}
+
+define <2 x i8> @test_vector_usub_combine(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_usub_combine(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 30, i8 30>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %x1 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 10>)
+  %x2 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %x1, <2 x i8> <i8 20, i8 20>)
+  ret <2 x i8> %x2
+}
+
+; This could simplify, but currently doesn't.
+define <2 x i8> @test_vector_usub_combine_non_splat(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_usub_combine_non_splat(
+; CHECK-NEXT:    [[X1:%.*]] = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 10, i8 20>)
+; CHECK-NEXT:    [[X2:%.*]] = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> [[X1]], <2 x i8> <i8 30, i8 40>)
+; CHECK-NEXT:    ret <2 x i8> [[X2]]
+;
+  %x1 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 20>)
+  %x2 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %x1, <2 x i8> <i8 30, i8 40>)
+  ret <2 x i8> %x2
+}
+
+; Can combine usubs even if they overflow.
+define i8 @test_scalar_usub_overflow(i8 %a) {
+; CHECK-LABEL: @test_scalar_usub_overflow(
+; CHECK-NEXT:    ret i8 0
+;
+  %y1 = call i8 @llvm.usub.sat.i8(i8 %a, i8 100)
+  %y2 = call i8 @llvm.usub.sat.i8(i8 %y1, i8 200)
+  ret i8 %y2
+}
+
+define <2 x i8> @test_vector_usub_overflow(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_usub_overflow(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %y1 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 100, i8 100>)
+  %y2 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %y1, <2 x i8> <i8 200, i8 200>)
+  ret <2 x i8> %y2
+}
+
+; Can combine ssubs if sign matches.
+define i8 @test_scalar_ssub_both_positive(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_both_positive(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 -30)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %z1 = call i8 @llvm.ssub.sat.i8(i8 %a, i8 10)
+  %z2 = call i8 @llvm.ssub.sat.i8(i8 %z1, i8 20)
+  ret i8 %z2
+}
+
+define <2 x i8> @test_vector_ssub_both_positive(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_ssub_both_positive(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 -30, i8 -30>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %z1 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 10>)
+  %z2 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %z1, <2 x i8> <i8 20, i8 20>)
+  ret <2 x i8> %z2
+}
+
+define i8 @test_scalar_ssub_both_negative(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_both_negative(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 30)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %u1 = call i8 @llvm.ssub.sat.i8(i8 %a, i8 -10)
+  %u2 = call i8 @llvm.ssub.sat.i8(i8 %u1, i8 -20)
+  ret i8 %u2
+}
+
+define <2 x i8> @test_vector_ssub_both_negative(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_ssub_both_negative(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 30, i8 30>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %u1 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 -10, i8 -10>)
+  %u2 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %u1, <2 x i8> <i8 -20, i8 -20>)
+  ret <2 x i8> %u2
+}
+
+; Can't combine ssubs if constants have different sign.
+define i8 @test_scalar_ssub_different_sign(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_different_sign(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 -10)
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[TMP1]], i8 20)
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %v1 = call i8 @llvm.ssub.sat.i8(i8 %a, i8 10)
+  %v2 = call i8 @llvm.ssub.sat.i8(i8 %v1, i8 -20)
+  ret i8 %v2
+}
+
+; Can combine sadd and ssub with appropriate signs.
+define i8 @test_scalar_sadd_ssub(i8 %a) {
+; CHECK-LABEL: @test_scalar_sadd_ssub(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 30)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %v1 = call i8 @llvm.sadd.sat.i8(i8 10, i8 %a)
+  %v2 = call i8 @llvm.ssub.sat.i8(i8 %v1, i8 -20)
+  ret i8 %v2
+}
+
+define <2 x i8> @test_vector_sadd_ssub(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_sadd_ssub(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 -30, i8 -30>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %v1 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> <i8 -10, i8 -10>, <2 x i8> %a)
+  %v2 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %v1, <2 x i8> <i8 20, i8 20>)
+  ret <2 x i8> %v2
+}
+
+; Can't combine ssubs if they overflow.
+define i8 @test_scalar_ssub_overflow(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 -100)
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[TMP1]], i8 -100)
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %w1 = call i8 @llvm.ssub.sat.i8(i8 %a, i8 100)
+  %w2 = call i8 @llvm.ssub.sat.i8(i8 %w1, i8 100)
+  ret i8 %w2
+}
+
+; nneg usub neg always overflows.
+define i8 @test_scalar_usub_nneg_neg(i8 %a) {
+; CHECK-LABEL: @test_scalar_usub_nneg_neg(
+; CHECK-NEXT:    ret i8 0
+;
+  %a_nneg = and i8 %a, 127
+  %r = call i8 @llvm.usub.sat.i8(i8 %a_nneg, i8 -10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_usub_nneg_neg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_usub_nneg_neg(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %a_nneg = and <2 x i8> %a, <i8 127, i8 127>
+  %r = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a_nneg, <2 x i8> <i8 -10, i8 -20>)
+  ret <2 x i8> %r
+}
+
+; neg usub nneg never overflows.
+define i8 @test_scalar_usub_neg_nneg(i8 %a) {
+; CHECK-LABEL: @test_scalar_usub_neg_nneg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or i8 [[A:%.*]], -128
+; CHECK-NEXT:    [[R:%.*]] = add i8 [[A_NEG]], -10
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_neg = or i8 %a, -128
+  %r = call i8 @llvm.usub.sat.i8(i8 %a_neg, i8 10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_usub_neg_nneg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_usub_neg_nneg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or <2 x i8> [[A:%.*]], <i8 -128, i8 -128>
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i8> [[A_NEG]], <i8 -10, i8 -20>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_neg = or <2 x i8> %a, <i8 -128, i8 -128>
+  %r = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a_neg, <2 x i8> <i8 10, i8 20>)
+  ret <2 x i8> %r
+}
+
+; nneg usub nneg never may overflow.
+define i8 @test_scalar_usub_nneg_nneg(i8 %a) {
+; CHECK-LABEL: @test_scalar_usub_nneg_nneg(
+; CHECK-NEXT:    [[A_NNEG:%.*]] = and i8 [[A:%.*]], 127
+; CHECK-NEXT:    [[R:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[A_NNEG]], i8 10)
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_nneg = and i8 %a, 127
+  %r = call i8 @llvm.usub.sat.i8(i8 %a_nneg, i8 10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_usub_nneg_nneg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_usub_nneg_nneg(
+; CHECK-NEXT:    [[A_NNEG:%.*]] = and <2 x i8> [[A:%.*]], <i8 127, i8 127>
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> [[A_NNEG]], <2 x i8> <i8 10, i8 20>)
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_nneg = and <2 x i8> %a, <i8 127, i8 127>
+  %r = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a_nneg, <2 x i8> <i8 10, i8 20>)
+  ret <2 x i8> %r
+}
+
+; neg ssub neg never overflows.
+define i8 @test_scalar_ssub_neg_neg(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_neg_neg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or i8 [[A:%.*]], -128
+; CHECK-NEXT:    [[R:%.*]] = add nsw i8 [[A_NEG]], 10
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_neg = or i8 %a, -128
+  %r = call i8 @llvm.ssub.sat.i8(i8 %a_neg, i8 -10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_ssub_neg_neg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_ssub_neg_neg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or <2 x i8> [[A:%.*]], <i8 -128, i8 -128>
+; CHECK-NEXT:    [[R:%.*]] = add nsw <2 x i8> [[A_NEG]], <i8 10, i8 20>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_neg = or <2 x i8> %a, <i8 -128, i8 -128>
+  %r = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a_neg, <2 x i8> <i8 -10, i8 -20>)
+  ret <2 x i8> %r
+}
+
+; nneg ssub nneg never overflows.
+define i8 @test_scalar_ssub_nneg_nneg(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_nneg_nneg(
+; CHECK-NEXT:    [[A_NNEG:%.*]] = and i8 [[A:%.*]], 127
+; CHECK-NEXT:    [[R:%.*]] = add nsw i8 [[A_NNEG]], -10
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %a_nneg = and i8 %a, 127
+  %r = call i8 @llvm.ssub.sat.i8(i8 %a_nneg, i8 10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_ssub_nneg_nneg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_ssub_nneg_nneg(
+; CHECK-NEXT:    [[A_NNEG:%.*]] = and <2 x i8> [[A:%.*]], <i8 127, i8 127>
+; CHECK-NEXT:    [[R:%.*]] = add nsw <2 x i8> [[A_NNEG]], <i8 -10, i8 -20>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %a_nneg = and <2 x i8> %a, <i8 127, i8 127>
+  %r = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a_nneg, <2 x i8> <i8 10, i8 20>)
+  ret <2 x i8> %r
+}
+
+; neg ssub nneg may overflow.
+define i8 @test_scalar_ssub_neg_nneg(i8 %a) {
+; CHECK-LABEL: @test_scalar_ssub_neg_nneg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or i8 [[A:%.*]], -128
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A_NEG]], i8 -10)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %a_neg = or i8 %a, -128
+  %r = call i8 @llvm.ssub.sat.i8(i8 %a_neg, i8 10)
+  ret i8 %r
+}
+
+define <2 x i8> @test_vector_ssub_neg_nneg(<2 x i8> %a) {
+; CHECK-LABEL: @test_vector_ssub_neg_nneg(
+; CHECK-NEXT:    [[A_NEG:%.*]] = or <2 x i8> [[A:%.*]], <i8 -128, i8 -128>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A_NEG]], <2 x i8> <i8 -10, i8 -20>)
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %a_neg = or <2 x i8> %a, <i8 -128, i8 -128>
+  %r = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a_neg, <2 x i8> <i8 10, i8 20>)
+  ret <2 x i8> %r
+}
diff --git a/test/Transforms/InstCombine/scalarization.ll b/test/Transforms/InstCombine/scalarization.ll
index 3bfdfd2885b4c3c28404c9bfe28351c21b0370ee..586509542b8b82a8944af8785a2cee0049fc76ca 100644
--- a/test/Transforms/InstCombine/scalarization.ll
+++ b/test/Transforms/InstCombine/scalarization.ll
@@ -1,6 +1,63 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
+define i32 @extract_load(<4 x i32>* %p) {
+; CHECK-LABEL: @extract_load(
+; CHECK-NEXT:    [[X:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; CHECK-NEXT:    ret i32 [[EXT]]
+;
+  %x = load <4 x i32>, <4 x i32>* %p, align 4
+  %ext = extractelement <4 x i32> %x, i32 1
+  ret i32 %ext
+}
+
+define double @extract_load_fp(<4 x double>* %p) {
+; CHECK-LABEL: @extract_load_fp(
+; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 3
+; CHECK-NEXT:    ret double [[EXT]]
+;
+  %x = load <4 x double>, <4 x double>* %p, align 32
+  %ext = extractelement <4 x double> %x, i32 3
+  ret double %ext
+}
+
+define double @extract_load_volatile(<4 x double>* %p) {
+; CHECK-LABEL: @extract_load_volatile(
+; CHECK-NEXT:    [[X:%.*]] = load volatile <4 x double>, <4 x double>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 2
+; CHECK-NEXT:    ret double [[EXT]]
+;
+  %x = load volatile <4 x double>, <4 x double>* %p
+  %ext = extractelement <4 x double> %x, i32 2
+  ret double %ext
+}
+
+define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) {
+; CHECK-LABEL: @extract_load_extra_use(
+; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 8
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 0
+; CHECK-NEXT:    store <4 x double> [[X]], <4 x double>* [[P2:%.*]], align 32
+; CHECK-NEXT:    ret double [[EXT]]
+;
+  %x = load <4 x double>, <4 x double>* %p, align 8
+  %ext = extractelement <4 x double> %x, i32 0
+  store <4 x double> %x, <4 x double>* %p2
+  ret double %ext
+}
+
+define double @extract_load_variable_index(<4 x double>* %p, i32 %y) {
+; CHECK-LABEL: @extract_load_variable_index(
+; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 [[Y:%.*]]
+; CHECK-NEXT:    ret double [[EXT]]
+;
+  %x = load <4 x double>, <4 x double>* %p
+  %ext = extractelement <4 x double> %x, i32 %y
+  ret double %ext
+}
+
 define void @scalarize_phi(i32 * %n, float * %inout) {
 ; CHECK-LABEL: @scalarize_phi(
 ; CHECK-NEXT:  entry:
@@ -45,21 +102,105 @@ for.end:
   ret void
 }
 
-define float @extract_element_constant_index(<4 x float> %x) {
-; CHECK-LABEL: @extract_element_constant_index(
-; CHECK-NEXT:    [[R_LHS:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[R_LHS]], 0x4002A3D700000000
+define float @extract_element_binop_splat_constant_index(<4 x float> %x) {
+; CHECK-LABEL: @extract_element_binop_splat_constant_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], 0x4002A3D700000000
+; CHECK-NEXT:    ret float [[R]]
+;
+  %b = fadd <4 x float> %x, <float 0x4002A3D700000000, float 0x4002A3D700000000, float 0x4002A3D700000000, float 0x4002A3D700000000>
+  %r = extractelement <4 x float> %b, i32 2
+  ret float %r
+}
+
+define double @extract_element_binop_splat_with_undef_constant_index(<2 x double> %x) {
+; CHECK-LABEL: @extract_element_binop_splat_with_undef_constant_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = fdiv double 4.200000e+01, [[TMP1]]
+; CHECK-NEXT:    ret double [[R]]
+;
+  %b = fdiv <2 x double> <double 42.0, double undef>, %x
+  %r = extractelement <2 x double> %b, i32 0
+  ret double %r
+}
+
+define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) {
+; CHECK-LABEL: @extract_element_binop_nonsplat_constant_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
+; CHECK-NEXT:    [[R:%.*]] = fmul float [[TMP1]], 4.300000e+01
+; CHECK-NEXT:    ret float [[R]]
+;
+  %b = fmul <2 x float> %x, <float 42.0, float 43.0>
+  %r = extractelement <2 x float> %b, i32 1
+  ret float %r
+}
+
+define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) {
+; CHECK-LABEL: @extract_element_binop_splat_variable_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sdiv i8 [[TMP1]], 42
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %b = sdiv <4 x i8> %x, <i8 42, i8 42, i8 42, i8 42>
+  %r = extractelement <4 x i8> %b, i32 %y
+  ret i8 %r
+}
+
+define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i32 %y) {
+; CHECK-LABEL: @extract_element_binop_splat_with_undef_variable_index(
+; CHECK-NEXT:    [[B:%.*]] = mul <4 x i8> [[X:%.*]], <i8 42, i8 42, i8 undef, i8 42>
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %b = mul <4 x i8> %x, <i8 42, i8 42, i8 undef, i8 42>
+  %r = extractelement <4 x i8> %b, i32 %y
+  ret i8 %r
+}
+
+define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) {
+; CHECK-LABEL: @extract_element_binop_nonsplat_variable_index(
+; CHECK-NEXT:    [[B:%.*]] = lshr <4 x i8> [[X:%.*]], <i8 4, i8 3, i8 undef, i8 2>
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %b = lshr <4 x i8> %x, <i8 4, i8 3, i8 undef, i8 2>
+  %r = extractelement <4 x i8> %b, i32 %y
+  ret i8 %r
+}
+
+define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) {
+; CHECK-LABEL: @extract_element_load(
+; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %load = load <4 x float>, <4 x float>* %ptr
+  %add = fadd <4 x float> %x, %load
+  %r = extractelement <4 x float> %add, i32 2
+  ret float %r
+}
+
+define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0, <4 x float>* %ptr1) {
+; CHECK-LABEL: @extract_element_multi_Use_load(
+; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR0:%.*]], align 16
+; CHECK-NEXT:    store <4 x float> [[LOAD]], <4 x float>* [[PTR1:%.*]], align 16
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[LOAD]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[ADD]], i32 2
 ; CHECK-NEXT:    ret float [[R]]
 ;
-  %add = fadd <4 x float> %x, <float 0x4002A3D700000000, float 0x4002A3D700000000, float 0x4002A3D700000000, float 0x4002A3D700000000>
+  %load = load <4 x float>, <4 x float>* %ptr0
+  store <4 x float> %load, <4 x float>* %ptr1
+  %add = fadd <4 x float> %x, %load
   %r = extractelement <4 x float> %add, i32 2
   ret float %r
 }
 
 define float @extract_element_variable_index(<4 x float> %x, i32 %y) {
 ; CHECK-LABEL: @extract_element_variable_index(
-; CHECK-NEXT:    [[R_LHS:%.*]] = extractelement <4 x float> [[X:%.*]], i32 [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[R_LHS]], 1.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -67,12 +208,34 @@ define float @extract_element_variable_index(<4 x float> %x, i32 %y) {
   ret float %r
 }
 
-define float @extract_element_splat_constant_vector_variable_index(i32 %y) {
-; CHECK-LABEL: @extract_element_splat_constant_vector_variable_index(
-; CHECK-NEXT:    ret float 2.000000e+00
+define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) {
+; CHECK-LABEL: @extelt_binop_insertelt(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[E:%.*]] = fmul nnan float [[TMP1]], [[F:%.*]]
+; CHECK-NEXT:    ret float [[E]]
 ;
-  %r = extractelement <4 x float> <float 2.0, float 2.0, float 2.0, float 2.0>, i32 %y
-  ret float %r
+  %C = insertelement <4 x float> %A, float %f, i32 0
+  %D = fmul nnan <4 x float> %C, %B
+  %E = extractelement <4 x float> %D, i32 0
+  ret float %E
+}
+
+; We recurse to find a scalarizable operand.
+; FIXME: We should propagate the IR flags including wrapping flags.
+
+define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) {
+; CHECK-LABEL: @extelt_binop_binop_insertelt(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], [[F:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[B]], i32 0
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %v = insertelement <4 x i32> %A, i32 %f, i32 0
+  %C = add <4 x i32> %v, %B
+  %D = mul nsw <4 x i32> %C, %B
+  %E = extractelement <4 x i32> %D, i32 0
+  ret i32 %E
 }
 
 define float @extract_element_constant_vector_variable_index(i32 %y) {
@@ -84,3 +247,89 @@ define float @extract_element_constant_vector_variable_index(i32 %y) {
   ret float %r
 }
 
+define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) {
+; CHECK-LABEL: @cheap_to_extract_icmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp eq <4 x i32> %x, zeroinitializer
+  %and = and <4 x i1> %cmp, %y
+  %r = extractelement <4 x i1> %and, i32 2
+  ret i1 %r
+}
+
+define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) {
+; CHECK-LABEL: @cheap_to_extract_fcmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = fcmp oeq <4 x float> %x, zeroinitializer
+  %and = and <4 x i1> %cmp, %y
+  %r = extractelement <4 x i1> %and, i32 2
+  ret i1 %r
+}
+
+define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) {
+; CHECK-LABEL: @extractelt_vector_icmp_constrhs(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %cmp = icmp eq <2 x i32> %arg, zeroinitializer
+  %ext = extractelement <2 x i1> %cmp, i32 0
+  ret i1 %ext
+}
+
+define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) {
+; CHECK-LABEL: @extractelt_vector_fcmp_constrhs(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %cmp = fcmp oeq <2 x float> %arg, zeroinitializer
+  %ext = extractelement <2 x i1> %cmp, i32 0
+  ret i1 %ext
+}
+
+define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) {
+; CHECK-LABEL: @extractelt_vector_icmp_constrhs_dynidx(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %cmp = icmp eq <2 x i32> %arg, zeroinitializer
+  %ext = extractelement <2 x i1> %cmp, i32 %idx
+  ret i1 %ext
+}
+
+define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) {
+; CHECK-LABEL: @extractelt_vector_fcmp_constrhs_dynidx(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %cmp = fcmp oeq <2 x float> %arg, zeroinitializer
+  %ext = extractelement <2 x i1> %cmp, i32 %idx
+  ret i1 %ext
+}
+
+define i1 @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(<2 x float> %arg0, <2 x float> %arg1, <2 x float> %arg2, i32 %idx) {
+; CHECK-LABEL: @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x float> [[ARG1:%.*]], [[ARG2:%.*]]
+; CHECK-NEXT:    store volatile <2 x float> [[ADD]], <2 x float>* undef, align 8
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq <2 x float> [[ADD]], [[ARG0:%.*]]
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <2 x i1> [[CMP]], i32 0
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %add = fadd <2 x float> %arg1, %arg2
+  store volatile <2 x float> %add, <2 x float>* undef
+  %cmp = fcmp oeq <2 x float> %arg0, %add
+  %ext = extractelement <2 x i1> %cmp, i32 0
+  ret i1 %ext
+}
diff --git a/test/Transforms/InstCombine/select-bitext.ll b/test/Transforms/InstCombine/select-bitext.ll
index b66a9eef4ab63ee8b1ca8ca9e4bdf041b0584301..d44be27357354276a144904b679ac85491609ba4 100644
--- a/test/Transforms/InstCombine/select-bitext.ll
+++ b/test/Transforms/InstCombine/select-bitext.ll
@@ -5,7 +5,7 @@
 
 define i16 @sel_sext_constants(i1 %cmp) {
 ; CHECK-LABEL: @sel_sext_constants(
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i16 -1, i16 42
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i16 -1, i16 42
 ; CHECK-NEXT:    ret i16 [[EXT]]
 ;
   %sel = select i1 %cmp, i8 255, i8 42
@@ -15,7 +15,7 @@ define i16 @sel_sext_constants(i1 %cmp) {
 
 define i16 @sel_zext_constants(i1 %cmp) {
 ; CHECK-LABEL: @sel_zext_constants(
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i16 255, i16 42
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i16 255, i16 42
 ; CHECK-NEXT:    ret i16 [[EXT]]
 ;
   %sel = select i1 %cmp, i8 255, i8 42
@@ -25,7 +25,7 @@ define i16 @sel_zext_constants(i1 %cmp) {
 
 define double @sel_fpext_constants(i1 %cmp) {
 ; CHECK-LABEL: @sel_fpext_constants(
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, double -2.550000e+02, double 4.200000e+01
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], double -2.550000e+02, double 4.200000e+01
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
   %sel = select i1 %cmp, float -255.0, float 42.0
@@ -37,8 +37,8 @@ define double @sel_fpext_constants(i1 %cmp) {
 
 define i64 @sel_sext(i32 %a, i1 %cmp) {
 ; CHECK-LABEL: @sel_sext(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 %a to i64
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i64 [[TMP1]], i64 42
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i64 [[TMP1]], i64 42
 ; CHECK-NEXT:    ret i64 [[EXT]]
 ;
   %sel = select i1 %cmp, i32 %a, i32 42
@@ -48,8 +48,8 @@ define i64 @sel_sext(i32 %a, i1 %cmp) {
 
 define <4 x i64> @sel_sext_vec(<4 x i32> %a, <4 x i1> %cmp) {
 ; CHECK-LABEL: @sel_sext_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i32> %a to <4 x i64>
-; CHECK-NEXT:    [[EXT:%.*]] = select <4 x i1> %cmp, <4 x i64> [[TMP1]], <4 x i64> <i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i32> [[A:%.*]] to <4 x i64>
+; CHECK-NEXT:    [[EXT:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i64> [[TMP1]], <4 x i64> <i64 42, i64 42, i64 42, i64 42>
 ; CHECK-NEXT:    ret <4 x i64> [[EXT]]
 ;
   %sel = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
@@ -59,8 +59,8 @@ define <4 x i64> @sel_sext_vec(<4 x i32> %a, <4 x i1> %cmp) {
 
 define i64 @sel_zext(i32 %a, i1 %cmp) {
 ; CHECK-LABEL: @sel_zext(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 %a to i64
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i64 [[TMP1]], i64 42
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i64 [[TMP1]], i64 42
 ; CHECK-NEXT:    ret i64 [[EXT]]
 ;
   %sel = select i1 %cmp, i32 %a, i32 42
@@ -70,8 +70,8 @@ define i64 @sel_zext(i32 %a, i1 %cmp) {
 
 define <4 x i64> @sel_zext_vec(<4 x i32> %a, <4 x i1> %cmp) {
 ; CHECK-LABEL: @sel_zext_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> %a to <4 x i64>
-; CHECK-NEXT:    [[EXT:%.*]] = select <4 x i1> %cmp, <4 x i64> [[TMP1]], <4 x i64> <i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> [[A:%.*]] to <4 x i64>
+; CHECK-NEXT:    [[EXT:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i64> [[TMP1]], <4 x i64> <i64 42, i64 42, i64 42, i64 42>
 ; CHECK-NEXT:    ret <4 x i64> [[EXT]]
 ;
   %sel = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
@@ -85,9 +85,9 @@ define <4 x i64> @sel_zext_vec(<4 x i32> %a, <4 x i1> %cmp) {
 
 define i64 @trunc_sel_larger_sext(i32 %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_larger_sext(
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 %a to i16
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[A:%.*]] to i16
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TRUNC]] to i64
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i64 [[TMP1]], i64 42
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i64 [[TMP1]], i64 42
 ; CHECK-NEXT:    ret i64 [[EXT]]
 ;
   %trunc = trunc i32 %a to i16
@@ -98,10 +98,9 @@ define i64 @trunc_sel_larger_sext(i32 %a, i1 %cmp) {
 
 define <2 x i64> @trunc_sel_larger_sext_vec(<2 x i32> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_larger_sext_vec(
-; CHECK-NEXT:    [[TRUNC:%.*]] = zext <2 x i32> %a to <2 x i64>
-; CHECK-NEXT:    [[SEXT:%.*]] = shl <2 x i64> [[TRUNC]], <i64 48, i64 48>
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact <2 x i64> [[SEXT]], <i64 48, i64 48>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i64> [[TMP1]], <2 x i64> <i64 42, i64 43>
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i32> [[A:%.*]] to <2 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i16> [[TRUNC]] to <2 x i64>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i64> [[TMP1]], <2 x i64> <i64 42, i64 43>
 ; CHECK-NEXT:    ret <2 x i64> [[EXT]]
 ;
   %trunc = trunc <2 x i32> %a to <2 x i16>
@@ -112,9 +111,9 @@ define <2 x i64> @trunc_sel_larger_sext_vec(<2 x i32> %a, <2 x i1> %cmp) {
 
 define i32 @trunc_sel_smaller_sext(i64 %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_smaller_sext(
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 %a to i16
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 [[A:%.*]] to i16
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TRUNC]] to i32
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i32 [[TMP1]], i32 42
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i32 [[TMP1]], i32 42
 ; CHECK-NEXT:    ret i32 [[EXT]]
 ;
   %trunc = trunc i64 %a to i16
@@ -125,10 +124,9 @@ define i32 @trunc_sel_smaller_sext(i64 %a, i1 %cmp) {
 
 define <2 x i32> @trunc_sel_smaller_sext_vec(<2 x i64> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_smaller_sext_vec(
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i64> %a to <2 x i32>
-; CHECK-NEXT:    [[SEXT:%.*]] = shl <2 x i32> [[TRUNC]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact <2 x i32> [[SEXT]], <i32 16, i32 16>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i32> [[TMP1]], <2 x i32> <i32 42, i32 43>
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i16> [[TRUNC]] to <2 x i32>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i32> [[TMP1]], <2 x i32> <i32 42, i32 43>
 ; CHECK-NEXT:    ret <2 x i32> [[EXT]]
 ;
   %trunc = trunc <2 x i64> %a to <2 x i16>
@@ -139,9 +137,9 @@ define <2 x i32> @trunc_sel_smaller_sext_vec(<2 x i64> %a, <2 x i1> %cmp) {
 
 define i32 @trunc_sel_equal_sext(i32 %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_equal_sext(
-; CHECK-NEXT:    [[SEXT:%.*]] = shl i32 %a, 16
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[SEXT]], 16
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i32 [[TMP1]], i32 42
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[A:%.*]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr exact i32 [[TMP1]], 16
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i32 [[TMP2]], i32 42
 ; CHECK-NEXT:    ret i32 [[EXT]]
 ;
   %trunc = trunc i32 %a to i16
@@ -152,9 +150,9 @@ define i32 @trunc_sel_equal_sext(i32 %a, i1 %cmp) {
 
 define <2 x i32> @trunc_sel_equal_sext_vec(<2 x i32> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_equal_sext_vec(
-; CHECK-NEXT:    [[SEXT:%.*]] = shl <2 x i32> %a, <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact <2 x i32> [[SEXT]], <i32 16, i32 16>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i32> [[TMP1]], <2 x i32> <i32 42, i32 43>
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i32> [[A:%.*]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr exact <2 x i32> [[TMP1]], <i32 16, i32 16>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i32> [[TMP2]], <2 x i32> <i32 42, i32 43>
 ; CHECK-NEXT:    ret <2 x i32> [[EXT]]
 ;
   %trunc = trunc <2 x i32> %a to <2 x i16>
@@ -165,9 +163,9 @@ define <2 x i32> @trunc_sel_equal_sext_vec(<2 x i32> %a, <2 x i1> %cmp) {
 
 define i64 @trunc_sel_larger_zext(i32 %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_larger_zext(
-; CHECK-NEXT:    [[TRUNC_MASK:%.*]] = and i32 %a, 65535
+; CHECK-NEXT:    [[TRUNC_MASK:%.*]] = and i32 [[A:%.*]], 65535
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TRUNC_MASK]] to i64
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i64 [[TMP1]], i64 42
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i64 [[TMP1]], i64 42
 ; CHECK-NEXT:    ret i64 [[EXT]]
 ;
   %trunc = trunc i32 %a to i16
@@ -178,9 +176,9 @@ define i64 @trunc_sel_larger_zext(i32 %a, i1 %cmp) {
 
 define <2 x i64> @trunc_sel_larger_zext_vec(<2 x i32> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_larger_zext_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> %a, <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i64> [[TMP2]], <2 x i64> <i64 42, i64 43>
+; CHECK-NEXT:    [[TRUNC_MASK:%.*]] = and <2 x i32> [[A:%.*]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i32> [[TRUNC_MASK]] to <2 x i64>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i64> [[TMP1]], <2 x i64> <i64 42, i64 43>
 ; CHECK-NEXT:    ret <2 x i64> [[EXT]]
 ;
   %trunc = trunc <2 x i32> %a to <2 x i16>
@@ -191,9 +189,9 @@ define <2 x i64> @trunc_sel_larger_zext_vec(<2 x i32> %a, <2 x i1> %cmp) {
 
 define i32 @trunc_sel_smaller_zext(i64 %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_smaller_zext(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 %a to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 65535
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i32 [[TMP2]], i32 42
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i32 [[TMP2]], i32 42
 ; CHECK-NEXT:    ret i32 [[EXT]]
 ;
   %trunc = trunc i64 %a to i16
@@ -204,9 +202,9 @@ define i32 @trunc_sel_smaller_zext(i64 %a, i1 %cmp) {
 
 define <2 x i32> @trunc_sel_smaller_zext_vec(<2 x i64> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_smaller_zext_vec(
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i64> %a to <2 x i32>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[TRUNC]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i32> [[TMP1]], <2 x i32> <i32 42, i32 43>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i32> [[TMP1]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i32> [[TMP2]], <2 x i32> <i32 42, i32 43>
 ; CHECK-NEXT:    ret <2 x i32> [[EXT]]
 ;
   %trunc = trunc <2 x i64> %a to <2 x i16>
@@ -217,8 +215,8 @@ define <2 x i32> @trunc_sel_smaller_zext_vec(<2 x i64> %a, <2 x i1> %cmp) {
 
 define i32 @trunc_sel_equal_zext(i32 %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_equal_zext(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 %a, 65535
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, i32 [[TMP1]], i32 42
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 65535
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], i32 [[TMP1]], i32 42
 ; CHECK-NEXT:    ret i32 [[EXT]]
 ;
   %trunc = trunc i32 %a to i16
@@ -229,8 +227,8 @@ define i32 @trunc_sel_equal_zext(i32 %a, i1 %cmp) {
 
 define <2 x i32> @trunc_sel_equal_zext_vec(<2 x i32> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_equal_zext_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> %a, <i32 65535, i32 65535>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i32> [[TMP1]], <2 x i32> <i32 42, i32 43>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i32> [[TMP1]], <2 x i32> <i32 42, i32 43>
 ; CHECK-NEXT:    ret <2 x i32> [[EXT]]
 ;
   %trunc = trunc <2 x i32> %a to <2 x i16>
@@ -241,9 +239,9 @@ define <2 x i32> @trunc_sel_equal_zext_vec(<2 x i32> %a, <2 x i1> %cmp) {
 
 define double @trunc_sel_larger_fpext(float %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_larger_fpext(
-; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc float %a to half
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc float [[A:%.*]] to half
 ; CHECK-NEXT:    [[TMP1:%.*]] = fpext half [[TRUNC]] to double
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, double [[TMP1]], double 4.200000e+01
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], double [[TMP1]], double 4.200000e+01
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
   %trunc = fptrunc float %a to half
@@ -254,9 +252,9 @@ define double @trunc_sel_larger_fpext(float %a, i1 %cmp) {
 
 define <2 x double> @trunc_sel_larger_fpext_vec(<2 x float> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_larger_fpext_vec(
-; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <2 x float> %a to <2 x half>
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <2 x float> [[A:%.*]] to <2 x half>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fpext <2 x half> [[TRUNC]] to <2 x double>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x double> [[TMP1]], <2 x double> <double 4.200000e+01, double 4.300000e+01>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x double> [[TMP1]], <2 x double> <double 4.200000e+01, double 4.300000e+01>
 ; CHECK-NEXT:    ret <2 x double> [[EXT]]
 ;
   %trunc = fptrunc <2 x float> %a to <2 x half>
@@ -267,9 +265,9 @@ define <2 x double> @trunc_sel_larger_fpext_vec(<2 x float> %a, <2 x i1> %cmp) {
 
 define float @trunc_sel_smaller_fpext(double %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_smaller_fpext(
-; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc double %a to half
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc double [[A:%.*]] to half
 ; CHECK-NEXT:    [[TMP1:%.*]] = fpext half [[TRUNC]] to float
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, float [[TMP1]], float 4.200000e+01
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], float [[TMP1]], float 4.200000e+01
 ; CHECK-NEXT:    ret float [[EXT]]
 ;
   %trunc = fptrunc double %a to half
@@ -280,9 +278,9 @@ define float @trunc_sel_smaller_fpext(double %a, i1 %cmp) {
 
 define <2 x float> @trunc_sel_smaller_fpext_vec(<2 x double> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_smaller_fpext_vec(
-; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <2 x double> %a to <2 x half>
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <2 x double> [[A:%.*]] to <2 x half>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fpext <2 x half> [[TRUNC]] to <2 x float>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x float> [[TMP1]], <2 x float> <float 4.200000e+01, float 4.300000e+01>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x float> [[TMP1]], <2 x float> <float 4.200000e+01, float 4.300000e+01>
 ; CHECK-NEXT:    ret <2 x float> [[EXT]]
 ;
   %trunc = fptrunc <2 x double> %a to <2 x half>
@@ -293,9 +291,9 @@ define <2 x float> @trunc_sel_smaller_fpext_vec(<2 x double> %a, <2 x i1> %cmp)
 
 define float @trunc_sel_equal_fpext(float %a, i1 %cmp) {
 ; CHECK-LABEL: @trunc_sel_equal_fpext(
-; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc float %a to half
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc float [[A:%.*]] to half
 ; CHECK-NEXT:    [[TMP1:%.*]] = fpext half [[TRUNC]] to float
-; CHECK-NEXT:    [[EXT:%.*]] = select i1 %cmp, float [[TMP1]], float 4.200000e+01
+; CHECK-NEXT:    [[EXT:%.*]] = select i1 [[CMP:%.*]], float [[TMP1]], float 4.200000e+01
 ; CHECK-NEXT:    ret float [[EXT]]
 ;
   %trunc = fptrunc float %a to half
@@ -306,9 +304,9 @@ define float @trunc_sel_equal_fpext(float %a, i1 %cmp) {
 
 define <2 x float> @trunc_sel_equal_fpext_vec(<2 x float> %a, <2 x i1> %cmp) {
 ; CHECK-LABEL: @trunc_sel_equal_fpext_vec(
-; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <2 x float> %a to <2 x half>
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <2 x float> [[A:%.*]] to <2 x half>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fpext <2 x half> [[TRUNC]] to <2 x float>
-; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> %cmp, <2 x float> [[TMP1]], <2 x float> <float 4.200000e+01, float 4.300000e+01>
+; CHECK-NEXT:    [[EXT:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x float> [[TMP1]], <2 x float> <float 4.200000e+01, float 4.300000e+01>
 ; CHECK-NEXT:    ret <2 x float> [[EXT]]
 ;
   %trunc = fptrunc <2 x float> %a to <2 x half>
@@ -319,8 +317,8 @@ define <2 x float> @trunc_sel_equal_fpext_vec(<2 x float> %a, <2 x i1> %cmp) {
 
 define i32 @test_sext1(i1 %cca, i1 %ccb) {
 ; CHECK-LABEL: @test_sext1(
-; CHECK-NEXT:    [[FOLD_R:%.*]] = and i1 %ccb, %cca
-; CHECK-NEXT:    [[R:%.*]] = sext i1 [[FOLD_R]] to i32
+; CHECK-NEXT:    [[NARROW:%.*]] = and i1 [[CCB:%.*]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext i1 [[NARROW]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %ccax = sext i1 %cca to i32
@@ -330,8 +328,8 @@ define i32 @test_sext1(i1 %cca, i1 %ccb) {
 
 define i32 @test_sext2(i1 %cca, i1 %ccb) {
 ; CHECK-LABEL: @test_sext2(
-; CHECK-NEXT:    [[FOLD_R:%.*]] = or i1 %ccb, %cca
-; CHECK-NEXT:    [[R:%.*]] = sext i1 [[FOLD_R]] to i32
+; CHECK-NEXT:    [[NARROW:%.*]] = or i1 [[CCB:%.*]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext i1 [[NARROW]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %ccax = sext i1 %cca to i32
@@ -341,9 +339,9 @@ define i32 @test_sext2(i1 %cca, i1 %ccb) {
 
 define i32 @test_sext3(i1 %cca, i1 %ccb) {
 ; CHECK-LABEL: @test_sext3(
-; CHECK-NEXT:    [[NOT_CCB:%.*]] = xor i1 %ccb, true
-; CHECK-NEXT:    [[FOLD_R:%.*]] = and i1 [[NOT_CCB]], %cca
-; CHECK-NEXT:    [[R:%.*]] = sext i1 [[FOLD_R]] to i32
+; CHECK-NEXT:    [[NOT_CCB:%.*]] = xor i1 [[CCB:%.*]], true
+; CHECK-NEXT:    [[NARROW:%.*]] = and i1 [[NOT_CCB]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext i1 [[NARROW]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %ccax = sext i1 %cca to i32
@@ -353,9 +351,9 @@ define i32 @test_sext3(i1 %cca, i1 %ccb) {
 
 define i32 @test_sext4(i1 %cca, i1 %ccb) {
 ; CHECK-LABEL: @test_sext4(
-; CHECK-NEXT:    [[NOT_CCB:%.*]] = xor i1 %ccb, true
-; CHECK-NEXT:    [[FOLD_R:%.*]] = or i1 [[NOT_CCB]], %cca
-; CHECK-NEXT:    [[R:%.*]] = sext i1 [[FOLD_R]] to i32
+; CHECK-NEXT:    [[NOT_CCB:%.*]] = xor i1 [[CCB:%.*]], true
+; CHECK-NEXT:    [[NARROW:%.*]] = or i1 [[NOT_CCB]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext i1 [[NARROW]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %ccax = sext i1 %cca to i32
@@ -365,8 +363,8 @@ define i32 @test_sext4(i1 %cca, i1 %ccb) {
 
 define i32 @test_zext1(i1 %cca, i1 %ccb) {
 ; CHECK-LABEL: @test_zext1(
-; CHECK-NEXT:    [[FOLD_R:%.*]] = and i1 %ccb, %cca
-; CHECK-NEXT:    [[R:%.*]] = zext i1 [[FOLD_R]] to i32
+; CHECK-NEXT:    [[NARROW:%.*]] = and i1 [[CCB:%.*]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext i1 [[NARROW]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %ccax = zext i1 %cca to i32
@@ -376,8 +374,8 @@ define i32 @test_zext1(i1 %cca, i1 %ccb) {
 
 define i32 @test_zext2(i1 %cca, i1 %ccb) {
 ; CHECK-LABEL: @test_zext2(
-; CHECK-NEXT:    [[FOLD_R:%.*]] = or i1 %ccb, %cca
-; CHECK-NEXT:    [[R:%.*]] = zext i1 [[FOLD_R]] to i32
+; CHECK-NEXT:    [[NARROW:%.*]] = or i1 [[CCB:%.*]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext i1 [[NARROW]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %ccax = zext i1 %cca to i32
@@ -387,9 +385,9 @@ define i32 @test_zext2(i1 %cca, i1 %ccb) {
 
 define i32 @test_zext3(i1 %cca, i1 %ccb) {
 ; CHECK-LABEL: @test_zext3(
-; CHECK-NEXT:    [[NOT_CCB:%.*]] = xor i1 %ccb, true
-; CHECK-NEXT:    [[FOLD_R:%.*]] = and i1 [[NOT_CCB]], %cca
-; CHECK-NEXT:    [[R:%.*]] = zext i1 [[FOLD_R]] to i32
+; CHECK-NEXT:    [[NOT_CCB:%.*]] = xor i1 [[CCB:%.*]], true
+; CHECK-NEXT:    [[NARROW:%.*]] = and i1 [[NOT_CCB]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext i1 [[NARROW]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %ccax = zext i1 %cca to i32
@@ -399,9 +397,9 @@ define i32 @test_zext3(i1 %cca, i1 %ccb) {
 
 define i32 @test_zext4(i1 %cca, i1 %ccb) {
 ; CHECK-LABEL: @test_zext4(
-; CHECK-NEXT:    [[NOT_CCB:%.*]] = xor i1 %ccb, true
-; CHECK-NEXT:    [[FOLD_R:%.*]] = or i1 [[NOT_CCB]], %cca
-; CHECK-NEXT:    [[R:%.*]] = zext i1 [[FOLD_R]] to i32
+; CHECK-NEXT:    [[NOT_CCB:%.*]] = xor i1 [[CCB:%.*]], true
+; CHECK-NEXT:    [[NARROW:%.*]] = or i1 [[NOT_CCB]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext i1 [[NARROW]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %ccax = zext i1 %cca to i32
@@ -411,8 +409,8 @@ define i32 @test_zext4(i1 %cca, i1 %ccb) {
 
 define i32 @test_negative_sext(i1 %a, i1 %cc) {
 ; CHECK-LABEL: @test_negative_sext(
-; CHECK-NEXT:    [[A_EXT:%.*]] = sext i1 %a to i32
-; CHECK-NEXT:    [[R:%.*]] = select i1 %cc, i32 [[A_EXT]], i32 1
+; CHECK-NEXT:    [[A_EXT:%.*]] = sext i1 [[A:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[CC:%.*]], i32 [[A_EXT]], i32 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %a.ext = sext i1 %a to i32
@@ -422,8 +420,8 @@ define i32 @test_negative_sext(i1 %a, i1 %cc) {
 
 define i32 @test_negative_zext(i1 %a, i1 %cc) {
 ; CHECK-LABEL: @test_negative_zext(
-; CHECK-NEXT:    [[A_EXT:%.*]] = zext i1 %a to i32
-; CHECK-NEXT:    [[R:%.*]] = select i1 %cc, i32 [[A_EXT]], i32 -1
+; CHECK-NEXT:    [[A_EXT:%.*]] = zext i1 [[A:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[CC:%.*]], i32 [[A_EXT]], i32 -1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %a.ext = zext i1 %a to i32
@@ -433,8 +431,8 @@ define i32 @test_negative_zext(i1 %a, i1 %cc) {
 
 define i32 @test_bits_sext(i8 %a, i1 %cc) {
 ; CHECK-LABEL: @test_bits_sext(
-; CHECK-NEXT:    [[A_EXT:%.*]] = sext i8 %a to i32
-; CHECK-NEXT:    [[R:%.*]] = select i1 %cc, i32 [[A_EXT]], i32 -128
+; CHECK-NEXT:    [[A_EXT:%.*]] = sext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[CC:%.*]], i32 [[A_EXT]], i32 -128
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %a.ext = sext i8 %a to i32
@@ -444,8 +442,8 @@ define i32 @test_bits_sext(i8 %a, i1 %cc) {
 
 define i32 @test_bits_zext(i8 %a, i1 %cc) {
 ; CHECK-LABEL: @test_bits_zext(
-; CHECK-NEXT:    [[A_EXT:%.*]] = zext i8 %a to i32
-; CHECK-NEXT:    [[R:%.*]] = select i1 %cc, i32 [[A_EXT]], i32 255
+; CHECK-NEXT:    [[A_EXT:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[CC:%.*]], i32 [[A_EXT]], i32 255
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %a.ext = zext i8 %a to i32
@@ -455,11 +453,11 @@ define i32 @test_bits_zext(i8 %a, i1 %cc) {
 
 define i32 @test_op_op(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: @test_op_op(
-; CHECK-NEXT:    [[CCA:%.*]] = icmp sgt i32 %a, 0
-; CHECK-NEXT:    [[CCB:%.*]] = icmp sgt i32 %b, 0
-; CHECK-NEXT:    [[CCC:%.*]] = icmp sgt i32 %c, 0
+; CHECK-NEXT:    [[CCA:%.*]] = icmp sgt i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CCB:%.*]] = icmp sgt i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[CCC:%.*]] = icmp sgt i32 [[C:%.*]], 0
 ; CHECK-NEXT:    [[R_V:%.*]] = select i1 [[CCC]], i1 [[CCA]], i1 [[CCB]]
-; CHECK-NEXT:    [[R:%.*]] = sext i1 [[R:%.*]].v to i32
+; CHECK-NEXT:    [[R:%.*]] = sext i1 [[R_V]] to i32
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %cca = icmp sgt i32 %a, 0
@@ -473,8 +471,8 @@ define i32 @test_op_op(i32 %a, i32 %b, i32 %c) {
 
 define <2 x i32> @test_vectors_sext(<2 x i1> %cca, <2 x i1> %ccb) {
 ; CHECK-LABEL: @test_vectors_sext(
-; CHECK-NEXT:    [[FOLD_R:%.*]] = and <2 x i1> %ccb, %cca
-; CHECK-NEXT:    [[R:%.*]] = sext <2 x i1> [[FOLD_R]] to <2 x i32>
+; CHECK-NEXT:    [[NARROW:%.*]] = and <2 x i1> [[CCB:%.*]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <2 x i1> [[NARROW]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %ccax = sext <2 x i1> %cca to <2 x i32>
@@ -484,7 +482,7 @@ define <2 x i32> @test_vectors_sext(<2 x i1> %cca, <2 x i1> %ccb) {
 
 define <2 x i32> @test_vectors_sext_nonsplat(<2 x i1> %cca, <2 x i1> %ccb) {
 ; CHECK-LABEL: @test_vectors_sext_nonsplat(
-; CHECK-NEXT:    [[NARROW:%.*]] = select <2 x i1> %ccb, <2 x i1> %cca, <2 x i1> <i1 false, i1 true>
+; CHECK-NEXT:    [[NARROW:%.*]] = select <2 x i1> [[CCB:%.*]], <2 x i1> [[CCA:%.*]], <2 x i1> <i1 false, i1 true>
 ; CHECK-NEXT:    [[R:%.*]] = sext <2 x i1> [[NARROW]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
@@ -495,8 +493,8 @@ define <2 x i32> @test_vectors_sext_nonsplat(<2 x i1> %cca, <2 x i1> %ccb) {
 
 define <2 x i32> @test_vectors_zext(<2 x i1> %cca, <2 x i1> %ccb) {
 ; CHECK-LABEL: @test_vectors_zext(
-; CHECK-NEXT:    [[FOLD_R:%.*]] = and <2 x i1> %ccb, %cca
-; CHECK-NEXT:    [[R:%.*]] = zext <2 x i1> [[FOLD_R]] to <2 x i32>
+; CHECK-NEXT:    [[NARROW:%.*]] = and <2 x i1> [[CCB:%.*]], [[CCA:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext <2 x i1> [[NARROW]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %ccax = zext <2 x i1> %cca to <2 x i32>
@@ -506,7 +504,7 @@ define <2 x i32> @test_vectors_zext(<2 x i1> %cca, <2 x i1> %ccb) {
 
 define <2 x i32> @test_vectors_zext_nonsplat(<2 x i1> %cca, <2 x i1> %ccb) {
 ; CHECK-LABEL: @test_vectors_zext_nonsplat(
-; CHECK-NEXT:    [[NARROW:%.*]] = select <2 x i1> %ccb, <2 x i1> %cca, <2 x i1> <i1 true, i1 false>
+; CHECK-NEXT:    [[NARROW:%.*]] = select <2 x i1> [[CCB:%.*]], <2 x i1> [[CCA:%.*]], <2 x i1> <i1 true, i1 false>
 ; CHECK-NEXT:    [[R:%.*]] = zext <2 x i1> [[NARROW]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
@@ -517,8 +515,8 @@ define <2 x i32> @test_vectors_zext_nonsplat(<2 x i1> %cca, <2 x i1> %ccb) {
 
 define <2 x i32> @scalar_select_of_vectors_sext(<2 x i1> %cca, i1 %ccb) {
 ; CHECK-LABEL: @scalar_select_of_vectors_sext(
-; CHECK-NEXT:    [[FOLD_R:%.*]] = select i1 %ccb, <2 x i1> %cca, <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[R:%.*]] = sext <2 x i1> [[FOLD_R]] to <2 x i32>
+; CHECK-NEXT:    [[NARROW:%.*]] = select i1 [[CCB:%.*]], <2 x i1> [[CCA:%.*]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = sext <2 x i1> [[NARROW]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %ccax = sext <2 x i1> %cca to <2 x i32>
@@ -528,8 +526,8 @@ define <2 x i32> @scalar_select_of_vectors_sext(<2 x i1> %cca, i1 %ccb) {
 
 define <2 x i32> @scalar_select_of_vectors_zext(<2 x i1> %cca, i1 %ccb) {
 ; CHECK-LABEL: @scalar_select_of_vectors_zext(
-; CHECK-NEXT:    [[FOLD_R:%.*]] = select i1 %ccb, <2 x i1> %cca, <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[R:%.*]] = zext <2 x i1> [[FOLD_R]] to <2 x i32>
+; CHECK-NEXT:    [[NARROW:%.*]] = select i1 [[CCB:%.*]], <2 x i1> [[CCA:%.*]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = zext <2 x i1> [[NARROW]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %ccax = zext <2 x i1> %cca to <2 x i32>
@@ -539,7 +537,7 @@ define <2 x i32> @scalar_select_of_vectors_zext(<2 x i1> %cca, i1 %ccb) {
 
 define i32 @sext_true_val_must_be_all_ones(i1 %x) {
 ; CHECK-LABEL: @sext_true_val_must_be_all_ones(
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 %x, i32 -1, i32 42, !prof !0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[X:%.*]], i32 -1, i32 42, !prof !0
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %ext = sext i1 %x to i32
@@ -549,7 +547,7 @@ define i32 @sext_true_val_must_be_all_ones(i1 %x) {
 
 define <2 x i32> @sext_true_val_must_be_all_ones_vec(<2 x i1> %x) {
 ; CHECK-LABEL: @sext_true_val_must_be_all_ones_vec(
-; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> %x, <2 x i32> <i32 -1, i32 -1>, <2 x i32> <i32 42, i32 12>, !prof !0
+; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> [[X:%.*]], <2 x i32> <i32 -1, i32 -1>, <2 x i32> <i32 42, i32 12>, !prof !0
 ; CHECK-NEXT:    ret <2 x i32> [[SEL]]
 ;
   %ext = sext <2 x i1> %x to <2 x i32>
@@ -559,7 +557,7 @@ define <2 x i32> @sext_true_val_must_be_all_ones_vec(<2 x i1> %x) {
 
 define i32 @zext_true_val_must_be_one(i1 %x) {
 ; CHECK-LABEL: @zext_true_val_must_be_one(
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 %x, i32 1, i32 42, !prof !0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[X:%.*]], i32 1, i32 42, !prof !0
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %ext = zext i1 %x to i32
@@ -569,7 +567,7 @@ define i32 @zext_true_val_must_be_one(i1 %x) {
 
 define <2 x i32> @zext_true_val_must_be_one_vec(<2 x i1> %x) {
 ; CHECK-LABEL: @zext_true_val_must_be_one_vec(
-; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> %x, <2 x i32> <i32 1, i32 1>, <2 x i32> <i32 42, i32 12>, !prof !0
+; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> [[X:%.*]], <2 x i32> <i32 1, i32 1>, <2 x i32> <i32 42, i32 12>, !prof !0
 ; CHECK-NEXT:    ret <2 x i32> [[SEL]]
 ;
   %ext = zext <2 x i1> %x to <2 x i32>
@@ -579,7 +577,7 @@ define <2 x i32> @zext_true_val_must_be_one_vec(<2 x i1> %x) {
 
 define i32 @sext_false_val_must_be_zero(i1 %x) {
 ; CHECK-LABEL: @sext_false_val_must_be_zero(
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 %x, i32 42, i32 0, !prof !0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[X:%.*]], i32 42, i32 0, !prof !0
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %ext = sext i1 %x to i32
@@ -589,7 +587,7 @@ define i32 @sext_false_val_must_be_zero(i1 %x) {
 
 define <2 x i32> @sext_false_val_must_be_zero_vec(<2 x i1> %x) {
 ; CHECK-LABEL: @sext_false_val_must_be_zero_vec(
-; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> %x, <2 x i32> <i32 42, i32 12>, <2 x i32> zeroinitializer, !prof !0
+; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> [[X:%.*]], <2 x i32> <i32 42, i32 12>, <2 x i32> zeroinitializer, !prof !0
 ; CHECK-NEXT:    ret <2 x i32> [[SEL]]
 ;
   %ext = sext <2 x i1> %x to <2 x i32>
@@ -599,7 +597,7 @@ define <2 x i32> @sext_false_val_must_be_zero_vec(<2 x i1> %x) {
 
 define i32 @zext_false_val_must_be_zero(i1 %x) {
 ; CHECK-LABEL: @zext_false_val_must_be_zero(
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 %x, i32 42, i32 0, !prof !0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[X:%.*]], i32 42, i32 0, !prof !0
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %ext = zext i1 %x to i32
@@ -609,7 +607,7 @@ define i32 @zext_false_val_must_be_zero(i1 %x) {
 
 define <2 x i32> @zext_false_val_must_be_zero_vec(<2 x i1> %x) {
 ; CHECK-LABEL: @zext_false_val_must_be_zero_vec(
-; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> %x, <2 x i32> <i32 42, i32 12>, <2 x i32> zeroinitializer, !prof !0
+; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> [[X:%.*]], <2 x i32> <i32 42, i32 12>, <2 x i32> zeroinitializer, !prof !0
 ; CHECK-NEXT:    ret <2 x i32> [[SEL]]
 ;
   %ext = zext <2 x i1> %x to <2 x i32>
diff --git a/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll b/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
index 3ac02795b4786857060aa8d0bae032168ba3509e..6ccc6155e4c8661ccf6b725d5e26d8259021f3ab 100644
--- a/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
+++ b/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
 ; This test is to verify that the instruction combiner is able to fold
@@ -6,144 +7,146 @@
 
 define i16 @test1(i16 %x) {
 ; CHECK-LABEL: @test1(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
-; CHECK-NEXT: ret i16 [[VAR]]
-entry:
-  %0 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %ct = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
   %tobool = icmp ne i16 %x, 0
-  %cond = select i1 %tobool, i16 %0, i16 16
+  %cond = select i1 %tobool, i16 %ct, i16 16
   ret i16 %cond
 }
 
 define i32 @test2(i32 %x) {
 ; CHECK-LABEL: @test2(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
-; CHECK-NEXT: ret i32 [[VAR]]
-entry:
-  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %ct = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
   %tobool = icmp ne i32 %x, 0
-  %cond = select i1 %tobool, i32 %0, i32 32
+  %cond = select i1 %tobool, i32 %ct, i32 32
   ret i32 %cond
 }
 
 define i64 @test3(i64 %x) {
 ; CHECK-LABEL: @test3(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
-; CHECK-NEXT: ret i64 [[VAR]]
-entry:
-  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X:%.*]], i1 false), !range !2
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %ct = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
   %tobool = icmp ne i64 %x, 0
-  %cond = select i1 %tobool, i64 %0, i64 64
+  %cond = select i1 %tobool, i64 %ct, i64 64
   ret i64 %cond
 }
 
 define i16 @test4(i16 %x) {
 ; CHECK-LABEL: @test4(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
-; CHECK-NEXT: ret i16 [[VAR]]
-entry:
-  %0 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %ct = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
   %tobool = icmp eq i16 %x, 0
-  %cond = select i1 %tobool, i16 16, i16 %0
+  %cond = select i1 %tobool, i16 16, i16 %ct
   ret i16 %cond
 }
 
 define i32 @test5(i32 %x) {
 ; CHECK-LABEL: @test5(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
-; CHECK-NEXT: ret i32 [[VAR]]
-entry:
-  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %ct = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
   %tobool = icmp eq i32 %x, 0
-  %cond = select i1 %tobool, i32 32, i32 %0
+  %cond = select i1 %tobool, i32 32, i32 %ct
   ret i32 %cond
 }
 
 define i64 @test6(i64 %x) {
 ; CHECK-LABEL: @test6(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
-; CHECK-NEXT: ret i64 [[VAR]]
-entry:
-  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X:%.*]], i1 false), !range !2
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %ct = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
   %tobool = icmp eq i64 %x, 0
-  %cond = select i1 %tobool, i64 64, i64 %0
+  %cond = select i1 %tobool, i64 64, i64 %ct
   ret i64 %cond
 }
 
 define i16 @test1b(i16 %x) {
 ; CHECK-LABEL: @test1b(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
-; CHECK-NEXT: ret i16 [[VAR]]
-entry:
-  %0 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %ct = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
   %tobool = icmp ne i16 %x, 0
-  %cond = select i1 %tobool, i16 %0, i16 16
+  %cond = select i1 %tobool, i16 %ct, i16 16
   ret i16 %cond
 }
 
 define i32 @test2b(i32 %x) {
 ; CHECK-LABEL: @test2b(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-; CHECK-NEXT: ret i32 [[VAR]]
-entry:
-  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %ct = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
   %tobool = icmp ne i32 %x, 0
-  %cond = select i1 %tobool, i32 %0, i32 32
+  %cond = select i1 %tobool, i32 %ct, i32 32
   ret i32 %cond
 }
 
 define i64 @test3b(i64 %x) {
 ; CHECK-LABEL: @test3b(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
-; CHECK-NEXT: ret i64 [[VAR]]
-entry:
-  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X:%.*]], i1 false), !range !2
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %ct = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
   %tobool = icmp ne i64 %x, 0
-  %cond = select i1 %tobool, i64 %0, i64 64
+  %cond = select i1 %tobool, i64 %ct, i64 64
   ret i64 %cond
 }
 
 define i16 @test4b(i16 %x) {
 ; CHECK-LABEL: @test4b(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
-; CHECK-NEXT: ret i16 [[VAR]]
-entry:
-  %0 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %ct = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
   %tobool = icmp eq i16 %x, 0
-  %cond = select i1 %tobool, i16 16, i16 %0
+  %cond = select i1 %tobool, i16 16, i16 %ct
   ret i16 %cond
 }
 
 define i32 @test5b(i32 %x) {
 ; CHECK-LABEL: @test5b(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-; CHECK-NEXT: ret i32 [[VAR]]
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
 entry:
-  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %ct = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
   %tobool = icmp eq i32 %x, 0
-  %cond = select i1 %tobool, i32 32, i32 %0
+  %cond = select i1 %tobool, i32 32, i32 %ct
   ret i32 %cond
 }
 
 define i64 @test6b(i64 %x) {
 ; CHECK-LABEL: @test6b(
-; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
-; CHECK-NEXT: ret i64 [[VAR]]
-entry:
-  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X:%.*]], i1 false), !range !2
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %ct = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
   %tobool = icmp eq i64 %x, 0
-  %cond = select i1 %tobool, i64 64, i64 %0
+  %cond = select i1 %tobool, i64 64, i64 %ct
   ret i64 %cond
 }
 
 define i32 @test1c(i16 %x) {
 ; CHECK-LABEL: @test1c(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i16 [[VAR1]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-entry:
-  %0 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
-  %cast2 = zext i16 %0 to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %ct = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %cast2 = zext i16 %ct to i32
   %tobool = icmp ne i16 %x, 0
   %cond = select i1 %tobool, i32 %cast2, i32 16
   ret i32 %cond
@@ -151,12 +154,12 @@ entry:
 
 define i64 @test2c(i16 %x) {
 ; CHECK-LABEL: @test2c(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i16 [[VAR1]] to i64
-; CHECK-NEXT: ret i64 [[VAR2]]
-entry:
-  %0 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
-  %conv = zext i16 %0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i64
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %ct = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %conv = zext i16 %ct to i64
   %tobool = icmp ne i16 %x, 0
   %cond = select i1 %tobool, i64 %conv, i64 16
   ret i64 %cond
@@ -164,12 +167,12 @@ entry:
 
 define i64 @test3c(i32 %x) {
 ; CHECK-LABEL: @test3c(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i32 [[VAR1]] to i64
-; CHECK-NEXT: ret i64 [[VAR2]]
-entry:
-  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
-  %conv = zext i32 %0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %ct = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %conv = zext i32 %ct to i64
   %tobool = icmp ne i32 %x, 0
   %cond = select i1 %tobool, i64 %conv, i64 32
   ret i64 %cond
@@ -177,12 +180,12 @@ entry:
 
 define i32 @test4c(i16 %x) {
 ; CHECK-LABEL: @test4c(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i16 [[VAR1]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-entry:
-  %0 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
-  %cast = zext i16 %0 to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %ct = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+  %cast = zext i16 %ct to i32
   %tobool = icmp ne i16 %x, 0
   %cond = select i1 %tobool, i32 %cast, i32 16
   ret i32 %cond
@@ -190,12 +193,12 @@ entry:
 
 define i64 @test5c(i16 %x) {
 ; CHECK-LABEL: @test5c(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i16 [[VAR1]] to i64
-; CHECK-NEXT: ret i64 [[VAR2]]
-entry:
-  %0 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
-  %cast = zext i16 %0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range !0
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i64
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %ct = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+  %cast = zext i16 %ct to i64
   %tobool = icmp ne i16 %x, 0
   %cond = select i1 %tobool, i64 %cast, i64 16
   ret i64 %cond
@@ -203,12 +206,12 @@ entry:
 
 define i64 @test6c(i32 %x) {
 ; CHECK-LABEL: @test6c(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i32 [[VAR1]] to i64
-; CHECK-NEXT: ret i64 [[VAR2]]
-entry:
-  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
-  %cast = zext i32 %0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %ct = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %cast = zext i32 %ct to i64
   %tobool = icmp ne i32 %x, 0
   %cond = select i1 %tobool, i64 %cast, i64 32
   ret i64 %cond
@@ -216,12 +219,12 @@ entry:
 
 define i16 @test1d(i64 %x) {
 ; CHECK-LABEL: @test1d(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i64 [[VAR1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-entry:
-  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
-  %conv = trunc i64 %0 to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X:%.*]], i1 false), !range !2
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %ct = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %conv = trunc i64 %ct to i16
   %tobool = icmp ne i64 %x, 0
   %cond = select i1 %tobool, i16 %conv, i16 64
   ret i16 %cond
@@ -229,12 +232,12 @@ entry:
 
 define i32 @test2d(i64 %x) {
 ; CHECK-LABEL: @test2d(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i64 [[VAR1]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-entry:
-  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
-  %cast = trunc i64 %0 to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X:%.*]], i1 false), !range !2
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %ct = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %ct to i32
   %tobool = icmp ne i64 %x, 0
   %cond = select i1 %tobool, i32 %cast, i32 64
   ret i32 %cond
@@ -242,12 +245,12 @@ entry:
 
 define i16 @test3d(i32 %x) {
 ; CHECK-LABEL: @test3d(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i32 [[VAR1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-entry:
-  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
-  %cast = trunc i32 %0 to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %ct = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %cast = trunc i32 %ct to i16
   %tobool = icmp ne i32 %x, 0
   %cond = select i1 %tobool, i16 %cast, i16 32
   ret i16 %cond
@@ -255,12 +258,12 @@ entry:
 
 define i16 @test4d(i64 %x) {
 ; CHECK-LABEL: @test4d(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i64 [[VAR1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-entry:
-  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
-  %cast = trunc i64 %0 to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X:%.*]], i1 false), !range !2
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %ct = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %ct to i16
   %tobool = icmp ne i64 %x, 0
   %cond = select i1 %tobool, i16 %cast, i16 64
   ret i16 %cond
@@ -268,12 +271,12 @@ entry:
 
 define i32 @test5d(i64 %x) {
 ; CHECK-LABEL: @test5d(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i64 [[VAR1]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-entry:
-  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
-  %cast = trunc i64 %0 to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X:%.*]], i1 false), !range !2
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %ct = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %ct to i32
   %tobool = icmp ne i64 %x, 0
   %cond = select i1 %tobool, i32 %cast, i32 64
   ret i32 %cond
@@ -281,12 +284,12 @@ entry:
 
 define i16 @test6d(i32 %x) {
 ; CHECK-LABEL: @test6d(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i32 [[VAR1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-entry:
-  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
-  %cast = trunc i32 %0 to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %ct = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %cast = trunc i32 %ct to i16
   %tobool = icmp ne i32 %x, 0
   %cond = select i1 %tobool, i16 %cast, i16 32
   ret i16 %cond
@@ -294,12 +297,12 @@ entry:
 
 define i64 @select_bug1(i32 %x) {
 ; CHECK-LABEL: @select_bug1(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i32 [[VAR1]] to i64
-; CHECK-NEXT: ret i64 [[VAR2]]
-entry:
-  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-  %conv = zext i32 %0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %ct = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %conv = zext i32 %ct to i64
   %tobool = icmp ne i32 %x, 0
   %cond = select i1 %tobool, i64 %conv, i64 32
   ret i64 %cond
@@ -307,12 +310,12 @@ entry:
 
 define i16 @select_bug2(i32 %x) {
 ; CHECK-LABEL: @select_bug2(
-; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i32 [[VAR1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-entry:
-  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-  %conv = trunc i32 %0 to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %ct = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %conv = trunc i32 %ct to i16
   %tobool = icmp ne i32 %x, 0
   %cond = select i1 %tobool, i16 %conv, i16 32
   ret i16 %cond
@@ -323,9 +326,9 @@ define i128 @test7(i128 %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i128 @llvm.ctlz.i128(i128 [[X:%.*]], i1 false), !range !3
 ; CHECK-NEXT:    ret i128 [[TMP1]]
 ;
-  %1 = tail call i128 @llvm.ctlz.i128(i128 %x, i1 true)
+  %ct = tail call i128 @llvm.ctlz.i128(i128 %x, i1 true)
   %tobool = icmp ne i128 %x, 0
-  %cond = select i1 %tobool, i128 %1, i128 128
+  %cond = select i1 %tobool, i128 %ct, i128 128
   ret i128 %cond
 }
 
@@ -334,12 +337,68 @@ define i128 @test8(i128 %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i128 @llvm.cttz.i128(i128 [[X:%.*]], i1 false), !range !3
 ; CHECK-NEXT:    ret i128 [[TMP1]]
 ;
-  %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true)
+  %ct = tail call i128 @llvm.cttz.i128(i128 %x, i1 true)
   %tobool = icmp ne i128 %x, 0
-  %cond = select i1 %tobool, i128 %1, i128 128
+  %cond = select i1 %tobool, i128 %ct, i128 128
   ret i128 %cond
 }
 
+define i32 @test_ctlz_not_bw(i32 %x) {
+; CHECK-LABEL: @test_ctlz_not_bw(
+; CHECK-NEXT:    [[CT:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 123, i32 [[CT]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %ct = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %cmp = icmp ne i32 %x, 0
+  %res = select i1 %cmp, i32 %ct, i32 123
+  ret i32 %res
+}
+
+define i32 @test_ctlz_not_bw_multiuse(i32 %x) {
+; CHECK-LABEL: @test_ctlz_not_bw_multiuse(
+; CHECK-NEXT:    [[CT:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 123, i32 [[CT]]
+; CHECK-NEXT:    [[RES:%.*]] = or i32 [[SEL]], [[CT]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %ct = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %cmp = icmp ne i32 %x, 0
+  %sel = select i1 %cmp, i32 %ct, i32 123
+  %res = or i32 %sel, %ct
+  ret i32 %res
+}
+
+define i32 @test_cttz_not_bw(i32 %x) {
+; CHECK-LABEL: @test_cttz_not_bw(
+; CHECK-NEXT:    [[CT:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 123, i32 [[CT]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %ct = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %cmp = icmp ne i32 %x, 0
+  %res = select i1 %cmp, i32 %ct, i32 123
+  ret i32 %res
+}
+
+define i32 @test_cttz_not_bw_multiuse(i32 %x) {
+; CHECK-LABEL: @test_cttz_not_bw_multiuse(
+; CHECK-NEXT:    [[CT:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false), !range !1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 123, i32 [[CT]]
+; CHECK-NEXT:    [[RES:%.*]] = or i32 [[SEL]], [[CT]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %ct = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %cmp = icmp ne i32 %x, 0
+  %sel = select i1 %cmp, i32 %ct, i32 123
+  %res = or i32 %sel, %ct
+  ret i32 %res
+}
+
 declare i16 @llvm.ctlz.i16(i16, i1)
 declare i32 @llvm.ctlz.i32(i32, i1)
 declare i64 @llvm.ctlz.i64(i64, i1)
diff --git a/test/Transforms/InstCombine/select-implied.ll b/test/Transforms/InstCombine/select-implied.ll
deleted file mode 100644
index 2558745c18f3c0a94956014ecc4a9797946dce95..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/select-implied.ll
+++ /dev/null
@@ -1,200 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-; A == B implies A >u B is false.
-; CHECK-LABEL: @test1
-; CHECK-NOT: select
-; CHECK: call void @foo(i32 10)
-define void @test1(i32 %a, i32 %b) {
-  %cmp1 = icmp eq i32 %a, %b
-  br i1 %cmp1, label %taken, label %end
-
-taken:
-  %cmp2 = icmp ugt i32 %a, %b
-  %c = select i1 %cmp2, i32 0, i32 10
-  call void @foo(i32 %c)
-  br label %end
-
-end:
-  ret void
-}
-
-; If A == B is false then A != B is true.
-; CHECK-LABEL: @test2
-; CHECK-NOT: select
-; CHECK: call void @foo(i32 20)
-define void @test2(i32 %a, i32 %b) {
-  %cmp1 = icmp eq i32 %a, %b
-  br i1 %cmp1, label %end, label %taken
-
-taken:
-  %cmp2 = icmp ne i32 %a, %b
-  %c = select i1 %cmp2, i32 20, i32 0
-  call void @foo(i32 %c)
-  br label %end
-
-end:
-  ret void
-}
-
-; A >u 10 implies A >u 10 is true.
-; CHECK-LABEL: @test3
-; CHECK-NOT: select
-; CHECK: call void @foo(i32 30)
-define void @test3(i32 %a) {
-  %cmp1 = icmp ugt i32 %a, 10
-  br i1 %cmp1, label %taken, label %end
-
-taken:
-  %cmp2 = icmp ugt i32 %a, 10
-  %c = select i1 %cmp2, i32 30, i32 0
-  call void @foo(i32 %c)
-  br label %end
-
-end:
-  ret void
-}
-
-; CHECK-LABEL: @PR23333
-; CHECK-NOT: select
-; CHECK: ret i8 1
-define i8 @PR23333(i8 addrspace(1)* %ptr) {
-   %cmp = icmp eq i8 addrspace(1)* %ptr, null
-   br i1 %cmp, label %taken, label %end
-
-taken:
-   %cmp2 = icmp ne i8 addrspace(1)* %ptr, null
-   %res = select i1 %cmp2, i8 2, i8 1
-   ret i8 %res
-
-end:
-   ret i8 0
-}
-
-; We know the condition of the select is true based on a dominating condition.
-; Therefore, we can replace %cond with %len. However, now the inner icmp is
-; always false and can be elided.
-; CHECK-LABEL: @test4
-; CHECK-NOT: select
-define void @test4(i32 %len) {
-entry:
-  %0 = call i32 @bar(i32 %len);
-  %cmp = icmp ult i32 %len, 4
-  br i1 %cmp, label %bb, label %b1
-bb:
-  %cond = select i1 %cmp, i32 %len, i32 8
-; CHECK-NOT:  %cmp11 = icmp eq i32 %{{.*}}, 8
-  %cmp11 = icmp eq i32 %cond, 8
-; CHECK: br i1 false, label %b0, label %b1
-  br i1 %cmp11, label %b0, label %b1
-
-b0:
-  call void @foo(i32 %len)
-  br label %b1
-
-b1:
-; CHECK: phi i32 [ %len, %bb ], [ undef, %b0 ], [ %0, %entry ]
-  %1 = phi i32 [ %cond, %bb ], [ undef, %b0 ], [ %0, %entry ]
-  br label %ret
-
-ret:
-  call void @foo(i32 %1)
-  ret void
-}
-
-; A >u 10 implies A >u 9 is true.
-; CHECK-LABEL: @test5
-; CHECK-NOT: select
-; CHECK: call void @foo(i32 30)
-define void @test5(i32 %a) {
-  %cmp1 = icmp ugt i32 %a, 10
-  br i1 %cmp1, label %taken, label %end
-
-taken:
-  %cmp2 = icmp ugt i32 %a, 9
-  %c = select i1 %cmp2, i32 30, i32 0
-  call void @foo(i32 %c)
-  br label %end
-
-end:
-  ret void
-}
-
-declare void @foo(i32)
-declare i32 @bar(i32)
-
-; CHECK-LABEL: @test_and
-; CHECK: tpath:
-; CHECK-NOT: select
-; CHECK: ret i32 313
-define i32 @test_and(i32 %a, i32 %b) {
-entry:
-  %cmp1 = icmp ne i32 %a, 0
-  %cmp2 = icmp ne i32 %b, 0
-  %and = and i1 %cmp1, %cmp2
-  br i1 %and, label %tpath, label %end
-
-tpath:
-  %cmp3 = icmp eq i32 %a, 0 ;; <-- implied false
-  %c = select i1 %cmp3, i32 0, i32 313
-  ret i32 %c
-
-end:
-  ret i32 0
-}
-
-; cmp1 and cmp2 are false on the 'fpath' path and thus cmp3 is true.
-; CHECK-LABEL: @test_or1
-; CHECK: fpath:
-; CHECK-NOT: select
-; CHECK: ret i32 37
-define i32 @test_or1(i32 %a, i32 %b) {
-entry:
-  %cmp1 = icmp eq i32 %a, 0
-  %cmp2 = icmp eq i32 %b, 0
-  %or = or i1 %cmp1, %cmp2
-  br i1 %or, label %end, label %fpath
-
-fpath:
-  %cmp3 = icmp ne i32 %a, 0  ;; <-- implied true
-  %c = select i1 %cmp3, i32 37, i32 0
-  ret i32 %c
-
-end:
-  ret i32 0
-}
-
-; LHS ==> RHS by definition (true -> true)
-; CHECK-LABEL: @test6
-; CHECK: taken:
-; CHECK-NOT: select
-; CHECK: call void @foo(i32 10)
-define void @test6(i32 %a, i32 %b) {
-  %cmp1 = icmp eq i32 %a, %b
-  br i1 %cmp1, label %taken, label %end
-
-taken:
-  %c = select i1 %cmp1, i32 10, i32 0
-  call void @foo(i32 %c)
-  br label %end
-
-end:
-  ret void
-}
-
-; LHS ==> RHS by definition (false -> false)
-; CHECK-LABEL: @test7
-; CHECK: taken:
-; CHECK-NOT: select
-; CHECK: call void @foo(i32 11)
-define void @test7(i32 %a, i32 %b) {
-  %cmp1 = icmp eq i32 %a, %b
-  br i1 %cmp1, label %end, label %taken
-
-taken:
-  %c = select i1 %cmp1, i32 0, i32 11
-  call void @foo(i32 %c)
-  br label %end
-
-end:
-  ret void
-}
diff --git a/test/Transforms/InstCombine/select-pr39595.ll b/test/Transforms/InstCombine/select-pr39595.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0f88d66e4d1d7e5df97c76d1666f777cc0ef77b9
--- /dev/null
+++ b/test/Transforms/InstCombine/select-pr39595.ll
@@ -0,0 +1,18 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i32 @foo(i32 %x, i32 %y) {
+; CHECK-LABEL: foo
+; CHECK:      [[TMP1:%.*]] = icmp ult i32 %y, %x
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 %x, i32 %y, !prof ![[$MD0:[0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], -1
+; CHECK-NEXT: ret i32 [[TMP3:%.*]]
+; CHECK-DAG:  !0 = !{!"branch_weights", i32 6, i32 1}
+
+  %1 = xor i32 %x, -1
+  %2 = xor i32 %y, -1
+  %3 = icmp ugt i32 %1, %2
+  %4 = select i1 %3, i32 %2, i32 %1, !prof !1
+  ret i32 %4
+}
+
+!1 = !{!"branch_weights", i32 1, i32 6}
diff --git a/test/Transforms/InstCombine/sink-into-catchswitch.ll b/test/Transforms/InstCombine/sink-into-catchswitch.ll
index 04a62250fc5d50e0a3447fc2ccb17b2913c141a3..893bf2b16f7ecba6bb35ee0066078dc7b3f4cf26 100644
--- a/test/Transforms/InstCombine/sink-into-catchswitch.ll
+++ b/test/Transforms/InstCombine/sink-into-catchswitch.ll
@@ -1,16 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instcombine -S < %s | FileCheck %s
+
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-windows-msvc18.0.0"
 
 %struct.B = type { i64, i64 }
 
 define void @test1(%struct.B* %p) personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  invoke.cont:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.B* [[P:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    invoke void @throw()
+; CHECK-NEXT:    to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK:       catch.dispatch:
+; CHECK-NEXT:    [[CS:%.*]] = catchswitch within none [label %invoke.cont1] unwind label [[EHCLEANUP:%.*]]
+; CHECK:       invoke.cont1:
+; CHECK-NEXT:    [[CATCH:%.*]] = catchpad within [[CS]] [i8* null, i32 64, i8* null]
+; CHECK-NEXT:    invoke void @throw() [ "funclet"(token [[CATCH]]) ]
+; CHECK-NEXT:    to label [[UNREACHABLE]] unwind label [[EHCLEANUP]]
+; CHECK:       ehcleanup:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[TMP2]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
+; CHECK-NEXT:    [[CLEANUP:%.*]] = cleanuppad within none []
+; CHECK-NEXT:    call void @release(i64 [[PHI]]) [ "funclet"(token [[CLEANUP]]) ]
+; CHECK-NEXT:    cleanupret from [[CLEANUP]] unwind to caller
+; CHECK:       unreachable:
+; CHECK-NEXT:    unreachable
+;
 invoke.cont:
   %0 = bitcast %struct.B* %p to <2 x i64>*
   %1 = load <2 x i64>, <2 x i64>* %0, align 8
   %2 = extractelement <2 x i64> %1, i32 0
   invoke void @throw()
-          to label %unreachable unwind label %catch.dispatch
+  to label %unreachable unwind label %catch.dispatch
 
 catch.dispatch:                                   ; preds = %invoke.cont
   %cs = catchswitch within none [label %invoke.cont1] unwind label %ehcleanup
@@ -18,7 +41,7 @@ catch.dispatch:                                   ; preds = %invoke.cont
 invoke.cont1:                                     ; preds = %catch.dispatch
   %catch = catchpad within %cs [i8* null, i32 64, i8* null]
   invoke void @throw() [ "funclet"(token %catch) ]
-          to label %unreachable unwind label %ehcleanup
+  to label %unreachable unwind label %ehcleanup
 
 ehcleanup:                                        ; preds = %invoke.cont1, %catch.dispatch
   %phi = phi i64 [ %2, %catch.dispatch ], [ 9, %invoke.cont1 ]
@@ -30,16 +53,6 @@ unreachable:                                      ; preds = %invoke.cont1, %invo
   unreachable
 }
 
-; CHECK-LABEL: define void @test1(
-; CHECK: %[[bc:.*]] = bitcast %struct.B* %p to <2 x i64>*
-; CHECK: %[[ld:.*]] = load <2 x i64>, <2 x i64>* %[[bc]], align 8
-; CHECK: %[[ee:.*]] = extractelement <2 x i64> %[[ld]], i32 0
-
-; CHECK: %[[phi:.*]] = phi i64 [ %[[ee]], {{.*}} ], [ 9, {{.*}} ]
-; CHECK: call void @release(i64 %[[phi]])
-
 declare i32 @__CxxFrameHandler3(...)
-
 declare void @throw()
-
 declare void @release(i64)
diff --git a/test/Transforms/InstCombine/store.ll b/test/Transforms/InstCombine/store.ll
index 52ce52768a63894911cde7a54194fd5e6905b637..77b6a276f7c78da728f1f6ba5648e563b8dd700c 100644
--- a/test/Transforms/InstCombine/store.ll
+++ b/test/Transforms/InstCombine/store.ll
@@ -1,40 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 define void @test1(i32* %P) {
-        store i32 undef, i32* %P
-        store i32 123, i32* undef
-        store i32 124, i32* null
-        ret void
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT: store i32 123, i32* undef
-; CHECK-NEXT: store i32 undef, i32* null
-; CHECK-NEXT: ret void
+; CHECK-NEXT:    store i32 123, i32* undef, align 4
+; CHECK-NEXT:    store i32 undef, i32* null, align 536870912
+; CHECK-NEXT:    ret void
+;
+  store i32 undef, i32* %P
+  store i32 123, i32* undef
+  store i32 124, i32* null
+  ret void
 }
 
 define void @test2(i32* %P) {
-        %X = load i32, i32* %P               ; <i32> [#uses=1]
-        %Y = add i32 %X, 0              ; <i32> [#uses=1]
-        store i32 %Y, i32* %P
-        ret void
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT: ret void
+; CHECK-NEXT:    ret void
+;
+  %X = load i32, i32* %P
+  %Y = add i32 %X, 0
+  store i32 %Y, i32* %P
+  ret void
 }
 
 define void @store_at_gep_off_null(i64 %offset) {
-; CHECK-LABEL: @store_at_gep_off_null
-; CHECK: store i32 undef, i32* %ptr
-   %ptr = getelementptr i32, i32 *null, i64 %offset
-   store i32 24, i32* %ptr
-   ret void
+; CHECK-LABEL: @store_at_gep_off_null(
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, i32* null, i64 [[OFFSET:%.*]]
+; CHECK-NEXT:    store i32 undef, i32* [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, i32 *null, i64 %offset
+  store i32 24, i32* %ptr
+  ret void
 }
 
 define void @store_at_gep_off_no_null_opt(i64 %offset) #0 {
-   %ptr = getelementptr i32, i32 *null, i64 %offset
-   store i32 24, i32* %ptr
-   ret void
-; CHECK-LABEL: @store_at_gep_off_no_null_opt(i64 %offset)
-; CHECK-NEXT: %ptr = getelementptr i32, i32* null, i64 %offset
-; CHECK-NEXT: store i32 24, i32* %ptr
+; CHECK-LABEL: @store_at_gep_off_no_null_opt(
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i32, i32* null, i64 [[OFFSET:%.*]]
+; CHECK-NEXT:    store i32 24, i32* [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ptr = getelementptr i32, i32 *null, i64 %offset
+  store i32 24, i32* %ptr
+  ret void
 }
 
 attributes #0 = { "null-pointer-is-valid"="true" }
@@ -43,79 +51,109 @@ attributes #0 = { "null-pointer-is-valid"="true" }
 
 ; "if then else"
 define i32 @test3(i1 %C) {
-	%A = alloca i32
-        br i1 %C, label %Cond, label %Cond2
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[COND:%.*]], label [[COND2:%.*]]
+; CHECK:       Cond:
+; CHECK-NEXT:    br label [[CONT:%.*]]
+; CHECK:       Cond2:
+; CHECK-NEXT:    br label [[CONT]]
+; CHECK:       Cont:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ -987654321, [[COND]] ], [ 47, [[COND2]] ]
+; CHECK-NEXT:    ret i32 [[STOREMERGE]]
+;
+  %A = alloca i32
+  br i1 %C, label %Cond, label %Cond2
 
 Cond:
-        store i32 -987654321, i32* %A
-        br label %Cont
+  store i32 -987654321, i32* %A
+  br label %Cont
 
 Cond2:
-	store i32 47, i32* %A
-	br label %Cont
+  store i32 47, i32* %A
+  br label %Cont
 
 Cont:
-	%V = load i32, i32* %A
-	ret i32 %V
-; CHECK-LABEL: @test3(
-; CHECK-NOT: alloca
-; CHECK: Cont:
-; CHECK-NEXT:  %storemerge = phi i32 [ -987654321, %Cond ], [ 47, %Cond2 ]
-; CHECK-NEXT:  ret i32 %storemerge
+  %V = load i32, i32* %A
+  ret i32 %V
 }
 
 ; "if then"
 define i32 @test4(i1 %C) {
-	%A = alloca i32
-	store i32 47, i32* %A
-        br i1 %C, label %Cond, label %Cont
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[COND:%.*]], label [[CONT:%.*]]
+; CHECK:       Cond:
+; CHECK-NEXT:    br label [[CONT]]
+; CHECK:       Cont:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ -987654321, [[COND]] ], [ 47, [[TMP0:%.*]] ]
+; CHECK-NEXT:    ret i32 [[STOREMERGE]]
+;
+  %A = alloca i32
+  store i32 47, i32* %A
+  br i1 %C, label %Cond, label %Cont
 
 Cond:
-        store i32 -987654321, i32* %A
-        br label %Cont
+  store i32 -987654321, i32* %A
+  br label %Cont
 
 Cont:
-	%V = load i32, i32* %A
-	ret i32 %V
-; CHECK-LABEL: @test4(
-; CHECK-NOT: alloca
-; CHECK: Cont:
-; CHECK-NEXT:  %storemerge = phi i32 [ -987654321, %Cond ], [ 47, %0 ]
-; CHECK-NEXT:  ret i32 %storemerge
+  %V = load i32, i32* %A
+  ret i32 %V
 }
 
 ; "if then"
 define void @test5(i1 %C, i32* %P) {
-	store i32 47, i32* %P, align 1
-        br i1 %C, label %Cond, label %Cont
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[COND:%.*]], label [[CONT:%.*]]
+; CHECK:       Cond:
+; CHECK-NEXT:    br label [[CONT]]
+; CHECK:       Cont:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ -987654321, [[COND]] ], [ 47, [[TMP0:%.*]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], i32* [[P:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+  store i32 47, i32* %P, align 1
+  br i1 %C, label %Cond, label %Cont
 
 Cond:
-        store i32 -987654321, i32* %P, align 1
-        br label %Cont
+  store i32 -987654321, i32* %P, align 1
+  br label %Cont
 
 Cont:
-	ret void
-; CHECK-LABEL: @test5(
-; CHECK: Cont:
-; CHECK-NEXT:  %storemerge = phi i32
-; CHECK-NEXT:  store i32 %storemerge, i32* %P, align 1
-; CHECK-NEXT:  ret void
+  ret void
 }
 
 
 ; PR14753 - merging two stores should preserve the TBAA tag.
 define void @test6(i32 %n, float* %a, i32* %gi) nounwind uwtable ssp {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ 42, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], i32* [[GI:%.*]], align 4, !tbaa !0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[STOREMERGE]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STOREMERGE]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX]], align 4, !tbaa !4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[GI]], align 4, !tbaa !0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[TMP0]], 1
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   store i32 42, i32* %gi, align 4, !tbaa !0
   br label %for.cond
 
-for.cond:                                         ; preds = %for.body, %entry
+for.cond:
   %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %0 = load i32, i32* %gi, align 4, !tbaa !0
   %cmp = icmp slt i32 %0, %n
   br i1 %cmp, label %for.body, label %for.end
 
-for.body:                                         ; preds = %for.cond
+for.body:
   %idxprom = sext i32 %0 to i64
   %arrayidx = getelementptr inbounds float, float* %a, i64 %idxprom
   store float 0.000000e+00, float* %arrayidx, align 4, !tbaa !3
@@ -124,122 +162,130 @@ for.body:                                         ; preds = %for.cond
   store i32 %inc, i32* %gi, align 4, !tbaa !0
   br label %for.cond
 
-for.end:                                          ; preds = %for.cond
+for.end:
   ret void
-; CHECK-LABEL: @test6(
-; CHECK: for.cond:
-; CHECK-NEXT: phi i32 [ 42
-; CHECK-NEXT: store i32 %storemerge, i32* %gi, align 4, !tbaa !0
 }
 
 define void @dse1(i32* %p) {
-; CHECK-LABEL: dse1
-; CHECK-NEXT: store
-; CHECK-NEXT: ret
+; CHECK-LABEL: @dse1(
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
   store i32 0, i32* %p
   store i32 0, i32* %p
   ret void
-} 
+}
 
 ; Slightly subtle: if we're mixing atomic and non-atomic access to the
 ; same location, then the contents of the location are undefined if there's
-; an actual race.  As such, we're free to pick either store under the 
+; an actual race.  As such, we're free to pick either store under the
 ; assumption that we're not racing with any other thread.
 define void @dse2(i32* %p) {
-; CHECK-LABEL: dse2
-; CHECK-NEXT: store i32 0, i32* %p
-; CHECK-NEXT: ret
+; CHECK-LABEL: @dse2(
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
   store atomic i32 0, i32* %p unordered, align 4
   store i32 0, i32* %p
   ret void
-} 
+}
 
 define void @dse3(i32* %p) {
-; CHECK-LABEL: dse3
-; CHECK-NEXT: store atomic i32 0, i32* %p unordered, align 4
-; CHECK-NEXT: ret
+; CHECK-LABEL: @dse3(
+; CHECK-NEXT:    store atomic i32 0, i32* [[P:%.*]] unordered, align 4
+; CHECK-NEXT:    ret void
+;
   store i32 0, i32* %p
   store atomic i32 0, i32* %p unordered, align 4
   ret void
-} 
+}
 
 define void @dse4(i32* %p) {
-; CHECK-LABEL: dse4
-; CHECK-NEXT: store atomic i32 0, i32* %p unordered, align 4
-; CHECK-NEXT: ret
+; CHECK-LABEL: @dse4(
+; CHECK-NEXT:    store atomic i32 0, i32* [[P:%.*]] unordered, align 4
+; CHECK-NEXT:    ret void
+;
   store atomic i32 0, i32* %p unordered, align 4
   store atomic i32 0, i32* %p unordered, align 4
   ret void
-} 
+}
 
 ; Implementation limit - could remove unordered store here, but
 ; currently don't.
 define void @dse5(i32* %p) {
-; CHECK-LABEL: dse5
-; CHECK-NEXT: store
-; CHECK-NEXT: store
-; CHECK-NEXT: ret
+; CHECK-LABEL: @dse5(
+; CHECK-NEXT:    store atomic i32 0, i32* [[P:%.*]] unordered, align 4
+; CHECK-NEXT:    store atomic i32 0, i32* [[P]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
   store atomic i32 0, i32* %p unordered, align 4
   store atomic i32 0, i32* %p seq_cst, align 4
   ret void
 }
 
 define void @write_back1(i32* %p) {
-; CHECK-LABEL: write_back1
-; CHECK-NEXT: ret
+; CHECK-LABEL: @write_back1(
+; CHECK-NEXT:    ret void
+;
   %v = load i32, i32* %p
   store i32 %v, i32* %p
   ret void
-} 
+}
 
 define void @write_back2(i32* %p) {
-; CHECK-LABEL: write_back2
-; CHECK-NEXT: ret
+; CHECK-LABEL: @write_back2(
+; CHECK-NEXT:    ret void
+;
   %v = load atomic i32, i32* %p unordered, align 4
   store i32 %v, i32* %p
   ret void
-} 
+}
 
 define void @write_back3(i32* %p) {
-; CHECK-LABEL: write_back3
-; CHECK-NEXT: ret
+; CHECK-LABEL: @write_back3(
+; CHECK-NEXT:    ret void
+;
   %v = load i32, i32* %p
   store atomic i32 %v, i32* %p unordered, align 4
   ret void
-} 
+}
 
 define void @write_back4(i32* %p) {
-; CHECK-LABEL: write_back4
-; CHECK-NEXT: ret
+; CHECK-LABEL: @write_back4(
+; CHECK-NEXT:    ret void
+;
   %v = load atomic i32, i32* %p unordered, align 4
   store atomic i32 %v, i32* %p unordered, align 4
   ret void
-} 
+}
 
 ; Can't remove store due to ordering side effect
 define void @write_back5(i32* %p) {
-; CHECK-LABEL: write_back5
-; CHECK-NEXT: load
-; CHECK-NEXT: store
-; CHECK-NEXT: ret
+; CHECK-LABEL: @write_back5(
+; CHECK-NEXT:    [[V:%.*]] = load atomic i32, i32* [[P:%.*]] unordered, align 4
+; CHECK-NEXT:    store atomic i32 [[V]], i32* [[P]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
   %v = load atomic i32, i32* %p unordered, align 4
   store atomic i32 %v, i32* %p seq_cst, align 4
   ret void
 }
 
 define void @write_back6(i32* %p) {
-; CHECK-LABEL: write_back6
-; CHECK-NEXT: load
-; CHECK-NEXT: ret
+; CHECK-LABEL: @write_back6(
+; CHECK-NEXT:    [[V:%.*]] = load atomic i32, i32* [[P:%.*]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
   %v = load atomic i32, i32* %p seq_cst, align 4
   store atomic i32 %v, i32* %p unordered, align 4
   ret void
 }
 
 define void @write_back7(i32* %p) {
-; CHECK-LABEL: write_back7
-; CHECK-NEXT: load
-; CHECK-NEXT: ret
+; CHECK-LABEL: @write_back7(
+; CHECK-NEXT:    [[V:%.*]] = load atomic volatile i32, i32* [[P:%.*]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
   %v = load atomic volatile i32, i32* %p seq_cst, align 4
   store atomic i32 %v, i32* %p unordered, align 4
   ret void
diff --git a/test/Transforms/InstCombine/storemerge-dbg.ll b/test/Transforms/InstCombine/storemerge-dbg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dc40dd7f787a7f0426cca23c3bdc85fee6c699c8
--- /dev/null
+++ b/test/Transforms/InstCombine/storemerge-dbg.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -debugify -instcombine -S | FileCheck %s
+
+declare i32 @escape(i32)
+
+; CHECK-LABEL: define {{.*}}@foo(
+define i32 @foo() {
+entry:
+  %baz = alloca i32
+  br i1 undef, label %lhs, label %rhs
+
+lhs:
+  store i32 1, i32* %baz
+  br label %cleanup
+
+rhs:
+  store i32 2, i32* %baz
+  br label %cleanup
+
+cleanup:
+  ; CHECK: %storemerge = phi i32 [ 1, %lhs ], [ 2, %rhs ], !dbg [[merge_loc:![0-9]+]]
+  %baz.val = load i32, i32* %baz
+  %ret.val = call i32 @escape(i32 %baz.val)
+  ret i32 %ret.val
+}
+
+; CHECK: [[merge_loc]] = !DILocation(line: 0
diff --git a/test/Transforms/InstCombine/unrecognized_three-way-comparison.ll b/test/Transforms/InstCombine/unrecognized_three-way-comparison.ll
index 551efa7078a8e229e6eb9264a79fd2d626fa83c2..dcd046e6760267d520199a8d45aba8b86a8bef16 100644
--- a/test/Transforms/InstCombine/unrecognized_three-way-comparison.ll
+++ b/test/Transforms/InstCombine/unrecognized_three-way-comparison.ll
@@ -1,6 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; Various patterns of three-ways comparison that are not currently recognized.
-
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -8,17 +6,12 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 declare void @foo(i32 %x)
 
 define i32 @compare_against_arbitrary_value(i32 %x, i32 %c) {
-; TODO: We can prove that if %x s> %c then %x != c, so there should be no actual
-;       calculations in callfoo block. @foo can be invoked with 1. We only do it
-;       for constants that are not 0 currently while it could be generalized.
 ; CHECK-LABEL: @compare_against_arbitrary_value(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[X:%.*]], [[C:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[CALLFOO:%.*]], label [[EXIT:%.*]]
 ; CHECK:       callfoo:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[X]], [[C]]
-; CHECK-NEXT:    [[SELECT2:%.*]] = zext i1 [[CMP1]] to i32
-; CHECK-NEXT:    call void @foo(i32 [[SELECT2]])
+; CHECK-NEXT:    call void @foo(i32 1)
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 42
@@ -353,17 +346,12 @@ exit:
 }
 
 define i32 @compare_against_arbitrary_value_type_mismatch(i64 %x, i64 %c) {
-; TODO: We can prove that if %x s> %c then %x != c, so there should be no actual
-;       calculations in callfoo block. @foo can be invoked with 1. We only do it
-;       for constants that are not 0 currently while it could be generalized.
 ; CHECK-LABEL: @compare_against_arbitrary_value_type_mismatch(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[X:%.*]], [[C:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[CALLFOO:%.*]], label [[EXIT:%.*]]
 ; CHECK:       callfoo:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[X]], [[C]]
-; CHECK-NEXT:    [[SELECT2:%.*]] = zext i1 [[CMP1]] to i32
-; CHECK-NEXT:    call void @foo(i32 [[SELECT2]])
+; CHECK-NEXT:    call void @foo(i32 1)
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 42
diff --git a/test/Transforms/InstCombine/vec_narrow.ll b/test/Transforms/InstCombine/vec_narrow.ll
deleted file mode 100644
index b4c41f6d297d62ef667c743dc9f24b4824be4dc7..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/vec_narrow.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: opt < %s -instcombine -S | grep "fadd float"
-
-
-define float @test(<4 x float> %A, <4 x float> %B, float %f) {
-        %C = insertelement <4 x float> %A, float %f, i32 0               ; <%V> [#uses=1]
-        %D = fadd <4 x float> %C, %B              ; <%V> [#uses=1]
-        %E = extractelement <4 x float> %D, i32 0                ; <float> [#uses=1]
-        ret float %E
-}
-
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index 7692fe3e05c046880233ecc4efd0b05e59eb7063..b82c8117eebf58bf98df5c1039f3a1503888c53c 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -265,8 +265,8 @@ define <3 x i8> @fold_inselts_with_widening_shuffle(i8 %x, i8 %y) {
 
 define <2 x i8> @test13b(i8 %x) {
 ; CHECK-LABEL: @test13b(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X:%.*]], i32 1
-; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+; CHECK-NEXT:    [[B:%.*]] = insertelement <2 x i8> undef, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    ret <2 x i8> [[B]]
 ;
   %A = insertelement <2 x i8> undef, i8 %x, i32 0
   %B = shufflevector <2 x i8> %A, <2 x i8> undef, <2 x i32> <i32 undef, i32 0>
@@ -605,6 +605,62 @@ define <4 x i8> @widening_shuffle_add_invalid_mask(<2 x i8> %x) {
   ret <4 x i8> %r
 }
 
+; A binop that produces undef in the high lanes can be moved before the shuffle.
+; This is ok because 'shl C, undef --> undef'.
+
+define <4 x i16> @widening_shuffle_shl_constant_op0(<2 x i16> %v) {
+; CHECK-LABEL: @widening_shuffle_shl_constant_op0(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i16> <i16 42, i16 -42>, [[V:%.*]]
+; CHECK-NEXT:    [[BO:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i16> [[BO]]
+;
+  %shuf = shufflevector <2 x i16> %v, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %bo = shl <4 x i16> <i16 42, i16 -42, i16 -1, i16 -1>, %shuf
+  ret <4 x i16> %bo
+}
+
+; A binop that produces undef in the high lanes can be moved before the shuffle.
+; This is ok because 'shl undef, 0 --> undef'.
+
+define <4 x i16> @widening_shuffle_shl_constant_op1(<2 x i16> %v) {
+; CHECK-LABEL: @widening_shuffle_shl_constant_op1(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i16> [[V:%.*]], <i16 2, i16 4>
+; CHECK-NEXT:    [[BO:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i16> [[BO]]
+;
+  %shuf = shufflevector <2 x i16> %v, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %bo = shl <4 x i16> %shuf, <i16 2, i16 4, i16 0, i16 0>
+  ret <4 x i16> %bo
+}
+
+; A binop that does not produce undef in the high lanes can not be moved before the shuffle.
+; This is not ok because 'shl undef, 1 (or 2)' --> 0' but moving the shuffle results in undef instead.
+
+define <4 x i16> @widening_shuffle_shl_constant_op1_non0(<2 x i16> %v) {
+; CHECK-LABEL: @widening_shuffle_shl_constant_op1_non0(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i16> [[V:%.*]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BO:%.*]] = shl <4 x i16> [[SHUF]], <i16 2, i16 4, i16 1, i16 2>
+; CHECK-NEXT:    ret <4 x i16> [[BO]]
+;
+  %shuf = shufflevector <2 x i16> %v, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %bo = shl <4 x i16> %shuf, <i16 2, i16 4, i16 1, i16 2>
+  ret <4 x i16> %bo
+}
+
+; A binop that does not produce undef in the high lanes can not be moved before the shuffle.
+; This is not ok because 'or -1, undef --> -1' but moving the shuffle results in undef instead.
+
+define <4 x i16> @widening_shuffle_or(<2 x i16> %v) {
+; CHECK-LABEL: @widening_shuffle_or(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i16> [[V:%.*]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BO:%.*]] = or <4 x i16> [[SHUF]], <i16 42, i16 -42, i16 -1, i16 -1>
+; CHECK-NEXT:    ret <4 x i16> [[BO]]
+;
+  %shuf = shufflevector <2 x i16> %v, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %bo = or <4 x i16> %shuf, <i16 42, i16 -42, i16 -1, i16 -1>
+  ret <4 x i16> %bo
+}
+
 define <4 x i32> @shuffle_17add2(<4 x i32> %v) {
 ; CHECK-LABEL: @shuffle_17add2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[V:%.*]], <i32 1, i32 1, i32 1, i32 1>
diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll
index e0d6083a969e4c6d3f67b9dd6bec101404ea6660..d2acefc0fbfde350909f8591159b76920c10daa3 100644
--- a/test/Transforms/InstCombine/vector-casts.ll
+++ b/test/Transforms/InstCombine/vector-casts.ll
@@ -163,8 +163,8 @@ define void @convert(<2 x i32>* %dst.addr, <2 x i64> %src) {
 
 define <2 x i65> @foo(<2 x i64> %t) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[T:%.*]], <i64 4294967295, i64 4294967295>
-; CHECK-NEXT:    [[B:%.*]] = zext <2 x i64> [[TMP1]] to <2 x i65>
+; CHECK-NEXT:    [[A_MASK:%.*]] = and <2 x i64> [[T:%.*]], <i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[B:%.*]] = zext <2 x i64> [[A_MASK]] to <2 x i65>
 ; CHECK-NEXT:    ret <2 x i65> [[B]]
 ;
   %a = trunc <2 x i64> %t to <2 x i32>
@@ -174,8 +174,8 @@ define <2 x i65> @foo(<2 x i64> %t) {
 
 define <2 x i64> @bar(<2 x i65> %t) {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT:    [[A:%.*]] = trunc <2 x i65> [[T:%.*]] to <2 x i64>
-; CHECK-NEXT:    [[B:%.*]] = and <2 x i64> [[A]], <i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i65> [[T:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[B:%.*]] = and <2 x i64> [[TMP1]], <i64 4294967295, i64 4294967295>
 ; CHECK-NEXT:    ret <2 x i64> [[B]]
 ;
   %a = trunc <2 x i65> %t to <2 x i32>
@@ -185,9 +185,8 @@ define <2 x i64> @bar(<2 x i65> %t) {
 
 define <2 x i64> @bars(<2 x i65> %t) {
 ; CHECK-LABEL: @bars(
-; CHECK-NEXT:    [[A:%.*]] = trunc <2 x i65> [[T:%.*]] to <2 x i64>
-; CHECK-NEXT:    [[SEXT:%.*]] = shl <2 x i64> [[A]], <i64 32, i64 32>
-; CHECK-NEXT:    [[B:%.*]] = ashr exact <2 x i64> [[SEXT]], <i64 32, i64 32>
+; CHECK-NEXT:    [[A:%.*]] = trunc <2 x i65> [[T:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[B:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[B]]
 ;
   %a = trunc <2 x i65> %t to <2 x i32>
@@ -197,8 +196,8 @@ define <2 x i64> @bars(<2 x i65> %t) {
 
 define <2 x i64> @quxs(<2 x i64> %t) {
 ; CHECK-LABEL: @quxs(
-; CHECK-NEXT:    [[SEXT:%.*]] = shl <2 x i64> [[T:%.*]], <i64 32, i64 32>
-; CHECK-NEXT:    [[B:%.*]] = ashr exact <2 x i64> [[SEXT]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> [[T:%.*]], <i64 32, i64 32>
+; CHECK-NEXT:    [[B:%.*]] = ashr exact <2 x i64> [[TMP1]], <i64 32, i64 32>
 ; CHECK-NEXT:    ret <2 x i64> [[B]]
 ;
   %a = trunc <2 x i64> %t to <2 x i32>
@@ -377,3 +376,38 @@ define <3 x float> @fptrunc_inselt2(<3 x double> %x) {
   ret <3 x float> %trunc
 }
 
+; Converting to a wide type might reduce instruction count,
+; but we can not do that unless the backend can recover from
+; the creation of a potentially illegal op (like a 64-bit vmul).
+; PR40032 - https://bugs.llvm.org/show_bug.cgi?id=40032
+
+define <2 x i64> @sext_less_casting_with_wideop(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @sext_less_casting_with_wideop(
+; CHECK-NEXT:    [[XNARROW:%.*]] = trunc <2 x i64> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[YNARROW:%.*]] = trunc <2 x i64> [[Y:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[XNARROW]], [[YNARROW]]
+; CHECK-NEXT:    [[R:%.*]] = sext <2 x i32> [[MUL]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %xnarrow = trunc <2 x i64> %x to <2 x i32>
+  %ynarrow = trunc <2 x i64> %y to <2 x i32>
+  %mul = mul <2 x i32> %xnarrow, %ynarrow
+  %r = sext <2 x i32> %mul to <2 x i64>
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @zext_less_casting_with_wideop(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @zext_less_casting_with_wideop(
+; CHECK-NEXT:    [[XNARROW:%.*]] = trunc <2 x i64> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[YNARROW:%.*]] = trunc <2 x i64> [[Y:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[XNARROW]], [[YNARROW]]
+; CHECK-NEXT:    [[R:%.*]] = zext <2 x i32> [[MUL]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %xnarrow = trunc <2 x i64> %x to <2 x i32>
+  %ynarrow = trunc <2 x i64> %y to <2 x i32>
+  %mul = mul <2 x i32> %xnarrow, %ynarrow
+  %r = zext <2 x i32> %mul to <2 x i64>
+  ret <2 x i64> %r
+}
+
diff --git a/test/Transforms/InstSimplify/call.ll b/test/Transforms/InstSimplify/call.ll
index 1e581dd4d7c760c756925cdbd89afe325dca5bdc..b3a2f96870e53f4e12527f827174f24a70dcd6a9 100644
--- a/test/Transforms/InstSimplify/call.ll
+++ b/test/Transforms/InstSimplify/call.ll
@@ -500,3 +500,167 @@ define <2 x i8> @fshr_no_shift_modulo_bitwidth_splat(<2 x i8> %x, <2 x i8> %y) {
   ret <2 x i8> %z
 }
 
+; When the shift amount is 0, fshl returns its 1st parameter (x), so the guard is not needed.
+
+define i8 @fshl_zero_shift_guard(i8 %x, i8 %y, i8 %sh) {
+; CHECK-LABEL: @fshl_zero_shift_guard(
+; CHECK-NEXT:    [[F:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[SH:%.*]])
+; CHECK-NEXT:    ret i8 [[F]]
+;
+  %c = icmp eq i8 %sh, 0
+  %f = call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %sh)
+  %s = select i1 %c, i8 %x, i8 %f
+  ret i8 %s
+}
+
+; When the shift amount is 0, fshl returns its 1st parameter (x), so the guard is not needed.
+
+define i8 @fshl_zero_shift_guard_swapped(i8 %x, i8 %y, i8 %sh) {
+; CHECK-LABEL: @fshl_zero_shift_guard_swapped(
+; CHECK-NEXT:    [[F:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[SH:%.*]])
+; CHECK-NEXT:    ret i8 [[F]]
+;
+  %c = icmp ne i8 %sh, 0
+  %f = call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %sh)
+  %s = select i1 %c, i8 %f, i8 %x
+  ret i8 %s
+}
+
+; When the shift amount is 0, fshl returns its 1st parameter (x), so everything is deleted.
+
+define i8 @fshl_zero_shift_guard_inverted(i8 %x, i8 %y, i8 %sh) {
+; CHECK-LABEL: @fshl_zero_shift_guard_inverted(
+; CHECK-NEXT:    ret i8 [[X:%.*]]
+;
+  %c = icmp eq i8 %sh, 0
+  %f = call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %sh)
+  %s = select i1 %c, i8 %f, i8 %x
+  ret i8 %s
+}
+
+; When the shift amount is 0, fshl returns its 1st parameter (x), so everything is deleted.
+
+define i8 @fshl_zero_shift_guard_inverted_swapped(i8 %x, i8 %y, i8 %sh) {
+; CHECK-LABEL: @fshl_zero_shift_guard_inverted_swapped(
+; CHECK-NEXT:    ret i8 [[X:%.*]]
+;
+  %c = icmp ne i8 %sh, 0
+  %f = call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %sh)
+  %s = select i1 %c, i8 %x, i8 %f
+  ret i8 %s
+}
+
+; When the shift amount is 0, fshr returns its 2nd parameter (y), so the guard is not needed.
+
+define i9 @fshr_zero_shift_guard(i9 %x, i9 %y, i9 %sh) {
+; CHECK-LABEL: @fshr_zero_shift_guard(
+; CHECK-NEXT:    [[F:%.*]] = call i9 @llvm.fshr.i9(i9 [[X:%.*]], i9 [[Y:%.*]], i9 [[SH:%.*]])
+; CHECK-NEXT:    ret i9 [[F]]
+;
+  %c = icmp eq i9 %sh, 0
+  %f = call i9 @llvm.fshr.i9(i9 %x, i9 %y, i9 %sh)
+  %s = select i1 %c, i9 %y, i9 %f
+  ret i9 %s
+}
+
+; When the shift amount is 0, fshr returns its 2nd parameter (y), so the guard is not needed.
+
+define i9 @fshr_zero_shift_guard_swapped(i9 %x, i9 %y, i9 %sh) {
+; CHECK-LABEL: @fshr_zero_shift_guard_swapped(
+; CHECK-NEXT:    [[F:%.*]] = call i9 @llvm.fshr.i9(i9 [[X:%.*]], i9 [[Y:%.*]], i9 [[SH:%.*]])
+; CHECK-NEXT:    ret i9 [[F]]
+;
+  %c = icmp ne i9 %sh, 0
+  %f = call i9 @llvm.fshr.i9(i9 %x, i9 %y, i9 %sh)
+  %s = select i1 %c, i9 %f, i9 %y
+  ret i9 %s
+}
+
+; When the shift amount is 0, fshr returns its 2nd parameter (y), so everything is deleted.
+
+define i9 @fshr_zero_shift_guard_inverted(i9 %x, i9 %y, i9 %sh) {
+; CHECK-LABEL: @fshr_zero_shift_guard_inverted(
+; CHECK-NEXT:    ret i9 [[Y:%.*]]
+;
+  %c = icmp eq i9 %sh, 0
+  %f = call i9 @llvm.fshr.i9(i9 %x, i9 %y, i9 %sh)
+  %s = select i1 %c, i9 %f, i9 %y
+  ret i9 %s
+}
+
+; When the shift amount is 0, fshr returns its 2nd parameter (y), so everything is deleted.
+
+define i9 @fshr_zero_shift_guard_inverted_swapped(i9 %x, i9 %y, i9 %sh) {
+; CHECK-LABEL: @fshr_zero_shift_guard_inverted_swapped(
+; CHECK-NEXT:    ret i9 [[Y:%.*]]
+;
+  %c = icmp ne i9 %sh, 0
+  %f = call i9 @llvm.fshr.i9(i9 %x, i9 %y, i9 %sh)
+  %s = select i1 %c, i9 %y, i9 %f
+  ret i9 %s
+}
+
+; Negative test - make sure we're matching the correct parameter of fshl.
+
+define i8 @fshl_zero_shift_guard_wrong_select_op(i8 %x, i8 %y, i8 %sh) {
+; CHECK-LABEL: @fshl_zero_shift_guard_wrong_select_op(
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[SH:%.*]], 0
+; CHECK-NEXT:    [[F:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[SH]])
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i8 [[Y]], i8 [[F]]
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %c = icmp eq i8 %sh, 0
+  %f = call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %sh)
+  %s = select i1 %c, i8 %y, i8 %f
+  ret i8 %s
+}
+
+; Vector types work too.
+
+define <2 x i8> @fshr_zero_shift_guard_splat(<2 x i8> %x, <2 x i8> %y, <2 x i8> %sh) {
+; CHECK-LABEL: @fshr_zero_shift_guard_splat(
+; CHECK-NEXT:    [[F:%.*]] = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i8> [[SH:%.*]])
+; CHECK-NEXT:    ret <2 x i8> [[F]]
+;
+  %c = icmp eq <2 x i8> %sh, zeroinitializer
+  %f = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %sh)
+  %s = select <2 x i1> %c, <2 x i8> %y, <2 x i8> %f
+  ret <2 x i8> %s
+}
+
+; If first two operands of funnel shift are undef, the result is undef
+
+define i8 @fshl_ops_undef(i8 %shamt) {
+; CHECK-LABEL: @fshl_ops_undef(
+; CHECK-NEXT:    ret i8 undef
+;
+  %r = call i8 @llvm.fshl.i8(i8 undef, i8 undef, i8 %shamt)
+  ret i8 %r
+}
+
+define i9 @fshr_ops_undef(i9 %shamt) {
+; CHECK-LABEL: @fshr_ops_undef(
+; CHECK-NEXT:    ret i9 undef
+;
+  %r = call i9 @llvm.fshr.i9(i9 undef, i9 undef, i9 %shamt)
+  ret i9 %r
+}
+
+; If shift amount is undef, treat it as zero, returning operand 0 or 1
+
+define i8 @fshl_shift_undef(i8 %x, i8 %y) {
+; CHECK-LABEL: @fshl_shift_undef(
+; CHECK-NEXT:    ret i8 [[X:%.*]]
+;
+  %r = call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 undef)
+  ret i8 %r
+}
+
+define i9 @fshr_shift_undef(i9 %x, i9 %y) {
+; CHECK-LABEL: @fshr_shift_undef(
+; CHECK-NEXT:    ret i9 [[Y:%.*]]
+;
+  %r = call i9 @llvm.fshr.i9(i9 %x, i9 %y, i9 undef)
+  ret i9 %r
+}
+
diff --git a/test/Transforms/InstSimplify/extract-element.ll b/test/Transforms/InstSimplify/extract-element.ll
index 051478913127bd5b4bc6156a17a3968c7759a70e..ef12a78d1dc0558a1ff96a6cc83cad780bf230e9 100644
--- a/test/Transforms/InstSimplify/extract-element.ll
+++ b/test/Transforms/InstSimplify/extract-element.ll
@@ -39,9 +39,18 @@ define i129 @vec_extract_undef_index(<3 x i129> %a) {
 
 define i129 @vec_extract_in_bounds(<3 x i129> %a) {
 ; CHECK-LABEL: @vec_extract_in_bounds(
-; CHECK-NEXT:    %E1 = extractelement <3 x i129> %a, i129 2
-; CHECK-NEXT:     ret i129 %E1
+; CHECK-NEXT:    [[E1:%.*]] = extractelement <3 x i129> [[A:%.*]], i129 2
+; CHECK-NEXT:    ret i129 [[E1]]
 ;
   %E1 = extractelement <3 x i129> %a, i129 2
   ret i129 %E1
 }
+
+define float @extract_element_splat_constant_vector_variable_index(i32 %y) {
+; CHECK-LABEL: @extract_element_splat_constant_vector_variable_index(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %r = extractelement <4 x float> <float 2.0, float 2.0, float 2.0, float 2.0>, i32 %y
+  ret float %r
+}
+
diff --git a/test/Transforms/InstSimplify/fp-undef.ll b/test/Transforms/InstSimplify/fp-undef.ll
index 58c1825c428aaa4ca57eb64135c211ef3b12b945..f4a71e99a8734e7ddd04ed48af706a2e13bbd554 100644
--- a/test/Transforms/InstSimplify/fp-undef.ll
+++ b/test/Transforms/InstSimplify/fp-undef.ll
@@ -451,3 +451,83 @@ define double @frem_undef_op1_fast_constant_inf(double %x) {
   ret double %r
 }
 
+define <2 x double> @fadd_undef_op1_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @fadd_undef_op1_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF8000000000000, double undef>
+;
+  %r = fadd <2 x double> <double 42.0, double undef>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @fadd_undef_op0_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @fadd_undef_op0_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double undef, double 0x7FF8000000000000>
+;
+  %r = fadd <2 x double> undef, <double undef, double 42.0>
+  ret <2 x double> %r
+}
+
+define <2 x double> @fsub_undef_op1_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @fsub_undef_op1_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double undef, double 0x7FF8000000000000>
+;
+  %r = fsub <2 x double> <double undef, double 42.0>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @fsub_undef_op0_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @fsub_undef_op0_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF8000000000000, double undef>
+;
+  %r = fsub <2 x double> undef, <double 42.0, double undef>
+  ret <2 x double> %r
+}
+
+define <2 x double> @fmul_undef_op1_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @fmul_undef_op1_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF8000000000000, double undef>
+;
+  %r = fmul <2 x double> <double 42.0, double undef>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @fmul_undef_op0_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @fmul_undef_op0_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double undef, double 0x7FF8000000000000>
+;
+  %r = fmul <2 x double> undef, <double undef, double 42.0>
+  ret <2 x double> %r
+}
+
+define <2 x double> @fdiv_undef_op1_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @fdiv_undef_op1_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF8000000000000, double undef>
+;
+  %r = fdiv <2 x double> <double 42.0, double undef>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @fdiv_undef_op0_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @fdiv_undef_op0_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double undef, double 0x7FF8000000000000>
+;
+  %r = fdiv <2 x double> undef, <double undef, double 42.0>
+  ret <2 x double> %r
+}
+
+define <2 x double> @frem_undef_op1_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @frem_undef_op1_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double undef, double 0x7FF8000000000000>
+;
+  %r = frem <2 x double> <double undef, double 42.0>, undef
+  ret <2 x double> %r
+}
+
+define <2 x double> @frem_undef_op0_constant_vec(<2 x double> %x) {
+; CHECK-LABEL: @frem_undef_op0_constant_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF8000000000000, double undef>
+;
+  %r = frem <2 x double> undef, <double 42.0, double undef>
+  ret <2 x double> %r
+}
+
diff --git a/test/Transforms/InstSimplify/saturating-add-sub.ll b/test/Transforms/InstSimplify/saturating-add-sub.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a226cc456ac51a08d7e7ca656478a891b8fefb90
--- /dev/null
+++ b/test/Transforms/InstSimplify/saturating-add-sub.ll
@@ -0,0 +1,666 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+declare i3 @llvm.uadd.sat.i3(i3, i3)
+declare i8 @llvm.uadd.sat.i8(i8, i8)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <2 x i9> @llvm.uadd.sat.v2i9(<2 x i9>, <2 x i9>)
+
+declare i8 @llvm.sadd.sat.i8(i8, i8)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i8 @llvm.usub.sat.i8(i8, i8)
+declare i8 @llvm.ssub.sat.i8(i8, i8)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+define i8 @uadd_scalar_0(i8 %a) {
+; CHECK-LABEL: @uadd_scalar_0(
+; CHECK-NEXT:    ret i8 [[A:%.*]]
+;
+  %x1 = call i8 @llvm.uadd.sat.i8(i8 %a, i8 0)
+  ret i8 %x1
+}
+
+define <2 x i8> @uadd_vector_0(<2 x i8> %a) {
+; CHECK-LABEL: @uadd_vector_0(
+; CHECK-NEXT:    ret <2 x i8> [[A:%.*]]
+;
+  %x1v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer)
+  ret <2 x i8> %x1v
+}
+
+define i3 @uadd_scalar_0_commute(i3 %a) {
+; CHECK-LABEL: @uadd_scalar_0_commute(
+; CHECK-NEXT:    ret i3 [[A:%.*]]
+;
+  %x2 = call i3 @llvm.uadd.sat.i3(i3 0, i3 %a)
+  ret i3 %x2
+}
+
+define <2 x i8> @uadd_vector_0_commute(<2 x i8> %a) {
+; CHECK-LABEL: @uadd_vector_0_commute(
+; CHECK-NEXT:    ret <2 x i8> [[A:%.*]]
+;
+  %x2v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> <i8 0, i8 undef>, <2 x i8> %a)
+  ret <2 x i8> %x2v
+}
+
+define i8 @uadd_scalar_maxval(i8 %a) {
+; CHECK-LABEL: @uadd_scalar_maxval(
+; CHECK-NEXT:    ret i8 -1
+;
+  %x3 = call i8 @llvm.uadd.sat.i8(i8 %a, i8 255)
+  ret i8 %x3
+}
+
+define <2 x i9> @uadd_vector_maxval(<2 x i9> %a) {
+; CHECK-LABEL: @uadd_vector_maxval(
+; CHECK-NEXT:    ret <2 x i9> <i9 -1, i9 -1>
+;
+  %x3v = call <2 x i9> @llvm.uadd.sat.v2i9(<2 x i9> %a, <2 x i9> <i9 511, i9 511>)
+  ret <2 x i9> %x3v
+}
+
+define i3 @uadd_scalar_maxval_commute(i3 %a) {
+; CHECK-LABEL: @uadd_scalar_maxval_commute(
+; CHECK-NEXT:    ret i3 -1
+;
+  %x4 = call i3 @llvm.uadd.sat.i3(i3 7, i3 %a)
+  ret i3 %x4
+}
+
+define <2 x i8> @uadd_vector_maxval_commute(<2 x i8> %a) {
+; CHECK-LABEL: @uadd_vector_maxval_commute(
+; CHECK-NEXT:    ret <2 x i8> <i8 -1, i8 -1>
+;
+  %x4v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> <i8 255, i8 255>, <2 x i8> %a)
+  ret <2 x i8> %x4v
+}
+
+define i8 @uadd_scalar_undef(i8 %a) {
+; CHECK-LABEL: @uadd_scalar_undef(
+; CHECK-NEXT:    ret i8 -1
+;
+  %x5 = call i8 @llvm.uadd.sat.i8(i8 %a, i8 undef)
+  ret i8 %x5
+}
+
+define <2 x i8> @uadd_vector_undef(<2 x i8> %a) {
+; CHECK-LABEL: @uadd_vector_undef(
+; CHECK-NEXT:    ret <2 x i8> <i8 -1, i8 -1>
+;
+  %x5v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 undef, i8 undef>)
+  ret <2 x i8> %x5v
+}
+
+define i8 @uadd_scalar_undef_commute(i8 %a) {
+; CHECK-LABEL: @uadd_scalar_undef_commute(
+; CHECK-NEXT:    ret i8 -1
+;
+  %x6 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 %a)
+  ret i8 %x6
+}
+
+define <2 x i8> @uadd_vector_undef_commute(<2 x i8> %a) {
+; CHECK-LABEL: @uadd_vector_undef_commute(
+; CHECK-NEXT:    ret <2 x i8> <i8 -1, i8 -1>
+;
+  %x5v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> %a)
+  ret <2 x i8> %x5v
+}
+
+define i8 @sadd_scalar_0(i8 %a) {
+; CHECK-LABEL: @sadd_scalar_0(
+; CHECK-NEXT:    ret i8 [[A:%.*]]
+;
+  %y1 = call i8 @llvm.sadd.sat.i8(i8 %a, i8 0)
+  ret i8 %y1
+}
+
+define <2 x i8> @sadd_vector_0(<2 x i8> %a) {
+; CHECK-LABEL: @sadd_vector_0(
+; CHECK-NEXT:    ret <2 x i8> [[A:%.*]]
+;
+  %y1v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 undef, i8 0>)
+  ret <2 x i8> %y1v
+}
+
+define i8 @sadd_scalar_0_commute(i8 %a) {
+; CHECK-LABEL: @sadd_scalar_0_commute(
+; CHECK-NEXT:    ret i8 [[A:%.*]]
+;
+  %y2 = call i8 @llvm.sadd.sat.i8(i8 0, i8 %a)
+  ret i8 %y2
+}
+
+define <2 x i8> @sadd_vector_0_commute(<2 x i8> %a) {
+; CHECK-LABEL: @sadd_vector_0_commute(
+; CHECK-NEXT:    ret <2 x i8> [[A:%.*]]
+;
+  %y2v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> zeroinitializer, <2 x i8> %a)
+  ret <2 x i8> %y2v
+}
+
+define i8 @sadd_scalar_maxval(i8 %a) {
+; CHECK-LABEL: @sadd_scalar_maxval(
+; CHECK-NEXT:    [[Y3:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 127)
+; CHECK-NEXT:    ret i8 [[Y3]]
+;
+  %y3 = call i8 @llvm.sadd.sat.i8(i8 %a, i8 127)
+  ret i8 %y3
+}
+
+define <2 x i8> @sadd_vector_maxval(<2 x i8> %a) {
+; CHECK-LABEL: @sadd_vector_maxval(
+; CHECK-NEXT:    [[Y3V:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 127, i8 127>)
+; CHECK-NEXT:    ret <2 x i8> [[Y3V]]
+;
+  %y3v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 127, i8 127>)
+  ret <2 x i8> %y3v
+}
+
+define i8 @sadd_scalar_maxval_commute(i8 %a) {
+; CHECK-LABEL: @sadd_scalar_maxval_commute(
+; CHECK-NEXT:    [[Y4:%.*]] = call i8 @llvm.sadd.sat.i8(i8 127, i8 [[A:%.*]])
+; CHECK-NEXT:    ret i8 [[Y4]]
+;
+  %y4 = call i8 @llvm.sadd.sat.i8(i8 127, i8 %a)
+  ret i8 %y4
+}
+
+define <2 x i8> @sadd_vector_maxval_commute(<2 x i8> %a) {
+; CHECK-LABEL: @sadd_vector_maxval_commute(
+; CHECK-NEXT:    [[Y4V:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> <i8 undef, i8 127>, <2 x i8> [[A:%.*]])
+; CHECK-NEXT:    ret <2 x i8> [[Y4V]]
+;
+  %y4v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> <i8 undef, i8 127>, <2 x i8> %a)
+  ret <2 x i8> %y4v
+}
+
+define i8 @sadd_scalar_undef(i8 %a) {
+; CHECK-LABEL: @sadd_scalar_undef(
+; CHECK-NEXT:    ret i8 -1
+;
+  %y5 = call i8 @llvm.sadd.sat.i8(i8 %a, i8 undef)
+  ret i8 %y5
+}
+
+define <2 x i8> @sadd_vector_undef(<2 x i8> %a) {
+; CHECK-LABEL: @sadd_vector_undef(
+; CHECK-NEXT:    ret <2 x i8> <i8 -1, i8 -1>
+;
+  %y5v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> undef)
+  ret <2 x i8> %y5v
+}
+
+define i8 @sadd_scalar_undef_commute(i8 %a) {
+; CHECK-LABEL: @sadd_scalar_undef_commute(
+; CHECK-NEXT:    ret i8 -1
+;
+  %y6 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 %a)
+  ret i8 %y6
+}
+
+define <2 x i8> @sadd_vector_undef_commute(<2 x i8> %a) {
+; CHECK-LABEL: @sadd_vector_undef_commute(
+; CHECK-NEXT:    ret <2 x i8> <i8 -1, i8 -1>
+;
+  %y6v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> %a)
+  ret <2 x i8> %y6v
+}
+
+define i8 @usub_scalar_0(i8 %a) {
+; CHECK-LABEL: @usub_scalar_0(
+; CHECK-NEXT:    ret i8 [[A:%.*]]
+;
+  %x1 = call i8 @llvm.usub.sat.i8(i8 %a, i8 0)
+  ret i8 %x1
+}
+
+define <2 x i8> @usub_vector_0(<2 x i8> %a) {
+; CHECK-LABEL: @usub_vector_0(
+; CHECK-NEXT:    ret <2 x i8> [[A:%.*]]
+;
+  %x1v = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 0, i8 0>)
+  ret <2 x i8> %x1v
+}
+
+define i8 @usub_scalar_0_commute(i8 %a) {
+; CHECK-LABEL: @usub_scalar_0_commute(
+; CHECK-NEXT:    ret i8 0
+;
+  %x2 = call i8 @llvm.usub.sat.i8(i8 0, i8 %a)
+  ret i8 %x2
+}
+
+define <2 x i8> @usub_vector_0_commute(<2 x i8> %a) {
+; CHECK-LABEL: @usub_vector_0_commute(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %x2v = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> <i8 0, i8 0>, <2 x i8> %a)
+  ret <2 x i8> %x2v
+}
+
+define i8 @usub_scalar_maxval(i8 %a) {
+; CHECK-LABEL: @usub_scalar_maxval(
+; CHECK-NEXT:    ret i8 0
+;
+  %x3 = call i8 @llvm.usub.sat.i8(i8 %a, i8 255)
+  ret i8 %x3
+}
+
+define <2 x i8> @usub_vector_maxval(<2 x i8> %a) {
+; CHECK-LABEL: @usub_vector_maxval(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %x3v = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 255, i8 255>)
+  ret <2 x i8> %x3v
+}
+
+define i8 @usub_scalar_undef(i8 %a) {
+; CHECK-LABEL: @usub_scalar_undef(
+; CHECK-NEXT:    ret i8 0
+;
+  %x4 = call i8 @llvm.usub.sat.i8(i8 %a, i8 undef)
+  ret i8 %x4
+}
+
+define <2 x i8> @usub_vector_undef(<2 x i8> %a) {
+; CHECK-LABEL: @usub_vector_undef(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %x4v = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 undef, i8 undef>)
+  ret <2 x i8> %x4v
+}
+
+define i8 @usub_scalar_undef_commute(i8 %a) {
+; CHECK-LABEL: @usub_scalar_undef_commute(
+; CHECK-NEXT:    ret i8 0
+;
+  %x5 = call i8 @llvm.usub.sat.i8(i8 undef, i8 %a)
+  ret i8 %x5
+}
+
+define <2 x i8> @usub_vector_undef_commute(<2 x i8> %a) {
+; CHECK-LABEL: @usub_vector_undef_commute(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %x5v = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> <i8 undef, i8 undef>, <2 x i8> %a)
+  ret <2 x i8> %x5v
+}
+
+define i8 @usub_scalar_same(i8 %a) {
+; CHECK-LABEL: @usub_scalar_same(
+; CHECK-NEXT:    ret i8 0
+;
+  %x6 = call i8 @llvm.usub.sat.i8(i8 %a, i8 %a)
+  ret i8 %x6
+}
+
+define <2 x i8> @usub_vector_same(<2 x i8> %a) {
+; CHECK-LABEL: @usub_vector_same(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %x6v = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %a, <2 x i8> %a)
+  ret <2 x i8> %x6v
+}
+
+define i8 @ssub_scalar_0(i8 %a) {
+; CHECK-LABEL: @ssub_scalar_0(
+; CHECK-NEXT:    ret i8 [[A:%.*]]
+;
+  %y1 = call i8 @llvm.ssub.sat.i8(i8 %a, i8 0)
+  ret i8 %y1
+}
+
+define <2 x i8> @ssub_vector_0(<2 x i8> %a) {
+; CHECK-LABEL: @ssub_vector_0(
+; CHECK-NEXT:    ret <2 x i8> [[A:%.*]]
+;
+  %y1v = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 0, i8 0>)
+  ret <2 x i8> %y1v
+}
+
+define i8 @ssub_scalar_0_commute(i8 %a) {
+; CHECK-LABEL: @ssub_scalar_0_commute(
+; CHECK-NEXT:    [[Y2:%.*]] = call i8 @llvm.ssub.sat.i8(i8 0, i8 [[A:%.*]])
+; CHECK-NEXT:    ret i8 [[Y2]]
+;
+  %y2 = call i8 @llvm.ssub.sat.i8(i8 0, i8 %a)
+  ret i8 %y2
+}
+
+define <2 x i8> @ssub_vector_0_commute(<2 x i8> %a) {
+; CHECK-LABEL: @ssub_vector_0_commute(
+; CHECK-NEXT:    [[Y2V:%.*]] = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> zeroinitializer, <2 x i8> [[A:%.*]])
+; CHECK-NEXT:    ret <2 x i8> [[Y2V]]
+;
+  %y2v = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> <i8 0, i8 0>, <2 x i8> %a)
+  ret <2 x i8> %y2v
+}
+
+define i8 @ssub_scalar_maxval(i8 %a) {
+; CHECK-LABEL: @ssub_scalar_maxval(
+; CHECK-NEXT:    [[Y3:%.*]] = call i8 @llvm.ssub.sat.i8(i8 [[A:%.*]], i8 127)
+; CHECK-NEXT:    ret i8 [[Y3]]
+;
+  %y3 = call i8 @llvm.ssub.sat.i8(i8 %a, i8 127)
+  ret i8 %y3
+}
+
+define <2 x i8> @ssub_vector_maxval(<2 x i8> %a) {
+; CHECK-LABEL: @ssub_vector_maxval(
+; CHECK-NEXT:    [[Y3V:%.*]] = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 127, i8 127>)
+; CHECK-NEXT:    ret <2 x i8> [[Y3V]]
+;
+  %y3v = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> <i8 127, i8 127>)
+  ret <2 x i8> %y3v
+}
+
+define i8 @ssub_scalar_undef(i8 %a) {
+; CHECK-LABEL: @ssub_scalar_undef(
+; CHECK-NEXT:    ret i8 0
+;
+  %y4 = call i8 @llvm.ssub.sat.i8(i8 %a, i8 undef)
+  ret i8 %y4
+}
+
+define <2 x i8> @ssub_vector_undef(<2 x i8> %a) {
+; CHECK-LABEL: @ssub_vector_undef(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %y4v = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> undef)
+  ret <2 x i8> %y4v
+}
+
+define i8 @ssub_scalar_undef_commute(i8 %a) {
+; CHECK-LABEL: @ssub_scalar_undef_commute(
+; CHECK-NEXT:    ret i8 0
+;
+  %y5 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 %a)
+  ret i8 %y5
+}
+
+define <2 x i8> @ssub_vector_undef_commute(<2 x i8> %a) {
+; CHECK-LABEL: @ssub_vector_undef_commute(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %y5v = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> <i8 undef, i8 undef>, <2 x i8> %a)
+  ret <2 x i8> %y5v
+}
+
+define i8 @ssub_scalar_same(i8 %a) {
+; CHECK-LABEL: @ssub_scalar_same(
+; CHECK-NEXT:    ret i8 0
+;
+  %y6 = call i8 @llvm.ssub.sat.i8(i8 %a, i8 %a)
+  ret i8 %y6
+}
+
+define <2 x i8> @ssub_vector_same(<2 x i8> %a) {
+; CHECK-LABEL: @ssub_vector_same(
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+;
+  %y6v = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %a, <2 x i8> %a)
+  ret <2 x i8> %y6v
+}
+
+define i1 @uadd_icmp_op0_known(i8 %a) {
+; CHECK-LABEL: @uadd_icmp_op0_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.uadd.sat.i8(i8 10, i8 %a)
+  %c = icmp uge i8 %b, 10
+  ret i1 %c
+}
+
+define i1 @uadd_icmp_op0_unknown(i8 %a) {
+; CHECK-LABEL: @uadd_icmp_op0_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.uadd.sat.i8(i8 10, i8 [[A:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i8 [[B]], 10
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.uadd.sat.i8(i8 10, i8 %a)
+  %c = icmp ugt i8 %b, 10
+  ret i1 %c
+}
+
+define i1 @uadd_icmp_op1_known(i8 %a) {
+; CHECK-LABEL: @uadd_icmp_op1_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.uadd.sat.i8(i8 %a, i8 10)
+  %c = icmp uge i8 %b, 10
+  ret i1 %c
+}
+
+define i1 @uadd_icmp_op1_unknown(i8 %a) {
+; CHECK-LABEL: @uadd_icmp_op1_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[A:%.*]], i8 10)
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i8 [[B]], 10
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.uadd.sat.i8(i8 %a, i8 10)
+  %c = icmp ugt i8 %b, 10
+  ret i1 %c
+}
+
+define i1 @sadd_icmp_op0_pos_known(i8 %a) {
+; CHECK-LABEL: @sadd_icmp_op0_pos_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.sadd.sat.i8(i8 10, i8 %a)
+  %c = icmp sge i8 %b, -118
+  ret i1 %c
+}
+
+define i1 @sadd_icmp_op0_pos_unknown(i8 %a) {
+; CHECK-LABEL: @sadd_icmp_op0_pos_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.sadd.sat.i8(i8 10, i8 [[A:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i8 [[B]], -118
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.sadd.sat.i8(i8 10, i8 %a)
+  %c = icmp sgt i8 %b, -118
+  ret i1 %c
+}
+
+define i1 @sadd_icmp_op0_neg_known(i8 %a) {
+; CHECK-LABEL: @sadd_icmp_op0_neg_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.sadd.sat.i8(i8 -10, i8 %a)
+  %c = icmp sle i8 %b, 117
+  ret i1 %c
+}
+
+define i1 @sadd_icmp_op0_neg_unknown(i8 %a) {
+; CHECK-LABEL: @sadd_icmp_op0_neg_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.sadd.sat.i8(i8 -10, i8 [[A:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[B]], 117
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.sadd.sat.i8(i8 -10, i8 %a)
+  %c = icmp slt i8 %b, 117
+  ret i1 %c
+}
+
+define i1 @sadd_icmp_op1_pos_known(i8 %a) {
+; CHECK-LABEL: @sadd_icmp_op1_pos_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.sadd.sat.i8(i8 %a, i8 10)
+  %c = icmp sge i8 %b, -118
+  ret i1 %c
+}
+
+define i1 @sadd_icmp_op1_pos_unknown(i8 %a) {
+; CHECK-LABEL: @sadd_icmp_op1_pos_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 10)
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i8 [[B]], -118
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.sadd.sat.i8(i8 %a, i8 10)
+  %c = icmp sgt i8 %b, -118
+  ret i1 %c
+}
+
+define i1 @sadd_icmp_op1_neg_known(i8 %a) {
+; CHECK-LABEL: @sadd_icmp_op1_neg_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.sadd.sat.i8(i8 %a, i8 -10)
+  %c = icmp sle i8 %b, 117
+  ret i1 %c
+}
+
+define i1 @sadd_icmp_op1_neg_unknown(i8 %a) {
+; CHECK-LABEL: @sadd_icmp_op1_neg_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[A:%.*]], i8 -10)
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[B]], 117
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.sadd.sat.i8(i8 %a, i8 -10)
+  %c = icmp slt i8 %b, 117
+  ret i1 %c
+}
+
+define i1 @usub_icmp_op0_known(i8 %a) {
+; CHECK-LABEL: @usub_icmp_op0_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.usub.sat.i8(i8 10, i8 %a)
+  %c = icmp ule i8 %b, 10
+  ret i1 %c
+}
+
+define i1 @usub_icmp_op0_unknown(i8 %a) {
+; CHECK-LABEL: @usub_icmp_op0_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.usub.sat.i8(i8 10, i8 [[A:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[B]], 10
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.usub.sat.i8(i8 10, i8 %a)
+  %c = icmp ult i8 %b, 10
+  ret i1 %c
+}
+
+define i1 @usub_icmp_op1_known(i8 %a) {
+; CHECK-LABEL: @usub_icmp_op1_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.usub.sat.i8(i8 %a, i8 10)
+  %c = icmp ule i8 %b, 245
+  ret i1 %c
+}
+
+define i1 @usub_icmp_op1_unknown(i8 %a) {
+; CHECK-LABEL: @usub_icmp_op1_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[A:%.*]], i8 10)
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[B]], -11
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.usub.sat.i8(i8 %a, i8 10)
+  %c = icmp ult i8 %b, 245
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op0_pos_known(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op0_pos_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 10, i8 %a)
+  %c = icmp sge i8 %b, -117
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op0_pos_unknown(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op0_pos_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.ssub.sat.i8(i8 10, i8 [[A:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i8 [[B]], -117
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 10, i8 %a)
+  %c = icmp sgt i8 %b, -117
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op0_neg_known(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op0_neg_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 -10, i8 %a)
+  %c = icmp sle i8 %b, 118
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op0_neg_unknown(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op0_neg_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.ssub.sat.i8(i8 -10, i8 [[A:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[B]], 118
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 -10, i8 %a)
+  %c = icmp slt i8 %b, 118
+  ret i1 %c
+}
+
+; Peculiar case: ssub.sat(0, x) is never signed min.
+define i1 @ssub_icmp_op0_zero(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op0_zero(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 0, i8 %a)
+  %c = icmp ne i8 %b, -128
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op1_pos_known(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op1_pos_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 %a, i8 10)
+  %c = icmp sle i8 %b, 117
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op1_pos_unknown(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op1_pos_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.ssub.sat.i8(i8 [[A:%.*]], i8 10)
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[B]], 117
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 %a, i8 10)
+  %c = icmp slt i8 %b, 117
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op1_neg_known(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op1_neg_known(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 %a, i8 -10)
+  %c = icmp sge i8 %b, -118
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op1_neg_unknown(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op1_neg_unknown(
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.ssub.sat.i8(i8 [[A:%.*]], i8 -10)
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i8 [[B]], -118
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 %a, i8 -10)
+  %c = icmp sgt i8 %b, -118
+  ret i1 %c
+}
+
+define i1 @ssub_icmp_op1_smin(i8 %a) {
+; CHECK-LABEL: @ssub_icmp_op1_smin(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = call i8 @llvm.ssub.sat.i8(i8 %a, i8 -128)
+  %c = icmp sge i8 %b, 0
+  ret i1 %c
+}
diff --git a/test/Transforms/InstSimplify/select-implied.ll b/test/Transforms/InstSimplify/select-implied.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a456e291b51aecfffd972468325d4c3590fc6ff2
--- /dev/null
+++ b/test/Transforms/InstSimplify/select-implied.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; A == B implies A >u B is false.
+
+define void @test1(i32 %a, i32 %b) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @foo(i32 10)
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp eq i32 %a, %b
+  br i1 %cmp1, label %taken, label %end
+
+taken:
+  %cmp2 = icmp ugt i32 %a, %b
+  %c = select i1 %cmp2, i32 0, i32 10
+  call void @foo(i32 %c)
+  br label %end
+
+end:
+  ret void
+}
+
+; If A == B is false then A != B is true.
+
+define void @test2(i32 %a, i32 %b) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[END:%.*]], label [[TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @foo(i32 20)
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp eq i32 %a, %b
+  br i1 %cmp1, label %end, label %taken
+
+taken:
+  %cmp2 = icmp ne i32 %a, %b
+  %c = select i1 %cmp2, i32 20, i32 0
+  call void @foo(i32 %c)
+  br label %end
+
+end:
+  ret void
+}
+
+; A >u 10 implies A >u 10 is true.
+
+define void @test3(i32 %a) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[A:%.*]], 10
+; CHECK-NEXT:    br i1 [[CMP1]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @foo(i32 30)
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp ugt i32 %a, 10
+  br i1 %cmp1, label %taken, label %end
+
+taken:
+  %cmp2 = icmp ugt i32 %a, 10
+  %c = select i1 %cmp2, i32 30, i32 0
+  call void @foo(i32 %c)
+  br label %end
+
+end:
+  ret void
+}
+
+define i8 @PR23333(i8 addrspace(1)* %ptr) {
+; CHECK-LABEL: @PR23333(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 addrspace(1)* [[PTR:%.*]], null
+; CHECK-NEXT:    br i1 [[CMP]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    ret i8 1
+; CHECK:       end:
+; CHECK-NEXT:    ret i8 0
+;
+  %cmp = icmp eq i8 addrspace(1)* %ptr, null
+  br i1 %cmp, label %taken, label %end
+
+taken:
+  %cmp2 = icmp ne i8 addrspace(1)* %ptr, null
+  %res = select i1 %cmp2, i8 2, i8 1
+  ret i8 %res
+
+end:
+  ret i8 0
+}
+
+; We know the condition of the select is true based on a dominating condition.
+; Therefore, we can replace %cond with %len. 
+; TODO: len == 8 is known false in bb. This is handled by other passes, but should it be handled here? 
+
+define void @test4(i32 %len) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @bar(i32 [[LEN:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[LEN]], 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB:%.*]], label [[B1:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp eq i32 [[LEN]], 8
+; CHECK-NEXT:    br i1 [[CMP11]], label [[B0:%.*]], label [[B1]]
+; CHECK:       b0:
+; CHECK-NEXT:    call void @foo(i32 [[LEN]])
+; CHECK-NEXT:    br label [[B1]]
+; CHECK:       b1:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[LEN]], [[BB]] ], [ undef, [[B0]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[RET:%.*]]
+; CHECK:       ret:
+; CHECK-NEXT:    call void @foo(i32 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = call i32 @bar(i32 %len);
+  %cmp = icmp ult i32 %len, 4
+  br i1 %cmp, label %bb, label %b1
+bb:
+  %cond = select i1 %cmp, i32 %len, i32 8
+  %cmp11 = icmp eq i32 %cond, 8
+  br i1 %cmp11, label %b0, label %b1
+
+b0:
+  call void @foo(i32 %len)
+  br label %b1
+
+b1:
+  %1 = phi i32 [ %cond, %bb ], [ undef, %b0 ], [ %0, %entry ]
+  br label %ret
+
+ret:
+  call void @foo(i32 %1)
+  ret void
+}
+
+; A >u 10 implies A >u 9 is true.
+
+define void @test5(i32 %a) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[A:%.*]], 10
+; CHECK-NEXT:    br i1 [[CMP1]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @foo(i32 30)
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp ugt i32 %a, 10
+  br i1 %cmp1, label %taken, label %end
+
+taken:
+  %cmp2 = icmp ugt i32 %a, 9
+  %c = select i1 %cmp2, i32 30, i32 0
+  call void @foo(i32 %c)
+  br label %end
+
+end:
+  ret void
+}
+
+declare void @foo(i32)
+declare i32 @bar(i32)
+
+define i32 @test_and(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    br i1 [[AND]], label [[TPATH:%.*]], label [[END:%.*]]
+; CHECK:       tpath:
+; CHECK-NEXT:    ret i32 313
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %cmp1 = icmp ne i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 0
+  %and = and i1 %cmp1, %cmp2
+  br i1 %and, label %tpath, label %end
+
+tpath:
+  %cmp3 = icmp eq i32 %a, 0 ;; <-- implied false
+  %c = select i1 %cmp3, i32 0, i32 313
+  ret i32 %c
+
+end:
+  ret i32 0
+}
+
+; cmp1 and cmp2 are false on the 'fpath' path and thus cmp3 is true.
+
+define i32 @test_or1(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_or1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    br i1 [[OR]], label [[END:%.*]], label [[FPATH:%.*]]
+; CHECK:       fpath:
+; CHECK-NEXT:    ret i32 37
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %cmp1 = icmp eq i32 %a, 0
+  %cmp2 = icmp eq i32 %b, 0
+  %or = or i1 %cmp1, %cmp2
+  br i1 %or, label %end, label %fpath
+
+fpath:
+  %cmp3 = icmp ne i32 %a, 0  ;; <-- implied true
+  %c = select i1 %cmp3, i32 37, i32 0
+  ret i32 %c
+
+end:
+  ret i32 0
+}
+
+; LHS ==> RHS by definition (true -> true)
+
+define void @test6(i32 %a, i32 %b) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @foo(i32 10)
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp eq i32 %a, %b
+  br i1 %cmp1, label %taken, label %end
+
+taken:
+  %c = select i1 %cmp1, i32 10, i32 0
+  call void @foo(i32 %c)
+  br label %end
+
+end:
+  ret void
+}
+
+; LHS ==> RHS by definition (false -> false)
+
+define void @test7(i32 %a, i32 %b) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[END:%.*]], label [[TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @foo(i32 11)
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp eq i32 %a, %b
+  br i1 %cmp1, label %end, label %taken
+
+taken:
+  %c = select i1 %cmp1, i32 0, i32 11
+  call void @foo(i32 %c)
+  br label %end
+
+end:
+  ret void
+}
+
diff --git a/test/Transforms/InstSimplify/shr-scalar-vector-consistency.ll b/test/Transforms/InstSimplify/shr-scalar-vector-consistency.ll
new file mode 100644
index 0000000000000000000000000000000000000000..90725a153a2f2a07a5121ab37d62acad77caecf7
--- /dev/null
+++ b/test/Transforms/InstSimplify/shr-scalar-vector-consistency.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; This tests checks optimization consistency for scalar and vector code.
+; If m_Zero() is able to match a vector undef, but not a scalar undef,
+; the two cases will simplify differently.
+
+define i32 @test_scalar(i32 %a, i1 %b) {
+; CHECK-LABEL: @test_scalar(
+; CHECK-NEXT:    ret i32 undef
+;
+  %c = sext i1 %b to i32
+  %d = ashr i32 undef, %c
+  ret i32 %d
+}
+
+define <2 x i32> @test_vector(<2 x i32> %a, <2 x i1> %b) {
+; CHECK-LABEL: @test_vector(
+; CHECK-NEXT:    ret <2 x i32> undef
+;
+  %c = sext <2 x i1> %b to <2 x i32>
+  %d = ashr <2 x i32> undef, %c
+  ret <2 x i32> %d
+}
+
diff --git a/test/Transforms/JumpThreading/crash.ll b/test/Transforms/JumpThreading/crash.ll
index 900a7738d3bfbb0ac841de61dfce2216f9c70d83..01bec4b2c2deef512f922eaf40056adaf888f323 100644
--- a/test/Transforms/JumpThreading/crash.ll
+++ b/test/Transforms/JumpThreading/crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -jump-threading -disable-output
+; RUN: opt < %s -jump-threading -S | FileCheck %s
 ; PR2285
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -564,3 +564,63 @@ ur:
 declare i8* @PR14233.f1()
 
 declare i8* @PR14233.f2()
+
+; Make sure the following compiles in a sane amount of time, as opposed
+; to taking exponential time.
+; (CHECK to make sure the condition doesn't get simplified somehow;
+; if it does, the testcase will need to be revised.)
+; CHECK-LABEL: define void @almost_infinite_loop
+; CHECK: %x39 = or i1 %x38, %x38
+; CHECK: br i1 %x39, label %dest1, label %dest2
+define void @almost_infinite_loop(i1 %x0) {
+entry:
+  br label %if.then57.i
+
+if.then57.i:
+  %x1 = or i1 %x0, %x0
+  %x2 = or i1 %x1, %x1
+  %x3 = or i1 %x2, %x2
+  %x4 = or i1 %x3, %x3
+  %x5 = or i1 %x4, %x4
+  %x6 = or i1 %x5, %x5
+  %x7 = or i1 %x6, %x6
+  %x8 = or i1 %x7, %x7
+  %x9 = or i1 %x8, %x8
+  %x10 = or i1 %x9, %x9
+  %x11 = or i1 %x10, %x10
+  %x12 = or i1 %x11, %x11
+  %x13 = or i1 %x12, %x12
+  %x14 = or i1 %x13, %x13
+  %x15 = or i1 %x14, %x14
+  %x16 = or i1 %x15, %x15
+  %x17 = or i1 %x16, %x16
+  %x18 = or i1 %x17, %x17
+  %x19 = or i1 %x18, %x18
+  %x20 = or i1 %x19, %x19
+  %x21 = or i1 %x20, %x20
+  %x22 = or i1 %x21, %x21
+  %x23 = or i1 %x22, %x22
+  %x24 = or i1 %x23, %x23
+  %x25 = or i1 %x24, %x24
+  %x26 = or i1 %x25, %x25
+  %x27 = or i1 %x26, %x26
+  %x28 = or i1 %x27, %x27
+  %x29 = or i1 %x28, %x28
+  %x30 = or i1 %x29, %x29
+  %x31 = or i1 %x30, %x30
+  %x32 = or i1 %x31, %x31
+  %x33 = or i1 %x32, %x32
+  %x34 = or i1 %x33, %x33
+  %x35 = or i1 %x34, %x34
+  %x36 = or i1 %x35, %x35
+  %x37 = or i1 %x36, %x36
+  %x38 = or i1 %x37, %x37
+  %x39 = or i1 %x38, %x38
+  br i1 %x39, label %dest1, label %dest2
+
+dest1:
+  unreachable
+
+dest2:
+  unreachable
+}
diff --git a/test/Transforms/LICM/explicit_guards.ll b/test/Transforms/LICM/explicit_guards.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c9e091359b0a4ac608a7ad6f93ae02a0b23eae70
--- /dev/null
+++ b/test/Transforms/LICM/explicit_guards.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -make-guards-explicit -basicaa -licm < %s        | FileCheck %s
+; RUN: opt -S -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,make-guards-explicit,loop(licm)' < %s | FileCheck %s
+
+; Test interaction between explicit guards and LICM: make sure that we do not
+; hoist explicit conditions while we can hoist invariant loads in presence of
+; explicit guards.
+
+declare void @llvm.experimental.guard(i1,...)
+
+; Make sure that we do not hoist widenable_cond out of loop.
+define void @do_not_hoist_widenable_cond(i1 %cond, i32 %N, i32 %M) {
+; CHECK-LABEL: @do_not_hoist_widenable_cond(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ]
+; CHECK-NEXT:    [[GUARD_COND:%.*]] = icmp slt i32 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[GUARD_COND]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[GUARDED]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"() ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV]], [[M:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %guard_cond = icmp slt i32 %iv, %N
+  call void(i1, ...) @llvm.experimental.guard(i1 %guard_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv, %M
+  %iv.next = add i32 %iv, 1
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @hoist_invariant_load(i1 %cond, i32* %np, i32 %M) {
+; CHECK-LABEL: @hoist_invariant_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[N:%.*]] = load i32, i32* [[NP:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ]
+; CHECK-NEXT:    [[GUARD_COND:%.*]] = icmp slt i32 [[IV]], [[N]]
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[GUARD_COND]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[GUARDED]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"() ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV]], [[M:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %N = load i32, i32* %np
+  %guard_cond = icmp slt i32 %iv, %N
+  call void(i1, ...) @llvm.experimental.guard(i1 %guard_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv, %M
+  %iv.next = add i32 %iv, 1
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/LICM/guards.ll b/test/Transforms/LICM/guards.ll
index b37c418928406f133f1803387d225e5cbe555288..d0c9533823d829cb4dcc5ab8c20abbc6dec2258b 100644
--- a/test/Transforms/LICM/guards.ll
+++ b/test/Transforms/LICM/guards.ll
@@ -85,16 +85,15 @@ loop:
 }
 
 
-; TODO: We can also hoist this guard from mustexec non-header block.
 define void @test4(i1 %c, i32* %p) {
 
 ; CHECK-LABEL: @test4(
 ; CHECK-LABEL: entry:
 ; CHECK:       %a = load i32, i32* %p
 ; CHECK:       %invariant_cond = icmp ne i32 %a, 100
+; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond)
 ; CHECK-LABEL: loop:
 ; CHECK-LABEL: backedge:
-; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond)
 
 entry:
   br label %loop
@@ -197,6 +196,155 @@ exit:
   ret void
 }
 
+; Check that we don't hoist across a store in the header.
+define void @test4c(i1 %c, i32* %p, i8* noalias %s) {
+
+; CHECK-LABEL: @test4c(
+; CHECK-LABEL: entry:
+; CHECK:       %a = load i32, i32* %p
+; CHECK:       %invariant_cond = icmp ne i32 %a, 100
+; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond)
+; CHECK-LABEL: loop:
+; CHECK-LABEL: backedge:
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  store i8 0, i8* %s
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we don't hoist across a store in a conditionally execute block.
+define void @test4d(i1 %c, i32* %p, i8* noalias %s) {
+
+; CHECK-LABEL: @test4d(
+; CHECK-LABEL: entry:
+; CHECK:       %a = load i32, i32* %p
+; CHECK:       %invariant_cond = icmp ne i32 %a, 100
+; CHECK-LABEL: loop:
+; CHECK-LABEL: backedge:
+; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond)
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  store i8 0, i8* %s
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we don't hoist across a store before the guard in the backedge.
+define void @test4e(i1 %c, i32* %p, i8* noalias %s) {
+
+; CHECK-LABEL: @test4e(
+; CHECK-LABEL: entry:
+; CHECK:       %a = load i32, i32* %p
+; CHECK:       %invariant_cond = icmp ne i32 %a, 100
+; CHECK:       store i8 0, i8* %s
+; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond)
+; CHECK-LABEL: loop:
+; CHECK-LABEL: backedge:
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  store i8 0, i8* %s
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can hoist the guard in spite of store which happens after.
+define void @test4f(i1 %c, i32* %p, i8* noalias %s) {
+
+; CHECK-LABEL: @test4f(
+; CHECK-LABEL: entry:
+; CHECK:       %a = load i32, i32* %p
+; CHECK:       %invariant_cond = icmp ne i32 %a, 100
+; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond)
+; CHECK-LABEL: loop:
+; CHECK-LABEL: backedge:
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  store i8 0, i8* %s
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
 ; Do not hoist an invariant guard across a variant guard.
 define void @test5(i1 %c, i32* %p, i32* %q) {
 
diff --git a/test/Transforms/LICM/hoist-phi.ll b/test/Transforms/LICM/hoist-phi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..26d0c0114f6be1be3fda63dce451bf31e55b79d0
--- /dev/null
+++ b/test/Transforms/LICM/hoist-phi.ll
@@ -0,0 +1,1351 @@
+; RUN: opt -S -licm < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+; RUN: opt -S -licm -licm-control-flow-hoisting=1 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-ENABLED
+; RUN: opt -S -licm -licm-control-flow-hoisting=0 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+; RUN: opt -passes='require<opt-remark-emit>,loop(licm)' -S < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+; RUN: opt -passes='require<opt-remark-emit>,loop(licm)' -licm-control-flow-hoisting=1 -S < %s | FileCheck %s -check-prefixes=CHECK,CHECK-ENABLED
+; RUN: opt -passes='require<opt-remark-emit>,loop(licm)' -licm-control-flow-hoisting=0 -S < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+
+; CHECK-LABEL: @triangle_phi
+define void @triangle_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK: %add = add i32 %x, 1
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: phi i32 [ %add, %[[IF_LICM]] ], [ %x, %entry ]
+; CHECK-ENABLED: store i32 %phi, i32* %p
+; CHECK-ENABLED: %cmp2 = icmp ne i32 %phi, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %add = add i32 %x, 1
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %x, %loop ]
+; CHECK-DISABLED: %cmp2 = icmp ne i32 %phi, 0
+then:
+  %phi = phi i32 [ %add, %if ], [ %x, %loop ]
+  store i32 %phi, i32* %p
+  %cmp2 = icmp ne i32 %phi, 0
+  br i1 %cmp2, label %loop, label %end
+
+; CHECK-LABEL: end:
+; CHECK-DISABLED: %[[PHI_LCSSA:.*]] = phi i32 [ %phi, %then ]
+; CHECK-DISABLED: store i32 %[[PHI_LCSSA]], i32* %p
+end:
+  ret void
+}
+
+; CHECK-LABEL: @diamond_phi
+define void @diamond_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]
+; CHECK-ENABLED: %phi = phi i32 [ %add, %[[IF_LICM]] ], [ %sub, %[[ELSE_LICM]] ]
+; CHECK-ENABLED: store i32 %phi, i32* %p
+; CHECK-ENABLED: %cmp2 = icmp ne i32 %phi, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  br label %then
+
+else:
+  %sub = sub i32 %x, 1
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+; CHECK-DISABLED: %cmp2 = icmp ne i32 %phi, 0
+then:
+  %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+  store i32 %phi, i32* %p
+  %cmp2 = icmp ne i32 %phi, 0
+  br i1 %cmp2, label %loop, label %end
+
+; CHECK-LABEL: end:
+; CHECK-DISABLED: %[[PHI_LCSSA:.*]] = phi i32 [ %phi, %then ]
+; CHECK-DISABLED: store i32 %[[PHI_LCSSA]], i32* %p
+end:
+  ret void
+}
+
+; TODO: This is currently too complicated for us to be able to hoist the phi.
+; CHECK-LABEL: @three_way_phi
+define void @three_way_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp2 = icmp sgt i32 %add, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK: %sub = sub i32 %x, 1
+; CHECK: br label %loop
+
+entry:
+  br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 %add, 0
+  br i1 %cmp2, label %if.if, label %then
+
+if.if:
+  %sub = sub i32 %x, 1
+  br label %then
+
+then:
+  %phi = phi i32 [ 0, %loop ], [ %add, %if ], [ %sub, %if.if ]
+  store i32 %phi, i32* %p
+  %cmp3 = icmp ne i32 %phi, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; TODO: This is currently too complicated for us to be able to hoist the phi.
+; CHECK-LABEL: @tree_phi
+define void @tree_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp2 = icmp sgt i32 %add, 0
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK: br label %loop
+
+entry:
+  br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 %add, 0
+  br i1 %cmp2, label %if.if, label %if.else
+
+if.if:
+  br label %then
+
+if.else:
+  br label %then
+
+else:
+  %sub = sub i32 %x, 1
+  br label %then
+
+then:
+  %phi = phi i32 [ %add, %if.if ], [ 0, %if.else ], [ %sub, %else ]
+  store i32 %phi, i32* %p
+  %cmp3 = icmp ne i32 %phi, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; TODO: We can hoist the first phi, but not the second.
+; CHECK-LABEL: @phi_phi
+define void @phi_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp2 = icmp sgt i32 %add, 0
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK-ENABLED: br i1 %cmp2, label %[[IF_IF_LICM:.*]], label %[[IF_ELSE_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_IF_LICM]]:
+; CHECK-ENABLED: br label %[[IF_THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[IF_THEN_LICM]]
+
+; CHECK-ENABLED: [[IF_THEN_LICM]]:
+; CHECK-ENABLED: %phi1 = phi i32 [ %add, %[[IF_IF_LICM]] ], [ 0, %[[IF_ELSE_LICM]] ]
+; CHECK: br label %loop
+
+entry:
+  br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 %add, 0
+  br i1 %cmp2, label %if.if, label %if.else
+
+if.if:
+  br label %if.then
+
+if.else:
+  br label %if.then
+
+; CHECK-LABEL: if.then:
+; CHECK-DISABLED: %phi1 = phi i32 [ %add, %if.if ], [ 0, %if.else ]
+if.then:
+  %phi1 = phi i32 [ %add, %if.if ], [ 0, %if.else ]
+  br label %then
+
+else:
+  %sub = sub i32 %x, 1
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK: %phi2 = phi i32 [ %phi1, %if.then ], [ %sub, %else ]
+then:
+  %phi2 = phi i32 [ %phi1, %if.then ], [ %sub, %else ]
+  store i32 %phi2, i32* %p
+  %cmp3 = icmp ne i32 %phi2, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; Check that we correctly duplicate empty control flow.
+; CHECK-LABEL: @empty_triangle_phi
+define i8 @empty_triangle_phi(i32 %x, i32 %y) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp eq i32 %x, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %entry ]
+; CHECK: %cmp2 = icmp eq i32 %y, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp eq i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %loop ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %loop ]
+  %cmp2 = icmp eq i32 %y, 0
+  br i1 %cmp2, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; CHECK-LABEL: @empty_diamond_phi
+define i8 @empty_diamond_phi(i32 %x, i32 %y) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp eq i32 %x, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %[[ELSE_LICM]] ]
+; CHECK: %cmp2 = icmp eq i32 %y, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp eq i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  br label %then
+
+else:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %else ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %else ]
+  %cmp2 = icmp eq i32 %y, 0
+  br i1 %cmp2, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; Check that we correctly handle the case that the first thing we try to hoist is a phi.
+; CHECK-LABEL: @empty_triangle_phi_first
+define i8 @empty_triangle_phi_first(i32 %x, i1 %cond) {
+; CHECK-LABEL: entry:
+; CHECK-ENABLED: br i1 %cond, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %entry ]
+; CHECK: %cmp = icmp eq i32 %x, 0
+; CHECK: br label %loop
+
+loop:
+  br i1 %cond, label %if, label %then
+
+if:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %loop ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %loop ]
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; CHECK-LABEL: @empty_diamond_phi
+define i8 @empty_diamond_phi_first(i32 %x, i1 %cond) {
+; CHECK-LABEL: entry:
+; CHECK-ENABLED: br i1 %cond, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %[[ELSE_LICM]] ]
+; CHECK: %cmp = icmp eq i32 %x, 0
+; CHECK: br label %loop
+
+loop:
+  br i1 %cond, label %if, label %else
+
+if:
+  br label %then
+
+else:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %else ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %else ]
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; CHECK-LABEL: @empty_triangle_phi_first
+define i8 @empty_triangle_phi_first_empty_loop_head(i32 %x, i1 %cond) {
+; CHECK-LABEL: entry:
+; CHECK-ENABLED: br i1 %cond, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %entry ]
+; CHECK: %cmp = icmp eq i32 %x, 0
+; CHECK: br label %loop
+
+loop:
+  br label %test
+
+test:
+  br i1 %cond, label %if, label %then
+
+if:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %test ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %test ]
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; CHECK-LABEL: @empty_diamond_phi_first_empty_loop_head
+define i8 @empty_diamond_phi_first_empty_loop_head(i32 %x, i1 %cond) {
+; CHECK-LABEL: entry:
+; CHECK-ENABLED: br i1 %cond, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %[[ELSE_LICM]] ]
+; CHECK: %cmp = icmp eq i32 %x, 0
+; CHECK: br label %loop
+
+loop:
+  br label %test
+
+test:
+  br i1 %cond, label %if, label %else
+
+if:
+  br label %then
+
+else:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %else ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %else ]
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; The phi is on one branch of a diamond while simultaneously at the end of a
+; triangle. Check that we duplicate the triangle and not the diamond.
+; CHECK-LABEL: @triangle_diamond
+define void @triangle_diamond(i32* %ptr, i32 %x, i32 %y) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp ne i32 %y, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i32 [ 0, %[[IF_LICM]] ], [ 127, %entry ]
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp ne i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %cmp2 = icmp ne i32 %y, 0
+  br i1 %cmp2, label %if.then, label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ 0, %if ], [ 127, %loop ]
+then:
+  %phi = phi i32 [ 0, %if ], [ 127, %loop ]
+  store i32 %phi, i32* %ptr
+  br label %end
+
+if.then:
+  br label %end
+
+end:
+  br label %loop
+}
+
+; As the previous, but the end of the diamond is the head of the loop.
+; CHECK-LABEL: @triangle_diamond_backedge
+define void @triangle_diamond_backedge(i32* %ptr, i32 %x, i32 %y) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp ne i32 %y, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i32 [ 0, %[[IF_LICM]] ], [ 127, %entry ]
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp ne i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %cmp2 = icmp ne i32 %y, 0
+  br i1 %cmp2, label %backedge, label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ 0, %if ], [ 127, %loop ]
+then:
+  %phi = phi i32 [ 0, %if ], [ 127, %loop ]
+  store i32 %phi, i32* %ptr
+  br label %loop
+
+backedge:
+  br label %loop
+}
+
+; TODO: The inner diamonds can be hoisted, but not currently the outer diamond
+; CHECK-LABEL: @diamonds_inside_diamond
+define void @diamonds_inside_diamond(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %cmp3 = icmp slt i32 %x, -10
+; CHECK-ENABLED: br i1 %cmp3, label %[[ELSE_IF_LICM:.*]], label %[[ELSE_ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[ELSE_IF_LICM]]:
+; CHECK-ENABLED: br label %[[ELSE_THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[ELSE_THEN_LICM]]
+
+; CHECK-ENABLED: [[ELSE_THEN_LICM]]:
+; CHECK-ENABLED: %phi2 = phi i32 [ 2, %[[ELSE_IF_LICM]] ], [ 3, %[[ELSE_ELSE_LICM]] ]
+; CHECK: %cmp2 = icmp sgt i32 %x, 10
+; CHECK-ENABLED: br i1 %cmp2, label %[[IF_IF_LICM:.*]], label %[[IF_ELSE_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_IF_LICM]]:
+; CHECK-ENABLED: br label %[[IF_THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[IF_THEN_LICM]]
+
+; CHECK-ENABLED: [[IF_THEN_LICM]]:
+; CHECK-ENABLED: %phi1 = phi i32 [ 0, %[[IF_IF_LICM]] ], [ 1, %[[IF_ELSE_LICM]] ]
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %cmp2 = icmp sgt i32 %x, 10
+  br i1 %cmp2, label %if.if, label %if.else
+
+if.if:
+  br label %if.then
+
+if.else:
+  br label %if.then
+
+; CHECK-LABEL: if.then:
+; CHECK-DISABLED: %phi1 = phi i32 [ 0, %if.if ], [ 1, %if.else ]
+if.then:
+  %phi1 = phi i32 [ 0, %if.if ], [ 1, %if.else ]
+  br label %then
+
+else:
+  %cmp3 = icmp slt i32 %x, -10
+  br i1 %cmp3, label %else.if, label %else.else
+
+else.if:
+  br label %else.then
+
+else.else:
+  br label %else.then
+
+; CHECK-LABEL: else.then:
+; CHECK-DISABLED: %phi2 = phi i32 [ 2, %else.if ], [ 3, %else.else ]
+else.then:
+  %phi2 = phi i32 [ 2, %else.if ], [ 3, %else.else ]
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK: %phi3 = phi i32 [ %phi1, %if.then ], [ %phi2, %else.then ]
+; CHECK: %cmp4 = icmp ne i32 %phi3, 0
+then:
+  %phi3 = phi i32 [ %phi1, %if.then ], [ %phi2, %else.then ]
+  store i32 %phi3, i32* %p
+  %cmp4 = icmp ne i32 %phi3, 0
+  br i1 %cmp4, label %loop, label %end
+
+end:
+  ret void
+}
+
+; We can hoist blocks that contain an edge that exits the loop by ignoring that
+; edge in the hoisted block.
+; CHECK-LABEL: @triangle_phi_loopexit
+define void @triangle_phi_loopexit(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp sgt i32 10, %add
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i32 [ %add, %[[IF_LICM]] ], [ %x, %entry ]
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 10, %add
+  br i1 %cmp2, label %then, label %end
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %x, %loop ]
+then:
+  %phi = phi i32 [ %add, %if ], [ %x, %loop ]
+  store i32 %phi, i32* %p
+  %cmp3 = icmp ne i32 %phi, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; CHECK-LABEL: @diamond_phi_oneloopexit
+define void @diamond_phi_oneloopexit(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp sgt i32 10, %add
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]
+; CHECK-ENABLED: %phi = phi i32 [ %add, %[[IF_LICM]] ], [ %sub, %[[ELSE_LICM]] ]
+; CHECK-ENABLED: %cmp3 = icmp ne i32 %phi, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 10, %add
+  br i1 %cmp2, label %then, label %end
+
+else:
+  %sub = sub i32 %x, 1
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+then:
+  %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+  store i32 %phi, i32* %p
+  %cmp3 = icmp ne i32 %phi, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; CHECK-LABEL: @diamond_phi_twoloopexit
+define void @diamond_phi_twoloopexit(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp sgt i32 10, %add
+; CHECK-DAG: %cmp3 = icmp sgt i32 10, %sub
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]
+; CHECK-ENABLED: %phi = phi i32 [ %add, %[[IF_LICM]] ], [ %sub, %[[ELSE_LICM]] ]
+; CHECK-ENABLED: %cmp4 = icmp ne i32 %phi, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 10, %add
+  br i1 %cmp2, label %then, label %end
+
+else:
+  %sub = sub i32 %x, 1
+  %cmp3 = icmp sgt i32 10, %sub
+  br i1 %cmp3, label %then, label %end
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+; CHECK-DISABLED: %cmp4 = icmp ne i32 %phi, 0
+then:
+  %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+  store i32 %phi, i32* %p
+  %cmp4 = icmp ne i32 %phi, 0
+  br i1 %cmp4, label %loop, label %end
+
+end:
+  ret void
+}
+
+; The store cannot be hoisted, so add and shr cannot be hoisted into a
+; conditional block.
+; CHECK-LABEL: @conditional_use
+define void @conditional_use(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cond = icmp ugt i32 %x, 0
+; CHECK-DAG: %add = add i32 %x, 5
+; CHECK-DAG: %shr = ashr i32 %add, 1
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  %cond = icmp ugt i32 %x, 0
+  br i1 %cond, label %if, label %else
+
+; CHECK-LABEL: if:
+; CHECK: store i32 %shr, i32* %p, align 4
+if:
+  %add = add i32 %x, 5
+  %shr = ashr i32 %add, 1
+  store i32 %shr, i32* %p, align 4
+  br label %then
+
+else:
+  br label %then
+
+then:
+  br label %loop
+}
+
+; A diamond with two triangles on the left and one on the right. This test is
+; to check that we have a unique loop preheader when we hoist the store (and so
+; don't fail an assertion).
+; CHECK-LABEL: @triangles_in_diamond
+define void @triangles_in_diamond(i32* %ptr) {
+; CHECK-LABEL: entry:
+; CHECK: store i32 0, i32* %ptr, align 4
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  br i1 undef, label %left_triangle_1, label %right_triangle
+
+left_triangle_1:
+  br i1 undef, label %left_triangle_1_if, label %left_triangle_2
+
+left_triangle_1_if:
+  br label %left_triangle_2
+
+left_triangle_2:
+  br i1 undef, label %left_triangle_2_if, label %left_triangle_2_then
+
+left_triangle_2_if:
+  br label %left_triangle_2_then
+
+left_triangle_2_then:
+  br label %loop.end
+
+right_triangle:
+  br i1 undef, label %right_triangle.if, label %right_triangle.then
+
+right_triangle.if:
+  br label %right_triangle.then
+
+right_triangle.then:
+  br label %loop.end
+
+loop.end:
+  store i32 0, i32* %ptr, align 4
+  br label %loop
+}
+
+; %cmp dominates its used after being hoisted, but not after %brmerge is rehoisted
+; CHECK-LABEL: @rehoist
+define void @rehoist(i8* %this, i32 %x) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %sub = add nsw i32 %x, -1
+; CHECK-DAG: %fptr = bitcast i8* %this to void (i8*)*
+; CHECK-DAG: %cmp = icmp eq i32 0, %sub
+; CHECK-DAG: %brmerge = or i1 %cmp, true
+entry:
+  %sub = add nsw i32 %x, -1
+  br label %loop
+
+loop:
+  br i1 undef, label %if1, label %else1
+
+if1:
+  %fptr = bitcast i8* %this to void (i8*)*
+  call void %fptr(i8* %this)
+  br label %then1
+
+else1:
+  br label %then1
+
+then1:
+  %cmp = icmp eq i32 0, %sub
+  br i1 %cmp, label %end, label %else2
+
+else2:
+  %brmerge = or i1 %cmp, true
+  br i1 %brmerge, label %if3, label %end
+
+if3:
+  br label %end
+
+end:
+  br label %loop
+}
+
+; A test case that uses empty blocks in a way that can cause control flow
+; hoisting to get confused.
+; CHECK-LABEL: @empty_blocks_multiple_conditional_branches
+define void @empty_blocks_multiple_conditional_branches(float %arg, float* %ptr) {
+; CHECK-LABEL: entry
+; CHECK-DAG: %div1 = fmul float %arg, 4.000000e+00
+; CHECK-DAG: %div2 = fmul float %arg, 2.000000e+00
+entry:
+  br label %loop
+
+; The exact path to the phi isn't checked here, because it depends on whether
+; cond2 or cond3 is hoisted first
+; CHECK-ENABLED: %phi = phi float [ 0.000000e+00, %{{.*}} ], [ %div1, %{{.*}} ]
+; CHECK: br label %loop
+
+loop:
+  br i1 undef, label %backedge2, label %cond1
+
+cond1:
+  br i1 undef, label %cond1.if, label %cond1.else
+
+cond1.else:
+  br label %cond3
+
+cond1.if:
+  br label %cond1.if.next
+
+cond1.if.next:
+  br label %cond2
+
+cond2:
+  %div1 = fmul float %arg, 4.000000e+00
+  br i1 undef, label %cond2.if, label %cond2.then
+
+cond2.if:
+  br label %cond2.then
+
+; CHECK-LABEL: cond2.then:
+; CHECK-DISABLED: %phi = phi float [ 0.000000e+00, %cond2 ], [ %div1, %cond2.if ]
+cond2.then:
+  %phi = phi float [ 0.000000e+00, %cond2 ], [ %div1, %cond2.if ]
+  store float %phi, float* %ptr
+  br label %backedge2
+
+cond3:
+  br i1 undef, label %cond3.then, label %cond3.if
+
+cond3.if:
+  %div2 = fmul float %arg, 2.000000e+00
+  store float %div2, float* %ptr
+  br label %cond3.then
+
+cond3.then:
+  br label %loop
+
+backedge2:
+  br label %loop
+}
+
+; We can't do much here, so mainly just check that we don't crash.
+; CHECK-LABEL: @many_path_phi
+define void @many_path_phi(i32* %ptr1, i32* %ptr2) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %gep3 = getelementptr inbounds i32, i32* %ptr2, i32 2
+; CHECK-DAG: %gep2 = getelementptr inbounds i32, i32* %ptr2, i32 2
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  %phi1 = phi i32 [ 0, %entry ], [ %phi2, %end ]
+  %cmp1 = icmp ugt i32 %phi1, 3
+  br i1 %cmp1, label %cond2, label %cond1
+
+cond1:
+  br i1 undef, label %end, label %cond1.else
+
+cond1.else:
+  %gep2 = getelementptr inbounds i32, i32* %ptr2, i32 2
+  %val2 = load i32, i32* %gep2, align 4
+  %cmp2 = icmp eq i32 %val2, 13
+  br i1 %cmp2, label %cond1.end, label %end
+
+cond1.end:
+  br label %end
+
+cond2:
+  br i1 undef, label %end, label %cond2.else
+
+cond2.else:
+  %gep3 = getelementptr inbounds i32, i32* %ptr2, i32 2
+  %val3 = load i32, i32* %gep3, align 4
+  %cmp3 = icmp eq i32 %val3, 13
+  br i1 %cmp3, label %cond2.end, label %end
+
+cond2.end:
+  br label %end
+
+end:
+  %phi2 = phi i32 [ 1, %cond1 ], [ 2, %cond1.else ], [ 3, %cond1.end ], [ 4, %cond2 ], [ 5, %cond2.else ], [ 6, %cond2.end ]
+  br label %loop
+}
+
+; Check that we correctly handle the hoisting of %gep when theres a critical
+; edge that branches to the preheader.
+; CHECK-LABEL: @crit_edge
+define void @crit_edge(i32* %ptr, i32 %idx, i1 %cond1, i1 %cond2) {
+; CHECK-LABEL: entry:
+; CHECK: %gep = getelementptr inbounds i32, i32* %ptr, i32 %idx
+; CHECK: br label %preheader
+entry:
+  br label %preheader
+
+preheader:
+  br label %loop
+
+loop:
+  br i1 %cond1, label %then, label %if
+
+if:
+  %gep = getelementptr inbounds i32, i32* %ptr, i32 %idx
+  %val = load i32, i32* %gep
+  br label %then
+
+then:
+  %phi = phi i32 [ %val, %if ], [ 0, %loop ]
+  store i32 %phi, i32* %ptr
+  br i1 %cond2, label %loop, label %crit_edge
+
+crit_edge:
+  br label %preheader
+}
+
+; Check that the conditional sub is correctly hoisted from the inner loop to the
+; preheader of the outer loop.
+; CHECK-LABEL: @hoist_from_innermost_loop
+define void @hoist_from_innermost_loop(i32 %nx, i32* %ptr) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %sub = sub nsw i32 0, %nx
+; CHECK: br label %outer_loop
+entry:
+  br label %outer_loop
+
+outer_loop:
+  br label %middle_loop
+
+middle_loop:
+  br label %inner_loop
+
+inner_loop:
+  br i1 undef, label %inner_loop_end, label %if
+
+if:
+  %sub = sub nsw i32 0, %nx
+  store i32 %sub, i32* %ptr, align 4
+  br label %inner_loop_end
+
+inner_loop_end:
+  br i1 undef, label %inner_loop, label %middle_loop_end
+
+middle_loop_end:
+  br i1 undef, label %middle_loop, label %outer_loop_end
+
+outer_loop_end:
+  br label %outer_loop
+}
+
+; We have a diamond starting from %if, but %if.if is also reachable from %loop,
+; so %gep should not be conditionally hoisted.
+; CHECK-LABEL: @diamond_with_extra_in_edge
+define void @diamond_with_extra_in_edge(i32* %ptr1, i32* %ptr2, i32 %arg) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp2 = icmp ne i32 0, %arg
+; CHECK-DAG: %gep = getelementptr i32, i32* %ptr1, i32 4
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  %phi1 = phi i32 [ 0, %entry ], [ %phi2, %then ]
+  %cmp1 = icmp ugt i32 16, %phi1
+  br i1 %cmp1, label %if, label %if.if
+
+if:
+  %cmp2 = icmp ne i32 0, %arg
+  br i1 %cmp2, label %if.if, label %if.else
+
+if.if:
+  %gep = getelementptr i32, i32* %ptr1, i32 4
+  %val = load i32, i32* %gep, align 4
+  br label %then
+
+if.else:
+  br label %then
+
+then:
+  %phi2 = phi i32 [ %val, %if.if ], [ %phi1, %if.else ]
+  store i32 %phi2, i32* %ptr2, align 4
+  br label %loop
+}
+
+; %loop/%if/%then form a triangle, but %loop/%if/%then/%end also form a diamond.
+; The triangle should be picked for conditional hoisting.
+; CHECK-LABEL: @both_triangle_and_diamond
+define void @both_triangle_and_diamond(i32* %ptr1, i32* %ptr2, i32 %arg) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 0, %arg
+; CHECK-DAG: %gep = getelementptr i32, i32* %ptr1, i32 4
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi2 = phi i32 [ 0, %[[IF_LICM]] ], [ 1, %entry ]
+; CHECK: br label %loop
+
+loop:
+  %phi1 = phi i32 [ 0, %entry ], [ %phi3, %end ]
+  %cmp1 = icmp ne i32 0, %arg
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %gep = getelementptr i32, i32* %ptr1, i32 4
+  %val = load i32, i32* %gep, align 4
+  %cmp2 = icmp ugt i32 16, %phi1
+  br i1 %cmp2, label %end, label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi2 = phi i32 [ 0, %if ], [ 1, %loop ]
+then:
+  %phi2 = phi i32 [ 0, %if ], [ 1, %loop ]
+  br label %end
+
+end:
+  %phi3 = phi i32 [ %phi2, %then ], [ %val, %if ]
+  store i32 %phi3, i32* %ptr2, align 4
+  br label %loop
+}
+
+; We shouldn't duplicate the branch at the end of %loop and should instead hoist
+; %val to %entry.
+; CHECK-LABEL: @same_destination_branch
+define i32 @same_destination_branch(i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 %arg2, 0
+; CHECK-DAG: %val = add i32 %arg1, 1
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+; CHECK-LABEL: loop:
+; CHECK: %phi = phi i32 [ 0, %entry ], [ %add, %then ]
+loop:
+  %phi = phi i32 [ 0, %entry ], [ %add, %then ]
+  %add = add i32 %phi, 1
+  %cmp1 = icmp ne i32 %arg2, 0
+  br i1 %cmp1, label %if, label %if
+
+if:
+  %val = add i32 %arg1, 1
+  br label %then
+
+then:
+  %cmp2 = icmp ne i32 %val, %phi
+  br i1 %cmp2, label %loop, label %end
+
+end:
+  ret i32 %val
+}
+
+; Diamond-like control flow but the left/right blocks actually have the same
+; destinations.
+; TODO: We could potentially hoist all of phi2-4, but currently only hoist phi2.
+; CHECK-LABEL: @diamond_like_same_destinations
+define i32 @diamond_like_same_destinations(i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 %arg1, 0
+; CHECK-DAG: %cmp2 = icmp ugt i32 %arg2, 1
+; CHECK-DAG: %cmp3 = icmp ugt i32 %arg2, 2
+; CHECK-ENABLED: br i1 %cmp1, label %[[LEFT1_LICM:.*]], label %[[RIGHT1_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[LEFT1_LICM]]:
+; CHECK-ENABLED: br label %[[LEFT2_LICM:.*]]
+
+; CHECK-ENABLED: [[RIGHT1_LICM]]:
+; CHECK-ENABLED: br label %[[LEFT2_LICM]]
+
+; CHECK-ENABLED: [[LEFT2_LICM]]:
+; CHECK-ENABLED: %phi2 = phi i32 [ 0, %[[LEFT1_LICM]] ], [ 1, %[[RIGHT1_LICM]] ]
+; CHECK: br label %loop
+
+loop:
+  %phi1 = phi i32 [ 0, %entry ], [ %add, %loopend ]
+  %add = add i32 %phi1, 1
+  %cmp1 = icmp ne i32 %arg1, 0
+  br i1 %cmp1, label %left1, label %right1
+
+left1:
+  %cmp2 = icmp ugt i32 %arg2, 1
+  br i1 %cmp2, label %left2, label %right2
+
+right1:
+  %cmp3 = icmp ugt i32 %arg2, 2
+  br i1 %cmp3, label %left2, label %right2
+
+; CHECK-LABEL: left2:
+; CHECK-DISABLED: %phi2 = phi i32 [ 0, %left1 ], [ 1, %right1 ]
+left2:
+  %phi2 = phi i32 [ 0, %left1 ], [ 1, %right1 ]
+  br label %loopend
+
+; CHECK-LABEL: right2:
+; CHECK: %phi3 = phi i32 [ 2, %left1 ], [ 3, %right1 ]
+right2:
+  %phi3 = phi i32 [ 2, %left1 ], [ 3, %right1 ]
+  br label %loopend
+
+; CHECK-LABEL: loopend:
+; CHECK: %phi4 = phi i32 [ %phi2, %left2 ], [ %phi3, %right2 ]
+loopend:
+  %phi4 = phi i32 [ %phi2, %left2 ], [ %phi3, %right2 ]
+  %cmp4 = icmp ne i32 %phi1, 32
+  br i1 %cmp4, label %loop, label %end
+
+end:
+  ret i32 %phi4
+}
+
+; A phi with multiple incoming values for the same block due to a branch with
+; two destinations that are actually the same. We can't hoist this.
+; TODO: This could be hoisted by erasing one of the incoming values.
+; CHECK-LABEL: @phi_multiple_values_same_block
+define i32 @phi_multiple_values_same_block(i32 %arg) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp = icmp sgt i32 %arg, 4
+; CHECK-NOT: phi
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  %cmp = icmp sgt i32 %arg, 4
+  br i1 %cmp, label %if, label %then
+
+if:
+  br i1 undef, label %then, label %then
+
+then:
+  %phi = phi i32 [ %arg, %loop ], [ 1, %if ], [ 1, %if ]
+  br i1 undef, label %exit, label %loop
+
+exit:
+  ret i32 %phi
+}
+
+; %phi is conditionally used in %d, and the store that %d is used in cannot be
+; hoisted. This means that we have to rehoist %d, but have to make sure to
+; rehoist it after %phi.
+; CHECK-LABEL: @phi_conditional_use
+define i64 @phi_conditional_use(i32 %f, i32* %g) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp eq i32 %f, 1
+; CHECK: %cmp2 = icmp eq i32 %f, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_END_LICM:.*]], label %[[IF_THEN_LICM:.*]]
+entry:
+  %cmp1 = icmp eq i32 %f, 1
+  %cmp2 = icmp eq i32 %f, 0
+  br label %loop
+
+; CHECK-ENABLED: [[IF_THEN_LICM]]:
+; CHECK-ENABLED: br label %[[IF_END_LICM]]
+
+; CHECK-ENABLED: [[IF_END_LICM]]:
+; CHECK-ENABLED: %phi = phi i64 [ 0, %entry ], [ 1, %[[IF_THEN_LICM]] ]
+; CHECK-ENABLED: %d = getelementptr inbounds i32, i32* %g, i64 %phi
+; CHECK-ENABLED: i1 %cmp2, label %[[LOOP_BACKEDGE_LICM:.*]], label %[[IF_THEN2_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_THEN2_LICM]]:
+; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM]]
+
+; CHECK-ENABLED: [[LOOP_BACKEDGE_LICM]]:
+; CHECK: br label %loop
+
+loop:
+  br i1 %cmp1, label %if.end, label %if.then
+
+if.then:
+  br label %if.end
+
+; CHECK-LABEL: if.end:
+; CHECK-DISABLED: %phi = phi i64 [ 0, %loop ], [ 1, %if.then ]
+if.end:
+  %phi = phi i64 [ 0, %loop ], [ 1, %if.then ]
+  br i1 %cmp2, label %loop.backedge, label %if.then2
+
+; CHECK-LABEL: if.then2:
+; CHECK-DISABLED: %d = getelementptr inbounds i32, i32* %g, i64 %phi
+if.then2:
+  %d = getelementptr inbounds i32, i32* %g, i64 %phi
+  store i32 1, i32* %d, align 4
+  br label %loop.backedge
+
+loop.backedge:
+  br label %loop
+}
+
+; As above, but we have two such phis
+; CHECK-LABEL: @phi_conditional_use_twice
+define i64 @phi_conditional_use_twice(i32 %f, i32* %g) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp eq i32 %f, 1
+; CHECK: %cmp2 = icmp eq i32 %f, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_END_LICM:.*]], label %[[IF_THEN_LICM:.*]]
+entry:
+  %cmp1 = icmp eq i32 %f, 1
+  %cmp2 = icmp eq i32 %f, 0
+  %cmp3 = icmp sgt i32 %f, 0
+  br label %loop
+
+; CHECK-ENABLED: [[IF_THEN_LICM]]:
+; CHECK-ENABLED: br label %[[IF_END_LICM]]
+
+; CHECK-ENABLED: [[IF_END_LICM]]:
+; CHECK-ENABLED: %phi1 = phi i64 [ 0, %entry ], [ 1, %[[IF_THEN_LICM]] ]
+; CHECK-ENABLED: %d = getelementptr inbounds i32, i32* %g, i64 %phi1
+; CHECK-ENABLED: i1 %cmp2, label %[[IF_END2_LICM:.*]], label %[[IF_THEN2_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_THEN2_LICM]]:
+; CHECK-ENABLED: br label %[[IF_END2_LICM]]
+
+; CHECK-ENABLED: [[IF_END2_LICM]]:
+; CHECK-ENABLED: %phi2 = phi i64 [ 2, %[[IF_END_LICM]] ], [ 3, %[[IF_THEN2_LICM]] ]
+; CHECK-ENABLED: %e = getelementptr inbounds i32, i32* %g, i64 %phi2
+; CHECK-ENABLED: i1 %cmp3, label %[[LOOP_BACKEDGE_LICM:.*]], label %[[IF_THEN3_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_THEN3_LICM]]:
+; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM]]
+
+; CHECK-ENABLED: [[LOOP_BACKEDGE_LICM]]:
+; CHECK: br label %loop
+
+loop:
+  br i1 %cmp1, label %if.end, label %if.then
+
+if.then:
+  br label %if.end
+
+; CHECK-LABEL: if.end:
+; CHECK-DISABLED: %phi1 = phi i64 [ 0, %loop ], [ 1, %if.then ]
+if.end:
+  %phi1 = phi i64 [ 0, %loop ], [ 1, %if.then ]
+  br i1 %cmp2, label %if.end2, label %if.then2
+
+; CHECK-LABEL: if.then2:
+; CHECK-DISABLED: %d = getelementptr inbounds i32, i32* %g, i64 %phi1
+if.then2:
+  %d = getelementptr inbounds i32, i32* %g, i64 %phi1
+  store i32 1, i32* %d, align 4
+  br label %if.end2
+
+; CHECK-LABEL: if.end2:
+; CHECK-DISABLED: %phi2 = phi i64 [ 2, %if.end ], [ 3, %if.then2 ]
+if.end2:
+  %phi2 = phi i64 [ 2, %if.end ], [ 3, %if.then2 ]
+  br i1 %cmp3, label %loop.backedge, label %if.then3
+
+; CHECK-LABEL: if.then3:
+; CHECK-DISABLED: %e = getelementptr inbounds i32, i32* %g, i64 %phi2
+if.then3:
+  %e = getelementptr inbounds i32, i32* %g, i64 %phi2
+  store i32 1, i32* %e, align 4
+  br label %loop.backedge
+
+loop.backedge:
+  br label %loop
+}
diff --git a/test/Transforms/LICM/loopsink-pr39695.ll b/test/Transforms/LICM/loopsink-pr39695.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4d33210352de8b8505cdb41475d7bdfd3a9e5b69
--- /dev/null
+++ b/test/Transforms/LICM/loopsink-pr39695.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -loop-sink < %s | FileCheck %s
+
+; The load instruction should not be sunk into following loop.
+; CHECK:      @foo
+; CHECK-NEXT: entry
+; CHECK-NEXT: %ptr = load i8*, i8** %pp, align 8
+; CHECK-NEXT: store i8* null, i8** %pp, align 8
+
+define i32 @foo(i32 %n, i8** %pp) !prof !0 {
+entry:
+  %ptr = load i8*, i8** %pp, align 8
+  store i8* null, i8** %pp, align 8
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp ult i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:                                         ; preds = %for.cond
+  %0 = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i8, i8* %ptr, i64 %0
+  %1 = load i8, i8* %arrayidx, align 1
+  %or19 = call i8 @llvm.bitreverse.i8(i8 %1)
+  %v = sext i8 %or19 to i32
+  %inc = add i32 %i.0, %v
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret i32 %i.0
+}
+
+declare i8 @llvm.bitreverse.i8(i8) #0
+attributes #0 = { nounwind readnone speculatable }
+
+!0 = !{!"function_entry_count", i64 1}
+!1 = !{!"branch_weights", i32 1, i32 2000}
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
index fb704efb1e96d7ba53917c66166f6cc6c54d1870..b0dd5d185c77b7fd9376361aacaa3d7d162cc6fe 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
 ; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
+; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
+; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
 
 target triple = "amdgcn--"
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
index a118ff8fdd5c8b02266139050fd29592675fdd42..cd1c7fdc521b54e0827c8c3ef2a6d89e780fb23b 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
@@ -48,4 +49,4 @@ entry:
   %cstoreval2 = fptrunc double %storeval2 to float
   store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
   ret void
-}
\ No newline at end of file
+}
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
index 317c2571dff42981e76ecbaba1c4ca0c3e3d2994..b8e95a6793e8ccfd1531c25636dfd5b5b3bfd6e2 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
index 537922e89f09e2bafb4edde790e58cc1539b3705..5bb6289ff19e6f5cbe7b94e4e67359af26a2be69 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -mtriple=amdgcn--amdhsa -load-store-vectorizer < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn--amdhsa -passes='function(load-store-vectorizer)' < %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
index a7411490a391de8fa9a95efa4401a4b2540b519d..35836f80456d4b12a0ecff3821ae6a181030c765 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
index ccca9e6efa80a4b706c0ddf1b802e65a1d8784a4..81ebb712e335dfb1361f5f23b9dd217ce9140703 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
index e055b8a2fc35a6fd39a4b83d7936b4a1f2a066ef..15c47716aafc20db108119eada6f55a817b3132b 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll b/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
index a9b72294d9046be165bb4eacfa5970284cbae23d..e29f3dfa537fd0abdcffcdaf66cbcf4578c58010 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
 ; RUN: opt                 -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt -codegenprepare -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
+; RUN: opt                 -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
 
 target triple = "x86_64--"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll b/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
index 7f29a73bcf9f5e7fce54c1d6eaeb0d33dd7dd5d8..e2181f6086c50d871dc526da34b42eae30747c0f 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -load-store-vectorizer %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S | FileCheck %s
 
 ; Check that setting wrapping flags after a SCEV node is created
 ; does not invalidate "sorted by complexity" invariant for
diff --git a/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
index fd2ae51fc1f0cd77ce83a5acaef79b901359d26d..043d6ea7e920540754b11acc28ef232435b207af 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
index a61b25119a14cdb82ec4fdead9673955b5de1950..ac5f3ea9f0f8c681424755deca8644ac09ec247f 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
 
 define <8 x double> @loadwidth_insert_extract(double* %ptr) {
     %a = bitcast double* %ptr to <2 x double> *
diff --git a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
index b4493a8ab96ceba7f3b47decaa8910aebf3de6df..a93e9aceb73364c584eeef8fc344c4e88cffc8f3 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \
 ; RUN:     FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S < %s | \
+; RUN:     FileCheck %s
 ;
 ; The GPU Load & Store Vectorizer may merge differently-typed accesses into a
 ; single instruction. This test checks that we merge TBAA tags for such
diff --git a/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll b/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
index 1f00f980eac19952bc4ef5384eb92864a2fd283b..7a0073808a0167b40f8266093f3d51ec54958363 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -load-store-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 
 %rec = type { i32, i28 }
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll b/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
index 12d882a51fa2a78c89a6b665a715c6e542ac95be..92d05f76fc61f6423181b3d39f79aafa2b3cbbd8 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86-linux -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll b/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
index bf75ecf629554854642cad1983d562b6c3fb3747..3ae0d891dc54c9c7e88e0ca51bce06678709b748 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
index 915b94ac15574eafb434f874f5d4eb0f66470363..72b29912d8133cad93f17c8deaf746a6b9a1c676 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll b/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
index 379b2353dc3dc37c6ed7c62fcb6cf56b1a3c1b69..00971f350388488c0a33a14fbbae88dca2a15be2 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck %s
 
 ; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T.
 
diff --git a/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll b/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
index 07bdc9123f9ab5c16237c6dd6c76ee835e9aa0ba..07487b578039519fc382cae5d37d999260d83030 100644
--- a/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
+++ b/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S < %s -load-store-vectorizer | FileCheck %s
+; RUN: opt -S < %s -passes='function(load-store-vectorizer)' | FileCheck %s
 
 declare void @llvm.sideeffect()
 
diff --git a/test/Transforms/LoopDeletion/crashbc.ll b/test/Transforms/LoopDeletion/crashbc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..056c61f77aea18cd49b3d2b65f34af2ec18ce587
--- /dev/null
+++ b/test/Transforms/LoopDeletion/crashbc.ll
@@ -0,0 +1,31 @@
+; Make sure we don't crash when writing bitcode.
+; RUN: opt -loop-deletion -o /dev/null
+
+define void @f() {
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %0
+  call void @llvm.dbg.value(metadata i16 undef, metadata !1, metadata !DIExpression()), !dbg !11
+  br i1 undef, label %bb1, label %bb3
+
+bb3:                                              ; preds = %bb1
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!9}
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !DILocalVariable(name: "i", scope: !2, file: !3, line: 31, type: !7)
+!2 = distinct !DILexicalBlock(scope: !4, file: !3, line: 31, column: 9)
+!3 = !DIFile(filename: "foo.c", directory: "/bar")
+!4 = distinct !DISubprogram(name: "f", scope: !3, file: !3, line: 26, type: !5, scopeLine: 27, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !9, retainedNodes: !10)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7, !8, !7}
+!7 = !DIBasicType(name: "int", size: 16, encoding: DW_ATE_signed)
+!8 = !DIBasicType(size: 16, encoding: DW_ATE_signed)
+!9 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "My Compiler", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !10, retainedTypes: !10, globals: !10)
+!10 = !{}
+!11 = !DILocation(line: 31, column: 13, scope: !2)
diff --git a/test/Transforms/LoopDeletion/diundef.ll b/test/Transforms/LoopDeletion/diundef.ll
new file mode 100644
index 0000000000000000000000000000000000000000..11dc05cb94d2632aef3081e3658b49f7eef22807
--- /dev/null
+++ b/test/Transforms/LoopDeletion/diundef.ll
@@ -0,0 +1,75 @@
+; RUN: opt %s -loop-deletion -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+@a = common local_unnamed_addr global i32 0, align 4, !dbg !0
+
+define i32 @b() local_unnamed_addr !dbg !12 {
+entry:
+  call void @llvm.dbg.value(metadata i32 0, metadata !16, metadata !DIExpression()), !dbg !17
+  br label %for.cond, !dbg !18
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.cond ], !dbg !20
+  call void @llvm.dbg.value(metadata i32 %i.0, metadata !16, metadata !DIExpression()), !dbg !17
+  %inc = add nuw nsw i32 %i.0, 1, !dbg !21
+  call void @llvm.dbg.value(metadata i32 %inc, metadata !16, metadata !DIExpression()), !dbg !17
+  %exitcond = icmp ne i32 %inc, 3, !dbg !23
+  br i1 %exitcond, label %for.cond, label %for.end, !dbg !24, !llvm.loop !25
+
+; CHECK: call void @llvm.dbg.value(metadata i32 undef, metadata !16, metadata !DIExpression()), !dbg !17
+; CHECK-NEXT: %call = tail call i32 {{.*}} @patatino()
+for.end:                                          ; preds = %for.cond
+  %call = tail call i32 (...) @patatino() #3, !dbg !27
+  %0 = load i32, i32* @a, align 4, !dbg !28
+  ret i32 %0, !dbg !33
+}
+
+declare i32 @patatino(...) local_unnamed_addr
+
+define i32 @main() local_unnamed_addr !dbg !34 {
+entry:
+  %call = call i32 @b(), !dbg !35
+  ret i32 0, !dbg !36
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9, !10}
+!llvm.ident = !{!11}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 8.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: GNU)
+!3 = !DIFile(filename: "a.c", directory: "/Users/davide/work/llvm-project-20170507/build-debug/bin")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{i32 7, !"PIC Level", i32 2}
+!11 = !{!"clang version 8.0.0 "}
+!12 = distinct !DISubprogram(name: "b", scope: !3, file: !3, line: 2, type: !13, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !15)
+!13 = !DISubroutineType(types: !14)
+!14 = !{!6}
+!15 = !{!16}
+!16 = !DILocalVariable(name: "i", scope: !12, file: !3, line: 3, type: !6)
+!17 = !DILocation(line: 3, column: 7, scope: !12)
+!18 = !DILocation(line: 4, column: 8, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !12, file: !3, line: 4, column: 3)
+!20 = !DILocation(line: 0, scope: !19)
+!21 = !DILocation(line: 4, column: 23, scope: !22)
+!22 = distinct !DILexicalBlock(scope: !19, file: !3, line: 4, column: 3)
+!23 = !DILocation(line: 4, column: 17, scope: !22)
+!24 = !DILocation(line: 4, column: 3, scope: !19)
+!25 = distinct !{!25, !24, !26}
+!26 = !DILocation(line: 5, column: 5, scope: !19)
+!27 = !DILocation(line: 6, column: 3, scope: !12)
+!28 = !DILocation(line: 7, column: 10, scope: !12)
+!33 = !DILocation(line: 7, column: 3, scope: !12)
+!34 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 9, type: !13, scopeLine: 9, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
+!35 = !DILocation(line: 9, column: 14, scope: !34)
+!36 = !DILocation(line: 9, column: 19, scope: !34)
diff --git a/test/Transforms/LoopDistribute/disable_nonforced.ll b/test/Transforms/LoopDistribute/disable_nonforced.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0dd5d9a068be525ab311433c176d98d63c09a28b
--- /dev/null
+++ b/test/Transforms/LoopDistribute/disable_nonforced.ll
@@ -0,0 +1,50 @@
+; RUN: opt -loop-distribute -enable-loop-distribute=1 -S < %s | FileCheck %s
+;
+; Check that the disable_nonforced is honored by loop distribution.
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced(
+; CHECK-NOT: for.body.ldist1:
+define void @disable_nonforced(i32* noalias %a,
+                         i32* noalias %b,
+                         i32* noalias %c,
+                         i32* noalias %d,
+                         i32* noalias %e) {
+entry:
+  br label %for.body
+
+for.body:
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.disable_nonforced"}}
diff --git a/test/Transforms/LoopDistribute/disable_nonforced_enable.ll b/test/Transforms/LoopDistribute/disable_nonforced_enable.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c6dcd707e9cc2a54b19d5f92d7ce83543f89e764
--- /dev/null
+++ b/test/Transforms/LoopDistribute/disable_nonforced_enable.ll
@@ -0,0 +1,51 @@
+; RUN: opt -loop-distribute -S < %s | FileCheck %s
+;
+; Check that llvm.loop.distribute.enable overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced(
+; CHECK: for.body.ldist1:
+define void @disable_nonforced(i32* noalias %a,
+                         i32* noalias %b,
+                         i32* noalias %c,
+                         i32* noalias %d,
+                         i32* noalias %e) {
+entry:
+  br label %for.body
+
+for.body:
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.distribute.enable", i32 1}}
diff --git a/test/Transforms/LoopDistribute/followup.ll b/test/Transforms/LoopDistribute/followup.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a70a6d4fb9efa3c0d914d390176cba70ea6afc92
--- /dev/null
+++ b/test/Transforms/LoopDistribute/followup.ll
@@ -0,0 +1,66 @@
+; RUN: opt -basicaa -loop-distribute -S < %s | FileCheck %s
+;
+; Check that followup loop-attributes are applied to the loops after
+; loop distribution.
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %a, i32* %b, i32* %c, i32* %d, i32* %e) {
+entry:
+  br label %for.body
+
+for.body:
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}
+!2 = !{!"llvm.loop.distribute.followup_all", !{!"FollowupAll"}}
+!3 = !{!"llvm.loop.distribute.followup_coincident", !{!"FollowupCoincident", i1 false}}
+!4 = !{!"llvm.loop.distribute.followup_sequential", !{!"FollowupSequential", i32 8}}
+!5 = !{!"llvm.loop.distribute.followup_fallback", !{!"FollowupFallback"}}
+
+
+; CHECK-LABEL: for.body.lver.orig:
+; CHECK: br i1 %exitcond.lver.orig, label %for.end, label %for.body.lver.orig, !llvm.loop ![[LOOP_ORIG:[0-9]+]]
+; CHECK-LABEL: for.body.ldist1:
+; CHECK: br i1 %exitcond.ldist1, label %for.body.ph, label %for.body.ldist1, !llvm.loop ![[LOOP_SEQUENTIAL:[0-9]+]]
+; CHECK-LABEL: for.body:
+; CHECK: br i1 %exitcond, label %for.end, label %for.body, !llvm.loop ![[LOOP_COINCIDENT:[0-9]+]]
+
+; CHECK: ![[LOOP_ORIG]] = distinct !{![[LOOP_ORIG]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOUP_FALLBACK:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
+; CHECK: ![[FOLLOUP_FALLBACK]] = !{!"FollowupFallback"}
+; CHECK: ![[LOOP_SEQUENTIAL]] = distinct !{![[LOOP_SEQUENTIAL]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_SEQUENTIAL:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_SEQUENTIAL]] = !{!"FollowupSequential", i32 8}
+; CHECK: ![[LOOP_COINCIDENT]] = distinct !{![[LOOP_COINCIDENT]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_COINCIDENT:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_COINCIDENT]] = !{!"FollowupCoincident", i1 false}
diff --git a/test/Transforms/LoopInterchange/inner-only-reductions.ll b/test/Transforms/LoopInterchange/inner-only-reductions.ll
index 74543fb16472540f2fef41a1424c3355f4d01eb0..7c77b10dd8e2f6d8cdb5b8cccdba54f50e46150c 100644
--- a/test/Transforms/LoopInterchange/inner-only-reductions.ll
+++ b/test/Transforms/LoopInterchange/inner-only-reductions.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
-; RUN:     -verify-dom-info -verify-loop-info 2>&1 | FileCheck -check-prefix=IR %s
+; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 | FileCheck -check-prefix=IR %s
 ; RUN: FileCheck --input-file=%t %s
 
 ; Inner loop only reductions are not supported currently. See discussion at
diff --git a/test/Transforms/LoopInterchange/lcssa.ll b/test/Transforms/LoopInterchange/lcssa.ll
index 2bd9ee69c163b99b52608859b171961e18b9c228..edc9800ecd76f011d2a9dfe2a7cf5384a00b9517 100644
--- a/test/Transforms/LoopInterchange/lcssa.ll
+++ b/test/Transforms/LoopInterchange/lcssa.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t
-; RUN: cat %t |  FileCheck --check-prefix REMARK %s
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t -S
+; RUN: FileCheck --input-file %t --check-prefix REMARK %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LoopInterchange/outer-only-reductions.ll b/test/Transforms/LoopInterchange/outer-only-reductions.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c58e7d3d2722479f54b576f91ef095433fdbfc69
--- /dev/null
+++ b/test/Transforms/LoopInterchange/outer-only-reductions.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
+
+; Outer loop only reductions are not supported currently.
+
+@A = common global [500 x [500 x i32]] zeroinitializer
+
+;; global X
+
+;;  for( int i=1;i<N;i++) {
+;;    for( int j=1;j<N;j++)
+;;      ;
+;;    X+=A[j][i];
+;;  }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHI
+; CHECK-NEXT: Function:        reduction_01
+
+; IR-LABEL: @reduction_01(
+; IR-NOT: split
+
+define i32 @reduction_01(i32 %N) {
+entry:
+  br label %outer.header
+
+outer.header:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %outer.inc ], [ 1, %entry ]
+  %add15 = phi i32 [ 0, %entry ], [ %add, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ 1, %outer.header ], [ %indvars.iv.next, %for.body3 ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                     ; preds = %for.body3
+  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
+  %0 = load i32, i32* %arrayidx5
+  %add = add nsw i32 %add15, %0
+  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
+  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
+  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
+  br i1 %exitcond21, label %for.end8, label %outer.header
+
+for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  ret i32 %add
+}
diff --git a/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll b/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7ac96f14825dcfe5c7585c5a4caaa80af89a4a8b
--- /dev/null
+++ b/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
@@ -0,0 +1,150 @@
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        test1
+
+define i64 @test1([100 x [100 x i64]]* %Arr) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR2_PREHEADER:%.*]]
+; CHECK:       for1.header.preheader:
+; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
+; CHECK:       for1.header:
+; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR1_INC:%.*]] ], [ 0, [[FOR1_HEADER_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[SUM_INNER:%.*]] = phi i64 [ [[SUM_INC:%.*]], [[FOR1_INC]] ], [ [[SUM_OUTER:%.*]], [[FOR1_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR2_SPLIT1:%.*]]
+; CHECK:       for2.preheader:
+; CHECK-NEXT:    br label [[FOR2:%.*]]
+; CHECK:       for2:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR2_SPLIT:%.*]] ], [ 0, [[FOR2_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_OUTER]] = phi i64 [ [[SUM_INC_LCSSA:%.*]], [[FOR2_SPLIT]] ], [ 0, [[FOR2_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR1_HEADER_PREHEADER]]
+; CHECK:       for2.split1:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* [[ARR:%.*]], i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV23]]
+; CHECK-NEXT:    [[LV:%.*]] = load i64, i64* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SUM_INC]] = add i64 [[SUM_INNER]], [[LV]]
+; CHECK-NEXT:    br label [[FOR1_INC]]
+; CHECK:       for2.split:
+; CHECK-NEXT:    [[SUM_INC_LCSSA]] = phi i64 [ [[SUM_INC]], %for1.inc ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXIT1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 100
+; CHECK-NEXT:    br i1 [[EXIT1]], label [[FOR1_LOOPEXIT:%.*]], label [[FOR2]]
+; CHECK:       for1.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1
+; CHECK-NEXT:    [[EXIT2:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT24]], 100
+; CHECK-NEXT:    br i1 [[EXIT2]], label [[FOR2_SPLIT]], label [[FOR1_HEADER]]
+; CHECK:       for1.loopexit:
+; CHECK-NEXT:    [[SUM_INC_LCSSA2:%.*]] = phi i64 [ [[SUM_INC_LCSSA]], [[FOR2_SPLIT]] ]
+; CHECK-NEXT:    ret i64 [[SUM_INC_LCSSA2]]
+;
+entry:
+  br label %for1.header
+
+for1.header:                                         ; preds = %for1.inc, %entry
+  %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ]
+  %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ]
+  br label %for2
+
+for2:                                        ; preds = %for2, %for1.header
+  %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ]
+  %sum.inner = phi i64 [ %sum.outer, %for1.header ], [ %sum.inc, %for2 ]
+  %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+  %lv = load i64, i64* %arrayidx, align 4
+  %sum.inc = add i64 %sum.inner, %lv
+  %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1
+  %exit1 = icmp eq i64 %indvars.iv.next.3, 100
+  br i1 %exit1, label %for1.inc, label %for2
+
+for1.inc:                                ; preds = %for2
+  %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %exit2 = icmp eq i64 %indvars.iv.next24, 100
+  br i1 %exit2, label %for1.loopexit, label %for1.header
+
+for1.loopexit:                                 ; preds = %for1.inc
+  %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ]
+  ret i64 %sum.inc.lcssa2
+}
+
+; In this test case, the inner reduction PHI %inner does not involve the outer
+; reduction PHI %sum.outer, do not interchange.
+; REMARKS: --- !Missed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            UnsupportedPHIOuter
+; REMARKS-NEXT: Function:        test2
+
+define i64 @test2([100 x [100 x i64]]* %Arr) {
+entry:
+  br label %for1.header
+
+for1.header:                                         ; preds = %for1.inc, %entry
+  %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ]
+  %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ]
+  br label %for2
+
+for2:                                        ; preds = %for2, %for1.header
+  %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ]
+  %inner = phi i64 [ %indvars.iv23, %for1.header ], [ %sum.inc, %for2 ]
+  %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+  %lv = load i64, i64* %arrayidx, align 4
+  %sum.inc = add i64 %inner, %lv
+  %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1
+  %exit1 = icmp eq i64 %indvars.iv.next.3, 100
+  br i1 %exit1, label %for1.inc, label %for2
+
+for1.inc:                                ; preds = %for2
+  %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %exit2 = icmp eq i64 %indvars.iv.next24, 100
+  br i1 %exit2, label %for1.loopexit, label %for1.header
+
+for1.loopexit:                                 ; preds = %for1.inc
+  %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ]
+  ret i64 %sum.inc.lcssa2
+}
+
+; Check that we do not interchange if there is an additional instruction
+; between the outer and inner reduction PHIs.
+; REMARKS: --- !Missed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            UnsupportedPHIOuter
+; REMARKS-NEXT: Function:        test3
+
+define i64 @test3([100 x [100 x i64]]* %Arr) {
+entry:
+  br label %for1.header
+
+for1.header:                                         ; preds = %for1.inc, %entry
+  %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ]
+  %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ]
+  %so = add i64 %sum.outer, 10
+  br label %for2
+
+for2:                                        ; preds = %for2, %for1.header
+  %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ]
+  %sum.inner = phi i64 [ %so, %for1.header ], [ %sum.inc, %for2 ]
+  %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+  %lv = load i64, i64* %arrayidx, align 4
+  %sum.inc = add i64 %sum.inner, %lv
+  %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1
+  %exit1 = icmp eq i64 %indvars.iv.next.3, 100
+  br i1 %exit1, label %for1.inc, label %for2
+
+for1.inc:                                ; preds = %for2
+  %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %exit2 = icmp eq i64 %indvars.iv.next24, 100
+  br i1 %exit2, label %for1.loopexit, label %for1.header
+
+for1.loopexit:                                 ; preds = %for1.inc
+  %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ]
+  ret i64 %sum.inc.lcssa2
+}
diff --git a/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll b/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 44f1c0bcd889d858fef0b027718952233332be14..7998aa360ddb24a6127a4a6c6a49b553640b3cbd 100644
--- a/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -loop-simplifycfg < %s | FileCheck %s
-; RUN: opt -S -passes='require<domtree>,loop(simplify-cfg)' < %s | FileCheck %s
-; RUN: opt -S -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(simplify-cfg)' -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 
@@ -707,7 +708,7 @@ define i32 @live_block_test_branch_loop(i1 %c, i32 %end) {
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[CHECK:%.*]], label [[LIVE:%.*]]
 ; CHECK:       check:
-; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[LIVE]]
+; CHECK-NEXT:    br label [[BACKEDGE]]
 ; CHECK:       live:
 ; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
 ; CHECK-NEXT:    br label [[BACKEDGE]]
@@ -744,6 +745,54 @@ exit:
   ret i32 %i.inc
 }
 
+; Check that when the block is not actually dead, we don't remove it. Version
+; with Phi node.
+define i32 @live_block_test_branch_loop_phis(i1 %c, i32 %end) {
+; CHECK-LABEL: @live_block_test_branch_loop_phis(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[CHECK:%.*]], label [[LIVE:%.*]]
+; CHECK:       check:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       live:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[CHECK]] ], [ [[I_2]], [[LIVE]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 %c, label %check, label %live
+
+check:
+  br i1 true, label %backedge, label %live
+
+live:
+  %phi = phi i32 [ 1, %header ], [ -1, %check ]
+  %i.2 = add i32 %i, %phi
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %check], [%i.2, %live]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
 define i32 @live_block_test_switch_loop(i1 %c, i32 %end) {
 ; CHECK-LABEL: @live_block_test_switch_loop(
 ; CHECK-NEXT:  preheader:
@@ -752,11 +801,7 @@ define i32 @live_block_test_switch_loop(i1 %c, i32 %end) {
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[CHECK:%.*]], label [[LIVE:%.*]]
 ; CHECK:       check:
-; CHECK-NEXT:    switch i32 1, label [[LIVE]] [
-; CHECK-NEXT:    i32 0, label [[LIVE]]
-; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:    i32 2, label [[LIVE]]
-; CHECK-NEXT:    ]
+; CHECK-NEXT:    br label [[BACKEDGE]]
 ; CHECK:       live:
 ; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
 ; CHECK-NEXT:    br label [[BACKEDGE]]
@@ -795,6 +840,54 @@ exit:
   ret i32 %i.inc
 }
 
+define i32 @live_block_test_switch_loop_phis(i1 %c, i32 %end) {
+; CHECK-LABEL: @live_block_test_switch_loop_phis(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[CHECK:%.*]], label [[LIVE:%.*]]
+; CHECK:       check:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       live:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[CHECK]] ], [ [[I_2]], [[LIVE]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 %c, label %check, label %live
+
+check:
+  switch i32 1, label %live [i32 0, label %live
+  i32 1, label %backedge
+  i32 2, label %live]
+
+live:
+  %phi = phi i32 [ 1, %header ], [ -1, %check ], [ -1, %check ], [ -1, %check ]
+  %i.2 = add i32 %i, %phi
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %check], [%i.2, %live]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
 ; Check that we can remove part of blocks of inner loop while the loop still
 ; preserves, in presence of outer loop.
 define i32 @partial_sub_loop_test_branch_loop(i32 %end) {
@@ -1445,3 +1538,1072 @@ outer_backedge:
 exit:
   ret i32 %i.inc
 }
+
+define i32 @exit_branch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @exit_branch_from_inner_to_grandparent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 true, label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE_LOOPEXIT1:%.*]]
+; CHECK:       loop_1_backedge.loopexit:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge.loopexit1:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %loop_1_backedge
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 true, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @exit_switch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @exit_switch_from_inner_to_grandparent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    switch i32 1, label [[LOOP_3]] [
+; CHECK-NEXT:    i32 0, label [[LOOP_2_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE_LOOPEXIT1:%.*]]
+; CHECK:       loop_1_backedge.loopexit:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge.loopexit1:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %loop_1_backedge
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  switch i32 1, label %loop_3 [i32 0, label %loop_2_backedge]
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_branch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @intermediate_branch_from_inner_to_grandparent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br i1 false, label [[LOOP_3_BACKEDGE]], label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE_LOOPEXIT1:%.*]]
+; CHECK:       loop_1_backedge.loopexit:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge.loopexit1:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br i1 false, label %loop_3_backedge, label %loop_1_backedge
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_switch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @intermediate_switch_from_inner_to_grandparent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[LOOP_3_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE_LOOPEXIT1:%.*]]
+; CHECK:       loop_1_backedge.loopexit:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge.loopexit1:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  switch i32 1, label %loop_1_backedge [i32 0, label %loop_3_backedge]
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_branch_from_inner_to_parent(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @intermediate_branch_from_inner_to_parent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br i1 false, label [[LOOP_3_BACKEDGE]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br i1 false, label %loop_3_backedge, label %loop_2_backedge
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_switch_from_inner_to_parent(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @intermediate_switch_from_inner_to_parent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    switch i32 1, label [[LOOP_2_BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[LOOP_3_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  switch i32 1, label %loop_2_backedge [i32 0, label %loop_3_backedge]
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_subloop_branch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i1 %cond3, i32 %N) {
+; CHECK-LABEL: @intermediate_subloop_branch_from_inner_to_grandparent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP:%.*]]
+; CHECK:       intermediate_loop:
+; CHECK-NEXT:    br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP]], label [[INTERMEDIATE_EXIT:%.*]]
+; CHECK:       intermediate_exit:
+; CHECK-NEXT:    br i1 false, label [[LOOP_3_BACKEDGE]], label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE_LOOPEXIT1:%.*]]
+; CHECK:       loop_1_backedge.loopexit:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge.loopexit1:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br label %intermediate_loop
+
+intermediate_loop:
+  br i1 %cond3, label %intermediate_loop, label %intermediate_exit
+
+intermediate_exit:
+  br i1 false, label %loop_3_backedge, label %loop_1_backedge
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_subloop_switch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i1 %cond3, i32 %N) {
+; CHECK-LABEL: @intermediate_subloop_switch_from_inner_to_grandparent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP:%.*]]
+; CHECK:       intermediate_loop:
+; CHECK-NEXT:    br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP]], label [[INTERMEDIATE_EXIT:%.*]]
+; CHECK:       intermediate_exit:
+; CHECK-NEXT:    switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[LOOP_3_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE_LOOPEXIT1:%.*]]
+; CHECK:       loop_1_backedge.loopexit:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge.loopexit1:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br label %intermediate_loop
+
+intermediate_loop:
+  br i1 %cond3, label %intermediate_loop, label %intermediate_exit
+
+intermediate_exit:
+  switch i32 1, label %loop_1_backedge [i32 0, label %loop_3_backedge]
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_subloop_branch_from_inner_to_parent(i1 %cond1, i1 %cond2, i1 %cond3, i32 %N) {
+; CHECK-LABEL: @intermediate_subloop_branch_from_inner_to_parent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP:%.*]]
+; CHECK:       intermediate_loop:
+; CHECK-NEXT:    br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP]], label [[INTERMEDIATE_EXIT:%.*]]
+; CHECK:       intermediate_exit:
+; CHECK-NEXT:    br i1 false, label [[LOOP_3_BACKEDGE]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br label %intermediate_loop
+
+intermediate_loop:
+  br i1 %cond3, label %intermediate_loop, label %intermediate_exit
+
+intermediate_exit:
+  br i1 false, label %loop_3_backedge, label %loop_2_backedge
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_subloop_switch_from_inner_to_parent(i1 %cond1, i1 %cond2, i1 %cond3, i32 %N) {
+; CHECK-LABEL: @intermediate_subloop_switch_from_inner_to_parent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP:%.*]]
+; CHECK:       intermediate_loop:
+; CHECK-NEXT:    br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP]], label [[INTERMEDIATE_EXIT:%.*]]
+; CHECK:       intermediate_exit:
+; CHECK-NEXT:    switch i32 1, label [[LOOP_2_BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[LOOP_3_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br label %intermediate_loop
+
+intermediate_loop:
+  br i1 %cond3, label %intermediate_loop, label %intermediate_exit
+
+intermediate_exit:
+  switch i32 1, label %loop_2_backedge [i32 0, label %loop_3_backedge]
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_complex_subloop_branch_from_inner_to_parent(i1 %cond1, i1 %cond2, i1 %cond3, i32 %N) {
+; CHECK-LABEL: @intermediate_complex_subloop_branch_from_inner_to_parent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP:%.*]]
+; CHECK:       intermediate_loop:
+; CHECK-NEXT:    br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE:%.*]], label [[INTERMEDIATE_BLOCK:%.*]]
+; CHECK:       intermediate_loop.backedge:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP]]
+; CHECK:       intermediate_block:
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE]], label [[INTERMEDIATE_EXIT:%.*]]
+; CHECK:       intermediate_exit:
+; CHECK-NEXT:    br i1 false, label [[LOOP_3_BACKEDGE]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br label %intermediate_loop
+
+intermediate_loop:
+  br i1 %cond3, label %intermediate_loop, label %intermediate_block
+
+intermediate_block:
+  br i1 %cond2, label %intermediate_loop, label %intermediate_exit
+
+intermediate_exit:
+  br i1 false, label %loop_3_backedge, label %loop_2_backedge
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_complex_subloop_switch_from_inner_to_parent(i1 %cond1, i1 %cond2, i1 %cond3, i32 %N) {
+; CHECK-LABEL: @intermediate_complex_subloop_switch_from_inner_to_parent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP:%.*]]
+; CHECK:       intermediate_loop:
+; CHECK-NEXT:    br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE:%.*]], label [[INTERMEDIATE_BLOCK:%.*]]
+; CHECK:       intermediate_loop.backedge:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP]]
+; CHECK:       intermediate_block:
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE]], label [[INTERMEDIATE_EXIT:%.*]]
+; CHECK:       intermediate_exit:
+; CHECK-NEXT:    switch i32 1, label [[LOOP_2_BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[LOOP_3_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br label %intermediate_loop
+
+intermediate_loop:
+  br i1 %cond3, label %intermediate_loop, label %intermediate_block
+
+intermediate_block:
+  br i1 %cond2, label %intermediate_loop, label %intermediate_exit
+
+intermediate_exit:
+  switch i32 1, label %loop_2_backedge [i32 0, label %loop_3_backedge]
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+
+define i32 @intermediate_complex_subloop_branch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i1 %cond3, i32 %N) {
+; CHECK-LABEL: @intermediate_complex_subloop_branch_from_inner_to_grandparent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP:%.*]]
+; CHECK:       intermediate_loop:
+; CHECK-NEXT:    br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE:%.*]], label [[INTERMEDIATE_BLOCK:%.*]]
+; CHECK:       intermediate_loop.backedge:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP]]
+; CHECK:       intermediate_block:
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE]], label [[INTERMEDIATE_EXIT:%.*]]
+; CHECK:       intermediate_exit:
+; CHECK-NEXT:    br i1 false, label [[LOOP_3_BACKEDGE]], label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE_LOOPEXIT1:%.*]]
+; CHECK:       loop_1_backedge.loopexit:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge.loopexit1:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br label %intermediate_loop
+
+intermediate_loop:
+  br i1 %cond3, label %intermediate_loop, label %intermediate_block
+
+intermediate_block:
+  br i1 %cond2, label %intermediate_loop, label %intermediate_exit
+
+intermediate_exit:
+  br i1 false, label %loop_3_backedge, label %loop_1_backedge
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
+
+define i32 @intermediate_complex_subloop_switch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i1 %cond3, i32 %N) {
+; CHECK-LABEL: @intermediate_complex_subloop_switch_from_inner_to_grandparent(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop_1:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_1_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop_2:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop_3:
+; CHECK-NEXT:    [[K:%.*]] = phi i32 [ 0, [[LOOP_2]] ], [ [[K_NEXT:%.*]], [[LOOP_3_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]]
+; CHECK:       intermediate:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP:%.*]]
+; CHECK:       intermediate_loop:
+; CHECK-NEXT:    br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE:%.*]], label [[INTERMEDIATE_BLOCK:%.*]]
+; CHECK:       intermediate_loop.backedge:
+; CHECK-NEXT:    br label [[INTERMEDIATE_LOOP]]
+; CHECK:       intermediate_block:
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE]], label [[INTERMEDIATE_EXIT:%.*]]
+; CHECK:       intermediate_exit:
+; CHECK-NEXT:    switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[LOOP_3_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       loop_3_backedge:
+; CHECK-NEXT:    [[K_NEXT]] = add i32 [[K]], 1
+; CHECK-NEXT:    br i1 [[COND2]], label [[LOOP_3]], label [[LOOP_2_BACKEDGE]]
+; CHECK:       loop_2_backedge:
+; CHECK-NEXT:    [[J_NEXT]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp slt i32 [[J_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP_2]], label [[LOOP_1_BACKEDGE_LOOPEXIT1:%.*]]
+; CHECK:       loop_1_backedge.loopexit:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge.loopexit1:
+; CHECK-NEXT:    br label [[LOOP_1_BACKEDGE]]
+; CHECK:       loop_1_backedge:
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_1]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP_1_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_LCSSA]]
+;
+preheader:
+  br label %loop_1
+
+loop_1:
+  %i = phi i32 [ 0, %preheader ], [ %i.next, %loop_1_backedge ]
+  br label %loop_2
+
+loop_2:
+  %j = phi i32 [ 0, %loop_1 ], [ %j.next, %loop_2_backedge ]
+  br label %loop_3
+
+loop_3:
+  %k = phi i32 [ 0, %loop_2 ], [ %k.next, %loop_3_backedge ]
+  br i1 %cond1, label %loop_3_backedge, label %intermediate
+
+intermediate:
+  br label %intermediate_loop
+
+intermediate_loop:
+  br i1 %cond3, label %intermediate_loop, label %intermediate_block
+
+intermediate_block:
+  br i1 %cond2, label %intermediate_loop, label %intermediate_exit
+
+intermediate_exit:
+  switch i32 1, label %loop_1_backedge [i32 0, label %loop_3_backedge]
+
+loop_3_backedge:
+  %k.next = add i32 %k, 1
+  br i1 %cond2, label %loop_3, label %loop_2_backedge
+
+loop_2_backedge:
+  %j.next = add i32 %j, 1
+  %c_2 = icmp slt i32 %j.next, %N
+  br i1 %c_2, label %loop_2, label %loop_1_backedge
+
+loop_1_backedge:
+  %i.next = add i32 %i, 1
+  %c_1 = icmp slt i32 %i.next, %N
+  br i1 %c_1, label %loop_1, label %exit
+
+exit:
+  ret i32 %i
+}
diff --git a/test/Transforms/LoopSimplifyCFG/irreducible_cfg.ll b/test/Transforms/LoopSimplifyCFG/irreducible_cfg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e7c8afc389d23a71f1e1a299359754d7a74358b8
--- /dev/null
+++ b/test/Transforms/LoopSimplifyCFG/irreducible_cfg.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(simplify-cfg)' -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; This test has irreducible CFG, and RPO may be the following:
+; Header, Dead, Irreducible2, Latch, Irreducible3, Irreducible1.
+; As result, we will process Irreducible2 before its predecessor Irreducible1.
+; The current algorithm gets confused in this case. We may support irreducible
+; CFG in the future.
+define void @irreducible_cfg(i1 %cond) {
+; CHECK-LABEL: @irreducible_cfg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    br i1 false, label [[DEAD:%.*]], label [[IRREDUCIBLE1:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    br label [[IRREDUCIBLE2:%.*]]
+; CHECK:       irreducible2:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[LATCH:%.*]], label [[IRREDUCIBLE3:%.*]]
+; CHECK:       latch:
+; CHECK-NEXT:    br label [[HEADER]]
+; CHECK:       irreducible3:
+; CHECK-NEXT:    br label [[IRREDUCIBLE1]]
+; CHECK:       irreducible1:
+; CHECK-NEXT:    br label [[IRREDUCIBLE2]]
+;
+entry:
+  br label %header
+
+header:                                             ; preds = %latch, %entry
+  br i1 false, label %dead, label %irreducible1
+
+dead:                                          ; preds = %header
+  br label %irreducible2
+
+irreducible2:                                             ; preds = %irreducible1, %dead
+  br i1 %cond, label %latch, label %irreducible3
+
+latch:                                         ; preds = %irreducible2
+  br label %header
+
+irreducible3:                                           ; preds = %irreducible2
+  br label %irreducible1
+
+irreducible1:                                          ; preds = %irreducible3, %header
+  br label %irreducible2
+}
diff --git a/test/Transforms/LoopSimplifyCFG/lcssa.ll b/test/Transforms/LoopSimplifyCFG/lcssa.ll
new file mode 100644
index 0000000000000000000000000000000000000000..82920f153044bcfdf578e5da27592ce696b96068
--- /dev/null
+++ b/test/Transforms/LoopSimplifyCFG/lcssa.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(simplify-cfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+define void @c() {
+; CHECK-LABEL: @c(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[D:%.*]]
+; CHECK:       d.loopexit:
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_COND:%.*]] ]
+; CHECK-NEXT:    br label [[D]]
+; CHECK:       d:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[DOTLCSSA]], [[D_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TMP1]] = phi i32 [ [[TMP0]], [[D]] ], [ 0, [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[TOBOOL2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL2]], label [[IF_END]], label [[D_LOOPEXIT]]
+; CHECK:       if.end:
+; CHECK-NEXT:    br label [[FOR_COND]]
+;
+entry:
+  br label %d
+
+d.loopexit:                                       ; preds = %if.end.7, %for.body
+  %.lcssa = phi i32 [ %1, %for.body ], [ 0, %if.end.7 ]
+  br label %d
+
+d:                                                ; preds = %d.loopexit, %entry
+  %0 = phi i32 [ undef, %entry ], [ %.lcssa, %d.loopexit ]
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end.8, %d
+  %1 = phi i32 [ %0, %d ], [ 0, %if.end.8 ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.cond
+  %tobool2 = icmp eq i32 %1, 0
+  br i1 %tobool2, label %if.end, label %d.loopexit
+
+if.end:                                           ; preds = %for.body
+  br label %if.end.7
+
+if.end.7:                                         ; preds = %if.end
+  br i1 true, label %if.end.8, label %d.loopexit
+
+if.end.8:                                         ; preds = %if.end.7
+  br label %for.cond
+}
diff --git a/test/Transforms/LoopSimplifyCFG/phi_with_duplicating_inputs.ll b/test/Transforms/LoopSimplifyCFG/phi_with_duplicating_inputs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fc1f5578f107f62dfa14a69f966c56e411c76aaf
--- /dev/null
+++ b/test/Transforms/LoopSimplifyCFG/phi_with_duplicating_inputs.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; This is currently failing because of bug in LoopSimplifyCFG. It does not update
+; duplicating Phi inputs properly.
+; REQUIRES: asserts
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(simplify-cfg)' -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s
+
+target datalayout = "P40"
+
+@a = external global i16, align 1
+
+define void @f1(i1 %cond) {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* @a, align 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i16 [[TMP0]], 0
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[C_1:%.*]] = phi i16 [ 2, [[IF_THEN]] ], [ 1, [[FOR_COND]] ]
+; CHECK-NEXT:    br label [[FOR_COND]]
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  br i1 %cond, label %if.then, label %for.inc
+
+if.then:
+  %0 = load i16, i16* @a, align 1
+  %tobool = icmp ne i16 %0, 0
+  br i1 %tobool, label %for.inc, label %for.inc
+
+for.inc:
+  %c.1 = phi i16 [ 2, %if.then ], [ 2, %if.then ], [ 1, %for.cond ]
+  br label %for.cond
+}
diff --git a/test/Transforms/LoopSimplifyCFG/pr39783.ll b/test/Transforms/LoopSimplifyCFG/pr39783.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9ebcc2a66a9353cbd64699f5fba2dabdf1075ded
--- /dev/null
+++ b/test/Transforms/LoopSimplifyCFG/pr39783.ll
@@ -0,0 +1,110 @@
+; REQUIRES: asserts
+; RUN: opt -march=z13 -S -loop-simplifycfg -enable-mssa-loop-dependency -enable-loop-simplifycfg-term-folding -verify-memoryssa 2>&1 < %s | FileCheck %s
+target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+
+@global = external dso_local local_unnamed_addr global i8, align 2
+@global.1 = external dso_local local_unnamed_addr global i32, align 4
+@global.2 = external dso_local local_unnamed_addr global i32, align 4
+@global.3 = external dso_local local_unnamed_addr global i16, align 2
+@global.4 = external dso_local local_unnamed_addr global i32, align 4
+
+; CHECK-LABEL: @test_01(
+
+define internal fastcc void @test_01() unnamed_addr {
+bb:
+  %tmp = load i32, i32* @global.2, align 4
+  %tmp1 = icmp eq i32 %tmp, 0
+  br i1 %tmp1, label %bb3, label %bb2
+
+bb2:                                              ; preds = %bb
+  br label %bb7
+
+bb3:                                              ; preds = %bb
+  br label %bb4
+
+bb4:                                              ; preds = %bb6, %bb3
+  br i1 true, label %bb5, label %bb6
+
+bb5:                                              ; preds = %bb4
+  store i16 0, i16* @global.3, align 2
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb4
+  br label %bb4
+
+bb7:                                              ; preds = %bb7, %bb2
+  %tmp8 = phi i32 [ 1, %bb7 ], [ 0, %bb2 ]
+  %tmp9 = icmp eq i32 %tmp8, 0
+  br i1 %tmp9, label %bb7, label %bb10
+
+bb10:                                             ; preds = %bb7
+  br label %bb11
+
+bb11:                                             ; preds = %bb13, %bb10
+  %tmp12 = icmp ult i32 %tmp, 6
+  br i1 %tmp12, label %bb13, label %bb14
+
+bb13:                                             ; preds = %bb11
+  store i32 0, i32* @global.1, align 4
+  br label %bb11
+
+bb14:                                             ; preds = %bb11
+  ret void
+}
+
+@global.5 = external dso_local local_unnamed_addr global i16, align 2
+
+declare dso_local void @spam() local_unnamed_addr
+
+declare dso_local void @blam() local_unnamed_addr
+
+declare dso_local i64 @quux.1() local_unnamed_addr
+
+declare dso_local void @bar() local_unnamed_addr
+
+; CHECK-LABEL: @test_02(
+
+define dso_local void @test_02(i8 signext %arg) local_unnamed_addr {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb16, %bb
+  %tmp = phi i8 [ %arg, %bb ], [ %tmp17, %bb16 ]
+  %tmp2 = load i16, i16* @global.5, align 2
+  %tmp3 = icmp ugt i16 %tmp2, 56
+  br i1 %tmp3, label %bb4, label %bb18
+
+bb4:                                              ; preds = %bb1
+  %tmp5 = tail call i64 @quux.1()
+  %tmp6 = icmp eq i64 %tmp5, 0
+  br i1 %tmp6, label %bb13, label %bb7
+
+bb7:                                              ; preds = %bb4
+  br label %bb8
+
+bb8:                                              ; preds = %bb8, %bb7
+  %tmp9 = phi i32 [ 26, %bb7 ], [ %tmp10, %bb8 ]
+  tail call void @bar()
+  %tmp10 = add nsw i32 %tmp9, -1
+  %tmp11 = icmp eq i32 %tmp10, 12
+  br i1 %tmp11, label %bb12, label %bb8
+
+bb12:                                             ; preds = %bb8
+  br i1 false, label %bb14, label %bb16
+
+bb13:                                             ; preds = %bb4
+  tail call void @spam()
+  br label %bb14
+
+bb14:                                             ; preds = %bb13, %bb12
+  %tmp15 = phi i8 [ -23, %bb12 ], [ %tmp, %bb13 ]
+  tail call void @blam()
+  br label %bb16
+
+bb16:                                             ; preds = %bb14, %bb12
+  %tmp17 = phi i8 [ %tmp15, %bb14 ], [ -23, %bb12 ]
+  br label %bb1
+
+bb18:                                             ; preds = %bb1
+  ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll b/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
index 585759dd178795906fe641550e1293d1c218d5bc..04ad762df99acf9bd29a3c5be3feece65b32b90b 100644
--- a/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
+++ b/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
@@ -2,45 +2,10 @@
 
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
 
-; LSR doesn't consider bumping a pointer by constants outside the loop when the
-; constants fit as immediate add operands. The constants are re-associated as an
-; unfolded offset rather than a register and are not combined later with
-; loop-invariant registers. For large-enough constants LSR produces better
-; solutions for these test cases, with test1 switching from:
-;
-; The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 scale cost, plus 4 imm cost, plus 1 setup cost:
-;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
-;     -7 + reg({(7 + %start)<nsw>,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
-;     reg(%arr) + 4*reg({(7 + %start)<nsw>,+,1}<nsw><%for.body>)
-;
-; to:
-;
-; The chosen solution requires 1 instruction 2 regs, with addrec cost 1, plus 1 scale cost, plus 1 setup cost:
-;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
-;     reg((88888 + %arr)) + 4*reg({%start,+,1}<nsw><%for.body>)
-;
-; and test2 switching from:
-;
-; The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 base add, plus 1 scale cost:
-;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Basic, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
-;     reg(%arr) + 4*reg({%start,+,1}<nsw><%for.body>) + imm(28)
-;
-; to:
-;
-; The chosen solution requires 1 instruction 2 regs, with addrec cost 1, plus 1 scale cost, plus 1 setup cost:
-;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Basic, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
-;     reg((88888 + %arr)) + 4*reg({%start,+,1}<nsw><%for.body>)
+; Test LSR for giving small constants, which get re-associated as unfolded
+; offset, a chance to get combined with loop-invariant registers (same as
+; large constants which do not fit as add immediate operands). LSR
+; favors here to bump the base pointer outside the loop.
 
 ; float test(float *arr, long long start, float threshold) {
 ;   for (long long i = start; i != 0; ++i) {
@@ -56,17 +21,16 @@ define float @test1(float* nocapture readonly %arr, i64 %start, float %threshold
 ; CHECK-NEXT:    fmov s2, #-7.00000000
 ; CHECK-NEXT:    cbz x1, .LBB0_5
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    add x8, x1, #7 // =7
+; CHECK-NEXT:    add x8, x0, #28 // =28
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr s1, [x0, x8, lsl #2]
+; CHECK-NEXT:    ldr s1, [x8, x1, lsl #2]
 ; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    b.gt .LBB0_6
 ; CHECK-NEXT:  // %bb.3: // %for.cond
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    add x8, x8, #1 // =1
-; CHECK-NEXT:    cmp x8, #7 // =7
-; CHECK-NEXT:    b.ne .LBB0_2
+; CHECK-NEXT:    add x1, x1, #1 // =1
+; CHECK-NEXT:    cbnz x1, .LBB0_2
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
@@ -104,26 +68,27 @@ define float @test2(float* nocapture readonly %arr, i64 %start, float %threshold
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s2, #-7.00000000
-; CHECK-NEXT:    cbz x1, .LBB1_4
-; CHECK-NEXT:  .LBB1_1: // %for.body
+; CHECK-NEXT:    cbz x1, .LBB1_5
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    add x8, x0, #28 // =28
+; CHECK-NEXT:  .LBB1_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x8, x0, x1, lsl #2
-; CHECK-NEXT:    ldr s1, [x8, #28]
+; CHECK-NEXT:    ldr s1, [x8, x1, lsl #2]
 ; CHECK-NEXT:    scvtf s3, x1
 ; CHECK-NEXT:    fadd s3, s3, s0
 ; CHECK-NEXT:    fcmp s1, s3
-; CHECK-NEXT:    b.gt .LBB1_5
-; CHECK-NEXT:  // %bb.2: // %for.cond
-; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    b.gt .LBB1_6
+; CHECK-NEXT:  // %bb.3: // %for.cond
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    add x1, x1, #1 // =1
-; CHECK-NEXT:    cbnz x1, .LBB1_1
-; CHECK-NEXT:  // %bb.3:
+; CHECK-NEXT:    cbnz x1, .LBB1_2
+; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:  .LBB1_5:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_5: // %cleanup4
+; CHECK-NEXT:  .LBB1_6: // %cleanup4
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
diff --git a/test/Transforms/LoopStrengthReduce/ARM/complexity.ll b/test/Transforms/LoopStrengthReduce/ARM/complexity.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f2cc0a5a6f8ffb693124712e33a9e380617debf0
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/ARM/complexity.ll
@@ -0,0 +1,118 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
+; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s --check-prefix=CHECK-COMPLEX
+
+; CHECK-DEFAULT-LABEL: for.body12.us.us:
+; CHECK-DEFAULT: phi i32
+; CHECK-DEFAULT: [[LSR_IV:%[^ ]+]] = phi i32 [ [[LSR_IV_NEXT:%[^ ]+]], %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
+; CHECK-DEFAULT: phi i32
+; CHECK-DEFAULT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8
+
+; CHECK-COMPLEX-LABEL: for.body12.us.us:
+; CHECK-COMPLEX: phi i32
+; CHECK-COMPLEX: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ]
+; CHECK-COMPLEX: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ]
+; CHECK-COMPLEX: phi i32
+; CHECK-COMPLEX: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4
+; CHECK-COMPLEX: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4
+
+define void @convolve(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, i32 %filter_dim, i32 %out_width, i32 %out_height, i32** nocapture readonly %convolved) {
+entry:
+  %cmp92 = icmp eq i32 %out_height, 0
+  br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %xtraiter = and i32 %filter_dim, 3
+  %unroll_iter = sub i32 %filter_dim, %xtraiter
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
+  %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
+  %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
+  %tmp3 = load i32*, i32** %arrayidx22, align 4
+  br label %for.cond9.preheader.us.us.preheader
+
+for.cond9.preheader.us.us.preheader:              ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.lr.ph
+  %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
+  br label %for.cond9.preheader.us.us
+
+for.cond9.preheader.us.us:                        ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us, %for.cond9.preheader.us.us.preheader
+  %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
+  %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
+  %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
+  %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
+  %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
+  %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
+  %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
+  br label %for.body12.us.us
+
+for.body12.us.us:                                 ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
+  %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
+  %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
+  %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
+  %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
+  %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
+  %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
+  %conv.us.us = sext i16 %tmp9 to i32
+  %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
+  %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
+  %conv17.us.us = sext i16 %tmp10 to i32
+  %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
+  %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
+  %inc.us.us = or i32 %filter_x.053.us.us, 1
+  %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
+  %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
+  %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
+  %conv.us.us.1 = sext i16 %tmp11 to i32
+  %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
+  %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
+  %conv17.us.us.1 = sext i16 %tmp12 to i32
+  %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
+  %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
+  %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
+  %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
+  %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
+  %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
+  %conv.us.us.2 = sext i16 %tmp13 to i32
+  %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
+  %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
+  %conv17.us.us.2 = sext i16 %tmp14 to i32
+  %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
+  %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
+  %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
+  %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
+  %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
+  %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
+  %conv.us.us.3 = sext i16 %tmp15 to i32
+  %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
+  %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
+  %conv17.us.us.3 = sext i16 %tmp16 to i32
+  %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
+  %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
+  %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
+  %niter.nsub.3 = add i32 %niter, -4
+  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
+  br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us
+
+for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
+  %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
+  %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
+  br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us
+
+for.cond5.for.cond.cleanup7_crit_edge.us:         ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us
+  %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
+  store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
+  %add25.us = add nuw i32 %res_x.060.us, 1
+  %exitcond99 = icmp eq i32 %add25.us, %out_width
+  br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader
+
+for.cond.cleanup3:                                ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.preheader, %for.cond1.preheader
+  %add28 = add nuw i32 %res_y.093, 1
+  %exitcond100 = icmp eq i32 %add28, %out_height
+  br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
+  ret void
+}
+
diff --git a/test/Transforms/LoopStrengthReduce/lsr-comp-time.ll b/test/Transforms/LoopStrengthReduce/lsr-comp-time.ll
index 959d517a5401d7994116bc1412c71913a3a1104c..6221c4a2b04ca799fc59674ce880f1b4a209f38a 100644
--- a/test/Transforms/LoopStrengthReduce/lsr-comp-time.ll
+++ b/test/Transforms/LoopStrengthReduce/lsr-comp-time.ll
@@ -1,4 +1,6 @@
 ; RUN: opt -loop-reduce -S < %s | FileCheck %s
+; RUN: opt -loop-reduce -lsr-complexity-limit=2147483647 -S < %s | FileCheck %s
+
 ; Test compile time should be <1sec (no hang).
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
new file mode 100644
index 0000000000000000000000000000000000000000..21917f5959cba376f8f92d10a3fb0dd6aacc4a3e
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
@@ -0,0 +1,55 @@
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
+
+; This test is adapted from the n-body test of the LLVM test-suite: A bug in
+; r345114 caused LSR to generate incorrect code. The test verifies that the
+; induction variable generated for the inner loop depends on the induction
+; variable of the outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.planet.0.3.6.11.12.15.16.17.24.25.26.33.44 = type { double, double, double, double, double, double, double }
+
+; Function Attrs: nounwind uwtable
+define dso_local void @advance(i32 %nbodies, %struct.planet.0.3.6.11.12.15.16.17.24.25.26.33.44* nocapture %bodies) local_unnamed_addr #0 {
+; CHECK-LABEL: @advance(
+; CHECK:  for.cond.loopexit:
+; CHECK:    [[LSR_IV_NEXT:%.*]] = add i64 [[LSR_IV:%.*]], -1
+; CHECK:    br label %for.body
+; CHECK:  for.body:
+; CHECK:    [[LSR_IV]] = phi i64 [ [[LSR_IV_NEXT]]
+; CHECK:    br label %for.body3
+; CHECK:  for.body3:
+; CHECK:    [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], %for.body3 ], [ [[LSR_IV]], %for.body ]
+; CHECK:    [[LSR_IV_NEXT2]] = add i64 [[LSR_IV1]], -1
+; CHECK:    [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT2]], 0
+; CHECK:    br i1 [[EXITCOND]], label %for.cond.loopexit, label %for.body3
+;
+entry:
+  %wide.trip.count = zext i32 %nbodies to i64
+  br label %for.body
+
+for.cond.loopexit:                                ; preds = %for.body3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.cond.loopexit, %entry
+  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.cond.loopexit ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv98 = phi i64 [ %indvars.iv, %for.body ], [ %indvars.iv.next99, %for.body3 ]
+  %z9 = getelementptr inbounds %struct.planet.0.3.6.11.12.15.16.17.24.25.26.33.44, %struct.planet.0.3.6.11.12.15.16.17.24.25.26.33.44* %bodies, i64 %indvars.iv98, i32 2
+  %tmp = load double, double* %z9, align 8, !tbaa !0
+  %indvars.iv.next99 = add nuw nsw i64 %indvars.iv98, 1
+  %exitcond = icmp eq i64 %indvars.iv.next99, %wide.trip.count
+  br i1 %exitcond, label %for.cond.loopexit, label %for.body3
+}
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!1, !2, i64 16}
+!1 = !{!"planet", !2, i64 0, !2, i64 8, !2, i64 16, !2, i64 24, !2, i64 32, !2, i64 40, !2, i64 48}
+!2 = !{!"double", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopTransformWarning/distribution-remarks-missed.ll b/test/Transforms/LoopTransformWarning/distribution-remarks-missed.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c60af79976722ea613e6c81d9d463681875a557c
--- /dev/null
+++ b/test/Transforms/LoopTransformWarning/distribution-remarks-missed.ll
@@ -0,0 +1,99 @@
+; Legacy pass manager
+; RUN: opt < %s -transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 2>&1 | FileCheck %s
+; RUN: opt < %s -transform-warning -disable-output -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+; New pass manager
+; RUN: opt < %s -passes=transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=transform-warning -disable-output -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+
+; CHECK: warning: source.cpp:19:5: loop not distributed: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; YAML:     --- !Failure
+; YAML-NEXT: Pass:            transform-warning
+; YAML-NEXT: Name:            FailedRequestedDistribution
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not distributed: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering'
+; YAML-NEXT: ...
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) !dbg !8 {
+entry:
+  %cmp9 = icmp sgt i32 %Length, 0, !dbg !32
+  br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32
+
+for.body.preheader:                          
+  br label %for.body, !dbg !35
+
+for.body:                                    
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv, !dbg !35
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !35, !tbaa !18
+  %idxprom1 = sext i32 %0 to i64, !dbg !35
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !35
+  store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !50
+
+for.end.loopexit:                            
+  br label %for.end
+
+for.end:                                      
+  ret void, !dbg !36
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 10, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = distinct !DISubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 16, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.5.0"}
+!12 = !DILocation(line: 3, column: 8, scope: !13)
+!13 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!16 = !DILocation(line: 4, column: 5, scope: !17)
+!17 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !13)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !DILocation(line: 5, column: 9, scope: !23)
+!23 = distinct !DILexicalBlock(line: 5, column: 9, file: !1, scope: !17)
+!24 = !DILocation(line: 8, column: 1, scope: !4)
+!25 = !DILocation(line: 12, column: 8, scope: !26)
+!26 = distinct !DILexicalBlock(line: 12, column: 3, file: !1, scope: !7)
+!30 = !DILocation(line: 13, column: 5, scope: !26)
+!31 = !DILocation(line: 14, column: 1, scope: !7)
+!32 = !DILocation(line: 18, column: 8, scope: !33)
+!33 = distinct !DILexicalBlock(line: 18, column: 3, file: !1, scope: !8)
+!35 = !DILocation(line: 19, column: 5, scope: !33)
+!36 = !DILocation(line: 20, column: 1, scope: !8)
+!37 = distinct !DILexicalBlock(line: 24, column: 3, file: !1, scope: !46)
+!38 = !DILocation(line: 27, column: 3, scope: !37)
+!39 = !DILocation(line: 31, column: 3, scope: !37)
+!40 = !DILocation(line: 28, column: 9, scope: !37)
+!41 = !DILocation(line: 29, column: 11, scope: !37)
+!42 = !DILocation(line: 29, column: 7, scope: !37)
+!43 = !DILocation(line: 27, column: 32, scope: !37)
+!44 = !DILocation(line: 27, column: 30, scope: !37)
+!45 = !DILocation(line: 27, column: 21, scope: !37)
+!46 = distinct !DISubprogram(name: "test_multiple_failures", line: 26, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 26, file: !1, scope: !5, type: !6, retainedNodes: !2)
+
+!50 = !{!50, !{!"llvm.loop.distribute.enable"}}
diff --git a/test/Transforms/LoopTransformWarning/optnone.ll b/test/Transforms/LoopTransformWarning/optnone.ll
new file mode 100644
index 0000000000000000000000000000000000000000..866dac8e9d882cd52716f5ee33921d2bb1289fcd
--- /dev/null
+++ b/test/Transforms/LoopTransformWarning/optnone.ll
@@ -0,0 +1,50 @@
+; Legacy pass manager
+; RUN: opt -transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning < %s 2>&1 | FileCheck -allow-empty %s
+;
+; New pass manager
+; RUN: opt -passes=transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning < %s 2>&1 | FileCheck -allow-empty %s
+;
+; Verify that no transformation warnings are emitted for functions with
+; 'optnone' attribute.
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @func(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 {
+entry:
+  %cmp9 = icmp sgt i32 %Length, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %1, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !0
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+attributes #0 = { noinline optnone }
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.unroll.enable"}
+!2 = !{!"llvm.loop.distribute.enable"}
+!3 = !{!"llvm.loop.unroll_and_jam.enable"}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+; CHECK-NOT: warning
diff --git a/test/Transforms/LoopTransformWarning/unrollandjam-remarks-missed.ll b/test/Transforms/LoopTransformWarning/unrollandjam-remarks-missed.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1c32fb17ca4d338c40fd6a90734605bdba2075a6
--- /dev/null
+++ b/test/Transforms/LoopTransformWarning/unrollandjam-remarks-missed.ll
@@ -0,0 +1,99 @@
+; Legacy pass manager
+; RUN: opt < %s -transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 2>&1 | FileCheck %s
+; RUN: opt < %s -transform-warning -disable-output -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+; New pass manager
+; RUN: opt < %s -passes=transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=transform-warning -disable-output -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+
+; CHECK: warning: source.cpp:19:5: loop not unroll-and-jammed: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; YAML:     --- !Failure
+; YAML-NEXT: Pass:            transform-warning
+; YAML-NEXT: Name:            FailedRequestedUnrollAndJamming
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not unroll-and-jammed: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering'
+; YAML-NEXT: ...
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) !dbg !8 {
+entry:
+  %cmp9 = icmp sgt i32 %Length, 0, !dbg !32
+  br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32
+
+for.body.preheader:                          
+  br label %for.body, !dbg !35
+
+for.body:                                    
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv, !dbg !35
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !35, !tbaa !18
+  %idxprom1 = sext i32 %0 to i64, !dbg !35
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !35
+  store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !50
+
+for.end.loopexit:                            
+  br label %for.end
+
+for.end:                                      
+  ret void, !dbg !36
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 10, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = distinct !DISubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 16, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.5.0"}
+!12 = !DILocation(line: 3, column: 8, scope: !13)
+!13 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!16 = !DILocation(line: 4, column: 5, scope: !17)
+!17 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !13)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !DILocation(line: 5, column: 9, scope: !23)
+!23 = distinct !DILexicalBlock(line: 5, column: 9, file: !1, scope: !17)
+!24 = !DILocation(line: 8, column: 1, scope: !4)
+!25 = !DILocation(line: 12, column: 8, scope: !26)
+!26 = distinct !DILexicalBlock(line: 12, column: 3, file: !1, scope: !7)
+!30 = !DILocation(line: 13, column: 5, scope: !26)
+!31 = !DILocation(line: 14, column: 1, scope: !7)
+!32 = !DILocation(line: 18, column: 8, scope: !33)
+!33 = distinct !DILexicalBlock(line: 18, column: 3, file: !1, scope: !8)
+!35 = !DILocation(line: 19, column: 5, scope: !33)
+!36 = !DILocation(line: 20, column: 1, scope: !8)
+!37 = distinct !DILexicalBlock(line: 24, column: 3, file: !1, scope: !46)
+!38 = !DILocation(line: 27, column: 3, scope: !37)
+!39 = !DILocation(line: 31, column: 3, scope: !37)
+!40 = !DILocation(line: 28, column: 9, scope: !37)
+!41 = !DILocation(line: 29, column: 11, scope: !37)
+!42 = !DILocation(line: 29, column: 7, scope: !37)
+!43 = !DILocation(line: 27, column: 32, scope: !37)
+!44 = !DILocation(line: 27, column: 30, scope: !37)
+!45 = !DILocation(line: 27, column: 21, scope: !37)
+!46 = distinct !DISubprogram(name: "test_multiple_failures", line: 26, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 26, file: !1, scope: !5, type: !6, retainedNodes: !2)
+
+!50 = !{!50, !{!"llvm.loop.unroll_and_jam.enable"}}
diff --git a/test/Transforms/LoopTransformWarning/unrolling-remarks-missed.ll b/test/Transforms/LoopTransformWarning/unrolling-remarks-missed.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6934486f06d4c1dc8c3a81c8927cc3eaa3867490
--- /dev/null
+++ b/test/Transforms/LoopTransformWarning/unrolling-remarks-missed.ll
@@ -0,0 +1,99 @@
+; Legacy pass manager
+; RUN: opt < %s -transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 2>&1 | FileCheck %s
+; RUN: opt < %s -transform-warning -disable-output -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+; New pass manager
+; RUN: opt < %s -passes=transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=transform-warning -disable-output -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+
+; CHECK: warning: source.cpp:19:5: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; YAML:     --- !Failure
+; YAML-NEXT: Pass:            transform-warning
+; YAML-NEXT: Name:            FailedRequestedUnrolling
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering'
+; YAML-NEXT: ...
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) !dbg !8 {
+entry:
+  %cmp9 = icmp sgt i32 %Length, 0, !dbg !32
+  br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32
+
+for.body.preheader:                          
+  br label %for.body, !dbg !35
+
+for.body:                                    
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv, !dbg !35
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !35, !tbaa !18
+  %idxprom1 = sext i32 %0 to i64, !dbg !35
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !35
+  store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !50
+
+for.end.loopexit:                            
+  br label %for.end
+
+for.end:                                      
+  ret void, !dbg !36
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 10, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = distinct !DISubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 16, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.5.0"}
+!12 = !DILocation(line: 3, column: 8, scope: !13)
+!13 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!16 = !DILocation(line: 4, column: 5, scope: !17)
+!17 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !13)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !DILocation(line: 5, column: 9, scope: !23)
+!23 = distinct !DILexicalBlock(line: 5, column: 9, file: !1, scope: !17)
+!24 = !DILocation(line: 8, column: 1, scope: !4)
+!25 = !DILocation(line: 12, column: 8, scope: !26)
+!26 = distinct !DILexicalBlock(line: 12, column: 3, file: !1, scope: !7)
+!30 = !DILocation(line: 13, column: 5, scope: !26)
+!31 = !DILocation(line: 14, column: 1, scope: !7)
+!32 = !DILocation(line: 18, column: 8, scope: !33)
+!33 = distinct !DILexicalBlock(line: 18, column: 3, file: !1, scope: !8)
+!35 = !DILocation(line: 19, column: 5, scope: !33)
+!36 = !DILocation(line: 20, column: 1, scope: !8)
+!37 = distinct !DILexicalBlock(line: 24, column: 3, file: !1, scope: !46)
+!38 = !DILocation(line: 27, column: 3, scope: !37)
+!39 = !DILocation(line: 31, column: 3, scope: !37)
+!40 = !DILocation(line: 28, column: 9, scope: !37)
+!41 = !DILocation(line: 29, column: 11, scope: !37)
+!42 = !DILocation(line: 29, column: 7, scope: !37)
+!43 = !DILocation(line: 27, column: 32, scope: !37)
+!44 = !DILocation(line: 27, column: 30, scope: !37)
+!45 = !DILocation(line: 27, column: 21, scope: !37)
+!46 = distinct !DISubprogram(name: "test_multiple_failures", line: 26, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 26, file: !1, scope: !5, type: !6, retainedNodes: !2)
+
+!50 = !{!50, !{!"llvm.loop.unroll.enable"}}
diff --git a/test/Transforms/LoopTransformWarning/vectorization-remarks-missed.ll b/test/Transforms/LoopTransformWarning/vectorization-remarks-missed.ll
new file mode 100644
index 0000000000000000000000000000000000000000..30cdbb56ff4ec19512a488466c17fcf817d079e0
--- /dev/null
+++ b/test/Transforms/LoopTransformWarning/vectorization-remarks-missed.ll
@@ -0,0 +1,113 @@
+; Legacy pass manager
+; RUN: opt < %s -transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 2>&1 | FileCheck %s
+; RUN: opt < %s -transform-warning -disable-output -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+; New pass manager
+; RUN: opt < %s -passes=transform-warning -disable-output -pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=transform-warning -disable-output -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+
+; C/C++ code for tests
+; void test(int *A, int Length) {
+; #pragma clang loop vectorize(enable) interleave(enable)
+;   for (int i = 0; i < Length; i++) {
+;     A[i] = i;
+;     if (A[i] > Length)
+;       break;
+;   }
+; }
+; File, line, and column should match those specified in the metadata
+; CHECK: warning: source.cpp:19:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; YAML:     --- !Failure
+; YAML-NEXT: Pass:            transform-warning
+; YAML-NEXT: Name:            FailedRequestedVectorization
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering'
+; YAML-NEXT: ...
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) !dbg !8 {
+entry:
+  %cmp9 = icmp sgt i32 %Length, 0, !dbg !32
+  br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32, !llvm.loop !34
+
+for.body.preheader:                          
+  br label %for.body, !dbg !35
+
+for.body:                                    
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv, !dbg !35
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !35, !tbaa !18
+  %idxprom1 = sext i32 %0 to i64, !dbg !35
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !35
+  store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !34
+
+for.end.loopexit:                            
+  br label %for.end
+
+for.end:                                      
+  ret void, !dbg !36
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 10, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = distinct !DISubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 16, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.5.0"}
+!12 = !DILocation(line: 3, column: 8, scope: !13)
+!13 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!14 = !{!14, !15, !15}
+!15 = !{!"llvm.loop.vectorize.enable", i1 true}
+!16 = !DILocation(line: 4, column: 5, scope: !17)
+!17 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !13)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !DILocation(line: 5, column: 9, scope: !23)
+!23 = distinct !DILexicalBlock(line: 5, column: 9, file: !1, scope: !17)
+!24 = !DILocation(line: 8, column: 1, scope: !4)
+!25 = !DILocation(line: 12, column: 8, scope: !26)
+!26 = distinct !DILexicalBlock(line: 12, column: 3, file: !1, scope: !7)
+!27 = !{!27, !28, !29}
+!28 = !{!"llvm.loop.interleave.count", i32 1}
+!29 = !{!"llvm.loop.vectorize.width", i32 1}
+!30 = !DILocation(line: 13, column: 5, scope: !26)
+!31 = !DILocation(line: 14, column: 1, scope: !7)
+!32 = !DILocation(line: 18, column: 8, scope: !33)
+!33 = distinct !DILexicalBlock(line: 18, column: 3, file: !1, scope: !8)
+!34 = !{!34, !15}
+!35 = !DILocation(line: 19, column: 5, scope: !33)
+!36 = !DILocation(line: 20, column: 1, scope: !8)
+!37 = distinct !DILexicalBlock(line: 24, column: 3, file: !1, scope: !46)
+!38 = !DILocation(line: 27, column: 3, scope: !37)
+!39 = !DILocation(line: 31, column: 3, scope: !37)
+!40 = !DILocation(line: 28, column: 9, scope: !37)
+!41 = !DILocation(line: 29, column: 11, scope: !37)
+!42 = !DILocation(line: 29, column: 7, scope: !37)
+!43 = !DILocation(line: 27, column: 32, scope: !37)
+!44 = !DILocation(line: 27, column: 30, scope: !37)
+!45 = !DILocation(line: 27, column: 21, scope: !37)
+!46 = distinct !DISubprogram(name: "test_multiple_failures", line: 26, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 26, file: !1, scope: !5, type: !6, retainedNodes: !2)
diff --git a/test/Transforms/LoopUnroll/disable-loop-unrolling_forced.ll b/test/Transforms/LoopUnroll/disable-loop-unrolling_forced.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9a0900df1a1ee2ef2ca7a3889dafd739c8728c86
--- /dev/null
+++ b/test/Transforms/LoopUnroll/disable-loop-unrolling_forced.ll
@@ -0,0 +1,30 @@
+; RUN: opt -disable-loop-unrolling -O1 -S < %s | FileCheck %s
+;
+; Check loop unrolling metadata is honored despite automatic unrolling
+; being disabled in the pass builder.
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @forced(
+; CHECK: load
+; CHECK: load
+define void @forced(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.unroll.enable"},
+                    !{!"llvm.loop.unroll.count", i32 2}}
diff --git a/test/Transforms/LoopUnroll/disable_nonforced.ll b/test/Transforms/LoopUnroll/disable_nonforced.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0678cca44d9adf6117a9d0a1b7df2f302c457b30
--- /dev/null
+++ b/test/Transforms/LoopUnroll/disable_nonforced.ll
@@ -0,0 +1,29 @@
+; RUN: opt -loop-unroll -unroll-count=2 -S < %s | FileCheck %s
+;
+; Check that the disable_nonforced loop property is honored by
+; loop unroll.
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced(
+; CHECK: load
+; CHECK-NOT: load
+define void @disable_nonforced(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}}
diff --git a/test/Transforms/LoopUnroll/disable_nonforced_count.ll b/test/Transforms/LoopUnroll/disable_nonforced_count.ll
new file mode 100644
index 0000000000000000000000000000000000000000..73517e5ffe747cb5f58c6bf1a833a00681906737
--- /dev/null
+++ b/test/Transforms/LoopUnroll/disable_nonforced_count.ll
@@ -0,0 +1,30 @@
+; RUN: opt -loop-unroll -unroll-count=2 -S < %s | FileCheck %s
+;
+; Check whether the llvm.loop.unroll.count loop property overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced_count(
+; CHECK: store
+; CHECK: store
+; CHECK-NOT: store
+define void @disable_nonforced_count(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.unroll.count", i32 2}}
diff --git a/test/Transforms/LoopUnroll/disable_nonforced_enable.ll b/test/Transforms/LoopUnroll/disable_nonforced_enable.ll
new file mode 100644
index 0000000000000000000000000000000000000000..75bbc3ed1493ba95ff6f2d71f24d9de7d6d7c982
--- /dev/null
+++ b/test/Transforms/LoopUnroll/disable_nonforced_enable.ll
@@ -0,0 +1,30 @@
+; RUN: opt -loop-unroll -unroll-count=2 -S < %s | FileCheck %s
+;
+; Check that the llvm.loop.unroll.enable loop property overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced_enable(
+; CHECK: store
+; CHECK: store
+; CHECK-NOT: store
+define void @disable_nonforced_enable(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.unroll.enable"}}
diff --git a/test/Transforms/LoopUnroll/disable_nonforced_full.ll b/test/Transforms/LoopUnroll/disable_nonforced_full.ll
new file mode 100644
index 0000000000000000000000000000000000000000..447108b257d42e82dac35e705f34d19f532c7cb2
--- /dev/null
+++ b/test/Transforms/LoopUnroll/disable_nonforced_full.ll
@@ -0,0 +1,32 @@
+; RUN: opt -loop-unroll -S < %s | FileCheck %s
+;
+; Check that the llvm.loop.unroll.full loop property overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced_full(
+; CHECK: store
+; CHECK: store
+; CHECK: store
+; CHECK: store
+; CHECK-NOT: store
+define void @disable_nonforced_full(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.unroll.full"}}
diff --git a/test/Transforms/LoopUnroll/followup.ll b/test/Transforms/LoopUnroll/followup.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8d26159563d6ff366fae4e2739f8f365b8cb237a
--- /dev/null
+++ b/test/Transforms/LoopUnroll/followup.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -S -loop-unroll -unroll-count=2 | FileCheck %s -check-prefixes=COUNT,COMMON
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=EPILOG,COMMON
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON
+;
+; Check that followup-attributes are applied after LoopUnroll.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.02
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+!1 = !{!"llvm.loop.unroll.followup_all", !{!"FollowupAll"}}
+!2 = !{!"llvm.loop.unroll.followup_unrolled", !{!"FollowupUnrolled"}}
+!3 = !{!"llvm.loop.unroll.followup_remainder", !{!"FollowupRemainder"}}
+!4 = distinct !{!4, !1, !2, !3}
+
+
+; COMMON-LABEL: @test(
+
+
+; COUNT: br i1 %exitcond.1, label %for.end.loopexit, label %for.body, !llvm.loop ![[LOOP:[0-9]+]]
+
+; COUNT: ![[FOLLOWUP_ALL:[0-9]+]] = !{!"FollowupAll"}
+; COUNT: ![[FOLLOWUP_UNROLLED:[0-9]+]] = !{!"FollowupUnrolled"}
+; COUNT: ![[LOOP]] = distinct !{![[LOOP]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_UNROLLED]]}
+
+
+; EPILOG: br i1 %niter.ncmp.7, label %for.end.loopexit.unr-lcssa.loopexit, label %for.body, !llvm.loop ![[LOOP_0:[0-9]+]]
+; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop ![[LOOP_2:[0-9]+]]
+
+; EPILOG: ![[LOOP_0]] = distinct !{![[LOOP_0]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_UNROLLED:[0-9]+]]}
+; EPILOG: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
+; EPILOG: ![[FOLLOWUP_UNROLLED]] = !{!"FollowupUnrolled"}
+; EPILOG: ![[LOOP_2]] = distinct !{![[LOOP_2]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_REMAINDER:[0-9]+]]}
+; EPILOG: ![[FOLLOWUP_REMAINDER]] = !{!"FollowupRemainder"}
+
+
+; PROLOG:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop ![[LOOP_0:[0-9]+]]
+; PROLOG:  br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_2:[0-9]+]]
+
+; PROLOG: ![[LOOP_0]] = distinct !{![[LOOP_0]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_REMAINDER:[0-9]+]]}
+; PROLOG: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
+; PROLOG: ![[FOLLOWUP_REMAINDER]] = !{!"FollowupRemainder"}
+; PROLOG: ![[LOOP_2]] = distinct !{![[LOOP_2]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_UNROLLED:[0-9]+]]}
+; PROLOG: ![[FOLLOWUP_UNROLLED]] = !{!"FollowupUnrolled"}
diff --git a/test/Transforms/LoopUnrollAndJam/disable_nonforced.ll b/test/Transforms/LoopUnrollAndJam/disable_nonforced.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c67ffb1b41414c2fa82ed22ed70878c245fc90e8
--- /dev/null
+++ b/test/Transforms/LoopUnrollAndJam/disable_nonforced.ll
@@ -0,0 +1,50 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=2 -S < %s | FileCheck %s
+;
+; Check that the disable_nonforced loop property is honored by
+; loop unroll-and-jam.
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: disable_nonforced
+; CHECK: load
+; CHECK-NOT: load
+define void @disable_nonforced(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !0
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.disable_nonforced"}}
diff --git a/test/Transforms/LoopUnrollAndJam/disable_nonforced_count.ll b/test/Transforms/LoopUnrollAndJam/disable_nonforced_count.ll
new file mode 100644
index 0000000000000000000000000000000000000000..13498cff06e6487dad75a85bbba8424d1962783e
--- /dev/null
+++ b/test/Transforms/LoopUnrollAndJam/disable_nonforced_count.ll
@@ -0,0 +1,52 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -S < %s | FileCheck %s
+;
+; Verify that the llvm.loop.unroll_and_jam.count loop property overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: @disable_nonforced_enable(
+; CHECK: load
+; CHECK: load
+; CHECK-NOT: load
+; CHECK: br i1
+define void @disable_nonforced_enable(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !0
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.unroll_and_jam.count", i32 2}}
diff --git a/test/Transforms/LoopUnrollAndJam/disable_nonforced_enable.ll b/test/Transforms/LoopUnrollAndJam/disable_nonforced_enable.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2194f6f1a484a6a2a9a9d8ae13b8517b3833834d
--- /dev/null
+++ b/test/Transforms/LoopUnrollAndJam/disable_nonforced_enable.ll
@@ -0,0 +1,52 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=2 -S < %s | FileCheck %s
+;
+; Verify that the llvm.loop.unroll_and_jam.enable loop property
+; overrides llvm.loop.disable_nonforced.
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: disable_nonforced_enable
+; CHECK: load
+; CHECK: load
+; CHECK-NOT: load
+; CHECK: br i1
+define void @disable_nonforced_enable(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !0
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.unroll_and_jam.enable"}}
diff --git a/test/Transforms/LoopUnrollAndJam/followup.ll b/test/Transforms/LoopUnrollAndJam/followup.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1c0975b1ce888c7f9c297c1daedc0fbde8108f3d
--- /dev/null
+++ b/test/Transforms/LoopUnrollAndJam/followup.ll
@@ -0,0 +1,66 @@
+; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s
+;
+; Check that followup attributes are set in the new loops.
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @followup(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !0
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !1, !2, !3, !4, !6}
+!1 = !{!"llvm.loop.unroll_and_jam.enable"}
+!2 = !{!"llvm.loop.unroll_and_jam.followup_outer", !{!"FollowupOuter"}}
+!3 = !{!"llvm.loop.unroll_and_jam.followup_inner", !{!"FollowupInner"}}
+!4 = !{!"llvm.loop.unroll_and_jam.followup_all", !{!"FollowupAll"}}
+!6 = !{!"llvm.loop.unroll_and_jam.followup_remainder_inner", !{!"FollowupRemainderInner"}}
+
+
+; CHECK: br i1 %exitcond.3, label %for.latch, label %for.inner, !llvm.loop ![[LOOP_INNER:[0-9]+]]
+; CHECK: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit, label %for.outer, !llvm.loop ![[LOOP_OUTER:[0-9]+]]
+; CHECK: br i1 %exitcond.epil, label %for.latch.epil, label %for.inner.epil, !llvm.loop ![[LOOP_REMAINDER_INNER:[0-9]+]]
+; CHECK: br i1 %exitcond.epil.1, label %for.latch.epil.1, label %for.inner.epil.1, !llvm.loop ![[LOOP_REMAINDER_INNER]]
+; CHECK: br i1 %exitcond.epil.2, label %for.latch.epil.2, label %for.inner.epil.2, !llvm.loop ![[LOOP_REMAINDER_INNER]]
+
+; CHECK: ![[LOOP_INNER]] = distinct !{![[LOOP_INNER]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_INNER:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
+; CHECK: ![[FOLLOWUP_INNER]] = !{!"FollowupInner"}
+; CHECK: ![[LOOP_OUTER]] = distinct !{![[LOOP_OUTER]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_OUTER:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_OUTER]] = !{!"FollowupOuter"}
+; CHECK: ![[LOOP_REMAINDER_INNER]] = distinct !{![[LOOP_REMAINDER_INNER]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_REMAINDER_INNER:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_REMAINDER_INNER]] = !{!"FollowupRemainderInner"}
diff --git a/test/Transforms/LoopUnrollAndJam/pragma.ll b/test/Transforms/LoopUnrollAndJam/pragma.ll
index d45a04cdea843fd6c06de5cb18628e25430ca79f..1babfcf72f20847810639ebbe62f52d7e657202a 100644
--- a/test/Transforms/LoopUnrollAndJam/pragma.ll
+++ b/test/Transforms/LoopUnrollAndJam/pragma.ll
@@ -316,4 +316,4 @@ for.end:
 !8 = distinct !{!"llvm.loop.unroll.disable"}
 !9 = distinct !{!9, !10}
 !10 = distinct !{!"llvm.loop.unroll.enable"}
-!11 = distinct !{!11, !8, !6}
\ No newline at end of file
+!11 = distinct !{!11, !8, !6}
diff --git a/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
index 76b6cae5c3b4df1f77406159a8ca766dea499bf9..f38782cb7e597a49e1ea8d5dccfab1a75638184b 100644
--- a/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
+++ b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512  < %s | FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f  < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index fb9d29e34a1017c83ab451d4d6e3ba2299a38703..95be7eb76b3a3549058da6285475e992fcfcfacb 100644
--- a/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -95,65 +95,68 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
 ; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
 ; FVW2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
 ; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
-; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; FVW2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; FVW2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; FVW2-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD11:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
-; FVW2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; FVW2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_LOAD12:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
-; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
-; FVW2-NEXT:    [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer
-; FVW2-NEXT:    [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD11]], zeroinitializer
-; FVW2-NEXT:    [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD12]], zeroinitializer
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
-; FVW2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> undef)
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2
-; FVW2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> undef)
-; FVW2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
-; FVW2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> undef)
-; FVW2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 6
-; FVW2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; FVW2-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> undef)
-; FVW2-NEXT:    [[TMP20:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
-; FVW2-NEXT:    [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64>
-; FVW2-NEXT:    [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD14]] to <2 x i64>
-; FVW2-NEXT:    [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD15]] to <2 x i64>
-; FVW2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]]
-; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]]
-; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]]
-; FVW2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]]
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER17:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef)
-; FVW2-NEXT:    [[WIDE_MASKED_GATHER18:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef)
-; FVW2-NEXT:    [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER17]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER18]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
-; FVW2-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]])
-; FVW2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 2
-; FVW2-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]])
-; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 4
-; FVW2-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]])
-; FVW2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 6
+; FVW2-NEXT:    [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
+; FVW2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP5]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP6]], i32 4, <2 x i1> [[TMP2]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP7]], <2 x float>* [[TMP9]], i32 4, <2 x i1> [[TMP2]])
+; FVW2-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 2
+; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer
+; FVW2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP15:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64>
+; FVW2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP15]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP16]], i32 4, <2 x i1> [[TMP12]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP17:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP17]], <2 x float>* [[TMP19]], i32 4, <2 x i1> [[TMP12]])
+; FVW2-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; FVW2-NEXT:    [[TMP22:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_2]], zeroinitializer
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP25:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_2]] to <2 x i64>
+; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP25]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP22]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP27:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP27]], <2 x float>* [[TMP29]], i32 4, <2 x i1> [[TMP22]])
+; FVW2-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 6
+; FVW2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP31]], align 4
+; FVW2-NEXT:    [[TMP32:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_3]], zeroinitializer
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP35:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_3]] to <2 x i64>
+; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP35]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP36]], i32 4, <2 x i1> [[TMP32]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP37:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
 ; FVW2-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]])
-; FVW2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
-; FVW2-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP37]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP32]])
+; FVW2-NEXT:    [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 8
+; FVW2-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
 ; FVW2-NEXT:    br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index c78bcdd172147d6e10bdbd50bb4f5cd0a6166549..683e857e5f2acd437683fd5e0740ac5cb9cd11b6 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -1,12 +1,9 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK-LABEL: @foo(
-; CHECK: <4 x i32>
-; CHECK: ret void
-
 ; PR15794
 ; incorrect addition of llvm.mem.parallel_loop_access metadata is undefined
 ; behaviour. Vectorizer ignores the memory dependency checks and goes ahead and
@@ -21,8 +18,98 @@ target triple = "x86_64-unknown-linux-gnu"
 ;   }
 ; }
 
-; Function Attrs: nounwind uwtable 
+; Function Attrs: nounwind uwtable
 define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]]
+; CHECK:       for.body3.lr.ph.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[M]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH_US:%.*]]
+; CHECK:       for.end.us:
+; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[ADD10_US:%.*]] = add nsw i32 [[TMP4]], 3
+; CHECK-NEXT:    store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
+; CHECK-NEXT:    [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2
+; CHECK:       for.body3.us:
+; CHECK-NEXT:    [[INDVARS_IV29:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
+; CHECK-NEXT:    [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP5]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[ADD5_US:%.*]] = add nsw i32 [[TMP6]], 1
+; CHECK-NEXT:    store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
+; CHECK-NEXT:    [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3
+; CHECK:       for.body3.lr.ph.us:
+; CHECK-NEXT:    [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]]
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
+; CHECK-NEXT:    [[ADD_US]] = add i32 [[TMP9]], [[K]]
+; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP8]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 false, i1 [[TMP12]], i1 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or i1 false, [[TMP15]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[ADD_US]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP24]], i32 0
+; CHECK-NEXT:    store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP24]], i32 1
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP24]], i32 2
+; CHECK-NEXT:    store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3
+; CHECK-NEXT:    store i32 [[TMP28]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY3_US]]
+; CHECK:       for.end15.loopexit:
+; CHECK-NEXT:    br label [[FOR_END15]]
+; CHECK:       for.end15:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp27 = icmp sgt i32 %m, 0
   br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
@@ -66,10 +153,46 @@ for.end15:                                        ; preds = %for.end.us, %entry
 
 ; Here we can see the vectorizer does the mem dep checks and decides it is
 ; unsafe to vectorize.
-; CHECK-LABEL: no-par-mem-metadata(
-; CHECK-NOT: <4 x i32>
-; CHECK:     ret void
 define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+; CHECK-LABEL: @no-par-mem-metadata(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]]
+; CHECK:       for.body3.lr.ph.us.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH_US:%.*]]
+; CHECK:       for.end.us:
+; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT:    [[ADD10_US:%.*]] = add nsw i32 [[TMP0]], 3
+; CHECK-NEXT:    store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
+; CHECK-NEXT:    [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2
+; CHECK:       for.body3.us:
+; CHECK-NEXT:    [[INDVARS_IV29:%.*]] = phi i64 [ 0, [[FOR_BODY3_LR_PH_US]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
+; CHECK-NEXT:    [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[ADD5_US:%.*]] = add nsw i32 [[TMP2]], 1
+; CHECK-NEXT:    store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
+; CHECK-NEXT:    [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !1
+; CHECK:       for.body3.lr.ph.us:
+; CHECK-NEXT:    [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
+; CHECK-NEXT:    [[ADD_US]] = add i32 [[TMP3]], [[K:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT:    br label [[FOR_BODY3_US]]
+; CHECK:       for.end15.loopexit:
+; CHECK-NEXT:    br label [[FOR_END15]]
+; CHECK:       for.end15:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp27 = icmp sgt i32 %m, 0
   br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
diff --git a/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
index c203a6605d80f86c0e55d9b66db33945a58d46c6..cf6cc1356e0c466b29c6e15c67fbaa2800e6642c 100644
--- a/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
+++ b/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 -instcombine < %s | FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f -instcombine < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index 9428a6d6f7404e81baa64c8caf21f4fae0065bd7..69f578cf789eb551816dea110423b0eda75a0621 100644
--- a/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512  -instcombine < %s | FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f  -instcombine < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index 8be74113f77500ef5311c5e5d4e76634ff129b06..4aa96df94fffde29a5caa6844912b3f6254f6f6c 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -1,9 +1,9 @@
-; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
-; RUN: opt < %s -loop-vectorize -o /dev/null -pass-remarks-output=%t.yaml
+; RUN: opt < %s -loop-vectorize -transform-warning -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -transform-warning -o /dev/null -pass-remarks-output=%t.yaml
 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
 
-; RUN: opt < %s -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
-; RUN: opt < %s -passes=loop-vectorize -o /dev/null -pass-remarks-output=%t.yaml
+; RUN: opt < %s -passes=loop-vectorize,transform-warning -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize'  2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize,transform-warning -o /dev/null -pass-remarks-output=%t.yaml
 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
 
 ; C/C++ code for tests
@@ -33,7 +33,7 @@
 ; }
 ; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds
 ; CHECK: remark: source.cpp:19:5: loop not vectorized
-; CHECK: warning: source.cpp:19:5: loop not vectorized: failed explicitly specified loop vectorization
+; CHECK: warning: source.cpp:19:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
 
 ; int foo();
 ; void test_multiple_failures(int *A) {
@@ -94,13 +94,12 @@
 ; YAML-NEXT:   - String:          ')'
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Failure
-; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Pass:            transform-warning
 ; YAML-NEXT: Name:            FailedRequestedVectorization
 ; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
 ; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'loop not vectorized: '
-; YAML-NEXT:   - String:          failed explicitly specified loop vectorization
+; YAML-NEXT:   - String:          'loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering'
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
diff --git a/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll b/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1e5f18e371cd648ba6b6c47ec41a585b9623176d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; getDemandedBits() is called on the pointer-typed GEP instruction here.
+; Only make sure we do not crash.
+
+; CHECK: @test
+define void @test(i8* %ptr, i8* %ptr_end) {
+start:
+  br label %loop
+
+loop:
+  %ptr2 = phi i8* [ %ptr3, %loop ], [ %ptr, %start ]
+  %x = sext i8 undef to i64
+  %ptr3 = getelementptr inbounds i8, i8* %ptr2, i64 1
+  %cmp = icmp ult i8* %ptr3, %ptr_end
+  br i1 %cmp, label %loop, label %end
+
+end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/disable_nonforced.ll b/test/Transforms/LoopVectorize/disable_nonforced.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7df63ac7e2758955cd8b99f631f7e5c198e529c0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/disable_nonforced.ll
@@ -0,0 +1,29 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Check that the disable_nonforced loop property is honored by the
+; loop vectorizer.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced(
+; CHECK-NOT: x i32>
+define void @disable_nonforced(i32* nocapture %a, i32 %n) {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}}
diff --git a/test/Transforms/LoopVectorize/disable_nonforced_enable.ll b/test/Transforms/LoopVectorize/disable_nonforced_enable.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7541ac389996b6b7f843ecc5b6dcd8e025637345
--- /dev/null
+++ b/test/Transforms/LoopVectorize/disable_nonforced_enable.ll
@@ -0,0 +1,29 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Check whether the llvm.loop.vectorize.enable loop property overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced_enable(
+; CHECK: store <2 x i32>
+define void @disable_nonforced_enable(i32* nocapture %a, i32 %n) {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.vectorize.enable", i32 1}}
diff --git a/test/Transforms/LoopVectorize/followup.ll b/test/Transforms/LoopVectorize/followup.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a075061876fce191cbb37faad4d5a8d323f93d79
--- /dev/null
+++ b/test/Transforms/LoopVectorize/followup.ll
@@ -0,0 +1,43 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
+;
+; Check that the followup loop attributes are applied.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define void @followup(i32* nocapture %a, i32 %n) {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !3, !4, !5}
+!3 = !{!"llvm.loop.vectorize.followup_vectorized", !{!"FollowupVectorized"}}
+!4 = !{!"llvm.loop.vectorize.followup_epilogue", !{!"FollowupEpilogue"}}
+!5 = !{!"llvm.loop.vectorize.followup_all", !{!"FollowupAll"}}
+
+
+; CHECK-LABEL @followup(
+
+; CHECK-LABEL: vector.body:
+; CHECK: br i1 %13, label %middle.block, label %vector.body, !llvm.loop ![[LOOP_VECTOR:[0-9]+]]
+; CHECK-LABEL: for.body:
+; CHECK: br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop ![[LOOP_EPILOGUE:[0-9]+]]
+
+; CHECK: ![[LOOP_VECTOR]] = distinct !{![[LOOP_VECTOR]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_VECTORIZED:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
+; CHECK: ![[FOLLOWUP_VECTORIZED:[0-9]+]] = !{!"FollowupVectorized"}
+; CHECK: ![[LOOP_EPILOGUE]] = distinct !{![[LOOP_EPILOGUE]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_EPILOGUE:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_EPILOGUE]] = !{!"FollowupEpilogue"}
diff --git a/test/Transforms/LoopVectorize/if-reduction.ll b/test/Transforms/LoopVectorize/if-reduction.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e3ce3ef7c9bb27fefdf606150b36f25b76939491
--- /dev/null
+++ b/test/Transforms/LoopVectorize/if-reduction.ll
@@ -0,0 +1,821 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fadd_select1(float * restrict x, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fadd_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fadd_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fadd_select2(double * restrict x, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fadd_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fadd_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and a floating-point
+;   value.
+;
+; float fcmp_val_fadd_select1(float * restrict x, float y, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_val_fadd_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat2
+; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, %y
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and a floating-point
+;   value.
+;
+; double fcmp_val_fadd_select2(double * restrict x, double y, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_val_fadd_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat2
+; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, %y
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and another array
+;   element.
+;
+; float fcmp_array_elm_fadd_select1(float * restrict x, float * restrict y,
+;                                   const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y[i])
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_array_elm_fadd_select1(
+; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %[[V1:.*]]
+; CHECK: %[[V4:.*]] = fadd fast <4 x float> %[[V0]], %[[V3:.*]]
+; CHECK: select <4 x i1> %[[V2]], <4 x float> %[[V4]], <4 x float> %[[V3]]
+define float @fcmp_array_elm_fadd_select1(float* noalias %x, float* noalias %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx.1 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %1 = load float, float* %arrayidx.2, align 4
+  %cmp.2 = fcmp fast ogt float %0, %1
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %2 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %2
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and another array
+;   element.
+;
+; double fcmp_array_elm_fadd_select2(double * restrict x, double * restrict y,
+;                                    const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y[i])
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_array_elm_fadd_select2(
+; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %[[V1:.*]]
+; CHECK: %[[V4:.*]] = fadd fast <4 x double> %[[V0]], %[[V3:.*]]
+; CHECK: select <4 x i1> %[[V2]], <4 x double> %[[V4]], <4 x double> %[[V3]]
+define double @fcmp_array_elm_fadd_select2(double* noalias %x, double* noalias %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx.1 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %1 = load double, double* %arrayidx.2, align 4
+  %cmp.2 = fcmp fast ogt double %0, %1
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %2 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %2
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fsub instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fsub_select1(float * restrict x, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fsub fast <4 x float> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fsub_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+  %sub = fsub fast float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Float pattern:
+;   Check that is not vectorized if fp-instruction has no fast-math property.
+; float fcmp_0_fsub_select1_novectorize(float * restrict x, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select1_novectorize(
+; CHECK-NOT: <4 x float>
+define float @fcmp_0_fsub_select1_novectorize(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %sub = fsub float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fsub instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fsub_select2(double * restrict x, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fsub fast <4 x double> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fsub_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+  %sub = fsub fast double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Double pattern:
+; Check that is not vectorized if fp-instruction has no fast-math property. 
+;
+; double fcmp_0_fsub_select2_notvectorize(double * restrict x, const int N) {
+;   double sum = 0.                                              
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select2_notvectorize(
+; CHECK-NOT: <4 x doubole>
+define double @fcmp_0_fsub_select2_notvectorize(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp ogt double %0, 0.000000e+00
+  %sub = fsub double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fmul instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fmult_select1(float * restrict x, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fmul fast <4 x float> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fmult_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+  %mult = fmul fast float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Float pattern:
+;   Check that is not vectorized if fp-instruction has no fast-math property. 
+;
+; float fcmp_0_fmult_select1_notvectorize(float * restrict x, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select1_notvectorize(
+; CHECK-NOT: <4 x float>
+define float @fcmp_0_fmult_select1_notvectorize(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %mult = fmul float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fmul instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fmult_select2(double * restrict x, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fmul fast <4 x double> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fmult_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+  %mult = fmul fast double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Double pattern:
+;   Check that is not vectorized if fp-instruction has no fast-math property.
+;
+; double fcmp_0_fmult_select2_notvectorize(double * restrict x, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select2_notvectorize(
+; CHECK-NOT: <4 x double>
+define double @fcmp_0_fmult_select2_notvectorize(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp ogt double %0, 0.000000e+00
+  %mult = fmul double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float multi pattern
+;   Check vectorisation of reduction code with a pair of selects to different
+;   fadd patterns.
+;
+; float fcmp_multi(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum+=2*a[i];
+;     else
+;       sum+=3*a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_multi(
+; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
+; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[M2:.*]] = fmul fast <4 x float> %[[V0]], <float 2.000000e+00,
+; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
+; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
+; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]]
+; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]]
+; CHECK: fadd fast <4 x float> %[[S2]],
+define float @fcmp_multi(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.011 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %for.inc, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %if.else14
+
+if.then10:                                        ; preds = %if.else
+  %mul = fmul fast float %0, 2.000000e+00
+  br label %for.inc
+
+if.else14:                                        ; preds = %if.else
+  %mul17 = fmul fast float %0, 3.000000e+00
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.else14, %if.then10
+  %.pn = phi float [ %mul, %if.then10 ], [ %mul17, %if.else14 ], [ %0, %for.body ]
+  %sum.1 = fadd fast float %.pn, %sum.011
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + fsub patterns
+;   Check vectorisation of reduction code with a pair of selects to different
+;   instructions { fadd, fsub } but equivalent (change in constant).
+;
+; float fcmp_multi(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum-=a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_fadd_fsub(
+; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
+; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float>
+; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float>
+; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
+; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
+; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]]
+; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]]
+define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %add = fadd fast float %0, %sum.010
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %for.inc
+
+if.then10:                                        ; preds = %if.else
+  %sub = fsub fast float %sum.010, %0
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then10, %if.else
+  %sum.1 = phi float [ %add, %if.then ], [ %sub, %if.then10 ], [ %sum.010, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + fmul patterns
+;   Check lack of vectorisation of reduction code with a pair of non-compatible
+;   instructions { fadd, fmul }.
+;
+; float fcmp_multi(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum*=a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_fadd_fmul(
+; CHECK-NOT: <4 x float>
+define float @fcmp_fadd_fmul(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %add = fadd fast float %0, %sum.010
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %for.inc
+
+if.then10:                                        ; preds = %if.else
+  %mul = fmul fast float %0, %sum.010
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then10, %if.else
+  %sum.1 = phi float [ %add, %if.then ], [ %mul, %if.then10 ], [ %sum.010, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + store patterns
+;   Check lack of vectorisation of reduction code with a store back, given it
+;   has loop dependency on a[i].
+;
+; float fcmp_store_back(float a[], int LEN) {
+;     float sum = 0.0;
+;     for (int i = 0; i < LEN; i++) {
+;       sum += a[i];
+;       a[i] = sum;
+;     }
+;     return sum;
+; }
+
+; CHECK-LABEL: @fcmp_store_back(
+; CHECK-NOT: <4 x float>
+define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly {
+entry:
+  %cmp7 = icmp sgt i32 %LEN, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %LEN to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.08 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd fast float %0, %sum.08
+  store float %add, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %sum.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll
index 203c4435c88c8fb21f555319a3026f68561e7598..50cdb73ae8ec96dbbc3f98b7ccb5e3672b82fe0e 100644
--- a/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/test/Transforms/LoopVectorize/intrinsic.ll
@@ -1192,6 +1192,58 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+declare i32 @llvm.fshl.i32 (i32, i32, i32)
+
+define void @fshl_i32(i32 %n, i32* noalias %x, i32* noalias %y, i32 %shAmt) {
+; CHECK-LABEL: @fshl_i32(
+; CHECK:         call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[WIDE_LOADX:%.*]], <4 x i32> [[WIDE_LOADY:%.*]], <4 x i32> [[SPLAT:%.*]])
+; CHECK:         ret void
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %end
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %xi = getelementptr inbounds i32, i32* %x, i32 %iv
+  %yi = getelementptr inbounds i32, i32* %y, i32 %iv
+  %xld = load i32, i32* %xi, align 4
+  %yld = load i32, i32* %yi, align 4
+  %call = tail call i32 @llvm.fshl.i32(i32 %xld, i32 %yld, i32 %shAmt)
+  store i32 %call, i32* %xi, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret void
+}
+
+declare i32 @llvm.fshr.i32 (i32, i32, i32)
+
+define void @fshr_i32(i32 %n, i32* noalias %x, i32* noalias %y, i32 %shAmt) {
+; CHECK-LABEL: @fshr_i32(
+; CHECK:         call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[WIDE_LOADX:%.*]], <4 x i32> [[WIDE_LOADY:%.*]], <4 x i32> [[SPLAT:%.*]])
+; CHECK:         ret void
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %end
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %xi = getelementptr inbounds i32, i32* %x, i32 %iv
+  %yi = getelementptr inbounds i32, i32* %y, i32 %iv
+  %xld = load i32, i32* %xi, align 4
+  %yld = load i32, i32* %yi, align 4
+  %call = tail call i32 @llvm.fshr.i32(i32 %xld, i32 %yld, i32 %shAmt)
+  store i32 %call, i32* %xi, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret void
+}
+
 declare float @llvm.minnum.f32(float, float) nounwind readnone
 
 ;CHECK-LABEL: @minnum_f32(
diff --git a/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index e341576e5316e21ba74347b0957c19e85340e6b7..cc4c55c3f444a53e4eed8d0dea4e1dd72b4b89cc 100644
--- a/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -549,3 +549,45 @@ for.inc8:                                         ; preds = %for.body3, %for.con
 for.end10:                                        ; preds = %for.inc8, %entry
   ret i32 undef
 }
+
+; cannot vectorize loop with unsafe dependency between uniform load (%tmp10) and store
+; (%tmp12) to the same address
+; PR39653
+; Note: %tmp10 could be replaced by phi(%arg4, %tmp12), a potentially vectorizable
+; 1st-order-recurrence
+define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
+; CHECK-LABEL: unsafe_dep_uniform_load_store
+; CHECK-NOT: <4 x i32>
+bb:
+  %tmp = alloca i32
+  store i32 %arg4, i32* %tmp
+  %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
+  br label %bb7
+
+bb7:
+  %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ]
+  %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ]
+  %tmp10 = load i32, i32* %tmp
+  %tmp11 = mul nsw i32 %tmp9, %tmp10
+  %tmp12 = srem i32 %tmp11, 65536
+  %tmp13 = add nsw i32 %tmp12, %tmp9
+  %tmp14 = trunc i32 %tmp13 to i16
+  %tmp15 = trunc i64 %tmp8 to i32
+  %tmp16 = add i32 %arg, %tmp15
+  %tmp17 = zext i32 %tmp16 to i64
+  %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17
+  store i16 %tmp14, i16* %tmp18, align 2
+  %tmp19 = add i32 %tmp13, %tmp9
+  %tmp20 = trunc i32 %tmp19 to i16
+  %tmp21 = and i16 %tmp20, 255
+  %tmp22 = getelementptr inbounds i16, i16* %arg3, i64 %tmp17
+  store i16 %tmp21, i16* %tmp22, align 2
+  %tmp23 = add nsw i32 %tmp9, 1
+  %tmp24 = add nuw nsw i64 %tmp8, 1
+  %tmp25 = icmp eq i64 %tmp24, %arg2
+  store i32 %tmp12, i32* %tmp
+  br i1 %tmp25, label %bb26, label %bb7
+
+bb26:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/no_array_bounds.ll b/test/Transforms/LoopVectorize/no_array_bounds.ll
index a799784ac250fbc76cd366ea236a7d377f448a04..c6a2431eba5ba5f72d83cdf94759410a823f36b1 100644
--- a/test/Transforms/LoopVectorize/no_array_bounds.ll
+++ b/test/Transforms/LoopVectorize/no_array_bounds.ll
@@ -1,8 +1,8 @@
-; RUN: opt < %s -loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -transform-warning -S 2>&1 | FileCheck %s
 
 ; Verify warning is generated when vectorization/ interleaving is explicitly specified and fails to occur.
-; CHECK: warning: no_array_bounds.cpp:5:5: loop not vectorized: failed explicitly specified loop vectorization
-; CHECK: warning: no_array_bounds.cpp:10:5: loop not interleaved: failed explicitly specified loop interleaving
+; CHECK: warning: no_array_bounds.cpp:5:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+; CHECK: warning: no_array_bounds.cpp:10:5: loop not interleaved: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
 
 ;  #pragma clang loop vectorize(enable)
 ;  for (int i = 0; i < number; i++) {
diff --git a/test/Transforms/LoopVectorize/no_switch.ll b/test/Transforms/LoopVectorize/no_switch.ll
index b40c73b112666f232bf260bfbb8fba674475cb88..3976463e766c7be258a9ae235f32e711da341d5b 100644
--- a/test/Transforms/LoopVectorize/no_switch.ll
+++ b/test/Transforms/LoopVectorize/no_switch.ll
@@ -1,16 +1,16 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S 2>&1 | FileCheck %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=1 -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -transform-warning -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO
 
 ; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
-; CHECK: warning: source.cpp:4:5: loop not vectorized: failed explicitly specified loop vectorization
+; CHECK: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
 
 ; NOANALYSIS-NOT: remark: {{.*}}
-; NOANALYSIS: warning: source.cpp:4:5: loop not interleaved: failed explicitly specified loop interleaving
+; NOANALYSIS: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
 
 ; MOREINFO: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
 ; MOREINFO: remark: source.cpp:4:5: loop not vectorized (Force=true, Vector Width=4)
-; MOREINFO: warning: source.cpp:4:5: loop not vectorized: failed explicitly specified loop vectorization
+; MOREINFO: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
 
 ; CHECK: _Z11test_switchPii
 ; CHECK-NOT: x i32>
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
index 88489faa831a7a5b696e8cc8b9c4a7b45396a837..19bfa50f769b4299e0faf5846551ddff4b5b4109 100644
--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -117,6 +117,46 @@ loopexit:
   ret void
 }
 
+; Check we do generate unnecessary runtime checks. They will always fail.
+
+; void test_runtime_check2(float *a, float b, unsigned offset, unsigned offset2, unsigned n, float *c) {
+;   for (unsigned i = 1; i < n; i++) {
+;     a[i+o1] += a[i+o2] + b;
+;     c[i] = c[i-1] + b;
+;   }
+; }
+;
+; CHECK-LABEL: test_runtime_check2
+; CHECK:      <4 x float>
+define void @test_runtime_check2(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n, float* %c) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %ind.sum = add i64 %iv, %offset
+  %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum
+  %l1 = load float, float* %arr.idx, align 4
+  %ind.sum2 = add i64 %iv, %offset2
+  %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2
+  %l2 = load float, float* %arr.idx2, align 4
+  %m = fmul fast float %b, %l2
+  %ad = fadd fast float %l1, %m
+  store float %ad, float* %arr.idx, align 4
+  %c.ind = add i64 %iv, -1
+  %c.idx = getelementptr inbounds float, float* %c, i64 %c.ind
+  %lc = load float, float* %c.idx, align 4
+  %vc = fadd float %lc, 1.0
+  %c.idx2 = getelementptr inbounds float, float* %c, i64 %iv
+  store float %vc, float* %c.idx2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+  ret void
+}
+
 ; CHECK: !9 = !DILocation(line: 101, column: 1, scope: !{{.*}})
 
 !llvm.module.flags = !{!0, !1}
diff --git a/test/Transforms/MakeGuardsExplicit/basic.ll b/test/Transforms/MakeGuardsExplicit/basic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3f2f4c4f747caa3c79db63334ed5af09a8d2fec0
--- /dev/null
+++ b/test/Transforms/MakeGuardsExplicit/basic.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -make-guards-explicit < %s        | FileCheck %s
+; RUN: opt -S -passes=make-guards-explicit < %s | FileCheck %s
+
+declare void @llvm.experimental.guard(i1,...)
+
+; Check that a sole guard can be turned into explicit guards form.
+define void @trivial_guard(i1 %cond) {
+; CHECK-LABEL: @trivial_guard(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[COND:%.*]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[GUARDED:%.*]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 123, i64 456) ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"(i32 123, i64 456) ]
+  ret void
+}
+
+; Check that a sequence of guards can be turned into explicit guards form.
+define void @trivial_guard_sequence(i1 %cond1, i1 %cond2, i1 %cond3) {
+; CHECK-LABEL: @trivial_guard_sequence(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[COND1:%.*]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[GUARDED:%.*]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 123, i64 456) ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded:
+; CHECK-NEXT:    [[WIDENABLE_COND3:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND4:%.*]] = and i1 [[COND2:%.*]], [[WIDENABLE_COND3]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND4]], label [[GUARDED1:%.*]], label [[DEOPT2:%.*]], !prof !0
+; CHECK:       deopt2:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 789, i64 123) ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded1:
+; CHECK-NEXT:    [[WIDENABLE_COND7:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND8:%.*]] = and i1 [[COND3:%.*]], [[WIDENABLE_COND7]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND8]], label [[GUARDED5:%.*]], label [[DEOPT6:%.*]], !prof !0
+; CHECK:       deopt6:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 456, i64 789) ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded5:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"(i32 123, i64 456) ]
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"(i32 789, i64 123) ]
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond3) [ "deopt"(i32 456, i64 789) ]
+  ret void
+}
+
+; Check that all instructions between the guards preserve.
+define void @split_block_contents(i1 %cond1, i1 %cond2, i1 %cond3, i32* %p) {
+; CHECK-LABEL: @split_block_contents(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]]
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[COND1:%.*]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[GUARDED:%.*]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 123, i64 456) ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded:
+; CHECK-NEXT:    store i32 1, i32* [[P]]
+; CHECK-NEXT:    [[WIDENABLE_COND3:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND4:%.*]] = and i1 [[COND2:%.*]], [[WIDENABLE_COND3]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND4]], label [[GUARDED1:%.*]], label [[DEOPT2:%.*]], !prof !0
+; CHECK:       deopt2:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 789, i64 123) ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded1:
+; CHECK-NEXT:    store i32 2, i32* [[P]]
+; CHECK-NEXT:    [[WIDENABLE_COND7:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND8:%.*]] = and i1 [[COND3:%.*]], [[WIDENABLE_COND7]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND8]], label [[GUARDED5:%.*]], label [[DEOPT6:%.*]], !prof !0
+; CHECK:       deopt6:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 456, i64 789) ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded5:
+; CHECK-NEXT:    store i32 3, i32* [[P]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i32 0, i32* %p
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"(i32 123, i64 456) ]
+  store i32 1, i32* %p
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"(i32 789, i64 123) ]
+  store i32 2, i32* %p
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond3) [ "deopt"(i32 456, i64 789) ]
+  store i32 3, i32* %p
+  ret void
+}
+
+; Check that the guard can split a loop properly.
+define void @split_loop(i1 %cond, i32 %N, i32 %M) {
+; CHECK-LABEL: @split_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ]
+; CHECK-NEXT:    [[GUARD_COND:%.*]] = icmp slt i32 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[GUARD_COND]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[GUARDED]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 123, i64 456) ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV]], [[M:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %guard_cond = icmp slt i32 %iv, %N
+  call void(i1, ...) @llvm.experimental.guard(i1 %guard_cond) [ "deopt"(i32 123, i64 456) ]
+  %loop_cond = icmp slt i32 %iv, %M
+  %iv.next = add i32 %iv, 1
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/Mem2Reg/undef-order.ll b/test/Transforms/Mem2Reg/undef-order.ll
new file mode 100644
index 0000000000000000000000000000000000000000..09687518c87240b0ae515d54236f66b19b9d2938
--- /dev/null
+++ b/test/Transforms/Mem2Reg/undef-order.ll
@@ -0,0 +1,53 @@
+;RUN: opt -mem2reg -S < %s | FileCheck %s
+
+declare i1 @cond()
+
+define i32 @foo() {
+Entry:
+    %val = alloca i32
+    %c1 = call i1 @cond()
+    br i1 %c1, label %Store1, label %Store2
+Block1:
+    br label %Join
+Block2:
+    br label %Join
+Block3:
+    br label %Join
+Block4:
+    br label %Join
+Block5:
+    br label %Join
+Store1:
+    store i32 1, i32* %val
+    br label %Join
+Block6:
+    br label %Join
+Block7:
+    br label %Join
+Block8:
+    br label %Join
+Block9:
+    br label %Join
+Block10:
+    br label %Join
+Store2:
+    store i32 2, i32* %val
+    br label %Join
+Block11:
+    br label %Join
+Block12:
+    br label %Join
+Block13:
+    br label %Join
+Block14:
+    br label %Join
+Block15:
+    br label %Join
+Block16:
+    br label %Join
+Join:
+; Phi inserted here should have operands appended deterministically
+; CHECK: %val.0 = phi i32 [ 1, %Store1 ], [ 2, %Store2 ], [ undef, %Block1 ], [ undef, %Block2 ], [ undef, %Block3 ], [ undef, %Block4 ], [ undef, %Block5 ], [ undef, %Block6 ], [ undef, %Block7 ], [ undef, %Block8 ], [ undef, %Block9 ], [ undef, %Block10 ], [ undef, %Block11 ], [ undef, %Block12 ], [ undef, %Block13 ], [ undef, %Block14 ], [ undef, %Block15 ], [ undef, %Block16 ]
+    %result = load i32, i32* %val
+    ret i32 %result
+}
diff --git a/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7ee0682ed2295225d0ad00dad71d9cbbd3fd6def
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -memcpyopt -S %s | FileCheck %s
+
+; memset -> memcpy forwarding, if memcpy is larger than memset, but trailing
+; bytes are known to be undef.
+
+
+%T = type { i64, i32, i32 }
+
+define void @test_alloca(i8* %result) {
+; CHECK-LABEL: @test_alloca(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+define void @test_alloca_with_lifetimes(i8* %result) {
+; CHECK-LABEL: @test_alloca_with_lifetimes(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* [[B]])
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* [[B]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %b)
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %b)
+  ret void
+}
+
+define void @test_malloc_with_lifetimes(i8* %result) {
+; CHECK-LABEL: @test_malloc_with_lifetimes(
+; CHECK-NEXT:    [[A:%.*]] = call i8* @malloc(i64 16)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* [[A]])
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[A]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* [[A]])
+; CHECK-NEXT:    call void @free(i8* [[A]])
+; CHECK-NEXT:    ret void
+;
+  %a = call i8* @malloc(i64 16)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %a)
+  call void @llvm.memset.p0i8.i64(i8* align 8 %a, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %a, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %a)
+  call void @free(i8* %a)
+  ret void
+}
+
+; memcpy size is larger than lifetime, don't optimize.
+define void @test_copy_larger_than_lifetime_size(i8* %result) {
+; CHECK-LABEL: @test_copy_larger_than_lifetime_size(
+; CHECK-NEXT:    [[A:%.*]] = call i8* @malloc(i64 16)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 12, i8* [[A]])
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[A]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[A]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 12, i8* [[A]])
+; CHECK-NEXT:    call void @free(i8* [[A]])
+; CHECK-NEXT:    ret void
+;
+  %a = call i8* @malloc(i64 16)
+  call void @llvm.lifetime.start.p0i8(i64 12, i8* %a)
+  call void @llvm.memset.p0i8.i64(i8* align 8 %a, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %a, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 12, i8* %a)
+  call void @free(i8* %a)
+  ret void
+}
+
+; The trailing bytes are not known to be undef, we can't ignore them.
+define void @test_not_undef_memory(i8* %result, i8* %input) {
+; CHECK-LABEL: @test_not_undef_memory(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[INPUT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[INPUT]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 8 %input, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %input, i64 16, i1 false)
+  ret void
+}
+
+; Memset is volatile, memcpy is not. Can be optimized.
+define void @test_volatile_memset(i8* %result) {
+; CHECK-LABEL: @test_volatile_memset(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 true)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 true)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+; Memcpy is volatile, memset is not. Cannot be optimized.
+define void @test_volatile_memcpy(i8* %result) {
+; CHECK-LABEL: @test_volatile_memcpy(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 true)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 true)
+  ret void
+}
+
+; Write between memset and memcpy, can't optimize.
+define void @test_write_between(i8* %result) {
+; CHECK-LABEL: @test_write_between(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    store i8 -1, i8* [[B]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false)
+  store i8 -1, i8* %b
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+; A write prior to the memset, which is part of the memset region.
+; We could optimize this, but currently don't, because the used memory location is imprecise.
+define void @test_write_before_memset_in_memset_region(i8* %result) {
+; CHECK-LABEL: @test_write_before_memset_in_memset_region(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    store i8 -1, i8* [[B]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  store i8 -1, i8* %b
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+; A write prior to the memset, which is part of the memcpy (but not memset) region.
+; This cannot be optimized.
+define void @test_write_before_memset_in_memcpy_region(i8* %result) {
+; CHECK-LABEL: @test_write_before_memset_in_memcpy_region(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[T]], %T* [[A]], i64 0, i32 2
+; CHECK-NEXT:    store i32 -1, i32* [[C]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  %c = getelementptr inbounds %T, %T* %a, i64 0, i32 2
+  store i32 -1, i32* %c
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+; A write prior to the memset, which is part of both the memset and memcpy regions.
+; This cannot be optimized.
+define void @test_write_before_memset_in_both_regions(i8* %result) {
+; CHECK-LABEL: @test_write_before_memset_in_both_regions(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
+; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[T]], %T* [[A]], i64 0, i32 1
+; CHECK-NEXT:    store i32 -1, i32* [[C]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 10, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca %T, align 8
+  %b = bitcast %T* %a to i8*
+  %c = getelementptr inbounds %T, %T* %a, i64 0, i32 1
+  store i32 -1, i32* %c
+  call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 10, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false)
+  ret void
+}
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
diff --git a/test/Transforms/MergeFunc/alias.ll b/test/Transforms/MergeFunc/alias.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ee1c7af5e817f9aacca4a5db50a109acd51a98a1
--- /dev/null
+++ b/test/Transforms/MergeFunc/alias.ll
@@ -0,0 +1,116 @@
+; RUN: opt -S -mergefunc -mergefunc-use-aliases < %s | FileCheck %s
+
+; Aliases should always be created for the weak functions, and
+; for external functions if there is no local function
+
+; CHECK: @external_external_2 = unnamed_addr alias void (float*), bitcast (void (i32*)* @external_external_1 to void (float*)*)
+; CHECK: @weak_weak_2 = weak unnamed_addr alias void (float*), bitcast (void (i32*)* @0 to void (float*)*)
+; CHECK: @weak_weak_1 = weak unnamed_addr alias void (i32*), void (i32*)* @0
+; CHECK: @weak_external_1 = weak unnamed_addr alias void (i32*), bitcast (void (float*)* @weak_external_2 to void (i32*)*)
+; CHECK: @external_weak_2 = weak unnamed_addr alias void (float*), bitcast (void (i32*)* @external_weak_1 to void (float*)*)
+; CHECK: @weak_internal_1 = weak unnamed_addr alias void (i32*), bitcast (void (float*)* @weak_internal_2 to void (i32*)*)
+; CHECK: @internal_weak_2 = weak unnamed_addr alias void (float*), bitcast (void (i32*)* @internal_weak_1 to void (float*)*)
+
+; A strong backing function had to be created for the weak-weak pair
+
+; CHECK: define private void @0(i32* %a) unnamed_addr
+; CHECK_NEXT: call void @dummy4()
+
+; These internal functions are dropped in favor of the external ones
+
+; CHECK-NOT: define internal void @external_internal_2(float *%a) unnamed_addr
+; CHECK-NOT: define internal void @internal_external_1(i32 *%a) unnamed_addr
+; CHECK-NOT: define internal void @internal_external_1(i32 *%a) unnamed_addr
+; CHECK-NOT: define internal void @internal_external_2(float *%a) unnamed_addr
+
+; Only used to mark which functions should be merged.
+declare void @dummy1()
+declare void @dummy2()
+declare void @dummy3()
+declare void @dummy4()
+declare void @dummy5()
+declare void @dummy6()
+declare void @dummy7()
+declare void @dummy8()
+declare void @dummy9()
+
+define void @external_external_1(i32 *%a) unnamed_addr {
+  call void @dummy1()
+  ret void
+}
+define void @external_external_2(float *%a) unnamed_addr {
+  call void @dummy1()
+  ret void
+}
+
+define void @external_internal_1(i32 *%a) unnamed_addr {
+  call void @dummy2()
+  ret void
+}
+define internal void @external_internal_2(float *%a) unnamed_addr {
+  call void @dummy2()
+  ret void
+}
+
+define internal void @internal_external_1(i32 *%a) unnamed_addr {
+  call void @dummy3()
+  ret void
+}
+define void @internal_external_2(float *%a) unnamed_addr {
+  call void @dummy3()
+  ret void
+}
+
+define weak void @weak_weak_1(i32 *%a) unnamed_addr {
+  call void @dummy4()
+  ret void
+}
+define weak void @weak_weak_2(float *%a) unnamed_addr {
+  call void @dummy4()
+  ret void
+}
+
+define weak void @weak_external_1(i32 *%a) unnamed_addr {
+  call void @dummy5()
+  ret void
+}
+define external void @weak_external_2(float *%a) unnamed_addr {
+  call void @dummy5()
+  ret void
+}
+
+define external void @external_weak_1(i32 *%a) unnamed_addr {
+  call void @dummy6()
+  ret void
+}
+define weak void @external_weak_2(float *%a) unnamed_addr {
+  call void @dummy6()
+  ret void
+}
+
+define weak void @weak_internal_1(i32 *%a) unnamed_addr {
+  call void @dummy7()
+  ret void
+}
+define internal void @weak_internal_2(float *%a) unnamed_addr {
+  call void @dummy7()
+  ret void
+}
+
+define internal void @internal_weak_1(i32 *%a) unnamed_addr {
+  call void @dummy8()
+  ret void
+}
+define weak void @internal_weak_2(float *%a) unnamed_addr {
+  call void @dummy8()
+  ret void
+}
+
+define internal void @internal_internal_1(i32 *%a) unnamed_addr {
+  call void @dummy9()
+  ret void
+}
+define internal void @internal_internal_2(float *%a) unnamed_addr {
+  call void @dummy9()
+  ret void
+}
diff --git a/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll b/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
index 806ca3c17a6af578e51ee8b511738f7a13cf6a96..f138ac429144ebd948ccbc9527a39ea14b9af3de 100644
--- a/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
+++ b/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
@@ -63,6 +63,14 @@ lpad:
   resume { i8*, i32 } zeroinitializer
 }
 
+define i8 @call_with_same_range() {
+; CHECK-LABEL: @call_with_same_range
+; CHECK: tail call i8 @call_with_range
+  bitcast i8 0 to i8
+  %out = call i8 @dummy(), !range !0
+  ret i8 %out
+}
+
 define i8 @invoke_with_same_range() personality i8* undef {
 ; CHECK-LABEL: @invoke_with_same_range()
 ; CHECK: tail call i8 @invoke_with_range()
@@ -76,14 +84,6 @@ lpad:
   resume { i8*, i32 } zeroinitializer
 }
 
-define i8 @call_with_same_range() {
-; CHECK-LABEL: @call_with_same_range
-; CHECK: tail call i8 @call_with_range
-  bitcast i8 0 to i8
-  %out = call i8 @dummy(), !range !0
-  ret i8 %out
-}
-
 
 
 declare i8 @dummy();
diff --git a/test/Transforms/MergeFunc/nonzero-address-spaces.ll b/test/Transforms/MergeFunc/nonzero-address-spaces.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3ee887c9de07322cc20000ad9be93190c45bb4ab
--- /dev/null
+++ b/test/Transforms/MergeFunc/nonzero-address-spaces.ll
@@ -0,0 +1,30 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+; MergeFunctions should respect the default function address
+; space specified in the data layout.
+
+target datalayout = "e-P1-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @stuff()
+
+; CHECK-LABEL: @f0(
+define void @f0(i64 %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
+; CHECK-LABEL: @f1(
+; CHECK: ptrtoint i64*
+; CHECK: tail call addrspace(1) void @f0(i64
+
+define void @f1(i64* %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
diff --git a/test/Transforms/NewGVN/eliminate-ssacopy.ll b/test/Transforms/NewGVN/eliminate-ssacopy.ll
new file mode 100644
index 0000000000000000000000000000000000000000..57bed023a49aed122e8844811870d8fd9be5ef29
--- /dev/null
+++ b/test/Transforms/NewGVN/eliminate-ssacopy.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -newgvn -S < %s | FileCheck %s
+
+; Make sure the created ssa copies are cleaned up. See PR38804.
+
+; CHECK-NOT: ssa_copy
+
+@b = external dso_local local_unnamed_addr global i32, align 4
+@a = external dso_local local_unnamed_addr global i8, align 1
+@f = external dso_local local_unnamed_addr global i16, align 2
+
+define void @g() {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND1THREAD_PRE_SPLIT:%.*]], label [[FOR_COND_PREHEADER:%.*]]
+; CHECK:       for.cond.preheader:
+; CHECK-NEXT:    unreachable
+; CHECK:       for.cond1thread-pre-split:
+; CHECK-NEXT:    br label [[FOR_END4_SPLIT:%.*]]
+; CHECK:       for.end4.split:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND6_PREHEADER:%.*]], label [[IF_END11:%.*]]
+; CHECK:       for.cond6.preheader:
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND6_PREHEADER3:%.*]], label [[IF_END11_LOOPEXIT:%.*]]
+; CHECK:       for.cond6.preheader3:
+; CHECK-NEXT:    br label [[IF_END11_LOOPEXIT]]
+; CHECK:       if.end11.loopexit:
+; CHECK-NEXT:    [[STOREMERGE_LCSSA:%.*]] = phi i32 [ 0, [[FOR_COND6_PREHEADER]] ], [ 1, [[FOR_COND6_PREHEADER3]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE_LCSSA]], i32* @b, align 4
+; CHECK-NEXT:    br label [[IF_END11]]
+; CHECK:       if.end11:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @b, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* @a, align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[CMP12:%.*]] = icmp eq i32 [[TMP0]], [[CONV]]
+; CHECK-NEXT:    br i1 [[CMP12]], label [[IF_THEN14:%.*]], label [[IF_END16:%.*]]
+; CHECK:       if.then14:
+; CHECK-NEXT:    [[CONV15:%.*]] = trunc i32 [[TMP0]] to i16
+; CHECK-NEXT:    store i16 [[CONV15]], i16* @f, align 2
+; CHECK-NEXT:    unreachable
+; CHECK:       if.end16:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tobool = icmp eq i32 undef, 0
+  br i1 %tobool, label %for.cond1thread-pre-split, label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %entry
+  unreachable
+
+for.cond1thread-pre-split:                        ; preds = %entry
+  br label %for.end4.split
+
+for.end4.split:                                   ; preds = %for.cond1thread-pre-split
+  br i1 %tobool, label %for.cond6.preheader, label %if.end11
+
+for.cond6.preheader:                              ; preds = %for.end4.split
+  br i1 undef, label %for.cond6.preheader3, label %if.end11.loopexit
+
+for.cond6.preheader3:                             ; preds = %for.cond6.preheader
+  br label %if.end11.loopexit
+
+if.end11.loopexit:                                ; preds = %for.cond6.preheader3, %for.cond6.preheader
+  %storemerge.lcssa = phi i32 [ 0, %for.cond6.preheader ], [ 1, %for.cond6.preheader3 ]
+  store i32 %storemerge.lcssa, i32* @b, align 4
+  br label %if.end11
+
+if.end11:                                         ; preds = %if.end11.loopexit, %for.end4.split
+  %0 = load i32, i32* @b, align 4
+  %1 = load i8, i8* @a, align 1
+  %conv = sext i8 %1 to i32
+  %cmp12 = icmp eq i32 %0, %conv
+  br i1 %cmp12, label %if.then14, label %if.end16
+
+if.then14:                                        ; preds = %if.end11
+  %conv15 = trunc i32 %0 to i16
+  store i16 %conv15, i16* @f, align 2
+  unreachable
+
+if.end16:                                         ; preds = %if.end11
+  ret void
+}
diff --git a/test/Transforms/ObjCARC/allocas.ll b/test/Transforms/ObjCARC/allocas.ll
index 5074256b39492b3879a54d41049a142bf7b4b093..bf2039d68d64347f14f8cd0811f21e8117a10c41 100644
--- a/test/Transforms/ObjCARC/allocas.ll
+++ b/test/Transforms/ObjCARC/allocas.ll
@@ -1,13 +1,13 @@
 ; RUN: opt -objc-arc -S < %s | FileCheck %s
 
-declare i8* @objc_retain(i8*)
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare void @objc_release(i8*)
-declare i8* @objc_autorelease(i8*)
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare void @objc_autoreleasePoolPop(i8*)
-declare i8* @objc_autoreleasePoolPush()
-declare i8* @objc_retainBlock(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare void @llvm.objc.autoreleasePoolPop(i8*)
+declare i8* @llvm.objc.autoreleasePoolPush()
+declare i8* @llvm.objc.retainBlock(i8*)
 
 declare i8* @objc_retainedObject(i8*)
 declare i8* @objc_unretainedObject(i8*)
@@ -25,7 +25,7 @@ declare void @use_alloca(i8**)
 
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
-declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...)
 
 
 ; In the presence of allocas, unconditionally remove retain/release pairs only
@@ -44,77 +44,77 @@ declare i8* @objc_msgSend(i8*, i8*, ...)
 ; rdar://13750319
 
 ; CHECK: define void @test1a(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test1a(i8* %x) {
 entry:
   %A = alloca i8*
-  tail call i8* @objc_retain(i8* %x)
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   store i8* %x, i8** %A, align 8
   %y = load i8*, i8** %A
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK: define void @test1b(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test1b(i8* %x) {
 entry:
   %A = alloca i8*
   %gep = getelementptr i8*, i8** %A, i32 0
-  tail call i8* @objc_retain(i8* %x)
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   store i8* %x, i8** %gep, align 8
   %y = load i8*, i8** %A
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
 
 ; CHECK: define void @test1c(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test1c(i8* %x) {
 entry:
   %A = alloca i8*, i32 3
   %gep = getelementptr i8*, i8** %A, i32 2
-  tail call i8* @objc_retain(i8* %x)
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   store i8* %x, i8** %gep, align 8
   %y = load i8*, i8** %gep
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
 
 ; CHECK: define void @test1d(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test1d(i8* %x) {
@@ -132,22 +132,22 @@ use_allocaB:
 exit:
   %A = phi i8** [ %allocaA, %use_allocaA ], [ %allocaB, %use_allocaB ]
   %gep = getelementptr i8*, i8** %A, i32 0
-  tail call i8* @objc_retain(i8* %x)
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   store i8* %x, i8** %gep, align 8
   %y = load i8*, i8** %gep
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK: define void @test1e(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test1e(i8* %x) {
@@ -165,22 +165,22 @@ use_allocaB:
 exit:
   %A = phi i8** [ %allocaA, %use_allocaA ], [ %allocaB, %use_allocaB ]
   %gep = getelementptr i8*, i8** %A, i32 2
-  tail call i8* @objc_retain(i8* %x)
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   store i8* %x, i8** %gep, align 8
   %y = load i8*, i8** %gep
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK: define void @test1f(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test1f(i8* %x) {
@@ -188,14 +188,14 @@ entry:
   %allocaOne = alloca i8*
   %allocaTwo = alloca i8*
   %A = select i1 undef, i8** %allocaOne, i8** %allocaTwo
-  tail call i8* @objc_retain(i8* %x)
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   store i8* %x, i8** %A, align 8
   %y = load i8*, i8** %A
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
@@ -204,10 +204,10 @@ entry:
 
 
 ; CHECK: define void @test2a(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test2a(i8* %x) {
@@ -224,20 +224,20 @@ bb2:
   br label %bb3
 
 bb3:
-  tail call i8* @objc_retain(i8* %x)
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK: define void @test2b(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test2b(i8* %x) {
@@ -256,20 +256,20 @@ bb2:
   br label %bb3
 
 bb3:
-  tail call i8* @objc_retain(i8* %x)
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK: define void @test2c(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test2c(i8* %x) {
@@ -279,7 +279,7 @@ entry:
   store i8* %x, i8** %gep1, align 8
   %gep2 = getelementptr i8*, i8** %A, i32 2
   %y = load i8*, i8** %gep2
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   br label %bb1
 
 bb1:
@@ -289,24 +289,24 @@ bb2:
   br label %bb3
 
 bb3:
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK: define void @test2d(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %y)
-; CHECK: @objc_release(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %y)
+; CHECK: @llvm.objc.release(i8* %x)
 ; CHECK: ret void
 ; CHECK: }
 define void @test2d(i8* %x) {
 entry:
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   br label %bb1
 
 bb1:
@@ -328,11 +328,11 @@ bb2:
 bb3:
   %A = phi i8** [ %Abb1, %bb1 ], [ %Abb2, %bb2 ]
   %y = phi i8* [ %ybb1, %bb1 ], [ %ybb2, %bb2 ]
-  tail call i8* @objc_retain(i8* %x)
+  tail call i8* @llvm.objc.retain(i8* %x)
   call void @use_alloca(i8** %A)
-  call void @objc_release(i8* %y), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %y), !clang.imprecise_release !0
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
@@ -344,21 +344,21 @@ bb3:
 
 ; CHECK: define void @test3a() {
 ; CHECK: entry:
-; CHECK:   @objc_retainAutoreleasedReturnValue
-; CHECK:   @objc_retain
-; CHECK:   @objc_retain
-; CHECK:   @objc_retain
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retainAutoreleasedReturnValue
+; CHECK:   @llvm.objc.retain
+; CHECK:   @llvm.objc.retain
+; CHECK:   @llvm.objc.retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: arraydestroy.body:
-; CHECK:   @objc_release
-; CHECK-NOT: @objc_release
+; CHECK:   @llvm.objc.release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: arraydestroy.done:
-; CHECK-NOT: @objc_release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: arraydestroy.body1:
-; CHECK:   @objc_release
-; CHECK-NOT: @objc_release
+; CHECK:   @llvm.objc.release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: arraydestroy.done1:
-; CHECK: @objc_release
+; CHECK: @llvm.objc.release
 ; CHECK: ret void
 ; CHECK: }
 define void @test3a() {
@@ -367,22 +367,22 @@ entry:
   %objs = alloca [2 x i8*], align 16
   
   %call1 = call i8* @returner()
-  %tmp0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call1)
+  %tmp0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call1)
 
   %objs.begin = getelementptr inbounds [2 x i8*], [2 x i8*]* %objs, i64 0, i64 0
-  tail call i8* @objc_retain(i8* %call1)
+  tail call i8* @llvm.objc.retain(i8* %call1)
   store i8* %call1, i8** %objs.begin, align 8
   %objs.elt = getelementptr inbounds [2 x i8*], [2 x i8*]* %objs, i64 0, i64 1
-  tail call i8* @objc_retain(i8* %call1)
+  tail call i8* @llvm.objc.retain(i8* %call1)
   store i8* %call1, i8** %objs.elt
 
   %call2 = call i8* @returner1()
   %call3 = call i8* @returner2()
   %keys.begin = getelementptr inbounds [2 x i8*], [2 x i8*]* %keys, i64 0, i64 0
-  tail call i8* @objc_retain(i8* %call2)
+  tail call i8* @llvm.objc.retain(i8* %call2)
   store i8* %call2, i8** %keys.begin, align 8
   %keys.elt = getelementptr inbounds [2 x i8*], [2 x i8*]* %keys, i64 0, i64 1
-  tail call i8* @objc_retain(i8* %call3)
+  tail call i8* @llvm.objc.retain(i8* %call3)
   store i8* %call3, i8** %keys.elt  
   
   %gep = getelementptr inbounds [2 x i8*], [2 x i8*]* %objs, i64 0, i64 2
@@ -392,7 +392,7 @@ arraydestroy.body:
   %arraydestroy.elementPast = phi i8** [ %gep, %entry ], [ %arraydestroy.element, %arraydestroy.body ]
   %arraydestroy.element = getelementptr inbounds i8*, i8** %arraydestroy.elementPast, i64 -1
   %destroy_tmp = load i8*, i8** %arraydestroy.element, align 8
-  call void @objc_release(i8* %destroy_tmp), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %destroy_tmp), !clang.imprecise_release !0
   %objs_ptr = getelementptr inbounds [2 x i8*], [2 x i8*]* %objs, i64 0, i64 0
   %arraydestroy.cmp = icmp eq i8** %arraydestroy.element, %objs_ptr
   br i1 %arraydestroy.cmp, label %arraydestroy.done, label %arraydestroy.body
@@ -405,13 +405,13 @@ arraydestroy.body1:
   %arraydestroy.elementPast1 = phi i8** [ %gep1, %arraydestroy.done ], [ %arraydestroy.element1, %arraydestroy.body1 ]
   %arraydestroy.element1 = getelementptr inbounds i8*, i8** %arraydestroy.elementPast1, i64 -1
   %destroy_tmp1 = load i8*, i8** %arraydestroy.element1, align 8
-  call void @objc_release(i8* %destroy_tmp1), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %destroy_tmp1), !clang.imprecise_release !0
   %keys_ptr = getelementptr inbounds [2 x i8*], [2 x i8*]* %keys, i64 0, i64 0
   %arraydestroy.cmp1 = icmp eq i8** %arraydestroy.element1, %keys_ptr
   br i1 %arraydestroy.cmp1, label %arraydestroy.done1, label %arraydestroy.body1
 
 arraydestroy.done1:
-  call void @objc_release(i8* %call1), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %call1), !clang.imprecise_release !0
   ret void
 }
 
@@ -422,21 +422,21 @@ arraydestroy.done1:
 
 ; CHECK: define void @test3b() {
 ; CHECK: entry:
-; CHECK:   @objc_retainAutoreleasedReturnValue
-; CHECK:   @objc_retain
-; CHECK:   @objc_retain
-; CHECK:   @objc_retain
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retainAutoreleasedReturnValue
+; CHECK:   @llvm.objc.retain
+; CHECK:   @llvm.objc.retain
+; CHECK:   @llvm.objc.retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: arraydestroy.body:
-; CHECK:   @objc_release
-; CHECK-NOT: @objc_release
+; CHECK:   @llvm.objc.release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: arraydestroy.done:
-; CHECK-NOT: @objc_release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: arraydestroy.body1:
-; CHECK:   @objc_release
-; CHECK-NOT: @objc_release
+; CHECK:   @llvm.objc.release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: arraydestroy.done1:
-; CHECK: @objc_release
+; CHECK: @llvm.objc.release
 ; CHECK: ret void
 ; CHECK: }
 define void @test3b() {
@@ -445,23 +445,23 @@ entry:
   %objs = alloca [2 x i8*], align 16
   
   %call1 = call i8* @returner()
-  %tmp0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call1)
-  %tmp1 = tail call i8* @objc_retain(i8* %call1)
+  %tmp0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call1)
+  %tmp1 = tail call i8* @llvm.objc.retain(i8* %call1)
 
   %objs.begin = getelementptr inbounds [2 x i8*], [2 x i8*]* %objs, i64 0, i64 0
-  tail call i8* @objc_retain(i8* %call1)
+  tail call i8* @llvm.objc.retain(i8* %call1)
   store i8* %call1, i8** %objs.begin, align 8
   %objs.elt = getelementptr inbounds [2 x i8*], [2 x i8*]* %objs, i64 0, i64 1
-  tail call i8* @objc_retain(i8* %call1)
+  tail call i8* @llvm.objc.retain(i8* %call1)
   store i8* %call1, i8** %objs.elt
 
   %call2 = call i8* @returner1()
   %call3 = call i8* @returner2()
   %keys.begin = getelementptr inbounds [2 x i8*], [2 x i8*]* %keys, i64 0, i64 0
-  tail call i8* @objc_retain(i8* %call2)
+  tail call i8* @llvm.objc.retain(i8* %call2)
   store i8* %call2, i8** %keys.begin, align 8
   %keys.elt = getelementptr inbounds [2 x i8*], [2 x i8*]* %keys, i64 0, i64 1
-  tail call i8* @objc_retain(i8* %call3)
+  tail call i8* @llvm.objc.retain(i8* %call3)
   store i8* %call3, i8** %keys.elt  
   
   %gep = getelementptr inbounds [2 x i8*], [2 x i8*]* %objs, i64 0, i64 2
@@ -471,7 +471,7 @@ arraydestroy.body:
   %arraydestroy.elementPast = phi i8** [ %gep, %entry ], [ %arraydestroy.element, %arraydestroy.body ]
   %arraydestroy.element = getelementptr inbounds i8*, i8** %arraydestroy.elementPast, i64 -1
   %destroy_tmp = load i8*, i8** %arraydestroy.element, align 8
-  call void @objc_release(i8* %destroy_tmp), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %destroy_tmp), !clang.imprecise_release !0
   %objs_ptr = getelementptr inbounds [2 x i8*], [2 x i8*]* %objs, i64 0, i64 0
   %arraydestroy.cmp = icmp eq i8** %arraydestroy.element, %objs_ptr
   br i1 %arraydestroy.cmp, label %arraydestroy.done, label %arraydestroy.body
@@ -484,14 +484,14 @@ arraydestroy.body1:
   %arraydestroy.elementPast1 = phi i8** [ %gep1, %arraydestroy.done ], [ %arraydestroy.element1, %arraydestroy.body1 ]
   %arraydestroy.element1 = getelementptr inbounds i8*, i8** %arraydestroy.elementPast1, i64 -1
   %destroy_tmp1 = load i8*, i8** %arraydestroy.element1, align 8
-  call void @objc_release(i8* %destroy_tmp1), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %destroy_tmp1), !clang.imprecise_release !0
   %keys_ptr = getelementptr inbounds [2 x i8*], [2 x i8*]* %keys, i64 0, i64 0
   %arraydestroy.cmp1 = icmp eq i8** %arraydestroy.element1, %keys_ptr
   br i1 %arraydestroy.cmp1, label %arraydestroy.done1, label %arraydestroy.body1
 
 arraydestroy.done1:
-  call void @objc_release(i8* %call1), !clang.imprecise_release !0
-  call void @objc_release(i8* %call1), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %call1), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %call1), !clang.imprecise_release !0
   ret void
 }
 
diff --git a/test/Transforms/ObjCARC/apelim.ll b/test/Transforms/ObjCARC/apelim.ll
index 14412c6fc9c7932723e0c75f0089e837f46723f4..da3a1f4265457bf4ab60382a7127dc8c4ebc4135 100644
--- a/test/Transforms/ObjCARC/apelim.ll
+++ b/test/Transforms/ObjCARC/apelim.ll
@@ -31,25 +31,25 @@ entry:
 ; CHECK: }
 define internal void @_GLOBAL__I_x() {
 entry:
-  %0 = call i8* @objc_autoreleasePoolPush() nounwind
+  %0 = call i8* @llvm.objc.autoreleasePoolPush() nounwind
   call void @__cxx_global_var_init()
-  call void @objc_autoreleasePoolPop(i8* %0) nounwind
+  call void @llvm.objc.autoreleasePoolPop(i8* %0) nounwind
   ret void
 }
 
 ; CHECK: define internal void @_GLOBAL__I_y() {
-; CHECK: %0 = call i8* @objc_autoreleasePoolPush() [[NUW:#[0-9]+]]
-; CHECK: call void @objc_autoreleasePoolPop(i8* %0) [[NUW]]
+; CHECK: %0 = call i8* @llvm.objc.autoreleasePoolPush() [[NUW:#[0-9]+]]
+; CHECK: call void @llvm.objc.autoreleasePoolPop(i8* %0) [[NUW]]
 ; CHECK: }
 define internal void @_GLOBAL__I_y() {
 entry:
-  %0 = call i8* @objc_autoreleasePoolPush() nounwind
+  %0 = call i8* @llvm.objc.autoreleasePoolPush() nounwind
   call void @__dxx_global_var_init()
-  call void @objc_autoreleasePoolPop(i8* %0) nounwind
+  call void @llvm.objc.autoreleasePoolPop(i8* %0) nounwind
   ret void
 }
 
-declare i8* @objc_autoreleasePoolPush()
-declare void @objc_autoreleasePoolPop(i8*)
+declare i8* @llvm.objc.autoreleasePoolPush()
+declare void @llvm.objc.autoreleasePoolPop(i8*)
 
 ; CHECK: attributes #0 = { nounwind }
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index 70b83b9313865e3a02ca2e5e8d48a8d4fee447e9..6524dad4ae371fd51428052e67b533f4fc29e01e 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -2,19 +2,19 @@
 
 target datalayout = "e-p:64:64:64"
 
-declare i8* @objc_retain(i8*)
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare i8* @objc_unsafeClaimAutoreleasedReturnValue(i8*)
-declare void @objc_release(i8*)
-declare i8* @objc_autorelease(i8*)
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare void @objc_autoreleasePoolPop(i8*)
-declare i8* @objc_autoreleasePoolPush()
-declare i8* @objc_retainBlock(i8*)
-
-declare i8* @objc_retainedObject(i8*)
-declare i8* @objc_unretainedObject(i8*)
-declare i8* @objc_unretainedPointer(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare void @llvm.objc.autoreleasePoolPop(i8*)
+declare i8* @llvm.objc.autoreleasePoolPush()
+declare i8* @llvm.objc.retainBlock(i8*)
+
+declare i8* @llvm.objc.retainedObject(i8*)
+declare i8* @llvm.objc.unretainedObject(i8*)
+declare i8* @llvm.objc.unretainedPointer(i8*)
 
 declare void @use_pointer(i8*)
 declare void @callee()
@@ -25,19 +25,19 @@ declare void @bar(i32 ()*)
 
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
-declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...)
 
 ; Simple retain+release pair deletion, with some intervening control
 ; flow and harmless instructions.
 
 ; CHECK: define void @test0_precise(i32* %x, i1 %p) [[NUW:#[0-9]+]] {
-; CHECK: @objc_retain
-; CHECK: @objc_release
+; CHECK: @llvm.objc.retain
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test0_precise(i32* %x, i1 %p) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -52,17 +52,17 @@ f:
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind
+  call void @llvm.objc.release(i8* %c) nounwind
   ret void
 }
 
 ; CHECK: define void @test0_imprecise(i32* %x, i1 %p) [[NUW]] {
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test0_imprecise(i32* %x, i1 %p) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -77,23 +77,23 @@ f:
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %c) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; Like test0 but the release isn't always executed when the retain is,
 ; so the optimization is not safe.
 
-; TODO: Make the objc_release's argument be %0.
+; TODO: Make the llvm.objc.release's argument be %0.
 
 ; CHECK: define void @test1_precise(i32* %x, i1 %p, i1 %q) [[NUW]] {
-; CHECK: @objc_retain(i8* %a)
-; CHECK: @objc_release
+; CHECK: @llvm.objc.retain(i8* %a)
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test1_precise(i32* %x, i1 %p, i1 %q) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -109,7 +109,7 @@ f:
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind
+  call void @llvm.objc.release(i8* %c) nounwind
   ret void
 
 alt_return:
@@ -117,13 +117,13 @@ alt_return:
 }
 
 ; CHECK: define void @test1_imprecise(i32* %x, i1 %p, i1 %q) [[NUW]] {
-; CHECK: @objc_retain(i8* %a)
-; CHECK: @objc_release
+; CHECK: @llvm.objc.retain(i8* %a)
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test1_imprecise(i32* %x, i1 %p, i1 %q) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -139,7 +139,7 @@ f:
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %c) nounwind, !clang.imprecise_release !0
   ret void
 
 alt_return:
@@ -151,15 +151,15 @@ alt_return:
 
 ; CHECK: define void @test1b_precise(i8* %x, i1 %p, i1 %q) {
 ; CHECK: entry:
-; CHECK:   tail call i8* @objc_retain(i8* %x) [[NUW]]
-; CHECK-NOT: @objc_
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
+; CHECK-NOT: @llvm.objc.
 ; CHECK: if.end5:
-; CHECK:   tail call void @objc_release(i8* %x) [[NUW]]
-; CHECK-NOT: @objc_
+; CHECK:   tail call void @llvm.objc.release(i8* %x) [[NUW]]
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test1b_precise(i8* %x, i1 %p, i1 %q) {
 entry:
-  tail call i8* @objc_retain(i8* %x) nounwind
+  tail call i8* @llvm.objc.retain(i8* %x) nounwind
   br i1 %p, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
@@ -174,21 +174,21 @@ if.then3:                                         ; preds = %if.end
   br label %if.end5
 
 if.end5:                                          ; preds = %if.then3, %if.end
-  tail call void @objc_release(i8* %x) nounwind
+  tail call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test1b_imprecise(
 ; CHECK: entry:
-; CHECK:   tail call i8* @objc_retain(i8* %x) [[NUW:#[0-9]+]]
-; CHECK-NOT: @objc_
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %x) [[NUW:#[0-9]+]]
+; CHECK-NOT: @llvm.objc.
 ; CHECK: if.end5:
-; CHECK:   tail call void @objc_release(i8* %x) [[NUW]], !clang.imprecise_release ![[RELEASE:[0-9]+]]
-; CHECK-NOT: @objc_
+; CHECK:   tail call void @llvm.objc.release(i8* %x) [[NUW]], !clang.imprecise_release ![[RELEASE:[0-9]+]]
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test1b_imprecise(i8* %x, i1 %p, i1 %q) {
 entry:
-  tail call i8* @objc_retain(i8* %x) nounwind
+  tail call i8* @llvm.objc.retain(i8* %x) nounwind
   br i1 %p, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
@@ -203,7 +203,7 @@ if.then3:                                         ; preds = %if.end
   br label %if.end5
 
 if.end5:                                          ; preds = %if.then3, %if.end
-  tail call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -212,13 +212,13 @@ if.end5:                                          ; preds = %if.then3, %if.end
 ; so the optimization is not safe.
 
 ; CHECK-LABEL: define void @test2_precise(
-; CHECK: @objc_retain(i8* %a)
-; CHECK: @objc_release
+; CHECK: @llvm.objc.retain(i8* %a)
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test2_precise(i32* %x, i1 %p) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -236,18 +236,18 @@ f:
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind
+  call void @llvm.objc.release(i8* %c) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test2_imprecise(
-; CHECK: @objc_retain(i8* %a)
-; CHECK: @objc_release
+; CHECK: @llvm.objc.retain(i8* %a)
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test2_imprecise(i32* %x, i1 %p) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -265,7 +265,7 @@ f:
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %c) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -275,18 +275,18 @@ return:
 ; TODO: For now, assume this can't happen.
 
 ; CHECK-LABEL: define void @test3_precise(
-; TODO: @objc_retain(i8* %a)
-; TODO: @objc_release
+; TODO: @llvm.objc.retain(i8* %a)
+; TODO: @llvm.objc.release
 ; CHECK: }
 define void @test3_precise(i32* %x, i1* %q) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind
+  call void @llvm.objc.release(i8* %c) nounwind
   %j = load volatile i1, i1* %q
   br i1 %j, label %loop, label %return
 
@@ -295,18 +295,18 @@ return:
 }
 
 ; CHECK-LABEL: define void @test3_imprecise(
-; TODO: @objc_retain(i8* %a)
-; TODO: @objc_release
+; TODO: @llvm.objc.retain(i8* %a)
+; TODO: @llvm.objc.release
 ; CHECK: }
 define void @test3_imprecise(i32* %x, i1* %q) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %c) nounwind, !clang.imprecise_release !0
   %j = load volatile i1, i1* %q
   br i1 %j, label %loop, label %return
 
@@ -321,8 +321,8 @@ return:
 ; so the optimization is not safe.
 
 ; CHECK-LABEL: define void @test4_precise(
-; TODO: @objc_retain(i8* %a)
-; TODO: @objc_release
+; TODO: @llvm.objc.retain(i8* %a)
+; TODO: @llvm.objc.release
 ; CHECK: }
 define void @test4_precise(i32* %x, i1* %q) nounwind {
 entry:
@@ -330,19 +330,19 @@ entry:
 
 loop:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   %j = load volatile i1, i1* %q
   br i1 %j, label %loop, label %return
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind
+  call void @llvm.objc.release(i8* %c) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test4_imprecise(
-; TODO: @objc_retain(i8* %a)
-; TODO: @objc_release
+; TODO: @llvm.objc.retain(i8* %a)
+; TODO: @llvm.objc.release
 ; CHECK: }
 define void @test4_imprecise(i32* %x, i1* %q) nounwind {
 entry:
@@ -350,13 +350,13 @@ entry:
 
 loop:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   %j = load volatile i1, i1* %q
   br i1 %j, label %loop, label %return
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %c) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -365,34 +365,34 @@ return:
 ; so the optimization is not safe.
 
 ; CHECK-LABEL: define void @test5a(
-; CHECK: @objc_retain(i8*
-; CHECK: @objc_release
+; CHECK: @llvm.objc.retain(i8*
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test5a(i32* %x, i1 %q, i8* %y) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   %s = select i1 %q, i8* %y, i8* %0
   call void @use_pointer(i8* %s)
   store i32 7, i32* %x
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind
+  call void @llvm.objc.release(i8* %c) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test5b(
-; CHECK: @objc_retain(i8*
-; CHECK: @objc_release
+; CHECK: @llvm.objc.retain(i8*
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test5b(i32* %x, i1 %q, i8* %y) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   %s = select i1 %q, i8* %y, i8* %0
   call void @use_pointer(i8* %s)
   store i32 7, i32* %x
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %c) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -402,17 +402,17 @@ entry:
 
 ; CHECK-LABEL: define void @test6a(
 ; CHECK: entry:
-; CHECK:   tail call i8* @objc_retain(
+; CHECK:   tail call i8* @llvm.objc.retain
 ; CHECK: t:
-; CHECK:   call void @objc_release(
+; CHECK:   call void @llvm.objc.release
 ; CHECK: f:
-; CHECK:   call void @objc_release(
+; CHECK:   call void @llvm.objc.release
 ; CHECK: return:
 ; CHECK: }
 define void @test6a(i32* %x, i1 %p) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -420,14 +420,14 @@ t:
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   %ct = bitcast i32* %x to i8*
-  call void @objc_release(i8* %ct) nounwind
+  call void @llvm.objc.release(i8* %ct) nounwind
   br label %return
 
 f:
   store i32 7, i32* %x
   call void @callee()
   %cf = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cf) nounwind
+  call void @llvm.objc.release(i8* %cf) nounwind
   br label %return
 
 return:
@@ -435,12 +435,12 @@ return:
 }
 
 ; CHECK-LABEL: define void @test6b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test6b(i32* %x, i1 %p) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -448,14 +448,14 @@ t:
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   %ct = bitcast i32* %x to i8*
-  call void @objc_release(i8* %ct) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %ct) nounwind, !clang.imprecise_release !0
   br label %return
 
 f:
   store i32 7, i32* %x
   call void @callee()
   %cf = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cf) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %cf) nounwind, !clang.imprecise_release !0
   br label %return
 
 return:
@@ -464,17 +464,17 @@ return:
 
 ; CHECK-LABEL: define void @test6c(
 ; CHECK: entry:
-; CHECK:   tail call i8* @objc_retain(
+; CHECK:   tail call i8* @llvm.objc.retain
 ; CHECK: t:
-; CHECK:   call void @objc_release(
+; CHECK:   call void @llvm.objc.release
 ; CHECK: f:
-; CHECK:   call void @objc_release(
+; CHECK:   call void @llvm.objc.release
 ; CHECK: return:
 ; CHECK: }
 define void @test6c(i32* %x, i1 %p) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -482,14 +482,14 @@ t:
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   %ct = bitcast i32* %x to i8*
-  call void @objc_release(i8* %ct) nounwind
+  call void @llvm.objc.release(i8* %ct) nounwind
   br label %return
 
 f:
   store i32 7, i32* %x
   call void @callee()
   %cf = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cf) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %cf) nounwind, !clang.imprecise_release !0
   br label %return
 
 return:
@@ -498,17 +498,17 @@ return:
 
 ; CHECK-LABEL: define void @test6d(
 ; CHECK: entry:
-; CHECK:   tail call i8* @objc_retain(
+; CHECK:   tail call i8* @llvm.objc.retain
 ; CHECK: t:
-; CHECK:   call void @objc_release(
+; CHECK:   call void @llvm.objc.release
 ; CHECK: f:
-; CHECK:   call void @objc_release(
+; CHECK:   call void @llvm.objc.release
 ; CHECK: return:
 ; CHECK: }
 define void @test6d(i32* %x, i1 %p) nounwind {
 entry:
   %a = bitcast i32* %x to i8*
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   br i1 %p, label %t, label %f
 
 t:
@@ -516,14 +516,14 @@ t:
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   %ct = bitcast i32* %x to i8*
-  call void @objc_release(i8* %ct) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %ct) nounwind, !clang.imprecise_release !0
   br label %return
 
 f:
   store i32 7, i32* %x
   call void @callee()
   %cf = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cf) nounwind
+  call void @llvm.objc.release(i8* %cf) nounwind
   br label %return
 
 return:
@@ -536,13 +536,13 @@ return:
 
 ; CHECK-LABEL:     define void @test7(
 ; CHECK:     entry:
-; CHECK-NOT:   objc_
+; CHECK-NOT:   llvm.objc.
 ; CHECK:     t:
-; CHECK:       call i8* @objc_retain
+; CHECK:       call i8* @llvm.objc.retain
 ; CHECK:     f:
-; CHECK:       call i8* @objc_retain
+; CHECK:       call i8* @llvm.objc.retain
 ; CHECK:     return:
-; CHECK:       call void @objc_release
+; CHECK:       call void @llvm.objc.release
 ; CHECK: }
 define void @test7(i32* %x, i1 %p) nounwind {
 entry:
@@ -550,26 +550,26 @@ entry:
   br i1 %p, label %t, label %f
 
 t:
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i8 3, i8* %a
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   br label %return
 
 f:
-  %1 = call i8* @objc_retain(i8* %a) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i32 7, i32* %x
   call void @callee()
   br label %return
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind
+  call void @llvm.objc.release(i8* %c) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test7b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test7b(i32* %x, i1 %p) nounwind {
 entry:
@@ -577,21 +577,21 @@ entry:
   br i1 %p, label %t, label %f
 
 t:
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i8 3, i8* %a
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   br label %return
 
 f:
-  %1 = call i8* @objc_retain(i8* %a) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i32 7, i32* %x
   call void @callee()
   br label %return
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %c) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -599,11 +599,11 @@ return:
 
 ; CHECK-LABEL: define void @test7c(
 ; CHECK: t:
-; CHECK:   call i8* @objc_retainBlock
+; CHECK:   call i8* @llvm.objc.retainBlock
 ; CHECK: f:
-; CHECK:   call i8* @objc_retain
+; CHECK:   call i8* @llvm.objc.retain
 ; CHECK: return:
-; CHECK:   call void @objc_release
+; CHECK:   call void @llvm.objc.release
 ; CHECK: }
 define void @test7c(i32* %x, i1 %p) nounwind {
 entry:
@@ -611,21 +611,21 @@ entry:
   br i1 %p, label %t, label %f
 
 t:
-  %0 = call i8* @objc_retainBlock(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retainBlock(i8* %a) nounwind
   store i8 3, i8* %a
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   br label %return
 
 f:
-  %1 = call i8* @objc_retain(i8* %a) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i32 7, i32* %x
   call void @callee()
   br label %return
 
 return:
   %c = bitcast i32* %x to i8*
-  call void @objc_release(i8* %c) nounwind
+  call void @llvm.objc.release(i8* %c) nounwind
   ret void
 }
 
@@ -635,14 +635,14 @@ return:
 ; CHECK-LABEL: define void @test8a(
 ; CHECK: entry:
 ; CHECK: t:
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: f:
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: mid:
 ; CHECK: u:
-; CHECK:   @objc_release
+; CHECK:   @llvm.objc.release
 ; CHECK: g:
-; CHECK:   @objc_release
+; CHECK:   @llvm.objc.release
 ; CHECK: return:
 ; CHECK: }
 define void @test8a(i32* %x, i1 %p, i1 %q) nounwind {
@@ -651,14 +651,14 @@ entry:
   br i1 %p, label %t, label %f
 
 t:
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i8 3, i8* %a
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   br label %mid
 
 f:
-  %1 = call i8* @objc_retain(i8* %a) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i32 7, i32* %x
   br label %mid
 
@@ -668,12 +668,12 @@ mid:
 u:
   call void @callee()
   %cu = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cu) nounwind
+  call void @llvm.objc.release(i8* %cu) nounwind
   br label %return
 
 g:
   %cg = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cg) nounwind
+  call void @llvm.objc.release(i8* %cg) nounwind
   br label %return
 
 return:
@@ -681,7 +681,7 @@ return:
 }
 
 ; CHECK-LABEL: define void @test8b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test8b(i32* %x, i1 %p, i1 %q) nounwind {
 entry:
@@ -689,14 +689,14 @@ entry:
   br i1 %p, label %t, label %f
 
 t:
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i8 3, i8* %a
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   br label %mid
 
 f:
-  %1 = call i8* @objc_retain(i8* %a) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i32 7, i32* %x
   br label %mid
 
@@ -706,12 +706,12 @@ mid:
 u:
   call void @callee()
   %cu = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cu) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %cu) nounwind, !clang.imprecise_release !0
   br label %return
 
 g:
   %cg = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cg) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %cg) nounwind, !clang.imprecise_release !0
   br label %return
 
 return:
@@ -721,14 +721,14 @@ return:
 ; CHECK-LABEL: define void @test8c(
 ; CHECK: entry:
 ; CHECK: t:
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: f:
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: mid:
 ; CHECK: u:
-; CHECK:   @objc_release
+; CHECK:   @llvm.objc.release
 ; CHECK: g:
-; CHECK:   @objc_release
+; CHECK:   @llvm.objc.release
 ; CHECK: return:
 ; CHECK: }
 define void @test8c(i32* %x, i1 %p, i1 %q) nounwind {
@@ -737,14 +737,14 @@ entry:
   br i1 %p, label %t, label %f
 
 t:
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i8 3, i8* %a
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   br label %mid
 
 f:
-  %1 = call i8* @objc_retain(i8* %a) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i32 7, i32* %x
   br label %mid
 
@@ -754,12 +754,12 @@ mid:
 u:
   call void @callee()
   %cu = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cu) nounwind
+  call void @llvm.objc.release(i8* %cu) nounwind
   br label %return
 
 g:
   %cg = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cg) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %cg) nounwind, !clang.imprecise_release !0
   br label %return
 
 return:
@@ -769,14 +769,14 @@ return:
 ; CHECK-LABEL: define void @test8d(
 ; CHECK: entry:
 ; CHECK: t:
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: f:
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: mid:
 ; CHECK: u:
-; CHECK:   @objc_release
+; CHECK:   @llvm.objc.release
 ; CHECK: g:
-; CHECK:   @objc_release
+; CHECK:   @llvm.objc.release
 ; CHECK: return:
 ; CHECK: }
 define void @test8d(i32* %x, i1 %p, i1 %q) nounwind {
@@ -785,14 +785,14 @@ entry:
   br i1 %p, label %t, label %f
 
 t:
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i8 3, i8* %a
   %b = bitcast i32* %x to float*
   store float 2.0, float* %b
   br label %mid
 
 f:
-  %1 = call i8* @objc_retain(i8* %a) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %a) nounwind
   store i32 7, i32* %x
   br label %mid
 
@@ -802,12 +802,12 @@ mid:
 u:
   call void @callee()
   %cu = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cu) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %cu) nounwind, !clang.imprecise_release !0
   br label %return
 
 g:
   %cg = bitcast i32* %x to i8*
-  call void @objc_release(i8* %cg) nounwind
+  call void @llvm.objc.release(i8* %cg) nounwind
   br label %return
 
 return:
@@ -817,58 +817,58 @@ return:
 ; Trivial retain+release pair deletion.
 
 ; CHECK-LABEL: define void @test9(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test9(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retain(i8* %x) nounwind
-  call void @objc_release(i8* %0) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %0) nounwind
   ret void
 }
 
 ; Retain+release pair, but on an unknown pointer relationship. Don't delete!
 
 ; CHECK-LABEL: define void @test9b(
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_release(i8* %s)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.release(i8* %s)
 ; CHECK: }
 define void @test9b(i8* %x, i1 %j, i8* %p) nounwind {
 entry:
-  %0 = call i8* @objc_retain(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
   %s = select i1 %j, i8* %x, i8* %p
-  call void @objc_release(i8* %s) nounwind
+  call void @llvm.objc.release(i8* %s) nounwind
   ret void
 }
 
 ; Trivial retain+release pair with intervening calls - don't delete!
 
 ; CHECK-LABEL: define void @test10(
-; CHECK: @objc_retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
 ; CHECK: @callee
 ; CHECK: @use_pointer
-; CHECK: @objc_release
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test10(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retain(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @callee()
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %0) nounwind
+  call void @llvm.objc.release(i8* %0) nounwind
   ret void
 }
 
 ; Trivial retain+autoreleaserelease pair. Don't delete!
-; Also, add a tail keyword, since objc_retain can never be passed
+; Also, add a tail keyword, since llvm.objc.retain can never be passed
 ; a stack argument.
 
 ; CHECK-LABEL: define void @test11(
-; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW]]
-; CHECK: call i8* @objc_autorelease(i8* %0) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
+; CHECK: call i8* @llvm.objc.autorelease(i8* %0) [[NUW]]
 ; CHECK: }
 define void @test11(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_autorelease(i8* %0) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %0) nounwind
   call void @use_pointer(i8* %x)
   ret void
 }
@@ -881,8 +881,8 @@ entry:
 ; CHECK: }
 define void @test11a(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_autorelease(i8* %0) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %0) nounwind
   ret void
 }
 
@@ -891,13 +891,13 @@ entry:
 ; want it to be in the autorelease pool.
 
 ; CHECK-LABEL: define i8* @test11b(
-; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW]]
-; CHECK: call i8* @objc_autorelease(i8* %0) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
+; CHECK: call i8* @llvm.objc.autorelease(i8* %0) [[NUW]]
 ; CHECK: }
 define i8* @test11b(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_autorelease(i8* %0) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %0) nounwind
   ret i8* %x
 }
 
@@ -906,34 +906,34 @@ entry:
 
 ; CHECK-LABEL: define void @test12(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: @objc_retain(i8* %x)
-; CHECK-NEXT: @objc_retain
-; CHECK: @objc_release
+; CHECK-NEXT: @llvm.objc.retain(i8* %x)
+; CHECK-NEXT: @llvm.objc.retain
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test12(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
 ; Trivial retain,autorelease pair. Don't delete!
 
 ; CHECK-LABEL: define void @test13(
-; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW]]
-; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
 ; CHECK: @use_pointer(i8* %x)
-; CHECK: call i8* @objc_autorelease(i8* %x) [[NUW]]
+; CHECK: call i8* @llvm.objc.autorelease(i8* %x) [[NUW]]
 ; CHECK: }
 define void @test13(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
-  call i8* @objc_autorelease(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %x) nounwind
   ret void
 }
 
@@ -941,22 +941,22 @@ entry:
 
 ; CHECK-LABEL: define void @test13b(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: @objc_retain(i8* %x)
+; CHECK-NEXT: @llvm.objc.retain(i8* %x)
 ; CHECK-NEXT: @use_pointer
 ; CHECK-NEXT: @use_pointer
 ; CHECK-NEXT: @use_pointer
-; CHECK-NEXT: @objc_release
+; CHECK-NEXT: @llvm.objc.release
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test13b(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
@@ -964,20 +964,20 @@ entry:
 ; autoreleasePoolPop in the way.
 
 ; CHECK-LABEL: define void @test13c(
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc_autoreleasePoolPop
-; CHECK: @objc_retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc.autoreleasePoolPop
+; CHECK: @llvm.objc.retain(i8* %x)
 ; CHECK: @use_pointer
-; CHECK: @objc_release
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test13c(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call void @objc_autoreleasePoolPop(i8* undef)
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call void @llvm.objc.autoreleasePoolPop(i8* undef)
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
@@ -986,24 +986,24 @@ entry:
 
 ; CHECK-LABEL: define void @test13d(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: @objc_retain(i8* %x)
-; CHECK-NEXT: @objc_autoreleasePoolPush
+; CHECK-NEXT: @llvm.objc.retain(i8* %x)
+; CHECK-NEXT: @llvm.objc.autoreleasePoolPush
 ; CHECK-NEXT: @use_pointer
 ; CHECK-NEXT: @use_pointer
 ; CHECK-NEXT: @use_pointer
-; CHECK-NEXT: @objc_release
+; CHECK-NEXT: @llvm.objc.release
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test13d(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_autoreleasePoolPush()
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.autoreleasePoolPush()
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
@@ -1013,20 +1013,20 @@ entry:
 
 ; CHECK-LABEL: define void @test14(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: @objc_retain
+; CHECK-NEXT: @llvm.objc.retain
 ; CHECK-NEXT: @use_pointer
 ; CHECK-NEXT: @use_pointer
-; CHECK-NEXT: @objc_release
-; CHECK-NEXT: @objc_release
+; CHECK-NEXT: @llvm.objc.release
+; CHECK-NEXT: @llvm.objc.release
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test14(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
@@ -1035,18 +1035,18 @@ entry:
 
 ; CHECK-LABEL: define void @test15(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: @objc_retain(i8* %x)
+; CHECK-NEXT: @llvm.objc.retain(i8* %x)
 ; CHECK-NEXT: @use_pointer
-; CHECK-NEXT: @objc_autorelease(i8* %x)
-; CHECK-NEXT: @objc_release
+; CHECK-NEXT: @llvm.objc.autorelease(i8* %x)
+; CHECK-NEXT: @llvm.objc.release
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test15(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
-  call i8* @objc_autorelease(i8* %x) nounwind
-  call void @objc_release(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
@@ -1055,52 +1055,52 @@ entry:
 
 ; CHECK-LABEL: define void @test15b(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: @objc_retain
-; CHECK-NEXT: @objc_autorelease
-; CHECK-NEXT: @objc_release
+; CHECK-NEXT: @llvm.objc.retain
+; CHECK-NEXT: @llvm.objc.autorelease
+; CHECK-NEXT: @llvm.objc.release
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test15b(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_autorelease(i8* %x) nounwind
-  call void @objc_release(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test15c(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: @objc_autorelease
+; CHECK-NEXT: @llvm.objc.autorelease
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test15c(i8* %x, i64 %n) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_autorelease(i8* %x) nounwind
-  call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; Retain+release pairs in diamonds, all dominated by a retain.
 
 ; CHECK-LABEL: define void @test16a(
-; CHECK: @objc_retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
 ; CHECK-NOT: @objc
 ; CHECK: purple:
 ; CHECK: @use_pointer
-; CHECK: @objc_release
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test16a(i1 %a, i1 %b, i8* %x) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br i1 %a, label %red, label %orange
 
 red:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br label %yellow
 
 orange:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br label %yellow
 
 yellow:
@@ -1109,38 +1109,38 @@ yellow:
   br i1 %b, label %green, label %blue
 
 green:
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   br label %purple
 
 blue:
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   br label %purple
 
 purple:
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test16b(
-; CHECK: @objc_retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
 ; CHECK-NOT: @objc
 ; CHECK: purple:
 ; CHECK-NEXT: @use_pointer
 ; CHECK-NEXT: @use_pointer
-; CHECK-NEXT: @objc_release
+; CHECK-NEXT: @llvm.objc.release
 ; CHECK: }
 define void @test16b(i1 %a, i1 %b, i8* %x) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br i1 %a, label %red, label %orange
 
 red:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br label %yellow
 
 orange:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br label %yellow
 
 yellow:
@@ -1149,38 +1149,38 @@ yellow:
   br i1 %b, label %green, label %blue
 
 green:
-  call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   br label %purple
 
 blue:
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   br label %purple
 
 purple:
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test16c(
-; CHECK: @objc_retain(i8* %x)
+; CHECK: @llvm.objc.retain(i8* %x)
 ; CHECK-NOT: @objc
 ; CHECK: purple:
 ; CHECK: @use_pointer
-; CHECK: @objc_release
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test16c(i1 %a, i1 %b, i8* %x) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br i1 %a, label %red, label %orange
 
 red:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br label %yellow
 
 orange:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br label %yellow
 
 yellow:
@@ -1189,34 +1189,34 @@ yellow:
   br i1 %b, label %green, label %blue
 
 green:
-  call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   br label %purple
 
 blue:
-  call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   br label %purple
 
 purple:
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK-LABEL: define void @test16d(
-; CHECK: @objc_retain(i8* %x)
-; CHECK: @objc
+; CHECK: @llvm.objc.retain(i8* %x)
+; CHECK: @llvm.objc
 ; CHECK: }
 define void @test16d(i1 %a, i1 %b, i8* %x) {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br i1 %a, label %red, label %orange
 
 red:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br label %yellow
 
 orange:
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   br label %yellow
 
 yellow:
@@ -1225,11 +1225,11 @@ yellow:
   br i1 %b, label %green, label %blue
 
 green:
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   br label %purple
 
 blue:
-  call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   br label %purple
 
 purple:
@@ -1239,24 +1239,24 @@ purple:
 ; Delete no-ops.
 
 ; CHECK-LABEL: define void @test18(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test18() {
-  call i8* @objc_retain(i8* null)
-  call void @objc_release(i8* null)
-  call i8* @objc_autorelease(i8* null)
+  call i8* @llvm.objc.retain(i8* null)
+  call void @llvm.objc.release(i8* null)
+  call i8* @llvm.objc.autorelease(i8* null)
   ret void
 }
 
 ; Delete no-ops where undef can be assumed to be null.
 
 ; CHECK-LABEL: define void @test18b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test18b() {
-  call i8* @objc_retain(i8* undef)
-  call void @objc_release(i8* undef)
-  call i8* @objc_autorelease(i8* undef)
+  call i8* @llvm.objc.retain(i8* undef)
+  call void @llvm.objc.release(i8* undef)
+  call i8* @llvm.objc.autorelease(i8* undef)
   ret void
 }
 
@@ -1266,34 +1266,34 @@ define void @test18b() {
 ; CHECK: define void @test19(i32* %y) {
 ; CHECK:   %z = bitcast i32* %y to i8*
 ; CHECK:   %0 = bitcast i32* %y to i8*
-; CHECK:   %1 = tail call i8* @objc_retain(i8* %0)
+; CHECK:   %1 = tail call i8* @llvm.objc.retain(i8* %0)
 ; CHECK:   call void @use_pointer(i8* %z)
 ; CHECK:   call void @use_pointer(i8* %z)
 ; CHECK:   %2 = bitcast i32* %y to i8*
-; CHECK:   call void @objc_release(i8* %2)
+; CHECK:   call void @llvm.objc.release(i8* %2)
 ; CHECK:   ret void
 ; CHECK: }
 define void @test19(i32* %y) {
 entry:
   %x = bitcast i32* %y to i8*
-  %0 = call i8* @objc_retain(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
   %z = bitcast i32* %y to i8*
   call void @use_pointer(i8* %z)
   call void @use_pointer(i8* %z)
-  call void @objc_release(i8* %x)
+  call void @llvm.objc.release(i8* %x)
   ret void
 }
 
 ; Bitcast insertion
 
 ; CHECK-LABEL: define void @test20(
-; CHECK: %tmp1 = tail call i8* @objc_retain(i8* %tmp) [[NUW]]
+; CHECK: %tmp1 = tail call i8* @llvm.objc.retain(i8* %tmp) [[NUW]]
 ; CHECK-NEXT: invoke
 ; CHECK: }
 define void @test20(double* %self) personality i32 (...)* @__gxx_personality_v0 {
 if.then12:
   %tmp = bitcast double* %self to i8*
-  %tmp1 = call i8* @objc_retain(i8* %tmp) nounwind
+  %tmp1 = call i8* @llvm.objc.retain(i8* %tmp) nounwind
   invoke void @invokee()
           to label %invoke.cont23 unwind label %lpad20
 
@@ -1321,8 +1321,8 @@ if.end:                                           ; preds = %invoke.cont23
 define i8* @test21() {
 entry:
   %call = call i8* @returner()
-  %0 = call i8* @objc_retain(i8* %call) nounwind
-  %1 = call i8* @objc_autorelease(i8* %0) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %call) nounwind
+  %1 = call i8* @llvm.objc.autorelease(i8* %0) nounwind
   ret i8* %1
 }
 
@@ -1331,10 +1331,10 @@ entry:
 ; CHECK-LABEL: define void @test22(
 ; CHECK: B:
 ; CHECK:   %1 = bitcast double* %p to i8*
-; CHECK:   call void @objc_release(i8* %1)
+; CHECK:   call void @llvm.objc.release(i8* %1)
 ; CHECK:   br label %C
 ; CHECK: C:                                                ; preds = %B, %A
-; CHECK-NOT: @objc_release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: }
 define void @test22(double* %p, i1 %a) {
   br i1 %a, label %A, label %B
@@ -1345,16 +1345,16 @@ B:
 C:
   %h = phi double* [ null, %A ], [ %p, %B ]
   %c = bitcast double* %h to i8*
-  call void @objc_release(i8* %c), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %c), !clang.imprecise_release !0
   ret void
 }
 
-; Do not move an objc_release that doesn't have the clang.imprecise_release tag.
+; Do not move an llvm.objc.release that doesn't have the clang.imprecise_release tag.
 
 ; CHECK-LABEL: define void @test22_precise(
 ; CHECK: %[[P0:.*]] = phi double*
 ; CHECK: %[[V0:.*]] = bitcast double* %[[P0]] to i8*
-; CHECK: call void @objc_release(i8* %[[V0]])
+; CHECK: call void @llvm.objc.release(i8* %[[V0]])
 ; CHECK: ret void
 define void @test22_precise(double* %p, i1 %a) {
   br i1 %a, label %A, label %B
@@ -1365,21 +1365,21 @@ B:
 C:
   %h = phi double* [ null, %A ], [ %p, %B ]
   %c = bitcast double* %h to i8*
-  call void @objc_release(i8* %c)
+  call void @llvm.objc.release(i8* %c)
   ret void
 }
 
 ; Any call can decrement a retain count.
 
 ; CHECK-LABEL: define void @test24(
-; CHECK: @objc_retain(i8* %a)
-; CHECK: @objc_release
+; CHECK: @llvm.objc.retain(i8* %a)
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test24(i8* %r, i8* %a) {
-  call i8* @objc_retain(i8* %a)
+  call i8* @llvm.objc.retain(i8* %a)
   call void @use_pointer(i8* %r)
   %q = load i8, i8* %a
-  call void @objc_release(i8* %a)
+  call void @llvm.objc.release(i8* %a)
   ret void
 }
 
@@ -1388,14 +1388,14 @@ define void @test24(i8* %r, i8* %a) {
 
 ; CHECK-LABEL: define void @test25(
 ; CHECK: entry:
-; CHECK:   call i8* @objc_retain(i8* %p)
+; CHECK:   call i8* @llvm.objc.retain(i8* %p)
 ; CHECK: true:
 ; CHECK: done:
-; CHECK:   call void @objc_release(i8* %p)
+; CHECK:   call void @llvm.objc.release(i8* %p)
 ; CHECK: }
 define void @test25(i8* %p, i1 %x) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   br i1 %x, label %true, label %done
 
@@ -1404,7 +1404,7 @@ true:
   br label %done
 
 done:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -1413,14 +1413,14 @@ done:
 
 ; CHECK-LABEL: define void @test26(
 ; CHECK: entry:
-; CHECK:   call i8* @objc_retain(i8* %p)
+; CHECK:   call i8* @llvm.objc.retain(i8* %p)
 ; CHECK: true:
 ; CHECK: done:
-; CHECK:   call void @objc_release(i8* %p)
+; CHECK:   call void @llvm.objc.release(i8* %p)
 ; CHECK: }
 define void @test26(i8* %p, i1 %x) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1429,7 +1429,7 @@ true:
 
 done:
   store i8 0, i8* %p
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -1437,15 +1437,15 @@ done:
 
 ; CHECK-LABEL: define void @test27(
 ; CHECK: entry:
-; CHECK: call i8* @objc_retain(i8* %p)
+; CHECK: call i8* @llvm.objc.retain(i8* %p)
 ; CHECK: loop:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: done:
-; CHECK: call void @objc_release
+; CHECK: call void @llvm.objc.release
 ; CHECK: }
 define void @test27(i8* %p, i1 %x, i1 %y) {
 entry: 
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %loop, label %done
 
 loop:
@@ -1454,25 +1454,25 @@ loop:
   br i1 %y, label %done, label %loop
   
 done: 
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; Trivial code motion case: Triangle.
 
 ; CHECK-LABEL: define void @test28(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: true:
-; CHECK: call i8* @objc_retain(
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: call void @callee()
 ; CHECK: store
-; CHECK: call void @objc_release
+; CHECK: call void @llvm.objc.release
 ; CHECK: done:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test28(i8* %p, i1 %x) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1481,7 +1481,7 @@ true:
   br label %done
 
 done:
-  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %p), !clang.imprecise_release !0
   ret void
 }
 
@@ -1489,19 +1489,19 @@ done:
 ; unrelated memory references!
 
 ; CHECK-LABEL: define void @test28b(
-; CHECK: call i8* @objc_retain(
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: true:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: call void @callee()
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: store
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: done:
-; CHECK: @objc_release
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test28b(i8* %p, i1 %x, i8* noalias %t) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1511,7 +1511,7 @@ true:
 
 done:
   store i8 0, i8* %t
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -1519,18 +1519,18 @@ done:
 ; unrelated memory references! And preserve the metadata.
 
 ; CHECK-LABEL: define void @test28c(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: true:
-; CHECK: call i8* @objc_retain(
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: call void @callee()
 ; CHECK: store
-; CHECK: call void @objc_release(i8* %p) [[NUW]], !clang.imprecise_release
+; CHECK: call void @llvm.objc.release(i8* %p) [[NUW]], !clang.imprecise_release
 ; CHECK: done:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test28c(i8* %p, i1 %x, i8* noalias %t) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1540,28 +1540,28 @@ true:
 
 done:
   store i8 0, i8* %t
-  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %p), !clang.imprecise_release !0
   ret void
 }
 
 ; Like test28. but with two releases.
 
 ; CHECK-LABEL: define void @test29(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: true:
-; CHECK: call i8* @objc_retain(
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: call void @callee()
 ; CHECK: store
-; CHECK: call void @objc_release
-; CHECK-NOT: @objc_release
+; CHECK: call void @llvm.objc.release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: done:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: ohno:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test29(i8* %p, i1 %x, i1 %y) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1570,11 +1570,11 @@ true:
   br i1 %y, label %done, label %ohno
 
 done:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 
 ohno:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -1582,23 +1582,23 @@ ohno:
 ; with an extra release.
 
 ; CHECK-LABEL: define void @test30(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: true:
-; CHECK: call i8* @objc_retain(
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: call void @callee()
 ; CHECK: store
-; CHECK: call void @objc_release
-; CHECK-NOT: @objc_release
+; CHECK: call void @llvm.objc.release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: false:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: done:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: ohno:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test30(i8* %p, i1 %x, i1 %y, i1 %z) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %false
 
 true:
@@ -1610,58 +1610,58 @@ false:
   br i1 %z, label %done, label %ohno
 
 done:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 
 ohno:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; Basic case with a mergeable release.
 
 ; CHECK-LABEL: define void @test31(
-; CHECK: call i8* @objc_retain(i8* %p)
+; CHECK: call i8* @llvm.objc.retain(i8* %p)
 ; CHECK: call void @callee()
 ; CHECK: store
-; CHECK: call void @objc_release
-; CHECK-NOT: @objc_release
+; CHECK: call void @llvm.objc.release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: true:
-; CHECK-NOT: @objc_release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: false:
-; CHECK-NOT: @objc_release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: ret void
-; CHECK-NOT: @objc_release
+; CHECK-NOT: @llvm.objc.release
 ; CHECK: }
 define void @test31(i8* %p, i1 %x) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   store i8 0, i8* %p
   br i1 %x, label %true, label %false
 true:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 false:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; Don't consider bitcasts or getelementptrs direct uses.
 
 ; CHECK-LABEL: define void @test32(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: true:
-; CHECK: call i8* @objc_retain(
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: call void @callee()
 ; CHECK: store
-; CHECK: call void @objc_release
+; CHECK: call void @llvm.objc.release
 ; CHECK: done:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test32(i8* %p, i1 %x) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1672,25 +1672,25 @@ true:
 done:
   %g = bitcast i8* %p to i8*
   %h = getelementptr i8, i8* %g, i64 0
-  call void @objc_release(i8* %g)
+  call void @llvm.objc.release(i8* %g)
   ret void
 }
 
 ; Do consider icmps to be direct uses.
 
 ; CHECK-LABEL: define void @test33(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: true:
-; CHECK: call i8* @objc_retain(
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: call void @callee()
 ; CHECK: icmp
-; CHECK: call void @objc_release
+; CHECK: call void @llvm.objc.release
 ; CHECK: done:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test33(i8* %p, i1 %x, i8* %y) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1701,7 +1701,7 @@ true:
 done:
   %g = bitcast i8* %p to i8*
   %h = getelementptr i8, i8* %g, i64 0
-  call void @objc_release(i8* %g)
+  call void @llvm.objc.release(i8* %g)
   ret void
 }
 
@@ -1709,14 +1709,14 @@ done:
 ; releases.
 
 ; CHECK-LABEL: define void @test34a(
-; CHECK:   call i8* @objc_retain
+; CHECK:   call i8* @llvm.objc.retain
 ; CHECK: true:
 ; CHECK: done:
-; CHECK: call void @objc_release
+; CHECK: call void @llvm.objc.release
 ; CHECK: }
 define void @test34a(i8* %p, i1 %x, i8* %y) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1726,16 +1726,16 @@ true:
 done:
   %g = bitcast i8* %p to i8*
   %h = getelementptr i8, i8* %g, i64 0
-  call void @objc_release(i8* %g)
+  call void @llvm.objc.release(i8* %g)
   ret void
 }
 
 ; CHECK-LABEL: define void @test34b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test34b(i8* %p, i1 %x, i8* %y) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1745,7 +1745,7 @@ true:
 done:
   %g = bitcast i8* %p to i8*
   %h = getelementptr i8, i8* %g, i64 0
-  call void @objc_release(i8* %g), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %g), !clang.imprecise_release !0
   ret void
 }
 
@@ -1756,14 +1756,14 @@ done:
 ; Precise.
 ; CHECK-LABEL: define void @test35a(
 ; CHECK: entry:
-; CHECK:   call i8* @objc_retain
+; CHECK:   call i8* @llvm.objc.retain
 ; CHECK: true:
 ; CHECK: done:
-; CHECK:   call void @objc_release
+; CHECK:   call void @llvm.objc.release
 ; CHECK: }
 define void @test35a(i8* %p, i1 %x, i8* %y) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1773,17 +1773,17 @@ true:
 done:
   %g = bitcast i8* %p to i8*
   %h = getelementptr i8, i8* %g, i64 0
-  call void @objc_release(i8* %g)
+  call void @llvm.objc.release(i8* %g)
   ret void
 }
 
 ; Imprecise.
 ; CHECK-LABEL: define void @test35b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test35b(i8* %p, i1 %x, i8* %y) {
 entry:
-  %f0 = call i8* @objc_retain(i8* %p)
+  %f0 = call i8* @llvm.objc.retain(i8* %p)
   br i1 %x, label %true, label %done
 
 true:
@@ -1793,50 +1793,50 @@ true:
 done:
   %g = bitcast i8* %p to i8*
   %h = getelementptr i8, i8* %g, i64 0
-  call void @objc_release(i8* %g), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %g), !clang.imprecise_release !0
   ret void
 }
 
 ; Delete a retain,release if there's no actual use and we have precise release.
 
 ; CHECK-LABEL: define void @test36a(
-; CHECK: @objc_retain
+; CHECK: @llvm.objc.retain
 ; CHECK: call void @callee()
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: call void @callee()
-; CHECK: @objc_release
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test36a(i8* %p) {
 entry:
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   call void @callee()
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; Like test36, but with metadata.
 
 ; CHECK-LABEL: define void @test36b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test36b(i8* %p) {
 entry:
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   call void @callee()
-  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %p), !clang.imprecise_release !0
   ret void
 }
 
 ; Be aggressive about analyzing phis to eliminate possible uses.
 
 ; CHECK-LABEL: define void @test38(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test38(i8* %p, i1 %u, i1 %m, i8* %z, i8* %y, i8* %x, i8* %w) {
 entry:
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   br i1 %u, label %true, label %false
 true:
   br i1 %m, label %a, label %b
@@ -1859,36 +1859,36 @@ f:
 g:
   %h = phi i8* [ %j, %e ], [ %k, %f ]
   call void @use_pointer(i8* %h)
-  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %p), !clang.imprecise_release !0
   ret void
 }
 
 ; Delete retain,release pairs around loops.
 
 ; CHECK-LABEL: define void @test39(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test39(i8* %p) {
 entry:
-  %0 = call i8* @objc_retain(i8* %p)
+  %0 = call i8* @llvm.objc.retain(i8* %p)
   br label %loop
 
 loop:                                             ; preds = %loop, %entry
   br i1 undef, label %loop, label %exit
 
 exit:                                             ; preds = %loop
-  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0), !clang.imprecise_release !0
   ret void
 }
 
 ; Delete retain,release pairs around loops containing uses.
 
 ; CHECK-LABEL: define void @test39b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test39b(i8* %p) {
 entry:
-  %0 = call i8* @objc_retain(i8* %p)
+  %0 = call i8* @llvm.objc.retain(i8* %p)
   br label %loop
 
 loop:                                             ; preds = %loop, %entry
@@ -1896,18 +1896,18 @@ loop:                                             ; preds = %loop, %entry
   br i1 undef, label %loop, label %exit
 
 exit:                                             ; preds = %loop
-  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0), !clang.imprecise_release !0
   ret void
 }
 
 ; Delete retain,release pairs around loops containing potential decrements.
 
 ; CHECK-LABEL: define void @test39c(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test39c(i8* %p) {
 entry:
-  %0 = call i8* @objc_retain(i8* %p)
+  %0 = call i8* @llvm.objc.retain(i8* %p)
   br label %loop
 
 loop:                                             ; preds = %loop, %entry
@@ -1915,7 +1915,7 @@ loop:                                             ; preds = %loop, %entry
   br i1 undef, label %loop, label %exit
 
 exit:                                             ; preds = %loop
-  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0), !clang.imprecise_release !0
   ret void
 }
 
@@ -1923,11 +1923,11 @@ exit:                                             ; preds = %loop
 ; the successors are in a different order.
 
 ; CHECK-LABEL: define void @test40(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test40(i8* %p) {
 entry:
-  %0 = call i8* @objc_retain(i8* %p)
+  %0 = call i8* @llvm.objc.retain(i8* %p)
   br label %loop
 
 loop:                                             ; preds = %loop, %entry
@@ -1935,7 +1935,7 @@ loop:                                             ; preds = %loop, %entry
   br i1 undef, label %exit, label %loop
 
 exit:                                             ; preds = %loop
-  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0), !clang.imprecise_release !0
   ret void
 }
 
@@ -1944,26 +1944,26 @@ exit:                                             ; preds = %loop
 
 ; CHECK-LABEL: define void @test42(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call i8* @objc_retain(i8* %p)
-; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.autorelease(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
-; CHECK-NEXT: call void @objc_release(i8* %p)
+; CHECK-NEXT: call void @llvm.objc.release(i8* %p)
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test42(i8* %p) {
 entry:
-  call i8* @objc_retain(i8* %p)
-  call i8* @objc_autorelease(i8* %p)
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
+  call i8* @llvm.objc.autorelease(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @use_pointer(i8* %p)
   call void @use_pointer(i8* %p)
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   call void @use_pointer(i8* %p)
   call void @use_pointer(i8* %p)
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -1972,24 +1972,24 @@ entry:
 
 ; CHECK-LABEL: define void @test43(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call i8* @objc_retain(i8* %p)
-; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
-; CHECK-NEXT: call i8* @objc_retain
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.autorelease(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.retain
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
-; CHECK-NEXT: call void @objc_autoreleasePoolPop(i8* undef)
-; CHECK-NEXT: call void @objc_release
+; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(i8* undef)
+; CHECK-NEXT: call void @llvm.objc.release
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test43(i8* %p) {
 entry:
-  call i8* @objc_retain(i8* %p)
-  call i8* @objc_autorelease(i8* %p)
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
+  call i8* @llvm.objc.autorelease(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @use_pointer(i8* %p)
   call void @use_pointer(i8* %p)
-  call void @objc_autoreleasePoolPop(i8* undef)
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.autoreleasePoolPop(i8* undef)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -1998,74 +1998,74 @@ entry:
 
 ; CHECK-LABEL: define void @test43b(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call i8* @objc_retain(i8* %p)
-; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.autorelease(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
-; CHECK-NEXT: call i8* @objc_autoreleasePoolPush()
+; CHECK-NEXT: call i8* @llvm.objc.autoreleasePoolPush()
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
-; CHECK-NEXT: call void @objc_release
+; CHECK-NEXT: call void @llvm.objc.release
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test43b(i8* %p) {
 entry:
-  call i8* @objc_retain(i8* %p)
-  call i8* @objc_autorelease(i8* %p)
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
+  call i8* @llvm.objc.autorelease(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @use_pointer(i8* %p)
   call void @use_pointer(i8* %p)
-  call i8* @objc_autoreleasePoolPush()
-  call void @objc_release(i8* %p)
+  call i8* @llvm.objc.autoreleasePoolPush()
+  call void @llvm.objc.release(i8* %p)
   call void @use_pointer(i8* %p)
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; Do retain+release elimination for non-provenance pointers.
 
 ; CHECK-LABEL: define void @test44(
-; CHECK-NOT: objc_
+; CHECK-NOT: llvm.objc.
 ; CHECK: }
 define void @test44(i8** %pp) {
   %p = load i8*, i8** %pp
-  %q = call i8* @objc_retain(i8* %p)
-  call void @objc_release(i8* %q)
+  %q = call i8* @llvm.objc.retain(i8* %p)
+  call void @llvm.objc.release(i8* %q)
   ret void
 }
 
 ; Don't delete retain+release with an unknown-provenance
-; may-alias objc_release between them.
+; may-alias llvm.objc.release between them.
 
 ; CHECK-LABEL: define void @test45(
-; CHECK: call i8* @objc_retain(i8* %p)
-; CHECK: call void @objc_release(i8* %q)
+; CHECK: call i8* @llvm.objc.retain(i8* %p)
+; CHECK: call void @llvm.objc.release(i8* %q)
 ; CHECK: call void @use_pointer(i8* %p)
-; CHECK: call void @objc_release(i8* %p)
+; CHECK: call void @llvm.objc.release(i8* %p)
 ; CHECK: }
 define void @test45(i8** %pp, i8** %qq) {
   %p = load i8*, i8** %pp
   %q = load i8*, i8** %qq
-  call i8* @objc_retain(i8* %p)
-  call void @objc_release(i8* %q)
+  call i8* @llvm.objc.retain(i8* %p)
+  call void @llvm.objc.release(i8* %q)
   call void @use_pointer(i8* %p)
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; Don't delete retain and autorelease here.
 
 ; CHECK-LABEL: define void @test46(
-; CHECK: tail call i8* @objc_retain(i8* %p) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retain(i8* %p) [[NUW]]
 ; CHECK: true:
-; CHECK: call i8* @objc_autorelease(i8* %p) [[NUW]]
+; CHECK: call i8* @llvm.objc.autorelease(i8* %p) [[NUW]]
 ; CHECK: }
 define void @test46(i8* %p, i1 %a) {
 entry:
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   br i1 %a, label %true, label %false
 
 true:
-  call i8* @objc_autorelease(i8* %p)
+  call i8* @llvm.objc.autorelease(i8* %p)
   call void @use_pointer(i8* %p)
   ret void
 
@@ -2080,7 +2080,7 @@ false:
 ; CHECK: ret i8* %p
 ; CHECK: }
 define i8* @test47(i8* %p) nounwind {
-  %x = call i8* @objc_retainedObject(i8* %p)
+  %x = call i8* @llvm.objc.retainedObject(i8* %p)
   ret i8* %x
 }
 
@@ -2091,7 +2091,7 @@ define i8* @test47(i8* %p) nounwind {
 ; CHECK: ret i8* %p
 ; CHECK: }
 define i8* @test48(i8* %p) nounwind {
-  %x = call i8* @objc_unretainedObject(i8* %p)
+  %x = call i8* @llvm.objc.unretainedObject(i8* %p)
   ret i8* %x
 }
 
@@ -2102,36 +2102,36 @@ define i8* @test48(i8* %p) nounwind {
 ; CHECK: ret i8* %p
 ; CHECK: }
 define i8* @test49(i8* %p) nounwind {
-  %x = call i8* @objc_unretainedPointer(i8* %p)
+  %x = call i8* @llvm.objc.unretainedPointer(i8* %p)
   ret i8* %x
 }
 
 ; Do delete retain+release with intervening stores of the address value if we
-; have imprecise release attached to objc_release.
+; have imprecise release attached to llvm.objc.release.
 
 ; CHECK-LABEL:      define void @test50a(
-; CHECK-NEXT:   call i8* @objc_retain
+; CHECK-NEXT:   call i8* @llvm.objc.retain
 ; CHECK-NEXT:   call void @callee
 ; CHECK-NEXT:   store
-; CHECK-NEXT:   call void @objc_release
+; CHECK-NEXT:   call void @llvm.objc.release
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test50a(i8* %p, i8** %pp) {
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   store i8* %p, i8** %pp
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; CHECK-LABEL: define void @test50b(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test50b(i8* %p, i8** %pp) {
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   store i8* %p, i8** %pp
-  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %p), !clang.imprecise_release !0
   ret void
 }
 
@@ -2140,28 +2140,28 @@ define void @test50b(i8* %p, i8** %pp) {
 ; address value.
 
 ; CHECK-LABEL: define void @test51a(
-; CHECK: call i8* @objc_retain(i8* %p)
-; CHECK: call void @objc_release(i8* %p)
+; CHECK: call i8* @llvm.objc.retain(i8* %p)
+; CHECK: call void @llvm.objc.release(i8* %p)
 ; CHECK: ret void
 ; CHECK: }
 define void @test51a(i8* %p) {
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   store i8 0, i8* %p
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; CHECK-LABEL: define void @test51b(
-; CHECK: call i8* @objc_retain(i8* %p)
-; CHECK: call void @objc_release(i8* %p)
+; CHECK: call i8* @llvm.objc.retain(i8* %p)
+; CHECK: call void @llvm.objc.release(i8* %p)
 ; CHECK: ret void
 ; CHECK: }
 define void @test51b(i8* %p) {
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   store i8 0, i8* %p
-  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %p), !clang.imprecise_release !0
   ret void
 }
 
@@ -2169,36 +2169,36 @@ define void @test51b(i8* %p) {
 ; unknown provenance.
 
 ; CHECK-LABEL: define void @test52a(
-; CHECK: call i8* @objc_retain
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: call void @callee()
 ; CHECK: call void @use_pointer(i8* %z)
-; CHECK: call void @objc_release
+; CHECK: call void @llvm.objc.release
 ; CHECK: ret void
 ; CHECK: }
 define void @test52a(i8** %zz, i8** %pp) {
   %p = load i8*, i8** %pp
-  %1 = call i8* @objc_retain(i8* %p)
+  %1 = call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   %z = load i8*, i8** %zz
   call void @use_pointer(i8* %z)
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
 ; CHECK-LABEL: define void @test52b(
-; CHECK: call i8* @objc_retain
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: call void @callee()
 ; CHECK: call void @use_pointer(i8* %z)
-; CHECK: call void @objc_release
+; CHECK: call void @llvm.objc.release
 ; CHECK: ret void
 ; CHECK: }
 define void @test52b(i8** %zz, i8** %pp) {
   %p = load i8*, i8** %pp
-  %1 = call i8* @objc_retain(i8* %p)
+  %1 = call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   %z = load i8*, i8** %zz
   call void @use_pointer(i8* %z)
-  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %p), !clang.imprecise_release !0
   ret void
 }
 
@@ -2208,15 +2208,15 @@ define void @test52b(i8** %zz, i8** %pp) {
 ; See rdar://10551239.
 
 ; CHECK-LABEL: define void @test53(
-; CHECK: @objc_
+; CHECK: @llvm.objc.
 ; CHECK: }
 define void @test53(void ()** %zz, i8** %pp) {
   %p = load i8*, i8** %pp
-  %1 = call i8* @objc_retain(i8* %p)
+  %1 = call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   %z = load void ()*, void ()** %zz
   call void @callee_fnptr(void ()* %z)
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -2224,12 +2224,12 @@ define void @test53(void ()** %zz, i8** %pp) {
 
 ; CHECK-LABEL: define void @test54(
 ; CHECK: call i8* @returner()
-; CHECK-NEXT: call void @objc_release(i8* %t) [[NUW]], !clang.imprecise_release ![[RELEASE]]
+; CHECK-NEXT: call void @llvm.objc.release(i8* %t) [[NUW]], !clang.imprecise_release ![[RELEASE]]
 ; CHECK-NEXT: ret void
 ; CHECK: }
 define void @test54() {
   %t = call i8* @returner()
-  call i8* @objc_autorelease(i8* %t)
+  call i8* @llvm.objc.autorelease(i8* %t)
   ret void
 }
 
@@ -2240,10 +2240,10 @@ define void @test54() {
 ; CHECK: }
 define void @test55(i8* %x) { 
 entry: 
-  %0 = call i8* @objc_retain(i8* %x) nounwind 
-  %1 = call i8* @objc_retain(i8* %x) nounwind 
-  call void @objc_release(i8* %x) nounwind 
-  call void @objc_release(i8* %x) nounwind 
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind 
+  %1 = call i8* @llvm.objc.retain(i8* %x) nounwind 
+  call void @llvm.objc.release(i8* %x) nounwind 
+  call void @llvm.objc.release(i8* %x) nounwind 
   ret void 
 }
 
@@ -2255,30 +2255,30 @@ entry:
 ; CHECK-LABEL: define void @test56(
 ; CHECK-NOT: @objc
 ; CHECK: if.then:
-; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) [[NUW]]
+; CHECK-NEXT: %0 = tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
 ; CHECK-NEXT: tail call void @use_pointer(i8* %x)
 ; CHECK-NEXT: tail call void @use_pointer(i8* %x)
-; CHECK-NEXT: tail call void @objc_release(i8* %x) [[NUW]], !clang.imprecise_release ![[RELEASE]]
+; CHECK-NEXT: tail call void @llvm.objc.release(i8* %x) [[NUW]], !clang.imprecise_release ![[RELEASE]]
 ; CHECK-NEXT: br label %if.end
 ; CHECK-NOT: @objc
 ; CHECK: }
 define void @test56(i8* %x, i32 %n) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %x) nounwind
-  %1 = tail call i8* @objc_retain(i8* %0) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %x) nounwind
+  %1 = tail call i8* @llvm.objc.retain(i8* %0) nounwind
   %tobool = icmp eq i32 %n, 0
   br i1 %tobool, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
-  %2 = tail call i8* @objc_retain(i8* %1) nounwind
+  %2 = tail call i8* @llvm.objc.retain(i8* %1) nounwind
   tail call void @use_pointer(i8* %2)
   tail call void @use_pointer(i8* %2)
-  tail call void @objc_release(i8* %2) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %2) nounwind, !clang.imprecise_release !0
   br label %if.end
 
 if.end:                                           ; preds = %entry, %if.then
-  tail call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
-  tail call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %1) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -2288,26 +2288,26 @@ if.end:                                           ; preds = %entry, %if.then
 
 ; CHECK-LABEL:      define void @test57(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   tail call i8* @objc_retain(i8* %x) [[NUW]]
+; CHECK-NEXT:   tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
-; CHECK-NEXT:   tail call i8* @objc_retain(i8* %x) [[NUW]]
+; CHECK-NEXT:   tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
-; CHECK-NEXT:   call void @objc_release(i8* %x) [[NUW]]
+; CHECK-NEXT:   call void @llvm.objc.release(i8* %x) [[NUW]]
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test57(i8* %x) nounwind {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
-  call i8* @objc_retain(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
@@ -2316,20 +2316,20 @@ entry:
 
 ; CHECK-LABEL:      define void @test58(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   @objc_retain
+; CHECK-NEXT:   @llvm.objc.retain
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test58(i8* %x) nounwind {
 entry:
-  call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
-  call i8* @objc_retain(i8* %x) nounwind
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
+  call i8* @llvm.objc.retain(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
@@ -2337,20 +2337,20 @@ entry:
 
 ; CHECK-LABEL:      define void @test59(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %x) [[NUW]]
+; CHECK-NEXT:   %0 = tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
-; CHECK-NEXT:   call void @objc_release(i8* %x) [[NUW]]
+; CHECK-NEXT:   call void @llvm.objc.release(i8* %x) [[NUW]]
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test59(i8* %x) nounwind {
 entry:
-  %a = call i8* @objc_retain(i8* %x) nounwind
-  call void @objc_release(i8* %x) nounwind
-  %b = call i8* @objc_retain(i8* %x) nounwind
+  %a = call i8* @llvm.objc.retain(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
+  %b = call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   call void @use_pointer(i8* %x)
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
@@ -2363,71 +2363,71 @@ entry:
 ; @something is not constant.
 
 ; CHECK-LABEL: define void @test60a(
-; CHECK: call i8* @objc_retain
-; CHECK: call void @objc_release
+; CHECK: call i8* @llvm.objc.retain
+; CHECK: call void @llvm.objc.release
 ; CHECK: }
 define void @test60a() {
   %t = load i8*, i8** @constptr
   %s = load i8*, i8** @something
-  call i8* @objc_retain(i8* %s)
+  call i8* @llvm.objc.retain(i8* %s)
   call void @callee()
   call void @use_pointer(i8* %t)
-  call void @objc_release(i8* %s)
+  call void @llvm.objc.release(i8* %s)
   ret void
 }
 
 ; CHECK-LABEL: define void @test60b(
-; CHECK: call i8* @objc_retain
-; CHECK-NOT: call i8* @objc_retain
-; CHECK-NOT: call i8* @objc_release
+; CHECK: call i8* @llvm.objc.retain
+; CHECK-NOT: call i8* @llvm.objc.retain
+; CHECK-NOT: call i8* @llvm.objc.release
 ; CHECK: }
 define void @test60b() {
   %t = load i8*, i8** @constptr
   %s = load i8*, i8** @something
-  call i8* @objc_retain(i8* %t)
-  call i8* @objc_retain(i8* %t)
+  call i8* @llvm.objc.retain(i8* %t)
+  call i8* @llvm.objc.retain(i8* %t)
   call void @callee()
   call void @use_pointer(i8* %s)
-  call void @objc_release(i8* %t)
+  call void @llvm.objc.release(i8* %t)
   ret void
 }
 
 ; CHECK-LABEL: define void @test60c(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test60c() {
   %t = load i8*, i8** @constptr
   %s = load i8*, i8** @something
-  call i8* @objc_retain(i8* %t)
+  call i8* @llvm.objc.retain(i8* %t)
   call void @callee()
   call void @use_pointer(i8* %s)
-  call void @objc_release(i8* %t), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %t), !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK-LABEL: define void @test60d(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test60d() {
   %t = load i8*, i8** @constptr
   %s = load i8*, i8** @something
-  call i8* @objc_retain(i8* %t)
+  call i8* @llvm.objc.retain(i8* %t)
   call void @callee()
   call void @use_pointer(i8* %s)
-  call void @objc_release(i8* %t)
+  call void @llvm.objc.release(i8* %t)
   ret void
 }
 
 ; CHECK-LABEL: define void @test60e(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test60e() {
   %t = load i8*, i8** @constptr
   %s = load i8*, i8** @something
-  call i8* @objc_retain(i8* %t)
+  call i8* @llvm.objc.retain(i8* %t)
   call void @callee()
   call void @use_pointer(i8* %s)
-  call void @objc_release(i8* %t), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %t), !clang.imprecise_release !0
   ret void
 }
 
@@ -2435,14 +2435,14 @@ define void @test60e() {
 ; pointers.
 
 ; CHECK-LABEL: define void @test61(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test61() {
   %t = load i8*, i8** @constptr
-  call i8* @objc_retain(i8* %t)
+  call i8* @llvm.objc.retain(i8* %t)
   call void @callee()
   call void @use_pointer(i8* %t)
-  call void @objc_release(i8* %t)
+  call void @llvm.objc.release(i8* %t)
   ret void
 }
 
@@ -2450,23 +2450,23 @@ define void @test61() {
 ; other is outside the loop.
 
 ; CHECK-LABEL: define void @test62(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test62(i8* %x, i1* %p) nounwind {
 entry:
   br label %loop
 
 loop:
-  call i8* @objc_retain(i8* %x)
+  call i8* @llvm.objc.retain(i8* %x)
   %q = load i1, i1* %p
   br i1 %q, label %loop.more, label %exit
 
 loop.more:
-  call void @objc_release(i8* %x)
+  call void @llvm.objc.release(i8* %x)
   br label %loop
 
 exit:
-  call void @objc_release(i8* %x)
+  call void @llvm.objc.release(i8* %x)
   ret void
 }
 
@@ -2475,21 +2475,21 @@ exit:
 
 ; CHECK-LABEL: define void @test63(
 ; CHECK: loop:
-; CHECK:   tail call i8* @objc_retain(i8* %x)
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %x)
 ; CHECK: loop.more:
-; CHECK:   call void @objc_release(i8* %x)
+; CHECK:   call void @llvm.objc.release(i8* %x)
 ; CHECK: }
 define void @test63(i8* %x, i1* %p) nounwind {
 entry:
   br label %loop
 
 loop:
-  call i8* @objc_retain(i8* %x)
+  call i8* @llvm.objc.retain(i8* %x)
   %q = load i1, i1* %p
   br i1 %q, label %loop.more, label %exit
 
 loop.more:
-  call void @objc_release(i8* %x)
+  call void @llvm.objc.release(i8* %x)
   br label %loop
 
 exit:
@@ -2501,16 +2501,16 @@ exit:
 
 ; CHECK-LABEL: define void @test64(
 ; CHECK: loop:
-; CHECK:   tail call i8* @objc_retain(i8* %x)
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %x)
 ; CHECK: exit:
-; CHECK:   call void @objc_release(i8* %x)
+; CHECK:   call void @llvm.objc.release(i8* %x)
 ; CHECK: }
 define void @test64(i8* %x, i1* %p) nounwind {
 entry:
   br label %loop
 
 loop:
-  call i8* @objc_retain(i8* %x)
+  call i8* @llvm.objc.retain(i8* %x)
   %q = load i1, i1* %p
   br i1 %q, label %loop.more, label %exit
 
@@ -2518,7 +2518,7 @@ loop.more:
   br label %loop
 
 exit:
-  call void @objc_release(i8* %x)
+  call void @llvm.objc.release(i8* %x)
   ret void
 }
 
@@ -2526,9 +2526,9 @@ exit:
 
 ; CHECK-LABEL: define i8* @test65(
 ; CHECK: if.then:
-; CHECK:   call i8* @objc_autorelease(
+; CHECK:   call i8* @llvm.objc.autorelease(
 ; CHECK: return:
-; CHECK-NOT: @objc_autorelease
+; CHECK-NOT: @llvm.objc.autorelease
 ; CHECK: }
 define i8* @test65(i1 %x) {
 entry:
@@ -2536,12 +2536,12 @@ entry:
 
 if.then:                                          ; preds = %entry
   %c = call i8* @returner()
-  %s = call i8* @objc_retainAutoreleasedReturnValue(i8* %c) nounwind
+  %s = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %c) nounwind
   br label %return
 
 return:                                           ; preds = %if.then, %entry
   %retval = phi i8* [ %s, %if.then ], [ null, %entry ]
-  %q = call i8* @objc_autorelease(i8* %retval) nounwind
+  %q = call i8* @llvm.objc.autorelease(i8* %retval) nounwind
   ret i8* %retval
 }
 
@@ -2549,24 +2549,24 @@ return:                                           ; preds = %if.then, %entry
 
 ; CHECK-LABEL: define i8* @test65b(
 ; CHECK: if.then:
-; CHECK-NOT: @objc_autorelease
+; CHECK-NOT: @llvm.objc.autorelease
 ; CHECK: return:
-; CHECK:   call i8* @objc_autorelease(
+; CHECK:   call i8* @llvm.objc.autorelease(
 ; CHECK: }
 define i8* @test65b(i1 %x) {
 entry:
-  %t = call i8* @objc_autoreleasePoolPush()
+  %t = call i8* @llvm.objc.autoreleasePoolPush()
   br i1 %x, label %return, label %if.then
 
 if.then:                                          ; preds = %entry
   %c = call i8* @returner()
-  %s = call i8* @objc_retainAutoreleasedReturnValue(i8* %c) nounwind
+  %s = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %c) nounwind
   br label %return
 
 return:                                           ; preds = %if.then, %entry
   %retval = phi i8* [ %s, %if.then ], [ null, %entry ]
-  call void @objc_autoreleasePoolPop(i8* %t)
-  %q = call i8* @objc_autorelease(i8* %retval) nounwind
+  call void @llvm.objc.autoreleasePoolPop(i8* %t)
+  %q = call i8* @llvm.objc.autorelease(i8* %retval) nounwind
   ret i8* %retval
 }
 
@@ -2575,9 +2575,9 @@ return:                                           ; preds = %if.then, %entry
 
 ; CHECK-LABEL: define i8* @test65c(
 ; CHECK: if.then:
-; CHECK-NOT: @objc_autorelease
+; CHECK-NOT: @llvm.objc.autorelease
 ; CHECK: return:
-; CHECK:   call i8* @objc_autoreleaseReturnValue(
+; CHECK:   call i8* @llvm.objc.autoreleaseReturnValue(
 ; CHECK: }
 define i8* @test65c(i1 %x) {
 entry:
@@ -2585,20 +2585,20 @@ entry:
 
 if.then:                                          ; preds = %entry
   %c = call i8* @returner()
-  %s = call i8* @objc_retainAutoreleasedReturnValue(i8* %c) nounwind
+  %s = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %c) nounwind
   br label %return
 
 return:                                           ; preds = %if.then, %entry
   %retval = phi i8* [ %s, %if.then ], [ null, %entry ]
-  %q = call i8* @objc_autoreleaseReturnValue(i8* %retval) nounwind
+  %q = call i8* @llvm.objc.autoreleaseReturnValue(i8* %retval) nounwind
   ret i8* %retval
 }
 
 ; CHECK-LABEL: define i8* @test65d(
 ; CHECK: if.then:
-; CHECK-NOT: @objc_autorelease
+; CHECK-NOT: @llvm.objc.autorelease
 ; CHECK: return:
-; CHECK:   call i8* @objc_autoreleaseReturnValue(
+; CHECK:   call i8* @llvm.objc.autoreleaseReturnValue(
 ; CHECK: }
 define i8* @test65d(i1 %x) {
 entry:
@@ -2606,23 +2606,23 @@ entry:
 
 if.then:                                          ; preds = %entry
   %c = call i8* @returner()
-  %s = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %c) nounwind
+  %s = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %c) nounwind
   br label %return
 
 return:                                           ; preds = %if.then, %entry
   %retval = phi i8* [ %s, %if.then ], [ null, %entry ]
-  %q = call i8* @objc_autoreleaseReturnValue(i8* %retval) nounwind
+  %q = call i8* @llvm.objc.autoreleaseReturnValue(i8* %retval) nounwind
   ret i8* %retval
 }
 
-; An objc_retain can serve as a may-use for a different pointer.
+; An llvm.objc.retain can serve as a may-use for a different pointer.
 ; rdar://11931823
 
 ; CHECK-LABEL: define void @test66a(
-; CHECK:   tail call i8* @objc_retain(i8* %cond) [[NUW]]
-; CHECK:   tail call void @objc_release(i8* %call) [[NUW]]
-; CHECK:   tail call i8* @objc_retain(i8* %tmp8) [[NUW]]
-; CHECK:   tail call void @objc_release(i8* %cond) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %cond) [[NUW]]
+; CHECK:   tail call void @llvm.objc.release(i8* %call) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %tmp8) [[NUW]]
+; CHECK:   tail call void @llvm.objc.release(i8* %cond) [[NUW]]
 ; CHECK: }
 define void @test66a(i8* %tmp5, i8* %bar, i1 %tobool, i1 %tobool1, i8* %call) {
 entry:
@@ -2633,19 +2633,19 @@ cond.true:
 
 cond.end:                                         ; preds = %cond.true, %entry
   %cond = phi i8* [ %tmp5, %cond.true ], [ %call, %entry ]
-  %tmp7 = tail call i8* @objc_retain(i8* %cond) nounwind
-  tail call void @objc_release(i8* %call) nounwind
+  %tmp7 = tail call i8* @llvm.objc.retain(i8* %cond) nounwind
+  tail call void @llvm.objc.release(i8* %call) nounwind
   %tmp8 = select i1 %tobool1, i8* %cond, i8* %bar
-  %tmp9 = tail call i8* @objc_retain(i8* %tmp8) nounwind
-  tail call void @objc_release(i8* %cond) nounwind
+  %tmp9 = tail call i8* @llvm.objc.retain(i8* %tmp8) nounwind
+  tail call void @llvm.objc.release(i8* %cond) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test66b(
-; CHECK:   tail call i8* @objc_retain(i8* %cond) [[NUW]]
-; CHECK:   tail call void @objc_release(i8* %call) [[NUW]]
-; CHECK:   tail call i8* @objc_retain(i8* %tmp8) [[NUW]]
-; CHECK:   tail call void @objc_release(i8* %cond) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %cond) [[NUW]]
+; CHECK:   tail call void @llvm.objc.release(i8* %call) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %tmp8) [[NUW]]
+; CHECK:   tail call void @llvm.objc.release(i8* %cond) [[NUW]]
 ; CHECK: }
 define void @test66b(i8* %tmp5, i8* %bar, i1 %tobool, i1 %tobool1, i8* %call) {
 entry:
@@ -2656,19 +2656,19 @@ cond.true:
 
 cond.end:                                         ; preds = %cond.true, %entry
   %cond = phi i8* [ %tmp5, %cond.true ], [ %call, %entry ]
-  %tmp7 = tail call i8* @objc_retain(i8* %cond) nounwind
-  tail call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+  %tmp7 = tail call i8* @llvm.objc.retain(i8* %cond) nounwind
+  tail call void @llvm.objc.release(i8* %call) nounwind, !clang.imprecise_release !0
   %tmp8 = select i1 %tobool1, i8* %cond, i8* %bar
-  %tmp9 = tail call i8* @objc_retain(i8* %tmp8) nounwind
-  tail call void @objc_release(i8* %cond) nounwind
+  %tmp9 = tail call i8* @llvm.objc.retain(i8* %tmp8) nounwind
+  tail call void @llvm.objc.release(i8* %cond) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test66c(
-; CHECK:   tail call i8* @objc_retain(i8* %cond) [[NUW]]
-; CHECK:   tail call void @objc_release(i8* %call) [[NUW]]
-; CHECK:   tail call i8* @objc_retain(i8* %tmp8) [[NUW]]
-; CHECK:   tail call void @objc_release(i8* %cond) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %cond) [[NUW]]
+; CHECK:   tail call void @llvm.objc.release(i8* %call) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %tmp8) [[NUW]]
+; CHECK:   tail call void @llvm.objc.release(i8* %cond) [[NUW]]
 ; CHECK: }
 define void @test66c(i8* %tmp5, i8* %bar, i1 %tobool, i1 %tobool1, i8* %call) {
 entry:
@@ -2679,19 +2679,19 @@ cond.true:
 
 cond.end:                                         ; preds = %cond.true, %entry
   %cond = phi i8* [ %tmp5, %cond.true ], [ %call, %entry ]
-  %tmp7 = tail call i8* @objc_retain(i8* %cond) nounwind
-  tail call void @objc_release(i8* %call) nounwind
+  %tmp7 = tail call i8* @llvm.objc.retain(i8* %cond) nounwind
+  tail call void @llvm.objc.release(i8* %call) nounwind
   %tmp8 = select i1 %tobool1, i8* %cond, i8* %bar
-  %tmp9 = tail call i8* @objc_retain(i8* %tmp8) nounwind, !clang.imprecise_release !0
-  tail call void @objc_release(i8* %cond) nounwind
+  %tmp9 = tail call i8* @llvm.objc.retain(i8* %tmp8) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %cond) nounwind
   ret void
 }
 
 ; CHECK-LABEL: define void @test66d(
-; CHECK:   tail call i8* @objc_retain(i8* %cond) [[NUW]]
-; CHECK:   tail call void @objc_release(i8* %call) [[NUW]]
-; CHECK:   tail call i8* @objc_retain(i8* %tmp8) [[NUW]]
-; CHECK:   tail call void @objc_release(i8* %cond) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %cond) [[NUW]]
+; CHECK:   tail call void @llvm.objc.release(i8* %call) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %tmp8) [[NUW]]
+; CHECK:   tail call void @llvm.objc.release(i8* %cond) [[NUW]]
 ; CHECK: }
 define void @test66d(i8* %tmp5, i8* %bar, i1 %tobool, i1 %tobool1, i8* %call) {
 entry:
@@ -2702,11 +2702,11 @@ cond.true:
 
 cond.end:                                         ; preds = %cond.true, %entry
   %cond = phi i8* [ %tmp5, %cond.true ], [ %call, %entry ]
-  %tmp7 = tail call i8* @objc_retain(i8* %cond) nounwind
-  tail call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+  %tmp7 = tail call i8* @llvm.objc.retain(i8* %cond) nounwind
+  tail call void @llvm.objc.release(i8* %call) nounwind, !clang.imprecise_release !0
   %tmp8 = select i1 %tobool1, i8* %cond, i8* %bar
-  %tmp9 = tail call i8* @objc_retain(i8* %tmp8) nounwind
-  tail call void @objc_release(i8* %cond) nounwind, !clang.imprecise_release !0
+  %tmp9 = tail call i8* @llvm.objc.retain(i8* %tmp8) nounwind
+  tail call void @llvm.objc.release(i8* %cond) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -2719,13 +2719,13 @@ declare i32 @puts(i8* nocapture) nounwind
 @str = internal constant [16 x i8] c"-[ Top0 _getX ]\00"
 
 ; CHECK: define { <2 x float>, <2 x float> } @"\01-[A z]"({}* %self, i8* nocapture %_cmd) [[NUW]] {
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 
 define {<2 x float>, <2 x float>} @"\01-[A z]"({}* %self, i8* nocapture %_cmd) nounwind {
 invoke.cont:
   %0 = bitcast {}* %self to i8*
-  %1 = tail call i8* @objc_retain(i8* %0) nounwind
+  %1 = tail call i8* @llvm.objc.retain(i8* %0) nounwind
   tail call void @llvm.dbg.value(metadata {}* %self, metadata !DILocalVariable(scope: !2), metadata !DIExpression()), !dbg !DILocation(scope: !2)
   tail call void @llvm.dbg.value(metadata {}* %self, metadata !DILocalVariable(scope: !2), metadata !DIExpression()), !dbg !DILocation(scope: !2)
   %ivar = load i64, i64* @"OBJC_IVAR_$_A.myZ", align 8
@@ -2753,7 +2753,7 @@ invoke.cont:
   %add.ptr24 = getelementptr i8, i8* %0, i64 %ivar23
   %4 = bitcast i8* %add.ptr24 to i128*
   %srcval = load i128, i128* %4, align 4
-  tail call void @objc_release(i8* %0) nounwind
+  tail call void @llvm.objc.release(i8* %0) nounwind
   %tmp29 = trunc i128 %srcval to i64
   %tmp30 = bitcast i64 %tmp29 to <2 x float>
   %tmp31 = insertvalue {<2 x float>, <2 x float>} undef, <2 x float> %tmp30, 0
@@ -2765,15 +2765,15 @@ invoke.cont:
 }
 
 ; CHECK: @"\01-[Top0 _getX]"({}* %self, i8* nocapture %_cmd) [[NUW]] {
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 
 define i32 @"\01-[Top0 _getX]"({}* %self, i8* nocapture %_cmd) nounwind {
 invoke.cont:
   %0 = bitcast {}* %self to i8*
-  %1 = tail call i8* @objc_retain(i8* %0) nounwind
+  %1 = tail call i8* @llvm.objc.retain(i8* %0) nounwind
   %puts = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @str, i64 0, i64 0))
-  tail call void @objc_release(i8* %0) nounwind
+  tail call void @llvm.objc.release(i8* %0) nounwind
   ret i32 0
 }
 
@@ -2785,36 +2785,36 @@ invoke.cont:
 
 ; CHECK: define void @loop(i8* %x, i64 %n) {
 ; CHECK: for.body:
-; CHECK-NOT: @objc_
-; CHECK: @objc_msgSend
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
+; CHECK: @llvm.objc.msgSend
+; CHECK-NOT: @llvm.objc.
 ; CHECK: for.end:
 ; CHECK: }
 define void @loop(i8* %x, i64 %n) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %x) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %x) nounwind
   %cmp9 = icmp sgt i64 %n, 0
   br i1 %cmp9, label %for.body, label %for.end
 
 for.body:                                         ; preds = %entry, %for.body
   %i.010 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
-  %1 = tail call i8* @objc_retain(i8* %x) nounwind
+  %1 = tail call i8* @llvm.objc.retain(i8* %x) nounwind
   %tmp5 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call = tail call i8* (i8*, i8*, ...) @objc_msgSend(i8* %1, i8* %tmp5)
-  tail call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
+  %call = tail call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %1, i8* %tmp5)
+  tail call void @llvm.objc.release(i8* %1) nounwind, !clang.imprecise_release !0
   %inc = add nsw i64 %i.010, 1
   %exitcond = icmp eq i64 %inc, %n
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
-  tail call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; ObjCARCOpt can delete the retain,release on self.
 
 ; CHECK: define void @TextEditTest(%2* %self, %3* %pboard) {
-; CHECK-NOT: call i8* @objc_retain(i8* %tmp7)
+; CHECK-NOT: call i8* @llvm.objc.retain(i8* %tmp7)
 ; CHECK: }
 
 %0 = type { i8* (i8*, %struct._message_ref_t*, ...)*, i8* }
@@ -2873,34 +2873,34 @@ define void @TextEditTest(%2* %self, %3* %pboard) {
 entry:
   %err = alloca %4*, align 8
   %tmp7 = bitcast %2* %self to i8*
-  %tmp8 = call i8* @objc_retain(i8* %tmp7) nounwind
+  %tmp8 = call i8* @llvm.objc.retain(i8* %tmp7) nounwind
   store %4* null, %4** %err, align 8
   %tmp1 = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_17", align 8
   %tmp2 = load %struct.__CFString*, %struct.__CFString** @kUTTypePlainText, align 8
   %tmp3 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_19", align 8
   %tmp4 = bitcast %struct._class_t* %tmp1 to i8*
-  %call5 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp4, i8* %tmp3, %struct.__CFString* %tmp2)
+  %call5 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp4, i8* %tmp3, %struct.__CFString* %tmp2)
   %tmp5 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_21", align 8
   %tmp6 = bitcast %3* %pboard to i8*
-  %call76 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp6, i8* %tmp5, i8* %call5)
-  %tmp9 = call i8* @objc_retain(i8* %call76) nounwind
+  %call76 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp6, i8* %tmp5, i8* %call5)
+  %tmp9 = call i8* @llvm.objc.retain(i8* %call76) nounwind
   %tobool = icmp eq i8* %tmp9, null
   br i1 %tobool, label %end, label %land.lhs.true
 
 land.lhs.true:                                    ; preds = %entry
   %tmp11 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_23", align 8
-  %call137 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp6, i8* %tmp11, i8* %tmp9)
+  %call137 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp6, i8* %tmp11, i8* %tmp9)
   %tmp = bitcast i8* %call137 to %1*
-  %tmp10 = call i8* @objc_retain(i8* %call137) nounwind
-  call void @objc_release(i8* null) nounwind
-  %tmp12 = call i8* @objc_retain(i8* %call137) nounwind
-  call void @objc_release(i8* null) nounwind
+  %tmp10 = call i8* @llvm.objc.retain(i8* %call137) nounwind
+  call void @llvm.objc.release(i8* null) nounwind
+  %tmp12 = call i8* @llvm.objc.retain(i8* %call137) nounwind
+  call void @llvm.objc.release(i8* null) nounwind
   %tobool16 = icmp eq i8* %call137, null
   br i1 %tobool16, label %end, label %if.then
 
 if.then:                                          ; preds = %land.lhs.true
   %tmp19 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_25", align 8
-  %call21 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* %call137, i8* %tmp19)
+  %call21 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*)*)(i8* %call137, i8* %tmp19)
   %tobool22 = icmp eq i8 %call21, 0
   br i1 %tobool22, label %if.then44, label %land.lhs.true23
 
@@ -2908,10 +2908,10 @@ land.lhs.true23:                                  ; preds = %if.then
   %tmp24 = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_26", align 8
   %tmp26 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_28", align 8
   %tmp27 = bitcast %struct._class_t* %tmp24 to i8*
-  %call2822 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp27, i8* %tmp26, i8* %call137)
+  %call2822 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp27, i8* %tmp26, i8* %call137)
   %tmp13 = bitcast i8* %call2822 to %5*
-  %tmp14 = call i8* @objc_retain(i8* %call2822) nounwind
-  call void @objc_release(i8* null) nounwind
+  %tmp14 = call i8* @llvm.objc.retain(i8* %call2822) nounwind
+  call void @llvm.objc.release(i8* null) nounwind
   %tobool30 = icmp eq i8* %call2822, null
   br i1 %tobool30, label %if.then44, label %if.end
 
@@ -2919,38 +2919,38 @@ if.end:                                           ; preds = %land.lhs.true23
   %tmp32 = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_29", align 8
   %tmp33 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_31", align 8
   %tmp34 = bitcast %struct._class_t* %tmp32 to i8*
-  %call35 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp34, i8* %tmp33)
+  %call35 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp34, i8* %tmp33)
   %tmp37 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_33", align 8
-  %call3923 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %call35, i8* %tmp37, i8* %call2822, i32 signext 1, %4** %err)
+  %call3923 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %call35, i8* %tmp37, i8* %call2822, i32 signext 1, %4** %err)
   %cmp = icmp eq i8* %call3923, null
   br i1 %cmp, label %if.then44, label %end
 
 if.then44:                                        ; preds = %if.end, %land.lhs.true23, %if.then
   %url.025 = phi %5* [ %tmp13, %if.end ], [ %tmp13, %land.lhs.true23 ], [ null, %if.then ]
   %tmp49 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_35", align 8
-  %call51 = call %struct._NSRange bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %struct._NSRange (i8*, i8*, i64, i64)*)(i8* %call137, i8* %tmp49, i64 0, i64 0)
+  %call51 = call %struct._NSRange bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %struct._NSRange (i8*, i8*, i64, i64)*)(i8* %call137, i8* %tmp49, i64 0, i64 0)
   %call513 = extractvalue %struct._NSRange %call51, 0
   %call514 = extractvalue %struct._NSRange %call51, 1
   %tmp52 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_37", align 8
-  %call548 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %call137, i8* %tmp52, i64 %call513, i64 %call514)
+  %call548 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %call137, i8* %tmp52, i64 %call513, i64 %call514)
   %tmp55 = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_38", align 8
   %tmp56 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_40", align 8
   %tmp57 = bitcast %struct._class_t* %tmp55 to i8*
-  %call58 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp57, i8* %tmp56)
+  %call58 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp57, i8* %tmp56)
   %tmp59 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_42", align 8
-  %call6110 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %call548, i8* %tmp59, i8* %call58)
-  %tmp15 = call i8* @objc_retain(i8* %call6110) nounwind
-  call void @objc_release(i8* %call137) nounwind
+  %call6110 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %call548, i8* %tmp59, i8* %call58)
+  %tmp15 = call i8* @llvm.objc.retain(i8* %call6110) nounwind
+  call void @llvm.objc.release(i8* %call137) nounwind
   %tmp64 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_46", align 8
-  %call66 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, %1*)*)(i8* %call6110, i8* %tmp64, %1* bitcast (%struct.NSConstantString* @_unnamed_cfstring_44 to %1*))
+  %call66 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, %1*)*)(i8* %call6110, i8* %tmp64, %1* bitcast (%struct.NSConstantString* @_unnamed_cfstring_44 to %1*))
   %tobool67 = icmp eq i8 %call66, 0
   br i1 %tobool67, label %if.end74, label %if.then68
 
 if.then68:                                        ; preds = %if.then44
   %tmp70 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_48", align 8
-  %call7220 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %call6110, i8* %tmp70)
-  %tmp16 = call i8* @objc_retain(i8* %call7220) nounwind
-  call void @objc_release(i8* %call6110) nounwind
+  %call7220 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %call6110, i8* %tmp70)
+  %tmp16 = call i8* @llvm.objc.retain(i8* %call7220) nounwind
+  call void @llvm.objc.release(i8* %call6110) nounwind
   br label %if.end74
 
 if.end74:                                         ; preds = %if.then68, %if.then44
@@ -2964,7 +2964,7 @@ if.end74:                                         ; preds = %if.then68, %if.then
 
 land.lhs.true80:                                  ; preds = %if.end74
   %tmp82 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_25", align 8
-  %call84 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* %filename.0.in, i8* %tmp82)
+  %call84 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*)*)(i8* %filename.0.in, i8* %tmp82)
   %tobool86 = icmp eq i8 %call84, 0
   br i1 %tobool86, label %if.then109, label %if.end106
 
@@ -2972,17 +2972,17 @@ if.end106:                                        ; preds = %land.lhs.true80
   %tmp88 = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_26", align 8
   %tmp90 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_28", align 8
   %tmp91 = bitcast %struct._class_t* %tmp88 to i8*
-  %call9218 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp91, i8* %tmp90, i8* %filename.0.in)
+  %call9218 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp91, i8* %tmp90, i8* %filename.0.in)
   %tmp20 = bitcast i8* %call9218 to %5*
-  %tmp21 = call i8* @objc_retain(i8* %call9218) nounwind
+  %tmp21 = call i8* @llvm.objc.retain(i8* %call9218) nounwind
   %tmp22 = bitcast %5* %url.025 to i8*
-  call void @objc_release(i8* %tmp22) nounwind
+  call void @llvm.objc.release(i8* %tmp22) nounwind
   %tmp94 = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_29", align 8
   %tmp95 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_31", align 8
   %tmp96 = bitcast %struct._class_t* %tmp94 to i8*
-  %call97 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp96, i8* %tmp95)
+  %call97 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp96, i8* %tmp95)
   %tmp99 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_33", align 8
-  %call10119 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %call97, i8* %tmp99, i8* %call9218, i32 signext 1, %4** %err)
+  %call10119 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %call97, i8* %tmp99, i8* %call9218, i32 signext 1, %4** %err)
   %phitmp = icmp eq i8* %call10119, null
   br i1 %phitmp, label %if.then109, label %end
 
@@ -3000,12 +3000,12 @@ if.then112:                                       ; preds = %if.then109
   %tmp118 = load %1*, %1** @NSFilePathErrorKey, align 8
   %tmp119 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_53", align 8
   %tmp120 = bitcast %struct._class_t* %tmp115 to i8*
-  %call12113 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp120, i8* %tmp119, %1* %call117, %1* %tmp118, i8* null)
+  %call12113 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp120, i8* %tmp119, %1* %call117, %1* %tmp118, i8* null)
   %tmp122 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_55", align 8
   %tmp123 = bitcast %struct._class_t* %tmp113 to i8*
-  %call12414 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp123, i8* %tmp122, %1* %tmp114, i64 258, i8* %call12113)
-  %tmp23 = call i8* @objc_retain(i8* %call12414) nounwind
-  %tmp25 = call i8* @objc_autorelease(i8* %tmp23) nounwind
+  %call12414 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp123, i8* %tmp122, %1* %tmp114, i64 258, i8* %call12113)
+  %tmp23 = call i8* @llvm.objc.retain(i8* %call12414) nounwind
+  %tmp25 = call i8* @llvm.objc.autorelease(i8* %tmp23) nounwind
   %tmp28 = bitcast i8* %tmp25 to %4*
   store %4* %tmp28, %4** %err, align 8
   br label %if.end125
@@ -3015,44 +3015,44 @@ if.end125:                                        ; preds = %if.then112, %if.the
   %tmp126 = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_56", align 8
   %tmp128 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_58", align 8
   %tmp129 = bitcast %struct._class_t* %tmp126 to i8*
-  %call13015 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %tmp129, i8* %tmp128, %4* %tmp127)
+  %call13015 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %tmp129, i8* %tmp128, %4* %tmp127)
   %tmp131 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_60", align 8
-  %call13317 = call i8* (i8*, i8*, ...) @objc_msgSend(i8* %call13015, i8* %tmp131)
+  %call13317 = call i8* (i8*, i8*, ...) @llvm.objc.msgSend(i8* %call13015, i8* %tmp131)
   br label %end
 
 end:                                              ; preds = %if.end125, %if.end106, %if.end, %land.lhs.true, %entry
   %filename.2 = phi %1* [ %filename.0, %if.end106 ], [ %filename.0, %if.end125 ], [ %tmp, %land.lhs.true ], [ null, %entry ], [ %tmp, %if.end ]
   %origFilename.0 = phi %1* [ %tmp, %if.end106 ], [ %tmp, %if.end125 ], [ %tmp, %land.lhs.true ], [ null, %entry ], [ %tmp, %if.end ]
   %url.2 = phi %5* [ %tmp20, %if.end106 ], [ %url.129, %if.end125 ], [ null, %land.lhs.true ], [ null, %entry ], [ %tmp13, %if.end ]
-  call void @objc_release(i8* %tmp9) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp9) nounwind, !clang.imprecise_release !0
   %tmp29 = bitcast %5* %url.2 to i8*
-  call void @objc_release(i8* %tmp29) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp29) nounwind, !clang.imprecise_release !0
   %tmp30 = bitcast %1* %origFilename.0 to i8*
-  call void @objc_release(i8* %tmp30) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp30) nounwind, !clang.imprecise_release !0
   %tmp31 = bitcast %1* %filename.2 to i8*
-  call void @objc_release(i8* %tmp31) nounwind, !clang.imprecise_release !0
-  call void @objc_release(i8* %tmp7) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp31) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp7) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 declare i32 @__gxx_personality_v0(...)
 
-declare i32 @objc_sync_enter(i8*)
-declare i32 @objc_sync_exit(i8*)
+declare i32 @llvm.objc.sync.enter(i8*)
+declare i32 @llvm.objc.sync.exit(i8*)
 
 ; Make sure that we understand that objc_sync_{enter,exit} are IC_User not
 ; IC_Call/IC_CallOrUser.
 
 ; CHECK-LABEL:      define void @test67(
-; CHECK-NEXT:   call i32 @objc_sync_enter(i8* %x)
-; CHECK-NEXT:   call i32 @objc_sync_exit(i8* %x)
+; CHECK-NEXT:   call i32 @llvm.objc.sync.enter(i8* %x)
+; CHECK-NEXT:   call i32 @llvm.objc.sync.exit(i8* %x)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test67(i8* %x) {
-  call i8* @objc_retain(i8* %x)
-  call i32 @objc_sync_enter(i8* %x)
-  call i32 @objc_sync_exit(i8* %x)
-  call void @objc_release(i8* %x), !clang.imprecise_release !0
+  call i8* @llvm.objc.retain(i8* %x)
+  call i32 @llvm.objc.sync.enter(i8* %x)
+  call i32 @llvm.objc.sync.exit(i8* %x)
+  call void @llvm.objc.release(i8* %x), !clang.imprecise_release !0
   ret void
 }
 
@@ -3069,6 +3069,6 @@ define void @test67(i8* %x) {
 !4 = !DIFile(filename: "path/to/file", directory: "/path/to/dir")
 !5 = !{i32 2, !"Debug Info Version", i32 3}
 
-; CHECK: attributes #0 = { nounwind readnone speculatable }
 ; CHECK: attributes [[NUW]] = { nounwind }
+; CHECK: attributes #1 = { nounwind readnone speculatable }
 ; CHECK: ![[RELEASE]] = !{}
diff --git a/test/Transforms/ObjCARC/cfg-hazards.ll b/test/Transforms/ObjCARC/cfg-hazards.ll
index 8407e446b4f3759a960897d2a2728e0052789696..9559b3c31146eda9406120d65a4d24aedcdbd18e 100644
--- a/test/Transforms/ObjCARC/cfg-hazards.ll
+++ b/test/Transforms/ObjCARC/cfg-hazards.ll
@@ -5,21 +5,21 @@
 ; across them.
 
 declare void @use_pointer(i8*)
-declare i8* @objc_retain(i8*)
-declare void @objc_release(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare void @llvm.objc.release(i8*)
 declare void @callee()
 declare void @block_callee(void ()*)
 
 ; CHECK-LABEL: define void @test0(
-; CHECK:   call i8* @objc_retain(
+; CHECK:   call i8* @llvm.objc.retain
 ; CHECK: for.body:
 ; CHECK-NOT: @objc
 ; CHECK: for.end:
-; CHECK:   call void @objc_release(
+; CHECK:   call void @llvm.objc.release
 ; CHECK: }
 define void @test0(i8* %digits) {
 entry:
-  %tmp1 = call i8* @objc_retain(i8* %digits) nounwind
+  %tmp1 = call i8* @llvm.objc.retain(i8* %digits) nounwind
   call void @use_pointer(i8* %digits)
   br label %for.body
 
@@ -31,20 +31,20 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %cmp, label %for.body, label %for.end
 
 for.end:                                          ; preds = %for.body
-  call void @objc_release(i8* %digits) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %digits) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK-LABEL: define void @test1(
-; CHECK:   call i8* @objc_retain(
+; CHECK:   call i8* @llvm.objc.retain
 ; CHECK: for.body:
 ; CHECK-NOT: @objc
 ; CHECK: for.end:
-; CHECK:   void @objc_release(
+; CHECK:   void @llvm.objc.release
 ; CHECK: }
 define void @test1(i8* %digits) {
 entry:
-  %tmp1 = call i8* @objc_retain(i8* %digits) nounwind
+  %tmp1 = call i8* @llvm.objc.retain(i8* %digits) nounwind
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
@@ -56,20 +56,20 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %cmp, label %for.body, label %for.end
 
 for.end:                                          ; preds = %for.body
-  call void @objc_release(i8* %digits) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %digits) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; CHECK-LABEL: define void @test2(
-; CHECK:   call i8* @objc_retain(
+; CHECK:   call i8* @llvm.objc.retain
 ; CHECK: for.body:
 ; CHECK-NOT: @objc
 ; CHECK: for.end:
-; CHECK:   void @objc_release(
+; CHECK:   void @llvm.objc.release
 ; CHECK: }
 define void @test2(i8* %digits) {
 entry:
-  %tmp1 = call i8* @objc_retain(i8* %digits) nounwind
+  %tmp1 = call i8* @llvm.objc.retain(i8* %digits) nounwind
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
@@ -81,7 +81,7 @@ for.body:                                         ; preds = %for.body, %entry
 
 for.end:                                          ; preds = %for.body
   call void @use_pointer(i8* %digits)
-  call void @objc_release(i8* %digits) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %digits) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -89,17 +89,17 @@ for.end:                                          ; preds = %for.body
 
 ;      CHECK: define void @test3(i8* %a) #0 {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   tail call i8* @objc_retain(i8* %a) [[NUW:#[0-9]+]]
+; CHECK-NEXT:   tail call i8* @llvm.objc.retain(i8* %a) [[NUW:#[0-9]+]]
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
-; CHECK-NEXT:   call void @objc_release(i8* %a)
+; CHECK-NEXT:   call void @llvm.objc.release(i8* %a)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test3(i8* %a) nounwind {
 entry:
-  %outer = call i8* @objc_retain(i8* %a) nounwind
-  %inner = call i8* @objc_retain(i8* %a) nounwind
+  %outer = call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
@@ -108,24 +108,24 @@ loop:
   br i1 undef, label %loop, label %exit
 
 exit:
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ;      CHECK: define void @test4(i8* %a) #0 {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   tail call i8* @objc_retain(i8* %a) [[NUW]]
+; CHECK-NEXT:   tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
-; CHECK-NEXT:   call void @objc_release(i8* %a)
+; CHECK-NEXT:   call void @llvm.objc.release(i8* %a)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test4(i8* %a) nounwind {
 entry:
-  %outer = call i8* @objc_retain(i8* %a) nounwind
-  %inner = call i8* @objc_retain(i8* %a) nounwind
+  %outer = call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
@@ -138,26 +138,26 @@ more:
   br i1 undef, label %loop, label %exit
 
 exit:
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ;      CHECK: define void @test5(i8* %a) #0 {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   tail call i8* @objc_retain(i8* %a) [[NUW]]
+; CHECK-NEXT:   tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
 ; CHECK-NEXT:   call void @callee()
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
 ; CHECK-NEXT:   call void @use_pointer(i8* %a)
-; CHECK-NEXT:   call void @objc_release(i8* %a)
+; CHECK-NEXT:   call void @llvm.objc.release(i8* %a)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test5(i8* %a) nounwind {
 entry:
-  %outer = tail call i8* @objc_retain(i8* %a) nounwind
-  %inner = tail call i8* @objc_retain(i8* %a) nounwind
+  %outer = tail call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   call void @callee()
   br label %loop
 
@@ -172,25 +172,25 @@ more:
 
 exit:
   call void @use_pointer(i8* %a)
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ;      CHECK: define void @test6(i8* %a) #0 {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   tail call i8* @objc_retain(i8* %a) [[NUW]]
+; CHECK-NEXT:   tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
 ; CHECK-NEXT:   call void @use_pointer(i8* %a)
-; CHECK-NEXT:   call void @objc_release(i8* %a)
+; CHECK-NEXT:   call void @llvm.objc.release(i8* %a)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test6(i8* %a) nounwind {
 entry:
-  %outer = tail call i8* @objc_retain(i8* %a) nounwind
-  %inner = tail call i8* @objc_retain(i8* %a) nounwind
+  %outer = tail call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
@@ -205,25 +205,25 @@ more:
 
 exit:
   call void @use_pointer(i8* %a)
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ;      CHECK: define void @test7(i8* %a) #0 {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   tail call i8* @objc_retain(i8* %a) [[NUW]]
+; CHECK-NEXT:   tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
 ; CHECK-NEXT:   call void @callee()
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
-; CHECK-NEXT:   call void @objc_release(i8* %a)
+; CHECK-NEXT:   call void @llvm.objc.release(i8* %a)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test7(i8* %a) nounwind {
 entry:
-  %outer = tail call i8* @objc_retain(i8* %a) nounwind
-  %inner = tail call i8* @objc_retain(i8* %a) nounwind
+  %outer = tail call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   call void @callee()
   br label %loop
 
@@ -238,24 +238,24 @@ more:
   br i1 undef, label %exit, label %loop
 
 exit:
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ;      CHECK: define void @test8(i8* %a) #0 {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   tail call i8* @objc_retain(i8* %a) [[NUW]]
+; CHECK-NEXT:   tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
-; CHECK-NEXT:   call void @objc_release(i8* %a)
+; CHECK-NEXT:   call void @llvm.objc.release(i8* %a)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test8(i8* %a) nounwind {
 entry:
-  %outer = tail call i8* @objc_retain(i8* %a) nounwind
-  %inner = tail call i8* @objc_retain(i8* %a) nounwind
+  %outer = tail call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
@@ -270,22 +270,22 @@ more:
   br i1 undef, label %exit, label %loop
 
 exit:
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ;      CHECK: define void @test9(i8* %a) #0 {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test9(i8* %a) nounwind {
 entry:
-  %outer = tail call i8* @objc_retain(i8* %a) nounwind
-  %inner = tail call i8* @objc_retain(i8* %a) nounwind
+  %outer = tail call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
@@ -299,22 +299,22 @@ more:
   br i1 undef, label %exit, label %loop
 
 exit:
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ;      CHECK: define void @test10(i8* %a) #0 {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test10(i8* %a) nounwind {
 entry:
-  %outer = tail call i8* @objc_retain(i8* %a) nounwind
-  %inner = tail call i8* @objc_retain(i8* %a) nounwind
+  %outer = tail call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
@@ -328,22 +328,22 @@ more:
   br i1 undef, label %exit, label %loop
 
 exit:
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ;      CHECK: define void @test11(i8* %a) #0 {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test11(i8* %a) nounwind {
 entry:
-  %outer = tail call i8* @objc_retain(i8* %a) nounwind
-  %inner = tail call i8* @objc_retain(i8* %a) nounwind
+  %outer = tail call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
@@ -356,8 +356,8 @@ more:
   br i1 undef, label %exit, label %loop
 
 exit:
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -365,19 +365,19 @@ exit:
 
 ;      CHECK: define void @test12(i8* %a) #0 {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   %outer = tail call i8* @objc_retain(i8* %a) [[NUW]]
-; CHECK-NEXT:   %inner = tail call i8* @objc_retain(i8* %a) [[NUW]]
+; CHECK-NEXT:   %outer = tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
+; CHECK-NEXT:   %inner = tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
 ; CHECK-NEXT:   br label %loop
-;  CHECK-NOT:   @objc_
+;  CHECK-NOT:   @llvm.objc.
 ;      CHECK: exit:
-; CHECK-NEXT: call void @objc_release(i8* %a) [[NUW]]
-; CHECK-NEXT: call void @objc_release(i8* %a) [[NUW]], !clang.imprecise_release !0
+; CHECK-NEXT: call void @llvm.objc.release(i8* %a) [[NUW]]
+; CHECK-NEXT: call void @llvm.objc.release(i8* %a) [[NUW]], !clang.imprecise_release !0
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test12(i8* %a) nounwind {
 entry:
-  %outer = tail call i8* @objc_retain(i8* %a) nounwind
-  %inner = tail call i8* @objc_retain(i8* %a) nounwind
+  %outer = tail call i8* @llvm.objc.retain(i8* %a) nounwind
+  %inner = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
@@ -390,8 +390,8 @@ more:
   br i1 undef, label %exit, label %loop
 
 exit:
-  call void @objc_release(i8* %a) nounwind
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -402,31 +402,31 @@ exit:
 
 ; CHECK: define void @test13(i8* %a) [[NUW]] {
 ; CHECK: entry:
-; CHECK:   tail call i8* @objc_retain(i8* %a) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
 ; CHECK: loop:
-; CHECK:   tail call i8* @objc_retain(i8* %a) [[NUW]]
+; CHECK:   tail call i8* @llvm.objc.retain(i8* %a) [[NUW]]
 ; CHECK:   call void @block_callee
-; CHECK:   call void @objc_release(i8* %reloaded_a) [[NUW]]
+; CHECK:   call void @llvm.objc.release(i8* %reloaded_a) [[NUW]]
 ; CHECK: exit:
-; CHECK:   call void @objc_release(i8* %a) [[NUW]]
+; CHECK:   call void @llvm.objc.release(i8* %a) [[NUW]]
 ; CHECK: }
 define void @test13(i8* %a) nounwind {
 entry:
   %block = alloca i8*
-  %a1 = tail call i8* @objc_retain(i8* %a) nounwind
+  %a1 = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   br label %loop
 
 loop:
-  %a2 = tail call i8* @objc_retain(i8* %a) nounwind
+  %a2 = tail call i8* @llvm.objc.retain(i8* %a) nounwind
   store i8* %a, i8** %block, align 8
   %casted_block = bitcast i8** %block to void ()*
   call void @block_callee(void ()* %casted_block)
   %reloaded_a = load i8*, i8** %block, align 8
-  call void @objc_release(i8* %reloaded_a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %reloaded_a) nounwind, !clang.imprecise_release !0
   br i1 undef, label %loop, label %exit
   
 exit:
-  call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %a) nounwind, !clang.imprecise_release !0
   ret void
 }
 
diff --git a/test/Transforms/ObjCARC/clang-arc-use-barrier.ll b/test/Transforms/ObjCARC/clang-arc-use-barrier.ll
index 98d49ec512eee55d16258c9844e7870ba5e0049d..a00c11731c115e660665f6ad2ab5ab4a9f17f54c 100644
--- a/test/Transforms/ObjCARC/clang-arc-use-barrier.ll
+++ b/test/Transforms/ObjCARC/clang-arc-use-barrier.ll
@@ -2,36 +2,36 @@
 
 %0 = type opaque
 
-; Make sure ARC optimizer doesn't sink @obj_retain past @clang.arc.use.
+; Make sure ARC optimizer doesn't sink @obj_retain past @llvm.objc.clang.arc.use.
 
-; CHECK: call i8* @objc_retain(
-; CHECK: call void (...) @clang.arc.use(
-; CHECK: call i8* @objc_retain(
-; CHECK: call void (...) @clang.arc.use(
+; CHECK: call i8* @llvm.objc.retain
+; CHECK: call void (...) @llvm.objc.clang.arc.use(
+; CHECK: call i8* @llvm.objc.retain
+; CHECK: call void (...) @llvm.objc.clang.arc.use(
 
 define void @runTest() local_unnamed_addr {
   %1 = alloca %0*, align 8
   %2 = alloca %0*, align 8
   %3 = tail call %0* @foo0()
   %4 = bitcast %0* %3 to i8*
-  %5 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %4)
+  %5 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %4)
   store %0* %3, %0** %1, align 8
   call void @foo1(%0** nonnull %1)
   %6 = load %0*, %0** %1, align 8
   %7 = bitcast %0* %6 to i8*
-  %8 = call i8* @objc_retain(i8* %7)
-  call void (...) @clang.arc.use(%0* %3)
-  call void @objc_release(i8* %4)
+  %8 = call i8* @llvm.objc.retain(i8* %7)
+  call void (...) @llvm.objc.clang.arc.use(%0* %3)
+  call void @llvm.objc.release(i8* %4)
   store %0* %6, %0** %2, align 8
   call void @foo1(%0** nonnull %2)
   %9 = load %0*, %0** %2, align 8
   %10 = bitcast %0* %9 to i8*
-  %11 = call i8* @objc_retain(i8* %10)
-  call void (...) @clang.arc.use(%0* %6)
+  %11 = call i8* @llvm.objc.retain(i8* %10)
+  call void (...) @llvm.objc.clang.arc.use(%0* %6)
   %tmp1 = load %0*, %0** %2, align 8
-  call void @objc_release(i8* %7)
+  call void @llvm.objc.release(i8* %7)
   call void @foo2(%0* %9)
-  call void @objc_release(i8* %10)
+  call void @llvm.objc.release(i8* %10)
   ret void
 }
 
@@ -39,7 +39,7 @@ declare %0* @foo0() local_unnamed_addr
 declare void @foo1(%0**) local_unnamed_addr
 declare void @foo2(%0*) local_unnamed_addr
 
-declare i8* @objc_retainAutoreleasedReturnValue(i8*) local_unnamed_addr
-declare i8* @objc_retain(i8*) local_unnamed_addr
-declare void @clang.arc.use(...) local_unnamed_addr
-declare void @objc_release(i8*) local_unnamed_addr
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*) local_unnamed_addr
+declare i8* @llvm.objc.retain(i8*) local_unnamed_addr
+declare void @llvm.objc.clang.arc.use(...) local_unnamed_addr
+declare void @llvm.objc.release(i8*) local_unnamed_addr
diff --git a/test/Transforms/ObjCARC/comdat-ipo.ll b/test/Transforms/ObjCARC/comdat-ipo.ll
index 0a5713e9ab6f22227238001cab776b4846135ce9..44d7b10104911b01f6a0e21e941e00608be03c38 100644
--- a/test/Transforms/ObjCARC/comdat-ipo.ll
+++ b/test/Transforms/ObjCARC/comdat-ipo.ll
@@ -30,24 +30,24 @@ entry:
 ; CHECK-LABEL: define internal void @_GLOBAL__I_x() {
 define internal void @_GLOBAL__I_x() {
 entry:
-; CHECK:  call i8* @objc_autoreleasePoolPush()
+; CHECK:  call i8* @llvm.objc.autoreleasePoolPush()
 ; CHECK-NEXT:  call void @__cxx_global_var_init()
-; CHECK-NEXT:  call void @objc_autoreleasePoolPop(i8* %0)
+; CHECK-NEXT:  call void @llvm.objc.autoreleasePoolPop(i8* %0)
 ; CHECK-NEXT:  ret void
 
-  %0 = call i8* @objc_autoreleasePoolPush() nounwind
+  %0 = call i8* @llvm.objc.autoreleasePoolPush() nounwind
   call void @__cxx_global_var_init()
-  call void @objc_autoreleasePoolPop(i8* %0) nounwind
+  call void @llvm.objc.autoreleasePoolPop(i8* %0) nounwind
   ret void
 }
 
 define internal void @_GLOBAL__I_y() {
 entry:
-  %0 = call i8* @objc_autoreleasePoolPush() nounwind
+  %0 = call i8* @llvm.objc.autoreleasePoolPush() nounwind
   call void @__dxx_global_var_init()
-  call void @objc_autoreleasePoolPop(i8* %0) nounwind
+  call void @llvm.objc.autoreleasePoolPop(i8* %0) nounwind
   ret void
 }
 
-declare i8* @objc_autoreleasePoolPush()
-declare void @objc_autoreleasePoolPop(i8*)
+declare i8* @llvm.objc.autoreleasePoolPush()
+declare void @llvm.objc.autoreleasePoolPop(i8*)
diff --git a/test/Transforms/ObjCARC/contract-catchswitch.ll b/test/Transforms/ObjCARC/contract-catchswitch.ll
index 3f5dd93f58416084a9c5a9c94e52d1bf4b62dd12..90b6522d68117e41b6eac980aa82cba32895a13d 100644
--- a/test/Transforms/ObjCARC/contract-catchswitch.ll
+++ b/test/Transforms/ObjCARC/contract-catchswitch.ll
@@ -6,8 +6,8 @@ target triple = "i686--windows-msvc19.11.0"
 %0 = type opaque
 
 declare i32 @__CxxFrameHandler3(...)
-declare dllimport void @objc_release(i8*) local_unnamed_addr
-declare dllimport i8* @objc_retain(i8* returned) local_unnamed_addr
+declare dllimport void @llvm.objc.release(i8*) local_unnamed_addr
+declare dllimport i8* @llvm.objc.retain(i8* returned) local_unnamed_addr
 
 @p = global i8* null, align 4
 
@@ -17,7 +17,7 @@ define void @g() local_unnamed_addr personality i8* bitcast (i32 (...)* @__CxxFr
 entry:
   %tmp = load i8*, i8** @p, align 4
   %cast = bitcast i8* %tmp to %0*
-  %tmp1 = tail call i8* @objc_retain(i8* %tmp) #0
+  %tmp1 = tail call i8* @llvm.objc.retain(i8* %tmp) #0
   ; Split the basic block to ensure bitcast ends up in entry.split.
   br label %entry.split
 
@@ -43,8 +43,8 @@ catch1:
 invoke.cont:
   %tmp6 = load i8*, i8** @p, align 4
   %cast1 = bitcast i8* %tmp6 to %0*
-  %tmp7 = tail call i8* @objc_retain(i8* %tmp6) #0
-  call void @objc_release(i8* %tmp) #0, !clang.imprecise_release !0
+  %tmp7 = tail call i8* @llvm.objc.retain(i8* %tmp6) #0
+  call void @llvm.objc.release(i8* %tmp) #0, !clang.imprecise_release !0
   ; Split the basic block to ensure bitcast ends up in invoke.cont.split.
   br label %invoke.cont.split
 
@@ -59,7 +59,7 @@ ehcleanup:
   %tmp8 = phi %0* [ %cast, %catch.dispatch1 ], [ %cast1, %invoke.cont.split ]
   %tmp9 = cleanuppad within none []
   %tmp10 = bitcast %0* %tmp8 to i8*
-  call void @objc_release(i8* %tmp10) #0 [ "funclet"(token %tmp9) ]
+  call void @llvm.objc.release(i8* %tmp10) #0 [ "funclet"(token %tmp9) ]
   cleanupret from %tmp9 unwind to caller
 }
 
diff --git a/test/Transforms/ObjCARC/contract-end-of-use-list.ll b/test/Transforms/ObjCARC/contract-end-of-use-list.ll
index a38cd8a1da1786ce2ef03f03c8b846847101441f..364d72252fa215063b5f1b0e6b9c5e93e82ed4e0 100644
--- a/test/Transforms/ObjCARC/contract-end-of-use-list.ll
+++ b/test/Transforms/ObjCARC/contract-end-of-use-list.ll
@@ -10,14 +10,14 @@ target triple = "x86_64-apple-darwin13.2.0"
 define internal i8* @foo() {
 entry:
   %call = call i8* @bar()
-; CHECK: %retained1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call)
-  %retained1 = call i8* @objc_retain(i8* %call)
+; CHECK: %retained1 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call)
+  %retained1 = call i8* @llvm.objc.retain(i8* %call)
   %isnull = icmp eq i8* %retained1, null
   br i1 %isnull, label %cleanup, label %if.end
 
 if.end:
-; CHECK: %retained2 = call i8* @objc_retain(i8* %retained1)
-  %retained2 = call i8* @objc_retain(i8* %retained1)
+; CHECK: %retained2 = call i8* @llvm.objc.retain(i8* %retained1)
+  %retained2 = call i8* @llvm.objc.retain(i8* %retained1)
   br label %cleanup
 
 cleanup:
@@ -27,4 +27,4 @@ cleanup:
 
 declare i8* @bar()
 
-declare extern_weak i8* @objc_retain(i8*)
+declare extern_weak i8* @llvm.objc.retain(i8*)
diff --git a/test/Transforms/ObjCARC/contract-marker-funclet.ll b/test/Transforms/ObjCARC/contract-marker-funclet.ll
index 4e116c47f6438e899fafe0331e4eb0de4a62a951..462b24c42b14388c2dc0a5c81bf127a0e5852122 100644
--- a/test/Transforms/ObjCARC/contract-marker-funclet.ll
+++ b/test/Transforms/ObjCARC/contract-marker-funclet.ll
@@ -21,22 +21,22 @@ catch.dispatch:                                   ; preds = %entry
 catch:                                            ; preds = %catch.dispatch
   %1 = catchpad within %0 [i8* null, i32 64, i8* null]
   %call1 = call i8* @"\01?f@@YAPAUobjc_object@@XZ"() [ "funclet"(token %1) ]
-  %2 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call1) [ "funclet"(token %1) ]
-  call void @objc_release(i8* %2) [ "funclet"(token %1) ]
+  %2 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call1) [ "funclet"(token %1) ]
+  call void @llvm.objc.release(i8* %2) [ "funclet"(token %1) ]
   br label %catch.1
 
 catch.1:                                          ; preds = %catch
   %call2 = call i8* @"\01?f@@YAPAUobjc_object@@XZ"() [ "funclet"(token %1) ]
-  %3 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call2) [ "funclet"(token %1) ]
-  call void @objc_release(i8* %3) [ "funclet"(token %1) ]
+  %3 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call2) [ "funclet"(token %1) ]
+  call void @llvm.objc.release(i8* %3) [ "funclet"(token %1) ]
   catchret from %1 to label %catchret.dest
 
 catchret.dest:                                    ; preds = %catch.1
   ret void
 
 invoke.cont:                                      ; preds = %entry
-  %4 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call)
-  call void @objc_release(i8* %4)
+  %4 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call)
+  call void @llvm.objc.release(i8* %4)
   ret void
 }
 
@@ -44,9 +44,9 @@ declare i8* @"\01?f@@YAPAUobjc_object@@XZ"()
 
 declare i32 @__CxxFrameHandler3(...)
 
-declare dllimport i8* @objc_retainAutoreleasedReturnValue(i8*)
+declare dllimport i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
 
-declare dllimport void @objc_release(i8*)
+declare dllimport void @llvm.objc.release(i8*)
 
 !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
 !0 = !{!"movl\09%ebp, %ebp\09\09// marker for objc_retainAutoreleaseReturnValue"}
diff --git a/test/Transforms/ObjCARC/contract-marker.ll b/test/Transforms/ObjCARC/contract-marker.ll
index bf70d4e9d04486f9273607effb92288e905bfaf7..6dc93feb0453d4f407d4820eb7a4ccbe4b3984bd 100644
--- a/test/Transforms/ObjCARC/contract-marker.ll
+++ b/test/Transforms/ObjCARC/contract-marker.ll
@@ -4,14 +4,14 @@
 ; CHECK:      %call = tail call i32* @qux()
 ; CHECK-NEXT: %tcall = bitcast i32* %call to i8*
 ; CHECK-NEXT: call void asm sideeffect "mov\09r7, r7\09\09@ marker for return value optimization", ""()
-; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tcall) [[NUW:#[0-9]+]]
+; CHECK-NEXT: %0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %tcall) [[NUW:#[0-9]+]]
 ; CHECK: }
 
 define void @foo() {
 entry:
   %call = tail call i32* @qux()
   %tcall = bitcast i32* %call to i8*
-  %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tcall) nounwind
+  %0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %tcall) nounwind
   tail call void @bar(i8* %0)
   ret void
 }
@@ -20,22 +20,22 @@ entry:
 ; CHECK:      %call = tail call i32* @qux()
 ; CHECK-NEXT: %tcall = bitcast i32* %call to i8*
 ; CHECK-NEXT: call void asm sideeffect "mov\09r7, r7\09\09@ marker for return value optimization", ""()
-; CHECK-NEXT: %0 = tail call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %tcall) [[NUW:#[0-9]+]]
+; CHECK-NEXT: %0 = tail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %tcall) [[NUW:#[0-9]+]]
 ; CHECK: }
 
 define void @foo2() {
 entry:
   %call = tail call i32* @qux()
   %tcall = bitcast i32* %call to i8*
-  %0 = tail call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %tcall) nounwind
+  %0 = tail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %tcall) nounwind
   tail call void @bar(i8* %0)
   ret void
 }
 
 
 declare i32* @qux()
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare i8* @objc_unsafeClaimAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*)
 declare void @bar(i8*)
 
 !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
diff --git a/test/Transforms/ObjCARC/contract-replace-arg-use.ll b/test/Transforms/ObjCARC/contract-replace-arg-use.ll
index 4cff9f7fc098c4e30f8d138f6a98a13436e00bce..28e2f6e0e6386c440e69eb94a402c68aa74d474e 100644
--- a/test/Transforms/ObjCARC/contract-replace-arg-use.ll
+++ b/test/Transforms/ObjCARC/contract-replace-arg-use.ll
@@ -1,20 +1,20 @@
 ; RUN: opt -objc-arc-contract -S < %s | FileCheck %s
 
-declare i8* @objc_autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
 declare i8* @foo1()
 
 ; Check that ARC contraction replaces the function return with the value
-; returned by @objc_autoreleaseReturnValue.
+; returned by @llvm.objc.autoreleaseReturnValue.
 
 ; CHECK-LABEL: define i32* @autoreleaseRVTailCall(
-; CHECK: %[[V0:[0-9]+]] = tail call i8* @objc_autoreleaseReturnValue(
+; CHECK: %[[V0:[0-9]+]] = tail call i8* @llvm.objc.autoreleaseReturnValue(
 ; CHECK: %[[V1:[0-9]+]] = bitcast i8* %[[V0]] to i32*
 ; CHECK: ret i32* %[[V1]]
 
 define i32* @autoreleaseRVTailCall() {
   %1 = call i8* @foo1()
   %2 = bitcast i8* %1 to i32*
-  %3 = tail call i8* @objc_autoreleaseReturnValue(i8* %1)
+  %3 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %1)
   ret i32* %2
 }
 
@@ -23,7 +23,7 @@ declare i32* @foo2(i32);
 ; CHECK-LABEL: define i32* @autoreleaseRVTailCallPhi(
 ; CHECK: %[[PHIVAL:.*]] = phi i8* [ %{{.*}}, %bb1 ], [ %{{.*}}, %bb2 ]
 ; CHECK: %[[RETVAL:.*]] = phi i32* [ %{{.*}}, %bb1 ], [ %{{.*}}, %bb2 ]
-; CHECK: %[[V4:.*]] = tail call i8* @objc_autoreleaseReturnValue(i8* %[[PHIVAL]])
+; CHECK: %[[V4:.*]] = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %[[PHIVAL]])
 ; CHECK: %[[V0:.*]] = bitcast i8* %[[V4]] to i32*
 ; CHECK: ret i32* %[[V0]]
 
@@ -41,6 +41,6 @@ bb2:
 bb3:
   %phival = phi i8* [ %v1, %bb1 ], [ %v3, %bb2 ]
   %retval = phi i32* [ %v0, %bb1 ], [ %v2, %bb2 ]
-  %v4 = tail call i8* @objc_autoreleaseReturnValue(i8* %phival)
+  %v4 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %phival)
   ret i32* %retval
 }
diff --git a/test/Transforms/ObjCARC/contract-storestrong-funclet.ll b/test/Transforms/ObjCARC/contract-storestrong-funclet.ll
index 2155a36db41f570ff651c03a74ecd28425193c8c..afeab0e6a7fce15136d019d7555a7fdca36edf9b 100644
--- a/test/Transforms/ObjCARC/contract-storestrong-funclet.ll
+++ b/test/Transforms/ObjCARC/contract-storestrong-funclet.ll
@@ -2,9 +2,9 @@
 
 declare void @f()
 declare i32 @__CxxFrameHandler3(...)
-declare dllimport i8* @objc_retain(i8*)
-declare dllimport i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare dllimport void @objc_release(i8*)
+declare dllimport i8* @llvm.objc.retain(i8*)
+declare dllimport i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare dllimport void @llvm.objc.release(i8*)
 
 @x = external global i8*
 
@@ -12,26 +12,26 @@ define void @g(i8* %p) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 t
   invoke void @f() to label %invoke.cont unwind label %ehcleanup
 
 invoke.cont:
-  %call = tail call i8* @objc_retain(i8* %p) nounwind
+  %call = tail call i8* @llvm.objc.retain(i8* %p) nounwind
   %tmp = load i8*, i8** @x, align 4
   store i8* %call, i8** @x, align 4
-  tail call void @objc_release(i8* %tmp) nounwind
+  tail call void @llvm.objc.release(i8* %tmp) nounwind
   ret void
 
 ehcleanup:
   %1 = cleanuppad within none []
-  %call1 = tail call i8* @objc_retain(i8* %p) nounwind [ "funclet"(token %1) ]
+  %call1 = tail call i8* @llvm.objc.retain(i8* %p) nounwind [ "funclet"(token %1) ]
   %tmp1 = load i8*, i8** @x, align 4
   store i8* %call1, i8** @x, align 4
-  tail call void @objc_release(i8* %tmp1) nounwind [ "funclet"(token %1) ]
+  tail call void @llvm.objc.release(i8* %tmp1) nounwind [ "funclet"(token %1) ]
   cleanupret from %1 unwind to caller
 }
 
 ; CHECK-LABEL: invoke.cont:
-; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %p) #0{{$}}
+; CHECK: tail call void @llvm.objc.storeStrong(i8** @x, i8* %p) #0{{$}}
 ; CHECK: ret void
 
 ; CHECK-LABEL: ehcleanup:
 ; CHECK: %1 = cleanuppad within none []
-; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %p) #0 [ "funclet"(token %1) ]
+; CHECK: tail call void @llvm.objc.storeStrong(i8** @x, i8* %p) #0 [ "funclet"(token %1) ]
 ; CHECK: cleanupret from %1 unwind to caller
diff --git a/test/Transforms/ObjCARC/contract-storestrong-ivar.ll b/test/Transforms/ObjCARC/contract-storestrong-ivar.ll
index 8b1a02f3feb9c46ad14a7316193fa6db23b0cff1..79db46a21ebb1fa2a9fd3c81ed250898efac881f 100644
--- a/test/Transforms/ObjCARC/contract-storestrong-ivar.ll
+++ b/test/Transforms/ObjCARC/contract-storestrong-ivar.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -objc-arc-contract -S < %s | FileCheck %s
 
-; CHECK: tail call void @objc_storeStrong(i8**
+; CHECK: tail call void @llvm.objc.storeStrong(i8**
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin11.0.0"
@@ -10,9 +10,9 @@ target triple = "x86_64-apple-darwin11.0.0"
 
 @"OBJC_IVAR_$_Controller.preferencesController" = external global i64, section "__DATA, __objc_const", align 8
 
-declare i8* @objc_retain(i8*)
+declare i8* @llvm.objc.retain(i8*)
 
-declare void @objc_release(i8*)
+declare void @llvm.objc.release(i8*)
 
 define hidden void @y(%0* nocapture %self, %1* %preferencesController) nounwind {
 entry:
@@ -22,9 +22,9 @@ entry:
   %tmp1 = bitcast i8* %add.ptr to %1**
   %tmp2 = load %1*, %1** %tmp1, align 8
   %tmp3 = bitcast %1* %preferencesController to i8*
-  %tmp4 = tail call i8* @objc_retain(i8* %tmp3) nounwind
+  %tmp4 = tail call i8* @llvm.objc.retain(i8* %tmp3) nounwind
   %tmp5 = bitcast %1* %tmp2 to i8*
-  tail call void @objc_release(i8* %tmp5) nounwind
+  tail call void @llvm.objc.release(i8* %tmp5) nounwind
   %tmp6 = bitcast i8* %tmp4 to %1*
   store %1* %tmp6, %1** %tmp1, align 8
   ret void
diff --git a/test/Transforms/ObjCARC/contract-storestrong.ll b/test/Transforms/ObjCARC/contract-storestrong.ll
index a02f7b7019125549888aaee8582740cf169b9ee6..eff0a6fdf900a9a413dd129aa30c818ef595a9be 100644
--- a/test/Transforms/ObjCARC/contract-storestrong.ll
+++ b/test/Transforms/ObjCARC/contract-storestrong.ll
@@ -2,23 +2,23 @@
 
 target datalayout = "e-p:64:64:64"
 
-declare i8* @objc_retain(i8*)
-declare void @objc_release(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare void @llvm.objc.release(i8*)
 declare void @use_pointer(i8*)
 
 @x = external global i8*
 
 ; CHECK-LABEL: define void @test0(
 ; CHECK: entry:
-; CHECK-NEXT: tail call void @objc_storeStrong(i8** @x, i8* %p) [[NUW:#[0-9]+]]
+; CHECK-NEXT: tail call void @llvm.objc.storeStrong(i8** @x, i8* %p) [[NUW:#[0-9]+]]
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test0(i8* %p) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %p) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %p) nounwind
   %tmp = load i8*, i8** @x, align 8
   store i8* %0, i8** @x, align 8
-  tail call void @objc_release(i8* %tmp) nounwind
+  tail call void @llvm.objc.release(i8* %tmp) nounwind
   ret void
 }
 
@@ -26,18 +26,18 @@ entry:
 
 ; CHECK-LABEL: define void @test1(i8* %p) {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %p) [[NUW]]
+; CHECK-NEXT:   %0 = tail call i8* @llvm.objc.retain(i8* %p) [[NUW]]
 ; CHECK-NEXT:   %tmp = load volatile i8*, i8** @x, align 8
 ; CHECK-NEXT:   store i8* %0, i8** @x, align 8
-; CHECK-NEXT:   tail call void @objc_release(i8* %tmp) [[NUW]]
+; CHECK-NEXT:   tail call void @llvm.objc.release(i8* %tmp) [[NUW]]
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test1(i8* %p) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %p) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %p) nounwind
   %tmp = load volatile i8*, i8** @x, align 8
   store i8* %0, i8** @x, align 8
-  tail call void @objc_release(i8* %tmp) nounwind
+  tail call void @llvm.objc.release(i8* %tmp) nounwind
   ret void
 }
 
@@ -45,18 +45,18 @@ entry:
 
 ; CHECK-LABEL: define void @test2(i8* %p) {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %p) [[NUW]]
+; CHECK-NEXT:   %0 = tail call i8* @llvm.objc.retain(i8* %p) [[NUW]]
 ; CHECK-NEXT:   %tmp = load i8*, i8** @x, align 8
 ; CHECK-NEXT:   store volatile i8* %0, i8** @x, align 8
-; CHECK-NEXT:   tail call void @objc_release(i8* %tmp) [[NUW]]
+; CHECK-NEXT:   tail call void @llvm.objc.release(i8* %tmp) [[NUW]]
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test2(i8* %p) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %p) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %p) nounwind
   %tmp = load i8*, i8** @x, align 8
   store volatile i8* %0, i8** @x, align 8
-  tail call void @objc_release(i8* %tmp) nounwind
+  tail call void @llvm.objc.release(i8* %tmp) nounwind
   ret void
 }
 
@@ -65,20 +65,20 @@ entry:
 
 ; CHECK-LABEL: define void @test3(i8* %newValue) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    %x0 = tail call i8* @objc_retain(i8* %newValue) [[NUW]]
+; CHECK-NEXT:    %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) [[NUW]]
 ; CHECK-NEXT:    %x1 = load i8*, i8** @x, align 8
 ; CHECK-NEXT:    store i8* %x0, i8** @x, align 8
 ; CHECK-NEXT:    tail call void @use_pointer(i8* %x1), !clang.arc.no_objc_arc_exceptions !0
-; CHECK-NEXT:    tail call void @objc_release(i8* %x1) [[NUW]], !clang.imprecise_release !0
+; CHECK-NEXT:    tail call void @llvm.objc.release(i8* %x1) [[NUW]], !clang.imprecise_release !0
 ; CHECK-NEXT:    ret void
 ; CHECK-NEXT:  }
 define void @test3(i8* %newValue) {
 entry:
-  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) nounwind
   %x1 = load i8*, i8** @x, align 8
   store i8* %newValue, i8** @x, align 8
   tail call void @use_pointer(i8* %x1), !clang.arc.no_objc_arc_exceptions !0
-  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x1) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -86,36 +86,36 @@ entry:
 
 ; CHECK-LABEL:  define i1 @test4(i8* %newValue, i8* %foo) {
 ; CHECK-NEXT:   entry:
-; CHECK-NEXT:     %x0 = tail call i8* @objc_retain(i8* %newValue) [[NUW]]
+; CHECK-NEXT:     %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) [[NUW]]
 ; CHECK-NEXT:     %x1 = load i8*, i8** @x, align 8
 ; CHECK-NEXT:     store i8* %x0, i8** @x, align 8
 ; CHECK-NEXT:     %t = icmp eq i8* %x1, %foo
-; CHECK-NEXT:     tail call void @objc_release(i8* %x1) [[NUW]], !clang.imprecise_release !0
+; CHECK-NEXT:     tail call void @llvm.objc.release(i8* %x1) [[NUW]], !clang.imprecise_release !0
 ; CHECK-NEXT:     ret i1 %t
 ; CHECK-NEXT:   }
 define i1 @test4(i8* %newValue, i8* %foo) {
 entry:
-  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) nounwind
   %x1 = load i8*, i8** @x, align 8
   store i8* %newValue, i8** @x, align 8
   %t = icmp eq i8* %x1, %foo
-  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x1) nounwind, !clang.imprecise_release !0
   ret i1 %t
 }
 
-; Do form an objc_storeStrong here, because the use is before the store.
+; Do form an llvm.objc.storeStrong here, because the use is before the store.
 
 ; CHECK-LABEL: define i1 @test5(i8* %newValue, i8* %foo) {
 ; CHECK: %t = icmp eq i8* %x1, %foo
-; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) [[NUW]]
+; CHECK: tail call void @llvm.objc.storeStrong(i8** @x, i8* %newValue) [[NUW]]
 ; CHECK: }
 define i1 @test5(i8* %newValue, i8* %foo) {
 entry:
-  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) nounwind
   %x1 = load i8*, i8** @x, align 8
   %t = icmp eq i8* %x1, %foo
   store i8* %newValue, i8** @x, align 8
-  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x1) nounwind, !clang.imprecise_release !0
   ret i1 %t
 }
 
@@ -123,49 +123,49 @@ entry:
 
 ; CHECK-LABEL: define i1 @test6(i8* %newValue, i8* %foo) {
 ; CHECK: %t = icmp eq i8* %x1, %foo
-; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) [[NUW]]
+; CHECK: tail call void @llvm.objc.storeStrong(i8** @x, i8* %newValue) [[NUW]]
 ; CHECK: }
 define i1 @test6(i8* %newValue, i8* %foo) {
 entry:
-  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) nounwind
   %x1 = load i8*, i8** @x, align 8
-  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x1) nounwind, !clang.imprecise_release !0
   %t = icmp eq i8* %x1, %foo
   store i8* %newValue, i8** @x, align 8
   ret i1 %t
 }
 
-; Like test0, but there's no store, so don't form an objc_storeStrong.
+; Like test0, but there's no store, so don't form an llvm.objc.storeStrong.
 
 ; CHECK-LABEL: define void @test7(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %p) [[NUW]]
+; CHECK-NEXT:   %0 = tail call i8* @llvm.objc.retain(i8* %p) [[NUW]]
 ; CHECK-NEXT:   %tmp = load i8*, i8** @x, align 8
-; CHECK-NEXT:   tail call void @objc_release(i8* %tmp) [[NUW]]
+; CHECK-NEXT:   tail call void @llvm.objc.release(i8* %tmp) [[NUW]]
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test7(i8* %p) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %p) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %p) nounwind
   %tmp = load i8*, i8** @x, align 8
-  tail call void @objc_release(i8* %tmp) nounwind
+  tail call void @llvm.objc.release(i8* %tmp) nounwind
   ret void
 }
 
-; Like test0, but there's no retain, so don't form an objc_storeStrong.
+; Like test0, but there's no retain, so don't form an llvm.objc.storeStrong.
 
 ; CHECK-LABEL: define void @test8(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   %tmp = load i8*, i8** @x, align 8
 ; CHECK-NEXT:   store i8* %p, i8** @x, align 8
-; CHECK-NEXT:   tail call void @objc_release(i8* %tmp) [[NUW]]
+; CHECK-NEXT:   tail call void @llvm.objc.release(i8* %tmp) [[NUW]]
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test8(i8* %p) {
 entry:
   %tmp = load i8*, i8** @x, align 8
   store i8* %p, i8** @x, align 8
-  tail call void @objc_release(i8* %tmp) nounwind
+  tail call void @llvm.objc.release(i8* %tmp) nounwind
   ret void
 }
 
@@ -176,13 +176,13 @@ entry:
 ; pointer.
 ;
 ; CHECK-LABEL: define i1 @test9(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
-; CHECK-NOT: objc_storeStrong
+; CHECK-NOT: llvm.objc.storeStrong
 define i1 @test9(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
 entry:
-  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
-  tail call void @objc_release(i8* %unrelated_ptr) nounwind, !clang.imprecise_release !0
+  %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) nounwind
+  tail call void @llvm.objc.release(i8* %unrelated_ptr) nounwind, !clang.imprecise_release !0
   %x1 = load i8*, i8** @x, align 8
-  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x1) nounwind, !clang.imprecise_release !0
   %t = icmp eq i8* %x1, %foo
   store i8* %newValue, i8** @x, align 8
   ret i1 %t  
@@ -191,13 +191,13 @@ entry:
 ; Make sure that we don't perform the optimization when we just have a call.
 ;
 ; CHECK-LABEL: define i1 @test10(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
-; CHECK-NOT: objc_storeStrong
+; CHECK-NOT: llvm.objc.storeStrong
 define i1 @test10(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
 entry:
-  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) nounwind
   call void @use_pointer(i8* %unrelated_ptr)
   %x1 = load i8*, i8** @x, align 8
-  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x1) nounwind, !clang.imprecise_release !0
   %t = icmp eq i8* %x1, %foo
   store i8* %newValue, i8** @x, align 8
   ret i1 %t
@@ -206,13 +206,13 @@ entry:
 ; Make sure we form the store strong if the use in between the retain
 ; and the store does not touch reference counts.
 ; CHECK-LABEL: define i1 @test11(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
-; CHECK: objc_storeStrong
+; CHECK: llvm.objc.storeStrong
 define i1 @test11(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
 entry:
-  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  %x0 = tail call i8* @llvm.objc.retain(i8* %newValue) nounwind
   %t = icmp eq i8* %newValue, %foo
   %x1 = load i8*, i8** @x, align 8
-  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %x1) nounwind, !clang.imprecise_release !0
   store i8* %newValue, i8** @x, align 8
   ret i1 %t
 }
@@ -227,31 +227,31 @@ entry:
 ; CHECK-NEXT: %p32 = bitcast i8** @x to i32**
 ; CHECK-NEXT: %v32 = bitcast i8* %p to i32*
 ; CHECK-NEXT: %0 = bitcast i16** %p16 to i8**
-; CHECK-NEXT: tail call void @objc_storeStrong(i8** %0, i8* %p)
+; CHECK-NEXT: tail call void @llvm.objc.storeStrong(i8** %0, i8* %p)
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test12(i8* %p) {
 entry:
-  %retain = tail call i8* @objc_retain(i8* %p) nounwind
+  %retain = tail call i8* @llvm.objc.retain(i8* %p) nounwind
   %p16 = bitcast i8** @x to i16**
   %tmp16 = load i16*, i16** %p16, align 8
   %tmp8 = bitcast i16* %tmp16 to i8*
   %p32 = bitcast i8** @x to i32**
   %v32 = bitcast i8* %retain to i32*
   store i32* %v32, i32** %p32, align 8
-  tail call void @objc_release(i8* %tmp8) nounwind
+  tail call void @llvm.objc.release(i8* %tmp8) nounwind
   ret void
 }
 
 ; This used to crash.
 ; CHECK-LABEL: define i8* @test13(
-; CHECK: tail call void @objc_storeStrong(i8** %{{.*}}, i8* %[[NEW:.*]])
+; CHECK: tail call void @llvm.objc.storeStrong(i8** %{{.*}}, i8* %[[NEW:.*]])
 ; CHECK-NEXT: ret i8* %[[NEW]]
 
 define i8* @test13(i8* %a0, i8* %a1, i8** %addr, i8* %new) {
   %old = load i8*, i8** %addr, align 8
-  call void @objc_release(i8* %old)
-  %retained = call i8* @objc_retain(i8* %new)
+  call void @llvm.objc.release(i8* %old)
+  %retained = call i8* @llvm.objc.retain(i8* %new)
   store i8* %retained, i8** %addr, align 8
   ret i8* %retained
 }
diff --git a/test/Transforms/ObjCARC/contract-testcases.ll b/test/Transforms/ObjCARC/contract-testcases.ll
index e6d34a9426f4f1c0a9113aed4046b6af4f704009..57157356d73851a0c2173646b7fc5445c1a0b535 100644
--- a/test/Transforms/ObjCARC/contract-testcases.ll
+++ b/test/Transforms/ObjCARC/contract-testcases.ll
@@ -7,13 +7,13 @@
 %4 = type opaque
 
 declare %0* @"\01-[NSAttributedString(Terminal) pathAtIndex:effectiveRange:]"(%1*, i8* nocapture, i64, %2*) optsize
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare i8* @objc_msgSend_fixup(i8*, i8*, ...)
-declare i8* @objc_msgSend(i8*, i8*, ...)
-declare void @objc_release(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.msgSend_fixup(i8*, i8*, ...)
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...)
+declare void @llvm.objc.release(i8*)
 declare %2 @NSUnionRange(i64, i64, i64, i64) optsize
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare i8* @objc_autorelease(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
 declare i32 @__gxx_personality_sj0(...)
 
 ; Don't get in trouble on bugpointed code.
@@ -22,7 +22,7 @@ declare i32 @__gxx_personality_sj0(...)
 define void @test0() {
 bb:
   %tmp = bitcast %4* undef to i8*
-  %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tmp) nounwind
+  %tmp1 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %tmp) nounwind
   br label %bb3
 
 bb3:                                              ; preds = %bb2
@@ -53,9 +53,9 @@ bb6:                                              ; preds = %bb5, %bb4, %bb4, %b
 ; CHECK: }
 define void @test1() {
 bb:
-  %tmp = tail call %0* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %0* ()*)()
+  %tmp = tail call %0* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %0* ()*)()
   %tmp2 = bitcast %0* %tmp to i8*
-  %tmp3 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tmp2) nounwind
+  %tmp3 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %tmp2) nounwind
   br i1 undef, label %bb7, label %bb7
 
 bb7:                                              ; preds = %bb6, %bb6, %bb5
@@ -70,15 +70,15 @@ bb7:                                              ; preds = %bb6, %bb6, %bb5
 ; CHECK: define void @_Z6doTestP8NSString() personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) {
 ; CHECK: invoke.cont:                                      ; preds = %entry
 ; CHECK-NEXT: call void asm sideeffect "mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue", ""()
-; CHECK-NEXT: %tmp = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) [[NUW:#[0-9]+]]
+; CHECK-NEXT: %tmp = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) [[NUW:#[0-9]+]]
 ; CHECK: }
 define void @_Z6doTestP8NSString() personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) {
 entry:
-  %call = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* ()*)()
+  %call = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* ()*)()
           to label %invoke.cont unwind label %lpad
 
 invoke.cont:                                      ; preds = %entry
-  %tmp = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %tmp = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   unreachable
 
 lpad:                                             ; preds = %entry
diff --git a/test/Transforms/ObjCARC/contract.ll b/test/Transforms/ObjCARC/contract.ll
index 6ad46f2c85c5a59cbd754c9be7ee2adb93442f06..7cf3f5ea886fbc100e995ac47087b6d4ef346438 100644
--- a/test/Transforms/ObjCARC/contract.ll
+++ b/test/Transforms/ObjCARC/contract.ll
@@ -2,11 +2,11 @@
 
 target datalayout = "e-p:64:64:64"
 
-declare i8* @objc_retain(i8*)
-declare void @objc_release(i8*)
-declare i8* @objc_autorelease(i8*)
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
 
 declare void @use_pointer(i8*)
 declare i8* @returner()
@@ -17,7 +17,7 @@ declare void @callee()
 ; CHECK: }
 define void @test0(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retain(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   ret void
 }
@@ -27,7 +27,7 @@ entry:
 ; CHECK: }
 define void @test1(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_autorelease(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.autorelease(i8* %x) nounwind
   call void @use_pointer(i8* %x)
   ret void
 }
@@ -35,12 +35,12 @@ entry:
 ; Merge objc_retain and objc_autorelease into objc_retainAutorelease.
 
 ; CHECK-LABEL: define void @test2(
-; CHECK: tail call i8* @objc_retainAutorelease(i8* %x) [[NUW:#[0-9]+]]
+; CHECK: tail call i8* @llvm.objc.retainAutorelease(i8* %x) [[NUW:#[0-9]+]]
 ; CHECK: }
 define void @test2(i8* %x) nounwind {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %x) nounwind
-  call i8* @objc_autorelease(i8* %0) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %0) nounwind
   call void @use_pointer(i8* %x)
   ret void
 }
@@ -48,26 +48,26 @@ entry:
 ; Same as test2 but the value is returned. Do an RV optimization.
 
 ; CHECK-LABEL: define i8* @test2b(
-; CHECK: tail call i8* @objc_retainAutoreleaseReturnValue(i8* %x) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retainAutoreleaseReturnValue(i8* %x) [[NUW]]
 ; CHECK: }
 define i8* @test2b(i8* %x) nounwind {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %x) nounwind
-  tail call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %x) nounwind
+  tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %0) nounwind
   ret i8* %x
 }
 
 ; Merge a retain,autorelease pair around a call.
 
 ; CHECK-LABEL: define void @test3(
-; CHECK: tail call i8* @objc_retainAutorelease(i8* %x) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retainAutorelease(i8* %x) [[NUW]]
 ; CHECK: @use_pointer(i8* %0)
 ; CHECK: }
 define void @test3(i8* %x, i64 %n) {
 entry:
-  tail call i8* @objc_retain(i8* %x) nounwind
+  tail call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
-  call i8* @objc_autorelease(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %x) nounwind
   ret void
 }
 
@@ -76,34 +76,34 @@ entry:
 
 ; CHECK-LABEL: define void @test4(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: @objc_retainAutorelease(i8* %x) [[NUW]]
+; CHECK-NEXT: @llvm.objc.retainAutorelease(i8* %x) [[NUW]]
 ; CHECK-NEXT: @use_pointer
-; CHECK-NEXT: @objc_release
+; CHECK-NEXT: @llvm.objc.release
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test4(i8* %x, i64 %n) {
 entry:
-  tail call i8* @objc_retain(i8* %x) nounwind
+  tail call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
-  call i8* @objc_autorelease(i8* %x) nounwind
-  tail call void @objc_release(i8* %x) nounwind
+  call i8* @llvm.objc.autorelease(i8* %x) nounwind
+  tail call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
 ; Don't merge retain and autorelease if they're not control-equivalent.
 
 ; CHECK-LABEL: define void @test5(
-; CHECK: tail call i8* @objc_retain(i8* %p) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retain(i8* %p) [[NUW]]
 ; CHECK: true:
-; CHECK: call i8* @objc_autorelease(i8* %0) [[NUW]]
+; CHECK: call i8* @llvm.objc.autorelease(i8* %0) [[NUW]]
 ; CHECK: }
 define void @test5(i8* %p, i1 %a) {
 entry:
-  tail call i8* @objc_retain(i8* %p) nounwind
+  tail call i8* @llvm.objc.retain(i8* %p) nounwind
   br i1 %a, label %true, label %false
 
 true:
-  call i8* @objc_autorelease(i8* %p) nounwind
+  call i8* @llvm.objc.autorelease(i8* %p) nounwind
   call void @use_pointer(i8* %p)
   ret void
 
@@ -120,13 +120,13 @@ false:
 ; Those entrypoints don't exist yet though.
 
 ; CHECK-LABEL: define i8* @test6(
-; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %p) [[NUW]]
-; CHECK: %t = tail call i8* @objc_autoreleaseReturnValue(i8* %1) [[NUW]]
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p) [[NUW]]
+; CHECK: %t = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %1) [[NUW]]
 ; CHECK: }
 define i8* @test6() {
   %p = call i8* @returner()
-  tail call i8* @objc_retainAutoreleasedReturnValue(i8* %p) nounwind
-  %t = tail call i8* @objc_autoreleaseReturnValue(i8* %p) nounwind
+  tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p) nounwind
+  %t = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %p) nounwind
   call void @use_pointer(i8* %t)
   ret i8* %t
 }
@@ -134,15 +134,15 @@ define i8* @test6() {
 ; Don't spoil the RV optimization.
 
 ; CHECK: define i8* @test7(i8* %p)
-; CHECK: tail call i8* @objc_retain(i8* %p)
+; CHECK: tail call i8* @llvm.objc.retain(i8* %p)
 ; CHECK: call void @use_pointer(i8* %1)
-; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %1)
+; CHECK: tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %1)
 ; CHECK: ret i8* %2
 ; CHECK-NEXT: }
 define i8* @test7(i8* %p) {
-  %1 = tail call i8* @objc_retain(i8* %p)
+  %1 = tail call i8* @llvm.objc.retain(i8* %p)
   call void @use_pointer(i8* %p)
-  %2 = tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+  %2 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
   ret i8* %p
 }
 
@@ -156,7 +156,7 @@ entry:
   br i1 %x, label %return, label %if.then
 
 if.then:                                          ; preds = %entry
-  %p = call i8* @objc_retain(i8* %c) nounwind
+  %p = call i8* @llvm.objc.retain(i8* %c) nounwind
   br label %return
 
 return:                                           ; preds = %if.then, %entry
@@ -164,12 +164,12 @@ return:                                           ; preds = %if.then, %entry
   ret i8* %retval
 }
 
-; Kill calls to @clang.arc.use(...)
+; Kill calls to @llvm.objc.clang.arc.use(...)
 ; CHECK-LABEL: define void @test9(
 ; CHECK-NOT: clang.arc.use
 ; CHECK: }
 define void @test9(i8* %a, i8* %b) {
-  call void (...) @clang.arc.use(i8* %a, i8* %b) nounwind
+  call void (...) @llvm.objc.clang.arc.use(i8* %a, i8* %b) nounwind
   ret void
 }
 
@@ -178,10 +178,10 @@ define void @test9(i8* %a, i8* %b) {
 ; is a return value.
 
 ; CHECK: define void @test10()
-; CHECK: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+; CHECK: tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
 define void @test10() {
   %p = call i8* @returner()
-  tail call i8* @objc_retain(i8* %p) nounwind
+  tail call i8* @llvm.objc.retain(i8* %p) nounwind
   ret void
 }
 
@@ -190,11 +190,11 @@ define void @test10() {
 
 ; CHECK-LABEL: define void @test11(
 ; CHECK-NEXT: %y = call i8* @returner()
-; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) [[NUW]]
+; CHECK-NEXT: tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %y) [[NUW]]
 ; CHECK-NEXT: ret void
 define void @test11() {
   %y = call i8* @returner()
-  tail call i8* @objc_retain(i8* %y) nounwind
+  tail call i8* @llvm.objc.retain(i8* %y) nounwind
   ret void
 }
 
@@ -202,11 +202,11 @@ define void @test11() {
 ; argument is not a return value.
 
 ; CHECK-LABEL: define void @test12(
-; CHECK-NEXT: tail call i8* @objc_retain(i8* %y) [[NUW]]
+; CHECK-NEXT: tail call i8* @llvm.objc.retain(i8* %y) [[NUW]]
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test12(i8* %y) {
-  tail call i8* @objc_retain(i8* %y) nounwind
+  tail call i8* @llvm.objc.retain(i8* %y) nounwind
   ret void
 }
 
@@ -216,17 +216,17 @@ define void @test12(i8* %y) {
 ; CHECK-LABEL: define void @test13(
 ; CHECK-NEXT: %y = call i8* @returner()
 ; CHECK-NEXT: call void @callee()
-; CHECK-NEXT: tail call i8* @objc_retain(i8* %y) [[NUW]]
+; CHECK-NEXT: tail call i8* @llvm.objc.retain(i8* %y) [[NUW]]
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test13() {
   %y = call i8* @returner()
   call void @callee()
-  tail call i8* @objc_retain(i8* %y) nounwind
+  tail call i8* @llvm.objc.retain(i8* %y) nounwind
   ret void
 }
 
 
-declare void @clang.arc.use(...) nounwind
+declare void @llvm.objc.clang.arc.use(...) nounwind
 
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/ObjCARC/empty-block.ll b/test/Transforms/ObjCARC/empty-block.ll
index cc82d1088110006215bf3dd0970bd7c9baef97d3..68372e7cd5654b7ea8cd5dce1c7e652c1c12698b 100644
--- a/test/Transforms/ObjCARC/empty-block.ll
+++ b/test/Transforms/ObjCARC/empty-block.ll
@@ -3,33 +3,33 @@
 
 %0 = type opaque
 
-declare i8* @objc_retain(i8*)
+declare i8* @llvm.objc.retain(i8*)
 
-declare void @objc_release(i8*)
+declare void @llvm.objc.release(i8*)
 
-declare i8* @objc_autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
 
 ; Don't delete the autorelease.
 
 ; CHECK-LABEL: define %0* @test0(
-; CHECK:   @objc_retain
+; CHECK:   @llvm.objc.retain
 ; CHECK: .lr.ph:
-; CHECK-NOT: @objc_r
-; CHECK: @objc_autoreleaseReturnValue
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.r
+; CHECK: @llvm.objc.autoreleaseReturnValue
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define %0* @test0(%0* %buffer) nounwind {
   %1 = bitcast %0* %buffer to i8*
-  %2 = tail call i8* @objc_retain(i8* %1) nounwind
+  %2 = tail call i8* @llvm.objc.retain(i8* %1) nounwind
   br i1 undef, label %.lr.ph, label %._crit_edge
 
 .lr.ph:                                           ; preds = %.lr.ph, %0
   br i1 false, label %.lr.ph, label %._crit_edge
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  %3 = tail call i8* @objc_retain(i8* %1) nounwind
-  tail call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
-  %4 = tail call i8* @objc_autoreleaseReturnValue(i8* %1) nounwind
+  %3 = tail call i8* @llvm.objc.retain(i8* %1) nounwind
+  tail call void @llvm.objc.release(i8* %1) nounwind, !clang.imprecise_release !0
+  %4 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %1) nounwind
   ret %0* %buffer
 }
 
@@ -41,16 +41,16 @@ define %0* @test0(%0* %buffer) nounwind {
 define %0* @test1() nounwind {
   %buffer = call %0* @foo()
   %1 = bitcast %0* %buffer to i8*
-  %2 = tail call i8* @objc_retain(i8* %1) nounwind
+  %2 = tail call i8* @llvm.objc.retain(i8* %1) nounwind
   br i1 undef, label %.lr.ph, label %._crit_edge
 
 .lr.ph:                                           ; preds = %.lr.ph, %0
   br i1 false, label %.lr.ph, label %._crit_edge
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  %3 = tail call i8* @objc_retain(i8* %1) nounwind
-  tail call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
-  %4 = tail call i8* @objc_autoreleaseReturnValue(i8* %1) nounwind
+  %3 = tail call i8* @llvm.objc.retain(i8* %1) nounwind
+  tail call void @llvm.objc.release(i8* %1) nounwind, !clang.imprecise_release !0
+  %4 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %1) nounwind
   ret %0* %buffer
 }
 
diff --git a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
index be351bee29c829c836aad45ec89d955967f90d28..589cb7b946a439711b3b4fe6f20542e37f8445fd 100644
--- a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
+++ b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
@@ -39,66 +39,66 @@ entry:
   %tmp = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 8, !dbg !37
   %tmp1 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8, !dbg !37, !invariant.load !38
   %tmp2 = bitcast %struct._class_t* %tmp to i8*, !dbg !37
-; CHECK: call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1)
-  %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1), !dbg !37, !clang.arc.no_objc_arc_exceptions !38
+; CHECK: call i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1)
+  %call = call i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1), !dbg !37, !clang.arc.no_objc_arc_exceptions !38
   call void @llvm.dbg.value(metadata i8* %call, metadata !25, metadata !DIExpression()), !dbg !37
-; CHECK: call i8* @objc_retain(i8* %call) [[NUW:#[0-9]+]]
-  %tmp3 = call i8* @objc_retain(i8* %call) nounwind, !dbg !39
+; CHECK: call i8* @llvm.objc.retain(i8* %call) [[NUW:#[0-9]+]]
+  %tmp3 = call i8* @llvm.objc.retain(i8* %call) nounwind, !dbg !39
   call void @llvm.dbg.value(metadata i8* %call, metadata !25, metadata !DIExpression()), !dbg !39
   invoke fastcc void @ThrowFunc(i8* %call)
           to label %eh.cont unwind label %lpad, !dbg !40, !clang.arc.no_objc_arc_exceptions !38
 
 eh.cont:                                          ; preds = %entry
-; CHECK: call void @objc_release(i8* %call)
-  call void @objc_release(i8* %call) nounwind, !dbg !42, !clang.imprecise_release !38
+; CHECK: call void @llvm.objc.release(i8* %call)
+  call void @llvm.objc.release(i8* %call) nounwind, !dbg !42, !clang.imprecise_release !38
   br label %if.end, !dbg !43
 
 lpad:                                             ; preds = %entry
   %tmp4 = landingpad { i8*, i32 }
           catch i8* null, !dbg !40
   %tmp5 = extractvalue { i8*, i32 } %tmp4, 0, !dbg !40
-  %exn.adjusted = call i8* @objc_begin_catch(i8* %tmp5) nounwind, !dbg !44
+  %exn.adjusted = call i8* @llvm.objc.begin_catch(i8* %tmp5) nounwind, !dbg !44
   call void @llvm.dbg.value(metadata i8 0, metadata !21, metadata !DIExpression()), !dbg !46
-  call void @objc_end_catch(), !dbg !49, !clang.arc.no_objc_arc_exceptions !38
-; CHECK: call void @objc_release(i8* %call)
-  call void @objc_release(i8* %call) nounwind, !dbg !42, !clang.imprecise_release !38
+  call void @llvm.objc.end_catch(), !dbg !49, !clang.arc.no_objc_arc_exceptions !38
+; CHECK: call void @llvm.objc.release(i8* %call)
+  call void @llvm.objc.release(i8* %call) nounwind, !dbg !42, !clang.imprecise_release !38
   call void (i8*, ...) @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to i8*), i8* %call), !dbg !50, !clang.arc.no_objc_arc_exceptions !38
   br label %if.end, !dbg !52
 
 if.end:                                           ; preds = %lpad, %eh.cont
   call void (i8*, ...) @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to i8*), i8* %call), !dbg !53, !clang.arc.no_objc_arc_exceptions !38
-; CHECK: call void @objc_release(i8* %call)
-  call void @objc_release(i8* %call) nounwind, !dbg !54, !clang.imprecise_release !38
+; CHECK: call void @llvm.objc.release(i8* %call)
+  call void @llvm.objc.release(i8* %call) nounwind, !dbg !54, !clang.imprecise_release !38
   ret i32 0, !dbg !54
 }
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
-declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...) nonlazybind
 
-declare i8* @objc_retain(i8*) nonlazybind
+declare i8* @llvm.objc.retain(i8*) nonlazybind
 
-declare i8* @objc_begin_catch(i8*)
+declare i8* @llvm.objc.begin_catch(i8*)
 
-declare void @objc_end_catch()
+declare void @llvm.objc.end_catch()
 
-declare void @objc_exception_rethrow()
+declare void @llvm.objc.exception_rethrow()
 
 define internal fastcc void @ThrowFunc(i8* %obj) uwtable noinline ssp !dbg !27 {
 entry:
-  %tmp = call i8* @objc_retain(i8* %obj) nounwind
+  %tmp = call i8* @llvm.objc.retain(i8* %obj) nounwind
   call void @llvm.dbg.value(metadata i8* %obj, metadata !32, metadata !DIExpression()), !dbg !55
   %tmp1 = load %struct._class_t*, %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_1", align 8, !dbg !56
   %tmp2 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_5", align 8, !dbg !56, !invariant.load !38
   %tmp3 = bitcast %struct._class_t* %tmp1 to i8*, !dbg !56
-  call void (i8*, i8*, %0*, %0*, ...) bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %0*, %0*, ...)*)(i8* %tmp3, i8* %tmp2, %0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_3 to %0*), %0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_3 to %0*)), !dbg !56, !clang.arc.no_objc_arc_exceptions !38
-  call void @objc_release(i8* %obj) nounwind, !dbg !58, !clang.imprecise_release !38
+  call void (i8*, i8*, %0*, %0*, ...) bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, %0*, %0*, ...)*)(i8* %tmp3, i8* %tmp2, %0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_3 to %0*), %0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_3 to %0*)), !dbg !56, !clang.arc.no_objc_arc_exceptions !38
+  call void @llvm.objc.release(i8* %obj) nounwind, !dbg !58, !clang.imprecise_release !38
   ret void, !dbg !58
 }
 
 declare i32 @__objc_personality_v0(...)
 
-declare void @objc_release(i8*) nonlazybind
+declare void @llvm.objc.release(i8*) nonlazybind
 
 declare void @NSLog(i8*, ...)
 
@@ -107,8 +107,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
 ; CHECK: attributes #0 = { ssp uwtable }
 ; CHECK: attributes #1 = { nounwind readnone speculatable }
 ; CHECK: attributes #2 = { nonlazybind }
-; CHECK: attributes #3 = { noinline ssp uwtable }
 ; CHECK: attributes [[NUW]] = { nounwind }
+; CHECK: attributes #4 = { noinline ssp uwtable }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33, !34, !35, !36, !61}
diff --git a/test/Transforms/ObjCARC/escape.ll b/test/Transforms/ObjCARC/escape.ll
index c7a1b03c16fcd80be21fef3653b3fa619eec5e23..f9eeca88163bf184a918eadb13aeaa45dd681586 100644
--- a/test/Transforms/ObjCARC/escape.ll
+++ b/test/Transforms/ObjCARC/escape.ll
@@ -10,8 +10,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; with the objc_storeWeak call.
 
 ; CHECK-LABEL: define void @test0(
-; CHECK: %tmp7 = call i8* @objc_retainBlock(i8* %tmp6) [[NUW:#[0-9]+]], !clang.arc.copy_on_escape !0
-; CHECK: call void @objc_release(i8* %tmp7) [[NUW]], !clang.imprecise_release !0
+; CHECK: %tmp7 = call i8* @llvm.objc.retainBlock(i8* %tmp6) [[NUW:#[0-9]+]], !clang.arc.copy_on_escape !0
+; CHECK: call void @llvm.objc.release(i8* %tmp7) [[NUW]], !clang.imprecise_release !0
 ; CHECK: }
 define void @test0() nounwind {
 entry:
@@ -31,7 +31,7 @@ entry:
   store i8* bitcast (void (i8*)* @__Block_byref_object_dispose_ to i8*), i8** %tmp2, align 8
   %weakLogNTimes1 = getelementptr inbounds %struct.__block_byref_weakLogNTimes, %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 6
   %tmp3 = bitcast void (...)** %weakLogNTimes1 to i8**
-  %tmp4 = call i8* @objc_initWeak(i8** %tmp3, i8* null) nounwind
+  %tmp4 = call i8* @llvm.objc.initWeak(i8** %tmp3, i8* null) nounwind
   %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 0
   store i8* null, i8** %block.isa, align 8
   %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 1
@@ -46,19 +46,19 @@ entry:
   %tmp5 = bitcast %struct.__block_byref_weakLogNTimes* %weakLogNTimes to i8*
   store i8* %tmp5, i8** %block.captured, align 8
   %tmp6 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block to i8*
-  %tmp7 = call i8* @objc_retainBlock(i8* %tmp6) nounwind, !clang.arc.copy_on_escape !0
+  %tmp7 = call i8* @llvm.objc.retainBlock(i8* %tmp6) nounwind, !clang.arc.copy_on_escape !0
   %tmp8 = load %struct.__block_byref_weakLogNTimes*, %struct.__block_byref_weakLogNTimes** %byref.forwarding, align 8
   %weakLogNTimes3 = getelementptr inbounds %struct.__block_byref_weakLogNTimes, %struct.__block_byref_weakLogNTimes* %tmp8, i64 0, i32 6
   %tmp9 = bitcast void (...)** %weakLogNTimes3 to i8**
-  %tmp10 = call i8* @objc_storeWeak(i8** %tmp9, i8* %tmp7) nounwind
+  %tmp10 = call i8* @llvm.objc.storeWeak(i8** %tmp9, i8* %tmp7) nounwind
   %tmp11 = getelementptr inbounds i8, i8* %tmp7, i64 16
   %tmp12 = bitcast i8* %tmp11 to i8**
   %tmp13 = load i8*, i8** %tmp12, align 8
   %tmp14 = bitcast i8* %tmp13 to void (i8*, i32)*
   call void %tmp14(i8* %tmp7, i32 10) nounwind, !clang.arc.no_objc_arc_exceptions !0
-  call void @objc_release(i8* %tmp7) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp7) nounwind, !clang.imprecise_release !0
   call void @_Block_object_dispose(i8* %tmp5, i32 8) nounwind
-  call void @objc_destroyWeak(i8** %tmp3) nounwind
+  call void @llvm.objc.destroyWeak(i8** %tmp3) nounwind
   ret void
 }
 
@@ -66,7 +66,7 @@ entry:
 ; so the optimization is valid.
 
 ; CHECK-LABEL: define void @test1(
-; CHECK-NOT: @objc_retainBlock
+; CHECK-NOT: @llvm.objc.retainBlock
 ; CHECK: }
 define void @test1() nounwind {
 entry:
@@ -86,7 +86,7 @@ entry:
   store i8* bitcast (void (i8*)* @__Block_byref_object_dispose_ to i8*), i8** %tmp2, align 8
   %weakLogNTimes1 = getelementptr inbounds %struct.__block_byref_weakLogNTimes, %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 6
   %tmp3 = bitcast void (...)** %weakLogNTimes1 to i8**
-  %tmp4 = call i8* @objc_initWeak(i8** %tmp3, i8* null) nounwind
+  %tmp4 = call i8* @llvm.objc.initWeak(i8** %tmp3, i8* null) nounwind
   %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 0
   store i8* null, i8** %block.isa, align 8
   %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 1
@@ -101,7 +101,7 @@ entry:
   %tmp5 = bitcast %struct.__block_byref_weakLogNTimes* %weakLogNTimes to i8*
   store i8* %tmp5, i8** %block.captured, align 8
   %tmp6 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block to i8*
-  %tmp7 = call i8* @objc_retainBlock(i8* %tmp6) nounwind, !clang.arc.copy_on_escape !0
+  %tmp7 = call i8* @llvm.objc.retainBlock(i8* %tmp6) nounwind, !clang.arc.copy_on_escape !0
   %tmp8 = load %struct.__block_byref_weakLogNTimes*, %struct.__block_byref_weakLogNTimes** %byref.forwarding, align 8
   %weakLogNTimes3 = getelementptr inbounds %struct.__block_byref_weakLogNTimes, %struct.__block_byref_weakLogNTimes* %tmp8, i64 0, i32 6
   %tmp9 = bitcast void (...)** %weakLogNTimes3 to i8**
@@ -111,22 +111,22 @@ entry:
   %tmp13 = load i8*, i8** %tmp12, align 8
   %tmp14 = bitcast i8* %tmp13 to void (i8*, i32)*
   call void %tmp14(i8* %tmp7, i32 10) nounwind, !clang.arc.no_objc_arc_exceptions !0
-  call void @objc_release(i8* %tmp7) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp7) nounwind, !clang.imprecise_release !0
   call void @_Block_object_dispose(i8* %tmp5, i32 8) nounwind
-  call void @objc_destroyWeak(i8** %tmp3) nounwind
+  call void @llvm.objc.destroyWeak(i8** %tmp3) nounwind
   ret void
 }
 
 declare void @__Block_byref_object_copy_(i8*, i8*) nounwind
 declare void @__Block_byref_object_dispose_(i8*) nounwind
-declare void @objc_destroyWeak(i8**)
-declare i8* @objc_initWeak(i8**, i8*)
+declare void @llvm.objc.destroyWeak(i8**)
+declare i8* @llvm.objc.initWeak(i8**, i8*)
 declare void @__main_block_invoke_0(i8* nocapture, i32) nounwind ssp
 declare void @_Block_object_dispose(i8*, i32)
-declare i8* @objc_retainBlock(i8*)
-declare i8* @objc_storeWeak(i8**, i8*)
+declare i8* @llvm.objc.retainBlock(i8*)
+declare i8* @llvm.objc.storeWeak(i8**, i8*)
 declare i8* @not_really_objc_storeWeak(i8**, i8*)
-declare void @objc_release(i8*)
+declare void @llvm.objc.release(i8*)
 
 !0 = !{}
 
diff --git a/test/Transforms/ObjCARC/expand.ll b/test/Transforms/ObjCARC/expand.ll
index fe47ee52e9079bf0f3097887e369d419880041ce..b89c5d524e1f6c692f665cd3f318e334791a60c3 100644
--- a/test/Transforms/ObjCARC/expand.ll
+++ b/test/Transforms/ObjCARC/expand.ll
@@ -2,78 +2,78 @@
 
 target datalayout = "e-p:64:64:64"
 
-declare i8* @objc_retain(i8*)
-declare i8* @objc_autorelease(i8*)
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare i8* @objc_retainAutorelease(i8*)
-declare i8* @objc_retainAutoreleaseReturnValue(i8*)
-declare i8* @objc_retainBlock(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.retainAutorelease(i8*)
+declare i8* @llvm.objc.retainAutoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.retainBlock(i8*)
 
 declare void @use_pointer(i8*)
 
 ; CHECK: define void @test_retain(i8* %x) [[NUW:#[0-9]+]] {
-; CHECK: call i8* @objc_retain(i8* %x)
+; CHECK: call i8* @llvm.objc.retain(i8* %x)
 ; CHECK: call void @use_pointer(i8* %x)
 ; CHECK: }
 define void @test_retain(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retain(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
   call void @use_pointer(i8* %0)
   ret void
 }
 
 ; CHECK: define void @test_retainAutoreleasedReturnValue(i8* %x) [[NUW]] {
-; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %x)
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %x)
 ; CHECK: call void @use_pointer(i8* %x)
 ; CHECK: }
 define void @test_retainAutoreleasedReturnValue(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %x) nounwind
   call void @use_pointer(i8* %0)
   ret void
 }
 
 ; CHECK: define void @test_retainAutorelease(i8* %x) [[NUW]] {
-; CHECK: call i8* @objc_retainAutorelease(i8* %x)
+; CHECK: call i8* @llvm.objc.retainAutorelease(i8* %x)
 ; CHECK: call void @use_pointer(i8* %x)
 ; CHECK: }
 define void @test_retainAutorelease(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retainAutorelease(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retainAutorelease(i8* %x) nounwind
   call void @use_pointer(i8* %0)
   ret void
 }
 
 ; CHECK: define void @test_retainAutoreleaseReturnValue(i8* %x) [[NUW]] {
-; CHECK: call i8* @objc_retainAutoreleaseReturnValue(i8* %x)
+; CHECK: call i8* @llvm.objc.retainAutoreleaseReturnValue(i8* %x)
 ; CHECK: call void @use_pointer(i8* %x)
 ; CHECK: }
 define void @test_retainAutoreleaseReturnValue(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retainAutoreleaseReturnValue(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleaseReturnValue(i8* %x) nounwind
   call void @use_pointer(i8* %0)
   ret void
 }
 
 ; CHECK: define void @test_autorelease(i8* %x) [[NUW]] {
-; CHECK: call i8* @objc_autorelease(i8* %x)
+; CHECK: call i8* @llvm.objc.autorelease(i8* %x)
 ; CHECK: call void @use_pointer(i8* %x)
 ; CHECK: }
 define void @test_autorelease(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_autorelease(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.autorelease(i8* %x) nounwind
   call void @use_pointer(i8* %0)
   ret void
 }
 
 ; CHECK: define void @test_autoreleaseReturnValue(i8* %x) [[NUW]] {
-; CHECK: call i8* @objc_autoreleaseReturnValue(i8* %x)
+; CHECK: call i8* @llvm.objc.autoreleaseReturnValue(i8* %x)
 ; CHECK: call void @use_pointer(i8* %x)
 ; CHECK: }
 define void @test_autoreleaseReturnValue(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_autoreleaseReturnValue(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.autoreleaseReturnValue(i8* %x) nounwind
   call void @use_pointer(i8* %0)
   ret void
 }
@@ -83,12 +83,12 @@ entry:
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ; CHECK: define void @test_retainBlock(i8* %x) [[NUW]] {
-; CHECK: call i8* @objc_retainBlock(i8* %x)
+; CHECK: call i8* @llvm.objc.retainBlock(i8* %x)
 ; CHECK: call void @use_pointer(i8* %0)
 ; CHECK: }
 define void @test_retainBlock(i8* %x) nounwind {
 entry:
-  %0 = call i8* @objc_retainBlock(i8* %x) nounwind
+  %0 = call i8* @llvm.objc.retainBlock(i8* %x) nounwind
   call void @use_pointer(i8* %0)
   ret void
 }
diff --git a/test/Transforms/ObjCARC/funclet.ll b/test/Transforms/ObjCARC/funclet.ll
index 57e6b49534424aeb0daec376c2be41c396eb0e1d..346a690bc046ae2cffdcb2f664e77ebe9296f21e 100644
--- a/test/Transforms/ObjCARC/funclet.ll
+++ b/test/Transforms/ObjCARC/funclet.ll
@@ -14,8 +14,8 @@
 declare zeroext i1 @"\01?g@@YA_NXZ"() local_unnamed_addr
 declare i8* @"\01?h@@YAPEAUobjc_object@@XZ"() local_unnamed_addr
 
-declare dllimport void @objc_release(i8*) local_unnamed_addr
-declare dllimport i8* @objc_retainAutoreleasedReturnValue(i8* returned) local_unnamed_addr
+declare dllimport void @llvm.objc.release(i8*) local_unnamed_addr
+declare dllimport i8* @llvm.objc.retainAutoreleasedReturnValue(i8* returned) local_unnamed_addr
 
 declare i32 @__CxxFrameHandler3(...)
 
@@ -32,8 +32,8 @@ if.then:                                          ; preds = %invoke.cont
           to label %invoke.cont1 unwind label %ehcleanup6
 
 invoke.cont1:                                     ; preds = %if.then
-  %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call2)
-  tail call void @objc_release(i8* null), !clang.imprecise_release !1
+  %0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call2)
+  tail call void @llvm.objc.release(i8* null), !clang.imprecise_release !1
   br label %if.end
 
 if.end:                                           ; preds = %invoke.cont1, %invoke.cont
@@ -42,25 +42,25 @@ if.end:                                           ; preds = %invoke.cont1, %invo
           to label %invoke.cont3 unwind label %ehcleanup
 
 invoke.cont3:                                     ; preds = %if.end
-  tail call void @objc_release(i8* null), !clang.imprecise_release !1
-  tail call void @objc_release(i8* %a.0), !clang.imprecise_release !1
+  tail call void @llvm.objc.release(i8* null), !clang.imprecise_release !1
+  tail call void @llvm.objc.release(i8* %a.0), !clang.imprecise_release !1
   ret void
 
 ehcleanup:                                        ; preds = %if.end
   %1 = cleanuppad within none []
-  call void @objc_release(i8* null) [ "funclet"(token %1) ], !clang.imprecise_release !1
+  call void @llvm.objc.release(i8* null) [ "funclet"(token %1) ], !clang.imprecise_release !1
   cleanupret from %1 unwind label %ehcleanup6
 
 ehcleanup6:                                       ; preds = %ehcleanup, %if.then, %entry
   %a.1 = phi i8* [ %a.0, %ehcleanup ], [ null, %if.then ], [ null, %entry ]
   %2 = cleanuppad within none []
-  call void @objc_release(i8* %a.1) [ "funclet"(token %2) ], !clang.imprecise_release !1
+  call void @llvm.objc.release(i8* %a.1) [ "funclet"(token %2) ], !clang.imprecise_release !1
   cleanupret from %2 unwind to caller
 }
 
 ; CHECK-LABEL: ?f@@YAXXZ
-; CHECK: call void @objc_release(i8* {{.*}}) {{.*}}[ "funclet"(token %1) ]
-; CHECK-NOT: call void @objc_release(i8* {{.*}}) {{.*}}[ "funclet"(token %2) ]
+; CHECK: call void @llvm.objc.release(i8* {{.*}}) {{.*}}[ "funclet"(token %1) ]
+; CHECK-NOT: call void @llvm.objc.release(i8* {{.*}}) {{.*}}[ "funclet"(token %2) ]
 
 define void @"\01?i@@YAXXZ"() local_unnamed_addr personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
 entry:
@@ -75,8 +75,8 @@ if.then:                                          ; preds = %invoke.cont
           to label %invoke.cont1 unwind label %ehcleanup6
 
 invoke.cont1:                                     ; preds = %if.then
-  %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call2)
-  tail call void @objc_release(i8* null), !clang.imprecise_release !1
+  %0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call2)
+  tail call void @llvm.objc.release(i8* null), !clang.imprecise_release !1
   br label %if.end
 
 if.end:                                           ; preds = %invoke.cont1, %invoke.cont
@@ -85,13 +85,13 @@ if.end:                                           ; preds = %invoke.cont1, %invo
           to label %invoke.cont3 unwind label %ehcleanup
 
 invoke.cont3:                                     ; preds = %if.end
-  tail call void @objc_release(i8* null), !clang.imprecise_release !1
-  tail call void @objc_release(i8* %a.0), !clang.imprecise_release !1
+  tail call void @llvm.objc.release(i8* null), !clang.imprecise_release !1
+  tail call void @llvm.objc.release(i8* %a.0), !clang.imprecise_release !1
   ret void
 
 ehcleanup:                                        ; preds = %if.end
   %1 = cleanuppad within none []
-  call void @objc_release(i8* null) [ "funclet"(token %1) ], !clang.imprecise_release !1
+  call void @llvm.objc.release(i8* null) [ "funclet"(token %1) ], !clang.imprecise_release !1
   br label %ehcleanup.1
 
 ehcleanup.1:
@@ -100,13 +100,13 @@ ehcleanup.1:
 ehcleanup6:                                       ; preds = %ehcleanup, %if.then, %entry
   %a.1 = phi i8* [ %a.0, %ehcleanup.1 ], [ null, %if.then ], [ null, %entry ]
   %2 = cleanuppad within none []
-  call void @objc_release(i8* %a.1) [ "funclet"(token %2) ], !clang.imprecise_release !1
+  call void @llvm.objc.release(i8* %a.1) [ "funclet"(token %2) ], !clang.imprecise_release !1
   cleanupret from %2 unwind to caller
 }
 
 ; CHECK-LABEL: ?i@@YAXXZ
-; CHECK: call void @objc_release(i8* {{.*}}) {{.*}}[ "funclet"(token %1) ]
-; CHECK-NOT: call void @objc_release(i8* {{.*}}) {{.*}}[ "funclet"(token %2) ]
+; CHECK: call void @llvm.objc.release(i8* {{.*}}) {{.*}}[ "funclet"(token %1) ]
+; CHECK-NOT: call void @llvm.objc.release(i8* {{.*}}) {{.*}}[ "funclet"(token %2) ]
 
 !1 = !{}
 
diff --git a/test/Transforms/ObjCARC/gvn.ll b/test/Transforms/ObjCARC/gvn.ll
index 6f828545bc6fa8e5394ff3236fcbb77982587b56..f2977d0c513e88138ac511173bae258bb8caee61 100644
--- a/test/Transforms/ObjCARC/gvn.ll
+++ b/test/Transforms/ObjCARC/gvn.ll
@@ -2,9 +2,9 @@
 
 @x = common global i8* null, align 8
 
-declare i8* @objc_retain(i8*)
-declare i32 @objc_sync_enter(i8*)
-declare i32 @objc_sync_exit(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i32 @llvm.objc.sync.enter(i8*)
+declare i32 @llvm.objc.sync.exit(i8*)
 
 ; GVN should be able to eliminate this redundant load, with ARC-specific
 ; alias analysis.
@@ -18,7 +18,7 @@ declare i32 @objc_sync_exit(i8*)
 define i8* @test0(i32 %n) nounwind {
 entry:
   %s = load i8*, i8** @x
-  %0 = tail call i8* @objc_retain(i8* %s) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %s) nounwind
   %t = load i8*, i8** @x
   ret i8* %t
 }
@@ -34,8 +34,8 @@ entry:
 define i8* @test1(i32 %n) nounwind {
 entry:
   %s = load i8*, i8** @x
-  %0 = call i32 @objc_sync_enter(i8* %s)
+  %0 = call i32 @llvm.objc.sync.enter(i8* %s)
   %t = load i8*, i8** @x
-  %1 = call i32 @objc_sync_exit(i8* %s)
+  %1 = call i32 @llvm.objc.sync.exit(i8* %s)
   ret i8* %t
 }
diff --git a/test/Transforms/ObjCARC/intrinsic-use-isolated.ll b/test/Transforms/ObjCARC/intrinsic-use-isolated.ll
index 03d7520dde917ab262453fb1a998abca37b4a359..4ccad033aeb8926a03fc43319df5262ef6c0b65b 100644
--- a/test/Transforms/ObjCARC/intrinsic-use-isolated.ll
+++ b/test/Transforms/ObjCARC/intrinsic-use-isolated.ll
@@ -3,14 +3,14 @@
 ; This file makes sure that clang.arc.used is removed even if no other ARC
 ; interesting calls are in the module.
 
-declare void @clang.arc.use(...) nounwind
+declare void @llvm.objc.clang.arc.use(...) nounwind
 
-; Kill calls to @clang.arc.use(...)
+; Kill calls to @llvm.objc.clang.arc.use(...)
 ; CHECK-LABEL: define void @test0(
 ; CHECK-NOT: clang.arc.use
 ; CHECK: }
 define void @test0(i8* %a, i8* %b) {
-  call void (...) @clang.arc.use(i8* %a, i8* %b) nounwind
+  call void (...) @llvm.objc.clang.arc.use(i8* %a, i8* %b) nounwind
   ret void
 }
 
diff --git a/test/Transforms/ObjCARC/intrinsic-use.ll b/test/Transforms/ObjCARC/intrinsic-use.ll
index f5956201454c3f2a5d7b95f470a70447471f9747..8a4ac52d2a6db82d964abe44f6aed056ecf2e133 100644
--- a/test/Transforms/ObjCARC/intrinsic-use.ll
+++ b/test/Transforms/ObjCARC/intrinsic-use.ll
@@ -2,12 +2,12 @@
 
 target datalayout = "e-p:64:64:64"
 
-declare i8* @objc_retain(i8*)
-declare i8* @objc_retainAutorelease(i8*)
-declare void @objc_release(i8*)
-declare i8* @objc_autorelease(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.retainAutorelease(i8*)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
 
-declare void @clang.arc.use(...)
+declare void @llvm.objc.clang.arc.use(...)
 
 declare void @test0_helper(i8*, i8**)
 
@@ -15,70 +15,70 @@ declare void @test0_helper(i8*, i8**)
 ; the reduced test case from <rdar://13195034>.
 ;
 ; CHECK-LABEL:      define void @test0(
-; CHECK:        @objc_retain(i8* %x)
+; CHECK:        @llvm.objc.retain(i8* %x)
 ; CHECK-NEXT:   store i8* %y, i8** %temp0
-; CHECK-NEXT:   @objc_retain(i8* %y)
+; CHECK-NEXT:   @llvm.objc.retain(i8* %y)
 ; CHECK-NEXT:   call void @test0_helper
 ; CHECK-NEXT:   [[VAL1:%.*]] = load i8*, i8** %temp0
-; CHECK-NEXT:   @objc_retain(i8* [[VAL1]])
-; CHECK-NEXT:   call void (...) @clang.arc.use(i8* %y)
-; CHECK-NEXT:   @objc_release(i8* %y)
+; CHECK-NEXT:   @llvm.objc.retain(i8* [[VAL1]])
+; CHECK-NEXT:   call void (...) @llvm.objc.clang.arc.use(i8* %y)
+; CHECK-NEXT:   @llvm.objc.release(i8* %y)
 ; CHECK-NEXT:   store i8* [[VAL1]], i8** %temp1
 ; CHECK-NEXT:   call void @test0_helper
 ; CHECK-NEXT:   [[VAL2:%.*]] = load i8*, i8** %temp1
-; CHECK-NEXT:   @objc_retain(i8* [[VAL2]])
-; CHECK-NEXT:   call void (...) @clang.arc.use(i8* [[VAL1]])
-; CHECK-NEXT:   @objc_release(i8* [[VAL1]])
-; CHECK-NEXT:   @objc_autorelease(i8* %x)
+; CHECK-NEXT:   @llvm.objc.retain(i8* [[VAL2]])
+; CHECK-NEXT:   call void (...) @llvm.objc.clang.arc.use(i8* [[VAL1]])
+; CHECK-NEXT:   @llvm.objc.release(i8* [[VAL1]])
+; CHECK-NEXT:   @llvm.objc.autorelease(i8* %x)
 ; CHECK-NEXT:   store i8* %x, i8** %out
-; CHECK-NEXT:   @objc_retain(i8* %x)
-; CHECK-NEXT:   @objc_release(i8* [[VAL2]])
-; CHECK-NEXT:   @objc_release(i8* %x)
+; CHECK-NEXT:   @llvm.objc.retain(i8* %x)
+; CHECK-NEXT:   @llvm.objc.release(i8* [[VAL2]])
+; CHECK-NEXT:   @llvm.objc.release(i8* %x)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test0(i8** %out, i8* %x, i8* %y) {
 entry:
   %temp0 = alloca i8*, align 8
   %temp1 = alloca i8*, align 8
-  %0 = call i8* @objc_retain(i8* %x) nounwind
-  %1 = call i8* @objc_retain(i8* %y) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %y) nounwind
   store i8* %y, i8** %temp0
   call void @test0_helper(i8* %x, i8** %temp0)
   %val1 = load i8*, i8** %temp0
-  %2 = call i8* @objc_retain(i8* %val1) nounwind
-  call void (...) @clang.arc.use(i8* %y) nounwind
-  call void @objc_release(i8* %y) nounwind
+  %2 = call i8* @llvm.objc.retain(i8* %val1) nounwind
+  call void (...) @llvm.objc.clang.arc.use(i8* %y) nounwind
+  call void @llvm.objc.release(i8* %y) nounwind
   store i8* %val1, i8** %temp1
   call void @test0_helper(i8* %x, i8** %temp1)
   %val2 = load i8*, i8** %temp1
-  %3 = call i8* @objc_retain(i8* %val2) nounwind
-  call void (...) @clang.arc.use(i8* %val1) nounwind
-  call void @objc_release(i8* %val1) nounwind
-  %4 = call i8* @objc_retain(i8* %x) nounwind
-  %5 = call i8* @objc_autorelease(i8* %x) nounwind
+  %3 = call i8* @llvm.objc.retain(i8* %val2) nounwind
+  call void (...) @llvm.objc.clang.arc.use(i8* %val1) nounwind
+  call void @llvm.objc.release(i8* %val1) nounwind
+  %4 = call i8* @llvm.objc.retain(i8* %x) nounwind
+  %5 = call i8* @llvm.objc.autorelease(i8* %x) nounwind
   store i8* %x, i8** %out
-  call void @objc_release(i8* %val2) nounwind
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %val2) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
 ; CHECK-LABEL:      define void @test0a(
-; CHECK:        @objc_retain(i8* %x)
+; CHECK:        @llvm.objc.retain(i8* %x)
 ; CHECK-NEXT:   store i8* %y, i8** %temp0
-; CHECK-NEXT:   @objc_retain(i8* %y)
+; CHECK-NEXT:   @llvm.objc.retain(i8* %y)
 ; CHECK-NEXT:   call void @test0_helper
 ; CHECK-NEXT:   [[VAL1:%.*]] = load i8*, i8** %temp0
-; CHECK-NEXT:   @objc_retain(i8* [[VAL1]])
-; CHECK-NEXT:   call void (...) @clang.arc.use(i8* %y)
-; CHECK-NEXT:   @objc_release(i8* %y)
+; CHECK-NEXT:   @llvm.objc.retain(i8* [[VAL1]])
+; CHECK-NEXT:   call void (...) @llvm.objc.clang.arc.use(i8* %y)
+; CHECK-NEXT:   @llvm.objc.release(i8* %y)
 ; CHECK-NEXT:   store i8* [[VAL1]], i8** %temp1
 ; CHECK-NEXT:   call void @test0_helper
 ; CHECK-NEXT:   [[VAL2:%.*]] = load i8*, i8** %temp1
-; CHECK-NEXT:   @objc_retain(i8* [[VAL2]])
-; CHECK-NEXT:   call void (...) @clang.arc.use(i8* [[VAL1]])
-; CHECK-NEXT:   @objc_release(i8* [[VAL1]])
-; CHECK-NEXT:   @objc_autorelease(i8* %x)
-; CHECK-NEXT:   @objc_release(i8* [[VAL2]])
+; CHECK-NEXT:   @llvm.objc.retain(i8* [[VAL2]])
+; CHECK-NEXT:   call void (...) @llvm.objc.clang.arc.use(i8* [[VAL1]])
+; CHECK-NEXT:   @llvm.objc.release(i8* [[VAL1]])
+; CHECK-NEXT:   @llvm.objc.autorelease(i8* %x)
+; CHECK-NEXT:   @llvm.objc.release(i8* [[VAL2]])
 ; CHECK-NEXT:   store i8* %x, i8** %out
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
@@ -86,25 +86,25 @@ define void @test0a(i8** %out, i8* %x, i8* %y) {
 entry:
   %temp0 = alloca i8*, align 8
   %temp1 = alloca i8*, align 8
-  %0 = call i8* @objc_retain(i8* %x) nounwind
-  %1 = call i8* @objc_retain(i8* %y) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %x) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %y) nounwind
   store i8* %y, i8** %temp0
   call void @test0_helper(i8* %x, i8** %temp0)
   %val1 = load i8*, i8** %temp0
-  %2 = call i8* @objc_retain(i8* %val1) nounwind
-  call void (...) @clang.arc.use(i8* %y) nounwind
-  call void @objc_release(i8* %y) nounwind, !clang.imprecise_release !0
+  %2 = call i8* @llvm.objc.retain(i8* %val1) nounwind
+  call void (...) @llvm.objc.clang.arc.use(i8* %y) nounwind
+  call void @llvm.objc.release(i8* %y) nounwind, !clang.imprecise_release !0
   store i8* %val1, i8** %temp1
   call void @test0_helper(i8* %x, i8** %temp1)
   %val2 = load i8*, i8** %temp1
-  %3 = call i8* @objc_retain(i8* %val2) nounwind
-  call void (...) @clang.arc.use(i8* %val1) nounwind
-  call void @objc_release(i8* %val1) nounwind, !clang.imprecise_release !0
-  %4 = call i8* @objc_retain(i8* %x) nounwind
-  %5 = call i8* @objc_autorelease(i8* %x) nounwind
+  %3 = call i8* @llvm.objc.retain(i8* %val2) nounwind
+  call void (...) @llvm.objc.clang.arc.use(i8* %val1) nounwind
+  call void @llvm.objc.release(i8* %val1) nounwind, !clang.imprecise_release !0
+  %4 = call i8* @llvm.objc.retain(i8* %x) nounwind
+  %5 = call i8* @llvm.objc.autorelease(i8* %x) nounwind
   store i8* %x, i8** %out
-  call void @objc_release(i8* %val2) nounwind, !clang.imprecise_release !0
-  call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %val2) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %x) nounwind, !clang.imprecise_release !0
   ret void
 }
 
diff --git a/test/Transforms/ObjCARC/invoke-2.ll b/test/Transforms/ObjCARC/invoke-2.ll
index ef5c675a5939b2f08c77cae4ec20312020f327fb..b34de1a1e7ebed9bb7d4de202211e5c7ea76f86c 100644
--- a/test/Transforms/ObjCARC/invoke-2.ll
+++ b/test/Transforms/ObjCARC/invoke-2.ll
@@ -4,51 +4,51 @@ target triple = "x86_64-unknown-windows-msvc"
 
 declare i32 @__CxxFrameHandler3(...)
 
-declare dllimport i8* @objc_msgSend(i8*, i8*, ...) local_unnamed_addr
+declare dllimport i8* @llvm.objc.msgSend(i8*, i8*, ...) local_unnamed_addr
 
-declare dllimport i8* @objc_retain(i8* returned) local_unnamed_addr
-declare dllimport void @objc_release(i8*) local_unnamed_addr
-declare dllimport i8* @objc_retainAutoreleasedReturnValue(i8* returned) local_unnamed_addr
+declare dllimport i8* @llvm.objc.retain(i8* returned) local_unnamed_addr
+declare dllimport void @llvm.objc.release(i8*) local_unnamed_addr
+declare dllimport i8* @llvm.objc.retainAutoreleasedReturnValue(i8* returned) local_unnamed_addr
 
-declare dllimport i8* @objc_begin_catch(i8*) local_unnamed_addr
-declare dllimport void @objc_end_catch() local_unnamed_addr
+declare dllimport i8* @llvm.objc.begin_catch(i8*) local_unnamed_addr
+declare dllimport void @llvm.objc.end_catch() local_unnamed_addr
 
-@OBJC_METH_VAR_NAME_ = private unnamed_addr constant [2 x i8] c"m\00", align 1
-@OBJC_SELECTOR_REFERENCES_ = private externally_initialized global i8* getelementptr inbounds ([2 x i8], [2 x i8]* @OBJC_METH_VAR_NAME_, i64 0, i64 0), section ".objc_selrefs$B", align 8
+@llvm.objc.METH_VAR_NAME_ = private unnamed_addr constant [2 x i8] c"m\00", align 1
+@llvm.objc.SELECTOR_REFERENCES_ = private externally_initialized global i8* getelementptr inbounds ([2 x i8], [2 x i8]* @llvm.objc.METH_VAR_NAME_, i64 0, i64 0), section ".objc_selrefs$B", align 8
 
 define void @f(i8* %i) local_unnamed_addr personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %i)
-  %1 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !0
-  %call = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %0, i8* %1)
+  %0 = tail call i8* @llvm.objc.retain(i8* %i)
+  %1 = load i8*, i8** @llvm.objc.SELECTOR_REFERENCES_, align 8, !invariant.load !0
+  %call = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* %0, i8* %1)
           to label %invoke.cont unwind label %catch.dispatch, !clang.arc.no_objc_arc_exceptions !0
 
 catch.dispatch:                                   ; preds = %entry
   %2 = catchswitch within none [label %catch] unwind to caller
 
 invoke.cont:                                      ; preds = %entry
-  %3 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call)
-  tail call void @objc_release(i8* %3) #0, !clang.imprecise_release !0
+  %3 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call)
+  tail call void @llvm.objc.release(i8* %3) #0, !clang.imprecise_release !0
   br label %eh.cont
 
 eh.cont:                                          ; preds = %invoke.cont, %catch
-  tail call void @objc_release(i8* %0) #0, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %0) #0, !clang.imprecise_release !0
   ret void
 
 catch:                                            ; preds = %catch.dispatch
   %4 = catchpad within %2 [i8* null, i32 0, i8* null]
-  %exn.adjusted = tail call i8* @objc_begin_catch(i8* undef)
-  tail call void @objc_end_catch(), !clang.arc.no_objc_arc_exceptions !0
+  %exn.adjusted = tail call i8* @llvm.objc.begin_catch(i8* undef)
+  tail call void @llvm.objc.end_catch(), !clang.arc.no_objc_arc_exceptions !0
   br label %eh.cont
 }
 
 ; CHECK-LABEL: @f
 
-; CHECK-NOT: tail call i8* @objc_retain(i8* %i)
-; CHECK: load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8
+; CHECK-NOT: tail call i8* @llvm.objc.retain(i8* %i)
+; CHECK: load i8*, i8** @llvm.objc.SELECTOR_REFERENCES_, align 8
 
 ; CHECK: eh.cont:
-; CHECK-NOT: call void @objc_release(i8*
+; CHECK-NOT: call void @llvm.objc.release(i8*
 ; CHECK: ret void
 
 attributes #0 = { nounwind }
diff --git a/test/Transforms/ObjCARC/invoke.ll b/test/Transforms/ObjCARC/invoke.ll
index 06105c17397f69e938e7fbec7557a97f4ecb9f9b..3dc95cd2eb0b60314b3e5c0d0e86aa7f0a4a360b 100644
--- a/test/Transforms/ObjCARC/invoke.ll
+++ b/test/Transforms/ObjCARC/invoke.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -S -objc-arc < %s | FileCheck %s
 
-declare i8* @objc_retain(i8*)
-declare void @objc_release(i8*)
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i8* @llvm.objc.retain(i8*)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...)
 declare void @use_pointer(i8*)
 declare void @callee()
 declare i8* @returner()
@@ -12,27 +12,27 @@ declare i8* @returner()
 
 ; CHECK-LABEL: define void @test0(
 ; CHECK: invoke.cont:
-; CHECK:   call void @objc_release(i8* %zipFile) [[NUW:#[0-9]+]], !clang.imprecise_release !0
+; CHECK:   call void @llvm.objc.release(i8* %zipFile) [[NUW:#[0-9]+]], !clang.imprecise_release !0
 ; CHECK:   ret void
 ; CHECK: lpad:
-; CHECK:   call void @objc_release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0
+; CHECK:   call void @llvm.objc.release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0
 ; CHECK:   ret void
 ; CHECK-NEXT: }
 define void @test0(i8* %zipFile) personality i32 (...)* @__gxx_personality_v0 {
 entry:
-  call i8* @objc_retain(i8* %zipFile) nounwind
+  call i8* @llvm.objc.retain(i8* %zipFile) nounwind
   call void @use_pointer(i8* %zipFile)
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*)*)(i8* %zipFile) 
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*)*)(i8* %zipFile) 
           to label %invoke.cont unwind label %lpad
 
 invoke.cont:                                      ; preds = %entry
-  call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %zipFile) nounwind, !clang.imprecise_release !0
   ret void
 
 lpad:                                             ; preds = %entry
   %exn = landingpad {i8*, i32}
            cleanup
-  call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %zipFile) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -40,11 +40,11 @@ lpad:                                             ; preds = %entry
 
 ; CHECK-LABEL: define void @test1(
 ; CHECK: invoke.cont:
-; CHECK:   call void @objc_release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0
+; CHECK:   call void @llvm.objc.release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0
 ; CHECK:   call void @callee()
 ; CHECK:   br label %done
 ; CHECK: lpad:
-; CHECK:   call void @objc_release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0
+; CHECK:   call void @llvm.objc.release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0
 ; CHECK:   call void @callee()
 ; CHECK:   br label %done
 ; CHECK: done:
@@ -52,9 +52,9 @@ lpad:                                             ; preds = %entry
 ; CHECK-NEXT: }
 define void @test1(i8* %zipFile) personality i32 (...)* @__gxx_personality_v0 {
 entry:
-  call i8* @objc_retain(i8* %zipFile) nounwind
+  call i8* @llvm.objc.retain(i8* %zipFile) nounwind
   call void @use_pointer(i8* %zipFile)
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*)*)(i8* %zipFile)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*)*)(i8* %zipFile)
           to label %invoke.cont unwind label %lpad
 
 invoke.cont:                                      ; preds = %entry
@@ -68,7 +68,7 @@ lpad:                                             ; preds = %entry
   br label %done
 
 done:
-  call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %zipFile) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -77,27 +77,27 @@ done:
 
 ; CHECK: define void @test2() personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) {
 ; CHECK: invoke.cont:
-; CHECK-NEXT: call i8* @objc_retain
-; CHECK-NOT: @objc_r
+; CHECK-NEXT: call i8* @llvm.objc.retain
+; CHECK-NOT: @llvm.objc.r
 ; CHECK: finally.cont:
-; CHECK-NEXT: call void @objc_release
+; CHECK-NEXT: call void @llvm.objc.release
 ; CHECK-NOT: @objc
 ; CHECK: finally.rethrow:
 ; CHECK-NOT: @objc
 ; CHECK: }
 define void @test2() personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) {
 entry:
-  %call = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* ()*)()
+  %call = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* ()*)()
           to label %invoke.cont unwind label %finally.rethrow, !clang.arc.no_objc_arc_exceptions !0
 
 invoke.cont:                                      ; preds = %entry
-  %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
-  call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void ()*)(), !clang.arc.no_objc_arc_exceptions !0
+  %tmp1 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
+  call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void ()*)(), !clang.arc.no_objc_arc_exceptions !0
   invoke void @use_pointer(i8* %call)
           to label %finally.cont unwind label %finally.rethrow, !clang.arc.no_objc_arc_exceptions !0
 
 finally.cont:                                     ; preds = %invoke.cont
-  tail call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %call) nounwind, !clang.imprecise_release !0
   ret void
 
 finally.rethrow:                                  ; preds = %invoke.cont, %entry
@@ -110,12 +110,12 @@ finally.rethrow:                                  ; preds = %invoke.cont, %entry
 
 ; CHECK-LABEL: define void @test3(
 ; CHECK: if.end:
-; CHECK-NEXT: call void @objc_release(i8* %p) [[NUW]]
+; CHECK-NEXT: call void @llvm.objc.release(i8* %p) [[NUW]]
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test3(i8* %p, i1 %b) personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) {
 entry:
-  %0 = call i8* @objc_retain(i8* %p)
+  %0 = call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   br i1 %b, label %if.else, label %if.then
 
@@ -133,7 +133,7 @@ lpad:
   ret void
 
 if.end:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -143,15 +143,15 @@ if.end:
 ; CHECK: lpad:
 ; CHECK-NEXT: %r = landingpad { i8*, i32 }
 ; CHECK-NEXT: cleanup
-; CHECK-NEXT: call void @objc_release(i8* %p) [[NUW]]
+; CHECK-NEXT: call void @llvm.objc.release(i8* %p) [[NUW]]
 ; CHECK-NEXT: ret void
 ; CHECK: if.end:
-; CHECK-NEXT: call void @objc_release(i8* %p) [[NUW]]
+; CHECK-NEXT: call void @llvm.objc.release(i8* %p) [[NUW]]
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test4(i8* %p, i1 %b) personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) {
 entry:
-  %0 = call i8* @objc_retain(i8* %p)
+  %0 = call i8* @llvm.objc.retain(i8* %p)
   call void @callee()
   br i1 %b, label %if.else, label %if.then
 
@@ -166,11 +166,11 @@ if.else:
 lpad:
   %r = landingpad { i8*, i32 }
        cleanup
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 
 if.end:
-  call void @objc_release(i8* %p)
+  call void @llvm.objc.release(i8* %p)
   ret void
 }
 
@@ -178,7 +178,7 @@ if.end:
 ; for an invoke which we can assume codegen will put immediately prior.
 
 ; CHECK-LABEL: define void @test5(
-; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %z)
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %z)
 ; CHECK: }
 define void @test5() personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) {
 entry:
@@ -191,14 +191,14 @@ lpad:
   ret void
 
 if.end:
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %z)
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %z)
   ret void
 }
 
 ; Like test5, but there's intervening code.
 
 ; CHECK-LABEL: define void @test6(
-; CHECK: call i8* @objc_retain(i8* %z)
+; CHECK: call i8* @llvm.objc.retain(i8* %z)
 ; CHECK: }
 define void @test6() personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) {
 entry:
@@ -212,7 +212,7 @@ lpad:
 
 if.end:
   call void @callee()
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %z)
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %z)
   ret void
 }
 
diff --git a/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll b/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll
index 9894eb4f534558a03244467dced096078c87c65a..91b865e52688ea4f4974db14b14dea3a14efa66b 100644
--- a/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll
+++ b/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll
@@ -4,7 +4,7 @@
 ; and various scary looking things and fold it into an objc_retainAutorelease.
 
 ; CHECK: bb57:
-; CHECK: tail call i8* @objc_retainAutorelease(i8* %tmp71x) [[NUW:#[0-9]+]]
+; CHECK: tail call i8* @llvm.objc.retainAutorelease(i8* %tmp71x) [[NUW:#[0-9]+]]
 ; CHECK: bb99:
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -68,30 +68,30 @@ target triple = "x86_64-apple-darwin11.0.0"
 @"\01L_OBJC_SELECTOR_REFERENCES_413" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
 @"\01L_OBJC_SELECTOR_REFERENCES_415" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
 
-declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...)
 
-declare i8* @objc_retain(i8*)
+declare i8* @llvm.objc.retain(i8*)
 
-declare void @objc_release(i8*)
+declare void @llvm.objc.release(i8*)
 
-declare i8* @objc_autorelease(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
 
-declare i8* @objc_explicit_autorelease(i8*)
+declare i8* @llvm.objc.explicit_autorelease(i8*)
 
 define hidden %14* @foo(%15* %arg, %16* %arg2) {
 bb:
   %tmp = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_3725", align 8
   %tmp4 = bitcast %15* %arg to i8*
-  %tmp5 = tail call %18* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %18* (i8*, i8*)*)(i8* %tmp4, i8* %tmp)
+  %tmp5 = tail call %18* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %18* (i8*, i8*)*)(i8* %tmp4, i8* %tmp)
   %tmp6 = bitcast %18* %tmp5 to i8*
-  %tmp7 = tail call i8* @objc_retain(i8* %tmp6) nounwind
+  %tmp7 = tail call i8* @llvm.objc.retain(i8* %tmp6) nounwind
   %tmp8 = load %2*, %2** @"\01L_OBJC_CLASSLIST_REFERENCES_$_40", align 8
   %tmp9 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_4227", align 8
   %tmp10 = bitcast %2* %tmp8 to i8*
-  %tmp11 = tail call %19* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %19* (i8*, i8*)*)(i8* %tmp10, i8* %tmp9)
+  %tmp11 = tail call %19* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %19* (i8*, i8*)*)(i8* %tmp10, i8* %tmp9)
   %tmp12 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_4631", align 8
   %tmp13 = bitcast %19* %tmp11 to i8*
-  %tmp14 = tail call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, %13*)*)(i8* %tmp13, i8* %tmp12, %13* bitcast (%12* @_unnamed_cfstring_386 to %13*))
+  %tmp14 = tail call signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, %13*)*)(i8* %tmp13, i8* %tmp12, %13* bitcast (%12* @_unnamed_cfstring_386 to %13*))
   %tmp15 = bitcast %16* %arg2 to i8*
   %tmp16 = load i8*, i8** bitcast (%0* @"\01l_objc_msgSend_fixup_count" to i8**), align 16
   %tmp17 = bitcast i8* %tmp16 to i64 (i8*, %1*)*
@@ -111,35 +111,35 @@ bb22:                                             ; preds = %bb
 bb25:                                             ; preds = %bb22, %bb20
   %tmp26 = phi i1 [ %tmp21, %bb20 ], [ false, %bb22 ]
   %tmp27 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_188", align 8
-  %tmp28 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp7, i8* %tmp27)
-  %tmp29 = tail call i8* @objc_explicit_autorelease(i8* %tmp28) nounwind
+  %tmp28 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* %tmp7, i8* %tmp27)
+  %tmp29 = tail call i8* @llvm.objc.explicit_autorelease(i8* %tmp28) nounwind
   %tmp30 = bitcast i8* %tmp29 to %18*
-  tail call void @objc_release(i8* %tmp7) nounwind
+  tail call void @llvm.objc.release(i8* %tmp7) nounwind
   %tmp31 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_389", align 8
-  %tmp32 = tail call %20* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %20* (i8*, i8*)*)(i8* %tmp29, i8* %tmp31)
+  %tmp32 = tail call %20* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %20* (i8*, i8*)*)(i8* %tmp29, i8* %tmp31)
   %tmp33 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_391", align 8
   %tmp34 = bitcast %20* %tmp32 to i8*
-  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %16*)*)(i8* %tmp34, i8* %tmp33, %16* %arg2)
+  tail call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, %16*)*)(i8* %tmp34, i8* %tmp33, %16* %arg2)
   br i1 %tmp26, label %bb46, label %bb35
 
 bb35:                                             ; preds = %bb25
   %tmp36 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_389", align 8
-  %tmp37 = tail call %20* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %20* (i8*, i8*)*)(i8* %tmp29, i8* %tmp36)
+  %tmp37 = tail call %20* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %20* (i8*, i8*)*)(i8* %tmp29, i8* %tmp36)
   %tmp38 = load %2*, %2** @"\01L_OBJC_CLASSLIST_REFERENCES_$_70", align 8
   %tmp39 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_393", align 8
   %tmp40 = bitcast %2* %tmp38 to i8*
-  %tmp41 = tail call %21* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %21* (i8*, i8*, i8)*)(i8* %tmp40, i8* %tmp39, i8 signext 1)
+  %tmp41 = tail call %21* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %21* (i8*, i8*, i8)*)(i8* %tmp40, i8* %tmp39, i8 signext 1)
   %tmp42 = bitcast %21* %tmp41 to i8*
   %tmp43 = load %13*, %13** @NSPrintHeaderAndFooter, align 8
   %tmp44 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_159", align 8
   %tmp45 = bitcast %20* %tmp37 to i8*
-  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, %13*)*)(i8* %tmp45, i8* %tmp44, i8* %tmp42, %13* %tmp43)
+  tail call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, %13*)*)(i8* %tmp45, i8* %tmp44, i8* %tmp42, %13* %tmp43)
   br label %bb46
 
 bb46:                                             ; preds = %bb35, %bb25, %bb22
   %tmp47 = phi %18* [ %tmp30, %bb35 ], [ %tmp30, %bb25 ], [ %tmp23, %bb22 ]
   %tmp48 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_328", align 8
-  %tmp49 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp48)
+  %tmp49 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp48)
   %tmp50 = bitcast %22* %tmp49 to i8*
   %tmp51 = load i8*, i8** bitcast (%0* @"\01l_objc_msgSend_fixup_count" to i8**), align 16
   %tmp52 = bitcast i8* %tmp51 to i64 (i8*, %1*)*
@@ -149,74 +149,74 @@ bb46:                                             ; preds = %bb35, %bb25, %bb22
 
 bb55:                                             ; preds = %bb46
   %tmp56 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_395", align 8
-  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*)*)(i8* %tmp4, i8* %tmp56)
+  tail call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*)*)(i8* %tmp4, i8* %tmp56)
   br label %bb57
 
 bb57:                                             ; preds = %bb55, %bb46
   %tmp58 = load %2*, %2** @"\01L_OBJC_CLASSLIST_REFERENCES_$_396", align 8
   %tmp59 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_328", align 8
-  %tmp60 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp59)
+  %tmp60 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp59)
   %tmp61 = bitcast %22* %tmp60 to i8*
   %tmp62 = load i8*, i8** bitcast (%0* @"\01l_objc_msgSend_fixup_objectAtIndex_" to i8**), align 16
   %tmp63 = bitcast i8* %tmp62 to i8* (i8*, %1*, i64)*
   %tmp64 = tail call i8* %tmp63(i8* %tmp61, %1* bitcast (%0* @"\01l_objc_msgSend_fixup_objectAtIndex_" to %1*), i64 0)
   %tmp65 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_398", align 8
-  %tmp66 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp64, i8* %tmp65)
+  %tmp66 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* %tmp64, i8* %tmp65)
   %tmp67 = bitcast i8* %tmp66 to %23*
   %tmp68 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_400", align 8
   %tmp69 = bitcast %2* %tmp58 to i8*
-  %tmp70 = tail call %14* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %14* (i8*, i8*, %23*, %18*)*)(i8* %tmp69, i8* %tmp68, %23* %tmp67, %18* %tmp47)
+  %tmp70 = tail call %14* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %14* (i8*, i8*, %23*, %18*)*)(i8* %tmp69, i8* %tmp68, %23* %tmp67, %18* %tmp47)
   %tmp71 = bitcast %14* %tmp70 to i8*
   ; hack to prevent the optimize from using objc_retainAutoreleasedReturnValue.
   %tmp71x = getelementptr i8, i8* %tmp71, i64 1
-  %tmp72 = tail call i8* @objc_retain(i8* %tmp71x) nounwind
+  %tmp72 = tail call i8* @llvm.objc.retain(i8* %tmp71x) nounwind
   %tmp73 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_402", align 8
-  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8)*)(i8* %tmp72, i8* %tmp73, i8 signext 1)
+  tail call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8)*)(i8* %tmp72, i8* %tmp73, i8 signext 1)
   %tmp74 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_404", align 8
-  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8)*)(i8* %tmp72, i8* %tmp74, i8 signext 1)
+  tail call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8)*)(i8* %tmp72, i8* %tmp74, i8 signext 1)
   %tmp75 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_328", align 8
-  %tmp76 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp75)
+  %tmp76 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp75)
   %tmp77 = bitcast %22* %tmp76 to i8*
   %tmp78 = load i8*, i8** bitcast (%0* @"\01l_objc_msgSend_fixup_objectAtIndex_" to i8**), align 16
   %tmp79 = bitcast i8* %tmp78 to i8* (i8*, %1*, i64)*
   %tmp80 = tail call i8* %tmp79(i8* %tmp77, %1* bitcast (%0* @"\01l_objc_msgSend_fixup_objectAtIndex_" to %1*), i64 0)
   %tmp81 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_406", align 8
-  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i64)*)(i8* %tmp80, i8* %tmp81, i64 9223372036854775807)
+  tail call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i64)*)(i8* %tmp80, i8* %tmp81, i64 9223372036854775807)
   %tmp82 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_408", align 8
-  %tmp83 = tail call %24* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %24* (i8*, i8*)*)(i8* %tmp72, i8* %tmp82)
+  %tmp83 = tail call %24* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %24* (i8*, i8*)*)(i8* %tmp72, i8* %tmp82)
   %tmp84 = bitcast %24* %tmp83 to i8*
-  %tmp85 = tail call i8* @objc_retain(i8* %tmp84) nounwind
+  %tmp85 = tail call i8* @llvm.objc.retain(i8* %tmp84) nounwind
   %tmp86 = load %2*, %2** @"\01L_OBJC_CLASSLIST_REFERENCES_$_409", align 8
   %tmp87 = bitcast %2* %tmp86 to i8*
   %tmp88 = load i8*, i8** bitcast (%0* @"\01l_objc_msgSend_fixup_alloc" to i8**), align 16
   %tmp89 = bitcast i8* %tmp88 to i8* (i8*, %1*)*
   %tmp90 = tail call i8* %tmp89(i8* %tmp87, %1* bitcast (%0* @"\01l_objc_msgSend_fixup_alloc" to %1*))
   %tmp91 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_8", align 8
-  %tmp92 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp90, i8* %tmp91)
-  %tmp93 = tail call i8* @objc_explicit_autorelease(i8* %tmp92) nounwind
+  %tmp92 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* %tmp90, i8* %tmp91)
+  %tmp93 = tail call i8* @llvm.objc.explicit_autorelease(i8* %tmp92) nounwind
   %tmp94 = bitcast i8* %tmp93 to %25*
   %tmp95 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_411", align 8
-  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %25*)*)(i8* %tmp85, i8* %tmp95, %25* %tmp94)
-  tail call void @objc_release(i8* %tmp93) nounwind
+  tail call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, %25*)*)(i8* %tmp85, i8* %tmp95, %25* %tmp94)
+  tail call void @llvm.objc.release(i8* %tmp93) nounwind
   %tmp96 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_148", align 8
-  %tmp97 = tail call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* %tmp4, i8* %tmp96)
+  %tmp97 = tail call signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*)*)(i8* %tmp4, i8* %tmp96)
   %tmp98 = icmp eq i8 %tmp97, 0
   br i1 %tmp98, label %bb99, label %bb104
 
 bb99:                                             ; preds = %bb57
   %tmp100 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_413", align 8
-  %tmp101 = tail call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*)*)(i8* %tmp85, i8* %tmp100)
+  %tmp101 = tail call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*)*)(i8* %tmp85, i8* %tmp100)
   %tmp102 = or i64 %tmp101, 12
   %tmp103 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_415", align 8
-  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i64)*)(i8* %tmp85, i8* %tmp103, i64 %tmp102)
+  tail call void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i64)*)(i8* %tmp85, i8* %tmp103, i64 %tmp102)
   br label %bb104
 
 bb104:                                            ; preds = %bb99, %bb57
-  %tmp105 = call i8* @objc_autorelease(i8* %tmp72) nounwind
+  %tmp105 = call i8* @llvm.objc.autorelease(i8* %tmp72) nounwind
   %tmp106 = bitcast i8* %tmp105 to %14*
-  tail call void @objc_release(i8* %tmp85) nounwind
+  tail call void @llvm.objc.release(i8* %tmp85) nounwind
   %tmp107 = bitcast %18* %tmp47 to i8*
-  tail call void @objc_release(i8* %tmp107) nounwind
+  tail call void @llvm.objc.release(i8* %tmp107) nounwind
   ret %14* %tmp106
 }
 
diff --git a/test/Transforms/ObjCARC/move-and-merge-autorelease.ll b/test/Transforms/ObjCARC/move-and-merge-autorelease.ll
index 0a68541d9350a5a91fcc68d78848a09a4f690c91..eaf1fc17aa48c73394b253f19a6dd2ebf3edf5bf 100644
--- a/test/Transforms/ObjCARC/move-and-merge-autorelease.ll
+++ b/test/Transforms/ObjCARC/move-and-merge-autorelease.ll
@@ -4,7 +4,7 @@
 ; and fold it with the release in bb65.
 
 ; CHECK: bb65:
-; CHECK: call i8* @objc_retainAutorelease
+; CHECK: call i8* @llvm.objc.retainAutorelease
 ; CHECK: br label %bb76
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -25,24 +25,24 @@ target triple = "x86_64-apple-darwin11.0.0"
 @"\01L_OBJC_SELECTOR_REFERENCES_624" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
 @"\01L_OBJC_SELECTOR_REFERENCES_626" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
 
-declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...)
 
-declare i8* @objc_retain(i8*)
+declare i8* @llvm.objc.retain(i8*)
 
-declare void @objc_release(i8*)
+declare void @llvm.objc.release(i8*)
 
-declare i8* @objc_autorelease(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
 
 define hidden %0* @foo(%1* %arg, %3* %arg3) {
 bb:
   %tmp16 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_620", align 8
   %tmp17 = bitcast %3* %arg3 to i8*
-  %tmp18 = call %4* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %4* (i8*, i8*)*)(i8* %tmp17, i8* %tmp16)
+  %tmp18 = call %4* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %4* (i8*, i8*)*)(i8* %tmp17, i8* %tmp16)
   %tmp19 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_622", align 8
   %tmp20 = bitcast %4* %tmp18 to i8*
-  %tmp21 = call %5* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %5* (i8*, i8*)*)(i8* %tmp20, i8* %tmp19)
+  %tmp21 = call %5* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %5* (i8*, i8*)*)(i8* %tmp20, i8* %tmp19)
   %tmp22 = bitcast %5* %tmp21 to i8*
-  %tmp23 = call i8* @objc_retain(i8* %tmp22) nounwind
+  %tmp23 = call i8* @llvm.objc.retain(i8* %tmp22) nounwind
   %tmp24 = bitcast i8* %tmp23 to %5*
   %tmp26 = icmp eq i8* %tmp23, null
   br i1 %tmp26, label %bb81, label %bb27
@@ -50,22 +50,22 @@ bb:
 bb27:                                             ; preds = %bb
   %tmp29 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_11", align 8
   %tmp30 = bitcast %1* %arg to i8*
-  %tmp31 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp30, i8* %tmp29)
-  %tmp34 = call i8* @objc_retain(i8* %tmp31) nounwind
+  %tmp31 = call i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* %tmp30, i8* %tmp29)
+  %tmp34 = call i8* @llvm.objc.retain(i8* %tmp31) nounwind
   %tmp37 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_421455", align 8
-  %tmp39 = call %0* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %0* (i8*, i8*)*)(i8* %tmp34, i8* %tmp37)
+  %tmp39 = call %0* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %0* (i8*, i8*)*)(i8* %tmp34, i8* %tmp37)
   %tmp40 = bitcast %0* %tmp39 to i8*
-  %tmp41 = call i8* @objc_retain(i8* %tmp40) nounwind
+  %tmp41 = call i8* @llvm.objc.retain(i8* %tmp40) nounwind
   %tmp42 = bitcast i8* %tmp41 to %0*
   %tmp44 = icmp eq i8* %tmp41, null
   br i1 %tmp44, label %bb45, label %bb55
 
 bb45:                                             ; preds = %bb27
   %tmp47 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_624", align 8
-  %tmp49 = call %0* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %0* (i8*, i8*)*)(i8* %tmp34, i8* %tmp47)
+  %tmp49 = call %0* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %0* (i8*, i8*)*)(i8* %tmp34, i8* %tmp47)
   %tmp51 = bitcast %0* %tmp49 to i8*
-  %tmp52 = call i8* @objc_retain(i8* %tmp51) nounwind
-  call void @objc_release(i8* %tmp41) nounwind
+  %tmp52 = call i8* @llvm.objc.retain(i8* %tmp51) nounwind
+  call void @llvm.objc.release(i8* %tmp41) nounwind
   br label %bb55
 
 bb55:                                             ; preds = %bb27, %bb45
@@ -76,33 +76,33 @@ bb55:                                             ; preds = %bb27, %bb45
 bb58:                                             ; preds = %bb55
   %tmp60 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_598", align 8
   %tmp61 = bitcast %0* %tmp13.0 to i8*
-  %tmp62 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* %tmp61, i8* %tmp60)
+  %tmp62 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*)*)(i8* %tmp61, i8* %tmp60)
   %tmp64 = icmp eq i8 %tmp62, 0
   br i1 %tmp64, label %bb76, label %bb65
 
 bb65:                                             ; preds = %bb58
   %tmp68 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_626", align 8
   %tmp69 = bitcast %0* %tmp13.0 to i8*
-  %tmp70 = call %0* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %0* (i8*, i8*, %5*)*)(i8* %tmp69, i8* %tmp68, %5* %tmp24)
+  %tmp70 = call %0* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to %0* (i8*, i8*, %5*)*)(i8* %tmp69, i8* %tmp68, %5* %tmp24)
   %tmp72 = bitcast %0* %tmp70 to i8*
-  %tmp73 = call i8* @objc_retain(i8* %tmp72) nounwind
+  %tmp73 = call i8* @llvm.objc.retain(i8* %tmp72) nounwind
   br label %bb76
 
 bb76:                                             ; preds = %bb58, %bb55, %bb65
   %tmp10.0 = phi %0* [ %tmp70, %bb65 ], [ null, %bb58 ], [ null, %bb55 ]
   %tmp78 = bitcast %0* %tmp13.0 to i8*
-  call void @objc_release(i8* %tmp78) nounwind
-  call void @objc_release(i8* %tmp34) nounwind
+  call void @llvm.objc.release(i8* %tmp78) nounwind
+  call void @llvm.objc.release(i8* %tmp34) nounwind
   br label %bb81
 
 bb81:                                             ; preds = %bb, %bb76
   %tmp10.1 = phi %0* [ %tmp10.0, %bb76 ], [ null, %bb ]
   %tmp83 = bitcast %0* %tmp10.1 to i8*
-  %tmp84 = call i8* @objc_retain(i8* %tmp83) nounwind
-  call void @objc_release(i8* %tmp23) nounwind
-  %tmp87 = call i8* @objc_autorelease(i8* %tmp84) nounwind
+  %tmp84 = call i8* @llvm.objc.retain(i8* %tmp83) nounwind
+  call void @llvm.objc.release(i8* %tmp23) nounwind
+  %tmp87 = call i8* @llvm.objc.autorelease(i8* %tmp84) nounwind
   %tmp88 = bitcast i8* %tmp87 to %0*
   %tmp92 = bitcast %0* %tmp10.1 to i8*
-  call void @objc_release(i8* %tmp92) nounwind
+  call void @llvm.objc.release(i8* %tmp92) nounwind
   ret %0* %tmp88
 }
diff --git a/test/Transforms/ObjCARC/nested.ll b/test/Transforms/ObjCARC/nested.ll
index b317cd8029596216ad0022995c08dc4d55fd5591..8b7e673e11aef869e4288bdb9d4ba612070d8717 100644
--- a/test/Transforms/ObjCARC/nested.ll
+++ b/test/Transforms/ObjCARC/nested.ll
@@ -9,16 +9,16 @@
 
 declare void @callee()
 declare i8* @returner()
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare i8* @objc_retain(i8*)
-declare void @objc_enumerationMutation(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare void @llvm.objc.enumerationMutation(i8*)
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
-declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...) nonlazybind
 declare void @use(i8*)
-declare void @objc_release(i8*)
+declare void @llvm.objc.release(i8*)
 declare i8* @def()
 declare void @__crasher_block_invoke(i8* nocapture)
-declare i8* @objc_retainBlock(i8*)
+declare i8* @llvm.objc.retainBlock(i8*)
 declare void @__crasher_block_invoke1(i8* nocapture)
 
 !0 = !{}
@@ -26,19 +26,19 @@ declare void @__crasher_block_invoke1(i8* nocapture)
 ; Delete a nested retain+release pair.
 
 ; CHECK-LABEL: define void @test0(
-; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: call i8* @llvm.objc.retain
+; CHECK-NOT: @llvm.objc.retain
 ; CHECK: }
 define void @test0(i8* %a) nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
-  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %a) nounwind
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %1 = call i8* @objc_retain(i8* %0) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp2 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -63,7 +63,7 @@ forcoll.loopbody:
   br i1 %2, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %1)
+  call void @llvm.objc.enumerationMutation(i8* %1)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -77,33 +77,33 @@ forcoll.notmutated:
 
 forcoll.refetch:
   %tmp5 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call6 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp5, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call6 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp5, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %5 = icmp eq i64 %call6, 0
   br i1 %5, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %1) nounwind
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %1) nounwind
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; Delete a nested retain+release pair.
 
 ; CHECK-LABEL: define void @test2(
-; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK-NOT: @llvm.objc.retain
 ; CHECK: }
 define void @test2() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %1 = call i8* @objc_retain(i8* %0) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp2 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call3, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -128,7 +128,7 @@ forcoll.loopbody:
   br i1 %2, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %1)
+  call void @llvm.objc.enumerationMutation(i8* %1)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -142,33 +142,33 @@ forcoll.notmutated:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %5 = icmp eq i64 %call7, 0
   br i1 %5, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %1) nounwind
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %1) nounwind
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; Delete a nested retain+release pair.
 
 ; CHECK-LABEL: define void @test4(
-; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: call i8* @llvm.objc.retain
+; CHECK-NOT: @llvm.objc.retain
 ; CHECK: }
 define void @test4() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %tmp = load i8*, i8** @g, align 8
-  %0 = call i8* @objc_retain(i8* %tmp) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %tmp) nounwind
   %tmp2 = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp2, i8 0, i64 64, i1 false)
-  %1 = call i8* @objc_retain(i8* %0) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp4 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp4, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp4, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -193,7 +193,7 @@ forcoll.loopbody:
   br i1 %2, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %1)
+  call void @llvm.objc.enumerationMutation(i8* %1)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -207,33 +207,33 @@ forcoll.notmutated:
 
 forcoll.refetch:
   %tmp7 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call8 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp7, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call8 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp7, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %5 = icmp eq i64 %call8, 0
   br i1 %5, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %1) nounwind
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %1) nounwind
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; Delete a nested retain+release pair.
 
 ; CHECK-LABEL: define void @test5(
-; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK-NOT: @llvm.objc.retain
 ; CHECK: }
 define void @test5() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %1 = call i8* @objc_retain(i8* %0) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp2 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call3, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -258,7 +258,7 @@ forcoll.loopbody:
   br i1 %2, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %1)
+  call void @llvm.objc.enumerationMutation(i8* %1)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -272,13 +272,13 @@ forcoll.notmutated:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %5 = icmp eq i64 %call7, 0
   br i1 %5, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %1) nounwind
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %1) nounwind
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -286,20 +286,20 @@ forcoll.empty:
 ; use.
 ;
 ; CHECK-LABEL: define void @test6(
-; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK-NOT: @llvm.objc.retain
 ; CHECK: }
 define void @test6() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %1 = call i8* @objc_retain(i8* %0) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp2 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call3, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -324,7 +324,7 @@ forcoll.loopbody:
   br i1 %2, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %1)
+  call void @llvm.objc.enumerationMutation(i8* %1)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -338,14 +338,14 @@ forcoll.notmutated:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %5 = icmp eq i64 %call7, 0
   br i1 %5, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %1) nounwind
+  call void @llvm.objc.release(i8* %1) nounwind
   call void @callee()
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -354,21 +354,21 @@ forcoll.empty:
 ; reasnoning about nesting.
 
 ; CHECK-LABEL: define void @test7(
-; CHECK: call i8* @objc_retain
-; CHECK: @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: @llvm.objc.retain
 ; CHECK: }
 define void @test7() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   call void @callee()
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %1 = call i8* @objc_retain(i8* %0) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp2 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call3, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -393,7 +393,7 @@ forcoll.loopbody:
   br i1 %2, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %1)
+  call void @llvm.objc.enumerationMutation(i8* %1)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -407,34 +407,34 @@ forcoll.notmutated:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %5 = icmp eq i64 %call7, 0
   br i1 %5, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %1) nounwind
+  call void @llvm.objc.release(i8* %1) nounwind
   call void @callee()
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; Delete a nested retain+release pair.
 
 ; CHECK-LABEL: define void @test8(
-; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK-NOT: @llvm.objc.retain
 ; CHECK: }
 define void @test8() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %1 = call i8* @objc_retain(i8* %0) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp2 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call3 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp2, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call3, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -459,7 +459,7 @@ forcoll.loopbody:
   br i1 %2, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %1)
+  call void @llvm.objc.enumerationMutation(i8* %1)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -480,13 +480,13 @@ forcoll.next:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %1, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %5 = icmp eq i64 %call7, 0
   br i1 %5, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %1) nounwind
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %1) nounwind
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -495,23 +495,23 @@ forcoll.empty:
 ; See test9b for the same testcase without a split backedge.
 
 ; CHECK-LABEL: define void @test9(
-; CHECK: call i8* @objc_retain
-; CHECK: call i8* @objc_retain
-; CHECK: call i8* @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: }
 define void @test9() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   %call1 = call i8* @returner()
-  %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call1) nounwind
+  %1 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call1) nounwind
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %2 = call i8* @objc_retain(i8* %0) nounwind
+  %2 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp3 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call4 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp3, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call4 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp3, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call4, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -535,7 +535,7 @@ forcoll.loopbody:
   br i1 %3, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %2)
+  call void @llvm.objc.enumerationMutation(i8* %2)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -548,37 +548,37 @@ forcoll.notmutated.forcoll.loopbody_crit_edge:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %4 = icmp eq i64 %call7, 0
   br i1 %4, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %2) nounwind
-  call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %2) nounwind
+  call void @llvm.objc.release(i8* %1) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; Like test9, but without a split backedge. TODO: optimize this.
 
 ; CHECK-LABEL: define void @test9b(
-; CHECK: call i8* @objc_retain
-; CHECK: call i8* @objc_retain
-; CHECK: @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: @llvm.objc.retain
 ; CHECK: }
 define void @test9b() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   %call1 = call i8* @returner()
-  %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call1) nounwind
+  %1 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call1) nounwind
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %2 = call i8* @objc_retain(i8* %0) nounwind
+  %2 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp3 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call4 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp3, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call4 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp3, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call4, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -602,7 +602,7 @@ forcoll.loopbody:
   br i1 %3, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %2)
+  call void @llvm.objc.enumerationMutation(i8* %2)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -612,14 +612,14 @@ forcoll.notmutated:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %4 = icmp eq i64 %call7, 0
   br i1 %4, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %2) nounwind
-  call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %2) nounwind
+  call void @llvm.objc.release(i8* %1) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -628,24 +628,24 @@ forcoll.empty:
 ; See test10b for the same testcase without a split backedge.
 
 ; CHECK-LABEL: define void @test10(
-; CHECK: call i8* @objc_retain
-; CHECK: call i8* @objc_retain
-; CHECK: call i8* @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: call i8* @llvm.objc.retain
 ; CHECK: }
 define void @test10() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   %call1 = call i8* @returner()
-  %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call1) nounwind
+  %1 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call1) nounwind
   call void @callee()
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %2 = call i8* @objc_retain(i8* %0) nounwind
+  %2 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp3 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call4 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp3, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call4 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp3, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call4, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -669,7 +669,7 @@ forcoll.loopbody:
   br i1 %3, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %2)
+  call void @llvm.objc.enumerationMutation(i8* %2)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -682,38 +682,38 @@ forcoll.notmutated.forcoll.loopbody_crit_edge:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %4 = icmp eq i64 %call7, 0
   br i1 %4, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %2) nounwind
-  call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %2) nounwind
+  call void @llvm.objc.release(i8* %1) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 ; Like test10, but without a split backedge. TODO: optimize this.
 
 ; CHECK-LABEL: define void @test10b(
-; CHECK: call i8* @objc_retain
-; CHECK: call i8* @objc_retain
-; CHECK: @objc_retain
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue
+; CHECK: @llvm.objc.retain
 ; CHECK: }
 define void @test10b() nounwind {
 entry:
   %state.ptr = alloca %struct.__objcFastEnumerationState, align 8
   %items.ptr = alloca [16 x i8*], align 8
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
   %call1 = call i8* @returner()
-  %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call1) nounwind
+  %1 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call1) nounwind
   call void @callee()
   %tmp = bitcast %struct.__objcFastEnumerationState* %state.ptr to i8*
   call void @llvm.memset.p0i8.i64(i8* align 8 %tmp, i8 0, i64 64, i1 false)
-  %2 = call i8* @objc_retain(i8* %0) nounwind
+  %2 = call i8* @llvm.objc.retain(i8* %0) nounwind
   %tmp3 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call4 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp3, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call4 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp3, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %iszero = icmp eq i64 %call4, 0
   br i1 %iszero, label %forcoll.empty, label %forcoll.loopinit
 
@@ -737,7 +737,7 @@ forcoll.loopbody:
   br i1 %3, label %forcoll.notmutated, label %forcoll.mutated
 
 forcoll.mutated:
-  call void @objc_enumerationMutation(i8* %2)
+  call void @llvm.objc.enumerationMutation(i8* %2)
   br label %forcoll.notmutated
 
 forcoll.notmutated:
@@ -747,14 +747,14 @@ forcoll.notmutated:
 
 forcoll.refetch:
   %tmp6 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
+  %call7 = call i64 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i64 (i8*, i8*, %struct.__objcFastEnumerationState*, [16 x i8*]*, i64)*)(i8* %2, i8* %tmp6, %struct.__objcFastEnumerationState* %state.ptr, [16 x i8*]* %items.ptr, i64 16)
   %4 = icmp eq i64 %call7, 0
   br i1 %4, label %forcoll.empty, label %forcoll.loopbody.outer
 
 forcoll.empty:
-  call void @objc_release(i8* %2) nounwind
-  call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %2) nounwind
+  call void @llvm.objc.release(i8* %1) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -768,9 +768,9 @@ forcoll.empty:
 @__block_d_tmp5 = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
 
 ; CHECK-LABEL: define void @test11(
-; CHECK: tail call i8* @objc_retain(i8* %call) [[NUW:#[0-9]+]]
-; CHECK: tail call i8* @objc_retain(i8* %call) [[NUW]]
-; CHECK: call void @objc_release(i8* %call) [[NUW]], !clang.imprecise_release !0
+; CHECK: tail call i8* @llvm.objc.retain(i8* %call) [[NUW:#[0-9]+]]
+; CHECK: tail call i8* @llvm.objc.retain(i8* %call) [[NUW]]
+; CHECK: call void @llvm.objc.release(i8* %call) [[NUW]], !clang.imprecise_release !0
 ; CHECK: }
 define void @test11() {
 entry:
@@ -788,14 +788,14 @@ entry:
   store i8* bitcast (void (i8*)* @__crasher_block_invoke to i8*), i8** %block.invoke, align 8
   %block.d = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 4
   store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp to %struct.__block_d*), %struct.__block_d** %block.d, align 8
-  %foo2 = tail call i8* @objc_retain(i8* %call) nounwind
+  %foo2 = tail call i8* @llvm.objc.retain(i8* %call) nounwind
   store i8* %foo2, i8** %foo, align 8
   %foo4 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block to i8*
-  %foo5 = call i8* @objc_retainBlock(i8* %foo4) nounwind
+  %foo5 = call i8* @llvm.objc.retainBlock(i8* %foo4) nounwind
   call void @use(i8* %foo5), !clang.arc.no_objc_arc_exceptions !0
-  call void @objc_release(i8* %foo5) nounwind
+  call void @llvm.objc.release(i8* %foo5) nounwind
   %strongdestroy = load i8*, i8** %foo, align 8
-  call void @objc_release(i8* %strongdestroy) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %strongdestroy) nounwind, !clang.imprecise_release !0
   %foo10 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 5
   %block.isa11 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 0
   store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa11, align 8
@@ -807,19 +807,19 @@ entry:
   store i8* bitcast (void (i8*)* @__crasher_block_invoke1 to i8*), i8** %block.invoke14, align 8
   %block.d15 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 4
   store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp5 to %struct.__block_d*), %struct.__block_d** %block.d15, align 8
-  %foo18 = call i8* @objc_retain(i8* %call) nounwind
+  %foo18 = call i8* @llvm.objc.retain(i8* %call) nounwind
   store i8* %call, i8** %foo10, align 8
   %foo20 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9 to i8*
-  %foo21 = call i8* @objc_retainBlock(i8* %foo20) nounwind
+  %foo21 = call i8* @llvm.objc.retainBlock(i8* %foo20) nounwind
   call void @use(i8* %foo21), !clang.arc.no_objc_arc_exceptions !0
-  call void @objc_release(i8* %foo21) nounwind
+  call void @llvm.objc.release(i8* %foo21) nounwind
   %strongdestroy25 = load i8*, i8** %foo10, align 8
-  call void @objc_release(i8* %strongdestroy25) nounwind, !clang.imprecise_release !0
-  call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %strongdestroy25) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %call) nounwind, !clang.imprecise_release !0
   ret void
 }
 
 
-; CHECK: attributes #0 = { argmemonly nounwind }
-; CHECK: attributes #1 = { nonlazybind }
 ; CHECK: attributes [[NUW]] = { nounwind }
+; CHECK: attributes #1 = { argmemonly nounwind }
+; CHECK: attributes #2 = { nonlazybind }
diff --git a/test/Transforms/ObjCARC/opt-catchswitch.ll b/test/Transforms/ObjCARC/opt-catchswitch.ll
index 5af62e021eb4c049520ac49c4bd1615a4525397b..b627c11d89b48c6ee6bc265f284529ccff1b70b9 100644
--- a/test/Transforms/ObjCARC/opt-catchswitch.ll
+++ b/test/Transforms/ObjCARC/opt-catchswitch.ll
@@ -7,15 +7,15 @@ declare i8* @f(i8*, i8*)
 
 declare i32 @__CxxFrameHandler3(...)
 
-declare dllimport i8* @objc_autoreleaseReturnValue(i8* returned)
-declare dllimport i8* @objc_retain(i8* returned)
-declare dllimport i8* @objc_retainAutoreleasedReturnValue(i8* returned)
-declare dllimport void @objc_release(i8*)
+declare dllimport i8* @llvm.objc.autoreleaseReturnValue(i8* returned)
+declare dllimport i8* @llvm.objc.retain(i8* returned)
+declare dllimport i8* @llvm.objc.retainAutoreleasedReturnValue(i8* returned)
+declare dllimport void @llvm.objc.release(i8*)
 
 define i8* @g(i8* %p, i8* %q) local_unnamed_addr personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %p) #0
-  %1 = tail call i8* @objc_retain(i8* %q) #0
+  %0 = tail call i8* @llvm.objc.retain(i8* %p) #0
+  %1 = tail call i8* @llvm.objc.retain(i8* %q) #0
   %call = invoke i8* @f(i8* %p, i8* %q)
           to label %invoke.cont unwind label %catch.dispatch, !clang.arc.no_objc_arc_exceptions !0
 
@@ -27,19 +27,19 @@ catch:
   catchret from %3 to label %cleanup
 
 invoke.cont:
-  %4 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) #0
+  %4 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) #0
   br label %cleanup
 
 cleanup:
   %retval.0 = phi i8* [ %call, %invoke.cont ], [ null, %catch ]
-  tail call void @objc_release(i8* %q) #0, !clang.imprecise_release !0
-  tail call void @objc_release(i8* %p) #0, !clang.imprecise_release !0
-  %5 = tail call i8* @objc_autoreleaseReturnValue(i8* %retval.0) #0
+  tail call void @llvm.objc.release(i8* %q) #0, !clang.imprecise_release !0
+  tail call void @llvm.objc.release(i8* %p) #0, !clang.imprecise_release !0
+  %5 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %retval.0) #0
   ret i8* %retval.0
 }
 
 ; CHECK-LABEL: entry:
-; CHECK-NEXT:    %0 = tail call i8* @objc_retain(i8* %p) #0
+; CHECK-NEXT:    %0 = tail call i8* @llvm.objc.retain(i8* %p) #0
 ; CHECK-NEXT:    %call = invoke i8* @f(i8* %p, i8* %q)
 ; CHECK-NEXT:            to label %invoke.cont unwind label %catch.dispatch
 
@@ -47,7 +47,7 @@ cleanup:
 ; CHECK-NEXT:    %1 = catchswitch within none [label %catch] unwind to caller
 
 ; CHECK-LABEL: cleanup:
-; CHECK:         tail call void @objc_release(i8* %p) #0
+; CHECK:         tail call void @llvm.objc.release(i8* %p) #0
 
 attributes #0 = { nounwind }
 
diff --git a/test/Transforms/ObjCARC/path-overflow.ll b/test/Transforms/ObjCARC/path-overflow.ll
index 82c9fbe31b9d50ed227a74cb68cb58654e2cfa5f..227d6e5b047af032ea5139635ef33f93c2337275 100644
--- a/test/Transforms/ObjCARC/path-overflow.ll
+++ b/test/Transforms/ObjCARC/path-overflow.ll
@@ -18,13 +18,13 @@ target triple = "thumbv7-apple-ios5.0.0"
 @_unnamed_cfstring = external constant %struct.NSConstantString, section "__DATA,__cfstring"
 @_unnamed_cfstring_2 = external constant %struct.NSConstantString, section "__DATA,__cfstring"
 
-declare i8* @objc_retain(i8*) nonlazybind
-declare i8* @objc_retainAutoreleasedReturnValue(i8*) nonlazybind
-declare void @objc_release(i8*) nonlazybind
+declare i8* @llvm.objc.retain(i8*) nonlazybind
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*) nonlazybind
+declare void @llvm.objc.release(i8*) nonlazybind
 declare i8* @returner()
-declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...) nonlazybind
 declare void @NSLog(i8*, ...)
-declare void @objc_msgSend_stret(i8*, i8*, ...)
+declare void @llvm.objc.msgSend_stret(i8*, i8*, ...)
 declare i32 @__gxx_personality_sj0(...)
 declare i32 @__objc_personality_v0(...)
 
@@ -41,7 +41,7 @@ msgSend.nullinit:                                 ; preds = %entry
 
 msgSend.cont:                                     ; preds = %msgSend.nullinit, %msgSend.call
   %0 = bitcast %struct.NSConstantString* @_unnamed_cfstring to i8*
-  %1 = call i8* @objc_retain(i8* %0) nounwind
+  %1 = call i8* @llvm.objc.retain(i8* %0) nounwind
   br i1 undef, label %msgSend.nullinit33, label %msgSend.call32
 
 msgSend.call32:                                   ; preds = %if.end10
@@ -336,7 +336,7 @@ msgSend.nullinit506:                              ; preds = %msgSend.cont501
   br label %msgSend.cont507
 
 msgSend.cont507:                                  ; preds = %msgSend.nullinit506, %msgSend.call505
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
@@ -779,9 +779,9 @@ bb184:                                            ; preds = %bb182
   br i1 undef, label %bb186, label %bb195
 
 bb186:                                            ; preds = %bb184
-  %tmp188 = call i8* @objc_retainAutoreleasedReturnValue(i8* %tmp185)
-  %tmp189 = call i8* @objc_retain(i8* %tmp188)
-  call void @objc_release(i8* %tmp189), !clang.imprecise_release !0
+  %tmp188 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %tmp185)
+  %tmp189 = call i8* @llvm.objc.retain(i8* %tmp188)
+  call void @llvm.objc.release(i8* %tmp189), !clang.imprecise_release !0
   br i1 undef, label %bb197, label %bb190
 
 bb190:                                            ; preds = %bb186
@@ -866,18 +866,18 @@ bb222:                                            ; preds = %bb20, %bb19
 ; Function Attrs: ssp
 define void @test3() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) {
 entry:
-  %call2 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call2 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont unwind label %lpad
 
 invoke.cont:                                      ; preds = %entry
-  %call5 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* undef, i8* undef)
+  %call5 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont4 unwind label %lpad3
 
 invoke.cont4:                                     ; preds = %invoke.cont
   br i1 undef, label %land.end, label %land.rhs
 
 land.rhs:                                         ; preds = %invoke.cont4
-  %call7 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+  %call7 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
           to label %land.end unwind label %lpad3
 
 land.end:                                         ; preds = %land.rhs, %invoke.cont4
@@ -896,11 +896,11 @@ lpad.i:                                           ; preds = %land.end
   unreachable
 
 invoke.cont8:                                     ; preds = %if.then.i, %invoke.cont.i
-  %call18 = invoke i8* (i8*, i8*, i8*, ...) bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*, ...)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
+  %call18 = invoke i8* (i8*, i8*, i8*, ...) bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*, ...)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
           to label %invoke.cont17 unwind label %lpad16
 
 invoke.cont17:                                    ; preds = %invoke.cont8
-  %call22 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call22 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont21 unwind label %lpad20
 
 invoke.cont21:                                    ; preds = %invoke.cont17
@@ -919,14 +919,14 @@ lpad.i1982:                                       ; preds = %invoke.cont21
   unreachable
 
 invoke.cont24:                                    ; preds = %if.then.i1981, %invoke.cont.i1980
-  %call37 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* undef, i8* undef)
+  %call37 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont36 unwind label %lpad35
 
 invoke.cont36:                                    ; preds = %invoke.cont24
   br i1 undef, label %land.end43, label %land.rhs39
 
 land.rhs39:                                       ; preds = %invoke.cont36
-  %call41 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call41 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %land.end43 unwind label %lpad35
 
 land.end43:                                       ; preds = %land.rhs39, %invoke.cont36
@@ -945,18 +945,18 @@ lpad.i1988:                                       ; preds = %land.end43
   unreachable
 
 invoke.cont44:                                    ; preds = %if.then.i1987, %invoke.cont.i1986
-  %call53 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call53 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont52 unwind label %lpad51
 
 invoke.cont52:                                    ; preds = %invoke.cont44
   br i1 undef, label %land.end70, label %land.rhs58
 
 land.rhs58:                                       ; preds = %invoke.cont52
-  %call63 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 42)
+  %call63 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 42)
           to label %invoke.cont62 unwind label %lpad61
 
 invoke.cont62:                                    ; preds = %land.rhs58
-  %call68 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
+  %call68 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
           to label %land.end70 unwind label %lpad66.body.thread
 
 land.end70:                                       ; preds = %invoke.cont62, %invoke.cont52
@@ -985,11 +985,11 @@ lpad.i2000:                                       ; preds = %invoke.cont71
   br label %ehcleanup102
 
 invoke.cont91:                                    ; preds = %if.then.i1999, %invoke.cont.i1998
-  %call96 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+  %call96 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont95 unwind label %lpad94
 
 invoke.cont95:                                    ; preds = %invoke.cont91
-  %call98 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* %call96)
+  %call98 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* %call96)
           to label %invoke.cont97 unwind label %lpad94
 
 invoke.cont97:                                    ; preds = %invoke.cont95
@@ -1008,7 +1008,7 @@ lpad.i2006:                                       ; preds = %invoke.cont97
   unreachable
 
 invoke.cont100:                                   ; preds = %if.then.i2005, %invoke.cont.i2004
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont110 unwind label %lpad109
 
 invoke.cont110:                                   ; preds = %invoke.cont100
@@ -1111,11 +1111,11 @@ if.then.i2029:                                    ; preds = %invoke.cont.i2028
   br label %invoke.cont165
 
 invoke.cont165:                                   ; preds = %if.then.i2029, %invoke.cont.i2028
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, void (i8*, i8*)*)*)(i8* undef, i8* undef, void (i8*, i8*)* undef)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, void (i8*, i8*)*)*)(i8* undef, i8* undef, void (i8*, i8*)* undef)
           to label %invoke.cont184 unwind label %lpad183
 
 invoke.cont184:                                   ; preds = %invoke.cont165
-  %call186 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call186 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont185 unwind label %lpad183
 
 invoke.cont185:                                   ; preds = %invoke.cont184
@@ -1134,15 +1134,15 @@ lpad.i2036:                                       ; preds = %invoke.cont185
   br label %lpad183.body
 
 invoke.cont190:                                   ; preds = %if.then.i2035, %invoke.cont.i2034
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont197 unwind label %lpad196
 
 invoke.cont197:                                   ; preds = %invoke.cont190
-  %call202 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call202 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont201 unwind label %lpad200
 
 invoke.cont201:                                   ; preds = %invoke.cont197
-  %call205 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call205 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont204 unwind label %lpad203
 
 invoke.cont204:                                   ; preds = %invoke.cont201
@@ -1161,7 +1161,7 @@ lpad.i2042:                                       ; preds = %invoke.cont204
   unreachable
 
 invoke.cont207:                                   ; preds = %if.then.i2041, %invoke.cont.i2040
-  %call209 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+  %call209 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont208 unwind label %lpad203
 
 invoke.cont208:                                   ; preds = %invoke.cont207
@@ -1175,11 +1175,11 @@ if.then.i2047:                                    ; preds = %invoke.cont.i2046
   br label %invoke.cont213
 
 invoke.cont213:                                   ; preds = %if.then.i2047, %invoke.cont.i2046
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont221 unwind label %lpad220
 
 invoke.cont221:                                   ; preds = %invoke.cont213
-  %call229 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call229 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont228 unwind label %lpad227
 
 invoke.cont228:                                   ; preds = %invoke.cont221
@@ -1198,7 +1198,7 @@ lpad.i2054:                                       ; preds = %invoke.cont228
   unreachable
 
 invoke.cont231:                                   ; preds = %if.then.i2053, %invoke.cont.i2052
-  %call233 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+  %call233 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont232 unwind label %lpad227
 
 invoke.cont232:                                   ; preds = %invoke.cont231
@@ -1212,39 +1212,39 @@ if.then.i2059:                                    ; preds = %invoke.cont.i2058
   br label %invoke.cont237
 
 invoke.cont237:                                   ; preds = %if.then.i2059, %invoke.cont.i2058
-  %call246 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call246 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont245 unwind label %lpad244
 
 invoke.cont245:                                   ; preds = %invoke.cont237
-  %call248 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 13)
+  %call248 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 13)
           to label %invoke.cont247 unwind label %lpad244
 
 invoke.cont247:                                   ; preds = %invoke.cont245
-  %call251 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 2)
+  %call251 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 2)
           to label %invoke.cont250 unwind label %lpad249
 
 invoke.cont250:                                   ; preds = %invoke.cont247
-  %call254 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 7)
+  %call254 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 7)
           to label %invoke.cont253 unwind label %lpad252
 
 invoke.cont253:                                   ; preds = %invoke.cont250
-  %call257 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8**, i32)*)(i8* undef, i8* undef, i8** undef, i32 3)
+  %call257 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8**, i32)*)(i8* undef, i8* undef, i8** undef, i32 3)
           to label %invoke.cont256 unwind label %lpad255
 
 invoke.cont256:                                   ; preds = %invoke.cont253
-  %call260 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* undef)
+  %call260 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* undef)
           to label %invoke.cont259 unwind label %lpad258
 
 invoke.cont259:                                   ; preds = %invoke.cont256
-  %call267 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call267 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont266 unwind label %lpad265
 
 invoke.cont266:                                   ; preds = %invoke.cont259
-  %call275 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
+  %call275 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
           to label %invoke.cont274 unwind label %lpad273
 
 invoke.cont274:                                   ; preds = %invoke.cont266
-  %call279 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+  %call279 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont278 unwind label %lpad277
 
 invoke.cont278:                                   ; preds = %invoke.cont274
@@ -1263,34 +1263,34 @@ lpad.i2066:                                       ; preds = %invoke.cont278
   unreachable
 
 invoke.cont281:                                   ; preds = %if.then.i2065, %invoke.cont.i2064
-  %call291 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call291 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont290 unwind label %lpad289
 
 invoke.cont290:                                   ; preds = %invoke.cont281
-  %call303 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 8)
+  %call303 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 8)
           to label %invoke.cont302 unwind label %lpad301
 
 invoke.cont302:                                   ; preds = %invoke.cont290
-  %call310 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, double)*)(i8* undef, i8* undef, double 5.000000e-01)
+  %call310 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, double)*)(i8* undef, i8* undef, double 5.000000e-01)
           to label %invoke.cont309 unwind label %lpad308
 
 invoke.cont309:                                   ; preds = %invoke.cont302
-  %call313 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 42)
+  %call313 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 42)
           to label %invoke.cont312 unwind label %lpad311
 
 invoke.cont312:                                   ; preds = %invoke.cont309
-  %call316 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8**, i8**, i32)*)(i8* undef, i8* undef, i8** undef, i8** undef, i32 2)
+  %call316 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8**, i8**, i32)*)(i8* undef, i8* undef, i8** undef, i8** undef, i32 2)
           to label %invoke.cont315 unwind label %lpad314
 
 invoke.cont315:                                   ; preds = %invoke.cont312
-  %call322 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
+  %call322 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
           to label %invoke.cont321 unwind label %lpad320
 
 invoke.cont321:                                   ; preds = %invoke.cont315
   br i1 undef, label %land.end344, label %land.rhs335
 
 land.rhs335:                                      ; preds = %invoke.cont321
-  %call342 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call342 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %land.end344 unwind label %lpad340.body.thread
 
 land.end344:                                      ; preds = %land.rhs335, %invoke.cont321
@@ -1304,15 +1304,15 @@ if.then.i2071:                                    ; preds = %invoke.cont.i2070
   br label %invoke.cont345
 
 invoke.cont345:                                   ; preds = %if.then.i2071, %invoke.cont.i2070
-  %call362 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
+  %call362 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
           to label %invoke.cont361 unwind label %lpad360
 
 invoke.cont361:                                   ; preds = %invoke.cont345
-  %call365 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call365 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont364 unwind label %lpad363
 
 invoke.cont364:                                   ; preds = %invoke.cont361
-  %call371 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+  %call371 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont370 unwind label %lpad369
 
 invoke.cont370:                                   ; preds = %invoke.cont364
@@ -1331,15 +1331,15 @@ lpad.i2078:                                       ; preds = %invoke.cont370
   unreachable
 
 invoke.cont373:                                   ; preds = %if.then.i2077, %invoke.cont.i2076
-  %call377 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32, i8*)*)(i8* undef, i8* undef, i32 42, i8* undef)
+  %call377 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32, i8*)*)(i8* undef, i8* undef, i32 42, i8* undef)
           to label %invoke.cont376 unwind label %lpad363
 
 invoke.cont376:                                   ; preds = %invoke.cont373
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 5)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 5)
           to label %invoke.cont382 unwind label %lpad381
 
 invoke.cont382:                                   ; preds = %invoke.cont376
-  %call384 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call384 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont383 unwind label %lpad381
 
 invoke.cont383:                                   ; preds = %invoke.cont382
@@ -1358,19 +1358,19 @@ lpad.i2084:                                       ; preds = %invoke.cont383
   unreachable
 
 invoke.cont392:                                   ; preds = %if.then.i2083, %invoke.cont.i2082
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 -2)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 -2)
           to label %invoke.cont395 unwind label %lpad381
 
 invoke.cont395:                                   ; preds = %invoke.cont392
-  %call397 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call397 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont396 unwind label %lpad381
 
 invoke.cont396:                                   ; preds = %invoke.cont395
-  %call400 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+  %call400 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont399 unwind label %lpad398
 
 invoke.cont399:                                   ; preds = %invoke.cont396
-  %call403 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+  %call403 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont402 unwind label %lpad401
 
 invoke.cont402:                                   ; preds = %invoke.cont399
@@ -1389,15 +1389,15 @@ lpad.i2090:                                       ; preds = %invoke.cont402
   unreachable
 
 invoke.cont405:                                   ; preds = %if.then.i2089, %invoke.cont.i2088
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 -1)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 -1)
           to label %invoke.cont408 unwind label %lpad381
 
 invoke.cont408:                                   ; preds = %invoke.cont405
-  %call410 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call410 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont409 unwind label %lpad381
 
 invoke.cont409:                                   ; preds = %invoke.cont408
-  %call413 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+  %call413 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont412 unwind label %lpad411
 
 invoke.cont412:                                   ; preds = %invoke.cont409
@@ -1416,19 +1416,19 @@ lpad.i2096:                                       ; preds = %invoke.cont412
   unreachable
 
 invoke.cont418:                                   ; preds = %if.then.i2095, %invoke.cont.i2094
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 0)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 0)
           to label %invoke.cont422 unwind label %lpad381
 
 invoke.cont422:                                   ; preds = %invoke.cont418
-  %call424 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call424 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont423 unwind label %lpad381
 
 invoke.cont423:                                   ; preds = %invoke.cont422
-  %call427 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+  %call427 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont426 unwind label %lpad425
 
 invoke.cont426:                                   ; preds = %invoke.cont423
-  %call430 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+  %call430 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont429 unwind label %lpad428
 
 invoke.cont429:                                   ; preds = %invoke.cont426
@@ -1447,7 +1447,7 @@ lpad.i2102:                                       ; preds = %invoke.cont429
   unreachable
 
 invoke.cont432:                                   ; preds = %if.then.i2101, %invoke.cont.i2100
-  %call436 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 0)
+  %call436 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 0)
           to label %invoke.cont435 unwind label %lpad381
 
 invoke.cont435:                                   ; preds = %invoke.cont432
@@ -1455,7 +1455,7 @@ invoke.cont435:                                   ; preds = %invoke.cont432
           to label %invoke.cont.i2106 unwind label %lpad.i2108
 
 invoke.cont.i2106:                                ; preds = %invoke.cont435
-  %call444 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 5)
+  %call444 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 5)
           to label %invoke.cont443 unwind label %lpad381
 
 lpad.i2108:                                       ; preds = %invoke.cont435
@@ -1479,11 +1479,11 @@ lpad.i2114:                                       ; preds = %invoke.cont443
   unreachable
 
 invoke.cont449:                                   ; preds = %if.then.i2113, %invoke.cont.i2112
-  %call453 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 -2)
+  %call453 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 -2)
           to label %invoke.cont452 unwind label %lpad381
 
 invoke.cont452:                                   ; preds = %invoke.cont449
-  %call456 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+  %call456 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont455 unwind label %lpad454
 
 invoke.cont455:                                   ; preds = %invoke.cont452
@@ -1502,7 +1502,7 @@ lpad.i2120:                                       ; preds = %invoke.cont455
   unreachable
 
 invoke.cont458:                                   ; preds = %if.then.i2119, %invoke.cont.i2118
-  %call461 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 -1)
+  %call461 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 -1)
           to label %invoke.cont460 unwind label %lpad381
 
 invoke.cont460:                                   ; preds = %invoke.cont458
@@ -1521,7 +1521,7 @@ lpad.i2126:                                       ; preds = %invoke.cont460
   br label %ehcleanup477
 
 invoke.cont466:                                   ; preds = %if.then.i2125, %invoke.cont.i2124
-  %call470 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 0)
+  %call470 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 0)
           to label %invoke.cont469 unwind label %lpad381
 
 invoke.cont469:                                   ; preds = %invoke.cont466
@@ -1540,34 +1540,34 @@ lpad.i2132:                                       ; preds = %invoke.cont469
   br label %ehcleanup477
 
 invoke.cont475:                                   ; preds = %if.then.i2131, %invoke.cont.i2130
-  %call491 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 1)
+  %call491 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 1)
           to label %invoke.cont490 unwind label %lpad489
 
 invoke.cont490:                                   ; preds = %invoke.cont475
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont499 unwind label %lpad498
 
 invoke.cont499:                                   ; preds = %invoke.cont490
-  %call504 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call504 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont503 unwind label %lpad489
 
 invoke.cont503:                                   ; preds = %invoke.cont499
-  %call507 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 3)
+  %call507 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 3)
           to label %invoke.cont506 unwind label %lpad505
 
 invoke.cont506:                                   ; preds = %invoke.cont503
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont509 unwind label %lpad508
 
 invoke.cont509:                                   ; preds = %invoke.cont506
-  %call513 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call513 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont512 unwind label %lpad489
 
 invoke.cont512:                                   ; preds = %invoke.cont509
   br i1 undef, label %msgSend.null-receiver, label %msgSend.call
 
 msgSend.call:                                     ; preds = %invoke.cont512
-  invoke void bitcast (void (i8*, i8*, ...)* @objc_msgSend_stret to void (%struct.CGPoint*, i8*, i8*)*)(%struct.CGPoint* sret undef, i8* undef, i8* undef)
+  invoke void bitcast (void (i8*, i8*, ...)* @llvm.objc.msgSend_stret to void (%struct.CGPoint*, i8*, i8*)*)(%struct.CGPoint* sret undef, i8* undef, i8* undef)
           to label %msgSend.cont unwind label %lpad514
 
 msgSend.null-receiver:                            ; preds = %invoke.cont512
@@ -1589,15 +1589,15 @@ lpad.i2138:                                       ; preds = %msgSend.cont
   unreachable
 
 invoke.cont521:                                   ; preds = %if.then.i2137, %invoke.cont.i2136
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
           to label %invoke.cont528 unwind label %lpad527
 
 invoke.cont528:                                   ; preds = %invoke.cont521
-  %call532 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call532 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont531 unwind label %lpad489
 
 invoke.cont531:                                   ; preds = %invoke.cont528
-  %call535 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+  %call535 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont534 unwind label %lpad533
 
 invoke.cont534:                                   ; preds = %invoke.cont531
@@ -1616,43 +1616,43 @@ lpad.i2144:                                       ; preds = %invoke.cont534
   unreachable
 
 invoke.cont540:                                   ; preds = %if.then.i2143, %invoke.cont.i2142
-  %call544 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i32 3)
+  %call544 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i32)*)(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i32 3)
           to label %invoke.cont543 unwind label %lpad489
 
 invoke.cont543:                                   ; preds = %invoke.cont540
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
           to label %invoke.cont546 unwind label %lpad545
 
 invoke.cont546:                                   ; preds = %invoke.cont543
-  %call549 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call549 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont548 unwind label %lpad489
 
 invoke.cont548:                                   ; preds = %invoke.cont546
-  %call555 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  %call555 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont554 unwind label %lpad553
 
 invoke.cont554:                                   ; preds = %invoke.cont548
-  %tmp499 = call i8* @objc_retain(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*)) #3
+  %tmp499 = call i8* @llvm.objc.retain(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*)) #3
   invoke void (i8*, ...) @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* %tmp499, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont.i2148 unwind label %lpad.i2150
 
 invoke.cont.i2148:                                ; preds = %invoke.cont554
-  call void @objc_release(i8* %tmp499) #3, !clang.imprecise_release !0
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  call void @llvm.objc.release(i8* %tmp499) #3, !clang.imprecise_release !0
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont566 unwind label %lpad565
 
 lpad.i2150:                                       ; preds = %invoke.cont554
   %tmp500 = landingpad { i8*, i32 }
           cleanup
-  call void @objc_release(i8* %tmp499) #3, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp499) #3, !clang.imprecise_release !0
   unreachable
 
 invoke.cont566:                                   ; preds = %invoke.cont.i2148
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
           to label %invoke.cont572 unwind label %lpad571
 
 invoke.cont572:                                   ; preds = %invoke.cont566
-  %call582 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+  %call582 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
           to label %invoke.cont581 unwind label %lpad580
 
 invoke.cont581:                                   ; preds = %invoke.cont572
@@ -1927,7 +1927,7 @@ if.then10:                                        ; preds = %entry
   br label %if.end13
 
 if.end13:                                         ; preds = %if.then10, %entry
-  %0 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*, i64, i8*, i8)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i64 2, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_2 to i8*), i8 signext 0), !clang.arc.no_objc_arc_exceptions !0
+  %0 = call i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*, i8*, i8*, i64, i8*, i8)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i64 2, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_2 to i8*), i8 signext 0), !clang.arc.no_objc_arc_exceptions !0
   br i1 undef, label %if.then17, label %if.end18
 
 if.then17:                                        ; preds = %if.end13
@@ -2162,14 +2162,14 @@ if.then398:                                       ; preds = %if.end392
   br label %if.end399
 
 if.end399:                                        ; preds = %if.then398, %if.end392
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*)*)(i8* undef, i8* undef)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*, i8*)*)(i8* undef, i8* undef)
           to label %eh.cont unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0
 
 eh.cont:                                          ; preds = %if.end399
   br i1 undef, label %if.then430, label %if.end439.critedge
 
 if.then430:                                       ; preds = %eh.cont
-  %1 = call i8* @objc_retain(i8* %0)
+  %1 = call i8* @llvm.objc.retain(i8* %0)
   br label %if.end439
 
 lpad:                                             ; preds = %if.end399
@@ -2178,11 +2178,11 @@ lpad:                                             ; preds = %if.end399
   unreachable
 
 if.end439.critedge:                               ; preds = %eh.cont
-  %3 = call i8* @objc_retain(i8* %0)
+  %3 = call i8* @llvm.objc.retain(i8* %0)
   br label %if.end439
 
 if.end439:                                        ; preds = %if.end439.critedge, %if.then430
-  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %0), !clang.imprecise_release !0
   unreachable
 
 return:                                           ; No predecessors!
diff --git a/test/Transforms/ObjCARC/pointer-types.ll b/test/Transforms/ObjCARC/pointer-types.ll
index 257560d9f7b206ac2239ab19534b3df07254bb36..b7fcad0360d2c719bfbb0326f393b02ce915061a 100644
--- a/test/Transforms/ObjCARC/pointer-types.ll
+++ b/test/Transforms/ObjCARC/pointer-types.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -objc-arc -S < %s | FileCheck %s
 
-; Don't hoist @objc_release past a use of its pointer, even
+; Don't hoist @llvm.objc.release past a use of its pointer, even
 ; if the use has function type, because clang uses function types
 ; in dubious ways.
 ; rdar://10551239
@@ -9,7 +9,7 @@
 ; CHECK: %otherBlock = phi void ()* [ %b1, %if.then ], [ null, %entry ]
 ; CHECK-NEXT: call void @use_fptr(void ()* %otherBlock)
 ; CHECK-NEXT: %tmp11 = bitcast void ()* %otherBlock to i8*
-; CHECK-NEXT: call void @objc_release(i8* %tmp11)
+; CHECK-NEXT: call void @llvm.objc.release(i8* %tmp11)
 
 define void @test0(i1 %tobool, void ()* %b1) {
 entry:
@@ -22,10 +22,10 @@ if.end:                                           ; preds = %if.then, %entry
   %otherBlock = phi void ()* [ %b1, %if.then ], [ null, %entry ]
   call void @use_fptr(void ()* %otherBlock)
   %tmp11 = bitcast void ()* %otherBlock to i8*
-  call void @objc_release(i8* %tmp11) nounwind
+  call void @llvm.objc.release(i8* %tmp11) nounwind
   ret void
 }
 
 declare void @use_fptr(void ()*)
-declare void @objc_release(i8*)
+declare void @llvm.objc.release(i8*)
 
diff --git a/test/Transforms/ObjCARC/post-inlining.ll b/test/Transforms/ObjCARC/post-inlining.ll
index b2d6112cf4d9fc06a7a5630a9957985d78ba10f8..0304d59c8b992cf98918beafcc2fe87526f77196 100644
--- a/test/Transforms/ObjCARC/post-inlining.ll
+++ b/test/Transforms/ObjCARC/post-inlining.ll
@@ -2,9 +2,9 @@
 
 declare void @use_pointer(i8*)
 declare i8* @returner()
-declare i8* @objc_retain(i8*)
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
 
 ; Clean up residue left behind after inlining.
 
@@ -14,8 +14,8 @@ declare i8* @objc_retainAutoreleasedReturnValue(i8*)
 ; CHECK-NEXT: }
 define void @test0(i8* %call.i) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %call.i) nounwind
-  %1 = tail call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %call.i) nounwind
+  %1 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %0) nounwind
   ret void
 }
 
@@ -27,8 +27,8 @@ entry:
 ; CHECK-NEXT: }
 define void @test1(i8* %call.i) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %call.i) nounwind
-  %1 = tail call i8* @objc_autoreleaseReturnValue(i8* %call.i) nounwind
+  %0 = tail call i8* @llvm.objc.retain(i8* %call.i) nounwind
+  %1 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %call.i) nounwind
   ret void
 }
 
@@ -41,8 +41,8 @@ entry:
 ; CHECK-NEXT: }
 define void @test24(i8* %p) {
 entry:
-  call i8* @objc_autoreleaseReturnValue(i8* %p) nounwind
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %p) nounwind
+  call i8* @llvm.objc.autoreleaseReturnValue(i8* %p) nounwind
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p) nounwind
   call void @use_pointer(i8* %p)
   ret void
 }
diff --git a/test/Transforms/ObjCARC/pr12270.ll b/test/Transforms/ObjCARC/pr12270.ll
index bdff0d7b4d581561d7da30439c274adfa041717f..b1d99024bc6d2967898b3d2a57ff059570182557 100644
--- a/test/Transforms/ObjCARC/pr12270.ll
+++ b/test/Transforms/ObjCARC/pr12270.ll
@@ -8,14 +8,14 @@ entry:
 
 return:                                           ; No predecessors!
   %bar = bitcast %2* %x to i8*
-  %foo = call i8* @objc_autoreleaseReturnValue(i8* %bar) nounwind
+  %foo = call i8* @llvm.objc.autoreleaseReturnValue(i8* %bar) nounwind
   call void @callee()
   call void @use_pointer(i8* %foo)
-  call void @objc_release(i8* %foo) nounwind
+  call void @llvm.objc.release(i8* %foo) nounwind
   ret void
 }
 
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare void @objc_release(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare void @llvm.objc.release(i8*)
 declare void @callee()
 declare void @use_pointer(i8*)
diff --git a/test/Transforms/ObjCARC/retain-block-side-effects.ll b/test/Transforms/ObjCARC/retain-block-side-effects.ll
index 5f5def9ff036eabbe52b93e0610916dfc5246d30..a980ffd63a34c1d9a3282a7d80d6d68203b3218a 100644
--- a/test/Transforms/ObjCARC/retain-block-side-effects.ll
+++ b/test/Transforms/ObjCARC/retain-block-side-effects.ll
@@ -4,7 +4,7 @@
 ; objc_retainBlock stores into %repeater so the load from after the
 ; call isn't forwardable from the store before the call.
 
-; CHECK: %tmp16 = call i8* @objc_retainBlock(i8* %tmp15) [[NUW:#[0-9]+]]
+; CHECK: %tmp16 = call i8* @llvm.objc.retainBlock(i8* %tmp15) [[NUW:#[0-9]+]]
 ; CHECK: %tmp17 = bitcast i8* %tmp16 to void ()*
 ; CHECK: %tmp18 = load %struct.__block_byref_repeater*, %struct.__block_byref_repeater** %byref.forwarding, align 8
 ; CHECK: %repeater12 = getelementptr inbounds %struct.__block_byref_repeater, %struct.__block_byref_repeater* %tmp18, i64 0, i32 6
@@ -27,7 +27,7 @@ entry:
   %tmp14 = bitcast %struct.__block_byref_repeater* %repeater to i8*
   store i8* %tmp14, i8** %block.captured11, align 8
   %tmp15 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0*, i8* }>* %block to i8*
-  %tmp16 = call i8* @objc_retainBlock(i8* %tmp15) nounwind
+  %tmp16 = call i8* @llvm.objc.retainBlock(i8* %tmp15) nounwind
   %tmp17 = bitcast i8* %tmp16 to void ()*
   %tmp18 = load %struct.__block_byref_repeater*, %struct.__block_byref_repeater** %byref.forwarding, align 8
   %repeater12 = getelementptr inbounds %struct.__block_byref_repeater, %struct.__block_byref_repeater* %tmp18, i64 0, i32 6
@@ -36,7 +36,7 @@ entry:
   ret void
 }
 
-declare i8* @objc_retainBlock(i8*)
+declare i8* @llvm.objc.retainBlock(i8*)
 
 ; CHECK: attributes #0 = { noreturn }
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/ObjCARC/retain-not-declared.ll b/test/Transforms/ObjCARC/retain-not-declared.ll
index f7ac908a76e549c8f2ace2aa67c5741463450b9c..7df51596d9ce6ee17a0acdbc520165ccdf7153a8 100644
--- a/test/Transforms/ObjCARC/retain-not-declared.ll
+++ b/test/Transforms/ObjCARC/retain-not-declared.ll
@@ -1,11 +1,11 @@
 ; RUN: opt -S -objc-arc -objc-arc-contract < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-declare i8* @objc_unretainedObject(i8*)
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare i8* @objc_msgSend(i8*, i8*, ...)
-declare void @objc_release(i8*)
+declare i8* @llvm.objc.unretainedObject(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.msgSend(i8*, i8*, ...)
+declare void @llvm.objc.release(i8*)
 
 ; Test that the optimizer can create an objc_retainAutoreleaseReturnValue
 ; declaration even if no objc_retain declaration exists.
@@ -13,41 +13,41 @@ declare void @objc_release(i8*)
 
 ; CHECK:      define i8* @test0(i8* %p) {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   %0 = tail call i8* @objc_retainAutoreleaseReturnValue(i8* %p) [[NUW:#[0-9]+]]
+; CHECK-NEXT:   %0 = tail call i8* @llvm.objc.retainAutoreleaseReturnValue(i8* %p) [[NUW:#[0-9]+]]
 ; CHECK-NEXT:   ret i8* %0
 ; CHECK-NEXT: }
 
 define i8* @test0(i8* %p) {
 entry:
-  %call = tail call i8* @objc_unretainedObject(i8* %p)
-  %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
-  %1 = tail call i8* @objc_autoreleaseReturnValue(i8* %call) nounwind
+  %call = tail call i8* @llvm.objc.unretainedObject(i8* %p)
+  %0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
+  %1 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %call) nounwind
   ret i8* %call
 }
 
-; Properly create the @objc_retain declaration when it doesn't already exist.
+; Properly create the @llvm.objc.retain declaration when it doesn't already exist.
 ; rdar://9825114
 
 ; CHECK-LABEL: @test1(
-; CHECK: @objc_retain(
-; CHECK: @objc_retainAutoreleasedReturnValue(
-; CHECK: @objc_release(
-; CHECK: @objc_release(
+; CHECK: @llvm.objc.retain
+; CHECK: @llvm.objc.retainAutoreleasedReturnValue(
+; CHECK: @llvm.objc.release
+; CHECK: @llvm.objc.release
 ; CHECK: }
 define void @test1(i8* %call88) nounwind personality i32 (...)* @__gxx_personality_v0 {
 entry:
-  %tmp1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call88) nounwind
-  %call94 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*)*)(i8* %tmp1)
+  %tmp1 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call88) nounwind
+  %call94 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to i8* (i8*)*)(i8* %tmp1)
           to label %invoke.cont93 unwind label %lpad91
 
 invoke.cont93:                                    ; preds = %entry
-  %tmp2 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call94) nounwind
-  call void @objc_release(i8* %tmp1) nounwind
-  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*)*)(i8* %tmp2)
+  %tmp2 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call94) nounwind
+  call void @llvm.objc.release(i8* %tmp1) nounwind
+  invoke void bitcast (i8* (i8*, i8*, ...)* @llvm.objc.msgSend to void (i8*)*)(i8* %tmp2)
           to label %invoke.cont102 unwind label %lpad100
 
 invoke.cont102:                                   ; preds = %invoke.cont93
-  call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp2) nounwind, !clang.imprecise_release !0
   unreachable
 
 lpad91:                                           ; preds = %entry
@@ -58,7 +58,7 @@ lpad91:                                           ; preds = %entry
 lpad100:                                          ; preds = %invoke.cont93
   %exn100 = landingpad {i8*, i32}
               cleanup
-  call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %tmp2) nounwind, !clang.imprecise_release !0
   unreachable
 }
 
diff --git a/test/Transforms/ObjCARC/rle-s2l.ll b/test/Transforms/ObjCARC/rle-s2l.ll
index 2865c94dc88c6aaa3007f4dbe245598a1f96e943..5bf63f20aa08c25bac010510dd4e5083eb588caf 100644
--- a/test/Transforms/ObjCARC/rle-s2l.ll
+++ b/test/Transforms/ObjCARC/rle-s2l.ll
@@ -1,71 +1,71 @@
 ; RUN: opt -S -basicaa -objc-arc < %s | FileCheck %s
 
-declare i8* @objc_loadWeak(i8**)
-declare i8* @objc_loadWeakRetained(i8**)
-declare i8* @objc_storeWeak(i8**, i8*)
-declare i8* @objc_initWeak(i8**, i8*)
+declare i8* @llvm.objc.loadWeak(i8**)
+declare i8* @llvm.objc.loadWeakRetained(i8**)
+declare i8* @llvm.objc.storeWeak(i8**, i8*)
+declare i8* @llvm.objc.initWeak(i8**, i8*)
 declare void @use_pointer(i8*)
 declare void @callee()
 
-; Basic redundant @objc_loadWeak elimination.
+; Basic redundant @llvm.objc.loadWeak elimination.
 
 ; CHECK:      define void @test0(i8** %p) {
-; CHECK-NEXT:   %y = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   %y = call i8* @llvm.objc.loadWeak(i8** %p)
 ; CHECK-NEXT:   call void @use_pointer(i8* %y)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test0(i8** %p) {
-  %x = call i8* @objc_loadWeak(i8** %p)
-  %y = call i8* @objc_loadWeak(i8** %p)
+  %x = call i8* @llvm.objc.loadWeak(i8** %p)
+  %y = call i8* @llvm.objc.loadWeak(i8** %p)
   call void @use_pointer(i8* %y)
   ret void
 }
 
-; DCE the @objc_loadWeak.
+; DCE the @llvm.objc.loadWeak.
 
 ; CHECK:      define void @test1(i8** %p) {
-; CHECK-NEXT:   %y = call i8* @objc_loadWeakRetained(i8** %p)
+; CHECK-NEXT:   %y = call i8* @llvm.objc.loadWeakRetained(i8** %p)
 ; CHECK-NEXT:   call void @use_pointer(i8* %y)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test1(i8** %p) {
-  %x = call i8* @objc_loadWeak(i8** %p)
-  %y = call i8* @objc_loadWeakRetained(i8** %p)
+  %x = call i8* @llvm.objc.loadWeak(i8** %p)
+  %y = call i8* @llvm.objc.loadWeakRetained(i8** %p)
   call void @use_pointer(i8* %y)
   ret void
 }
 
-; Basic redundant @objc_loadWeakRetained elimination.
+; Basic redundant @llvm.objc.loadWeakRetained elimination.
 
 ; CHECK:      define void @test2(i8** %p) {
-; CHECK-NEXT:   %x = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   %x = call i8* @llvm.objc.loadWeak(i8** %p)
 ; CHECK-NEXT:   store i8 3, i8* %x
-; CHECK-NEXT:   %1 = tail call i8* @objc_retain(i8* %x)
+; CHECK-NEXT:   %1 = tail call i8* @llvm.objc.retain(i8* %x)
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test2(i8** %p) {
-  %x = call i8* @objc_loadWeak(i8** %p)
+  %x = call i8* @llvm.objc.loadWeak(i8** %p)
   store i8 3, i8* %x
-  %y = call i8* @objc_loadWeakRetained(i8** %p)
+  %y = call i8* @llvm.objc.loadWeakRetained(i8** %p)
   call void @use_pointer(i8* %y)
   ret void
 }
 
-; Basic redundant @objc_loadWeakRetained elimination, this time
+; Basic redundant @llvm.objc.loadWeakRetained elimination, this time
 ; with a readonly call instead of a store.
 
 ; CHECK:      define void @test3(i8** %p) {
-; CHECK-NEXT:   %x = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   %x = call i8* @llvm.objc.loadWeak(i8** %p)
 ; CHECK-NEXT:   call void @use_pointer(i8* %x) [[RO:#[0-9]+]]
-; CHECK-NEXT:   %1 = tail call i8* @objc_retain(i8* %x)
+; CHECK-NEXT:   %1 = tail call i8* @llvm.objc.retain(i8* %x)
 ; CHECK-NEXT:   call void @use_pointer(i8* %x)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test3(i8** %p) {
-  %x = call i8* @objc_loadWeak(i8** %p)
+  %x = call i8* @llvm.objc.loadWeak(i8** %p)
   call void @use_pointer(i8* %x) readonly
-  %y = call i8* @objc_loadWeakRetained(i8** %p)
+  %y = call i8* @llvm.objc.loadWeakRetained(i8** %p)
   call void @use_pointer(i8* %y)
   ret void
 }
@@ -73,18 +73,18 @@ define void @test3(i8** %p) {
 ; A regular call blocks redundant weak load elimination.
 
 ; CHECK:      define void @test4(i8** %p) {
-; CHECK-NEXT:   %x = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   %x = call i8* @llvm.objc.loadWeak(i8** %p)
 ; CHECK-NEXT:   call void @use_pointer(i8* %x) [[RO]]
 ; CHECK-NEXT:   call void @callee()
-; CHECK-NEXT:   %y = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   %y = call i8* @llvm.objc.loadWeak(i8** %p)
 ; CHECK-NEXT:   call void @use_pointer(i8* %y)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test4(i8** %p) {
-  %x = call i8* @objc_loadWeak(i8** %p)
+  %x = call i8* @llvm.objc.loadWeak(i8** %p)
   call void @use_pointer(i8* %x) readonly
   call void @callee()
-  %y = call i8* @objc_loadWeak(i8** %p)
+  %y = call i8* @llvm.objc.loadWeak(i8** %p)
   call void @use_pointer(i8* %y)
   ret void
 }
@@ -92,13 +92,13 @@ define void @test4(i8** %p) {
 ; Store to load forwarding.
 
 ; CHECK:      define void @test5(i8** %p, i8* %n) {
-; CHECK-NEXT:   %1 = call i8* @objc_storeWeak(i8** %p, i8* %n)
+; CHECK-NEXT:   %1 = call i8* @llvm.objc.storeWeak(i8** %p, i8* %n)
 ; CHECK-NEXT:   call void @use_pointer(i8* %n)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test5(i8** %p, i8* %n) {
-  call i8* @objc_storeWeak(i8** %p, i8* %n)
-  %y = call i8* @objc_loadWeak(i8** %p)
+  call i8* @llvm.objc.storeWeak(i8** %p, i8* %n)
+  %y = call i8* @llvm.objc.loadWeak(i8** %p)
   call void @use_pointer(i8* %y)
   ret void
 }
@@ -106,13 +106,13 @@ define void @test5(i8** %p, i8* %n) {
 ; Store to load forwarding with objc_initWeak.
 
 ; CHECK:      define void @test6(i8** %p, i8* %n) {
-; CHECK-NEXT:   %1 = call i8* @objc_initWeak(i8** %p, i8* %n)
+; CHECK-NEXT:   %1 = call i8* @llvm.objc.initWeak(i8** %p, i8* %n)
 ; CHECK-NEXT:   call void @use_pointer(i8* %n)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test6(i8** %p, i8* %n) {
-  call i8* @objc_initWeak(i8** %p, i8* %n)
-  %y = call i8* @objc_loadWeak(i8** %p)
+  call i8* @llvm.objc.initWeak(i8** %p, i8* %n)
+  %y = call i8* @llvm.objc.loadWeak(i8** %p)
   call void @use_pointer(i8* %y)
   ret void
 }
@@ -120,16 +120,16 @@ define void @test6(i8** %p, i8* %n) {
 ; Don't forward if there's a may-alias store in the way.
 
 ; CHECK:      define void @test7(i8** %p, i8* %n, i8** %q, i8* %m) {
-; CHECK-NEXT:   call i8* @objc_initWeak(i8** %p, i8* %n)
-; CHECK-NEXT:   call i8* @objc_storeWeak(i8** %q, i8* %m)
-; CHECK-NEXT:   %y = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   call i8* @llvm.objc.initWeak(i8** %p, i8* %n)
+; CHECK-NEXT:   call i8* @llvm.objc.storeWeak(i8** %q, i8* %m)
+; CHECK-NEXT:   %y = call i8* @llvm.objc.loadWeak(i8** %p)
 ; CHECK-NEXT:   call void @use_pointer(i8* %y)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 define void @test7(i8** %p, i8* %n, i8** %q, i8* %m) {
-  call i8* @objc_initWeak(i8** %p, i8* %n)
-  call i8* @objc_storeWeak(i8** %q, i8* %m)
-  %y = call i8* @objc_loadWeak(i8** %p)
+  call i8* @llvm.objc.initWeak(i8** %p, i8* %n)
+  call i8* @llvm.objc.storeWeak(i8** %q, i8* %m)
+  %y = call i8* @llvm.objc.loadWeak(i8** %p)
   call void @use_pointer(i8* %y)
   ret void
 }
diff --git a/test/Transforms/ObjCARC/rv.ll b/test/Transforms/ObjCARC/rv.ll
index 425f86cafb324b5207b6d7d84f734d0c16205b53..604ef59f119d16dcf54f9176cecf428b7cbff9cc 100644
--- a/test/Transforms/ObjCARC/rv.ll
+++ b/test/Transforms/ObjCARC/rv.ll
@@ -2,15 +2,15 @@
 
 target datalayout = "e-p:64:64:64"
 
-declare i8* @objc_retain(i8*)
-declare i8* @objc_retainAutoreleasedReturnValue(i8*)
-declare void @objc_release(i8*)
-declare i8* @objc_autorelease(i8*)
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare i8* @objc_retainAutoreleaseReturnValue(i8*)
-declare void @objc_autoreleasePoolPop(i8*)
-declare void @objc_autoreleasePoolPush()
-declare i8* @objc_retainBlock(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.autorelease(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.retainAutoreleaseReturnValue(i8*)
+declare void @llvm.objc.autoreleasePoolPop(i8*)
+declare void @llvm.objc.autoreleasePoolPush()
+declare i8* @llvm.objc.retainBlock(i8*)
 
 declare i8* @objc_retainedObject(i8*)
 declare i8* @objc_unretainedObject(i8*)
@@ -29,17 +29,17 @@ declare i8* @returner()
 ; CHECK-LABEL:      define void @test0(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   %x = call i8* @returner
-; CHECK-NEXT:   %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %x) [[NUW:#[0-9]+]]
+; CHECK-NEXT:   %0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %x) [[NUW:#[0-9]+]]
 ; CHECK: t:
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: return:
-; CHECK-NEXT: call void @objc_release(i8* %x)
+; CHECK-NEXT: call void @llvm.objc.release(i8* %x)
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 define void @test0(i1 %p) nounwind {
 entry:
   %x = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %x)
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %x)
   br i1 %p, label %t, label %return
 
 t:
@@ -48,19 +48,19 @@ t:
   br label %return
 
 return:
-  call void @objc_release(i8* %x) nounwind
+  call void @llvm.objc.release(i8* %x) nounwind
   ret void
 }
 
 ; Delete no-ops.
 
 ; CHECK-LABEL: define void @test2(
-; CHECK-NOT: @objc_
+; CHECK-NOT: @llvm.objc.
 ; CHECK: }
 define void @test2() {
-  call i8* @objc_retainAutoreleasedReturnValue(i8* null)
-  call i8* @objc_autoreleaseReturnValue(i8* null)
-  ; call i8* @objc_retainAutoreleaseReturnValue(i8* null) ; TODO
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* null)
+  call i8* @llvm.objc.autoreleaseReturnValue(i8* null)
+  ; call i8* @llvm.objc.retainAutoreleaseReturnValue(i8* null) ; TODO
   ret void
 }
 
@@ -73,8 +73,8 @@ define void @test2() {
 define i8* @test3() {
 entry:
   %call = call i8* @returner()
-  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
-  %1 = call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %call) nounwind
+  %1 = call i8* @llvm.objc.autoreleaseReturnValue(i8* %0) nounwind
   ret i8* %1
 }
 
@@ -87,8 +87,8 @@ entry:
 define i8* @test4() {
 entry:
   %call = call i8* @returner()
-  %0 = call i8* @objc_retain(i8* %call) nounwind
-  %1 = call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %call) nounwind
+  %1 = call i8* @llvm.objc.autoreleaseReturnValue(i8* %0) nounwind
   ret i8* %1
 }
 
@@ -102,7 +102,7 @@ entry:
 ;define i8* @test5() {
 ;entry:
 ;  %call = call i8* @returner()
-;  %0 = call i8* @objc_retainAutoreleaseReturnValue(i8* %call) nounwind
+;  %0 = call i8* @llvm.objc.retainAutoreleaseReturnValue(i8* %call) nounwind
 ;  ret i8* %0
 ;}
 
@@ -115,45 +115,45 @@ entry:
 ; Those entrypoints don't exist yet though.
 
 ; CHECK-LABEL: define i8* @test7(
-; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
-; CHECK: %t = tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+; CHECK: call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
+; CHECK: %t = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
 define i8* @test7() {
   %p = call i8* @returner()
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
-  %t = call i8* @objc_autoreleaseReturnValue(i8* %p)
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
+  %t = call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
   call void @use_pointer(i8* %p)
   ret i8* %t
 }
 
 ; CHECK-LABEL: define i8* @test7b(
-; CHECK: call i8* @objc_retain(i8* %p)
-; CHECK: %t = tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+; CHECK: call i8* @llvm.objc.retain(i8* %p)
+; CHECK: %t = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
 define i8* @test7b() {
   %p = call i8* @returner()
   call void @use_pointer(i8* %p)
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
-  %t = call i8* @objc_autoreleaseReturnValue(i8* %p)
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
+  %t = call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
   ret i8* %p
 }
 
 ; Don't apply the RV optimization to autorelease if there's no retain.
 
 ; CHECK: define i8* @test9(i8* %p)
-; CHECK: call i8* @objc_autorelease(i8* %p)
+; CHECK: call i8* @llvm.objc.autorelease(i8* %p)
 define i8* @test9(i8* %p) {
-  call i8* @objc_autorelease(i8* %p)
+  call i8* @llvm.objc.autorelease(i8* %p)
   ret i8* %p
 }
 
 ; Do not apply the RV optimization.
 
 ; CHECK: define i8* @test10(i8* %p)
-; CHECK: tail call i8* @objc_retain(i8* %p) [[NUW]]
-; CHECK: call i8* @objc_autorelease(i8* %p) [[NUW]]
+; CHECK: tail call i8* @llvm.objc.retain(i8* %p) [[NUW]]
+; CHECK: call i8* @llvm.objc.autorelease(i8* %p) [[NUW]]
 ; CHECK-NEXT: ret i8* %p
 define i8* @test10(i8* %p) {
-  %1 = call i8* @objc_retain(i8* %p)
-  %2 = call i8* @objc_autorelease(i8* %p)
+  %1 = call i8* @llvm.objc.retain(i8* %p)
+  %2 = call i8* @llvm.objc.autorelease(i8* %p)
   ret i8* %p
 }
 
@@ -161,42 +161,42 @@ define i8* @test10(i8* %p) {
 ; could undo the retain.
 
 ; CHECK: define i8* @test11(i8* %p)
-; CHECK: tail call i8* @objc_retain(i8* %p)
+; CHECK: tail call i8* @llvm.objc.retain(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
-; CHECK: call i8* @objc_autorelease(i8* %p)
+; CHECK: call i8* @llvm.objc.autorelease(i8* %p)
 ; CHECK-NEXT: ret i8* %p
 define i8* @test11(i8* %p) {
-  %1 = call i8* @objc_retain(i8* %p)
+  %1 = call i8* @llvm.objc.retain(i8* %p)
   call void @use_pointer(i8* %p)
-  %2 = call i8* @objc_autorelease(i8* %p)
+  %2 = call i8* @llvm.objc.autorelease(i8* %p)
   ret i8* %p
 }
 
 ; Don't spoil the RV optimization.
 
 ; CHECK: define i8* @test12(i8* %p)
-; CHECK: tail call i8* @objc_retain(i8* %p)
+; CHECK: tail call i8* @llvm.objc.retain(i8* %p)
 ; CHECK: call void @use_pointer(i8* %p)
-; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+; CHECK: tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
 ; CHECK: ret i8* %p
 define i8* @test12(i8* %p) {
-  %1 = call i8* @objc_retain(i8* %p)
+  %1 = call i8* @llvm.objc.retain(i8* %p)
   call void @use_pointer(i8* %p)
-  %2 = call i8* @objc_autoreleaseReturnValue(i8* %p)
+  %2 = call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
   ret i8* %p
 }
 
 ; Don't zap the objc_retainAutoreleasedReturnValue.
 
 ; CHECK-LABEL: define i8* @test13(
-; CHECK: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
-; CHECK: call i8* @objc_autorelease(i8* %p)
+; CHECK: tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
+; CHECK: call i8* @llvm.objc.autorelease(i8* %p)
 ; CHECK: ret i8* %p
 define i8* @test13() {
   %p = call i8* @returner()
-  %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  %1 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
   call void @callee()
-  %2 = call i8* @objc_autorelease(i8* %p)
+  %2 = call i8* @llvm.objc.autorelease(i8* %p)
   ret i8* %p
 }
 
@@ -204,10 +204,10 @@ define i8* @test13() {
 ; argument is not a return value.
 
 ; CHECK-LABEL: define void @test14(
-; CHECK-NEXT: tail call i8* @objc_retain(i8* %p) [[NUW]]
+; CHECK-NEXT: tail call i8* @llvm.objc.retain(i8* %p) [[NUW]]
 ; CHECK-NEXT: ret void
 define void @test14(i8* %p) {
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
   ret void
 }
 
@@ -216,11 +216,11 @@ define void @test14(i8* %p) {
 
 ; CHECK-LABEL: define void @test15(
 ; CHECK-NEXT: %y = call i8* @returner()
-; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) [[NUW]]
+; CHECK-NEXT: tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %y) [[NUW]]
 ; CHECK-NEXT: ret void
 define void @test15() {
   %y = call i8* @returner()
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %y)
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %y)
   ret void
 }
 
@@ -229,54 +229,54 @@ define void @test15() {
 ; CHECK: define i8* @test19(i8* %p) {
 ; CHECK-NEXT: ret i8* %p
 define i8* @test19(i8* %p) {
-  call i8* @objc_autoreleaseReturnValue(i8* %p)
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
   ret i8* %p
 }
 
 ; Like test19 but with plain autorelease.
 
 ; CHECK: define i8* @test20(i8* %p) {
-; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
-; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.autorelease(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %p)
 ; CHECK-NEXT: ret i8* %p
 define i8* @test20(i8* %p) {
-  call i8* @objc_autorelease(i8* %p)
-  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  call i8* @llvm.objc.autorelease(i8* %p)
+  call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %p)
   ret i8* %p
 }
 
 ; Like test19 but with plain retain.
 
 ; CHECK: define i8* @test21(i8* %p) {
-; CHECK-NEXT: call i8* @objc_autoreleaseReturnValue(i8* %p)
-; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %p)
 ; CHECK-NEXT: ret i8* %p
 define i8* @test21(i8* %p) {
-  call i8* @objc_autoreleaseReturnValue(i8* %p)
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   ret i8* %p
 }
 
 ; Like test19 but with plain retain and autorelease.
 
 ; CHECK: define i8* @test22(i8* %p) {
-; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
-; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.autorelease(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %p)
 ; CHECK-NEXT: ret i8* %p
 define i8* @test22(i8* %p) {
-  call i8* @objc_autorelease(i8* %p)
-  call i8* @objc_retain(i8* %p)
+  call i8* @llvm.objc.autorelease(i8* %p)
+  call i8* @llvm.objc.retain(i8* %p)
   ret i8* %p
 }
 
 ; Convert autoreleaseRV to autorelease.
 
 ; CHECK-LABEL: define void @test23(
-; CHECK: call i8* @objc_autorelease(i8* %p) [[NUW]]
+; CHECK: call i8* @llvm.objc.autorelease(i8* %p) [[NUW]]
 define void @test23(i8* %p) {
   store i8 0, i8* %p
-  call i8* @objc_autoreleaseReturnValue(i8* %p)
+  call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
   ret void
 }
 
@@ -284,9 +284,9 @@ define void @test23(i8* %p) {
 ; even through a bitcast.
 
 ; CHECK-LABEL: define {}* @test24(
-; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+; CHECK: tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
 define {}* @test24(i8* %p) {
-  %t = call i8* @objc_autoreleaseReturnValue(i8* %p)
+  %t = call i8* @llvm.objc.autoreleaseReturnValue(i8* %p)
   %s = bitcast i8* %p to {}*
   ret {}* %s
 }
@@ -301,16 +301,16 @@ declare void @somecall_test25();
 
 ; CHECK-LABEL: define void @test25(
 ; CHECK: %[[CALL1:.*]] = call i8* @second_test25(
-; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %[[CALL1]])
+; CHECK-NEXT: tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %[[CALL1]])
 
 define void @test25() {
   %first = call i8* @first_test25()
-  %v0 = call i8* @objc_retain(i8* %first)
+  %v0 = call i8* @llvm.objc.retain(i8* %first)
   call void @somecall_test25()
   %second = call i8* @second_test25(i8* %first)
-  %call2 = call i8* @objc_retainAutoreleasedReturnValue(i8* %second)
-  call void @objc_release(i8* %second), !clang.imprecise_release !0
-  call void @objc_release(i8* %first), !clang.imprecise_release !0
+  %call2 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %second)
+  call void @llvm.objc.release(i8* %second), !clang.imprecise_release !0
+  call void @llvm.objc.release(i8* %first), !clang.imprecise_release !0
   ret void
 }
 
@@ -324,10 +324,10 @@ define void @test25() {
 define i8* @test26() {
 bb0:
   %v0 = call i8* @returner()
-  %v1 = tail call i8* @objc_retain(i8* %v0)
+  %v1 = tail call i8* @llvm.objc.retain(i8* %v0)
   br label %bb1
 bb1:
-  %v2 = tail call i8* @objc_autoreleaseReturnValue(i8* %v1)
+  %v2 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %v1)
   br label %bb2
 bb2:
   ret i8* %v2
@@ -336,14 +336,14 @@ bb2:
 declare i32* @func27(i32);
 
 ; Check that ObjCARCOpt::OptimizeAutoreleaseRVCall doesn't turn a call to
-; @objc_autoreleaseReturnValue into a call to @objc_autorelease when a return
-; instruction uses a value equivalent to @objc_autoreleaseReturnValue's operand.
+; @llvm.objc.autoreleaseReturnValue into a call to @llvm.objc.autorelease when a return
+; instruction uses a value equivalent to @llvm.objc.autoreleaseReturnValue's operand.
 ; In the code below, %phival and %retval are considered equivalent.
 
 ; CHECK-LABEL: define i32* @test27(
 ; CHECK: %[[PHIVAL:.*]] = phi i8* [ %{{.*}}, %bb1 ], [ %{{.*}}, %bb2 ]
 ; CHECK: %[[RETVAL:.*]] = phi i32* [ %{{.*}}, %bb1 ], [ %{{.*}}, %bb2 ]
-; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %[[PHIVAL]])
+; CHECK: tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %[[PHIVAL]])
 ; CHECK: ret i32* %[[RETVAL]]
 
 define i32* @test27(i1 %cond) {
@@ -360,7 +360,7 @@ bb2:
 bb3:
   %phival = phi i8* [ %v1, %bb1 ], [ %v3, %bb2 ]
   %retval = phi i32* [ %v0, %bb1 ], [ %v2, %bb2 ]
-  %v4 = tail call i8* @objc_autoreleaseReturnValue(i8* %phival)
+  %v4 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %phival)
   ret i32* %retval
 }
 
diff --git a/test/Transforms/ObjCARC/split-backedge.ll b/test/Transforms/ObjCARC/split-backedge.ll
index 6851487ed505389055b93ed02d5981fdc73b46ba..e9239ae21730988f2388da07ffc8f8819204f343 100644
--- a/test/Transforms/ObjCARC/split-backedge.ll
+++ b/test/Transforms/ObjCARC/split-backedge.ll
@@ -4,12 +4,12 @@
 ; rdar://11256239
 
 ; CHECK-LABEL: define void @test0(
-; CHECK: call i8* @objc_retain(i8* %call) [[NUW:#[0-9]+]]
-; CHECK: call i8* @objc_retain(i8* %call) [[NUW]]
-; CHECK: call i8* @objc_retain(i8* %cond) [[NUW]]
-; CHECK: call void @objc_release(i8* %call) [[NUW]]
-; CHECK: call void @objc_release(i8* %call) [[NUW]]
-; CHECK: call void @objc_release(i8* %cond) [[NUW]]
+; CHECK: call i8* @llvm.objc.retain(i8* %call) [[NUW:#[0-9]+]]
+; CHECK: call i8* @llvm.objc.retain(i8* %call) [[NUW]]
+; CHECK: call i8* @llvm.objc.retain(i8* %cond) [[NUW]]
+; CHECK: call void @llvm.objc.release(i8* %call) [[NUW]]
+; CHECK: call void @llvm.objc.release(i8* %call) [[NUW]]
+; CHECK: call void @llvm.objc.release(i8* %cond) [[NUW]]
 define void @test0() personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) {
 entry:
   br label %while.body
@@ -19,18 +19,18 @@ while.body:                                       ; preds = %while.cond
           to label %invoke.cont unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0
 
 invoke.cont:                                      ; preds = %while.body
-  %t0 = call i8* @objc_retain(i8* %call) nounwind
-  %t1 = call i8* @objc_retain(i8* %call) nounwind
+  %t0 = call i8* @llvm.objc.retain(i8* %call) nounwind
+  %t1 = call i8* @llvm.objc.retain(i8* %call) nounwind
   %call.i1 = invoke i8* @returner()
           to label %invoke.cont1 unwind label %lpad
 
 invoke.cont1:                                     ; preds = %invoke.cont
   %cond = select i1 undef, i8* null, i8* %call
-  %t2 = call i8* @objc_retain(i8* %cond) nounwind
-  call void @objc_release(i8* %call) nounwind
-  call void @objc_release(i8* %call) nounwind
+  %t2 = call i8* @llvm.objc.retain(i8* %cond) nounwind
+  call void @llvm.objc.release(i8* %call) nounwind
+  call void @llvm.objc.release(i8* %call) nounwind
   call void @use_pointer(i8* %cond)
-  call void @objc_release(i8* %cond) nounwind
+  call void @llvm.objc.release(i8* %cond) nounwind
   br label %while.body
 
 lpad:                                             ; preds = %invoke.cont, %while.body
@@ -41,8 +41,8 @@ lpad:                                             ; preds = %invoke.cont, %while
 
 declare i8* @returner()
 declare i32 @__objc_personality_v0(...)
-declare void @objc_release(i8*)
-declare i8* @objc_retain(i8*)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.retain(i8*)
 declare void @use_pointer(i8*)
 
 !0 = !{}
diff --git a/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll b/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll
index 3073abf7bf5ace81695ca28232ea1f3371f5f276..fcb28dd169cbb05da8c024e340b8097678c89ffa 100644
--- a/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll
+++ b/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll
@@ -1,23 +1,23 @@
 ; RUN: opt -objc-arc -S < %s | FileCheck %s
 
-declare void @objc_release(i8* %x)
-declare i8* @objc_retain(i8* %x)
-declare i8* @objc_autorelease(i8* %x)
-declare i8* @objc_autoreleaseReturnValue(i8* %x)
-declare i8* @objc_retainAutoreleasedReturnValue(i8* %x)
-declare i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %x)
+declare void @llvm.objc.release(i8* %x)
+declare i8* @llvm.objc.retain(i8* %x)
+declare i8* @llvm.objc.autorelease(i8* %x)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8* %x)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %x)
+declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %x)
 declare i8* @tmp(i8*)
 
 ; Never tail call objc_autorelease.
 
 ; CHECK: define i8* @test0(i8* %x) [[NUW:#[0-9]+]] {
-; CHECK: %tmp0 = call i8* @objc_autorelease(i8* %x) [[NUW]]
-; CHECK: %tmp1 = call i8* @objc_autorelease(i8* %x) [[NUW]]
+; CHECK: %tmp0 = call i8* @llvm.objc.autorelease(i8* %x) [[NUW]]
+; CHECK: %tmp1 = call i8* @llvm.objc.autorelease(i8* %x) [[NUW]]
 ; CHECK: }
 define i8* @test0(i8* %x) nounwind {
 entry:
-  %tmp0 = call i8* @objc_autorelease(i8* %x)
-  %tmp1 = tail call i8* @objc_autorelease(i8* %x)
+  %tmp0 = call i8* @llvm.objc.autorelease(i8* %x)
+  %tmp1 = tail call i8* @llvm.objc.autorelease(i8* %x)
 
   ret i8* %x
 }
@@ -25,78 +25,78 @@ entry:
 ; Always tail call autoreleaseReturnValue.
 
 ; CHECK: define i8* @test1(i8* %x) [[NUW]] {
-; CHECK: %tmp0 = tail call i8* @objc_autoreleaseReturnValue(i8* %x) [[NUW]]
-; CHECK: %tmp1 = tail call i8* @objc_autoreleaseReturnValue(i8* %x) [[NUW]]
+; CHECK: %tmp0 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %x) [[NUW]]
+; CHECK: %tmp1 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %x) [[NUW]]
 ; CHECK: }
 define i8* @test1(i8* %x) nounwind {
 entry:
-  %tmp0 = call i8* @objc_autoreleaseReturnValue(i8* %x)
-  %tmp1 = tail call i8* @objc_autoreleaseReturnValue(i8* %x)
+  %tmp0 = call i8* @llvm.objc.autoreleaseReturnValue(i8* %x)
+  %tmp1 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %x)
   ret i8* %x
 }
 
 ; Always tail call objc_retain.
 
 ; CHECK: define i8* @test2(i8* %x) [[NUW]] {
-; CHECK: %tmp0 = tail call i8* @objc_retain(i8* %x) [[NUW]]
-; CHECK: %tmp1 = tail call i8* @objc_retain(i8* %x) [[NUW]]
+; CHECK: %tmp0 = tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
+; CHECK: %tmp1 = tail call i8* @llvm.objc.retain(i8* %x) [[NUW]]
 ; CHECK: }
 define i8* @test2(i8* %x) nounwind {
 entry:
-  %tmp0 = call i8* @objc_retain(i8* %x)
-  %tmp1 = tail call i8* @objc_retain(i8* %x)
+  %tmp0 = call i8* @llvm.objc.retain(i8* %x)
+  %tmp1 = tail call i8* @llvm.objc.retain(i8* %x)
   ret i8* %x
 }
 
 ; Always tail call objc_retainAutoreleasedReturnValue.
 ; CHECK: define i8* @test3(i8* %x) [[NUW]] {
-; CHECK: %tmp0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) [[NUW]]
-; CHECK: %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %z) [[NUW]]
+; CHECK: %tmp0 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %y) [[NUW]]
+; CHECK: %tmp1 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %z) [[NUW]]
 ; CHECK: }
 define i8* @test3(i8* %x) nounwind {
 entry:
   %y = call i8* @tmp(i8* %x)
-  %tmp0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %y)
+  %tmp0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %y)
   %z = call i8* @tmp(i8* %x)
-  %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %z)
+  %tmp1 = tail call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %z)
   ret i8* %x
 }
 
 ; By itself, we should never change whether or not objc_release is tail called.
 
 ; CHECK: define void @test4(i8* %x) [[NUW]] {
-; CHECK: call void @objc_release(i8* %x) [[NUW]]
-; CHECK: tail call void @objc_release(i8* %x) [[NUW]]
+; CHECK: call void @llvm.objc.release(i8* %x) [[NUW]]
+; CHECK: tail call void @llvm.objc.release(i8* %x) [[NUW]]
 ; CHECK: }
 define void @test4(i8* %x) nounwind {
 entry:
-  call void @objc_release(i8* %x)
-  tail call void @objc_release(i8* %x)
+  call void @llvm.objc.release(i8* %x)
+  tail call void @llvm.objc.release(i8* %x)
   ret void
 }
 
-; If we convert a tail called @objc_autoreleaseReturnValue to an
-; @objc_autorelease, ensure that the tail call is removed.
+; If we convert a tail called @llvm.objc.autoreleaseReturnValue to an
+; @llvm.objc.autorelease, ensure that the tail call is removed.
 ; CHECK: define i8* @test5(i8* %x) [[NUW]] {
-; CHECK: %tmp0 = call i8* @objc_autorelease(i8* %x) [[NUW]]
+; CHECK: %tmp0 = call i8* @llvm.objc.autorelease(i8* %x) [[NUW]]
 ; CHECK: }
 define i8* @test5(i8* %x) nounwind {
 entry:
-  %tmp0 = tail call i8* @objc_autoreleaseReturnValue(i8* %x)
+  %tmp0 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %x)
   ret i8* %tmp0
 }
 
-; Always tail call objc_unsafeClaimAutoreleasedReturnValue.
+; Always tail call llvm.objc.unsafeClaimAutoreleasedReturnValue.
 ; CHECK: define i8* @test6(i8* %x) [[NUW]] {
-; CHECK: %tmp0 = tail call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %y) [[NUW]]
-; CHECK: %tmp1 = tail call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %z) [[NUW]]
+; CHECK: %tmp0 = tail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %y) [[NUW]]
+; CHECK: %tmp1 = tail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %z) [[NUW]]
 ; CHECK: }
 define i8* @test6(i8* %x) nounwind {
 entry:
   %y = call i8* @tmp(i8* %x)
-  %tmp0 = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %y)
+  %tmp0 = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %y)
   %z = call i8* @tmp(i8* %x)
-  %tmp1 = tail call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %z)
+  %tmp1 = tail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %z)
   ret i8* %x
 }
 
diff --git a/test/Transforms/ObjCARC/unsafe-claim-rv.ll b/test/Transforms/ObjCARC/unsafe-claim-rv.ll
index addd0c8f97369b78032457092cd2a0437c536a4e..8b64802565521fc6a5f0e787213e0bbd46391550 100644
--- a/test/Transforms/ObjCARC/unsafe-claim-rv.ll
+++ b/test/Transforms/ObjCARC/unsafe-claim-rv.ll
@@ -15,33 +15,33 @@
 ;
 ; And then hand-reduced further. 
 
-declare i8* @objc_autoreleaseReturnValue(i8*)
-declare i8* @objc_unsafeClaimAutoreleasedReturnValue(i8*)
-declare i8* @objc_retain(i8*)
-declare void @objc_release(i8*)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare void @llvm.objc.release(i8*)
 
 define void @foo(i8* %X) {
 entry:
-  %0 = tail call i8* @objc_retain(i8* %X) 
+  %0 = tail call i8* @llvm.objc.retain(i8* %X) 
   %tobool = icmp eq i8* %0, null
   br i1 %tobool, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
-  %1 = tail call i8* @objc_retain(i8* nonnull %0)
+  %1 = tail call i8* @llvm.objc.retain(i8* nonnull %0)
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry
   %Y.0 = phi i8* [ %1, %if.then ], [ null, %entry ]
-  %2 = tail call i8* @objc_autoreleaseReturnValue(i8* %Y.0)
-  %3 = tail call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %2)
-  tail call void @objc_release(i8* %0) 
+  %2 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %Y.0)
+  %3 = tail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %2)
+  tail call void @llvm.objc.release(i8* %0) 
   ret void
 }
 
 ; CHECK: if.then
-; CHECK: tail call i8* @objc_retain
-; CHECK-NEXT: call i8* @objc_autorelease
+; CHECK: tail call i8* @llvm.objc.retain
+; CHECK-NEXT: call i8* @llvm.objc.autorelease
 ; CHECK: %Y.0 = phi
-; CHECK-NEXT: tail call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %Y.0)
-; CHECK-NEXT: tail call void @objc_release
+; CHECK-NEXT: tail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %Y.0)
+; CHECK-NEXT: tail call void @llvm.objc.release
 
diff --git a/test/Transforms/ObjCARC/weak-contract.ll b/test/Transforms/ObjCARC/weak-contract.ll
index ca69c7087ddd87776b4ae109e4c6c9d9c6171b42..ca37711529153608b7d44ae2be8b16f6262d18b6 100644
--- a/test/Transforms/ObjCARC/weak-contract.ll
+++ b/test/Transforms/ObjCARC/weak-contract.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -objc-arc-contract -S < %s | FileCheck %s
 
-declare i8* @objc_initWeak(i8**, i8*)
+declare i8* @llvm.objc.initWeak(i8**, i8*)
 
 ; Convert objc_initWeak(p, null) to *p = null.
 
@@ -9,6 +9,6 @@ declare i8* @objc_initWeak(i8**, i8*)
 ; CHECK-NEXT:   ret i8* null
 ; CHECK-NEXT: }
 define i8* @test0(i8** %p) {
-  %t = call i8* @objc_initWeak(i8** %p, i8* null)
+  %t = call i8* @llvm.objc.initWeak(i8** %p, i8* null)
   ret i8* %t
 }
diff --git a/test/Transforms/ObjCARC/weak-copies.ll b/test/Transforms/ObjCARC/weak-copies.ll
index d3177bb727fc0d288cf3febba9461cd52a3c7f7f..440f3fb2bb3f7ac51516b94caf00e76c084ceda8 100644
--- a/test/Transforms/ObjCARC/weak-copies.ll
+++ b/test/Transforms/ObjCARC/weak-copies.ll
@@ -27,13 +27,13 @@ entry:
   %w = alloca i8*, align 8
   %x = alloca i8*, align 8
   %call = call i8* @bar()
-  %0 = call i8* @objc_initWeak(i8** %w, i8* %call) nounwind
-  %1 = call i8* @objc_loadWeak(i8** %w) nounwind
-  %2 = call i8* @objc_initWeak(i8** %x, i8* %1) nounwind
-  %3 = call i8* @objc_loadWeak(i8** %x) nounwind
+  %0 = call i8* @llvm.objc.initWeak(i8** %w, i8* %call) nounwind
+  %1 = call i8* @llvm.objc.loadWeak(i8** %w) nounwind
+  %2 = call i8* @llvm.objc.initWeak(i8** %x, i8* %1) nounwind
+  %3 = call i8* @llvm.objc.loadWeak(i8** %x) nounwind
   call void @use(i8* %3) nounwind
-  call void @objc_destroyWeak(i8** %x) nounwind
-  call void @objc_destroyWeak(i8** %w) nounwind
+  call void @llvm.objc.destroyWeak(i8** %x) nounwind
+  call void @llvm.objc.destroyWeak(i8** %w) nounwind
   ret void
 }
 
@@ -48,8 +48,8 @@ define void @qux(i8* %me) nounwind {
 entry:
   %w = alloca i8*, align 8
   %block = alloca %1, align 8
-  %0 = call i8* @objc_retain(i8* %me) nounwind
-  %1 = call i8* @objc_initWeak(i8** %w, i8* %0) nounwind
+  %0 = call i8* @llvm.objc.retain(i8* %me) nounwind
+  %1 = call i8* @llvm.objc.initWeak(i8** %w, i8* %0) nounwind
   %block.isa = getelementptr inbounds %1, %1* %block, i64 0, i32 0
   store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
   %block.flags = getelementptr inbounds %1, %1* %block, i64 0, i32 1
@@ -61,28 +61,28 @@ entry:
   %block.descriptor = getelementptr inbounds %1, %1* %block, i64 0, i32 4
   store %struct.__block_descriptor* bitcast (%0* @__block_descriptor_tmp to %struct.__block_descriptor*), %struct.__block_descriptor** %block.descriptor, align 8
   %block.captured = getelementptr inbounds %1, %1* %block, i64 0, i32 5
-  %2 = call i8* @objc_loadWeak(i8** %w) nounwind
-  %3 = call i8* @objc_initWeak(i8** %block.captured, i8* %2) nounwind
+  %2 = call i8* @llvm.objc.loadWeak(i8** %w) nounwind
+  %3 = call i8* @llvm.objc.initWeak(i8** %block.captured, i8* %2) nounwind
   %4 = bitcast %1* %block to void ()*
   call void @use_block(void ()* %4) nounwind
-  call void @objc_destroyWeak(i8** %block.captured) nounwind
-  call void @objc_destroyWeak(i8** %w) nounwind
-  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  call void @llvm.objc.destroyWeak(i8** %block.captured) nounwind
+  call void @llvm.objc.destroyWeak(i8** %w) nounwind
+  call void @llvm.objc.release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
 
-declare i8* @objc_retain(i8*)
+declare i8* @llvm.objc.retain(i8*)
 declare void @use_block(void ()*) nounwind
 declare void @__qux_block_invoke_0(i8* %.block_descriptor) nounwind
 declare void @__copy_helper_block_(i8*, i8*) nounwind
-declare void @objc_copyWeak(i8**, i8**)
+declare void @llvm.objc.copyWeak(i8**, i8**)
 declare void @__destroy_helper_block_(i8*) nounwind
-declare void @objc_release(i8*)
+declare void @llvm.objc.release(i8*)
 declare i8* @bar()
-declare i8* @objc_initWeak(i8**, i8*)
-declare i8* @objc_loadWeak(i8**)
+declare i8* @llvm.objc.initWeak(i8**, i8*)
+declare i8* @llvm.objc.loadWeak(i8**)
 declare void @use(i8*) nounwind
-declare void @objc_destroyWeak(i8**)
+declare void @llvm.objc.destroyWeak(i8**)
 
 ; CHECK: attributes [[NUW]] = { nounwind }
 
diff --git a/test/Transforms/ObjCARC/weak-dce.ll b/test/Transforms/ObjCARC/weak-dce.ll
index f09467182b6f9eaf55b25478d29e8f105442657a..e499ac13cf9095683ce0df21e7c071fb02dca8bc 100644
--- a/test/Transforms/ObjCARC/weak-dce.ll
+++ b/test/Transforms/ObjCARC/weak-dce.ll
@@ -4,43 +4,43 @@
 ; Delete the weak calls and replace them with just the net retain.
 
 ;      CHECK: define void @test0(i8* %p) {
-; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %p)
 ; CHECK-NEXT: ret void
 
 define void @test0(i8* %p) {
   %weakBlock = alloca i8*, align 8
-  %tmp7 = call i8* @objc_initWeak(i8** %weakBlock, i8* %p) nounwind
-  %tmp26 = call i8* @objc_loadWeakRetained(i8** %weakBlock) nounwind
-  call void @objc_destroyWeak(i8** %weakBlock) nounwind
+  %tmp7 = call i8* @llvm.objc.initWeak(i8** %weakBlock, i8* %p) nounwind
+  %tmp26 = call i8* @llvm.objc.loadWeakRetained(i8** %weakBlock) nounwind
+  call void @llvm.objc.destroyWeak(i8** %weakBlock) nounwind
   ret void
 }
 
 ;      CHECK: define i8* @test1(i8* %p) {
-; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %p)
 ; CHECK-NEXT: ret i8* %p
 
 define i8* @test1(i8* %p) {
   %weakBlock = alloca i8*, align 8
-  %tmp7 = call i8* @objc_initWeak(i8** %weakBlock, i8* %p) nounwind
-  %tmp26 = call i8* @objc_loadWeakRetained(i8** %weakBlock) nounwind
-  call void @objc_destroyWeak(i8** %weakBlock) nounwind
+  %tmp7 = call i8* @llvm.objc.initWeak(i8** %weakBlock, i8* %p) nounwind
+  %tmp26 = call i8* @llvm.objc.loadWeakRetained(i8** %weakBlock) nounwind
+  call void @llvm.objc.destroyWeak(i8** %weakBlock) nounwind
   ret i8* %tmp26
 }
 
 ;      CHECK: define i8* @test2(i8* %p, i8* %q) {
-; CHECK-NEXT: call i8* @objc_retain(i8* %q)
+; CHECK-NEXT: call i8* @llvm.objc.retain(i8* %q)
 ; CHECK-NEXT: ret i8* %q
 
 define i8* @test2(i8* %p, i8* %q) {
   %weakBlock = alloca i8*, align 8
-  %tmp7 = call i8* @objc_initWeak(i8** %weakBlock, i8* %p) nounwind
-  %tmp19 = call i8* @objc_storeWeak(i8** %weakBlock, i8* %q) nounwind
-  %tmp26 = call i8* @objc_loadWeakRetained(i8** %weakBlock) nounwind
-  call void @objc_destroyWeak(i8** %weakBlock) nounwind
+  %tmp7 = call i8* @llvm.objc.initWeak(i8** %weakBlock, i8* %p) nounwind
+  %tmp19 = call i8* @llvm.objc.storeWeak(i8** %weakBlock, i8* %q) nounwind
+  %tmp26 = call i8* @llvm.objc.loadWeakRetained(i8** %weakBlock) nounwind
+  call void @llvm.objc.destroyWeak(i8** %weakBlock) nounwind
   ret i8* %tmp26
 }
 
-declare i8* @objc_initWeak(i8**, i8*)
-declare void @objc_destroyWeak(i8**)
-declare i8* @objc_loadWeakRetained(i8**)
-declare i8* @objc_storeWeak(i8** %weakBlock, i8* %q)
+declare i8* @llvm.objc.initWeak(i8**, i8*)
+declare void @llvm.objc.destroyWeak(i8**)
+declare i8* @llvm.objc.loadWeakRetained(i8**)
+declare i8* @llvm.objc.storeWeak(i8** %weakBlock, i8* %q)
diff --git a/test/Transforms/ObjCARC/weak.ll b/test/Transforms/ObjCARC/weak.ll
index 119aa8257866e7053159ba4987ecd37bcfa126a5..caaeba7280e80692946af142c5b357c45a382478 100644
--- a/test/Transforms/ObjCARC/weak.ll
+++ b/test/Transforms/ObjCARC/weak.ll
@@ -1,12 +1,12 @@
 ; RUN: opt -objc-arc -S < %s | FileCheck %s
 
-declare i8* @objc_initWeak(i8**, i8*)
-declare i8* @objc_storeWeak(i8**, i8*)
-declare i8* @objc_loadWeak(i8**)
-declare void @objc_destroyWeak(i8**)
-declare i8* @objc_loadWeakRetained(i8**)
-declare void @objc_moveWeak(i8**, i8**)
-declare void @objc_copyWeak(i8**, i8**)
+declare i8* @llvm.objc.initWeak(i8**, i8*)
+declare i8* @llvm.objc.storeWeak(i8**, i8*)
+declare i8* @llvm.objc.loadWeak(i8**)
+declare void @llvm.objc.destroyWeak(i8**)
+declare i8* @llvm.objc.loadWeakRetained(i8**)
+declare void @llvm.objc.moveWeak(i8**, i8**)
+declare void @llvm.objc.copyWeak(i8**, i8**)
 
 ; If the pointer-to-weak-pointer is null, it's undefined behavior.
 
@@ -32,26 +32,26 @@ declare void @objc_copyWeak(i8**, i8**)
 ; CHECK: ret void
 define void @test0(i8* %p, i8** %q) {
 entry:
-  call i8* @objc_storeWeak(i8** null, i8* %p)
-  call i8* @objc_storeWeak(i8** undef, i8* %p)
-  call i8* @objc_loadWeakRetained(i8** null)
-  call i8* @objc_loadWeakRetained(i8** undef)
-  call i8* @objc_loadWeak(i8** null)
-  call i8* @objc_loadWeak(i8** undef)
-  call i8* @objc_initWeak(i8** null, i8* %p)
-  call i8* @objc_initWeak(i8** undef, i8* %p)
-  call void @objc_destroyWeak(i8** null)
-  call void @objc_destroyWeak(i8** undef)
+  call i8* @llvm.objc.storeWeak(i8** null, i8* %p)
+  call i8* @llvm.objc.storeWeak(i8** undef, i8* %p)
+  call i8* @llvm.objc.loadWeakRetained(i8** null)
+  call i8* @llvm.objc.loadWeakRetained(i8** undef)
+  call i8* @llvm.objc.loadWeak(i8** null)
+  call i8* @llvm.objc.loadWeak(i8** undef)
+  call i8* @llvm.objc.initWeak(i8** null, i8* %p)
+  call i8* @llvm.objc.initWeak(i8** undef, i8* %p)
+  call void @llvm.objc.destroyWeak(i8** null)
+  call void @llvm.objc.destroyWeak(i8** undef)
 
-  call void @objc_copyWeak(i8** null, i8** %q)
-  call void @objc_copyWeak(i8** undef, i8** %q)
-  call void @objc_copyWeak(i8** %q, i8** null)
-  call void @objc_copyWeak(i8** %q, i8** undef)
+  call void @llvm.objc.copyWeak(i8** null, i8** %q)
+  call void @llvm.objc.copyWeak(i8** undef, i8** %q)
+  call void @llvm.objc.copyWeak(i8** %q, i8** null)
+  call void @llvm.objc.copyWeak(i8** %q, i8** undef)
 
-  call void @objc_moveWeak(i8** null, i8** %q)
-  call void @objc_moveWeak(i8** undef, i8** %q)
-  call void @objc_moveWeak(i8** %q, i8** null)
-  call void @objc_moveWeak(i8** %q, i8** undef)
+  call void @llvm.objc.moveWeak(i8** null, i8** %q)
+  call void @llvm.objc.moveWeak(i8** undef, i8** %q)
+  call void @llvm.objc.moveWeak(i8** %q, i8** null)
+  call void @llvm.objc.moveWeak(i8** %q, i8** undef)
 
   ret void
 }
diff --git a/test/Transforms/PhaseOrdering/rotate.ll b/test/Transforms/PhaseOrdering/rotate.ll
new file mode 100644
index 0000000000000000000000000000000000000000..190807668ee8d6bfa5921fcae581245b9d79be8f
--- /dev/null
+++ b/test/Transforms/PhaseOrdering/rotate.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3 -S < %s                    | FileCheck %s --check-prefixes=ANY,OLDPM
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s --check-prefixes=ANY,NEWPM
+
+; This should become a single funnel shift through a combination
+; of aggressive-instcombine, simplifycfg, and instcombine.
+; https://bugs.llvm.org/show_bug.cgi?id=34924
+
+define i32 @rotl(i32 %a, i32 %b) {
+; OLDPM-LABEL: @rotl(
+; OLDPM-NEXT:  entry:
+; OLDPM-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B:%.*]])
+; OLDPM-NEXT:    ret i32 [[TMP0]]
+;
+; NEWPM-LABEL: @rotl(
+; NEWPM-NEXT:  entry:
+; NEWPM-NEXT:    [[TMP0:%.*]] = sub i32 0, [[B:%.*]]
+; NEWPM-NEXT:    [[TMP1:%.*]] = and i32 [[B]], 31
+; NEWPM-NEXT:    [[TMP2:%.*]] = and i32 [[TMP0]], 31
+; NEWPM-NEXT:    [[TMP3:%.*]] = lshr i32 [[A:%.*]], [[TMP2]]
+; NEWPM-NEXT:    [[TMP4:%.*]] = shl i32 [[A]], [[TMP1]]
+; NEWPM-NEXT:    [[SPEC_SELECT:%.*]] = or i32 [[TMP3]], [[TMP4]]
+; NEWPM-NEXT:    ret i32 [[SPEC_SELECT]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shr = lshr i32 %a, %sub
+  %shl = shl i32 %a, %b
+  %or = or i32 %shr, %shl
+  br label %end
+
+end:
+  %cond = phi i32 [ %or, %rotbb ], [ %a, %entry ]
+  ret i32 %cond
+}
+
diff --git a/test/Transforms/PreISelIntrinsicLowering/objc-arc.ll b/test/Transforms/PreISelIntrinsicLowering/objc-arc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8b7d11ea75f43314f16d45c6f8caa89edc625164
--- /dev/null
+++ b/test/Transforms/PreISelIntrinsicLowering/objc-arc.ll
@@ -0,0 +1,312 @@
+; RUN: opt -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+; RUN: opt -passes='pre-isel-intrinsic-lowering' -S -o - %s | FileCheck %s
+
+; Make sure calls to the objc intrinsics are translated to calls in to the
+; runtime
+
+define i8* @test_objc_autorelease(i8* %arg0) {
+; CHECK-LABEL: test_objc_autorelease
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_autorelease(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.autorelease(i8* %arg0)
+	ret i8* %0
+}
+
+define void @test_objc_autoreleasePoolPop(i8* %arg0) {
+; CHECK-LABEL: test_objc_autoreleasePoolPop
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_autoreleasePoolPop(i8* %arg0)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.autoreleasePoolPop(i8* %arg0)
+  ret void
+}
+
+define i8* @test_objc_autoreleasePoolPush() {
+; CHECK-LABEL: test_objc_autoreleasePoolPush
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_autoreleasePoolPush()
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.autoreleasePoolPush()
+	ret i8* %0
+}
+
+define i8* @test_objc_autoreleaseReturnValue(i8* %arg0) {
+; CHECK-LABEL: test_objc_autoreleaseReturnValue
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_autoreleaseReturnValue(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.autoreleaseReturnValue(i8* %arg0)
+	ret i8* %0
+}
+
+define void @test_objc_copyWeak(i8** %arg0, i8** %arg1) {
+; CHECK-LABEL: test_objc_copyWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_copyWeak(i8** %arg0, i8** %arg1)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.copyWeak(i8** %arg0, i8** %arg1)
+  ret void
+}
+
+define void @test_objc_destroyWeak(i8** %arg0) {
+; CHECK-LABEL: test_objc_destroyWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_destroyWeak(i8** %arg0)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.destroyWeak(i8** %arg0)
+  ret void
+}
+
+define i8* @test_objc_initWeak(i8** %arg0, i8* %arg1) {
+; CHECK-LABEL: test_objc_initWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_initWeak(i8** %arg0, i8* %arg1)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.initWeak(i8** %arg0, i8* %arg1)
+	ret i8* %0
+}
+
+define i8* @test_objc_loadWeak(i8** %arg0) {
+; CHECK-LABEL: test_objc_loadWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_loadWeak(i8** %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.loadWeak(i8** %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_loadWeakRetained(i8** %arg0) {
+; CHECK-LABEL: test_objc_loadWeakRetained
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_loadWeakRetained(i8** %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.loadWeakRetained(i8** %arg0)
+	ret i8* %0
+}
+
+define void @test_objc_moveWeak(i8** %arg0, i8** %arg1) {
+; CHECK-LABEL: test_objc_moveWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_moveWeak(i8** %arg0, i8** %arg1)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.moveWeak(i8** %arg0, i8** %arg1)
+  ret void
+}
+
+define void @test_objc_release(i8* %arg0) {
+; CHECK-LABEL: test_objc_release
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_release(i8* %arg0)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.release(i8* %arg0)
+  ret void
+}
+
+define i8* @test_objc_retain(i8* %arg0) {
+; CHECK-LABEL: test_objc_retain
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retain(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retain(i8* %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_retainAutorelease(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainAutorelease
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retainAutorelease(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retainAutorelease(i8* %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_retainAutoreleaseReturnValue(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainAutoreleaseReturnValue
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleaseReturnValue(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = tail call i8* @llvm.objc.retainAutoreleaseReturnValue(i8* %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_retainAutoreleasedReturnValue(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainAutoreleasedReturnValue
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_retainBlock(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainBlock
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retainBlock(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retainBlock(i8* %arg0)
+	ret i8* %0
+}
+
+define void @test_objc_storeStrong(i8** %arg0, i8* %arg1) {
+; CHECK-LABEL: test_objc_storeStrong
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_storeStrong(i8** %arg0, i8* %arg1)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.storeStrong(i8** %arg0, i8* %arg1)
+	ret void
+}
+
+define i8* @test_objc_storeWeak(i8** %arg0, i8* %arg1) {
+; CHECK-LABEL: test_objc_storeWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_storeWeak(i8** %arg0, i8* %arg1)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.storeWeak(i8** %arg0, i8* %arg1)
+	ret i8* %0
+}
+
+define i8* @test_objc_unsafeClaimAutoreleasedReturnValue(i8* %arg0) {
+; CHECK-LABEL: test_objc_unsafeClaimAutoreleasedReturnValue
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %arg0)
+  ret i8* %0
+}
+
+define i8* @test_objc_retainedObject(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainedObject
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retainedObject(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retainedObject(i8* %arg0)
+  ret i8* %0
+}
+
+define i8* @test_objc_unretainedObject(i8* %arg0) {
+; CHECK-LABEL: test_objc_unretainedObject
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_unretainedObject(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.unretainedObject(i8* %arg0)
+  ret i8* %0
+}
+
+define i8* @test_objc_unretainedPointer(i8* %arg0) {
+; CHECK-LABEL: test_objc_unretainedPointer
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_unretainedPointer(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.unretainedPointer(i8* %arg0)
+  ret i8* %0
+}
+
+define i8* @test_objc_retain_autorelease(i8* %arg0) {
+; CHECK-LABEL: test_objc_retain_autorelease
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retain_autorelease(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retain.autorelease(i8* %arg0)
+  ret i8* %0
+}
+
+define i32 @test_objc_sync_enter(i8* %arg0) {
+; CHECK-LABEL: test_objc_sync_enter
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i32 @objc_sync_enter(i8* %arg0)
+; CHECK-NEXT: ret i32 %0
+entry:
+  %0 = call i32 @llvm.objc.sync.enter(i8* %arg0)
+  ret i32 %0
+}
+
+define i32 @test_objc_sync_exit(i8* %arg0) {
+; CHECK-LABEL: test_objc_sync_exit
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i32 @objc_sync_exit(i8* %arg0)
+; CHECK-NEXT: ret i32 %0
+entry:
+  %0 = call i32 @llvm.objc.sync.exit(i8* %arg0)
+  ret i32 %0
+}
+
+declare i8* @llvm.objc.autorelease(i8*)
+declare void @llvm.objc.autoreleasePoolPop(i8*)
+declare i8* @llvm.objc.autoreleasePoolPush()
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare void @llvm.objc.copyWeak(i8**, i8**)
+declare void @llvm.objc.destroyWeak(i8**)
+declare extern_weak i8* @llvm.objc.initWeak(i8**, i8*)
+declare i8* @llvm.objc.loadWeak(i8**)
+declare i8* @llvm.objc.loadWeakRetained(i8**)
+declare void @llvm.objc.moveWeak(i8**, i8**)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.retainAutorelease(i8*)
+declare i8* @llvm.objc.retainAutoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retainBlock(i8*)
+declare void @llvm.objc.storeStrong(i8**, i8*)
+declare i8* @llvm.objc.storeWeak(i8**, i8*)
+declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retainedObject(i8*)
+declare i8* @llvm.objc.unretainedObject(i8*)
+declare i8* @llvm.objc.unretainedPointer(i8*)
+declare i8* @llvm.objc.retain.autorelease(i8*)
+declare i32 @llvm.objc.sync.enter(i8*)
+declare i32 @llvm.objc.sync.exit(i8*)
+
+attributes #0 = { nounwind }
+
+; CHECK: declare i8* @objc_autorelease(i8*)
+; CHECK: declare void @objc_autoreleasePoolPop(i8*)
+; CHECK: declare i8* @objc_autoreleasePoolPush()
+; CHECK: declare i8* @objc_autoreleaseReturnValue(i8*)
+; CHECK: declare void @objc_copyWeak(i8**, i8**)
+; CHECK: declare void @objc_destroyWeak(i8**)
+; CHECK: declare extern_weak i8* @objc_initWeak(i8**, i8*)
+; CHECK: declare i8* @objc_loadWeak(i8**)
+; CHECK: declare i8* @objc_loadWeakRetained(i8**)
+; CHECK: declare void @objc_moveWeak(i8**, i8**)
+; CHECK: declare void @objc_release(i8*) [[NLB:#[0-9]+]]
+; CHECK: declare i8* @objc_retain(i8*) [[NLB]]
+; CHECK: declare i8* @objc_retainAutorelease(i8*)
+; CHECK: declare i8* @objc_retainAutoreleaseReturnValue(i8*)
+; CHECK: declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+; CHECK: declare i8* @objc_retainBlock(i8*)
+; CHECK: declare void @objc_storeStrong(i8**, i8*)
+; CHECK: declare i8* @objc_storeWeak(i8**, i8*)
+; CHECK: declare i8* @objc_unsafeClaimAutoreleasedReturnValue(i8*)
+; CHECK: declare i8* @objc_retainedObject(i8*)
+; CHECK: declare i8* @objc_unretainedObject(i8*)
+; CHECK: declare i8* @objc_unretainedPointer(i8*)
+; CHECK: declare i8* @objc_retain_autorelease(i8*)
+; CHECK: declare i32 @objc_sync_enter(i8*)
+; CHECK: declare i32 @objc_sync_exit(i8*)
+
+; CHECK: attributes #0 = { nounwind }
+; CHECK: attributes [[NLB]] = { nonlazybind }
diff --git a/test/Transforms/PruneEH/looptest.ll b/test/Transforms/PruneEH/looptest.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2729ec933ca798a066bdcaba973eae8d673d8be9
--- /dev/null
+++ b/test/Transforms/PruneEH/looptest.ll
@@ -0,0 +1,44 @@
+; RUN: opt < %s -prune-eh -S | FileCheck %s
+
+declare void @nounwind() nounwind
+
+define internal void @foo() {
+	call void @nounwind()
+	ret void
+}
+
+; CHECK-LABEL: @caller
+define i32 @caller(i32 %n) personality i32 (...)* @__gxx_personality_v0 {
+entry:
+  br label %for
+
+for:
+  %j = phi i32 [0, %entry], [%j.inc, %inc]
+  %j.cmp = icmp slt i32 %j, %n
+  br i1 %j.cmp, label %body, label %exit, !llvm.loop !0
+
+body:
+; CHECK: call void @foo(), !llvm.mem.parallel_loop_access !0
+	invoke void @foo( )
+			to label %Normal unwind label %Except, !llvm.mem.parallel_loop_access !0
+  br label %inc
+
+inc:
+  %j.inc = add nuw nsw i32 %j, 1
+  br label %for, !llvm.loop !0
+
+exit:
+  br label %Normal
+
+Normal:
+	ret i32 0
+
+Except:
+        landingpad { i8*, i32 }
+                catch i8* null
+	ret i32 1
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+!0 = distinct !{!0}
diff --git a/test/Transforms/Reassociate/fp-expr.ll b/test/Transforms/Reassociate/fp-expr.ll
index dcbf835ba54402018fa7da71b33dcea0835f2ee7..e616c52f28e66e97d070294879ae29161c8d52d7 100644
--- a/test/Transforms/Reassociate/fp-expr.ll
+++ b/test/Transforms/Reassociate/fp-expr.ll
@@ -4,8 +4,8 @@
 define void @test1() {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[T1:%.*]] = tail call <4 x float> @blam()
-; CHECK-NEXT:    [[T23:%.*]] = fsub fast <4 x float> undef, [[T1]]
-; CHECK-NEXT:    [[T24:%.*]] = fadd fast <4 x float> [[T23]], undef
+; CHECK-NEXT:    [[T1_NEG:%.*]] = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[T1]]
+; CHECK-NEXT:    [[T24:%.*]] = fadd fast <4 x float> [[T1_NEG]], undef
 ; CHECK-NEXT:    tail call void @wombat(<4 x float> [[T24]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/test/Transforms/SCCP/apint-bigint2.ll b/test/Transforms/SCCP/apint-bigint2.ll
index f28b966aceba3b696c40049de4c27dc92e094c13..5277d9fa5c6ec86f62624b0e6f52b3a65f317014 100644
--- a/test/Transforms/SCCP/apint-bigint2.ll
+++ b/test/Transforms/SCCP/apint-bigint2.ll
@@ -1,11 +1,11 @@
-; RUN: opt < %s -sccp -S | not grep load
+; RUN: opt < %s -sccp -S | FileCheck %s
 
 @Y = constant [6 x i101] [ i101 12, i101 123456789000000, i101 -12,
                            i101 -123456789000000, i101 0,i101 9123456789000000]
 
-define i101 @array()
-{
-Head:
+; CHECK-LABEL: @array
+; CHECK-NEXT: ret i101 123456789000000
+define i101 @array() {
    %A = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 1
    %B = load i101, i101* %A
    %D = and i101 %B, 1
@@ -13,6 +13,28 @@ Head:
    %E = trunc i101 %DD to i32
    %F = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 %E
    %G = load i101, i101* %F
- 
+
    ret i101 %G
 }
+
+; CHECK-LABEL: @large_aggregate
+; CHECK-NEXT: ret i101 undef
+define i101 @large_aggregate() {
+  %B = load i101, i101* undef
+  %D = and i101 %B, 1
+  %DD = or i101 %D, 1
+  %F = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 5
+  %G = getelementptr i101, i101* %F, i101 %DD
+  %L3 = load i101, i101* %G
+  ret i101 %L3
+}
+
+; CHECK-LABEL: @index_too_large
+; CHECK-NEXT: store i101* getelementptr (i101, i101* getelementptr ([6 x i101], [6 x i101]* @Y, i32 0, i32 -1), i101 9224497936761618431), i101** undef
+; CHECK-NEXT: ret void
+define void @index_too_large() {
+  %ptr1 = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 -1
+  %ptr2 = getelementptr i101, i101* %ptr1, i101 9224497936761618431
+  store i101* %ptr2, i101** undef
+  ret void
+}
diff --git a/test/Transforms/SCCP/ipsccp-preserve-analysis.ll b/test/Transforms/SCCP/ipsccp-preserve-analysis.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b840e5e09fdc2368edec193ce3f0e26c3330fdc6
--- /dev/null
+++ b/test/Transforms/SCCP/ipsccp-preserve-analysis.ll
@@ -0,0 +1,56 @@
+; Basic test to check that DominatorTreeAnalysis is preserved by IPSCCP and
+; the following analysis can re-use it. The test contains two trivial functions
+; IPSCCP can simplify, so we can test the case where IPSCCP makes changes.
+
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN:     -passes='function(require<domtree>,require<postdomtree>),ipsccp,function(require<domtree>,require<postdomtree>)' -S  %s 2>&1 \
+; RUN:     | FileCheck -check-prefixes='IR,NEW-PM' %s
+
+; RUN: opt -passes='function(require<postdomtree>),ipsccp,function(verify<domtree>)' -S  %s | FileCheck -check-prefixes='IR' %s
+
+; NEW-PM: Starting llvm::Module pass manager run.
+; NEW-PM: Running analysis: DominatorTreeAnalysis on f1
+; NEW-PM: Running analysis: PostDominatorTreeAnalysis on f1
+; NEW-PM: Running analysis: DominatorTreeAnalysis on f2
+; NEW-PM: Running analysis: PostDominatorTreeAnalysis on f2
+; NEW-PM: Running pass: IPSCCPPass
+; NEW-PM-DAG: Running analysis: AssumptionAnalysis on f1
+; NEW-PM-DAG: Running analysis: AssumptionAnalysis on f2
+; NEW-PM-NEXT: Invalidating all non-preserved analyses for:
+; NEW-PM-NEXT: Invalidating all non-preserved analyses for: f1
+; NEW-PM-NEXT: Invalidating all non-preserved analyses for: f2
+; NEW-PM-NEXT: Running pass: ModuleToFunctionPassAdaptor
+; NEW-PM-NOT: Running analysis:
+
+; IR-LABEL: @f1
+; IR-LABEL: entry:
+; IR-NEXT: br label %bb2
+; IR-LABEL: bb2:
+; IR-NEXT: undef
+
+; IR-LABEL: @f2
+; IR-NOT: icmp
+; IR:    br label %bbtrue
+; IR-LABEL: bbtrue:
+; IR-NEXT:   ret i32 0
+define internal i32 @f1() readnone {
+entry:
+  br i1 false, label %bb1, label %bb2
+bb1:
+  ret i32 10
+bb2:
+  ret i32 10
+}
+
+define i32 @f2(i32 %n) {
+  %i = call i32 @f1()
+  %cmp = icmp eq i32 %i, 10
+  br i1 %cmp, label %bbtrue, label %bbfalse
+
+bbtrue:
+  ret i32 0
+
+bbfalse:
+  %res = add i32 %n, %i
+  ret i32 %res
+}
diff --git a/test/Transforms/SCCP/ipsccp-ssa-copy-nested-conds.ll b/test/Transforms/SCCP/ipsccp-ssa-copy-nested-conds.ll
new file mode 100644
index 0000000000000000000000000000000000000000..82f7db801bd1ebedecb809e0c79a46f2e848227d
--- /dev/null
+++ b/test/Transforms/SCCP/ipsccp-ssa-copy-nested-conds.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+; RUN: opt < %s -passes=ipsccp -S | FileCheck %s
+
+; Test for PR39772
+; CHECK-LABEL: cleanup:
+; CHECK-NEXT:   %retval.0 = phi i32 [ 0, %if.then ], [ %add, %if.then7 ], [ %add8, %if.else ]
+
+
+%struct.Node = type { %struct.Node*, %struct.Node*, i32 }
+
+define i32 @check(%struct.Node* %node) {
+entry:
+  %cmp = icmp eq %struct.Node* %node, null
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %cleanup
+
+if.end:                                           ; preds = %entry
+  %left = getelementptr inbounds %struct.Node, %struct.Node* %node, i32 0, i32 0
+  %0 = load %struct.Node*, %struct.Node** %left
+  %call = call i32 @check(%struct.Node* %0)
+  %right = getelementptr inbounds %struct.Node, %struct.Node* %node, i32 0, i32 1
+  %1 = load %struct.Node*, %struct.Node** %right
+  %call1 = call i32 @check(%struct.Node* %1)
+  %2 = load %struct.Node*, %struct.Node** %right
+  %height = getelementptr inbounds %struct.Node, %struct.Node* %2, i32 0, i32 2
+  %3 = load i32, i32* %height
+  %cmp3 = icmp ne i32 %3, %call1
+  br i1 %cmp3, label %if.then4, label %if.end5
+
+if.then4:                                         ; preds = %if.end
+  unreachable
+
+if.end5:                                          ; preds = %if.end
+  %cmp6 = icmp sgt i32 %call, %call1
+  br i1 %cmp6, label %if.then7, label %if.else
+
+if.then7:                                         ; preds = %if.end5
+  %add = add nsw i32 %call, 1
+  br label %cleanup
+
+if.else:                                          ; preds = %if.end5
+  %add8 = add nsw i32 %call1, 1
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.else, %if.then7, %if.then
+  %retval.0 = phi i32 [ 0, %if.then ], [ %add, %if.then7 ], [ %add8, %if.else ]
+  ret i32 %retval.0
+}
diff --git a/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
index a573fc911eef3b5a835084579be015c680dc59cb..625748f6820932f03ffeb4c867c4feed6d3183df 100644
--- a/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
@@ -19,12 +19,12 @@ define void @mainTest(i32* %ptr) #0  {
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i32 1, undef
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], undef
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], undef
-; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], undef
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], undef
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP6]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP13]], undef
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP5]]
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
diff --git a/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
index 52a6d73db9813fe33d84b2c09be194538df093db..e8a83fa4b7a600c5065ba3ab9bb125a2f5ac1d01 100644
--- a/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -23,7 +23,7 @@ define void @test() #0 {
 ; CHECK-NEXT:    [[SUM1:%.*]] = add i64 undef, undef
 ; CHECK-NEXT:    [[SUM2:%.*]] = add i64 [[SUM1]], undef
 ; CHECK-NEXT:    [[ZSUM:%.*]] = add i64 [[SUM2]], 0
-; CHECK-NEXT:    [[JOIN:%.*]] = add i64 undef, [[ZSUM]]
+; CHECK-NEXT:    [[JOIN:%.*]] = add i64 [[TMP6]], [[ZSUM]]
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
diff --git a/test/Transforms/SLPVectorizer/X86/PR39774.ll b/test/Transforms/SLPVectorizer/X86/PR39774.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8405117090b46faa0a024773403fc5cabb010c95
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 | FileCheck %s --check-prefixes=ALL,CHECK
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefixes=ALL,FORCE_REDUCTION
+
+define void @Test(i32) {
+; ALL-LABEL: @Test(
+; ALL-NEXT:  entry:
+; ALL-NEXT:    br label [[LOOP:%.*]]
+; ALL:       loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; ALL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ALL-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
+; ALL-NEXT:    [[TMP3:%.*]] = add <8 x i32> <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>, [[SHUFFLE]]
+; ALL-NEXT:    [[VAL_1:%.*]] = and i32 [[TMP2]], undef
+; ALL-NEXT:    [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
+; ALL-NEXT:    [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
+; ALL-NEXT:    [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]]
+; ALL-NEXT:    [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]]
+; ALL-NEXT:    [[VAL_7:%.*]] = and i32 [[VAL_5]], undef
+; ALL-NEXT:    [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]]
+; ALL-NEXT:    [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]]
+; ALL-NEXT:    [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]]
+; ALL-NEXT:    [[VAL_12:%.*]] = and i32 [[VAL_10]], undef
+; ALL-NEXT:    [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]]
+; ALL-NEXT:    [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]]
+; ALL-NEXT:    [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]]
+; ALL-NEXT:    [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]]
+; ALL-NEXT:    [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]]
+; ALL-NEXT:    [[VAL_19:%.*]] = and i32 [[VAL_17]], undef
+; ALL-NEXT:    [[VAL_21:%.*]] = and i32 [[VAL_19]], undef
+; ALL-NEXT:    [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]]
+; ALL-NEXT:    [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]]
+; ALL-NEXT:    [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]]
+; ALL-NEXT:    [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]]
+; ALL-NEXT:    [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]]
+; ALL-NEXT:    [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]]
+; ALL-NEXT:    [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]]
+; ALL-NEXT:    [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]]
+; ALL-NEXT:    [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]]
+; ALL-NEXT:    [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]]
+; ALL-NEXT:    [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]]
+; ALL-NEXT:    [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]]
+; ALL-NEXT:    [[VAL_35:%.*]] = and i32 [[VAL_33]], undef
+; ALL-NEXT:    [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]]
+; ALL-NEXT:    [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]]
+; ALL-NEXT:    [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]]
+; ALL-NEXT:    [[VAL_40:%.*]] = and i32 [[VAL_38]], undef
+; ALL-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0
+; ALL-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP2]], i32 1
+; ALL-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; ALL-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP6]], i32 0
+; ALL-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1
+; ALL-NEXT:    [[TMP9:%.*]] = and <2 x i32> [[TMP5]], [[TMP8]]
+; ALL-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP5]], [[TMP8]]
+; ALL-NEXT:    [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; ALL-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; ALL-NEXT:    [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], [[RDX_SHUF]]
+; ALL-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; ALL-NEXT:    [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; ALL-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; ALL-NEXT:    [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; ALL-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; ALL-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP12]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]]
+; ALL-NEXT:    [[OP_EXTRA31:%.*]] = and i32 [[OP_EXTRA30]], [[TMP2]]
+; ALL-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
+; ALL-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %local_4_39.us = phi i32 [ %val_42, %loop ], [ 0, %entry ]
+  %local_8_43.us = phi i32 [ %val_43, %loop ], [ 0, %entry ]
+  %val_0 = add i32 %local_4_39.us, 0
+  %val_1 = and i32 %local_8_43.us, %val_0
+  %val_2 = and i32 %val_1, %0
+  %val_3 = and i32 %val_2, %0
+  %val_4 = and i32 %val_3, %0
+  %val_5 = and i32 %val_4, %0
+  %val_6 = add i32 %local_8_43.us, 55
+  %val_7 = and i32 %val_5, %val_6
+  %val_8 = and i32 %val_7, %0
+  %val_9 = and i32 %val_8, %0
+  %val_10 = and i32 %val_9, %0
+  %val_11 = add i32 %local_8_43.us, 285
+  %val_12 = and i32 %val_10, %val_11
+  %val_13 = and i32 %val_12, %0
+  %val_14 = and i32 %val_13, %0
+  %val_15 = and i32 %val_14, %0
+  %val_16 = and i32 %val_15, %0
+  %val_17 = and i32 %val_16, %0
+  %val_18 = add i32 %local_8_43.us, 1240
+  %val_19 = and i32 %val_17, %val_18
+  %val_20 = add i32 %local_8_43.us, 1496
+  %val_21 = and i32 %val_19, %val_20
+  %val_22 = and i32 %val_21, %0
+  %val_23 = and i32 %val_22, %0
+  %val_24 = and i32 %val_23, %0
+  %val_25 = and i32 %val_24, %0
+  %val_26 = and i32 %val_25, %0
+  %val_27 = and i32 %val_26, %0
+  %val_28 = and i32 %val_27, %0
+  %val_29 = and i32 %val_28, %0
+  %val_30 = and i32 %val_29, %0
+  %val_31 = and i32 %val_30, %0
+  %val_32 = and i32 %val_31, %0
+  %val_33 = and i32 %val_32, %0
+  %val_34 = add i32 %local_8_43.us, 8555
+  %val_35 = and i32 %val_33, %val_34
+  %val_36 = and i32 %val_35, %0
+  %val_37 = and i32 %val_36, %0
+  %val_38 = and i32 %val_37, %0
+  %val_39 = add i32 %local_8_43.us, 12529
+  %val_40 = and i32 %val_38, %val_39
+  %val_41 = add i32 %local_8_43.us, 13685
+  %val_42 = and i32 %val_40, %val_41
+  %val_43 = add i32 %local_8_43.us, 14910
+  br label %loop
+}
diff --git a/test/Transforms/SLPVectorizer/X86/arith-add.ll b/test/Transforms/SLPVectorizer/X86/arith-add.ll
index 0e84f5a70fa01f9a4402e72a3824c02d3fd47a9a..5c6430c689f5c701cf3cc6e280f4789e93af48a0 100644
--- a/test/Transforms/SLPVectorizer/X86/arith-add.ll
+++ b/test/Transforms/SLPVectorizer/X86/arith-add.ll
@@ -4,7 +4,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
 @b64 = common global [8 x i64] zeroinitializer, align 64
diff --git a/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/test/Transforms/SLPVectorizer/X86/arith-mul.ll
index 4763a9a2bf12b1f717909426b3c0aea489121946..8a01c85bcdb2749f958b72ed2bcbda44aefcd8e0 100644
--- a/test/Transforms/SLPVectorizer/X86/arith-mul.ll
+++ b/test/Transforms/SLPVectorizer/X86/arith-mul.ll
@@ -4,7 +4,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
 @b64 = common global [8 x i64] zeroinitializer, align 64
diff --git a/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/test/Transforms/SLPVectorizer/X86/arith-sub.ll
index 2bbaaca02d88b07bdd137479cf3524df0e65e33f..28c807408b7dffe9257c160c05acc7827976c480 100644
--- a/test/Transforms/SLPVectorizer/X86/arith-sub.ll
+++ b/test/Transforms/SLPVectorizer/X86/arith-sub.ll
@@ -4,7 +4,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
 @b64 = common global [8 x i64] zeroinitializer, align 64
diff --git a/test/Transforms/SLPVectorizer/X86/fabs.ll b/test/Transforms/SLPVectorizer/X86/fabs.ll
index 3ad434ea61cb15a6003300781e51b1506e04ee90..101fd50e3e57c9addf2eae056631611c6a3118c0 100644
--- a/test/Transforms/SLPVectorizer/X86/fabs.ll
+++ b/test/Transforms/SLPVectorizer/X86/fabs.ll
@@ -3,7 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/SLPVectorizer/X86/fcopysign.ll b/test/Transforms/SLPVectorizer/X86/fcopysign.ll
index 1477d43fcffde262085e092547ed963c68b0820d..b304fed3dc9988fedb370ea3fe7bbf7fc5d0239c 100644
--- a/test/Transforms/SLPVectorizer/X86/fcopysign.ll
+++ b/test/Transforms/SLPVectorizer/X86/fcopysign.ll
@@ -3,7 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/SLPVectorizer/X86/fma.ll b/test/Transforms/SLPVectorizer/X86/fma.ll
index b30c340616118bbea7624e5054cc0b01b483893d..95b6a71e5455c8dc19194e404cd7e02a42ad57bc 100644
--- a/test/Transforms/SLPVectorizer/X86/fma.ll
+++ b/test/Transforms/SLPVectorizer/X86/fma.ll
@@ -3,7 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/SLPVectorizer/X86/fptosi.ll b/test/Transforms/SLPVectorizer/X86/fptosi.ll
index 9e95416b4cc48a537b785b0d51ff8bc04d72d98e..69b9f32b73337304d9918492e55a5e9f7b6aaf6a 100644
--- a/test/Transforms/SLPVectorizer/X86/fptosi.ll
+++ b/test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -46,38 +47,47 @@ define void @fptosi_8f64_8i64() #0 {
 ; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @fptosi_8f64_8i64(
-; AVX256-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i64
-; AVX256-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i64
-; AVX256-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i64
-; AVX256-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i64
-; AVX256-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i64
-; AVX256-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i64
-; AVX256-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i64
-; AVX256-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i64
-; AVX256-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @fptosi_8f64_8i64(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i64
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i64
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i64
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i64
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i64
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i64
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i64
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i64
+; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fptosi_8f64_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64>
 ; AVX512-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f64_8i64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -297,38 +307,47 @@ define void @fptosi_8f32_8i64() #0 {
 ; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @fptosi_8f32_8i64(
-; AVX256-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i64
-; AVX256-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i64
-; AVX256-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i64
-; AVX256-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i64
-; AVX256-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i64
-; AVX256-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i64
-; AVX256-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i64
-; AVX256-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i64
-; AVX256-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @fptosi_8f32_8i64(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i64
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i64
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i64
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i64
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i64
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i64
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i64
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i64
+; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fptosi_8f32_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64>
 ; AVX512-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f32_8i64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
@@ -495,10 +514,10 @@ define void @fptosi_8f32_8i8() #0 {
 
 define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %a3) #0 {
 ; CHECK-LABEL: @fptosi_4xf64_4i32(
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double %a0 to i32
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double %a1 to i32
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double %a2 to i32
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double %a3 to i32
+; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0:%.*]] to i32
+; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1:%.*]] to i32
+; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2:%.*]] to i32
+; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3:%.*]] to i32
 ; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0
 ; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
 ; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
@@ -518,10 +537,10 @@ define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %
 
 define <4 x i32> @fptosi_4xf32_4i32(float %a0, float %a1, float %a2, float %a3) #0 {
 ; CHECK-LABEL: @fptosi_4xf32_4i32(
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float %a0 to i32
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float %a1 to i32
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float %a2 to i32
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float %a3 to i32
+; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0:%.*]] to i32
+; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1:%.*]] to i32
+; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2:%.*]] to i32
+; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3:%.*]] to i32
 ; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0
 ; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
 ; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
diff --git a/test/Transforms/SLPVectorizer/X86/fptoui.ll b/test/Transforms/SLPVectorizer/X86/fptoui.ll
index c958ed716c93ac1fc404576cb55817c5bb2d6bb1..33d998ffbbc7e5eaf06823c6b12ff0d1d269500f 100644
--- a/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ b/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -46,38 +47,47 @@ define void @fptoui_8f64_8i64() #0 {
 ; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @fptoui_8f64_8i64(
-; AVX256-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i64
-; AVX256-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i64
-; AVX256-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i64
-; AVX256-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i64
-; AVX256-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i64
-; AVX256-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i64
-; AVX256-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i64
-; AVX256-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i64
-; AVX256-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @fptoui_8f64_8i64(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i64
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i64
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i64
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i64
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i64
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i64
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i64
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i64
+; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fptoui_8f64_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
 ; AVX512-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f64_8i64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -134,38 +144,44 @@ define void @fptoui_8f64_8i32() #0 {
 ; SSE-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @fptoui_8f64_8i32(
-; AVX256-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i32
-; AVX256-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i32
-; AVX256-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i32
-; AVX256-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i32
-; AVX256-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i32
-; AVX256-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i32
-; AVX256-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i32
-; AVX256-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i32
-; AVX256-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX256-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX256-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX256-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX256-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; AVX256-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; AVX256-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; AVX256-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @fptoui_8f64_8i32(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i32
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i32
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i32
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i32
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i32
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i32
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i32
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i32
+; AVX256NODQ-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fptoui_8f64_8i32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
 ; AVX512-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f64_8i32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
+; AVX256DQ-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -222,38 +238,44 @@ define void @fptoui_8f64_8i16() #0 {
 ; SSE-NEXT:    store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @fptoui_8f64_8i16(
-; AVX256-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i16
-; AVX256-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i16
-; AVX256-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i16
-; AVX256-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i16
-; AVX256-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i16
-; AVX256-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i16
-; AVX256-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i16
-; AVX256-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i16
-; AVX256-NEXT:    store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
-; AVX256-NEXT:    store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
-; AVX256-NEXT:    store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
-; AVX256-NEXT:    store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
-; AVX256-NEXT:    store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
-; AVX256-NEXT:    store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
-; AVX256-NEXT:    store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
-; AVX256-NEXT:    store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @fptoui_8f64_8i16(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i16
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i16
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i16
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i16
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i16
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i16
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i16
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i16
+; AVX256NODQ-NEXT:    store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+; AVX256NODQ-NEXT:    store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+; AVX256NODQ-NEXT:    store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+; AVX256NODQ-NEXT:    store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+; AVX256NODQ-NEXT:    store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+; AVX256NODQ-NEXT:    store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+; AVX256NODQ-NEXT:    store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+; AVX256NODQ-NEXT:    store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fptoui_8f64_8i16(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
 ; AVX512-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f64_8i16(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
+; AVX256DQ-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -369,38 +391,47 @@ define void @fptoui_8f32_8i64() #0 {
 ; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @fptoui_8f32_8i64(
-; AVX256-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i64
-; AVX256-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i64
-; AVX256-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i64
-; AVX256-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i64
-; AVX256-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i64
-; AVX256-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i64
-; AVX256-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i64
-; AVX256-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i64
-; AVX256-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX256-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
-; AVX256-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
-; AVX256-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
-; AVX256-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
-; AVX256-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @fptoui_8f32_8i64(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i64
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i64
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i64
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i64
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i64
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i64
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i64
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i64
+; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fptoui_8f32_8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
 ; AVX512-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f32_8i64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
@@ -457,38 +488,44 @@ define void @fptoui_8f32_8i32() #0 {
 ; SSE-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @fptoui_8f32_8i32(
-; AVX256-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i32
-; AVX256-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i32
-; AVX256-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i32
-; AVX256-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i32
-; AVX256-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i32
-; AVX256-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i32
-; AVX256-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i32
-; AVX256-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i32
-; AVX256-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX256-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX256-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX256-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX256-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; AVX256-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; AVX256-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; AVX256-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @fptoui_8f32_8i32(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i32
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i32
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i32
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i32
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i32
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i32
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i32
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i32
+; AVX256NODQ-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fptoui_8f32_8i32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32>
 ; AVX512-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f32_8i32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32>
+; AVX256DQ-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
diff --git a/test/Transforms/SLPVectorizer/X86/fround.ll b/test/Transforms/SLPVectorizer/X86/fround.ll
index c4e6e682b99e93f1e0f5354f664a0f2388d32937..071632b215647f3dd798056d67b20b6dac5a5824 100644
--- a/test/Transforms/SLPVectorizer/X86/fround.ll
+++ b/test/Transforms/SLPVectorizer/X86/fround.ll
@@ -3,7 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE41
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 5685dd151dbe3da85626b9676fa41ca23811fa05..6c3994b0a22b39afb6dba184d125ca27f330e40a 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -1679,7 +1679,7 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
 ; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
-; CHECK-NEXT:    [[R5:%.*]] = add nsw i32 [[R4]], undef
+; CHECK-NEXT:    [[R5:%.*]] = add nsw i32 [[R4]], [[TMP9]]
 ; CHECK-NEXT:    ret i32 [[OP_EXTRA3]]
 ;
 ; THRESHOLD-LABEL: @wobble(
@@ -1707,7 +1707,7 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; THRESHOLD-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
 ; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
 ; THRESHOLD-NEXT:    [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
-; THRESHOLD-NEXT:    [[R5:%.*]] = add nsw i32 [[R4]], undef
+; THRESHOLD-NEXT:    [[R5:%.*]] = add nsw i32 [[R4]], [[TMP9]]
 ; THRESHOLD-NEXT:    ret i32 [[OP_EXTRA3]]
 ;
   bb:
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 122c47a91dec2a57b1b3b353ceb8cefb187ac9f4..d49d557916c9b3489646f2dec55a6d948f10377a 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefix=SKX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,SKX
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
 @arr1 = local_unnamed_addr global [32 x float] zeroinitializer, align 16
@@ -11,113 +11,32 @@
 
 define i32 @maxi8(i32) {
 ; CHECK-LABEL: @maxi8(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; CHECK-NEXT:    ret i32 [[TMP23]]
-;
-; AVX-LABEL: @maxi8(
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; AVX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; AVX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; AVX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; AVX-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; AVX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; AVX-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; AVX-NEXT:    ret i32 [[TMP16]]
-;
-; AVX2-LABEL: @maxi8(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; AVX2-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; AVX2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; AVX2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; AVX2-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; AVX2-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; AVX2-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; AVX2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; AVX2-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; AVX2-NEXT:    ret i32 [[TMP16]]
-;
-; SKX-LABEL: @maxi8(
-; SKX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; SKX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; SKX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; SKX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; SKX-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; SKX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; SKX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; SKX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; SKX-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
-; SKX-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
-; SKX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; SKX-NEXT:    ret i32 [[TMP16]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
+; CHECK-NEXT:    ret i32 [[TMP16]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -191,147 +110,6 @@ define i32 @maxi16(i32) {
 ; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0
 ; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
 ; CHECK-NEXT:    ret i32 [[TMP32]]
-;
-; AVX-LABEL: @maxi16(
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; AVX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; AVX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; AVX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; AVX-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; AVX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; AVX-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; AVX-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; AVX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
-; AVX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
-; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
-; AVX-NEXT:    [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
-; AVX-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
-; AVX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
-; AVX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
-; AVX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
-; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
-; AVX-NEXT:    [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
-; AVX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
-; AVX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], undef
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 undef
-; AVX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], undef
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]]
-; AVX-NEXT:    [[TMP32:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0
-; AVX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
-; AVX-NEXT:    ret i32 [[TMP32]]
-;
-; AVX2-LABEL: @maxi16(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; AVX2-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; AVX2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; AVX2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; AVX2-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; AVX2-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; AVX2-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; AVX2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; AVX2-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; AVX2-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
-; AVX2-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
-; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
-; AVX2-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
-; AVX2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
-; AVX2-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
-; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
-; AVX2-NEXT:    [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
-; AVX2-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
-; AVX2-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], undef
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 undef
-; AVX2-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], undef
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]]
-; AVX2-NEXT:    [[TMP32:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0
-; AVX2-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
-; AVX2-NEXT:    ret i32 [[TMP32]]
-;
-; SKX-LABEL: @maxi16(
-; SKX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; SKX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; SKX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; SKX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; SKX-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; SKX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; SKX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; SKX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; SKX-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; SKX-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; SKX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
-; SKX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
-; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
-; SKX-NEXT:    [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
-; SKX-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
-; SKX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
-; SKX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
-; SKX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
-; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
-; SKX-NEXT:    [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
-; SKX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
-; SKX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], undef
-; SKX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 undef
-; SKX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], undef
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]]
-; SKX-NEXT:    [[TMP32:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0
-; SKX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
-; SKX-NEXT:    ret i32 [[TMP32]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -464,252 +242,6 @@ define i32 @maxi32(i32) {
 ; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
 ; CHECK-NEXT:    [[TMP65:%.*]] = select i1 [[TMP63]], i32 [[TMP62]], i32 undef
 ; CHECK-NEXT:    ret i32 [[TMP64]]
-;
-; AVX-LABEL: @maxi32(
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; AVX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; AVX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; AVX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; AVX-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; AVX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; AVX-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; AVX-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; AVX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
-; AVX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
-; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
-; AVX-NEXT:    [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
-; AVX-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
-; AVX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
-; AVX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
-; AVX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
-; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
-; AVX-NEXT:    [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
-; AVX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
-; AVX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], undef
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 undef
-; AVX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], undef
-; AVX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
-; AVX-NEXT:    [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], undef
-; AVX-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 undef
-; AVX-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP34]], undef
-; AVX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 undef
-; AVX-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP36]], undef
-; AVX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP36]], i32 undef
-; AVX-NEXT:    [[TMP39:%.*]] = icmp sgt i32 [[TMP38]], undef
-; AVX-NEXT:    [[TMP40:%.*]] = select i1 [[TMP39]], i32 [[TMP38]], i32 undef
-; AVX-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP40]], undef
-; AVX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP40]], i32 undef
-; AVX-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP42]], undef
-; AVX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP42]], i32 undef
-; AVX-NEXT:    [[TMP45:%.*]] = icmp sgt i32 [[TMP44]], undef
-; AVX-NEXT:    [[TMP46:%.*]] = select i1 [[TMP45]], i32 [[TMP44]], i32 undef
-; AVX-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP46]], undef
-; AVX-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP46]], i32 undef
-; AVX-NEXT:    [[TMP49:%.*]] = icmp sgt i32 [[TMP48]], undef
-; AVX-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP48]], i32 undef
-; AVX-NEXT:    [[TMP51:%.*]] = icmp sgt i32 [[TMP50]], undef
-; AVX-NEXT:    [[TMP52:%.*]] = select i1 [[TMP51]], i32 [[TMP50]], i32 undef
-; AVX-NEXT:    [[TMP53:%.*]] = icmp sgt i32 [[TMP52]], undef
-; AVX-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP52]], i32 undef
-; AVX-NEXT:    [[TMP55:%.*]] = icmp sgt i32 [[TMP54]], undef
-; AVX-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP54]], i32 undef
-; AVX-NEXT:    [[TMP57:%.*]] = icmp sgt i32 [[TMP56]], undef
-; AVX-NEXT:    [[TMP58:%.*]] = select i1 [[TMP57]], i32 [[TMP56]], i32 undef
-; AVX-NEXT:    [[TMP59:%.*]] = icmp sgt i32 [[TMP58]], undef
-; AVX-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP58]], i32 undef
-; AVX-NEXT:    [[TMP61:%.*]] = icmp sgt i32 [[TMP60]], undef
-; AVX-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP60]], i32 undef
-; AVX-NEXT:    [[TMP63:%.*]] = icmp sgt i32 [[TMP62]], undef
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]]
-; AVX-NEXT:    [[TMP64:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
-; AVX-NEXT:    [[TMP65:%.*]] = select i1 [[TMP63]], i32 [[TMP62]], i32 undef
-; AVX-NEXT:    ret i32 [[TMP64]]
-;
-; AVX2-LABEL: @maxi32(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; AVX2-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; AVX2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; AVX2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; AVX2-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; AVX2-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; AVX2-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; AVX2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; AVX2-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; AVX2-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
-; AVX2-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
-; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
-; AVX2-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
-; AVX2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
-; AVX2-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
-; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
-; AVX2-NEXT:    [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
-; AVX2-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
-; AVX2-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], undef
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 undef
-; AVX2-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], undef
-; AVX2-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
-; AVX2-NEXT:    [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], undef
-; AVX2-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 undef
-; AVX2-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP34]], undef
-; AVX2-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 undef
-; AVX2-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP36]], undef
-; AVX2-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP36]], i32 undef
-; AVX2-NEXT:    [[TMP39:%.*]] = icmp sgt i32 [[TMP38]], undef
-; AVX2-NEXT:    [[TMP40:%.*]] = select i1 [[TMP39]], i32 [[TMP38]], i32 undef
-; AVX2-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP40]], undef
-; AVX2-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP40]], i32 undef
-; AVX2-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP42]], undef
-; AVX2-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP42]], i32 undef
-; AVX2-NEXT:    [[TMP45:%.*]] = icmp sgt i32 [[TMP44]], undef
-; AVX2-NEXT:    [[TMP46:%.*]] = select i1 [[TMP45]], i32 [[TMP44]], i32 undef
-; AVX2-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP46]], undef
-; AVX2-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP46]], i32 undef
-; AVX2-NEXT:    [[TMP49:%.*]] = icmp sgt i32 [[TMP48]], undef
-; AVX2-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP48]], i32 undef
-; AVX2-NEXT:    [[TMP51:%.*]] = icmp sgt i32 [[TMP50]], undef
-; AVX2-NEXT:    [[TMP52:%.*]] = select i1 [[TMP51]], i32 [[TMP50]], i32 undef
-; AVX2-NEXT:    [[TMP53:%.*]] = icmp sgt i32 [[TMP52]], undef
-; AVX2-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP52]], i32 undef
-; AVX2-NEXT:    [[TMP55:%.*]] = icmp sgt i32 [[TMP54]], undef
-; AVX2-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP54]], i32 undef
-; AVX2-NEXT:    [[TMP57:%.*]] = icmp sgt i32 [[TMP56]], undef
-; AVX2-NEXT:    [[TMP58:%.*]] = select i1 [[TMP57]], i32 [[TMP56]], i32 undef
-; AVX2-NEXT:    [[TMP59:%.*]] = icmp sgt i32 [[TMP58]], undef
-; AVX2-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP58]], i32 undef
-; AVX2-NEXT:    [[TMP61:%.*]] = icmp sgt i32 [[TMP60]], undef
-; AVX2-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP60]], i32 undef
-; AVX2-NEXT:    [[TMP63:%.*]] = icmp sgt i32 [[TMP62]], undef
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]]
-; AVX2-NEXT:    [[TMP64:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
-; AVX2-NEXT:    [[TMP65:%.*]] = select i1 [[TMP63]], i32 [[TMP62]], i32 undef
-; AVX2-NEXT:    ret i32 [[TMP64]]
-;
-; SKX-LABEL: @maxi32(
-; SKX-NEXT:    [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = icmp sgt i32 undef, undef
-; SKX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
-; SKX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
-; SKX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
-; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
-; SKX-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
-; SKX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
-; SKX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
-; SKX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
-; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
-; SKX-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
-; SKX-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
-; SKX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
-; SKX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
-; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
-; SKX-NEXT:    [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
-; SKX-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
-; SKX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
-; SKX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
-; SKX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
-; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
-; SKX-NEXT:    [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
-; SKX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
-; SKX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], undef
-; SKX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 undef
-; SKX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], undef
-; SKX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
-; SKX-NEXT:    [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], undef
-; SKX-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 undef
-; SKX-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP34]], undef
-; SKX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 undef
-; SKX-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP36]], undef
-; SKX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP36]], i32 undef
-; SKX-NEXT:    [[TMP39:%.*]] = icmp sgt i32 [[TMP38]], undef
-; SKX-NEXT:    [[TMP40:%.*]] = select i1 [[TMP39]], i32 [[TMP38]], i32 undef
-; SKX-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP40]], undef
-; SKX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP40]], i32 undef
-; SKX-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP42]], undef
-; SKX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP42]], i32 undef
-; SKX-NEXT:    [[TMP45:%.*]] = icmp sgt i32 [[TMP44]], undef
-; SKX-NEXT:    [[TMP46:%.*]] = select i1 [[TMP45]], i32 [[TMP44]], i32 undef
-; SKX-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP46]], undef
-; SKX-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP46]], i32 undef
-; SKX-NEXT:    [[TMP49:%.*]] = icmp sgt i32 [[TMP48]], undef
-; SKX-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP48]], i32 undef
-; SKX-NEXT:    [[TMP51:%.*]] = icmp sgt i32 [[TMP50]], undef
-; SKX-NEXT:    [[TMP52:%.*]] = select i1 [[TMP51]], i32 [[TMP50]], i32 undef
-; SKX-NEXT:    [[TMP53:%.*]] = icmp sgt i32 [[TMP52]], undef
-; SKX-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP52]], i32 undef
-; SKX-NEXT:    [[TMP55:%.*]] = icmp sgt i32 [[TMP54]], undef
-; SKX-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP54]], i32 undef
-; SKX-NEXT:    [[TMP57:%.*]] = icmp sgt i32 [[TMP56]], undef
-; SKX-NEXT:    [[TMP58:%.*]] = select i1 [[TMP57]], i32 [[TMP56]], i32 undef
-; SKX-NEXT:    [[TMP59:%.*]] = icmp sgt i32 [[TMP58]], undef
-; SKX-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP58]], i32 undef
-; SKX-NEXT:    [[TMP61:%.*]] = icmp sgt i32 [[TMP60]], undef
-; SKX-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP60]], i32 undef
-; SKX-NEXT:    [[TMP63:%.*]] = icmp sgt i32 [[TMP62]], undef
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]]
-; SKX-NEXT:    [[TMP64:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
-; SKX-NEXT:    [[TMP65:%.*]] = select i1 [[TMP63]], i32 [[TMP62]], i32 undef
-; SKX-NEXT:    ret i32 [[TMP64]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -809,30 +341,30 @@ define i32 @maxi32(i32) {
 }
 
 define float @maxf8(float) {
-; CHECK-LABEL: @maxf8(
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
-; CHECK-NEXT:    ret float [[TMP23]]
+; SSE-LABEL: @maxf8(
+; SSE-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; SSE-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; SSE-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; SSE-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; SSE-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; SSE-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; SSE-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; SSE-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; SSE-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; SSE-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; SSE-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; SSE-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; SSE-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; SSE-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; SSE-NEXT:    ret float [[TMP23]]
 ;
 ; AVX-LABEL: @maxf8(
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
@@ -944,54 +476,54 @@ define float @maxf8(float) {
 }
 
 define float @maxf16(float) {
-; CHECK-LABEL: @maxf16(
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
-; CHECK-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
-; CHECK-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
-; CHECK-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
-; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
-; CHECK-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
-; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
-; CHECK-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
-; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
-; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
-; CHECK-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
-; CHECK-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
-; CHECK-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
-; CHECK-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
-; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
-; CHECK-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
-; CHECK-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
-; CHECK-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
-; CHECK-NEXT:    ret float [[TMP47]]
+; SSE-LABEL: @maxf16(
+; SSE-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; SSE-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; SSE-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; SSE-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; SSE-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; SSE-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; SSE-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; SSE-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; SSE-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; SSE-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; SSE-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; SSE-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; SSE-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; SSE-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; SSE-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; SSE-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; SSE-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; SSE-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; SSE-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; SSE-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; SSE-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; SSE-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; SSE-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; SSE-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; SSE-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; SSE-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; SSE-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; SSE-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; SSE-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; SSE-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; SSE-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; SSE-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; SSE-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; SSE-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; SSE-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; SSE-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; SSE-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; SSE-NEXT:    ret float [[TMP47]]
 ;
 ; AVX-LABEL: @maxf16(
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
@@ -1185,347 +717,86 @@ define float @maxf16(float) {
 
 define float @maxf32(float) {
 ; CHECK-LABEL: @maxf32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
-; CHECK-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
-; CHECK-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
-; CHECK-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
-; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
-; CHECK-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
-; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
-; CHECK-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
-; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
-; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
-; CHECK-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
-; CHECK-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
-; CHECK-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
-; CHECK-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
-; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
-; CHECK-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
-; CHECK-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
-; CHECK-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
-; CHECK-NEXT:    [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
-; CHECK-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
-; CHECK-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
-; CHECK-NEXT:    [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
-; CHECK-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
-; CHECK-NEXT:    [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
-; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
-; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
-; CHECK-NEXT:    [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
-; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
-; CHECK-NEXT:    [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
-; CHECK-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
-; CHECK-NEXT:    [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
-; CHECK-NEXT:    [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
-; CHECK-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
-; CHECK-NEXT:    [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
-; CHECK-NEXT:    [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
-; CHECK-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
-; CHECK-NEXT:    [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
-; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
-; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
-; CHECK-NEXT:    [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
-; CHECK-NEXT:    [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
-; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
-; CHECK-NEXT:    [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
-; CHECK-NEXT:    [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
-; CHECK-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
-; CHECK-NEXT:    [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
-; CHECK-NEXT:    [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
-; CHECK-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
-; CHECK-NEXT:    [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
-; CHECK-NEXT:    [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
-; CHECK-NEXT:    [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
-; CHECK-NEXT:    [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
-; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
-; CHECK-NEXT:    [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
-; CHECK-NEXT:    [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
-; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
-; CHECK-NEXT:    [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
-; CHECK-NEXT:    [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
-; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
-; CHECK-NEXT:    [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
-; CHECK-NEXT:    [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
-; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
-; CHECK-NEXT:    ret float [[TMP95]]
-;
-; AVX-LABEL: @maxf32(
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; AVX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; AVX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; AVX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; AVX-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; AVX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; AVX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; AVX-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; AVX-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; AVX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
-; AVX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
-; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
-; AVX-NEXT:    [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
-; AVX-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
-; AVX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
-; AVX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
-; AVX-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
-; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
-; AVX-NEXT:    [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
-; AVX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
-; AVX-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
-; AVX-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
-; AVX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
-; AVX-NEXT:    [[TMP33:%.*]] = fcmp fast ogt float [[TMP32]], undef
-; AVX-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], float [[TMP32]], float undef
-; AVX-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP34]], undef
-; AVX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP34]], float undef
-; AVX-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP36]], undef
-; AVX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP36]], float undef
-; AVX-NEXT:    [[TMP39:%.*]] = fcmp fast ogt float [[TMP38]], undef
-; AVX-NEXT:    [[TMP40:%.*]] = select i1 [[TMP39]], float [[TMP38]], float undef
-; AVX-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP40]], undef
-; AVX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP40]], float undef
-; AVX-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP42]], undef
-; AVX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP42]], float undef
-; AVX-NEXT:    [[TMP45:%.*]] = fcmp fast ogt float [[TMP44]], undef
-; AVX-NEXT:    [[TMP46:%.*]] = select i1 [[TMP45]], float [[TMP44]], float undef
-; AVX-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP46]], undef
-; AVX-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP46]], float undef
-; AVX-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP48]], undef
-; AVX-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP48]], float undef
-; AVX-NEXT:    [[TMP51:%.*]] = fcmp fast ogt float [[TMP50]], undef
-; AVX-NEXT:    [[TMP52:%.*]] = select i1 [[TMP51]], float [[TMP50]], float undef
-; AVX-NEXT:    [[TMP53:%.*]] = fcmp fast ogt float [[TMP52]], undef
-; AVX-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP52]], float undef
-; AVX-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP54]], undef
-; AVX-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP54]], float undef
-; AVX-NEXT:    [[TMP57:%.*]] = fcmp fast ogt float [[TMP56]], undef
-; AVX-NEXT:    [[TMP58:%.*]] = select i1 [[TMP57]], float [[TMP56]], float undef
-; AVX-NEXT:    [[TMP59:%.*]] = fcmp fast ogt float [[TMP58]], undef
-; AVX-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP58]], float undef
-; AVX-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP60]], undef
-; AVX-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP60]], float undef
-; AVX-NEXT:    [[TMP63:%.*]] = fcmp fast ogt float [[TMP62]], undef
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]]
-; AVX-NEXT:    [[TMP64:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0
-; AVX-NEXT:    [[TMP65:%.*]] = select i1 [[TMP63]], float [[TMP62]], float undef
-; AVX-NEXT:    ret float [[TMP64]]
-;
-; AVX2-LABEL: @maxf32(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; AVX2-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; AVX2-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; AVX2-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; AVX2-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; AVX2-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; AVX2-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; AVX2-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; AVX2-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; AVX2-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; AVX2-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
-; AVX2-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
-; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
-; AVX2-NEXT:    [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
-; AVX2-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
-; AVX2-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
-; AVX2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
-; AVX2-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
-; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
-; AVX2-NEXT:    [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
-; AVX2-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
-; AVX2-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
-; AVX2-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
-; AVX2-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
-; AVX2-NEXT:    [[TMP33:%.*]] = fcmp fast ogt float [[TMP32]], undef
-; AVX2-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], float [[TMP32]], float undef
-; AVX2-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP34]], undef
-; AVX2-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP34]], float undef
-; AVX2-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP36]], undef
-; AVX2-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP36]], float undef
-; AVX2-NEXT:    [[TMP39:%.*]] = fcmp fast ogt float [[TMP38]], undef
-; AVX2-NEXT:    [[TMP40:%.*]] = select i1 [[TMP39]], float [[TMP38]], float undef
-; AVX2-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP40]], undef
-; AVX2-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP40]], float undef
-; AVX2-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP42]], undef
-; AVX2-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP42]], float undef
-; AVX2-NEXT:    [[TMP45:%.*]] = fcmp fast ogt float [[TMP44]], undef
-; AVX2-NEXT:    [[TMP46:%.*]] = select i1 [[TMP45]], float [[TMP44]], float undef
-; AVX2-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP46]], undef
-; AVX2-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP46]], float undef
-; AVX2-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP48]], undef
-; AVX2-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP48]], float undef
-; AVX2-NEXT:    [[TMP51:%.*]] = fcmp fast ogt float [[TMP50]], undef
-; AVX2-NEXT:    [[TMP52:%.*]] = select i1 [[TMP51]], float [[TMP50]], float undef
-; AVX2-NEXT:    [[TMP53:%.*]] = fcmp fast ogt float [[TMP52]], undef
-; AVX2-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP52]], float undef
-; AVX2-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP54]], undef
-; AVX2-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP54]], float undef
-; AVX2-NEXT:    [[TMP57:%.*]] = fcmp fast ogt float [[TMP56]], undef
-; AVX2-NEXT:    [[TMP58:%.*]] = select i1 [[TMP57]], float [[TMP56]], float undef
-; AVX2-NEXT:    [[TMP59:%.*]] = fcmp fast ogt float [[TMP58]], undef
-; AVX2-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP58]], float undef
-; AVX2-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP60]], undef
-; AVX2-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP60]], float undef
-; AVX2-NEXT:    [[TMP63:%.*]] = fcmp fast ogt float [[TMP62]], undef
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]]
-; AVX2-NEXT:    [[TMP64:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0
-; AVX2-NEXT:    [[TMP65:%.*]] = select i1 [[TMP63]], float [[TMP62]], float undef
-; AVX2-NEXT:    ret float [[TMP64]]
-;
-; SKX-LABEL: @maxf32(
-; SKX-NEXT:    [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; SKX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; SKX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; SKX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; SKX-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; SKX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; SKX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; SKX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; SKX-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; SKX-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; SKX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
-; SKX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
-; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
-; SKX-NEXT:    [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
-; SKX-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
-; SKX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
-; SKX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
-; SKX-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
-; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
-; SKX-NEXT:    [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
-; SKX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
-; SKX-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
-; SKX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
-; SKX-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
-; SKX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
-; SKX-NEXT:    [[TMP33:%.*]] = fcmp fast ogt float [[TMP32]], undef
-; SKX-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], float [[TMP32]], float undef
-; SKX-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP34]], undef
-; SKX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP34]], float undef
-; SKX-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP36]], undef
-; SKX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP36]], float undef
-; SKX-NEXT:    [[TMP39:%.*]] = fcmp fast ogt float [[TMP38]], undef
-; SKX-NEXT:    [[TMP40:%.*]] = select i1 [[TMP39]], float [[TMP38]], float undef
-; SKX-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP40]], undef
-; SKX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP40]], float undef
-; SKX-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP42]], undef
-; SKX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP42]], float undef
-; SKX-NEXT:    [[TMP45:%.*]] = fcmp fast ogt float [[TMP44]], undef
-; SKX-NEXT:    [[TMP46:%.*]] = select i1 [[TMP45]], float [[TMP44]], float undef
-; SKX-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP46]], undef
-; SKX-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP46]], float undef
-; SKX-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP48]], undef
-; SKX-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP48]], float undef
-; SKX-NEXT:    [[TMP51:%.*]] = fcmp fast ogt float [[TMP50]], undef
-; SKX-NEXT:    [[TMP52:%.*]] = select i1 [[TMP51]], float [[TMP50]], float undef
-; SKX-NEXT:    [[TMP53:%.*]] = fcmp fast ogt float [[TMP52]], undef
-; SKX-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP52]], float undef
-; SKX-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP54]], undef
-; SKX-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP54]], float undef
-; SKX-NEXT:    [[TMP57:%.*]] = fcmp fast ogt float [[TMP56]], undef
-; SKX-NEXT:    [[TMP58:%.*]] = select i1 [[TMP57]], float [[TMP56]], float undef
-; SKX-NEXT:    [[TMP59:%.*]] = fcmp fast ogt float [[TMP58]], undef
-; SKX-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP58]], float undef
-; SKX-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP60]], undef
-; SKX-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP60]], float undef
-; SKX-NEXT:    [[TMP63:%.*]] = fcmp fast ogt float [[TMP62]], undef
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]]
-; SKX-NEXT:    [[TMP64:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0
-; SKX-NEXT:    [[TMP65:%.*]] = select i1 [[TMP63]], float [[TMP62]], float undef
-; SKX-NEXT:    ret float [[TMP64]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
+; CHECK-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
+; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
+; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
+; CHECK-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
+; CHECK-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
+; CHECK-NEXT:    [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
+; CHECK-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
+; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
+; CHECK-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
+; CHECK-NEXT:    [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
+; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
+; CHECK-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
+; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
+; CHECK-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
+; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
+; CHECK-NEXT:    [[TMP33:%.*]] = fcmp fast ogt float [[TMP32]], undef
+; CHECK-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], float [[TMP32]], float undef
+; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP34]], undef
+; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP34]], float undef
+; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP36]], undef
+; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP36]], float undef
+; CHECK-NEXT:    [[TMP39:%.*]] = fcmp fast ogt float [[TMP38]], undef
+; CHECK-NEXT:    [[TMP40:%.*]] = select i1 [[TMP39]], float [[TMP38]], float undef
+; CHECK-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP40]], undef
+; CHECK-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP40]], float undef
+; CHECK-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP42]], undef
+; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP42]], float undef
+; CHECK-NEXT:    [[TMP45:%.*]] = fcmp fast ogt float [[TMP44]], undef
+; CHECK-NEXT:    [[TMP46:%.*]] = select i1 [[TMP45]], float [[TMP44]], float undef
+; CHECK-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP46]], undef
+; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP46]], float undef
+; CHECK-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP48]], undef
+; CHECK-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP48]], float undef
+; CHECK-NEXT:    [[TMP51:%.*]] = fcmp fast ogt float [[TMP50]], undef
+; CHECK-NEXT:    [[TMP52:%.*]] = select i1 [[TMP51]], float [[TMP50]], float undef
+; CHECK-NEXT:    [[TMP53:%.*]] = fcmp fast ogt float [[TMP52]], undef
+; CHECK-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP52]], float undef
+; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP54]], undef
+; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP54]], float undef
+; CHECK-NEXT:    [[TMP57:%.*]] = fcmp fast ogt float [[TMP56]], undef
+; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP57]], float [[TMP56]], float undef
+; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast ogt float [[TMP58]], undef
+; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP58]], float undef
+; CHECK-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP60]], undef
+; CHECK-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP60]], float undef
+; CHECK-NEXT:    [[TMP63:%.*]] = fcmp fast ogt float [[TMP62]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> [[RDX_SHUF7]]
+; CHECK-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]]
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0
+; CHECK-NEXT:    [[TMP65:%.*]] = select i1 [[TMP63]], float [[TMP62]], float undef
+; CHECK-NEXT:    ret float [[TMP64]]
 ;
   %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
   %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
@@ -1625,32 +896,40 @@ define float @maxf32(float) {
 }
 
 define i32 @maxi8_mutiple_uses(i32) {
-; CHECK-LABEL: @maxi8_mutiple_uses(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; CHECK-NEXT:    store i32 [[TMP24]], i32* @var, align 8
-; CHECK-NEXT:    ret i32 [[TMP23]]
+; SSE-LABEL: @maxi8_mutiple_uses(
+; SSE-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], undef
+; SSE-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 undef
+; SSE-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; SSE-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; SSE-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; SSE-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; SSE-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; SSE-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; SSE-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SSE-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SSE-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; SSE-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; SSE-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
+; SSE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; SSE-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; SSE-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; SSE-NEXT:    [[TMP18:%.*]] = icmp sgt i32 [[TMP17]], [[TMP15]]
+; SSE-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP17]], i32 [[TMP15]]
+; SSE-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], [[TMP5]]
+; SSE-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP5]]
+; SSE-NEXT:    [[TMP21:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SSE-NEXT:    [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP22]]
+; SSE-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[OP_EXTRA]], i32 [[TMP22]]
+; SSE-NEXT:    [[TMP25:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; SSE-NEXT:    store i32 [[TMP25]], i32* @var, align 8
+; SSE-NEXT:    ret i32 [[TMP24]]
 ;
 ; AVX-LABEL: @maxi8_mutiple_uses(
 ; AVX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -1791,32 +1070,42 @@ define i32 @maxi8_mutiple_uses(i32) {
 }
 
 define i32 @maxi8_wrong_parent(i32) {
-; CHECK-LABEL: @maxi8_wrong_parent(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    br label [[PP:%.*]]
-; CHECK:       pp:
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; CHECK-NEXT:    ret i32 [[TMP23]]
+; SSE-LABEL: @maxi8_wrong_parent(
+; SSE-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE-NEXT:    br label [[PP:%.*]]
+; SSE:       pp:
+; SSE-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], undef
+; SSE-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 undef
+; SSE-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; SSE-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; SSE-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; SSE-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; SSE-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; SSE-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; SSE-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SSE-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SSE-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SSE-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; SSE-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; SSE-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; SSE-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
+; SSE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; SSE-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; SSE-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; SSE-NEXT:    [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], [[TMP15]]
+; SSE-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 [[TMP15]]
+; SSE-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], [[TMP18]]
+; SSE-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP18]]
+; SSE-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], [[TMP5]]
+; SSE-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP5]]
+; SSE-NEXT:    [[TMP26:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; SSE-NEXT:    ret i32 [[OP_EXTRA]]
 ;
 ; AVX-LABEL: @maxi8_wrong_parent(
 ; AVX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -1967,30 +1256,30 @@ pp:
 
 ; PR38191 - We don't handle array-of-pointer reductions.
 define i32* @maxp8(i32) {
-; CHECK-LABEL: @maxp8(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i32* [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32* [[TMP2]], i32* [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32* [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32* [[TMP5]], i32* [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i32* [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32* [[TMP8]], i32* [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i32* [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32* [[TMP11]], i32* [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ugt i32* [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32* [[TMP14]], i32* [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp ugt i32* [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32* [[TMP17]], i32* [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp ugt i32* [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32* [[TMP20]], i32* [[TMP21]]
-; CHECK-NEXT:    ret i32* [[TMP23]]
+; SSE-LABEL: @maxp8(
+; SSE-NEXT:    [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = icmp ugt i32* [[TMP2]], [[TMP3]]
+; SSE-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32* [[TMP2]], i32* [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 2), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = icmp ugt i32* [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32* [[TMP5]], i32* [[TMP6]]
+; SSE-NEXT:    [[TMP9:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 3), align 4
+; SSE-NEXT:    [[TMP10:%.*]] = icmp ugt i32* [[TMP8]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32* [[TMP8]], i32* [[TMP9]]
+; SSE-NEXT:    [[TMP12:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 4), align 16
+; SSE-NEXT:    [[TMP13:%.*]] = icmp ugt i32* [[TMP11]], [[TMP12]]
+; SSE-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32* [[TMP11]], i32* [[TMP12]]
+; SSE-NEXT:    [[TMP15:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 5), align 4
+; SSE-NEXT:    [[TMP16:%.*]] = icmp ugt i32* [[TMP14]], [[TMP15]]
+; SSE-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32* [[TMP14]], i32* [[TMP15]]
+; SSE-NEXT:    [[TMP18:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 6), align 8
+; SSE-NEXT:    [[TMP19:%.*]] = icmp ugt i32* [[TMP17]], [[TMP18]]
+; SSE-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32* [[TMP17]], i32* [[TMP18]]
+; SSE-NEXT:    [[TMP21:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP22:%.*]] = icmp ugt i32* [[TMP20]], [[TMP21]]
+; SSE-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32* [[TMP20]], i32* [[TMP21]]
+; SSE-NEXT:    ret i32* [[TMP23]]
 ;
 ; AVX-LABEL: @maxp8(
 ; AVX-NEXT:    [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16
diff --git a/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
index 80c9044e80a26ff6697198a14d4c30b49820a70e..4dd40876a70a8b56f226d5fd8eff1c1a5200cff9 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -debug < %s 2>&1 | FileCheck %s
-; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -debug < %s 2>&1 | FileCheck --check-prefix=SSE2 %s
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -debug < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -debug < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE
 ; REQUIRES: asserts
 
 ; int test_add(unsigned int *p) {
@@ -11,9 +11,9 @@
 ; }
 
 ; Vector cost is 5, Scalar cost is 7
-; CHECK: Adding cost -2 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
-; Vector cost is 11, Scalar cost is 7
-; SSE2:  Adding cost 4 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; AVX: Adding cost -2 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; Vector cost is 6, Scalar cost is 7
+; SSE: Adding cost -1 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
 define i32 @test_add(i32* nocapture readonly %p) {
 ; CHECK-LABEL: @test_add(
 ; CHECK-NEXT:  entry:
@@ -42,33 +42,6 @@ define i32 @test_add(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[MUL_714:%.*]] = add i32 undef, [[MUL_613]]
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
-; SSE2-LABEL: @test_add(
-; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
-; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
-; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; SSE2-NEXT:    [[MUL_18:%.*]] = add i32 undef, undef
-; SSE2-NEXT:    [[MUL_29:%.*]] = add i32 undef, [[MUL_18]]
-; SSE2-NEXT:    [[MUL_310:%.*]] = add i32 undef, [[MUL_29]]
-; SSE2-NEXT:    [[MUL_411:%.*]] = add i32 undef, [[MUL_310]]
-; SSE2-NEXT:    [[MUL_512:%.*]] = add i32 undef, [[MUL_411]]
-; SSE2-NEXT:    [[MUL_613:%.*]] = add i32 undef, [[MUL_512]]
-; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]]
-; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; SSE2-NEXT:    [[MUL_714:%.*]] = add i32 undef, [[MUL_613]]
-; SSE2-NEXT:    ret i32 [[TMP2]]
-;
 entry:
   %0 = load i32, i32* %p, align 4
   %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
@@ -129,32 +102,6 @@ define i32 @test_mul(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
 ; CHECK-NEXT:    ret i32 [[MUL_714]]
 ;
-; SSE2-LABEL: @test_mul(
-; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; SSE2-NEXT:    [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
-; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; SSE2-NEXT:    [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
-; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; SSE2-NEXT:    [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
-; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; SSE2-NEXT:    [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
-; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; SSE2-NEXT:    [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
-; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; SSE2-NEXT:    [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
-; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; SSE2-NEXT:    [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
-; SSE2-NEXT:    ret i32 [[MUL_714]]
-;
 entry:
   %0 = load i32, i32* %p, align 4
   %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
@@ -191,56 +138,30 @@ entry:
 define i32 @test_and(i32* nocapture readonly %p) {
 ; CHECK-LABEL: @test_and(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[MUL_18:%.*]] = and i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; CHECK-NEXT:    [[MUL_29:%.*]] = and i32 [[TMP2]], [[MUL_18]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; CHECK-NEXT:    [[MUL_310:%.*]] = and i32 [[TMP3]], [[MUL_29]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[MUL_411:%.*]] = and i32 [[TMP4]], [[MUL_310]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; CHECK-NEXT:    [[MUL_512:%.*]] = and i32 [[TMP5]], [[MUL_411]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; CHECK-NEXT:    [[MUL_613:%.*]] = and i32 [[TMP6]], [[MUL_512]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; CHECK-NEXT:    [[MUL_714:%.*]] = and i32 [[TMP7]], [[MUL_613]]
-; CHECK-NEXT:    ret i32 [[MUL_714]]
-;
-; SSE2-LABEL: @test_and(
-; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
-; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
-; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; SSE2-NEXT:    [[MUL_18:%.*]] = and i32 undef, undef
-; SSE2-NEXT:    [[MUL_29:%.*]] = and i32 undef, [[MUL_18]]
-; SSE2-NEXT:    [[MUL_310:%.*]] = and i32 undef, [[MUL_29]]
-; SSE2-NEXT:    [[MUL_411:%.*]] = and i32 undef, [[MUL_310]]
-; SSE2-NEXT:    [[MUL_512:%.*]] = and i32 undef, [[MUL_411]]
-; SSE2-NEXT:    [[MUL_613:%.*]] = and i32 undef, [[MUL_512]]
-; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]]
-; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; SSE2-NEXT:    [[MUL_714:%.*]] = and i32 undef, [[MUL_613]]
-; SSE2-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = and i32 undef, undef
+; CHECK-NEXT:    [[MUL_29:%.*]] = and i32 undef, [[MUL_18]]
+; CHECK-NEXT:    [[MUL_310:%.*]] = and i32 undef, [[MUL_29]]
+; CHECK-NEXT:    [[MUL_411:%.*]] = and i32 undef, [[MUL_310]]
+; CHECK-NEXT:    [[MUL_512:%.*]] = and i32 undef, [[MUL_411]]
+; CHECK-NEXT:    [[MUL_613:%.*]] = and i32 undef, [[MUL_512]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[MUL_714:%.*]] = and i32 undef, [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %0 = load i32, i32* %p, align 4
@@ -278,56 +199,30 @@ entry:
 define i32 @test_or(i32* nocapture readonly %p) {
 ; CHECK-LABEL: @test_or(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[MUL_18:%.*]] = or i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; CHECK-NEXT:    [[MUL_29:%.*]] = or i32 [[TMP2]], [[MUL_18]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; CHECK-NEXT:    [[MUL_310:%.*]] = or i32 [[TMP3]], [[MUL_29]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[MUL_411:%.*]] = or i32 [[TMP4]], [[MUL_310]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; CHECK-NEXT:    [[MUL_512:%.*]] = or i32 [[TMP5]], [[MUL_411]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; CHECK-NEXT:    [[MUL_613:%.*]] = or i32 [[TMP6]], [[MUL_512]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; CHECK-NEXT:    [[MUL_714:%.*]] = or i32 [[TMP7]], [[MUL_613]]
-; CHECK-NEXT:    ret i32 [[MUL_714]]
-;
-; SSE2-LABEL: @test_or(
-; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
-; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
-; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; SSE2-NEXT:    [[MUL_18:%.*]] = or i32 undef, undef
-; SSE2-NEXT:    [[MUL_29:%.*]] = or i32 undef, [[MUL_18]]
-; SSE2-NEXT:    [[MUL_310:%.*]] = or i32 undef, [[MUL_29]]
-; SSE2-NEXT:    [[MUL_411:%.*]] = or i32 undef, [[MUL_310]]
-; SSE2-NEXT:    [[MUL_512:%.*]] = or i32 undef, [[MUL_411]]
-; SSE2-NEXT:    [[MUL_613:%.*]] = or i32 undef, [[MUL_512]]
-; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]]
-; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; SSE2-NEXT:    [[MUL_714:%.*]] = or i32 undef, [[MUL_613]]
-; SSE2-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = or i32 undef, undef
+; CHECK-NEXT:    [[MUL_29:%.*]] = or i32 undef, [[MUL_18]]
+; CHECK-NEXT:    [[MUL_310:%.*]] = or i32 undef, [[MUL_29]]
+; CHECK-NEXT:    [[MUL_411:%.*]] = or i32 undef, [[MUL_310]]
+; CHECK-NEXT:    [[MUL_512:%.*]] = or i32 undef, [[MUL_411]]
+; CHECK-NEXT:    [[MUL_613:%.*]] = or i32 undef, [[MUL_512]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[MUL_714:%.*]] = or i32 undef, [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %0 = load i32, i32* %p, align 4
@@ -365,56 +260,30 @@ entry:
 define i32 @test_xor(i32* nocapture readonly %p) {
 ; CHECK-LABEL: @test_xor(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[MUL_18:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; CHECK-NEXT:    [[MUL_29:%.*]] = xor i32 [[TMP2]], [[MUL_18]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; CHECK-NEXT:    [[MUL_310:%.*]] = xor i32 [[TMP3]], [[MUL_29]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[MUL_411:%.*]] = xor i32 [[TMP4]], [[MUL_310]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; CHECK-NEXT:    [[MUL_512:%.*]] = xor i32 [[TMP5]], [[MUL_411]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; CHECK-NEXT:    [[MUL_613:%.*]] = xor i32 [[TMP6]], [[MUL_512]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; CHECK-NEXT:    [[MUL_714:%.*]] = xor i32 [[TMP7]], [[MUL_613]]
-; CHECK-NEXT:    ret i32 [[MUL_714]]
-;
-; SSE2-LABEL: @test_xor(
-; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
-; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
-; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; SSE2-NEXT:    [[MUL_18:%.*]] = xor i32 undef, undef
-; SSE2-NEXT:    [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]]
-; SSE2-NEXT:    [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]]
-; SSE2-NEXT:    [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]]
-; SSE2-NEXT:    [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]]
-; SSE2-NEXT:    [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]]
-; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]]
-; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT:    [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; SSE2-NEXT:    [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]]
-; SSE2-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = xor i32 undef, undef
+; CHECK-NEXT:    [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]]
+; CHECK-NEXT:    [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]]
+; CHECK-NEXT:    [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]]
+; CHECK-NEXT:    [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]]
+; CHECK-NEXT:    [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %0 = load i32, i32* %p, align 4
@@ -441,3 +310,43 @@ entry:
   %mul.714 = xor i32 %7, %mul.613
   ret i32 %mul.714
 }
+
+define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unnamed_addr #0 {
+; CHECK-LABEL: @PR37731(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 6, i32 2, i32 13, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], <i32 13, i32 27, i32 21, i32 12>
+; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i32> [[TMP0]], <i32 -2, i32 -8, i32 -16, i32 -128>
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], <i32 18, i32 2, i32 7, i32 13>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 undef, undef
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], undef
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %self, align 16
+  %1 = shl <4 x i32> %0, <i32 6, i32 2, i32 13, i32 3>
+  %2 = xor <4 x i32> %1, %0
+  %3 = lshr <4 x i32> %2, <i32 13, i32 27, i32 21, i32 12>
+  %4 = and <4 x i32> %0, <i32 -2, i32 -8, i32 -16, i32 -128>
+  %5 = shl <4 x i32> %4, <i32 18, i32 2, i32 7, i32 13>
+  %6 = xor <4 x i32> %3, %5
+  store <4 x i32> %6, <4 x i32>* %self, align 16
+  %7 = extractelement <4 x i32> %6, i32 0
+  %8 = extractelement <4 x i32> %6, i32 1
+  %9 = xor i32 %7, %8
+  %10 = extractelement <4 x i32> %6, i32 2
+  %11 = xor i32 %9, %10
+  %12 = extractelement <4 x i32> %6, i32 3
+  %13 = xor i32 %11, %12
+  ret i32 %13
+}
diff --git a/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
index e108c4960c0915509f83550b344cb318e446ece5..96c8d7f3b68753af8f5457497b8450c1d45441ca 100644
--- a/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -56,19 +56,28 @@ for.body:                                         ; preds = %for.body, %entry
   
   %add52 = add nsw i32 %add38, %add45
  ; CHECK: add nsw <{{[0-9]+}} x i32>
- ; CHECK-NOT: add nsw <{{[0-9]+}} x i32>
+ ; CHECK: add nsw <{{[0-9]+}} x i32>
  
- ; YAML:      --- !Missed
+ ; YAML:      --- !Passed
  ; YAML-NEXT: Pass:            slp-vectorizer
- ; YAML-NEXT: Name:            HorSLPNotBeneficial
+ ; YAML-NEXT: Name:            StoresVectorized
  ; YAML-NEXT: Function:        foo
  ; YAML-NEXT: Args:
- ; YAML-NEXT:   - String:          Vectorizing horizontal reduction is possible
- ; YAML-NEXT:   - String:          'but not beneficial with cost ' 
- ; YAML-NEXT:   - Cost:            '1'
- ; YAML-NEXT:   - String:          ' and threshold '
- ; YAML-NEXT:   - Threshold:       '0'
+ ; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
+ ; YAML-NEXT:   - Cost:            '-8'
+ ; YAML-NEXT:   - String:          ' and with tree size '
+ ; YAML-NEXT:   - TreeSize:        '4'
 
+ ; YAML:      --- !Passed
+ ; YAML-NEXT: Pass:            slp-vectorizer
+ ; YAML-NEXT: Name:            VectorizedHorizontalReduction
+ ; YAML-NEXT: Function:        foo
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+ ; YAML-NEXT:   - Cost:            '-2'
+ ; YAML-NEXT:   - String:          ' and with tree size '
+ ; YAML-NEXT:   - TreeSize:        '1'
+ 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 8
   br i1 %exitcond, label %for.end, label %for.body
diff --git a/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
index 646f599ce34001cf0f162402562584706d004c2e..e4b71ba974d1b0b8b5300b6e6e82408ad91aa240 100644
--- a/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
+++ b/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
@@ -3,7 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
diff --git a/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
index 6fd78e7c9699a454ff6875d981360151e91547da..237673da5a9c59ce1656cf654144bd76e9a815cb 100644
--- a/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
+++ b/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -3,7 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
diff --git a/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/test/Transforms/SLPVectorizer/X86/shift-shl.ll
index 70de82bdea5f6611d5dea4be0d7304749ed7616f..8eadd04d1d0b7e8aa13286cbc1e3888ea85a5904 100644
--- a/test/Transforms/SLPVectorizer/X86/shift-shl.ll
+++ b/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -3,7 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
diff --git a/test/Transforms/SLPVectorizer/X86/sitofp.ll b/test/Transforms/SLPVectorizer/X86/sitofp.ll
index 3b0d33877840f19df53080bdc8983fa6c137d191..d4c4be5bbf6a4c3c1f7197b243ee5eb3a3552e17 100644
--- a/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -29,20 +30,26 @@ define void @sitofp_2i64_2f64() #0 {
 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @sitofp_2i64_2f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @sitofp_2i64_2f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @sitofp_2i64_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
 ; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_2i64_2f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -69,26 +76,32 @@ define void @sitofp_4i64_4f64() #0 {
 ; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @sitofp_4i64_4f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
-; AVX256-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
-; AVX256-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; AVX256-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @sitofp_4i64_4f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @sitofp_4i64_4f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
 ; AVX512-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_4i64_4f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256DQ-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -133,38 +146,47 @@ define void @sitofp_8i64_8f64() #0 {
 ; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @sitofp_8i64_8f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX256-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX256-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
-; AVX256-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
-; AVX256-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
-; AVX256-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
-; AVX256-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
-; AVX256-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
-; AVX256-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; AVX256-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; AVX256-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; AVX256-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; AVX256-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @sitofp_8i64_8f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @sitofp_8i64_8f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>
 ; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_8i64_8f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256DQ-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256DQ-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -610,26 +632,32 @@ define void @sitofp_4i64_4f32() #0 {
 ; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @sitofp_4i64_4f32(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
-; AVX256-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
-; AVX256-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
-; AVX256-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
-; AVX256-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @sitofp_4i64_4f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @sitofp_4i64_4f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
 ; AVX512-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_4i64_4f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX256DQ-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -674,38 +702,44 @@ define void @sitofp_8i64_8f32() #0 {
 ; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @sitofp_8i64_8f32(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX256-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX256-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
-; AVX256-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
-; AVX256-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
-; AVX256-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
-; AVX256-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
-; AVX256-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
-; AVX256-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
-; AVX256-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
-; AVX256-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; AVX256-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; AVX256-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; AVX256-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; AVX256-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @sitofp_8i64_8f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; AVX256NODQ-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @sitofp_8i64_8f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
 ; AVX512-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_8i64_8f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256DQ-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -1181,10 +1215,10 @@ define void @sitofp_16i8_16f32() #0 {
 
 define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
 ; CHECK-LABEL: @sitofp_4xi32_4f64(
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 %a0 to double
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 %a1 to double
-; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 %a2 to double
-; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 %a3 to double
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double
+; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double
+; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double
 ; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0
 ; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1
 ; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2
@@ -1204,10 +1238,10 @@ define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
 
 define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
 ; CHECK-LABEL: @sitofp_4xi32_4f32(
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 %a0 to float
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 %a1 to float
-; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 %a2 to float
-; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 %a3 to float
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float
+; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float
+; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float
 ; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
 ; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1
 ; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2
diff --git a/test/Transforms/SLPVectorizer/X86/uitofp.ll b/test/Transforms/SLPVectorizer/X86/uitofp.ll
index 652184094234bb4638ea959e22af436c1005e6bb..092918115ebac48c5d5b01f4a8c03390e212c524 100644
--- a/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -134,20 +135,26 @@ define void @uitofp_2i32_2f64() #0 {
 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @uitofp_2i32_2f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @uitofp_2i32_2f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_2i32_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
 ; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_2i32_2f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
   %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
@@ -401,20 +408,26 @@ define void @uitofp_2i8_2f64() #0 {
 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @uitofp_2i8_2f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @uitofp_2i8_2f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_2i8_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
 ; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_2i8_2f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
   %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
@@ -571,26 +584,32 @@ define void @uitofp_4i64_4f32() #0 {
 ; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @uitofp_4i64_4f32(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; AVX256-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; AVX256-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; AVX256-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_4i64_4f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
 ; AVX512-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_4i64_4f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX256DQ-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -635,38 +654,44 @@ define void @uitofp_8i64_8f32() #0 {
 ; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @uitofp_8i64_8f32(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX256-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX256-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; AVX256-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; AVX256-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; AVX256-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
-; AVX256-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
-; AVX256-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
-; AVX256-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
-; AVX256-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; AVX256-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; AVX256-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; AVX256-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; AVX256-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; AVX256-NEXT:    ret void
+; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; AVX256NODQ-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_8i64_8f32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
 ; AVX512-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_8i64_8f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256DQ-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
diff --git a/test/Transforms/SROA/alignment.ll b/test/Transforms/SROA/alignment.ll
index 8bc901812461bd8507390ef86505a9744c611636..cbae6be018c7c32bf74e31b65415cbafcdfcf502 100644
--- a/test/Transforms/SROA/alignment.ll
+++ b/test/Transforms/SROA/alignment.ll
@@ -181,3 +181,50 @@ entry:
   ret void
 ; CHECK: ret void
 }
+
+define void @test8() {
+; CHECK-LABEL: @test8(
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+
+  %ptr = alloca [5 x i32], align 1
+  %ptr.8 = bitcast [5 x i32]* %ptr to i8*
+  call void @populate(i8* %ptr.8)
+  %val = load [5 x i32], [5 x i32]* %ptr, align 1
+  ret void
+}
+
+define void @test9() {
+; CHECK-LABEL: @test9(
+; CHECK: load i32, {{.*}}, align 8
+; CHECK: load i32, {{.*}}, align 4
+; CHECK: load i32, {{.*}}, align 8
+; CHECK: load i32, {{.*}}, align 4
+; CHECK: load i32, {{.*}}, align 8
+
+  %ptr = alloca [5 x i32], align 8
+  %ptr.8 = bitcast [5 x i32]* %ptr to i8*
+  call void @populate(i8* %ptr.8)
+  %val = load [5 x i32], [5 x i32]* %ptr, align 8
+  ret void
+}
+
+define void @test10() {
+; CHECK-LABEL: @test10(
+; CHECK: load i32, {{.*}}, align 2
+; CHECK: load i8, {{.*}}, align 2
+; CHECK: load i8, {{.*}}, align 1
+; CHECK: load i8, {{.*}}, align 2
+; CHECK: load i16, {{.*}}, align 2
+
+  %ptr = alloca {i32, i8, i8, {i8, i16}}, align 2
+  %ptr.8 = bitcast {i32, i8, i8, {i8, i16}}* %ptr to i8*
+  call void @populate(i8* %ptr.8)
+  %val = load {i32, i8, i8, {i8, i16}}, {i32, i8, i8, {i8, i16}}* %ptr, align 2
+  ret void
+}
+
+declare void @populate(i8*)
diff --git a/test/Transforms/SampleProfile/inline-cold-callsite-samplepgo.ll b/test/Transforms/SampleProfile/inline-cold-callsite-samplepgo.ll
new file mode 100644
index 0000000000000000000000000000000000000000..de2fa7cdcbfbe2a80d8713e7bd3b94178e13046f
--- /dev/null
+++ b/test/Transforms/SampleProfile/inline-cold-callsite-samplepgo.ll
@@ -0,0 +1,31 @@
+; For SamplePGO, if -profile-sample-accurate is specified, cold callsite
+; heuristics should be honored if the caller has no profile.
+
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline.prof -inline -S -inline-cold-callsite-threshold=0 | FileCheck %s
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline.prof -profile-sample-accurate -inline -S -inline-cold-callsite-threshold=0 | FileCheck %s --check-prefix ACCURATE
+
+declare void @extern()
+define void @callee() {
+  call void @extern()
+  ret void
+}
+
+define void @caller(i32 %y1) {
+; CHECK-LABEL: @caller
+; CHECK-NOT: call void @callee
+; ACCURATE-LABEL: @caller
+; ACCURATE: call void @callee
+  call void @callee()
+  ret void
+}
+
+define void @caller_accurate(i32 %y1) #0 {
+; CHECK-LABEL: @caller_accurate
+; CHECK: call void @callee
+; ACCURATE-LABEL: @caller_accurate
+; ACCURATE: call void @callee
+  call void @callee()
+  ret void
+}
+
+attributes #0 = { "profile-sample-accurate" }
diff --git a/test/Transforms/SampleProfile/section-accurate-samplepgo.ll b/test/Transforms/SampleProfile/section-accurate-samplepgo.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3a985ff7b5ea340aaac72fdb8120ba59e12c9d7e
--- /dev/null
+++ b/test/Transforms/SampleProfile/section-accurate-samplepgo.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline.prof -codegenprepare -S | FileCheck %s
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline.prof -codegenprepare -profile-sample-accurate -S | FileCheck %s --check-prefix ACCURATE
+
+target triple = "x86_64-pc-linux-gnu"
+
+; The test checks that function without profile gets unlikely section prefix
+; if -profile-sample-accurate is specified or the function has the
+; profile-sample-accurate attribute.
+
+declare void @hot_func()
+
+; CHECK-NOT: foo_not_in_profile{{.*}}!section_prefix
+; CHECK: foo_not_in_profile{{.*}}!prof ![[UNKNOWN_ID:[0-9]+]]
+; ACCURATE: foo_not_in_profile{{.*}}!prof ![[ZERO_ID:[0-9]+]] !section_prefix ![[COLD_ID:[0-9]+]]
+; The function not appearing in profile is cold when -profile-sample-accurate
+; is on.
+define void @foo_not_in_profile() {
+  call void @hot_func()
+  ret void
+}
+
+; CHECK: bar_not_in_profile{{.*}}!prof ![[ZERO_ID:[0-9]+]] !section_prefix ![[COLD_ID:[0-9]+]]
+; ACCURATE: bar_not_in_profile{{.*}}!prof ![[ZERO_ID:[0-9]+]] !section_prefix ![[COLD_ID:[0-9]+]]
+; The function not appearing in profile is cold when the func has
+; profile-sample-accurate attribute.
+define void @bar_not_in_profile() #0 {
+  call void @hot_func()
+  ret void
+}
+
+attributes #0 = { "profile-sample-accurate" }
+
+; CHECK: ![[UNKNOWN_ID]] = !{!"function_entry_count", i64 -1}
+; CHECK: ![[ZERO_ID]] = !{!"function_entry_count", i64 0}
+; CHECK: ![[COLD_ID]] = !{!"function_section_prefix", !".unlikely"}
+; ACCURATE: ![[ZERO_ID]] = !{!"function_entry_count", i64 0}
+; ACCURATE: ![[COLD_ID]] = !{!"function_section_prefix", !".unlikely"}
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"SampleProfile"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
diff --git a/test/Transforms/Scalarizer/basic.ll b/test/Transforms/Scalarizer/basic.ll
index 150eb7d43c462e0d1ef49855a54e515f704c82cf..c3ba1fe83fb9cefec9110f72975e80ac55b1b8ca 100644
--- a/test/Transforms/Scalarizer/basic.ll
+++ b/test/Transforms/Scalarizer/basic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -scalarizer -scalarize-load-store -dce -S | FileCheck %s
+; RUN: opt %s -passes='function(scalarizer,dce)' -scalarize-load-store -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 declare <4 x float> @ext(<4 x float>)
diff --git a/test/Transforms/Scalarizer/cache-bug.ll b/test/Transforms/Scalarizer/cache-bug.ll
index f8c2d100d59a9bfabaa6cd0b3fe72028e9221d04..cfb4140e41c5b2e4fec0aba44ef7ccd308c95523 100644
--- a/test/Transforms/Scalarizer/cache-bug.ll
+++ b/test/Transforms/Scalarizer/cache-bug.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -scalarizer -S < %s | FileCheck %s
+; RUN: opt -passes='function(scalarizer)' -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 
diff --git a/test/Transforms/Scalarizer/crash-bug.ll b/test/Transforms/Scalarizer/crash-bug.ll
index 95055d00ba1c3df360d04a4b1861daa508b572d4..d0d019564977c3d4a8b2d2e8fd8b76398de741d1 100644
--- a/test/Transforms/Scalarizer/crash-bug.ll
+++ b/test/Transforms/Scalarizer/crash-bug.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -scalarizer -S -o - | FileCheck %s
+; RUN: opt %s -passes='function(scalarizer)' -S -o - | FileCheck %s
 
 ; Don't crash
 
diff --git a/test/Transforms/Scalarizer/dbginfo.ll b/test/Transforms/Scalarizer/dbginfo.ll
index b92714c9b2222b8928b71286f6856abc3b92da22..37452ec70a9bc33f2d5ca46b7eee09f3016ebce0 100644
--- a/test/Transforms/Scalarizer/dbginfo.ll
+++ b/test/Transforms/Scalarizer/dbginfo.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -scalarizer -scalarize-load-store -S | FileCheck %s
+; RUN: opt %s -passes='function(scalarizer)' -scalarize-load-store -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
diff --git a/test/Transforms/Scalarizer/dbgloc-bug.ll b/test/Transforms/Scalarizer/dbgloc-bug.ll
index c0e1b090e5c92b34a4d8b958c3e3e2406f77bf2c..7c627ee78a3446845bcd06728bbde42b4cd6c0a7 100644
--- a/test/Transforms/Scalarizer/dbgloc-bug.ll
+++ b/test/Transforms/Scalarizer/dbgloc-bug.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -march=x86 -scalarizer %s | FileCheck %s
+; RUN: opt -S -march=x86 -passes='function(scalarizer)' %s | FileCheck %s
 
 ; Reproducer for pr27938
 ; https://llvm.org/bugs/show_bug.cgi?id=27938
diff --git a/test/Transforms/Scalarizer/intrinsics.ll b/test/Transforms/Scalarizer/intrinsics.ll
index 7cebdffab7c40d673cb191f7962f19a2de4058d8..7cd324122d9094d14882536f5116c704499acf00 100644
--- a/test/Transforms/Scalarizer/intrinsics.ll
+++ b/test/Transforms/Scalarizer/intrinsics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -scalarizer %s | FileCheck %s
+; RUN: opt -S -passes='function(scalarizer)' %s | FileCheck %s
 
 ; Unary fp
 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
diff --git a/test/Transforms/Scalarizer/order-bug.ll b/test/Transforms/Scalarizer/order-bug.ll
index 891c1eb4fa00a06bc2ae0c8801e65aa2c92d0d38..1265bb07bf17dfc09740e27eb3cef09222ce40da 100644
--- a/test/Transforms/Scalarizer/order-bug.ll
+++ b/test/Transforms/Scalarizer/order-bug.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -scalarizer -S -o - | FileCheck %s
+; RUN: opt %s -passes='function(scalarizer)' -S -o - | FileCheck %s
 
 ; This input caused the scalarizer to replace & erase gathered results when 
 ; future gathered results depended on them being alive
diff --git a/test/Transforms/Scalarizer/phi-bug.ll b/test/Transforms/Scalarizer/phi-bug.ll
index 03bffb96451dc3e7ba4732584a045b4e8f677871..3fd08130c53634ffb96b127f9345c3fb03c086ee 100644
--- a/test/Transforms/Scalarizer/phi-bug.ll
+++ b/test/Transforms/Scalarizer/phi-bug.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -scalarizer -verify -S -o - | FileCheck %s
+; RUN: opt %s -passes='function(scalarizer,verify)' -S -o - | FileCheck %s
 
 define void @f3() local_unnamed_addr {
 bb1:
diff --git a/test/Transforms/Scalarizer/store-bug.ll b/test/Transforms/Scalarizer/store-bug.ll
index 84c2b3f840a0c4be89c840c4a4ea3e24b54c6f03..8f4d30db1a874d7b9c5ea7e9fdd6fc5428101f6f 100644
--- a/test/Transforms/Scalarizer/store-bug.ll
+++ b/test/Transforms/Scalarizer/store-bug.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -scalarizer -scalarize-load-store -S < %s | FileCheck %s
+; RUN: opt -passes='function(scalarizer)' -scalarize-load-store -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; This input caused the scalarizer not to clear cached results
diff --git a/test/Transforms/Scalarizer/vector-gep.ll b/test/Transforms/Scalarizer/vector-gep.ll
index eacddf136a324dfd780e2fa1bd7a03a48aefe000..81566067401c9d185b719ab69cda4095b1d0b286 100644
--- a/test/Transforms/Scalarizer/vector-gep.ll
+++ b/test/Transforms/Scalarizer/vector-gep.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -scalarizer %s | FileCheck %s
+; RUN: opt -S -passes='function(scalarizer)' %s | FileCheck %s
 
 ; Check that the scalarizer can handle vector GEPs with scalar indices
 
diff --git a/test/Transforms/SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll b/test/Transforms/SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll
index 3b22687ca91ee07e9a1a90c2b5de8d3919911950..0a769ec5da6b47f8a467dffb8b7e6dbb9f75f59b 100644
--- a/test/Transforms/SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 
 	%struct.BLEND_MAP = type { i16, i16, i16, i32, %struct.BLEND_MAP_ENTRY* }
 	%struct.BLEND_MAP_ENTRY = type { float, i8, { [5 x float], [4 x i8] } }
diff --git a/test/Transforms/SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll b/test/Transforms/SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll
index 61825b6c1d85fc07c23cf8369e0e598db3ff523e..85066168e1e23d59e4312faa2ca9b937ce9b3c49 100644
--- a/test/Transforms/SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 
 define void @init_caller_save() {
 entry:
diff --git a/test/Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll b/test/Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll
index 4bbcc80ff3bbc7083ac7d7b1a31908ca88d04b90..02c7a96deb5dc024863bc6f8d13fad3192a5bde4 100644
--- a/test/Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll
@@ -1,5 +1,6 @@
 ; PR1333
 ; RUN: opt < %s -simple-loop-unswitch -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "i686-pc-linux-gnu"
diff --git a/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll b/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll
index 835715264e1bef10ca19db1f8fd4dfe154bba1ec..a0408c8ea6a2e58196a9a5a4a4ab0213767c4d3a 100644
--- a/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 ; PR1333
 
 define void @pp_cxx_expression() {
diff --git a/test/Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll b/test/Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll
index 2a73a429c6ac505b2332b5b721edda39febb3765..571e3eb6696df9f9c200f30c656fcb9dd94b233c 100644
--- a/test/Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -instcombine -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -instcombine -disable-output
 
 @str3 = external constant [3 x i8]		; <[3 x i8]*> [#uses=1]
 
diff --git a/test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll b/test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll
index 7299b2b1908b09c824237e4369e5a6bb8b3e0ff1..626ac848cfb6b386b64acd8557b1e30d7d834808 100644
--- a/test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 
 define i32 @main(i32 %argc, i8** %argv) {
 entry:
diff --git a/test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll b/test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll
index e1ef5064a8fec6331f4ee7d55cfa336d34bad3a1..52d96893060e13def9444eb666a9910867530fcc 100644
--- a/test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 ; PR1559
 
 target triple = "i686-pc-linux-gnu"
diff --git a/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll b/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll
index cb65d2fb64469fe05d6455ff72cfdace25df68de..7c65459a65ccc57c46a21354adf738abbe238bcd 100644
--- a/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -instcombine -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -instcombine -disable-output
 	%struct.ClassDef = type { %struct.QByteArray, %struct.QByteArray, %"struct.QList<ArgumentDef>", %"struct.QList<ArgumentDef>", i8, i8, %"struct.QList<ArgumentDef>", %"struct.QList<ArgumentDef>", %"struct.QList<ArgumentDef>", %"struct.QList<ArgumentDef>", %"struct.QList<ArgumentDef>", %"struct.QList<ArgumentDef>", %"struct.QMap<QByteArray,QByteArray>", %"struct.QList<ArgumentDef>", %"struct.QMap<QByteArray,QByteArray>", i32, i32 }
 	%struct.FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct.FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
 	%struct.Generator = type { %struct.FILE*, %struct.ClassDef*, %"struct.QList<ArgumentDef>", %struct.QByteArray, %"struct.QList<ArgumentDef>" }
diff --git a/test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll b/test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll
index 5b3ca9c187d115509bfca6288b78531631c40a65..5db1ced473f6a576a47d5b20d2aaba0b61976b86 100644
--- a/test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -instcombine -gvn -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -instcombine -gvn -disable-output
 ; PR2372
 target triple = "i386-pc-linux-gnu"
 
diff --git a/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll b/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll
index 9f71417df8ed7cf0336435f15e7b669e4ec282f5..f3a382d813b8b95040cb0fce39967a1ea78fd33e 100644
--- a/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa
 ; PR8622
 @g_38 = external global i32, align 4
 
diff --git a/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll b/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll
index 8c13e1854c99debf35346d8d388f47f033fb1b5c..b861d3029d566d3d7f7f49c05bc8cbb5ce77cb2c 100644
--- a/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -simple-loop-unswitch -disable-output < %s
+; RUN: opt -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output < %s
 ; PR10031
 
 define i32 @test(i32 %command) {
diff --git a/test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll b/test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll
index 6b7d9faf70f4194a87364d56f870183590cd5e19..16886bfec034db10ea5419265163a329a37222ad 100644
--- a/test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -sroa -simple-loop-unswitch -disable-output
+; RUN: opt < %s -sroa -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 ; PR11016
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.7.2"
diff --git a/test/Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll b/test/Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll
index 133f41744a964db8c18a914ba9302987c2b78c54..72af984081877d2f38f2bc80990f809e8b863653 100644
--- a/test/Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -simple-loop-unswitch -verify-loop-info -verify-dom-info | FileCheck %s
+; RUN: opt < %s -S -simple-loop-unswitch -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
 ; PR12343: -simple-loop-unswitch crash on indirect branch
 
 ; CHECK:       %0 = icmp eq i64 undef, 0
diff --git a/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll b/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll
index e5549fe92f9a1c6f96ff8d775f01f2d815cf7e86..e9f35709ba730e3c9a4850c6df94694caa9716e5 100644
--- a/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 ; PR12887
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll b/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll
index ddb888af8ba9e79c9af79573cb71059c2b1b7ed0..c8de642a5d09e94f3083ff40facab6a6a62b0c82 100644
--- a/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll
+++ b/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -S | FileCheck %s
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 ; In cases where two address spaces do not have the same size pointer, the
 ; input for the addrspacecast should not be used as a substitute for itself
diff --git a/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll b/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll
index 03c23da6c9622863610c2a864eef91192925c22d..59c14e937b637091ee95982aadc4bafb6f89bf0b 100644
--- a/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll
+++ b/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -S 2>&1 | FileCheck %s
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S 2>&1 | FileCheck %s
 
 ; This is to test trivial loop unswitch only happens when trivial condition
 ; itself is an LIV loop condition (not partial LIV which could occur in and/or).
diff --git a/test/Transforms/SimpleLoopUnswitch/basictest.ll b/test/Transforms/SimpleLoopUnswitch/basictest.ll
index afca20fe251821bfc60ea30eeec0a116526dbd73..240f433a8dbf053dfbfc4c3281ba7abab8cdad58 100644
--- a/test/Transforms/SimpleLoopUnswitch/basictest.ll
+++ b/test/Transforms/SimpleLoopUnswitch/basictest.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -passes='loop(unswitch),verify<loops>' -S < %s | FileCheck %s
+; RUN: opt -enable-mssa-loop-dependency=true -verify-memoryssa -passes='loop(unswitch),verify<loops>' -S < %s | FileCheck %s
 
 define i32 @test(i32* %A, i1 %C) {
 entry:
diff --git a/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll b/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll
index 8ab23cb1f61c1695d3551803af23614b18ea762f..1cade22b65905c4259e24af300a666ddd988c0bb 100644
--- a/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll
+++ b/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -simple-loop-unswitch < %s | FileCheck %s
+; RUN: opt -S -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 target triple = "x86_64-pc-win32"
 
 define void @f(i32 %doit, i1 %x, i1 %y) personality i32 (...)* @__CxxFrameHandler3 {
diff --git a/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll b/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll
index 7085ed8e10fe3aaf36911cb492055bd8b9635f0c..09d7d792c7c6c6b8f7337008bf77c751d41e9fa7 100644
--- a/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll
+++ b/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -S | FileCheck %s
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 ; This test checks if unswitched condition preserve make.implicit metadata.
 define i32 @test(i1 %cond) {
diff --git a/test/Transforms/SimpleLoopUnswitch/crash.ll b/test/Transforms/SimpleLoopUnswitch/crash.ll
index a6c64113c08ffa8a1a3b3e8554d4fd337707384a..cf6a19d254019b215730bc811512af9c32f93085 100644
--- a/test/Transforms/SimpleLoopUnswitch/crash.ll
+++ b/test/Transforms/SimpleLoopUnswitch/crash.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -simple-loop-unswitch -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 
 define void @test1(i32* %S2) {
 entry:
diff --git a/test/Transforms/SimpleLoopUnswitch/exponential-behavior.ll b/test/Transforms/SimpleLoopUnswitch/exponential-behavior.ll
index 52e9aa1acc2b020460d2c732f2649db681cbc0ee..1c46ddbf51a8669c6675c227e5cb9f4cfadad355 100644
--- a/test/Transforms/SimpleLoopUnswitch/exponential-behavior.ll
+++ b/test/Transforms/SimpleLoopUnswitch/exponential-behavior.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -simple-loop-unswitch -S < %s | FileCheck %s
+; RUN: opt -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s
 
 define void @f(i32 %n, i32* %ptr) {
 ; CHECK-LABEL: @f(
diff --git a/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll b/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll
new file mode 100644
index 0000000000000000000000000000000000000000..711c476a5e5e137ad073b70ec5750abe2a419ce1
--- /dev/null
+++ b/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll
@@ -0,0 +1,139 @@
+;
+; There should be just a single copy of each loop when strictest mutiplier
+; candidates formula (unscaled candidates == 0) is enforced:
+
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+;
+; When we relax the candidates part of a multiplier formula
+; (unscaled candidates == 4) we start getting  some unswitches,
+; which leads to siblings multiplier kicking in.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=4 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-UNSCALE4-DIV1
+;
+; NB: sort -b is essential here and below, otherwise blanks might lead to different
+; order depending on locale.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=4 -unswitch-siblings-toplevel-div=2 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-UNSCALE4-DIV2
+;
+;
+; Get
+;    2^(num conds) == 2^5 = 32
+; loop nests when cost multiplier is disabled:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:	   sort -b -k 1 | FileCheck %s --check-prefixes=LOOP32
+;
+; Single loop nest, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1:     Loop at depth 2 containing:
+; LOOP1:     Loop at depth 3 containing:
+; LOOP1-NOT: Loop at depth {{[0-9]+}} containing:
+;
+; Half unswitched loop nests, with unscaled4 and div1 it gets less depth1 loops unswitched
+; since they have more cost.
+; LOOP-UNSCALE4-DIV1-COUNT-6: Loop at depth 1 containing:
+; LOOP-UNSCALE4-DIV1-COUNT-19: Loop at depth 2 containing:
+; LOOP-UNSCALE4-DIV1-COUNT-29: Loop at depth 3 containing:
+; LOOP-UNSCALE4-DIV1-NOT:      Loop at depth {{[0-9]+}} containing:
+;
+; Half unswitched loop nests, with unscaled4 and div2 it gets more depth1 loops unswitched
+; as div2 kicks in.
+; LOOP-UNSCALE4-DIV2-COUNT-11: Loop at depth 1 containing:
+; LOOP-UNSCALE4-DIV2-COUNT-22: Loop at depth 2 containing:
+; LOOP-UNSCALE4-DIV2-COUNT-29: Loop at depth 3 containing:
+; LOOP-UNSCALE4-DIV2-NOT:      Loop at depth {{[0-9]+}} containing:
+;
+; 32 loop nests, fully unswitched
+; LOOP32-COUNT-32: Loop at depth 1 containing:
+; LOOP32-COUNT-32: Loop at depth 2 containing:
+; LOOP32-COUNT-32: Loop at depth 3 containing:
+; LOOP32-NOT:      Loop at depth {{[0-9]+}} containing:
+
+declare void @bar()
+
+define void @loop_nested3_conds5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
+entry:
+  %addr1 = getelementptr i32, i32* %addr, i64 0
+  %addr2 = getelementptr i32, i32* %addr, i64 1
+  %addr3 = getelementptr i32, i32* %addr, i64 2
+  br label %outer
+outer:
+  %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
+  %iv1.next = add i32 %iv1, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br label %middle
+middle:
+  %iv2 = phi i32 [0, %outer], [%iv2.next, %middle_latch]
+  %iv2.next = add i32 %iv2, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br label %loop
+loop:
+  %iv3 = phi i32 [0, %middle], [%iv3.next, %loop_latch]
+  %iv3.next = add i32 %iv3, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br i1 %c1, label %loop_next1_left, label %loop_next1_right
+loop_next1_left:
+  br label %loop_next1
+loop_next1_right:
+  br label %loop_next1
+
+loop_next1:
+  br i1 %c2, label %loop_next2_left, label %loop_next2_right
+loop_next2_left:
+  br label %loop_next2
+loop_next2_right:
+  br label %loop_next2
+
+loop_next2:
+  br i1 %c3, label %loop_next3_left, label %loop_next3_right
+loop_next3_left:
+  br label %loop_next3
+loop_next3_right:
+  br label %loop_next3
+
+loop_next3:
+  br i1 %c4, label %loop_next4_left, label %loop_next4_right
+loop_next4_left:
+  br label %loop_next4
+loop_next4_right:
+  br label %loop_next4
+
+loop_next4:
+  br i1 %c5, label %loop_latch_left, label %loop_latch_right
+loop_latch_left:
+  br label %loop_latch
+loop_latch_right:
+  br label %loop_latch
+
+loop_latch:
+  store volatile i32 0, i32* %addr1
+  %test_loop = icmp slt i32 %iv3, 50
+  br i1 %test_loop, label %loop, label %middle_latch
+middle_latch:
+  store volatile i32 0, i32* %addr2
+  %test_middle = icmp slt i32 %iv2, 50
+  br i1 %test_middle, label %middle, label %outer_latch
+outer_latch:
+  store volatile i32 0, i32* %addr3
+  %test_outer = icmp slt i32 %iv1, 50
+  br i1 %test_outer, label %outer, label %exit
+exit:
+  ret void
+}
diff --git a/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll b/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..447d42beeb4d8189433f7a97eadfc86e8766db78
--- /dev/null
+++ b/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll
@@ -0,0 +1,149 @@
+;
+; Here all the branches we unswitch are exiting from the inner loop.
+; That means we should not be getting exponential behavior on inner-loop
+; unswitch. In fact there should be just a single version of inner-loop,
+; with possibly some outer loop copies.
+;
+; There should be just a single copy of each loop when strictest mutiplier
+; candidates formula (unscaled candidates == 0) is enforced:
+
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+;
+; When we relax the candidates part of a multiplier formula
+; (unscaled candidates == 2) we start getting some unswitches in outer loops,
+; which leads to siblings multiplier kicking in.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=3 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-UNSCALE3-DIV1
+;
+; NB: sort -b is essential here and below, otherwise blanks might lead to different
+; order depending on locale.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=3 -unswitch-siblings-toplevel-div=2 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-UNSCALE3-DIV2
+;
+; With disabled cost-multiplier we get maximal possible amount of unswitches.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:	   sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-MAX
+;
+; Single loop nest, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1-NOT:  Loop at depth 1 containing:
+; LOOP1:     Loop at depth 2 containing:
+; LOOP1-NOT:  Loop at depth 2 containing:
+; LOOP1:     Loop at depth 3 containing:
+; LOOP1-NOT:  Loop at depth 3 containing:
+;
+; Half unswitched loop nests, with unscaled3 and div1 it gets less depth1 loops unswitched
+; since they have more cost.
+; LOOP-UNSCALE3-DIV1-COUNT-4: Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV1-NOT:      Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV1-COUNT-1: Loop at depth 2 containing:
+; LOOP-UNSCALE3-DIV1-NOT:      Loop at depth 2 containing:
+; LOOP-UNSCALE3-DIV1-COUNT-1: Loop at depth 3 containing:
+; LOOP-UNSCALE3-DIV1-NOT:      Loop at depth 3 containing:
+;
+; Half unswitched loop nests, with unscaled3 and div2 it gets more depth1 loops unswitched
+; as div2 kicks in.
+; LOOP-UNSCALE3-DIV2-COUNT-6: Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV2-NOT:      Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV2-COUNT-1: Loop at depth 2 containing:
+; LOOP-UNSCALE3-DIV2-NOT:      Loop at depth 2 containing:
+; LOOP-UNSCALE3-DIV2-COUNT-1: Loop at depth 3 containing:
+; LOOP-UNSCALE3-DIV2-NOT:      Loop at depth 3 containing:
+;
+; Maximally unswitched (copy of the outer loop per each condition)
+; LOOP-MAX-COUNT-6: Loop at depth 1 containing:
+; LOOP-MAX-NOT:      Loop at depth 1 containing:
+; LOOP-MAX-COUNT-1: Loop at depth 2 containing:
+; LOOP-MAX-NOT:      Loop at depth 2 containing:
+; LOOP-MAX-COUNT-1: Loop at depth 3 containing:
+; LOOP-MAX-NOT:      Loop at depth 3 containing:
+
+declare void @bar()
+
+define void @loop_nested3_conds5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
+entry:
+  %addr1 = getelementptr i32, i32* %addr, i64 0
+  %addr2 = getelementptr i32, i32* %addr, i64 1
+  %addr3 = getelementptr i32, i32* %addr, i64 2
+  br label %outer
+outer:
+  %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
+  %iv1.next = add i32 %iv1, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br label %middle
+middle:
+  %iv2 = phi i32 [0, %outer], [%iv2.next, %middle_latch]
+  %iv2.next = add i32 %iv2, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br label %loop
+loop:
+  %iv3 = phi i32 [0, %middle], [%iv3.next, %loop_latch]
+  %iv3.next = add i32 %iv3, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br i1 %c1, label %loop_next1_left, label %outer_latch
+loop_next1_left:
+  br label %loop_next1
+loop_next1_right:
+  br label %loop_next1
+
+loop_next1:
+  br i1 %c2, label %loop_next2_left, label %outer_latch
+loop_next2_left:
+  br label %loop_next2
+loop_next2_right:
+  br label %loop_next2
+
+loop_next2:
+  br i1 %c3, label %loop_next3_left, label %outer_latch
+loop_next3_left:
+  br label %loop_next3
+loop_next3_right:
+  br label %loop_next3
+
+loop_next3:
+  br i1 %c4, label %loop_next4_left, label %outer_latch
+loop_next4_left:
+  br label %loop_next4
+loop_next4_right:
+  br label %loop_next4
+
+loop_next4:
+  br i1 %c5, label %loop_latch_left, label %outer_latch
+loop_latch_left:
+  br label %loop_latch
+loop_latch_right:
+  br label %loop_latch
+
+loop_latch:
+  store volatile i32 0, i32* %addr1
+  %test_loop = icmp slt i32 %iv3, 50
+  br i1 %test_loop, label %loop, label %middle_latch
+middle_latch:
+  store volatile i32 0, i32* %addr2
+  %test_middle = icmp slt i32 %iv2, 50
+  br i1 %test_middle, label %middle, label %outer_latch
+outer_latch:
+  store volatile i32 0, i32* %addr3
+  %test_outer = icmp slt i32 %iv1, 50
+  br i1 %test_outer, label %outer, label %exit
+exit:
+  ret void
+}
diff --git a/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d013c4f636290bbd27aa54d5e0e6b755c8ce9fff
--- /dev/null
+++ b/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll
@@ -0,0 +1,80 @@
+;
+; There should be just a single copy of loop when strictest mutiplier candidates
+; formula (unscaled candidates == 0) is enforced:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; With relaxed candidates multiplier (unscaled candidates == 8) we should allow
+; some unswitches to happen until siblings multiplier starts kicking in:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP5
+;
+; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed
+; siblings multiplier for top-level loops (toplevel-div == 8) we should get
+;    2^(num conds) == 2^5 == 32
+; copies of the loop:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
+;
+; Similarly get
+;    2^(num conds) == 2^5 == 32
+; copies of the loop when cost multiplier is disabled:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
+;
+;
+; Single loop, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1-NOT: Loop at depth 1 containing:
+
+; 5 loops, unswitched 4 times
+; LOOP5-COUNT-5: Loop at depth 1 containing:
+; LOOP5-NOT:     Loop at depth 1 containing:
+
+; 32 loops, fully unswitched
+; LOOP32-COUNT-32: Loop at depth 1 containing:
+; LOOP32-NOT:     Loop at depth 1 containing:
+
+define void @loop_simple5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
+entry:
+  br label %loop
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop_latch]
+  %iv.next = add i32 %iv, 1
+  br i1 %c1, label %loop_next1, label %loop_next1_right
+loop_next1_right:
+  br label %loop_next1
+loop_next1:
+  br i1 %c2, label %loop_next2, label %loop_next2_right
+loop_next2_right:
+  br label %loop_next2
+loop_next2:
+  br i1 %c3, label %loop_next3, label %loop_next3_right
+loop_next3_right:
+  br label %loop_next3
+loop_next3:
+  br i1 %c4, label %loop_next4, label %loop_next4_right
+loop_next4_right:
+  br label %loop_next4
+loop_next4:
+  br i1 %c5, label %loop_latch, label %loop_latch_right
+loop_latch_right:
+  br label %loop_latch
+loop_latch:
+  store volatile i32 0, i32* %addr
+  %test_loop = icmp slt i32 %iv, 50
+  br i1 %test_loop, label %loop, label %exit
+exit:
+  ret void
+}
diff --git a/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch2.ll b/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b98754069338fae6a55acc18273b06ca7497d23c
--- /dev/null
+++ b/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch2.ll
@@ -0,0 +1,56 @@
+;
+; Here all the branches are exiting ones. Checking that we dont have
+; exponential behavior with any kind of controlling heuristics here.
+;
+; There we should have just a single loop.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+;
+; Single loop, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1-NOT: Loop at depth 1 containing:
+
+declare void @bar()
+
+define void @loop_simple5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
+entry:
+  br label %loop
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop_latch]
+  %iv.next = add i32 %iv, 1
+  ;; disabling trivial unswitch
+  call void @bar()
+  br i1 %c1, label %loop_next1, label %exit
+loop_next1:
+  br i1 %c2, label %loop_next2, label %exit
+loop_next2:
+  br i1 %c3, label %loop_next3, label %exit
+loop_next3:
+  br i1 %c4, label %loop_next4, label %exit
+loop_next4:
+  br i1 %c5, label %loop_latch, label %exit
+loop_latch:
+  store volatile i32 0, i32* %addr
+  %test_loop = icmp slt i32 %iv, 50
+  br i1 %test_loop, label %loop, label %exit
+exit:
+  ret void
+}
diff --git a/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..407b632764e3eaa90688f3590753856ca14803db
--- /dev/null
+++ b/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll
@@ -0,0 +1,118 @@
+;
+; Here we have 5-way unswitchable switch with each successor also having an unswitchable
+; exiting branch in it. If we start unswitching those branches we start duplicating the
+; whole switch. This can easily lead to exponential behavior w/o proper control.
+; On a real-life testcase there was 16-way switch and that took forever to compile w/o
+; a cost control.
+;
+;
+; When we use the stricted multiplier candidates formula (unscaled candidates == 0)
+; we should be getting just a single loop.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+;
+; With relaxed candidates multiplier (unscaled candidates == 8) we should allow
+; some unswitches to happen until siblings multiplier starts kicking in:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-RELAX
+;
+; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed
+; siblings multiplier for top-level loops (toplevel-div == 8) we should get
+; considerably more copies of the loop (especially top-level ones).
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-RELAX2
+;
+; We get hundreds of copies of the loop when cost multiplier is disabled:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-MAX
+;
+
+; Single loop nest, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1-NOT: Loop at depth 1 containing:
+; LOOP1:     Loop at depth 2 containing:
+; LOOP1-NOT: Loop at depth 2 containing:
+;
+; Somewhat relaxed restrictions on candidates:
+; LOOP-RELAX-COUNT-5:     Loop at depth 1 containing:
+; LOOP-RELAX-NOT: Loop at depth 1 containing:
+; LOOP-RELAX-COUNT-32:     Loop at depth 2 containing:
+; LOOP-RELAX-NOT: Loop at depth 2 containing:
+;
+; Even more relaxed restrictions on candidates and siblings.
+; LOOP-RELAX2-COUNT-11:     Loop at depth 1 containing:
+; LOOP-RELAX2-NOT: Loop at depth 1 containing:
+; LOOP-RELAX2-COUNT-40:     Loop at depth 2 containing:
+; LOOP-RELAX-NOT: Loop at depth 2 containing:
+;
+; Unswitched as much as it could (with multiplier disabled).
+; LOOP-MAX-COUNT-56:     Loop at depth 1 containing:
+; LOOP-MAX-NOT: Loop at depth 1 containing:
+; LOOP-MAX-COUNT-111:     Loop at depth 2 containing:
+; LOOP-MAX-NOT: Loop at depth 2 containing:
+
+define i32 @loop_switch(i32* %addr, i32 %c1, i32 %c2) {
+entry:
+  %addr1 = getelementptr i32, i32* %addr, i64 0
+  %addr2 = getelementptr i32, i32* %addr, i64 1
+  %check0 = icmp eq i32 %c2, 0
+  %check1 = icmp eq i32 %c2, 31
+  %check2 = icmp eq i32 %c2, 32
+  %check3 = icmp eq i32 %c2, 33
+  %check4 = icmp eq i32 %c2, 34
+  br label %outer_loop
+
+outer_loop:
+  %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
+  %iv1.next = add i32 %iv1, 1
+  br label %inner_loop
+inner_loop:
+  %iv2 = phi i32 [0, %outer_loop], [%iv2.next, %inner_latch]
+  %iv2.next = add i32 %iv2, 1
+  switch i32 %c1, label %inner_latch [
+    i32 0, label %case0
+    i32 1, label %case1
+    i32 2, label %case2
+    i32 3, label %case3
+    i32 4, label %case4
+  ]
+
+case4:
+  br i1 %check4, label %exit, label %inner_latch
+case3:
+  br i1 %check3, label %exit, label %inner_latch
+case2:
+  br i1 %check2, label %exit, label %inner_latch
+case1:
+  br i1 %check1, label %exit, label %inner_latch
+case0:
+  br i1 %check0, label %exit, label %inner_latch
+
+inner_latch:
+  store volatile i32 0, i32* %addr1
+  %test_inner = icmp slt i32 %iv2, 50
+  br i1 %test_inner, label %inner_loop, label %outer_latch
+
+outer_latch:
+  store volatile i32 0, i32* %addr2
+  %test_outer = icmp slt i32 %iv1, 50
+  br i1 %test_outer, label %outer_loop, label %exit
+
+exit:                                            ; preds = %bci_0
+  ret i32 1
+}
diff --git a/test/Transforms/SimpleLoopUnswitch/guards.ll b/test/Transforms/SimpleLoopUnswitch/guards.ll
index 95661c425e184f203659b8c171cf3046c7cca27f..f1b92524332367533358635921ab082846074f0d 100644
--- a/test/Transforms/SimpleLoopUnswitch/guards.ll
+++ b/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -passes='loop(unswitch),verify<loops>' -enable-nontrivial-unswitch -simple-loop-unswitch-guards -S < %s | FileCheck %s
 ; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -simple-loop-unswitch-guards -S < %s | FileCheck %s
+; RUN: opt -passes='loop(unswitch),verify<loops>' -enable-nontrivial-unswitch -simple-loop-unswitch-guards -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s
 
 declare void @llvm.experimental.guard(i1, ...)
 
diff --git a/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll b/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll
index 9164fd2478384b9def2c6db307991abc628b2aa9..91e1f486b8286bd812290d7842f4c9a402a70400 100644
--- a/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll
+++ b/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll
@@ -1,6 +1,7 @@
 ; REQUIRES: asserts
 ; RUN: opt -simple-loop-unswitch -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s
 ; RUN: opt -simple-loop-unswitch -S < %s | FileCheck %s
+; RUN: opt -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s
 ; PR5373
 
 ; Loop unswitching shouldn't trivially unswitch the true case of condition %a
diff --git a/test/Transforms/SimpleLoopUnswitch/msan.ll b/test/Transforms/SimpleLoopUnswitch/msan.ll
index ec1110ac9d66c5c3fd8fd0ab712526277a0c92b5..8a296bcd279da76575453591033c553e4b16e6bd 100644
--- a/test/Transforms/SimpleLoopUnswitch/msan.ll
+++ b/test/Transforms/SimpleLoopUnswitch/msan.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -passes='loop(unswitch),verify<loops>' -S < %s | FileCheck %s
+; RUN: opt -enable-mssa-loop-dependency=true -verify-memoryssa -passes='loop(unswitch),verify<loops>' -S < %s | FileCheck %s
 
 declare void @unknown()
 declare void @unknown2()
diff --git a/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-cost.ll b/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-cost.ll
index 2d98f148d46d8a24282c757813ec6e52d02727dd..333378a098489607568a9113c5e03cdf26629ff1 100644
--- a/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-cost.ll
+++ b/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-cost.ll
@@ -2,6 +2,7 @@
 ;
 ; RUN: opt -passes='loop(unswitch),verify<loops>' -enable-nontrivial-unswitch -unswitch-threshold=5 -S < %s | FileCheck %s
 ; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -unswitch-threshold=5 -S < %s | FileCheck %s
+; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -unswitch-threshold=5 -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s
 
 declare void @a()
 declare void @b()
diff --git a/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
index 367d6fe28e978eae4aa21aed0a577d04b6f88c0b..f07c812819fc5d36978d747a7b03718416bd8378 100644
--- a/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
+++ b/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -passes='loop(unswitch),verify<loops>' -enable-nontrivial-unswitch -S < %s | FileCheck %s
 ; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -S < %s | FileCheck %s
+; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s
 
 declare i32 @a()
 declare i32 @b()
diff --git a/test/Transforms/SimpleLoopUnswitch/pr37888.ll b/test/Transforms/SimpleLoopUnswitch/pr37888.ll
index 307706fc34ab613f5c06f511fb3878613a12f25e..e8e34a2e882401233834e88620a43738b402227e 100644
--- a/test/Transforms/SimpleLoopUnswitch/pr37888.ll
+++ b/test/Transforms/SimpleLoopUnswitch/pr37888.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -simple-loop-unswitch -loop-deletion -S < %s | FileCheck %s
+; RUN: opt -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -loop-deletion -S < %s | FileCheck %s
 ;
 ; Check that when we do unswitching where we re-enqueue the loop to be processed
 ; again, but manage to delete the loop before ever getting to iterate on it, it
diff --git a/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll b/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll
index 76b41e73ede5af3bcb6311e71a8c508a59dacbd8..114825348dadd172ad8e025ed79927d646f12b09 100644
--- a/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll
+++ b/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -simple-loop-unswitch -verify-loop-info -verify-dom-info -disable-output < %s
+; RUN: opt -simple-loop-unswitch -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output < %s
 
 ; Loop unswitch should be able to unswitch these loops and
 ; preserve LCSSA and LoopSimplify forms.
diff --git a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch-iteration.ll b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch-iteration.ll
index 6c409e40b281b0bbae39745ca142fb3650e6c3a0..18b39ca808245ba94ce8388b885059a70f3ca49f 100644
--- a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch-iteration.ll
+++ b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch-iteration.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -passes='loop(loop-instsimplify,simplify-cfg,unswitch),verify<loops>' -S < %s | FileCheck %s
+; RUN: opt -enable-mssa-loop-dependency=true -verify-memoryssa -passes='loop(loop-instsimplify,simplify-cfg,unswitch),verify<loops>' -S < %s | FileCheck %s
 
 declare void @some_func() noreturn
 
diff --git a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
index 6a025a4136be9cf386dbd97186847577651bb3e5..56a9bac898093c51361aa27ffba9416a84b867f8 100644
--- a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
+++ b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -passes='loop(unswitch),verify<loops>' -S < %s | FileCheck %s
+; RUN: opt -enable-mssa-loop-dependency=true -verify-memoryssa -passes='loop(unswitch),verify<loops>' -S < %s | FileCheck %s
 
 declare void @some_func() noreturn
 declare void @sink(i32)
diff --git a/test/Transforms/SimpleLoopUnswitch/update-scev.ll b/test/Transforms/SimpleLoopUnswitch/update-scev.ll
index 96bdc6a91b85c6992eefee60e80c19093d6b7ab0..1f4c1421b4ecda68c9f24f6750ee45f062db3ab5 100644
--- a/test/Transforms/SimpleLoopUnswitch/update-scev.ll
+++ b/test/Transforms/SimpleLoopUnswitch/update-scev.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -passes='print<scalar-evolution>,loop(unswitch,loop-instsimplify),print<scalar-evolution>' -enable-nontrivial-unswitch -S < %s 2>%t.scev | FileCheck %s
+; RUN: opt -enable-mssa-loop-dependency=true -verify-memoryssa -passes='print<scalar-evolution>,loop(unswitch,loop-instsimplify),print<scalar-evolution>' -enable-nontrivial-unswitch -S < %s 2>%t.scev | FileCheck %s
 ; RUN: FileCheck %s --check-prefix=SCEV < %t.scev
 
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/SimplifyCFG/branch-fold-three.ll b/test/Transforms/SimplifyCFG/branch-fold-three.ll
new file mode 100644
index 0000000000000000000000000000000000000000..953749bfb16a3a73891ed8e1d6d4f3c0b960c8bc
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/branch-fold-three.ll
@@ -0,0 +1,259 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+; s >= t, s != t, s <= t
+define void @foo1(i32 %s, i32 %t) {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[S:%.*]], [[T:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END6:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @bar1(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[S]], [[T]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END6]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    call void @bar2(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[S]], [[T]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN4:%.*]], label [[IF_END6]]
+; CHECK:       if.then4:
+; CHECK-NEXT:    call void @bar3(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    br label [[IF_END6]]
+; CHECK:       if.end6:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sge i32 %s, %t
+  br i1 %cmp, label %if.then, label %if.end6
+
+if.then:
+  call void @bar1(i32 %s, i32 %t)
+  %cmp1 = icmp ne i32 %s, %t
+  br i1 %cmp1, label %if.then2, label %if.end6
+
+if.then2:
+  call void @bar2(i32 %s, i32 %t)
+  %cmp3 = icmp sle i32 %s, %t
+  br i1 %cmp3, label %if.then4, label %if.end6
+
+if.then4:
+  call void @bar3(i32 %s, i32 %t)
+  br label %if.end6
+
+if.end6:
+  ret void
+}
+
+; s != t, s >= t, s <= t
+define void @foo11(i32 %s, i32 %t) {
+; CHECK-LABEL: @foo11(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[S:%.*]], [[T:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END6:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @bar1(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i32 [[S]], [[T]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END6]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    call void @bar2(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[S]], [[T]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN4:%.*]], label [[IF_END6]]
+; CHECK:       if.then4:
+; CHECK-NEXT:    call void @bar3(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    br label [[IF_END6]]
+; CHECK:       if.end6:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp ne i32 %s, %t
+  br i1 %cmp, label %if.then, label %if.end6
+
+if.then:
+  call void @bar1(i32 %s, i32 %t)
+  %cmp1 = icmp sge i32 %s, %t
+  br i1 %cmp1, label %if.then2, label %if.end6
+
+if.then2:
+  call void @bar2(i32 %s, i32 %t)
+  %cmp3 = icmp sle i32 %s, %t
+  br i1 %cmp3, label %if.then4, label %if.end6
+
+if.then4:
+  call void @bar3(i32 %s, i32 %t)
+  br label %if.end6
+
+if.end6:
+  ret void
+}
+
+; s >= t, t != s, s <= t
+define void @foo2(i32 %s, i32 %t) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[S:%.*]], [[T:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END6:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @bar1(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[T]], [[S]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END6]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    call void @bar2(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[S]], [[T]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN4:%.*]], label [[IF_END6]]
+; CHECK:       if.then4:
+; CHECK-NEXT:    call void @bar3(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    br label [[IF_END6]]
+; CHECK:       if.end6:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sge i32 %s, %t
+  br i1 %cmp, label %if.then, label %if.end6
+
+if.then:
+  call void @bar1(i32 %s, i32 %t)
+  %cmp1 = icmp ne i32 %t, %s
+  br i1 %cmp1, label %if.then2, label %if.end6
+
+if.then2:
+  call void @bar2(i32 %s, i32 %t)
+  %cmp3 = icmp sle i32 %s, %t
+  br i1 %cmp3, label %if.then4, label %if.end6
+
+if.then4:
+  call void @bar3(i32 %s, i32 %t)
+  br label %if.end6
+
+if.end6:
+  ret void
+}
+
+; s != t, t <= s, s <= t
+define void @foo21(i32 %s, i32 %t) {
+; CHECK-LABEL: @foo21(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[S:%.*]], [[T:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END6:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @bar1(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[T]], [[S]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END6]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    call void @bar2(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[S]], [[T]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN4:%.*]], label [[IF_END6]]
+; CHECK:       if.then4:
+; CHECK-NEXT:    call void @bar3(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    br label [[IF_END6]]
+; CHECK:       if.end6:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp ne i32 %s, %t
+  br i1 %cmp, label %if.then, label %if.end6
+
+if.then:
+  call void @bar1(i32 %s, i32 %t)
+  %cmp1 = icmp sle i32 %t, %s
+  br i1 %cmp1, label %if.then2, label %if.end6
+
+if.then2:
+  call void @bar2(i32 %s, i32 %t)
+  %cmp3 = icmp sle i32 %s, %t
+  br i1 %cmp3, label %if.then4, label %if.end6
+
+if.then4:
+  call void @bar3(i32 %s, i32 %t)
+  br label %if.end6
+
+if.end6:
+  ret void
+}
+
+; t <= s, t != s, s <= t
+define void @foo3(i32 %s, i32 %t) {
+; CHECK-LABEL: @foo3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[T:%.*]], [[S:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END6:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @bar1(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[T]], [[S]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END6]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    call void @bar2(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[S]], [[T]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN4:%.*]], label [[IF_END6]]
+; CHECK:       if.then4:
+; CHECK-NEXT:    call void @bar3(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    br label [[IF_END6]]
+; CHECK:       if.end6:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sle i32 %t, %s
+  br i1 %cmp, label %if.then, label %if.end6
+
+if.then:
+  call void @bar1(i32 %s, i32 %t)
+  %cmp1 = icmp ne i32 %t, %s
+  br i1 %cmp1, label %if.then2, label %if.end6
+
+if.then2:
+  call void @bar2(i32 %s, i32 %t)
+  %cmp3 = icmp sle i32 %s, %t
+  br i1 %cmp3, label %if.then4, label %if.end6
+
+if.then4:
+  call void @bar3(i32 %s, i32 %t)
+  br label %if.end6
+
+if.end6:
+  ret void
+}
+
+; t != s, t <= s, s <= t
+define void @foo31(i32 %s, i32 %t) {
+; CHECK-LABEL: @foo31(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[T:%.*]], [[S:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END6:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @bar1(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[T]], [[S]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END6]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    call void @bar2(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[S]], [[T]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN4:%.*]], label [[IF_END6]]
+; CHECK:       if.then4:
+; CHECK-NEXT:    call void @bar3(i32 [[S]], i32 [[T]])
+; CHECK-NEXT:    br label [[IF_END6]]
+; CHECK:       if.end6:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp ne i32 %t, %s
+  br i1 %cmp, label %if.then, label %if.end6
+
+if.then:
+  call void @bar1(i32 %s, i32 %t)
+  %cmp1 = icmp sle i32 %t, %s
+  br i1 %cmp1, label %if.then2, label %if.end6
+
+if.then2:
+  call void @bar2(i32 %s, i32 %t)
+  %cmp3 = icmp sle i32 %s, %t
+  br i1 %cmp3, label %if.then4, label %if.end6
+
+if.then4:
+  call void @bar3(i32 %s, i32 %t)
+  br label %if.end6
+
+if.end6:
+  ret void
+}
+
+declare void @bar1(i32, i32)
+declare void @bar2(i32, i32)
+declare void @bar3(i32, i32)
+
diff --git a/test/Transforms/SimplifyCFG/implied-and-or.ll b/test/Transforms/SimplifyCFG/implied-and-or.ll
index e615f302feefd4f091878204512a6c92e5a47717..dd885c94bdc9c75b6d1f29ce33cf567c334b7758 100644
--- a/test/Transforms/SimplifyCFG/implied-and-or.ll
+++ b/test/Transforms/SimplifyCFG/implied-and-or.ll
@@ -1,16 +1,23 @@
-; RUN: opt %s -S -simplifycfg | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -simplifycfg | FileCheck %s
 
 declare void @foo()
 declare void @bar()
 
-
-; CHECK-LABEL: @test_and1
-; CHECK: taken:
-; CHECK-NOT: cmp3
-; CHECK: call void @bar()
-; CHECK-NEXT: call void @foo()
-; CHECK: ret
 define void @test_and1(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_and1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    br i1 [[AND]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp1 = icmp eq i32 %a, 0
   %cmp2 = icmp eq i32 %b, 0
@@ -31,15 +38,24 @@ end:
 }
 
 ; We can't infer anything if the result of the 'and' is false
-; CHECK-LABEL: @test_and2
-; CHECK: taken:
-; CHECK:   call void @bar()
-; CHECK:   %cmp3
-; CHECK:   br i1 %cmp3
-; CHECK: if.then:
-; CHECK:   call void @foo()
-; CHECK: ret
+
 define void @test_and2(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_and2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    br i1 [[AND]], label [[END:%.*]], label [[TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[END]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp1 = icmp eq i32 %a, 0
   %cmp2 = icmp eq i32 %b, 0
@@ -59,13 +75,20 @@ end:
   ret void
 }
 
-; CHECK-LABEL: @test_or1
-; CHECK: taken:
-; CHECK-NOT: cmp3
-; CHECK: call void @bar()
-; CHECK-NEXT: call void @foo()
-; CHECK: ret
 define void @test_or1(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_or1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    br i1 [[OR]], label [[END:%.*]], label [[TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp1 = icmp eq i32 %a, 0
   %cmp2 = icmp eq i32 %b, 0
@@ -86,14 +109,24 @@ end:
 }
 
 ; We can't infer anything if the result of the 'or' is true
-; CHECK-LABEL: @test_or2
-; CHECK:   call void @bar()
-; CHECK:   %cmp3
-; CHECK:   br i1 %cmp3
-; CHECK: if.then:
-; CHECK:   call void @foo()
-; CHECK: ret
+
 define void @test_or2(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_or2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    br i1 [[OR]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[END]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp1 = icmp eq i32 %a, 0
   %cmp2 = icmp eq i32 %b, 0
@@ -114,13 +147,23 @@ end:
 }
 
 ; We can recurse a tree of 'and' or 'or's.
-; CHECK-LABEL: @test_and_recurse1
-; CHECK: taken:
-; CHECK-NEXT:  call void @bar()
-; CHECK-NEXT:  call void @foo()
-; CHECK-NEXT:  br label %end
-; CHECK: ret
+
 define void @test_and_recurse1(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @test_and_recurse1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMPA:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CMPB:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[CMPC:%.*]] = icmp eq i32 [[C:%.*]], 0
+; CHECK-NEXT:    [[AND1:%.*]] = and i1 [[CMPA]], [[CMPB]]
+; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[CMPC]]
+; CHECK-NEXT:    br i1 [[AND2]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmpa = icmp eq i32 %a, 0
   %cmpb = icmp eq i32 %b, 0
@@ -143,14 +186,37 @@ end:
 }
 
 ; Check to make sure we don't recurse too deep.
-; CHECK-LABEL: @test_and_recurse2
-; CHECK: taken:
-; CHECK-NEXT:  call void @bar()
-; CHECK-NEXT:  %cmp3 = icmp eq i32 %a, 0
-; CHECK-NEXT:  br i1 %cmp3, label %if.then, label %end
-; CHECK: ret
+
 define void @test_and_recurse2(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
-                               i32 %g, i32 %h) {
+; CHECK-LABEL: @test_and_recurse2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMPA:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[CMPB:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[CMPC:%.*]] = icmp eq i32 [[C:%.*]], 0
+; CHECK-NEXT:    [[CMPD:%.*]] = icmp eq i32 [[D:%.*]], 0
+; CHECK-NEXT:    [[CMPE:%.*]] = icmp eq i32 [[E:%.*]], 0
+; CHECK-NEXT:    [[CMPF:%.*]] = icmp eq i32 [[F:%.*]], 0
+; CHECK-NEXT:    [[CMPG:%.*]] = icmp eq i32 [[G:%.*]], 0
+; CHECK-NEXT:    [[CMPH:%.*]] = icmp eq i32 [[H:%.*]], 0
+; CHECK-NEXT:    [[AND1:%.*]] = and i1 [[CMPA]], [[CMPB]]
+; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[CMPC]]
+; CHECK-NEXT:    [[AND3:%.*]] = and i1 [[AND2]], [[CMPD]]
+; CHECK-NEXT:    [[AND4:%.*]] = and i1 [[AND3]], [[CMPE]]
+; CHECK-NEXT:    [[AND5:%.*]] = and i1 [[AND4]], [[CMPF]]
+; CHECK-NEXT:    [[AND6:%.*]] = and i1 [[AND5]], [[CMPG]]
+; CHECK-NEXT:    [[AND7:%.*]] = and i1 [[AND6]], [[CMPH]]
+; CHECK-NEXT:    br i1 [[AND7]], label [[TAKEN:%.*]], label [[END:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[END]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+  i32 %g, i32 %h) {
 entry:
   %cmpa = icmp eq i32 %a, 0
   %cmpb = icmp eq i32 %b, 0
@@ -181,3 +247,4 @@ if.then:
 end:
   ret void
 }
+
diff --git a/test/Transforms/SimplifyCFG/pr39807.ll b/test/Transforms/SimplifyCFG/pr39807.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8148f79503c8461492a62138d59258487017b35b
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/pr39807.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S -simplifycfg < %s | FileCheck %s
+
+declare void @personality()
+
+define void @test(i1 %b) personality void()* @personality !dbg !1 {
+; CHECK:      invoke void @inlinable()
+; CHECK-NEXT:    to label %success unwind label %failure, !dbg ![[DBGLOC:[0-9]+]]
+    br i1 %b, label %if, label %else
+
+if:
+    invoke void @inlinable()
+        to label %success unwind label %failure, !dbg !2
+
+else:
+    invoke void @inlinable()
+        to label %success unwind label %failure, !dbg !8
+
+success:
+    ret void
+
+failure:
+    landingpad {}
+        cleanup
+    ret void
+}
+
+define internal void @inlinable() !dbg !7 {
+    ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!4, !5, !6}
+
+; CHECK: ![[DBGLOC]] = !DILocation(line: 0
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, runtimeVersion: 0, file: !3)
+!1 = distinct !DISubprogram(name: "test", unit: !0)
+!2 = !DILocation(line: 2, scope: !1)
+!3 = !DIFile(filename: "foo", directory: ".")
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 4}
+!7 = distinct !DISubprogram(name: "inlinable", unit: !0)
+!8 = !DILocation(line: 3, scope: !1)
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
index dba5dcf68b0c4d4fb5b03a4cf9e7f02ddba3646d..66a34bab3e5b01e982edf17e212c791713c2fa5b 100644
--- a/test/Transforms/SimplifyCFG/preserve-branchweights.ll
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
@@ -5,9 +5,20 @@ declare void @helper(i32)
 
 define void @test1(i1 %a, i1 %b) {
 ; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_NOT:%.*]] = xor i1 [[A:%.*]], true
+; CHECK-NEXT:    [[C:%.*]] = or i1 [[B:%.*]], false
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[A_NOT]], [[C]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof !0
+; CHECK:       Y:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    ret void
+; CHECK:       Z:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    ret void
+;
 entry:
   br i1 %a, label %Y, label %X, !prof !0
-; CHECK: br i1 %or.cond, label %Z, label %Y, !prof !0
 
 X:
   %c = or i1 %b, false
@@ -26,11 +37,20 @@ Z:
 
 define void @fake_weights(i1 %a, i1 %b) {
 ; CHECK-LABEL: @fake_weights(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_NOT:%.*]] = xor i1 [[A:%.*]], true
+; CHECK-NEXT:    [[C:%.*]] = or i1 [[B:%.*]], false
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[A_NOT]], [[C]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof !1
+; CHECK:       Y:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    ret void
+; CHECK:       Z:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    ret void
+;
 entry:
   br i1 %a, label %Y, label %X, !prof !12
-; CHECK:        %or.cond = and i1 %a.not, %c
-; CHECK-NEXT:   br i1 %or.cond, label %Z, label %Y, !prof !1
-; CHECK:      Y:
 X:
   %c = or i1 %b, false
   br i1 %c, label %Z, label %Y, !prof !1
@@ -46,10 +66,19 @@ Z:
 
 define void @test2(i1 %a, i1 %b) {
 ; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = or i1 [[B:%.*]], false
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[A:%.*]], [[C]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof !2
+; CHECK:       Y:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    ret void
+; CHECK:       Z:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    ret void
+;
 entry:
   br i1 %a, label %X, label %Y, !prof !1
-; CHECK: br i1 %or.cond, label %Z, label %Y, !prof !2
-; CHECK-NOT: !prof
 
 X:
   %c = or i1 %b, false
@@ -66,7 +95,17 @@ Z:
 
 define void @test3(i1 %a, i1 %b) {
 ; CHECK-LABEL: @test3(
-; CHECK: br i1 %or.cond, label %Z, label %Y, !prof !1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = or i1 [[B:%.*]], false
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[A:%.*]], [[C]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof !1
+; CHECK:       Y:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    ret void
+; CHECK:       Z:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    ret void
+;
 entry:
   br i1 %a, label %X, label %Y, !prof !1
 
@@ -85,7 +124,17 @@ Z:
 
 define void @test4(i1 %a, i1 %b) {
 ; CHECK-LABEL: @test4(
-; CHECK: br i1 %or.cond, label %Z, label %Y, !prof !1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = or i1 [[B:%.*]], false
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[A:%.*]], [[C]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof !1
+; CHECK:       Y:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    ret void
+; CHECK:       Z:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    ret void
+;
 entry:
   br i1 %a, label %X, label %Y
 
@@ -104,17 +153,30 @@ Z:
 
 ;; test5 - The case where it jumps to the default target will be removed.
 define void @test5(i32 %M, i32 %N) nounwind uwtable {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[N:%.*]], label [[SW2:%.*]] [
+; CHECK-NEXT:    i32 3, label [[SW_BB1:%.*]]
+; CHECK-NEXT:    i32 2, label [[SW_BB:%.*]]
+; CHECK-NEXT:    ], !prof !3
+; CHECK:       sw.bb:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    br label [[SW_EPILOG:%.*]]
+; CHECK:       sw.bb1:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    br label [[SW_EPILOG]]
+; CHECK:       sw2:
+; CHECK-NEXT:    call void @helper(i32 2)
+; CHECK-NEXT:    br label [[SW_EPILOG]]
+; CHECK:       sw.epilog:
+; CHECK-NEXT:    ret void
+;
 entry:
   switch i32 %N, label %sw2 [
-    i32 1, label %sw2
-    i32 2, label %sw.bb
-    i32 3, label %sw.bb1
+  i32 1, label %sw2
+  i32 2, label %sw.bb
+  i32 3, label %sw.bb1
   ], !prof !3
-; CHECK-LABEL: @test5(
-; CHECK: switch i32 %N, label %sw2 [
-; CHECK: i32 3, label %sw.bb1
-; CHECK: i32 2, label %sw.bb
-; CHECK: ], !prof !3
 
 sw.bb:
   call void @helper(i32 0)
@@ -136,18 +198,31 @@ sw.epilog:
 ;; Then the second switch will be converted to a branch, finally, the first
 ;; switch and the branch will be merged into a single switch.
 define void @test6(i32 %M, i32 %N) nounwind uwtable {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[N:%.*]], label [[SW_EPILOG:%.*]] [
+; CHECK-NEXT:    i32 3, label [[SW_BB1:%.*]]
+; CHECK-NEXT:    i32 2, label [[SW_BB:%.*]]
+; CHECK-NEXT:    i32 4, label [[SW_BB5:%.*]]
+; CHECK-NEXT:    ], !prof !4
+; CHECK:       sw.bb:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    br label [[SW_EPILOG]]
+; CHECK:       sw.bb1:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    br label [[SW_EPILOG]]
+; CHECK:       sw.bb5:
+; CHECK-NEXT:    call void @helper(i32 3)
+; CHECK-NEXT:    br label [[SW_EPILOG]]
+; CHECK:       sw.epilog:
+; CHECK-NEXT:    ret void
+;
 entry:
   switch i32 %N, label %sw2 [
-    i32 1, label %sw2
-    i32 2, label %sw.bb
-    i32 3, label %sw.bb1
+  i32 1, label %sw2
+  i32 2, label %sw.bb
+  i32 3, label %sw.bb1
   ], !prof !4
-; CHECK-LABEL: @test6(
-; CHECK: switch i32 %N, label %sw.epilog
-; CHECK: i32 3, label %sw.bb1
-; CHECK: i32 2, label %sw.bb
-; CHECK: i32 4, label %sw.bb5
-; CHECK: ], !prof !4
 
 sw.bb:
   call void @helper(i32 0)
@@ -161,8 +236,8 @@ sw2:
 ;; Here "case 2" is invalidated since the default case of the first switch
 ;; does not include "case 2".
   switch i32 %N, label %sw.epilog [
-    i32 2, label %sw.bb4
-    i32 4, label %sw.bb5
+  i32 2, label %sw.bb4
+  i32 4, label %sw.bb5
   ], !prof !5
 
 sw.bb4:
@@ -180,9 +255,19 @@ sw.epilog:
 ;; This test is based on test1 but swapped the targets of the second branch.
 define void @test1_swap(i1 %a, i1 %b) {
 ; CHECK-LABEL: @test1_swap(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = or i1 [[B:%.*]], false
+; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[A:%.*]], [[C]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[Y:%.*]], label [[Z:%.*]], !prof !5
+; CHECK:       Y:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    ret void
+; CHECK:       Z:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    ret void
+;
 entry:
   br i1 %a, label %Y, label %X, !prof !0
-; CHECK: br i1 %or.cond, label %Y, label %Z, !prof !5
 
 X:
   %c = or i1 %b, false
@@ -199,10 +284,20 @@ Z:
 
 define void @test7(i1 %a, i1 %b) {
 ; CHECK-LABEL: @test7(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = or i1 [[B:%.*]], false
+; CHECK-NEXT:    [[BRMERGE:%.*]] = or i1 [[A:%.*]], [[C]]
+; CHECK-NEXT:    br i1 [[BRMERGE]], label [[Y:%.*]], label [[Z:%.*]], !prof !6
+; CHECK:       Y:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    ret void
+; CHECK:       Z:
+; CHECK-NEXT:    call void @helper(i32 1)
+; CHECK-NEXT:    ret void
+;
 entry:
   %c = or i1 %b, false
   br i1 %a, label %Y, label %X, !prof !0
-; CHECK: br i1 %brmerge, label %Y, label %Z, !prof !6
 
 X:
   br i1 %c, label %Y, label %Z, !prof !6
@@ -219,114 +314,150 @@ Z:
 ; Test basic folding to a conditional branch.
 define void @test8(i64 %x, i64 %y) nounwind {
 ; CHECK-LABEL: @test8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LT:%.*]] = icmp slt i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[LT]], label [[A:%.*]], label [[B:%.*]], !prof !7
+; CHECK:       a:
+; CHECK-NEXT:    call void @helper(i32 0) #1
+; CHECK-NEXT:    ret void
+; CHECK:       b:
+; CHECK-NEXT:    call void @helper(i32 1) #1
+; CHECK-NEXT:    ret void
+;
 entry:
-    %lt = icmp slt i64 %x, %y
-; CHECK: br i1 %lt, label %a, label %b, !prof !7
-    %qux = select i1 %lt, i32 0, i32 2
-    switch i32 %qux, label %bees [
-        i32 0, label %a
-        i32 1, label %b
-        i32 2, label %b
-    ], !prof !7
+  %lt = icmp slt i64 %x, %y
+  %qux = select i1 %lt, i32 0, i32 2
+  switch i32 %qux, label %bees [
+  i32 0, label %a
+  i32 1, label %b
+  i32 2, label %b
+  ], !prof !7
 a:
-    call void @helper(i32 0) nounwind
-    ret void
+  call void @helper(i32 0) nounwind
+  ret void
 b:
-    call void @helper(i32 1) nounwind
-    ret void
+  call void @helper(i32 1) nounwind
+  ret void
 bees:
-    call void @helper(i32 2) nounwind
-    ret void
+  call void @helper(i32 2) nounwind
+  ret void
 }
 
 ; Test edge splitting when the default target has icmp and unconditinal
 ; branch
 define i1 @test9(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: @test9(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[BEES:%.*]] [
+; CHECK-NEXT:    i32 0, label [[A:%.*]]
+; CHECK-NEXT:    i32 1, label [[END:%.*]]
+; CHECK-NEXT:    i32 2, label [[END]]
+; CHECK-NEXT:    i32 92, label [[END]]
+; CHECK-NEXT:    ], !prof !8
+; CHECK:       a:
+; CHECK-NEXT:    call void @helper(i32 0) #1
+; CHECK-NEXT:    [[RETA:%.*]] = icmp slt i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[RETA]]
+; CHECK:       bees:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[RET:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[BEES]] ], [ true, [[ENTRY]] ], [ true, [[ENTRY]] ]
+; CHECK-NEXT:    call void @helper(i32 2) #1
+; CHECK-NEXT:    ret i1 [[RET]]
+;
 entry:
-    switch i32 %x, label %bees [
-        i32 0, label %a
-        i32 1, label %end
-        i32 2, label %end
-    ], !prof !7
-; CHECK: switch i32 %x, label %bees [
-; CHECK: i32 0, label %a
-; CHECK: i32 1, label %end
-; CHECK: i32 2, label %end
-; CHECK: i32 92, label %end
-; CHECK: ], !prof !8
+  switch i32 %x, label %bees [
+  i32 0, label %a
+  i32 1, label %end
+  i32 2, label %end
+  ], !prof !7
 
 a:
-    call void @helper(i32 0) nounwind
-    %reta = icmp slt i32 %x, %y
-    ret i1 %reta
+  call void @helper(i32 0) nounwind
+  %reta = icmp slt i32 %x, %y
+  ret i1 %reta
 
 bees:
-    %tmp = icmp eq i32 %x, 92
-    br label %end
+  %tmp = icmp eq i32 %x, 92
+  br label %end
 
 end:
-; CHECK: end:
-; CHECK: %ret = phi i1 [ true, %entry ], [ false, %bees ], [ true, %entry ], [ true, %entry ]
-    %ret = phi i1 [ true, %entry ], [%tmp, %bees], [true, %entry]
-    call void @helper(i32 2) nounwind
-    ret i1 %ret
+  %ret = phi i1 [ true, %entry ], [%tmp, %bees], [true, %entry]
+  call void @helper(i32 2) nounwind
+  ret i1 %ret
 }
 
 define void @test10(i32 %x) nounwind readnone ssp noredzone {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X_OFF:%.*]] = add i32 [[X:%.*]], -1
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp ult i32 [[X_OFF]], 3
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[LOR_END:%.*]], label [[LOR_RHS:%.*]], !prof !9
+; CHECK:       lor.rhs:
+; CHECK-NEXT:    call void @helper(i32 1) #1
+; CHECK-NEXT:    ret void
+; CHECK:       lor.end:
+; CHECK-NEXT:    call void @helper(i32 0) #1
+; CHECK-NEXT:    ret void
+;
 entry:
- switch i32 %x, label %lor.rhs [
-   i32 2, label %lor.end
-   i32 1, label %lor.end
-   i32 3, label %lor.end
- ], !prof !7
+  switch i32 %x, label %lor.rhs [
+  i32 2, label %lor.end
+  i32 1, label %lor.end
+  i32 3, label %lor.end
+  ], !prof !7
 
 lor.rhs:
- call void @helper(i32 1) nounwind
- ret void
+  call void @helper(i32 1) nounwind
+  ret void
 
 lor.end:
- call void @helper(i32 0) nounwind
- ret void
+  call void @helper(i32 0) nounwind
+  ret void
 
-; CHECK-LABEL: @test10(
-; CHECK: %x.off = add i32 %x, -1
-; CHECK: %switch = icmp ult i32 %x.off, 3
-; CHECK: br i1 %switch, label %lor.end, label %lor.rhs, !prof !9
 }
 
 ; Remove dead cases from the switch.
 define void @test11(i32 %x) nounwind {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[I:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[I]], 24
+; CHECK-NEXT:    br i1 [[COND]], label [[C:%.*]], label [[A:%.*]], !prof !10
+; CHECK:       a:
+; CHECK-NEXT:    call void @helper(i32 0) #1
+; CHECK-NEXT:    ret void
+; CHECK:       c:
+; CHECK-NEXT:    call void @helper(i32 2) #1
+; CHECK-NEXT:    ret void
+;
   %i = shl i32 %x, 1
   switch i32 %i, label %a [
-    i32 21, label %b
-    i32 24, label %c
+  i32 21, label %b
+  i32 24, label %c
   ], !prof !8
-; CHECK-LABEL: @test11(
-; CHECK: %cond = icmp eq i32 %i, 24
-; CHECK: br i1 %cond, label %c, label %a, !prof !10
 
 a:
- call void @helper(i32 0) nounwind
- ret void
+  call void @helper(i32 0) nounwind
+  ret void
 b:
- call void @helper(i32 1) nounwind
- ret void
+  call void @helper(i32 1) nounwind
+  ret void
 c:
- call void @helper(i32 2) nounwind
- ret void
+  call void @helper(i32 2) nounwind
+  ret void
 }
 
 ;; test12 - Don't crash if the whole switch is removed
 define void @test12(i32 %M, i32 %N) nounwind uwtable {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @helper(i32 0)
+; CHECK-NEXT:    ret void
+;
 entry:
   switch i32 %N, label %sw.bb [
-    i32 1, label %sw.bb
+  i32 1, label %sw.bb
   ], !prof !9
-; CHECK-LABEL: @test12(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @helper
-; CHECK-NEXT: ret void
 
 sw.bb:
   call void @helper(i32 0)
@@ -339,26 +470,27 @@ sw.epilog:
 ;; If every case is dead, make sure they are all removed. This used to
 ;; crash trying to merge the metadata.
 define void @test13(i32 %x) nounwind {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @helper(i32 0) #1
+; CHECK-NEXT:    ret void
+;
 entry:
   %i = shl i32 %x, 1
   switch i32 %i, label %a [
-    i32 21, label %b
-    i32 25, label %c
+  i32 21, label %b
+  i32 25, label %c
   ], !prof !8
-; CHECK-LABEL: @test13(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @helper
-; CHECK-NEXT: ret void
 
 a:
- call void @helper(i32 0) nounwind
- ret void
+  call void @helper(i32 0) nounwind
+  ret void
 b:
- call void @helper(i32 1) nounwind
- ret void
+  call void @helper(i32 1) nounwind
+  ret void
 c:
- call void @helper(i32 2) nounwind
- ret void
+  call void @helper(i32 2) nounwind
+  ret void
 }
 
 ;; When folding branches to common destination, the updated branch weights
@@ -366,8 +498,24 @@ c:
 ;; weights until they can fit into uint32.
 @max_regno = common global i32 0, align 4
 define void @test14(i32* %old, i32 %final) {
-; CHECK-LABEL: @test14
-; CHECK: br i1 %or.cond, label %for.exit, label %for.inc, !prof !11
+; CHECK-LABEL: @test14(
+; CHECK-NEXT:  for.cond:
+; CHECK-NEXT:    br label [[FOR_COND2:%.*]]
+; CHECK:       for.cond2:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[INC19:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_COND:%.*]] ]
+; CHECK-NEXT:    [[BIT_0:%.*]] = phi i32 [ [[SHL:%.*]], [[FOR_INC]] ], [ 1, [[FOR_COND]] ]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[BIT_0]], 0
+; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* @max_regno, align 4
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp eq i32 [[I_1]], [[V3]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[TOBOOL]], [[CMP4]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FOR_EXIT:%.*]], label [[FOR_INC]], !prof !11
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[SHL]] = shl i32 [[BIT_0]], 1
+; CHECK-NEXT:    [[INC19]] = add nsw i32 [[I_1]], 1
+; CHECK-NEXT:    br label [[FOR_COND2]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
 for.cond:
   br label %for.cond2
 for.cond2:
@@ -392,7 +540,7 @@ for.exit:
 define i32 @HoistThenElseCodeToIf(i32 %n) {
 ; CHECK-LABEL: @HoistThenElseCodeToIf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 %n, 0
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[N:%.*]], 0
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[TOBOOL]], i32 1, i32 234, !prof !12
 ; CHECK-NEXT:    ret i32 [[DOT]]
 ;
@@ -416,8 +564,8 @@ return:
 define i32 @SimplifyCondBranchToCondBranch(i1 %cmpa, i1 %cmpb) {
 ; CHECK-LABEL: @SimplifyCondBranchToCondBranch(
 ; CHECK-NEXT:  block1:
-; CHECK-NEXT:    [[BRMERGE:%.*]] = or i1 %cmpa, %cmpb
-; CHECK-NEXT:    [[DOTMUX:%.*]] = select i1 %cmpa, i32 0, i32 2, !prof !13
+; CHECK-NEXT:    [[BRMERGE:%.*]] = or i1 [[CMPA:%.*]], [[CMPB:%.*]]
+; CHECK-NEXT:    [[DOTMUX:%.*]] = select i1 [[CMPA]], i32 0, i32 2, !prof !13
 ; CHECK-NEXT:    [[OUTVAL:%.*]] = select i1 [[BRMERGE]], i32 [[DOTMUX]], i32 1, !prof !14
 ; CHECK-NEXT:    ret i32 [[OUTVAL]]
 ;
@@ -441,8 +589,8 @@ exit:
 define i32 @SimplifyCondBranchToCondBranchSwap(i1 %cmpa, i1 %cmpb) {
 ; CHECK-LABEL: @SimplifyCondBranchToCondBranchSwap(
 ; CHECK-NEXT:  block1:
-; CHECK-NEXT:    [[CMPA_NOT:%.*]] = xor i1 %cmpa, true
-; CHECK-NEXT:    [[CMPB_NOT:%.*]] = xor i1 %cmpb, true
+; CHECK-NEXT:    [[CMPA_NOT:%.*]] = xor i1 [[CMPA:%.*]], true
+; CHECK-NEXT:    [[CMPB_NOT:%.*]] = xor i1 [[CMPB:%.*]], true
 ; CHECK-NEXT:    [[BRMERGE:%.*]] = or i1 [[CMPA_NOT]], [[CMPB_NOT]]
 ; CHECK-NEXT:    [[DOTMUX:%.*]] = select i1 [[CMPA_NOT]], i32 0, i32 2, !prof !15
 ; CHECK-NEXT:    [[OUTVAL:%.*]] = select i1 [[BRMERGE]], i32 [[DOTMUX]], i32 1, !prof !16
@@ -466,8 +614,8 @@ exit:
 define i32 @SimplifyCondBranchToCondBranchSwapMissingWeight(i1 %cmpa, i1 %cmpb) {
 ; CHECK-LABEL: @SimplifyCondBranchToCondBranchSwapMissingWeight(
 ; CHECK-NEXT:  block1:
-; CHECK-NEXT:    [[CMPA_NOT:%.*]] = xor i1 %cmpa, true 
-; CHECK-NEXT:    [[CMPB_NOT:%.*]] = xor i1 %cmpb, true
+; CHECK-NEXT:    [[CMPA_NOT:%.*]] = xor i1 [[CMPA:%.*]], true
+; CHECK-NEXT:    [[CMPB_NOT:%.*]] = xor i1 [[CMPB:%.*]], true
 ; CHECK-NEXT:    [[BRMERGE:%.*]] = or i1 [[CMPA_NOT]], [[CMPB_NOT]]
 ; CHECK-NEXT:    [[DOTMUX:%.*]] = select i1 [[CMPA_NOT]], i32 0, i32 2, !prof !17
 ; CHECK-NEXT:    [[OUTVAL:%.*]] = select i1 [[BRMERGE]], i32 [[DOTMUX]], i32 1, !prof !18
diff --git a/test/Transforms/SimplifyCFG/switch-on-const-select.ll b/test/Transforms/SimplifyCFG/switch-on-const-select.ll
index c33030fa5d121561e1c1be0f26cefd7e56e24bf1..165e5b264aef733ab3d13cacf9b1298aed656155 100644
--- a/test/Transforms/SimplifyCFG/switch-on-const-select.ll
+++ b/test/Transforms/SimplifyCFG/switch-on-const-select.ll
@@ -138,4 +138,4 @@ declare void @bees.a() nounwind
 declare void @bees.b() nounwind
 
 ; CHECK: attributes [[$NUW]] = { nounwind }
-; CHECK: attributes #1 = { noreturn nounwind }
+; CHECK: attributes #1 = { cold noreturn nounwind }
diff --git a/test/Transforms/Util/call-promotion-utils-ptrcast-attribute.ll b/test/Transforms/Util/call-promotion-utils-ptrcast-attribute.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2b4c8c8e85fb5391e7207b0b2a9bac77cca87864
--- /dev/null
+++ b/test/Transforms/Util/call-promotion-utils-ptrcast-attribute.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -pgo-icall-prom -icp-total-percent-threshold=0 < %s 2>&1 | FileCheck %s
+
+; Test that CallPromotionUtils will promote calls which require pointer cast
+; safely, i.e. drop incompatible attributes.
+
+@foo = common global i8* (i8*)* null, align 8
+
+; casting to i64 and pointer attribute at callsite dropped.
+define i64 @func2(i64 %a) {
+  ret i64 undef
+}
+
+; no casting needed, attribute at callsite preserved.
+define i8* @func4(i8* %a) {
+  ret i8* undef
+}
+
+define i8* @bar(i8* %arg) {
+  %tmp = load i8* (i8*)*, i8* (i8*)** @foo, align 8
+
+; Make sure callsite attributes are preserved on arguments and retval.
+; CHECK: call noalias i8* @func4(i8* nonnull
+
+; Make sure callsite attributes are dropped on arguments and retval.
+; CHECK: [[ARG:%[0-9]+]] = ptrtoint i8* %arg to i64
+; CHECK-NEXT: call i64 @func2(i64 [[ARG]])
+
+  %call = call noalias i8* %tmp(i8* nonnull %arg), !prof !1
+  ret i8* %call
+}
+
+!1 = !{!"VP", i32 0, i64 1440, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410}
diff --git a/test/Transforms/Util/split-bit-piece.ll b/test/Transforms/Util/dbg-user-of-aext.ll
similarity index 88%
rename from test/Transforms/Util/split-bit-piece.ll
rename to test/Transforms/Util/dbg-user-of-aext.ll
index 86f464266cfb5d97e9fcb62ac7913d8008d00d30..9a31066938946e16d03ba1c2d48543c3891e1905 100644
--- a/test/Transforms/Util/split-bit-piece.ll
+++ b/test/Transforms/Util/dbg-user-of-aext.ll
@@ -1,6 +1,6 @@
 ; Checks that llvm.dbg.declare -> llvm.dbg.value conversion utility
-; (here exposed through the SROA) pass, properly inserts bit_piece expressions
-; if it only describes part of the variable.
+; (here exposed through the SROA) pass refers to [s|z]exts of values (as
+; opposed to the operand of a [s|z]ext).
 ; RUN: opt -S -sroa %s | FileCheck %s
 
 ; Built from:
@@ -21,8 +21,8 @@
 
 ; CHECK: call void @llvm.dbg.value(metadata i8 %g.coerce0, metadata ![[VAR_STRUCT:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 8))
 ; CHECK: call void @llvm.dbg.value(metadata i64 %g.coerce1, metadata ![[VAR_STRUCT]], metadata !DIExpression(DW_OP_LLVM_fragment, 32, 64))
-; CHECK: call void @llvm.dbg.value(metadata i1 %b, metadata ![[VAR_BOOL:[0-9]+]], metadata !DIExpression())
-; CHECK: call void @llvm.dbg.value(metadata i1 %frag, metadata ![[VAR_FRAG:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 1))
+; CHECK: call void @llvm.dbg.value(metadata i8 %frombool, metadata ![[VAR_BOOL:[0-9]+]], metadata !DIExpression())
+; CHECK: call void @llvm.dbg.value(metadata i8 %frombool1, metadata ![[VAR_FRAG:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 4))
 
 %struct.foo = type { i8, i64 }
 
@@ -40,13 +40,17 @@ entry:
   %frombool = zext i1 %b to i8
   store i8 %frombool, i8* %b.addr, align 1
   call void @llvm.dbg.declare(metadata i8* %b.addr, metadata !15, metadata !16), !dbg !17
-  %frombool1 = zext i1 %frag to i8
+  %frombool1 = sext i1 %frag to i8
   store i8 %frombool1, i8* %frag.addr, align 1
   call void @llvm.dbg.declare(metadata i8* %frag.addr, metadata !18, metadata !23), !dbg !19
   call void @llvm.dbg.declare(metadata %struct.foo* %g, metadata !20, metadata !16), !dbg !21
   ret void, !dbg !22
 }
 
+; CHECK: ![[VAR_STRUCT]] = !DILocalVariable(name: "g"
+; CHECK: ![[VAR_BOOL]] = !DILocalVariable(name: "b"
+; CHECK: ![[VAR_FRAG]] = !DILocalVariable(name: "frag"
+
 ; Function Attrs: nounwind readnone speculatable
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
diff --git a/test/Verifier/commandline-meta1.ll b/test/Verifier/commandline-meta1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5c39bbdde6da4edc6f84d536895839bd609f73f6
--- /dev/null
+++ b/test/Verifier/commandline-meta1.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; Verify that llvm.commandline is properly structured.
+; llvm.commandline takes a list of metadata entries.
+; Each metadata entry can have only one string.
+
+!llvm.commandline = !{!0}
+!0 = !{!"string1", !"string2"}
+; CHECK: assembly parsed, but does not verify as correct!
+; CHECK-NEXT: incorrect number of operands in llvm.commandline metadata
+; CHECK-NEXT: !0
diff --git a/test/Verifier/commandline-meta2.ll b/test/Verifier/commandline-meta2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..46eeb8cdf4eb14bb89173c81bb73680f966300ee
--- /dev/null
+++ b/test/Verifier/commandline-meta2.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; Verify that llvm.commandline is properly structured.
+; llvm.commandline takes a list of metadata entries.
+; Each metadata entry can contain one string only.
+
+!llvm.commandline = !{!0}
+!0 = !{i32 1}
+; CHECK: assembly parsed, but does not verify as correct!
+; CHECK-NEXT: invalid value for llvm.commandline metadata entry operand(the operand should be a string)
+; CHECK-NEXT: i32 1
diff --git a/test/Verifier/commandline-meta3.ll b/test/Verifier/commandline-meta3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f27f7ca56502ff483ee325f3a598daa9d953f638
--- /dev/null
+++ b/test/Verifier/commandline-meta3.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; Verify that llvm.commandline is properly structured.
+; llvm.commandline takes a list of metadata entries.
+; Each metadata entry can contain one string only.
+
+!llvm.commandline = !{!0}
+!0 = !{!{!"nested metadata"}}
+; CHECK: assembly parsed, but does not verify as correct!
+; CHECK-NEXT: invalid value for llvm.commandline metadata entry operand(the operand should be a string)
+; CHECK-NEXT: !1
diff --git a/test/Verifier/commandline-meta4.ll b/test/Verifier/commandline-meta4.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b0d103c4b235e6d2f1e99b2e8c06f8ab850ca0c7
--- /dev/null
+++ b/test/Verifier/commandline-meta4.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; Verify that llvm.commandline is properly structured.
+; llvm.commandline takes a list of metadata entries.
+; Each metadata entry can contain one string only.
+
+!llvm.commandline = !{!0}
+!0 = !{null}
+; CHECK: assembly parsed, but does not verify as correct!
+; CHECK-NEXT: invalid value for llvm.commandline metadata entry operand(the operand should be a string)
diff --git a/test/Verifier/gisel-g_build_vector.mir b/test/Verifier/gisel-g_build_vector.mir
new file mode 100644
index 0000000000000000000000000000000000000000..a40942ef47bc35e146b2c64e20731e3c5ce82b91
--- /dev/null
+++ b/test/Verifier/gisel-g_build_vector.mir
@@ -0,0 +1,27 @@
+#RUN: not llc -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+# REQUIRES: global-isel, aarch64-registered-target
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-unknown"
+  
+  define i32 @g_build_vector() {
+    ret i32 0
+  }
+
+...
+---
+name:            g_build_vector
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: _, preferred-register: '' }
+liveins:         
+body:             |
+  bb.0:
+    ; CHECK: Bad machine code: G_BUILD_VECTOR src operands total size don't match dest size
+
+    %0(s32) = IMPLICIT_DEF
+    %1:_(<2 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0
+...
diff --git a/test/Verifier/gisel-g_build_vector_trunc.mir b/test/Verifier/gisel-g_build_vector_trunc.mir
new file mode 100644
index 0000000000000000000000000000000000000000..3b9b304b6673de4c4252cf2ed04c49925102082b
--- /dev/null
+++ b/test/Verifier/gisel-g_build_vector_trunc.mir
@@ -0,0 +1,27 @@
+#RUN: not llc -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+# REQUIRES: global-isel, aarch64-registered-target
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-unknown"
+  
+  define i32 @g_build_vector_trunc() {
+    ret i32 0
+  }
+
+...
+---
+name:            g_build_vector_trunc
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: _, preferred-register: '' }
+liveins:         
+body:             |
+  bb.0:
+    ; CHECK: Bad machine code: G_BUILD_VECTOR_TRUNC source operand types are not larger than dest elt type
+
+    %0(s32) = IMPLICIT_DEF
+    %1:_(<2 x s32>) = G_BUILD_VECTOR_TRUNC %0, %0
+...
diff --git a/test/Verifier/gisel-g_concat_vector.mir b/test/Verifier/gisel-g_concat_vector.mir
new file mode 100644
index 0000000000000000000000000000000000000000..640c4a4ceedb80bf36eea734215ecd3af34580dc
--- /dev/null
+++ b/test/Verifier/gisel-g_concat_vector.mir
@@ -0,0 +1,29 @@
+#RUN: not llc -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+# REQUIRES: global-isel, aarch64-registered-target
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-unknown"
+  
+  define i32 @g_concat_vectors() {
+    ret i32 0
+  }
+
+...
+---
+name:            g_concat_vectors
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: _, preferred-register: '' }
+  - { id: 1, class: _, preferred-register: '' }
+liveins:         
+body:             |
+  bb.0:
+    ; CHECK: Bad machine code: G_CONCAT_VECTOR num dest and source elements should match
+
+    %0(<2 x s32>) = IMPLICIT_DEF
+    %1(<2 x s32>) = IMPLICIT_DEF
+    %2:_(<2 x s32>) = G_CONCAT_VECTORS %0, %1
+...
diff --git a/test/Verifier/statepoint.ll b/test/Verifier/statepoint.ll
index c07a85b9bd368bedc269348904fc2573b1c499d1..8dba5c4d02af4e5c5526831f9f97e7761547f44f 100644
--- a/test/Verifier/statepoint.ll
+++ b/test/Verifier/statepoint.ll
@@ -4,6 +4,7 @@ declare void @use(...)
 declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32)
 declare i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token, i32, i32)
 declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidp0s_structsf(i64, i32, void (%struct*)*, i32, i32, ...)
 declare i32 @"personality_function"()
 
 ;; Basic usage
@@ -79,3 +80,20 @@ exceptional_return:
   %obj1.relocated1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %landing_pad, i32 12, i32 12)
   ret i8 addrspace(1)* %obj1.relocated1
 }
+
+; Test for statepoint with sret attribute.
+; This should be allowed as long as the wrapped function is not vararg.
+%struct = type { i64, i64, i64 }
+
+declare void @fn_sret(%struct* sret)
+
+define void @test_sret() gc "statepoint-example" {
+  %x = alloca %struct
+  %statepoint_token = call token (i64, i32, void (%struct*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp0s_structsf(i64 0, i32 0, void (%struct*)* @fn_sret, i32 1, i32 0, %struct* sret %x, i32 0, i32 0)
+  ret void
+  ; CHECK-LABEL: test_sret
+  ; CHECK: alloca
+  ; CHECK: statepoint
+  ; CHECK-SAME: sret
+  ; CHECK: ret
+}
diff --git a/test/lit.cfg.py b/test/lit.cfg.py
index ff6a5f391883090e5db6d1e5e81119d9ea403c76..3dff11bd0d2fc6aefe1dff949836d35e3daa67bf 100644
--- a/test/lit.cfg.py
+++ b/test/lit.cfg.py
@@ -146,11 +146,11 @@ tools.extend([
     'llvm-isel-fuzzer', 'llvm-opt-fuzzer', 'llvm-lib', 'llvm-link', 'llvm-lto',
     'llvm-lto2', 'llvm-mc', 'llvm-mca', 'llvm-modextract', 'llvm-nm',
     'llvm-objcopy', 'llvm-objdump', 'llvm-pdbutil', 'llvm-profdata',
-    'llvm-ranlib', 'llvm-readobj', 'llvm-rtdyld', 'llvm-size', 'llvm-split',
-    'llvm-strings', 'llvm-strip', 'llvm-tblgen', 'llvm-undname', 'llvm-c-test',
-    'llvm-cxxfilt', 'llvm-xray', 'yaml2obj', 'obj2yaml', 'yaml-bench',
-    'verify-uselistorder', 'bugpoint', 'llc', 'llvm-symbolizer', 'opt',
-    'sancov', 'sanstats'])
+    'llvm-ranlib', 'llvm-readelf', 'llvm-readobj', 'llvm-rtdyld', 'llvm-size',
+    'llvm-split', 'llvm-strings', 'llvm-strip', 'llvm-tblgen', 'llvm-undname',
+    'llvm-c-test', 'llvm-cxxfilt', 'llvm-xray', 'yaml2obj', 'obj2yaml',
+    'yaml-bench', 'verify-uselistorder', 'bugpoint', 'llc', 'llvm-symbolizer',
+    'opt', 'sancov', 'sanstats'])
 
 # The following tools are optional
 tools.extend([
diff --git a/test/tools/dsymutil/Inputs/global_downgraded_to_static.x86_64 b/test/tools/dsymutil/Inputs/global_downgraded_to_static.x86_64
new file mode 100755
index 0000000000000000000000000000000000000000..fa7a1d14f1d06ec76432fedf5442957671fd138d
Binary files /dev/null and b/test/tools/dsymutil/Inputs/global_downgraded_to_static.x86_64 differ
diff --git a/test/tools/dsymutil/Inputs/global_downgraded_to_static/1.o b/test/tools/dsymutil/Inputs/global_downgraded_to_static/1.o
new file mode 100644
index 0000000000000000000000000000000000000000..cd00c7677fd1046b1524a3fd40f6b5e4c3e15b03
Binary files /dev/null and b/test/tools/dsymutil/Inputs/global_downgraded_to_static/1.o differ
diff --git a/test/tools/dsymutil/Inputs/global_downgraded_to_static/1.r.o b/test/tools/dsymutil/Inputs/global_downgraded_to_static/1.r.o
new file mode 100644
index 0000000000000000000000000000000000000000..6629b4c1d9eb3fe2f8fb58571a279f73bbe802f7
Binary files /dev/null and b/test/tools/dsymutil/Inputs/global_downgraded_to_static/1.r.o differ
diff --git a/test/tools/dsymutil/Inputs/global_downgraded_to_static/2.o b/test/tools/dsymutil/Inputs/global_downgraded_to_static/2.o
new file mode 100644
index 0000000000000000000000000000000000000000..2bb89443292c15fab6cfb9ed46d029c9ba339c57
Binary files /dev/null and b/test/tools/dsymutil/Inputs/global_downgraded_to_static/2.o differ
diff --git a/test/tools/dsymutil/Inputs/lc_build_version.x86_64 b/test/tools/dsymutil/Inputs/lc_build_version.x86_64
new file mode 100644
index 0000000000000000000000000000000000000000..07a5e9b0edaf7a2b354922adc351166239b6e98a
Binary files /dev/null and b/test/tools/dsymutil/Inputs/lc_build_version.x86_64 differ
diff --git a/test/tools/dsymutil/X86/global_downgraded_to_static.c b/test/tools/dsymutil/X86/global_downgraded_to_static.c
new file mode 100644
index 0000000000000000000000000000000000000000..35c324703fd67e382f819b2b0da83e96d9f1ed24
--- /dev/null
+++ b/test/tools/dsymutil/X86/global_downgraded_to_static.c
@@ -0,0 +1,24 @@
+// REQUIRES : system-darwin
+// RUN: dsymutil -oso-prepend-path %p/.. -dump-debug-map %p/../Inputs/global_downgraded_to_static.x86_64 2>&1 | FileCheck %s
+//
+//  To build:
+//    clang -g -c -DFILE1 global_downgraded_to_static.c -o 1.o
+//    clang -g -c -DFILE2 global_downgraded_to_static.c -o 2.o
+//    ld -r -exported_symbol _foo 1.o -o 1.r.o
+//    clang 1.r.o 2.o -o global_downgraded_to_static.x86_64
+
+#if defined(FILE1)
+int global_to_become_static = 42;
+// CHECK: sym: _global_to_become_static,
+// CHECK-SAME: binAddr: 0x0000000100001000
+int foo() {
+  return global_to_become_static;
+}
+#elif defined(FILE2)
+int foo(void);
+int main() {
+  return foo();
+}
+#else
+#error Define FILE1 or FILE2
+#endif
diff --git a/test/tools/dsymutil/X86/lc_build_version.test b/test/tools/dsymutil/X86/lc_build_version.test
new file mode 100644
index 0000000000000000000000000000000000000000..d0a8598b03628b1b1928c8421979bfe6140354e1
--- /dev/null
+++ b/test/tools/dsymutil/X86/lc_build_version.test
@@ -0,0 +1,11 @@
+# RUN: dsymutil -f %p/../Inputs/lc_build_version.x86_64 -o - \
+# RUN:   | obj2yaml | FileCheck %s
+
+CHECK: LoadCommands:
+CHECK:   - cmd:             LC_BUILD_VERSION
+CHECK-NEXT:    cmdsize:         24
+CHECK-NEXT:    platform:        1
+CHECK-NEXT:    minos:           658944
+CHECK-NEXT:    sdk:             658944
+CHECK-NEXT:    ntools:          0
+CHECK-NEXT:   - cmd:             LC_BUILD_VERSION
diff --git a/test/tools/gold/X86/split-dwarf.ll b/test/tools/gold/X86/split-dwarf.ll
index 1743654b4a48545e3b6809c9e82a03a78d293cd4..ce7ebc946a9dc1cf9d6e40068935a2d12554899e 100644
--- a/test/tools/gold/X86/split-dwarf.ll
+++ b/test/tools/gold/X86/split-dwarf.ll
@@ -10,8 +10,8 @@
 ; CHECK: DW_AT_GNU_dwo_name{{.*}}dwo_dir/1.dwo
 ; CHECK-NOT: DW_TAG_subprogram
 ; RUN: llvm-dwarfdump -debug-info %t/dwo_dir/1.dwo | FileCheck --check-prefix DWOCHECK %s
-; DWOCHECK: DW_AT_GNU_dwo_name{{.*}}dwo_dir/1.dwo
 ; DWOCHECK: DW_AT_name{{.*}}split-dwarf.c
+; DWOCHECK: DW_AT_GNU_dwo_name{{.*}}dwo_dir/1.dwo
 ; DWOCHECK: DW_TAG_subprogram
 
 ; RUN:rm -rf %t/dwo_dir
@@ -26,8 +26,8 @@
 ; LTOCHECK: DW_AT_GNU_dwo_name{{.*}}dwo_dir/0.dwo
 ; LTOCHECK-NOT: DW_TAG_subprogram
 ; RUN: llvm-dwarfdump -debug-info %t/dwo_dir/0.dwo | FileCheck --check-prefix LTODWOCHECK %s
-; LTODWOCHECK: DW_AT_GNU_dwo_name{{.*}}dwo_dir/0.dwo
 ; LTODWOCHECK: DW_AT_name{{.*}}split-dwarf.c
+; LTODWOCHECK: DW_AT_GNU_dwo_name{{.*}}dwo_dir/0.dwo
 ; LTODWOCHECK: DW_TAG_subprogram
 
 ; ModuleID = 'split-dwarf.c'
diff --git a/test/tools/llvm-cov/showLineExecutionCounts-lcov.test b/test/tools/llvm-cov/showLineExecutionCounts-lcov.test
new file mode 100644
index 0000000000000000000000000000000000000000..9cacfb19eae9c4f44e307641004e6a1fa9eedacb
--- /dev/null
+++ b/test/tools/llvm-cov/showLineExecutionCounts-lcov.test
@@ -0,0 +1,38 @@
+// FULL: SF:{{.*}}showLineExecutionCounts.cpp
+// FULL: FN:6,main
+// FULL: FNDA:161,main
+// FULL: FNF:1
+// FULL: FNH:1
+int main() {                              // FULL: DA:[[@LINE]],161
+  int x = 0;                              // FULL: DA:[[@LINE]],161
+                                          // FULL: DA:[[@LINE]],161
+  if (x) {                                // FULL: DA:[[@LINE]],161
+    x = 0;                                // FULL: DA:[[@LINE]],0
+  } else {                                // FULL: DA:[[@LINE]],161
+    x = 1;                                // FULL: DA:[[@LINE]],161
+  }                                       // FULL: DA:[[@LINE]],161
+                                          // FULL: DA:[[@LINE]],161
+  for (int i = 0; i < 100; ++i) {         // FULL: DA:[[@LINE]],16261
+    x = 1;                                // FULL: DA:[[@LINE]],16100
+  }                                       // FULL: DA:[[@LINE]],16100
+                                          // FULL: DA:[[@LINE]],161
+  x = x < 10 ? x + 1 : x - 1;             // FULL: DA:[[@LINE]],161
+  x = x > 10 ?                            // FULL: DA:[[@LINE]],161
+        x - 1:                            // FULL: DA:[[@LINE]],0
+        x + 1;                            // FULL: DA:[[@LINE]],161
+                                          // FULL: DA:[[@LINE]],161
+  return 0;                               // FULL: DA:[[@LINE]],161
+}                                         // FULL: DA:[[@LINE]],161
+// FULL: LF:20
+// FULL: LH:18
+// FULL: end_of_record
+// RUN: llvm-profdata merge %S/Inputs/lineExecutionCounts.proftext -o %t.profdata
+// RUN: llvm-cov export -format=lcov %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata %s | FileCheck -check-prefixes=FULL %s
+
+// RUN: llvm-cov export -format=lcov -summary-only %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata %s | FileCheck -check-prefixes=SUMMARYONLY %s
+// SUMMARYONLY: SF:{{.*}}showLineExecutionCounts.cpp
+// SUMMARYONLY: FNF:1
+// SUMMARYONLY: FNH:1
+// SUMMARYONLY: LF:20
+// SUMMARYONLY: LH:18
+// SUMMARYONLY: end_of_record
diff --git a/test/tools/llvm-cxxdump/trivial.test b/test/tools/llvm-cxxdump/trivial.test
index d4982b87d8d22c921a37ef3791973b0c9fed6921..8df0bc32e1e3d10bb84bd00f0fb7c5f15b1a7cd0 100644
--- a/test/tools/llvm-cxxdump/trivial.test
+++ b/test/tools/llvm-cxxdump/trivial.test
@@ -63,4 +63,4 @@ ELF-I386-NEXT: _ZTV1A[8]: _ZN1A1fEv
 MIXEDARCOFF-I386:      ??_7S@@6B@[0]: ??_R4S@@6B@
 
 RUN: not llvm-cxxdump %t.blah 2>&1 | FileCheck --check-prefix=ENOENT %s
-ENOENT: {{.*}}.blah: {{[Nn]}}o such file or directory
+ENOENT: {{.*}}.blah: error: {{[Nn]}}o such file or directory
diff --git a/test/tools/llvm-dwarfdump/X86/debug_tls_relocs.s b/test/tools/llvm-dwarfdump/X86/debug_tls_relocs.s
new file mode 100644
index 0000000000000000000000000000000000000000..d432ecf2baf978f1532a30b5b1de07772f0265d7
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/debug_tls_relocs.s
@@ -0,0 +1,68 @@
+# RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
+
+# CHECK-NOT: error
+# CHECK: DW_AT_location [DW_FORM_exprloc] (DW_OP_const8u 0x0, DW_OP_GNU_push_tls_address)
+# CHECK: DW_AT_location [DW_FORM_exprloc] (DW_OP_const4u 0x0, DW_OP_GNU_push_tls_address)
+
+.section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "X"
+
+.section .debug_abbrev,"",@progbits
+ .byte 1                # Abbreviation Code
+ .byte 17               # DW_TAG_compile_unit
+ .byte 1                # DW_CHILDREN_yes
+ .byte 37               # DW_AT_producer
+ .byte 14               # DW_FORM_strp
+ .byte 19               # DW_AT_language
+ .byte 5                # DW_FORM_data2
+ .byte 3                # DW_AT_name
+ .byte 14               # DW_FORM_strp
+ .byte 0                # EOM(1)
+ .byte 0                # EOM(2)
+
+ .byte 2                # Abbreviation Code
+ .byte 52               # DW_TAG_variable
+ .byte 0                # DW_CHILDREN_no
+ .byte 3                # DW_AT_name
+ .byte 14               # DW_FORM_strp
+ .byte 73               # DW_AT_type
+ .byte 19               # DW_FORM_ref4
+ .byte 63               # DW_AT_external
+ .byte 25               # DW_FORM_flag_present
+ .byte 2                # DW_AT_location
+ .byte 24               # DW_FORM_exprloc
+ .byte 0                # EOM(1)
+ .byte 0                # EOM(2)
+
+ .byte 0                # EOM(3)
+
+.section .debug_info,"",@progbits
+ .long 49               # Length of Unit
+ .short 4               # DWARF version number
+ .long .debug_abbrev    # Offset Into Abbrev. Section
+ .byte 8                # Address Size (in bytes)
+ .byte 1                # Abbrev [1] 0xb:0x6c DW_TAG_compile_unit
+ .long .Linfo_string0   # DW_AT_producer
+ .short 4               # DW_AT_language
+ .long .Linfo_string0   # DW_AT_name
+                        
+ .byte 2                # Abbrev [2] 0x2a:0x16 DW_TAG_variable
+ .long .Linfo_string0   # DW_AT_name
+ .long 0                # DW_AT_type
+ .byte 10               # DW_AT_location
+ .byte 14
+ .quad tdata1@DTPOFF
+ .byte 224
+
+ .byte 2                # Abbrev [2] 0x47:0x16 DW_TAG_variable
+ .long .Linfo_string0   # DW_AT_name
+ .long 0                # DW_AT_type
+ .byte 6                # DW_AT_location
+ .byte 12
+ .long tdata2@DTPOFF
+ .byte 224
+
+ .byte 0                # End Of Children Mark
+ .byte 0                # End Of Children Mark
diff --git a/test/tools/llvm-dwarfdump/X86/eh-frame-return-address-reg.s b/test/tools/llvm-dwarfdump/X86/eh-frame-return-address-reg.s
new file mode 100644
index 0000000000000000000000000000000000000000..033766534b6cf52d58d771ee455573427956afde
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/eh-frame-return-address-reg.s
@@ -0,0 +1,51 @@
+# RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
+
+# The format of the .eh_frame section is similar in
+# format and purpose to the .debug_frame section.
+# Version 1 is often used for .eh_frame,
+# and also it was used for DWARF v2. For that case,
+# return address register should be encoded as ubyte,
+# while later versions use ULEB128. This test case
+# checks that we are able to dump it correctly.
+
+# CHECK:      .eh_frame contents:
+# CHECK:      00000000 00000010 ffffffff CIE
+# CHECK-NEXT:   Version:               1
+# CHECK-NEXT:   Augmentation:          "zR"
+# CHECK-NEXT:   Code alignment factor: 1
+# CHECK-NEXT:   Data alignment factor: 1
+# CHECK-NEXT:   Return address column: 240
+# CHECK-NEXT:   Augmentation data:     1A
+
+.text
+.global _start
+_start:
+ nop
+
+.section .eh_frame, "a"
+  .long 16   # Size
+  .long 0x00 # ID
+  .byte 0x01 # Version
+
+  .byte 0x7A # Augmentation string: "zR"
+  .byte 0x52
+  .byte 0x00
+
+  .byte 0x01 # Code alignment factor, ULEB128
+  .byte 0x01 # Data alignment factor, ULEB128
+  
+  .byte 0xF0 # Return address register, ubyte for version 1.
+
+  .byte 0x01 # LEB128
+  .byte 0x1A # DW_EH_PE_pcrel | DW_EH_PE_sdata2
+
+  .byte 0x00
+  .byte 0x00
+  .byte 0x00
+
+  .long 10   # Size
+  .long 24   # ID
+fde:
+  .long _start - fde
+  .word 0
diff --git a/test/tools/llvm-dwarfdump/X86/no_debug_addr.s b/test/tools/llvm-dwarfdump/X86/no_debug_addr.s
new file mode 100644
index 0000000000000000000000000000000000000000..ce1ae23cf8dc319bc4291dc0840544801d9209a0
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/no_debug_addr.s
@@ -0,0 +1,51 @@
+# RUN: llvm-mc %s -filetype obj -triple=x86_64-pc-linux -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck %s -match-full-lines
+
+## Ensure bogus empty section names are not printed when dumping
+## rnglists that reference debug_addr when it is not present (such as in .dwo files)
+
+# CHECK:       DW_AT_ranges [DW_FORM_rnglistx]   (indexed (0x0) rangelist = 0x00000004
+# CHECK-NEXT:    [0x0000000000000000, 0x0000000000000001)
+# CHECK-NEXT:    [0x0000000000000000, 0x0000000000000002))
+
+.section .debug_info.dwo,"e",@progbits
+.long .Ldebug_info_dwo_end1-.Ldebug_info_dwo_start1   # Length of Unit
+.Ldebug_info_dwo_start1:
+  .short 5                       # DWARF version number
+  .byte  5                       # DWARF Unit Type
+  .byte  8                       # Address Size (in bytes)
+  .long  0                       # Offset Into Abbrev. Section
+  .quad  -6809755978868859807
+  .byte  1                       # Abbrev [1] 0x14:0x32 DW_TAG_compile_unit
+  .byte  0                       # DW_AT_ranges
+.Ldebug_info_dwo_end1:
+
+.section .debug_abbrev.dwo,"e",@progbits
+  .byte 1                        # Abbreviation Code
+  .byte 17                       # DW_TAG_compile_unit
+  .byte 0                        # DW_CHILDREN_no
+  .byte 85                       # DW_AT_ranges
+  .byte 35                       # DW_FORM_rnglistx
+  .byte 0                        # EOM(1)
+  .byte 0                        # EOM(2)
+
+.section .debug_rnglists.dwo,"e",@progbits
+  .long  .Ldebug_rnglist_table_end1-.Ldebug_rnglist_table_start1 # Length
+.Ldebug_rnglist_table_start1:
+  .short  5                      # Version
+  .byte  8                       # Address size
+  .byte  0                       # Segment selector size
+  .long  1                       # Offset entry count
+.Lrnglists_dwo_table_base0:
+  .long  .Ldebug_ranges0-.Lrnglists_dwo_table_base0
+.Ldebug_ranges0:
+  .byte  1                       # DW_RLE_base_addressx
+  .byte  0                       #   base address index
+  .byte  4                       # DW_RLE_offset_pair
+  .byte  0                       #   starting offset
+  .byte  1                       #   ending offset
+  .byte  3                       # DW_RLE_startx_length
+  .byte  1                       #   start index
+  .byte  2                       #   length
+  .byte  0                       # DW_RLE_end_of_list
+.Ldebug_rnglist_table_end1:
diff --git a/test/tools/llvm-dwarfdump/X86/prettyprint_types.s b/test/tools/llvm-dwarfdump/X86/prettyprint_types.s
new file mode 100644
index 0000000000000000000000000000000000000000..b3e871e7c82e270d77b6aa2430e51049f34a4a16
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/prettyprint_types.s
@@ -0,0 +1,238 @@
+# RUN: llvm-mc < %s -filetype obj -triple i386-pc-linux -o - \
+# RUN:   | llvm-dwarfdump - | FileCheck %s
+
+# CHECK: DW_TAG_variable
+
+# base_type
+# CHECK:   DW_AT_type{{.*}}"int"
+
+# pointer_type
+# CHECK:   DW_AT_type{{.*}}"int*"
+
+# reference_type
+# CHECK:   DW_AT_type{{.*}}"int&"
+
+# rvalue_reference_type
+# CHECK:   DW_AT_type{{.*}}"int&&"
+
+# ptr_to_member_type
+# CHECK:   DW_AT_type{{.*}}"int foo::*"
+
+# array_type
+# Testing lower_bound, upper_bound, lower and upper, lower and count, and count separately.
+# CHECK:   DW_AT_type{{.*}}"int[1-][2][1-2][1-3][2]"
+
+# subroutine types
+# CHECK:   DW_AT_type{{.*}}"int()"
+# CHECK:   DW_AT_type{{.*}}"void(int)"
+# CHECK:   DW_AT_type{{.*}}"void(int, int)"
+	.section	.debug_str,"MS",@progbits,1
+.Lint_name:
+	.asciz	"int"
+.Lfoo_name:
+	.asciz	"foo"
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                       # Abbreviation Code
+	.byte	17                      # DW_TAG_compile_unit
+	.byte	1                       # DW_CHILDREN_yes
+	.byte	19                      # DW_AT_language
+	.byte	5                       # DW_FORM_data2
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	2                       # Abbreviation Code
+	.byte	36                      # DW_TAG_base_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	3                       # DW_AT_name
+	.byte	14                      # DW_FORM_strp
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	3                       # Abbreviation Code
+	.byte	52                      # DW_TAG_variable
+	.byte	0                       # DW_CHILDREN_no
+	.byte	73                      # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	4                       # Abbreviation Code
+	.byte	0xf                     # DW_TAG_pointer_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	73                      # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	5                       # Abbreviation Code
+	.byte	0x10                    # DW_TAG_reference_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	73                      # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	6                       # Abbreviation Code
+	.byte	0x42                    # DW_TAG_rvalue_reference_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	73                      # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	7                       # Abbreviation Code
+	.byte	0x1f                    # DW_TAG_ptr_to_member_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	73                      # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0x1d                    # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	8                       # Abbreviation Code
+	.byte	1                       # DW_TAG_array_type
+	.byte	1                       # DW_CHILDREN_yes
+	.byte	73                      # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	9                       # Abbreviation Code
+	.byte	0x21                    # DW_TAG_subrange_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	0x22                    # DW_AT_lower_bound
+	.byte	0xb                     # DW_FORM_data1
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	10                      # Abbreviation Code
+	.byte	0x21                    # DW_TAG_subrange_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	0x2f                    # DW_AT_upper_bound
+	.byte	0xb                     # DW_FORM_data1
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	11                      # Abbreviation Code
+	.byte	0x21                    # DW_TAG_subrange_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	0x22                    # DW_AT_lower_bound
+	.byte	0xb                     # DW_FORM_data1
+	.byte	0x2f                    # DW_AT_upper_bound
+	.byte	0xb                     # DW_FORM_data1
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	12                      # Abbreviation Code
+	.byte	0x21                    # DW_TAG_subrange_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	0x22                    # DW_AT_lower_bound
+	.byte	0xb                     # DW_FORM_data1
+	.byte	0x37                    # DW_AT_count
+	.byte	0xb                     # DW_FORM_data1
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	13                      # Abbreviation Code
+	.byte	0x21                    # DW_TAG_subrange_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	0x37                    # DW_AT_count
+	.byte	0xb                     # DW_FORM_data1
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	14                      # Abbreviation Code
+	.byte	0x13                    # DW_TAG_structure_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	3                       # DW_AT_name
+	.byte	14                      # DW_FORM_strp
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	15                      # Abbreviation Code
+	.byte	0x15                    # DW_TAG_subroutine_type
+	.byte	0                       # DW_CHILDREN_no
+	.byte	73                      # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	16                      # Abbreviation Code
+	.byte	0x15                    # DW_TAG_subroutine_type
+	.byte	1                       # DW_CHILDREN_yes
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	17                      # Abbreviation Code
+	.byte	0x5                     # DW_TAG_formal_parameter
+	.byte	0                       # DW_CHILDREN_no
+	.byte	73                      # DW_AT_type
+	.byte	19                      # DW_FORM_ref4
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	0                       # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin:
+	.long	.Lunit_end - .Lunit_start # Length of Unit
+.Lunit_start:
+	.short	4                       # DWARF version number
+	.long	.debug_abbrev           # Offset Into Abbrev. Section
+	.byte	8                       # Address Size (in bytes)
+	.byte	1                       # DW_TAG_compile_unit
+	.short	12                      #   DW_AT_language
+.Lint_type:
+	.byte	2                       # DW_TAG_base_type
+	.long	.Lint_name              #   DW_AT_name
+.Lpointer_type:
+	.byte	4                       # DW_TAG_pointer_type
+	.long	.Lint_type - .Lcu_begin #   DW_AT_type
+.Lreference_type:
+	.byte	5                       # DW_TAG_reference_type
+	.long	.Lint_type - .Lcu_begin #   DW_AT_type
+.Lrvalue_reference_type:
+	.byte	6                       # DW_TAG_rvalue_reference_type
+	.long	.Lint_type - .Lcu_begin #   DW_AT_type
+.Lstruct_type:
+	.byte	14			# DW_TAG_structure_type
+	.long	.Lfoo_name              #   DW_AT_name
+.Lptr_to_member_type:
+	.byte	7                       # DW_TAG_ptr_to_member_type
+	.long	.Lint_type - .Lcu_begin #   DW_AT_type
+	.long   .Lstruct_type - .Lcu_begin #   DW_AT_containing_type
+.Larray_type:
+	.byte	8                       # DW_TAG_array_type
+	.long	.Lint_type - .Lcu_begin #   DW_AT_type
+	.byte	9                       #   DW_AT_subrange_type
+	.byte   1                       #     DW_AT_lower_bound
+	.byte	10                      #   DW_AT_subrange_type
+	.byte   2                       #     DW_AT_upper_bound
+	.byte	11                      #   DW_AT_subrange_type
+	.byte   1                       #     DW_AT_lower_bound
+	.byte   2                       #     DW_AT_upper_bound
+	.byte	12                      #   DW_AT_subrange_type
+	.byte   1                       #     DW_AT_lower_bound
+	.byte   2                       #     DW_AT_count
+	.byte	13                      #   DW_AT_subrange_type
+	.byte   2                       #     DW_AT_count
+	.byte	0                       # End Of Children Mark
+.Lsub_int_empty_type:
+	.byte	15                      # DW_TAG_subroutine_type
+	.long	.Lint_type - .Lcu_begin #   DW_AT_type
+.Lsub_void_int_type:
+	.byte	16                      # DW_TAG_subroutine_type
+	.byte	17                       #   DW_TAG_formal_parameter
+	.long	.Lint_type - .Lcu_begin #     DW_AT_type
+	.byte	0                       # End Of Children Mark
+.Lsub_void_int_int_type:
+	.byte	16                      # DW_TAG_subroutine_type
+	.byte	17                       #   DW_TAG_formal_parameter
+	.long	.Lint_type - .Lcu_begin #     DW_AT_type
+	.byte	17                       #   DW_TAG_formal_parameter
+	.long	.Lint_type - .Lcu_begin #     DW_AT_type
+	.byte	0                       # End Of Children Mark
+
+	.byte	3                       # DW_TAG_variable
+	.long	.Lint_type - .Lcu_begin #   DW_AT_type
+	.byte	3                       # DW_TAG_variable
+	.long	.Lpointer_type - .Lcu_begin #   DW_AT_type
+	.byte	3                       # DW_TAG_variable
+	.long	.Lreference_type - .Lcu_begin #   DW_AT_type
+	.byte	3                       # DW_TAG_variable
+	.long	.Lrvalue_reference_type - .Lcu_begin #   DW_AT_type
+	.byte	3                       # DW_TAG_variable
+	.long	.Lptr_to_member_type - .Lcu_begin #   DW_AT_type
+	.byte	3                       # DW_TAG_variable
+	.long	.Larray_type - .Lcu_begin #   DW_AT_type
+	.byte	3                       # DW_TAG_variable
+	.long	.Lsub_int_empty_type - .Lcu_begin #   DW_AT_type
+	.byte	3                       # DW_TAG_variable
+	.long	.Lsub_void_int_type - .Lcu_begin #   DW_AT_type
+	.byte	3                       # DW_TAG_variable
+	.long	.Lsub_void_int_int_type - .Lcu_begin #   DW_AT_type
+	.byte	0                       # End Of Children Mark
+.Lunit_end:
diff --git a/test/tools/llvm-dwarfdump/X86/statistics.ll b/test/tools/llvm-dwarfdump/X86/statistics.ll
index ab3d49c378ef14442c54de49972422db086153bb..0ddae7e39acb4a8fc6e8d5ab8307998cb945b690 100644
--- a/test/tools/llvm-dwarfdump/X86/statistics.ll
+++ b/test/tools/llvm-dwarfdump/X86/statistics.ll
@@ -27,6 +27,9 @@
 ; CHECK-NOT: "scope bytes covered":0
 ; CHECK-NOT "scope bytes covered":[[BYTES]]
 ; CHECK: "scope bytes covered":
+; CHECK: "total function size":[[FUNCSIZE:[0-9]+]]
+; CHECK: "total inlined function size":[[INLINESIZE:[0-9]+]]
+
 
 ; ModuleID = '/tmp/quality.cpp'
 source_filename = "/tmp/quality.cpp"
diff --git a/test/tools/llvm-exegesis/X86/lit.local.cfg b/test/tools/llvm-exegesis/X86/lit.local.cfg
index 5d8dfc48342d288867a30dafb430e07ed437d80c..506ba07ef5da75549321063b227d2159545cfb3f 100644
--- a/test/tools/llvm-exegesis/X86/lit.local.cfg
+++ b/test/tools/llvm-exegesis/X86/lit.local.cfg
@@ -20,10 +20,14 @@ else:
     else:
       try:
           with open(os.devnull, 'w') as quiet:
-              check_llvm_exegesis_result = subprocess.call(
+              check_llvm_exegesis_uops_result = subprocess.call(
                 [llvm_exegesis_exe, '-mode', 'uops', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet)
+              check_llvm_exegesis_latency_result = subprocess.call(
+                [llvm_exegesis_exe, '-mode', 'latency', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet)
       except OSError:
           print('could not exec llvm-exegesis')
           config.unsupported = True
-      if not check_llvm_exegesis_result == 0:
+      if not check_llvm_exegesis_uops_result == 0:
+        config.unsupported = True
+      if not check_llvm_exegesis_latency_result == 0:
         config.unsupported = True
diff --git a/test/tools/llvm-mca/AArch64/CortexA57/shifted-register.s b/test/tools/llvm-mca/AArch64/CortexA57/shifted-register.s
new file mode 100644
index 0000000000000000000000000000000000000000..82c046c5eba17e8b43bdcf312c905a87255ba24c
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/CortexA57/shifted-register.s
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -march=aarch64 -mcpu=cortex-a57 -resource-pressure=false < %s | FileCheck %s
+
+  add	w0, w1, w2, lsl #0
+  sub	x3, x4, x5, lsl #1
+  adds	x6, x7, x8, lsr #2
+  subs	x9, x10, x11, asr #3
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      304
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.32
+# CHECK-NEXT: IPC:               1.32
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        add	w0, w1, w2
+# CHECK-NEXT:  1      2     1.00                        sub	x3, x4, x5, lsl #1
+# CHECK-NEXT:  1      2     1.00                        adds	x6, x7, x8, lsr #2
+# CHECK-NEXT:  1      2     1.00                        subs	x9, x10, x11, asr #3
diff --git a/test/tools/llvm-mca/AArch64/Cyclone/register-offset.s b/test/tools/llvm-mca/AArch64/Cyclone/register-offset.s
new file mode 100644
index 0000000000000000000000000000000000000000..37ad0b2bba1356cd83351b37351d73caa1649c6b
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Cyclone/register-offset.s
@@ -0,0 +1,29 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -march=aarch64 -mcpu=cyclone -resource-pressure=false < %s | FileCheck %s
+
+  ldr	x7, [x1, #8]
+  ldr	x6, [x1, x2]
+  ldr	x4, [x1, x2, sxtx]
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      300
+# CHECK-NEXT: Total Cycles:      157
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    3.18
+# CHECK-NEXT: IPC:               1.91
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     0.50    *                   ldr	x7, [x1, #8]
+# CHECK-NEXT:  2      5     0.50    *                   ldr	x6, [x1, x2]
+# CHECK-NEXT:  2      5     0.50    *                   ldr	x4, [x1, x2, sxtx]
diff --git a/test/tools/llvm-mca/AArch64/Exynos/direct-branch.s b/test/tools/llvm-mca/AArch64/Exynos/direct-branch.s
index f24a20b56325ca8715ca3eb47a7fd8727ac52e0c..7c3cc8fa48c1d796142699be372a77afccfdfe0d 100644
--- a/test/tools/llvm-mca/AArch64/Exynos/direct-branch.s
+++ b/test/tools/llvm-mca/AArch64/Exynos/direct-branch.s
@@ -1,25 +1,25 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -iterations=300 -timeline -timeline-max-iterations=3 -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=M3
-# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m1 -iterations=300 -timeline -timeline-max-iterations=3 -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=M1
+# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M1
+# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
 
-  b   t
+  b	main
 
-# ALL:      Iterations:        300
-# ALL-NEXT: Instructions:      300
+# ALL:      Iterations:        100
+# ALL-NEXT: Instructions:      100
 
-# M1-NEXT:  Total Cycles:      76
-# M3-NEXT:  Total Cycles:      51
+# M1-NEXT:  Total Cycles:      26
+# M3-NEXT:  Total Cycles:      18
 
-# ALL-NEXT: Total uOps:        300
+# ALL-NEXT: Total uOps:        100
 
 # M1:       Dispatch Width:    4
-# M1-NEXT:  uOps Per Cycle:    3.95
-# M1-NEXT:  IPC:               3.95
+# M1-NEXT:  uOps Per Cycle:    3.85
+# M1-NEXT:  IPC:               3.85
 # M1-NEXT:  Block RThroughput: 0.3
 
 # M3:       Dispatch Width:    6
-# M3-NEXT:  uOps Per Cycle:    5.88
-# M3-NEXT:  IPC:               5.88
+# M3-NEXT:  uOps Per Cycle:    5.56
+# M3-NEXT:  IPC:               5.56
 # M3-NEXT:  Block RThroughput: 0.2
 
 # ALL:      Instruction Info:
@@ -32,21 +32,5 @@
 
 # ALL:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 
-# M1-NEXT:   1      0     0.25                        b	t
-# M3-NEXT:   1      0     0.17                        b	t
-
-# ALL:      Timeline view:
-# ALL-NEXT: Index     01
-
-# ALL:      [0,0]     DR   b	t
-# ALL-NEXT: [1,0]     DR   b	t
-# ALL-NEXT: [2,0]     DR   b	t
-
-# ALL:      Average Wait times (based on the timeline view):
-# ALL-NEXT: [0]: Executions
-# ALL-NEXT: [1]: Average time spent waiting in a scheduler's queue
-# ALL-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
-# ALL-NEXT: [3]: Average time elapsed from WB until retire stage
-
-# ALL:            [0]    [1]    [2]    [3]
-# ALL-NEXT: 0.     3     0.0    0.0    0.0       b	t
+# M1-NEXT:   1      0     0.25                        b	main
+# M3-NEXT:   1      0     0.17                        b	main
diff --git a/test/tools/llvm-mca/AArch64/Exynos/extended-register.s b/test/tools/llvm-mca/AArch64/Exynos/extended-register.s
new file mode 100644
index 0000000000000000000000000000000000000000..f136bd35d8603e1f7e2d64a5d52c21b50ae630c6
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Exynos/extended-register.s
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m1 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM1
+# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3
+
+  sub	w0, w1, w2, sxtb #0
+  add	x3, x4, w5, sxth #1
+  subs	x6, x7, w8, uxtw #2
+  adds	x9, x10, x11, uxtx #3
+  sub	w12, w13, w14, uxtb #0
+  add	x15, x16, w17, uxth #1
+  subs	x18, x19, w20, sxtw #2
+  adds	x21, x22, x23, sxtx #3
+
+# ALL:      Iterations:        100
+# ALL-NEXT: Instructions:      800
+
+# EM1-NEXT: Total Cycles:      403
+# EM3-NEXT: Total Cycles:      304
+
+# ALL-NEXT: Total uOps:        800
+
+# EM1:      Dispatch Width:    4
+# EM1-NEXT: uOps Per Cycle:    1.99
+# EM1-NEXT: IPC:               1.99
+# EM1-NEXT: Block RThroughput: 4.0
+
+# EM3:      Dispatch Width:    6
+# EM3-NEXT: uOps Per Cycle:    2.63
+# EM3-NEXT: IPC:               2.63
+# EM3-NEXT: Block RThroughput: 3.0
+
+# ALL:      Instruction Info:
+# ALL-NEXT: [1]: #uOps
+# ALL-NEXT: [2]: Latency
+# ALL-NEXT: [3]: RThroughput
+# ALL-NEXT: [4]: MayLoad
+# ALL-NEXT: [5]: MayStore
+# ALL-NEXT: [6]: HasSideEffects (U)
+
+# ALL:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+
+# EM1-NEXT:  1      1     0.33                        sub	w0, w1, w2, sxtb
+# EM1-NEXT:  1      2     0.67                        add	x3, x4, w5, sxth #1
+# EM1-NEXT:  1      1     0.33                        subs	x6, x7, w8, uxtw #2
+# EM1-NEXT:  1      1     0.33                        adds	x9, x10, x11, uxtx #3
+# EM1-NEXT:  1      1     0.33                        sub	w12, w13, w14, uxtb
+# EM1-NEXT:  1      2     0.67                        add	x15, x16, w17, uxth #1
+# EM1-NEXT:  1      2     0.67                        subs	x18, x19, w20, sxtw #2
+# EM1-NEXT:  1      2     0.67                        adds	x21, x22, x23, sxtx #3
+
+# EM3-NEXT:  1      1     0.25                        sub	w0, w1, w2, sxtb
+# EM3-NEXT:  1      2     0.50                        add	x3, x4, w5, sxth #1
+# EM3-NEXT:  1      1     0.25                        subs	x6, x7, w8, uxtw #2
+# EM3-NEXT:  1      1     0.25                        adds	x9, x10, x11, uxtx #3
+# EM3-NEXT:  1      1     0.25                        sub	w12, w13, w14, uxtb
+# EM3-NEXT:  1      2     0.50                        add	x15, x16, w17, uxth #1
+# EM3-NEXT:  1      2     0.50                        subs	x18, x19, w20, sxtw #2
+# EM3-NEXT:  1      2     0.50                        adds	x21, x22, x23, sxtx #3
diff --git a/test/tools/llvm-mca/AArch64/Exynos/register-offset.s b/test/tools/llvm-mca/AArch64/Exynos/register-offset.s
new file mode 100644
index 0000000000000000000000000000000000000000..4fc4cf48fdcc1baa2018d241e72686299548976c
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Exynos/register-offset.s
@@ -0,0 +1,81 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m1 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM1
+# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3
+
+  ldrb	w0, [x1, x2, lsl #0]
+  ldrh	w3, [x4, x5, sxtx #1]
+  ldr	w6, [x7, w8, uxtw #2]
+  ldr	x9, [x10, w11, sxtw #3]
+  ldr	b12, [x13, w14, sxtw #0]
+  ldr	h15, [x16, w17, uxtw #1]
+  ldr	s18, [x19, x20, sxtx #2]
+  ldr	d21, [x22, x23, lsl #3]
+  ldr	q24, [x25, x26, lsl #4]
+
+  strb	w0, [x1, x2, lsl #0]
+  strh	w3, [x4, x5, sxtx #1]
+  str	w6, [x7, w8, uxtw #2]
+  str	x9, [x10, w11, sxtw #3]
+  str	b12, [x13, w14, sxtw #0]
+  str	h15, [x16, w17, uxtw #1]
+  str	s18, [x19, x20, sxtx #2]
+  str	d21, [x22, x23, lsl #3]
+  str	q24, [x25, x26, lsl #4]
+
+# ALL:      Iterations:        100
+# ALL-NEXT: Instructions:      1800
+
+# EM1-NEXT: Total Cycles:      1719
+# EM3-NEXT: Total Cycles:      1713
+
+# ALL-NEXT: Total uOps:        2800
+
+# EM1:      Dispatch Width:    4
+# EM1-NEXT: uOps Per Cycle:    1.63
+# EM1-NEXT: IPC:               1.05
+# EM1-NEXT: Block RThroughput: 12.0
+
+# EM3:      Dispatch Width:    6
+# EM3-NEXT: uOps Per Cycle:    1.63
+# EM3-NEXT: IPC:               1.05
+# EM3-NEXT: Block RThroughput: 9.0
+
+# ALL:      Instruction Info:
+# ALL-NEXT: [1]: #uOps
+# ALL-NEXT: [2]: Latency
+# ALL-NEXT: [3]: RThroughput
+# ALL-NEXT: [4]: MayLoad
+# ALL-NEXT: [5]: MayStore
+# ALL-NEXT: [6]: HasSideEffects (U)
+
+# ALL:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+
+# EM1-NEXT:  1      5     1.00    *                   ldrb	w0, [x1, x2, lsl #0]
+# EM1-NEXT:  1      5     1.00    *                   ldrh	w3, [x4, x5, sxtx #1]
+# EM1-NEXT:  2      5     1.00    *                   ldr	w6, [x7, w8, uxtw #2]
+# EM1-NEXT:  2      5     1.00    *                   ldr	x9, [x10, w11, sxtw #3]
+# EM1-NEXT:  2      6     2.00    *                   ldr	b12, [x13, w14, sxtw #0]
+# EM1-NEXT:  2      6     2.00    *                   ldr	h15, [x16, w17, uxtw #1]
+# EM1-NEXT:  1      5     1.00    *                   ldr	s18, [x19, x20, sxtx #2]
+# EM1-NEXT:  1      5     1.00    *                   ldr	d21, [x22, x23, lsl #3]
+# EM1-NEXT:  2      6     2.00    *                   ldr	q24, [x25, x26, lsl #4]
+
+# EM3-NEXT:  1      5     0.50    *                   ldrb	w0, [x1, x2, lsl #0]
+# EM3-NEXT:  1      5     0.50    *                   ldrh	w3, [x4, x5, sxtx #1]
+# EM3-NEXT:  2      5     0.50    *                   ldr	w6, [x7, w8, uxtw #2]
+# EM3-NEXT:  2      5     0.50    *                   ldr	x9, [x10, w11, sxtw #3]
+# EM3-NEXT:  2      6     0.50    *                   ldr	b12, [x13, w14, sxtw #0]
+# EM3-NEXT:  2      6     0.50    *                   ldr	h15, [x16, w17, uxtw #1]
+# EM3-NEXT:  1      5     0.50    *                   ldr	s18, [x19, x20, sxtx #2]
+# EM3-NEXT:  1      5     0.50    *                   ldr	d21, [x22, x23, lsl #3]
+# EM3-NEXT:  2      6     0.50    *                   ldr	q24, [x25, x26, lsl #4]
+
+# ALL-NEXT:  1      1     1.00           *            strb	w0, [x1, x2, lsl #0]
+# ALL-NEXT:  1      1     1.00           *            strh	w3, [x4, x5, sxtx #1]
+# ALL-NEXT:  2      2     1.00           *            str	w6, [x7, w8, uxtw #2]
+# ALL-NEXT:  2      2     1.00           *            str	x9, [x10, w11, sxtw #3]
+# ALL-NEXT:  2      3     1.00           *            str	b12, [x13, w14, sxtw #0]
+# ALL-NEXT:  2      3     1.00           *            str	h15, [x16, w17, uxtw #1]
+# ALL-NEXT:  1      1     1.00           *            str	s18, [x19, x20, sxtx #2]
+# ALL-NEXT:  1      1     1.00           *            str	d21, [x22, x23, lsl #3]
+# ALL-NEXT:  2      3     1.00           *            str	q24, [x25, x26, lsl #4]
diff --git a/test/tools/llvm-mca/AArch64/Exynos/scheduler-queue-usage.s b/test/tools/llvm-mca/AArch64/Exynos/scheduler-queue-usage.s
index e0024612decc4a6d2a9cde22aff03ae4e431f4aa..bccae7d41b15cfe96d60d4f6cae8a7c21c19edc0 100644
--- a/test/tools/llvm-mca/AArch64/Exynos/scheduler-queue-usage.s
+++ b/test/tools/llvm-mca/AArch64/Exynos/scheduler-queue-usage.s
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -iterations=1 -scheduler-stats -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefixes=ALL,M3
 # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m1 -iterations=1 -scheduler-stats -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefixes=ALL,M1
+# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -iterations=1 -scheduler-stats -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefixes=ALL,M3
 
-  b   t
+  b	main
 
 # ALL:      Iterations:        1
 # ALL-NEXT: Instructions:      1
diff --git a/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s b/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s
new file mode 100644
index 0000000000000000000000000000000000000000..a7ff007129fb9555d3bce3c96a692b0ebda75925
--- /dev/null
+++ b/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m1 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM1
+# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3
+
+  add	w0, w1, w2, lsl #0
+  sub	x3, x4, x5, lsr #1
+  adds	x6, x7, x8, lsl #2
+  subs	w9, w10, w11, asr #3
+  add	w12, w13, w14, lsl #4
+  sub	x15, x16, x17, lsr #6
+  adds	x18, x19, x20, lsl #8
+  subs	w21, w22, w23, asr #10
+
+# ALL:      Iterations:        100
+# ALL-NEXT: Instructions:      800
+
+# EM1-NEXT: Total Cycles:      470
+# EM3-NEXT: Total Cycles:      354
+
+# ALL-NEXT: Total uOps:        800
+
+# EM1:      Dispatch Width:    4
+# EM1-NEXT: uOps Per Cycle:    1.70
+# EM1-NEXT: IPC:               1.70
+# EM1-NEXT: Block RThroughput: 4.7
+
+# EM3:      Dispatch Width:    6
+# EM3-NEXT: uOps Per Cycle:    2.26
+# EM3-NEXT: IPC:               2.26
+# EM3-NEXT: Block RThroughput: 3.5
+
+# ALL:      Instruction Info:
+# ALL-NEXT: [1]: #uOps
+# ALL-NEXT: [2]: Latency
+# ALL-NEXT: [3]: RThroughput
+# ALL-NEXT: [4]: MayLoad
+# ALL-NEXT: [5]: MayStore
+# ALL-NEXT: [6]: HasSideEffects (U)
+
+# ALL:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+
+# EM1-NEXT:  1      1     0.33                        add	w0, w1, w2
+# EM1-NEXT:  1      2     0.67                        sub	x3, x4, x5, lsr #1
+# EM1-NEXT:  1      1     0.33                        adds	x6, x7, x8, lsl #2
+# EM1-NEXT:  1      2     0.67                        subs	w9, w10, w11, asr #3
+# EM1-NEXT:  1      2     0.67                        add	w12, w13, w14, lsl #4
+# EM1-NEXT:  1      2     0.67                        sub	x15, x16, x17, lsr #6
+# EM1-NEXT:  1      2     0.67                        adds	x18, x19, x20, lsl #8
+# EM1-NEXT:  1      2     0.67                        subs	w21, w22, w23, asr #10
+
+# EM3-NEXT:  1      1     0.25                        add	w0, w1, w2
+# EM3-NEXT:  1      2     0.50                        sub	x3, x4, x5, lsr #1
+# EM3-NEXT:  1      1     0.25                        adds	x6, x7, x8, lsl #2
+# EM3-NEXT:  1      2     0.50                        subs	w9, w10, w11, asr #3
+# EM3-NEXT:  1      2     0.50                        add	w12, w13, w14, lsl #4
+# EM3-NEXT:  1      2     0.50                        sub	x15, x16, x17, lsr #6
+# EM3-NEXT:  1      2     0.50                        adds	x18, x19, x20, lsl #8
+# EM3-NEXT:  1      2     0.50                        subs	w21, w22, w23, asr #10
diff --git a/test/tools/llvm-mca/ARM/memcpy-ldm-stm.s b/test/tools/llvm-mca/ARM/memcpy-ldm-stm.s
new file mode 100644
index 0000000000000000000000000000000000000000..634a6aa966ab921eb6fcf7b7411d46ad452835fc
--- /dev/null
+++ b/test/tools/llvm-mca/ARM/memcpy-ldm-stm.s
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=armv7-unknown-unknown -mcpu=swift -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+  ldm	r2!, {r3, r4, r5, r6, r12, lr}
+  stm	r0!, {r3, r4, r5, r6, r12, lr}
+
+# CHECK:      Iterations:        300
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      1295
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.85
+# CHECK-NEXT: IPC:               0.46
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  3      18    2.00    *                   ldm	r2!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT:  5      1     2.00           *            stm	r0!, {r3, r4, r5, r6, r12, lr}
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SwiftUnitDiv
+# CHECK-NEXT: [1]   - SwiftUnitP0
+# CHECK-NEXT: [2]   - SwiftUnitP1
+# CHECK-NEXT: [3]   - SwiftUnitP2
+# CHECK-NEXT: [4.0] - SwiftUnitP01
+# CHECK-NEXT: [4.1] - SwiftUnitP01
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4.0]  [4.1]
+# CHECK-NEXT:  -      -      -     4.00   2.46   2.54
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4.0]  [4.1]  Instructions:
+# CHECK-NEXT:  -      -      -     2.00   1.09   0.91   ldm	r2!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT:  -      -      -     2.00   1.37   1.63   stm	r0!, {r3, r4, r5, r6, r12, lr}
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeER    .  .   ldm	r2!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT: [0,1]     .D=================eER   .  .   stm	r0!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT: [1,0]     .  DeeeeeeeeeeeeeeeeeeER .  .   ldm	r2!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT: [1,1]     .   D=================eER.  .   stm	r0!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT: [2,0]     .    .DeeeeeeeeeeeeeeeeeeER .   ldm	r2!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT: [2,1]     .    . D==================eER   stm	r0!, {r3, r4, r5, r6, r12, lr}
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.0    1.0    0.0       ldm	r2!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT: 1.     3     18.3   0.3    0.0       stm	r0!, {r3, r4, r5, r6, r12, lr}
diff --git a/test/tools/llvm-mca/ARM/unsupported-write-variant.s b/test/tools/llvm-mca/ARM/unsupported-write-variant.s
index f4511f54ab55ba4483178c845d3099206cf8c029..6f95a01767035cc4a6b3d2dafe60cf893e78a300 100644
--- a/test/tools/llvm-mca/ARM/unsupported-write-variant.s
+++ b/test/tools/llvm-mca/ARM/unsupported-write-variant.s
@@ -1,4 +1,6 @@
 # RUN: not llvm-mca -march=arm -mcpu=swift -all-views=false 2>&1 < %s | FileCheck %s
+# D54648 results in this test to become valid.
+# XFAIL: *
 
 add r3, r1, r12, lsl #2
 
diff --git a/test/tools/llvm-mca/ARM/vld1-index-update.s b/test/tools/llvm-mca/ARM/vld1-index-update.s
new file mode 100644
index 0000000000000000000000000000000000000000..3bb02fcbcb4d11fdb72ce5d6c0783b40378e4b41
--- /dev/null
+++ b/test/tools/llvm-mca/ARM/vld1-index-update.s
@@ -0,0 +1,72 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=armv7-unknown-unknown -mcpu=swift -timeline -iterations=5 < %s | FileCheck %s
+
+# Register r1 is updated in one cycle by instruction vld1.32, so the add.w can
+# start one cycle later.
+
+add.w	r1, r1, r12
+vld1.32	{d16, d17}, [r1]!
+
+# CHECK:      Iterations:        5
+# CHECK-NEXT: Instructions:      10
+# CHECK-NEXT: Total Cycles:      16
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.94
+# CHECK-NEXT: IPC:               0.63
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        add	r1, r1, r12
+# CHECK-NEXT:  2      4     1.00    *                   vld1.32	{d16, d17}, [r1]!
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SwiftUnitDiv
+# CHECK-NEXT: [1]   - SwiftUnitP0
+# CHECK-NEXT: [2]   - SwiftUnitP1
+# CHECK-NEXT: [3]   - SwiftUnitP2
+# CHECK-NEXT: [4.0] - SwiftUnitP01
+# CHECK-NEXT: [4.1] - SwiftUnitP01
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4.0]  [4.1]
+# CHECK-NEXT:  -      -      -     1.00   1.00   1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4.0]  [4.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00   add	r1, r1, r12
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     vld1.32	{d16, d17}, [r1]!
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .    .   add	r1, r1, r12
+# CHECK-NEXT: [0,1]     D=eeeeER  .    .   vld1.32	{d16, d17}, [r1]!
+# CHECK-NEXT: [1,0]     .D=eE--R  .    .   add	r1, r1, r12
+# CHECK-NEXT: [1,1]     .D==eeeeER.    .   vld1.32	{d16, d17}, [r1]!
+# CHECK-NEXT: [2,0]     . D==eE--R.    .   add	r1, r1, r12
+# CHECK-NEXT: [2,1]     . D===eeeeER   .   vld1.32	{d16, d17}, [r1]!
+# CHECK-NEXT: [3,0]     .  D===eE--R   .   add	r1, r1, r12
+# CHECK-NEXT: [3,1]     .  D====eeeeER .   vld1.32	{d16, d17}, [r1]!
+# CHECK-NEXT: [4,0]     .   D====eE--R .   add	r1, r1, r12
+# CHECK-NEXT: [4,1]     .   D=====eeeeER   vld1.32	{d16, d17}, [r1]!
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     5     3.0    0.2    1.6       add	r1, r1, r12
+# CHECK-NEXT: 1.     5     4.0    0.0    0.0       vld1.32	{d16, d17}, [r1]!
diff --git a/test/tools/llvm-mca/SystemZ/lit.local.cfg b/test/tools/llvm-mca/SystemZ/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..5c02dd3614a492fd2e14ddb54e4756c07db05a51
--- /dev/null
+++ b/test/tools/llvm-mca/SystemZ/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'SystemZ' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/tools/llvm-mca/SystemZ/stm-lm.s b/test/tools/llvm-mca/SystemZ/stm-lm.s
new file mode 100644
index 0000000000000000000000000000000000000000..db2d79663d4aec5cfecf38db7d43c20849f89815
--- /dev/null
+++ b/test/tools/llvm-mca/SystemZ/stm-lm.s
@@ -0,0 +1,72 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=s390x-linux-gnu -mcpu=z14 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+stmg	%r6, %r15, 48(%r15)
+lmg	%r6, %r15, 48(%r15)
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  3      1     1.50           *            stmg	%r6, %r15, 48(%r15)
+# CHECK-NEXT:  3      10    2.50    *                   lmg	%r6, %r15, 48(%r15)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - Z14_FXaUnit
+# CHECK-NEXT: [0.1] - Z14_FXaUnit
+# CHECK-NEXT: [1.0] - Z14_FXbUnit
+# CHECK-NEXT: [1.1] - Z14_FXbUnit
+# CHECK-NEXT: [2.0] - Z14_LSUnit
+# CHECK-NEXT: [2.1] - Z14_LSUnit
+# CHECK-NEXT: [3]   - Z14_MCD
+# CHECK-NEXT: [4.0] - Z14_VBUnit
+# CHECK-NEXT: [4.1] - Z14_VBUnit
+# CHECK-NEXT: [5.0] - Z14_VecFPdUnit
+# CHECK-NEXT: [5.1] - Z14_VecFPdUnit
+# CHECK-NEXT: [6.0] - Z14_VecUnit
+# CHECK-NEXT: [6.1] - Z14_VecUnit
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2.0]  [2.1]  [3]    [4.0]  [4.1]  [5.0]  [5.1]  [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.50   1.50   2.06   4.94    -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2.0]  [2.1]  [3]    [4.0]  [4.1]  [5.0]  [5.1]  [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     1.50   1.50   1.96   0.04    -      -      -      -      -      -      -     stmg	%r6, %r15, 48(%r15)
+# CHECK-NEXT:  -      -      -      -     0.10   4.90    -      -      -      -      -      -      -     lmg	%r6, %r15, 48(%r15)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeER .    .    .    .    .    .  .   stmg	%r6, %r15, 48(%r15)
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeeER .    .    .    .  .   lmg	%r6, %r15, 48(%r15)
+# CHECK-NEXT: [1,0]     . D=========eER.    .    .    .  .   stmg	%r6, %r15, 48(%r15)
+# CHECK-NEXT: [1,1]     .  D========eeeeeeeeeeER .    .  .   lmg	%r6, %r15, 48(%r15)
+# CHECK-NEXT: [2,0]     .   D=================eER.    .  .   stmg	%r6, %r15, 48(%r15)
+# CHECK-NEXT: [2,1]     .    D================eeeeeeeeeeER   lmg	%r6, %r15, 48(%r15)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     9.7    0.3    0.0       stmg	%r6, %r15, 48(%r15)
+# CHECK-NEXT: 1.     3     9.0    0.3    0.0       lmg	%r6, %r15, 48(%r15)
diff --git a/test/tools/llvm-mca/X86/BdVer2/add-sequence.s b/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
index 004def6ab71be99157173b21707e4fcbdf1c857e..dc9eaed21103800d276f892b3995b27835ab98d3 100644
--- a/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
+++ b/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
@@ -48,17 +48,20 @@ add %eax, %edx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %edx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %edx
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234567
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
index 12bf3748cb1dfa4fedd628c48e8ff8c037c4cf56..53c066bd5603c03ab3774444556e2514592bce2e 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
@@ -51,16 +51,19 @@ cmovae %ebx, %eax
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%eax, %eax
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%ebx, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%ebx, %eax
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     0123456
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
index 4f869e656f3d6f228110b0374dcad6e064df79a0..dc28f6d42490476c13a2075c463b95e05c1a4c78 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
@@ -56,18 +56,21 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     2.00   2.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     2.00   2.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
index 019d3fd5067e1c7b1cb4865d2cfc6aae5e9d6114..f30c78b9e1793f272aa4281a2bb60edf566dc6a4 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
@@ -57,18 +57,21 @@ vpcmpgtq %xmm3, %xmm3, %xmm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     0123456
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
index 0503bd8552bccc2600150ccf17c2b301b2c9c980..be0bf9d7dd4bee32362ba48916b4c11fe61fff6f 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
@@ -52,16 +52,19 @@ sbb %eax, %eax
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%edx, %edx
-# CHECK-NEXT:  -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%edx, %edx
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
index ba29a29e7dcdbb11abf6705edb589c00aa780bb5..dcf1a1faff875fbb8c0273a6969d123aca75d1a8 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
@@ -54,17 +54,20 @@ sbb %eax, %eax
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edx, %eax
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %edx
-# CHECK-NEXT:  -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imull	%edx, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %edx
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s b/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
index bd5b724bbd1a1fa6273582002dd5828549942faa..41a3d2af33244930fe693700c2321f6e50388c84 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
@@ -48,17 +48,20 @@ vpaddd %xmm0, %xmm0, %xmm3
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00   1.00    -     1.50   1.50    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00   1.00    -     1.50   1.50    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -      -      -     vpaddd	%xmm0, %xmm0, %xmm3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789          0123456789
diff --git a/test/tools/llvm-mca/X86/BdVer2/dot-product.s b/test/tools/llvm-mca/X86/BdVer2/dot-product.s
index d83cda27b0aaff8aa947d66bbe0777be44fe3f95..ba9405e45673df460857d64b4ba0fe53cb6b5635 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dot-product.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dot-product.s
@@ -48,17 +48,20 @@ vhaddps  %xmm3, %xmm3, %xmm4
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.49   1.51    -      -      -      -     2.00   1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -     2.00   1.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.49   0.51    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.49   0.51    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.51   0.49    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vhaddps	%xmm3, %xmm3, %xmm4
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          012
diff --git a/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s b/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
index 90d0d392977f7c5f409505220b8b770ce01aae63..715079992727ee42a9943d6622391cbdc4f3db49 100644
--- a/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
+++ b/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
@@ -58,22 +58,25 @@ vmovaps %xmm0, 48(%rdi)
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -     8.00    -      -      -      -      -      -      -     4.00    -      -      -     4.00   3.99   4.01    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 4.00   4.00    -      -      -      -      -      -     2.00   2.00    -      -      -     4.00   3.99   4.01    -      -      -     2.00   2.00    -     4.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	(%rsi), %xmm0
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	16(%rsi), %xmm0
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	32(%rsi), %xmm0
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	48(%rsi), %xmm0
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 48(%rdi)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     0.99   0.01    -      -      -      -     1.00    -      -     vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -     vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -     vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, 48(%rdi)
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
diff --git a/test/tools/llvm-mca/X86/BdVer2/load-throughput.s b/test/tools/llvm-mca/X86/BdVer2/load-throughput.s
new file mode 100644
index 0000000000000000000000000000000000000000..9c45ce63fa41b6564b0d272677d4c474774e5b36
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/load-throughput.s
@@ -0,0 +1,841 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -scheduler-stats -dispatch-stats -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN
+movb (%rax), %spl
+movb (%rcx), %bpl
+movb (%rdx), %sil
+movb (%rbx), %dil
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movw (%rax), %sp
+movw (%rcx), %bp
+movw (%rdx), %si
+movw (%rbx), %di
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movl (%rax), %esp
+movl (%rcx), %ebp
+movl (%rdx), %esi
+movl (%rbx), %edi
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movq (%rax), %rsp
+movq (%rcx), %rbp
+movq (%rdx), %rsi
+movq (%rbx), %rdi
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movd (%rax), %mm0
+movd (%rcx), %mm1
+movd (%rdx), %mm2
+movd (%rbx), %mm3
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movaps (%rax), %xmm0
+movaps (%rcx), %xmm1
+movaps (%rdx), %xmm2
+movaps (%rbx), %xmm3
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+vmovaps (%rax), %ymm0
+vmovaps (%rcx), %ymm1
+vmovaps (%rdx), %ymm2
+vmovaps (%rbx), %ymm3
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      207
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.93
+# CHECK-NEXT: IPC:               1.93
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *                   movb	(%rax), %spl
+# CHECK-NEXT:  1      5     0.50    *                   movb	(%rcx), %bpl
+# CHECK-NEXT:  1      5     0.50    *                   movb	(%rdx), %sil
+# CHECK-NEXT:  1      5     0.50    *                   movb	(%rbx), %dil
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           171  (82.6%)
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              21  (10.1%)
+# CHECK-NEXT:  2,              172  (83.1%)
+# CHECK-NEXT:  4,              14  (6.8%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          7  (3.4%)
+# CHECK-NEXT:  2,          200  (96.6%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             27         30         40
+# CHECK-NEXT: PdFPU            0          0          64
+# CHECK-NEXT: PdLoad           36         40         40
+# CHECK-NEXT: PdStore          0          0          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     movb	(%rax), %spl
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     movb	(%rcx), %bpl
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     movb	(%rdx), %sil
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     movb	(%rbx), %dil
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeER.   movb	(%rax), %spl
+# CHECK-NEXT: [0,1]     DeeeeeER.   movb	(%rcx), %bpl
+# CHECK-NEXT: [0,2]     D=eeeeeER   movb	(%rdx), %sil
+# CHECK-NEXT: [0,3]     D=eeeeeER   movb	(%rbx), %dil
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movb	(%rax), %spl
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       movb	(%rcx), %bpl
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       movb	(%rdx), %sil
+# CHECK-NEXT: 3.     1     2.0    2.0    0.0       movb	(%rbx), %dil
+
+# CHECK:      [1] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      207
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.93
+# CHECK-NEXT: IPC:               1.93
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *                   movw	(%rax), %sp
+# CHECK-NEXT:  1      5     0.50    *                   movw	(%rcx), %bp
+# CHECK-NEXT:  1      5     0.50    *                   movw	(%rdx), %si
+# CHECK-NEXT:  1      5     0.50    *                   movw	(%rbx), %di
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           171  (82.6%)
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              21  (10.1%)
+# CHECK-NEXT:  2,              172  (83.1%)
+# CHECK-NEXT:  4,              14  (6.8%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          7  (3.4%)
+# CHECK-NEXT:  2,          200  (96.6%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             27         30         40
+# CHECK-NEXT: PdFPU            0          0          64
+# CHECK-NEXT: PdLoad           36         40         40
+# CHECK-NEXT: PdStore          0          0          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     movw	(%rax), %sp
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     movw	(%rcx), %bp
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     movw	(%rdx), %si
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     movw	(%rbx), %di
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeER.   movw	(%rax), %sp
+# CHECK-NEXT: [0,1]     DeeeeeER.   movw	(%rcx), %bp
+# CHECK-NEXT: [0,2]     D=eeeeeER   movw	(%rdx), %si
+# CHECK-NEXT: [0,3]     D=eeeeeER   movw	(%rbx), %di
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movw	(%rax), %sp
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       movw	(%rcx), %bp
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       movw	(%rdx), %si
+# CHECK-NEXT: 3.     1     2.0    2.0    0.0       movw	(%rbx), %di
+
+# CHECK:      [2] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      207
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.93
+# CHECK-NEXT: IPC:               1.93
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *                   movl	(%rax), %esp
+# CHECK-NEXT:  1      5     0.50    *                   movl	(%rcx), %ebp
+# CHECK-NEXT:  1      5     0.50    *                   movl	(%rdx), %esi
+# CHECK-NEXT:  1      5     0.50    *                   movl	(%rbx), %edi
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           171  (82.6%)
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              21  (10.1%)
+# CHECK-NEXT:  2,              172  (83.1%)
+# CHECK-NEXT:  4,              14  (6.8%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          7  (3.4%)
+# CHECK-NEXT:  2,          200  (96.6%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             27         30         40
+# CHECK-NEXT: PdFPU            0          0          64
+# CHECK-NEXT: PdLoad           36         40         40
+# CHECK-NEXT: PdStore          0          0          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     movl	(%rax), %esp
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     movl	(%rcx), %ebp
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     movl	(%rdx), %esi
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     movl	(%rbx), %edi
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeER.   movl	(%rax), %esp
+# CHECK-NEXT: [0,1]     DeeeeeER.   movl	(%rcx), %ebp
+# CHECK-NEXT: [0,2]     D=eeeeeER   movl	(%rdx), %esi
+# CHECK-NEXT: [0,3]     D=eeeeeER   movl	(%rbx), %edi
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movl	(%rax), %esp
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       movl	(%rcx), %ebp
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       movl	(%rdx), %esi
+# CHECK-NEXT: 3.     1     2.0    2.0    0.0       movl	(%rbx), %edi
+
+# CHECK:      [3] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      207
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.93
+# CHECK-NEXT: IPC:               1.93
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *                   movq	(%rax), %rsp
+# CHECK-NEXT:  1      5     0.50    *                   movq	(%rcx), %rbp
+# CHECK-NEXT:  1      5     0.50    *                   movq	(%rdx), %rsi
+# CHECK-NEXT:  1      5     0.50    *                   movq	(%rbx), %rdi
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           171  (82.6%)
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              21  (10.1%)
+# CHECK-NEXT:  2,              172  (83.1%)
+# CHECK-NEXT:  4,              14  (6.8%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          7  (3.4%)
+# CHECK-NEXT:  2,          200  (96.6%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             27         30         40
+# CHECK-NEXT: PdFPU            0          0          64
+# CHECK-NEXT: PdLoad           36         40         40
+# CHECK-NEXT: PdStore          0          0          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     movq	(%rax), %rsp
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     movq	(%rcx), %rbp
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     movq	(%rdx), %rsi
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     movq	(%rbx), %rdi
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeER.   movq	(%rax), %rsp
+# CHECK-NEXT: [0,1]     DeeeeeER.   movq	(%rcx), %rbp
+# CHECK-NEXT: [0,2]     D=eeeeeER   movq	(%rdx), %rsi
+# CHECK-NEXT: [0,3]     D=eeeeeER   movq	(%rbx), %rdi
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movq	(%rax), %rsp
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       movq	(%rcx), %rbp
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       movq	(%rdx), %rsi
+# CHECK-NEXT: 3.     1     2.0    2.0    0.0       movq	(%rbx), %rdi
+
+# CHECK:      [4] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      207
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.93
+# CHECK-NEXT: IPC:               1.93
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *                   movd	(%rax), %mm0
+# CHECK-NEXT:  1      5     0.50    *                   movd	(%rcx), %mm1
+# CHECK-NEXT:  1      5     0.50    *                   movd	(%rdx), %mm2
+# CHECK-NEXT:  1      5     0.50    *                   movd	(%rbx), %mm3
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           171  (82.6%)
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              21  (10.1%)
+# CHECK-NEXT:  2,              172  (83.1%)
+# CHECK-NEXT:  4,              14  (6.8%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          7  (3.4%)
+# CHECK-NEXT:  2,          200  (96.6%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             27         30         40
+# CHECK-NEXT: PdFPU            27         30         64
+# CHECK-NEXT: PdLoad           36         40         40
+# CHECK-NEXT: PdStore          0          0          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -     2.00   2.00    -      -     2.00   2.00    -      -      -     2.00   2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     1.00    -      -     movd	(%rax), %mm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     1.00    -      -      -     movd	(%rcx), %mm1
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     1.00    -      -     movd	(%rdx), %mm2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     1.00    -      -      -     movd	(%rbx), %mm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeER.   movd	(%rax), %mm0
+# CHECK-NEXT: [0,1]     DeeeeeER.   movd	(%rcx), %mm1
+# CHECK-NEXT: [0,2]     D=eeeeeER   movd	(%rdx), %mm2
+# CHECK-NEXT: [0,3]     D=eeeeeER   movd	(%rbx), %mm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movd	(%rax), %mm0
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       movd	(%rcx), %mm1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       movd	(%rdx), %mm2
+# CHECK-NEXT: 3.     1     2.0    2.0    0.0       movd	(%rbx), %mm3
+
+# CHECK:      [5] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      207
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.93
+# CHECK-NEXT: IPC:               1.93
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *                   movaps	(%rax), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   movaps	(%rcx), %xmm1
+# CHECK-NEXT:  1      5     0.50    *                   movaps	(%rdx), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movaps	(%rbx), %xmm3
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           171  (82.6%)
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              21  (10.1%)
+# CHECK-NEXT:  2,              172  (83.1%)
+# CHECK-NEXT:  4,              14  (6.8%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          7  (3.4%)
+# CHECK-NEXT:  2,          200  (96.6%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             27         30         40
+# CHECK-NEXT: PdFPU            27         30         64
+# CHECK-NEXT: PdLoad           36         40         40
+# CHECK-NEXT: PdStore          0          0          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     2.00   2.00    -      -      -     2.00   2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -     movaps	(%rax), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -     movaps	(%rcx), %xmm1
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -     movaps	(%rdx), %xmm2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -     movaps	(%rbx), %xmm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeER.   movaps	(%rax), %xmm0
+# CHECK-NEXT: [0,1]     DeeeeeER.   movaps	(%rcx), %xmm1
+# CHECK-NEXT: [0,2]     D=eeeeeER   movaps	(%rdx), %xmm2
+# CHECK-NEXT: [0,3]     D=eeeeeER   movaps	(%rbx), %xmm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movaps	(%rax), %xmm0
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       movaps	(%rcx), %xmm1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       movaps	(%rdx), %xmm2
+# CHECK-NEXT: 3.     1     2.0    2.0    0.0       movaps	(%rbx), %xmm3
+
+# CHECK:      [6] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      207
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    3.86
+# CHECK-NEXT: IPC:               1.93
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      5     0.50    *                   vmovaps	(%rax), %ymm0
+# CHECK-NEXT:  2      5     0.50    *                   vmovaps	(%rcx), %ymm1
+# CHECK-NEXT:  2      5     0.50    *                   vmovaps	(%rdx), %ymm2
+# CHECK-NEXT:  2      5     0.50    *                   vmovaps	(%rbx), %ymm3
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              7  (3.4%)
+# CHECK-NEXT:  4,              200  (96.6%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          7  (3.4%)
+# CHECK-NEXT:  2,          200  (96.6%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             1          2          40
+# CHECK-NEXT: PdFPU            1          2          64
+# CHECK-NEXT: PdLoad           11         12         40
+# CHECK-NEXT: PdStore          0          0          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     2.00   2.00    -      -      -     2.00   2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -     vmovaps	(%rax), %ymm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -     vmovaps	(%rcx), %ymm1
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -     vmovaps	(%rdx), %ymm2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -     vmovaps	(%rbx), %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeER.   vmovaps	(%rax), %ymm0
+# CHECK-NEXT: [0,1]     DeeeeeER.   vmovaps	(%rcx), %ymm1
+# CHECK-NEXT: [0,2]     .DeeeeeER   vmovaps	(%rdx), %ymm2
+# CHECK-NEXT: [0,3]     .DeeeeeER   vmovaps	(%rbx), %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vmovaps	(%rax), %ymm0
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       vmovaps	(%rcx), %ymm1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       vmovaps	(%rdx), %ymm2
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       vmovaps	(%rbx), %ymm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s b/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
index b69f77b36938b1b551cfd9c791c721834ade6142..9aa3d5c0c886dea8193902db2aca4b14b6667edc 100644
--- a/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
+++ b/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
@@ -58,22 +58,25 @@ vmovaps %xmm0, 48(%rdi)
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 4.05   3.95    -      -      -      -      -      -     3.95   0.05    -      -      -     4.00   3.95   4.05    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 4.00   4.00    -      -      -      -      -      -     2.00   2.00    -      -      -     4.00   3.95   4.05    -      -      -     2.00   2.00    -     4.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     0.97   0.03    -      -      -      -     0.97   0.03    -      -      -      -     vmovaps	(%rsi), %xmm0
-# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -      -     vmovaps	16(%rsi), %xmm0
-# CHECK-NEXT: 0.02   0.98    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT: 0.02   0.98    -      -      -      -      -      -     1.00    -      -      -      -      -     0.98   0.02    -      -      -      -     vmovaps	32(%rsi), %xmm0
-# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT: 0.98   0.02    -      -      -      -      -      -     0.98   0.02    -      -      -      -     1.00    -      -      -      -      -     vmovaps	48(%rsi), %xmm0
-# CHECK-NEXT: 0.03   0.97    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 48(%rdi)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT: 0.97   0.03    -      -      -      -      -      -      -     1.00    -      -      -      -     0.97   0.03    -      -      -      -     1.00    -      -     vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -     vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 0.02   0.98    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 0.98   0.02    -      -      -      -      -      -      -     1.00    -      -      -      -     0.98   0.02    -      -      -      -     1.00    -      -     vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -     vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 0.03   0.97    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, 48(%rdi)
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
diff --git a/test/tools/llvm-mca/X86/BdVer2/one-idioms.s b/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
index c2e0debcf351ffb9e919e1c08c04e0f285ec15e5..062aa374bbf7de8bfa026efc68f57b5ebdb02540 100644
--- a/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
+++ b/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
@@ -96,29 +96,32 @@ vpcmpeqw  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     7.50   7.50    -      -     7.50   7.50    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     7.50   7.50    -      -     7.50   7.50    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -      -      -      -     pcmpeqq	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vpcmpeqd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqw	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
index 4aad4729a5f7d22e0b14835ee0970177cf84cf33..a21541800b09dee1775c2b85690b6dd39d74e4ee 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
@@ -53,17 +53,20 @@ xor %bx, %dx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %dx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movw	%ax, %dx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%bx, %dx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movw	%ax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%bx, %dx
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
index 6194ecbb127d8e2e0803c70e8470edd4c860bcd8..d8d250876b822e1cb736f0265808abbb63157d8c 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
@@ -53,17 +53,20 @@ add %cx, %bx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     1.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%ax, %bx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %bx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imulw	%ax, %bx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %bx
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
index ee892a4231fe7bc9d72093b4c9131dc542098967..1671f1519429ab48f80f78d36a73c50ab5a8b70c 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
@@ -46,15 +46,18 @@ lzcnt %ax, %bx  ## partial register stall.
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     0123456
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
index 8723744aaa681ba353189ca91fbefbade476da75..9cf25c525d79c3e71c412e5eee14a9d4b214d0cd 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
@@ -54,17 +54,20 @@ lzcnt 2(%rsp), %cx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -     2.00    -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     1.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00   1.00    -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edx, %ecx
-# CHECK-NEXT:  -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	(%rsp), %cx
-# CHECK-NEXT:  -     1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imull	%edx, %ecx
+# CHECK-NEXT:  -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     lzcntw	(%rsp), %cx
+# CHECK-NEXT: 1.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     lzcntw	2(%rsp), %cx
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
diff --git a/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s b/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
index 86fee396350869ec3282f7888ffebd0fb633c75a..2cd66ae163079b240def54a688854dcf0d79ab5b 100644
--- a/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
+++ b/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
@@ -65,22 +65,25 @@ vsqrtps     %ymm0, %ymm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     32.71  32.29   -     2.00   3.00   1.00   6.00   6.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     32.29  32.71  1.00   1.00   3.00   1.00   6.00   6.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00   2.00    -     2.00   1.00    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpand	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.71  10.29   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.02   0.98   2.00    -     2.00   1.00    -      -      -      -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.98   0.02    -      -      -     1.00    -      -      -      -      -      -      -     vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.71  10.29   -      -      -      -      -     1.00    -      -      -      -      -      -      -     vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     20.58  21.42   -      -      -      -      -     2.00    -      -      -      -      -      -      -     vsqrtps	%ymm0, %ymm2
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789          012345678
diff --git a/test/tools/llvm-mca/X86/BdVer2/pr37790.s b/test/tools/llvm-mca/X86/BdVer2/pr37790.s
index 2471c42e4455d6ac46e4a16d756f77bf14b10758..8a55e7f348754a56b03243221e982df90079dfad 100644
--- a/test/tools/llvm-mca/X86/BdVer2/pr37790.s
+++ b/test/tools/llvm-mca/X86/BdVer2/pr37790.s
@@ -12,7 +12,7 @@ stmxcsr (%rsp)
 # CHECK:      Dispatch Width:    4
 # CHECK-NEXT: uOps Per Cycle:    0.03
 # CHECK-NEXT: IPC:               0.02
-# CHECK-NEXT: Block RThroughput: 0.8
+# CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -24,7 +24,7 @@ stmxcsr (%rsp)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      100   0.50    *      *      U     int3
-# CHECK-NEXT:  2      1     0.50    *      *      U     stmxcsr	(%rsp)
+# CHECK-NEXT:  2      1     1.00    *      *      U     stmxcsr	(%rsp)
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789          0123456789
diff --git a/test/tools/llvm-mca/X86/BdVer2/rank.s b/test/tools/llvm-mca/X86/BdVer2/rank.s
index 87f7d527c03b4893989016fa17b171ffc40daede..eadea1f5b3596c1fc1268581c5e15a8ca8fd7891 100644
--- a/test/tools/llvm-mca/X86/BdVer2/rank.s
+++ b/test/tools/llvm-mca/X86/BdVer2/rank.s
@@ -58,22 +58,25 @@ add %ebx, %eax
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.01   0.99    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %edx
-# CHECK-NEXT:  -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ebx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %esi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %esi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.01   0.99    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %edx
+# CHECK-NEXT:  -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ebx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %esi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %esi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234567
diff --git a/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s b/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
index 42467f7b3a18ecf1916c2e77fc4f5da61b989451..e0f27cc5406c2f8528dd542f00d94fadf8ba0daf 100644
--- a/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
+++ b/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
@@ -60,3 +60,7 @@
 # CHECK-NEXT:  1,           9  (40.9%)
 # CHECK-NEXT:  3,           1  (4.5%)
 # CHECK-NEXT:  4,           1  (4.5%)
+
+# CHECK:      Total ROB Entries:                128
+# CHECK-NEXT: Max Used ROB Entries:             16  ( 12.5% )
+# CHECK-NEXT: Average Used ROB Entries per cy:  9  ( 7.0% )
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
index 19737e85d191dd3cbf366c9385e9554b7fc8c0ec..813dc8f322594a7d44e6ee0c846a0eddf09f8765 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
@@ -65,17 +65,20 @@ vaddps %xmm1, %xmm1, %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   1.33    -      -      -      -     1.00   1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -      -     1.00    -      -      -      -     vmovaps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
index ee9fddec67390fe68640c02676f1b34a622882e7..2263f592425574141735fff12049eaf55194b4cd 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
@@ -75,23 +75,26 @@ movdqu %xmm5, %xmm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.67   1.33    -     3.00    -      -     3.33   3.67    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00   1.33   1.67    -      -     3.33   3.67    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm0, %mm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     movq	%mm0, %mm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movaps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movups	%xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     1.00    -      -      -      -      -     movapd	%xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movupd	%xmm3, %xmm4
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     movdqa	%xmm4, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.33   0.67    -      -      -      -     movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm0, %mm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.67   0.33    -      -      -     1.00    -      -      -      -      -      -      -     movq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     0.33   0.67    -      -      -      -      -      -      -     movaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -     movups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -      -      -      -     movapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -     movupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.67   0.33    -      -     1.00    -      -      -      -      -      -      -      -     movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.33   0.67    -      -      -      -      -      -      -     movdqu	%xmm5, %xmm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234567
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
index ada52545a9b184b14ddb3a6210de0566feb75602..14f2ad2133a16c6f0d44bb9ba8a666dfd8cd0297 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
@@ -70,21 +70,24 @@ vmovdqu %xmm5, %xmm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00   1.33   0.67    -      -     3.00   3.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00   1.00   1.00    -      -     3.00   3.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.67   0.33    -      -      -      -     vmovaps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     0.67   0.33    -      -      -      -     vmovups	%xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     0.33   0.67    -      -      -      -     vmovapd	%xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     vmovupd	%xmm3, %xmm4
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.67   0.33    -      -     0.33   0.67    -      -      -      -     vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.67   0.33    -      -     0.67   0.33    -      -      -      -     vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     0.67   0.33    -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -     vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     0.33   0.67    -      -      -      -      -      -      -     vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -      -      -      -     vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.33   0.67    -      -      -      -      -      -      -     vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     0.67   0.33    -      -      -      -      -      -      -     vmovdqu	%xmm5, %xmm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
index e651ff0becb307b91cec51bb6bd37e2142d25b67..fde7284a8893b5819f8c051305f9755613ed5887 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
@@ -66,19 +66,22 @@ mov %edx, %eax
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%eax, %ebx
-# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ebx, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ecx, %edx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%edx, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%eax, %ebx
+# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ecx, %edx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%edx, %eax
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
index 188eb5dd15808dbe84846e85d22206f4e7767195..76d24b16bc8efd9a07088a2d683e6d3aa33561dc 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
@@ -66,19 +66,22 @@ mov %rdx, %rax
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rax, %rbx
-# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rbx, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rcx, %rdx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rdx, %rax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rax, %rbx
+# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rcx, %rdx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rdx, %rax
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-1.s b/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
index 70685f1726add50f98e7f4bf0dcd2abaeef2ecf8..acbe8e8548198784846907c846fba1d8b46b7c3e 100644
--- a/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
@@ -62,16 +62,19 @@ vmulps %xmm0, %xmm0, %xmm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789          012
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-2.s b/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
index 354876befcdddf7e1ceca39855273e559fa982b7..316b9c7a078911cc0d8a223db826700f1910be66 100644
--- a/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
@@ -62,16 +62,19 @@ vmulps %xmm0, %xmm0, %xmm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789          012
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-3.s b/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
index a5f5746d7f9be5700184c861ef5061621c81cf07..64d5a045c3bbc9146e0cda5fd338e39395aae1b5 100644
--- a/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
@@ -71,15 +71,18 @@ idiv %eax
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%eax
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s b/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
index 12d3e6f2cc03bc32f6ffac6d1a2ebe405312d3e6..0b548d95f28bbeaa6d205238c7f9cdb309902764 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
@@ -159,62 +159,65 @@ pswapd      (%rax), %mm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 13.00  13.00   -      -      -      -      -      -     17.50  17.50  2.00   2.00   2.00   8.00   38.50  10.50   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 13.00  13.00   -      -      -      -      -      -     17.50  17.50  2.00   2.00   2.00   8.00   38.50  10.50   -      -      -     13.00  13.00   -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     femms
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgusb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgusb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2id	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2id	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2iw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2iw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfacc	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfacc	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfadd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfadd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpeq	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpeq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpge	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpge	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpgt	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpgt	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmax	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmax	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmin	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmin	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmul	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmul	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfnacc	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfnacc	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfpnacc	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfpnacc	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcp	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcp	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit1	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit1	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit2	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit2	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqit1	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqit1	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqrt	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqrt	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsub	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsub	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsubr	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsubr	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrw	(%rax), %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetch	(%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pswapd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pswapd	(%rax), %mm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     femms
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pavgusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pavgusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     pf2id	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     pf2id	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     pf2iw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     pf2iw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfadd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfadd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfcmpeq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfcmpeq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfcmpge	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfcmpge	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfcmpgt	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfcmpgt	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfmax	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfmax	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfmin	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfmin	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfmul	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfmul	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfnacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfnacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfpnacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfpnacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfrcp	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfrcp	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfrcpit1	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfrcpit1	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfrcpit2	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfrcpit2	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfrsqit1	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfrsqit1	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfrsqrt	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfrsqrt	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfsub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfsub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pfsubr	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     pfsubr	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     pi2fd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     pi2fd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     pi2fw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     pi2fw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmulhrw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmulhrw	(%rax), %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     prefetch	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     prefetchw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pswapd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pswapd	(%rax), %mm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-adx.s b/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
index a24213966edcc14c3cd67f38e426fe6dd6e6f376..4844d72bb557b7d319cef9e1aa9a34eeebb000bb 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
@@ -49,19 +49,22 @@ adox        (%rbx), %rcx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 2.00   2.00    -      -      -     8.00   8.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -     8.00   8.00    -      -      -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxl	%ebx, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxl	(%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxq	%rbx, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxq	(%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxl	%ebx, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxl	(%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxq	%rbx, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxq	(%rbx), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcxl	%ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     adcxl	(%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcxq	%rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     adcxq	(%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adoxl	%ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     adoxl	(%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adoxq	%rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     adoxq	(%rbx), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-aes.s b/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
index c8d400142df75c7c8485083e19c1151cbd1be895..63f1d45cf078173dab9463501d471677b6eb2f6f 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
@@ -61,23 +61,26 @@ aeskeygenassist $22, (%rax), %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 3.00   3.00    -      -      -      -      -      -      -      -      -      -     12.00   -     12.00   -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 3.00   3.00    -      -      -      -      -      -      -      -      -      -     12.00   -     12.00   -      -      -      -     3.00   3.00    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdec	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdec	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdeclast	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdeclast	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenc	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenc	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenclast	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenclast	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesimc	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesimc	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aeskeygenassist	$22, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     aesdec	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     aesdec	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     aesenc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     aesenc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     aesenclast	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     aesimc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     aesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     aeskeygenassist	$22, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s b/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
index 837127d4e58498949d0b84335e140a87801f2934..91d230dda1591d38b1c63a770fa55270281ea87c 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
@@ -1670,7 +1670,7 @@ vzeroupper
 # CHECK-NEXT:  1      14    13.50   *                   vsqrtsd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      9     10.50                       vsqrtss	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      14    10.50   *                   vsqrtss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      1     0.50    *      *      U     vstmxcsr	(%rax)
+# CHECK-NEXT:  2      1     1.00    *      *      U     vstmxcsr	(%rax)
 # CHECK-NEXT:  1      5     1.00                        vsubpd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      10    1.00    *                   vsubpd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  2      5     2.00                        vsubpd	%ymm0, %ymm1, %ymm2
@@ -1742,702 +1742,705 @@ vzeroupper
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 229.00 229.00  -      -      -     56.00   -      -     588.00 588.00 127.50 127.50 38.00  107.00 402.50 429.50  -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 229.00 229.00  -      -      -     56.00   -      -     588.00 588.00 127.50 127.50 38.00  107.00 402.50 429.50  -      -      -     204.00 204.00  -     50.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdec	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdec	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdeclast	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdeclast	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenc	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenc	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenclast	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenclast	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesimc	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesimc	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaeskeygenassist	$22, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaeskeygenassist	$22, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendpd	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendpd	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendpd	$11, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendpd	$11, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendps	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendps	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendps	$11, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendps	$11, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvps	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvps	%ymm3, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vbroadcastf128	(%rax), %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastsd	(%rax), %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vbroadcastss	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastss	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmppd	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmppd	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmppd	$0, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmppd	$0, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpps	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpps	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmpps	$0, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmpps	$0, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpsd	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpsd	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpss	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpss	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomisd	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomisd	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomiss	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomiss	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2pd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2pd	%xmm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2pd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2ps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2ps	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2ps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2dq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2dqx	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2dq	%ymm0, %xmm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2dqy	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2ps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2psx	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2ps	%ymm0, %xmm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2psy	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2dq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2dq	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2dq	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2pd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2pd	%xmm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2pd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdl	%ecx, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdq	%rcx, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdl	(%rax), %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdq	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssl	%ecx, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssq	%rcx, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssl	(%rax), %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssq	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttpd2dq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttpd2dqx	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvttpd2dq	%ymm0, %xmm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvttpd2dqy	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvttps2dq	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvttps2dq	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdppd	$22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdppd	$22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdpps	$22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdpps	$22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -      -     2.00    -      -      -      -     vdpps	$22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -      -     2.00    -      -      -      -     vdpps	$22, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vextractf128	$1, %ymm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vextractf128	$1, %ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vextractps	$1, %xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vextractps	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertf128	$1, %xmm0, %ymm1, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertf128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertps	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertps	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vlddqu	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vlddqu	(%rax), %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vldmxcsr	(%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmaskmovdqu	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovpd	(%rax), %xmm0, %xmm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovpd	(%rax), %ymm0, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovpd	%xmm0, %xmm1, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovpd	%ymm0, %ymm1, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovps	(%rax), %xmm0, %xmm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovps	(%rax), %ymm0, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovps	%xmm0, %xmm1, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovps	%ymm0, %ymm1, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovapd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovapd	%ymm0, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovapd	%ymm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovaps	%ymm0, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%ymm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovd	%eax, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovd	%xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovddup	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovddup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovddup	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovddup	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqa	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     1.00   1.00    -      -      -      -     vmovdqa	%ymm0, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqa	%ymm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqu	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     1.00   1.00    -      -      -      -     vmovdqu	%ymm0, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqu	%ymm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhlps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlhps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovhpd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovhps	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovlpd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovlps	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskpd	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskpd	%ymm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskps	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskps	%ymm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntdq	%xmm0, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntpd	%xmm0, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntpd	%ymm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntps	%xmm0, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntps	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovq	%rax, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovq	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovq	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovsd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovshdup	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovshdup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovshdup	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovshdup	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsldup	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsldup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovsldup	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovsldup	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovss	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovupd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovupd	%ymm0, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovupd	%ymm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovups	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovups	%ymm0, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovups	%ymm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     vmpsadbw	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackssdw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackssdw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpacksswb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpacksswb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackusdw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackusdw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackuswb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackuswb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpalignr	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpalignr	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpand	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpand	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpandn	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpandn	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpblendw	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpblendw	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestri	$1, %xmm0, %xmm2
-# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestri	$1, (%rax), %xmm2
-# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestrm	$1, %xmm0, %xmm2
-# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistri	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistri	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistrm	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vperm2f128	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vperm2f128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	$1, %ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	$1, (%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	$1, %ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	$1, (%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrb	$1, %xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrb	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrd	$1, %xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrd	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrq	$1, %xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrq	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrw	$1, %xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vphminposuw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vphminposuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrb	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrb	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrd	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrq	$1, %rax, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrq	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrw	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrw	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxub	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxub	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxud	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxud	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminub	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminub	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminud	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminud	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxdq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxdq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhrsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhrsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmulld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmullw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmullw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuludq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuludq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpor	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpor	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsadbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsadbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpshufb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpshufb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufd	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufhw	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufhw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshuflw	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshuflw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslldq	$1, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrldq	$1, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vptest	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vptest	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vptest	%ymm0, %ymm1
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vptest	(%rax), %ymm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhqdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhqdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckldq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckldq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklqdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklqdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpxor	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpxor	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrcpps	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrcpps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundpd	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundpd	$1, %ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundpd	$1, (%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundps	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundps	$1, %ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundps	$1, (%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundsd	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundsd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundss	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundss	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrsqrtps	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrsqrtps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufpd	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufpd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufpd	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufpd	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufps	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufps	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufps	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufps	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     27.00  27.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtpd	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     27.00  27.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtpd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	%ymm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vstmxcsr	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestpd	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestpd	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestpd	%ymm0, %ymm1
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestpd	(%rax), %ymm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestps	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestps	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestps	%ymm0, %ymm1
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestps	(%rax), %ymm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomisd	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomisd	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomiss	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomiss	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vzeroall
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vzeroupper
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vaddsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vaddss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vaddsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vaddsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vaddsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vaddsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vaddsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vaesdec	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vaesdec	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vaesdeclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vaesdeclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vaesenc	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vaesenc	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vaesenclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vaesenclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vaesimc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vaesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vaeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vaeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vandnpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vandnpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vandnpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vandnpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vandnps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vandnps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vandnps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vandnps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vandpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vandpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vandpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vandpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vandps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vandps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vandps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vandps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vblendpd	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vblendpd	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vblendpd	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vblendpd	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vblendps	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vblendps	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vblendps	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vblendps	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vblendvps	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vblendvps	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vbroadcastf128	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -     0.50   0.50    -      -     vbroadcastsd	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vbroadcastss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -     0.50   0.50    -      -     vbroadcastss	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vcmppd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vcmppd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vcmppd	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vcmppd	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vcmpps	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vcmpps	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vcmpps	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vcmpps	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vcmpsd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vcmpsd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vcmpss	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vcmpss	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vcomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vcomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vcomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vcomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvtdq2pd	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvtdq2pd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvtdq2ps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvtdq2ps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvtpd2dq	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvtpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtpd2psx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvtpd2ps	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvtpd2psy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvtps2dq	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvtps2dq	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvtps2pd	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvtps2pd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtsd2ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtsd2ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtsi2sdl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtsi2sdq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtsi2sdl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtsi2sdq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtsi2ssl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtsi2ssq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtsi2ssl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtsi2ssq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtss2sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtss2sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvttpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvttpd2dq	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvttpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvttps2dq	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvttps2dq	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvttsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvttsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvttss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvttss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvttss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vdivpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vdivpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -      -      -      -     vdivpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vdivpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vdivps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vdivps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -      -      -      -     vdivps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vdivps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vdivsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vdivsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vdivss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vdivss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vdppd	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vdppd	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vdpps	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vdpps	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vdpps	$22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vdpps	$22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vextractf128	$1, %ymm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vextractf128	$1, %ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vextractps	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vextractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vhaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vhaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vhaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vhaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vhaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vhaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vhaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vhaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vhsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vhsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vhsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vhsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vhsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vhsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vhsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vhsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vinsertf128	$1, %xmm0, %ymm1, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vinsertf128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vinsertps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vinsertps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vlddqu	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vlddqu	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     vldmxcsr	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmaskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmaskmovpd	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vmaskmovpd	(%rax), %ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -     1.00   vmaskmovpd	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -     2.00   vmaskmovpd	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmaskmovps	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vmaskmovps	(%rax), %ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -     1.00   vmaskmovps	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -     2.00   vmaskmovps	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmaxpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vmaxpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vmaxpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vmaxpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmaxps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vmaxps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vmaxps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vmaxps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmaxss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vmaxss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vminpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vminpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vminpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vminpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vminps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vminps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vminps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vminps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vminss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vminss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovapd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovapd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovapd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vmovapd	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovapd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovapd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovaps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vmovaps	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovaps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovd	%eax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmovd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovddup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovddup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vmovddup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vmovddup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vmovdqa	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovdqa	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     1.00   1.00    -      -      -      -      -      -      -     vmovdqa	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovdqa	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovdqa	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vmovdqu	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovdqu	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovdqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     1.00   1.00    -      -      -      -      -      -      -     vmovdqu	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovdqu	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovdqu	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovhlps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovlhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovhpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovhps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovlpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovlpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovlps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovlps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmovmskpd	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmovmskpd	%ymm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmovmskps	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmovmskps	%ymm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovntdq	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -      -      -     2.00   vmovntdq	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovntpd	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -      -      -     2.00   vmovntpd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovntps	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -      -      -     2.00   vmovntps	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vmovq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovq	%rax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vmovq	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovshdup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovshdup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vmovshdup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vmovshdup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovsldup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovsldup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vmovsldup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vmovsldup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovss	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovupd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovupd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovupd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vmovupd	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovupd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovupd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vmovups	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovups	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovups	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vmovups	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovups	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vmovups	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -      -      -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmulpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vmulpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vmulpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vmulpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vmulps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vmulps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmulsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vmulsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmulss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vmulss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpabsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpabsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpabsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpabsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpabsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpabsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpackssdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpackssdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpacksswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpacksswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpackusdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpackusdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpackuswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpackuswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpaddb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpaddq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpaddsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpaddusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpaddusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpalignr	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpalignr	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpand	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpandn	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpandn	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpavgb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpavgb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpavgw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpavgw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -      -      -      -     vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpblendw	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpblendw	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmpeqb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmpeqd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmpeqq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpeqw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmpeqw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -     3.00   3.00    -     2.00   vpcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -     3.50   3.50    -     2.00   vpcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -     3.00   3.00    -     2.00   vpcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -     3.50   3.50    -     2.00   vpcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpgtb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmpgtb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpgtd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmpgtd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpgtq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmpgtq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmpgtw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmpgtw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vpcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vpcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vpcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vpcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vperm2f128	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vperm2f128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vpermilpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpermilpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vpermilpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpermilpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vpermilpd	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vpermilpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vpermilpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vpermilpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vpermilps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpermilps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vpermilps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpermilps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vpermilps	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vpermilps	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vpermilps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vpermilps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vpextrb	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vpextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vpextrd	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vpextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vpextrq	$1, %xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vpextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vpextrw	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vpextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -      -      -      -     vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -     0.50   0.50    -      -     vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpinsrb	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpinsrb	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpinsrd	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpinsrd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpinsrq	$1, %rax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpinsrq	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpinsrw	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpinsrw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmaddubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmaddubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmaddwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmaddwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmaxsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmaxsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmaxsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmaxsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmaxub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmaxub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmaxud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmaxud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmaxuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmaxuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpminsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpminsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpminsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpminsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpminub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpminub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpminud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpminud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpminuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpminuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vpmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmuldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmuldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmulhrsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmulhrsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmulhuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmulhuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmulhw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmulhw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -     0.50   0.50    -      -     vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmullw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmullw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmuludq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmuludq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsadbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsadbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshufb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshufb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsignb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsignb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsignd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsignd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsignw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsignw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpslld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpslld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpslld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpslldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsllq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsllq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsllq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsllw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsllw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsllw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrad	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrad	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsrad	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsraw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsraw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsraw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsrld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrlq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrlq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsrlq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrlw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsrlw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsrlw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsubb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsubb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsubq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsubq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsubsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsubsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsubusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsubusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsubusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsubusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vptest	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vptest	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vptest	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vptest	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpunpckhbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpunpckhbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpunpckhdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpunpckhdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpunpckhqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpunpckhqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpunpckhwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpunpckhwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpunpcklbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpunpcklbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpunpckldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpunpckldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpunpcklqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpunpcklqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpunpcklwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpunpcklwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpxor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpxor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vrcpps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vrcpps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vrcpps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vrcpps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vrcpss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vrcpss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vroundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vroundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vroundpd	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vroundpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vroundps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vroundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vroundps	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vroundps	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vroundsd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vroundsd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vroundss	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vroundss	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vrsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vrsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vrsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vrsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vrsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vrsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vshufpd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vshufpd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vshufpd	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vshufpd	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vshufps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vshufps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vshufps	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vshufps	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     vsqrtpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vsqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     27.00  27.00   -      -      -      -      -     2.00    -      -      -      -      -      -      -     vsqrtpd	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     27.00  27.00   -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vsqrtpd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -      -      -      -     vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -     1.00   1.00    -      -     vsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     vsqrtsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vsqrtsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     vsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     vsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   vstmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -     vsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsubsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vsubsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsubss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vsubss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vtestpd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vtestpd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vtestpd	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vtestpd	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vtestps	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vtestps	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vtestps	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vtestps	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vucomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vucomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     vucomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     vucomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vunpckhpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vunpckhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vunpckhpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vunpckhpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vunpckhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vunpckhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vunpckhps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vunpckhps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vunpcklpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vunpcklpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vunpcklpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vunpcklpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vunpcklps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vunpcklps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vunpcklps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vunpcklps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vxorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vxorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vxorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vxorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vxorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vxorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vxorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vxorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vzeroall
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s b/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
index f1b155346b3c76ea3e55a63fdb7be9504adb6520..c67d73cd43e3a40d014a317b30e234890077c74b 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
@@ -91,35 +91,38 @@ tzcnt       (%rax), %rcx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 6.00   6.00    -      -      -     14.00  14.00   -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 6.00   6.00    -      -      -     14.00  14.00   -      -      -      -      -      -      -      -      -      -      -      -     6.00   6.00    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnl	%eax, %ebx, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnl	(%rax), %ebx, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnq	%rax, %rbx, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnq	(%rax), %rbx, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	%eax, %ebx, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	%eax, (%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	%rax, %rbx, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	%rax, (%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsil	%eax, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsil	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsiq	%rax, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsiq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskl	%eax, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskq	%rax, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrl	%eax, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrq	%rax, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntl	%eax, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntq	%rax, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnl	%eax, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     andnl	(%rax), %ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnq	%rax, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     andnq	(%rax), %rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	%eax, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bextrl	%eax, (%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	%rax, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bextrq	%rax, (%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsil	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     blsil	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsiq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     blsiq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     blsmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     blsmskq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsrl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     blsrl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsrq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     blsrq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     tzcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     tzcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s b/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
index 1f6b9ed0b3a3f552d25d40be67356827ea1f7342..bd4c62b1eb7c8ba0974f4060b74d6bbcadf4c05b 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
@@ -34,12 +34,15 @@ clflushopt (%rax)
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     clflushopt	(%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     clflushopt	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s b/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
index 93151dc1a72598fd1964723456b32d40621eb172..0f6a649818941c8d6e21fac7c016b781b364688e 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
@@ -229,107 +229,110 @@ cmovgq    (%rax), %rdi
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 24.00  24.00   -      -      -     48.00  48.00   -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 24.00  24.00   -      -      -     48.00  48.00   -      -      -      -      -      -      -      -      -      -      -      -     24.00  24.00   -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovow	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnow	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaew	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovew	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnew	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbew	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgew	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlew	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgw	%si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovow	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnow	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaew	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovew	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnew	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbew	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgew	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlew	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovol	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnol	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovel	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnel	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbel	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoval	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovll	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgel	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlel	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgl	%esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovol	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnol	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbl	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovel	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnel	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbel	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoval	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsl	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsl	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpl	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpl	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovll	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgel	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlel	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovoq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnoq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaeq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoveq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovneq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbeq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgeq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovleq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgq	%rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovoq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnoq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaeq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoveq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovneq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbeq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgeq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovleq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgq	(%rax), %rdi
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovow	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnow	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovow	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnow	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovbw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovaew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovbew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovaw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovsw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnsw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovpw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnpw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovlw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovgew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovlew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovgw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovol	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnol	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmoval	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovll	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovol	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnol	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovbl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovael	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovbel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmoval	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovsl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnsl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovpl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnpl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovll	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovgel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovlel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovgl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovoq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnoq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmoveq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovneq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovleq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovoq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnoq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovbq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovaeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmoveq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovneq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovbeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovaq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovsq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnsq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovpq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovnpq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovlq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovgeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovleq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmovgq	(%rax), %rdi
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s b/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
index d0ec04a5ee01aab281ad3b72ef3dead35b7e681f..7be673330059ce560cb911692e6b7e9b3fcc2c69 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
@@ -36,13 +36,16 @@ cmpxchg16b (%rax)
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg8b	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg16b	(%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg8b	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg16b	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s b/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
index 757687a4af7b93de69a9f13c1b0b7321323e7436..a81fb1ac803eb20d7fcca595980067ca7a094234 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
@@ -51,19 +51,22 @@ vcvtps2ph   $0, %ymm0, (%rax)
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 2.50   2.50    -      -      -      -      -      -     1.00   1.00    -      -      -     8.00    -     12.00   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.50   2.50    -      -      -      -      -      -     1.00   1.00    -      -      -     8.00    -     12.00   -      -      -     1.50   1.50    -     2.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtph2ps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtph2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtph2ps	%xmm0, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtph2ps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2ph	$0, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2ph	$0, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2ph	$0, %ymm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2ph	$0, %ymm0, (%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtph2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     vcvtph2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvtph2ps	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -     1.00   1.00    -      -     vcvtph2ps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vcvtps2ph	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vcvtps2ph	$0, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vcvtps2ph	$0, %ymm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -      -      -     1.00   vcvtps2ph	$0, %ymm0, (%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-fma.s b/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
index 104b07fc5e6ff98da1aab861ed2d4d75299c87d3..cb5eab8a7d040f30c934bdc816895f12f3ea12b6 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
@@ -511,203 +511,206 @@ vfnmsub231ss (%rax), %xmm1, %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 66.00  66.00   -      -      -      -      -      -     96.00  96.00   -      -      -      -     48.00  48.00  48.00  48.00   -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 66.00  66.00   -      -      -      -      -      -     96.00  96.00   -      -      -      -     48.00  48.00  48.00  48.00   -     66.00  66.00   -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsub231ss	(%rax), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s b/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
index b45abdfd387e4fcc667794a8a1ccdc6b30ab192f..c650201b6b7053fef0fcf80920f818304f48278b 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
@@ -255,107 +255,110 @@ vfnmsubss   %xmm0, (%rax), %xmm2, %xmm3
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 44.00  44.00   -      -      -      -      -      -     48.00  48.00   -      -      -      -     24.00  24.00  24.00  24.00   -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 44.00  44.00   -      -      -      -      -      -     48.00  48.00   -      -      -      -     24.00  24.00  24.00  24.00   -     44.00  44.00   -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     1.00   1.00    -      -     vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -     vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -     0.50   0.50    -      -     vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-lea.s b/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
index 246d968a32ec5a4e0474586c5fd55cc969c41a25..73ffc9074d19ebac7174c503d78373c118b4749f 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
@@ -304,146 +304,149 @@ lea 1024(%rax, %rbx, 2), %rcx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     67.50  67.50   -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     67.50  67.50   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	0, %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	0, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16, %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024, %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx,2), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx,2), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	0, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	0, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx,2), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s b/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
index 1b2b38fb4a55e5c6b995701fe323bcad2a4e86fe..f07f724d19b6835f5be661599d798405a0a03d2a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
@@ -46,17 +46,20 @@ lzcntq      (%rax), %rcx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 1.50   1.50    -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 1.50   1.50    -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -     1.50   1.50    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%cx, %cx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	(%rax), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntl	%eax, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntq	%rax, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%cx, %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     lzcntw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     lzcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     lzcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s b/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
index 3dcc8083125eb2b2825ccff84a1af60664cce3ac..66dfc084c27d63ee2c12050f3a1b9844f91ffbc8 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
@@ -290,116 +290,119 @@ pxor        (%rax), %mm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 24.00  24.00   -      -      -     2.00    -      -     2.50   2.50   46.00  46.00  6.00   2.00   55.50  49.50   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 24.00  24.00   -      -      -     2.00    -      -     2.50   2.50   46.00  46.00  6.00   2.00   55.50  49.50   -      -      -     23.00  23.00   -     2.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     emms
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movd	%eax, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movd	%mm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movd	%mm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movq	%rax, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movq	%mm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movq	%mm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	(%rax), %mm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     emms
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movd	%eax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     movd	%mm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movd	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movq	%rax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     movq	%mm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movq	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     packsswb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     packsswb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     packssdw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     packssdw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     packuswb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     packuswb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddusw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddusw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pand	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pand	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pandn	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pandn	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpeqb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpeqd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpeqw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpgtb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpgtb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpgtd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpgtd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpgtw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpgtw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmaddwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmaddwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmulhw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmulhw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmullw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmullw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     por	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     por	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pslld	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pslld	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pslld	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psllq	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psllq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psllq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psllw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psllw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psllw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrad	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrad	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psrad	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psraw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psraw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psraw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrld	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrld	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psrld	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrlq	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrlq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psrlq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrlw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrlw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psrlw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubusw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubusw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckhbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckhbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckhdq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckhdq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckhwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckhwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpcklbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpcklbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckldq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckldq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpcklwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpcklwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pxor	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pxor	(%rax), %mm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s b/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
index 92367b17eef3b4ead146ac6cc19d8d89a6f97aaf..5c00cbddf03e0f5b3dc6cc419d7bc7a1857ede64 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
@@ -19,11 +19,11 @@ movbe  (%rax), %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.50           *            movbew	%cx, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movbew	%cx, (%rax)
 # CHECK-NEXT:  1      5     0.50    *                   movbew	(%rax), %cx
-# CHECK-NEXT:  1      1     0.50           *            movbel	%ecx, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movbel	%ecx, (%rax)
 # CHECK-NEXT:  1      5     0.50    *                   movbel	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.50           *            movbeq	%rcx, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movbeq	%rcx, (%rax)
 # CHECK-NEXT:  1      5     0.50    *                   movbeq	(%rax), %rcx
 
 # CHECK:      Resources:
@@ -46,17 +46,20 @@ movbe  (%rax), %rcx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 3.00   3.00    -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 3.00   3.00    -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     1.50   1.50    -     3.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbew	%cx, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbew	(%rax), %cx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbel	%ecx, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbel	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbeq	%rcx, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbeq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movbew	%cx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movbew	(%rax), %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movbel	%ecx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movbel	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movbeq	%rcx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movbeq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s b/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
index 81bbc40143a3aab0af413328bb496d035f506795..8ed24f9662a2cc5dc2b213d65b61d19932a548d2 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
@@ -36,13 +36,16 @@ pclmulqdq     $11, (%rax), %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     0.50   0.50    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pclmulqdq	$11, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pclmulqdq	$11, (%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pclmulqdq	$11, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s b/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
index d31ed6cc528647b0692069cf6b1a8312fe894ff7..379c60b5a3379c0ab6ad99a019b582999cbd0672 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
@@ -46,17 +46,20 @@ popcntq     (%rax), %rcx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 1.50   1.50    -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 1.50   1.50    -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -     1.50   1.50    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntw	%cx, %cx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntw	(%rax), %cx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntl	%eax, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntq	%rax, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     popcntw	%cx, %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     popcntw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     popcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     popcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     popcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     popcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s b/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
index c6973d7bb86e75ea8bdfeca636b94308853a488b..2b075e8edd4f9773097d7048e58b59c3915cebe8 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
@@ -36,13 +36,16 @@ prefetchw   (%rax)
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetch	(%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchw	(%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     prefetch	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     prefetchw	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
index 85fa5d56db93e7d110b7a936295ef35ba7090e59..7fcb7fb49f4155f3f161302ce79ca015eca9c6a8 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
@@ -298,14 +298,14 @@ xorps       (%rax), %xmm2
 # CHECK-NEXT:  1      10    1.00    *                   rsqrtps	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        rsqrtss	%xmm0, %xmm2
 # CHECK-NEXT:  1      10    1.00    *                   rsqrtss	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50    *      *      U     sfence
+# CHECK-NEXT:  1      1     1.00    *      *      U     sfence
 # CHECK-NEXT:  1      2     0.50                        shufps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  1      7     0.50    *                   shufps	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      9     10.50                       sqrtps	%xmm0, %xmm2
 # CHECK-NEXT:  1      14    10.50   *                   sqrtps	(%rax), %xmm2
 # CHECK-NEXT:  1      9     10.50                       sqrtss	%xmm0, %xmm2
 # CHECK-NEXT:  1      14    10.50   *                   sqrtss	(%rax), %xmm2
-# CHECK-NEXT:  2      1     0.50    *      *      U     stmxcsr	(%rax)
+# CHECK-NEXT:  2      1     1.00    *      *      U     stmxcsr	(%rax)
 # CHECK-NEXT:  1      5     1.00                        subps	%xmm0, %xmm2
 # CHECK-NEXT:  1      10    1.00    *                   subps	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        subss	%xmm0, %xmm2
@@ -339,135 +339,138 @@ xorps       (%rax), %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 33.50  33.50   -      -      -     15.00   -      -     115.50 115.50 9.50   9.50   2.00   25.00  50.50  66.50   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 33.50  33.50   -      -      -     15.00   -      -     115.50 115.50 9.50   9.50   2.00   25.00  50.50  66.50   -      -      -     29.00  29.00   -     9.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpps	$0, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpps	$0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpss	$0, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpss	$0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comiss	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comiss	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2ps	%mm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pi	%xmm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pi	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	%ecx, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssq	%rcx, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2pi	%xmm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2pi	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divss	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     ldmxcsr	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     maskmovq	%mm0, %mm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movaps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movaps	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movaps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhlps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlhps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movhps	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhps	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movlps	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movmskps	%xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntps	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntq	%mm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movss	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movups	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movups	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movups	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrw	$1, %mm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrw	$1, %eax, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrw	$1, (%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	(%rax), %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht0	(%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht1	(%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht2	(%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchnta	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufw	$1, %mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufw	$1, (%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtss	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sfence
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufps	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtss	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     stmxcsr	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomiss	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomiss	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorps	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     addps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     addps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     addss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     addss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     andnps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     andnps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     andps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     andps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     cmpps	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     cmpps	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     cmpss	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     cmpss	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     comiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     comiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtpi2ps	%mm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtpi2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtps2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtps2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtsi2ssl	%ecx, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtsi2ssq	%rcx, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvttps2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvttps2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvttss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvttss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvttss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     divps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     divps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     divss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     divss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     ldmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     maskmovq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     maxps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     maxps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     maxss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     maxss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     minps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     minps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     minss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     minss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movaps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movaps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movaps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movhlps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movlhps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movhps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movhps	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movlps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movlps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     movmskps	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movntps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movntq	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movss	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movups	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movups	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movups	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     mulps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     mulps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     mulss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     mulss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     orps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     orps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pavgb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pavgb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pavgw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pavgw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pextrw	$1, %mm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pinsrw	$1, %eax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pinsrw	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmaxsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmaxsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmaxub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmaxub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pminsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pminsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pminub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pminub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmulhuw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmulhuw	(%rax), %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     prefetcht0	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     prefetcht1	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     prefetcht2	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     prefetchnta	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psadbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psadbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pshufw	$1, %mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pshufw	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     rcpps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     rcpps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     rcpss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     rcpss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     rsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     rsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     rsqrtss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     rsqrtss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   sfence
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     shufps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     shufps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     sqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     sqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     sqrtss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     sqrtss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   stmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     subps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     subps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     subss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     subss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     ucomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     ucomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     unpckhps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     unpckhps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     unpcklps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     unpcklps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     xorps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     xorps	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
index 23be05e554a66bc131ac89d1c90327715e89169b..84bf65664c86bb97905134f115e71cb3907f3066 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
@@ -459,7 +459,7 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  1      14    9.50    *                   divpd	(%rax), %xmm2
 # CHECK-NEXT:  1      9     9.50                        divsd	%xmm0, %xmm2
 # CHECK-NEXT:  1      14    9.50    *                   divsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50    *      *      U     lfence
+# CHECK-NEXT:  1      1     1.00    *      *      U     lfence
 # CHECK-NEXT:  1      1     1.00    *      *      U     maskmovdqu	%xmm0, %xmm1
 # CHECK-NEXT:  1      2     1.00                        maxpd	%xmm0, %xmm2
 # CHECK-NEXT:  1      7     1.00    *                   maxpd	(%rax), %xmm2
@@ -488,8 +488,8 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  1      2     1.00           *            movlpd	%xmm0, (%rax)
 # CHECK-NEXT:  1      7     0.50    *                   movlpd	(%rax), %xmm2
 # CHECK-NEXT:  2      10    1.00                        movmskpd	%xmm0, %ecx
-# CHECK-NEXT:  1      1     0.50           *            movntil	%eax, (%rax)
-# CHECK-NEXT:  1      1     0.50           *            movntiq	%rax, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movntil	%eax, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movntiq	%rax, (%rax)
 # CHECK-NEXT:  1      2     1.00           *            movntdq	%xmm0, (%rax)
 # CHECK-NEXT:  1      3     1.00           *            movntpd	%xmm0, (%rax)
 # CHECK-NEXT:  1      2     0.50                        movq	%xmm0, %xmm2
@@ -687,275 +687,278 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 66.00  66.00   -      -      -     17.00   -      -     124.50 124.50 66.50  66.50  12.00  50.00  119.50 140.50  -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 66.00  66.00   -      -      -     17.00   -      -     124.50 124.50 66.50  66.50  12.00  50.00  119.50 140.50  -      -      -     58.50  58.50   -     15.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andpd	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     clflush	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmppd	$0, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmppd	$0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpsd	$0, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpsd	$0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comisd	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comisd	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2pd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2ps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2dq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2pi	%xmm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2pi	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2ps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2pd	%mm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2dq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2ss	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2ss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	%ecx, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdq	%rcx, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtss2sd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtss2sd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2dq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2pi	%xmm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2pi	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	(%rax), %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divsd	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lfence
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     maskmovdqu	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movapd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movapd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movapd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movd	%eax, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movd	%xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqa	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movdqa	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqu	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movdqu	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqu	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdq2q	%xmm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movhpd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhpd	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movlpd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movmskpd	%xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movntil	%eax, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movntiq	%rax, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntdq	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntpd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movq	%rax, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movq	%xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movq	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq2dq	%mm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movsd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movupd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movupd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movupd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrw	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufd	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufhw	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufhw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshuflw	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshuflw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslldq	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrldq	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	$1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhqdq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhqdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklqdq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklqdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufpd	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomisd	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomisd	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorpd	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     addpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     addpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     addsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     addsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     andnpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     andnpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     andpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     andpd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     clflush	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     cmppd	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     cmppd	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     cmpsd	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     cmpsd	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     comisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     comisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtpd2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtpd2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtpd2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtpi2pd	%mm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtpi2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtsd2ss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtsd2ss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtsi2sdl	%ecx, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtsi2sdq	%rcx, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvtss2sd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvtss2sd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvttpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvttpd2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvttpd2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     cvttsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvttsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     cvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     divpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     divpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     divsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     divsd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   lfence
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   maskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     maxpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     maxpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     maxsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     maxsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     minpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     minpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     minsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     minsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movapd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movapd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movapd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movd	%eax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     movd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     movdqa	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movdqa	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     movdqu	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movdqu	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movdqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     movdq2q	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movhpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movhpd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movlpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movlpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     movmskpd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movntil	%eax, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movntiq	%rax, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movntdq	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movntpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     movq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movq	%rax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     movq	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     movq2dq	%mm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movupd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movupd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movupd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     mulpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     mulpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     mulsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     mulsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     orpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     orpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     packssdw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     packssdw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     packsswb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     packsswb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     packuswb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     packuswb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddusb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddusb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddusw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddusw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     paddw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     paddw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pand	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pand	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pandn	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pandn	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pavgb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pavgb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pavgw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pavgw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpeqb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpeqd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpeqw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpgtb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpgtb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpgtd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpgtd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpgtw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpgtw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmaddwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmaddwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmaxsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmaxsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmaxub	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmaxub	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pminsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pminsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pminub	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pminub	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmulhuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmulhuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmulhw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmulhw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmullw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmullw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmuludq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmuludq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmuludq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmuludq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     por	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     por	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psadbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psadbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pslld	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pslld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pslld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pslldq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psllq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psllq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psllq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psllw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psllw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psllw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrad	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrad	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psrad	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psraw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psraw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psraw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrld	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psrld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrldq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrlq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrlq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psrlq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrlw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psrlw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psrlw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubusb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubusb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubusw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubusw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psubw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psubw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckhbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckhbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckhdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckhdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckhqdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckhqdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckhwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckhwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpcklbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpcklbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpckldq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpckldq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpcklqdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpcklqdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     punpcklwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     punpcklwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pxor	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pxor	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     shufpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     shufpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     sqrtpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     sqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     sqrtsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     sqrtsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     subpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     subpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     subsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     subsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     ucomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     ucomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     unpckhpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     unpckhpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     unpcklpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     unpcklpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     xorpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     xorpd	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
index ce08757f7de53a1f4d4ce59a2b4b4260ece9d455..aedbd80c033e039adea444b2936668ce74c21417 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
@@ -79,30 +79,33 @@ movsldup  (%rax), %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 5.00   5.00    -      -      -      -      -      -     9.00   9.00   0.50   0.50    -      -     15.50  3.50    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 5.00   5.00    -      -      -      -      -      -     9.00   9.00   0.50   0.50    -      -     15.50  3.50    -      -      -     5.00   5.00    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubpd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubps	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubps	(%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     lddqu	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movddup	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movddup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movshdup	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movshdup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsldup	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsldup	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     addsubpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     addsubpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     addsubps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     addsubps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     haddpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     haddpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     haddps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     haddps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     hsubpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     hsubpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     hsubps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     hsubps	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     lddqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movddup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movddup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movshdup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movshdup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     movsldup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movsldup	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
index d1b36d10b2f62fba9d99e83ce38e3fe27ab93fe9..4aa29de694e014087dda774575df80cca3776957 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
@@ -272,107 +272,110 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 24.50  24.50   -      -      -     6.00    -      -     20.00  20.00  32.50  32.50  10.00  13.00  49.50  50.50   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 24.50  24.50   -      -      -     6.00    -      -     20.00  20.00  32.50  32.50  10.00  13.00  49.50  50.50   -      -      -     22.00  22.00   -     5.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendpd	$11, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendpd	$11, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendps	$11, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendps	$11, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvpd	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvpd	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvps	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvps	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dppd	$22, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dppd	$22, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dpps	$22, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dpps	$22, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     extractps	$1, %xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     extractps	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     insertps	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     insertps	$1, (%rax), %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movntdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     mpsadbw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packusdw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packusdw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pblendvb	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pblendvb	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pblendw	$11, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pblendw	$11, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrb	$1, %xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrb	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrd	$1, %xmm0, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrd	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrq	$1, %xmm0, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrq	$1, %xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     phminposuw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     phminposuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrb	$1, %eax, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrb	$1, (%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrd	$1, %eax, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrd	$1, (%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrq	$1, %rax, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrq	$1, (%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxud	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxud	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxuw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminud	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminud	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminuw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxdq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxdq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuldq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuldq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     pmulld	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     pmulld	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ptest	%xmm0, %xmm1
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ptest	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundpd	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundps	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundsd	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundsd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundss	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundss	$1, (%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     blendpd	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     blendpd	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     blendps	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     blendps	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     blendvpd	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     blendvpd	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     blendvps	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     blendvps	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     dppd	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     dppd	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     dpps	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     dpps	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     extractps	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   extractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     insertps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     insertps	$1, (%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -     0.50   0.50    -      -     mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     packusdw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     packusdw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -      -      -      -     pblendvb	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pblendvb	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pblendw	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pblendw	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpeqq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpeqq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pextrb	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   pextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pextrd	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   pextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     pextrq	$1, %xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   pextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   pextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -      -      -      -     phminposuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -     0.50   0.50    -      -     phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pinsrb	$1, %eax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pinsrb	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pinsrd	$1, %eax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pinsrd	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pinsrq	$1, %rax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pinsrq	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmaxsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmaxsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmaxsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmaxsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmaxud	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmaxud	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmaxuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmaxuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pminsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pminsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pminsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pminsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pminud	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pminud	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pminuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pminuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmuldq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmuldq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -      -      -      -     pmulld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -     0.50   0.50    -      -     pmulld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     ptest	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     ptest	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     roundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     roundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     roundps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     roundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     roundsd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     roundsd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     roundss	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -     0.50   0.50    -      -     roundss	$1, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
index 2d3a0ef4049b5162d4fb737ddc3af17e5809a893..433511f7ee55bcda3396c5ea36c6c3fbb48406ce 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
@@ -81,31 +81,34 @@ pcmpgtq     (%rax), %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 21.00  21.00   -      -      -     28.00  20.00   -     6.00   6.00   9.00   9.00    -      -     1.00   9.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 21.00  21.00   -      -      -     28.00  20.00   -     6.00   6.00   9.00   9.00    -      -     1.00   9.00    -      -      -     17.00  17.00   -     8.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	%al, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32l	%eax, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32l	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32w	%ax, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32w	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	%al, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32q	%rax, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32q	(%rax), %rcx
-# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestri	$1, %xmm0, %xmm2
-# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestri	$1, (%rax), %xmm2
-# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestrm	$1, %xmm0, %xmm2
-# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistri	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistri	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistrm	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtq	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtq	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	%al, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     crc32b	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     crc32l	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     crc32l	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     crc32w	%ax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     crc32w	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	%al, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     crc32b	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     crc32q	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     crc32q	(%rax), %rcx
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -     3.00   3.00    -     2.00   pcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -     3.50   3.50    -     2.00   pcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -     3.00   3.00    -     2.00   pcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -     3.50   3.50    -     2.00   pcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -      -      -      -     pcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     pcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -      -      -      -     pcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     pcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pcmpgtq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pcmpgtq	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
index 55347137df458251e76aba5da1c993fb9594626c..6a33f650b417e145b7af5bea164c0b32f774d91d 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
@@ -46,17 +46,20 @@ movntss     %xmm0, (%rax)
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     5.00   5.00    -     2.00   2.00   4.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     5.00   5.00    -     2.00   2.00   4.00    -      -      -      -      -      -     2.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     extrq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     extrq	$22, $2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     insertq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     insertq	$22, $22, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntsd	%xmm0, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntss	%xmm0, (%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     extrq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     extrq	$22, $2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -      -      -      -     insertq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -      -      -      -     insertq	$22, $22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movntsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movntss	%xmm0, (%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s b/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
index c89ef2976295c55d6898d953af2eada893780c44..3d16508c7130ac73fc13d5e34d43e797a0ee3523 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
@@ -191,75 +191,78 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 16.00  16.00   -      -      -      -      -      -      -      -     34.00  34.00  8.00    -     36.00  28.00   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 16.00  16.00   -      -      -      -      -      -      -      -     34.00  34.00  8.00    -     36.00  28.00   -      -      -     16.00  16.00   -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, %mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, (%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, %xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	%mm0, %mm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pabsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pabsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pabsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pabsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pabsd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pabsd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pabsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pabsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pabsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pabsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     pabsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pabsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     palignr	$1, %mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     palignr	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     palignr	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     palignr	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phaddd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phaddd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phaddd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phaddd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phaddsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phaddsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phaddsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phaddsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phaddw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phaddw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phaddw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phaddw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phsubd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phsubd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phsubd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phsubd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phsubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phsubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phsubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phsubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phsubw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phsubw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     phsubw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     phsubw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmaddubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmaddubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmaddubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmaddubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmulhrsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmulhrsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     pmulhrsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -      -      -      -     pshufb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pshufb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -      -      -      -     pshufb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     pshufb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psignb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psignb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psignb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psignb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psignd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psignd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psignd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psignd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psignw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psignw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     psignw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     psignw	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s b/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
index 0287d973171f632c35a5cd59538625b5eb8641dc..2b7f8d091da9b890b7fc0a737e2a6a075d46e948 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
@@ -131,51 +131,54 @@ tzmsk        (%rax), %rcx
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 1.00   1.00    -      -      -     20.00  20.00   -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 1.00   1.00    -      -      -     20.00  20.00   -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	$8192, %ebx, %ecx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	$8192, (%rbx), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	$16384, %rbx, %rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	$16384, (%rbx), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfilll	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfilll	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfillq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfillq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcil	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcil	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blciq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blciq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfilll	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfilll	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfillq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfillq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskq	%rax, %rcx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	$8192, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bextrl	$8192, (%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	$16384, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bextrq	$16384, (%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcfilll	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcfilll	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcfillq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcfillq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcil	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcil	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blciq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blciq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcicl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcicl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcicq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcicq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcsl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcsl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcsq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blcsq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsfilll	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsfilll	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsfillq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsfillq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsicl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsicl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsicq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsicq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s b/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
index 5a6ee53713cda7c48ac591c0c0e05450a4af33c3..ebb20696a45792bcbe6b80b307ec694ce8d5d933 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
@@ -67,24 +67,27 @@ salc
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     6.50   6.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     6.50   6.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aaa
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aad
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aad	$7
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aam
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aam	$7
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aas
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bound	%bx, (%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bound	%ebx, (%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     daa
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     das
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     into
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leave
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     salc
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     aaa
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     aad
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     aad	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     aam
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     aam	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     aas
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bound	%bx, (%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bound	%ebx, (%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     daa
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     das
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     into
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leave
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     salc
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s b/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
index b72522411b82efef7b8a90940c7b4cfe729c7742..84eec15a04923351c410e69b08989adaa6e64f01 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
@@ -1439,29 +1439,29 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  1      100   0.50                  U     scasl	%es:(%rdi), %eax
 # CHECK-NEXT:  1      100   0.50                  U     scasq	%es:(%rdi), %rax
 # CHECK-NEXT:  1      1     0.50                        seto	%al
-# CHECK-NEXT:  1      1     0.50           *            seto	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            seto	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setno	%al
-# CHECK-NEXT:  1      1     0.50           *            setno	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            setno	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setb	%al
-# CHECK-NEXT:  1      1     0.50           *            setb	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            setb	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setae	%al
-# CHECK-NEXT:  1      1     0.50           *            setae	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            setae	(%rax)
 # CHECK-NEXT:  1      1     0.50                        sete	%al
-# CHECK-NEXT:  1      1     0.50           *            sete	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            sete	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setne	%al
-# CHECK-NEXT:  1      1     0.50           *            setne	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            setne	(%rax)
 # CHECK-NEXT:  1      1     0.50                        seta	%al
-# CHECK-NEXT:  1      1     0.50           *            seta	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            seta	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setbe	%al
-# CHECK-NEXT:  1      1     0.50           *            setbe	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            setbe	(%rax)
 # CHECK-NEXT:  1      1     0.50                        sets	%al
-# CHECK-NEXT:  1      1     0.50           *            sets	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            sets	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setns	%al
-# CHECK-NEXT:  1      1     0.50           *            setns	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            setns	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setp	%al
-# CHECK-NEXT:  1      1     0.50           *            setp	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            setp	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setnp	%al
-# CHECK-NEXT:  1      1     0.50           *            setnp	(%rax)
+# CHECK-NEXT:  1      1     1.00           *            setnp	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setl	%al
 # CHECK-NEXT:  2      1     1.00           *            setl	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setge	%al
@@ -1628,757 +1628,760 @@ xorq (%rax), %rdi
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 262.00 262.00  -      -     246.00 547.50 622.50  -      -      -      -      -      -      -      -      -      -      -      -     64.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 262.00 262.00  -      -     246.00 547.50 622.50  -      -      -      -      -      -      -      -      -      -      -      -     156.50 156.50 64.00  211.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	%sil, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	%sil, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	(%rax), %dil
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	%si, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	%si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	%esi, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	%esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	%rsi, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	%rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	%sil, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	%sil, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	(%rax), %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%si, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	%sil, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	%sil, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	(%rax), %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	%si, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	%si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	%esi, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	%esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	%rsi, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	%rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrw	%si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrl	%esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfl	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrq	%rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bswapl	%eax
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bswapq	%rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	%si, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	%si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	%si, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	%si, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	%si, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	%si, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	$7, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	$7, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	$7, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	$7, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	%esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	%esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	%esi, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	%esi, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	%esi, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	%esi, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	$7, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	$7, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	$7, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	$7, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	%rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	%rsi, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	%rsi, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	%rsi, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	%rsi, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	$7, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	$7, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	$7, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	$7, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cbtw
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cwtl
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cltq
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cwtd
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cltd
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cqto
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     clc
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cld
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmc
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, %dil
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	%sil, %dil
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	%sil, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	(%rax), %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$7, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	%si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	%si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$7, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$7, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsb	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsw	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsl	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsq	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb	%cl, %bl
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb	%cl, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw	%cx, %bx
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw	%cx, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl	%ecx, %ebx
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl	%ecx, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq	%rcx, %rbx
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq	%rcx, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cpuid
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decb	%dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decb	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decw	%di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decl	%edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decl	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decq	%rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decq	(%rax)
-# CHECK-NEXT:  -      -      -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divb	%dil
-# CHECK-NEXT: 0.50   0.50    -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divb	(%rax)
-# CHECK-NEXT:  -      -      -      -     15.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divw	%si
-# CHECK-NEXT: 0.50   0.50    -      -     15.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divw	(%rax)
-# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divl	%edx
-# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divl	(%rax)
-# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divq	%rcx
-# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divq	(%rax)
-# CHECK-NEXT:  -      -      -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	%dil
-# CHECK-NEXT: 0.50   0.50    -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	(%rax)
-# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	%si
-# CHECK-NEXT: 0.50   0.50    -      -     17.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	(%rax)
-# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%edx
-# CHECK-NEXT: 0.50   0.50    -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	(%rax)
-# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	%rcx
-# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulb	%dil
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulb	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%di
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$511, %si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$511, (%rax), %di
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$7, %si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$7, (%rax), %di
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edi
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$665536, %esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$665536, (%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$7, %esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$7, (%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	%rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	%rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$665536, %rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$665536, (%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$7, %rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$7, (%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inb	%dx, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inw	$7, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inw	%dx, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inl	$7, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inl	%dx, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incb	%dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incb	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incw	%di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incl	%edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incl	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incq	%rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incq	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insb	%dx, %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insw	%dx, %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insl	%dx, %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     int	$7
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lahf
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsb	(%rsi), %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsw	(%rsi), %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsl	(%rsi), %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsq	(%rsi), %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsb	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsw	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsl	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsq	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbw	%al, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbw	%al, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbw	(%rax), %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbl	%al, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbl	%al, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbl	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbq	%al, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbq	%al, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswl	%ax, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwl	%ax, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswl	(%rax), %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswq	%ax, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwq	%ax, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswq	(%rax), %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movslq	%eax, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movslq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulb	%dil
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulb	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulw	%si
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mull	%edx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mull	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   mulq	%rcx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   mulq	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negb	%dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negb	(%r8)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negw	%si
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negw	(%r9)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negl	%edx
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negl	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negq	%rcx
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negq	(%r10)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nop
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopw	%di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopw	(%rcx)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopl	%esi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopl	(%r8)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopq	%rdx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopq	(%r9)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notb	%dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notb	(%r8)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notw	%si
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notw	(%r9)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notl	%edx
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notl	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notq	%rcx
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notq	(%r10)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	%sil, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	%sil, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	(%rax), %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	%si, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	%si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	%esi, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	%esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	%rsi, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	%rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outb	%al, $7
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outb	%al, %dx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outw	%ax, $7
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outw	%ax, %dx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outl	%eax, $7
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outl	%eax, %dx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsb	(%rsi), %dx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsw	(%rsi), %dx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsl	(%rsi), %dx
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     pause
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	$7, %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%cl, %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%cl, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	$7, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%cl, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%cl, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	$7, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%cl, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%cl, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	$7, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%cl, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%cl, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	$7, %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%cl, %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%cl, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	$7, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%cl, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%cl, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	$7, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%cl, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%cl, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	$7, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%cl, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%cl, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sahf
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	$7, %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	$7, %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%cl, %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%cl, %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%cl, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	$7, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	$7, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%cl, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%cl, %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%cl, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	$7, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	$7, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%cl, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%cl, %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%cl, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	(%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	$7, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	$7, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	$7, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%cl, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%cl, %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%cl, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%cl, (%rax)
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%cl, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	%sil, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	%sil, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	(%rax), %dil
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	%si, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	%si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%esi, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	%rsi, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	%rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasb	%es:(%rdi), %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasw	%es:(%rdi), %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasl	%es:(%rdi), %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasq	%es:(%rdi), %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seto	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seto	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setno	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setno	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setb	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setb	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setae	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setae	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sete	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sete	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setne	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setne	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seta	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seta	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setbe	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setbe	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sets	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sets	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setns	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setns	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setp	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setp	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setnp	%al
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setnp	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setl	%al
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setl	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setge	%al
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setge	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setg	%al
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setg	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setle	%al
-# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setle	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	%cl, %si, %di
-# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	%cl, %si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	%cl, %si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	%cl, %si, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	$7, %si, %di
-# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	$7, %si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	$7, %si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	$7, %si, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	%cl, %esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	%cl, %esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	%cl, %esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	%cl, %esi, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	$7, %esi, %edi
-# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	$7, %esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	$7, %esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	$7, %esi, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	%cl, %rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	%cl, %rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	%cl, %rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	%cl, %rsi, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	$7, %rsi, %rdi
-# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	$7, %rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	$7, %rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	$7, %rsi, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stc
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     std
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosb	%al, %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosw	%ax, %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosl	%eax, %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosq	%rax, %es:(%rdi)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	%sil, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	%sil, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	(%rax), %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	%si, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	%si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%esi, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rsi, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, %dil
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	%sil, %dil
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	%sil, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$7, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	%si, %di
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	%si, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$7, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	%esi, %edi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	%esi, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$7, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	%rsi, %rdi
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	%rsi, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     ud2
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddb	%bl, %cl
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddb	%bl, (%rcx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddw	%bx, %cx
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddw	%ax, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddl	%ebx, %ecx
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddl	%eax, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddq	%rbx, %rcx
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddq	%rax, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgb	%bl, %cl
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgb	%bl, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%bx, %ax
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%bx, %cx
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%ax, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%ebx, %eax
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%ebx, %ecx
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%eax, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rbx, %rax
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rbx, %rcx
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rax, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xlatb
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, %al
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	%sil, %dil
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	%sil, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	(%rax), %dil
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, %ax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$7, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%si, %di
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%si, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, %eax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$7, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%esi, %edi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%esi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, %rax
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$7, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$7, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rsi, %rdi
-# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rsi, (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	(%rax), %rdi
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     adcb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     adcw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     adcl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   adcq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     adcq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     addb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     addw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     addl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   addq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     addq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     andb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     andw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     andl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   andq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     andq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bsfw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bsrw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bsfw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bsrw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bsfl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bsrl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bsfl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bsrl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bsfq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bsrq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bsfq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     bsrq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bswapl	%eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     bswapq	%rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     btw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btcw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btrw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btsw	%si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     btw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btcw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btrw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btsw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     btl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btcl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btrl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btsl	%esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     btl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btcl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btrl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btsl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     btq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btcq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btrq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btsq	%rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     btq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btcq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btrq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   btsq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cbtw
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cwtl
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cltq
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cwtd
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cltd
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cqto
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     clc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cld
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	%sil, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     cmpq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsb	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsw	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsl	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsq	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb	%cl, %bl
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb	%cl, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw	%cx, %bx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw	%cx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl	%ecx, %ebx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl	%ecx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq	%rcx, %rbx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq	%rcx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cpuid
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     decb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   decb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     decw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   decw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     decl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   decl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     decq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   decq	(%rax)
+# CHECK-NEXT:  -      -      -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     divb	(%rax)
+# CHECK-NEXT:  -      -      -      -     15.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divw	%si
+# CHECK-NEXT: 0.50   0.50    -      -     15.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     divw	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divl	%edx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     divl	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     divq	(%rax)
+# CHECK-NEXT:  -      -      -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     idivb	(%rax)
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	%si
+# CHECK-NEXT: 0.50   0.50    -      -     17.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     idivw	(%rax)
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%edx
+# CHECK-NEXT: 0.50   0.50    -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     idivl	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     idivq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imulb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imulb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imulw	%di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imulw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imulw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imulw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imulw	$511, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imulw	$511, (%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imulw	$7, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imulw	$7, (%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imull	%edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imull	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imull	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imull	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imull	$665536, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imull	$665536, (%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     imull	$7, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     imull	$7, (%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00    -     imulq	%rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   4.00    -     imulq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00    -     imulq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   4.00    -     imulq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00    -     imulq	$665536, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   4.00    -     imulq	$665536, (%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00    -     imulq	$7, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   4.00    -     imulq	$7, (%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     inb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     inb	%dx, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     inw	$7, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     inw	%dx, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     inl	$7, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     inl	%dx, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     incb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   incb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     incw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   incw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     incl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   incl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     incq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   incq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     insb	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     insw	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     insl	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     int	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lahf
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lodsb	(%rsi), %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lodsw	(%rsi), %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lodsl	(%rsi), %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lodsq	(%rsi), %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movsb	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movsw	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movsl	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movsq	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movsbw	%al, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movzbw	%al, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movsbw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movzbw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movsbl	%al, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movzbl	%al, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movsbl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movzbl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movsbq	%al, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movzbq	%al, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movsbq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movzbq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movswl	%ax, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movzwl	%ax, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movswl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movzwl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movswq	%ax, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movzwq	%ax, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movswq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movzwq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movslq	%eax, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movslq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     mulb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     mulb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     mulw	%si
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     mulw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     mull	%edx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     mull	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00    -     mulq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   4.00    -     mulq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     negb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   negb	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     negw	%si
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   negw	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     negl	%edx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   negl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     negq	%rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   negq	(%r10)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     nop
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     nopw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     nopw	(%rcx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     nopl	%esi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     nopl	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     nopq	%rdx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     nopq	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     notb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   notb	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     notw	%si
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   notw	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     notl	%edx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   notl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     notq	%rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   notq	(%r10)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     orb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     orw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     orl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     orq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   orq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     orq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outb	%al, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outb	%al, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outw	%ax, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outw	%ax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outl	%eax, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outl	%eax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outsb	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outsw	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     outsl	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pause
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rclq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rcrq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   roll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     roll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   roll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   roll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rolq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   rorq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sahf
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarl	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarl	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sarq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shlq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   shrq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     sbbb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     sbbw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     sbbl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   sbbq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     sbbq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     scasb	%es:(%rdi), %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     scasw	%es:(%rdi), %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     scasl	%es:(%rdi), %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     scasq	%es:(%rdi), %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     seto	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   seto	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setno	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   setno	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setb	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   setb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setae	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   setae	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sete	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   sete	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setne	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   setne	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     seta	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   seta	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setbe	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   setbe	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sets	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   sets	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setns	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   setns	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setp	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   setp	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setnp	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   setnp	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setl	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setge	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setge	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setg	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setg	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setle	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     setle	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	%cl, %si, %di
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	%cl, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shldw	%cl, %si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shrdw	%cl, %si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	$7, %si, %di
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	$7, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shldw	$7, %si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shrdw	$7, %si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	%cl, %esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	%cl, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shldl	%cl, %esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shrdl	%cl, %esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	$7, %esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	$7, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shldl	$7, %esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shrdl	$7, %esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	%cl, %rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	%cl, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shldq	%cl, %rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shrdq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	$7, %rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	$7, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shldq	$7, %rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     shrdq	$7, %rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     stc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     std
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     stosb	%al, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     stosw	%ax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     stosl	%eax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     stosq	%rax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     subb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     subw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     subl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   subq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     subq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testb	%sil, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testb	%sil, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testw	%si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testl	%esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     testq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     testq	%rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     ud2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xaddb	%bl, %cl
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xaddb	%bl, (%rcx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xaddw	%bx, %cx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xaddw	%ax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xaddl	%ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xaddl	%eax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xaddq	%rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xaddq	%rax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xchgb	%bl, %cl
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xchgb	%bl, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%bx, %ax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%bx, %cx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xchgw	%ax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%ebx, %ecx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xchgl	%eax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rbx, %rax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rbx, %rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xchgq	%rax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xlatb
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     xorb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     xorw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     xorl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     1.00   xorq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     xorq	(%rax), %rdi
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x87.s b/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
index f64944cb11285efa56a66ea97b7bd1f7bc4dcd66..ad72714c74c1bbaf47a96b3b57ef85430ef9125a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
@@ -267,14 +267,14 @@ fyl2xp1
 # CHECK-NEXT:  1      5     0.50    *             U     fildll	(%eax)
 # CHECK-NEXT:  1      100   0.50                  U     fincstp
 # CHECK-NEXT:  1      100   0.50                  U     fninit
-# CHECK-NEXT:  1      1     0.50           *      U     fists	(%edx)
-# CHECK-NEXT:  1      1     0.50           *      U     fistl	(%ecx)
-# CHECK-NEXT:  1      1     0.50           *      U     fistps	(%edx)
-# CHECK-NEXT:  1      1     0.50           *      U     fistpl	(%ecx)
-# CHECK-NEXT:  1      1     0.50           *      U     fistpll	(%eax)
-# CHECK-NEXT:  1      1     0.50           *      U     fisttps	(%edx)
-# CHECK-NEXT:  1      1     0.50           *      U     fisttpl	(%ecx)
-# CHECK-NEXT:  1      1     0.50           *      U     fisttpll	(%eax)
+# CHECK-NEXT:  1      1     1.00           *      U     fists	(%edx)
+# CHECK-NEXT:  1      1     1.00           *      U     fistl	(%ecx)
+# CHECK-NEXT:  1      1     1.00           *      U     fistps	(%edx)
+# CHECK-NEXT:  1      1     1.00           *      U     fistpl	(%ecx)
+# CHECK-NEXT:  1      1     1.00           *      U     fistpll	(%eax)
+# CHECK-NEXT:  1      1     1.00           *      U     fisttps	(%edx)
+# CHECK-NEXT:  1      1     1.00           *      U     fisttpl	(%ecx)
+# CHECK-NEXT:  1      1     1.00           *      U     fisttpll	(%eax)
 # CHECK-NEXT:  1      1     0.50                  U     fld	%st(0)
 # CHECK-NEXT:  1      5     0.50    *             U     flds	(%edx)
 # CHECK-NEXT:  1      5     0.50    *             U     fldl	(%ecx)
@@ -309,12 +309,12 @@ fyl2xp1
 # CHECK-NEXT:  1      100   0.50                  U     fsincos
 # CHECK-NEXT:  1      1     17.50                 U     fsqrt
 # CHECK-NEXT:  1      1     0.50                  U     fst	%st(0)
-# CHECK-NEXT:  1      1     0.50           *      U     fsts	(%edx)
-# CHECK-NEXT:  1      1     0.50           *      U     fstl	(%ecx)
+# CHECK-NEXT:  1      1     1.00           *      U     fsts	(%edx)
+# CHECK-NEXT:  1      1     1.00           *      U     fstl	(%ecx)
 # CHECK-NEXT:  1      1     0.50                  U     fstp	%st(0)
-# CHECK-NEXT:  1      1     0.50           *      U     fstpl	(%edx)
-# CHECK-NEXT:  1      1     0.50           *      U     fstpl	(%ecx)
-# CHECK-NEXT:  1      1     0.50           *      U     fstpt	(%eax)
+# CHECK-NEXT:  1      1     1.00           *      U     fstpl	(%edx)
+# CHECK-NEXT:  1      1     1.00           *      U     fstpl	(%ecx)
+# CHECK-NEXT:  1      1     1.00           *      U     fstpt	(%eax)
 # CHECK-NEXT:  1      1     0.50           *      U     fnstcw	(%eax)
 # CHECK-NEXT:  1      100   0.50                  U     fnstenv	(%eax)
 # CHECK-NEXT:  1      100   0.50                  U     fnstsw	(%eax)
@@ -375,159 +375,162 @@ fyl2xp1
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 24.00  24.00   -      -      -     36.00  20.00   -     201.50 201.50  -      -      -     7.00   48.00  40.00   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 24.00  24.00   -      -      -     36.00  20.00   -     201.50 201.50  -      -      -     7.00   48.00  40.00   -      -      -     17.50  17.50   -     13.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     f2xm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fabs
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadd	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadd	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadds	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddl	(%ecx)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddp	%st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddp	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fiadds	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fiaddl	(%ecx)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fbld	(%ecx)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fbstp	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fchs
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnclex
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovb	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovbe	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmove	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnb	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnbe	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovne	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnu	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovu	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcom	%st(1)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcom	%st(3)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcoms	(%ecx)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcoml	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomp	%st(1)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomp	%st(3)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcomps	(%ecx)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcompl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fcompp
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomi	%st(3)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcompi	%st(3)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fcos
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fdecstp
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdiv	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdiv	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivs	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivp	%st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivp	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivs	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivr	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivr	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrs	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrp	%st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrp	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivrs	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivrl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     ffree	%st(0)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficoms	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficoml	(%eax)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficomps	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficompl	(%eax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     filds	(%edx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fildl	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fildll	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fincstp
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fninit
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fists	(%edx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistl	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistps	(%edx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistpl	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistpll	(%eax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttps	(%edx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttpl	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttpll	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fld	%st(0)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     flds	(%edx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldl	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldt	(%eax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldcw	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fldenv	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fld1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldl2e
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldl2t
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldlg2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldln2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldpi
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldz
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmul	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmul	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmuls	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmull	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmulp	%st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmulp	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fimuls	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fimull	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnop
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fpatan
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fprem
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fprem1
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fptan
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frndint
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frstor	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnsave	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fscale
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fsin
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fsincos
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     17.50  17.50   -      -      -      -      -     1.00    -      -      -      -     fsqrt
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fst	%st(0)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fsts	(%edx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstl	(%ecx)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fstp	%st(0)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpl	(%edx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpl	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpt	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstcw	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstenv	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstsw	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frstor	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     wait
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnsave	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsub	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsub	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubs	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubp	%st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubp	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubs	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubr	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubr	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrs	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrp	%st(1)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrp	%st(2)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubrs	(%ecx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubrl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     ftst
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucom	%st(1)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucom	%st(3)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomp	%st(1)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomp	%st(3)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fucompp
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomi	%st(3)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucompi	%st(3)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     wait
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxam
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxch	%st(1)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxch	%st(3)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxrstor	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxsave	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxtract
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fyl2x
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fyl2xp1
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     f2xm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fabs
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fadd	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fadd	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fadds	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     faddl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     faddp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     faddp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fiadds	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fiaddl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fbld	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fbstp	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fchs
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fnclex
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcmovb	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcmovbe	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcmove	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcmovnb	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcmovnbe	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcmovne	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcmovnu	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcmovu	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcom	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcom	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fcoms	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fcoml	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcomp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcomp	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fcomps	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fcompl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fcompp
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcomi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fcompi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fcos
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fdecstp
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fdiv	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fdiv	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fdivs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fdivl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fdivp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fdivp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fidivs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fidivl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fdivr	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fdivr	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fdivrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fdivrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fdivrp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fdivrp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fidivrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fidivrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     ffree	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     ficoms	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     ficoml	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     ficomps	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     ficompl	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     filds	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     fildl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     fildll	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fincstp
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fninit
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fists	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fistl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fistps	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fistpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fistpll	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fisttps	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fisttpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fisttpll	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fld	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     flds	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     fldl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     fldt	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     fldcw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldenv	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     fld1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     fldl2e
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     fldl2t
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     fldlg2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     fldln2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     fldpi
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     fldz
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fmul	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fmul	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fmuls	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fmull	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fmulp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fmulp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fimuls	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -     0.50   0.50    -      -     fimull	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fnop
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fpatan
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fprem
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fprem1
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fptan
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     frndint
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     frstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fnsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fscale
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fsin
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fsincos
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     17.50  17.50   -      -      -      -      -     1.00    -      -      -      -      -      -      -     fsqrt
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fst	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fsts	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fstl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstp	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fstpl	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fstpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   fstpt	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fnstcw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fnstenv	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fnstsw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     frstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     wait
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fnsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fsub	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fsub	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fsubs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fsubl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fsubp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fsubp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fisubs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fisubl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fsubr	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fsubr	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fsubrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fsubrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fsubrp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fsubrp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fisubrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -     0.50   0.50    -      -     fisubrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     ftst
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fucom	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fucom	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fucomp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fucomp	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -      -     fucompp
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fucomi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -     fucompi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     wait
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fxam
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fxch	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fxch	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fxrstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fxsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fxtract
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fyl2x
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fyl2xp1
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-xop.s b/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
index 306917defb10257b21bf562b2c4c64caa23ef003..88d15c958bc9474bce2b85ea61cf421db59b66f6 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
@@ -389,158 +389,161 @@ vpshlw %xmm0, (%rax), %xmm3
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT: 41.50  41.50   -      -      -      -      -      -     30.00  30.00  60.00  60.00  36.00  12.00  100.50 80.50   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 41.50  41.50   -      -      -      -      -      -     30.00  30.00  60.00  60.00  36.00  12.00  100.50 80.50   -      -      -     41.50  41.50   -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczpd	%ymm0, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczpd	(%rax), %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczps	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczps	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczps	%ymm0, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczps	(%rax), %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczsd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczsd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczss	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczss	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	(%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	(%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomb	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomb	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomd	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomd	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomq	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomq	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomub	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomub	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomud	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomud	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuq	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuq	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuw	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuw	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomw	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomw	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbd	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbq	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbw	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbw	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadddq	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadddq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubd	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubq	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubw	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubw	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddudq	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddudq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwd	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwq	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwd	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwq	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubbw	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubbw	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubdq	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubdq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubwd	%xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubwd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	(%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	$0, %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	$0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	$0, %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	$0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	$0, %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	$0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	$0, %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	$0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczpd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczpd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vfrczpd	%ymm0, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vfrczpd	(%rax), %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczps	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczps	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vfrczps	%ymm0, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -      -      -      -     vfrczps	(%rax), %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczsd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczsd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczss	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczss	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmov	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcmov	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     1.00   1.00    -      -     vpcmov	(%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     1.00   1.00    -      -     vpcmov	%ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcomb	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcomb	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcomd	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcomd	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcomq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcomq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcomub	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcomub	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcomud	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcomud	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcomuq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcomuq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcomuw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcomuw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpcomw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpcomw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -      -      -      -     vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -      -      -      -     vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -     1.00   1.00    -      -     vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddbd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddbd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddbq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddbq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddbw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddbw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphadddq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphadddq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddubd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddubd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddubq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddubq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddubw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddubw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddudq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddudq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphadduwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphadduwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphadduwq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphadduwq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphaddwq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphaddwq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphsubbw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphsubbw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphsubdq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphsubdq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vphsubwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vphsubwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -      -      -      -     vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -     0.50   0.50    -      -     vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -      -      -      -     vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -     0.50   0.50    -      -     vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -      -      -      -     vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -     0.50   0.50    -      -     vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -      -      -      -     vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -     0.50   0.50    -      -     vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -      -      -      -     vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -     0.50   0.50    -      -     vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -      -      -      -     vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -     0.50   0.50    -      -     vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     0.50   0.50    -      -     vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -      -      -      -     vpperm	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpperm	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpperm	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vprotb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vprotb	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotb	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vprotd	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotd	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotd	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vprotd	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotd	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vprotq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vprotq	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotq	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vprotw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vprotw	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vprotw	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshab	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshab	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshab	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshad	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshad	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshad	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshaq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshaq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshaq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshaw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshaw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshaw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshlb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshlb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshlb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshld	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshld	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshld	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshlq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshlq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshlq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -      -      -      -     vpshlw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshlw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -     0.50   0.50    -      -     vpshlw	%xmm0, (%rax), %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s b/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
index f1a7a47b47a4a7c2739fa966d2955f5e37202790..db7a373a23ead35e469da4714533141bba49d671 100644
--- a/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
+++ b/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
@@ -41,7 +41,7 @@ add  %rsi, %rsi
 # CHECK-NEXT: PdEX             0          2          40
 # CHECK-NEXT: PdFPU            0          1          64
 # CHECK-NEXT: PdLoad           0          1          40
-# CHECK-NEXT: PdStore          0          1          24
+# CHECK-NEXT: PdStore          0          0          24
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -63,13 +63,16 @@ add  %rsi, %rsi
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -     1.00    -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -     1.00    -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	(%rsi), %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, %rsi
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     1.00    -      -     vmulps	(%rsi), %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, %rsi
diff --git a/test/tools/llvm-mca/X86/BdVer2/simple-test.s b/test/tools/llvm-mca/X86/BdVer2/simple-test.s
index 562bfbb0c07148a3a3c5b5cbe53e9eb3d9bae90c..7246a384f2639e28cef38196bdeec792721d56ef 100644
--- a/test/tools/llvm-mca/X86/BdVer2/simple-test.s
+++ b/test/tools/llvm-mca/X86/BdVer2/simple-test.s
@@ -44,12 +44,15 @@ add %edi, %eax
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edi, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edi, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/store-throughput.s b/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
new file mode 100644
index 0000000000000000000000000000000000000000..67f13c3ccdda2d0e6227281c6aa8ebd193026949
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
@@ -0,0 +1,848 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -scheduler-stats -dispatch-stats -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN
+movb %spl, (%rax)
+movb %bpl, (%rcx)
+movb %sil, (%rdx)
+movb %dil, (%rbx)
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movw %sp, (%rax)
+movw %bp, (%rcx)
+movw %si, (%rdx)
+movw %di, (%rbx)
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movl %esp, (%rax)
+movl %ebp, (%rcx)
+movl %esi, (%rdx)
+movl %edi, (%rbx)
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movq %rsp, (%rax)
+movq %rbp, (%rcx)
+movq %rsi, (%rdx)
+movq %rdi, (%rbx)
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movd %mm0, (%rax)
+movd %mm1, (%rcx)
+movd %mm2, (%rdx)
+movd %mm3, (%rbx)
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+movaps %xmm0, (%rax)
+movaps %xmm1, (%rcx)
+movaps %xmm2, (%rdx)
+movaps %xmm3, (%rbx)
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN
+vmovaps %ymm0, (%rax)
+vmovaps %ymm1, (%rcx)
+vmovaps %ymm2, (%rdx)
+vmovaps %ymm3, (%rbx)
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00           *            movb	%spl, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movb	%bpl, (%rcx)
+# CHECK-NEXT:  1      1     1.00           *            movb	%sil, (%rdx)
+# CHECK-NEXT:  1      1     1.00           *            movb	%dil, (%rbx)
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              25  (6.2%)
+# CHECK-NEXT:  1,              370  (91.8%)
+# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  4,              7  (1.7%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          3  (0.7%)
+# CHECK-NEXT:  1,          400  (99.3%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdFPU            0          0          64
+# CHECK-NEXT: PdLoad           0          0          40
+# CHECK-NEXT: PdStore          23         24         24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movb	%spl, (%rax)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movb	%bpl, (%rcx)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movb	%sil, (%rdx)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movb	%dil, (%rbx)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   movb	%spl, (%rax)
+# CHECK-NEXT: [0,1]     D=eER..   movb	%bpl, (%rcx)
+# CHECK-NEXT: [0,2]     D==eER.   movb	%sil, (%rdx)
+# CHECK-NEXT: [0,3]     D===eER   movb	%dil, (%rbx)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movb	%spl, (%rax)
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       movb	%bpl, (%rcx)
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       movb	%sil, (%rdx)
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       movb	%dil, (%rbx)
+
+# CHECK:      [1] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00           *            movw	%sp, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movw	%bp, (%rcx)
+# CHECK-NEXT:  1      1     1.00           *            movw	%si, (%rdx)
+# CHECK-NEXT:  1      1     1.00           *            movw	%di, (%rbx)
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              25  (6.2%)
+# CHECK-NEXT:  1,              370  (91.8%)
+# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  4,              7  (1.7%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          3  (0.7%)
+# CHECK-NEXT:  1,          400  (99.3%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdFPU            0          0          64
+# CHECK-NEXT: PdLoad           0          0          40
+# CHECK-NEXT: PdStore          23         24         24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movw	%sp, (%rax)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movw	%bp, (%rcx)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movw	%si, (%rdx)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movw	%di, (%rbx)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   movw	%sp, (%rax)
+# CHECK-NEXT: [0,1]     D=eER..   movw	%bp, (%rcx)
+# CHECK-NEXT: [0,2]     D==eER.   movw	%si, (%rdx)
+# CHECK-NEXT: [0,3]     D===eER   movw	%di, (%rbx)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movw	%sp, (%rax)
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       movw	%bp, (%rcx)
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       movw	%si, (%rdx)
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       movw	%di, (%rbx)
+
+# CHECK:      [2] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00           *            movl	%esp, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movl	%ebp, (%rcx)
+# CHECK-NEXT:  1      1     1.00           *            movl	%esi, (%rdx)
+# CHECK-NEXT:  1      1     1.00           *            movl	%edi, (%rbx)
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              25  (6.2%)
+# CHECK-NEXT:  1,              370  (91.8%)
+# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  4,              7  (1.7%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          3  (0.7%)
+# CHECK-NEXT:  1,          400  (99.3%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdFPU            0          0          64
+# CHECK-NEXT: PdLoad           0          0          40
+# CHECK-NEXT: PdStore          23         24         24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movl	%esp, (%rax)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movl	%ebp, (%rcx)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movl	%esi, (%rdx)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movl	%edi, (%rbx)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   movl	%esp, (%rax)
+# CHECK-NEXT: [0,1]     D=eER..   movl	%ebp, (%rcx)
+# CHECK-NEXT: [0,2]     D==eER.   movl	%esi, (%rdx)
+# CHECK-NEXT: [0,3]     D===eER   movl	%edi, (%rbx)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movl	%esp, (%rax)
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       movl	%ebp, (%rcx)
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       movl	%esi, (%rdx)
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       movl	%edi, (%rbx)
+
+# CHECK:      [3] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00           *            movq	%rsp, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movq	%rbp, (%rcx)
+# CHECK-NEXT:  1      1     1.00           *            movq	%rsi, (%rdx)
+# CHECK-NEXT:  1      1     1.00           *            movq	%rdi, (%rbx)
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              25  (6.2%)
+# CHECK-NEXT:  1,              370  (91.8%)
+# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  4,              7  (1.7%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          3  (0.7%)
+# CHECK-NEXT:  1,          400  (99.3%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdFPU            0          0          64
+# CHECK-NEXT: PdLoad           0          0          40
+# CHECK-NEXT: PdStore          23         24         24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     4.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movq	%rsp, (%rax)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movq	%rbp, (%rcx)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movq	%rsi, (%rdx)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     1.00   movq	%rdi, (%rbx)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   movq	%rsp, (%rax)
+# CHECK-NEXT: [0,1]     D=eER..   movq	%rbp, (%rcx)
+# CHECK-NEXT: [0,2]     D==eER.   movq	%rsi, (%rdx)
+# CHECK-NEXT: [0,3]     D===eER   movq	%rdi, (%rbx)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movq	%rsp, (%rax)
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       movq	%rbp, (%rcx)
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       movq	%rsi, (%rdx)
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       movq	%rdi, (%rbx)
+
+# CHECK:      [4] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     1.00           *      U     movd	%mm0, (%rax)
+# CHECK-NEXT:  1      2     1.00           *      U     movd	%mm1, (%rcx)
+# CHECK-NEXT:  1      2     1.00           *      U     movd	%mm2, (%rdx)
+# CHECK-NEXT:  1      2     1.00           *      U     movd	%mm3, (%rbx)
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          747  (93.0%)
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              422  (52.6%)
+# CHECK-NEXT:  1,              374  (46.6%)
+# CHECK-NEXT:  2,              1  (0.1%)
+# CHECK-NEXT:  4,              6  (0.7%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          403  (50.2%)
+# CHECK-NEXT:  1,          400  (49.8%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdFPU            22         23         64
+# CHECK-NEXT: PdLoad           0          0          40
+# CHECK-NEXT: PdStore          23         24         24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -     4.00    -     4.00    -      -      -      -      -      -     4.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movd	%mm0, (%rax)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movd	%mm1, (%rcx)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movd	%mm2, (%rdx)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movd	%mm3, (%rbx)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .   movd	%mm0, (%rax)
+# CHECK-NEXT: [0,1]     D==eeER   .   movd	%mm1, (%rcx)
+# CHECK-NEXT: [0,2]     D====eeER .   movd	%mm2, (%rdx)
+# CHECK-NEXT: [0,3]     D======eeER   movd	%mm3, (%rbx)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movd	%mm0, (%rax)
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       movd	%mm1, (%rcx)
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       movd	%mm2, (%rdx)
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       movd	%mm3, (%rbx)
+
+# CHECK:      [5] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00           *            movaps	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movaps	%xmm1, (%rcx)
+# CHECK-NEXT:  1      1     1.00           *            movaps	%xmm2, (%rdx)
+# CHECK-NEXT:  1      1     1.00           *            movaps	%xmm3, (%rbx)
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              25  (6.2%)
+# CHECK-NEXT:  1,              370  (91.8%)
+# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  4,              7  (1.7%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          3  (0.7%)
+# CHECK-NEXT:  1,          400  (99.3%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdFPU            22         23         64
+# CHECK-NEXT: PdLoad           0          0          40
+# CHECK-NEXT: PdStore          23         24         24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -     4.00    -     4.00    -      -      -      -      -      -     4.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movaps	%xmm0, (%rax)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movaps	%xmm1, (%rcx)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movaps	%xmm2, (%rdx)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   movaps	%xmm3, (%rbx)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   movaps	%xmm0, (%rax)
+# CHECK-NEXT: [0,1]     D=eER..   movaps	%xmm1, (%rcx)
+# CHECK-NEXT: [0,2]     D==eER.   movaps	%xmm2, (%rdx)
+# CHECK-NEXT: [0,3]     D===eER   movaps	%xmm3, (%rbx)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       movaps	%xmm0, (%rax)
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       movaps	%xmm1, (%rcx)
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       movaps	%xmm2, (%rdx)
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       movaps	%xmm3, (%rbx)
+
+# CHECK:      [6] Code Region
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        1600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  4      1     1.00           *            vmovaps	%ymm0, (%rax)
+# CHECK-NEXT:  4      1     1.00           *            vmovaps	%ymm1, (%rcx)
+# CHECK-NEXT:  4      1     1.00           *            vmovaps	%ymm2, (%rdx)
+# CHECK-NEXT:  4      1     1.00           *            vmovaps	%ymm3, (%rbx)
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              3  (0.7%)
+# CHECK-NEXT:  4,              400  (99.3%)
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          3  (0.7%)
+# CHECK-NEXT:  1,          400  (99.3%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             1          1          40
+# CHECK-NEXT: PdFPU            1          1          64
+# CHECK-NEXT: PdLoad           0          0          40
+# CHECK-NEXT: PdStore          2          2          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -     4.00    -     4.00    -      -      -      -      -      -     4.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%ymm0, (%rax)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%ymm1, (%rcx)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%ymm2, (%rdx)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -     1.00   vmovaps	%ymm3, (%rbx)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   vmovaps	%ymm0, (%rax)
+# CHECK-NEXT: [0,1]     .DeER..   vmovaps	%ymm1, (%rcx)
+# CHECK-NEXT: [0,2]     . DeER.   vmovaps	%ymm2, (%rdx)
+# CHECK-NEXT: [0,3]     .  DeER   vmovaps	%ymm3, (%rbx)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vmovaps	%ymm0, (%rax)
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       vmovaps	%ymm1, (%rcx)
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       vmovaps	%ymm2, (%rdx)
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       vmovaps	%ymm3, (%rbx)
diff --git a/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s b/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
index 9ab4ab0baeb4d29bdd243c1a0674dad698900b09..0246216b734961897741d62281a702e9190da538 100644
--- a/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
+++ b/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
@@ -46,16 +46,19 @@ vbroadcastss (%rax), %ymm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -     1.00    -      -      -     0.50   0.50    -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -     0.50   0.50    -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	8(%rsp,%rdi,2), %rax
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -     0.50   0.50    -      -     vbroadcastss	(%rax), %ymm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
diff --git a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
index 678e6938bcebc1ed86a116a20010ccd2566d277e..4ee9df175f13e0ce43aae451c72a9d1a287688c3 100644
--- a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
@@ -54,20 +54,23 @@
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.04   4.96    -      -      -     1.00   4.00   7.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.50   2.50    -      -      -     1.00   4.00   7.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	%xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.02   0.98    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.66   0.34    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.34   0.66    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.66   0.34    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.34   0.66    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
diff --git a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
index c864c545f998753dcf7ed8dbcab704da4a42b120..bd5dc2a33b55bcc32e28af9613ff43f1adb36d62 100644
--- a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
@@ -54,20 +54,23 @@
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     4.54   4.46    -      -      -      -     4.99   6.01    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     4.42   4.58    -      -      -      -     4.99   6.01    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.52   0.48    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.99   0.01    -      -      -      -     vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.52   0.48    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.77   0.23    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.64   2.36    -      -      -      -     0.99   0.01    -      -      -      -      -      -      -     vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.80   0.20    -      -      -      -      -     2.00    -      -      -      -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.21   0.79    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          012
diff --git a/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s b/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
index b98f36f325888195c1572bd65797abb2df288336..08b87424e568d7bc6862dec9fee2932ffa23c5fb 100644
--- a/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
+++ b/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
@@ -81,17 +81,20 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.52   2.48    -      -      -      -     3.00   3.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.01   2.99    -      -      -      -     3.00   3.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.98   1.02    -      -      -      -     vxorps	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.02   0.98    -      -      -      -     0.02   1.98    -      -      -      -     vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.98   1.02    -      -      -      -      -      -      -     vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.02   1.98    -      -      -      -     0.02   1.98    -      -      -      -      -      -      -     vblendps	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -163,17 +166,20 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.52   2.48    -      -      -      -     3.00   3.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.01   2.99    -      -      -      -     3.00   3.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.98   1.02    -      -      -      -     vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.02   0.98    -      -      -      -     0.02   1.98    -      -      -      -     vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.98   1.02    -      -      -      -      -      -      -     vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.02   1.98    -      -      -      -     0.02   1.98    -      -      -      -      -      -      -     vblendpd	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -244,16 +250,19 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -     2.00   2.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   1.00    -      -      -      -     2.00   2.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -     2.00    -      -      -      -     vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00    -      -      -      -      -      -     2.00    -      -      -      -      -      -      -     vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
@@ -320,16 +329,19 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -     2.00   2.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   1.00    -      -      -      -     2.00   2.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -     2.00    -      -      -      -     vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00    -      -      -      -      -      -     2.00    -      -      -      -      -      -      -     vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
@@ -396,16 +408,19 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     2.00   1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     2.00   1.00    -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vperm2f128	$136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -      -      -      -     vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     2.00    -      -      -      -      -      -      -      -     vaddps	%ymm1, %ymm1, %ymm0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
diff --git a/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s b/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
index 3f9c4dbb8f5977895fb82bb5bb7d564a9610337c..d9637017ef60316df8978fc4fa96340c1925fd40 100644
--- a/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
+++ b/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
@@ -213,85 +213,88 @@ vpxor  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT: [13]  - PdFPU2
 # CHECK-NEXT: [14]  - PdFPU3
 # CHECK-NEXT: [15]  - PdFPXBR
-# CHECK-NEXT: [16]  - PdMul
+# CHECK-NEXT: [16.0] - PdLoad
+# CHECK-NEXT: [16.1] - PdLoad
+# CHECK-NEXT: [17]  - PdMul
+# CHECK-NEXT: [18]  - PdStore
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     8.00   11.00   -      -     9.00   10.00   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     9.00   10.00   -      -     9.00   10.00   -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%eax, %eax
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rax, %rax
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtd	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtd	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     pcmpgtq	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubd	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubq	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubd	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubq	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     psubsb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     psubsw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     psubsb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     psubsw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     psubusb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     psubusw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     psubusb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     psubusw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpsubusb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpsubusw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnps	%xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnpd	%xmm1, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pandn	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pandn	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnps	%xmm2, %xmm2, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnpd	%xmm1, %xmm1, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorpd	%xmm1, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm4, %xmm4, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubq	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     psubsb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     psubsw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     psubsb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     psubsw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     psubusb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     psubusw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     psubusb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     psubusw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnpd	%xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pandn	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pandn	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorpd	%xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
diff --git a/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s b/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
index 9b8fc63c2e5f15f101b6fd689912d8e03bbfb50a..3cd20c3b2b9b009aaf0e9a72bd52af7c951f25c8 100644
--- a/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
+++ b/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
@@ -1720,7 +1720,7 @@ vzeroupper
 # CHECK-NEXT:  1      1     1.00                        vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      7     1.00    *                   vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  20     8     1.00    *      *      U     vzeroall
-# CHECK-NEXT:  4      4     1.00    *      *      U     vzeroupper
+# CHECK-NEXT:  4      0     1.00    *      *      U     vzeroupper
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - BWDivider
@@ -1736,7 +1736,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     257.00 216.33 236.33 176.17 176.17 38.00  427.33 3.00   12.67
+# CHECK-NEXT:  -     257.00 215.25 235.25 176.17 176.17 38.00  426.25 2.25   12.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2430,4 +2430,4 @@ vzeroupper
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00   1.00    -     vzeroall
-# CHECK-NEXT:  -      -     1.08   1.08    -      -      -     1.08   0.75    -     vzeroupper
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/Broadwell/resources-rdrand.s b/test/tools/llvm-mca/X86/Broadwell/resources-rdrand.s
new file mode 100644
index 0000000000000000000000000000000000000000..42e59996bbefcb0ac7ae7ce7db51cb33c26d8ac1
--- /dev/null
+++ b/test/tools/llvm-mca/X86/Broadwell/resources-rdrand.s
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -instruction-tables < %s | FileCheck %s
+
+rdrand   %ax
+rdrand   %eax
+rdrand   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  5      9     1.00                  U     rdrandw	%ax
+# CHECK-NEXT:  5      9     1.00                  U     rdrandl	%eax
+# CHECK-NEXT:  5      9     1.00                  U     rdrandq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - BWDivider
+# CHECK-NEXT: [1]   - BWFPDivider
+# CHECK-NEXT: [2]   - BWPort0
+# CHECK-NEXT: [3]   - BWPort1
+# CHECK-NEXT: [4]   - BWPort2
+# CHECK-NEXT: [5]   - BWPort3
+# CHECK-NEXT: [6]   - BWPort4
+# CHECK-NEXT: [7]   - BWPort5
+# CHECK-NEXT: [8]   - BWPort6
+# CHECK-NEXT: [9]   - BWPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     3.75   2.25   1.50   1.50    -     2.25   3.75    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     1.25   0.75   0.50   0.50    -     0.75   1.25    -     rdrandw	%ax
+# CHECK-NEXT:  -      -     1.25   0.75   0.50   0.50    -     0.75   1.25    -     rdrandl	%eax
+# CHECK-NEXT:  -      -     1.25   0.75   0.50   0.50    -     0.75   1.25    -     rdrandq	%rax
diff --git a/test/tools/llvm-mca/X86/Broadwell/resources-rdseed.s b/test/tools/llvm-mca/X86/Broadwell/resources-rdseed.s
new file mode 100644
index 0000000000000000000000000000000000000000..33c38d953cc39389a860765b7bbd9ce649dd8f44
--- /dev/null
+++ b/test/tools/llvm-mca/X86/Broadwell/resources-rdseed.s
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -instruction-tables < %s | FileCheck %s
+
+rdseed   %ax
+rdseed   %eax
+rdseed   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.25                  U     rdseedw	%ax
+# CHECK-NEXT:  1      100   0.25                  U     rdseedl	%eax
+# CHECK-NEXT:  1      100   0.25                  U     rdseedq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - BWDivider
+# CHECK-NEXT: [1]   - BWFPDivider
+# CHECK-NEXT: [2]   - BWPort0
+# CHECK-NEXT: [3]   - BWPort1
+# CHECK-NEXT: [4]   - BWPort2
+# CHECK-NEXT: [5]   - BWPort3
+# CHECK-NEXT: [6]   - BWPort4
+# CHECK-NEXT: [7]   - BWPort5
+# CHECK-NEXT: [8]   - BWPort6
+# CHECK-NEXT: [9]   - BWPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     0.75   0.75    -      -      -     0.75   0.75    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedw	%ax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedl	%eax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedq	%rax
diff --git a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-7.s b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-7.s
new file mode 100644
index 0000000000000000000000000000000000000000..720a1ed1440b68852f659ac86c4bdc6d4f2d57bc
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-7.s
@@ -0,0 +1,104 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=5 < %s | FileCheck %s
+
+sete %r9b
+movzbl %al, %eax
+shll $2, %eax
+imull %ecx, %eax
+cmpl $1025, %eax
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.19
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        sete	%r9b
+# CHECK-NEXT:  1      1     0.50                        movzbl	%al, %eax
+# CHECK-NEXT:  1      1     0.50                        shll	$2, %eax
+# CHECK-NEXT:  2      3     1.00                        imull	%ecx, %eax
+# CHECK-NEXT:  1      1     0.50                        cmpl	$1025, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT: 2.00   3.00    -      -      -      -      -      -     1.00    -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT: 0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -     sete	%r9b
+# CHECK-NEXT: 0.01   0.99    -      -      -      -      -      -      -      -      -      -      -      -     movzbl	%al, %eax
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     shll	$2, %eax
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     1.00    -      -      -      -      -     imull	%ecx, %eax
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$1025, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeER .    .    .    .    .  .   sete	%r9b
+# CHECK-NEXT: [0,1]     DeER .    .    .    .    .  .   movzbl	%al, %eax
+# CHECK-NEXT: [0,2]     .DeER.    .    .    .    .  .   shll	$2, %eax
+# CHECK-NEXT: [0,3]     . DeeeER  .    .    .    .  .   imull	%ecx, %eax
+# CHECK-NEXT: [0,4]     .  D==eER .    .    .    .  .   cmpl	$1025, %eax
+# CHECK-NEXT: [1,0]     .  D===eER.    .    .    .  .   sete	%r9b
+# CHECK-NEXT: [1,1]     .   D=eE-R.    .    .    .  .   movzbl	%al, %eax
+# CHECK-NEXT: [1,2]     .   D==eE-R    .    .    .  .   shll	$2, %eax
+# CHECK-NEXT: [1,3]     .    D==eeeER  .    .    .  .   imull	%ecx, %eax
+# CHECK-NEXT: [1,4]     .    .D====eER .    .    .  .   cmpl	$1025, %eax
+# CHECK-NEXT: [2,0]     .    .D=====eER.    .    .  .   sete	%r9b
+# CHECK-NEXT: [2,1]     .    . D===eE-R.    .    .  .   movzbl	%al, %eax
+# CHECK-NEXT: [2,2]     .    . D====eE-R    .    .  .   shll	$2, %eax
+# CHECK-NEXT: [2,3]     .    .  D====eeeER  .    .  .   imull	%ecx, %eax
+# CHECK-NEXT: [2,4]     .    .   D======eER .    .  .   cmpl	$1025, %eax
+# CHECK-NEXT: [3,0]     .    .   D=======eER.    .  .   sete	%r9b
+# CHECK-NEXT: [3,1]     .    .    D=====eE-R.    .  .   movzbl	%al, %eax
+# CHECK-NEXT: [3,2]     .    .    D======eE-R    .  .   shll	$2, %eax
+# CHECK-NEXT: [3,3]     .    .    .D======eeeER  .  .   imull	%ecx, %eax
+# CHECK-NEXT: [3,4]     .    .    . D========eER .  .   cmpl	$1025, %eax
+# CHECK-NEXT: [4,0]     .    .    . D=========eER.  .   sete	%r9b
+# CHECK-NEXT: [4,1]     .    .    .  D=======eE-R.  .   movzbl	%al, %eax
+# CHECK-NEXT: [4,2]     .    .    .  D========eE-R  .   shll	$2, %eax
+# CHECK-NEXT: [4,3]     .    .    .   D========eeeER.   imull	%ecx, %eax
+# CHECK-NEXT: [4,4]     .    .    .    D==========eER   cmpl	$1025, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     5     5.8    0.2    0.0       sete	%r9b
+# CHECK-NEXT: 1.     5     4.2    0.2    0.8       movzbl	%al, %eax
+# CHECK-NEXT: 2.     5     5.0    0.0    0.8       shll	$2, %eax
+# CHECK-NEXT: 3.     5     5.0    0.0    0.0       imull	%ecx, %eax
+# CHECK-NEXT: 4.     5     7.0    0.0    0.0       cmpl	$1025, %eax
diff --git a/test/tools/llvm-mca/X86/BtVer2/rcu-statistics.s b/test/tools/llvm-mca/X86/BtVer2/rcu-statistics.s
index 4a1f8706d9613eac316430cdd6eaa9c0614518a7..59fbd48589413ccbd1cf7ad1b89b171570865e28 100644
--- a/test/tools/llvm-mca/X86/BtVer2/rcu-statistics.s
+++ b/test/tools/llvm-mca/X86/BtVer2/rcu-statistics.s
@@ -58,3 +58,7 @@
 # CHECK-NEXT: [# retired], [# cycles]
 # CHECK-NEXT:  0,           23  (74.2%)
 # CHECK-NEXT:  2,           8  (25.8%)
+
+# CHECK:      Total ROB Entries:                64
+# CHECK-NEXT: Max Used ROB Entries:             16  ( 25.0% )
+# CHECK-NEXT: Average Used ROB Entries per cy:  11  ( 17.2% )
diff --git a/test/tools/llvm-mca/X86/Generic/resources-avx1.s b/test/tools/llvm-mca/X86/Generic/resources-avx1.s
index f0bf9e27294529a865cf1312b28a70d972a8111b..cc4a3a54a04ff3ec6a3d549a551b9b5335f5cb04 100644
--- a/test/tools/llvm-mca/X86/Generic/resources-avx1.s
+++ b/test/tools/llvm-mca/X86/Generic/resources-avx1.s
@@ -1720,7 +1720,7 @@ vzeroupper
 # CHECK-NEXT:  1      1     1.00                        vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      8     1.00    *                   vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  20     9     2.00    *      *      U     vzeroall
-# CHECK-NEXT:  1      100   0.33    *      *      U     vzeroupper
+# CHECK-NEXT:  4      1     1.00    *      *      U     vzeroupper
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - SBDivider
@@ -1734,7 +1734,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     572.00 246.83 317.33 39.00  365.83 179.50 179.50
+# CHECK-NEXT:  -     572.00 246.50 317.00 39.00  365.50 179.50 179.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -2428,4 +2428,4 @@ vzeroupper
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -     2.00    -      -     vzeroall
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vzeroupper
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/Generic/resources-rdrand.s b/test/tools/llvm-mca/X86/Generic/resources-rdrand.s
new file mode 100644
index 0000000000000000000000000000000000000000..c7a2149326616572e4add947ad70a137f2216023
--- /dev/null
+++ b/test/tools/llvm-mca/X86/Generic/resources-rdrand.s
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+rdrand   %ax
+rdrand   %eax
+rdrand   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.33                  U     rdrandw	%ax
+# CHECK-NEXT:  1      100   0.33                  U     rdrandl	%eax
+# CHECK-NEXT:  1      100   0.33                  U     rdrandq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdrandw	%ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdrandl	%eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdrandq	%rax
diff --git a/test/tools/llvm-mca/X86/Generic/resources-rdseed.s b/test/tools/llvm-mca/X86/Generic/resources-rdseed.s
new file mode 100644
index 0000000000000000000000000000000000000000..c97d4192b323b3388ec9e4144042f9e0808fb529
--- /dev/null
+++ b/test/tools/llvm-mca/X86/Generic/resources-rdseed.s
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+rdseed   %ax
+rdseed   %eax
+rdseed   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.33                  U     rdseedw	%ax
+# CHECK-NEXT:  1      100   0.33                  U     rdseedl	%eax
+# CHECK-NEXT:  1      100   0.33                  U     rdseedq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdseedw	%ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdseedl	%eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdseedq	%rax
diff --git a/test/tools/llvm-mca/X86/Haswell/resources-avx1.s b/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
index 28d6c67b6767bedb62e2a077e346eefde0f000ab..387b4b2f1c252c6da39f7b6073c067bd22b6410b 100644
--- a/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
+++ b/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
@@ -1720,7 +1720,7 @@ vzeroupper
 # CHECK-NEXT:  1      1     1.00                        vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      8     1.00    *                   vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  20     8     1.00    *      *      U     vzeroall
-# CHECK-NEXT:  4      4     1.00    *      *      U     vzeroupper
+# CHECK-NEXT:  4      0     1.00    *      *      U     vzeroupper
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - HWDivider
@@ -1736,7 +1736,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     336.00 215.67 237.67 176.17 176.17 38.00  430.67 3.00   12.67
+# CHECK-NEXT:  -     336.00 214.58 236.58 176.17 176.17 38.00  429.58 2.25   12.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2430,4 +2430,4 @@ vzeroupper
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00   1.00    -     vzeroall
-# CHECK-NEXT:  -      -     1.08   1.08    -      -      -     1.08   0.75    -     vzeroupper
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/Haswell/resources-rdrand.s b/test/tools/llvm-mca/X86/Haswell/resources-rdrand.s
new file mode 100644
index 0000000000000000000000000000000000000000..08876f311277b0feb45245ac03b90ca3bf794594
--- /dev/null
+++ b/test/tools/llvm-mca/X86/Haswell/resources-rdrand.s
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -instruction-tables < %s | FileCheck %s
+
+rdrand   %ax
+rdrand   %eax
+rdrand   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  17     1     5.33                  U     rdrandw	%ax
+# CHECK-NEXT:  17     1     5.33                  U     rdrandl	%eax
+# CHECK-NEXT:  17     1     5.33                  U     rdrandq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - HWDivider
+# CHECK-NEXT: [1]   - HWFPDivider
+# CHECK-NEXT: [2]   - HWPort0
+# CHECK-NEXT: [3]   - HWPort1
+# CHECK-NEXT: [4]   - HWPort2
+# CHECK-NEXT: [5]   - HWPort3
+# CHECK-NEXT: [6]   - HWPort4
+# CHECK-NEXT: [7]   - HWPort5
+# CHECK-NEXT: [8]   - HWPort6
+# CHECK-NEXT: [9]   - HWPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     16.00  16.00  1.50   1.50    -     16.00   -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     5.33   5.33   0.50   0.50    -     5.33    -      -     rdrandw	%ax
+# CHECK-NEXT:  -      -     5.33   5.33   0.50   0.50    -     5.33    -      -     rdrandl	%eax
+# CHECK-NEXT:  -      -     5.33   5.33   0.50   0.50    -     5.33    -      -     rdrandq	%rax
diff --git a/test/tools/llvm-mca/X86/SLM/resources-aes.s b/test/tools/llvm-mca/X86/SLM/resources-aes.s
new file mode 100644
index 0000000000000000000000000000000000000000..ae60a08fa7ecf8a269bb7898626278eac6eb6562
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SLM/resources-aes.s
@@ -0,0 +1,71 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=goldmont -instruction-tables < %s | FileCheck %s
+
+aesdec          %xmm0, %xmm2
+aesdec          (%rax), %xmm2
+
+aesdeclast      %xmm0, %xmm2
+aesdeclast      (%rax), %xmm2
+
+aesenc          %xmm0, %xmm2
+aesenc          (%rax), %xmm2
+
+aesenclast      %xmm0, %xmm2
+aesenclast      (%rax), %xmm2
+
+aesimc          %xmm0, %xmm2
+aesimc          (%rax), %xmm2
+
+aeskeygenassist $22, %xmm0, %xmm2
+aeskeygenassist $22, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      8     5.00                        aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     5.00    *                   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  1      8     5.00                        aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     5.00    *                   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  1      8     5.00                        aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     5.00    *                   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  1      8     5.00                        aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     5.00    *                   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  1      8     5.00                        aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     5.00    *                   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  1      8     5.00                        aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  1      8     5.00    *                   aeskeygenassist	$22, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SLMDivider
+# CHECK-NEXT: [1]   - SLMFPDivider
+# CHECK-NEXT: [2]   - SLMFPMultiplier
+# CHECK-NEXT: [3]   - SLM_FPC_RSV0
+# CHECK-NEXT: [4]   - SLM_FPC_RSV1
+# CHECK-NEXT: [5]   - SLM_IEC_RSV0
+# CHECK-NEXT: [6]   - SLM_IEC_RSV1
+# CHECK-NEXT: [7]   - SLM_MEC_RSV
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
+# CHECK-NEXT:  -      -      -     60.00   -      -      -     6.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    Instructions:
+# CHECK-NEXT:  -      -      -     5.00    -      -      -      -     aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -     1.00   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -      -     aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -     1.00   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -      -     aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -     1.00   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -      -     aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -     1.00   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -      -     aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -     1.00   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -      -     aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     5.00    -      -      -     1.00   aeskeygenassist	$22, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/SLM/resources-rdrand.s b/test/tools/llvm-mca/X86/SLM/resources-rdrand.s
new file mode 100644
index 0000000000000000000000000000000000000000..360544866707398ce8d0861e7f850a3216e96ee5
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SLM/resources-rdrand.s
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=slm -instruction-tables < %s | FileCheck %s
+
+rdrand   %ax
+rdrand   %eax
+rdrand   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   1.00                  U     rdrandw	%ax
+# CHECK-NEXT:  1      100   1.00                  U     rdrandl	%eax
+# CHECK-NEXT:  1      100   1.00                  U     rdrandq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SLMDivider
+# CHECK-NEXT: [1]   - SLMFPDivider
+# CHECK-NEXT: [2]   - SLMFPMultiplier
+# CHECK-NEXT: [3]   - SLM_FPC_RSV0
+# CHECK-NEXT: [4]   - SLM_FPC_RSV1
+# CHECK-NEXT: [5]   - SLM_IEC_RSV0
+# CHECK-NEXT: [6]   - SLM_IEC_RSV1
+# CHECK-NEXT: [7]   - SLM_MEC_RSV
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rdrandw	%ax
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rdrandl	%eax
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rdrandq	%rax
diff --git a/test/tools/llvm-mca/X86/SLM/resources-rdseed.s b/test/tools/llvm-mca/X86/SLM/resources-rdseed.s
new file mode 100644
index 0000000000000000000000000000000000000000..9acd1719c470ed94a20b22729f3e435210bfd233
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SLM/resources-rdseed.s
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=goldmont -instruction-tables < %s | FileCheck %s
+
+rdseed   %ax
+rdseed   %eax
+rdseed   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   1.00                  U     rdseedw	%ax
+# CHECK-NEXT:  1      100   1.00                  U     rdseedl	%eax
+# CHECK-NEXT:  1      100   1.00                  U     rdseedq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SLMDivider
+# CHECK-NEXT: [1]   - SLMFPDivider
+# CHECK-NEXT: [2]   - SLMFPMultiplier
+# CHECK-NEXT: [3]   - SLM_FPC_RSV0
+# CHECK-NEXT: [4]   - SLM_FPC_RSV1
+# CHECK-NEXT: [5]   - SLM_IEC_RSV0
+# CHECK-NEXT: [6]   - SLM_IEC_RSV1
+# CHECK-NEXT: [7]   - SLM_MEC_RSV
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rdseedw	%ax
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rdseedl	%eax
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rdseedq	%rax
diff --git a/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s b/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s
index eadc1416ef3876e43882383078de57746c125323..2a4a5bf45d01dfc8d0fc599bab411fd1416a60cc 100644
--- a/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s
+++ b/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s
@@ -1720,7 +1720,7 @@ vzeroupper
 # CHECK-NEXT:  1      1     1.00                        vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      8     1.00    *                   vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  20     9     2.00    *      *      U     vzeroall
-# CHECK-NEXT:  1      100   0.33    *      *      U     vzeroupper
+# CHECK-NEXT:  4      1     1.00    *      *      U     vzeroupper
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - SBDivider
@@ -1734,7 +1734,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     572.00 246.83 317.33 39.00  365.83 179.50 179.50
+# CHECK-NEXT:  -     572.00 246.50 317.00 39.00  365.50 179.50 179.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -2428,4 +2428,4 @@ vzeroupper
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -     2.00    -      -     vzeroall
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vzeroupper
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/SandyBridge/resources-rdrand.s b/test/tools/llvm-mca/X86/SandyBridge/resources-rdrand.s
new file mode 100644
index 0000000000000000000000000000000000000000..5b0838907ba5b7ee11094f689558cb178aa6ab3e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SandyBridge/resources-rdrand.s
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=ivybridge -instruction-tables < %s | FileCheck %s
+
+rdrand   %ax
+rdrand   %eax
+rdrand   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.33                  U     rdrandw	%ax
+# CHECK-NEXT:  1      100   0.33                  U     rdrandl	%eax
+# CHECK-NEXT:  1      100   0.33                  U     rdrandq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdrandw	%ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdrandl	%eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     rdrandq	%rax
diff --git a/test/tools/llvm-mca/X86/SkylakeClient/resources-aes.s b/test/tools/llvm-mca/X86/SkylakeClient/resources-aes.s
new file mode 100644
index 0000000000000000000000000000000000000000..ce02b831f494a5acd4c158a9c741a2a779a08e86
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SkylakeClient/resources-aes.s
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -instruction-tables < %s | FileCheck %s
+
+aesdec          %xmm0, %xmm2
+aesdec          (%rax), %xmm2
+
+aesdeclast      %xmm0, %xmm2
+aesdeclast      (%rax), %xmm2
+
+aesenc          %xmm0, %xmm2
+aesenc          (%rax), %xmm2
+
+aesenclast      %xmm0, %xmm2
+aesenclast      (%rax), %xmm2
+
+aesimc          %xmm0, %xmm2
+aesimc          (%rax), %xmm2
+
+aeskeygenassist $22, %xmm0, %xmm2
+aeskeygenassist $22, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     1.00                        aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  2      8     2.00                        aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  3      14    2.00    *                   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  11     20    6.00                        aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  11     25    6.00    *                   aeskeygenassist	$22, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKLDivider
+# CHECK-NEXT: [1]   - SKLFPDivider
+# CHECK-NEXT: [2]   - SKLPort0
+# CHECK-NEXT: [3]   - SKLPort1
+# CHECK-NEXT: [4]   - SKLPort2
+# CHECK-NEXT: [5]   - SKLPort3
+# CHECK-NEXT: [6]   - SKLPort4
+# CHECK-NEXT: [7]   - SKLPort5
+# CHECK-NEXT: [8]   - SKLPort6
+# CHECK-NEXT: [9]   - SKLPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     19.00  1.00   3.00   3.00    -     13.00   -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     aesdec	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     aesenc	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -     2.00    -      -      -      -      -      -      -     aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     2.00    -     0.50   0.50    -      -      -      -     aesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -     3.67   0.67    -      -      -     6.67    -      -     aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     3.33   0.33   0.50   0.50    -     6.33    -      -     aeskeygenassist	$22, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s b/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
index 2acf109a961cb668c5582401a5cd15572a2d888a..6002700cc96f8e4f11ec78c9488ec36f215228ae 100644
--- a/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
+++ b/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
@@ -1720,7 +1720,7 @@ vzeroupper
 # CHECK-NEXT:  1      1     0.33                        vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  16     16    4.00    *      *      U     vzeroall
-# CHECK-NEXT:  4      4     1.00    *      *      U     vzeroupper
+# CHECK-NEXT:  4      0     0.67    *      *      U     vzeroupper
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - SKLDivider
@@ -1736,7 +1736,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     123.00 333.17 204.17 173.17 173.17 34.00  324.67 6.00   12.67
+# CHECK-NEXT:  -     123.00 332.08 203.08 173.17 173.17 34.00  323.58 5.25   12.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2430,4 +2430,4 @@ vzeroupper
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -     0.33    -      -     vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     4.00   4.00    -      -      -     4.00   4.00    -     vzeroall
-# CHECK-NEXT:  -      -     1.08   1.08    -      -      -     1.08   0.75    -     vzeroupper
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/SkylakeClient/resources-rdrand.s b/test/tools/llvm-mca/X86/SkylakeClient/resources-rdrand.s
new file mode 100644
index 0000000000000000000000000000000000000000..3c77caffa3572e9ef482700b36065c0b4b3aeffa
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SkylakeClient/resources-rdrand.s
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -instruction-tables < %s | FileCheck %s
+
+rdrand   %ax
+rdrand   %eax
+rdrand   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.25                  U     rdrandw	%ax
+# CHECK-NEXT:  1      100   0.25                  U     rdrandl	%eax
+# CHECK-NEXT:  1      100   0.25                  U     rdrandq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKLDivider
+# CHECK-NEXT: [1]   - SKLFPDivider
+# CHECK-NEXT: [2]   - SKLPort0
+# CHECK-NEXT: [3]   - SKLPort1
+# CHECK-NEXT: [4]   - SKLPort2
+# CHECK-NEXT: [5]   - SKLPort3
+# CHECK-NEXT: [6]   - SKLPort4
+# CHECK-NEXT: [7]   - SKLPort5
+# CHECK-NEXT: [8]   - SKLPort6
+# CHECK-NEXT: [9]   - SKLPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     0.75   0.75    -      -      -     0.75   0.75    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdrandw	%ax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdrandl	%eax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdrandq	%rax
diff --git a/test/tools/llvm-mca/X86/SkylakeClient/resources-rdseed.s b/test/tools/llvm-mca/X86/SkylakeClient/resources-rdseed.s
new file mode 100644
index 0000000000000000000000000000000000000000..648886ee7b099d334f2112dcbed88eef3c463f79
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SkylakeClient/resources-rdseed.s
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -instruction-tables < %s | FileCheck %s
+
+rdseed   %ax
+rdseed   %eax
+rdseed   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.25                  U     rdseedw	%ax
+# CHECK-NEXT:  1      100   0.25                  U     rdseedl	%eax
+# CHECK-NEXT:  1      100   0.25                  U     rdseedq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKLDivider
+# CHECK-NEXT: [1]   - SKLFPDivider
+# CHECK-NEXT: [2]   - SKLPort0
+# CHECK-NEXT: [3]   - SKLPort1
+# CHECK-NEXT: [4]   - SKLPort2
+# CHECK-NEXT: [5]   - SKLPort3
+# CHECK-NEXT: [6]   - SKLPort4
+# CHECK-NEXT: [7]   - SKLPort5
+# CHECK-NEXT: [8]   - SKLPort6
+# CHECK-NEXT: [9]   - SKLPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     0.75   0.75    -      -      -     0.75   0.75    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedw	%ax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedl	%eax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedq	%rax
diff --git a/test/tools/llvm-mca/X86/SkylakeServer/resources-aes.s b/test/tools/llvm-mca/X86/SkylakeServer/resources-aes.s
new file mode 100644
index 0000000000000000000000000000000000000000..6eafe463df36b5ff2dc6b0947bd248835fa86118
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SkylakeServer/resources-aes.s
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -instruction-tables < %s | FileCheck %s
+
+aesdec          %xmm0, %xmm2
+aesdec          (%rax), %xmm2
+
+aesdeclast      %xmm0, %xmm2
+aesdeclast      (%rax), %xmm2
+
+aesenc          %xmm0, %xmm2
+aesenc          (%rax), %xmm2
+
+aesenclast      %xmm0, %xmm2
+aesenclast      (%rax), %xmm2
+
+aesimc          %xmm0, %xmm2
+aesimc          (%rax), %xmm2
+
+aeskeygenassist $22, %xmm0, %xmm2
+aeskeygenassist $22, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     1.00                        aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  2      8     2.00                        aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  3      14    2.00    *                   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  11     20    6.00                        aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  11     25    6.00    *                   aeskeygenassist	$22, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKXDivider
+# CHECK-NEXT: [1]   - SKXFPDivider
+# CHECK-NEXT: [2]   - SKXPort0
+# CHECK-NEXT: [3]   - SKXPort1
+# CHECK-NEXT: [4]   - SKXPort2
+# CHECK-NEXT: [5]   - SKXPort3
+# CHECK-NEXT: [6]   - SKXPort4
+# CHECK-NEXT: [7]   - SKXPort5
+# CHECK-NEXT: [8]   - SKXPort6
+# CHECK-NEXT: [9]   - SKXPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     19.00  1.00   3.00   3.00    -     13.00   -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     aesdec	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     aesenc	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -     2.00    -      -      -      -      -      -      -     aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     2.00    -     0.50   0.50    -      -      -      -     aesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -     3.67   0.67    -      -      -     6.67    -      -     aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     3.33   0.33   0.50   0.50    -     6.33    -      -     aeskeygenassist	$22, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s b/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
index 5e3f4f012422e2ee4872345e0dd091569f8bc0c1..5d97881062f6b0a9f5afd9bd36794f8319ed59a5 100644
--- a/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
+++ b/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
@@ -1720,7 +1720,7 @@ vzeroupper
 # CHECK-NEXT:  1      1     0.33                        vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  34     12    5.00    *      *      U     vzeroall
-# CHECK-NEXT:  4      4     1.00    *      *      U     vzeroupper
+# CHECK-NEXT:  4      0     0.67    *      *      U     vzeroupper
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - SKXDivider
@@ -1736,7 +1736,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     123.00 318.50 197.50 173.17 173.17 34.00  339.00 7.00   12.67
+# CHECK-NEXT:  -     123.00 317.42 196.42 173.17 173.17 34.00  337.92 6.25   12.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2430,4 +2430,4 @@ vzeroupper
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -     0.33    -      -     vxorps	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vxorps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     2.00   1.00    -      -      -     2.00   5.00    -     vzeroall
-# CHECK-NEXT:  -      -     1.08   1.08    -      -      -     1.08   0.75    -     vzeroupper
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/SkylakeServer/resources-rdrand.s b/test/tools/llvm-mca/X86/SkylakeServer/resources-rdrand.s
new file mode 100644
index 0000000000000000000000000000000000000000..b6225d1a5d8f975fa46bc66c032612eaa1c96b28
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SkylakeServer/resources-rdrand.s
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -instruction-tables < %s | FileCheck %s
+
+rdrand   %ax
+rdrand   %eax
+rdrand   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.25                  U     rdrandw	%ax
+# CHECK-NEXT:  1      100   0.25                  U     rdrandl	%eax
+# CHECK-NEXT:  1      100   0.25                  U     rdrandq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKXDivider
+# CHECK-NEXT: [1]   - SKXFPDivider
+# CHECK-NEXT: [2]   - SKXPort0
+# CHECK-NEXT: [3]   - SKXPort1
+# CHECK-NEXT: [4]   - SKXPort2
+# CHECK-NEXT: [5]   - SKXPort3
+# CHECK-NEXT: [6]   - SKXPort4
+# CHECK-NEXT: [7]   - SKXPort5
+# CHECK-NEXT: [8]   - SKXPort6
+# CHECK-NEXT: [9]   - SKXPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     0.75   0.75    -      -      -     0.75   0.75    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdrandw	%ax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdrandl	%eax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdrandq	%rax
diff --git a/test/tools/llvm-mca/X86/SkylakeServer/resources-rdseed.s b/test/tools/llvm-mca/X86/SkylakeServer/resources-rdseed.s
new file mode 100644
index 0000000000000000000000000000000000000000..b6209ce48d1f70916d2bb71632e80af13241415b
--- /dev/null
+++ b/test/tools/llvm-mca/X86/SkylakeServer/resources-rdseed.s
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -instruction-tables < %s | FileCheck %s
+
+rdseed   %ax
+rdseed   %eax
+rdseed   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.25                  U     rdseedw	%ax
+# CHECK-NEXT:  1      100   0.25                  U     rdseedl	%eax
+# CHECK-NEXT:  1      100   0.25                  U     rdseedq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKXDivider
+# CHECK-NEXT: [1]   - SKXFPDivider
+# CHECK-NEXT: [2]   - SKXPort0
+# CHECK-NEXT: [3]   - SKXPort1
+# CHECK-NEXT: [4]   - SKXPort2
+# CHECK-NEXT: [5]   - SKXPort3
+# CHECK-NEXT: [6]   - SKXPort4
+# CHECK-NEXT: [7]   - SKXPort5
+# CHECK-NEXT: [8]   - SKXPort6
+# CHECK-NEXT: [9]   - SKXPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     0.75   0.75    -      -      -     0.75   0.75    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedw	%ax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedl	%eax
+# CHECK-NEXT:  -      -     0.25   0.25    -      -      -     0.25   0.25    -     rdseedq	%rax
diff --git a/test/tools/llvm-mca/X86/Znver1/resources-aes.s b/test/tools/llvm-mca/X86/Znver1/resources-aes.s
new file mode 100644
index 0000000000000000000000000000000000000000..980f6194b54daf1e88a0afb6cf90e919a1bc2e18
--- /dev/null
+++ b/test/tools/llvm-mca/X86/Znver1/resources-aes.s
@@ -0,0 +1,75 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -instruction-tables < %s | FileCheck %s
+
+aesdec          %xmm0, %xmm2
+aesdec          (%rax), %xmm2
+
+aesdeclast      %xmm0, %xmm2
+aesdeclast      (%rax), %xmm2
+
+aesenc          %xmm0, %xmm2
+aesenc          (%rax), %xmm2
+
+aesenclast      %xmm0, %xmm2
+aesenclast      (%rax), %xmm2
+
+aesimc          %xmm0, %xmm2
+aesimc          (%rax), %xmm2
+
+aeskeygenassist $22, %xmm0, %xmm2
+aeskeygenassist $22, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     0.50                        aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  1      11    0.50    *                   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  1      4     0.50                        aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  1      11    0.50    *                   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  1      4     0.50                        aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  1      11    0.50    *                   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  1      4     0.50                        aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  1      11    0.50    *                   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  1      4     0.50                        aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  1      11    0.50    *                   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  1      4     0.50                        aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  1      11    0.50    *                   aeskeygenassist	$22, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - ZnAGU0
+# CHECK-NEXT: [1]   - ZnAGU1
+# CHECK-NEXT: [2]   - ZnALU0
+# CHECK-NEXT: [3]   - ZnALU1
+# CHECK-NEXT: [4]   - ZnALU2
+# CHECK-NEXT: [5]   - ZnALU3
+# CHECK-NEXT: [6]   - ZnDivider
+# CHECK-NEXT: [7]   - ZnFPU0
+# CHECK-NEXT: [8]   - ZnFPU1
+# CHECK-NEXT: [9]   - ZnFPU2
+# CHECK-NEXT: [10]  - ZnFPU3
+# CHECK-NEXT: [11]  - ZnMultiplier
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
+# CHECK-NEXT: 3.00   3.00    -      -      -      -      -     6.00   6.00    -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.50   0.50    -      -      -     aesdec	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     0.50   0.50    -      -      -     aesdec	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.50   0.50    -      -      -     aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     0.50   0.50    -      -      -     aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.50   0.50    -      -      -     aesenc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     0.50   0.50    -      -      -     aesenc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.50   0.50    -      -      -     aesenclast	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     0.50   0.50    -      -      -     aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.50   0.50    -      -      -     aesimc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     0.50   0.50    -      -      -     aesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.50   0.50    -      -      -     aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     0.50   0.50    -      -      -     aeskeygenassist	$22, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/Znver1/resources-rdrand.s b/test/tools/llvm-mca/X86/Znver1/resources-rdrand.s
new file mode 100644
index 0000000000000000000000000000000000000000..07d717be25332f77e63251d3ecd18aa81e67e583
--- /dev/null
+++ b/test/tools/llvm-mca/X86/Znver1/resources-rdrand.s
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -instruction-tables < %s | FileCheck %s
+
+rdrand   %ax
+rdrand   %eax
+rdrand   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.25                  U     rdrandw	%ax
+# CHECK-NEXT:  1      100   0.25                  U     rdrandl	%eax
+# CHECK-NEXT:  1      100   0.25                  U     rdrandq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - ZnAGU0
+# CHECK-NEXT: [1]   - ZnAGU1
+# CHECK-NEXT: [2]   - ZnALU0
+# CHECK-NEXT: [3]   - ZnALU1
+# CHECK-NEXT: [4]   - ZnALU2
+# CHECK-NEXT: [5]   - ZnALU3
+# CHECK-NEXT: [6]   - ZnDivider
+# CHECK-NEXT: [7]   - ZnFPU0
+# CHECK-NEXT: [8]   - ZnFPU1
+# CHECK-NEXT: [9]   - ZnFPU2
+# CHECK-NEXT: [10]  - ZnFPU3
+# CHECK-NEXT: [11]  - ZnMultiplier
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     rdrandw	%ax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     rdrandl	%eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     rdrandq	%rax
diff --git a/test/tools/llvm-mca/X86/Znver1/resources-rdseed.s b/test/tools/llvm-mca/X86/Znver1/resources-rdseed.s
new file mode 100644
index 0000000000000000000000000000000000000000..3ac9de7bf1b17ea3fe8882db7f0a6ad8f110f78e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/Znver1/resources-rdseed.s
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -instruction-tables < %s | FileCheck %s
+
+rdseed   %ax
+rdseed   %eax
+rdseed   %rax
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.25                  U     rdseedw	%ax
+# CHECK-NEXT:  1      100   0.25                  U     rdseedl	%eax
+# CHECK-NEXT:  1      100   0.25                  U     rdseedq	%rax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - ZnAGU0
+# CHECK-NEXT: [1]   - ZnAGU1
+# CHECK-NEXT: [2]   - ZnALU0
+# CHECK-NEXT: [3]   - ZnALU1
+# CHECK-NEXT: [4]   - ZnALU2
+# CHECK-NEXT: [5]   - ZnALU3
+# CHECK-NEXT: [6]   - ZnDivider
+# CHECK-NEXT: [7]   - ZnFPU0
+# CHECK-NEXT: [8]   - ZnFPU1
+# CHECK-NEXT: [9]   - ZnFPU2
+# CHECK-NEXT: [10]  - ZnFPU3
+# CHECK-NEXT: [11]  - ZnMultiplier
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     rdseedw	%ax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     rdseedl	%eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     rdseedq	%rax
diff --git a/test/tools/llvm-mca/X86/option-all-stats-1.s b/test/tools/llvm-mca/X86/option-all-stats-1.s
index dd3fafb60a6210ae537ed13fee92e817695312df..d2265583c1d2a7ed98230a83884bedf1d64638a1 100644
--- a/test/tools/llvm-mca/X86/option-all-stats-1.s
+++ b/test/tools/llvm-mca/X86/option-all-stats-1.s
@@ -62,6 +62,10 @@ add %eax, %eax
 # FULLREPORT-NEXT:  0,           3  (2.9%)
 # FULLREPORT-NEXT:  1,           100  (97.1%)
 
+# FULLREPORT:      Total ROB Entries:                64
+# FULLREPORT-NEXT: Max Used ROB Entries:             22  ( 34.4% )
+# FULLREPORT-NEXT: Average Used ROB Entries per cy:  17  ( 26.6% )
+
 # FULLREPORT:      Register File statistics:
 # FULLREPORT-NEXT: Total number of mappings created:    200
 # FULLREPORT-NEXT: Max number of mappings used:         44
diff --git a/test/tools/llvm-mca/X86/option-all-stats-2.s b/test/tools/llvm-mca/X86/option-all-stats-2.s
index a7002d9a58d369345d3975d71be0c7e72844dd25..e752d82bd0f9c945dd0d5870a7e6ae7af70097e8 100644
--- a/test/tools/llvm-mca/X86/option-all-stats-2.s
+++ b/test/tools/llvm-mca/X86/option-all-stats-2.s
@@ -63,6 +63,10 @@ add %eax, %eax
 # ALL-NEXT:   0,           3  (2.9%)
 # ALL-NEXT:   1,           100  (97.1%)
 
+# ALL:       Total ROB Entries:                64
+# ALL-NEXT:  Max Used ROB Entries:             22  ( 34.4% )
+# ALL-NEXT:  Average Used ROB Entries per cy:  17  ( 26.6% )
+
 # ALL:       Register File statistics:
 # ALL-NEXT:  Total number of mappings created:    200
 # ALL-NEXT:  Max number of mappings used:         44
diff --git a/test/tools/llvm-mca/X86/option-all-views-1.s b/test/tools/llvm-mca/X86/option-all-views-1.s
index b0545ab9be01a839d21d9897ea90142156101d6a..298a54b9ed4524e4653b6fed0ef454a246f2da6e 100644
--- a/test/tools/llvm-mca/X86/option-all-views-1.s
+++ b/test/tools/llvm-mca/X86/option-all-views-1.s
@@ -64,6 +64,10 @@ add %eax, %eax
 # FULLREPORT-NEXT:     0,           3  (2.9%)
 # FULLREPORT-NEXT:     1,           100  (97.1%)
 
+# FULLREPORT:         Total ROB Entries:                64
+# FULLREPORT-NEXT:    Max Used ROB Entries:             22  ( 34.4% )
+# FULLREPORT-NEXT:    Average Used ROB Entries per cy:  17  ( 26.6% )
+
 # FULLREPORT:         Register File statistics:
 # FULLREPORT-NEXT:    Total number of mappings created:    200
 # FULLREPORT-NEXT:    Max number of mappings used:         44
diff --git a/test/tools/llvm-mca/X86/option-all-views-2.s b/test/tools/llvm-mca/X86/option-all-views-2.s
index 66eb3bbdd66aab2c23c515d7ff8fe14954fb2e1c..0afd21fc26338bf7566e2fa420e577801a28c78f 100644
--- a/test/tools/llvm-mca/X86/option-all-views-2.s
+++ b/test/tools/llvm-mca/X86/option-all-views-2.s
@@ -63,6 +63,10 @@ add %eax, %eax
 # ALL-NEXT:         0,           3  (2.9%)
 # ALL-NEXT:         1,           100  (97.1%)
 
+# ALL:             Total ROB Entries:                64
+# ALL-NEXT:        Max Used ROB Entries:             22  ( 34.4% )
+# ALL-NEXT:        Average Used ROB Entries per cy:  17  ( 26.6% )
+
 # ALL:             Register File statistics:
 # ALL-NEXT:        Total number of mappings created:    200
 # ALL-NEXT:        Max number of mappings used:         44
diff --git a/test/tools/llvm-mca/X86/register-file-statistics.s b/test/tools/llvm-mca/X86/register-file-statistics.s
index 914eeaa82ddf0a218d55f9e3cbf2e18b6bd9c174..cf847abe5165b3606909188a4aa50ba3151be797 100644
--- a/test/tools/llvm-mca/X86/register-file-statistics.s
+++ b/test/tools/llvm-mca/X86/register-file-statistics.s
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL,BDVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL,BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL,ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL %s
@@ -17,6 +18,11 @@ xor %eax, %ebx
 # ALL-NEXT:    Total number of mappings created:    2
 # ALL-NEXT:    Max number of mappings used:         2
 
+# BDVER2:      *  Register File #1 -- PdFpuPRF:
+# BDVER2-NEXT:    Number of physical registers:     160
+# BDVER2-NEXT:    Total number of mappings created: 0
+# BDVER2-NEXT:    Max number of mappings used:      0
+
 # BTVER2:      *  Register File #1 -- JFpuPRF:
 # BTVER2-NEXT:    Number of physical registers:     72
 # BTVER2-NEXT:    Total number of mappings created: 0
@@ -27,6 +33,11 @@ xor %eax, %ebx
 # ZNVER1-NEXT:    Total number of mappings created: 0
 # ZNVER1-NEXT:    Max number of mappings used:      0
 
+# BDVER2:      *  Register File #2 -- PdIntegerPRF:
+# BDVER2-NEXT:    Number of physical registers:     96
+# BDVER2-NEXT:    Total number of mappings created: 2
+# BDVER2-NEXT:    Max number of mappings used:      2
+
 # BTVER2:      *  Register File #2 -- JIntegerPRF:
 # BTVER2-NEXT:    Number of physical registers:     64
 # BTVER2-NEXT:    Total number of mappings created: 2
diff --git a/test/tools/llvm-nm/ARM/macho-print-size.test b/test/tools/llvm-nm/ARM/macho-print-size.test
index 98784587ee762aea21a0933aad1e5ebb71919bd7..5a8517a90128ec5fda1768e00b7eb273dee3d65b 100644
--- a/test/tools/llvm-nm/ARM/macho-print-size.test
+++ b/test/tools/llvm-nm/ARM/macho-print-size.test
@@ -1,3 +1,3 @@
 @ RUN: llvm-nm -print-size -arch armv7m %p/Inputs/print-size.macho-armv7m 2>&1 | FileCheck %s
 
-@ CHECK: warning sizes with -print-size for Mach-O files are always zero.
+@ CHECK: warning: sizes with -print-size for Mach-O files are always zero.
diff --git a/test/tools/llvm-objcopy/COFF/Inputs/i386-exe.yaml b/test/tools/llvm-objcopy/COFF/Inputs/i386-exe.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4f74368c6ad8974b968379b4f2c86c4e7370fa55
--- /dev/null
+++ b/test/tools/llvm-objcopy/COFF/Inputs/i386-exe.yaml
@@ -0,0 +1,84 @@
+--- !COFF
+OptionalHeader:  
+  AddressOfEntryPoint: 4144
+  ImageBase:       4194304
+  SectionAlignment: 4096
+  FileAlignment:   512
+  MajorOperatingSystemVersion: 6
+  MinorOperatingSystemVersion: 0
+  MajorImageVersion: 0
+  MinorImageVersion: 0
+  MajorSubsystemVersion: 6
+  MinorSubsystemVersion: 0
+  Subsystem:       IMAGE_SUBSYSTEM_WINDOWS_CUI
+  DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_NO_SEH, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ]
+  SizeOfStackReserve: 1048576
+  SizeOfStackCommit: 4096
+  SizeOfHeapReserve: 1048576
+  SizeOfHeapCommit: 4096
+  ExportTable:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  ImportTable:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  ResourceTable:   
+    RelativeVirtualAddress: 0
+    Size:            0
+  ExceptionTable:  
+    RelativeVirtualAddress: 0
+    Size:            0
+  CertificateTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  BaseRelocationTable: 
+    RelativeVirtualAddress: 12288
+    Size:            12
+  Debug:           
+    RelativeVirtualAddress: 0
+    Size:            0
+  Architecture:    
+    RelativeVirtualAddress: 0
+    Size:            0
+  GlobalPtr:       
+    RelativeVirtualAddress: 0
+    Size:            0
+  TlsTable:        
+    RelativeVirtualAddress: 0
+    Size:            0
+  LoadConfigTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  BoundImport:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  IAT:             
+    RelativeVirtualAddress: 0
+    Size:            0
+  DelayImportDescriptor: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  ClrRuntimeHeader: 
+    RelativeVirtualAddress: 0
+    Size:            0
+header:          
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE ]
+sections:        
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  4096
+    VirtualSize:     83
+    SectionData:     5589E5508B45088B0D00204000034D088945FC89C883C4045DC3660F1F4400005589E55DC3662E0F1F840000000000905589E583EC08E8E5FFFFFFC745FC00000000C7042402000000E8B2FFFFFF83C4085DC3
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    VirtualAddress:  8192
+    VirtualSize:     4
+    SectionData:     '01000000'
+  - Name:            .reloc
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  12288
+    VirtualSize:     12
+    SectionData:     001000000C00000009300000
+symbols:         []
+...
diff --git a/test/tools/llvm-objcopy/COFF/Inputs/i386-obj.yaml b/test/tools/llvm-objcopy/COFF/Inputs/i386-obj.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4bf09b13fc31e54f1f1dff96a58d229501d85f59
--- /dev/null
+++ b/test/tools/llvm-objcopy/COFF/Inputs/i386-obj.yaml
@@ -0,0 +1,244 @@
+--- !COFF
+header:          
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [  ]
+sections:        
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    Alignment:       16
+    SectionData:     5589E5508B45088B0D00000000034D088945FC89C883C4045DC3660F1F4400005589E55DC3662E0F1F840000000000905589E583EC08E800000000C745FC00000000C7042402000000E80000000083C4085DC3
+    Relocations:     
+      - VirtualAddress:  9
+        SymbolName:      _x
+        Type:            IMAGE_REL_I386_DIR32
+      - VirtualAddress:  55
+        SymbolName:      ___main
+        Type:            IMAGE_REL_I386_REL32
+      - VirtualAddress:  74
+        SymbolName:      _f
+        Type:            IMAGE_REL_I386_REL32
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '01000000'
+  - Name:            .bss
+    Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            .debug_str
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     636C616E672076657273696F6E20382E302E3020287472756E6B203334363337382920286C6C766D2F7472756E6B203334363339302900736F757263652E63002F686F6D652F6D617274696E2F636F64652F6C6C766D2F6275696C642F6F626A636F70792D696E707574007800696E740066005F5F6D61696E006D61696E007900
+  - Name:            .debug_abbrev
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     011101250E1305030E10171B0E110112060000023400030E49133A0B3B0B02180000032400030E3E0B0B0B0000042E01110112064018030E3A0B3B0B271949133F1900000505000218030E3A0B3B0B49130000062E00110112064018030E3A0B3B0B27193F190000072E00110112064018030E3A0B3B0B49133F19000000
+  - Name:            .debug_info
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     850000000400000000000401000000000C003700000000000000400000000000000053000000026B000000370000000101050300000000036D000000050404000000001A000000015571000000010337000000050291087F00000001033700000000062000000005000000015573000000010707300000002300000001557A00000001093700000000
+    Relocations:     
+      - VirtualAddress:  6
+        SymbolName:      .debug_abbrev
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  12
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  18
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  22
+        SymbolName:      .debug_line
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  26
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  30
+        SymbolName:      .text
+        Type:            IMAGE_REL_I386_DIR32
+      - VirtualAddress:  39
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  51
+        SymbolName:      _x
+        Type:            IMAGE_REL_I386_DIR32
+      - VirtualAddress:  56
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  63
+        SymbolName:      .text
+        Type:            IMAGE_REL_I386_DIR32
+      - VirtualAddress:  73
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  87
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  99
+        SymbolName:      .text
+        Type:            IMAGE_REL_I386_DIR32
+      - VirtualAddress:  109
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+      - VirtualAddress:  116
+        SymbolName:      .text
+        Type:            IMAGE_REL_I386_DIR32
+      - VirtualAddress:  126
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_I386_SECREL
+  - Name:            .debug_macinfo
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     '00'
+  - Name:            .debug_line
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     52000000040020000000010101FB0E0D00010101010000000100000100736F757263652E6300000000000005020000000014050A0A75050C0666050366050006CB05010A3D0500C9050A0A0821050306BA0205000101
+    Relocations:     
+      - VirtualAddress:  45
+        SymbolName:      .text
+        Type:            IMAGE_REL_I386_DIR32
+  - Name:            .llvm_addrsig
+    Characteristics: [ IMAGE_SCN_LNK_REMOVE ]
+    Alignment:       1
+    SectionData:     '1314'
+symbols:         
+  - Name:            .text
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          83
+      NumberOfRelocations: 3
+      NumberOfLinenumbers: 0
+      CheckSum:        4183332250
+      Number:          1
+  - Name:            .data
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        3099354981
+      Number:          2
+  - Name:            .bss
+    Value:           0
+    SectionNumber:   3
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          0
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+  - Name:            .debug_str
+    Value:           0
+    SectionNumber:   4
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          129
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        2876129505
+      Number:          4
+  - Name:            .debug_abbrev
+    Value:           0
+    SectionNumber:   5
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          126
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        2218663305
+      Number:          5
+  - Name:            .debug_info
+    Value:           0
+    SectionNumber:   6
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          137
+      NumberOfRelocations: 16
+      NumberOfLinenumbers: 0
+      CheckSum:        2577207131
+      Number:          6
+  - Name:            .debug_macinfo
+    Value:           0
+    SectionNumber:   7
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          7
+  - Name:            .debug_line
+    Value:           0
+    SectionNumber:   8
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          86
+      NumberOfRelocations: 1
+      NumberOfLinenumbers: 0
+      CheckSum:        2357396799
+      Number:          8
+  - Name:            .llvm_addrsig
+    Value:           0
+    SectionNumber:   9
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          2
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        2067109359
+      Number:          9
+  - Name:            '@feat.00'
+    Value:           1
+    SectionNumber:   -1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            _f
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            _x
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            ___main
+    Value:           32
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            _main
+    Value:           48
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/test/tools/llvm-objcopy/COFF/Inputs/x86_64-exe.yaml b/test/tools/llvm-objcopy/COFF/Inputs/x86_64-exe.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c4c4df4529a0e6bb0ff442f4f1594de56606e1b3
--- /dev/null
+++ b/test/tools/llvm-objcopy/COFF/Inputs/x86_64-exe.yaml
@@ -0,0 +1,89 @@
+--- !COFF
+OptionalHeader:  
+  AddressOfEntryPoint: 4144
+  ImageBase:       1073741824
+  SectionAlignment: 4096
+  FileAlignment:   512
+  MajorOperatingSystemVersion: 6
+  MinorOperatingSystemVersion: 0
+  MajorImageVersion: 0
+  MinorImageVersion: 0
+  MajorSubsystemVersion: 6
+  MinorSubsystemVersion: 0
+  Subsystem:       IMAGE_SUBSYSTEM_WINDOWS_CUI
+  DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA, IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ]
+  SizeOfStackReserve: 1048576
+  SizeOfStackCommit: 4096
+  SizeOfHeapReserve: 1048576
+  SizeOfHeapCommit: 4096
+  ExportTable:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  ImportTable:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  ResourceTable:   
+    RelativeVirtualAddress: 0
+    Size:            0
+  ExceptionTable:  
+    RelativeVirtualAddress: 16384
+    Size:            24
+  CertificateTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  BaseRelocationTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  Debug:           
+    RelativeVirtualAddress: 0
+    Size:            0
+  Architecture:    
+    RelativeVirtualAddress: 0
+    Size:            0
+  GlobalPtr:       
+    RelativeVirtualAddress: 0
+    Size:            0
+  TlsTable:        
+    RelativeVirtualAddress: 0
+    Size:            0
+  LoadConfigTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  BoundImport:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  IAT:             
+    RelativeVirtualAddress: 0
+    Size:            0
+  DelayImportDescriptor: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  ClrRuntimeHeader: 
+    RelativeVirtualAddress: 0
+    Size:            0
+header:          
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LARGE_ADDRESS_AWARE ]
+sections:        
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  4096
+    VirtualSize:     87
+    SectionData:     50894C24048B0DF51F0000034C240489C859C3662E0F1F8400000000000F1F00C3662E0F1F8400000000000F1F440000554883EC30488D6C2430E8E1FFFFFFC745FC00000000B902000000E8B0FFFFFF904883C4305DC3
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  8192
+    VirtualSize:     20
+    SectionData:     0101010001020000010A03350A03055201500000
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    VirtualAddress:  12288
+    VirtualSize:     4
+    SectionData:     '01000000'
+  - Name:            .pdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  16384
+    VirtualSize:     24
+    SectionData:     '001000001310000000200000301000005710000008200000'
+symbols:         []
+...
diff --git a/test/tools/llvm-objcopy/COFF/Inputs/x86_64-obj.yaml b/test/tools/llvm-objcopy/COFF/Inputs/x86_64-obj.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ef1666c2e3350e3ba4712ff1bcb60e874d7617c0
--- /dev/null
+++ b/test/tools/llvm-objcopy/COFF/Inputs/x86_64-obj.yaml
@@ -0,0 +1,295 @@
+--- !COFF
+header:          
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: [  ]
+sections:        
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    Alignment:       16
+    SectionData:     50894C24048B0D00000000034C240489C859C3662E0F1F8400000000000F1F00C3662E0F1F8400000000000F1F440000554883EC30488D6C2430E800000000C745FC00000000B902000000E800000000904883C4305DC3
+    Relocations:     
+      - VirtualAddress:  7
+        SymbolName:      x
+        Type:            IMAGE_REL_AMD64_REL32
+      - VirtualAddress:  59
+        SymbolName:      __main
+        Type:            IMAGE_REL_AMD64_REL32
+      - VirtualAddress:  76
+        SymbolName:      f
+        Type:            IMAGE_REL_AMD64_REL32
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     '01000000'
+  - Name:            .bss
+    Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     ''
+  - Name:            .xdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       4
+    SectionData:     0101010001020000010A03350A03055201500000
+  - Name:            .debug_str
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     636C616E672076657273696F6E20382E302E3020287472756E6B203334363337382920286C6C766D2F7472756E6B203334363339302900736F757263652E63002F686F6D652F6D617274696E2F636F64652F6C6C766D2F6275696C642F6F626A636F70792D696E707574007800696E740066005F5F6D61696E006D61696E007900
+  - Name:            .debug_abbrev
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     011101250E1305030E10171B0E110112060000023400030E49133A0B3B0B02180000032400030E3E0B0B0B0000042E01110112064018030E3A0B3B0B271949133F1900000505000218030E3A0B3B0B49130000062E00110112064018030E3A0B3B0B27193F190000072E00110112064018030E3A0B3B0B49133F19000000
+  - Name:            .debug_info
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     990000000400000000000801000000000C00370000000000000040000000000000000000000057000000026B0000003F000000010109030000000000000000036D00000005040400000000000000001300000001577100000001033F000000050291047F00000001033F000000000620000000000000000100000001577300000001070730000000000000002700000001567A00000001093F00000000
+    Relocations:     
+      - VirtualAddress:  6
+        SymbolName:      .debug_abbrev
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  12
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  18
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  22
+        SymbolName:      .debug_line
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  26
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  30
+        SymbolName:      .text
+        Type:            IMAGE_REL_AMD64_ADDR64
+      - VirtualAddress:  43
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  55
+        SymbolName:      x
+        Type:            IMAGE_REL_AMD64_ADDR64
+      - VirtualAddress:  64
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  71
+        SymbolName:      .text
+        Type:            IMAGE_REL_AMD64_ADDR64
+      - VirtualAddress:  85
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  99
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  111
+        SymbolName:      .text
+        Type:            IMAGE_REL_AMD64_ADDR64
+      - VirtualAddress:  125
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+      - VirtualAddress:  132
+        SymbolName:      .text
+        Type:            IMAGE_REL_AMD64_ADDR64
+      - VirtualAddress:  146
+        SymbolName:      .debug_str
+        Type:            IMAGE_REL_AMD64_SECREL
+  - Name:            .debug_macinfo
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     '00'
+  - Name:            .pdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       4
+    SectionData:     '000000001300000000000000000000002700000008000000'
+    Relocations:     
+      - VirtualAddress:  0
+        SymbolName:      f
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+      - VirtualAddress:  4
+        SymbolName:      f
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+      - VirtualAddress:  8
+        SymbolName:      .xdata
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+      - VirtualAddress:  12
+        SymbolName:      main
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+      - VirtualAddress:  16
+        SymbolName:      main
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+      - VirtualAddress:  20
+        SymbolName:      .xdata
+        Type:            IMAGE_REL_AMD64_ADDR32NB
+  - Name:            .debug_line
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     57000000040020000000010101FB0E0D00010101010000000100000100736F757263652E630000000000000902000000000000000014050A0A59050C066605034A050006081505010A130500F3050A0A08590503069E0207000101
+    Relocations:     
+      - VirtualAddress:  45
+        SymbolName:      .text
+        Type:            IMAGE_REL_AMD64_ADDR64
+  - Name:            .llvm_addrsig
+    Characteristics: [ IMAGE_SCN_LNK_REMOVE ]
+    Alignment:       1
+    SectionData:     '1718'
+symbols:         
+  - Name:            .text
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          87
+      NumberOfRelocations: 3
+      NumberOfLinenumbers: 0
+      CheckSum:        4237828689
+      Number:          1
+  - Name:            .data
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        3099354981
+      Number:          2
+  - Name:            .bss
+    Value:           0
+    SectionNumber:   3
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          0
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+  - Name:            .xdata
+    Value:           0
+    SectionNumber:   4
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          20
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        3415491858
+      Number:          4
+  - Name:            .debug_str
+    Value:           0
+    SectionNumber:   5
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          129
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        2876129505
+      Number:          5
+  - Name:            .debug_abbrev
+    Value:           0
+    SectionNumber:   6
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          126
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        2218663305
+      Number:          6
+  - Name:            .debug_info
+    Value:           0
+    SectionNumber:   7
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          157
+      NumberOfRelocations: 16
+      NumberOfLinenumbers: 0
+      CheckSum:        603506744
+      Number:          7
+  - Name:            .debug_macinfo
+    Value:           0
+    SectionNumber:   8
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          8
+  - Name:            .pdata
+    Value:           0
+    SectionNumber:   9
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          24
+      NumberOfRelocations: 6
+      NumberOfLinenumbers: 0
+      CheckSum:        2036901199
+      Number:          9
+  - Name:            .debug_line
+    Value:           0
+    SectionNumber:   10
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          91
+      NumberOfRelocations: 1
+      NumberOfLinenumbers: 0
+      CheckSum:        633454091
+      Number:          10
+  - Name:            .llvm_addrsig
+    Value:           0
+    SectionNumber:   11
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          2
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        384769216
+      Number:          11
+  - Name:            '@feat.00'
+    Value:           0
+    SectionNumber:   -1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            f
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            x
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+  - Name:            __main
+    Value:           32
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            main
+    Value:           48
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/test/tools/llvm-objcopy/COFF/basic-copy.test b/test/tools/llvm-objcopy/COFF/basic-copy.test
new file mode 100644
index 0000000000000000000000000000000000000000..ddd12dcaf2fb8f3d702bfdedd14a76eb06b6df9c
--- /dev/null
+++ b/test/tools/llvm-objcopy/COFF/basic-copy.test
@@ -0,0 +1,42 @@
+Test plain passthrough copying with llvm-objcopy, by checking that obj2yaml
+produces identical output for both input and output object files/executables.
+(Intentionally not comparing to the original input yaml, in case there are
+superficial differences like line endings.) In order to have a check of the
+whole file and not just individual fields with FileCheck, checking with
+obj2yaml+cmp instead of llvm-readobj+cmp, as llvm-readobj also prints file
+details that can differ between otherwise equal files (such as file offsets).
+
+Actual copied object files/executables can differ in, among others, the
+following aspects:
+- The padding of executable sections (lld uses 0xcc, which is int3 on x86)
+- The actual layout of the string table (it can be filled linearly,
+  strings can be dedupliated, the table can be optimized by sharing tails
+  of longer strings; different parts in llvm do each of these three options)
+- The size indication for an empty/missing string table can either be 4
+  or left out altogether
+- Alignment of section data
+- Checksums
+
+RUN: yaml2obj %p/Inputs/i386-obj.yaml > %t.i386.o
+RUN: llvm-objcopy %t.i386.o %t.i386-copy.o
+RUN: obj2yaml %t.i386.o > %t.i386.o.yaml
+RUN: obj2yaml %t.i386-copy.o > %t.i386-copy.o.yaml
+RUN: cmp %t.i386.o.yaml %t.i386-copy.o.yaml
+
+RUN: yaml2obj %p/Inputs/x86_64-obj.yaml > %t.x86_64.o
+RUN: llvm-objcopy %t.x86_64.o %t.x86_64-copy.o
+RUN: obj2yaml %t.x86_64.o > %t.x86_64.o.yaml
+RUN: obj2yaml %t.x86_64-copy.o > %t.x86_64-copy.o.yaml
+RUN: cmp %t.x86_64.o.yaml %t.x86_64-copy.o.yaml
+
+RUN: yaml2obj %p/Inputs/i386-exe.yaml > %t.i386.exe
+RUN: llvm-objcopy %t.i386.exe %t.i386-copy.exe
+RUN: obj2yaml %t.i386.exe > %t.i386.exe.yaml
+RUN: obj2yaml %t.i386-copy.exe > %t.i386-copy.exe.yaml
+RUN: cmp %t.i386.exe.yaml %t.i386-copy.exe.yaml
+
+RUN: yaml2obj %p/Inputs/x86_64-exe.yaml > %t.x86_64.exe
+RUN: llvm-objcopy %t.x86_64.exe %t.x86_64-copy.exe
+RUN: obj2yaml %t.x86_64.exe > %t.x86_64.exe.yaml
+RUN: obj2yaml %t.x86_64-copy.exe > %t.x86_64-copy.exe.yaml
+RUN: cmp %t.x86_64.exe.yaml %t.x86_64-copy.exe.yaml
diff --git a/test/tools/llvm-objcopy/Inputs/alloc-symtab.o b/test/tools/llvm-objcopy/ELF/Inputs/alloc-symtab.o
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/alloc-symtab.o
rename to test/tools/llvm-objcopy/ELF/Inputs/alloc-symtab.o
diff --git a/test/tools/llvm-objcopy/Inputs/compress-debug-sections.yaml b/test/tools/llvm-objcopy/ELF/Inputs/compress-debug-sections.yaml
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/compress-debug-sections.yaml
rename to test/tools/llvm-objcopy/ELF/Inputs/compress-debug-sections.yaml
diff --git a/test/tools/llvm-objcopy/Inputs/dwarf.dwo b/test/tools/llvm-objcopy/ELF/Inputs/dwarf.dwo
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/dwarf.dwo
rename to test/tools/llvm-objcopy/ELF/Inputs/dwarf.dwo
diff --git a/test/tools/llvm-objcopy/Inputs/dynamic.so b/test/tools/llvm-objcopy/ELF/Inputs/dynamic.so
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/dynamic.so
rename to test/tools/llvm-objcopy/ELF/Inputs/dynamic.so
diff --git a/test/tools/llvm-objcopy/Inputs/dynrel.elf b/test/tools/llvm-objcopy/ELF/Inputs/dynrel.elf
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/dynrel.elf
rename to test/tools/llvm-objcopy/ELF/Inputs/dynrel.elf
diff --git a/test/tools/llvm-objcopy/Inputs/dynsym.so b/test/tools/llvm-objcopy/ELF/Inputs/dynsym.so
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/dynsym.so
rename to test/tools/llvm-objcopy/ELF/Inputs/dynsym.so
diff --git a/test/tools/llvm-objcopy/Inputs/groups.o b/test/tools/llvm-objcopy/ELF/Inputs/groups.o
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/groups.o
rename to test/tools/llvm-objcopy/ELF/Inputs/groups.o
diff --git a/test/tools/llvm-objcopy/Inputs/many-sections.o.gz b/test/tools/llvm-objcopy/ELF/Inputs/many-sections.o.gz
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/many-sections.o.gz
rename to test/tools/llvm-objcopy/ELF/Inputs/many-sections.o.gz
diff --git a/test/tools/llvm-objcopy/Inputs/pt-phdr.elf b/test/tools/llvm-objcopy/ELF/Inputs/pt-phdr.elf
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/pt-phdr.elf
rename to test/tools/llvm-objcopy/ELF/Inputs/pt-phdr.elf
diff --git a/test/tools/llvm-objcopy/Inputs/ungzip.py b/test/tools/llvm-objcopy/ELF/Inputs/ungzip.py
similarity index 100%
rename from test/tools/llvm-objcopy/Inputs/ungzip.py
rename to test/tools/llvm-objcopy/ELF/Inputs/ungzip.py
diff --git a/test/tools/llvm-objcopy/abs-symbol.test b/test/tools/llvm-objcopy/ELF/abs-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/abs-symbol.test
rename to test/tools/llvm-objcopy/ELF/abs-symbol.test
diff --git a/test/tools/llvm-objcopy/add-gnu-debuglink.test b/test/tools/llvm-objcopy/ELF/add-gnu-debuglink.test
similarity index 100%
rename from test/tools/llvm-objcopy/add-gnu-debuglink.test
rename to test/tools/llvm-objcopy/ELF/add-gnu-debuglink.test
diff --git a/test/tools/llvm-objcopy/add-section-remove.test b/test/tools/llvm-objcopy/ELF/add-section-remove.test
similarity index 100%
rename from test/tools/llvm-objcopy/add-section-remove.test
rename to test/tools/llvm-objcopy/ELF/add-section-remove.test
diff --git a/test/tools/llvm-objcopy/add-section.test b/test/tools/llvm-objcopy/ELF/add-section.test
similarity index 100%
rename from test/tools/llvm-objcopy/add-section.test
rename to test/tools/llvm-objcopy/ELF/add-section.test
diff --git a/test/tools/llvm-objcopy/adjacent-segments.test b/test/tools/llvm-objcopy/ELF/adjacent-segments.test
similarity index 100%
rename from test/tools/llvm-objcopy/adjacent-segments.test
rename to test/tools/llvm-objcopy/ELF/adjacent-segments.test
diff --git a/test/tools/llvm-objcopy/armexidx-link.test b/test/tools/llvm-objcopy/ELF/armexidx-link.test
similarity index 100%
rename from test/tools/llvm-objcopy/armexidx-link.test
rename to test/tools/llvm-objcopy/ELF/armexidx-link.test
diff --git a/test/tools/llvm-objcopy/auto-remove-shndx.test b/test/tools/llvm-objcopy/ELF/auto-remove-shndx.test
similarity index 100%
rename from test/tools/llvm-objcopy/auto-remove-shndx.test
rename to test/tools/llvm-objcopy/ELF/auto-remove-shndx.test
diff --git a/test/tools/llvm-objcopy/ELF/bad-build-id.test b/test/tools/llvm-objcopy/ELF/bad-build-id.test
new file mode 100644
index 0000000000000000000000000000000000000000..24b3cb697a059931ce34f818fd0177ffccce0a5e
--- /dev/null
+++ b/test/tools/llvm-objcopy/ELF/bad-build-id.test
@@ -0,0 +1,21 @@
+# RUN: yaml2obj %s > %t
+# RUN: not llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-input=.debug %t 2>&1 >/dev/null | FileCheck %s
+
+# CHECK: build ID in file {{.*}} is smaller than two bytes.
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .note.gnu.build-id
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Content:         040000000100000003000000474E55004F000000
+ProgramHeaders:
+  - Type: PT_NOTE
+    Flags: [ PF_R ]
+    Sections:
+      - Section: .note.gnu.build-id
diff --git a/test/tools/llvm-objcopy/basic-archive-copy.test b/test/tools/llvm-objcopy/ELF/basic-archive-copy.test
similarity index 100%
rename from test/tools/llvm-objcopy/basic-archive-copy.test
rename to test/tools/llvm-objcopy/ELF/basic-archive-copy.test
diff --git a/test/tools/llvm-objcopy/basic-binary-copy.test b/test/tools/llvm-objcopy/ELF/basic-binary-copy.test
similarity index 100%
rename from test/tools/llvm-objcopy/basic-binary-copy.test
rename to test/tools/llvm-objcopy/ELF/basic-binary-copy.test
diff --git a/test/tools/llvm-objcopy/basic-copy.test b/test/tools/llvm-objcopy/ELF/basic-copy.test
similarity index 100%
rename from test/tools/llvm-objcopy/basic-copy.test
rename to test/tools/llvm-objcopy/ELF/basic-copy.test
diff --git a/test/tools/llvm-objcopy/basic-keep.test b/test/tools/llvm-objcopy/ELF/basic-keep.test
similarity index 77%
rename from test/tools/llvm-objcopy/basic-keep.test
rename to test/tools/llvm-objcopy/ELF/basic-keep.test
index 8f4acb0c971ed94b7486bb650cab08dfd62d87ce..79d7717d9c0957179da8835d68674973c3633e41 100644
--- a/test/tools/llvm-objcopy/basic-keep.test
+++ b/test/tools/llvm-objcopy/ELF/basic-keep.test
@@ -1,6 +1,6 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -strip-non-alloc -keep=.test %t %t2
-# RUN: llvm-strip --strip-all -keep=.test %t -o %t3
+# RUN: llvm-objcopy -strip-non-alloc -keep-section=.test %t %t2
+# RUN: llvm-strip --strip-all -keep-section=.test %t -o %t3
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
 # RUN: cmp %t2 %t3
 
diff --git a/test/tools/llvm-objcopy/basic-only-keep-debug.test b/test/tools/llvm-objcopy/ELF/basic-only-keep-debug.test
similarity index 100%
rename from test/tools/llvm-objcopy/basic-only-keep-debug.test
rename to test/tools/llvm-objcopy/ELF/basic-only-keep-debug.test
diff --git a/test/tools/llvm-objcopy/basic-only-keep.test b/test/tools/llvm-objcopy/ELF/basic-only-section.test
similarity index 91%
rename from test/tools/llvm-objcopy/basic-only-keep.test
rename to test/tools/llvm-objcopy/ELF/basic-only-section.test
index 2f1c5e0a5ae563acc62812adf809bbce011c3e98..536a0fc4cca919af488ad009e78085b71696ed17 100644
--- a/test/tools/llvm-objcopy/basic-only-keep.test
+++ b/test/tools/llvm-objcopy/ELF/basic-only-section.test
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -only-keep=.test %t %t2
+# RUN: llvm-objcopy -only-section=.test %t %t2
 # RUN: llvm-objcopy -j .test %t %t3
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
 # RUN: diff %t2 %t3
diff --git a/test/tools/llvm-objcopy/basic-relocations.test b/test/tools/llvm-objcopy/ELF/basic-relocations.test
similarity index 100%
rename from test/tools/llvm-objcopy/basic-relocations.test
rename to test/tools/llvm-objcopy/ELF/basic-relocations.test
diff --git a/test/tools/llvm-objcopy/binary-first-seg-offset-zero.test b/test/tools/llvm-objcopy/ELF/binary-first-seg-offset-zero.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-first-seg-offset-zero.test
rename to test/tools/llvm-objcopy/ELF/binary-first-seg-offset-zero.test
diff --git a/test/tools/llvm-objcopy/binary-input-and-output.test b/test/tools/llvm-objcopy/ELF/binary-input-and-output.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-input-and-output.test
rename to test/tools/llvm-objcopy/ELF/binary-input-and-output.test
diff --git a/test/tools/llvm-objcopy/binary-input-arch.test b/test/tools/llvm-objcopy/ELF/binary-input-arch.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-input-arch.test
rename to test/tools/llvm-objcopy/ELF/binary-input-arch.test
diff --git a/test/tools/llvm-objcopy/binary-input-error.test b/test/tools/llvm-objcopy/ELF/binary-input-error.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-input-error.test
rename to test/tools/llvm-objcopy/ELF/binary-input-error.test
diff --git a/test/tools/llvm-objcopy/binary-input.test b/test/tools/llvm-objcopy/ELF/binary-input.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-input.test
rename to test/tools/llvm-objcopy/ELF/binary-input.test
diff --git a/test/tools/llvm-objcopy/binary-no-paddr.test b/test/tools/llvm-objcopy/ELF/binary-no-paddr.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-no-paddr.test
rename to test/tools/llvm-objcopy/ELF/binary-no-paddr.test
diff --git a/test/tools/llvm-objcopy/binary-out-error.test b/test/tools/llvm-objcopy/ELF/binary-out-error.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-out-error.test
rename to test/tools/llvm-objcopy/ELF/binary-out-error.test
diff --git a/test/tools/llvm-objcopy/binary-paddr.test b/test/tools/llvm-objcopy/ELF/binary-paddr.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-paddr.test
rename to test/tools/llvm-objcopy/ELF/binary-paddr.test
diff --git a/test/tools/llvm-objcopy/binary-remove-all-but-one.test b/test/tools/llvm-objcopy/ELF/binary-remove-all-but-one.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-remove-all-but-one.test
rename to test/tools/llvm-objcopy/ELF/binary-remove-all-but-one.test
diff --git a/test/tools/llvm-objcopy/binary-remove-end.test b/test/tools/llvm-objcopy/ELF/binary-remove-end.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-remove-end.test
rename to test/tools/llvm-objcopy/ELF/binary-remove-end.test
diff --git a/test/tools/llvm-objcopy/binary-remove-middle.test b/test/tools/llvm-objcopy/ELF/binary-remove-middle.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-remove-middle.test
rename to test/tools/llvm-objcopy/ELF/binary-remove-middle.test
diff --git a/test/tools/llvm-objcopy/binary-segment-layout.test b/test/tools/llvm-objcopy/ELF/binary-segment-layout.test
similarity index 100%
rename from test/tools/llvm-objcopy/binary-segment-layout.test
rename to test/tools/llvm-objcopy/ELF/binary-segment-layout.test
diff --git a/test/tools/llvm-objcopy/ELF/build-id-link-dir.test b/test/tools/llvm-objcopy/ELF/build-id-link-dir.test
new file mode 100644
index 0000000000000000000000000000000000000000..9fe9a68cfdb44690d9f14cde5c7a6cb4f65b9e6f
--- /dev/null
+++ b/test/tools/llvm-objcopy/ELF/build-id-link-dir.test
@@ -0,0 +1,56 @@
+# RUN: yaml2obj %s > %t
+# RUN: mkdir -p %t-dir
+
+# RUN: llvm-objcopy --build-id-link-dir=%t-dir %t %t2
+# RUN: not test -e %t-dir/4f/cb712aa6387724a9f465a32cd8c14b
+
+# RUN: llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-input=.debug %t %t3
+# RUN: cmp %t-dir/4f/cb712aa6387724a9f465a32cd8c14b.debug %t
+# RUN: rm %t-dir/4f/cb712aa6387724a9f465a32cd8c14b.debug
+
+# RUN: llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-output "" --strip-sections %t %t4
+# RUN: cmp %t-dir/4f/cb712aa6387724a9f465a32cd8c14b %t4
+# RUN: rm %t-dir/4f/cb712aa6387724a9f465a32cd8c14b
+
+# Linking the output of an inplace argument means that the file in the build-id
+# directory will be hard-linked to the resulting file.
+# RUN: cp %t %t5
+# RUN: llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-output "" --strip-sections %t5
+# RUN: cmp %t-dir/4f/cb712aa6387724a9f465a32cd8c14b %t5
+# RUN: rm %t-dir/4f/cb712aa6387724a9f465a32cd8c14b
+
+# Linking the input of an inplace argument means that the file in the build-id
+# directory will be hard-linked to the original file.
+# RUN: cp %t %t6
+# RUN: llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-input=.debug --strip-sections %t6
+# RUN: cmp %t-dir/4f/cb712aa6387724a9f465a32cd8c14b.debug %t
+# RUN: rm %t-dir/4f/cb712aa6387724a9f465a32cd8c14b.debug
+
+# You can use both at once.
+# RUN: llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-output "" --build-id-link-input=.debug --strip-sections %t %t7
+# RUN: cmp %t-dir/4f/cb712aa6387724a9f465a32cd8c14b.debug %t
+# RUN: cmp %t-dir/4f/cb712aa6387724a9f465a32cd8c14b %t7
+# RUN: rm %t-dir/4f/cb712aa6387724a9f465a32cd8c14b.debug
+# RUN: rm %t-dir/4f/cb712aa6387724a9f465a32cd8c14b
+
+# --build-id-link-output can have a suffix as well
+# RUN: llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-output=.debug --only-keep-debug %t %t8
+# RUN: cmp %t-dir/4f/cb712aa6387724a9f465a32cd8c14b.debug %t8
+# RUN: rm %t-dir/4f/cb712aa6387724a9f465a32cd8c14b.debug
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .note.gnu.build-id
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Content:         040000001000000003000000474E55004FCB712AA6387724A9F465A32CD8C14B
+ProgramHeaders:
+  - Type: PT_NOTE
+    Flags: [ PF_R ]
+    Sections:
+      - Section: .note.gnu.build-id
diff --git a/test/tools/llvm-objcopy/cannot-delete-dest.test b/test/tools/llvm-objcopy/ELF/cannot-delete-dest.test
similarity index 100%
rename from test/tools/llvm-objcopy/cannot-delete-dest.test
rename to test/tools/llvm-objcopy/ELF/cannot-delete-dest.test
diff --git a/test/tools/llvm-objcopy/check-addr-offset-align-binary.test b/test/tools/llvm-objcopy/ELF/check-addr-offset-align-binary.test
similarity index 100%
rename from test/tools/llvm-objcopy/check-addr-offset-align-binary.test
rename to test/tools/llvm-objcopy/ELF/check-addr-offset-align-binary.test
diff --git a/test/tools/llvm-objcopy/check-addr-offset-align.test b/test/tools/llvm-objcopy/ELF/check-addr-offset-align.test
similarity index 100%
rename from test/tools/llvm-objcopy/check-addr-offset-align.test
rename to test/tools/llvm-objcopy/ELF/check-addr-offset-align.test
diff --git a/test/tools/llvm-objcopy/common-symbol.test b/test/tools/llvm-objcopy/ELF/common-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/common-symbol.test
rename to test/tools/llvm-objcopy/ELF/common-symbol.test
diff --git a/test/tools/llvm-objcopy/compress-and-decompress-debug-sections-error.test b/test/tools/llvm-objcopy/ELF/compress-and-decompress-debug-sections-error.test
similarity index 100%
rename from test/tools/llvm-objcopy/compress-and-decompress-debug-sections-error.test
rename to test/tools/llvm-objcopy/ELF/compress-and-decompress-debug-sections-error.test
diff --git a/test/tools/llvm-objcopy/compress-debug-sections-default-gnu.test b/test/tools/llvm-objcopy/ELF/compress-debug-sections-default-gnu.test
similarity index 100%
rename from test/tools/llvm-objcopy/compress-debug-sections-default-gnu.test
rename to test/tools/llvm-objcopy/ELF/compress-debug-sections-default-gnu.test
diff --git a/test/tools/llvm-objcopy/compress-debug-sections-default.test b/test/tools/llvm-objcopy/ELF/compress-debug-sections-default.test
similarity index 100%
rename from test/tools/llvm-objcopy/compress-debug-sections-default.test
rename to test/tools/llvm-objcopy/ELF/compress-debug-sections-default.test
diff --git a/test/tools/llvm-objcopy/compress-debug-sections-invalid-format.test b/test/tools/llvm-objcopy/ELF/compress-debug-sections-invalid-format.test
similarity index 100%
rename from test/tools/llvm-objcopy/compress-debug-sections-invalid-format.test
rename to test/tools/llvm-objcopy/ELF/compress-debug-sections-invalid-format.test
diff --git a/test/tools/llvm-objcopy/compress-debug-sections-zlib-gnu.test b/test/tools/llvm-objcopy/ELF/compress-debug-sections-zlib-gnu.test
similarity index 100%
rename from test/tools/llvm-objcopy/compress-debug-sections-zlib-gnu.test
rename to test/tools/llvm-objcopy/ELF/compress-debug-sections-zlib-gnu.test
diff --git a/test/tools/llvm-objcopy/compress-debug-sections-zlib.test b/test/tools/llvm-objcopy/ELF/compress-debug-sections-zlib.test
similarity index 100%
rename from test/tools/llvm-objcopy/compress-debug-sections-zlib.test
rename to test/tools/llvm-objcopy/ELF/compress-debug-sections-zlib.test
diff --git a/test/tools/llvm-objcopy/compress-debug-sections.test b/test/tools/llvm-objcopy/ELF/compress-debug-sections.test
similarity index 100%
rename from test/tools/llvm-objcopy/compress-debug-sections.test
rename to test/tools/llvm-objcopy/ELF/compress-debug-sections.test
diff --git a/test/tools/llvm-objcopy/deterministic-archive.test b/test/tools/llvm-objcopy/ELF/deterministic-archive.test
similarity index 100%
rename from test/tools/llvm-objcopy/deterministic-archive.test
rename to test/tools/llvm-objcopy/ELF/deterministic-archive.test
diff --git a/test/tools/llvm-objcopy/discard-all.test b/test/tools/llvm-objcopy/ELF/discard-all.test
similarity index 100%
rename from test/tools/llvm-objcopy/discard-all.test
rename to test/tools/llvm-objcopy/ELF/discard-all.test
diff --git a/test/tools/llvm-objcopy/drawf-fission.test b/test/tools/llvm-objcopy/ELF/drawf-fission.test
similarity index 100%
rename from test/tools/llvm-objcopy/drawf-fission.test
rename to test/tools/llvm-objcopy/ELF/drawf-fission.test
diff --git a/test/tools/llvm-objcopy/dump-section.test b/test/tools/llvm-objcopy/ELF/dump-section.test
similarity index 95%
rename from test/tools/llvm-objcopy/dump-section.test
rename to test/tools/llvm-objcopy/ELF/dump-section.test
index cf9accdfd512ac231abdcb5c16ce56a10da45510..8e66f65e04a7b452f57bca2016f74af0fe450f50 100644
--- a/test/tools/llvm-objcopy/dump-section.test
+++ b/test/tools/llvm-objcopy/ELF/dump-section.test
@@ -1,6 +1,6 @@
 # RUN: yaml2obj %s > %t
 # RUN: llvm-objcopy -O binary -j .text %t %t2
-# RUN: llvm-objcopy -O binary -only-keep .text %t %t3
+# RUN: llvm-objcopy -O binary -only-section .text %t %t3
 # RUN: llvm-objcopy --dump-section .text=%t4 %t %t5
 # RUN: llvm-objcopy --dump-section .foo=%t6 %t %t7
 # RUN: not llvm-objcopy --dump-section .bar=%t8 %t %t9 2>&1 | FileCheck %s --check-prefix=NOBITS
diff --git a/test/tools/llvm-objcopy/dynamic-relocations.test b/test/tools/llvm-objcopy/ELF/dynamic-relocations.test
similarity index 100%
rename from test/tools/llvm-objcopy/dynamic-relocations.test
rename to test/tools/llvm-objcopy/ELF/dynamic-relocations.test
diff --git a/test/tools/llvm-objcopy/dynamic.test b/test/tools/llvm-objcopy/ELF/dynamic.test
similarity index 100%
rename from test/tools/llvm-objcopy/dynamic.test
rename to test/tools/llvm-objcopy/ELF/dynamic.test
diff --git a/test/tools/llvm-objcopy/dynstr.test b/test/tools/llvm-objcopy/ELF/dynstr.test
similarity index 100%
rename from test/tools/llvm-objcopy/dynstr.test
rename to test/tools/llvm-objcopy/ELF/dynstr.test
diff --git a/test/tools/llvm-objcopy/dynsym-error-remove-strtab.test b/test/tools/llvm-objcopy/ELF/dynsym-error-remove-strtab.test
similarity index 100%
rename from test/tools/llvm-objcopy/dynsym-error-remove-strtab.test
rename to test/tools/llvm-objcopy/ELF/dynsym-error-remove-strtab.test
diff --git a/test/tools/llvm-objcopy/dynsym.test b/test/tools/llvm-objcopy/ELF/dynsym.test
similarity index 100%
rename from test/tools/llvm-objcopy/dynsym.test
rename to test/tools/llvm-objcopy/ELF/dynsym.test
diff --git a/test/tools/llvm-objcopy/elf32be.test b/test/tools/llvm-objcopy/ELF/elf32be.test
similarity index 100%
rename from test/tools/llvm-objcopy/elf32be.test
rename to test/tools/llvm-objcopy/ELF/elf32be.test
diff --git a/test/tools/llvm-objcopy/elf32le.test b/test/tools/llvm-objcopy/ELF/elf32le.test
similarity index 100%
rename from test/tools/llvm-objcopy/elf32le.test
rename to test/tools/llvm-objcopy/ELF/elf32le.test
diff --git a/test/tools/llvm-objcopy/elf64be.test b/test/tools/llvm-objcopy/ELF/elf64be.test
similarity index 100%
rename from test/tools/llvm-objcopy/elf64be.test
rename to test/tools/llvm-objcopy/ELF/elf64be.test
diff --git a/test/tools/llvm-objcopy/empty-section.test b/test/tools/llvm-objcopy/ELF/empty-section.test
similarity index 100%
rename from test/tools/llvm-objcopy/empty-section.test
rename to test/tools/llvm-objcopy/ELF/empty-section.test
diff --git a/test/tools/llvm-objcopy/explicit-keep-remove.test b/test/tools/llvm-objcopy/ELF/explicit-keep-remove.test
similarity index 89%
rename from test/tools/llvm-objcopy/explicit-keep-remove.test
rename to test/tools/llvm-objcopy/ELF/explicit-keep-remove.test
index 5ebd2a5081c032075f99a9ecdd993553c71449d3..fea708edfd4936a8ce968f2cff56a733d5d1b3bd 100644
--- a/test/tools/llvm-objcopy/explicit-keep-remove.test
+++ b/test/tools/llvm-objcopy/ELF/explicit-keep-remove.test
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -R=.test -keep=.test %t %t2
+# RUN: llvm-objcopy -R=.test -keep-section=.test %t %t2
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
 
 !ELF
diff --git a/test/tools/llvm-objcopy/explicit-only-keep-remove.test b/test/tools/llvm-objcopy/ELF/explicit-only-section-remove.test
similarity index 89%
rename from test/tools/llvm-objcopy/explicit-only-keep-remove.test
rename to test/tools/llvm-objcopy/ELF/explicit-only-section-remove.test
index 10d49e10c09c33dab3856bee3ade30ca619a0e9f..15a315906abf80079fed5fba7f08c7f5359da482 100644
--- a/test/tools/llvm-objcopy/explicit-only-keep-remove.test
+++ b/test/tools/llvm-objcopy/ELF/explicit-only-section-remove.test
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -R=.test -only-keep=.test %t %t2
+# RUN: llvm-objcopy -R=.test -only-section=.test %t %t2
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
 
 !ELF
diff --git a/test/tools/llvm-objcopy/fail-no-output-directory.test b/test/tools/llvm-objcopy/ELF/fail-no-output-directory.test
similarity index 100%
rename from test/tools/llvm-objcopy/fail-no-output-directory.test
rename to test/tools/llvm-objcopy/ELF/fail-no-output-directory.test
diff --git a/test/tools/llvm-objcopy/globalize.test b/test/tools/llvm-objcopy/ELF/globalize.test
similarity index 100%
rename from test/tools/llvm-objcopy/globalize.test
rename to test/tools/llvm-objcopy/ELF/globalize.test
diff --git a/test/tools/llvm-objcopy/group-big-endian.test b/test/tools/llvm-objcopy/ELF/group-big-endian.test
similarity index 100%
rename from test/tools/llvm-objcopy/group-big-endian.test
rename to test/tools/llvm-objcopy/ELF/group-big-endian.test
diff --git a/test/tools/llvm-objcopy/group-unchanged.test b/test/tools/llvm-objcopy/ELF/group-unchanged.test
similarity index 100%
rename from test/tools/llvm-objcopy/group-unchanged.test
rename to test/tools/llvm-objcopy/ELF/group-unchanged.test
diff --git a/test/tools/llvm-objcopy/group.test b/test/tools/llvm-objcopy/ELF/group.test
similarity index 100%
rename from test/tools/llvm-objcopy/group.test
rename to test/tools/llvm-objcopy/ELF/group.test
diff --git a/test/tools/llvm-objcopy/help-message.test b/test/tools/llvm-objcopy/ELF/help-message.test
similarity index 100%
rename from test/tools/llvm-objcopy/help-message.test
rename to test/tools/llvm-objcopy/ELF/help-message.test
diff --git a/test/tools/llvm-objcopy/hexagon-unsupported-on-x86.test b/test/tools/llvm-objcopy/ELF/hexagon-unsupported-on-x86.test
similarity index 100%
rename from test/tools/llvm-objcopy/hexagon-unsupported-on-x86.test
rename to test/tools/llvm-objcopy/ELF/hexagon-unsupported-on-x86.test
diff --git a/test/tools/llvm-objcopy/identical-segments.test b/test/tools/llvm-objcopy/ELF/identical-segments.test
similarity index 100%
rename from test/tools/llvm-objcopy/identical-segments.test
rename to test/tools/llvm-objcopy/ELF/identical-segments.test
diff --git a/test/tools/llvm-objcopy/input-output-target.test b/test/tools/llvm-objcopy/ELF/input-output-target.test
similarity index 100%
rename from test/tools/llvm-objcopy/input-output-target.test
rename to test/tools/llvm-objcopy/ELF/input-output-target.test
diff --git a/test/tools/llvm-objcopy/keep-file-symbols.test b/test/tools/llvm-objcopy/ELF/keep-file-symbols.test
similarity index 100%
rename from test/tools/llvm-objcopy/keep-file-symbols.test
rename to test/tools/llvm-objcopy/ELF/keep-file-symbols.test
diff --git a/test/tools/llvm-objcopy/keep-global-symbols-mix-globalize.test b/test/tools/llvm-objcopy/ELF/keep-global-symbols-mix-globalize.test
similarity index 100%
rename from test/tools/llvm-objcopy/keep-global-symbols-mix-globalize.test
rename to test/tools/llvm-objcopy/ELF/keep-global-symbols-mix-globalize.test
diff --git a/test/tools/llvm-objcopy/keep-global-symbols.test b/test/tools/llvm-objcopy/ELF/keep-global-symbols.test
similarity index 100%
rename from test/tools/llvm-objcopy/keep-global-symbols.test
rename to test/tools/llvm-objcopy/ELF/keep-global-symbols.test
diff --git a/test/tools/llvm-objcopy/keep-many.test b/test/tools/llvm-objcopy/ELF/keep-many.test
similarity index 87%
rename from test/tools/llvm-objcopy/keep-many.test
rename to test/tools/llvm-objcopy/ELF/keep-many.test
index 662737789930e64a2ceb6628b5f75a0f28bfcde9..2abb19dd92b8bd2c972fdc1f8de626eca2984ddc 100644
--- a/test/tools/llvm-objcopy/keep-many.test
+++ b/test/tools/llvm-objcopy/ELF/keep-many.test
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -strip-non-alloc -keep=.test -keep=.test3 %t %t2
+# RUN: llvm-objcopy -strip-non-alloc -keep-section=.test -keep-section=.test3 %t %t2
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
 
 !ELF
diff --git a/test/tools/llvm-objcopy/keep-only-keep.test b/test/tools/llvm-objcopy/ELF/keep-only-section.test
similarity index 82%
rename from test/tools/llvm-objcopy/keep-only-keep.test
rename to test/tools/llvm-objcopy/ELF/keep-only-section.test
index 75f7d7251a8b14d3b1aa13d066909b8e3e64170d..38aa52cc70f35d9e39abde8345122eda42ac2a1c 100644
--- a/test/tools/llvm-objcopy/keep-only-keep.test
+++ b/test/tools/llvm-objcopy/ELF/keep-only-section.test
@@ -1,6 +1,6 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -keep=.test2 -only-keep=.test %t %t2
-# RUN: llvm-objcopy -j .test -keep=.test2 %t %t3
+# RUN: llvm-objcopy -keep-section=.test2 -only-section=.test %t %t2
+# RUN: llvm-objcopy -j .test -keep-section=.test2 %t %t3
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
 # RUN: diff %t2 %t3
 
diff --git a/test/tools/llvm-objcopy/keep-symbol-remove-section.test b/test/tools/llvm-objcopy/ELF/keep-symbol-remove-section.test
similarity index 100%
rename from test/tools/llvm-objcopy/keep-symbol-remove-section.test
rename to test/tools/llvm-objcopy/ELF/keep-symbol-remove-section.test
diff --git a/test/tools/llvm-objcopy/keep-symbol.test b/test/tools/llvm-objcopy/ELF/keep-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/keep-symbol.test
rename to test/tools/llvm-objcopy/ELF/keep-symbol.test
diff --git a/test/tools/llvm-objcopy/localize-hidden.test b/test/tools/llvm-objcopy/ELF/localize-hidden.test
similarity index 100%
rename from test/tools/llvm-objcopy/localize-hidden.test
rename to test/tools/llvm-objcopy/ELF/localize-hidden.test
diff --git a/test/tools/llvm-objcopy/localize.test b/test/tools/llvm-objcopy/ELF/localize.test
similarity index 100%
rename from test/tools/llvm-objcopy/localize.test
rename to test/tools/llvm-objcopy/ELF/localize.test
diff --git a/test/tools/llvm-objcopy/many-sections.test b/test/tools/llvm-objcopy/ELF/many-sections.test
similarity index 100%
rename from test/tools/llvm-objcopy/many-sections.test
rename to test/tools/llvm-objcopy/ELF/many-sections.test
diff --git a/test/tools/llvm-objcopy/marker-segment.test b/test/tools/llvm-objcopy/ELF/marker-segment.test
similarity index 100%
rename from test/tools/llvm-objcopy/marker-segment.test
rename to test/tools/llvm-objcopy/ELF/marker-segment.test
diff --git a/test/tools/llvm-objcopy/ELF/no-build-id-no-notes.test b/test/tools/llvm-objcopy/ELF/no-build-id-no-notes.test
new file mode 100644
index 0000000000000000000000000000000000000000..357e0cdba4883b6d3e2dd48a9a9c849f7c586b6e
--- /dev/null
+++ b/test/tools/llvm-objcopy/ELF/no-build-id-no-notes.test
@@ -0,0 +1,11 @@
+# RUN: yaml2obj %s > %t
+# RUN: not llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-input=.debug %t 2>&1 >/dev/null | FileCheck %s
+
+# CHECK: Could not find build ID.
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
diff --git a/test/tools/llvm-objcopy/ELF/no-build-id.test b/test/tools/llvm-objcopy/ELF/no-build-id.test
new file mode 100644
index 0000000000000000000000000000000000000000..d75f3be178084820cf7eec3ff392685077a2f8e9
--- /dev/null
+++ b/test/tools/llvm-objcopy/ELF/no-build-id.test
@@ -0,0 +1,21 @@
+# RUN: yaml2obj %s > %t
+# RUN: not llvm-objcopy --build-id-link-dir=%t-dir --build-id-link-input=.debug %t 2>&1 >/dev/null | FileCheck %s
+
+# CHECK: Could not find build ID.
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .note.foo
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Content:         000000000000000000000000
+ProgramHeaders:
+  - Type: PT_NOTE
+    Flags: [ PF_R ]
+    Sections:
+      - Section: .note.gnu.foo
diff --git a/test/tools/llvm-objcopy/no-symbol-relocation.test b/test/tools/llvm-objcopy/ELF/no-symbol-relocation.test
similarity index 100%
rename from test/tools/llvm-objcopy/no-symbol-relocation.test
rename to test/tools/llvm-objcopy/ELF/no-symbol-relocation.test
diff --git a/test/tools/llvm-objcopy/null-symbol.test b/test/tools/llvm-objcopy/ELF/null-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/null-symbol.test
rename to test/tools/llvm-objcopy/ELF/null-symbol.test
diff --git a/test/tools/llvm-objcopy/objcopy-version.test b/test/tools/llvm-objcopy/ELF/objcopy-version.test
similarity index 74%
rename from test/tools/llvm-objcopy/objcopy-version.test
rename to test/tools/llvm-objcopy/ELF/objcopy-version.test
index 3b5cd593ddf8cc0290e03872e52e16fb769b5eea..f2f7fa0530650ad586b9440f93618f0ff4e364f6 100644
--- a/test/tools/llvm-objcopy/objcopy-version.test
+++ b/test/tools/llvm-objcopy/ELF/objcopy-version.test
@@ -1,4 +1,5 @@
 # RUN: llvm-objcopy -version | FileCheck %s
 # RUN: llvm-objcopy --version | FileCheck %s
+# RUN: llvm-objcopy -V | FileCheck %s
 
 # CHECK: {{ version }}
diff --git a/test/tools/llvm-objcopy/only-keep-many.test b/test/tools/llvm-objcopy/ELF/only-section-many.test
similarity index 100%
rename from test/tools/llvm-objcopy/only-keep-many.test
rename to test/tools/llvm-objcopy/ELF/only-section-many.test
diff --git a/test/tools/llvm-objcopy/only-keep-remove-strtab.test b/test/tools/llvm-objcopy/ELF/only-section-remove-strtab.test
similarity index 87%
rename from test/tools/llvm-objcopy/only-keep-remove-strtab.test
rename to test/tools/llvm-objcopy/ELF/only-section-remove-strtab.test
index f6885c8e131c31ba2b8bc279439b532f7a101c5d..e336181d9fa43cdca30386f402d212495c3abd0a 100644
--- a/test/tools/llvm-objcopy/only-keep-remove-strtab.test
+++ b/test/tools/llvm-objcopy/ELF/only-section-remove-strtab.test
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -R .symtab -R .strtab -only-keep=.test %t %t2
+# RUN: llvm-objcopy -R .symtab -R .strtab -only-section=.test %t %t2
 # RUN: llvm-objcopy -j .test -R .strtab -R .symtab %t %t3
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
 # RUN: diff %t2 %t3
diff --git a/test/tools/llvm-objcopy/only-keep-strip-non-alloc.test b/test/tools/llvm-objcopy/ELF/only-section-strip-non-alloc.test
similarity index 86%
rename from test/tools/llvm-objcopy/only-keep-strip-non-alloc.test
rename to test/tools/llvm-objcopy/ELF/only-section-strip-non-alloc.test
index bad00228492a61a547a1405d036f7612f6e63fe0..f61b4ae664c6586b2285e6fa9f3b820c2debd95e 100644
--- a/test/tools/llvm-objcopy/only-keep-strip-non-alloc.test
+++ b/test/tools/llvm-objcopy/ELF/only-section-strip-non-alloc.test
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -strip-non-alloc -only-keep=.test %t %t2
+# RUN: llvm-objcopy -strip-non-alloc -only-section=.test %t %t2
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
 
 !ELF
diff --git a/test/tools/llvm-objcopy/overlap-chain.test b/test/tools/llvm-objcopy/ELF/overlap-chain.test
similarity index 100%
rename from test/tools/llvm-objcopy/overlap-chain.test
rename to test/tools/llvm-objcopy/ELF/overlap-chain.test
diff --git a/test/tools/llvm-objcopy/parent-loop-check.test b/test/tools/llvm-objcopy/ELF/parent-loop-check.test
similarity index 100%
rename from test/tools/llvm-objcopy/parent-loop-check.test
rename to test/tools/llvm-objcopy/ELF/parent-loop-check.test
diff --git a/test/tools/llvm-objcopy/prefix-symbols.test b/test/tools/llvm-objcopy/ELF/prefix-symbols.test
similarity index 100%
rename from test/tools/llvm-objcopy/prefix-symbols.test
rename to test/tools/llvm-objcopy/ELF/prefix-symbols.test
diff --git a/test/tools/llvm-objcopy/program-headers.test b/test/tools/llvm-objcopy/ELF/program-headers.test
similarity index 100%
rename from test/tools/llvm-objcopy/program-headers.test
rename to test/tools/llvm-objcopy/ELF/program-headers.test
diff --git a/test/tools/llvm-objcopy/pt-phdr.test b/test/tools/llvm-objcopy/ELF/pt-phdr.test
similarity index 100%
rename from test/tools/llvm-objcopy/pt-phdr.test
rename to test/tools/llvm-objcopy/ELF/pt-phdr.test
diff --git a/test/tools/llvm-objcopy/redefine-symbol.test b/test/tools/llvm-objcopy/ELF/redefine-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/redefine-symbol.test
rename to test/tools/llvm-objcopy/ELF/redefine-symbol.test
diff --git a/test/tools/llvm-objcopy/reloc-error-remove-symtab.test b/test/tools/llvm-objcopy/ELF/reloc-error-remove-symtab.test
similarity index 100%
rename from test/tools/llvm-objcopy/reloc-error-remove-symtab.test
rename to test/tools/llvm-objcopy/ELF/reloc-error-remove-symtab.test
diff --git a/test/tools/llvm-objcopy/reloc-no-symtab.test b/test/tools/llvm-objcopy/ELF/reloc-no-symtab.test
similarity index 100%
rename from test/tools/llvm-objcopy/reloc-no-symtab.test
rename to test/tools/llvm-objcopy/ELF/reloc-no-symtab.test
diff --git a/test/tools/llvm-objcopy/relocatable-phdr.test b/test/tools/llvm-objcopy/ELF/relocatable-phdr.test
similarity index 100%
rename from test/tools/llvm-objcopy/relocatable-phdr.test
rename to test/tools/llvm-objcopy/ELF/relocatable-phdr.test
diff --git a/test/tools/llvm-objcopy/remove-multiple-sections.test b/test/tools/llvm-objcopy/ELF/remove-multiple-sections.test
similarity index 100%
rename from test/tools/llvm-objcopy/remove-multiple-sections.test
rename to test/tools/llvm-objcopy/ELF/remove-multiple-sections.test
diff --git a/test/tools/llvm-objcopy/remove-section-with-symbol.test b/test/tools/llvm-objcopy/ELF/remove-section-with-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/remove-section-with-symbol.test
rename to test/tools/llvm-objcopy/ELF/remove-section-with-symbol.test
diff --git a/test/tools/llvm-objcopy/remove-section.test b/test/tools/llvm-objcopy/ELF/remove-section.test
similarity index 100%
rename from test/tools/llvm-objcopy/remove-section.test
rename to test/tools/llvm-objcopy/ELF/remove-section.test
diff --git a/test/tools/llvm-objcopy/remove-shndx.test b/test/tools/llvm-objcopy/ELF/remove-shndx.test
similarity index 100%
rename from test/tools/llvm-objcopy/remove-shndx.test
rename to test/tools/llvm-objcopy/ELF/remove-shndx.test
diff --git a/test/tools/llvm-objcopy/remove-shstrtab-error.test b/test/tools/llvm-objcopy/ELF/remove-shstrtab-error.test
similarity index 100%
rename from test/tools/llvm-objcopy/remove-shstrtab-error.test
rename to test/tools/llvm-objcopy/ELF/remove-shstrtab-error.test
diff --git a/test/tools/llvm-objcopy/remove-symtab.test b/test/tools/llvm-objcopy/ELF/remove-symtab.test
similarity index 100%
rename from test/tools/llvm-objcopy/remove-symtab.test
rename to test/tools/llvm-objcopy/ELF/remove-symtab.test
diff --git a/test/tools/llvm-objcopy/rename-section-flag-osproc-mask.test b/test/tools/llvm-objcopy/ELF/rename-section-flag-osproc-mask.test
similarity index 100%
rename from test/tools/llvm-objcopy/rename-section-flag-osproc-mask.test
rename to test/tools/llvm-objcopy/ELF/rename-section-flag-osproc-mask.test
diff --git a/test/tools/llvm-objcopy/rename-section-flag-preserved.test b/test/tools/llvm-objcopy/ELF/rename-section-flag-preserved.test
similarity index 100%
rename from test/tools/llvm-objcopy/rename-section-flag-preserved.test
rename to test/tools/llvm-objcopy/ELF/rename-section-flag-preserved.test
diff --git a/test/tools/llvm-objcopy/rename-section-flag.test b/test/tools/llvm-objcopy/ELF/rename-section-flag.test
similarity index 100%
rename from test/tools/llvm-objcopy/rename-section-flag.test
rename to test/tools/llvm-objcopy/ELF/rename-section-flag.test
diff --git a/test/tools/llvm-objcopy/rename-section-multiple.test b/test/tools/llvm-objcopy/ELF/rename-section-multiple.test
similarity index 100%
rename from test/tools/llvm-objcopy/rename-section-multiple.test
rename to test/tools/llvm-objcopy/ELF/rename-section-multiple.test
diff --git a/test/tools/llvm-objcopy/rename-section.test b/test/tools/llvm-objcopy/ELF/rename-section.test
similarity index 100%
rename from test/tools/llvm-objcopy/rename-section.test
rename to test/tools/llvm-objcopy/ELF/rename-section.test
diff --git a/test/tools/llvm-objcopy/section-index-unsupported.test b/test/tools/llvm-objcopy/ELF/section-index-unsupported.test
similarity index 100%
rename from test/tools/llvm-objcopy/section-index-unsupported.test
rename to test/tools/llvm-objcopy/ELF/section-index-unsupported.test
diff --git a/test/tools/llvm-objcopy/sectionless-segment.test b/test/tools/llvm-objcopy/ELF/sectionless-segment.test
similarity index 100%
rename from test/tools/llvm-objcopy/sectionless-segment.test
rename to test/tools/llvm-objcopy/ELF/sectionless-segment.test
diff --git a/test/tools/llvm-objcopy/segment-shift-section-remove.test b/test/tools/llvm-objcopy/ELF/segment-shift-section-remove.test
similarity index 100%
rename from test/tools/llvm-objcopy/segment-shift-section-remove.test
rename to test/tools/llvm-objcopy/ELF/segment-shift-section-remove.test
diff --git a/test/tools/llvm-objcopy/segment-shift.test b/test/tools/llvm-objcopy/ELF/segment-shift.test
similarity index 100%
rename from test/tools/llvm-objcopy/segment-shift.test
rename to test/tools/llvm-objcopy/ELF/segment-shift.test
diff --git a/test/tools/llvm-objcopy/segment-test-remove-section.test b/test/tools/llvm-objcopy/ELF/segment-test-remove-section.test
similarity index 100%
rename from test/tools/llvm-objcopy/segment-test-remove-section.test
rename to test/tools/llvm-objcopy/ELF/segment-test-remove-section.test
diff --git a/test/tools/llvm-objcopy/strict-no-add.test b/test/tools/llvm-objcopy/ELF/strict-no-add.test
similarity index 100%
rename from test/tools/llvm-objcopy/strict-no-add.test
rename to test/tools/llvm-objcopy/ELF/strict-no-add.test
diff --git a/test/tools/llvm-objcopy/strip-all-and-keep-symbol.test b/test/tools/llvm-objcopy/ELF/strip-all-and-keep-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-all-and-keep-symbol.test
rename to test/tools/llvm-objcopy/ELF/strip-all-and-keep-symbol.test
diff --git a/test/tools/llvm-objcopy/strip-all-and-remove.test b/test/tools/llvm-objcopy/ELF/strip-all-and-remove.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-all-and-remove.test
rename to test/tools/llvm-objcopy/ELF/strip-all-and-remove.test
diff --git a/test/tools/llvm-objcopy/strip-all-gnu.test b/test/tools/llvm-objcopy/ELF/strip-all-gnu.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-all-gnu.test
rename to test/tools/llvm-objcopy/ELF/strip-all-gnu.test
diff --git a/test/tools/llvm-objcopy/strip-all.test b/test/tools/llvm-objcopy/ELF/strip-all.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-all.test
rename to test/tools/llvm-objcopy/ELF/strip-all.test
diff --git a/test/tools/llvm-objcopy/strip-debug-and-remove.test b/test/tools/llvm-objcopy/ELF/strip-debug-and-remove.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-debug-and-remove.test
rename to test/tools/llvm-objcopy/ELF/strip-debug-and-remove.test
diff --git a/test/tools/llvm-objcopy/strip-debug.test b/test/tools/llvm-objcopy/ELF/strip-debug.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-debug.test
rename to test/tools/llvm-objcopy/ELF/strip-debug.test
diff --git a/test/tools/llvm-objcopy/strip-dwo-groups.test b/test/tools/llvm-objcopy/ELF/strip-dwo-groups.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-dwo-groups.test
rename to test/tools/llvm-objcopy/ELF/strip-dwo-groups.test
diff --git a/test/tools/llvm-objcopy/strip-dwo-inplace.test b/test/tools/llvm-objcopy/ELF/strip-dwo-inplace.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-dwo-inplace.test
rename to test/tools/llvm-objcopy/ELF/strip-dwo-inplace.test
diff --git a/test/tools/llvm-objcopy/strip-group-symbol.test b/test/tools/llvm-objcopy/ELF/strip-group-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-group-symbol.test
rename to test/tools/llvm-objcopy/ELF/strip-group-symbol.test
diff --git a/test/tools/llvm-objcopy/strip-multiple-files.test b/test/tools/llvm-objcopy/ELF/strip-multiple-files.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-multiple-files.test
rename to test/tools/llvm-objcopy/ELF/strip-multiple-files.test
diff --git a/test/tools/llvm-objcopy/strip-non-alloc.test b/test/tools/llvm-objcopy/ELF/strip-non-alloc.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-non-alloc.test
rename to test/tools/llvm-objcopy/ELF/strip-non-alloc.test
diff --git a/test/tools/llvm-objcopy/strip-preserve-time.test b/test/tools/llvm-objcopy/ELF/strip-preserve-atime.test
similarity index 69%
rename from test/tools/llvm-objcopy/strip-preserve-time.test
rename to test/tools/llvm-objcopy/ELF/strip-preserve-atime.test
index afede2c21ecb142518d9d025429cd6f8530fa1f6..5cee21fb96c47ea2c81bfeb4a47037cb3b322325 100644
--- a/test/tools/llvm-objcopy/strip-preserve-time.test
+++ b/test/tools/llvm-objcopy/ELF/strip-preserve-atime.test
@@ -1,40 +1,34 @@
-# Note: ls -lu prints the accessed timestamp, ls -l prints the modified timestamp
+# Note: ls -lu prints the accessed timestamp
+# NetBSD: noatime mounts currently inhibit 'touch -a' updates
+# UNSUPPORTED: system-netbsd
 
 # Preserve dates when stripping to an output file.
 # RUN: yaml2obj %s > %t.1.o
 # RUN: touch -a -t 199505050555.55 %t.1.o
-# RUN: touch -m -t 199705050555.55 %t.1.o
 # RUN: llvm-strip -p %t.1.o -o %t-preserved.1.o
 # RUN: ls -lu %t-preserved.1.o | FileCheck %s --check-prefix=CHECK-PRESERVE-ATIME
-# RUN: ls -l %t-preserved.1.o | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
 # Check that the stripped output is in fact a valid object file.
 # RUN: llvm-readobj %t-preserved.1.o
 
 # Preserve dates available via objcopy interface as well.
 # RUN: yaml2obj %s > %t.2.o
 # RUN: touch -a -t 199505050555.55 %t.2.o
-# RUN: touch -m -t 199705050555.55 %t.2.o
 # RUN: llvm-objcopy -p %t.2.o %t-preserved.2.o
 # RUN: ls -lu %t-preserved.2.o | FileCheck %s --check-prefix=CHECK-PRESERVE-ATIME
-# RUN: ls -l %t-preserved.2.o | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
 # RUN: llvm-readobj %t-preserved.2.o
 
 # Preserve dates when stripping in place.
 # RUN: yaml2obj %s > %t.3.o
 # RUN: touch -a -t 199505050555.55 %t.3.o
-# RUN: touch -m -t 199705050555.55 %t.3.o
 # RUN: llvm-strip -p %t.3.o
 # RUN: ls -lu %t.3.o | FileCheck %s --check-prefix=CHECK-PRESERVE-ATIME
-# RUN: ls -l %t.3.o | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
 # RUN: llvm-readobj %t.3.o
 
 # Without -p set, don't preserve dates.
 # RUN: yaml2obj %s > %t.4.o
 # RUN: touch -a -t 199505050555.55 %t.4.o
-# RUN: touch -m -t 199705050555.55 %t.4.o
 # RUN: llvm-strip %t.4.o
 # RUN: ls -lu %t.4.o | FileCheck %s --check-prefix=CHECK-NO-PRESERVE-ATIME
-# RUN: ls -l %t.4.o | FileCheck %s --check-prefix=CHECK-NO-PRESERVE-MTIME
 # RUN: llvm-readobj %t.4.o
 
 # Preserve dates in archives.
@@ -42,29 +36,21 @@
 # RUN: rm -f %t.a
 # RUN: llvm-ar cr %t.a %t.5.o
 # RUN: touch -a -t 199505050555.55 %t.a
-# RUN: touch -m -t 199705050555.55 %t.a
 # RUN: llvm-strip -p %t.a
 # RUN: ls -lu %t.a | FileCheck %s --check-prefix=CHECK-PRESERVE-ATIME
-# RUN: ls -l %t.a | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
 # RUN: llvm-readobj %t.a
 
 # Preserve dates in split DWO files.
 # RUN: cp %p/Inputs/dwarf.dwo %t-input.dwo
 # RUN: touch -a -t 199505050555.55 %t-input.dwo
-# RUN: touch -m -t 199705050555.55 %t-input.dwo
 # RUN: llvm-objcopy -p -split-dwo=%t-dwo %t-input.dwo %t-nondwo
 # RUN: ls -lu %t-dwo | FileCheck %s --check-prefix=CHECK-PRESERVE-ATIME
-# RUN: ls -l %t-dwo | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
 # RUN: llvm-readobj %t-dwo
 # RUN: ls -lu %t-nondwo | FileCheck %s --check-prefix=CHECK-PRESERVE-ATIME
-# RUN: ls -l %t-nondwo | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
 # RUN: llvm-readobj %t-nondwo
 
 # CHECK-PRESERVE-ATIME:        {{[[:space:]]1995}}
-# CHECK-PRESERVE-MTIME:        {{[[:space:]]1997}}
-
 # CHECK-NO-PRESERVE-ATIME-NOT: {{[[:space:]]1995}}
-# CHECK-NO-PRESERVE-MTIME-NOT: {{[[:space:]]1997}}
 
 !ELF
 FileHeader:
diff --git a/test/tools/llvm-objcopy/ELF/strip-preserve-mtime.test b/test/tools/llvm-objcopy/ELF/strip-preserve-mtime.test
new file mode 100644
index 0000000000000000000000000000000000000000..ac430b4ecbd81abe8a6ccf5711b0f6c705f3f3f8
--- /dev/null
+++ b/test/tools/llvm-objcopy/ELF/strip-preserve-mtime.test
@@ -0,0 +1,62 @@
+# Note: ls -l prints the modified timestamp
+
+# Preserve dates when stripping to an output file.
+# RUN: yaml2obj %s > %t.1.o
+# RUN: touch -m -t 199705050555.55 %t.1.o
+# RUN: llvm-strip -p %t.1.o -o %t-preserved.1.o
+# RUN: ls -l %t-preserved.1.o | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
+# Check that the stripped output is in fact a valid object file.
+# RUN: llvm-readobj %t-preserved.1.o
+
+# Preserve dates available via objcopy interface as well.
+# RUN: yaml2obj %s > %t.2.o
+# RUN: touch -m -t 199705050555.55 %t.2.o
+# RUN: llvm-objcopy -p %t.2.o %t-preserved.2.o
+# RUN: ls -l %t-preserved.2.o | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
+# RUN: llvm-readobj %t-preserved.2.o
+
+# Preserve dates when stripping in place.
+# RUN: yaml2obj %s > %t.3.o
+# RUN: touch -m -t 199705050555.55 %t.3.o
+# RUN: llvm-strip -p %t.3.o
+# RUN: ls -l %t.3.o | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
+# RUN: llvm-readobj %t.3.o
+
+# Without -p set, don't preserve dates.
+# RUN: yaml2obj %s > %t.4.o
+# RUN: touch -m -t 199705050555.55 %t.4.o
+# RUN: llvm-strip %t.4.o
+# RUN: ls -l %t.4.o | FileCheck %s --check-prefix=CHECK-NO-PRESERVE-MTIME
+# RUN: llvm-readobj %t.4.o
+
+# Preserve dates in archives.
+# RUN: yaml2obj %s > %t.5.o
+# RUN: rm -f %t.a
+# RUN: llvm-ar cr %t.a %t.5.o
+# RUN: touch -m -t 199705050555.55 %t.a
+# RUN: llvm-strip -p %t.a
+# RUN: ls -l %t.a | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
+# RUN: llvm-readobj %t.a
+
+# Preserve dates in split DWO files.
+# RUN: cp %p/Inputs/dwarf.dwo %t-input.dwo
+# RUN: touch -m -t 199705050555.55 %t-input.dwo
+# RUN: llvm-objcopy -p -split-dwo=%t-dwo %t-input.dwo %t-nondwo
+# RUN: ls -l %t-dwo | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
+# RUN: llvm-readobj %t-dwo
+# RUN: ls -l %t-nondwo | FileCheck %s --check-prefix=CHECK-PRESERVE-MTIME
+# RUN: llvm-readobj %t-nondwo
+
+# CHECK-PRESERVE-MTIME:        {{[[:space:]]1997}}
+# CHECK-NO-PRESERVE-MTIME-NOT: {{[[:space:]]1997}}
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
diff --git a/test/tools/llvm-objcopy/strip-reloc-symbol.test b/test/tools/llvm-objcopy/ELF/strip-reloc-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-reloc-symbol.test
rename to test/tools/llvm-objcopy/ELF/strip-reloc-symbol.test
diff --git a/test/tools/llvm-objcopy/strip-sections-keep.test b/test/tools/llvm-objcopy/ELF/strip-sections-keep.test
similarity index 78%
rename from test/tools/llvm-objcopy/strip-sections-keep.test
rename to test/tools/llvm-objcopy/ELF/strip-sections-keep.test
index dcf696853824b327c3e056b463db3dbe902c396d..f0031faef6e778e18e5ffcc35571bf76487303cc 100644
--- a/test/tools/llvm-objcopy/strip-sections-keep.test
+++ b/test/tools/llvm-objcopy/ELF/strip-sections-keep.test
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -strip-sections -keep=.shstrtab %t %t2
+# RUN: llvm-objcopy -strip-sections -keep-section=.shstrtab %t %t2
 # RUN: od -Ax -t c %t2 | FileCheck %s
 
 !ELF
diff --git a/test/tools/llvm-objcopy/strip-sections-only-keep.test b/test/tools/llvm-objcopy/ELF/strip-sections-only-section.test
similarity index 87%
rename from test/tools/llvm-objcopy/strip-sections-only-keep.test
rename to test/tools/llvm-objcopy/ELF/strip-sections-only-section.test
index 2c9400cf34c14279bbda7537756ab3c750e515e3..2c14a7272e3ee1ff68001c92ba81683bcfbd0f94 100644
--- a/test/tools/llvm-objcopy/strip-sections-only-keep.test
+++ b/test/tools/llvm-objcopy/ELF/strip-sections-only-section.test
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy -strip-sections -only-keep=.test %t %t2
+# RUN: llvm-objcopy -strip-sections -only-section=.test %t %t2
 # RUN: od -Ax -t x1 %t2 | FileCheck %s
 # RUN: od -Ax -t c  %t2 | FileCheck %s -check-prefix=TEXT
 
diff --git a/test/tools/llvm-objcopy/strip-sections.test b/test/tools/llvm-objcopy/ELF/strip-sections.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-sections.test
rename to test/tools/llvm-objcopy/ELF/strip-sections.test
diff --git a/test/tools/llvm-objcopy/strip-symbol.test b/test/tools/llvm-objcopy/ELF/strip-symbol.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-symbol.test
rename to test/tools/llvm-objcopy/ELF/strip-symbol.test
diff --git a/test/tools/llvm-objcopy/strip-unneeded.test b/test/tools/llvm-objcopy/ELF/strip-unneeded.test
similarity index 100%
rename from test/tools/llvm-objcopy/strip-unneeded.test
rename to test/tools/llvm-objcopy/ELF/strip-unneeded.test
diff --git a/test/tools/llvm-objcopy/strip-version.test b/test/tools/llvm-objcopy/ELF/strip-version.test
similarity index 50%
rename from test/tools/llvm-objcopy/strip-version.test
rename to test/tools/llvm-objcopy/ELF/strip-version.test
index bd91dd3de831601441396b60342a69e79d5cee9a..d1d3165106a5c6756d7ad2c4b7177e0e8c255905 100644
--- a/test/tools/llvm-objcopy/strip-version.test
+++ b/test/tools/llvm-objcopy/ELF/strip-version.test
@@ -1,4 +1,6 @@
 # RUN: llvm-strip -version | FileCheck %s
 # RUN: llvm-strip --version | FileCheck %s
+# RUN: llvm-strip -V | FileCheck %s
 
-# CHECK: {{ version }}
+# CHECK-DAG: {{ version }}
+# CHECK-DAG: GNU strip
diff --git a/test/tools/llvm-objcopy/symbol-copy.test b/test/tools/llvm-objcopy/ELF/symbol-copy.test
similarity index 100%
rename from test/tools/llvm-objcopy/symbol-copy.test
rename to test/tools/llvm-objcopy/ELF/symbol-copy.test
diff --git a/test/tools/llvm-objcopy/symtab-error-on-remove-strtab.test b/test/tools/llvm-objcopy/ELF/symtab-error-on-remove-strtab.test
similarity index 100%
rename from test/tools/llvm-objcopy/symtab-error-on-remove-strtab.test
rename to test/tools/llvm-objcopy/ELF/symtab-error-on-remove-strtab.test
diff --git a/test/tools/llvm-objcopy/symtab-link.test b/test/tools/llvm-objcopy/ELF/symtab-link.test
similarity index 100%
rename from test/tools/llvm-objcopy/symtab-link.test
rename to test/tools/llvm-objcopy/ELF/symtab-link.test
diff --git a/test/tools/llvm-objcopy/triple-overlap.test b/test/tools/llvm-objcopy/ELF/triple-overlap.test
similarity index 100%
rename from test/tools/llvm-objcopy/triple-overlap.test
rename to test/tools/llvm-objcopy/ELF/triple-overlap.test
diff --git a/test/tools/llvm-objcopy/two-seg-remove-end.test b/test/tools/llvm-objcopy/ELF/two-seg-remove-end.test
similarity index 100%
rename from test/tools/llvm-objcopy/two-seg-remove-end.test
rename to test/tools/llvm-objcopy/ELF/two-seg-remove-end.test
diff --git a/test/tools/llvm-objcopy/two-seg-remove-first.test b/test/tools/llvm-objcopy/ELF/two-seg-remove-first.test
similarity index 100%
rename from test/tools/llvm-objcopy/two-seg-remove-first.test
rename to test/tools/llvm-objcopy/ELF/two-seg-remove-first.test
diff --git a/test/tools/llvm-objcopy/two-seg-remove-third-sec.test b/test/tools/llvm-objcopy/ELF/two-seg-remove-third-sec.test
similarity index 100%
rename from test/tools/llvm-objcopy/two-seg-remove-third-sec.test
rename to test/tools/llvm-objcopy/ELF/two-seg-remove-third-sec.test
diff --git a/test/tools/llvm-objcopy/weaken-all.test b/test/tools/llvm-objcopy/ELF/weaken-all.test
similarity index 100%
rename from test/tools/llvm-objcopy/weaken-all.test
rename to test/tools/llvm-objcopy/ELF/weaken-all.test
diff --git a/test/tools/llvm-objcopy/weaken.test b/test/tools/llvm-objcopy/ELF/weaken.test
similarity index 100%
rename from test/tools/llvm-objcopy/weaken.test
rename to test/tools/llvm-objcopy/ELF/weaken.test
diff --git a/test/tools/llvm-objdump/Inputs/eh_frame-coff.yaml b/test/tools/llvm-objdump/Inputs/eh_frame-coff.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8fcd001b070b68ca5a54aec90903b235570eda04
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/eh_frame-coff.yaml
@@ -0,0 +1,104 @@
+--- !COFF
+OptionalHeader:  
+  AddressOfEntryPoint: 5056
+  ImageBase:       4194304
+  SectionAlignment: 4096
+  FileAlignment:   512
+  MajorOperatingSystemVersion: 6
+  MinorOperatingSystemVersion: 0
+  MajorImageVersion: 0
+  MinorImageVersion: 0
+  MajorSubsystemVersion: 6
+  MinorSubsystemVersion: 0
+  Subsystem:       IMAGE_SUBSYSTEM_WINDOWS_CUI
+  DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_NO_SEH, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ]
+  SizeOfStackReserve: 1048576
+  SizeOfStackCommit: 4096
+  SizeOfHeapReserve: 1048576
+  SizeOfHeapCommit: 4096
+  ExportTable:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  ImportTable:     
+    RelativeVirtualAddress: 13032
+    Size:            240
+  ResourceTable:   
+    RelativeVirtualAddress: 0
+    Size:            0
+  ExceptionTable:  
+    RelativeVirtualAddress: 0
+    Size:            0
+  CertificateTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  BaseRelocationTable: 
+    RelativeVirtualAddress: 32768
+    Size:            756
+  Debug:           
+    RelativeVirtualAddress: 0
+    Size:            0
+  Architecture:    
+    RelativeVirtualAddress: 0
+    Size:            0
+  GlobalPtr:       
+    RelativeVirtualAddress: 0
+    Size:            0
+  TlsTable:        
+    RelativeVirtualAddress: 12296
+    Size:            24
+  LoadConfigTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  BoundImport:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  IAT:             
+    RelativeVirtualAddress: 13568
+    Size:            296
+  DelayImportDescriptor: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  ClrRuntimeHeader: 
+    RelativeVirtualAddress: 0
+    Size:            0
+header:          
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE ]
+sections:        
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  4096
+    VirtualSize:     5921
+    SectionData:     0FB70500004000C705DC43400001000000C705E043400001000000C705E443400001000000C705B4404000010000003D4D5A00007546A13C00400081B8000040005045000075350FB7881800400081F90B020000741D0FB7C981F90B010000751B83B8740040000F721283B8E800400000EB5E83B8840040000F734E31C056833DEC434000000FB6C0A38040400074046A02EB026A01E8EA15000083C4048B35FC434000E85E1500008930E8F0040000833D1440400001750D68D0174000E8FD06000083C40431C05EC383B8F8004000000F95C0EBA8662E0F1F840000000000A1D4434000A3844040006884404000FF35B04040006890404000688C4040006888404000E8E70B000083C414C30F1F00C705EC43400001000000E891040000E90C000000662E0F1F84000000000066905589E553575683E4F083EC60833DEC434000000F57C0C7442450000000000F294424400F294424300F294424200F29442410740B8D44241050FF15C035400064A11800000031F68B780431C0F00FB13DAC404000742A8B1DDC3540000F1F400039C7741768E8030000FFD331C031F6F00FB13DAC40400075E7EB05BE01000000A1A840400083F801750C6A1FE80F0C000083C404EB2E833DA8404000007409C6059440400001EB1CC705A84040000100000068C432400068B8324000E80414000083C408A1A840400083F801751C68AC32400068A4324000E8E813000083C408C705A84040000200000085F6750831C08705AC404000A12030400085C074086A006A026A00FFD0E83406000068201C4000FF15D8354000A3004440006800144000E8A513000083C404E8E2050000C7059C40400000004000E87C1300008B0085C0745031C9EB17660F1F84000000000031DB85C90F94C380FA220F44CB400FB61080FA207FEA85C9740484D275E284D2741C80FA207F17660F1F8400000000000FB648014084C9740580F9217CF2A3A0404000833DEC4340000074198B44243C0FB7542440B90A000000A8010F45CA890D004040008B35884040008D04B50400000050E8E512000083C40485F67E4D8B1D8C4040008974240C31F68944240890FF34B3E8F512000083C40489C74757E8B912000083C4048B4C24088904B157FF34B350E8AB12000083C40C463974240C75CE8B4C240C8B442408EB0231C9C7048800000000A38C404000E8C1010000A1904040008B0D1C4040008901FF3590404000FF358C404000FF3588404000E88D00000083C40C833D8040400000A398404000741B803D9440400000750AE84D120000A1984040008D65F45E5F5B5DC350E84C120000662E0F1F84000000000090C705EC43400000000000E8E1010000E95CFDFFFF662E0F1F8400000000006690FF742404E8C709000083C40483F80119C0C3662E0F1F8400000000000F1F4000C3CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5589E583EC288B450C8B4D088945F0894DECE8F9000000C745FC0000000089E0C70004000000E88411000089C18945E8E85B110000A10435400089E18941048B45E88901C74108B9254000E87111000083EC0CE9200000008945F88955F48B45F8890424E84C1100008945E4E84A11000031C083C4285DC3CCCCCCCCCCCCCCCCA1044040008B0085C0741C0F1F440000FFD0A1044040008D4804890D044040008B400485C075E9C30F1F840000000000568B35D832400083FEFF7521BEFFFFFFFF662E0F1F8400000000000F1F440000833CB5E0324000008D760175F385F67419662E0F1F8400000000000F1F440000FF14B5D83240004E75F66890144000E8CCFEFFFF83C4045EC30F1F8000000000803DA440400000755F568B35D8324000C605A44040000183FEFF7521BEFFFFFFFF662E0F1F8400000000000F1F440000833CB5E0324000008D760175F385F67419662E0F1F8400000000000F1F440000FF14B5D83240004E75F66890144000E85CFEFFFF83C4045EC3CCCCCCCCCCCCCCFF2520364000CCCCCCCCCCCCCCCCCCCC31C0C3CCCCCCCCCCCCCCCCCCCCCCCCCC5589E553575683E4F883EC18A174404000C744240400000000C70424000000003D4EE640BB755489E050FF15C43540008B5C2404331C24FF15B435400089C6FF15B835400089C731F7FF15C835400089C68D44240831FE31DE50FF15D435400033742408B84FE640BB3374240C81FE4EE640BB0F45C6A374404000F7D0A3784040008D65F45E5F5B5DC3660F1F4400005589E583EC088B550489E88B4D0883C004891570414000A37C414000A174404000891590434000C70584434000090400C0890D64414000C70588434000010000008945F8A1784040008945FC6A00FF15D83540006800304000FF15E8354000FF15B035400068090400C050FF15E0354000E89F0F0000CCCCCCCCCCCCCCCCCCCC57568B442410833D6840400002740AC705684040000200000083F801743383F8027540BEB432400081FEB43240007433BFB4324000662E0F1F840000000000908B0685C07402FFD083C60439F775F1EB12FF7424146A01FF742414E8A008000083C40CB8010000005E5FC20C000F1F0031C0C3662E0F1F8400000000000F1F008B44240883F803740485C07511FF74240C50FF74240CE86508000083C40CB801000000C20C00CCCCCCCCCCCCCCCCCCCCA1E843400085C074425589E583E4F883EC208B55088B4D0CF20F104520F20F104D18F20F105510891424894C240489E1F20F11542408F20F114C2410F20F1144241851FFD083C40489EC5DC30F1F40008B442404A3E8434000E98D0E00006690575683EC288B7C2434BE093140008B078D48FF83F90577078B34854C314000C7042402000000E8660E00008B4F04F20F104708F20F104F10F20F10571889742408890424C744240417314000F20F11542420F20F114C2418F20F11442410894C240CE8F908000031C083C4285E5FC3CCDBE3C3CCCCCCCCCCCCCCCCCCCCCCCCCC8B442404C3662E0F1F840000000000908B442404C3CCCCCCCCCCCCCCCCCCCCCC5589E553575683E4F083EC3089E6803DF0434000000F8502010000C605F043400001E8590A0000C1E0028D0480E89E0C000089E083E0F089C4A3F4434000BBA4324000B8A4324000C705F84340000000000029D883F8070F8EC000000083F80C7C2F833DA432400000BBA43240007521833DA832400000BBA43240007513833DAC32400000BBA43240007505BBB0324000833B00750A837B04000F848500000081FBA432400073338D7E180F1F4400008B53048B030382000040008D8A0000400089FA8946186A04E85301000083C40483C30881FBA432400072D5A1F843400085C07E3931FFBB080000000F1F4400008B0DF44340008B5419F885D274178D461C5052FF3419FF7419FCFF15EC354000A1F84340004783C31439C77CD38D65F45E5F5B5DC38B430883F8010F85E100000083C30C81FBA4324000739FB800004000F7D8894614662E0F1F8400000000008B4B04B8000040008B7E148B530801C18B0329C78B800000400080FA20746D80FA10743880FA080F85870000000FB6118956088B5608807E08008D9200FFFFFF89560C8B560C0F49560801F889560C8D561003460C8946106A01EB3C0FB7118956088B560866837E08008D920000FFFF89560C8B560C0F49560801F889560C8D561003460C8946106A02EB0C01F88D561003018946106A04E83300000083C40483C30C81FBA43240000F8251FFFFFFE9D7FEFFFF0FB6C250689A314000E84E010000506868314000E8430100000F1F005553575683EC20A1F843400089D789CE85C07E2A8B0DF443400031ED83C110908B51FC39F2770D8B1903530839F20F87BA0000004583C11439C57CE4EB0231ED893C2456E89707000083C40485C00F84AD00000089C3A1F44340008D0CAD000000008D3C89895C3810C7043800000000E89B08000003430C8B0DF44340008944390C8D4C24046A1C5150FF15F035400085C074788B4424188B3C2483F83F7F0C83F804744383F808743EEB0C83F84074373D8000000074308B4424048B0DF44340008D54AD00894491048D3C918B5C2410895C9108578B7C24046A405350FF15EC35400085C0743EFF05F8434000FF7424345756E85A0A000083C42C5E5F5B5DC35668C4314000E834000000A1F44340008D4CAD00FF74880CFF730868E4314000E81A000000FF15BC354000506815324000E809000000660F1F84000000000056508B74240C8D4424108904246A02E87D0A000083C404506A016A1B683C324000E8830A000083C4106A02E8610A000083C404FF34245650E87309000083C40CE8400A0000CCCCCCCCCCCCCCCCCCCCCC57568B44240C8B008B008D8873FFFF3F83F909773F31FFFF248D58324000BF010000006A006A08E8330A000083C40885C0747183F80175456A016A08E81E0A000083C408BEFFFFFFFF85FF7478E8CEFBFFFFEB713D1D0000C074263D050000C075426A006A0BE8F409000083C40885C0743283F80175426A016A0BEB1D6A08EB3A6A006A04E8D509000083C40885C0741383F801751F6A016A04E8C009000083C408EB1CA10044400085C074045E5FFFE031F6EB106A04EB026A0BFFD083C404BEFFFFFFFF89F05E5FC20400CCCCCCCC555357568B7424248B7C241C8B5C24188B6C2414E882090000837C242001B80200000083D80050E87509000083C404E8730900008B00894500E86F0900008B008903E86C0900008B008907FF36E84309000083C40431C05E5F5B5DC30F1F4000555357568B7424248B7C241C8B5C24188B6C2414E840090000837C242001B80200000083D80050E83309000083C404E8130900008B00894500E8270900008B008903E8240900008B008907FF36E8E308000083C40431C05E5F5B5DC30F1F4000568B74240856E80609000083C40431C985C00F44CE89C85EC30F1F8000000000568B7424086A02E88508000083C40456688032400050E84503000083C40C5EC331C0C3662E0F1F8400000000000F1F00FF1518364000E8BC080000A32C404000E8B8080000A334404000E8B4080000A33C404000C3662E0F1F84000000000090FF1518364000E88C080000A32C404000E888080000A334404000E884080000A33C404000C3662E0F1F84000000000090508B4424088B4C240C8D542410891424526A0051506A006A00E85B08000083C41859C3662E0F1F8400000000000F1F0056508B44240C8B4C24108B5424148D742418893424566A005251506A006A02E82B08000083C4205EC30F1F8000000000508B4424088B4C240C8D542410891424526A0051506A006A04E8FB07000083C41859C3CCCCCCCCCCCCCCCCCCCCCCCCCC575631F6833D0C44400000744A6A0C6A01E8E507000083C40885C0743589C78B4C240C8B442410890F8947046810444000FF15AC354000A128444000893D284440008947086810444000FF15D0354000EB05BEFFFFFFFF89F05E5FC30F1F4000833D0C44400000745F566810444000FF15AC3540008B0D2844400085C9743D8B5424088B0131F639D0750589C8EB12908B410885C074258B3039D689CE89C175EF8B480885F67405894E08EB06890D2844400050E85407000083C4046810444000FF15D03540005E31C0C30F1F440000555357568B44241883F8030F874A010000FF248594324000833D0C44400000745A6810444000FF15AC3540008B3D2844400085FF743A8B1DE43540008B2DBC354000662E0F1F8400000000000F1F4000FF37FFD389C6FFD585F6740D85C075098B470456FFD083C4048B7F0885FF75E06810444000FF15D0354000A10C44400083F8010F85D2000000A12844400085C07420662E0F1F8400000000000F1F40008B700850E89406000083C40485F689F075EEC7052844400000000000C7050C444000000000006810444000FF15A8354000E985000000833D0C44400000750B6810444000FF15CC354000C7050C44400001000000EB65E885F7FFFFEB5E833D0C4440000074556810444000FF15AC3540008B3D2844400085FF74358B1DE43540008B2DBC354000660F1F840000000000FF37FFD389C6FFD585F6740D85C075098B470456FFD083C4048B7F0885FF75E06810444000FF15D0354000B8010000005E5F5B5DC3CCCCCCCCCCCCCCCCCCCCCC508B4424088B4C240C8D542410891424526A0051506A006A00E8C705000083C41859C3CCCCCCCCCCCCCCCCCCCCCCCCCC8B4C240431C00FB71181FA4D5A0000751C8B513C813C115045000075100FB74C111831C081F90B0100000F94C0C3669057568B44240C8B703C0FB74C300685C9743401F08B5424100FB770148D44301831F6662E0F1F8400000000000F1F40008B780C39D7770703780839D7770A4683C02839CE72EA31C05E5FC30F1F440000555357568B7C241457E81F04000083C40431F683F80877620FB705000040003D4D5A00007554A13C00400081B8000040005045000075438D80000040000FB7481881F90B01000075310FB7580685DB74290FB7481431ED8D7408180F1F4400006A085756E8D204000083C40C85C0740A4583C62839DD72E831F689F05E5F5B5DC3662E0F1F8400000000000F1F44000057560FB70D0000400031C081F94D5A000075658B0D3C00400081B9000040005045000075538DB1000040000FB74E1881F90B01000075410FB74E0685C974398B54240CB80000400029C20FB746148D44061831F6662E0F1F84000000000066908B780C39D7770703780839D7770A4683C02839CE72EA31C05E5FC30F1F4400000FB70D0000400031C081F94D5A000075288B0D3C00400081B9000040005045000075160FB7911800400081FA0B01000075070FB78106004000C3660F1F440000560FB70D0000400031C081F94D5A000075538B0D3C00400081B9000040005045000075418DB1000040000FB74E1881F90B010000752F0FB74E0685C974270FB746148B5424088D44061831F60F1F4000F6402720740585D2740B4A4683C02839CE72ED31C05EC3660F1F8400000000000FB705000040003D4D5A000075268B0D3C00400081B9000040005045000075140FB78918004000B80000400081F90B010000740231C0C3660F1F84000000000057560FB70D0000400031C081F94D5A0000756F8B0D3C00400081B90000400050450000755D8D91000040000FB74A1881F90B010000754B0FB74A0685C974438B44240CBE0000400029F00FB772148D54322031F6662E0F1F84000000000066908B7A0439C77706033A39C7770C4683C22839CE72EB31C0EB098B421CC1E81F83F0015E5FC3662E0F1F840000000000905357560FB70D0000400031C081F94D5A00000F85B20000008B0D3C00400081B900004000504500000F859C0000008DB9000040000FB74F1881F90B0100000F85860000008B978000000085D2747C0FB7770685F674740FB747148B4C24108D44072431FF662E0F1F84000000000066908B1839D377070358FC39D3770C4783C02839F772EB31C0EB4189D631C081C60000400074358D920C004000837AF800751EEB17662E0F1F8400000000000F1F004983C214837AF8007505833A00740B85C97FEDB80000400003025E5F5BC3CCCC513D001000008D4C2408721481E90010000085092D001000003D0010000077EC29C185098D44240489CC8B48FCFF3029E0C3CCCCCCCCCCCCCCCCCCCCCCCCCCCCFF74240C6A00FF742410FF7424106A006A00E87E01000083C418C3CCCCCCCCCC5589E550A10835400083C008894DFC8B4DFC890183C4045DC3FF2500354000FF250C354000FF2510354000FF2514354000FF2518354000FF251C354000FF252C354000FF2534354000FF2548354000FF254C354000FF2564354000FF256C354000FF2574354000FF2584354000FF259C354000FF25C0354000FF25D8354000FF25DC354000FF25C4354000FF25B4354000FF25B8354000FF25C8354000FF25D4354000FF25E8354000FF25B0354000FF25E0354000FF2570354000FF25F8354000FF2580354000FF25EC354000FF25F0354000FF25BC354000FF2594354000FF2578354000FF2568354000FF255C354000FF2550354000FF253C354000FF2540354000FF2500364000FF2560354000FF2554354000FF2544354000FF2504364000FF2558354000FF2514364000FF2510364000FF250C364000FF258C354000FF2590354000FF2520364000FF2524354000FF25AC354000FF25D0354000FF2528354000FF25E4354000FF25A8354000FF25CC354000FF2588354000FF25A0354000
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  12288
+    VirtualSize:     3074
+    SectionData:     84434000B84040000070400004704000D8434000CC3240000000000000000000C0164000000000000000000000000000417267756D656E7420646F6D61696E206572726F722028444F4D41494E2900417267756D656E742073696E67756C617269747920285349474E29004F766572666C6F772072616E6765206572726F7220284F564552464C4F5729005061727469616C206C6F7373206F66207369676E69666963616E63652028504C4F53532900546F74616C206C6F7373206F66207369676E69666963616E63652028544C4F5353290054686520726573756C7420697320746F6F20736D616C6C20746F20626520726570726573656E7465642028554E444552464C4F572900556E6B6E6F776E206572726F72005F6D61746865727228293A20257320696E2025732825672C2025672920202872657476616C3D2567290A000000000000000000000000000000303040004F3040006B304000D3304000B03040008B3040002020556E6B6E6F776E2070736575646F2072656C6F636174696F6E2070726F746F636F6C2076657273696F6E2025642E0A002020556E6B6E6F776E2070736575646F2072656C6F636174696F6E206269742073697A652025642E0A004164647265737320257020686173206E6F20696D6167652D73656374696F6E0020205669727475616C5175657279206661696C656420666F7220256420627974657320617420616464726573732025700020205669727475616C50726F74656374206661696C6564207769746820636F64652030782578004D696E67772D7736342072756E74696D65206661696C7572653A0A003E1C40003E1C40003E1C40003E1C40003E1C4000C41C40003E1C4000431C4000C41C4000A11C400072756E74696D65206572726F722025640A000000D81F400096204000B6204000BD20400000000000E01040000000000000000000000000000000000000104000901540000000000000000000C01640004017400000000000FFFFFFFF00000000FFFFFFFF00000000D83300000000000000000000BE3A000000350000FC3300000000000000000000CC3A0000243500000C3400000000000000000000EB3A0000343500001434000000000000000000000D3B00003C3500005834000000000000000000002F3B0000803500007434000000000000000000004F3B00009C350000803400000000000000000000703B0000A8350000D034000000000000000000007D3B0000F8350000D834000000000000000000009C3B000000360000E43400000000000000000000C23B00000C360000F83400000000000000000000E13B0000203600000000000000000000000000000000000000000000283600003E36000052360000663600008236000096360000A8360000B636000000000000CE360000D8360000E036000000000000EA36000000000000F436000002370000103700001E3700002C3700003637000050370000683700007637000098370000B8370000C4370000D4370000F6370000FE37000006380000000000001038000022380000303800004A3800006638000082380000000000008C3800009638000000000000A0380000B8380000D0380000E4380000FA3800001039000020390000323900004C3900005C3900007839000090390000AA390000C8390000D0390000E4390000F23900000E3A0000203A000000000000303A000000000000443A0000543A000000000000643A0000723A0000803A00008C3A000000000000963A000000000000283600003E36000052360000663600008236000096360000A8360000B636000000000000CE360000D8360000E036000000000000EA36000000000000F436000002370000103700001E3700002C3700003637000050370000683700007637000098370000B8370000C4370000D4370000F6370000FE37000006380000000000001038000022380000303800004A3800006638000082380000000000008C3800009638000000000000A0380000B8380000D0380000E4380000FA3800001039000020390000323900004C3900005C3900007839000090390000AA390000C8390000D0390000E4390000F23900000E3A0000203A000000000000303A000000000000443A0000543A000000000000643A0000723A0000803A00008C3A000000000000963A00000000000000005F5A4E537439657863657074696F6E443145760000005F5A5449537439657863657074696F6E000000005F5A5456537439657863657074696F6E000000005F5F6378615F616C6C6F636174655F657863657074696F6E000000005F5F6378615F626567696E5F63617463680000005F5F6378615F656E645F63617463680000005F5F6378615F7468726F770000005F5F6778785F706572736F6E616C6974795F76300000000063616C6C6F630000000066726565000000006D616C6C6F63000000006D656D637079000000005F5F705F5F5F61726763000000005F5F705F5F5F61726776000000005F5F705F5F5F77617267760000005F5F705F5F61636D646C6E0000005F6365786974000000005F636F6E6669677572655F6E6172726F775F61726776000000005F636F6E6669677572655F776964655F61726776000000005F6372745F6174657869740000005F696E697469616C697A655F6E6172726F775F656E7669726F6E6D656E74000000005F696E697469616C697A655F776964655F656E7669726F6E6D656E74000000005F696E69747465726D0000005F7365745F6170705F747970650000005F7365745F696E76616C69645F706172616D657465725F68616E646C65720000000061626F727400000065786974000000007369676E616C000000005F5F616372745F696F625F66756E630000005F5F705F5F666D6F6465000000005F5F737464696F5F636F6D6D6F6E5F76667072696E74660000005F5F737464696F5F636F6D6D6F6E5F7666777072696E7466000000005F5F737464696F5F636F6D6D6F6E5F7673777072696E746600000000667772697465000000007374726C656E000000007374726E636D7000000044656C657465437269746963616C53656374696F6E000000456E746572437269746963616C53656374696F6E0000000047657443757272656E7450726F6365737300000047657443757272656E7450726F63657373496400000047657443757272656E745468726561644964000000004765744C6173744572726F720000000047657453746172747570496E666F4100000047657453797374656D54696D65417346696C6554696D650000004765745469636B436F756E7400000000496E697469616C697A65437269746963616C53656374696F6E0000004C65617665437269746963616C53656374696F6E000000005175657279506572666F726D616E6365436F756E746572000000536574556E68616E646C6564457863657074696F6E46696C746572000000536C6565700000005465726D696E61746550726F6365737300000000546C7347657456616C7565000000556E68616E646C6564457863657074696F6E46696C746572000000005669727475616C50726F74656374000000005669727475616C5175657279000000005F5F736574757365726D617468657272000000005F5F705F5F656E7669726F6E000000005F5F705F5F77656E7669726F6E0000005F5F6461796C69676874000000005F5F74696D657A6F6E65000000005F5F747A6E616D65000000005F747A736574000000005F5F696E697469616C697A655F6C636F6E765F666F725F756E7369676E65645F6368617200006C6962632B2B6162692E646C6C006170692D6D732D77696E2D6372742D686561702D6C312D312D302E646C6C006170692D6D732D77696E2D6372742D707269766174652D6C312D312D302E646C6C006170692D6D732D77696E2D6372742D72756E74696D652D6C312D312D302E646C6C006170692D6D732D77696E2D6372742D737464696F2D6C312D312D302E646C6C006170692D6D732D77696E2D6372742D737472696E672D6C312D312D302E646C6C004B45524E454C33322E646C6C006170692D6D732D77696E2D6372742D6D6174682D6C312D312D302E646C6C006170692D6D732D77696E2D6372742D656E7669726F6E6D656E742D6C312D312D302E646C6C006170692D6D732D77696E2D6372742D74696D652D6C312D312D302E646C6C006170692D6D732D77696E2D6372742D6C6F63616C652D6C312D312D302E646C6C00
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    VirtualAddress:  16384
+    VirtualSize:     1072
+    SectionData:     0A000000E4324000FFFFFFFFFFFFFFFFFF000000FFFFFFFFB01D4000044440000844400060404000644040002440400080700000304040000100000038404000F01C4000501D4000D01D4000F01D4000301E4000601E4000901E4000C01E400050535400504454000200000030214000802540004EE640BBB119BF4400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+  - Name:            .eh_fram
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  20480
+    VirtualSize:     64
+    SectionData:     1C00000000000000017A504C5200017C080700D725400000000C0404880100001C0000002400000010144000780000000400604000410E088502420D05000000
+  - Name:            .gcc_exc
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  24576
+    VirtualSize:     24
+    SectionData:     FF0015010C00350000351E58015325000001000000000000
+  - Name:            .tls
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    VirtualAddress:  28672
+    VirtualSize:     8
+    SectionData:     '0000000000000000'
+  - Name:            .reloc
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  32768
+    VirtualSize:     756
+    SectionData:     001000007C0100000330093013301D30273037303D304A3063306C30753081308A30A030B230BA30CC30E130E630EB30F130F630FB30003112313E316B3180318831A331B131C831D131DA31E331E831F531FF3104321232223227323D32433248324D3260326432BF32C532E132E73201335633603366336E3374337A3388338E339633A333C233463457349134A334AC34C334E33403350B3522352C353235533573357B359235BD35DC35E935F135FB350C3627362E36533658365D3663366936733679368536903695369B36A136AD36C836D136E436EA36F1367137C537DA37EB37183880388D38AA38AF38B438BA38D438DA38E238E838F038F638FD38123927392D3945394C3962397C398139A639AD39C439D639653A793A843A983AA63AE73A0A3B1C3B4E3B703B7A3B933B9D3BAD3BB83BBE3BED3B3A3CC53CE13D023E0C3E163E203E323E3C3E463E503EF63E1D3F233F283F2E3F363F3C3F523F5B3F613F673F9F3FAD3FB33FD43FDA3FE23FE83FEE3FF83FFE3F000000200000F0000000313037303C304A3074307E3087308D309830A030A630AC30BF30C730CD30D330DD30E33011311731FB3107320D321932753285328B329732B432F3320333093316332533343344334A335633A333B033B633C333C833E533F533FB330734243476348A349034A034FF3407353435A535BB35C135C735CD35D335D935DF35E535EB35F135F735FD35033609360F3615361B36213627362D36333639363F3645364B36513657365D36633669366F3675367B36813687368D36933699369F36A536AB36B136B736BD36C336C936CF36D536DB36E136E736ED36F336F936FF3605370B37113717371D3700300000480000000030043008300C301030143020305031543158315C316031643158325C326032643268326C327032743278327C32943298329C32A032A832BC32C032CC32D0320040000030000000043018301C302030243028302C3034303C304030443048304C305030543058305C306C307030000000500000100000001330283031300000
+symbols:         []
+...
diff --git a/test/tools/llvm-objdump/WebAssembly/symbol-table.test b/test/tools/llvm-objdump/WebAssembly/symbol-table.test
index fff4c9fe52ca56473e5f02905b179bdffc5e1c20..b8455aaf5aea9239d703302ca052c6ac13dba59a 100644
--- a/test/tools/llvm-objdump/WebAssembly/symbol-table.test
+++ b/test/tools/llvm-objdump/WebAssembly/symbol-table.test
@@ -2,8 +2,8 @@ RUN: llvm-objdump -t %p/../Inputs/trivial.obj.wasm | FileCheck %s
 
 CHECK:      SYMBOL TABLE:
 CHECK-NEXT: 00000002 g     F CODE	main
-CHECK-NEXT: 00000000 l       DATA	.L.str
+CHECK-NEXT: 00000000 l     O DATA	.L.str
 CHECK-NEXT: 00000000 g     F *UND*	puts
 CHECK-NEXT: 00000003 l     F CODE	.LSomeOtherFunction_bitcast
 CHECK-NEXT: 00000000 g     F *UND*	SomeOtherFunction
-CHECK-NEXT: 00000010 g       DATA	var
+CHECK-NEXT: 00000010 g     O DATA	var
diff --git a/test/tools/llvm-objdump/X86/macho-symbol-table.test b/test/tools/llvm-objdump/X86/macho-symbol-table.test
index 19c619e73d074c86d16a10e4279101bc49d9b783..663edc926a94497d9e8895dcff0d5415c4c09548 100644
--- a/test/tools/llvm-objdump/X86/macho-symbol-table.test
+++ b/test/tools/llvm-objdump/X86/macho-symbol-table.test
@@ -1,8 +1,8 @@
 RUN: llvm-objdump -macho -t %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
 
-CHECK: SYMBOL TABLE:
-CHECK: 000000000000003b l       __TEXT,__cstring	L_.str
-CHECK: 0000000000000068 l       __TEXT,__eh_frame	EH_frame0
-CHECK: 0000000000000000 g     F __TEXT,__text	        _main
-CHECK: 0000000000000080 g       __TEXT,__eh_frame	_main.eh
-CHECK: 0000000000000000         *UND*	                _printf
+CHECK:      SYMBOL TABLE:
+CHECK-NEXT: 000000000000003b l     O __TEXT,__cstring	L_.str
+CHECK-NEXT: 0000000000000068 l     O __TEXT,__eh_frame	EH_frame0
+CHECK-NEXT: 0000000000000000 g     F __TEXT,__text	        _main
+CHECK-NEXT: 0000000000000080 g     O __TEXT,__eh_frame	_main.eh
+CHECK-NEXT: 0000000000000000         *UND*	                _printf
diff --git a/test/tools/llvm-objdump/common-symbol-elf.test b/test/tools/llvm-objdump/common-symbol-elf.test
index 32df05ac051b6219aed8935c9573751146e408d6..3ffea44f532078a46fea4f94af2374f1efbff724 100644
--- a/test/tools/llvm-objdump/common-symbol-elf.test
+++ b/test/tools/llvm-objdump/common-symbol-elf.test
@@ -1,3 +1,3 @@
 // RUN: llvm-objdump -t %p/Inputs/common-symbol-elf | FileCheck %s
 
-CHECK: 00000400 g       *COM*  00000008 common_symbol
+CHECK: 00000400 g     O *COM*  00000008 common_symbol
diff --git a/test/tools/llvm-objdump/eh_frame-coff.test b/test/tools/llvm-objdump/eh_frame-coff.test
new file mode 100644
index 0000000000000000000000000000000000000000..74eceeec4f31aed7bbcc1cbd1d8ba1a3ffff1842
--- /dev/null
+++ b/test/tools/llvm-objdump/eh_frame-coff.test
@@ -0,0 +1,28 @@
+# RUN: yaml2obj %p/Inputs/eh_frame-coff.yaml | llvm-objdump -dwarf=frames - 2>/dev/null | FileCheck %s
+
+# CHECK: .eh_frame contents:
+
+# CHECK: 00000000 0000001c ffffffff CIE
+# CHECK:   Version:               1
+# CHECK:   Augmentation:          "zPLR"
+# CHECK:   Code alignment factor: 1
+# CHECK:   Data alignment factor: -4
+# CHECK:   Return address column: 8
+# CHECK:   Personality Address: 004025d7
+# CHECK:   Augmentation data:     00 D7 25 40 00 00 00
+
+# CHECK:   DW_CFA_def_cfa: reg4 +4
+# CHECK:   DW_CFA_offset: reg8 -4
+# CHECK:   DW_CFA_nop:
+# CHECK:   DW_CFA_nop:
+
+# CHECK: 00000020 0000001c 00000024 FDE cie=00000024 pc=00401410...00401488
+# CHECK:   LSDA Address: 00406000
+# CHECK:   DW_CFA_advance_loc: 1
+# CHECK:   DW_CFA_def_cfa_offset: +8
+# CHECK:   DW_CFA_offset: reg5 -8
+# CHECK:   DW_CFA_advance_loc: 2
+# CHECK:   DW_CFA_def_cfa_register: reg5
+# CHECK:   DW_CFA_nop:
+# CHECK:   DW_CFA_nop:
+# CHECK:   DW_CFA_nop:
diff --git a/test/tools/llvm-objdump/full-contents.test b/test/tools/llvm-objdump/full-contents.test
index de0d584df324b60736e832e299411a3d22361805..facf1a3da7cfc7a0bb9c853cde8fa8b8f1b93f2d 100644
--- a/test/tools/llvm-objdump/full-contents.test
+++ b/test/tools/llvm-objdump/full-contents.test
@@ -45,3 +45,4 @@ Sections:
     Flags:           [ SHF_ALLOC ]
     AddressAlign:    0x0000000000000010
     Size:            32
+
diff --git a/test/tools/llvm-objdump/relocations-elf.test b/test/tools/llvm-objdump/relocations-elf.test
index a29b3e6a6fbadd64186c0b68c3a5f4325055e764..4cbd85ca3f1b267ddb723fa1a9ad91b5a93dc11a 100644
--- a/test/tools/llvm-objdump/relocations-elf.test
+++ b/test/tools/llvm-objdump/relocations-elf.test
@@ -1,6 +1,8 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objdump --reloc %t | FileCheck %s
-# RUN: llvm-objdump -r      %t | FileCheck %s
+# RUN: llvm-objdump --reloc %t > %t1
+# RUN: llvm-objdump -r      %t > %t2
+# RUN: cmp %t1 %t2
+# RUN: FileCheck %s --input-file=%t1
 
 # CHECK: RELOCATION RECORDS FOR [.rel.text]:
 # CHECK: 0000000000000001 R_X86_64_32 glob1
diff --git a/test/tools/llvm-objdump/symbol-table-elf.test b/test/tools/llvm-objdump/symbol-table-elf.test
index fc1eccdffb74d9c75437a026b3d2a43aeb7e1c89..abe339d7d536bd3fd7c3951003af297c26aa93fc 100644
--- a/test/tools/llvm-objdump/symbol-table-elf.test
+++ b/test/tools/llvm-objdump/symbol-table-elf.test
@@ -1,13 +1,15 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objdump --syms %t | FileCheck %s
-# RUN: llvm-objdump -t     %t | FileCheck %s
+# RUN: llvm-objdump --syms %t > %t1
+# RUN: llvm-objdump -t     %t > %t2
+# RUN: cmp %t1 %t2
+# RUN: FileCheck %s --input-file=%t1
 
 # CHECK:      SYMBOL TABLE:
 # CHECK-NEXT: 0000000000000000         *UND*     00000000
 # CHECK-NEXT: 0000000000001004 l     F .text     00000000 lfoo
-# CHECK-NEXT: 0000000000001008 l       .text     00000000 lbar
+# CHECK-NEXT: 0000000000001008 l     O .text     00000000 lbar
 # CHECK-NEXT: 0000000000001004 g     F .text     00000000 foo
-# CHECK-NEXT: 0000000000001008 g       .text     00000000 bar
+# CHECK-NEXT: 0000000000001008 g     O .text     00000000 bar
 
 !ELF
 FileHeader:
@@ -45,3 +47,4 @@ Symbols:
        Type:     STT_OBJECT
        Section:  .text
        Value:    0x1008
+
diff --git a/test/tools/llvm-rc/Inputs/empty.rc b/test/tools/llvm-rc/Inputs/empty.rc
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/test/tools/llvm-rc/Inputs/not-expr.rc b/test/tools/llvm-rc/Inputs/not-expr.rc
new file mode 100644
index 0000000000000000000000000000000000000000..1b5c8177393e3499970c6043a6eb07f7920e58d8
Binary files /dev/null and b/test/tools/llvm-rc/Inputs/not-expr.rc differ
diff --git a/test/tools/llvm-rc/Inputs/tag-dialog-headers.rc b/test/tools/llvm-rc/Inputs/tag-dialog-headers.rc
index 65bc24342fd6d4af32aca598f392bda8b0295095..3a71f233a148128068713119dab54c6972197613 100644
Binary files a/test/tools/llvm-rc/Inputs/tag-dialog-headers.rc and b/test/tools/llvm-rc/Inputs/tag-dialog-headers.rc differ
diff --git a/test/tools/llvm-rc/absolute.test b/test/tools/llvm-rc/absolute.test
new file mode 100644
index 0000000000000000000000000000000000000000..5e8a6c8c33663f4043c972017c8b733623c4c274
--- /dev/null
+++ b/test/tools/llvm-rc/absolute.test
@@ -0,0 +1,3 @@
+; RUN: touch %t.manifest
+; RUN: echo "1 24 \"%t.manifest\"" > %t.rc
+; RUN: llvm-rc %t.rc
diff --git a/test/tools/llvm-rc/flags.test b/test/tools/llvm-rc/flags.test
new file mode 100644
index 0000000000000000000000000000000000000000..452e90a406bbab5c78d1029a3846c462bfc7c0c8
--- /dev/null
+++ b/test/tools/llvm-rc/flags.test
@@ -0,0 +1,4 @@
+; RUN: llvm-rc /dry-run /FO %t %p/Inputs/empty.rc 2>&1 | FileCheck %s --allow-empty --check-prefix=FO
+; RUN: llvm-rc /dry-run /FO%t %p/Inputs/empty.rc 2>&1 | FileCheck %s --allow-empty --check-prefix=FO
+
+; FO-NOT: Exactly one input file should be provided.
diff --git a/test/tools/llvm-rc/not-expr.test b/test/tools/llvm-rc/not-expr.test
new file mode 100644
index 0000000000000000000000000000000000000000..c60223425c7157e7d1199e2702dc914c8f1683d7
--- /dev/null
+++ b/test/tools/llvm-rc/not-expr.test
@@ -0,0 +1,20 @@
+; RUN: llvm-rc /FO %t %p/Inputs/not-expr.rc
+; RUN: llvm-readobj %t | FileCheck %s --check-prefix=NOTEXPR
+
+; NOTEXPR: Resource type (int): 5
+; NOTEXPR-NEXT: Resource name (string): NOT
+; NOTEXPR-NEXT: Data version: 0
+; NOTEXPR-NEXT: Memory flags: 0x1030
+; NOTEXPR-NEXT: Language ID: 1033
+; NOTEXPR-NEXT: Version (major): 0
+; NOTEXPR-NEXT: Version (minor): 0
+; NOTEXPR-NEXT: Characteristics: 0
+; NOTEXPR-NEXT: Data size: 96
+; NOTEXPR-NEXT: Data: (
+; NOTEXPR-NEXT:   0000: 0100FFFF 04000000 F7FFFFFF 03FF00FF  |................|
+; NOTEXPR-NEXT:   0010: 02000000 02003C00 03000000 00000000  |......<.........|
+; NOTEXPR-NEXT:   0020: 00000000 00000000 22000050 00000A00  |........"..P....|
+; NOTEXPR-NEXT:   0030: 00000000 00000000 FFFF8000 00000000  |................|
+; NOTEXPR-NEXT:   0040: 00000000 00000000 01000050 00000000  |...........P....|
+; NOTEXPR-NEXT:   0050: 00000000 01000000 FFFF8000 00000000  |................|
+; NOTEXPR-NEXT: )
diff --git a/test/tools/llvm-rc/tag-dialog.test b/test/tools/llvm-rc/tag-dialog.test
index 36fcb6d7360d9549120831f066fe9e19a609ffdd..85a8c20edc7d647dd497e2f7eeb9b5fc0bbd4b38 100644
--- a/test/tools/llvm-rc/tag-dialog.test
+++ b/test/tools/llvm-rc/tag-dialog.test
@@ -579,6 +579,62 @@
 ; HEADERS-NEXT:   0010: 05000000 FFFF2A00 0000               |......*...|
 ; HEADERS-NEXT: )
 
+; HEADERS-DAG: Resource type (int): 5
+; HEADERS-NEXT: Resource name (int): 29
+; HEADERS-NEXT: Data version: 0
+; HEADERS-NEXT: Memory flags: 0x1030
+; HEADERS-NEXT: Language ID: 1033
+; HEADERS-NEXT: Version (major): 0
+; HEADERS-NEXT: Version (minor): 0
+; HEADERS-NEXT: Characteristics: 0
+; HEADERS-NEXT: Data size: 32
+; HEADERS-NEXT: Data: (
+; HEADERS-NEXT:   0000: 0100FFFF 00000000 67452301 00008880  |........gE#.....|
+; HEADERS-NEXT:   0010: 00000000 01000000 09000000 00000000  |................|
+; HEADERS-NEXT: )
+
+; HEADERS-DAG: Resource type (int): 5
+; HEADERS-NEXT: Resource name (int): 30
+; HEADERS-NEXT: Data version: 0
+; HEADERS-NEXT: Memory flags: 0x1030
+; HEADERS-NEXT: Language ID: 1033
+; HEADERS-NEXT: Version (major): 0
+; HEADERS-NEXT: Version (minor): 0
+; HEADERS-NEXT: Characteristics: 0
+; HEADERS-NEXT: Data size: 24
+; HEADERS-NEXT: Data: (
+; HEADERS-NEXT:   0000: 00008880 67452301 00000000 01000000  |....gE#.........|
+; HEADERS-NEXT:   0010: 09000000 00000000                    |........|
+; HEADERS-NEXT: )
+
+; HEADERS-DAG: Resource type (int): 5
+; HEADERS-NEXT: Resource name (int): 31
+; HEADERS-NEXT: Data version: 0
+; HEADERS-NEXT: Memory flags: 0x1030
+; HEADERS-NEXT: Language ID: 1033
+; HEADERS-NEXT: Version (major): 0
+; HEADERS-NEXT: Version (minor): 0
+; HEADERS-NEXT: Characteristics: 0
+; HEADERS-NEXT: Data size: 32
+; HEADERS-NEXT: Data: (
+; HEADERS-NEXT:   0000: 0100FFFF 00000000 67452301 10325476  |........gE#..2Tv|
+; HEADERS-NEXT:   0010: 00000000 01000000 09000000 00000000  |................|
+; HEADERS-NEXT: )
+
+; HEADERS-DAG: Resource type (int): 5
+; HEADERS-NEXT: Resource name (int): 32
+; HEADERS-NEXT: Data version: 0
+; HEADERS-NEXT: Memory flags: 0x1030
+; HEADERS-NEXT: Language ID: 1033
+; HEADERS-NEXT: Version (major): 0
+; HEADERS-NEXT: Version (minor): 0
+; HEADERS-NEXT: Characteristics: 0
+; HEADERS-NEXT: Data size: 24
+; HEADERS-NEXT: Data: (
+; HEADERS-NEXT:   0000: 10325476 56341200 00000000 01000000  |.2TvV4..........|
+; HEADERS-NEXT:   0010: 09000000 00000000                    |........|
+; HEADERS-NEXT: )
+
 
 ; RUN: not llvm-rc /FO %t %p/Inputs/tag-dialog-large-coord.rc 2>&1 | FileCheck %s --check-prefix COORD1
 
diff --git a/test/tools/llvm-readobj/all.test b/test/tools/llvm-readobj/all.test
new file mode 100644
index 0000000000000000000000000000000000000000..b8ec174d5aa3f7fc81647e8dcdf9605635bea3f5
--- /dev/null
+++ b/test/tools/llvm-readobj/all.test
@@ -0,0 +1,15 @@
+RUN: llvm-readobj -a %p/Inputs/trivial.obj.elf-i386 \
+RUN:   | FileCheck %s -check-prefix ALL
+RUN: llvm-readobj --all %p/Inputs/trivial.obj.elf-i386 \
+RUN:   | FileCheck %s -check-prefix ALL
+
+ALL: Format: ELF32-i386
+ALL: Arch: i386
+ALL: AddressSize: 32bit
+ALL: LoadName:
+ALL: ElfHeader {
+ALL: Sections [
+ALL: Relocations [
+ALL: Symbols [
+ALL: ProgramHeaders [
+ALL: Notes [
diff --git a/test/tools/llvm-readobj/gnu-note-size.test b/test/tools/llvm-readobj/gnu-note-size.test
new file mode 100644
index 0000000000000000000000000000000000000000..4a160d13a5cf0562cfd35e9f33122293ecf8c047
--- /dev/null
+++ b/test/tools/llvm-readobj/gnu-note-size.test
@@ -0,0 +1,32 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s --check-prefix=GNU
+# RUN: llvm-readobj -elf-output-style LLVM --notes %t | FileCheck %s --check-prefix=LLVM
+
+# GNU:        Owner                 Data size       Description
+# GNU-NEXT:   GNU                   0x00000004      NT_GNU_ABI_TAG (ABI version tag)
+# GNU-NEXT:     <corrupt GNU_ABI_TAG>
+
+# LLVM:      Notes [
+# LLVM-NEXT:   NoteSection {
+# LLVM-NEXT:     Offset:
+# LLVM-NEXT:     Size: 0x14
+# LLVM-NEXT:     Note {
+# LLVM-NEXT:       Owner: GNU
+# LLVM-NEXT:       Data size: 0x4
+# LLVM-NEXT:       Type: NT_GNU_ABI_TAG (ABI version tag)
+# LLVM-NEXT:       ABI: <corrupt GNU_ABI_TAG>
+# LLVM-NEXT:     }
+# LLVM-NEXT:   }
+# LLVM-NEXT: ]
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .note.ABI-tag
+    Type:            SHT_NOTE
+    AddressAlign:    0x0000000000000004
+    Content:         040000000400000001000000474E550000000000
diff --git a/test/tools/llvm-readobj/headers.test b/test/tools/llvm-readobj/headers.test
new file mode 100644
index 0000000000000000000000000000000000000000..cc05ece49fbcf4cd53aa35e03d3e0812aa6e0a02
--- /dev/null
+++ b/test/tools/llvm-readobj/headers.test
@@ -0,0 +1,5 @@
+RUN: llvm-readelf -e %p/Inputs/trivial.obj.elf-i386 > %t.e
+RUN: llvm-readelf --headers %p/Inputs/trivial.obj.elf-i386 > %t.headers
+RUN: llvm-readelf -h -l -S %p/Inputs/trivial.obj.elf-i386 > %t.hlS
+RUN: cmp %t.e %t.headers
+RUN: cmp %t.e %t.hlS
diff --git a/test/tools/llvm-readobj/readelf-s-alias.test b/test/tools/llvm-readobj/readelf-s-alias.test
new file mode 100644
index 0000000000000000000000000000000000000000..e34f81ec6c26424829883e61bc8c3def1ed6f8dd
--- /dev/null
+++ b/test/tools/llvm-readobj/readelf-s-alias.test
@@ -0,0 +1,49 @@
+# In llvm-readobj, -s is an alias for --sections.
+RUN: llvm-readobj -s %p/Inputs/trivial.obj.elf-i386 \
+RUN:   | FileCheck %s -check-prefix SEC
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.elf-i386 \
+RUN:   | FileCheck %s -check-prefix SEC
+
+# In llvm-readelf, -s is an alias for --symbols.
+RUN: llvm-readelf -s %p/Inputs/trivial.obj.elf-i386 \
+RUN:   | FileCheck %s -check-prefix SYM
+RUN: llvm-readelf --symbols %p/Inputs/trivial.obj.elf-i386 \
+RUN:   | FileCheck %s -check-prefix SYM
+
+SEC:      Sections [
+SEC-NEXT:   Section {
+SEC-NEXT:     Index: 0
+SEC-NEXT:     Name:  (0)
+SEC-NEXT:     Type: SHT_NULL (0x0)
+SEC-NEXT:     Flags [ (0x0)
+SEC-NEXT:     ]
+SEC-NEXT:     Address: 0x0
+SEC-NEXT:     Offset: 0x0
+SEC-NEXT:     Size: 0
+SEC-NEXT:     Link: 0
+SEC-NEXT:     Info: 0
+SEC-NEXT:     AddressAlignment: 0
+SEC-NEXT:     EntrySize: 0
+SEC-NEXT:   }
+SEC-NEXT:   Section {
+SEC-NEXT:     Index: 1
+SEC-NEXT:     Name: .text (5)
+SEC-NEXT:     Type: SHT_PROGBITS (0x1)
+SEC-NEXT:     Flags [ (0x6)
+SEC-NEXT:       SHF_ALLOC (0x2)
+SEC-NEXT:       SHF_EXECINSTR (0x4)
+SEC-NEXT:     ]
+SEC-NEXT:     Address: 0x0
+SEC-NEXT:     Offset: 0x40
+SEC-NEXT:     Size: 42
+SEC-NEXT:     Link: 0
+SEC-NEXT:     Info: 0
+SEC-NEXT:     AddressAlignment: 16
+SEC-NEXT:     EntrySize: 0
+SEC-NEXT:   }
+
+SYM:      Symbol table '.symtab' contains {{.*}} entries:
+SYM-NEXT:    Num:    Value  Size Type    Bind   Vis      Ndx Name
+SYM-NEXT:      0: {{.*}} NOTYPE  {{.*}} UND
+SYM-NEXT:      1: {{.*}} FILE    {{.*}} trivial.ll
+SYM-NEXT:      2: {{.*}} OBJECT  {{.*}} .L.str
diff --git a/test/tools/llvm-readobj/sections.test b/test/tools/llvm-readobj/sections.test
index c371f4bb644817ba3a23807adb7a3999f9e20ae2..a66e663d0dea3e45ea89ed5005f28f80aa44f966 100644
--- a/test/tools/llvm-readobj/sections.test
+++ b/test/tools/llvm-readobj/sections.test
@@ -1,22 +1,29 @@
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.coff-i386 \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.coff-i386 \
 RUN:   | FileCheck %s -check-prefix COFF
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.elf-i386 \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.elf-i386 \
 RUN:   | FileCheck %s -check-prefix ELF
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.elf-mipsel \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.elf-mipsel \
 RUN:   | FileCheck %s -check-prefix ELF-MIPSEL
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.macho-i386 \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.macho-i386 \
 RUN:   | FileCheck %s -check-prefix MACHO-I386
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.macho-x86-64 \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.macho-x86-64 \
 RUN:   | FileCheck %s -check-prefix MACHO-X86-64
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.macho-ppc \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.macho-ppc \
 RUN:   | FileCheck %s -check-prefix MACHO-PPC
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.macho-ppc64 \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.macho-ppc64 \
 RUN:   | FileCheck %s -check-prefix MACHO-PPC64
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.macho-arm \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.macho-arm \
 RUN:   | FileCheck %s -check-prefix MACHO-ARM
-RUN: llvm-readobj -s %p/Inputs/trivial.obj.wasm \
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.wasm \
 RUN:   | FileCheck %s -check-prefix WASM
 
+# Check flag aliases produce identical output.
+RUN: llvm-readobj --sections %p/Inputs/trivial.obj.elf-i386 > %t.sections
+RUN: llvm-readobj -S %p/Inputs/trivial.obj.elf-i386 > %t.uppers
+RUN: cmp %t.sections %t.uppers
+RUN: llvm-readobj -s %p/Inputs/trivial.obj.elf-i386 > %t.lowers
+RUN: cmp %t.sections %t.lowers
+
 COFF:      Sections [
 COFF-NEXT:   Section {
 COFF-NEXT:     Number: 1
diff --git a/test/tools/llvm-readobj/symbols.test b/test/tools/llvm-readobj/symbols.test
index 1a0cacdeccdd44cf023c2355835f08e511408c71..e68d654bd65d849bd0d9536333177a643d0c3c66 100644
--- a/test/tools/llvm-readobj/symbols.test
+++ b/test/tools/llvm-readobj/symbols.test
@@ -1,10 +1,19 @@
-RUN: llvm-readobj -t %p/Inputs/trivial.obj.coff-i386 \
+RUN: llvm-readobj --symbols %p/Inputs/trivial.obj.coff-i386 \
 RUN:   | FileCheck %s -check-prefix COFF
-RUN: llvm-readobj -t %p/Inputs/trivial.obj.elf-i386 \
+RUN: llvm-readobj --symbols %p/Inputs/trivial.obj.elf-i386 \
 RUN:   | FileCheck %s -check-prefix ELF
-RUN: llvm-readobj -t %p/Inputs/trivial.obj.wasm \
+RUN: llvm-readobj --symbols %p/Inputs/trivial.obj.wasm \
 RUN:   | FileCheck %s -check-prefix WASM
 
+# Check flag aliases produce identical output.
+RUN: llvm-readobj --symbols %p/Inputs/trivial.obj.elf-i386 > %t.symbols
+RUN: llvm-readobj --syms %p/Inputs/trivial.obj.elf-i386 > %t.syms
+RUN: cmp %t.symbols %t.syms
+RUN: llvm-readobj -t %p/Inputs/trivial.obj.elf-i386 > %t.t
+RUN: cmp %t.symbols %t.t
+RUN: llvm-readelf -s -elf-output-style LLVM %p/Inputs/trivial.obj.elf-i386 > %t.lowers
+RUN: cmp %t.symbols %t.lowers
+
 COFF:      Symbols [
 COFF-NEXT:   Symbol {
 COFF-NEXT:     Name: @comp.id
diff --git a/test/tools/llvm-size/X86/elf-sizes.test b/test/tools/llvm-size/X86/elf-sizes.test
new file mode 100644
index 0000000000000000000000000000000000000000..cb7fcd3a8851080aaf16d9d1780020fa7b753f1e
--- /dev/null
+++ b/test/tools/llvm-size/X86/elf-sizes.test
@@ -0,0 +1,55 @@
+# RUN: yaml2obj %s > %t.o
+# RUN: llvm-size -B %t.o | FileCheck %s
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC, SHF_WRITE ]
+    Size:            1
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Size:            2
+  - Name:            .unusual_name_for_code
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Size:            64
+  - Name:            .eh_frame
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Size:            4
+  - Name:            .data
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_WRITE ]
+    Size:            8
+  - Name:            .moar_stuff
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_WRITE ]
+    Size:            128
+  - Name:            .text.but_not_really
+    Type:            SHT_PROGBITS
+    Flags:           [ ]
+    Size:            256
+  - Name:            .debug_info
+    Type:            SHT_PROGBITS
+    Flags:           [ ]
+    Size:            16
+  - Name:            .init_array
+    Type:            SHT_INIT_ARRAY
+    Flags:           [ SHF_ALLOC, SHF_WRITE ]
+    Size:            32
+
+# text is .text, .eh_frame, .unusual_name_for_code: 2 + 4 + 64 = 70
+# data is .data, .init_array, .moar_stuff: 8 + 32 + 128 = 168
+# bss is .bss: 1
+# total: 239
+# unaccounted for (not affecting total) is .debug_info, .text.but_not_really
+
+# CHECK: text data bss dec
+# CHECK: 70   168  1   239
diff --git a/test/tools/llvm-size/X86/ignore-sections.s b/test/tools/llvm-size/X86/ignore-sections.s
index c597f848b0b5311c2920460fd8fd5c227131910d..61aa2e780e513649926a28fc21d8fb92291fd933 100644
--- a/test/tools/llvm-size/X86/ignore-sections.s
+++ b/test/tools/llvm-size/X86/ignore-sections.s
@@ -25,5 +25,5 @@
 // SYSV-NEXT:    Total                 69
 
 // BSD:        text    data     bss     dec     hex filename
-// BSD-NEXT:      4       4       4      12       c {{[ -\(\)_A-Za-z0-9.\\/:]+}}
-// BSD-NEXT:      4       4       4      12       c (TOTALS)
+// BSD-NEXT:     52       4       4      60      3c {{[ -\(\)_A-Za-z0-9.\\/:]+}}
+// BSD-NEXT:     52       4       4      60      3c (TOTALS)
diff --git a/test/tools/llvm-strings/file-filename.test b/test/tools/llvm-strings/file-filename.test
index de0ce9744e6a5cb3c3eff24de8dc67c0febb30e8..26f3b4d334ff7c9867c9fb51e350ee8b5f9d05a0 100644
--- a/test/tools/llvm-strings/file-filename.test
+++ b/test/tools/llvm-strings/file-filename.test
@@ -1,3 +1,3 @@
 RUN: llvm-strings -f %S/Inputs/abcd | FileCheck %s
-RUN: llvm-strings --print-file-name %S/Inputs/abcd | FileCheck %s
+RUN: llvm-strings --print-file-name %S/Inputs/abcd | FileCheck %s --strict-whitespace
 CHECK: {{[\\/]}}abcd: abcd
diff --git a/test/tools/llvm-strings/negative-char.test b/test/tools/llvm-strings/negative-char.test
index 331dde47078927295994e08f0cb755375a12c4fb..e91dcda0cca77598e6d49fa53f9f26a6e7116e67 100644
--- a/test/tools/llvm-strings/negative-char.test
+++ b/test/tools/llvm-strings/negative-char.test
@@ -1,3 +1,3 @@
 # RUN: echo -e "z\0\x80\0a\0" | llvm-strings --bytes 1 - | FileCheck %s
 # CHECK: z{{$}}
-# CHECK-NEXT: {{^}} a
+# CHECK-NEXT: {{^}}a
diff --git a/test/tools/llvm-strings/radix-filename.test b/test/tools/llvm-strings/radix-filename.test
new file mode 100644
index 0000000000000000000000000000000000000000..60d78da2a13eda43855ea3237c1cd7d8a36538b2
--- /dev/null
+++ b/test/tools/llvm-strings/radix-filename.test
@@ -0,0 +1,36 @@
+RUN: llvm-strings --print-file-name %S/Inputs/numbers \
+RUN:     | FileCheck %s --check-prefix CHECK-NONE
+RUN: llvm-strings --print-file-name -t d %S/Inputs/numbers \
+RUN:    | FileCheck %s --check-prefix CHECK-DEC --strict-whitespace
+RUN: llvm-strings --print-file-name -t o %S/Inputs/numbers \
+RUN:     | FileCheck %s --check-prefix CHECK-OCT --strict-whitespace
+RUN: llvm-strings --print-file-name -t x %S/Inputs/numbers \
+RUN:     | FileCheck %s --check-prefix CHECK-HEX --strict-whitespace
+
+CHECK-NONE: numbers: three
+CHECK-NONE: numbers: four
+CHECK-NONE: numbers: five
+CHECK-NONE: numbers: seven
+CHECK-NONE: numbers: eight
+CHECK-NONE: numbers: nine
+
+CHECK-DEC: numbers:       8 three
+CHECK-DEC: numbers:      14 four
+CHECK-DEC: numbers:      19 five
+CHECK-DEC: numbers:      28 seven
+CHECK-DEC: numbers:      34 eight
+CHECK-DEC: numbers:      40 nine
+
+CHECK-OCT: numbers:      10 three
+CHECK-OCT: numbers:      16 four
+CHECK-OCT: numbers:      23 five
+CHECK-OCT: numbers:      34 seven
+CHECK-OCT: numbers:      42 eight
+CHECK-OCT: numbers:      50 nine
+
+CHECK-HEX: numbers:       8 three
+CHECK-HEX: numbers:       e four
+CHECK-HEX: numbers:      13 five
+CHECK-HEX: numbers:      1c seven
+CHECK-HEX: numbers:      22 eight
+CHECK-HEX: numbers:      28 nine
diff --git a/test/tools/llvm-strings/radix.test b/test/tools/llvm-strings/radix.test
index c81d9fe66c33cf9b160472c0b1f7b39954f1e6df..403b85808059fb1fd6f8edc534c0fc98aecc4b19 100644
--- a/test/tools/llvm-strings/radix.test
+++ b/test/tools/llvm-strings/radix.test
@@ -1,33 +1,32 @@
 RUN: llvm-strings %S/Inputs/numbers | FileCheck %s -check-prefix CHECK-NONE
-RUN: llvm-strings -t d %S/Inputs/numbers | FileCheck %s -check-prefix CHECK-DEC
-RUN: llvm-strings -t o %S/Inputs/numbers | FileCheck %s -check-prefix CHECK-OCT
-RUN: llvm-strings -t x %S/Inputs/numbers | FileCheck %s -check-prefix CHECK-HEX
+RUN: llvm-strings -t d %S/Inputs/numbers | FileCheck %s -check-prefix CHECK-DEC --strict-whitespace
+RUN: llvm-strings -t o %S/Inputs/numbers | FileCheck %s -check-prefix CHECK-OCT --strict-whitespace
+RUN: llvm-strings -t x %S/Inputs/numbers | FileCheck %s -check-prefix CHECK-HEX --strict-whitespace
 
-CHECK-NONE: three
-CHECK-NONE: four
-CHECK-NONE: five
-CHECK-NONE: seven
-CHECK-NONE: eight
-CHECK-NONE: nine
+CHECK-NONE: {{^}}three
+CHECK-NONE: {{^}}four
+CHECK-NONE: {{^}}five
+CHECK-NONE: {{^}}seven
+CHECK-NONE: {{^}}eight
+CHECK-NONE: {{^}}nine
 
-CHECK-DEC:      8 three
-CHECK-DEC:     14 four
-CHECK-DEC:     19 five
-CHECK-DEC:     28 seven
-CHECK-DEC:     34 eight
-CHECK-DEC:     40 nine
+CHECK-DEC: {{^}}      8 three
+CHECK-DEC: {{^}}     14 four
+CHECK-DEC: {{^}}     19 five
+CHECK-DEC: {{^}}     28 seven
+CHECK-DEC: {{^}}     34 eight
+CHECK-DEC: {{^}}     40 nine
 
-CHECK-OCT:     10 three
-CHECK-OCT:     16 four
-CHECK-OCT:     23 five
-CHECK-OCT:     34 seven
-CHECK-OCT:     42 eight
-CHECK-OCT:     50 nine
-
-CHECK-HEX:      8 three
-CHECK-HEX:      e four
-CHECK-HEX:     13 five
-CHECK-HEX:     1c seven
-CHECK-HEX:     22 eight
-CHECK-HEX:     28 nine
+CHECK-OCT: {{^}}     10 three
+CHECK-OCT: {{^}}     16 four
+CHECK-OCT: {{^}}     23 five
+CHECK-OCT: {{^}}     34 seven
+CHECK-OCT: {{^}}     42 eight
+CHECK-OCT: {{^}}     50 nine
 
+CHECK-HEX: {{^}}      8 three
+CHECK-HEX: {{^}}      e four
+CHECK-HEX: {{^}}     13 five
+CHECK-HEX: {{^}}     1c seven
+CHECK-HEX: {{^}}     22 eight
+CHECK-HEX: {{^}}     28 nine
diff --git a/test/tools/llvm-strings/whitespace.test b/test/tools/llvm-strings/whitespace.test
new file mode 100644
index 0000000000000000000000000000000000000000..f34a671e9977ef7d5ddb00b8ae4ba550b4e138c7
--- /dev/null
+++ b/test/tools/llvm-strings/whitespace.test
@@ -0,0 +1,2 @@
+RUN: echo -n abcd | llvm-strings - | FileCheck %s --strict-whitespace
+CHECK: {{^}}abcd{{$}}
diff --git a/test/tools/llvm-symbolizer/split-debug.test b/test/tools/llvm-symbolizer/split-debug.test
index 2d5b0732c6dbd0721de49cb0973db7a10be176cd..7474cd6e6f248cd7731e91df7f9b027127417406 100644
--- a/test/tools/llvm-symbolizer/split-debug.test
+++ b/test/tools/llvm-symbolizer/split-debug.test
@@ -18,7 +18,7 @@
 #Build as : clang -g -O2 addr.c
 
 RUN: mkdir -p %t/.debug
-RUN: llvm-objcopy --keep=.debug_info %p/Inputs/addr.exe %t/.debug/addr
+RUN: llvm-objcopy --keep-section=.debug_info %p/Inputs/addr.exe %t/.debug/addr
 RUN: llvm-objcopy --strip-debug --add-gnu-debuglink=%t/.debug/addr %p/Inputs/addr.exe %t/addr.exe
 RUN: llvm-symbolizer -print-address -obj=%t/addr.exe < %p/Inputs/addr.inp | FileCheck %s
 
diff --git a/test/tools/llvm-xray/X86/Inputs/elf64-pie.bin b/test/tools/llvm-xray/X86/Inputs/elf64-pie.bin
new file mode 100755
index 0000000000000000000000000000000000000000..ab2745eeba894089af7965cc534cf214b4ea50df
Binary files /dev/null and b/test/tools/llvm-xray/X86/Inputs/elf64-pie.bin differ
diff --git a/test/tools/llvm-xray/X86/extract-instrmap-pie.ll b/test/tools/llvm-xray/X86/extract-instrmap-pie.ll
new file mode 100644
index 0000000000000000000000000000000000000000..987aaf9801052628e8b67e6189a37aa626ddf480
--- /dev/null
+++ b/test/tools/llvm-xray/X86/extract-instrmap-pie.ll
@@ -0,0 +1,11 @@
+; This test makes sure we can extract the instrumentation map from an
+; XRay-instrumented PIE file.
+;
+; RUN: llvm-xray extract %S/Inputs/elf64-pie.bin -s | FileCheck %s
+
+; CHECK:      ---
+; CHECK-NEXT: - { id: 1, address: 0x00000000000299C0, function: 0x00000000000299C0, kind: function-enter, always-instrument: true, function-name: {{.*foo.*}} }
+; CHECK-NEXT: - { id: 1, address: 0x00000000000299D0, function: 0x00000000000299C0, kind: function-exit, always-instrument: true, function-name: {{.*foo.*}} }
+; CHECK-NEXT: - { id: 2, address: 0x00000000000299E0, function: 0x00000000000299E0, kind: function-enter, always-instrument: true, function-name: {{.*bar.*}} }
+; CHECK-NEXT: - { id: 2, address: 0x00000000000299F6, function: 0x00000000000299E0, kind: function-exit, always-instrument: true, function-name: {{.*bar.*}} }
+; CHECK-NEXT: ...
diff --git a/test/tools/sanstats/elf.test b/test/tools/sanstats/elf.test
index ab90b50020f150d5645bc6309c065fe84b8a4237..54878a1d526b8f40c436c23a96ede79bedad05e0 100644
--- a/test/tools/sanstats/elf.test
+++ b/test/tools/sanstats/elf.test
@@ -25,19 +25,29 @@
 
 # RUN: sanstats %t.stats | FileCheck %s
 
-# CHECK: /tmp{{[/\\]}}f.c:1 f1 cfi-vcall 1
-# CHECK: /tmp{{[/\\]}}f.c:2 f2 cfi-nvcall 2
-# CHECK: /tmp{{[/\\]}}f.c:3 f3 cfi-derived-cast 3
-# CHECK: /tmp{{[/\\]}}f.c:1 f1 cfi-unrelated-cast 4
-# CHECK: /tmp{{[/\\]}}f.c:2 f2 cfi-icall 5
-# CHECK: /tmp{{[/\\]}}f.c:3 f3 <unknown> 6
+# Test that if binaries are not in the original location then sanstats
+# searches them next to the stats file.
+# RUN: mkdir -p %t.dir
+# RUN: mv -f %t1.o %t2.o %t.dir
+# RUN: sanstats %t.stats | FileCheck %s --check-prefix=INVALID
+# RUN: mv -f %t.stats %t.dir/copy.stats
+# RUN: cd %t.dir && sanstats copy.stats | FileCheck %s
 
-# CHECK: /tmp{{[/\\]}}f.c:3 f3 cfi-vcall 7
-# CHECK: /tmp{{[/\\]}}f.c:2 f2 cfi-nvcall 8
-# CHECK: /tmp{{[/\\]}}f.c:1 f1 cfi-derived-cast 9
-# CHECK: /tmp{{[/\\]}}f.c:3 f3 cfi-unrelated-cast 11
-# CHECK: /tmp{{[/\\]}}f.c:2 f2 cfi-icall 12
-# CHECK: /tmp{{[/\\]}}f.c:1 f1 <unknown> 14
+# INVALID: <invalid>
+# CHECK-NOT: <invalid>
+# CHECK: 0x0000000000000000 /tmp{{[/\\]}}f.c:1 f1 cfi-vcall 1
+# CHECK: 0x0000000000000010 /tmp{{[/\\]}}f.c:2 f2 cfi-nvcall 2
+# CHECK: 0x0000000000000020 /tmp{{[/\\]}}f.c:3 f3 cfi-derived-cast 3
+# CHECK: 0x0000000000000000 /tmp{{[/\\]}}f.c:1 f1 cfi-unrelated-cast 4
+# CHECK: 0x0000000000000010 /tmp{{[/\\]}}f.c:2 f2 cfi-icall 5
+# CHECK: 0x0000000000000020 /tmp{{[/\\]}}f.c:3 f3 <unknown> 6
+
+# CHECK: 0x0000000000000020 /tmp{{[/\\]}}f.c:3 f3 cfi-vcall 7
+# CHECK: 0x0000000000000010 /tmp{{[/\\]}}f.c:2 f2 cfi-nvcall 8
+# CHECK: 0x0000000000000000 /tmp{{[/\\]}}f.c:1 f1 cfi-derived-cast 9
+# CHECK: 0x0000000000000020 /tmp{{[/\\]}}f.c:3 f3 cfi-unrelated-cast 11
+# CHECK: 0x0000000000000010 /tmp{{[/\\]}}f.c:2 f2 cfi-icall 12
+# CHECK: 0x0000000000000000 /tmp{{[/\\]}}f.c:1 f1 <unknown> 14
 
 --- !ELF
 FileHeader:      
diff --git a/test/tools/yaml2obj/coff-arm64.yaml b/test/tools/yaml2obj/coff-arm64.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d734e300c040d4882a28385e7b4b722df8f176e2
--- /dev/null
+++ b/test/tools/yaml2obj/coff-arm64.yaml
@@ -0,0 +1,94 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-readobj -file-headers %t | FileCheck %s
+# RUN: obj2yaml %t | FileCheck %s --check-prefix=ROUNDTRIP
+
+# CHECK: OptionalHeaderSize: 240
+
+# ROUNDTRIP: VirtualAddress:  4096
+# ROUNDTRIP: VirtualAddress:  8192
+# ROUNDTRIP: VirtualAddress:  12288
+
+--- !COFF
+OptionalHeader:  
+  AddressOfEntryPoint: 4096
+  ImageBase:       1073741824
+  SectionAlignment: 4096
+  FileAlignment:   512
+  MajorOperatingSystemVersion: 6
+  MinorOperatingSystemVersion: 0
+  MajorImageVersion: 0
+  MinorImageVersion: 0
+  MajorSubsystemVersion: 6
+  MinorSubsystemVersion: 0
+  Subsystem:       IMAGE_SUBSYSTEM_WINDOWS_CUI
+  DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA, IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ]
+  SizeOfStackReserve: 1048576
+  SizeOfStackCommit: 4096
+  SizeOfHeapReserve: 1048576
+  SizeOfHeapCommit: 4096
+  ExportTable:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  ImportTable:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  ResourceTable:   
+    RelativeVirtualAddress: 0
+    Size:            0
+  ExceptionTable:  
+    RelativeVirtualAddress: 12288
+    Size:            8
+  CertificateTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  BaseRelocationTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  Debug:           
+    RelativeVirtualAddress: 0
+    Size:            0
+  Architecture:    
+    RelativeVirtualAddress: 0
+    Size:            0
+  GlobalPtr:       
+    RelativeVirtualAddress: 0
+    Size:            0
+  TlsTable:        
+    RelativeVirtualAddress: 0
+    Size:            0
+  LoadConfigTable: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  BoundImport:     
+    RelativeVirtualAddress: 0
+    Size:            0
+  IAT:             
+    RelativeVirtualAddress: 0
+    Size:            0
+  DelayImportDescriptor: 
+    RelativeVirtualAddress: 0
+    Size:            0
+  ClrRuntimeHeader: 
+    RelativeVirtualAddress: 0
+    Size:            0
+header:          
+  Machine:         IMAGE_FILE_MACHINE_ARM64
+  Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LARGE_ADDRESS_AWARE ]
+sections:        
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  4096
+    VirtualSize:     4
+    SectionData:     C0035FD6
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  8192
+    VirtualSize:     12
+    SectionData:     0100400800000000E4E3E3E3
+  - Name:            .pdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  12288
+    VirtualSize:     8
+    SectionData:     '0010000000200000'
+symbols:         []
+...
diff --git a/tools/bugpoint-passes/TestPasses.cpp b/tools/bugpoint-passes/TestPasses.cpp
index 22ded6261a1a313534020c8a5a8e734d17bed9b3..6b1463685b2ca8d3fea995998906366e63e90600 100644
--- a/tools/bugpoint-passes/TestPasses.cpp
+++ b/tools/bugpoint-passes/TestPasses.cpp
@@ -123,3 +123,28 @@ char CrashOnTooManyCUs::ID = 0;
 static RegisterPass<CrashOnTooManyCUs>
     A("bugpoint-crash-too-many-cus",
       "BugPoint Test Pass - Intentionally crash on too many CUs");
+
+namespace {
+class CrashOnFunctionAttribute : public FunctionPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  CrashOnFunctionAttribute() : FunctionPass(ID) {}
+
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+
+  bool runOnFunction(Function &F) override {
+    AttributeSet A = F.getAttributes().getFnAttributes();
+    if (A.hasAttribute("bugpoint-crash"))
+      abort();
+    return false;
+  }
+};
+} // namespace
+
+char CrashOnFunctionAttribute::ID = 0;
+static RegisterPass<CrashOnFunctionAttribute>
+    B("bugpoint-crashfuncattr", "BugPoint Test Pass - Intentionally crash on "
+                                "function attribute 'bugpoint-crash'");
diff --git a/tools/bugpoint/CrashDebugger.cpp b/tools/bugpoint/CrashDebugger.cpp
index a50ff4c255bdabe4fd9e9b0f281d3d85a0e8b7aa..ef6a214fde2054a5a84138c39beb4f9dad92d1b5 100644
--- a/tools/bugpoint/CrashDebugger.cpp
+++ b/tools/bugpoint/CrashDebugger.cpp
@@ -314,6 +314,66 @@ bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
   return false;
 }
 
+namespace {
+/// ReduceCrashingFunctionAttributes reducer - This works by removing
+/// attributes on a particular function and seeing if the program still crashes.
+/// If it does, then keep the newer, smaller program.
+///
+class ReduceCrashingFunctionAttributes : public ListReducer<Attribute> {
+  BugDriver &BD;
+  std::string FnName;
+  BugTester TestFn;
+
+public:
+  ReduceCrashingFunctionAttributes(BugDriver &bd, const std::string &FnName,
+                                   BugTester testFn)
+      : BD(bd), FnName(FnName), TestFn(testFn) {}
+
+  Expected<TestResult> doTest(std::vector<Attribute> &Prefix,
+                              std::vector<Attribute> &Kept) override {
+    if (!Kept.empty() && TestFuncAttrs(Kept))
+      return KeepSuffix;
+    if (!Prefix.empty() && TestFuncAttrs(Prefix))
+      return KeepPrefix;
+    return NoFailure;
+  }
+
+  bool TestFuncAttrs(std::vector<Attribute> &Attrs);
+};
+}
+
+bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
+    std::vector<Attribute> &Attrs) {
+  // Clone the program to try hacking it apart...
+  std::unique_ptr<Module> M = CloneModule(BD.getProgram());
+  Function *F = M->getFunction(FnName);
+
+  // Build up an AttributeList from the attributes we've been given by the
+  // reducer.
+  AttrBuilder AB;
+  for (auto A : Attrs)
+    AB.addAttribute(A);
+  AttributeList NewAttrs;
+  NewAttrs =
+      NewAttrs.addAttributes(BD.getContext(), AttributeList::FunctionIndex, AB);
+
+  // Set this new list of attributes on the function.
+  F->setAttributes(NewAttrs);
+
+  // Try running on the hacked up program...
+  if (TestFn(BD, M.get())) {
+    BD.setNewProgram(std::move(M)); // It crashed, keep the trimmed version...
+
+    // Pass along the set of attributes that caused the crash.
+    Attrs.clear();
+    for (Attribute A : NewAttrs.getFnAttributes()) {
+      Attrs.push_back(A);
+    }
+    return true;
+  }
+  return false;
+}
+
 namespace {
 /// Simplify the CFG without completely destroying it.
 /// This is not well defined, but basically comes down to "try to eliminate
@@ -1056,6 +1116,38 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
       BD.EmitProgressBitcode(BD.getProgram(), "reduced-function");
   }
 
+  // For each remaining function, try to reduce that function's attributes.
+  std::vector<std::string> FunctionNames;
+  for (Function &F : BD.getProgram())
+    FunctionNames.push_back(F.getName());
+
+  if (!FunctionNames.empty() && !BugpointIsInterrupted) {
+    outs() << "\n*** Attempting to reduce the number of function attributes in "
+              "the testcase\n";
+
+    unsigned OldSize = 0;
+    unsigned NewSize = 0;
+    for (std::string &Name : FunctionNames) {
+      Function *Fn = BD.getProgram().getFunction(Name);
+      assert(Fn && "Could not find funcion?");
+
+      std::vector<Attribute> Attrs;
+      for (Attribute A : Fn->getAttributes().getFnAttributes())
+        Attrs.push_back(A);
+
+      OldSize += Attrs.size();
+      Expected<bool> Result =
+          ReduceCrashingFunctionAttributes(BD, Name, TestFn).reduceList(Attrs);
+      if (Error E = Result.takeError())
+        return E;
+
+      NewSize += Attrs.size();
+    }
+
+    if (OldSize < NewSize)
+      BD.EmitProgressBitcode(BD.getProgram(), "reduced-function-attributes");
+  }
+
   // Attempt to change conditional branches into unconditional branches to
   // eliminate blocks.
   if (!DisableSimplifyCFG && !BugpointIsInterrupted) {
diff --git a/tools/bugpoint/ExecutionDriver.cpp b/tools/bugpoint/ExecutionDriver.cpp
index 690a48de35aeb1b5da673571d31813cfc3f7c17e..1b86b103d83546d08e361cdb0223f6737dbe4449 100644
--- a/tools/bugpoint/ExecutionDriver.cpp
+++ b/tools/bugpoint/ExecutionDriver.cpp
@@ -148,8 +148,9 @@ Error BugDriver::initializeExecutionEnvironment() {
   std::string Message;
 
   if (CCBinary.empty()) {
-    if (sys::findProgramByName("clang"))
-      CCBinary = "clang";
+    if (ErrorOr<std::string> ClangPath =
+            FindProgramByName("clang", getToolName(), &AbsTolerance))
+      CCBinary = *ClangPath;
     else
       CCBinary = "gcc";
   }
@@ -193,11 +194,11 @@ Error BugDriver::initializeExecutionEnvironment() {
     break;
   case CompileCustom:
     Interpreter = AbstractInterpreter::createCustomCompiler(
-        Message, CustomCompileCommand);
+        getToolName(), Message, CustomCompileCommand);
     break;
   case Custom:
-    Interpreter =
-        AbstractInterpreter::createCustomExecutor(Message, CustomExecCommand);
+    Interpreter = AbstractInterpreter::createCustomExecutor(
+        getToolName(), Message, CustomExecCommand);
     break;
   }
   if (!Interpreter)
@@ -239,8 +240,8 @@ Error BugDriver::initializeExecutionEnvironment() {
         SafeInterpreterSel == RunLLCIA);
     break;
   case Custom:
-    SafeInterpreter =
-        AbstractInterpreter::createCustomExecutor(Message, CustomExecCommand);
+    SafeInterpreter = AbstractInterpreter::createCustomExecutor(
+        getToolName(), Message, CustomExecCommand);
     break;
   default:
     Message = "Sorry, this back-end is not supported by bugpoint as the "
@@ -252,7 +253,7 @@ Error BugDriver::initializeExecutionEnvironment() {
     exit(1);
   }
 
-  cc = CC::create(Message, CCBinary, &CCToolArgv);
+  cc = CC::create(getToolName(), Message, CCBinary, &CCToolArgv);
   if (!cc) {
     outs() << Message << "\nExiting.\n";
     exit(1);
diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp
index cbb048db8fe7fa42dd1edf4246bd48021364f0ea..64fe675de20ccaf7bb1cd9e44c237f111dfd3b60 100644
--- a/tools/bugpoint/OptimizerDriver.cpp
+++ b/tools/bugpoint/OptimizerDriver.cpp
@@ -16,6 +16,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "BugDriver.h"
+#include "ToolRunner.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Module.h"
@@ -166,7 +167,8 @@ bool BugDriver::runPasses(Module &Program,
 
   std::string tool = OptCmd;
   if (OptCmd.empty()) {
-    if (ErrorOr<std::string> Path = sys::findProgramByName("opt"))
+    if (ErrorOr<std::string> Path =
+            FindProgramByName("opt", getToolName(), &OutputPrefix))
       tool = *Path;
     else
       errs() << Path.getError().message() << "\n";
diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp
index 812e8e3bbae57a31063e4b3292224c95bc8e91e8..7ba8ea1f16c572bccf9b99c1d419f9ccd007743c 100644
--- a/tools/bugpoint/ToolRunner.cpp
+++ b/tools/bugpoint/ToolRunner.cpp
@@ -202,19 +202,7 @@ Expected<int> LLI::ExecuteProgram(const std::string &Bitcode,
 
 void AbstractInterpreter::anchor() {}
 
-#if defined(LLVM_ON_UNIX)
-const char EXESuffix[] = "";
-#elif defined(_WIN32)
-const char EXESuffix[] = "exe";
-#endif
-
-/// Prepend the path to the program being executed
-/// to \p ExeName, given the value of argv[0] and the address of main()
-/// itself. This allows us to find another LLVM tool if it is built in the same
-/// directory. An empty string is returned on error; note that this function
-/// just mainpulates the path and doesn't check for executability.
-/// Find a named executable.
-static std::string PrependMainExecutablePath(const std::string &ExeName,
+ErrorOr<std::string> llvm::FindProgramByName(const std::string &ExeName,
                                              const char *Argv0,
                                              void *MainAddr) {
   // Check the directory that the calling program is in.  We can do
@@ -222,30 +210,25 @@ static std::string PrependMainExecutablePath(const std::string &ExeName,
   // is a relative path to the executable itself.
   std::string Main = sys::fs::getMainExecutable(Argv0, MainAddr);
   StringRef Result = sys::path::parent_path(Main);
+  if (ErrorOr<std::string> Path = sys::findProgramByName(ExeName, Result))
+    return *Path;
 
-  if (!Result.empty()) {
-    SmallString<128> Storage = Result;
-    sys::path::append(Storage, ExeName);
-    sys::path::replace_extension(Storage, EXESuffix);
-    return Storage.str();
-  }
-
-  return Result.str();
+  // Check the user PATH.
+  return sys::findProgramByName(ExeName);
 }
 
 // LLI create method - Try to find the LLI executable
 AbstractInterpreter *
 AbstractInterpreter::createLLI(const char *Argv0, std::string &Message,
                                const std::vector<std::string> *ToolArgs) {
-  std::string LLIPath =
-      PrependMainExecutablePath("lli", Argv0, (void *)(intptr_t)&createLLI);
-  if (!LLIPath.empty()) {
-    Message = "Found lli: " + LLIPath + "\n";
-    return new LLI(LLIPath, ToolArgs);
+  if (ErrorOr<std::string> LLIPath =
+      FindProgramByName("lli", Argv0, (void *)(intptr_t)&createLLI)) {
+    Message = "Found lli: " + *LLIPath + "\n";
+    return new LLI(*LLIPath, ToolArgs);
+  } else {
+    Message = LLIPath.getError().message() + "\n";
+    return nullptr;
   }
-
-  Message = "Cannot find `lli' in executable directory!\n";
-  return nullptr;
 }
 
 //===---------------------------------------------------------------------===//
@@ -368,8 +351,9 @@ Expected<int> CustomExecutor::ExecuteProgram(
 // '\ ' -> ' '
 // 'exa\mple' -> 'example'
 //
-static void lexCommand(std::string &Message, const std::string &CommandLine,
-                       std::string &CmdPath, std::vector<std::string> &Args) {
+static void lexCommand(const char *Argv0, std::string &Message,
+                       const std::string &CommandLine, std::string &CmdPath,
+                       std::vector<std::string> &Args) {
 
   std::string Token;
   std::string Command;
@@ -402,7 +386,7 @@ static void lexCommand(std::string &Message, const std::string &CommandLine,
     Token.push_back(CommandLine[Pos]);
   }
 
-  auto Path = sys::findProgramByName(Command);
+  auto Path = FindProgramByName(Command, Argv0, (void *)(intptr_t)&lexCommand);
   if (!Path) {
     Message = std::string("Cannot find '") + Command +
               "' in PATH: " + Path.getError().message() + "\n";
@@ -416,11 +400,12 @@ static void lexCommand(std::string &Message, const std::string &CommandLine,
 // Custom execution environment create method, takes the execution command
 // as arguments
 AbstractInterpreter *AbstractInterpreter::createCustomCompiler(
-    std::string &Message, const std::string &CompileCommandLine) {
+    const char *Argv0, std::string &Message,
+    const std::string &CompileCommandLine) {
 
   std::string CmdPath;
   std::vector<std::string> Args;
-  lexCommand(Message, CompileCommandLine, CmdPath, Args);
+  lexCommand(Argv0, Message, CompileCommandLine, CmdPath, Args);
   if (CmdPath.empty())
     return nullptr;
 
@@ -430,12 +415,13 @@ AbstractInterpreter *AbstractInterpreter::createCustomCompiler(
 // Custom execution environment create method, takes the execution command
 // as arguments
 AbstractInterpreter *
-AbstractInterpreter::createCustomExecutor(std::string &Message,
+AbstractInterpreter::createCustomExecutor(const char *Argv0,
+                                          std::string &Message,
                                           const std::string &ExecCommandLine) {
 
   std::string CmdPath;
   std::vector<std::string> Args;
-  lexCommand(Message, ExecCommandLine, CmdPath, Args);
+  lexCommand(Argv0, Message, ExecCommandLine, CmdPath, Args);
   if (CmdPath.empty())
     return nullptr;
 
@@ -524,20 +510,20 @@ LLC *AbstractInterpreter::createLLC(const char *Argv0, std::string &Message,
                                     const std::vector<std::string> *Args,
                                     const std::vector<std::string> *CCArgs,
                                     bool UseIntegratedAssembler) {
-  std::string LLCPath =
-      PrependMainExecutablePath("llc", Argv0, (void *)(intptr_t)&createLLC);
-  if (LLCPath.empty()) {
-    Message = "Cannot find `llc' in executable directory!\n";
+  ErrorOr<std::string> LLCPath =
+      FindProgramByName("llc", Argv0, (void *)(intptr_t)&createLLC);
+  if (!LLCPath) {
+    Message = LLCPath.getError().message() + "\n";
     return nullptr;
   }
 
-  CC *cc = CC::create(Message, CCBinary, CCArgs);
+  CC *cc = CC::create(Argv0, Message, CCBinary, CCArgs);
   if (!cc) {
     errs() << Message << "\n";
     exit(1);
   }
-  Message = "Found llc: " + LLCPath + "\n";
-  return new LLC(LLCPath, cc, Args, UseIntegratedAssembler);
+  Message = "Found llc: " + *LLCPath + "\n";
+  return new LLC(*LLCPath, cc, Args, UseIntegratedAssembler);
 }
 
 //===---------------------------------------------------------------------===//
@@ -606,15 +592,14 @@ Expected<int> JIT::ExecuteProgram(const std::string &Bitcode,
 AbstractInterpreter *
 AbstractInterpreter::createJIT(const char *Argv0, std::string &Message,
                                const std::vector<std::string> *Args) {
-  std::string LLIPath =
-      PrependMainExecutablePath("lli", Argv0, (void *)(intptr_t)&createJIT);
-  if (!LLIPath.empty()) {
-    Message = "Found lli: " + LLIPath + "\n";
-    return new JIT(LLIPath, Args);
+  if (ErrorOr<std::string> LLIPath =
+          FindProgramByName("lli", Argv0, (void *)(intptr_t)&createJIT)) {
+    Message = "Found lli: " + *LLIPath + "\n";
+    return new JIT(*LLIPath, Args);
+  } else {
+    Message = LLIPath.getError().message() + "\n";
+    return nullptr;
   }
-
-  Message = "Cannot find `lli' in executable directory!\n";
-  return nullptr;
 }
 
 //===---------------------------------------------------------------------===//
@@ -855,9 +840,10 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType,
 
 /// create - Try to find the CC executable
 ///
-CC *CC::create(std::string &Message, const std::string &CCBinary,
+CC *CC::create(const char *Argv0, std::string &Message,
+               const std::string &CCBinary,
                const std::vector<std::string> *Args) {
-  auto CCPath = sys::findProgramByName(CCBinary);
+  auto CCPath = FindProgramByName(CCBinary, Argv0, (void *)(intptr_t)&create);
   if (!CCPath) {
     Message = "Cannot find `" + CCBinary + "' in PATH: " +
               CCPath.getError().message() + "\n";
diff --git a/tools/bugpoint/ToolRunner.h b/tools/bugpoint/ToolRunner.h
index f218ad534ee9b5f894ddfbabd12c20b34dbeb89c..ef8551cc669b44942dbdb5c42a3a78d70fa32818 100644
--- a/tools/bugpoint/ToolRunner.h
+++ b/tools/bugpoint/ToolRunner.h
@@ -49,7 +49,8 @@ class CC {
 public:
   enum FileType { AsmFile, ObjectFile, CFile };
 
-  static CC *create(std::string &Message, const std::string &CCBinary,
+  static CC *create(const char *Argv0, std::string &Message,
+                    const std::string &CCBinary,
                     const std::vector<std::string> *Args);
 
   /// ExecuteProgram - Execute the program specified by "ProgramFile" (which is
@@ -98,11 +99,11 @@ public:
             const std::vector<std::string> *Args = nullptr);
 
   static AbstractInterpreter *
-  createCustomCompiler(std::string &Message,
+  createCustomCompiler(const char *Argv0, std::string &Message,
                        const std::string &CompileCommandLine);
 
   static AbstractInterpreter *
-  createCustomExecutor(std::string &Message,
+  createCustomExecutor(const char *Argv0, std::string &Message,
                        const std::string &ExecCommandLine);
 
   virtual ~AbstractInterpreter() {}
@@ -178,6 +179,13 @@ public:
                                     unsigned MemoryLimit = 0) override;
 };
 
+/// Find the first executable file \ExeName, either in the user's PATH or,
+/// failing that, in the same directory as argv[0]. This allows us to find
+/// another LLVM tool if it is built in the same directory. If no executable is
+/// found, an error is returned.
+ErrorOr<std::string> FindProgramByName(const std::string &ExeName,
+                                       const char *Argv0, void *MainAddr);
+
 } // End llvm namespace
 
 #endif
diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp
index 2c5cce50132ffb306c1016ab1e20d4fe8c9c64d9..2862739fd73ac86a0ea97cbe7dfdfc69b7ce56a1 100644
--- a/tools/dsymutil/DwarfLinker.cpp
+++ b/tools/dsymutil/DwarfLinker.cpp
@@ -2417,15 +2417,20 @@ bool DwarfLinker::link(const DebugMap &Map) {
         warn(Err.message());
         continue;
       }
-      if (!Options.NoTimestamp &&
-          Stat.getLastModificationTime() !=
-              sys::TimePoint<>(LinkContext.DMO.getTimestamp())) {
-        // Not using the helper here as we can easily stream TimePoint<>.
-        WithColor::warning()
-            << "Timestamp mismatch for " << File << ": "
-            << Stat.getLastModificationTime() << " and "
-            << sys::TimePoint<>(LinkContext.DMO.getTimestamp()) << "\n";
-        continue;
+      if (!Options.NoTimestamp) {
+        // The modification can have sub-second precision so we need to cast
+        // away the extra precision that's not present in the debug map.
+        auto ModificationTime =
+            std::chrono::time_point_cast<std::chrono::seconds>(
+                Stat.getLastModificationTime());
+        if (ModificationTime != LinkContext.DMO.getTimestamp()) {
+          // Not using the helper here as we can easily stream TimePoint<>.
+          WithColor::warning()
+              << "Timestamp mismatch for " << File << ": "
+              << Stat.getLastModificationTime() << " and "
+              << sys::TimePoint<>(LinkContext.DMO.getTimestamp()) << "\n";
+          continue;
+        }
       }
 
       // Copy the module into the .swift_ast section.
diff --git a/tools/dsymutil/MachODebugMapParser.cpp b/tools/dsymutil/MachODebugMapParser.cpp
index 48155b402042d224565c720fcf8b9680523e8604..d696e1d4cc84e9f15ed3eeba6f17543a60e99877 100644
--- a/tools/dsymutil/MachODebugMapParser.cpp
+++ b/tools/dsymutil/MachODebugMapParser.cpp
@@ -511,14 +511,16 @@ void MachODebugMapParser::loadMainBinarySymbols(
     // Skip undefined and STAB entries.
     if ((Type == SymbolRef::ST_Debug) || (Type == SymbolRef::ST_Unknown))
       continue;
-    // The only symbols of interest are the global variables. These
-    // are the only ones that need to be queried because the address
-    // of common data won't be described in the debug map. All other
-    // addresses should be fetched for the debug map.
+    // In theory, the only symbols of interest are the global variables. These
+    // are the only ones that need to be queried because the address of common
+    // data won't be described in the debug map. All other addresses should be
+    // fetched for the debug map. In reality, by playing with 'ld -r' and
+    // export lists, you can get symbols described as N_GSYM in the debug map,
+    // but associated with a local symbol. Gather all the symbols, but prefer
+    // the global ones.
     uint8_t SymType =
         MainBinary.getSymbolTableEntry(Sym.getRawDataRefImpl()).n_type;
-    if (!(SymType & (MachO::N_EXT | MachO::N_PEXT)))
-      continue;
+    bool Extern = SymType & (MachO::N_EXT | MachO::N_PEXT);
     Expected<section_iterator> SectionOrErr = Sym.getSection();
     if (!SectionOrErr) {
       // TODO: Actually report errors helpfully.
@@ -538,7 +540,11 @@ void MachODebugMapParser::loadMainBinarySymbols(
     StringRef Name = *NameOrErr;
     if (Name.size() == 0 || Name[0] == '\0')
       continue;
-    MainBinarySymbolAddresses[Name] = Addr;
+    // Override only if the new key is global.
+    if (Extern)
+      MainBinarySymbolAddresses[Name] = Addr;
+    else
+      MainBinarySymbolAddresses.try_emplace(Name, Addr);
   }
 }
 
diff --git a/tools/dsymutil/MachOUtils.cpp b/tools/dsymutil/MachOUtils.cpp
index d213cfea5098a32c2c45e611ec80f0c39bcc138b..cac4ad89eb20c792e42b1eee862b189be748ca32 100644
--- a/tools/dsymutil/MachOUtils.cpp
+++ b/tools/dsymutil/MachOUtils.cpp
@@ -368,25 +368,37 @@ bool generateDsymCompanion(const DebugMap &DM, MCStreamer &MS,
   bool Is64Bit = Writer.is64Bit();
   MachO::symtab_command SymtabCmd = InputBinary.getSymtabLoadCommand();
 
-  // Get UUID.
+  // Compute the number of load commands we will need.
+  unsigned LoadCommandSize = 0;
+  unsigned NumLoadCommands = 0;
+
+  // Get LC_UUID and LC_BUILD_VERSION.
   MachO::uuid_command UUIDCmd;
+  SmallVector<MachO::build_version_command, 2> BuildVersionCmd;
   memset(&UUIDCmd, 0, sizeof(UUIDCmd));
-  UUIDCmd.cmd = MachO::LC_UUID;
-  UUIDCmd.cmdsize = sizeof(MachO::uuid_command);
   for (auto &LCI : InputBinary.load_commands()) {
-    if (LCI.C.cmd == MachO::LC_UUID) {
+    switch (LCI.C.cmd) {
+    case MachO::LC_UUID:
+      if (UUIDCmd.cmd)
+        return error("Binary contains more than one UUID");
       UUIDCmd = InputBinary.getUuidCommand(LCI);
+      ++NumLoadCommands;
+      LoadCommandSize += sizeof(UUIDCmd);
+      break;
+   case MachO::LC_BUILD_VERSION: {
+      MachO::build_version_command Cmd;
+      memset(&Cmd, 0, sizeof(Cmd));
+      Cmd = InputBinary.getBuildVersionLoadCommand(LCI);
+      ++NumLoadCommands;
+      LoadCommandSize += sizeof(Cmd);
+      // LLDB doesn't care about the build tools for now.
+      Cmd.ntools = 0;
+      BuildVersionCmd.push_back(Cmd);
+      break;
+    }
+    default:
       break;
     }
-  }
-
-  // Compute the number of load commands we will need.
-  unsigned LoadCommandSize = 0;
-  unsigned NumLoadCommands = 0;
-  // We will copy the UUID if there is one.
-  if (UUIDCmd.cmd != 0) {
-    ++NumLoadCommands;
-    LoadCommandSize += sizeof(MachO::uuid_command);
   }
 
   // If we have a valid symtab to copy, do it.
@@ -452,10 +464,18 @@ bool generateDsymCompanion(const DebugMap &DM, MCStreamer &MS,
   assert(OutFile.tell() == HeaderSize);
   if (UUIDCmd.cmd != 0) {
     Writer.W.write<uint32_t>(UUIDCmd.cmd);
-    Writer.W.write<uint32_t>(UUIDCmd.cmdsize);
+    Writer.W.write<uint32_t>(sizeof(UUIDCmd));
     OutFile.write(reinterpret_cast<const char *>(UUIDCmd.uuid), 16);
     assert(OutFile.tell() == HeaderSize + sizeof(UUIDCmd));
   }
+  for (auto Cmd : BuildVersionCmd) {
+    Writer.W.write<uint32_t>(Cmd.cmd);
+    Writer.W.write<uint32_t>(sizeof(Cmd));
+    Writer.W.write<uint32_t>(Cmd.platform);
+    Writer.W.write<uint32_t>(Cmd.minos);
+    Writer.W.write<uint32_t>(Cmd.sdk);
+    Writer.W.write<uint32_t>(Cmd.ntools);
+  }
 
   assert(SymtabCmd.cmd && "No symbol table.");
   uint64_t StringStart = SymtabStart + NumSyms * NListSize;
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index 71e5b72a40c802e6a29a1ca19f8a5e2edfd3f170..8ffa5877ba665c7dbfe9c9f0bdc7737a3c4b8a84 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -833,8 +833,10 @@ static std::unique_ptr<LTO> createLTO(IndexWriteCallback OnIndexWrite,
   Conf.Options.RelaxELFRelocations = false;
 
   // Toggle function/data sections.
-  Conf.Options.FunctionSections = SplitSections;
-  Conf.Options.DataSections = SplitSections;
+  if (FunctionSections.getNumOccurrences() == 0)
+    Conf.Options.FunctionSections = SplitSections;
+  if (DataSections.getNumOccurrences() == 0)
+    Conf.Options.DataSections = SplitSections;
 
   Conf.MAttrs = MAttrs;
   Conf.RelocModel = RelocationModel;
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index c3c57e2cdeed3cbfb2a22f463d9587fa2bd7c484..e4a7462a6352ad488a7717fd0167d52550307238 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -511,7 +511,7 @@ int main(int argc, char **argv, char * const *envp) {
     if (!ArOrErr) {
       std::string Buf;
       raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(ArOrErr.takeError(), OS, "");
+      logAllUnhandledErrors(ArOrErr.takeError(), OS);
       OS.flush();
       errs() << Buf;
       exit(1);
diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index 5b82e4634c187a78b38ec7b35eba9463b8a1d564..789a666cb41a2530c494af443d9843d29ee4d622 100644
--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -247,6 +247,7 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(CST_CODE, CE_CMP)
       STRINGIFY_CODE(CST_CODE, INLINEASM)
       STRINGIFY_CODE(CST_CODE, CE_SHUFVEC_EX)
+      STRINGIFY_CODE(CST_CODE, CE_UNOP)
     case bitc::CST_CODE_BLOCKADDRESS:    return "CST_CODE_BLOCKADDRESS";
       STRINGIFY_CODE(CST_CODE, DATA)
     }
@@ -267,6 +268,7 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FUNC_CODE, INST_BR)
       STRINGIFY_CODE(FUNC_CODE, INST_SWITCH)
       STRINGIFY_CODE(FUNC_CODE, INST_INVOKE)
+      STRINGIFY_CODE(FUNC_CODE, INST_UNOP)
       STRINGIFY_CODE(FUNC_CODE, INST_UNREACHABLE)
       STRINGIFY_CODE(FUNC_CODE, INST_CLEANUPRET)
       STRINGIFY_CODE(FUNC_CODE, INST_CATCHRET)
diff --git a/tools/llvm-config/CMakeLists.txt b/tools/llvm-config/CMakeLists.txt
index a0bd36c373153bf5f0d516327796c45423fc9dbd..a7db17386fb3bca91f9bceca49f551660e88aeef 100644
--- a/tools/llvm-config/CMakeLists.txt
+++ b/tools/llvm-config/CMakeLists.txt
@@ -29,12 +29,20 @@ string(REPLACE ";" " " SYSTEM_LIBS "${SYSTEM_LIBS}")
 # Fetch target specific compile options, e.g. RTTI option
 get_property(COMPILE_FLAGS TARGET llvm-config PROPERTY COMPILE_FLAGS)
 
+# The language standard potentially affects the ABI/API of LLVM, so we want
+# to make sure it is reported by llvm-config.
+# NOTE: We don't want to start extracting any random C/CXX flags that the
+# user may add that could affect the ABI.  We only want to extract flags
+# that have been added by the LLVM build system.
+string(REGEX MATCH "-std=[^ ]\+" LLVM_CXX_STD_FLAG ${CMAKE_CXX_FLAGS})
+string(REGEX MATCH "-std=[^ ]\+" LLVM_C_STD_FLAG ${CMAKE_C_FLAGS})
+
 # Use configure_file to create BuildVariables.inc.
 set(LLVM_SRC_ROOT ${LLVM_MAIN_SRC_DIR})
 set(LLVM_OBJ_ROOT ${LLVM_BINARY_DIR})
-set(LLVM_CPPFLAGS "${CMAKE_CPP_FLAGS} ${CMAKE_CPP_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}")
-set(LLVM_CFLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}")
-set(LLVM_CXXFLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${COMPILE_FLAGS} ${LLVM_DEFINITIONS}")
+set(LLVM_CPPFLAGS "${LLVM_DEFINITIONS}")
+set(LLVM_CFLAGS "${LLVM_C_STD_FLAG} ${LLVM_DEFINITIONS}")
+set(LLVM_CXXFLAGS "${LLVM_CXX_STD_FLAG} ${COMPILE_FLAGS} ${LLVM_DEFINITIONS}")
 set(LLVM_BUILD_SYSTEM cmake)
 set(LLVM_HAS_RTTI ${LLVM_CONFIG_HAS_RTTI})
 set(LLVM_DYLIB_VERSION "${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX}")
diff --git a/tools/llvm-cov/CMakeLists.txt b/tools/llvm-cov/CMakeLists.txt
index d0416b06f9cde3620594d618ab5b8ca755bec133..c3afec86cba20dc3ddd055bad0acb2bab23b614d 100644
--- a/tools/llvm-cov/CMakeLists.txt
+++ b/tools/llvm-cov/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_tool(llvm-cov
   gcov.cpp
   CodeCoverage.cpp
   CoverageExporterJson.cpp
+  CoverageExporterLcov.cpp
   CoverageFilters.cpp
   CoverageReport.cpp
   CoverageSummaryInfo.cpp
diff --git a/tools/llvm-cov/CodeCoverage.cpp b/tools/llvm-cov/CodeCoverage.cpp
index 19b2618b55203d87fd92053e10e3c1fa8eefc677..1dc6eedecc11c48e4b2d2d022de8a30ad4ae84cb 100644
--- a/tools/llvm-cov/CodeCoverage.cpp
+++ b/tools/llvm-cov/CodeCoverage.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CoverageExporterJson.h"
+#include "CoverageExporterLcov.h"
 #include "CoverageFilters.h"
 #include "CoverageReport.h"
 #include "CoverageSummaryInfo.h"
@@ -566,7 +567,9 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       cl::values(clEnumValN(CoverageViewOptions::OutputFormat::Text, "text",
                             "Text output"),
                  clEnumValN(CoverageViewOptions::OutputFormat::HTML, "html",
-                            "HTML output")),
+                            "HTML output"),
+                 clEnumValN(CoverageViewOptions::OutputFormat::Lcov, "lcov",
+                            "lcov tracefile output")),
       cl::init(CoverageViewOptions::OutputFormat::Text));
 
   cl::opt<std::string> PathRemap(
@@ -674,6 +677,11 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
         errs() << "Color output cannot be disabled when generating html.\n";
       ViewOpts.Colors = true;
       break;
+    case CoverageViewOptions::OutputFormat::Lcov:
+      if (UseColor == cl::BOU_TRUE)
+        errs() << "Color output cannot be enabled when generating lcov.\n";
+      ViewOpts.Colors = false;
+      break;
     }
 
     // If path-equivalence was given and is a comma seperated pair then set
@@ -833,6 +841,11 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
   if (Err)
     return Err;
 
+  if (ViewOpts.Format == CoverageViewOptions::OutputFormat::Lcov) {
+    error("Lcov format should be used with 'llvm-cov export'.");
+    return 1;
+  }
+
   ViewOpts.ShowLineNumbers = true;
   ViewOpts.ShowLineStats = ShowLineExecutionCounts.getNumOccurrences() != 0 ||
                            !ShowRegions || ShowBestLineRegionsCounts;
@@ -964,6 +977,9 @@ int CodeCoverageTool::doReport(int argc, const char **argv,
   if (ViewOpts.Format == CoverageViewOptions::OutputFormat::HTML) {
     error("HTML output for summary reports is not yet supported.");
     return 1;
+  } else if (ViewOpts.Format == CoverageViewOptions::OutputFormat::Lcov) {
+    error("Lcov format should be used with 'llvm-cov export'.");
+    return 1;
   }
 
   auto Coverage = load();
@@ -995,8 +1011,10 @@ int CodeCoverageTool::doExport(int argc, const char **argv,
   if (Err)
     return Err;
 
-  if (ViewOpts.Format != CoverageViewOptions::OutputFormat::Text) {
-    error("Coverage data can only be exported as textual JSON.");
+  if (ViewOpts.Format != CoverageViewOptions::OutputFormat::Text &&
+      ViewOpts.Format != CoverageViewOptions::OutputFormat::Lcov) {
+    error("Coverage data can only be exported as textual JSON or an "
+          "lcov tracefile.");
     return 1;
   }
 
@@ -1006,12 +1024,27 @@ int CodeCoverageTool::doExport(int argc, const char **argv,
     return 1;
   }
 
-  auto Exporter = CoverageExporterJson(*Coverage.get(), ViewOpts, outs());
+  std::unique_ptr<CoverageExporter> Exporter;
+
+  switch (ViewOpts.Format) {
+  case CoverageViewOptions::OutputFormat::Text:
+    Exporter = llvm::make_unique<CoverageExporterJson>(*Coverage.get(),
+                                                       ViewOpts, outs());
+    break;
+  case CoverageViewOptions::OutputFormat::HTML:
+    // Unreachable because we should have gracefully terminated with an error
+    // above.
+    llvm_unreachable("Export in HTML is not supported!");
+  case CoverageViewOptions::OutputFormat::Lcov:
+    Exporter = llvm::make_unique<CoverageExporterLcov>(*Coverage.get(),
+                                                       ViewOpts, outs());
+    break;
+  }
 
   if (SourceFiles.empty())
-    Exporter.renderRoot(IgnoreFilenameFilters);
+    Exporter->renderRoot(IgnoreFilenameFilters);
   else
-    Exporter.renderRoot(SourceFiles);
+    Exporter->renderRoot(SourceFiles);
 
   return 0;
 }
diff --git a/tools/llvm-cov/CoverageExporterLcov.cpp b/tools/llvm-cov/CoverageExporterLcov.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d149ba1a4c87423834885dc0fc5cc04b4ac21667
--- /dev/null
+++ b/tools/llvm-cov/CoverageExporterLcov.cpp
@@ -0,0 +1,125 @@
+//===- CoverageExporterLcov.cpp - Code coverage export --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements export of code coverage data to lcov trace file format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// The trace file code coverage export follows the following format (see also
+// https://linux.die.net/man/1/geninfo). Each quoted string appears on its own
+// line; the indentation shown here is only for documentation purposes.
+//
+// - for each source file:
+//   - "SF:<absolute path to source file>"
+//   - for each function:
+//     - "FN:<line number of function start>,<function name>"
+//   - for each function:
+//     - "FNDA:<execution count>,<function name>"
+//   - "FNF:<number of functions found>"
+//   - "FNH:<number of functions hit>"
+//   - for each instrumented line:
+//     - "DA:<line number>,<execution count>[,<checksum>]
+//   - "LH:<number of lines with non-zero execution count>"
+//   - "LF:<nubmer of instrumented lines>"
+//   - "end_of_record"
+//
+// If the user is exporting summary information only, then the FN, FNDA, and DA
+// lines will not be present.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CoverageExporterLcov.h"
+#include "CoverageReport.h"
+
+using namespace llvm;
+
+namespace {
+
+void renderFunctionSummary(raw_ostream &OS,
+                           const FileCoverageSummary &Summary) {
+  OS << "FNF:" << Summary.FunctionCoverage.getNumFunctions() << '\n'
+     << "FNH:" << Summary.FunctionCoverage.getExecuted() << '\n';
+}
+
+void renderFunctions(
+    raw_ostream &OS,
+    const iterator_range<coverage::FunctionRecordIterator> &Functions) {
+  for (const auto &F : Functions) {
+    auto StartLine = F.CountedRegions.front().LineStart;
+    OS << "FN:" << StartLine << ',' << F.Name << '\n';
+  }
+  for (const auto &F : Functions)
+    OS << "FNDA:" << F.ExecutionCount << ',' << F.Name << '\n';
+}
+
+void renderLineExecutionCounts(raw_ostream &OS,
+                               const coverage::CoverageData &FileCoverage) {
+  coverage::LineCoverageIterator LCI{FileCoverage, 1};
+  coverage::LineCoverageIterator LCIEnd = LCI.getEnd();
+  for (; LCI != LCIEnd; ++LCI) {
+    const coverage::LineCoverageStats &LCS = *LCI;
+    if (LCS.isMapped()) {
+      OS << "DA:" << LCS.getLine() << ',' << LCS.getExecutionCount() << '\n';
+    }
+  }
+}
+
+void renderLineSummary(raw_ostream &OS, const FileCoverageSummary &Summary) {
+  OS << "LF:" << Summary.LineCoverage.getNumLines() << '\n'
+     << "LH:" << Summary.LineCoverage.getCovered() << '\n';
+}
+
+void renderFile(raw_ostream &OS, const coverage::CoverageMapping &Coverage,
+                const std::string &Filename,
+                const FileCoverageSummary &FileReport, bool ExportSummaryOnly) {
+  OS << "SF:" << Filename << '\n';
+
+  if (!ExportSummaryOnly) {
+    renderFunctions(OS, Coverage.getCoveredFunctions());
+  }
+  renderFunctionSummary(OS, FileReport);
+
+  if (!ExportSummaryOnly) {
+    // Calculate and render detailed coverage information for given file.
+    auto FileCoverage = Coverage.getCoverageForFile(Filename);
+    renderLineExecutionCounts(OS, FileCoverage);
+  }
+  renderLineSummary(OS, FileReport);
+
+  OS << "end_of_record\n";
+}
+
+void renderFiles(raw_ostream &OS, const coverage::CoverageMapping &Coverage,
+                 ArrayRef<std::string> SourceFiles,
+                 ArrayRef<FileCoverageSummary> FileReports,
+                 bool ExportSummaryOnly) {
+  for (unsigned I = 0, E = SourceFiles.size(); I < E; ++I)
+    renderFile(OS, Coverage, SourceFiles[I], FileReports[I], ExportSummaryOnly);
+}
+
+} // end anonymous namespace
+
+void CoverageExporterLcov::renderRoot(const CoverageFilters &IgnoreFilters) {
+  std::vector<std::string> SourceFiles;
+  for (StringRef SF : Coverage.getUniqueSourceFiles()) {
+    if (!IgnoreFilters.matchesFilename(SF))
+      SourceFiles.emplace_back(SF);
+  }
+  renderRoot(SourceFiles);
+}
+
+void CoverageExporterLcov::renderRoot(ArrayRef<std::string> SourceFiles) {
+  FileCoverageSummary Totals = FileCoverageSummary("Totals");
+  auto FileReports = CoverageReport::prepareFileReports(Coverage, Totals,
+                                                        SourceFiles, Options);
+  renderFiles(OS, Coverage, SourceFiles, FileReports,
+              Options.ExportSummaryOnly);
+}
diff --git a/tools/llvm-cov/CoverageExporterLcov.h b/tools/llvm-cov/CoverageExporterLcov.h
new file mode 100644
index 0000000000000000000000000000000000000000..539b2dacd38424d86e9a0da745d0a3e0f12624c5
--- /dev/null
+++ b/tools/llvm-cov/CoverageExporterLcov.h
@@ -0,0 +1,36 @@
+//===- CoverageExporterLcov.h - Code coverage lcov exporter ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements a code coverage exporter for lcov trace file format.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_COVERAGEEXPORTERLCOV_H
+#define LLVM_COV_COVERAGEEXPORTERLCOV_H
+
+#include "CoverageExporter.h"
+
+namespace llvm {
+
+class CoverageExporterLcov : public CoverageExporter {
+public:
+  CoverageExporterLcov(const coverage::CoverageMapping &CoverageMapping,
+                       const CoverageViewOptions &Options, raw_ostream &OS)
+      : CoverageExporter(CoverageMapping, Options, OS) {}
+
+  /// Render the CoverageMapping object.
+  void renderRoot(const CoverageFilters &IgnoreFilters) override;
+
+  /// Render the CoverageMapping object for specified source files.
+  void renderRoot(ArrayRef<std::string> SourceFiles) override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_COV_COVERAGEEXPORTERLCOV_H
diff --git a/tools/llvm-cov/CoverageViewOptions.h b/tools/llvm-cov/CoverageViewOptions.h
index 20085a957bb5b61f481e33cdeeecb2e328716dbf..c8a47286002794aa509582031c2fdc1e1708b937 100644
--- a/tools/llvm-cov/CoverageViewOptions.h
+++ b/tools/llvm-cov/CoverageViewOptions.h
@@ -20,7 +20,8 @@ namespace llvm {
 struct CoverageViewOptions {
   enum class OutputFormat {
     Text,
-    HTML
+    HTML,
+    Lcov
   };
 
   bool Debug;
diff --git a/tools/llvm-cov/SourceCoverageView.cpp b/tools/llvm-cov/SourceCoverageView.cpp
index a5a8fa9a481497a4a0093836ba742e0d56da4072..775322b2de2fef7bab9100130c754f20db7edf08 100644
--- a/tools/llvm-cov/SourceCoverageView.cpp
+++ b/tools/llvm-cov/SourceCoverageView.cpp
@@ -80,6 +80,10 @@ CoveragePrinter::create(const CoverageViewOptions &Opts) {
     return llvm::make_unique<CoveragePrinterText>(Opts);
   case CoverageViewOptions::OutputFormat::HTML:
     return llvm::make_unique<CoveragePrinterHTML>(Opts);
+  case CoverageViewOptions::OutputFormat::Lcov:
+    // Unreachable because CodeCoverage.cpp should terminate with an error
+    // before we get here.
+    llvm_unreachable("Lcov format is not supported!");
   }
   llvm_unreachable("Unknown coverage output format!");
 }
@@ -143,6 +147,10 @@ SourceCoverageView::create(StringRef SourceName, const MemoryBuffer &File,
   case CoverageViewOptions::OutputFormat::HTML:
     return llvm::make_unique<SourceCoverageViewHTML>(
         SourceName, File, Options, std::move(CoverageInfo));
+  case CoverageViewOptions::OutputFormat::Lcov:
+    // Unreachable because CodeCoverage.cpp should terminate with an error
+    // before we get here.
+    llvm_unreachable("Lcov format is not supported!");
   }
   llvm_unreachable("Unknown coverage output format!");
 }
diff --git a/tools/llvm-cov/TestingSupport.cpp b/tools/llvm-cov/TestingSupport.cpp
index e07abdbd17f1417ee5edb35fd53c8cc443cd697c..16a1c266529968ab0cdf5648a13a246b125df528 100644
--- a/tools/llvm-cov/TestingSupport.cpp
+++ b/tools/llvm-cov/TestingSupport.cpp
@@ -33,7 +33,7 @@ int convertForTestingMain(int argc, const char *argv[]) {
   if (!ObjErr) {
     std::string Buf;
     raw_string_ostream OS(Buf);
-    logAllUnhandledErrors(ObjErr.takeError(), OS, "");
+    logAllUnhandledErrors(ObjErr.takeError(), OS);
     OS.flush();
     errs() << "error: " << Buf;
     return 1;
diff --git a/tools/llvm-cxxdump/llvm-cxxdump.cpp b/tools/llvm-cxxdump/llvm-cxxdump.cpp
index 09e40d9b0db75751264e3284a1abc5570c17c4d3..7594066a395dd79c0d77cda047f463ef8f5eb0c7 100644
--- a/tools/llvm-cxxdump/llvm-cxxdump.cpp
+++ b/tools/llvm-cxxdump/llvm-cxxdump.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <map>
 #include <string>
@@ -43,17 +44,18 @@ namespace llvm {
 static void error(std::error_code EC) {
   if (!EC)
     return;
-  outs() << "\nError reading file: " << EC.message() << ".\n";
+  WithColor::error(outs(), "") << "reading file: " << EC.message() << ".\n";
   outs().flush();
   exit(1);
 }
 
 static void error(Error Err) {
-  if (Err) {
-    logAllUnhandledErrors(std::move(Err), outs(), "Error reading file: ");
-    outs().flush();
-    exit(1);
-  }
+  if (!Err)
+    return;
+  logAllUnhandledErrors(std::move(Err), WithColor::error(outs()),
+                        "reading file: ");
+  outs().flush();
+  exit(1);
 }
 
 } // namespace llvm
@@ -61,7 +63,7 @@ static void error(Error Err) {
 static void reportError(StringRef Input, StringRef Message) {
   if (Input == "-")
     Input = "<stdin>";
-  errs() << Input << ": " << Message << "\n";
+  WithColor::error(errs(), Input) << Message << "\n";
   errs().flush();
   exit(1);
 }
@@ -496,7 +498,7 @@ static void dumpArchive(const Archive *Arc) {
       if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) {
         std::string Buf;
         raw_string_ostream OS(Buf);
-        logAllUnhandledErrors(std::move(E), OS, "");
+        logAllUnhandledErrors(std::move(E), OS);
         OS.flush();
         reportError(Arc->getFileName(), Buf);
       }
diff --git a/tools/llvm-dwarfdump/Statistics.cpp b/tools/llvm-dwarfdump/Statistics.cpp
index 4119dbf1b3ce5d1f731437a1f532c614941ef32a..5fe7e8b4615b8f5b3659ada76310353ea3c64a79 100644
--- a/tools/llvm-dwarfdump/Statistics.cpp
+++ b/tools/llvm-dwarfdump/Statistics.cpp
@@ -25,7 +25,7 @@ struct PerFunctionStats {
   bool IsFunction = false;
 };
 
-/// Holds accumulated global statistics about local variables.
+/// Holds accumulated global statistics about DIEs.
 struct GlobalStats {
   /// Total number of PC range bytes covered by DW_AT_locations.
   unsigned ScopeBytesCovered = 0;
@@ -34,6 +34,13 @@ struct GlobalStats {
   unsigned ScopeBytesFromFirstDefinition = 0;
   /// Total number of call site entries (DW_TAG_call_site).
   unsigned CallSiteEntries = 0;
+  /// Total byte size of concrete functions. This byte size includes
+  /// inline functions contained in the concrete functions.
+  uint64_t FunctionSize = 0;
+  /// Total byte size of inlined functions. This is the total number of bytes
+  /// for the top inline functions within concrete functions. This can help
+  /// tune the inline settings when compiling to match user expectations.
+  uint64_t InlineFunctionSize = 0;
 };
 
 /// Extract the low pc from a Die.
@@ -53,6 +60,7 @@ static uint64_t getLowPC(DWARFDie Die) {
 static void collectStatsForDie(DWARFDie Die, std::string FnPrefix,
                                std::string VarPrefix, uint64_t ScopeLowPC,
                                uint64_t BytesInScope,
+                               uint32_t InlineDepth,
                                StringMap<PerFunctionStats> &FnStatMap,
                                GlobalStats &GlobalStats) {
   bool HasLoc = false;
@@ -137,24 +145,27 @@ static void collectStatsForDie(DWARFDie Die, std::string FnPrefix,
 static void collectStatsRecursive(DWARFDie Die, std::string FnPrefix,
                                   std::string VarPrefix, uint64_t ScopeLowPC,
                                   uint64_t BytesInScope,
+                                  uint32_t InlineDepth,
                                   StringMap<PerFunctionStats> &FnStatMap,
                                   GlobalStats &GlobalStats) {
   // Handle any kind of lexical scope.
-  if (Die.getTag() == dwarf::DW_TAG_subprogram ||
-      Die.getTag() == dwarf::DW_TAG_inlined_subroutine ||
-      Die.getTag() == dwarf::DW_TAG_lexical_block) {
+  const dwarf::Tag Tag = Die.getTag();
+  const bool IsFunction = Tag == dwarf::DW_TAG_subprogram;
+  const bool IsBlock = Tag == dwarf::DW_TAG_lexical_block;
+  const bool IsInlinedFunction = Tag == dwarf::DW_TAG_inlined_subroutine;
+  if (IsFunction || IsInlinedFunction || IsBlock) {
 
     // Reset VarPrefix when entering a new function.
     if (Die.getTag() == dwarf::DW_TAG_subprogram ||
         Die.getTag() == dwarf::DW_TAG_inlined_subroutine)
       VarPrefix = "v";
-  
+
     // Ignore forward declarations.
     if (Die.find(dwarf::DW_AT_declaration))
       return;
 
     // Count the function.
-    if (Die.getTag() != dwarf::DW_TAG_lexical_block) {
+    if (!IsBlock) {
       StringRef Name = Die.getName(DINameKind::LinkageName);
       if (Name.empty())
         Name = Die.getName(DINameKind::ShortName);
@@ -174,21 +185,32 @@ static void collectStatsRecursive(DWARFDie Die, std::string FnPrefix,
       llvm::consumeError(RangesOrError.takeError());
       return;
     }
-       
+
     auto Ranges = RangesOrError.get();
     uint64_t BytesInThisScope = 0;
     for (auto Range : Ranges)
       BytesInThisScope += Range.HighPC - Range.LowPC;
     ScopeLowPC = getLowPC(Die);
 
-    if (BytesInThisScope)
+    if (BytesInThisScope) {
       BytesInScope = BytesInThisScope;
+      if (IsFunction)
+        GlobalStats.FunctionSize += BytesInThisScope;
+      else if (IsInlinedFunction && InlineDepth == 0)
+        GlobalStats.InlineFunctionSize += BytesInThisScope;
+    }
   } else {
     // Not a scope, visit the Die itself. It could be a variable.
     collectStatsForDie(Die, FnPrefix, VarPrefix, ScopeLowPC, BytesInScope,
-                       FnStatMap, GlobalStats);
+                       InlineDepth, FnStatMap, GlobalStats);
   }
 
+  // Set InlineDepth correctly for child recursion
+  if (IsFunction)
+    InlineDepth = 0;
+  else if (IsInlinedFunction)
+    ++InlineDepth;
+
   // Traverse children.
   unsigned LexicalBlockIndex = 0;
   DWARFDie Child = Die.getFirstChild();
@@ -198,7 +220,7 @@ static void collectStatsRecursive(DWARFDie Die, std::string FnPrefix,
       ChildVarPrefix += toHex(LexicalBlockIndex++) + '.';
 
     collectStatsRecursive(Child, FnPrefix, ChildVarPrefix, ScopeLowPC,
-                          BytesInScope, FnStatMap, GlobalStats);
+                          BytesInScope, InlineDepth, FnStatMap, GlobalStats);
     Child = Child.getSibling();
   }
 }
@@ -231,7 +253,7 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
   StringMap<PerFunctionStats> Statistics;
   for (const auto &CU : static_cast<DWARFContext *>(&DICtx)->compile_units())
     if (DWARFDie CUDie = CU->getUnitDIE(false))
-      collectStatsRecursive(CUDie, "/", "g", 0, 0, Statistics, GlobalStats);
+      collectStatsRecursive(CUDie, "/", "g", 0, 0, 0, Statistics, GlobalStats);
 
   /// The version number should be increased every time the algorithm is changed
   /// (including bug fixes). New metrics may be added without increasing the
@@ -271,6 +293,8 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
   printDatum(OS, "scope bytes total",
              GlobalStats.ScopeBytesFromFirstDefinition);
   printDatum(OS, "scope bytes covered", GlobalStats.ScopeBytesCovered);
+  printDatum(OS, "total function size", GlobalStats.FunctionSize);
+  printDatum(OS, "total inlined function size", GlobalStats.InlineFunctionSize);
   OS << "}\n";
   LLVM_DEBUG(
       llvm::dbgs() << "Total Availability: "
diff --git a/tools/llvm-exegesis/lib/AArch64/Target.cpp b/tools/llvm-exegesis/lib/AArch64/Target.cpp
index 0197420f43364ba343d610fa06a3e724a721d943..d6b6cff54727dd952eca7a69231f3af4e365c2e2 100644
--- a/tools/llvm-exegesis/lib/AArch64/Target.cpp
+++ b/tools/llvm-exegesis/lib/AArch64/Target.cpp
@@ -14,22 +14,6 @@
 namespace llvm {
 namespace exegesis {
 
-namespace {
-
-class AArch64LatencyBenchmarkRunner : public LatencyBenchmarkRunner {
-public:
-  AArch64LatencyBenchmarkRunner(const LLVMState &State)
-      : LatencyBenchmarkRunner(State) {}
-
-private:
-  const char *getCounterName() const override {
-    // All AArch64 subtargets have CPU_CYCLES as the cycle counter name
-    return "CPU_CYCLES";
-  }
-};
-
-namespace {
-
 static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
   switch (RegBitWidth) {
   case 32:
@@ -50,11 +34,13 @@ static llvm::MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
       .addImm(Value.getZExtValue());
 }
 
-} // namespace
+#include "AArch64GenExegesis.inc"
+
+namespace {
 
 class ExegesisAArch64Target : public ExegesisTarget {
 public:
-  ExegesisAArch64Target() : ExegesisTarget({}) {}
+  ExegesisAArch64Target() : ExegesisTarget(AArch64CpuPfmCounters) {}
 
 private:
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
@@ -76,11 +62,6 @@ private:
     // Function return is a pseudo-instruction that needs to be expanded
     PM.add(llvm::createAArch64ExpandPseudoPass());
   }
-
-  std::unique_ptr<BenchmarkRunner>
-  createLatencyBenchmarkRunner(const LLVMState &State) const override {
-    return llvm::make_unique<AArch64LatencyBenchmarkRunner>(State);
-  }
 };
 
 } // namespace
diff --git a/tools/llvm-exegesis/lib/Analysis.cpp b/tools/llvm-exegesis/lib/Analysis.cpp
index 0dd6bcbd46619c73cad0947d72c6e04e5ceae953..0a91679fe1d1c94a2f6b952bfc393f8b08385391 100644
--- a/tools/llvm-exegesis/lib/Analysis.cpp
+++ b/tools/llvm-exegesis/lib/Analysis.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/FormatVariadic.h"
+#include <limits>
 #include <unordered_set>
 #include <vector>
 
@@ -95,7 +96,21 @@ writeClusterId(llvm::raw_ostream &OS,
 
 template <EscapeTag Tag>
 static void writeMeasurementValue(llvm::raw_ostream &OS, const double Value) {
-  writeEscaped<Tag>(OS, llvm::formatv("{0:F}", Value).str());
+  // Given Value, if we wanted to serialize it to a string,
+  // how many base-10 digits will we need to store, max?
+  static constexpr auto MaxDigitCount =
+      std::numeric_limits<decltype(Value)>::max_digits10;
+  // Also, we will need a decimal separator.
+  static constexpr auto DecimalSeparatorLen = 1; // '.' e.g.
+  // So how long of a string will the serialization produce, max?
+  static constexpr auto SerializationLen = MaxDigitCount + DecimalSeparatorLen;
+
+  // WARNING: when changing the format, also adjust the small-size estimate ^.
+  static constexpr StringLiteral SimpleFloatFormat = StringLiteral("{0:F}");
+
+  writeEscaped<Tag>(
+      OS,
+      llvm::formatv(SimpleFloatFormat.data(), Value).sstr<SerializationLen>());
 }
 
 template <typename EscapeTag, EscapeTag Tag>
@@ -114,13 +129,11 @@ void Analysis::writeSnippet(llvm::raw_ostream &OS,
       writeEscaped<Tag>(OS, "[error decoding asm snippet]");
       return;
     }
-    Lines.emplace_back();
-    std::string &Line = Lines.back();
-    llvm::raw_string_ostream OSS(Line);
+    llvm::SmallString<128> InstPrinterStr; // FIXME: magic number.
+    llvm::raw_svector_ostream OSS(InstPrinterStr);
     InstPrinter_->printInst(&MI, OSS, "", *SubtargetInfo_);
     Bytes = Bytes.drop_front(MISize);
-    OSS.flush();
-    Line = llvm::StringRef(Line).trim().str();
+    Lines.emplace_back(llvm::StringRef(InstPrinterStr).trim());
   }
   writeEscaped<Tag>(OS, llvm::join(Lines, Separator));
 }
diff --git a/tools/llvm-exegesis/lib/CMakeLists.txt b/tools/llvm-exegesis/lib/CMakeLists.txt
index 8fdf8b997e0f2c72569107ba4d5c0b107aa45b1a..ef85056db0a4d2cb0bd2fd26a406a17462df5611 100644
--- a/tools/llvm-exegesis/lib/CMakeLists.txt
+++ b/tools/llvm-exegesis/lib/CMakeLists.txt
@@ -8,6 +8,10 @@ if (LLVM_TARGETS_TO_BUILD MATCHES "AArch64")
   add_subdirectory(AArch64)
   set(TARGETS_TO_APPEND "${TARGETS_TO_APPEND} AArch64")
 endif()
+if (LLVM_TARGETS_TO_BUILD MATCHES "PowerPC")
+  add_subdirectory(PowerPC)
+  set(TARGETS_TO_APPEND "${TARGETS_TO_APPEND} PowerPC")
+endif()
 
 set(LLVM_EXEGESIS_TARGETS "${LLVM_EXEGESIS_TARGETS} ${TARGETS_TO_APPEND}" PARENT_SCOPE)
 
diff --git a/tools/llvm-exegesis/lib/Clustering.cpp b/tools/llvm-exegesis/lib/Clustering.cpp
index 761629167bb65fdc63ca9f5302c5dfbbc950c282..b2cd97c12eb0c82b9d9e81c68b249f7c00192923 100644
--- a/tools/llvm-exegesis/lib/Clustering.cpp
+++ b/tools/llvm-exegesis/lib/Clustering.cpp
@@ -8,8 +8,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "Clustering.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include <string>
-#include <unordered_set>
 
 namespace llvm {
 namespace exegesis {
@@ -32,9 +33,10 @@ namespace exegesis {
 
 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not
 // including Q).
-std::vector<size_t>
-InstructionBenchmarkClustering::rangeQuery(const size_t Q) const {
-  std::vector<size_t> Neighbors;
+void InstructionBenchmarkClustering::rangeQuery(
+    const size_t Q, std::vector<size_t> &Neighbors) const {
+  Neighbors.clear();
+  Neighbors.reserve(Points_.size() - 1); // The Q itself isn't a neighbor.
   const auto &QMeasurements = Points_[Q].Measurements;
   for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
     if (P == Q)
@@ -46,18 +48,6 @@ InstructionBenchmarkClustering::rangeQuery(const size_t Q) const {
       Neighbors.push_back(P);
     }
   }
-  return Neighbors;
-}
-
-bool InstructionBenchmarkClustering::isNeighbour(
-    const std::vector<BenchmarkMeasure> &P,
-    const std::vector<BenchmarkMeasure> &Q) const {
-  double DistanceSquared = 0.0;
-  for (size_t I = 0, E = P.size(); I < E; ++I) {
-    const auto Diff = P[I].PerInstructionValue - Q[I].PerInstructionValue;
-    DistanceSquared += Diff * Diff;
-  }
-  return DistanceSquared <= EpsilonSquared_;
 }
 
 InstructionBenchmarkClustering::InstructionBenchmarkClustering(
@@ -102,10 +92,11 @@ llvm::Error InstructionBenchmarkClustering::validateAndSetup() {
 }
 
 void InstructionBenchmarkClustering::dbScan(const size_t MinPts) {
+  std::vector<size_t> Neighbors; // Persistent buffer to avoid allocs.
   for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
     if (!ClusterIdForPoint_[P].isUndef())
       continue; // Previously processed in inner loop.
-    const auto Neighbors = rangeQuery(P);
+    rangeQuery(P, Neighbors);
     if (Neighbors.size() + 1 < MinPts) { // Density check.
       // The region around P is not dense enough to create a new cluster, mark
       // as noise for now.
@@ -120,11 +111,12 @@ void InstructionBenchmarkClustering::dbScan(const size_t MinPts) {
     CurrentCluster.PointIndices.push_back(P);
 
     // Process P's neighbors.
-    std::unordered_set<size_t> ToProcess(Neighbors.begin(), Neighbors.end());
+    llvm::SetVector<size_t, std::deque<size_t>> ToProcess;
+    ToProcess.insert(Neighbors.begin(), Neighbors.end());
     while (!ToProcess.empty()) {
       // Retrieve a point from the set.
       const size_t Q = *ToProcess.begin();
-      ToProcess.erase(Q);
+      ToProcess.erase(ToProcess.begin());
 
       if (ClusterIdForPoint_[Q].isNoise()) {
         // Change noise point to border point.
@@ -139,12 +131,14 @@ void InstructionBenchmarkClustering::dbScan(const size_t MinPts) {
       ClusterIdForPoint_[Q] = CurrentCluster.Id;
       CurrentCluster.PointIndices.push_back(Q);
       // And extend to the neighbors of Q if the region is dense enough.
-      const auto Neighbors = rangeQuery(Q);
+      rangeQuery(Q, Neighbors);
       if (Neighbors.size() + 1 >= MinPts) {
         ToProcess.insert(Neighbors.begin(), Neighbors.end());
       }
     }
   }
+  // assert(Neighbors.capacity() == (Points_.size() - 1));
+  // ^ True, but it is not quaranteed to be true in all the cases.
 
   // Add noisy points to noise cluster.
   for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
diff --git a/tools/llvm-exegesis/lib/Clustering.h b/tools/llvm-exegesis/lib/Clustering.h
index 9dc0adffb1e5e6fac949e53f3e01bec83bdddfb7..932cbba2038e4cfeb3196a67e44dfcb764921f09 100644
--- a/tools/llvm-exegesis/lib/Clustering.h
+++ b/tools/llvm-exegesis/lib/Clustering.h
@@ -90,14 +90,21 @@ public:
 
   // Returns true if the given point is within a distance Epsilon of each other.
   bool isNeighbour(const std::vector<BenchmarkMeasure> &P,
-                   const std::vector<BenchmarkMeasure> &Q) const;
+                   const std::vector<BenchmarkMeasure> &Q) const {
+    double DistanceSquared = 0.0;
+    for (size_t I = 0, E = P.size(); I < E; ++I) {
+      const auto Diff = P[I].PerInstructionValue - Q[I].PerInstructionValue;
+      DistanceSquared += Diff * Diff;
+    }
+    return DistanceSquared <= EpsilonSquared_;
+  }
 
 private:
   InstructionBenchmarkClustering(
       const std::vector<InstructionBenchmark> &Points, double EpsilonSquared);
   llvm::Error validateAndSetup();
   void dbScan(size_t MinPts);
-  std::vector<size_t> rangeQuery(size_t Q) const;
+  void rangeQuery(size_t Q, std::vector<size_t> &Scratchpad) const;
 
   const std::vector<InstructionBenchmark> &Points_;
   const double EpsilonSquared_;
diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index 3d18e37f4c3806de4b45d490a53aaf7fb474fd41..9a56b275088bc7c20dc2c4d3024443edf2c13930 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -166,13 +166,6 @@ LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
   return std::move(Results);
 }
 
-const char *LatencyBenchmarkRunner::getCounterName() const {
-  const char *CounterName = State.getPfmCounters().CycleCounter;
-  if (!CounterName)
-    llvm::report_fatal_error("sched model does not define a cycle counter");
-  return CounterName;
-}
-
 LatencyBenchmarkRunner::~LatencyBenchmarkRunner() = default;
 
 llvm::Expected<std::vector<BenchmarkMeasure>>
@@ -182,9 +175,9 @@ LatencyBenchmarkRunner::runMeasurements(
   // measure several times and take the minimum value.
   constexpr const int NumMeasurements = 30;
   int64_t MinValue = std::numeric_limits<int64_t>::max();
-  const char *CounterName = getCounterName();
+  const char *CounterName = State.getPfmCounters().CycleCounter;
   if (!CounterName)
-    llvm::report_fatal_error("could not determine cycle counter name");
+    llvm::report_fatal_error("sched model does not define a cycle counter");
   for (size_t I = 0; I < NumMeasurements; ++I) {
     auto ExpectedCounterValue = Executor.runAndMeasure(CounterName);
     if (!ExpectedCounterValue)
diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h
index fef72cde5a6a224c5fdb56a4581dea03299ade07..513c5f3ce780d37ffde485efecae156d14c938c9 100644
--- a/tools/llvm-exegesis/lib/Latency.h
+++ b/tools/llvm-exegesis/lib/Latency.h
@@ -40,8 +40,6 @@ public:
 private:
   llvm::Expected<std::vector<BenchmarkMeasure>>
   runMeasurements(const FunctionExecutor &Executor) const override;
-
-  virtual const char *getCounterName() const;
 };
 } // namespace exegesis
 } // namespace llvm
diff --git a/tools/llvm-exegesis/lib/LlvmState.cpp b/tools/llvm-exegesis/lib/LlvmState.cpp
index b5580c83cf5c2952c7b8964c35a8597bf1359927..cc01f1fa4ca5777c2dd2860ea20bb8b7cc71b04d 100644
--- a/tools/llvm-exegesis/lib/LlvmState.cpp
+++ b/tools/llvm-exegesis/lib/LlvmState.cpp
@@ -22,15 +22,16 @@
 namespace llvm {
 namespace exegesis {
 
-LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) {
+LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName,
+                     const std::string &Features) {
   std::string Error;
   const llvm::Target *const TheTarget =
       llvm::TargetRegistry::lookupTarget(Triple, Error);
   assert(TheTarget && "unknown target for host");
   const llvm::TargetOptions Options;
-  TargetMachine.reset(static_cast<llvm::LLVMTargetMachine *>(
-      TheTarget->createTargetMachine(Triple, CpuName, /*Features*/ "", Options,
-                                     llvm::Reloc::Model::Static)));
+  TargetMachine.reset(
+      static_cast<llvm::LLVMTargetMachine *>(TheTarget->createTargetMachine(
+          Triple, CpuName, Features, Options, llvm::Reloc::Model::Static)));
   TheExegesisTarget = ExegesisTarget::lookup(TargetMachine->getTargetTriple());
   if (!TheExegesisTarget) {
     llvm::errs() << "no exegesis target for " << Triple << ", using default\n";
@@ -45,8 +46,8 @@ LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) {
 
 LLVMState::LLVMState(const std::string &CpuName)
     : LLVMState(llvm::sys::getProcessTriple(),
-                CpuName.empty() ? llvm::sys::getHostCPUName().str() : CpuName) {
-}
+                CpuName.empty() ? llvm::sys::getHostCPUName().str() : CpuName,
+                "") {}
 
 std::unique_ptr<llvm::LLVMTargetMachine>
 LLVMState::createTargetMachine() const {
diff --git a/tools/llvm-exegesis/lib/LlvmState.h b/tools/llvm-exegesis/lib/LlvmState.h
index 159a8a51c5cd1d6e48f1e84fd396850fbaeab6f2..e4f12a3f858e6ca02b00b12b6b29c26f9709ce2e 100644
--- a/tools/llvm-exegesis/lib/LlvmState.h
+++ b/tools/llvm-exegesis/lib/LlvmState.h
@@ -40,7 +40,8 @@ public:
   LLVMState(const std::string &CpuName);
 
   LLVMState(const std::string &Triple,
-            const std::string &CpuName); // For tests.
+            const std::string &CpuName,
+            const std::string &Features = ""); // For tests.
 
   const llvm::TargetMachine &getTargetMachine() const { return *TargetMachine; }
   std::unique_ptr<llvm::LLVMTargetMachine> createTargetMachine() const;
diff --git a/tools/llvm-exegesis/lib/PowerPC/CMakeLists.txt b/tools/llvm-exegesis/lib/PowerPC/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..89e33437952e4e3818d974877ac32c7d44ffdae0
--- /dev/null
+++ b/tools/llvm-exegesis/lib/PowerPC/CMakeLists.txt
@@ -0,0 +1,18 @@
+include_directories(
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC
+  ${LLVM_BINARY_DIR}/lib/Target/PowerPC
+  )
+
+add_library(LLVMExegesisPowerPC
+  STATIC
+  Target.cpp
+  )
+
+llvm_update_compile_flags(LLVMExegesisPowerPC)
+llvm_map_components_to_libnames(libs
+  PowerPC
+  Exegesis
+  )
+
+target_link_libraries(LLVMExegesisPowerPC ${libs})
+set_target_properties(LLVMExegesisPowerPC PROPERTIES FOLDER "Libraries")
diff --git a/tools/llvm-exegesis/lib/PowerPC/LLVMBuild.txt b/tools/llvm-exegesis/lib/PowerPC/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c535562065eba6f2b3d667087934d3a719fdc65c
--- /dev/null
+++ b/tools/llvm-exegesis/lib/PowerPC/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./tools/llvm-exegesis/lib/PowerPC/LLVMBuild.txt ---------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ExegesisPowerPC
+parent = Libraries
+required_libraries = PowerPC
diff --git a/tools/llvm-exegesis/lib/PowerPC/Target.cpp b/tools/llvm-exegesis/lib/PowerPC/Target.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4bdb5dc4adc33a65563c493c2ccae75a9748d95
--- /dev/null
+++ b/tools/llvm-exegesis/lib/PowerPC/Target.cpp
@@ -0,0 +1,76 @@
+//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// The PowerPC ExegesisTarget.
+//===----------------------------------------------------------------------===//
+#include "../Target.h"
+#include "../Latency.h"
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+
+namespace llvm {
+namespace exegesis {
+
+#include "PPCGenExegesis.inc"
+
+namespace {
+class ExegesisPowerPCTarget : public ExegesisTarget {
+public:
+  ExegesisPowerPCTarget() : ExegesisTarget(PPCCpuPfmCounters) {}
+
+private:
+  std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
+                                     unsigned Reg,
+                                     const llvm::APInt &Value) const override;
+  bool matchesArch(llvm::Triple::ArchType Arch) const override {
+    return Arch == llvm::Triple::ppc64le;
+  }
+};
+} // end anonymous namespace
+
+static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
+  switch (RegBitWidth) {
+  case 32:
+    return llvm::PPC::LI;
+  case 64:
+    return llvm::PPC::LI8;
+  }
+  llvm_unreachable("Invalid Value Width");
+}
+
+// Generates instruction to load an immediate value into a register.
+static llvm::MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
+                                  const llvm::APInt &Value) {
+  if (Value.getBitWidth() > RegBitWidth)
+    llvm_unreachable("Value must fit in the Register");
+  return llvm::MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
+      .addReg(Reg)
+      .addImm(Value.getZExtValue());
+}
+
+std::vector<llvm::MCInst>
+ExegesisPowerPCTarget::setRegTo(const llvm::MCSubtargetInfo &STI, unsigned Reg,
+                                const llvm::APInt &Value) const {
+  if (llvm::PPC::GPRCRegClass.contains(Reg))
+    return {loadImmediate(Reg, 32, Value)};
+  if (llvm::PPC::G8RCRegClass.contains(Reg))
+    return {loadImmediate(Reg, 64, Value)};
+  llvm::errs() << "setRegTo is not implemented, results will be unreliable\n";
+  return {};
+}
+
+static ExegesisTarget *getTheExegesisPowerPCTarget() {
+  static ExegesisPowerPCTarget Target;
+  return &Target;
+}
+
+void InitializePowerPCExegesisTarget() {
+  ExegesisTarget::registerTarget(getTheExegesisPowerPCTarget());
+}
+
+} // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/RegisterValue.h b/tools/llvm-exegesis/lib/RegisterValue.h
index 51ea30ac8eb47e54d0a05340eb37caa3c6d51f54..689e354e2416fe74d0b8861ed4e517f44aeacc19 100644
--- a/tools/llvm-exegesis/lib/RegisterValue.h
+++ b/tools/llvm-exegesis/lib/RegisterValue.h
@@ -14,6 +14,9 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H
+#define LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H
+
 #include <llvm/ADT/APFloat.h>
 #include <llvm/ADT/APInt.h>
 
@@ -22,6 +25,7 @@ namespace exegesis {
 
 // A simple object storing the value for a particular register.
 struct RegisterValue {
+  static RegisterValue zero(unsigned Reg) { return {Reg, llvm::APInt()}; }
   unsigned Register;
   llvm::APInt Value;
 };
@@ -45,3 +49,5 @@ llvm::APInt bitcastFloatValue(const llvm::fltSemantics &FltSemantics,
 
 } // namespace exegesis
 } // namespace llvm
+
+#endif // LLVM_TOOLS_LLVM_EXEGESIS_REGISTERVALUE_H
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index eb6a8577b5773cde66edf261e0bd30608868b673..88ba315548d3570866869b04e5f61090180623b8 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -91,7 +91,7 @@ std::vector<RegisterValue> SnippetGenerator::computeRegisterInitialValues(
       if (Op.isUse()) {
         const unsigned Reg = GetOpReg(Op);
         if (Reg > 0 && !DefinedRegs.test(Reg)) {
-          RIV.push_back(RegisterValue{Reg, llvm::APInt()});
+          RIV.push_back(RegisterValue::zero(Reg));
           DefinedRegs.set(Reg);
         }
       }
diff --git a/tools/llvm-exegesis/lib/Target.cpp b/tools/llvm-exegesis/lib/Target.cpp
index 06557770418c3278988421a7cead7cafba7834b0..2c89d27b4ab601e98581ef8159c20f1ed931202a 100644
--- a/tools/llvm-exegesis/lib/Target.cpp
+++ b/tools/llvm-exegesis/lib/Target.cpp
@@ -87,7 +87,8 @@ ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const {
 
 static_assert(std::is_pod<PfmCountersInfo>::value,
               "We shouldn't have dynamic initialization here");
-const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr, 0u};
+const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr,
+                                                  0u};
 
 const PfmCountersInfo &
 ExegesisTarget::getPfmCounters(llvm::StringRef CpuName) const {
@@ -103,7 +104,13 @@ ExegesisTarget::getPfmCounters(llvm::StringRef CpuName) const {
       std::lower_bound(CpuPfmCounters.begin(), CpuPfmCounters.end(), CpuName);
   if (Found == CpuPfmCounters.end() ||
       llvm::StringRef(Found->CpuName) != CpuName) {
-    return PfmCountersInfo::Default;
+    // Use the default.
+    if (CpuPfmCounters.begin() != CpuPfmCounters.end() &&
+        CpuPfmCounters.begin()->CpuName[0] == '\0') {
+      Found = CpuPfmCounters.begin(); // The target specifies a default.
+    } else {
+      return PfmCountersInfo::Default; // No default for the target.
+    }
   }
   assert(Found->PCI && "Missing counters");
   return *Found->PCI;
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 618e4d77db4cd5fe8e6ad3a0f08b2c67b5409186..43bf9b750580cfee92f05beb55daeac1c4301813 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -20,11 +20,9 @@
 namespace llvm {
 namespace exegesis {
 
-namespace {
-
 // Returns an error if we cannot handle the memory references in this
 // instruction.
-Error isInvalidMemoryInstr(const Instruction &Instr) {
+static Error isInvalidMemoryInstr(const Instruction &Instr) {
   switch (Instr.Description->TSFlags & X86II::FormMask) {
   default:
     llvm_unreachable("Unknown FormMask value");
@@ -169,78 +167,90 @@ static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
   return llvm::Error::success();
 }
 
-static unsigned GetX86FPFlags(const Instruction &Instr) {
+static unsigned getX86FPFlags(const Instruction &Instr) {
   return Instr.Description->TSFlags & llvm::X86II::FPTypeMask;
 }
 
+namespace {
 class X86LatencySnippetGenerator : public LatencySnippetGenerator {
 public:
   using LatencySnippetGenerator::LatencySnippetGenerator;
 
   llvm::Expected<std::vector<CodeTemplate>>
-  generateCodeTemplates(const Instruction &Instr) const override {
-    if (auto E = IsInvalidOpcode(Instr))
-      return std::move(E);
-
-    switch (GetX86FPFlags(Instr)) {
-    case llvm::X86II::NotFP:
-      return LatencySnippetGenerator::generateCodeTemplates(Instr);
-    case llvm::X86II::ZeroArgFP:
-    case llvm::X86II::OneArgFP:
-    case llvm::X86II::SpecialFP:
-    case llvm::X86II::CompareFP:
-    case llvm::X86II::CondMovFP:
-      return llvm::make_error<BenchmarkFailure>("Unsupported x87 Instruction");
-    case llvm::X86II::OneArgFPRW:
-    case llvm::X86II::TwoArgFP:
-      // These are instructions like
-      //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
-      //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
-      // They are intrinsically serial and do not modify the state of the stack.
-      return generateSelfAliasingCodeTemplates(Instr);
-    default:
-      llvm_unreachable("Unknown FP Type!");
-    }
-  }
+  generateCodeTemplates(const Instruction &Instr) const override;
 };
+} // namespace
 
+llvm::Expected<std::vector<CodeTemplate>>
+X86LatencySnippetGenerator::generateCodeTemplates(
+    const Instruction &Instr) const {
+  if (auto E = IsInvalidOpcode(Instr))
+    return std::move(E);
+
+  switch (getX86FPFlags(Instr)) {
+  case llvm::X86II::NotFP:
+    return LatencySnippetGenerator::generateCodeTemplates(Instr);
+  case llvm::X86II::ZeroArgFP:
+  case llvm::X86II::OneArgFP:
+  case llvm::X86II::SpecialFP:
+  case llvm::X86II::CompareFP:
+  case llvm::X86II::CondMovFP:
+    return llvm::make_error<BenchmarkFailure>("Unsupported x87 Instruction");
+  case llvm::X86II::OneArgFPRW:
+  case llvm::X86II::TwoArgFP:
+    // These are instructions like
+    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
+    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
+    // They are intrinsically serial and do not modify the state of the stack.
+    return generateSelfAliasingCodeTemplates(Instr);
+  default:
+    llvm_unreachable("Unknown FP Type!");
+  }
+}
+
+namespace {
 class X86UopsSnippetGenerator : public UopsSnippetGenerator {
 public:
   using UopsSnippetGenerator::UopsSnippetGenerator;
 
   llvm::Expected<std::vector<CodeTemplate>>
-  generateCodeTemplates(const Instruction &Instr) const override {
-    if (auto E = IsInvalidOpcode(Instr))
-      return std::move(E);
-
-    switch (GetX86FPFlags(Instr)) {
-    case llvm::X86II::NotFP:
-      return UopsSnippetGenerator::generateCodeTemplates(Instr);
-    case llvm::X86II::ZeroArgFP:
-    case llvm::X86II::OneArgFP:
-    case llvm::X86II::SpecialFP:
-      return llvm::make_error<BenchmarkFailure>("Unsupported x87 Instruction");
-    case llvm::X86II::OneArgFPRW:
-    case llvm::X86II::TwoArgFP:
-      // These are instructions like
-      //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
-      //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
-      // They are intrinsically serial and do not modify the state of the stack.
-      // We generate the same code for latency and uops.
-      return generateSelfAliasingCodeTemplates(Instr);
-    case llvm::X86II::CompareFP:
-    case llvm::X86II::CondMovFP:
-      // We can compute uops for any FP instruction that does not grow or shrink
-      // the stack (either do not touch the stack or push as much as they pop).
-      return generateUnconstrainedCodeTemplates(
-          Instr, "instruction does not grow/shrink the FP stack");
-    default:
-      llvm_unreachable("Unknown FP Type!");
-    }
-  }
+  generateCodeTemplates(const Instruction &Instr) const override;
 };
+} // namespace
+
+llvm::Expected<std::vector<CodeTemplate>>
+X86UopsSnippetGenerator::generateCodeTemplates(
+    const Instruction &Instr) const {
+  if (auto E = IsInvalidOpcode(Instr))
+    return std::move(E);
+
+  switch (getX86FPFlags(Instr)) {
+  case llvm::X86II::NotFP:
+    return UopsSnippetGenerator::generateCodeTemplates(Instr);
+  case llvm::X86II::ZeroArgFP:
+  case llvm::X86II::OneArgFP:
+  case llvm::X86II::SpecialFP:
+    return llvm::make_error<BenchmarkFailure>("Unsupported x87 Instruction");
+  case llvm::X86II::OneArgFPRW:
+  case llvm::X86II::TwoArgFP:
+    // These are instructions like
+    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
+    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
+    // They are intrinsically serial and do not modify the state of the stack.
+    // We generate the same code for latency and uops.
+    return generateSelfAliasingCodeTemplates(Instr);
+  case llvm::X86II::CompareFP:
+  case llvm::X86II::CondMovFP:
+    // We can compute uops for any FP instruction that does not grow or shrink
+    // the stack (either do not touch the stack or push as much as they pop).
+    return generateUnconstrainedCodeTemplates(
+        Instr, "instruction does not grow/shrink the FP stack");
+  default:
+    llvm_unreachable("Unknown FP Type!");
+  }
+}
 
-static unsigned GetLoadImmediateOpcode(unsigned RegBitWidth) {
+static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
   switch (RegBitWidth) {
   case 8:
     return llvm::X86::MOV8ri;
@@ -259,7 +269,7 @@ static llvm::MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                                   const llvm::APInt &Value) {
   if (Value.getBitWidth() > RegBitWidth)
     llvm_unreachable("Value must fit in the Register");
-  return llvm::MCInstBuilder(GetLoadImmediateOpcode(RegBitWidth))
+  return llvm::MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
       .addReg(Reg)
       .addImm(Value.getZExtValue());
 }
@@ -308,181 +318,123 @@ static llvm::MCInst releaseStackSpace(unsigned Bytes) {
 
 // Reserves some space on the stack, fills it with the content of the provided
 // constant and provide methods to load the stack value into a register.
+namespace {
 struct ConstantInliner {
   explicit ConstantInliner(const llvm::APInt &Constant) : Constant_(Constant) {}
 
   std::vector<llvm::MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
-                                            unsigned Opcode) {
-    assert((RegBitWidth & 7) == 0 &&
-           "RegBitWidth must be a multiple of 8 bits");
-    initStack(RegBitWidth / 8);
-    add(loadToReg(Reg, Opcode));
-    add(releaseStackSpace(RegBitWidth / 8));
-    return std::move(Instructions);
-  }
+                                            unsigned Opcode);
 
-  std::vector<llvm::MCInst> loadX87STAndFinalize(unsigned Reg) {
-    initStack(kF80Bytes);
-    add(llvm::MCInstBuilder(llvm::X86::LD_F80m)
-            // Address = ESP
-            .addReg(llvm::X86::RSP) // BaseReg
-            .addImm(1)              // ScaleAmt
-            .addReg(0)              // IndexReg
-            .addImm(0)              // Disp
-            .addReg(0));            // Segment
-    if (Reg != llvm::X86::ST0)
-      add(llvm::MCInstBuilder(llvm::X86::ST_Frr).addReg(Reg));
-    add(releaseStackSpace(kF80Bytes));
-    return std::move(Instructions);
-  }
+  std::vector<llvm::MCInst> loadX87STAndFinalize(unsigned Reg);
 
-  std::vector<llvm::MCInst> loadX87FPAndFinalize(unsigned Reg) {
-    initStack(kF80Bytes);
-    add(llvm::MCInstBuilder(llvm::X86::LD_Fp80m)
-            .addReg(Reg)
-            // Address = ESP
-            .addReg(llvm::X86::RSP) // BaseReg
-            .addImm(1)              // ScaleAmt
-            .addReg(0)              // IndexReg
-            .addImm(0)              // Disp
-            .addReg(0));            // Segment
-    add(releaseStackSpace(kF80Bytes));
-    return std::move(Instructions);
-  }
+  std::vector<llvm::MCInst> loadX87FPAndFinalize(unsigned Reg);
 
-  std::vector<llvm::MCInst> popFlagAndFinalize() {
-    initStack(8);
-    add(llvm::MCInstBuilder(llvm::X86::POPF64));
-    return std::move(Instructions);
-  }
+  std::vector<llvm::MCInst> popFlagAndFinalize();
 
 private:
-  static constexpr const unsigned kF80Bytes = 10; // 80 bits.
-
   ConstantInliner &add(const llvm::MCInst &Inst) {
     Instructions.push_back(Inst);
     return *this;
   }
 
-  void initStack(unsigned Bytes) {
-    assert(Constant_.getBitWidth() <= Bytes * 8 &&
-           "Value does not have the correct size");
-    const llvm::APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
-                                         ? Constant_.sext(Bytes * 8)
-                                         : Constant_;
-    add(allocateStackSpace(Bytes));
-    size_t ByteOffset = 0;
-    for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
-      add(fillStackSpace(
-          llvm::X86::MOV32mi, ByteOffset,
-          WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
-    if (Bytes - ByteOffset >= 2) {
-      add(fillStackSpace(
-          llvm::X86::MOV16mi, ByteOffset,
-          WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
-      ByteOffset += 2;
-    }
-    if (Bytes - ByteOffset >= 1)
-      add(fillStackSpace(
-          llvm::X86::MOV8mi, ByteOffset,
-          WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
-  }
+  void initStack(unsigned Bytes);
+
+  static constexpr const unsigned kF80Bytes = 10; // 80 bits.
 
   llvm::APInt Constant_;
   std::vector<llvm::MCInst> Instructions;
 };
+} // namespace
+
+std::vector<llvm::MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
+                                                           unsigned RegBitWidth,
+                                                           unsigned Opcode) {
+  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
+  initStack(RegBitWidth / 8);
+  add(loadToReg(Reg, Opcode));
+  add(releaseStackSpace(RegBitWidth / 8));
+  return std::move(Instructions);
+}
+
+std::vector<llvm::MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
+  initStack(kF80Bytes);
+  add(llvm::MCInstBuilder(llvm::X86::LD_F80m)
+          // Address = ESP
+          .addReg(llvm::X86::RSP) // BaseReg
+          .addImm(1)              // ScaleAmt
+          .addReg(0)              // IndexReg
+          .addImm(0)              // Disp
+          .addReg(0));            // Segment
+  if (Reg != llvm::X86::ST0)
+    add(llvm::MCInstBuilder(llvm::X86::ST_Frr).addReg(Reg));
+  add(releaseStackSpace(kF80Bytes));
+  return std::move(Instructions);
+}
+
+std::vector<llvm::MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
+  initStack(kF80Bytes);
+  add(llvm::MCInstBuilder(llvm::X86::LD_Fp80m)
+          .addReg(Reg)
+          // Address = ESP
+          .addReg(llvm::X86::RSP) // BaseReg
+          .addImm(1)              // ScaleAmt
+          .addReg(0)              // IndexReg
+          .addImm(0)              // Disp
+          .addReg(0));            // Segment
+  add(releaseStackSpace(kF80Bytes));
+  return std::move(Instructions);
+}
+
+std::vector<llvm::MCInst> ConstantInliner::popFlagAndFinalize() {
+  initStack(8);
+  add(llvm::MCInstBuilder(llvm::X86::POPF64));
+  return std::move(Instructions);
+}
+
+void ConstantInliner::initStack(unsigned Bytes) {
+  assert(Constant_.getBitWidth() <= Bytes * 8 &&
+         "Value does not have the correct size");
+  const llvm::APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
+                                       ? Constant_.sext(Bytes * 8)
+                                       : Constant_;
+  add(allocateStackSpace(Bytes));
+  size_t ByteOffset = 0;
+  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
+    add(fillStackSpace(
+        llvm::X86::MOV32mi, ByteOffset,
+        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
+  if (Bytes - ByteOffset >= 2) {
+    add(fillStackSpace(
+        llvm::X86::MOV16mi, ByteOffset,
+        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
+    ByteOffset += 2;
+  }
+  if (Bytes - ByteOffset >= 1)
+    add(fillStackSpace(
+        llvm::X86::MOV8mi, ByteOffset,
+        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
+}
 
 #include "X86GenExegesis.inc"
 
+namespace {
 class ExegesisX86Target : public ExegesisTarget {
 public:
   ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
 
 private:
-  void addTargetSpecificPasses(llvm::PassManagerBase &PM) const override {
-    // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
-    PM.add(llvm::createX86FloatingPointStackifierPass());
-  }
+  void addTargetSpecificPasses(llvm::PassManagerBase &PM) const override;
 
-  unsigned getScratchMemoryRegister(const llvm::Triple &TT) const override {
-    if (!TT.isArch64Bit()) {
-      // FIXME: This would require popping from the stack, so we would have to
-      // add some additional setup code.
-      return 0;
-    }
-    return TT.isOSWindows() ? llvm::X86::RCX : llvm::X86::RDI;
-  }
+  unsigned getScratchMemoryRegister(const llvm::Triple &TT) const override;
 
   unsigned getMaxMemoryAccessSize() const override { return 64; }
 
   void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
-                          unsigned Offset) const override {
-    assert(!isInvalidMemoryInstr(IT.Instr) &&
-           "fillMemoryOperands requires a valid memory instruction");
-    int MemOpIdx = X86II::getMemoryOperandNo(IT.Instr.Description->TSFlags);
-    assert(MemOpIdx >= 0 && "invalid memory operand index");
-    // getMemoryOperandNo() ignores tied operands, so we have to add them back.
-    for (unsigned I = 0; I <= static_cast<unsigned>(MemOpIdx); ++I) {
-      const auto &Op = IT.Instr.Operands[I];
-      if (Op.isTied() && Op.getTiedToIndex() < I) {
-        ++MemOpIdx;
-      }
-    }
-    // Now fill in the memory operands.
-    const auto SetOp = [&IT](int OpIdx, const MCOperand &OpVal) {
-      const auto Op = IT.Instr.Operands[OpIdx];
-      assert(Op.isMemory() && Op.isExplicit() && "invalid memory pattern");
-      IT.getValueFor(Op) = OpVal;
-    };
-    SetOp(MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
-    SetOp(MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
-    SetOp(MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
-    SetOp(MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
-    SetOp(MemOpIdx + 4, MCOperand::createReg(0));      // Segment
-  }
+                          unsigned Offset) const override;
 
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
                                      unsigned Reg,
-                                     const llvm::APInt &Value) const override {
-    if (llvm::X86::GR8RegClass.contains(Reg))
-      return {loadImmediate(Reg, 8, Value)};
-    if (llvm::X86::GR16RegClass.contains(Reg))
-      return {loadImmediate(Reg, 16, Value)};
-    if (llvm::X86::GR32RegClass.contains(Reg))
-      return {loadImmediate(Reg, 32, Value)};
-    if (llvm::X86::GR64RegClass.contains(Reg))
-      return {loadImmediate(Reg, 64, Value)};
-    ConstantInliner CI(Value);
-    if (llvm::X86::VR64RegClass.contains(Reg))
-      return CI.loadAndFinalize(Reg, 64, llvm::X86::MMX_MOVQ64rm);
-    if (llvm::X86::VR128XRegClass.contains(Reg)) {
-      if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
-        return CI.loadAndFinalize(Reg, 128, llvm::X86::VMOVDQU32Z128rm);
-      if (STI.getFeatureBits()[llvm::X86::FeatureAVX])
-        return CI.loadAndFinalize(Reg, 128, llvm::X86::VMOVDQUrm);
-      return CI.loadAndFinalize(Reg, 128, llvm::X86::MOVDQUrm);
-    }
-    if (llvm::X86::VR256XRegClass.contains(Reg)) {
-      if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
-        return CI.loadAndFinalize(Reg, 256, llvm::X86::VMOVDQU32Z256rm);
-      if (STI.getFeatureBits()[llvm::X86::FeatureAVX])
-        return CI.loadAndFinalize(Reg, 256, llvm::X86::VMOVDQUYrm);
-    }
-    if (llvm::X86::VR512RegClass.contains(Reg))
-      if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
-        return CI.loadAndFinalize(Reg, 512, llvm::X86::VMOVDQU32Zrm);
-    if (llvm::X86::RSTRegClass.contains(Reg)) {
-      return CI.loadX87STAndFinalize(Reg);
-    }
-    if (llvm::X86::RFP32RegClass.contains(Reg) ||
-        llvm::X86::RFP64RegClass.contains(Reg) ||
-        llvm::X86::RFP80RegClass.contains(Reg)) {
-      return CI.loadX87FPAndFinalize(Reg);
-    }
-    if (Reg == llvm::X86::EFLAGS)
-      return CI.popFlagAndFinalize();
-    return {}; // Not yet implemented.
-  }
+                                     const llvm::APInt &Value) const override;
 
   std::unique_ptr<SnippetGenerator>
   createLatencySnippetGenerator(const LLVMState &State) const override {
@@ -498,9 +450,94 @@ private:
     return Arch == llvm::Triple::x86_64 || Arch == llvm::Triple::x86;
   }
 };
-
 } // namespace
 
+void ExegesisX86Target::addTargetSpecificPasses(
+    llvm::PassManagerBase &PM) const {
+  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
+  PM.add(llvm::createX86FloatingPointStackifierPass());
+}
+
+unsigned
+ExegesisX86Target::getScratchMemoryRegister(const llvm::Triple &TT) const {
+  if (!TT.isArch64Bit()) {
+    // FIXME: This would require popping from the stack, so we would have to
+    // add some additional setup code.
+    return 0;
+  }
+  return TT.isOSWindows() ? llvm::X86::RCX : llvm::X86::RDI;
+}
+
+void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
+                                           unsigned Reg,
+                                           unsigned Offset) const {
+  assert(!isInvalidMemoryInstr(IT.Instr) &&
+         "fillMemoryOperands requires a valid memory instruction");
+  int MemOpIdx = X86II::getMemoryOperandNo(IT.Instr.Description->TSFlags);
+  assert(MemOpIdx >= 0 && "invalid memory operand index");
+  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
+  for (unsigned I = 0; I <= static_cast<unsigned>(MemOpIdx); ++I) {
+    const auto &Op = IT.Instr.Operands[I];
+    if (Op.isTied() && Op.getTiedToIndex() < I) {
+      ++MemOpIdx;
+    }
+  }
+  // Now fill in the memory operands.
+  const auto SetOp = [&IT](int OpIdx, const MCOperand &OpVal) {
+    const auto Op = IT.Instr.Operands[OpIdx];
+    assert(Op.isMemory() && Op.isExplicit() && "invalid memory pattern");
+    IT.getValueFor(Op) = OpVal;
+  };
+  SetOp(MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
+  SetOp(MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
+  SetOp(MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
+  SetOp(MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
+  SetOp(MemOpIdx + 4, MCOperand::createReg(0));      // Segment
+}
+
+std::vector<llvm::MCInst>
+ExegesisX86Target::setRegTo(const llvm::MCSubtargetInfo &STI, unsigned Reg,
+                            const llvm::APInt &Value) const {
+  if (llvm::X86::GR8RegClass.contains(Reg))
+    return {loadImmediate(Reg, 8, Value)};
+  if (llvm::X86::GR16RegClass.contains(Reg))
+    return {loadImmediate(Reg, 16, Value)};
+  if (llvm::X86::GR32RegClass.contains(Reg))
+    return {loadImmediate(Reg, 32, Value)};
+  if (llvm::X86::GR64RegClass.contains(Reg))
+    return {loadImmediate(Reg, 64, Value)};
+  ConstantInliner CI(Value);
+  if (llvm::X86::VR64RegClass.contains(Reg))
+    return CI.loadAndFinalize(Reg, 64, llvm::X86::MMX_MOVQ64rm);
+  if (llvm::X86::VR128XRegClass.contains(Reg)) {
+    if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
+      return CI.loadAndFinalize(Reg, 128, llvm::X86::VMOVDQU32Z128rm);
+    if (STI.getFeatureBits()[llvm::X86::FeatureAVX])
+      return CI.loadAndFinalize(Reg, 128, llvm::X86::VMOVDQUrm);
+    return CI.loadAndFinalize(Reg, 128, llvm::X86::MOVDQUrm);
+  }
+  if (llvm::X86::VR256XRegClass.contains(Reg)) {
+    if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
+      return CI.loadAndFinalize(Reg, 256, llvm::X86::VMOVDQU32Z256rm);
+    if (STI.getFeatureBits()[llvm::X86::FeatureAVX])
+      return CI.loadAndFinalize(Reg, 256, llvm::X86::VMOVDQUYrm);
+  }
+  if (llvm::X86::VR512RegClass.contains(Reg))
+    if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
+      return CI.loadAndFinalize(Reg, 512, llvm::X86::VMOVDQU32Zrm);
+  if (llvm::X86::RSTRegClass.contains(Reg)) {
+    return CI.loadX87STAndFinalize(Reg);
+  }
+  if (llvm::X86::RFP32RegClass.contains(Reg) ||
+      llvm::X86::RFP64RegClass.contains(Reg) ||
+      llvm::X86::RFP80RegClass.contains(Reg)) {
+    return CI.loadX87FPAndFinalize(Reg);
+  }
+  if (Reg == llvm::X86::EFLAGS)
+    return CI.popFlagAndFinalize();
+  return {}; // Not yet implemented.
+}
+
 static ExegesisTarget *getTheExegesisX86Target() {
   static ExegesisX86Target Target;
   return &Target;
diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp
index dd19424157c9edc2bb94dc3c7c06f0403d5667c9..b6facc919b5148bd0a5d9a0a6d345590f617baa8 100644
--- a/tools/llvm-lto/llvm-lto.cpp
+++ b/tools/llvm-lto/llvm-lto.cpp
@@ -562,11 +562,14 @@ private:
 
     auto Index = loadCombinedIndex();
     for (auto &Filename : InputFilenames) {
+      LLVMContext Ctx;
+      auto TheModule = loadModule(Filename, Ctx);
+
       // Build a map of module to the GUIDs and summary objects that should
       // be written to its index.
       std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
-      ThinLTOCodeGenerator::gatherImportedSummariesForModule(
-          Filename, *Index, ModuleToSummariesForIndex);
+      ThinGenerator.gatherImportedSummariesForModule(*TheModule, *Index,
+                                                     ModuleToSummariesForIndex);
 
       std::string OutputName = OutputFilename;
       if (OutputName.empty()) {
@@ -594,12 +597,14 @@ private:
 
     auto Index = loadCombinedIndex();
     for (auto &Filename : InputFilenames) {
+      LLVMContext Ctx;
+      auto TheModule = loadModule(Filename, Ctx);
       std::string OutputName = OutputFilename;
       if (OutputName.empty()) {
         OutputName = Filename + ".imports";
       }
       OutputName = getThinLTOOutputFile(OutputName, OldPrefix, NewPrefix);
-      ThinLTOCodeGenerator::emitImports(Filename, OutputName, *Index);
+      ThinGenerator.emitImports(*TheModule, OutputName, *Index);
     }
   }
 
diff --git a/tools/llvm-mca/CMakeLists.txt b/tools/llvm-mca/CMakeLists.txt
index 4339d48d461831e7fe3c96c561106d069f508fb9..1fceb08c1ca3d3b18f11a4fc1f3ed4e32996428a 100644
--- a/tools/llvm-mca/CMakeLists.txt
+++ b/tools/llvm-mca/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS
   AllTargetsDescs
   AllTargetsDisassemblers
   AllTargetsInfos
+  MCA
   MC
   MCParser
   Support
@@ -28,5 +29,3 @@ add_llvm_tool(llvm-mca
   )
 
 set(LLVM_MCA_SOURCE_DIR ${CURRENT_SOURCE_DIR})
-add_subdirectory(lib)
-target_link_libraries(llvm-mca PRIVATE LLVMMCA)
diff --git a/tools/llvm-mca/LLVMBuild.txt b/tools/llvm-mca/LLVMBuild.txt
index 0afcd3129ecd2adb769562c5416eacb41eb79e3f..a704612934ff11dd490bc780db47932c797a7372 100644
--- a/tools/llvm-mca/LLVMBuild.txt
+++ b/tools/llvm-mca/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Tool
 name = llvm-mca
 parent = Tools
-required_libraries = MC MCParser Support all-targets
+required_libraries = MC MCA MCParser Support all-targets
diff --git a/tools/llvm-mca/PipelinePrinter.h b/tools/llvm-mca/PipelinePrinter.h
index 7e426383f21bc8fd8f5d295190eaa87ca63d813b..456026e12df39fe029619e07d15794f52bd5ad68 100644
--- a/tools/llvm-mca/PipelinePrinter.h
+++ b/tools/llvm-mca/PipelinePrinter.h
@@ -17,9 +17,9 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
 #define LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
 
-#include "Pipeline.h"
 #include "Views/View.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/Pipeline.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "llvm-mca"
diff --git a/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp b/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
index 7e2fd316c97760b95942fde254ece4506c8d6b67..54eb28f1add9da0dafe9f4e4aa0a6fb55a0b9184 100644
--- a/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
+++ b/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
@@ -18,9 +18,39 @@
 namespace llvm {
 namespace mca {
 
+RetireControlUnitStatistics::RetireControlUnitStatistics(const MCSchedModel &SM)
+    : NumRetired(0), NumCycles(0), EntriesInUse(0), MaxUsedEntries(0),
+      SumOfUsedEntries(0) {
+  TotalROBEntries = SM.MicroOpBufferSize;
+  if (SM.hasExtraProcessorInfo()) {
+    const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+    if (EPI.ReorderBufferSize)
+      TotalROBEntries = EPI.ReorderBufferSize;
+  }
+}
+
 void RetireControlUnitStatistics::onEvent(const HWInstructionEvent &Event) {
-  if (Event.Type == HWInstructionEvent::Retired)
+  if (Event.Type == HWInstructionEvent::Dispatched) {
+    unsigned NumEntries =
+        static_cast<const HWInstructionDispatchedEvent &>(Event).MicroOpcodes;
+    EntriesInUse += NumEntries;
+  }
+
+  if (Event.Type == HWInstructionEvent::Retired) {
+    unsigned ReleasedEntries = Event.IR.getInstruction()->getDesc().NumMicroOps;
+    assert(EntriesInUse >= ReleasedEntries && "Invalid internal state!");
+    EntriesInUse -= ReleasedEntries;
     ++NumRetired;
+  }
+}
+
+void RetireControlUnitStatistics::onCycleEnd() {
+  // Update histogram
+  RetiredPerCycle[NumRetired]++;
+  NumRetired = 0;
+  ++NumCycles;
+  MaxUsedEntries = std::max(MaxUsedEntries, EntriesInUse);
+  SumOfUsedEntries += EntriesInUse;
 }
 
 void RetireControlUnitStatistics::printView(raw_ostream &OS) const {
@@ -41,6 +71,18 @@ void RetireControlUnitStatistics::printView(raw_ostream &OS) const {
                << "%)\n";
   }
 
+  unsigned AvgUsage = (double)SumOfUsedEntries / NumCycles;
+  double MaxUsagePercentage = ((double)MaxUsedEntries / TotalROBEntries) * 100.0;
+  double NormalizedMaxPercentage = floor((MaxUsagePercentage * 10) + 0.5) / 10;
+  double AvgUsagePercentage = ((double)AvgUsage / TotalROBEntries) * 100.0;
+  double NormalizedAvgPercentage = floor((AvgUsagePercentage * 10) + 0.5) / 10;
+
+  TempStream << "\nTotal ROB Entries:                " << TotalROBEntries
+             << "\nMax Used ROB Entries:             " << MaxUsedEntries
+             << format("  ( %.1f%% )", NormalizedMaxPercentage)
+             << "\nAverage Used ROB Entries per cy:  " << AvgUsage
+             << format("  ( %.1f%% )\n", NormalizedAvgPercentage);
+
   TempStream.flush();
   OS << Buffer;
 }
diff --git a/tools/llvm-mca/Views/RetireControlUnitStatistics.h b/tools/llvm-mca/Views/RetireControlUnitStatistics.h
index 9a4821ec31a157675bcb7188b2d823931798d26d..02aa13bc444a24b246bfbe7b2b3a4a029ab24a84 100644
--- a/tools/llvm-mca/Views/RetireControlUnitStatistics.h
+++ b/tools/llvm-mca/Views/RetireControlUnitStatistics.h
@@ -16,10 +16,13 @@
 ///
 /// Retire Control Unit - number of cycles where we saw N instructions retired:
 /// [# retired], [# cycles]
-///  0,           9  (6.9%)
-///  1,           6  (4.6%)
-///  2,           1  (0.8%)
-///  4,           3  (2.3%)
+///  0,           109  (17.9%)
+///  1,           102  (16.7%)
+///  2,           399  (65.4%)
+///
+/// Total ROB Entries:                64
+/// Max Used ROB Entries:             35  ( 54.7% )
+/// Average Used ROB Entries per cy:  32  ( 50.0% )
 ///
 //===----------------------------------------------------------------------===//
 
@@ -27,7 +30,7 @@
 #define LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H
 
 #include "Views/View.h"
-#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSchedule.h"
 #include <map>
 
 namespace llvm {
@@ -39,21 +42,19 @@ class RetireControlUnitStatistics : public View {
 
   unsigned NumRetired;
   unsigned NumCycles;
-
-  void updateHistograms() {
-    RetiredPerCycle[NumRetired]++;
-    NumRetired = 0;
-  }
+  unsigned TotalROBEntries;
+  unsigned EntriesInUse;
+  unsigned MaxUsedEntries;
+  unsigned SumOfUsedEntries;
 
 public:
-  RetireControlUnitStatistics() : NumRetired(0), NumCycles(0) {}
+  RetireControlUnitStatistics(const MCSchedModel &SM);
 
   void onEvent(const HWInstructionEvent &Event) override;
-  void onCycleBegin() override { NumCycles++; }
-  void onCycleEnd() override { updateHistograms(); }
-
+  void onCycleEnd() override;
   void printView(llvm::raw_ostream &OS) const override;
 };
+
 } // namespace mca
 } // namespace llvm
 
diff --git a/tools/llvm-mca/Views/SchedulerStatistics.cpp b/tools/llvm-mca/Views/SchedulerStatistics.cpp
index edd6056c1e8aea585ea9b391697cd14421f53787..670f90127f18b51e83530e00ccc686da39f4fa48 100644
--- a/tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ b/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -19,29 +19,83 @@
 namespace llvm {
 namespace mca {
 
+SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
+    : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0),
+      NumCycles(0), MostRecentLoadDispatched(~0U),
+      MostRecentStoreDispatched(~0U),
+      IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
+      Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {
+  if (SM.hasExtraProcessorInfo()) {
+    const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+    LQResourceID = EPI.LoadQueueID;
+    SQResourceID = EPI.StoreQueueID;
+  }
+}
+
+// FIXME: This implementation works under the assumption that load/store queue
+// entries are reserved at 'instruction dispatched' stage, and released at
+// 'instruction executed' stage. This currently matches the behavior of LSUnit.
+//
+// The current design minimizes the number of events generated by the
+// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method
+// `onEvent`. However, it introduces a subtle dependency between this view and
+// how the LSUnit works.
+//
+// In future we should add a new "memory queue" event type, so that we stop
+// making assumptions on how LSUnit internally works (See PR39828).
 void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
   if (Event.Type == HWInstructionEvent::Issued)
     ++NumIssued;
+  else if (Event.Type == HWInstructionEvent::Dispatched) {
+    const Instruction &Inst = *Event.IR.getInstruction();
+    const unsigned Index = Event.IR.getSourceIndex();
+    if (LQResourceID && Inst.getDesc().MayLoad &&
+        MostRecentLoadDispatched != Index) {
+      Usage[LQResourceID].SlotsInUse++;
+      MostRecentLoadDispatched = Index;
+    }
+    if (SQResourceID && Inst.getDesc().MayStore &&
+        MostRecentStoreDispatched != Index) {
+      Usage[SQResourceID].SlotsInUse++;
+      MostRecentStoreDispatched = Index;
+    }
+  } else if (Event.Type == HWInstructionEvent::Executed) {
+    const Instruction &Inst = *Event.IR.getInstruction();
+    if (LQResourceID && Inst.getDesc().MayLoad) {
+      assert(Usage[LQResourceID].SlotsInUse);
+      Usage[LQResourceID].SlotsInUse--;
+    }
+    if (SQResourceID && Inst.getDesc().MayStore) {
+      assert(Usage[SQResourceID].SlotsInUse);
+      Usage[SQResourceID].SlotsInUse--;
+    }
+  }
 }
 
 void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */,
                                             ArrayRef<unsigned> Buffers) {
   for (const unsigned Buffer : Buffers) {
-    BufferUsage &BU = Usage[Buffer];
-    BU.SlotsInUse++;
-    BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+    if (Buffer == LQResourceID || Buffer == SQResourceID)
+      continue;
+    Usage[Buffer].SlotsInUse++;
   }
 }
 
 void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */,
                                             ArrayRef<unsigned> Buffers) {
-  for (const unsigned Buffer : Buffers)
+  for (const unsigned Buffer : Buffers) {
+    if (Buffer == LQResourceID || Buffer == SQResourceID)
+      continue;
     Usage[Buffer].SlotsInUse--;
+  }
 }
 
 void SchedulerStatistics::updateHistograms() {
-  for (BufferUsage &BU : Usage)
+  for (BufferUsage &BU : Usage) {
     BU.CumulativeNumUsedSlots += BU.SlotsInUse;
+    BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+  }
+
   IssuedPerCycle[NumIssued]++;
   NumIssued = 0;
 }
diff --git a/tools/llvm-mca/Views/SchedulerStatistics.h b/tools/llvm-mca/Views/SchedulerStatistics.h
index 56dd3af19124aa516e5e0e0423e013d6d9bc5644..d99a395a726d0d2de72832ceceaba097d7f5df88 100644
--- a/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -47,9 +47,15 @@ namespace mca {
 
 class SchedulerStatistics final : public View {
   const llvm::MCSchedModel &SM;
+  unsigned LQResourceID;
+  unsigned SQResourceID;
+
   unsigned NumIssued;
   unsigned NumCycles;
 
+  unsigned MostRecentLoadDispatched;
+  unsigned MostRecentStoreDispatched;
+
   // Tracks the usage of a scheduler's queue.
   struct BufferUsage {
     unsigned SlotsInUse;
@@ -65,11 +71,7 @@ class SchedulerStatistics final : public View {
   void printSchedulerUsage(llvm::raw_ostream &OS) const;
 
 public:
-  SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
-      : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0),
-        IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
-        Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
-
+  SchedulerStatistics(const llvm::MCSubtargetInfo &STI);
   void onEvent(const HWInstructionEvent &Event) override;
   void onCycleBegin() override { NumCycles++; }
   void onCycleEnd() override { updateHistograms(); }
diff --git a/tools/llvm-mca/Views/SummaryView.cpp b/tools/llvm-mca/Views/SummaryView.cpp
index fdf27600c933e204cfb8aad446e7cab955c7cf58..3a7eb827c478e1b37152a3c3328cbc03113e2a2e 100644
--- a/tools/llvm-mca/Views/SummaryView.cpp
+++ b/tools/llvm-mca/Views/SummaryView.cpp
@@ -14,8 +14,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "Views/SummaryView.h"
-#include "Support.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/Support.h"
 #include "llvm/Support/Format.h"
 
 namespace llvm {
diff --git a/tools/llvm-mca/Views/View.h b/tools/llvm-mca/Views/View.h
index c332bb53938ab732f535babf0d7eb79273703e58..4b82b0da0d27d6e4ace69cd3f2a9f11594a76753 100644
--- a/tools/llvm-mca/Views/View.h
+++ b/tools/llvm-mca/Views/View.h
@@ -16,7 +16,7 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_VIEW_H
 #define LLVM_TOOLS_LLVM_MCA_VIEW_H
 
-#include "HWEventListener.h"
+#include "llvm/MCA/HWEventListener.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 3a066f713bc80d13b7bfbe100f946f4dc5d28471..d0da3b85c28e9023b514ae6ccf1d713436859768 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -24,8 +24,6 @@
 #include "CodeRegion.h"
 #include "CodeRegionGenerator.h"
 #include "PipelinePrinter.h"
-#include "Stages/FetchStage.h"
-#include "Stages/InstructionTables.h"
 #include "Views/DispatchStatistics.h"
 #include "Views/InstructionInfoView.h"
 #include "Views/RegisterFileStatistics.h"
@@ -34,13 +32,15 @@
 #include "Views/SchedulerStatistics.h"
 #include "Views/SummaryView.h"
 #include "Views/TimelineView.h"
-#include "include/Context.h"
-#include "include/Pipeline.h"
-#include "include/Support.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MCA/Context.h"
+#include "llvm/MCA/Pipeline.h"
+#include "llvm/MCA/Stages/EntryStage.h"
+#include "llvm/MCA/Stages/InstructionTables.h"
+#include "llvm/MCA/Support.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
@@ -151,12 +151,12 @@ static cl::opt<bool>
 
 static cl::opt<unsigned>
     LoadQueueSize("lqueue",
-                  cl::desc("Size of the load queue (unbound by default)"),
+                  cl::desc("Size of the load queue"),
                   cl::cat(ToolOptions), cl::init(0));
 
 static cl::opt<unsigned>
     StoreQueueSize("squeue",
-                   cl::desc("Size of the store queue (unbound by default)"),
+                   cl::desc("Size of the store queue"),
                    cl::cat(ToolOptions), cl::init(0));
 
 static cl::opt<bool>
@@ -240,8 +240,9 @@ static void processViewOptions() {
 // Returns true on success.
 static bool runPipeline(mca::Pipeline &P) {
   // Handle pipeline errors here.
-  if (auto Err = P.run()) {
-    WithColor::error() << toString(std::move(Err));
+  Expected<unsigned> Cycles = P.run();
+  if (!Cycles) {
+    WithColor::error() << toString(Cycles.takeError());
     return false;
   }
   return true;
@@ -251,9 +252,9 @@ int main(int argc, char **argv) {
   InitLLVM X(argc, argv);
 
   // Initialize targets and assembly parsers.
-  llvm::InitializeAllTargetInfos();
-  llvm::InitializeAllTargetMCs();
-  llvm::InitializeAllAsmParsers();
+  InitializeAllTargetInfos();
+  InitializeAllTargetMCs();
+  InitializeAllAsmParsers();
 
   // Enable printing of available targets when flag --version is specified.
   cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
@@ -368,7 +369,7 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  std::unique_ptr<llvm::ToolOutputFile> TOF = std::move(*OF);
+  std::unique_ptr<ToolOutputFile> TOF = std::move(*OF);
 
   const MCSchedModel &SM = STI->getSchedModel();
 
@@ -377,7 +378,7 @@ int main(int argc, char **argv) {
     Width = DispatchWidth;
 
   // Create an instruction builder.
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get());
 
   // Create a context to control ownership of the pipeline hardware.
   mca::Context MCA(*MRI, *STI);
@@ -407,7 +408,7 @@ int main(int argc, char **argv) {
     ArrayRef<MCInst> Insts = Region->getInstructions();
     std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence;
     for (const MCInst &MCI : Insts) {
-      llvm::Expected<std::unique_ptr<mca::Instruction>> Inst =
+      Expected<std::unique_ptr<mca::Instruction>> Inst =
           IB.createInstruction(MCI);
       if (!Inst) {
         if (auto NewE = handleErrors(
@@ -435,7 +436,7 @@ int main(int argc, char **argv) {
     if (PrintInstructionTables) {
       //  Create a pipeline, stages, and a printer.
       auto P = llvm::make_unique<mca::Pipeline>();
-      P->appendStage(llvm::make_unique<mca::FetchStage>(S));
+      P->appendStage(llvm::make_unique<mca::EntryStage>(S));
       P->appendStage(llvm::make_unique<mca::InstructionTables>(SM));
       mca::PipelinePrinter Printer(*P);
 
@@ -472,7 +473,7 @@ int main(int argc, char **argv) {
       Printer.addView(llvm::make_unique<mca::SchedulerStatistics>(*STI));
 
     if (PrintRetireStats)
-      Printer.addView(llvm::make_unique<mca::RetireControlUnitStatistics>());
+      Printer.addView(llvm::make_unique<mca::RetireControlUnitStatistics>(SM));
 
     if (PrintRegisterFileStats)
       Printer.addView(llvm::make_unique<mca::RegisterFileStatistics>(*STI));
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 21f3a2bade5e2ca812972705a2972c39d3045f47..88217a5403fd68cffc65324f05702234044c5674 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <vector>
 
@@ -196,7 +197,7 @@ std::string ToolName;
 
 static void error(Twine Message, Twine Path = Twine()) {
   HadError = true;
-  errs() << ToolName << ": " << Path << ": " << Message << ".\n";
+  WithColor::error(errs(), ToolName) << Path << ": " << Message << ".\n";
 }
 
 static bool error(std::error_code EC, Twine Path = Twine()) {
@@ -209,11 +210,11 @@ static bool error(std::error_code EC, Twine Path = Twine()) {
 
 // This version of error() prints the archive name and member name, for example:
 // "libx.a(foo.o)" after the ToolName before the error message.  It sets
-// HadError but returns allowing the code to move on to other archive members. 
+// HadError but returns allowing the code to move on to other archive members.
 static void error(llvm::Error E, StringRef FileName, const Archive::Child &C,
                   StringRef ArchitectureName = StringRef()) {
   HadError = true;
-  errs() << ToolName << ": " << FileName;
+  WithColor::error(errs(), ToolName) << FileName;
 
   Expected<StringRef> NameOrErr = C.getName();
   // TODO: if we have a error getting the name then it would be nice to print
@@ -230,7 +231,7 @@ static void error(llvm::Error E, StringRef FileName, const Archive::Child &C,
 
   std::string Buf;
   raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(std::move(E), OS, "");
+  logAllUnhandledErrors(std::move(E), OS);
   OS.flush();
   errs() << " " << Buf << "\n";
 }
@@ -238,18 +239,18 @@ static void error(llvm::Error E, StringRef FileName, const Archive::Child &C,
 // This version of error() prints the file name and which architecture slice it
 // is from, for example: "foo.o (for architecture i386)" after the ToolName
 // before the error message.  It sets HadError but returns allowing the code to
-// move on to other architecture slices. 
+// move on to other architecture slices.
 static void error(llvm::Error E, StringRef FileName,
                   StringRef ArchitectureName = StringRef()) {
   HadError = true;
-  errs() << ToolName << ": " << FileName;
+  WithColor::error(errs(), ToolName) << FileName;
 
   if (!ArchitectureName.empty())
     errs() << " (for architecture " << ArchitectureName << ") ";
 
   std::string Buf;
   raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(std::move(E), OS, "");
+  logAllUnhandledErrors(std::move(E), OS);
   OS.flush();
   errs() << " " << Buf << "\n";
 }
@@ -1029,8 +1030,7 @@ static char getSymbolNMTypeChar(MachOObjectFile &Obj, basic_symbol_iterator I) {
     StringRef SectionName;
     Obj.getSectionName(Ref, SectionName);
     StringRef SegmentName = Obj.getSectionFinalSegmentName(Ref);
-    if (Obj.is64Bit() && 
-        Obj.getHeader64().filetype == MachO::MH_KEXT_BUNDLE &&
+    if (Obj.is64Bit() && Obj.getHeader64().filetype == MachO::MH_KEXT_BUNDLE &&
         SegmentName == "__TEXT_EXEC" && SectionName == "__text")
       return 't';
     if (SegmentName == "__TEXT" && SectionName == "__text")
@@ -1606,7 +1606,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
       uint64_t lc_main_offset = UINT64_MAX;
       for (const auto &Command : MachO->load_commands()) {
         if (Command.C.cmd == MachO::LC_FUNCTION_STARTS) {
-          // We found a function starts segment, parse the addresses for 
+          // We found a function starts segment, parse the addresses for
           // consumption.
           MachO::linkedit_data_command LLC =
             MachO->getLinkeditDataLoadCommand(Command);
@@ -1776,8 +1776,8 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
         }
         if (SymbolicFile *O = dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
           if (!MachOPrintSizeWarning && PrintSize &&  isa<MachOObjectFile>(O)) {
-            errs() << ToolName << ": warning sizes with -print-size for Mach-O "
-                      "files are always zero.\n";
+            WithColor::warning(errs(), ToolName)
+                << "sizes with -print-size for Mach-O files are always zero.\n";
             MachOPrintSizeWarning = true;
           }
           if (!checkMachOAndArchFlags(O, Filename))
@@ -1889,7 +1889,8 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
     // No architecture flags were specified so if this contains a slice that
     // matches the host architecture dump only that.
     if (!ArchAll) {
-      StringRef HostArchName = MachOObjectFile::getHostArch().getArchName();
+      Triple HostTriple = MachOObjectFile::getHostArch();
+      StringRef HostArchName = HostTriple.getArchName();
       for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
                                                  E = UB->end_objects();
            I != E; ++I) {
@@ -2017,8 +2018,8 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
   }
   if (SymbolicFile *O = dyn_cast<SymbolicFile>(&Bin)) {
     if (!MachOPrintSizeWarning && PrintSize &&  isa<MachOObjectFile>(O)) {
-      errs() << ToolName << ": warning sizes with -print-size for Mach-O files "
-                "are always zero.\n";
+      WithColor::warning(errs(), ToolName)
+          << "sizes with -print-size for Mach-O files are always zero.\n";
       MachOPrintSizeWarning = true;
     }
     if (!checkMachOAndArchFlags(O, Filename))
diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt
index afbf78791766334db9a0d31dad9c0a4ca184c5bb..3b6c345b292b6fbe0b31aaea810af3b90ede36d3 100644
--- a/tools/llvm-objcopy/CMakeLists.txt
+++ b/tools/llvm-objcopy/CMakeLists.txt
@@ -17,6 +17,9 @@ add_llvm_tool(llvm-objcopy
   Buffer.cpp
   CopyConfig.cpp
   llvm-objcopy.cpp
+  COFF/COFFObjcopy.cpp
+  COFF/Reader.cpp
+  COFF/Writer.cpp
   ELF/ELFObjcopy.cpp
   ELF/Object.cpp
   DEPENDS
diff --git a/tools/llvm-objcopy/COFF/COFFObjcopy.cpp b/tools/llvm-objcopy/COFF/COFFObjcopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..89e926af567e93181fbd99a33ac1d1ec9f4cd2c0
--- /dev/null
+++ b/tools/llvm-objcopy/COFF/COFFObjcopy.cpp
@@ -0,0 +1,40 @@
+//===- COFFObjcopy.cpp ----------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "COFFObjcopy.h"
+#include "Buffer.h"
+#include "CopyConfig.h"
+#include "Object.h"
+#include "Reader.h"
+#include "Writer.h"
+#include "llvm-objcopy.h"
+
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/COFF.h"
+#include <cassert>
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+using namespace object;
+using namespace COFF;
+
+void executeObjcopyOnBinary(const CopyConfig &Config,
+                            object::COFFObjectFile &In, Buffer &Out) {
+  COFFReader Reader(In);
+  std::unique_ptr<Object> Obj = Reader.create();
+  assert(Obj && "Unable to deserialize COFF object");
+  COFFWriter Writer(*Obj, Out);
+  Writer.write();
+}
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/tools/llvm-objcopy/COFF/COFFObjcopy.h b/tools/llvm-objcopy/COFF/COFFObjcopy.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf70bd9b4d844a1b751f391c82e69a82326e7cab
--- /dev/null
+++ b/tools/llvm-objcopy/COFF/COFFObjcopy.h
@@ -0,0 +1,31 @@
+//===- COFFObjcopy.h --------------------------------------------*- C++ -*-===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H
+#define LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H
+
+namespace llvm {
+
+namespace object {
+class COFFObjectFile;
+} // end namespace object
+
+namespace objcopy {
+struct CopyConfig;
+class Buffer;
+
+namespace coff {
+void executeObjcopyOnBinary(const CopyConfig &Config,
+                            object::COFFObjectFile &In, Buffer &Out);
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H
diff --git a/tools/llvm-objcopy/COFF/Object.h b/tools/llvm-objcopy/COFF/Object.h
new file mode 100644
index 0000000000000000000000000000000000000000..89f9903c0fd2e6698b07d0a8d0c89316897d3fa6
--- /dev/null
+++ b/tools/llvm-objcopy/COFF/Object.h
@@ -0,0 +1,110 @@
+//===- Object.h -------------------------------------------------*- C++ -*-===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H
+#define LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+struct Section {
+  object::coff_section Header;
+  ArrayRef<uint8_t> Contents;
+  std::vector<object::coff_relocation> Relocs;
+  StringRef Name;
+};
+
+struct Symbol {
+  object::coff_symbol32 Sym;
+  StringRef Name;
+  ArrayRef<uint8_t> AuxData;
+};
+
+struct Object {
+  bool IsPE = false;
+
+  object::dos_header DosHeader;
+  ArrayRef<uint8_t> DosStub;
+
+  object::coff_file_header CoffFileHeader;
+
+  bool Is64 = false;
+  object::pe32plus_header PeHeader;
+  uint32_t BaseOfData = 0; // pe32plus_header lacks this field.
+
+  std::vector<object::data_directory> DataDirectories;
+  std::vector<Section> Sections;
+  std::vector<Symbol> Symbols;
+};
+
+// Copy between coff_symbol16 and coff_symbol32.
+// The source and destination files can use either coff_symbol16 or
+// coff_symbol32, while we always store them as coff_symbol32 in the
+// intermediate data structure.
+template <class Symbol1Ty, class Symbol2Ty>
+void copySymbol(Symbol1Ty &Dest, const Symbol2Ty &Src) {
+  static_assert(sizeof(Dest.Name.ShortName) == sizeof(Src.Name.ShortName),
+                "Mismatched name sizes");
+  memcpy(Dest.Name.ShortName, Src.Name.ShortName, sizeof(Dest.Name.ShortName));
+  Dest.Value = Src.Value;
+  Dest.SectionNumber = Src.SectionNumber;
+  Dest.Type = Src.Type;
+  Dest.StorageClass = Src.StorageClass;
+  Dest.NumberOfAuxSymbols = Src.NumberOfAuxSymbols;
+}
+
+// Copy between pe32_header and pe32plus_header.
+// We store the intermediate state in a pe32plus_header.
+template <class PeHeader1Ty, class PeHeader2Ty>
+void copyPeHeader(PeHeader1Ty &Dest, const PeHeader2Ty &Src) {
+  Dest.Magic = Src.Magic;
+  Dest.MajorLinkerVersion = Src.MajorLinkerVersion;
+  Dest.MinorLinkerVersion = Src.MinorLinkerVersion;
+  Dest.SizeOfCode = Src.SizeOfCode;
+  Dest.SizeOfInitializedData = Src.SizeOfInitializedData;
+  Dest.SizeOfUninitializedData = Src.SizeOfUninitializedData;
+  Dest.AddressOfEntryPoint = Src.AddressOfEntryPoint;
+  Dest.BaseOfCode = Src.BaseOfCode;
+  Dest.ImageBase = Src.ImageBase;
+  Dest.SectionAlignment = Src.SectionAlignment;
+  Dest.FileAlignment = Src.FileAlignment;
+  Dest.MajorOperatingSystemVersion = Src.MajorOperatingSystemVersion;
+  Dest.MinorOperatingSystemVersion = Src.MinorOperatingSystemVersion;
+  Dest.MajorImageVersion = Src.MajorImageVersion;
+  Dest.MinorImageVersion = Src.MinorImageVersion;
+  Dest.MajorSubsystemVersion = Src.MajorSubsystemVersion;
+  Dest.MinorSubsystemVersion = Src.MinorSubsystemVersion;
+  Dest.Win32VersionValue = Src.Win32VersionValue;
+  Dest.SizeOfImage = Src.SizeOfImage;
+  Dest.SizeOfHeaders = Src.SizeOfHeaders;
+  Dest.CheckSum = Src.CheckSum;
+  Dest.Subsystem = Src.Subsystem;
+  Dest.DLLCharacteristics = Src.DLLCharacteristics;
+  Dest.SizeOfStackReserve = Src.SizeOfStackReserve;
+  Dest.SizeOfStackCommit = Src.SizeOfStackCommit;
+  Dest.SizeOfHeapReserve = Src.SizeOfHeapReserve;
+  Dest.SizeOfHeapCommit = Src.SizeOfHeapCommit;
+  Dest.LoaderFlags = Src.LoaderFlags;
+  Dest.NumberOfRvaAndSize = Src.NumberOfRvaAndSize;
+}
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H
diff --git a/tools/llvm-objcopy/COFF/Reader.cpp b/tools/llvm-objcopy/COFF/Reader.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8a3576084cd1fa74ef82a5e3bf02d6593d78775
--- /dev/null
+++ b/tools/llvm-objcopy/COFF/Reader.cpp
@@ -0,0 +1,141 @@
+//===- Reader.cpp ---------------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Reader.h"
+#include "Object.h"
+#include "llvm-objcopy.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstddef>
+#include <cstdint>
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+using namespace object;
+
+Reader::~Reader() {}
+
+void COFFReader::readExecutableHeaders(Object &Obj) const {
+  const dos_header *DH = COFFObj.getDOSHeader();
+  Obj.Is64 = COFFObj.is64();
+  if (!DH)
+    return;
+
+  Obj.IsPE = true;
+  Obj.DosHeader = *DH;
+  if (DH->AddressOfNewExeHeader > sizeof(*DH))
+    Obj.DosStub = ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(&DH[1]),
+                                    DH->AddressOfNewExeHeader - sizeof(*DH));
+
+  if (COFFObj.is64()) {
+    const pe32plus_header *PE32Plus = nullptr;
+    if (auto EC = COFFObj.getPE32PlusHeader(PE32Plus))
+      reportError(COFFObj.getFileName(), std::move(EC));
+    Obj.PeHeader = *PE32Plus;
+  } else {
+    const pe32_header *PE32 = nullptr;
+    if (auto EC = COFFObj.getPE32Header(PE32))
+      reportError(COFFObj.getFileName(), std::move(EC));
+    copyPeHeader(Obj.PeHeader, *PE32);
+    // The pe32plus_header (stored in Object) lacks the BaseOfData field.
+    Obj.BaseOfData = PE32->BaseOfData;
+  }
+
+  for (size_t I = 0; I < Obj.PeHeader.NumberOfRvaAndSize; I++) {
+    const data_directory *Dir;
+    if (auto EC = COFFObj.getDataDirectory(I, Dir))
+      reportError(COFFObj.getFileName(), std::move(EC));
+    Obj.DataDirectories.emplace_back(*Dir);
+  }
+}
+
+void COFFReader::readSections(Object &Obj) const {
+  // Section indexing starts from 1.
+  for (size_t I = 1, E = COFFObj.getNumberOfSections(); I <= E; I++) {
+    const coff_section *Sec;
+    if (auto EC = COFFObj.getSection(I, Sec))
+      reportError(COFFObj.getFileName(), std::move(EC));
+    Obj.Sections.push_back(Section());
+    Section &S = Obj.Sections.back();
+    S.Header = *Sec;
+    if (auto EC = COFFObj.getSectionContents(Sec, S.Contents))
+      reportError(COFFObj.getFileName(), std::move(EC));
+    ArrayRef<coff_relocation> Relocs = COFFObj.getRelocations(Sec);
+    S.Relocs.insert(S.Relocs.end(), Relocs.begin(), Relocs.end());
+    if (auto EC = COFFObj.getSectionName(Sec, S.Name))
+      reportError(COFFObj.getFileName(), std::move(EC));
+    if (Sec->hasExtendedRelocations())
+      reportError(
+          COFFObj.getFileName(),
+          make_error<StringError>("Extended relocations not supported yet",
+                                  object_error::parse_failed));
+  }
+}
+
+void COFFReader::readSymbols(Object &Obj, bool IsBigObj) const {
+  for (uint32_t I = 0, E = COFFObj.getRawNumberOfSymbols(); I < E;) {
+    Expected<COFFSymbolRef> SymOrErr = COFFObj.getSymbol(I);
+    if (!SymOrErr)
+      reportError(COFFObj.getFileName(), SymOrErr.takeError());
+    COFFSymbolRef SymRef = *SymOrErr;
+
+    Obj.Symbols.push_back(Symbol());
+    Symbol &Sym = Obj.Symbols.back();
+    // Copy symbols from the original form into an intermediate coff_symbol32.
+    if (IsBigObj)
+      copySymbol(Sym.Sym,
+                 *reinterpret_cast<const coff_symbol32 *>(SymRef.getRawPtr()));
+    else
+      copySymbol(Sym.Sym,
+                 *reinterpret_cast<const coff_symbol16 *>(SymRef.getRawPtr()));
+    if (auto EC = COFFObj.getSymbolName(SymRef, Sym.Name))
+      reportError(COFFObj.getFileName(), std::move(EC));
+    Sym.AuxData = COFFObj.getSymbolAuxData(SymRef);
+    assert((Sym.AuxData.size() %
+            (IsBigObj ? sizeof(coff_symbol32) : sizeof(coff_symbol16))) == 0);
+    I += 1 + SymRef.getNumberOfAuxSymbols();
+  }
+}
+
+std::unique_ptr<Object> COFFReader::create() const {
+  auto Obj = llvm::make_unique<Object>();
+
+  const coff_file_header *CFH = nullptr;
+  const coff_bigobj_file_header *CBFH = nullptr;
+  COFFObj.getCOFFHeader(CFH);
+  COFFObj.getCOFFBigObjHeader(CBFH);
+  bool IsBigObj = false;
+  if (CFH) {
+    Obj->CoffFileHeader = *CFH;
+  } else {
+    if (!CBFH)
+      reportError(COFFObj.getFileName(),
+                  make_error<StringError>("No COFF file header returned",
+                                          object_error::parse_failed));
+    // Only copying the few fields from the bigobj header that we need
+    // and won't recreate in the end.
+    Obj->CoffFileHeader.Machine = CBFH->Machine;
+    Obj->CoffFileHeader.TimeDateStamp = CBFH->TimeDateStamp;
+    IsBigObj = true;
+  }
+
+  readExecutableHeaders(*Obj);
+  readSections(*Obj);
+  readSymbols(*Obj, IsBigObj);
+
+  return Obj;
+}
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/tools/llvm-objcopy/COFF/Reader.h b/tools/llvm-objcopy/COFF/Reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..c81a9751f1804cd83f60f2ab93bdcca46a191b8a
--- /dev/null
+++ b/tools/llvm-objcopy/COFF/Reader.h
@@ -0,0 +1,47 @@
+//===- Reader.h -------------------------------------------------*- C++ -*-===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OBJCOPY_COFF_READER_H
+#define LLVM_TOOLS_OBJCOPY_COFF_READER_H
+
+#include "Buffer.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+struct Object;
+
+using object::COFFObjectFile;
+
+class Reader {
+public:
+  virtual ~Reader();
+  virtual std::unique_ptr<Object> create() const = 0;
+};
+
+class COFFReader : public Reader {
+  const COFFObjectFile &COFFObj;
+
+  void readExecutableHeaders(Object &Obj) const;
+  void readSections(Object &Obj) const;
+  void readSymbols(Object &Obj, bool IsBigObj) const;
+
+public:
+  explicit COFFReader(const COFFObjectFile &O) : COFFObj(O) {}
+  std::unique_ptr<Object> create() const override;
+};
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_TOOLS_OBJCOPY_COFF_READER_H
diff --git a/tools/llvm-objcopy/COFF/Writer.cpp b/tools/llvm-objcopy/COFF/Writer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48afe7889e872148c766c00cc7d23817696a3cf7
--- /dev/null
+++ b/tools/llvm-objcopy/COFF/Writer.cpp
@@ -0,0 +1,318 @@
+//===- Writer.cpp ---------------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Writer.h"
+#include "Object.h"
+#include "llvm-objcopy.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstddef>
+#include <cstdint>
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+using namespace object;
+using namespace COFF;
+
+Writer::~Writer() {}
+
+void COFFWriter::layoutSections() {
+  for (auto &S : Obj.Sections) {
+    if (S.Header.SizeOfRawData > 0)
+      S.Header.PointerToRawData = FileSize;
+    FileSize += S.Header.SizeOfRawData; // For executables, this is already
+                                        // aligned to FileAlignment.
+    if (S.Header.NumberOfRelocations > 0)
+      S.Header.PointerToRelocations = FileSize;
+    FileSize += S.Relocs.size() * sizeof(coff_relocation);
+    FileSize = alignTo(FileSize, FileAlignment);
+
+    if (S.Header.Characteristics & IMAGE_SCN_CNT_INITIALIZED_DATA)
+      SizeOfInitializedData += S.Header.SizeOfRawData;
+  }
+}
+
+size_t COFFWriter::finalizeStringTable() {
+  for (auto &S : Obj.Sections)
+    if (S.Name.size() > COFF::NameSize)
+      StrTabBuilder.add(S.Name);
+
+  for (const auto &S : Obj.Symbols)
+    if (S.Name.size() > COFF::NameSize)
+      StrTabBuilder.add(S.Name);
+
+  StrTabBuilder.finalize();
+
+  for (auto &S : Obj.Sections) {
+    if (S.Name.size() > COFF::NameSize) {
+      snprintf(S.Header.Name, sizeof(S.Header.Name), "/%d",
+               (int)StrTabBuilder.getOffset(S.Name));
+    } else {
+      strncpy(S.Header.Name, S.Name.data(), COFF::NameSize);
+    }
+  }
+  for (auto &S : Obj.Symbols) {
+    if (S.Name.size() > COFF::NameSize) {
+      S.Sym.Name.Offset.Zeroes = 0;
+      S.Sym.Name.Offset.Offset = StrTabBuilder.getOffset(S.Name);
+    } else {
+      strncpy(S.Sym.Name.ShortName, S.Name.data(), COFF::NameSize);
+    }
+  }
+  return StrTabBuilder.getSize();
+}
+
+template <class SymbolTy>
+std::pair<size_t, size_t> COFFWriter::finalizeSymbolTable() {
+  size_t SymTabSize = Obj.Symbols.size() * sizeof(SymbolTy);
+  for (const auto &S : Obj.Symbols)
+    SymTabSize += S.AuxData.size();
+  return std::make_pair(SymTabSize, sizeof(SymbolTy));
+}
+
+void COFFWriter::finalize(bool IsBigObj) {
+  size_t SizeOfHeaders = 0;
+  FileAlignment = 1;
+  size_t PeHeaderSize = 0;
+  if (Obj.IsPE) {
+    Obj.DosHeader.AddressOfNewExeHeader =
+        sizeof(Obj.DosHeader) + Obj.DosStub.size();
+    SizeOfHeaders += Obj.DosHeader.AddressOfNewExeHeader + sizeof(PEMagic);
+
+    FileAlignment = Obj.PeHeader.FileAlignment;
+    Obj.PeHeader.NumberOfRvaAndSize = Obj.DataDirectories.size();
+
+    PeHeaderSize = Obj.Is64 ? sizeof(pe32plus_header) : sizeof(pe32_header);
+    SizeOfHeaders +=
+        PeHeaderSize + sizeof(data_directory) * Obj.DataDirectories.size();
+  }
+  Obj.CoffFileHeader.NumberOfSections = Obj.Sections.size();
+  SizeOfHeaders +=
+      IsBigObj ? sizeof(coff_bigobj_file_header) : sizeof(coff_file_header);
+  SizeOfHeaders += sizeof(coff_section) * Obj.Sections.size();
+  SizeOfHeaders = alignTo(SizeOfHeaders, FileAlignment);
+
+  Obj.CoffFileHeader.SizeOfOptionalHeader =
+      PeHeaderSize + sizeof(data_directory) * Obj.DataDirectories.size();
+
+  FileSize = SizeOfHeaders;
+  SizeOfInitializedData = 0;
+
+  layoutSections();
+
+  if (Obj.IsPE) {
+    Obj.PeHeader.SizeOfHeaders = SizeOfHeaders;
+    Obj.PeHeader.SizeOfInitializedData = SizeOfInitializedData;
+
+    if (!Obj.Sections.empty()) {
+      const Section &S = Obj.Sections.back();
+      Obj.PeHeader.SizeOfImage =
+          alignTo(S.Header.VirtualAddress + S.Header.VirtualSize,
+                  Obj.PeHeader.SectionAlignment);
+    }
+
+    // If the PE header had a checksum, clear it, since it isn't valid
+    // any longer. (We don't calculate a new one.)
+    Obj.PeHeader.CheckSum = 0;
+  }
+
+  size_t StrTabSize = finalizeStringTable();
+  size_t SymTabSize, SymbolSize;
+  std::tie(SymTabSize, SymbolSize) = IsBigObj
+                                         ? finalizeSymbolTable<coff_symbol32>()
+                                         : finalizeSymbolTable<coff_symbol16>();
+
+  size_t PointerToSymbolTable = FileSize;
+  // StrTabSize <= 4 is the size of an empty string table, only consisting
+  // of the length field.
+  if (SymTabSize == 0 && StrTabSize <= 4) {
+    // Don't point to the symbol table if empty.
+    PointerToSymbolTable = 0;
+    // For executables, skip the length field of an empty string table.
+    if (Obj.IsPE)
+      StrTabSize = 0;
+  }
+
+  size_t NumRawSymbols = SymTabSize / SymbolSize;
+  Obj.CoffFileHeader.PointerToSymbolTable = PointerToSymbolTable;
+  Obj.CoffFileHeader.NumberOfSymbols = NumRawSymbols;
+  FileSize += SymTabSize + StrTabSize;
+  FileSize = alignTo(FileSize, FileAlignment);
+}
+
+void COFFWriter::writeHeaders(bool IsBigObj) {
+  uint8_t *Ptr = Buf.getBufferStart();
+  if (Obj.IsPE) {
+    memcpy(Ptr, &Obj.DosHeader, sizeof(Obj.DosHeader));
+    Ptr += sizeof(Obj.DosHeader);
+    memcpy(Ptr, Obj.DosStub.data(), Obj.DosStub.size());
+    Ptr += Obj.DosStub.size();
+    memcpy(Ptr, PEMagic, sizeof(PEMagic));
+    Ptr += sizeof(PEMagic);
+  }
+  if (!IsBigObj) {
+    memcpy(Ptr, &Obj.CoffFileHeader, sizeof(Obj.CoffFileHeader));
+    Ptr += sizeof(Obj.CoffFileHeader);
+  } else {
+    // Generate a coff_bigobj_file_header, filling it in with the values
+    // from Obj.CoffFileHeader. All extra fields that don't exist in
+    // coff_file_header can be set to hardcoded values.
+    coff_bigobj_file_header BigObjHeader;
+    BigObjHeader.Sig1 = IMAGE_FILE_MACHINE_UNKNOWN;
+    BigObjHeader.Sig2 = 0xffff;
+    BigObjHeader.Version = BigObjHeader::MinBigObjectVersion;
+    BigObjHeader.Machine = Obj.CoffFileHeader.Machine;
+    BigObjHeader.TimeDateStamp = Obj.CoffFileHeader.TimeDateStamp;
+    memcpy(BigObjHeader.UUID, BigObjMagic, sizeof(BigObjMagic));
+    BigObjHeader.unused1 = 0;
+    BigObjHeader.unused2 = 0;
+    BigObjHeader.unused3 = 0;
+    BigObjHeader.unused4 = 0;
+    // The value in Obj.CoffFileHeader.NumberOfSections is truncated, thus
+    // get the original one instead.
+    BigObjHeader.NumberOfSections = Obj.Sections.size();
+    BigObjHeader.PointerToSymbolTable = Obj.CoffFileHeader.PointerToSymbolTable;
+    BigObjHeader.NumberOfSymbols = Obj.CoffFileHeader.NumberOfSymbols;
+
+    memcpy(Ptr, &BigObjHeader, sizeof(BigObjHeader));
+    Ptr += sizeof(BigObjHeader);
+  }
+  if (Obj.IsPE) {
+    if (Obj.Is64) {
+      memcpy(Ptr, &Obj.PeHeader, sizeof(Obj.PeHeader));
+      Ptr += sizeof(Obj.PeHeader);
+    } else {
+      pe32_header PeHeader;
+      copyPeHeader(PeHeader, Obj.PeHeader);
+      // The pe32plus_header (stored in Object) lacks the BaseOfData field.
+      PeHeader.BaseOfData = Obj.BaseOfData;
+
+      memcpy(Ptr, &PeHeader, sizeof(PeHeader));
+      Ptr += sizeof(PeHeader);
+    }
+    for (const auto &DD : Obj.DataDirectories) {
+      memcpy(Ptr, &DD, sizeof(DD));
+      Ptr += sizeof(DD);
+    }
+  }
+  for (const auto &S : Obj.Sections) {
+    memcpy(Ptr, &S.Header, sizeof(S.Header));
+    Ptr += sizeof(S.Header);
+  }
+}
+
+void COFFWriter::writeSections() {
+  for (const auto &S : Obj.Sections) {
+    uint8_t *Ptr = Buf.getBufferStart() + S.Header.PointerToRawData;
+    memcpy(Ptr, S.Contents.data(), S.Contents.size());
+
+    // For executable sections, pad the remainder of the raw data size with
+    // 0xcc, which is int3 on x86.
+    if ((S.Header.Characteristics & IMAGE_SCN_CNT_CODE) &&
+        S.Header.SizeOfRawData > S.Contents.size())
+      memset(Ptr + S.Contents.size(), 0xcc,
+             S.Header.SizeOfRawData - S.Contents.size());
+
+    Ptr += S.Header.SizeOfRawData;
+    memcpy(Ptr, S.Relocs.data(), S.Relocs.size() * sizeof(coff_relocation));
+  }
+}
+
+template <class SymbolTy> void COFFWriter::writeSymbolStringTables() {
+  uint8_t *Ptr = Buf.getBufferStart() + Obj.CoffFileHeader.PointerToSymbolTable;
+  for (const auto &S : Obj.Symbols) {
+    // Convert symbols back to the right size, from coff_symbol32.
+    copySymbol<SymbolTy, coff_symbol32>(*reinterpret_cast<SymbolTy *>(Ptr),
+                                        S.Sym);
+    Ptr += sizeof(SymbolTy);
+    memcpy(Ptr, S.AuxData.data(), S.AuxData.size());
+    Ptr += S.AuxData.size();
+  }
+  if (StrTabBuilder.getSize() > 4 || !Obj.IsPE) {
+    // Always write a string table in object files, even an empty one.
+    StrTabBuilder.write(Ptr);
+    Ptr += StrTabBuilder.getSize();
+  }
+}
+
+void COFFWriter::write(bool IsBigObj) {
+  finalize(IsBigObj);
+
+  Buf.allocate(FileSize);
+
+  writeHeaders(IsBigObj);
+  writeSections();
+  if (IsBigObj)
+    writeSymbolStringTables<coff_symbol32>();
+  else
+    writeSymbolStringTables<coff_symbol16>();
+
+  if (Obj.IsPE)
+    patchDebugDirectory();
+
+  if (auto E = Buf.commit())
+    reportError(Buf.getName(), errorToErrorCode(std::move(E)));
+}
+
+// Locate which sections contain the debug directories, iterate over all
+// the debug_directory structs in there, and set the PointerToRawData field
+// in all of them, according to their new physical location in the file.
+void COFFWriter::patchDebugDirectory() {
+  if (Obj.DataDirectories.size() < DEBUG_DIRECTORY)
+    return;
+  const data_directory *Dir = &Obj.DataDirectories[DEBUG_DIRECTORY];
+  if (Dir->Size <= 0)
+    return;
+  for (const auto &S : Obj.Sections) {
+    if (Dir->RelativeVirtualAddress >= S.Header.VirtualAddress &&
+        Dir->RelativeVirtualAddress <
+            S.Header.VirtualAddress + S.Header.SizeOfRawData) {
+      if (Dir->RelativeVirtualAddress + Dir->Size >
+          S.Header.VirtualAddress + S.Header.SizeOfRawData)
+        reportError(Buf.getName(),
+                    make_error<StringError>(
+                        "Debug directory extends past end of section",
+                        object_error::parse_failed));
+
+      size_t Offset = Dir->RelativeVirtualAddress - S.Header.VirtualAddress;
+      uint8_t *Ptr = Buf.getBufferStart() + S.Header.PointerToRawData + Offset;
+      uint8_t *End = Ptr + Dir->Size;
+      while (Ptr < End) {
+        debug_directory *Debug = reinterpret_cast<debug_directory *>(Ptr);
+        Debug->PointerToRawData =
+            S.Header.PointerToRawData + Offset + sizeof(debug_directory);
+        Ptr += sizeof(debug_directory) + Debug->SizeOfData;
+        Offset += sizeof(debug_directory) + Debug->SizeOfData;
+      }
+      // Debug directory found and patched, all done.
+      return;
+    }
+  }
+  reportError(Buf.getName(),
+              make_error<StringError>("Debug directory not found",
+                                      object_error::parse_failed));
+}
+
+void COFFWriter::write() {
+  bool IsBigObj = Obj.Sections.size() > MaxNumberOfSections16;
+  if (IsBigObj && Obj.IsPE)
+    reportError(Buf.getName(),
+                make_error<StringError>("Too many sections for executable",
+                                        object_error::parse_failed));
+  write(IsBigObj);
+}
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/tools/llvm-objcopy/COFF/Writer.h b/tools/llvm-objcopy/COFF/Writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..397fda02ee980761333e12d6105c64a6c38579ea
--- /dev/null
+++ b/tools/llvm-objcopy/COFF/Writer.h
@@ -0,0 +1,68 @@
+//===- Writer.h -------------------------------------------------*- C++ -*-===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OBJCOPY_COFF_WRITER_H
+#define LLVM_TOOLS_OBJCOPY_COFF_WRITER_H
+
+#include "Buffer.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include <cstddef>
+#include <utility>
+
+namespace llvm {
+namespace objcopy {
+namespace coff {
+
+struct Object;
+
+class Writer {
+protected:
+  Object &Obj;
+  Buffer &Buf;
+
+public:
+  virtual ~Writer();
+  virtual void write() = 0;
+
+  Writer(Object &O, Buffer &B) : Obj(O), Buf(B) {}
+};
+
+class COFFWriter : public Writer {
+  size_t FileSize;
+  size_t FileAlignment;
+  size_t SizeOfInitializedData;
+  StringTableBuilder StrTabBuilder;
+
+  void layoutSections();
+  size_t finalizeStringTable();
+  template <class SymbolTy> std::pair<size_t, size_t> finalizeSymbolTable();
+
+  void finalize(bool IsBigObj);
+
+  void writeHeaders(bool IsBigObj);
+  void writeSections();
+  template <class SymbolTy> void writeSymbolStringTables();
+
+  void write(bool IsBigObj);
+
+  void patchDebugDirectory();
+
+public:
+  virtual ~COFFWriter() {}
+  void write() override;
+
+  COFFWriter(Object &Obj, Buffer &Buf)
+      : Writer(Obj, Buf), StrTabBuilder(StringTableBuilder::WinCOFF) {}
+};
+
+} // end namespace coff
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_TOOLS_OBJCOPY_COFF_WRITER_H
diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp
index 67963a22a1c0eacc65547f327108e6b2f569f300..24f0e29d4926f37f07a8b6ba1816d2a6d8f54292 100644
--- a/tools/llvm-objcopy/CopyConfig.cpp
+++ b/tools/llvm-objcopy/CopyConfig.cpp
@@ -226,6 +226,7 @@ DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
   }
 
   if (InputArgs.hasArg(OBJCOPY_version)) {
+    outs() << "llvm-objcopy, compatible with GNU objcopy\n";
     cl::PrintVersionMessage();
     exit(0);
   }
@@ -285,8 +286,15 @@ DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
     }
   }
 
-  Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo);
   Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink);
+  Config.BuildIdLinkDir = InputArgs.getLastArgValue(OBJCOPY_build_id_link_dir);
+  if (InputArgs.hasArg(OBJCOPY_build_id_link_input))
+    Config.BuildIdLinkInput =
+        InputArgs.getLastArgValue(OBJCOPY_build_id_link_input);
+  if (InputArgs.hasArg(OBJCOPY_build_id_link_output))
+    Config.BuildIdLinkOutput =
+        InputArgs.getLastArgValue(OBJCOPY_build_id_link_output);
+  Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo);
   Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols);
 
   for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) {
@@ -305,10 +313,10 @@ DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
 
   for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section))
     Config.ToRemove.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep))
-    Config.Keep.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_only_keep))
-    Config.OnlyKeep.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_section))
+    Config.KeepSection.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_only_section))
+    Config.OnlySection.push_back(Arg->getValue());
   for (auto Arg : InputArgs.filtered(OBJCOPY_add_section))
     Config.AddSection.push_back(Arg->getValue());
   for (auto Arg : InputArgs.filtered(OBJCOPY_dump_section))
@@ -383,6 +391,7 @@ DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr) {
   }
 
   if (InputArgs.hasArg(STRIP_version)) {
+    outs() << "llvm-strip, compatible with GNU strip\n";
     cl::PrintVersionMessage();
     exit(0);
   }
@@ -411,8 +420,8 @@ DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr) {
       !Config.StripAllGNU)
     Config.StripAll = true;
 
-  for (auto Arg : InputArgs.filtered(STRIP_keep))
-    Config.Keep.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(STRIP_keep_section))
+    Config.KeepSection.push_back(Arg->getValue());
 
   for (auto Arg : InputArgs.filtered(STRIP_remove_section))
     Config.ToRemove.push_back(Arg->getValue());
diff --git a/tools/llvm-objcopy/CopyConfig.h b/tools/llvm-objcopy/CopyConfig.h
index 7ebe2a072bb4b3ca12590338cb22050e91ec6f20..ce6ead80b7c5e08eec3197211b14f6383973a505 100644
--- a/tools/llvm-objcopy/CopyConfig.h
+++ b/tools/llvm-objcopy/CopyConfig.h
@@ -51,14 +51,17 @@ struct CopyConfig {
 
   // Advanced options
   StringRef AddGnuDebugLink;
+  StringRef BuildIdLinkDir;
+  Optional<StringRef> BuildIdLinkInput;
+  Optional<StringRef> BuildIdLinkOutput;
   StringRef SplitDWO;
   StringRef SymbolsPrefix;
 
   // Repeated options
   std::vector<StringRef> AddSection;
   std::vector<StringRef> DumpSection;
-  std::vector<StringRef> Keep;
-  std::vector<StringRef> OnlyKeep;
+  std::vector<StringRef> KeepSection;
+  std::vector<StringRef> OnlySection;
   std::vector<StringRef> SymbolsToGlobalize;
   std::vector<StringRef> SymbolsToKeep;
   std::vector<StringRef> SymbolsToLocalize;
diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
index a367a30c467f5dc50bc2d7f86121dc542b2eea5c..8a136de24a87bce94a80e5493e13e82cace4e30b 100644
--- a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
+++ b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -10,8 +10,8 @@
 #include "ELFObjcopy.h"
 #include "Buffer.h"
 #include "CopyConfig.h"
-#include "llvm-objcopy.h"
 #include "Object.h"
+#include "llvm-objcopy.h"
 
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/Optional.h"
@@ -28,6 +28,7 @@
 #include "llvm/Option/Option.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compression.h"
+#include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
@@ -115,6 +116,58 @@ static std::unique_ptr<Writer> createWriter(const CopyConfig &Config,
   llvm_unreachable("Invalid output format");
 }
 
+template <class ELFT>
+static Expected<ArrayRef<uint8_t>>
+findBuildID(const object::ELFFile<ELFT> &In) {
+  for (const auto &Phdr : unwrapOrError(In.program_headers())) {
+    if (Phdr.p_type != PT_NOTE)
+      continue;
+    Error Err = Error::success();
+    for (const auto &Note : In.notes(Phdr, Err))
+      if (Note.getType() == NT_GNU_BUILD_ID && Note.getName() == ELF_NOTE_GNU)
+        return Note.getDesc();
+    if (Err)
+      return std::move(Err);
+  }
+  return createStringError(llvm::errc::invalid_argument,
+                           "Could not find build ID.");
+}
+
+static Expected<ArrayRef<uint8_t>>
+findBuildID(const object::ELFObjectFileBase &In) {
+  if (auto *O = dyn_cast<ELFObjectFile<ELF32LE>>(&In))
+    return findBuildID(*O->getELFFile());
+  else if (auto *O = dyn_cast<ELFObjectFile<ELF64LE>>(&In))
+    return findBuildID(*O->getELFFile());
+  else if (auto *O = dyn_cast<ELFObjectFile<ELF32BE>>(&In))
+    return findBuildID(*O->getELFFile());
+  else if (auto *O = dyn_cast<ELFObjectFile<ELF64BE>>(&In))
+    return findBuildID(*O->getELFFile());
+
+  llvm_unreachable("Bad file format");
+}
+
+static void linkToBuildIdDir(const CopyConfig &Config, StringRef ToLink,
+                             StringRef Suffix, ArrayRef<uint8_t> BuildIdBytes) {
+  SmallString<128> Path = Config.BuildIdLinkDir;
+  sys::path::append(Path, llvm::toHex(BuildIdBytes[0], /*LowerCase*/ true));
+  if (auto EC = sys::fs::create_directories(Path))
+    error("cannot create build ID link directory " + Path + ": " +
+          EC.message());
+
+  sys::path::append(Path,
+                    llvm::toHex(BuildIdBytes.slice(1), /*LowerCase*/ true));
+  Path += Suffix;
+  if (auto EC = sys::fs::create_hard_link(ToLink, Path)) {
+    // Hard linking failed, try to remove the file first if it exists.
+    if (sys::fs::exists(Path))
+      sys::fs::remove(Path);
+    EC = sys::fs::create_hard_link(ToLink, Path);
+    if (EC)
+      error("cannot link " + ToLink + " to " + Path + ": " + EC.message());
+  }
+}
+
 static void splitDWOToFile(const CopyConfig &Config, const Reader &Reader,
                            StringRef File, ElfType OutputElfType) {
   auto DWOFile = Reader.create();
@@ -216,8 +269,7 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
       if (!Sym.isCommon() &&
           ((Config.LocalizeHidden &&
             (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
-           (!Config.SymbolsToLocalize.empty() &&
-            is_contained(Config.SymbolsToLocalize, Sym.Name))))
+           is_contained(Config.SymbolsToLocalize, Sym.Name)))
         Sym.Binding = STB_LOCAL;
 
       // Note: these two globalize flags have very similar names but different
@@ -235,13 +287,11 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
           Sym.getShndx() != SHN_UNDEF)
         Sym.Binding = STB_LOCAL;
 
-      if (!Config.SymbolsToGlobalize.empty() &&
-          is_contained(Config.SymbolsToGlobalize, Sym.Name) &&
+      if (is_contained(Config.SymbolsToGlobalize, Sym.Name) &&
           Sym.getShndx() != SHN_UNDEF)
         Sym.Binding = STB_GLOBAL;
 
-      if (!Config.SymbolsToWeaken.empty() &&
-          is_contained(Config.SymbolsToWeaken, Sym.Name) &&
+      if (is_contained(Config.SymbolsToWeaken, Sym.Name) &&
           Sym.Binding == STB_GLOBAL)
         Sym.Binding = STB_WEAK;
 
@@ -266,8 +316,7 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
     }
 
     Obj.removeSymbols([&](const Symbol &Sym) {
-      if ((!Config.SymbolsToKeep.empty() &&
-           is_contained(Config.SymbolsToKeep, Sym.Name)) ||
+      if (is_contained(Config.SymbolsToKeep, Sym.Name) ||
           (Config.KeepFileSymbols && Sym.Type == STT_FILE))
         return false;
 
@@ -279,10 +328,8 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
       if (Config.StripAll || Config.StripAllGNU)
         return true;
 
-      if (!Config.SymbolsToRemove.empty() &&
-          is_contained(Config.SymbolsToRemove, Sym.Name)) {
+      if (is_contained(Config.SymbolsToRemove, Sym.Name))
         return true;
-      }
 
       if (Config.StripUnneeded && !Sym.Referenced &&
           (Sym.Binding == STB_LOCAL || Sym.getShndx() == SHN_UNDEF) &&
@@ -363,10 +410,10 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
     };
 
   // Explicit copies:
-  if (!Config.OnlyKeep.empty()) {
+  if (!Config.OnlySection.empty()) {
     RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) {
       // Explicitly keep these sections regardless of previous removes.
-      if (is_contained(Config.OnlyKeep, Sec.Name))
+      if (is_contained(Config.OnlySection, Sec.Name))
         return false;
 
       // Allow all implicit removes.
@@ -385,10 +432,10 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
     };
   }
 
-  if (!Config.Keep.empty()) {
-    RemovePred = [Config, RemovePred](const SectionBase &Sec) {
+  if (!Config.KeepSection.empty()) {
+    RemovePred = [&Config, RemovePred](const SectionBase &Sec) {
       // Explicitly keep these sections regardless of previous removes.
-      if (is_contained(Config.Keep, Sec.Name))
+      if (is_contained(Config.KeepSection, Sec.Name))
         return false;
       // Otherwise defer to RemovePred.
       return RemovePred(Sec);
@@ -494,11 +541,28 @@ void executeObjcopyOnBinary(const CopyConfig &Config,
   ELFReader Reader(&In);
   std::unique_ptr<Object> Obj = Reader.create();
   const ElfType OutputElfType = getOutputElfType(In);
+  ArrayRef<uint8_t> BuildIdBytes;
+
+  if (!Config.BuildIdLinkDir.empty()) {
+    BuildIdBytes = unwrapOrError(findBuildID(In));
+    if (BuildIdBytes.size() < 2)
+      error("build ID in file '" + Config.InputFilename +
+            "' is smaller than two bytes");
+  }
+
+  if (!Config.BuildIdLinkDir.empty() && Config.BuildIdLinkInput) {
+    linkToBuildIdDir(Config, Config.InputFilename,
+                     Config.BuildIdLinkInput.getValue(), BuildIdBytes);
+  }
   handleArgs(Config, *Obj, Reader, OutputElfType);
   std::unique_ptr<Writer> Writer =
       createWriter(Config, *Obj, Out, OutputElfType);
   Writer->finalize();
   Writer->write();
+  if (!Config.BuildIdLinkDir.empty() && Config.BuildIdLinkOutput) {
+    linkToBuildIdDir(Config, Config.OutputFilename,
+                     Config.BuildIdLinkOutput.getValue(), BuildIdBytes);
+  }
 }
 
 } // end namespace elf
diff --git a/tools/llvm-objcopy/ELF/Object.cpp b/tools/llvm-objcopy/ELF/Object.cpp
index c2af99fc197dbbba05f21a30884aad68f65406ca..ae02966b730841acf5161d969085c0f462f4a521 100644
--- a/tools/llvm-objcopy/ELF/Object.cpp
+++ b/tools/llvm-objcopy/ELF/Object.cpp
@@ -97,14 +97,14 @@ void SectionWriter::visit(const Section &Sec) {
   if (Sec.Type == SHT_NOBITS)
     return;
   uint8_t *Buf = Out.getBufferStart() + Sec.Offset;
-  std::copy(std::begin(Sec.Contents), std::end(Sec.Contents), Buf);
+  llvm::copy(Sec.Contents, Buf);
 }
 
 void Section::accept(SectionVisitor &Visitor) const { Visitor.visit(*this); }
 
 void SectionWriter::visit(const OwnedDataSection &Sec) {
   uint8_t *Buf = Out.getBufferStart() + Sec.Offset;
-  std::copy(std::begin(Sec.Data), std::end(Sec.Data), Buf);
+  llvm::copy(Sec.Data, Buf);
 }
 
 static const std::vector<uint8_t> ZlibGnuMagic = {'Z', 'L', 'I', 'B'};
@@ -269,7 +269,7 @@ template <class ELFT>
 void ELFSectionWriter<ELFT>::visit(const SectionIndexSection &Sec) {
   uint8_t *Buf = Out.getBufferStart() + Sec.Offset;
   auto *IndexesBuffer = reinterpret_cast<Elf_Word *>(Buf);
-  std::copy(std::begin(Sec.Indexes), std::end(Sec.Indexes), IndexesBuffer);
+  llvm::copy(Sec.Indexes, IndexesBuffer);
 }
 
 void SectionIndexSection::initialize(SectionTableRef SecTable) {
@@ -554,7 +554,7 @@ void RelocationSection::markSymbols() {
 }
 
 void SectionWriter::visit(const DynamicRelocationSection &Sec) {
-  std::copy(std::begin(Sec.Contents), std::end(Sec.Contents),
+  llvm::copy(Sec.Contents,
             Out.getBufferStart() + Sec.Offset);
 }
 
@@ -641,7 +641,7 @@ void ELFSectionWriter<ELFT>::visit(const GnuDebugLinkSection &Sec) {
   Elf_Word *CRC =
       reinterpret_cast<Elf_Word *>(Buf + Sec.Size - sizeof(Elf_Word));
   *CRC = Sec.CRC32;
-  std::copy(std::begin(Sec.FileName), std::end(Sec.FileName), File);
+  llvm::copy(Sec.FileName, File);
 }
 
 void GnuDebugLinkSection::accept(SectionVisitor &Visitor) const {
@@ -1540,10 +1540,10 @@ void BinaryWriter::finalize() {
   // loading and physical addresses are intended for ROM loading.
   // However, if no segment has a physical address, we'll fallback to using
   // virtual addresses for all.
-  if (std::all_of(std::begin(OrderedSegments), std::end(OrderedSegments),
-                  [](const Segment *Segment) { return Segment->PAddr == 0; }))
-    for (const auto &Segment : OrderedSegments)
-      Segment->PAddr = Segment->VAddr;
+  if (all_of(OrderedSegments,
+             [](const Segment *Seg) { return Seg->PAddr == 0; }))
+    for (Segment *Seg : OrderedSegments)
+      Seg->PAddr = Seg->VAddr;
 
   std::stable_sort(std::begin(OrderedSegments), std::end(OrderedSegments),
                    compareSegmentsByPAddr);
@@ -1558,8 +1558,8 @@ void BinaryWriter::finalize() {
   uint64_t Offset = 0;
 
   // Modify the first segment so that there is no gap at the start. This allows
-  // our layout algorithm to proceed as expected while not out writing out the
-  // gap at the start.
+  // our layout algorithm to proceed as expected while not writing out the gap
+  // at the start.
   if (!OrderedSegments.empty()) {
     auto Seg = OrderedSegments[0];
     auto Sec = Seg->firstSection();
diff --git a/tools/llvm-objcopy/ELF/Object.h b/tools/llvm-objcopy/ELF/Object.h
index 91ff1cddac17c6f3bccab3e0801f576ff7bbd9ee..4b8406511066958158699836816ff729c98f3fa1 100644
--- a/tools/llvm-objcopy/ELF/Object.h
+++ b/tools/llvm-objcopy/ELF/Object.h
@@ -254,21 +254,21 @@ private:
   };
 
   std::set<const SectionBase *, SectionCompare> Sections;
-  ArrayRef<uint8_t> Contents;
 
 public:
-  uint64_t Align;
-  uint64_t FileSize;
+  uint32_t Type;
   uint32_t Flags;
-  uint32_t Index;
-  uint64_t MemSize;
   uint64_t Offset;
-  uint64_t PAddr;
-  uint64_t Type;
   uint64_t VAddr;
+  uint64_t PAddr;
+  uint64_t FileSize;
+  uint64_t MemSize;
+  uint64_t Align;
 
+  uint32_t Index;
   uint64_t OriginalOffset;
   Segment *ParentSegment = nullptr;
+  ArrayRef<uint8_t> Contents;
 
   explicit Segment(ArrayRef<uint8_t> Data) : Contents(Data) {}
   Segment() {}
diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td
index 285ab9d69db29589d2621f2af63fd09fe4c7282f..1f7e64e4091c811f23a3fce36148288d624adef6 100644
--- a/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/tools/llvm-objcopy/ObjcopyOpts.td
@@ -76,10 +76,11 @@ defm rename_section
 defm redefine_symbol
     : Eq<"redefine-sym", "Change the name of a symbol old to new">,
       MetaVarName<"old=new">;
-defm keep : Eq<"keep", "Keep <section>">, MetaVarName<"section">;
-defm only_keep : Eq<"only-keep", "Remove all but <section>">,
-                 MetaVarName<"section">;
-def j : JoinedOrSeparate<["-"], "j">, Alias<only_keep>;
+defm keep_section : Eq<"keep-section", "Keep <section>">,
+                    MetaVarName<"section">;
+defm only_section : Eq<"only-section", "Remove all but <section>">,
+                    MetaVarName<"section">;
+def j : JoinedOrSeparate<["-"], "j">, Alias<only_section>;
 defm add_section
     : Eq<"add-section",
          "Make a section named <section> with the contents of <file>.">,
@@ -164,3 +165,16 @@ defm prefix_symbols
 
 def version : Flag<["-", "--"], "version">,
               HelpText<"Print the version and exit.">;
+def V : Flag<["-"], "V">, Alias<version>;
+defm build_id_link_dir
+    : Eq<"build-id-link-dir", "Set directory for --build-id-link-input and "
+                              "--build-id-link-output to <dir>">,
+      MetaVarName<"dir">;
+defm build_id_link_input
+    : Eq<"build-id-link-input", "Hard-link the input to <dir>/xx/xxx<suffix> "
+                                "name derived from hex build ID">,
+      MetaVarName<"suffix">;
+defm build_id_link_output
+    : Eq<"build-id-link-output", "Hard-link the output to <dir>/xx/xxx<suffix> "
+                                 "name derived from hex build ID">,
+      MetaVarName<"suffix">;
diff --git a/tools/llvm-objcopy/StripOpts.td b/tools/llvm-objcopy/StripOpts.td
index 3660148f88380ca4fdec73ab86e3b04d30843e5e..fa98e27e9321b41a244b6e5ec5a14ef70dad7b9e 100644
--- a/tools/llvm-objcopy/StripOpts.td
+++ b/tools/llvm-objcopy/StripOpts.td
@@ -51,7 +51,8 @@ defm remove_section : Eq<"remove-section", "Remove <section>">,
                       MetaVarName<"section">;
 def R : JoinedOrSeparate<["-"], "R">, Alias<remove_section>;
 
-defm keep : Eq<"keep", "Keep <section>">, MetaVarName<"section">;
+defm keep_section : Eq<"keep-section", "Keep <section>">,
+                    MetaVarName<"section">;
 defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
                    MetaVarName<"symbol">;
 def K : JoinedOrSeparate<["-"], "K">, Alias<keep_symbol>;
@@ -63,3 +64,4 @@ def x : Flag<["-"], "x">, Alias<discard_all>;
 
 def version : Flag<["-", "--"], "version">,
               HelpText<"Print the version and exit.">;
+def V : Flag<["-"], "V">, Alias<version>;
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index 755c786cee95e58968872a10e2cc541e1d1c5692..fb1ff18b015b153e8c9c72f5b898654e4edcf118 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm-objcopy.h"
 #include "Buffer.h"
+#include "COFF/COFFObjcopy.h"
 #include "CopyConfig.h"
 #include "ELF/ELFObjcopy.h"
 
@@ -19,6 +20,7 @@
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/ArchiveWriter.h"
 #include "llvm/Object/Binary.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ELFTypes.h"
 #include "llvm/Object/Error.h"
@@ -66,7 +68,7 @@ LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Error E) {
   assert(E);
   std::string Buf;
   raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(std::move(E), OS, "");
+  logAllUnhandledErrors(std::move(E), OS);
   OS.flush();
   WithColor::error(errs(), ToolName) << "'" << File << "': " << Buf;
   exit(1);
@@ -125,6 +127,8 @@ static void executeObjcopyOnBinary(const CopyConfig &Config, object::Binary &In,
                                    Buffer &Out) {
   if (auto *ELFBinary = dyn_cast<object::ELFObjectFileBase>(&In))
     return elf::executeObjcopyOnBinary(Config, *ELFBinary, Out);
+  else if (auto *COFFBinary = dyn_cast<object::COFFObjectFile>(&In))
+    return coff::executeObjcopyOnBinary(Config, *COFFBinary, Out);
   else
     error("Unsupported object file format");
 }
diff --git a/tools/llvm-objcopy/llvm-objcopy.h b/tools/llvm-objcopy/llvm-objcopy.h
index e222b65dc78fbbc7609f95ba7ad5f4f258b9bfcd..d8edf3e29ee097484e6f360e391287134d5548cf 100644
--- a/tools/llvm-objcopy/llvm-objcopy.h
+++ b/tools/llvm-objcopy/llvm-objcopy.h
@@ -31,7 +31,7 @@ template <class T> T unwrapOrError(Expected<T> EO) {
     return *EO;
   std::string Buf;
   raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(EO.takeError(), OS, "");
+  logAllUnhandledErrors(EO.takeError(), OS);
   OS.flush();
   error(Buf);
 }
diff --git a/tools/llvm-objdump/COFFDump.cpp b/tools/llvm-objdump/COFFDump.cpp
index 3503eda42f2a2389379b450d344c48d4a68dfceb..a7ee49535d6544982146a7ac20322b99db732794 100644
--- a/tools/llvm-objdump/COFFDump.cpp
+++ b/tools/llvm-objdump/COFFDump.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Win64EH.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -579,8 +580,9 @@ static void printRuntimeFunctionRels(const COFFObjectFile *Obj,
 
 void llvm::printCOFFUnwindInfo(const COFFObjectFile *Obj) {
   if (Obj->getMachine() != COFF::IMAGE_FILE_MACHINE_AMD64) {
-    errs() << "Unsupported image machine type "
-              "(currently only AMD64 is supported).\n";
+    WithColor::error(errs(), "llvm-objdump")
+        << "unsupported image machine type "
+           "(currently only AMD64 is supported).\n";
     return;
   }
 
diff --git a/tools/llvm-objdump/ELFDump.cpp b/tools/llvm-objdump/ELFDump.cpp
index f4d36656a6c427e6d63b9aa7f472f719c1133e43..b17a15a0d8fc9bd5066aee16935441558738f285 100644
--- a/tools/llvm-objdump/ELFDump.cpp
+++ b/tools/llvm-objdump/ELFDump.cpp
@@ -158,37 +158,23 @@ template <class ELFT> void printProgramHeaders(const ELFFile<ELFT> *o) {
 }
 
 void llvm::printELFFileHeader(const object::ObjectFile *Obj) {
-  // Little-endian 32-bit
-  if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
+  if (const auto *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
     printProgramHeaders(ELFObj->getELFFile());
-
-  // Big-endian 32-bit
-  if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
+  else if (const auto *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
     printProgramHeaders(ELFObj->getELFFile());
-
-  // Little-endian 64-bit
-  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
+  else if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
     printProgramHeaders(ELFObj->getELFFile());
-
-  // Big-endian 64-bit
-  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
+  else if (const auto *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
     printProgramHeaders(ELFObj->getELFFile());
 }
 
 void llvm::printELFDynamicSection(const object::ObjectFile *Obj) {
-  // Little-endian 32-bit
-  if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
+  if (const auto *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
     printDynamicSection(ELFObj->getELFFile(), Obj->getFileName());
-
-  // Big-endian 32-bit
-  if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
+  else if (const auto *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
     printDynamicSection(ELFObj->getELFFile(), Obj->getFileName());
-
-  // Little-endian 64-bit
-  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
+  else if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
     printDynamicSection(ELFObj->getELFFile(), Obj->getFileName());
-
-  // Big-endian 64-bit
-  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
+  else if (const auto *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
     printDynamicSection(ELFObj->getELFFile(), Obj->getFileName());
 }
diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index 9d72d0f4b6a0a800bf4c43bf6ace89cb0d6cbbe5..b5c0439c24c2916c4583d60275b5acd7b66cfd5d 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cstring>
@@ -166,7 +167,7 @@ static const Target *GetTarget(const MachOObjectFile *MachOObj,
   if (*ThumbTarget)
     return TheTarget;
 
-  errs() << "llvm-objdump: error: unable to get target for '";
+  WithColor::error(errs(), "llvm-objdump") << "unable to get target for '";
   if (!TheTarget)
     errs() << TripleName;
   else
@@ -483,7 +484,7 @@ static void PrintRType(const uint64_t cputype, const unsigned r_type) {
     "GOTLDP  ", "GOTLDPOF", "PTRTGOT ", "TLVLDP  ", "TLVLDPOF",
     "ADDEND  ", " 11 (?) ", " 12 (?) ", " 13 (?) ", " 14 (?) ", " 15 (?) "
   };
-  
+
   if (r_type > 0xf){
     outs() << format("%-7u", r_type) << " ";
     return;
@@ -552,7 +553,7 @@ static void PrintRelocationEntries(const MachOObjectFile *O,
   bool previous_arm_half = false;
   bool previous_sectdiff = false;
   uint32_t sectdiff_r_type = 0;
-  
+
   for (relocation_iterator Reloc = Begin; Reloc != End; ++Reloc) {
     const DataRefImpl Rel = Reloc->getRawDataRefImpl();
     const MachO::any_relocation_info RE = O->getRelocation(Rel);
@@ -567,7 +568,7 @@ static void PrintRelocationEntries(const MachOObjectFile *O,
                               O->getScatteredRelocationValue(RE) : 0);
     const unsigned r_symbolnum = (r_scattered ? 0 :
                                   O->getPlainRelocationSymbolNum(RE));
-    
+
     if (r_scattered && cputype != MachO::CPU_TYPE_X86_64) {
       if (verbose) {
         // scattered: address
@@ -578,20 +579,20 @@ static void PrintRelocationEntries(const MachOObjectFile *O,
           outs() << "         ";
         else
           outs() << format("%08x ", (unsigned int)r_address);
-        
+
         // scattered: pcrel
         if (r_pcrel)
           outs() << "True  ";
         else
           outs() << "False ";
-        
+
         // scattered: length
         PrintRLength(cputype, r_type, r_length, previous_arm_half);
-        
+
         // scattered: extern & type
         outs() << "n/a    ";
         PrintRType(cputype, r_type);
-        
+
         // scattered: scattered & value
         outs() << format("True      0x%08x", (unsigned int)r_value);
         if (previous_sectdiff == false) {
@@ -639,22 +640,22 @@ static void PrintRelocationEntries(const MachOObjectFile *O,
           outs() << "         ";
         else
           outs() << format("%08x ", (unsigned int)r_address);
-        
+
         // plain: pcrel
         if (r_pcrel)
           outs() << "True  ";
         else
           outs() << "False ";
-        
+
         // plain: length
         PrintRLength(cputype, r_type, r_length, previous_arm_half);
-        
+
         if (r_extern) {
           // plain: extern & type & scattered
           outs() << "True   ";
           PrintRType(cputype, r_type);
           outs() << "False     ";
-          
+
           // plain: symbolnum/value
           if (r_symbolnum > Symtab.nsyms)
             outs() << format("?(%d)\n", r_symbolnum);
@@ -675,7 +676,7 @@ static void PrintRelocationEntries(const MachOObjectFile *O,
           outs() << "False  ";
           PrintRType(cputype, r_type);
           outs() << "False     ";
-          
+
           // plain: symbolnum/value
           if (cputype == MachO::CPU_TYPE_ARM &&
                    r_type == llvm::MachO::ARM_RELOC_PAIR)
@@ -1559,7 +1560,8 @@ static bool checkMachOAndArchFlags(ObjectFile *O, StringRef Filename) {
   if (none_of(ArchFlags, [&](const std::string &Name) {
         return Name == ArchFlagName;
       })) {
-    errs() << "llvm-objdump: " + Filename + ": No architecture specified.\n";
+    WithColor::error(errs(), "llvm-objdump")
+        << Filename << ": no architecture specified.\n";
     return false;
   }
   return true;
@@ -1944,8 +1946,9 @@ static bool ValidateArchFlags() {
       ArchAll = true;
     } else {
       if (!MachOObjectFile::isValidArch(ArchFlags[i])) {
-        errs() << "llvm-objdump: Unknown architecture named '" + ArchFlags[i] +
-                      "'for the -arch option\n";
+        WithColor::error(errs(), "llvm-objdump")
+            << "unknown architecture named '" + ArchFlags[i] +
+                   "'for the -arch option\n";
         return false;
       }
     }
@@ -2005,8 +2008,9 @@ void llvm::ParseInputMachO(StringRef Filename) {
     if (MachOObjectFile *MachOOF = dyn_cast<MachOObjectFile>(&*O))
       ProcessMachO(Filename, MachOOF);
     else
-      errs() << "llvm-objdump: '" << Filename << "': "
-             << "Object is not a Mach-O file type.\n";
+      WithColor::error(errs(), "llvm-objdump")
+          << Filename << "': "
+          << "object is not a Mach-O file type.\n";
     return;
   }
   llvm_unreachable("Input object can't be invalid at this point");
@@ -2079,8 +2083,9 @@ void llvm::ParseInputMachO(MachOUniversalBinary *UB) {
         }
       }
       if (!ArchFound) {
-        errs() << "llvm-objdump: file: " + Filename + " does not contain "
-                << "architecture: " + ArchFlags[i] + "\n";
+        WithColor::error(errs(), "llvm-objdump")
+            << "file: " + Filename + " does not contain "
+            << "architecture: " + ArchFlags[i] + "\n";
         return;
       }
     }
@@ -5621,7 +5626,9 @@ static void print_image_info64(SectionRef S, struct DisassembleInfo *info) {
     else if(swift_version == 5)
       outs() << " Swift 4.0";
     else if(swift_version == 6)
-      outs() << " Swift 4.1";
+      outs() << " Swift 4.1/Swift 4.2";
+    else if(swift_version == 7)
+      outs() << " Swift 5 or later";
     else
       outs() << " unknown future Swift version (" << swift_version << ")";
   }
@@ -5672,7 +5679,9 @@ static void print_image_info32(SectionRef S, struct DisassembleInfo *info) {
     else if(swift_version == 5)
       outs() << " Swift 4.0";
     else if(swift_version == 6)
-      outs() << " Swift 4.1";
+      outs() << " Swift 4.1/Swift 4.2";
+    else if(swift_version == 7)
+      outs() << " Swift 5 or later";
     else
       outs() << " unknown future Swift version (" << swift_version << ")";
   }
@@ -6184,8 +6193,9 @@ static void PrintXarFilesSummary(const char *XarFilename, xar_t xar) {
 
   ScopedXarIter xi;
   if (!xi) {
-    errs() << "Can't obtain an xar iterator for xar archive "
-           << XarFilename << "\n";
+    WithColor::error(errs(), "llvm-objdump")
+        << "can't obtain an xar iterator for xar archive " << XarFilename
+        << "\n";
     return;
   }
 
@@ -6193,8 +6203,9 @@ static void PrintXarFilesSummary(const char *XarFilename, xar_t xar) {
   for (xf = xar_file_first(xar, xi); xf; xf = xar_file_next(xi)) {
     ScopedXarIter xp;
     if(!xp){
-      errs() << "Can't obtain an xar iterator for xar archive "
-             << XarFilename << "\n";
+      WithColor::error(errs(), "llvm-objdump")
+          << "can't obtain an xar iterator for xar archive " << XarFilename
+          << "\n";
       return;
     }
     type = nullptr;
@@ -6318,7 +6329,7 @@ static void DumpBitcodeSection(MachOObjectFile *O, const char *sect,
   std::error_code XarEC =
       sys::fs::createTemporaryFile("llvm-objdump", "xar", FD, XarFilename);
   if (XarEC) {
-    errs() << XarEC.message() << "\n";
+    WithColor::error(errs(), "llvm-objdump") << XarEC.message() << "\n";
     return;
   }
   ToolOutputFile XarFile(XarFilename, FD);
@@ -6331,7 +6342,8 @@ static void DumpBitcodeSection(MachOObjectFile *O, const char *sect,
 
   ScopedXarFile xar(XarFilename.c_str(), READ);
   if (!xar) {
-    errs() << "Can't create temporary xar archive " << XarFilename << "\n";
+    WithColor::error(errs(), "llvm-objdump")
+        << "can't create temporary xar archive " << XarFilename << "\n";
     return;
   }
 
@@ -6339,7 +6351,7 @@ static void DumpBitcodeSection(MachOObjectFile *O, const char *sect,
   std::error_code TocEC =
       sys::fs::createTemporaryFile("llvm-objdump", "toc", TocFilename);
   if (TocEC) {
-    errs() << TocEC.message() << "\n";
+    WithColor::error(errs(), "llvm-objdump") << TocEC.message() << "\n";
     return;
   }
   xar_serialize(xar, TocFilename.c_str());
@@ -6356,7 +6368,7 @@ static void DumpBitcodeSection(MachOObjectFile *O, const char *sect,
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
     MemoryBuffer::getFileOrSTDIN(TocFilename.c_str());
   if (std::error_code EC = FileOrErr.getError()) {
-    errs() << EC.message() << "\n";
+    WithColor::error(errs(), "llvm-objdump") << EC.message() << "\n";
     return;
   }
   std::unique_ptr<MemoryBuffer> &Buffer = FileOrErr.get();
@@ -6371,8 +6383,9 @@ static void DumpBitcodeSection(MachOObjectFile *O, const char *sect,
   // TODO: Go through the xar's files.
   ScopedXarIter xi;
   if(!xi){
-    errs() << "Can't obtain an xar iterator for xar archive "
-           << XarFilename.c_str() << "\n";
+    WithColor::error(errs(), "llvm-objdump")
+        << "can't obtain an xar iterator for xar archive "
+        << XarFilename.c_str() << "\n";
     return;
   }
   for(xar_file_t xf = xar_file_first(xar, xi); xf; xf = xar_file_next(xi)){
@@ -6382,8 +6395,9 @@ static void DumpBitcodeSection(MachOObjectFile *O, const char *sect,
 
     ScopedXarIter xp;
     if(!xp){
-      errs() << "Can't obtain an xar iterator for xar archive "
-             << XarFilename.c_str() << "\n";
+      WithColor::error(errs(), "llvm-objdump")
+          << "can't obtain an xar iterator for xar archive "
+          << XarFilename.c_str() << "\n";
       return;
     }
     member_name = NULL;
@@ -6860,8 +6874,8 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
   // IP->setCommentStream(CommentStream);
 
   if (!AsmInfo || !STI || !DisAsm || !IP) {
-    errs() << "error: couldn't initialize disassembler for target "
-           << TripleName << '\n';
+    WithColor::error(errs(), "llvm-objdump")
+        << "couldn't initialize disassembler for target " << TripleName << '\n';
     return;
   }
 
@@ -6902,8 +6916,9 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
   }
 
   if (ThumbTarget && (!ThumbAsmInfo || !ThumbSTI || !ThumbDisAsm || !ThumbIP)) {
-    errs() << "error: couldn't initialize disassembler for target "
-           << ThumbTripleName << '\n';
+    WithColor::error(errs(), "llvm-objdump")
+        << "couldn't initialize disassembler for target " << ThumbTripleName
+        << '\n';
     return;
   }
 
@@ -6956,7 +6971,8 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
       ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
           MemoryBuffer::getFileOrSTDIN(DSYMFile);
       if (std::error_code EC = BufOrErr.getError()) {
-        errs() << "llvm-objdump: " << Filename << ": " << EC.message() << '\n';
+        WithColor::error(errs(), "llvm-objdump")
+            << Filename << ": " << EC.message() << '\n';
         return;
       }
       Expected<std::unique_ptr<MachOObjectFile>> DbgObjCheck =
@@ -7243,7 +7259,8 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
             outs() << format("\t.short\t0x%04x\n", opcode);
             Size = 2;
           } else{
-            errs() << "llvm-objdump: warning: invalid instruction encoding\n";
+            WithColor::warning(errs(), "llvm-objdump")
+                << "invalid instruction encoding\n";
             if (Size == 0)
               Size = 1; // skip illegible bytes
           }
@@ -7290,7 +7307,8 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
                              *(Bytes.data() + Index) & 0xff);
             InstSize = 1; // skip exactly one illegible byte and move on.
           } else {
-            errs() << "llvm-objdump: warning: invalid instruction encoding\n";
+            WithColor::warning(errs(), "llvm-objdump")
+                << "invalid instruction encoding\n";
             if (InstSize == 0)
               InstSize = 1; // skip illegible bytes
           }
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 671e8a2c4f79243808e46bd81c62916e87278dcc..9a405c6b4665b78d8a22b632e2acd0d52a783e7a 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -59,6 +59,7 @@
 #include "llvm/Support/StringSaver.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cctype>
@@ -337,33 +338,35 @@ SectionFilter ToolSectionFilter(llvm::object::ObjectFile const &O) {
 void llvm::error(std::error_code EC) {
   if (!EC)
     return;
-
-  errs() << ToolName << ": error reading file: " << EC.message() << ".\n";
+  WithColor::error(errs(), ToolName)
+      << "reading file: " << EC.message() << ".\n";
   errs().flush();
   exit(1);
 }
 
 LLVM_ATTRIBUTE_NORETURN void llvm::error(Twine Message) {
-  errs() << ToolName << ": " << Message << ".\n";
+  WithColor::error(errs(), ToolName) << Message << ".\n";
   errs().flush();
   exit(1);
 }
 
 void llvm::warn(StringRef Message) {
-  errs() << ToolName << ": warning: " << Message << ".\n";
+  WithColor::warning(errs(), ToolName) << Message << ".\n";
   errs().flush();
 }
 
 LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef File,
                                                 Twine Message) {
-  errs() << ToolName << ": '" << File << "': " << Message << ".\n";
+  WithColor::error(errs(), ToolName)
+      << "'" << File << "': " << Message << ".\n";
   exit(1);
 }
 
 LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef File,
                                                 std::error_code EC) {
   assert(EC);
-  errs() << ToolName << ": '" << File << "': " << EC.message() << ".\n";
+  WithColor::error(errs(), ToolName)
+      << "'" << File << "': " << EC.message() << ".\n";
   exit(1);
 }
 
@@ -372,9 +375,9 @@ LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef File,
   assert(E);
   std::string Buf;
   raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(std::move(E), OS, "");
+  logAllUnhandledErrors(std::move(E), OS);
   OS.flush();
-  errs() << ToolName << ": '" << File << "': " << Buf;
+  WithColor::error(errs(), ToolName) << "'" << File << "': " << Buf;
   exit(1);
 }
 
@@ -383,7 +386,7 @@ LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef ArchiveName,
                                                 llvm::Error E,
                                                 StringRef ArchitectureName) {
   assert(E);
-  errs() << ToolName << ": ";
+  WithColor::error(errs(), ToolName);
   if (ArchiveName != "")
     errs() << ArchiveName << "(" << FileName << ")";
   else
@@ -392,7 +395,7 @@ LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef ArchiveName,
     errs() << " (for architecture " << ArchitectureName << ")";
   std::string Buf;
   raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(std::move(E), OS, "");
+  logAllUnhandledErrors(std::move(E), OS);
   OS.flush();
   errs() << ": " << Buf;
   exit(1);
@@ -1911,6 +1914,7 @@ void llvm::PrintSectionHeaders(const ObjectFile *Obj) {
                      (unsigned)Section.getIndex(), Name.str().c_str(), Size,
                      Address, Type.c_str());
   }
+  outs() << "\n";
 }
 
 void llvm::PrintSectionContents(const ObjectFile *Obj) {
@@ -2014,6 +2018,8 @@ void llvm::PrintSymbolTable(const ObjectFile *o, StringRef ArchiveName,
       FileFunc = 'f';
     else if (Type == SymbolRef::ST_Function)
       FileFunc = 'F';
+    else if (Type == SymbolRef::ST_Data)
+      FileFunc = 'O';
 
     const char *Fmt = o->getBytesInAddress() > 4 ? "%016" PRIx64 :
                                                    "%08" PRIx64;
@@ -2069,8 +2075,9 @@ static void PrintUnwindInfo(const ObjectFile *o) {
     printMachOUnwindInfo(MachO);
   else {
     // TODO: Extract DWARF dump tool to objdump.
-    errs() << "This operation is only currently supported "
-              "for COFF and MachO object files.\n";
+    WithColor::error(errs(), ToolName)
+        << "This operation is only currently supported "
+           "for COFF and MachO object files.\n";
     return;
   }
 }
@@ -2080,8 +2087,9 @@ void llvm::printExportsTrie(const ObjectFile *o) {
   if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOExportsTrie(MachO);
   else {
-    errs() << "This operation is only currently supported "
-              "for Mach-O executable files.\n";
+    WithColor::error(errs(), ToolName)
+        << "This operation is only currently supported "
+           "for Mach-O executable files.\n";
     return;
   }
 }
@@ -2091,8 +2099,9 @@ void llvm::printRebaseTable(ObjectFile *o) {
   if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachORebaseTable(MachO);
   else {
-    errs() << "This operation is only currently supported "
-              "for Mach-O executable files.\n";
+    WithColor::error(errs(), ToolName)
+        << "This operation is only currently supported "
+           "for Mach-O executable files.\n";
     return;
   }
 }
@@ -2102,8 +2111,9 @@ void llvm::printBindTable(ObjectFile *o) {
   if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOBindTable(MachO);
   else {
-    errs() << "This operation is only currently supported "
-              "for Mach-O executable files.\n";
+    WithColor::error(errs(), ToolName)
+        << "This operation is only currently supported "
+           "for Mach-O executable files.\n";
     return;
   }
 }
@@ -2113,8 +2123,9 @@ void llvm::printLazyBindTable(ObjectFile *o) {
   if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOLazyBindTable(MachO);
   else {
-    errs() << "This operation is only currently supported "
-              "for Mach-O executable files.\n";
+    WithColor::error(errs(), ToolName)
+        << "This operation is only currently supported "
+           "for Mach-O executable files.\n";
     return;
   }
 }
@@ -2124,8 +2135,9 @@ void llvm::printWeakBindTable(ObjectFile *o) {
   if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOWeakBindTable(MachO);
   else {
-    errs() << "This operation is only currently supported "
-              "for Mach-O executable files.\n";
+    WithColor::error(errs(), ToolName)
+        << "This operation is only currently supported "
+           "for Mach-O executable files.\n";
     return;
   }
 }
@@ -2134,10 +2146,11 @@ void llvm::printWeakBindTable(ObjectFile *o) {
 /// into llvm-bcanalyzer.
 void llvm::printRawClangAST(const ObjectFile *Obj) {
   if (outs().is_displayed()) {
-    errs() << "The -raw-clang-ast option will dump the raw binary contents of "
-              "the clang ast section.\n"
-              "Please redirect the output to a file or another program such as "
-              "llvm-bcanalyzer.\n";
+    WithColor::error(errs(), ToolName)
+        << "The -raw-clang-ast option will dump the raw binary contents of "
+           "the clang ast section.\n"
+           "Please redirect the output to a file or another program such as "
+           "llvm-bcanalyzer.\n";
     return;
   }
 
@@ -2171,8 +2184,9 @@ static void printFaultMaps(const ObjectFile *Obj) {
   } else if (isa<MachOObjectFile>(Obj)) {
     FaultMapSectionName = "__llvm_faultmaps";
   } else {
-    errs() << "This operation is only currently supported "
-              "for ELF and Mach-O executable files.\n";
+    WithColor::error(errs(), ToolName)
+        << "This operation is only currently supported "
+           "for ELF and Mach-O executable files.\n";
     return;
   }
 
@@ -2241,7 +2255,7 @@ static void printFileHeaders(const ObjectFile *o) {
 static void printArchiveChild(StringRef Filename, const Archive::Child &C) {
   Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
   if (!ModeOrErr) {
-    errs() << "ill-formed archive entry.\n";
+    WithColor::error(errs(), ToolName) << "ill-formed archive entry.\n";
     consumeError(ModeOrErr.takeError());
     return;
   }
diff --git a/tools/llvm-pdbutil/Analyze.cpp b/tools/llvm-pdbutil/Analyze.cpp
deleted file mode 100644
index 974ab49d9440ae09675c987a5dd5958eb626726b..0000000000000000000000000000000000000000
--- a/tools/llvm-pdbutil/Analyze.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-//===- Analyze.cpp - PDB analysis functions ---------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Analyze.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Native/RawError.h"
-#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
-
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <list>
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::pdb;
-
-static StringRef getLeafTypeName(TypeLeafKind LT) {
-  switch (LT) {
-#define TYPE_RECORD(ename, value, name)                                        \
-  case ename:                                                                  \
-    return #name;
-#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
-  default:
-    break;
-  }
-  return "UnknownLeaf";
-}
-
-namespace {
-struct HashLookupVisitor : public TypeVisitorCallbacks {
-  struct Entry {
-    TypeIndex TI;
-    CVType Record;
-  };
-
-  explicit HashLookupVisitor(TpiStream &Tpi) : Tpi(Tpi) {}
-
-  Error visitTypeBegin(CVType &Record) override {
-    uint32_t H = Tpi.getHashValues()[I];
-    Record.Hash = H;
-    TypeIndex TI(I + TypeIndex::FirstNonSimpleIndex);
-    Lookup[H].push_back(Entry{TI, Record});
-    ++I;
-    return Error::success();
-  }
-
-  uint32_t I = 0;
-  DenseMap<uint32_t, std::list<Entry>> Lookup;
-  TpiStream &Tpi;
-};
-}
-
-AnalysisStyle::AnalysisStyle(PDBFile &File) : File(File) {}
-
-Error AnalysisStyle::dump() {
-  auto Tpi = File.getPDBTpiStream();
-  if (!Tpi)
-    return Tpi.takeError();
-
-  HashLookupVisitor Hasher(*Tpi);
-
-  uint32_t RecordCount = Tpi->getNumTypeRecords();
-  auto Offsets = Tpi->getTypeIndexOffsets();
-  auto Types = llvm::make_unique<LazyRandomTypeCollection>(
-      Tpi->typeArray(), RecordCount, Offsets);
-
-  if (auto EC = codeview::visitTypeStream(*Types, Hasher))
-    return EC;
-
-  auto &Adjusters = Tpi->getHashAdjusters();
-  DenseSet<uint32_t> AdjusterSet;
-  for (const auto &Adj : Adjusters) {
-    assert(AdjusterSet.find(Adj.second) == AdjusterSet.end());
-    AdjusterSet.insert(Adj.second);
-  }
-
-  uint32_t Count = 0;
-  outs() << "Searching for hash collisions\n";
-  for (const auto &H : Hasher.Lookup) {
-    if (H.second.size() <= 1)
-      continue;
-    ++Count;
-    outs() << formatv("Hash: {0}, Count: {1} records\n", H.first,
-                      H.second.size());
-    for (const auto &R : H.second) {
-      auto Iter = AdjusterSet.find(R.TI.getIndex());
-      StringRef Prefix;
-      if (Iter != AdjusterSet.end()) {
-        Prefix = "[HEAD]";
-        AdjusterSet.erase(Iter);
-      }
-      StringRef LeafName = getLeafTypeName(R.Record.Type);
-      uint32_t TI = R.TI.getIndex();
-      StringRef TypeName = Types->getTypeName(R.TI);
-      outs() << formatv("{0,-6} {1} ({2:x}) {3}\n", Prefix, LeafName, TI,
-                        TypeName);
-    }
-  }
-
-  outs() << "\n";
-  outs() << "Dumping hash adjustment chains\n";
-  for (const auto &A : Tpi->getHashAdjusters()) {
-    TypeIndex TI(A.second);
-    StringRef TypeName = Types->getTypeName(TI);
-    const CVType &HeadRecord = Types->getType(TI);
-    assert(HeadRecord.Hash.hasValue());
-
-    auto CollisionsIter = Hasher.Lookup.find(*HeadRecord.Hash);
-    if (CollisionsIter == Hasher.Lookup.end())
-      continue;
-
-    const auto &Collisions = CollisionsIter->second;
-    outs() << TypeName << "\n";
-    outs() << formatv("    [HEAD] {0:x} {1} {2}\n", uint32_t(A.second),
-                      getLeafTypeName(HeadRecord.Type), TypeName);
-    for (const auto &Chain : Collisions) {
-      if (Chain.TI == TI)
-        continue;
-      const CVType &TailRecord = Types->getType(Chain.TI);
-      outs() << formatv("           {0:x} {1} {2}\n", Chain.TI.getIndex(),
-                        getLeafTypeName(TailRecord.Type),
-                        Types->getTypeName(Chain.TI));
-    }
-  }
-  outs() << formatv("There are {0} orphaned hash adjusters\n",
-                    AdjusterSet.size());
-  for (const auto &Adj : AdjusterSet) {
-    outs() << formatv("    {0}\n", Adj);
-  }
-
-  uint32_t DistinctHashValues = Hasher.Lookup.size();
-  outs() << formatv("{0}/{1} hash collisions", Count, DistinctHashValues);
-  return Error::success();
-}
diff --git a/tools/llvm-pdbutil/Analyze.h b/tools/llvm-pdbutil/Analyze.h
deleted file mode 100644
index 7230ae45b0c8cef3d06a41494b8e49e70be69eb8..0000000000000000000000000000000000000000
--- a/tools/llvm-pdbutil/Analyze.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- Analyze.h - PDB analysis functions -----------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVMPDBDUMP_ANALYSIS_H
-#define LLVM_TOOLS_LLVMPDBDUMP_ANALYSIS_H
-
-#include "OutputStyle.h"
-
-namespace llvm {
-namespace pdb {
-class PDBFile;
-class AnalysisStyle : public OutputStyle {
-public:
-  explicit AnalysisStyle(PDBFile &File);
-
-  Error dump() override;
-
-private:
-  PDBFile &File;
-};
-}
-}
-
-#endif
diff --git a/tools/llvm-pdbutil/CMakeLists.txt b/tools/llvm-pdbutil/CMakeLists.txt
index 1ccbfdfc4df4768d25c531d8a00ba75c7b023e51..e403d54eef5871974f115f93a858bf78defcbaaf 100644
--- a/tools/llvm-pdbutil/CMakeLists.txt
+++ b/tools/llvm-pdbutil/CMakeLists.txt
@@ -9,7 +9,6 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_llvm_tool(llvm-pdbutil
-  Analyze.cpp
   BytesOutputStyle.cpp
   DumpOutputStyle.cpp
   ExplainOutputStyle.cpp
diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 215bfbeb2060c63d6cbfeaf3d6534f87a267e093..76f61a2a95a79ae7a2513075856dfdea1ebc40fe 100644
--- a/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -13,7 +13,6 @@
 
 #include "llvm-pdbutil.h"
 
-#include "Analyze.h"
 #include "BytesOutputStyle.h"
 #include "DumpOutputStyle.h"
 #include "ExplainOutputStyle.h"
@@ -117,10 +116,6 @@ cl::SubCommand
     PdbToYamlSubcommand("pdb2yaml",
                         "Generate a detailed YAML description of a PDB File");
 
-cl::SubCommand
-    AnalyzeSubcommand("analyze",
-                      "Analyze various aspects of a PDB's structure");
-
 cl::SubCommand MergeSubcommand("merge",
                                "Merge multiple PDBs into a single PDB");
 
@@ -686,14 +681,6 @@ cl::list<std::string> InputFilename(cl::Positional,
                                     cl::sub(PdbToYamlSubcommand));
 } // namespace pdb2yaml
 
-namespace analyze {
-cl::opt<bool> StringTable("hash-collisions", cl::desc("Find hash collisions"),
-                          cl::sub(AnalyzeSubcommand), cl::init(false));
-cl::list<std::string> InputFilename(cl::Positional,
-                                    cl::desc("<input PDB file>"), cl::Required,
-                                    cl::sub(AnalyzeSubcommand));
-}
-
 namespace merge {
 cl::list<std::string> InputFilenames(cl::Positional,
                                      cl::desc("<input PDB files>"),
@@ -891,14 +878,6 @@ static void dumpBytes(StringRef Path) {
   ExitOnErr(O->dump());
 }
 
-static void dumpAnalysis(StringRef Path) {
-  std::unique_ptr<IPDBSession> Session;
-  auto &File = loadPDB(Path, Session);
-  auto O = llvm::make_unique<AnalysisStyle>(File);
-
-  ExitOnErr(O->dump());
-}
-
 bool opts::pretty::shouldDumpSymLevel(SymLevel Search) {
   if (SymTypes.empty())
     return true;
@@ -1526,8 +1505,6 @@ int main(int Argc, const char **Argv) {
       opts::yaml2pdb::YamlPdbOutputFile = OutputFilename.str();
     }
     yamlToPdb(opts::yaml2pdb::InputFilename);
-  } else if (opts::AnalyzeSubcommand) {
-    dumpAnalysis(opts::analyze::InputFilename.front());
   } else if (opts::DiaDumpSubcommand) {
     llvm::for_each(opts::diadump::InputFilenames, dumpDia);
   } else if (opts::PrettySubcommand) {
diff --git a/tools/llvm-rc/Opts.td b/tools/llvm-rc/Opts.td
index 11f40f57103775741ffcc07140a82074b5cf14c9..3ff5ac2d4980f335d877eba11d3602e0588838b7 100644
--- a/tools/llvm-rc/Opts.td
+++ b/tools/llvm-rc/Opts.td
@@ -4,7 +4,7 @@ include "llvm/Option/OptParser.td"
 // These options seem to be important for the tool
 // and should be implemented.
 
-def FILEOUT : Separate<[ "/", "-" ], "FO">,
+def FILEOUT : JoinedOrSeparate<[ "/", "-" ], "FO">,
               HelpText<"Change the output file location.">;
 
 def DEFINE : Separate<[ "/", "-" ], "D">,
diff --git a/tools/llvm-rc/ResourceFileWriter.cpp b/tools/llvm-rc/ResourceFileWriter.cpp
index 4b561940d2ec04fb99365850f882fb9a3d1056c2..7fe95669083bf65386c68eef013eedc0553c8f2a 100644
--- a/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/tools/llvm-rc/ResourceFileWriter.cpp
@@ -514,6 +514,11 @@ Error ResourceFileWriter::visitCharacteristicsStmt(
   return Error::success();
 }
 
+Error ResourceFileWriter::visitExStyleStmt(const ExStyleStmt *Stmt) {
+  ObjectData.ExStyle = Stmt->Value;
+  return Error::success();
+}
+
 Error ResourceFileWriter::visitFontStmt(const FontStmt *Stmt) {
   RETURN_IF_ERROR(checkNumberFits<uint16_t>(Stmt->Size, "Font size"));
   RETURN_IF_ERROR(checkNumberFits<uint16_t>(Stmt->Weight, "Font weight"));
@@ -982,7 +987,8 @@ Error ResourceFileWriter::writeSingleDialogControl(const Control &Ctl,
   padStream(sizeof(uint32_t));
 
   auto TypeInfo = Control::SupportedCtls.lookup(Ctl.Type);
-  uint32_t CtlStyle = TypeInfo.Style | Ctl.Style.getValueOr(0);
+  IntWithNotMask CtlStyle(TypeInfo.Style);
+  CtlStyle |= Ctl.Style.getValueOr(RCInt(0));
   uint32_t CtlExtStyle = Ctl.ExtStyle.getValueOr(0);
 
   // DIALOG(EX) item header prefix.
@@ -990,7 +996,7 @@ Error ResourceFileWriter::writeSingleDialogControl(const Control &Ctl,
     struct {
       ulittle32_t Style;
       ulittle32_t ExtStyle;
-    } Prefix{ulittle32_t(CtlStyle), ulittle32_t(CtlExtStyle)};
+    } Prefix{ulittle32_t(CtlStyle.getValue()), ulittle32_t(CtlExtStyle)};
     writeObject(Prefix);
   } else {
     struct {
@@ -998,7 +1004,7 @@ Error ResourceFileWriter::writeSingleDialogControl(const Control &Ctl,
       ulittle32_t ExtStyle;
       ulittle32_t Style;
     } Prefix{ulittle32_t(Ctl.HelpID.getValueOr(0)), ulittle32_t(CtlExtStyle),
-             ulittle32_t(CtlStyle)};
+             ulittle32_t(CtlStyle.getValue())};
     writeObject(Prefix);
   }
 
@@ -1065,6 +1071,7 @@ Error ResourceFileWriter::writeDialogBody(const RCResource *Base) {
     UsedStyle |= StyleCaptionFlag;
 
   const uint16_t DialogExMagic = 0xFFFF;
+  uint32_t ExStyle = ObjectData.ExStyle.getValueOr(0);
 
   // Write DIALOG(EX) header prefix. These are pretty different.
   if (!Res->IsExtended) {
@@ -1083,7 +1090,7 @@ Error ResourceFileWriter::writeDialogBody(const RCResource *Base) {
       ulittle32_t Style;
       ulittle32_t ExtStyle;
     } Prefix{ulittle32_t(UsedStyle),
-             ulittle32_t(0)}; // As of now, we don't keep EXSTYLE.
+             ulittle32_t(ExStyle)};
 
     writeObject(Prefix);
   } else {
@@ -1094,7 +1101,7 @@ Error ResourceFileWriter::writeDialogBody(const RCResource *Base) {
       ulittle32_t ExtStyle;
       ulittle32_t Style;
     } Prefix{ulittle16_t(1), ulittle16_t(DialogExMagic),
-             ulittle32_t(Res->HelpID), ulittle32_t(0), ulittle32_t(UsedStyle)};
+             ulittle32_t(Res->HelpID), ulittle32_t(ExStyle), ulittle32_t(UsedStyle)};
 
     writeObject(Prefix);
   }
@@ -1502,6 +1509,10 @@ ResourceFileWriter::loadFile(StringRef File) const {
   SmallString<128> Cwd;
   std::unique_ptr<MemoryBuffer> Result;
 
+  // 0. The file path is absolute and the file exists.
+  if (sys::path::is_absolute(File))
+    return errorOrToExpected(MemoryBuffer::getFile(File, -1, false));
+
   // 1. The current working directory.
   sys::fs::current_path(Cwd);
   Path.assign(Cwd.begin(), Cwd.end());
@@ -1510,8 +1521,7 @@ ResourceFileWriter::loadFile(StringRef File) const {
     return errorOrToExpected(MemoryBuffer::getFile(Path, -1, false));
 
   // 2. The directory of the input resource file, if it is different from the
-  // current
-  //    working directory.
+  // current working directory.
   StringRef InputFileDir = sys::path::parent_path(Params.InputFilePath);
   Path.assign(InputFileDir.begin(), InputFileDir.end());
   sys::path::append(Path, File);
diff --git a/tools/llvm-rc/ResourceFileWriter.h b/tools/llvm-rc/ResourceFileWriter.h
index 4c31d018bde489b7d727766297b6a016102b1779..36d57da11eb6105acbf8b76d4047734bb3d1cf95 100644
--- a/tools/llvm-rc/ResourceFileWriter.h
+++ b/tools/llvm-rc/ResourceFileWriter.h
@@ -63,6 +63,7 @@ public:
   Error visitCaptionStmt(const CaptionStmt *) override;
   Error visitCharacteristicsStmt(const CharacteristicsStmt *) override;
   Error visitClassStmt(const ClassStmt *) override;
+  Error visitExStyleStmt(const ExStyleStmt *) override;
   Error visitFontStmt(const FontStmt *) override;
   Error visitLanguageStmt(const LanguageResource *) override;
   Error visitStyleStmt(const StyleStmt *) override;
@@ -80,6 +81,7 @@ public:
     uint32_t VersionInfo;
 
     Optional<uint32_t> Style;
+    Optional<uint32_t> ExStyle;
     StringRef Caption;
     struct FontInfo {
       uint32_t Size;
diff --git a/tools/llvm-rc/ResourceScriptParser.cpp b/tools/llvm-rc/ResourceScriptParser.cpp
index 8cc0b50933c266ba3f448fab1127a570fa3e6482..c66fc4fc2e70c3516f2fea4c2dd1c11448edc171 100644
--- a/tools/llvm-rc/ResourceScriptParser.cpp
+++ b/tools/llvm-rc/ResourceScriptParser.cpp
@@ -114,16 +114,23 @@ void RCParser::consume() {
 
 // An integer description might consist of a single integer or
 // an arithmetic expression evaluating to the integer. The expressions
-// can contain the following tokens: <int> ( ) + - | & ~. Their meaning
-// is the same as in C++.
+// can contain the following tokens: <int> ( ) + - | & ~ not. Their meaning
+// is the same as in C++ except for 'not' expression.
 // The operators in the original RC implementation have the following
 // precedence:
-//   1) Unary operators (- ~),
+//   1) Unary operators (- ~ not),
 //   2) Binary operators (+ - & |), with no precedence.
 //
+// 'not' expression is mostly useful for style values. It evaluates to 0,
+// but value given to the operator is stored separately from integer value.
+// It's mostly useful for control style expressions and causes bits from
+// default control style to be excluded from generated style. For binary
+// operators the mask from the right operand is applied to the left operand
+// and masks from both operands are combined in operator result.
+//
 // The following grammar is used to parse the expressions Exp1:
 //   Exp1 ::= Exp2 || Exp1 + Exp2 || Exp1 - Exp2 || Exp1 | Exp2 || Exp1 & Exp2
-//   Exp2 ::= -Exp2 || ~Exp2 || Int || (Exp1).
+//   Exp2 ::= -Exp2 || ~Exp2 || not Expr2 || Int || (Exp1).
 // (More conveniently, Exp1 is a non-empty sequence of Exp2 expressions,
 // separated by binary operators.)
 //
@@ -139,12 +146,15 @@ void RCParser::consume() {
 //    1 => 01 00, -1 => ff ff, --1 => 01 00, ---1 => ff ff;
 //    1 => 01 00, ~1 => fe ff, ~~1 => 01 00, ~~~1 => fe ff.
 
-Expected<RCInt> RCParser::readInt() { return parseIntExpr1(); }
+Expected<RCInt> RCParser::readInt() {
+  ASSIGN_OR_RETURN(Value, parseIntExpr1());
+  return (*Value).getValue();
+}
 
-Expected<RCInt> RCParser::parseIntExpr1() {
+Expected<IntWithNotMask> RCParser::parseIntExpr1() {
   // Exp1 ::= Exp2 || Exp1 + Exp2 || Exp1 - Exp2 || Exp1 | Exp2 || Exp1 & Exp2.
   ASSIGN_OR_RETURN(FirstResult, parseIntExpr2());
-  RCInt Result = *FirstResult;
+  IntWithNotMask Result = *FirstResult;
 
   while (!isEof() && look().isBinaryOp()) {
     auto OpToken = read();
@@ -175,8 +185,8 @@ Expected<RCInt> RCParser::parseIntExpr1() {
   return Result;
 }
 
-Expected<RCInt> RCParser::parseIntExpr2() {
-  // Exp2 ::= -Exp2 || ~Exp2 || Int || (Exp1).
+Expected<IntWithNotMask> RCParser::parseIntExpr2() {
+  // Exp2 ::= -Exp2 || ~Exp2 || not Expr2 || Int || (Exp1).
   static const char ErrorMsg[] = "'-', '~', integer or '('";
 
   if (isEof())
@@ -205,6 +215,13 @@ Expected<RCInt> RCParser::parseIntExpr2() {
     return *Result;
   }
 
+  case Kind::Identifier: {
+    if (!read().value().equals_lower("not"))
+      return getExpectedError(ErrorMsg, true);
+    ASSIGN_OR_RETURN(Result, parseIntExpr2());
+    return IntWithNotMask(0, (*Result).getValue());
+  }
+
   default:
     return getExpectedError(ErrorMsg);
   }
@@ -388,6 +405,8 @@ RCParser::parseSingleOptionalStatement(OptStmtType StmtsType) {
       return parseCaptionStmt();
     if (TypeToken->equals_lower("CLASS"))
       return parseClassStmt();
+    if (TypeToken->equals_lower("EXSTYLE"))
+      return parseExStyleStmt();
     if (TypeToken->equals_lower("FONT"))
       return parseFontStmt(StmtsType);
     if (TypeToken->equals_lower("STYLE"))
@@ -537,13 +556,13 @@ Expected<Control> RCParser::parseControl() {
   RETURN_IF_ERROR(consumeType(Kind::Comma));
 
   IntOrString Class;
-  Optional<uint32_t> Style;
+  Optional<IntWithNotMask> Style;
   if (ClassUpper == "CONTROL") {
     // CONTROL text, id, class, style, x, y, width, height [, exstyle] [, helpID]
     ASSIGN_OR_RETURN(ClassStr, readString());
     RETURN_IF_ERROR(consumeType(Kind::Comma));
     Class = *ClassStr;
-    ASSIGN_OR_RETURN(StyleVal, readInt());
+    ASSIGN_OR_RETURN(StyleVal, parseIntExpr1());
     RETURN_IF_ERROR(consumeType(Kind::Comma));
     Style = *StyleVal;
   } else {
@@ -555,7 +574,7 @@ Expected<Control> RCParser::parseControl() {
 
   if (ClassUpper != "CONTROL") {
     if (consumeOptionalType(Kind::Comma)) {
-      ASSIGN_OR_RETURN(Val, readInt());
+      ASSIGN_OR_RETURN(Val, parseIntExpr1());
       Style = *Val;
     }
   }
@@ -817,6 +836,11 @@ RCParser::ParseOptionType RCParser::parseStyleStmt() {
   return llvm::make_unique<StyleStmt>(*Arg);
 }
 
+RCParser::ParseOptionType RCParser::parseExStyleStmt() {
+  ASSIGN_OR_RETURN(Arg, readInt());
+  return llvm::make_unique<ExStyleStmt>(*Arg);
+}
+
 Error RCParser::getExpectedError(const Twine &Message, bool IsAlreadyRead) {
   return make_error<ParserError>(
       Message, IsAlreadyRead ? std::prev(CurLoc) : CurLoc, End);
diff --git a/tools/llvm-rc/ResourceScriptParser.h b/tools/llvm-rc/ResourceScriptParser.h
index 3dd110f7e384160a6ce4145ae77861358ff874b1..20d8ff9624d9f21f6c18130bab9e26818d800c46 100644
--- a/tools/llvm-rc/ResourceScriptParser.h
+++ b/tools/llvm-rc/ResourceScriptParser.h
@@ -89,8 +89,8 @@ private:
   Expected<IntOrString> readTypeOrName();  // Parse an integer or an identifier.
 
   // Helper integer expression parsing methods.
-  Expected<RCInt> parseIntExpr1();
-  Expected<RCInt> parseIntExpr2();
+  Expected<IntWithNotMask> parseIntExpr1();
+  Expected<IntWithNotMask> parseIntExpr2();
 
   // Advance the state by one, discarding the current token.
   // If the discarded token had an incorrect type, fail.
@@ -172,6 +172,7 @@ private:
   ParseOptionType parseVersionStmt();
   ParseOptionType parseCaptionStmt();
   ParseOptionType parseClassStmt();
+  ParseOptionType parseExStyleStmt();
   ParseOptionType parseFontStmt(OptStmtType DialogType);
   ParseOptionType parseStyleStmt();
 
diff --git a/tools/llvm-rc/ResourceScriptStmt.cpp b/tools/llvm-rc/ResourceScriptStmt.cpp
index 728c24b36693ee2655fc0a7e211fb8284bc00199..7b8b0def4c9a955ae95abb1dac3046870e203261 100644
--- a/tools/llvm-rc/ResourceScriptStmt.cpp
+++ b/tools/llvm-rc/ResourceScriptStmt.cpp
@@ -151,7 +151,7 @@ raw_ostream &Control::log(raw_ostream &OS) const {
      << ", loc: (" << X << ", " << Y << "), size: [" << Width << ", " << Height
      << "]";
   if (Style)
-    OS << ", style: " << *Style;
+    OS << ", style: " << (*Style).getValue();
   if (ExtStyle)
     OS << ", ext. style: " << *ExtStyle;
   if (HelpID)
@@ -283,5 +283,9 @@ raw_ostream &StyleStmt::log(raw_ostream &OS) const {
   return OS << "Style: " << Value << "\n";
 }
 
+raw_ostream &ExStyleStmt::log(raw_ostream &OS) const {
+  return OS << "ExStyle: " << Value << "\n";
+}
+
 } // namespace rc
 } // namespace llvm
diff --git a/tools/llvm-rc/ResourceScriptStmt.h b/tools/llvm-rc/ResourceScriptStmt.h
index 2071ac6a9a3ad75c84e393108d00aa85de41cb29..3ba1f6619eb8d2a7c0329034bb3b78a5e665d43c 100644
--- a/tools/llvm-rc/ResourceScriptStmt.h
+++ b/tools/llvm-rc/ResourceScriptStmt.h
@@ -67,6 +67,59 @@ public:
   }
 };
 
+class IntWithNotMask {
+private:
+  RCInt Value;
+  int32_t NotMask;
+
+public:
+  IntWithNotMask() : IntWithNotMask(RCInt(0)) {}
+  IntWithNotMask(RCInt Value, int32_t NotMask = 0) : Value(Value), NotMask(NotMask) {}
+
+  RCInt getValue() const {
+    return Value;
+  }
+
+  uint32_t getNotMask() const {
+    return NotMask;
+  }
+
+  IntWithNotMask &operator+=(const IntWithNotMask &Rhs) {
+    Value &= ~Rhs.NotMask;
+    Value += Rhs.Value;
+    NotMask |= Rhs.NotMask;
+    return *this;
+  }
+
+  IntWithNotMask &operator-=(const IntWithNotMask &Rhs) {
+    Value &= ~Rhs.NotMask;
+    Value -= Rhs.Value;
+    NotMask |= Rhs.NotMask;
+    return *this;
+  }
+
+  IntWithNotMask &operator|=(const IntWithNotMask &Rhs) {
+    Value &= ~Rhs.NotMask;
+    Value |= Rhs.Value;
+    NotMask |= Rhs.NotMask;
+    return *this;
+  }
+
+  IntWithNotMask &operator&=(const IntWithNotMask &Rhs) {
+    Value &= ~Rhs.NotMask;
+    Value &= Rhs.Value;
+    NotMask |= Rhs.NotMask;
+    return *this;
+  }
+
+  IntWithNotMask operator-() const { return {-Value, NotMask}; }
+  IntWithNotMask operator~() const { return {~Value, 0}; }
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const IntWithNotMask &Int) {
+    return OS << Int.Value;
+  }
+};
+
 // A class holding a name - either an integer or a reference to the string.
 class IntOrString {
 private:
@@ -556,7 +609,8 @@ public:
   StringRef Type;
   IntOrString Title;
   uint32_t ID, X, Y, Width, Height;
-  Optional<uint32_t> Style, ExtStyle, HelpID;
+  Optional<IntWithNotMask> Style;
+  Optional<uint32_t> ExtStyle, HelpID;
   IntOrString Class;
 
   // Control classes as described in DLGITEMTEMPLATEEX documentation.
@@ -580,7 +634,7 @@ public:
 
   Control(StringRef CtlType, IntOrString CtlTitle, uint32_t CtlID,
           uint32_t PosX, uint32_t PosY, uint32_t ItemWidth, uint32_t ItemHeight,
-          Optional<uint32_t> ItemStyle, Optional<uint32_t> ExtItemStyle,
+          Optional<IntWithNotMask> ItemStyle, Optional<uint32_t> ExtItemStyle,
           Optional<uint32_t> CtlHelpID, IntOrString CtlClass)
       : Type(CtlType), Title(CtlTitle), ID(CtlID), X(PosX), Y(PosY),
         Width(ItemWidth), Height(ItemHeight), Style(ItemStyle),
@@ -866,6 +920,19 @@ public:
   Error visit(Visitor *V) const override { return V->visitStyleStmt(this); }
 };
 
+// EXSTYLE optional statement.
+//
+// Ref: docs.microsoft.com/en-us/windows/desktop/menurc/exstyle-statement
+class ExStyleStmt : public OptionalStmt {
+public:
+  uint32_t Value;
+
+  ExStyleStmt(uint32_t ExStyle) : Value(ExStyle) {}
+  raw_ostream &log(raw_ostream &) const override;
+  Twine getResourceTypeName() const override { return "EXSTYLE"; }
+  Error visit(Visitor *V) const override { return V->visitExStyleStmt(this); }
+};
+
 // CLASS optional statement.
 //
 // Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380883(v=vs.85).aspx
diff --git a/tools/llvm-rc/ResourceVisitor.h b/tools/llvm-rc/ResourceVisitor.h
index 53ef3c5cf7df3f48f2c5be2e3254c72d155422a7..2117b12d8f5f102527e2447d2013f8f7acb9c4ee 100644
--- a/tools/llvm-rc/ResourceVisitor.h
+++ b/tools/llvm-rc/ResourceVisitor.h
@@ -24,6 +24,7 @@ class RCResource;
 class CaptionStmt;
 class ClassStmt;
 class CharacteristicsStmt;
+class ExStyleStmt;
 class FontStmt;
 class LanguageResource;
 class StyleStmt;
@@ -46,6 +47,7 @@ public:
   virtual Error visitCaptionStmt(const CaptionStmt *) = 0;
   virtual Error visitClassStmt(const ClassStmt *) = 0;
   virtual Error visitCharacteristicsStmt(const CharacteristicsStmt *) = 0;
+  virtual Error visitExStyleStmt(const ExStyleStmt *) = 0;
   virtual Error visitFontStmt(const FontStmt *) = 0;
   virtual Error visitLanguageStmt(const LanguageResource *) = 0;
   virtual Error visitStyleStmt(const StyleStmt *) = 0;
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp
index eb575894db587bc6e630bc8cd07bc5d739d26b4f..4b823b816c35a1d67f35ab74905d1b10bc9ceab7 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -903,7 +903,7 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
     if (!Name) {
       std::string Buf;
       llvm::raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(Name.takeError(), OS, "");
+      logAllUnhandledErrors(Name.takeError(), OS);
       OS.flush();
       report_fatal_error(Buf);
     }
@@ -942,7 +942,7 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
     if (!FunctionNameOrErr) {
       std::string Buf;
       llvm::raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(FunctionNameOrErr.takeError(), OS, "");
+      logAllUnhandledErrors(FunctionNameOrErr.takeError(), OS);
       OS.flush();
       report_fatal_error(Buf);
     }
@@ -951,7 +951,7 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
     if (!FunctionAddressOrErr) {
       std::string Buf;
       llvm::raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(FunctionAddressOrErr.takeError(), OS, "");
+      logAllUnhandledErrors(FunctionAddressOrErr.takeError(), OS);
       OS.flush();
       report_fatal_error(Buf);
     }
@@ -967,7 +967,7 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
     if (!Name) {
       std::string Buf;
       llvm::raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(Name.takeError(), OS, "");
+      logAllUnhandledErrors(Name.takeError(), OS);
       OS.flush();
       report_fatal_error(Buf);
     }
@@ -976,7 +976,7 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
     if (!AddressOrErr) {
       std::string Buf;
       llvm::raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(AddressOrErr.takeError(), OS, "");
+      logAllUnhandledErrors(AddressOrErr.takeError(), OS);
       OS.flush();
       report_fatal_error(Buf);
     }
@@ -1025,7 +1025,7 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF,
     if (!FunctionNameOrErr) {
       std::string Buf;
       llvm::raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(FunctionNameOrErr.takeError(), OS, "");
+      logAllUnhandledErrors(FunctionNameOrErr.takeError(), OS);
       OS.flush();
       report_fatal_error(Buf);
     }
@@ -1034,7 +1034,7 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF,
     if (!FunctionAddressOrErr) {
       std::string Buf;
       llvm::raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(FunctionAddressOrErr.takeError(), OS, "");
+      logAllUnhandledErrors(FunctionAddressOrErr.takeError(), OS);
       OS.flush();
       report_fatal_error(Buf);
     }
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index 7f5907139930457e35cf1a9cf77deaa00f72513c..36654919201b53555111c58d64d085ac36eb4847 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -79,7 +79,7 @@ public:
       : ObjDumper(Writer), Obj(Obj), Writer(Writer), Types(100) {}
 
   void printFileHeaders() override;
-  void printSections() override;
+  void printSectionHeaders() override;
   void printRelocations() override;
   void printSymbols() override;
   void printDynamicSymbols() override;
@@ -959,7 +959,7 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName,
   StringMap<StringRef> FunctionLineTables;
 
   ListScope D(W, "CodeViewDebugInfo");
-  // Print the section to allow correlation with printSections.
+  // Print the section to allow correlation with printSectionHeaders.
   W.printNumber("Section", SectionName, Obj->getSectionID(Section));
 
   uint32_t Magic;
@@ -1279,7 +1279,7 @@ void COFFDumper::printCodeViewTypeSection(StringRef SectionName,
   W.flush();
 }
 
-void COFFDumper::printSections() {
+void COFFDumper::printSectionHeaders() {
   ListScope SectionsD(W, "Sections");
   int SectionNumber = 0;
   for (const SectionRef &Sec : Obj->sections()) {
diff --git a/tools/llvm-readobj/DwarfCFIEHPrinter.h b/tools/llvm-readobj/DwarfCFIEHPrinter.h
index 5a1eef1d007d8e2c623acce63e8a148f9dd0a4d7..d91d764c4d0a674b050192b2007e6c24023e7029 100644
--- a/tools/llvm-readobj/DwarfCFIEHPrinter.h
+++ b/tools/llvm-readobj/DwarfCFIEHPrinter.h
@@ -16,6 +16,7 @@
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Object/ELF.h"
 #include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/Debug.h"
@@ -31,15 +32,15 @@ namespace DwarfCFIEH {
 template <typename ELFT>
 class PrinterContext {
   ScopedPrinter &W;
-  const object::ELFFile<ELFT> *Obj;
+  const object::ELFObjectFile<ELFT> *ObjF;
 
   void printEHFrameHdr(uint64_t Offset, uint64_t Address, uint64_t Size) const;
 
   void printEHFrame(const typename ELFT::Shdr *EHFrameShdr) const;
 
 public:
-  PrinterContext(ScopedPrinter &W, const object::ELFFile<ELFT> *Obj)
-      : W(W), Obj(Obj) {}
+  PrinterContext(ScopedPrinter &W, const object::ELFObjectFile<ELFT> *ObjF)
+      : W(W), ObjF(ObjF) {}
 
   void printUnwindInformation() const;
 };
@@ -59,6 +60,7 @@ static const typename ELFO::Elf_Shdr *findSectionByAddress(const ELFO *Obj,
 
 template <typename ELFT>
 void PrinterContext<ELFT>::printUnwindInformation() const {
+  const object::ELFFile<ELFT> *Obj = ObjF->getELFFile();
   const typename ELFT::Phdr *EHFramePhdr = nullptr;
 
   auto PHs = Obj->program_headers();
@@ -101,6 +103,7 @@ void PrinterContext<ELFT>::printEHFrameHdr(uint64_t EHFrameHdrOffset,
   W.startLine() << format("Offset: 0x%" PRIx64 "\n", EHFrameHdrOffset);
   W.startLine() << format("Size: 0x%" PRIx64 "\n", EHFrameHdrSize);
 
+  const object::ELFFile<ELFT> *Obj = ObjF->getELFFile();
   const auto *EHFrameHdrShdr = findSectionByAddress(Obj, EHFrameHdrAddress);
   if (EHFrameHdrShdr) {
     auto SectionName = Obj->getSectionName(EHFrameHdrShdr);
@@ -173,6 +176,7 @@ void PrinterContext<ELFT>::printEHFrame(
                           ShOffset, Address);
   W.indent();
 
+  const object::ELFFile<ELFT> *Obj = ObjF->getELFFile();
   auto Result = Obj->getSectionContents(EHFrameShdr);
   if (Error E = Result.takeError())
     reportError(toString(std::move(E)));
@@ -183,7 +187,8 @@ void PrinterContext<ELFT>::printEHFrame(
                 Contents.size()),
       ELFT::TargetEndianness == support::endianness::little,
       ELFT::Is64Bits ? 8 : 4);
-  DWARFDebugFrame EHFrame(/*IsEH=*/true, /*EHFrameAddress=*/Address);
+  DWARFDebugFrame EHFrame(Triple::ArchType(ObjF->getArch()), /*IsEH=*/true,
+                          /*EHFrameAddress=*/Address);
   EHFrame.parse(DE);
 
   for (const auto &Entry : EHFrame) {
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index c91d2c548bfb321619b38ef5fd286a2f113eaa6f..b1b0b6f3a35847e7be9fb1c131fe8d371a867a26 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -28,6 +28,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Object/ELF.h"
 #include "llvm/Object/ELFObjectFile.h"
@@ -140,10 +141,10 @@ struct DynRegionInfo {
 template<typename ELFT>
 class ELFDumper : public ObjDumper {
 public:
-  ELFDumper(const ELFFile<ELFT> *Obj, ScopedPrinter &Writer);
+  ELFDumper(const object::ELFObjectFile<ELFT> *ObjF, ScopedPrinter &Writer);
 
   void printFileHeaders() override;
-  void printSections() override;
+  void printSectionHeaders() override;
   void printRelocations() override;
   void printDynamicRelocations() override;
   void printSymbols() override;
@@ -182,6 +183,7 @@ private:
   TYPEDEF_ELF_TYPES(ELFT)
 
   DynRegionInfo checkDRI(DynRegionInfo DRI) {
+    const ELFFile<ELFT> *Obj = ObjF->getELFFile();
     if (DRI.Addr < Obj->base() ||
         (const uint8_t *)DRI.Addr + DRI.Size > Obj->base() + Obj->getBufSize())
       error(llvm::object::object_error::parse_failed);
@@ -189,11 +191,11 @@ private:
   }
 
   DynRegionInfo createDRIFrom(const Elf_Phdr *P, uintX_t EntSize) {
-    return checkDRI({Obj->base() + P->p_offset, P->p_filesz, EntSize});
+    return checkDRI({ObjF->getELFFile()->base() + P->p_offset, P->p_filesz, EntSize});
   }
 
   DynRegionInfo createDRIFrom(const Elf_Shdr *S) {
-    return checkDRI({Obj->base() + S->sh_offset, S->sh_size, S->sh_entsize});
+    return checkDRI({ObjF->getELFFile()->base() + S->sh_offset, S->sh_size, S->sh_entsize});
   }
 
   void parseDynamicTable(ArrayRef<const Elf_Phdr *> LoadSegments);
@@ -207,7 +209,7 @@ private:
   void LoadVersionNeeds(const Elf_Shdr *ec) const;
   void LoadVersionDefs(const Elf_Shdr *sec) const;
 
-  const ELFO *Obj;
+  const object::ELFObjectFile<ELFT> *ObjF;
   DynRegionInfo DynRelRegion;
   DynRegionInfo DynRelaRegion;
   DynRegionInfo DynRelrRegion;
@@ -290,6 +292,7 @@ void ELFDumper<ELFT>::printSymbolsHelper(bool IsDynamic) const {
   StringRef StrTable, SymtabName;
   size_t Entries = 0;
   Elf_Sym_Range Syms(nullptr, nullptr);
+  const ELFFile<ELFT> *Obj = ObjF->getELFFile();
   if (IsDynamic) {
     StrTable = DynamicStringTable;
     Syms = dynamic_symbols();
@@ -324,7 +327,7 @@ public:
   virtual void printFileHeaders(const ELFFile<ELFT> *Obj) = 0;
   virtual void printGroupSections(const ELFFile<ELFT> *Obj) = 0;
   virtual void printRelocations(const ELFFile<ELFT> *Obj) = 0;
-  virtual void printSections(const ELFFile<ELFT> *Obj) = 0;
+  virtual void printSectionHeaders(const ELFFile<ELFT> *Obj) = 0;
   virtual void printSymbols(const ELFFile<ELFT> *Obj) = 0;
   virtual void printDynamicSymbols(const ELFFile<ELFT> *Obj) = 0;
   virtual void printDynamicRelocations(const ELFFile<ELFT> *Obj) = 0;
@@ -359,7 +362,7 @@ public:
   void printFileHeaders(const ELFO *Obj) override;
   void printGroupSections(const ELFFile<ELFT> *Obj) override;
   void printRelocations(const ELFO *Obj) override;
-  void printSections(const ELFO *Obj) override;
+  void printSectionHeaders(const ELFO *Obj) override;
   void printSymbols(const ELFO *Obj) override;
   void printDynamicSymbols(const ELFO *Obj) override;
   void printDynamicRelocations(const ELFO *Obj) override;
@@ -452,7 +455,7 @@ public:
   void printGroupSections(const ELFFile<ELFT> *Obj) override;
   void printRelocations(const ELFO *Obj) override;
   void printRelocations(const Elf_Shdr *Sec, const ELFO *Obj);
-  void printSections(const ELFO *Obj) override;
+  void printSectionHeaders(const ELFO *Obj) override;
   void printSymbols(const ELFO *Obj) override;
   void printDynamicSymbols(const ELFO *Obj) override;
   void printDynamicRelocations(const ELFO *Obj) override;
@@ -479,7 +482,7 @@ private:
 namespace llvm {
 
 template <class ELFT>
-static std::error_code createELFDumper(const ELFFile<ELFT> *Obj,
+static std::error_code createELFDumper(const ELFObjectFile<ELFT> *Obj,
                                        ScopedPrinter &Writer,
                                        std::unique_ptr<ObjDumper> &Result) {
   Result.reset(new ELFDumper<ELFT>(Obj, Writer));
@@ -491,19 +494,19 @@ std::error_code createELFDumper(const object::ObjectFile *Obj,
                                 std::unique_ptr<ObjDumper> &Result) {
   // Little-endian 32-bit
   if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
-    return createELFDumper(ELFObj->getELFFile(), Writer, Result);
+    return createELFDumper(ELFObj, Writer, Result);
 
   // Big-endian 32-bit
   if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
-    return createELFDumper(ELFObj->getELFFile(), Writer, Result);
+    return createELFDumper(ELFObj, Writer, Result);
 
   // Little-endian 64-bit
   if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
-    return createELFDumper(ELFObj->getELFFile(), Writer, Result);
+    return createELFDumper(ELFObj, Writer, Result);
 
   // Big-endian 64-bit
   if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
-    return createELFDumper(ELFObj->getELFFile(), Writer, Result);
+    return createELFDumper(ELFObj, Writer, Result);
 
   return readobj_error::unsupported_obj_file_format;
 }
@@ -516,7 +519,7 @@ template <class ELFT>
 void ELFDumper<ELFT>::LoadVersionNeeds(const Elf_Shdr *sec) const {
   unsigned vn_size = sec->sh_size;  // Size of section in bytes
   unsigned vn_count = sec->sh_info; // Number of Verneed entries
-  const char *sec_start = (const char *)Obj->base() + sec->sh_offset;
+  const char *sec_start = (const char *)ObjF->getELFFile()->base() + sec->sh_offset;
   const char *sec_end = sec_start + vn_size;
   // The first Verneed entry is at the start of the section.
   const char *p = sec_start;
@@ -550,7 +553,7 @@ template <class ELFT>
 void ELFDumper<ELFT>::LoadVersionDefs(const Elf_Shdr *sec) const {
   unsigned vd_size = sec->sh_size;  // Size of section in bytes
   unsigned vd_count = sec->sh_info; // Number of Verdef entries
-  const char *sec_start = (const char *)Obj->base() + sec->sh_offset;
+  const char *sec_start = (const char *)ObjF->getELFFile()->base() + sec->sh_offset;
   const char *sec_end = sec_start + vd_size;
   // The first Verdef entry is at the start of the section.
   const char *p = sec_start;
@@ -639,9 +642,12 @@ static void printVersionDefinitionSection(ELFDumper<ELFT> *Dumper,
   // is determined by DT_VERDEFNUM tag.
   unsigned VerDefsNum = 0;
   for (const typename ELFO::Elf_Dyn &Dyn : Dumper->dynamic_table()) {
-    if (Dyn.d_tag == DT_VERDEFNUM)
+    if (Dyn.d_tag == DT_VERDEFNUM) {
       VerDefsNum = Dyn.d_un.d_val;
+      break;
+    }
   }
+
   const uint8_t *SecStartAddress =
       (const uint8_t *)Obj->base() + Sec->sh_offset;
   const uint8_t *SecEndAddress = SecStartAddress + Sec->sh_size;
@@ -692,9 +698,12 @@ static void printVersionDependencySection(ELFDumper<ELFT> *Dumper,
     return;
 
   unsigned VerNeedNum = 0;
-  for (const typename ELFO::Elf_Dyn &Dyn : Dumper->dynamic_table())
-    if (Dyn.d_tag == DT_VERNEEDNUM)
+  for (const typename ELFO::Elf_Dyn &Dyn : Dumper->dynamic_table()) {
+    if (Dyn.d_tag == DT_VERNEEDNUM) {
       VerNeedNum = Dyn.d_un.d_val;
+      break;
+    }
+  }
 
   const uint8_t *SecData = (const uint8_t *)Obj->base() + Sec->sh_offset;
   const typename ELFO::Elf_Shdr *StrTab =
@@ -728,13 +737,13 @@ static void printVersionDependencySection(ELFDumper<ELFT> *Dumper,
 
 template <typename ELFT> void ELFDumper<ELFT>::printVersionInfo() {
   // Dump version symbol section.
-  printVersionSymbolSection(this, Obj, dot_gnu_version_sec, W);
+  printVersionSymbolSection(this, ObjF->getELFFile(), dot_gnu_version_sec, W);
 
   // Dump version definition section.
-  printVersionDefinitionSection(this, Obj, dot_gnu_version_d_sec, W);
+  printVersionDefinitionSection(this, ObjF->getELFFile(), dot_gnu_version_d_sec, W);
 
   // Dump version dependency section.
-  printVersionDependencySection(this, Obj, dot_gnu_version_r_sec, W);
+  printVersionDependencySection(this, ObjF->getELFFile(), dot_gnu_version_r_sec, W);
 }
 
 template <typename ELFT>
@@ -755,7 +764,7 @@ StringRef ELFDumper<ELFT>::getSymbolVersion(StringRef StrTab,
 
   // Get the corresponding version index entry
   const Elf_Versym *vs = unwrapOrError(
-      Obj->template getEntry<Elf_Versym>(dot_gnu_version_sec, entry_index));
+      ObjF->getELFFile()->template getEntry<Elf_Versym>(dot_gnu_version_sec, entry_index));
   size_t version_index = vs->vs_index & ELF::VERSYM_VERSION;
 
   // Special markers for unversioned symbols.
@@ -788,6 +797,7 @@ StringRef ELFDumper<ELFT>::getSymbolVersion(StringRef StrTab,
 
 template <typename ELFT>
 StringRef ELFDumper<ELFT>::getStaticSymbolName(uint32_t Index) const {
+  const ELFFile<ELFT> *Obj = ObjF->getELFFile();
   StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*DotSymtabSec));
   Elf_Sym_Range Syms = unwrapOrError(Obj->symbols(DotSymtabSec));
   if (Index >= Syms.size())
@@ -835,6 +845,7 @@ void ELFDumper<ELFT>::getSectionNameIndex(const Elf_Sym *Symbol,
     if (SectionIndex == SHN_XINDEX)
       SectionIndex = unwrapOrError(object::getExtendedSymbolTableIndex<ELFT>(
           Symbol, FirstSym, ShndxTable));
+    const ELFFile<ELFT> *Obj = ObjF->getELFFile();
     const typename ELFT::Shdr *Sec =
         unwrapOrError(Obj->getSection(SectionIndex));
     SectionName = unwrapOrError(Obj->getSectionName(Sec));
@@ -1407,9 +1418,11 @@ static const char *getElfMipsOptionsOdkType(unsigned Odk) {
 }
 
 template <typename ELFT>
-ELFDumper<ELFT>::ELFDumper(const ELFFile<ELFT> *Obj, ScopedPrinter &Writer)
-    : ObjDumper(Writer), Obj(Obj) {
+ELFDumper<ELFT>::ELFDumper(const object::ELFObjectFile<ELFT> *ObjF,
+    ScopedPrinter &Writer)
+    : ObjDumper(Writer), ObjF(ObjF) {
   SmallVector<const Elf_Phdr *, 4> LoadSegments;
+  const ELFFile<ELFT> *Obj = ObjF->getELFFile();
   for (const Elf_Phdr &Phdr : unwrapOrError(Obj->program_headers())) {
     if (Phdr.p_type == ELF::PT_DYNAMIC) {
       DynamicTable = createDRIFrom(&Phdr, sizeof(Elf_Dyn));
@@ -1478,19 +1491,10 @@ template <typename ELFT>
 void ELFDumper<ELFT>::parseDynamicTable(
     ArrayRef<const Elf_Phdr *> LoadSegments) {
   auto toMappedAddr = [&](uint64_t VAddr) -> const uint8_t * {
-    const Elf_Phdr *const *I =
-        std::upper_bound(LoadSegments.begin(), LoadSegments.end(), VAddr,
-                         [](uint64_t VAddr, const Elf_Phdr_Impl<ELFT> *Phdr) {
-                           return VAddr < Phdr->p_vaddr;
-                         });
-    if (I == LoadSegments.begin())
-      report_fatal_error("Virtual address is not in any segment");
-    --I;
-    const Elf_Phdr &Phdr = **I;
-    uint64_t Delta = VAddr - Phdr.p_vaddr;
-    if (Delta >= Phdr.p_filesz)
-      report_fatal_error("Virtual address is not in any segment");
-    return Obj->base() + Phdr.p_offset + Delta;
+    auto MappedAddrOrError = ObjF->getELFFile()->toMappedAddr(VAddr);
+    if (!MappedAddrOrError)
+      report_fatal_error(MappedAddrOrError.takeError());
+    return MappedAddrOrError.get();
   };
 
   uint64_t SONameOffset = 0;
@@ -1589,51 +1593,51 @@ typename ELFDumper<ELFT>::Elf_Relr_Range ELFDumper<ELFT>::dyn_relrs() const {
 
 template<class ELFT>
 void ELFDumper<ELFT>::printFileHeaders() {
-  ELFDumperStyle->printFileHeaders(Obj);
+  ELFDumperStyle->printFileHeaders(ObjF->getELFFile());
 }
 
 template<class ELFT>
-void ELFDumper<ELFT>::printSections() {
-  ELFDumperStyle->printSections(Obj);
+void ELFDumper<ELFT>::printSectionHeaders() {
+  ELFDumperStyle->printSectionHeaders(ObjF->getELFFile());
 }
 
 template<class ELFT>
 void ELFDumper<ELFT>::printRelocations() {
-  ELFDumperStyle->printRelocations(Obj);
+  ELFDumperStyle->printRelocations(ObjF->getELFFile());
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printProgramHeaders() {
-  ELFDumperStyle->printProgramHeaders(Obj);
+  ELFDumperStyle->printProgramHeaders(ObjF->getELFFile());
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printDynamicRelocations() {
-  ELFDumperStyle->printDynamicRelocations(Obj);
+  ELFDumperStyle->printDynamicRelocations(ObjF->getELFFile());
 }
 
 template<class ELFT>
 void ELFDumper<ELFT>::printSymbols() {
-  ELFDumperStyle->printSymbols(Obj);
+  ELFDumperStyle->printSymbols(ObjF->getELFFile());
 }
 
 template<class ELFT>
 void ELFDumper<ELFT>::printDynamicSymbols() {
-  ELFDumperStyle->printDynamicSymbols(Obj);
+  ELFDumperStyle->printDynamicSymbols(ObjF->getELFFile());
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printHashHistogram() {
-  ELFDumperStyle->printHashHistogram(Obj);
+  ELFDumperStyle->printHashHistogram(ObjF->getELFFile());
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printCGProfile() {
-  ELFDumperStyle->printCGProfile(Obj);
+  ELFDumperStyle->printCGProfile(ObjF->getELFFile());
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printNotes() {
-  ELFDumperStyle->printNotes(Obj);
+  ELFDumperStyle->printNotes(ObjF->getELFFile());
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printELFLinkerOptions() {
-  ELFDumperStyle->printELFLinkerOptions(Obj);
+  ELFDumperStyle->printELFLinkerOptions(ObjF->getELFFile());
 }
 
 static const char *getTypeString(unsigned Arch, uint64_t Type) {
@@ -1877,9 +1881,9 @@ void ELFDumper<ELFT>::printValue(uint64_t Type, uint64_t Value) {
 
 template<class ELFT>
 void ELFDumper<ELFT>::printUnwindInfo() {
-  const unsigned Machine = Obj->getHeader()->e_machine;
+  const unsigned Machine = ObjF->getELFFile()->getHeader()->e_machine;
   if (Machine == EM_386 || Machine == EM_X86_64) {
-    DwarfCFIEH::PrinterContext<ELFT> Ctx(W, Obj);
+    DwarfCFIEH::PrinterContext<ELFT> Ctx(W, ObjF);
     return Ctx.printUnwindInformation();
   }
   W.startLine() << "UnwindInfo not implemented.\n";
@@ -1888,6 +1892,7 @@ void ELFDumper<ELFT>::printUnwindInfo() {
 namespace {
 
 template <> void ELFDumper<ELF32LE>::printUnwindInfo() {
+  const ELFFile<ELF32LE> *Obj = ObjF->getELFFile();
   const unsigned Machine = Obj->getHeader()->e_machine;
   if (Machine == EM_ARM) {
     ARM::EHABI::PrinterContext<ELF32LE> Ctx(W, Obj, DotSymtabSec);
@@ -1930,7 +1935,7 @@ void ELFDumper<ELFT>::printDynamicTable() {
     uintX_t Tag = Entry.getTag();
     ++I;
     W.startLine() << "  " << format_hex(Tag, Is64 ? 18 : 10, opts::Output != opts::GNU) << " "
-                  << format("%-21s", getTypeString(Obj->getHeader()->e_machine, Tag));
+                  << format("%-21s", getTypeString(ObjF->getELFFile()->getHeader()->e_machine, Tag));
     printValue(Tag, Entry.getVal());
     OS << "\n";
   }
@@ -1997,6 +2002,7 @@ void ELFDumper<ELFT>::printAttributes() {
 namespace {
 
 template <> void ELFDumper<ELF32LE>::printAttributes() {
+  const ELFFile<ELF32LE> *Obj = ObjF->getELFFile();
   if (Obj->getHeader()->e_machine != EM_ARM) {
     W.startLine() << "Attributes not implemented.\n";
     return;
@@ -2282,6 +2288,7 @@ MipsGOTParser<ELFT>::getPltSym(const Entry *E) const {
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printMipsPLTGOT() {
+  const ELFFile<ELFT> *Obj = ObjF->getELFFile();
   if (Obj->getHeader()->e_machine != EM_MIPS)
     reportError("MIPS PLT GOT is available for MIPS targets only");
 
@@ -2366,6 +2373,7 @@ static int getMipsRegisterSize(uint8_t Flag) {
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printMipsABIFlags() {
+  const ELFFile<ELFT> *Obj = ObjF->getELFFile();
   const Elf_Shdr *Shdr = findSectionByName(*Obj, ".MIPS.abiflags");
   if (!Shdr) {
     W.startLine() << "There is no .MIPS.abiflags section in the file.\n";
@@ -2411,6 +2419,7 @@ static void printMipsReginfoData(ScopedPrinter &W,
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printMipsReginfo() {
+  const ELFFile<ELFT> *Obj = ObjF->getELFFile();
   const Elf_Shdr *Shdr = findSectionByName(*Obj, ".reginfo");
   if (!Shdr) {
     W.startLine() << "There is no .reginfo section in the file.\n";
@@ -2428,6 +2437,7 @@ template <class ELFT> void ELFDumper<ELFT>::printMipsReginfo() {
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printMipsOptions() {
+  const ELFFile<ELFT> *Obj = ObjF->getELFFile();
   const Elf_Shdr *Shdr = findSectionByName(*Obj, ".MIPS.options");
   if (!Shdr) {
     W.startLine() << "There is no .MIPS.options section in the file.\n";
@@ -2457,6 +2467,7 @@ template <class ELFT> void ELFDumper<ELFT>::printMipsOptions() {
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printStackMap() const {
+  const ELFFile<ELFT> *Obj = ObjF->getELFFile();
   const Elf_Shdr *StackMapSection = nullptr;
   for (const auto &Sec : unwrapOrError(Obj->sections())) {
     StringRef Name = unwrapOrError(Obj->getSectionName(&Sec));
@@ -2477,11 +2488,11 @@ template <class ELFT> void ELFDumper<ELFT>::printStackMap() const {
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printGroupSections() {
-  ELFDumperStyle->printGroupSections(Obj);
+  ELFDumperStyle->printGroupSections(ObjF->getELFFile());
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printAddrsig() {
-  ELFDumperStyle->printAddrsig(Obj);
+  ELFDumperStyle->printAddrsig(ObjF->getELFFile());
 }
 
 static inline void printFields(formatted_raw_ostream &OS, StringRef Str1,
@@ -2920,7 +2931,8 @@ std::string getSectionTypeString(unsigned Arch, unsigned Type) {
   return "";
 }
 
-template <class ELFT> void GNUStyle<ELFT>::printSections(const ELFO *Obj) {
+template <class ELFT>
+void GNUStyle<ELFT>::printSectionHeaders(const ELFO *Obj) {
   size_t SectionIndex = 0;
   std::string Number, Type, Size, Address, Offset, Flags, Link, Info, EntrySize,
       Alignment;
@@ -3631,7 +3643,7 @@ static std::string getFreeBSDNoteTypeName(const uint32_t NT) {
   return OS.str();
 }
 
-static std::string getAMDGPUNoteTypeName(const uint32_t NT) {
+static std::string getAMDNoteTypeName(const uint32_t NT) {
   static const struct {
     uint32_t ID;
     const char *Name;
@@ -3654,6 +3666,16 @@ static std::string getAMDGPUNoteTypeName(const uint32_t NT) {
   return OS.str();
 }
 
+static std::string getAMDGPUNoteTypeName(const uint32_t NT) {
+  if (NT == ELF::NT_AMDGPU_METADATA)
+    return std::string("NT_AMDGPU_METADATA (AMDGPU Metadata)");
+
+  std::string string;
+  raw_string_ostream OS(string);
+  OS << format("Unknown note type (0x%08x)", NT);
+  return OS.str();
+}
+
 template <typename ELFT>
 static std::string getGNUProperty(uint32_t Type, uint32_t DataSize,
                                   ArrayRef<uint8_t> Data) {
@@ -3711,12 +3733,10 @@ static std::string getGNUProperty(uint32_t Type, uint32_t DataSize,
 
 template <typename ELFT>
 static SmallVector<std::string, 4>
-getGNUPropertyList(ArrayRef<typename ELFT::Word> Words) {
+getGNUPropertyList(ArrayRef<uint8_t> Arr) {
   using Elf_Word = typename ELFT::Word;
 
   SmallVector<std::string, 4> Properties;
-  ArrayRef<uint8_t> Arr(reinterpret_cast<const uint8_t *>(Words.data()),
-                        Words.size());
   while (Arr.size() >= 8) {
     uint32_t Type = *reinterpret_cast<const Elf_Word *>(Arr.data());
     uint32_t DataSize = *reinterpret_cast<const Elf_Word *>(Arr.data() + 4);
@@ -3749,7 +3769,12 @@ struct GNUAbiTag {
 };
 
 template <typename ELFT>
-static GNUAbiTag getGNUAbiTag(ArrayRef<typename ELFT::Word> Words) {
+static GNUAbiTag getGNUAbiTag(ArrayRef<uint8_t> Desc) {
+  typedef typename ELFT::Word Elf_Word;
+
+  ArrayRef<Elf_Word> Words(reinterpret_cast<const Elf_Word*>(Desc.begin()),
+                           reinterpret_cast<const Elf_Word*>(Desc.end()));
+
   if (Words.size() < 4)
     return {"", "", /*IsValid=*/false};
 
@@ -3766,30 +3791,26 @@ static GNUAbiTag getGNUAbiTag(ArrayRef<typename ELFT::Word> Words) {
   return {OSName, ABI.str(), /*IsValid=*/true};
 }
 
-template <typename ELFT>
-static std::string getGNUBuildId(ArrayRef<typename ELFT::Word> Words) {
+static std::string getGNUBuildId(ArrayRef<uint8_t> Desc) {
   std::string str;
   raw_string_ostream OS(str);
-  ArrayRef<uint8_t> ID(reinterpret_cast<const uint8_t *>(Words.data()),
-                       Words.size());
-  for (const auto &B : ID)
+  for (const auto &B : Desc)
     OS << format_hex_no_prefix(B, 2);
   return OS.str();
 }
 
-template <typename ELFT>
-static StringRef getGNUGoldVersion(ArrayRef<typename ELFT::Word> Words) {
-  return StringRef(reinterpret_cast<const char *>(Words.data()), Words.size());
+static StringRef getGNUGoldVersion(ArrayRef<uint8_t> Desc) {
+  return StringRef(reinterpret_cast<const char *>(Desc.data()), Desc.size());
 }
 
 template <typename ELFT>
 static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
-                         ArrayRef<typename ELFT::Word> Words) {
+                         ArrayRef<uint8_t> Desc) {
   switch (NoteType) {
   default:
     return;
   case ELF::NT_GNU_ABI_TAG: {
-    const GNUAbiTag &AbiTag = getGNUAbiTag<ELFT>(Words);
+    const GNUAbiTag &AbiTag = getGNUAbiTag<ELFT>(Desc);
     if (!AbiTag.IsValid)
       OS << "    <corrupt GNU_ABI_TAG>";
     else
@@ -3797,44 +3818,43 @@ static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
     break;
   }
   case ELF::NT_GNU_BUILD_ID: {
-    OS << "    Build ID: " << getGNUBuildId<ELFT>(Words);
+    OS << "    Build ID: " << getGNUBuildId(Desc);
     break;
   }
   case ELF::NT_GNU_GOLD_VERSION:
-    OS << "    Version: " << getGNUGoldVersion<ELFT>(Words);
+    OS << "    Version: " << getGNUGoldVersion(Desc);
     break;
   case ELF::NT_GNU_PROPERTY_TYPE_0:
     OS << "    Properties:";
-    for (const auto &Property : getGNUPropertyList<ELFT>(Words))
+    for (const auto &Property : getGNUPropertyList<ELFT>(Desc))
       OS << "    " << Property << "\n";
     break;
   }
   OS << '\n';
 }
 
-struct AMDGPUNote {
-  std::string type;
-  std::string value;
+struct AMDNote {
+  std::string Type;
+  std::string Value;
 };
 
 template <typename ELFT>
-static AMDGPUNote getAMDGPUNote(uint32_t NoteType,
-                                ArrayRef<typename ELFT::Word> Words) {
+static AMDNote getAMDNote(uint32_t NoteType, ArrayRef<uint8_t> Desc) {
   switch (NoteType) {
   default:
     return {"", ""};
   case ELF::NT_AMD_AMDGPU_HSA_METADATA:
     return {"HSA Metadata",
-            std::string(reinterpret_cast<const char *>(Words.data()),
-                        Words.size())};
+            std::string(reinterpret_cast<const char *>(Desc.data()),
+                        Desc.size())};
   case ELF::NT_AMD_AMDGPU_ISA:
     return {"ISA Version",
-            std::string(reinterpret_cast<const char *>(Words.data()),
-                        Words.size())};
+            std::string(reinterpret_cast<const char *>(Desc.data()),
+                        Desc.size())};
   case ELF::NT_AMD_AMDGPU_PAL_METADATA:
     const uint32_t *PALMetadataBegin =
-        reinterpret_cast<const uint32_t *>(Words.data());
-    const uint32_t *PALMetadataEnd = PALMetadataBegin + Words.size();
+        reinterpret_cast<const uint32_t *>(Desc.data());
+    const uint32_t *PALMetadataEnd = PALMetadataBegin + Desc.size();
     std::vector<uint32_t> PALMetadata(PALMetadataBegin, PALMetadataEnd);
     std::string PALMetadataString;
     auto Error = AMDGPU::PALMD::toString(PALMetadata, PALMetadataString);
@@ -3845,6 +3865,41 @@ static AMDGPUNote getAMDGPUNote(uint32_t NoteType,
   }
 }
 
+struct AMDGPUNote {
+  std::string Type;
+  std::string Value;
+};
+
+template <typename ELFT>
+static AMDGPUNote getAMDGPUNote(uint32_t NoteType, ArrayRef<uint8_t> Desc) {
+  switch (NoteType) {
+  default:
+    return {"", ""};
+  case ELF::NT_AMDGPU_METADATA:
+    auto MsgPackString =
+        StringRef(reinterpret_cast<const char *>(Desc.data()), Desc.size());
+    msgpack::Reader MsgPackReader(MsgPackString);
+    auto OptMsgPackNodeOrErr = msgpack::Node::read(MsgPackReader);
+    if (errorToBool(OptMsgPackNodeOrErr.takeError()))
+      return {"AMDGPU Metadata", "Invalid AMDGPU Metadata"};
+    auto &OptMsgPackNode = *OptMsgPackNodeOrErr;
+    if (!OptMsgPackNode)
+      return {"AMDGPU Metadata", "Invalid AMDGPU Metadata"};
+    auto &MsgPackNode = *OptMsgPackNode;
+
+    AMDGPU::HSAMD::V3::MetadataVerifier Verifier(true);
+    if (!Verifier.verify(*MsgPackNode))
+      return {"AMDGPU Metadata", "Invalid AMDGPU Metadata"};
+
+    std::string HSAMetadataString;
+    raw_string_ostream StrOS(HSAMetadataString);
+    yaml::Output YOut(StrOS);
+    YOut << MsgPackNode;
+
+    return {"AMDGPU Metadata", StrOS.str()};
+  }
+}
+
 template <class ELFT>
 void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
   const Elf_Ehdr *e = Obj->getHeader();
@@ -3859,7 +3914,7 @@ void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
 
   auto ProcessNote = [&](const Elf_Note &Note) {
     StringRef Name = Note.getName();
-    ArrayRef<Elf_Word> Descriptor = Note.getDesc();
+    ArrayRef<uint8_t> Descriptor = Note.getDesc();
     Elf_Word Type = Note.getType();
 
     OS << "  " << Name << std::string(22 - Name.size(), ' ')
@@ -3871,10 +3926,15 @@ void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
     } else if (Name == "FreeBSD") {
       OS << getFreeBSDNoteTypeName(Type) << '\n';
     } else if (Name == "AMD") {
+      OS << getAMDNoteTypeName(Type) << '\n';
+      const AMDNote N = getAMDNote<ELFT>(Type, Descriptor);
+      if (!N.Type.empty())
+        OS << "    " << N.Type << ":\n        " << N.Value << '\n';
+    } else if (Name == "AMDGPU") {
       OS << getAMDGPUNoteTypeName(Type) << '\n';
       const AMDGPUNote N = getAMDGPUNote<ELFT>(Type, Descriptor);
-      if (!N.type.empty())
-        OS << "    " << N.type << ":\n        " << N.value << '\n';
+      if (!N.Type.empty())
+        OS << "    " << N.Type << ":\n        " << N.Value << '\n';
     } else {
       OS << "Unknown note type: (" << format_hex(Type, 10) << ')';
     }
@@ -4213,7 +4273,8 @@ void LLVMStyle<ELFT>::printRelocation(const ELFO *Obj, Elf_Rela Rel,
   }
 }
 
-template <class ELFT> void LLVMStyle<ELFT>::printSections(const ELFO *Obj) {
+template <class ELFT>
+void LLVMStyle<ELFT>::printSectionHeaders(const ELFO *Obj) {
   ListScope SectionsD(W, "Sections");
 
   int SectionIndex = -1;
@@ -4480,13 +4541,13 @@ void LLVMStyle<ELFT>::printAddrsig(const ELFFile<ELFT> *Obj) {
 
 template <typename ELFT>
 static void printGNUNoteLLVMStyle(uint32_t NoteType,
-                                  ArrayRef<typename ELFT::Word> Words,
+                                  ArrayRef<uint8_t> Desc,
                                   ScopedPrinter &W) {
   switch (NoteType) {
   default:
     return;
   case ELF::NT_GNU_ABI_TAG: {
-    const GNUAbiTag &AbiTag = getGNUAbiTag<ELFT>(Words);
+    const GNUAbiTag &AbiTag = getGNUAbiTag<ELFT>(Desc);
     if (!AbiTag.IsValid) {
       W.printString("ABI", "<corrupt GNU_ABI_TAG>");
     } else {
@@ -4496,15 +4557,15 @@ static void printGNUNoteLLVMStyle(uint32_t NoteType,
     break;
   }
   case ELF::NT_GNU_BUILD_ID: {
-    W.printString("Build ID", getGNUBuildId<ELFT>(Words));
+    W.printString("Build ID", getGNUBuildId(Desc));
     break;
   }
   case ELF::NT_GNU_GOLD_VERSION:
-    W.printString("Version", getGNUGoldVersion<ELFT>(Words));
+    W.printString("Version", getGNUGoldVersion(Desc));
     break;
   case ELF::NT_GNU_PROPERTY_TYPE_0:
     ListScope D(W, "Property");
-    for (const auto &Property : getGNUPropertyList<ELFT>(Words))
+    for (const auto &Property : getGNUPropertyList<ELFT>(Desc))
       W.printString(Property);
     break;
   }
@@ -4525,7 +4586,7 @@ void LLVMStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
   auto ProcessNote = [&](const Elf_Note &Note) {
     DictScope D2(W, "Note");
     StringRef Name = Note.getName();
-    ArrayRef<Elf_Word> Descriptor = Note.getDesc();
+    ArrayRef<uint8_t> Descriptor = Note.getDesc();
     Elf_Word Type = Note.getType();
 
     W.printString("Owner", Name);
@@ -4536,10 +4597,15 @@ void LLVMStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
     } else if (Name == "FreeBSD") {
       W.printString("Type", getFreeBSDNoteTypeName(Type));
     } else if (Name == "AMD") {
+      W.printString("Type", getAMDNoteTypeName(Type));
+      const AMDNote N = getAMDNote<ELFT>(Type, Descriptor);
+      if (!N.Type.empty())
+        W.printString(N.Type, N.Value);
+    } else if (Name == "AMDGPU") {
       W.printString("Type", getAMDGPUNoteTypeName(Type));
       const AMDGPUNote N = getAMDGPUNote<ELFT>(Type, Descriptor);
-      if (!N.type.empty())
-        W.printString(N.type, N.value);
+      if (!N.Type.empty())
+        W.printString(N.Type, N.Value);
     } else {
       W.getOStream() << "Unknown note type: (" << format_hex(Type, 10) << ')';
     }
diff --git a/tools/llvm-readobj/MachODumper.cpp b/tools/llvm-readobj/MachODumper.cpp
index 69ef1556f78d4d44bfb600518deb6d5a41319dee..35e4cfcb6b10db95099d9be2d2866f9ee8bec00b 100644
--- a/tools/llvm-readobj/MachODumper.cpp
+++ b/tools/llvm-readobj/MachODumper.cpp
@@ -32,7 +32,7 @@ public:
       : ObjDumper(Writer), Obj(Obj) {}
 
   void printFileHeaders() override;
-  void printSections() override;
+  void printSectionHeaders() override;
   void printRelocations() override;
   void printSymbols() override;
   void printDynamicSymbols() override;
@@ -59,7 +59,7 @@ private:
 
   void printRelocation(const MachOObjectFile *Obj, const RelocationRef &Reloc);
 
-  void printSections(const MachOObjectFile *Obj);
+  void printSectionHeaders(const MachOObjectFile *Obj);
 
   const MachOObjectFile *Obj;
 };
@@ -428,11 +428,9 @@ void MachODumper::printFileHeaders(const MachHeader &Header) {
   W.printFlags("Flags", Header.flags, makeArrayRef(MachOHeaderFlags));
 }
 
-void MachODumper::printSections() {
-  return printSections(Obj);
-}
+void MachODumper::printSectionHeaders() { return printSectionHeaders(Obj); }
 
-void MachODumper::printSections(const MachOObjectFile *Obj) {
+void MachODumper::printSectionHeaders(const MachOObjectFile *Obj) {
   ListScope Group(W, "Sections");
 
   int SectionIndex = -1;
diff --git a/tools/llvm-readobj/ObjDumper.h b/tools/llvm-readobj/ObjDumper.h
index 8c3a7bec73be4c210c984c7c490c0dfcea6c4703..13de563469abd51a3fe03998aff0949f5a9e8400 100644
--- a/tools/llvm-readobj/ObjDumper.h
+++ b/tools/llvm-readobj/ObjDumper.h
@@ -33,7 +33,7 @@ public:
   virtual ~ObjDumper();
 
   virtual void printFileHeaders() = 0;
-  virtual void printSections() = 0;
+  virtual void printSectionHeaders() = 0;
   virtual void printRelocations() = 0;
   virtual void printSymbols() = 0;
   virtual void printDynamicSymbols() = 0;
diff --git a/tools/llvm-readobj/WasmDumper.cpp b/tools/llvm-readobj/WasmDumper.cpp
index 6f13bcda3edcaeef261b4040148dc8529e8744f4..79d3db4e2d291659bc7c92529118062c5492ed3c 100644
--- a/tools/llvm-readobj/WasmDumper.cpp
+++ b/tools/llvm-readobj/WasmDumper.cpp
@@ -25,20 +25,19 @@ namespace {
 static const EnumEntry<unsigned> WasmSymbolTypes[] = {
 #define ENUM_ENTRY(X)                                                          \
   { #X, wasm::WASM_SYMBOL_TYPE_##X }
-    ENUM_ENTRY(FUNCTION),
-    ENUM_ENTRY(DATA),
-    ENUM_ENTRY(GLOBAL),
-    ENUM_ENTRY(SECTION),
+    ENUM_ENTRY(FUNCTION), ENUM_ENTRY(DATA),  ENUM_ENTRY(GLOBAL),
+    ENUM_ENTRY(SECTION),  ENUM_ENTRY(EVENT),
 #undef ENUM_ENTRY
 };
 
 static const EnumEntry<uint32_t> WasmSectionTypes[] = {
 #define ENUM_ENTRY(X)                                                          \
   { #X, wasm::WASM_SEC_##X }
-    ENUM_ENTRY(CUSTOM),   ENUM_ENTRY(TYPE),   ENUM_ENTRY(IMPORT),
-    ENUM_ENTRY(FUNCTION), ENUM_ENTRY(TABLE),  ENUM_ENTRY(MEMORY),
-    ENUM_ENTRY(GLOBAL),   ENUM_ENTRY(EXPORT), ENUM_ENTRY(START),
-    ENUM_ENTRY(ELEM),     ENUM_ENTRY(CODE),   ENUM_ENTRY(DATA),
+    ENUM_ENTRY(CUSTOM),   ENUM_ENTRY(TYPE),  ENUM_ENTRY(IMPORT),
+    ENUM_ENTRY(FUNCTION), ENUM_ENTRY(TABLE), ENUM_ENTRY(MEMORY),
+    ENUM_ENTRY(GLOBAL),   ENUM_ENTRY(EVENT), ENUM_ENTRY(EXPORT),
+    ENUM_ENTRY(START),    ENUM_ENTRY(ELEM),  ENUM_ENTRY(CODE),
+    ENUM_ENTRY(DATA),
 #undef ENUM_ENTRY
 };
 
@@ -48,7 +47,7 @@ public:
       : ObjDumper(Writer), Obj(Obj) {}
 
   void printFileHeaders() override;
-  void printSections() override;
+  void printSectionHeaders() override;
   void printRelocations() override;
   void printSymbols() override;
   void printDynamicSymbols() override { llvm_unreachable("unimplemented"); }
@@ -148,7 +147,7 @@ void WasmDumper::printSymbols() {
     printSymbol(Symbol);
 }
 
-void WasmDumper::printSections() {
+void WasmDumper::printSectionHeaders() {
   ListScope Group(W, "Sections");
   for (const SectionRef &Section : Obj->sections()) {
     const WasmSection &WasmSec = Obj->getWasmSection(Section);
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index 61bec321b237e658d84b1128e6abb9111a926c3d..ffc58450192198c14daa5c795df2a42bd62949e5 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -48,30 +48,49 @@ namespace opts {
     cl::desc("<input object files>"),
     cl::ZeroOrMore);
 
+  // -all, -a
+  cl::opt<bool>
+      All("all",
+          cl::desc("Equivalent to setting: --file-headers, --program-headers, "
+                   "--section-headers, --symbols, --relocations, "
+                   "--dynamic-table, --notes, --version-info, --unwind, "
+                   "--section-groups and --elf-hash-histogram."));
+  cl::alias AllShort("a", cl::desc("Alias for --all"), cl::aliasopt(All));
+
+  // --headers -e
+  cl::opt<bool>
+      Headers("headers",
+          cl::desc("Equivalent to setting: --file-headers, --program-headers, "
+                   "--section-headers"));
+  cl::alias HeadersShort("e", cl::desc("Alias for --headers"),
+     cl::aliasopt(Headers));
+
   // -wide, -W
-  cl::opt<bool> WideOutput("wide",
-    cl::desc("Ignored for compatibility with GNU readelf"));
+  cl::opt<bool>
+      WideOutput("wide", cl::desc("Ignored for compatibility with GNU readelf"),
+                 cl::Hidden);
   cl::alias WideOutputShort("W",
     cl::desc("Alias for --wide"),
     cl::aliasopt(WideOutput));
 
-  // -file-headers, -h
+  // -file-headers, -file-header, -h
   cl::opt<bool> FileHeaders("file-headers",
     cl::desc("Display file headers "));
-  cl::alias FileHeadersShort("h",
-    cl::desc("Alias for --file-headers"),
-    cl::aliasopt(FileHeaders));
-
-  // -sections, -s, -S
-  // Note: In GNU readelf, -s means --symbols!
-  cl::opt<bool> Sections("sections",
-    cl::desc("Display all sections."));
-  cl::alias SectionsShort("s",
-    cl::desc("Alias for --sections"),
-    cl::aliasopt(Sections));
-  cl::alias SectionsShortUpper("S",
-    cl::desc("Alias for --sections"),
-    cl::aliasopt(Sections));
+  cl::alias FileHeadersShort("h", cl::desc("Alias for --file-headers"),
+                             cl::aliasopt(FileHeaders), cl::NotHidden);
+  cl::alias FileHeadersSingular("file-header",
+                                cl::desc("Alias for --file-headers"),
+                                cl::aliasopt(FileHeaders));
+
+  // -section-headers, -sections, -S
+  // Also -s in llvm-readobj mode.
+  cl::opt<bool> SectionHeaders("section-headers",
+                               cl::desc("Display all section headers."));
+  cl::alias SectionsShortUpper("S", cl::desc("Alias for --section-headers"),
+                               cl::aliasopt(SectionHeaders), cl::NotHidden);
+  cl::alias SectionHeadersAlias("sections",
+                                cl::desc("Alias for --section-headers"),
+                                cl::aliasopt(SectionHeaders), cl::NotHidden);
 
   // -section-relocations, -sr
   cl::opt<bool> SectionRelocations("section-relocations",
@@ -94,12 +113,13 @@ namespace opts {
     cl::desc("Alias for --section-data"),
     cl::aliasopt(SectionData));
 
-  // -relocations, -r
+  // -relocations, -relocs, -r
   cl::opt<bool> Relocations("relocations",
     cl::desc("Display the relocation entries in the file"));
-  cl::alias RelocationsShort("r",
-    cl::desc("Alias for --relocations"),
-    cl::aliasopt(Relocations));
+  cl::alias RelocationsShort("r", cl::desc("Alias for --relocations"),
+                             cl::aliasopt(Relocations), cl::NotHidden);
+  cl::alias RelocationsGNU("relocs", cl::desc("Alias for --relocations"),
+                           cl::aliasopt(Relocations));
 
   // -notes, -n
   cl::opt<bool> Notes("notes", cl::desc("Display the ELF notes in the file"));
@@ -109,19 +129,21 @@ namespace opts {
   cl::opt<bool> DynRelocs("dyn-relocations",
     cl::desc("Display the dynamic relocation entries in the file"));
 
-  // -symbols, -t
+  // -symbols
+  // Also -s in llvm-readelf mode, or -t in llvm-readobj mode.
   cl::opt<bool> Symbols("symbols",
     cl::desc("Display the symbol table"));
-  cl::alias SymbolsShort("t",
-    cl::desc("Alias for --symbols"),
-    cl::aliasopt(Symbols));
+  cl::alias SymbolsGNU("syms", cl::desc("Alias for --symbols"),
+                       cl::aliasopt(Symbols));
 
-  // -dyn-symbols, -dt
+  // -dyn-symbols, -dyn-syms, -dt
   cl::opt<bool> DynamicSymbols("dyn-symbols",
     cl::desc("Display the dynamic symbol table"));
   cl::alias DynamicSymbolsShort("dt",
     cl::desc("Alias for --dyn-symbols"),
     cl::aliasopt(DynamicSymbols));
+  cl::alias DynSymsGNU("dyn-syms", cl::desc("Alias for --dyn-symbols"),
+                       cl::aliasopt(DynamicSymbols));
 
   // -unwind, -u
   cl::opt<bool> UnwindInfo("unwind",
@@ -130,29 +152,33 @@ namespace opts {
     cl::desc("Alias for --unwind"),
     cl::aliasopt(UnwindInfo));
 
-  // -dynamic-table
+  // -dynamic-table, -dynamic, -d
   cl::opt<bool> DynamicTable("dynamic-table",
     cl::desc("Display the ELF .dynamic section table"));
   cl::alias DynamicTableShort("d", cl::desc("Alias for --dynamic-table"),
+                              cl::aliasopt(DynamicTable), cl::NotHidden);
+  cl::alias DynamicTableAlias("dynamic", cl::desc("Alias for --dynamic-table"),
                               cl::aliasopt(DynamicTable));
 
   // -needed-libs
   cl::opt<bool> NeededLibraries("needed-libs",
     cl::desc("Display the needed libraries"));
 
-  // -program-headers
+  // -program-headers, -segments, -l
   cl::opt<bool> ProgramHeaders("program-headers",
     cl::desc("Display ELF program headers"));
   cl::alias ProgramHeadersShort("l", cl::desc("Alias for --program-headers"),
-                                cl::aliasopt(ProgramHeaders));
+                                cl::aliasopt(ProgramHeaders), cl::NotHidden);
+  cl::alias SegmentsAlias("segments", cl::desc("Alias for --program-headers"),
+                          cl::aliasopt(ProgramHeaders));
 
-  // -string-dump
+  // -string-dump, -p
   cl::list<std::string> StringDump("string-dump", cl::desc("<number|name>"),
                                    cl::ZeroOrMore);
   cl::alias StringDumpShort("p", cl::desc("Alias for --string-dump"),
                             cl::aliasopt(StringDump));
 
-  // -hex-dump
+  // -hex-dump, -x
   cl::list<std::string> HexDump("hex-dump", cl::desc("<number|name>"),
                                 cl::ZeroOrMore);
   cl::alias HexDumpShort("x", cl::desc("Alias for --hex-dump"),
@@ -188,11 +214,9 @@ namespace opts {
       "codeview-subsection-bytes",
       cl::desc("Dump raw contents of codeview debug sections and records"));
 
-  // -arm-attributes, -a
+  // -arm-attributes
   cl::opt<bool> ARMAttributes("arm-attributes",
                               cl::desc("Display the ARM attributes section"));
-  cl::alias ARMAttributesShort("a", cl::desc("Alias for --arm-attributes"),
-                               cl::aliasopt(ARMAttributes));
 
   // -mips-plt-got
   cl::opt<bool>
@@ -283,28 +307,40 @@ namespace opts {
   PrintStackMap("stackmap",
                 cl::desc("Display contents of stackmap section"));
 
-  // -version-info
+  // -version-info, -V
   cl::opt<bool>
       VersionInfo("version-info",
                   cl::desc("Display ELF version sections (if present)"));
   cl::alias VersionInfoShort("V", cl::desc("Alias for -version-info"),
                              cl::aliasopt(VersionInfo));
 
+  // -elf-section-groups, -section-groups, -g
   cl::opt<bool> SectionGroups("elf-section-groups",
                               cl::desc("Display ELF section group contents"));
+  cl::alias SectionGroupsAlias("section-groups",
+                               cl::desc("Alias for -elf-sections-groups"),
+                               cl::aliasopt(SectionGroups));
   cl::alias SectionGroupsShort("g", cl::desc("Alias for -elf-sections-groups"),
                                cl::aliasopt(SectionGroups));
+
+  // -elf-hash-histogram, -histogram, -I
   cl::opt<bool> HashHistogram(
       "elf-hash-histogram",
       cl::desc("Display bucket list histogram for hash sections"));
   cl::alias HashHistogramShort("I", cl::desc("Alias for -elf-hash-histogram"),
                                cl::aliasopt(HashHistogram));
+  cl::alias HistogramAlias("histogram",
+                           cl::desc("Alias for --elf-hash-histogram"),
+                           cl::aliasopt(HashHistogram));
 
+  // -elf-cg-profile
   cl::opt<bool> CGProfile("elf-cg-profile", cl::desc("Display callgraph profile section"));
 
+  // -addrsig
   cl::opt<bool> Addrsig("addrsig",
                         cl::desc("Display address-significance table"));
 
+  // -elf-output-style
   cl::opt<OutputStyleTy>
       Output("elf-output-style", cl::desc("Specify ELF dump style"),
              cl::values(clEnumVal(LLVM, "LLVM default style"),
@@ -418,8 +454,8 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer) {
 
   if (opts::FileHeaders)
     Dumper->printFileHeaders();
-  if (opts::Sections)
-    Dumper->printSections();
+  if (opts::SectionHeaders)
+    Dumper->printSectionHeaders();
   if (opts::Relocations)
     Dumper->printRelocations();
   if (opts::DynRelocs)
@@ -588,28 +624,62 @@ static void dumpInput(StringRef File) {
     reportError(File, readobj_error::unrecognized_file_format);
 }
 
+/// Registers aliases that should only be allowed by readobj.
+static void registerReadobjAliases() {
+  // -s has meant --sections for a very long time in llvm-readobj despite
+  // meaning --symbols in readelf.
+  static cl::alias SectionsShort("s", cl::desc("Alias for --section-headers"),
+                                 cl::aliasopt(opts::SectionHeaders),
+                                 cl::NotHidden);
+
+  // Only register -t in llvm-readobj, as readelf reserves it for
+  // --section-details (not implemented yet).
+  static cl::alias SymbolsShort("t", cl::desc("Alias for --symbols"),
+                                cl::aliasopt(opts::Symbols), cl::NotHidden);
+}
+
+/// Registers aliases that should only be allowed by readelf.
+static void registerReadelfAliases() {
+  // -s is here because for readobj it means --sections.
+  static cl::alias SymbolsShort("s", cl::desc("Alias for --symbols"),
+                                cl::aliasopt(opts::Symbols), cl::NotHidden);
+}
+
 int main(int argc, const char *argv[]) {
   InitLLVM X(argc, argv);
 
   // Register the target printer for --version.
   cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
 
-  // Make some commonly used short options visibile in -help.
-  opts::DynamicTableShort.setHiddenFlag(cl::NotHidden);
-  opts::FileHeadersShort.setHiddenFlag(cl::NotHidden);
-  opts::ProgramHeadersShort.setHiddenFlag(cl::NotHidden);
-  opts::RelocationsShort.setHiddenFlag(cl::NotHidden);
-  opts::SectionsShort.setHiddenFlag(cl::NotHidden);
-  opts::SectionsShortUpper.setHiddenFlag(cl::NotHidden);
-  opts::SymbolsShort.setHiddenFlag(cl::NotHidden);
-
-  opts::WideOutput.setHiddenFlag(cl::Hidden);
-
-  if (sys::path::stem(argv[0]).find("readelf") != StringRef::npos)
+  if (sys::path::stem(argv[0]).contains("readelf")) {
     opts::Output = opts::GNU;
+    registerReadelfAliases();
+  } else {
+    registerReadobjAliases();
+  }
 
   cl::ParseCommandLineOptions(argc, argv, "LLVM Object Reader\n");
 
+  if (opts::All) {
+    opts::FileHeaders = true;
+    opts::ProgramHeaders = true;
+    opts::SectionHeaders = true;
+    opts::Symbols = true;
+    opts::Relocations = true;
+    opts::DynamicTable = true;
+    opts::Notes = true;
+    opts::VersionInfo = true;
+    opts::UnwindInfo = true;
+    opts::SectionGroups = true;
+    opts::HashHistogram = true;
+  }
+
+  if (opts::Headers) {
+    opts::FileHeaders = true;
+    opts::ProgramHeaders = true;
+    opts::SectionHeaders = true;
+  }
+
   // Default to stdin if no filename is specified.
   if (opts::InputFilenames.size() == 0)
     opts::InputFilenames.push_back("-");
diff --git a/tools/llvm-readobj/llvm-readobj.h b/tools/llvm-readobj/llvm-readobj.h
index 374ffd03e13a24f413ade28d9aac29e1e07e7e09..92ed098dc6421cfe5f1bd8999fe0a65584ef520a 100644
--- a/tools/llvm-readobj/llvm-readobj.h
+++ b/tools/llvm-readobj/llvm-readobj.h
@@ -40,7 +40,7 @@ namespace llvm {
       return *EO;
     std::string Buf;
     raw_string_ostream OS(Buf);
-    logAllUnhandledErrors(EO.takeError(), OS, "");
+    logAllUnhandledErrors(EO.takeError(), OS);
     OS.flush();
     reportError(Buf);
   }
@@ -49,22 +49,13 @@ namespace llvm {
 } // namespace llvm
 
 namespace opts {
-  extern llvm::cl::list<std::string> InputFilenames;
-  extern llvm::cl::opt<bool> FileHeaders;
-  extern llvm::cl::opt<bool> Sections;
   extern llvm::cl::opt<bool> SectionRelocations;
   extern llvm::cl::opt<bool> SectionSymbols;
   extern llvm::cl::opt<bool> SectionData;
-  extern llvm::cl::opt<bool> Relocations;
-  extern llvm::cl::opt<bool> Symbols;
   extern llvm::cl::opt<bool> DynamicSymbols;
-  extern llvm::cl::opt<bool> UnwindInfo;
   extern llvm::cl::opt<bool> ExpandRelocs;
   extern llvm::cl::opt<bool> RawRelr;
-  extern llvm::cl::opt<bool> CodeView;
   extern llvm::cl::opt<bool> CodeViewSubsectionBytes;
-  extern llvm::cl::opt<bool> ARMAttributes;
-  extern llvm::cl::opt<bool> MipsPLTGOT;
   enum OutputStyleTy { LLVM, GNU };
   extern llvm::cl::opt<OutputStyleTy> Output;
 } // namespace opts
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 6ef282365745a7d2122cdd976a0b613ca2f91ceb..975638ed82d1dc0dacc330378adfd2ebf7d1e845 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -309,7 +309,7 @@ static int printLineInfoForInput(bool LoadObjects, bool UseDebugObj) {
     if (!MaybeObj) {
       std::string Buf;
       raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(MaybeObj.takeError(), OS, "");
+      logAllUnhandledErrors(MaybeObj.takeError(), OS);
       OS.flush();
       ErrorAndExit("unable to create object file: '" + Buf + "'");
     }
@@ -438,7 +438,7 @@ static int executeInput() {
     if (!MaybeObj) {
       std::string Buf;
       raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(MaybeObj.takeError(), OS, "");
+      logAllUnhandledErrors(MaybeObj.takeError(), OS);
       OS.flush();
       ErrorAndExit("unable to create object file: '" + Buf + "'");
     }
@@ -710,7 +710,7 @@ static int linkAndVerify() {
     if (!MaybeObj) {
       std::string Buf;
       raw_string_ostream OS(Buf);
-      logAllUnhandledErrors(MaybeObj.takeError(), OS, "");
+      logAllUnhandledErrors(MaybeObj.takeError(), OS);
       OS.flush();
       ErrorAndExit("unable to create object file: '" + Buf + "'");
     }
diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index ad1aefcafcbdbfd9977f933658b42c867d9f5351..be1e5bb72681d7e9878eeba85d37742a1cdf8563 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp
@@ -140,7 +140,7 @@ static void error(llvm::Error E, StringRef FileName, const Archive::Child &C,
 
   std::string Buf;
   raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(std::move(E), OS, "");
+  logAllUnhandledErrors(std::move(E), OS);
   OS.flush();
   errs() << " " << Buf << "\n";
 }
@@ -158,7 +158,7 @@ static void error(llvm::Error E, StringRef FileName,
 
   std::string Buf;
   raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(std::move(E), OS, "");
+  logAllUnhandledErrors(std::move(E), OS);
   OS.flush();
   errs() << " " << Buf << "\n";
 }
@@ -457,8 +457,8 @@ static void printObjectSectionSizes(ObjectFile *Obj) {
     // Make one pass over the section table to calculate sizes.
     for (const SectionRef &Section : Obj->sections()) {
       uint64_t size = Section.getSize();
-      bool isText = Section.isText();
-      bool isData = Section.isData();
+      bool isText = Section.isBerkeleyText();
+      bool isData = Section.isBerkeleyData();
       bool isBSS = Section.isBSS();
       if (isText)
         total_text += size;
@@ -869,19 +869,19 @@ int main(int argc, char **argv) {
   if (RadixShort.getNumOccurrences())
     Radix = RadixShort.getValue();
 
-  for (unsigned i = 0; i < ArchFlags.size(); ++i) {
-    if (ArchFlags[i] == "all") {
+  for (StringRef Arch : ArchFlags) {
+    if (Arch == "all") {
       ArchAll = true;
     } else {
-      if (!MachOObjectFile::isValidArch(ArchFlags[i])) {
+      if (!MachOObjectFile::isValidArch(Arch)) {
         outs() << ToolName << ": for the -arch option: Unknown architecture "
-               << "named '" << ArchFlags[i] << "'";
+               << "named '" << Arch << "'";
         return 1;
       }
     }
   }
 
-  if (InputFilenames.size() == 0)
+  if (InputFilenames.empty())
     InputFilenames.push_back("a.out");
 
   MoreThanOneFile = InputFilenames.size() > 1;
diff --git a/tools/llvm-strings/llvm-strings.cpp b/tools/llvm-strings/llvm-strings.cpp
index c355caf899d55b557313ba53b18ee2788a21a3f5..cdc2a6ef033b8e1e2fed7988aad92a88036430b5 100644
--- a/tools/llvm-strings/llvm-strings.cpp
+++ b/tools/llvm-strings/llvm-strings.cpp
@@ -60,21 +60,21 @@ static void strings(raw_ostream &OS, StringRef FileName, StringRef Contents) {
     if (L.size() < static_cast<size_t>(MinLength))
       return;
     if (PrintFileName)
-      OS << FileName << ":";
+      OS << FileName << ": ";
     switch (Radix) {
     case none:
       break;
     case octal:
-      OS << format("%8o", Offset);
+      OS << format("%7o ", Offset);
       break;
     case hexadecimal:
-      OS << format("%8x", Offset);
+      OS << format("%7x ", Offset);
       break;
     case decimal:
-      OS << format("%8u", Offset);
+      OS << format("%7u ", Offset);
       break;
     }
-    OS << " " << L << '\n';
+    OS << L << '\n';
   };
 
   const char *B = Contents.begin();
diff --git a/tools/llvm-undname/llvm-undname.cpp b/tools/llvm-undname/llvm-undname.cpp
index 31d0590af2ddcf5e5abd887ead4004c47cd8c7d1..60520c8f7be0c521bb4764432807a423e24e6568 100644
--- a/tools/llvm-undname/llvm-undname.cpp
+++ b/tools/llvm-undname/llvm-undname.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/Process.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdio>
 #include <cstring>
@@ -44,7 +45,7 @@ static void demangle(const std::string &S) {
     outs() << ResultBuf << "\n";
     outs().flush();
   } else {
-    errs() << "Error: Invalid mangled name\n";
+    WithColor::error() << "Invalid mangled name\n";
   }
   std::free(ResultBuf);
 }
diff --git a/tools/llvm-xray/xray-account.cpp b/tools/llvm-xray/xray-account.cpp
index 3f01605fd85242e5f3246d5ca1fe7a470cb27bdd..9985c9adcf6c1073ad1e1d2f2f28d98b89d7ffc2 100644
--- a/tools/llvm-xray/xray-account.cpp
+++ b/tools/llvm-xray/xray-account.cpp
@@ -259,9 +259,18 @@ ResultRow getStats(std::vector<uint64_t> &Timings) {
 
 } // namespace
 
+using TupleType = std::tuple<int32_t, uint64_t, ResultRow>;
+
+template <typename F>
+static void sortByKey(std::vector<TupleType> &Results, F Fn) {
+  bool ASC = AccountSortOrder == SortDirection::ASCENDING;
+  llvm::sort(Results, [=](const TupleType &L, const TupleType &R) {
+    return ASC ? Fn(L) < Fn(R) : Fn(L) > Fn(R);
+  });
+}
+
 template <class F>
 void LatencyAccountant::exportStats(const XRayFileHeader &Header, F Fn) const {
-  using TupleType = std::tuple<int32_t, uint64_t, ResultRow>;
   std::vector<TupleType> Results;
   Results.reserve(FunctionLatencies.size());
   for (auto FT : FunctionLatencies) {
@@ -286,77 +295,31 @@ void LatencyAccountant::exportStats(const XRayFileHeader &Header, F Fn) const {
   // Sort the data according to user-provided flags.
   switch (AccountSortOutput) {
   case SortField::FUNCID:
-    llvm::sort(Results, [](const TupleType &L, const TupleType &R) {
-      if (AccountSortOrder == SortDirection::ASCENDING)
-        return std::get<0>(L) < std::get<0>(R);
-      if (AccountSortOrder == SortDirection::DESCENDING)
-        return std::get<0>(L) > std::get<0>(R);
-      llvm_unreachable("Unknown sort direction");
-    });
+    sortByKey(Results, [](const TupleType &X) { return std::get<0>(X); });
     break;
   case SortField::COUNT:
-    llvm::sort(Results, [](const TupleType &L, const TupleType &R) {
-      if (AccountSortOrder == SortDirection::ASCENDING)
-        return std::get<1>(L) < std::get<1>(R);
-      if (AccountSortOrder == SortDirection::DESCENDING)
-        return std::get<1>(L) > std::get<1>(R);
-      llvm_unreachable("Unknown sort direction");
-    });
+    sortByKey(Results, [](const TupleType &X) { return std::get<1>(X); });
     break;
-  default:
-    // Here we need to look into the ResultRow for the rest of the data that
-    // we want to sort by.
-    llvm::sort(Results, [&](const TupleType &L, const TupleType &R) {
-      auto &LR = std::get<2>(L);
-      auto &RR = std::get<2>(R);
-      switch (AccountSortOutput) {
-      case SortField::COUNT:
-        if (AccountSortOrder == SortDirection::ASCENDING)
-          return LR.Count < RR.Count;
-        if (AccountSortOrder == SortDirection::DESCENDING)
-          return LR.Count > RR.Count;
-        llvm_unreachable("Unknown sort direction");
-      case SortField::MIN:
-        if (AccountSortOrder == SortDirection::ASCENDING)
-          return LR.Min < RR.Min;
-        if (AccountSortOrder == SortDirection::DESCENDING)
-          return LR.Min > RR.Min;
-        llvm_unreachable("Unknown sort direction");
-      case SortField::MED:
-        if (AccountSortOrder == SortDirection::ASCENDING)
-          return LR.Median < RR.Median;
-        if (AccountSortOrder == SortDirection::DESCENDING)
-          return LR.Median > RR.Median;
-        llvm_unreachable("Unknown sort direction");
-      case SortField::PCT90:
-        if (AccountSortOrder == SortDirection::ASCENDING)
-          return LR.Pct90 < RR.Pct90;
-        if (AccountSortOrder == SortDirection::DESCENDING)
-          return LR.Pct90 > RR.Pct90;
-        llvm_unreachable("Unknown sort direction");
-      case SortField::PCT99:
-        if (AccountSortOrder == SortDirection::ASCENDING)
-          return LR.Pct99 < RR.Pct99;
-        if (AccountSortOrder == SortDirection::DESCENDING)
-          return LR.Pct99 > RR.Pct99;
-        llvm_unreachable("Unknown sort direction");
-      case SortField::MAX:
-        if (AccountSortOrder == SortDirection::ASCENDING)
-          return LR.Max < RR.Max;
-        if (AccountSortOrder == SortDirection::DESCENDING)
-          return LR.Max > RR.Max;
-        llvm_unreachable("Unknown sort direction");
-      case SortField::SUM:
-        if (AccountSortOrder == SortDirection::ASCENDING)
-          return LR.Sum < RR.Sum;
-        if (AccountSortOrder == SortDirection::DESCENDING)
-          return LR.Sum > RR.Sum;
-        llvm_unreachable("Unknown sort direction");
-      default:
-        llvm_unreachable("Unsupported sort order");
-      }
-    });
+  case SortField::MIN:
+    sortByKey(Results, [](const TupleType &X) { return std::get<2>(X).Min; });
+    break;
+  case SortField::MED:
+    sortByKey(Results, [](const TupleType &X) { return std::get<2>(X).Median; });
+    break;
+  case SortField::PCT90:
+    sortByKey(Results, [](const TupleType &X) { return std::get<2>(X).Pct90; });
+    break;
+  case SortField::PCT99:
+    sortByKey(Results, [](const TupleType &X) { return std::get<2>(X).Pct99; });
+    break;
+  case SortField::MAX:
+    sortByKey(Results, [](const TupleType &X) { return std::get<2>(X).Max; });
+    break;
+  case SortField::SUM:
+    sortByKey(Results, [](const TupleType &X) { return std::get<2>(X).Sum; });
     break;
+  case SortField::FUNC:
+    llvm_unreachable("Not implemented");
   }
 
   if (AccountTop > 0) {
diff --git a/tools/llvm-xray/xray-stacks.cpp b/tools/llvm-xray/xray-stacks.cpp
index 059940b7756ff2b428d85c352c050118bccce6dc..d3af9e25e6f28eafe3ec5447be8aaa11265aa947 100644
--- a/tools/llvm-xray/xray-stacks.cpp
+++ b/tools/llvm-xray/xray-stacks.cpp
@@ -737,7 +737,7 @@ static CommandRegistration Unused(&Stack, []() -> Error {
                 Twine("Failed loading input file '") + Filename + "'",
                 std::make_error_code(std::errc::invalid_argument)),
             TraceOrErr.takeError());
-      logAllUnhandledErrors(TraceOrErr.takeError(), errs(), "");
+      logAllUnhandledErrors(TraceOrErr.takeError(), errs());
       continue;
     }
     auto &T = *TraceOrErr;
diff --git a/tools/obj2yaml/coff2yaml.cpp b/tools/obj2yaml/coff2yaml.cpp
index e7e9e19fd3dc1b95c33155a8112b4676b32d9b5a..6835dcf71ffec81939dab384a29b633662cd5bfb 100644
--- a/tools/obj2yaml/coff2yaml.cpp
+++ b/tools/obj2yaml/coff2yaml.cpp
@@ -147,7 +147,7 @@ void COFFDumper::dumpSections(unsigned NumSections) {
     COFFYAML::Section NewYAMLSection;
     ObjSection.getName(NewYAMLSection.Name);
     NewYAMLSection.Header.Characteristics = COFFSection->Characteristics;
-    NewYAMLSection.Header.VirtualAddress = ObjSection.getAddress();
+    NewYAMLSection.Header.VirtualAddress = COFFSection->VirtualAddress;
     NewYAMLSection.Header.VirtualSize = COFFSection->VirtualSize;
     NewYAMLSection.Header.NumberOfLineNumbers =
         COFFSection->NumberOfLinenumbers;
@@ -188,7 +188,7 @@ void COFFDumper::dumpSections(unsigned NumSections) {
       if (!SymbolNameOrErr) {
        std::string Buf;
        raw_string_ostream OS(Buf);
-       logAllUnhandledErrors(SymbolNameOrErr.takeError(), OS, "");
+       logAllUnhandledErrors(SymbolNameOrErr.takeError(), OS);
        OS.flush();
        report_fatal_error(Buf);
       }
diff --git a/tools/obj2yaml/dwarf2yaml.cpp b/tools/obj2yaml/dwarf2yaml.cpp
index 91cec8b7c6ca0b70a6ddd6d8d836281e3b51e666..1ce6c036e6f5f5a5c7b8136f3731a248c1c2dc20 100644
--- a/tools/obj2yaml/dwarf2yaml.cpp
+++ b/tools/obj2yaml/dwarf2yaml.cpp
@@ -81,8 +81,9 @@ void dumpDebugARanges(DWARFContext &DCtx, DWARFYAML::Data &Y) {
 }
 
 void dumpPubSection(DWARFContext &DCtx, DWARFYAML::PubSection &Y,
-                    StringRef Section) {
-  DataExtractor PubSectionData(Section, DCtx.isLittleEndian(), 0);
+                    DWARFSection Section) {
+  DWARFDataExtractor PubSectionData(DCtx.getDWARFObj(), Section,
+                                    DCtx.isLittleEndian(), 0);
   uint32_t Offset = 0;
   dumpInitialLength(PubSectionData, Offset, Y.Length);
   Y.Version = PubSectionData.getU16(&Offset);
diff --git a/tools/obj2yaml/obj2yaml.cpp b/tools/obj2yaml/obj2yaml.cpp
index 76786158d9898169cef66e0d4c580b649118c26b..459572a57344db7368c0c6f05cb8743e9989ec22 100644
--- a/tools/obj2yaml/obj2yaml.cpp
+++ b/tools/obj2yaml/obj2yaml.cpp
@@ -50,7 +50,7 @@ static void reportError(StringRef Input, Error Err) {
     Input = "<stdin>";
   std::string ErrMsg;
   raw_string_ostream OS(ErrMsg);
-  logAllUnhandledErrors(std::move(Err), OS, "");
+  logAllUnhandledErrors(std::move(Err), OS);
   OS.flush();
   errs() << "Error reading file: " << Input << ": " << ErrMsg;
   errs().flush();
diff --git a/tools/obj2yaml/wasm2yaml.cpp b/tools/obj2yaml/wasm2yaml.cpp
index a1120a96571a2b4ba719b6277ed8243cc7e2182b..7581bbef50ad602b31a5c60d1eadc38dac1b541f 100644
--- a/tools/obj2yaml/wasm2yaml.cpp
+++ b/tools/obj2yaml/wasm2yaml.cpp
@@ -52,7 +52,17 @@ static WasmYAML::Limits make_limits(const wasm::WasmLimits &Limits) {
 std::unique_ptr<WasmYAML::CustomSection>
 WasmDumper::dumpCustomSection(const WasmSection &WasmSec) {
   std::unique_ptr<WasmYAML::CustomSection> CustomSec;
-  if (WasmSec.Name == "name") {
+  if (WasmSec.Name == "dylink") {
+    std::unique_ptr<WasmYAML::DylinkSection> DylinkSec =
+        make_unique<WasmYAML::DylinkSection>();
+    const wasm::WasmDylinkInfo& Info = Obj.dylinkInfo();
+    DylinkSec->MemorySize = Info.MemorySize;
+    DylinkSec->MemoryAlignment = Info.MemoryAlignment;
+    DylinkSec->TableSize = Info.TableSize;
+    DylinkSec->TableAlignment = Info.TableAlignment;
+    DylinkSec->Needed = Info.Needed;
+    CustomSec = std::move(DylinkSec);
+  } else if (WasmSec.Name == "name") {
     std::unique_ptr<WasmYAML::NameSection> NameSec =
         make_unique<WasmYAML::NameSection>();
     for (const llvm::wasm::WasmFunctionName &Func : Obj.debugNames()) {
@@ -107,6 +117,7 @@ WasmDumper::dumpCustomSection(const WasmSection &WasmSec) {
         break;
       case wasm::WASM_SYMBOL_TYPE_FUNCTION:
       case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+      case wasm::WASM_SYMBOL_TYPE_EVENT:
         Info.ElementIndex = Symbol.ElementIndex;
         break;
       case wasm::WASM_SYMBOL_TYPE_SECTION:
@@ -182,6 +193,10 @@ ErrorOr<WasmYAML::Object *> WasmDumper::dump() {
           Im.GlobalImport.Type = Import.Global.Type;
           Im.GlobalImport.Mutable = Import.Global.Mutable;
           break;
+        case wasm::WASM_EXTERNAL_EVENT:
+          Im.EventImport.Attribute = Import.Event.Attribute;
+          Im.EventImport.SigIndex = Import.Event.SigIndex;
+          break;
         case wasm::WASM_EXTERNAL_TABLE:
           Im.TableImport = make_table(Import.Table);
           break;
@@ -231,6 +246,18 @@ ErrorOr<WasmYAML::Object *> WasmDumper::dump() {
       S = std::move(GlobalSec);
       break;
     }
+    case wasm::WASM_SEC_EVENT: {
+      auto EventSec = make_unique<WasmYAML::EventSection>();
+      for (auto &Event : Obj.events()) {
+        WasmYAML::Event E;
+        E.Index = Event.Index;
+        E.Attribute = Event.Type.Attribute;
+        E.SigIndex = Event.Type.SigIndex;
+        EventSec->Events.push_back(E);
+      }
+      S = std::move(EventSec);
+      break;
+    }
     case wasm::WASM_SEC_START: {
       auto StartSec = make_unique<WasmYAML::StartSection>();
       StartSec->StartFunction = Obj.startFunction();
diff --git a/tools/opt/Debugify.cpp b/tools/opt/Debugify.cpp
index 6c3cdc75e3342e50086bbf3ba29da8087c60446e..3b1effba1592226f2500ff806e03c05be06fd515 100644
--- a/tools/opt/Debugify.cpp
+++ b/tools/opt/Debugify.cpp
@@ -96,11 +96,12 @@ bool applyDebugifyMetadata(Module &M,
       continue;
 
     auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
-    bool IsLocalToUnit = F.hasPrivateLinkage() || F.hasInternalLinkage();
-    auto SP =
-        DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine, SPType,
-                           IsLocalToUnit, /*isDefinition=*/true, NextLine,
-                           DINode::FlagZero, /*isOptimized=*/true);
+    DISubprogram::DISPFlags SPFlags =
+        DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized;
+    if (F.hasPrivateLinkage() || F.hasInternalLinkage())
+      SPFlags |= DISubprogram::SPFlagLocalToUnit;
+    auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine,
+                                 SPType, NextLine, DINode::FlagZero, SPFlags);
     F.setSubprogram(SP);
     for (BasicBlock &BB : F) {
       // Attach debug locations.
diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp
index e2f9a06523a81c608f6f14c5d5ff5bae361c34ea..211a3b151fe1ee4830a9bb35b67c62a35820ac1b 100644
--- a/tools/opt/NewPMDriver.cpp
+++ b/tools/opt/NewPMDriver.cpp
@@ -95,6 +95,12 @@ static cl::opt<std::string> PipelineStartEPPipeline(
     cl::desc("A textual description of the function pass pipeline inserted at "
              "the PipelineStart extension point into default pipelines"),
     cl::Hidden);
+static cl::opt<std::string> OptimizerLastEPPipeline(
+    "passes-ep-optimizer-last",
+    cl::desc("A textual description of the function pass pipeline inserted at "
+             "the OptimizerLast extension point into default pipelines"),
+    cl::Hidden);
+
 enum PGOKind { NoPGO, InstrGen, InstrUse, SampleUse };
 static cl::opt<PGOKind> PGOKindFlag(
     "pgo-kind", cl::init(NoPGO), cl::Hidden,
@@ -195,6 +201,14 @@ static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
           Err(PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
                                    DebugLogging));
         });
+  if (tryParsePipelineText<FunctionPassManager>(PB, OptimizerLastEPPipeline))
+    PB.registerOptimizerLastEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](FunctionPassManager &PM,
+                                            PassBuilder::OptimizationLevel) {
+          ExitOnError Err("Unable to parse OptimizerLastEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, OptimizerLastEPPipeline, VerifyEachPass,
+                                   DebugLogging));
+        });
 }
 
 #ifdef LINK_POLLY_INTO_TOOLS
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 6e287b6c0ab6959b9af6d51f01a0e6d321736481..266603992aa905ebbb961445d78e0cee6d635d5c 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -463,6 +463,7 @@ int main(int argc, char **argv) {
   initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
   initializeGlobalMergePass(Registry);
   initializeIndirectBrExpandPassPass(Registry);
+  initializeInterleavedLoadCombinePass(Registry);
   initializeInterleavedAccessPass(Registry);
   initializeEntryExitInstrumenterPass(Registry);
   initializePostInlineEntryExitInstrumenterPass(Registry);
diff --git a/tools/sanstats/sanstats.cpp b/tools/sanstats/sanstats.cpp
index 71f0207bab50dd6fa388308ee4f0df455fa3a91c..a345e9f06bedd8b5e98b883424725f3ca320c0cc 100644
--- a/tools/sanstats/sanstats.cpp
+++ b/tools/sanstats/sanstats.cpp
@@ -15,7 +15,9 @@
 #include "llvm/DebugInfo/Symbolize/Symbolize.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Transforms/Utils/SanitizerStats.h"
 #include <stdint.h>
 
@@ -52,7 +54,11 @@ const char *ReadModule(char SizeofPtr, const char *Begin, const char *End) {
     ++Begin;
   if (Begin == End)
     return nullptr;
-  StringRef Filename(FilenameBegin, Begin - FilenameBegin);
+  std::string Filename(FilenameBegin, Begin - FilenameBegin);
+
+  if (!llvm::sys::fs::exists(Filename))
+    Filename = std::string(llvm::sys::path::parent_path(ClInputFile)) +
+               std::string(llvm::sys::path::filename(Filename));
 
   ++Begin;
   if (Begin == End)
@@ -81,8 +87,9 @@ const char *ReadModule(char SizeofPtr, const char *Begin, const char *End) {
     // remove one from the address to get the correct DI.
     if (Expected<DILineInfo> LineInfo =
             Symbolizer.symbolizeCode(Filename, Addr - 1)) {
-      llvm::outs() << LineInfo->FileName << ':' << LineInfo->Line << ' '
-                   << LineInfo->FunctionName << ' ';
+      llvm::outs() << format_hex(Addr - 1, 18) << ' ' << LineInfo->FileName
+                   << ':' << LineInfo->Line << ' ' << LineInfo->FunctionName
+                   << ' ';
     } else {
       logAllUnhandledErrors(LineInfo.takeError(), llvm::outs(), "<error> ");
     }
diff --git a/tools/yaml2obj/yaml2coff.cpp b/tools/yaml2obj/yaml2coff.cpp
index befa01b369c0427ea035a08532df07b91ae867ba..f9d255e1ad20ee09a9a9667c9a593e8ad1587e2e 100644
--- a/tools/yaml2obj/yaml2coff.cpp
+++ b/tools/yaml2obj/yaml2coff.cpp
@@ -46,7 +46,8 @@ struct COFFParser {
 
   bool isPE() const { return Obj.OptionalHeader.hasValue(); }
   bool is64Bit() const {
-    return Obj.Header.Machine == COFF::IMAGE_FILE_MACHINE_AMD64;
+    return Obj.Header.Machine == COFF::IMAGE_FILE_MACHINE_AMD64 ||
+           Obj.Header.Machine == COFF::IMAGE_FILE_MACHINE_ARM64;
   }
 
   uint32_t getFileAlignment() const {
diff --git a/tools/yaml2obj/yaml2elf.cpp b/tools/yaml2obj/yaml2elf.cpp
index 28ffa885080e0d1f834c3edd3afaded8692b4fd3..672d1540daf058b8f07fbcadf40630d7d34a80df 100644
--- a/tools/yaml2obj/yaml2elf.cpp
+++ b/tools/yaml2obj/yaml2elf.cpp
@@ -226,6 +226,16 @@ void ELFState<ELFT>::initProgramHeaders(std::vector<Elf_Phdr> &PHeaders) {
   }
 }
 
+static bool convertSectionIndex(NameToIdxMap &SN2I, StringRef SecName,
+                                StringRef IndexSrc, unsigned &IndexDest) {
+  if (SN2I.lookup(IndexSrc, IndexDest) && !to_integer(IndexSrc, IndexDest)) {
+    WithColor::error() << "Unknown section referenced: '" << IndexSrc
+                       << "' at YAML section '" << SecName << "'.\n";
+    return false;
+  }
+  return true;
+}
+
 template <class ELFT>
 bool ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
                                         ContiguousBlobAccumulator &CBA) {
@@ -245,11 +255,8 @@ bool ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
 
     if (!Sec->Link.empty()) {
       unsigned Index;
-      if (SN2I.lookup(Sec->Link, Index) && !to_integer(Sec->Link, Index)) {
-        WithColor::error() << "Unknown section referenced: '" << Sec->Link
-                           << "' at YAML section '" << Sec->Name << "'.\n";
+      if (!convertSectionIndex(SN2I, Sec->Name, Sec->Link, Index))
         return false;
-      }
       SHeader.sh_link = Index;
     }
 
@@ -261,13 +268,9 @@ bool ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
         SHeader.sh_link = getDotSymTabSecNo();
 
       unsigned Index;
-      if (SN2I.lookup(S->Info, Index) && !to_integer(S->Info, Index)) {
-        WithColor::error() << "Unknown section referenced: '" << S->Info
-                           << "' at YAML section '" << S->Name << "'.\n";
+      if (!convertSectionIndex(SN2I, S->Name, S->Info, Index))
         return false;
-      }
       SHeader.sh_info = Index;
-
       if (!writeSectionContent(SHeader, *S, CBA))
         return false;
     } else if (auto S = dyn_cast<ELFYAML::Group>(Sec.get())) {
@@ -535,13 +538,9 @@ bool ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
     unsigned int sectionIndex = 0;
     if (member.sectionNameOrType == "GRP_COMDAT")
       sectionIndex = llvm::ELF::GRP_COMDAT;
-    else if (SN2I.lookup(member.sectionNameOrType, sectionIndex) &&
-             !to_integer(member.sectionNameOrType, sectionIndex)) {
-      WithColor::error() << "Unknown section referenced: '"
-                         << member.sectionNameOrType << "' at YAML section' "
-                         << Section.Name << "\n";
+    else if (!convertSectionIndex(SN2I, Section.Name, member.sectionNameOrType,
+                                  sectionIndex))
       return false;
-    }
     SIdx = sectionIndex;
     OS.write((const char *)&SIdx, sizeof(SIdx));
   }
@@ -686,7 +685,8 @@ template <class ELFT> bool ELFState<ELFT>::hasDynamicSymbols() const {
          Doc.DynamicSymbols.Local.size() > 0;
 }
 
-template <class ELFT> SmallVector<const char *, 5> ELFState<ELFT>::implicitSectionNames() const {
+template <class ELFT>
+SmallVector<const char *, 5> ELFState<ELFT>::implicitSectionNames() const {
   if (!hasDynamicSymbols())
     return {".symtab", ".strtab", ".shstrtab"};
   return {".symtab", ".strtab", ".shstrtab", ".dynsym", ".dynstr"};
diff --git a/tools/yaml2obj/yaml2wasm.cpp b/tools/yaml2obj/yaml2wasm.cpp
index 2b1e256f8f6488b9af8eab0cfe8512c1d88e9aec..e8d998d991468e91661db0dca93bc88174aa5c38 100644
--- a/tools/yaml2obj/yaml2wasm.cpp
+++ b/tools/yaml2obj/yaml2wasm.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 //
 
+#include "llvm/Object/Wasm.h"
 #include "llvm/ObjectYAML/ObjectYAML.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/LEB128.h"
@@ -37,6 +38,7 @@ private:
   int writeSectionContent(raw_ostream &OS, WasmYAML::TableSection &Section);
   int writeSectionContent(raw_ostream &OS, WasmYAML::MemorySection &Section);
   int writeSectionContent(raw_ostream &OS, WasmYAML::GlobalSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::EventSection &Section);
   int writeSectionContent(raw_ostream &OS, WasmYAML::ExportSection &Section);
   int writeSectionContent(raw_ostream &OS, WasmYAML::StartSection &Section);
   int writeSectionContent(raw_ostream &OS, WasmYAML::ElemSection &Section);
@@ -44,11 +46,13 @@ private:
   int writeSectionContent(raw_ostream &OS, WasmYAML::DataSection &Section);
 
   // Custom section types
+  int writeSectionContent(raw_ostream &OS, WasmYAML::DylinkSection &Section);
   int writeSectionContent(raw_ostream &OS, WasmYAML::NameSection &Section);
   int writeSectionContent(raw_ostream &OS, WasmYAML::LinkingSection &Section);
   WasmYAML::Object &Obj;
   uint32_t NumImportedFunctions = 0;
   uint32_t NumImportedGlobals = 0;
+  uint32_t NumImportedEvents = 0;
 };
 
 static int writeUint64(raw_ostream &OS, uint64_t Value) {
@@ -130,6 +134,20 @@ public:
   raw_ostream &GetStream() { return StringStream; }
 };
 
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::DylinkSection &Section) {
+  writeStringRef(Section.Name, OS);
+  encodeULEB128(Section.MemorySize, OS);
+  encodeULEB128(Section.MemoryAlignment, OS);
+  encodeULEB128(Section.TableSize, OS);
+  encodeULEB128(Section.TableAlignment, OS);
+  encodeULEB128(Section.Needed.size(), OS);
+  for (StringRef Needed : Section.Needed) {
+    writeStringRef(Needed, OS);
+  }
+  return 0;
+}
+
 int WasmWriter::writeSectionContent(raw_ostream &OS,
                                     WasmYAML::LinkingSection &Section) {
   writeStringRef(Section.Name, OS);
@@ -152,6 +170,7 @@ int WasmWriter::writeSectionContent(raw_ostream &OS,
       switch (Info.Kind) {
       case wasm::WASM_SYMBOL_TYPE_FUNCTION:
       case wasm::WASM_SYMBOL_TYPE_GLOBAL:
+      case wasm::WASM_SYMBOL_TYPE_EVENT:
         encodeULEB128(Info.ElementIndex, SubSection.GetStream());
         if ((Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0)
           writeStringRef(Info.Name, SubSection.GetStream());
@@ -238,7 +257,10 @@ int WasmWriter::writeSectionContent(raw_ostream &OS,
 
 int WasmWriter::writeSectionContent(raw_ostream &OS,
                                     WasmYAML::CustomSection &Section) {
-  if (auto S = dyn_cast<WasmYAML::NameSection>(&Section)) {
+  if (auto S = dyn_cast<WasmYAML::DylinkSection>(&Section)) {
+    if (auto Err = writeSectionContent(OS, *S))
+      return Err;
+  } else if (auto S = dyn_cast<WasmYAML::NameSection>(&Section)) {
     if (auto Err = writeSectionContent(OS, *S))
       return Err;
   } else if (auto S = dyn_cast<WasmYAML::LinkingSection>(&Section)) {
@@ -292,6 +314,11 @@ int WasmWriter::writeSectionContent(raw_ostream &OS,
       writeUint8(OS, Import.GlobalImport.Mutable);
       NumImportedGlobals++;
       break;
+    case wasm::WASM_EXTERNAL_EVENT:
+      writeUint32(OS, Import.EventImport.Attribute);
+      writeUint32(OS, Import.EventImport.SigIndex);
+      NumImportedGlobals++;
+      break;
     case wasm::WASM_EXTERNAL_MEMORY:
       writeLimits(Import.Memory, OS);
       break;
@@ -369,6 +396,22 @@ int WasmWriter::writeSectionContent(raw_ostream &OS,
   return 0;
 }
 
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::EventSection &Section) {
+  encodeULEB128(Section.Events.size(), OS);
+  uint32_t ExpectedIndex = NumImportedEvents;
+  for (auto &Event : Section.Events) {
+    if (Event.Index != ExpectedIndex) {
+      errs() << "Unexpected event index: " << Event.Index << "\n";
+      return 1;
+    }
+    ++ExpectedIndex;
+    encodeULEB128(Event.Attribute, OS);
+    encodeULEB128(Event.SigIndex, OS);
+  }
+  return 0;
+}
+
 int WasmWriter::writeSectionContent(raw_ostream &OS,
                                     WasmYAML::ElemSection &Section) {
   encodeULEB128(Section.Segments.size(), OS);
@@ -474,17 +517,15 @@ int WasmWriter::writeWasm(raw_ostream &OS) {
   writeUint32(OS, Obj.Header.Version);
 
   // Write each section
-  uint32_t LastType = 0;
+  llvm::object::WasmSectionOrderChecker Checker;
   for (const std::unique_ptr<WasmYAML::Section> &Sec : Obj.Sections) {
-    uint32_t Type = Sec->Type;
-    if (Type != wasm::WASM_SEC_CUSTOM) {
-      if (Type < LastType) {
-        errs() << "Out of order section type: " << Type << "\n";
-        return 1;
-      }
-      LastType = Type;
+    StringRef SecName = "";
+    if (auto S = dyn_cast<WasmYAML::CustomSection>(Sec.get()))
+      SecName = S->Name;
+    if (!Checker.isValidSectionOrder(Sec->Type, SecName)) {
+      errs() << "Out of order section type: " << Sec->Type << "\n";
+      return 1;
     }
-
     encodeULEB128(Sec->Type, OS);
     std::string OutString;
     raw_string_ostream StringStream(OutString);
@@ -509,6 +550,9 @@ int WasmWriter::writeWasm(raw_ostream &OS) {
     } else if (auto S = dyn_cast<WasmYAML::GlobalSection>(Sec.get())) {
       if (auto Err = writeSectionContent(StringStream, *S))
         return Err;
+    } else if (auto S = dyn_cast<WasmYAML::EventSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
     } else if (auto S = dyn_cast<WasmYAML::ExportSection>(Sec.get())) {
       if (auto Err = writeSectionContent(StringStream, *S))
         return Err;
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index b739e857849d8fae1ee83f61c8b2a7ef47a91e55..64053a811a35f6ebee2897dbbfdfa38a7367d269 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -1070,33 +1070,49 @@ TEST(APFloatTest, toInteger) {
   EXPECT_EQ(APSInt::getMaxValue(5, false), result);
 }
 
-static APInt nanbits(const fltSemantics &Sem,
-                     bool SNaN, bool Negative, uint64_t fill) {
-  APInt apfill(64, fill);
+static APInt nanbitsFromAPInt(const fltSemantics &Sem, bool SNaN, bool Negative,
+                              uint64_t payload) {
+  APInt appayload(64, payload);
   if (SNaN)
-    return APFloat::getSNaN(Sem, Negative, &apfill).bitcastToAPInt();
+    return APFloat::getSNaN(Sem, Negative, &appayload).bitcastToAPInt();
   else
-    return APFloat::getQNaN(Sem, Negative, &apfill).bitcastToAPInt();
+    return APFloat::getQNaN(Sem, Negative, &appayload).bitcastToAPInt();
 }
 
 TEST(APFloatTest, makeNaN) {
-  ASSERT_EQ(0x7fc00000, nanbits(APFloat::IEEEsingle(), false, false, 0));
-  ASSERT_EQ(0xffc00000, nanbits(APFloat::IEEEsingle(), false, true, 0));
-  ASSERT_EQ(0x7fc0ae72, nanbits(APFloat::IEEEsingle(), false, false, 0xae72));
-  ASSERT_EQ(0x7fffae72, nanbits(APFloat::IEEEsingle(), false, false, 0xffffae72));
-  ASSERT_EQ(0x7fa00000, nanbits(APFloat::IEEEsingle(), true, false, 0));
-  ASSERT_EQ(0xffa00000, nanbits(APFloat::IEEEsingle(), true, true, 0));
-  ASSERT_EQ(0x7f80ae72, nanbits(APFloat::IEEEsingle(), true, false, 0xae72));
-  ASSERT_EQ(0x7fbfae72, nanbits(APFloat::IEEEsingle(), true, false, 0xffffae72));
-
-  ASSERT_EQ(0x7ff8000000000000ULL, nanbits(APFloat::IEEEdouble(), false, false, 0));
-  ASSERT_EQ(0xfff8000000000000ULL, nanbits(APFloat::IEEEdouble(), false, true, 0));
-  ASSERT_EQ(0x7ff800000000ae72ULL, nanbits(APFloat::IEEEdouble(), false, false, 0xae72));
-  ASSERT_EQ(0x7fffffffffffae72ULL, nanbits(APFloat::IEEEdouble(), false, false, 0xffffffffffffae72ULL));
-  ASSERT_EQ(0x7ff4000000000000ULL, nanbits(APFloat::IEEEdouble(), true, false, 0));
-  ASSERT_EQ(0xfff4000000000000ULL, nanbits(APFloat::IEEEdouble(), true, true, 0));
-  ASSERT_EQ(0x7ff000000000ae72ULL, nanbits(APFloat::IEEEdouble(), true, false, 0xae72));
-  ASSERT_EQ(0x7ff7ffffffffae72ULL, nanbits(APFloat::IEEEdouble(), true, false, 0xffffffffffffae72ULL));
+  const struct {
+    uint64_t expected;
+    const fltSemantics &semantics;
+    bool SNaN;
+    bool Negative;
+    uint64_t payload;
+  } tests[] = {
+    /*             expected              semantics   SNaN    Neg                payload */
+    {         0x7fc00000ULL, APFloat::IEEEsingle(), false, false,         0x00000000ULL },
+    {         0xffc00000ULL, APFloat::IEEEsingle(), false,  true,         0x00000000ULL },
+    {         0x7fc0ae72ULL, APFloat::IEEEsingle(), false, false,         0x0000ae72ULL },
+    {         0x7fffae72ULL, APFloat::IEEEsingle(), false, false,         0xffffae72ULL },
+    {         0x7fdaae72ULL, APFloat::IEEEsingle(), false, false,         0x00daae72ULL },
+    {         0x7fa00000ULL, APFloat::IEEEsingle(),  true, false,         0x00000000ULL },
+    {         0xffa00000ULL, APFloat::IEEEsingle(),  true,  true,         0x00000000ULL },
+    {         0x7f80ae72ULL, APFloat::IEEEsingle(),  true, false,         0x0000ae72ULL },
+    {         0x7fbfae72ULL, APFloat::IEEEsingle(),  true, false,         0xffffae72ULL },
+    {         0x7f9aae72ULL, APFloat::IEEEsingle(),  true, false,         0x001aae72ULL },
+    { 0x7ff8000000000000ULL, APFloat::IEEEdouble(), false, false, 0x0000000000000000ULL },
+    { 0xfff8000000000000ULL, APFloat::IEEEdouble(), false,  true, 0x0000000000000000ULL },
+    { 0x7ff800000000ae72ULL, APFloat::IEEEdouble(), false, false, 0x000000000000ae72ULL },
+    { 0x7fffffffffffae72ULL, APFloat::IEEEdouble(), false, false, 0xffffffffffffae72ULL },
+    { 0x7ffdaaaaaaaaae72ULL, APFloat::IEEEdouble(), false, false, 0x000daaaaaaaaae72ULL },
+    { 0x7ff4000000000000ULL, APFloat::IEEEdouble(),  true, false, 0x0000000000000000ULL },
+    { 0xfff4000000000000ULL, APFloat::IEEEdouble(),  true,  true, 0x0000000000000000ULL },
+    { 0x7ff000000000ae72ULL, APFloat::IEEEdouble(),  true, false, 0x000000000000ae72ULL },
+    { 0x7ff7ffffffffae72ULL, APFloat::IEEEdouble(),  true, false, 0xffffffffffffae72ULL },
+    { 0x7ff1aaaaaaaaae72ULL, APFloat::IEEEdouble(),  true, false, 0x0001aaaaaaaaae72ULL },
+  };
+
+  for (const auto &t : tests) {
+    ASSERT_EQ(t.expected, nanbitsFromAPInt(t.semantics, t.SNaN, t.Negative, t.payload));
+  }
 }
 
 #ifdef GTEST_HAS_DEATH_TEST
diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp
index 6bee61c89adb0a736f381f6dbc78fbc0e7d9d93f..8b876f3369576dfd94749acb9cb1df6b3157a87b 100644
--- a/unittests/ADT/APIntTest.cpp
+++ b/unittests/ADT/APIntTest.cpp
@@ -1176,6 +1176,30 @@ TEST(APIntTest, fromString) {
   EXPECT_EQ(APInt(32, uint64_t(-72LL)), APInt(32, "-20", 36));
 }
 
+TEST(APIntTest, SaturatingMath) {
+  APInt AP_10 = APInt(8, 10);
+  APInt AP_100 = APInt(8, 100);
+  APInt AP_200 = APInt(8, 200);
+
+  EXPECT_EQ(APInt(8, 200), AP_100.uadd_sat(AP_100));
+  EXPECT_EQ(APInt(8, 255), AP_100.uadd_sat(AP_200));
+  EXPECT_EQ(APInt(8, 255), APInt(8, 255).uadd_sat(APInt(8, 255)));
+
+  EXPECT_EQ(APInt(8, 110), AP_10.sadd_sat(AP_100));
+  EXPECT_EQ(APInt(8, 127), AP_100.sadd_sat(AP_100));
+  EXPECT_EQ(APInt(8, -128), (-AP_100).sadd_sat(-AP_100));
+  EXPECT_EQ(APInt(8, -128), APInt(8, -128).sadd_sat(APInt(8, -128)));
+
+  EXPECT_EQ(APInt(8, 90), AP_100.usub_sat(AP_10));
+  EXPECT_EQ(APInt(8, 0), AP_100.usub_sat(AP_200));
+  EXPECT_EQ(APInt(8, 0), APInt(8, 0).usub_sat(APInt(8, 255)));
+
+  EXPECT_EQ(APInt(8, -90), AP_10.ssub_sat(AP_100));
+  EXPECT_EQ(APInt(8, 127), AP_100.ssub_sat(-AP_100));
+  EXPECT_EQ(APInt(8, -128), (-AP_100).ssub_sat(AP_100));
+  EXPECT_EQ(APInt(8, -128), APInt(8, -128).ssub_sat(APInt(8, 127)));
+}
+
 TEST(APIntTest, FromArray) {
   EXPECT_EQ(APInt(32, uint64_t(1)), APInt(32, ArrayRef<uint64_t>(1)));
 }
diff --git a/unittests/ADT/BitVectorTest.cpp b/unittests/ADT/BitVectorTest.cpp
index e7ab2f3dbecdf2bea384052a2da4e280546ef7cb..84290b3b854d3e8694f0c87b1c46764433ac2aa3 100644
--- a/unittests/ADT/BitVectorTest.cpp
+++ b/unittests/ADT/BitVectorTest.cpp
@@ -182,13 +182,8 @@ TYPED_TEST(BitVectorTest, TrivialOperation) {
   EXPECT_TRUE(Vec.empty());
 }
 
-TYPED_TEST(BitVectorTest, SimpleFindOps) {
-  // Test finding in an empty BitVector.
+TYPED_TEST(BitVectorTest, SimpleFindOpsMultiWord) {
   TypeParam A;
-  EXPECT_EQ(-1, A.find_first());
-  EXPECT_EQ(-1, A.find_last());
-  EXPECT_EQ(-1, A.find_first_unset());
-  EXPECT_EQ(-1, A.find_last_unset());
 
   // Test finding next set and unset bits in a BitVector with multiple words
   A.resize(100);
@@ -232,12 +227,33 @@ TYPED_TEST(BitVectorTest, SimpleFindOps) {
   EXPECT_EQ(0, A.find_first_unset());
   EXPECT_EQ(99, A.find_last_unset());
   EXPECT_EQ(99, A.find_next_unset(98));
+}
+
+// Check if a SmallBitVector is in small mode. This check is used in tests
+// that run for both SmallBitVector and BitVector. This check doesn't apply
+// to BitVector so we provide an overload that returns true to get the tests
+// to compile.
+static bool SmallBitVectorIsSmallMode(const SmallBitVector &bv) {
+  return bv.isSmall();
+}
+static bool SmallBitVectorIsSmallMode(const BitVector &) { return true; }
+
+// These tests are intended to exercise the single-word case of BitVector
+// and the small-mode case of SmallBitVector.
+TYPED_TEST(BitVectorTest, SimpleFindOpsSingleWord) {
+  // Test finding in an empty BitVector.
+  TypeParam A;
+  ASSERT_TRUE(SmallBitVectorIsSmallMode(A));
+  EXPECT_EQ(-1, A.find_first());
+  EXPECT_EQ(-1, A.find_last());
+  EXPECT_EQ(-1, A.find_first_unset());
+  EXPECT_EQ(-1, A.find_last_unset());
 
-  // Also test with a vector that is small enough to fit in 1 word.
   A.resize(20);
   A.set(3);
   A.set(4);
   A.set(16);
+  ASSERT_TRUE(SmallBitVectorIsSmallMode(A));
   EXPECT_EQ(16, A.find_last());
   EXPECT_EQ(3, A.find_first());
   EXPECT_EQ(3, A.find_next(1));
@@ -431,6 +447,8 @@ TYPED_TEST(BitVectorTest, CompoundAssignment) {
   A &= B;
   EXPECT_FALSE(A.test(2));
   EXPECT_FALSE(A.test(7));
+  EXPECT_TRUE(A.test(4));
+  EXPECT_TRUE(A.test(5));
   EXPECT_EQ(2U, A.count());
   EXPECT_EQ(50U, A.size());
 
@@ -444,6 +462,242 @@ TYPED_TEST(BitVectorTest, CompoundAssignment) {
   EXPECT_EQ(100U, A.size());
 }
 
+// Test SmallBitVector operations with mixed big/small representations
+TYPED_TEST(BitVectorTest, MixedBigSmall) {
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+    Big.set(2);
+    Big.set(16);
+
+    Small &= Big;
+    EXPECT_TRUE(Small.test(0));
+    EXPECT_EQ(1u, Small.count());
+    // FIXME BitVector and SmallBitVector behave differently here.
+    // SmallBitVector resizes the LHS to max(LHS.size(), RHS.size())
+    // but BitVector does not.
+    // EXPECT_EQ(20u, Small.size());
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+    Big.set(2);
+    Big.set(16);
+
+    Big &= Small;
+    EXPECT_TRUE(Big.test(0));
+    EXPECT_EQ(1u, Big.count());
+    // FIXME BitVector and SmallBitVector behave differently here.
+    // SmallBitVector resizes the LHS to max(LHS.size(), RHS.size())
+    // but BitVector does not.
+    // EXPECT_EQ(20u, Big.size());
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+    Big.set(2);
+    Big.set(16);
+
+    Small |= Big;
+    EXPECT_TRUE(Small.test(0));
+    EXPECT_TRUE(Small.test(1));
+    EXPECT_TRUE(Small.test(2));
+    EXPECT_TRUE(Small.test(16));
+    EXPECT_EQ(4u, Small.count());
+    EXPECT_EQ(20u, Small.size());
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+    Big.set(2);
+    Big.set(16);
+
+    Big |= Small;
+    EXPECT_TRUE(Big.test(0));
+    EXPECT_TRUE(Big.test(1));
+    EXPECT_TRUE(Big.test(2));
+    EXPECT_TRUE(Big.test(16));
+    EXPECT_EQ(4u, Big.count());
+    EXPECT_EQ(20u, Big.size());
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+    Big.set(2);
+    Big.set(16);
+
+    Small ^= Big;
+    EXPECT_TRUE(Small.test(1));
+    EXPECT_TRUE(Small.test(2));
+    EXPECT_TRUE(Small.test(16));
+    EXPECT_EQ(3u, Small.count());
+    EXPECT_EQ(20u, Small.size());
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+    Big.set(2);
+    Big.set(16);
+
+    Big ^= Small;
+    EXPECT_TRUE(Big.test(1));
+    EXPECT_TRUE(Big.test(2));
+    EXPECT_TRUE(Big.test(16));
+    EXPECT_EQ(3u, Big.count());
+    EXPECT_EQ(20u, Big.size());
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+    Big.set(2);
+    Big.set(16);
+
+    Small.reset(Big);
+    EXPECT_TRUE(Small.test(1));
+    EXPECT_EQ(1u, Small.count());
+    EXPECT_EQ(10u, Small.size());
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+    Big.set(2);
+    Big.set(16);
+
+    Big.reset(Small);
+    EXPECT_TRUE(Big.test(2));
+    EXPECT_TRUE(Big.test(16));
+    EXPECT_EQ(2u, Big.count());
+    EXPECT_EQ(20u, Big.size());
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(10);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+
+    EXPECT_FALSE(Big == Small);
+    EXPECT_FALSE(Small == Big);
+    Big.set(1);
+    EXPECT_TRUE(Big == Small);
+    EXPECT_TRUE(Small == Big);
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(20);
+    Small.resize(10);
+
+    Small.set(0);
+    Big.set(1);
+
+    EXPECT_FALSE(Small.anyCommon(Big));
+    EXPECT_FALSE(Big.anyCommon(Small));
+    Big.set(0);
+    EXPECT_TRUE(Small.anyCommon(Big));
+    EXPECT_TRUE(Big.anyCommon(Small));
+  }
+
+  {
+    TypeParam Big;
+    TypeParam Small;
+
+    Big.reserve(100);
+    Big.resize(10);
+    Small.resize(10);
+
+    Small.set(0);
+    Small.set(1);
+    Big.set(0);
+
+    EXPECT_TRUE(Small.test(Big));
+    EXPECT_FALSE(Big.test(Small));
+    Big.set(1);
+    EXPECT_FALSE(Small.test(Big));
+    EXPECT_FALSE(Big.test(Small));
+  }
+}
+
 TYPED_TEST(BitVectorTest, ProxyIndex) {
   TypeParam Vec(3);
   EXPECT_TRUE(Vec.none());
diff --git a/unittests/ADT/IteratorTest.cpp b/unittests/ADT/IteratorTest.cpp
index 50c3b01bbc747ad280a5cb305eca8963e370f800..902ddfb49f22a2e21114fd5ff5d0fde37ea61652 100644
--- a/unittests/ADT/IteratorTest.cpp
+++ b/unittests/ADT/IteratorTest.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/ilist.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
@@ -35,6 +36,34 @@ static_assert(std::is_same<typename AdaptedIter::pointer, Shadow<2>>::value,
 static_assert(std::is_same<typename AdaptedIter::reference, Shadow<3>>::value,
               "");
 
+// Ensure that pointe{e,r}_iterator adaptors correctly forward the category of
+// the underlying iterator.
+
+using RandomAccessIter = SmallVectorImpl<int*>::iterator;
+using BidiIter = ilist<int*>::iterator;
+
+template<class T>
+using pointee_iterator_defaulted = pointee_iterator<T>;
+template<class T>
+using pointer_iterator_defaulted = pointer_iterator<T>;
+
+// Ensures that an iterator and its adaptation have the same iterator_category.
+template<template<typename> class A, typename It>
+using IsAdaptedIterCategorySame =
+  std::is_same<typename std::iterator_traits<It>::iterator_category,
+               typename std::iterator_traits<A<It>>::iterator_category>;
+
+// pointeE_iterator
+static_assert(IsAdaptedIterCategorySame<pointee_iterator_defaulted,
+                                        RandomAccessIter>::value, "");
+static_assert(IsAdaptedIterCategorySame<pointee_iterator_defaulted,
+                                        BidiIter>::value, "");
+// pointeR_iterator
+static_assert(IsAdaptedIterCategorySame<pointer_iterator_defaulted,
+                                        RandomAccessIter>::value, "");
+static_assert(IsAdaptedIterCategorySame<pointer_iterator_defaulted,
+                                        BidiIter>::value, "");
+
 TEST(PointeeIteratorTest, Basic) {
   int arr[4] = {1, 2, 3, 4};
   SmallVector<int *, 4> V;
@@ -299,6 +328,40 @@ TEST(ZipIteratorTest, ZipFirstBasic) {
   EXPECT_EQ(iters, 4u);
 }
 
+TEST(ZipIteratorTest, ZipLongestBasic) {
+  using namespace std;
+  const vector<unsigned> pi{3, 1, 4, 1, 5, 9};
+  const vector<StringRef> e{"2", "7", "1", "8"};
+
+  {
+    // Check left range longer than right.
+    const vector<tuple<Optional<unsigned>, Optional<StringRef>>> expected{
+        make_tuple(3, StringRef("2")), make_tuple(1, StringRef("7")),
+        make_tuple(4, StringRef("1")), make_tuple(1, StringRef("8")),
+        make_tuple(5, None),           make_tuple(9, None)};
+    size_t iters = 0;
+    for (auto tup : zip_longest(pi, e)) {
+      EXPECT_EQ(tup, expected[iters]);
+      iters += 1;
+    }
+    EXPECT_EQ(iters, expected.size());
+  }
+
+  {
+    // Check right range longer than left.
+    const vector<tuple<Optional<StringRef>, Optional<unsigned>>> expected{
+        make_tuple(StringRef("2"), 3), make_tuple(StringRef("7"), 1),
+        make_tuple(StringRef("1"), 4), make_tuple(StringRef("8"), 1),
+        make_tuple(None, 5),           make_tuple(None, 9)};
+    size_t iters = 0;
+    for (auto tup : zip_longest(e, pi)) {
+      EXPECT_EQ(tup, expected[iters]);
+      iters += 1;
+    }
+    EXPECT_EQ(iters, expected.size());
+  }
+}
+
 TEST(ZipIteratorTest, Mutability) {
   using namespace std;
   const SmallVector<unsigned, 4> pi{3, 1, 4, 1, 5, 9};
diff --git a/unittests/ADT/OptionalTest.cpp b/unittests/ADT/OptionalTest.cpp
index 2e09c5340fa3f0b84e5d6ef0fb04491fd44125ec..20bc9da4d594f4d11fa51379949e8298e5db2b34 100644
--- a/unittests/ADT/OptionalTest.cpp
+++ b/unittests/ADT/OptionalTest.cpp
@@ -518,13 +518,5 @@ TEST_F(OptionalTest, OperatorGreaterEqual) {
   CheckRelation<GreaterEqual>(InequalityLhs, InequalityRhs, !IsLess);
 }
 
-#if __has_feature(is_trivially_copyable) && defined(_LIBCPP_VERSION)
-static_assert(std::is_trivially_copyable<Optional<int>>::value,
-              "Should be trivially copyable");
-static_assert(
-    !std::is_trivially_copyable<Optional<NonDefaultConstructible>>::value,
-    "Shouldn't be trivially copyable");
-#endif
-
 } // end anonymous namespace
 
diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp
index 71a922a92b64d65e831fb82317cbd2b1a51d791f..efe859f158f26122b1a30cf68f268e1390d7d754 100644
--- a/unittests/ADT/TripleTest.cpp
+++ b/unittests/ADT/TripleTest.cpp
@@ -93,6 +93,12 @@ TEST(TripleTest, ParsedIDs) {
   EXPECT_EQ(Triple::Contiki, T.getOS());
   EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment());
 
+  T = Triple("i386-pc-hurd-gnu");
+  EXPECT_EQ(Triple::x86, T.getArch());
+  EXPECT_EQ(Triple::PC, T.getVendor());
+  EXPECT_EQ(Triple::Hurd, T.getOS());
+  EXPECT_EQ(Triple::GNU, T.getEnvironment());
+
   T = Triple("x86_64-pc-linux-gnu");
   EXPECT_EQ(Triple::x86_64, T.getArch());
   EXPECT_EQ(Triple::PC, T.getVendor());
diff --git a/unittests/Analysis/ProfileSummaryInfoTest.cpp b/unittests/Analysis/ProfileSummaryInfoTest.cpp
index a37d1490fadd9579b36c470675cc907a48d80384..95d8426c3f213a1ec16aa576a66644f77d033cfa 100644
--- a/unittests/Analysis/ProfileSummaryInfoTest.cpp
+++ b/unittests/Analysis/ProfileSummaryInfoTest.cpp
@@ -118,8 +118,8 @@ TEST_F(ProfileSummaryInfoTest, TestNoProfile) {
   BasicBlock *BB1 = BB0.getTerminator()->getSuccessor(0);
 
   BlockFrequencyInfo BFI = buildBFI(*F);
-  EXPECT_FALSE(PSI.isHotBB(&BB0, &BFI));
-  EXPECT_FALSE(PSI.isColdBB(&BB0, &BFI));
+  EXPECT_FALSE(PSI.isHotBlock(&BB0, &BFI));
+  EXPECT_FALSE(PSI.isColdBlock(&BB0, &BFI));
 
   CallSite CS1(BB1->getFirstNonPHI());
   EXPECT_FALSE(PSI.isHotCallSite(CS1, &BFI));
@@ -156,10 +156,10 @@ TEST_F(ProfileSummaryInfoTest, InstrProf) {
   BasicBlock *BB3 = BB1->getSingleSuccessor();
 
   BlockFrequencyInfo BFI = buildBFI(*F);
-  EXPECT_TRUE(PSI.isHotBB(&BB0, &BFI));
-  EXPECT_TRUE(PSI.isHotBB(BB1, &BFI));
-  EXPECT_FALSE(PSI.isHotBB(BB2, &BFI));
-  EXPECT_TRUE(PSI.isHotBB(BB3, &BFI));
+  EXPECT_TRUE(PSI.isHotBlock(&BB0, &BFI));
+  EXPECT_TRUE(PSI.isHotBlock(BB1, &BFI));
+  EXPECT_FALSE(PSI.isHotBlock(BB2, &BFI));
+  EXPECT_TRUE(PSI.isHotBlock(BB3, &BFI));
 
   CallSite CS1(BB1->getFirstNonPHI());
   auto *CI2 = BB2->getFirstNonPHI();
@@ -188,10 +188,10 @@ TEST_F(ProfileSummaryInfoTest, SampleProf) {
   BasicBlock *BB3 = BB1->getSingleSuccessor();
 
   BlockFrequencyInfo BFI = buildBFI(*F);
-  EXPECT_TRUE(PSI.isHotBB(&BB0, &BFI));
-  EXPECT_TRUE(PSI.isHotBB(BB1, &BFI));
-  EXPECT_FALSE(PSI.isHotBB(BB2, &BFI));
-  EXPECT_TRUE(PSI.isHotBB(BB3, &BFI));
+  EXPECT_TRUE(PSI.isHotBlock(&BB0, &BFI));
+  EXPECT_TRUE(PSI.isHotBlock(BB1, &BFI));
+  EXPECT_FALSE(PSI.isHotBlock(BB2, &BFI));
+  EXPECT_TRUE(PSI.isHotBlock(BB3, &BFI));
 
   CallSite CS1(BB1->getFirstNonPHI());
   auto *CI2 = BB2->getFirstNonPHI();
diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index c4adde4abe345b537b4e93635e6ca3e3793787bd..5b36d63b3906304f42f4d11b33e41ae6fa32f5ff 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -22,7 +22,7 @@ using namespace llvm;
 
 namespace {
 
-class MatchSelectPatternTest : public testing::Test {
+class ValueTrackingTest : public testing::Test {
 protected:
   void parseAssembly(const char *Assembly) {
     SMDiagnostic Error;
@@ -51,6 +51,13 @@ protected:
       report_fatal_error("@test must have an instruction %A");
   }
 
+  LLVMContext Context;
+  std::unique_ptr<Module> M;
+  Instruction *A;
+};
+
+class MatchSelectPatternTest : public ValueTrackingTest {
+protected:
   void expectPattern(const SelectPatternResult &P) {
     Value *LHS, *RHS;
     Instruction::CastOps CastOp;
@@ -59,10 +66,16 @@ protected:
     EXPECT_EQ(P.NaNBehavior, R.NaNBehavior);
     EXPECT_EQ(P.Ordered, R.Ordered);
   }
+};
 
-  LLVMContext Context;
-  std::unique_ptr<Module> M;
-  Instruction *A, *B;
+class ComputeKnownBitsTest : public ValueTrackingTest {
+protected:
+  void expectKnownBits(uint64_t Zero, uint64_t One) {
+    auto Known = computeKnownBits(A, M->getDataLayout());
+    ASSERT_FALSE(Known.hasConflict());
+    EXPECT_EQ(Known.One.getZExtValue(), One);
+    EXPECT_EQ(Known.Zero.getZExtValue(), Zero);
+  }
 };
 
 }
@@ -497,117 +510,109 @@ TEST(ValueTracking, GuaranteedToTransferExecutionToSuccessor) {
   }
 }
 
-TEST(ValueTracking, ComputeNumSignBits_PR32045) {
-  StringRef Assembly = "define i32 @f(i32 %a) { "
-                       "  %val = ashr i32 %a, -1 "
-                       "  ret i32 %val "
-                       "} ";
-
-  LLVMContext Context;
-  SMDiagnostic Error;
-  auto M = parseAssemblyString(Assembly, Error, Context);
-  assert(M && "Bad assembly?");
-
-  auto *F = M->getFunction("f");
-  assert(F && "Bad assembly?");
-
-  auto *RVal =
-      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
-  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
+TEST_F(ValueTrackingTest, ComputeNumSignBits_PR32045) {
+  parseAssembly(
+      "define i32 @test(i32 %a) {\n"
+      "  %A = ashr i32 %a, -1\n"
+      "  ret i32 %A\n"
+      "}\n");
+  EXPECT_EQ(ComputeNumSignBits(A, M->getDataLayout()), 1u);
 }
 
 // No guarantees for canonical IR in this analysis, so this just bails out. 
-TEST(ValueTracking, ComputeNumSignBits_Shuffle) {
-  StringRef Assembly = "define <2 x i32> @f() { "
-                       "  %val = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 0> "
-                       "  ret <2 x i32> %val "
-                       "} ";
-
-  LLVMContext Context;
-  SMDiagnostic Error;
-  auto M = parseAssemblyString(Assembly, Error, Context);
-  assert(M && "Bad assembly?");
-
-  auto *F = M->getFunction("f");
-  assert(F && "Bad assembly?");
-
-  auto *RVal =
-      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
-  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
+TEST_F(ValueTrackingTest, ComputeNumSignBits_Shuffle) {
+  parseAssembly(
+      "define <2 x i32> @test() {\n"
+      "  %A = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 0>\n"
+      "  ret <2 x i32> %A\n"
+      "}\n");
+  EXPECT_EQ(ComputeNumSignBits(A, M->getDataLayout()), 1u);
 }
 
 // No guarantees for canonical IR in this analysis, so a shuffle element that
 // references an undef value means this can't return any extra information. 
-TEST(ValueTracking, ComputeNumSignBits_Shuffle2) {
-  StringRef Assembly = "define <2 x i32> @f(<2 x i1> %x) { "
-                       "  %sext = sext <2 x i1> %x to <2 x i32> "
-                       "  %val = shufflevector <2 x i32> %sext, <2 x i32> undef, <2 x i32> <i32 0, i32 2> "
-                       "  ret <2 x i32> %val "
-                       "} ";
-
-  LLVMContext Context;
-  SMDiagnostic Error;
-  auto M = parseAssemblyString(Assembly, Error, Context);
-  assert(M && "Bad assembly?");
-
-  auto *F = M->getFunction("f");
-  assert(F && "Bad assembly?");
-
-  auto *RVal =
-      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
-  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
+TEST_F(ValueTrackingTest, ComputeNumSignBits_Shuffle2) {
+  parseAssembly(
+      "define <2 x i32> @test(<2 x i1> %x) {\n"
+      "  %sext = sext <2 x i1> %x to <2 x i32>\n"
+      "  %A = shufflevector <2 x i32> %sext, <2 x i32> undef, <2 x i32> <i32 0, i32 2>\n"
+      "  ret <2 x i32> %A\n"
+      "}\n");
+  EXPECT_EQ(ComputeNumSignBits(A, M->getDataLayout()), 1u);
 }
 
-TEST(ValueTracking, ComputeKnownBits) {
-  StringRef Assembly = "define i32 @f(i32 %a, i32 %b) { "
-                       "  %ash = mul i32 %a, 8 "
-                       "  %aad = add i32 %ash, 7 "
-                       "  %aan = and i32 %aad, 4095 "
-                       "  %bsh = shl i32 %b, 4 "
-                       "  %bad = or i32 %bsh, 6 "
-                       "  %ban = and i32 %bad, 4095 "
-                       "  %mul = mul i32 %aan, %ban "
-                       "  ret i32 %mul "
-                       "} ";
-
-  LLVMContext Context;
-  SMDiagnostic Error;
-  auto M = parseAssemblyString(Assembly, Error, Context);
-  assert(M && "Bad assembly?");
-
-  auto *F = M->getFunction("f");
-  assert(F && "Bad assembly?");
-
-  auto *RVal =
-      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
-  auto Known = computeKnownBits(RVal, M->getDataLayout());
-  ASSERT_FALSE(Known.hasConflict());
-  EXPECT_EQ(Known.One.getZExtValue(), 10u);
-  EXPECT_EQ(Known.Zero.getZExtValue(), 4278190085u);
+TEST_F(ComputeKnownBitsTest, ComputeKnownBits) {
+  parseAssembly(
+      "define i32 @test(i32 %a, i32 %b) {\n"
+      "  %ash = mul i32 %a, 8\n"
+      "  %aad = add i32 %ash, 7\n"
+      "  %aan = and i32 %aad, 4095\n"
+      "  %bsh = shl i32 %b, 4\n"
+      "  %bad = or i32 %bsh, 6\n"
+      "  %ban = and i32 %bad, 4095\n"
+      "  %A = mul i32 %aan, %ban\n"
+      "  ret i32 %A\n"
+      "}\n");
+  expectKnownBits(/*zero*/ 4278190085u, /*one*/ 10u);
 }
 
-TEST(ValueTracking, ComputeKnownMulBits) {
-  StringRef Assembly = "define i32 @f(i32 %a, i32 %b) { "
-                       "  %aa = shl i32 %a, 5 "
-                       "  %bb = shl i32 %b, 5 "
-                       "  %aaa = or i32 %aa, 24 "
-                       "  %bbb = or i32 %bb, 28 "
-                       "  %mul = mul i32 %aaa, %bbb "
-                       "  ret i32 %mul "
-                       "} ";
-
-  LLVMContext Context;
-  SMDiagnostic Error;
-  auto M = parseAssemblyString(Assembly, Error, Context);
-  assert(M && "Bad assembly?");
-
-  auto *F = M->getFunction("f");
-  assert(F && "Bad assembly?");
-
-  auto *RVal =
-      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
-  auto Known = computeKnownBits(RVal, M->getDataLayout());
-  ASSERT_FALSE(Known.hasConflict());
-  EXPECT_EQ(Known.One.getZExtValue(), 32u);
-  EXPECT_EQ(Known.Zero.getZExtValue(), 95u);
+TEST_F(ComputeKnownBitsTest, ComputeKnownMulBits) {
+  parseAssembly(
+      "define i32 @test(i32 %a, i32 %b) {\n"
+      "  %aa = shl i32 %a, 5\n"
+      "  %bb = shl i32 %b, 5\n"
+      "  %aaa = or i32 %aa, 24\n"
+      "  %bbb = or i32 %bb, 28\n"
+      "  %A = mul i32 %aaa, %bbb\n"
+      "  ret i32 %A\n"
+      "}\n");
+  expectKnownBits(/*zero*/ 95u, /*one*/ 32u);
+}
+
+TEST_F(ComputeKnownBitsTest, ComputeKnownFshl) {
+  // fshl(....1111....0000, 00..1111........, 6)
+  // = 11....000000..11
+  parseAssembly(
+      "define i16 @test(i16 %a, i16 %b) {\n"
+      "  %aa = shl i16 %a, 4\n"
+      "  %bb = lshr i16 %b, 2\n"
+      "  %aaa = or i16 %aa, 3840\n"
+      "  %bbb = or i16 %bb, 3840\n"
+      "  %A = call i16 @llvm.fshl.i16(i16 %aaa, i16 %bbb, i16 6)\n"
+      "  ret i16 %A\n"
+      "}\n"
+      "declare i16 @llvm.fshl.i16(i16, i16, i16)\n");
+  expectKnownBits(/*zero*/ 1008u, /*one*/ 49155u);
+}
+
+TEST_F(ComputeKnownBitsTest, ComputeKnownFshr) {
+  // fshr(....1111....0000, 00..1111........, 26)
+  // = 11....000000..11
+  parseAssembly(
+      "define i16 @test(i16 %a, i16 %b) {\n"
+      "  %aa = shl i16 %a, 4\n"
+      "  %bb = lshr i16 %b, 2\n"
+      "  %aaa = or i16 %aa, 3840\n"
+      "  %bbb = or i16 %bb, 3840\n"
+      "  %A = call i16 @llvm.fshr.i16(i16 %aaa, i16 %bbb, i16 26)\n"
+      "  ret i16 %A\n"
+      "}\n"
+      "declare i16 @llvm.fshr.i16(i16, i16, i16)\n");
+  expectKnownBits(/*zero*/ 1008u, /*one*/ 49155u);
+}
+
+TEST_F(ComputeKnownBitsTest, ComputeKnownFshlZero) {
+  // fshl(....1111....0000, 00..1111........, 0)
+  // = ....1111....0000
+  parseAssembly(
+      "define i16 @test(i16 %a, i16 %b) {\n"
+      "  %aa = shl i16 %a, 4\n"
+      "  %bb = lshr i16 %b, 2\n"
+      "  %aaa = or i16 %aa, 3840\n"
+      "  %bbb = or i16 %bb, 3840\n"
+      "  %A = call i16 @llvm.fshl.i16(i16 %aaa, i16 %bbb, i16 0)\n"
+      "  ret i16 %A\n"
+      "}\n"
+      "declare i16 @llvm.fshl.i16(i16, i16, i16)\n");
+  expectKnownBits(/*zero*/ 15u, /*one*/ 3840u);
 }
diff --git a/unittests/BinaryFormat/CMakeLists.txt b/unittests/BinaryFormat/CMakeLists.txt
index c0aac4db9c19ce15df73324030db300a5636c5d8..82d76ecf64a886df030d4e009225ef27a156aa5d 100644
--- a/unittests/BinaryFormat/CMakeLists.txt
+++ b/unittests/BinaryFormat/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_unittest(BinaryFormatTests
   DwarfTest.cpp
   MachOTest.cpp
   MsgPackReaderTest.cpp
+  MsgPackTypesTest.cpp
   MsgPackWriterTest.cpp
   TestFileMagic.cpp
   )
diff --git a/unittests/BinaryFormat/MsgPackTypesTest.cpp b/unittests/BinaryFormat/MsgPackTypesTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..87bb6fa7e4da81f418e2063101206d688bac3594
--- /dev/null
+++ b/unittests/BinaryFormat/MsgPackTypesTest.cpp
@@ -0,0 +1,188 @@
+//===- MsgPackTypesTest.cpp -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/MsgPackTypes.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace msgpack;
+
+TEST(MsgPackTypes, TestReadInt) {
+  Reader MPReader(StringRef("\xd0\x00", 2));
+  auto OptNodeOrErr = Node::read(MPReader);
+  ASSERT_TRUE(static_cast<bool>(OptNodeOrErr));
+  ASSERT_TRUE(*OptNodeOrErr);
+  auto *S = dyn_cast<ScalarNode>((*OptNodeOrErr)->get());
+  ASSERT_TRUE(S);
+  ASSERT_EQ(S->getScalarKind(), ScalarNode::SK_Int);
+  ASSERT_EQ(S->getInt(), 0);
+}
+
+TEST(MsgPackTypes, TestReadArray) {
+  Reader MPReader(StringRef("\x92\xd0\x01\xc0"));
+  auto OptNodeOrErr = Node::read(MPReader);
+  ASSERT_TRUE(static_cast<bool>(OptNodeOrErr));
+  ASSERT_TRUE(*OptNodeOrErr);
+  auto *A = dyn_cast<ArrayNode>((*OptNodeOrErr)->get());
+  ASSERT_TRUE(A);
+  ASSERT_EQ(A->size(), 2u);
+  auto *SI = dyn_cast<ScalarNode>((*A)[0].get());
+  ASSERT_TRUE(SI);
+  ASSERT_EQ(SI->getScalarKind(), ScalarNode::SK_Int);
+  ASSERT_EQ(SI->getInt(), 1);
+  auto *SN = dyn_cast<ScalarNode>((*A)[1].get());
+  ASSERT_TRUE(SN);
+  ASSERT_EQ(SN->getScalarKind(), ScalarNode::SK_Nil);
+}
+
+TEST(MsgPackTypes, TestReadMap) {
+  Reader MPReader(StringRef("\x82\xa3"
+                            "foo"
+                            "\xd0\x01\xa3"
+                            "bar"
+                            "\xd0\x02"));
+  auto OptNodeOrErr = Node::read(MPReader);
+  ASSERT_TRUE(static_cast<bool>(OptNodeOrErr));
+  ASSERT_TRUE(*OptNodeOrErr);
+  auto *A = dyn_cast<MapNode>((*OptNodeOrErr)->get());
+  ASSERT_TRUE(A);
+  ASSERT_EQ(A->size(), 2u);
+  auto *FooS = dyn_cast<ScalarNode>((*A)["foo"].get());
+  ASSERT_TRUE(FooS);
+  ASSERT_EQ(FooS->getScalarKind(), ScalarNode::SK_Int);
+  ASSERT_EQ(FooS->getInt(), 1);
+  auto *BarS = dyn_cast<ScalarNode>((*A)["bar"].get());
+  ASSERT_TRUE(BarS);
+  ASSERT_EQ(BarS->getScalarKind(), ScalarNode::SK_Int);
+  ASSERT_EQ(BarS->getInt(), 2);
+}
+
+TEST(MsgPackTypes, TestWriteInt) {
+  std::string Buffer;
+  raw_string_ostream OStream(Buffer);
+  Writer MPWriter(OStream);
+  ScalarNode I(int64_t(1));
+  I.write(MPWriter);
+  ASSERT_EQ(OStream.str(), "\x01");
+}
+
+TEST(MsgPackTypes, TestWriteArray) {
+  std::string Buffer;
+  raw_string_ostream OStream(Buffer);
+  Writer MPWriter(OStream);
+  ArrayNode A;
+  A.push_back(std::make_shared<ScalarNode>(int64_t(1)));
+  A.push_back(std::make_shared<ScalarNode>());
+  A.write(MPWriter);
+  ASSERT_EQ(OStream.str(), "\x92\x01\xc0");
+}
+
+TEST(MsgPackTypes, TestWriteMap) {
+  std::string Buffer;
+  raw_string_ostream OStream(Buffer);
+  Writer MPWriter(OStream);
+  MapNode M;
+  M["foo"] = std::make_shared<ScalarNode>(int64_t(1));
+  M["bar"] = std::make_shared<ScalarNode>(int64_t(2));
+  M.write(MPWriter);
+  ASSERT_EQ(OStream.str(), "\x82\xa3"
+                           "foo"
+                           "\x01\xa3"
+                           "bar"
+                           "\x02");
+}
+
+TEST(MsgPackTypes, TestOutputYAMLArray) {
+  std::string Buffer;
+  raw_string_ostream OStream(Buffer);
+  yaml::Output yout(OStream);
+  ArrayNode A;
+  A.push_back(std::make_shared<ScalarNode>(int64_t(1)));
+  A.push_back(std::make_shared<ScalarNode>(int64_t(2)));
+  yout << A;
+  ASSERT_EQ(OStream.str(), "---\n- !int 1\n- !int 2\n...\n");
+}
+
+TEST(MsgPackTypes, TestInputYAMLArray) {
+  NodePtr RootNode;
+  yaml::Input yin("---\n- !int 1\n- !str 2\n...\n");
+  yin >> RootNode;
+  auto *A = dyn_cast<ArrayNode>(RootNode.get());
+  ASSERT_TRUE(A);
+  ASSERT_EQ(A->size(), 2u);
+  auto *SI = dyn_cast<ScalarNode>((*A)[0].get());
+  ASSERT_TRUE(SI);
+  ASSERT_EQ(SI->getScalarKind(), ScalarNode::SK_UInt);
+  ASSERT_EQ(SI->getUInt(), 1u);
+  auto *SS = dyn_cast<ScalarNode>((*A)[1].get());
+  ASSERT_TRUE(SS);
+  ASSERT_EQ(SS->getScalarKind(), ScalarNode::SK_String);
+  ASSERT_EQ(SS->getString(), "2");
+}
+
+TEST(MsgPackTypes, TestOutputYAMLMap) {
+  std::string Buffer;
+  raw_string_ostream OStream(Buffer);
+  yaml::Output yout(OStream);
+  MapNode M;
+  M["foo"] = std::make_shared<ScalarNode>(int64_t(1));
+  M["bar"] = std::make_shared<ScalarNode>(uint64_t(2));
+  auto N = std::make_shared<MapNode>();
+  (*N)["baz"] = std::make_shared<ScalarNode>(true);
+  M["qux"] = std::move(N);
+  yout << M;
+  ASSERT_EQ(OStream.str(), "---\nfoo:             !int 1\nbar:             "
+                           "!int 2\nqux:             \n  baz:             "
+                           "!bool true\n...\n");
+}
+
+TEST(MsgPackTypes, TestInputYAMLMap) {
+  NodePtr RootNode;
+  yaml::Input yin("---\nfoo: !int 1\nbaz: !str 2\n...\n");
+  yin >> RootNode;
+  auto *M = dyn_cast<MapNode>(RootNode.get());
+  ASSERT_TRUE(M);
+  ASSERT_EQ(M->size(), 2u);
+  auto *SI = dyn_cast<ScalarNode>((*M)["foo"].get());
+  ASSERT_TRUE(SI);
+  ASSERT_EQ(SI->getScalarKind(), ScalarNode::SK_UInt);
+  ASSERT_EQ(SI->getUInt(), 1u);
+  auto *SS = dyn_cast<ScalarNode>((*M)["baz"].get());
+  ASSERT_TRUE(SS);
+  ASSERT_EQ(SS->getScalarKind(), ScalarNode::SK_String);
+  ASSERT_EQ(SS->getString(), "2");
+}
+
+// Test that the document is parsed into a tree of shared_ptr where each node
+// can have multiple owners.
+TEST(MsgPackTypes, TestInputShared) {
+  yaml::Input yin("---\nfoo:\n  bar: !int 1\n...\n");
+  NodePtr InnerMap;
+  NodePtr IntNode;
+  {
+    {
+      {
+        NodePtr RootNode;
+        yin >> RootNode;
+        auto *M = dyn_cast<MapNode>(RootNode.get());
+        ASSERT_TRUE(M);
+        ASSERT_EQ(M->size(), 1u);
+        InnerMap = (*M)["foo"];
+      }
+      auto *N = dyn_cast<MapNode>(InnerMap.get());
+      ASSERT_TRUE(N);
+      ASSERT_EQ(N->size(), 1u);
+      IntNode = (*N)["bar"];
+    }
+    auto *S = dyn_cast<ScalarNode>(IntNode.get());
+    ASSERT_TRUE(S);
+    ASSERT_EQ(S->getScalarKind(), ScalarNode::SK_UInt);
+    ASSERT_EQ(S->getUInt(), 1u);
+  }
+}
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 5dba2de4a88b5e8b5f9159f13a7f34b3c94f07a3..459e89d707f5361b5894d998a832c93278ed1b83 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -30,6 +30,7 @@ add_subdirectory(OptRemarks)
 add_subdirectory(Passes)
 add_subdirectory(ProfileData)
 add_subdirectory(Support)
+add_subdirectory(TextAPI)
 add_subdirectory(Target)
 add_subdirectory(Transforms)
 add_subdirectory(XRay)
diff --git a/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index 0c184d371875c5260c863a889c9f25fd1ddb03c0..03bfdc2b5b29ef78dd2295f2837437b1d29bb6b0 100644
--- a/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -88,7 +88,7 @@ TEST_F(AArch64SelectionDAGTest, computeKnownBits_ZERO_EXTEND_VECTOR_INREG) {
   auto OutVecVT = EVT::getVectorVT(Context, Int16VT, 2);
   auto InVec = DAG->getConstant(0, Loc, InVecVT);
   auto Op = DAG->getNode(ISD::ZERO_EXTEND_VECTOR_INREG, Loc, OutVecVT, InVec);
-  auto DemandedElts = APInt(4, 15);
+  auto DemandedElts = APInt(2, 3);
   KnownBits Known;
   DAG->computeKnownBits(Op, Known, DemandedElts);
   EXPECT_TRUE(Known.isZero());
@@ -120,7 +120,7 @@ TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_SIGN_EXTEND_VECTOR_INREG) {
   auto OutVecVT = EVT::getVectorVT(Context, Int16VT, 2);
   auto InVec = DAG->getConstant(1, Loc, InVecVT);
   auto Op = DAG->getNode(ISD::SIGN_EXTEND_VECTOR_INREG, Loc, OutVecVT, InVec);
-  auto DemandedElts = APInt(4, 15);
+  auto DemandedElts = APInt(2, 3);
   EXPECT_EQ(DAG->ComputeNumSignBits(Op, DemandedElts), 15u);
 }
 
diff --git a/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
index 0cc989be3c091ce477094b127f33a76669c661f2..ee84aeff994f09463025c9db838294536e28dbf3 100644
--- a/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
+++ b/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
@@ -11,6 +11,14 @@
 
 namespace {
 
+class DummyGISelObserver : public GISelChangeObserver {
+public:
+  void changingInstr(const MachineInstr &MI) override {}
+  void changedInstr(const MachineInstr &MI) override {}
+  void createdInstr(const MachineInstr &MI) override {}
+  void erasingInstr(const MachineInstr &MI) override {}
+};
+
 // Test CTTZ expansion when CTTZ_ZERO_UNDEF is legal or custom,
 // in which case it becomes CTTZ_ZERO_UNDEF with select.
 TEST_F(LegalizerHelperTest, LowerBitCountingCTTZ0) {
@@ -21,9 +29,11 @@ TEST_F(LegalizerHelperTest, LowerBitCountingCTTZ0) {
   DefineLegalizerInfo(
       A, { getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).legalFor({s64}); });
   // Build Instr
-  auto MIBCTTZ = B.buildInstr(TargetOpcode::G_CTTZ, LLT::scalar(64), Copies[0]);
+  auto MIBCTTZ =
+      B.buildInstr(TargetOpcode::G_CTTZ, {LLT::scalar(64)}, {Copies[0]});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   // Perform Legalization
   ASSERT_TRUE(Helper.lower(*MIBCTTZ, 0, LLT::scalar(64)) ==
               LegalizerHelper::LegalizeResult::Legalized);
@@ -49,9 +59,11 @@ TEST_F(LegalizerHelperTest, LowerBitCountingCTTZ1) {
   DefineLegalizerInfo(A,
                       { getActionDefinitionsBuilder(G_CTLZ).legalFor({s64}); });
   // Build Instr
-  auto MIBCTTZ = B.buildInstr(TargetOpcode::G_CTTZ, LLT::scalar(64), Copies[0]);
+  auto MIBCTTZ =
+      B.buildInstr(TargetOpcode::G_CTTZ, {LLT::scalar(64)}, {Copies[0]});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   // Perform Legalization
   ASSERT_TRUE(Helper.lower(*MIBCTTZ, 0, LLT::scalar(64)) ==
               LegalizerHelper::LegalizeResult::Legalized);
@@ -79,9 +91,11 @@ TEST_F(LegalizerHelperTest, LowerBitCountingCTTZ2) {
   DefineLegalizerInfo(
       A, { getActionDefinitionsBuilder(G_CTPOP).legalFor({s64}); });
   // Build
-  auto MIBCTTZ = B.buildInstr(TargetOpcode::G_CTTZ, LLT::scalar(64), Copies[0]);
+  auto MIBCTTZ =
+      B.buildInstr(TargetOpcode::G_CTTZ, {LLT::scalar(64)}, {Copies[0]});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.lower(*MIBCTTZ, 0, LLT::scalar(64)) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -106,10 +120,11 @@ TEST_F(LegalizerHelperTest, LowerBitCountingCTTZ3) {
   DefineLegalizerInfo(A,
                       { getActionDefinitionsBuilder(G_CTTZ).legalFor({s64}); });
   // Build
-  auto MIBCTTZ =
-      B.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, LLT::scalar(64), Copies[0]);
+  auto MIBCTTZ = B.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF,
+                              {LLT::scalar(64)}, {Copies[0]});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.lower(*MIBCTTZ, 0, LLT::scalar(64)) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -130,9 +145,11 @@ TEST_F(LegalizerHelperTest, LowerBitCountingCTLZ0) {
   DefineLegalizerInfo(
       A, { getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).legalFor({s64}); });
   // Build
-  auto MIBCTLZ = B.buildInstr(TargetOpcode::G_CTLZ, LLT::scalar(64), Copies[0]);
+  auto MIBCTLZ =
+      B.buildInstr(TargetOpcode::G_CTLZ, {LLT::scalar(64)}, {Copies[0]});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.lower(*MIBCTLZ, 0, LLT::scalar(64)) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -148,6 +165,35 @@ TEST_F(LegalizerHelperTest, LowerBitCountingCTLZ0) {
   ASSERT_TRUE(CheckMachineFunction(*MF, CheckStr));
 }
 
+// CTLZ expansion in terms of CTLZ_ZERO_UNDEF if the latter is a libcall
+TEST_F(LegalizerHelperTest, LowerBitCountingCTLZLibcall) {
+  if (!TM)
+    return;
+
+  // Declare your legalization info
+  DefineLegalizerInfo(
+      A, { getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).libcallFor({s64}); });
+  // Build
+  auto MIBCTLZ =
+      B.buildInstr(TargetOpcode::G_CTLZ, {LLT::scalar(64)}, {Copies[0]});
+  AInfo Info(MF->getSubtarget());
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
+  ASSERT_TRUE(Helper.lower(*MIBCTLZ, 0, LLT::scalar(64)) ==
+              LegalizerHelper::LegalizeResult::Legalized);
+
+  auto CheckStr = R"(
+  CHECK: [[CZU:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_UNDEF %0
+  CHECK: [[ZERO:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  CHECK: [[THIRTY2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+  CHECK: [[CMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), %0:_(s64), [[ZERO]]
+  CHECK: [[SEL:%[0-9]+]]:_(s64) = G_SELECT [[CMP]]:_(s1), [[THIRTY2]]:_, [[CZU]]
+  )";
+
+  // Check
+  ASSERT_TRUE(CheckMachineFunction(*MF, CheckStr));
+}
+
 // CTLZ expansion
 TEST_F(LegalizerHelperTest, LowerBitCountingCTLZ1) {
   if (!TM)
@@ -160,9 +206,10 @@ TEST_F(LegalizerHelperTest, LowerBitCountingCTLZ1) {
   // Trunc it to s8.
   LLT s8{LLT::scalar(8)};
   auto MIBTrunc = B.buildTrunc(s8, Copies[0]);
-  auto MIBCTLZ = B.buildInstr(TargetOpcode::G_CTLZ, s8, MIBTrunc);
+  auto MIBCTLZ = B.buildInstr(TargetOpcode::G_CTLZ, {s8}, {MIBTrunc});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.lower(*MIBCTLZ, 0, s8) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -199,9 +246,10 @@ TEST_F(LegalizerHelperTest, WidenBitCountingCTLZ) {
   LLT s8{LLT::scalar(8)};
   LLT s16{LLT::scalar(16)};
   auto MIBTrunc = B.buildTrunc(s8, Copies[0]);
-  auto MIBCTLZ = B.buildInstr(TargetOpcode::G_CTLZ, s8, MIBTrunc);
+  auto MIBCTLZ = B.buildInstr(TargetOpcode::G_CTLZ, {s8}, {MIBTrunc});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.widenScalar(*MIBCTLZ, 0, s16) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -231,9 +279,11 @@ TEST_F(LegalizerHelperTest, WidenBitCountingCTLZZeroUndef) {
   LLT s8{LLT::scalar(8)};
   LLT s16{LLT::scalar(16)};
   auto MIBTrunc = B.buildTrunc(s8, Copies[0]);
-  auto MIBCTLZ_ZU = B.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, s8, MIBTrunc);
+  auto MIBCTLZ_ZU =
+      B.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, {s8}, {MIBTrunc});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.widenScalar(*MIBCTLZ_ZU, 0, s16) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -263,9 +313,10 @@ TEST_F(LegalizerHelperTest, WidenBitCountingCTPOP) {
   LLT s8{LLT::scalar(8)};
   LLT s16{LLT::scalar(16)};
   auto MIBTrunc = B.buildTrunc(s8, Copies[0]);
-  auto MIBCTPOP = B.buildInstr(TargetOpcode::G_CTPOP, s8, MIBTrunc);
+  auto MIBCTPOP = B.buildInstr(TargetOpcode::G_CTPOP, {s8}, {MIBTrunc});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.widenScalar(*MIBCTPOP, 0, s16) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -294,9 +345,10 @@ TEST_F(LegalizerHelperTest, WidenBitCountingCTTZ_ZERO_UNDEF) {
   LLT s16{LLT::scalar(16)};
   auto MIBTrunc = B.buildTrunc(s8, Copies[0]);
   auto MIBCTTZ_ZERO_UNDEF =
-      B.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, s8, MIBTrunc);
+      B.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, {s8}, {MIBTrunc});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.widenScalar(*MIBCTTZ_ZERO_UNDEF, 0, s16) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -324,9 +376,10 @@ TEST_F(LegalizerHelperTest, WidenBitCountingCTTZ) {
   LLT s8{LLT::scalar(8)};
   LLT s16{LLT::scalar(16)};
   auto MIBTrunc = B.buildTrunc(s8, Copies[0]);
-  auto MIBCTTZ = B.buildInstr(TargetOpcode::G_CTTZ, s8, MIBTrunc);
+  auto MIBCTTZ = B.buildInstr(TargetOpcode::G_CTTZ, {s8}, {MIBTrunc});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.widenScalar(*MIBCTTZ, 0, s16) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -356,12 +409,11 @@ TEST_F(LegalizerHelperTest, WidenUADDO) {
   LLT s16{LLT::scalar(16)};
   auto MIBTrunc = B.buildTrunc(s8, Copies[0]);
   unsigned CarryReg = MRI->createGenericVirtualRegister(LLT::scalar(1));
-  auto MIBUAddO = B.buildInstr(TargetOpcode::G_UADDO, s8)
-                      .addDef(CarryReg)
-                      .addUse(MIBTrunc->getOperand(0).getReg())
-                      .addUse(MIBTrunc->getOperand(0).getReg());
+  auto MIBUAddO =
+      B.buildInstr(TargetOpcode::G_UADDO, {s8, CarryReg}, {MIBTrunc, MIBTrunc});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.widenScalar(*MIBUAddO, 0, s16) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
@@ -394,12 +446,11 @@ TEST_F(LegalizerHelperTest, WidenUSUBO) {
   LLT s16{LLT::scalar(16)};
   auto MIBTrunc = B.buildTrunc(s8, Copies[0]);
   unsigned CarryReg = MRI->createGenericVirtualRegister(LLT::scalar(1));
-  auto MIBUSUBO = B.buildInstr(TargetOpcode::G_USUBO, s8)
-                      .addDef(CarryReg)
-                      .addUse(MIBTrunc->getOperand(0).getReg())
-                      .addUse(MIBTrunc->getOperand(0).getReg());
+  auto MIBUSUBO =
+      B.buildInstr(TargetOpcode::G_USUBO, {s8, CarryReg}, {MIBTrunc, MIBTrunc});
   AInfo Info(MF->getSubtarget());
-  LegalizerHelper Helper(*MF, Info);
+  DummyGISelObserver Observer;
+  LegalizerHelper Helper(*MF, Info, Observer);
   ASSERT_TRUE(Helper.widenScalar(*MIBUSUBO, 0, s16) ==
               LegalizerHelper::LegalizeResult::Legalized);
 
diff --git a/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h b/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
index 28af811e1f1701740196dbcec4805b3886399fa0..0a171a76b4787ca98cd20dd5a27d58cb973f71fa 100644
--- a/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
+++ b/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
diff --git a/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp b/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
index 1f3a690ad0155a7d8380f7132ba9d084a2cda439..466a2dd6b06c2794b3ad93f8c26186da7d60cfe8 100644
--- a/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
+++ b/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
@@ -204,8 +204,8 @@ TEST(PatternMatchInstr, MatchBinaryOp) {
                    m_GSub(m_ICst(Cst), m_Reg(Src0)));
   ASSERT_FALSE(match);
 
-  auto MIBFMul = B.buildInstr(TargetOpcode::G_FMUL, s64, Copies[0],
-                              B.buildConstant(s64, 42));
+  auto MIBFMul = B.buildInstr(TargetOpcode::G_FMUL, {s64},
+                              {Copies[0], B.buildConstant(s64, 42)});
   // Match and test commutativity for FMUL.
   match = mi_match(MIBFMul->getOperand(0).getReg(), MRI,
                    m_GFMul(m_ICst(Cst), m_Reg(Src0)));
@@ -214,8 +214,8 @@ TEST(PatternMatchInstr, MatchBinaryOp) {
   ASSERT_EQ(Src0, Copies[0]);
 
   // FSUB
-  auto MIBFSub = B.buildInstr(TargetOpcode::G_FSUB, s64, Copies[0],
-                              B.buildConstant(s64, 42));
+  auto MIBFSub = B.buildInstr(TargetOpcode::G_FSUB, {s64},
+                              {Copies[0], B.buildConstant(s64, 42)});
   match = mi_match(MIBFSub->getOperand(0).getReg(), MRI,
                    m_GFSub(m_Reg(Src0), m_Reg()));
   ASSERT_TRUE(match);
@@ -249,8 +249,8 @@ TEST(PatternMatchInstr, MatchBinaryOp) {
   ASSERT_TRUE(match);
   ASSERT_EQ(Cst, 1);
   auto MIBCAdd1 =
-      CFB.buildInstr(TargetOpcode::G_ADD, s32, CFB.buildConstant(s32, 0),
-                     CFB.buildConstant(s32, 1));
+      CFB.buildInstr(TargetOpcode::G_ADD, {s32},
+                     {CFB.buildConstant(s32, 0), CFB.buildConstant(s32, 1)});
   // This should be a constant now.
   match = mi_match(MIBCAdd1->getOperand(0).getReg(), MRI, m_ICst(Cst));
   ASSERT_TRUE(match);
@@ -261,8 +261,8 @@ TEST(PatternMatchInstr, MatchBinaryOp) {
   ConstantFoldingMIRBuilder CFB1(*MF);
   CFB1.setInsertPt(*EntryMBB, EntryMBB->end());
   auto MIBCSub =
-      CFB1.buildInstr(TargetOpcode::G_SUB, s32, CFB1.buildConstant(s32, 1),
-                      CFB1.buildConstant(s32, 1));
+      CFB1.buildInstr(TargetOpcode::G_SUB, {s32},
+                      {CFB1.buildConstant(s32, 1), CFB1.buildConstant(s32, 1)});
   // This should be a constant now.
   match = mi_match(MIBCSub->getOperand(0).getReg(), MRI, m_ICst(Cst));
   ASSERT_TRUE(match);
@@ -289,12 +289,12 @@ TEST(PatternMatchInstr, MatchFPUnaryOp) {
   auto Copy0s32 = B.buildFPTrunc(s32, Copies[0]);
 
   // Match G_FABS.
-  auto MIBFabs = B.buildInstr(TargetOpcode::G_FABS, s32, Copy0s32);
+  auto MIBFabs = B.buildInstr(TargetOpcode::G_FABS, {s32}, {Copy0s32});
   bool match = mi_match(MIBFabs->getOperand(0).getReg(), MRI, m_GFabs(m_Reg()));
   ASSERT_TRUE(match);
 
   unsigned Src;
-  auto MIBFNeg = B.buildInstr(TargetOpcode::G_FNEG, s32, Copy0s32);
+  auto MIBFNeg = B.buildInstr(TargetOpcode::G_FNEG, {s32}, {Copy0s32});
   match = mi_match(MIBFNeg->getOperand(0).getReg(), MRI, m_GFNeg(m_Reg(Src)));
   ASSERT_TRUE(match);
   ASSERT_EQ(Src, Copy0s32->getOperand(0).getReg());
diff --git a/unittests/CodeGen/MachineInstrTest.cpp b/unittests/CodeGen/MachineInstrTest.cpp
index 6883d3ed63e9c4b12ec1097c32149d1c54029e6f..73d3d25e8154df10ca1c92b7d42fe3b903a68785 100644
--- a/unittests/CodeGen/MachineInstrTest.cpp
+++ b/unittests/CodeGen/MachineInstrTest.cpp
@@ -257,8 +257,8 @@ TEST(MachineInstrPrintingTest, DebugLocPrinting) {
   LLVMContext Ctx;
   DIFile *DIF = DIFile::getDistinct(Ctx, "filename", "");
   DISubprogram *DIS = DISubprogram::getDistinct(
-      Ctx, nullptr, "", "", DIF, 0, nullptr, false, false, 0, nullptr, 0, 0, 0,
-      DINode::FlagZero, false, nullptr);
+      Ctx, nullptr, "", "", DIF, 0, nullptr, 0, nullptr, 0, 0, DINode::FlagZero,
+      DISubprogram::SPFlagZero, nullptr);
   DILocation *DIL = DILocation::get(Ctx, 1, 5, DIS);
   DebugLoc DL(DIL);
   MachineInstr *MI = MF->CreateMachineInstr(MCID, DL);
diff --git a/unittests/Demangle/CMakeLists.txt b/unittests/Demangle/CMakeLists.txt
index 2f9d71a37e9f2e1817c8243261875d3d9881c52d..954f3d05eacb3a2d329e8000d1d7c3ac0ede1cb6 100644
--- a/unittests/Demangle/CMakeLists.txt
+++ b/unittests/Demangle/CMakeLists.txt
@@ -6,5 +6,4 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_unittest(DemangleTests
   ItaniumDemangleTest.cpp
   PartialDemangleTest.cpp
-  FindTypesInMangledNameTest.cpp
 )
diff --git a/unittests/Demangle/FindTypesInMangledNameTest.cpp b/unittests/Demangle/FindTypesInMangledNameTest.cpp
deleted file mode 100644
index a2f77dc6d9603fb3a99050c8e396a964e1924aa6..0000000000000000000000000000000000000000
--- a/unittests/Demangle/FindTypesInMangledNameTest.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//===------------------ FindTypesInMangledNameTest.cpp --------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cstdlib>
-#include <vector>
-#include "llvm/Demangle/Demangle.h"
-#include "gtest/gtest.h"
-
-TEST(FindTypesInMangledNameTest, Test) {
-  std::vector<const char *> Types;
-  const char *Mangled = "_Z1fiv";
-  EXPECT_FALSE(llvm::itaniumFindTypesInMangledName(
-      Mangled, static_cast<void *>(&Types), [](void *Ty, const char *P) {
-        static_cast<std::vector<const char *> *>(Ty)->push_back(P);
-      }));
-  EXPECT_EQ(Types.size(), size_t(2));
-  EXPECT_EQ(Mangled + 4, Types.front());
-  EXPECT_EQ(Mangled + 5, Types.back());
-
-  EXPECT_TRUE(llvm::itaniumFindTypesInMangledName(
-      "Not a mangled name!", nullptr, [](void *, const char *) {}));
-
-  int TC = 0;
-  EXPECT_FALSE(llvm::itaniumFindTypesInMangledName(
-      "_Z1fPRic", static_cast<void *>(&TC),
-      [](void *Ctx, const char *) { ++*static_cast<int *>(Ctx); }));
-  EXPECT_EQ(TC, 4); // pointer, reference, int, char.
-}
diff --git a/unittests/ExecutionEngine/Orc/RemoteObjectLayerTest.cpp b/unittests/ExecutionEngine/Orc/RemoteObjectLayerTest.cpp
index ae408a06f84d71273c58acf060490e95c8a00a0d..09224c289b4d1f421fca6614a93e8f7d92ddd017 100644
--- a/unittests/ExecutionEngine/Orc/RemoteObjectLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RemoteObjectLayerTest.cpp
@@ -112,10 +112,9 @@ TEST(RemoteObjectLayer, AddObject) {
 
   auto Channels = createPairedQueueChannels();
 
-  auto ReportError =
-    [](Error Err) {
-      logAllUnhandledErrors(std::move(Err), llvm::errs(), "");
-    };
+  auto ReportError = [](Error Err) {
+    logAllUnhandledErrors(std::move(Err), llvm::errs());
+  };
 
   // Copy the bytes out of the test object: the copy will be used to verify
   // that the original is correctly transmitted over RPC to the mock layer.
@@ -225,10 +224,9 @@ TEST(RemoteObjectLayer, RemoveObject) {
 
   auto Channels = createPairedQueueChannels();
 
-  auto ReportError =
-    [](Error Err) {
-      logAllUnhandledErrors(std::move(Err), llvm::errs(), "");
-    };
+  auto ReportError = [](Error Err) {
+    logAllUnhandledErrors(std::move(Err), llvm::errs());
+  };
 
   RPCEndpoint ClientEP(*Channels.first, true);
   RemoteObjectClientLayer<RPCEndpoint> Client(ClientEP, ReportError);
@@ -489,10 +487,9 @@ TEST(RemoteObjectLayer, EmitAndFinalize) {
 
   auto Channels = createPairedQueueChannels();
 
-  auto ReportError =
-    [](Error Err) {
-      logAllUnhandledErrors(std::move(Err), llvm::errs(), "");
-    };
+  auto ReportError = [](Error Err) {
+    logAllUnhandledErrors(std::move(Err), llvm::errs());
+  };
 
   RPCEndpoint ClientEP(*Channels.first, true);
   RemoteObjectClientLayer<RPCEndpoint> Client(ClientEP, ReportError);
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index be29b41309a440ffcbefa77de37192047ccf8a34..db174fa2ff76585bcf6795816e0b1c00a0575e2e 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -481,14 +481,16 @@ TEST_F(IRBuilderTest, createFunction) {
   auto CU =
       DIB.createCompileUnit(dwarf::DW_LANG_Swift, File, "swiftc", true, "", 0);
   auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
-  auto NoErr = DIB.createFunction(CU, "noerr", "", File, 1, Type, false, true, 1,
-                               DINode::FlagZero, true);
+  auto NoErr = DIB.createFunction(
+      CU, "noerr", "", File, 1, Type, 1, DINode::FlagZero,
+      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
   EXPECT_TRUE(!NoErr->getThrownTypes());
   auto Int = DIB.createBasicType("Int", 64, dwarf::DW_ATE_signed);
   auto Error = DIB.getOrCreateArray({Int});
-  auto Err =
-      DIB.createFunction(CU, "err", "", File, 1, Type, false, true, 1,
-      DINode::FlagZero, true, nullptr, nullptr, Error.get());
+  auto Err = DIB.createFunction(
+      CU, "err", "", File, 1, Type, 1, DINode::FlagZero,
+      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized, nullptr,
+      nullptr, Error.get());
   EXPECT_TRUE(Err->getThrownTypes().get() == Error.get());
   DIB.finalize();
 }
@@ -501,12 +503,14 @@ TEST_F(IRBuilderTest, DIBuilder) {
                                   DIB.createFile("F.CBL", "/"), "llvm-cobol74",
                                   true, "", 0);
   auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
-  auto SP = DIB.createFunction(CU, "foo", "", File, 1, Type, false, true, 1,
-                               DINode::FlagZero, true);
+  auto SP = DIB.createFunction(
+      CU, "foo", "", File, 1, Type, 1, DINode::FlagZero,
+      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
   F->setSubprogram(SP);
   AllocaInst *I = Builder.CreateAlloca(Builder.getInt8Ty());
-  auto BarSP = DIB.createFunction(CU, "bar", "", File, 1, Type, false, true, 1,
-                                  DINode::FlagZero, true);
+  auto BarSP = DIB.createFunction(
+      CU, "bar", "", File, 1, Type, 1, DINode::FlagZero,
+      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
   auto BadScope = DIB.createLexicalBlockFile(BarSP, File, 0);
   I->setDebugLoc(DebugLoc::get(2, 0, BadScope));
   DIB.finalize();
@@ -521,10 +525,10 @@ TEST_F(IRBuilderTest, createArtificialSubprogram) {
                                   /*isOptimized=*/true, /*Flags=*/"",
                                   /*Runtime Version=*/0);
   auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
-  auto SP = DIB.createFunction(CU, "foo", /*LinkageName=*/"", File,
-                               /*LineNo=*/1, Type, /*isLocalToUnit=*/false,
-                               /*isDefinition=*/true, /*ScopeLine=*/2,
-                               DINode::FlagZero, /*isOptimized=*/true);
+  auto SP = DIB.createFunction(
+      CU, "foo", /*LinkageName=*/"", File,
+      /*LineNo=*/1, Type, /*ScopeLine=*/2, DINode::FlagZero,
+      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
   EXPECT_TRUE(SP->isDistinct());
 
   F->setSubprogram(SP);
@@ -611,7 +615,8 @@ TEST_F(IRBuilderTest, DebugLoc) {
                                   0);
   auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
   auto SP =
-      DIB.createFunction(CU, "foo", "foo", File, 1, SPType, false, true, 1);
+      DIB.createFunction(CU, "foo", "foo", File, 1, SPType, 1, DINode::FlagZero,
+                         DISubprogram::SPFlagDefinition);
   DebugLoc DL1 = DILocation::get(Ctx, 2, 0, SP);
   DebugLoc DL2 = DILocation::get(Ctx, 3, 0, SP);
 
diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp
index 100c4ed5e159078a7b2ca49631286160450544b4..3f744cd5e6d5fb0ee37276b1a3f17d6ab9ed70a6 100644
--- a/unittests/IR/MetadataTest.cpp
+++ b/unittests/IR/MetadataTest.cpp
@@ -84,9 +84,9 @@ protected:
                                          getNode(nullptr));
   }
   DISubprogram *getSubprogram() {
-    return DISubprogram::getDistinct(Context, nullptr, "", "", nullptr, 0,
-                                     nullptr, false, false, 0, nullptr, 0, 0, 0,
-                                     DINode::FlagZero, false, nullptr);
+    return DISubprogram::getDistinct(
+        Context, nullptr, "", "", nullptr, 0, nullptr, 0, nullptr, 0, 0,
+        DINode::FlagZero, DISubprogram::SPFlagZero, nullptr);
   }
   DIFile *getFile() {
     return DIFile::getDistinct(Context, "file.c", "/path/to/dir");
@@ -96,7 +96,7 @@ protected:
         Context, 1, getFile(), "clang", false, "-g", 2, "",
         DICompileUnit::FullDebug, getTuple(), getTuple(), getTuple(),
         getTuple(), getTuple(), 0, true, false,
-        DICompileUnit::DebugNameTableKind::Default);
+        DICompileUnit::DebugNameTableKind::Default, false);
   }
   DIType *getBasicType(StringRef Name) {
     return DIBasicType::get(Context, dwarf::DW_TAG_unspecified_type, Name);
@@ -147,11 +147,9 @@ TEST_F(MDStringTest, CreateSame) {
 
 // Test that MDString prints out the string we fed it.
 TEST_F(MDStringTest, PrintingSimple) {
-  char *str = new char[13];
-  strncpy(str, "testing 1 2 3", 13);
-  MDString *s = MDString::get(Context, StringRef(str, 13));
-  strncpy(str, "aaaaaaaaaaaaa", 13);
-  delete[] str;
+  char str[14] = "testing 1 2 3";
+  MDString *s = MDString::get(Context, StringRef(&str[0], 13));
+  strncpy(str, "aaaaaaaaaaaaa", 14);
 
   std::string Str;
   raw_string_ostream oss(Str);
@@ -929,11 +927,11 @@ TEST_F(DILocationTest, Merge) {
     // Different function, same inlined-at.
     auto *F = getFile();
     auto *SP1 = DISubprogram::getDistinct(Context, F, "a", "a", F, 0, nullptr,
-                                          false, false, 0, nullptr, 0, 0, 0,
-                                          DINode::FlagZero, false, nullptr);
+                                          0, nullptr, 0, 0, DINode::FlagZero,
+                                          DISubprogram::SPFlagZero, nullptr);
     auto *SP2 = DISubprogram::getDistinct(Context, F, "b", "b", F, 0, nullptr,
-                                          false, false, 0, nullptr, 0, 0, 0,
-                                          DINode::FlagZero, false, nullptr);
+                                          0, nullptr, 0, 0, DINode::FlagZero,
+                                          DISubprogram::SPFlagZero, nullptr);
 
     auto *I = DILocation::get(Context, 2, 7, N);
     auto *A = DILocation::get(Context, 1, 6, SP1, I);
@@ -1607,7 +1605,7 @@ TEST_F(DICompileUnitTest, get) {
       Context, SourceLanguage, File, Producer, IsOptimized, Flags,
       RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
       RetainedTypes, GlobalVariables, ImportedEntities, Macros, DWOId, true,
-      false, DICompileUnit::DebugNameTableKind::Default);
+      false, DICompileUnit::DebugNameTableKind::Default, false);
 
   EXPECT_EQ(dwarf::DW_TAG_compile_unit, N->getTag());
   EXPECT_EQ(SourceLanguage, N->getSourceLanguage());
@@ -1665,7 +1663,7 @@ TEST_F(DICompileUnitTest, replaceArrays) {
       Context, SourceLanguage, File, Producer, IsOptimized, Flags,
       RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
       RetainedTypes, nullptr, ImportedEntities, nullptr, DWOId, true, false,
-      DICompileUnit::DebugNameTableKind::Default);
+      DICompileUnit::DebugNameTableKind::Default, false);
 
   auto *GlobalVariables = MDTuple::getDistinct(Context, None);
   EXPECT_EQ(nullptr, N->getGlobalVariables().get());
@@ -1705,12 +1703,16 @@ TEST_F(DISubprogramTest, get) {
   MDTuple *RetainedNodes = getTuple();
   MDTuple *ThrownTypes = getTuple();
   DICompileUnit *Unit = getUnit();
+  DISubprogram::DISPFlags SPFlags =
+      static_cast<DISubprogram::DISPFlags>(Virtuality);
+  assert(!IsLocalToUnit && IsDefinition && !IsOptimized &&
+         "bools and SPFlags have to match");
+  SPFlags |= DISubprogram::SPFlagDefinition;
 
   auto *N = DISubprogram::get(
-      Context, Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
-      IsDefinition, ScopeLine, ContainingType, Virtuality, VirtualIndex,
-      ThisAdjustment, Flags, IsOptimized, Unit, TemplateParams, Declaration,
-      RetainedNodes, ThrownTypes);
+      Context, Scope, Name, LinkageName, File, Line, Type, ScopeLine,
+      ContainingType, VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit,
+      TemplateParams, Declaration, RetainedNodes, ThrownTypes);
 
   EXPECT_EQ(dwarf::DW_TAG_subprogram, N->getTag());
   EXPECT_EQ(Scope, N->getScope());
@@ -1733,108 +1735,101 @@ TEST_F(DISubprogramTest, get) {
   EXPECT_EQ(Declaration, N->getDeclaration());
   EXPECT_EQ(RetainedNodes, N->getRetainedNodes().get());
   EXPECT_EQ(ThrownTypes, N->getThrownTypes().get());
-  EXPECT_EQ(N, DISubprogram::get(
-                   Context, Scope, Name, LinkageName, File, Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
+  EXPECT_EQ(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, ScopeLine, ContainingType, VirtualIndex,
+                                 ThisAdjustment, Flags, SPFlags, Unit,
+                                 TemplateParams, Declaration, RetainedNodes,
+                                 ThrownTypes));
 
-  EXPECT_NE(N, DISubprogram::get(
-                   Context, getCompositeType(), Name, LinkageName, File, Line,
-                   Type, IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
-  EXPECT_NE(N, DISubprogram::get(
-                   Context, Scope, "other", LinkageName, File, Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
-  EXPECT_NE(N, DISubprogram::get(
-                   Context, Scope, Name, "other", File, Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
-  EXPECT_NE(N, DISubprogram::get(
-                   Context, Scope, Name, LinkageName, getFile(), Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
-  EXPECT_NE(N, DISubprogram::get(
-                   Context, Scope, Name, LinkageName, File, Line + 1, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
+  EXPECT_NE(N, DISubprogram::get(Context, getCompositeType(), Name, LinkageName,
+                                 File, Line, Type, ScopeLine, ContainingType,
+                                 VirtualIndex, ThisAdjustment, Flags, SPFlags,
+                                 Unit, TemplateParams, Declaration,
+                                 RetainedNodes, ThrownTypes));
+  EXPECT_NE(N, DISubprogram::get(Context, Scope, "other", LinkageName, File,
+                                 Line, Type, ScopeLine, ContainingType,
+                                 VirtualIndex, ThisAdjustment, Flags, SPFlags,
+                                 Unit, TemplateParams, Declaration,
+                                 RetainedNodes, ThrownTypes));
+  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, "other", File, Line,
+                                 Type, ScopeLine, ContainingType, VirtualIndex,
+                                 ThisAdjustment, Flags, SPFlags, Unit,
+                                 TemplateParams, Declaration, RetainedNodes,
+                                 ThrownTypes));
+  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, getFile(),
+                                 Line, Type, ScopeLine, ContainingType,
+                                 VirtualIndex, ThisAdjustment, Flags, SPFlags,
+                                 Unit, TemplateParams, Declaration,
+                                 RetainedNodes, ThrownTypes));
+  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File,
+                                 Line + 1, Type, ScopeLine, ContainingType,
+                                 VirtualIndex, ThisAdjustment, Flags, SPFlags,
+                                 Unit, TemplateParams, Declaration,
+                                 RetainedNodes, ThrownTypes));
   EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
-                                 getSubroutineType(), IsLocalToUnit,
-                                 IsDefinition, ScopeLine, ContainingType,
-                                 Virtuality, VirtualIndex, ThisAdjustment,
-                                 Flags, IsOptimized, Unit, TemplateParams,
-                                 Declaration, RetainedNodes, ThrownTypes));
+                                 getSubroutineType(), ScopeLine, ContainingType,
+                                 VirtualIndex, ThisAdjustment, Flags, SPFlags,
+                                 Unit, TemplateParams, Declaration,
+                                 RetainedNodes, ThrownTypes));
   EXPECT_NE(N, DISubprogram::get(
                    Context, Scope, Name, LinkageName, File, Line, Type,
-                   !IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
+                   ScopeLine, ContainingType, VirtualIndex, ThisAdjustment,
+                   Flags, SPFlags ^ DISubprogram::SPFlagLocalToUnit, Unit,
+                   TemplateParams, Declaration, RetainedNodes, ThrownTypes));
   EXPECT_NE(N, DISubprogram::get(
                    Context, Scope, Name, LinkageName, File, Line, Type,
-                   IsLocalToUnit, !IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
+                   ScopeLine, ContainingType, VirtualIndex, ThisAdjustment,
+                   Flags, SPFlags ^ DISubprogram::SPFlagDefinition, Unit,
+                   TemplateParams, Declaration, RetainedNodes, ThrownTypes));
+  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, ScopeLine + 1, ContainingType,
+                                 VirtualIndex, ThisAdjustment, Flags, SPFlags,
+                                 Unit, TemplateParams, Declaration,
+                                 RetainedNodes, ThrownTypes));
+  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, ScopeLine, getCompositeType(),
+                                 VirtualIndex, ThisAdjustment, Flags, SPFlags,
+                                 Unit, TemplateParams, Declaration,
+                                 RetainedNodes, ThrownTypes));
   EXPECT_NE(N, DISubprogram::get(
                    Context, Scope, Name, LinkageName, File, Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine + 1, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
+                   ScopeLine, ContainingType, VirtualIndex, ThisAdjustment,
+                   Flags, SPFlags ^ DISubprogram::SPFlagVirtual, Unit,
+                   TemplateParams, Declaration, RetainedNodes, ThrownTypes));
+  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, ScopeLine, ContainingType,
+                                 VirtualIndex + 1, ThisAdjustment, Flags,
+                                 SPFlags, Unit, TemplateParams, Declaration,
+                                 RetainedNodes, ThrownTypes));
   EXPECT_NE(N, DISubprogram::get(
                    Context, Scope, Name, LinkageName, File, Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, getCompositeType(),
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, ThrownTypes));
-  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
-                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
-                                 ContainingType, Virtuality + 1, VirtualIndex,
-                                 ThisAdjustment, Flags, IsOptimized, Unit,
-                                 TemplateParams, Declaration, RetainedNodes,
-                                 ThrownTypes));
-  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
-                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
-                                 ContainingType, Virtuality, VirtualIndex + 1,
-                                 ThisAdjustment, Flags, IsOptimized, Unit,
-                                 TemplateParams, Declaration, RetainedNodes,
-                                 ThrownTypes));
-  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
-                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
-                                 ContainingType, Virtuality, VirtualIndex,
-                                 ThisAdjustment, Flags, !IsOptimized, Unit,
-                                 TemplateParams, Declaration, RetainedNodes,
-                                 ThrownTypes));
+                   ScopeLine, ContainingType, VirtualIndex, ThisAdjustment,
+                   Flags, SPFlags ^ DISubprogram::SPFlagOptimized, Unit,
+                   TemplateParams, Declaration, RetainedNodes, ThrownTypes));
   EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
-                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
-                                 ContainingType, Virtuality, VirtualIndex,
-                                 ThisAdjustment, Flags, IsOptimized, nullptr,
+                                 Type, ScopeLine, ContainingType, VirtualIndex,
+                                 ThisAdjustment, Flags, SPFlags, nullptr,
                                  TemplateParams, Declaration, RetainedNodes,
                                  ThrownTypes));
-  EXPECT_NE(N, DISubprogram::get(
-                   Context, Scope, Name, LinkageName, File, Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, getTuple(), Declaration, RetainedNodes, ThrownTypes));
+  EXPECT_NE(N,
+            DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                              Type, ScopeLine, ContainingType, VirtualIndex,
+                              ThisAdjustment, Flags, SPFlags, Unit, getTuple(),
+                              Declaration, RetainedNodes, ThrownTypes));
   EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
-                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
-                                 ContainingType, Virtuality, VirtualIndex,
-                                 ThisAdjustment, Flags, IsOptimized, Unit,
+                                 Type, ScopeLine, ContainingType, VirtualIndex,
+                                 ThisAdjustment, Flags, SPFlags, Unit,
                                  TemplateParams, getSubprogram(), RetainedNodes,
                                  ThrownTypes));
   EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
-                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
-                                 ContainingType, Virtuality, VirtualIndex,
-                                 ThisAdjustment, Flags, IsOptimized, Unit,
+                                 Type, ScopeLine, ContainingType, VirtualIndex,
+                                 ThisAdjustment, Flags, SPFlags, Unit,
                                  TemplateParams, Declaration, getTuple()));
-  EXPECT_NE(N, DISubprogram::get(
-                   Context, Scope, Name, LinkageName, File, Line, Type,
-                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
-                   Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized,
-                   Unit, TemplateParams, Declaration, RetainedNodes, getTuple()));
+  EXPECT_NE(N, DISubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, ScopeLine, ContainingType, VirtualIndex,
+                                 ThisAdjustment, Flags, SPFlags, Unit,
+                                 TemplateParams, Declaration, RetainedNodes,
+                                 getTuple()));
 
   TempDISubprogram Temp = N->clone();
   EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
diff --git a/unittests/IR/PassBuilderCallbacksTest.cpp b/unittests/IR/PassBuilderCallbacksTest.cpp
index 20c47b045e75cbc67bb11e8f3eb3e46d24192a8d..63fdc67b11524f53d97a3068e4102478a40ad1ed 100644
--- a/unittests/IR/PassBuilderCallbacksTest.cpp
+++ b/unittests/IR/PassBuilderCallbacksTest.cpp
@@ -167,6 +167,11 @@ struct MockPassHandle<Loop>
   MOCK_METHOD4(run,
                PreservedAnalyses(Loop &, LoopAnalysisManager &,
                                  LoopStandardAnalysisResults &, LPMUpdater &));
+  static void invalidateLoop(Loop &L, LoopAnalysisManager &,
+                             LoopStandardAnalysisResults &,
+                             LPMUpdater &Updater) {
+    Updater.markLoopAsDeleted(L, L.getName());
+  }
   MockPassHandle() { setDefaults(); }
 };
 
@@ -187,6 +192,11 @@ struct MockPassHandle<LazyCallGraph::SCC>
                PreservedAnalyses(LazyCallGraph::SCC &, CGSCCAnalysisManager &,
                                  LazyCallGraph &G, CGSCCUpdateResult &UR));
 
+  static void invalidateSCC(LazyCallGraph::SCC &C, CGSCCAnalysisManager &,
+                            LazyCallGraph &, CGSCCUpdateResult &UR) {
+    UR.InvalidatedSCCs.insert(&C);
+  }
+
   MockPassHandle() { setDefaults(); }
 };
 
@@ -310,6 +320,7 @@ struct MockPassInstrumentationCallbacks {
   }
   MOCK_METHOD2(runBeforePass, bool(StringRef PassID, llvm::Any));
   MOCK_METHOD2(runAfterPass, void(StringRef PassID, llvm::Any));
+  MOCK_METHOD1(runAfterPassInvalidated, void(StringRef PassID));
   MOCK_METHOD2(runBeforeAnalysis, void(StringRef PassID, llvm::Any));
   MOCK_METHOD2(runAfterAnalysis, void(StringRef PassID, llvm::Any));
 
@@ -319,6 +330,8 @@ struct MockPassInstrumentationCallbacks {
     });
     Callbacks.registerAfterPassCallback(
         [this](StringRef P, llvm::Any IR) { this->runAfterPass(P, IR); });
+    Callbacks.registerAfterPassInvalidatedCallback(
+        [this](StringRef P) { this->runAfterPassInvalidated(P); });
     Callbacks.registerBeforeAnalysisCallback([this](StringRef P, llvm::Any IR) {
       return this->runBeforeAnalysis(P, IR);
     });
@@ -571,6 +584,11 @@ TEST_F(FunctionCallbacksTest, InstrumentedPasses) {
               runAfterPass(HasNameRegex("MockPassHandle"), HasName("foo")))
       .InSequence(PISequence);
 
+  // Our mock pass does not invalidate IR.
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("MockPassHandle")))
+      .Times(0);
+
   StringRef PipelineText = "test-transform";
   ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
@@ -595,6 +613,9 @@ TEST_F(FunctionCallbacksTest, InstrumentedSkippedPasses) {
   // as well.
   EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("MockPassHandle"), _))
       .Times(0);
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("MockPassHandle")))
+      .Times(0);
   EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("MockPassHandle"), _))
       .Times(0);
   EXPECT_CALL(CallbacksHandle,
@@ -650,6 +671,55 @@ TEST_F(LoopCallbacksTest, InstrumentedPasses) {
               runAfterPass(HasNameRegex("MockPassHandle"), HasName("loop")))
       .InSequence(PISequence);
 
+  // Our mock pass does not invalidate IR.
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("MockPassHandle")))
+      .Times(0);
+
+  StringRef PipelineText = "test-transform";
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+      << "Pipeline was: " << PipelineText;
+  PM.run(*M, AM);
+}
+
+TEST_F(LoopCallbacksTest, InstrumentedInvalidatingPasses) {
+  CallbacksHandle.registerPassInstrumentation();
+  // Non-mock instrumentation not specifically mentioned below can be ignored.
+  CallbacksHandle.ignoreNonMockPassInstrumentation("<string>");
+  CallbacksHandle.ignoreNonMockPassInstrumentation("foo");
+  CallbacksHandle.ignoreNonMockPassInstrumentation("loop");
+
+  EXPECT_CALL(AnalysisHandle, run(HasName("loop"), _, _));
+  EXPECT_CALL(PassHandle, run(HasName("loop"), _, _, _))
+      .WillOnce(DoAll(WithArgs<0, 1, 2, 3>(Invoke(PassHandle.invalidateLoop)),
+                      WithArgs<0, 1, 2>(Invoke(getAnalysisResult))));
+
+  // PassInstrumentation calls should happen in-sequence, in the same order
+  // as passes/analyses are scheduled.
+  ::testing::Sequence PISequence;
+  EXPECT_CALL(CallbacksHandle,
+              runBeforePass(HasNameRegex("MockPassHandle"), HasName("loop")))
+      .InSequence(PISequence);
+  EXPECT_CALL(
+      CallbacksHandle,
+      runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("loop")))
+      .InSequence(PISequence);
+  EXPECT_CALL(
+      CallbacksHandle,
+      runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("loop")))
+      .InSequence(PISequence);
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("MockPassHandle")))
+      .InSequence(PISequence);
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("^PassManager")))
+      .InSequence(PISequence);
+
+  // Our mock pass invalidates IR, thus normal runAfterPass is never called.
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPass(HasNameRegex("MockPassHandle"), HasName("loop")))
+      .Times(0);
+
   StringRef PipelineText = "test-transform";
   ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
@@ -675,6 +745,9 @@ TEST_F(LoopCallbacksTest, InstrumentedSkippedPasses) {
   // as well.
   EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("MockPassHandle"), _))
       .Times(0);
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("MockPassHandle")))
+      .Times(0);
   EXPECT_CALL(CallbacksHandle,
               runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), _))
       .Times(0);
@@ -727,6 +800,54 @@ TEST_F(CGSCCCallbacksTest, InstrumentedPasses) {
               runAfterPass(HasNameRegex("MockPassHandle"), HasName("(foo)")))
       .InSequence(PISequence);
 
+  // Our mock pass does not invalidate IR.
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("MockPassHandle")))
+      .Times(0);
+
+  StringRef PipelineText = "test-transform";
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+      << "Pipeline was: " << PipelineText;
+  PM.run(*M, AM);
+}
+
+TEST_F(CGSCCCallbacksTest, InstrumentedInvalidatingPasses) {
+  CallbacksHandle.registerPassInstrumentation();
+  // Non-mock instrumentation not specifically mentioned below can be ignored.
+  CallbacksHandle.ignoreNonMockPassInstrumentation("<string>");
+  CallbacksHandle.ignoreNonMockPassInstrumentation("(foo)");
+
+  EXPECT_CALL(AnalysisHandle, run(HasName("(foo)"), _, _));
+  EXPECT_CALL(PassHandle, run(HasName("(foo)"), _, _, _))
+      .WillOnce(DoAll(WithArgs<0, 1, 2, 3>(Invoke(PassHandle.invalidateSCC)),
+                      WithArgs<0, 1, 2>(Invoke(getAnalysisResult))));
+
+  // PassInstrumentation calls should happen in-sequence, in the same order
+  // as passes/analyses are scheduled.
+  ::testing::Sequence PISequence;
+  EXPECT_CALL(CallbacksHandle,
+              runBeforePass(HasNameRegex("MockPassHandle"), HasName("(foo)")))
+      .InSequence(PISequence);
+  EXPECT_CALL(
+      CallbacksHandle,
+      runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("(foo)")))
+      .InSequence(PISequence);
+  EXPECT_CALL(
+      CallbacksHandle,
+      runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("(foo)")))
+      .InSequence(PISequence);
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("MockPassHandle")))
+      .InSequence(PISequence);
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("^PassManager")))
+      .InSequence(PISequence);
+
+  // Our mock pass does invalidate IR, thus normal runAfterPass is never called.
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPass(HasNameRegex("MockPassHandle"), HasName("(foo)")))
+      .Times(0);
+
   StringRef PipelineText = "test-transform";
   ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
@@ -752,6 +873,9 @@ TEST_F(CGSCCCallbacksTest, InstrumentedSkippedPasses) {
   // as well.
   EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("MockPassHandle"), _))
       .Times(0);
+  EXPECT_CALL(CallbacksHandle,
+              runAfterPassInvalidated(HasNameRegex("MockPassHandle")))
+      .Times(0);
   EXPECT_CALL(CallbacksHandle,
               runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), _))
       .Times(0);
diff --git a/unittests/IR/PatternMatch.cpp b/unittests/IR/PatternMatch.cpp
index 6b5686d53c66629c675fa513c10e178db1a70d48..976e42d0dbba1ce4305f785041b953d7aae6e43c 100644
--- a/unittests/IR/PatternMatch.cpp
+++ b/unittests/IR/PatternMatch.cpp
@@ -534,6 +534,62 @@ TEST_F(PatternMatchTest, VectorOps) {
   EXPECT_TRUE(A == Val);
 }
 
+TEST_F(PatternMatchTest, VectorUndefInt) {
+  Type *ScalarTy = IRB.getInt8Ty();
+  Type *VectorTy = VectorType::get(ScalarTy, 4);
+  Constant *ScalarUndef = UndefValue::get(ScalarTy);
+  Constant *VectorUndef = UndefValue::get(VectorTy);
+  Constant *ScalarZero = Constant::getNullValue(ScalarTy);
+  Constant *VectorZero = Constant::getNullValue(VectorTy);
+
+  SmallVector<Constant *, 4> Elems;
+  Elems.push_back(ScalarUndef);
+  Elems.push_back(ScalarZero);
+  Elems.push_back(ScalarUndef);
+  Elems.push_back(ScalarZero);
+  Constant *VectorZeroUndef = ConstantVector::get(Elems);
+
+  EXPECT_TRUE(match(ScalarUndef, m_Undef()));
+  EXPECT_TRUE(match(VectorUndef, m_Undef()));
+  EXPECT_FALSE(match(ScalarZero, m_Undef()));
+  EXPECT_FALSE(match(VectorZero, m_Undef()));
+  EXPECT_FALSE(match(VectorZeroUndef, m_Undef()));
+
+  EXPECT_FALSE(match(ScalarUndef, m_Zero()));
+  EXPECT_FALSE(match(VectorUndef, m_Zero()));
+  EXPECT_TRUE(match(ScalarZero, m_Zero()));
+  EXPECT_TRUE(match(VectorZero, m_Zero()));
+  EXPECT_TRUE(match(VectorZeroUndef, m_Zero()));
+}
+
+TEST_F(PatternMatchTest, VectorUndefFloat) {
+  Type *ScalarTy = IRB.getFloatTy();
+  Type *VectorTy = VectorType::get(ScalarTy, 4);
+  Constant *ScalarUndef = UndefValue::get(ScalarTy);
+  Constant *VectorUndef = UndefValue::get(VectorTy);
+  Constant *ScalarZero = Constant::getNullValue(ScalarTy);
+  Constant *VectorZero = Constant::getNullValue(VectorTy);
+
+  SmallVector<Constant *, 4> Elems;
+  Elems.push_back(ScalarUndef);
+  Elems.push_back(ScalarZero);
+  Elems.push_back(ScalarUndef);
+  Elems.push_back(ScalarZero);
+  Constant *VectorZeroUndef = ConstantVector::get(Elems);
+
+  EXPECT_TRUE(match(ScalarUndef, m_Undef()));
+  EXPECT_TRUE(match(VectorUndef, m_Undef()));
+  EXPECT_FALSE(match(ScalarZero, m_Undef()));
+  EXPECT_FALSE(match(VectorZero, m_Undef()));
+  EXPECT_FALSE(match(VectorZeroUndef, m_Undef()));
+
+  EXPECT_FALSE(match(ScalarUndef, m_AnyZeroFP()));
+  EXPECT_FALSE(match(VectorUndef, m_AnyZeroFP()));
+  EXPECT_TRUE(match(ScalarZero, m_AnyZeroFP()));
+  EXPECT_TRUE(match(VectorZero, m_AnyZeroFP()));
+  EXPECT_TRUE(match(VectorZeroUndef, m_AnyZeroFP()));
+}
+
 template <typename T> struct MutableConstTest : PatternMatchTest { };
 
 typedef ::testing::Types<std::tuple<Value*, Instruction*>,
diff --git a/unittests/IR/VerifierTest.cpp b/unittests/IR/VerifierTest.cpp
index ac94eb102d6a6fc05424e9340e811070442a8d0c..53e5d6fbbe4d5cde69772b5e36e2f5c8b0d07ff1 100644
--- a/unittests/IR/VerifierTest.cpp
+++ b/unittests/IR/VerifierTest.cpp
@@ -178,9 +178,10 @@ TEST(VerifierTest, DetectInvalidDebugInfo) {
         "f", FunctionType::get(Type::getVoidTy(C), false)));
     IRBuilder<> Builder(BasicBlock::Create(C, "", F));
     Builder.CreateUnreachable();
-    F->setSubprogram(DIB.createFunction(CU, "f", "f",
-                                        DIB.createFile("broken.c", "/"), 1,
-                                        nullptr, true, true, 1));
+    F->setSubprogram(DIB.createFunction(
+        CU, "f", "f", DIB.createFile("broken.c", "/"), 1, nullptr, 1,
+        DINode::FlagZero,
+        DISubprogram::SPFlagLocalToUnit | DISubprogram::SPFlagDefinition));
     DIB.finalize();
     EXPECT_FALSE(verifyModule(M));
 
diff --git a/unittests/ProfileData/SampleProfTest.cpp b/unittests/ProfileData/SampleProfTest.cpp
index 67e6e9fc95b958634ae4e04aa17c104cbddae5e3..a31eccefb4d65cc0bad2b26a03fa744de7ccb75c 100644
--- a/unittests/ProfileData/SampleProfTest.cpp
+++ b/unittests/ProfileData/SampleProfTest.cpp
@@ -128,11 +128,15 @@ struct SampleProfTest : ::testing::Test {
 
     FunctionSamples *ReadFooSamples = Reader->getSamplesFor(FooName);
     ASSERT_TRUE(ReadFooSamples != nullptr);
+    if (Format != SampleProfileFormat::SPF_Compact_Binary)
+      ASSERT_EQ("_Z3fooi", ReadFooSamples->getName());
     ASSERT_EQ(7711u, ReadFooSamples->getTotalSamples());
     ASSERT_EQ(610u, ReadFooSamples->getHeadSamples());
 
     FunctionSamples *ReadBarSamples = Reader->getSamplesFor(BarName);
     ASSERT_TRUE(ReadBarSamples != nullptr);
+    if (Format != SampleProfileFormat::SPF_Compact_Binary)
+      ASSERT_EQ("_Z3bari", ReadBarSamples->getName());
     ASSERT_EQ(20301u, ReadBarSamples->getTotalSamples());
     ASSERT_EQ(1437u, ReadBarSamples->getHeadSamples());
     ErrorOr<SampleRecord::CallTargetMap> CTMap =
diff --git a/unittests/Support/DynamicLibrary/CMakeLists.txt b/unittests/Support/DynamicLibrary/CMakeLists.txt
index ec202295ac15a6a5b6e49e8b25b26d318e51dd7e..1ea9826a3d155af918b0588f0b5f5757d69615f2 100644
--- a/unittests/Support/DynamicLibrary/CMakeLists.txt
+++ b/unittests/Support/DynamicLibrary/CMakeLists.txt
@@ -8,6 +8,13 @@ add_library(DynamicLibraryLib STATIC
   )
 set_target_properties(DynamicLibraryLib PROPERTIES FOLDER "Tests")
 
+# extract_symbols.py relies on all its library arguments being in the same
+# directory, so we must set the output directory in the same way as if
+# add_llvm_library was used.
+set_output_directory(DynamicLibraryLib
+  LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}
+  )
+
 add_llvm_unittest(DynamicLibraryTests
   DynamicLibraryTest.cpp
   )
diff --git a/unittests/Support/ErrorTest.cpp b/unittests/Support/ErrorTest.cpp
index ed1e2bb458ba9eb221eceee77a6cb7519cc470c1..eee2fe2d649379b9b32cc7813358d05023567e78 100644
--- a/unittests/Support/ErrorTest.cpp
+++ b/unittests/Support/ErrorTest.cpp
@@ -434,9 +434,8 @@ TEST(Error, CatchErrorFromHandler) {
 TEST(Error, StringError) {
   std::string Msg;
   raw_string_ostream S(Msg);
-  logAllUnhandledErrors(make_error<StringError>("foo" + Twine(42),
-                                                inconvertibleErrorCode()),
-                        S, "");
+  logAllUnhandledErrors(
+      make_error<StringError>("foo" + Twine(42), inconvertibleErrorCode()), S);
   EXPECT_EQ(S.str(), "foo42\n") << "Unexpected StringError log result";
 
   auto EC =
@@ -451,13 +450,13 @@ TEST(Error, createStringError) {
   std::string Msg;
   raw_string_ostream S(Msg);
   logAllUnhandledErrors(createStringError(EC, "foo%s%d0x%" PRIx8, Bar, 1, 0xff),
-                        S, "");
+                        S);
   EXPECT_EQ(S.str(), "foobar10xff\n")
     << "Unexpected createStringError() log result";
 
   S.flush();
   Msg.clear();
-  logAllUnhandledErrors(createStringError(EC, Bar), S, "");
+  logAllUnhandledErrors(createStringError(EC, Bar), S);
   EXPECT_EQ(S.str(), "bar\n")
     << "Unexpected createStringError() (overloaded) log result";
 
diff --git a/unittests/Support/Host.cpp b/unittests/Support/Host.cpp
index 623591803e07abc8608ab80359c1181f5c1af146..0d257a705910d214c851ed0794013ac84366b6cc 100644
--- a/unittests/Support/Host.cpp
+++ b/unittests/Support/Host.cpp
@@ -242,6 +242,11 @@ CPU part	: 0x0a1
                                               "CPU implementer	: 0x43\n"
                                               "CPU part	: 0xa1"),
             "thunderxt88");
+
+  // Verify HiSilicon processors.
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x48\n"
+                                              "CPU part        : 0xd01"),
+            "tsv110");
 }
 
 #if defined(__APPLE__)
diff --git a/unittests/Support/MemoryTest.cpp b/unittests/Support/MemoryTest.cpp
index 650be7b6f1dde08901353df0e4ff3c5e561a461c..bc2fba05f6ad653c8ac3679e710d4374b0be9183 100644
--- a/unittests/Support/MemoryTest.cpp
+++ b/unittests/Support/MemoryTest.cpp
@@ -10,13 +10,43 @@
 #include "llvm/Support/Memory.h"
 #include "llvm/Support/Process.h"
 #include "gtest/gtest.h"
+#include <cassert>
 #include <cstdlib>
 
+#if defined(__NetBSD__)
+// clang-format off
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <err.h>
+#include <unistd.h>
+// clang-format on
+#endif
+
 using namespace llvm;
 using namespace sys;
 
 namespace {
 
+bool IsMPROTECT() {
+#if defined(__NetBSD__)
+  int mib[3];
+  int paxflags;
+  size_t len = sizeof(paxflags);
+
+  mib[0] = CTL_PROC;
+  mib[1] = getpid();
+  mib[2] = PROC_PID_PAXFLAGS;
+
+  if (sysctl(mib, 3, &paxflags, &len, NULL, 0) != 0)
+    err(EXIT_FAILURE, "sysctl");
+
+  return !!(paxflags & CTL_PROC_PAXFLAGS_MPROTECT);
+#else
+  return false;
+#endif
+}
+
 class MappedMemoryTest : public ::testing::TestWithParam<unsigned> {
 public:
   MappedMemoryTest() {
@@ -56,7 +86,16 @@ protected:
   size_t   PageSize;
 };
 
+// MPROTECT prevents W+X mmaps
+#define CHECK_UNSUPPORTED() \
+  do { \
+    if ((Flags & Memory::MF_WRITE) && (Flags & Memory::MF_EXEC) && \
+        IsMPROTECT()) \
+      return; \
+  } while (0)
+
 TEST_P(MappedMemoryTest, AllocAndRelease) {
+  CHECK_UNSUPPORTED();
   std::error_code EC;
   MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), nullptr, Flags,EC);
   EXPECT_EQ(std::error_code(), EC);
@@ -68,6 +107,7 @@ TEST_P(MappedMemoryTest, AllocAndRelease) {
 }
 
 TEST_P(MappedMemoryTest, MultipleAllocAndRelease) {
+  CHECK_UNSUPPORTED();
   std::error_code EC;
   MemoryBlock M1 = Memory::allocateMappedMemory(16, nullptr, Flags, EC);
   EXPECT_EQ(std::error_code(), EC);
@@ -102,6 +142,7 @@ TEST_P(MappedMemoryTest, BasicWrite) {
   if (Flags &&
       !((Flags & Memory::MF_READ) && (Flags & Memory::MF_WRITE)))
     return;
+  CHECK_UNSUPPORTED();
 
   std::error_code EC;
   MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), nullptr, Flags,EC);
@@ -122,6 +163,8 @@ TEST_P(MappedMemoryTest, MultipleWrite) {
   if (Flags &&
       !((Flags & Memory::MF_READ) && (Flags & Memory::MF_WRITE)))
     return;
+  CHECK_UNSUPPORTED();
+
   std::error_code EC;
   MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), nullptr, Flags,
                                                 EC);
@@ -180,6 +223,11 @@ TEST_P(MappedMemoryTest, MultipleWrite) {
 }
 
 TEST_P(MappedMemoryTest, EnabledWrite) {
+  // MPROTECT prevents W+X, and since this test always adds W we need
+  // to block any variant with X.
+  if ((Flags & Memory::MF_EXEC) && IsMPROTECT())
+    return;
+
   std::error_code EC;
   MemoryBlock M1 = Memory::allocateMappedMemory(2 * sizeof(int), nullptr, Flags,
                                                 EC);
@@ -237,6 +285,7 @@ TEST_P(MappedMemoryTest, EnabledWrite) {
 }
 
 TEST_P(MappedMemoryTest, SuccessiveNear) {
+  CHECK_UNSUPPORTED();
   std::error_code EC;
   MemoryBlock M1 = Memory::allocateMappedMemory(16, nullptr, Flags, EC);
   EXPECT_EQ(std::error_code(), EC);
@@ -262,6 +311,7 @@ TEST_P(MappedMemoryTest, SuccessiveNear) {
 }
 
 TEST_P(MappedMemoryTest, DuplicateNear) {
+  CHECK_UNSUPPORTED();
   std::error_code EC;
   MemoryBlock Near((void*)(3*PageSize), 16);
   MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
@@ -284,6 +334,7 @@ TEST_P(MappedMemoryTest, DuplicateNear) {
 }
 
 TEST_P(MappedMemoryTest, ZeroNear) {
+  CHECK_UNSUPPORTED();
   std::error_code EC;
   MemoryBlock Near(nullptr, 0);
   MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
@@ -310,6 +361,7 @@ TEST_P(MappedMemoryTest, ZeroNear) {
 }
 
 TEST_P(MappedMemoryTest, ZeroSizeNear) {
+  CHECK_UNSUPPORTED();
   std::error_code EC;
   MemoryBlock Near((void*)(4*PageSize), 0);
   MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
@@ -336,6 +388,7 @@ TEST_P(MappedMemoryTest, ZeroSizeNear) {
 }
 
 TEST_P(MappedMemoryTest, UnalignedNear) {
+  CHECK_UNSUPPORTED();
   std::error_code EC;
   MemoryBlock Near((void*)(2*PageSize+5), 0);
   MemoryBlock M1 = Memory::allocateMappedMemory(15, &Near, Flags, EC);
diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp
index c18eb929e916f23d74b01e33147d8dbf24080b59..40faa669f87dd9d0597ee9d2c8a27617e2b5afc8 100644
--- a/unittests/Support/Path.cpp
+++ b/unittests/Support/Path.cpp
@@ -516,6 +516,8 @@ TEST_F(FileSystemTest, RealPath) {
   EXPECT_EQ(Expected, Actual);
 
   SmallString<64> HomeDir;
+
+  // This can fail if $HOME is not set and getpwuid fails.
   bool Result = llvm::sys::path::home_directory(HomeDir);
   if (Result) {
     ASSERT_NO_ERROR(fs::real_path(HomeDir, Expected));
@@ -528,6 +530,31 @@ TEST_F(FileSystemTest, RealPath) {
   ASSERT_NO_ERROR(fs::remove_directories(Twine(TestDirectory) + "/test1"));
 }
 
+TEST_F(FileSystemTest, ExpandTilde) {
+  SmallString<64> Expected;
+  SmallString<64> Actual;
+  SmallString<64> HomeDir;
+
+  // This can fail if $HOME is not set and getpwuid fails.
+  bool Result = llvm::sys::path::home_directory(HomeDir);
+  if (Result) {
+    fs::expand_tilde(HomeDir, Expected);
+
+    fs::expand_tilde("~", Actual);
+    EXPECT_EQ(Expected, Actual);
+
+#ifdef _WIN32
+    Expected += "\\foo";
+    fs::expand_tilde("~\\foo", Actual);
+#else
+    Expected += "/foo";
+    fs::expand_tilde("~/foo", Actual);
+#endif
+
+    EXPECT_EQ(Expected, Actual);
+  }
+}
+
 #ifdef LLVM_ON_UNIX
 TEST_F(FileSystemTest, RealPathNoReadPerm) {
   SmallString<64> Expanded;
diff --git a/unittests/Support/ProcessTest.cpp b/unittests/Support/ProcessTest.cpp
index c6ba35fd94158eb29b98ec859d1e150360c0c37e..85b1839ddbcbaa7fa028d373c5de89a5d8e513de 100644
--- a/unittests/Support/ProcessTest.cpp
+++ b/unittests/Support/ProcessTest.cpp
@@ -42,7 +42,7 @@ TEST(ProcessTest, None) {
   Optional<std::string> val(
       Process::GetEnv("__LLVM_TEST_ENVIRON_NO_SUCH_VAR__"));
   EXPECT_FALSE(val.hasValue());
-} 
+}
 #endif
 
 #ifdef _WIN32
diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp
index be87bec3644dbb0d6fa8be90c1b7bd0379a12c55..a6a6150d2b550706d470242b97622cc69c9f1189 100644
--- a/unittests/Support/TargetParserTest.cpp
+++ b/unittests/Support/TargetParserTest.cpp
@@ -488,7 +488,7 @@ TEST(TargetParserTest, testARMExtension) {
                                 ARM::ArchKind::ARMV7EM, "crypto"));
   EXPECT_FALSE(testARMExtension("generic", ARM::ArchKind::ARMV8A, "ras"));
   EXPECT_FALSE(testARMExtension("generic", ARM::ArchKind::ARMV8_1A, "ras"));
-  EXPECT_FALSE(testARMExtension("generic", ARM::ArchKind::ARMV8_2A, "spe"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::ArchKind::ARMV8_2A, "profile"));
   EXPECT_FALSE(testARMExtension("generic", ARM::ArchKind::ARMV8_2A, "fp16"));
   EXPECT_FALSE(testARMExtension("generic", ARM::ArchKind::ARMV8_2A, "fp16fml"));
   EXPECT_FALSE(testARMExtension("generic", ARM::ArchKind::ARMV8_3A, "fp16"));
@@ -508,7 +508,7 @@ TEST(TargetParserTest, testARMExtension) {
 }
 
 TEST(TargetParserTest, ARMFPUVersion) {
-  for (ARM::FPUKind FK = static_cast<ARM::FPUKind>(0); 
+  for (ARM::FPUKind FK = static_cast<ARM::FPUKind>(0);
        FK <= ARM::FPUKind::FK_LAST;
        FK = static_cast<ARM::FPUKind>(static_cast<unsigned>(FK) + 1))
     if (FK == ARM::FK_LAST || ARM::getFPUName(FK) == "invalid" ||
@@ -690,8 +690,6 @@ bool testAArch64CPU(StringRef CPUName, StringRef ExpectedArch,
                     StringRef CPUAttr) {
   AArch64::ArchKind AK = AArch64::parseCPUArch(CPUName);
   bool pass = AArch64::getArchName(AK).equals(ExpectedArch);
-  unsigned FPUKind = AArch64::getDefaultFPU(CPUName, AK);
-  pass &= AArch64::getFPUName(FPUKind).equals(ExpectedFPU);
 
   unsigned ExtKind = AArch64::getDefaultExtensions(CPUName, AK);
   if (ExtKind > 1 && (ExtKind & AArch64::AEK_NONE))
@@ -795,9 +793,16 @@ TEST(TargetParserTest, testAArch64CPU) {
       AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD |
       AArch64::AEK_FP | AArch64::AEK_PROFILE,
       "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "tsv110", "armv8.2-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
+      AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
+      AArch64::AEK_RDM | AArch64::AEK_PROFILE | AArch64::AEK_FP16 |
+      AArch64::AEK_FP16FML | AArch64::AEK_DOTPROD,
+      "8.2-A"));
 }
 
-static constexpr unsigned NumAArch64CPUArchs = 20;
+static constexpr unsigned NumAArch64CPUArchs = 21;
 
 TEST(TargetParserTest, testAArch64CPUArchList) {
   SmallVector<StringRef, NumAArch64CPUArchs> List;
@@ -888,10 +893,14 @@ TEST(TargetParserTest, testAArch64Extension) {
                                     AArch64::ArchKind::INVALID, "fp16"));
   EXPECT_FALSE(testAArch64Extension("cortex-a55",
                                     AArch64::ArchKind::INVALID, "fp16fml"));
+  EXPECT_TRUE(testAArch64Extension("cortex-a55",
+                                    AArch64::ArchKind::INVALID, "dotprod"));
   EXPECT_TRUE(testAArch64Extension("cortex-a75",
                                     AArch64::ArchKind::INVALID, "fp16"));
   EXPECT_FALSE(testAArch64Extension("cortex-a75",
                                     AArch64::ArchKind::INVALID, "fp16fml"));
+  EXPECT_TRUE(testAArch64Extension("cortex-a75",
+                                    AArch64::ArchKind::INVALID, "dotprod"));
   EXPECT_FALSE(testAArch64Extension("thunderx2t99",
                                     AArch64::ArchKind::INVALID, "ras"));
   EXPECT_FALSE(testAArch64Extension("thunderx",
@@ -902,13 +911,29 @@ TEST(TargetParserTest, testAArch64Extension) {
                                     AArch64::ArchKind::INVALID, "lse"));
   EXPECT_FALSE(testAArch64Extension("thunderxt88",
                                     AArch64::ArchKind::INVALID, "lse"));
+  EXPECT_TRUE(testAArch64Extension("tsv110",
+                                   AArch64::ArchKind::INVALID, "crypto"));
+  EXPECT_FALSE(testAArch64Extension("tsv110",
+                                    AArch64::ArchKind::INVALID, "sha3"));
+  EXPECT_FALSE(testAArch64Extension("tsv110",
+                                    AArch64::ArchKind::INVALID, "sm4"));
+  EXPECT_TRUE(testAArch64Extension("tsv110",
+                                   AArch64::ArchKind::INVALID, "ras"));
+  EXPECT_TRUE(testAArch64Extension("tsv110",
+                                   AArch64::ArchKind::INVALID, "profile"));
+  EXPECT_TRUE(testAArch64Extension("tsv110",
+                                   AArch64::ArchKind::INVALID, "fp16"));
+  EXPECT_TRUE(testAArch64Extension("tsv110",
+                                   AArch64::ArchKind::INVALID, "fp16fml"));
+  EXPECT_TRUE(testAArch64Extension("tsv110",
+                                   AArch64::ArchKind::INVALID, "dotprod"));
 
   EXPECT_FALSE(testAArch64Extension(
       "generic", AArch64::ArchKind::ARMV8A, "ras"));
   EXPECT_FALSE(testAArch64Extension(
       "generic", AArch64::ArchKind::ARMV8_1A, "ras"));
   EXPECT_FALSE(testAArch64Extension(
-      "generic", AArch64::ArchKind::ARMV8_2A, "spe"));
+      "generic", AArch64::ArchKind::ARMV8_2A, "profile"));
   EXPECT_FALSE(testAArch64Extension(
       "generic", AArch64::ArchKind::ARMV8_2A, "fp16"));
   EXPECT_FALSE(testAArch64Extension(
@@ -940,13 +965,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
 
 TEST(TargetParserTest, AArch64ArchFeatures) {
   std::vector<StringRef> Features;
-  AArch64::ArchKind ArchKinds[] = {
-#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT)       \
-     AArch64::ArchKind::ID,
-#include "llvm/Support/AArch64TargetParser.def"
-  };
 
-  for (auto AK : ArchKinds)
+  for (auto AK : AArch64::ArchKinds)
     EXPECT_TRUE((AK == AArch64::ArchKind::INVALID)
                     ? !AArch64::getArchFeatures(AK, Features)
                     : AArch64::getArchFeatures(AK, Features));
@@ -967,7 +987,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) {
                               {"dotprod", "nodotprod", "+dotprod", "-dotprod"},
                               {"rcpc", "norcpc", "+rcpc", "-rcpc" },
                               {"rng", "norng", "+rand", "-rand"},
-                              {"memtag", "nomemtag", "+mte", "-mte"}};
+                              {"memtag", "nomemtag", "+mte", "-mte"},
+                              {"ssbs", "nossbs", "+ssbs", "-ssbs"}};
 
   for (unsigned i = 0; i < array_lengthof(ArchExt); i++) {
     EXPECT_EQ(StringRef(ArchExt[i][2]),
diff --git a/unittests/Support/VirtualFileSystemTest.cpp b/unittests/Support/VirtualFileSystemTest.cpp
index 466cd117a507fbabc52598622b38c7567e8b6c8b..7b42943f0fd969817ac8817eb572203c00528fc6 100644
--- a/unittests/Support/VirtualFileSystemTest.cpp
+++ b/unittests/Support/VirtualFileSystemTest.cpp
@@ -743,6 +743,43 @@ TEST(VirtualFileSystemTest, HiddenInIteration) {
   }
 }
 
+TEST(ProxyFileSystemTest, Basic) {
+  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> Base(
+      new vfs::InMemoryFileSystem());
+  vfs::ProxyFileSystem PFS(Base);
+
+  Base->addFile("/a", 0, MemoryBuffer::getMemBuffer("test"));
+
+  auto Stat = PFS.status("/a");
+  ASSERT_FALSE(Stat.getError());
+
+  auto File = PFS.openFileForRead("/a");
+  ASSERT_FALSE(File.getError());
+  EXPECT_EQ("test", (*(*File)->getBuffer("ignored"))->getBuffer());
+
+  std::error_code EC;
+  vfs::directory_iterator I = PFS.dir_begin("/", EC);
+  ASSERT_FALSE(EC);
+  ASSERT_EQ("/a", I->path());
+  I.increment(EC);
+  ASSERT_FALSE(EC);
+  ASSERT_EQ(vfs::directory_iterator(), I);
+
+  ASSERT_FALSE(PFS.setCurrentWorkingDirectory("/"));
+
+  auto PWD = PFS.getCurrentWorkingDirectory();
+  ASSERT_FALSE(PWD.getError());
+  ASSERT_EQ("/", *PWD);
+
+  SmallString<16> Path;
+  ASSERT_FALSE(PFS.getRealPath("a", Path));
+  ASSERT_EQ("/a", Path);
+
+  bool Local = true;
+  ASSERT_FALSE(PFS.isLocal("/a", Local));
+  ASSERT_EQ(false, Local);
+}
+
 class InMemoryFileSystemTest : public ::testing::Test {
 protected:
   llvm::vfs::InMemoryFileSystem FS;
@@ -1775,3 +1812,49 @@ TEST_F(VFSFromYAMLTest, DirectoryIterationErrorInVFSLayer) {
   checkContents(FS->dir_begin("//root/foo", EC),
                 {"//root/foo/a", "//root/foo/b"});
 }
+
+TEST_F(VFSFromYAMLTest, GetRealPath) {
+  IntrusiveRefCntPtr<DummyFileSystem> Lower(new DummyFileSystem());
+  Lower->addDirectory("//dir/");
+  Lower->addRegularFile("/foo");
+  Lower->addSymlink("/link");
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = getFromYAMLString(
+      "{ 'use-external-names': false,\n"
+      "  'roots': [\n"
+      "{\n"
+      "  'type': 'directory',\n"
+      "  'name': '//root/',\n"
+      "  'contents': [ {\n"
+      "                  'type': 'file',\n"
+      "                  'name': 'bar',\n"
+      "                  'external-contents': '/link'\n"
+      "                }\n"
+      "              ]\n"
+      "},\n"
+      "{\n"
+      "  'type': 'directory',\n"
+      "  'name': '//dir/',\n"
+      "  'contents': []\n"
+      "}\n"
+      "]\n"
+      "}",
+      Lower);
+  ASSERT_TRUE(FS.get() != nullptr);
+
+  // Regular file present in underlying file system.
+  SmallString<16> RealPath;
+  EXPECT_FALSE(FS->getRealPath("/foo", RealPath));
+  EXPECT_EQ(RealPath.str(), "/foo");
+
+  // File present in YAML pointing to symlink in underlying file system.
+  EXPECT_FALSE(FS->getRealPath("//root/bar", RealPath));
+  EXPECT_EQ(RealPath.str(), "/symlink");
+
+  // Directories should fall back to the underlying file system is possible.
+  EXPECT_FALSE(FS->getRealPath("//dir/", RealPath));
+  EXPECT_EQ(RealPath.str(), "//dir/");
+
+  // Try a non-existing file.
+  EXPECT_EQ(FS->getRealPath("/non_existing", RealPath),
+            errc::no_such_file_or_directory);
+}
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index 94e9874147f1cb90ec3b8bc6e069b792a2138f96..17f38f9b86cf7e43b5f10e2df001bbd4c8bc933a 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Casting.h"
@@ -2642,3 +2643,235 @@ TEST(YAMLIO, Numeric) {
   EXPECT_FALSE(isNumeric("-inf"));
   EXPECT_FALSE(isNumeric("1,230.15"));
 }
+
+//===----------------------------------------------------------------------===//
+//  Test PolymorphicTraits and TaggedScalarTraits
+//===----------------------------------------------------------------------===//
+
+struct Poly {
+  enum NodeKind {
+    NK_Scalar,
+    NK_Seq,
+    NK_Map,
+  } Kind;
+
+  Poly(NodeKind Kind) : Kind(Kind) {}
+
+  virtual ~Poly() = default;
+
+  NodeKind getKind() const { return Kind; }
+};
+
+struct Scalar : Poly {
+  enum ScalarKind {
+    SK_Unknown,
+    SK_Double,
+    SK_Bool,
+  } SKind;
+
+  union {
+    double DoubleValue;
+    bool BoolValue;
+  };
+
+  Scalar() : Poly(NK_Scalar), SKind(SK_Unknown) {}
+  Scalar(double DoubleValue)
+      : Poly(NK_Scalar), SKind(SK_Double), DoubleValue(DoubleValue) {}
+  Scalar(bool BoolValue)
+      : Poly(NK_Scalar), SKind(SK_Bool), BoolValue(BoolValue) {}
+
+  static bool classof(const Poly *N) { return N->getKind() == NK_Scalar; }
+};
+
+struct Seq : Poly, std::vector<std::unique_ptr<Poly>> {
+  Seq() : Poly(NK_Seq) {}
+
+  static bool classof(const Poly *N) { return N->getKind() == NK_Seq; }
+};
+
+struct Map : Poly, llvm::StringMap<std::unique_ptr<Poly>> {
+  Map() : Poly(NK_Map) {}
+
+  static bool classof(const Poly *N) { return N->getKind() == NK_Map; }
+};
+
+namespace llvm {
+namespace yaml {
+
+template <> struct PolymorphicTraits<std::unique_ptr<Poly>> {
+  static NodeKind getKind(const std::unique_ptr<Poly> &N) {
+    if (isa<Scalar>(*N))
+      return NodeKind::Scalar;
+    if (isa<Seq>(*N))
+      return NodeKind::Sequence;
+    if (isa<Map>(*N))
+      return NodeKind::Map;
+    llvm_unreachable("unsupported node type");
+  }
+
+  static Scalar &getAsScalar(std::unique_ptr<Poly> &N) {
+    if (!N || !isa<Scalar>(*N))
+      N = llvm::make_unique<Scalar>();
+    return *cast<Scalar>(N.get());
+  }
+
+  static Seq &getAsSequence(std::unique_ptr<Poly> &N) {
+    if (!N || !isa<Seq>(*N))
+      N = llvm::make_unique<Seq>();
+    return *cast<Seq>(N.get());
+  }
+
+  static Map &getAsMap(std::unique_ptr<Poly> &N) {
+    if (!N || !isa<Map>(*N))
+      N = llvm::make_unique<Map>();
+    return *cast<Map>(N.get());
+  }
+};
+
+template <> struct TaggedScalarTraits<Scalar> {
+  static void output(const Scalar &S, void *Ctxt, raw_ostream &ScalarOS,
+                     raw_ostream &TagOS) {
+    switch (S.SKind) {
+    case Scalar::SK_Unknown:
+      report_fatal_error("output unknown scalar");
+      break;
+    case Scalar::SK_Double:
+      TagOS << "!double";
+      ScalarTraits<double>::output(S.DoubleValue, Ctxt, ScalarOS);
+      break;
+    case Scalar::SK_Bool:
+      TagOS << "!bool";
+      ScalarTraits<bool>::output(S.BoolValue, Ctxt, ScalarOS);
+      break;
+    }
+  }
+
+  static StringRef input(StringRef ScalarStr, StringRef Tag, void *Ctxt,
+                         Scalar &S) {
+    S.SKind = StringSwitch<Scalar::ScalarKind>(Tag)
+                  .Case("!double", Scalar::SK_Double)
+                  .Case("!bool", Scalar::SK_Bool)
+                  .Default(Scalar::SK_Unknown);
+    switch (S.SKind) {
+    case Scalar::SK_Unknown:
+      return StringRef("unknown scalar tag");
+    case Scalar::SK_Double:
+      return ScalarTraits<double>::input(ScalarStr, Ctxt, S.DoubleValue);
+    case Scalar::SK_Bool:
+      return ScalarTraits<bool>::input(ScalarStr, Ctxt, S.BoolValue);
+    }
+    llvm_unreachable("unknown scalar kind");
+  }
+
+  static QuotingType mustQuote(const Scalar &S, StringRef Str) {
+    switch (S.SKind) {
+    case Scalar::SK_Unknown:
+      report_fatal_error("quote unknown scalar");
+    case Scalar::SK_Double:
+      return ScalarTraits<double>::mustQuote(Str);
+    case Scalar::SK_Bool:
+      return ScalarTraits<bool>::mustQuote(Str);
+    }
+    llvm_unreachable("unknown scalar kind");
+  }
+};
+
+template <> struct CustomMappingTraits<Map> {
+  static void inputOne(IO &IO, StringRef Key, Map &M) {
+    IO.mapRequired(Key.str().c_str(), M[Key]);
+  }
+
+  static void output(IO &IO, Map &M) {
+    for (auto &N : M)
+      IO.mapRequired(N.getKey().str().c_str(), N.getValue());
+  }
+};
+
+template <> struct SequenceTraits<Seq> {
+  static size_t size(IO &IO, Seq &A) { return A.size(); }
+
+  static std::unique_ptr<Poly> &element(IO &IO, Seq &A, size_t Index) {
+    if (Index >= A.size())
+      A.resize(Index + 1);
+    return A[Index];
+  }
+};
+
+} // namespace yaml
+} // namespace llvm
+
+TEST(YAMLIO, TestReadWritePolymorphicScalar) {
+  std::string intermediate;
+  std::unique_ptr<Poly> node = llvm::make_unique<Scalar>(true);
+
+  llvm::raw_string_ostream ostr(intermediate);
+  Output yout(ostr);
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+  EXPECT_DEATH(yout << node, "plain scalar documents are not supported");
+#endif
+#endif
+}
+
+TEST(YAMLIO, TestReadWritePolymorphicSeq) {
+  std::string intermediate;
+  {
+    auto seq = llvm::make_unique<Seq>();
+    seq->push_back(llvm::make_unique<Scalar>(true));
+    seq->push_back(llvm::make_unique<Scalar>(1.0));
+    auto node = llvm::unique_dyn_cast<Poly>(seq);
+
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << node;
+  }
+  {
+    Input yin(intermediate);
+    std::unique_ptr<Poly> node;
+    yin >> node;
+
+    EXPECT_FALSE(yin.error());
+    auto seq = llvm::dyn_cast<Seq>(node.get());
+    ASSERT_TRUE(seq);
+    ASSERT_EQ(seq->size(), 2u);
+    auto first = llvm::dyn_cast<Scalar>((*seq)[0].get());
+    ASSERT_TRUE(first);
+    EXPECT_EQ(first->SKind, Scalar::SK_Bool);
+    EXPECT_TRUE(first->BoolValue);
+    auto second = llvm::dyn_cast<Scalar>((*seq)[1].get());
+    ASSERT_TRUE(second);
+    EXPECT_EQ(second->SKind, Scalar::SK_Double);
+    EXPECT_EQ(second->DoubleValue, 1.0);
+  }
+}
+
+TEST(YAMLIO, TestReadWritePolymorphicMap) {
+  std::string intermediate;
+  {
+    auto map = llvm::make_unique<Map>();
+    (*map)["foo"] = llvm::make_unique<Scalar>(false);
+    (*map)["bar"] = llvm::make_unique<Scalar>(2.0);
+    std::unique_ptr<Poly> node = llvm::unique_dyn_cast<Poly>(map);
+
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << node;
+  }
+  {
+    Input yin(intermediate);
+    std::unique_ptr<Poly> node;
+    yin >> node;
+
+    EXPECT_FALSE(yin.error());
+    auto map = llvm::dyn_cast<Map>(node.get());
+    ASSERT_TRUE(map);
+    auto foo = llvm::dyn_cast<Scalar>((*map)["foo"].get());
+    ASSERT_TRUE(foo);
+    EXPECT_EQ(foo->SKind, Scalar::SK_Bool);
+    EXPECT_FALSE(foo->BoolValue);
+    auto bar = llvm::dyn_cast<Scalar>((*map)["bar"].get());
+    ASSERT_TRUE(bar);
+    EXPECT_EQ(bar->SKind, Scalar::SK_Double);
+    EXPECT_EQ(bar->DoubleValue, 2.0);
+  }
+}
diff --git a/unittests/TextAPI/CMakeLists.txt b/unittests/TextAPI/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f64c8a59005747221b5947216945e6e17096100
--- /dev/null
+++ b/unittests/TextAPI/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(LLVM_LINK_COMPONENTS
+  TextAPI
+)
+
+add_llvm_unittest(TapiTests
+  ELFYAMLTest.cpp
+)
+
+target_link_libraries(TapiTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/TextAPI/ELFYAMLTest.cpp b/unittests/TextAPI/ELFYAMLTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..802a2d7127b25bdb6c24b8ee10313f4eda02fb0d
--- /dev/null
+++ b/unittests/TextAPI/ELFYAMLTest.cpp
@@ -0,0 +1,211 @@
+//===- llvm/unittests/TextAPI/YAMLTest.cpp --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===-----------------------------------------------------------------------===/
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/TextAPI/ELF/ELFStub.h"
+#include "llvm/TextAPI/ELF/TBEHandler.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+#include <string>
+
+using namespace llvm;
+using namespace llvm::ELF;
+using namespace llvm::elfabi;
+
+void compareByLine(StringRef LHS, StringRef RHS) {
+  StringRef Line1;
+  StringRef Line2;
+  while (LHS.size() > 0 && RHS.size() > 0) {
+    std::tie(Line1, LHS) = LHS.split('\n');
+    std::tie(Line2, RHS) = RHS.split('\n');
+    // Comparing StringRef objects works, but has messy output when not equal.
+    // Using STREQ on StringRef.data() doesn't work since these substrings are
+    // not null terminated.
+    // This is inefficient, but forces null terminated strings that can be
+    // cleanly compared.
+    EXPECT_STREQ(Line1.str().data(), Line2.str().data());
+  }
+}
+
+TEST(ElfYamlTextAPI, YAMLReadableTBE) {
+  const char Data[] = "--- !tapi-tbe\n"
+                      "TbeVersion: 1.0\n"
+                      "Arch: x86_64\n"
+                      "NeededLibs: [libc.so, libfoo.so, libbar.so]\n"
+                      "Symbols:\n"
+                      "  foo: { Type: Func, Undefined: true }\n"
+                      "...\n";
+  Expected<std::unique_ptr<ELFStub>> StubOrErr = readTBEFromBuffer(Data);
+  ASSERT_THAT_ERROR(StubOrErr.takeError(), Succeeded());
+  std::unique_ptr<ELFStub> Stub = std::move(StubOrErr.get());
+  EXPECT_NE(Stub.get(), nullptr);
+  EXPECT_FALSE(Stub->SoName.hasValue());
+  EXPECT_EQ(Stub->Arch, (uint16_t)llvm::ELF::EM_X86_64);
+  EXPECT_EQ(Stub->NeededLibs.size(), 3u);
+  EXPECT_STREQ(Stub->NeededLibs[0].c_str(), "libc.so");
+  EXPECT_STREQ(Stub->NeededLibs[1].c_str(), "libfoo.so");
+  EXPECT_STREQ(Stub->NeededLibs[2].c_str(), "libbar.so");
+}
+
+TEST(ElfYamlTextAPI, YAMLReadsTBESymbols) {
+  const char Data[] = "--- !tapi-tbe\n"
+                      "TbeVersion: 1.0\n"
+                      "SoName: test.so\n"
+                      "Arch: x86_64\n"
+                      "Symbols:\n"
+                      "  bar: { Type: Object, Size: 42 }\n"
+                      "  baz: { Type: TLS, Size: 3 }\n"
+                      "  foo: { Type: Func, Warning: \"Deprecated!\" }\n"
+                      "  nor: { Type: NoType, Undefined: true }\n"
+                      "  not: { Type: File, Undefined: true, Size: 111, "
+                      "Warning: \'All fields populated!\' }\n"
+                      "...\n";
+  Expected<std::unique_ptr<ELFStub>> StubOrErr = readTBEFromBuffer(Data);
+  ASSERT_THAT_ERROR(StubOrErr.takeError(), Succeeded());
+  std::unique_ptr<ELFStub> Stub = std::move(StubOrErr.get());
+  EXPECT_NE(Stub.get(), nullptr);
+  EXPECT_TRUE(Stub->SoName.hasValue());
+  EXPECT_STREQ(Stub->SoName->c_str(), "test.so");
+  EXPECT_EQ(Stub->Symbols.size(), 5u);
+
+  auto Iterator = Stub->Symbols.begin();
+  ELFSymbol const &SymBar = *Iterator++;
+  EXPECT_STREQ(SymBar.Name.c_str(), "bar");
+  EXPECT_EQ(SymBar.Size, 42u);
+  EXPECT_EQ(SymBar.Type, ELFSymbolType::Object);
+  EXPECT_FALSE(SymBar.Undefined);
+  EXPECT_FALSE(SymBar.Warning.hasValue());
+
+  ELFSymbol const &SymBaz = *Iterator++;
+  EXPECT_STREQ(SymBaz.Name.c_str(), "baz");
+  EXPECT_EQ(SymBaz.Size, 3u);
+  EXPECT_EQ(SymBaz.Type, ELFSymbolType::TLS);
+  EXPECT_FALSE(SymBaz.Undefined);
+  EXPECT_FALSE(SymBaz.Warning.hasValue());
+
+  ELFSymbol const &SymFoo = *Iterator++;
+  EXPECT_STREQ(SymFoo.Name.c_str(), "foo");
+  EXPECT_EQ(SymFoo.Size, 0u);
+  EXPECT_EQ(SymFoo.Type, ELFSymbolType::Func);
+  EXPECT_FALSE(SymFoo.Undefined);
+  EXPECT_TRUE(SymFoo.Warning.hasValue());
+  EXPECT_STREQ(SymFoo.Warning->c_str(), "Deprecated!");
+
+  ELFSymbol const &SymNor = *Iterator++;
+  EXPECT_STREQ(SymNor.Name.c_str(), "nor");
+  EXPECT_EQ(SymNor.Size, 0u);
+  EXPECT_EQ(SymNor.Type, ELFSymbolType::NoType);
+  EXPECT_TRUE(SymNor.Undefined);
+  EXPECT_FALSE(SymNor.Warning.hasValue());
+
+  ELFSymbol const &SymNot = *Iterator++;
+  EXPECT_STREQ(SymNot.Name.c_str(), "not");
+  EXPECT_EQ(SymNot.Size, 111u);
+  EXPECT_EQ(SymNot.Type, ELFSymbolType::Unknown);
+  EXPECT_TRUE(SymNot.Undefined);
+  EXPECT_TRUE(SymNot.Warning.hasValue());
+  EXPECT_STREQ(SymNot.Warning->c_str(), "All fields populated!");
+}
+
+TEST(ElfYamlTextAPI, YAMLReadsNoTBESyms) {
+  const char Data[] = "--- !tapi-tbe\n"
+                      "TbeVersion: 1.0\n"
+                      "SoName: test.so\n"
+                      "Arch: x86_64\n"
+                      "Symbols: {}\n"
+                      "...\n";
+  Expected<std::unique_ptr<ELFStub>> StubOrErr = readTBEFromBuffer(Data);
+  ASSERT_THAT_ERROR(StubOrErr.takeError(), Succeeded());
+  std::unique_ptr<ELFStub> Stub = std::move(StubOrErr.get());
+  EXPECT_NE(Stub.get(), nullptr);
+  EXPECT_EQ(0u, Stub->Symbols.size());
+}
+
+TEST(ElfYamlTextAPI, YAMLUnreadableTBE) {
+  // Can't read: wrong format/version.
+  const char Data[] = "--- !tapi-tbz\n"
+                      "TbeVersion: z.3\n"
+                      "SoName: test.so\n"
+                      "Arch: x86_64\n"
+                      "Symbols:\n"
+                      "  foo: { Type: Func, Undefined: true }\n";
+  Expected<std::unique_ptr<ELFStub>> StubOrErr = readTBEFromBuffer(Data);
+  ASSERT_THAT_ERROR(StubOrErr.takeError(), Failed());
+}
+
+TEST(ElfYamlTextAPI, YAMLWritesTBESymbols) {
+  const char Expected[] =
+      "--- !tapi-tbe\n"
+      "TbeVersion:      1.0\n"
+      "Arch:            AArch64\n"
+      "Symbols:         \n"
+      "  foo:             { Type: NoType, Size: 99, Warning: Does nothing }\n"
+      "  nor:             { Type: Func, Undefined: true }\n"
+      "  not:             { Type: Unknown, Size: 12345678901234 }\n"
+      "...\n";
+  ELFStub Stub;
+  Stub.TbeVersion = VersionTuple(1, 0);
+  Stub.Arch = ELF::EM_AARCH64;
+
+  ELFSymbol SymFoo("foo");
+  SymFoo.Size = 99u;
+  SymFoo.Type = ELFSymbolType::NoType;
+  SymFoo.Undefined = false;
+  SymFoo.Warning = "Does nothing";
+
+  ELFSymbol SymNor("nor");
+  SymNor.Type = ELFSymbolType::Func;
+  SymNor.Undefined = true;
+
+  ELFSymbol SymNot("not");
+  SymNot.Size = 12345678901234u;
+  SymNot.Type = ELFSymbolType::Unknown;
+  SymNot.Undefined = false;
+
+  // Deliberately not in order to check that result is sorted.
+  Stub.Symbols.insert(SymNot);
+  Stub.Symbols.insert(SymFoo);
+  Stub.Symbols.insert(SymNor);
+
+  // Ensure move constructor works as expected.
+  ELFStub Moved = std::move(Stub);
+
+  std::string Result;
+  raw_string_ostream OS(Result);
+  ASSERT_THAT_ERROR(writeTBEToOutputStream(OS, Moved), Succeeded());
+  Result = OS.str();
+  compareByLine(Result.c_str(), Expected);
+}
+
+TEST(ElfYamlTextAPI, YAMLWritesNoTBESyms) {
+  const char Expected[] = "--- !tapi-tbe\n"
+                          "TbeVersion:      1.0\n"
+                          "SoName:          nosyms.so\n"
+                          "Arch:            x86_64\n"
+                          "NeededLibs:      \n"
+                          "  - libc.so\n"
+                          "  - libfoo.so\n"
+                          "  - libbar.so\n"
+                          "Symbols:         {}\n"
+                          "...\n";
+  ELFStub Stub;
+  Stub.TbeVersion = VersionTuple(1, 0);
+  Stub.SoName = "nosyms.so";
+  Stub.Arch = ELF::EM_X86_64;
+  Stub.NeededLibs.push_back("libc.so");
+  Stub.NeededLibs.push_back("libfoo.so");
+  Stub.NeededLibs.push_back("libbar.so");
+
+  std::string Result;
+  raw_string_ostream OS(Result);
+  ASSERT_THAT_ERROR(writeTBEToOutputStream(OS, Stub), Succeeded());
+  Result = OS.str();
+  compareByLine(Result.c_str(), Expected);
+}
diff --git a/unittests/Transforms/Utils/CloningTest.cpp b/unittests/Transforms/Utils/CloningTest.cpp
index 9a1ad19ebaa4ad4f3255d18f775e53d62f233b7d..9d6bf0a5a966b5ad4afa4d63cd8ab610f40c6b12 100644
--- a/unittests/Transforms/Utils/CloningTest.cpp
+++ b/unittests/Transforms/Utils/CloningTest.cpp
@@ -14,6 +14,7 @@
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DomTreeUpdater.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
@@ -224,9 +225,11 @@ TEST_F(CloneInstruction, DuplicateInstructionsToSplit) {
   Instruction *SubInst = cast<Instruction>(Builder2.CreateSub(MulInst, V));
   Builder2.CreateRetVoid();
 
+  // Dummy DTU.
   ValueToValueMapTy Mapping;
-
-  auto Split = DuplicateInstructionsInSplitBetween(BB2, BB1, SubInst, Mapping);
+  DomTreeUpdater DTU(DomTreeUpdater::UpdateStrategy::Lazy);
+  auto Split =
+      DuplicateInstructionsInSplitBetween(BB2, BB1, SubInst, Mapping, DTU);
 
   EXPECT_TRUE(Split);
   EXPECT_EQ(Mapping.size(), 2u);
@@ -271,9 +274,11 @@ TEST_F(CloneInstruction, DuplicateInstructionsToSplitBlocksEq1) {
   Instruction *SubInst = cast<Instruction>(Builder2.CreateSub(MulInst, V));
   Builder2.CreateBr(BB2);
 
+  // Dummy DTU.
+  DomTreeUpdater DTU(DomTreeUpdater::UpdateStrategy::Lazy);
   ValueToValueMapTy Mapping;
-
-  auto Split = DuplicateInstructionsInSplitBetween(BB2, BB2, BB2->getTerminator(), Mapping);
+  auto Split = DuplicateInstructionsInSplitBetween(
+      BB2, BB2, BB2->getTerminator(), Mapping, DTU);
 
   EXPECT_TRUE(Split);
   EXPECT_EQ(Mapping.size(), 3u);
@@ -322,9 +327,11 @@ TEST_F(CloneInstruction, DuplicateInstructionsToSplitBlocksEq2) {
   Instruction *SubInst = cast<Instruction>(Builder2.CreateSub(MulInst, V));
   Builder2.CreateBr(BB2);
 
+  // Dummy DTU.
+  DomTreeUpdater DTU(DomTreeUpdater::UpdateStrategy::Lazy);
   ValueToValueMapTy Mapping;
-
-  auto Split = DuplicateInstructionsInSplitBetween(BB2, BB2, SubInst, Mapping);
+  auto Split =
+      DuplicateInstructionsInSplitBetween(BB2, BB2, SubInst, Mapping, DTU);
 
   EXPECT_TRUE(Split);
   EXPECT_EQ(Mapping.size(), 2u);
@@ -384,9 +391,9 @@ protected:
                                                               "/file/dir"),
                                           "CloneFunc", false, "", 0);
 
-    auto *Subprogram =
-        DBuilder.createFunction(CU, "f", "f", File, 4, FuncType, true, true, 3,
-                                DINode::FlagZero, false);
+    auto *Subprogram = DBuilder.createFunction(
+        CU, "f", "f", File, 4, FuncType, 3, DINode::FlagZero,
+        DISubprogram::SPFlagLocalToUnit | DISubprogram::SPFlagDefinition);
     OldFunc->setSubprogram(Subprogram);
 
     // Function body
@@ -414,9 +421,9 @@ protected:
     auto *StructType = DICompositeType::getDistinct(
         C, dwarf::DW_TAG_structure_type, "some_struct", nullptr, 0, nullptr,
         nullptr, 32, 32, 0, DINode::FlagZero, nullptr, 0, nullptr, nullptr);
-    auto *InlinedSP =
-        DBuilder.createFunction(CU, "inlined", "inlined", File, 8, FuncType,
-                                true, true, 9, DINode::FlagZero, false);
+    auto *InlinedSP = DBuilder.createFunction(
+        CU, "inlined", "inlined", File, 8, FuncType, 9, DINode::FlagZero,
+        DISubprogram::SPFlagLocalToUnit | DISubprogram::SPFlagDefinition);
     auto *InlinedVar =
         DBuilder.createAutoVariable(InlinedSP, "inlined", File, 5, StructType, true);
     auto *Scope = DBuilder.createLexicalBlock(
@@ -599,9 +606,9 @@ protected:
                                                               "/file/dir"),
                                           "CloneModule", false, "", 0);
     // Function DI
-    auto *Subprogram =
-        DBuilder.createFunction(CU, "f", "f", File, 4, DFuncType, true, true, 3,
-                                DINode::FlagZero, false);
+    auto *Subprogram = DBuilder.createFunction(
+        CU, "f", "f", File, 4, DFuncType, 3, DINode::FlagZero,
+        DISubprogram::SPFlagLocalToUnit | DISubprogram::SPFlagDefinition);
     F->setSubprogram(Subprogram);
 
     // Create and assign DIGlobalVariableExpression to gv
diff --git a/unittests/Transforms/Utils/CodeExtractorTest.cpp b/unittests/Transforms/Utils/CodeExtractorTest.cpp
index c53b3152a7df142f70af0d23046eaeb8a88cde42..7e3512f56f32ac0a66abcff754b9d995a7ad66b4 100644
--- a/unittests/Transforms/Utils/CodeExtractorTest.cpp
+++ b/unittests/Transforms/Utils/CodeExtractorTest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
@@ -21,7 +22,14 @@
 using namespace llvm;
 
 namespace {
-TEST(CodeExtractor, DISABLED_ExitStub) {
+BasicBlock *getBlockByName(Function *F, StringRef name) {
+  for (auto &BB : *F)
+    if (BB.getName() == name)
+      return &BB;
+  return nullptr;
+}
+
+TEST(CodeExtractor, ExitStub) {
   LLVMContext Ctx;
   SMDiagnostic Err;
   std::unique_ptr<Module> M(parseAssemblyString(R"invalid(
@@ -46,36 +54,10 @@ TEST(CodeExtractor, DISABLED_ExitStub) {
   )invalid",
                                                 Err, Ctx));
 
-  // CodeExtractor miscompiles this function. There appear to be some issues
-  // with the handling of outlined regions with live output values.
-  //
-  // In the original function, CE adds two reloads in the codeReplacer block:
-  //
-  //   codeRepl:                                         ; preds = %header
-  //     call void @foo_header.split(i32 %z, i32 %x, i32 %y, i32* %.loc, i32* %.loc1)
-  //     %.reload = load i32, i32* %.loc
-  //     %.reload2 = load i32, i32* %.loc1
-  //     br label %notExtracted
-  //
-  // These reloads must flow into the notExtracted block:
-  //
-  //   notExtracted:                                     ; preds = %codeRepl
-  //     %0 = phi i32 [ %.reload, %codeRepl ], [ %.reload2, %body2 ]
-  //
-  // The problem is that the PHI node in notExtracted now has an incoming
-  // value from a BasicBlock that's in a different function.
-
   Function *Func = M->getFunction("foo");
-  SmallVector<BasicBlock *, 3> Candidates;
-  for (auto &BB : *Func) {
-    if (BB.getName() == "body1")
-      Candidates.push_back(&BB);
-    if (BB.getName() == "body2")
-      Candidates.push_back(&BB);
-  }
-  // CodeExtractor requires the first basic block
-  // to dominate all the other ones.
-  Candidates.insert(Candidates.begin(), &Func->getEntryBlock());
+  SmallVector<BasicBlock *, 3> Candidates{ getBlockByName(Func, "header"),
+                                           getBlockByName(Func, "body1"),
+                                           getBlockByName(Func, "body2") };
 
   DominatorTree DT(*Func);
   CodeExtractor CE(Candidates, &DT);
@@ -83,6 +65,134 @@ TEST(CodeExtractor, DISABLED_ExitStub) {
 
   Function *Outlined = CE.extractCodeRegion();
   EXPECT_TRUE(Outlined);
+  BasicBlock *Exit = getBlockByName(Func, "notExtracted");
+  BasicBlock *ExitSplit = getBlockByName(Outlined, "notExtracted.split");
+  // Ensure that PHI in exit block has only one incoming value (from code
+  // replacer block).
+  EXPECT_TRUE(Exit && cast<PHINode>(Exit->front()).getNumIncomingValues() == 1);
+  // Ensure that there is a PHI in outlined function with 2 incoming values.
+  EXPECT_TRUE(ExitSplit &&
+              cast<PHINode>(ExitSplit->front()).getNumIncomingValues() == 2);
+  EXPECT_FALSE(verifyFunction(*Outlined));
+  EXPECT_FALSE(verifyFunction(*Func));
+}
+
+TEST(CodeExtractor, ExitPHIOnePredFromRegion) {
+  LLVMContext Ctx;
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M(parseAssemblyString(R"invalid(
+    define i32 @foo() {
+    header:
+      br i1 undef, label %extracted1, label %pred
+
+    pred:
+      br i1 undef, label %exit1, label %exit2
+
+    extracted1:
+      br i1 undef, label %extracted2, label %exit1
+
+    extracted2:
+      br label %exit2
+
+    exit1:
+      %0 = phi i32 [ 1, %extracted1 ], [ 2, %pred ]
+      ret i32 %0
+
+    exit2:
+      %1 = phi i32 [ 3, %extracted2 ], [ 4, %pred ]
+      ret i32 %1
+    }
+  )invalid", Err, Ctx));
+
+  Function *Func = M->getFunction("foo");
+  SmallVector<BasicBlock *, 2> ExtractedBlocks{
+    getBlockByName(Func, "extracted1"),
+    getBlockByName(Func, "extracted2")
+  };
+
+  DominatorTree DT(*Func);
+  CodeExtractor CE(ExtractedBlocks, &DT);
+  EXPECT_TRUE(CE.isEligible());
+
+  Function *Outlined = CE.extractCodeRegion();
+  EXPECT_TRUE(Outlined);
+  BasicBlock *Exit1 = getBlockByName(Func, "exit1");
+  BasicBlock *Exit2 = getBlockByName(Func, "exit2");
+  // Ensure that PHIs in exits are not splitted (since that they have only one
+  // incoming value from extracted region).
+  EXPECT_TRUE(Exit1 &&
+          cast<PHINode>(Exit1->front()).getNumIncomingValues() == 2);
+  EXPECT_TRUE(Exit2 &&
+          cast<PHINode>(Exit2->front()).getNumIncomingValues() == 2);
   EXPECT_FALSE(verifyFunction(*Outlined));
+  EXPECT_FALSE(verifyFunction(*Func));
 }
+
+TEST(CodeExtractor, StoreOutputInvokeResultAfterEHPad) {
+  LLVMContext Ctx;
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M(parseAssemblyString(R"invalid(
+    declare i8 @hoge()
+
+    define i32 @foo() personality i8* null {
+      entry:
+        %call = invoke i8 @hoge()
+                to label %invoke.cont unwind label %lpad
+
+      invoke.cont:                                      ; preds = %entry
+        unreachable
+
+      lpad:                                             ; preds = %entry
+        %0 = landingpad { i8*, i32 }
+                catch i8* null
+        br i1 undef, label %catch, label %finally.catchall
+
+      catch:                                            ; preds = %lpad
+        %call2 = invoke i8 @hoge()
+                to label %invoke.cont2 unwind label %lpad2
+
+      invoke.cont2:                                    ; preds = %catch
+        %call3 = invoke i8 @hoge()
+                to label %invoke.cont3 unwind label %lpad2
+
+      invoke.cont3:                                    ; preds = %invoke.cont2
+        unreachable
+
+      lpad2:                                           ; preds = %invoke.cont2, %catch
+        %ex.1 = phi i8* [ undef, %invoke.cont2 ], [ null, %catch ]
+        %1 = landingpad { i8*, i32 }
+                catch i8* null
+        br label %finally.catchall
+
+      finally.catchall:                                 ; preds = %lpad33, %lpad
+        %ex.2 = phi i8* [ %ex.1, %lpad2 ], [ null, %lpad ]
+        unreachable
+    }
+  )invalid", Err, Ctx));
+
+	if (!M) {
+    Err.print("unit", errs());
+    exit(1);
+  }
+
+  Function *Func = M->getFunction("foo");
+  EXPECT_FALSE(verifyFunction(*Func, &errs()));
+
+  SmallVector<BasicBlock *, 2> ExtractedBlocks{
+    getBlockByName(Func, "catch"),
+    getBlockByName(Func, "invoke.cont2"),
+    getBlockByName(Func, "invoke.cont3"),
+    getBlockByName(Func, "lpad2")
+  };
+
+  DominatorTree DT(*Func);
+  CodeExtractor CE(ExtractedBlocks, &DT);
+  EXPECT_TRUE(CE.isEligible());
+
+  Function *Outlined = CE.extractCodeRegion();
+  EXPECT_TRUE(Outlined);
+  EXPECT_FALSE(verifyFunction(*Outlined, &errs()));
+  EXPECT_FALSE(verifyFunction(*Func, &errs()));
+}
+
 } // end anonymous namespace
diff --git a/unittests/Transforms/Vectorize/CMakeLists.txt b/unittests/Transforms/Vectorize/CMakeLists.txt
index 5a9142d17b1e50f28ee9534290ea93a1f023f42b..6e886bbe25af71b539e0d5f9596cd4a868cdf659 100644
--- a/unittests/Transforms/Vectorize/CMakeLists.txt
+++ b/unittests/Transforms/Vectorize/CMakeLists.txt
@@ -10,4 +10,5 @@ add_llvm_unittest(VectorizeTests
   VPlanLoopInfoTest.cpp
   VPlanTest.cpp
   VPlanHCFGTest.cpp
+  VPlanSlpTest.cpp
   )
diff --git a/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0381925655e716730de88a97c9a705cc634ec04e
--- /dev/null
+++ b/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -0,0 +1,899 @@
+//===- llvm/unittest/Transforms/Vectorize/VPlanSlpTest.cpp ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../lib/Transforms/Vectorize/VPlan.h"
+#include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h"
+#include "../lib/Transforms/Vectorize/VPlanHCFGTransforms.h"
+#include "VPlanTestBase.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace {
+
+class VPlanSlpTest : public VPlanTestBase {
+protected:
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI;
+  DataLayout DL;
+
+  std::unique_ptr<AssumptionCache> AC;
+  std::unique_ptr<ScalarEvolution> SE;
+  std::unique_ptr<AAResults> AARes;
+  std::unique_ptr<BasicAAResult> BasicAA;
+  std::unique_ptr<LoopAccessInfo> LAI;
+  std::unique_ptr<PredicatedScalarEvolution> PSE;
+  std::unique_ptr<InterleavedAccessInfo> IAI;
+
+  VPlanSlpTest()
+      : TLII(), TLI(TLII),
+        DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-"
+           "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:"
+           "16:32:64-S128") {}
+
+  VPInterleavedAccessInfo getInterleavedAccessInfo(Function &F, Loop *L,
+                                                   VPlan &Plan) {
+    AC.reset(new AssumptionCache(F));
+    SE.reset(new ScalarEvolution(F, TLI, *AC, *DT, *LI));
+    BasicAA.reset(new BasicAAResult(DL, F, TLI, *AC, &*DT, &*LI));
+    AARes.reset(new AAResults(TLI));
+    AARes->addAAResult(*BasicAA);
+    PSE.reset(new PredicatedScalarEvolution(*SE, *L));
+    LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI));
+    IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI));
+    IAI->analyzeInterleaving(false);
+    return {Plan, *IAI};
+  }
+};
+
+TEST_F(VPlanSlpTest, testSlpSimple_2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpSimple_3) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr %struct.Test, %struct.Test* %A, i64 "
+      "                      %indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "                      %indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "                      %indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "                      %indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "                      %indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "                      %indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+
+  VPInstruction *GetA = cast<VPInstruction>(&*std::next(Body->begin(), 1));
+  VPInstruction *GetB = cast<VPInstruction>(&*std::next(Body->begin(), 3));
+  EXPECT_EQ(GetA, CombinedLoadA->getOperand(0));
+  EXPECT_EQ(GetB, CombinedLoadB->getOperand(0));
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_1) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 8));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  EXPECT_EQ(CombinedLoadA, CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define i32 @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret i32 %vA1\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 5));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  Slp.buildGraph(StoreRoot);
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+static void checkReorderExample(VPInstruction *Store1, VPInstruction *Store2,
+                                VPBasicBlock *Body,
+                                VPInterleavedAccessInfo &&IAI) {
+  VPlanSlp Slp(IAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+
+  EXPECT_TRUE(Slp.isCompletelySLP());
+  EXPECT_EQ(CombinedStore->getOpcode(), VPInstruction::SLPStore);
+
+  VPInstruction *CombinedAdd =
+      cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(CombinedAdd->getOpcode(), Instruction::Add);
+
+  VPInstruction *CombinedMulAB =
+      cast<VPInstruction>(CombinedAdd->getOperand(0));
+  VPInstruction *CombinedMulCD =
+      cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(CombinedMulAB->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadA =
+      cast<VPInstruction>(CombinedMulAB->getOperand(0));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+  VPInstruction *LoadvA0 = cast<VPInstruction>(&*std::next(Body->begin(), 2));
+  VPInstruction *LoadvA1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  EXPECT_EQ(LoadvA0->getOperand(0), CombinedLoadA->getOperand(0));
+  EXPECT_EQ(LoadvA1->getOperand(0), CombinedLoadA->getOperand(1));
+
+  VPInstruction *CombinedLoadB =
+      cast<VPInstruction>(CombinedMulAB->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+  VPInstruction *LoadvB0 = cast<VPInstruction>(&*std::next(Body->begin(), 4));
+  VPInstruction *LoadvB1 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+  EXPECT_EQ(LoadvB0->getOperand(0), CombinedLoadB->getOperand(0));
+  EXPECT_EQ(LoadvB1->getOperand(0), CombinedLoadB->getOperand(1));
+
+  EXPECT_EQ(CombinedMulCD->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadC =
+      cast<VPInstruction>(CombinedMulCD->getOperand(0));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadC->getOpcode());
+  VPInstruction *LoadvC0 = cast<VPInstruction>(&*std::next(Body->begin(), 7));
+  VPInstruction *LoadvC1 = cast<VPInstruction>(&*std::next(Body->begin(), 17));
+  EXPECT_EQ(LoadvC0->getOperand(0), CombinedLoadC->getOperand(0));
+  EXPECT_EQ(LoadvC1->getOperand(0), CombinedLoadC->getOperand(1));
+
+  VPInstruction *CombinedLoadD =
+      cast<VPInstruction>(CombinedMulCD->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadD->getOpcode());
+  VPInstruction *LoadvD0 = cast<VPInstruction>(&*std::next(Body->begin(), 9));
+  VPInstruction *LoadvD1 = cast<VPInstruction>(&*std::next(Body->begin(), 19));
+  EXPECT_EQ(LoadvD0->getOperand(0), CombinedLoadD->getOperand(0));
+  EXPECT_EQ(LoadvD1->getOperand(0), CombinedLoadD->getOperand(1));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_1) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C,  %struct.Test* %D,  %struct.Test* %E)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_2) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C,  %struct.Test* %D,  %struct.Test* %E)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_3) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C,  %struct.Test* %D,  %struct.Test* %E)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA1, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA0\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+
+  // FIXME Need to select better first value for lane0.
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_4) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C,  %struct.Test* %D,  %struct.Test* %E)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+// Make sure we do not combine instructions with operands in different BBs.
+TEST_F(VPlanSlpTest, testInstrsInDifferentBBs) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  br label %bb2\n"
+      "bb2:\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+  VPBasicBlock *BB2 = Body->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(BB2->begin(), 3));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(BB2->begin(), 5));
+
+  VPlanSlp Slp(VPIAI, *BB2);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_EQ(0u, Slp.getWidestBundleBits());
+}
+
+// Make sure we do not combine instructions with operands in different BBs.
+TEST_F(VPlanSlpTest, testInstrsInDifferentBBs2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  br label %bb2\n"
+      "bb2:\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+  VPBasicBlock *BB2 = Body->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(BB2->begin(), 1));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(BB2->begin(), 3));
+
+  VPlanSlp Slp(VPIAI, *BB2);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_EQ(0u, Slp.getWidestBundleBits());
+}
+
+TEST_F(VPlanSlpTest, testSlpAtomicLoad) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load atomic i32, i32* %A0 monotonic, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+TEST_F(VPlanSlpTest, testSlpAtomicStore) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C)  {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                                         ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store atomic i32 %add0, i32* %C0 monotonic, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                                 ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  Slp.buildGraph(StoreRoot);
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+} // namespace
+} // namespace llvm
diff --git a/unittests/XRay/FDRProducerConsumerTest.cpp b/unittests/XRay/FDRProducerConsumerTest.cpp
index 09ec44db26ee92d5d14450a12a6725d94f199b64..03fabdf2372da9ed05807faa6a5a75b6c8f4699a 100644
--- a/unittests/XRay/FDRProducerConsumerTest.cpp
+++ b/unittests/XRay/FDRProducerConsumerTest.cpp
@@ -30,13 +30,10 @@ namespace {
 using ::testing::Eq;
 using ::testing::IsEmpty;
 using ::testing::Not;
+using ::testing::SizeIs;
 
 template <class RecordType> std::unique_ptr<Record> MakeRecord();
 
-template <> std::unique_ptr<Record> MakeRecord<BufferExtents>() {
-  return make_unique<BufferExtents>(1);
-}
-
 template <> std::unique_ptr<Record> MakeRecord<NewBufferRecord>() {
   return make_unique<NewBufferRecord>(1);
 }
@@ -69,10 +66,18 @@ template <> std::unique_ptr<Record> MakeRecord<FunctionRecord>() {
   return make_unique<FunctionRecord>(RecordTypes::ENTER, 1, 2);
 }
 
+template <> std::unique_ptr<Record> MakeRecord<CustomEventRecordV5>() {
+  return make_unique<CustomEventRecordV5>(4, 1, "data");
+}
+
+template <> std::unique_ptr<Record> MakeRecord<TypedEventRecord>() {
+  return make_unique<TypedEventRecord>(4, 1, 2, "data");
+}
+
 template <class T> class RoundTripTest : public ::testing::Test {
 public:
   RoundTripTest() : Data(), OS(Data) {
-    H.Version = 3;
+    H.Version = 4;
     H.Type = 1;
     H.ConstantTSC = true;
     H.NonstopTSC = true;
@@ -92,9 +97,36 @@ protected:
 
 TYPED_TEST_CASE_P(RoundTripTest);
 
+template <class T> class RoundTripTestV5 : public ::testing::Test {
+public:
+  RoundTripTestV5() : Data(), OS(Data) {
+    H.Version = 5;
+    H.Type = 1;
+    H.ConstantTSC = true;
+    H.NonstopTSC = true;
+    H.CycleFrequency = 3e9;
+
+    Writer = make_unique<FDRTraceWriter>(OS, H);
+    Rec = MakeRecord<T>();
+  }
+
+protected:
+  std::string Data;
+  raw_string_ostream OS;
+  XRayFileHeader H;
+  std::unique_ptr<FDRTraceWriter> Writer;
+  std::unique_ptr<Record> Rec;
+};
+
+TYPED_TEST_CASE_P(RoundTripTestV5);
+
 // This test ensures that the writing and reading implementations are in sync --
 // that given write(read(write(R))) == R.
 TYPED_TEST_P(RoundTripTest, RoundTripsSingleValue) {
+  // Always write a buffer extents record which will cover the correct size of
+  // the record, for version 3 and up.
+  BufferExtents BE(200);
+  ASSERT_FALSE(errorToBool(BE.apply(*this->Writer)));
   auto &R = this->Rec;
   ASSERT_FALSE(errorToBool(R->apply(*this->Writer)));
   this->OS.flush();
@@ -125,17 +157,67 @@ TYPED_TEST_P(RoundTripTest, RoundTripsSingleValue) {
 
   EXPECT_EQ(Data2.substr(sizeof(XRayFileHeader)),
             this->Data.substr(sizeof(XRayFileHeader)));
-  EXPECT_THAT(Records[0]->type(), Eq(R->type()));
+  ASSERT_THAT(Records, SizeIs(2));
+  EXPECT_THAT(Records[1]->getRecordType(), Eq(R->getRecordType()));
 }
 
 REGISTER_TYPED_TEST_CASE_P(RoundTripTest, RoundTripsSingleValue);
 
+// We duplicate the above case for the V5 version using different types and
+// encodings.
+TYPED_TEST_P(RoundTripTestV5, RoundTripsSingleValue) {
+  BufferExtents BE(200);
+  ASSERT_FALSE(errorToBool(BE.apply(*this->Writer)));
+  auto &R = this->Rec;
+  ASSERT_FALSE(errorToBool(R->apply(*this->Writer)));
+  this->OS.flush();
+
+  DataExtractor DE(this->Data, sys::IsLittleEndianHost, 8);
+  uint32_t OffsetPtr = 0;
+  auto HeaderOrErr = readBinaryFormatHeader(DE, OffsetPtr);
+  if (!HeaderOrErr)
+    FAIL() << HeaderOrErr.takeError();
+
+  FileBasedRecordProducer P(HeaderOrErr.get(), DE, OffsetPtr);
+  std::vector<std::unique_ptr<Record>> Records;
+  LogBuilderConsumer C(Records);
+  while (DE.isValidOffsetForDataOfSize(OffsetPtr, 1)) {
+    auto R = P.produce();
+    if (!R)
+      FAIL() << R.takeError();
+    if (auto E = C.consume(std::move(R.get())))
+      FAIL() << E;
+  }
+  ASSERT_THAT(Records, Not(IsEmpty()));
+  std::string Data2;
+  raw_string_ostream OS2(Data2);
+  FDRTraceWriter Writer2(OS2, this->H);
+  for (auto &P : Records)
+    ASSERT_FALSE(errorToBool(P->apply(Writer2)));
+  OS2.flush();
+
+  EXPECT_EQ(Data2.substr(sizeof(XRayFileHeader)),
+            this->Data.substr(sizeof(XRayFileHeader)));
+  ASSERT_THAT(Records, SizeIs(2));
+  EXPECT_THAT(Records[1]->getRecordType(), Eq(R->getRecordType()));
+}
+
+REGISTER_TYPED_TEST_CASE_P(RoundTripTestV5, RoundTripsSingleValue);
+
+// These are the record types we support for v4 and below.
 using RecordTypes =
-    ::testing::Types<BufferExtents, NewBufferRecord, NewCPUIDRecord,
-                     TSCWrapRecord, WallclockRecord, CustomEventRecord,
-                     CallArgRecord, BufferExtents, PIDRecord, FunctionRecord>;
+    ::testing::Types<NewBufferRecord, NewCPUIDRecord, TSCWrapRecord,
+                     WallclockRecord, CustomEventRecord, CallArgRecord,
+                     PIDRecord, FunctionRecord>;
 INSTANTIATE_TYPED_TEST_CASE_P(Records, RoundTripTest, RecordTypes);
 
+// For V5, we have two new types we're supporting.
+using RecordTypesV5 =
+    ::testing::Types<NewBufferRecord, NewCPUIDRecord, TSCWrapRecord,
+                     WallclockRecord, CustomEventRecordV5, TypedEventRecord,
+                     CallArgRecord, PIDRecord, FunctionRecord>;
+INSTANTIATE_TYPED_TEST_CASE_P(Records, RoundTripTestV5, RecordTypesV5);
+
 } // namespace
 } // namespace xray
 } // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp b/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
index 8a519bb2e7b5fd90d8b11189f4fd549be1bd5153..6dd5305d4dc6d0aa4e9bcc07c909ddf69aac7e9c 100644
--- a/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
+++ b/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
@@ -1,3 +1,12 @@
+//===-- TargetTest.cpp ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
 #include "Target.h"
 
 #include <cassert>
@@ -59,6 +68,13 @@ TEST_F(AArch64TargetTest, SetRegToConstant) {
   EXPECT_THAT(Insts, Not(IsEmpty()));
 }
 
+TEST_F(AArch64TargetTest, DefaultPfmCounters) {
+  const std::string Expected = "CPU_CYCLES";
+  EXPECT_EQ(ExegesisTarget_->getPfmCounters("").CycleCounter, Expected);
+  EXPECT_EQ(ExegesisTarget_->getPfmCounters("unknown_cpu").CycleCounter,
+            Expected);
+}
+
 } // namespace
 } // namespace exegesis
 } // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/CMakeLists.txt b/unittests/tools/llvm-exegesis/CMakeLists.txt
index 1b37316104117420b73b9aa4fa8415b531f2c450..05b2b3bc605c862f47acf1cec79ee1e6ac68b9ee 100644
--- a/unittests/tools/llvm-exegesis/CMakeLists.txt
+++ b/unittests/tools/llvm-exegesis/CMakeLists.txt
@@ -27,3 +27,6 @@ endif()
 if(LLVM_TARGETS_TO_BUILD MATCHES "AArch64")
   add_subdirectory(AArch64)
 endif()
+if(LLVM_TARGETS_TO_BUILD MATCHES "PowerPC")
+  add_subdirectory(PowerPC)
+endif()
diff --git a/unittests/tools/llvm-exegesis/PowerPC/AnalysisTest.cpp b/unittests/tools/llvm-exegesis/PowerPC/AnalysisTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..15d81ca88dec3131fe146740aef2a1b5fdc35b1f
--- /dev/null
+++ b/unittests/tools/llvm-exegesis/PowerPC/AnalysisTest.cpp
@@ -0,0 +1,93 @@
+//===-- AnalysisTest.cpp ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Analysis.h"
+
+#include <cassert>
+#include <memory>
+
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace llvm{
+namespace exegesis {
+namespace {
+
+using testing::Pair;
+using testing::UnorderedElementsAre;
+
+class AnalysisTest : public ::testing::Test {
+protected:
+  AnalysisTest() {
+    const std::string TT = "powerpc64le-unknown-linux";
+    std::string error;
+    const llvm::Target *const TheTarget =
+        llvm::TargetRegistry::lookupTarget(TT, error);
+    if (!TheTarget) {
+      llvm::errs() << error << "\n";
+      return;
+    }
+    STI.reset(TheTarget->createMCSubtargetInfo(TT, "pwr9", ""));
+
+    // Compute the ProxResIdx of ports uses in tests.
+    const auto &SM = STI->getSchedModel();
+    for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+      const std::string Name = SM.getProcResource(I)->Name;
+      if (Name == "ALU") {
+        ALUIdx = I;
+      } else if (Name == "ALUE") {
+        ALUEIdx = I;
+      } else if (Name == "ALUO") {
+        ALUOIdx = I;
+      } else if (Name == "IP_AGEN") {
+        IPAGENIdx = I;
+      }
+    }
+    EXPECT_NE(ALUIdx, 0);
+    EXPECT_NE(ALUEIdx, 0);
+    EXPECT_NE(ALUOIdx, 0);
+    EXPECT_NE(IPAGENIdx, 0);
+  }
+
+  static void SetUpTestCase() {
+    LLVMInitializePowerPCTargetInfo();
+    LLVMInitializePowerPCTarget();
+    LLVMInitializePowerPCTargetMC();
+  }
+
+protected:
+  std::unique_ptr<const llvm::MCSubtargetInfo> STI;
+  uint16_t ALUIdx = 0;
+  uint16_t ALUEIdx = 0;
+  uint16_t ALUOIdx = 0;
+  uint16_t IPAGENIdx = 0;
+};
+
+TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_2ALU) {
+  const auto Pressure =
+      computeIdealizedProcResPressure(STI->getSchedModel(), {{ALUIdx, 2}});
+  EXPECT_THAT(Pressure, UnorderedElementsAre(Pair(ALUIdx, 2.0)));
+}
+
+TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_1ALUE) {
+  const auto Pressure =
+      computeIdealizedProcResPressure(STI->getSchedModel(), {{ALUEIdx, 2}});
+  EXPECT_THAT(Pressure, UnorderedElementsAre(Pair(ALUEIdx, 2.0)));
+}
+
+TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_1ALU1IPAGEN) {
+  const auto Pressure =
+      computeIdealizedProcResPressure(STI->getSchedModel(), {{ALUIdx, 1}, {IPAGENIdx, 1}});
+  EXPECT_THAT(Pressure, UnorderedElementsAre(Pair(ALUIdx, 1.0),Pair(IPAGENIdx, 1)));
+}
+} // namespace
+} // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/PowerPC/CMakeLists.txt b/unittests/tools/llvm-exegesis/PowerPC/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e8a53e3692aac900158d16ab4b3d728e8dfd3a8e
--- /dev/null
+++ b/unittests/tools/llvm-exegesis/PowerPC/CMakeLists.txt
@@ -0,0 +1,22 @@
+include_directories(
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC
+  ${LLVM_BINARY_DIR}/lib/Target/PowerPC
+  ${LLVM_MAIN_SRC_DIR}/tools/llvm-exegesis/lib
+  )
+
+set(LLVM_LINK_COMPONENTS
+  MC
+  MCParser
+  Object
+  Support
+  Symbolize
+  PowerPC
+  )
+
+add_llvm_unittest(LLVMExegesisPowerPCTests
+  AnalysisTest.cpp
+  TargetTest.cpp
+  )
+target_link_libraries(LLVMExegesisPowerPCTests PRIVATE
+  LLVMExegesis
+  LLVMExegesisPowerPC)
diff --git a/unittests/tools/llvm-exegesis/PowerPC/TargetTest.cpp b/unittests/tools/llvm-exegesis/PowerPC/TargetTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0e34a15d5e32288d84a325f8fd70b4f19277f80e
--- /dev/null
+++ b/unittests/tools/llvm-exegesis/PowerPC/TargetTest.cpp
@@ -0,0 +1,71 @@
+//===-- TargetTest.cpp ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Target.h"
+
+#include <cassert>
+#include <memory>
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace llvm{
+namespace exegesis {
+
+void InitializePowerPCExegesisTarget();
+
+namespace {
+
+using testing::NotNull;
+using testing::IsEmpty;
+using testing::Not;
+
+constexpr const char kTriple[] = "powerpc64le-unknown-linux";
+
+class PowerPCTargetTest : public ::testing::Test {
+protected:
+  PowerPCTargetTest()
+      : ExegesisTarget_(ExegesisTarget::lookup(llvm::Triple(kTriple))) {
+    EXPECT_THAT(ExegesisTarget_, NotNull());
+    std::string error;
+    Target_ = llvm::TargetRegistry::lookupTarget(kTriple, error);
+    EXPECT_THAT(Target_, NotNull());
+  }
+  static void SetUpTestCase() {
+    LLVMInitializePowerPCTargetInfo();
+    LLVMInitializePowerPCTarget();
+    LLVMInitializePowerPCTargetMC();
+    InitializePowerPCExegesisTarget();
+  }
+
+  const llvm::Target *Target_;
+  const ExegesisTarget *const ExegesisTarget_;
+};
+
+TEST_F(PowerPCTargetTest, SetRegToConstant) {
+  const std::unique_ptr<llvm::MCSubtargetInfo> STI(
+      Target_->createMCSubtargetInfo(kTriple, "generic", ""));
+  const auto Insts =
+      ExegesisTarget_->setRegTo(*STI, llvm::PPC::X0, llvm::APInt());
+  EXPECT_THAT(Insts, Not(IsEmpty()));
+}
+
+TEST_F(PowerPCTargetTest, DefaultPfmCounters) {
+  const std::string Expected = "CYCLES";
+  EXPECT_EQ(ExegesisTarget_->getPfmCounters("").CycleCounter, Expected);
+  EXPECT_EQ(ExegesisTarget_->getPfmCounters("unknown_cpu").CycleCounter,
+            Expected);
+}
+
+} // namespace
+} // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp b/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
index 00ac6290aed358d7b0f3dff1cf2d8e2d90490e97..c8939186eb1482bbdd5ff680e72e148e102895f1 100644
--- a/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
@@ -1,3 +1,12 @@
+//===-- AnalysisTest.cpp ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
 #include "Analysis.h"
 
 #include <cassert>
@@ -28,7 +37,7 @@ protected:
     }
     STI.reset(TheTarget->createMCSubtargetInfo(TT, "haswell", ""));
 
-    // Compute the ProxResIdx of ports unes in tests.
+    // Compute the ProxResIdx of ports uses in tests.
     const auto &SM = STI->getSchedModel();
     for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
       const std::string Name = SM.getProcResource(I)->Name;
diff --git a/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp b/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
index 007b0156b1ffca66d42ae00bf50c219a7e17813c..c85c99c25ecd719e302cdecbe2222b449d66877d 100644
--- a/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
@@ -1,3 +1,13 @@
+//===-- RegisterAliasingTest.cpp --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
 #include "RegisterAliasing.h"
 
 #include <cassert>
diff --git a/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
index 2d9d7bcd5598e60da9893ba9e5276ed9241b1656..b1ccb486bfb335a890d919f142c8ef6dd32c78b5 100644
--- a/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
@@ -1,3 +1,12 @@
+//===-- TargetTest.cpp -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
 #include "Target.h"
 
 #include <cassert>
@@ -104,14 +113,7 @@ constexpr const char kTriple[] = "x86_64-unknown-linux";
 
 class X86TargetTest : public ::testing::Test {
 protected:
-  X86TargetTest(const char *Features)
-      : ExegesisTarget_(ExegesisTarget::lookup(llvm::Triple(kTriple))) {
-    EXPECT_THAT(ExegesisTarget_, NotNull());
-    std::string error;
-    Target_ = llvm::TargetRegistry::lookupTarget(kTriple, error);
-    EXPECT_THAT(Target_, NotNull());
-    STI_.reset(Target_->createMCSubtargetInfo(kTriple, "core2", Features));
-  }
+  X86TargetTest(const char *Features) : State(kTriple, "core2", Features) {}
 
   static void SetUpTestCase() {
     LLVMInitializeX86TargetInfo();
@@ -121,12 +123,11 @@ protected:
   }
 
   std::vector<MCInst> setRegTo(unsigned Reg, const APInt &Value) {
-    return ExegesisTarget_->setRegTo(*STI_, Reg, Value);
+    return State.getExegesisTarget().setRegTo(State.getSubtargetInfo(), Reg,
+                                              Value);
   }
 
-  const llvm::Target *Target_;
-  const ExegesisTarget *const ExegesisTarget_;
-  std::unique_ptr<llvm::MCSubtargetInfo> STI_;
+  LLVMState State;
 };
 
 class Core2TargetTest : public X86TargetTest {
@@ -360,8 +361,7 @@ TEST_F(Core2TargetTest, SetRegToFP1_32Bits) {
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 0),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
                   IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
-                  OpcodeIs(llvm::X86::LD_Fp80m),
-                  IsStackDeallocate(10)));
+                  OpcodeIs(llvm::X86::LD_Fp80m), IsStackDeallocate(10)));
 }
 
 TEST_F(Core2TargetTest, SetRegToFP1_4Bits) {
@@ -371,8 +371,33 @@ TEST_F(Core2TargetTest, SetRegToFP1_4Bits) {
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x00000001UL, 0),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
                   IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
-                  OpcodeIs(llvm::X86::LD_Fp80m),
-                  IsStackDeallocate(10)));
+                  OpcodeIs(llvm::X86::LD_Fp80m), IsStackDeallocate(10)));
+}
+
+TEST_F(Core2Avx512TargetTest, FillMemoryOperands_ADD64rm) {
+  Instruction I(State.getInstrInfo(), State.getRATC(), X86::ADD64rm);
+  InstructionTemplate IT(I);
+  constexpr const int kOffset = 42;
+  State.getExegesisTarget().fillMemoryOperands(IT, X86::RDI, kOffset);
+  // Memory is operands 2-6.
+  EXPECT_THAT(IT.getValueFor(I.Operands[2]), IsReg(X86::RDI));
+  EXPECT_THAT(IT.getValueFor(I.Operands[3]), IsImm(1));
+  EXPECT_THAT(IT.getValueFor(I.Operands[4]), IsReg(0));
+  EXPECT_THAT(IT.getValueFor(I.Operands[5]), IsImm(kOffset));
+  EXPECT_THAT(IT.getValueFor(I.Operands[6]), IsReg(0));
+}
+
+TEST_F(Core2Avx512TargetTest, FillMemoryOperands_VGATHERDPSZ128rm) {
+  Instruction I(State.getInstrInfo(), State.getRATC(), X86::VGATHERDPSZ128rm);
+  InstructionTemplate IT(I);
+  constexpr const int kOffset = 42;
+  State.getExegesisTarget().fillMemoryOperands(IT, X86::RDI, kOffset);
+  // Memory is operands 4-8.
+  EXPECT_THAT(IT.getValueFor(I.Operands[4]), IsReg(X86::RDI));
+  EXPECT_THAT(IT.getValueFor(I.Operands[5]), IsImm(1));
+  EXPECT_THAT(IT.getValueFor(I.Operands[6]), IsReg(0));
+  EXPECT_THAT(IT.getValueFor(I.Operands[7]), IsImm(kOffset));
+  EXPECT_THAT(IT.getValueFor(I.Operands[8]), IsReg(0));
 }
 
 } // namespace
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 967d22f12b6464b47ac1c9d6af027903e3240cf0..fdfd3e7cb7c56057940f4507077f419a86271510 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -19,12 +19,13 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/Process.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/FileCheck.h"
 using namespace llvm;
 
 static cl::opt<std::string>
-    CheckFilename(cl::Positional, cl::desc("<check-file>"), cl::Required);
+    CheckFilename(cl::Positional, cl::desc("<check-file>"), cl::Optional);
 
 static cl::opt<std::string>
     InputFilename("input-file", cl::desc("File to check (defaults to stdin)"),
@@ -91,7 +92,27 @@ static cl::opt<bool> DumpInputOnFailure(
     "dump-input-on-failure", cl::init(std::getenv(DumpInputEnv)),
     cl::desc("Dump original input to stderr before failing.\n"
              "The value can be also controlled using\n"
-             "FILECHECK_DUMP_INPUT_ON_FAILURE environment variable.\n"));
+             "FILECHECK_DUMP_INPUT_ON_FAILURE environment variable.\n"
+             "This option is deprecated in favor of -dump-input=fail.\n"));
+
+enum DumpInputValue {
+  DumpInputDefault,
+  DumpInputHelp,
+  DumpInputNever,
+  DumpInputFail,
+  DumpInputAlways
+};
+
+static cl::opt<DumpInputValue> DumpInput(
+    "dump-input", cl::init(DumpInputDefault),
+    cl::desc("Dump input to stderr, adding annotations representing\n"
+             " currently enabled diagnostics\n"),
+    cl::value_desc("mode"),
+    cl::values(clEnumValN(DumpInputHelp, "help",
+                          "Explain dump format and quit"),
+               clEnumValN(DumpInputNever, "never", "Never dump input"),
+               clEnumValN(DumpInputFail, "fail", "Dump input on failure"),
+               clEnumValN(DumpInputAlways, "always", "Always dump input")));
 
 typedef cl::list<std::string>::const_iterator prefix_iterator;
 
@@ -108,6 +129,376 @@ static void DumpCommandLine(int argc, char **argv) {
   errs() << "\n";
 }
 
+struct MarkerStyle {
+  /// The starting char (before tildes) for marking the line.
+  char Lead;
+  /// What color to use for this annotation.
+  raw_ostream::Colors Color;
+  /// A note to follow the marker, or empty string if none.
+  std::string Note;
+  MarkerStyle() {}
+  MarkerStyle(char Lead, raw_ostream::Colors Color,
+              const std::string &Note = "")
+      : Lead(Lead), Color(Color), Note(Note) {}
+};
+
+static MarkerStyle GetMarker(FileCheckDiag::MatchType MatchTy) {
+  switch (MatchTy) {
+  case FileCheckDiag::MatchFoundAndExpected:
+    return MarkerStyle('^', raw_ostream::GREEN);
+  case FileCheckDiag::MatchFoundButExcluded:
+    return MarkerStyle('!', raw_ostream::RED, "error: no match expected");
+  case FileCheckDiag::MatchFoundButWrongLine:
+    return MarkerStyle('!', raw_ostream::RED, "error: match on wrong line");
+  case FileCheckDiag::MatchFoundButDiscarded:
+    return MarkerStyle('!', raw_ostream::CYAN,
+                       "discard: overlaps earlier match");
+  case FileCheckDiag::MatchNoneAndExcluded:
+    return MarkerStyle('X', raw_ostream::GREEN);
+  case FileCheckDiag::MatchNoneButExpected:
+    return MarkerStyle('X', raw_ostream::RED, "error: no match found");
+  case FileCheckDiag::MatchFuzzy:
+    return MarkerStyle('?', raw_ostream::MAGENTA, "possible intended match");
+  }
+  llvm_unreachable_internal("unexpected match type");
+}
+
+static void DumpInputAnnotationHelp(raw_ostream &OS) {
+  OS << "The following description was requested by -dump-input=help to\n"
+     << "explain the input annotations printed by -dump-input=always and\n"
+     << "-dump-input=fail:\n\n";
+
+  // Labels for input lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "L:";
+  OS << "     labels line number L of the input file\n";
+
+  // Labels for annotation lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "T:L";
+  OS << "    labels the only match result for a pattern of type T from "
+     << "line L of\n"
+     << "           the check file\n";
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "T:L'N";
+  OS << "  labels the Nth match result for a pattern of type T from line "
+     << "L of\n"
+     << "           the check file\n";
+
+  // Markers on annotation lines.
+  OS << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "^~~";
+  OS << "    marks good match (reported if -v)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "!~~";
+  OS << "    marks bad match, such as:\n"
+     << "           - CHECK-NEXT on same line as previous match (error)\n"
+     << "           - CHECK-NOT found (error)\n"
+     << "           - CHECK-DAG overlapping match (discarded, reported if "
+     << "-vv)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "X~~";
+  OS << "    marks search range when no match is found, such as:\n"
+     << "           - CHECK-NEXT not found (error)\n"
+     << "           - CHECK-NOT not found (success, reported if -vv)\n"
+     << "           - CHECK-DAG not found after discarded matches (error)\n"
+     << "  - ";
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "?";
+  OS << "      marks fuzzy match when no match is found\n";
+
+  // Colors.
+  OS << "  - colors ";
+  WithColor(OS, raw_ostream::GREEN, true) << "success";
+  OS << ", ";
+  WithColor(OS, raw_ostream::RED, true) << "error";
+  OS << ", ";
+  WithColor(OS, raw_ostream::MAGENTA, true) << "fuzzy match";
+  OS << ", ";
+  WithColor(OS, raw_ostream::CYAN, true, false) << "discarded match";
+  OS << ", ";
+  WithColor(OS, raw_ostream::CYAN, true, true) << "unmatched input";
+  OS << "\n\n"
+     << "If you are not seeing color above or in input dumps, try: -color\n";
+}
+
+/// An annotation for a single input line.
+struct InputAnnotation {
+  /// The check file line (one-origin indexing) where the directive that
+  /// produced this annotation is located.
+  unsigned CheckLine;
+  /// The index of the match result for this check.
+  unsigned CheckDiagIndex;
+  /// The label for this annotation.
+  std::string Label;
+  /// What input line (one-origin indexing) this annotation marks.  This might
+  /// be different from the starting line of the original diagnostic if this is
+  /// a non-initial fragment of a diagnostic that has been broken across
+  /// multiple lines.
+  unsigned InputLine;
+  /// The column range (one-origin indexing, open end) in which to to mark the
+  /// input line.  If InputEndCol is UINT_MAX, treat it as the last column
+  /// before the newline.
+  unsigned InputStartCol, InputEndCol;
+  /// The marker to use.
+  MarkerStyle Marker;
+  /// Whether this annotation represents a good match for an expected pattern.
+  bool FoundAndExpectedMatch;
+};
+
+/// Get an abbreviation for the check type.
+std::string GetCheckTypeAbbreviation(Check::FileCheckType Ty) {
+  switch (Ty) {
+  case Check::CheckPlain:
+    if (Ty.getCount() > 1)
+      return "count";
+    return "check";
+  case Check::CheckNext:
+    return "next";
+  case Check::CheckSame:
+    return "same";
+  case Check::CheckNot:
+    return "not";
+  case Check::CheckDAG:
+    return "dag";
+  case Check::CheckLabel:
+    return "label";
+  case Check::CheckEmpty:
+    return "empty";
+  case Check::CheckEOF:
+    return "eof";
+  case Check::CheckBadNot:
+    return "bad-not";
+  case Check::CheckBadCount:
+    return "bad-count";
+  case Check::CheckNone:
+    llvm_unreachable("invalid FileCheckType");
+  }
+  llvm_unreachable("unknown FileCheckType");
+}
+
+static void BuildInputAnnotations(const std::vector<FileCheckDiag> &Diags,
+                                  std::vector<InputAnnotation> &Annotations,
+                                  unsigned &LabelWidth) {
+  // How many diagnostics has the current check seen so far?
+  unsigned CheckDiagCount = 0;
+  // What's the widest label?
+  LabelWidth = 0;
+  for (auto DiagItr = Diags.begin(), DiagEnd = Diags.end(); DiagItr != DiagEnd;
+       ++DiagItr) {
+    InputAnnotation A;
+
+    // Build label, which uniquely identifies this check result.
+    A.CheckLine = DiagItr->CheckLine;
+    llvm::raw_string_ostream Label(A.Label);
+    Label << GetCheckTypeAbbreviation(DiagItr->CheckTy) << ":"
+          << DiagItr->CheckLine;
+    A.CheckDiagIndex = UINT_MAX;
+    auto DiagNext = std::next(DiagItr);
+    if (DiagNext != DiagEnd && DiagItr->CheckTy == DiagNext->CheckTy &&
+        DiagItr->CheckLine == DiagNext->CheckLine)
+      A.CheckDiagIndex = CheckDiagCount++;
+    else if (CheckDiagCount) {
+      A.CheckDiagIndex = CheckDiagCount;
+      CheckDiagCount = 0;
+    }
+    if (A.CheckDiagIndex != UINT_MAX)
+      Label << "'" << A.CheckDiagIndex;
+    else
+      A.CheckDiagIndex = 0;
+    Label.flush();
+    LabelWidth = std::max((std::string::size_type)LabelWidth, A.Label.size());
+
+    MarkerStyle Marker = GetMarker(DiagItr->MatchTy);
+    A.Marker = Marker;
+    A.FoundAndExpectedMatch =
+        DiagItr->MatchTy == FileCheckDiag::MatchFoundAndExpected;
+
+    // Compute the mark location, and break annotation into multiple
+    // annotations if it spans multiple lines.
+    A.InputLine = DiagItr->InputStartLine;
+    A.InputStartCol = DiagItr->InputStartCol;
+    if (DiagItr->InputStartLine == DiagItr->InputEndLine) {
+      // Sometimes ranges are empty in order to indicate a specific point, but
+      // that would mean nothing would be marked, so adjust the range to
+      // include the following character.
+      A.InputEndCol =
+          std::max(DiagItr->InputStartCol + 1, DiagItr->InputEndCol);
+      Annotations.push_back(A);
+    } else {
+      assert(DiagItr->InputStartLine < DiagItr->InputEndLine &&
+             "expected input range not to be inverted");
+      A.InputEndCol = UINT_MAX;
+      A.Marker.Note = "";
+      Annotations.push_back(A);
+      for (unsigned L = DiagItr->InputStartLine + 1, E = DiagItr->InputEndLine;
+           L <= E; ++L) {
+        // If a range ends before the first column on a line, then it has no
+        // characters on that line, so there's nothing to render.
+        if (DiagItr->InputEndCol == 1 && L == E) {
+          Annotations.back().Marker.Note = Marker.Note;
+          break;
+        }
+        InputAnnotation B;
+        B.CheckLine = A.CheckLine;
+        B.CheckDiagIndex = A.CheckDiagIndex;
+        B.Label = A.Label;
+        B.InputLine = L;
+        B.Marker = Marker;
+        B.Marker.Lead = '~';
+        B.InputStartCol = 1;
+        if (L != E) {
+          B.InputEndCol = UINT_MAX;
+          B.Marker.Note = "";
+        } else
+          B.InputEndCol = DiagItr->InputEndCol;
+        B.FoundAndExpectedMatch = A.FoundAndExpectedMatch;
+        Annotations.push_back(B);
+      }
+    }
+  }
+}
+
+static void DumpAnnotatedInput(raw_ostream &OS, const FileCheckRequest &Req,
+                               StringRef InputFileText,
+                               std::vector<InputAnnotation> &Annotations,
+                               unsigned LabelWidth) {
+  OS << "Full input was:\n<<<<<<\n";
+
+  // Sort annotations.
+  //
+  // First, sort in the order of input lines to make it easier to find relevant
+  // annotations while iterating input lines in the implementation below.
+  // FileCheck diagnostics are not always reported and recorded in the order of
+  // input lines due to, for example, CHECK-DAG and CHECK-NOT.
+  //
+  // Second, for annotations for the same input line, sort in the order of the
+  // FileCheck directive's line in the check file (where there's at most one
+  // directive per line) and then by the index of the match result for that
+  // directive.  The rationale of this choice is that, for any input line, this
+  // sort establishes a total order of annotations that, with respect to match
+  // results, is consistent across multiple lines, thus making match results
+  // easier to track from one line to the next when they span multiple lines.
+  std::sort(Annotations.begin(), Annotations.end(),
+            [](const InputAnnotation &A, const InputAnnotation &B) {
+              if (A.InputLine != B.InputLine)
+                return A.InputLine < B.InputLine;
+              if (A.CheckLine != B.CheckLine)
+                return A.CheckLine < B.CheckLine;
+              // FIXME: Sometimes CHECK-LABEL reports its match twice with
+              // other diagnostics in between, and then diag index incrementing
+              // fails to work properly, and then this assert fails.  We should
+              // suppress one of those diagnostics or do a better job of
+              // computing this index.  For now, we just produce a redundant
+              // CHECK-LABEL annotation.
+              // assert(A.CheckDiagIndex != B.CheckDiagIndex &&
+              //        "expected diagnostic indices to be unique within a "
+              //        " check line");
+              return A.CheckDiagIndex < B.CheckDiagIndex;
+            });
+
+  // Compute the width of the label column.
+  const unsigned char *InputFilePtr = InputFileText.bytes_begin(),
+                      *InputFileEnd = InputFileText.bytes_end();
+  unsigned LineCount = InputFileText.count('\n');
+  if (InputFileEnd[-1] != '\n')
+    ++LineCount;
+  unsigned LineNoWidth = log10(LineCount) + 1;
+  // +3 below adds spaces (1) to the left of the (right-aligned) line numbers
+  // on input lines and (2) to the right of the (left-aligned) labels on
+  // annotation lines so that input lines and annotation lines are more
+  // visually distinct.  For example, the spaces on the annotation lines ensure
+  // that input line numbers and check directive line numbers never align
+  // horizontally.  Those line numbers might not even be for the same file.
+  // One space would be enough to achieve that, but more makes it even easier
+  // to see.
+  LabelWidth = std::max(LabelWidth, LineNoWidth) + 3;
+
+  // Print annotated input lines.
+  auto AnnotationItr = Annotations.begin(), AnnotationEnd = Annotations.end();
+  for (unsigned Line = 1;
+       InputFilePtr != InputFileEnd || AnnotationItr != AnnotationEnd;
+       ++Line) {
+    const unsigned char *InputFileLine = InputFilePtr;
+
+    // Print right-aligned line number.
+    WithColor(OS, raw_ostream::BLACK, true)
+        << format_decimal(Line, LabelWidth) << ": ";
+
+    // For the case where -v and colors are enabled, find the annotations for
+    // good matches for expected patterns in order to highlight everything
+    // else in the line.  There are no such annotations if -v is disabled.
+    std::vector<InputAnnotation> FoundAndExpectedMatches;
+    if (Req.Verbose && WithColor(OS).colorsEnabled()) {
+      for (auto I = AnnotationItr; I != AnnotationEnd && I->InputLine == Line;
+           ++I) {
+        if (I->FoundAndExpectedMatch)
+          FoundAndExpectedMatches.push_back(*I);
+      }
+    }
+
+    // Print numbered line with highlighting where there are no matches for
+    // expected patterns.
+    bool Newline = false;
+    {
+      WithColor COS(OS);
+      bool InMatch = false;
+      if (Req.Verbose)
+        COS.changeColor(raw_ostream::CYAN, true, true);
+      for (unsigned Col = 1; InputFilePtr != InputFileEnd && !Newline; ++Col) {
+        bool WasInMatch = InMatch;
+        InMatch = false;
+        for (auto M : FoundAndExpectedMatches) {
+          if (M.InputStartCol <= Col && Col < M.InputEndCol) {
+            InMatch = true;
+            break;
+          }
+        }
+        if (!WasInMatch && InMatch)
+          COS.resetColor();
+        else if (WasInMatch && !InMatch)
+          COS.changeColor(raw_ostream::CYAN, true, true);
+        if (*InputFilePtr == '\n')
+          Newline = true;
+        else
+          COS << *InputFilePtr;
+        ++InputFilePtr;
+      }
+    }
+    OS << '\n';
+    unsigned InputLineWidth = InputFilePtr - InputFileLine - Newline;
+
+    // Print any annotations.
+    while (AnnotationItr != AnnotationEnd &&
+           AnnotationItr->InputLine == Line) {
+      WithColor COS(OS, AnnotationItr->Marker.Color, true);
+      // The two spaces below are where the ": " appears on input lines.
+      COS << left_justify(AnnotationItr->Label, LabelWidth) << "  ";
+      unsigned Col;
+      for (Col = 1; Col < AnnotationItr->InputStartCol; ++Col)
+        COS << ' ';
+      COS << AnnotationItr->Marker.Lead;
+      // If InputEndCol=UINT_MAX, stop at InputLineWidth.
+      for (++Col; Col < AnnotationItr->InputEndCol && Col <= InputLineWidth;
+           ++Col)
+        COS << '~';
+      const std::string &Note = AnnotationItr->Marker.Note;
+      if (!Note.empty()) {
+        // Put the note at the end of the input line.  If we were to instead
+        // put the note right after the marker, subsequent annotations for the
+        // same input line might appear to mark this note instead of the input
+        // line.
+        for (; Col <= InputLineWidth; ++Col)
+          COS << ' ';
+        COS << ' ' << Note;
+      }
+      COS << '\n';
+      ++AnnotationItr;
+    }
+  }
+
+  OS << ">>>>>>\n";
+}
+
 int main(int argc, char **argv) {
   // Enable use of ANSI color codes because FileCheck is using them to
   // highlight text.
@@ -116,6 +507,14 @@ int main(int argc, char **argv) {
   InitLLVM X(argc, argv);
   cl::ParseCommandLineOptions(argc, argv, /*Overview*/ "", /*Errs*/ nullptr,
                               "FILECHECK_OPTS");
+  if (DumpInput == DumpInputHelp) {
+    DumpInputAnnotationHelp(outs());
+    return 0;
+  }
+  if (CheckFilename.empty()) {
+    errs() << "<check-file> not specified\n";
+    return 2;
+  }
 
   FileCheckRequest Req;
   for (auto Prefix : CheckPrefixes)
@@ -157,7 +556,6 @@ int main(int argc, char **argv) {
     return 2;
   }
 
-
   SourceMgr SM;
 
   // Read the expected strings from the check file.
@@ -204,10 +602,29 @@ int main(int argc, char **argv) {
                             InputFileText, InputFile.getBufferIdentifier()),
                         SMLoc());
 
-  int ExitCode =
-      FC.CheckInput(SM, InputFileText, CheckStrings) ? EXIT_SUCCESS : 1;
-  if (ExitCode == 1 && DumpInputOnFailure)
-    errs() << "Full input was:\n<<<<<<\n" << InputFileText << "\n>>>>>>\n";
+  if (DumpInput == DumpInputDefault)
+    DumpInput = DumpInputOnFailure ? DumpInputFail : DumpInputNever;
+
+  std::vector<FileCheckDiag> Diags;
+  int ExitCode = FC.CheckInput(SM, InputFileText, CheckStrings,
+                               DumpInput == DumpInputNever ? nullptr : &Diags)
+                     ? EXIT_SUCCESS
+                     : 1;
+  if (DumpInput == DumpInputAlways ||
+      (ExitCode == 1 && DumpInput == DumpInputFail)) {
+    errs() << "\n"
+           << "Input file: "
+           << (InputFilename == "-" ? "<stdin>" : InputFilename.getValue())
+           << "\n"
+           << "Check file: " << CheckFilename << "\n"
+           << "\n"
+           << "-dump-input=help describes the format of the following dump.\n"
+           << "\n";
+    std::vector<InputAnnotation> Annotations;
+    unsigned LabelWidth;
+    BuildInputAnnotations(Diags, Annotations, LabelWidth);
+    DumpAnnotatedInput(errs(), Req, InputFileText, Annotations, LabelWidth);
+  }
 
   return ExitCode;
 }
diff --git a/utils/LLVMVisualizers/llvm.natvis b/utils/LLVMVisualizers/llvm.natvis
index f3b2a3225e979c058c71c1e105b4154969ba80fe..bbded5dcc305bf6244d94f70a83258d3451609b2 100644
--- a/utils/LLVMVisualizers/llvm.natvis
+++ b/utils/LLVMVisualizers/llvm.natvis
@@ -85,10 +85,10 @@ For later versions of Visual Studio, no setup is required.
   </Type>
 
   <Type Name="llvm::PointerIntPair&lt;*,*,*,*&gt;">
-    <DisplayString>{IntMask}: {($T1)(Value &amp; PointerBitMask)} [{($T3)((Value &gt;&gt; IntShift) &amp; IntMask)}]</DisplayString>
+    <DisplayString>{$T5::IntMask}: {($T1)(Value &amp; $T5::PointerBitMask)} [{($T3)((Value &gt;&gt; $T5::IntShift) &amp; $T5::IntMask)}]</DisplayString>
     <Expand>
-      <Item Name="[ptr]">($T1)(Value &amp; PointerBitMask)</Item>
-      <Item Name="[int]">($T3)((Value &gt;&gt; IntShift) &amp; IntMask)</Item>
+      <Item Name="[ptr]">($T1)(Value &amp; $T5::PointerBitMask)</Item>
+      <Item Name="[int]">($T3)((Value &gt;&gt; $T5::IntShift) &amp; $T5::IntMask)</Item>
     </Expand>
   </Type>
 
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index 3c4c9c8e5c6e1e5aa0d677d9611a2a77224b4672..a8f191181766b35909fd0bb4d9708a8371c45a26 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -835,15 +835,20 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
       for (unsigned i = 0, e = LastOpNo; i != e; ++i) {
         // Skip over tied operands as they're not part of an alias declaration.
         auto &Operands = CGA.ResultInst->Operands;
-        unsigned OpNum = Operands.getSubOperandNumber(MIOpNum).first;
-        if (Operands[OpNum].MINumOperands == 1 &&
-            Operands[OpNum].getTiedRegister() != -1) {
-          // Tied operands of different RegisterClass should be explicit within
-          // an instruction's syntax and so cannot be skipped.
-          int TiedOpNum = Operands[OpNum].getTiedRegister();
-          if (Operands[OpNum].Rec->getName() ==
-              Operands[TiedOpNum].Rec->getName())
-            ++MIOpNum;
+        while (true) {
+          unsigned OpNum = Operands.getSubOperandNumber(MIOpNum).first;
+          if (Operands[OpNum].MINumOperands == 1 &&
+              Operands[OpNum].getTiedRegister() != -1) {
+            // Tied operands of different RegisterClass should be explicit within
+            // an instruction's syntax and so cannot be skipped.
+            int TiedOpNum = Operands[OpNum].getTiedRegister();
+            if (Operands[OpNum].Rec->getName() ==
+                Operands[TiedOpNum].Rec->getName()) {
+              ++MIOpNum;
+              continue;
+            }
+          }
+          break;
         }
 
         std::string Op = "MI->getOperand(" + utostr(MIOpNum) + ")";
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 9a0fac54db1ceaf9d1479ec276c67f66ffff8255..96c90c9cf6bd84a979d38d946af35a3492bddea3 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -15,6 +15,7 @@
 #include "CodeGenDAGPatterns.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
@@ -27,6 +28,7 @@
 #include "llvm/TableGen/Record.h"
 #include <algorithm>
 #include <cstdio>
+#include <iterator>
 #include <set>
 using namespace llvm;
 
@@ -824,6 +826,20 @@ TypeInfer::ValidateOnExit::~ValidateOnExit() {
 }
 #endif
 
+
+//===----------------------------------------------------------------------===//
+// ScopedName Implementation
+//===----------------------------------------------------------------------===//
+
+bool ScopedName::operator==(const ScopedName &o) const {
+  return Scope == o.Scope && Identifier == o.Identifier;
+}
+
+bool ScopedName::operator!=(const ScopedName &o) const {
+  return !(*this == o);
+}
+
+
 //===----------------------------------------------------------------------===//
 // TreePredicateFn Implementation
 //===----------------------------------------------------------------------===//
@@ -1069,6 +1085,9 @@ bool TreePredicateFn::isPredefinedPredicateEqualTo(StringRef Field,
     return false;
   return Result == Value;
 }
+bool TreePredicateFn::usesOperands() const {
+  return isPredefinedPredicateEqualTo("PredicateCodeUsesOperands", true);
+}
 bool TreePredicateFn::isLoad() const {
   return isPredefinedPredicateEqualTo("IsLoad", true);
 }
@@ -1250,7 +1269,7 @@ std::string TreePredicateFn::getCodeToRunOnSDNode() const {
   else
     Result = "    auto *N = cast<" + ClassName.str() + ">(Node);\n";
 
-  return Result + getPredCode();
+  return (Twine(Result) + "    (void)N;\n" + getPredCode()).str();
 }
 
 //===----------------------------------------------------------------------===//
@@ -1276,7 +1295,7 @@ static unsigned getPatternSize(const TreePatternNode *P,
 
   // If this node has some predicate function that must match, it adds to the
   // complexity of this node.
-  if (!P->getPredicateFns().empty())
+  if (!P->getPredicateCalls().empty())
     ++Size;
 
   // Count children in the count if they are also nodes.
@@ -1296,7 +1315,7 @@ static unsigned getPatternSize(const TreePatternNode *P,
         Size += 5;  // Matches a ConstantSDNode (+3) and a specific value (+2).
       else if (Child->getComplexPatternInfo(CGP))
         Size += getPatternSize(Child, CGP);
-      else if (!Child->getPredicateFns().empty())
+      else if (!Child->getPredicateCalls().empty())
         ++Size;
     }
   }
@@ -1751,13 +1770,19 @@ void TreePatternNode::print(raw_ostream &OS) const {
     OS << ")";
   }
 
-  for (const TreePredicateFn &Pred : PredicateFns)
-    OS << "<<P:" << Pred.getFnName() << ">>";
+  for (const TreePredicateCall &Pred : PredicateCalls) {
+    OS << "<<P:";
+    if (Pred.Scope)
+      OS << Pred.Scope << ":";
+    OS << Pred.Fn.getFnName() << ">>";
+  }
   if (TransformFn)
     OS << "<<X:" << TransformFn->getName() << ">>";
   if (!getName().empty())
     OS << ":$" << getName();
 
+  for (const ScopedName &Name : NamesAsPredicateArg)
+    OS << ":$pred:" << Name.getScope() << ":" << Name.getIdentifier();
 }
 void TreePatternNode::dump() const {
   print(errs());
@@ -1774,7 +1799,7 @@ bool TreePatternNode::isIsomorphicTo(const TreePatternNode *N,
                                      const MultipleUseVarSet &DepVars) const {
   if (N == this) return true;
   if (N->isLeaf() != isLeaf() || getExtTypes() != N->getExtTypes() ||
-      getPredicateFns() != N->getPredicateFns() ||
+      getPredicateCalls() != N->getPredicateCalls() ||
       getTransformFn() != N->getTransformFn())
     return false;
 
@@ -1812,8 +1837,9 @@ TreePatternNodePtr TreePatternNode::clone() const {
                                             getNumTypes());
   }
   New->setName(getName());
+  New->setNamesAsPredicateArg(getNamesAsPredicateArg());
   New->Types = Types;
-  New->setPredicateFns(getPredicateFns());
+  New->setPredicateCalls(getPredicateCalls());
   New->setTransformFn(getTransformFn());
   return New;
 }
@@ -1845,8 +1871,8 @@ void TreePatternNode::SubstituteFormalArguments(
         // We found a use of a formal argument, replace it with its value.
         TreePatternNodePtr NewChild = ArgMap[Child->getName()];
         assert(NewChild && "Couldn't find formal argument!");
-        assert((Child->getPredicateFns().empty() ||
-                NewChild->getPredicateFns() == Child->getPredicateFns()) &&
+        assert((Child->getPredicateCalls().empty() ||
+                NewChild->getPredicateCalls() == Child->getPredicateCalls()) &&
                "Non-empty child predicate clobbered!");
         setChild(i, std::move(NewChild));
       }
@@ -1892,8 +1918,8 @@ void TreePatternNode::InlinePatternFragments(
         return;
 
       for (auto NewChild : ChildAlternatives[i])
-        assert((Child->getPredicateFns().empty() ||
-                NewChild->getPredicateFns() == Child->getPredicateFns()) &&
+        assert((Child->getPredicateCalls().empty() ||
+                NewChild->getPredicateCalls() == Child->getPredicateCalls()) &&
                "Non-empty child predicate clobbered!");
     }
 
@@ -1911,10 +1937,13 @@ void TreePatternNode::InlinePatternFragments(
 
       // Copy over properties.
       R->setName(getName());
-      R->setPredicateFns(getPredicateFns());
+      R->setNamesAsPredicateArg(getNamesAsPredicateArg());
+      R->setPredicateCalls(getPredicateCalls());
       R->setTransformFn(getTransformFn());
       for (unsigned i = 0, e = getNumTypes(); i != e; ++i)
         R->setType(i, getExtType(i));
+      for (unsigned i = 0, e = getNumResults(); i != e; ++i)
+        R->setResultIndex(i, getResultIndex(i));
 
       // Register alternative.
       OutAlternatives.push_back(R);
@@ -1946,10 +1975,19 @@ void TreePatternNode::InlinePatternFragments(
     return;
   }
 
+  TreePredicateFn PredFn(Frag);
+  unsigned Scope = 0;
+  if (TreePredicateFn(Frag).usesOperands())
+    Scope = TP.getDAGPatterns().allocateScope();
+
   // Compute the map of formal to actual arguments.
   std::map<std::string, TreePatternNodePtr> ArgMap;
   for (unsigned i = 0, e = Frag->getNumArgs(); i != e; ++i) {
-    const TreePatternNodePtr &Child = getChildShared(i);
+    TreePatternNodePtr Child = getChildShared(i);
+    if (Scope != 0) {
+      Child = Child->clone();
+      Child->addNameAsPredicateArg(ScopedName(Scope, Frag->getArgName(i)));
+    }
     ArgMap[Frag->getArgName(i)] = Child;
   }
 
@@ -1957,9 +1995,8 @@ void TreePatternNode::InlinePatternFragments(
   for (auto Alternative : Frag->getTrees()) {
     TreePatternNodePtr FragTree = Alternative->clone();
 
-    TreePredicateFn PredFn(Frag);
     if (!PredFn.isAlwaysTrue())
-      FragTree->addPredicateFn(PredFn);
+      FragTree->addPredicateCall(PredFn, Scope);
 
     // Resolve formal arguments to their actual value.
     if (Frag->getNumArgs())
@@ -1972,8 +2009,8 @@ void TreePatternNode::InlinePatternFragments(
       FragTree->UpdateNodeType(i, getExtType(i), TP);
 
     // Transfer in the old predicates.
-    for (const TreePredicateFn &Pred : getPredicateFns())
-      FragTree->addPredicateFn(Pred);
+    for (const TreePredicateCall &Pred : getPredicateCalls())
+      FragTree->addPredicateCall(Pred);
 
     // The fragment we inlined could have recursive inlining that is needed.  See
     // if there are any pattern fragments in it and inline them as needed.
@@ -3174,7 +3211,8 @@ static bool HandleUse(TreePattern &I, TreePatternNodePtr Pat,
 void CodeGenDAGPatterns::FindPatternInputsAndOutputs(
     TreePattern &I, TreePatternNodePtr Pat,
     std::map<std::string, TreePatternNodePtr> &InstInputs,
-    std::map<std::string, TreePatternNodePtr> &InstResults,
+    MapVector<std::string, TreePatternNodePtr, std::map<std::string, unsigned>>
+        &InstResults,
     std::vector<Record *> &InstImpResults) {
 
   // The instruction pattern still has unresolved fragments.  For *named*
@@ -3494,7 +3532,8 @@ void CodeGenDAGPatterns::parseInstructionPattern(
 
   // InstResults - Keep track of all the virtual registers that are 'set'
   // in the instruction, including what reg class they are.
-  std::map<std::string, TreePatternNodePtr> InstResults;
+  MapVector<std::string, TreePatternNodePtr, std::map<std::string, unsigned>>
+      InstResults;
 
   std::vector<Record*> InstImpResults;
 
@@ -3531,19 +3570,28 @@ void CodeGenDAGPatterns::parseInstructionPattern(
 
   // Check that all of the results occur first in the list.
   std::vector<Record*> Results;
+  std::vector<unsigned> ResultIndices;
   SmallVector<TreePatternNodePtr, 2> ResNodes;
   for (unsigned i = 0; i != NumResults; ++i) {
-    if (i == CGI.Operands.size())
-      I.error("'" + InstResults.begin()->first +
-               "' set but does not appear in operand list!");
+    if (i == CGI.Operands.size()) {
+      const std::string &OpName =
+          std::find_if(InstResults.begin(), InstResults.end(),
+                       [](const std::pair<std::string, TreePatternNodePtr> &P) {
+                         return P.second;
+                       })
+              ->first;
+
+      I.error("'" + OpName + "' set but does not appear in operand list!");
+    }
+
     const std::string &OpName = CGI.Operands[i].Name;
 
     // Check that it exists in InstResults.
-    TreePatternNodePtr RNode = InstResults[OpName];
-    if (!RNode)
+    auto InstResultIter = InstResults.find(OpName);
+    if (InstResultIter == InstResults.end() || !InstResultIter->second)
       I.error("Operand $" + OpName + " does not exist in operand list!");
 
-
+    TreePatternNodePtr RNode = InstResultIter->second;
     Record *R = cast<DefInit>(RNode->getLeafValue())->getDef();
     ResNodes.push_back(std::move(RNode));
     if (!R)
@@ -3556,8 +3604,11 @@ void CodeGenDAGPatterns::parseInstructionPattern(
     // Remember the return type.
     Results.push_back(CGI.Operands[i].Rec);
 
+    // Remember the result index.
+    ResultIndices.push_back(std::distance(InstResults.begin(), InstResultIter));
+
     // Okay, this one checks out.
-    InstResults.erase(OpName);
+    InstResultIter->second = nullptr;
   }
 
   // Loop over the inputs next.
@@ -3596,7 +3647,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(
     TreePatternNodePtr OpNode = InVal->clone();
 
     // No predicate is useful on the result.
-    OpNode->clearPredicateFns();
+    OpNode->clearPredicateCalls();
 
     // Promote the xform function to be an explicit node if set.
     if (Record *Xform = OpNode->getTransformFn()) {
@@ -3621,6 +3672,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(
   for (unsigned i = 0; i != NumResults; ++i) {
     assert(ResNodes[i]->getNumTypes() == 1 && "FIXME: Unhandled");
     ResultPattern->setType(i, ResNodes[i]->getExtType(0));
+    ResultPattern->setResultIndex(i, ResultIndices[i]);
   }
 
   // FIXME: Assume only the first tree is the pattern. The others are clobber
@@ -4080,7 +4132,8 @@ void CodeGenDAGPatterns::ParsePatterns() {
 
     // Validate that the input pattern is correct.
     std::map<std::string, TreePatternNodePtr> InstInputs;
-    std::map<std::string, TreePatternNodePtr> InstResults;
+    MapVector<std::string, TreePatternNodePtr, std::map<std::string, unsigned>>
+        InstResults;
     std::vector<Record*> InstImpResults;
     for (unsigned j = 0, ee = Pattern.getNumTrees(); j != ee; ++j)
       FindPatternInputsAndOutputs(Pattern, Pattern.getTree(j), InstInputs,
@@ -4251,7 +4304,8 @@ static void CombineChildVariants(
 
     // Copy over properties.
     R->setName(Orig->getName());
-    R->setPredicateFns(Orig->getPredicateFns());
+    R->setNamesAsPredicateArg(Orig->getNamesAsPredicateArg());
+    R->setPredicateCalls(Orig->getPredicateCalls());
     R->setTransformFn(Orig->getTransformFn());
     for (unsigned i = 0, e = Orig->getNumTypes(); i != e; ++i)
       R->setType(i, Orig->getExtType(i));
@@ -4303,7 +4357,7 @@ GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N,
   Record *Operator = N->getOperator();
 
   // Only permit raw nodes.
-  if (!N->getName().empty() || !N->getPredicateFns().empty() ||
+  if (!N->getName().empty() || !N->getPredicateCalls().empty() ||
       N->getTransformFn()) {
     Children.push_back(N);
     return;
diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index 4e5eaa094616974674772f6c3c60f94d1dcb2c7c..4be9afdcacd22587a39f22259aadbfd82e3fba76 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h
@@ -19,6 +19,7 @@
 #include "CodeGenIntrinsics.h"
 #include "CodeGenTarget.h"
 #include "SDNodeProperties.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSet.h"
@@ -28,6 +29,7 @@
 #include <array>
 #include <functional>
 #include <map>
+#include <numeric>
 #include <set>
 #include <vector>
 
@@ -408,6 +410,29 @@ struct SDTypeConstraint {
                            TreePattern &TP) const;
 };
 
+/// ScopedName - A name of a node associated with a "scope" that indicates
+/// the context (e.g. instance of Pattern or PatFrag) in which the name was
+/// used. This enables substitution of pattern fragments while keeping track
+/// of what name(s) were originally given to various nodes in the tree.
+class ScopedName {
+  unsigned Scope;
+  std::string Identifier;
+public:
+  ScopedName(unsigned Scope, StringRef Identifier)
+    : Scope(Scope), Identifier(Identifier) {
+    assert(Scope != 0 &&
+           "Scope == 0 is used to indicate predicates without arguments");
+  }
+
+  unsigned getScope() const { return Scope; }
+  const std::string &getIdentifier() const { return Identifier; }
+
+  std::string getFullName() const;
+
+  bool operator==(const ScopedName &o) const;
+  bool operator!=(const ScopedName &o) const;
+};
+
 /// SDNodeInfo - One of these records is created for each SDNode instance in
 /// the target .td file.  This represents the various dag nodes we will be
 /// processing.
@@ -503,6 +528,9 @@ public:
   /// usable as part of an identifier.
   StringRef getImmTypeIdentifier() const;
 
+  // Predicate code uses the PatFrag's captured operands.
+  bool usesOperands() const;
+
   // Is the desired predefined predicate for a load?
   bool isLoad() const;
   // Is the desired predefined predicate for a store?
@@ -570,6 +598,23 @@ private:
   bool isPredefinedPredicateEqualTo(StringRef Field, bool Value) const;
 };
 
+struct TreePredicateCall {
+  TreePredicateFn Fn;
+
+  // Scope -- unique identifier for retrieving named arguments. 0 is used when
+  // the predicate does not use named arguments.
+  unsigned Scope;
+
+  TreePredicateCall(const TreePredicateFn &Fn, unsigned Scope)
+    : Fn(Fn), Scope(Scope) {}
+
+  bool operator==(const TreePredicateCall &o) const {
+    return Fn == o.Fn && Scope == o.Scope;
+  }
+  bool operator!=(const TreePredicateCall &o) const {
+    return !(*this == o);
+  }
+};
 
 class TreePatternNode {
   /// The type of each node result.  Before and during type inference, each
@@ -577,6 +622,9 @@ class TreePatternNode {
   /// each is a single concrete type.
   std::vector<TypeSetByHwMode> Types;
 
+  /// The index of each result in results of the pattern.
+  std::vector<unsigned> ResultPerm;
+
   /// Operator - The Record for the operator if this is an interior node (not
   /// a leaf).
   Record *Operator;
@@ -589,9 +637,11 @@ class TreePatternNode {
   ///
   std::string Name;
 
-  /// PredicateFns - The predicate functions to execute on this node to check
+  std::vector<ScopedName> NamesAsPredicateArg;
+
+  /// PredicateCalls - The predicate functions to execute on this node to check
   /// for a match.  If this list is empty, no predicate is involved.
-  std::vector<TreePredicateFn> PredicateFns;
+  std::vector<TreePredicateCall> PredicateCalls;
 
   /// TransformFn - The transformation function to execute on this node before
   /// it can be substituted into the resulting instruction on a pattern match.
@@ -605,16 +655,30 @@ public:
       : Operator(Op), Val(nullptr), TransformFn(nullptr),
         Children(std::move(Ch)) {
     Types.resize(NumResults);
+    ResultPerm.resize(NumResults);
+    std::iota(ResultPerm.begin(), ResultPerm.end(), 0);
   }
   TreePatternNode(Init *val, unsigned NumResults)    // leaf ctor
     : Operator(nullptr), Val(val), TransformFn(nullptr) {
     Types.resize(NumResults);
+    ResultPerm.resize(NumResults);
+    std::iota(ResultPerm.begin(), ResultPerm.end(), 0);
   }
 
   bool hasName() const { return !Name.empty(); }
   const std::string &getName() const { return Name; }
   void setName(StringRef N) { Name.assign(N.begin(), N.end()); }
 
+  const std::vector<ScopedName> &getNamesAsPredicateArg() const {
+    return NamesAsPredicateArg;
+  }
+  void setNamesAsPredicateArg(const std::vector<ScopedName>& Names) {
+    NamesAsPredicateArg = Names;
+  }
+  void addNameAsPredicateArg(const ScopedName &N) {
+    NamesAsPredicateArg.push_back(N);
+  }
+
   bool isLeaf() const { return Val != nullptr; }
 
   // Type accessors.
@@ -639,6 +703,10 @@ public:
     return Types[ResNo].empty();
   }
 
+  unsigned getNumResults() const { return ResultPerm.size(); }
+  unsigned getResultIndex(unsigned ResNo) const { return ResultPerm[ResNo]; }
+  void setResultIndex(unsigned ResNo, unsigned RI) { ResultPerm[ResNo] = RI; }
+
   Init *getLeafValue() const { assert(isLeaf()); return Val; }
   Record *getOperator() const { assert(!isLeaf()); return Operator; }
 
@@ -661,20 +729,24 @@ public:
   bool hasPossibleType() const;
   bool setDefaultMode(unsigned Mode);
 
-  bool hasAnyPredicate() const { return !PredicateFns.empty(); }
+  bool hasAnyPredicate() const { return !PredicateCalls.empty(); }
 
-  const std::vector<TreePredicateFn> &getPredicateFns() const {
-    return PredicateFns;
+  const std::vector<TreePredicateCall> &getPredicateCalls() const {
+    return PredicateCalls;
   }
-  void clearPredicateFns() { PredicateFns.clear(); }
-  void setPredicateFns(const std::vector<TreePredicateFn> &Fns) {
-    assert(PredicateFns.empty() && "Overwriting non-empty predicate list!");
-    PredicateFns = Fns;
+  void clearPredicateCalls() { PredicateCalls.clear(); }
+  void setPredicateCalls(const std::vector<TreePredicateCall> &Calls) {
+    assert(PredicateCalls.empty() && "Overwriting non-empty predicate list!");
+    PredicateCalls = Calls;
   }
-  void addPredicateFn(const TreePredicateFn &Fn) {
-    assert(!Fn.isAlwaysTrue() && "Empty predicate string!");
-    assert(!is_contained(PredicateFns, Fn) && "predicate applied recursively");
-    PredicateFns.push_back(Fn);
+  void addPredicateCall(const TreePredicateCall &Call) {
+    assert(!Call.Fn.isAlwaysTrue() && "Empty predicate string!");
+    assert(!is_contained(PredicateCalls, Call) && "predicate applied recursively");
+    PredicateCalls.push_back(Call);
+  }
+  void addPredicateCall(const TreePredicateFn &Fn, unsigned Scope) {
+    assert((Scope != 0) == Fn.usesOperands());
+    addPredicateCall(TreePredicateCall(Fn, Scope));
   }
 
   Record *getTransformFn() const { return TransformFn; }
@@ -1081,6 +1153,8 @@ class CodeGenDAGPatterns {
   using PatternRewriterFn = std::function<void (TreePattern *)>;
   PatternRewriterFn PatternRewriter;
 
+  unsigned NumScopes = 0;
+
 public:
   CodeGenDAGPatterns(RecordKeeper &R,
                      PatternRewriterFn PatternRewriter = nullptr);
@@ -1196,6 +1270,8 @@ public:
 
   bool hasTargetIntrinsics() { return !TgtIntrinsics.empty(); }
 
+  unsigned allocateScope() { return ++NumScopes; }
+
 private:
   void ParseNodeInfo();
   void ParseNodeTransforms();
@@ -1218,7 +1294,8 @@ private:
   void FindPatternInputsAndOutputs(
       TreePattern &I, TreePatternNodePtr Pat,
       std::map<std::string, TreePatternNodePtr> &InstInputs,
-      std::map<std::string, TreePatternNodePtr> &InstResults,
+      MapVector<std::string, TreePatternNodePtr,
+                std::map<std::string, unsigned>> &InstResults,
       std::vector<Record *> &InstImpResults);
 };
 
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index a6ca400914b49d1df4dd88e5d11189ac16d88924..6d06ba2c8b6768e6a1b651d3dfa9964776916143 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -202,7 +202,8 @@ CGIOperandList::ParseOperandName(const std::string &Op, bool AllowWholeOp) {
   return std::make_pair(0U, 0U);
 }
 
-static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) {
+static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops,
+                            Record *Rec) {
   // EARLY_CLOBBER: @early $reg
   std::string::size_type wpos = CStr.find_first_of(" \t");
   std::string::size_type start = CStr.find_first_not_of(" \t");
@@ -211,13 +212,17 @@ static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) {
     std::string Name = CStr.substr(wpos+1);
     wpos = Name.find_first_not_of(" \t");
     if (wpos == std::string::npos)
-      PrintFatalError("Illegal format for @earlyclobber constraint: '" + CStr + "'");
+      PrintFatalError(
+        Rec->getLoc(), "Illegal format for @earlyclobber constraint in '" +
+        Rec->getName() + "': '" + CStr + "'");
     Name = Name.substr(wpos);
     std::pair<unsigned,unsigned> Op = Ops.ParseOperandName(Name, false);
 
     // Build the string for the operand
     if (!Ops[Op.first].Constraints[Op.second].isNone())
-      PrintFatalError("Operand '" + Name + "' cannot have multiple constraints!");
+      PrintFatalError(
+        Rec->getLoc(), "Operand '" + Name + "' of '" + Rec->getName() +
+        "' cannot have multiple constraints!");
     Ops[Op.first].Constraints[Op.second] =
     CGIOperandList::ConstraintInfo::getEarlyClobber();
     return;
@@ -225,39 +230,73 @@ static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) {
 
   // Only other constraint is "TIED_TO" for now.
   std::string::size_type pos = CStr.find_first_of('=');
-  assert(pos != std::string::npos && "Unrecognized constraint");
+  if (pos == std::string::npos)
+    PrintFatalError(
+      Rec->getLoc(), "Unrecognized constraint '" + CStr +
+      "' in '" + Rec->getName() + "'");
   start = CStr.find_first_not_of(" \t");
-  std::string Name = CStr.substr(start, pos - start);
 
   // TIED_TO: $src1 = $dst
-  wpos = Name.find_first_of(" \t");
+  wpos = CStr.find_first_of(" \t", start);
+  if (wpos == std::string::npos || wpos > pos)
+    PrintFatalError(
+      Rec->getLoc(), "Illegal format for tied-to constraint in '" +
+      Rec->getName() + "': '" + CStr + "'");
+  std::string LHSOpName = StringRef(CStr).substr(start, wpos - start);
+  std::pair<unsigned,unsigned> LHSOp = Ops.ParseOperandName(LHSOpName, false);
+
+  wpos = CStr.find_first_not_of(" \t", pos + 1);
   if (wpos == std::string::npos)
-    PrintFatalError("Illegal format for tied-to constraint: '" + CStr + "'");
-  std::string DestOpName = Name.substr(0, wpos);
-  std::pair<unsigned,unsigned> DestOp = Ops.ParseOperandName(DestOpName, false);
-
-  Name = CStr.substr(pos+1);
-  wpos = Name.find_first_not_of(" \t");
-  if (wpos == std::string::npos)
-    PrintFatalError("Illegal format for tied-to constraint: '" + CStr + "'");
+    PrintFatalError(
+      Rec->getLoc(), "Illegal format for tied-to constraint: '" + CStr + "'");
+
+  std::string RHSOpName = StringRef(CStr).substr(wpos);
+  std::pair<unsigned,unsigned> RHSOp = Ops.ParseOperandName(RHSOpName, false);
+
+  // Sort the operands into order, which should put the output one
+  // first. But keep the original order, for use in diagnostics.
+  bool FirstIsDest = (LHSOp < RHSOp);
+  std::pair<unsigned,unsigned> DestOp = (FirstIsDest ? LHSOp : RHSOp);
+  StringRef DestOpName = (FirstIsDest ? LHSOpName : RHSOpName);
+  std::pair<unsigned,unsigned> SrcOp = (FirstIsDest ? RHSOp : LHSOp);
+  StringRef SrcOpName = (FirstIsDest ? RHSOpName : LHSOpName);
+
+  // Ensure one operand is a def and the other is a use.
+  if (DestOp.first >= Ops.NumDefs)
+    PrintFatalError(
+      Rec->getLoc(), "Input operands '" + LHSOpName + "' and '" + RHSOpName +
+      "' of '" + Rec->getName() + "' cannot be tied!");
+  if (SrcOp.first < Ops.NumDefs)
+    PrintFatalError(
+      Rec->getLoc(), "Output operands '" + LHSOpName + "' and '" + RHSOpName +
+      "' of '" + Rec->getName() + "' cannot be tied!");
+
+  // The constraint has to go on the operand with higher index, i.e.
+  // the source one. Check there isn't another constraint there
+  // already.
+  if (!Ops[SrcOp.first].Constraints[SrcOp.second].isNone())
+    PrintFatalError(
+      Rec->getLoc(), "Operand '" + SrcOpName + "' of '" + Rec->getName() +
+      "' cannot have multiple constraints!");
 
-  std::string SrcOpName = Name.substr(wpos);
-  std::pair<unsigned,unsigned> SrcOp = Ops.ParseOperandName(SrcOpName, false);
-  if (SrcOp > DestOp) {
-    std::swap(SrcOp, DestOp);
-    std::swap(SrcOpName, DestOpName);
+  unsigned DestFlatOpNo = Ops.getFlattenedOperandNumber(DestOp);
+  auto NewConstraint = CGIOperandList::ConstraintInfo::getTied(DestFlatOpNo);
+
+  // Check that the earlier operand is not the target of another tie
+  // before making it the target of this one.
+  for (const CGIOperandList::OperandInfo &Op : Ops) {
+    for (unsigned i = 0; i < Op.MINumOperands; i++)
+      if (Op.Constraints[i] == NewConstraint)
+        PrintFatalError(
+          Rec->getLoc(), "Operand '" + DestOpName + "' of '" + Rec->getName() +
+          "' cannot have multiple operands tied to it!");
   }
 
-  unsigned FlatOpNo = Ops.getFlattenedOperandNumber(SrcOp);
-
-  if (!Ops[DestOp.first].Constraints[DestOp.second].isNone())
-    PrintFatalError("Operand '" + DestOpName +
-      "' cannot have multiple constraints!");
-  Ops[DestOp.first].Constraints[DestOp.second] =
-    CGIOperandList::ConstraintInfo::getTied(FlatOpNo);
+  Ops[SrcOp.first].Constraints[SrcOp.second] = NewConstraint;
 }
 
-static void ParseConstraints(const std::string &CStr, CGIOperandList &Ops) {
+static void ParseConstraints(const std::string &CStr, CGIOperandList &Ops,
+                             Record *Rec) {
   if (CStr.empty()) return;
 
   const std::string delims(",");
@@ -269,7 +308,7 @@ static void ParseConstraints(const std::string &CStr, CGIOperandList &Ops) {
     if (eidx == std::string::npos)
       eidx = CStr.length();
 
-    ParseConstraint(CStr.substr(bidx, eidx - bidx), Ops);
+    ParseConstraint(CStr.substr(bidx, eidx - bidx), Ops, Rec);
     bidx = CStr.find_first_not_of(delims, eidx);
   }
 }
@@ -331,6 +370,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
   isConvergent = R->getValueAsBit("isConvergent");
   hasNoSchedulingInfo = R->getValueAsBit("hasNoSchedulingInfo");
   FastISelShouldIgnore = R->getValueAsBit("FastISelShouldIgnore");
+  variadicOpsAreDefs = R->getValueAsBit("variadicOpsAreDefs");
 
   bool Unset;
   mayLoad      = R->getValueAsBitOrUnset("mayLoad", Unset);
@@ -353,7 +393,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
   hasChain_Inferred = false;
 
   // Parse Constraints.
-  ParseConstraints(R->getValueAsString("Constraints"), Operands);
+  ParseConstraints(R->getValueAsString("Constraints"), Operands, R);
 
   // Parse the DisableEncoding field.
   Operands.ProcessDisableEncoding(R->getValueAsString("DisableEncoding"));
diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h
index 6f79b4bcc5cd9abf71779baf957e9241996fe1a6..2e3d2f48a928f648fd27d5e847e8bb1bd78d6df3 100644
--- a/utils/TableGen/CodeGenInstruction.h
+++ b/utils/TableGen/CodeGenInstruction.h
@@ -57,6 +57,17 @@ template <typename T> class ArrayRef;
         assert(isTied());
         return OtherTiedOperand;
       }
+
+      bool operator==(const ConstraintInfo &RHS) const {
+        if (Kind != RHS.Kind)
+          return false;
+        if (Kind == Tied && OtherTiedOperand != RHS.OtherTiedOperand)
+          return false;
+        return true;
+      }
+      bool operator!=(const ConstraintInfo &RHS) const {
+        return !(*this == RHS);
+      }
     };
 
     /// OperandInfo - The information we keep track of for each operand in the
@@ -264,6 +275,7 @@ template <typename T> class ArrayRef;
     bool FastISelShouldIgnore : 1;
     bool hasChain : 1;
     bool hasChain_Inferred : 1;
+    bool variadicOpsAreDefs : 1;
 
     std::string DeprecatedReason;
     bool HasComplexDeprecationPredicate;
diff --git a/utils/TableGen/CodeGenIntrinsics.h b/utils/TableGen/CodeGenIntrinsics.h
index 5d07159591207e5ce0e180c7edd17a0611952230..9487a79c143238d20853564d4f562b4eb78bb7b0 100644
--- a/utils/TableGen/CodeGenIntrinsics.h
+++ b/utils/TableGen/CodeGenIntrinsics.h
@@ -124,6 +124,9 @@ struct CodeGenIntrinsic {
   /// True if the intrinsic is no-return.
   bool isNoReturn;
 
+  /// True if the intrinsic is cold.
+  bool isCold;
+
   /// True if the intrinsic is marked as convergent.
   bool isConvergent;
 
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index e70a79f08afc3c4c3378c9c216df4f6241f3de60..74a2b078dfb321261f3fc89b267afe64b3e6a2bd 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/IntEqClasses.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
@@ -1309,6 +1310,55 @@ getConcatSubRegIndex(const SmallVector<CodeGenSubRegIndex *, 8> &Parts) {
 }
 
 void CodeGenRegBank::computeComposites() {
+  using RegMap = std::map<const CodeGenRegister*, const CodeGenRegister*>;
+
+  // Subreg -> { Reg->Reg }, where the right-hand side is the mapping from
+  // register to (sub)register associated with the action of the left-hand
+  // side subregister.
+  std::map<const CodeGenSubRegIndex*, RegMap> SubRegAction;
+  for (const CodeGenRegister &R : Registers) {
+    const CodeGenRegister::SubRegMap &SM = R.getSubRegs();
+    for (std::pair<const CodeGenSubRegIndex*, const CodeGenRegister*> P : SM)
+      SubRegAction[P.first].insert({&R, P.second});
+  }
+
+  // Calculate the composition of two subregisters as compositions of their
+  // associated actions.
+  auto compose = [&SubRegAction] (const CodeGenSubRegIndex *Sub1,
+                                  const CodeGenSubRegIndex *Sub2) {
+    RegMap C;
+    const RegMap &Img1 = SubRegAction.at(Sub1);
+    const RegMap &Img2 = SubRegAction.at(Sub2);
+    for (std::pair<const CodeGenRegister*, const CodeGenRegister*> P : Img1) {
+      auto F = Img2.find(P.second);
+      if (F != Img2.end())
+        C.insert({P.first, F->second});
+    }
+    return C;
+  };
+
+  // Check if the two maps agree on the intersection of their domains.
+  auto agree = [] (const RegMap &Map1, const RegMap &Map2) {
+    // Technically speaking, an empty map agrees with any other map, but
+    // this could flag false positives. We're interested in non-vacuous
+    // agreements.
+    if (Map1.empty() || Map2.empty())
+      return false;
+    for (std::pair<const CodeGenRegister*, const CodeGenRegister*> P : Map1) {
+      auto F = Map2.find(P.first);
+      if (F == Map2.end() || P.second != F->second)
+        return false;
+    }
+    return true;
+  };
+
+  using CompositePair = std::pair<const CodeGenSubRegIndex*,
+                                  const CodeGenSubRegIndex*>;
+  SmallSet<CompositePair,4> UserDefined;
+  for (const CodeGenSubRegIndex &Idx : SubRegIndices)
+    for (auto P : Idx.getComposites())
+      UserDefined.insert(std::make_pair(&Idx, P.first));
+
   // Keep track of TopoSigs visited. We only need to visit each TopoSig once,
   // and many registers will share TopoSigs on regular architectures.
   BitVector TopoSigs(getNumTopoSigs());
@@ -1341,11 +1391,15 @@ void CodeGenRegBank::computeComposites() {
         assert(Idx3 && "Sub-register doesn't have an index");
 
         // Conflicting composition? Emit a warning but allow it.
-        if (CodeGenSubRegIndex *Prev = Idx1->addComposite(Idx2, Idx3))
-          PrintWarning(Twine("SubRegIndex ") + Idx1->getQualifiedName() +
-                       " and " + Idx2->getQualifiedName() +
-                       " compose ambiguously as " + Prev->getQualifiedName() +
-                       " or " + Idx3->getQualifiedName());
+        if (CodeGenSubRegIndex *Prev = Idx1->addComposite(Idx2, Idx3)) {
+          // If the composition was not user-defined, always emit a warning.
+          if (!UserDefined.count({Idx1, Idx2}) ||
+              agree(compose(Idx1, Idx2), SubRegAction.at(Idx3)))
+            PrintWarning(Twine("SubRegIndex ") + Idx1->getQualifiedName() +
+                         " and " + Idx2->getQualifiedName() +
+                         " compose ambiguously as " + Prev->getQualifiedName() +
+                         " or " + Idx3->getQualifiedName());
+        }          
       }
     }
   }
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index a9a36a87ef3fa9beab44c4e189e0b6214f335011..6d259cbb33ee3e2908cfa612a881ccf6c5802c71 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -479,6 +479,35 @@ void CodeGenSchedModels::collectRetireControlUnits() {
   }
 }
 
+void CodeGenSchedModels::collectLoadStoreQueueInfo() {
+  RecVec Queues = Records.getAllDerivedDefinitions("MemoryQueue");
+
+  for (Record *Queue : Queues) {
+    CodeGenProcModel &PM = getProcModel(Queue->getValueAsDef("SchedModel"));
+    if (Queue->isSubClassOf("LoadQueue")) {
+      if (PM.LoadQueue) {
+        PrintError(Queue->getLoc(),
+                   "Expected a single LoadQueue definition");
+        PrintNote(PM.LoadQueue->getLoc(),
+                  "Previous definition of LoadQueue was here");
+      }
+
+      PM.LoadQueue = Queue;
+    }
+
+    if (Queue->isSubClassOf("StoreQueue")) {
+      if (PM.StoreQueue) {
+        PrintError(Queue->getLoc(),
+                   "Expected a single StoreQueue definition");
+        PrintNote(PM.LoadQueue->getLoc(),
+                  "Previous definition of StoreQueue was here");
+      }
+
+      PM.StoreQueue = Queue;
+    }
+  }
+}
+
 /// Collect optional processor information.
 void CodeGenSchedModels::collectOptionalProcessorInfo() {
   // Find register file definitions for each processor.
@@ -487,6 +516,9 @@ void CodeGenSchedModels::collectOptionalProcessorInfo() {
   // Collect processor RetireControlUnit descriptors if available.
   collectRetireControlUnits();
 
+  // Collect information about load/store queues.
+  collectLoadStoreQueueInfo();
+
   checkCompleteness();
 }
 
diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
index 9bde5f4e75946710cbe7b022f3155773c6b114f8..87a051b0c05ec6cc6bc77540e1820ddc98ce8ef6 100644
--- a/utils/TableGen/CodeGenSchedule.h
+++ b/utils/TableGen/CodeGenSchedule.h
@@ -246,10 +246,14 @@ struct CodeGenProcModel {
   // Optional Retire Control Unit definition.
   Record *RetireControlUnit;
 
+  // Load/Store queue descriptors.
+  Record *LoadQueue;
+  Record *StoreQueue;
+
   CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
                    Record *IDef) :
     Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
-    RetireControlUnit(nullptr) {}
+    RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}
 
   bool hasItineraries() const {
     return !ItinsDef->getValueAsListOfDefs("IID").empty();
@@ -260,7 +264,8 @@ struct CodeGenProcModel {
   }
 
   bool hasExtraProcessorInfo() const {
-    return RetireControlUnit || !RegisterFiles.empty();
+    return RetireControlUnit || LoadQueue || StoreQueue ||
+           !RegisterFiles.empty();
   }
 
   unsigned getProcResourceIdx(Record *PRDef) const;
@@ -607,6 +612,8 @@ private:
 
   void collectSTIPredicates();
 
+  void collectLoadStoreQueueInfo();
+
   void checkCompleteness();
 
   void inferFromRW(ArrayRef<unsigned> OperWrites, ArrayRef<unsigned> OperReads,
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 305d2d19ff4dbfaebbe6585f9ba017875a38a173..bcb653135551f7321aa561403a5686b5de092721 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -536,6 +536,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
   isCommutative = false;
   canThrow = false;
   isNoReturn = false;
+  isCold = false;
   isNoDuplicate = false;
   isConvergent = false;
   isSpeculatable = false;
@@ -682,6 +683,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
       isConvergent = true;
     else if (Property->getName() == "IntrNoReturn")
       isNoReturn = true;
+    else if (Property->getName() == "IntrCold")
+      isCold = true;
     else if (Property->getName() == "IntrSpeculatable")
       isSpeculatable = true;
     else if (Property->getName() == "IntrHasSideEffects")
diff --git a/utils/TableGen/DAGISelMatcher.cpp b/utils/TableGen/DAGISelMatcher.cpp
index 4a918d15691b1123c2fc462c3154c5b9c481afb5..c8e005739460c56e74d7bab81a5d3fa24e1f9952 100644
--- a/utils/TableGen/DAGISelMatcher.cpp
+++ b/utils/TableGen/DAGISelMatcher.cpp
@@ -93,13 +93,23 @@ SwitchTypeMatcher::~SwitchTypeMatcher() {
     delete C.second;
 }
 
-CheckPredicateMatcher::CheckPredicateMatcher(const TreePredicateFn &pred)
-  : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()) {}
+CheckPredicateMatcher::CheckPredicateMatcher(
+    const TreePredicateFn &pred, const SmallVectorImpl<unsigned> &Ops)
+  : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()),
+    Operands(Ops.begin(), Ops.end()) {}
 
 TreePredicateFn CheckPredicateMatcher::getPredicate() const {
   return TreePredicateFn(Pred);
 }
 
+unsigned CheckPredicateMatcher::getNumOperands() const {
+  return Operands.size();
+}
+
+unsigned CheckPredicateMatcher::getOperandNo(unsigned i) const {
+  assert(i < Operands.size());
+  return Operands[i];
+}
 
 
 // printImpl methods.
diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h
index ecc1f1dd094a291220da2458f7d6caa34a679983..9be7295c67d4a99fb6473706d7abd8e6a185c549 100644
--- a/utils/TableGen/DAGISelMatcher.h
+++ b/utils/TableGen/DAGISelMatcher.h
@@ -414,10 +414,14 @@ private:
 /// see if the node is acceptable.
 class CheckPredicateMatcher : public Matcher {
   TreePattern *Pred;
+  const SmallVector<unsigned, 4> Operands;
 public:
-  CheckPredicateMatcher(const TreePredicateFn &pred);
+  CheckPredicateMatcher(const TreePredicateFn &pred,
+                        const SmallVectorImpl<unsigned> &Operands);
 
   TreePredicateFn getPredicate() const;
+  unsigned getNumOperands() const;
+  unsigned getOperandNo(unsigned i) const;
 
   static bool classof(const Matcher *N) {
     return N->getKind() == CheckPredicate;
diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index e64943c1d0257f84ab9481ac3061293b8defda34..90ca1bff53443f90529c44e27180ebf85edc1a2b 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -50,6 +50,7 @@ class MatcherTableEmitter {
 
   DenseMap<TreePattern *, unsigned> NodePredicateMap;
   std::vector<TreePredicateFn> NodePredicates;
+  std::vector<TreePredicateFn> NodePredicatesWithOperands;
 
   // We de-duplicate the predicates by code string, and use this map to track
   // all the patterns with "identical" predicates.
@@ -92,6 +93,9 @@ public:
   void EmitPatternMatchTable(raw_ostream &OS);
 
 private:
+  void EmitNodePredicatesFunction(const std::vector<TreePredicateFn> &Preds,
+                                  StringRef Decl, raw_ostream &OS);
+
   unsigned EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
                        raw_ostream &OS);
 
@@ -103,12 +107,20 @@ private:
           NodePredicatesByCodeToRun[Pred.getCodeToRunOnSDNode()];
       if (SameCodePreds.empty()) {
         // We've never seen a predicate with the same code: allocate an entry.
-        NodePredicates.push_back(Pred);
-        Entry = NodePredicates.size();
+        if (Pred.usesOperands()) {
+          NodePredicatesWithOperands.push_back(Pred);
+          Entry = NodePredicatesWithOperands.size();
+        } else {
+          NodePredicates.push_back(Pred);
+          Entry = NodePredicates.size();
+        }
       } else {
         // We did see an identical predicate: re-use it.
         Entry = NodePredicateMap[SameCodePreds.front()];
         assert(Entry != 0);
+        assert(TreePredicateFn(SameCodePreds.front()).usesOperands() ==
+               Pred.usesOperands() &&
+               "PatFrags with some code must have same usesOperands setting");
       }
       // In both cases, we've never seen this particular predicate before, so
       // mark it in the list of predicates sharing the same code.
@@ -396,11 +408,23 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
   }
   case Matcher::CheckPredicate: {
     TreePredicateFn Pred = cast<CheckPredicateMatcher>(N)->getPredicate();
-    OS << "OPC_CheckPredicate, " << getNodePredicate(Pred) << ',';
+    unsigned OperandBytes = 0;
+
+    if (Pred.usesOperands()) {
+      unsigned NumOps = cast<CheckPredicateMatcher>(N)->getNumOperands();
+      OS << "OPC_CheckPredicateWithOperands, " << NumOps << "/*#Ops*/, ";
+      for (unsigned i = 0; i < NumOps; ++i)
+        OS << cast<CheckPredicateMatcher>(N)->getOperandNo(i) << ", ";
+      OperandBytes = 1 + NumOps;
+    } else {
+      OS << "OPC_CheckPredicate, ";
+    }
+
+    OS << getNodePredicate(Pred) << ',';
     if (!OmitComments)
       OS << " // " << Pred.getFnName();
     OS << '\n';
-    return 2;
+    return 2 + OperandBytes;
   }
 
   case Matcher::CheckOpcode:
@@ -783,6 +807,33 @@ EmitMatcherList(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
   return Size;
 }
 
+void MatcherTableEmitter::EmitNodePredicatesFunction(
+    const std::vector<TreePredicateFn> &Preds, StringRef Decl,
+    raw_ostream &OS) {
+  if (Preds.empty())
+    return;
+
+  BeginEmitFunction(OS, "bool", Decl, true/*AddOverride*/);
+  OS << "{\n";
+  OS << "  switch (PredNo) {\n";
+  OS << "  default: llvm_unreachable(\"Invalid predicate in table?\");\n";
+  for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+    // Emit the predicate code corresponding to this pattern.
+    TreePredicateFn PredFn = Preds[i];
+
+    assert(!PredFn.isAlwaysTrue() && "No code in this predicate");
+    OS << "  case " << i << ": { \n";
+    for (auto *SimilarPred :
+          NodePredicatesByCodeToRun[PredFn.getCodeToRunOnSDNode()])
+      OS << "    // " << TreePredicateFn(SimilarPred).getFnName() <<'\n';
+
+    OS << PredFn.getCodeToRunOnSDNode() << "\n  }\n";
+  }
+  OS << "  }\n";
+  OS << "}\n";
+  EndEmitFunction(OS);
+}
+
 void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) {
   // Emit pattern predicates.
   if (!PatternPredicates.empty()) {
@@ -799,29 +850,14 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) {
   }
 
   // Emit Node predicates.
-  if (!NodePredicates.empty()) {
-    BeginEmitFunction(OS, "bool",
-          "CheckNodePredicate(SDNode *Node, unsigned PredNo) const",
-          true/*AddOverride*/);
-    OS << "{\n";
-    OS << "  switch (PredNo) {\n";
-    OS << "  default: llvm_unreachable(\"Invalid predicate in table?\");\n";
-    for (unsigned i = 0, e = NodePredicates.size(); i != e; ++i) {
-      // Emit the predicate code corresponding to this pattern.
-      TreePredicateFn PredFn = NodePredicates[i];
-
-      assert(!PredFn.isAlwaysTrue() && "No code in this predicate");
-      OS << "  case " << i << ": { \n";
-      for (auto *SimilarPred :
-           NodePredicatesByCodeToRun[PredFn.getCodeToRunOnSDNode()])
-        OS << "    // " << TreePredicateFn(SimilarPred).getFnName() <<'\n';
-
-      OS << PredFn.getCodeToRunOnSDNode() << "\n  }\n";
-    }
-    OS << "  }\n";
-    OS << "}\n";
-    EndEmitFunction(OS);
-  }
+  EmitNodePredicatesFunction(
+      NodePredicates, "CheckNodePredicate(SDNode *Node, unsigned PredNo) const",
+      OS);
+  EmitNodePredicatesFunction(
+      NodePredicatesWithOperands,
+      "CheckNodePredicateWithOperands(SDNode *Node, unsigned PredNo, "
+      "const SmallVectorImpl<SDValue> &Operands) const",
+      OS);
 
   // Emit CompletePattern matchers.
   // FIXME: This should be const.
diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp
index f8c8a16efcd0cf60254d6871c84823a5564df77f..612342ddcddf14b41ba7936efea972752fb8d6fa 100644
--- a/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/utils/TableGen/DAGISelMatcherGen.cpp
@@ -120,7 +120,7 @@ namespace {
     /// If this is the first time a node with unique identifier Name has been
     /// seen, record it. Otherwise, emit a check to make sure this is the same
     /// node. Returns true if this is the first encounter.
-    bool recordUniqueNode(const std::string &Name);
+    bool recordUniqueNode(ArrayRef<std::string> Names);
 
     // Result Code Generation.
     unsigned getNamedArgumentSlot(StringRef Name) {
@@ -319,8 +319,8 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N,
   // to handle this.
   if ((N->getOperator()->getName() == "and" ||
        N->getOperator()->getName() == "or") &&
-      N->getChild(1)->isLeaf() && N->getChild(1)->getPredicateFns().empty() &&
-      N->getPredicateFns().empty()) {
+      N->getChild(1)->isLeaf() && N->getChild(1)->getPredicateCalls().empty() &&
+      N->getPredicateCalls().empty()) {
     if (IntInit *II = dyn_cast<IntInit>(N->getChild(1)->getLeafValue())) {
       if (!isPowerOf2_32(II->getValue())) {  // Don't bother with single bits.
         // If this is at the root of the pattern, we emit a redundant
@@ -441,21 +441,39 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N,
   }
 }
 
-bool MatcherGen::recordUniqueNode(const std::string &Name) {
-  unsigned &VarMapEntry = VariableMap[Name];
-  if (VarMapEntry == 0) {
+bool MatcherGen::recordUniqueNode(ArrayRef<std::string> Names) {
+  unsigned Entry = 0;
+  for (const std::string &Name : Names) {
+    unsigned &VarMapEntry = VariableMap[Name];
+    if (!Entry)
+      Entry = VarMapEntry;
+    assert(Entry == VarMapEntry);
+  }
+
+  bool NewRecord = false;
+  if (Entry == 0) {
     // If it is a named node, we must emit a 'Record' opcode.
-    AddMatcher(new RecordMatcher("$" + Name, NextRecordedOperandNo));
-    VarMapEntry = ++NextRecordedOperandNo;
-    return true;
+    std::string WhatFor;
+    for (const std::string &Name : Names) {
+      if (!WhatFor.empty())
+        WhatFor += ',';
+      WhatFor += "$" + Name;
+    }
+    AddMatcher(new RecordMatcher(WhatFor, NextRecordedOperandNo));
+    Entry = ++NextRecordedOperandNo;
+    NewRecord = true;
+  } else {
+    // If we get here, this is a second reference to a specific name.  Since
+    // we already have checked that the first reference is valid, we don't
+    // have to recursively match it, just check that it's the same as the
+    // previously named thing.
+    AddMatcher(new CheckSameMatcher(Entry-1));
   }
 
-  // If we get here, this is a second reference to a specific name.  Since
-  // we already have checked that the first reference is valid, we don't
-  // have to recursively match it, just check that it's the same as the
-  // previously named thing.
-  AddMatcher(new CheckSameMatcher(VarMapEntry-1));
-  return false;
+  for (const std::string &Name : Names)
+    VariableMap[Name] = Entry;
+
+  return NewRecord;
 }
 
 void MatcherGen::EmitMatchCode(const TreePatternNode *N,
@@ -475,9 +493,18 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N,
 
   // If this node has a name associated with it, capture it in VariableMap. If
   // we already saw this in the pattern, emit code to verify dagness.
+  SmallVector<std::string, 4> Names;
   if (!N->getName().empty())
-    if (!recordUniqueNode(N->getName()))
+    Names.push_back(N->getName());
+
+  for (const ScopedName &Name : N->getNamesAsPredicateArg()) {
+    Names.push_back(("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str());
+  }
+
+  if (!Names.empty()) {
+    if (!recordUniqueNode(Names))
       return;
+  }
 
   if (N->isLeaf())
     EmitLeafMatchCode(N);
@@ -485,8 +512,19 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N,
     EmitOperatorMatchCode(N, NodeNoTypes, ForceMode);
 
   // If there are node predicates for this node, generate their checks.
-  for (unsigned i = 0, e = N->getPredicateFns().size(); i != e; ++i)
-    AddMatcher(new CheckPredicateMatcher(N->getPredicateFns()[i]));
+  for (unsigned i = 0, e = N->getPredicateCalls().size(); i != e; ++i) {
+    const TreePredicateCall &Pred = N->getPredicateCalls()[i];
+    SmallVector<unsigned, 4> Operands;
+    if (Pred.Fn.usesOperands()) {
+      TreePattern *TP = Pred.Fn.getOrigPatFragRecord();
+      for (unsigned i = 0; i < TP->getNumArgs(); ++i) {
+        std::string Name =
+            ("pred:" + Twine(Pred.Scope) + ":" + TP->getArgName(i)).str();
+        Operands.push_back(getNamedArgumentSlot(Name));
+      }
+    }
+    AddMatcher(new CheckPredicateMatcher(Pred.Fn, Operands));
+  }
 
   for (unsigned i = 0, e = ResultsToTypeCheck.size(); i != e; ++i)
     AddMatcher(new CheckTypeMatcher(N->getSimpleType(ResultsToTypeCheck[i]),
@@ -962,9 +1000,16 @@ void MatcherGen::EmitResultCode() {
   }
 
   assert(Ops.size() >= NumSrcResults && "Didn't provide enough results");
-  Ops.resize(NumSrcResults);
+  SmallVector<unsigned, 8> Results(Ops);
+
+  // Apply result permutation.
+  for (unsigned ResNo = 0; ResNo < Pattern.getDstPattern()->getNumResults();
+       ++ResNo) {
+    Results[ResNo] = Ops[Pattern.getDstPattern()->getResultIndex(ResNo)];
+  }
 
-  AddMatcher(new CompleteMatchMatcher(Ops, Pattern));
+  Results.resize(NumSrcResults);
+  AddMatcher(new CompleteMatchMatcher(Results, Pattern));
 }
 
 
diff --git a/utils/TableGen/ExegesisEmitter.cpp b/utils/TableGen/ExegesisEmitter.cpp
index 083d7439451404f5450e3ce132e685a25df88604..208237aca20c8e36339cf4543700e60e7ee6e924 100644
--- a/utils/TableGen/ExegesisEmitter.cpp
+++ b/utils/TableGen/ExegesisEmitter.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
@@ -114,10 +115,6 @@ void ExegesisEmitter::emitPfmCountersInfo(const Record &Def,
   const size_t NumIssueCounters =
       Def.getValueAsListOfDefs("IssueCounters").size();
 
-  // This is the default, do not emit.
-  if (CycleCounter.empty() && UopsCounter.empty() && NumIssueCounters == 0)
-    return;
-
   OS << "\nstatic const PfmCountersInfo " << Target << Def.getName()
      << " = {\n";
 
@@ -157,28 +154,35 @@ void ExegesisEmitter::emitPfmCounters(raw_ostream &OS) const {
   // Emit the IssueCounters table.
   const auto PfmCounterDefs =
       Records.getAllDerivedDefinitions("ProcPfmCounters");
-  OS << "static const PfmCountersInfo::IssueCounter " << Target
-     << "PfmIssueCounters[] = {\n";
-  for (const Record *Def : PfmCounterDefs) {
-    for (const Record *ICDef : Def->getValueAsListOfDefs("IssueCounters"))
-      OS << "  { " << Target << "PfmCounterNames["
-         << getPfmCounterId(ICDef->getValueAsString("Counter")) << "], \""
-         << ICDef->getValueAsString("ResourceName") << "\"},\n";
+  // Only emit if non-empty.
+  const bool HasAtLeastOnePfmIssueCounter =
+      llvm::any_of(PfmCounterDefs, [](const Record *Def) {
+        return !Def->getValueAsListOfDefs("IssueCounters").empty();
+      });
+  if (HasAtLeastOnePfmIssueCounter) {
+    OS << "static const PfmCountersInfo::IssueCounter " << Target
+       << "PfmIssueCounters[] = {\n";
+    for (const Record *Def : PfmCounterDefs) {
+      for (const Record *ICDef : Def->getValueAsListOfDefs("IssueCounters"))
+        OS << "  { " << Target << "PfmCounterNames["
+           << getPfmCounterId(ICDef->getValueAsString("Counter")) << "], \""
+           << ICDef->getValueAsString("ResourceName") << "\"},\n";
+    }
+    OS << "};\n";
   }
 
-  OS << "};\n";
-
   // Now generate the PfmCountersInfo.
   unsigned IssueCountersTableOffset = 0;
   for (const Record *Def : PfmCounterDefs)
     emitPfmCountersInfo(*Def, IssueCountersTableOffset, OS);
 
   OS << "\n";
-}
+} // namespace
 
 void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const {
   std::vector<Record *> Bindings =
       Records.getAllDerivedDefinitions("PfmCountersBinding");
+  assert(!Bindings.empty() && "there must be at least one binding");
   llvm::sort(Bindings, [](const Record *L, const Record *R) {
     return L->getValueAsString("CpuName") < R->getValueAsString("CpuName");
   });
diff --git a/utils/TableGen/FastISelEmitter.cpp b/utils/TableGen/FastISelEmitter.cpp
index 6e04d1711cc56a025b562a22b2dcbe724c17618e..5134b684c6f9cdb5bfa5e458fa30b28d7bf3d685 100644
--- a/utils/TableGen/FastISelEmitter.cpp
+++ b/utils/TableGen/FastISelEmitter.cpp
@@ -210,13 +210,13 @@ struct OperandsSignature {
       // Handle imm operands specially.
       if (!Op->isLeaf() && Op->getOperator()->getName() == "imm") {
         unsigned PredNo = 0;
-        if (!Op->getPredicateFns().empty()) {
-          TreePredicateFn PredFn = Op->getPredicateFns()[0];
+        if (!Op->getPredicateCalls().empty()) {
+          TreePredicateFn PredFn = Op->getPredicateCalls()[0].Fn;
           // If there is more than one predicate weighing in on this operand
           // then we don't handle it.  This doesn't typically happen for
           // immediates anyway.
-          if (Op->getPredicateFns().size() > 1 ||
-              !PredFn.isImmediatePattern())
+          if (Op->getPredicateCalls().size() > 1 ||
+              !PredFn.isImmediatePattern() || PredFn.usesOperands())
             return false;
           // Ignore any instruction with 'FastIselShouldIgnore', these are
           // not needed and just bloat the fast instruction selector.  For
@@ -236,7 +236,7 @@ struct OperandsSignature {
 
       // For now, filter out any operand with a predicate.
       // For now, filter out any operand with multiple values.
-      if (!Op->getPredicateFns().empty() || Op->getNumTypes() != 1)
+      if (!Op->getPredicateCalls().empty() || Op->getNumTypes() != 1)
         return false;
 
       if (!Op->isLeaf()) {
@@ -529,7 +529,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
     }
 
     // For now, filter out any instructions with predicates.
-    if (!InstPatNode->getPredicateFns().empty())
+    if (!InstPatNode->getPredicateCalls().empty())
       continue;
 
     // Check all the operands.
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index 44cf6eadcb003e9f88a1383ff7001fc0e9b0a5fd..1206f7751a9ea203d9a030bc18a8262d8621bd37 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -87,8 +87,23 @@ struct DecoderTableInfo {
   DecoderSet Decoders;
 };
 
+struct EncodingAndInst {
+  const Record *EncodingDef;
+  const CodeGenInstruction *Inst;
+
+  EncodingAndInst(const Record *EncodingDef, const CodeGenInstruction *Inst)
+      : EncodingDef(EncodingDef), Inst(Inst) {}
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const EncodingAndInst &Value) {
+  if (Value.EncodingDef != Value.Inst->TheDef)
+    OS << Value.EncodingDef->getName() << ":";
+  OS << Value.Inst->TheDef->getName();
+  return OS;
+}
+
 class FixedLenDecoderEmitter {
-  ArrayRef<const CodeGenInstruction *> NumberedInstructions;
+  std::vector<EncodingAndInst> NumberedEncodings;
 
 public:
   // Defaults preserved here for documentation, even though they aren't
@@ -323,7 +338,7 @@ protected:
   friend class Filter;
 
   // Vector of codegen instructions to choose our filter.
-  ArrayRef<const CodeGenInstruction *> AllInstructions;
+  ArrayRef<EncodingAndInst> AllInstructions;
 
   // Vector of uid's for this filter chooser to work on.
   const std::vector<unsigned> &Opcodes;
@@ -351,25 +366,24 @@ protected:
   const FixedLenDecoderEmitter *Emitter;
 
 public:
-  FilterChooser(ArrayRef<const CodeGenInstruction *> Insts,
+  FilterChooser(ArrayRef<EncodingAndInst> Insts,
                 const std::vector<unsigned> &IDs,
                 const std::map<unsigned, std::vector<OperandInfo>> &Ops,
-                unsigned BW,
-                const FixedLenDecoderEmitter *E)
-    : AllInstructions(Insts), Opcodes(IDs), Operands(Ops),
-      FilterBitValues(BW, BIT_UNFILTERED), Parent(nullptr), BestIndex(-1),
-      BitWidth(BW), Emitter(E) {
+                unsigned BW, const FixedLenDecoderEmitter *E)
+      : AllInstructions(Insts), Opcodes(IDs), Operands(Ops),
+        FilterBitValues(BW, BIT_UNFILTERED), Parent(nullptr), BestIndex(-1),
+        BitWidth(BW), Emitter(E) {
     doFilter();
   }
 
-  FilterChooser(ArrayRef<const CodeGenInstruction *> Insts,
+  FilterChooser(ArrayRef<EncodingAndInst> Insts,
                 const std::vector<unsigned> &IDs,
                 const std::map<unsigned, std::vector<OperandInfo>> &Ops,
                 const std::vector<bit_value_t> &ParentFilterBitValues,
                 const FilterChooser &parent)
-    : AllInstructions(Insts), Opcodes(IDs), Operands(Ops),
-      FilterBitValues(ParentFilterBitValues), Parent(&parent), BestIndex(-1),
-      BitWidth(parent.BitWidth), Emitter(parent.Emitter) {
+      : AllInstructions(Insts), Opcodes(IDs), Operands(Ops),
+        FilterBitValues(ParentFilterBitValues), Parent(&parent), BestIndex(-1),
+        BitWidth(parent.BitWidth), Emitter(parent.Emitter) {
     doFilter();
   }
 
@@ -381,7 +395,7 @@ public:
 protected:
   // Populates the insn given the uid.
   void insnWithID(insn_t &Insn, unsigned Opcode) const {
-    BitsInit &Bits = getBitsField(*AllInstructions[Opcode]->TheDef, "Inst");
+    BitsInit &Bits = getBitsField(*AllInstructions[Opcode].EncodingDef, "Inst");
 
     // We may have a SoftFail bitmask, which specifies a mask where an encoding
     // may differ from the value in "Inst" and yet still be valid, but the
@@ -389,7 +403,7 @@ protected:
     //
     // This is used for marking UNPREDICTABLE instructions in the ARM world.
     BitsInit *SFBits =
-      AllInstructions[Opcode]->TheDef->getValueAsBitsInit("SoftFail");
+        AllInstructions[Opcode].EncodingDef->getValueAsBitsInit("SoftFail");
 
     for (unsigned i = 0; i < BitWidth; ++i) {
       if (SFBits && bitFromBits(*SFBits, i) == BIT_TRUE)
@@ -399,11 +413,6 @@ protected:
     }
   }
 
-  // Returns the record name.
-  const StringRef nameWithID(unsigned Opcode) const {
-    return AllInstructions[Opcode]->TheDef->getName();
-  }
-
   // Populates the field of the insn given the start position and the number of
   // consecutive bits to scan for.
   //
@@ -827,8 +836,7 @@ void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS,
       OS << (unsigned)*I++ << ", ";
 
       if (!IsTry) {
-        OS << "// Opcode: "
-           << NumberedInstructions[Opc]->TheDef->getName() << "\n";
+        OS << "// Opcode: " << NumberedEncodings[Opc] << "\n";
         break;
       }
 
@@ -845,8 +853,7 @@ void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS,
       OS << utostr(Byte) << ", ";
       NumToSkip |= Byte << 16;
 
-      OS << "// Opcode: "
-         << NumberedInstructions[Opc]->TheDef->getName()
+      OS << "// Opcode: " << NumberedEncodings[Opc]
          << ", skip to: " << ((I - Table.begin()) + NumToSkip) << "\n";
       break;
     }
@@ -1153,7 +1160,7 @@ static void emitSinglePredicateMatch(raw_ostream &o, StringRef str,
 bool FilterChooser::emitPredicateMatch(raw_ostream &o, unsigned &Indentation,
                                        unsigned Opc) const {
   ListInit *Predicates =
-    AllInstructions[Opc]->TheDef->getValueAsListInit("Predicates");
+      AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates");
   bool IsFirstEmission = true;
   for (unsigned i = 0; i < Predicates->size(); ++i) {
     Record *Pred = Predicates->getElementAsRecord(i);
@@ -1182,7 +1189,7 @@ bool FilterChooser::emitPredicateMatch(raw_ostream &o, unsigned &Indentation,
 
 bool FilterChooser::doesOpcodeNeedPredicate(unsigned Opc) const {
   ListInit *Predicates =
-    AllInstructions[Opc]->TheDef->getValueAsListInit("Predicates");
+      AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates");
   for (unsigned i = 0; i < Predicates->size(); ++i) {
     Record *Pred = Predicates->getElementAsRecord(i);
     if (!Pred->getValue("AssemblerMatcherPredicate"))
@@ -1247,9 +1254,10 @@ void FilterChooser::emitPredicateTableEntry(DecoderTableInfo &TableInfo,
 void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo,
                                            unsigned Opc) const {
   BitsInit *SFBits =
-    AllInstructions[Opc]->TheDef->getValueAsBitsInit("SoftFail");
+      AllInstructions[Opc].EncodingDef->getValueAsBitsInit("SoftFail");
   if (!SFBits) return;
-  BitsInit *InstBits = AllInstructions[Opc]->TheDef->getValueAsBitsInit("Inst");
+  BitsInit *InstBits =
+      AllInstructions[Opc].EncodingDef->getValueAsBitsInit("Inst");
 
   APInt PositiveMask(BitWidth, 0ULL);
   APInt NegativeMask(BitWidth, 0ULL);
@@ -1270,9 +1278,9 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo,
       break;
     default:
       // The bit is not set; this must be an error!
-      StringRef Name = AllInstructions[Opc]->TheDef->getName();
-      errs() << "SoftFail Conflict: bit SoftFail{" << i << "} in " << Name
-             << " is set but Inst{" << i << "} is unset!\n"
+      errs() << "SoftFail Conflict: bit SoftFail{" << i << "} in "
+             << AllInstructions[Opc] << " is set but Inst{" << i
+             << "} is unset!\n"
              << "  - You can only mark a bit as SoftFail if it is fully defined"
              << " (1/0 - not '?') in Inst\n";
       return;
@@ -1709,9 +1717,9 @@ void FilterChooser::emitTableEntries(DecoderTableInfo &TableInfo) const {
   dumpStack(errs(), "\t\t");
 
   for (unsigned i = 0; i < Opcodes.size(); ++i) {
-    errs() << '\t' << nameWithID(Opcodes[i]) << " ";
+    errs() << '\t' << Opcodes[i] << " ";
     dumpBits(errs(),
-             getBitsField(*AllInstructions[Opcodes[i]]->TheDef, "Inst"));
+             getBitsField(*AllInstructions[Opcodes[i]].EncodingDef, "Inst"));
     errs() << '\n';
   }
 }
@@ -2326,13 +2334,17 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) {
   Target.reverseBitsForLittleEndianEncoding();
 
   // Parameterize the decoders based on namespace and instruction width.
-  NumberedInstructions = Target.getInstructionsByEnumValue();
+  const auto &NumberedInstructions = Target.getInstructionsByEnumValue();
+  NumberedEncodings.reserve(NumberedInstructions.size());
+  for (const auto &NumberedInstruction : NumberedInstructions)
+    NumberedEncodings.emplace_back(NumberedInstruction->TheDef, NumberedInstruction);
+
   std::map<std::pair<std::string, unsigned>,
            std::vector<unsigned>> OpcMap;
   std::map<unsigned, std::vector<OperandInfo>> Operands;
 
-  for (unsigned i = 0; i < NumberedInstructions.size(); ++i) {
-    const CodeGenInstruction *Inst = NumberedInstructions[i];
+  for (unsigned i = 0; i < NumberedEncodings.size(); ++i) {
+    const CodeGenInstruction *Inst = NumberedEncodings[i].Inst;
     const Record *Def = Inst->TheDef;
     unsigned Size = Def->getValueAsInt("Size");
     if (Def->getValueAsString("Namespace") == "TargetOpcode" ||
@@ -2353,8 +2365,10 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) {
   DecoderTableInfo TableInfo;
   for (const auto &Opc : OpcMap) {
     // Emit the decoder for this namespace+width combination.
-    FilterChooser FC(NumberedInstructions, Opc.second, Operands,
-                     8*Opc.first.second, this);
+    ArrayRef<EncodingAndInst> NumberedEncodingsRef(NumberedEncodings.data(),
+                                                   NumberedEncodings.size());
+    FilterChooser FC(NumberedEncodingsRef, Opc.second, Operands,
+                     8 * Opc.first.second, this);
 
     // The decode table is cleared for each top level decoder function. The
     // predicates and decoders themselves, however, are shared across all
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index ad0ee3dfbc4205be5884f0db0f165990d0aad927..997ceb12becd053960fde0efcf912e03c69fde4d 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -200,7 +200,8 @@ static Optional<LLTCodeGen> MVTToLLT(MVT::SimpleValueType SVT) {
 static std::string explainPredicates(const TreePatternNode *N) {
   std::string Explanation = "";
   StringRef Separator = "";
-  for (const auto &P : N->getPredicateFns()) {
+  for (const TreePredicateCall &Call : N->getPredicateCalls()) {
+    const TreePredicateFn &P = Call.Fn;
     Explanation +=
         (Separator + P.getOrigPatFragRecord()->getRecord()->getName()).str();
     Separator = ", ";
@@ -284,7 +285,9 @@ static Error isTrivialOperatorNode(const TreePatternNode *N) {
   std::string Separator = "";
 
   bool HasUnsupportedPredicate = false;
-  for (const auto &Predicate : N->getPredicateFns()) {
+  for (const TreePredicateCall &Call : N->getPredicateCalls()) {
+    const TreePredicateFn &Predicate = Call.Fn;
+
     if (Predicate.isAlwaysTrue())
       continue;
 
@@ -3117,7 +3120,8 @@ Record *GlobalISelEmitter::findNodeEquiv(Record *N) const {
 
 const CodeGenInstruction *
 GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode *N) const {
-  for (const auto &Predicate : N->getPredicateFns()) {
+  for (const TreePredicateCall &Call : N->getPredicateCalls()) {
+    const TreePredicateFn &Predicate = Call.Fn;
     if (!Equiv.isValueUnset("IfSignExtend") && Predicate.isLoad() &&
         Predicate.isSignExtLoad())
       return &Target.getInstruction(Equiv.getValueAsDef("IfSignExtend"));
@@ -3186,7 +3190,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
                           " for result of Src pattern operator");
   }
 
-  for (const auto &Predicate : Src->getPredicateFns()) {
+  for (const TreePredicateCall &Call : Src->getPredicateCalls()) {
+    const TreePredicateFn &Predicate = Call.Fn;
     if (Predicate.isAlwaysTrue())
       continue;
 
diff --git a/utils/TableGen/InstrDocsEmitter.cpp b/utils/TableGen/InstrDocsEmitter.cpp
index 41dc37a7ab35cae97d69253e9d2819e1f1209745..9d50351854ec46fe36bba6b2c1925b80ebf3cbf8 100644
--- a/utils/TableGen/InstrDocsEmitter.cpp
+++ b/utils/TableGen/InstrDocsEmitter.cpp
@@ -138,6 +138,7 @@ void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
     FLAG(isInsertSubreg)
     FLAG(isConvergent)
     FLAG(hasNoSchedulingInfo)
+    FLAG(variadicOpsAreDefs)
     if (!FlagStrings.empty()) {
       OS << "Flags: ";
       bool IsFirst = true;
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index 55b6f192c2f0c504f2a5ca9bc7becee84957bae2..39d9e8526386e0977117918f5d70f1a247a1618d 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -358,8 +358,8 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
   if (TIIPredicates.empty())
     return;
 
-  OS << "#ifdef GET_GENINSTRINFO_MC_DECL\n";
-  OS << "#undef GET_GENINSTRINFO_MC_DECL\n\n";
+  OS << "#ifdef GET_INSTRINFO_MC_HELPER_DECLS\n";
+  OS << "#undef GET_INSTRINFO_MC_HELPER_DECLS\n\n";
 
   OS << "namespace llvm {\n";
   OS << "class MCInst;\n\n";
@@ -374,10 +374,10 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
   OS << "\n} // end " << TargetName << "_MC namespace\n";
   OS << "} // end llvm namespace\n\n";
 
-  OS << "#endif // GET_GENINSTRINFO_MC_DECL\n\n";
+  OS << "#endif // GET_INSTRINFO_MC_HELPER_DECLS\n\n";
 
-  OS << "#ifdef GET_GENINSTRINFO_MC_HELPERS\n";
-  OS << "#undef GET_GENINSTRINFO_MC_HELPERS\n\n";
+  OS << "#ifdef GET_INSTRINFO_MC_HELPERS\n";
+  OS << "#undef GET_INSTRINFO_MC_HELPERS\n\n";
 
   OS << "namespace llvm {\n";
   OS << "namespace " << TargetName << "_MC {\n\n";
@@ -391,10 +391,10 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
 
     OS.indent(PE.getIndentLevel() * 2);
     PE.expandStatement(OS, Rec->getValueAsDef("Body"));
-    OS << "\n}\n";
+    OS << "\n}\n\n";
   }
 
-  OS << "\n} // end " << TargetName << "_MC namespace\n";
+  OS << "} // end " << TargetName << "_MC namespace\n";
   OS << "} // end llvm namespace\n\n";
 
   OS << "#endif // GET_GENISTRINFO_MC_HELPERS\n";
@@ -409,10 +409,9 @@ void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
 
   PredicateExpander PE(TargetName);
   PE.setExpandForMC(false);
-  PE.setIndentLevel(2);
 
   for (const Record *Rec : TIIPredicates) {
-    OS << "\n  " << (ExpandDefinition ? "" : "static ") << "bool ";
+    OS << (ExpandDefinition ? "" : "static ") << "bool ";
     if (ExpandDefinition)
       OS << TargetName << "InstrInfo::";
     OS << Rec->getValueAsString("FunctionName");
@@ -423,10 +422,9 @@ void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
     }
 
     OS << " {\n";
-
     OS.indent(PE.getIndentLevel() * 2);
     PE.expandStatement(OS, Rec->getValueAsDef("Body"));
-    OS << "\n  }\n";
+    OS << "\n}\n\n";
   }
 }
 
@@ -533,15 +531,16 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
 
   OS << "#endif // GET_INSTRINFO_HEADER\n\n";
 
-  OS << "#ifdef GET_TII_HELPER_DECLS\n";
-  OS << "#undef GET_TII_HELPER_DECLS\n";
+  OS << "#ifdef GET_INSTRINFO_HELPER_DECLS\n";
+  OS << "#undef GET_INSTRINFO_HELPER_DECLS\n\n";
   emitTIIHelperMethods(OS, TargetName, /* ExpandDefintion = */false);
-  OS << "#endif // GET_TII_HELPER_DECLS\n\n";
+  OS << "\n";
+  OS << "#endif // GET_INSTRINFO_HELPER_DECLS\n\n";
 
-  OS << "#ifdef GET_TII_HELPERS\n";
-  OS << "#undef GET_TII_HELPERS\n";
+  OS << "#ifdef GET_INSTRINFO_HELPERS\n";
+  OS << "#undef GET_INSTRINFO_HELPERS\n\n";
   emitTIIHelperMethods(OS, TargetName, /* ExpandDefintion = */true);
-  OS << "#endif // GET_TTI_HELPERS\n\n";
+  OS << "#endif // GET_INSTRINFO_HELPERS\n\n";
 
   OS << "#ifdef GET_INSTRINFO_CTOR_DTOR\n";
   OS << "#undef GET_INSTRINFO_CTOR_DTOR\n";
@@ -625,6 +624,7 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
   if (Inst.isExtractSubreg) OS << "|(1ULL<<MCID::ExtractSubreg)";
   if (Inst.isInsertSubreg) OS << "|(1ULL<<MCID::InsertSubreg)";
   if (Inst.isConvergent) OS << "|(1ULL<<MCID::Convergent)";
+  if (Inst.variadicOpsAreDefs) OS << "|(1ULL<<MCID::VariadicOpsAreDefs)";
 
   // Emit all of the target-specific flags...
   BitsInit *TSF = Inst.TheDef->getValueAsBitsInit("TSFlags");
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index 06e44e3b57c136a3e6f6555e910e86c6eea780b5..049282e5ebfeed8d44beeb3e888703a83c24f18a 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -489,6 +489,9 @@ struct AttributeComparator {
     if (L->isNoReturn != R->isNoReturn)
       return R->isNoReturn;
 
+    if (L->isCold != R->isCold)
+      return R->isCold;
+
     if (L->isConvergent != R->isConvergent)
       return R->isConvergent;
 
@@ -622,7 +625,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
 
     if (!intrinsic.canThrow ||
         intrinsic.ModRef != CodeGenIntrinsic::ReadWriteMem ||
-        intrinsic.isNoReturn || intrinsic.isNoDuplicate ||
+        intrinsic.isNoReturn || intrinsic.isCold || intrinsic.isNoDuplicate ||
         intrinsic.isConvergent || intrinsic.isSpeculatable) {
       OS << "      const Attribute::AttrKind Atts[] = {";
       bool addComma = false;
@@ -636,6 +639,12 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
         OS << "Attribute::NoReturn";
         addComma = true;
       }
+      if (intrinsic.isCold) {
+        if (addComma)
+          OS << ",";
+        OS << "Attribute::Cold";
+        addComma = true;
+      }
       if (intrinsic.isNoDuplicate) {
         if (addComma)
           OS << ",";
diff --git a/utils/TableGen/PredicateExpander.cpp b/utils/TableGen/PredicateExpander.cpp
index ad7bf60caab9f4f586c3d470cc187337fe1b388c..2e01b7c3138e534a0ee2d99b99dec0170aacfa35 100644
--- a/utils/TableGen/PredicateExpander.cpp
+++ b/utils/TableGen/PredicateExpander.cpp
@@ -26,24 +26,39 @@ void PredicateExpander::expandCheckImmOperand(raw_ostream &OS, int OpIndex,
     OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
      << ").getImm()";
-  OS << (FunctionMapper.empty() ? " " : ") ");
-  OS << (shouldNegate() ? "!= " : "== ") << ImmVal;
+  if (!FunctionMapper.empty())
+    OS << ")";
+  OS << (shouldNegate() ? " != " : " == ") << ImmVal;
 }
 
 void PredicateExpander::expandCheckImmOperand(raw_ostream &OS, int OpIndex,
                                               StringRef ImmVal,
                                               StringRef FunctionMapper) {
+  if (ImmVal.empty())
+    expandCheckImmOperandSimple(OS, OpIndex, FunctionMapper);
+
   if (!FunctionMapper.empty())
     OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
      << ").getImm()";
-
-  OS << (FunctionMapper.empty() ? "" : ")");
-  if (ImmVal.empty())
-    return;
+  if (!FunctionMapper.empty())
+    OS << ")";
   OS << (shouldNegate() ? " != " : " == ") << ImmVal;
 }
 
+void PredicateExpander::expandCheckImmOperandSimple(raw_ostream &OS,
+                                                    int OpIndex,
+                                                    StringRef FunctionMapper) {
+  if (shouldNegate())
+    OS << "!";
+  if (!FunctionMapper.empty())
+    OS << FunctionMapper << "(";
+  OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
+     << ").getImm()";
+  if (!FunctionMapper.empty())
+    OS << ")";
+}
+
 void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex,
                                               const Record *Reg,
                                               StringRef FunctionMapper) {
@@ -53,9 +68,8 @@ void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex,
     OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
      << ").getReg()";
-  OS << (FunctionMapper.empty() ? "" : ")");
-  if (!Reg)
-    return;
+  if (!FunctionMapper.empty())
+    OS << ")";
   OS << (shouldNegate() ? " != " : " == ");
   const StringRef Str = Reg->getValueAsString("Namespace");
   if (!Str.empty())
@@ -63,6 +77,20 @@ void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex,
   OS << Reg->getName();
 }
 
+
+void PredicateExpander::expandCheckRegOperandSimple(raw_ostream &OS,
+                                                    int OpIndex,
+                                                    StringRef FunctionMapper) {
+  if (shouldNegate())
+    OS << "!";
+  if (!FunctionMapper.empty())
+    OS << FunctionMapper << "(";
+  OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
+     << ").getReg()";
+  if (!FunctionMapper.empty())
+    OS << ")";
+}
+
 void PredicateExpander::expandCheckInvalidRegOperand(raw_ostream &OS,
                                                      int OpIndex) {
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
@@ -204,7 +232,7 @@ void PredicateExpander::expandOpcodeSwitchCase(raw_ostream &OS,
   for (const Record *Opcode : Opcodes) {
     OS.indent(getIndentLevel() * 2);
     OS << "case " << Opcode->getValueAsString("Namespace")
-       << "::" << Opcode->getName() << " :\n";
+       << "::" << Opcode->getName() << ":\n";
   }
 
   increaseIndentLevel();
@@ -227,7 +255,7 @@ void PredicateExpander::expandOpcodeSwitchStatement(raw_ostream &OS,
 
   // Expand the default case.
   SS.indent(getIndentLevel() * 2);
-  SS << "default :\n";
+  SS << "default:\n";
 
   increaseIndentLevel();
   SS.indent(getIndentLevel() * 2);
@@ -290,9 +318,8 @@ void PredicateExpander::expandPredicate(raw_ostream &OS, const Record *Rec) {
                                  Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckRegOperandSimple"))
-    return expandCheckRegOperand(OS, Rec->getValueAsInt("OpIndex"),
-                                 nullptr,
-                                 Rec->getValueAsString("FunctionMapper"));
+    return expandCheckRegOperandSimple(OS, Rec->getValueAsInt("OpIndex"),
+                                       Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckInvalidRegOperand"))
     return expandCheckInvalidRegOperand(OS, Rec->getValueAsInt("OpIndex"));
@@ -308,8 +335,8 @@ void PredicateExpander::expandPredicate(raw_ostream &OS, const Record *Rec) {
                                  Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckImmOperandSimple"))
-    return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"), "", 
-                                 Rec->getValueAsString("FunctionMapper"));
+    return expandCheckImmOperandSimple(OS, Rec->getValueAsInt("OpIndex"),
+                                       Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckSameRegOperand"))
     return expandCheckSameRegOperand(OS, Rec->getValueAsInt("FirstIndex"),
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 4ff52b3e44e8cbad9b1ef721090ec5408ab0502e..731c14bdb9a0d1445c4566d07148d4b39401f69f 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -93,6 +93,8 @@ class SubtargetEmitter {
                          &ProcItinLists);
   unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
                                   raw_ostream &OS);
+  void EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
+                              raw_ostream &OS);
   void EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
                               raw_ostream &OS);
   void EmitProcessorProp(raw_ostream &OS, const Record *R, StringRef Name,
@@ -697,6 +699,30 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
   return CostTblIndex;
 }
 
+void SubtargetEmitter::EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
+                                              raw_ostream &OS) {
+  unsigned QueueID = 0;
+  if (ProcModel.LoadQueue) {
+    const Record *Queue = ProcModel.LoadQueue->getValueAsDef("QueueDescriptor");
+    QueueID =
+        1 + std::distance(ProcModel.ProcResourceDefs.begin(),
+                          std::find(ProcModel.ProcResourceDefs.begin(),
+                                    ProcModel.ProcResourceDefs.end(), Queue));
+  }
+  OS << "  " << QueueID << ", // Resource Descriptor for the Load Queue\n";
+
+  QueueID = 0;
+  if (ProcModel.StoreQueue) {
+    const Record *Queue =
+        ProcModel.StoreQueue->getValueAsDef("QueueDescriptor");
+    QueueID =
+        1 + std::distance(ProcModel.ProcResourceDefs.begin(),
+                          std::find(ProcModel.ProcResourceDefs.begin(),
+                                    ProcModel.ProcResourceDefs.end(), Queue));
+  }
+  OS << "  " << QueueID << ", // Resource Descriptor for the Store Queue\n";
+}
+
 void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
                                               raw_ostream &OS) {
   // Generate a table of register file descriptors (one entry per each user
@@ -715,6 +741,9 @@ void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
   EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),
                        NumCostEntries, OS);
 
+  // Add information about load/store queues.
+  EmitLoadStoreQueueInfo(ProcModel, OS);
+
   OS << "};\n";
 }
 
@@ -1504,9 +1533,9 @@ void collectVariantClasses(const CodeGenSchedModels &SchedModels,
       continue;
 
     if (OnlyExpandMCInstPredicates) {
-      // Ignore this variant scheduling class if transitions don't uses any
+      // Ignore this variant scheduling class no transitions use any meaningful
       // MCSchedPredicate definitions.
-      if (!all_of(SC.Transitions, [](const CodeGenSchedTransition &T) {
+      if (!any_of(SC.Transitions, [](const CodeGenSchedTransition &T) {
             return hasMCSchedPredicates(T);
           }))
         continue;
@@ -1560,6 +1589,7 @@ void SubtargetEmitter::emitSchedModelHelpersImpl(
     PE.setExpandForMC(OnlyExpandMCInstPredicates);
     for (unsigned PI : ProcIndices) {
       OS << "    ";
+
       // Emit a guard on the processor ID.
       if (PI != 0) {
         OS << (OnlyExpandMCInstPredicates
@@ -1573,11 +1603,23 @@ void SubtargetEmitter::emitSchedModelHelpersImpl(
       for (const CodeGenSchedTransition &T : SC.Transitions) {
         if (PI != 0 && !count(T.ProcIndices, PI))
           continue;
+
+        // Emit only transitions based on MCSchedPredicate, if it's the case.
+        // At least the transition specified by NoSchedPred is emitted,
+        // which becomes the default transition for those variants otherwise
+        // not based on MCSchedPredicate.
+        // FIXME: preferably, llvm-mca should instead assume a reasonable
+        // default when a variant transition is not based on MCSchedPredicate
+        // for a given processor.
+        if (OnlyExpandMCInstPredicates && !hasMCSchedPredicates(T))
+          continue;
+
         PE.setIndentLevel(3);
         emitPredicates(T, SchedModels.getSchedClass(T.ToClassIdx), PE, OS);
       }
 
       OS << "    }\n";
+
       if (PI == 0)
         break;
     }
diff --git a/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
index a8edfdc623f437397d16d8470bc7c56eebbb0565..ab785715d974ea3b5bc5559421291dd3192ba5d8 100644
--- a/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
+++ b/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -19,6 +19,8 @@
 
 namespace llvm {
 
+static constexpr int WebAssemblyInstructionTableSize = 256;
+
 void emitWebAssemblyDisassemblerTables(
     raw_ostream &OS,
     const ArrayRef<const CodeGenInstruction *> &NumberedInstructions) {
@@ -59,6 +61,8 @@ void emitWebAssemblyDisassemblerTables(
   OS << "#include \"MCTargetDesc/WebAssemblyMCTargetDesc.h\"\n";
   OS << "\n";
   OS << "namespace llvm {\n\n";
+  OS << "static constexpr int WebAssemblyInstructionTableSize = ";
+  OS << WebAssemblyInstructionTableSize << ";\n\n";
   OS << "enum EntryType : uint8_t { ";
   OS << "ET_Unused, ET_Prefix, ET_Instruction };\n\n";
   OS << "struct WebAssemblyInstruction {\n";
@@ -74,7 +78,7 @@ void emitWebAssemblyDisassemblerTables(
       continue;
     OS << "WebAssemblyInstruction InstructionTable" << PrefixPair.first;
     OS << "[] = {\n";
-    for (unsigned I = 0; I <= 0xFF; I++) {
+    for (unsigned I = 0; I < WebAssemblyInstructionTableSize; I++) {
       auto InstIt = PrefixPair.second.find(I);
       if (InstIt != PrefixPair.second.end()) {
         // Regular instruction.
diff --git a/utils/UpdateTestChecks/common.py b/utils/UpdateTestChecks/common.py
index daea395e31fa81f8689e4b6e47a428cac10a3196..ed591ae37a14797f7f822c4f775650dcae1e7a02 100644
--- a/utils/UpdateTestChecks/common.py
+++ b/utils/UpdateTestChecks/common.py
@@ -104,7 +104,7 @@ def build_function_body_dictionary(function_re, scrubber, scrubber_args, raw_too
     body = m.group('body')
     scrubbed_body = do_scrub(body, scrubber, scrubber_args, extra = False)
     scrubbed_extra = do_scrub(body, scrubber, scrubber_args, extra = True)
-    if m.groupdict().has_key('analysis'):
+    if 'analysis' in m.groupdict():
       analysis = m.group('analysis')
       if analysis.lower() != 'cost model analysis':
         print('WARNING: Unsupported analysis mode: %r!' % (analysis,), file=sys.stderr)
diff --git a/utils/benchmark/CMakeLists.txt b/utils/benchmark/CMakeLists.txt
index 686846bf1e052760236e5be022fa667a4aa01e59..788ad48d1c4d55932f820395fd8927cd10d3f881 100644
--- a/utils/benchmark/CMakeLists.txt
+++ b/utils/benchmark/CMakeLists.txt
@@ -1,5 +1,11 @@
 cmake_minimum_required (VERSION 2.8.12)
 
+# Tell cmake 3.0+ that it's safe to clear the PROJECT_VERSION variable in the
+# call to project() below.
+if(POLICY CMP0048)
+  cmake_policy(SET CMP0048 NEW)
+endif()
+
 project (benchmark)
 
 foreach(p
diff --git a/utils/benchmark/src/sysinfo.cc b/utils/benchmark/src/sysinfo.cc
index 29f4eb677be8bd91c87a39857aba39f000c363b8..01dd8a0317b05cbeaf8a174789510548caeaf247 100644
--- a/utils/benchmark/src/sysinfo.cc
+++ b/utils/benchmark/src/sysinfo.cc
@@ -333,6 +333,7 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
     C.num_sharing = static_cast<int>(B.count());
     C.level = Cache->Level;
     C.size = Cache->Size;
+    C.type = "Unknown";
     switch (Cache->Type) {
       case CacheUnified:
         C.type = "Unified";
@@ -346,9 +347,6 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
       case CacheTrace:
         C.type = "Trace";
         break;
-      default:
-        C.type = "Unknown";
-        break;
     }
     res.push_back(C);
   }
diff --git a/utils/emacs/llvm-mode.el b/utils/emacs/llvm-mode.el
index 95c7f3c0c6d3e7396730e9ae64dac2807f4ea007..cde66d122866f359366a994da555f56db2eeb557 100644
--- a/utils/emacs/llvm-mode.el
+++ b/utils/emacs/llvm-mode.el
@@ -62,7 +62,7 @@
    `(,(regexp-opt '("add" "sub" "mul" "sdiv" "udiv" "urem" "srem" "and" "or" "xor"
                     "setne" "seteq" "setlt" "setgt" "setle" "setge") 'symbols) . font-lock-keyword-face)
    ;; Floating-point operators
-   `(,(regexp-opt '("fadd" "fsub" "fmul" "fdiv" "frem") 'symbols) . font-lock-keyword-face)
+   `(,(regexp-opt '("fadd" "fsub" "fneg" "fmul" "fdiv" "frem") 'symbols) . font-lock-keyword-face)
    ;; Special instructions
    `(,(regexp-opt '("phi" "tail" "call" "select" "to" "shl" "lshr" "ashr" "fcmp" "icmp" "va_arg" "landingpad") 'symbols) . font-lock-keyword-face)
    ;; Control instructions
diff --git a/utils/git-svn/git-llvm b/utils/git-svn/git-llvm
index 15bce10d45ad4424d2c6eed331b5343d637efecc..53c0b24ae2c8568f68c0d5a227cf04badd436305 100755
--- a/utils/git-svn/git-llvm
+++ b/utils/git-svn/git-llvm
@@ -29,6 +29,16 @@ import tempfile
 import time
 assert sys.version_info >= (2, 7)
 
+try:
+    dict.iteritems
+except AttributeError:
+    # Python 3
+    def iteritems(d):
+        return iter(d.items())
+else:
+    # Python 2
+    def iteritems(d):
+        return d.iteritems()
 
 # It's *almost* a straightforward mapping from the monorepo to svn...
 GIT_TO_SVN_DIR = {
@@ -50,9 +60,11 @@ GIT_TO_SVN_DIR = {
         'openmp',
         'parallel-libs',
         'polly',
+        'pstl',
     ]
 }
 GIT_TO_SVN_DIR.update({'clang': 'cfe/trunk'})
+GIT_TO_SVN_DIR.update({'': 'monorepo-root/trunk'})
 
 VERBOSE = False
 QUIET = False
@@ -80,12 +92,12 @@ def die(msg):
     sys.exit(1)
 
 
-def first_dirname(d):
-    while True:
-        (head, tail) = os.path.split(d)
-        if not head or head == '/':
-            return tail
-        d = head
+def split_first_path_component(d):
+    # Assuming we have a git path, it'll use slashes even on windows...I hope.
+    if '/' in d:
+        return d.split('/', 1)
+    else:
+        return (d, None)
 
 
 def get_dev_null():
@@ -97,36 +109,35 @@ def get_dev_null():
 
 
 def shell(cmd, strip=True, cwd=None, stdin=None, die_on_failure=True,
-          ignore_errors=False, force_binary_stdin=False):
-    log_verbose('Running: %s' % ' '.join(cmd))
+          ignore_errors=False, text=True):
+    log_verbose('Running in %s: %s' % (cwd, ' '.join(cmd)))
 
     err_pipe = subprocess.PIPE
     if ignore_errors:
         # Silence errors if requested.
         err_pipe = get_dev_null()
 
-    if force_binary_stdin and stdin:
-        stdin = stdin.encode('utf-8')
-
     start = time.time()
-    text = not force_binary_stdin
     p = subprocess.Popen(cmd, cwd=cwd, stdout=subprocess.PIPE, stderr=err_pipe,
-                         stdin=subprocess.PIPE, universal_newlines=text)
+                         stdin=subprocess.PIPE,
+                         universal_newlines=text)
     stdout, stderr = p.communicate(input=stdin)
     elapsed = time.time() - start
 
     log_verbose('Command took %0.1fs' % elapsed)
 
-    if not text:
-        stdout = stdout.decode('utf-8')
-        stderr = stderr.decode('utf-8')
-
     if p.returncode == 0 or ignore_errors:
         if stderr and not ignore_errors:
             eprint('`%s` printed to stderr:' % ' '.join(cmd))
             eprint(stderr.rstrip())
         if strip:
-            stdout = stdout.rstrip('\r\n')
+            if text:
+                stdout = stdout.rstrip('\r\n')
+            else:
+                stdout = stdout.rstrip(b'\r\n')
+        if VERBOSE:
+            for l in stdout.splitlines():
+                log_verbose("STDOUT: %s" % l)
         return stdout
     err_msg = '`%s` returned %s' % (' '.join(cmd), p.returncode)
     eprint(err_msg)
@@ -138,13 +149,11 @@ def shell(cmd, strip=True, cwd=None, stdin=None, die_on_failure=True,
 
 
 def git(*cmd, **kwargs):
-    return shell(['git'] + list(cmd), kwargs.get('strip', True))
+    return shell(['git'] + list(cmd), **kwargs)
 
 
 def svn(cwd, *cmd, **kwargs):
-    # TODO: Better way to do default arg when we have *cmd?
-    return shell(['svn'] + list(cmd), cwd=cwd, stdin=kwargs.get('stdin', None),
-                 ignore_errors=kwargs.get('ignore_errors', None))
+    return shell(['svn'] + list(cmd), cwd=cwd, **kwargs)
 
 def program_exists(cmd):
     if sys.platform == 'win32' and not cmd.endswith('.exe'):
@@ -181,7 +190,7 @@ def get_revs_to_push(rev_range):
     return revs
 
 
-def clean_and_update_svn(svn_repo):
+def clean_svn(svn_repo):
     svn(svn_repo, 'revert', '-R', '.')
 
     # Unfortunately it appears there's no svn equivalent for git clean, so we
@@ -192,24 +201,19 @@ def clean_and_update_svn(svn_repo):
         filename = line[1:].strip()
         os.remove(os.path.join(svn_repo, filename))
 
-    svn(svn_repo, 'update', *list(GIT_TO_SVN_DIR.values()))
-
 
 def svn_init(svn_root):
     if not os.path.exists(svn_root):
         log('Creating svn staging directory: (%s)' % (svn_root))
         os.makedirs(svn_root)
-        log('This is a one-time initialization, please be patient for a few'
-            ' minutes...')
-        svn(svn_root, 'checkout', '--depth=immediates',
+        svn(svn_root, 'checkout', '--depth=empty',
             'https://llvm.org/svn/llvm-project/', '.')
-        svn(svn_root, 'update', *list(GIT_TO_SVN_DIR.values()))
         log("svn staging area ready in '%s'" % svn_root)
     if not os.path.isdir(svn_root):
         die("Can't initialize svn staging dir (%s)" % svn_root)
 
 
-def fix_eol_style_native(rev, sr, svn_sr_path):
+def fix_eol_style_native(rev, svn_sr_path, files):
     """Fix line endings before applying patches with Unix endings
 
     SVN on Windows will check out files with CRLF for files with the
@@ -219,9 +223,6 @@ def fix_eol_style_native(rev, sr, svn_sr_path):
     SVN will not commit a mass line ending re-doing because it detects the line
     ending format for files with this property.
     """
-    files = git('diff-tree', '--no-commit-id', '--name-only', '-r', rev, '--',
-                sr).split('\n')
-    files = [f.split('/', 1)[1] for f in files]
     # Skip files that don't exist in SVN yet.
     files = [f for f in files if os.path.exists(os.path.join(svn_sr_path, f))]
     # Use ignore_errors because 'svn propget' prints errors if the file doesn't
@@ -248,35 +249,87 @@ def fix_eol_style_native(rev, sr, svn_sr_path):
             if eol_style == 'native':
                 crlf_files.append(f)
     if crlf_files:
-        # Reformat all files with native SVN line endings to Unix format. SVN knows
-        # files with native line endings are text files. It will commit just the
-        # diff, and not a mass line ending change.
+        # Reformat all files with native SVN line endings to Unix format. SVN
+        # knows files with native line endings are text files. It will commit
+        # just the diff, and not a mass line ending change.
         shell(['dos2unix'] + crlf_files, ignore_errors=True, cwd=svn_sr_path)
 
+def split_subrepo(f):
+    # Given a path, splits it into (subproject, rest-of-path). If the path is
+    # not in a subproject, returns ('', full-path).
+
+    subproject, remainder = split_first_path_component(f)
+
+    if subproject in GIT_TO_SVN_DIR:
+        return subproject, remainder
+    else:
+        return '', f
+
+def get_all_parent_dirs(name):
+    parts = []
+    head, tail = os.path.split(name)
+    while head:
+        parts.append(head)
+        head, tail = os.path.split(head)
+    return parts
+
 def svn_push_one_rev(svn_repo, rev, dry_run):
     files = git('diff-tree', '--no-commit-id', '--name-only', '-r',
                 rev).split('\n')
-    subrepos = {first_dirname(f) for f in files}
-    if not subrepos:
+    if not files:
         raise RuntimeError('Empty diff for rev %s?' % rev)
 
+    # Split files by subrepo
+    subrepo_files = collections.defaultdict(list)
+    for f in files:
+        subrepo, remainder = split_subrepo(f)
+        subrepo_files[subrepo].append(remainder)
+
     status = svn(svn_repo, 'status', '--no-ignore')
     if status:
         die("Can't push git rev %s because svn status is not empty:\n%s" %
             (rev, status))
 
-    for sr in subrepos:
+    svn_dirs_to_update = set()
+    for sr, files in iteritems(subrepo_files):
+        svn_sr_path = GIT_TO_SVN_DIR[sr]
+        for f in files:
+            svn_dirs_to_update.add(
+                os.path.dirname(os.path.join(svn_sr_path, f)))
+
+    # We also need to svn update any parent directories which are not yet present
+    parent_dirs = set()
+    for dir in svn_dirs_to_update:
+        parent_dirs.update(get_all_parent_dirs(dir))
+    parent_dirs = set(dir for dir in parent_dirs
+                      if not os.path.exists(os.path.join(svn_repo, dir)))
+    svn_dirs_to_update.update(parent_dirs)
+
+    # Sort by length to ensure that the parent directories are passed to svn
+    # before child directories.
+    sorted_dirs_to_update = sorted(svn_dirs_to_update, key=len)
+
+    # SVN update only in the affected directories.
+    svn(svn_repo, 'update', '--depth=files', *sorted_dirs_to_update)
+
+    for sr, files in iteritems(subrepo_files):
         svn_sr_path = os.path.join(svn_repo, GIT_TO_SVN_DIR[sr])
         if os.name == 'nt':
-            fix_eol_style_native(rev, sr, svn_sr_path)
-        diff = git('show', '--binary', rev, '--', sr, strip=False)
+            fix_eol_style_native(rev, svn_sr_path, files)
+        # We use text=False (and pass '--binary') so that we can get an exact
+        # diff that can be passed as-is to 'git apply' without any line ending,
+        # encoding, or other mangling.
+        diff = git('show', '--binary', rev, '--',
+                   *(os.path.join(sr, f) for f in files),
+                   strip=False, text=False)
         # git is the only thing that can handle its own patches...
-        log_verbose('Apply patch: %s' % diff)
+        if sr == '':
+            prefix_strip = '-p1'
+        else:
+            prefix_strip = '-p2'
         try:
-            # If we allow python to apply the diff in text mode, it will silently
-            # convert \n to \r\n which git doesn't like.
-            shell(['git', 'apply', '-p2', '-'], cwd=svn_sr_path, stdin=diff,
-                  die_on_failure=False, force_binary_stdin=True)
+            shell(['git', 'apply', prefix_strip, '-'], cwd=svn_sr_path,
+                  stdin=diff, die_on_failure=False, text=False)
         except RuntimeError as e:
             eprint("Patch doesn't apply: maybe you should try `git pull -r` "
                    "first?")
@@ -327,7 +380,7 @@ def cmd_push(args):
          else '', '\n'.join('  ' + git('show', '--oneline', '--quiet', c)
                             for c in revs)))
     for r in revs:
-        clean_and_update_svn(svn_root)
+        clean_svn(svn_root)
         svn_push_one_rev(svn_root, r, dry_run)
 
 
diff --git a/utils/gn/.gn b/utils/gn/.gn
new file mode 100644
index 0000000000000000000000000000000000000000..86d2da97565b37e798b7f7f7b160fee81742c02c
--- /dev/null
+++ b/utils/gn/.gn
@@ -0,0 +1,16 @@
+# FIXME: Once it's possible to add files to the root directory of the
+# monorepo, move this file to there.  Until then, you need to pass
+# `--dotfile=llvm/utils/gn/.gn --root=.` to the `gn gen` command.
+
+buildconfig = "//llvm/utils/gn/build/BUILDCONFIG.gn"
+
+# Disallow all calls to exec_script. We should be very conservative about
+# whitelisting things here.
+exec_script_whitelist = []
+
+# The normal GN setup is to have BUILD.gn files in-tree and a root BUILD.gn
+# file.  Since LLVM's GN build is unofficial, set secondary_source to tell GN
+# that e.g. llvm/lib/Demangle/BUILD.gn is found at
+# llvm/utils/gn/secondary/llvm/lib/Demangle/BUILD.gn and that the root BUILD.gn
+# file is at llvm/utils/gn/secondary/BUILD.gn
+secondary_source = "//llvm/utils/gn/secondary/"
diff --git a/utils/gn/README.rst b/utils/gn/README.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fd811e916e2cbfb2ed3ab319075485c9f314618b
--- /dev/null
+++ b/utils/gn/README.rst
@@ -0,0 +1,147 @@
+=====================
+Building LLVM with GN
+=====================
+
+.. contents::
+   :local:
+
+.. _Introduction:
+
+Introduction
+============
+
+*Warning* The GN build is experimental and best-effort. It might not work,
+and if you use it you're expected to feel comfortable to unbreak it if
+necessary. LLVM's official build system is CMake, if in doubt use that.
+If you add files, you're expected to update the CMake build but you don't need
+to update GN build files. Reviewers should not ask authors to update GN build
+files. Keeping the GN build files up-to-date is on the people who use the GN
+build.
+
+*Another Warning* Right now, we're in the process of getting the GN build
+checked in. As of this writing, it's not yet functional at all. Check back
+in a few weeks!
+
+`GN <https://gn.googlesource.com/gn/>`_ is another metabuild system. It always
+creates ninja files, but it can create some IDE projects (MSVC, Xcode, ...)
+which then shell out to ninja for the actual build.
+
+Its main features are that GN is very fast (it currently produces ninja files
+for LLVM's build in 35ms on the author's laptop, compared to 66s for CMake) --
+a 2000x difference), and since it's so fast it doesn't aggressively cache,
+making it possible to switch e.g. between release and debug builds in one build
+directory.
+
+It is arguable easier to configure than the CMake build, and has native support
+for building with multiple toolchains in one build directory. The build
+description is declarative-ish, allowing GN to print it in a json format that
+can fairly easily be converted to other metabuild system inputs.
+
+The main motivation behind the GN build is that some people find it more
+convenient for day-to-day hacking on LLVM than CMake. Distribution, building
+just parts of LLVM, and embedding the LLVM GN build from other builds are a
+non-goal for the GN build.
+
+This is a `good overview of GN <https://docs.google.com/presentation/d/15Zwb53JcncHfEwHpnG_PoIbbzQ3GQi_cpujYwbpcbZo/edit#slide=id.g119d702868_0_12>`_.
+
+.. _Quick start:
+
+Quick start
+===========
+
+*Warning* Right now, we're in the process of getting the GN build checked in.
+As of this writing, it's not yet functional at all.
+
+GN only works in the monorepo layout.
+
+#. Obtain a `gn binary <https://gn.googlesource.com/gn/#getting-started>`_.
+
+#. In the root of the monorepo, run
+   `gn gen --dotfile=$PWD/llvm/utils/gn/.gn --root=. out/gn` (`out/gn` is the
+   build directory, it can have any name, and you can have as many as you want,
+   each with different build settings).
+
+#. Run e.g. `ninja -C out/gn llvm-undname` to build all prerequisites for and
+   including the Microsoft symbol name pretty printing tool llvm-undname.
+
+By default, you get a release build with assertions enabled that targets
+the host arch. You can set various build options by editing `out/gn/args.gn`,
+for example putting `is_debug = true` in there gives you a debug build. Run
+`gn args --list out/gn` to see a list of all possible options. After touching
+`out/gn/args.gn`, just run ninja, it will re-invoke gn before starting the
+build.
+
+GN has extensive built-in help; try e.g. `gn help gen` to see the help
+for the `gen` command. The full GN reference is also `available online
+<https://gn.googlesource.com/gn/+/master/docs/reference.md>`_.
+
+GN has an autoformatter: `git ls-files '*.gn' '*.gni' | xargs -n 1 gn format`
+after making GN build changes is your friend.
+
+To not put `BUILD.gn` into the main tree, they are all below
+`utils/gn/secondary`.  For example, the build file for `llvm/lib/Support` is in
+`utils/gn/secondary/llvm/lib/Support`.
+
+.. _Syncing GN files from CMake files:
+
+Syncing GN files from CMake files
+=================================
+
+Sometimes after pulling in the latest changes, the GN build doesn't work.
+Most of the time this is due to someone adding a file to CMakeLists.txt file.
+Run `llvm/utils/gn/build/sync_source_lists_from_cmake.py` to print a report
+of which files need to be added to or removed from `BUILD.gn` files to
+match the corresponding `CMakeLists.txt`. You have to manually read the output
+of the script and implement its suggestions.
+
+If new `CMakeLists.txt` files have been added, you have to manually create
+a new corresponding `BUILD.gn` file below `llvm/utils/gn/secondary/`.
+
+If the dependencies in a `CMakeLists.txt` file have been changed, you have to
+manually analyze and fix.
+
+.. _Philosophy:
+
+Philosophy
+==========
+
+GN believes in using GN arguments to configure the build explicitly, instead
+of implicitly figuring out what to do based on what's available on the current
+system.
+
+configure is used for three classes of feature checks:
+
+- compiler checks. In GN, these could use exec_script to identify the host
+  compiler at GN time. For now the build has explicit toggles for compiler
+  features. (Maybe there could be a script that writes args.gn based on the
+  host compiler).  It's possible we'll use exec_script() for this going forward,
+  but we'd have one exec_script call to identify compiler id and version,
+  and then base GN arg default values of compiler id and version instead of
+  doing one exec_script per feature check.
+  (In theory, the config approach means a new os / compiler just needs to tweak
+  the checks and not the code, but in practice a) new os's / compilers are rare
+  b) they will require code changes anyhow, so the configure tradeoff seems
+  not worth it.)
+
+- library checks. For e.g. like zlib, GN thinks it's better to say "we require
+  zlib, else we error at build time" than silently omitting features. People
+  who really don't want to install zlib can explicitly set the GN arg to turn
+  off zlib.
+
+- header checks (does system header X exist). These are generally not needed
+  (just keying this off the host OS works fine), but if they should become
+  necessary in the future, they should be done at build time and the few
+  targets that need to know if header X exists then depend on that build-time
+  check while everything else can build parallel with it.
+
+- LLVM-specific build toggles (assertions on/off, debug on/off, targets to
+  build, ...). These map cleanly to GN args (which then get copied into
+  config.h in a build step).
+
+For the last two points, it would be nice if LLVM didn't have a single
+`config.h` header, but one header per toggle. That way, when e.g.
+`llvm_enable_terminfo` is toggled, only the 3 files caring about that setting
+would need to be rebuilt, instead of everything including `config.h`.
+
+GN doesn't believe in users setting arbitrary cflags from an environment
+variable, it wants the build to be controlled by .gn files.
diff --git a/utils/gn/build/BUILD.gn b/utils/gn/build/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..37aa4311651eea51493aeec8013b7aa8749a244d
--- /dev/null
+++ b/utils/gn/build/BUILD.gn
@@ -0,0 +1,143 @@
+import("//llvm/utils/gn/build/buildflags.gni")
+import("//llvm/utils/gn/build/mac_sdk.gni")
+import("//llvm/utils/gn/build/toolchain/compiler.gni")
+
+config("compiler_defaults") {
+  defines = []
+
+  # FIXME: Don't define this globally here.
+  if (host_os != "win") {
+    defines += [ "LLVM_ON_UNIX" ]
+  }
+
+  if (!llvm_enable_assertions) {
+    defines += [ "NDEBUG" ]
+  }
+
+  cflags = []
+
+  if (host_os == "mac" && clang_base_path != "") {
+    cflags += [
+      "-isysroot",
+      mac_sdk_path,
+    ]
+  }
+
+  if (host_os != "win") {
+    if (is_debug) {
+      cflags += [ "-g" ]
+    } else {
+      cflags += [ "-O3" ]
+    }
+    cflags += [ "-fdiagnostics-color" ]
+    cflags_cc = [
+      "-std=c++11",
+      "-fno-exceptions",
+      "-fno-rtti",
+      "-fvisibility-inlines-hidden",
+    ]
+  } else {
+    if (is_debug) {
+      cflags += [ "/Zi" ]
+    } else {
+      cflags += [
+        "/O2",
+        "/Zc:inline",
+      ]
+    }
+    defines += [
+      "_CRT_SECURE_NO_DEPRECATE",
+      "_CRT_SECURE_NO_WARNINGS",
+      "_CRT_NONSTDC_NO_DEPRECATE",
+      "_CRT_NONSTDC_NO_WARNINGS",
+      "_SCL_SECURE_NO_DEPRECATE",
+      "_SCL_SECURE_NO_WARNINGS",
+
+      "_HAS_EXCEPTIONS=0",
+      "_UNICODE",
+      "UNICODE",
+    ]
+    cflags += [
+      "/EHs-c-",
+      "/GR-",
+    ]
+
+    # The MSVC default value (1 MB) is not enough for parsing recursive C++
+    # templates in Clang.
+    ldflags = [ "/STACK:10000000" ]
+  }
+
+  # Warning setup.
+  if (host_os == "win" && !is_clang) {
+    cflags += [
+      # Suppress ''modifier' : used more than once' (__forceinline and inline).
+      "-wd4141",
+
+      # Suppress 'conversion from 'type1' to 'type2', possible loss of data'.
+      "-wd4244",
+
+      # Suppress 'conversion from 'size_t' to 'type', possible loss of data'.
+      "-wd4267",
+
+      # Suppress 'no matching operator delete found'.
+      "-wd4291",
+
+      # Suppress 'noexcept used with no exception handling mode specified'.
+      "-wd4577",
+
+      # Suppress 'destructor was implicitly defined as deleted'.
+      "-wd4624",
+
+      # Suppress 'unsafe mix of type <type> and type <type> in operation'.
+      "-wd4805",
+    ]
+  } else {
+    if (host_os == "win") {
+      cflags += [ "/W4" ]
+    } else {
+      cflags += [
+        "-Wall",
+        "-Wextra",
+      ]
+    }
+    cflags += [
+      "-Wno-unused-parameter",
+      "-Wstring-conversion",
+    ]
+    if (is_clang) {
+      cflags += [ "-Wdelete-non-virtual-dtor" ]
+    }
+    if (is_clang && use_goma) {
+      # goma converts all paths to lowercase on the server, breaking this
+      # warning.
+      cflags += [ "-Wno-nonportable-include-path" ]
+    }
+  }
+}
+
+config("llvm_code") {
+  include_dirs = [
+    "//llvm/include",
+    "$root_gen_dir/llvm/include",
+  ]
+}
+
+config("lld_code") {
+  include_dirs = [
+    "//lld/include",
+    "$root_gen_dir/lld/include",
+  ]
+}
+
+config("clang_code") {
+  include_dirs = [
+    "//clang/include",
+    "$root_gen_dir/clang/include",
+  ]
+}
+
+config("warn_covered_switch_default") {
+  if (is_clang) {
+    cflags = [ "-Wcovered-switch-default" ]
+  }
+}
diff --git a/utils/gn/build/BUILDCONFIG.gn b/utils/gn/build/BUILDCONFIG.gn
new file mode 100644
index 0000000000000000000000000000000000000000..c4403657e5df3881f452805d0b528b519a9662d7
--- /dev/null
+++ b/utils/gn/build/BUILDCONFIG.gn
@@ -0,0 +1,32 @@
+# All targets will get this list of configs by default.
+# Targets can opt out of a config by removing it from their local configs list.
+# If you're adding global flags and don't need targets to be able to opt out,
+# add the flags to compiler_defaults, not to a new config.
+_shared_binary_target_configs = [
+  "//llvm/utils/gn/build:compiler_defaults",
+  "//llvm/utils/gn/build:llvm_code",
+  "//llvm/utils/gn/build:warn_covered_switch_default",
+]
+
+# Apply that default list to the binary target types.
+set_defaults("executable") {
+  configs = _shared_binary_target_configs
+}
+set_defaults("loadable_module") {
+  configs = _shared_binary_target_configs
+}
+set_defaults("static_library") {
+  configs = _shared_binary_target_configs
+}
+set_defaults("shared_library") {
+  configs = _shared_binary_target_configs
+}
+set_defaults("source_set") {
+  configs = _shared_binary_target_configs
+}
+
+if (host_os == "win") {
+  set_default_toolchain("//llvm/utils/gn/build/toolchain:win")
+} else {
+  set_default_toolchain("//llvm/utils/gn/build/toolchain:unix")
+}
diff --git a/utils/gn/build/buildflags.gni b/utils/gn/build/buildflags.gni
new file mode 100644
index 0000000000000000000000000000000000000000..a28b788418dac70ea16fe475be805c0d8c9ab772
--- /dev/null
+++ b/utils/gn/build/buildflags.gni
@@ -0,0 +1,10 @@
+declare_args() {
+  # Whether to build with debug information and without optimizations.
+  is_debug = false
+}
+
+# args that depend on other args must live in a later declare_args() block.
+declare_args() {
+  # Whether to enable assertions.
+  llvm_enable_assertions = true
+}
diff --git a/utils/gn/build/libs/pthread/BUILD.gn b/utils/gn/build/libs/pthread/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..51e7f1f31fa8f9ac8b194da3e88e6f40179cb68f
--- /dev/null
+++ b/utils/gn/build/libs/pthread/BUILD.gn
@@ -0,0 +1,12 @@
+import("//llvm/utils/gn/build/libs/pthread/enable.gni")
+
+config("pthread_config") {
+  visibility = [ ":pthread" ]
+  libs = [ "pthread" ]
+}
+
+group("pthread") {
+  if (llvm_enable_threads && host_os != "win") {
+    public_configs = [ ":pthread_config" ]
+  }
+}
diff --git a/utils/gn/build/libs/pthread/enable.gni b/utils/gn/build/libs/pthread/enable.gni
new file mode 100644
index 0000000000000000000000000000000000000000..536d19e8f7485af94c7939fbc39bf940ec13345d
--- /dev/null
+++ b/utils/gn/build/libs/pthread/enable.gni
@@ -0,0 +1,4 @@
+declare_args() {
+  # Whether to enable threading.
+  llvm_enable_threads = true
+}
diff --git a/utils/gn/build/libs/terminfo/BUILD.gn b/utils/gn/build/libs/terminfo/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..10003d61c4df91d45d91b702dfb6733e7514a4b2
--- /dev/null
+++ b/utils/gn/build/libs/terminfo/BUILD.gn
@@ -0,0 +1,12 @@
+import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
+
+config("terminfo_config") {
+  visibility = [ ":terminfo" ]
+  libs = [ "ncurses" ]
+}
+
+group("terminfo") {
+  if (llvm_enable_terminfo) {
+    public_configs = [ ":terminfo_config" ]
+  }
+}
diff --git a/utils/gn/build/libs/terminfo/enable.gni b/utils/gn/build/libs/terminfo/enable.gni
new file mode 100644
index 0000000000000000000000000000000000000000..79ea2b601857ff5823fa00db7d33dc94583cad7c
--- /dev/null
+++ b/utils/gn/build/libs/terminfo/enable.gni
@@ -0,0 +1,4 @@
+declare_args() {
+  # Whether to link against terminfo.
+  llvm_enable_terminfo = false
+}
diff --git a/utils/gn/build/libs/xar/BUILD.gn b/utils/gn/build/libs/xar/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..0c30abf5d736cef97e3c6004607a2d4f0ad4f97e
--- /dev/null
+++ b/utils/gn/build/libs/xar/BUILD.gn
@@ -0,0 +1,12 @@
+import("//llvm/utils/gn/build/libs/xar/enable.gni")
+
+config("xar_config") {
+  visibility = [ ":xar" ]
+  libs = [ "xar" ]
+}
+
+group("xar") {
+  if (llvm_enable_libxar) {
+    public_configs = [ ":xar_config" ]
+  }
+}
diff --git a/utils/gn/build/libs/xar/enable.gni b/utils/gn/build/libs/xar/enable.gni
new file mode 100644
index 0000000000000000000000000000000000000000..c394a7efbd1eb9baffd771de7320dcd9bae52a0d
--- /dev/null
+++ b/utils/gn/build/libs/xar/enable.gni
@@ -0,0 +1,3 @@
+declare_args() {
+  llvm_enable_libxar = host_os == "mac"
+}
diff --git a/utils/gn/build/libs/xml/BUILD.gn b/utils/gn/build/libs/xml/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..f3590f4601cc8dc725298b0ce91b2b2e0f57f349
--- /dev/null
+++ b/utils/gn/build/libs/xml/BUILD.gn
@@ -0,0 +1,19 @@
+import("//llvm/utils/gn/build/libs/xml/enable.gni")
+import("//llvm/utils/gn/build/mac_sdk.gni")
+
+config("xml_config") {
+  visibility = [ ":xml" ]
+  defines = [ "LLVM_LIBXML2_ENABLED" ]
+  libs = [ "xml2" ]
+  if (host_os == "mac") {
+    include_dirs = [ "$mac_sdk_path/usr/include/libxml2" ]
+  } else {
+    include_dirs = [ "/usr/include/libxml2" ]
+  }
+}
+
+group("xml") {
+  if (llvm_enable_libxml2) {
+    public_configs = [ ":xml_config" ]
+  }
+}
diff --git a/utils/gn/build/libs/xml/enable.gni b/utils/gn/build/libs/xml/enable.gni
new file mode 100644
index 0000000000000000000000000000000000000000..3ef357b4fd2f7dc6f919cb7186934fb679576dca
--- /dev/null
+++ b/utils/gn/build/libs/xml/enable.gni
@@ -0,0 +1,4 @@
+declare_args() {
+  # Whether to include code that links against libxml2.
+  llvm_enable_libxml2 = host_os != "win"
+}
diff --git a/utils/gn/build/libs/zlib/BUILD.gn b/utils/gn/build/libs/zlib/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..5085578e760ece68a52de92c508e53b3397f9122
--- /dev/null
+++ b/utils/gn/build/libs/zlib/BUILD.gn
@@ -0,0 +1,12 @@
+import("//llvm/utils/gn/build/libs/zlib/enable.gni")
+
+config("zlib_config") {
+  visibility = [ ":zlib" ]
+  libs = [ "z" ]
+}
+
+group("zlib") {
+  if (llvm_enable_zlib) {
+    public_configs = [ ":zlib_config" ]
+  }
+}
diff --git a/utils/gn/build/libs/zlib/enable.gni b/utils/gn/build/libs/zlib/enable.gni
new file mode 100644
index 0000000000000000000000000000000000000000..9626301378bd9f1988f33bef246fafd389944c9f
--- /dev/null
+++ b/utils/gn/build/libs/zlib/enable.gni
@@ -0,0 +1,4 @@
+declare_args() {
+  # Whether to include code that links against zlib.
+  llvm_enable_zlib = host_os != "win"
+}
diff --git a/utils/gn/build/mac_sdk.gni b/utils/gn/build/mac_sdk.gni
new file mode 100644
index 0000000000000000000000000000000000000000..7999042a7ab5e6723df066b03691e47c1a0cde53
--- /dev/null
+++ b/utils/gn/build/mac_sdk.gni
@@ -0,0 +1,4 @@
+# Location of the mac sdk.
+# If that's not fixed, might want to shell out to xcrun at gn time to
+# retrieve this, but for now this seems to do the trick.
+mac_sdk_path = "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk"
diff --git a/utils/gn/build/run_tablegen.py b/utils/gn/build/run_tablegen.py
new file mode 100755
index 0000000000000000000000000000000000000000..c7694236be55f901262dce946a8e9f71238f8f02
--- /dev/null
+++ b/utils/gn/build/run_tablegen.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python
+"""Runs tablegen."""
+
+import subprocess
+import sys
+
+# Prefix with ./ to run built binary, not arbitrary stuff from PATH.
+sys.exit(subprocess.call(['./' + sys.argv[1]] + sys.argv[2:]))
diff --git a/utils/gn/build/symlink_or_copy.gni b/utils/gn/build/symlink_or_copy.gni
new file mode 100644
index 0000000000000000000000000000000000000000..499d2e897d53d0cc642774169249ca264104ca39
--- /dev/null
+++ b/utils/gn/build/symlink_or_copy.gni
@@ -0,0 +1,24 @@
+# Creates a symlink (or, on Windows, copies).
+# Args:
+#   source: Path to link to.
+#   output: Where to create the symlink.
+template("symlink_or_copy") {
+  action(target_name) {
+    forward_variables_from(invoker, [ "deps" ])
+
+    # Make a stamp file the output to work around
+    # https://github.com/ninja-build/ninja/issues/1186
+    stamp =
+        "$target_gen_dir/" + get_path_info(invoker.output, "file") + ".stamp"
+    outputs = [
+      stamp,
+    ]
+    script = "//llvm/utils/gn/build/symlink_or_copy.py"
+    args = [
+      "--stamp",
+      rebase_path(stamp, root_out_dir),
+      invoker.source,
+      rebase_path(invoker.output, root_out_dir),
+    ]
+  }
+}
diff --git a/utils/gn/build/symlink_or_copy.py b/utils/gn/build/symlink_or_copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..9841dc97a8002bfdba5d52ece2d966ef954627e8
--- /dev/null
+++ b/utils/gn/build/symlink_or_copy.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+"""Symlinks, or on Windows copies, an existing file to a second location.
+
+Overwrites the target location if it exists.
+Updates the mtime on a stamp file when done."""
+
+import argparse
+import errno
+import os
+import sys
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--stamp', required=True,
+                        help='name of a file whose mtime is updated on run')
+    parser.add_argument('source')
+    parser.add_argument('output')
+    args = parser.parse_args()
+
+    # FIXME: This should not check the host platform but the target platform
+    # (which needs to be passed in as an arg), for cross builds.
+    if sys.platform != 'win32':
+        try:
+            os.symlink(args.source, args.output)
+        except OSError as e:
+            if e.errno == errno.EEXIST:
+                os.remove(args.output)
+                os.symlink(args.source, args.output)
+            else:
+                raise
+    else:
+        import shutil
+        output = args.output + ".exe"
+        source = args.source + ".exe"
+        shutil.copyfile(os.path.join(os.path.dirname(output), source), output)
+
+    open(args.stamp, 'w') # Update mtime on stamp file.
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/utils/gn/build/sync_source_lists_from_cmake.py b/utils/gn/build/sync_source_lists_from_cmake.py
new file mode 100755
index 0000000000000000000000000000000000000000..77539672e33ad3c805ec87c629dea980400693eb
--- /dev/null
+++ b/utils/gn/build/sync_source_lists_from_cmake.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+"""Helps to keep BUILD.gn files in sync with the corresponding CMakeLists.txt.
+
+For each BUILD.gn file in the tree, checks if the list of cpp files in
+it is identical to the list of cpp files in the corresponding CMakeLists.txt
+file, and prints the difference if not."""
+
+from __future__ import print_function
+
+import os
+import re
+import subprocess
+
+def main():
+    gn_files = subprocess.check_output(
+            ['git', 'ls-files', '*BUILD.gn']).splitlines()
+
+    # Matches e.g. |   "foo.cpp",|.
+    gn_cpp_re = re.compile(r'^\s*"([^"]+\.(?:cpp|h))",$', re.MULTILINE)
+    # Matches e.g. |   "foo.cpp"|.
+    cmake_cpp_re = re.compile(r'^\s*([A-Za-z_0-9/-]+\.(?:cpp|h))$',
+                              re.MULTILINE)
+
+    for gn_file in gn_files:
+        # The CMakeLists.txt for llvm/utils/gn/secondary/foo/BUILD.gn is
+        # directly at foo/CMakeLists.txt.
+        strip_prefix = 'llvm/utils/gn/secondary/'
+        if not gn_file.startswith(strip_prefix):
+            continue
+        cmake_file = os.path.join(
+                os.path.dirname(gn_file[len(strip_prefix):]), 'CMakeLists.txt')
+        if not os.path.exists(cmake_file):
+            continue
+
+        def get_sources(source_re, text):
+            return set([m.group(1) for m in source_re.finditer(text)])
+        gn_cpp = get_sources(gn_cpp_re, open(gn_file).read())
+        cmake_cpp = get_sources(cmake_cpp_re, open(cmake_file).read())
+
+        if gn_cpp == cmake_cpp:
+            continue
+
+        print(gn_file)
+        add = cmake_cpp - gn_cpp
+        if add:
+            print('add:\n' + '\n'.join('    "%s",' % a for a in add))
+        remove = gn_cpp - cmake_cpp
+        if remove:
+            print('remove:\n' + '\n'.join(remove))
+        print()
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/gn/build/toolchain/BUILD.gn b/utils/gn/build/toolchain/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..927a53d9d63153e7b242068a8f4e723f7d867c04
--- /dev/null
+++ b/utils/gn/build/toolchain/BUILD.gn
@@ -0,0 +1,232 @@
+import("//llvm/utils/gn/build/toolchain/compiler.gni")
+
+declare_args() {
+  # If is_goma is true, the location of the goma client install.
+  if (host_os == "win") {
+    goma_dir = "c:\src\goma\goma-win64"
+  } else {
+    goma_dir = getenv("HOME") + "/goma"
+  }
+}
+
+toolchain("unix") {
+  cc = "cc"
+  cxx = "c++"
+
+  if (clang_base_path != "") {
+    cc = "$clang_base_path/bin/clang"
+    cxx = "$clang_base_path/bin/clang++"
+  }
+
+  ld = cxx  # Don't use goma wrapper for linking.
+  if (use_goma) {
+    cc = "$goma_dir/gomacc $cc"
+    cxx = "$goma_dir/gomacc $cxx"
+  }
+
+  tool("cc") {
+    depfile = "{{output}}.d"
+    command = "$cc -MMD -MF $depfile -o {{output}} -c {{source}} {{defines}} {{include_dirs}} {{cflags}} {{cflags_c}}"
+    depsformat = "gcc"
+    description = "CC {{output}}"
+    outputs = [
+      "{{source_out_dir}}/{{target_output_name}}.{{source_name_part}}.o",
+    ]
+  }
+
+  tool("cxx") {
+    depfile = "{{output}}.d"
+    command = "$cxx -MMD -MF $depfile -o {{output}} -c {{source}} {{defines}} {{include_dirs}} {{cflags}} {{cflags_cc}}"
+    depsformat = "gcc"
+    description = "CXX {{output}}"
+    outputs = [
+      "{{source_out_dir}}/{{target_output_name}}.{{source_name_part}}.o",
+    ]
+  }
+
+  tool("alink") {
+    if (host_os == "mac") {
+      command = "libtool -static -no_warning_for_no_symbols {{arflags}} -o {{output}} {{inputs}}"
+    } else {
+      # Remove the output file first so that ar doesn't try to modify the
+      # existing file.
+      command =
+          "rm -f {{output}} && ar rcsDT {{arflags}} -o {{output}} {{inputs}}"
+    }
+    description = "AR {{output}}"
+    outputs = [
+      "{{output_dir}}/{{target_output_name}}.a",
+    ]
+    output_prefix = "lib"
+    default_output_dir = "{{root_out_dir}}/lib"
+  }
+
+  tool("solink") {
+    outfile = "{{output_dir}}/{{target_output_name}}{{output_extension}}"
+    if (host_os == "mac") {
+      command = "$ld -shared {{ldflags}} -o $outfile {{libs}} {{inputs}}"
+      default_output_extension = ".dylib"
+    } else {
+      command =
+          "$ld -shared {{ldflags}} -Wl,-z,defs -o $outfile {{libs}} {{inputs}}"
+      default_output_extension = ".so"
+    }
+    description = "SOLINK $outfile"
+    outputs = [
+      outfile,
+    ]
+    lib_switch = "-l"
+    output_prefix = "lib"
+    default_output_dir = "{{root_out_dir}}/lib"
+  }
+
+  tool("solink_module") {
+    outfile = "{{output_dir}}/{{target_output_name}}{{output_extension}}"
+    if (host_os == "mac") {
+      command = "$ld -shared {{ldflags}} -Wl,-flat_namespace -Wl,-undefined,suppress -o $outfile {{libs}} {{inputs}}"
+      default_output_extension = ".dylib"
+    } else {
+      command = "$ld -shared {{ldflags}} -o $outfile {{libs}} {{inputs}}"
+      default_output_extension = ".so"
+    }
+    description = "SOLINK $outfile"
+    outputs = [
+      outfile,
+    ]
+    lib_switch = "-l"
+    default_output_dir = "{{root_out_dir}}/lib"
+  }
+
+  tool("link") {
+    outfile = "{{output_dir}}/{{target_output_name}}{{output_extension}}"
+    if (host_os == "mac") {
+      command = "$ld {{ldflags}} -o $outfile {{libs}} {{inputs}}"
+    } else {
+      command = "$ld {{ldflags}} -o $outfile {{libs}} -Wl,--start-group {{inputs}} -Wl,--end-group"
+    }
+    description = "LINK $outfile"
+    outputs = [
+      outfile,
+    ]
+    lib_switch = "-l"
+
+    # Setting this allows targets to override the default executable output by
+    # setting output_dir.
+    default_output_dir = "{{root_out_dir}}/bin"
+  }
+
+  tool("copy") {
+    command = "ln -f {{source}} {{output}} 2>/dev/null || (rm -rf {{output}} && cp -af {{source}} {{output}})"
+    description = "COPY {{source}} {{output}}"
+  }
+
+  tool("stamp") {
+    command = "touch {{output}}"
+    description = "STAMP {{output}}"
+  }
+}
+
+toolchain("win") {
+  cl = "cl"
+  link = "link"
+
+  if (clang_base_path != "") {
+    cl = "$clang_base_path/bin/clang-cl"
+    link = "$clang_base_path/bin/lld-link"
+  }
+
+  if (use_goma) {
+    cl = "$goma_dir/gomacc $cl"
+  }
+
+  tool("cc") {
+    command = "$cl /nologo /showIncludes /Fo{{output}} /c {{source}} {{defines}} {{include_dirs}} {{cflags}} {{cflags_c}}"
+    depsformat = "msvc"
+    description = "CC {{output}}"
+    outputs = [
+      "{{source_out_dir}}/{{target_output_name}}.{{source_name_part}}.obj",
+    ]
+  }
+
+  tool("cxx") {
+    command = "$cl /nologo /showIncludes /Fo{{output}} /c {{source}} {{defines}} {{include_dirs}} {{cflags}} {{cflags_cc}}"
+    depsformat = "msvc"
+    description = "CXX {{output}}"
+    outputs = [
+      "{{source_out_dir}}/{{target_output_name}}.{{source_name_part}}.obj",
+    ]
+  }
+
+  tool("alink") {
+    command = "lib /nologo {{arflags}} /out:{{output}} {{inputs}}"
+    description = "LIB {{output}}"
+    outputs = [
+      "{{output_dir}}/{{target_output_name}}.lib",
+    ]
+    default_output_dir = "{{root_out_dir}}/lib"
+  }
+
+  tool("solink") {
+    dllfile = "{{output_dir}}/{{target_output_name}}{{output_extension}}"
+    libfile = "$dllfile.lib"
+    command = "$link /nologo /dll {{ldflags}} /out:$dllfile /implib:$libfile {{libs}} /pdb:$dllfile.pdb {{inputs}}"
+    description = "LINK $dllfile"
+    link_output = libfile
+    depend_output = libfile
+    runtime_outputs = [ dllfile ]
+    outputs = [
+      dllfile,
+      libfile,
+    ]
+    default_output_extension = ".dll"
+    restat = true
+
+    # Put dlls next to the executables in bin/ on Windows, since Windows
+    # doesn't have a configurable rpath. This matches initialization of
+    # module_dir to bin/ in AddLLVM.cmake's set_output_directory().
+    default_output_dir = "{{root_out_dir}}/bin"
+  }
+
+  # Plugins for opt and clang and so on don't work in LLVM's Windows build
+  # since the code doesn't have export annotations, but there are a few
+  # standalone loadable modules used for unit-testing LLVM's dynamic library
+  # loading code.
+  tool("solink_module") {
+    dllfile = "{{output_dir}}/{{target_output_name}}{{output_extension}}"
+    command = "$link /nologo /dll {{ldflags}} /out:$dllfile {{libs}} /pdb:$dllfile.pdb {{inputs}}"
+    description = "LINK_MODULE $dllfile"
+    outputs = [
+      dllfile,
+    ]
+    runtime_outputs = outputs
+    default_output_extension = ".dll"
+
+    # No default_output_dir, all clients set output_dir.
+  }
+
+  tool("link") {
+    outfile = "{{output_dir}}/{{target_output_name}}{{output_extension}}"
+    command = "$link /nologo {{ldflags}} /out:$outfile {{libs}} /pdb:$outfile.pdb {{inputs}}"
+    description = "LINK $outfile"
+    outputs = [
+      outfile,
+    ]
+    default_output_extension = ".exe"
+
+    # Setting this allows targets to override the default executable output by
+    # setting output_dir.
+    default_output_dir = "{{root_out_dir}}/bin"
+  }
+
+  tool("copy") {
+    # GN hands out slash-using paths, but cmd's copy needs backslashes.
+    # Use cmd's %foo:a=b% substitution feature to convert.
+    command = "cmd /c set source=\"{{source}}\" & set output=\"{{output}}\" & call copy /Y %source:/=\% %output:\=/% > nul"
+    description = "COPY {{source}} {{output}}"
+  }
+
+  tool("stamp") {
+    command = "cmd /c type nul > {{output}}"
+    description = "STAMP {{output}}"
+  }
+}
diff --git a/utils/gn/build/toolchain/compiler.gni b/utils/gn/build/toolchain/compiler.gni
new file mode 100644
index 0000000000000000000000000000000000000000..92d965ef96738098c953d5b5e872c6158e288467
--- /dev/null
+++ b/utils/gn/build/toolchain/compiler.gni
@@ -0,0 +1,18 @@
+declare_args() {
+  # Whether to use goma (https://chromium.googlesource.com/infra/goma/client/)
+  use_goma = false
+
+  # Set this to a clang build directory. If set, that clang is used as compiler.
+  # goma only works with compiler binaries it knows about, so useful both for
+  # using a goma-approved compiler and for compiling clang with a locally-built
+  # clang in a different build directory.
+  # On Windows, setting this also causes lld-link to be used as linker.
+  # Example value: getenv("HOME") + "/src/llvm-build/Release+Asserts"
+  clang_base_path = ""
+}
+
+declare_args() {
+  # Set if the host compiler is clang.  On by default on Mac or if
+  # clang_base_path is set.
+  is_clang = host_os == "mac" || clang_base_path != ""
+}
diff --git a/utils/gn/build/write_cmake_config.py b/utils/gn/build/write_cmake_config.py
new file mode 100755
index 0000000000000000000000000000000000000000..6a54073d85b596a40de4feda13d929ab7f46549f
--- /dev/null
+++ b/utils/gn/build/write_cmake_config.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+"""Emulates the bits of CMake's configure_file() function needed in LLVM.
+
+The CMake build uses configure_file() for several things.  This emulates that
+function for the GN build.  In the GN build, this runs at build time instead
+of at generator time.
+
+Takes a list of KEY=VALUE pairs (where VALUE can be empty).
+
+The sequence `\` `n` in each VALUE is replaced by a newline character.
+
+On each line, replaces '${KEY}' or '@KEY@' with VALUE.
+
+Then, handles these special cases (note that FOO= sets the value of FOO to the
+empty string, which is falsy, but FOO=0 sets it to '0' which is truthy):
+
+1.) #cmakedefine01 FOO
+    Checks if key FOO is set to a truthy value, and depending on that prints
+    one of the following two lines:
+
+        #define FOO 1
+        #define FOO 0
+
+2.) #cmakedefine FOO [...]
+    Checks if key FOO is set to a truthy in value, and depending on that prints
+    one of the following two lines:
+
+        #define FOO [...]
+        /* #undef FOO */
+
+Fails if any of the KEY=VALUE arguments aren't needed for processing the
+input file, or if the input file references keys that weren't passed in.
+"""
+
+from __future__ import print_function
+
+import argparse
+import os
+import re
+import sys
+
+
+def main():
+    parser = argparse.ArgumentParser(
+                 epilog=__doc__,
+                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('input', help='input file')
+    parser.add_argument('values', nargs='*', help='several KEY=VALUE pairs')
+    parser.add_argument('-o', '--output', required=True,
+                        help='output file')
+    args = parser.parse_args()
+
+    values = {}
+    for value in args.values:
+        key, val = value.split('=', 1)
+        values[key] = val.replace('\\n', '\n')
+    unused_values = set(values.keys())
+
+    # Matches e.g. '${FOO}' or '@FOO@' and captures FOO in group 1 or 2.
+    var_re = re.compile(r'\$\{([^}]*)\}|@([^@]*)@')
+
+    in_lines = open(args.input).readlines()
+    out_lines = []
+    for in_line in in_lines:
+        def repl(m):
+            key = m.group(1) or m.group(2)
+            unused_values.discard(key)
+            return values[key]
+        in_line = var_re.sub(repl, in_line)
+        if in_line.startswith('#cmakedefine01 '):
+            _, var = in_line.split()
+            in_line = '#define %s %d\n' % (var, 1 if values[var] else 0)
+            unused_values.discard(var)
+        elif in_line.startswith('#cmakedefine '):
+            _, var = in_line.split(None, 1)
+            try:
+                var, val = var.split(None, 1)
+                in_line = '#define %s %s' % (var, val)  # val ends in \n.
+            except:
+                var = var.rstrip()
+                in_line = '#define %s\n' % var
+            if not values[var]:
+                in_line = '/* #undef %s */\n' % var
+            unused_values.discard(var)
+        out_lines.append(in_line)
+
+    if unused_values:
+        print('unused values args:', file=sys.stderr)
+        print('    ', '\n    '.join(unused_values), file=sys.stderr)
+        return 1
+
+    output = ''.join(out_lines)
+
+    leftovers = var_re.findall(output)
+    if leftovers:
+        print('unprocessed values:\n', '\n'.join(leftovers), file=sys.stderr)
+        return 1
+
+    if not os.path.exists(args.output) or open(args.output).read() != output:
+        open(args.output, 'w').write(output)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/utils/gn/build/write_vcsrevision.py b/utils/gn/build/write_vcsrevision.py
new file mode 100755
index 0000000000000000000000000000000000000000..ed7c05f1d0ef8730bcadfbaa1c752e0eeb081a70
--- /dev/null
+++ b/utils/gn/build/write_vcsrevision.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+
+"""Gets the current revision and writes it to VCSRevision.h."""
+
+from __future__ import print_function
+
+import argparse
+import os
+import subprocess
+import sys
+
+
+THIS_DIR = os.path.abspath(os.path.dirname(__file__))
+LLVM_DIR = os.path.dirname(os.path.dirname(os.path.dirname(THIS_DIR)))
+MONO_DIR = os.path.dirname(LLVM_DIR)
+
+
+def which(program):
+    # distutils.spawn.which() doesn't find .bat files,
+    # https://bugs.python.org/issue2200
+    for path in os.environ["PATH"].split(os.pathsep):
+        candidate = os.path.join(path, program)
+        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
+            return candidate
+    return None
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('-d', '--depfile',
+                        help='if set, writes a depfile that causes this script '
+                             'to re-run each time the current revision changes')
+    parser.add_argument('vcs_header', help='path to the output file to write')
+    args = parser.parse_args()
+
+    if os.path.isdir(os.path.join(LLVM_DIR, '.svn')):
+        print('SVN support not implemented', file=sys.stderr)
+        return 1
+    if os.path.isdir(os.path.join(LLVM_DIR, '.git')):
+        print('non-mono-repo git support not implemented', file=sys.stderr)
+        return 1
+
+    git_dir = os.path.join(MONO_DIR, '.git')
+    if not os.path.isdir(git_dir):
+        print('.git dir not found at "%s"' % git_dir, file=sys.stderr)
+        return 1
+
+    git, use_shell = which('git'), False
+    if not git:
+        git = which('git.exe')
+    if not git:
+        git = which('git.bat')
+        use_shell = True
+    rev = subprocess.check_output([git, 'rev-parse', '--short', 'HEAD'],
+                                  cwd=git_dir, shell=use_shell)
+    # FIXME: add pizzas such as the svn revision read off a git note?
+    vcsrevision_contents = '#define LLVM_REVISION "git-%s"\n' % rev.strip()
+
+    # If the output already exists and is identical to what we'd write,
+    # return to not perturb the existing file's timestamp.
+    if os.path.exists(args.vcs_header) and \
+            open(args.vcs_header).read() == vcsrevision_contents:
+        return 0
+
+    # http://neugierig.org/software/blog/2014/11/binary-revisions.html
+    if args.depfile:
+        build_dir = os.getcwd()
+        with open(args.depfile, 'w') as depfile:
+            depfile.write('%s: %s\n' % (
+                args.vcs_header,
+                os.path.relpath(os.path.join(git_dir, 'logs', 'HEAD'),
+                                build_dir)))
+    open(args.vcs_header, 'w').write(vcsrevision_contents)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/utils/gn/secondary/BUILD.gn b/utils/gn/secondary/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..38716d569a92a206944e66d02fbf23ab20a7b269
--- /dev/null
+++ b/utils/gn/secondary/BUILD.gn
@@ -0,0 +1,83 @@
+group("default") {
+  deps = [
+    ":lld",
+    "//llvm/tools/llc",
+    "//llvm/tools/llvm-ar:symlinks",
+    "//llvm/tools/llvm-as",
+    "//llvm/tools/llvm-bcanalyzer",
+    "//llvm/tools/llvm-dis",
+    "//llvm/tools/llvm-dwarfdump",
+    "//llvm/tools/llvm-mc",
+    "//llvm/tools/llvm-nm:symlinks",
+    "//llvm/tools/llvm-objcopy:symlinks",
+    "//llvm/tools/llvm-objdump:symlinks",
+    "//llvm/tools/llvm-pdbutil",
+    "//llvm/tools/llvm-readobj:symlinks",
+    "//llvm/tools/llvm-undname",
+    "//llvm/tools/obj2yaml",
+    "//llvm/tools/opt",
+    "//llvm/tools/yaml2obj",
+    "//llvm/utils/FileCheck",
+    "//llvm/utils/count",
+    "//llvm/utils/not",
+  ]
+}
+
+# Symlink handling.
+# On POSIX, symlinks to the target can be created before the target exist,
+# and the target can depend on the symlink targets, so that building the
+# target ensures the symlinks exist.
+# However, symlinks didn't exist on Windows until recently, so there the
+# binary needs to be copied -- which requires it to exist. So the symlink step
+# needs to run after the target that creates the binary.
+# In the cmake build, this is done via a "postbuild" on the target, which just
+# tacks on "&& copy out.exe out2.exe" to the link command.
+# GN doesn't have a way to express postbuild commands.  It could probably be
+# emulated by having the link command in the toolchain be a wrapper script that
+# reads a ".symlinks" file next to the target, and have an action write that
+# and make the target depend on that, but then every single link has to use the
+# wrapper (unless we do further acrobatics to use a different toolchain for
+# targets that need symlinks) even though most links don't need symlinks.
+# Instead, have a top-level target for each target that needs symlinks, and
+# make that depend on the symlinks. Then the symlinks can depend on the
+# executable.  This has the effect that `ninja lld` builds lld and then creates
+# symlinks (via this target), while `ninja bin/lld` only builds lld and doesn't
+# update symlinks (in particular, on Windows it doesn't copy the new lld to its
+# new locations).
+# That seems simpler, more explicit, and good enough.
+group("lld") {
+  deps = [
+    "//lld/tools/lld:symlinks",
+  ]
+}
+group("llvm-ar") {
+  deps = [
+    "//llvm/tools/llvm-ar:symlinks",
+  ]
+}
+group("llvm-nm") {
+  deps = [
+    "//llvm/tools/llvm-nm:symlinks",
+  ]
+}
+group("llvm-objcopy") {
+  deps = [
+    "//llvm/tools/llvm-objcopy:symlinks",
+  ]
+}
+group("llvm-objdump") {
+  deps = [
+    "//llvm/tools/llvm-objdump:symlinks",
+  ]
+}
+group("llvm-readobj") {
+  deps = [
+    "//llvm/tools/llvm-readobj:symlinks",
+  ]
+}
+
+# A pool called "console" in the root BUILD.gn is magic and represents ninja's
+# built-in console pool. (Requires a GN with `gn --version` >= 552353.)
+pool("console") {
+  depth = 1
+}
diff --git a/utils/gn/secondary/lld/COFF/BUILD.gn b/utils/gn/secondary/lld/COFF/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..394f07aa29142d6f925cfc0013dd0099aaadb507
--- /dev/null
+++ b/utils/gn/secondary/lld/COFF/BUILD.gn
@@ -0,0 +1,44 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("Options") {
+  visibility = [ ":COFF" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+static_library("COFF") {
+  output_name = "lldCOFF"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    ":Options",
+    "//lld/Common",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/DebugInfo/MSF",
+    "//llvm/lib/DebugInfo/PDB",
+    "//llvm/lib/IR",
+    "//llvm/lib/LTO",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/Option",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target:TargetsToBuild",
+    "//llvm/lib/ToolDrivers/llvm-lib:LibDriver",
+    "//llvm/lib/WindowsManifest",
+  ]
+  sources = [
+    "Chunks.cpp",
+    "DLL.cpp",
+    "Driver.cpp",
+    "DriverUtils.cpp",
+    "ICF.cpp",
+    "InputFiles.cpp",
+    "LTO.cpp",
+    "MapFile.cpp",
+    "MarkLive.cpp",
+    "MinGW.cpp",
+    "PDB.cpp",
+    "SymbolTable.cpp",
+    "Symbols.cpp",
+    "Writer.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/Common/BUILD.gn b/utils/gn/secondary/lld/Common/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..b15f0928c044a2a9da8c6273cfaaac4343cbf320
--- /dev/null
+++ b/utils/gn/secondary/lld/Common/BUILD.gn
@@ -0,0 +1,28 @@
+static_library("Common") {
+  output_name = "lldCommon"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  public_deps = [
+    # public_dep because public header Version.h includes generated Version.inc.
+    "//lld/include/lld/Common:version",
+  ]
+  deps = [
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/Demangle",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/Option",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+  ]
+  sources = [
+    "Args.cpp",
+    "ErrorHandler.cpp",
+    "Memory.cpp",
+    "Reproduce.cpp",
+    "Strings.cpp",
+    "TargetOptionsCommandFlags.cpp",
+    "Threads.cpp",
+    "Timer.cpp",
+    "Version.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/ELF/BUILD.gn b/utils/gn/secondary/lld/ELF/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..d5f8a8999956d531bc75c48b45164a77be7ee949
--- /dev/null
+++ b/utils/gn/secondary/lld/ELF/BUILD.gn
@@ -0,0 +1,64 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("Options") {
+  visibility = [ ":ELF" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+static_library("ELF") {
+  output_name = "lldELF"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    ":Options",
+    "//lld/Common",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/DebugInfo/DWARF",
+    "//llvm/lib/IR",
+    "//llvm/lib/LTO",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/Option",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target:TargetsToBuild",
+  ]
+  include_dirs = [ "." ]
+  sources = [
+    "AArch64ErrataFix.cpp",
+    "Arch/AArch64.cpp",
+    "Arch/AMDGPU.cpp",
+    "Arch/ARM.cpp",
+    "Arch/AVR.cpp",
+    "Arch/Hexagon.cpp",
+    "Arch/Mips.cpp",
+    "Arch/MipsArchTree.cpp",
+    "Arch/PPC.cpp",
+    "Arch/PPC64.cpp",
+    "Arch/RISCV.cpp",
+    "Arch/SPARCV9.cpp",
+    "Arch/X86.cpp",
+    "Arch/X86_64.cpp",
+    "CallGraphSort.cpp",
+    "DWARF.cpp",
+    "Driver.cpp",
+    "DriverUtils.cpp",
+    "EhFrame.cpp",
+    "Filesystem.cpp",
+    "ICF.cpp",
+    "InputFiles.cpp",
+    "InputSection.cpp",
+    "LTO.cpp",
+    "LinkerScript.cpp",
+    "MapFile.cpp",
+    "MarkLive.cpp",
+    "OutputSections.cpp",
+    "Relocations.cpp",
+    "ScriptLexer.cpp",
+    "ScriptParser.cpp",
+    "SymbolTable.cpp",
+    "Symbols.cpp",
+    "SyntheticSections.cpp",
+    "Target.cpp",
+    "Thunks.cpp",
+    "Writer.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/MinGW/BUILD.gn b/utils/gn/secondary/lld/MinGW/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..4f3d5164d0639bd67e6710ee0b0fe6cda0bb2afa
--- /dev/null
+++ b/utils/gn/secondary/lld/MinGW/BUILD.gn
@@ -0,0 +1,21 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("Options") {
+  visibility = [ ":MinGW" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+static_library("MinGW") {
+  output_name = "lldMinGW"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    ":Options",
+    "//lld/COFF",
+    "//lld/Common",
+    "//llvm/lib/Option",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "Driver.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/include/lld/Common/BUILD.gn b/utils/gn/secondary/lld/include/lld/Common/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..b45c04ea67ad44f6c74618aeb8854962af986f0e
--- /dev/null
+++ b/utils/gn/secondary/lld/include/lld/Common/BUILD.gn
@@ -0,0 +1,26 @@
+import("//llvm/version.gni")
+
+action("version") {
+  script = "//llvm/utils/gn/build/write_cmake_config.py"
+
+  sources = [
+    "Version.inc.in",
+  ]
+  outputs = [
+    "$target_gen_dir/Version.inc",
+  ]
+  args = [
+    "-o",
+    rebase_path(outputs[0], root_out_dir),
+
+    rebase_path(sources[0], root_out_dir),
+
+    "LLD_VERSION=$llvm_version",
+    "LLD_VERSION_MAJOR=$llvm_version_major",
+    "LLD_VERSION_MINOR=$llvm_version_minor",
+
+    # FIXME: Real values for these:
+    "LLD_REVISION=",
+    "LLD_REPOSITORY=",
+  ]
+}
diff --git a/utils/gn/secondary/lld/lib/Core/BUILD.gn b/utils/gn/secondary/lld/lib/Core/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..a1c974dccd79989cfcf761e92934bf44daac1a75
--- /dev/null
+++ b/utils/gn/secondary/lld/lib/Core/BUILD.gn
@@ -0,0 +1,20 @@
+static_library("Core") {
+  output_name = "lldCore"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+    "//llvm/utils/gn/build/libs/pthread",
+  ]
+  sources = [
+    "DefinedAtom.cpp",
+    "Error.cpp",
+    "File.cpp",
+    "LinkingContext.cpp",
+    "Reader.cpp",
+    "Resolver.cpp",
+    "SymbolTable.cpp",
+    "Writer.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/lib/Driver/BUILD.gn b/utils/gn/secondary/lld/lib/Driver/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..10a3e6abc3ecbc01a2374ab1e795df9ab5332c18
--- /dev/null
+++ b/utils/gn/secondary/lld/lib/Driver/BUILD.gn
@@ -0,0 +1,24 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("DarwinLdOptions") {
+  visibility = [ ":Driver" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+static_library("Driver") {
+  output_name = "lldDriver"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    ":DarwinLdOptions",
+    "//lld/Common",
+    "//lld/lib/Core",
+    "//lld/lib/ReaderWriter",
+    "//lld/lib/ReaderWriter/MachO",
+    "//lld/lib/ReaderWriter/YAML",
+    "//llvm/lib/Option",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "DarwinLdDriver.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/lib/ReaderWriter/BUILD.gn b/utils/gn/secondary/lld/lib/ReaderWriter/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..0ca483a5e7a01fa8c214bb9f6dbdd385948d0a0b
--- /dev/null
+++ b/utils/gn/secondary/lld/lib/ReaderWriter/BUILD.gn
@@ -0,0 +1,17 @@
+static_library("ReaderWriter") {
+  output_name = "lldReaderWriter"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    "//lld/lib/Core",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "FileArchive.cpp",
+  ]
+
+  # FIXME:
+  # if (is_msvc) {
+  #    cflags = [ "-wd4062" ]
+  # }
+}
diff --git a/utils/gn/secondary/lld/lib/ReaderWriter/MachO/BUILD.gn b/utils/gn/secondary/lld/lib/ReaderWriter/MachO/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..746bb901a299587156755cdf68b7b751bdf1f33d
--- /dev/null
+++ b/utils/gn/secondary/lld/lib/ReaderWriter/MachO/BUILD.gn
@@ -0,0 +1,33 @@
+static_library("MachO") {
+  output_name = "lldMachO"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    "//lld/lib/Core",
+    "//lld/lib/ReaderWriter/YAML",
+    "//llvm/lib/DebugInfo/DWARF",
+    "//llvm/lib/Demangle",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "ArchHandler.cpp",
+    "ArchHandler_arm.cpp",
+    "ArchHandler_arm64.cpp",
+    "ArchHandler_x86.cpp",
+    "ArchHandler_x86_64.cpp",
+    "CompactUnwindPass.cpp",
+    "GOTPass.cpp",
+    "LayoutPass.cpp",
+    "MachOLinkingContext.cpp",
+    "MachONormalizedFileBinaryReader.cpp",
+    "MachONormalizedFileBinaryWriter.cpp",
+    "MachONormalizedFileFromAtoms.cpp",
+    "MachONormalizedFileToAtoms.cpp",
+    "MachONormalizedFileYAML.cpp",
+    "ObjCPass.cpp",
+    "ShimPass.cpp",
+    "StubsPass.cpp",
+    "TLVPass.cpp",
+    "WriterMachO.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/lib/ReaderWriter/YAML/BUILD.gn b/utils/gn/secondary/lld/lib/ReaderWriter/YAML/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..9fb1479d03918f9938732d4739c040c1babcba86
--- /dev/null
+++ b/utils/gn/secondary/lld/lib/ReaderWriter/YAML/BUILD.gn
@@ -0,0 +1,11 @@
+static_library("YAML") {
+  output_name = "lldYAML"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    "//lld/lib/Core",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "ReaderWriterYAML.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/tools/lld/BUILD.gn b/utils/gn/secondary/lld/tools/lld/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..c7e90e50f089c28e1bdc9c13d713b94da5104f34
--- /dev/null
+++ b/utils/gn/secondary/lld/tools/lld/BUILD.gn
@@ -0,0 +1,40 @@
+import("//llvm/utils/gn/build/symlink_or_copy.gni")
+
+symlinks = [
+  "lld-link",
+  "ld.lld",
+  "ld64.lld",
+  "wasm-ld",
+]
+foreach(target, symlinks) {
+  symlink_or_copy(target) {
+    deps = [
+      ":lld",
+    ]
+    source = "lld"
+    output = "$root_out_dir/bin/$target"
+  }
+}
+
+# //:lld depends on this symlink target, see comment in //BUILD.gn.
+group("symlinks") {
+  deps = []
+  foreach(target, symlinks) {
+    deps += [ ":$target" ]
+  }
+}
+
+executable("lld") {
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    "//lld/COFF",
+    "//lld/ELF",
+    "//lld/MinGW",
+    "//lld/lib/Driver",
+    "//lld/wasm",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "lld.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/lld/wasm/BUILD.gn b/utils/gn/secondary/lld/wasm/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..bec946a1d4dd70f03c1e66d19645ece742abe05e
--- /dev/null
+++ b/utils/gn/secondary/lld/wasm/BUILD.gn
@@ -0,0 +1,32 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("Options") {
+  visibility = [ ":wasm" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+static_library("wasm") {
+  output_name = "lldWasm"
+  configs += [ "//llvm/utils/gn/build:lld_code" ]
+  deps = [
+    ":Options",
+    "//lld/Common",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/Demangle",
+    "//llvm/lib/IR",
+    "//llvm/lib/Object",
+    "//llvm/lib/Option",
+  ]
+  sources = [
+    "Driver.cpp",
+    "InputChunks.cpp",
+    "InputFiles.cpp",
+    "LTO.cpp",
+    "MarkLive.cpp",
+    "OutputSections.cpp",
+    "SymbolTable.cpp",
+    "Symbols.cpp",
+    "Writer.cpp",
+    "WriterUtils.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..08924e36dab9126a92dbfab864f4acaf82e50d30
--- /dev/null
+++ b/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -0,0 +1,435 @@
+import("//llvm/lib/Target/targets.gni")
+import("//llvm/triples.gni")
+import("//llvm/utils/gn/build/buildflags.gni")
+import("//llvm/utils/gn/build/libs/pthread/enable.gni")
+import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
+import("//llvm/utils/gn/build/libs/xar/enable.gni")
+import("//llvm/utils/gn/build/libs/xml/enable.gni")
+import("//llvm/utils/gn/build/libs/zlib/enable.gni")
+import("//llvm/version.gni")
+
+# Contains actions to create config.h, llvm-config.h, abi-breaking.h,
+# and the various .def files used by llvm/lib/Target.
+# Other than in the cmake build, these are created at build time, not at
+# config time. That way, they can run in parallel, and this pattern means that
+# e.g. creating the clang config header (which happens in another gn file)
+# doesn't block building lld.
+
+# The headers are created by processing the foo.h.cmake files as input,
+# to keep the gn build close to the cmake build.
+
+# Other than in the cmake build, header generation doesn't do any feature
+# checking. See also "Philosophy" in llvm/utils/gn/README.rst.
+
+##############################################################################
+# config.h, llvm-config.h, abi-breaking.h
+
+# FIXME: Several of the config settings go in a global config header but
+# are only used in a single translation unit -- so toggling that value
+# causes a full rebuild when it really only has to rebuild a single file.
+# Instead monolithing config headers, investigate using something like
+# https://is.gd/buildflag_header_gni instead (needs to be done in both the
+# cmake build and here).
+
+# FIXME: This hardcodes a bunch of settings I never use; some of them should
+# become declare_args if anyone wants to set them.
+
+declare_args() {
+  # Enable additional checks that alter the LLVM C++ ABI.
+  llvm_enable_abi_breaking_checks = llvm_enable_assertions
+
+  # Iterate unordered llvm containers in reverse.
+  llvm_enable_reverse_iteration = false
+}
+
+action("abi-breaking") {
+  script = "//llvm/utils/gn/build/write_cmake_config.py"
+
+  sources = [
+    "abi-breaking.h.cmake",
+  ]
+  outputs = [
+    "$target_gen_dir/abi-breaking.h",
+  ]
+  args = [
+    "-o",
+    rebase_path(outputs[0], root_out_dir),
+
+    rebase_path(sources[0], root_out_dir),
+  ]
+
+  if (llvm_enable_abi_breaking_checks) {
+    args += [ "LLVM_ENABLE_ABI_BREAKING_CHECKS=1" ]
+  } else {
+    args += [ "LLVM_ENABLE_ABI_BREAKING_CHECKS=" ]
+  }
+
+  if (llvm_enable_reverse_iteration) {
+    args += [ "LLVM_ENABLE_REVERSE_ITERATION=1" ]
+  } else {
+    args += [ "LLVM_ENABLE_REVERSE_ITERATION=" ]
+  }
+}
+
+action("config") {
+  script = "//llvm/utils/gn/build/write_cmake_config.py"
+
+  public_deps = [
+    ":llvm-config",
+  ]
+
+  sources = [
+    "config.h.cmake",
+  ]
+  outputs = [
+    "$target_gen_dir/config.h",
+  ]
+  args = [
+    "-o",
+    rebase_path(outputs[0], root_out_dir),
+    rebase_path(sources[0], root_out_dir),
+
+    "BUG_REPORT_URL=https://bugs.llvm.org/",
+    "ENABLE_BACKTRACES=1",
+    "ENABLE_CRASH_OVERRIDES=1",
+    "BACKTRACE_HEADER=execinfo.h",
+    "HAVE_CRASHREPORTERCLIENT_H=",
+    "HAVE_DECL_FE_ALL_EXCEPT=1",
+    "HAVE_DECL_FE_INEXACT=1",
+    "LLVM_ENABLE_DIA_SDK=",
+    "LLVM_ENABLE_CRASH_DUMPS=",
+    "HAVE_ERRNO_H=1",
+    "HAVE_FCNTL_H=1",
+    "HAVE_FENV_H=1",
+    "HAVE_FFI_CALL=",
+    "HAVE_FFI_FFI_H=",
+    "HAVE_FFI_H=",
+    "HAVE_LIBPFM=",
+    "HAVE_LIBPSAPI=",
+    "HAVE_MALLCTL=",
+    "HAVE_SIGNAL_H=1",
+    "HAVE_STRERROR=1",
+    "HAVE_SYS_STAT_H=1",
+    "HAVE_SYS_TYPES_H=1",
+    "HAVE__ALLOCA=",
+    "HAVE___ALLOCA=",
+    "HAVE___ASHLDI3=",
+    "HAVE___ASHRDI3=",
+    "HAVE___CHKSTK=",
+    "HAVE___CHKSTK_MS=",
+    "HAVE___CMPDI2=",
+    "HAVE___DIVDI3=",
+    "HAVE___FIXDFDI=",
+    "HAVE___FIXSFDI=",
+    "HAVE___FLOATDIDF=",
+    "HAVE___LSHRDI3=",
+    "HAVE___MAIN=",
+    "HAVE___MODDI3=",
+    "HAVE___UDIVDI3=",
+    "HAVE___UMODDI3=",
+    "HAVE____CHKSTK=",
+    "HAVE____CHKSTK_MS=",
+    "HOST_LINK_VERSION=",
+    "LLVM_TARGET_TRIPLE_ENV=",
+    "LLVM_VERSION_INFO=",
+    "LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO=1",
+    "PACKAGE_BUGREPORT=https://bugs.llvm.org/",
+    "PACKAGE_NAME=LLVM",
+    "PACKAGE_STRING=LLVM ${llvm_version}svn",
+    "PACKAGE_VERSION=${llvm_version}svn",
+    "PACKAGE_VENDOR=",
+    "RETSIGTYPE=void",
+    "LLVM_GISEL_COV_ENABLED=",
+    "LLVM_GISEL_COV_PREFIX=",
+
+    # This is both in llvm-config.h and config.h; llvm-config.h doesn't
+    # define it if it's not set while config.h defines it to empty in that case.
+    "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple",
+  ]
+
+  if (host_os == "linux") {
+    args += [
+      "HAVE_FUTIMENS=1",
+      "HAVE_LINK_H=1",
+      "HAVE_LSEEK64=1",
+      "HAVE_MALLINFO=1",
+      "HAVE_POSIX_FALLOCATE=1",
+      "HAVE_SCHED_GETAFFINITY=1",
+      "HAVE_CPU_COUNT=1",
+      "HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC=1",
+      "HAVE_VALGRIND_VALGRIND_H=1",
+    ]
+  } else {
+    args += [
+      "HAVE_FUTIMENS=",
+      "HAVE_LINK_H=",
+      "HAVE_LSEEK64=",
+      "HAVE_MALLINFO=",
+      "HAVE_POSIX_FALLOCATE=",
+      "HAVE_SCHED_GETAFFINITY=",
+      "HAVE_CPU_COUNT=",
+      "HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC=",
+      "HAVE_VALGRIND_VALGRIND_H=",
+    ]
+  }
+
+  if (host_os == "mac") {
+    args += [
+      "HAVE_CRASHREPORTER_INFO=1",
+      "HAVE_DECL_ARC4RANDOM=1",
+      "HAVE_DLADDR=1",
+      "HAVE_LIBEDIT=1",
+      "HAVE_MALLOC_H=",
+      "HAVE_MACH_MACH_H=1",
+      "HAVE_MALLOC_MALLOC_H=1",
+      "HAVE_MALLOC_ZONE_STATISTICS=1",
+      "HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC=1",
+    ]
+  } else {
+    args += [
+      "HAVE_CRASHREPORTER_INFO=",
+      "HAVE_DECL_ARC4RANDOM=",
+      "HAVE_DLADDR=",
+      "HAVE_LIBEDIT=",
+      "HAVE_MACH_MACH_H=",
+      "HAVE_MALLOC_H=1",
+      "HAVE_MALLOC_MALLOC_H=",
+      "HAVE_MALLOC_ZONE_STATISTICS=",
+      "HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC=",
+    ]
+  }
+
+  if (host_os == "win") {
+    args += [
+      "HAVE_BACKTRACE=",
+      "HAVE_DECL_STRERROR_S=1",
+      "HAVE_DLFCN_H=",
+      "HAVE_DLOPEN=",
+      "HAVE_FUTIMES=",
+      "HAVE_GETPAGESIZE=",
+      "HAVE_GETRLIMIT=",
+      "HAVE_GETRUSAGE=",
+      "HAVE_ISATTY=",
+      "HAVE_LIBPTHREAD=",
+      "HAVE_PTHREAD_GETNAME_NP=",
+      "HAVE_PTHREAD_SETNAME_NP=",
+      "HAVE_LIBZ=",
+      "HAVE_POSIX_SPAWN=",
+      "HAVE_PREAD=",
+      "HAVE_PTHREAD_GETSPECIFIC=",
+      "HAVE_PTHREAD_H=",
+      "HAVE_PTHREAD_MUTEX_LOCK=",
+      "HAVE_PTHREAD_RWLOCK_INIT=",
+      "HAVE_REALPATH=",
+      "HAVE_SBRK=",
+      "HAVE_SETENV=",
+      "HAVE_SETRLIMIT=",
+      "HAVE_SIGALTSTACK=",
+      "HAVE_STRERROR_R=",
+      "HAVE_SYSCONF=",
+      "HAVE_SYS_IOCTL_H=",
+      "HAVE_SYS_MMAN_H=",
+      "HAVE_SYS_PARAM_H=",
+      "HAVE_SYS_RESOURCE_H=",
+      "HAVE_SYS_TIME_H=",
+      "HAVE_TERMIOS_H=",
+      "HAVE_UNISTD_H=",
+      "HAVE_ZLIB_H=",
+      "HAVE__CHSIZE_S=1",
+      "HAVE__UNWIND_BACKTRACE=",
+      "stricmp=_stricmp",
+      "strdup=_strdup",
+    ]
+  } else {
+    # POSIX-y system defaults.
+    args += [
+      "HAVE_BACKTRACE=1",
+      "HAVE_DECL_STRERROR_S=",
+      "HAVE_DLFCN_H=1",
+      "HAVE_DLOPEN=1",
+      "HAVE_FUTIMES=1",
+      "HAVE_GETPAGESIZE=1",
+      "HAVE_GETRLIMIT=1",
+      "HAVE_GETRUSAGE=1",
+      "HAVE_ISATTY=1",
+      "HAVE_LIBPTHREAD=1",
+      "HAVE_PTHREAD_GETNAME_NP=1",
+      "HAVE_PTHREAD_SETNAME_NP=1",
+      "HAVE_LIBZ=1",
+      "HAVE_POSIX_SPAWN=1",
+      "HAVE_PREAD=1",
+      "HAVE_PTHREAD_GETSPECIFIC=1",
+      "HAVE_PTHREAD_H=1",
+      "HAVE_PTHREAD_MUTEX_LOCK=1",
+      "HAVE_PTHREAD_RWLOCK_INIT=1",
+      "HAVE_REALPATH=1",
+      "HAVE_SBRK=1",
+      "HAVE_SETENV=1",
+      "HAVE_SETRLIMIT=1",
+      "HAVE_SIGALTSTACK=1",
+      "HAVE_STRERROR_R=1",
+      "HAVE_SYSCONF=1",
+      "HAVE_SYS_IOCTL_H=1",
+      "HAVE_SYS_MMAN_H=1",
+      "HAVE_SYS_PARAM_H=1",
+      "HAVE_SYS_RESOURCE_H=1",
+      "HAVE_SYS_TIME_H=1",
+      "HAVE_TERMIOS_H=1",
+      "HAVE_UNISTD_H=1",
+      "HAVE_ZLIB_H=1",
+      "HAVE__CHSIZE_S=",
+      "HAVE__UNWIND_BACKTRACE=1",
+      "stricmp=",
+      "strdup=",
+    ]
+  }
+
+  if (host_os == "linux") {
+    args += [ "LTDL_SHLIB_EXT=.so" ]
+  } else if (host_os == "mac") {
+    args += [ "LTDL_SHLIB_EXT=.dylib" ]
+  } else if (host_os == "win") {
+    args += [ "LTDL_SHLIB_EXT=.dll" ]
+  }
+
+  if (llvm_enable_libxar) {
+    args += [ "HAVE_LIBXAR=1" ]
+  } else {
+    args += [ "HAVE_LIBXAR=" ]
+  }
+
+  if (llvm_enable_terminfo) {
+    args += [ "HAVE_TERMINFO=1" ]
+  } else {
+    args += [ "HAVE_TERMINFO=" ]
+  }
+
+  if (llvm_enable_zlib) {
+    args += [ "LLVM_ENABLE_ZLIB=1" ]
+  } else {
+    args += [ "LLVM_ENABLE_ZLIB=" ]
+  }
+
+  if (llvm_enable_libxml2) {
+    args += [ "LLVM_LIBXML2_ENABLED=1" ]
+  } else {
+    args += [ "LLVM_LIBXML2_ENABLED=" ]
+  }
+}
+
+action("llvm-config") {
+  script = "//llvm/utils/gn/build/write_cmake_config.py"
+
+  sources = [
+    "llvm-config.h.cmake",
+  ]
+  outputs = [
+    "$target_gen_dir/llvm-config.h",
+  ]
+  args = [
+    "-o",
+    rebase_path(outputs[0], root_out_dir),
+    rebase_path(sources[0], root_out_dir),
+
+    "LLVM_ENABLE_DUMP=",
+    "LINK_POLLY_INTO_TOOLS=",
+    "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple",
+    "LLVM_HAS_ATOMICS=1",
+    "LLVM_HOST_TRIPLE=$llvm_host_triple",
+    "LLVM_NATIVE_ARCH=$native_target",
+    "LLVM_NATIVE_ASMPARSER=1",
+    "LLVM_NATIVE_ASMPRINTER=1",
+    "LLVM_NATIVE_DISASSEMBLER=1",
+    "LLVM_NATIVE_TARGET=1",
+    "LLVM_NATIVE_TARGETINFO=1",
+    "LLVM_NATIVE_TARGETMC=1",
+    "LLVM_USE_INTEL_JITEVENTS=",
+    "LLVM_USE_OPROFILE=",
+    "LLVM_USE_PERF=",
+    "LLVM_VERSION_MAJOR=$llvm_version_major",
+    "LLVM_VERSION_MINOR=$llvm_version_minor",
+    "LLVM_VERSION_PATCH=$llvm_version_patch",
+    "PACKAGE_VERSION=${llvm_version}svn",
+    "LLVM_FORCE_ENABLE_STATS=",
+  ]
+
+  if (host_os == "win") {
+    args += [ "LLVM_ON_UNIX=" ]
+  } else {
+    args += [ "LLVM_ON_UNIX=1" ]
+  }
+
+  if (llvm_enable_threads) {
+    args += [ "LLVM_ENABLE_THREADS=1" ]
+  } else {
+    args += [ "LLVM_ENABLE_THREADS=" ]
+  }
+}
+
+##############################################################################
+# .def files used by llvm/lib/Target
+
+template("write_target_def_file") {
+  assert(defined(invoker.key), "callers must set key")
+  assert(defined(invoker.value), "callers must set value")
+
+  action(target_name) {
+    visibility = [ ":write_target_def_files" ]
+    script = "//llvm/utils/gn/build/write_cmake_config.py"
+
+    sources = [
+      "$target_name.in",
+    ]
+    outputs = [
+      "$target_gen_dir/$target_name",
+    ]
+
+    # Build something like
+    # `LLVM_ENUM_ASM_PARSERS=LLVM_ASM_PARSER(ARM)\nLLVM_ASM_PARSER(X86)\n`. Note
+    # that \n is a literal '\' followed by a literal 'n', not a newline
+    # character.  (write_cmake_config.py replaces that with a real newline).
+    value = ""
+    foreach(target, llvm_targets_to_build) {
+      value = "$value${invoker.value}($target)\n"
+    }
+    args = [
+      "-o",
+      rebase_path(outputs[0], root_out_dir),
+      rebase_path(sources[0], root_out_dir),
+      "${invoker.key}=$value",
+    ]
+  }
+}
+
+write_target_def_file("AsmParsers.def") {
+  key = "LLVM_ENUM_ASM_PARSERS"
+  value = "LLVM_ASM_PARSER"
+}
+
+write_target_def_file("AsmPrinters.def") {
+  key = "LLVM_ENUM_ASM_PRINTERS"
+  value = "LLVM_ASM_PRINTER"
+}
+
+write_target_def_file("Disassemblers.def") {
+  key = "LLVM_ENUM_DISASSEMBLERS"
+  value = "LLVM_DISASSEMBLER"
+}
+
+write_target_def_file("Targets.def") {
+  key = "LLVM_ENUM_TARGETS"
+  value = "LLVM_TARGET"
+}
+
+group("write_target_def_files") {
+  visibility = [
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+  ]
+  deps = [
+    ":AsmParsers.def",
+    ":AsmPrinters.def",
+    ":Disassemblers.def",
+    ":Targets.def",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn b/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..cc73446a582d0a11edc8b94c1f07adf59cde7cb1
--- /dev/null
+++ b/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn
@@ -0,0 +1,18 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("IntrinsicEnums") {
+  visibility = [ "//llvm/lib/IR" ]
+  args = [ "-gen-intrinsic-enums" ]
+  td_file = "Intrinsics.td"
+}
+
+tablegen("IntrinsicImpl") {
+  visibility = [ "//llvm/lib/IR" ]
+  args = [ "-gen-intrinsic-impl" ]
+  td_file = "Intrinsics.td"
+}
+
+tablegen("Attributes") {
+  visibility = [ "//llvm/lib/IR" ]
+  args = [ "-gen-attrs" ]
+}
diff --git a/utils/gn/secondary/llvm/include/llvm/Support/BUILD.gn b/utils/gn/secondary/llvm/include/llvm/Support/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..cf26a214c327fb0f09cda5ac23ad2fbfc87c7d19
--- /dev/null
+++ b/utils/gn/secondary/llvm/include/llvm/Support/BUILD.gn
@@ -0,0 +1,25 @@
+declare_args() {
+  # If this set to false, VCSRevision.h is updated after every git commit.
+  # That's technically correct, but results in rebuilds after every commit.
+  # If it's true (default), VCSRevision.h will usually be somewhat
+  # out-of-date, but builds will be faster.
+  llvm_allow_tardy_revision = true
+}
+
+action("write_vcsrevision") {
+  script = "//llvm/utils/gn/build/write_vcsrevision.py"
+  header = "$target_gen_dir/VCSRevision.h"
+
+  args = [ rebase_path(header, root_build_dir) ]
+  if (!llvm_allow_tardy_revision) {
+    depfile = "$header.d"
+    args += [
+      "-d",
+      rebase_path(depfile, root_build_dir),
+    ]
+  }
+
+  outputs = [
+    header,
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn b/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..5b203d4d7035543c4418c19ee8046f46af414b29
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn
@@ -0,0 +1,112 @@
+static_library("Analysis") {
+  output_name = "LLVMAnalysis"
+  public_deps = [
+    # Must be a public_dep because Analysis's headers include llvm-config.h.
+    "//llvm/include/llvm/Config:llvm-config",
+  ]
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/IR",
+    "//llvm/lib/Object",
+    "//llvm/lib/ProfileData",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "AliasAnalysis.cpp",
+    "AliasAnalysisEvaluator.cpp",
+    "AliasAnalysisSummary.cpp",
+    "AliasSetTracker.cpp",
+    "Analysis.cpp",
+    "AssumptionCache.cpp",
+    "BasicAliasAnalysis.cpp",
+    "BlockFrequencyInfo.cpp",
+    "BlockFrequencyInfoImpl.cpp",
+    "BranchProbabilityInfo.cpp",
+    "CFG.cpp",
+    "CFGPrinter.cpp",
+    "CFLAndersAliasAnalysis.cpp",
+    "CFLSteensAliasAnalysis.cpp",
+    "CGSCCPassManager.cpp",
+    "CallGraph.cpp",
+    "CallGraphSCCPass.cpp",
+    "CallPrinter.cpp",
+    "CaptureTracking.cpp",
+    "CmpInstAnalysis.cpp",
+    "CodeMetrics.cpp",
+    "ConstantFolding.cpp",
+    "CostModel.cpp",
+    "Delinearization.cpp",
+    "DemandedBits.cpp",
+    "DependenceAnalysis.cpp",
+    "DivergenceAnalysis.cpp",
+    "DomPrinter.cpp",
+    "DominanceFrontier.cpp",
+    "EHPersonalities.cpp",
+    "GlobalsModRef.cpp",
+    "GuardUtils.cpp",
+    "IVDescriptors.cpp",
+    "IVUsers.cpp",
+    "IndirectCallPromotionAnalysis.cpp",
+    "InlineCost.cpp",
+    "InstCount.cpp",
+    "InstructionPrecedenceTracking.cpp",
+    "InstructionSimplify.cpp",
+    "Interval.cpp",
+    "IntervalPartition.cpp",
+    "IteratedDominanceFrontier.cpp",
+    "LazyBlockFrequencyInfo.cpp",
+    "LazyBranchProbabilityInfo.cpp",
+    "LazyCallGraph.cpp",
+    "LazyValueInfo.cpp",
+    "LegacyDivergenceAnalysis.cpp",
+    "Lint.cpp",
+    "Loads.cpp",
+    "LoopAccessAnalysis.cpp",
+    "LoopAnalysisManager.cpp",
+    "LoopInfo.cpp",
+    "LoopPass.cpp",
+    "LoopUnrollAnalyzer.cpp",
+    "MemDepPrinter.cpp",
+    "MemDerefPrinter.cpp",
+    "MemoryBuiltins.cpp",
+    "MemoryDependenceAnalysis.cpp",
+    "MemoryLocation.cpp",
+    "MemorySSA.cpp",
+    "MemorySSAUpdater.cpp",
+    "ModuleDebugInfoPrinter.cpp",
+    "ModuleSummaryAnalysis.cpp",
+    "MustExecute.cpp",
+    "ObjCARCAliasAnalysis.cpp",
+    "ObjCARCAnalysisUtils.cpp",
+    "ObjCARCInstKind.cpp",
+    "OptimizationRemarkEmitter.cpp",
+    "OrderedBasicBlock.cpp",
+    "OrderedInstructions.cpp",
+    "PHITransAddr.cpp",
+    "PhiValues.cpp",
+    "PostDominators.cpp",
+    "ProfileSummaryInfo.cpp",
+    "PtrUseVisitor.cpp",
+    "RegionInfo.cpp",
+    "RegionPass.cpp",
+    "RegionPrinter.cpp",
+    "ScalarEvolution.cpp",
+    "ScalarEvolutionAliasAnalysis.cpp",
+    "ScalarEvolutionExpander.cpp",
+    "ScalarEvolutionNormalization.cpp",
+    "ScopedNoAliasAA.cpp",
+    "StackSafetyAnalysis.cpp",
+    "SyncDependenceAnalysis.cpp",
+    "SyntheticCountsUtils.cpp",
+    "TargetLibraryInfo.cpp",
+    "TargetTransformInfo.cpp",
+    "Trace.cpp",
+    "TypeBasedAliasAnalysis.cpp",
+    "TypeMetadataUtils.cpp",
+    "ValueLattice.cpp",
+    "ValueLatticeUtils.cpp",
+    "ValueTracking.cpp",
+    "VectorUtils.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/AsmParser/BUILD.gn b/utils/gn/secondary/llvm/lib/AsmParser/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..9ff794f7c9b32e94944ebd608eff73db5beab924
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/AsmParser/BUILD.gn
@@ -0,0 +1,13 @@
+static_library("AsmParser") {
+  output_name = "LLVMAsmParser"
+  deps = [
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "LLLexer.cpp",
+    "LLParser.cpp",
+    "Parser.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn b/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..1aa40fc282c190df8d0130dd5e3e07d26b25d2cd
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
@@ -0,0 +1,15 @@
+static_library("BinaryFormat") {
+  output_name = "LLVMBinaryFormat"
+  deps = [
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "AMDGPUMetadataVerifier.cpp",
+    "Dwarf.cpp",
+    "Magic.cpp",
+    "MsgPackReader.cpp",
+    "MsgPackTypes.cpp",
+    "MsgPackWriter.cpp",
+    "Wasm.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Bitcode/Reader/BUILD.gn b/utils/gn/secondary/llvm/lib/Bitcode/Reader/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..a7ecdf1f4e63ae4c3ca10dc417595190d65f5e86
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Bitcode/Reader/BUILD.gn
@@ -0,0 +1,16 @@
+static_library("Reader") {
+  output_name = "LLVMBitReader"
+  deps = [
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+  ]
+
+  sources = [
+    "BitReader.cpp",
+    "BitcodeReader.cpp",
+    "BitstreamReader.cpp",
+    "MetadataLoader.cpp",
+    "ValueList.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Bitcode/Writer/BUILD.gn b/utils/gn/secondary/llvm/lib/Bitcode/Writer/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..14683a0bf77b7079fc00818515cef1baf22167f1
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Bitcode/Writer/BUILD.gn
@@ -0,0 +1,18 @@
+static_library("Writer") {
+  output_name = "LLVMBitWriter"
+  deps = [
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+  ]
+
+  sources = [
+    "BitWriter.cpp",
+    "BitcodeWriter.cpp",
+    "BitcodeWriterPass.cpp",
+    "ValueEnumerator.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/CodeGen/AsmPrinter/BUILD.gn b/utils/gn/secondary/llvm/lib/CodeGen/AsmPrinter/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..ea42d0da36ab1eb07fa3ccb7d0f064340f4a0ff3
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/CodeGen/AsmPrinter/BUILD.gn
@@ -0,0 +1,42 @@
+static_library("AsmPrinter") {
+  output_name = "LLVMAsmPrinter"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/DebugInfo/MSF",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/MC/MCParser",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+  ]
+  sources = [
+    "ARMException.cpp",
+    "AccelTable.cpp",
+    "AddressPool.cpp",
+    "AsmPrinter.cpp",
+    "AsmPrinterDwarf.cpp",
+    "AsmPrinterInlineAsm.cpp",
+    "CodeViewDebug.cpp",
+    "DIE.cpp",
+    "DIEHash.cpp",
+    "DbgEntityHistoryCalculator.cpp",
+    "DebugHandlerBase.cpp",
+    "DebugLocStream.cpp",
+    "DwarfCFIException.cpp",
+    "DwarfCompileUnit.cpp",
+    "DwarfDebug.cpp",
+    "DwarfExpression.cpp",
+    "DwarfFile.cpp",
+    "DwarfStringPool.cpp",
+    "DwarfUnit.cpp",
+    "EHStreamer.cpp",
+    "ErlangGCPrinter.cpp",
+    "OcamlGCPrinter.cpp",
+    "WasmException.cpp",
+    "WinCFGuard.cpp",
+    "WinException.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..222978cc2aff355e5bef42697dbbe6d94f28e283
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -0,0 +1,183 @@
+static_library("CodeGen") {
+  output_name = "LLVMCodeGen"
+  public_deps = [
+    # Must be a public_dep because CodeGen's headers include llvm-config.h.
+    "//llvm/include/llvm/Config:llvm-config",
+  ]
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/Bitcode/Reader",
+    "//llvm/lib/Bitcode/Writer",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/ProfileData",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+    "//llvm/lib/Transforms/Scalar",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "AggressiveAntiDepBreaker.cpp",
+    "AllocationOrder.cpp",
+    "Analysis.cpp",
+    "AtomicExpandPass.cpp",
+    "BasicTargetTransformInfo.cpp",
+    "BranchFolding.cpp",
+    "BranchRelaxation.cpp",
+    "BreakFalseDeps.cpp",
+    "BuiltinGCs.cpp",
+    "CFIInstrInserter.cpp",
+    "CalcSpillWeights.cpp",
+    "CallingConvLower.cpp",
+    "CodeGen.cpp",
+    "CodeGenPrepare.cpp",
+    "CriticalAntiDepBreaker.cpp",
+    "DFAPacketizer.cpp",
+    "DeadMachineInstructionElim.cpp",
+    "DetectDeadLanes.cpp",
+    "DwarfEHPrepare.cpp",
+    "EarlyIfConversion.cpp",
+    "EdgeBundles.cpp",
+    "ExecutionDomainFix.cpp",
+    "ExpandISelPseudos.cpp",
+    "ExpandMemCmp.cpp",
+    "ExpandPostRAPseudos.cpp",
+    "ExpandReductions.cpp",
+    "FEntryInserter.cpp",
+    "FaultMaps.cpp",
+    "FuncletLayout.cpp",
+    "GCMetadata.cpp",
+    "GCMetadataPrinter.cpp",
+    "GCRootLowering.cpp",
+    "GCStrategy.cpp",
+    "GlobalMerge.cpp",
+    "IfConversion.cpp",
+    "ImplicitNullChecks.cpp",
+    "IndirectBrExpandPass.cpp",
+    "InlineSpiller.cpp",
+    "InterferenceCache.cpp",
+    "InterleavedAccessPass.cpp",
+    "InterleavedLoadCombinePass.cpp",
+    "IntrinsicLowering.cpp",
+    "LLVMTargetMachine.cpp",
+    "LatencyPriorityQueue.cpp",
+    "LazyMachineBlockFrequencyInfo.cpp",
+    "LexicalScopes.cpp",
+    "LiveDebugValues.cpp",
+    "LiveDebugVariables.cpp",
+    "LiveInterval.cpp",
+    "LiveIntervalUnion.cpp",
+    "LiveIntervals.cpp",
+    "LivePhysRegs.cpp",
+    "LiveRangeCalc.cpp",
+    "LiveRangeEdit.cpp",
+    "LiveRangeShrink.cpp",
+    "LiveRegMatrix.cpp",
+    "LiveRegUnits.cpp",
+    "LiveStacks.cpp",
+    "LiveVariables.cpp",
+    "LocalStackSlotAllocation.cpp",
+    "LoopTraversal.cpp",
+    "LowLevelType.cpp",
+    "LowerEmuTLS.cpp",
+    "MIRCanonicalizerPass.cpp",
+    "MIRPrinter.cpp",
+    "MIRPrintingPass.cpp",
+    "MachineBasicBlock.cpp",
+    "MachineBlockFrequencyInfo.cpp",
+    "MachineBlockPlacement.cpp",
+    "MachineBranchProbabilityInfo.cpp",
+    "MachineCSE.cpp",
+    "MachineCombiner.cpp",
+    "MachineCopyPropagation.cpp",
+    "MachineDominanceFrontier.cpp",
+    "MachineDominators.cpp",
+    "MachineFrameInfo.cpp",
+    "MachineFunction.cpp",
+    "MachineFunctionPass.cpp",
+    "MachineFunctionPrinterPass.cpp",
+    "MachineInstr.cpp",
+    "MachineInstrBundle.cpp",
+    "MachineLICM.cpp",
+    "MachineLoopInfo.cpp",
+    "MachineModuleInfo.cpp",
+    "MachineModuleInfoImpls.cpp",
+    "MachineOperand.cpp",
+    "MachineOptimizationRemarkEmitter.cpp",
+    "MachineOutliner.cpp",
+    "MachinePipeliner.cpp",
+    "MachinePostDominators.cpp",
+    "MachineRegionInfo.cpp",
+    "MachineRegisterInfo.cpp",
+    "MachineSSAUpdater.cpp",
+    "MachineScheduler.cpp",
+    "MachineSink.cpp",
+    "MachineTraceMetrics.cpp",
+    "MachineVerifier.cpp",
+    "MacroFusion.cpp",
+    "OptimizePHIs.cpp",
+    "PHIElimination.cpp",
+    "PHIEliminationUtils.cpp",
+    "ParallelCG.cpp",
+    "PatchableFunction.cpp",
+    "PeepholeOptimizer.cpp",
+    "PostRAHazardRecognizer.cpp",
+    "PostRASchedulerList.cpp",
+    "PreISelIntrinsicLowering.cpp",
+    "ProcessImplicitDefs.cpp",
+    "PrologEpilogInserter.cpp",
+    "PseudoSourceValue.cpp",
+    "ReachingDefAnalysis.cpp",
+    "RegAllocBase.cpp",
+    "RegAllocBasic.cpp",
+    "RegAllocFast.cpp",
+    "RegAllocGreedy.cpp",
+    "RegAllocPBQP.cpp",
+    "RegUsageInfoCollector.cpp",
+    "RegUsageInfoPropagate.cpp",
+    "RegisterClassInfo.cpp",
+    "RegisterCoalescer.cpp",
+    "RegisterPressure.cpp",
+    "RegisterScavenging.cpp",
+    "RegisterUsageInfo.cpp",
+    "RenameIndependentSubregs.cpp",
+    "ResetMachineFunctionPass.cpp",
+    "SafeStack.cpp",
+    "SafeStackColoring.cpp",
+    "SafeStackLayout.cpp",
+    "ScalarizeMaskedMemIntrin.cpp",
+    "ScheduleDAG.cpp",
+    "ScheduleDAGInstrs.cpp",
+    "ScheduleDAGPrinter.cpp",
+    "ScoreboardHazardRecognizer.cpp",
+    "ShadowStackGCLowering.cpp",
+    "ShrinkWrap.cpp",
+    "SjLjEHPrepare.cpp",
+    "SlotIndexes.cpp",
+    "SpillPlacement.cpp",
+    "SplitKit.cpp",
+    "StackColoring.cpp",
+    "StackMapLivenessAnalysis.cpp",
+    "StackMaps.cpp",
+    "StackProtector.cpp",
+    "StackSlotColoring.cpp",
+    "TailDuplication.cpp",
+    "TailDuplicator.cpp",
+    "TargetFrameLoweringImpl.cpp",
+    "TargetInstrInfo.cpp",
+    "TargetLoweringBase.cpp",
+    "TargetLoweringObjectFileImpl.cpp",
+    "TargetOptionsImpl.cpp",
+    "TargetPassConfig.cpp",
+    "TargetRegisterInfo.cpp",
+    "TargetSchedule.cpp",
+    "TargetSubtargetInfo.cpp",
+    "TwoAddressInstructionPass.cpp",
+    "UnreachableBlockElim.cpp",
+    "ValueTypes.cpp",
+    "VirtRegMap.cpp",
+    "WasmEHPrepare.cpp",
+    "WinEHPrepare.cpp",
+    "XRayInstrumentation.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn b/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..df8a11035873b46d9da11ffd8c9b154fab765872
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
@@ -0,0 +1,35 @@
+static_library("GlobalISel") {
+  output_name = "LLVMGlobalISel"
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "CallLowering.cpp",
+    "Combiner.cpp",
+    "CombinerHelper.cpp",
+    "GISelChangeObserver.cpp",
+    "GlobalISel.cpp",
+    "IRTranslator.cpp",
+    "InstructionSelect.cpp",
+    "InstructionSelector.cpp",
+    "LegalityPredicates.cpp",
+    "LegalizeMutations.cpp",
+    "Legalizer.cpp",
+    "LegalizerHelper.cpp",
+    "LegalizerInfo.cpp",
+    "Localizer.cpp",
+    "MachineIRBuilder.cpp",
+    "RegBankSelect.cpp",
+    "RegisterBank.cpp",
+    "RegisterBankInfo.cpp",
+    "Utils.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/CodeGen/MIRParser/BUILD.gn b/utils/gn/secondary/llvm/lib/CodeGen/MIRParser/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..99465448f02fa17aa208e359b0b104f5b383fc19
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/CodeGen/MIRParser/BUILD.gn
@@ -0,0 +1,17 @@
+static_library("MIRParser") {
+  output_name = "LLVMMIRParser"
+  deps = [
+    "//llvm/lib/AsmParser",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+  ]
+  sources = [
+    "MILexer.cpp",
+    "MIParser.cpp",
+    "MIRParser.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/CodeGen/SelectionDAG/BUILD.gn b/utils/gn/secondary/llvm/lib/CodeGen/SelectionDAG/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..fda56d73453448c2dd650d9a6ffeae33813ca48f
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/CodeGen/SelectionDAG/BUILD.gn
@@ -0,0 +1,39 @@
+static_library("SelectionDAG") {
+  output_name = "LLVMSelectionDAG"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "DAGCombiner.cpp",
+    "FastISel.cpp",
+    "FunctionLoweringInfo.cpp",
+    "InstrEmitter.cpp",
+    "LegalizeDAG.cpp",
+    "LegalizeFloatTypes.cpp",
+    "LegalizeIntegerTypes.cpp",
+    "LegalizeTypes.cpp",
+    "LegalizeTypesGeneric.cpp",
+    "LegalizeVectorOps.cpp",
+    "LegalizeVectorTypes.cpp",
+    "ResourcePriorityQueue.cpp",
+    "ScheduleDAGFast.cpp",
+    "ScheduleDAGRRList.cpp",
+    "ScheduleDAGSDNodes.cpp",
+    "ScheduleDAGVLIW.cpp",
+    "SelectionDAG.cpp",
+    "SelectionDAGAddressAnalysis.cpp",
+    "SelectionDAGBuilder.cpp",
+    "SelectionDAGDumper.cpp",
+    "SelectionDAGISel.cpp",
+    "SelectionDAGPrinter.cpp",
+    "SelectionDAGTargetInfo.cpp",
+    "StatepointLowering.cpp",
+    "TargetLowering.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/DebugInfo/CodeView/BUILD.gn b/utils/gn/secondary/llvm/lib/DebugInfo/CodeView/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..43d363d5fbcb1a3d33d9adbd6156b8cd76a423e2
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/DebugInfo/CodeView/BUILD.gn
@@ -0,0 +1,49 @@
+static_library("CodeView") {
+  output_name = "LLVMDebugInfoCodeView"
+  deps = [
+    "//llvm/lib/DebugInfo/MSF",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "AppendingTypeTableBuilder.cpp",
+    "CVSymbolVisitor.cpp",
+    "CVTypeVisitor.cpp",
+    "CodeViewError.cpp",
+    "CodeViewRecordIO.cpp",
+    "ContinuationRecordBuilder.cpp",
+    "DebugChecksumsSubsection.cpp",
+    "DebugCrossExSubsection.cpp",
+    "DebugCrossImpSubsection.cpp",
+    "DebugFrameDataSubsection.cpp",
+    "DebugInlineeLinesSubsection.cpp",
+    "DebugLinesSubsection.cpp",
+    "DebugStringTableSubsection.cpp",
+    "DebugSubsection.cpp",
+    "DebugSubsectionRecord.cpp",
+    "DebugSubsectionVisitor.cpp",
+    "DebugSymbolRVASubsection.cpp",
+    "DebugSymbolsSubsection.cpp",
+    "EnumTables.cpp",
+    "Formatters.cpp",
+    "GlobalTypeTableBuilder.cpp",
+    "LazyRandomTypeCollection.cpp",
+    "Line.cpp",
+    "MergingTypeTableBuilder.cpp",
+    "RecordName.cpp",
+    "RecordSerialization.cpp",
+    "SimpleTypeSerializer.cpp",
+    "StringsAndChecksums.cpp",
+    "SymbolDumper.cpp",
+    "SymbolRecordHelpers.cpp",
+    "SymbolRecordMapping.cpp",
+    "SymbolSerializer.cpp",
+    "TypeDumpVisitor.cpp",
+    "TypeHashing.cpp",
+    "TypeIndex.cpp",
+    "TypeIndexDiscovery.cpp",
+    "TypeRecordHelpers.cpp",
+    "TypeRecordMapping.cpp",
+    "TypeStreamMerger.cpp",
+    "TypeTableCollection.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn b/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..c93427ce636b48a111753bbecee45731a070794b
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn
@@ -0,0 +1,38 @@
+static_library("DWARF") {
+  output_name = "LLVMDebugInfoDWARF"
+  deps = [
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "DWARFAbbreviationDeclaration.cpp",
+    "DWARFAcceleratorTable.cpp",
+    "DWARFAddressRange.cpp",
+    "DWARFCompileUnit.cpp",
+    "DWARFContext.cpp",
+    "DWARFDataExtractor.cpp",
+    "DWARFDebugAbbrev.cpp",
+    "DWARFDebugAddr.cpp",
+    "DWARFDebugArangeSet.cpp",
+    "DWARFDebugAranges.cpp",
+    "DWARFDebugFrame.cpp",
+    "DWARFDebugInfoEntry.cpp",
+    "DWARFDebugLine.cpp",
+    "DWARFDebugLoc.cpp",
+    "DWARFDebugMacro.cpp",
+    "DWARFDebugPubTable.cpp",
+    "DWARFDebugRangeList.cpp",
+    "DWARFDebugRnglists.cpp",
+    "DWARFDie.cpp",
+    "DWARFExpression.cpp",
+    "DWARFFormValue.cpp",
+    "DWARFGdbIndex.cpp",
+    "DWARFListTable.cpp",
+    "DWARFTypeUnit.cpp",
+    "DWARFUnit.cpp",
+    "DWARFUnitIndex.cpp",
+    "DWARFVerifier.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/DebugInfo/MSF/BUILD.gn b/utils/gn/secondary/llvm/lib/DebugInfo/MSF/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..9f8fd86e6abf6fa6945dd7bb4866d088bfc019b6
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/DebugInfo/MSF/BUILD.gn
@@ -0,0 +1,12 @@
+static_library("MSF") {
+  output_name = "LLVMDebugInfoMSF"
+  deps = [
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "MSFBuilder.cpp",
+    "MSFCommon.cpp",
+    "MSFError.cpp",
+    "MappedBlockStream.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/DebugInfo/PDB/BUILD.gn b/utils/gn/secondary/llvm/lib/DebugInfo/PDB/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..63057681ad0afa7bcc3f90ccde758fa49b1720f3
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/DebugInfo/PDB/BUILD.gn
@@ -0,0 +1,118 @@
+import("//llvm/lib/DebugInfo/PDB/enable_dia.gni")
+
+static_library("PDB") {
+  output_name = "LLVMDebugInfoPDB"
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/DebugInfo/MSF",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "GenericError.cpp",
+    "IPDBSourceFile.cpp",
+    "Native/DbiModuleDescriptor.cpp",
+    "Native/DbiModuleDescriptorBuilder.cpp",
+    "Native/DbiModuleList.cpp",
+    "Native/DbiStream.cpp",
+    "Native/DbiStreamBuilder.cpp",
+    "Native/EnumTables.cpp",
+    "Native/GSIStreamBuilder.cpp",
+    "Native/GlobalsStream.cpp",
+    "Native/Hash.cpp",
+    "Native/HashTable.cpp",
+    "Native/InfoStream.cpp",
+    "Native/InfoStreamBuilder.cpp",
+    "Native/ModuleDebugStream.cpp",
+    "Native/NamedStreamMap.cpp",
+    "Native/NativeCompilandSymbol.cpp",
+    "Native/NativeEnumGlobals.cpp",
+    "Native/NativeEnumModules.cpp",
+    "Native/NativeEnumTypes.cpp",
+    "Native/NativeExeSymbol.cpp",
+    "Native/NativeRawSymbol.cpp",
+    "Native/NativeSession.cpp",
+    "Native/NativeSymbolEnumerator.cpp",
+    "Native/NativeTypeArray.cpp",
+    "Native/NativeTypeBuiltin.cpp",
+    "Native/NativeTypeEnum.cpp",
+    "Native/NativeTypeFunctionSig.cpp",
+    "Native/NativeTypePointer.cpp",
+    "Native/NativeTypeTypedef.cpp",
+    "Native/NativeTypeUDT.cpp",
+    "Native/NativeTypeVTShape.cpp",
+    "Native/PDBFile.cpp",
+    "Native/PDBFileBuilder.cpp",
+    "Native/PDBStringTable.cpp",
+    "Native/PDBStringTableBuilder.cpp",
+    "Native/PublicsStream.cpp",
+    "Native/RawError.cpp",
+    "Native/SymbolCache.cpp",
+    "Native/SymbolStream.cpp",
+    "Native/TpiHashing.cpp",
+    "Native/TpiStream.cpp",
+    "Native/TpiStreamBuilder.cpp",
+    "PDB.cpp",
+    "PDBContext.cpp",
+    "PDBExtras.cpp",
+    "PDBInterfaceAnchors.cpp",
+    "PDBSymDumper.cpp",
+    "PDBSymbol.cpp",
+    "PDBSymbolAnnotation.cpp",
+    "PDBSymbolBlock.cpp",
+    "PDBSymbolCompiland.cpp",
+    "PDBSymbolCompilandDetails.cpp",
+    "PDBSymbolCompilandEnv.cpp",
+    "PDBSymbolCustom.cpp",
+    "PDBSymbolData.cpp",
+    "PDBSymbolExe.cpp",
+    "PDBSymbolFunc.cpp",
+    "PDBSymbolFuncDebugEnd.cpp",
+    "PDBSymbolFuncDebugStart.cpp",
+    "PDBSymbolLabel.cpp",
+    "PDBSymbolPublicSymbol.cpp",
+    "PDBSymbolThunk.cpp",
+    "PDBSymbolTypeArray.cpp",
+    "PDBSymbolTypeBaseClass.cpp",
+    "PDBSymbolTypeBuiltin.cpp",
+    "PDBSymbolTypeCustom.cpp",
+    "PDBSymbolTypeDimension.cpp",
+    "PDBSymbolTypeEnum.cpp",
+    "PDBSymbolTypeFriend.cpp",
+    "PDBSymbolTypeFunctionArg.cpp",
+    "PDBSymbolTypeFunctionSig.cpp",
+    "PDBSymbolTypeManaged.cpp",
+    "PDBSymbolTypePointer.cpp",
+    "PDBSymbolTypeTypedef.cpp",
+    "PDBSymbolTypeUDT.cpp",
+    "PDBSymbolTypeVTable.cpp",
+    "PDBSymbolTypeVTableShape.cpp",
+    "PDBSymbolUnknown.cpp",
+    "PDBSymbolUsingNamespace.cpp",
+    "UDTLayout.cpp",
+  ]
+  if (llvm_enable_dia_sdk) {
+    sources += [
+      "DIA/DIADataStream.cpp",
+      "DIA/DIAEnumDebugStreams.cpp",
+      "DIA/DIAEnumFrameData.cpp",
+      "DIA/DIAEnumInjectedSources.cpp",
+      "DIA/DIAEnumLineNumbers.cpp",
+      "DIA/DIAEnumSectionContribs.cpp",
+      "DIA/DIAEnumSourceFiles.cpp",
+      "DIA/DIAEnumSymbols.cpp",
+      "DIA/DIAEnumTables.cpp",
+      "DIA/DIAError.cpp",
+      "DIA/DIAFrameData.cpp",
+      "DIA/DIAInjectedSource.cpp",
+      "DIA/DIALineNumber.cpp",
+      "DIA/DIARawSymbol.cpp",
+      "DIA/DIASectionContrib.cpp",
+      "DIA/DIASession.cpp",
+      "DIA/DIASourceFile.cpp",
+      "DIA/DIATable.cpp",
+    ]
+    # FIXME: Link against the right diaguids.lib too.
+  }
+}
diff --git a/utils/gn/secondary/llvm/lib/DebugInfo/PDB/enable_dia.gni b/utils/gn/secondary/llvm/lib/DebugInfo/PDB/enable_dia.gni
new file mode 100644
index 0000000000000000000000000000000000000000..9d397bcc9a4ecc09c40eb643f35830382d62a301
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/DebugInfo/PDB/enable_dia.gni
@@ -0,0 +1,4 @@
+declare_args() {
+  # Whether to build code that requires the Microsoft DIA SDK.
+  llvm_enable_dia_sdk = false
+}
diff --git a/utils/gn/secondary/llvm/lib/DebugInfo/Symbolize/BUILD.gn b/utils/gn/secondary/llvm/lib/DebugInfo/Symbolize/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..ecd59c311534d60e0d43dc58b982b734c31ad474
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/DebugInfo/Symbolize/BUILD.gn
@@ -0,0 +1,16 @@
+static_library("Symbolize") {
+  output_name = "LLVMSymbolize"
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/lib/DebugInfo/DWARF",
+    "//llvm/lib/DebugInfo/PDB",
+    "//llvm/lib/Demangle",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "DIPrinter.cpp",
+    "SymbolizableObjectFile.cpp",
+    "Symbolize.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Demangle/BUILD.gn b/utils/gn/secondary/llvm/lib/Demangle/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..8e97dfea30f77ab335527ddef683f2f4a66db614
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Demangle/BUILD.gn
@@ -0,0 +1,9 @@
+static_library("Demangle") {
+  output_name = "LLVMDemangle"
+
+  sources = [
+    "ItaniumDemangle.cpp",
+    "MicrosoftDemangle.cpp",
+    "MicrosoftDemangleNodes.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/IR/BUILD.gn b/utils/gn/secondary/llvm/lib/IR/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..35e3db91c93ccd91a3f03da024bde03266334143
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/IR/BUILD.gn
@@ -0,0 +1,81 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("AttributesCompatFunc") {
+  visibility = [ ":IR" ]
+  args = [ "-gen-attrs" ]
+}
+
+static_library("IR") {
+  output_name = "LLVMCore"
+  public_deps = [
+    # Must be public_dep because IR's public headers include llvm-config.h.
+    "//llvm/include/llvm/Config:llvm-config",
+
+    # Must be public_dep because IR's public headers include Attributes.inc.
+    "//llvm/include/llvm/IR:Attributes",
+
+    # Must be public_dep because IR's public headers include IntrinsicEnums.inc.
+    "//llvm/include/llvm/IR:IntrinsicEnums",
+  ]
+  deps = [
+    ":AttributesCompatFunc",
+    "//llvm/include/llvm/IR:IntrinsicImpl",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "AsmWriter.cpp",
+    "Attributes.cpp",
+    "AutoUpgrade.cpp",
+    "BasicBlock.cpp",
+    "Comdat.cpp",
+    "ConstantFold.cpp",
+    "ConstantRange.cpp",
+    "Constants.cpp",
+    "Core.cpp",
+    "DIBuilder.cpp",
+    "DataLayout.cpp",
+    "DebugInfo.cpp",
+    "DebugInfoMetadata.cpp",
+    "DebugLoc.cpp",
+    "DiagnosticHandler.cpp",
+    "DiagnosticInfo.cpp",
+    "DiagnosticPrinter.cpp",
+    "DomTreeUpdater.cpp",
+    "Dominators.cpp",
+    "Function.cpp",
+    "GVMaterializer.cpp",
+    "Globals.cpp",
+    "IRBuilder.cpp",
+    "IRPrintingPasses.cpp",
+    "InlineAsm.cpp",
+    "Instruction.cpp",
+    "Instructions.cpp",
+    "IntrinsicInst.cpp",
+    "LLVMContext.cpp",
+    "LLVMContextImpl.cpp",
+    "LegacyPassManager.cpp",
+    "MDBuilder.cpp",
+    "Mangler.cpp",
+    "Metadata.cpp",
+    "Module.cpp",
+    "ModuleSummaryIndex.cpp",
+    "Operator.cpp",
+    "OptBisect.cpp",
+    "Pass.cpp",
+    "PassInstrumentation.cpp",
+    "PassManager.cpp",
+    "PassRegistry.cpp",
+    "PassTimingInfo.cpp",
+    "ProfileSummary.cpp",
+    "SafepointIRVerifier.cpp",
+    "Statepoint.cpp",
+    "Type.cpp",
+    "TypeFinder.cpp",
+    "Use.cpp",
+    "User.cpp",
+    "Value.cpp",
+    "ValueSymbolTable.cpp",
+    "Verifier.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/IRReader/BUILD.gn b/utils/gn/secondary/llvm/lib/IRReader/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..1310f3581261f2b5780b31b3370cb88303346390
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/IRReader/BUILD.gn
@@ -0,0 +1,12 @@
+static_library("IRReader") {
+  output_name = "LLVMIRReader"
+  deps = [
+    "//llvm/lib/AsmParser",
+    "//llvm/lib/Bitcode/Reader",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "IRReader.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/LTO/BUILD.gn b/utils/gn/secondary/llvm/lib/LTO/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..f24270f834d50d2df5393ee3ac5cf98534474748
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/LTO/BUILD.gn
@@ -0,0 +1,33 @@
+static_library("LTO") {
+  output_name = "LLVMLTO"
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/Bitcode/Reader",
+    "//llvm/lib/Bitcode/Writer",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/IR",
+    "//llvm/lib/Linker",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/Passes",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+    "//llvm/lib/Transforms/AggressiveInstCombine",
+    "//llvm/lib/Transforms/IPO",
+    "//llvm/lib/Transforms/InstCombine",
+    "//llvm/lib/Transforms/ObjCARC",
+    "//llvm/lib/Transforms/Scalar",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "Caching.cpp",
+    "LTO.cpp",
+    "LTOBackend.cpp",
+    "LTOCodeGenerator.cpp",
+    "LTOModule.cpp",
+    "SummaryBasedOptimizations.cpp",
+    "ThinLTOCodeGenerator.cpp",
+    "UpdateCompilerUsed.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Linker/BUILD.gn b/utils/gn/secondary/llvm/lib/Linker/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..42dbd92b48a6ecee4f99c3b4356c96608d07f039
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Linker/BUILD.gn
@@ -0,0 +1,12 @@
+static_library("Linker") {
+  output_name = "LLVMLinker"
+  deps = [
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "IRMover.cpp",
+    "LinkModules.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/MC/BUILD.gn b/utils/gn/secondary/llvm/lib/MC/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..3a5251f966c0e9961a97bbe93e4dfee4811154de
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/MC/BUILD.gn
@@ -0,0 +1,69 @@
+static_library("MC") {
+  output_name = "LLVMMC"
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/Support",
+  ]
+  public_deps = [
+    # Must be a public_dep because MC's headers include llvm-config.h.
+    "//llvm/include/llvm/Config:llvm-config",
+  ]
+  sources = [
+    "ConstantPools.cpp",
+    "ELFObjectWriter.cpp",
+    "MCAsmBackend.cpp",
+    "MCAsmInfo.cpp",
+    "MCAsmInfoCOFF.cpp",
+    "MCAsmInfoDarwin.cpp",
+    "MCAsmInfoELF.cpp",
+    "MCAsmInfoWasm.cpp",
+    "MCAsmMacro.cpp",
+    "MCAsmStreamer.cpp",
+    "MCAssembler.cpp",
+    "MCCodeEmitter.cpp",
+    "MCCodePadder.cpp",
+    "MCCodeView.cpp",
+    "MCContext.cpp",
+    "MCDwarf.cpp",
+    "MCELFObjectTargetWriter.cpp",
+    "MCELFStreamer.cpp",
+    "MCExpr.cpp",
+    "MCFragment.cpp",
+    "MCInst.cpp",
+    "MCInstPrinter.cpp",
+    "MCInstrAnalysis.cpp",
+    "MCInstrDesc.cpp",
+    "MCLabel.cpp",
+    "MCLinkerOptimizationHint.cpp",
+    "MCMachOStreamer.cpp",
+    "MCMachObjectTargetWriter.cpp",
+    "MCNullStreamer.cpp",
+    "MCObjectFileInfo.cpp",
+    "MCObjectStreamer.cpp",
+    "MCObjectWriter.cpp",
+    "MCRegisterInfo.cpp",
+    "MCSchedule.cpp",
+    "MCSection.cpp",
+    "MCSectionCOFF.cpp",
+    "MCSectionELF.cpp",
+    "MCSectionMachO.cpp",
+    "MCSectionWasm.cpp",
+    "MCStreamer.cpp",
+    "MCSubtargetInfo.cpp",
+    "MCSymbol.cpp",
+    "MCSymbolELF.cpp",
+    "MCTargetOptions.cpp",
+    "MCValue.cpp",
+    "MCWasmObjectTargetWriter.cpp",
+    "MCWasmStreamer.cpp",
+    "MCWin64EH.cpp",
+    "MCWinCOFFStreamer.cpp",
+    "MCWinEH.cpp",
+    "MachObjectWriter.cpp",
+    "StringTableBuilder.cpp",
+    "SubtargetFeature.cpp",
+    "WasmObjectWriter.cpp",
+    "WinCOFFObjectWriter.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/MC/MCDisassembler/BUILD.gn b/utils/gn/secondary/llvm/lib/MC/MCDisassembler/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..6dcdacd8af7492de3410e57943dea9ab7dfa768f
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/MC/MCDisassembler/BUILD.gn
@@ -0,0 +1,14 @@
+static_library("MCDisassembler") {
+  output_name = "LLVMMCDisassembler"
+  deps = [
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "Disassembler.cpp",
+    "MCDisassembler.cpp",
+    "MCExternalSymbolizer.cpp",
+    "MCRelocationInfo.cpp",
+    "MCSymbolizer.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/MC/MCParser/BUILD.gn b/utils/gn/secondary/llvm/lib/MC/MCParser/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..5c9d23074693d23c5b087747702902c165b61503
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/MC/MCParser/BUILD.gn
@@ -0,0 +1,20 @@
+static_library("MCParser") {
+  output_name = "LLVMMCParser"
+  deps = [
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+  ]
+
+  sources = [
+    "AsmLexer.cpp",
+    "AsmParser.cpp",
+    "COFFAsmParser.cpp",
+    "DarwinAsmParser.cpp",
+    "ELFAsmParser.cpp",
+    "MCAsmLexer.cpp",
+    "MCAsmParser.cpp",
+    "MCAsmParserExtension.cpp",
+    "MCTargetAsmParser.cpp",
+    "WasmAsmParser.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Object/BUILD.gn b/utils/gn/secondary/llvm/lib/Object/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..4f2dd4e5181843bf2807abde2a4cdefcd36c1d32
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Object/BUILD.gn
@@ -0,0 +1,38 @@
+static_library("Object") {
+  output_name = "LLVMObject"
+  deps = [
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/include/llvm/Support:write_vcsrevision",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/Bitcode/Reader",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/MC/MCParser",
+    "//llvm/lib/Support",
+  ]
+
+  sources = [
+    "Archive.cpp",
+    "ArchiveWriter.cpp",
+    "Binary.cpp",
+    "COFFImportFile.cpp",
+    "COFFModuleDefinition.cpp",
+    "COFFObjectFile.cpp",
+    "Decompressor.cpp",
+    "ELF.cpp",
+    "ELFObjectFile.cpp",
+    "Error.cpp",
+    "IRObjectFile.cpp",
+    "IRSymtab.cpp",
+    "MachOObjectFile.cpp",
+    "MachOUniversal.cpp",
+    "ModuleSymbolTable.cpp",
+    "Object.cpp",
+    "ObjectFile.cpp",
+    "RecordStreamer.cpp",
+    "SymbolSize.cpp",
+    "SymbolicFile.cpp",
+    "WasmObjectFile.cpp",
+    "WindowsResource.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/ObjectYAML/BUILD.gn b/utils/gn/secondary/llvm/lib/ObjectYAML/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..27367ff1b4ef3b5a56b713c667690e9aba67d310
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/ObjectYAML/BUILD.gn
@@ -0,0 +1,22 @@
+static_library("ObjectYAML") {
+  output_name = "LLVMObjectYAML"
+  deps = [
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "COFFYAML.cpp",
+    "CodeViewYAMLDebugSections.cpp",
+    "CodeViewYAMLSymbols.cpp",
+    "CodeViewYAMLTypeHashing.cpp",
+    "CodeViewYAMLTypes.cpp",
+    "DWARFEmitter.cpp",
+    "DWARFVisitor.cpp",
+    "DWARFYAML.cpp",
+    "ELFYAML.cpp",
+    "MachOYAML.cpp",
+    "ObjectYAML.cpp",
+    "WasmYAML.cpp",
+    "YAML.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Option/BUILD.gn b/utils/gn/secondary/llvm/lib/Option/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..51fe4651b168c5be6eb8424d45c02cff341cf768
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Option/BUILD.gn
@@ -0,0 +1,12 @@
+static_library("Option") {
+  output_name = "LLVMOption"
+  deps = [
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "Arg.cpp",
+    "ArgList.cpp",
+    "OptTable.cpp",
+    "Option.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Passes/BUILD.gn b/utils/gn/secondary/llvm/lib/Passes/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..c8a9596da157877316312b79de6c4b3c068da7cf
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Passes/BUILD.gn
@@ -0,0 +1,22 @@
+static_library("Passes") {
+  output_name = "LLVMPasses"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+    "//llvm/lib/Transforms/AggressiveInstCombine",
+    "//llvm/lib/Transforms/IPO",
+    "//llvm/lib/Transforms/InstCombine",
+    "//llvm/lib/Transforms/Instrumentation",
+    "//llvm/lib/Transforms/Scalar",
+    "//llvm/lib/Transforms/Utils",
+    "//llvm/lib/Transforms/Vectorize",
+  ]
+  sources = [
+    "PassBuilder.cpp",
+    "PassPlugin.cpp",
+    "StandardInstrumentations.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn b/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..d2197d0f0d4478905fcefea9d1ad9edebf3bad72
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn
@@ -0,0 +1,17 @@
+static_library("ProfileData") {
+  output_name = "LLVMProfileData"
+  deps = [
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "GCOV.cpp",
+    "InstrProf.cpp",
+    "InstrProfReader.cpp",
+    "InstrProfWriter.cpp",
+    "ProfileSummaryBuilder.cpp",
+    "SampleProf.cpp",
+    "SampleProfReader.cpp",
+    "SampleProfWriter.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/utils/gn/secondary/llvm/lib/Support/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..84ae80fbcfa26856e2523af2413eda2a58603663
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -0,0 +1,161 @@
+static_library("Support") {
+  output_name = "LLVMSupport"
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/lib/Demangle",
+    "//llvm/utils/gn/build/libs/pthread",
+    "//llvm/utils/gn/build/libs/terminfo",
+    "//llvm/utils/gn/build/libs/zlib",
+  ]
+
+  # public_deps are used for depending on targets that generate headers
+  # which are included in public headers of this target. public_deps means
+  # that targets depending on Support will implicitly be built only after
+  # abi-breaking and llvm-config have been built.
+  public_deps = [
+    # abi-breaking.h is also include by public headers in ADT, but ADT has
+    # no target in the gn build.  Since everything depends on Support, this
+    # public_dep does double duty of abi-breaking.h uses in public headers of
+    # both Support and ADT.
+    "//llvm/include/llvm/Config:abi-breaking",
+    "//llvm/include/llvm/Config:llvm-config",
+
+    # public_dep because public header TargetSelect.h includes these .def files.
+    "//llvm/include/llvm/Config:write_target_def_files",
+  ]
+  include_dirs = [
+    "Unix",
+    "Windows",
+  ]
+  sources = [
+    "AArch64TargetParser.cpp",
+    "AMDGPUMetadata.cpp",
+    "APFloat.cpp",
+    "APInt.cpp",
+    "APSInt.cpp",
+    "ARMAttributeParser.cpp",
+    "ARMBuildAttrs.cpp",
+    "ARMTargetParser.cpp",
+    "ARMWinEH.cpp",
+    "Allocator.cpp",
+    "BinaryStreamError.cpp",
+    "BinaryStreamReader.cpp",
+    "BinaryStreamRef.cpp",
+    "BinaryStreamWriter.cpp",
+    "BlockFrequency.cpp",
+    "BranchProbability.cpp",
+    "BuryPointer.cpp",
+    "COM.cpp",
+    "CachePruning.cpp",
+    "Chrono.cpp",
+    "CodeGenCoverage.cpp",
+    "CommandLine.cpp",
+    "Compression.cpp",
+    "ConvertUTF.cpp",
+    "ConvertUTFWrapper.cpp",
+    "CrashRecoveryContext.cpp",
+    "DAGDeltaAlgorithm.cpp",
+    "DJB.cpp",
+    "DataExtractor.cpp",
+    "Debug.cpp",
+    "DebugCounter.cpp",
+    "DeltaAlgorithm.cpp",
+    "Error.cpp",
+    "ErrorHandling.cpp",
+    "FileCheck.cpp",
+    "FileOutputBuffer.cpp",
+    "FileUtilities.cpp",
+    "FoldingSet.cpp",
+    "FormatVariadic.cpp",
+    "FormattedStream.cpp",
+    "GlobPattern.cpp",
+    "GraphWriter.cpp",
+    "Hashing.cpp",
+    "InitLLVM.cpp",
+    "IntEqClasses.cpp",
+    "IntervalMap.cpp",
+    "ItaniumManglingCanonicalizer.cpp",
+    "JSON.cpp",
+    "JamCRC.cpp",
+    "KnownBits.cpp",
+    "LEB128.cpp",
+    "LineIterator.cpp",
+    "Locale.cpp",
+    "LockFileManager.cpp",
+    "LowLevelType.cpp",
+    "MD5.cpp",
+    "ManagedStatic.cpp",
+    "MathExtras.cpp",
+    "MemoryBuffer.cpp",
+    "NativeFormatting.cpp",
+    "Options.cpp",
+    "Parallel.cpp",
+    "PluginLoader.cpp",
+    "PrettyStackTrace.cpp",
+    "RandomNumberGenerator.cpp",
+    "Regex.cpp",
+    "SHA1.cpp",
+    "ScaledNumber.cpp",
+    "ScopedPrinter.cpp",
+    "SmallPtrSet.cpp",
+    "SmallVector.cpp",
+    "SourceMgr.cpp",
+    "SpecialCaseList.cpp",
+    "Statistic.cpp",
+    "StringExtras.cpp",
+    "StringMap.cpp",
+    "StringPool.cpp",
+    "StringRef.cpp",
+    "StringSaver.cpp",
+    "SymbolRemappingReader.cpp",
+    "SystemUtils.cpp",
+    "TarWriter.cpp",
+    "TargetParser.cpp",
+    "ThreadPool.cpp",
+    "Timer.cpp",
+    "ToolOutputFile.cpp",
+    "TrigramIndex.cpp",
+    "Triple.cpp",
+    "Twine.cpp",
+    "Unicode.cpp",
+    "UnicodeCaseFold.cpp",
+    "VersionTuple.cpp",
+    "WithColor.cpp",
+    "YAMLParser.cpp",
+    "YAMLTraits.cpp",
+    "circular_raw_ostream.cpp",
+    "raw_os_ostream.cpp",
+    "raw_ostream.cpp",
+    "regcomp.c",
+    "regerror.c",
+    "regexec.c",
+    "regfree.c",
+    "regstrlcpy.c",
+    "xxhash.cpp",
+
+    # System
+    "Atomic.cpp",
+    "DynamicLibrary.cpp",
+    "Errno.cpp",
+    "Host.cpp",
+    "Memory.cpp",
+    "Mutex.cpp",
+    "Path.cpp",
+    "Process.cpp",
+    "Program.cpp",
+    "RWMutex.cpp",
+    "Signals.cpp",
+    "TargetRegistry.cpp",
+    "ThreadLocal.cpp",
+    "Threading.cpp",
+    "Valgrind.cpp",
+    "VirtualFileSystem.cpp",
+    "Watchdog.cpp",
+  ]
+
+  libs = []
+
+  if (host_os == "linux") {
+    libs += [ "dl" ]
+  }
+}
diff --git a/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn b/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..8b94dbb93736b36dae251b3325be94894f632935
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn
@@ -0,0 +1,17 @@
+static_library("TableGen") {
+  output_name = "LLVMTableGen"
+  deps = [
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "Error.cpp",
+    "JSONBackend.cpp",
+    "Main.cpp",
+    "Record.cpp",
+    "SetTheory.cpp",
+    "StringMatcher.cpp",
+    "TGLexer.cpp",
+    "TGParser.cpp",
+    "TableGenBackend.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..54b6082fedaf3237b55cdd74bb25be7c51aea7eb
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/BUILD.gn
@@ -0,0 +1,90 @@
+import("//llvm/lib/Target/targets.gni")
+
+# This build file has two parts:
+# 1. The actual //llvm/lib/Target build target, which is just a static
+#    library containing the cpp files in this directory.  It contains general
+#    shared target code.
+# 2. Forwarding targets that forward to the concrete targets (X86, ARM, ...).
+#    These are defined in subdirectories, and the forwarding names match
+#    the names of the forwarding targets in CMake.  They all (indirectly,
+#    through CodeGen) depend on the //llvm/lib/Target build target.
+# (See also `gn help labels`).
+# The dependency chain is:
+# //llvm/lib/Target:TargetsToBuild (a target in this file) ->
+# /llvm/lib/Target/(X86|ARM|...) (in the subdirectories) ->
+# //llvm/lib/CodeGen ->
+# //llvm/lib/Target (a target in this file again)
+# Note that while this file appears twice in that stack, it's with different
+# targets in this file, so there's no cyclic dependency.
+
+# 1. Actual build target.
+static_library("Target") {
+  output_name = "LLVMTarget"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+  ]
+  public_deps = [
+    # This is a bit of a hack: llvm-c/Target.h includes llvm/Config/Targets.def,
+    # but there's no target corresponding to llvm-c. Since the functions
+    # declared in llvm-c/Target.h are defined in llvm/lib/Target, clients of
+    # it must depend on llvm/lib/Target, so add the public_dep for Targets.def
+    # here.
+    "//llvm/include/llvm/Config:write_target_def_files",
+  ]
+  sources = [
+    "Target.cpp",
+    "TargetIntrinsicInfo.cpp",
+    "TargetLoweringObjectFile.cpp",
+    "TargetMachine.cpp",
+    "TargetMachineC.cpp",
+  ]
+}
+
+# 2. Forwarding targets.
+group("NativeTarget") {
+  deps = [
+    "$native_target",
+  ]
+}
+
+group("TargetsToBuild") {
+  deps = llvm_targets_to_build
+}
+
+group("AllTargetsAsmParsers") {
+  deps = []
+  foreach(target, llvm_targets_to_build) {
+    deps += [ "$target/AsmParser" ]
+  }
+}
+
+group("AllTargetsAsmPrinters") {
+  deps = []
+  foreach(target, llvm_targets_to_build) {
+    deps += [ "$target/InstPrinter" ]
+  }
+}
+
+group("AllTargetsDescs") {
+  deps = []
+  foreach(target, llvm_targets_to_build) {
+    deps += [ "$target/MCTargetDesc" ]
+  }
+}
+
+group("AllTargetsDisassemblers") {
+  deps = []
+  foreach(target, llvm_targets_to_build) {
+    deps += [ "$target/Disassembler" ]
+  }
+}
+
+group("AllTargetsInfos") {
+  deps = []
+  foreach(target, llvm_targets_to_build) {
+    deps += [ "$target/TargetInfo" ]
+  }
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/X86/AsmParser/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/X86/AsmParser/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..f0dcf4d8a79faded831591f59f8ed3419eb20cdb
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/X86/AsmParser/BUILD.gn
@@ -0,0 +1,25 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("X86GenAsmMatcher") {
+  visibility = [ ":AsmParser" ]
+  args = [ "-gen-asm-matcher" ]
+  td_file = "../X86.td"
+}
+
+static_library("AsmParser") {
+  output_name = "LLVMX86AsmParser"
+  deps = [
+    ":X86GenAsmMatcher",
+    "//llvm/lib/MC",
+    "//llvm/lib/MC/MCParser",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target/X86/InstPrinter",
+    "//llvm/lib/Target/X86/MCTargetDesc",
+    "//llvm/lib/Target/X86/TargetInfo",
+  ]
+  include_dirs = [ ".." ]
+  sources = [
+    "X86AsmInstrumentation.cpp",
+    "X86AsmParser.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..36ec6c53b463d4e0607ded5e4ed40576ea65ad61
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
@@ -0,0 +1,145 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+declare_args() {
+  # Buggy, only use if you know what you're doing.
+  x86_gen_fold_tables = false
+}
+
+tablegen("X86GenCallingConv") {
+  visibility = [ ":LLVMX86CodeGen" ]
+  args = [ "-gen-callingconv" ]
+  td_file = "X86.td"
+}
+
+tablegen("X86GenDAGISel") {
+  visibility = [ ":LLVMX86CodeGen" ]
+  args = [ "-gen-dag-isel" ]
+  td_file = "X86.td"
+}
+
+tablegen("X86GenEVEX2VEXTables") {
+  visibility = [ ":LLVMX86CodeGen" ]
+  args = [ "-gen-x86-EVEX2VEX-tables" ]
+  td_file = "X86.td"
+}
+
+tablegen("X86GenFastISel") {
+  visibility = [ ":LLVMX86CodeGen" ]
+  args = [ "-gen-fast-isel" ]
+  td_file = "X86.td"
+}
+
+tablegen("X86GenGlobalISel") {
+  visibility = [ ":LLVMX86CodeGen" ]
+  args = [ "-gen-global-isel" ]
+  td_file = "X86.td"
+}
+
+tablegen("X86GenRegisterBank") {
+  visibility = [ ":LLVMX86CodeGen" ]
+  args = [ "-gen-register-bank" ]
+  td_file = "X86.td"
+}
+
+if (x86_gen_fold_tables) {
+  tablegen("X86GenFoldTables") {
+    visibility = [ ":LLVMX86CodeGen" ]
+    args = [ "-gen-x86-fold-tables" ]
+    td_file = "X86.td"
+  }
+}
+
+static_library("LLVMX86CodeGen") {
+  deps = [
+    ":X86GenCallingConv",
+    ":X86GenDAGISel",
+    ":X86GenEVEX2VEXTables",
+    ":X86GenFastISel",
+    ":X86GenGlobalISel",
+    ":X86GenRegisterBank",
+    "InstPrinter",
+    "MCTargetDesc",
+    "TargetInfo",
+    "Utils",
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/CodeGen/AsmPrinter",
+    "//llvm/lib/CodeGen/GlobalISel",
+    "//llvm/lib/CodeGen/SelectionDAG",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+  ]
+  if (x86_gen_fold_tables) {
+    deps += [ ":X86GenFoldTables" ]
+  }
+  sources = [
+    "ShadowCallStack.cpp",
+    "X86AsmPrinter.cpp",
+    "X86AvoidStoreForwardingBlocks.cpp",
+    "X86CallFrameOptimization.cpp",
+    "X86CallLowering.cpp",
+    "X86CallingConv.cpp",
+    "X86CmovConversion.cpp",
+    "X86CondBrFolding.cpp",
+    "X86DiscriminateMemOps.cpp",
+    "X86DomainReassignment.cpp",
+    "X86EvexToVex.cpp",
+    "X86ExpandPseudo.cpp",
+    "X86FastISel.cpp",
+    "X86FixupBWInsts.cpp",
+    "X86FixupLEAs.cpp",
+    "X86FixupSetCC.cpp",
+    "X86FlagsCopyLowering.cpp",
+    "X86FloatingPoint.cpp",
+    "X86FrameLowering.cpp",
+    "X86ISelDAGToDAG.cpp",
+    "X86ISelLowering.cpp",
+    "X86IndirectBranchTracking.cpp",
+    "X86InsertPrefetch.cpp",
+    "X86InstrFMA3Info.cpp",
+    "X86InstrFoldTables.cpp",
+    "X86InstrInfo.cpp",
+    "X86InstructionSelector.cpp",
+    "X86InterleavedAccess.cpp",
+    "X86LegalizerInfo.cpp",
+    "X86MCInstLower.cpp",
+    "X86MachineFunctionInfo.cpp",
+    "X86MacroFusion.cpp",
+    "X86OptimizeLEAs.cpp",
+    "X86PadShortFunction.cpp",
+    "X86RegisterBankInfo.cpp",
+    "X86RegisterInfo.cpp",
+    "X86RetpolineThunks.cpp",
+    "X86SelectionDAGInfo.cpp",
+    "X86ShuffleDecodeConstantPool.cpp",
+    "X86SpeculativeLoadHardening.cpp",
+    "X86Subtarget.cpp",
+    "X86TargetMachine.cpp",
+    "X86TargetObjectFile.cpp",
+    "X86TargetTransformInfo.cpp",
+    "X86VZeroUpper.cpp",
+    "X86WinAllocaExpander.cpp",
+    "X86WinEHState.cpp",
+  ]
+}
+
+# This is a bit different from most build files: Due to this group
+# having the directory's name, "//llvm/lib/Target/X86" will refer to this
+# target, which pulls in the code in this directory *and all subdirectories*.
+# For most other directories, "//llvm/lib/Foo" only pulls in the code directly
+# in "llvm/lib/Foo". The forwarding targets in //llvm/lib/Target expect this
+# different behavior.
+group("X86") {
+  deps = [
+    ":LLVMX86CodeGen",
+    "AsmParser",
+    "Disassembler",
+    "InstPrinter",
+    "MCTargetDesc",
+    "TargetInfo",
+    "Utils",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/X86/Disassembler/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/X86/Disassembler/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..d3e9e30faead2902be4f8022627b422c94dbd459
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/X86/Disassembler/BUILD.gn
@@ -0,0 +1,23 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("X86GenDisassemblerTables") {
+  visibility = [ ":Disassembler" ]
+  args = [ "-gen-disassembler" ]
+  td_file = "../X86.td"
+}
+
+static_library("Disassembler") {
+  output_name = "LLVMX86Disassembler"
+  deps = [
+    ":X86GenDisassemblerTables",
+    "//llvm/lib/MC/MCDisassembler",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target/X86/MCTargetDesc",
+    "//llvm/lib/Target/X86/TargetInfo",
+  ]
+  include_dirs = [ ".." ]
+  sources = [
+    "X86Disassembler.cpp",
+    "X86DisassemblerDecoder.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/X86/InstPrinter/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/X86/InstPrinter/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..4ca806dba949724c2b3beaadff903eb8de9b3910
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/X86/InstPrinter/BUILD.gn
@@ -0,0 +1,38 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("X86GenAsmWriter") {
+  visibility = [ ":InstPrinter" ]
+  args = [ "-gen-asm-writer" ]
+  td_file = "../X86.td"
+}
+
+tablegen("X86GenAsmWriter1") {
+  visibility = [ ":InstPrinter" ]
+  args = [
+    "-gen-asm-writer",
+    "-asmwriternum=1",
+  ]
+  td_file = "../X86.td"
+}
+
+static_library("InstPrinter") {
+  output_name = "LLVMX86AsmPrinter"
+  deps = [
+    ":X86GenAsmWriter",
+    ":X86GenAsmWriter1",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+
+    # MCTargetDesc depends on InstPrinter, so we can't depend on the full
+    # MCTargetDesc target here: it would form a cycle.
+    "//llvm/lib/Target/X86/MCTargetDesc:tablegen",
+    "//llvm/lib/Target/X86/Utils",
+  ]
+  include_dirs = [ ".." ]
+  sources = [
+    "X86ATTInstPrinter.cpp",
+    "X86InstComments.cpp",
+    "X86InstPrinterCommon.cpp",
+    "X86IntelInstPrinter.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/X86/MCTargetDesc/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/X86/MCTargetDesc/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..426f9a5244c16effb15dd0d35c09c89881af63e7
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/X86/MCTargetDesc/BUILD.gn
@@ -0,0 +1,59 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("X86GenInstrInfo") {
+  visibility = [ ":tablegen" ]
+  args = [ "-gen-instr-info" ]
+  td_file = "../X86.td"
+}
+
+tablegen("X86GenRegisterInfo") {
+  visibility = [ ":tablegen" ]
+  args = [ "-gen-register-info" ]
+  td_file = "../X86.td"
+}
+
+tablegen("X86GenSubtargetInfo") {
+  visibility = [ ":tablegen" ]
+  args = [ "-gen-subtarget" ]
+  td_file = "../X86.td"
+}
+
+group("tablegen") {
+  visibility = [
+    ":MCTargetDesc",
+    "../InstPrinter",
+    "../TargetInfo",
+  ]
+  public_deps = [
+    ":X86GenInstrInfo",
+    ":X86GenRegisterInfo",
+    ":X86GenSubtargetInfo",
+  ]
+}
+
+static_library("MCTargetDesc") {
+  output_name = "LLVMX86Desc"
+  public_deps = [
+    ":tablegen",
+  ]
+  deps = [
+    "//llvm/lib/MC",
+    "//llvm/lib/MC/MCDisassembler",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target/X86/InstPrinter",
+    "//llvm/lib/Target/X86/TargetInfo",
+  ]
+  include_dirs = [ ".." ]
+  sources = [
+    "X86AsmBackend.cpp",
+    "X86ELFObjectWriter.cpp",
+    "X86MCAsmInfo.cpp",
+    "X86MCCodeEmitter.cpp",
+    "X86MCTargetDesc.cpp",
+    "X86MachObjectWriter.cpp",
+    "X86WinCOFFObjectWriter.cpp",
+    "X86WinCOFFStreamer.cpp",
+    "X86WinCOFFTargetStreamer.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/X86/TargetInfo/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/X86/TargetInfo/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..6b0ed1d5448d055f6127c0156728f5c5c48b29f4
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/X86/TargetInfo/BUILD.gn
@@ -0,0 +1,14 @@
+static_library("TargetInfo") {
+  output_name = "LLVMX86Info"
+  deps = [
+    "//llvm/lib/Support",
+
+    # MCTargetDesc depends on TargetInfo, so we can't depend on the full
+    # MCTargetDesc target here: it would form a cycle.
+    "//llvm/lib/Target/X86/MCTargetDesc:tablegen",
+  ]
+  include_dirs = [ ".." ]
+  sources = [
+    "X86TargetInfo.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/X86/Utils/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/X86/Utils/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..6faf11c6a899a83900108389ef54ae5316974d62
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/X86/Utils/BUILD.gn
@@ -0,0 +1,9 @@
+static_library("Utils") {
+  output_name = "LLVMX86Utils"
+  deps = [
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "X86ShuffleDecode.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Target/targets.gni b/utils/gn/secondary/llvm/lib/Target/targets.gni
new file mode 100644
index 0000000000000000000000000000000000000000..76a3d57210cf96b8c17106a06829e27c5f90596e
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Target/targets.gni
@@ -0,0 +1,36 @@
+declare_args() {
+  # The target archs LLVM should support. Defaults to the host arch.
+  # Set to a list, e.g. `llvm_targets_to_build = [ "X86", "ARM" ]`,
+  # or to the string "all" to get all known targets.
+  llvm_targets_to_build = "host"
+}
+
+if (llvm_targets_to_build == "host") {
+  if (host_cpu == "x86" || host_cpu == "x64") {
+    llvm_targets_to_build = [ "X86" ]
+  } else {
+    assert(false, "add your host_cpu above")
+  }
+} else if (llvm_targets_to_build == "all") {
+  # FIXME: Port the remaining targets.
+  llvm_targets_to_build = [ "X86" ]
+}
+
+# Validate that llvm_targets_to_build is set to a list of valid targets,
+# and remember which targets are built.
+llvm_build_X86 = false
+foreach(target, llvm_targets_to_build) {
+  if (target == "X86") {
+    llvm_build_X86 = true
+  } else {
+    #FIXME : Port the remaining targets.
+    assert(false, "Unknown target '$target'.")
+  }
+}
+
+# FIXME: This should be based off target_cpu once cross compiles work.
+if (host_cpu == "x86" || host_cpu == "x64") {
+  native_target = "X86"
+} else {
+  assert(false, "Unsuppored host_cpu '$host_cpu'.")
+}
diff --git a/utils/gn/secondary/llvm/lib/ToolDrivers/llvm-dlltool/BUILD.gn b/utils/gn/secondary/llvm/lib/ToolDrivers/llvm-dlltool/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..b5d5a61c06aaa8380e369b38ce1366beafbe040e
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/ToolDrivers/llvm-dlltool/BUILD.gn
@@ -0,0 +1,19 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("Options") {
+  visibility = [ ":DlltoolDriver" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+static_library("DlltoolDriver") {
+  output_name = "LLVMDlltoolDriver"
+  deps = [
+    ":Options",
+    "//llvm/lib/Object",
+    "//llvm/lib/Option",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "DlltoolDriver.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/ToolDrivers/llvm-lib/BUILD.gn b/utils/gn/secondary/llvm/lib/ToolDrivers/llvm-lib/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..0e8187c502c40bad0fbc071441eb708b6e9fe8f1
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/ToolDrivers/llvm-lib/BUILD.gn
@@ -0,0 +1,20 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("Options") {
+  visibility = [ ":LibDriver" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+static_library("LibDriver") {
+  output_name = "LLVMLibDriver"
+  deps = [
+    ":Options",
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/Object",
+    "//llvm/lib/Option",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "LibDriver.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/AggressiveInstCombine/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/AggressiveInstCombine/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..7bc493350b1f6000eb6b1a0dd61a7fb51a011290
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/AggressiveInstCombine/BUILD.gn
@@ -0,0 +1,13 @@
+static_library("AggressiveInstCombine") {
+  output_name = "LLVMAggressiveInstCombine"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "AggressiveInstCombine.cpp",
+    "TruncInstCombine.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..bbc921c2cc823c45b32a3140c02ace0d0611c718
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn
@@ -0,0 +1,20 @@
+static_library("Coroutines") {
+  output_name = "LLVMCoroutines"
+  deps = [
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/IPO",
+    "//llvm/lib/Transforms/Scalar",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "CoroCleanup.cpp",
+    "CoroEarly.cpp",
+    "CoroElide.cpp",
+    "CoroFrame.cpp",
+    "CoroSplit.cpp",
+    "Coroutines.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..636936bf0ac23b77bcf5034aa0bee1141938ded0
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn
@@ -0,0 +1,58 @@
+static_library("IPO") {
+  output_name = "LLVMipo"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/Bitcode/Reader",
+    "//llvm/lib/Bitcode/Writer",
+    "//llvm/lib/IR",
+    "//llvm/lib/IRReader",
+    "//llvm/lib/Linker",
+    "//llvm/lib/Object",
+    "//llvm/lib/ProfileData",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/AggressiveInstCombine",
+    "//llvm/lib/Transforms/InstCombine",
+    "//llvm/lib/Transforms/Instrumentation",
+    "//llvm/lib/Transforms/Scalar",
+    "//llvm/lib/Transforms/Utils",
+    "//llvm/lib/Transforms/Vectorize",
+  ]
+  sources = [
+    "AlwaysInliner.cpp",
+    "ArgumentPromotion.cpp",
+    "BarrierNoopPass.cpp",
+    "BlockExtractor.cpp",
+    "CalledValuePropagation.cpp",
+    "ConstantMerge.cpp",
+    "CrossDSOCFI.cpp",
+    "DeadArgumentElimination.cpp",
+    "ElimAvailExtern.cpp",
+    "ExtractGV.cpp",
+    "ForceFunctionAttrs.cpp",
+    "FunctionAttrs.cpp",
+    "FunctionImport.cpp",
+    "GlobalDCE.cpp",
+    "GlobalOpt.cpp",
+    "GlobalSplit.cpp",
+    "HotColdSplitting.cpp",
+    "IPConstantPropagation.cpp",
+    "IPO.cpp",
+    "InferFunctionAttrs.cpp",
+    "InlineSimple.cpp",
+    "Inliner.cpp",
+    "Internalize.cpp",
+    "LoopExtractor.cpp",
+    "LowerTypeTests.cpp",
+    "MergeFunctions.cpp",
+    "PartialInlining.cpp",
+    "PassManagerBuilder.cpp",
+    "PruneEH.cpp",
+    "SCCP.cpp",
+    "SampleProfile.cpp",
+    "StripDeadPrototypes.cpp",
+    "StripSymbols.cpp",
+    "SyntheticCountsPropagation.cpp",
+    "ThinLTOBitcodeWriter.cpp",
+    "WholeProgramDevirt.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/InstCombine/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/InstCombine/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..0082528307b4939e83b93c5b2e2f6163f2231d41
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/InstCombine/BUILD.gn
@@ -0,0 +1,32 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("InstCombineTables") {
+  visibility = [ ":InstCombine" ]
+  args = [ "-gen-searchable-tables" ]
+}
+
+static_library("InstCombine") {
+  output_name = "LLVMInstCombine"
+  deps = [
+    ":InstCombineTables",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "InstCombineAddSub.cpp",
+    "InstCombineAndOrXor.cpp",
+    "InstCombineCalls.cpp",
+    "InstCombineCasts.cpp",
+    "InstCombineCompares.cpp",
+    "InstCombineLoadStoreAlloca.cpp",
+    "InstCombineMulDivRem.cpp",
+    "InstCombinePHI.cpp",
+    "InstCombineSelect.cpp",
+    "InstCombineShifts.cpp",
+    "InstCombineSimplifyDemanded.cpp",
+    "InstCombineVectorOps.cpp",
+    "InstructionCombining.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..0a748893c23a13cd1310cc8de79dfb7fdc173bf7
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
@@ -0,0 +1,29 @@
+static_library("Instrumentation") {
+  output_name = "LLVMInstrumentation"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/ProfileData",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "AddressSanitizer.cpp",
+    "BoundsChecking.cpp",
+    "CGProfile.cpp",
+    "ControlHeightReduction.cpp",
+    "DataFlowSanitizer.cpp",
+    "EfficiencySanitizer.cpp",
+    "GCOVProfiling.cpp",
+    "HWAddressSanitizer.cpp",
+    "IndirectCallPromotion.cpp",
+    "InstrProfiling.cpp",
+    "Instrumentation.cpp",
+    "MemorySanitizer.cpp",
+    "PGOInstrumentation.cpp",
+    "PGOMemOPSizeOpt.cpp",
+    "SanitizerCoverage.cpp",
+    "ThreadSanitizer.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/ObjCARC/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/ObjCARC/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..e7b20845489160787c2d3396e139454f54d0b94f
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/ObjCARC/BUILD.gn
@@ -0,0 +1,20 @@
+static_library("ObjCARC") {
+  output_name = "LLVMObjCARCOpts"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "DependencyAnalysis.cpp",
+    "ObjCARC.cpp",
+    "ObjCARCAPElim.cpp",
+    "ObjCARCContract.cpp",
+    "ObjCARCExpand.cpp",
+    "ObjCARCOpts.cpp",
+    "ProvenanceAnalysis.cpp",
+    "ProvenanceAnalysisEvaluator.cpp",
+    "PtrState.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..3d000fa846b1c3a00855413fdf00b4fe6d8d7768
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
@@ -0,0 +1,85 @@
+static_library("Scalar") {
+  output_name = "LLVMScalarOpts"
+  deps = [
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/AggressiveInstCombine",
+    "//llvm/lib/Transforms/InstCombine",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "ADCE.cpp",
+    "AlignmentFromAssumptions.cpp",
+    "BDCE.cpp",
+    "CallSiteSplitting.cpp",
+    "ConstantHoisting.cpp",
+    "ConstantProp.cpp",
+    "CorrelatedValuePropagation.cpp",
+    "DCE.cpp",
+    "DeadStoreElimination.cpp",
+    "DivRemPairs.cpp",
+    "EarlyCSE.cpp",
+    "FlattenCFGPass.cpp",
+    "Float2Int.cpp",
+    "GVN.cpp",
+    "GVNHoist.cpp",
+    "GVNSink.cpp",
+    "GuardWidening.cpp",
+    "IVUsersPrinter.cpp",
+    "IndVarSimplify.cpp",
+    "InductiveRangeCheckElimination.cpp",
+    "InferAddressSpaces.cpp",
+    "InstSimplifyPass.cpp",
+    "JumpThreading.cpp",
+    "LICM.cpp",
+    "LoopAccessAnalysisPrinter.cpp",
+    "LoopDataPrefetch.cpp",
+    "LoopDeletion.cpp",
+    "LoopDistribute.cpp",
+    "LoopIdiomRecognize.cpp",
+    "LoopInstSimplify.cpp",
+    "LoopInterchange.cpp",
+    "LoopLoadElimination.cpp",
+    "LoopPassManager.cpp",
+    "LoopPredication.cpp",
+    "LoopRerollPass.cpp",
+    "LoopRotation.cpp",
+    "LoopSimplifyCFG.cpp",
+    "LoopSink.cpp",
+    "LoopStrengthReduce.cpp",
+    "LoopUnrollAndJamPass.cpp",
+    "LoopUnrollPass.cpp",
+    "LoopUnswitch.cpp",
+    "LoopVersioningLICM.cpp",
+    "LowerAtomic.cpp",
+    "LowerExpectIntrinsic.cpp",
+    "LowerGuardIntrinsic.cpp",
+    "MakeGuardsExplicit.cpp",
+    "MemCpyOptimizer.cpp",
+    "MergeICmps.cpp",
+    "MergedLoadStoreMotion.cpp",
+    "NaryReassociate.cpp",
+    "NewGVN.cpp",
+    "PartiallyInlineLibCalls.cpp",
+    "PlaceSafepoints.cpp",
+    "Reassociate.cpp",
+    "Reg2Mem.cpp",
+    "RewriteStatepointsForGC.cpp",
+    "SCCP.cpp",
+    "SROA.cpp",
+    "Scalar.cpp",
+    "Scalarizer.cpp",
+    "SeparateConstOffsetFromGEP.cpp",
+    "SimpleLoopUnswitch.cpp",
+    "SimplifyCFGPass.cpp",
+    "Sink.cpp",
+    "SpeculateAroundPHIs.cpp",
+    "SpeculativeExecution.cpp",
+    "StraightLineStrengthReduce.cpp",
+    "StructurizeCFG.cpp",
+    "TailRecursionElimination.cpp",
+    "WarnMissedTransforms.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..843dd78e8d7f1b9ed8270313f3afbe96d790130b
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
@@ -0,0 +1,68 @@
+static_library("Utils") {
+  output_name = "LLVMTransformUtils"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "ASanStackFrameLayout.cpp",
+    "AddDiscriminators.cpp",
+    "BasicBlockUtils.cpp",
+    "BreakCriticalEdges.cpp",
+    "BuildLibCalls.cpp",
+    "BypassSlowDivision.cpp",
+    "CallPromotionUtils.cpp",
+    "CloneFunction.cpp",
+    "CloneModule.cpp",
+    "CodeExtractor.cpp",
+    "CtorUtils.cpp",
+    "DemoteRegToStack.cpp",
+    "EntryExitInstrumenter.cpp",
+    "EscapeEnumerator.cpp",
+    "Evaluator.cpp",
+    "FlattenCFG.cpp",
+    "FunctionComparator.cpp",
+    "FunctionImportUtils.cpp",
+    "GlobalStatus.cpp",
+    "GuardUtils.cpp",
+    "ImportedFunctionsInliningStatistics.cpp",
+    "InlineFunction.cpp",
+    "InstructionNamer.cpp",
+    "IntegerDivision.cpp",
+    "LCSSA.cpp",
+    "LibCallsShrinkWrap.cpp",
+    "Local.cpp",
+    "LoopRotationUtils.cpp",
+    "LoopSimplify.cpp",
+    "LoopUnroll.cpp",
+    "LoopUnrollAndJam.cpp",
+    "LoopUnrollPeel.cpp",
+    "LoopUnrollRuntime.cpp",
+    "LoopUtils.cpp",
+    "LoopVersioning.cpp",
+    "LowerInvoke.cpp",
+    "LowerMemIntrinsics.cpp",
+    "LowerSwitch.cpp",
+    "Mem2Reg.cpp",
+    "MetaRenamer.cpp",
+    "ModuleUtils.cpp",
+    "NameAnonGlobals.cpp",
+    "PredicateInfo.cpp",
+    "PromoteMemoryToRegister.cpp",
+    "SSAUpdater.cpp",
+    "SSAUpdaterBulk.cpp",
+    "SanitizerStats.cpp",
+    "SimplifyCFG.cpp",
+    "SimplifyIndVar.cpp",
+    "SimplifyLibCalls.cpp",
+    "SplitModule.cpp",
+    "StripGCRelocates.cpp",
+    "StripNonLineTableDebugInfo.cpp",
+    "SymbolRewriter.cpp",
+    "UnifyFunctionExitNodes.cpp",
+    "Utils.cpp",
+    "VNCoercion.cpp",
+    "ValueMapper.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..24d52bac1cf8bb1524b36c1f08c2d48020cd9bef
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
@@ -0,0 +1,21 @@
+static_library("Vectorize") {
+  output_name = "LLVMVectorize"
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "LoadStoreVectorizer.cpp",
+    "LoopVectorizationLegality.cpp",
+    "LoopVectorize.cpp",
+    "SLPVectorizer.cpp",
+    "VPlan.cpp",
+    "VPlanHCFGBuilder.cpp",
+    "VPlanHCFGTransforms.cpp",
+    "VPlanSLP.cpp",
+    "VPlanVerifier.cpp",
+    "Vectorize.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/lib/WindowsManifest/BUILD.gn b/utils/gn/secondary/llvm/lib/WindowsManifest/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..d4c23c97397db0c9a8eb2cd3ea795ed19478fdaa
--- /dev/null
+++ b/utils/gn/secondary/llvm/lib/WindowsManifest/BUILD.gn
@@ -0,0 +1,11 @@
+static_library("WindowsManifest") {
+  output_name = "LLVMWindowsManifest"
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/lib/Support",
+    "//llvm/utils/gn/build/libs/xml",
+  ]
+  sources = [
+    "WindowsManifestMerger.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/binutils_symlinks.gni b/utils/gn/secondary/llvm/tools/binutils_symlinks.gni
new file mode 100644
index 0000000000000000000000000000000000000000..3149775b1de4fed9916bf12839f17b84a43ecfda
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/binutils_symlinks.gni
@@ -0,0 +1,5 @@
+declare_args() {
+  # If set, creates symlinks for nm, objdump, readelf in the build
+  # directory.
+  llvm_install_binutils_symlinks = false
+}
diff --git a/utils/gn/secondary/llvm/tools/llc/BUILD.gn b/utils/gn/secondary/llvm/tools/llc/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..45cde4a59b285610f0b1f611291cface142a797f
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llc/BUILD.gn
@@ -0,0 +1,28 @@
+executable("llc") {
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/CodeGen/AsmPrinter",
+    "//llvm/lib/CodeGen/MIRParser",
+    "//llvm/lib/CodeGen/SelectionDAG",
+    "//llvm/lib/IR",
+    "//llvm/lib/IRReader",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+    "//llvm/lib/Target:TargetsToBuild",
+    "//llvm/lib/Transforms/Scalar",
+    "//llvm/lib/Transforms/Utils",
+    "//llvm/lib/Transforms/Vectorize",
+  ]
+  sources = [
+    "llc.cpp",
+  ]
+
+  # Support plugins.
+  # FIXME: Disable dead stripping once other binaries are dead-stripped.
+  if (host_os == "linux") {
+    # Corresponds to export_executable_symbols() in cmake.
+    ldflags = [ "-rdynamic" ]
+  }
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-ar/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-ar/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..969fbed688cc54e095f511b52c56ba33fb2a4b47
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-ar/BUILD.gn
@@ -0,0 +1,46 @@
+import("//llvm/tools/binutils_symlinks.gni")
+import("//llvm/utils/gn/build/symlink_or_copy.gni")
+
+symlinks = [
+  "llvm-dlltool",
+  "llvm-lib",
+  "llvm-ranlib",
+]
+if (llvm_install_binutils_symlinks) {
+  symlinks += [
+    "ar",
+    "dlltool",
+    "ranlib",
+  ]
+}
+foreach(target, symlinks) {
+  symlink_or_copy(target) {
+    deps = [
+      ":llvm-ar",
+    ]
+    source = "llvm-ar"
+    output = "$root_out_dir/bin/$target"
+  }
+}
+
+# //:llvm-ar depends on this symlink target, see comment in //BUILD.gn.
+group("symlinks") {
+  deps = []
+  foreach(target, symlinks) {
+    deps += [ ":$target" ]
+  }
+}
+
+executable("llvm-ar") {
+  deps = [
+    "//llvm/lib/IR",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target:TargetsToBuild",
+    "//llvm/lib/ToolDrivers/llvm-dlltool:DlltoolDriver",
+    "//llvm/lib/ToolDrivers/llvm-lib:LibDriver",
+  ]
+  sources = [
+    "llvm-ar.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-as/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-as/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..914cb9a5f4e429fb1c1254a5517d86a50da92b97
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-as/BUILD.gn
@@ -0,0 +1,11 @@
+executable("llvm-as") {
+  deps = [
+    "//llvm/lib/AsmParser",
+    "//llvm/lib/Bitcode/Writer",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "llvm-as.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-bcanalyzer/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-bcanalyzer/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..a5d7581720b08b505c658ef34476a652c9a1fbf4
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-bcanalyzer/BUILD.gn
@@ -0,0 +1,9 @@
+executable("llvm-bcanalyzer") {
+  deps = [
+    "//llvm/lib/Bitcode/Reader",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "llvm-bcanalyzer.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-dis/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-dis/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..c0c0c49c132a025ba72a05f0da1430b911af5082
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-dis/BUILD.gn
@@ -0,0 +1,10 @@
+executable("llvm-dis") {
+  deps = [
+    "//llvm/lib/Bitcode/Reader",
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "llvm-dis.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-dwarfdump/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-dwarfdump/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..b99086fd4dc20bc597eabf14fa3102a06b7b207d
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-dwarfdump/BUILD.gn
@@ -0,0 +1,14 @@
+executable("llvm-dwarfdump") {
+  deps = [
+    "//llvm/lib/DebugInfo/DWARF",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target:AllTargetsDescs",
+    "//llvm/lib/Target:AllTargetsInfos",
+  ]
+  sources = [
+    "Statistics.cpp",
+    "llvm-dwarfdump.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-mc/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-mc/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..ddaae808e7ab49c51a7860cc9a0f7f4b1133bd44
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-mc/BUILD.gn
@@ -0,0 +1,16 @@
+executable("llvm-mc") {
+  deps = [
+    "//llvm/lib/MC",
+    "//llvm/lib/MC/MCParser",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target:AllTargetsAsmParsers",
+    "//llvm/lib/Target:AllTargetsAsmPrinters",
+    "//llvm/lib/Target:AllTargetsDescs",
+    "//llvm/lib/Target:AllTargetsDisassemblers",
+    "//llvm/lib/Target:AllTargetsInfos",
+  ]
+  sources = [
+    "Disassembler.cpp",
+    "llvm-mc.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-nm/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-nm/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..b433ef602471a39183c7fa377c2f5f24528e6f18
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-nm/BUILD.gn
@@ -0,0 +1,37 @@
+import("//llvm/tools/binutils_symlinks.gni")
+import("//llvm/utils/gn/build/symlink_or_copy.gni")
+
+if (llvm_install_binutils_symlinks) {
+  symlink_or_copy("nm") {
+    deps = [
+      ":llvm-nm",
+    ]
+    source = "llvm-nm"
+    output = "$root_out_dir/bin/nm"
+  }
+}
+
+# //:llvm-nm depends on this symlink target, see comment in //BUILD.gn.
+group("symlinks") {
+  if (llvm_install_binutils_symlinks) {
+    deps = [
+      ":nm",
+    ]
+  }
+}
+
+executable("llvm-nm") {
+  deps = [
+    "//llvm/lib/Bitcode/Reader",
+    "//llvm/lib/Demangle",
+    "//llvm/lib/IR",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target:AllTargetsAsmParsers",
+    "//llvm/lib/Target:AllTargetsDescs",
+    "//llvm/lib/Target:AllTargetsInfos",
+  ]
+  sources = [
+    "llvm-nm.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..6dd48d6ef8b055bab6415347e7e5f97c5b5551d8
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn
@@ -0,0 +1,57 @@
+import("//llvm/tools/binutils_symlinks.gni")
+import("//llvm/utils/TableGen/tablegen.gni")
+import("//llvm/utils/gn/build/symlink_or_copy.gni")
+
+tablegen("ObjcopyOpts") {
+  visibility = [ ":llvm-objcopy" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+tablegen("StripOpts") {
+  visibility = [ ":llvm-objcopy" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
+symlinks = [ "llvm-strip" ]
+if (llvm_install_binutils_symlinks) {
+  symlinks += [
+    "objcopy",
+    "strip",
+  ]
+}
+foreach(target, symlinks) {
+  symlink_or_copy(target) {
+    deps = [
+      ":llvm-objcopy",
+    ]
+    source = "llvm-objcopy"
+    output = "$root_out_dir/bin/$target"
+  }
+}
+
+# //:llvm-objcopy depends on this symlink target, see comment in //BUILD.gn.
+group("symlinks") {
+  deps = []
+  foreach(target, symlinks) {
+    deps += [ ":$target" ]
+  }
+}
+
+executable("llvm-objcopy") {
+  deps = [
+    ":ObjcopyOpts",
+    ":StripOpts",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/Option",
+    "//llvm/lib/Support",
+  ]
+  include_dirs = [ "." ]
+  sources = [
+    "Buffer.cpp",
+    "CopyConfig.cpp",
+    "ELF/ELFObjcopy.cpp",
+    "ELF/Object.cpp",
+    "llvm-objcopy.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-objdump/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-objdump/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..e8ee2e760c7925b5375300753d6f81cadcbbbbfd
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-objdump/BUILD.gn
@@ -0,0 +1,48 @@
+import("//llvm/tools/binutils_symlinks.gni")
+import("//llvm/utils/gn/build/symlink_or_copy.gni")
+
+if (llvm_install_binutils_symlinks) {
+  symlink_or_copy("objdump") {
+    deps = [
+      ":llvm-objdump",
+    ]
+    source = "llvm-objdump"
+    output = "$root_out_dir/bin/objdump"
+  }
+}
+
+# //:llvm-nm depends on this symlink target, see comment in //BUILD.gn.
+group("symlinks") {
+  if (llvm_install_binutils_symlinks) {
+    deps = [
+      ":objdump",
+    ]
+  }
+}
+
+executable("llvm-objdump") {
+  deps = [
+    "//llvm/include/llvm/Config:config",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/DebugInfo/DWARF",
+    "//llvm/lib/DebugInfo/PDB",
+    "//llvm/lib/DebugInfo/Symbolize",
+    "//llvm/lib/Demangle",
+    "//llvm/lib/MC",
+    "//llvm/lib/MC/MCDisassembler",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target:AllTargetsAsmPrinters",
+    "//llvm/lib/Target:AllTargetsDescs",
+    "//llvm/lib/Target:AllTargetsDisassemblers",
+    "//llvm/lib/Target:AllTargetsInfos",
+    "//llvm/utils/gn/build/libs/xar",
+  ]
+  sources = [
+    "COFFDump.cpp",
+    "ELFDump.cpp",
+    "MachODump.cpp",
+    "WasmDump.cpp",
+    "llvm-objdump.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-pdbutil/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-pdbutil/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..b72bd00e15617008164469da2fc42f326921fa43
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-pdbutil/BUILD.gn
@@ -0,0 +1,35 @@
+executable("llvm-pdbutil") {
+  deps = [
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/DebugInfo/MSF",
+    "//llvm/lib/DebugInfo/PDB",
+    "//llvm/lib/Object",
+    "//llvm/lib/ObjectYAML",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "BytesOutputStyle.cpp",
+    "DumpOutputStyle.cpp",
+    "ExplainOutputStyle.cpp",
+    "FormatUtil.cpp",
+    "InputFile.cpp",
+    "LinePrinter.cpp",
+    "MinimalSymbolDumper.cpp",
+    "MinimalTypeDumper.cpp",
+    "PdbYaml.cpp",
+    "PrettyBuiltinDumper.cpp",
+    "PrettyClassDefinitionDumper.cpp",
+    "PrettyClassLayoutGraphicalDumper.cpp",
+    "PrettyCompilandDumper.cpp",
+    "PrettyEnumDumper.cpp",
+    "PrettyExternalSymbolDumper.cpp",
+    "PrettyFunctionDumper.cpp",
+    "PrettyTypeDumper.cpp",
+    "PrettyTypedefDumper.cpp",
+    "PrettyVariableDumper.cpp",
+    "StreamUtil.cpp",
+    "YAMLOutputStyle.cpp",
+    "llvm-pdbutil.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-readobj/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-readobj/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..501d809d8096f869446719b2c0766f615650dfc9
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-readobj/BUILD.gn
@@ -0,0 +1,49 @@
+import("//llvm/tools/binutils_symlinks.gni")
+import("//llvm/utils/gn/build/symlink_or_copy.gni")
+
+symlinks = [ "llvm-readelf" ]
+if (llvm_install_binutils_symlinks) {
+  symlinks += [ "readelf" ]
+}
+foreach(target, symlinks) {
+  symlink_or_copy(target) {
+    deps = [
+      ":llvm-readobj",
+    ]
+    source = "llvm-readobj"
+    output = "$root_out_dir/bin/$target"
+  }
+}
+
+# //:llvm-readobj depends on this symlink target, see comment in //BUILD.gn.
+group("symlinks") {
+  deps = []
+  foreach(target, symlinks) {
+    deps += [ ":$target" ]
+  }
+}
+
+executable("llvm-readobj") {
+  deps = [
+    "//llvm/lib/BinaryFormat",
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/DebugInfo/DWARF",
+    "//llvm/lib/DebugInfo/MSF",
+    "//llvm/lib/DebugInfo/PDB",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "ARMWinEHPrinter.cpp",
+    "COFFDumper.cpp",
+    "COFFImportDumper.cpp",
+    "ELFDumper.cpp",
+    "Error.cpp",
+    "MachODumper.cpp",
+    "ObjDumper.cpp",
+    "WasmDumper.cpp",
+    "Win64EHDumper.cpp",
+    "WindowsResourceDumper.cpp",
+    "llvm-readobj.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/llvm-undname/BUILD.gn b/utils/gn/secondary/llvm/tools/llvm-undname/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..56f1ed11bc492d13509daa0268bb861423ff7eb0
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/llvm-undname/BUILD.gn
@@ -0,0 +1,9 @@
+executable("llvm-undname") {
+  deps = [
+    "//llvm/lib/Demangle",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "llvm-undname.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/obj2yaml/BUILD.gn b/utils/gn/secondary/llvm/tools/obj2yaml/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..6ba2022e66ba8fd09568bb0fc598ecce74bdfec7
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/obj2yaml/BUILD.gn
@@ -0,0 +1,18 @@
+executable("obj2yaml") {
+  deps = [
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/DebugInfo/DWARF",
+    "//llvm/lib/Object",
+    "//llvm/lib/ObjectYAML",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "Error.cpp",
+    "coff2yaml.cpp",
+    "dwarf2yaml.cpp",
+    "elf2yaml.cpp",
+    "macho2yaml.cpp",
+    "obj2yaml.cpp",
+    "wasm2yaml.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/tools/opt/BUILD.gn b/utils/gn/secondary/llvm/tools/opt/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..7deae1c79c315df9c8e3242b4c9bd1e1e0c551c5
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/opt/BUILD.gn
@@ -0,0 +1,39 @@
+executable("opt") {
+  deps = [
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/lib/Analysis",
+    "//llvm/lib/Bitcode/Writer",
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/IR",
+    "//llvm/lib/MC",
+    "//llvm/lib/Passes",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target",
+    "//llvm/lib/Target:TargetsToBuild",
+    "//llvm/lib/Transforms/AggressiveInstCombine",
+    "//llvm/lib/Transforms/Coroutines",
+    "//llvm/lib/Transforms/IPO",
+    "//llvm/lib/Transforms/Instrumentation",
+    "//llvm/lib/Transforms/ObjCARC",
+    "//llvm/lib/Transforms/Scalar",
+    "//llvm/lib/Transforms/Utils",
+    "//llvm/lib/Transforms/Vectorize",
+  ]
+  sources = [
+    "AnalysisWrappers.cpp",
+    "BreakpointPrinter.cpp",
+    "Debugify.cpp",
+    "GraphPrinters.cpp",
+    "NewPMDriver.cpp",
+    "PassPrinters.cpp",
+    "PrintSCC.cpp",
+    "opt.cpp",
+  ]
+
+  # Support plugins.
+  # FIXME: Disable dead stripping once other binaries are dead-stripped.
+  if (host_os == "linux") {
+    # Corresponds to export_executable_symbols() in cmake.
+    ldflags = [ "-rdynamic" ]
+  }
+}
diff --git a/utils/gn/secondary/llvm/tools/yaml2obj/BUILD.gn b/utils/gn/secondary/llvm/tools/yaml2obj/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..56bebc43cd323efa1d0de5cce2eb95a7b4738d7b
--- /dev/null
+++ b/utils/gn/secondary/llvm/tools/yaml2obj/BUILD.gn
@@ -0,0 +1,16 @@
+executable("yaml2obj") {
+  deps = [
+    "//llvm/lib/DebugInfo/CodeView",
+    "//llvm/lib/MC",
+    "//llvm/lib/Object",
+    "//llvm/lib/ObjectYAML",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "yaml2coff.cpp",
+    "yaml2elf.cpp",
+    "yaml2macho.cpp",
+    "yaml2obj.cpp",
+    "yaml2wasm.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/triples.gni b/utils/gn/secondary/llvm/triples.gni
new file mode 100644
index 0000000000000000000000000000000000000000..b5ff76237218c3f16049f74618852adfa2bdb904
--- /dev/null
+++ b/utils/gn/secondary/llvm/triples.gni
@@ -0,0 +1,12 @@
+if (host_os == "linux") {
+  llvm_host_triple = "x86_64-unknown-linux-gnu"
+} else if (host_os == "mac") {
+  llvm_host_triple = "x86_64-apple-darwin"
+} else if (host_os == "win") {
+  llvm_host_triple = "x86_64-pc-windows"
+}
+
+declare_args() {
+  # The default target triple.
+  llvm_target_triple = llvm_host_triple
+}
diff --git a/utils/gn/secondary/llvm/utils/FileCheck/BUILD.gn b/utils/gn/secondary/llvm/utils/FileCheck/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..65a2b73e8dbe4d6438578f37089468ea98c3da39
--- /dev/null
+++ b/utils/gn/secondary/llvm/utils/FileCheck/BUILD.gn
@@ -0,0 +1,8 @@
+executable("FileCheck") {
+  deps = [
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "FileCheck.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..01219543d2db726960267d1d9a04fb7dc3a36ac5
--- /dev/null
+++ b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
@@ -0,0 +1,57 @@
+executable("llvm-tblgen") {
+  deps = [
+    "//llvm/include/llvm/Config:llvm-config",
+    "//llvm/lib/MC",
+    "//llvm/lib/Support",
+    "//llvm/lib/TableGen",
+  ]
+  sources = [
+    "AsmMatcherEmitter.cpp",
+    "AsmWriterEmitter.cpp",
+    "AsmWriterInst.cpp",
+    "Attributes.cpp",
+    "CTagsEmitter.cpp",
+    "CallingConvEmitter.cpp",
+    "CodeEmitterGen.cpp",
+    "CodeGenDAGPatterns.cpp",
+    "CodeGenHwModes.cpp",
+    "CodeGenInstruction.cpp",
+    "CodeGenMapTable.cpp",
+    "CodeGenRegisters.cpp",
+    "CodeGenSchedule.cpp",
+    "CodeGenTarget.cpp",
+    "DAGISelEmitter.cpp",
+    "DAGISelMatcher.cpp",
+    "DAGISelMatcherEmitter.cpp",
+    "DAGISelMatcherGen.cpp",
+    "DAGISelMatcherOpt.cpp",
+    "DFAPacketizerEmitter.cpp",
+    "DisassemblerEmitter.cpp",
+    "ExegesisEmitter.cpp",
+    "FastISelEmitter.cpp",
+    "FixedLenDecoderEmitter.cpp",
+    "GlobalISelEmitter.cpp",
+    "InfoByHwMode.cpp",
+    "InstrDocsEmitter.cpp",
+    "InstrInfoEmitter.cpp",
+    "IntrinsicEmitter.cpp",
+    "OptParserEmitter.cpp",
+    "PredicateExpander.cpp",
+    "PseudoLoweringEmitter.cpp",
+    "RISCVCompressInstEmitter.cpp",
+    "RegisterBankEmitter.cpp",
+    "RegisterInfoEmitter.cpp",
+    "SDNodeProperties.cpp",
+    "SearchableTableEmitter.cpp",
+    "SubtargetEmitter.cpp",
+    "SubtargetFeatureInfo.cpp",
+    "TableGen.cpp",
+    "Types.cpp",
+    "WebAssemblyDisassemblerEmitter.cpp",
+    "X86DisassemblerTables.cpp",
+    "X86EVEX2VEXTablesEmitter.cpp",
+    "X86FoldTablesEmitter.cpp",
+    "X86ModRMFilters.cpp",
+    "X86RecognizableInstr.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni b/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni
new file mode 100644
index 0000000000000000000000000000000000000000..2a5520e9f1bea1c2f4ad111353b866a4363a9d97
--- /dev/null
+++ b/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni
@@ -0,0 +1,91 @@
+# This file defines a template for running llvm-tblgen.
+#
+# Parameters:
+#
+#   args (required)
+#       [list of strings] Flags to pass to llvm-tblgen.
+#
+#   output_name (optional)
+#       Basename of the generated output file.
+#       Defaults to target name with ".inc" appended.
+#
+#   td_file (roptional)
+#       The .td file to pass to llvm-tblgen.
+#       Defaults to target name with ".td" appended.
+#
+#   visibility (optional)
+#       GN's regular visibility attribute, see `gn help visibility`.
+#
+# Example use:
+#
+#   tablegen("attributes_compat_func_gen") {
+#     visibility = [ ":IR" ]
+#     args = [ "-gen-attrs" ]
+#     td_file = "AttributesCompatFunc.td"
+#   }
+
+template("tablegen") {
+  assert(defined(invoker.args), "args must be defined for $target_name")
+
+  config_name = "${target_name}_config"
+  config(config_name) {
+    visibility = [ ":$target_name" ]
+    include_dirs = [ target_gen_dir ]
+  }
+
+  action(target_name) {
+    forward_variables_from(invoker, [ "visibility" ])
+
+    # FIXME: In cross builds, this should depend on the host binary.
+    tblgen_target = "//llvm/utils/TableGen:llvm-tblgen"
+    tblgen_executable = get_label_info(tblgen_target, "root_out_dir") +
+                        "/bin/" + get_label_info(tblgen_target, "name")
+    deps = [
+      tblgen_target,
+    ]
+    if (defined(invoker.td_file)) {
+      td_file = invoker.td_file
+    } else {
+      td_file = "$target_name.td"
+    }
+    sources = [
+      td_file,
+    ]
+    script = "//llvm/utils/gn/build/run_tablegen.py"
+    if (defined(invoker.output_name)) {
+      gen_output = "$target_gen_dir/" + invoker.output_name
+    } else {
+      gen_output = "$target_gen_dir/$target_name.inc"
+    }
+    depfile = "$gen_output.d"
+    td_file = rebase_path(td_file, root_build_dir)
+
+    # FIXME: The cmake build lets tablegen write to a temp file and then copies
+    # it over the final output only if it has changed, for ninja's restat
+    # optimization. Instead of doing that in cmake, llvm-tblgen should do this
+    # itself. r330742 tried this, but it caused problems. Fix those and reland,
+    # so that the gn build has the optimization too.
+    args = [
+             rebase_path(tblgen_executable, root_build_dir),
+             "-I",
+             rebase_path("//llvm/include", root_build_dir),
+
+             # FIXME: Rather than duplicating this -I flag in both the CMake and
+             # the gn build, let tablegen add input dir to include search path
+             # instead?
+             "-I",
+             get_path_info(td_file, "dir"),
+             "-d",
+             rebase_path(depfile, root_build_dir),
+             "-o",
+             rebase_path(gen_output, root_build_dir),
+             td_file,
+           ] + invoker.args
+    outputs = [
+      gen_output,
+    ]
+
+    # Let targets depending on this find the generated file.
+    public_configs = [ ":$config_name" ]
+  }
+}
diff --git a/utils/gn/secondary/llvm/utils/count/BUILD.gn b/utils/gn/secondary/llvm/utils/count/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..efbc2c995b462c10077449d251213beeec8673bf
--- /dev/null
+++ b/utils/gn/secondary/llvm/utils/count/BUILD.gn
@@ -0,0 +1,5 @@
+executable("count") {
+  sources = [
+    "count.c",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/utils/not/BUILD.gn b/utils/gn/secondary/llvm/utils/not/BUILD.gn
new file mode 100644
index 0000000000000000000000000000000000000000..ea450e1b3c1c7f6af4e6194a01a48ec01340d67a
--- /dev/null
+++ b/utils/gn/secondary/llvm/utils/not/BUILD.gn
@@ -0,0 +1,8 @@
+executable("not") {
+  deps = [
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "not.cpp",
+  ]
+}
diff --git a/utils/gn/secondary/llvm/version.gni b/utils/gn/secondary/llvm/version.gni
new file mode 100644
index 0000000000000000000000000000000000000000..b64e3f9bdec08812d2425a19ade42ddc277b6ddd
--- /dev/null
+++ b/utils/gn/secondary/llvm/version.gni
@@ -0,0 +1,4 @@
+llvm_version_major = 8
+llvm_version_minor = 0
+llvm_version_patch = 0
+llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch"
diff --git a/utils/kate/llvm.xml b/utils/kate/llvm.xml
index 5ba46ee46aca35c8402effd2f3ca2d701220a96c..bbb97b5222e0e3618506136ede1054de10d20db7 100644
--- a/utils/kate/llvm.xml
+++ b/utils/kate/llvm.xml
@@ -120,6 +120,7 @@
       <item> fadd </item>
       <item> sub </item>
       <item> fsub </item>
+      <item> fneg </item>
       <item> mul </item>
       <item> fmul </item>
       <item> udiv </item>
diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py
index d5adb535775d161db5352807c2386cc90f35fbf7..8060688116be7dc35a4fc8b67f2d1147b552e22d 100644
--- a/utils/lit/lit/TestingConfig.py
+++ b/utils/lit/lit/TestingConfig.py
@@ -26,7 +26,16 @@ class TestingConfig:
                      'LSAN_OPTIONS', 'ADB', 'ANDROID_SERIAL',
                      'SANITIZER_IGNORE_CVE_2016_2143', 'TMPDIR', 'TMP', 'TEMP',
                      'TEMPDIR', 'AVRLIT_BOARD', 'AVRLIT_PORT',
-                     'FILECHECK_DUMP_INPUT_ON_FAILURE', 'FILECHECK_OPTS']
+                     'FILECHECK_DUMP_INPUT_ON_FAILURE', 'FILECHECK_OPTS',
+                     'VCINSTALLDIR', 'VCToolsinstallDir', 'VSINSTALLDIR',
+                     'WindowsSdkDir', 'WindowsSDKLibVersion']
+
+        if sys.platform == 'win32':
+            pass_vars.append('INCLUDE')
+            pass_vars.append('LIB')
+            pass_vars.append('PATHEXT')
+            environment['PYTHONBUFFERED'] = '1'
+
         for var in pass_vars:
             val = os.environ.get(var, '')
             # Check for empty string as some variables such as LD_PRELOAD cannot be empty
@@ -34,15 +43,6 @@ class TestingConfig:
             if val:
                 environment[var] = val
 
-        if sys.platform == 'win32':
-            environment.update({
-                    'INCLUDE' : os.environ.get('INCLUDE',''),
-                    'PATHEXT' : os.environ.get('PATHEXT',''),
-                    'PYTHONUNBUFFERED' : '1',
-                    'TEMP' : os.environ.get('TEMP',''),
-                    'TMP' : os.environ.get('TMP',''),
-                    })
-
         # Set the default available features based on the LitConfig.
         available_features = []
         if litConfig.useValgrind:
@@ -100,7 +100,6 @@ class TestingConfig:
             litConfig.fatal(
                 'unable to parse config file %r, traceback: %s' % (
                     path, traceback.format_exc()))
-
         self.finish(litConfig)
 
     def __init__(self, parent, name, suffixes, test_format,
diff --git a/utils/lit/lit/llvm/config.py b/utils/lit/lit/llvm/config.py
index 6bb7135f65907ba7265fb43a548f3f2950241baf..c7846bbe4e9c7ba2f971b4e222ef907d7f55b1c4 100644
--- a/utils/lit/lit/llvm/config.py
+++ b/utils/lit/lit/llvm/config.py
@@ -33,6 +33,8 @@ class LLVMConfig(object):
                                                 ['cmp.exe', 'grep.exe', 'sed.exe'])
             if path is not None:
                 self.with_environment('PATH', path, append_path=True)
+            # Many tools behave strangely if these environment variables aren't set.
+            self.with_system_environment(['SystemDrive', 'SystemRoot', 'TEMP', 'TMP'])
             self.use_lit_shell = True
 
         # Choose between lit's internal shell pipeline runner and a real shell.  If
@@ -57,6 +59,8 @@ class LLVMConfig(object):
             features.add('system-linux')
         elif platform.system() in ['FreeBSD']:
             config.available_features.add('system-freebsd')
+        elif platform.system() == "NetBSD":
+            features.add('system-netbsd')
 
         # Native compilation: host arch == default triple arch
         # Both of these values should probably be in every site config (e.g. as
@@ -333,7 +337,7 @@ class LLVMConfig(object):
                 self.lit_config.note('using {}: {}'.format(name, tool))
         return tool
 
-    def use_clang(self, required=True):
+    def use_clang(self, additional_tool_dirs=[], additional_flags=[], required=True):
         """Configure the test suite to be able to invoke clang.
 
         Sets up some environment variables important to clang, locates a
@@ -370,12 +374,16 @@ class LLVMConfig(object):
 
         # Tweak the PATH to include the tools dir and the scripts dir.
         # Put Clang first to avoid LLVM from overriding out-of-tree clang builds.
-        possible_paths = ['clang_tools_dir', 'llvm_tools_dir']
-        paths = [getattr(self.config, pp) for pp in possible_paths
+        exe_dir_props = [self.config.name.lower() + '_tools_dir', 'clang_tools_dir', 'llvm_tools_dir']
+        paths = [getattr(self.config, pp) for pp in exe_dir_props
                  if getattr(self.config, pp, None)]
+        paths = additional_tool_dirs + paths
         self.with_environment('PATH', paths, append_path=True)
 
-        paths = [self.config.llvm_shlib_dir, self.config.llvm_libs_dir]
+        lib_dir_props = [self.config.name.lower() + '_libs_dir', 'clang_libs_dir', 'llvm_shlib_dir', 'llvm_libs_dir']
+        paths = [getattr(self.config, pp) for pp in lib_dir_props
+                 if getattr(self.config, pp, None)]
+
         self.with_environment('LD_LIBRARY_PATH', paths, append_path=True)
 
         # Discover the 'clang' and 'clangcc' to use.
@@ -383,19 +391,21 @@ class LLVMConfig(object):
         self.config.clang = self.use_llvm_tool(
             'clang', search_env='CLANG', required=required)
 
-        self.config.substitutions.append(
-            ('%llvmshlibdir', self.config.llvm_shlib_dir))
-        self.config.substitutions.append(
-            ('%pluginext', self.config.llvm_plugin_ext))
+        shl = getattr(self.config, 'llvm_shlib_dir', None)
+        pext = getattr(self.config, 'llvm_plugin_ext', None)
+        if shl:
+            self.config.substitutions.append(('%llvmshlibdir', shl))
+        if pext:
+            self.config.substitutions.append(('%pluginext', pext))
 
         builtin_include_dir = self.get_clang_builtin_include_dir(self.config.clang)
         tool_substitutions = [
-            ToolSubst('%clang', command=self.config.clang),
-            ToolSubst('%clang_analyze_cc1', command='%clang_cc1', extra_args=['-analyze', '%analyze']),
-            ToolSubst('%clang_cc1', command=self.config.clang, extra_args=['-cc1', '-internal-isystem', builtin_include_dir, '-nostdsysteminc']),
-            ToolSubst('%clang_cpp', command=self.config.clang, extra_args=['--driver-mode=cpp']),
-            ToolSubst('%clang_cl', command=self.config.clang, extra_args=['--driver-mode=cl']),
-            ToolSubst('%clangxx', command=self.config.clang, extra_args=['--driver-mode=g++']),
+            ToolSubst('%clang', command=self.config.clang, extra_args=additional_flags),
+            ToolSubst('%clang_analyze_cc1', command='%clang_cc1', extra_args=['-analyze', '%analyze']+additional_flags),
+            ToolSubst('%clang_cc1', command=self.config.clang, extra_args=['-cc1', '-internal-isystem', builtin_include_dir, '-nostdsysteminc']+additional_flags),
+            ToolSubst('%clang_cpp', command=self.config.clang, extra_args=['--driver-mode=cpp']+additional_flags),
+            ToolSubst('%clang_cl', command=self.config.clang, extra_args=['--driver-mode=cl']+additional_flags),
+            ToolSubst('%clangxx', command=self.config.clang, extra_args=['--driver-mode=g++']+additional_flags),
             ]
         self.add_tool_substitutions(tool_substitutions)
 
@@ -415,34 +425,34 @@ class LLVMConfig(object):
             self.config.substitutions.append(
                 ('%target_itanium_abi_host_triple', ''))
 
-        self.config.substitutions.append(
-            ('%src_include_dir', self.config.clang_src_dir + '/include'))
-
         # FIXME: Find nicer way to prohibit this.
         self.config.substitutions.append(
-            (' clang ', """*** Do not use 'clang' in tests, use '%clang'. ***"""))
+            (' clang ', """\"*** Do not use 'clang' in tests, use '%clang'. ***\""""))
         self.config.substitutions.append(
-            (' clang\+\+ ', """*** Do not use 'clang++' in tests, use '%clangxx'. ***"""))
+            (' clang\+\+ ', """\"*** Do not use 'clang++' in tests, use '%clangxx'. ***\""""))
         self.config.substitutions.append(
             (' clang-cc ',
-             """*** Do not use 'clang-cc' in tests, use '%clang_cc1'. ***"""))
+             """\"*** Do not use 'clang-cc' in tests, use '%clang_cc1'. ***\""""))
+        self.config.substitutions.append(
+            (' clang-cl ',
+             """\"*** Do not use 'clang-cl' in tests, use '%clang_cl'. ***\""""))
         self.config.substitutions.append(
             (' clang -cc1 -analyze ',
-             """*** Do not use 'clang -cc1 -analyze' in tests, use '%clang_analyze_cc1'. ***"""))
+             """\"*** Do not use 'clang -cc1 -analyze' in tests, use '%clang_analyze_cc1'. ***\""""))
         self.config.substitutions.append(
             (' clang -cc1 ',
-             """*** Do not use 'clang -cc1' in tests, use '%clang_cc1'. ***"""))
+             """\"*** Do not use 'clang -cc1' in tests, use '%clang_cc1'. ***\""""))
         self.config.substitutions.append(
             (' %clang-cc1 ',
-             """*** invalid substitution, use '%clang_cc1'. ***"""))
+             """\"*** invalid substitution, use '%clang_cc1'. ***\""""))
         self.config.substitutions.append(
             (' %clang-cpp ',
-             """*** invalid substitution, use '%clang_cpp'. ***"""))
+             """\"*** invalid substitution, use '%clang_cpp'. ***\""""))
         self.config.substitutions.append(
             (' %clang-cl ',
-             """*** invalid substitution, use '%clang_cl'. ***"""))
+             """\"*** invalid substitution, use '%clang_cl'. ***\""""))
 
-    def use_lld(self, required=True):
+    def use_lld(self, additional_tool_dirs=[], required=True):
         """Configure the test suite to be able to invoke lld.
 
         Sets up some environment variables important to lld, locates a
@@ -450,20 +460,36 @@ class LLVMConfig(object):
         substitutions useful to any test suite that makes use of lld.
 
         """
-        # Tweak the PATH to include the tools dir
-        tool_dirs = [self.config.llvm_tools_dir]
-        lib_dirs = [self.config.llvm_libs_dir]
-        lld_tools_dir = getattr(self.config, 'lld_tools_dir', None)
-        lld_libs_dir = getattr(self.config, 'lld_libs_dir', None)
 
-        if lld_tools_dir:
-            tool_dirs = tool_dirs + [lld_tools_dir]
-        if lld_libs_dir:
-            lib_dirs = lib_dirs + [lld_libs_dir]
+        # Tweak the PATH to include the tools dir and the scripts dir.
+        exe_dir_props = [self.config.name.lower() + '_tools_dir', 'lld_tools_dir', 'llvm_tools_dir']
+        paths = [getattr(self.config, pp) for pp in exe_dir_props
+                 if getattr(self.config, pp, None)]
+        paths = additional_tool_dirs + paths
+        self.with_environment('PATH', paths, append_path=True)
 
-        self.with_environment('PATH', tool_dirs, append_path=True)
-        self.with_environment('LD_LIBRARY_PATH', lib_dirs, append_path=True)
+        lib_dir_props = [self.config.name.lower() + '_libs_dir', 'lld_libs_dir', 'llvm_libs_dir']
+        paths = [getattr(self.config, pp) for pp in lib_dir_props
+                 if getattr(self.config, pp, None)]
 
-        tool_patterns = ['lld', 'ld.lld', 'lld-link', 'ld64.lld', 'wasm-ld']
+        self.with_environment('LD_LIBRARY_PATH', paths, append_path=True)
 
-        self.add_tool_substitutions(tool_patterns, tool_dirs)
+        # Discover the 'clang' and 'clangcc' to use.
+
+        ld_lld = self.use_llvm_tool('ld.lld', required=required)
+        lld_link = self.use_llvm_tool('lld-link', required=required)
+        ld64_lld = self.use_llvm_tool('ld64.lld', required=required)
+        wasm_ld = self.use_llvm_tool('wasm-ld', required=required)
+
+        was_found = ld_lld and lld_link and ld64_lld and wasm_ld
+        tool_substitutions = []
+        if ld_lld:
+            tool_substitutions.append(ToolSubst('ld.lld', command=ld_lld))
+        if lld_link:
+            tool_substitutions.append(ToolSubst('lld-link', command=lld_link))
+        if ld64_lld:
+            tool_substitutions.append(ToolSubst('ld64.lld', command=ld64_lld))
+        if wasm_ld:
+            tool_substitutions.append(ToolSubst('wasm-ld', command=wasm_ld))
+        self.add_tool_substitutions(tool_substitutions)
+        return was_found
diff --git a/utils/lit/lit/llvm/subst.py b/utils/lit/lit/llvm/subst.py
index 3c8db1d31ff20347a0c2e05c0978dbc4a72eaf20..4275b8a56a3be4aad21a22fddfa914876fd89777 100644
--- a/utils/lit/lit/llvm/subst.py
+++ b/utils/lit/lit/llvm/subst.py
@@ -80,6 +80,7 @@ class ToolSubst(object):
         self.extra_args = extra_args
         self.key = key
         self.command = command if command is not None else FindTool(key)
+        self.was_resolved = False
         if verbatim:
             self.regex = key
             return
@@ -141,5 +142,6 @@ class ToolSubst(object):
                 return None
             else:
                 raise 'Unexpected value for ToolSubst.unresolved'
-
+        if command_str:
+            self.was_resolved = True
         return (self.regex, tool_pipe, command_str)
diff --git a/utils/not/not.cpp b/utils/not/not.cpp
index f48079d8875a299277b38f61e314cf352d415b88..e35c884944ce645c9f0617de0671c5f3ceec1c3d 100644
--- a/utils/not/not.cpp
+++ b/utils/not/not.cpp
@@ -13,7 +13,9 @@
 //     Will return true if cmd crashes (e.g. for testing crash reporting).
 
 #include "llvm/Support/Program.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
+
 using namespace llvm;
 
 int main(int argc, const char **argv) {
@@ -33,8 +35,8 @@ int main(int argc, const char **argv) {
 
   auto Program = sys::findProgramByName(argv[0]);
   if (!Program) {
-    errs() << "Error: Unable to find `" << argv[0]
-           << "' in PATH: " << Program.getError().message() << "\n";
+    WithColor::error() << "unable to find `" << argv[0]
+                       << "' in PATH: " << Program.getError().message() << "\n";
     return 1;
   }
 
@@ -53,7 +55,7 @@ int main(int argc, const char **argv) {
     Result = -3;
 #endif
   if (Result < 0) {
-    errs() << "Error: " << ErrMsg << "\n";
+    WithColor::error() << ErrMsg << "\n";
     if (ExpectCrash)
       return 0;
     return 1;
diff --git a/utils/release/build_llvm_package.bat b/utils/release/build_llvm_package.bat
index 1dac7878ff3de8d9c1b356df6781b8d67811c9ce..30767f6f3509b29a0288851aad8d5d0bfb3703d2 100755
--- a/utils/release/build_llvm_package.bat
+++ b/utils/release/build_llvm_package.bat
@@ -44,8 +44,8 @@ svn.exe export -r %revision% http://llvm.org/svn/llvm-project/openmp/%branch% ll
 
 
 REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226.
-REM Excluding wasm target to work around PR39448.
-set cmake_flags=-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_USE_CRT_RELEASE=MT -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON -DCLANG_FORMAT_VS_VERSION=%clang_format_vs_version% -DPACKAGE_VERSION=%package_version% -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " -DLLVM_TARGETS_TO_BUILD="AArch64;AMDGPU;ARM;BPF;Hexagon;Lanai;Mips;MSP430;NVPTX;PowerPC;Sparc;SystemZ;X86;XCore"
+set cmake_flags=-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_USE_CRT_RELEASE=MT -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON -DCLANG_FORMAT_VS_VERSION=%clang_format_vs_version% -DPACKAGE_VERSION=%package_version% -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: "
+
 REM TODO: Run all tests, including lld and compiler-rt.
 
 set "VSCMD_START_DIR=%CD%"
diff --git a/utils/release/tag.sh b/utils/release/tag.sh
index 89aab6b1e7b438ea6709447624d0dbff8fb345d3..100831d14bd50dae56db49ae34d7c64c4d6f5c3f 100755
--- a/utils/release/tag.sh
+++ b/utils/release/tag.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 #===-- tag.sh - Tag the LLVM release candidates ----------------------------===#
 #
 #                     The LLVM Compiler Infrastructure
@@ -17,7 +17,8 @@ set -e
 release=""
 rc=""
 rebranch="no"
-projects="llvm cfe test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp libunwind debuginfo-tests"
+# All the projects that make it into the monorepo, plus test-suite.
+projects="monorepo-root cfe clang-tools-extra compiler-rt debuginfo-tests libclc libcxx libcxxabi libunwind lld lldb llgo llvm openmp parallel-libs polly pstl test-suite"
 dryrun=""
 revision="HEAD"
 
@@ -36,35 +37,50 @@ usage() {
 }
 
 tag_version() {
+    local remove_args=()
+    local create_args=()
+    local message_prefix
     set -x
-    for proj in  $projects; do
+    for proj in $projects; do
         if svn ls $base_url/$proj/branches/release_$branch_release > /dev/null 2>&1 ; then
             if [ $rebranch = "no" ]; then
                 continue
             fi
-            ${dryrun} svn remove -m "Removing old release_$branch_release branch for rebranching." \
-                $base_url/$proj/branches/release_$branch_release
+            remove_args+=(rm "$proj/branches/release_$branch_release")
         fi
-        ${dryrun} svn copy -m "Creating release_$branch_release branch off revision ${revision}" \
-            -r ${revision} \
-            $base_url/$proj/trunk \
-            $base_url/$proj/branches/release_$branch_release
+        create_args+=(cp ${revision} "$proj/trunk" "$proj/branches/release_$branch_release")
     done
+    if [[ ${#remove_args[@]} -gt 0 ]]; then
+        message_prefix="Removing and recreating"
+    else
+        message_prefix="Creating"
+    fi
+    if [[ ${#create_args[@]} -gt 0 ]]; then
+        ${dryrun} svnmucc --root-url "$base_url" \
+            -m "$message_prefix release_$branch_release branch off revision ${revision}" \
+            "${remove_args[@]}" "${create_args[@]}"
+    fi
     set +x
 }
 
 tag_release_candidate() {
+    local create_args=()
     set -x
     for proj in $projects ; do
         if ! svn ls $base_url/$proj/tags/RELEASE_$tag_release > /dev/null 2>&1 ; then
-            ${dryrun} svn mkdir -m "Creating release directory for release_$tag_release." $base_url/$proj/tags/RELEASE_$tag_release
+            create_args+=(mkdir "$proj/tags/RELEASE_$tag_release")
         fi
         if ! svn ls $base_url/$proj/tags/RELEASE_$tag_release/$rc > /dev/null 2>&1 ; then
-            ${dryrun} svn copy -m "Creating release candidate $rc from release_$tag_release branch" \
-                $base_url/$proj/branches/release_$branch_release \
-                $base_url/$proj/tags/RELEASE_$tag_release/$rc
+            create_args+=(cp HEAD
+                          "$proj/branches/release_$branch_release"
+                          "$proj/tags/RELEASE_$tag_release/$rc")
         fi
     done
+    if [[ ${#create_args[@]} -gt 0 ]]; then
+        ${dryrun} svnmucc --root-url "$base_url" \
+            -m "Creating release candidate $rc from release_$tag_release branch" \
+            "${create_args[@]}"
+    fi
     set +x
 }
 
@@ -104,7 +120,7 @@ while [ $# -gt 0 ]; do
     shift
 done
 
-if [ "x$release" = "x" ]; then
+if [ "$release" = "" ]; then
     echo "error: need to specify a release version"
     echo
     usage
@@ -114,7 +130,7 @@ fi
 branch_release=`echo $release | sed -e 's,\([0-9]*\.[0-9]*\).*,\1,' | sed -e 's,\.,,g'`
 tag_release=`echo $release | sed -e 's,\.,,g'`
 
-if [ "x$rc" = "x" ]; then
+if [ "$rc" = "" ]; then
     tag_version
 else
     if [ "$revision" != "HEAD" ]; then
diff --git a/utils/sanitizers/ubsan_blacklist.txt b/utils/sanitizers/ubsan_blacklist.txt
index b5bbfddceef6dcf6232a9d35204cd98c742c09f6..144ab1f4685d1c4353d3396904b876fd05ff01ac 100644
--- a/utils/sanitizers/ubsan_blacklist.txt
+++ b/utils/sanitizers/ubsan_blacklist.txt
@@ -10,3 +10,7 @@ src:*bits/stl_tree.h
 # data() on an empty vector: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59829
 src:*bits/stl_iterator.h
 src:*bits/stl_vector.h
+
+# libstdc++ 8.0.1 casts an under-aligned pointer to type type_info:
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85930
+src:*bits/shared_ptr_base.h
diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim
index 33abbc823f7a84b32f3e01af7d1a89ccbaf61df7..9345a9c0901af9bde2d131bbd64e300cb2daf31b 100644
--- a/utils/vim/syntax/llvm.vim
+++ b/utils/vim/syntax/llvm.vim
@@ -26,7 +26,7 @@ syn keyword llvmStatement add addrspacecast alloca and arcp ashr atomicrmw
 syn keyword llvmStatement bitcast br catchpad catchswitch catchret call
 syn keyword llvmStatement cleanuppad cleanupret cmpxchg eq exact extractelement
 syn keyword llvmStatement extractvalue fadd fast fcmp fdiv fence fmul fpext
-syn keyword llvmStatement fptosi fptoui fptrunc free frem fsub getelementptr
+syn keyword llvmStatement fptosi fptoui fptrunc free frem fsub fneg getelementptr
 syn keyword llvmStatement icmp inbounds indirectbr insertelement insertvalue
 syn keyword llvmStatement inttoptr invoke landingpad load lshr malloc max min
 syn keyword llvmStatement mul nand ne ninf nnan nsw nsz nuw oeq oge ogt ole